// Auto-generated file. Do not edit!
//   Template: src/f16-gemm/4x8-aarch64-neonfp16arith-ld64.S.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0  (x0 becomes the k counter once mr is consumed)
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x14
#     const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> x8

# Operation, as implemented below:
#   For each group of 8 output columns, the four row accumulators start from
#   8 halffloats (16 bytes) read from w, then gain a[m][k] * b[k][0..7] for
#   every halffloat k of the A row, with b read from w 16 bytes per k.
#   Each accumulator is multiplied by v4, bounded below by v5 and above by v6,
#   and stored to its C row.
#   kc is a byte count (2 bytes per halffloat): the main loop consumes 8 bytes
#   of each A row per iteration and the A pointers are rewound by kc after a
#   full-width store.
#   mr selects the live rows: when mr < 4 the surplus A and C row pointers
#   alias the previous row, so those rows are recomputed and stored again
#   to the same address.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
# None of them is touched here, so nothing is saved or restored and sp is
# only read (for the two stack arguments).

# A pointers
# x3  a0
# x11 a1
# x12 a2
# x4  a3 / a_stride

# C pointers
# x6  c0
# x9  c1
# x10 c2
# x7  c3 / cm_stride

# Vector register usage
# A0   v0
# A1   v1
# A2   v2
# A3   v3
# B   v20 v21 v22 v23
# C   v16
# C   v18
# C   v28
# C   v30
# Scale v4
# Clamp v5 (lower bound), v6 (upper bound)
# unused A   v7 v8 v9 v10 v11
# unused B   v19

# Local labels
# 0  top of the loop over nc (8 columns per pass)
# 1  main k loop, 4 halffloats of A per iteration
# 2  scale, clamp and store
# 3  entry used when kc < 8 bytes (main loop skipped)
# 4  k remainder of 2 halffloats
# 5  k remainder of 1 halffloat
# 6  partial-width store, 4 columns
# 7  partial-width store, 2 columns
# 8  partial-width store, 1 column
# 9  return after a partial-width store

BEGIN_FUNCTION xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64

        # Load cn_stride, params pointer
        LDP x14, x8, [sp]

        # Load params values: three consecutive halffloats at params, each
        # replicated across all 8 lanes of v4, v5 and v6 respectively
        LD3R {v4.8h, v5.8h, v6.8h}, [x8]

        # Clamp A and C pointers
        CMP x0, 2                // if mr < 2
        ADD x11, x3, x4          // a1 = a0 + a_stride
        ADD x9, x6, x7           // c1 = c0 + cm_stride
        CSEL x11, x3, x11, LO    //   a1 = a0
        CSEL x9, x6, x9, LO      //   c1 = c0

        ADD x12, x11, x4         // a2 = a1 + a_stride
        ADD x10, x9, x7          // c2 = c1 + cm_stride
                                 // if mr <= 2 (flags still from CMP x0, 2)
        CSEL x12, x11, x12, LS   //   a2 = a1
        CSEL x10, x9, x10, LS    //   c2 = c1

        CMP x0, 4                // if mr < 4
        ADD x4, x12, x4          // a3 = a2 + a_stride
        ADD x7, x10, x7          // c3 = c2 + cm_stride
        CSEL x4, x12, x4, LO     //   a3 = a2
        CSEL x7, x10, x7, LO     //   c3 = c2

0:
        # Load initial bias from w into accumulators
        # (the same 8 halffloats seed all four rows)
        LDR q16, [x5], 16
        MOV v18.16b, v16.16b
        MOV v28.16b, v16.16b
        MOV v30.16b, v16.16b

        # Are there at least 4 halffloats (8 bytes)?
        SUBS x0, x2, 8  // k = kc - 8
        B.LO 3f

        # Main loop - 4 halffloats of A (8 bytes)
        # Consumes 8 bytes from each A row and 64 bytes of B per iteration.
1:
        LDR  d0,  [x3], 8
        LDR q20,  [x5], 16
        LDR q21,  [x5], 16
        LDR  d1, [x11], 8
        LDR  d2, [x12], 8
        LDR  d3,  [x4], 8
        SUBS x0, x0, 8           // k -= 8; flags are consumed by B.HS below
        FMLA v16.8h, v20.8h, v0.h[0]
        FMLA v18.8h, v20.8h, v1.h[0]
        FMLA v28.8h, v20.8h, v2.h[0]
        FMLA v30.8h, v20.8h, v3.h[0]
        LDR q22, [x5], 16
        LDR q23, [x5], 16

        FMLA v16.8h, v21.8h, v0.h[1]
        FMLA v18.8h, v21.8h, v1.h[1]
        FMLA v28.8h, v21.8h, v2.h[1]
        FMLA v30.8h, v21.8h, v3.h[1]

        FMLA v16.8h, v22.8h, v0.h[2]
        FMLA v18.8h, v22.8h, v1.h[2]
        FMLA v28.8h, v22.8h, v2.h[2]
        FMLA v30.8h, v22.8h, v3.h[2]

        FMLA v16.8h, v23.8h, v0.h[3]
        FMLA v18.8h, v23.8h, v1.h[3]
        FMLA v28.8h, v23.8h, v2.h[3]
        FMLA v30.8h, v23.8h, v3.h[3]
        B.HS 1b                  // loop while the SUBS above did not borrow

        # Here x0 is negative and differs from kc by a multiple of 8, so
        # bits 2 and 1 of x0 equal bits 2 and 1 of kc.
        # Is there a remainder? - 2 halffloats of A (4 bytes)
        TBNZ x0, 2, 4f
        # Is there a remainder? - 1 halffloat of A (2 bytes)
        TBNZ x0, 1, 5f
2:
        # Scale and Clamp
        FMUL v16.8h, v16.8h, v4.8h
        SUBS x1, x1, 8           // nc -= 8; flags feed B.LO and B.HI below
        FMUL v18.8h, v18.8h, v4.8h
        FMUL v28.8h, v28.8h, v4.8h
        FMUL v30.8h, v30.8h, v4.8h
        FMAX v16.8h, v16.8h, v5.8h
        FMAX v18.8h, v18.8h, v5.8h
        FMAX v28.8h, v28.8h, v5.8h
        FMAX v30.8h, v30.8h, v5.8h
        FMIN v16.8h, v16.8h, v6.8h
        FMIN v18.8h, v18.8h, v6.8h
        FMIN v28.8h, v28.8h, v6.8h
        FMIN v30.8h, v30.8h, v6.8h

        # Fewer than 8 columns left? Take the partial-width store at 6.
        B.LO 6f

        # Store full 4 x 8
        # Each C pointer advances by cn_stride; each A pointer is rewound by
        # kc for the next pass. ST1 and SUB leave the flags untouched.
        ST1 {v16.16b},  [x6], x14
        SUB  x3,  x3, x2 // a0 -= kc
        ST1 {v18.16b},  [x9], x14
        SUB x11, x11, x2 // a1 -= kc
        ST1 {v28.16b}, [x10], x14
        SUB x12, x12, x2 // a2 -= kc
        ST1 {v30.16b},  [x7], x14
        SUB  x4,  x4, x2 // a3 -= kc

        B.HI 0b                  // columns remain (nc > 0 after the SUBS): next pass
        RET

3:
        # kc < 8 bytes: x0 = kc - 8, so its low bits again match those of kc.
        # With bit 2 clear the 1-halffloat block at 5 runs unconditionally.
        # NOTE(review): this presumes kc is never 0 - confirm with the caller.
        TBZ x0, 2, 5f
4:
        # Remainder- 2 halffloats of A (4 bytes)
        LDR s0,  [x3], 4
        LDR q20, [x5], 16
        LDR q21, [x5], 16
        LDR s1, [x11], 4
        LDR s2, [x12], 4
        LDR s3,  [x4], 4

        FMLA v16.8h, v20.8h, v0.h[0]
        FMLA v18.8h, v20.8h, v1.h[0]
        FMLA v28.8h, v20.8h, v2.h[0]
        FMLA v30.8h, v20.8h, v3.h[0]

        FMLA v16.8h, v21.8h, v0.h[1]
        FMLA v18.8h, v21.8h, v1.h[1]
        FMLA v28.8h, v21.8h, v2.h[1]
        FMLA v30.8h, v21.8h, v3.h[1]

        TBZ x0, 1, 2b            // no single halffloat left: go scale and store

5:
        # Remainder- 1 halffloat of A (2 bytes)
        LDR h0,  [x3], 2
        LDR q20, [x5], 16
        LDR h1, [x11], 2
        LDR h2, [x12], 2
        LDR h3,  [x4], 2
        FMLA v16.8h, v20.8h, v0.h[0]
        FMLA v18.8h, v20.8h, v1.h[0]
        FMLA v28.8h, v20.8h, v2.h[0]
        FMLA v30.8h, v20.8h, v3.h[0]
        B 2b

        # Store odd width
        # Here x1 = nc - 8 with nc < 8, so bits 2, 1 and 0 of x1 equal those
        # of nc. After each partial store the not-yet-stored lanes are moved
        # down to lane 0 for the next, narrower store.
6:
        TBZ x1, 2, 7f
        STR d16, [x6], 8         // 4 columns
        DUP d16, v16.d[1]
        STR d18, [x9], 8
        DUP d18, v18.d[1]
        STR d28, [x10], 8
        DUP d28, v28.d[1]
        STR d30, [x7], 8
        DUP d30, v30.d[1]

7:
        TBZ x1, 1, 8f
        STR s16,  [x6], 4        // 2 columns
        DUP s16, v16.s[1]
        STR s18,  [x9], 4
        DUP s18, v18.s[1]
        STR s28, [x10], 4
        DUP s28, v28.s[1]
        STR s30,  [x7], 4
        DUP s30, v30.s[1]

8:
        TBZ x1, 0, 9f
        STR h16,  [x6]           // 1 column
        STR h18,  [x9]
        STR h28, [x10]
        STR h30,  [x7]
9:
        RET

END_FUNCTION xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64

// ELF targets only: emit an empty .note.GNU-stack section (no flags).
// By GNU toolchain convention this records that the object does not
// need an executable stack.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif
