• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1// Auto-generated file. Do not edit!
2//   Template: src/f16-gemm/6x8-aarch64-neonfp16arith-ld64.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2020 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10#include <xnnpack/assembly.h>
11
12# void xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64(
13#     size_t mr,                x0
14#     size_t nc,                x1
15#     size_t kc,                x2 / x0
16#     const uint8_t*restrict a, x3
17#     size_t a_stride,          x4
18#     const void*restrict w,    x5
19#     uint8_t*restrict c,       x6
20#     size_t cm_stride,         x7
21#     size_t cn_stride,         [sp] -> (x0)
22#     const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> x8
23
24# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
25
26# A pointers
27#  x3 a0
28#  x9 a1
29# x10 a2
30# x11 a3
31# x12 a4
32#  x4 a5
33
34# C pointers
35#  x6 c0
36# x16 c1
37# x17 c2
38# x14 c3
39# x13 c4
40#  x7 c5
41
42# Vector register usage
43# A0   v0
44# A1   v1
45# A2   v2
46# A3   v3
47# A4   v4
48# A5   v5
49# B   v16 v17 v18 v19
50# C   v20
51# C   v22
52# C   v24
53# C   v26
54# C   v28
55# C   v30
56# Clamp v6, (v4), (v5)
57# unused A   v8 v9 v10 v11
58# unused B   v12 v13 v14 v15
59
60
61BEGIN_FUNCTION xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64
        # Half-precision GEMM micro-kernel with scale and min/max clamping.
        # Each pass of the outer loop (label 0) produces one tile of up to
        # 6 rows by 8 columns:
        #   acc[m]  = 16 bytes read from w (the bias), same for all 6 rows
        #   acc[m] += B[k] * A[m][k]      for every halffloat k in kc bytes
        #   c[m]    = min(max(acc[m] * v6, v4), v5)
        # Registers used: x0-x14, x16, x17, v0-v6 and v16-v30. sp is only
        # read, never adjusted, and nothing is saved or restored.
62
63        # Load params pointer
64        LDR x8, [sp, 8]
65
66        # Clamp A and C pointers
        # Every row pointer is derived from the previous one. When mr is too
        # small for a row to exist, CSEL makes it alias the previous row, so
        # the surplus rows re-read the same A data and re-write the same C row.
        # ADD and CSEL do not modify the flags, so each CMP feeds both the
        # LO select (mr < n) and the LS select that follows it (mr <= n).
67        CMP x0, 2                // if mr < 2
68        ADD x9, x3, x4           // a1 = a0 + a_stride
69        ADD x16, x6, x7          // c1 = c0 + cm_stride
70        CSEL x9, x3, x9, LO      //   a1 = a0
71        CSEL x16, x6, x16, LO    //   c1 = c0
72
73        ADD x10, x9, x4          // a2 = a1 + a_stride
74        ADD x17, x16, x7         // c2 = c1 + cm_stride
75                                 // if mr <= 2
76        CSEL x10, x9, x10, LS    //   a2 = a1
77        CSEL x17, x16, x17, LS   //   c2 = c1
78
79        CMP x0, 4                // if mr < 4
80        ADD x11, x10, x4         // a3 = a2 + a_stride
81        ADD x14, x17, x7         // c3 = c2 + cm_stride
82        CSEL x11, x10, x11, LO   //   a3 = a2
83        CSEL x14, x17, x14, LO   //   c3 = c2
84
85        ADD x12, x11, x4         // a4 = a3 + a_stride
86        ADD x13, x14, x7         // c4 = c3 + cm_stride
87                                 // if mr <= 4
88        CSEL x12, x11, x12, LS   //   a4 = a3
89        CSEL x13, x14, x13, LS   //   c4 = c3
90
        # From here on x4 and x7 no longer hold the strides: they become the
        # a5 and c5 row pointers.
91        CMP x0, 6                // if mr < 6
92        ADD x4, x12, x4          // a5 = a4 + a_stride
93        ADD x7, x13, x7          // c5 = c4 + cm_stride
94        CSEL x4, x12, x4, LO     //   a5 = a4
95        CSEL x7, x13, x7, LO     //   c5 = c4
96
97        # Load params scale value
        # LD1R replicates the first halfword of params into all 8 lanes of v6.
        # x8 is then advanced by 2 bytes so that it addresses the two halfwords
        # that follow; those are loaded with LD2R at label 2.
98        LD1R {v6.8h}, [x8]
99        ADD x8, x8, 2
100
        # Outer loop (label 0): one iteration per block of 8 output columns.
1010:
102        # Load initial bias from w into accumulators
        # One 16-byte vector (8 halffloats) is read from w and copied into the
        # accumulators of all six rows.
103        LDR q20, [x5], 16
104        MOV v22.16b, v20.16b
105        MOV v24.16b, v20.16b
106        MOV v26.16b, v20.16b
107        MOV v28.16b, v20.16b
108        MOV v30.16b, v20.16b
109
110         # Are there at least 4 halffloats (8 bytes)?
        # x0 becomes the k counter; kc < 8 branches straight to the remainder
        # handling at label 3.
111        SUBS x0, x2, 8  // k = kc - 8
112        B.LO 3f
113
114        # Main loop - 4 halffloats of A (8 bytes)
115        # 24 FMA + 6 ld64 A + 4 LDR B
        # Per iteration: 8 bytes from each of the six A rows and 64 bytes
        # (v16-v19) from w. Lane h[i] of each A register multiplies the i-th
        # B vector. SUBS sets the flags used by B.HS at the bottom of the loop;
        # none of the LDR/FMLA instructions in between modify them.
1161:
117        LDR   d0,  [x3], 8
118        LDR  q16, [x5], 16
119        LDR  q17, [x5], 16
120        LDR   d1,  [x9], 8
121        LDR   d2, [x10], 8
122        LDR   d3, [x11], 8
123        LDR   d4, [x12], 8
124        LDR   d5,  [x4], 8
125        SUBS x0, x0, 8
126        FMLA v20.8h, v16.8h,  v0.h[0]
127        FMLA v22.8h, v16.8h,  v1.h[0]
128        FMLA v24.8h, v16.8h,  v2.h[0]
129        FMLA v26.8h, v16.8h,  v3.h[0]
130        FMLA v28.8h, v16.8h,  v4.h[0]
131        FMLA v30.8h, v16.8h,  v5.h[0]
132        LDR  q18, [x5], 16
133        LDR  q19, [x5], 16
134
135        FMLA v20.8h, v17.8h,  v0.h[1]
136        FMLA v22.8h, v17.8h,  v1.h[1]
137        FMLA v24.8h, v17.8h,  v2.h[1]
138        FMLA v26.8h, v17.8h,  v3.h[1]
139        FMLA v28.8h, v17.8h,  v4.h[1]
140        FMLA v30.8h, v17.8h,  v5.h[1]
141
142        FMLA v20.8h, v18.8h,  v0.h[2]
143        FMLA v22.8h, v18.8h,  v1.h[2]
144        FMLA v24.8h, v18.8h,  v2.h[2]
145        FMLA v26.8h, v18.8h,  v3.h[2]
146        FMLA v28.8h, v18.8h,  v4.h[2]
147        FMLA v30.8h, v18.8h,  v5.h[2]
148
149        FMLA v20.8h, v19.8h,  v0.h[3]
150        FMLA v22.8h, v19.8h,  v1.h[3]
151        FMLA v24.8h, v19.8h,  v2.h[3]
152        FMLA v26.8h, v19.8h,  v3.h[3]
153        FMLA v28.8h, v19.8h,  v4.h[3]
154        FMLA v30.8h, v19.8h,  v5.h[3]
        # Loop again while the SUBS above did not borrow, i.e. another full
        # 8 bytes of A remain.
155        B.HS 1b
156
        # x0 is negative here, but only multiples of 8 were subtracted from kc,
        # so bits 2 and 1 of x0 still equal bits 2 and 1 of kc.
157        # Is there a remainder?- 2 halffloats of A (4 bytes)
158        TBNZ x0, 2, 4f
159        # Is there a remainder?- 1 halffloat of A (2 bytes)
160        TBNZ x0, 1, 5f
        # Label 2: all kc bytes consumed; finish the tile.
1612:
162        # Scale and Clamp
163        FMUL v20.8h, v20.8h, v6.8h
164        # Load params values
        # LD2R reads two consecutive halfwords at x8 and replicates the first
        # into v4 and the second into v5. v4 is the FMAX operand (lower bound)
        # and v5 the FMIN operand (upper bound). v4/v5 held A row data in the
        # k loop, which is fully consumed at this point.
165        LD2R {v4.8h, v5.8h}, [x8]
166        FMUL v22.8h, v22.8h, v6.8h
167        FMUL v24.8h, v24.8h, v6.8h
168        FMUL v26.8h, v26.8h, v6.8h
169        FMUL v28.8h, v28.8h, v6.8h
170        FMUL v30.8h, v30.8h, v6.8h
171        # Load cn_stride
        # x0 is reused: the k counter is re-initialised from x2 at label 0.
172        LDR x0, [sp, 0]
173        FMAX v20.8h, v20.8h, v4.8h
174        FMAX v22.8h, v22.8h, v4.8h
175        FMAX v24.8h, v24.8h, v4.8h
176        FMAX v26.8h, v26.8h, v4.8h
177        FMAX v28.8h, v28.8h, v4.8h
178        FMAX v30.8h, v30.8h, v4.8h
        # nc -= 8. These flags are consumed twice: by B.LO below (fewer than
        # 8 columns were left) and by B.HI after the stores (more remain).
179        SUBS x1, x1, 8
180        FMIN v20.8h, v20.8h, v5.8h
181        FMIN v22.8h, v22.8h, v5.8h
182        FMIN v24.8h, v24.8h, v5.8h
183        FMIN v26.8h, v26.8h, v5.8h
184        FMIN v28.8h, v28.8h, v5.8h
185        FMIN v30.8h, v30.8h, v5.8h
186
187        # Store full 6 x 8
        # Fewer than 8 columns left: take the partial-width path at label 6.
188        B.LO 6f
189
        # Each ST1 writes 16 bytes and post-increments its C pointer by
        # cn_stride (x0). The interleaved SUBs rewind every A pointer by kc so
        # the next column block reads the same A rows again.
190        ST1 {v20.16b},  [x6], x0
191        SUB  x3,  x3, x2 // a0 -= kc
192        ST1 {v22.16b}, [x16], x0
193        SUB  x9,  x9, x2 // a1 -= kc
194        ST1 {v24.16b}, [x17], x0
195        SUB x10, x10, x2 // a2 -= kc
196        ST1 {v26.16b}, [x14], x0
197        SUB x11, x11, x2 // a3 -= kc
198        ST1 {v28.16b}, [x13], x0
199        SUB x12, x12, x2 // a4 -= kc
200        ST1 {v30.16b},  [x7], x0
201        SUB  x4,  x4, x2 // a5 -= kc
202
        # ST1 and SUB do not set flags, so this still tests SUBS x1 above:
        # continue with the next column block while nc > 0.
203        B.HI 0b
204        RET
205
        # Label 3: entered when kc < 8. x0 = kc - 8, so bits 2 and 1 equal
        # those of kc. With bit 2 clear the 2-byte remainder at label 5 runs
        # without testing bit 1.
        # NOTE(review): this presumably relies on kc being a non-zero multiple
        # of 2 bytes - confirm against the caller contract.
2063:
207        TBZ x0, 2, 5f
2084:
209        # Remainder- 2 halffloats of A (4 bytes)
210        LDR   s0,  [x3], 4
211        LDR  q16, [x5], 16
212        LDR  q17, [x5], 16
213        LDR   s1,  [x9], 4
214        LDR   s2, [x10], 4
215        LDR   s3, [x11], 4
216        LDR   s4, [x12], 4
217        LDR   s5,  [x4], 4
218
219        FMLA v20.8h, v16.8h,  v0.h[0]
220        FMLA v22.8h, v16.8h,  v1.h[0]
221        FMLA v24.8h, v16.8h,  v2.h[0]
222        FMLA v26.8h, v16.8h,  v3.h[0]
223        FMLA v28.8h, v16.8h,  v4.h[0]
224        FMLA v30.8h, v16.8h,  v5.h[0]
225
226        FMLA v20.8h, v17.8h,  v0.h[1]
227        FMLA v22.8h, v17.8h,  v1.h[1]
228        FMLA v24.8h, v17.8h,  v2.h[1]
229        FMLA v26.8h, v17.8h,  v3.h[1]
230        FMLA v28.8h, v17.8h,  v4.h[1]
231        FMLA v30.8h, v17.8h,  v5.h[1]
232
        # No single trailing halffloat (bit 1 clear): go scale, clamp and
        # store. Otherwise fall through to label 5.
233        TBZ x0, 1, 2b
234
2355:
236        # Remainder- 1 halffloat of A (2 bytes)
237        LDR   h0,  [x3], 2
238        LDR  q16,  [x5], 16
239        LDR   h1,  [x9], 2
240        LDR   h2, [x10], 2
241        LDR   h3, [x11], 2
242        LDR   h4, [x12], 2
243        LDR   h5,  [x4], 2
244        FMLA v20.8h, v16.8h,  v0.h[0]
245        FMLA v22.8h, v16.8h,  v1.h[0]
246        FMLA v24.8h, v16.8h,  v2.h[0]
247        FMLA v26.8h, v16.8h,  v3.h[0]
248        FMLA v28.8h, v16.8h,  v4.h[0]
249        FMLA v30.8h, v16.8h,  v5.h[0]
250        B 2b
251
252        # Store odd width
        # x1 = (columns left) - 8 and is negative here; its low three bits
        # equal those of the column count. They select a 4-, 2- and 1-halffloat
        # store in turn. After each partial store, DUP moves the next unwritten
        # lanes down to the bottom of the register for the narrower store.
2536:
254        TBZ x1, 2, 7f
255        STR d20,  [x6], 8
256        DUP d20, v20.d[1]
257        STR d22, [x16], 8
258        DUP d22, v22.d[1]
259        STR d24, [x17], 8
260        DUP d24, v24.d[1]
261        STR d26, [x14], 8
262        DUP d26, v26.d[1]
263        STR d28, [x13], 8
264        DUP d28, v28.d[1]
265        STR d30,  [x7], 8
266        DUP d30, v30.d[1]
267
2687:
269        TBZ x1, 1, 8f
270        STR s20,  [x6], 4
271        DUP s20, v20.s[1]
272        STR s22, [x16], 4
273        DUP s22, v22.s[1]
274        STR s24, [x17], 4
275        DUP s24, v24.s[1]
276        STR s26, [x14], 4
277        DUP s26, v26.s[1]
278        STR s28, [x13], 4
279        DUP s28, v28.s[1]
280        STR s30,  [x7], 4
281        DUP s30, v30.s[1]
282
2838:
284        TBZ x1, 0, 9f
285        STR h20,  [x6]
286        STR h22, [x16]
287        STR h24, [x17]
288        STR h26, [x14]
289        STR h28, [x13]
290        STR h30,  [x7]
2919:
292        RET
293
294END_FUNCTION xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64
295
296#ifdef __ELF__
// ELF targets only: emit an empty .note.GNU-stack section with no flags.
// GNU toolchains read a flagless section of this name as "this object does
// not require an executable stack".
297.section ".note.GNU-stack","",%progbits
298#endif
299