// Auto-generated file. Do not edit!
//   Template: src/f16-gemm/6x16-aarch64-neonfp16arith-cortex-a75.S.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
9
#include <xnnpack/assembly.h>
11
# void xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> (x8)

#     const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> (x8)

// Half-precision GEMM micro-kernel producing up to 6 rows x 16 columns of C
// per outer iteration, followed by a scale and a min/max clamp.
//
// How the arguments are consumed by the code below:
//   mr        - only used to clamp the row pointers; rows at or beyond mr
//               alias the previous row, so all 6 rows are always computed and
//               stored. x0 is then reused as the k counter.
//   nc        - counted in output columns; decremented by 16 per tile.
//   kc        - counted in bytes of A per row (2 bytes per half float). The
//               A pointers are rewound by kc after every full tile.
//   w         - per 16-column tile: 32 bytes of bias (16 halfs) followed by
//               32 bytes of B (16 halfs) for every k step.
//   cn_stride - byte increment applied to every C row pointer after a full
//               16-column store.
//   params    - 8 bytes are loaded into d6: h[0] is the multiplier applied to
//               every accumulator, h[1] is the FMAX operand (lower bound) and
//               h[2] is the FMIN operand (upper bound).
//
// x8 first holds the params pointer and, once d6 has been loaded, cn_stride.
// The routine keeps no stack frame and touches only x0-x14, x16, x17,
// v0-v6 and v16-v31, so nothing has to be saved or restored.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# A pointers
#  x3 a0
#  x9 a1
# x10 a2
# x11 a3
# x12 a4
#  x4 a5

# C pointers
#  x6 c0
# x16 c1
# x17 c2
# x14 c3
# x13 c4
#  x7 c5

# Vector register usage
# A0   v0
# A1   v1
# A2   v2
# A3   v3
# A4   v4
# A5   v5
# B   v16 v17 v18 v19
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# C   v30 v31
# Clamp v6, (v4), (v5)
# unused     v7
# unused A   v8 v9 v10 v11
# unused B   v12 v13 v14 v15


BEGIN_FUNCTION xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75

        # Load params pointer
        LDR     x8, [sp, 8]

        # Clamp A and C pointers
        // Each CMP feeds two pairs of CSELs: LO covers "mr < n" and LS covers
        // "mr <= n". ADD and CSEL do not modify the flags in between.
        CMP     x0, 2                   // if mr < 2
        ADD     x9, x3, x4              // a1 = a0 + a_stride
        ADD     x16, x6, x7             // c1 = c0 + cm_stride
        CSEL    x9, x3, x9, LO          //   a1 = a0
        CSEL    x16, x6, x16, LO        //   c1 = c0

        # Load params
        LDR     d6, [x8]

        ADD     x10, x9, x4             // a2 = a1 + a_stride
        ADD     x17, x16, x7            // c2 = c1 + cm_stride
                                        // if mr <= 2
        CSEL    x10, x9, x10, LS        //   a2 = a1
        CSEL    x17, x16, x17, LS       //   c2 = c1

        CMP     x0, 4                   // if mr < 4
        ADD     x11, x10, x4            // a3 = a2 + a_stride
        ADD     x14, x17, x7            // c3 = c2 + cm_stride
        CSEL    x11, x10, x11, LO       //   a3 = a2
        CSEL    x14, x17, x14, LO       //   c3 = c2

        ADD     x12, x11, x4            // a4 = a3 + a_stride
        ADD     x13, x14, x7            // c4 = c3 + cm_stride
                                        // if mr <= 4
        CSEL    x12, x11, x12, LS       //   a4 = a3
        CSEL    x13, x14, x13, LS       //   c4 = c3

        CMP     x0, 6                   // if mr < 6
        ADD     x4, x12, x4             // a5 = a4 + a_stride
        ADD     x7, x13, x7             // c5 = c4 + cm_stride
        CSEL    x4, x12, x4, LO         //   a5 = a4
        CSEL    x7, x13, x7, LO         //   c5 = c4

        // The params pointer is no longer needed; x8 becomes cn_stride.
        LDR     x8, [sp]                // load cn_stride

        // Outer loop: one pass per tile of up to 16 output columns.
0:
        # Load initial bias from w into accumulators
        // The same 16 bias values seed all six rows.
        LDP     q20, q21, [x5], 32
        MOV     v22.16b, v20.16b
        MOV     v23.16b, v21.16b
        MOV     v24.16b, v20.16b
        MOV     v25.16b, v21.16b
        MOV     v26.16b, v20.16b
        MOV     v27.16b, v21.16b
        MOV     v28.16b, v20.16b
        MOV     v29.16b, v21.16b
        MOV     v30.16b, v20.16b
        MOV     v31.16b, v21.16b

        # Is there at least 2 halffloats (4 bytes)?
        SUBS    x0, x2, 4               // k = kc - 4
        B.LO    4f

        # Prologue - load 4 A and 2 B
        // A4/A5 and the second pair of B vectors are loaded inside the
        // loop/epilogue body itself.

        LDR     s0,  [x3], 4
        LDR     q16, [x5], 16
        LDR     q17, [x5], 16
        LDR     s1,  [x9], 4
        LDR     s2, [x10], 4
        LDR     s3, [x11], 4

        # Is there at least 2 halffloats for main loop?
        SUBS    x0, x0, 4
        B.LO    2f

        # Main loop - 2 halffloats of A (4 bytes)
        # 24 FMA + 6 ld32 A + 4 LDR B
        // Software pipelined: the second half of each iteration reloads
        // A0-A3 and v16/v17 for the next iteration.
1:
        FMLA    v20.8h, v16.8h,  v0.h[0]
        LDR     s4, [x12], 4
        FMLA    v21.8h, v17.8h,  v0.h[0]
        LDR     s5,  [x4], 4
        FMLA    v22.8h, v16.8h,  v1.h[0]
        LDR     q18, [x5], 16
        FMLA    v23.8h, v17.8h,  v1.h[0]
        FMLA    v24.8h, v16.8h,  v2.h[0]
        LDR     q19, [x5], 16
        FMLA    v25.8h, v17.8h,  v2.h[0]
        FMLA    v26.8h, v16.8h,  v3.h[0]
        FMLA    v27.8h, v17.8h,  v3.h[0]
        FMLA    v28.8h, v16.8h,  v4.h[0]
        FMLA    v29.8h, v17.8h,  v4.h[0]
        FMLA    v30.8h, v16.8h,  v5.h[0]
        FMLA    v31.8h, v17.8h,  v5.h[0]
        // Flags set here are consumed by the B.HS that closes the loop; the
        // FMLA and LDR instructions in between leave them untouched.
        SUBS    x0, x0, 4

        FMLA    v20.8h, v18.8h,  v0.h[1]
        LDR     q16, [x5], 16
        FMLA    v21.8h, v19.8h,  v0.h[1]
        FMLA    v22.8h, v18.8h,  v1.h[1]
        LDR     q17, [x5], 16
        FMLA    v23.8h, v19.8h,  v1.h[1]
        FMLA    v24.8h, v18.8h,  v2.h[1]
        LDR     s0,  [x3], 4
        FMLA    v25.8h, v19.8h,  v2.h[1]
        FMLA    v26.8h, v18.8h,  v3.h[1]
        LDR     s1,  [x9], 4
        FMLA    v27.8h, v19.8h,  v3.h[1]
        FMLA    v28.8h, v18.8h,  v4.h[1]
        LDR     s2, [x10], 4
        FMLA    v29.8h, v19.8h,  v4.h[1]
        FMLA    v30.8h, v18.8h,  v5.h[1]
        LDR     s3, [x11], 4
        FMLA    v31.8h, v19.8h,  v5.h[1]
        B.HS    1b

        # Epilogue - same as main loop but no loads for next loop
2:
        FMLA    v20.8h, v16.8h,  v0.h[0]
        LDR     s4, [x12], 4
        FMLA    v21.8h, v17.8h,  v0.h[0]
        LDR     s5,  [x4], 4
        FMLA    v22.8h, v16.8h,  v1.h[0]
        LDR     q18, [x5], 16
        FMLA    v23.8h, v17.8h,  v1.h[0]
        FMLA    v24.8h, v16.8h,  v2.h[0]
        LDR     q19, [x5], 16
        FMLA    v25.8h, v17.8h,  v2.h[0]
        FMLA    v26.8h, v16.8h,  v3.h[0]
        FMLA    v27.8h, v17.8h,  v3.h[0]
        FMLA    v28.8h, v16.8h,  v4.h[0]
        FMLA    v29.8h, v17.8h,  v4.h[0]
        FMLA    v30.8h, v16.8h,  v5.h[0]
        FMLA    v31.8h, v17.8h,  v5.h[0]

        FMLA    v20.8h, v18.8h,  v0.h[1]
        FMLA    v21.8h, v19.8h,  v0.h[1]
        FMLA    v22.8h, v18.8h,  v1.h[1]
        FMLA    v23.8h, v19.8h,  v1.h[1]
        FMLA    v24.8h, v18.8h,  v2.h[1]
        FMLA    v25.8h, v19.8h,  v2.h[1]
        FMLA    v26.8h, v18.8h,  v3.h[1]
        FMLA    v27.8h, v19.8h,  v3.h[1]
        FMLA    v28.8h, v18.8h,  v4.h[1]
        FMLA    v29.8h, v19.8h,  v4.h[1]
        FMLA    v30.8h, v18.8h,  v5.h[1]
        FMLA    v31.8h, v19.8h,  v5.h[1]

        # Is there a remainder?- 1 halffloat of A (2 bytes)
        // Only multiples of 4 have been subtracted from kc, so bit 1 of x0
        // still equals bit 1 of kc.
        TBNZ    x0, 1, 4f
3:
        # Scale and Clamp
        // v4/v5 no longer hold A4/A5 here and are reused for the broadcast
        // lower and upper bounds.
        FMUL    v20.8h, v20.8h, v6.h[0]
        DUP     v4.8h, v6.h[1]
        FMUL    v21.8h, v21.8h, v6.h[0]
        DUP     v5.8h, v6.h[2]
        FMUL    v22.8h, v22.8h, v6.h[0]
        FMUL    v23.8h, v23.8h, v6.h[0]
        FMUL    v24.8h, v24.8h, v6.h[0]
        FMUL    v25.8h, v25.8h, v6.h[0]
        FMUL    v26.8h, v26.8h, v6.h[0]
        FMUL    v27.8h, v27.8h, v6.h[0]
        FMUL    v28.8h, v28.8h, v6.h[0]
        FMUL    v29.8h, v29.8h, v6.h[0]
        FMUL    v30.8h, v30.8h, v6.h[0]
        FMUL    v31.8h, v31.8h, v6.h[0]
        FMAX    v20.8h, v20.8h, v4.8h
        FMAX    v21.8h, v21.8h, v4.8h
        FMAX    v22.8h, v22.8h, v4.8h
        FMAX    v23.8h, v23.8h, v4.8h
        FMAX    v24.8h, v24.8h, v4.8h
        FMAX    v25.8h, v25.8h, v4.8h
        FMAX    v26.8h, v26.8h, v4.8h
        FMAX    v27.8h, v27.8h, v4.8h
        FMAX    v28.8h, v28.8h, v4.8h
        FMAX    v29.8h, v29.8h, v4.8h
        FMAX    v30.8h, v30.8h, v4.8h
        FMAX    v31.8h, v31.8h, v4.8h
        // nc -= 16. The resulting flags drive both the B.LO (partial tile)
        // and the B.HI (more tiles) below; nothing in between alters them.
        SUBS    x1, x1, 16
        FMIN    v20.8h, v20.8h, v5.8h
        FMIN    v21.8h, v21.8h, v5.8h
        FMIN    v22.8h, v22.8h, v5.8h
        FMIN    v23.8h, v23.8h, v5.8h
        FMIN    v24.8h, v24.8h, v5.8h
        FMIN    v25.8h, v25.8h, v5.8h
        FMIN    v26.8h, v26.8h, v5.8h
        FMIN    v27.8h, v27.8h, v5.8h
        FMIN    v28.8h, v28.8h, v5.8h
        FMIN    v29.8h, v29.8h, v5.8h
        FMIN    v30.8h, v30.8h, v5.8h
        FMIN    v31.8h, v31.8h, v5.8h

        # Store full 6 x 16
        B.LO    5f

        // Each store post-increments its C pointer by cn_stride; each A
        // pointer is rewound to the start of its row for the next tile.
        ST1     {v20.16b, v21.16b},  [x6], x8
        SUB     x3,  x3, x2             // a0 -= kc
        ST1     {v22.16b, v23.16b}, [x16], x8
        SUB     x9,  x9, x2             // a1 -= kc
        ST1     {v24.16b, v25.16b}, [x17], x8
        SUB     x10, x10, x2            // a2 -= kc
        ST1     {v26.16b, v27.16b}, [x14], x8
        SUB     x11, x11, x2            // a3 -= kc
        ST1     {v28.16b, v29.16b}, [x13], x8
        SUB     x12, x12, x2            // a4 -= kc
        ST1     {v30.16b, v31.16b},  [x7], x8
        SUB     x4,  x4, x2             // a5 -= kc

        B.HI    0b
        RET

4:
        # Remainder- 1 halffloat of A (2 bytes)
        // Reached either directly when kc < 4 or after the epilogue when kc
        // has a trailing single half float; rejoins at the scale/clamp code.
        LDR     h0,  [x3], 2
        LDR     q16, [x5], 16
        LDR     q17, [x5], 16
        LDR     h1,  [x9], 2
        LDR     h2, [x10], 2
        LDR     h3, [x11], 2
        LDR     h4, [x12], 2
        LDR     h5,  [x4], 2
        FMLA    v20.8h, v16.8h,  v0.h[0]
        FMLA    v22.8h, v16.8h,  v1.h[0]
        FMLA    v24.8h, v16.8h,  v2.h[0]
        FMLA    v26.8h, v16.8h,  v3.h[0]
        FMLA    v28.8h, v16.8h,  v4.h[0]
        FMLA    v30.8h, v16.8h,  v5.h[0]
        FMLA    v21.8h, v17.8h,  v0.h[0]
        FMLA    v23.8h, v17.8h,  v1.h[0]
        FMLA    v25.8h, v17.8h,  v2.h[0]
        FMLA    v27.8h, v17.8h,  v3.h[0]
        FMLA    v29.8h, v17.8h,  v4.h[0]
        FMLA    v31.8h, v17.8h,  v5.h[0]
        B       3b

        # Store odd width
        // x1 now holds the remaining column count minus 16; subtracting 16
        // leaves its low four bits unchanged, so bits 3..0 select stores of
        // 8, 4, 2 and 1 half floats. After each partial store the unstored
        // data is moved down to the low end of the even-numbered registers.
5:
        TBZ     x1, 3, 6f
        STR     q20,  [x6], 16
        MOV     v20.16b, v21.16b
        STR     q22, [x16], 16
        MOV     v22.16b, v23.16b
        STR     q24, [x17], 16
        MOV     v24.16b, v25.16b
        STR     q26, [x14], 16
        MOV     v26.16b, v27.16b
        STR     q28, [x13], 16
        MOV     v28.16b, v29.16b
        STR     q30,  [x7], 16
        MOV     v30.16b, v31.16b

6:
        TBZ     x1, 2, 7f
        STR     d20,  [x6], 8
        STR     d22, [x16], 8
        DUP     d20, v20.d[1]
        DUP     d22, v22.d[1]
        STR     d24, [x17], 8
        STR     d26, [x14], 8
        DUP     d24, v24.d[1]
        DUP     d26, v26.d[1]
        STR     d28, [x13], 8
        STR     d30,  [x7], 8
        DUP     d28, v28.d[1]
        DUP     d30, v30.d[1]

7:
        TBZ     x1, 1, 8f
        STR     s20,  [x6], 4
        STR     s22, [x16], 4
        DUP     s20, v20.s[1]
        DUP     s22, v22.s[1]
        STR     s24, [x17], 4
        STR     s26, [x14], 4
        DUP     s24, v24.s[1]
        DUP     s26, v26.s[1]
        STR     s28, [x13], 4
        STR     s30,  [x7], 4
        DUP     s28, v28.s[1]
        DUP     s30, v30.s[1]

8:
        TBZ     x1, 0, 9f
        STR     h20,  [x6]
        STR     h22, [x16]
        STR     h24, [x17]
        STR     h26, [x14]
        STR     h28, [x13]
        STR     h30,  [x7]
9:
        RET

END_FUNCTION xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75
353
// On ELF targets, emit an empty .note.GNU-stack section: the GNU toolchain
// convention for declaring that this object does not need an executable stack.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif
357