• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1// Auto-generated file. Do not edit!
2//   Template: src/f16-gemm/6x16-aarch64-neonfp16arith-cortex-a75.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2020 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10#include <xnnpack/assembly.h>
11
12# void xnn_f16_gemminc_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75(
13#     size_t mr,                x0
14#     size_t nc,                x1
15#     size_t kc,                x2 / x0
16#     const uint8_t*restrict a, x3
17#     size_t a_stride,          x4
18#     const void*restrict w,    x5
19#     uint8_t*restrict c,       x6
20#     size_t cm_stride,         x7
21#     size_t cn_stride,         [sp] -> (x8)
22
23#     const float*restrict acc,  [sp + 8] -> x15
24#     const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 16] -> (x8)
25
26# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
27
28# A pointers
29#  x3 a0
30#  x9 a1
31# x10 a2
32# x11 a3
33# x12 a4
34#  x4 a5
35
36# C pointers
37#  x6 c0
38# x16 c1
39# x17 c2
40# x14 c3
41# x13 c4
42#  x7 c5
43
44# Vector register usage
45# A0   v0
46# A1   v1
47# A2   v2
48# A3   v3
49# A4   v4
50# A5   v5
51# B   v16 v17 v18 v19
52# C   v20 v21
53# C   v22 v23
54# C   v24 v25
55# C   v26 v27
56# C   v28 v29
57# C   v30 v31
58# Clamp v6, (v4), (v5)
59# unused     v7
60# unused A   v8 v9 v10 v11
61# unused B   v12 v13 v14 v15
62
63
64BEGIN_FUNCTION xnn_f16_gemminc_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75
65        # Computes up to 6 rows x 16 columns of C per pass: C = clamp(scale * (acc + A * B)) on halffloats
66        # Load acc (-> x15) and params pointer (-> x8) from the stack arguments
67        LDP     x15, x8, [sp, 8]
68
69        # Clamp A and C pointers: a row at index >= mr reuses the previous row's pointers
70        CMP     x0, 2                   // if mr < 2
71        ADD     x9, x3, x4              // a1 = a0 + a_stride
72        ADD     x16, x6, x7             // c1 = c0 + cm_stride
73        CSEL    x9, x3, x9, LO          //   a1 = a0
74        CSEL    x16, x6, x16, LO        //   c1 = c0
75
76        # Load params: 4 halffloats into d6 (h[0] scale, h[1] and h[2] clamp bounds, see label 3)
77        LDR     d6, [x8]
78
79        ADD     x10, x9, x4             // a2 = a1 + a_stride
80        ADD     x17, x16, x7            // c2 = c1 + cm_stride
81                                        // if mr <= 2
82        CSEL    x10, x9, x10, LS        //   a2 = a1
83        CSEL    x17, x16, x17, LS       //   c2 = c1
84
85        CMP     x0, 4                   // if mr < 4
86        ADD     x11, x10, x4            // a3 = a2 + a_stride
87        ADD     x14, x17, x7            // c3 = c2 + cm_stride
88        CSEL    x11, x10, x11, LO       //   a3 = a2
89        CSEL    x14, x17, x14, LO       //   c3 = c2
90
91        ADD     x12, x11, x4            // a4 = a3 + a_stride
92        ADD     x13, x14, x7            // c4 = c3 + cm_stride
93                                        // if mr <= 4
94        CSEL    x12, x11, x12, LS       //   a4 = a3
95        CSEL    x13, x14, x13, LS       //   c4 = c3
96
97        CMP     x0, 6                   // if mr < 6
98        ADD     x4, x12, x4             // a5 = a4 + a_stride
99        ADD     x7, x13, x7             // c5 = c4 + cm_stride
100        CSEL    x4, x12, x4, LO         //   a5 = a4
101        CSEL    x7, x13, x7, LO         //   c5 = c4
102
103        LDR     x8, [sp]                // load cn_stride (x8 is not read as the params pointer after this)
104
1050:                                    // top of the nc loop; B.HI 0b returns here after a full 16-column store
106        # Load initial accumulators: 6 rows x 16 halffloats from acc (x15 advances by 192 bytes)
107        LDP     q20, q21, [x15], 32     // row 0
108        LDP     q22, q23, [x15], 32     // row 1
109        LDP     q24, q25, [x15], 32     // row 2
110        LDP     q26, q27, [x15], 32     // row 3
111        LDP     q28, q29, [x15], 32     // row 4
112        LDP     q30, q31, [x15], 32     // row 5
113
114        # Are there at least 2 halffloats (4 bytes)?
115        SUBS    x0, x2, 4               // k = kc - 4
116        B.LO    4f                      // no: go to the 1-halffloat remainder
117
118        # Prologue - load 4 A (rows 0-3) and 2 B; A4 and A5 are loaded at the top of the loop body
119
120        LDR     s0,  [x3], 4
121        LDR     q16, [x5], 16
122        LDR     q17, [x5], 16
123        LDR     s1,  [x9], 4
124        LDR     s2, [x10], 4
125        LDR     s3, [x11], 4
126
127        # Are there at least 2 more halffloats for the main loop?
128        SUBS    x0, x0, 4
129        B.LO    2f                      // no: finish in the epilogue
130
131        # Main loop - 2 halffloats of A (4 bytes)
132        # 24 FMA + 6 ld32 A + 4 LDR B
1331:
134        FMLA    v20.8h, v16.8h,  v0.h[0]
135        LDR     s4, [x12], 4            // A4 for this iteration
136        FMLA    v21.8h, v17.8h,  v0.h[0]
137        LDR     s5,  [x4], 4            // A5 for this iteration
138        FMLA    v22.8h, v16.8h,  v1.h[0]
139        LDR     q18, [x5], 16           // B for the second halffloat of this iteration
140        FMLA    v23.8h, v17.8h,  v1.h[0]
141        FMLA    v24.8h, v16.8h,  v2.h[0]
142        LDR     q19, [x5], 16
143        FMLA    v25.8h, v17.8h,  v2.h[0]
144        FMLA    v26.8h, v16.8h,  v3.h[0]
145        FMLA    v27.8h, v17.8h,  v3.h[0]
146        FMLA    v28.8h, v16.8h,  v4.h[0]
147        FMLA    v29.8h, v17.8h,  v4.h[0]
148        FMLA    v30.8h, v16.8h,  v5.h[0]
149        FMLA    v31.8h, v17.8h,  v5.h[0]
150        SUBS    x0, x0, 4               // k -= 4; flags are consumed by B.HS at the loop bottom
151
152        FMLA    v20.8h, v18.8h,  v0.h[1]
153        LDR     q16, [x5], 16           // B for the next iteration (or the epilogue)
154        FMLA    v21.8h, v19.8h,  v0.h[1]
155        FMLA    v22.8h, v18.8h,  v1.h[1]
156        LDR     q17, [x5], 16
157        FMLA    v23.8h, v19.8h,  v1.h[1]
158        FMLA    v24.8h, v18.8h,  v2.h[1]
159        LDR     s0,  [x3], 4            // A0-A3 for the next iteration (or the epilogue)
160        FMLA    v25.8h, v19.8h,  v2.h[1]
161        FMLA    v26.8h, v18.8h,  v3.h[1]
162        LDR     s1,  [x9], 4
163        FMLA    v27.8h, v19.8h,  v3.h[1]
164        FMLA    v28.8h, v18.8h,  v4.h[1]
165        LDR     s2, [x10], 4
166        FMLA    v29.8h, v19.8h,  v4.h[1]
167        FMLA    v30.8h, v18.8h,  v5.h[1]
168        LDR     s3, [x11], 4
169        FMLA    v31.8h, v19.8h,  v5.h[1]
170        B.HS    1b                      // loop while the SUBS above did not borrow
171
172        # Epilogue - same FMAs as the main loop, but no loads for a next iteration
1732:
174        FMLA    v20.8h, v16.8h,  v0.h[0]
175        LDR     s4, [x12], 4
176        FMLA    v21.8h, v17.8h,  v0.h[0]
177        LDR     s5,  [x4], 4
178        FMLA    v22.8h, v16.8h,  v1.h[0]
179        LDR     q18, [x5], 16
180        FMLA    v23.8h, v17.8h,  v1.h[0]
181        FMLA    v24.8h, v16.8h,  v2.h[0]
182        LDR     q19, [x5], 16
183        FMLA    v25.8h, v17.8h,  v2.h[0]
184        FMLA    v26.8h, v16.8h,  v3.h[0]
185        FMLA    v27.8h, v17.8h,  v3.h[0]
186        FMLA    v28.8h, v16.8h,  v4.h[0]
187        FMLA    v29.8h, v17.8h,  v4.h[0]
188        FMLA    v30.8h, v16.8h,  v5.h[0]
189        FMLA    v31.8h, v17.8h,  v5.h[0]
190
191        FMLA    v20.8h, v18.8h,  v0.h[1]
192        FMLA    v21.8h, v19.8h,  v0.h[1]
193        FMLA    v22.8h, v18.8h,  v1.h[1]
194        FMLA    v23.8h, v19.8h,  v1.h[1]
195        FMLA    v24.8h, v18.8h,  v2.h[1]
196        FMLA    v25.8h, v19.8h,  v2.h[1]
197        FMLA    v26.8h, v18.8h,  v3.h[1]
198        FMLA    v27.8h, v19.8h,  v3.h[1]
199        FMLA    v28.8h, v18.8h,  v4.h[1]
200        FMLA    v29.8h, v19.8h,  v4.h[1]
201        FMLA    v30.8h, v18.8h,  v5.h[1]
202        FMLA    v31.8h, v19.8h,  v5.h[1]
203
204        # Is there a remainder? - 1 halffloat of A (2 bytes)
205        TBNZ    x0, 1, 4f               // bit 1 of k set: process it at label 4
2063:
207        # Scale by v6.h[0], then clamp: FMAX against v6.h[1], FMIN against v6.h[2]
208        FMUL    v20.8h, v20.8h, v6.h[0]
209        DUP     v4.8h, v6.h[1]          // v4 = lower bound in every lane (overwrites A4)
210        FMUL    v21.8h, v21.8h, v6.h[0]
211        DUP     v5.8h, v6.h[2]          // v5 = upper bound in every lane (overwrites A5)
212        FMUL    v22.8h, v22.8h, v6.h[0]
213        FMUL    v23.8h, v23.8h, v6.h[0]
214        FMUL    v24.8h, v24.8h, v6.h[0]
215        FMUL    v25.8h, v25.8h, v6.h[0]
216        FMUL    v26.8h, v26.8h, v6.h[0]
217        FMUL    v27.8h, v27.8h, v6.h[0]
218        FMUL    v28.8h, v28.8h, v6.h[0]
219        FMUL    v29.8h, v29.8h, v6.h[0]
220        FMUL    v30.8h, v30.8h, v6.h[0]
221        FMUL    v31.8h, v31.8h, v6.h[0]
222        FMAX    v20.8h, v20.8h, v4.8h
223        FMAX    v21.8h, v21.8h, v4.8h
224        FMAX    v22.8h, v22.8h, v4.8h
225        FMAX    v23.8h, v23.8h, v4.8h
226        FMAX    v24.8h, v24.8h, v4.8h
227        FMAX    v25.8h, v25.8h, v4.8h
228        FMAX    v26.8h, v26.8h, v4.8h
229        FMAX    v27.8h, v27.8h, v4.8h
230        FMAX    v28.8h, v28.8h, v4.8h
231        FMAX    v29.8h, v29.8h, v4.8h
232        FMAX    v30.8h, v30.8h, v4.8h
233        FMAX    v31.8h, v31.8h, v4.8h
234        SUBS    x1, x1, 16              // nc -= 16; flags feed B.LO 5f and B.HI 0b below
235        FMIN    v20.8h, v20.8h, v5.8h
236        FMIN    v21.8h, v21.8h, v5.8h
237        FMIN    v22.8h, v22.8h, v5.8h
238        FMIN    v23.8h, v23.8h, v5.8h
239        FMIN    v24.8h, v24.8h, v5.8h
240        FMIN    v25.8h, v25.8h, v5.8h
241        FMIN    v26.8h, v26.8h, v5.8h
242        FMIN    v27.8h, v27.8h, v5.8h
243        FMIN    v28.8h, v28.8h, v5.8h
244        FMIN    v29.8h, v29.8h, v5.8h
245        FMIN    v30.8h, v30.8h, v5.8h
246        FMIN    v31.8h, v31.8h, v5.8h
247
248        # Store full 6 x 16
249        B.LO    5f                      // fewer than 16 columns left: partial store
250
251        ST1     {v30.16b, v31.16b},  [x7], x8   // store row 5, c5 += cn_stride
252        SUB     x3,  x3, x2             // a0 -= kc
253        ST1     {v28.16b, v29.16b}, [x13], x8   // store row 4, c4 += cn_stride
254        SUB     x9,  x9, x2             // a1 -= kc
255        ST1     {v26.16b, v27.16b}, [x14], x8   // store row 3, c3 += cn_stride
256        SUB     x10, x10, x2            // a2 -= kc
257        ST1     {v24.16b, v25.16b}, [x17], x8   // store row 2, c2 += cn_stride
258        SUB     x11, x11, x2            // a3 -= kc
259        ST1     {v22.16b, v23.16b}, [x16], x8   // store row 1, c1 += cn_stride
260        SUB     x12, x12, x2            // a4 -= kc
261        ST1     {v20.16b, v21.16b},  [x6], x8   // store row 0, c0 += cn_stride
262        SUB     x4,  x4, x2             // a5 -= kc
263
264        B.HI    0b                      // columns remain: next 16-column pass
265        RET
266
2674:
268        # Remainder - 1 halffloat of A (2 bytes)
269        LDR     h0,  [x3], 2
270        LDR     q16, [x5], 16
271        LDR     q17, [x5], 16
272        LDR     h1,  [x9], 2
273        LDR     h2, [x10], 2
274        LDR     h3, [x11], 2
275        LDR     h4, [x12], 2
276        LDR     h5,  [x4], 2
277        FMLA    v20.8h, v16.8h,  v0.h[0]
278        FMLA    v22.8h, v16.8h,  v1.h[0]
279        FMLA    v24.8h, v16.8h,  v2.h[0]
280        FMLA    v26.8h, v16.8h,  v3.h[0]
281        FMLA    v28.8h, v16.8h,  v4.h[0]
282        FMLA    v30.8h, v16.8h,  v5.h[0]
283        FMLA    v21.8h, v17.8h,  v0.h[0]
284        FMLA    v23.8h, v17.8h,  v1.h[0]
285        FMLA    v25.8h, v17.8h,  v2.h[0]
286        FMLA    v27.8h, v17.8h,  v3.h[0]
287        FMLA    v29.8h, v17.8h,  v4.h[0]
288        FMLA    v31.8h, v17.8h,  v5.h[0]
289        B       3b                      // rejoin at scale and clamp
290
291        # Store odd width: bits 3..0 of x1 select stores of 8, 4, 2 and 1 halffloats per row
2925:
293        TBZ     x1, 3, 6f               // bit 3 clear: skip the 8-halffloat stores
294        STR     q30,  [x7], 16
295        MOV     v30.16b, v31.16b        // upper 8 columns of the row become the next to store
296        STR     q28, [x13], 16
297        MOV     v28.16b, v29.16b
298        STR     q26, [x14], 16
299        MOV     v26.16b, v27.16b
300        STR     q24, [x17], 16
301        MOV     v24.16b, v25.16b
302        STR     q22, [x16], 16
303        MOV     v22.16b, v23.16b
304        STR     q20,  [x6], 16
305        MOV     v20.16b, v21.16b
306
3076:
308        TBZ     x1, 2, 7f               // bit 2 clear: skip the 4-halffloat stores
309        STR     d30,  [x7], 8
310        STR     d28, [x13], 8
311        DUP     d30, v30.d[1]           // bring the next 4 halffloats into the low lanes
312        DUP     d28, v28.d[1]
313        STR     d26, [x14], 8
314        STR     d24, [x17], 8
315        DUP     d26, v26.d[1]
316        DUP     d24, v24.d[1]
317        STR     d22, [x16], 8
318        STR     d20,  [x6], 8
319        DUP     d22, v22.d[1]
320        DUP     d20, v20.d[1]
321
3227:
323        TBZ     x1, 1, 8f               // bit 1 clear: skip the 2-halffloat stores
324        STR     s30,  [x7], 4
325        STR     s28, [x13], 4
326        DUP     s30, v30.s[1]           // bring the next 2 halffloats into the low lanes
327        DUP     s28, v28.s[1]
328        STR     s26, [x14], 4
329        STR     s24, [x17], 4
330        DUP     s26, v26.s[1]
331        DUP     s24, v24.s[1]
332        STR     s22, [x16], 4
333        STR     s20,  [x6], 4
334        DUP     s22, v22.s[1]
335        DUP     s20, v20.s[1]
336
3378:
338        TBZ     x1, 0, 9f               // bit 0 clear: nothing left to store
339        STR     h30,  [x7]
340        STR     h28, [x13]
341        STR     h26, [x14]
342        STR     h24, [x17]
343        STR     h22, [x16]
344        STR     h20,  [x6]
3459:
346        RET
347
348END_FUNCTION xnn_f16_gemminc_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75
349
350#ifdef __ELF__
351.section ".note.GNU-stack","",%progbits  // empty section, ELF builds only; by GNU toolchain convention this presumably marks the object as not needing an executable stack - confirm against linker docs
352#endif
353