// Auto-generated file. Do not edit!
//   Template: src/f16-gemm/6x16-aarch64-neonfp16arith-cortex-a55.S.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> (x8)
#     const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> (x8)
#
# Half-precision GEMM micro-kernel producing up to 6 rows x 16 columns of C
# per pass. For every 16-column tile the accumulators start from 16
# halffloats read from w, accumulate A[row][k] * B[k][0..15] over kc bytes
# of each A row, are multiplied by params halffloat 0 and limited to the
# range [params halffloat 1, params halffloat 2] before being stored.
#
# Units as used by the code below:
#   kc        bytes of one A row (2 bytes per halffloat); kept in x2 for the
#             pointer rewind, while x0 is reused as the running k counter
#   nc        output columns in halffloat elements, consumed 16 per tile
#   a_stride, cm_stride, cn_stride   byte offsets added to pointers
#
# Leaf function: no stack frame, no calls.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
# None of them is touched here.

# A pointers
#  x3 a0
#  x9 a1
# x10 a2
# x11 a3
# x12 a4
#  x4 a5  (overwrites a_stride once all row pointers are derived)

# C pointers
#  x6 c0
# x16 c1
# x17 c2
# x14 c3
# x13 c4
#  x7 c5  (overwrites cm_stride once all row pointers are derived)

# x8 params pointer during setup, then cn_stride for the rest of the kernel

# Vector register usage
# A0   v0
# A1   v1
# A2   v2
# A3   v3
# A4   v4
# A5   v5
# B   v16 v17 v18 v19
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# C   v30 v31
# Clamp v6, (v4), (v5)
#   v6 keeps the 8 bytes loaded from params; v4 and v5 are free once the
#   K loop is done and are reused for the broadcast lower and upper bounds.
# unused     v7
# unused A   v8 v9 v10 v11
# unused B   v12 v13 v14 v15


BEGIN_FUNCTION xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55

        # Load params pointer
        LDR     x8, [sp, 8]

        # Clamp A and C pointers
        # Rows with index >= mr reuse the pointers of the previous row, so
        # the kernel always computes six rows and surplus rows simply repeat
        # the last valid one.
        # Each CMP feeds two CSEL pairs (LO first, then LS); the ADD, CSEL
        # and LDR instructions in between do not modify the flags.
        CMP     x0, 2                   // if mr < 2
        ADD     x9, x3, x4              // a1 = a0 + a_stride
        ADD     x16, x6, x7             // c1 = c0 + cm_stride
        CSEL    x9, x3, x9, LO          //   a1 = a0
        CSEL    x16, x6, x16, LO        //   c1 = c0

        # Load params
        LDR     d6, [x8]                // v6.h[0..3] = first 8 bytes of params

        ADD     x10, x9, x4             // a2 = a1 + a_stride
        ADD     x17, x16, x7            // c2 = c1 + cm_stride
                                        // if mr <= 2
        CSEL    x10, x9, x10, LS        //   a2 = a1
        CSEL    x17, x16, x17, LS       //   c2 = c1

        CMP     x0, 4                   // if mr < 4
        ADD     x11, x10, x4            // a3 = a2 + a_stride
        ADD     x14, x17, x7            // c3 = c2 + cm_stride
        CSEL    x11, x10, x11, LO       //   a3 = a2
        CSEL    x14, x17, x14, LO       //   c3 = c2

        ADD     x12, x11, x4            // a4 = a3 + a_stride
        ADD     x13, x14, x7            // c4 = c3 + cm_stride
                                        // if mr <= 4
        CSEL    x12, x11, x12, LS       //   a4 = a3
        CSEL    x13, x14, x13, LS       //   c4 = c3

        CMP     x0, 6                   // if mr < 6
        ADD     x4, x12, x4             // a5 = a4 + a_stride
        ADD     x7, x13, x7             // c5 = c4 + cm_stride
        CSEL    x4, x12, x4, LO         //   a5 = a4
        CSEL    x7, x13, x7, LO         //   c5 = c4

        LDR     x8, [sp]                // load cn_stride (params pointer no longer needed)

        # Outer loop: one iteration per 16-column tile of C.
        # x5 (w) is never rewound; it keeps advancing through the weights.
0:
        # Load initial bias from w into accumulators
        # All six rows start from the same 16 halffloats.
        LDP     q20, q21, [x5], 32
        MOV     v22.16b, v20.16b
        MOV     v23.16b, v21.16b
        MOV     v24.16b, v20.16b
        MOV     v25.16b, v21.16b
        MOV     v26.16b, v20.16b
        MOV     v27.16b, v21.16b
        MOV     v28.16b, v20.16b
        MOV     v29.16b, v21.16b
        MOV     v30.16b, v20.16b
        MOV     v31.16b, v21.16b

        # Are there at least 2 halffloats (4 bytes) of K?
        SUBS    x0, x2, 4               // k = kc - 4
        B.LO    4f                      // fewer than 4 bytes: only the 1-halffloat remainder

        # Prologue - load 4 A and 2 B
        # (a0..a3 and the two B vectors for the first K step; a4 and a5 are
        # loaded at the top of the main loop or epilogue body.)

        LDR     s0,  [x3], 4
        LDR     q16, [x5], 16
        LDR     q17, [x5], 16
        LDR     s1,  [x9], 4
        LDR     s2, [x10], 4
        LDR     s3, [x11], 4

        # Are there at least 2 more halffloats for the main loop?
        SUBS    x0, x0, 4
        B.LO    2f

        # Main loop - 2 halffloats of A (4 bytes)
        # 24 FMA + 6 ld32 A + 4 LDR B + 2 LD1 B
        # Software pipelined: the second half of the body loads a0..a3 and
        # q16/q17 for the NEXT iteration (or for the epilogue), each load
        # placed after the last use of the register it overwrites.
        # NOTE(review): b0/b1 are fetched as two 64-bit halves (LDR d + LD1
        # lane), presumably to suit Cortex-A55 load issue - confirm against
        # the template before changing.
1:
        FMLA    v20.8h, v16.8h,  v0.h[0]
        LDR     s4, [x12], 4              // a4
        FMLA    v21.8h, v17.8h,  v0.h[0]
        LDR     s5,  [x4], 4              // a5
        FMLA    v22.8h, v16.8h,  v1.h[0]
        LDR     d18, [x5], 8              // b0
        FMLA    v23.8h, v17.8h,  v1.h[0]
        FMLA    v24.8h, v16.8h,  v2.h[0]
        LD1     {v18.d}[1], [x5], 8       // b0
        FMLA    v25.8h, v17.8h,  v2.h[0]
        FMLA    v26.8h, v16.8h,  v3.h[0]
        LDR     d19, [x5], 8              // b1
        FMLA    v27.8h, v17.8h,  v3.h[0]
        FMLA    v28.8h, v16.8h,  v4.h[0]
        LD1     {v19.d}[1], [x5], 8       // b1
        FMLA    v29.8h, v17.8h,  v4.h[0]
        FMLA    v30.8h, v16.8h,  v5.h[0]
        FMLA    v31.8h, v17.8h,  v5.h[0]
        SUBS    x0, x0, 4                 // flags consumed by B.HS below; nothing in between sets them

        FMLA    v20.8h, v18.8h,  v0.h[1]
        LDR     q16, [x5], 16
        FMLA    v21.8h, v19.8h,  v0.h[1]
        FMLA    v22.8h, v18.8h,  v1.h[1]
        LDR     q17, [x5], 16
        FMLA    v23.8h, v19.8h,  v1.h[1]
        FMLA    v24.8h, v18.8h,  v2.h[1]
        LDR     s0,  [x3], 4
        FMLA    v25.8h, v19.8h,  v2.h[1]
        FMLA    v26.8h, v18.8h,  v3.h[1]
        LDR     s1,  [x9], 4
        FMLA    v27.8h, v19.8h,  v3.h[1]
        FMLA    v28.8h, v18.8h,  v4.h[1]
        LDR     s2, [x10], 4
        FMLA    v29.8h, v19.8h,  v4.h[1]
        FMLA    v30.8h, v18.8h,  v5.h[1]
        LDR     s3, [x11], 4
        FMLA    v31.8h, v19.8h,  v5.h[1]
        B.HS    1b

        # Epilogue - same as main loop but no loads for next loop
        # Consumes the a0..a3 and q16/q17 values already loaded by the
        # prologue or by the last main loop iteration.
2:
        FMLA    v20.8h, v16.8h,  v0.h[0]
        LDR     s4, [x12], 4
        FMLA    v21.8h, v17.8h,  v0.h[0]
        LDR     s5,  [x4], 4
        FMLA    v22.8h, v16.8h,  v1.h[0]
        LDR     q18, [x5], 16
        FMLA    v23.8h, v17.8h,  v1.h[0]
        FMLA    v24.8h, v16.8h,  v2.h[0]
        LDR     q19, [x5], 16
        FMLA    v25.8h, v17.8h,  v2.h[0]
        FMLA    v26.8h, v16.8h,  v3.h[0]
        FMLA    v27.8h, v17.8h,  v3.h[0]
        FMLA    v28.8h, v16.8h,  v4.h[0]
        FMLA    v29.8h, v17.8h,  v4.h[0]
        FMLA    v30.8h, v16.8h,  v5.h[0]
        FMLA    v31.8h, v17.8h,  v5.h[0]

        FMLA    v20.8h, v18.8h,  v0.h[1]
        FMLA    v21.8h, v19.8h,  v0.h[1]
        FMLA    v22.8h, v18.8h,  v1.h[1]
        FMLA    v23.8h, v19.8h,  v1.h[1]
        FMLA    v24.8h, v18.8h,  v2.h[1]
        FMLA    v25.8h, v19.8h,  v2.h[1]
        FMLA    v26.8h, v18.8h,  v3.h[1]
        FMLA    v27.8h, v19.8h,  v3.h[1]
        FMLA    v28.8h, v18.8h,  v4.h[1]
        FMLA    v29.8h, v19.8h,  v4.h[1]
        FMLA    v30.8h, v18.8h,  v5.h[1]
        FMLA    v31.8h, v19.8h,  v5.h[1]

        # Is there a remainder?- 1 halffloat of A (2 bytes)
        # x0 is negative here, but bit 1 still equals bit 1 of kc because
        # only multiples of 4 were subtracted from it.
        TBNZ    x0, 1, 4f
3:
        # Scale and Clamp
        # v6.h[0] = scale, v6.h[1] = lower bound (FMAX), v6.h[2] = upper
        # bound (FMIN). v4/v5 held A4/A5 during the K loop and are dead now.
        FMUL    v20.8h, v20.8h, v6.h[0]
        DUP     v4.8h, v6.h[1]
        FMUL    v21.8h, v21.8h, v6.h[0]
        DUP     v5.8h, v6.h[2]
        FMUL    v22.8h, v22.8h, v6.h[0]
        FMUL    v23.8h, v23.8h, v6.h[0]
        FMUL    v24.8h, v24.8h, v6.h[0]
        FMUL    v25.8h, v25.8h, v6.h[0]
        FMUL    v26.8h, v26.8h, v6.h[0]
        FMUL    v27.8h, v27.8h, v6.h[0]
        FMUL    v28.8h, v28.8h, v6.h[0]
        FMUL    v29.8h, v29.8h, v6.h[0]
        FMUL    v30.8h, v30.8h, v6.h[0]
        FMUL    v31.8h, v31.8h, v6.h[0]
        FMAX    v20.8h, v20.8h, v4.8h
        FMAX    v21.8h, v21.8h, v4.8h
        FMAX    v22.8h, v22.8h, v4.8h
        FMAX    v23.8h, v23.8h, v4.8h
        FMAX    v24.8h, v24.8h, v4.8h
        FMAX    v25.8h, v25.8h, v4.8h
        FMAX    v26.8h, v26.8h, v4.8h
        FMAX    v27.8h, v27.8h, v4.8h
        FMAX    v28.8h, v28.8h, v4.8h
        FMAX    v29.8h, v29.8h, v4.8h
        FMAX    v30.8h, v30.8h, v4.8h
        FMAX    v31.8h, v31.8h, v4.8h
        SUBS    x1, x1, 16              // nc -= 16; flags feed both B.LO 5f and B.HI 0b below
        FMIN    v20.8h, v20.8h, v5.8h
        FMIN    v21.8h, v21.8h, v5.8h
        FMIN    v22.8h, v22.8h, v5.8h
        FMIN    v23.8h, v23.8h, v5.8h
        FMIN    v24.8h, v24.8h, v5.8h
        FMIN    v25.8h, v25.8h, v5.8h
        FMIN    v26.8h, v26.8h, v5.8h
        FMIN    v27.8h, v27.8h, v5.8h
        FMIN    v28.8h, v28.8h, v5.8h
        FMIN    v29.8h, v29.8h, v5.8h
        FMIN    v30.8h, v30.8h, v5.8h
        FMIN    v31.8h, v31.8h, v5.8h

        # Store full 6 x 16
        # (taken only when at least 16 columns were left; otherwise go to
        # the partial-width store at 5)
        B.LO    5f

        # Each C pointer advances by cn_stride; each A pointer is rewound by
        # kc so the next tile re-reads the same A rows. ST1 and SUB leave
        # the flags from SUBS x1 intact for the B.HI below.
        ST1     {v20.16b, v21.16b},  [x6], x8
        SUB     x3,  x3, x2             // a0 -= kc
        ST1     {v22.16b, v23.16b}, [x16], x8
        SUB     x9,  x9, x2             // a1 -= kc
        ST1     {v24.16b, v25.16b}, [x17], x8
        SUB     x10, x10, x2            // a2 -= kc
        ST1     {v26.16b, v27.16b}, [x14], x8
        SUB     x11, x11, x2            // a3 -= kc
        ST1     {v28.16b, v29.16b}, [x13], x8
        SUB     x12, x12, x2            // a4 -= kc
        ST1     {v30.16b, v31.16b},  [x7], x8
        SUB     x4,  x4, x2             // a5 -= kc

        B.HI    0b                      // more columns remain: next tile
        RET

4:
        # Remainder- 1 halffloat of A (2 bytes)
        # Reached for an odd halffloat count, either directly from the top
        # of the tile (kc below 4 bytes) or after the epilogue.
        LDR     h0,  [x3], 2
        LDR     q16, [x5], 16
        LDR     q17, [x5], 16
        LDR     h1,  [x9], 2
        LDR     h2, [x10], 2
        LDR     h3, [x11], 2
        LDR     h4, [x12], 2
        LDR     h5,  [x4], 2
        FMLA    v20.8h, v16.8h,  v0.h[0]
        FMLA    v22.8h, v16.8h,  v1.h[0]
        FMLA    v24.8h, v16.8h,  v2.h[0]
        FMLA    v26.8h, v16.8h,  v3.h[0]
        FMLA    v28.8h, v16.8h,  v4.h[0]
        FMLA    v30.8h, v16.8h,  v5.h[0]
        FMLA    v21.8h, v17.8h,  v0.h[0]
        FMLA    v23.8h, v17.8h,  v1.h[0]
        FMLA    v25.8h, v17.8h,  v2.h[0]
        FMLA    v27.8h, v17.8h,  v3.h[0]
        FMLA    v29.8h, v17.8h,  v4.h[0]
        FMLA    v31.8h, v17.8h,  v5.h[0]
        B       3b

        # Store odd width
        # x1 is negative here, but its low 4 bits still equal the number of
        # columns left (1..15). Bits 3, 2, 1, 0 select stores of 8, 4, 2 and
        # 1 halffloats per row. After each partial store the not yet stored
        # lanes are moved down to the bottom of v20/v22/.../v30 so the next
        # smaller store can always write from lane 0.
5:
        TBZ     x1, 3, 6f
        STR     q20,  [x6], 16
        MOV     v20.16b, v21.16b
        STR     q22, [x16], 16
        MOV     v22.16b, v23.16b
        STR     q24, [x17], 16
        MOV     v24.16b, v25.16b
        STR     q26, [x14], 16
        MOV     v26.16b, v27.16b
        STR     q28, [x13], 16
        MOV     v28.16b, v29.16b
        STR     q30,  [x7], 16
        MOV     v30.16b, v31.16b

6:
        TBZ     x1, 2, 7f
        STR     d20,  [x6], 8
        STR     d22, [x16], 8
        DUP     d20, v20.d[1]
        DUP     d22, v22.d[1]
        STR     d24, [x17], 8
        STR     d26, [x14], 8
        DUP     d24, v24.d[1]
        DUP     d26, v26.d[1]
        STR     d28, [x13], 8
        STR     d30,  [x7], 8
        DUP     d28, v28.d[1]
        DUP     d30, v30.d[1]

7:
        TBZ     x1, 1, 8f
        STR     s20,  [x6], 4
        STR     s22, [x16], 4
        DUP     s20, v20.s[1]
        DUP     s22, v22.s[1]
        STR     s24, [x17], 4
        STR     s26, [x14], 4
        DUP     s24, v24.s[1]
        DUP     s26, v26.s[1]
        STR     s28, [x13], 4
        STR     s30,  [x7], 4
        DUP     s28, v28.s[1]
        DUP     s30, v30.s[1]

8:
        TBZ     x1, 0, 9f
        STR     h20,  [x6]
        STR     h22, [x16]
        STR     h24, [x17]
        STR     h26, [x14]
        STR     h28, [x13]
        STR     h30,  [x7]
9:
        RET

END_FUNCTION xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55

// On ELF targets emit an empty .note.GNU-stack section (no flags); by GNU
// toolchain convention this marks the object as not needing an executable stack.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif
