// Auto-generated file. Do not edit!
//   Template: src/f16-gemm/6x16-aarch64-neonfp16arith-cortex-a75.S.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f16_gemminc_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const void*restrict a,    x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     void*restrict c,          x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x8

#     const float*restrict acc,  [sp + 8] -> x15
#     const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 16] -> (x8)

# Half-precision GEMM micro-kernel with externally supplied accumulators.
# For each 16-column tile it loads a 6x16 block of initial values from acc,
# adds A * W to it with FMLA, clamps the result between the two halffloats
# read from params, and stores up to 6 rows of C.
#
# Units as used by the code below:
#   kc        - bytes of A per row (consumed 8 bytes = 4 halffloats per step,
#               and subtracted directly from the A byte pointers).
#   nc        - output columns in halffloat elements (decremented by 16 per tile).
#   a_stride, cm_stride, cn_stride - byte offsets added to the A / C pointers.
#
# x0 carries mr only until the row pointers have been clamped; afterwards it is
# reused as the K counter.  x8 first holds the params pointer and is then
# reloaded with cn_stride.
#
# NOTE(review): acc is written as float* in the prototype comment above, but the
# values loaded through it are accumulated as .8h halffloat lanes - confirm the
# type in the C declaration.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
# This kernel uses none of them, so nothing is saved on the stack.

# Register usage
# A0  x3 v0
# A1  x9 v1
# A2 x10 v2
# A3 x11 v3
# A4 x12 v4
# A5  x4 v5

# B   x5 v16 v17 v18 v19

# C0  x6  v20 v21
# C1 x16  v22 v23
# C2 x17  v24 v25
# C3 x14  v26 v27
# C4 x13  v28 v29
# C5  x7  v30 v31

# Clamp v6, (v4), (v5)
# unused     v7
# unused A   v8 v9 v10 v11
# unused B   v12 v13 v14 v15

BEGIN_FUNCTION xnn_f16_gemminc_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75

        # Load acc, params pointer
        LDP     x15, x8, [sp, 8]

        # Clamp A and C pointers
        # Rows at index >= mr alias the previous row, so they read valid A
        # data and rewrite an already computed C row.  Each CMP feeds two
        # CSEL pairs: LO for the odd row, LS for the following even row.
        CMP     x0, 2                   // if mr < 2
        ADD     x9, x3, x4              // a1 = a0 + a_stride
        ADD     x16, x6, x7             // c1 = c0 + cm_stride
        CSEL    x9, x3, x9, LO          //   a1 = a0
        CSEL    x16, x6, x16, LO        //   c1 = c0

        # Load params
        # One 32-bit load brings in two halffloats: v6.h[0] is the lower
        # clamp bound (used by FMAX) and v6.h[1] the upper bound (used by FMIN).
        LDR     s6, [x8]

        ADD     x10, x9, x4             // a2 = a1 + a_stride
        ADD     x17, x16, x7            // c2 = c1 + cm_stride
                                        // if mr <= 2
        CSEL    x10, x9, x10, LS        //   a2 = a1
        CSEL    x17, x16, x17, LS       //   c2 = c1

        CMP     x0, 4                   // if mr < 4
        ADD     x11, x10, x4            // a3 = a2 + a_stride
        ADD     x14, x17, x7            // c3 = c2 + cm_stride
        CSEL    x11, x10, x11, LO       //   a3 = a2
        CSEL    x14, x17, x14, LO       //   c3 = c2

        ADD     x12, x11, x4            // a4 = a3 + a_stride
        ADD     x13, x14, x7            // c4 = c3 + cm_stride
                                        // if mr <= 4
        CSEL    x12, x11, x12, LS       //   a4 = a3
        CSEL    x13, x14, x13, LS       //   c4 = c3

        CMP     x0, 6                   // if mr < 6
        ADD     x4, x12, x4             // a5 = a4 + a_stride
        ADD     x7, x13, x7             // c5 = c4 + cm_stride
        CSEL    x4, x12, x4, LO         //   a5 = a4
        CSEL    x7, x13, x7, LO         //   c5 = c4

        # x8 no longer needs to point at params; reuse it for cn_stride
        LDR     x8, [sp]                // load cn_stride

        # Outer loop: one iteration per tile of up to 16 output columns
0:
        # Load initial accumulators
        # 6 rows x 16 halffloats (2 q registers per row), read sequentially
        LDP     q20, q21, [x15], 32
        LDP     q22, q23, [x15], 32
        LDP     q24, q25, [x15], 32
        LDP     q26, q27, [x15], 32
        LDP     q28, q29, [x15], 32
        LDP     q30, q31, [x15], 32

        # Are there at least 4 halffloats (8 bytes)?
        SUBS    x0, x2, 8               // k = kc - 8
        B.LO    4f

        # Prologue - load 4 A and 2 B
        # A4 and A5 are loaded inside the loop / epilogue body instead.

        LDR     d0,  [x3], 8              // A0
        LDR     q16, [x5], 16             // B0
        LDR     q17, [x5], 16             // B1
        LDR     d1,  [x9], 8              // A1
        LDR     d2, [x10], 8              // A2
        LDR     d3, [x11], 8              // A3

        # Are there at least 4 more halffloats for the main loop?
        SUBS    x0, x0, 8
        B.LO    2f

        .p2align 3
        # Main loop - 4 halffloats of A (8 bytes)
        # 48 FMA + 6 LDR d (64-bit) A + 8 LDR q B
        # Loads are interleaved with the FMLAs; the last group also fetches
        # A0-A3 and B0-B1 for the next iteration.  Do not reorder.
1:
        FMLA    v20.8h, v16.8h,  v0.h[0]
        FMLA    v21.8h, v17.8h,  v0.h[0]
        LDR     d4, [x12], 8              // A4
        FMLA    v22.8h, v16.8h,  v1.h[0]
        FMLA    v23.8h, v17.8h,  v1.h[0]
        LDR     d5,  [x4], 8              // A5
        FMLA    v24.8h, v16.8h,  v2.h[0]
        FMLA    v25.8h, v17.8h,  v2.h[0]
        LDR     q18, [x5], 16             // B2
        FMLA    v26.8h, v16.8h,  v3.h[0]
        FMLA    v27.8h, v17.8h,  v3.h[0]
        LDR     q19, [x5], 16             // B3
        FMLA    v28.8h, v16.8h,  v4.h[0]
        FMLA    v29.8h, v17.8h,  v4.h[0]
        FMLA    v30.8h, v16.8h,  v5.h[0]
        FMLA    v31.8h, v17.8h,  v5.h[0]
        SUBS    x0, x0, 8                 // flags are consumed by B.HS at the loop end

        FMLA    v20.8h, v18.8h,  v0.h[1]
        FMLA    v21.8h, v19.8h,  v0.h[1]
        LDR     q16, [x5], 16             // B4
        FMLA    v22.8h, v18.8h,  v1.h[1]
        FMLA    v23.8h, v19.8h,  v1.h[1]
        LDR     q17, [x5], 16             // B5
        FMLA    v24.8h, v18.8h,  v2.h[1]
        FMLA    v25.8h, v19.8h,  v2.h[1]
        FMLA    v26.8h, v18.8h,  v3.h[1]
        FMLA    v27.8h, v19.8h,  v3.h[1]
        FMLA    v28.8h, v18.8h,  v4.h[1]
        FMLA    v29.8h, v19.8h,  v4.h[1]
        FMLA    v30.8h, v18.8h,  v5.h[1]
        FMLA    v31.8h, v19.8h,  v5.h[1]

        FMLA    v20.8h, v16.8h,  v0.h[2]
        FMLA    v21.8h, v17.8h,  v0.h[2]
        LDR     q18, [x5], 16             // B6
        FMLA    v22.8h, v16.8h,  v1.h[2]
        FMLA    v23.8h, v17.8h,  v1.h[2]
        LDR     q19, [x5], 16             // B7
        FMLA    v24.8h, v16.8h,  v2.h[2]
        FMLA    v25.8h, v17.8h,  v2.h[2]
        FMLA    v26.8h, v16.8h,  v3.h[2]
        FMLA    v27.8h, v17.8h,  v3.h[2]
        FMLA    v28.8h, v16.8h,  v4.h[2]
        FMLA    v29.8h, v17.8h,  v4.h[2]
        FMLA    v30.8h, v16.8h,  v5.h[2]
        FMLA    v31.8h, v17.8h,  v5.h[2]

        LDR     q16, [x5], 16             // B0
        FMLA    v20.8h, v18.8h,  v0.h[3]
        FMLA    v21.8h, v19.8h,  v0.h[3]
        LDR     q17, [x5], 16             // B1
        FMLA    v22.8h, v18.8h,  v1.h[3]
        FMLA    v23.8h, v19.8h,  v1.h[3]
        LDR     d0,  [x3], 8              // A0
        FMLA    v24.8h, v18.8h,  v2.h[3]
        FMLA    v25.8h, v19.8h,  v2.h[3]
        LDR     d1,  [x9], 8              // A1
        FMLA    v26.8h, v18.8h,  v3.h[3]
        FMLA    v27.8h, v19.8h,  v3.h[3]
        LDR     d2, [x10], 8              // A2
        FMLA    v28.8h, v18.8h,  v4.h[3]
        FMLA    v29.8h, v19.8h,  v4.h[3]
        LDR     d3, [x11], 8              // A3
        FMLA    v30.8h, v18.8h,  v5.h[3]
        FMLA    v31.8h, v19.8h,  v5.h[3]
        B.HS    1b

        # Epilogue - same as main loop but no loads for next loop
2:
        FMLA    v20.8h, v16.8h,  v0.h[0]
        FMLA    v21.8h, v17.8h,  v0.h[0]
        LDR     d4, [x12], 8              // A4
        FMLA    v22.8h, v16.8h,  v1.h[0]
        FMLA    v23.8h, v17.8h,  v1.h[0]
        LDR     d5,  [x4], 8              // A5
        FMLA    v24.8h, v16.8h,  v2.h[0]
        FMLA    v25.8h, v17.8h,  v2.h[0]
        LDR     q18, [x5], 16             // B2
        FMLA    v26.8h, v16.8h,  v3.h[0]
        FMLA    v27.8h, v17.8h,  v3.h[0]
        LDR     q19, [x5], 16             // B3
        FMLA    v28.8h, v16.8h,  v4.h[0]
        FMLA    v29.8h, v17.8h,  v4.h[0]
        FMLA    v30.8h, v16.8h,  v5.h[0]
        FMLA    v31.8h, v17.8h,  v5.h[0]
        ADDS    x0, x0, 8                 // undo the last subtract: x0 = leftover bytes, Z set if none

        FMLA    v20.8h, v18.8h,  v0.h[1]
        FMLA    v21.8h, v19.8h,  v0.h[1]
        LDR     q16, [x5], 16             // B4
        FMLA    v22.8h, v18.8h,  v1.h[1]
        FMLA    v23.8h, v19.8h,  v1.h[1]
        LDR     q17, [x5], 16             // B5
        FMLA    v24.8h, v18.8h,  v2.h[1]
        FMLA    v25.8h, v19.8h,  v2.h[1]
        FMLA    v26.8h, v18.8h,  v3.h[1]
        FMLA    v27.8h, v19.8h,  v3.h[1]
        FMLA    v28.8h, v18.8h,  v4.h[1]
        FMLA    v29.8h, v19.8h,  v4.h[1]
        FMLA    v30.8h, v18.8h,  v5.h[1]
        FMLA    v31.8h, v19.8h,  v5.h[1]

        FMLA    v20.8h, v16.8h,  v0.h[2]
        FMLA    v21.8h, v17.8h,  v0.h[2]
        LDR     q18, [x5], 16             // B6
        FMLA    v22.8h, v16.8h,  v1.h[2]
        FMLA    v23.8h, v17.8h,  v1.h[2]
        LDR     q19, [x5], 16             // B7
        FMLA    v24.8h, v16.8h,  v2.h[2]
        FMLA    v25.8h, v17.8h,  v2.h[2]
        FMLA    v26.8h, v16.8h,  v3.h[2]
        FMLA    v27.8h, v17.8h,  v3.h[2]
        FMLA    v28.8h, v16.8h,  v4.h[2]
        FMLA    v29.8h, v17.8h,  v4.h[2]
        FMLA    v30.8h, v16.8h,  v5.h[2]
        FMLA    v31.8h, v17.8h,  v5.h[2]

        FMLA    v20.8h, v18.8h,  v0.h[3]
        FMLA    v21.8h, v19.8h,  v0.h[3]
        FMLA    v22.8h, v18.8h,  v1.h[3]
        FMLA    v23.8h, v19.8h,  v1.h[3]
        FMLA    v24.8h, v18.8h,  v2.h[3]
        FMLA    v25.8h, v19.8h,  v2.h[3]
        FMLA    v26.8h, v18.8h,  v3.h[3]
        FMLA    v27.8h, v19.8h,  v3.h[3]
        FMLA    v28.8h, v18.8h,  v4.h[3]
        FMLA    v29.8h, v19.8h,  v4.h[3]
        FMLA    v30.8h, v18.8h,  v5.h[3]
        FMLA    v31.8h, v19.8h,  v5.h[3]

        # Is there a remainder?- 1-3 halffloats of A (2-6 bytes)
        # Uses the Z flag produced by the ADDS above.
        B.NE    4f

3:
        # Clamp
        # v4 / v5 are free here (A4 / A5 are no longer needed) and are
        # overwritten with the broadcast lower / upper bounds.
        DUP     v4.8h, v6.h[0]
        DUP     v5.8h, v6.h[1]
        FMAX    v20.8h, v20.8h, v4.8h
        FMAX    v21.8h, v21.8h, v4.8h
        FMAX    v22.8h, v22.8h, v4.8h
        FMAX    v23.8h, v23.8h, v4.8h
        FMAX    v24.8h, v24.8h, v4.8h
        FMAX    v25.8h, v25.8h, v4.8h
        FMAX    v26.8h, v26.8h, v4.8h
        FMAX    v27.8h, v27.8h, v4.8h
        FMAX    v28.8h, v28.8h, v4.8h
        FMAX    v29.8h, v29.8h, v4.8h
        FMAX    v30.8h, v30.8h, v4.8h
        FMAX    v31.8h, v31.8h, v4.8h
        SUBS    x1, x1, 16                // nc -= 16; flags feed B.LO 6f and B.HI 0b below
        FMIN    v20.8h, v20.8h, v5.8h
        FMIN    v21.8h, v21.8h, v5.8h
        FMIN    v22.8h, v22.8h, v5.8h
        FMIN    v23.8h, v23.8h, v5.8h
        FMIN    v24.8h, v24.8h, v5.8h
        FMIN    v25.8h, v25.8h, v5.8h
        FMIN    v26.8h, v26.8h, v5.8h
        FMIN    v27.8h, v27.8h, v5.8h
        FMIN    v28.8h, v28.8h, v5.8h
        FMIN    v29.8h, v29.8h, v5.8h
        FMIN    v30.8h, v30.8h, v5.8h
        FMIN    v31.8h, v31.8h, v5.8h

        # Store full 6 x 16
        # Fewer than 16 columns left: take the partial-width store path.
        B.LO    6f

        # Rows are written from C5 down to C0; each C pointer advances by
        # cn_stride and each A pointer is rewound by kc for the next tile.
        ST1     {v30.16b, v31.16b},  [x7], x8
        SUB     x3,  x3, x2             // a0 -= kc
        ST1     {v28.16b, v29.16b}, [x13], x8
        SUB     x9,  x9, x2             // a1 -= kc
        ST1     {v26.16b, v27.16b}, [x14], x8
        SUB     x10, x10, x2            // a2 -= kc
        ST1     {v24.16b, v25.16b}, [x17], x8
        SUB     x11, x11, x2            // a3 -= kc
        ST1     {v22.16b, v23.16b}, [x16], x8
        SUB     x12, x12, x2            // a4 -= kc
        ST1     {v20.16b, v21.16b},  [x6], x8
        SUB     x4,  x4, x2             // a5 -= kc

        # More columns remain (nc > 0 after the subtract): next tile
        B.HI    0b
        RET

        # Remainder- 1-3 halffloats of A (2-6 bytes)
        # Only bits 2 and 1 of x0 are inspected: bit 2 selects a step of
        # 2 halffloats, bit 1 a step of 1 halffloat.
        # NOTE(review): bit 0 is never tested, so kc is presumably always a
        # multiple of 2 bytes - confirm against the caller.
4:
        TBZ     x0, 2, 5f
        LDR     s0,  [x3], 4
        LDR     q16, [x5], 16
        LDR     q17, [x5], 16
        LDR     s1,  [x9], 4
        LDR     s2, [x10], 4
        LDR     s3, [x11], 4
        LDR     s4, [x12], 4
        LDR     s5,  [x4], 4
        LDR     q18, [x5], 16
        LDR     q19, [x5], 16
        FMLA    v20.8h, v16.8h,  v0.h[0]
        FMLA    v22.8h, v16.8h,  v1.h[0]
        FMLA    v24.8h, v16.8h,  v2.h[0]
        FMLA    v26.8h, v16.8h,  v3.h[0]
        FMLA    v28.8h, v16.8h,  v4.h[0]
        FMLA    v30.8h, v16.8h,  v5.h[0]
        FMLA    v21.8h, v17.8h,  v0.h[0]
        FMLA    v23.8h, v17.8h,  v1.h[0]
        FMLA    v25.8h, v17.8h,  v2.h[0]
        FMLA    v27.8h, v17.8h,  v3.h[0]
        FMLA    v29.8h, v17.8h,  v4.h[0]
        FMLA    v31.8h, v17.8h,  v5.h[0]

        FMLA    v20.8h, v18.8h,  v0.h[1]
        FMLA    v22.8h, v18.8h,  v1.h[1]
        FMLA    v24.8h, v18.8h,  v2.h[1]
        FMLA    v26.8h, v18.8h,  v3.h[1]
        FMLA    v28.8h, v18.8h,  v4.h[1]
        FMLA    v30.8h, v18.8h,  v5.h[1]
        FMLA    v21.8h, v19.8h,  v0.h[1]
        FMLA    v23.8h, v19.8h,  v1.h[1]
        FMLA    v25.8h, v19.8h,  v2.h[1]
        FMLA    v27.8h, v19.8h,  v3.h[1]
        FMLA    v29.8h, v19.8h,  v4.h[1]
        FMLA    v31.8h, v19.8h,  v5.h[1]
        TBZ     x0, 1, 3b                 // no single trailing halffloat: go clamp

        # Remainder - final single halffloat of A (2 bytes)
5:
        LDR     h0,  [x3], 2
        LDR     q16, [x5], 16
        LDR     q17, [x5], 16
        LDR     h1,  [x9], 2
        LDR     h2, [x10], 2
        LDR     h3, [x11], 2
        LDR     h4, [x12], 2
        LDR     h5,  [x4], 2
        FMLA    v20.8h, v16.8h,  v0.h[0]
        FMLA    v22.8h, v16.8h,  v1.h[0]
        FMLA    v24.8h, v16.8h,  v2.h[0]
        FMLA    v26.8h, v16.8h,  v3.h[0]
        FMLA    v28.8h, v16.8h,  v4.h[0]
        FMLA    v30.8h, v16.8h,  v5.h[0]
        FMLA    v21.8h, v17.8h,  v0.h[0]
        FMLA    v23.8h, v17.8h,  v1.h[0]
        FMLA    v25.8h, v17.8h,  v2.h[0]
        FMLA    v27.8h, v17.8h,  v3.h[0]
        FMLA    v29.8h, v17.8h,  v4.h[0]
        FMLA    v31.8h, v17.8h,  v5.h[0]
        B       3b

        # Store odd width
        # The low 4 bits of x1 give the number of columns left (1-15).
        # Each stage stores a power-of-two chunk, then moves the not yet
        # stored lanes down to the bottom of the first register of each row.
6:
        TBZ     x1, 3, 7f                 // 8 halffloats (16 bytes)
        STR     q30,  [x7], 16
        MOV     v30.16b, v31.16b
        STR     q28, [x13], 16
        MOV     v28.16b, v29.16b
        STR     q26, [x14], 16
        MOV     v26.16b, v27.16b
        STR     q24, [x17], 16
        MOV     v24.16b, v25.16b
        STR     q22, [x16], 16
        MOV     v22.16b, v23.16b
        STR     q20,  [x6], 16
        MOV     v20.16b, v21.16b

7:
        TBZ     x1, 2, 8f                 // 4 halffloats (8 bytes)
        STR     d30,  [x7], 8
        STR     d28, [x13], 8
        DUP     d30, v30.d[1]
        DUP     d28, v28.d[1]
        STR     d26, [x14], 8
        STR     d24, [x17], 8
        DUP     d26, v26.d[1]
        DUP     d24, v24.d[1]
        STR     d22, [x16], 8
        STR     d20,  [x6], 8
        DUP     d22, v22.d[1]
        DUP     d20, v20.d[1]

8:
        TBZ     x1, 1, 9f                 // 2 halffloats (4 bytes)
        STR     s30,  [x7], 4
        STR     s28, [x13], 4
        DUP     s30, v30.s[1]
        DUP     s28, v28.s[1]
        STR     s26, [x14], 4
        STR     s24, [x17], 4
        DUP     s26, v26.s[1]
        DUP     s24, v24.s[1]
        STR     s22, [x16], 4
        STR     s20,  [x6], 4
        DUP     s22, v22.s[1]
        DUP     s20, v20.s[1]

9:
        TBZ     x1, 0, 10f                // 1 halffloat (2 bytes)
        STR     h30,  [x7]
        STR     h28, [x13]
        STR     h26, [x14]
        STR     h24, [x17]
        STR     h22, [x16]
        STR     h20,  [x6]
10:
        RET

END_FUNCTION xnn_f16_gemminc_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75

// On ELF targets, emit an empty .note.GNU-stack section: the conventional
// marker telling the linker that this object does not need an executable stack.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif