// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# Template for a 4-row by 12-column f32 GEMM micro-kernel (AArch64 NEON FMA).
# The INC template switch selects between two variants:
#   INC set:   accumulators start from the acc buffer (x15)
#   INC unset: accumulators start from the 12 bias floats at the head of w (x5)
# Each pass of the outer loop (label 0) produces one 4 x 12 output tile,
# clamps it against the two floats at params, and stores it.
#
# void xnn_f32_gemm${"inc" if INC else ""}_ukernel_4x12__aarch64_neonfma_cortex_a53(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x14
$if INC:
  #     const float*restrict acc,  [sp + 8] -> x15
  #     const union xnn_f32_output_params params[restrict static 1])  [sp + 16] -> x8
$else:
  #     const union xnn_f32_output_params params[restrict static 1])  [sp + 8] -> x8

# x0 holds mr only until the row pointers are clamped; afterwards it is
# reused as the running k counter (in bytes) derived from kc in x2.

# d8-d15 need to be preserved if used.
# x19-30 need to be preserved if used.

# A pointers
# x3  a0
# x11 a1
# x12 a2
# x4  a3 / a_stride

# C pointers
# x6  c0
# x9  c1
# x10 c2
# x7  c3 / cm_stride

# x8 temporary vector shadow register
# (x8 also carries the params pointer until the clamp values are loaded)

# Vector register usage and GPR shadows
# A suffix of [1] below denotes the upper 64-bit half of the register (d[1]).
# a0  v0
# a1  v0[1]
# a2  v1
# a3  v1[1]
# a0  v2
# a1  v2[1]
# a2  v3
# a3  v3[1]
# B   v6  v7  v8
# B   v9 v10 v11
# B  v14 v15 v16
# B  v17 v18 v19
# C  v20 v21 v22
# C  v23 v24 v25
# C  v26 v27 v28
# C  v29 v30 v31
# Clamp v4 v5
# v12 to v13 unused.

BEGIN_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_ukernel_4x12__aarch64_neonfma_cortex_a53

        $if INC:
          # Load cn_stride, acc
          LDP x14, x15, [sp]
          # Load params pointer
          LDR x8, [sp, 16]
        $else:
          # Load cn_stride, params pointer
          LDP x14, x8, [sp]

        # Load clamping_params values
        # v4 = first float at params, used as the upper bound (FMIN)
        # v5 = second float at params, used as the lower bound (FMAX)
        LD2R {v4.4s, v5.4s}, [x8]

        # Save d8-d11,d14,d15 on stack
        # (the stack arguments were already read above, before sp moves)
        STP  d8,  d9, [sp, -48]!
        STP d10, d11, [sp, 16]
        STP d14, d15, [sp, 32]

        # Clamp A and C pointers
        # Rows at index >= mr reuse the pointer of the previous row.
        # ADD and CSEL leave the flags alone, so the LS selects below
        # still test the result of CMP x0, 2.
        CMP x0, 2                // if mr < 2
        ADD x11, x3, x4          // a1 = a0 + a_stride
        ADD x9, x6, x7           // c1 = c0 + cm_stride
        CSEL x11, x3, x11, LO    //   a1 = a0
        CSEL x9, x6, x9, LO      //   c1 = c0
        ADD x12, x11, x4         // a2 = a1 + a_stride
        ADD x10, x9, x7          // c2 = c1 + cm_stride
                                 // if mr <= 2
        CSEL x12, x11, x12, LS   //   a2 = a1
        CSEL x10, x9, x10, LS    //   c2 = c1
        CMP x0, 4                // if mr < 4
        ADD x4, x12, x4          // a3 = a2 + a_stride
        ADD x7, x10, x7          // c3 = c2 + cm_stride
        CSEL x4, x12, x4, LO     //   a3 = a2
        CSEL x7, x10, x7, LO     //   c3 = c2

        # Outer loop: one pass per block of 12 output columns
0:
        $if INC:
          # Load initial accumulators
          LD1 {v20.16b, v21.16b, v22.16b}, [x15], 48
          LD1 {v23.16b, v24.16b, v25.16b}, [x15], 48
          LD1 {v26.16b, v27.16b, v28.16b}, [x15], 48
          LD1 {v29.16b, v30.16b, v31.16b}, [x15], 48
          PRFM PLDL1KEEP,  [x3,  0]  // Prefetch A
          PRFM PLDL1KEEP,  [x3, 64]
          PRFM PLDL1KEEP, [x11,  0]
          PRFM PLDL1KEEP, [x11, 64]
          PRFM PLDL1KEEP, [x12,  0]
          PRFM PLDL1KEEP, [x12, 64]
          PRFM PLDL1KEEP,  [x4,  0]
          PRFM PLDL1KEEP,  [x4, 64]
          PRFM PLDL1KEEP, [x5,   0]  // Prefetch B
          PRFM PLDL1KEEP, [x5,  64]
          PRFM PLDL1KEEP, [x5, 128]
          PRFM PLDL1KEEP, [x5, 192]
          PRFM PLDL1KEEP, [x5, 256]
          PRFM PLDL1KEEP, [x5, 320]
        $else:
          # Load initial bias from w into accumulators
          # The same 12 bias floats seed all four rows of C.
          LD1 {v20.16b, v21.16b, v22.16b}, [x5], 48
          MOV v23.16b, v20.16b
          PRFM PLDL1KEEP,  [x3,  0]    // Prefetch A
          PRFM PLDL1KEEP,  [x3, 64]
          MOV v24.16b, v21.16b
          PRFM PLDL1KEEP,  [x11,  0]
          PRFM PLDL1KEEP,  [x11, 64]
          MOV v25.16b, v22.16b
          PRFM PLDL1KEEP, [x12,  0]
          PRFM PLDL1KEEP, [x12, 64]
          MOV v26.16b, v20.16b
          PRFM PLDL1KEEP, [x4,  0]
          PRFM PLDL1KEEP, [x4, 64]
          MOV v27.16b, v21.16b
          PRFM PLDL1KEEP, [x5,   0]  // Prefetch B
          PRFM PLDL1KEEP, [x5,  64]
          MOV v28.16b, v22.16b
          PRFM PLDL1KEEP, [x5, 128]
          PRFM PLDL1KEEP, [x5, 192]
          MOV v29.16b, v20.16b
          PRFM PLDL1KEEP, [x5, 256]
          MOV v30.16b, v21.16b
          PRFM PLDL1KEEP, [x5, 320]
          MOV v31.16b, v22.16b

        # Is there at least 4 floats (16 bytes)?
        SUBS x0, x2, 16  // k = kc - 16
        B.LO 5f

        SUBS x0, x0, 16  // k -= 16; flags are consumed by B.LO 2f below (loads keep flags)

        # Prologue - loads for first group of 24 FMA

        # Read first block of 4 A.
        LDR d0,  [x3], 8              // a0
        LDR d1, [x12], 8              // a2
        LD1 {v0.d}[1], [x11], 8       // a1
        LD1 {v1.d}[1],  [x4], 8       // a3

        LD1 {v6.16b, v7.16b, v8.16b}, [x5], 48
        LD1 {v9.16b, v10.16b}, [x5], 32
        LDR d11, [x5], 8
        LDR x8, [x5], 8               // upper half of v11, inserted in BLOCK 0

        # Is there at least 4 floats (16 bytes) for main loop?
        B.LO 2f

        # Main loop - 4 floats of A (16 bytes)
1:
        # First group of 24 fma.  8 blocks of 4 cycles.  LDR + 3 FMA
        # A is loaded for 2nd group into v2/v3
        # INS is 4 blocks (16 cycles) after load

        # BLOCK 0
        LDR d2, [x3], 8                // a0
        INS v11.d[1], x8
        FMLA v20.4s, v6.4s, v0.s[0]
        LDR x8, [x11], 8               // a1
        FMLA v23.4s, v6.4s, v0.s[2]
        FMLA v26.4s, v6.4s, v1.s[0]
        PRFM PLDL1KEEP, [x3, 128]      // Prefetch A0

        # BLOCK 1
        LDR d3, [x12], 8               // a2
        INS v2.d[1], x8                // a1 was loaded in block 0
        FMLA v29.4s, v6.4s, v1.s[2]
        LDR x8, [x4], 8                // a3
        FMLA v21.4s, v7.4s, v0.s[0]
        FMLA v24.4s, v7.4s, v0.s[2]
        PRFM PLDL1KEEP, [x11, 128]      // Prefetch A1

        # BLOCK 2
        LDR d14, [x5]                  // vb0x0123
        INS v3.d[1], x8                // a3 was loaded in block 1
        FMLA v27.4s, v7.4s, v1.s[0]
        LDR x8, [x5, 8]
        FMLA v30.4s, v7.4s, v1.s[2]
        FMLA v22.4s, v8.4s, v0.s[0]
        PRFM PLDL1KEEP, [x12, 128]     // Prefetch A2

        # BLOCK 3
        LDR d15, [x5, 16]              // vb0x4567
        INS v14.d[1], x8               // v14 was loaded in block 2
        FMLA v25.4s, v8.4s, v0.s[2]
        LDR x8, [x5, 24]
        FMLA v28.4s, v8.4s, v1.s[0]
        FMLA v31.4s, v8.4s, v1.s[2]
        PRFM PLDL1KEEP, [x4, 128]      // Prefetch A3

        # BLOCK 4
        LDR d16, [x5, 32]              // vb0x89AB
        INS v15.d[1], x8
        FMLA v20.4s, v9.4s, v0.s[1]
        LDR x8, [x5, 40]
        FMLA v23.4s, v9.4s, v0.s[3]
        FMLA v26.4s, v9.4s, v1.s[1]
        PRFM PLDL1KEEP, [x5, 320]      // Prefetch B

        # BLOCK 5
        LDR d17, [x5, 48]              // vb1x0123
        INS v16.d[1], x8
        FMLA v29.4s, v9.4s, v1.s[3]
        LDR x8, [x5, 56]
        FMLA v21.4s, v10.4s, v0.s[1]
        FMLA v24.4s, v10.4s, v0.s[3]
        PRFM PLDL1KEEP, [x5, 384]      // Prefetch B

        # BLOCK 6
        LDR d18, [x5, 64]              // vb1x4567
        INS v17.d[1], x8
        FMLA v27.4s, v10.4s, v1.s[1]
        LDR x8, [x5, 72]
        FMLA v30.4s, v10.4s, v1.s[3]
        FMLA v22.4s, v11.4s, v0.s[1]
        PRFM PLDL1KEEP, [x5, 448]      // Prefetch B

        # BLOCK 7
        LDR d19, [x5, 80]              // vb1x89AB
        INS v18.d[1], x8
        FMLA v25.4s, v11.4s, v0.s[3]
        LDR x8, [x5, 88]
        FMLA v28.4s, v11.4s, v1.s[1]
        FMLA v31.4s, v11.4s, v1.s[3]

        # Second group of 24 fma.  8 blocks of 4 cycles.  LDR + 3 FMA
        # A is loaded for 1st group into v0/v1

        # BLOCK 0
        LDR d0, [x3], 8                // a0
        INS v19.d[1], x8
        FMLA v20.4s, v14.4s, v2.s[0]
        LDR x8, [x11], 8               // a1
        FMLA v23.4s, v14.4s, v2.s[2]
        FMLA v26.4s, v14.4s, v3.s[0]

        # BLOCK 1
        LDR d1, [x12], 8               // a2
        INS v0.d[1], x8                // a1
        FMLA v29.4s, v14.4s, v3.s[2]
        LDR x8, [x4], 8                // a3
        FMLA v21.4s, v15.4s, v2.s[0]
        FMLA v24.4s, v15.4s, v2.s[2]

        # BLOCK 2
        LDR d6, [x5, 96]               // vb0x0123
        INS v1.d[1], x8                // a3
        FMLA v27.4s, v15.4s, v3.s[0]
        LDR x8, [x5, 104]
        FMLA v30.4s, v15.4s, v3.s[2]
        FMLA v22.4s, v16.4s, v2.s[0]

        # BLOCK 3
        LDR d7, [x5, 112]              // vb0x4567
        INS v6.d[1], x8
        FMLA v25.4s, v16.4s, v2.s[2]
        LDR x8, [x5, 120]
        FMLA v28.4s, v16.4s, v3.s[0]
        FMLA v31.4s, v16.4s, v3.s[2]

        # BLOCK 4
        LDR d8, [x5, 128]              // vb0x89AB
        INS v7.d[1], x8
        FMLA v20.4s, v17.4s, v2.s[1]
        LDR x8, [x5, 136]
        FMLA v23.4s, v17.4s, v2.s[3]
        FMLA v26.4s, v17.4s, v3.s[1]

        # BLOCK 5
        LDR d9, [x5, 144]              // vb1x0123
        INS v8.d[1], x8
        FMLA v29.4s, v17.4s, v3.s[3]
        LDR x8, [x5, 152]
        FMLA v21.4s, v18.4s, v2.s[1]
        FMLA v24.4s, v18.4s, v2.s[3]

        # BLOCK 6
        LDR d10, [x5, 160]             // vb1x4567
        INS v9.d[1], x8
        FMLA v27.4s, v18.4s, v3.s[1]
        LDR x8, [x5, 168]
        FMLA v30.4s, v18.4s, v3.s[3]
        SUBS x0, x0, 16                // k -= 16; flags are consumed by B.HS 1b
        FMLA v22.4s, v19.4s, v2.s[1]

        # BLOCK 7
        LDR d11, [x5, 176]             // vb1x89AB
        INS v10.d[1], x8
        FMLA v25.4s, v19.4s, v2.s[3]
        LDR x8, [x5, 184]
        FMLA v28.4s, v19.4s, v3.s[1]
        ADD x5, x5, 192                // w += 192 bytes consumed by this iteration
        FMLA v31.4s, v19.4s, v3.s[3]
        B.HS 1b

        # Epilogue
        # First block same as main loop.  Second block has no loads.
2:
        # BLOCK 0
        LDR d2, [x3], 8                // a0
        INS v11.d[1], x8
        FMLA v20.4s, v6.4s, v0.s[0]
        LDR x8, [x11], 8               // a1
        FMLA v23.4s, v6.4s, v0.s[2]
        FMLA v26.4s, v6.4s, v1.s[0]

        # BLOCK 1
        LDR d3, [x12], 8               // a2
        INS v2.d[1], x8                // a1 was loaded in block 0
        FMLA v29.4s, v6.4s, v1.s[2]
        LDR x8, [x4], 8                // a3
        FMLA v21.4s, v7.4s, v0.s[0]
        FMLA v24.4s, v7.4s, v0.s[2]

        # BLOCK 2
        LDR d14, [x5]                  // vb0x0123
        INS v3.d[1], x8                // a3 was loaded in block 1
        FMLA v27.4s, v7.4s, v1.s[0]
        LDR x8, [x5, 8]
        FMLA v30.4s, v7.4s, v1.s[2]
        FMLA v22.4s, v8.4s, v0.s[0]

        # BLOCK 3
        LDR d15, [x5, 16]              // vb0x4567
        INS v14.d[1], x8               // v14 was loaded in block 2
        FMLA v25.4s, v8.4s, v0.s[2]
        LDR x8, [x5, 24]
        FMLA v28.4s, v8.4s, v1.s[0]
        FMLA v31.4s, v8.4s, v1.s[2]

        # BLOCK 4
        LDR d16, [x5, 32]              // vb0x89AB
        INS v15.d[1], x8
        FMLA v20.4s, v9.4s, v0.s[1]
        LDR x8, [x5, 40]
        FMLA v23.4s, v9.4s, v0.s[3]
        FMLA v26.4s, v9.4s, v1.s[1]

        # BLOCK 5
        LDR d17, [x5, 48]             // vb1x0123
        INS v16.d[1], x8
        FMLA v29.4s, v9.4s, v1.s[3]
        LDR x8, [x5, 56]
        FMLA v21.4s, v10.4s, v0.s[1]
        FMLA v24.4s, v10.4s, v0.s[3]

        # BLOCK 6
        LDR d18, [x5, 64]             // vb1x4567
        INS v17.d[1], x8
        FMLA v27.4s, v10.4s, v1.s[1]
        LDR x8, [x5, 72]
        FMLA v30.4s, v10.4s, v1.s[3]
        FMLA v22.4s, v11.4s, v0.s[1]

        # BLOCK 7
        LDR d19, [x5, 80]             // vb1x89AB
        INS v18.d[1], x8
        FMLA v25.4s, v11.4s, v0.s[3]
        LDR x8, [x5, 88]
        FMLA v28.4s, v11.4s, v1.s[1]
        FMLA v31.4s, v11.4s, v1.s[3]

        # Second group of 24 fma.  No loads in the epilogue:
        # A comes from v2/v3 and B from v14-v19, all loaded in the first group.

        # BLOCK 0
        INS v19.d[1], x8
        FMLA v20.4s, v14.4s, v2.s[0]
        FMLA v23.4s, v14.4s, v2.s[2]
        FMLA v26.4s, v14.4s, v3.s[0]

        # BLOCK 1
        FMLA v29.4s, v14.4s, v3.s[2]
        FMLA v21.4s, v15.4s, v2.s[0]
        FMLA v24.4s, v15.4s, v2.s[2]

        # BLOCK 2
        FMLA v27.4s, v15.4s, v3.s[0]
        FMLA v30.4s, v15.4s, v3.s[2]
        FMLA v22.4s, v16.4s, v2.s[0]

        # BLOCK 3
        FMLA v25.4s, v16.4s, v2.s[2]
        FMLA v28.4s, v16.4s, v3.s[0]
        FMLA v31.4s, v16.4s, v3.s[2]

        # BLOCK 4
        FMLA v20.4s, v17.4s, v2.s[1]
        FMLA v23.4s, v17.4s, v2.s[3]
        FMLA v26.4s, v17.4s, v3.s[1]

        # BLOCK 5
        FMLA v29.4s, v17.4s, v3.s[3]
        FMLA v21.4s, v18.4s, v2.s[1]
        FMLA v24.4s, v18.4s, v2.s[3]

        # BLOCK 6
        FMLA v27.4s, v18.4s, v3.s[1]
        FMLA v30.4s, v18.4s, v3.s[3]
        FMLA v22.4s, v19.4s, v2.s[1]
        TST x0, 15                     // low 4 bits of k = leftover bytes; consumed by B.NE 5f

        # BLOCK 7
        FMLA v25.4s, v19.4s, v2.s[3]
        FMLA v28.4s, v19.4s, v3.s[1]
        ADD x5, x5, 96                 // w += 96 bytes read by the epilogue (ADD keeps flags)
        FMLA v31.4s, v19.4s, v3.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes) or less
        B.NE 5f

4:
        # Clamp
        FMIN v20.4s, v20.4s, v4.4s
        SUBS x1, x1, 12                // nc -= 12; flags are consumed by B.LO 7f and B.HI 0b
        FMIN v21.4s, v21.4s, v4.4s
        FMIN v22.4s, v22.4s, v4.4s
        FMIN v23.4s, v23.4s, v4.4s
        FMIN v24.4s, v24.4s, v4.4s
        FMIN v25.4s, v25.4s, v4.4s
        FMIN v26.4s, v26.4s, v4.4s
        FMIN v27.4s, v27.4s, v4.4s
        FMIN v28.4s, v28.4s, v4.4s
        FMIN v29.4s, v29.4s, v4.4s
        FMIN v30.4s, v30.4s, v4.4s
        FMIN v31.4s, v31.4s, v4.4s
        FMAX v20.4s, v20.4s, v5.4s
        FMAX v21.4s, v21.4s, v5.4s
        FMAX v22.4s, v22.4s, v5.4s
        FMAX v23.4s, v23.4s, v5.4s
        FMAX v24.4s, v24.4s, v5.4s
        FMAX v25.4s, v25.4s, v5.4s
        FMAX v26.4s, v26.4s, v5.4s
        FMAX v27.4s, v27.4s, v5.4s
        FMAX v28.4s, v28.4s, v5.4s
        FMAX v29.4s, v29.4s, v5.4s
        FMAX v30.4s, v30.4s, v5.4s
        FMAX v31.4s, v31.4s, v5.4s

        # Store full 4 x 12
        # Fewer than 12 columns remained before the subtract: partial store at 7
        B.LO 7f

        $if INC:
          ST1 {v29.16b, v30.16b, v31.16b},  [x7], x14
          SUB  x3,  x3, x2 // a0 -= kc
          ST1 {v26.16b, v27.16b, v28.16b}, [x10], x14
          SUB x11, x11, x2 // a1 -= kc
          ST1 {v23.16b, v24.16b, v25.16b},  [x9], x14
          SUB x12, x12, x2 // a2 -= kc
          ST1 {v20.16b, v21.16b, v22.16b},  [x6], x14
          SUB  x4,  x4, x2 // a3 -= kc
        $else:
          ST1 {v20.16b, v21.16b, v22.16b},  [x6], x14
          SUB  x3,  x3, x2 // a0 -= kc
          ST1 {v23.16b, v24.16b, v25.16b},  [x9], x14
          SUB x11, x11, x2 // a1 -= kc
          ST1 {v26.16b, v27.16b, v28.16b}, [x10], x14
          SUB x12, x12, x2 // a2 -= kc
          ST1 {v29.16b, v30.16b, v31.16b},  [x7], x14
          SUB  x4,  x4, x2 // a3 -= kc

        # More columns remain after this tile: start the next block of 12
        B.HI 0b

        # Restore d8-d11,d14,d15 from stack
        LDP d14, d15, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP  d8,  d9, [sp], 48
        RET

5:
        # Is there a remainder?- 2 floats of A (8 bytes)
        TBZ x0, 3, 6f                  // bit 3 of k clear: skip to the 1 float remainder

        # Remainder - 2 floats of A (8 bytes)
        # Read first block of 4 A.
        LDR d0,  [x3], 8  // a0
        LD1 {v6.16b, v7.16b, v8.16b}, [x5], 48
        LDR d1, [x11], 8  // a1
        LDR d2, [x12], 8  // a2
        LDR d3,  [x4], 8  // a3
        LD1 {v9.16b, v10.16b, v11.16b}, [x5], 48

        # First block of 3 B
        FMLA v20.4s, v6.4s, v0.s[0]
        FMLA v23.4s, v6.4s, v1.s[0]
        FMLA v26.4s, v6.4s, v2.s[0]
        FMLA v29.4s, v6.4s, v3.s[0]
        FMLA v21.4s, v7.4s, v0.s[0]
        FMLA v24.4s, v7.4s, v1.s[0]
        FMLA v27.4s, v7.4s, v2.s[0]
        FMLA v30.4s, v7.4s, v3.s[0]
        FMLA v22.4s, v8.4s, v0.s[0]
        FMLA v25.4s, v8.4s, v1.s[0]
        FMLA v28.4s, v8.4s, v2.s[0]
        FMLA v31.4s, v8.4s, v3.s[0]

        # Second block of 3 B
        FMLA v20.4s, v9.4s, v0.s[1]
        FMLA v23.4s, v9.4s, v1.s[1]
        FMLA v26.4s, v9.4s, v2.s[1]
        FMLA v29.4s, v9.4s, v3.s[1]
        FMLA v21.4s, v10.4s, v0.s[1]
        FMLA v24.4s, v10.4s, v1.s[1]
        FMLA v27.4s, v10.4s, v2.s[1]
        FMLA v30.4s, v10.4s, v3.s[1]
        FMLA v22.4s, v11.4s, v0.s[1]
        FMLA v25.4s, v11.4s, v1.s[1]
        FMLA v28.4s, v11.4s, v2.s[1]
        FMLA v31.4s, v11.4s, v3.s[1]

        TBZ x0, 2, 4b                  // bit 2 of k clear: nothing left, go clamp and store
6:
        # Remainder - 1 float of A (4 bytes)
        LDR s0,  [x3], 4  // a0
        LD1 {v6.16b, v7.16b, v8.16b}, [x5], 48
        LDR s1, [x11], 4  // a1
        LDR s2, [x12], 4  // a2
        LDR s3,  [x4], 4  // a3

        FMLA v20.4s, v6.4s, v0.s[0]
        FMLA v23.4s, v6.4s, v1.s[0]
        FMLA v26.4s, v6.4s, v2.s[0]
        FMLA v29.4s, v6.4s, v3.s[0]
        FMLA v21.4s, v7.4s, v0.s[0]
        FMLA v24.4s, v7.4s, v1.s[0]
        FMLA v27.4s, v7.4s, v2.s[0]
        FMLA v30.4s, v7.4s, v3.s[0]
        FMLA v22.4s, v8.4s, v0.s[0]
        FMLA v25.4s, v8.4s, v1.s[0]
        FMLA v28.4s, v8.4s, v2.s[0]
        FMLA v31.4s, v8.4s, v3.s[0]
        B 4b

7:
        ADD x1, x1, 12                 // undo the subtract at label 4: x1 = columns left (< 12)
        # Store odd channels
        # Bits 3, 2, 1, 0 of x1 select stores of 8, 4, 2 and 1 floats per row.
        # After each store the not yet written lanes move down into the
        # first register of the row (v20, v23, v26, v29).
        TBZ x1, 3, 8f
        $if INC:
          STP q29, q30,  [x7], 32
          MOV v29.16b, v31.16b
          STP q26, q27, [x10], 32
          MOV v26.16b, v28.16b
          STP q23, q24,  [x9], 32
          MOV v23.16b, v25.16b
          STP q20, q21,  [x6], 32
          MOV v20.16b, v22.16b
        $else:
          STP q20, q21,  [x6], 32
          MOV v20.16b, v22.16b
          STP q23, q24,  [x9], 32
          MOV v23.16b, v25.16b
          STP q26, q27, [x10], 32
          MOV v26.16b, v28.16b
          STP q29, q30,  [x7], 32
          MOV v29.16b, v31.16b

8:
        TBZ x1, 2, 9f
        $if INC:
          STR q29,  [x7], 16
          MOV v29.16b, v30.16b
          STR q26, [x10], 16
          MOV v26.16b, v27.16b
          STR q23,  [x9], 16
          MOV v23.16b, v24.16b
          STR q20,  [x6], 16
          MOV v20.16b, v21.16b
        $else:
          STR q20,  [x6], 16
          MOV v20.16b, v21.16b
          STR q23,  [x9], 16
          MOV v23.16b, v24.16b
          STR q26, [x10], 16
          MOV v26.16b, v27.16b
          STR q29,  [x7], 16
          MOV v29.16b, v30.16b

9:
        TBZ x1, 1, 10f
        $if INC:
          STR d29,  [x7], 8
          DUP d29, v29.d[1]
          STR d26, [x10], 8
          DUP d26, v26.d[1]
          STR d23,  [x9], 8
          DUP d23, v23.d[1]
          STR d20,  [x6], 8
          DUP d20, v20.d[1]
        $else:
          STR d20,  [x6], 8
          DUP d20, v20.d[1]
          STR d23,  [x9], 8
          DUP d23, v23.d[1]
          STR d26, [x10], 8
          DUP d26, v26.d[1]
          STR d29,  [x7], 8
          DUP d29, v29.d[1]

10:
        TBZ x1, 0, 11f
        $if INC:
          STR s29,  [x7]
          STR s26, [x10]
          STR s23,  [x9]
          STR s20,  [x6]
        $else:
          STR s20,  [x6]
          STR s23,  [x9]
          STR s26, [x10]
          STR s29,  [x7]
11:
        # Restore d8-d11,d14,d15 from stack
        LDP d14, d15, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP  d8,  d9, [sp], 48
        RET

END_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_ukernel_4x12__aarch64_neonfma_cortex_a53

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif
