• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1// Copyright 2019 Google LLC
2//
3// This source code is licensed under the BSD-style license found in the
4// LICENSE file in the root directory of this source tree.
5
6#include <xnnpack/assembly.h>
7
8# void xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_4x12__aarch64_neonfma_cortex_a53(
9#     size_t mr,                x0
10#     size_t nc,                x1
11#     size_t kc,                x2 / x0
12#     const uint8_t*restrict a, x3
13#     size_t a_stride,          x4
14#     const void*restrict w,    x5
15#     uint8_t*restrict c,       x6
16#     size_t cm_stride,         x7
17#     size_t cn_stride,         [sp] -> x14
18$if INC:
19  #     const float*restrict acc,  [sp + 8] -> x15
20  #     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 16] -> x8
21$else:
22  #     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> x8
23
24# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
25
26# A pointers
27# x3  a0
28# x11 a1
29# x12 a2
30# x4  a3 / a_stride
31
32# C pointers
33# x6  c0
34# x9  c1
35# x10 c2
36# x7  c3 / cm_stride
37
38# x8 temporary GPR that stages the upper 64 bits of a vector register (LDR x8, then INS vN.d[1], x8)
39
40# Vector register usage and GPR shadows
41# a0  v0
42# a1  v0[1]
43# a2  v1
44# a3  v1[1]
45# a0  v2
46# a1  v2[1]
47# a2  v3
48# a3  v3[1]
49# B   v6  v7  v8
50# B   v9 v10 v11
51# B  v14 v15 v16
52# B  v17 v18 v19
53# C  v20 v21 v22
54# C  v23 v24 v25
55# C  v26 v27 v28
56# C  v29 v30 v31
57# Clamp v4 v5
58# v12 and v13 are unused.
59
60BEGIN_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_4x12__aarch64_neonfma_cortex_a53
61
        # Overview: each pass of the outer loop (label 0) produces one block of
        # up to 12 output columns for 4 rows.  Accumulators v20-v31 are seeded
        # from the bias at w (or from acc in the INC variant), updated with FMLA
        # over kc bytes of every A row, clamped with v4/v5 and stored through
        # c0-c3.  Labels 4 and 5 handle the 8-byte and 4-byte tails of kc; label 6
        # onwards stores a partial block of fewer than 12 columns.
62        $if INC:
63          # Load cn_stride, acc
64          LDP x14, x15, [sp]
65          # Load params pointer
66          LDR x8, [sp, 16]
67        $else:
68          # Load cn_stride, params pointer
69          LDP x14, x8, [sp]
70
71        # Load min/max values: LD2R broadcasts the two consecutive 32-bit values at
        # params into v4 (used with FMAX below) and v5 (used with FMIN below).
72        LD2R {v4.4s, v5.4s}, [x8]
73
74        # Save d8-d11,d14,d15 on stack (48 bytes).  The stack arguments above were
        # read before sp moves, so their offsets are relative to the entry sp.
75        STP  d8,  d9, [sp, -48]!
76        STP d10, d11, [sp, 16]
77        STP d14, d15, [sp, 32]
78
79        # Clamp A and C pointers: a row index at or beyond mr reuses the previous
        # row's pointers, so out-of-range rows alias the last valid row.
80        CMP x0, 2                // if mr < 2
81        ADD x11, x3, x4          // a1 = a0 + a_stride
82        ADD x9, x6, x7           // c1 = c0 + cm_stride
83        CSEL x11, x3, x11, LO    //   a1 = a0
84        CSEL x9, x6, x9, LO      //   c1 = c0
85        ADD x12, x11, x4         // a2 = a1 + a_stride
86        ADD x10, x9, x7          // c2 = c1 + cm_stride
87                                 // if mr <= 2
88        CSEL x12, x11, x12, LS   //   a2 = a1
89        CSEL x10, x9, x10, LS    //   c2 = c1
90        CMP x0, 4                // if mr < 4
91        ADD x4, x12, x4          // a3 = a2 + a_stride
92        ADD x7, x10, x7          // c3 = c2 + cm_stride
93        CSEL x4, x12, x4, LO     //   a3 = a2
94        CSEL x7, x10, x7, LO     //   c3 = c2
95
        # Outer loop over nc: one pass per block of 12 output columns.
960:
97        $if INC:
98          # Load initial accumulators
99          LD1 {v20.16b, v21.16b, v22.16b}, [x15], 48
100          LD1 {v23.16b, v24.16b, v25.16b}, [x15], 48
101          LD1 {v26.16b, v27.16b, v28.16b}, [x15], 48
102          LD1 {v29.16b, v30.16b, v31.16b}, [x15], 48
103          PRFM PLDL1KEEP,  [x3,  0]  // Prefetch A
104          PRFM PLDL1KEEP,  [x3, 64]
105          PRFM PLDL1KEEP, [x11,  0]
106          PRFM PLDL1KEEP, [x11, 64]
107          PRFM PLDL1KEEP, [x12,  0]
108          PRFM PLDL1KEEP, [x12, 64]
109          PRFM PLDL1KEEP,  [x4,  0]
110          PRFM PLDL1KEEP,  [x4, 64]
111          PRFM PLDL1KEEP, [x5,   0]  // Prefetch B
112          PRFM PLDL1KEEP, [x5,  64]
113          PRFM PLDL1KEEP, [x5, 128]
114          PRFM PLDL1KEEP, [x5, 192]
115          PRFM PLDL1KEEP, [x5, 256]
116          PRFM PLDL1KEEP, [x5, 320]
117        $else:
118          # Load initial bias from w into accumulators
119          LD1 {v20.16b, v21.16b, v22.16b}, [x5], 48
120          MOV v23.16b, v20.16b
121          PRFM PLDL1KEEP,  [x3,  0]    // Prefetch A
122          PRFM PLDL1KEEP,  [x3, 64]
123          MOV v24.16b, v21.16b
124          PRFM PLDL1KEEP,  [x11,  0]
125          PRFM PLDL1KEEP,  [x11, 64]
126          MOV v25.16b, v22.16b
127          PRFM PLDL1KEEP, [x12,  0]
128          PRFM PLDL1KEEP, [x12, 64]
129          MOV v26.16b, v20.16b
130          PRFM PLDL1KEEP, [x4,  0]
131          PRFM PLDL1KEEP, [x4, 64]
132          MOV v27.16b, v21.16b
133          PRFM PLDL1KEEP, [x5,   0]  // Prefetch B
134          PRFM PLDL1KEEP, [x5,  64]
135          MOV v28.16b, v22.16b
136          PRFM PLDL1KEEP, [x5, 128]
137          PRFM PLDL1KEEP, [x5, 192]
138          MOV v29.16b, v20.16b
139          PRFM PLDL1KEEP, [x5, 256]
140          MOV v30.16b, v21.16b
141          PRFM PLDL1KEEP, [x5, 320]
142          MOV v31.16b, v22.16b
143
144        # Is there at least 4 floats (16 bytes)?
145        SUBS x0, x2, 16  // k = kc - 16 (x0 no longer needs to hold mr)
146        B.LO 4f  // kc < 16 bytes: only the remainder paths run
147
148        SUBS x0, x0, 16  // k -= 16; flags are tested by B.LO 2f after the prologue loads
149
150        # Prologue - loads for first group of 24 FMA
151
152        # Read first block of 4 A.
153        LDR d0,  [x3], 8              // a0
154        LDR d1, [x12], 8              // a2
155        LD1 {v0.d}[1], [x11], 8       // a1
156        LD1 {v1.d}[1],  [x4], 8       // a3
157
158        LD1 {v6.16b, v7.16b, v8.16b}, [x5], 48
159        LD1 {v9.16b, v10.16b}, [x5], 32
160        LDR d11, [x5], 8
161        LDR x8, [x5], 8  // upper 64 bits of v11, inserted by INS in BLOCK 0
162
163        # Is there at least 4 floats (16 bytes) for main loop?
164        B.LO 2f
165
166        # Main loop - 4 floats of A (16 bytes)
1671:
168        # First group of 24 fma.  8 blocks of 4 cycles.  LDR + 3 FMA
169        # A is loaded for 2nd group into v2/v3
170        # INS is 4 blocks (16 cycles) after load
171
172        # BLOCK 0
173        LDR d2, [x3], 8                // a0
174        INS v11.d[1], x8
175        FMLA v20.4s, v6.4s, v0.s[0]
176        LDR x8, [x11], 8               // a1
177        FMLA v23.4s, v6.4s, v0.s[2]
178        FMLA v26.4s, v6.4s, v1.s[0]
179        PRFM PLDL1KEEP, [x3, 128]      // Prefetch A0
180
181        # BLOCK 1
182        LDR d3, [x12], 8               // a2
183        INS v2.d[1], x8                // a1 was loaded in block 0
184        FMLA v29.4s, v6.4s, v1.s[2]
185        LDR x8, [x4], 8                // a3
186        FMLA v21.4s, v7.4s, v0.s[0]
187        FMLA v24.4s, v7.4s, v0.s[2]
188        PRFM PLDL1KEEP, [x11, 128]      // Prefetch A1
189
190        # BLOCK 2
191        LDR d14, [x5]                  // vb0x0123
192        INS v3.d[1], x8                // a3 was loaded in block 1
193        FMLA v27.4s, v7.4s, v1.s[0]
194        LDR x8, [x5, 8]
195        FMLA v30.4s, v7.4s, v1.s[2]
196        FMLA v22.4s, v8.4s, v0.s[0]
197        PRFM PLDL1KEEP, [x12, 128]     // Prefetch A2
198
199        # BLOCK 3
200        LDR d15, [x5, 16]              // vb0x4567
201        INS v14.d[1], x8               // v14 was loaded in block 2
202        FMLA v25.4s, v8.4s, v0.s[2]
203        LDR x8, [x5, 24]
204        FMLA v28.4s, v8.4s, v1.s[0]
205        FMLA v31.4s, v8.4s, v1.s[2]
206        PRFM PLDL1KEEP, [x4, 128]      // Prefetch A3
207
208        # BLOCK 4
209        LDR d16, [x5, 32]              // vb0x89AB
210        INS v15.d[1], x8
211        FMLA v20.4s, v9.4s, v0.s[1]
212        LDR x8, [x5, 40]
213        FMLA v23.4s, v9.4s, v0.s[3]
214        FMLA v26.4s, v9.4s, v1.s[1]
215        PRFM PLDL1KEEP, [x5, 320]      // Prefetch B
216
217        # BLOCK 5
218        LDR d17, [x5, 48]              // vb1x0123
219        INS v16.d[1], x8
220        FMLA v29.4s, v9.4s, v1.s[3]
221        LDR x8, [x5, 56]
222        FMLA v21.4s, v10.4s, v0.s[1]
223        FMLA v24.4s, v10.4s, v0.s[3]
224        PRFM PLDL1KEEP, [x5, 384]      // Prefetch B
225
226        # BLOCK 6
227        LDR d18, [x5, 64]              // vb1x4567
228        INS v17.d[1], x8
229        FMLA v27.4s, v10.4s, v1.s[1]
230        LDR x8, [x5, 72]
231        FMLA v30.4s, v10.4s, v1.s[3]
232        FMLA v22.4s, v11.4s, v0.s[1]
233        PRFM PLDL1KEEP, [x5, 448]      // Prefetch B
234
235        # BLOCK 7
236        LDR d19, [x5, 80]              // vb1x89AB
237        INS v18.d[1], x8
238        FMLA v25.4s, v11.4s, v0.s[3]
239        LDR x8, [x5, 88]
240        FMLA v28.4s, v11.4s, v1.s[1]
241        FMLA v31.4s, v11.4s, v1.s[3]
242
243        # Second group of 24 fma.  8 blocks of 4 cycles.  LDR + 3 FMA
244        # A is loaded for 1st group into v0/v1
245
246        # BLOCK 0
247        LDR d0, [x3], 8                // a0
248        INS v19.d[1], x8
249        FMLA v20.4s, v14.4s, v2.s[0]
250        LDR x8, [x11], 8               // a1
251        FMLA v23.4s, v14.4s, v2.s[2]
252        FMLA v26.4s, v14.4s, v3.s[0]
253
254        # BLOCK 1
255        LDR d1, [x12], 8               // a2
256        INS v0.d[1], x8                // a1
257        FMLA v29.4s, v14.4s, v3.s[2]
258        LDR x8, [x4], 8                // a3
259        FMLA v21.4s, v15.4s, v2.s[0]
260        FMLA v24.4s, v15.4s, v2.s[2]
261
262        # BLOCK 2
263        LDR d6, [x5, 96]               // vb0x0123
264        INS v1.d[1], x8                // a3
265        FMLA v27.4s, v15.4s, v3.s[0]
266        LDR x8, [x5, 104]
267        FMLA v30.4s, v15.4s, v3.s[2]
268        FMLA v22.4s, v16.4s, v2.s[0]
269
270        # BLOCK 3
271        LDR d7, [x5, 112]              // vb0x4567
272        INS v6.d[1], x8
273        FMLA v25.4s, v16.4s, v2.s[2]
274        LDR x8, [x5, 120]
275        FMLA v28.4s, v16.4s, v3.s[0]
276        FMLA v31.4s, v16.4s, v3.s[2]
277
278        # BLOCK 4
279        LDR d8, [x5, 128]              // vb0x89AB
280        INS v7.d[1], x8
281        FMLA v20.4s, v17.4s, v2.s[1]
282        LDR x8, [x5, 136]
283        FMLA v23.4s, v17.4s, v2.s[3]
284        FMLA v26.4s, v17.4s, v3.s[1]
285
286        # BLOCK 5
287        LDR d9, [x5, 144]              // vb1x0123
288        INS v8.d[1], x8
289        FMLA v29.4s, v17.4s, v3.s[3]
290        LDR x8, [x5, 152]
291        FMLA v21.4s, v18.4s, v2.s[1]
292        FMLA v24.4s, v18.4s, v2.s[3]
293
294        # BLOCK 6
295        LDR d10, [x5, 160]             // vb1x4567
296        INS v9.d[1], x8
297        FMLA v27.4s, v18.4s, v3.s[1]
298        LDR x8, [x5, 168]
299        FMLA v30.4s, v18.4s, v3.s[3]
300        SUBS x0, x0, 16  // k -= 16; flags are tested by B.HS 1b below
301        FMLA v22.4s, v19.4s, v2.s[1]
302
303        # BLOCK 7
304        LDR d11, [x5, 176]             // vb1x89AB
305        INS v10.d[1], x8
306        FMLA v25.4s, v19.4s, v2.s[3]
307        LDR x8, [x5, 184]
308        FMLA v28.4s, v19.4s, v3.s[1]
309        ADD x5, x5, 192  // w += 192: both groups read B at fixed offsets from x5
310        FMLA v31.4s, v19.4s, v3.s[3]
311        B.HS 1b  // repeat until the k counter underflows
312
313        # Epilogue
314        # First block same as main loop (without the prefetches).  Second block has no loads.
3152:
316        # BLOCK 0
317        LDR d2, [x3], 8                // a0
318        INS v11.d[1], x8
319        FMLA v20.4s, v6.4s, v0.s[0]
320        LDR x8, [x11], 8               // a1
321        FMLA v23.4s, v6.4s, v0.s[2]
322        FMLA v26.4s, v6.4s, v1.s[0]
323
324        # BLOCK 1
325        LDR d3, [x12], 8               // a2
326        INS v2.d[1], x8                // a1 was loaded in block 0
327        FMLA v29.4s, v6.4s, v1.s[2]
328        LDR x8, [x4], 8                // a3
329        FMLA v21.4s, v7.4s, v0.s[0]
330        FMLA v24.4s, v7.4s, v0.s[2]
331
332        # BLOCK 2
333        LDR d14, [x5]                  // vb0x0123
334        INS v3.d[1], x8                // a3 was loaded in block 1
335        FMLA v27.4s, v7.4s, v1.s[0]
336        LDR x8, [x5, 8]
337        FMLA v30.4s, v7.4s, v1.s[2]
338        FMLA v22.4s, v8.4s, v0.s[0]
339
340        # BLOCK 3
341        LDR d15, [x5, 16]              // vb0x4567
342        INS v14.d[1], x8               // v14 was loaded in block 2
343        FMLA v25.4s, v8.4s, v0.s[2]
344        LDR x8, [x5, 24]
345        FMLA v28.4s, v8.4s, v1.s[0]
346        FMLA v31.4s, v8.4s, v1.s[2]
347
348        # BLOCK 4
349        LDR d16, [x5, 32]              // vb0x89AB
350        INS v15.d[1], x8
351        FMLA v20.4s, v9.4s, v0.s[1]
352        LDR x8, [x5, 40]
353        FMLA v23.4s, v9.4s, v0.s[3]
354        FMLA v26.4s, v9.4s, v1.s[1]
355
356        # BLOCK 5
357        LDR d17, [x5, 48]             // vb1x0123
358        INS v16.d[1], x8
359        FMLA v29.4s, v9.4s, v1.s[3]
360        LDR x8, [x5, 56]
361        FMLA v21.4s, v10.4s, v0.s[1]
362        FMLA v24.4s, v10.4s, v0.s[3]
363
364        # BLOCK 6
365        LDR d18, [x5, 64]             // vb1x4567
366        INS v17.d[1], x8
367        FMLA v27.4s, v10.4s, v1.s[1]
368        LDR x8, [x5, 72]
369        FMLA v30.4s, v10.4s, v1.s[3]
370        FMLA v22.4s, v11.4s, v0.s[1]
371
372        # BLOCK 7
373        LDR d19, [x5, 80]             // vb1x89AB
374        INS v18.d[1], x8
375        FMLA v25.4s, v11.4s, v0.s[3]
376        LDR x8, [x5, 88]
377        FMLA v28.4s, v11.4s, v1.s[1]
378        FMLA v31.4s, v11.4s, v1.s[3]
379
380        # Second group of 24 fma.  No loads: the first group already loaded B into
381        # v14-v19 and A into v2/v3; only the INS of v19's upper half remains.
382
383        # BLOCK 0
384        INS v19.d[1], x8
385        FMLA v20.4s, v14.4s, v2.s[0]
386        FMLA v23.4s, v14.4s, v2.s[2]
387        FMLA v26.4s, v14.4s, v3.s[0]
388
389        # BLOCK 1
390        FMLA v29.4s, v14.4s, v3.s[2]
391        FMLA v21.4s, v15.4s, v2.s[0]
392        FMLA v24.4s, v15.4s, v2.s[2]
393
394        # BLOCK 2
395        FMLA v27.4s, v15.4s, v3.s[0]
396        FMLA v30.4s, v15.4s, v3.s[2]
397        FMLA v22.4s, v16.4s, v2.s[0]
398
399        # BLOCK 3
400        FMLA v25.4s, v16.4s, v2.s[2]
401        FMLA v28.4s, v16.4s, v3.s[0]
402        FMLA v31.4s, v16.4s, v3.s[2]
403
404        # BLOCK 4
405        FMLA v20.4s, v17.4s, v2.s[1]
406        FMLA v23.4s, v17.4s, v2.s[3]
407        FMLA v26.4s, v17.4s, v3.s[1]
408
409        # BLOCK 5
410        FMLA v29.4s, v17.4s, v3.s[3]
411        FMLA v21.4s, v18.4s, v2.s[1]
412        FMLA v24.4s, v18.4s, v2.s[3]
413
414        # BLOCK 6
415        FMLA v27.4s, v18.4s, v3.s[1]
416        FMLA v30.4s, v18.4s, v3.s[3]
417        FMLA v22.4s, v19.4s, v2.s[1]
418        TST x0, 15  // low 4 bits of k equal kc % 16 (leftover bytes); tested by B.NE 4f
419
420        # BLOCK 7
421        FMLA v25.4s, v19.4s, v2.s[3]
422        FMLA v28.4s, v19.4s, v3.s[1]
423        ADD x5, x5, 96  // w += 96: B read by the first group's loads above
424        FMLA v31.4s, v19.4s, v3.s[3]
425
426        # Is there a remainder?- 2 floats of A (8 bytes) or less
427        B.NE 4f
428
4293:
430        # Clamp: FMAX against v4, then FMIN against v5
431        FMAX v20.4s, v20.4s, v4.4s
432        SUBS x1, x1, 12  // nc -= 12; flags are tested by B.LO 6f and B.HI 0b below
433        FMAX v21.4s, v21.4s, v4.4s
434        FMAX v22.4s, v22.4s, v4.4s
435        FMAX v23.4s, v23.4s, v4.4s
436        FMAX v24.4s, v24.4s, v4.4s
437        FMAX v25.4s, v25.4s, v4.4s
438        FMAX v26.4s, v26.4s, v4.4s
439        FMAX v27.4s, v27.4s, v4.4s
440        FMAX v28.4s, v28.4s, v4.4s
441        FMAX v29.4s, v29.4s, v4.4s
442        FMAX v30.4s, v30.4s, v4.4s
443        FMAX v31.4s, v31.4s, v4.4s
444        FMIN v20.4s, v20.4s, v5.4s
445        FMIN v21.4s, v21.4s, v5.4s
446        FMIN v22.4s, v22.4s, v5.4s
447        FMIN v23.4s, v23.4s, v5.4s
448        FMIN v24.4s, v24.4s, v5.4s
449        FMIN v25.4s, v25.4s, v5.4s
450        FMIN v26.4s, v26.4s, v5.4s
451        FMIN v27.4s, v27.4s, v5.4s
452        FMIN v28.4s, v28.4s, v5.4s
453        FMIN v29.4s, v29.4s, v5.4s
454        FMIN v30.4s, v30.4s, v5.4s
455        FMIN v31.4s, v31.4s, v5.4s
456
457        # Store full 4 x 12
458        B.LO 6f  // fewer than 12 columns left: partial store at label 6
459
        # Each ST1 advances its C pointer by cn_stride (x14); each SUB rewinds an
        # A pointer by kc for the next column block.
        # NOTE(review): the INC variant stores row 3 first and row 0 last, presumably
        # so that row 0 is written last when clamped C pointers alias - confirm.
460        $if INC:
461          ST1 {v29.16b, v30.16b, v31.16b},  [x7], x14
462          SUB  x3,  x3, x2 // a0 -= kc
463          ST1 {v26.16b, v27.16b, v28.16b}, [x10], x14
464          SUB x11, x11, x2 // a1 -= kc
465          ST1 {v23.16b, v24.16b, v25.16b},  [x9], x14
466          SUB x12, x12, x2 // a2 -= kc
467          ST1 {v20.16b, v21.16b, v22.16b},  [x6], x14
468          SUB  x4,  x4, x2 // a3 -= kc
469        $else:
470          ST1 {v20.16b, v21.16b, v22.16b},  [x6], x14
471          SUB  x3,  x3, x2 // a0 -= kc
472          ST1 {v23.16b, v24.16b, v25.16b},  [x9], x14
473          SUB x11, x11, x2 // a1 -= kc
474          ST1 {v26.16b, v27.16b, v28.16b}, [x10], x14
475          SUB x12, x12, x2 // a2 -= kc
476          ST1 {v29.16b, v30.16b, v31.16b},  [x7], x14
477          SUB  x4,  x4, x2 // a3 -= kc
478
479        B.HI 0b  // columns remain (nc was > 12 before the subtract): next block
480
481        # Restore d8-d11,d14,d15 from stack
482        LDP d14, d15, [sp, 32]
483        LDP d10, d11, [sp, 16]
484        LDP  d8,  d9, [sp], 48
485        RET
486
4874:
488        # Is there a remainder?- 2 floats of A (8 bytes)
489        TBZ x0, 3, 5f  // bit 3 of k clear: no 8-byte step, go to the 4-byte step
490
491        # Remainder - 2 floats of A (8 bytes)
492        # Read first block of 4 A.
493        LDR d0,  [x3], 8  // a0
494        LD1 {v6.16b, v7.16b, v8.16b}, [x5], 48
495        LDR d1, [x11], 8  // a1
496        LDR d2, [x12], 8  // a2
497        LDR d3,  [x4], 8  // a3
498        LD1 {v9.16b, v10.16b, v11.16b}, [x5], 48
499
500        # First block of 3 B
501        FMLA v20.4s, v6.4s, v0.s[0]
502        FMLA v23.4s, v6.4s, v1.s[0]
503        FMLA v26.4s, v6.4s, v2.s[0]
504        FMLA v29.4s, v6.4s, v3.s[0]
505        FMLA v21.4s, v7.4s, v0.s[0]
506        FMLA v24.4s, v7.4s, v1.s[0]
507        FMLA v27.4s, v7.4s, v2.s[0]
508        FMLA v30.4s, v7.4s, v3.s[0]
509        FMLA v22.4s, v8.4s, v0.s[0]
510        FMLA v25.4s, v8.4s, v1.s[0]
511        FMLA v28.4s, v8.4s, v2.s[0]
512        FMLA v31.4s, v8.4s, v3.s[0]
513
514        # Second block of 3 B
515        FMLA v20.4s, v9.4s, v0.s[1]
516        FMLA v23.4s, v9.4s, v1.s[1]
517        FMLA v26.4s, v9.4s, v2.s[1]
518        FMLA v29.4s, v9.4s, v3.s[1]
519        FMLA v21.4s, v10.4s, v0.s[1]
520        FMLA v24.4s, v10.4s, v1.s[1]
521        FMLA v27.4s, v10.4s, v2.s[1]
522        FMLA v30.4s, v10.4s, v3.s[1]
523        FMLA v22.4s, v11.4s, v0.s[1]
524        FMLA v25.4s, v11.4s, v1.s[1]
525        FMLA v28.4s, v11.4s, v2.s[1]
526        FMLA v31.4s, v11.4s, v3.s[1]
527
528        TBZ x0, 2, 3b  // bit 2 of k clear: no 4-byte step left, go clamp
5295:
530        # Remainder - 1 float of A (4 bytes)
531        LDR s0,  [x3], 4  // a0
532        LD1 {v6.16b, v7.16b, v8.16b}, [x5], 48
533        LDR s1, [x11], 4  // a1
534        LDR s2, [x12], 4  // a2
535        LDR s3,  [x4], 4  // a3
536
537        FMLA v20.4s, v6.4s, v0.s[0]
538        FMLA v23.4s, v6.4s, v1.s[0]
539        FMLA v26.4s, v6.4s, v2.s[0]
540        FMLA v29.4s, v6.4s, v3.s[0]
541        FMLA v21.4s, v7.4s, v0.s[0]
542        FMLA v24.4s, v7.4s, v1.s[0]
543        FMLA v27.4s, v7.4s, v2.s[0]
544        FMLA v30.4s, v7.4s, v3.s[0]
545        FMLA v22.4s, v8.4s, v0.s[0]
546        FMLA v25.4s, v8.4s, v1.s[0]
547        FMLA v28.4s, v8.4s, v2.s[0]
548        FMLA v31.4s, v8.4s, v3.s[0]
549        B 3b
550
5516:
552        ADD x1, x1, 12  // undo the subtract: x1 = columns left (fewer than 12)
553        # Store odd channels: bits 3..0 of x1 select the 8, 4, 2 and 1 column stores.
        # After each store the not-yet-stored data is moved down into the register
        # (or lane) that the next, narrower store reads.
554        TBZ x1, 3, 7f  // bit 3 clear: skip the 8-column store
555        $if INC:
556          STP q29, q30,  [x7], 32
557          MOV v29.16b, v31.16b
558          STP q26, q27, [x10], 32
559          MOV v26.16b, v28.16b
560          STP q23, q24,  [x9], 32
561          MOV v23.16b, v25.16b
562          STP q20, q21,  [x6], 32
563          MOV v20.16b, v22.16b
564        $else:
565          STP q20, q21,  [x6], 32
566          MOV v20.16b, v22.16b
567          STP q23, q24,  [x9], 32
568          MOV v23.16b, v25.16b
569          STP q26, q27, [x10], 32
570          MOV v26.16b, v28.16b
571          STP q29, q30,  [x7], 32
572          MOV v29.16b, v31.16b
573
5747:
575        TBZ x1, 2, 8f  // bit 2 clear: skip the 4-column store
576        $if INC:
577          STR q29,  [x7], 16
578          MOV v29.16b, v30.16b
579          STR q26, [x10], 16
580          MOV v26.16b, v27.16b
581          STR q23,  [x9], 16
582          MOV v23.16b, v24.16b
583          STR q20,  [x6], 16
584          MOV v20.16b, v21.16b
585        $else:
586          STR q20,  [x6], 16
587          MOV v20.16b, v21.16b
588          STR q23,  [x9], 16
589          MOV v23.16b, v24.16b
590          STR q26, [x10], 16
591          MOV v26.16b, v27.16b
592          STR q29,  [x7], 16
593          MOV v29.16b, v30.16b
594
5958:
596        TBZ x1, 1, 9f  // bit 1 clear: skip the 2-column store
597        $if INC:
598          STR d29,  [x7], 8
599          DUP d29, v29.d[1]
600          STR d26, [x10], 8
601          DUP d26, v26.d[1]
602          STR d23,  [x9], 8
603          DUP d23, v23.d[1]
604          STR d20,  [x6], 8
605          DUP d20, v20.d[1]
606        $else:
607          STR d20,  [x6], 8
608          DUP d20, v20.d[1]
609          STR d23,  [x9], 8
610          DUP d23, v23.d[1]
611          STR d26, [x10], 8
612          DUP d26, v26.d[1]
613          STR d29,  [x7], 8
614          DUP d29, v29.d[1]
615
6169:
617        TBZ x1, 0, 10f  // bit 0 clear: skip the 1-column store
618        $if INC:
619          STR s29,  [x7]
620          STR s26, [x10]
621          STR s23,  [x9]
622          STR s20,  [x6]
623        $else:
624          STR s20,  [x6]
625          STR s23,  [x9]
626          STR s26, [x10]
627          STR s29,  [x7]
62810:
629        # Restore d8-d11,d14,d15 from stack
630        LDP d14, d15, [sp, 32]
631        LDP d10, d11, [sp, 16]
632        LDP  d8,  d9, [sp], 48
633        RET
634
635END_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_4x12__aarch64_neonfma_cortex_a53
636
// On ELF targets, emit an empty .note.GNU-stack section; by GNU toolchain
// convention this marks the object as not requiring an executable stack.
637#ifdef __ELF__
638.section ".note.GNU-stack","",%progbits
639#endif
640