// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/6x8-aarch64-neonfma-cortex-a53.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a53(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x14
#     const union xnn_f32_output_params params[restrict static 1])  [sp + 8] -> x8

// Computes up to 6 rows x 8 columns of C per outer iteration:
//   C[m][0..7] = clamp(bias[0..7] + sum over k of A[m][k] * B[k][0..7])
// w is consumed sequentially: 8 bias floats, then 8 B floats per k step.
// kc is a byte count; the remainder paths below handle 8-byte and 4-byte
// leftovers, so kc is expected to be a non-zero multiple of 4 bytes.
// x0 carries mr only during pointer setup and is then reused as the k counter.

# d8-d15 need to be preserved if used.
# x19-30 need to be preserved if used.
// Only d12-d15 of the callee-saved set are used; they are spilled to 32 bytes
// of stack and restored before each RET.

# A pointers
#  x3 a0
#  x9 a1
# x10 a2
# x11 a3
# x12 a4
#  x4 a5

# C pointers
#  x6 c0
# x16 c1
# x17 c2
# x15 c3
# x13 c4
#  x7 c5

// x18 is the AAPCS64 platform register and is reserved on some operating
// systems, so it is left untouched; c3 lives in x15 instead.

# x8 temporary vector shadow register

# Vector register usage
# A0  v0     v3
# A1  v0[1]  v3[1]
# A2  v1     v4
# A3  v1[1]  v4[1]
# A4  v2     v5
# A5  v2[1]  v5[1]
# B   v12 v13 v14 v15 second set of B
# B   v16 v17 v18 v19 first set
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# C   v30 v31
# Clamp v6 v7
# unused A   v8 v9 v10 v11

BEGIN_FUNCTION xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a53

        # Clamp A and C pointers
        // Rows at index >= mr alias the previous row, so the kernel can always
        // process 6 rows without reading or writing outside the caller's data.
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x15, x17, x7         // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x15, x17, x15, LO   //   c3 = c2

        ADD x12, x11, x4         // a4 = a3 + a_stride
        ADD x13, x15, x7         // c4 = c3 + cm_stride
                                 // if mr <= 4
        CSEL x12, x11, x12, LS   //   a4 = a3
        CSEL x13, x15, x13, LS   //   c4 = c3

        # Load params pointer
        // Stack arguments are read before sp is moved by the d12-d15 spill.
        LDR x8, [sp, 8]

        CMP x0, 6                // if mr < 6
        ADD x4, x12, x4          // a5 = a4 + a_stride
        ADD x7, x13, x7          // c5 = c4 + cm_stride
        CSEL x4, x12, x4, LO     //   a5 = a4
        CSEL x7, x13, x7, LO     //   c5 = c4

        # Load clamping_params values
        // v6 = first float of params in all lanes (upper bound, used by FMIN)
        // v7 = second float of params in all lanes (lower bound, used by FMAX)
        LD2R {v6.4s, v7.4s}, [x8]

        # Load cn_stride
        LDR x14, [sp]

        // Save d12-d15 on stack
        STP d12, d13, [sp, -32]!
        STP d14, d15, [sp, 16]

        // Outer loop: one pass per 8-column tile of C.
0:
        # Load initial bias from w into accumulators
        LDP q20, q21, [x5], 32
        MOV v22.16b, v20.16b
        PRFM PLDL1KEEP,  [x3,  0]    // Prefetch A
        PRFM PLDL1KEEP,  [x3, 64]
        MOV v23.16b, v21.16b
        PRFM PLDL1KEEP,  [x9,  0]
        PRFM PLDL1KEEP,  [x9, 64]
        MOV v24.16b, v20.16b
        PRFM PLDL1KEEP, [x10,  0]
        PRFM PLDL1KEEP, [x10, 64]
        MOV v25.16b, v21.16b
        PRFM PLDL1KEEP, [x11,  0]
        PRFM PLDL1KEEP, [x11, 64]
        MOV v26.16b, v20.16b
        PRFM PLDL1KEEP, [x12,  0]
        PRFM PLDL1KEEP, [x12, 64]
        MOV v27.16b, v21.16b
        PRFM PLDL1KEEP,  [x4,  0]
        PRFM PLDL1KEEP,  [x4, 64]
        MOV v28.16b, v20.16b
        PRFM PLDL1KEEP, [x5,   0]  // Prefetch B
        MOV v29.16b, v21.16b
        PRFM PLDL1KEEP, [x5,  64]
        MOV v30.16b, v20.16b
        PRFM PLDL1KEEP, [x5, 128]
        MOV v31.16b, v21.16b
        PRFM PLDL1KEEP, [x5, 192]

        # Is there at least 4 floats (16 bytes) for prologue + epilogue?
        SUBS x0, x2, 16  // k = kc - 16
        B.LO 5f

        # Prologue - First group loads, no FMA
        // Each v0-v2 register packs two rows: low 64 bits hold the even row,
        // high 64 bits hold the odd row (two consecutive k values each).
        LDR  d0, [x3], 8               // a0
        LDP q16, q17, [x5], 32         // b
        LDR  d1, [x10], 8              // a2
        LDR  d2, [x12], 8              // a4
        LD1  {v0.d}[1],  [x9], 8       // a1
        LD1  {v1.d}[1], [x11], 8       // a3
        LD1  {v2.d}[1],  [x4], 8       // a5
        SUBS x0, x0, 16                // flags consumed by B.LO 2f below
        LDR  q18, [x5], 16
        LDR  d19, [x5], 8
        LDR   x8, [x5], 8   // ins is in BLOCK 0

        # Is there at least 4 floats (16 bytes) for main loop?
        B.LO 2f

        # Main loop - 4 floats of A (16 bytes)
        # 48 FMA + 12 LD64 A + 8 LDR B
        // 128-bit operands are assembled from two 64-bit loads: an LDR into
        // the low d half plus an LDR into x8 followed by INS into the high
        // half. NOTE(review): presumably chosen to suit Cortex-A53 issue
        // rules; the rationale is not stated in this file.
1:
        # First group of 24 FMA, Second group loads
        // BLOCK 0
        LDR   d3, [x3], 8              // a0
        INS v19.d[1], x8               // b from second group
        FMLA v20.4s, v16.4s,  v0.s[0]
        LDR   x8, [x9], 8              // a1
        FMLA v22.4s, v16.4s,  v0.s[2]
        FMLA v24.4s, v16.4s,  v1.s[0]

        // BLOCK 1
        LDR  d12, [x5]
        INS v3.d[1], x8                // a1 ins
        FMLA v26.4s, v16.4s,  v1.s[2]
        LDR   x8, [x5, 8]              // b
        FMLA v28.4s, v16.4s,  v2.s[0]
        FMLA v30.4s, v16.4s,  v2.s[2]

        // BLOCK 2
        LDR   d4, [x10], 8             // a2
        INS v12.d[1], x8               // b  ins
        FMLA v21.4s, v17.4s,  v0.s[0]
        LDR   x8, [x11], 8             // a3
        FMLA v23.4s, v17.4s,  v0.s[2]
        FMLA v25.4s, v17.4s,  v1.s[0]

        // BLOCK 3
        LDR   d5, [x12], 8             // a4
        INS v4.d[1], x8                // a3 ins
        FMLA v27.4s, v17.4s,  v1.s[2]
        LDR   x8, [x4], 8              // a5
        FMLA v29.4s, v17.4s,  v2.s[0]
        FMLA v31.4s, v17.4s,  v2.s[2]

        // BLOCK 4
        LDR  d13, [x5, 16]
        INS v5.d[1], x8                // a5 ins
        FMLA v20.4s, v18.4s,  v0.s[1]
        LDR   x8, [x5, 24]
        FMLA v22.4s, v18.4s,  v0.s[3]
        FMLA v24.4s, v18.4s,  v1.s[1]

        // BLOCK 5
        LDR  d14, [x5, 32]
        INS v13.d[1], x8               // b
        FMLA v26.4s, v18.4s,  v1.s[3]
        LDR   x8, [x5, 40]
        FMLA v28.4s, v18.4s,  v2.s[1]
        FMLA v30.4s, v18.4s,  v2.s[3]

        // BLOCK 6
        LDR  d15, [x5, 48]
        INS v14.d[1], x8               // b
        FMLA v21.4s, v19.4s,  v0.s[1]
        LDR   x8, [x5, 56]
        FMLA v23.4s, v19.4s,  v0.s[3]
        FMLA v25.4s, v19.4s,  v1.s[1]

        // BLOCK 7
        INS v15.d[1], x8
        FMLA v27.4s, v19.4s,  v1.s[3]
        FMLA v29.4s, v19.4s,  v2.s[1]
        FMLA v31.4s, v19.4s,  v2.s[3]

        # Second group of 24 FMA, First group of loads
        // BLOCK 0
        LDR   d0, [x3], 8              // a0
        FMLA v20.4s, v12.4s,  v3.s[0]
        LDR   x8, [x9], 8              // a1
        FMLA v22.4s, v12.4s,  v3.s[2]
        FMLA v24.4s, v12.4s,  v4.s[0]
        PRFM PLDL1KEEP, [x3, 128]      // Prefetch A0

        // BLOCK 1
        LDR  d16, [x5, 64]
        INS v0.d[1], x8                // a1 ins
        FMLA v26.4s, v12.4s,  v4.s[2]
        LDR   x8, [x5, 72]             // b
        FMLA v28.4s, v12.4s,  v5.s[0]
        FMLA v30.4s, v12.4s,  v5.s[2]
        PRFM PLDL1KEEP, [x9, 128]      // Prefetch A1

        // BLOCK 2
        LDR   d1, [x10], 8             // a2
        INS v16.d[1], x8               // b
        FMLA v21.4s, v13.4s,  v3.s[0]
        LDR   x8, [x11], 8             // a3
        FMLA v23.4s, v13.4s,  v3.s[2]
        FMLA v25.4s, v13.4s,  v4.s[0]
        PRFM PLDL1KEEP, [x10, 128]     // Prefetch A2

        // BLOCK 3
        LDR   d2, [x12], 8             // a4
        INS v1.d[1], x8                // a3 ins
        FMLA v27.4s, v13.4s,  v4.s[2]
        LDR   x8,  [x4], 8             // a5
        FMLA v29.4s, v13.4s,  v5.s[0]
        FMLA v31.4s, v13.4s,  v5.s[2]
        PRFM PLDL1KEEP, [x11, 128]     // Prefetch A3

        // BLOCK 4
        LDR  d17, [x5, 80]
        INS v2.d[1], x8                // a5 ins
        FMLA v20.4s, v14.4s,  v3.s[1]
        LDR   x8, [x5, 88]
        FMLA v22.4s, v14.4s,  v3.s[3]
        FMLA v24.4s, v14.4s,  v4.s[1]
        PRFM PLDL1KEEP, [x12, 128]     // Prefetch A4

        // BLOCK 5
        LDR  d18, [x5, 96]
        INS v17.d[1], x8               // b
        FMLA v26.4s, v14.4s,  v4.s[3]
        LDR   x8, [x5, 104]
        FMLA v28.4s, v14.4s,  v5.s[1]
        FMLA v30.4s, v14.4s,  v5.s[3]
        PRFM PLDL1KEEP, [x4, 128]      // Prefetch A5

        // BLOCK 6
        LDR  d19, [x5, 112]
        INS v18.d[1], x8               // b
        FMLA v21.4s, v15.4s,  v3.s[1]
        LDR   x8, [x5, 120]            // high half of v19, inserted in next BLOCK 0
        FMLA v23.4s, v15.4s,  v3.s[3]
        PRFM PLDL1KEEP, [x5, 192]      // Prefetch B
        FMLA v25.4s, v15.4s,  v4.s[1]
        PRFM PLDL1KEEP, [x5, 256]      // Prefetch B

        // BLOCK 7
        SUBS x0, x0, 16  // LDR lands here
        FMLA v27.4s, v15.4s,  v4.s[3]
        FMLA v29.4s, v15.4s,  v5.s[1]
        ADD x5, x5, 128                // both B groups consumed
        FMLA v31.4s, v15.4s,  v5.s[3]
        B.HS 1b

        # Epilogue - 4 floats of A (16 bytes)
        # 48 FMA + 12 LD64 A + 8 LDR B
        // Same as one main-loop iteration, except the second group performs no
        // further loads and the C rows are prefetched for the upcoming stores.
2:
        # First group of 24 FMA, Second group loads
        // BLOCK 0
        LDR   d3, [x3], 8              // a0
        INS v19.d[1], x8               // b from second group
        FMLA v20.4s, v16.4s,  v0.s[0]
        LDR   x8, [x9], 8              // a1
        FMLA v22.4s, v16.4s,  v0.s[2]
        FMLA v24.4s, v16.4s,  v1.s[0]
        PRFM PSTL1KEEP,  [x6]          // Prefetch C0

        // BLOCK 1
        LDR  d12, [x5]
        INS v3.d[1], x8                // a1 ins
        FMLA v26.4s, v16.4s,  v1.s[2]
        LDR   x8, [x5, 8]              // b
        FMLA v28.4s, v16.4s,  v2.s[0]
        FMLA v30.4s, v16.4s,  v2.s[2]
        PRFM PSTL1KEEP, [x16]          // Prefetch C1

        // BLOCK 2
        LDR   d4, [x10], 8             // a2
        INS v12.d[1], x8               // b  ins
        FMLA v21.4s, v17.4s,  v0.s[0]
        LDR   x8, [x11], 8             // a3
        FMLA v23.4s, v17.4s,  v0.s[2]
        FMLA v25.4s, v17.4s,  v1.s[0]
        PRFM PSTL1KEEP, [x17]          // Prefetch C2

        // BLOCK 3
        LDR   d5, [x12], 8             // a4
        INS v4.d[1], x8                // a3 ins
        FMLA v27.4s, v17.4s,  v1.s[2]
        LDR   x8, [x4], 8              // a5
        FMLA v29.4s, v17.4s,  v2.s[0]
        FMLA v31.4s, v17.4s,  v2.s[2]
        PRFM PSTL1KEEP, [x15]          // Prefetch C3

        // BLOCK 4
        LDR  d13, [x5, 16]
        INS v5.d[1], x8                // a5 ins
        FMLA v20.4s, v18.4s,  v0.s[1]
        LDR   x8, [x5, 24]
        FMLA v22.4s, v18.4s,  v0.s[3]
        FMLA v24.4s, v18.4s,  v1.s[1]
        PRFM PSTL1KEEP, [x13]          // Prefetch C4

        // BLOCK 5
        LDR  d14, [x5, 32]
        INS v13.d[1], x8               // b
        FMLA v26.4s, v18.4s,  v1.s[3]
        LDR   x8, [x5, 40]
        FMLA v28.4s, v18.4s,  v2.s[1]
        FMLA v30.4s, v18.4s,  v2.s[3]
        PRFM PSTL1KEEP, [x7]           // Prefetch C5

        // BLOCK 6
        LDR  d15, [x5, 48]
        INS v14.d[1], x8               // b
        FMLA v21.4s, v19.4s,  v0.s[1]
        LDR   x8, [x5, 56]
        FMLA v23.4s, v19.4s,  v0.s[3]
        FMLA v25.4s, v19.4s,  v1.s[1]

        // BLOCK 7
        INS v15.d[1], x8               // b
        FMLA v27.4s, v19.4s,  v1.s[3]
        FMLA v29.4s, v19.4s,  v2.s[1]
        FMLA v31.4s, v19.4s,  v2.s[3]

        # Second group of 24 FMA, First group of loads
        // BLOCK 0
        FMLA v20.4s, v12.4s,  v3.s[0]
        FMLA v22.4s, v12.4s,  v3.s[2]
        FMLA v24.4s, v12.4s,  v4.s[0]

        // BLOCK 1
        FMLA v26.4s, v12.4s,  v4.s[2]
        FMLA v28.4s, v12.4s,  v5.s[0]
        FMLA v30.4s, v12.4s,  v5.s[2]

        // BLOCK 2
        FMLA v21.4s, v13.4s,  v3.s[0]
        FMLA v23.4s, v13.4s,  v3.s[2]
        FMLA v25.4s, v13.4s,  v4.s[0]

        // BLOCK 3
        FMLA v27.4s, v13.4s,  v4.s[2]
        FMLA v29.4s, v13.4s,  v5.s[0]
        FMLA v31.4s, v13.4s,  v5.s[2]

        // BLOCK 4
        FMLA v20.4s, v14.4s,  v3.s[1]
        FMLA v22.4s, v14.4s,  v3.s[3]
        FMLA v24.4s, v14.4s,  v4.s[1]

        // BLOCK 5
        FMLA v26.4s, v14.4s,  v4.s[3]
        FMLA v28.4s, v14.4s,  v5.s[1]
        FMLA v30.4s, v14.4s,  v5.s[3]

        // BLOCK 6
        FMLA v21.4s, v15.4s,  v3.s[1]
        FMLA v23.4s, v15.4s,  v3.s[3]
        FMLA v25.4s, v15.4s,  v4.s[1]
        TST x0, 15                     // any leftover bytes of k? (used by B.NE below)

        // BLOCK 7
        FMLA v27.4s, v15.4s,  v4.s[3]
        FMLA v29.4s, v15.4s,  v5.s[1]
        FMLA v31.4s, v15.4s,  v5.s[3]
        ADD x5, x5, 64                 // skip the B group loaded in this epilogue

        # Is there a remainder?- 2 floats of A (8 bytes) or less
        B.NE 5f
4:
        # Clamp
        // The flags set by this SUBS are consumed by B.LO 8f and B.HI 0b
        // below; no instruction in between modifies the flags.
        FMIN v20.4s, v20.4s, v6.4s
        SUBS x1, x1, 8
        FMIN v21.4s, v21.4s, v6.4s
        FMIN v22.4s, v22.4s, v6.4s
        FMIN v23.4s, v23.4s, v6.4s
        FMIN v24.4s, v24.4s, v6.4s
        FMIN v25.4s, v25.4s, v6.4s
        FMIN v26.4s, v26.4s, v6.4s
        FMIN v27.4s, v27.4s, v6.4s
        FMIN v28.4s, v28.4s, v6.4s
        FMIN v29.4s, v29.4s, v6.4s
        FMIN v30.4s, v30.4s, v6.4s
        FMIN v31.4s, v31.4s, v6.4s
        FMAX v20.4s, v20.4s, v7.4s
        FMAX v21.4s, v21.4s, v7.4s
        FMAX v22.4s, v22.4s, v7.4s
        FMAX v23.4s, v23.4s, v7.4s
        FMAX v24.4s, v24.4s, v7.4s
        FMAX v25.4s, v25.4s, v7.4s
        FMAX v26.4s, v26.4s, v7.4s
        FMAX v27.4s, v27.4s, v7.4s
        FMAX v28.4s, v28.4s, v7.4s
        FMAX v29.4s, v29.4s, v7.4s
        FMAX v30.4s, v30.4s, v7.4s
        FMAX v31.4s, v31.4s, v7.4s

        # Store full 6 x 8
        B.LO 8f                        // fewer than 8 columns left

        // Each C pointer advances by cn_stride; each A pointer is rewound by
        // kc so the next column tile re-reads the same rows of A.
        ST1 {v20.16b, v21.16b},  [x6], x14
        SUB  x3,  x3, x2 // a0 -= kc
        ST1 {v22.16b, v23.16b}, [x16], x14
        SUB  x9,  x9, x2 // a1 -= kc
        ST1 {v24.16b, v25.16b}, [x17], x14
        SUB x10, x10, x2 // a2 -= kc
        ST1 {v26.16b, v27.16b}, [x15], x14
        SUB x11, x11, x2 // a3 -= kc
        ST1 {v28.16b, v29.16b}, [x13], x14
        SUB x12, x12, x2 // a4 -= kc
        ST1 {v30.16b, v31.16b},  [x7], x14
        SUB  x4,  x4, x2 // a5 -= kc

        B.HI 0b                        // more columns remain

        // Restore d12-d15 from stack
        LDP d14, d15, [sp, 16]
        LDP d12, d13, [sp], 32
        RET

5:
        # Is there a remainder?- 2 floats of A (8 bytes)
        // Subtracting 16 from x0 never changes bits 2 and 3, so they still
        // mirror the low bits of kc and select the leftover sizes.
        TBZ x0, 3, 6f

        # Remainder- 2 floats of A (8 bytes)
        LDR   d0,  [x3], 8
        LDR  q16, [x5], 16
        LD1   {v0.d}[1], [x9], 8
        LDR   d1, [x10], 8
        LD1   {v1.d}[1], [x11], 8
        LDR   d2, [x12], 8
        LD1   {v2.d}[1], [x4], 8
        LDR  q17, [x5], 16
        LDR  q18, [x5], 16
        LDR  q19, [x5], 16

        FMLA v20.4s, v16.4s,  v0.s[0]
        FMLA v22.4s, v16.4s,  v0.s[2]
        FMLA v24.4s, v16.4s,  v1.s[0]
        FMLA v26.4s, v16.4s,  v1.s[2]
        FMLA v28.4s, v16.4s,  v2.s[0]
        FMLA v30.4s, v16.4s,  v2.s[2]
        FMLA v21.4s, v17.4s,  v0.s[0]
        FMLA v23.4s, v17.4s,  v0.s[2]
        FMLA v25.4s, v17.4s,  v1.s[0]
        FMLA v27.4s, v17.4s,  v1.s[2]
        FMLA v29.4s, v17.4s,  v2.s[0]
        FMLA v31.4s, v17.4s,  v2.s[2]

        FMLA v20.4s, v18.4s,  v0.s[1]
        FMLA v22.4s, v18.4s,  v0.s[3]
        FMLA v24.4s, v18.4s,  v1.s[1]
        FMLA v26.4s, v18.4s,  v1.s[3]
        FMLA v28.4s, v18.4s,  v2.s[1]
        FMLA v30.4s, v18.4s,  v2.s[3]
        FMLA v21.4s, v19.4s,  v0.s[1]
        FMLA v23.4s, v19.4s,  v0.s[3]
        FMLA v25.4s, v19.4s,  v1.s[1]
        FMLA v27.4s, v19.4s,  v1.s[3]
        FMLA v29.4s, v19.4s,  v2.s[1]
        FMLA v31.4s, v19.4s,  v2.s[3]

        # Is there a remainder?- 1 floats of A (4 bytes)
        TBZ x0, 2, 4b
6:
        # Remainder- 1 floats of A (4 bytes)
        // Even rows land in lane 0 and odd rows in lane 2, matching the lane
        // indices used by the FMLA instructions below.
        LDR   s0,  [x3], 4
        LDR  q16, [x5], 16
        LD1   {v0.s}[2], [x9], 4
        LDR   s1, [x10], 4
        LD1   {v1.s}[2], [x11], 4
        LDR   s2, [x12], 4
        LD1   {v2.s}[2], [x4], 4
        LDR  q17, [x5], 16

        FMLA v20.4s, v16.4s,  v0.s[0]
        FMLA v22.4s, v16.4s,  v0.s[2]
        FMLA v24.4s, v16.4s,  v1.s[0]
        FMLA v26.4s, v16.4s,  v1.s[2]
        FMLA v28.4s, v16.4s,  v2.s[0]
        FMLA v30.4s, v16.4s,  v2.s[2]
        FMLA v21.4s, v17.4s,  v0.s[0]
        FMLA v23.4s, v17.4s,  v0.s[2]
        FMLA v25.4s, v17.4s,  v1.s[0]
        FMLA v27.4s, v17.4s,  v1.s[2]
        FMLA v29.4s, v17.4s,  v2.s[0]
        FMLA v31.4s, v17.4s,  v2.s[2]
        B 4b

        # Store odd width
        // x1 holds nc - 8 here; its low three bits equal those of the number
        // of remaining columns, so bits 2, 1 and 0 select 4, 2 and 1 floats.
8:
        TBZ x1, 2, 9f
        STR q20,  [x6], 16
        MOV v20.16b, v21.16b
        STR q22, [x16], 16
        MOV v22.16b, v23.16b
        STR q24, [x17], 16
        MOV v24.16b, v25.16b
        STR q26, [x15], 16
        MOV v26.16b, v27.16b
        STR q28, [x13], 16
        MOV v28.16b, v29.16b
        STR q30,  [x7], 16
        MOV v30.16b, v31.16b

9:
        TBZ x1, 1, 10f
        STR d20,  [x6], 8
        DUP d20, v20.d[1]
        STR d22, [x16], 8
        DUP d22, v22.d[1]
        STR d24, [x17], 8
        DUP d24, v24.d[1]
        STR d26, [x15], 8
        DUP d26, v26.d[1]
        STR d28, [x13], 8
        DUP d28, v28.d[1]
        STR d30,  [x7], 8
        DUP d30, v30.d[1]

10:
        TBZ x1, 0, 11f
        STR s20,  [x6]
        STR s22, [x16]
        STR s24, [x17]
        STR s26, [x15]
        STR s28, [x13]
        STR s30,  [x7]
11:
        // Restore d12-d15 from stack
        LDP d14, d15, [sp, 16]
        LDP d12, d13, [sp], 32
        RET

END_FUNCTION xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a53

// On ELF targets, emit an empty .note.GNU-stack section so that the object
// does not request an executable stack.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif