// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/6x8-aarch64-neonfma-cortex-a53.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemminc_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> (x0)
#     const float*restrict acc,  [sp + 8] -> x15
#     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 16] -> x8

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# x0 is reused: it holds mr only while the row pointers are clamped, then the
# remaining-k byte counter (kc - 16, stepped by 16), and finally cn_stride
# during the clamp/store stage.

# A pointers
#  x3 a0
#  x9 a1
# x10 a2
# x11 a3
# x12 a4
#  x4 a5

# C pointers
#  x6 c0
# x16 c1
# x17 c2
# x14 c3
# x13 c4
#  x7 c5

# x8 temporary vector shadow register
#    (holds the params pointer only until min/max are broadcast into v6/v7;
#     after that it stages the upper 64 bits of vector loads: LDR x8 + INS)

# Vector register usage
# A0  v0     v3
# A1  v0[1]  v3[1]
# A2  v1     v4
# A3  v1[1]  v4[1]
# A4  v2     v5
# A5  v2[1]  v5[1]
# B   v12 v13 v14 v15 second set of B
# B   v16 v17 v18 v19 first set
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# C   v30 v31
# Clamp v6 v7
# unused A   v8 v9 v10 v11

BEGIN_FUNCTION xnn_f32_gemminc_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53

        # Load acc, params pointer
        // Read before the stack push below, so offsets are relative to the
        // entry sp: acc at [sp + 8], params at [sp + 16].
        LDP x15, x8, [sp, 8]

        # Clamp A and C pointers
        // Each CMP feeds two CSEL pairs (LO, then LS); the ADDs in between
        // do not modify the flags.
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x14, x17, x7         // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x14, x17, x14, LO   //   c3 = c2

        ADD x12, x11, x4         // a4 = a3 + a_stride
        ADD x13, x14, x7         // c4 = c3 + cm_stride
                                 // if mr <= 4
        CSEL x12, x11, x12, LS   //   a4 = a3
        CSEL x13, x14, x13, LS   //   c4 = c3

        CMP x0, 6                // if mr < 6
        ADD x4, x12, x4          // a5 = a4 + a_stride
        ADD x7, x13, x7          // c5 = c4 + cm_stride
        CSEL x4, x12, x4, LO     //   a5 = a4
        CSEL x7, x13, x7, LO     //   c5 = c4

        # Load min/max values
        // v6 = broadcast of params[0], v7 = broadcast of params[1]; x8 is
        // free for reuse after this load.
        LD2R {v6.4s, v7.4s}, [x8]

        // Save d12-d15 on stack
        // 32-byte frame: the stack arguments move up by 32 from here on.
        STP d12, d13, [sp, -32]!
        STP d14, d15, [sp, 16]

        // Outer loop: one iteration per 8-column tile of C (nc -= 8 at 3:).
0:
        # Load initial accumulators
        // 12 q-registers = 192 bytes of acc consumed per tile.
        LDP q20, q21, [x15], 32
        LDP q22, q23, [x15], 32
        LDP q24, q25, [x15], 32
        LDP q26, q27, [x15], 32
        LDP q28, q29, [x15], 32
        LDP q30, q31, [x15], 32
        PRFM PLDL1KEEP,  [x3,  0]  // Prefetch A
        PRFM PLDL1KEEP,  [x3, 64]
        PRFM PLDL1KEEP,  [x9,  0]
        PRFM PLDL1KEEP,  [x9, 64]
        PRFM PLDL1KEEP, [x10,  0]
        PRFM PLDL1KEEP, [x10, 64]
        PRFM PLDL1KEEP, [x11,  0]
        PRFM PLDL1KEEP, [x11, 64]
        PRFM PLDL1KEEP, [x12,  0]
        PRFM PLDL1KEEP, [x12, 64]
        PRFM PLDL1KEEP,  [x4,  0]
        PRFM PLDL1KEEP,  [x4, 64]
        PRFM PLDL1KEEP, [x5,   0]  // Prefetch B
        PRFM PLDL1KEEP, [x5,  64]
        PRFM PLDL1KEEP, [x5, 128]
        PRFM PLDL1KEEP, [x5, 192]

        # Is there at least 4 floats (16 bytes) for prologue + epilogue?
        SUBS x0, x2, 16  // k = kc - 16
        B.LO 4f

        # Prologue - First group loads, no FMA
        LDR  d0, [x3], 8               // a0
        LDP q16, q17, [x5], 32         // b
        LDR  d1, [x10], 8              // a2
        LDR  d2, [x12], 8              // a4
        LD1  {v0.d}[1],  [x9], 8       // a1
        LD1  {v1.d}[1], [x11], 8       // a3
        LD1  {v2.d}[1],  [x4], 8       // a5
        SUBS x0, x0, 16
        LDR  q18, [x5], 16
        LDR  d19, [x5], 8
        LDR   x8, [x5], 8   // ins is in BLOCK 0

        # Is there at least 4 floats (16 bytes) for main loop?
        // Flags come from the SUBS above; the loads in between leave them
        // untouched.
        B.LO 2f

        # Main loop - 4 floats of A (16 bytes)
        # 48 FMA + 12 LD64 A + 8 LDR B
        // Software pipelined: each group of 24 FMAs consumes the registers
        // loaded during the previous group while loading the next set. The
        // upper 64 bits of each vector arrive through x8 and are inserted
        // one block later.
1:
        # First group of 24 FMA, Second group loads
        // BLOCK 0
        LDR   d3, [x3], 8              // a0
        INS v19.d[1], x8               // b from second group
        FMLA v20.4s, v16.4s,  v0.s[0]
        LDR   x8, [x9], 8              // a1
        FMLA v22.4s, v16.4s,  v0.s[2]
        FMLA v24.4s, v16.4s,  v1.s[0]

        // BLOCK 1
        LDR  d12, [x5]
        INS v3.d[1], x8                // a1 ins
        FMLA v26.4s, v16.4s,  v1.s[2]
        LDR   x8, [x5, 8]              // b
        FMLA v28.4s, v16.4s,  v2.s[0]
        FMLA v30.4s, v16.4s,  v2.s[2]

        // BLOCK 2
        LDR   d4, [x10], 8             // a2
        INS v12.d[1], x8               // b  ins
        FMLA v21.4s, v17.4s,  v0.s[0]
        LDR   x8, [x11], 8             // a3
        FMLA v23.4s, v17.4s,  v0.s[2]
        FMLA v25.4s, v17.4s,  v1.s[0]

        // BLOCK 3
        LDR   d5, [x12], 8             // a4
        INS v4.d[1], x8                // a3 ins
        FMLA v27.4s, v17.4s,  v1.s[2]
        LDR   x8, [x4], 8              // a5
        FMLA v29.4s, v17.4s,  v2.s[0]
        FMLA v31.4s, v17.4s,  v2.s[2]

        // BLOCK 4
        LDR  d13, [x5, 16]
        INS v5.d[1], x8                // a5 ins
        FMLA v20.4s, v18.4s,  v0.s[1]
        LDR   x8, [x5, 24]
        FMLA v22.4s, v18.4s,  v0.s[3]
        FMLA v24.4s, v18.4s,  v1.s[1]

        // BLOCK 5
        LDR  d14, [x5, 32]
        INS v13.d[1], x8               // b
        FMLA v26.4s, v18.4s,  v1.s[3]
        LDR   x8, [x5, 40]
        FMLA v28.4s, v18.4s,  v2.s[1]
        FMLA v30.4s, v18.4s,  v2.s[3]

        // BLOCK 6
        LDR  d15, [x5, 48]
        INS v14.d[1], x8               // b
        FMLA v21.4s, v19.4s,  v0.s[1]
        LDR   x8, [x5, 56]
        FMLA v23.4s, v19.4s,  v0.s[3]
        FMLA v25.4s, v19.4s,  v1.s[1]

        // BLOCK 7
        INS v15.d[1], x8
        FMLA v27.4s, v19.4s,  v1.s[3]
        FMLA v29.4s, v19.4s,  v2.s[1]
        FMLA v31.4s, v19.4s,  v2.s[3]

        # Second group of 24 FMA, First group of loads
        // BLOCK 0
        LDR   d0, [x3], 8              // a0
        FMLA v20.4s, v12.4s,  v3.s[0]
        LDR   x8, [x9], 8              // a1
        FMLA v22.4s, v12.4s,  v3.s[2]
        FMLA v24.4s, v12.4s,  v4.s[0]
        PRFM PLDL1KEEP, [x3, 128]      // Prefetch A0

        // BLOCK 1
        LDR  d16, [x5, 64]
        INS v0.d[1], x8                // a1 ins
        FMLA v26.4s, v12.4s,  v4.s[2]
        LDR   x8, [x5, 72]             // b
        FMLA v28.4s, v12.4s,  v5.s[0]
        FMLA v30.4s, v12.4s,  v5.s[2]
        PRFM PLDL1KEEP, [x9, 128]      // Prefetch A1

        // BLOCK 2
        LDR   d1, [x10], 8             // a2
        INS v16.d[1], x8               // b
        FMLA v21.4s, v13.4s,  v3.s[0]
        LDR   x8, [x11], 8             // a3
        FMLA v23.4s, v13.4s,  v3.s[2]
        FMLA v25.4s, v13.4s,  v4.s[0]
        PRFM PLDL1KEEP, [x10, 128]     // Prefetch A2

        // BLOCK 3
        LDR   d2, [x12], 8             // a4
        INS v1.d[1], x8                // a3 ins
        FMLA v27.4s, v13.4s,  v4.s[2]
        LDR   x8,  [x4], 8             // a5
        FMLA v29.4s, v13.4s,  v5.s[0]
        FMLA v31.4s, v13.4s,  v5.s[2]
        PRFM PLDL1KEEP, [x11, 128]     // Prefetch A3

        // BLOCK 4
        LDR  d17, [x5, 80]
        INS v2.d[1], x8                // a5 ins
        FMLA v20.4s, v14.4s,  v3.s[1]
        LDR   x8, [x5, 88]
        FMLA v22.4s, v14.4s,  v3.s[3]
        FMLA v24.4s, v14.4s,  v4.s[1]
        PRFM PLDL1KEEP, [x12, 128]     // Prefetch A4

        // BLOCK 5
        LDR  d18, [x5, 96]
        INS v17.d[1], x8               // b
        FMLA v26.4s, v14.4s,  v4.s[3]
        LDR   x8, [x5, 104]
        FMLA v28.4s, v14.4s,  v5.s[1]
        FMLA v30.4s, v14.4s,  v5.s[3]
        PRFM PLDL1KEEP, [x4, 128]      // Prefetch A5

        // BLOCK 6
        LDR  d19, [x5, 112]
        INS v18.d[1], x8               // b
        FMLA v21.4s, v15.4s,  v3.s[1]
        LDR   x8, [x5, 120]
        FMLA v23.4s, v15.4s,  v3.s[3]
        PRFM PLDL1KEEP, [x5, 192]      // Prefetch B
        FMLA v25.4s, v15.4s,  v4.s[1]
        PRFM PLDL1KEEP, [x5, 256]      // Prefetch B

        // BLOCK 7
        SUBS x0, x0, 16  // LDR lands here
        FMLA v27.4s, v15.4s,  v4.s[3]
        FMLA v29.4s, v15.4s,  v5.s[1]
        ADD x5, x5, 128
        FMLA v31.4s, v15.4s,  v5.s[3]
        B.HS 1b

        # Epilogue - 4 floats of A (16 bytes)
        # 48 FMA + 12 LD64 A + 8 LDR B
        // Same first group as the main loop; the second group only drains
        // the pipeline (no further loads), so B advances by 64 bytes here.
2:
        # First group of 24 FMA, Second group loads
        // BLOCK 0
        LDR   d3, [x3], 8              // a0
        INS v19.d[1], x8               // b from second group
        FMLA v20.4s, v16.4s,  v0.s[0]
        LDR   x8, [x9], 8              // a1
        FMLA v22.4s, v16.4s,  v0.s[2]
        FMLA v24.4s, v16.4s,  v1.s[0]
        PRFM PSTL1KEEP,  [x6]          // Prefetch C0

        // BLOCK 1
        LDR  d12, [x5]
        INS v3.d[1], x8                // a1 ins
        FMLA v26.4s, v16.4s,  v1.s[2]
        LDR   x8, [x5, 8]              // b
        FMLA v28.4s, v16.4s,  v2.s[0]
        FMLA v30.4s, v16.4s,  v2.s[2]
        PRFM PSTL1KEEP, [x16]          // Prefetch C1

        // BLOCK 2
        LDR   d4, [x10], 8             // a2
        INS v12.d[1], x8               // b  ins
        FMLA v21.4s, v17.4s,  v0.s[0]
        LDR   x8, [x11], 8             // a3
        FMLA v23.4s, v17.4s,  v0.s[2]
        FMLA v25.4s, v17.4s,  v1.s[0]
        PRFM PSTL1KEEP, [x17]          // Prefetch C2

        // BLOCK 3
        LDR   d5, [x12], 8             // a4
        INS v4.d[1], x8                // a3 ins
        FMLA v27.4s, v17.4s,  v1.s[2]
        LDR   x8, [x4], 8              // a5
        FMLA v29.4s, v17.4s,  v2.s[0]
        FMLA v31.4s, v17.4s,  v2.s[2]
        PRFM PSTL1KEEP, [x14]          // Prefetch C3

        // BLOCK 4
        LDR  d13, [x5, 16]
        INS v5.d[1], x8                // a5 ins
        FMLA v20.4s, v18.4s,  v0.s[1]
        LDR   x8, [x5, 24]
        FMLA v22.4s, v18.4s,  v0.s[3]
        FMLA v24.4s, v18.4s,  v1.s[1]
        PRFM PSTL1KEEP, [x13]          // Prefetch C4

        // BLOCK 5
        LDR  d14, [x5, 32]
        INS v13.d[1], x8               // b
        FMLA v26.4s, v18.4s,  v1.s[3]
        LDR   x8, [x5, 40]
        FMLA v28.4s, v18.4s,  v2.s[1]
        FMLA v30.4s, v18.4s,  v2.s[3]
        PRFM PSTL1KEEP, [x7]           // Prefetch C5

        // BLOCK 6
        LDR  d15, [x5, 48]
        INS v14.d[1], x8               // b
        FMLA v21.4s, v19.4s,  v0.s[1]
        LDR   x8, [x5, 56]
        FMLA v23.4s, v19.4s,  v0.s[3]
        FMLA v25.4s, v19.4s,  v1.s[1]

        // BLOCK 7
        INS v15.d[1], x8               // b
        FMLA v27.4s, v19.4s,  v1.s[3]
        FMLA v29.4s, v19.4s,  v2.s[1]
        FMLA v31.4s, v19.4s,  v2.s[3]

        # Second group of 24 FMA, First group of loads
        // BLOCK 0
        FMLA v20.4s, v12.4s,  v3.s[0]
        FMLA v22.4s, v12.4s,  v3.s[2]
        FMLA v24.4s, v12.4s,  v4.s[0]

        // BLOCK 1
        FMLA v26.4s, v12.4s,  v4.s[2]
        FMLA v28.4s, v12.4s,  v5.s[0]
        FMLA v30.4s, v12.4s,  v5.s[2]

        // BLOCK 2
        FMLA v21.4s, v13.4s,  v3.s[0]
        FMLA v23.4s, v13.4s,  v3.s[2]
        FMLA v25.4s, v13.4s,  v4.s[0]

        // BLOCK 3
        FMLA v27.4s, v13.4s,  v4.s[2]
        FMLA v29.4s, v13.4s,  v5.s[0]
        FMLA v31.4s, v13.4s,  v5.s[2]

        // BLOCK 4
        FMLA v20.4s, v14.4s,  v3.s[1]
        FMLA v22.4s, v14.4s,  v3.s[3]
        FMLA v24.4s, v14.4s,  v4.s[1]

        // BLOCK 5
        FMLA v26.4s, v14.4s,  v4.s[3]
        FMLA v28.4s, v14.4s,  v5.s[1]
        FMLA v30.4s, v14.4s,  v5.s[3]

        // BLOCK 6
        FMLA v21.4s, v15.4s,  v3.s[1]
        FMLA v23.4s, v15.4s,  v3.s[3]
        FMLA v25.4s, v15.4s,  v4.s[1]
        // Only multiples of 16 were subtracted from kc, so the low 4 bits
        // of x0 still equal kc mod 16. The flags set here are consumed by
        // B.NE below; FMLA and ADD do not alter them.
        TST x0, 15

        // BLOCK 7
        FMLA v27.4s, v15.4s,  v4.s[3]
        FMLA v29.4s, v15.4s,  v5.s[1]
        FMLA v31.4s, v15.4s,  v5.s[3]
        ADD x5, x5, 64

        # Is there a remainder?- 2 floats of A (8 bytes) or less
        B.NE 4f
3:
        # Clamp
        FMAX v20.4s, v20.4s, v6.4s
        # Load cn_stride
        // cn_stride was at [sp] on entry; the 32-byte d12-d15 frame puts it
        // at [sp, 32] now.
        LDR x0, [sp, 32]
        FMAX v21.4s, v21.4s, v6.4s
        FMAX v22.4s, v22.4s, v6.4s
        FMAX v23.4s, v23.4s, v6.4s
        FMAX v24.4s, v24.4s, v6.4s
        FMAX v25.4s, v25.4s, v6.4s
        FMAX v26.4s, v26.4s, v6.4s
        FMAX v27.4s, v27.4s, v6.4s
        FMAX v28.4s, v28.4s, v6.4s
        FMAX v29.4s, v29.4s, v6.4s
        FMAX v30.4s, v30.4s, v6.4s
        FMAX v31.4s, v31.4s, v6.4s
        // nc -= 8. These flags drive both B.LO 6f (partial tile) and
        // B.HI 0b (more tiles); nothing in between modifies them.
        SUBS x1, x1, 8
        FMIN v20.4s, v20.4s, v7.4s
        FMIN v21.4s, v21.4s, v7.4s
        FMIN v22.4s, v22.4s, v7.4s
        FMIN v23.4s, v23.4s, v7.4s
        FMIN v24.4s, v24.4s, v7.4s
        FMIN v25.4s, v25.4s, v7.4s
        FMIN v26.4s, v26.4s, v7.4s
        FMIN v27.4s, v27.4s, v7.4s
        FMIN v28.4s, v28.4s, v7.4s
        FMIN v29.4s, v29.4s, v7.4s
        FMIN v30.4s, v30.4s, v7.4s
        FMIN v31.4s, v31.4s, v7.4s

        # Store full 6 x 8
        B.LO 6f

        // Each store post-increments its C pointer by cn_stride; each A
        // pointer is rewound by kc so the next tile re-reads the same rows.
        ST1 {v30.16b, v31.16b},  [x7], x0
        SUB  x3,  x3, x2 // a0 -= kc
        ST1 {v28.16b, v29.16b}, [x13], x0
        SUB  x9,  x9, x2 // a1 -= kc
        ST1 {v26.16b, v27.16b}, [x14], x0
        SUB x10, x10, x2 // a2 -= kc
        ST1 {v24.16b, v25.16b}, [x17], x0
        SUB x11, x11, x2 // a3 -= kc
        ST1 {v22.16b, v23.16b}, [x16], x0
        SUB x12, x12, x2 // a4 -= kc
        ST1 {v20.16b, v21.16b},  [x6], x0
        SUB  x4,  x4, x2 // a5 -= kc

        B.HI 0b

        // Restore d12-d15 from stack
        LDP d14, d15, [sp, 16]
        LDP d12, d13, [sp], 32
        RET

        // Remainder handling. Reached from the kc < 16 check at 0: and from
        // the epilogue; in both cases bits 3 and 2 of x0 match those of kc.
4:
        # Is there a remainder?- 2 floats of A (8 bytes)
        TBZ x0, 3, 5f

        # Remainder- 2 floats of A (8 bytes)
        LDR   d0,  [x3], 8
        LDR  q16, [x5], 16
        LD1   {v0.d}[1], [x9], 8
        LDR   d1, [x10], 8
        LD1   {v1.d}[1], [x11], 8
        LDR   d2, [x12], 8
        LD1   {v2.d}[1], [x4], 8
        LDR  q17, [x5], 16
        LDR  q18, [x5], 16
        LDR  q19, [x5], 16

        FMLA v20.4s, v16.4s,  v0.s[0]
        FMLA v22.4s, v16.4s,  v0.s[2]
        FMLA v24.4s, v16.4s,  v1.s[0]
        FMLA v26.4s, v16.4s,  v1.s[2]
        FMLA v28.4s, v16.4s,  v2.s[0]
        FMLA v30.4s, v16.4s,  v2.s[2]
        FMLA v21.4s, v17.4s,  v0.s[0]
        FMLA v23.4s, v17.4s,  v0.s[2]
        FMLA v25.4s, v17.4s,  v1.s[0]
        FMLA v27.4s, v17.4s,  v1.s[2]
        FMLA v29.4s, v17.4s,  v2.s[0]
        FMLA v31.4s, v17.4s,  v2.s[2]

        FMLA v20.4s, v18.4s,  v0.s[1]
        FMLA v22.4s, v18.4s,  v0.s[3]
        FMLA v24.4s, v18.4s,  v1.s[1]
        FMLA v26.4s, v18.4s,  v1.s[3]
        FMLA v28.4s, v18.4s,  v2.s[1]
        FMLA v30.4s, v18.4s,  v2.s[3]
        FMLA v21.4s, v19.4s,  v0.s[1]
        FMLA v23.4s, v19.4s,  v0.s[3]
        FMLA v25.4s, v19.4s,  v1.s[1]
        FMLA v27.4s, v19.4s,  v1.s[3]
        FMLA v29.4s, v19.4s,  v2.s[1]
        FMLA v31.4s, v19.4s,  v2.s[3]

        # Is there a remainder?- 1 floats of A (4 bytes)
        TBZ x0, 2, 3b
5:
        # Remainder- 1 floats of A (4 bytes)
        // Odd rows are loaded into lane 2 so the FMLAs below use the same
        // s[0]/s[2] lane pairing as the wider paths.
        LDR   s0,  [x3], 4
        LDR  q16, [x5], 16
        LD1   {v0.s}[2], [x9], 4
        LDR   s1, [x10], 4
        LD1   {v1.s}[2], [x11], 4
        LDR   s2, [x12], 4
        LD1   {v2.s}[2], [x4], 4
        LDR  q17, [x5], 16

        FMLA v20.4s, v16.4s,  v0.s[0]
        FMLA v22.4s, v16.4s,  v0.s[2]
        FMLA v24.4s, v16.4s,  v1.s[0]
        FMLA v26.4s, v16.4s,  v1.s[2]
        FMLA v28.4s, v16.4s,  v2.s[0]
        FMLA v30.4s, v16.4s,  v2.s[2]
        FMLA v21.4s, v17.4s,  v0.s[0]
        FMLA v23.4s, v17.4s,  v0.s[2]
        FMLA v25.4s, v17.4s,  v1.s[0]
        FMLA v27.4s, v17.4s,  v1.s[2]
        FMLA v29.4s, v17.4s,  v2.s[0]
        FMLA v31.4s, v17.4s,  v2.s[2]
        B 3b

        # Store odd width
        // x1 = nc - 8 here; its low three bits equal those of nc, so bits
        // 2, 1 and 0 select the 4-, 2- and 1-column partial stores. After
        // each store the unstored upper part is shifted down into place.
6:
        TBZ x1, 2, 7f
        STR q30,  [x7], 16
        MOV v30.16b, v31.16b
        STR q28, [x13], 16
        MOV v28.16b, v29.16b
        STR q26, [x14], 16
        MOV v26.16b, v27.16b
        STR q24, [x17], 16
        MOV v24.16b, v25.16b
        STR q22, [x16], 16
        MOV v22.16b, v23.16b
        STR q20,  [x6], 16
        MOV v20.16b, v21.16b

7:
        TBZ x1, 1, 8f
        STR d30,  [x7], 8
        DUP d30, v30.d[1]
        STR d28, [x13], 8
        DUP d28, v28.d[1]
        STR d26, [x14], 8
        DUP d26, v26.d[1]
        STR d24, [x17], 8
        DUP d24, v24.d[1]
        STR d22, [x16], 8
        DUP d22, v22.d[1]
        STR d20,  [x6], 8
        DUP d20, v20.d[1]

8:
        TBZ x1, 0, 9f
        STR s30,  [x7]
        STR s28, [x13]
        STR s26, [x14]
        STR s24, [x17]
        STR s22, [x16]
        STR s20,  [x6]
9:
        // Restore d12-d15 from stack
        LDP d14, d15, [sp, 16]
        LDP d12, d13, [sp], 32
        RET

END_FUNCTION xnn_f32_gemminc_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53

// On ELF targets, emit an empty .note.GNU-stack section (no flags).
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif