// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/6x8-aarch64-neonfma-cortex-a55.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> (x0)
#     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> x8

// Overview (derived from the code below):
//   For each panel of up to 8 output columns, the 12 accumulators v20-v31
//   (6 rows x 8 floats) are initialised from the first 8 floats at w, updated
//   with FMLA over kc bytes of every A row, clamped to [v6, v7] and stored
//   through the 6 C row pointers.
//   - kc is a byte count: A is consumed in steps of 16, 8 and 4 bytes.
//   - w is read sequentially: 8 floats that seed every row, followed by
//     8 floats of B for each float of A. x5 is never rewound, so consecutive
//     panels are consumed back to back.
//   - params is read with LD2R as two consecutive floats: the first is the
//     lower bound (FMAX operand), the second the upper bound (FMIN operand).
//   - Row pointers with index >= mr are aliased to the previous row, so
//     their loads and stores hit the same memory as the last valid row.
//   - Numeric local labels: 0 = per-panel loop, 1 = main loop, 2 = epilogue,
//     3 = clamp and store, 4/5 = k remainders, 6-9 = partial-width store.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# A pointers
#  x3 a0
#  x9 a1
# x10 a2
# x11 a3
# x12 a4
#  x4 a5

# C pointers
#  x6 c0
# x16 c1
# x17 c2
# x14 c3
# x13 c4
#  x7 c5

# x8 temporary vector shadow register

# Vector register usage
# A0  v0     v3
# A1  v0[1]  v3[1]
# A2  v1     v4
# A3  v1[1]  v4[1]
# A4  v2     v5
# A5  v2[1]  v5[1]
# B   v12 v13 v14 v15 second set of B
# B   v16 v17 v18 v19 first set
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# C   v30 v31
# Clamp v6 v7
# unused A   v8 v9 v10 v11

BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55

        # Load params pointer
        // Read before SP is adjusted below, hence the entry-time offset 8.
        LDR x8, [sp, 8]

        # Clamp A and C pointers
        // Each CMP feeds two CSEL pairs: the LO pair right after it and the LS
        // pair that follows. The ADDs in between do not set flags.
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x14, x17, x7         // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x14, x17, x14, LO   //   c3 = c2

        ADD x12, x11, x4         // a4 = a3 + a_stride
        ADD x13, x14, x7         // c4 = c3 + cm_stride
                                 // if mr <= 4
        CSEL x12, x11, x12, LS   //   a4 = a3
        CSEL x13, x14, x13, LS   //   c4 = c3

        // From here on x4 and x7 hold a5 and c5; the strides are no longer
        // needed once the last pointers are derived.
        CMP x0, 6                // if mr < 6
        ADD x4, x12, x4          // a5 = a4 + a_stride
        ADD x7, x13, x7          // c5 = c4 + cm_stride
        CSEL x4, x12, x4, LO     //   a5 = a4
        CSEL x7, x13, x7, LO     //   c5 = c4

        # Load min/max values
        // v6 = first float at params (all lanes), v7 = second float (all lanes).
        LD2R {v6.4s, v7.4s}, [x8]

        // Save d12-d15 on stack
        // v12-v15 carry the second set of B. SP drops by 32 bytes here, so
        // cn_stride (at [sp] on entry) is read from [sp, 32] further down.
        STP d12, d13, [sp, -32]!
        STP d14, d15, [sp, 16]

0:
        # Load initial bias from w into accumulators
        // v20/v21 are loaded from w and copied to the other five rows. The
        // prefetches of A and B are interleaved with the copies.
        LDP q20, q21, [x5], 32
        SUBS x0, x2, 16  // k = kc - 16
        PRFM PLDL1KEEP,  [x3,  0]    // Prefetch A
        PRFM PLDL1KEEP,  [x3, 64]
        MOV v22.16b, v20.16b
        PRFM PLDL1KEEP,  [x9,  0]
        PRFM PLDL1KEEP,  [x9, 64]
        MOV v23.16b, v21.16b
        PRFM PLDL1KEEP, [x10,  0]
        PRFM PLDL1KEEP, [x10, 64]
        MOV v24.16b, v20.16b
        PRFM PLDL1KEEP, [x11,  0]
        PRFM PLDL1KEEP, [x11, 64]
        MOV v25.16b, v21.16b
        PRFM PLDL1KEEP, [x12,  0]
        PRFM PLDL1KEEP, [x12, 64]
        MOV v26.16b, v20.16b
        PRFM PLDL1KEEP,  [x4,  0]
        PRFM PLDL1KEEP,  [x4, 64]
        PRFM PLDL1KEEP, [x5,   0]  // Prefetch B
        MOV v27.16b, v21.16b
        PRFM PLDL1KEEP, [x5,  64]
        MOV v28.16b, v20.16b
        PRFM PLDL1KEEP, [x5, 128]
        MOV v29.16b, v21.16b
        PRFM PLDL1KEEP, [x5, 192]
        MOV v30.16b, v20.16b
        PRFM PLDL1KEEP, [x5, 256]
        MOV v31.16b, v21.16b
        PRFM PLDL1KEEP, [x5, 320]

        # Are there at least 4 floats (16 bytes) for prologue + epilogue?
        // Flags come from the SUBS above; MOV and PRFM leave them untouched.
        B.LO 4f

        # Prologue - First group loads, no FMA
        // Two A rows share one vector: even rows in the low 64 bits, odd rows
        // in the high 64 bits. The high half of v19 stays in x8 until BLOCK 0.
        LDR  d0, [x3], 8               // a0
        LDP  q16, q17, [x5], 32        // b
        LDR  d1, [x10], 8              // a2
        LDR  d2, [x12], 8              // a4
        LD1  {v0.d}[1],  [x9], 8       // a1
        LD1  {v1.d}[1], [x11], 8       // a3
        LD1  {v2.d}[1],  [x4], 8       // a5
        SUBS x0, x0, 16
        LDR  q18, [x5], 16
        LDR  d19, [x5], 8
        LDR   x8, [x5], 8   // ins is in BLOCK 0

        # Are there at least 4 floats (16 bytes) for main loop?
        // Flags come from the SUBS above; the loads do not change them.
        B.LO 2f

        # Main loop - 4 floats of A (16 bytes)
        # 48 FMA + 12 LD64 A + 16 LD64 B (8 B vectors)
        // Each 128-bit operand is assembled from two 64-bit loads: LDR dN fills
        // the low half, LDR x8 followed by INS vN.d[1] fills the high half.
        // NOTE(review): presumably scheduled this way for Cortex-A55 issue rules
        // (per the kernel name); the code itself does not state the reason.
        // B is addressed at [x5, 0..120]; x5 only advances in the last BLOCK 7.
1:
        # First group of 24 FMA, Second group loads
        // BLOCK 0
        FMLA v20.4s, v16.4s,  v0.s[0]
        LDR   d3, [x3], 8              // a0
        FMLA v22.4s, v16.4s,  v0.s[2]
        INS v19.d[1], x8               // b from second group
        FMLA v24.4s, v16.4s,  v1.s[0]
        LDR   x8, [x9], 8              // a1

        // BLOCK 1
        FMLA v26.4s, v16.4s,  v1.s[2]
        LDR  d12, [x5]
        FMLA v28.4s, v16.4s,  v2.s[0]
        INS v3.d[1], x8                // a1 ins
        FMLA v30.4s, v16.4s,  v2.s[2]
        LDR   x8, [x5, 8]              // b

        // BLOCK 2
        FMLA v21.4s, v17.4s,  v0.s[0]
        LDR   d4, [x10], 8             // a2
        FMLA v23.4s, v17.4s,  v0.s[2]
        INS v12.d[1], x8               // b  ins
        FMLA v25.4s, v17.4s,  v1.s[0]
        LDR   x8, [x11], 8             // a3

        // BLOCK 3
        FMLA v27.4s, v17.4s,  v1.s[2]
        LDR   d5, [x12], 8             // a4
        FMLA v29.4s, v17.4s,  v2.s[0]
        INS v4.d[1], x8                // a3 ins
        FMLA v31.4s, v17.4s,  v2.s[2]
        LDR   x8, [x4], 8              // a5

        // BLOCK 4
        FMLA v20.4s, v18.4s,  v0.s[1]
        LDR  d13, [x5, 16]
        FMLA v22.4s, v18.4s,  v0.s[3]
        INS v5.d[1], x8                // a5 ins
        FMLA v24.4s, v18.4s,  v1.s[1]
        LDR   x8, [x5, 24]

        // BLOCK 5
        FMLA v26.4s, v18.4s,  v1.s[3]
        LDR  d14, [x5, 32]
        FMLA v28.4s, v18.4s,  v2.s[1]
        INS v13.d[1], x8               // b
        FMLA v30.4s, v18.4s,  v2.s[3]
        LDR   x8, [x5, 40]

        // BLOCK 6
        FMLA v21.4s, v19.4s,  v0.s[1]
        LDR  d15, [x5, 48]
        FMLA v23.4s, v19.4s,  v0.s[3]
        INS v14.d[1], x8               // b
        FMLA v25.4s, v19.4s,  v1.s[1]
        LDR   x8, [x5, 56]

        // BLOCK 7
        FMLA v27.4s, v19.4s,  v1.s[3]
        FMLA v29.4s, v19.4s,  v2.s[1]
        INS v15.d[1], x8
        FMLA v31.4s, v19.4s,  v2.s[3]

        # Second group of 24 FMA, First group of loads
        // BLOCK 0
        FMLA v20.4s, v12.4s,  v3.s[0]
        LDR   d0, [x3], 8              // a0
        FMLA v22.4s, v12.4s,  v3.s[2]
        FMLA v24.4s, v12.4s,  v4.s[0]
        LDR   x8, [x9], 8              // a1

        // BLOCK 1
        FMLA v26.4s, v12.4s,  v4.s[2]
        LDR  d16, [x5, 64]
        FMLA v28.4s, v12.4s,  v5.s[0]
        INS v0.d[1], x8                // a1 ins
        FMLA v30.4s, v12.4s,  v5.s[2]
        LDR   x8, [x5, 72]             // b

        // BLOCK 2
        FMLA v21.4s, v13.4s,  v3.s[0]
        LDR   d1, [x10], 8             // a2
        FMLA v23.4s, v13.4s,  v3.s[2]
        INS v16.d[1], x8               // b
        FMLA v25.4s, v13.4s,  v4.s[0]
        LDR   x8, [x11], 8             // a3

        // BLOCK 3
        FMLA v27.4s, v13.4s,  v4.s[2]
        LDR   d2, [x12], 8             // a4
        FMLA v29.4s, v13.4s,  v5.s[0]
        INS v1.d[1], x8                // a3 ins
        FMLA v31.4s, v13.4s,  v5.s[2]
        LDR   x8,  [x4], 8             // a5

        // BLOCK 4
        FMLA v20.4s, v14.4s,  v3.s[1]
        LDR  d17, [x5, 80]
        FMLA v22.4s, v14.4s,  v3.s[3]
        INS v2.d[1], x8                // a5 ins
        FMLA v24.4s, v14.4s,  v4.s[1]
        LDR   x8, [x5, 88]

        // BLOCK 5
        FMLA v26.4s, v14.4s,  v4.s[3]
        LDR  d18, [x5, 96]
        FMLA v28.4s, v14.4s,  v5.s[1]
        INS v17.d[1], x8               // b
        FMLA v30.4s, v14.4s,  v5.s[3]
        LDR   x8, [x5, 104]

        // BLOCK 6
        FMLA v21.4s, v15.4s,  v3.s[1]
        LDR  d19, [x5, 112]
        FMLA v23.4s, v15.4s,  v3.s[3]
        INS v18.d[1], x8               // b
        FMLA v25.4s, v15.4s,  v4.s[1]
        LDR   x8, [x5, 120]            // high half of v19, inserted at the top of 1: or 2:

        // BLOCK 7
        FMLA v27.4s, v15.4s,  v4.s[3]
        SUBS x0, x0, 16
        FMLA v29.4s, v15.4s,  v5.s[1]
        ADD x5, x5, 128
        FMLA v31.4s, v15.4s,  v5.s[3]
        B.HS 1b

        # Epilogue - 4 floats of A (16 bytes)
        # 48 FMA + 6 LD64 A + 8 LD64 B (4 B vectors)
        // Same first group as the main loop, but the second group loads nothing
        // for a following iteration and prefetches the C rows instead.
2:
        # First group of 24 FMA, Second group loads
        // BLOCK 0
        FMLA v20.4s, v16.4s,  v0.s[0]
        LDR   d3, [x3], 8              // a0
        FMLA v22.4s, v16.4s,  v0.s[2]
        INS v19.d[1], x8               // b from second group
        FMLA v24.4s, v16.4s,  v1.s[0]
        LDR   x8, [x9], 8              // a1

        // BLOCK 1
        FMLA v26.4s, v16.4s,  v1.s[2]
        LDR  d12, [x5]
        FMLA v28.4s, v16.4s,  v2.s[0]
        INS v3.d[1], x8                // a1 ins
        FMLA v30.4s, v16.4s,  v2.s[2]
        LDR   x8, [x5, 8]              // b

        // BLOCK 2
        FMLA v21.4s, v17.4s,  v0.s[0]
        LDR   d4, [x10], 8             // a2
        FMLA v23.4s, v17.4s,  v0.s[2]
        INS v12.d[1], x8               // b  ins
        FMLA v25.4s, v17.4s,  v1.s[0]
        LDR   x8, [x11], 8             // a3

        // BLOCK 3
        FMLA v27.4s, v17.4s,  v1.s[2]
        LDR   d5, [x12], 8             // a4
        FMLA v29.4s, v17.4s,  v2.s[0]
        INS v4.d[1], x8                // a3 ins
        FMLA v31.4s, v17.4s,  v2.s[2]
        LDR   x8, [x4], 8              // a5

        // BLOCK 4
        FMLA v20.4s, v18.4s,  v0.s[1]
        LDR  d13, [x5, 16]
        FMLA v22.4s, v18.4s,  v0.s[3]
        INS v5.d[1], x8                // a5 ins
        FMLA v24.4s, v18.4s,  v1.s[1]
        LDR   x8, [x5, 24]

        // BLOCK 5
        FMLA v26.4s, v18.4s,  v1.s[3]
        LDR  d14, [x5, 32]
        FMLA v28.4s, v18.4s,  v2.s[1]
        INS v13.d[1], x8               // b
        FMLA v30.4s, v18.4s,  v2.s[3]
        LDR   x8, [x5, 40]

        // BLOCK 6
        FMLA v21.4s, v19.4s,  v0.s[1]
        LDR  d15, [x5, 48]
        FMLA v23.4s, v19.4s,  v0.s[3]
        INS v14.d[1], x8               // b
        FMLA v25.4s, v19.4s,  v1.s[1]
        LDR   x8, [x5, 56]

        // BLOCK 7
        FMLA v27.4s, v19.4s,  v1.s[3]
        FMLA v29.4s, v19.4s,  v2.s[1]
        INS v15.d[1], x8               // b
        FMLA v31.4s, v19.4s,  v2.s[3]

        # Second group of 24 FMA, no loads; prefetch C for the stores
        // BLOCK 0
        FMLA v20.4s, v12.4s,  v3.s[0]
        PRFM PSTL1KEEP,  [x6]          // Prefetch C0
        FMLA v22.4s, v12.4s,  v3.s[2]
        PRFM PSTL1KEEP, [x16]          // Prefetch C1
        FMLA v24.4s, v12.4s,  v4.s[0]
        PRFM PSTL1KEEP, [x17]          // Prefetch C2

        // BLOCK 1
        FMLA v26.4s, v12.4s,  v4.s[2]
        PRFM PSTL1KEEP, [x14]          // Prefetch C3
        FMLA v28.4s, v12.4s,  v5.s[0]
        PRFM PSTL1KEEP, [x13]          // Prefetch C4
        FMLA v30.4s, v12.4s,  v5.s[2]
        PRFM PSTL1KEEP, [x7]           // Prefetch C5

        // BLOCK 2
        FMLA v21.4s, v13.4s,  v3.s[0]
        FMLA v23.4s, v13.4s,  v3.s[2]
        FMLA v25.4s, v13.4s,  v4.s[0]

        // BLOCK 3
        FMLA v27.4s, v13.4s,  v4.s[2]
        FMLA v29.4s, v13.4s,  v5.s[0]
        FMLA v31.4s, v13.4s,  v5.s[2]

        // BLOCK 4
        FMLA v20.4s, v14.4s,  v3.s[1]
        FMLA v22.4s, v14.4s,  v3.s[3]
        FMLA v24.4s, v14.4s,  v4.s[1]

        // BLOCK 5
        FMLA v26.4s, v14.4s,  v4.s[3]
        FMLA v28.4s, v14.4s,  v5.s[1]
        FMLA v30.4s, v14.4s,  v5.s[3]
        // x0 is kc minus multiples of 16, so its low 4 bits equal those of kc.
        // The result is consumed by B.NE below; FMLA and ADD keep the flags.
        TST x0, 15

        // BLOCK 6
        FMLA v21.4s, v15.4s,  v3.s[1]
        FMLA v23.4s, v15.4s,  v3.s[3]
        FMLA v25.4s, v15.4s,  v4.s[1]
        ADD x5, x5, 64                 // skip the second set of B read at [x5, 0..56]

        // BLOCK 7
        FMLA v27.4s, v15.4s,  v4.s[3]
        FMLA v29.4s, v15.4s,  v5.s[1]
        FMLA v31.4s, v15.4s,  v5.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes) or less
        B.NE 4f
3:
        # Clamp
        FMAX v20.4s, v20.4s, v6.4s
        # Load cn_stride
        // Offset 32 = entry-time [sp] plus the 32 bytes pushed for d12-d15.
        // x0 is free to reuse: k is not needed past this point.
        LDR x0, [sp, 32]
        FMAX v21.4s, v21.4s, v6.4s
        FMAX v22.4s, v22.4s, v6.4s
        FMAX v23.4s, v23.4s, v6.4s
        FMAX v24.4s, v24.4s, v6.4s
        FMAX v25.4s, v25.4s, v6.4s
        FMAX v26.4s, v26.4s, v6.4s
        FMAX v27.4s, v27.4s, v6.4s
        FMAX v28.4s, v28.4s, v6.4s
        FMAX v29.4s, v29.4s, v6.4s
        FMAX v30.4s, v30.4s, v6.4s
        FMAX v31.4s, v31.4s, v6.4s
        // nc -= 8; these flags drive both B.LO 6f and B.HI 0b below.
        SUBS x1, x1, 8
        FMIN v20.4s, v20.4s, v7.4s
        FMIN v21.4s, v21.4s, v7.4s
        FMIN v22.4s, v22.4s, v7.4s
        FMIN v23.4s, v23.4s, v7.4s
        FMIN v24.4s, v24.4s, v7.4s
        FMIN v25.4s, v25.4s, v7.4s
        FMIN v26.4s, v26.4s, v7.4s
        FMIN v27.4s, v27.4s, v7.4s
        FMIN v28.4s, v28.4s, v7.4s
        FMIN v29.4s, v29.4s, v7.4s
        FMIN v30.4s, v30.4s, v7.4s
        FMIN v31.4s, v31.4s, v7.4s

        # Store full 6 x 8
        // Fewer than 8 columns were left: take the partial-width store.
        B.LO 6f

        // Each C pointer advances by cn_stride (x0) and each A pointer is
        // rewound by kc so the next panel re-reads the same A rows.
        ST1 {v20.16b, v21.16b},  [x6], x0
        SUB  x3,  x3, x2 // a0 -= kc
        ST1 {v22.16b, v23.16b}, [x16], x0
        SUB  x9,  x9, x2 // a1 -= kc
        ST1 {v24.16b, v25.16b}, [x17], x0
        SUB x10, x10, x2 // a2 -= kc
        ST1 {v26.16b, v27.16b}, [x14], x0
        SUB x11, x11, x2 // a3 -= kc
        ST1 {v28.16b, v29.16b}, [x13], x0
        SUB x12, x12, x2 // a4 -= kc
        ST1 {v30.16b, v31.16b},  [x7], x0
        SUB  x4,  x4, x2 // a5 -= kc

        // Still the flags of SUBS x1, x1, 8: loop while columns remain.
        // ST1 and SUB (without S) do not modify them.
        B.HI 0b

        // Restore d12-d15 from stack
        LDP d14, d15, [sp, 16]
        LDP d12, d13, [sp], 32
        RET

4:
        # Is there a remainder?- 2 floats of A (8 bytes)
        // Bit 3 of x0 mirrors bit 3 of kc (only multiples of 16 were subtracted).
        // NOTE(review): when this is reached with kc < 16 and bit 3 clear, label 5
        // runs unconditionally; that assumes kc is a nonzero multiple of 4.
        // Confirm against the caller contract.
        TBZ x0, 3, 5f

        # Remainder- 2 floats of A (8 bytes)
        LDR   d0,  [x3], 8
        LDR  q16, [x5], 16
        LD1   {v0.d}[1], [x9], 8
        LDR   d1, [x10], 8
        LD1   {v1.d}[1], [x11], 8
        LDR   d2, [x12], 8
        LD1   {v2.d}[1], [x4], 8
        LDR  q17, [x5], 16
        LDR  q18, [x5], 16
        LDR  q19, [x5], 16

        FMLA v20.4s, v16.4s,  v0.s[0]
        FMLA v22.4s, v16.4s,  v0.s[2]
        FMLA v24.4s, v16.4s,  v1.s[0]
        FMLA v26.4s, v16.4s,  v1.s[2]
        FMLA v28.4s, v16.4s,  v2.s[0]
        FMLA v30.4s, v16.4s,  v2.s[2]
        FMLA v21.4s, v17.4s,  v0.s[0]
        FMLA v23.4s, v17.4s,  v0.s[2]
        FMLA v25.4s, v17.4s,  v1.s[0]
        FMLA v27.4s, v17.4s,  v1.s[2]
        FMLA v29.4s, v17.4s,  v2.s[0]
        FMLA v31.4s, v17.4s,  v2.s[2]

        FMLA v20.4s, v18.4s,  v0.s[1]
        FMLA v22.4s, v18.4s,  v0.s[3]
        FMLA v24.4s, v18.4s,  v1.s[1]
        FMLA v26.4s, v18.4s,  v1.s[3]
        FMLA v28.4s, v18.4s,  v2.s[1]
        FMLA v30.4s, v18.4s,  v2.s[3]
        FMLA v21.4s, v19.4s,  v0.s[1]
        FMLA v23.4s, v19.4s,  v0.s[3]
        FMLA v25.4s, v19.4s,  v1.s[1]
        FMLA v27.4s, v19.4s,  v1.s[3]
        FMLA v29.4s, v19.4s,  v2.s[1]
        FMLA v31.4s, v19.4s,  v2.s[3]

        # Is there a remainder?- 1 float of A (4 bytes)
        TBZ x0, 2, 3b
5:
        # Remainder- 1 float of A (4 bytes)
        // Even rows go to lane 0 and odd rows to lane 2, matching the lane
        // indices used by the FMLAs below.
        LDR   s0,  [x3], 4
        LDR  q16, [x5], 16
        LD1   {v0.s}[2], [x9], 4
        LDR   s1, [x10], 4
        LD1   {v1.s}[2], [x11], 4
        LDR   s2, [x12], 4
        LD1   {v2.s}[2], [x4], 4
        LDR  q17, [x5], 16

        FMLA v20.4s, v16.4s,  v0.s[0]
        FMLA v22.4s, v16.4s,  v0.s[2]
        FMLA v24.4s, v16.4s,  v1.s[0]
        FMLA v26.4s, v16.4s,  v1.s[2]
        FMLA v28.4s, v16.4s,  v2.s[0]
        FMLA v30.4s, v16.4s,  v2.s[2]
        FMLA v21.4s, v17.4s,  v0.s[0]
        FMLA v23.4s, v17.4s,  v0.s[2]
        FMLA v25.4s, v17.4s,  v1.s[0]
        FMLA v27.4s, v17.4s,  v1.s[2]
        FMLA v29.4s, v17.4s,  v2.s[0]
        FMLA v31.4s, v17.4s,  v2.s[2]
        B 3b

        # Store odd width
        // x1 is nc - 8 here; its low 3 bits equal those of the remaining nc.
        // Write 4, then 2, then 1 column(s) according to bits 2, 1 and 0,
        // moving the not-yet-stored lanes down after each step.
6:
        TBZ x1, 2, 7f
        STR q20,  [x6], 16
        MOV v20.16b, v21.16b
        STR q22, [x16], 16
        MOV v22.16b, v23.16b
        STR q24, [x17], 16
        MOV v24.16b, v25.16b
        STR q26, [x14], 16
        MOV v26.16b, v27.16b
        STR q28, [x13], 16
        MOV v28.16b, v29.16b
        STR q30,  [x7], 16
        MOV v30.16b, v31.16b

7:
        TBZ x1, 1, 8f
        STR d20,  [x6], 8
        DUP d20, v20.d[1]
        STR d22, [x16], 8
        DUP d22, v22.d[1]
        STR d24, [x17], 8
        DUP d24, v24.d[1]
        STR d26, [x14], 8
        DUP d26, v26.d[1]
        STR d28, [x13], 8
        DUP d28, v28.d[1]
        STR d30,  [x7], 8
        DUP d30, v30.d[1]

8:
        TBZ x1, 0, 9f
        STR s20,  [x6]
        STR s22, [x16]
        STR s24, [x17]
        STR s26, [x14]
        STR s28, [x13]
        STR s30,  [x7]
9:
        // Restore d12-d15 from stack
        LDP d14, d15, [sp, 16]
        LDP d12, d13, [sp], 32
        RET

END_FUNCTION xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif