• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1// Copyright 2019 Google LLC
2//
3// This source code is licensed under the BSD-style license found in the
4// LICENSE file in the root directory of this source tree.
5
6#include <xnnpack/assembly.h>
7
8# void xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53(
9#     size_t mr,                         x0
10#     size_t nc,                         x1
11#     size_t kc,                         x2 / x0
12#     size_t ks,                         x3 / x9
13#     const float**restrict a,           x4
14#     const void*restrict w,             x5
15#     uint8_t*restrict c,                x6
16#     size_t cm_stride,                  x7
17#     size_t cn_stride,                  [sp] -> (x0)
18#     size_t a_offset,                   [sp + 8] -> x11
19#     const float* zero,                 [sp + 16] -> x12
20#     const xnn_f32_minmax_params params [sp + 24] -> x8
21
22# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
23
24# A pointers
25# x14 a0
26# x15 a1
27# x20 a2
28# x21 a3
29# x22 a4
30# x23 a5
31
32# C pointers
33#  x6 c0
34# x16 c1
35# x17 c2
36# x10 c3
37# x13 c4
38#  x7 c5
39
40# x19 temporary vector shadow register
41
42# Vector register usage
43# A0  v0     v3
44# A1  v0[1]  v3[1]
45# A2  v1     v4
46# A3  v1[1]  v4[1]
47# A4  v2     v5
48# A5  v2[1]  v5[1]
49# B   v12 v13 v14 v15 second set of B
50# B   v16 v17 v18 v19 first set
51# C   v20 v21
52# C   v22 v23
53# C   v24 v25
54# C   v26 v27
55# C   v28 v29
56# C   v30 v31
57# Clamp v6 v7
58# unused A   v8 v9 v10 v11
59
60BEGIN_FUNCTION xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53
61
        // Overview (derived from the code below):
        //   Produces a tile of up to 6 rows x 8 f32 columns per pass of the nc loop.
        //   label 0  - nc loop: accumulators v20-v31 are initialised from the
        //              first 8 floats at w (x5), one copy per row.
        //   label 1  - ks loop: loads 6 A row pointers from a (x4); a pointer equal
        //              to `zero` (x12) is used unchanged, any other gets a_offset
        //              (x11) added.
        //   label 2  - main loop, label 3 - epilogue: each consumes 4 floats
        //              (16 bytes) from every A row and 128 bytes of B.
        //   label 5 / label 6 - remainders of 2 floats and 1 float of A.
        //   label 4  - ks loop tail, then clamp to max(acc, v6) / min(acc, v7)
        //              and store.
        //   label 7 to label 10 - partial-width stores when fewer than 8 columns
        //              remain.
        //   x0 is reused: mr on entry, then the k byte counter, then cn_stride.
        //   NOTE(review): the remainder paths appear to assume kc is a non-zero
        //   multiple of sizeof(float) - confirm against the caller.
62        # Clamp C pointers
        // A row index >= mr reuses the pointer of the previous row (CSEL below).
        // The full-width store writes c5 first and c0 last, so when pointers
        // alias, the lowest aliased row is the one written last.
63        CMP x0, 2                // if mr < 2
64        ADD x16, x6, x7          // c1 = c0 + cm_stride
65        CSEL x16, x6, x16, LO    //   c1 = c0
66
67        ADD x17, x16, x7         // c2 = c1 + cm_stride
68                                 // if mr <= 2
69        CSEL x17, x16, x17, LS   //   c2 = c1
70
71        CMP x0, 4                // if mr < 4
72        ADD x10, x17, x7         // c3 = c2 + cm_stride
73        CSEL x10, x17, x10, LO   //   c3 = c2
74
75        ADD x13, x10, x7         // c4 = c3 + cm_stride
76                                 // if mr <= 4
77        CSEL x13, x10, x13, LS   //   c4 = c3
78
79
80        CMP x0, 6                // if mr < 6
81        ADD x7, x13, x7          // c5 = c4 + cm_stride
82        CSEL x7, x13, x7, LO     //   c5 = c4
83
        // The stack arguments are read before sp is moved, so these offsets are
        // relative to the sp value on entry.
84        # Load a_offset
85        LDR x11, [sp, 8]
86
87        # Load zero, params pointer
88        LDP x12, x8, [sp, 16]
89
90        # Load min/max values
        // LD2R de-interleaves two consecutive floats at [x8] and replicates
        // each across a vector: first float -> v6, second float -> v7.
91        LD2R {v6.4s, v7.4s}, [x8]
92
93        // Save x19-x23, d12-d15 on stack
        // Allocates an 80-byte frame (72 bytes used). From here on the entry-time
        // stack arguments sit 80 bytes higher, so cn_stride is at [sp, 80].
94        STP d12, d13, [sp, -80]!
95        STP d14, d15, [sp, 16]
96        STP x19, x20, [sp, 32]
97        STP x21, x22, [sp, 48]
98        STR x23,      [sp, 64]
99
        // nc loop entry (target of the `0b` branch).
1000:
101        # Load initial bias from w into accumulators
        // v20/v21 hold the 8 values read from w; they are copied into the
        // accumulator pairs of rows 1-5.
102        LDP q20, q21, [x5], 32
103        MOV v22.16b, v20.16b
104        MOV v23.16b, v21.16b
105        PRFM PLDL1KEEP, [x5, 0]  // Prefetch B
106        MOV v24.16b, v20.16b
107        PRFM PLDL1KEEP, [x5, 64]
108        MOV v25.16b, v21.16b
109        PRFM PLDL1KEEP, [x5, 128]
110        MOV v26.16b, v20.16b
111        PRFM PLDL1KEEP, [x5, 192]
112        MOV v27.16b, v21.16b
113        MOV v28.16b, v20.16b
114        MOV v29.16b, v21.16b
115        MOV v30.16b, v20.16b
116        MOV v31.16b, v21.16b
117
118        MOV x9, x3  // p = ks
119
        // ks loop entry (target of the `1b` branch).
1201:
121        # Load next 6 A pointers
122        LDP x14, x15, [x4], 16
123        LDP x20, x21, [x4], 16
124        LDP x22, x23, [x4], 16
125
        // Each CMP compares the unmodified pointer; the ADD that follows does
        // not set flags, so the CSEL still sees the result of that CMP.
126        CMP x14, x12            // if a0 == zero
127        ADD x14, x14, x11       // a0 += a_offset
128        CSEL x14, x12, x14, EQ  //   a0 = zero, else a0 += a_offset
129        CMP x15, x12            // if a1 == zero
130        ADD x15, x15, x11       // a1 += a_offset
131        CSEL x15, x12, x15, EQ  //   a1 = zero, else a1 += a_offset
132        CMP x20, x12            // if a2 == zero
133        ADD x20, x20, x11       // a2 += a_offset
134        CSEL x20, x12, x20, EQ  //   a2 = zero, else a2 += a_offset
135        CMP x21, x12            // if a3 == zero
136        ADD x21, x21, x11       // a3 += a_offset
137        CSEL x21, x12, x21, EQ  //   a3 = zero, else a3 += a_offset
138        CMP x22, x12            // if a4 == zero
139        ADD x22, x22, x11       // a4 += a_offset
140        CSEL x22, x12, x22, EQ  //   a4 = zero, else a4 += a_offset
141        CMP x23, x12            // if a5 == zero
142        ADD x23, x23, x11       // a5 += a_offset
143        CSEL x23, x12, x23, EQ  //   a5 = zero, else a5 += a_offset
144
145        # Is there at least 4 floats (16 bytes) for prologue + epilogue?
        // From here x0 is the k byte counter; the mr value it held is no
        // longer needed.
146        SUBS x0, x2, 16  // k = kc - 16
147        B.LO 5f
148
149        # Prologue - First group loads, no FMA
        // Two A rows share one vector: a0/a1 in v0, a2/a3 in v1, a4/a5 in v2
        // (even row in the low 64 bits, odd row in the high 64 bits).
150        LDR  d0, [x14], 8              // a0
151        LDP q16, q17, [x5], 32         // b
152        LDR  d1, [x20], 8              // a2
153        LDR  d2, [x22], 8              // a4
154        LD1  {v0.d}[1], [x15], 8       // a1
155        LD1  {v1.d}[1], [x21], 8       // a3
156        LD1  {v2.d}[1],  [x23], 8      // a5
        // The flags set here are consumed by `B.LO 3f` below; the loads in
        // between do not modify them.
157        SUBS x0, x0, 16
158        LDR  q18, [x5], 16
159        LDR  d19, [x5], 8
160        LDR  x19, [x5], 8   // ins is in BLOCK 0
161
162        # Is there at least 4 floats (16 bytes) for main loop?
163        B.LO 3f
164
165        # Main loop - 4 floats of A (16 bytes)
166        # 48 FMA + 12 LD64 A + 8 LDR B
        // Each of the 8 B vectors per group is assembled from two 64-bit loads:
        // LDR d<n> for the low half, then LDR x19 + INS v<n>.d[1] for the high half.
1672:
168        # First group of 24 FMA, Second group loads
169        // BLOCK 0
170        LDR   d3, [x14], 8             // a0
171        INS v19.d[1], x19              // b from second group
172        FMLA v20.4s, v16.4s,  v0.s[0]
173        LDR  x19, [x15], 8             // a1
174        FMLA v22.4s, v16.4s,  v0.s[2]
175        FMLA v24.4s, v16.4s,  v1.s[0]
176
177        // BLOCK 1
178        LDR  d12, [x5]
179        INS v3.d[1], x19               // a1 ins
180        FMLA v26.4s, v16.4s,  v1.s[2]
181        LDR  x19, [x5, 8]   // b
182        FMLA v28.4s, v16.4s,  v2.s[0]
183        FMLA v30.4s, v16.4s,  v2.s[2]
184
185        // BLOCK 2
186        LDR   d4, [x20], 8             // a2
187        INS v12.d[1], x19  // b  ins
188        FMLA v21.4s, v17.4s,  v0.s[0]
189        LDR  x19, [x21], 8             // a3
190        FMLA v23.4s, v17.4s,  v0.s[2]
191        FMLA v25.4s, v17.4s,  v1.s[0]
192
193        // BLOCK 3
194        LDR   d5, [x22], 8             // a4
195        INS v4.d[1], x19               // a3 ins
196        FMLA v27.4s, v17.4s,  v1.s[2]
197        LDR  x19, [x23], 8             // a5
198        FMLA v29.4s, v17.4s,  v2.s[0]
199        FMLA v31.4s, v17.4s,  v2.s[2]
200
201        // BLOCK 4
202        LDR  d13, [x5, 16]
203        INS v5.d[1], x19               // a5 ins
204        FMLA v20.4s, v18.4s,  v0.s[1]
205        LDR  x19, [x5, 24]
206        FMLA v22.4s, v18.4s,  v0.s[3]
207        FMLA v24.4s, v18.4s,  v1.s[1]
208
209        // BLOCK 5
210        LDR  d14, [x5, 32]
211        INS v13.d[1], x19  // b
212        FMLA v26.4s, v18.4s,  v1.s[3]
213        LDR  x19, [x5, 40]
214        FMLA v28.4s, v18.4s,  v2.s[1]
215        FMLA v30.4s, v18.4s,  v2.s[3]
216
217        // BLOCK 6
218        LDR  d15, [x5, 48]
219        INS v14.d[1], x19  // b
220        FMLA v21.4s, v19.4s,  v0.s[1]
221        LDR  x19, [x5, 56]
222        FMLA v23.4s, v19.4s,  v0.s[3]
223        FMLA v25.4s, v19.4s,  v1.s[1]
224
225        // BLOCK 7
226        INS v15.d[1], x19
227        FMLA v27.4s, v19.4s,  v1.s[3]
228        FMLA v29.4s, v19.4s,  v2.s[1]
229        FMLA v31.4s, v19.4s,  v2.s[3]
230
231        # Second group of 24 FMA, First group of loads
232        // BLOCK 0
233        LDR   d0, [x14], 8             // a0
234        FMLA v20.4s, v12.4s,  v3.s[0]
235        LDR  x19, [x15], 8             // a1
236        FMLA v22.4s, v12.4s,  v3.s[2]
237        FMLA v24.4s, v12.4s,  v4.s[0]
238        PRFM PLDL1KEEP, [x14, 128]     // Prefetch A0
239
240        // BLOCK 1
241        LDR  d16, [x5, 64]
242        INS v0.d[1], x19               // a1 ins
243        FMLA v26.4s, v12.4s,  v4.s[2]
244        LDR  x19, [x5, 72]  // b
245        FMLA v28.4s, v12.4s,  v5.s[0]
246        FMLA v30.4s, v12.4s,  v5.s[2]
247        PRFM PLDL1KEEP, [x15, 128]     // Prefetch A1
248
249        // BLOCK 2
250        LDR   d1, [x20], 8             // a2
251        INS v16.d[1], x19  // b
252        FMLA v21.4s, v13.4s,  v3.s[0]
253        LDR  x19, [x21], 8             // a3
254        FMLA v23.4s, v13.4s,  v3.s[2]
255        FMLA v25.4s, v13.4s,  v4.s[0]
256        PRFM PLDL1KEEP, [x20, 128]     // Prefetch A2
257
258        // BLOCK 3
259        LDR   d2, [x22], 8             // a4
260        INS v1.d[1], x19               // a3 ins
261        FMLA v27.4s, v13.4s,  v4.s[2]
262        LDR  x19,  [x23], 8            // a5
263        FMLA v29.4s, v13.4s,  v5.s[0]
264        FMLA v31.4s, v13.4s,  v5.s[2]
265        PRFM PLDL1KEEP, [x21, 128]     // Prefetch A3
266
267        // BLOCK 4
268        LDR  d17, [x5, 80]
269        INS v2.d[1], x19               // a5 ins
270        FMLA v20.4s, v14.4s,  v3.s[1]
271        LDR  x19, [x5, 88]
272        FMLA v22.4s, v14.4s,  v3.s[3]
273        FMLA v24.4s, v14.4s,  v4.s[1]
274        PRFM PLDL1KEEP, [x22, 128]     // Prefetch A4
275
276        // BLOCK 5
277        LDR  d18, [x5, 96]
278        INS v17.d[1], x19  // b
279        FMLA v26.4s, v14.4s,  v4.s[3]
280        LDR  x19, [x5, 104]
281        FMLA v28.4s, v14.4s,  v5.s[1]
282        FMLA v30.4s, v14.4s,  v5.s[3]
283        PRFM PLDL1KEEP, [x23, 128]     // Prefetch A5
284
285        // BLOCK 6
286        LDR  d19, [x5, 112]
287        INS v18.d[1], x19  // b
288        FMLA v21.4s, v15.4s,  v3.s[1]
        // x19 now holds the high half of v19; it is inserted by the INS in
        // BLOCK 0 of the next main-loop pass or of the epilogue.
289        LDR  x19, [x5, 120]
290        FMLA v23.4s, v15.4s,  v3.s[3]
291        PRFM PLDL1KEEP, [x5, 192]      // Prefetch B
292        FMLA v25.4s, v15.4s,  v4.s[1]
293        PRFM PLDL1KEEP, [x5, 256]      // Prefetch B
294
295        // BLOCK 7
        // The flags set here are consumed by `B.HS 2b`; the FMLA and ADD in
        // between do not modify them.
296        SUBS x0, x0, 16  // LDR lands here
297        FMLA v27.4s, v15.4s,  v4.s[3]
298        FMLA v29.4s, v15.4s,  v5.s[1]
299        ADD x5, x5, 128
300        FMLA v31.4s, v15.4s,  v5.s[3]
301        B.HS 2b
302
303        # Epilogue - 4 floats of A (16 bytes)
304        # 48 FMA + 6 LD64 A + 8 LD64 B
        // Same as one main-loop pass, but the second group loads nothing for a
        // following pass, and the C rows are prefetched for the stores.
3053:
306        # First group of 24 FMA, Second group loads
307        // BLOCK 0
308        LDR   d3, [x14], 8             // a0
309        INS v19.d[1], x19              // b from second group
310        FMLA v20.4s, v16.4s,  v0.s[0]
311        LDR  x19, [x15], 8             // a1
312        FMLA v22.4s, v16.4s,  v0.s[2]
313        FMLA v24.4s, v16.4s,  v1.s[0]
314        PRFM PSTL1KEEP,  [x6]          // Prefetch C0
315
316        // BLOCK 1
317        LDR  d12, [x5]
318        INS v3.d[1], x19               // a1 ins
319        FMLA v26.4s, v16.4s,  v1.s[2]
320        LDR  x19, [x5, 8]   // b
321        FMLA v28.4s, v16.4s,  v2.s[0]
322        FMLA v30.4s, v16.4s,  v2.s[2]
323        PRFM PSTL1KEEP, [x16]          // Prefetch C1
324
325        // BLOCK 2
326        LDR   d4, [x20], 8             // a2
327        INS v12.d[1], x19  // b  ins
328        FMLA v21.4s, v17.4s,  v0.s[0]
329        LDR  x19, [x21], 8             // a3
330        FMLA v23.4s, v17.4s,  v0.s[2]
331        FMLA v25.4s, v17.4s,  v1.s[0]
332        PRFM PSTL1KEEP, [x17]          // Prefetch C2
333
334        // BLOCK 3
335        LDR   d5, [x22], 8             // a4
336        INS v4.d[1], x19               // a3 ins
337        FMLA v27.4s, v17.4s,  v1.s[2]
338        LDR  x19, [x23], 8             // a5
339        FMLA v29.4s, v17.4s,  v2.s[0]
340        FMLA v31.4s, v17.4s,  v2.s[2]
341        PRFM PSTL1KEEP, [x10]          // Prefetch C3
342
343        // BLOCK 4
344        LDR  d13, [x5, 16]
345        INS v5.d[1], x19               // a5 ins
346        FMLA v20.4s, v18.4s,  v0.s[1]
347        LDR  x19, [x5, 24]
348        FMLA v22.4s, v18.4s,  v0.s[3]
349        FMLA v24.4s, v18.4s,  v1.s[1]
350        PRFM PSTL1KEEP, [x13]          // Prefetch C4
351
352        // BLOCK 5
353        LDR  d14, [x5, 32]
354        INS v13.d[1], x19  // b
355        FMLA v26.4s, v18.4s,  v1.s[3]
356        LDR  x19, [x5, 40]
357        FMLA v28.4s, v18.4s,  v2.s[1]
358        FMLA v30.4s, v18.4s,  v2.s[3]
359        PRFM PSTL1KEEP, [x7]           // Prefetch C5
360
361        // BLOCK 6
362        LDR  d15, [x5, 48]
363        INS v14.d[1], x19  // b
364        FMLA v21.4s, v19.4s,  v0.s[1]
365        LDR  x19, [x5, 56]
366        FMLA v23.4s, v19.4s,  v0.s[3]
367        FMLA v25.4s, v19.4s,  v1.s[1]
368
369        // BLOCK 7
370        INS v15.d[1], x19  // b from previous
371        FMLA v27.4s, v19.4s,  v1.s[3]
372        FMLA v29.4s, v19.4s,  v2.s[1]
373        FMLA v31.4s, v19.4s,  v2.s[3]
374
375        # Second group of 24 FMA, no loads
376        // BLOCK 0
377        FMLA v20.4s, v12.4s,  v3.s[0]
378        FMLA v22.4s, v12.4s,  v3.s[2]
379        FMLA v24.4s, v12.4s,  v4.s[0]
380
381        // BLOCK 1
382        FMLA v26.4s, v12.4s,  v4.s[2]
383        FMLA v28.4s, v12.4s,  v5.s[0]
384        FMLA v30.4s, v12.4s,  v5.s[2]
385
386        // BLOCK 2
387        FMLA v21.4s, v13.4s,  v3.s[0]
388        FMLA v23.4s, v13.4s,  v3.s[2]
389        FMLA v25.4s, v13.4s,  v4.s[0]
390
391        // BLOCK 3
392        FMLA v27.4s, v13.4s,  v4.s[2]
393        FMLA v29.4s, v13.4s,  v5.s[0]
394        FMLA v31.4s, v13.4s,  v5.s[2]
395
396        // BLOCK 4
397        FMLA v20.4s, v14.4s,  v3.s[1]
398        FMLA v22.4s, v14.4s,  v3.s[3]
399        FMLA v24.4s, v14.4s,  v4.s[1]
400
401        // BLOCK 5
402        FMLA v26.4s, v14.4s,  v4.s[3]
403        FMLA v28.4s, v14.4s,  v5.s[1]
404        FMLA v30.4s, v14.4s,  v5.s[3]
405
406        // BLOCK 6
407        FMLA v21.4s, v15.4s,  v3.s[1]
408        FMLA v23.4s, v15.4s,  v3.s[3]
409        FMLA v25.4s, v15.4s,  v4.s[1]
        // Tests the low 4 bits of the k counter (leftover bytes of A). The
        // result is consumed by `B.NE 5f` below; the FMLA and ADD in between
        // do not modify the flags.
410        TST x0, 15
411
412        // BLOCK 7
413        FMLA v27.4s, v15.4s,  v4.s[3]
414        FMLA v29.4s, v15.4s,  v5.s[1]
415        FMLA v31.4s, v15.4s,  v5.s[3]
        // Skip the 64 bytes of B (v12-v15) that the epilogue read with
        // offset addressing.
416        ADD x5, x5, 64
417
418        # Is there a remainder?- 2 floats of A (8 bytes) or less
419        B.NE 5f
420
        // Join point: also reached from the remainder code via `4b`.
4214:
422        # ks loop
423        SUBS x9, x9, 48  // ks -= MR * sizeof(void*)
424        B.HI 1b
425
426        # Clamp
427        FMAX v20.4s, v20.4s, v6.4s
428        # Load cn_stride
        // Offset 80 accounts for the 80-byte frame pushed at entry.
429        LDR x0, [sp, 80]
430        FMAX v21.4s, v21.4s, v6.4s
431        FMAX v22.4s, v22.4s, v6.4s
432        FMAX v23.4s, v23.4s, v6.4s
433        FMAX v24.4s, v24.4s, v6.4s
434        FMAX v25.4s, v25.4s, v6.4s
435        FMAX v26.4s, v26.4s, v6.4s
436        FMAX v27.4s, v27.4s, v6.4s
437        FMAX v28.4s, v28.4s, v6.4s
438        FMAX v29.4s, v29.4s, v6.4s
439        FMAX v30.4s, v30.4s, v6.4s
440        FMAX v31.4s, v31.4s, v6.4s
        // nc -= 8. These flags are consumed twice: by `B.LO 7f` (fewer than 8
        // columns left) and by `B.HI 0b` (more columns remain). Nothing between
        // here and those branches modifies the flags.
441        SUBS x1, x1, 8
442        FMIN v20.4s, v20.4s, v7.4s
443        FMIN v21.4s, v21.4s, v7.4s
444        FMIN v22.4s, v22.4s, v7.4s
445        FMIN v23.4s, v23.4s, v7.4s
446        FMIN v24.4s, v24.4s, v7.4s
447        FMIN v25.4s, v25.4s, v7.4s
448        FMIN v26.4s, v26.4s, v7.4s
449        FMIN v27.4s, v27.4s, v7.4s
450        FMIN v28.4s, v28.4s, v7.4s
451        FMIN v29.4s, v29.4s, v7.4s
452        FMIN v30.4s, v30.4s, v7.4s
453        FMIN v31.4s, v31.4s, v7.4s
454
455        # Store full 6 x 8 unless fewer than 8 columns remain (then go to 7)
456        B.LO 7f
457
        // Rows are written from c5 down to c0, and each C pointer is advanced
        // by cn_stride for the next pass of the nc loop.
458        STP q30, q31,  [x7]
459        ADD x7, x7, x0
460        STP q28, q29, [x13]
461        ADD x13, x13, x0
462        STP q26, q27, [x10]
463        ADD x10, x10, x0
464        STP q24, q25, [x17]
465        ADD x17, x17, x0
466        STP q22, q23, [x16]
467        ADD x16, x16, x0
468        STP q20, q21,  [x6]
469        ADD  x6,  x6, x0
470
        // Rewind the A pointer array by ks bytes so the next column block
        // walks the same pointers again.
471        SUB x4, x4, x3  // a -= ks
472
473        # nc loop
474        B.HI 0b
475
476        // Restore x19-x23, d12-d15 from stack
477        LDR x23,      [sp, 64]
478        LDP x21, x22, [sp, 48]
479        LDP x19, x20, [sp, 32]
480        LDP d14, d15, [sp, 16]
481        LDP d12, d13, [sp], 80
482        RET
483
        // Remainder entry. x0 is the k counter after a subtraction of 16 (or
        // of several), so its low 4 bits still equal the leftover byte count
        // of A.
4845:
485        # Is there a remainder?- 2 floats of A (8 bytes)
486        TBZ x0, 3, 6f
487
488        # Remainder- 2 floats of A (8 bytes)
489        LDR   d0, [x14], 8
490        LDR  q16, [x5], 16
491        LD1   {v0.d}[1], [x15], 8
492        LDR   d1, [x20], 8
493        LD1   {v1.d}[1], [x21], 8
494        LDR   d2, [x22], 8
495        LD1   {v2.d}[1], [x23], 8
496        LDR  q17, [x5], 16
497        LDR  q18, [x5], 16
498        LDR  q19, [x5], 16
499        FMLA v20.4s, v16.4s,  v0.s[0]
500        FMLA v22.4s, v16.4s,  v0.s[2]
501        FMLA v24.4s, v16.4s,  v1.s[0]
502        FMLA v26.4s, v16.4s,  v1.s[2]
503        FMLA v28.4s, v16.4s,  v2.s[0]
504        FMLA v30.4s, v16.4s,  v2.s[2]
505        FMLA v21.4s, v17.4s,  v0.s[0]
506        FMLA v23.4s, v17.4s,  v0.s[2]
507        FMLA v25.4s, v17.4s,  v1.s[0]
508        FMLA v27.4s, v17.4s,  v1.s[2]
509        FMLA v29.4s, v17.4s,  v2.s[0]
510        FMLA v31.4s, v17.4s,  v2.s[2]
511
512        FMLA v20.4s, v18.4s,  v0.s[1]
513        FMLA v22.4s, v18.4s,  v0.s[3]
514        FMLA v24.4s, v18.4s,  v1.s[1]
515        FMLA v26.4s, v18.4s,  v1.s[3]
516        FMLA v28.4s, v18.4s,  v2.s[1]
517        FMLA v30.4s, v18.4s,  v2.s[3]
518        FMLA v21.4s, v19.4s,  v0.s[1]
519        FMLA v23.4s, v19.4s,  v0.s[3]
520        FMLA v25.4s, v19.4s,  v1.s[1]
521        FMLA v27.4s, v19.4s,  v1.s[3]
522        FMLA v29.4s, v19.4s,  v2.s[1]
523        FMLA v31.4s, v19.4s,  v2.s[3]
524
525        # Is there a remainder?- 1 floats of A (4 bytes)
526        TBZ x0, 2, 4b
5276:
528        # Remainder- 1 floats of A (4 bytes)
        // One float per row: the even row goes to lane 0 and the odd row to
        // lane 2, matching the lane indices used by the FMLAs below.
529        LDR   s0,  [x14], 4
530        LDR  q16, [x5], 16
531        LD1   {v0.s}[2], [x15], 4
532        LDR   s1, [x20], 4
533        LD1   {v1.s}[2], [x21], 4
534        LDR   s2, [x22], 4
535        LD1   {v2.s}[2], [x23], 4
536        LDR  q17, [x5], 16
537
538        FMLA v20.4s, v16.4s,  v0.s[0]
539        FMLA v22.4s, v16.4s,  v0.s[2]
540        FMLA v24.4s, v16.4s,  v1.s[0]
541        FMLA v26.4s, v16.4s,  v1.s[2]
542        FMLA v28.4s, v16.4s,  v2.s[0]
543        FMLA v30.4s, v16.4s,  v2.s[2]
544        FMLA v21.4s, v17.4s,  v0.s[0]
545        FMLA v23.4s, v17.4s,  v0.s[2]
546        FMLA v25.4s, v17.4s,  v1.s[0]
547        FMLA v27.4s, v17.4s,  v1.s[2]
548        FMLA v29.4s, v17.4s,  v2.s[0]
549        FMLA v31.4s, v17.4s,  v2.s[2]
550        B 4b
551
552        # Store odd width
        // x1 holds nc - 8 here. Subtracting 8 leaves bits 0-2 unchanged, so
        // they still give the remaining column count: bit 2 = 4 columns,
        // bit 1 = 2 columns, bit 0 = 1 column. After each partial store the
        // unstored upper part is moved down into the low lanes.
5537:
554        TBZ x1, 2, 8f
555        STR q30,  [x7], 16
556        MOV v30.16b, v31.16b
557        STR q28, [x13], 16
558        MOV v28.16b, v29.16b
559        STR q26, [x10], 16
560        MOV v26.16b, v27.16b
561        STR q24, [x17], 16
562        MOV v24.16b, v25.16b
563        STR q22, [x16], 16
564        MOV v22.16b, v23.16b
565        STR q20,  [x6], 16
566        MOV v20.16b, v21.16b
5678:
568        TBZ x1, 1, 9f
569        STR d30,  [x7], 8
570        DUP d30, v30.d[1]
571        STR d28, [x13], 8
572        DUP d28, v28.d[1]
573        STR d26, [x10], 8
574        DUP d26, v26.d[1]
575        STR d24, [x17], 8
576        DUP d24, v24.d[1]
577        STR d22, [x16], 8
578        DUP d22, v22.d[1]
579        STR d20,  [x6], 8
580        DUP d20, v20.d[1]
581
5829:
583        TBZ x1, 0, 10f
584        STR s30,  [x7]
585        STR s28, [x13]
586        STR s26, [x10]
587        STR s24, [x17]
588        STR s22, [x16]
589        STR s20,  [x6]
59010:
591        // Restore x19-x23, d12-d15 from stack
592        LDR x23,      [sp, 64]
593        LDP x21, x22, [sp, 48]
594        LDP x19, x20, [sp, 32]
595        LDP d14, d15, [sp, 16]
596        LDP d12, d13, [sp], 80
597        RET
598
599END_FUNCTION xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53
600
601#ifdef __ELF__
602.section ".note.GNU-stack","",%progbits
603#endif
604