• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1// Copyright 2019 Google LLC
2//
3// This source code is licensed under the BSD-style license found in the
4// LICENSE file in the root directory of this source tree.
5
6#include <xnnpack/assembly.h>
7
# void xnn_f32_igemm_ukernel_4x12__aarch64_neonfma_cortex_a53(
#     size_t mr,                         x0
#     size_t nc,                         x1
#     size_t kc,                         x2 / x0
#     size_t ks,                         x3 / x9
#     const float**restrict a,           x4
#     const float*restrict w,            x5
#     float*restrict c,                  x6
#     size_t cm_stride,                  x7
#     size_t cn_stride,                  [sp] -> x10
#     size_t a_offset,                   [sp + 8] -> x11
#     const float* zero,                 [sp + 16] -> x12
#     const xnn_f32_output_params params [sp + 24] -> (x8)

# Computes one tile of up to 4 rows x 12 columns of C per pass of the nc loop.
# Each pass seeds the 12 accumulators of every row with 12 floats read from w,
# then walks the indirection buffer a in groups of 4 row pointers (32 bytes of
# pointers per ks step).  For every group, kc bytes of each row are multiplied
# with the weights that follow in w: 4 floats of A per main loop iteration,
# then optional 2 float and 1 float remainders.  A row pointer equal to zero
# is used as is, every other row pointer gets a_offset added.
# The result is clamped with FMIN against the first float at params and with
# FMAX against the second float at params, then stored through c0-c3.

# d8-d15 need to be preserved if used.
# x19-30 need to be preserved if used.
# x18 is the platform register (reserved by several OS ABIs) and is not used.

# A pointers
# x13 a0
# x14 a1
# x15 a2
# x16 a3

# C pointers
# x6  c0
# x17 c1
# x20 c2 (callee-saved, spilled to the stack frame)
# x7  c3 / cm_stride

# x8 temporary vector shadow register

# Stack frame (64 bytes, keeps sp 16 byte aligned)
#   [sp,  0]  d8,  d9
#   [sp, 16]  d10, d11
#   [sp, 32]  d14, d15
#   [sp, 48]  x20

# Vector register usage and GPR shadows
# A0  v0
# A1  v0[1]
# A2  v1
# A3  v1[1]
# A0  v2
# A1  v2[1]
# A2  v3
# A3  v3[1]
# B   v6  v7  v8
# B   v9 v10 v11
# B  v14 v15 v16
# B  v17 v18 v19
# C  v20 v21 v22
# C  v23 v24 v25
# C  v26 v27 v28
# C  v29 v30 v31
# Clamp v4 v5
# v12 to v13 unused.

BEGIN_FUNCTION xnn_f32_igemm_ukernel_4x12__aarch64_neonfma_cortex_a53

        # Load cn_stride, a_offset
        # Stack arguments are read before sp is moved by the register save.
        LDP x10, x11, [sp]

        # Load zero, clamping params pointer
        LDP x12, x8, [sp, 16]

        # Save d8-d11,d14,d15 and x20 on stack
        STP  d8,  d9, [sp, -64]!
        STP d10, d11, [sp, 16]
        STP d14, d15, [sp, 32]
        STR x20, [sp, 48]

        # Load clamping_params values
        # v4 = first float (upper bound, used by FMIN)
        # v5 = second float (lower bound, used by FMAX)
        LD2R {v4.4s, v5.4s}, [x8]

        # Clamp C pointers
        CMP x0, 2                // if mr < 2
        ADD x17, x6, x7          // c1 = c0 + cm_stride
        CSEL x17, x6, x17, LO    //   c1 = c0

        ADD x20, x17, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2 (flags still from CMP x0, 2)
        CSEL x20, x17, x20, LS   //   c2 = c1

        CMP x0, 4                // if mr < 4
        ADD x7, x20, x7          // c3 = c2 + cm_stride
        CSEL x7, x20, x7, LO     //   c3 = c2

0:
        # Load initial bias from w into accumulators
        # The same 12 bias floats seed all 4 rows.
        LD1 {v20.16b, v21.16b, v22.16b}, [x5], 48
        MOV v23.16b, v20.16b
        PRFM PLDL1KEEP, [x5,   0]  // Prefetch B
        MOV v24.16b, v21.16b
        PRFM PLDL1KEEP, [x5,  64]
        MOV v25.16b, v22.16b
        PRFM PLDL1KEEP, [x5, 128]
        MOV v26.16b, v20.16b
        PRFM PLDL1KEEP, [x5, 192]
        MOV v27.16b, v21.16b
        PRFM PLDL1KEEP, [x5, 256]
        MOV v28.16b, v22.16b
        PRFM PLDL1KEEP, [x5, 320]
        MOV v29.16b, v20.16b
        MOV v30.16b, v21.16b
        MOV v31.16b, v22.16b

        MOV x9, x3  // p = ks

1:
        # Load next 4 A pointers
        LDP x13, x14, [x4], 16
        LDP x15, x16, [x4], 16

        CMP x13, x12            // if a0 == zero
        ADD x13, x13, x11       // a0 += a_offset
        CSEL x13, x12, x13, EQ  //   a0 = zero, else a0 + a_offset
        CMP x14, x12            // if a1 == zero
        ADD x14, x14, x11       // a1 += a_offset
        CSEL x14, x12, x14, EQ  //   a1 = zero, else a1 + a_offset
        CMP x15, x12            // if a2 == zero
        ADD x15, x15, x11       // a2 += a_offset
        CSEL x15, x12, x15, EQ  //   a2 = zero, else a2 + a_offset
        CMP x16, x12            // if a3 == zero
        ADD x16, x16, x11       // a3 += a_offset
        CSEL x16, x12, x16, EQ  //   a3 = zero, else a3 + a_offset

        # Is there at least 4 floats (16 bytes) for prologue + epilogue?
        SUBS x0, x2, 16  // k = kc - 16

        PRFM PLDL1KEEP, [x13,  0]  // Prefetch A
        PRFM PLDL1KEEP, [x13, 64]
        PRFM PLDL1KEEP, [x14,  0]
        PRFM PLDL1KEEP, [x14, 64]
        PRFM PLDL1KEEP, [x15,  0]
        PRFM PLDL1KEEP, [x15, 64]
        PRFM PLDL1KEEP, [x16,  0]
        PRFM PLDL1KEEP, [x16, 64]
        B.LO 5f

        SUBS x0, x0, 16  // 4 floats for main loop

        # Prologue - loads for first group of 24 FMA

        # Read first block of 4 A.
        LDR d0, [x13], 8              // a0
        LDR d1, [x15], 8              // a2
        LD1 {v0.d}[1], [x14], 8       // a1
        LD1 {v1.d}[1], [x16], 8       // a3

        LD1 {v6.16b, v7.16b, v8.16b}, [x5], 48
        LD1 {v9.16b, v10.16b}, [x5], 32
        LDR d11, [x5], 8
        LDR x8, [x5], 8               // upper half of v11, inserted in BLOCK 0

        # Is there at least 4 floats (16 bytes) for main loop?
        # Flags are still those of the SUBS above; the loads do not touch them.
        B.LO 3f

        # Main loop - 4 floats of A (16 bytes)
2:
        # First group of 24 fma.  8 blocks of 4 cycles.  LDR + 3 FMA
        # A is loaded for 2nd group into v2/v3
        # INS is 4 blocks (16 cycles) after load

        # BLOCK 0
        LDR d2, [x13], 8               // a0
        INS v11.d[1], x8
        FMLA v20.4s, v6.4s, v0.s[0]
        LDR x8, [x14], 8               // a1
        FMLA v23.4s, v6.4s, v0.s[2]
        FMLA v26.4s, v6.4s, v1.s[0]
        PRFM PLDL1KEEP, [x13, 128]      // Prefetch A0

        # BLOCK 1
        LDR d3, [x15], 8               // a2
        INS v2.d[1], x8                // a1 was loaded in block 0
        FMLA v29.4s, v6.4s, v1.s[2]
        LDR x8, [x16], 8               // a3
        FMLA v21.4s, v7.4s, v0.s[0]
        FMLA v24.4s, v7.4s, v0.s[2]
        PRFM PLDL1KEEP, [x14, 128]      // Prefetch A1

        # BLOCK 2
        LDR d14, [x5]                  // vb0x0123
        INS v3.d[1], x8               // a3 was loaded in block 1
        FMLA v27.4s, v7.4s, v1.s[0]
        LDR x8, [x5, 8]
        FMLA v30.4s, v7.4s, v1.s[2]
        FMLA v22.4s, v8.4s, v0.s[0]
        PRFM PLDL1KEEP, [x15, 128]     // Prefetch A2

        # BLOCK 3
        LDR d15, [x5, 16]              // vb0x4567
        INS v14.d[1], x8               // v14 was loaded in block 2
        FMLA v25.4s, v8.4s, v0.s[2]
        LDR x8, [x5, 24]
        FMLA v28.4s, v8.4s, v1.s[0]
        FMLA v31.4s, v8.4s, v1.s[2]
        PRFM PLDL1KEEP, [x16, 128]      // Prefetch A3

        # BLOCK 4
        LDR d16, [x5, 32]              // vb0x89AB
        INS v15.d[1], x8
        FMLA v20.4s, v9.4s, v0.s[1]
        LDR x8, [x5, 40]
        FMLA v23.4s, v9.4s, v0.s[3]
        FMLA v26.4s, v9.4s, v1.s[1]
        PRFM PLDL1KEEP, [x5, 320]      // Prefetch B

        # BLOCK 5
        LDR d17, [x5, 48]              // vb1x0123
        INS v16.d[1], x8
        FMLA v29.4s, v9.4s, v1.s[3]
        LDR x8, [x5, 56]
        FMLA v21.4s, v10.4s, v0.s[1]
        FMLA v24.4s, v10.4s, v0.s[3]
        PRFM PLDL1KEEP, [x5, 384]      // Prefetch B

        # BLOCK 6
        LDR d18, [x5, 64]              // vb1x4567
        INS v17.d[1], x8
        FMLA v27.4s, v10.4s, v1.s[1]
        LDR x8, [x5, 72]
        FMLA v30.4s, v10.4s, v1.s[3]
        FMLA v22.4s, v11.4s, v0.s[1]
        PRFM PLDL1KEEP, [x5, 448]      // Prefetch B

        # BLOCK 7
        LDR d19, [x5, 80]              // vb1x89AB
        INS v18.d[1], x8
        FMLA v25.4s, v11.4s, v0.s[3]
        LDR x8, [x5, 88]
        FMLA v28.4s, v11.4s, v1.s[1]
        FMLA v31.4s, v11.4s, v1.s[3]

        # Second group of 24 fma.  8 blocks of 4 cycles.  LDR + 3 FMA
        # A is loaded for 1st group into v0/v1

        # BLOCK 0
        LDR d0, [x13], 8               // a0
        INS v19.d[1], x8
        FMLA v20.4s, v14.4s, v2.s[0]
        LDR x8, [x14], 8               // a1
        FMLA v23.4s, v14.4s, v2.s[2]
        FMLA v26.4s, v14.4s, v3.s[0]

        # BLOCK 1
        LDR d1, [x15], 8               // a2
        INS v0.d[1], x8                // a1
        FMLA v29.4s, v14.4s, v3.s[2]
        LDR x8, [x16], 8               // a3
        FMLA v21.4s, v15.4s, v2.s[0]
        FMLA v24.4s, v15.4s, v2.s[2]

        # BLOCK 2
        LDR d6, [x5, 96]               // vb0x0123
        INS v1.d[1], x8               // a3
        FMLA v27.4s, v15.4s, v3.s[0]
        LDR x8, [x5, 104]
        FMLA v30.4s, v15.4s, v3.s[2]
        FMLA v22.4s, v16.4s, v2.s[0]

        # BLOCK 3
        LDR d7, [x5, 112]              // vb0x4567
        INS v6.d[1], x8
        FMLA v25.4s, v16.4s, v2.s[2]
        LDR x8, [x5, 120]
        FMLA v28.4s, v16.4s, v3.s[0]
        FMLA v31.4s, v16.4s, v3.s[2]

        # BLOCK 4
        LDR d8, [x5, 128]              // vb0x89AB
        INS v7.d[1], x8
        FMLA v20.4s, v17.4s, v2.s[1]
        LDR x8, [x5, 136]
        FMLA v23.4s, v17.4s, v2.s[3]
        FMLA v26.4s, v17.4s, v3.s[1]

        # BLOCK 5
        LDR d9, [x5, 144]              // vb1x0123
        INS v8.d[1], x8
        FMLA v29.4s, v17.4s, v3.s[3]
        LDR x8, [x5, 152]
        FMLA v21.4s, v18.4s, v2.s[1]
        FMLA v24.4s, v18.4s, v2.s[3]

        # BLOCK 6
        LDR d10, [x5, 160]             // vb1x4567
        INS v9.d[1], x8
        FMLA v27.4s, v18.4s, v3.s[1]
        LDR x8, [x5, 168]
        FMLA v30.4s, v18.4s, v3.s[3]
        SUBS x0, x0, 16                // k -= 4 floats; flags consumed by B.HS
        FMLA v22.4s, v19.4s, v2.s[1]

        # BLOCK 7
        LDR d11, [x5, 176]             // vb1x89AB
        INS v10.d[1], x8
        FMLA v25.4s, v19.4s, v2.s[3]
        LDR x8, [x5, 184]
        FMLA v28.4s, v19.4s, v3.s[1]
        ADD x5, x5, 192                // 4 k steps * 12 floats of B consumed
        FMLA v31.4s, v19.4s, v3.s[3]
        B.HS 2b

        # Epilogue
        # First block same as main loop.  Second block has no loads.
3:
        # BLOCK 0
        LDR d2, [x13], 8               // a0
        INS v11.d[1], x8
        FMLA v20.4s, v6.4s, v0.s[0]
        LDR x8, [x14], 8               // a1
        FMLA v23.4s, v6.4s, v0.s[2]
        FMLA v26.4s, v6.4s, v1.s[0]

        # BLOCK 1
        LDR d3, [x15], 8               // a2
        INS v2.d[1], x8                // a1 was loaded in block 0
        FMLA v29.4s, v6.4s, v1.s[2]
        LDR x8, [x16], 8               // a3
        FMLA v21.4s, v7.4s, v0.s[0]
        FMLA v24.4s, v7.4s, v0.s[2]

        # BLOCK 2
        LDR d14, [x5]                  // vb0x0123
        INS v3.d[1], x8               // a3 was loaded in block 1
        FMLA v27.4s, v7.4s, v1.s[0]
        LDR x8, [x5, 8]
        FMLA v30.4s, v7.4s, v1.s[2]
        FMLA v22.4s, v8.4s, v0.s[0]

        # BLOCK 3
        LDR d15, [x5, 16]              // vb0x4567
        INS v14.d[1], x8               // v14 was loaded in block 2
        FMLA v25.4s, v8.4s, v0.s[2]
        LDR x8, [x5, 24]
        FMLA v28.4s, v8.4s, v1.s[0]
        FMLA v31.4s, v8.4s, v1.s[2]

        # BLOCK 4
        LDR d16, [x5, 32]              // vb0x89AB
        INS v15.d[1], x8
        FMLA v20.4s, v9.4s, v0.s[1]
        LDR x8, [x5, 40]
        FMLA v23.4s, v9.4s, v0.s[3]
        FMLA v26.4s, v9.4s, v1.s[1]

        # BLOCK 5
        LDR d17, [x5, 48]             // vb1x0123
        INS v16.d[1], x8
        FMLA v29.4s, v9.4s, v1.s[3]
        LDR x8, [x5, 56]
        FMLA v21.4s, v10.4s, v0.s[1]
        FMLA v24.4s, v10.4s, v0.s[3]

        # BLOCK 6
        LDR d18, [x5, 64]             // vb1x4567
        INS v17.d[1], x8
        FMLA v27.4s, v10.4s, v1.s[1]
        LDR x8, [x5, 72]
        FMLA v30.4s, v10.4s, v1.s[3]
        FMLA v22.4s, v11.4s, v0.s[1]

        # BLOCK 7
        LDR d19, [x5, 80]             // vb1x89AB
        INS v18.d[1], x8
        FMLA v25.4s, v11.4s, v0.s[3]
        LDR x8, [x5, 88]
        FMLA v28.4s, v11.4s, v1.s[1]
        FMLA v31.4s, v11.4s, v1.s[3]

        # Second group of 24 fma.  No loads, only the pending INS for v19.

        # BLOCK 0
        INS v19.d[1], x8
        FMLA v20.4s, v14.4s, v2.s[0]
        FMLA v23.4s, v14.4s, v2.s[2]
        FMLA v26.4s, v14.4s, v3.s[0]

        # BLOCK 1
        FMLA v29.4s, v14.4s, v3.s[2]
        FMLA v21.4s, v15.4s, v2.s[0]
        FMLA v24.4s, v15.4s, v2.s[2]

        # BLOCK 2
        FMLA v27.4s, v15.4s, v3.s[0]
        FMLA v30.4s, v15.4s, v3.s[2]
        FMLA v22.4s, v16.4s, v2.s[0]

        # BLOCK 3
        FMLA v25.4s, v16.4s, v2.s[2]
        FMLA v28.4s, v16.4s, v3.s[0]
        FMLA v31.4s, v16.4s, v3.s[2]

        # BLOCK 4
        FMLA v20.4s, v17.4s, v2.s[1]
        FMLA v23.4s, v17.4s, v2.s[3]
        FMLA v26.4s, v17.4s, v3.s[1]

        # BLOCK 5
        FMLA v29.4s, v17.4s, v3.s[3]
        FMLA v21.4s, v18.4s, v2.s[1]
        FMLA v24.4s, v18.4s, v2.s[3]

        # BLOCK 6
        FMLA v27.4s, v18.4s, v3.s[1]
        FMLA v30.4s, v18.4s, v3.s[3]
        FMLA v22.4s, v19.4s, v2.s[1]
        TST x0, 15                     // low 4 bits of k match those of kc

        # BLOCK 7
        FMLA v25.4s, v19.4s, v2.s[3]
        FMLA v28.4s, v19.4s, v3.s[1]
        ADD x5, x5, 96                 // 2 k steps * 12 floats of B consumed
        FMLA v31.4s, v19.4s, v3.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes) or less
        B.NE 5f

4:
        # ks loop
        SUBS x9, x9, 32  // p -= 4 * sizeof(void*)
        # B.HI instead of B.NE: same result when ks is a positive multiple of
        # 32, but the loop also ends if p would wrap below zero.
        B.HI 1b

        # Clamp
        FMIN v20.4s, v20.4s, v4.4s
        SUBS x1, x1, 12                // nc -= 12; flags kept until B.LO / B.HI
        FMIN v21.4s, v21.4s, v4.4s
        FMIN v22.4s, v22.4s, v4.4s
        FMIN v23.4s, v23.4s, v4.4s
        FMIN v24.4s, v24.4s, v4.4s
        FMIN v25.4s, v25.4s, v4.4s
        FMIN v26.4s, v26.4s, v4.4s
        FMIN v27.4s, v27.4s, v4.4s
        FMIN v28.4s, v28.4s, v4.4s
        FMIN v29.4s, v29.4s, v4.4s
        FMIN v30.4s, v30.4s, v4.4s
        FMIN v31.4s, v31.4s, v4.4s
        FMAX v20.4s, v20.4s, v5.4s
        FMAX v21.4s, v21.4s, v5.4s
        FMAX v22.4s, v22.4s, v5.4s
        FMAX v23.4s, v23.4s, v5.4s
        FMAX v24.4s, v24.4s, v5.4s
        FMAX v25.4s, v25.4s, v5.4s
        FMAX v26.4s, v26.4s, v5.4s
        FMAX v27.4s, v27.4s, v5.4s
        FMAX v28.4s, v28.4s, v5.4s
        FMAX v29.4s, v29.4s, v5.4s
        FMAX v30.4s, v30.4s, v5.4s
        FMAX v31.4s, v31.4s, v5.4s

        # Store full 4 x 12
        B.LO 8f

        ST1 {v29.16b, v30.16b, v31.16b},  [x7], x10
        ST1 {v26.16b, v27.16b, v28.16b}, [x20], x10
        ST1 {v23.16b, v24.16b, v25.16b}, [x17], x10
        ST1 {v20.16b, v21.16b, v22.16b},  [x6], x10
        SUB x4, x4, x3  // a -= ks

        # nc loop
        B.HI 0b

        # Restore x20,d8-d11,d14,d15 from stack
        LDR x20, [sp, 48]
        LDP d14, d15, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP  d8,  d9, [sp], 64
        RET

5:
        # Is there a remainder?- 2 floats of A (8 bytes)
        TBZ x0, 3, 6f

        # Remainder- 2 floats of A (8 bytes)
        LDR d0,  [x13], 8  // a0
        LD1 {v6.16b, v7.16b, v8.16b}, [x5], 48
        LDR d1, [x14], 8  // a1
        LDR d2, [x15], 8  // a2
        LDR d3,  [x16], 8 // a3
        LD1 {v9.16b, v10.16b, v11.16b}, [x5], 48

        # First block of 3 B
        FMLA v20.4s, v6.4s, v0.s[0]
        FMLA v23.4s, v6.4s, v1.s[0]
        FMLA v26.4s, v6.4s, v2.s[0]
        FMLA v29.4s, v6.4s, v3.s[0]
        FMLA v21.4s, v7.4s, v0.s[0]
        FMLA v24.4s, v7.4s, v1.s[0]
        FMLA v27.4s, v7.4s, v2.s[0]
        FMLA v30.4s, v7.4s, v3.s[0]
        FMLA v22.4s, v8.4s, v0.s[0]
        FMLA v25.4s, v8.4s, v1.s[0]
        FMLA v28.4s, v8.4s, v2.s[0]
        FMLA v31.4s, v8.4s, v3.s[0]

        # Second block of 3 B
        FMLA v20.4s, v9.4s, v0.s[1]
        FMLA v23.4s, v9.4s, v1.s[1]
        FMLA v26.4s, v9.4s, v2.s[1]
        FMLA v29.4s, v9.4s, v3.s[1]
        FMLA v21.4s, v10.4s, v0.s[1]
        FMLA v24.4s, v10.4s, v1.s[1]
        FMLA v27.4s, v10.4s, v2.s[1]
        FMLA v30.4s, v10.4s, v3.s[1]
        FMLA v22.4s, v11.4s, v0.s[1]
        FMLA v25.4s, v11.4s, v1.s[1]
        FMLA v28.4s, v11.4s, v2.s[1]
        FMLA v31.4s, v11.4s, v3.s[1]

        # Is there a remainder?- 1 floats of A (4 bytes)
        TBZ x0, 2, 4b
6:
        # Remainder- 1 floats of A (4 bytes)
        LDR s0,  [x13], 4  // a0
        LD1 {v6.16b, v7.16b, v8.16b}, [x5], 48
        LDR s1, [x14], 4  // a1
        LDR s2, [x15], 4  // a2
        LDR s3,  [x16], 4 // a3

        FMLA v20.4s, v6.4s, v0.s[0]
        FMLA v23.4s, v6.4s, v1.s[0]
        FMLA v26.4s, v6.4s, v2.s[0]
        FMLA v29.4s, v6.4s, v3.s[0]
        FMLA v21.4s, v7.4s, v0.s[0]
        FMLA v24.4s, v7.4s, v1.s[0]
        FMLA v27.4s, v7.4s, v2.s[0]
        FMLA v30.4s, v7.4s, v3.s[0]
        FMLA v22.4s, v8.4s, v0.s[0]
        FMLA v25.4s, v8.4s, v1.s[0]
        FMLA v28.4s, v8.4s, v2.s[0]
        FMLA v31.4s, v8.4s, v3.s[0]
        B 4b

8:
        ADD x1, x1, 12                 // undo the SUBS: x1 = remaining columns
        # Store odd channels
        # Bits 3..0 of x1 select stores of 8, 4, 2 and 1 columns.  After each
        # store the next unwritten columns are moved into the first register
        # of the row.
        TBZ x1, 3, 9f
        STP q29, q30,  [x7], 32
        MOV v29.16b, v31.16b
        STP q26, q27, [x20], 32
        MOV v26.16b, v28.16b
        STP q23, q24, [x17], 32
        MOV v23.16b, v25.16b
        STP q20, q21,  [x6], 32
        MOV v20.16b, v22.16b

9:
        TBZ x1, 2, 10f
        STR q29,  [x7], 16
        MOV v29.16b, v30.16b
        STR q26, [x20], 16
        MOV v26.16b, v27.16b
        STR q23,  [x17], 16
        MOV v23.16b, v24.16b
        STR q20,  [x6], 16
        MOV v20.16b, v21.16b

10:
        TBZ x1, 1, 11f
        STR d29,  [x7], 8
        DUP d29, v29.d[1]
        STR d26, [x20], 8
        DUP d26, v26.d[1]
        STR d23, [x17], 8
        DUP d23, v23.d[1]
        STR d20,  [x6], 8
        DUP d20, v20.d[1]

11:
        TBZ x1, 0, 12f
        STR s29,  [x7]
        STR s26, [x20]
        STR s23, [x17]
        STR s20,  [x6]
12:
        # Restore x20,d8-d11,d14,d15 from stack
        LDR x20, [sp, 48]
        LDP d14, d15, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP  d8,  d9, [sp], 64
        RET

END_FUNCTION xnn_f32_igemm_ukernel_4x12__aarch64_neonfma_cortex_a53
585
586#ifdef __ELF__
587.section ".note.GNU-stack","",%progbits
588#endif
589