// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

6#include <xnnpack/assembly.h>
7
# void xnn_f32_gemm${"inc" if INC else ""}_ukernel_6x8__aarch64_neonfma_cortex_a53(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x0 (reloaded before each store)
$if INC:
  #     const float*restrict acc,  [sp + 8] -> x15
  #     const union xnn_f32_output_params params[restrict static 1])  [sp + 16] -> x8
$else:
  #     const union xnn_f32_output_params params[restrict static 1])  [sp + 8] -> x8

# Computes a 6 row x 8 column tile of C per pass over kc and repeats while
# nc is above 8.  Rows beyond mr alias the previous row pointers, so the
# kernel always processes 6 rows.

# d8-d15 need to be preserved if used.
# x19-30 need to be preserved if used.
# x18 is the AAPCS64 platform register and is deliberately not used.

# Scalar register usage
#  x0 mr on entry, then k (remaining bytes of kc), then cn_stride while storing
#  x1 nc (remaining columns)
#  x2 kc (bytes)
#  x5 w (bias is read from here first when not INC, then B)
#  x8 params pointer on entry, then temporary vector shadow register
# x15 acc (INC only)

# A pointers
#  x3 a0
#  x9 a1
# x10 a2
# x11 a3
# x12 a4
#  x4 a5

# C pointers
#  x6 c0
# x16 c1
# x17 c2
# x14 c3
# x13 c4
#  x7 c5

# x8 temporary vector shadow register

# Vector register usage
# A0  v0     v3
# A1  v0[1]  v3[1]
# A2  v1     v4
# A3  v1[1]  v4[1]
# A4  v2     v5
# A5  v2[1]  v5[1]
# B   v12 v13 v14 v15 second set of B
# B   v16 v17 v18 v19 first set
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# C   v30 v31
# Clamp v6 v7
# unused A   v8 v9 v10 v11

BEGIN_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_ukernel_6x8__aarch64_neonfma_cortex_a53

        # Clamp A and C pointers
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2 (flags still from CMP x0, 2)
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x14, x17, x7         // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x14, x17, x14, LO   //   c3 = c2

        ADD x12, x11, x4         // a4 = a3 + a_stride
        ADD x13, x14, x7         // c4 = c3 + cm_stride
                                 // if mr <= 4 (flags still from CMP x0, 4)
        CSEL x12, x11, x12, LS   //   a4 = a3
        CSEL x13, x14, x13, LS   //   c4 = c3

        $if INC:
          # Load acc, params pointer
          LDP x15, x8, [sp, 8]
        $else:
          # Load params pointer
          LDR x8, [sp, 8]

        CMP x0, 6                // if mr < 6
        ADD x4, x12, x4          // a5 = a4 + a_stride
        ADD x7, x13, x7          // c5 = c4 + cm_stride
        CSEL x4, x12, x4, LO     //   a5 = a4
        CSEL x7, x13, x7, LO     //   c5 = c4

        # Load clamping_params values
        // v6 = first float of params (upper bound, used by FMIN)
        // v7 = second float of params (lower bound, used by FMAX)
        LD2R {v6.4s, v7.4s}, [x8]

        // Save d12-d15 on stack.  This moves sp down by 32 bytes, so the
        // stack argument cn_stride is found at [sp, 32] from here on.
        STP d12, d13, [sp, -32]!
        STP d14, d15, [sp, 16]

0:
        $if INC:
          # Load initial accumulators
          LDP q20, q21, [x15], 32
          LDP q22, q23, [x15], 32
          LDP q24, q25, [x15], 32
          LDP q26, q27, [x15], 32
          LDP q28, q29, [x15], 32
          LDP q30, q31, [x15], 32
          PRFM PLDL1KEEP,  [x3,  0]  // Prefetch A
          PRFM PLDL1KEEP,  [x3, 64]
          PRFM PLDL1KEEP,  [x9,  0]
          PRFM PLDL1KEEP,  [x9, 64]
          PRFM PLDL1KEEP, [x10,  0]
          PRFM PLDL1KEEP, [x10, 64]
          PRFM PLDL1KEEP, [x11,  0]
          PRFM PLDL1KEEP, [x11, 64]
          PRFM PLDL1KEEP, [x12,  0]
          PRFM PLDL1KEEP, [x12, 64]
          PRFM PLDL1KEEP,  [x4,  0]
          PRFM PLDL1KEEP,  [x4, 64]
          PRFM PLDL1KEEP, [x5,   0]  // Prefetch B
          PRFM PLDL1KEEP, [x5,  64]
          PRFM PLDL1KEEP, [x5, 128]
          PRFM PLDL1KEEP, [x5, 192]
        $else:
          # Load initial bias from w into accumulators
          LDP q20, q21, [x5], 32
          MOV v22.16b, v20.16b
          PRFM PLDL1KEEP,  [x3,  0]    // Prefetch A
          PRFM PLDL1KEEP,  [x3, 64]
          MOV v23.16b, v21.16b
          PRFM PLDL1KEEP,  [x9,  0]
          PRFM PLDL1KEEP,  [x9, 64]
          MOV v24.16b, v20.16b
          PRFM PLDL1KEEP, [x10,  0]
          PRFM PLDL1KEEP, [x10, 64]
          MOV v25.16b, v21.16b
          PRFM PLDL1KEEP, [x11,  0]
          PRFM PLDL1KEEP, [x11, 64]
          MOV v26.16b, v20.16b
          PRFM PLDL1KEEP, [x12,  0]
          PRFM PLDL1KEEP, [x12, 64]
          MOV v27.16b, v21.16b
          PRFM PLDL1KEEP,  [x4,  0]
          PRFM PLDL1KEEP,  [x4, 64]
          MOV v28.16b, v20.16b
          PRFM PLDL1KEEP, [x5,   0]  // Prefetch B
          MOV v29.16b, v21.16b
          PRFM PLDL1KEEP, [x5,  64]
          MOV v30.16b, v20.16b
          PRFM PLDL1KEEP, [x5, 128]
          MOV v31.16b, v21.16b
          PRFM PLDL1KEEP, [x5, 192]

        # Are there at least 4 floats (16 bytes) for prologue + epilogue?
        SUBS x0, x2, 16  // k = kc - 16
        B.LO 5f

        # Prologue - First group loads, no FMA
        LDR  d0, [x3], 8               // a0
        LDP q16, q17, [x5], 32         // b
        LDR  d1, [x10], 8              // a2
        LDR  d2, [x12], 8              // a4
        LD1  {v0.d}[1],  [x9], 8       // a1
        LD1  {v1.d}[1], [x11], 8       // a3
        LD1  {v2.d}[1],  [x4], 8       // a5
        SUBS x0, x0, 16
        LDR  q18, [x5], 16
        LDR  d19, [x5], 8
        LDR   x8, [x5], 8   // ins is in BLOCK 0

        # Are there at least 4 floats (16 bytes) for main loop?
        B.LO 2f

        # Main loop - 4 floats of A (16 bytes)
        # 48 FMA + 12 LD64 A + 8 B vectors (16 LD64)
        // Each 128-bit vector is assembled from a 64-bit LDR into the low
        // half plus a 64-bit LDR into x8 that is inserted into the high half.
1:
        # First group of 24 FMA, Second group loads
        // BLOCK 0
        LDR   d3, [x3], 8              // a0
        INS v19.d[1], x8               // b from second group
        FMLA v20.4s, v16.4s,  v0.s[0]
        LDR   x8, [x9], 8              // a1
        FMLA v22.4s, v16.4s,  v0.s[2]
        FMLA v24.4s, v16.4s,  v1.s[0]

        // BLOCK 1
        LDR  d12, [x5]
        INS v3.d[1], x8                // a1 ins
        FMLA v26.4s, v16.4s,  v1.s[2]
        LDR   x8, [x5, 8]              // b
        FMLA v28.4s, v16.4s,  v2.s[0]
        FMLA v30.4s, v16.4s,  v2.s[2]

        // BLOCK 2
        LDR   d4, [x10], 8             // a2
        INS v12.d[1], x8               // b  ins
        FMLA v21.4s, v17.4s,  v0.s[0]
        LDR   x8, [x11], 8             // a3
        FMLA v23.4s, v17.4s,  v0.s[2]
        FMLA v25.4s, v17.4s,  v1.s[0]

        // BLOCK 3
        LDR   d5, [x12], 8             // a4
        INS v4.d[1], x8                // a3 ins
        FMLA v27.4s, v17.4s,  v1.s[2]
        LDR   x8, [x4], 8              // a5
        FMLA v29.4s, v17.4s,  v2.s[0]
        FMLA v31.4s, v17.4s,  v2.s[2]

        // BLOCK 4
        LDR  d13, [x5, 16]
        INS v5.d[1], x8                // a5 ins
        FMLA v20.4s, v18.4s,  v0.s[1]
        LDR   x8, [x5, 24]
        FMLA v22.4s, v18.4s,  v0.s[3]
        FMLA v24.4s, v18.4s,  v1.s[1]

        // BLOCK 5
        LDR  d14, [x5, 32]
        INS v13.d[1], x8               // b
        FMLA v26.4s, v18.4s,  v1.s[3]
        LDR   x8, [x5, 40]
        FMLA v28.4s, v18.4s,  v2.s[1]
        FMLA v30.4s, v18.4s,  v2.s[3]

        // BLOCK 6
        LDR  d15, [x5, 48]
        INS v14.d[1], x8               // b
        FMLA v21.4s, v19.4s,  v0.s[1]
        LDR   x8, [x5, 56]
        FMLA v23.4s, v19.4s,  v0.s[3]
        FMLA v25.4s, v19.4s,  v1.s[1]

        // BLOCK 7
        INS v15.d[1], x8               // b
        FMLA v27.4s, v19.4s,  v1.s[3]
        FMLA v29.4s, v19.4s,  v2.s[1]
        FMLA v31.4s, v19.4s,  v2.s[3]

        # Second group of 24 FMA, First group of loads
        // BLOCK 0
        LDR   d0, [x3], 8              // a0
        FMLA v20.4s, v12.4s,  v3.s[0]
        LDR   x8, [x9], 8              // a1
        FMLA v22.4s, v12.4s,  v3.s[2]
        FMLA v24.4s, v12.4s,  v4.s[0]
        PRFM PLDL1KEEP, [x3, 128]      // Prefetch A0

        // BLOCK 1
        LDR  d16, [x5, 64]
        INS v0.d[1], x8                // a1 ins
        FMLA v26.4s, v12.4s,  v4.s[2]
        LDR   x8, [x5, 72]             // b
        FMLA v28.4s, v12.4s,  v5.s[0]
        FMLA v30.4s, v12.4s,  v5.s[2]
        PRFM PLDL1KEEP, [x9, 128]      // Prefetch A1

        // BLOCK 2
        LDR   d1, [x10], 8             // a2
        INS v16.d[1], x8               // b
        FMLA v21.4s, v13.4s,  v3.s[0]
        LDR   x8, [x11], 8             // a3
        FMLA v23.4s, v13.4s,  v3.s[2]
        FMLA v25.4s, v13.4s,  v4.s[0]
        PRFM PLDL1KEEP, [x10, 128]     // Prefetch A2

        // BLOCK 3
        LDR   d2, [x12], 8             // a4
        INS v1.d[1], x8                // a3 ins
        FMLA v27.4s, v13.4s,  v4.s[2]
        LDR   x8,  [x4], 8             // a5
        FMLA v29.4s, v13.4s,  v5.s[0]
        FMLA v31.4s, v13.4s,  v5.s[2]
        PRFM PLDL1KEEP, [x11, 128]     // Prefetch A3

        // BLOCK 4
        LDR  d17, [x5, 80]
        INS v2.d[1], x8                // a5 ins
        FMLA v20.4s, v14.4s,  v3.s[1]
        LDR   x8, [x5, 88]
        FMLA v22.4s, v14.4s,  v3.s[3]
        FMLA v24.4s, v14.4s,  v4.s[1]
        PRFM PLDL1KEEP, [x12, 128]     // Prefetch A4

        // BLOCK 5
        LDR  d18, [x5, 96]
        INS v17.d[1], x8               // b
        FMLA v26.4s, v14.4s,  v4.s[3]
        LDR   x8, [x5, 104]
        FMLA v28.4s, v14.4s,  v5.s[1]
        FMLA v30.4s, v14.4s,  v5.s[3]
        PRFM PLDL1KEEP, [x4, 128]      // Prefetch A5

        // BLOCK 6
        LDR  d19, [x5, 112]
        INS v18.d[1], x8               // b
        FMLA v21.4s, v15.4s,  v3.s[1]
        LDR   x8, [x5, 120]
        FMLA v23.4s, v15.4s,  v3.s[3]
        PRFM PLDL1KEEP, [x5, 192]      // Prefetch B
        FMLA v25.4s, v15.4s,  v4.s[1]
        PRFM PLDL1KEEP, [x5, 256]      // Prefetch B

        // BLOCK 7
        SUBS x0, x0, 16  // LDR lands here
        FMLA v27.4s, v15.4s,  v4.s[3]
        FMLA v29.4s, v15.4s,  v5.s[1]
        ADD x5, x5, 128
        FMLA v31.4s, v15.4s,  v5.s[3]
        B.HS 1b

        # Epilogue - 4 floats of A (16 bytes)
        # 48 FMA + 6 LD64 A + 4 B vectors (8 LD64)
2:
        # First group of 24 FMA, Second group loads
        // BLOCK 0
        LDR   d3, [x3], 8              // a0
        INS v19.d[1], x8               // b from second group
        FMLA v20.4s, v16.4s,  v0.s[0]
        LDR   x8, [x9], 8              // a1
        FMLA v22.4s, v16.4s,  v0.s[2]
        FMLA v24.4s, v16.4s,  v1.s[0]
        PRFM PSTL1KEEP,  [x6]          // Prefetch C0

        // BLOCK 1
        LDR  d12, [x5]
        INS v3.d[1], x8                // a1 ins
        FMLA v26.4s, v16.4s,  v1.s[2]
        LDR   x8, [x5, 8]              // b
        FMLA v28.4s, v16.4s,  v2.s[0]
        FMLA v30.4s, v16.4s,  v2.s[2]
        PRFM PSTL1KEEP, [x16]          // Prefetch C1

        // BLOCK 2
        LDR   d4, [x10], 8             // a2
        INS v12.d[1], x8               // b  ins
        FMLA v21.4s, v17.4s,  v0.s[0]
        LDR   x8, [x11], 8             // a3
        FMLA v23.4s, v17.4s,  v0.s[2]
        FMLA v25.4s, v17.4s,  v1.s[0]
        PRFM PSTL1KEEP, [x17]          // Prefetch C2

        // BLOCK 3
        LDR   d5, [x12], 8             // a4
        INS v4.d[1], x8                // a3 ins
        FMLA v27.4s, v17.4s,  v1.s[2]
        LDR   x8, [x4], 8              // a5
        FMLA v29.4s, v17.4s,  v2.s[0]
        FMLA v31.4s, v17.4s,  v2.s[2]
        PRFM PSTL1KEEP, [x14]          // Prefetch C3

        // BLOCK 4
        LDR  d13, [x5, 16]
        INS v5.d[1], x8                // a5 ins
        FMLA v20.4s, v18.4s,  v0.s[1]
        LDR   x8, [x5, 24]
        FMLA v22.4s, v18.4s,  v0.s[3]
        FMLA v24.4s, v18.4s,  v1.s[1]
        PRFM PSTL1KEEP, [x13]          // Prefetch C4

        // BLOCK 5
        LDR  d14, [x5, 32]
        INS v13.d[1], x8               // b
        FMLA v26.4s, v18.4s,  v1.s[3]
        LDR   x8, [x5, 40]
        FMLA v28.4s, v18.4s,  v2.s[1]
        FMLA v30.4s, v18.4s,  v2.s[3]
        PRFM PSTL1KEEP, [x7]           // Prefetch C5

        // BLOCK 6
        LDR  d15, [x5, 48]
        INS v14.d[1], x8               // b
        FMLA v21.4s, v19.4s,  v0.s[1]
        LDR   x8, [x5, 56]
        FMLA v23.4s, v19.4s,  v0.s[3]
        FMLA v25.4s, v19.4s,  v1.s[1]

        // BLOCK 7
        INS v15.d[1], x8               // b
        FMLA v27.4s, v19.4s,  v1.s[3]
        FMLA v29.4s, v19.4s,  v2.s[1]
        FMLA v31.4s, v19.4s,  v2.s[3]

        # Second group of 24 FMA, no loads
        // BLOCK 0
        FMLA v20.4s, v12.4s,  v3.s[0]
        FMLA v22.4s, v12.4s,  v3.s[2]
        FMLA v24.4s, v12.4s,  v4.s[0]

        // BLOCK 1
        FMLA v26.4s, v12.4s,  v4.s[2]
        FMLA v28.4s, v12.4s,  v5.s[0]
        FMLA v30.4s, v12.4s,  v5.s[2]

        // BLOCK 2
        FMLA v21.4s, v13.4s,  v3.s[0]
        FMLA v23.4s, v13.4s,  v3.s[2]
        FMLA v25.4s, v13.4s,  v4.s[0]

        // BLOCK 3
        FMLA v27.4s, v13.4s,  v4.s[2]
        FMLA v29.4s, v13.4s,  v5.s[0]
        FMLA v31.4s, v13.4s,  v5.s[2]

        // BLOCK 4
        FMLA v20.4s, v14.4s,  v3.s[1]
        FMLA v22.4s, v14.4s,  v3.s[3]
        FMLA v24.4s, v14.4s,  v4.s[1]

        // BLOCK 5
        FMLA v26.4s, v14.4s,  v4.s[3]
        FMLA v28.4s, v14.4s,  v5.s[1]
        FMLA v30.4s, v14.4s,  v5.s[3]

        // BLOCK 6
        FMLA v21.4s, v15.4s,  v3.s[1]
        FMLA v23.4s, v15.4s,  v3.s[3]
        FMLA v25.4s, v15.4s,  v4.s[1]
        TST x0, 15                     // any of the low 4 bits of k set?

        // BLOCK 7
        FMLA v27.4s, v15.4s,  v4.s[3]
        FMLA v29.4s, v15.4s,  v5.s[1]
        FMLA v31.4s, v15.4s,  v5.s[3]
        ADD x5, x5, 64                 // skip the 64 bytes of B read above

        # Is there a remainder?- 2 floats of A (8 bytes) or less
        B.NE 5f
4:
        # Clamp
        FMIN v20.4s, v20.4s, v6.4s
        SUBS x1, x1, 8                 // nc -= 8; flags consumed by B.LO / B.HI below
        FMIN v21.4s, v21.4s, v6.4s
        LDR x0, [sp, 32]               // x0 = cn_stride (k is dead here; 32 skips d12-d15 save area)
        FMIN v22.4s, v22.4s, v6.4s
        FMIN v23.4s, v23.4s, v6.4s
        FMIN v24.4s, v24.4s, v6.4s
        FMIN v25.4s, v25.4s, v6.4s
        FMIN v26.4s, v26.4s, v6.4s
        FMIN v27.4s, v27.4s, v6.4s
        FMIN v28.4s, v28.4s, v6.4s
        FMIN v29.4s, v29.4s, v6.4s
        FMIN v30.4s, v30.4s, v6.4s
        FMIN v31.4s, v31.4s, v6.4s
        FMAX v20.4s, v20.4s, v7.4s
        FMAX v21.4s, v21.4s, v7.4s
        FMAX v22.4s, v22.4s, v7.4s
        FMAX v23.4s, v23.4s, v7.4s
        FMAX v24.4s, v24.4s, v7.4s
        FMAX v25.4s, v25.4s, v7.4s
        FMAX v26.4s, v26.4s, v7.4s
        FMAX v27.4s, v27.4s, v7.4s
        FMAX v28.4s, v28.4s, v7.4s
        FMAX v29.4s, v29.4s, v7.4s
        FMAX v30.4s, v30.4s, v7.4s
        FMAX v31.4s, v31.4s, v7.4s

        # Store full 6 x 8
        B.LO 8f                        // fewer than 8 columns left: partial store

        $if INC:
          ST1 {v30.16b, v31.16b},  [x7], x0
          SUB  x3,  x3, x2 // a0 -= kc
          ST1 {v28.16b, v29.16b}, [x13], x0
          SUB  x9,  x9, x2 // a1 -= kc
          ST1 {v26.16b, v27.16b}, [x14], x0
          SUB x10, x10, x2 // a2 -= kc
          ST1 {v24.16b, v25.16b}, [x17], x0
          SUB x11, x11, x2 // a3 -= kc
          ST1 {v22.16b, v23.16b}, [x16], x0
          SUB x12, x12, x2 // a4 -= kc
          ST1 {v20.16b, v21.16b},  [x6], x0
          SUB  x4,  x4, x2 // a5 -= kc
        $else:
          ST1 {v20.16b, v21.16b},  [x6], x0
          SUB  x3,  x3, x2 // a0 -= kc
          ST1 {v22.16b, v23.16b}, [x16], x0
          SUB  x9,  x9, x2 // a1 -= kc
          ST1 {v24.16b, v25.16b}, [x17], x0
          SUB x10, x10, x2 // a2 -= kc
          ST1 {v26.16b, v27.16b}, [x14], x0
          SUB x11, x11, x2 // a3 -= kc
          ST1 {v28.16b, v29.16b}, [x13], x0
          SUB x12, x12, x2 // a4 -= kc
          ST1 {v30.16b, v31.16b},  [x7], x0
          SUB  x4,  x4, x2 // a5 -= kc

        B.HI 0b                        // more columns remain; x0 is recomputed at 0:

        // Restore d12-d15 from stack
        LDP d14, d15, [sp, 16]
        LDP d12, d13, [sp], 32
        RET

5:
        # Is there a remainder?- 2 floats of A (8 bytes)
        TBZ x0, 3, 6f

        # Remainder- 2 floats of A (8 bytes)
        LDR   d0,  [x3], 8
        LDR  q16, [x5], 16
        LD1   {v0.d}[1], [x9], 8
        LDR   d1, [x10], 8
        LD1   {v1.d}[1], [x11], 8
        LDR   d2, [x12], 8
        LD1   {v2.d}[1], [x4], 8
        LDR  q17, [x5], 16
        LDR  q18, [x5], 16
        LDR  q19, [x5], 16

        FMLA v20.4s, v16.4s,  v0.s[0]
        FMLA v22.4s, v16.4s,  v0.s[2]
        FMLA v24.4s, v16.4s,  v1.s[0]
        FMLA v26.4s, v16.4s,  v1.s[2]
        FMLA v28.4s, v16.4s,  v2.s[0]
        FMLA v30.4s, v16.4s,  v2.s[2]
        FMLA v21.4s, v17.4s,  v0.s[0]
        FMLA v23.4s, v17.4s,  v0.s[2]
        FMLA v25.4s, v17.4s,  v1.s[0]
        FMLA v27.4s, v17.4s,  v1.s[2]
        FMLA v29.4s, v17.4s,  v2.s[0]
        FMLA v31.4s, v17.4s,  v2.s[2]

        FMLA v20.4s, v18.4s,  v0.s[1]
        FMLA v22.4s, v18.4s,  v0.s[3]
        FMLA v24.4s, v18.4s,  v1.s[1]
        FMLA v26.4s, v18.4s,  v1.s[3]
        FMLA v28.4s, v18.4s,  v2.s[1]
        FMLA v30.4s, v18.4s,  v2.s[3]
        FMLA v21.4s, v19.4s,  v0.s[1]
        FMLA v23.4s, v19.4s,  v0.s[3]
        FMLA v25.4s, v19.4s,  v1.s[1]
        FMLA v27.4s, v19.4s,  v1.s[3]
        FMLA v29.4s, v19.4s,  v2.s[1]
        FMLA v31.4s, v19.4s,  v2.s[3]

        # Is there a remainder?- 1 float of A (4 bytes)
        TBZ x0, 2, 4b
6:
        # Remainder- 1 float of A (4 bytes)
        LDR   s0,  [x3], 4
        LDR  q16, [x5], 16
        LD1   {v0.s}[2], [x9], 4
        LDR   s1, [x10], 4
        LD1   {v1.s}[2], [x11], 4
        LDR   s2, [x12], 4
        LD1   {v2.s}[2], [x4], 4
        LDR  q17, [x5], 16

        FMLA v20.4s, v16.4s,  v0.s[0]
        FMLA v22.4s, v16.4s,  v0.s[2]
        FMLA v24.4s, v16.4s,  v1.s[0]
        FMLA v26.4s, v16.4s,  v1.s[2]
        FMLA v28.4s, v16.4s,  v2.s[0]
        FMLA v30.4s, v16.4s,  v2.s[2]
        FMLA v21.4s, v17.4s,  v0.s[0]
        FMLA v23.4s, v17.4s,  v0.s[2]
        FMLA v25.4s, v17.4s,  v1.s[0]
        FMLA v27.4s, v17.4s,  v1.s[2]
        FMLA v29.4s, v17.4s,  v2.s[0]
        FMLA v31.4s, v17.4s,  v2.s[2]
        B 4b

        # Store odd width
        // Bits 2, 1 and 0 of x1 select a 4, 2 and 1 column store in turn;
        // after each store the remaining columns are shifted down.
8:
        TBZ x1, 2, 9f
        $if INC:
          STR q30,  [x7], 16
          MOV v30.16b, v31.16b
          STR q28, [x13], 16
          MOV v28.16b, v29.16b
          STR q26, [x14], 16
          MOV v26.16b, v27.16b
          STR q24, [x17], 16
          MOV v24.16b, v25.16b
          STR q22, [x16], 16
          MOV v22.16b, v23.16b
          STR q20,  [x6], 16
          MOV v20.16b, v21.16b
        $else:
          STR q20,  [x6], 16
          MOV v20.16b, v21.16b
          STR q22, [x16], 16
          MOV v22.16b, v23.16b
          STR q24, [x17], 16
          MOV v24.16b, v25.16b
          STR q26, [x14], 16
          MOV v26.16b, v27.16b
          STR q28, [x13], 16
          MOV v28.16b, v29.16b
          STR q30,  [x7], 16
          MOV v30.16b, v31.16b

9:
        TBZ x1, 1, 10f
        $if INC:
          STR d30,  [x7], 8
          DUP d30, v30.d[1]
          STR d28, [x13], 8
          DUP d28, v28.d[1]
          STR d26, [x14], 8
          DUP d26, v26.d[1]
          STR d24, [x17], 8
          DUP d24, v24.d[1]
          STR d22, [x16], 8
          DUP d22, v22.d[1]
          STR d20,  [x6], 8
          DUP d20, v20.d[1]
        $else:
          STR d20,  [x6], 8
          DUP d20, v20.d[1]
          STR d22, [x16], 8
          DUP d22, v22.d[1]
          STR d24, [x17], 8
          DUP d24, v24.d[1]
          STR d26, [x14], 8
          DUP d26, v26.d[1]
          STR d28, [x13], 8
          DUP d28, v28.d[1]
          STR d30,  [x7], 8
          DUP d30, v30.d[1]

10:
        TBZ x1, 0, 11f
        $if INC:
          STR s30,  [x7]
          STR s28, [x13]
          STR s26, [x14]
          STR s24, [x17]
          STR s22, [x16]
          STR s20,  [x6]
        $else:
          STR s20,  [x6]
          STR s22, [x16]
          STR s24, [x17]
          STR s26, [x14]
          STR s28, [x13]
          STR s30,  [x7]
11:
        // Restore d12-d15 from stack
        LDP d14, d15, [sp, 16]
        LDP d12, d13, [sp], 32
        RET

END_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_ukernel_6x8__aarch64_neonfma_cortex_a53
659
// On ELF targets, emit an empty .note.GNU-stack section so the linker does
// not mark the stack executable because of this object.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif
663