• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1// Copyright 2019 Google LLC
2//
3// This source code is licensed under the BSD-style license found in the
4// LICENSE file in the root directory of this source tree.
5
6#include <xnnpack/assembly.h>
7
8# void xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55(
9#     size_t mr,                x0
10#     size_t nc,                x1
11#     size_t kc,                x2 / x0
12#     const uint8_t*restrict a, x3
13#     size_t a_stride,          x4
14#     const void*restrict w,    x5
15#     uint8_t*restrict c,       x6
16#     size_t cm_stride,         x7
17#     size_t cn_stride,         [sp] -> (x0)
18$if INC:
19  #     const float*restrict acc,  [sp + 8] -> x15
20  #     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 16] -> x8
21$else:
22  #     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> x8
23
24# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
25
26# A pointers
27#  x3 a0
28#  x9 a1
29# x10 a2
30# x11 a3
31# x12 a4
32#  x4 a5
33
34# C pointers
35#  x6 c0
36# x16 c1
37# x17 c2
38# x14 c3
39# x13 c4
40#  x7 c5
41
42# x8 temporary vector shadow register
43
44# Vector register usage
45# A0  v0     v3
46# A1  v0[1]  v3[1]
47# A2  v1     v4
48# A3  v1[1]  v4[1]
49# A4  v2     v5
50# A5  v2[1]  v5[1]
51# B   v12 v13 v14 v15 second set of B
52# B   v16 v17 v18 v19 first set
53# C   v20 v21
54# C   v22 v23
55# C   v24 v25
56# C   v26 v27
57# C   v28 v29
58# C   v30 v31
59# Clamp v6 v7
60# unused A   v8 v9 v10 v11
61
62BEGIN_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55
// f32 GEMM micro-kernel with min/max clamping for a tile of up to 6 rows by
// 8 columns of C. Argument registers and stack slots are listed in the
// signature comment at the top of this file; kc and the strides are applied
// as byte counts. "$if INC" selects the template variant that starts from the
// caller-supplied acc buffer (x15) instead of the bias values read from w.
63
64        $if INC:
65          # Load acc, params pointer
66          LDP x15, x8, [sp, 8]
67        $else:
68          # Load params pointer
69          LDR x8, [sp, 8]
70
71        # Clamp A and C pointers
        // A row whose index is >= mr reuses the previous row's A and C pointers,
        // so everything below always handles six rows without branching on mr.
72        CMP x0, 2                // if mr < 2
73        ADD x9, x3, x4           // a1 = a0 + a_stride
74        ADD x16, x6, x7          // c1 = c0 + cm_stride
75        CSEL x9, x3, x9, LO      //   a1 = a0
76        CSEL x16, x6, x16, LO    //   c1 = c0
77
78        ADD x10, x9, x4          // a2 = a1 + a_stride
79        ADD x17, x16, x7         // c2 = c1 + cm_stride
        // The LS selects below still use the flags of the preceding CMP
        // (ADD and CSEL do not set flags); the same pattern repeats for mr 4.
80                                 // if mr <= 2
81        CSEL x10, x9, x10, LS    //   a2 = a1
82        CSEL x17, x16, x17, LS   //   c2 = c1
83
84        CMP x0, 4                // if mr < 4
85        ADD x11, x10, x4         // a3 = a2 + a_stride
86        ADD x14, x17, x7         // c3 = c2 + cm_stride
87        CSEL x11, x10, x11, LO   //   a3 = a2
88        CSEL x14, x17, x14, LO   //   c3 = c2
89
90        ADD x12, x11, x4         // a4 = a3 + a_stride
91        ADD x13, x14, x7         // c4 = c3 + cm_stride
92                                 // if mr <= 4
93        CSEL x12, x11, x12, LS   //   a4 = a3
94        CSEL x13, x14, x13, LS   //   c4 = c3
95
96        CMP x0, 6                // if mr < 6
97        ADD x4, x12, x4          // a5 = a4 + a_stride
98        ADD x7, x13, x7          // c5 = c4 + cm_stride
99        CSEL x4, x12, x4, LO     //   a5 = a4
100        CSEL x7, x13, x7, LO     //   c5 = c4
101
102        # Load min/max values
        // LD2R broadcasts the two consecutive floats at params: the first fills
        // v6 (the FMAX operand at label 3), the second fills v7 (the FMIN operand).
103        LD2R {v6.4s, v7.4s}, [x8]
104
105        // Save d12-d15 on stack
        // sp drops by 32 bytes here, so incoming stack arguments are now 32 bytes
        // further from sp (cn_stride is reloaded from [sp, 32] at label 3).
106        STP d12, d13, [sp, -32]!
107        STP d14, d15, [sp, 16]
108
1090:
        // Outer loop: each pass computes one 8-column tile. nc is decremented by 8
        // at label 3 and B.HI 0b returns here while columns remain.
110        $if INC:
111          # Load initial accumulators
          // 12 q registers = 6 rows x 8 floats, read sequentially from acc (x15).
112          LDP q20, q21, [x15], 32
113          LDP q22, q23, [x15], 32
114          LDP q24, q25, [x15], 32
115          LDP q26, q27, [x15], 32
116          LDP q28, q29, [x15], 32
117          LDP q30, q31, [x15], 32
118          SUBS x0, x2, 16  // k = kc - 16
119          PRFM PLDL1KEEP,  [x3,  0]  // Prefetch A
120          PRFM PLDL1KEEP,  [x3, 64]
121          PRFM PLDL1KEEP,  [x9,  0]
122          PRFM PLDL1KEEP,  [x9, 64]
123          PRFM PLDL1KEEP, [x10,  0]
124          PRFM PLDL1KEEP, [x10, 64]
125          PRFM PLDL1KEEP, [x11,  0]
126          PRFM PLDL1KEEP, [x11, 64]
127          PRFM PLDL1KEEP, [x12,  0]
128          PRFM PLDL1KEEP, [x12, 64]
129          PRFM PLDL1KEEP,  [x4,  0]
130          PRFM PLDL1KEEP,  [x4, 64]
131          PRFM PLDL1KEEP, [x5,   0]  // Prefetch B
132          PRFM PLDL1KEEP, [x5,  64]
133          PRFM PLDL1KEEP, [x5, 128]
134          PRFM PLDL1KEEP, [x5, 192]
135          PRFM PLDL1KEEP, [x5, 256]
136          PRFM PLDL1KEEP, [x5, 320]
137        $else:
138          # Load initial bias from w into accumulators
          // q20/q21 receive the 8 bias floats; the MOVs below copy them into the
          // accumulators of the other five rows, interleaved with the prefetches.
139          LDP q20, q21, [x5], 32
140          SUBS x0, x2, 16  // k = kc - 16
141          PRFM PLDL1KEEP,  [x3,  0]    // Prefetch A
142          PRFM PLDL1KEEP,  [x3, 64]
143          MOV v22.16b, v20.16b
144          PRFM PLDL1KEEP,  [x9,  0]
145          PRFM PLDL1KEEP,  [x9, 64]
146          MOV v23.16b, v21.16b
147          PRFM PLDL1KEEP, [x10,  0]
148          PRFM PLDL1KEEP, [x10, 64]
149          MOV v24.16b, v20.16b
150          PRFM PLDL1KEEP, [x11,  0]
151          PRFM PLDL1KEEP, [x11, 64]
152          MOV v25.16b, v21.16b
153          PRFM PLDL1KEEP, [x12,  0]
154          PRFM PLDL1KEEP, [x12, 64]
155          MOV v26.16b, v20.16b
156          PRFM PLDL1KEEP,  [x4,  0]
157          PRFM PLDL1KEEP,  [x4, 64]
158          PRFM PLDL1KEEP, [x5,   0]  // Prefetch B
159          MOV v27.16b, v21.16b
160          PRFM PLDL1KEEP, [x5,  64]
161          MOV v28.16b, v20.16b
162          PRFM PLDL1KEEP, [x5, 128]
163          MOV v29.16b, v21.16b
164          PRFM PLDL1KEEP, [x5, 192]
165          MOV v30.16b, v20.16b
166          PRFM PLDL1KEEP, [x5, 256]
167          MOV v31.16b, v21.16b
168          PRFM PLDL1KEEP, [x5, 320]
169
170        # Is there at least 4 floats (16 bytes) for prologue + epilogue?
        // Tests the flags of "SUBS x0, x2, 16" above (PRFM and MOV do not modify
        // flags): kc < 16 bytes goes straight to the remainder code at label 4.
171        B.LO 4f
172
173        # Prologue - First group loads, no FMA
        // Each A register packs two rows: the low 64 bits hold 2 floats of the
        // even row, the high 64 bits hold 2 floats of the odd row
        // (v0 = a0|a1, v1 = a2|a3, v2 = a4|a5).
        // The high half of v19 is left in x8 and inserted by BLOCK 0 below.
174        LDR  d0, [x3], 8               // a0
175        LDP  q16, q17, [x5], 32        // b
176        LDR  d1, [x10], 8              // a2
177        LDR  d2, [x12], 8              // a4
178        LD1  {v0.d}[1],  [x9], 8       // a1
179        LD1  {v1.d}[1], [x11], 8       // a3
180        LD1  {v2.d}[1],  [x4], 8       // a5
181        SUBS x0, x0, 16
182        LDR  q18, [x5], 16
183        LDR  d19, [x5], 8
184        LDR   x8, [x5], 8   // ins is in BLOCK 0
185
186        # Is there at least 4 floats (16 bytes) for main loop?
187        B.LO 2f
188
189        # Main loop - 4 floats of A (16 bytes)
190        # 48 FMA + 12 LD64 A + 8 LDR B
        // Vector registers are filled as two 64-bit halves: "LDR dN" writes the low
        // half, and the high half is loaded into x8 and moved in with INS vN.d[1].
        // NOTE(review): presumably this split is what lets loads pair with FMLAs
        // on Cortex-A55 - confirm against the core's optimization guide.
        // Lane use: s[0]/s[2] are the first k element of the even/odd row and
        // s[1]/s[3] the second. v16/v17 (v12/v13 in the second group) hold B
        // columns 0-3/4-7 for the first k element, v18/v19 (v14/v15) for the second.
1911:
192        # First group of 24 FMA, Second group loads
193        // BLOCK 0
194        FMLA v20.4s, v16.4s,  v0.s[0]
195        LDR   d3, [x3], 8              // a0
196        FMLA v22.4s, v16.4s,  v0.s[2]
197        INS v19.d[1], x8               // b from second group
198        FMLA v24.4s, v16.4s,  v1.s[0]
199        LDR   x8, [x9], 8              // a1
200
201        // BLOCK 1
202        FMLA v26.4s, v16.4s,  v1.s[2]
203        LDR  d12, [x5]
204        FMLA v28.4s, v16.4s,  v2.s[0]
205        INS v3.d[1], x8                // a1 ins
206        FMLA v30.4s, v16.4s,  v2.s[2]
207        LDR   x8, [x5, 8]              // b
208
209        // BLOCK 2
210        FMLA v21.4s, v17.4s,  v0.s[0]
211        LDR   d4, [x10], 8             // a2
212        FMLA v23.4s, v17.4s,  v0.s[2]
213        INS v12.d[1], x8               // b  ins
214        FMLA v25.4s, v17.4s,  v1.s[0]
215        LDR   x8, [x11], 8             // a3
216
217        // BLOCK 3
218        FMLA v27.4s, v17.4s,  v1.s[2]
219        LDR   d5, [x12], 8             // a4
220        FMLA v29.4s, v17.4s,  v2.s[0]
221        INS v4.d[1], x8                // a3 ins
222        FMLA v31.4s, v17.4s,  v2.s[2]
223        LDR   x8, [x4], 8              // a5
224
225        // BLOCK 4
226        FMLA v20.4s, v18.4s,  v0.s[1]
227        LDR  d13, [x5, 16]
228        FMLA v22.4s, v18.4s,  v0.s[3]
229        INS v5.d[1], x8                // a5 ins
230        FMLA v24.4s, v18.4s,  v1.s[1]
231        LDR   x8, [x5, 24]
232
233        // BLOCK 5
234        FMLA v26.4s, v18.4s,  v1.s[3]
235        LDR  d14, [x5, 32]
236        FMLA v28.4s, v18.4s,  v2.s[1]
237        INS v13.d[1], x8               // b
238        FMLA v30.4s, v18.4s,  v2.s[3]
239        LDR   x8, [x5, 40]
240
241        // BLOCK 6
242        FMLA v21.4s, v19.4s,  v0.s[1]
243        LDR  d15, [x5, 48]
244        FMLA v23.4s, v19.4s,  v0.s[3]
245        INS v14.d[1], x8               // b
246        FMLA v25.4s, v19.4s,  v1.s[1]
247        LDR   x8, [x5, 56]
248
249        // BLOCK 7
250        FMLA v27.4s, v19.4s,  v1.s[3]
251        FMLA v29.4s, v19.4s,  v2.s[1]
252        INS v15.d[1], x8
253        FMLA v31.4s, v19.4s,  v2.s[3]
254
255        # Second group of 24 FMA, First group of loads
        // The B loads at [x5, 64..120] refill v16-v19 for the next pass; the last
        // of them stays in x8 and is inserted into v19 by the next BLOCK 0
        // (at label 1 or label 2). x5 advances by 128 in BLOCK 7.
256        // BLOCK 0
257        FMLA v20.4s, v12.4s,  v3.s[0]
258        LDR   d0, [x3], 8              // a0
259        FMLA v22.4s, v12.4s,  v3.s[2]
260        FMLA v24.4s, v12.4s,  v4.s[0]
261        LDR   x8, [x9], 8              // a1
262
263        // BLOCK 1
264        FMLA v26.4s, v12.4s,  v4.s[2]
265        LDR  d16, [x5, 64]
266        FMLA v28.4s, v12.4s,  v5.s[0]
267        INS v0.d[1], x8                // a1 ins
268        FMLA v30.4s, v12.4s,  v5.s[2]
269        LDR   x8, [x5, 72]             // b
270
271        // BLOCK 2
272        FMLA v21.4s, v13.4s,  v3.s[0]
273        LDR   d1, [x10], 8             // a2
274        FMLA v23.4s, v13.4s,  v3.s[2]
275        INS v16.d[1], x8               // b
276        FMLA v25.4s, v13.4s,  v4.s[0]
277        LDR   x8, [x11], 8             // a3
278
279        // BLOCK 3
280        FMLA v27.4s, v13.4s,  v4.s[2]
281        LDR   d2, [x12], 8             // a4
282        FMLA v29.4s, v13.4s,  v5.s[0]
283        INS v1.d[1], x8                // a3 ins
284        FMLA v31.4s, v13.4s,  v5.s[2]
285        LDR   x8,  [x4], 8             // a5
286
287        // BLOCK 4
288        FMLA v20.4s, v14.4s,  v3.s[1]
289        LDR  d17, [x5, 80]
290        FMLA v22.4s, v14.4s,  v3.s[3]
291        INS v2.d[1], x8                // a5 ins
292        FMLA v24.4s, v14.4s,  v4.s[1]
293        LDR   x8, [x5, 88]
294
295        // BLOCK 5
296        FMLA v26.4s, v14.4s,  v4.s[3]
297        LDR  d18, [x5, 96]
298        FMLA v28.4s, v14.4s,  v5.s[1]
299        INS v17.d[1], x8               // b
300        FMLA v30.4s, v14.4s,  v5.s[3]
301        LDR   x8, [x5, 104]
302
303        // BLOCK 6
304        FMLA v21.4s, v15.4s,  v3.s[1]
305        LDR  d19, [x5, 112]
306        FMLA v23.4s, v15.4s,  v3.s[3]
307        INS v18.d[1], x8               // b
308        FMLA v25.4s, v15.4s,  v4.s[1]
309        LDR   x8, [x5, 120]
310
311        // BLOCK 7
312        FMLA v27.4s, v15.4s,  v4.s[3]
313        SUBS x0, x0, 16
314        FMLA v29.4s, v15.4s,  v5.s[1]
315        ADD x5, x5, 128
316        FMLA v31.4s, v15.4s,  v5.s[3]
        // B.HS repeats the loop while the SUBS above did not borrow (k stayed >= 0);
        // FMLA and ADD leave the flags untouched.
317        B.HS 1b
318
319        # Epilogue - 4 floats of A (16 bytes)
320        # 48 FMA + 12 LD64 A + 8 LDR B
        // NOTE: the load counts on the line above match the main loop. This
        // epilogue itself issues only the second-group loads (6 x 64-bit A,
        // 8 x 64-bit B) and then prefetches the six C rows instead of loading
        // a further group of A/B.
3212:
322        # First group of 24 FMA, Second group loads
323        // BLOCK 0
324        FMLA v20.4s, v16.4s,  v0.s[0]
325        LDR   d3, [x3], 8              // a0
326        FMLA v22.4s, v16.4s,  v0.s[2]
327        INS v19.d[1], x8               // b from second group
328        FMLA v24.4s, v16.4s,  v1.s[0]
329        LDR   x8, [x9], 8              // a1
330
331        // BLOCK 1
332        FMLA v26.4s, v16.4s,  v1.s[2]
333        LDR  d12, [x5]
334        FMLA v28.4s, v16.4s,  v2.s[0]
335        INS v3.d[1], x8                // a1 ins
336        FMLA v30.4s, v16.4s,  v2.s[2]
337        LDR   x8, [x5, 8]              // b
338
339        // BLOCK 2
340        FMLA v21.4s, v17.4s,  v0.s[0]
341        LDR   d4, [x10], 8             // a2
342        FMLA v23.4s, v17.4s,  v0.s[2]
343        INS v12.d[1], x8               // b  ins
344        FMLA v25.4s, v17.4s,  v1.s[0]
345        LDR   x8, [x11], 8             // a3
346
347        // BLOCK 3
348        FMLA v27.4s, v17.4s,  v1.s[2]
349        LDR   d5, [x12], 8             // a4
350        FMLA v29.4s, v17.4s,  v2.s[0]
351        INS v4.d[1], x8                // a3 ins
352        FMLA v31.4s, v17.4s,  v2.s[2]
353        LDR   x8, [x4], 8              // a5
354
355        // BLOCK 4
356        FMLA v20.4s, v18.4s,  v0.s[1]
357        LDR  d13, [x5, 16]
358        FMLA v22.4s, v18.4s,  v0.s[3]
359        INS v5.d[1], x8                // a5 ins
360        FMLA v24.4s, v18.4s,  v1.s[1]
361        LDR   x8, [x5, 24]
362
363        // BLOCK 5
364        FMLA v26.4s, v18.4s,  v1.s[3]
365        LDR  d14, [x5, 32]
366        FMLA v28.4s, v18.4s,  v2.s[1]
367        INS v13.d[1], x8               // b
368        FMLA v30.4s, v18.4s,  v2.s[3]
369        LDR   x8, [x5, 40]
370
371        // BLOCK 6
372        FMLA v21.4s, v19.4s,  v0.s[1]
373        LDR  d15, [x5, 48]
374        FMLA v23.4s, v19.4s,  v0.s[3]
375        INS v14.d[1], x8               // b
376        FMLA v25.4s, v19.4s,  v1.s[1]
377        LDR   x8, [x5, 56]
378
379        // BLOCK 7
380        FMLA v27.4s, v19.4s,  v1.s[3]
381        FMLA v29.4s, v19.4s,  v2.s[1]
382        INS v15.d[1], x8               // b
383        FMLA v31.4s, v19.4s,  v2.s[3]
384
385        # Second group of 24 FMA, First group of loads
        // No A/B loads are issued in this group: the load slots of BLOCK 0 and
        // BLOCK 1 carry PRFM PSTL1KEEP on the six C row pointers instead.
386        // BLOCK 0
387        FMLA v20.4s, v12.4s,  v3.s[0]
388        PRFM PSTL1KEEP,  [x6]          // Prefetch C0
389        FMLA v22.4s, v12.4s,  v3.s[2]
390        PRFM PSTL1KEEP, [x16]          // Prefetch C1
391        FMLA v24.4s, v12.4s,  v4.s[0]
392        PRFM PSTL1KEEP, [x17]          // Prefetch C2
393
394        // BLOCK 1
395        FMLA v26.4s, v12.4s,  v4.s[2]
396        PRFM PSTL1KEEP, [x14]          // Prefetch C3
397        FMLA v28.4s, v12.4s,  v5.s[0]
398        PRFM PSTL1KEEP, [x13]          // Prefetch C4
399        FMLA v30.4s, v12.4s,  v5.s[2]
400        PRFM PSTL1KEEP, [x7]           // Prefetch C5
401
402        // BLOCK 2
403        FMLA v21.4s, v13.4s,  v3.s[0]
404        FMLA v23.4s, v13.4s,  v3.s[2]
405        FMLA v25.4s, v13.4s,  v4.s[0]
406
407        // BLOCK 3
408        FMLA v27.4s, v13.4s,  v4.s[2]
409        FMLA v29.4s, v13.4s,  v5.s[0]
410        FMLA v31.4s, v13.4s,  v5.s[2]
411
412        // BLOCK 4
413        FMLA v20.4s, v14.4s,  v3.s[1]
414        FMLA v22.4s, v14.4s,  v3.s[3]
415        FMLA v24.4s, v14.4s,  v4.s[1]
416
417        // BLOCK 5
418        FMLA v26.4s, v14.4s,  v4.s[3]
419        FMLA v28.4s, v14.4s,  v5.s[1]
420        FMLA v30.4s, v14.4s,  v5.s[3]
        // Only multiples of 16 were subtracted from x0, so its low 4 bits equal
        // those of kc. TST clears Z when any of them is set (kc not a multiple of
        // 16 bytes); the result is consumed by "B.NE 4f" after BLOCK 7.
421        TST x0, 15
422
423        // BLOCK 6
424        FMLA v21.4s, v15.4s,  v3.s[1]
425        FMLA v23.4s, v15.4s,  v3.s[3]
426        FMLA v25.4s, v15.4s,  v4.s[1]
        // Step over the 64 bytes of B read at [x5, 0..56] in this epilogue.
427        ADD x5, x5, 64
428
429        // BLOCK 7
430        FMLA v27.4s, v15.4s,  v4.s[3]
431        FMLA v29.4s, v15.4s,  v5.s[1]
432        FMLA v31.4s, v15.4s,  v5.s[3]
433
434        # Is there a remainder?- 2 floats of A (8 bytes) or less
435        B.NE 4f
4363:
437        # Clamp
        // FMAX against v6 and then FMIN against v7 on all 12 accumulators. x0 is
        // no longer needed as the k counter and is reused for cn_stride.
438        FMAX v20.4s, v20.4s, v6.4s
439        # Load cn_stride
440        LDR x0, [sp, 32]
441        FMAX v21.4s, v21.4s, v6.4s
442        FMAX v22.4s, v22.4s, v6.4s
443        FMAX v23.4s, v23.4s, v6.4s
444        FMAX v24.4s, v24.4s, v6.4s
445        FMAX v25.4s, v25.4s, v6.4s
446        FMAX v26.4s, v26.4s, v6.4s
447        FMAX v27.4s, v27.4s, v6.4s
448        FMAX v28.4s, v28.4s, v6.4s
449        FMAX v29.4s, v29.4s, v6.4s
450        FMAX v30.4s, v30.4s, v6.4s
451        FMAX v31.4s, v31.4s, v6.4s
        // nc -= 8. These flags drive both "B.LO 6f" and "B.HI 0b" below; the
        // FMIN, ST1 and SUB instructions in between do not modify flags.
452        SUBS x1, x1, 8
453        FMIN v20.4s, v20.4s, v7.4s
454        FMIN v21.4s, v21.4s, v7.4s
455        FMIN v22.4s, v22.4s, v7.4s
456        FMIN v23.4s, v23.4s, v7.4s
457        FMIN v24.4s, v24.4s, v7.4s
458        FMIN v25.4s, v25.4s, v7.4s
459        FMIN v26.4s, v26.4s, v7.4s
460        FMIN v27.4s, v27.4s, v7.4s
461        FMIN v28.4s, v28.4s, v7.4s
462        FMIN v29.4s, v29.4s, v7.4s
463        FMIN v30.4s, v30.4s, v7.4s
464        FMIN v31.4s, v31.4s, v7.4s
465
466        # Store full 6 x 8
        // B.LO: fewer than 8 columns were left, so take the partial-width store
        // path at label 6. Otherwise each ST1 writes 8 floats and post-increments
        // its C pointer by cn_stride (x0), and each SUB rewinds an A pointer by kc
        // so the same rows of A are reused for the next column tile.
467        B.LO 6f
468
469        $if INC:
          // Same stores as the non-INC branch, issued from row 5 down to row 0.
470          ST1 {v30.16b, v31.16b},  [x7], x0
471          SUB  x3,  x3, x2 // a0 -= kc
472          ST1 {v28.16b, v29.16b}, [x13], x0
473          SUB  x9,  x9, x2 // a1 -= kc
474          ST1 {v26.16b, v27.16b}, [x14], x0
475          SUB x10, x10, x2 // a2 -= kc
476          ST1 {v24.16b, v25.16b}, [x17], x0
477          SUB x11, x11, x2 // a3 -= kc
478          ST1 {v22.16b, v23.16b}, [x16], x0
479          SUB x12, x12, x2 // a4 -= kc
480          ST1 {v20.16b, v21.16b},  [x6], x0
481          SUB  x4,  x4, x2 // a5 -= kc
482        $else:
483          ST1 {v20.16b, v21.16b},  [x6], x0
484          SUB  x3,  x3, x2 // a0 -= kc
485          ST1 {v22.16b, v23.16b}, [x16], x0
486          SUB  x9,  x9, x2 // a1 -= kc
487          ST1 {v24.16b, v25.16b}, [x17], x0
488          SUB x10, x10, x2 // a2 -= kc
489          ST1 {v26.16b, v27.16b}, [x14], x0
490          SUB x11, x11, x2 // a3 -= kc
491          ST1 {v28.16b, v29.16b}, [x13], x0
492          SUB x12, x12, x2 // a4 -= kc
493          ST1 {v30.16b, v31.16b},  [x7], x0
494          SUB  x4,  x4, x2 // a5 -= kc
495
        // Loop back to label 0 while nc is still above zero after the decrement.
496        B.HI 0b
497
498        // Restore d12-d15 from stack
499        LDP d14, d15, [sp, 16]
500        LDP d12, d13, [sp], 32
501        RET
502
5034:
        // Reached with the low 4 bits of x0 equal to those of kc: bit 3 set means
        // 8 more bytes (2 floats) per row remain, bit 2 means 4 bytes (1 float).
504        # Is there a remainder?- 2 floats of A (8 bytes)
505        TBZ x0, 3, 5f
506
507        # Remainder- 2 floats of A (8 bytes)
508        LDR   d0,  [x3], 8
509        LDR  q16, [x5], 16
510        LD1   {v0.d}[1], [x9], 8
511        LDR   d1, [x10], 8
512        LD1   {v1.d}[1], [x11], 8
513        LDR   d2, [x12], 8
514        LD1   {v2.d}[1], [x4], 8
515        LDR  q17, [x5], 16
516        LDR  q18, [x5], 16
517        LDR  q19, [x5], 16
518
519        FMLA v20.4s, v16.4s,  v0.s[0]
520        FMLA v22.4s, v16.4s,  v0.s[2]
521        FMLA v24.4s, v16.4s,  v1.s[0]
522        FMLA v26.4s, v16.4s,  v1.s[2]
523        FMLA v28.4s, v16.4s,  v2.s[0]
524        FMLA v30.4s, v16.4s,  v2.s[2]
525        FMLA v21.4s, v17.4s,  v0.s[0]
526        FMLA v23.4s, v17.4s,  v0.s[2]
527        FMLA v25.4s, v17.4s,  v1.s[0]
528        FMLA v27.4s, v17.4s,  v1.s[2]
529        FMLA v29.4s, v17.4s,  v2.s[0]
530        FMLA v31.4s, v17.4s,  v2.s[2]
531
532        FMLA v20.4s, v18.4s,  v0.s[1]
533        FMLA v22.4s, v18.4s,  v0.s[3]
534        FMLA v24.4s, v18.4s,  v1.s[1]
535        FMLA v26.4s, v18.4s,  v1.s[3]
536        FMLA v28.4s, v18.4s,  v2.s[1]
537        FMLA v30.4s, v18.4s,  v2.s[3]
538        FMLA v21.4s, v19.4s,  v0.s[1]
539        FMLA v23.4s, v19.4s,  v0.s[3]
540        FMLA v25.4s, v19.4s,  v1.s[1]
541        FMLA v27.4s, v19.4s,  v1.s[3]
542        FMLA v29.4s, v19.4s,  v2.s[1]
543        FMLA v31.4s, v19.4s,  v2.s[3]
544
545        # Is there a remainder?- 1 floats of A (4 bytes)
546        TBZ x0, 2, 3b
5475:
548        # Remainder- 1 floats of A (4 bytes)
        // The even row goes to lane 0 and the odd row to lane 2, so the same
        // s[0]/s[2] indexing as the paths above applies. Only v16/v17 (one k step
        // of B, 8 floats) are loaded.
549        LDR   s0,  [x3], 4
550        LDR  q16, [x5], 16
551        LD1   {v0.s}[2], [x9], 4
552        LDR   s1, [x10], 4
553        LD1   {v1.s}[2], [x11], 4
554        LDR   s2, [x12], 4
555        LD1   {v2.s}[2], [x4], 4
556        LDR  q17, [x5], 16
557
558        FMLA v20.4s, v16.4s,  v0.s[0]
559        FMLA v22.4s, v16.4s,  v0.s[2]
560        FMLA v24.4s, v16.4s,  v1.s[0]
561        FMLA v26.4s, v16.4s,  v1.s[2]
562        FMLA v28.4s, v16.4s,  v2.s[0]
563        FMLA v30.4s, v16.4s,  v2.s[2]
564        FMLA v21.4s, v17.4s,  v0.s[0]
565        FMLA v23.4s, v17.4s,  v0.s[2]
566        FMLA v25.4s, v17.4s,  v1.s[0]
567        FMLA v27.4s, v17.4s,  v1.s[2]
568        FMLA v29.4s, v17.4s,  v2.s[0]
569        FMLA v31.4s, v17.4s,  v2.s[2]
570        B 3b
571
572        # Store odd width
5736:
        // x1 holds (remaining columns - 8) here; subtracting 8 leaves bits 0-2
        // unchanged, so they still encode the remaining column count.
        // Bit 2: store 4 floats per row, then move the upper 4 columns down.
        // Bit 1: store 2 floats per row, then move the upper 2 floats down.
        // Bit 0: store the final single float per row.
574        TBZ x1, 2, 7f
575        $if INC:
576          STR q30,  [x7], 16
577          MOV v30.16b, v31.16b
578          STR q28, [x13], 16
579          MOV v28.16b, v29.16b
580          STR q26, [x14], 16
581          MOV v26.16b, v27.16b
582          STR q24, [x17], 16
583          MOV v24.16b, v25.16b
584          STR q22, [x16], 16
585          MOV v22.16b, v23.16b
586          STR q20,  [x6], 16
587          MOV v20.16b, v21.16b
588        $else:
589          STR q20,  [x6], 16
590          MOV v20.16b, v21.16b
591          STR q22, [x16], 16
592          MOV v22.16b, v23.16b
593          STR q24, [x17], 16
594          MOV v24.16b, v25.16b
595          STR q26, [x14], 16
596          MOV v26.16b, v27.16b
597          STR q28, [x13], 16
598          MOV v28.16b, v29.16b
599          STR q30,  [x7], 16
600          MOV v30.16b, v31.16b
601
6027:
603        TBZ x1, 1, 8f
604        $if INC:
605          STR d30,  [x7], 8
606          DUP d30, v30.d[1]
607          STR d28, [x13], 8
608          DUP d28, v28.d[1]
609          STR d26, [x14], 8
610          DUP d26, v26.d[1]
611          STR d24, [x17], 8
612          DUP d24, v24.d[1]
613          STR d22, [x16], 8
614          DUP d22, v22.d[1]
615          STR d20,  [x6], 8
616          DUP d20, v20.d[1]
617        $else:
618          STR d20,  [x6], 8
619          DUP d20, v20.d[1]
620          STR d22, [x16], 8
621          DUP d22, v22.d[1]
622          STR d24, [x17], 8
623          DUP d24, v24.d[1]
624          STR d26, [x14], 8
625          DUP d26, v26.d[1]
626          STR d28, [x13], 8
627          DUP d28, v28.d[1]
628          STR d30,  [x7], 8
629          DUP d30, v30.d[1]
630
6318:
632        TBZ x1, 0, 9f
633        $if INC:
634          STR s30,  [x7]
635          STR s28, [x13]
636          STR s26, [x14]
637          STR s24, [x17]
638          STR s22, [x16]
639          STR s20,  [x6]
640        $else:
641          STR s20,  [x6]
642          STR s22, [x16]
643          STR s24, [x17]
644          STR s26, [x14]
645          STR s28, [x13]
646          STR s30,  [x7]
6479:
648        // Restore d12-d15 from stack
649        LDP d14, d15, [sp, 16]
650        LDP d12, d13, [sp], 32
651        RET
652
653END_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55
654
// On ELF targets, emit an empty .note.GNU-stack section; by GNU toolchain
// convention this marks the object as not requiring an executable stack.
655#ifdef __ELF__
656.section ".note.GNU-stack","",%progbits
657#endif
658