// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

// Template for the 6x8 f32 GEMM / GEMMINC min-max micro-kernel (AArch64 NEON FMA).
//
// Every pass of the outer loop (label 0) produces up to 8 output columns for
// up to 6 rows. The accumulators v20-v31 start from the 8 bias floats at the
// front of w (or from acc when INC is set), accumulate A[row][k] * B[k][0..7]
// while walking kc bytes of each A row, are clamped with FMAX against v6 and
// FMIN against v7, and are then stored through c0-c5.
// kc is counted in bytes: 16 bytes (4 floats) per main-loop or epilogue pass,
// followed by optional 8-byte and 4-byte remainders.
//
// x0 is recycled: it carries mr on entry, the remaining-k counter inside the
// GEMM loops, and cn_stride from the clamp stage onwards.

# void xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> (x0)
$if INC:
  #     const float*restrict acc,  [sp + 8] -> x15
  #     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 16] -> x8
$else:
  #     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> x8

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# A pointers
#  x3 a0
#  x9 a1
# x10 a2
# x11 a3
# x12 a4
#  x4 a5

# C pointers
#  x6 c0
# x16 c1
# x17 c2
# x14 c3
# x13 c4
#  x7 c5

# x8 temporary vector shadow register

# Vector register usage
# A0  v0     v3
# A1  v0[1]  v3[1]
# A2  v1     v4
# A3  v1[1]  v4[1]
# A4  v2     v5
# A5  v2[1]  v5[1]
# B   v12 v13 v14 v15 second set of B
# B   v16 v17 v18 v19 first set
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# C   v30 v31
# Clamp v6 v7
# unused A   v8 v9 v10 v11

BEGIN_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53

        $if INC:
          # Load acc, params pointer
          LDP x15, x8, [sp, 8]
        $else:
          # Load params pointer
          LDR x8, [sp, 8]

        # Clamp A and C pointers
        // Rows at index >= mr alias the previous row's pointers. ADD and CSEL
        // leave the flags untouched, so each CMP feeds an LO and an LS select.
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x14, x17, x7         // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x14, x17, x14, LO   //   c3 = c2

        ADD x12, x11, x4         // a4 = a3 + a_stride
        ADD x13, x14, x7         // c4 = c3 + cm_stride
                                 // if mr <= 4
        CSEL x12, x11, x12, LS   //   a4 = a3
        CSEL x13, x14, x13, LS   //   c4 = c3

        CMP x0, 6                // if mr < 6
        ADD x4, x12, x4          // a5 = a4 + a_stride
        ADD x7, x13, x7          // c5 = c4 + cm_stride
        CSEL x4, x12, x4, LO     //   a5 = a4
        CSEL x7, x13, x7, LO     //   c5 = c4

        # Load min/max values
        // v6 = first 32-bit word of params (used with FMAX, the lower bound),
        // v7 = second word (used with FMIN, the upper bound). x8 is free after
        // this load and becomes the 64-bit shadow register for INS.
        LD2R {v6.4s, v7.4s}, [x8]

        // Save d12-d15 on stack
        // v12-v15 hold the second set of B and are callee-saved (low halves).
        // The 32-byte push moves cn_stride from [sp] to [sp, 32].
        STP d12, d13, [sp, -32]!
        STP d14, d15, [sp, 16]

0:
        $if INC:
          # Load initial accumulators
          LDP q20, q21, [x15], 32
          LDP q22, q23, [x15], 32
          LDP q24, q25, [x15], 32
          LDP q26, q27, [x15], 32
          LDP q28, q29, [x15], 32
          LDP q30, q31, [x15], 32
          PRFM PLDL1KEEP,  [x3,  0]  // Prefetch A
          PRFM PLDL1KEEP,  [x3, 64]
          PRFM PLDL1KEEP,  [x9,  0]
          PRFM PLDL1KEEP,  [x9, 64]
          PRFM PLDL1KEEP, [x10,  0]
          PRFM PLDL1KEEP, [x10, 64]
          PRFM PLDL1KEEP, [x11,  0]
          PRFM PLDL1KEEP, [x11, 64]
          PRFM PLDL1KEEP, [x12,  0]
          PRFM PLDL1KEEP, [x12, 64]
          PRFM PLDL1KEEP,  [x4,  0]
          PRFM PLDL1KEEP,  [x4, 64]
          PRFM PLDL1KEEP, [x5,   0]  // Prefetch B
          PRFM PLDL1KEEP, [x5,  64]
          PRFM PLDL1KEEP, [x5, 128]
          PRFM PLDL1KEEP, [x5, 192]
        $else:
          # Load initial bias from w into accumulators
          LDP q20, q21, [x5], 32
          MOV v22.16b, v20.16b
          PRFM PLDL1KEEP,  [x3,  0]    // Prefetch A
          PRFM PLDL1KEEP,  [x3, 64]
          MOV v23.16b, v21.16b
          PRFM PLDL1KEEP,  [x9,  0]
          PRFM PLDL1KEEP,  [x9, 64]
          MOV v24.16b, v20.16b
          PRFM PLDL1KEEP, [x10,  0]
          PRFM PLDL1KEEP, [x10, 64]
          MOV v25.16b, v21.16b
          PRFM PLDL1KEEP, [x11,  0]
          PRFM PLDL1KEEP, [x11, 64]
          MOV v26.16b, v20.16b
          PRFM PLDL1KEEP, [x12,  0]
          PRFM PLDL1KEEP, [x12, 64]
          MOV v27.16b, v21.16b
          PRFM PLDL1KEEP,  [x4,  0]
          PRFM PLDL1KEEP,  [x4, 64]
          MOV v28.16b, v20.16b
          PRFM PLDL1KEEP, [x5,   0]  // Prefetch B
          MOV v29.16b, v21.16b
          PRFM PLDL1KEEP, [x5,  64]
          MOV v30.16b, v20.16b
          PRFM PLDL1KEEP, [x5, 128]
          MOV v31.16b, v21.16b
          PRFM PLDL1KEEP, [x5, 192]

        # Is there at least 4 floats (16 bytes) for prologue + epilogue?
        SUBS x0, x2, 16  // k = kc - 16
        B.LO 4f

        # Prologue - First group loads, no FMA
        LDR  d0, [x3], 8               // a0
        LDP q16, q17, [x5], 32         // b
        LDR  d1, [x10], 8              // a2
        LDR  d2, [x12], 8              // a4
        LD1  {v0.d}[1],  [x9], 8       // a1
        LD1  {v1.d}[1], [x11], 8       // a3
        LD1  {v2.d}[1],  [x4], 8       // a5
        SUBS x0, x0, 16                // flags consumed by B.LO 2f below; the loads in between do not set flags
        LDR  q18, [x5], 16
        LDR  d19, [x5], 8
        LDR   x8, [x5], 8   // ins is in BLOCK 0

        # Is there at least 4 floats (16 bytes) for main loop?
        B.LO 2f

        # Main loop - 4 floats of A (16 bytes)
        # 48 FMA + 12 LD64 A + 16 LD64 B (8 B vectors)
        // Every 128-bit operand is assembled from a 64-bit LDR into the low
        // half plus a 64-bit LDR into x8 that a later INS moves into the high
        // half. NOTE(review): presumably this split lets the loads pair with
        // FMLA on Cortex-A53 - confirm against the core's optimization guide.
1:
        # First group of 24 FMA, Second group loads
        // BLOCK 0
        LDR   d3, [x3], 8              // a0
        INS v19.d[1], x8               // b from second group
        FMLA v20.4s, v16.4s,  v0.s[0]
        LDR   x8, [x9], 8              // a1
        FMLA v22.4s, v16.4s,  v0.s[2]
        FMLA v24.4s, v16.4s,  v1.s[0]

        // BLOCK 1
        LDR  d12, [x5]
        INS v3.d[1], x8                // a1 ins
        FMLA v26.4s, v16.4s,  v1.s[2]
        LDR   x8, [x5, 8]              // b
        FMLA v28.4s, v16.4s,  v2.s[0]
        FMLA v30.4s, v16.4s,  v2.s[2]

        // BLOCK 2
        LDR   d4, [x10], 8             // a2
        INS v12.d[1], x8               // b  ins
        FMLA v21.4s, v17.4s,  v0.s[0]
        LDR   x8, [x11], 8             // a3
        FMLA v23.4s, v17.4s,  v0.s[2]
        FMLA v25.4s, v17.4s,  v1.s[0]

        // BLOCK 3
        LDR   d5, [x12], 8             // a4
        INS v4.d[1], x8                // a3 ins
        FMLA v27.4s, v17.4s,  v1.s[2]
        LDR   x8, [x4], 8              // a5
        FMLA v29.4s, v17.4s,  v2.s[0]
        FMLA v31.4s, v17.4s,  v2.s[2]

        // BLOCK 4
        LDR  d13, [x5, 16]
        INS v5.d[1], x8                // a5 ins
        FMLA v20.4s, v18.4s,  v0.s[1]
        LDR   x8, [x5, 24]
        FMLA v22.4s, v18.4s,  v0.s[3]
        FMLA v24.4s, v18.4s,  v1.s[1]

        // BLOCK 5
        LDR  d14, [x5, 32]
        INS v13.d[1], x8               // b
        FMLA v26.4s, v18.4s,  v1.s[3]
        LDR   x8, [x5, 40]
        FMLA v28.4s, v18.4s,  v2.s[1]
        FMLA v30.4s, v18.4s,  v2.s[3]

        // BLOCK 6
        LDR  d15, [x5, 48]
        INS v14.d[1], x8               // b
        FMLA v21.4s, v19.4s,  v0.s[1]
        LDR   x8, [x5, 56]
        FMLA v23.4s, v19.4s,  v0.s[3]
        FMLA v25.4s, v19.4s,  v1.s[1]

        // BLOCK 7
        INS v15.d[1], x8
        FMLA v27.4s, v19.4s,  v1.s[3]
        FMLA v29.4s, v19.4s,  v2.s[1]
        FMLA v31.4s, v19.4s,  v2.s[3]

        # Second group of 24 FMA, First group of loads
        // BLOCK 0
        LDR   d0, [x3], 8              // a0
        FMLA v20.4s, v12.4s,  v3.s[0]
        LDR   x8, [x9], 8              // a1
        FMLA v22.4s, v12.4s,  v3.s[2]
        FMLA v24.4s, v12.4s,  v4.s[0]
        PRFM PLDL1KEEP, [x3, 128]      // Prefetch A0

        // BLOCK 1
        LDR  d16, [x5, 64]
        INS v0.d[1], x8                // a1 ins
        FMLA v26.4s, v12.4s,  v4.s[2]
        LDR   x8, [x5, 72]             // b
        FMLA v28.4s, v12.4s,  v5.s[0]
        FMLA v30.4s, v12.4s,  v5.s[2]
        PRFM PLDL1KEEP, [x9, 128]      // Prefetch A1

        // BLOCK 2
        LDR   d1, [x10], 8             // a2
        INS v16.d[1], x8               // b
        FMLA v21.4s, v13.4s,  v3.s[0]
        LDR   x8, [x11], 8             // a3
        FMLA v23.4s, v13.4s,  v3.s[2]
        FMLA v25.4s, v13.4s,  v4.s[0]
        PRFM PLDL1KEEP, [x10, 128]     // Prefetch A2

        // BLOCK 3
        LDR   d2, [x12], 8             // a4
        INS v1.d[1], x8                // a3 ins
        FMLA v27.4s, v13.4s,  v4.s[2]
        LDR   x8,  [x4], 8             // a5
        FMLA v29.4s, v13.4s,  v5.s[0]
        FMLA v31.4s, v13.4s,  v5.s[2]
        PRFM PLDL1KEEP, [x11, 128]     // Prefetch A3

        // BLOCK 4
        LDR  d17, [x5, 80]
        INS v2.d[1], x8                // a5 ins
        FMLA v20.4s, v14.4s,  v3.s[1]
        LDR   x8, [x5, 88]
        FMLA v22.4s, v14.4s,  v3.s[3]
        FMLA v24.4s, v14.4s,  v4.s[1]
        PRFM PLDL1KEEP, [x12, 128]     // Prefetch A4

        // BLOCK 5
        LDR  d18, [x5, 96]
        INS v17.d[1], x8               // b
        FMLA v26.4s, v14.4s,  v4.s[3]
        LDR   x8, [x5, 104]
        FMLA v28.4s, v14.4s,  v5.s[1]
        FMLA v30.4s, v14.4s,  v5.s[3]
        PRFM PLDL1KEEP, [x4, 128]      // Prefetch A5

        // BLOCK 6
        LDR  d19, [x5, 112]
        INS v18.d[1], x8               // b
        FMLA v21.4s, v15.4s,  v3.s[1]
        LDR   x8, [x5, 120]            // high half of v19; inserted by BLOCK 0 of the next pass (label 1 or 2)
        FMLA v23.4s, v15.4s,  v3.s[3]
        PRFM PLDL1KEEP, [x5, 192]      // Prefetch B
        FMLA v25.4s, v15.4s,  v4.s[1]
        PRFM PLDL1KEEP, [x5, 256]      // Prefetch B

        // BLOCK 7
        SUBS x0, x0, 16  // LDR lands here
        FMLA v27.4s, v15.4s,  v4.s[3]
        FMLA v29.4s, v15.4s,  v5.s[1]
        ADD x5, x5, 128                // both B sets consumed; ADD keeps the SUBS flags for B.HS
        FMLA v31.4s, v15.4s,  v5.s[3]
        B.HS 1b

        # Epilogue - 4 floats of A (16 bytes)
        # 48 FMA + 6 LD64 A + 8 LD64 B (4 B vectors)
2:
        # First group of 24 FMA, Second group loads
        // BLOCK 0
        LDR   d3, [x3], 8              // a0
        INS v19.d[1], x8               // b from second group
        FMLA v20.4s, v16.4s,  v0.s[0]
        LDR   x8, [x9], 8              // a1
        FMLA v22.4s, v16.4s,  v0.s[2]
        FMLA v24.4s, v16.4s,  v1.s[0]
        PRFM PSTL1KEEP,  [x6]          // Prefetch C0

        // BLOCK 1
        LDR  d12, [x5]
        INS v3.d[1], x8                // a1 ins
        FMLA v26.4s, v16.4s,  v1.s[2]
        LDR   x8, [x5, 8]              // b
        FMLA v28.4s, v16.4s,  v2.s[0]
        FMLA v30.4s, v16.4s,  v2.s[2]
        PRFM PSTL1KEEP, [x16]          // Prefetch C1

        // BLOCK 2
        LDR   d4, [x10], 8             // a2
        INS v12.d[1], x8               // b  ins
        FMLA v21.4s, v17.4s,  v0.s[0]
        LDR   x8, [x11], 8             // a3
        FMLA v23.4s, v17.4s,  v0.s[2]
        FMLA v25.4s, v17.4s,  v1.s[0]
        PRFM PSTL1KEEP, [x17]          // Prefetch C2

        // BLOCK 3
        LDR   d5, [x12], 8             // a4
        INS v4.d[1], x8                // a3 ins
        FMLA v27.4s, v17.4s,  v1.s[2]
        LDR   x8, [x4], 8              // a5
        FMLA v29.4s, v17.4s,  v2.s[0]
        FMLA v31.4s, v17.4s,  v2.s[2]
        PRFM PSTL1KEEP, [x14]          // Prefetch C3

        // BLOCK 4
        LDR  d13, [x5, 16]
        INS v5.d[1], x8                // a5 ins
        FMLA v20.4s, v18.4s,  v0.s[1]
        LDR   x8, [x5, 24]
        FMLA v22.4s, v18.4s,  v0.s[3]
        FMLA v24.4s, v18.4s,  v1.s[1]
        PRFM PSTL1KEEP, [x13]          // Prefetch C4

        // BLOCK 5
        LDR  d14, [x5, 32]
        INS v13.d[1], x8               // b
        FMLA v26.4s, v18.4s,  v1.s[3]
        LDR   x8, [x5, 40]
        FMLA v28.4s, v18.4s,  v2.s[1]
        FMLA v30.4s, v18.4s,  v2.s[3]
        PRFM PSTL1KEEP, [x7]           // Prefetch C5

        // BLOCK 6
        LDR  d15, [x5, 48]
        INS v14.d[1], x8               // b
        FMLA v21.4s, v19.4s,  v0.s[1]
        LDR   x8, [x5, 56]
        FMLA v23.4s, v19.4s,  v0.s[3]
        FMLA v25.4s, v19.4s,  v1.s[1]

        // BLOCK 7
        INS v15.d[1], x8               // b
        FMLA v27.4s, v19.4s,  v1.s[3]
        FMLA v29.4s, v19.4s,  v2.s[1]
        FMLA v31.4s, v19.4s,  v2.s[3]

        # Second group of 24 FMA, no loads
        // BLOCK 0
        FMLA v20.4s, v12.4s,  v3.s[0]
        FMLA v22.4s, v12.4s,  v3.s[2]
        FMLA v24.4s, v12.4s,  v4.s[0]

        // BLOCK 1
        FMLA v26.4s, v12.4s,  v4.s[2]
        FMLA v28.4s, v12.4s,  v5.s[0]
        FMLA v30.4s, v12.4s,  v5.s[2]

        // BLOCK 2
        FMLA v21.4s, v13.4s,  v3.s[0]
        FMLA v23.4s, v13.4s,  v3.s[2]
        FMLA v25.4s, v13.4s,  v4.s[0]

        // BLOCK 3
        FMLA v27.4s, v13.4s,  v4.s[2]
        FMLA v29.4s, v13.4s,  v5.s[0]
        FMLA v31.4s, v13.4s,  v5.s[2]

        // BLOCK 4
        FMLA v20.4s, v14.4s,  v3.s[1]
        FMLA v22.4s, v14.4s,  v3.s[3]
        FMLA v24.4s, v14.4s,  v4.s[1]

        // BLOCK 5
        FMLA v26.4s, v14.4s,  v4.s[3]
        FMLA v28.4s, v14.4s,  v5.s[1]
        FMLA v30.4s, v14.4s,  v5.s[3]

        // BLOCK 6
        FMLA v21.4s, v15.4s,  v3.s[1]
        FMLA v23.4s, v15.4s,  v3.s[3]
        FMLA v25.4s, v15.4s,  v4.s[1]
        TST x0, 15                     // low 4 bits of x0 equal those of kc: any bytes left over?

        // BLOCK 7
        FMLA v27.4s, v15.4s,  v4.s[3]
        FMLA v29.4s, v15.4s,  v5.s[1]
        FMLA v31.4s, v15.4s,  v5.s[3]
        ADD x5, x5, 64                 // skip the B set loaded above; flags from TST survive

        # Is there a remainder?- 2 floats of A (8 bytes) or less
        B.NE 4f
3:
        # Clamp
        FMAX v20.4s, v20.4s, v6.4s
        # Load cn_stride
        LDR x0, [sp, 32]               // [sp] at entry, shifted by the 32-byte d12-d15 save area
        FMAX v21.4s, v21.4s, v6.4s
        FMAX v22.4s, v22.4s, v6.4s
        FMAX v23.4s, v23.4s, v6.4s
        FMAX v24.4s, v24.4s, v6.4s
        FMAX v25.4s, v25.4s, v6.4s
        FMAX v26.4s, v26.4s, v6.4s
        FMAX v27.4s, v27.4s, v6.4s
        FMAX v28.4s, v28.4s, v6.4s
        FMAX v29.4s, v29.4s, v6.4s
        FMAX v30.4s, v30.4s, v6.4s
        FMAX v31.4s, v31.4s, v6.4s
        SUBS x1, x1, 8                 // nc -= 8; flags drive both B.LO 6f and B.HI 0b below
        FMIN v20.4s, v20.4s, v7.4s
        FMIN v21.4s, v21.4s, v7.4s
        FMIN v22.4s, v22.4s, v7.4s
        FMIN v23.4s, v23.4s, v7.4s
        FMIN v24.4s, v24.4s, v7.4s
        FMIN v25.4s, v25.4s, v7.4s
        FMIN v26.4s, v26.4s, v7.4s
        FMIN v27.4s, v27.4s, v7.4s
        FMIN v28.4s, v28.4s, v7.4s
        FMIN v29.4s, v29.4s, v7.4s
        FMIN v30.4s, v30.4s, v7.4s
        FMIN v31.4s, v31.4s, v7.4s

        # Store full 6 x 8
        // Fewer than 8 columns left: take the partial-width store at label 6.
        B.LO 6f

        // Each ST1 post-increments its C pointer by cn_stride (x0); each SUB
        // rewinds an A pointer by kc so the next column tile rereads the row.
        // The INC variant stores rows c5..c0, the plain variant c0..c5.
        $if INC:
          ST1 {v30.16b, v31.16b},  [x7], x0
          SUB  x3,  x3, x2 // a0 -= kc
          ST1 {v28.16b, v29.16b}, [x13], x0
          SUB  x9,  x9, x2 // a1 -= kc
          ST1 {v26.16b, v27.16b}, [x14], x0
          SUB x10, x10, x2 // a2 -= kc
          ST1 {v24.16b, v25.16b}, [x17], x0
          SUB x11, x11, x2 // a3 -= kc
          ST1 {v22.16b, v23.16b}, [x16], x0
          SUB x12, x12, x2 // a4 -= kc
          ST1 {v20.16b, v21.16b},  [x6], x0
          SUB  x4,  x4, x2 // a5 -= kc
        $else:
          ST1 {v20.16b, v21.16b},  [x6], x0
          SUB  x3,  x3, x2 // a0 -= kc
          ST1 {v22.16b, v23.16b}, [x16], x0
          SUB  x9,  x9, x2 // a1 -= kc
          ST1 {v24.16b, v25.16b}, [x17], x0
          SUB x10, x10, x2 // a2 -= kc
          ST1 {v26.16b, v27.16b}, [x14], x0
          SUB x11, x11, x2 // a3 -= kc
          ST1 {v28.16b, v29.16b}, [x13], x0
          SUB x12, x12, x2 // a4 -= kc
          ST1 {v30.16b, v31.16b},  [x7], x0
          SUB  x4,  x4, x2 // a5 -= kc

        // Columns remain (nc was > 8): run another tile. ST1 and SUB above do
        // not modify the flags set by SUBS x1.
        B.HI 0b

        // Restore d12-d15 from stack
        LDP d14, d15, [sp, 16]
        LDP d12, d13, [sp], 32
        RET

        // Remainder handling. Reached with x0 = kc - 16 when kc < 16, or after
        // the epilogue when x0 has low bits set. Subtracting multiples of 16
        // never changes bits 3..0, so bits 3 and 2 of x0 still mirror kc.
4:
        # Is there a remainder?- 2 floats of A (8 bytes)
        TBZ x0, 3, 5f

        # Remainder- 2 floats of A (8 bytes)
        LDR   d0,  [x3], 8
        LDR  q16, [x5], 16
        LD1   {v0.d}[1], [x9], 8
        LDR   d1, [x10], 8
        LD1   {v1.d}[1], [x11], 8
        LDR   d2, [x12], 8
        LD1   {v2.d}[1], [x4], 8
        LDR  q17, [x5], 16
        LDR  q18, [x5], 16
        LDR  q19, [x5], 16

        FMLA v20.4s, v16.4s,  v0.s[0]
        FMLA v22.4s, v16.4s,  v0.s[2]
        FMLA v24.4s, v16.4s,  v1.s[0]
        FMLA v26.4s, v16.4s,  v1.s[2]
        FMLA v28.4s, v16.4s,  v2.s[0]
        FMLA v30.4s, v16.4s,  v2.s[2]
        FMLA v21.4s, v17.4s,  v0.s[0]
        FMLA v23.4s, v17.4s,  v0.s[2]
        FMLA v25.4s, v17.4s,  v1.s[0]
        FMLA v27.4s, v17.4s,  v1.s[2]
        FMLA v29.4s, v17.4s,  v2.s[0]
        FMLA v31.4s, v17.4s,  v2.s[2]

        FMLA v20.4s, v18.4s,  v0.s[1]
        FMLA v22.4s, v18.4s,  v0.s[3]
        FMLA v24.4s, v18.4s,  v1.s[1]
        FMLA v26.4s, v18.4s,  v1.s[3]
        FMLA v28.4s, v18.4s,  v2.s[1]
        FMLA v30.4s, v18.4s,  v2.s[3]
        FMLA v21.4s, v19.4s,  v0.s[1]
        FMLA v23.4s, v19.4s,  v0.s[3]
        FMLA v25.4s, v19.4s,  v1.s[1]
        FMLA v27.4s, v19.4s,  v1.s[3]
        FMLA v29.4s, v19.4s,  v2.s[1]
        FMLA v31.4s, v19.4s,  v2.s[3]

        # Is there a remainder?- 1 floats of A (4 bytes)
        TBZ x0, 2, 3b
5:
        # Remainder- 1 floats of A (4 bytes)
        // Even rows land in lane 0 and odd rows in lane 2, matching the lane
        // indices used by the FMLAs below.
        LDR   s0,  [x3], 4
        LDR  q16, [x5], 16
        LD1   {v0.s}[2], [x9], 4
        LDR   s1, [x10], 4
        LD1   {v1.s}[2], [x11], 4
        LDR   s2, [x12], 4
        LD1   {v2.s}[2], [x4], 4
        LDR  q17, [x5], 16

        FMLA v20.4s, v16.4s,  v0.s[0]
        FMLA v22.4s, v16.4s,  v0.s[2]
        FMLA v24.4s, v16.4s,  v1.s[0]
        FMLA v26.4s, v16.4s,  v1.s[2]
        FMLA v28.4s, v16.4s,  v2.s[0]
        FMLA v30.4s, v16.4s,  v2.s[2]
        FMLA v21.4s, v17.4s,  v0.s[0]
        FMLA v23.4s, v17.4s,  v0.s[2]
        FMLA v25.4s, v17.4s,  v1.s[0]
        FMLA v27.4s, v17.4s,  v1.s[2]
        FMLA v29.4s, v17.4s,  v2.s[0]
        FMLA v31.4s, v17.4s,  v2.s[2]
        B 3b

        # Store odd width
        // x1 = nc - 8 here; subtracting 8 leaves bits 2..0 equal to those of
        // nc, so they select a 4-, 2- and 1-column store in turn. After each
        // partial store the unwritten columns are moved down to lane 0.
6:
        TBZ x1, 2, 7f
        $if INC:
          STR q30,  [x7], 16
          MOV v30.16b, v31.16b
          STR q28, [x13], 16
          MOV v28.16b, v29.16b
          STR q26, [x14], 16
          MOV v26.16b, v27.16b
          STR q24, [x17], 16
          MOV v24.16b, v25.16b
          STR q22, [x16], 16
          MOV v22.16b, v23.16b
          STR q20,  [x6], 16
          MOV v20.16b, v21.16b
        $else:
          STR q20,  [x6], 16
          MOV v20.16b, v21.16b
          STR q22, [x16], 16
          MOV v22.16b, v23.16b
          STR q24, [x17], 16
          MOV v24.16b, v25.16b
          STR q26, [x14], 16
          MOV v26.16b, v27.16b
          STR q28, [x13], 16
          MOV v28.16b, v29.16b
          STR q30,  [x7], 16
          MOV v30.16b, v31.16b

7:
        TBZ x1, 1, 8f
        $if INC:
          STR d30,  [x7], 8
          DUP d30, v30.d[1]
          STR d28, [x13], 8
          DUP d28, v28.d[1]
          STR d26, [x14], 8
          DUP d26, v26.d[1]
          STR d24, [x17], 8
          DUP d24, v24.d[1]
          STR d22, [x16], 8
          DUP d22, v22.d[1]
          STR d20,  [x6], 8
          DUP d20, v20.d[1]
        $else:
          STR d20,  [x6], 8
          DUP d20, v20.d[1]
          STR d22, [x16], 8
          DUP d22, v22.d[1]
          STR d24, [x17], 8
          DUP d24, v24.d[1]
          STR d26, [x14], 8
          DUP d26, v26.d[1]
          STR d28, [x13], 8
          DUP d28, v28.d[1]
          STR d30,  [x7], 8
          DUP d30, v30.d[1]

8:
        TBZ x1, 0, 9f
        $if INC:
          STR s30,  [x7]
          STR s28, [x13]
          STR s26, [x14]
          STR s24, [x17]
          STR s22, [x16]
          STR s20,  [x6]
        $else:
          STR s20,  [x6]
          STR s22, [x16]
          STR s24, [x17]
          STR s26, [x14]
          STR s28, [x13]
          STR s30,  [x7]
9:
        // Restore d12-d15 from stack
        LDP d14, d15, [sp, 16]
        LDP d12, d13, [sp], 32
        RET

END_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53

// On ELF targets emit an empty .note.GNU-stack section, the conventional
// marker that this object does not require an executable stack.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif