• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1// Copyright 2019 Google LLC
2//
3// This source code is licensed under the BSD-style license found in the
4// LICENSE file in the root directory of this source tree.
5
6#include <xnnpack/assembly.h>
7
8# void xnn_f32_gemm${"inc" if INC else ""}_ukernel_5x8__aarch64_neonfma_cortex_${"a75" if PREFETCH else "a57"}(
9#     size_t mr,                x0
10#     size_t nc,                x1
11#     size_t kc,                x2 / x0
12#     const uint8_t*restrict a, x3
13#     size_t a_stride,          x4
14#     const void*restrict w,    x5
15#     uint8_t*restrict c,       x6
16#     size_t cm_stride,         x7
17#     size_t cn_stride,         [sp] -> x14
18$if INC:
19  #     const float*restrict acc,  [sp + 8] -> x15
20  #     const union xnn_f32_output_params params[restrict static 1])  [sp + 16] -> x8
21$else:
22  #     const union xnn_f32_output_params params[restrict static 1])  [sp + 8] -> x8
23
24# unused compared to 6x8
25#  x4 a5
26#  x7 c5
27# A5  v10 v11
28# C   v30 v31
29
30# d8-d15 need to be preserved if used.
31# x19-x30 need to be preserved if used.  x18 is reserved for OS.
32
33# A pointers
34#  x3 a0
35#  x9 a1
36# x10 a2
37# x11 a3
38# x12 a4
39
40# C pointers
41#  x6 c0
42# x16 c1
43# x17 c2
44# x13 c3
45#  x7 c4
46
47# Vector register usage
48# A0   v0  v1
49# A1   v2  v3
50# A2   v4  v5
51# A3   v6  v7
52# A4   v8  v9
53# B   v12 v13 v14 v15
54# B   v16 v17 v18 v19
55# C   v20 v21
56# C   v22 v23
57# C   v24 v25
58# C   v26 v27
59# C   v28 v29
60# Clamp v30 v31
61
62BEGIN_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_ukernel_5x8__aarch64_neonfma_cortex_${"a75" if PREFETCH else "a57"}
63
        # Overview: each pass of the outer loop (label 0) builds one tile of 5 rows by
        # 8 columns of f32 results in v20-v29.  A elements are held in v0-v9 and are
        # multiplied lane by lane (one lane per k step) with 8-float rows of B streamed
        # from x5.  The tile is then clamped against v30/v31 and stored through the
        # five C pointers.  The k counter lives in x0 and is counted in bytes; the
        # remaining column count lives in x1.
64        # Clamp A and C pointers / Save d8-d9 and d12-d15 on stack
65        STP  d8,  d9, [sp, -48]!
66        CMP x0, 2                // if mr < 2
67        ADD x9, x3, x4           // a1 = a0 + a_stride
68        ADD x16, x6, x7          // c1 = c0 + cm_stride
69        CSEL x9, x3, x9, LO      //   a1 = a0
70        CSEL x16, x6, x16, LO    //   c1 = c0
71
72        STP d12, d13, [sp, 16]
73        ADD x10, x9, x4          // a2 = a1 + a_stride
74        ADD x17, x16, x7         // c2 = c1 + cm_stride
75                                 // if mr <= 2 (flags still from CMP x0, 2)
76        CSEL x10, x9, x10, LS    //   a2 = a1
77        CSEL x17, x16, x17, LS   //   c2 = c1
78
79        STP d14, d15, [sp, 32]
80        CMP x0, 4                // if mr < 4
81        ADD x11, x10, x4         // a3 = a2 + a_stride
82        ADD x13, x17, x7         // c3 = c2 + cm_stride
83        CSEL x11, x10, x11, LO   //   a3 = a2
84        CSEL x13, x17, x13, LO   //   c3 = c2
85
86        $if INC:
87          # Load acc, params pointer
88          LDP x15, x8, [sp, 56]
89        $else:
90          # Load params pointer
91          LDR x8, [sp, 56]
92
93        ADD x12, x11, x4         // a4 = a3 + a_stride
94        ADD x7, x13, x7         // c4 = c3 + cm_stride
95                                 // if mr <= 4 (flags still from CMP x0, 4)
96        CSEL x12, x11, x12, LS   //   a4 = a3
97        CSEL x7, x13, x7, LS   //   c4 = c3
98
99        # Load clamp values (v30 is the FMIN bound, v31 the FMAX bound at label 3)
100        LD2R {v30.4s, v31.4s}, [x8]
101
102        # Load cn_stride (first stack argument, 48 bytes above sp after the register save)
103        LDR x14, [sp, 48]
104
        # Outer loop over nc: one iteration per tile of 8 output columns
1050:
106        $if INC:
107          # Load initial accumulators
108          LDP q20, q21, [x15], 32
109          LDP q22, q23, [x15], 32
110          LDP q24, q25, [x15], 32
111          LDP q26, q27, [x15], 32
112          LDP q28, q29, [x15], 32
113          $if PREFETCH:
114            PRFM PLDL1KEEP, [x5, 0]  // Prefetch B
115            PRFM PLDL1KEEP, [x5, 64]
116            PRFM PLDL1KEEP, [x5, 128]
117            PRFM PLDL1KEEP, [x5, 192]
118            PRFM PLDL1KEEP,  [x3]    // Prefetch A
119            PRFM PLDL1KEEP,  [x9]
120            PRFM PLDL1KEEP, [x10]
121            PRFM PLDL1KEEP, [x11]
122            PRFM PLDL1KEEP, [x12]
123        $else:
124          # Load initial bias from w into accumulators
125          LDP q20, q21, [x5], 32
126          MOV v22.16b, v20.16b
127          $if PREFETCH:
128            PRFM PLDL1KEEP, [x5, 0]  // Prefetch B
129          MOV v23.16b, v21.16b
130          $if PREFETCH:
131            PRFM PLDL1KEEP, [x5, 64]
132          MOV v24.16b, v20.16b
133          $if PREFETCH:
134            PRFM PLDL1KEEP, [x5, 128]
135          MOV v25.16b, v21.16b
136          $if PREFETCH:
137            PRFM PLDL1KEEP, [x5, 192]
138          MOV v26.16b, v20.16b
139          $if PREFETCH:
140            PRFM PLDL1KEEP,  [x3]    // Prefetch A
141          MOV v27.16b, v21.16b
142          $if PREFETCH:
143            PRFM PLDL1KEEP,  [x9]
144          MOV v28.16b, v20.16b
145          $if PREFETCH:
146            PRFM PLDL1KEEP, [x10]
147          MOV v29.16b, v21.16b
148          $if PREFETCH:
149            PRFM PLDL1KEEP, [x11]
150            PRFM PLDL1KEEP, [x12]
151
152        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
153        SUBS x0, x2, 32  // k = kc - 32
154        B.LO 4f          // kc < 32 bytes: remainder handling only
155
156        # Prologue - loads for main loop of 80 FMA
157        LDR   q0,  [x3], 16
158        LDR   q2,  [x9], 16
159        LDR   q4, [x10], 16
160        LDR   q6, [x11], 16
161        LDR   q8, [x12], 16
162        LDP  q12,  q13, [x5], 32  // Fetch 3 B (4th deferred)
163        LDP  q14,  q15, [x5], 32
164        LDP  q16,  q17, [x5], 32
165
166        # Is there at least 8 floats (32 bytes) for main loop?
167        SUBS x0, x0, 32
168        B.LO 2f          // not enough for a main loop pass: run the epilogue directly
169
170        # Main loop - 8 floats of A (32 bytes)
171        # 80 FMA + 10 LDR A + 8 LDP B
1721:
173        # First group of 4 A.  40 FMA.
174        FMLA v20.4s, v12.4s,  v0.s[0]
175        LDP  q18,  q19, [x5], 32      // Load last B
176        FMLA v22.4s, v12.4s,  v2.s[0]
177        FMLA v24.4s, v12.4s,  v4.s[0]
178        FMLA v26.4s, v12.4s,  v6.s[0]
179        $if PREFETCH:
180          PRFM PLDL1KEEP, [x5, 128]      // Prefetch B
181        FMLA v28.4s, v12.4s,  v8.s[0]
182        FMLA v21.4s, v13.4s,  v0.s[0]
183        FMLA v23.4s, v13.4s,  v2.s[0]
184        $if PREFETCH:
185          PRFM PLDL1KEEP, [x5, 256]
186        FMLA v25.4s, v13.4s,  v4.s[0]
187        FMLA v27.4s, v13.4s,  v6.s[0]
188        FMLA v29.4s, v13.4s,  v8.s[0]
189        LDR   q1,  [x3], 16            // Load next 5 A
190
191        FMLA v20.4s, v14.4s,  v0.s[1]
192        FMLA v22.4s, v14.4s,  v2.s[1]
193        FMLA v24.4s, v14.4s,  v4.s[1]
194        LDR   q3,  [x9], 16
195        FMLA v26.4s, v14.4s,  v6.s[1]
196        FMLA v28.4s, v14.4s,  v8.s[1]
197        FMLA v21.4s, v15.4s,  v0.s[1]
198        LDR   q5, [x10], 16
199        FMLA v23.4s, v15.4s,  v2.s[1]
200        FMLA v25.4s, v15.4s,  v4.s[1]
201        FMLA v27.4s, v15.4s,  v6.s[1]
202        LDR   q7, [x11], 16
203        FMLA v29.4s, v15.4s,  v8.s[1]
204
205        FMLA v20.4s, v16.4s,  v0.s[2]
206        FMLA v22.4s, v16.4s,  v2.s[2]
207        LDR   q9, [x12], 16
208        FMLA v24.4s, v16.4s,  v4.s[2]
209        FMLA v26.4s, v16.4s,  v6.s[2]
210        FMLA v28.4s, v16.4s,  v8.s[2]
211        LDP  q12,  q13, [x5], 32       // Load 4 B
212        FMLA v21.4s, v17.4s,  v0.s[2]
213        FMLA v23.4s, v17.4s,  v2.s[2]
214        FMLA v25.4s, v17.4s,  v4.s[2]
215        LDP  q14,  q15, [x5], 32
216        FMLA v27.4s, v17.4s,  v6.s[2]
217        FMLA v29.4s, v17.4s,  v8.s[2]
218
219        FMLA v20.4s, v18.4s,  v0.s[3]
220        LDP  q16,  q17, [x5], 32
221        FMLA v22.4s, v18.4s,  v2.s[3]
222        FMLA v24.4s, v18.4s,  v4.s[3]
223        FMLA v26.4s, v18.4s,  v6.s[3]
224        FMLA v28.4s, v18.4s,  v8.s[3]
225        FMLA v21.4s, v19.4s,  v0.s[3]
226        FMLA v23.4s, v19.4s,  v2.s[3]
227        FMLA v25.4s, v19.4s,  v4.s[3]
228        FMLA v27.4s, v19.4s,  v6.s[3]
229        FMLA v29.4s, v19.4s,  v8.s[3]
230        LDP  q18,  q19, [x5], 32
231
232        # Second group of 4 A.  40 FMA.
233        FMLA v20.4s, v12.4s,  v1.s[0]
234        FMLA v22.4s, v12.4s,  v3.s[0]
235        FMLA v24.4s, v12.4s,  v5.s[0]
236        LDR   q0,  [x3], 16           // Load next 5 A
237        FMLA v26.4s, v12.4s,  v7.s[0]
238        FMLA v28.4s, v12.4s,  v9.s[0]
239        FMLA v21.4s, v13.4s,  v1.s[0]
240        LDR   q2,  [x9], 16
241        FMLA v23.4s, v13.4s,  v3.s[0]
242        FMLA v25.4s, v13.4s,  v5.s[0]
243        FMLA v27.4s, v13.4s,  v7.s[0]
244        LDR   q4, [x10], 16
245        FMLA v29.4s, v13.4s,  v9.s[0]
246
247        FMLA v20.4s, v14.4s,  v1.s[1]
248        FMLA v22.4s, v14.4s,  v3.s[1]
249        LDR   q6, [x11], 16
250        FMLA v24.4s, v14.4s,  v5.s[1]
251        FMLA v26.4s, v14.4s,  v7.s[1]
252        FMLA v28.4s, v14.4s,  v9.s[1]
253        LDR   q8, [x12], 16
254        FMLA v21.4s, v15.4s,  v1.s[1]
255        FMLA v23.4s, v15.4s,  v3.s[1]
256        FMLA v25.4s, v15.4s,  v5.s[1]
257        LDP  q12,  q13, [x5], 32       // Load next 3 B (not last)
258        FMLA v27.4s, v15.4s,  v7.s[1]
259        FMLA v29.4s, v15.4s,  v9.s[1]
260
261        FMLA v20.4s, v16.4s,  v1.s[2]
262        LDP  q14,  q15, [x5], 32
263        FMLA v22.4s, v16.4s,  v3.s[2]
264        FMLA v24.4s, v16.4s,  v5.s[2]
265        FMLA v26.4s, v16.4s,  v7.s[2]
266        FMLA v28.4s, v16.4s,  v9.s[2]
267        FMLA v21.4s, v17.4s,  v1.s[2]
268        FMLA v23.4s, v17.4s,  v3.s[2]
269        FMLA v25.4s, v17.4s,  v5.s[2]
270        FMLA v27.4s, v17.4s,  v7.s[2]
271        FMLA v29.4s, v17.4s,  v9.s[2]
272        LDP  q16,  q17, [x5], 32
273
274        FMLA v20.4s, v18.4s,  v1.s[3]
275        FMLA v22.4s, v18.4s,  v3.s[3]
276        SUBS x0, x0, 32               // k -= 32 bytes; flags are consumed by B.HS below
277        FMLA v24.4s, v18.4s,  v5.s[3]
278        FMLA v26.4s, v18.4s,  v7.s[3]
279        FMLA v28.4s, v18.4s,  v9.s[3]
280        FMLA v21.4s, v19.4s,  v1.s[3]
281        FMLA v23.4s, v19.4s,  v3.s[3]
282        FMLA v25.4s, v19.4s,  v5.s[3]
283        FMLA v27.4s, v19.4s,  v7.s[3]
284        FMLA v29.4s, v19.4s,  v9.s[3]
285        B.HS 1b
286
287        # Epilogue - 8 floats of A (32 bytes)
288        # 80 FMA + 5 LDR A + 5 LDP B
289        # First block same as main loop.  Second block has no preloads.
2902:
291        # First group of 4 A.  40 FMA.
292        FMLA v20.4s, v12.4s,  v0.s[0]
293        LDP  q18,  q19, [x5], 32      // Load last B
294        FMLA v22.4s, v12.4s,  v2.s[0]
295        FMLA v24.4s, v12.4s,  v4.s[0]
296        FMLA v26.4s, v12.4s,  v6.s[0]
297        $if PREFETCH:
298          PRFM PLDL1KEEP, [x5, 128]      // Prefetch B
299        FMLA v28.4s, v12.4s,  v8.s[0]
300        FMLA v21.4s, v13.4s,  v0.s[0]
301        FMLA v23.4s, v13.4s,  v2.s[0]
302        $if PREFETCH:
303          PRFM PLDL1KEEP, [x5, 256]
304        FMLA v25.4s, v13.4s,  v4.s[0]
305        FMLA v27.4s, v13.4s,  v6.s[0]
306        FMLA v29.4s, v13.4s,  v8.s[0]
307        LDR   q1,  [x3], 16            // Load next 5 A
308
309        FMLA v20.4s, v14.4s,  v0.s[1]
310        FMLA v22.4s, v14.4s,  v2.s[1]
311        FMLA v24.4s, v14.4s,  v4.s[1]
312        LDR   q3,  [x9], 16
313        FMLA v26.4s, v14.4s,  v6.s[1]
314        FMLA v28.4s, v14.4s,  v8.s[1]
315        FMLA v21.4s, v15.4s,  v0.s[1]
316        LDR   q5, [x10], 16
317        FMLA v23.4s, v15.4s,  v2.s[1]
318        FMLA v25.4s, v15.4s,  v4.s[1]
319        FMLA v27.4s, v15.4s,  v6.s[1]
320        LDR   q7, [x11], 16
321        FMLA v29.4s, v15.4s,  v8.s[1]
322
323        FMLA v20.4s, v16.4s,  v0.s[2]
324        FMLA v22.4s, v16.4s,  v2.s[2]
325        LDR   q9, [x12], 16
326        FMLA v24.4s, v16.4s,  v4.s[2]
327        FMLA v26.4s, v16.4s,  v6.s[2]
328        FMLA v28.4s, v16.4s,  v8.s[2]
329        LDP  q12,  q13, [x5], 32       // Load 4 B
330        FMLA v21.4s, v17.4s,  v0.s[2]
331        FMLA v23.4s, v17.4s,  v2.s[2]
332        FMLA v25.4s, v17.4s,  v4.s[2]
333        LDP  q14,  q15, [x5], 32
334        FMLA v27.4s, v17.4s,  v6.s[2]
335        FMLA v29.4s, v17.4s,  v8.s[2]
336
337        FMLA v20.4s, v18.4s,  v0.s[3]
338        LDP  q16,  q17, [x5], 32
339        FMLA v22.4s, v18.4s,  v2.s[3]
340        FMLA v24.4s, v18.4s,  v4.s[3]
341        FMLA v26.4s, v18.4s,  v6.s[3]
342        FMLA v28.4s, v18.4s,  v8.s[3]
343        FMLA v21.4s, v19.4s,  v0.s[3]
344        FMLA v23.4s, v19.4s,  v2.s[3]
345        FMLA v25.4s, v19.4s,  v4.s[3]
346        FMLA v27.4s, v19.4s,  v6.s[3]
347        FMLA v29.4s, v19.4s,  v8.s[3]
348        LDP  q18,  q19, [x5], 32
349
350        # Second group of 4 A.  40 FMA.
351        FMLA v20.4s, v12.4s,  v1.s[0]
352        FMLA v22.4s, v12.4s,  v3.s[0]
353        FMLA v24.4s, v12.4s,  v5.s[0]
354        FMLA v26.4s, v12.4s,  v7.s[0]
355        FMLA v28.4s, v12.4s,  v9.s[0]
356        FMLA v21.4s, v13.4s,  v1.s[0]
357        FMLA v23.4s, v13.4s,  v3.s[0]
358        FMLA v25.4s, v13.4s,  v5.s[0]
359        FMLA v27.4s, v13.4s,  v7.s[0]
360        FMLA v29.4s, v13.4s,  v9.s[0]
361
362        FMLA v20.4s, v14.4s,  v1.s[1]
363        FMLA v22.4s, v14.4s,  v3.s[1]
364        FMLA v24.4s, v14.4s,  v5.s[1]
365        FMLA v26.4s, v14.4s,  v7.s[1]
366        FMLA v28.4s, v14.4s,  v9.s[1]
367        FMLA v21.4s, v15.4s,  v1.s[1]
368        FMLA v23.4s, v15.4s,  v3.s[1]
369        FMLA v25.4s, v15.4s,  v5.s[1]
370        FMLA v27.4s, v15.4s,  v7.s[1]
371        FMLA v29.4s, v15.4s,  v9.s[1]
372
373        FMLA v20.4s, v16.4s,  v1.s[2]
374        FMLA v22.4s, v16.4s,  v3.s[2]
375        FMLA v24.4s, v16.4s,  v5.s[2]
376        FMLA v26.4s, v16.4s,  v7.s[2]
377        FMLA v28.4s, v16.4s,  v9.s[2]
378        FMLA v21.4s, v17.4s,  v1.s[2]
379        FMLA v23.4s, v17.4s,  v3.s[2]
380        FMLA v25.4s, v17.4s,  v5.s[2]
381        FMLA v27.4s, v17.4s,  v7.s[2]
382        FMLA v29.4s, v17.4s,  v9.s[2]
383        TST x0, 31                    // any k remainder below 8 floats?  flags are consumed by B.NE below
384
385        FMLA v20.4s, v18.4s,  v1.s[3]
386        FMLA v22.4s, v18.4s,  v3.s[3]
387        FMLA v24.4s, v18.4s,  v5.s[3]
388        FMLA v26.4s, v18.4s,  v7.s[3]
389        FMLA v28.4s, v18.4s,  v9.s[3]
390        FMLA v21.4s, v19.4s,  v1.s[3]
391        FMLA v23.4s, v19.4s,  v3.s[3]
392        FMLA v25.4s, v19.4s,  v5.s[3]
393        FMLA v27.4s, v19.4s,  v7.s[3]
394        FMLA v29.4s, v19.4s,  v9.s[3]
395        B.NE 4f
396
397        # Clamp
3983:
399        FMIN v20.4s, v20.4s, v30.4s
400        SUBS x1, x1, 8                // nc -= 8; flags are consumed by B.LO and B.HI below
401        FMIN v21.4s, v21.4s, v30.4s
402        FMIN v22.4s, v22.4s, v30.4s
403        FMIN v23.4s, v23.4s, v30.4s
404        FMIN v24.4s, v24.4s, v30.4s
405        FMIN v25.4s, v25.4s, v30.4s
406        FMIN v26.4s, v26.4s, v30.4s
407        FMIN v27.4s, v27.4s, v30.4s
408        FMIN v28.4s, v28.4s, v30.4s
409        FMIN v29.4s, v29.4s, v30.4s
410        FMAX v20.4s, v20.4s, v31.4s
411        FMAX v21.4s, v21.4s, v31.4s
412        FMAX v22.4s, v22.4s, v31.4s
413        FMAX v23.4s, v23.4s, v31.4s
414        FMAX v24.4s, v24.4s, v31.4s
415        FMAX v25.4s, v25.4s, v31.4s
416        FMAX v26.4s, v26.4s, v31.4s
417        FMAX v27.4s, v27.4s, v31.4s
418        FMAX v28.4s, v28.4s, v31.4s
419        FMAX v29.4s, v29.4s, v31.4s
420
421        # Store full 5 x 8
422        B.LO 7f                       // fewer than 8 columns were left: partial store
423
424        $if INC:
425          SUB  x3,  x3, x2 // a0 -= kc
426          STP q28, q29, [x7]
427          ADD x7, x7, x14
428          SUB  x9,  x9, x2 // a1 -= kc
429          STP q26, q27, [x13]
430          ADD x13, x13, x14
431          SUB x10, x10, x2 // a2 -= kc
432          STP q24, q25, [x17]
433          ADD x17, x17, x14
434          SUB x11, x11, x2 // a3 -= kc
435          STP q22, q23, [x16]
436          ADD x16, x16, x14
437          SUB x12, x12, x2 // a4 -= kc
438          STP q20, q21,  [x6]
439          ADD  x6,  x6, x14
440        $else:
441          STP q20, q21,  [x6]
442          ADD  x6,  x6, x14
443          SUB  x3,  x3, x2 // a0 -= kc
444          STP q22, q23, [x16]
445          ADD x16, x16, x14
446          SUB  x9,  x9, x2 // a1 -= kc
447          STP q24, q25, [x17]
448          ADD x17, x17, x14
449          SUB x10, x10, x2 // a2 -= kc
450          STP q26, q27, [x13]
451          ADD x13, x13, x14
452          SUB x11, x11, x2 // a3 -= kc
453          STP q28, q29, [x7]
454          ADD x7, x7, x14
455          SUB x12, x12, x2 // a4 -= kc
456
457        B.HI 0b                       // columns remain: compute the next tile
458
459        # Restore d8-d9 and d12-d15 from stack
460        LDP d14, d15, [sp, 32]
461        LDP d12, d13, [sp, 16]
462        LDP  d8,  d9, [sp], 48
463        RET
464
465        # Remainder of k below 8 floats: bits 4, 3 and 2 of x0 select 4, 2 and 1 floats
4664:
467        # Is there a remainder?- 4 floats of A (16 bytes)
468        TBZ x0, 4, 5f
469
470        # Remainder- 4 floats of A (16 bytes)
471        # Load A
472        LDR   q0,  [x3], 16
473        LDR   q2,  [x9], 16
474        LDR   q4, [x10], 16
475        LDR   q6, [x11], 16
476        LDR   q8, [x12], 16
477        # Load B
478        LDP  q12,  q13, [x5], 32
479        LDP  q14,  q15, [x5], 32
480        LDP  q16,  q17, [x5], 32
481        LDP  q18,  q19, [x5], 32
482
483        FMLA v20.4s, v12.4s,  v0.s[0]
484        FMLA v22.4s, v12.4s,  v2.s[0]
485        FMLA v24.4s, v12.4s,  v4.s[0]
486        FMLA v26.4s, v12.4s,  v6.s[0]
487        FMLA v28.4s, v12.4s,  v8.s[0]
488        FMLA v21.4s, v13.4s,  v0.s[0]
489        FMLA v23.4s, v13.4s,  v2.s[0]
490        FMLA v25.4s, v13.4s,  v4.s[0]
491        FMLA v27.4s, v13.4s,  v6.s[0]
492        FMLA v29.4s, v13.4s,  v8.s[0]
493
494        FMLA v20.4s, v14.4s,  v0.s[1]
495        FMLA v22.4s, v14.4s,  v2.s[1]
496        FMLA v24.4s, v14.4s,  v4.s[1]
497        FMLA v26.4s, v14.4s,  v6.s[1]
498        FMLA v28.4s, v14.4s,  v8.s[1]
499        FMLA v21.4s, v15.4s,  v0.s[1]
500        FMLA v23.4s, v15.4s,  v2.s[1]
501        FMLA v25.4s, v15.4s,  v4.s[1]
502        FMLA v27.4s, v15.4s,  v6.s[1]
503        FMLA v29.4s, v15.4s,  v8.s[1]
504
505        FMLA v20.4s, v16.4s,  v0.s[2]
506        FMLA v22.4s, v16.4s,  v2.s[2]
507        FMLA v24.4s, v16.4s,  v4.s[2]
508        FMLA v26.4s, v16.4s,  v6.s[2]
509        FMLA v28.4s, v16.4s,  v8.s[2]
510        FMLA v21.4s, v17.4s,  v0.s[2]
511        FMLA v23.4s, v17.4s,  v2.s[2]
512        FMLA v25.4s, v17.4s,  v4.s[2]
513        FMLA v27.4s, v17.4s,  v6.s[2]
514        FMLA v29.4s, v17.4s,  v8.s[2]
515
516        FMLA v20.4s, v18.4s,  v0.s[3]
517        FMLA v22.4s, v18.4s,  v2.s[3]
518        FMLA v24.4s, v18.4s,  v4.s[3]
519        FMLA v26.4s, v18.4s,  v6.s[3]
520        FMLA v28.4s, v18.4s,  v8.s[3]
521        FMLA v21.4s, v19.4s,  v0.s[3]
522        FMLA v23.4s, v19.4s,  v2.s[3]
523        FMLA v25.4s, v19.4s,  v4.s[3]
524        FMLA v27.4s, v19.4s,  v6.s[3]
525        FMLA v29.4s, v19.4s,  v8.s[3]
526
527        # Is there a remainder?- 2 floats of A (8 bytes)
5285:
529        TBZ x0, 3, 6f
530
531        # Remainder- 2 floats of A (8 bytes)
532        # Load A
533        LDR   d0,  [x3], 8
534        LDR   d2,  [x9], 8
535        LDR   d4, [x10], 8
536        LDR   d6, [x11], 8
537        LDR   d8, [x12], 8
538        # Load B
539        LDP  q12,  q13, [x5], 32
540        LDP  q14,  q15, [x5], 32
541
542        FMLA v20.4s, v12.4s,  v0.s[0]
543        FMLA v22.4s, v12.4s,  v2.s[0]
544        FMLA v24.4s, v12.4s,  v4.s[0]
545        FMLA v26.4s, v12.4s,  v6.s[0]
546        FMLA v28.4s, v12.4s,  v8.s[0]
547        FMLA v21.4s, v13.4s,  v0.s[0]
548        FMLA v23.4s, v13.4s,  v2.s[0]
549        FMLA v25.4s, v13.4s,  v4.s[0]
550        FMLA v27.4s, v13.4s,  v6.s[0]
551        FMLA v29.4s, v13.4s,  v8.s[0]
552
553        FMLA v20.4s, v14.4s,  v0.s[1]
554        FMLA v22.4s, v14.4s,  v2.s[1]
555        FMLA v24.4s, v14.4s,  v4.s[1]
556        FMLA v26.4s, v14.4s,  v6.s[1]
557        FMLA v28.4s, v14.4s,  v8.s[1]
558        FMLA v21.4s, v15.4s,  v0.s[1]
559        FMLA v23.4s, v15.4s,  v2.s[1]
560        FMLA v25.4s, v15.4s,  v4.s[1]
561        FMLA v27.4s, v15.4s,  v6.s[1]
562        FMLA v29.4s, v15.4s,  v8.s[1]
563
564        # Is there a remainder?- 1 float of A (4 bytes)
5656:
566        TBZ x0, 2, 3b
567
568        # Remainder- 1 float of A (4 bytes)
569        # Load A
570        LDR   s0,  [x3], 4
571        LDR   s2,  [x9], 4
572        LDR   s4, [x10], 4
573        LDR   s6, [x11], 4
574        LDR   s8, [x12], 4
575        # Load B
576        LDP  q12,  q13, [x5], 32
577
578        FMLA v20.4s, v12.4s,  v0.s[0]
579        FMLA v22.4s, v12.4s,  v2.s[0]
580        FMLA v24.4s, v12.4s,  v4.s[0]
581        FMLA v26.4s, v12.4s,  v6.s[0]
582        FMLA v28.4s, v12.4s,  v8.s[0]
583        FMLA v21.4s, v13.4s,  v0.s[0]
584        FMLA v23.4s, v13.4s,  v2.s[0]
585        FMLA v25.4s, v13.4s,  v4.s[0]
586        FMLA v27.4s, v13.4s,  v6.s[0]
587        FMLA v29.4s, v13.4s,  v8.s[0]
588        B 3b
589
590        # Store odd width: bits 2, 1 and 0 of x1 select 4-, 2- and 1-column stores
5917:
592        TBZ x1, 2, 8f
593        $if INC:
594          STR q28, [x7], 16
595          MOV v28.16b, v29.16b
596          STR q26, [x13], 16
597          MOV v26.16b, v27.16b
598          STR q24, [x17], 16
599          MOV v24.16b, v25.16b
600          STR q22, [x16], 16
601          MOV v22.16b, v23.16b
602          STR q20,  [x6], 16
603          MOV v20.16b, v21.16b
604        $else:
605          STR q20,  [x6], 16
606          MOV v20.16b, v21.16b
607          STR q22, [x16], 16
608          MOV v22.16b, v23.16b
609          STR q24, [x17], 16
610          MOV v24.16b, v25.16b
611          STR q26, [x13], 16
612          MOV v26.16b, v27.16b
613          STR q28, [x7], 16
614          MOV v28.16b, v29.16b
6158:
616        TBZ x1, 1, 9f
617        $if INC:
618          STR d28, [x7], 8
619          DUP d28, v28.d[1]
620          STR d26, [x13], 8
621          DUP d26, v26.d[1]
622          STR d24, [x17], 8
623          DUP d24, v24.d[1]
624          STR d22, [x16], 8
625          DUP d22, v22.d[1]
626          STR d20,  [x6], 8
627          DUP d20, v20.d[1]
628        $else:
629          STR d20,  [x6], 8
630          DUP d20, v20.d[1]
631          STR d22, [x16], 8
632          DUP d22, v22.d[1]
633          STR d24, [x17], 8
634          DUP d24, v24.d[1]
635          STR d26, [x13], 8
636          DUP d26, v26.d[1]
637          STR d28, [x7], 8
638          DUP d28, v28.d[1]
639
6409:
641        TBZ x1, 0, 10f
642        $if INC:
643          STR s28, [x7]
644          STR s26, [x13]
645          STR s24, [x17]
646          STR s22, [x16]
647          STR s20,  [x6]
648        $else:
649          STR s20,  [x6]
650          STR s22, [x16]
651          STR s24, [x17]
652          STR s26, [x13]
653          STR s28, [x7]
65410:
655        # Restore d8-d9 and d12-d15 from stack
656        LDP d14, d15, [sp, 32]
657        LDP d12, d13, [sp, 16]
658        LDP  d8,  d9, [sp], 48
659        RET
660
661END_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_ukernel_5x8__aarch64_neonfma_cortex_${"a75" if PREFETCH else "a57"}
662
663#ifdef __ELF__
664.section ".note.GNU-stack","",%progbits
665#endif
666