// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/5x8-aarch64-neonfma-cortex-a75.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# f32 GEMM micro-kernel producing up to 5 rows x 8 columns of C per pass.
# Accumulators start from the 8 bias floats at the front of each packed w
# panel, FMLA accumulates A * B over kc bytes of each A row, and the result is
# clamped and stored.  The pass repeats (A pointers rewound by kc, C pointers
# advanced by cn_stride) until nc is exhausted.  kc is a byte count.
#
# void xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a57(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x14
#     const union xnn_f32_output_params params[restrict static 1])  [sp + 8] -> x8

# Registers left unused relative to the 6-row layout these names come from
# (a5/c5/A5 would be a sixth row) - presumably the 6x8 variant; TODO confirm.
#  x4 a5       (x4 stays a_stride here)
#  x7 c5       (x7 becomes c4 here)
# A5  v10 v11
# C   v30 v31  (hold the clamp bounds here)

# d8-d15 need to be preserved if used.
# x19-x30 need to be preserved if used.  x18 is reserved for OS.
# This kernel touches v8, v9 and v12-v15, so d8, d9 and d12-d15 are saved;
# v10 and v11 are never written.

# A pointers
#  x3 a0
#  x9 a1
# x10 a2
# x11 a3
# x12 a4

# C pointers
#  x6 c0
# x16 c1
# x17 c2
# x13 c3
#  x7 c4   (x7 carries cm_stride on entry and is overwritten once c4 is formed)

# Vector register usage
# A0   v0  v1
# A1   v2  v3
# A2   v4  v5
# A3   v6  v7
# A4   v8  v9
# B   v12 v13 v14 v15
# B   v16 v17 v18 v19
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# Clamp v30 v31   (v30 applied with FMIN, v31 applied with FMAX)

BEGIN_FUNCTION xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a57

        # Clamp A and C pointers / Save d8-d15 on stack
        # 48-byte frame: d8,d9 at [sp], d12,d13 at [sp+16], d14,d15 at [sp+32].
        # Rows at or beyond mr alias the previous row so loads and stores stay
        # inside valid memory.
        STP  d8,  d9, [sp, -48]!
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        STP d12, d13, [sp, 16]
        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2  (flags still from CMP x0, 2)
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        STP d14, d15, [sp, 32]
        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x13, x17, x7         // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x13, x17, x13, LO   //   c3 = c2

        # Load params pointer
        # Stack arguments moved up by the 48-byte frame: params is the second
        # stack argument, originally at [sp + 8].
        LDR x8, [sp, 56]

        ADD x12, x11, x4         // a4 = a3 + a_stride
        ADD x7, x13, x7          // c4 = c3 + cm_stride (cm_stride is dead after this)
                                 // if mr <= 4  (flags still from CMP x0, 4)
        CSEL x12, x11, x12, LS   //   a4 = a3
        CSEL x7, x13, x7, LS     //   c4 = c3

        # Load clamp values
        # LD2R replicates the first float at [x8] into v30 and the second into v31.
        LD2R {v30.4s, v31.4s}, [x8]

        # Load cn_stride
        # First stack argument, originally at [sp].
        LDR x14, [sp, 48]

0:
        # Load initial bias from w into accumulators
        LDP q20, q21, [x5], 32
        MOV v22.16b, v20.16b
        MOV v23.16b, v21.16b
        MOV v24.16b, v20.16b
        MOV v25.16b, v21.16b
        MOV v26.16b, v20.16b
        MOV v27.16b, v21.16b
        MOV v28.16b, v20.16b
        MOV v29.16b, v21.16b

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        SUBS x0, x2, 32  // k = kc - 32
        B.LO 4f

        # Prologue - loads for main loop of 80 FMA
        LDR   q0,  [x3], 16
        LDR   q2,  [x9], 16
        LDR   q4, [x10], 16
        LDR   q6, [x11], 16
        LDR   q8, [x12], 16
        LDP  q12,  q13, [x5], 32  // Fetch 3 B (4th deferred)
        LDP  q14,  q15, [x5], 32
        LDP  q16,  q17, [x5], 32

        # Is there at least 8 floats (32 bytes) for main loop?
        SUBS x0, x0, 32
        B.LO 2f

        # Main loop - 8 floats of A (32 bytes)
        # Per iteration: 80 FMA + 10 LDR A + 8 LDP B
1:
        # First group of 4 A.  40 FMA.
        FMLA v20.4s, v12.4s,  v0.s[0]
        LDP  q18,  q19, [x5], 32      // Load last B
        FMLA v22.4s, v12.4s,  v2.s[0]
        FMLA v24.4s, v12.4s,  v4.s[0]
        FMLA v26.4s, v12.4s,  v6.s[0]
        FMLA v28.4s, v12.4s,  v8.s[0]
        FMLA v21.4s, v13.4s,  v0.s[0]
        FMLA v23.4s, v13.4s,  v2.s[0]
        FMLA v25.4s, v13.4s,  v4.s[0]
        FMLA v27.4s, v13.4s,  v6.s[0]
        FMLA v29.4s, v13.4s,  v8.s[0]
        LDR   q1,  [x3], 16            // Load next 5 A

        FMLA v20.4s, v14.4s,  v0.s[1]
        FMLA v22.4s, v14.4s,  v2.s[1]
        FMLA v24.4s, v14.4s,  v4.s[1]
        LDR   q3,  [x9], 16
        FMLA v26.4s, v14.4s,  v6.s[1]
        FMLA v28.4s, v14.4s,  v8.s[1]
        FMLA v21.4s, v15.4s,  v0.s[1]
        LDR   q5, [x10], 16
        FMLA v23.4s, v15.4s,  v2.s[1]
        FMLA v25.4s, v15.4s,  v4.s[1]
        FMLA v27.4s, v15.4s,  v6.s[1]
        LDR   q7, [x11], 16
        FMLA v29.4s, v15.4s,  v8.s[1]

        FMLA v20.4s, v16.4s,  v0.s[2]
        FMLA v22.4s, v16.4s,  v2.s[2]
        LDR   q9, [x12], 16
        FMLA v24.4s, v16.4s,  v4.s[2]
        FMLA v26.4s, v16.4s,  v6.s[2]
        FMLA v28.4s, v16.4s,  v8.s[2]
        LDP  q12,  q13, [x5], 32       // Load 4 B
        FMLA v21.4s, v17.4s,  v0.s[2]
        FMLA v23.4s, v17.4s,  v2.s[2]
        FMLA v25.4s, v17.4s,  v4.s[2]
        LDP  q14,  q15, [x5], 32
        FMLA v27.4s, v17.4s,  v6.s[2]
        FMLA v29.4s, v17.4s,  v8.s[2]

        FMLA v20.4s, v18.4s,  v0.s[3]
        LDP  q16,  q17, [x5], 32
        FMLA v22.4s, v18.4s,  v2.s[3]
        FMLA v24.4s, v18.4s,  v4.s[3]
        FMLA v26.4s, v18.4s,  v6.s[3]
        FMLA v28.4s, v18.4s,  v8.s[3]
        FMLA v21.4s, v19.4s,  v0.s[3]
        FMLA v23.4s, v19.4s,  v2.s[3]
        FMLA v25.4s, v19.4s,  v4.s[3]
        FMLA v27.4s, v19.4s,  v6.s[3]
        FMLA v29.4s, v19.4s,  v8.s[3]
        LDP  q18,  q19, [x5], 32

        # Second group of 4 A.  40 FMA.
        FMLA v20.4s, v12.4s,  v1.s[0]
        FMLA v22.4s, v12.4s,  v3.s[0]
        FMLA v24.4s, v12.4s,  v5.s[0]
        LDR   q0,  [x3], 16           // Load next 5 A
        FMLA v26.4s, v12.4s,  v7.s[0]
        FMLA v28.4s, v12.4s,  v9.s[0]
        FMLA v21.4s, v13.4s,  v1.s[0]
        LDR   q2,  [x9], 16
        FMLA v23.4s, v13.4s,  v3.s[0]
        FMLA v25.4s, v13.4s,  v5.s[0]
        FMLA v27.4s, v13.4s,  v7.s[0]
        LDR   q4, [x10], 16
        FMLA v29.4s, v13.4s,  v9.s[0]

        FMLA v20.4s, v14.4s,  v1.s[1]
        FMLA v22.4s, v14.4s,  v3.s[1]
        LDR   q6, [x11], 16
        FMLA v24.4s, v14.4s,  v5.s[1]
        FMLA v26.4s, v14.4s,  v7.s[1]
        FMLA v28.4s, v14.4s,  v9.s[1]
        LDR   q8, [x12], 16
        FMLA v21.4s, v15.4s,  v1.s[1]
        FMLA v23.4s, v15.4s,  v3.s[1]
        FMLA v25.4s, v15.4s,  v5.s[1]
        LDP  q12,  q13, [x5], 32       // Load next 3 B (not last)
        FMLA v27.4s, v15.4s,  v7.s[1]
        FMLA v29.4s, v15.4s,  v9.s[1]

        FMLA v20.4s, v16.4s,  v1.s[2]
        LDP  q14,  q15, [x5], 32
        FMLA v22.4s, v16.4s,  v3.s[2]
        FMLA v24.4s, v16.4s,  v5.s[2]
        FMLA v26.4s, v16.4s,  v7.s[2]
        FMLA v28.4s, v16.4s,  v9.s[2]
        FMLA v21.4s, v17.4s,  v1.s[2]
        FMLA v23.4s, v17.4s,  v3.s[2]
        FMLA v25.4s, v17.4s,  v5.s[2]
        FMLA v27.4s, v17.4s,  v7.s[2]
        FMLA v29.4s, v17.4s,  v9.s[2]
        LDP  q16,  q17, [x5], 32

        FMLA v20.4s, v18.4s,  v1.s[3]
        FMLA v22.4s, v18.4s,  v3.s[3]
        SUBS x0, x0, 32
        FMLA v24.4s, v18.4s,  v5.s[3]
        FMLA v26.4s, v18.4s,  v7.s[3]
        FMLA v28.4s, v18.4s,  v9.s[3]
        FMLA v21.4s, v19.4s,  v1.s[3]
        FMLA v23.4s, v19.4s,  v3.s[3]
        FMLA v25.4s, v19.4s,  v5.s[3]
        FMLA v27.4s, v19.4s,  v7.s[3]
        FMLA v29.4s, v19.4s,  v9.s[3]
        B.HS 1b

        # Epilogue - 8 floats of A (32 bytes)
        # Totals: 80 FMA + 5 LDR A + 5 LDP B
        # First block same as main loop.  Second block has no preloads.
2:
        # First group of 4 A.  40 FMA.
        FMLA v20.4s, v12.4s,  v0.s[0]
        LDP  q18,  q19, [x5], 32      // Load last B
        FMLA v22.4s, v12.4s,  v2.s[0]
        FMLA v24.4s, v12.4s,  v4.s[0]
        FMLA v26.4s, v12.4s,  v6.s[0]
        FMLA v28.4s, v12.4s,  v8.s[0]
        FMLA v21.4s, v13.4s,  v0.s[0]
        FMLA v23.4s, v13.4s,  v2.s[0]
        FMLA v25.4s, v13.4s,  v4.s[0]
        FMLA v27.4s, v13.4s,  v6.s[0]
        FMLA v29.4s, v13.4s,  v8.s[0]
        LDR   q1,  [x3], 16            // Load next 5 A

        FMLA v20.4s, v14.4s,  v0.s[1]
        FMLA v22.4s, v14.4s,  v2.s[1]
        FMLA v24.4s, v14.4s,  v4.s[1]
        LDR   q3,  [x9], 16
        FMLA v26.4s, v14.4s,  v6.s[1]
        FMLA v28.4s, v14.4s,  v8.s[1]
        FMLA v21.4s, v15.4s,  v0.s[1]
        LDR   q5, [x10], 16
        FMLA v23.4s, v15.4s,  v2.s[1]
        FMLA v25.4s, v15.4s,  v4.s[1]
        FMLA v27.4s, v15.4s,  v6.s[1]
        LDR   q7, [x11], 16
        FMLA v29.4s, v15.4s,  v8.s[1]

        FMLA v20.4s, v16.4s,  v0.s[2]
        FMLA v22.4s, v16.4s,  v2.s[2]
        LDR   q9, [x12], 16
        FMLA v24.4s, v16.4s,  v4.s[2]
        FMLA v26.4s, v16.4s,  v6.s[2]
        FMLA v28.4s, v16.4s,  v8.s[2]
        LDP  q12,  q13, [x5], 32       // Load 4 B
        FMLA v21.4s, v17.4s,  v0.s[2]
        FMLA v23.4s, v17.4s,  v2.s[2]
        FMLA v25.4s, v17.4s,  v4.s[2]
        LDP  q14,  q15, [x5], 32
        FMLA v27.4s, v17.4s,  v6.s[2]
        FMLA v29.4s, v17.4s,  v8.s[2]

        FMLA v20.4s, v18.4s,  v0.s[3]
        LDP  q16,  q17, [x5], 32
        FMLA v22.4s, v18.4s,  v2.s[3]
        FMLA v24.4s, v18.4s,  v4.s[3]
        FMLA v26.4s, v18.4s,  v6.s[3]
        FMLA v28.4s, v18.4s,  v8.s[3]
        FMLA v21.4s, v19.4s,  v0.s[3]
        FMLA v23.4s, v19.4s,  v2.s[3]
        FMLA v25.4s, v19.4s,  v4.s[3]
        FMLA v27.4s, v19.4s,  v6.s[3]
        FMLA v29.4s, v19.4s,  v8.s[3]
        LDP  q18,  q19, [x5], 32

        # Second group of 4 A.  40 FMA.
        FMLA v20.4s, v12.4s,  v1.s[0]
        FMLA v22.4s, v12.4s,  v3.s[0]
        FMLA v24.4s, v12.4s,  v5.s[0]
        FMLA v26.4s, v12.4s,  v7.s[0]
        FMLA v28.4s, v12.4s,  v9.s[0]
        FMLA v21.4s, v13.4s,  v1.s[0]
        FMLA v23.4s, v13.4s,  v3.s[0]
        FMLA v25.4s, v13.4s,  v5.s[0]
        FMLA v27.4s, v13.4s,  v7.s[0]
        FMLA v29.4s, v13.4s,  v9.s[0]

        FMLA v20.4s, v14.4s,  v1.s[1]
        FMLA v22.4s, v14.4s,  v3.s[1]
        FMLA v24.4s, v14.4s,  v5.s[1]
        FMLA v26.4s, v14.4s,  v7.s[1]
        FMLA v28.4s, v14.4s,  v9.s[1]
        FMLA v21.4s, v15.4s,  v1.s[1]
        FMLA v23.4s, v15.4s,  v3.s[1]
        FMLA v25.4s, v15.4s,  v5.s[1]
        FMLA v27.4s, v15.4s,  v7.s[1]
        FMLA v29.4s, v15.4s,  v9.s[1]

        FMLA v20.4s, v16.4s,  v1.s[2]
        FMLA v22.4s, v16.4s,  v3.s[2]
        FMLA v24.4s, v16.4s,  v5.s[2]
        FMLA v26.4s, v16.4s,  v7.s[2]
        FMLA v28.4s, v16.4s,  v9.s[2]
        FMLA v21.4s, v17.4s,  v1.s[2]
        FMLA v23.4s, v17.4s,  v3.s[2]
        FMLA v25.4s, v17.4s,  v5.s[2]
        FMLA v27.4s, v17.4s,  v7.s[2]
        FMLA v29.4s, v17.4s,  v9.s[2]
        TST x0, 31                     // any k remainder (low 5 bits of the byte count)?

        FMLA v20.4s, v18.4s,  v1.s[3]
        FMLA v22.4s, v18.4s,  v3.s[3]
        FMLA v24.4s, v18.4s,  v5.s[3]
        FMLA v26.4s, v18.4s,  v7.s[3]
        FMLA v28.4s, v18.4s,  v9.s[3]
        FMLA v21.4s, v19.4s,  v1.s[3]
        FMLA v23.4s, v19.4s,  v3.s[3]
        FMLA v25.4s, v19.4s,  v5.s[3]
        FMLA v27.4s, v19.4s,  v7.s[3]
        FMLA v29.4s, v19.4s,  v9.s[3]
        B.NE 4f

        # Clamp
        # The SUBS below sets the flags consumed by B.LO 7f and B.HI 0b; the
        # instructions in between (FMIN, FMAX, STP, ADD, SUB) leave flags alone.
3:
        FMIN v20.4s, v20.4s, v30.4s
        SUBS x1, x1, 8
        FMIN v21.4s, v21.4s, v30.4s
        FMIN v22.4s, v22.4s, v30.4s
        FMIN v23.4s, v23.4s, v30.4s
        FMIN v24.4s, v24.4s, v30.4s
        FMIN v25.4s, v25.4s, v30.4s
        FMIN v26.4s, v26.4s, v30.4s
        FMIN v27.4s, v27.4s, v30.4s
        FMIN v28.4s, v28.4s, v30.4s
        FMIN v29.4s, v29.4s, v30.4s
        FMAX v20.4s, v20.4s, v31.4s
        FMAX v21.4s, v21.4s, v31.4s
        FMAX v22.4s, v22.4s, v31.4s
        FMAX v23.4s, v23.4s, v31.4s
        FMAX v24.4s, v24.4s, v31.4s
        FMAX v25.4s, v25.4s, v31.4s
        FMAX v26.4s, v26.4s, v31.4s
        FMAX v27.4s, v27.4s, v31.4s
        FMAX v28.4s, v28.4s, v31.4s
        FMAX v29.4s, v29.4s, v31.4s

        # Store full 5 x 8
        B.LO 7f                  // fewer than 8 columns left: partial store

        STP q20, q21,  [x6]
        ADD  x6,  x6, x14
        SUB  x3,  x3, x2 // a0 -= kc
        STP q22, q23, [x16]
        ADD x16, x16, x14
        SUB  x9,  x9, x2 // a1 -= kc
        STP q24, q25, [x17]
        ADD x17, x17, x14
        SUB x10, x10, x2 // a2 -= kc
        STP q26, q27, [x13]
        ADD x13, x13, x14
        SUB x11, x11, x2 // a3 -= kc
        STP q28, q29, [x7]
        ADD x7, x7, x14
        SUB x12, x12, x2 // a4 -= kc

        B.HI 0b                  // more columns remain: next 8-column panel

        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 32]
        LDP d12, d13, [sp, 16]
        LDP  d8,  d9, [sp], 48
        RET

        # Remainder of k: x0 is kc minus a multiple of 32, so bits 4, 3 and 2
        # of x0 select the 16-, 8- and 4-byte tails.
4:
        # Is there a remainder?- 4 floats of A (16 bytes)
        TBZ x0, 4, 5f

        # Remainder- 4 floats of A (16 bytes)
        # Load A
        LDR   q0,  [x3], 16
        LDR   q2,  [x9], 16
        LDR   q4, [x10], 16
        LDR   q6, [x11], 16
        LDR   q8, [x12], 16
        # Load B
        LDP  q12,  q13, [x5], 32
        LDP  q14,  q15, [x5], 32
        LDP  q16,  q17, [x5], 32
        LDP  q18,  q19, [x5], 32

        FMLA v20.4s, v12.4s,  v0.s[0]
        FMLA v22.4s, v12.4s,  v2.s[0]
        FMLA v24.4s, v12.4s,  v4.s[0]
        FMLA v26.4s, v12.4s,  v6.s[0]
        FMLA v28.4s, v12.4s,  v8.s[0]
        FMLA v21.4s, v13.4s,  v0.s[0]
        FMLA v23.4s, v13.4s,  v2.s[0]
        FMLA v25.4s, v13.4s,  v4.s[0]
        FMLA v27.4s, v13.4s,  v6.s[0]
        FMLA v29.4s, v13.4s,  v8.s[0]

        FMLA v20.4s, v14.4s,  v0.s[1]
        FMLA v22.4s, v14.4s,  v2.s[1]
        FMLA v24.4s, v14.4s,  v4.s[1]
        FMLA v26.4s, v14.4s,  v6.s[1]
        FMLA v28.4s, v14.4s,  v8.s[1]
        FMLA v21.4s, v15.4s,  v0.s[1]
        FMLA v23.4s, v15.4s,  v2.s[1]
        FMLA v25.4s, v15.4s,  v4.s[1]
        FMLA v27.4s, v15.4s,  v6.s[1]
        FMLA v29.4s, v15.4s,  v8.s[1]

        FMLA v20.4s, v16.4s,  v0.s[2]
        FMLA v22.4s, v16.4s,  v2.s[2]
        FMLA v24.4s, v16.4s,  v4.s[2]
        FMLA v26.4s, v16.4s,  v6.s[2]
        FMLA v28.4s, v16.4s,  v8.s[2]
        FMLA v21.4s, v17.4s,  v0.s[2]
        FMLA v23.4s, v17.4s,  v2.s[2]
        FMLA v25.4s, v17.4s,  v4.s[2]
        FMLA v27.4s, v17.4s,  v6.s[2]
        FMLA v29.4s, v17.4s,  v8.s[2]

        FMLA v20.4s, v18.4s,  v0.s[3]
        FMLA v22.4s, v18.4s,  v2.s[3]
        FMLA v24.4s, v18.4s,  v4.s[3]
        FMLA v26.4s, v18.4s,  v6.s[3]
        FMLA v28.4s, v18.4s,  v8.s[3]
        FMLA v21.4s, v19.4s,  v0.s[3]
        FMLA v23.4s, v19.4s,  v2.s[3]
        FMLA v25.4s, v19.4s,  v4.s[3]
        FMLA v27.4s, v19.4s,  v6.s[3]
        FMLA v29.4s, v19.4s,  v8.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes)
5:
        TBZ x0, 3, 6f

        # Remainder- 2 floats of A (8 bytes)
        # Load A
        LDR   d0,  [x3], 8
        LDR   d2,  [x9], 8
        LDR   d4, [x10], 8
        LDR   d6, [x11], 8
        LDR   d8, [x12], 8
        # Load B
        LDP  q12,  q13, [x5], 32
        LDP  q14,  q15, [x5], 32

        FMLA v20.4s, v12.4s,  v0.s[0]
        FMLA v22.4s, v12.4s,  v2.s[0]
        FMLA v24.4s, v12.4s,  v4.s[0]
        FMLA v26.4s, v12.4s,  v6.s[0]
        FMLA v28.4s, v12.4s,  v8.s[0]
        FMLA v21.4s, v13.4s,  v0.s[0]
        FMLA v23.4s, v13.4s,  v2.s[0]
        FMLA v25.4s, v13.4s,  v4.s[0]
        FMLA v27.4s, v13.4s,  v6.s[0]
        FMLA v29.4s, v13.4s,  v8.s[0]

        FMLA v20.4s, v14.4s,  v0.s[1]
        FMLA v22.4s, v14.4s,  v2.s[1]
        FMLA v24.4s, v14.4s,  v4.s[1]
        FMLA v26.4s, v14.4s,  v6.s[1]
        FMLA v28.4s, v14.4s,  v8.s[1]
        FMLA v21.4s, v15.4s,  v0.s[1]
        FMLA v23.4s, v15.4s,  v2.s[1]
        FMLA v25.4s, v15.4s,  v4.s[1]
        FMLA v27.4s, v15.4s,  v6.s[1]
        FMLA v29.4s, v15.4s,  v8.s[1]

        # Is there a remainder?- 1 float of A (4 bytes)
6:
        TBZ x0, 2, 3b

        # Remainder- 1 float of A (4 bytes)
        # Load A
        LDR   s0,  [x3], 4
        LDR   s2,  [x9], 4
        LDR   s4, [x10], 4
        LDR   s6, [x11], 4
        LDR   s8, [x12], 4
        # Load B
        LDP  q12,  q13, [x5], 32

        FMLA v20.4s, v12.4s,  v0.s[0]
        FMLA v22.4s, v12.4s,  v2.s[0]
        FMLA v24.4s, v12.4s,  v4.s[0]
        FMLA v26.4s, v12.4s,  v6.s[0]
        FMLA v28.4s, v12.4s,  v8.s[0]
        FMLA v21.4s, v13.4s,  v0.s[0]
        FMLA v23.4s, v13.4s,  v2.s[0]
        FMLA v25.4s, v13.4s,  v4.s[0]
        FMLA v27.4s, v13.4s,  v6.s[0]
        FMLA v29.4s, v13.4s,  v8.s[0]
        B 3b

        # Store odd width
        # x1 is nc - 8 here, so bits 2, 1 and 0 of x1 equal those of nc and
        # select the 4-, 2- and 1-column stores.  After each partial store the
        # remaining lanes are shifted down into the low part of the register.
7:
        TBZ x1, 2, 8f
        STR q20,  [x6], 16
        MOV v20.16b, v21.16b
        STR q22, [x16], 16
        MOV v22.16b, v23.16b
        STR q24, [x17], 16
        MOV v24.16b, v25.16b
        STR q26, [x13], 16
        MOV v26.16b, v27.16b
        STR q28, [x7], 16
        MOV v28.16b, v29.16b
8:
        TBZ x1, 1, 9f
        STR d20,  [x6], 8
        DUP d20, v20.d[1]
        STR d22, [x16], 8
        DUP d22, v22.d[1]
        STR d24, [x17], 8
        DUP d24, v24.d[1]
        STR d26, [x13], 8
        DUP d26, v26.d[1]
        STR d28, [x7], 8
        DUP d28, v28.d[1]

9:
        TBZ x1, 0, 10f
        STR s20,  [x6]
        STR s22, [x16]
        STR s24, [x17]
        STR s26, [x13]
        STR s28, [x7]
10:
        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 32]
        LDP d12, d13, [sp, 16]
        LDP  d8,  d9, [sp], 48
        RET

END_FUNCTION xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a57

// Emit an empty .note.GNU-stack section on ELF targets; by GNU toolchain
// convention this marks the object as not needing an executable stack.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif
