• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1// Copyright 2019 Google LLC
2//
3// This source code is licensed under the BSD-style license found in the
4// LICENSE file in the root directory of this source tree.
5
6#include <xnnpack/assembly.h>
7
# void xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_${"a75" if PREFETCH else "a57"}(
#     size_t mr,                         x0
#     size_t nc,                         x1
#     size_t kc,                         x2 / x0
#     size_t ks,                         x3 / x9
#     const float**restrict a,           x4
#     const void*restrict w,             x5
#     uint8_t*restrict c,                x6
#     size_t cm_stride,                  x7
#     size_t cn_stride,                  [sp] -> x10
#     size_t a_offset,                   [sp + 8] -> x11
#     const float* zero,                 [sp + 16] -> x12
#     const xnn_f32_output_params params [sp + 24] -> x8
#
# Indirect GEMM microkernel producing a tile of up to 5 rows by 8 columns of
# f32 output.  Accumulators start from the bias at w.  Every group of 5 A
# pointers (ks bytes of pointers in total) contributes kc bytes of A times the
# B stream at w.  The result is clamped and stored through c0-c4, then the
# kernel advances by cn_stride and repeats while columns remain in nc.
#
# Stack argument offsets above are relative to sp on entry.  The kernel pushes
# a 64 byte frame, so inside the body they are found at sp + 64 and up.
# Frame layout: [sp] d8 d9, [sp + 16] d12 d13, [sp + 32] d14 d15,
# [sp + 48] x20 x21.
#
# x8 first holds the params pointer and is reused as a4 once the clamp values
# have been loaded.  x0 (mr) is reused as the k counter once the C pointers
# have been clamped.

# 5x8 strips the following out of 6x8
# x23 a5
#  x7 c5  x13 unused
# A5  v10 v11
# C   v30 v31

# d8-d15 need to be preserved if used.
# x19-x30 need to be preserved if used.  x18 is reserved for OS.

# A pointers
# x14 a0
# x15 a1
# x20 a2
# x21 a3
#  x8 a4

# C pointers
#  x6 c0
# x16 c1
# x17 c2
# x13 c3
#  x7 c4

# Vector register usage
# A0   v0  v1
# A1   v2  v3
# A2   v4  v5
# A3   v6  v7
# A4   v8  v9
# B   v12 v13 v14 v15
# B   v16 v17 v18 v19
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# Clamp v30 v31

BEGIN_FUNCTION xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_${"a75" if PREFETCH else "a57"}

        # Clamp C pointers / Save d8-d9, d12-d15 on stack
        # Rows at or beyond mr alias the previous row so their stores are harmless.
        STP  d8,  d9, [sp, -64]!
        CMP x0, 2                // if mr < 2
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x16, x6, x16, LO    //   c1 = c0

        STP d12, d13, [sp, 16]
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2
        CSEL x17, x16, x17, LS   //   c2 = c1

        STP d14, d15, [sp, 32]
        CMP x0, 4                // if mr < 4
        ADD x13, x17, x7         // c3 = c2 + cm_stride
        CSEL x13, x17, x13, LO   //   c3 = c2

        # Load zero, clamping params pointer
        LDP x12, x8, [sp, 80]
        ADD x7, x13, x7          // c4 = c3 + cm_stride
                                 // if mr <= 4
        CSEL x7, x13, x7, LS     //   c4 = c3

        # Save x20,x21 on stack
        STP x20, x21, [sp, 48]

        # Load clamp values
        # v30 is the upper bound (used by FMIN), v31 the lower bound (used by FMAX)
        LD2R {v30.4s, v31.4s}, [x8]

        # Load cn_stride, a_offset
        LDP x10, x11, [sp, 64]

0:
        # Load initial bias from w into accumulators
        LDP q20, q21, [x5], 32
        MOV v22.16b, v20.16b
        MOV v23.16b, v21.16b
        $if PREFETCH:
          PRFM PLDL1KEEP, [x5, 0]  // Prefetch B
        MOV v24.16b, v20.16b
        MOV v25.16b, v21.16b
        $if PREFETCH:
          PRFM PLDL1KEEP, [x5, 64]
        MOV v26.16b, v20.16b
        MOV v27.16b, v21.16b
        $if PREFETCH:
          PRFM PLDL1KEEP, [x5, 128]
        MOV v28.16b, v20.16b
        MOV v29.16b, v21.16b
        $if PREFETCH:
          PRFM PLDL1KEEP, [x5, 192]

        MOV x9, x3  // p = ks

1:
        # Load next 5 A pointers
        LDP x14, x15, [x4], 16
        LDP x20, x21, [x4], 16
        LDR x8, [x4], 8

        # Pointers equal to zero are used as is, all others get a_offset added
        CMP x14, x12            // if a0 == zero
        ADD x14, x14, x11       // a0 += a_offset
        CSEL x14, x12, x14, EQ  //   a0 = zero, else a0 + a_offset
        CMP x15, x12            // if a1 == zero
        ADD x15, x15, x11       // a1 += a_offset
        CSEL x15, x12, x15, EQ  //   a1 = zero, else a1 + a_offset
        CMP x20, x12            // if a2 == zero
        ADD x20, x20, x11       // a2 += a_offset
        CSEL x20, x12, x20, EQ  //   a2 = zero, else a2 + a_offset
        CMP x21, x12            // if a3 == zero
        ADD x21, x21, x11       // a3 += a_offset
        CSEL x21, x12, x21, EQ  //   a3 = zero, else a3 + a_offset
        CMP x8, x12            // if a4 == zero
        ADD x8, x8, x11       // a4 += a_offset
        CSEL x8, x12, x8, EQ  //   a4 = zero, else a4 + a_offset

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        SUBS x0, x2, 32  // k = kc - 32
        B.LO 5f

        # Prologue - loads for main loop of 80 FMA
        LDR   q0, [x14], 16
        LDR   q2, [x15], 16
        LDR   q4, [x20], 16
        LDR   q6, [x21], 16
        LDR   q8, [x8], 16
        LDP  q12, q13, [x5], 32  // Fetch 3 B (4th deferred)
        LDP  q14, q15, [x5], 32
        LDP  q16, q17, [x5], 32

        # Is there at least 8 floats (32 bytes) for main loop?
        SUBS x0, x0, 32
        B.LO 3f

        # Main loop - 8 floats of A (32 bytes)
        # 80 FMA + 10 LDR A + 8 LDP B
2:
        # First group of 4 A.  40 FMA.
        FMLA v20.4s, v12.4s,  v0.s[0]
        LDP  q18, q19, [x5], 32        // Load last B
        FMLA v22.4s, v12.4s,  v2.s[0]
        FMLA v24.4s, v12.4s,  v4.s[0]
        FMLA v26.4s, v12.4s,  v6.s[0]
        $if PREFETCH:
          PRFM PLDL1KEEP, [x5, 128]      // Prefetch B
        FMLA v28.4s, v12.4s,  v8.s[0]
        FMLA v21.4s, v13.4s,  v0.s[0]
        FMLA v23.4s, v13.4s,  v2.s[0]
        $if PREFETCH:
          PRFM PLDL1KEEP, [x5, 256]
        FMLA v25.4s, v13.4s,  v4.s[0]
        FMLA v27.4s, v13.4s,  v6.s[0]
        FMLA v29.4s, v13.4s,  v8.s[0]
        LDR   q1, [x14], 16            // Load next 5 A

        FMLA v20.4s, v14.4s,  v0.s[1]
        FMLA v22.4s, v14.4s,  v2.s[1]
        FMLA v24.4s, v14.4s,  v4.s[1]
        LDR   q3, [x15], 16
        FMLA v26.4s, v14.4s,  v6.s[1]
        FMLA v28.4s, v14.4s,  v8.s[1]
        FMLA v21.4s, v15.4s,  v0.s[1]
        LDR   q5, [x20], 16
        FMLA v23.4s, v15.4s,  v2.s[1]
        FMLA v25.4s, v15.4s,  v4.s[1]
        FMLA v27.4s, v15.4s,  v6.s[1]
        LDR   q7, [x21], 16
        FMLA v29.4s, v15.4s,  v8.s[1]

        FMLA v20.4s, v16.4s,  v0.s[2]
        FMLA v22.4s, v16.4s,  v2.s[2]
        LDR   q9, [x8], 16
        FMLA v24.4s, v16.4s,  v4.s[2]
        FMLA v26.4s, v16.4s,  v6.s[2]
        FMLA v28.4s, v16.4s,  v8.s[2]
        LDP  q12, q13, [x5], 32        // Load 4 B
        FMLA v21.4s, v17.4s,  v0.s[2]
        FMLA v23.4s, v17.4s,  v2.s[2]
        FMLA v25.4s, v17.4s,  v4.s[2]
        FMLA v27.4s, v17.4s,  v6.s[2]
        FMLA v29.4s, v17.4s,  v8.s[2]

        FMLA v20.4s, v18.4s,  v0.s[3]
        FMLA v22.4s, v18.4s,  v2.s[3]
        FMLA v24.4s, v18.4s,  v4.s[3]
        FMLA v26.4s, v18.4s,  v6.s[3]
        LDP  q14, q15, [x5], 32
        FMLA v28.4s, v18.4s,  v8.s[3]
        FMLA v21.4s, v19.4s,  v0.s[3]
        FMLA v23.4s, v19.4s,  v2.s[3]
        LDP  q16, q17, [x5], 32
        FMLA v25.4s, v19.4s,  v4.s[3]
        FMLA v27.4s, v19.4s,  v6.s[3]
        FMLA v29.4s, v19.4s,  v8.s[3]
        LDP  q18, q19, [x5], 32

        # Second group of 4 A.  40 FMA.
        FMLA v20.4s, v12.4s,  v1.s[0]
        FMLA v22.4s, v12.4s,  v3.s[0]
        FMLA v24.4s, v12.4s,  v5.s[0]
        LDR   q0, [x14], 16            // Load next 5 A
        FMLA v26.4s, v12.4s,  v7.s[0]
        FMLA v28.4s, v12.4s,  v9.s[0]
        FMLA v21.4s, v13.4s,  v1.s[0]
        LDR   q2, [x15], 16
        FMLA v23.4s, v13.4s,  v3.s[0]
        FMLA v25.4s, v13.4s,  v5.s[0]
        FMLA v27.4s, v13.4s,  v7.s[0]
        LDR   q4, [x20], 16
        FMLA v29.4s, v13.4s,  v9.s[0]

        FMLA v20.4s, v14.4s,  v1.s[1]
        FMLA v22.4s, v14.4s,  v3.s[1]
        LDR   q6, [x21], 16
        FMLA v24.4s, v14.4s,  v5.s[1]
        FMLA v26.4s, v14.4s,  v7.s[1]
        FMLA v28.4s, v14.4s,  v9.s[1]
        LDR   q8, [x8], 16
        FMLA v21.4s, v15.4s,  v1.s[1]
        FMLA v23.4s, v15.4s,  v3.s[1]
        FMLA v25.4s, v15.4s,  v5.s[1]
        LDP  q12, q13, [x5], 32        // Load next 3 B (not last)
        FMLA v27.4s, v15.4s,  v7.s[1]
        FMLA v29.4s, v15.4s,  v9.s[1]

        FMLA v20.4s, v16.4s,  v1.s[2]
        FMLA v22.4s, v16.4s,  v3.s[2]
        FMLA v24.4s, v16.4s,  v5.s[2]
        FMLA v26.4s, v16.4s,  v7.s[2]
        FMLA v28.4s, v16.4s,  v9.s[2]
        FMLA v21.4s, v17.4s,  v1.s[2]
        FMLA v23.4s, v17.4s,  v3.s[2]
        LDP  q14, q15, [x5], 32
        FMLA v25.4s, v17.4s,  v5.s[2]
        FMLA v27.4s, v17.4s,  v7.s[2]
        FMLA v29.4s, v17.4s,  v9.s[2]
        LDP  q16,  q17, [x5], 32

        FMLA v20.4s, v18.4s,  v1.s[3]
        FMLA v22.4s, v18.4s,  v3.s[3]
        SUBS x0, x0, 32
        FMLA v24.4s, v18.4s,  v5.s[3]
        FMLA v26.4s, v18.4s,  v7.s[3]
        FMLA v28.4s, v18.4s,  v9.s[3]
        FMLA v21.4s, v19.4s,  v1.s[3]
        FMLA v23.4s, v19.4s,  v3.s[3]
        FMLA v25.4s, v19.4s,  v5.s[3]
        FMLA v27.4s, v19.4s,  v7.s[3]
        FMLA v29.4s, v19.4s,  v9.s[3]
        B.HS 2b

        # Epilogue - 8 floats of A (32 bytes)
        # 80 FMA + 5 LDR A + 5 LDP B
        # First block same as main loop.  Second block has no preloads.
3:
        # First group of 4 A.  40 FMA.
        FMLA v20.4s, v12.4s,  v0.s[0]
        LDP  q18, q19, [x5], 32        // Load last B
        FMLA v22.4s, v12.4s,  v2.s[0]
        FMLA v24.4s, v12.4s,  v4.s[0]
        FMLA v26.4s, v12.4s,  v6.s[0]
        $if PREFETCH:
          PRFM PLDL1KEEP, [x5, 128]      // Prefetch B
        FMLA v28.4s, v12.4s,  v8.s[0]
        FMLA v21.4s, v13.4s,  v0.s[0]
        FMLA v23.4s, v13.4s,  v2.s[0]
        $if PREFETCH:
          PRFM PLDL1KEEP, [x5, 256]
        FMLA v25.4s, v13.4s,  v4.s[0]
        FMLA v27.4s, v13.4s,  v6.s[0]
        FMLA v29.4s, v13.4s,  v8.s[0]
        LDR   q1, [x14], 16            // Load next 5 A

        FMLA v20.4s, v14.4s,  v0.s[1]
        FMLA v22.4s, v14.4s,  v2.s[1]
        FMLA v24.4s, v14.4s,  v4.s[1]
        LDR   q3, [x15], 16
        FMLA v26.4s, v14.4s,  v6.s[1]
        FMLA v28.4s, v14.4s,  v8.s[1]
        FMLA v21.4s, v15.4s,  v0.s[1]
        LDR   q5, [x20], 16
        FMLA v23.4s, v15.4s,  v2.s[1]
        FMLA v25.4s, v15.4s,  v4.s[1]
        FMLA v27.4s, v15.4s,  v6.s[1]
        LDR   q7, [x21], 16
        FMLA v29.4s, v15.4s,  v8.s[1]

        FMLA v20.4s, v16.4s,  v0.s[2]
        FMLA v22.4s, v16.4s,  v2.s[2]
        LDR   q9, [x8], 16
        FMLA v24.4s, v16.4s,  v4.s[2]
        FMLA v26.4s, v16.4s,  v6.s[2]
        FMLA v28.4s, v16.4s,  v8.s[2]
        LDP  q12, q13, [x5], 32        // Load 4 B
        FMLA v21.4s, v17.4s,  v0.s[2]
        FMLA v23.4s, v17.4s,  v2.s[2]
        FMLA v25.4s, v17.4s,  v4.s[2]
        FMLA v27.4s, v17.4s,  v6.s[2]
        FMLA v29.4s, v17.4s,  v8.s[2]

        FMLA v20.4s, v18.4s,  v0.s[3]
        FMLA v22.4s, v18.4s,  v2.s[3]
        FMLA v24.4s, v18.4s,  v4.s[3]
        FMLA v26.4s, v18.4s,  v6.s[3]
        LDP  q14, q15, [x5], 32
        FMLA v28.4s, v18.4s,  v8.s[3]
        FMLA v21.4s, v19.4s,  v0.s[3]
        FMLA v23.4s, v19.4s,  v2.s[3]
        LDP  q16, q17, [x5], 32
        FMLA v25.4s, v19.4s,  v4.s[3]
        FMLA v27.4s, v19.4s,  v6.s[3]
        FMLA v29.4s, v19.4s,  v8.s[3]
        LDP  q18, q19, [x5], 32

        # Second group of 4 A.  40 FMA.
        FMLA v20.4s, v12.4s,  v1.s[0]
        FMLA v22.4s, v12.4s,  v3.s[0]
        FMLA v24.4s, v12.4s,  v5.s[0]
        FMLA v26.4s, v12.4s,  v7.s[0]
        FMLA v28.4s, v12.4s,  v9.s[0]
        FMLA v21.4s, v13.4s,  v1.s[0]
        FMLA v23.4s, v13.4s,  v3.s[0]
        FMLA v25.4s, v13.4s,  v5.s[0]
        FMLA v27.4s, v13.4s,  v7.s[0]
        FMLA v29.4s, v13.4s,  v9.s[0]

        FMLA v20.4s, v14.4s,  v1.s[1]
        FMLA v22.4s, v14.4s,  v3.s[1]
        FMLA v24.4s, v14.4s,  v5.s[1]
        FMLA v26.4s, v14.4s,  v7.s[1]
        FMLA v28.4s, v14.4s,  v9.s[1]
        FMLA v21.4s, v15.4s,  v1.s[1]
        FMLA v23.4s, v15.4s,  v3.s[1]
        FMLA v25.4s, v15.4s,  v5.s[1]
        FMLA v27.4s, v15.4s,  v7.s[1]
        FMLA v29.4s, v15.4s,  v9.s[1]

        FMLA v20.4s, v16.4s,  v1.s[2]
        FMLA v22.4s, v16.4s,  v3.s[2]
        FMLA v24.4s, v16.4s,  v5.s[2]
        FMLA v26.4s, v16.4s,  v7.s[2]
        FMLA v28.4s, v16.4s,  v9.s[2]
        FMLA v21.4s, v17.4s,  v1.s[2]
        FMLA v23.4s, v17.4s,  v3.s[2]
        FMLA v25.4s, v17.4s,  v5.s[2]
        FMLA v27.4s, v17.4s,  v7.s[2]
        FMLA v29.4s, v17.4s,  v9.s[2]

        FMLA v20.4s, v18.4s,  v1.s[3]
        FMLA v22.4s, v18.4s,  v3.s[3]
        FMLA v24.4s, v18.4s,  v5.s[3]
        FMLA v26.4s, v18.4s,  v7.s[3]
        FMLA v28.4s, v18.4s,  v9.s[3]
        FMLA v21.4s, v19.4s,  v1.s[3]
        FMLA v23.4s, v19.4s,  v3.s[3]
        FMLA v25.4s, v19.4s,  v5.s[3]
        FMLA v27.4s, v19.4s,  v7.s[3]
        FMLA v29.4s, v19.4s,  v9.s[3]
        # Is there a remainder?- 4 floats of A (16 bytes) or less
        # x0 has gone negative here, its low 5 bits still equal kc modulo 32
        TST x0, 31
        B.NE 5f

4:
        # ks loop
        SUBS x9, x9, 40  // ks -= MR * sizeof(void*)
        // HI matches NE for any ks that is a non-zero multiple of 40, and
        // additionally stops the loop for any other ks value.
        B.HI 1b

        # Clamp
        FMIN v20.4s, v20.4s, v30.4s
        FMIN v21.4s, v21.4s, v30.4s
        FMIN v22.4s, v22.4s, v30.4s
        FMIN v23.4s, v23.4s, v30.4s
        FMIN v24.4s, v24.4s, v30.4s
        FMIN v25.4s, v25.4s, v30.4s
        FMIN v26.4s, v26.4s, v30.4s
        FMIN v27.4s, v27.4s, v30.4s
        FMIN v28.4s, v28.4s, v30.4s
        FMIN v29.4s, v29.4s, v30.4s
        FMAX v20.4s, v20.4s, v31.4s
        FMAX v21.4s, v21.4s, v31.4s
        FMAX v22.4s, v22.4s, v31.4s
        FMAX v23.4s, v23.4s, v31.4s
        FMAX v24.4s, v24.4s, v31.4s
        FMAX v25.4s, v25.4s, v31.4s
        FMAX v26.4s, v26.4s, v31.4s
        FMAX v27.4s, v27.4s, v31.4s
        FMAX v28.4s, v28.4s, v31.4s
        FMAX v29.4s, v29.4s, v31.4s

        # Store full 5 x 8
        SUBS x1, x1, 8
        B.LO 8f

        STP q28, q29, [x7]
        ADD x7, x7, x10
        STP q26, q27, [x13]
        ADD x13, x13, x10
        STP q24, q25, [x17]
        ADD x17, x17, x10
        STP q22, q23, [x16]
        ADD x16, x16, x10
        STP q20, q21,  [x6]
        ADD  x6,  x6, x10

        SUB x4, x4, x3  // a -= ks

        # nc loop
        # Flags are still those of SUBS x1 above, nothing in between sets them
        B.HI 0b

        # Restore x20,x21 from stack
        LDP x20, x21, [sp, 48]

        # Restore d8-d9, d12-d15 from stack
        LDP d14, d15, [sp, 32]
        LDP d12, d13, [sp, 16]
        LDP  d8,  d9, [sp], 64
        RET

5:
        # Is there a remainder?- 4 floats of A (16 bytes)
        TBZ x0, 4, 6f

        # Remainder- 4 floats of A (16 bytes)
        # Load A
        LDR   q0, [x14], 16
        LDR   q2, [x15], 16
        LDR   q4, [x20], 16
        LDR   q6, [x21], 16
        LDR   q8, [x8], 16
        # Load B
        LDP  q12, q13, [x5], 32
        LDP  q14, q15, [x5], 32
        LDP  q16, q17, [x5], 32
        LDP  q18, q19, [x5], 32

        FMLA v20.4s, v12.4s,  v0.s[0]
        FMLA v22.4s, v12.4s,  v2.s[0]
        FMLA v24.4s, v12.4s,  v4.s[0]
        FMLA v26.4s, v12.4s,  v6.s[0]
        FMLA v28.4s, v12.4s,  v8.s[0]
        FMLA v21.4s, v13.4s,  v0.s[0]
        FMLA v23.4s, v13.4s,  v2.s[0]
        FMLA v25.4s, v13.4s,  v4.s[0]
        FMLA v27.4s, v13.4s,  v6.s[0]
        FMLA v29.4s, v13.4s,  v8.s[0]

        FMLA v20.4s, v14.4s,  v0.s[1]
        FMLA v22.4s, v14.4s,  v2.s[1]
        FMLA v24.4s, v14.4s,  v4.s[1]
        FMLA v26.4s, v14.4s,  v6.s[1]
        FMLA v28.4s, v14.4s,  v8.s[1]
        FMLA v21.4s, v15.4s,  v0.s[1]
        FMLA v23.4s, v15.4s,  v2.s[1]
        FMLA v25.4s, v15.4s,  v4.s[1]
        FMLA v27.4s, v15.4s,  v6.s[1]
        FMLA v29.4s, v15.4s,  v8.s[1]

        FMLA v20.4s, v16.4s,  v0.s[2]
        FMLA v22.4s, v16.4s,  v2.s[2]
        FMLA v24.4s, v16.4s,  v4.s[2]
        FMLA v26.4s, v16.4s,  v6.s[2]
        FMLA v28.4s, v16.4s,  v8.s[2]
        FMLA v21.4s, v17.4s,  v0.s[2]
        FMLA v23.4s, v17.4s,  v2.s[2]
        FMLA v25.4s, v17.4s,  v4.s[2]
        FMLA v27.4s, v17.4s,  v6.s[2]
        FMLA v29.4s, v17.4s,  v8.s[2]

        FMLA v20.4s, v18.4s,  v0.s[3]
        FMLA v22.4s, v18.4s,  v2.s[3]
        FMLA v24.4s, v18.4s,  v4.s[3]
        FMLA v26.4s, v18.4s,  v6.s[3]
        FMLA v28.4s, v18.4s,  v8.s[3]
        FMLA v21.4s, v19.4s,  v0.s[3]
        FMLA v23.4s, v19.4s,  v2.s[3]
        FMLA v25.4s, v19.4s,  v4.s[3]
        FMLA v27.4s, v19.4s,  v6.s[3]
        FMLA v29.4s, v19.4s,  v8.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes)
6:
        TBZ x0, 3, 7f

        # Remainder- 2 floats of A (8 bytes)
        # Load A
        LDR   d0, [x14], 8
        LDR   d2, [x15], 8
        LDR   d4, [x20], 8
        LDR   d6, [x21], 8
        LDR   d8, [x8], 8
        # Load B
        LDP  q12, q13, [x5], 32
        LDP  q14, q15, [x5], 32

        FMLA v20.4s, v12.4s,  v0.s[0]
        FMLA v22.4s, v12.4s,  v2.s[0]
        FMLA v24.4s, v12.4s,  v4.s[0]
        FMLA v26.4s, v12.4s,  v6.s[0]
        FMLA v28.4s, v12.4s,  v8.s[0]
        FMLA v21.4s, v13.4s,  v0.s[0]
        FMLA v23.4s, v13.4s,  v2.s[0]
        FMLA v25.4s, v13.4s,  v4.s[0]
        FMLA v27.4s, v13.4s,  v6.s[0]
        FMLA v29.4s, v13.4s,  v8.s[0]

        FMLA v20.4s, v14.4s,  v0.s[1]
        FMLA v22.4s, v14.4s,  v2.s[1]
        FMLA v24.4s, v14.4s,  v4.s[1]
        FMLA v26.4s, v14.4s,  v6.s[1]
        FMLA v28.4s, v14.4s,  v8.s[1]
        FMLA v21.4s, v15.4s,  v0.s[1]
        FMLA v23.4s, v15.4s,  v2.s[1]
        FMLA v25.4s, v15.4s,  v4.s[1]
        FMLA v27.4s, v15.4s,  v6.s[1]
        FMLA v29.4s, v15.4s,  v8.s[1]

        # Is there a remainder?- 1 float of A (4 bytes)
7:
        TBZ x0, 2, 4b

        # Remainder- 1 float of A (4 bytes)
        # Load A
        LDR   s0, [x14], 4
        LDR   s2, [x15], 4
        LDR   s4, [x20], 4
        LDR   s6, [x21], 4
        LDR   s8, [x8], 4
        # Load B
        LDP  q12, q13, [x5], 32

        FMLA v20.4s, v12.4s,  v0.s[0]
        FMLA v22.4s, v12.4s,  v2.s[0]
        FMLA v24.4s, v12.4s,  v4.s[0]
        FMLA v26.4s, v12.4s,  v6.s[0]
        FMLA v28.4s, v12.4s,  v8.s[0]
        FMLA v21.4s, v13.4s,  v0.s[0]
        FMLA v23.4s, v13.4s,  v2.s[0]
        FMLA v25.4s, v13.4s,  v4.s[0]
        FMLA v27.4s, v13.4s,  v6.s[0]
        FMLA v29.4s, v13.4s,  v8.s[0]
        B 4b

        # Store odd width
        # x1 is nc - 8 here, its low 3 bits give the number of columns left.
        # After each partial store the unstored upper part moves to the low lanes.
8:
        TBZ x1, 2, 9f
        STR q28, [x7], 16
        MOV v28.16b, v29.16b
        STR q26, [x13], 16
        MOV v26.16b, v27.16b
        STR q24, [x17], 16
        MOV v24.16b, v25.16b
        STR q22, [x16], 16
        MOV v22.16b, v23.16b
        STR q20,  [x6], 16
        MOV v20.16b, v21.16b
9:
        TBZ x1, 1, 10f
        STR d28, [x7], 8
        DUP d28, v28.d[1]
        STR d26, [x13], 8
        DUP d26, v26.d[1]
        STR d24, [x17], 8
        DUP d24, v24.d[1]
        STR d22, [x16], 8
        DUP d22, v22.d[1]
        STR d20,  [x6], 8
        DUP d20, v20.d[1]

10:
        TBZ x1, 0, 11f
        STR s28, [x7]
        STR s26, [x13]
        STR s24, [x17]
        STR s22, [x16]
        STR s20,  [x6]
11:
        # Restore x20,x21 from stack
        LDP x20, x21, [sp, 48]

        # Restore d8-d9, d12-d15 from stack
        LDP d14, d15, [sp, 32]
        LDP d12, d13, [sp, 16]
        LDP  d8,  d9, [sp], 64
        RET

END_FUNCTION xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_${"a75" if PREFETCH else "a57"}
607
// On ELF targets emit an empty .note.GNU-stack section, the GNU toolchain
// convention for marking an object as not needing an executable stack.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif
611