// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# F32 GEMM micro-kernel template: 5 rows x 8 columns, AArch64 NEON FMA, with
# min/max clamping of the outputs.
#
# Template switches visible in this file:
#   INC      - initial accumulators are read from acc instead of the bias
#              stored at the start of each packed block of w.
#   PREFETCH - emits PRFM hints for A and B and selects the cortex_a75 name;
#              otherwise the cortex_a57 name is used.
#
# For every group of 8 output columns the kernel accumulates A * B with FMLA,
# clamps with FMAX against v30 and FMIN against v31 (the two floats loaded
# from params), then stores up to 8 floats to each of the 5 C rows.
# Rows beyond mr alias the previous row pointer, so they recompute and
# rewrite the same data instead of touching memory past the last valid row.

# void xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_5x8__aarch64_neonfma_cortex_${"a75" if PREFETCH else "a57"}(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x14
$if INC:
  #     const float*restrict acc,  [sp + 8] -> x15
  #     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 16] -> x8
$else:
  #     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> x8

# unused compared to 6x8
#  x4 a5
#  x7 c5
# A5  v10 v11
# C   v30 v31

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
# This kernel only touches d8, d9 and d12-d15 of the preserved set.

# A pointers
#  x3 a0
#  x9 a1
# x10 a2
# x11 a3
# x12 a4

# C pointers
#  x6 c0
# x16 c1
# x17 c2
# x13 c3
#  x7 c4

# Vector register usage
# A0   v0  v1
# A1   v2  v3
# A2   v4  v5
# A3   v6  v7
# A4   v8  v9
# B   v12 v13 v14 v15
# B   v16 v17 v18 v19
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# Clamp v30 v31

BEGIN_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_5x8__aarch64_neonfma_cortex_${"a75" if PREFETCH else "a57"}

        # Stack arguments are read before sp is moved by the spill below.
        $if INC:
          # Load cn_stride, acc
          LDP x14, x15, [sp]
          # Load params pointer
          LDR x8, [sp, 16]
        $else:
          # Load cn_stride, params pointer
          LDP x14, x8, [sp]

        # Clamp A and C pointers / Save d8-d9, d12-d15 on stack
        # The spills are interleaved with the pointer setup; STP and ADD
        # leave the flags alone, so each CMP feeds both CSEL pairs after it.
        STP  d8,  d9, [sp, -48]!
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        STP d12, d13, [sp, 16]
        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        STP d14, d15, [sp, 32]
        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x13, x17, x7         // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x13, x17, x13, LO   //   c3 = c2

        ADD x12, x11, x4         // a4 = a3 + a_stride
        ADD x7, x13, x7          // c4 = c3 + cm_stride (x7 no longer holds cm_stride)
                                 // if mr <= 4
        CSEL x12, x11, x12, LS   //   a4 = a3
        CSEL x7, x13, x7, LS     //   c4 = c3

        # Load clamp values
        # v30 = first float at params (lower bound for FMAX),
        # v31 = second float at params (upper bound for FMIN).
        LD2R {v30.4s, v31.4s}, [x8]

        # Top of the loop over groups of 8 output columns.
0:
        $if INC:
          # Load initial accumulators
          LDP q20, q21, [x15], 32
          LDP q22, q23, [x15], 32
          LDP q24, q25, [x15], 32
          LDP q26, q27, [x15], 32
          LDP q28, q29, [x15], 32
          $if PREFETCH:
            PRFM PLDL1KEEP, [x5, 0]  // Prefetch B
            PRFM PLDL1KEEP, [x5, 64]
            PRFM PLDL1KEEP, [x5, 128]
            PRFM PLDL1KEEP, [x5, 192]
            PRFM PLDL1KEEP,  [x3]    // Prefetch A
            PRFM PLDL1KEEP,  [x9]
            PRFM PLDL1KEEP, [x10]
            PRFM PLDL1KEEP, [x11]
            PRFM PLDL1KEEP, [x12]
        $else:
          # Load initial bias from w into accumulators
          LDP q20, q21, [x5], 32
          MOV v22.16b, v20.16b
          $if PREFETCH:
            PRFM PLDL1KEEP, [x5, 0]  // Prefetch B
          MOV v23.16b, v21.16b
          $if PREFETCH:
            PRFM PLDL1KEEP, [x5, 64]
          MOV v24.16b, v20.16b
          $if PREFETCH:
            PRFM PLDL1KEEP, [x5, 128]
          MOV v25.16b, v21.16b
          $if PREFETCH:
            PRFM PLDL1KEEP, [x5, 192]
          MOV v26.16b, v20.16b
          $if PREFETCH:
            PRFM PLDL1KEEP,  [x3]    // Prefetch A
          MOV v27.16b, v21.16b
          $if PREFETCH:
            PRFM PLDL1KEEP,  [x9]
          MOV v28.16b, v20.16b
          $if PREFETCH:
            PRFM PLDL1KEEP, [x10]
          MOV v29.16b, v21.16b
          $if PREFETCH:
            PRFM PLDL1KEEP, [x11]
            PRFM PLDL1KEEP, [x12]

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        # x0 is reused as the byte counter k from here on (mr is no longer needed).
        SUBS x0, x2, 32  // k = kc - 32
        B.LO 4f

        # Prologue - loads for main loop of 80 FMA
        LDR   q0,  [x3], 16
        LDR   q2,  [x9], 16
        LDR   q4, [x10], 16
        LDR   q6, [x11], 16
        LDR   q8, [x12], 16
        LDP  q12,  q13, [x5], 32  // Fetch 3 B (4th deferred)
        LDP  q14,  q15, [x5], 32
        LDP  q16,  q17, [x5], 32

        # Is there at least 8 floats (32 bytes) for main loop?
        SUBS x0, x0, 32
        B.LO 2f

        # Main loop - 8 floats of A (32 bytes)
        # 80 FMA + 10 LDR A + 8 LDP B
1:
        # First group of 4 A.  40 FMA.
        FMLA v20.4s, v12.4s,  v0.s[0]
        LDP  q18,  q19, [x5], 32      // Load last B
        FMLA v22.4s, v12.4s,  v2.s[0]
        FMLA v24.4s, v12.4s,  v4.s[0]
        FMLA v26.4s, v12.4s,  v6.s[0]
        $if PREFETCH:
          PRFM PLDL1KEEP, [x5, 128]      // Prefetch B
        FMLA v28.4s, v12.4s,  v8.s[0]
        FMLA v21.4s, v13.4s,  v0.s[0]
        FMLA v23.4s, v13.4s,  v2.s[0]
        $if PREFETCH:
          PRFM PLDL1KEEP, [x5, 256]
        FMLA v25.4s, v13.4s,  v4.s[0]
        FMLA v27.4s, v13.4s,  v6.s[0]
        FMLA v29.4s, v13.4s,  v8.s[0]
        LDR   q1,  [x3], 16            // Load next 5 A

        FMLA v20.4s, v14.4s,  v0.s[1]
        FMLA v22.4s, v14.4s,  v2.s[1]
        FMLA v24.4s, v14.4s,  v4.s[1]
        LDR   q3,  [x9], 16
        FMLA v26.4s, v14.4s,  v6.s[1]
        FMLA v28.4s, v14.4s,  v8.s[1]
        FMLA v21.4s, v15.4s,  v0.s[1]
        LDR   q5, [x10], 16
        FMLA v23.4s, v15.4s,  v2.s[1]
        FMLA v25.4s, v15.4s,  v4.s[1]
        FMLA v27.4s, v15.4s,  v6.s[1]
        LDR   q7, [x11], 16
        FMLA v29.4s, v15.4s,  v8.s[1]

        FMLA v20.4s, v16.4s,  v0.s[2]
        FMLA v22.4s, v16.4s,  v2.s[2]
        LDR   q9, [x12], 16
        FMLA v24.4s, v16.4s,  v4.s[2]
        FMLA v26.4s, v16.4s,  v6.s[2]
        FMLA v28.4s, v16.4s,  v8.s[2]
        LDP  q12,  q13, [x5], 32       // Load 4 B
        FMLA v21.4s, v17.4s,  v0.s[2]
        FMLA v23.4s, v17.4s,  v2.s[2]
        FMLA v25.4s, v17.4s,  v4.s[2]
        LDP  q14,  q15, [x5], 32
        FMLA v27.4s, v17.4s,  v6.s[2]
        FMLA v29.4s, v17.4s,  v8.s[2]

        FMLA v20.4s, v18.4s,  v0.s[3]
        LDP  q16,  q17, [x5], 32
        FMLA v22.4s, v18.4s,  v2.s[3]
        FMLA v24.4s, v18.4s,  v4.s[3]
        FMLA v26.4s, v18.4s,  v6.s[3]
        FMLA v28.4s, v18.4s,  v8.s[3]
        FMLA v21.4s, v19.4s,  v0.s[3]
        FMLA v23.4s, v19.4s,  v2.s[3]
        FMLA v25.4s, v19.4s,  v4.s[3]
        FMLA v27.4s, v19.4s,  v6.s[3]
        FMLA v29.4s, v19.4s,  v8.s[3]
        LDP  q18,  q19, [x5], 32

        # Second group of 4 A.  40 FMA.
        FMLA v20.4s, v12.4s,  v1.s[0]
        FMLA v22.4s, v12.4s,  v3.s[0]
        FMLA v24.4s, v12.4s,  v5.s[0]
        LDR   q0,  [x3], 16           // Load next 5 A
        FMLA v26.4s, v12.4s,  v7.s[0]
        FMLA v28.4s, v12.4s,  v9.s[0]
        FMLA v21.4s, v13.4s,  v1.s[0]
        LDR   q2,  [x9], 16
        FMLA v23.4s, v13.4s,  v3.s[0]
        FMLA v25.4s, v13.4s,  v5.s[0]
        FMLA v27.4s, v13.4s,  v7.s[0]
        LDR   q4, [x10], 16
        FMLA v29.4s, v13.4s,  v9.s[0]

        FMLA v20.4s, v14.4s,  v1.s[1]
        FMLA v22.4s, v14.4s,  v3.s[1]
        LDR   q6, [x11], 16
        FMLA v24.4s, v14.4s,  v5.s[1]
        FMLA v26.4s, v14.4s,  v7.s[1]
        FMLA v28.4s, v14.4s,  v9.s[1]
        LDR   q8, [x12], 16
        FMLA v21.4s, v15.4s,  v1.s[1]
        FMLA v23.4s, v15.4s,  v3.s[1]
        FMLA v25.4s, v15.4s,  v5.s[1]
        LDP  q12,  q13, [x5], 32       // Load next 3 B (not last)
        FMLA v27.4s, v15.4s,  v7.s[1]
        FMLA v29.4s, v15.4s,  v9.s[1]

        FMLA v20.4s, v16.4s,  v1.s[2]
        LDP  q14,  q15, [x5], 32
        FMLA v22.4s, v16.4s,  v3.s[2]
        FMLA v24.4s, v16.4s,  v5.s[2]
        FMLA v26.4s, v16.4s,  v7.s[2]
        FMLA v28.4s, v16.4s,  v9.s[2]
        FMLA v21.4s, v17.4s,  v1.s[2]
        FMLA v23.4s, v17.4s,  v3.s[2]
        FMLA v25.4s, v17.4s,  v5.s[2]
        FMLA v27.4s, v17.4s,  v7.s[2]
        FMLA v29.4s, v17.4s,  v9.s[2]
        LDP  q16,  q17, [x5], 32

        FMLA v20.4s, v18.4s,  v1.s[3]
        FMLA v22.4s, v18.4s,  v3.s[3]
        SUBS x0, x0, 32               // k -= 32; FMLA leaves the flags for B.HS
        FMLA v24.4s, v18.4s,  v5.s[3]
        FMLA v26.4s, v18.4s,  v7.s[3]
        FMLA v28.4s, v18.4s,  v9.s[3]
        FMLA v21.4s, v19.4s,  v1.s[3]
        FMLA v23.4s, v19.4s,  v3.s[3]
        FMLA v25.4s, v19.4s,  v5.s[3]
        FMLA v27.4s, v19.4s,  v7.s[3]
        FMLA v29.4s, v19.4s,  v9.s[3]
        B.HS 1b

        # Epilogue - 8 floats of A (32 bytes)
        # 80 FMA + 5 LDR A + 5 LDP B
        # First block same as main loop.  Second block has no preloads.
2:
        # First group of 4 A.  40 FMA.
        FMLA v20.4s, v12.4s,  v0.s[0]
        LDP  q18,  q19, [x5], 32      // Load last B
        FMLA v22.4s, v12.4s,  v2.s[0]
        FMLA v24.4s, v12.4s,  v4.s[0]
        FMLA v26.4s, v12.4s,  v6.s[0]
        $if PREFETCH:
          PRFM PLDL1KEEP, [x5, 128]      // Prefetch B
        FMLA v28.4s, v12.4s,  v8.s[0]
        FMLA v21.4s, v13.4s,  v0.s[0]
        FMLA v23.4s, v13.4s,  v2.s[0]
        $if PREFETCH:
          PRFM PLDL1KEEP, [x5, 256]
        FMLA v25.4s, v13.4s,  v4.s[0]
        FMLA v27.4s, v13.4s,  v6.s[0]
        FMLA v29.4s, v13.4s,  v8.s[0]
        LDR   q1,  [x3], 16            // Load next 5 A

        FMLA v20.4s, v14.4s,  v0.s[1]
        FMLA v22.4s, v14.4s,  v2.s[1]
        FMLA v24.4s, v14.4s,  v4.s[1]
        LDR   q3,  [x9], 16
        FMLA v26.4s, v14.4s,  v6.s[1]
        FMLA v28.4s, v14.4s,  v8.s[1]
        FMLA v21.4s, v15.4s,  v0.s[1]
        LDR   q5, [x10], 16
        FMLA v23.4s, v15.4s,  v2.s[1]
        FMLA v25.4s, v15.4s,  v4.s[1]
        FMLA v27.4s, v15.4s,  v6.s[1]
        LDR   q7, [x11], 16
        FMLA v29.4s, v15.4s,  v8.s[1]

        FMLA v20.4s, v16.4s,  v0.s[2]
        FMLA v22.4s, v16.4s,  v2.s[2]
        LDR   q9, [x12], 16
        FMLA v24.4s, v16.4s,  v4.s[2]
        FMLA v26.4s, v16.4s,  v6.s[2]
        FMLA v28.4s, v16.4s,  v8.s[2]
        LDP  q12,  q13, [x5], 32       // Load 4 B
        FMLA v21.4s, v17.4s,  v0.s[2]
        FMLA v23.4s, v17.4s,  v2.s[2]
        FMLA v25.4s, v17.4s,  v4.s[2]
        LDP  q14,  q15, [x5], 32
        FMLA v27.4s, v17.4s,  v6.s[2]
        FMLA v29.4s, v17.4s,  v8.s[2]

        FMLA v20.4s, v18.4s,  v0.s[3]
        LDP  q16,  q17, [x5], 32
        FMLA v22.4s, v18.4s,  v2.s[3]
        FMLA v24.4s, v18.4s,  v4.s[3]
        FMLA v26.4s, v18.4s,  v6.s[3]
        FMLA v28.4s, v18.4s,  v8.s[3]
        FMLA v21.4s, v19.4s,  v0.s[3]
        FMLA v23.4s, v19.4s,  v2.s[3]
        FMLA v25.4s, v19.4s,  v4.s[3]
        FMLA v27.4s, v19.4s,  v6.s[3]
        FMLA v29.4s, v19.4s,  v8.s[3]
        LDP  q18,  q19, [x5], 32

        # Second group of 4 A.  40 FMA.
        FMLA v20.4s, v12.4s,  v1.s[0]
        FMLA v22.4s, v12.4s,  v3.s[0]
        FMLA v24.4s, v12.4s,  v5.s[0]
        FMLA v26.4s, v12.4s,  v7.s[0]
        FMLA v28.4s, v12.4s,  v9.s[0]
        FMLA v21.4s, v13.4s,  v1.s[0]
        FMLA v23.4s, v13.4s,  v3.s[0]
        FMLA v25.4s, v13.4s,  v5.s[0]
        FMLA v27.4s, v13.4s,  v7.s[0]
        FMLA v29.4s, v13.4s,  v9.s[0]

        FMLA v20.4s, v14.4s,  v1.s[1]
        FMLA v22.4s, v14.4s,  v3.s[1]
        FMLA v24.4s, v14.4s,  v5.s[1]
        FMLA v26.4s, v14.4s,  v7.s[1]
        FMLA v28.4s, v14.4s,  v9.s[1]
        FMLA v21.4s, v15.4s,  v1.s[1]
        FMLA v23.4s, v15.4s,  v3.s[1]
        FMLA v25.4s, v15.4s,  v5.s[1]
        FMLA v27.4s, v15.4s,  v7.s[1]
        FMLA v29.4s, v15.4s,  v9.s[1]

        FMLA v20.4s, v16.4s,  v1.s[2]
        FMLA v22.4s, v16.4s,  v3.s[2]
        FMLA v24.4s, v16.4s,  v5.s[2]
        FMLA v26.4s, v16.4s,  v7.s[2]
        FMLA v28.4s, v16.4s,  v9.s[2]
        FMLA v21.4s, v17.4s,  v1.s[2]
        FMLA v23.4s, v17.4s,  v3.s[2]
        FMLA v25.4s, v17.4s,  v5.s[2]
        FMLA v27.4s, v17.4s,  v7.s[2]
        FMLA v29.4s, v17.4s,  v9.s[2]
        TST x0, 31                    // low 5 bits of k match kc: leftover bytes below 32?

        FMLA v20.4s, v18.4s,  v1.s[3]
        FMLA v22.4s, v18.4s,  v3.s[3]
        FMLA v24.4s, v18.4s,  v5.s[3]
        FMLA v26.4s, v18.4s,  v7.s[3]
        FMLA v28.4s, v18.4s,  v9.s[3]
        FMLA v21.4s, v19.4s,  v1.s[3]
        FMLA v23.4s, v19.4s,  v3.s[3]
        FMLA v25.4s, v19.4s,  v5.s[3]
        FMLA v27.4s, v19.4s,  v7.s[3]
        FMLA v29.4s, v19.4s,  v9.s[3]
        B.NE 4f

        # Clamp
3:
        FMAX v20.4s, v20.4s, v30.4s
        SUBS x1, x1, 8                // nc -= 8; flags feed B.LO 7f and B.HI 0b below
        FMAX v21.4s, v21.4s, v30.4s
        FMAX v22.4s, v22.4s, v30.4s
        FMAX v23.4s, v23.4s, v30.4s
        FMAX v24.4s, v24.4s, v30.4s
        FMAX v25.4s, v25.4s, v30.4s
        FMAX v26.4s, v26.4s, v30.4s
        FMAX v27.4s, v27.4s, v30.4s
        FMAX v28.4s, v28.4s, v30.4s
        FMAX v29.4s, v29.4s, v30.4s
        FMIN v20.4s, v20.4s, v31.4s
        FMIN v21.4s, v21.4s, v31.4s
        FMIN v22.4s, v22.4s, v31.4s
        FMIN v23.4s, v23.4s, v31.4s
        FMIN v24.4s, v24.4s, v31.4s
        FMIN v25.4s, v25.4s, v31.4s
        FMIN v26.4s, v26.4s, v31.4s
        FMIN v27.4s, v27.4s, v31.4s
        FMIN v28.4s, v28.4s, v31.4s
        FMIN v29.4s, v29.4s, v31.4s

        # Store full 5 x 8
        # Fewer than 8 columns left: take the partial-width store path instead.
        B.LO 7f

        # Each row: store 8 floats, step the C pointer by cn_stride and rewind
        # the A pointer by kc so the next column group rereads the same A rows.
        $if INC:
          SUB  x3,  x3, x2 // a0 -= kc
          STP q28, q29, [x7]
          ADD x7, x7, x14
          SUB  x9,  x9, x2 // a1 -= kc
          STP q26, q27, [x13]
          ADD x13, x13, x14
          SUB x10, x10, x2 // a2 -= kc
          STP q24, q25, [x17]
          ADD x17, x17, x14
          SUB x11, x11, x2 // a3 -= kc
          STP q22, q23, [x16]
          ADD x16, x16, x14
          SUB x12, x12, x2 // a4 -= kc
          STP q20, q21,  [x6]
          ADD  x6,  x6, x14
        $else:
          STP q20, q21,  [x6]
          ADD  x6,  x6, x14
          SUB  x3,  x3, x2 // a0 -= kc
          STP q22, q23, [x16]
          ADD x16, x16, x14
          SUB  x9,  x9, x2 // a1 -= kc
          STP q24, q25, [x17]
          ADD x17, x17, x14
          SUB x10, x10, x2 // a2 -= kc
          STP q26, q27, [x13]
          ADD x13, x13, x14
          SUB x11, x11, x2 // a3 -= kc
          STP q28, q29, [x7]
          ADD x7, x7, x14
          SUB x12, x12, x2 // a4 -= kc

        B.HI 0b                       // more columns remain (nc was > 8)

        # Restore d8-d9, d12-d15 from stack
        LDP d14, d15, [sp, 32]
        LDP d12, d13, [sp, 16]
        LDP  d8,  d9, [sp], 48
        RET

        # Remainder of kc: bits 4, 3 and 2 of k select 16, 8 and 4 byte steps
4:
        # Is there a remainder?- 4 floats of A (16 bytes)
        TBZ x0, 4, 5f

        # Remainder- 4 floats of A (16 bytes)
        # Load A
        LDR   q0,  [x3], 16
        LDR   q2,  [x9], 16
        LDR   q4, [x10], 16
        LDR   q6, [x11], 16
        LDR   q8, [x12], 16
        # Load B
        LDP  q12,  q13, [x5], 32
        LDP  q14,  q15, [x5], 32
        LDP  q16,  q17, [x5], 32
        LDP  q18,  q19, [x5], 32

        FMLA v20.4s, v12.4s,  v0.s[0]
        FMLA v22.4s, v12.4s,  v2.s[0]
        FMLA v24.4s, v12.4s,  v4.s[0]
        FMLA v26.4s, v12.4s,  v6.s[0]
        FMLA v28.4s, v12.4s,  v8.s[0]
        FMLA v21.4s, v13.4s,  v0.s[0]
        FMLA v23.4s, v13.4s,  v2.s[0]
        FMLA v25.4s, v13.4s,  v4.s[0]
        FMLA v27.4s, v13.4s,  v6.s[0]
        FMLA v29.4s, v13.4s,  v8.s[0]

        FMLA v20.4s, v14.4s,  v0.s[1]
        FMLA v22.4s, v14.4s,  v2.s[1]
        FMLA v24.4s, v14.4s,  v4.s[1]
        FMLA v26.4s, v14.4s,  v6.s[1]
        FMLA v28.4s, v14.4s,  v8.s[1]
        FMLA v21.4s, v15.4s,  v0.s[1]
        FMLA v23.4s, v15.4s,  v2.s[1]
        FMLA v25.4s, v15.4s,  v4.s[1]
        FMLA v27.4s, v15.4s,  v6.s[1]
        FMLA v29.4s, v15.4s,  v8.s[1]

        FMLA v20.4s, v16.4s,  v0.s[2]
        FMLA v22.4s, v16.4s,  v2.s[2]
        FMLA v24.4s, v16.4s,  v4.s[2]
        FMLA v26.4s, v16.4s,  v6.s[2]
        FMLA v28.4s, v16.4s,  v8.s[2]
        FMLA v21.4s, v17.4s,  v0.s[2]
        FMLA v23.4s, v17.4s,  v2.s[2]
        FMLA v25.4s, v17.4s,  v4.s[2]
        FMLA v27.4s, v17.4s,  v6.s[2]
        FMLA v29.4s, v17.4s,  v8.s[2]

        FMLA v20.4s, v18.4s,  v0.s[3]
        FMLA v22.4s, v18.4s,  v2.s[3]
        FMLA v24.4s, v18.4s,  v4.s[3]
        FMLA v26.4s, v18.4s,  v6.s[3]
        FMLA v28.4s, v18.4s,  v8.s[3]
        FMLA v21.4s, v19.4s,  v0.s[3]
        FMLA v23.4s, v19.4s,  v2.s[3]
        FMLA v25.4s, v19.4s,  v4.s[3]
        FMLA v27.4s, v19.4s,  v6.s[3]
        FMLA v29.4s, v19.4s,  v8.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes)
5:
        TBZ x0, 3, 6f

        # Remainder- 2 floats of A (8 bytes)
        # Load A
        LDR   d0,  [x3], 8
        LDR   d2,  [x9], 8
        LDR   d4, [x10], 8
        LDR   d6, [x11], 8
        LDR   d8, [x12], 8
        # Load B
        LDP  q12,  q13, [x5], 32
        LDP  q14,  q15, [x5], 32

        FMLA v20.4s, v12.4s,  v0.s[0]
        FMLA v22.4s, v12.4s,  v2.s[0]
        FMLA v24.4s, v12.4s,  v4.s[0]
        FMLA v26.4s, v12.4s,  v6.s[0]
        FMLA v28.4s, v12.4s,  v8.s[0]
        FMLA v21.4s, v13.4s,  v0.s[0]
        FMLA v23.4s, v13.4s,  v2.s[0]
        FMLA v25.4s, v13.4s,  v4.s[0]
        FMLA v27.4s, v13.4s,  v6.s[0]
        FMLA v29.4s, v13.4s,  v8.s[0]

        FMLA v20.4s, v14.4s,  v0.s[1]
        FMLA v22.4s, v14.4s,  v2.s[1]
        FMLA v24.4s, v14.4s,  v4.s[1]
        FMLA v26.4s, v14.4s,  v6.s[1]
        FMLA v28.4s, v14.4s,  v8.s[1]
        FMLA v21.4s, v15.4s,  v0.s[1]
        FMLA v23.4s, v15.4s,  v2.s[1]
        FMLA v25.4s, v15.4s,  v4.s[1]
        FMLA v27.4s, v15.4s,  v6.s[1]
        FMLA v29.4s, v15.4s,  v8.s[1]

        # Is there a remainder?- 1 float of A (4 bytes)
6:
        TBZ x0, 2, 3b

        # Remainder- 1 float of A (4 bytes)
        # Load A
        LDR   s0,  [x3], 4
        LDR   s2,  [x9], 4
        LDR   s4, [x10], 4
        LDR   s6, [x11], 4
        LDR   s8, [x12], 4
        # Load B
        LDP  q12,  q13, [x5], 32

        FMLA v20.4s, v12.4s,  v0.s[0]
        FMLA v22.4s, v12.4s,  v2.s[0]
        FMLA v24.4s, v12.4s,  v4.s[0]
        FMLA v26.4s, v12.4s,  v6.s[0]
        FMLA v28.4s, v12.4s,  v8.s[0]
        FMLA v21.4s, v13.4s,  v0.s[0]
        FMLA v23.4s, v13.4s,  v2.s[0]
        FMLA v25.4s, v13.4s,  v4.s[0]
        FMLA v27.4s, v13.4s,  v6.s[0]
        FMLA v29.4s, v13.4s,  v8.s[0]
        B 3b

        # Store odd width
        # Bits 2, 1 and 0 of nc select stores of 4, 2 and 1 floats per row;
        # after each store the remaining lanes are shifted down to lane 0.
7:
        TBZ x1, 2, 8f
        $if INC:
          STR q28, [x7], 16
          MOV v28.16b, v29.16b
          STR q26, [x13], 16
          MOV v26.16b, v27.16b
          STR q24, [x17], 16
          MOV v24.16b, v25.16b
          STR q22, [x16], 16
          MOV v22.16b, v23.16b
          STR q20,  [x6], 16
          MOV v20.16b, v21.16b
        $else:
          STR q20,  [x6], 16
          MOV v20.16b, v21.16b
          STR q22, [x16], 16
          MOV v22.16b, v23.16b
          STR q24, [x17], 16
          MOV v24.16b, v25.16b
          STR q26, [x13], 16
          MOV v26.16b, v27.16b
          STR q28, [x7], 16
          MOV v28.16b, v29.16b
8:
        TBZ x1, 1, 9f
        $if INC:
          STR d28, [x7], 8
          DUP d28, v28.d[1]
          STR d26, [x13], 8
          DUP d26, v26.d[1]
          STR d24, [x17], 8
          DUP d24, v24.d[1]
          STR d22, [x16], 8
          DUP d22, v22.d[1]
          STR d20,  [x6], 8
          DUP d20, v20.d[1]
        $else:
          STR d20,  [x6], 8
          DUP d20, v20.d[1]
          STR d22, [x16], 8
          DUP d22, v22.d[1]
          STR d24, [x17], 8
          DUP d24, v24.d[1]
          STR d26, [x13], 8
          DUP d26, v26.d[1]
          STR d28, [x7], 8
          DUP d28, v28.d[1]

9:
        TBZ x1, 0, 10f
        $if INC:
          STR s28, [x7]
          STR s26, [x13]
          STR s24, [x17]
          STR s22, [x16]
          STR s20,  [x6]
        $else:
          STR s20,  [x6]
          STR s22, [x16]
          STR s24, [x17]
          STR s26, [x13]
          STR s28, [x7]
10:
        # Restore d8-d9, d12-d15 from stack
        LDP d14, d15, [sp, 32]
        LDP d12, d13, [sp, 16]
        LDP  d8,  d9, [sp], 48
        RET

END_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_5x8__aarch64_neonfma_cortex_${"a75" if PREFETCH else "a57"}

// On ELF targets, emit an empty .note.GNU-stack section so the linker does
// not have to assume this object needs an executable stack.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif
