• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1// Copyright 2019 Google LLC
2//
3// This source code is licensed under the BSD-style license found in the
4// LICENSE file in the root directory of this source tree.
5
6#include <xnnpack/assembly.h>
7
8# void xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_${"a75" if PREFETCH else "a57"}(
9#     size_t mr,                         x0
10#     size_t nc,                         x1
11#     size_t kc,                         x2 / x0
12#     size_t ks,                         x3 / x9
13#     const float**restrict a,           x4
14#     const void*restrict w,             x5
15#     uint8_t*restrict c,                x6
16#     size_t cm_stride,                  x7
17#     size_t cn_stride,                  [sp] -> (x0)
18#     size_t a_offset,                   [sp + 8] -> x11
19#     const float* zero,                 [sp + 16] -> x12
20#     const xnn_f32_minmax_params params [sp + 24] -> x8
21
22# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
23
24# A pointers
25# x14 a0
26# x15 a1
27# x20 a2
28# x21 a3
29# x22 a4
30# x23 a5
31
32# C pointers
33#  x6 c0
34# x16 c1
35# x17 c2
36# x10 c3
37# x13 c4
38#  x7 c5
39
40# Vector register usage
41# A0   v0  v6
42# A1   v1  v7
43# A2   v2  v8
44# A3   v3  v9
45# A4   v4 v10
46# A5   v5 v11
47# B   v12 v13 v14 v15
48# B   v16 v17 v18 v19
49# C   v20 v21
50# C   v22 v23
51# C   v24 v25
52# C   v26 v27
53# C   v28 v29
54# C   v30 v31
55# Clamp v6 (min) v7 (max)
56
// NOTE(review): every line of this listing starts with a decimal number fused onto the
// text (the local label `0:` appears as `940:`, `1:` as `1181:`, ...), while the branches
// below target `0b`, `1b`, `5f`, ... The leading numbers look like export residue rather
// than part of the program - confirm and strip them before assembling.
//
// f32 IGEMM micro-kernel with min/max clamping for a tile of up to 6 rows x 8 columns.
// A rows are reached through an array of pointers (`a`); B and the initial accumulator
// values are streamed from `w` (x5).
//
// `${...}` in the symbol name and the `$if PREFETCH:` lines are template directives:
// PREFETCH selects the `a75` suffix (and, presumably, emission of the PRFM lines),
// otherwise the symbol gets the `a57` suffix.
//
// Local labels:
//   0:     nc loop - one pass per block of 8 output columns; seeds the 12 accumulators
//   1:     ks loop - loads 6 A pointers, applies a_offset unless the pointer equals `zero`
//   2:     main k loop - 8 floats (32 bytes) of every A row per pass, loads interleaved
//   3:     epilogue - last full 8-float pass, no preloads for a following pass
//   4:     ks loop test, then clamp and store
//   5/6/7: k remainder of 4 / 2 / 1 floats (5 is also the entry when kc < 32 bytes)
//   8/9/10: store a tail of 4 / 2 / 1 columns;  11: restore registers and return
//
// Register reuse worth knowing while reading:
//   x0  mr on entry; k byte counter from label 1 on; cn_stride once the ks loop is done
//   x7  cm_stride on entry; becomes the c5 row pointer
//   x9  remaining ks (bytes of A pointers), x11 a_offset, x12 zero, x8 params
//   v6/v7 carry A data in the main loop / epilogue and the clamp bounds afterwards
//
// Stack frame (96 bytes): d8-d15 at [sp, 0..63], x20-x23 at [sp, 64..95].
// The caller's stack arguments therefore sit at [sp, 96] and above.
57BEGIN_FUNCTION xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_${"a75" if PREFETCH else "a57"}
58
59        # Clamp C pointers / Save d8-d15 on stack
        // Row pointers for rows >= mr alias the previous row, so every store below
        // targets a valid row.  Register saves are interleaved with the pointer setup.
60        STP  d8,  d9, [sp, -96]!  // pre-decrement sp by 96: allocates the whole frame
61        CMP x0, 2                // if mr < 2
62        ADD x16, x6, x7          // c1 = c0 + cm_stride
63        CSEL x16, x6, x16, LO    //   c1 = c0
64
65        STP d10, d11, [sp, 16]
66        ADD x17, x16, x7         // c2 = c1 + cm_stride
67                                 // if mr <= 2
68        CSEL x17, x16, x17, LS   //   c2 = c1
69
70        STP d12, d13, [sp, 32]
71        CMP x0, 4                // if mr < 4
72        ADD x10, x17, x7         // c3 = c2 + cm_stride
73        CSEL x10, x17, x10, LO   //   c3 = c2
74
75        STP d14, d15, [sp, 48]
76        ADD x13, x10, x7         // c4 = c3 + cm_stride
77                                 // if mr <= 4
78        CSEL x13, x10, x13, LS   //   c4 = c3
79
80        # Save x20,x21,x22,x23 on stack
81        STP x20, x21, [sp, 64]
82        STP x22, x23, [sp, 80]
83
84        CMP x0, 6                // if mr < 6
85        ADD x7, x13, x7          // c5 = c4 + cm_stride
86        CSEL x7, x13, x7, LO     //   c5 = c4
87
88        # Load a_offset
89        LDR x11, [sp, 104]  // caller's [sp + 8], shifted by the 96-byte frame
90
91        # Load zero, params pointer
92        LDP x12, x8, [sp, 112]  // caller's [sp + 16] and [sp + 24]
93
        // nc loop: one pass per block of 8 output columns.
940:
95        # Load initial bias from w into accumulators
        // All six rows start from the same 8 floats: v20/v21 are loaded, the rest copied.
96        LDP q20, q21, [x5], 32
97        MOV v22.16b, v20.16b
98        MOV v23.16b, v21.16b
99        $if PREFETCH:
100          PRFM PLDL1KEEP, [x5, 0]  // Prefetch B
101        MOV v24.16b, v20.16b
102        MOV v25.16b, v21.16b
103        $if PREFETCH:
104          PRFM PLDL1KEEP, [x5, 64]
105        MOV v26.16b, v20.16b
106        MOV v27.16b, v21.16b
107        $if PREFETCH:
108          PRFM PLDL1KEEP, [x5, 128]
109        MOV v28.16b, v20.16b
110        MOV v29.16b, v21.16b
111        $if PREFETCH:
112          PRFM PLDL1KEEP, [x5, 192]
113        MOV v30.16b, v20.16b
114        MOV v31.16b, v21.16b
115
116        MOV x9, x3  // p = ks
117
        // ks loop: each pass consumes 6 A pointers (48 bytes) from the indirection array.
1181:
119        # Load next 6 A pointers
120        LDP x14, x15, [x4], 16
121        LDP x20, x21, [x4], 16
122        LDP x22, x23, [x4], 16
123
        // A pointer equal to `zero` is used unchanged; any other pointer gets a_offset added.
124        CMP x14, x12            // if a0 == zero
125        ADD x14, x14, x11       // a0 += a_offset
126        CSEL x14, x12, x14, EQ  //   a0 = zero, else a0 + a_offset
127        CMP x15, x12            // if a1 == zero
128        ADD x15, x15, x11       // a1 += a_offset
129        CSEL x15, x12, x15, EQ  //   a1 = zero, else a1 + a_offset
130        CMP x20, x12            // if a2 == zero
131        ADD x20, x20, x11       // a2 += a_offset
132        CSEL x20, x12, x20, EQ  //   a2 = zero, else a2 + a_offset
133        CMP x21, x12            // if a3 == zero
134        ADD x21, x21, x11       // a3 += a_offset
135        CSEL x21, x12, x21, EQ  //   a3 = zero, else a3 + a_offset
136        CMP x22, x12            // if a4 == zero
137        ADD x22, x22, x11       // a4 += a_offset
138        CSEL x22, x12, x22, EQ  //   a4 = zero, else a4 + a_offset
139        CMP x23, x12            // if a5 == zero
140        ADD x23, x23, x11       // a5 += a_offset
141        CSEL x23, x12, x23, EQ  //   a5 = zero, else a5 + a_offset
142
143        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        // From here on x0 no longer holds mr: it is the k byte counter.
144        SUBS x0, x2, 32  // k = kc - 32
145        B.LO 5f
146
147        # Prologue - loads for main loop of 96 FMA
148        LDR   q0, [x14], 16
149        LDR   q1, [x15], 16
150        LDR   q2, [x20], 16
151        LDR   q3, [x21], 16
152        LDR   q4, [x22], 16
153        LDR   q5, [x23], 16
154        LDP  q12, q13, [x5], 32  // Fetch 3 B (4th deferred)
155        LDP  q14, q15, [x5], 32
156        LDP  q16, q17, [x5], 32
157
158        # Is there at least 8 floats (32 bytes) for main loop?
159        SUBS x0, x0, 32
160        B.LO 3f
161
162        # Main loop - 8 floats of A (32 bytes)
163        # 96 FMA + 12 LDR A + 8 LDP B
        // Loads for the next group are interleaved with the FMLAs of the current one,
        // so the instruction order in this loop is deliberate.
1642:
165        # First group of 4 A.  48 FMA.
166        FMLA v20.4s, v12.4s,  v0.s[0]
167        LDP  q18, q19, [x5], 32        // Load last B
168        FMLA v22.4s, v12.4s,  v1.s[0]
169        FMLA v24.4s, v12.4s,  v2.s[0]
170        FMLA v26.4s, v12.4s,  v3.s[0]
171        FMLA v28.4s, v12.4s,  v4.s[0]
172        FMLA v30.4s, v12.4s,  v5.s[0]
173        FMLA v21.4s, v13.4s,  v0.s[0]
174        FMLA v23.4s, v13.4s,  v1.s[0]
175        FMLA v25.4s, v13.4s,  v2.s[0]
176        FMLA v27.4s, v13.4s,  v3.s[0]
177        FMLA v29.4s, v13.4s,  v4.s[0]
178
179        FMLA v31.4s, v13.4s,  v5.s[0]
180        FMLA v20.4s, v14.4s,  v0.s[1]
181        $if PREFETCH:
182          PRFM PLDL1KEEP, [x5, 128]      // Prefetch B
183        FMLA v22.4s, v14.4s,  v1.s[1]
184        FMLA v24.4s, v14.4s,  v2.s[1]
185        FMLA v26.4s, v14.4s,  v3.s[1]
186        FMLA v28.4s, v14.4s,  v4.s[1]
187        $if PREFETCH:
188          PRFM PLDL1KEEP, [x5, 256]
189        FMLA v30.4s, v14.4s,  v5.s[1]
190        FMLA v21.4s, v15.4s,  v0.s[1]
191        FMLA v23.4s, v15.4s,  v1.s[1]
192        FMLA v25.4s, v15.4s,  v2.s[1]
193        LDR   q6, [x14], 16            // Load next 6 A
194        FMLA v27.4s, v15.4s,  v3.s[1]
195        FMLA v29.4s, v15.4s,  v4.s[1]
196        FMLA v31.4s, v15.4s,  v5.s[1]
197        LDR   q7, [x15], 16
198
199        FMLA v20.4s, v16.4s,  v0.s[2]
200        FMLA v22.4s, v16.4s,  v1.s[2]
201        FMLA v24.4s, v16.4s,  v2.s[2]
202        LDR   q8, [x20], 16
203        FMLA v26.4s, v16.4s,  v3.s[2]
204        FMLA v28.4s, v16.4s,  v4.s[2]
205        FMLA v30.4s, v16.4s,  v5.s[2]
206        LDR   q9, [x21], 16
207        FMLA v21.4s, v17.4s,  v0.s[2]
208        FMLA v23.4s, v17.4s,  v1.s[2]
209        FMLA v25.4s, v17.4s,  v2.s[2]
210        LDR   q10, [x22], 16
211        FMLA v27.4s, v17.4s,  v3.s[2]
212        FMLA v29.4s, v17.4s,  v4.s[2]
213        FMLA v31.4s, v17.4s,  v5.s[2]
214        LDR  q11, [x23], 16
215
216        FMLA v20.4s, v18.4s,  v0.s[3]
217        FMLA v22.4s, v18.4s,  v1.s[3]
218        FMLA v24.4s, v18.4s,  v2.s[3]
219        LDP  q12, q13, [x5], 32        // Load 4 B
220        FMLA v26.4s, v18.4s,  v3.s[3]
221        FMLA v28.4s, v18.4s,  v4.s[3]
222        FMLA v30.4s, v18.4s,  v5.s[3]
223        LDP  q14, q15, [x5], 32
224        FMLA v21.4s, v19.4s,  v0.s[3]
225        FMLA v23.4s, v19.4s,  v1.s[3]
226        FMLA v25.4s, v19.4s,  v2.s[3]
227        LDP  q16, q17, [x5], 32
228        FMLA v27.4s, v19.4s,  v3.s[3]
229        FMLA v29.4s, v19.4s,  v4.s[3]
230        FMLA v31.4s, v19.4s,  v5.s[3]
231        LDP  q18, q19, [x5], 32
232
233        # Second group of 4 A.  48 FMA.
234        FMLA v20.4s, v12.4s,  v6.s[0]
235        FMLA v22.4s, v12.4s,  v7.s[0]
236        FMLA v24.4s, v12.4s,  v8.s[0]
237        LDR   q0, [x14], 16            // Load next 6 A
238        FMLA v26.4s, v12.4s,  v9.s[0]
239        FMLA v28.4s, v12.4s, v10.s[0]
240        FMLA v30.4s, v12.4s, v11.s[0]
241        LDR   q1, [x15], 16
242        FMLA v21.4s, v13.4s,  v6.s[0]
243        FMLA v23.4s, v13.4s,  v7.s[0]
244        FMLA v25.4s, v13.4s,  v8.s[0]
245        LDR   q2, [x20], 16
246        FMLA v27.4s, v13.4s,  v9.s[0]
247        FMLA v29.4s, v13.4s, v10.s[0]
248        FMLA v31.4s, v13.4s, v11.s[0]
249        LDR   q3, [x21], 16
250
251        FMLA v20.4s, v14.4s,  v6.s[1]
252        FMLA v22.4s, v14.4s,  v7.s[1]
253        FMLA v24.4s, v14.4s,  v8.s[1]
254        LDR   q4, [x22], 16
255        FMLA v26.4s, v14.4s,  v9.s[1]
256        FMLA v28.4s, v14.4s, v10.s[1]
257        FMLA v30.4s, v14.4s, v11.s[1]
258        LDR   q5, [x23], 16
259        FMLA v21.4s, v15.4s,  v6.s[1]
260        FMLA v23.4s, v15.4s,  v7.s[1]
261        FMLA v25.4s, v15.4s,  v8.s[1]
262        LDP  q12, q13, [x5], 32        // Load next 3 B (not last)
263        FMLA v27.4s, v15.4s,  v9.s[1]
264        FMLA v29.4s, v15.4s, v10.s[1]
265        FMLA v31.4s, v15.4s, v11.s[1]
266        LDP  q14, q15, [x5], 32
267
268        FMLA v20.4s, v16.4s,  v6.s[2]
269        FMLA v22.4s, v16.4s,  v7.s[2]
270        FMLA v24.4s, v16.4s,  v8.s[2]
271        FMLA v26.4s, v16.4s,  v9.s[2]
272        FMLA v28.4s, v16.4s, v10.s[2]
273        FMLA v30.4s, v16.4s, v11.s[2]
274        FMLA v21.4s, v17.4s,  v6.s[2]
275        FMLA v23.4s, v17.4s,  v7.s[2]
276        FMLA v25.4s, v17.4s,  v8.s[2]
277        FMLA v27.4s, v17.4s,  v9.s[2]
278        FMLA v29.4s, v17.4s, v10.s[2]
279        FMLA v31.4s, v17.4s, v11.s[2]
280        LDP  q16,  q17, [x5], 32
281
282        FMLA v20.4s, v18.4s,  v6.s[3]
283        FMLA v22.4s, v18.4s,  v7.s[3]
284        SUBS x0, x0, 32  // k -= 32; the FMLAs below leave these flags intact for B.HS
285        FMLA v24.4s, v18.4s,  v8.s[3]
286        FMLA v26.4s, v18.4s,  v9.s[3]
287        FMLA v28.4s, v18.4s, v10.s[3]
288        FMLA v30.4s, v18.4s, v11.s[3]
289        FMLA v21.4s, v19.4s,  v6.s[3]
290        FMLA v23.4s, v19.4s,  v7.s[3]
291        FMLA v25.4s, v19.4s,  v8.s[3]
292        FMLA v27.4s, v19.4s,  v9.s[3]
293        FMLA v29.4s, v19.4s, v10.s[3]
294        FMLA v31.4s, v19.4s, v11.s[3]
295        B.HS 2b
296
297        # Epilogue - 8 floats of A (32 bytes)
298        # 96 FMA + 6 LDR A + 5 LDP B
299        # First block same as main loop.  Second block has no preloads.
3003:
301        # First group of 4 A.  48 FMA.
302        FMLA v20.4s, v12.4s,  v0.s[0]
303        LDP  q18, q19, [x5], 32        // Load last B
304        FMLA v22.4s, v12.4s,  v1.s[0]
305        FMLA v24.4s, v12.4s,  v2.s[0]
306        FMLA v26.4s, v12.4s,  v3.s[0]
307        FMLA v28.4s, v12.4s,  v4.s[0]
308        FMLA v30.4s, v12.4s,  v5.s[0]
309        FMLA v21.4s, v13.4s,  v0.s[0]
310        FMLA v23.4s, v13.4s,  v1.s[0]
311        FMLA v25.4s, v13.4s,  v2.s[0]
312        FMLA v27.4s, v13.4s,  v3.s[0]
313        FMLA v29.4s, v13.4s,  v4.s[0]
314
315        FMLA v31.4s, v13.4s,  v5.s[0]
316        FMLA v20.4s, v14.4s,  v0.s[1]
317        $if PREFETCH:
318          PRFM PLDL1KEEP, [x5, 128]      // Prefetch B
319        FMLA v22.4s, v14.4s,  v1.s[1]
320        FMLA v24.4s, v14.4s,  v2.s[1]
321        FMLA v26.4s, v14.4s,  v3.s[1]
322        FMLA v28.4s, v14.4s,  v4.s[1]
323        $if PREFETCH:
324          PRFM PLDL1KEEP, [x5, 256]
325        FMLA v30.4s, v14.4s,  v5.s[1]
326        FMLA v21.4s, v15.4s,  v0.s[1]
327        FMLA v23.4s, v15.4s,  v1.s[1]
328        FMLA v25.4s, v15.4s,  v2.s[1]
329        LDR   q6, [x14], 16            // Load next 6 A
330        FMLA v27.4s, v15.4s,  v3.s[1]
331        FMLA v29.4s, v15.4s,  v4.s[1]
332        FMLA v31.4s, v15.4s,  v5.s[1]
333        LDR   q7, [x15], 16
334
335        FMLA v20.4s, v16.4s,  v0.s[2]
336        FMLA v22.4s, v16.4s,  v1.s[2]
337        FMLA v24.4s, v16.4s,  v2.s[2]
338        LDR   q8, [x20], 16
339        FMLA v26.4s, v16.4s,  v3.s[2]
340        FMLA v28.4s, v16.4s,  v4.s[2]
341        FMLA v30.4s, v16.4s,  v5.s[2]
342        LDR   q9, [x21], 16
343        FMLA v21.4s, v17.4s,  v0.s[2]
344        FMLA v23.4s, v17.4s,  v1.s[2]
345        FMLA v25.4s, v17.4s,  v2.s[2]
346        LDR   q10, [x22], 16
347        FMLA v27.4s, v17.4s,  v3.s[2]
348        FMLA v29.4s, v17.4s,  v4.s[2]
349        FMLA v31.4s, v17.4s,  v5.s[2]
350        LDR  q11, [x23], 16
351
352        FMLA v20.4s, v18.4s,  v0.s[3]
353        FMLA v22.4s, v18.4s,  v1.s[3]
354        FMLA v24.4s, v18.4s,  v2.s[3]
355        LDP  q12, q13, [x5], 32        // Load 4 B
356        FMLA v26.4s, v18.4s,  v3.s[3]
357        FMLA v28.4s, v18.4s,  v4.s[3]
358        FMLA v30.4s, v18.4s,  v5.s[3]
359        LDP  q14, q15, [x5], 32
360        FMLA v21.4s, v19.4s,  v0.s[3]
361        FMLA v23.4s, v19.4s,  v1.s[3]
362        FMLA v25.4s, v19.4s,  v2.s[3]
363        LDP  q16, q17, [x5], 32
364        FMLA v27.4s, v19.4s,  v3.s[3]
365        FMLA v29.4s, v19.4s,  v4.s[3]
366        FMLA v31.4s, v19.4s,  v5.s[3]
367        LDP  q18, q19, [x5], 32
368
369        # Second group of 4 A.  48 FMA.
370        FMLA v20.4s, v12.4s,  v6.s[0]
371        FMLA v22.4s, v12.4s,  v7.s[0]
372        FMLA v24.4s, v12.4s,  v8.s[0]
373        FMLA v26.4s, v12.4s,  v9.s[0]
374        FMLA v28.4s, v12.4s, v10.s[0]
375        FMLA v30.4s, v12.4s, v11.s[0]
376        FMLA v21.4s, v13.4s,  v6.s[0]
377        FMLA v23.4s, v13.4s,  v7.s[0]
378        FMLA v25.4s, v13.4s,  v8.s[0]
379        FMLA v27.4s, v13.4s,  v9.s[0]
380        FMLA v29.4s, v13.4s, v10.s[0]
381        FMLA v31.4s, v13.4s, v11.s[0]
382
383        FMLA v20.4s, v14.4s,  v6.s[1]
384        FMLA v22.4s, v14.4s,  v7.s[1]
385        FMLA v24.4s, v14.4s,  v8.s[1]
386        FMLA v26.4s, v14.4s,  v9.s[1]
387        FMLA v28.4s, v14.4s, v10.s[1]
388        FMLA v30.4s, v14.4s, v11.s[1]
389        FMLA v21.4s, v15.4s,  v6.s[1]
390        FMLA v23.4s, v15.4s,  v7.s[1]
391        FMLA v25.4s, v15.4s,  v8.s[1]
392        FMLA v27.4s, v15.4s,  v9.s[1]
393        FMLA v29.4s, v15.4s, v10.s[1]
394        FMLA v31.4s, v15.4s, v11.s[1]
395
396        FMLA v20.4s, v16.4s,  v6.s[2]
397        FMLA v22.4s, v16.4s,  v7.s[2]
398        FMLA v24.4s, v16.4s,  v8.s[2]
399        FMLA v26.4s, v16.4s,  v9.s[2]
400        FMLA v28.4s, v16.4s, v10.s[2]
401        FMLA v30.4s, v16.4s, v11.s[2]
402        FMLA v21.4s, v17.4s,  v6.s[2]
403        FMLA v23.4s, v17.4s,  v7.s[2]
404        FMLA v25.4s, v17.4s,  v8.s[2]
405        FMLA v27.4s, v17.4s,  v9.s[2]
406        FMLA v29.4s, v17.4s, v10.s[2]
407        FMLA v31.4s, v17.4s, v11.s[2]
408
409        FMLA v20.4s, v18.4s,  v6.s[3]
410        FMLA v22.4s, v18.4s,  v7.s[3]
411        FMLA v24.4s, v18.4s,  v8.s[3]
412        FMLA v26.4s, v18.4s,  v9.s[3]
413        FMLA v28.4s, v18.4s, v10.s[3]
414        FMLA v30.4s, v18.4s, v11.s[3]
415        FMLA v21.4s, v19.4s,  v6.s[3]
416        FMLA v23.4s, v19.4s,  v7.s[3]
417
418        # Load min/max values
        // The FMLA just above is the last reader of v6/v7 as A data, so they can now be
        // overwritten: LD2R broadcasts the two consecutive floats at [x8] into v6 and v7.
419        LD2R {v6.4s, v7.4s}, [x8]
420
421        FMLA v25.4s, v19.4s,  v8.s[3]
422        FMLA v27.4s, v19.4s,  v9.s[3]
423        # Is there a remainder?- 4 floats of A (16 bytes) or less
        // Only multiples of 32 were subtracted from kc, so x0 & 31 == kc & 31.
424        TST x0, 31
425        FMLA v29.4s, v19.4s, v10.s[3]
426        FMLA v31.4s, v19.4s, v11.s[3]
427        B.NE 5f
428
4294:
430        # ks loop
431        SUBS x9, x9, 48  // ks -= MR * sizeof(void*)
432        B.HI 1b
433
434        # Clamp
        // FMAX against v6 applies the lower bound, FMIN against v7 the upper bound.
435        FMAX v20.4s, v20.4s, v6.4s
436        # Load cn_stride
437        LDR x0, [sp, 96]  // x0 (k counter) is free again; caller's [sp] sits above the frame
438        FMAX v21.4s, v21.4s, v6.4s
439        FMAX v22.4s, v22.4s, v6.4s
440        FMAX v23.4s, v23.4s, v6.4s
441        FMAX v24.4s, v24.4s, v6.4s
442        FMAX v25.4s, v25.4s, v6.4s
443        FMAX v26.4s, v26.4s, v6.4s
444        FMAX v27.4s, v27.4s, v6.4s
445        FMAX v28.4s, v28.4s, v6.4s
446        FMAX v29.4s, v29.4s, v6.4s
447        FMAX v30.4s, v30.4s, v6.4s
448        FMAX v31.4s, v31.4s, v6.4s
449        SUBS x1, x1, 8  // nc -= 8; these flags feed both B.LO 8f and B.HI 0b below
450        FMIN v20.4s, v20.4s, v7.4s
451        FMIN v21.4s, v21.4s, v7.4s
452        FMIN v22.4s, v22.4s, v7.4s
453        FMIN v23.4s, v23.4s, v7.4s
454        FMIN v24.4s, v24.4s, v7.4s
455        FMIN v25.4s, v25.4s, v7.4s
456        FMIN v26.4s, v26.4s, v7.4s
457        FMIN v27.4s, v27.4s, v7.4s
458        FMIN v28.4s, v28.4s, v7.4s
459        FMIN v29.4s, v29.4s, v7.4s
460        FMIN v30.4s, v30.4s, v7.4s
461        FMIN v31.4s, v31.4s, v7.4s
462
463        # Store full 6 x 8
464        B.LO 8f  // fewer than 8 columns were left: take the partial-width store
465
        // Rows are written from c5 down to c0, so where row pointers alias (mr < 6)
        // the lower-numbered row is the one written last.
466        STP q30, q31,  [x7]
467        ADD x7, x7, x0
468        STP q28, q29, [x13]
469        ADD x13, x13, x0
470        STP q26, q27, [x10]
471        ADD x10, x10, x0
472        STP q24, q25, [x17]
473        ADD x17, x17, x0
474        STP q22, q23, [x16]
475        ADD x16, x16, x0
476        STP q20, q21,  [x6]
477        ADD  x6,  x6, x0
478
479        SUB x4, x4, x3  // a -= ks
480
481        # nc loop
        // None of the STP / ADD / SUB above set flags, so this still tests SUBS x1, x1, 8.
482        B.HI 0b
483
484        # Restore x20,x21,x22,x23 from stack
485        LDP x22, x23, [sp, 80]
486        LDP x20, x21, [sp, 64]
487
488        # Restore d8-d15 from stack
489        LDP d14, d15, [sp, 48]
490        LDP d12, d13, [sp, 32]
491        LDP d10, d11, [sp, 16]
492        LDP  d8,  d9, [sp], 96  // post-increment releases the 96-byte frame
493        RET
494
        // k remainder.  Reached from the epilogue when kc is not a multiple of 32 bytes,
        // and directly from label 1 when kc < 32 bytes (v6/v7 not yet loaded on that path).
4955:
496        # Load min/max values
497        LD2R {v6.4s, v7.4s}, [x8]
498
499        # Is there a remainder?- 4 floats of A (16 bytes)
500        TBZ x0, 4, 6f
501
502        # Remainder- 4 floats of A (16 bytes)
503        # Load A
504        LDR   q0, [x14], 16
505        LDR   q1, [x15], 16
506        LDR   q2, [x20], 16
507        LDR   q3, [x21], 16
508        LDR   q4, [x22], 16
509        LDR   q5, [x23], 16
510        # Load B
511        LDP  q12, q13, [x5], 32
512        LDP  q14, q15, [x5], 32
513        LDP  q16, q17, [x5], 32
514        LDP  q18, q19, [x5], 32
515
516        FMLA v20.4s, v12.4s,  v0.s[0]
517        FMLA v22.4s, v12.4s,  v1.s[0]
518        FMLA v24.4s, v12.4s,  v2.s[0]
519        FMLA v26.4s, v12.4s,  v3.s[0]
520        FMLA v28.4s, v12.4s,  v4.s[0]
521        FMLA v30.4s, v12.4s,  v5.s[0]
522        FMLA v21.4s, v13.4s,  v0.s[0]
523        FMLA v23.4s, v13.4s,  v1.s[0]
524        FMLA v25.4s, v13.4s,  v2.s[0]
525        FMLA v27.4s, v13.4s,  v3.s[0]
526        FMLA v29.4s, v13.4s,  v4.s[0]
527        FMLA v31.4s, v13.4s,  v5.s[0]
528
529        FMLA v20.4s, v14.4s,  v0.s[1]
530        FMLA v22.4s, v14.4s,  v1.s[1]
531        FMLA v24.4s, v14.4s,  v2.s[1]
532        FMLA v26.4s, v14.4s,  v3.s[1]
533        FMLA v28.4s, v14.4s,  v4.s[1]
534        FMLA v30.4s, v14.4s,  v5.s[1]
535        FMLA v21.4s, v15.4s,  v0.s[1]
536        FMLA v23.4s, v15.4s,  v1.s[1]
537        FMLA v25.4s, v15.4s,  v2.s[1]
538        FMLA v27.4s, v15.4s,  v3.s[1]
539        FMLA v29.4s, v15.4s,  v4.s[1]
540        FMLA v31.4s, v15.4s,  v5.s[1]
541
542        FMLA v20.4s, v16.4s,  v0.s[2]
543        FMLA v22.4s, v16.4s,  v1.s[2]
544        FMLA v24.4s, v16.4s,  v2.s[2]
545        FMLA v26.4s, v16.4s,  v3.s[2]
546        FMLA v28.4s, v16.4s,  v4.s[2]
547        FMLA v30.4s, v16.4s,  v5.s[2]
548        FMLA v21.4s, v17.4s,  v0.s[2]
549        FMLA v23.4s, v17.4s,  v1.s[2]
550        FMLA v25.4s, v17.4s,  v2.s[2]
551        FMLA v27.4s, v17.4s,  v3.s[2]
552        FMLA v29.4s, v17.4s,  v4.s[2]
553        FMLA v31.4s, v17.4s,  v5.s[2]
554
555        FMLA v20.4s, v18.4s,  v0.s[3]
556        FMLA v22.4s, v18.4s,  v1.s[3]
557        FMLA v24.4s, v18.4s,  v2.s[3]
558        FMLA v26.4s, v18.4s,  v3.s[3]
559        FMLA v28.4s, v18.4s,  v4.s[3]
560        FMLA v30.4s, v18.4s,  v5.s[3]
561        FMLA v21.4s, v19.4s,  v0.s[3]
562        FMLA v23.4s, v19.4s,  v1.s[3]
563        FMLA v25.4s, v19.4s,  v2.s[3]
564        FMLA v27.4s, v19.4s,  v3.s[3]
565        FMLA v29.4s, v19.4s,  v4.s[3]
566        FMLA v31.4s, v19.4s,  v5.s[3]
567
568        # Is there a remainder?- 2 floats of A (8 bytes)
5696:
570        TBZ x0, 3, 7f
571
572        # Remainder- 2 floats of A (8 bytes)
573        # Load A
574        LDR   d0, [x14], 8
575        LDR   d1, [x15], 8
576        LDR   d2, [x20], 8
577        LDR   d3, [x21], 8
578        LDR   d4, [x22], 8
579        LDR   d5, [x23], 8
580        # Load B
581        LDP  q12, q13, [x5], 32
582        LDP  q14, q15, [x5], 32
583
584        FMLA v20.4s, v12.4s,  v0.s[0]
585        FMLA v22.4s, v12.4s,  v1.s[0]
586        FMLA v24.4s, v12.4s,  v2.s[0]
587        FMLA v26.4s, v12.4s,  v3.s[0]
588        FMLA v28.4s, v12.4s,  v4.s[0]
589        FMLA v30.4s, v12.4s,  v5.s[0]
590        FMLA v21.4s, v13.4s,  v0.s[0]
591        FMLA v23.4s, v13.4s,  v1.s[0]
592        FMLA v25.4s, v13.4s,  v2.s[0]
593        FMLA v27.4s, v13.4s,  v3.s[0]
594        FMLA v29.4s, v13.4s,  v4.s[0]
595        FMLA v31.4s, v13.4s,  v5.s[0]
596
597        FMLA v20.4s, v14.4s,  v0.s[1]
598        FMLA v22.4s, v14.4s,  v1.s[1]
599        FMLA v24.4s, v14.4s,  v2.s[1]
600        FMLA v26.4s, v14.4s,  v3.s[1]
601        FMLA v28.4s, v14.4s,  v4.s[1]
602        FMLA v30.4s, v14.4s,  v5.s[1]
603        FMLA v21.4s, v15.4s,  v0.s[1]
604        FMLA v23.4s, v15.4s,  v1.s[1]
605        FMLA v25.4s, v15.4s,  v2.s[1]
606        FMLA v27.4s, v15.4s,  v3.s[1]
607        FMLA v29.4s, v15.4s,  v4.s[1]
608        FMLA v31.4s, v15.4s,  v5.s[1]
609
610        # Is there a remainder?- 1 float of A (4 bytes)
6117:
612        TBZ x0, 2, 4b
613
614        # Remainder- 1 float of A (4 bytes)
615        # Load A
616        LDR   s0, [x14], 4
617        LDR   s1, [x15], 4
618        LDR   s2, [x20], 4
619        LDR   s3, [x21], 4
620        LDR   s4, [x22], 4
621        LDR   s5, [x23], 4
622        # Load B
623        LDP  q12, q13, [x5], 32
624
625        FMLA v20.4s, v12.4s,  v0.s[0]
626        FMLA v22.4s, v12.4s,  v1.s[0]
627        FMLA v24.4s, v12.4s,  v2.s[0]
628        FMLA v26.4s, v12.4s,  v3.s[0]
629        FMLA v28.4s, v12.4s,  v4.s[0]
630        FMLA v30.4s, v12.4s,  v5.s[0]
631        FMLA v21.4s, v13.4s,  v0.s[0]
632        FMLA v23.4s, v13.4s,  v1.s[0]
633        FMLA v25.4s, v13.4s,  v2.s[0]
634        FMLA v27.4s, v13.4s,  v3.s[0]
635        FMLA v29.4s, v13.4s,  v4.s[0]
636        FMLA v31.4s, v13.4s,  v5.s[0]
637        B 4b
638
639        # Store odd width
        // Bits 2, 1 and 0 of x1 select a 4-, 2- and 1-column store.  After each partial
        // store the not-yet-written lanes are moved down into the low lanes.
6408:
641        TBZ x1, 2, 9f
642        STR q30,  [x7], 16
643        MOV v30.16b, v31.16b
644        STR q28, [x13], 16
645        MOV v28.16b, v29.16b
646        STR q26, [x10], 16
647        MOV v26.16b, v27.16b
648        STR q24, [x17], 16
649        MOV v24.16b, v25.16b
650        STR q22, [x16], 16
651        MOV v22.16b, v23.16b
652        STR q20,  [x6], 16
653        MOV v20.16b, v21.16b
6549:
655        TBZ x1, 1, 10f
656        STR d30,  [x7], 8
657        DUP d30, v30.d[1]
658        STR d28, [x13], 8
659        DUP d28, v28.d[1]
660        STR d26, [x10], 8
661        DUP d26, v26.d[1]
662        STR d24, [x17], 8
663        DUP d24, v24.d[1]
664        STR d22, [x16], 8
665        DUP d22, v22.d[1]
666        STR d20,  [x6], 8
667        DUP d20, v20.d[1]
668
66910:
670        TBZ x1, 0, 11f
671        STR s30,  [x7]
672        STR s28, [x13]
673        STR s26, [x10]
674        STR s24, [x17]
675        STR s22, [x16]
676        STR s20,  [x6]
67711:
678        # Restore x20,x21,x22,x23 from stack
679        LDP x22, x23, [sp, 80]
680        LDP x20, x21, [sp, 64]
681
682        # Restore d8-d15 from stack
683        LDP d14, d15, [sp, 48]
684        LDP d12, d13, [sp, 32]
685        LDP d10, d11, [sp, 16]
686        LDP  d8,  d9, [sp], 96  // post-increment releases the 96-byte frame
687        RET
688
689END_FUNCTION xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_${"a75" if PREFETCH else "a57"}
690
// ELF targets only: emit an empty .note.GNU-stack section (no flags).  By GNU toolchain
// convention this marks the object as not requiring an executable stack.
691#ifdef __ELF__
692.section ".note.GNU-stack","",%progbits
693#endif
694