// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemm${"inc" if INC else ""}_ukernel_6x8__aarch64_neonfma_cortex_${"a75" if PREFETCH else "a57"}(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x14
$if INC:
  #     const float*restrict acc,  [sp + 8] -> x15
  #     const union xnn_f32_output_params params[restrict static 1])  [sp + 16] -> x8
$else:
  #     const union xnn_f32_output_params params[restrict static 1])  [sp + 8] -> x8

# d8-d15 need to be preserved if used.
# x19-30 need to be preserved if used.

# A pointers
#  x3 a0
#  x9 a1
# x10 a2
# x11 a3
# x12 a4
#  x4 a5

# C pointers
#  x6 c0
# x16 c1
# x17 c2
# x18 c3
# x13 c4
#  x7 c5

# NOTE(review): x18 is the AAPCS64 platform register.  Confirm that every
# target platform of this kernel allows it to be used as a scratch register.

# Scalar register usage
#  x0 mr on entry, then k (running byte count derived from kc)
#  x1 nc, decremented by 8 per tile of C
#  x2 kc in bytes (kept to rewind the A pointers after each tile)
#  x5 w, post-incremented by every bias / B load
#  x8 params
# x14 cn_stride
# x15 acc (INC variant only), post-incremented by the accumulator loads

# Vector register usage
# A0   v0  v6
# A1   v1  v7
# A2   v2  v8
# A3   v3  v9
# A4   v4 v10
# A5   v5 v11
# B   v12 v13 v14 v15
# B   v16 v17 v18 v19
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# C   v30 v31
# Clamp v6 v7

BEGIN_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_ukernel_6x8__aarch64_neonfma_cortex_${"a75" if PREFETCH else "a57"}

        # Clamp A and C pointers / Save d8-d15 on stack
        # Rows at index >= mr alias the previous row, so the kernel always
        # processes 6 rows without reading or writing outside the mr rows.
        STP  d8,  d9, [sp, -64]!
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        STP d10, d11, [sp, 16]
        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        STP d12, d13, [sp, 32]
        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x18, x17, x7         // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x18, x17, x18, LO   //   c3 = c2

        STP d14, d15, [sp, 48]
        ADD x12, x11, x4         // a4 = a3 + a_stride
        ADD x13, x18, x7         // c4 = c3 + cm_stride
                                 // if mr <= 4
        CSEL x12, x11, x12, LS   //   a4 = a3
        CSEL x13, x18, x13, LS   //   c4 = c3

        # Stack arguments sit 64 bytes higher because of the d8-d15 save area.
        $if INC:
          # Load acc, params pointer
          LDP x15, x8, [sp, 72]
        $else:
          # Load params pointer
          LDR x8, [sp, 72]

        CMP x0, 6                // if mr < 6
        ADD x4, x12, x4          // a5 = a4 + a_stride
        ADD x7, x13, x7          // c5 = c4 + cm_stride
        CSEL x4, x12, x4, LO     //   a5 = a4
        CSEL x7, x13, x7, LO     //   c5 = c4

        # Load cn_stride
        LDR x14, [sp, 64]

        # Top of the nc loop: each pass computes one 6x8 tile of C.
0:
        $if INC:
          # Load initial accumulators
          LDP q20, q21, [x15], 32
          LDP q22, q23, [x15], 32
          LDP q24, q25, [x15], 32
          LDP q26, q27, [x15], 32
          LDP q28, q29, [x15], 32
          LDP q30, q31, [x15], 32
          $if PREFETCH:
            PRFM PLDL1KEEP, [x5, 0]  // Prefetch B
            PRFM PLDL1KEEP, [x5, 64]
            PRFM PLDL1KEEP, [x5, 128]
            PRFM PLDL1KEEP, [x5, 192]
            PRFM PLDL1KEEP,  [x3]    // Prefetch A
            PRFM PLDL1KEEP,  [x9]
            PRFM PLDL1KEEP, [x10]
            PRFM PLDL1KEEP, [x11]
            PRFM PLDL1KEEP, [x12]
            PRFM PLDL1KEEP,  [x4]
        $else:
          # Load initial bias from w into accumulators
          LDP q20, q21, [x5], 32
          MOV v22.16b, v20.16b
          $if PREFETCH:
            PRFM PLDL1KEEP, [x5, 0]  // Prefetch B
          MOV v23.16b, v21.16b
          $if PREFETCH:
            PRFM PLDL1KEEP, [x5, 64]
          MOV v24.16b, v20.16b
          $if PREFETCH:
            PRFM PLDL1KEEP, [x5, 128]
          MOV v25.16b, v21.16b
          $if PREFETCH:
            PRFM PLDL1KEEP, [x5, 192]
          MOV v26.16b, v20.16b
          $if PREFETCH:
            PRFM PLDL1KEEP,  [x3]    // Prefetch A
          MOV v27.16b, v21.16b
          $if PREFETCH:
            PRFM PLDL1KEEP,  [x9]
          MOV v28.16b, v20.16b
          $if PREFETCH:
            PRFM PLDL1KEEP, [x10]
          MOV v29.16b, v21.16b
          $if PREFETCH:
            PRFM PLDL1KEEP, [x11]
          MOV v30.16b, v20.16b
          $if PREFETCH:
            PRFM PLDL1KEEP, [x12]
          MOV v31.16b, v21.16b
          $if PREFETCH:
            PRFM PLDL1KEEP,  [x4]

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        SUBS x0, x2, 32  // k = kc - 32
        B.LO 4f

        # Prologue - loads for main loop of 96 FMA
        LDR   q0,  [x3], 16
        LDR   q1,  [x9], 16
        LDR   q2, [x10], 16
        LDR   q3, [x11], 16
        LDR   q4, [x12], 16
        LDR   q5,  [x4], 16
        LDP  q12,  q13, [x5], 32  // Fetch 3 B (4th deferred)
        LDP  q14,  q15, [x5], 32
        LDP  q16,  q17, [x5], 32

        # Is there at least 8 floats (32 bytes) for main loop?
        SUBS x0, x0, 32
        B.LO 2f

        # Main loop - 8 floats of A (32 bytes)
        # 96 FMA + 12 LDR A + 8 LDP B
1:
        # First group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s,  v0.s[0]
        LDP  q18,  q19, [x5], 32      // Load last B
        FMLA v22.4s, v12.4s,  v1.s[0]
        FMLA v24.4s, v12.4s,  v2.s[0]
        FMLA v26.4s, v12.4s,  v3.s[0]
        FMLA v28.4s, v12.4s,  v4.s[0]
        FMLA v30.4s, v12.4s,  v5.s[0]
        FMLA v21.4s, v13.4s,  v0.s[0]
        FMLA v23.4s, v13.4s,  v1.s[0]
        FMLA v25.4s, v13.4s,  v2.s[0]
        FMLA v27.4s, v13.4s,  v3.s[0]
        FMLA v29.4s, v13.4s,  v4.s[0]

        FMLA v31.4s, v13.4s,  v5.s[0]
        FMLA v20.4s, v14.4s,  v0.s[1]
        $if PREFETCH:
          PRFM PLDL1KEEP, [x5, 128]      // Prefetch B
        FMLA v22.4s, v14.4s,  v1.s[1]
        FMLA v24.4s, v14.4s,  v2.s[1]
        FMLA v26.4s, v14.4s,  v3.s[1]
        FMLA v28.4s, v14.4s,  v4.s[1]
        $if PREFETCH:
          PRFM PLDL1KEEP, [x5, 256]
        FMLA v30.4s, v14.4s,  v5.s[1]
        FMLA v21.4s, v15.4s,  v0.s[1]
        FMLA v23.4s, v15.4s,  v1.s[1]
        FMLA v25.4s, v15.4s,  v2.s[1]
        LDR   q6,  [x3], 16            // Load next 6 A
        FMLA v27.4s, v15.4s,  v3.s[1]
        FMLA v29.4s, v15.4s,  v4.s[1]
        FMLA v31.4s, v15.4s,  v5.s[1]
        LDR   q7,  [x9], 16

        FMLA v20.4s, v16.4s,  v0.s[2]
        FMLA v22.4s, v16.4s,  v1.s[2]
        FMLA v24.4s, v16.4s,  v2.s[2]
        LDR   q8, [x10], 16
        FMLA v26.4s, v16.4s,  v3.s[2]
        FMLA v28.4s, v16.4s,  v4.s[2]
        FMLA v30.4s, v16.4s,  v5.s[2]
        LDR   q9, [x11], 16
        FMLA v21.4s, v17.4s,  v0.s[2]
        FMLA v23.4s, v17.4s,  v1.s[2]
        FMLA v25.4s, v17.4s,  v2.s[2]
        LDR   q10, [x12], 16
        FMLA v27.4s, v17.4s,  v3.s[2]
        FMLA v29.4s, v17.4s,  v4.s[2]
        FMLA v31.4s, v17.4s,  v5.s[2]
        LDR  q11,  [x4], 16

        FMLA v20.4s, v18.4s,  v0.s[3]
        FMLA v22.4s, v18.4s,  v1.s[3]
        FMLA v24.4s, v18.4s,  v2.s[3]
        LDP  q12,  q13, [x5], 32       // Load 4 B
        FMLA v26.4s, v18.4s,  v3.s[3]
        FMLA v28.4s, v18.4s,  v4.s[3]
        FMLA v30.4s, v18.4s,  v5.s[3]
        LDP  q14,  q15, [x5], 32
        FMLA v21.4s, v19.4s,  v0.s[3]
        FMLA v23.4s, v19.4s,  v1.s[3]
        FMLA v25.4s, v19.4s,  v2.s[3]
        LDP  q16,  q17, [x5], 32
        FMLA v27.4s, v19.4s,  v3.s[3]
        FMLA v29.4s, v19.4s,  v4.s[3]
        FMLA v31.4s, v19.4s,  v5.s[3]
        LDP  q18,  q19, [x5], 32

        # Second group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s,  v6.s[0]
        FMLA v22.4s, v12.4s,  v7.s[0]
        FMLA v24.4s, v12.4s,  v8.s[0]
        LDR   q0,  [x3], 16           // Load next 6 A
        FMLA v26.4s, v12.4s,  v9.s[0]
        FMLA v28.4s, v12.4s, v10.s[0]
        FMLA v30.4s, v12.4s, v11.s[0]
        LDR   q1,  [x9], 16
        FMLA v21.4s, v13.4s,  v6.s[0]
        FMLA v23.4s, v13.4s,  v7.s[0]
        FMLA v25.4s, v13.4s,  v8.s[0]
        LDR   q2, [x10], 16
        FMLA v27.4s, v13.4s,  v9.s[0]
        FMLA v29.4s, v13.4s, v10.s[0]
        FMLA v31.4s, v13.4s, v11.s[0]
        LDR   q3, [x11], 16

        FMLA v20.4s, v14.4s,  v6.s[1]
        FMLA v22.4s, v14.4s,  v7.s[1]
        FMLA v24.4s, v14.4s,  v8.s[1]
        LDR   q4, [x12], 16
        FMLA v26.4s, v14.4s,  v9.s[1]
        FMLA v28.4s, v14.4s, v10.s[1]
        FMLA v30.4s, v14.4s, v11.s[1]
        LDR   q5,  [x4], 16
        FMLA v21.4s, v15.4s,  v6.s[1]
        FMLA v23.4s, v15.4s,  v7.s[1]
        FMLA v25.4s, v15.4s,  v8.s[1]
        LDP  q12,  q13, [x5], 32       // Load next 3 B (not last)
        FMLA v27.4s, v15.4s,  v9.s[1]
        FMLA v29.4s, v15.4s, v10.s[1]
        FMLA v31.4s, v15.4s, v11.s[1]
        LDP  q14,  q15, [x5], 32

        FMLA v20.4s, v16.4s,  v6.s[2]
        FMLA v22.4s, v16.4s,  v7.s[2]
        FMLA v24.4s, v16.4s,  v8.s[2]
        FMLA v26.4s, v16.4s,  v9.s[2]
        FMLA v28.4s, v16.4s, v10.s[2]
        FMLA v30.4s, v16.4s, v11.s[2]
        FMLA v21.4s, v17.4s,  v6.s[2]
        FMLA v23.4s, v17.4s,  v7.s[2]
        FMLA v25.4s, v17.4s,  v8.s[2]
        FMLA v27.4s, v17.4s,  v9.s[2]
        FMLA v29.4s, v17.4s, v10.s[2]
        FMLA v31.4s, v17.4s, v11.s[2]
        LDP  q16,  q17, [x5], 32

        FMLA v20.4s, v18.4s,  v6.s[3]
        FMLA v22.4s, v18.4s,  v7.s[3]
        SUBS x0, x0, 32
        FMLA v24.4s, v18.4s,  v8.s[3]
        FMLA v26.4s, v18.4s,  v9.s[3]
        FMLA v28.4s, v18.4s, v10.s[3]
        FMLA v30.4s, v18.4s, v11.s[3]
        FMLA v21.4s, v19.4s,  v6.s[3]
        FMLA v23.4s, v19.4s,  v7.s[3]
        FMLA v25.4s, v19.4s,  v8.s[3]
        FMLA v27.4s, v19.4s,  v9.s[3]
        FMLA v29.4s, v19.4s, v10.s[3]
        FMLA v31.4s, v19.4s, v11.s[3]
        B.HS 1b

        # Epilogue - 8 floats of A (32 bytes)
        # 96 FMA + 6 LDR A + 5 LDP B
        # First block same as main loop.  Second block has no preloads.
2:
        # First group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s,  v0.s[0]
        LDP  q18,  q19, [x5], 32      // Load last B
        FMLA v22.4s, v12.4s,  v1.s[0]
        FMLA v24.4s, v12.4s,  v2.s[0]
        FMLA v26.4s, v12.4s,  v3.s[0]
        FMLA v28.4s, v12.4s,  v4.s[0]
        FMLA v30.4s, v12.4s,  v5.s[0]
        FMLA v21.4s, v13.4s,  v0.s[0]
        FMLA v23.4s, v13.4s,  v1.s[0]
        FMLA v25.4s, v13.4s,  v2.s[0]
        FMLA v27.4s, v13.4s,  v3.s[0]
        FMLA v29.4s, v13.4s,  v4.s[0]

        FMLA v31.4s, v13.4s,  v5.s[0]
        FMLA v20.4s, v14.4s,  v0.s[1]
        $if PREFETCH:
          PRFM PLDL1KEEP, [x5, 128]      // Prefetch B
        FMLA v22.4s, v14.4s,  v1.s[1]
        FMLA v24.4s, v14.4s,  v2.s[1]
        FMLA v26.4s, v14.4s,  v3.s[1]
        FMLA v28.4s, v14.4s,  v4.s[1]
        $if PREFETCH:
          PRFM PLDL1KEEP, [x5, 256]
        FMLA v30.4s, v14.4s,  v5.s[1]
        FMLA v21.4s, v15.4s,  v0.s[1]
        FMLA v23.4s, v15.4s,  v1.s[1]
        FMLA v25.4s, v15.4s,  v2.s[1]
        LDR   q6,  [x3], 16            // Load next 6 A
        FMLA v27.4s, v15.4s,  v3.s[1]
        FMLA v29.4s, v15.4s,  v4.s[1]
        FMLA v31.4s, v15.4s,  v5.s[1]
        LDR   q7,  [x9], 16

        FMLA v20.4s, v16.4s,  v0.s[2]
        FMLA v22.4s, v16.4s,  v1.s[2]
        FMLA v24.4s, v16.4s,  v2.s[2]
        LDR   q8, [x10], 16
        FMLA v26.4s, v16.4s,  v3.s[2]
        FMLA v28.4s, v16.4s,  v4.s[2]
        FMLA v30.4s, v16.4s,  v5.s[2]
        LDR   q9, [x11], 16
        FMLA v21.4s, v17.4s,  v0.s[2]
        FMLA v23.4s, v17.4s,  v1.s[2]
        FMLA v25.4s, v17.4s,  v2.s[2]
        LDR   q10, [x12], 16
        FMLA v27.4s, v17.4s,  v3.s[2]
        FMLA v29.4s, v17.4s,  v4.s[2]
        FMLA v31.4s, v17.4s,  v5.s[2]
        LDR  q11,  [x4], 16

        FMLA v20.4s, v18.4s,  v0.s[3]
        FMLA v22.4s, v18.4s,  v1.s[3]
        FMLA v24.4s, v18.4s,  v2.s[3]
        LDP  q12,  q13, [x5], 32       // Load 4 B
        FMLA v26.4s, v18.4s,  v3.s[3]
        FMLA v28.4s, v18.4s,  v4.s[3]
        FMLA v30.4s, v18.4s,  v5.s[3]
        LDP  q14,  q15, [x5], 32
        FMLA v21.4s, v19.4s,  v0.s[3]
        FMLA v23.4s, v19.4s,  v1.s[3]
        FMLA v25.4s, v19.4s,  v2.s[3]
        LDP  q16,  q17, [x5], 32
        FMLA v27.4s, v19.4s,  v3.s[3]
        FMLA v29.4s, v19.4s,  v4.s[3]
        FMLA v31.4s, v19.4s,  v5.s[3]
        LDP  q18,  q19, [x5], 32

        # Second group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s,  v6.s[0]
        FMLA v22.4s, v12.4s,  v7.s[0]
        FMLA v24.4s, v12.4s,  v8.s[0]
        FMLA v26.4s, v12.4s,  v9.s[0]
        FMLA v28.4s, v12.4s, v10.s[0]
        FMLA v30.4s, v12.4s, v11.s[0]
        FMLA v21.4s, v13.4s,  v6.s[0]
        FMLA v23.4s, v13.4s,  v7.s[0]
        FMLA v25.4s, v13.4s,  v8.s[0]
        FMLA v27.4s, v13.4s,  v9.s[0]
        FMLA v29.4s, v13.4s, v10.s[0]
        FMLA v31.4s, v13.4s, v11.s[0]

        FMLA v20.4s, v14.4s,  v6.s[1]
        FMLA v22.4s, v14.4s,  v7.s[1]
        FMLA v24.4s, v14.4s,  v8.s[1]
        FMLA v26.4s, v14.4s,  v9.s[1]
        FMLA v28.4s, v14.4s, v10.s[1]
        FMLA v30.4s, v14.4s, v11.s[1]
        FMLA v21.4s, v15.4s,  v6.s[1]
        FMLA v23.4s, v15.4s,  v7.s[1]
        FMLA v25.4s, v15.4s,  v8.s[1]
        FMLA v27.4s, v15.4s,  v9.s[1]
        FMLA v29.4s, v15.4s, v10.s[1]
        FMLA v31.4s, v15.4s, v11.s[1]

        FMLA v20.4s, v16.4s,  v6.s[2]
        FMLA v22.4s, v16.4s,  v7.s[2]
        FMLA v24.4s, v16.4s,  v8.s[2]
        FMLA v26.4s, v16.4s,  v9.s[2]
        FMLA v28.4s, v16.4s, v10.s[2]
        FMLA v30.4s, v16.4s, v11.s[2]
        FMLA v21.4s, v17.4s,  v6.s[2]
        FMLA v23.4s, v17.4s,  v7.s[2]
        FMLA v25.4s, v17.4s,  v8.s[2]
        FMLA v27.4s, v17.4s,  v9.s[2]
        FMLA v29.4s, v17.4s, v10.s[2]
        FMLA v31.4s, v17.4s, v11.s[2]

        FMLA v20.4s, v18.4s,  v6.s[3]
        FMLA v22.4s, v18.4s,  v7.s[3]
        FMLA v24.4s, v18.4s,  v8.s[3]
        FMLA v26.4s, v18.4s,  v9.s[3]
        FMLA v28.4s, v18.4s, v10.s[3]
        FMLA v30.4s, v18.4s, v11.s[3]
        FMLA v21.4s, v19.4s,  v6.s[3]
        FMLA v23.4s, v19.4s,  v7.s[3]

        # Load clamping_params values
        # v6 / v7 are free from here on: the last FMLAs reading them as A
        # are the two just above.  The first float at x8 is replicated into
        # v6 (used by FMIN), the second into v7 (used by FMAX).
        LD2R {v6.4s, v7.4s}, [x8]

        FMLA v25.4s, v19.4s,  v8.s[3]
        FMLA v27.4s, v19.4s,  v9.s[3]
        # Is there a remainder?- 4 floats of A (16 bytes) or less
        TST x0, 31
        FMLA v29.4s, v19.4s, v10.s[3]
        FMLA v31.4s, v19.4s, v11.s[3]
        B.NE 4f

        # Clamp
3:
        FMIN v20.4s, v20.4s, v6.4s
        SUBS x1, x1, 8
        FMIN v21.4s, v21.4s, v6.4s
        FMIN v22.4s, v22.4s, v6.4s
        FMIN v23.4s, v23.4s, v6.4s
        FMIN v24.4s, v24.4s, v6.4s
        FMIN v25.4s, v25.4s, v6.4s
        FMIN v26.4s, v26.4s, v6.4s
        FMIN v27.4s, v27.4s, v6.4s
        FMIN v28.4s, v28.4s, v6.4s
        FMIN v29.4s, v29.4s, v6.4s
        FMIN v30.4s, v30.4s, v6.4s
        FMIN v31.4s, v31.4s, v6.4s
        FMAX v20.4s, v20.4s, v7.4s
        FMAX v21.4s, v21.4s, v7.4s
        FMAX v22.4s, v22.4s, v7.4s
        FMAX v23.4s, v23.4s, v7.4s
        FMAX v24.4s, v24.4s, v7.4s
        FMAX v25.4s, v25.4s, v7.4s
        FMAX v26.4s, v26.4s, v7.4s
        FMAX v27.4s, v27.4s, v7.4s
        FMAX v28.4s, v28.4s, v7.4s
        FMAX v29.4s, v29.4s, v7.4s
        FMAX v30.4s, v30.4s, v7.4s
        FMAX v31.4s, v31.4s, v7.4s

        # Store full 6 x 8
        B.LO 7f                  // nc < 8: partial store (flags from SUBS x1 above)

        $if INC:
          STP q30, q31,  [x7]
          ADD x7, x7, x14
          SUB  x3,  x3, x2 // a0 -= kc
          STP q28, q29, [x13]
          ADD x13, x13, x14
          SUB  x9,  x9, x2 // a1 -= kc
          STP q26, q27, [x18]
          ADD x18, x18, x14
          SUB x10, x10, x2 // a2 -= kc
          STP q24, q25, [x17]
          ADD x17, x17, x14
          SUB x11, x11, x2 // a3 -= kc
          STP q22, q23, [x16]
          ADD x16, x16, x14
          SUB x12, x12, x2 // a4 -= kc
          STP q20, q21,  [x6]
          ADD  x6,  x6, x14
          SUB  x4,  x4, x2 // a5 -= kc
        $else:
          STP q20, q21,  [x6]
          ADD  x6,  x6, x14
          SUB  x3,  x3, x2 // a0 -= kc
          STP q22, q23, [x16]
          ADD x16, x16, x14
          SUB  x9,  x9, x2 // a1 -= kc
          STP q24, q25, [x17]
          ADD x17, x17, x14
          SUB x10, x10, x2 // a2 -= kc
          STP q26, q27, [x18]
          ADD x18, x18, x14
          SUB x11, x11, x2 // a3 -= kc
          STP q28, q29, [x13]
          ADD x13, x13, x14
          SUB x12, x12, x2 // a4 -= kc
          STP q30, q31,  [x7]
          ADD x7, x7, x14
          SUB  x4,  x4, x2 // a5 -= kc

        B.HI 0b                  // columns left (flags still from SUBS x1; ADD/SUB/STP keep them)

        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP  d8,  d9, [sp], 64
        RET

        # Remainder of kc.  Reached either straight from the kc < 32 check or
        # after the epilogue.  x0 has only had multiples of 32 subtracted from
        # kc, so its low 5 bits equal those of kc and bits 4, 3 and 2 select
        # the 16, 8 and 4 byte remainders.
4:
        # Load clamping_params values
        LD2R {v6.4s, v7.4s}, [x8]

        # Is there a remainder?- 4 floats of A (16 bytes)
        TBZ x0, 4, 5f

        # Remainder- 4 floats of A (16 bytes)
        # Load A
        LDR   q0,  [x3], 16
        LDR   q1,  [x9], 16
        LDR   q2, [x10], 16
        LDR   q3, [x11], 16
        LDR   q4, [x12], 16
        LDR   q5,  [x4], 16
        # Load B
        LDP  q12,  q13, [x5], 32
        LDP  q14,  q15, [x5], 32
        LDP  q16,  q17, [x5], 32
        LDP  q18,  q19, [x5], 32

        FMLA v20.4s, v12.4s,  v0.s[0]
        FMLA v22.4s, v12.4s,  v1.s[0]
        FMLA v24.4s, v12.4s,  v2.s[0]
        FMLA v26.4s, v12.4s,  v3.s[0]
        FMLA v28.4s, v12.4s,  v4.s[0]
        FMLA v30.4s, v12.4s,  v5.s[0]
        FMLA v21.4s, v13.4s,  v0.s[0]
        FMLA v23.4s, v13.4s,  v1.s[0]
        FMLA v25.4s, v13.4s,  v2.s[0]
        FMLA v27.4s, v13.4s,  v3.s[0]
        FMLA v29.4s, v13.4s,  v4.s[0]
        FMLA v31.4s, v13.4s,  v5.s[0]

        FMLA v20.4s, v14.4s,  v0.s[1]
        FMLA v22.4s, v14.4s,  v1.s[1]
        FMLA v24.4s, v14.4s,  v2.s[1]
        FMLA v26.4s, v14.4s,  v3.s[1]
        FMLA v28.4s, v14.4s,  v4.s[1]
        FMLA v30.4s, v14.4s,  v5.s[1]
        FMLA v21.4s, v15.4s,  v0.s[1]
        FMLA v23.4s, v15.4s,  v1.s[1]
        FMLA v25.4s, v15.4s,  v2.s[1]
        FMLA v27.4s, v15.4s,  v3.s[1]
        FMLA v29.4s, v15.4s,  v4.s[1]
        FMLA v31.4s, v15.4s,  v5.s[1]

        FMLA v20.4s, v16.4s,  v0.s[2]
        FMLA v22.4s, v16.4s,  v1.s[2]
        FMLA v24.4s, v16.4s,  v2.s[2]
        FMLA v26.4s, v16.4s,  v3.s[2]
        FMLA v28.4s, v16.4s,  v4.s[2]
        FMLA v30.4s, v16.4s,  v5.s[2]
        FMLA v21.4s, v17.4s,  v0.s[2]
        FMLA v23.4s, v17.4s,  v1.s[2]
        FMLA v25.4s, v17.4s,  v2.s[2]
        FMLA v27.4s, v17.4s,  v3.s[2]
        FMLA v29.4s, v17.4s,  v4.s[2]
        FMLA v31.4s, v17.4s,  v5.s[2]

        FMLA v20.4s, v18.4s,  v0.s[3]
        FMLA v22.4s, v18.4s,  v1.s[3]
        FMLA v24.4s, v18.4s,  v2.s[3]
        FMLA v26.4s, v18.4s,  v3.s[3]
        FMLA v28.4s, v18.4s,  v4.s[3]
        FMLA v30.4s, v18.4s,  v5.s[3]
        FMLA v21.4s, v19.4s,  v0.s[3]
        FMLA v23.4s, v19.4s,  v1.s[3]
        FMLA v25.4s, v19.4s,  v2.s[3]
        FMLA v27.4s, v19.4s,  v3.s[3]
        FMLA v29.4s, v19.4s,  v4.s[3]
        FMLA v31.4s, v19.4s,  v5.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes)
5:
        TBZ x0, 3, 6f

        # Remainder- 2 floats of A (8 bytes)
        # Load A
        LDR   d0,  [x3], 8
        LDR   d1,  [x9], 8
        LDR   d2, [x10], 8
        LDR   d3, [x11], 8
        LDR   d4, [x12], 8
        LDR   d5,  [x4], 8
        # Load B
        LDP  q12,  q13, [x5], 32
        LDP  q14,  q15, [x5], 32

        FMLA v20.4s, v12.4s,  v0.s[0]
        FMLA v22.4s, v12.4s,  v1.s[0]
        FMLA v24.4s, v12.4s,  v2.s[0]
        FMLA v26.4s, v12.4s,  v3.s[0]
        FMLA v28.4s, v12.4s,  v4.s[0]
        FMLA v30.4s, v12.4s,  v5.s[0]
        FMLA v21.4s, v13.4s,  v0.s[0]
        FMLA v23.4s, v13.4s,  v1.s[0]
        FMLA v25.4s, v13.4s,  v2.s[0]
        FMLA v27.4s, v13.4s,  v3.s[0]
        FMLA v29.4s, v13.4s,  v4.s[0]
        FMLA v31.4s, v13.4s,  v5.s[0]

        FMLA v20.4s, v14.4s,  v0.s[1]
        FMLA v22.4s, v14.4s,  v1.s[1]
        FMLA v24.4s, v14.4s,  v2.s[1]
        FMLA v26.4s, v14.4s,  v3.s[1]
        FMLA v28.4s, v14.4s,  v4.s[1]
        FMLA v30.4s, v14.4s,  v5.s[1]
        FMLA v21.4s, v15.4s,  v0.s[1]
        FMLA v23.4s, v15.4s,  v1.s[1]
        FMLA v25.4s, v15.4s,  v2.s[1]
        FMLA v27.4s, v15.4s,  v3.s[1]
        FMLA v29.4s, v15.4s,  v4.s[1]
        FMLA v31.4s, v15.4s,  v5.s[1]

        # Is there a remainder?- 1 float of A (4 bytes)
6:
        TBZ x0, 2, 3b

        # Remainder- 1 float of A (4 bytes)
        # Load A
        LDR   s0,  [x3], 4
        LDR   s1,  [x9], 4
        LDR   s2, [x10], 4
        LDR   s3, [x11], 4
        LDR   s4, [x12], 4
        LDR   s5,  [x4], 4
        # Load B
        LDP  q12,  q13, [x5], 32

        FMLA v20.4s, v12.4s,  v0.s[0]
        FMLA v22.4s, v12.4s,  v1.s[0]
        FMLA v24.4s, v12.4s,  v2.s[0]
        FMLA v26.4s, v12.4s,  v3.s[0]
        FMLA v28.4s, v12.4s,  v4.s[0]
        FMLA v30.4s, v12.4s,  v5.s[0]
        FMLA v21.4s, v13.4s,  v0.s[0]
        FMLA v23.4s, v13.4s,  v1.s[0]
        FMLA v25.4s, v13.4s,  v2.s[0]
        FMLA v27.4s, v13.4s,  v3.s[0]
        FMLA v29.4s, v13.4s,  v4.s[0]
        FMLA v31.4s, v13.4s,  v5.s[0]
        B 3b

        # Store odd width
        # x1 holds nc - 8 here; its low 3 bits equal those of nc, so bits
        # 2, 1 and 0 select the 4, 2 and 1 float stores.  After each store the
        # not yet written lanes are moved down to lane 0.
7:
        TBZ x1, 2, 8f
        $if INC:
          STR q30,  [x7], 16
          MOV v30.16b, v31.16b
          STR q28, [x13], 16
          MOV v28.16b, v29.16b
          STR q26, [x18], 16
          MOV v26.16b, v27.16b
          STR q24, [x17], 16
          MOV v24.16b, v25.16b
          STR q22, [x16], 16
          MOV v22.16b, v23.16b
          STR q20,  [x6], 16
          MOV v20.16b, v21.16b
        $else:
          STR q20,  [x6], 16
          MOV v20.16b, v21.16b
          STR q22, [x16], 16
          MOV v22.16b, v23.16b
          STR q24, [x17], 16
          MOV v24.16b, v25.16b
          STR q26, [x18], 16
          MOV v26.16b, v27.16b
          STR q28, [x13], 16
          MOV v28.16b, v29.16b
          STR q30,  [x7], 16
          MOV v30.16b, v31.16b
8:
        TBZ x1, 1, 9f
        $if INC:
          STR d30,  [x7], 8
          DUP d30, v30.d[1]
          STR d28, [x13], 8
          DUP d28, v28.d[1]
          STR d26, [x18], 8
          DUP d26, v26.d[1]
          STR d24, [x17], 8
          DUP d24, v24.d[1]
          STR d22, [x16], 8
          DUP d22, v22.d[1]
          STR d20,  [x6], 8
          DUP d20, v20.d[1]
        $else:
          STR d20,  [x6], 8
          DUP d20, v20.d[1]
          STR d22, [x16], 8
          DUP d22, v22.d[1]
          STR d24, [x17], 8
          DUP d24, v24.d[1]
          STR d26, [x18], 8
          DUP d26, v26.d[1]
          STR d28, [x13], 8
          DUP d28, v28.d[1]
          STR d30,  [x7], 8
          DUP d30, v30.d[1]

9:
        TBZ x1, 0, 10f
        $if INC:
          STR s30,  [x7]
          STR s28, [x13]
          STR s26, [x18]
          STR s24, [x17]
          STR s22, [x16]
          STR s20,  [x6]
        $else:
          STR s20,  [x6]
          STR s22, [x16]
          STR s24, [x17]
          STR s26, [x18]
          STR s28, [x13]
          STR s30,  [x7]
10:
        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP  d8,  d9, [sp], 64
        RET

END_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_ukernel_6x8__aarch64_neonfma_cortex_${"a75" if PREFETCH else "a57"}

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif
