// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemm${"inc" if INC else ""}_ukernel_6x8__aarch64_neonfma_cortex_a73(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> (x0)
$if INC:
  #     const float*restrict acc,  [sp + 8] -> x15
  #     const union xnn_f32_output_params params[restrict static 1])  [sp + 16] -> x8
$else:
  #     const union xnn_f32_output_params params[restrict static 1])  [sp + 8] -> x8

# Overview
#   Computes a tile of up to 6 rows by 8 columns of floats per pass over kc.
#   Accumulators start from acc (INC variant) or from the 8 bias floats at the
#   head of w (non-INC variant), are updated with FMLA, clamped with the two
#   floats at params, and stored to the six C row pointers.
#   Rows beyond mr alias the previous row, so they are computed and stored
#   redundantly rather than skipped.

# d8-d15 need to be preserved if used.
# x19-30 need to be preserved if used.
# x18 is the AAPCS64 platform register and is intentionally not used.

# Scalar register usage
#  x0 mr on entry, then k (remaining bytes of kc), then cn_stride while storing
#  x1 nc (remaining columns)
#  x2 kc (bytes)
#  x5 w
#  x8 params
$if INC:
  # x15 acc

# A pointers
#  x3 a0
#  x9 a1
# x10 a2
# x11 a3
# x12 a4
#  x4 a5

# C pointers
#  x6 c0
# x16 c1
# x17 c2
# x14 c3
# x13 c4
#  x7 c5

# Vector register usage
# A0   v0  v6
# A1   v1  v7
# A2   v2  v8
# A3   v3  v9
# A4   v4 v10
# A5   v5 v11
# B   v12 v13 v14 v15
# B   v16 v17 v18 v19
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# C   v30 v31
# Clamp v6 v7

BEGIN_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_ukernel_6x8__aarch64_neonfma_cortex_a73

        # Clamp A and C pointers / Save d8-d15 on stack
        # The 64-byte push moves the stack arguments to [sp, 64] and above.
        STP  d8,  d9, [sp, -64]!
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        STP d10, d11, [sp, 16]
        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        STP d12, d13, [sp, 32]
        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x14, x17, x7         // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x14, x17, x14, LO   //   c3 = c2

        STP d14, d15, [sp, 48]
        ADD x12, x11, x4         // a4 = a3 + a_stride
        ADD x13, x14, x7         // c4 = c3 + cm_stride
                                 // if mr <= 4
        CSEL x12, x11, x12, LS   //   a4 = a3
        CSEL x13, x14, x13, LS   //   c4 = c3

        $if INC:
          # Load acc, params pointer
          LDP x15, x8, [sp, 72]
        $else:
          # Load params pointer
          LDR x8, [sp, 72]

        CMP x0, 6                // if mr < 6
        ADD x4, x12, x4          // a5 = a4 + a_stride
        ADD x7, x13, x7          // c5 = c4 + cm_stride
        CSEL x4, x12, x4, LO     //   a5 = a4
        CSEL x7, x13, x7, LO     //   c5 = c4

        # cn_stride stays on the stack at [sp, 64]; it is loaded into x0 just
        # before the full-tile store, once the k counter in x0 is dead.

        # Outer loop over nc: one 6x8 output tile per iteration.
        .p2align 3
0:
        $if INC:
          # Load initial accumulators
          LDP q20, q21, [x15], 32
          LDP q22, q23, [x15], 32
          LDP q24, q25, [x15], 32
          LDP q26, q27, [x15], 32
          LDP q28, q29, [x15], 32
          LDP q30, q31, [x15], 32
          PRFM PLDL1KEEP, [x5, 0]  // Prefetch B
          PRFM PLDL1KEEP, [x5, 64]
          PRFM PLDL1KEEP, [x5, 128]
          PRFM PLDL1KEEP, [x5, 192]
          PRFM PLDL1KEEP,  [x3]    // Prefetch A
          PRFM PLDL1KEEP,  [x9]
          PRFM PLDL1KEEP, [x10]
          PRFM PLDL1KEEP, [x11]
          PRFM PLDL1KEEP, [x12]
          PRFM PLDL1KEEP,  [x4]
        $else:
          # Load initial bias from w into accumulators
          # The same 8 bias floats seed all six rows.
          LDP q20, q21, [x5], 32
          MOV v22.16b, v20.16b
          PRFM PLDL1KEEP, [x5, 0]  // Prefetch B
          MOV v23.16b, v21.16b
          PRFM PLDL1KEEP, [x5, 64]
          MOV v24.16b, v20.16b
          PRFM PLDL1KEEP, [x5, 128]
          MOV v25.16b, v21.16b
          PRFM PLDL1KEEP, [x5, 192]
          MOV v26.16b, v20.16b
          PRFM PLDL1KEEP,  [x3]    // Prefetch A
          MOV v27.16b, v21.16b
          PRFM PLDL1KEEP,  [x9]
          MOV v28.16b, v20.16b
          PRFM PLDL1KEEP, [x10]
          MOV v29.16b, v21.16b
          PRFM PLDL1KEEP, [x11]
          MOV v30.16b, v20.16b
          PRFM PLDL1KEEP, [x12]
          MOV v31.16b, v21.16b
          PRFM PLDL1KEEP,  [x4]

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        SUBS x0, x2, 32  // k = kc - 32
        B.LO 4f

        # Prologue - loads for main loop of 96 FMA
        # load A0 to A4 but not A5
        LDP  q0,  q6,  [x3], 32
        LDP  q1,  q7,  [x9], 32
        LDP  q2,  q8, [x10], 32
        LDP  q3,  q9, [x11], 32
        LDP  q4,  q10, [x12], 32
        # load first set of B
        LDP q12, q13, [x5], 32
        LDP q14, q15, [x5], 32

        # Is there at least 8 floats (32 bytes) for main loop?
        SUBS x0, x0, 32
        B.LO 2f

        # Main loop - 8 floats of A (32 bytes)
        # 96 FMA + 6 LDP A + 8 LDP B
        .p2align 3
1:
        # First group of 4 A.  48 FMA.  Loads A5

        LDP  q5, q11, [x4], 32
        FMLA v20.4s, v12.4s,  v0.s[0]
        FMLA v22.4s, v12.4s,  v1.s[0]
        LDP  q16,  q17, [x5], 32
        FMLA v24.4s, v12.4s,  v2.s[0]
        FMLA v26.4s, v12.4s,  v3.s[0]
        LDP  q18,  q19, [x5], 32
        FMLA v28.4s, v12.4s,  v4.s[0]
        FMLA v30.4s, v12.4s,  v5.s[0]
        FMLA v21.4s, v13.4s,  v0.s[0]
        FMLA v23.4s, v13.4s,  v1.s[0]
        FMLA v25.4s, v13.4s,  v2.s[0]
        FMLA v27.4s, v13.4s,  v3.s[0]
        FMLA v29.4s, v13.4s,  v4.s[0]
        FMLA v31.4s, v13.4s,  v5.s[0]

        FMLA v20.4s, v14.4s,  v0.s[1]
        FMLA v22.4s, v14.4s,  v1.s[1]
        FMLA v24.4s, v14.4s,  v2.s[1]
        FMLA v26.4s, v14.4s,  v3.s[1]
        FMLA v28.4s, v14.4s,  v4.s[1]
        FMLA v30.4s, v14.4s,  v5.s[1]
        FMLA v21.4s, v15.4s,  v0.s[1]
        FMLA v23.4s, v15.4s,  v1.s[1]
        FMLA v25.4s, v15.4s,  v2.s[1]
        FMLA v27.4s, v15.4s,  v3.s[1]
        FMLA v29.4s, v15.4s,  v4.s[1]
        FMLA v31.4s, v15.4s,  v5.s[1]

        LDP  q12,  q13, [x5], 32
        FMLA v20.4s, v16.4s,  v0.s[2]
        FMLA v22.4s, v16.4s,  v1.s[2]
        LDP  q14,  q15, [x5], 32
        FMLA v24.4s, v16.4s,  v2.s[2]
        FMLA v26.4s, v16.4s,  v3.s[2]
        PRFM PLDL1KEEP, [x5, 128]      // Prefetch B
        FMLA v28.4s, v16.4s,  v4.s[2]
        FMLA v30.4s, v16.4s,  v5.s[2]
        PRFM PLDL1KEEP, [x5, 256]
        FMLA v21.4s, v17.4s,  v0.s[2]
        FMLA v23.4s, v17.4s,  v1.s[2]
        FMLA v25.4s, v17.4s,  v2.s[2]
        FMLA v27.4s, v17.4s,  v3.s[2]
        FMLA v29.4s, v17.4s,  v4.s[2]
        FMLA v31.4s, v17.4s,  v5.s[2]

        FMLA v20.4s, v18.4s,  v0.s[3]
        FMLA v22.4s, v18.4s,  v1.s[3]
        FMLA v24.4s, v18.4s,  v2.s[3]
        FMLA v26.4s, v18.4s,  v3.s[3]
        FMLA v28.4s, v18.4s,  v4.s[3]
        FMLA v30.4s, v18.4s,  v5.s[3]
        FMLA v21.4s, v19.4s,  v0.s[3]
        FMLA v23.4s, v19.4s,  v1.s[3]
        FMLA v25.4s, v19.4s,  v2.s[3]
        FMLA v27.4s, v19.4s,  v3.s[3]
        FMLA v29.4s, v19.4s,  v4.s[3]
        FMLA v31.4s, v19.4s,  v5.s[3]

        # Second group of 4 A.  48 FMA.  Loads A0 - A4

        LDP  q16,  q17, [x5], 32
        FMLA v20.4s, v12.4s,  v6.s[0]
        FMLA v22.4s, v12.4s,  v7.s[0]
        LDP  q18,  q19, [x5], 32
        FMLA v24.4s, v12.4s,  v8.s[0]
        FMLA v26.4s, v12.4s,  v9.s[0]
        FMLA v28.4s, v12.4s, v10.s[0]
        FMLA v30.4s, v12.4s, v11.s[0]
        FMLA v21.4s, v13.4s,  v6.s[0]
        FMLA v23.4s, v13.4s,  v7.s[0]
        FMLA v25.4s, v13.4s,  v8.s[0]
        FMLA v27.4s, v13.4s,  v9.s[0]
        FMLA v29.4s, v13.4s, v10.s[0]
        FMLA v31.4s, v13.4s, v11.s[0]

        FMLA v20.4s, v14.4s,  v6.s[1]
        FMLA v22.4s, v14.4s,  v7.s[1]
        FMLA v24.4s, v14.4s,  v8.s[1]
        FMLA v26.4s, v14.4s,  v9.s[1]
        FMLA v28.4s, v14.4s, v10.s[1]
        FMLA v30.4s, v14.4s, v11.s[1]
        FMLA v21.4s, v15.4s,  v6.s[1]
        FMLA v23.4s, v15.4s,  v7.s[1]
        FMLA v25.4s, v15.4s,  v8.s[1]
        FMLA v27.4s, v15.4s,  v9.s[1]
        FMLA v29.4s, v15.4s, v10.s[1]
        FMLA v31.4s, v15.4s, v11.s[1]

        # The remaining FMAs are ordered row by row so that each A register
        # pair is reloaded for the next iteration right after its last use.
        LDP  q12,  q13, [x5], 32
        FMLA v20.4s, v16.4s,  v6.s[2]
        FMLA v20.4s, v18.4s,  v6.s[3]
        LDP  q14,  q15, [x5], 32
        FMLA v21.4s, v17.4s,  v6.s[2]
        FMLA v21.4s, v19.4s,  v6.s[3]
        LDP  q0,  q6, [x3], 32
        FMLA v22.4s, v16.4s,  v7.s[2]
        FMLA v22.4s, v18.4s,  v7.s[3]
        FMLA v23.4s, v17.4s,  v7.s[2]
        FMLA v23.4s, v19.4s,  v7.s[3]
        LDP  q1,  q7, [x9], 32
        FMLA v24.4s, v16.4s,  v8.s[2]
        FMLA v24.4s, v18.4s,  v8.s[3]
        FMLA v25.4s, v17.4s,  v8.s[2]
        FMLA v25.4s, v19.4s,  v8.s[3]
        LDP  q2,  q8, [x10], 32
        FMLA v26.4s, v16.4s,  v9.s[2]
        FMLA v26.4s, v18.4s,  v9.s[3]
        FMLA v27.4s, v17.4s,  v9.s[2]
        FMLA v27.4s, v19.4s,  v9.s[3]
        LDP  q3,  q9, [x11], 32
        FMLA v28.4s, v16.4s, v10.s[2]
        FMLA v28.4s, v18.4s, v10.s[3]
        FMLA v29.4s, v17.4s, v10.s[2]
        FMLA v29.4s, v19.4s, v10.s[3]
        LDP  q4,  q10, [x12], 32
        FMLA v30.4s, v16.4s, v11.s[2]
        FMLA v30.4s, v18.4s, v11.s[3]
        SUBS x0, x0, 32
        FMLA v31.4s, v17.4s, v11.s[2]
        FMLA v31.4s, v19.4s, v11.s[3]
        B.HS 1b

        # Epilogue - 8 floats of A (32 bytes)
        # 96 FMA + 1 LDP A + 6 LDP B
        # First block same as main loop.  Second block has no preloads.
2:
        # First group of 4 A.  48 FMA.  Loads A5

        LDP  q5, q11, [x4], 32
        FMLA v20.4s, v12.4s,  v0.s[0]
        FMLA v22.4s, v12.4s,  v1.s[0]
        LDP  q16,  q17, [x5], 32
        FMLA v24.4s, v12.4s,  v2.s[0]
        FMLA v26.4s, v12.4s,  v3.s[0]
        LDP  q18,  q19, [x5], 32
        FMLA v28.4s, v12.4s,  v4.s[0]
        FMLA v30.4s, v12.4s,  v5.s[0]
        FMLA v21.4s, v13.4s,  v0.s[0]
        FMLA v23.4s, v13.4s,  v1.s[0]
        FMLA v25.4s, v13.4s,  v2.s[0]
        FMLA v27.4s, v13.4s,  v3.s[0]
        FMLA v29.4s, v13.4s,  v4.s[0]
        FMLA v31.4s, v13.4s,  v5.s[0]

        FMLA v20.4s, v14.4s,  v0.s[1]
        FMLA v22.4s, v14.4s,  v1.s[1]
        FMLA v24.4s, v14.4s,  v2.s[1]
        FMLA v26.4s, v14.4s,  v3.s[1]
        FMLA v28.4s, v14.4s,  v4.s[1]
        FMLA v30.4s, v14.4s,  v5.s[1]
        FMLA v21.4s, v15.4s,  v0.s[1]
        FMLA v23.4s, v15.4s,  v1.s[1]
        FMLA v25.4s, v15.4s,  v2.s[1]
        FMLA v27.4s, v15.4s,  v3.s[1]
        FMLA v29.4s, v15.4s,  v4.s[1]
        FMLA v31.4s, v15.4s,  v5.s[1]

        LDP  q12,  q13, [x5], 32
        FMLA v20.4s, v16.4s,  v0.s[2]
        FMLA v22.4s, v16.4s,  v1.s[2]
        LDP  q14,  q15, [x5], 32
        FMLA v24.4s, v16.4s,  v2.s[2]
        FMLA v26.4s, v16.4s,  v3.s[2]
        FMLA v28.4s, v16.4s,  v4.s[2]
        FMLA v30.4s, v16.4s,  v5.s[2]
        FMLA v21.4s, v17.4s,  v0.s[2]
        FMLA v23.4s, v17.4s,  v1.s[2]
        FMLA v25.4s, v17.4s,  v2.s[2]
        FMLA v27.4s, v17.4s,  v3.s[2]
        FMLA v29.4s, v17.4s,  v4.s[2]
        FMLA v31.4s, v17.4s,  v5.s[2]

        FMLA v20.4s, v18.4s,  v0.s[3]
        FMLA v22.4s, v18.4s,  v1.s[3]
        FMLA v24.4s, v18.4s,  v2.s[3]
        FMLA v26.4s, v18.4s,  v3.s[3]
        FMLA v28.4s, v18.4s,  v4.s[3]
        FMLA v30.4s, v18.4s,  v5.s[3]
        FMLA v21.4s, v19.4s,  v0.s[3]
        FMLA v23.4s, v19.4s,  v1.s[3]
        FMLA v25.4s, v19.4s,  v2.s[3]
        FMLA v27.4s, v19.4s,  v3.s[3]
        FMLA v29.4s, v19.4s,  v4.s[3]
        FMLA v31.4s, v19.4s,  v5.s[3]

        # Second group of 4 A.  48 FMA. No A Loads, No last B load

        LDP  q16,  q17, [x5], 32
        FMLA v20.4s, v12.4s,  v6.s[0]
        FMLA v22.4s, v12.4s,  v7.s[0]
        LDP  q18,  q19, [x5], 32
        FMLA v24.4s, v12.4s,  v8.s[0]
        FMLA v26.4s, v12.4s,  v9.s[0]
        FMLA v28.4s, v12.4s, v10.s[0]
        FMLA v30.4s, v12.4s, v11.s[0]
        FMLA v21.4s, v13.4s,  v6.s[0]
        FMLA v23.4s, v13.4s,  v7.s[0]
        FMLA v25.4s, v13.4s,  v8.s[0]
        FMLA v27.4s, v13.4s,  v9.s[0]
        FMLA v29.4s, v13.4s, v10.s[0]
        FMLA v31.4s, v13.4s, v11.s[0]

        FMLA v20.4s, v14.4s,  v6.s[1]
        FMLA v22.4s, v14.4s,  v7.s[1]
        FMLA v24.4s, v14.4s,  v8.s[1]
        FMLA v26.4s, v14.4s,  v9.s[1]
        FMLA v28.4s, v14.4s, v10.s[1]
        FMLA v30.4s, v14.4s, v11.s[1]
        FMLA v21.4s, v15.4s,  v6.s[1]
        FMLA v23.4s, v15.4s,  v7.s[1]
        FMLA v25.4s, v15.4s,  v8.s[1]
        FMLA v27.4s, v15.4s,  v9.s[1]
        FMLA v29.4s, v15.4s, v10.s[1]
        FMLA v31.4s, v15.4s, v11.s[1]

        # Last part of epilogue has loads removed.

        FMLA v20.4s, v16.4s,  v6.s[2]
        FMLA v22.4s, v16.4s,  v7.s[2]
        FMLA v24.4s, v16.4s,  v8.s[2]
        FMLA v26.4s, v16.4s,  v9.s[2]
        FMLA v28.4s, v16.4s, v10.s[2]
        FMLA v30.4s, v16.4s, v11.s[2]
        FMLA v21.4s, v17.4s,  v6.s[2]
        FMLA v23.4s, v17.4s,  v7.s[2]
        FMLA v25.4s, v17.4s,  v8.s[2]
        FMLA v27.4s, v17.4s,  v9.s[2]
        FMLA v29.4s, v17.4s, v10.s[2]
        FMLA v31.4s, v17.4s, v11.s[2]

        FMLA v20.4s, v18.4s,  v6.s[3]
        FMLA v22.4s, v18.4s,  v7.s[3]
        FMLA v24.4s, v18.4s,  v8.s[3]
        FMLA v26.4s, v18.4s,  v9.s[3]
        FMLA v28.4s, v18.4s, v10.s[3]
        FMLA v30.4s, v18.4s, v11.s[3]
        FMLA v21.4s, v19.4s,  v6.s[3]
        FMLA v23.4s, v19.4s,  v7.s[3]

        # Load clamping_params values
        # v6 and v7 are overwritten here, after their last use as A0 and A1.
        LD2R {v6.4s, v7.4s}, [x8]

        FMLA v25.4s, v19.4s,  v8.s[3]
        FMLA v27.4s, v19.4s,  v9.s[3]
        # Is there a remainder?- 4 floats of A (16 bytes) or less
        TST x0, 31
        FMLA v29.4s, v19.4s, v10.s[3]
        FMLA v31.4s, v19.4s, v11.s[3]
        B.NE 4f

        .p2align 3

        # Clamp
        # v6 is applied as the upper bound (FMIN), v7 as the lower bound (FMAX).
3:
        SUBS x1, x1, 8
        FMIN v20.4s, v20.4s, v6.4s
        FMIN v21.4s, v21.4s, v6.4s
        FMIN v22.4s, v22.4s, v6.4s
        FMIN v23.4s, v23.4s, v6.4s
        FMIN v24.4s, v24.4s, v6.4s
        FMIN v25.4s, v25.4s, v6.4s
        FMIN v26.4s, v26.4s, v6.4s
        FMIN v27.4s, v27.4s, v6.4s
        FMIN v28.4s, v28.4s, v6.4s
        FMIN v29.4s, v29.4s, v6.4s
        FMIN v30.4s, v30.4s, v6.4s
        FMIN v31.4s, v31.4s, v6.4s
        FMAX v20.4s, v20.4s, v7.4s
        FMAX v21.4s, v21.4s, v7.4s
        FMAX v22.4s, v22.4s, v7.4s
        FMAX v23.4s, v23.4s, v7.4s
        FMAX v24.4s, v24.4s, v7.4s
        FMAX v25.4s, v25.4s, v7.4s
        FMAX v26.4s, v26.4s, v7.4s
        FMAX v27.4s, v27.4s, v7.4s
        FMAX v28.4s, v28.4s, v7.4s
        FMAX v29.4s, v29.4s, v7.4s
        FMAX v30.4s, v30.4s, v7.4s
        FMAX v31.4s, v31.4s, v7.4s

        # Store full 6 x 8
        # x0 is free here: k is recomputed from kc at label 0.
        # LDR leaves the flags from SUBS x1 intact for both branches below.
        LDR x0, [sp, 64]         // Load cn_stride
        B.LO 7f

        $if INC:
          STP q30, q31,  [x7]
          ADD x7, x7, x0
          SUB  x3,  x3, x2 // a0 -= kc
          STP q28, q29, [x13]
          ADD x13, x13, x0
          SUB  x9,  x9, x2 // a1 -= kc
          STP q26, q27, [x14]
          ADD x14, x14, x0
          SUB x10, x10, x2 // a2 -= kc
          STP q24, q25, [x17]
          ADD x17, x17, x0
          SUB x11, x11, x2 // a3 -= kc
          STP q22, q23, [x16]
          ADD x16, x16, x0
          SUB x12, x12, x2 // a4 -= kc
          STP q20, q21,  [x6]
          ADD  x6,  x6, x0
          SUB  x4,  x4, x2 // a5 -= kc
        $else:
          STP q20, q21,  [x6]
          ADD  x6,  x6, x0
          SUB  x3,  x3, x2 // a0 -= kc
          STP q22, q23, [x16]
          ADD x16, x16, x0
          SUB  x9,  x9, x2 // a1 -= kc
          STP q24, q25, [x17]
          ADD x17, x17, x0
          SUB x10, x10, x2 // a2 -= kc
          STP q26, q27, [x14]
          ADD x14, x14, x0
          SUB x11, x11, x2 // a3 -= kc
          STP q28, q29, [x13]
          ADD x13, x13, x0
          SUB x12, x12, x2 // a4 -= kc
          STP q30, q31,  [x7]
          ADD x7, x7, x0
          SUB  x4,  x4, x2 // a5 -= kc

        NOP
        B.HI 0b

        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP  d8,  d9, [sp], 64
        RET

        # Remainder of kc below 8 floats.  Bits 4, 3 and 2 of x0 match those of
        # kc because only multiples of 32 have been subtracted from it.
        .p2align 3
4:
        # Load clamping_params values
        LD2R {v6.4s, v7.4s}, [x8]

        # Is there a remainder?- 4 floats of A (16 bytes)
        TBZ x0, 4, 5f

        # Remainder- 4 floats of A (16 bytes)
        # Load A
        LDR   q0,  [x3], 16
        LDR   q1,  [x9], 16
        LDR   q2, [x10], 16
        LDR   q3, [x11], 16
        LDR   q4, [x12], 16
        LDR   q5,  [x4], 16
        # Load B
        LDP  q12,  q13, [x5], 32
        LDP  q14,  q15, [x5], 32
        LDP  q16,  q17, [x5], 32
        LDP  q18,  q19, [x5], 32

        FMLA v20.4s, v12.4s,  v0.s[0]
        FMLA v22.4s, v12.4s,  v1.s[0]
        FMLA v24.4s, v12.4s,  v2.s[0]
        FMLA v26.4s, v12.4s,  v3.s[0]
        FMLA v28.4s, v12.4s,  v4.s[0]
        FMLA v30.4s, v12.4s,  v5.s[0]
        FMLA v21.4s, v13.4s,  v0.s[0]
        FMLA v23.4s, v13.4s,  v1.s[0]
        FMLA v25.4s, v13.4s,  v2.s[0]
        FMLA v27.4s, v13.4s,  v3.s[0]
        FMLA v29.4s, v13.4s,  v4.s[0]
        FMLA v31.4s, v13.4s,  v5.s[0]

        FMLA v20.4s, v14.4s,  v0.s[1]
        FMLA v22.4s, v14.4s,  v1.s[1]
        FMLA v24.4s, v14.4s,  v2.s[1]
        FMLA v26.4s, v14.4s,  v3.s[1]
        FMLA v28.4s, v14.4s,  v4.s[1]
        FMLA v30.4s, v14.4s,  v5.s[1]
        FMLA v21.4s, v15.4s,  v0.s[1]
        FMLA v23.4s, v15.4s,  v1.s[1]
        FMLA v25.4s, v15.4s,  v2.s[1]
        FMLA v27.4s, v15.4s,  v3.s[1]
        FMLA v29.4s, v15.4s,  v4.s[1]
        FMLA v31.4s, v15.4s,  v5.s[1]

        FMLA v20.4s, v16.4s,  v0.s[2]
        FMLA v22.4s, v16.4s,  v1.s[2]
        FMLA v24.4s, v16.4s,  v2.s[2]
        FMLA v26.4s, v16.4s,  v3.s[2]
        FMLA v28.4s, v16.4s,  v4.s[2]
        FMLA v30.4s, v16.4s,  v5.s[2]
        FMLA v21.4s, v17.4s,  v0.s[2]
        FMLA v23.4s, v17.4s,  v1.s[2]
        FMLA v25.4s, v17.4s,  v2.s[2]
        FMLA v27.4s, v17.4s,  v3.s[2]
        FMLA v29.4s, v17.4s,  v4.s[2]
        FMLA v31.4s, v17.4s,  v5.s[2]

        FMLA v20.4s, v18.4s,  v0.s[3]
        FMLA v22.4s, v18.4s,  v1.s[3]
        FMLA v24.4s, v18.4s,  v2.s[3]
        FMLA v26.4s, v18.4s,  v3.s[3]
        FMLA v28.4s, v18.4s,  v4.s[3]
        FMLA v30.4s, v18.4s,  v5.s[3]
        FMLA v21.4s, v19.4s,  v0.s[3]
        FMLA v23.4s, v19.4s,  v1.s[3]
        FMLA v25.4s, v19.4s,  v2.s[3]
        FMLA v27.4s, v19.4s,  v3.s[3]
        FMLA v29.4s, v19.4s,  v4.s[3]
        FMLA v31.4s, v19.4s,  v5.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes)
5:
        TBZ x0, 3, 6f

        # Remainder- 2 floats of A (8 bytes)
        # Load A
        LDR   d0,  [x3], 8
        LDR   d1,  [x9], 8
        LDR   d2, [x10], 8
        LDR   d3, [x11], 8
        LDR   d4, [x12], 8
        LDR   d5,  [x4], 8
        # Load B
        LDP  q12,  q13, [x5], 32
        LDP  q14,  q15, [x5], 32

        FMLA v20.4s, v12.4s,  v0.s[0]
        FMLA v22.4s, v12.4s,  v1.s[0]
        FMLA v24.4s, v12.4s,  v2.s[0]
        FMLA v26.4s, v12.4s,  v3.s[0]
        FMLA v28.4s, v12.4s,  v4.s[0]
        FMLA v30.4s, v12.4s,  v5.s[0]
        FMLA v21.4s, v13.4s,  v0.s[0]
        FMLA v23.4s, v13.4s,  v1.s[0]
        FMLA v25.4s, v13.4s,  v2.s[0]
        FMLA v27.4s, v13.4s,  v3.s[0]
        FMLA v29.4s, v13.4s,  v4.s[0]
        FMLA v31.4s, v13.4s,  v5.s[0]

        FMLA v20.4s, v14.4s,  v0.s[1]
        FMLA v22.4s, v14.4s,  v1.s[1]
        FMLA v24.4s, v14.4s,  v2.s[1]
        FMLA v26.4s, v14.4s,  v3.s[1]
        FMLA v28.4s, v14.4s,  v4.s[1]
        FMLA v30.4s, v14.4s,  v5.s[1]
        FMLA v21.4s, v15.4s,  v0.s[1]
        FMLA v23.4s, v15.4s,  v1.s[1]
        FMLA v25.4s, v15.4s,  v2.s[1]
        FMLA v27.4s, v15.4s,  v3.s[1]
        FMLA v29.4s, v15.4s,  v4.s[1]
        FMLA v31.4s, v15.4s,  v5.s[1]

        # Is there a remainder?- 1 float of A (4 bytes)
6:
        TBZ x0, 2, 3b

        # Remainder- 1 float of A (4 bytes)
        # Load A
        LDR   s0,  [x3], 4
        LDR   s1,  [x9], 4
        LDR   s2, [x10], 4
        LDR   s3, [x11], 4
        LDR   s4, [x12], 4
        LDR   s5,  [x4], 4
        # Load B
        LDP  q12,  q13, [x5], 32

        FMLA v20.4s, v12.4s,  v0.s[0]
        FMLA v22.4s, v12.4s,  v1.s[0]
        FMLA v24.4s, v12.4s,  v2.s[0]
        FMLA v26.4s, v12.4s,  v3.s[0]
        FMLA v28.4s, v12.4s,  v4.s[0]
        FMLA v30.4s, v12.4s,  v5.s[0]
        FMLA v21.4s, v13.4s,  v0.s[0]
        FMLA v23.4s, v13.4s,  v1.s[0]
        FMLA v25.4s, v13.4s,  v2.s[0]
        FMLA v27.4s, v13.4s,  v3.s[0]
        FMLA v29.4s, v13.4s,  v4.s[0]
        FMLA v31.4s, v13.4s,  v5.s[0]
        B 3b

        .p2align 3

        # Store odd width
        # Bits 2, 1 and 0 of x1 match those of the remaining nc, since exactly 8
        # was subtracted.  Each step stores the low lanes and shifts the rest down.
7:
        TBZ x1, 2, 8f
        $if INC:
          STR q30,  [x7], 16
          MOV v30.16b, v31.16b
          STR q28, [x13], 16
          MOV v28.16b, v29.16b
          STR q26, [x14], 16
          MOV v26.16b, v27.16b
          STR q24, [x17], 16
          MOV v24.16b, v25.16b
          STR q22, [x16], 16
          MOV v22.16b, v23.16b
          STR q20,  [x6], 16
          MOV v20.16b, v21.16b
        $else:
          STR q20,  [x6], 16
          MOV v20.16b, v21.16b
          STR q22, [x16], 16
          MOV v22.16b, v23.16b
          STR q24, [x17], 16
          MOV v24.16b, v25.16b
          STR q26, [x14], 16
          MOV v26.16b, v27.16b
          STR q28, [x13], 16
          MOV v28.16b, v29.16b
          STR q30,  [x7], 16
          MOV v30.16b, v31.16b
8:
        TBZ x1, 1, 9f
        $if INC:
          STR d30,  [x7], 8
          DUP d30, v30.d[1]
          STR d28, [x13], 8
          DUP d28, v28.d[1]
          STR d26, [x14], 8
          DUP d26, v26.d[1]
          STR d24, [x17], 8
          DUP d24, v24.d[1]
          STR d22, [x16], 8
          DUP d22, v22.d[1]
          STR d20,  [x6], 8
          DUP d20, v20.d[1]
        $else:
          STR d20,  [x6], 8
          DUP d20, v20.d[1]
          STR d22, [x16], 8
          DUP d22, v22.d[1]
          STR d24, [x17], 8
          DUP d24, v24.d[1]
          STR d26, [x14], 8
          DUP d26, v26.d[1]
          STR d28, [x13], 8
          DUP d28, v28.d[1]
          STR d30,  [x7], 8
          DUP d30, v30.d[1]

9:
        TBZ x1, 0, 10f
        $if INC:
          STR s30,  [x7]
          STR s28, [x13]
          STR s26, [x14]
          STR s24, [x17]
          STR s22, [x16]
          STR s20,  [x6]
        $else:
          STR s20,  [x6]
          STR s22, [x16]
          STR s24, [x17]
          STR s26, [x14]
          STR s28, [x13]
          STR s30,  [x7]
10:
        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP  d8,  d9, [sp], 64
        RET

END_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_ukernel_6x8__aarch64_neonfma_cortex_a73

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif