• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/6x8-aarch64-neonfma-cortex-a75.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
9
10#include <xnnpack/assembly.h>
11
# void xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a75(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> (x0)
#     const float*restrict acc,  [sp + 8] -> x15
#     const union xnn_f32_output_params params[restrict static 1])  [sp + 16] -> x8

// Summary of what the code below does:
//  - Builds six A row pointers and six C row pointers; rows at index >= mr
//    alias the previous row, so all six rows are always computed and stored.
//  - For every tile of 8 output columns: loads 6x8 initial accumulators from
//    acc (x15, post-incremented), accumulates A * W with FMLA over kc bytes of
//    each A row (kc is counted in bytes: 32 bytes = 8 floats per main-loop
//    pass, then 16/8/4 byte remainders), clamps, and stores to C.
//  - x0 is multiplexed: mr on entry, then the k byte counter inside a tile,
//    then cn_stride (reloaded from the stack) while the tile is stored.
//
// NOTE: this file is generated from the template named in the file header;
// any change made here needs to be mirrored in that template.

# d8-d15 need to be preserved if used.
# x19-30 need to be preserved if used.
# x18 is the AAPCS64 platform register (reserved on Apple and Windows) and is not used.

# A pointers
#  x3 a0
#  x9 a1
# x10 a2
# x11 a3
# x12 a4
#  x4 a5

# C pointers
#  x6 c0
# x16 c1
# x17 c2
# x14 c3
# x13 c4
#  x7 c5

# Vector register usage
# A0   v0  v6
# A1   v1  v7
# A2   v2  v8
# A3   v3  v9
# A4   v4 v10
# A5   v5 v11
# B   v12 v13 v14 v15
# B   v16 v17 v18 v19
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# C   v30 v31
# Clamp v6 v7
#   v6 = first float at params, applied with FMIN (upper bound)
#   v7 = second float at params, applied with FMAX (lower bound)

BEGIN_FUNCTION xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a75

        # Clamp A and C pointers / Save d8-d15 on stack
        // The pre-decrement moves sp down by 64, so the stack arguments are
        // found at [sp, 64] (cn_stride), [sp, 72] (acc) and [sp, 80] (params).
        STP  d8,  d9, [sp, -64]!
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        STP d10, d11, [sp, 16]
        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        STP d12, d13, [sp, 32]
        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x14, x17, x7         // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x14, x17, x14, LO   //   c3 = c2

        STP d14, d15, [sp, 48]
        ADD x12, x11, x4         // a4 = a3 + a_stride
        ADD x13, x14, x7         // c4 = c3 + cm_stride
                                 // if mr <= 4
        CSEL x12, x11, x12, LS   //   a4 = a3
        CSEL x13, x14, x13, LS   //   c4 = c3

        # Load acc, params pointer
        LDP x15, x8, [sp, 72]

        CMP x0, 6                // if mr < 6
        ADD x4, x12, x4          // a5 = a4 + a_stride
        ADD x7, x13, x7          // c5 = c4 + cm_stride
        CSEL x4, x12, x4, LO     //   a5 = a4
        CSEL x7, x13, x7, LO     //   c5 = c4

        // Start of one 8-column output tile.
0:
        # Load initial accumulators
        LDP q20, q21, [x15], 32
        LDP q22, q23, [x15], 32
        LDP q24, q25, [x15], 32
        LDP q26, q27, [x15], 32
        LDP q28, q29, [x15], 32
        LDP q30, q31, [x15], 32
        PRFM PLDL1KEEP, [x5, 0]  // Prefetch B
        PRFM PLDL1KEEP, [x5, 64]
        PRFM PLDL1KEEP, [x5, 128]
        PRFM PLDL1KEEP, [x5, 192]
        PRFM PLDL1KEEP,  [x3]    // Prefetch A
        PRFM PLDL1KEEP,  [x9]
        PRFM PLDL1KEEP, [x10]
        PRFM PLDL1KEEP, [x11]
        PRFM PLDL1KEEP, [x12]
        PRFM PLDL1KEEP,  [x4]

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        SUBS x0, x2, 32  // k = kc - 32
        B.LO 4f

        # Prologue - loads for main loop of 96 FMA
        LDR   q0,  [x3], 16
        LDR   q1,  [x9], 16
        LDR   q2, [x10], 16
        LDR   q3, [x11], 16
        LDR   q4, [x12], 16
        LDR   q5,  [x4], 16
        LDP  q12,  q13, [x5], 32  // Fetch 3 B (4th deferred)
        LDP  q14,  q15, [x5], 32
        LDP  q16,  q17, [x5], 32

        # Is there at least 8 floats (32 bytes) for main loop?
        SUBS x0, x0, 32
        B.LO 2f

        # Main loop - 8 floats of A (32 bytes)
        # 96 FMA + 12 LDR A + 8 LDP B
1:
        # First group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s,  v0.s[0]
        LDP  q18,  q19, [x5], 32      // Load last B
        FMLA v22.4s, v12.4s,  v1.s[0]
        FMLA v24.4s, v12.4s,  v2.s[0]
        FMLA v26.4s, v12.4s,  v3.s[0]
        FMLA v28.4s, v12.4s,  v4.s[0]
        FMLA v30.4s, v12.4s,  v5.s[0]
        FMLA v21.4s, v13.4s,  v0.s[0]
        FMLA v23.4s, v13.4s,  v1.s[0]
        FMLA v25.4s, v13.4s,  v2.s[0]
        FMLA v27.4s, v13.4s,  v3.s[0]
        FMLA v29.4s, v13.4s,  v4.s[0]

        FMLA v31.4s, v13.4s,  v5.s[0]
        FMLA v20.4s, v14.4s,  v0.s[1]
        PRFM PLDL1KEEP, [x5, 128]      // Prefetch B
        FMLA v22.4s, v14.4s,  v1.s[1]
        FMLA v24.4s, v14.4s,  v2.s[1]
        FMLA v26.4s, v14.4s,  v3.s[1]
        FMLA v28.4s, v14.4s,  v4.s[1]
        PRFM PLDL1KEEP, [x5, 256]
        FMLA v30.4s, v14.4s,  v5.s[1]
        FMLA v21.4s, v15.4s,  v0.s[1]
        FMLA v23.4s, v15.4s,  v1.s[1]
        FMLA v25.4s, v15.4s,  v2.s[1]
        LDR   q6,  [x3], 16            // Load next 6 A
        FMLA v27.4s, v15.4s,  v3.s[1]
        FMLA v29.4s, v15.4s,  v4.s[1]
        FMLA v31.4s, v15.4s,  v5.s[1]
        LDR   q7,  [x9], 16

        FMLA v20.4s, v16.4s,  v0.s[2]
        FMLA v22.4s, v16.4s,  v1.s[2]
        FMLA v24.4s, v16.4s,  v2.s[2]
        LDR   q8, [x10], 16
        FMLA v26.4s, v16.4s,  v3.s[2]
        FMLA v28.4s, v16.4s,  v4.s[2]
        FMLA v30.4s, v16.4s,  v5.s[2]
        LDR   q9, [x11], 16
        FMLA v21.4s, v17.4s,  v0.s[2]
        FMLA v23.4s, v17.4s,  v1.s[2]
        FMLA v25.4s, v17.4s,  v2.s[2]
        LDR   q10, [x12], 16
        FMLA v27.4s, v17.4s,  v3.s[2]
        FMLA v29.4s, v17.4s,  v4.s[2]
        FMLA v31.4s, v17.4s,  v5.s[2]
        LDR  q11,  [x4], 16

        FMLA v20.4s, v18.4s,  v0.s[3]
        FMLA v22.4s, v18.4s,  v1.s[3]
        FMLA v24.4s, v18.4s,  v2.s[3]
        LDP  q12,  q13, [x5], 32       // Load 4 B
        FMLA v26.4s, v18.4s,  v3.s[3]
        FMLA v28.4s, v18.4s,  v4.s[3]
        FMLA v30.4s, v18.4s,  v5.s[3]
        LDP  q14,  q15, [x5], 32
        FMLA v21.4s, v19.4s,  v0.s[3]
        FMLA v23.4s, v19.4s,  v1.s[3]
        FMLA v25.4s, v19.4s,  v2.s[3]
        LDP  q16,  q17, [x5], 32
        FMLA v27.4s, v19.4s,  v3.s[3]
        FMLA v29.4s, v19.4s,  v4.s[3]
        FMLA v31.4s, v19.4s,  v5.s[3]
        LDP  q18,  q19, [x5], 32

        # Second group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s,  v6.s[0]
        FMLA v22.4s, v12.4s,  v7.s[0]
        FMLA v24.4s, v12.4s,  v8.s[0]
        LDR   q0,  [x3], 16           // Load next 6 A
        FMLA v26.4s, v12.4s,  v9.s[0]
        FMLA v28.4s, v12.4s, v10.s[0]
        FMLA v30.4s, v12.4s, v11.s[0]
        LDR   q1,  [x9], 16
        FMLA v21.4s, v13.4s,  v6.s[0]
        FMLA v23.4s, v13.4s,  v7.s[0]
        FMLA v25.4s, v13.4s,  v8.s[0]
        LDR   q2, [x10], 16
        FMLA v27.4s, v13.4s,  v9.s[0]
        FMLA v29.4s, v13.4s, v10.s[0]
        FMLA v31.4s, v13.4s, v11.s[0]
        LDR   q3, [x11], 16

        FMLA v20.4s, v14.4s,  v6.s[1]
        FMLA v22.4s, v14.4s,  v7.s[1]
        FMLA v24.4s, v14.4s,  v8.s[1]
        LDR   q4, [x12], 16
        FMLA v26.4s, v14.4s,  v9.s[1]
        FMLA v28.4s, v14.4s, v10.s[1]
        FMLA v30.4s, v14.4s, v11.s[1]
        LDR   q5,  [x4], 16
        FMLA v21.4s, v15.4s,  v6.s[1]
        FMLA v23.4s, v15.4s,  v7.s[1]
        FMLA v25.4s, v15.4s,  v8.s[1]
        LDP  q12,  q13, [x5], 32       // Load next 3 B (not last)
        FMLA v27.4s, v15.4s,  v9.s[1]
        FMLA v29.4s, v15.4s, v10.s[1]
        FMLA v31.4s, v15.4s, v11.s[1]
        LDP  q14,  q15, [x5], 32

        FMLA v20.4s, v16.4s,  v6.s[2]
        FMLA v22.4s, v16.4s,  v7.s[2]
        FMLA v24.4s, v16.4s,  v8.s[2]
        FMLA v26.4s, v16.4s,  v9.s[2]
        FMLA v28.4s, v16.4s, v10.s[2]
        FMLA v30.4s, v16.4s, v11.s[2]
        FMLA v21.4s, v17.4s,  v6.s[2]
        FMLA v23.4s, v17.4s,  v7.s[2]
        FMLA v25.4s, v17.4s,  v8.s[2]
        FMLA v27.4s, v17.4s,  v9.s[2]
        FMLA v29.4s, v17.4s, v10.s[2]
        FMLA v31.4s, v17.4s, v11.s[2]
        LDP  q16,  q17, [x5], 32

        FMLA v20.4s, v18.4s,  v6.s[3]
        FMLA v22.4s, v18.4s,  v7.s[3]
        SUBS x0, x0, 32
        FMLA v24.4s, v18.4s,  v8.s[3]
        FMLA v26.4s, v18.4s,  v9.s[3]
        FMLA v28.4s, v18.4s, v10.s[3]
        FMLA v30.4s, v18.4s, v11.s[3]
        FMLA v21.4s, v19.4s,  v6.s[3]
        FMLA v23.4s, v19.4s,  v7.s[3]
        FMLA v25.4s, v19.4s,  v8.s[3]
        FMLA v27.4s, v19.4s,  v9.s[3]
        FMLA v29.4s, v19.4s, v10.s[3]
        FMLA v31.4s, v19.4s, v11.s[3]
        B.HS 1b

        # Epilogue - 8 floats of A (32 bytes)
        # 96 FMA + 6 LDR A + 5 LDP B
        # First block same as main loop.  Second block has no preloads.
2:
        # First group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s,  v0.s[0]
        LDP  q18,  q19, [x5], 32      // Load last B
        FMLA v22.4s, v12.4s,  v1.s[0]
        FMLA v24.4s, v12.4s,  v2.s[0]
        FMLA v26.4s, v12.4s,  v3.s[0]
        FMLA v28.4s, v12.4s,  v4.s[0]
        FMLA v30.4s, v12.4s,  v5.s[0]
        FMLA v21.4s, v13.4s,  v0.s[0]
        FMLA v23.4s, v13.4s,  v1.s[0]
        FMLA v25.4s, v13.4s,  v2.s[0]
        FMLA v27.4s, v13.4s,  v3.s[0]
        FMLA v29.4s, v13.4s,  v4.s[0]

        FMLA v31.4s, v13.4s,  v5.s[0]
        FMLA v20.4s, v14.4s,  v0.s[1]
        PRFM PLDL1KEEP, [x5, 128]      // Prefetch B
        FMLA v22.4s, v14.4s,  v1.s[1]
        FMLA v24.4s, v14.4s,  v2.s[1]
        FMLA v26.4s, v14.4s,  v3.s[1]
        FMLA v28.4s, v14.4s,  v4.s[1]
        PRFM PLDL1KEEP, [x5, 256]
        FMLA v30.4s, v14.4s,  v5.s[1]
        FMLA v21.4s, v15.4s,  v0.s[1]
        FMLA v23.4s, v15.4s,  v1.s[1]
        FMLA v25.4s, v15.4s,  v2.s[1]
        LDR   q6,  [x3], 16            // Load next 6 A
        FMLA v27.4s, v15.4s,  v3.s[1]
        FMLA v29.4s, v15.4s,  v4.s[1]
        FMLA v31.4s, v15.4s,  v5.s[1]
        LDR   q7,  [x9], 16

        FMLA v20.4s, v16.4s,  v0.s[2]
        FMLA v22.4s, v16.4s,  v1.s[2]
        FMLA v24.4s, v16.4s,  v2.s[2]
        LDR   q8, [x10], 16
        FMLA v26.4s, v16.4s,  v3.s[2]
        FMLA v28.4s, v16.4s,  v4.s[2]
        FMLA v30.4s, v16.4s,  v5.s[2]
        LDR   q9, [x11], 16
        FMLA v21.4s, v17.4s,  v0.s[2]
        FMLA v23.4s, v17.4s,  v1.s[2]
        FMLA v25.4s, v17.4s,  v2.s[2]
        LDR   q10, [x12], 16
        FMLA v27.4s, v17.4s,  v3.s[2]
        FMLA v29.4s, v17.4s,  v4.s[2]
        FMLA v31.4s, v17.4s,  v5.s[2]
        LDR  q11,  [x4], 16

        FMLA v20.4s, v18.4s,  v0.s[3]
        FMLA v22.4s, v18.4s,  v1.s[3]
        FMLA v24.4s, v18.4s,  v2.s[3]
        LDP  q12,  q13, [x5], 32       // Load 4 B
        FMLA v26.4s, v18.4s,  v3.s[3]
        FMLA v28.4s, v18.4s,  v4.s[3]
        FMLA v30.4s, v18.4s,  v5.s[3]
        LDP  q14,  q15, [x5], 32
        FMLA v21.4s, v19.4s,  v0.s[3]
        FMLA v23.4s, v19.4s,  v1.s[3]
        FMLA v25.4s, v19.4s,  v2.s[3]
        LDP  q16,  q17, [x5], 32
        FMLA v27.4s, v19.4s,  v3.s[3]
        FMLA v29.4s, v19.4s,  v4.s[3]
        FMLA v31.4s, v19.4s,  v5.s[3]
        LDP  q18,  q19, [x5], 32

        # Second group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s,  v6.s[0]
        FMLA v22.4s, v12.4s,  v7.s[0]
        FMLA v24.4s, v12.4s,  v8.s[0]
        FMLA v26.4s, v12.4s,  v9.s[0]
        FMLA v28.4s, v12.4s, v10.s[0]
        FMLA v30.4s, v12.4s, v11.s[0]
        FMLA v21.4s, v13.4s,  v6.s[0]
        FMLA v23.4s, v13.4s,  v7.s[0]
        FMLA v25.4s, v13.4s,  v8.s[0]
        FMLA v27.4s, v13.4s,  v9.s[0]
        FMLA v29.4s, v13.4s, v10.s[0]
        FMLA v31.4s, v13.4s, v11.s[0]

        FMLA v20.4s, v14.4s,  v6.s[1]
        FMLA v22.4s, v14.4s,  v7.s[1]
        FMLA v24.4s, v14.4s,  v8.s[1]
        FMLA v26.4s, v14.4s,  v9.s[1]
        FMLA v28.4s, v14.4s, v10.s[1]
        FMLA v30.4s, v14.4s, v11.s[1]
        FMLA v21.4s, v15.4s,  v6.s[1]
        FMLA v23.4s, v15.4s,  v7.s[1]
        FMLA v25.4s, v15.4s,  v8.s[1]
        FMLA v27.4s, v15.4s,  v9.s[1]
        FMLA v29.4s, v15.4s, v10.s[1]
        FMLA v31.4s, v15.4s, v11.s[1]

        FMLA v20.4s, v16.4s,  v6.s[2]
        FMLA v22.4s, v16.4s,  v7.s[2]
        FMLA v24.4s, v16.4s,  v8.s[2]
        FMLA v26.4s, v16.4s,  v9.s[2]
        FMLA v28.4s, v16.4s, v10.s[2]
        FMLA v30.4s, v16.4s, v11.s[2]
        FMLA v21.4s, v17.4s,  v6.s[2]
        FMLA v23.4s, v17.4s,  v7.s[2]
        FMLA v25.4s, v17.4s,  v8.s[2]
        FMLA v27.4s, v17.4s,  v9.s[2]
        FMLA v29.4s, v17.4s, v10.s[2]
        FMLA v31.4s, v17.4s, v11.s[2]

        FMLA v20.4s, v18.4s,  v6.s[3]
        FMLA v22.4s, v18.4s,  v7.s[3]
        FMLA v24.4s, v18.4s,  v8.s[3]
        FMLA v26.4s, v18.4s,  v9.s[3]
        FMLA v28.4s, v18.4s, v10.s[3]
        FMLA v30.4s, v18.4s, v11.s[3]
        FMLA v21.4s, v19.4s,  v6.s[3]
        FMLA v23.4s, v19.4s,  v7.s[3]

        # Load clamping_params values
        // v6/v7 were last read as A operands by the two FMLAs just above,
        // so they can be overwritten with the clamp bounds from here on.
        LD2R {v6.4s, v7.4s}, [x8]

        FMLA v25.4s, v19.4s,  v8.s[3]
        FMLA v27.4s, v19.4s,  v9.s[3]
        # Is there a remainder?- 4 floats of A (16 bytes) or less
        // x0 differs from kc by a multiple of 32, so its low 5 bits are kc % 32.
        TST x0, 31
        FMLA v29.4s, v19.4s, v10.s[3]
        FMLA v31.4s, v19.4s, v11.s[3]
        B.NE 4f

        # Clamp
3:
        FMIN v20.4s, v20.4s, v6.4s
        # Load cn_stride (x0 is free here: the k counter is dead and is recomputed at label 0)
        LDR x0, [sp, 64]
        // The flags set here feed both B.LO 7f and B.HI 0b below; nothing in
        // between (FMIN/FMAX/LDR/STP/ADD/SUB) modifies them.
        SUBS x1, x1, 8
        FMIN v21.4s, v21.4s, v6.4s
        FMIN v22.4s, v22.4s, v6.4s
        FMIN v23.4s, v23.4s, v6.4s
        FMIN v24.4s, v24.4s, v6.4s
        FMIN v25.4s, v25.4s, v6.4s
        FMIN v26.4s, v26.4s, v6.4s
        FMIN v27.4s, v27.4s, v6.4s
        FMIN v28.4s, v28.4s, v6.4s
        FMIN v29.4s, v29.4s, v6.4s
        FMIN v30.4s, v30.4s, v6.4s
        FMIN v31.4s, v31.4s, v6.4s
        FMAX v20.4s, v20.4s, v7.4s
        FMAX v21.4s, v21.4s, v7.4s
        FMAX v22.4s, v22.4s, v7.4s
        FMAX v23.4s, v23.4s, v7.4s
        FMAX v24.4s, v24.4s, v7.4s
        FMAX v25.4s, v25.4s, v7.4s
        FMAX v26.4s, v26.4s, v7.4s
        FMAX v27.4s, v27.4s, v7.4s
        FMAX v28.4s, v28.4s, v7.4s
        FMAX v29.4s, v29.4s, v7.4s
        FMAX v30.4s, v30.4s, v7.4s
        FMAX v31.4s, v31.4s, v7.4s

        # Store full 6 x 8
        B.LO 7f

        // Each row: store 8 floats, advance the C pointer by cn_stride (x0),
        // and rewind the matching A pointer by kc for the next column tile.
        STP q30, q31,  [x7]
        ADD x7, x7, x0
        SUB  x3,  x3, x2 // a0 -= kc
        STP q28, q29, [x13]
        ADD x13, x13, x0
        SUB  x9,  x9, x2 // a1 -= kc
        STP q26, q27, [x14]
        ADD x14, x14, x0
        SUB x10, x10, x2 // a2 -= kc
        STP q24, q25, [x17]
        ADD x17, x17, x0
        SUB x11, x11, x2 // a3 -= kc
        STP q22, q23, [x16]
        ADD x16, x16, x0
        SUB x12, x12, x2 // a4 -= kc
        STP q20, q21,  [x6]
        ADD  x6,  x6, x0
        SUB  x4,  x4, x2 // a5 -= kc

        B.HI 0b

        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP  d8,  d9, [sp], 64
        RET

        // Remainder handling: reached with x0 == kc modulo 32, so bits 4, 3 and 2
        // of x0 select the 4-float, 2-float and 1-float tails respectively.
4:
        # Load clamping_params values
        LD2R {v6.4s, v7.4s}, [x8]

        # Is there a remainder?- 4 floats of A (16 bytes)
        TBZ x0, 4, 5f

        # Remainder- 4 floats of A (16 bytes)
        # Load A
        LDR   q0,  [x3], 16
        LDR   q1,  [x9], 16
        LDR   q2, [x10], 16
        LDR   q3, [x11], 16
        LDR   q4, [x12], 16
        LDR   q5,  [x4], 16
        # Load B
        LDP  q12,  q13, [x5], 32
        LDP  q14,  q15, [x5], 32
        LDP  q16,  q17, [x5], 32
        LDP  q18,  q19, [x5], 32

        FMLA v20.4s, v12.4s,  v0.s[0]
        FMLA v22.4s, v12.4s,  v1.s[0]
        FMLA v24.4s, v12.4s,  v2.s[0]
        FMLA v26.4s, v12.4s,  v3.s[0]
        FMLA v28.4s, v12.4s,  v4.s[0]
        FMLA v30.4s, v12.4s,  v5.s[0]
        FMLA v21.4s, v13.4s,  v0.s[0]
        FMLA v23.4s, v13.4s,  v1.s[0]
        FMLA v25.4s, v13.4s,  v2.s[0]
        FMLA v27.4s, v13.4s,  v3.s[0]
        FMLA v29.4s, v13.4s,  v4.s[0]
        FMLA v31.4s, v13.4s,  v5.s[0]

        FMLA v20.4s, v14.4s,  v0.s[1]
        FMLA v22.4s, v14.4s,  v1.s[1]
        FMLA v24.4s, v14.4s,  v2.s[1]
        FMLA v26.4s, v14.4s,  v3.s[1]
        FMLA v28.4s, v14.4s,  v4.s[1]
        FMLA v30.4s, v14.4s,  v5.s[1]
        FMLA v21.4s, v15.4s,  v0.s[1]
        FMLA v23.4s, v15.4s,  v1.s[1]
        FMLA v25.4s, v15.4s,  v2.s[1]
        FMLA v27.4s, v15.4s,  v3.s[1]
        FMLA v29.4s, v15.4s,  v4.s[1]
        FMLA v31.4s, v15.4s,  v5.s[1]

        FMLA v20.4s, v16.4s,  v0.s[2]
        FMLA v22.4s, v16.4s,  v1.s[2]
        FMLA v24.4s, v16.4s,  v2.s[2]
        FMLA v26.4s, v16.4s,  v3.s[2]
        FMLA v28.4s, v16.4s,  v4.s[2]
        FMLA v30.4s, v16.4s,  v5.s[2]
        FMLA v21.4s, v17.4s,  v0.s[2]
        FMLA v23.4s, v17.4s,  v1.s[2]
        FMLA v25.4s, v17.4s,  v2.s[2]
        FMLA v27.4s, v17.4s,  v3.s[2]
        FMLA v29.4s, v17.4s,  v4.s[2]
        FMLA v31.4s, v17.4s,  v5.s[2]

        FMLA v20.4s, v18.4s,  v0.s[3]
        FMLA v22.4s, v18.4s,  v1.s[3]
        FMLA v24.4s, v18.4s,  v2.s[3]
        FMLA v26.4s, v18.4s,  v3.s[3]
        FMLA v28.4s, v18.4s,  v4.s[3]
        FMLA v30.4s, v18.4s,  v5.s[3]
        FMLA v21.4s, v19.4s,  v0.s[3]
        FMLA v23.4s, v19.4s,  v1.s[3]
        FMLA v25.4s, v19.4s,  v2.s[3]
        FMLA v27.4s, v19.4s,  v3.s[3]
        FMLA v29.4s, v19.4s,  v4.s[3]
        FMLA v31.4s, v19.4s,  v5.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes)
5:
        TBZ x0, 3, 6f

        # Remainder- 2 floats of A (8 bytes)
        # Load A
        LDR   d0,  [x3], 8
        LDR   d1,  [x9], 8
        LDR   d2, [x10], 8
        LDR   d3, [x11], 8
        LDR   d4, [x12], 8
        LDR   d5,  [x4], 8
        # Load B
        LDP  q12,  q13, [x5], 32
        LDP  q14,  q15, [x5], 32

        FMLA v20.4s, v12.4s,  v0.s[0]
        FMLA v22.4s, v12.4s,  v1.s[0]
        FMLA v24.4s, v12.4s,  v2.s[0]
        FMLA v26.4s, v12.4s,  v3.s[0]
        FMLA v28.4s, v12.4s,  v4.s[0]
        FMLA v30.4s, v12.4s,  v5.s[0]
        FMLA v21.4s, v13.4s,  v0.s[0]
        FMLA v23.4s, v13.4s,  v1.s[0]
        FMLA v25.4s, v13.4s,  v2.s[0]
        FMLA v27.4s, v13.4s,  v3.s[0]
        FMLA v29.4s, v13.4s,  v4.s[0]
        FMLA v31.4s, v13.4s,  v5.s[0]

        FMLA v20.4s, v14.4s,  v0.s[1]
        FMLA v22.4s, v14.4s,  v1.s[1]
        FMLA v24.4s, v14.4s,  v2.s[1]
        FMLA v26.4s, v14.4s,  v3.s[1]
        FMLA v28.4s, v14.4s,  v4.s[1]
        FMLA v30.4s, v14.4s,  v5.s[1]
        FMLA v21.4s, v15.4s,  v0.s[1]
        FMLA v23.4s, v15.4s,  v1.s[1]
        FMLA v25.4s, v15.4s,  v2.s[1]
        FMLA v27.4s, v15.4s,  v3.s[1]
        FMLA v29.4s, v15.4s,  v4.s[1]
        FMLA v31.4s, v15.4s,  v5.s[1]

        # Is there a remainder?- 1 float of A (4 bytes)
6:
        TBZ x0, 2, 3b

        # Remainder- 1 float of A (4 bytes)
        # Load A
        LDR   s0,  [x3], 4
        LDR   s1,  [x9], 4
        LDR   s2, [x10], 4
        LDR   s3, [x11], 4
        LDR   s4, [x12], 4
        LDR   s5,  [x4], 4
        # Load B
        LDP  q12,  q13, [x5], 32

        FMLA v20.4s, v12.4s,  v0.s[0]
        FMLA v22.4s, v12.4s,  v1.s[0]
        FMLA v24.4s, v12.4s,  v2.s[0]
        FMLA v26.4s, v12.4s,  v3.s[0]
        FMLA v28.4s, v12.4s,  v4.s[0]
        FMLA v30.4s, v12.4s,  v5.s[0]
        FMLA v21.4s, v13.4s,  v0.s[0]
        FMLA v23.4s, v13.4s,  v1.s[0]
        FMLA v25.4s, v13.4s,  v2.s[0]
        FMLA v27.4s, v13.4s,  v3.s[0]
        FMLA v29.4s, v13.4s,  v4.s[0]
        FMLA v31.4s, v13.4s,  v5.s[0]
        B 3b

        # Store odd width
        // Reached when fewer than 8 columns remain. x1 now holds (columns - 8),
        // whose low three bits equal those of the remaining column count, so
        // bits 2, 1 and 0 select the 4-, 2- and 1-float partial stores.
7:
        TBZ x1, 2, 8f
        STR q30,  [x7], 16
        MOV v30.16b, v31.16b
        STR q28, [x13], 16
        MOV v28.16b, v29.16b
        STR q26, [x14], 16
        MOV v26.16b, v27.16b
        STR q24, [x17], 16
        MOV v24.16b, v25.16b
        STR q22, [x16], 16
        MOV v22.16b, v23.16b
        STR q20,  [x6], 16
        MOV v20.16b, v21.16b
8:
        TBZ x1, 1, 9f
        STR d30,  [x7], 8
        DUP d30, v30.d[1]
        STR d28, [x13], 8
        DUP d28, v28.d[1]
        STR d26, [x14], 8
        DUP d26, v26.d[1]
        STR d24, [x17], 8
        DUP d24, v24.d[1]
        STR d22, [x16], 8
        DUP d22, v22.d[1]
        STR d20,  [x6], 8
        DUP d20, v20.d[1]

9:
        TBZ x1, 0, 10f
        STR s30,  [x7]
        STR s28, [x13]
        STR s26, [x14]
        STR s24, [x17]
        STR s22, [x16]
        STR s20,  [x6]
10:
        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP  d8,  d9, [sp], 64
        RET

END_FUNCTION xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a75
655
// For ELF targets, emit an empty .note.GNU-stack section (no flags), which GNU
// toolchains read as "this object does not require an executable stack".
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif
659