• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1// Auto-generated file. Do not edit!
2//   Template: src/f32-gemm/6x8-aarch64-neonfma-cortex-a75.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2019 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10#include <xnnpack/assembly.h>
11
12# void xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a57(
13#     size_t mr,                x0
14#     size_t nc,                x1
15#     size_t kc,                x2 / x0
16#     const uint8_t*restrict a, x3
17#     size_t a_stride,          x4
18#     const void*restrict w,    x5
19#     uint8_t*restrict c,       x6
20#     size_t cm_stride,         x7
21#     size_t cn_stride,         [sp] -> x14
22#     const float*restrict acc,  [sp + 8] -> x15
23#     const union xnn_f32_output_params params[restrict static 1])  [sp + 16] -> x8
24
25# d8-d15 need to be preserved if used.
26# x19-30 need to be preserved if used.
27
28# A pointers
29#  x3 a0
30#  x9 a1
31# x10 a2
32# x11 a3
33# x12 a4
34#  x4 a5
35
36# C pointers
37#  x6 c0
38# x16 c1
39# x17 c2
40# x18 c3
41# x13 c4
42#  x7 c5
43
44# Vector register usage
45# A0   v0  v6
46# A1   v1  v7
47# A2   v2  v8
48# A3   v3  v9
49# A4   v4 v10
50# A5   v5 v11
51# B   v12 v13 v14 v15
52# B   v16 v17 v18 v19
53# C   v20 v21
54# C   v22 v23
55# C   v24 v25
56# C   v26 v27
57# C   v28 v29
58# C   v30 v31
59# Clamp v6 v7
60
// 6x8 single-precision GEMM micro-kernel whose accumulators are initialised
// from the `acc` buffer instead of starting at zero.
//
// Outline of what the code below does:
//   * Saves d8-d15 in a 64-byte stack frame and restores them before each RET.
//   * Builds row pointers a1..a5 / c1..c5 from a0 / c0 by adding a_stride /
//     cm_stride; CSEL makes a row reuse the previous row's pointer when mr is
//     too small for that row to exist.
//   * Label 0: for each group of 8 output columns, loads 12 q-registers of
//     initial accumulators through x15 (post-incremented), accumulates with
//     FMLA while x0 tracks the remaining bytes of each A row, clamps with the
//     two values broadcast from [x8], then stores one q-pair per row.
//   * Labels 4/5/6 handle the 16-, 8- and 4-byte tails of kc.
//   * Labels 7/8/9 store a final column group narrower than 8 floats.
//
// NOTE(review): x18 holds c3 here; x18 is a platform-reserved register on
// some AArch64 ABIs -- confirm this kernel is only built where that is safe.
61BEGIN_FUNCTION xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a57
62
63        # Clamp A and C pointers / Save d8-d15 on stack
64        STP  d8,  d9, [sp, -64]!   // pre-indexed: sp -= 64, frame for d8-d15
65        CMP x0, 2                // if mr < 2
66        ADD x9, x3, x4           // a1 = a0 + a_stride
67        ADD x16, x6, x7          // c1 = c0 + cm_stride
68        CSEL x9, x3, x9, LO      //   a1 = a0
69        CSEL x16, x6, x16, LO    //   c1 = c0
70
71        STP d10, d11, [sp, 16]
72        ADD x10, x9, x4          // a2 = a1 + a_stride
73        ADD x17, x16, x7         // c2 = c1 + cm_stride
74                                 // if mr <= 2 (flags still from CMP x0, 2)
75        CSEL x10, x9, x10, LS    //   a2 = a1
76        CSEL x17, x16, x17, LS   //   c2 = c1
77
78        STP d12, d13, [sp, 32]
79        CMP x0, 4                // if mr < 4
80        ADD x11, x10, x4         // a3 = a2 + a_stride
81        ADD x18, x17, x7         // c3 = c2 + cm_stride
82        CSEL x11, x10, x11, LO   //   a3 = a2
83        CSEL x18, x17, x18, LO   //   c3 = c2
84
85        STP d14, d15, [sp, 48]
86        ADD x12, x11, x4         // a4 = a3 + a_stride
87        ADD x13, x18, x7         // c4 = c3 + cm_stride
88                                 // if mr <= 4 (flags still from CMP x0, 4)
89        CSEL x12, x11, x12, LS   //   a4 = a3
90        CSEL x13, x18, x13, LS   //   c4 = c3
91
92        # Load acc, params pointer
        // sp was lowered by 64 above, so the caller's stack arguments are now
        // at sp+64 (cn_stride), sp+72 (acc) and sp+80 (params).
93        LDP x15, x8, [sp, 72]
94
95        CMP x0, 6                // if mr < 6
96        ADD x4, x12, x4          // a5 = a4 + a_stride
97        ADD x7, x13, x7          // c5 = c4 + cm_stride
98        CSEL x4, x12, x4, LO     //   a5 = a4
99        CSEL x7, x13, x7, LO     //   c5 = c4
100
101        # Load cn_stride
102        LDR x14, [sp, 64]
103
        // Top of the loop over groups of 8 output columns (target of B.HI 0b).
1040:
105        # Load initial accumulators
        // Two q-registers (8 floats) per row, rows 0..5 in order; x15 is
        // post-incremented and therefore keeps advancing across column groups.
106        LDP q20, q21, [x15], 32
107        LDP q22, q23, [x15], 32
108        LDP q24, q25, [x15], 32
109        LDP q26, q27, [x15], 32
110        LDP q28, q29, [x15], 32
111        LDP q30, q31, [x15], 32
112
113        # Are there at least 8 floats (32 bytes) for prologue + epilogue?
114        SUBS x0, x2, 32  // k = kc - 32
115        B.LO 4f          // kc < 32 bytes: only the remainder code runs
116
117        # Prologue - loads for main loop of 96 FMA
118        LDR   q0,  [x3], 16
119        LDR   q1,  [x9], 16
120        LDR   q2, [x10], 16
121        LDR   q3, [x11], 16
122        LDR   q4, [x12], 16
123        LDR   q5,  [x4], 16
124        LDP  q12,  q13, [x5], 32  // Fetch 3 B (4th deferred)
125        LDP  q14,  q15, [x5], 32
126        LDP  q16,  q17, [x5], 32
127
128        # Are there at least 8 floats (32 bytes) for main loop?
129        SUBS x0, x0, 32
130        B.LO 2f          // kc < 64 bytes: skip the main loop, run the epilogue once
131
132        # Main loop - 8 floats of A (32 bytes)
133        # 96 FMA + 12 LDR A + 8 LDP B
        // Each iteration consumes v0-v5 (preloaded) and v6-v11 (loaded here),
        // and preloads v0-v5 plus three B pairs for the following iteration.
1341:
135        # First group of 4 A.  48 FMA.
136        FMLA v20.4s, v12.4s,  v0.s[0]
137        LDP  q18,  q19, [x5], 32      // Load last B
138        FMLA v22.4s, v12.4s,  v1.s[0]
139        FMLA v24.4s, v12.4s,  v2.s[0]
140        FMLA v26.4s, v12.4s,  v3.s[0]
141        FMLA v28.4s, v12.4s,  v4.s[0]
142        FMLA v30.4s, v12.4s,  v5.s[0]
143        FMLA v21.4s, v13.4s,  v0.s[0]
144        FMLA v23.4s, v13.4s,  v1.s[0]
145        FMLA v25.4s, v13.4s,  v2.s[0]
146        FMLA v27.4s, v13.4s,  v3.s[0]
147        FMLA v29.4s, v13.4s,  v4.s[0]
148
149        FMLA v31.4s, v13.4s,  v5.s[0]
150        FMLA v20.4s, v14.4s,  v0.s[1]
151        FMLA v22.4s, v14.4s,  v1.s[1]
152        FMLA v24.4s, v14.4s,  v2.s[1]
153        FMLA v26.4s, v14.4s,  v3.s[1]
154        FMLA v28.4s, v14.4s,  v4.s[1]
155        FMLA v30.4s, v14.4s,  v5.s[1]
156        FMLA v21.4s, v15.4s,  v0.s[1]
157        FMLA v23.4s, v15.4s,  v1.s[1]
158        FMLA v25.4s, v15.4s,  v2.s[1]
159        LDR   q6,  [x3], 16            // Load next 6 A
160        FMLA v27.4s, v15.4s,  v3.s[1]
161        FMLA v29.4s, v15.4s,  v4.s[1]
162        FMLA v31.4s, v15.4s,  v5.s[1]
163        LDR   q7,  [x9], 16
164
165        FMLA v20.4s, v16.4s,  v0.s[2]
166        FMLA v22.4s, v16.4s,  v1.s[2]
167        FMLA v24.4s, v16.4s,  v2.s[2]
168        LDR   q8, [x10], 16
169        FMLA v26.4s, v16.4s,  v3.s[2]
170        FMLA v28.4s, v16.4s,  v4.s[2]
171        FMLA v30.4s, v16.4s,  v5.s[2]
172        LDR   q9, [x11], 16
173        FMLA v21.4s, v17.4s,  v0.s[2]
174        FMLA v23.4s, v17.4s,  v1.s[2]
175        FMLA v25.4s, v17.4s,  v2.s[2]
176        LDR   q10, [x12], 16
177        FMLA v27.4s, v17.4s,  v3.s[2]
178        FMLA v29.4s, v17.4s,  v4.s[2]
179        FMLA v31.4s, v17.4s,  v5.s[2]
180        LDR  q11,  [x4], 16
181
182        FMLA v20.4s, v18.4s,  v0.s[3]
183        FMLA v22.4s, v18.4s,  v1.s[3]
184        FMLA v24.4s, v18.4s,  v2.s[3]
185        LDP  q12,  q13, [x5], 32       // Load 4 B
186        FMLA v26.4s, v18.4s,  v3.s[3]
187        FMLA v28.4s, v18.4s,  v4.s[3]
188        FMLA v30.4s, v18.4s,  v5.s[3]
189        LDP  q14,  q15, [x5], 32
190        FMLA v21.4s, v19.4s,  v0.s[3]
191        FMLA v23.4s, v19.4s,  v1.s[3]
192        FMLA v25.4s, v19.4s,  v2.s[3]
193        LDP  q16,  q17, [x5], 32
194        FMLA v27.4s, v19.4s,  v3.s[3]
195        FMLA v29.4s, v19.4s,  v4.s[3]
196        FMLA v31.4s, v19.4s,  v5.s[3]
197        LDP  q18,  q19, [x5], 32
198
199        # Second group of 4 A.  48 FMA.
200        FMLA v20.4s, v12.4s,  v6.s[0]
201        FMLA v22.4s, v12.4s,  v7.s[0]
202        FMLA v24.4s, v12.4s,  v8.s[0]
203        LDR   q0,  [x3], 16           // Load next 6 A
204        FMLA v26.4s, v12.4s,  v9.s[0]
205        FMLA v28.4s, v12.4s, v10.s[0]
206        FMLA v30.4s, v12.4s, v11.s[0]
207        LDR   q1,  [x9], 16
208        FMLA v21.4s, v13.4s,  v6.s[0]
209        FMLA v23.4s, v13.4s,  v7.s[0]
210        FMLA v25.4s, v13.4s,  v8.s[0]
211        LDR   q2, [x10], 16
212        FMLA v27.4s, v13.4s,  v9.s[0]
213        FMLA v29.4s, v13.4s, v10.s[0]
214        FMLA v31.4s, v13.4s, v11.s[0]
215        LDR   q3, [x11], 16
216
217        FMLA v20.4s, v14.4s,  v6.s[1]
218        FMLA v22.4s, v14.4s,  v7.s[1]
219        FMLA v24.4s, v14.4s,  v8.s[1]
220        LDR   q4, [x12], 16
221        FMLA v26.4s, v14.4s,  v9.s[1]
222        FMLA v28.4s, v14.4s, v10.s[1]
223        FMLA v30.4s, v14.4s, v11.s[1]
224        LDR   q5,  [x4], 16
225        FMLA v21.4s, v15.4s,  v6.s[1]
226        FMLA v23.4s, v15.4s,  v7.s[1]
227        FMLA v25.4s, v15.4s,  v8.s[1]
228        LDP  q12,  q13, [x5], 32       // Load next 3 B (not last)
229        FMLA v27.4s, v15.4s,  v9.s[1]
230        FMLA v29.4s, v15.4s, v10.s[1]
231        FMLA v31.4s, v15.4s, v11.s[1]
232        LDP  q14,  q15, [x5], 32
233
234        FMLA v20.4s, v16.4s,  v6.s[2]
235        FMLA v22.4s, v16.4s,  v7.s[2]
236        FMLA v24.4s, v16.4s,  v8.s[2]
237        FMLA v26.4s, v16.4s,  v9.s[2]
238        FMLA v28.4s, v16.4s, v10.s[2]
239        FMLA v30.4s, v16.4s, v11.s[2]
240        FMLA v21.4s, v17.4s,  v6.s[2]
241        FMLA v23.4s, v17.4s,  v7.s[2]
242        FMLA v25.4s, v17.4s,  v8.s[2]
243        FMLA v27.4s, v17.4s,  v9.s[2]
244        FMLA v29.4s, v17.4s, v10.s[2]
245        FMLA v31.4s, v17.4s, v11.s[2]
246        LDP  q16,  q17, [x5], 32
247
248        FMLA v20.4s, v18.4s,  v6.s[3]
249        FMLA v22.4s, v18.4s,  v7.s[3]
250        SUBS x0, x0, 32               // k -= 32; sets the flags read by B.HS below
251        FMLA v24.4s, v18.4s,  v8.s[3]
252        FMLA v26.4s, v18.4s,  v9.s[3]
253        FMLA v28.4s, v18.4s, v10.s[3]
254        FMLA v30.4s, v18.4s, v11.s[3]
255        FMLA v21.4s, v19.4s,  v6.s[3]
256        FMLA v23.4s, v19.4s,  v7.s[3]
257        FMLA v25.4s, v19.4s,  v8.s[3]
258        FMLA v27.4s, v19.4s,  v9.s[3]
259        FMLA v29.4s, v19.4s, v10.s[3]
260        FMLA v31.4s, v19.4s, v11.s[3]
261        B.HS 1b                       // no borrow in SUBS: run another iteration
262
263        # Epilogue - 8 floats of A (32 bytes)
264        # 96 FMA + 6 LDR A + 5 LDP B
265        # First block same as main loop.  Second block has no preloads.
2662:
267        # First group of 4 A.  48 FMA.
268        FMLA v20.4s, v12.4s,  v0.s[0]
269        LDP  q18,  q19, [x5], 32      // Load last B
270        FMLA v22.4s, v12.4s,  v1.s[0]
271        FMLA v24.4s, v12.4s,  v2.s[0]
272        FMLA v26.4s, v12.4s,  v3.s[0]
273        FMLA v28.4s, v12.4s,  v4.s[0]
274        FMLA v30.4s, v12.4s,  v5.s[0]
275        FMLA v21.4s, v13.4s,  v0.s[0]
276        FMLA v23.4s, v13.4s,  v1.s[0]
277        FMLA v25.4s, v13.4s,  v2.s[0]
278        FMLA v27.4s, v13.4s,  v3.s[0]
279        FMLA v29.4s, v13.4s,  v4.s[0]
280
281        FMLA v31.4s, v13.4s,  v5.s[0]
282        FMLA v20.4s, v14.4s,  v0.s[1]
283        FMLA v22.4s, v14.4s,  v1.s[1]
284        FMLA v24.4s, v14.4s,  v2.s[1]
285        FMLA v26.4s, v14.4s,  v3.s[1]
286        FMLA v28.4s, v14.4s,  v4.s[1]
287        FMLA v30.4s, v14.4s,  v5.s[1]
288        FMLA v21.4s, v15.4s,  v0.s[1]
289        FMLA v23.4s, v15.4s,  v1.s[1]
290        FMLA v25.4s, v15.4s,  v2.s[1]
291        LDR   q6,  [x3], 16            // Load next 6 A
292        FMLA v27.4s, v15.4s,  v3.s[1]
293        FMLA v29.4s, v15.4s,  v4.s[1]
294        FMLA v31.4s, v15.4s,  v5.s[1]
295        LDR   q7,  [x9], 16
296
297        FMLA v20.4s, v16.4s,  v0.s[2]
298        FMLA v22.4s, v16.4s,  v1.s[2]
299        FMLA v24.4s, v16.4s,  v2.s[2]
300        LDR   q8, [x10], 16
301        FMLA v26.4s, v16.4s,  v3.s[2]
302        FMLA v28.4s, v16.4s,  v4.s[2]
303        FMLA v30.4s, v16.4s,  v5.s[2]
304        LDR   q9, [x11], 16
305        FMLA v21.4s, v17.4s,  v0.s[2]
306        FMLA v23.4s, v17.4s,  v1.s[2]
307        FMLA v25.4s, v17.4s,  v2.s[2]
308        LDR   q10, [x12], 16
309        FMLA v27.4s, v17.4s,  v3.s[2]
310        FMLA v29.4s, v17.4s,  v4.s[2]
311        FMLA v31.4s, v17.4s,  v5.s[2]
312        LDR  q11,  [x4], 16
313
314        FMLA v20.4s, v18.4s,  v0.s[3]
315        FMLA v22.4s, v18.4s,  v1.s[3]
316        FMLA v24.4s, v18.4s,  v2.s[3]
317        LDP  q12,  q13, [x5], 32       // Load 4 B
318        FMLA v26.4s, v18.4s,  v3.s[3]
319        FMLA v28.4s, v18.4s,  v4.s[3]
320        FMLA v30.4s, v18.4s,  v5.s[3]
321        LDP  q14,  q15, [x5], 32
322        FMLA v21.4s, v19.4s,  v0.s[3]
323        FMLA v23.4s, v19.4s,  v1.s[3]
324        FMLA v25.4s, v19.4s,  v2.s[3]
325        LDP  q16,  q17, [x5], 32
326        FMLA v27.4s, v19.4s,  v3.s[3]
327        FMLA v29.4s, v19.4s,  v4.s[3]
328        FMLA v31.4s, v19.4s,  v5.s[3]
329        LDP  q18,  q19, [x5], 32
330
331        # Second group of 4 A.  48 FMA.
332        FMLA v20.4s, v12.4s,  v6.s[0]
333        FMLA v22.4s, v12.4s,  v7.s[0]
334        FMLA v24.4s, v12.4s,  v8.s[0]
335        FMLA v26.4s, v12.4s,  v9.s[0]
336        FMLA v28.4s, v12.4s, v10.s[0]
337        FMLA v30.4s, v12.4s, v11.s[0]
338        FMLA v21.4s, v13.4s,  v6.s[0]
339        FMLA v23.4s, v13.4s,  v7.s[0]
340        FMLA v25.4s, v13.4s,  v8.s[0]
341        FMLA v27.4s, v13.4s,  v9.s[0]
342        FMLA v29.4s, v13.4s, v10.s[0]
343        FMLA v31.4s, v13.4s, v11.s[0]
344
345        FMLA v20.4s, v14.4s,  v6.s[1]
346        FMLA v22.4s, v14.4s,  v7.s[1]
347        FMLA v24.4s, v14.4s,  v8.s[1]
348        FMLA v26.4s, v14.4s,  v9.s[1]
349        FMLA v28.4s, v14.4s, v10.s[1]
350        FMLA v30.4s, v14.4s, v11.s[1]
351        FMLA v21.4s, v15.4s,  v6.s[1]
352        FMLA v23.4s, v15.4s,  v7.s[1]
353        FMLA v25.4s, v15.4s,  v8.s[1]
354        FMLA v27.4s, v15.4s,  v9.s[1]
355        FMLA v29.4s, v15.4s, v10.s[1]
356        FMLA v31.4s, v15.4s, v11.s[1]
357
358        FMLA v20.4s, v16.4s,  v6.s[2]
359        FMLA v22.4s, v16.4s,  v7.s[2]
360        FMLA v24.4s, v16.4s,  v8.s[2]
361        FMLA v26.4s, v16.4s,  v9.s[2]
362        FMLA v28.4s, v16.4s, v10.s[2]
363        FMLA v30.4s, v16.4s, v11.s[2]
364        FMLA v21.4s, v17.4s,  v6.s[2]
365        FMLA v23.4s, v17.4s,  v7.s[2]
366        FMLA v25.4s, v17.4s,  v8.s[2]
367        FMLA v27.4s, v17.4s,  v9.s[2]
368        FMLA v29.4s, v17.4s, v10.s[2]
369        FMLA v31.4s, v17.4s, v11.s[2]
370
371        FMLA v20.4s, v18.4s,  v6.s[3]
372        FMLA v22.4s, v18.4s,  v7.s[3]
373        FMLA v24.4s, v18.4s,  v8.s[3]
374        FMLA v26.4s, v18.4s,  v9.s[3]
375        FMLA v28.4s, v18.4s, v10.s[3]
376        FMLA v30.4s, v18.4s, v11.s[3]
377        FMLA v21.4s, v19.4s,  v6.s[3]
378        FMLA v23.4s, v19.4s,  v7.s[3]
379
380        # Load clamping_params values
        // v6/v7 can be overwritten here: the four FMLAs that follow read only
        // v8-v11.  v6 receives the first float at [x8] and v7 the second, each
        // replicated to all four lanes.
381        LD2R {v6.4s, v7.4s}, [x8]
382
383        FMLA v25.4s, v19.4s,  v8.s[3]
384        FMLA v27.4s, v19.4s,  v9.s[3]
385        # Is there a remainder?- 4 floats of A (16 bytes) or less
        // Only multiples of 32 were subtracted from kc to form x0, so the low
        // five bits of x0 are still those of kc.
386        TST x0, 31
387        FMLA v29.4s, v19.4s, v10.s[3]
388        FMLA v31.4s, v19.4s, v11.s[3]
389        B.NE 4f                       // flags from TST (FMLA does not change them)
390
391        # Clamp
        // FMIN against v6 applies an upper bound, FMAX against v7 a lower bound.
3923:
393        FMIN v20.4s, v20.4s, v6.4s
394        SUBS x1, x1, 8                // nc -= 8; flags are read by B.LO 7f and B.HI 0b
395        FMIN v21.4s, v21.4s, v6.4s
396        FMIN v22.4s, v22.4s, v6.4s
397        FMIN v23.4s, v23.4s, v6.4s
398        FMIN v24.4s, v24.4s, v6.4s
399        FMIN v25.4s, v25.4s, v6.4s
400        FMIN v26.4s, v26.4s, v6.4s
401        FMIN v27.4s, v27.4s, v6.4s
402        FMIN v28.4s, v28.4s, v6.4s
403        FMIN v29.4s, v29.4s, v6.4s
404        FMIN v30.4s, v30.4s, v6.4s
405        FMIN v31.4s, v31.4s, v6.4s
406        FMAX v20.4s, v20.4s, v7.4s
407        FMAX v21.4s, v21.4s, v7.4s
408        FMAX v22.4s, v22.4s, v7.4s
409        FMAX v23.4s, v23.4s, v7.4s
410        FMAX v24.4s, v24.4s, v7.4s
411        FMAX v25.4s, v25.4s, v7.4s
412        FMAX v26.4s, v26.4s, v7.4s
413        FMAX v27.4s, v27.4s, v7.4s
414        FMAX v28.4s, v28.4s, v7.4s
415        FMAX v29.4s, v29.4s, v7.4s
416        FMAX v30.4s, v30.4s, v7.4s
417        FMAX v31.4s, v31.4s, v7.4s
418
419        # Store full 6 x 8
420        B.LO 7f                       // fewer than 8 columns were left: partial store
421
        // Rows are written from c5 down to c0, so where clamped row pointers
        // alias, the lower-numbered row is the one written last.  Each C
        // pointer then advances by cn_stride, and each A pointer (which was
        // post-incremented by kc bytes in total) is rewound by kc.
422        STP q30, q31,  [x7]
423        ADD x7, x7, x14
424        SUB  x3,  x3, x2 // a0 -= kc
425        STP q28, q29, [x13]
426        ADD x13, x13, x14
427        SUB  x9,  x9, x2 // a1 -= kc
428        STP q26, q27, [x18]
429        ADD x18, x18, x14
430        SUB x10, x10, x2 // a2 -= kc
431        STP q24, q25, [x17]
432        ADD x17, x17, x14
433        SUB x11, x11, x2 // a3 -= kc
434        STP q22, q23, [x16]
435        ADD x16, x16, x14
436        SUB x12, x12, x2 // a4 -= kc
437        STP q20, q21,  [x6]
438        ADD  x6,  x6, x14
439        SUB  x4,  x4, x2 // a5 -= kc
440
        // Flags are still those of SUBS x1, x1, 8: the STP/ADD/SUB above do
        // not set flags.  HI means columns remain after this group.
441        B.HI 0b
442
443        # Restore d8-d15 from stack
444        LDP d14, d15, [sp, 48]
445        LDP d12, d13, [sp, 32]
446        LDP d10, d11, [sp, 16]
447        LDP  d8,  d9, [sp], 64
448        RET
449
        // Remainder entry.  Reached from B.LO 4f when kc < 32 bytes and from
        // B.NE 4f after the epilogue; bits 4, 3 and 2 of x0 select the 16-,
        // 8- and 4-byte tails.
4504:
451        # Load clamping_params values
452        LD2R {v6.4s, v7.4s}, [x8]
453
454        # Is there a remainder?- 4 floats of A (16 bytes)
455        TBZ x0, 4, 5f
456
457        # Remainder- 4 floats of A (16 bytes)
458        # Load A
459        LDR   q0,  [x3], 16
460        LDR   q1,  [x9], 16
461        LDR   q2, [x10], 16
462        LDR   q3, [x11], 16
463        LDR   q4, [x12], 16
464        LDR   q5,  [x4], 16
465        # Load B
466        LDP  q12,  q13, [x5], 32
467        LDP  q14,  q15, [x5], 32
468        LDP  q16,  q17, [x5], 32
469        LDP  q18,  q19, [x5], 32
470
471        FMLA v20.4s, v12.4s,  v0.s[0]
472        FMLA v22.4s, v12.4s,  v1.s[0]
473        FMLA v24.4s, v12.4s,  v2.s[0]
474        FMLA v26.4s, v12.4s,  v3.s[0]
475        FMLA v28.4s, v12.4s,  v4.s[0]
476        FMLA v30.4s, v12.4s,  v5.s[0]
477        FMLA v21.4s, v13.4s,  v0.s[0]
478        FMLA v23.4s, v13.4s,  v1.s[0]
479        FMLA v25.4s, v13.4s,  v2.s[0]
480        FMLA v27.4s, v13.4s,  v3.s[0]
481        FMLA v29.4s, v13.4s,  v4.s[0]
482        FMLA v31.4s, v13.4s,  v5.s[0]
483
484        FMLA v20.4s, v14.4s,  v0.s[1]
485        FMLA v22.4s, v14.4s,  v1.s[1]
486        FMLA v24.4s, v14.4s,  v2.s[1]
487        FMLA v26.4s, v14.4s,  v3.s[1]
488        FMLA v28.4s, v14.4s,  v4.s[1]
489        FMLA v30.4s, v14.4s,  v5.s[1]
490        FMLA v21.4s, v15.4s,  v0.s[1]
491        FMLA v23.4s, v15.4s,  v1.s[1]
492        FMLA v25.4s, v15.4s,  v2.s[1]
493        FMLA v27.4s, v15.4s,  v3.s[1]
494        FMLA v29.4s, v15.4s,  v4.s[1]
495        FMLA v31.4s, v15.4s,  v5.s[1]
496
497        FMLA v20.4s, v16.4s,  v0.s[2]
498        FMLA v22.4s, v16.4s,  v1.s[2]
499        FMLA v24.4s, v16.4s,  v2.s[2]
500        FMLA v26.4s, v16.4s,  v3.s[2]
501        FMLA v28.4s, v16.4s,  v4.s[2]
502        FMLA v30.4s, v16.4s,  v5.s[2]
503        FMLA v21.4s, v17.4s,  v0.s[2]
504        FMLA v23.4s, v17.4s,  v1.s[2]
505        FMLA v25.4s, v17.4s,  v2.s[2]
506        FMLA v27.4s, v17.4s,  v3.s[2]
507        FMLA v29.4s, v17.4s,  v4.s[2]
508        FMLA v31.4s, v17.4s,  v5.s[2]
509
510        FMLA v20.4s, v18.4s,  v0.s[3]
511        FMLA v22.4s, v18.4s,  v1.s[3]
512        FMLA v24.4s, v18.4s,  v2.s[3]
513        FMLA v26.4s, v18.4s,  v3.s[3]
514        FMLA v28.4s, v18.4s,  v4.s[3]
515        FMLA v30.4s, v18.4s,  v5.s[3]
516        FMLA v21.4s, v19.4s,  v0.s[3]
517        FMLA v23.4s, v19.4s,  v1.s[3]
518        FMLA v25.4s, v19.4s,  v2.s[3]
519        FMLA v27.4s, v19.4s,  v3.s[3]
520        FMLA v29.4s, v19.4s,  v4.s[3]
521        FMLA v31.4s, v19.4s,  v5.s[3]
522
523        # Is there a remainder?- 2 floats of A (8 bytes)
5245:
525        TBZ x0, 3, 6f
526
527        # Remainder- 2 floats of A (8 bytes)
528        # Load A
529        LDR   d0,  [x3], 8
530        LDR   d1,  [x9], 8
531        LDR   d2, [x10], 8
532        LDR   d3, [x11], 8
533        LDR   d4, [x12], 8
534        LDR   d5,  [x4], 8
535        # Load B
536        LDP  q12,  q13, [x5], 32
537        LDP  q14,  q15, [x5], 32
538
539        FMLA v20.4s, v12.4s,  v0.s[0]
540        FMLA v22.4s, v12.4s,  v1.s[0]
541        FMLA v24.4s, v12.4s,  v2.s[0]
542        FMLA v26.4s, v12.4s,  v3.s[0]
543        FMLA v28.4s, v12.4s,  v4.s[0]
544        FMLA v30.4s, v12.4s,  v5.s[0]
545        FMLA v21.4s, v13.4s,  v0.s[0]
546        FMLA v23.4s, v13.4s,  v1.s[0]
547        FMLA v25.4s, v13.4s,  v2.s[0]
548        FMLA v27.4s, v13.4s,  v3.s[0]
549        FMLA v29.4s, v13.4s,  v4.s[0]
550        FMLA v31.4s, v13.4s,  v5.s[0]
551
552        FMLA v20.4s, v14.4s,  v0.s[1]
553        FMLA v22.4s, v14.4s,  v1.s[1]
554        FMLA v24.4s, v14.4s,  v2.s[1]
555        FMLA v26.4s, v14.4s,  v3.s[1]
556        FMLA v28.4s, v14.4s,  v4.s[1]
557        FMLA v30.4s, v14.4s,  v5.s[1]
558        FMLA v21.4s, v15.4s,  v0.s[1]
559        FMLA v23.4s, v15.4s,  v1.s[1]
560        FMLA v25.4s, v15.4s,  v2.s[1]
561        FMLA v27.4s, v15.4s,  v3.s[1]
562        FMLA v29.4s, v15.4s,  v4.s[1]
563        FMLA v31.4s, v15.4s,  v5.s[1]
564
565        # Is there a remainder?- 1 float of A (4 bytes)
5666:
567        TBZ x0, 2, 3b
568
569        # Remainder- 1 float of A (4 bytes)
570        # Load A
571        LDR   s0,  [x3], 4
572        LDR   s1,  [x9], 4
573        LDR   s2, [x10], 4
574        LDR   s3, [x11], 4
575        LDR   s4, [x12], 4
576        LDR   s5,  [x4], 4
577        # Load B
578        LDP  q12,  q13, [x5], 32
579
580        FMLA v20.4s, v12.4s,  v0.s[0]
581        FMLA v22.4s, v12.4s,  v1.s[0]
582        FMLA v24.4s, v12.4s,  v2.s[0]
583        FMLA v26.4s, v12.4s,  v3.s[0]
584        FMLA v28.4s, v12.4s,  v4.s[0]
585        FMLA v30.4s, v12.4s,  v5.s[0]
586        FMLA v21.4s, v13.4s,  v0.s[0]
587        FMLA v23.4s, v13.4s,  v1.s[0]
588        FMLA v25.4s, v13.4s,  v2.s[0]
589        FMLA v27.4s, v13.4s,  v3.s[0]
590        FMLA v29.4s, v13.4s,  v4.s[0]
591        FMLA v31.4s, v13.4s,  v5.s[0]
592        B 3b
593
594        # Store odd width
        // x1 is (columns left - 8) here; subtracting 8 leaves bits 0-2 of the
        // column count unchanged, so bits 2, 1 and 0 select the 4-, 2- and
        // 1-float stores.  After each store the not-yet-written part of every
        // row is moved down to the low end of its register.
5957:
596        TBZ x1, 2, 8f
597        STR q30,  [x7], 16
598        MOV v30.16b, v31.16b
599        STR q28, [x13], 16
600        MOV v28.16b, v29.16b
601        STR q26, [x18], 16
602        MOV v26.16b, v27.16b
603        STR q24, [x17], 16
604        MOV v24.16b, v25.16b
605        STR q22, [x16], 16
606        MOV v22.16b, v23.16b
607        STR q20,  [x6], 16
608        MOV v20.16b, v21.16b
6098:
610        TBZ x1, 1, 9f
611        STR d30,  [x7], 8
612        DUP d30, v30.d[1]
613        STR d28, [x13], 8
614        DUP d28, v28.d[1]
615        STR d26, [x18], 8
616        DUP d26, v26.d[1]
617        STR d24, [x17], 8
618        DUP d24, v24.d[1]
619        STR d22, [x16], 8
620        DUP d22, v22.d[1]
621        STR d20,  [x6], 8
622        DUP d20, v20.d[1]
623
6249:
625        TBZ x1, 0, 10f
626        STR s30,  [x7]
627        STR s28, [x13]
628        STR s26, [x18]
629        STR s24, [x17]
630        STR s22, [x16]
631        STR s20,  [x6]
63210:
633        # Restore d8-d15 from stack
634        LDP d14, d15, [sp, 48]
635        LDP d12, d13, [sp, 32]
636        LDP d10, d11, [sp, 16]
637        LDP  d8,  d9, [sp], 64
638        RET
639
640END_FUNCTION xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a57
641
// On ELF targets only, emit a .note.GNU-stack section with an empty flags
// string.
// NOTE(review): this is the conventional marker that the object does not
// require an executable stack -- confirm against the toolchain in use.
642#ifdef __ELF__
643.section ".note.GNU-stack","",%progbits
644#endif
645