• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1// Auto-generated file. Do not edit!
2//   Template: src/f32-gemm/6x8-aarch64-neonfma-cortex-a75.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2019 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10#include <xnnpack/assembly.h>
11
# void xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x14
#     const union xnn_f32_output_params params[restrict static 1])  [sp + 8] -> x8

# Computes an (up to) 6 row x 8 column tile of C = clamp(bias + A * B) per
# pass of the outer loop, and repeats for each group of 8 output columns.
#
#   mr         number of valid rows. Row pointers for rows >= mr are aliased
#              to the previous row, so the same code runs for any mr in 1..6.
#   nc         number of output columns, counted in floats. 8 are consumed
#              per pass, and a final partial pass stores 4 / 2 / 1 columns.
#   kc         reduction length in BYTES (the A pointers are rewound by kc
#              after every pass). The remainder code tests bits 4, 3 and 2
#              only, so kc is presumably a multiple of 4 - confirm with caller.
#   w          packed weights: 8 bias floats, followed by 8 floats of B for
#              every k step, for each group of 8 columns.
#   cn_stride  byte distance added to every C row pointer after a full pass.
#   params     two consecutive floats: the first is the upper bound (applied
#              with FMIN), the second is the lower bound (applied with FMAX).
#
# After mr has been consumed by the pointer setup, x0 is reused as the
# running k counter.

# d8-d15 need to be preserved if used.
# x19-30 need to be preserved if used.
# x18 is the AAPCS64 platform register and must not be written here.
# All general purpose registers used below are caller-saved (x0-x17).

# A pointers
#  x3 a0
#  x9 a1
# x10 a2
# x11 a3
# x12 a4
#  x4 a5

# C pointers
#  x6 c0
# x16 c1
# x17 c2
# x15 c3
# x13 c4
#  x7 c5

# Vector register usage
# A0   v0  v6
# A1   v1  v7
# A2   v2  v8
# A3   v3  v9
# A4   v4 v10
# A5   v5 v11
# B   v12 v13 v14 v15
# B   v16 v17 v18 v19
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# C   v30 v31
# Clamp v6 v7

BEGIN_FUNCTION xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57

        # Clamp A and C pointers / Save d8-d15 on stack
        # Each CMP feeds two pairs of CSEL: the LO pair directly after it and
        # the LS pair in the following group (ADD and STP leave flags intact).
        STP  d8,  d9, [sp, -64]!
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        STP d10, d11, [sp, 16]
        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        STP d12, d13, [sp, 32]
        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x15, x17, x7         // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x15, x17, x15, LO   //   c3 = c2

        STP d14, d15, [sp, 48]
        ADD x12, x11, x4         // a4 = a3 + a_stride
        ADD x13, x15, x7         // c4 = c3 + cm_stride
                                 // if mr <= 4
        CSEL x12, x11, x12, LS   //   a4 = a3
        CSEL x13, x15, x13, LS   //   c4 = c3

        # Load params pointer
        # The 64 bytes pushed above shift the stack arguments: the original
        # [sp + 8] is now at [sp, 72] and the original [sp] at [sp, 64].
        LDR x8, [sp, 72]

        CMP x0, 6                // if mr < 6
        ADD x4, x12, x4          // a5 = a4 + a_stride
        ADD x7, x13, x7          // c5 = c4 + cm_stride
        CSEL x4, x12, x4, LO     //   a5 = a4
        CSEL x7, x13, x7, LO     //   c5 = c4

        # Load cn_stride
        LDR x14, [sp, 64]

        # Outer loop: one pass per group of 8 output columns
0:
        # Load initial bias from w into accumulators
        # The same 8 bias values seed all six rows.
        LDP q20, q21, [x5], 32
        MOV v22.16b, v20.16b
        MOV v23.16b, v21.16b
        MOV v24.16b, v20.16b
        MOV v25.16b, v21.16b
        MOV v26.16b, v20.16b
        MOV v27.16b, v21.16b
        MOV v28.16b, v20.16b
        MOV v29.16b, v21.16b
        MOV v30.16b, v20.16b
        MOV v31.16b, v21.16b

        # Are there at least 8 floats (32 bytes) for prologue + epilogue?
        SUBS x0, x2, 32  // k = kc - 32
        B.LO 4f

        # Prologue - loads for main loop of 96 FMA
        LDR   q0,  [x3], 16
        LDR   q1,  [x9], 16
        LDR   q2, [x10], 16
        LDR   q3, [x11], 16
        LDR   q4, [x12], 16
        LDR   q5,  [x4], 16
        LDP  q12,  q13, [x5], 32  // Fetch 3 B (4th deferred)
        LDP  q14,  q15, [x5], 32
        LDP  q16,  q17, [x5], 32

        # Are there at least 8 floats (32 bytes) for main loop?
        SUBS x0, x0, 32
        B.LO 2f

        # Main loop - 8 floats of A (32 bytes)
        # 96 FMA + 12 LDR A + 8 LDP B
        # Loads for the next step are interleaved with the FMAs of this one.
1:
        # First group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s,  v0.s[0]
        LDP  q18,  q19, [x5], 32      // Load last B
        FMLA v22.4s, v12.4s,  v1.s[0]
        FMLA v24.4s, v12.4s,  v2.s[0]
        FMLA v26.4s, v12.4s,  v3.s[0]
        FMLA v28.4s, v12.4s,  v4.s[0]
        FMLA v30.4s, v12.4s,  v5.s[0]
        FMLA v21.4s, v13.4s,  v0.s[0]
        FMLA v23.4s, v13.4s,  v1.s[0]
        FMLA v25.4s, v13.4s,  v2.s[0]
        FMLA v27.4s, v13.4s,  v3.s[0]
        FMLA v29.4s, v13.4s,  v4.s[0]

        FMLA v31.4s, v13.4s,  v5.s[0]
        FMLA v20.4s, v14.4s,  v0.s[1]
        FMLA v22.4s, v14.4s,  v1.s[1]
        FMLA v24.4s, v14.4s,  v2.s[1]
        FMLA v26.4s, v14.4s,  v3.s[1]
        FMLA v28.4s, v14.4s,  v4.s[1]
        FMLA v30.4s, v14.4s,  v5.s[1]
        FMLA v21.4s, v15.4s,  v0.s[1]
        FMLA v23.4s, v15.4s,  v1.s[1]
        FMLA v25.4s, v15.4s,  v2.s[1]
        LDR   q6,  [x3], 16            // Load next 6 A
        FMLA v27.4s, v15.4s,  v3.s[1]
        FMLA v29.4s, v15.4s,  v4.s[1]
        FMLA v31.4s, v15.4s,  v5.s[1]
        LDR   q7,  [x9], 16

        FMLA v20.4s, v16.4s,  v0.s[2]
        FMLA v22.4s, v16.4s,  v1.s[2]
        FMLA v24.4s, v16.4s,  v2.s[2]
        LDR   q8, [x10], 16
        FMLA v26.4s, v16.4s,  v3.s[2]
        FMLA v28.4s, v16.4s,  v4.s[2]
        FMLA v30.4s, v16.4s,  v5.s[2]
        LDR   q9, [x11], 16
        FMLA v21.4s, v17.4s,  v0.s[2]
        FMLA v23.4s, v17.4s,  v1.s[2]
        FMLA v25.4s, v17.4s,  v2.s[2]
        LDR   q10, [x12], 16
        FMLA v27.4s, v17.4s,  v3.s[2]
        FMLA v29.4s, v17.4s,  v4.s[2]
        FMLA v31.4s, v17.4s,  v5.s[2]
        LDR  q11,  [x4], 16

        FMLA v20.4s, v18.4s,  v0.s[3]
        FMLA v22.4s, v18.4s,  v1.s[3]
        FMLA v24.4s, v18.4s,  v2.s[3]
        LDP  q12,  q13, [x5], 32       // Load 4 B
        FMLA v26.4s, v18.4s,  v3.s[3]
        FMLA v28.4s, v18.4s,  v4.s[3]
        FMLA v30.4s, v18.4s,  v5.s[3]
        LDP  q14,  q15, [x5], 32
        FMLA v21.4s, v19.4s,  v0.s[3]
        FMLA v23.4s, v19.4s,  v1.s[3]
        FMLA v25.4s, v19.4s,  v2.s[3]
        LDP  q16,  q17, [x5], 32
        FMLA v27.4s, v19.4s,  v3.s[3]
        FMLA v29.4s, v19.4s,  v4.s[3]
        FMLA v31.4s, v19.4s,  v5.s[3]
        LDP  q18,  q19, [x5], 32

        # Second group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s,  v6.s[0]
        FMLA v22.4s, v12.4s,  v7.s[0]
        FMLA v24.4s, v12.4s,  v8.s[0]
        LDR   q0,  [x3], 16           // Load next 6 A
        FMLA v26.4s, v12.4s,  v9.s[0]
        FMLA v28.4s, v12.4s, v10.s[0]
        FMLA v30.4s, v12.4s, v11.s[0]
        LDR   q1,  [x9], 16
        FMLA v21.4s, v13.4s,  v6.s[0]
        FMLA v23.4s, v13.4s,  v7.s[0]
        FMLA v25.4s, v13.4s,  v8.s[0]
        LDR   q2, [x10], 16
        FMLA v27.4s, v13.4s,  v9.s[0]
        FMLA v29.4s, v13.4s, v10.s[0]
        FMLA v31.4s, v13.4s, v11.s[0]
        LDR   q3, [x11], 16

        FMLA v20.4s, v14.4s,  v6.s[1]
        FMLA v22.4s, v14.4s,  v7.s[1]
        FMLA v24.4s, v14.4s,  v8.s[1]
        LDR   q4, [x12], 16
        FMLA v26.4s, v14.4s,  v9.s[1]
        FMLA v28.4s, v14.4s, v10.s[1]
        FMLA v30.4s, v14.4s, v11.s[1]
        LDR   q5,  [x4], 16
        FMLA v21.4s, v15.4s,  v6.s[1]
        FMLA v23.4s, v15.4s,  v7.s[1]
        FMLA v25.4s, v15.4s,  v8.s[1]
        LDP  q12,  q13, [x5], 32       // Load next 3 B (not last)
        FMLA v27.4s, v15.4s,  v9.s[1]
        FMLA v29.4s, v15.4s, v10.s[1]
        FMLA v31.4s, v15.4s, v11.s[1]
        LDP  q14,  q15, [x5], 32

        FMLA v20.4s, v16.4s,  v6.s[2]
        FMLA v22.4s, v16.4s,  v7.s[2]
        FMLA v24.4s, v16.4s,  v8.s[2]
        FMLA v26.4s, v16.4s,  v9.s[2]
        FMLA v28.4s, v16.4s, v10.s[2]
        FMLA v30.4s, v16.4s, v11.s[2]
        FMLA v21.4s, v17.4s,  v6.s[2]
        FMLA v23.4s, v17.4s,  v7.s[2]
        FMLA v25.4s, v17.4s,  v8.s[2]
        FMLA v27.4s, v17.4s,  v9.s[2]
        FMLA v29.4s, v17.4s, v10.s[2]
        FMLA v31.4s, v17.4s, v11.s[2]
        LDP  q16,  q17, [x5], 32

        FMLA v20.4s, v18.4s,  v6.s[3]
        FMLA v22.4s, v18.4s,  v7.s[3]
        SUBS x0, x0, 32               // k -= 32; flags consumed by B.HS below
        FMLA v24.4s, v18.4s,  v8.s[3]
        FMLA v26.4s, v18.4s,  v9.s[3]
        FMLA v28.4s, v18.4s, v10.s[3]
        FMLA v30.4s, v18.4s, v11.s[3]
        FMLA v21.4s, v19.4s,  v6.s[3]
        FMLA v23.4s, v19.4s,  v7.s[3]
        FMLA v25.4s, v19.4s,  v8.s[3]
        FMLA v27.4s, v19.4s,  v9.s[3]
        FMLA v29.4s, v19.4s, v10.s[3]
        FMLA v31.4s, v19.4s, v11.s[3]
        B.HS 1b

        # Epilogue - 8 floats of A (32 bytes)
        # 96 FMA + 6 LDR A + 5 LDP B
        # First block same as main loop.  Second block has no preloads.
2:
        # First group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s,  v0.s[0]
        LDP  q18,  q19, [x5], 32      // Load last B
        FMLA v22.4s, v12.4s,  v1.s[0]
        FMLA v24.4s, v12.4s,  v2.s[0]
        FMLA v26.4s, v12.4s,  v3.s[0]
        FMLA v28.4s, v12.4s,  v4.s[0]
        FMLA v30.4s, v12.4s,  v5.s[0]
        FMLA v21.4s, v13.4s,  v0.s[0]
        FMLA v23.4s, v13.4s,  v1.s[0]
        FMLA v25.4s, v13.4s,  v2.s[0]
        FMLA v27.4s, v13.4s,  v3.s[0]
        FMLA v29.4s, v13.4s,  v4.s[0]

        FMLA v31.4s, v13.4s,  v5.s[0]
        FMLA v20.4s, v14.4s,  v0.s[1]
        FMLA v22.4s, v14.4s,  v1.s[1]
        FMLA v24.4s, v14.4s,  v2.s[1]
        FMLA v26.4s, v14.4s,  v3.s[1]
        FMLA v28.4s, v14.4s,  v4.s[1]
        FMLA v30.4s, v14.4s,  v5.s[1]
        FMLA v21.4s, v15.4s,  v0.s[1]
        FMLA v23.4s, v15.4s,  v1.s[1]
        FMLA v25.4s, v15.4s,  v2.s[1]
        LDR   q6,  [x3], 16            // Load next 6 A
        FMLA v27.4s, v15.4s,  v3.s[1]
        FMLA v29.4s, v15.4s,  v4.s[1]
        FMLA v31.4s, v15.4s,  v5.s[1]
        LDR   q7,  [x9], 16

        FMLA v20.4s, v16.4s,  v0.s[2]
        FMLA v22.4s, v16.4s,  v1.s[2]
        FMLA v24.4s, v16.4s,  v2.s[2]
        LDR   q8, [x10], 16
        FMLA v26.4s, v16.4s,  v3.s[2]
        FMLA v28.4s, v16.4s,  v4.s[2]
        FMLA v30.4s, v16.4s,  v5.s[2]
        LDR   q9, [x11], 16
        FMLA v21.4s, v17.4s,  v0.s[2]
        FMLA v23.4s, v17.4s,  v1.s[2]
        FMLA v25.4s, v17.4s,  v2.s[2]
        LDR   q10, [x12], 16
        FMLA v27.4s, v17.4s,  v3.s[2]
        FMLA v29.4s, v17.4s,  v4.s[2]
        FMLA v31.4s, v17.4s,  v5.s[2]
        LDR  q11,  [x4], 16

        FMLA v20.4s, v18.4s,  v0.s[3]
        FMLA v22.4s, v18.4s,  v1.s[3]
        FMLA v24.4s, v18.4s,  v2.s[3]
        LDP  q12,  q13, [x5], 32       // Load 4 B
        FMLA v26.4s, v18.4s,  v3.s[3]
        FMLA v28.4s, v18.4s,  v4.s[3]
        FMLA v30.4s, v18.4s,  v5.s[3]
        LDP  q14,  q15, [x5], 32
        FMLA v21.4s, v19.4s,  v0.s[3]
        FMLA v23.4s, v19.4s,  v1.s[3]
        FMLA v25.4s, v19.4s,  v2.s[3]
        LDP  q16,  q17, [x5], 32
        FMLA v27.4s, v19.4s,  v3.s[3]
        FMLA v29.4s, v19.4s,  v4.s[3]
        FMLA v31.4s, v19.4s,  v5.s[3]
        LDP  q18,  q19, [x5], 32

        # Second group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s,  v6.s[0]
        FMLA v22.4s, v12.4s,  v7.s[0]
        FMLA v24.4s, v12.4s,  v8.s[0]
        FMLA v26.4s, v12.4s,  v9.s[0]
        FMLA v28.4s, v12.4s, v10.s[0]
        FMLA v30.4s, v12.4s, v11.s[0]
        FMLA v21.4s, v13.4s,  v6.s[0]
        FMLA v23.4s, v13.4s,  v7.s[0]
        FMLA v25.4s, v13.4s,  v8.s[0]
        FMLA v27.4s, v13.4s,  v9.s[0]
        FMLA v29.4s, v13.4s, v10.s[0]
        FMLA v31.4s, v13.4s, v11.s[0]

        FMLA v20.4s, v14.4s,  v6.s[1]
        FMLA v22.4s, v14.4s,  v7.s[1]
        FMLA v24.4s, v14.4s,  v8.s[1]
        FMLA v26.4s, v14.4s,  v9.s[1]
        FMLA v28.4s, v14.4s, v10.s[1]
        FMLA v30.4s, v14.4s, v11.s[1]
        FMLA v21.4s, v15.4s,  v6.s[1]
        FMLA v23.4s, v15.4s,  v7.s[1]
        FMLA v25.4s, v15.4s,  v8.s[1]
        FMLA v27.4s, v15.4s,  v9.s[1]
        FMLA v29.4s, v15.4s, v10.s[1]
        FMLA v31.4s, v15.4s, v11.s[1]

        FMLA v20.4s, v16.4s,  v6.s[2]
        FMLA v22.4s, v16.4s,  v7.s[2]
        FMLA v24.4s, v16.4s,  v8.s[2]
        FMLA v26.4s, v16.4s,  v9.s[2]
        FMLA v28.4s, v16.4s, v10.s[2]
        FMLA v30.4s, v16.4s, v11.s[2]
        FMLA v21.4s, v17.4s,  v6.s[2]
        FMLA v23.4s, v17.4s,  v7.s[2]
        FMLA v25.4s, v17.4s,  v8.s[2]
        FMLA v27.4s, v17.4s,  v9.s[2]
        FMLA v29.4s, v17.4s, v10.s[2]
        FMLA v31.4s, v17.4s, v11.s[2]

        FMLA v20.4s, v18.4s,  v6.s[3]
        FMLA v22.4s, v18.4s,  v7.s[3]
        FMLA v24.4s, v18.4s,  v8.s[3]
        FMLA v26.4s, v18.4s,  v9.s[3]
        FMLA v28.4s, v18.4s, v10.s[3]
        FMLA v30.4s, v18.4s, v11.s[3]
        FMLA v21.4s, v19.4s,  v6.s[3]
        FMLA v23.4s, v19.4s,  v7.s[3]

        # Load clamping_params values
        # v6 and v7 were last read as A operands just above, so they are
        # free to be overwritten with the two clamp bounds from here on.
        LD2R {v6.4s, v7.4s}, [x8]

        FMLA v25.4s, v19.4s,  v8.s[3]
        FMLA v27.4s, v19.4s,  v9.s[3]
        # Is there a remainder?- 4 floats of A (16 bytes) or less
        # Subtracting multiples of 32 never changes the low 5 bits, so
        # x0 & 31 equals kc & 31 here.
        TST x0, 31
        FMLA v29.4s, v19.4s, v10.s[3]
        FMLA v31.4s, v19.4s, v11.s[3]
        B.NE 4f

        # Clamp
3:
        FMIN v20.4s, v20.4s, v6.4s
        SUBS x1, x1, 8                // nc -= 8; flags consumed by B.LO and B.HI below
        FMIN v21.4s, v21.4s, v6.4s
        FMIN v22.4s, v22.4s, v6.4s
        FMIN v23.4s, v23.4s, v6.4s
        FMIN v24.4s, v24.4s, v6.4s
        FMIN v25.4s, v25.4s, v6.4s
        FMIN v26.4s, v26.4s, v6.4s
        FMIN v27.4s, v27.4s, v6.4s
        FMIN v28.4s, v28.4s, v6.4s
        FMIN v29.4s, v29.4s, v6.4s
        FMIN v30.4s, v30.4s, v6.4s
        FMIN v31.4s, v31.4s, v6.4s
        FMAX v20.4s, v20.4s, v7.4s
        FMAX v21.4s, v21.4s, v7.4s
        FMAX v22.4s, v22.4s, v7.4s
        FMAX v23.4s, v23.4s, v7.4s
        FMAX v24.4s, v24.4s, v7.4s
        FMAX v25.4s, v25.4s, v7.4s
        FMAX v26.4s, v26.4s, v7.4s
        FMAX v27.4s, v27.4s, v7.4s
        FMAX v28.4s, v28.4s, v7.4s
        FMAX v29.4s, v29.4s, v7.4s
        FMAX v30.4s, v30.4s, v7.4s
        FMAX v31.4s, v31.4s, v7.4s

        # Store full 6 x 8
        # Fewer than 8 columns were left: take the partial store path.
        B.LO 7f

        # Each row: store 8 floats, step the C pointer by cn_stride and
        # rewind the A pointer to the start of its row for the next pass.
        STP q20, q21,  [x6]
        ADD  x6,  x6, x14
        SUB  x3,  x3, x2 // a0 -= kc
        STP q22, q23, [x16]
        ADD x16, x16, x14
        SUB  x9,  x9, x2 // a1 -= kc
        STP q24, q25, [x17]
        ADD x17, x17, x14
        SUB x10, x10, x2 // a2 -= kc
        STP q26, q27, [x15]
        ADD x15, x15, x14
        SUB x11, x11, x2 // a3 -= kc
        STP q28, q29, [x13]
        ADD x13, x13, x14
        SUB x12, x12, x2 // a4 -= kc
        STP q30, q31,  [x7]
        ADD x7, x7, x14
        SUB  x4,  x4, x2 // a5 -= kc

        # More than 8 columns were left before this pass: go around again.
        B.HI 0b

        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP  d8,  d9, [sp], 64
        RET

        # K remainder. Reached either straight from the top of a pass when
        # kc < 32, or from the epilogue when kc is not a multiple of 32.
4:
        # Load clamping_params values
        LD2R {v6.4s, v7.4s}, [x8]

        # Is there a remainder?- 4 floats of A (16 bytes)
        TBZ x0, 4, 5f

        # Remainder- 4 floats of A (16 bytes)
        # Load A
        LDR   q0,  [x3], 16
        LDR   q1,  [x9], 16
        LDR   q2, [x10], 16
        LDR   q3, [x11], 16
        LDR   q4, [x12], 16
        LDR   q5,  [x4], 16
        # Load B
        LDP  q12,  q13, [x5], 32
        LDP  q14,  q15, [x5], 32
        LDP  q16,  q17, [x5], 32
        LDP  q18,  q19, [x5], 32

        FMLA v20.4s, v12.4s,  v0.s[0]
        FMLA v22.4s, v12.4s,  v1.s[0]
        FMLA v24.4s, v12.4s,  v2.s[0]
        FMLA v26.4s, v12.4s,  v3.s[0]
        FMLA v28.4s, v12.4s,  v4.s[0]
        FMLA v30.4s, v12.4s,  v5.s[0]
        FMLA v21.4s, v13.4s,  v0.s[0]
        FMLA v23.4s, v13.4s,  v1.s[0]
        FMLA v25.4s, v13.4s,  v2.s[0]
        FMLA v27.4s, v13.4s,  v3.s[0]
        FMLA v29.4s, v13.4s,  v4.s[0]
        FMLA v31.4s, v13.4s,  v5.s[0]

        FMLA v20.4s, v14.4s,  v0.s[1]
        FMLA v22.4s, v14.4s,  v1.s[1]
        FMLA v24.4s, v14.4s,  v2.s[1]
        FMLA v26.4s, v14.4s,  v3.s[1]
        FMLA v28.4s, v14.4s,  v4.s[1]
        FMLA v30.4s, v14.4s,  v5.s[1]
        FMLA v21.4s, v15.4s,  v0.s[1]
        FMLA v23.4s, v15.4s,  v1.s[1]
        FMLA v25.4s, v15.4s,  v2.s[1]
        FMLA v27.4s, v15.4s,  v3.s[1]
        FMLA v29.4s, v15.4s,  v4.s[1]
        FMLA v31.4s, v15.4s,  v5.s[1]

        FMLA v20.4s, v16.4s,  v0.s[2]
        FMLA v22.4s, v16.4s,  v1.s[2]
        FMLA v24.4s, v16.4s,  v2.s[2]
        FMLA v26.4s, v16.4s,  v3.s[2]
        FMLA v28.4s, v16.4s,  v4.s[2]
        FMLA v30.4s, v16.4s,  v5.s[2]
        FMLA v21.4s, v17.4s,  v0.s[2]
        FMLA v23.4s, v17.4s,  v1.s[2]
        FMLA v25.4s, v17.4s,  v2.s[2]
        FMLA v27.4s, v17.4s,  v3.s[2]
        FMLA v29.4s, v17.4s,  v4.s[2]
        FMLA v31.4s, v17.4s,  v5.s[2]

        FMLA v20.4s, v18.4s,  v0.s[3]
        FMLA v22.4s, v18.4s,  v1.s[3]
        FMLA v24.4s, v18.4s,  v2.s[3]
        FMLA v26.4s, v18.4s,  v3.s[3]
        FMLA v28.4s, v18.4s,  v4.s[3]
        FMLA v30.4s, v18.4s,  v5.s[3]
        FMLA v21.4s, v19.4s,  v0.s[3]
        FMLA v23.4s, v19.4s,  v1.s[3]
        FMLA v25.4s, v19.4s,  v2.s[3]
        FMLA v27.4s, v19.4s,  v3.s[3]
        FMLA v29.4s, v19.4s,  v4.s[3]
        FMLA v31.4s, v19.4s,  v5.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes)
5:
        TBZ x0, 3, 6f

        # Remainder- 2 floats of A (8 bytes)
        # Load A
        LDR   d0,  [x3], 8
        LDR   d1,  [x9], 8
        LDR   d2, [x10], 8
        LDR   d3, [x11], 8
        LDR   d4, [x12], 8
        LDR   d5,  [x4], 8
        # Load B
        LDP  q12,  q13, [x5], 32
        LDP  q14,  q15, [x5], 32

        FMLA v20.4s, v12.4s,  v0.s[0]
        FMLA v22.4s, v12.4s,  v1.s[0]
        FMLA v24.4s, v12.4s,  v2.s[0]
        FMLA v26.4s, v12.4s,  v3.s[0]
        FMLA v28.4s, v12.4s,  v4.s[0]
        FMLA v30.4s, v12.4s,  v5.s[0]
        FMLA v21.4s, v13.4s,  v0.s[0]
        FMLA v23.4s, v13.4s,  v1.s[0]
        FMLA v25.4s, v13.4s,  v2.s[0]
        FMLA v27.4s, v13.4s,  v3.s[0]
        FMLA v29.4s, v13.4s,  v4.s[0]
        FMLA v31.4s, v13.4s,  v5.s[0]

        FMLA v20.4s, v14.4s,  v0.s[1]
        FMLA v22.4s, v14.4s,  v1.s[1]
        FMLA v24.4s, v14.4s,  v2.s[1]
        FMLA v26.4s, v14.4s,  v3.s[1]
        FMLA v28.4s, v14.4s,  v4.s[1]
        FMLA v30.4s, v14.4s,  v5.s[1]
        FMLA v21.4s, v15.4s,  v0.s[1]
        FMLA v23.4s, v15.4s,  v1.s[1]
        FMLA v25.4s, v15.4s,  v2.s[1]
        FMLA v27.4s, v15.4s,  v3.s[1]
        FMLA v29.4s, v15.4s,  v4.s[1]
        FMLA v31.4s, v15.4s,  v5.s[1]

        # Is there a remainder?- 1 float of A (4 bytes)
6:
        TBZ x0, 2, 3b

        # Remainder- 1 float of A (4 bytes)
        # Load A
        LDR   s0,  [x3], 4
        LDR   s1,  [x9], 4
        LDR   s2, [x10], 4
        LDR   s3, [x11], 4
        LDR   s4, [x12], 4
        LDR   s5,  [x4], 4
        # Load B
        LDP  q12,  q13, [x5], 32

        FMLA v20.4s, v12.4s,  v0.s[0]
        FMLA v22.4s, v12.4s,  v1.s[0]
        FMLA v24.4s, v12.4s,  v2.s[0]
        FMLA v26.4s, v12.4s,  v3.s[0]
        FMLA v28.4s, v12.4s,  v4.s[0]
        FMLA v30.4s, v12.4s,  v5.s[0]
        FMLA v21.4s, v13.4s,  v0.s[0]
        FMLA v23.4s, v13.4s,  v1.s[0]
        FMLA v25.4s, v13.4s,  v2.s[0]
        FMLA v27.4s, v13.4s,  v3.s[0]
        FMLA v29.4s, v13.4s,  v4.s[0]
        FMLA v31.4s, v13.4s,  v5.s[0]
        B 3b

        # Store odd width
        # x1 went below zero in the SUBS at label 3, but subtracting 8 leaves
        # bits 2..0 unchanged, so they still give the columns left (4, 2, 1).
        # After each partial store the unsaved columns are shifted down into
        # the low lanes of the first register of the row.
7:
        TBZ x1, 2, 8f
        STR q20,  [x6], 16
        MOV v20.16b, v21.16b
        STR q22, [x16], 16
        MOV v22.16b, v23.16b
        STR q24, [x17], 16
        MOV v24.16b, v25.16b
        STR q26, [x15], 16
        MOV v26.16b, v27.16b
        STR q28, [x13], 16
        MOV v28.16b, v29.16b
        STR q30,  [x7], 16
        MOV v30.16b, v31.16b
8:
        TBZ x1, 1, 9f
        STR d20,  [x6], 8
        DUP d20, v20.d[1]
        STR d22, [x16], 8
        DUP d22, v22.d[1]
        STR d24, [x17], 8
        DUP d24, v24.d[1]
        STR d26, [x15], 8
        DUP d26, v26.d[1]
        STR d28, [x13], 8
        DUP d28, v28.d[1]
        STR d30,  [x7], 8
        DUP d30, v30.d[1]

9:
        TBZ x1, 0, 10f
        STR s20,  [x6]
        STR s22, [x16]
        STR s24, [x17]
        STR s26, [x15]
        STR s28, [x13]
        STR s30,  [x7]
10:
        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP  d8,  d9, [sp], 64
        RET

END_FUNCTION xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif
649