// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/6x8-aarch64-neonfma-cortex-a75.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemminc_minmax_ukernel_6x8__aarch64_neonfma_cortex_a57(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> (x0)
#     const float*restrict acc,  [sp + 8] -> x15
#     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 16] -> x8

# Computes up to 6 rows by 8 columns of C per pass: the accumulators start
# from the 48 floats read at acc, B vectors read from w are multiplied by
# A scalars and added with FMLA, and the result is clamped with the two
# floats read at params before it is stored to C.
# kc is consumed in bytes (32 bytes = 8 floats of A per row and iteration),
# nc is consumed in columns (8 per pass).
# x0 holds mr only while the row pointers are set up. Afterwards it is reused
# as the k counter and, from the clamp onwards, as cn_stride.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# A pointers
#  x3 a0
#  x9 a1
# x10 a2
# x11 a3
# x12 a4
#  x4 a5

# C pointers
#  x6 c0
# x16 c1
# x17 c2
# x14 c3
# x13 c4
#  x7 c5

# Vector register usage
# A0   v0  v6
# A1   v1  v7
# A2   v2  v8
# A3   v3  v9
# A4   v4 v10
# A5   v5 v11
# B   v12 v13 v14 v15
# B   v16 v17 v18 v19
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# C   v30 v31
# Clamp v6 v7

BEGIN_FUNCTION xnn_f32_gemminc_minmax_ukernel_6x8__aarch64_neonfma_cortex_a57

        # Load acc, params pointer
        # Read before sp is moved, so the offsets match the signature above.
        LDP x15, x8, [sp, 8]

        # Clamp A and C pointers / Save d8-d15 on stack
        # Rows at index >= mr reuse the pointers of the previous row.
        # Each CMP feeds two pairs of CSEL: LO for the odd bound and LS for
        # the following even bound. The ADD and STP in between leave the
        # flags untouched.
        STP  d8,  d9, [sp, -64]!
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        STP d10, d11, [sp, 16]
        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        STP d12, d13, [sp, 32]
        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x14, x17, x7         // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x14, x17, x14, LO   //   c3 = c2

        STP d14, d15, [sp, 48]
        ADD x12, x11, x4         // a4 = a3 + a_stride
        ADD x13, x14, x7         // c4 = c3 + cm_stride
                                 // if mr <= 4
        CSEL x12, x11, x12, LS   //   a4 = a3
        CSEL x13, x14, x13, LS   //   c4 = c3

        CMP x0, 6                // if mr < 6
        ADD x4, x12, x4          // a5 = a4 + a_stride
        ADD x7, x13, x7          // c5 = c4 + cm_stride
        CSEL x4, x12, x4, LO     //   a5 = a4
        CSEL x7, x13, x7, LO     //   c5 = c4

        # Start of one pass over kc, producing one tile of 8 columns.
0:
        # Load initial accumulators
        # 12 vectors (192 bytes) per pass. x15 keeps advancing across passes.
        LDP q20, q21, [x15], 32
        LDP q22, q23, [x15], 32
        LDP q24, q25, [x15], 32
        LDP q26, q27, [x15], 32
        LDP q28, q29, [x15], 32
        LDP q30, q31, [x15], 32

        # Are there at least 8 floats (32 bytes) for prologue + epilogue?
        SUBS x0, x2, 32  // k = kc - 32
        B.LO 4f

        # Prologue - loads for main loop of 96 FMA
        # 6 LDR A + 3 LDP B
        LDR   q0,  [x3], 16
        LDR   q1,  [x9], 16
        LDR   q2, [x10], 16
        LDR   q3, [x11], 16
        LDR   q4, [x12], 16
        LDR   q5,  [x4], 16
        LDP  q12,  q13, [x5], 32  // Fetch 3 B (4th deferred)
        LDP  q14,  q15, [x5], 32
        LDP  q16,  q17, [x5], 32

        # Are there at least 8 floats (32 bytes) for main loop?
        SUBS x0, x0, 32
        B.LO 2f

        # Main loop - 8 floats of A (32 bytes)
        # 96 FMA + 12 LDR A + 8 LDP B
        # Each load is placed after the last FMLA that reads the register it
        # overwrites, so the instruction order must be preserved.
1:
        # First group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s,  v0.s[0]
        LDP  q18,  q19, [x5], 32      // Load last B
        FMLA v22.4s, v12.4s,  v1.s[0]
        FMLA v24.4s, v12.4s,  v2.s[0]
        FMLA v26.4s, v12.4s,  v3.s[0]
        FMLA v28.4s, v12.4s,  v4.s[0]
        FMLA v30.4s, v12.4s,  v5.s[0]
        FMLA v21.4s, v13.4s,  v0.s[0]
        FMLA v23.4s, v13.4s,  v1.s[0]
        FMLA v25.4s, v13.4s,  v2.s[0]
        FMLA v27.4s, v13.4s,  v3.s[0]
        FMLA v29.4s, v13.4s,  v4.s[0]

        FMLA v31.4s, v13.4s,  v5.s[0]
        FMLA v20.4s, v14.4s,  v0.s[1]
        FMLA v22.4s, v14.4s,  v1.s[1]
        FMLA v24.4s, v14.4s,  v2.s[1]
        FMLA v26.4s, v14.4s,  v3.s[1]
        FMLA v28.4s, v14.4s,  v4.s[1]
        FMLA v30.4s, v14.4s,  v5.s[1]
        FMLA v21.4s, v15.4s,  v0.s[1]
        FMLA v23.4s, v15.4s,  v1.s[1]
        FMLA v25.4s, v15.4s,  v2.s[1]
        LDR   q6,  [x3], 16            // Load next 6 A
        FMLA v27.4s, v15.4s,  v3.s[1]
        FMLA v29.4s, v15.4s,  v4.s[1]
        FMLA v31.4s, v15.4s,  v5.s[1]
        LDR   q7,  [x9], 16

        FMLA v20.4s, v16.4s,  v0.s[2]
        FMLA v22.4s, v16.4s,  v1.s[2]
        FMLA v24.4s, v16.4s,  v2.s[2]
        LDR   q8, [x10], 16
        FMLA v26.4s, v16.4s,  v3.s[2]
        FMLA v28.4s, v16.4s,  v4.s[2]
        FMLA v30.4s, v16.4s,  v5.s[2]
        LDR   q9, [x11], 16
        FMLA v21.4s, v17.4s,  v0.s[2]
        FMLA v23.4s, v17.4s,  v1.s[2]
        FMLA v25.4s, v17.4s,  v2.s[2]
        LDR   q10, [x12], 16
        FMLA v27.4s, v17.4s,  v3.s[2]
        FMLA v29.4s, v17.4s,  v4.s[2]
        FMLA v31.4s, v17.4s,  v5.s[2]
        LDR  q11,  [x4], 16

        FMLA v20.4s, v18.4s,  v0.s[3]
        FMLA v22.4s, v18.4s,  v1.s[3]
        FMLA v24.4s, v18.4s,  v2.s[3]
        LDP  q12,  q13, [x5], 32       // Load 4 B
        FMLA v26.4s, v18.4s,  v3.s[3]
        FMLA v28.4s, v18.4s,  v4.s[3]
        FMLA v30.4s, v18.4s,  v5.s[3]
        LDP  q14,  q15, [x5], 32
        FMLA v21.4s, v19.4s,  v0.s[3]
        FMLA v23.4s, v19.4s,  v1.s[3]
        FMLA v25.4s, v19.4s,  v2.s[3]
        LDP  q16,  q17, [x5], 32
        FMLA v27.4s, v19.4s,  v3.s[3]
        FMLA v29.4s, v19.4s,  v4.s[3]
        FMLA v31.4s, v19.4s,  v5.s[3]
        LDP  q18,  q19, [x5], 32

        # Second group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s,  v6.s[0]
        FMLA v22.4s, v12.4s,  v7.s[0]
        FMLA v24.4s, v12.4s,  v8.s[0]
        LDR   q0,  [x3], 16           // Load next 6 A
        FMLA v26.4s, v12.4s,  v9.s[0]
        FMLA v28.4s, v12.4s, v10.s[0]
        FMLA v30.4s, v12.4s, v11.s[0]
        LDR   q1,  [x9], 16
        FMLA v21.4s, v13.4s,  v6.s[0]
        FMLA v23.4s, v13.4s,  v7.s[0]
        FMLA v25.4s, v13.4s,  v8.s[0]
        LDR   q2, [x10], 16
        FMLA v27.4s, v13.4s,  v9.s[0]
        FMLA v29.4s, v13.4s, v10.s[0]
        FMLA v31.4s, v13.4s, v11.s[0]
        LDR   q3, [x11], 16

        FMLA v20.4s, v14.4s,  v6.s[1]
        FMLA v22.4s, v14.4s,  v7.s[1]
        FMLA v24.4s, v14.4s,  v8.s[1]
        LDR   q4, [x12], 16
        FMLA v26.4s, v14.4s,  v9.s[1]
        FMLA v28.4s, v14.4s, v10.s[1]
        FMLA v30.4s, v14.4s, v11.s[1]
        LDR   q5,  [x4], 16
        FMLA v21.4s, v15.4s,  v6.s[1]
        FMLA v23.4s, v15.4s,  v7.s[1]
        FMLA v25.4s, v15.4s,  v8.s[1]
        LDP  q12,  q13, [x5], 32       // Load next 3 B (not last)
        FMLA v27.4s, v15.4s,  v9.s[1]
        FMLA v29.4s, v15.4s, v10.s[1]
        FMLA v31.4s, v15.4s, v11.s[1]
        LDP  q14,  q15, [x5], 32

        FMLA v20.4s, v16.4s,  v6.s[2]
        FMLA v22.4s, v16.4s,  v7.s[2]
        FMLA v24.4s, v16.4s,  v8.s[2]
        FMLA v26.4s, v16.4s,  v9.s[2]
        FMLA v28.4s, v16.4s, v10.s[2]
        FMLA v30.4s, v16.4s, v11.s[2]
        FMLA v21.4s, v17.4s,  v6.s[2]
        FMLA v23.4s, v17.4s,  v7.s[2]
        FMLA v25.4s, v17.4s,  v8.s[2]
        FMLA v27.4s, v17.4s,  v9.s[2]
        FMLA v29.4s, v17.4s, v10.s[2]
        FMLA v31.4s, v17.4s, v11.s[2]
        LDP  q16,  q17, [x5], 32

        FMLA v20.4s, v18.4s,  v6.s[3]
        FMLA v22.4s, v18.4s,  v7.s[3]
        SUBS x0, x0, 32
        FMLA v24.4s, v18.4s,  v8.s[3]
        FMLA v26.4s, v18.4s,  v9.s[3]
        FMLA v28.4s, v18.4s, v10.s[3]
        FMLA v30.4s, v18.4s, v11.s[3]
        FMLA v21.4s, v19.4s,  v6.s[3]
        FMLA v23.4s, v19.4s,  v7.s[3]
        FMLA v25.4s, v19.4s,  v8.s[3]
        FMLA v27.4s, v19.4s,  v9.s[3]
        FMLA v29.4s, v19.4s, v10.s[3]
        FMLA v31.4s, v19.4s, v11.s[3]
        B.HS 1b

        # Epilogue - 8 floats of A (32 bytes)
        # 96 FMA + 6 LDR A + 5 LDP B
        # First block same as main loop.  Second block has no preloads.
2:
        # First group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s,  v0.s[0]
        LDP  q18,  q19, [x5], 32      // Load last B
        FMLA v22.4s, v12.4s,  v1.s[0]
        FMLA v24.4s, v12.4s,  v2.s[0]
        FMLA v26.4s, v12.4s,  v3.s[0]
        FMLA v28.4s, v12.4s,  v4.s[0]
        FMLA v30.4s, v12.4s,  v5.s[0]
        FMLA v21.4s, v13.4s,  v0.s[0]
        FMLA v23.4s, v13.4s,  v1.s[0]
        FMLA v25.4s, v13.4s,  v2.s[0]
        FMLA v27.4s, v13.4s,  v3.s[0]
        FMLA v29.4s, v13.4s,  v4.s[0]

        FMLA v31.4s, v13.4s,  v5.s[0]
        FMLA v20.4s, v14.4s,  v0.s[1]
        FMLA v22.4s, v14.4s,  v1.s[1]
        FMLA v24.4s, v14.4s,  v2.s[1]
        FMLA v26.4s, v14.4s,  v3.s[1]
        FMLA v28.4s, v14.4s,  v4.s[1]
        FMLA v30.4s, v14.4s,  v5.s[1]
        FMLA v21.4s, v15.4s,  v0.s[1]
        FMLA v23.4s, v15.4s,  v1.s[1]
        FMLA v25.4s, v15.4s,  v2.s[1]
        LDR   q6,  [x3], 16            // Load next 6 A
        FMLA v27.4s, v15.4s,  v3.s[1]
        FMLA v29.4s, v15.4s,  v4.s[1]
        FMLA v31.4s, v15.4s,  v5.s[1]
        LDR   q7,  [x9], 16

        FMLA v20.4s, v16.4s,  v0.s[2]
        FMLA v22.4s, v16.4s,  v1.s[2]
        FMLA v24.4s, v16.4s,  v2.s[2]
        LDR   q8, [x10], 16
        FMLA v26.4s, v16.4s,  v3.s[2]
        FMLA v28.4s, v16.4s,  v4.s[2]
        FMLA v30.4s, v16.4s,  v5.s[2]
        LDR   q9, [x11], 16
        FMLA v21.4s, v17.4s,  v0.s[2]
        FMLA v23.4s, v17.4s,  v1.s[2]
        FMLA v25.4s, v17.4s,  v2.s[2]
        LDR   q10, [x12], 16
        FMLA v27.4s, v17.4s,  v3.s[2]
        FMLA v29.4s, v17.4s,  v4.s[2]
        FMLA v31.4s, v17.4s,  v5.s[2]
        LDR  q11,  [x4], 16

        FMLA v20.4s, v18.4s,  v0.s[3]
        FMLA v22.4s, v18.4s,  v1.s[3]
        FMLA v24.4s, v18.4s,  v2.s[3]
        LDP  q12,  q13, [x5], 32       // Load 4 B
        FMLA v26.4s, v18.4s,  v3.s[3]
        FMLA v28.4s, v18.4s,  v4.s[3]
        FMLA v30.4s, v18.4s,  v5.s[3]
        LDP  q14,  q15, [x5], 32
        FMLA v21.4s, v19.4s,  v0.s[3]
        FMLA v23.4s, v19.4s,  v1.s[3]
        FMLA v25.4s, v19.4s,  v2.s[3]
        LDP  q16,  q17, [x5], 32
        FMLA v27.4s, v19.4s,  v3.s[3]
        FMLA v29.4s, v19.4s,  v4.s[3]
        FMLA v31.4s, v19.4s,  v5.s[3]
        LDP  q18,  q19, [x5], 32

        # Second group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s,  v6.s[0]
        FMLA v22.4s, v12.4s,  v7.s[0]
        FMLA v24.4s, v12.4s,  v8.s[0]
        FMLA v26.4s, v12.4s,  v9.s[0]
        FMLA v28.4s, v12.4s, v10.s[0]
        FMLA v30.4s, v12.4s, v11.s[0]
        FMLA v21.4s, v13.4s,  v6.s[0]
        FMLA v23.4s, v13.4s,  v7.s[0]
        FMLA v25.4s, v13.4s,  v8.s[0]
        FMLA v27.4s, v13.4s,  v9.s[0]
        FMLA v29.4s, v13.4s, v10.s[0]
        FMLA v31.4s, v13.4s, v11.s[0]

        FMLA v20.4s, v14.4s,  v6.s[1]
        FMLA v22.4s, v14.4s,  v7.s[1]
        FMLA v24.4s, v14.4s,  v8.s[1]
        FMLA v26.4s, v14.4s,  v9.s[1]
        FMLA v28.4s, v14.4s, v10.s[1]
        FMLA v30.4s, v14.4s, v11.s[1]
        FMLA v21.4s, v15.4s,  v6.s[1]
        FMLA v23.4s, v15.4s,  v7.s[1]
        FMLA v25.4s, v15.4s,  v8.s[1]
        FMLA v27.4s, v15.4s,  v9.s[1]
        FMLA v29.4s, v15.4s, v10.s[1]
        FMLA v31.4s, v15.4s, v11.s[1]

        FMLA v20.4s, v16.4s,  v6.s[2]
        FMLA v22.4s, v16.4s,  v7.s[2]
        FMLA v24.4s, v16.4s,  v8.s[2]
        FMLA v26.4s, v16.4s,  v9.s[2]
        FMLA v28.4s, v16.4s, v10.s[2]
        FMLA v30.4s, v16.4s, v11.s[2]
        FMLA v21.4s, v17.4s,  v6.s[2]
        FMLA v23.4s, v17.4s,  v7.s[2]
        FMLA v25.4s, v17.4s,  v8.s[2]
        FMLA v27.4s, v17.4s,  v9.s[2]
        FMLA v29.4s, v17.4s, v10.s[2]
        FMLA v31.4s, v17.4s, v11.s[2]

        FMLA v20.4s, v18.4s,  v6.s[3]
        FMLA v22.4s, v18.4s,  v7.s[3]
        FMLA v24.4s, v18.4s,  v8.s[3]
        FMLA v26.4s, v18.4s,  v9.s[3]
        FMLA v28.4s, v18.4s, v10.s[3]
        FMLA v30.4s, v18.4s, v11.s[3]
        FMLA v21.4s, v19.4s,  v6.s[3]
        FMLA v23.4s, v19.4s,  v7.s[3]

        # Load min/max values
        # v6 and v7 held A until the two FMLA above. No later FMLA in this
        # block reads them, so they can take the clamp bounds here.
        LD2R {v6.4s, v7.4s}, [x8]

        FMLA v25.4s, v19.4s,  v8.s[3]
        FMLA v27.4s, v19.4s,  v9.s[3]
        # Is there a remainder?- 4 floats of A (16 bytes) or less
        TST x0, 31
        FMLA v29.4s, v19.4s, v10.s[3]
        FMLA v31.4s, v19.4s, v11.s[3]
        B.NE 4f

        # Clamp
        # FMAX against v6 applies the lower bound, FMIN against v7 the upper.
3:
        FMAX v20.4s, v20.4s, v6.4s
        # Load cn_stride
        # It was at [sp] on entry. The 64 byte save area moved it to [sp, 64].
        LDR x0, [sp, 64]
        FMAX v21.4s, v21.4s, v6.4s
        FMAX v22.4s, v22.4s, v6.4s
        FMAX v23.4s, v23.4s, v6.4s
        FMAX v24.4s, v24.4s, v6.4s
        FMAX v25.4s, v25.4s, v6.4s
        FMAX v26.4s, v26.4s, v6.4s
        FMAX v27.4s, v27.4s, v6.4s
        FMAX v28.4s, v28.4s, v6.4s
        FMAX v29.4s, v29.4s, v6.4s
        FMAX v30.4s, v30.4s, v6.4s
        FMAX v31.4s, v31.4s, v6.4s
        SUBS x1, x1, 8
        FMIN v20.4s, v20.4s, v7.4s
        FMIN v21.4s, v21.4s, v7.4s
        FMIN v22.4s, v22.4s, v7.4s
        FMIN v23.4s, v23.4s, v7.4s
        FMIN v24.4s, v24.4s, v7.4s
        FMIN v25.4s, v25.4s, v7.4s
        FMIN v26.4s, v26.4s, v7.4s
        FMIN v27.4s, v27.4s, v7.4s
        FMIN v28.4s, v28.4s, v7.4s
        FMIN v29.4s, v29.4s, v7.4s
        FMIN v30.4s, v30.4s, v7.4s
        FMIN v31.4s, v31.4s, v7.4s

        # Fewer than 8 columns left (nc - 8 borrowed)? Store a partial tile.
        B.LO 7f

        # Store full 6 x 8
        # C pointers advance by cn_stride and A pointers rewind by kc, ready
        # for the next pass.
        STP q30, q31,  [x7]
        ADD x7, x7, x0
        SUB  x3,  x3, x2 // a0 -= kc
        STP q28, q29, [x13]
        ADD x13, x13, x0
        SUB  x9,  x9, x2 // a1 -= kc
        STP q26, q27, [x14]
        ADD x14, x14, x0
        SUB x10, x10, x2 // a2 -= kc
        STP q24, q25, [x17]
        ADD x17, x17, x0
        SUB x11, x11, x2 // a3 -= kc
        STP q22, q23, [x16]
        ADD x16, x16, x0
        SUB x12, x12, x2 // a4 -= kc
        STP q20, q21,  [x6]
        ADD  x6,  x6, x0
        SUB  x4,  x4, x2 // a5 -= kc

        # Flags still come from SUBS x1, x1, 8 above: nothing in between
        # sets them. HI means columns remain, so run another pass.
        B.HI 0b

        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP  d8,  d9, [sp], 64
        RET

        # Remainder of kc below 8 floats. Reached when kc < 32 bytes, or from
        # the epilogue when the low 5 bits of x0 are not zero. x0 differs from
        # the bytes left by a multiple of 32, so bits 4, 3 and 2 of x0 select
        # the 16, 8 and 4 byte steps below.
4:
        # Load min/max values
        LD2R {v6.4s, v7.4s}, [x8]

        # Is there a remainder?- 4 floats of A (16 bytes)
        TBZ x0, 4, 5f

        # Remainder- 4 floats of A (16 bytes)
        # Load A
        LDR   q0,  [x3], 16
        LDR   q1,  [x9], 16
        LDR   q2, [x10], 16
        LDR   q3, [x11], 16
        LDR   q4, [x12], 16
        LDR   q5,  [x4], 16
        # Load B
        LDP  q12,  q13, [x5], 32
        LDP  q14,  q15, [x5], 32
        LDP  q16,  q17, [x5], 32
        LDP  q18,  q19, [x5], 32

        FMLA v20.4s, v12.4s,  v0.s[0]
        FMLA v22.4s, v12.4s,  v1.s[0]
        FMLA v24.4s, v12.4s,  v2.s[0]
        FMLA v26.4s, v12.4s,  v3.s[0]
        FMLA v28.4s, v12.4s,  v4.s[0]
        FMLA v30.4s, v12.4s,  v5.s[0]
        FMLA v21.4s, v13.4s,  v0.s[0]
        FMLA v23.4s, v13.4s,  v1.s[0]
        FMLA v25.4s, v13.4s,  v2.s[0]
        FMLA v27.4s, v13.4s,  v3.s[0]
        FMLA v29.4s, v13.4s,  v4.s[0]
        FMLA v31.4s, v13.4s,  v5.s[0]

        FMLA v20.4s, v14.4s,  v0.s[1]
        FMLA v22.4s, v14.4s,  v1.s[1]
        FMLA v24.4s, v14.4s,  v2.s[1]
        FMLA v26.4s, v14.4s,  v3.s[1]
        FMLA v28.4s, v14.4s,  v4.s[1]
        FMLA v30.4s, v14.4s,  v5.s[1]
        FMLA v21.4s, v15.4s,  v0.s[1]
        FMLA v23.4s, v15.4s,  v1.s[1]
        FMLA v25.4s, v15.4s,  v2.s[1]
        FMLA v27.4s, v15.4s,  v3.s[1]
        FMLA v29.4s, v15.4s,  v4.s[1]
        FMLA v31.4s, v15.4s,  v5.s[1]

        FMLA v20.4s, v16.4s,  v0.s[2]
        FMLA v22.4s, v16.4s,  v1.s[2]
        FMLA v24.4s, v16.4s,  v2.s[2]
        FMLA v26.4s, v16.4s,  v3.s[2]
        FMLA v28.4s, v16.4s,  v4.s[2]
        FMLA v30.4s, v16.4s,  v5.s[2]
        FMLA v21.4s, v17.4s,  v0.s[2]
        FMLA v23.4s, v17.4s,  v1.s[2]
        FMLA v25.4s, v17.4s,  v2.s[2]
        FMLA v27.4s, v17.4s,  v3.s[2]
        FMLA v29.4s, v17.4s,  v4.s[2]
        FMLA v31.4s, v17.4s,  v5.s[2]

        FMLA v20.4s, v18.4s,  v0.s[3]
        FMLA v22.4s, v18.4s,  v1.s[3]
        FMLA v24.4s, v18.4s,  v2.s[3]
        FMLA v26.4s, v18.4s,  v3.s[3]
        FMLA v28.4s, v18.4s,  v4.s[3]
        FMLA v30.4s, v18.4s,  v5.s[3]
        FMLA v21.4s, v19.4s,  v0.s[3]
        FMLA v23.4s, v19.4s,  v1.s[3]
        FMLA v25.4s, v19.4s,  v2.s[3]
        FMLA v27.4s, v19.4s,  v3.s[3]
        FMLA v29.4s, v19.4s,  v4.s[3]
        FMLA v31.4s, v19.4s,  v5.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes)
5:
        TBZ x0, 3, 6f

        # Remainder- 2 floats of A (8 bytes)
        # Load A
        LDR   d0,  [x3], 8
        LDR   d1,  [x9], 8
        LDR   d2, [x10], 8
        LDR   d3, [x11], 8
        LDR   d4, [x12], 8
        LDR   d5,  [x4], 8
        # Load B
        LDP  q12,  q13, [x5], 32
        LDP  q14,  q15, [x5], 32

        FMLA v20.4s, v12.4s,  v0.s[0]
        FMLA v22.4s, v12.4s,  v1.s[0]
        FMLA v24.4s, v12.4s,  v2.s[0]
        FMLA v26.4s, v12.4s,  v3.s[0]
        FMLA v28.4s, v12.4s,  v4.s[0]
        FMLA v30.4s, v12.4s,  v5.s[0]
        FMLA v21.4s, v13.4s,  v0.s[0]
        FMLA v23.4s, v13.4s,  v1.s[0]
        FMLA v25.4s, v13.4s,  v2.s[0]
        FMLA v27.4s, v13.4s,  v3.s[0]
        FMLA v29.4s, v13.4s,  v4.s[0]
        FMLA v31.4s, v13.4s,  v5.s[0]

        FMLA v20.4s, v14.4s,  v0.s[1]
        FMLA v22.4s, v14.4s,  v1.s[1]
        FMLA v24.4s, v14.4s,  v2.s[1]
        FMLA v26.4s, v14.4s,  v3.s[1]
        FMLA v28.4s, v14.4s,  v4.s[1]
        FMLA v30.4s, v14.4s,  v5.s[1]
        FMLA v21.4s, v15.4s,  v0.s[1]
        FMLA v23.4s, v15.4s,  v1.s[1]
        FMLA v25.4s, v15.4s,  v2.s[1]
        FMLA v27.4s, v15.4s,  v3.s[1]
        FMLA v29.4s, v15.4s,  v4.s[1]
        FMLA v31.4s, v15.4s,  v5.s[1]

        # Is there a remainder?- 1 float of A (4 bytes)
6:
        TBZ x0, 2, 3b

        # Remainder- 1 float of A (4 bytes)
        # Load A
        LDR   s0,  [x3], 4
        LDR   s1,  [x9], 4
        LDR   s2, [x10], 4
        LDR   s3, [x11], 4
        LDR   s4, [x12], 4
        LDR   s5,  [x4], 4
        # Load B
        LDP  q12,  q13, [x5], 32

        FMLA v20.4s, v12.4s,  v0.s[0]
        FMLA v22.4s, v12.4s,  v1.s[0]
        FMLA v24.4s, v12.4s,  v2.s[0]
        FMLA v26.4s, v12.4s,  v3.s[0]
        FMLA v28.4s, v12.4s,  v4.s[0]
        FMLA v30.4s, v12.4s,  v5.s[0]
        FMLA v21.4s, v13.4s,  v0.s[0]
        FMLA v23.4s, v13.4s,  v1.s[0]
        FMLA v25.4s, v13.4s,  v2.s[0]
        FMLA v27.4s, v13.4s,  v3.s[0]
        FMLA v29.4s, v13.4s,  v4.s[0]
        FMLA v31.4s, v13.4s,  v5.s[0]
        B 3b

        # Store odd width
        # x1 is nc - 8 here, so its low 3 bits equal those of the column
        # count left. Bits 2, 1 and 0 select stores of 4, 2 and 1 floats per
        # row. After each store the unstored part moves to the low lanes.
7:
        TBZ x1, 2, 8f
        STR q30,  [x7], 16
        MOV v30.16b, v31.16b
        STR q28, [x13], 16
        MOV v28.16b, v29.16b
        STR q26, [x14], 16
        MOV v26.16b, v27.16b
        STR q24, [x17], 16
        MOV v24.16b, v25.16b
        STR q22, [x16], 16
        MOV v22.16b, v23.16b
        STR q20,  [x6], 16
        MOV v20.16b, v21.16b
8:
        TBZ x1, 1, 9f
        STR d30,  [x7], 8
        DUP d30, v30.d[1]
        STR d28, [x13], 8
        DUP d28, v28.d[1]
        STR d26, [x14], 8
        DUP d26, v26.d[1]
        STR d24, [x17], 8
        DUP d24, v24.d[1]
        STR d22, [x16], 8
        DUP d22, v22.d[1]
        STR d20,  [x6], 8
        DUP d20, v20.d[1]

9:
        TBZ x1, 0, 10f
        STR s30,  [x7]
        STR s28, [x13]
        STR s26, [x14]
        STR s24, [x17]
        STR s22, [x16]
        STR s20,  [x6]
10:
        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP  d8,  d9, [sp], 64
        RET

END_FUNCTION xnn_f32_gemminc_minmax_ukernel_6x8__aarch64_neonfma_cortex_a57

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif