// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/6x8-aarch64-neonfma-cortex-a75.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemminc_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75(
#     size_t mr,                         x0
#     size_t nc,                         x1
#     size_t kc,                         x2 / x0
#     const uint8_t* a,                  x3
#     size_t a_stride,                   x4
#     const void* w,                     x5
#     uint8_t* c,                        x6
#     size_t cm_stride,                  x7
#     size_t cn_stride,                  [sp] -> (x0)
#     const float* acc,                  [sp + 8] -> x15
#     const xnn_f32_minmax_params params [sp + 16] -> x8

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# A pointers
#  x3 a0
#  x9 a1
# x10 a2
# x11 a3
# x12 a4
#  x4 a5

# C pointers
#  x6 c0
# x16 c1
# x17 c2
# x14 c3
# x13 c4
#  x7 c5

# Vector register usage
# A0   v0  v6
# A1   v1  v7
# A2   v2  v8
# A3   v3  v9
# A4   v4 v10
# A5   v5 v11
# B   v12 v13 v14 v15
# B   v16 v17 v18 v19
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# C   v30 v31
# Clamp v6 v7

# Micro-kernel entry point.  As implemented below, every pass over a block
# of eight output columns:
#   - loads a 6x8 tile of initial accumulators from acc (x15),
#   - adds A * W to it over kc bytes of K using FMLA (by element),
#   - clamps the tile with FMAX against v6 and FMIN against v7, the two
#     values replicated from params (x8) by LD2R,
#   - stores the tile to C, either as full rows of 8 floats or, for the
#     final partial block, as 4 + 2 + 1 floats selected by the bits of nc.
# Rows at index >= mr reuse the A and C pointers of the previous row, so
# the kernel always computes six rows without branching on mr.
# The function makes no calls; d8-d15 are saved because v8-v15 are used.
BEGIN_FUNCTION xnn_f32_gemminc_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75

        # Clamp A and C pointers / Save d8-d15 on stack
        # The flags of each CMP are consumed twice: LO for the odd row and
        # LS for the following even row (ADD/STP do not modify flags).
        CMP     x0, 2                   // if mr < 2
        STP     d8,  d9, [sp, -64]!     // allocate 64 bytes of stack
        ADD     x9, x3, x4              // a1 = a0 + a_stride
        ADD     x16, x6, x7             // c1 = c0 + cm_stride
        CSEL    x9, x3, x9, LO          //   a1 = a0
        CSEL    x16, x6, x16, LO        //   c1 = c0

        STP     d10, d11, [sp, 16]
        ADD     x10, x9, x4             // a2 = a1 + a_stride
        ADD     x17, x16, x7            // c2 = c1 + cm_stride
                                        // if mr <= 2
        CSEL    x10, x9, x10, LS        //   a2 = a1
        CSEL    x17, x16, x17, LS       //   c2 = c1

        STP     d12, d13, [sp, 32]
        CMP     x0, 4                   // if mr < 4
        ADD     x11, x10, x4            // a3 = a2 + a_stride
        ADD     x14, x17, x7            // c3 = c2 + cm_stride
        CSEL    x11, x10, x11, LO       //   a3 = a2
        CSEL    x14, x17, x14, LO       //   c3 = c2

        STP     d14, d15, [sp, 48]
        ADD     x12, x11, x4            // a4 = a3 + a_stride
        ADD     x13, x14, x7            // c4 = c3 + cm_stride
                                        // if mr <= 4
        CSEL    x12, x11, x12, LS       //   a4 = a3
        CSEL    x13, x14, x13, LS       //   c4 = c3

        # Load acc, params pointer
        # Stack arguments sit 64 bytes higher after the save area was
        # allocated: cn_stride at sp+64, acc at sp+72, params at sp+80.
        LDP     x15, x8, [sp, 72]

        CMP     x0, 6                   // if mr < 6
        ADD     x4, x12, x4             // a5 = a4 + a_stride
        ADD     x7, x13, x7             // c5 = c4 + cm_stride
        CSEL    x4, x12, x4, LO         //   a5 = a4
        CSEL    x7, x13, x7, LO         //   c5 = c4

        # Outer loop: one iteration per block of 8 output columns.
        # From here on x0 no longer holds mr; it is reused as the K counter.
0:
        # Load initial accumulators
        LDP     q20, q21, [x15], 32
        LDP     q22, q23, [x15], 32
        LDP     q24, q25, [x15], 32
        LDP     q26, q27, [x15], 32
        LDP     q28, q29, [x15], 32
        LDP     q30, q31, [x15], 32
        SUBS    x0, x2, 32              // k = kc - 32
        B.LO    4f                      // kc < 8 floats: remainder only

        # Prologue - loads for main loop of 96 FMA
        LDR     q0,  [x3], 16
        LDP     q12,  q13, [x5], 32     // Fetch 3 B (4th deferred)
        LDR     q1,  [x9], 16
        LDR     q2, [x10], 16
        LDR     q3, [x11], 16
        LDR     q4, [x12], 16
        LDR     q5,  [x4], 16
        LDP     q14,  q15, [x5], 32
        LDP     q16,  q17, [x5], 32

        # Is there at least 8 floats (32 bytes) for main loop?
        SUBS    x0, x0, 32
        B.LO    2f

        # Main loop - 8 floats of A (32 bytes)
        # Per iteration: 96 FMA + 12 LDR A + 8 LDP B
        # Weights read: 64 floats = 256 bytes.  4 cache lines.
1:
        # First group of 4 A.  48 FMA.
        FMLA    v20.4s, v12.4s,  v0.s[0]
        LDP     q18,  q19, [x5], 32      // Load last B
        FMLA    v22.4s, v12.4s,  v1.s[0]
        FMLA    v24.4s, v12.4s,  v2.s[0]
        FMLA    v26.4s, v12.4s,  v3.s[0]
        FMLA    v28.4s, v12.4s,  v4.s[0]
        FMLA    v30.4s, v12.4s,  v5.s[0]
        FMLA    v21.4s, v13.4s,  v0.s[0]
        FMLA    v23.4s, v13.4s,  v1.s[0]
        FMLA    v25.4s, v13.4s,  v2.s[0]
        FMLA    v27.4s, v13.4s,  v3.s[0]
        FMLA    v29.4s, v13.4s,  v4.s[0]
        FMLA    v31.4s, v13.4s,  v5.s[0]
        FMLA    v20.4s, v14.4s,  v0.s[1]
        FMLA    v22.4s, v14.4s,  v1.s[1]
        FMLA    v24.4s, v14.4s,  v2.s[1]
        FMLA    v26.4s, v14.4s,  v3.s[1]
        FMLA    v28.4s, v14.4s,  v4.s[1]
        FMLA    v30.4s, v14.4s,  v5.s[1]
        FMLA    v21.4s, v15.4s,  v0.s[1]
        FMLA    v23.4s, v15.4s,  v1.s[1]
        FMLA    v25.4s, v15.4s,  v2.s[1]
        LDR     q6,  [x3], 16            // Load next 6 A
        FMLA    v27.4s, v15.4s,  v3.s[1]
        FMLA    v29.4s, v15.4s,  v4.s[1]
        FMLA    v31.4s, v15.4s,  v5.s[1]
        LDR     q7,  [x9], 16

        FMLA    v20.4s, v16.4s,  v0.s[2]
        FMLA    v22.4s, v16.4s,  v1.s[2]
        FMLA    v24.4s, v16.4s,  v2.s[2]
        LDR     q8, [x10], 16
        FMLA    v26.4s, v16.4s,  v3.s[2]
        FMLA    v28.4s, v16.4s,  v4.s[2]
        FMLA    v30.4s, v16.4s,  v5.s[2]
        LDR     q9, [x11], 16
        FMLA    v21.4s, v17.4s,  v0.s[2]
        FMLA    v23.4s, v17.4s,  v1.s[2]
        FMLA    v25.4s, v17.4s,  v2.s[2]
        LDR     q10, [x12], 16
        FMLA    v27.4s, v17.4s,  v3.s[2]
        FMLA    v29.4s, v17.4s,  v4.s[2]
        FMLA    v31.4s, v17.4s,  v5.s[2]
        LDR     q11,  [x4], 16

        FMLA    v20.4s, v18.4s,  v0.s[3]
        FMLA    v22.4s, v18.4s,  v1.s[3]
        FMLA    v24.4s, v18.4s,  v2.s[3]
        LDP     q12,  q13, [x5], 32       // Load 4 B
        FMLA    v26.4s, v18.4s,  v3.s[3]
        FMLA    v28.4s, v18.4s,  v4.s[3]
        FMLA    v30.4s, v18.4s,  v5.s[3]
        LDP     q14,  q15, [x5], 32
        FMLA    v21.4s, v19.4s,  v0.s[3]
        FMLA    v23.4s, v19.4s,  v1.s[3]
        FMLA    v25.4s, v19.4s,  v2.s[3]
        LDP     q16,  q17, [x5], 32
        FMLA    v27.4s, v19.4s,  v3.s[3]
        FMLA    v29.4s, v19.4s,  v4.s[3]
        FMLA    v31.4s, v19.4s,  v5.s[3]
        LDP     q18,  q19, [x5], 32

        # Second group of 4 A.  48 FMA.
        FMLA    v20.4s, v12.4s,  v6.s[0]
        FMLA    v22.4s, v12.4s,  v7.s[0]
        FMLA    v24.4s, v12.4s,  v8.s[0]
        LDR     q0,  [x3], 16           // Load next 6 A
        FMLA    v26.4s, v12.4s,  v9.s[0]
        FMLA    v28.4s, v12.4s, v10.s[0]
        FMLA    v30.4s, v12.4s, v11.s[0]
        LDR     q1,  [x9], 16
        FMLA    v21.4s, v13.4s,  v6.s[0]
        FMLA    v23.4s, v13.4s,  v7.s[0]
        FMLA    v25.4s, v13.4s,  v8.s[0]
        LDR     q2, [x10], 16
        FMLA    v27.4s, v13.4s,  v9.s[0]
        FMLA    v29.4s, v13.4s, v10.s[0]
        FMLA    v31.4s, v13.4s, v11.s[0]
        LDR     q3, [x11], 16

        FMLA    v20.4s, v14.4s,  v6.s[1]
        FMLA    v22.4s, v14.4s,  v7.s[1]
        FMLA    v24.4s, v14.4s,  v8.s[1]
        LDR     q4, [x12], 16
        FMLA    v26.4s, v14.4s,  v9.s[1]
        FMLA    v28.4s, v14.4s, v10.s[1]
        FMLA    v30.4s, v14.4s, v11.s[1]
        LDR     q5,  [x4], 16
        FMLA    v21.4s, v15.4s,  v6.s[1]
        FMLA    v23.4s, v15.4s,  v7.s[1]
        FMLA    v25.4s, v15.4s,  v8.s[1]
        LDP     q12,  q13, [x5], 32       // Load next 3 B (not last)
        FMLA    v27.4s, v15.4s,  v9.s[1]
        FMLA    v29.4s, v15.4s, v10.s[1]
        FMLA    v31.4s, v15.4s, v11.s[1]
        LDP     q14,  q15, [x5], 32

        FMLA    v20.4s, v16.4s,  v6.s[2]
        FMLA    v22.4s, v16.4s,  v7.s[2]
        FMLA    v24.4s, v16.4s,  v8.s[2]
        FMLA    v26.4s, v16.4s,  v9.s[2]
        FMLA    v28.4s, v16.4s, v10.s[2]
        FMLA    v30.4s, v16.4s, v11.s[2]
        FMLA    v21.4s, v17.4s,  v6.s[2]
        FMLA    v23.4s, v17.4s,  v7.s[2]
        FMLA    v25.4s, v17.4s,  v8.s[2]
        FMLA    v27.4s, v17.4s,  v9.s[2]
        FMLA    v29.4s, v17.4s, v10.s[2]
        FMLA    v31.4s, v17.4s, v11.s[2]

        FMLA    v20.4s, v18.4s,  v6.s[3]
        FMLA    v22.4s, v18.4s,  v7.s[3]
        LDP     q16,  q17, [x5], 32
        FMLA    v24.4s, v18.4s,  v8.s[3]
        FMLA    v26.4s, v18.4s,  v9.s[3]
        FMLA    v28.4s, v18.4s, v10.s[3]
        FMLA    v30.4s, v18.4s, v11.s[3]
        SUBS    x0, x0, 32
        FMLA    v21.4s, v19.4s,  v6.s[3]
        FMLA    v23.4s, v19.4s,  v7.s[3]
        FMLA    v25.4s, v19.4s,  v8.s[3]
        FMLA    v27.4s, v19.4s,  v9.s[3]
        FMLA    v29.4s, v19.4s, v10.s[3]
        FMLA    v31.4s, v19.4s, v11.s[3]
        B.HS    1b                      // FMLA leaves the SUBS flags intact

        # Epilogue - 8 floats of A (32 bytes)
        # Totals: 96 FMA + 6 LDR A + 5 LDP B
        # First block same as main loop.  Second block has no preloads.
2:
        # First group of 4 A.  48 FMA.
        FMLA    v20.4s, v12.4s,  v0.s[0]
        LDP     q18,  q19, [x5], 32      // Load last B
        FMLA    v22.4s, v12.4s,  v1.s[0]
        FMLA    v24.4s, v12.4s,  v2.s[0]
        FMLA    v26.4s, v12.4s,  v3.s[0]
        FMLA    v28.4s, v12.4s,  v4.s[0]
        FMLA    v30.4s, v12.4s,  v5.s[0]
        FMLA    v21.4s, v13.4s,  v0.s[0]
        FMLA    v23.4s, v13.4s,  v1.s[0]
        FMLA    v25.4s, v13.4s,  v2.s[0]
        FMLA    v27.4s, v13.4s,  v3.s[0]
        FMLA    v29.4s, v13.4s,  v4.s[0]
        FMLA    v31.4s, v13.4s,  v5.s[0]
        FMLA    v20.4s, v14.4s,  v0.s[1]
        FMLA    v22.4s, v14.4s,  v1.s[1]
        FMLA    v24.4s, v14.4s,  v2.s[1]
        FMLA    v26.4s, v14.4s,  v3.s[1]
        FMLA    v28.4s, v14.4s,  v4.s[1]
        FMLA    v30.4s, v14.4s,  v5.s[1]
        FMLA    v21.4s, v15.4s,  v0.s[1]
        FMLA    v23.4s, v15.4s,  v1.s[1]
        FMLA    v25.4s, v15.4s,  v2.s[1]
        LDR     q6,  [x3], 16            // Load next 6 A
        FMLA    v27.4s, v15.4s,  v3.s[1]
        FMLA    v29.4s, v15.4s,  v4.s[1]
        FMLA    v31.4s, v15.4s,  v5.s[1]
        LDR     q7,  [x9], 16

        FMLA    v20.4s, v16.4s,  v0.s[2]
        FMLA    v22.4s, v16.4s,  v1.s[2]
        FMLA    v24.4s, v16.4s,  v2.s[2]
        LDR     q8, [x10], 16
        FMLA    v26.4s, v16.4s,  v3.s[2]
        FMLA    v28.4s, v16.4s,  v4.s[2]
        FMLA    v30.4s, v16.4s,  v5.s[2]
        LDR     q9, [x11], 16
        FMLA    v21.4s, v17.4s,  v0.s[2]
        FMLA    v23.4s, v17.4s,  v1.s[2]
        FMLA    v25.4s, v17.4s,  v2.s[2]
        LDR     q10, [x12], 16
        FMLA    v27.4s, v17.4s,  v3.s[2]
        FMLA    v29.4s, v17.4s,  v4.s[2]
        FMLA    v31.4s, v17.4s,  v5.s[2]
        LDR     q11,  [x4], 16

        FMLA    v20.4s, v18.4s,  v0.s[3]
        FMLA    v22.4s, v18.4s,  v1.s[3]
        FMLA    v24.4s, v18.4s,  v2.s[3]
        LDP     q12,  q13, [x5], 32       // Load 4 B
        FMLA    v26.4s, v18.4s,  v3.s[3]
        FMLA    v28.4s, v18.4s,  v4.s[3]
        FMLA    v30.4s, v18.4s,  v5.s[3]
        LDP     q14,  q15, [x5], 32
        FMLA    v21.4s, v19.4s,  v0.s[3]
        FMLA    v23.4s, v19.4s,  v1.s[3]
        FMLA    v25.4s, v19.4s,  v2.s[3]
        LDP     q16,  q17, [x5], 32
        FMLA    v27.4s, v19.4s,  v3.s[3]
        FMLA    v29.4s, v19.4s,  v4.s[3]
        FMLA    v31.4s, v19.4s,  v5.s[3]
        LDP     q18,  q19, [x5], 32

        # Second group of 4 A.  48 FMA.
        FMLA    v20.4s, v12.4s,  v6.s[0]
        FMLA    v22.4s, v12.4s,  v7.s[0]
        FMLA    v24.4s, v12.4s,  v8.s[0]
        FMLA    v26.4s, v12.4s,  v9.s[0]
        FMLA    v28.4s, v12.4s, v10.s[0]
        FMLA    v30.4s, v12.4s, v11.s[0]
        FMLA    v21.4s, v13.4s,  v6.s[0]
        FMLA    v23.4s, v13.4s,  v7.s[0]
        FMLA    v25.4s, v13.4s,  v8.s[0]
        FMLA    v27.4s, v13.4s,  v9.s[0]
        FMLA    v29.4s, v13.4s, v10.s[0]
        FMLA    v31.4s, v13.4s, v11.s[0]

        FMLA    v20.4s, v14.4s,  v6.s[1]
        FMLA    v22.4s, v14.4s,  v7.s[1]
        FMLA    v24.4s, v14.4s,  v8.s[1]
        FMLA    v26.4s, v14.4s,  v9.s[1]
        FMLA    v28.4s, v14.4s, v10.s[1]
        FMLA    v30.4s, v14.4s, v11.s[1]
        FMLA    v21.4s, v15.4s,  v6.s[1]
        FMLA    v23.4s, v15.4s,  v7.s[1]
        FMLA    v25.4s, v15.4s,  v8.s[1]
        FMLA    v27.4s, v15.4s,  v9.s[1]
        FMLA    v29.4s, v15.4s, v10.s[1]
        FMLA    v31.4s, v15.4s, v11.s[1]

        FMLA    v20.4s, v16.4s,  v6.s[2]
        FMLA    v22.4s, v16.4s,  v7.s[2]
        FMLA    v24.4s, v16.4s,  v8.s[2]
        FMLA    v26.4s, v16.4s,  v9.s[2]
        FMLA    v28.4s, v16.4s, v10.s[2]
        FMLA    v30.4s, v16.4s, v11.s[2]
        FMLA    v21.4s, v17.4s,  v6.s[2]
        FMLA    v23.4s, v17.4s,  v7.s[2]
        FMLA    v25.4s, v17.4s,  v8.s[2]
        FMLA    v27.4s, v17.4s,  v9.s[2]
        FMLA    v29.4s, v17.4s, v10.s[2]
        FMLA    v31.4s, v17.4s, v11.s[2]

        FMLA    v20.4s, v18.4s,  v6.s[3]
        FMLA    v22.4s, v18.4s,  v7.s[3]
        FMLA    v24.4s, v18.4s,  v8.s[3]
        FMLA    v26.4s, v18.4s,  v9.s[3]
        FMLA    v28.4s, v18.4s, v10.s[3]
        FMLA    v30.4s, v18.4s, v11.s[3]

        # Is there a remainder?- 4 floats of A (16 bytes) or less
        TST     x0, 31

        FMLA    v21.4s, v19.4s,  v6.s[3]
        FMLA    v23.4s, v19.4s,  v7.s[3]
        FMLA    v25.4s, v19.4s,  v8.s[3]
        # v6/v7 are not read as A operands after this point, so they can
        # be overwritten with the clamp values here.
        LD2R    {v6.4s, v7.4s}, [x8]      // Load min/max values
        FMLA    v27.4s, v19.4s,  v9.s[3]
        FMLA    v29.4s, v19.4s, v10.s[3]
        FMLA    v31.4s, v19.4s, v11.s[3]
        B.NE    4f                      // uses the flags set by TST above

        # Clamp
3:
        FMAX    v20.4s, v20.4s, v6.4s
        FMAX    v21.4s, v21.4s, v6.4s
        FMAX    v22.4s, v22.4s, v6.4s
        FMAX    v23.4s, v23.4s, v6.4s
        FMAX    v24.4s, v24.4s, v6.4s
        LDR     x0, [sp, 64]            // Load cn_stride
        FMAX    v25.4s, v25.4s, v6.4s
        FMAX    v26.4s, v26.4s, v6.4s
        FMAX    v27.4s, v27.4s, v6.4s
        FMAX    v28.4s, v28.4s, v6.4s
        FMAX    v29.4s, v29.4s, v6.4s
        FMAX    v30.4s, v30.4s, v6.4s
        FMAX    v31.4s, v31.4s, v6.4s
        SUBS    x1, x1, 8               // nc -= 8; flags feed B.LO and B.HI below
        FMIN    v20.4s, v20.4s, v7.4s
        FMIN    v21.4s, v21.4s, v7.4s
        FMIN    v22.4s, v22.4s, v7.4s
        FMIN    v23.4s, v23.4s, v7.4s
        FMIN    v24.4s, v24.4s, v7.4s
        FMIN    v25.4s, v25.4s, v7.4s
        FMIN    v26.4s, v26.4s, v7.4s
        FMIN    v27.4s, v27.4s, v7.4s
        FMIN    v28.4s, v28.4s, v7.4s
        FMIN    v29.4s, v29.4s, v7.4s
        FMIN    v30.4s, v30.4s, v7.4s
        FMIN    v31.4s, v31.4s, v7.4s

        # Store full 6 x 8
        # Fewer than 8 columns left: take the partial-store path instead.
        B.LO    7f

        # Rows are written from c5 down to c0.  When C pointers alias
        # (mr < 6), the lowest-numbered row sharing a pointer is stored last.
        STP     q30, q31,  [x7]
        ADD     x7, x7, x0
        SUB     x3,  x3, x2             // a0 -= kc
        STP     q28, q29, [x13]
        ADD     x13, x13, x0
        SUB     x9,  x9, x2             // a1 -= kc
        STP     q26, q27, [x14]
        ADD     x14, x14, x0
        SUB     x10, x10, x2            // a2 -= kc
        STP     q24, q25, [x17]
        ADD     x17, x17, x0
        SUB     x11, x11, x2            // a3 -= kc
        STP     q22, q23, [x16]
        ADD     x16, x16, x0
        SUB     x12, x12, x2            // a4 -= kc
        STP     q20, q21,  [x6]
        ADD     x6,  x6, x0
        SUB     x4,  x4, x2             // a5 -= kc

        B.HI    0b                      // more columns remain after this block

        # Restore d8-d15 from stack
        LDP     d14, d15, [sp, 48]
        LDP     d12, d13, [sp, 32]
        LDP     d10, d11, [sp, 16]
        LDP     d8,  d9, [sp], 64
        RET

        # K remainder of fewer than 8 floats.  Reached both from the
        # epilogue and directly from the top when kc < 32 bytes; in both
        # cases bits 4, 3 and 2 of x0 select the 4-, 2- and 1-float steps.
4:
        # Load min/max values
        LD2R    {v6.4s, v7.4s}, [x8]

        # Is there a remainder?- 4 floats of A (16 bytes)
        TBZ     x0, 4, 5f

        # Remainder- 4 floats of A (16 bytes)
        # Load A
        LDR     q0,  [x3], 16
        LDR     q1,  [x9], 16
        LDR     q2, [x10], 16
        LDR     q3, [x11], 16
        LDR     q4, [x12], 16
        LDR     q5,  [x4], 16
        # Load B
        LDP     q12,  q13, [x5], 32
        LDP     q14,  q15, [x5], 32
        LDP     q16,  q17, [x5], 32
        LDP     q18,  q19, [x5], 32

        FMLA    v20.4s, v12.4s,  v0.s[0]
        FMLA    v22.4s, v12.4s,  v1.s[0]
        FMLA    v24.4s, v12.4s,  v2.s[0]
        FMLA    v26.4s, v12.4s,  v3.s[0]
        FMLA    v28.4s, v12.4s,  v4.s[0]
        FMLA    v30.4s, v12.4s,  v5.s[0]
        FMLA    v21.4s, v13.4s,  v0.s[0]
        FMLA    v23.4s, v13.4s,  v1.s[0]
        FMLA    v25.4s, v13.4s,  v2.s[0]
        FMLA    v27.4s, v13.4s,  v3.s[0]
        FMLA    v29.4s, v13.4s,  v4.s[0]
        FMLA    v31.4s, v13.4s,  v5.s[0]

        FMLA    v20.4s, v14.4s,  v0.s[1]
        FMLA    v22.4s, v14.4s,  v1.s[1]
        FMLA    v24.4s, v14.4s,  v2.s[1]
        FMLA    v26.4s, v14.4s,  v3.s[1]
        FMLA    v28.4s, v14.4s,  v4.s[1]
        FMLA    v30.4s, v14.4s,  v5.s[1]
        FMLA    v21.4s, v15.4s,  v0.s[1]
        FMLA    v23.4s, v15.4s,  v1.s[1]
        FMLA    v25.4s, v15.4s,  v2.s[1]
        FMLA    v27.4s, v15.4s,  v3.s[1]
        FMLA    v29.4s, v15.4s,  v4.s[1]
        FMLA    v31.4s, v15.4s,  v5.s[1]

        FMLA    v20.4s, v16.4s,  v0.s[2]
        FMLA    v22.4s, v16.4s,  v1.s[2]
        FMLA    v24.4s, v16.4s,  v2.s[2]
        FMLA    v26.4s, v16.4s,  v3.s[2]
        FMLA    v28.4s, v16.4s,  v4.s[2]
        FMLA    v30.4s, v16.4s,  v5.s[2]
        FMLA    v21.4s, v17.4s,  v0.s[2]
        FMLA    v23.4s, v17.4s,  v1.s[2]
        FMLA    v25.4s, v17.4s,  v2.s[2]
        FMLA    v27.4s, v17.4s,  v3.s[2]
        FMLA    v29.4s, v17.4s,  v4.s[2]
        FMLA    v31.4s, v17.4s,  v5.s[2]

        FMLA    v20.4s, v18.4s,  v0.s[3]
        FMLA    v22.4s, v18.4s,  v1.s[3]
        FMLA    v24.4s, v18.4s,  v2.s[3]
        FMLA    v26.4s, v18.4s,  v3.s[3]
        FMLA    v28.4s, v18.4s,  v4.s[3]
        FMLA    v30.4s, v18.4s,  v5.s[3]
        FMLA    v21.4s, v19.4s,  v0.s[3]
        FMLA    v23.4s, v19.4s,  v1.s[3]
        FMLA    v25.4s, v19.4s,  v2.s[3]
        FMLA    v27.4s, v19.4s,  v3.s[3]
        FMLA    v29.4s, v19.4s,  v4.s[3]
        FMLA    v31.4s, v19.4s,  v5.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes)
5:
        TBZ     x0, 3, 6f

        # Remainder- 2 floats of A (8 bytes)
        # Load A
        LDR     d0,  [x3], 8
        LDR     d1,  [x9], 8
        LDR     d2, [x10], 8
        LDR     d3, [x11], 8
        LDR     d4, [x12], 8
        LDR     d5,  [x4], 8
        # Load B
        LDP     q12,  q13, [x5], 32
        LDP     q14,  q15, [x5], 32

        FMLA    v20.4s, v12.4s,  v0.s[0]
        FMLA    v22.4s, v12.4s,  v1.s[0]
        FMLA    v24.4s, v12.4s,  v2.s[0]
        FMLA    v26.4s, v12.4s,  v3.s[0]
        FMLA    v28.4s, v12.4s,  v4.s[0]
        FMLA    v30.4s, v12.4s,  v5.s[0]
        FMLA    v21.4s, v13.4s,  v0.s[0]
        FMLA    v23.4s, v13.4s,  v1.s[0]
        FMLA    v25.4s, v13.4s,  v2.s[0]
        FMLA    v27.4s, v13.4s,  v3.s[0]
        FMLA    v29.4s, v13.4s,  v4.s[0]
        FMLA    v31.4s, v13.4s,  v5.s[0]

        FMLA    v20.4s, v14.4s,  v0.s[1]
        FMLA    v22.4s, v14.4s,  v1.s[1]
        FMLA    v24.4s, v14.4s,  v2.s[1]
        FMLA    v26.4s, v14.4s,  v3.s[1]
        FMLA    v28.4s, v14.4s,  v4.s[1]
        FMLA    v30.4s, v14.4s,  v5.s[1]
        FMLA    v21.4s, v15.4s,  v0.s[1]
        FMLA    v23.4s, v15.4s,  v1.s[1]
        FMLA    v25.4s, v15.4s,  v2.s[1]
        FMLA    v27.4s, v15.4s,  v3.s[1]
        FMLA    v29.4s, v15.4s,  v4.s[1]
        FMLA    v31.4s, v15.4s,  v5.s[1]

        # Is there a remainder?- 1 float of A (4 bytes)
6:
        TBZ     x0, 2, 3b

        # Remainder- 1 float of A (4 bytes)
        # Load A
        LDR     s0,  [x3], 4
        LDR     s1,  [x9], 4
        LDR     s2, [x10], 4
        LDR     s3, [x11], 4
        LDR     s4, [x12], 4
        LDR     s5,  [x4], 4
        # Load B
        LDP     q12,  q13, [x5], 32

        FMLA    v20.4s, v12.4s,  v0.s[0]
        FMLA    v22.4s, v12.4s,  v1.s[0]
        FMLA    v24.4s, v12.4s,  v2.s[0]
        FMLA    v26.4s, v12.4s,  v3.s[0]
        FMLA    v28.4s, v12.4s,  v4.s[0]
        FMLA    v30.4s, v12.4s,  v5.s[0]
        FMLA    v21.4s, v13.4s,  v0.s[0]
        FMLA    v23.4s, v13.4s,  v1.s[0]
        FMLA    v25.4s, v13.4s,  v2.s[0]
        FMLA    v27.4s, v13.4s,  v3.s[0]
        FMLA    v29.4s, v13.4s,  v4.s[0]
        FMLA    v31.4s, v13.4s,  v5.s[0]
        B       3b

        # Store odd width
        # Bits 2, 1 and 0 of x1 select stores of 4, 2 and 1 floats per row.
        # After each store the not-yet-written lanes are moved down into
        # the low part of the even-numbered row register.
7:
        TBZ     x1, 2, 8f
        STR     q30,  [x7], 16
        MOV     v30.16b, v31.16b
        STR     q28, [x13], 16
        MOV     v28.16b, v29.16b
        STR     q26, [x14], 16
        MOV     v26.16b, v27.16b
        STR     q24, [x17], 16
        MOV     v24.16b, v25.16b
        STR     q22, [x16], 16
        MOV     v22.16b, v23.16b
        STR     q20,  [x6], 16
        MOV     v20.16b, v21.16b
8:
        TBZ     x1, 1, 9f
        STR     d30,  [x7], 8
        STR     d28, [x13], 8
        DUP     d30, v30.d[1]
        DUP     d28, v28.d[1]
        STR     d26, [x14], 8
        STR     d24, [x17], 8
        DUP     d26, v26.d[1]
        DUP     d24, v24.d[1]
        STR     d22, [x16], 8
        STR     d20,  [x6], 8
        DUP     d22, v22.d[1]
        DUP     d20, v20.d[1]

9:
        TBZ     x1, 0, 10f
        STR     s30,  [x7]
        STR     s28, [x13]
        STR     s26, [x14]
        STR     s24, [x17]
        STR     s22, [x16]
        STR     s20,  [x6]
10:
        # Restore d8-d15 from stack
        LDP     d14, d15, [sp, 48]
        LDP     d12, d13, [sp, 32]
        LDP     d10, d11, [sp, 16]
        LDP     d8,  d9, [sp], 64
        RET

END_FUNCTION xnn_f32_gemminc_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75

// On ELF targets, emit an empty .note.GNU-stack section so the linker does
// not have to assume this object needs an executable stack.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif
