// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a73(
#     size_t mr,                         x0
#     size_t nc,                         x1
#     size_t kc,                         x2 / x0
#     size_t ks,                         x3 / x9
#     const float**restrict a,           x4
#     const void*restrict w,             x5
#     uint8_t*restrict c,                x6
#     size_t cm_stride,                  x7
#     size_t cn_stride,                  [sp] -> x0 (reloaded before each full store)
#     size_t a_offset,                   [sp + 8] -> x11
#     const float* zero,                 [sp + 16] -> x12
#     const xnn_f32_output_params params [sp + 24] -> x8
#
# Produces up to 6 rows x 8 columns of float output per pass of the nc loop.
# For every group of 6 row pointers read from a (ks is consumed 48 bytes at a
# time) the kernel accumulates kc bytes of A against B read from w.  The
# accumulators start from 8 bias floats read from w, and the result is
# clamped with FMIN/FMAX before it is stored.
#
# Rows beyond mr alias the previous row pointer, so their stores land on the
# last valid row.

# d8-d15 need to be preserved if used.
# x19-x30 need to be preserved if used.
# This kernel saves d8-d15 and x20-x23 in a 96 byte frame, which moves the
# stack arguments to [sp + 96] .. [sp + 120].
# x18 is not used: it is a platform-reserved register on some AArch64 ABIs.

# A pointers
# x14 a0
# x15 a1
# x20 a2
# x21 a3
# x22 a4
# x23 a5

# C pointers
#  x6 c0
# x16 c1
# x17 c2
# x10 c3
# x13 c4
#  x7 c5

# Vector register usage
# A0   v0  v6
# A1   v1  v7
# A2   v2  v8
# A3   v3  v9
# A4   v4 v10
# A5   v5 v11
# B   v12 v13 v14 v15
# B   v16 v17 v18 v19
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# C   v30 v31
# Clamp v6 v7
#   v6 = params[0] broadcast, applied with FMIN (upper bound)
#   v7 = params[1] broadcast, applied with FMAX (lower bound)
#   v6/v7 double as A0/A1 registers, so the clamp values are reloaded at the
#   end of the epilogue.

BEGIN_FUNCTION xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a73

        # Clamp C pointers / Save d8-d15 on stack
        STP  d8,  d9, [sp, -96]!
        CMP x0, 2                // if mr < 2
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x16, x6, x16, LO    //   c1 = c0

        STP d10, d11, [sp, 16]
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2
        CSEL x17, x16, x17, LS   //   c2 = c1

        STP d12, d13, [sp, 32]
        CMP x0, 4                // if mr < 4
        ADD x10, x17, x7         // c3 = c2 + cm_stride
        CSEL x10, x17, x10, LO   //   c3 = c2

        STP d14, d15, [sp, 48]
        ADD x13, x10, x7         // c4 = c3 + cm_stride
                                 // if mr <= 4
        CSEL x13, x10, x13, LS   //   c4 = c3

        # Save x20,x21,x22,x23 on stack
        STP x20, x21, [sp, 64]
        STP x22, x23, [sp, 80]

        CMP x0, 6                // if mr < 6
        ADD x7, x13, x7          // c5 = c4 + cm_stride
        CSEL x7, x13, x7, LO     //   c5 = c4

        # Load zero, clamping params pointer
        LDP x12, x8, [sp, 112]

        # Load a_offset (cn_stride at [sp, 96] is loaded at store time)
        LDR x11, [sp, 104]

        # Load clamping_params values
        # Needed here for kc < 32, where the epilogue reload is never reached.
        LD2R {v6.4s, v7.4s}, [x8]

0:
        # Load initial bias from w into accumulators
        LD1 {v20.16b, v21.16b}, [x5], 32
        MOV v22.16b, v20.16b
        MOV v23.16b, v21.16b
        PRFM PLDL1KEEP, [x5, 0]  // Prefetch B
        MOV v24.16b, v20.16b
        MOV v25.16b, v21.16b
        PRFM PLDL1KEEP, [x5, 64]
        MOV v26.16b, v20.16b
        MOV v27.16b, v21.16b
        PRFM PLDL1KEEP, [x5, 128]
        MOV v28.16b, v20.16b
        MOV v29.16b, v21.16b
        PRFM PLDL1KEEP, [x5, 192]
        MOV v30.16b, v20.16b
        MOV v31.16b, v21.16b

        MOV x9, x3  // p = ks

1:
        # Load next 6 A pointers
        LDP x14, x15, [x4], 16
        LDP x20, x21, [x4], 16
        LDP x22, x23, [x4], 16

        # A pointer equal to zero is used as is; any other gets a_offset added.
        CMP x14, x12            // if a0 == zero
        ADD x14, x14, x11       // a0 += a_offset
        CSEL x14, x12, x14, EQ  //   a0 = zero, else += a0 + a_offset
        CMP x15, x12            // if a1 == zero
        ADD x15, x15, x11       // a1 += a_offset
        CSEL x15, x12, x15, EQ  //   a1 = zero, else += a1 + a_offset
        CMP x20, x12            // if a2 == zero
        ADD x20, x20, x11       // a2 += a_offset
        CSEL x20, x12, x20, EQ  //   a2 = zero, else += a2 + a_offset
        CMP x21, x12            // if a3 == zero
        ADD x21, x21, x11       // a3 += a_offset
        CSEL x21, x12, x21, EQ  //   a3 = zero, else += a3 + a_offset
        CMP x22, x12            // if a4 == zero
        ADD x22, x22, x11       // a4 += a_offset
        CSEL x22, x12, x22, EQ  //   a4 = zero, else += a4 + a_offset
        CMP x23, x12            // if a5 == zero
        ADD x23, x23, x11       // a5 += a_offset
        CSEL x23, x12, x23, EQ  //   a5 = zero, else += a5 + a_offset

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        SUBS x0, x2, 32  // k = kc - 32
        B.LO 5f

        # Prologue - loads for main loop of 96 FMA
        # load A0 to A4 but not A5
        LDP  q0,  q6, [x14], 32
        LDP  q1,  q7, [x15], 32
        LDP  q2,  q8, [x20], 32
        LDP  q3,  q9, [x21], 32
        LDP  q4,  q10, [x22], 32
        # load first set of B
        LDP  q12, q13, [x5], 32
        LDP  q14, q15, [x5], 32

        # Is there at least 8 floats (32 bytes) for main loop?
        SUBS x0, x0, 32
        B.LO 3f

        # Main loop - 8 floats of A (32 bytes)
        # 96 FMA + 6 LDP A + 8 LDP B
2:
        # First group of 4 A.  48 FMA.  Loads A5

        LDP  q5, q11, [x23], 32
        FMLA v20.4s, v12.4s,  v0.s[0]
        FMLA v22.4s, v12.4s,  v1.s[0]
        LDP  q16,  q17, [x5], 32
        FMLA v24.4s, v12.4s,  v2.s[0]
        FMLA v26.4s, v12.4s,  v3.s[0]
        LDP  q18,  q19, [x5], 32
        FMLA v28.4s, v12.4s,  v4.s[0]
        FMLA v30.4s, v12.4s,  v5.s[0]
        FMLA v21.4s, v13.4s,  v0.s[0]
        FMLA v23.4s, v13.4s,  v1.s[0]
        FMLA v25.4s, v13.4s,  v2.s[0]
        FMLA v27.4s, v13.4s,  v3.s[0]
        FMLA v29.4s, v13.4s,  v4.s[0]
        FMLA v31.4s, v13.4s,  v5.s[0]

        FMLA v20.4s, v14.4s,  v0.s[1]
        FMLA v22.4s, v14.4s,  v1.s[1]
        FMLA v24.4s, v14.4s,  v2.s[1]
        FMLA v26.4s, v14.4s,  v3.s[1]
        FMLA v28.4s, v14.4s,  v4.s[1]
        FMLA v30.4s, v14.4s,  v5.s[1]
        FMLA v21.4s, v15.4s,  v0.s[1]
        FMLA v23.4s, v15.4s,  v1.s[1]
        FMLA v25.4s, v15.4s,  v2.s[1]
        FMLA v27.4s, v15.4s,  v3.s[1]
        FMLA v29.4s, v15.4s,  v4.s[1]
        FMLA v31.4s, v15.4s,  v5.s[1]

        LDP  q12,  q13, [x5], 32
        FMLA v20.4s, v16.4s,  v0.s[2]
        FMLA v22.4s, v16.4s,  v1.s[2]
        LDP  q14,  q15, [x5], 32
        FMLA v24.4s, v16.4s,  v2.s[2]
        FMLA v26.4s, v16.4s,  v3.s[2]
        PRFM PLDL1KEEP, [x5, 128]      // Prefetch B
        FMLA v28.4s, v16.4s,  v4.s[2]
        FMLA v30.4s, v16.4s,  v5.s[2]
        FMLA v21.4s, v17.4s,  v0.s[2]
        FMLA v23.4s, v17.4s,  v1.s[2]
        PRFM PLDL1KEEP, [x5, 256]
        FMLA v25.4s, v17.4s,  v2.s[2]
        FMLA v27.4s, v17.4s,  v3.s[2]
        FMLA v29.4s, v17.4s,  v4.s[2]
        FMLA v31.4s, v17.4s,  v5.s[2]

        FMLA v20.4s, v18.4s,  v0.s[3]
        FMLA v22.4s, v18.4s,  v1.s[3]
        FMLA v24.4s, v18.4s,  v2.s[3]
        FMLA v26.4s, v18.4s,  v3.s[3]
        FMLA v28.4s, v18.4s,  v4.s[3]
        FMLA v30.4s, v18.4s,  v5.s[3]
        FMLA v21.4s, v19.4s,  v0.s[3]
        FMLA v23.4s, v19.4s,  v1.s[3]
        FMLA v25.4s, v19.4s,  v2.s[3]
        FMLA v27.4s, v19.4s,  v3.s[3]
        FMLA v29.4s, v19.4s,  v4.s[3]
        FMLA v31.4s, v19.4s,  v5.s[3]

        # Second group of 4 A.  48 FMA.  Loads A0 - A4

        LDP  q16,  q17, [x5], 32
        FMLA v20.4s, v12.4s,  v6.s[0]
        FMLA v22.4s, v12.4s,  v7.s[0]
        LDP  q18,  q19, [x5], 32
        FMLA v24.4s, v12.4s,  v8.s[0]
        FMLA v26.4s, v12.4s,  v9.s[0]
        FMLA v28.4s, v12.4s, v10.s[0]
        FMLA v30.4s, v12.4s, v11.s[0]
        FMLA v21.4s, v13.4s,  v6.s[0]
        FMLA v23.4s, v13.4s,  v7.s[0]
        FMLA v25.4s, v13.4s,  v8.s[0]
        FMLA v27.4s, v13.4s,  v9.s[0]
        FMLA v29.4s, v13.4s, v10.s[0]
        FMLA v31.4s, v13.4s, v11.s[0]

        FMLA v20.4s, v14.4s,  v6.s[1]
        FMLA v22.4s, v14.4s,  v7.s[1]
        FMLA v24.4s, v14.4s,  v8.s[1]
        FMLA v26.4s, v14.4s,  v9.s[1]
        FMLA v28.4s, v14.4s, v10.s[1]
        FMLA v30.4s, v14.4s, v11.s[1]
        FMLA v21.4s, v15.4s,  v6.s[1]
        FMLA v23.4s, v15.4s,  v7.s[1]
        FMLA v25.4s, v15.4s,  v8.s[1]
        FMLA v27.4s, v15.4s,  v9.s[1]
        FMLA v29.4s, v15.4s, v10.s[1]
        FMLA v31.4s, v15.4s, v11.s[1]

        # Each A register is reloaded right after its last use in this block.
        LDP  q12,  q13, [x5], 32
        FMLA v20.4s, v16.4s,  v6.s[2]
        FMLA v20.4s, v18.4s,  v6.s[3]
        LDP  q14,  q15, [x5], 32
        FMLA v21.4s, v17.4s,  v6.s[2]
        FMLA v21.4s, v19.4s,  v6.s[3]
        LDP  q0,  q6, [x14], 32
        FMLA v22.4s, v16.4s,  v7.s[2]
        FMLA v22.4s, v18.4s,  v7.s[3]
        FMLA v23.4s, v17.4s,  v7.s[2]
        FMLA v23.4s, v19.4s,  v7.s[3]
        LDP  q1,  q7, [x15], 32
        FMLA v24.4s, v16.4s,  v8.s[2]
        FMLA v24.4s, v18.4s,  v8.s[3]
        FMLA v25.4s, v17.4s,  v8.s[2]
        FMLA v25.4s, v19.4s,  v8.s[3]
        LDP  q2,  q8, [x20], 32
        FMLA v26.4s, v16.4s,  v9.s[2]
        FMLA v26.4s, v18.4s,  v9.s[3]
        FMLA v27.4s, v17.4s,  v9.s[2]
        FMLA v27.4s, v19.4s,  v9.s[3]
        LDP  q3,  q9, [x21], 32
        FMLA v28.4s, v16.4s, v10.s[2]
        FMLA v28.4s, v18.4s, v10.s[3]
        FMLA v29.4s, v17.4s, v10.s[2]
        FMLA v29.4s, v19.4s, v10.s[3]
        LDP  q4,  q10, [x22], 32
        FMLA v30.4s, v16.4s, v11.s[2]
        FMLA v30.4s, v18.4s, v11.s[3]
        SUBS x0, x0, 32
        FMLA v31.4s, v17.4s, v11.s[2]
        FMLA v31.4s, v19.4s, v11.s[3]
        B.HS 2b

        # Epilogue - 8 floats of A (32 bytes)
        # 96 FMA + 6 LDP A + 8 LDP B
        # First block same as main loop.  Second block has no preloads.
3:
        # First group of 4 A.  48 FMA.  Loads A5

        LDP  q5, q11, [x23], 32
        FMLA v20.4s, v12.4s,  v0.s[0]
        FMLA v22.4s, v12.4s,  v1.s[0]
        LDP  q16,  q17, [x5], 32
        FMLA v24.4s, v12.4s,  v2.s[0]
        FMLA v26.4s, v12.4s,  v3.s[0]
        LDP  q18,  q19, [x5], 32
        FMLA v28.4s, v12.4s,  v4.s[0]
        FMLA v30.4s, v12.4s,  v5.s[0]
        FMLA v21.4s, v13.4s,  v0.s[0]
        FMLA v23.4s, v13.4s,  v1.s[0]
        FMLA v25.4s, v13.4s,  v2.s[0]
        FMLA v27.4s, v13.4s,  v3.s[0]
        FMLA v29.4s, v13.4s,  v4.s[0]
        FMLA v31.4s, v13.4s,  v5.s[0]

        FMLA v20.4s, v14.4s,  v0.s[1]
        FMLA v22.4s, v14.4s,  v1.s[1]
        FMLA v24.4s, v14.4s,  v2.s[1]
        FMLA v26.4s, v14.4s,  v3.s[1]
        FMLA v28.4s, v14.4s,  v4.s[1]
        FMLA v30.4s, v14.4s,  v5.s[1]
        FMLA v21.4s, v15.4s,  v0.s[1]
        FMLA v23.4s, v15.4s,  v1.s[1]
        FMLA v25.4s, v15.4s,  v2.s[1]
        FMLA v27.4s, v15.4s,  v3.s[1]
        FMLA v29.4s, v15.4s,  v4.s[1]
        FMLA v31.4s, v15.4s,  v5.s[1]

        LDP  q12,  q13, [x5], 32
        FMLA v20.4s, v16.4s,  v0.s[2]
        FMLA v22.4s, v16.4s,  v1.s[2]
        LDP  q14,  q15, [x5], 32
        FMLA v24.4s, v16.4s,  v2.s[2]
        FMLA v26.4s, v16.4s,  v3.s[2]
        FMLA v28.4s, v16.4s,  v4.s[2]
        FMLA v30.4s, v16.4s,  v5.s[2]
        FMLA v21.4s, v17.4s,  v0.s[2]
        FMLA v23.4s, v17.4s,  v1.s[2]
        FMLA v25.4s, v17.4s,  v2.s[2]
        FMLA v27.4s, v17.4s,  v3.s[2]
        FMLA v29.4s, v17.4s,  v4.s[2]
        FMLA v31.4s, v17.4s,  v5.s[2]

        FMLA v20.4s, v18.4s,  v0.s[3]
        FMLA v22.4s, v18.4s,  v1.s[3]
        FMLA v24.4s, v18.4s,  v2.s[3]
        FMLA v26.4s, v18.4s,  v3.s[3]
        FMLA v28.4s, v18.4s,  v4.s[3]
        FMLA v30.4s, v18.4s,  v5.s[3]
        FMLA v21.4s, v19.4s,  v0.s[3]
        FMLA v23.4s, v19.4s,  v1.s[3]
        FMLA v25.4s, v19.4s,  v2.s[3]
        FMLA v27.4s, v19.4s,  v3.s[3]
        FMLA v29.4s, v19.4s,  v4.s[3]
        FMLA v31.4s, v19.4s,  v5.s[3]

        # Second group of 4 A.  48 FMA. No A Loads, No last B load

        LDP  q16,  q17, [x5], 32
        FMLA v20.4s, v12.4s,  v6.s[0]
        FMLA v22.4s, v12.4s,  v7.s[0]
        LDP  q18,  q19, [x5], 32
        FMLA v24.4s, v12.4s,  v8.s[0]
        FMLA v26.4s, v12.4s,  v9.s[0]
        FMLA v28.4s, v12.4s, v10.s[0]
        FMLA v30.4s, v12.4s, v11.s[0]
        FMLA v21.4s, v13.4s,  v6.s[0]
        FMLA v23.4s, v13.4s,  v7.s[0]
        FMLA v25.4s, v13.4s,  v8.s[0]
        FMLA v27.4s, v13.4s,  v9.s[0]
        FMLA v29.4s, v13.4s, v10.s[0]
        FMLA v31.4s, v13.4s, v11.s[0]

        FMLA v20.4s, v14.4s,  v6.s[1]
        FMLA v22.4s, v14.4s,  v7.s[1]
        FMLA v24.4s, v14.4s,  v8.s[1]
        FMLA v26.4s, v14.4s,  v9.s[1]
        FMLA v28.4s, v14.4s, v10.s[1]
        FMLA v30.4s, v14.4s, v11.s[1]
        FMLA v21.4s, v15.4s,  v6.s[1]
        FMLA v23.4s, v15.4s,  v7.s[1]
        FMLA v25.4s, v15.4s,  v8.s[1]
        FMLA v27.4s, v15.4s,  v9.s[1]
        FMLA v29.4s, v15.4s, v10.s[1]
        FMLA v31.4s, v15.4s, v11.s[1]

        # Last part of epilogue has loads removed.

        FMLA v20.4s, v16.4s,  v6.s[2]
        FMLA v22.4s, v16.4s,  v7.s[2]
        FMLA v24.4s, v16.4s,  v8.s[2]
        FMLA v26.4s, v16.4s,  v9.s[2]
        FMLA v28.4s, v16.4s, v10.s[2]
        FMLA v30.4s, v16.4s, v11.s[2]
        FMLA v21.4s, v17.4s,  v6.s[2]
        FMLA v23.4s, v17.4s,  v7.s[2]
        FMLA v25.4s, v17.4s,  v8.s[2]
        FMLA v27.4s, v17.4s,  v9.s[2]
        FMLA v29.4s, v17.4s, v10.s[2]
        FMLA v31.4s, v17.4s, v11.s[2]

        FMLA v20.4s, v18.4s,  v6.s[3]
        FMLA v22.4s, v18.4s,  v7.s[3]
        FMLA v24.4s, v18.4s,  v8.s[3]
        FMLA v26.4s, v18.4s,  v9.s[3]
        FMLA v28.4s, v18.4s, v10.s[3]
        FMLA v30.4s, v18.4s, v11.s[3]
        FMLA v21.4s, v19.4s,  v6.s[3]
        FMLA v23.4s, v19.4s,  v7.s[3]

        # Load clamping_params values
        # v6/v7 held A0/A1 until the two FMLA above, so restore them here.
        LD2R {v6.4s, v7.4s}, [x8]

        FMLA v25.4s, v19.4s,  v8.s[3]
        FMLA v27.4s, v19.4s,  v9.s[3]
        TST x0, 31               // any of the low 5 bits of k set -> remainder
        FMLA v29.4s, v19.4s, v10.s[3]
        FMLA v31.4s, v19.4s, v11.s[3]
        B.NE 5f

        .p2align 3
4:
        # ks loop
        SUBS x9, x9, 48  // ks -= MR * sizeof(void*)
        B.NE 1b

        # Clamp
        FMIN v20.4s, v20.4s, v6.4s
        FMIN v21.4s, v21.4s, v6.4s
        FMIN v22.4s, v22.4s, v6.4s
        FMIN v23.4s, v23.4s, v6.4s
        FMIN v24.4s, v24.4s, v6.4s
        FMIN v25.4s, v25.4s, v6.4s
        FMIN v26.4s, v26.4s, v6.4s
        FMIN v27.4s, v27.4s, v6.4s
        FMIN v28.4s, v28.4s, v6.4s
        FMIN v29.4s, v29.4s, v6.4s
        FMIN v30.4s, v30.4s, v6.4s
        FMIN v31.4s, v31.4s, v6.4s
        FMAX v20.4s, v20.4s, v7.4s
        FMAX v21.4s, v21.4s, v7.4s
        FMAX v22.4s, v22.4s, v7.4s
        FMAX v23.4s, v23.4s, v7.4s
        FMAX v24.4s, v24.4s, v7.4s
        FMAX v25.4s, v25.4s, v7.4s
        FMAX v26.4s, v26.4s, v7.4s
        FMAX v27.4s, v27.4s, v7.4s
        FMAX v28.4s, v28.4s, v7.4s
        FMAX v29.4s, v29.4s, v7.4s
        FMAX v30.4s, v30.4s, v7.4s
        FMAX v31.4s, v31.4s, v7.4s

        # Store full 6 x 8
        SUBS x1, x1, 8
        B.LO 8f

        # Load cn_stride.  x0 is free here: k is recomputed at label 1.
        # LDR leaves the flags from SUBS intact for the B.HI below.
        LDR x0, [sp, 96]

        STP q30, q31,  [x7]
        ADD x7, x7, x0
        STP q28, q29, [x13]
        ADD x13, x13, x0
        STP q26, q27, [x10]
        ADD x10, x10, x0
        STP q24, q25, [x17]
        ADD x17, x17, x0
        STP q22, q23, [x16]
        ADD x16, x16, x0
        STP q20, q21,  [x6]
        ADD  x6,  x6, x0

        SUB x4, x4, x3  // a -= ks

        # nc loop
        B.HI 0b

        # Restore x20,x21,x22,x23 from stack
        LDP x22, x23, [sp, 80]
        LDP x20, x21, [sp, 64]

        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP  d8,  d9, [sp], 96
        RET

        .p2align 3
5:
        # Is there a remainder?- 4 floats of A (16 bytes)
        # Only multiples of 32 were subtracted from kc, so bits 4..2 of k
        # still match kc.
        TBZ x0, 4, 6f

        # Remainder- 4 floats of A (16 bytes)
        # Load A
        LDR   q0, [x14], 16
        LDR   q1, [x15], 16
        LDR   q2, [x20], 16
        LDR   q3, [x21], 16
        LDR   q4, [x22], 16
        LDR   q5, [x23], 16
        # Load B
        LDP  q12, q13, [x5], 32
        LDP  q14, q15, [x5], 32
        LDP  q16, q17, [x5], 32
        LDP  q18, q19, [x5], 32

        FMLA v20.4s, v12.4s,  v0.s[0]
        FMLA v22.4s, v12.4s,  v1.s[0]
        FMLA v24.4s, v12.4s,  v2.s[0]
        FMLA v26.4s, v12.4s,  v3.s[0]
        FMLA v28.4s, v12.4s,  v4.s[0]
        FMLA v30.4s, v12.4s,  v5.s[0]
        FMLA v21.4s, v13.4s,  v0.s[0]
        FMLA v23.4s, v13.4s,  v1.s[0]
        FMLA v25.4s, v13.4s,  v2.s[0]
        FMLA v27.4s, v13.4s,  v3.s[0]
        FMLA v29.4s, v13.4s,  v4.s[0]
        FMLA v31.4s, v13.4s,  v5.s[0]

        FMLA v20.4s, v14.4s,  v0.s[1]
        FMLA v22.4s, v14.4s,  v1.s[1]
        FMLA v24.4s, v14.4s,  v2.s[1]
        FMLA v26.4s, v14.4s,  v3.s[1]
        FMLA v28.4s, v14.4s,  v4.s[1]
        FMLA v30.4s, v14.4s,  v5.s[1]
        FMLA v21.4s, v15.4s,  v0.s[1]
        FMLA v23.4s, v15.4s,  v1.s[1]
        FMLA v25.4s, v15.4s,  v2.s[1]
        FMLA v27.4s, v15.4s,  v3.s[1]
        FMLA v29.4s, v15.4s,  v4.s[1]
        FMLA v31.4s, v15.4s,  v5.s[1]

        FMLA v20.4s, v16.4s,  v0.s[2]
        FMLA v22.4s, v16.4s,  v1.s[2]
        FMLA v24.4s, v16.4s,  v2.s[2]
        FMLA v26.4s, v16.4s,  v3.s[2]
        FMLA v28.4s, v16.4s,  v4.s[2]
        FMLA v30.4s, v16.4s,  v5.s[2]
        FMLA v21.4s, v17.4s,  v0.s[2]
        FMLA v23.4s, v17.4s,  v1.s[2]
        FMLA v25.4s, v17.4s,  v2.s[2]
        FMLA v27.4s, v17.4s,  v3.s[2]
        FMLA v29.4s, v17.4s,  v4.s[2]
        FMLA v31.4s, v17.4s,  v5.s[2]

        FMLA v20.4s, v18.4s,  v0.s[3]
        FMLA v22.4s, v18.4s,  v1.s[3]
        FMLA v24.4s, v18.4s,  v2.s[3]
        FMLA v26.4s, v18.4s,  v3.s[3]
        FMLA v28.4s, v18.4s,  v4.s[3]
        FMLA v30.4s, v18.4s,  v5.s[3]
        FMLA v21.4s, v19.4s,  v0.s[3]
        FMLA v23.4s, v19.4s,  v1.s[3]
        FMLA v25.4s, v19.4s,  v2.s[3]
        FMLA v27.4s, v19.4s,  v3.s[3]
        FMLA v29.4s, v19.4s,  v4.s[3]
        FMLA v31.4s, v19.4s,  v5.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes)
6:
        TBZ x0, 3, 7f

        # Remainder- 2 floats of A (8 bytes)
        # Load A
        LDR   d0, [x14], 8
        LDR   d1, [x15], 8
        LDR   d2, [x20], 8
        LDR   d3, [x21], 8
        LDR   d4, [x22], 8
        LDR   d5, [x23], 8
        # Load B
        LDP  q12, q13, [x5], 32
        LDP  q14, q15, [x5], 32

        FMLA v20.4s, v12.4s,  v0.s[0]
        FMLA v22.4s, v12.4s,  v1.s[0]
        FMLA v24.4s, v12.4s,  v2.s[0]
        FMLA v26.4s, v12.4s,  v3.s[0]
        FMLA v28.4s, v12.4s,  v4.s[0]
        FMLA v30.4s, v12.4s,  v5.s[0]
        FMLA v21.4s, v13.4s,  v0.s[0]
        FMLA v23.4s, v13.4s,  v1.s[0]
        FMLA v25.4s, v13.4s,  v2.s[0]
        FMLA v27.4s, v13.4s,  v3.s[0]
        FMLA v29.4s, v13.4s,  v4.s[0]
        FMLA v31.4s, v13.4s,  v5.s[0]

        FMLA v20.4s, v14.4s,  v0.s[1]
        FMLA v22.4s, v14.4s,  v1.s[1]
        FMLA v24.4s, v14.4s,  v2.s[1]
        FMLA v26.4s, v14.4s,  v3.s[1]
        FMLA v28.4s, v14.4s,  v4.s[1]
        FMLA v30.4s, v14.4s,  v5.s[1]
        FMLA v21.4s, v15.4s,  v0.s[1]
        FMLA v23.4s, v15.4s,  v1.s[1]
        FMLA v25.4s, v15.4s,  v2.s[1]
        FMLA v27.4s, v15.4s,  v3.s[1]
        FMLA v29.4s, v15.4s,  v4.s[1]
        FMLA v31.4s, v15.4s,  v5.s[1]

        # Is there a remainder?- 1 float of A (4 bytes)
7:
        TBZ x0, 2, 4b

        # Remainder- 1 float of A (4 bytes)
        # Load A
        LDR   s0, [x14], 4
        LDR   s1, [x15], 4
        LDR   s2, [x20], 4
        LDR   s3, [x21], 4
        LDR   s4, [x22], 4
        LDR   s5, [x23], 4
        # Load B
        LDP  q12, q13, [x5], 32

        FMLA v20.4s, v12.4s,  v0.s[0]
        FMLA v22.4s, v12.4s,  v1.s[0]
        FMLA v24.4s, v12.4s,  v2.s[0]
        FMLA v26.4s, v12.4s,  v3.s[0]
        FMLA v28.4s, v12.4s,  v4.s[0]
        FMLA v30.4s, v12.4s,  v5.s[0]
        FMLA v21.4s, v13.4s,  v0.s[0]
        FMLA v23.4s, v13.4s,  v1.s[0]
        FMLA v25.4s, v13.4s,  v2.s[0]
        FMLA v27.4s, v13.4s,  v3.s[0]
        FMLA v29.4s, v13.4s,  v4.s[0]
        FMLA v31.4s, v13.4s,  v5.s[0]
        B 4b

        # Store odd width
        # x1 = nc - 8 here; its low 3 bits equal those of the remaining nc.
8:
        TBZ x1, 2, 9f
        STR q30,  [x7], 16
        MOV v30.16b, v31.16b
        STR q28, [x13], 16
        MOV v28.16b, v29.16b
        STR q26, [x10], 16
        MOV v26.16b, v27.16b
        STR q24, [x17], 16
        MOV v24.16b, v25.16b
        STR q22, [x16], 16
        MOV v22.16b, v23.16b
        STR q20,  [x6], 16
        MOV v20.16b, v21.16b
9:
        TBZ x1, 1, 10f
        STR d30,  [x7], 8
        DUP d30, v30.d[1]
        STR d28, [x13], 8
        DUP d28, v28.d[1]
        STR d26, [x10], 8
        DUP d26, v26.d[1]
        STR d24, [x17], 8
        DUP d24, v24.d[1]
        STR d22, [x16], 8
        DUP d22, v22.d[1]
        STR d20,  [x6], 8
        DUP d20, v20.d[1]

10:
        TBZ x1, 0, 11f
        STR s30,  [x7]
        STR s28, [x13]
        STR s26, [x10]
        STR s24, [x17]
        STR s22, [x16]
        STR s20,  [x6]
11:
        # Restore x20,x21,x22,x23 from stack
        LDP x22, x23, [sp, 80]
        LDP x20, x21, [sp, 64]

        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP  d8,  d9, [sp], 96
        RET

END_FUNCTION xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a73

// On ELF targets, emit an empty .note.GNU-stack section (no flags set).
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif