• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1// Auto-generated file. Do not edit!
2//   Template: src/f32-gemm/5x8-aarch64-neonfma-cortex-a75.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2019 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10#include <xnnpack/assembly.h>
11
12# void xnn_f32_gemminc_minmax_ukernel_5x8__aarch64_neonfma_cortex_a75(
13#     size_t mr,                x0
14#     size_t nc,                x1
15#     size_t kc,                x2 / x0
16#     const uint8_t*restrict a, x3
17#     size_t a_stride,          x4
18#     const void*restrict w,    x5
19#     uint8_t*restrict c,       x6
20#     size_t cm_stride,         x7
21#     size_t cn_stride,         [sp] -> x14
22#     const float*restrict acc,  [sp + 8] -> x15
23#     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 16] -> x8
24
25# unused compared to 6x8
26#  x4 a5
27#  x7 c5
28# A5  v10 v11
29# C   v30 v31
30
31# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
32
33# A pointers
34#  x3 a0
35#  x9 a1
36# x10 a2
37# x11 a3
38# x12 a4
39
40# C pointers
41#  x6 c0
42# x16 c1
43# x17 c2
44# x13 c3
45#  x7 c4
46
47# Vector register usage
48# A0   v0  v1
49# A1   v2  v3
50# A2   v4  v5
51# A3   v6  v7
52# A4   v8  v9
53# B   v12 v13 v14 v15
54# B   v16 v17 v18 v19
55# C   v20 v21
56# C   v22 v23
57# C   v24 v25
58# C   v26 v27
59# C   v28 v29
60# Clamp v30 v31
61
62BEGIN_FUNCTION xnn_f32_gemminc_minmax_ukernel_5x8__aarch64_neonfma_cortex_a75
        # Overview (as visible in the code below):
        #  - Each pass through label 0 produces one tile of 5 rows x 8 floats of C.
        #  - Accumulators v20-v29 are seeded from the acc buffer (x15), then
        #    updated with FMLA using B streamed from x5 and A lanes read via
        #    x3, x9, x10, x11, x12.
        #  - Results are bounded with FMAX against v30 and FMIN against v31
        #    before being stored through x6, x16, x17, x13, x7.
        #  - Numeric labels: 0 tile loop, 1 main K loop, 2 epilogue, 3 clamp and
        #    store, 4/5/6 K remainders of 16/8/4 bytes, 7/8/9 partial-width
        #    stores of 4/2/1 floats, 10 exit.
63
        # The three stack arguments are read before sp is moved, so offsets
        # 0, 8 and 16 are relative to the sp value on entry.
64        # Load cn_stride, acc
65        LDP x14, x15, [sp]
66        # Load params pointer
67        LDR x8, [sp, 16]
68
        # The pre-indexed STP reserves 48 bytes. Only d8-d9 and d12-d15 are
        # spilled: v8, v9 and v12-v15 are written below, v10/v11 are not.
        # The saves are interleaved with the pointer setup.
69        # Clamp A and C pointers / Save d8-d15 on stack
70        STP  d8,  d9, [sp, -48]!
71        CMP x0, 2                // if mr < 2
72        ADD x9, x3, x4           // a1 = a0 + a_stride
73        ADD x16, x6, x7          // c1 = c0 + cm_stride
74        CSEL x9, x3, x9, LO      //   a1 = a0
75        CSEL x16, x6, x16, LO    //   c1 = c0
76
        # The LS selects below still use the flags of "CMP x0, 2": the STP,
        # ADD and CSEL instructions in between do not write the flags.
77        STP d12, d13, [sp, 16]
78        ADD x10, x9, x4          // a2 = a1 + a_stride
79        ADD x17, x16, x7         // c2 = c1 + cm_stride
80                                 // if mr <= 2
81        CSEL x10, x9, x10, LS    //   a2 = a1
82        CSEL x17, x16, x17, LS   //   c2 = c1
83
84        STP d14, d15, [sp, 32]
85        CMP x0, 4                // if mr < 4
86        ADD x11, x10, x4         // a3 = a2 + a_stride
87        ADD x13, x17, x7         // c3 = c2 + cm_stride
88        CSEL x11, x10, x11, LO   //   a3 = a2
89        CSEL x13, x17, x13, LO   //   c3 = c2
90
        # x7 is overwritten here: it stops holding cm_stride and becomes the
        # c4 row pointer for the rest of the function.
91        ADD x12, x11, x4         // a4 = a3 + a_stride
92        ADD x7, x13, x7         // c4 = c3 + cm_stride
93                                 // if mr <= 4
94        CSEL x12, x11, x12, LS   //   a4 = a3
95        CSEL x7, x13, x7, LS   //   c4 = c3
96
        # LD2R reads two consecutive floats at x8 and replicates the first
        # into every lane of v30 and the second into every lane of v31.
        # v30 is the FMAX operand and v31 the FMIN operand in the clamp at
        # label 3.
97        # Load clamp values
98        LD2R {v30.4s, v31.4s}, [x8]
99
        # Tile loop head; re-entered by "B.HI 0b" after a full 5x8 store.
1000:
        # Five LDP with post-increment consume 160 bytes from acc (x15) per
        # tile: two q registers for each of the five rows.
101        # Load initial accumulators
102        LDP q20, q21, [x15], 32
103        LDP q22, q23, [x15], 32
104        LDP q24, q25, [x15], 32
105        LDP q26, q27, [x15], 32
106        LDP q28, q29, [x15], 32
107        PRFM PLDL1KEEP, [x5, 0]  // Prefetch B
108        PRFM PLDL1KEEP, [x5, 64]
109        PRFM PLDL1KEEP, [x5, 128]
110        PRFM PLDL1KEEP, [x5, 192]
111        PRFM PLDL1KEEP,  [x3]    // Prefetch A
112        PRFM PLDL1KEEP,  [x9]
113        PRFM PLDL1KEEP, [x10]
114        PRFM PLDL1KEEP, [x11]
115        PRFM PLDL1KEEP, [x12]
116
        # x0 (mr) is not read again after the pointer setup, so it is reused
        # as the K byte counter. If kc < 32 the subtraction borrows and
        # control goes to the remainder code at label 4.
117        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
118        SUBS x0, x2, 32  // k = kc - 32
119        B.LO 4f
120
        # Loads the first 16 bytes of each A row and three of the four B
        # register pairs; q18/q19 are loaded by the first LDP inside the
        # loop / epilogue body.
121        # Prologue - loads for main loop of 80 FMA
122        LDR   q0,  [x3], 16
123        LDR   q2,  [x9], 16
124        LDR   q4, [x10], 16
125        LDR   q6, [x11], 16
126        LDR   q8, [x12], 16
127        LDP  q12,  q13, [x5], 32  // Fetch 3 B (4th deferred)
128        LDP  q14,  q15, [x5], 32
129        LDP  q16,  q17, [x5], 32
130
131        # Is there at least 8 floats (32 bytes) for main loop?
132        SUBS x0, x0, 32
133        B.LO 2f
134
        # Per iteration: 32 bytes are read from each A row (two LDR q) and
        # 256 bytes from B (eight LDP q). Loads of data used by the following
        # group are placed between the FMLAs of the current group.
135        # Main loop - 8 floats of A (32 bytes)
136        # 80 FMA + 10 LDR A + 8 LDP B
1371:
        # Uses v0/v2/v4/v6/v8 (one register per row) while loading
        # v1/v3/v5/v7/v9 for the second group.
138        # First group of 4 A.  40 FMA.
139        FMLA v20.4s, v12.4s,  v0.s[0]
140        LDP  q18,  q19, [x5], 32      // Load last B
141        FMLA v22.4s, v12.4s,  v2.s[0]
142        FMLA v24.4s, v12.4s,  v4.s[0]
143        FMLA v26.4s, v12.4s,  v6.s[0]
144        PRFM PLDL1KEEP, [x5, 128]      // Prefetch B
145        FMLA v28.4s, v12.4s,  v8.s[0]
146        FMLA v21.4s, v13.4s,  v0.s[0]
147        FMLA v23.4s, v13.4s,  v2.s[0]
148        PRFM PLDL1KEEP, [x5, 256]
149        FMLA v25.4s, v13.4s,  v4.s[0]
150        FMLA v27.4s, v13.4s,  v6.s[0]
151        FMLA v29.4s, v13.4s,  v8.s[0]
152        LDR   q1,  [x3], 16            // Load next 5 A
153
154        FMLA v20.4s, v14.4s,  v0.s[1]
155        FMLA v22.4s, v14.4s,  v2.s[1]
156        FMLA v24.4s, v14.4s,  v4.s[1]
157        LDR   q3,  [x9], 16
158        FMLA v26.4s, v14.4s,  v6.s[1]
159        FMLA v28.4s, v14.4s,  v8.s[1]
160        FMLA v21.4s, v15.4s,  v0.s[1]
161        LDR   q5, [x10], 16
162        FMLA v23.4s, v15.4s,  v2.s[1]
163        FMLA v25.4s, v15.4s,  v4.s[1]
164        FMLA v27.4s, v15.4s,  v6.s[1]
165        LDR   q7, [x11], 16
166        FMLA v29.4s, v15.4s,  v8.s[1]
167
168        FMLA v20.4s, v16.4s,  v0.s[2]
169        FMLA v22.4s, v16.4s,  v2.s[2]
170        LDR   q9, [x12], 16
171        FMLA v24.4s, v16.4s,  v4.s[2]
172        FMLA v26.4s, v16.4s,  v6.s[2]
173        FMLA v28.4s, v16.4s,  v8.s[2]
174        LDP  q12,  q13, [x5], 32       // Load 4 B
175        FMLA v21.4s, v17.4s,  v0.s[2]
176        FMLA v23.4s, v17.4s,  v2.s[2]
177        FMLA v25.4s, v17.4s,  v4.s[2]
178        LDP  q14,  q15, [x5], 32
179        FMLA v27.4s, v17.4s,  v6.s[2]
180        FMLA v29.4s, v17.4s,  v8.s[2]
181
182        FMLA v20.4s, v18.4s,  v0.s[3]
183        LDP  q16,  q17, [x5], 32
184        FMLA v22.4s, v18.4s,  v2.s[3]
185        FMLA v24.4s, v18.4s,  v4.s[3]
186        FMLA v26.4s, v18.4s,  v6.s[3]
187        FMLA v28.4s, v18.4s,  v8.s[3]
188        FMLA v21.4s, v19.4s,  v0.s[3]
189        FMLA v23.4s, v19.4s,  v2.s[3]
190        FMLA v25.4s, v19.4s,  v4.s[3]
191        FMLA v27.4s, v19.4s,  v6.s[3]
192        FMLA v29.4s, v19.4s,  v8.s[3]
193        LDP  q18,  q19, [x5], 32
194
        # Uses v1/v3/v5/v7/v9 while reloading v0/v2/v4/v6/v8 and three B
        # pairs for the next pass through the first group.
195        # Second group of 4 A.  40 FMA.
196        FMLA v20.4s, v12.4s,  v1.s[0]
197        FMLA v22.4s, v12.4s,  v3.s[0]
198        FMLA v24.4s, v12.4s,  v5.s[0]
199        LDR   q0,  [x3], 16           // Load next 5 A
200        FMLA v26.4s, v12.4s,  v7.s[0]
201        FMLA v28.4s, v12.4s,  v9.s[0]
202        FMLA v21.4s, v13.4s,  v1.s[0]
203        LDR   q2,  [x9], 16
204        FMLA v23.4s, v13.4s,  v3.s[0]
205        FMLA v25.4s, v13.4s,  v5.s[0]
206        FMLA v27.4s, v13.4s,  v7.s[0]
207        LDR   q4, [x10], 16
208        FMLA v29.4s, v13.4s,  v9.s[0]
209
210        FMLA v20.4s, v14.4s,  v1.s[1]
211        FMLA v22.4s, v14.4s,  v3.s[1]
212        LDR   q6, [x11], 16
213        FMLA v24.4s, v14.4s,  v5.s[1]
214        FMLA v26.4s, v14.4s,  v7.s[1]
215        FMLA v28.4s, v14.4s,  v9.s[1]
216        LDR   q8, [x12], 16
217        FMLA v21.4s, v15.4s,  v1.s[1]
218        FMLA v23.4s, v15.4s,  v3.s[1]
219        FMLA v25.4s, v15.4s,  v5.s[1]
220        LDP  q12,  q13, [x5], 32       // Load next 3 B (not last)
221        FMLA v27.4s, v15.4s,  v7.s[1]
222        FMLA v29.4s, v15.4s,  v9.s[1]
223
224        FMLA v20.4s, v16.4s,  v1.s[2]
225        LDP  q14,  q15, [x5], 32
226        FMLA v22.4s, v16.4s,  v3.s[2]
227        FMLA v24.4s, v16.4s,  v5.s[2]
228        FMLA v26.4s, v16.4s,  v7.s[2]
229        FMLA v28.4s, v16.4s,  v9.s[2]
230        FMLA v21.4s, v17.4s,  v1.s[2]
231        FMLA v23.4s, v17.4s,  v3.s[2]
232        FMLA v25.4s, v17.4s,  v5.s[2]
233        FMLA v27.4s, v17.4s,  v7.s[2]
234        FMLA v29.4s, v17.4s,  v9.s[2]
235        LDP  q16,  q17, [x5], 32
236
        # The SUBS below sets the flags tested by "B.HS 1b"; the FMLAs that
        # follow it do not write the flags.
237        FMLA v20.4s, v18.4s,  v1.s[3]
238        FMLA v22.4s, v18.4s,  v3.s[3]
239        SUBS x0, x0, 32
240        FMLA v24.4s, v18.4s,  v5.s[3]
241        FMLA v26.4s, v18.4s,  v7.s[3]
242        FMLA v28.4s, v18.4s,  v9.s[3]
243        FMLA v21.4s, v19.4s,  v1.s[3]
244        FMLA v23.4s, v19.4s,  v3.s[3]
245        FMLA v25.4s, v19.4s,  v5.s[3]
246        FMLA v27.4s, v19.4s,  v7.s[3]
247        FMLA v29.4s, v19.4s,  v9.s[3]
248        B.HS 1b
249
        # Reached by fall-through from the main loop or directly from the
        # prologue via "B.LO 2f". Consumes the registers already loaded and
        # reads only what this block itself still needs.
250        # Epilogue - 8 floats of A (32 bytes)
251        # 80 FMA + 5 LDR A + 5 LDP B
252        # First block same as main loop.  Second block has no preloads.
2532:
254        # First group of 4 A.  40 FMA.
255        FMLA v20.4s, v12.4s,  v0.s[0]
256        LDP  q18,  q19, [x5], 32      // Load last B
257        FMLA v22.4s, v12.4s,  v2.s[0]
258        FMLA v24.4s, v12.4s,  v4.s[0]
259        FMLA v26.4s, v12.4s,  v6.s[0]
260        PRFM PLDL1KEEP, [x5, 128]      // Prefetch B
261        FMLA v28.4s, v12.4s,  v8.s[0]
262        FMLA v21.4s, v13.4s,  v0.s[0]
263        FMLA v23.4s, v13.4s,  v2.s[0]
264        PRFM PLDL1KEEP, [x5, 256]
265        FMLA v25.4s, v13.4s,  v4.s[0]
266        FMLA v27.4s, v13.4s,  v6.s[0]
267        FMLA v29.4s, v13.4s,  v8.s[0]
268        LDR   q1,  [x3], 16            // Load next 5 A
269
270        FMLA v20.4s, v14.4s,  v0.s[1]
271        FMLA v22.4s, v14.4s,  v2.s[1]
272        FMLA v24.4s, v14.4s,  v4.s[1]
273        LDR   q3,  [x9], 16
274        FMLA v26.4s, v14.4s,  v6.s[1]
275        FMLA v28.4s, v14.4s,  v8.s[1]
276        FMLA v21.4s, v15.4s,  v0.s[1]
277        LDR   q5, [x10], 16
278        FMLA v23.4s, v15.4s,  v2.s[1]
279        FMLA v25.4s, v15.4s,  v4.s[1]
280        FMLA v27.4s, v15.4s,  v6.s[1]
281        LDR   q7, [x11], 16
282        FMLA v29.4s, v15.4s,  v8.s[1]
283
284        FMLA v20.4s, v16.4s,  v0.s[2]
285        FMLA v22.4s, v16.4s,  v2.s[2]
286        LDR   q9, [x12], 16
287        FMLA v24.4s, v16.4s,  v4.s[2]
288        FMLA v26.4s, v16.4s,  v6.s[2]
289        FMLA v28.4s, v16.4s,  v8.s[2]
290        LDP  q12,  q13, [x5], 32       // Load 4 B
291        FMLA v21.4s, v17.4s,  v0.s[2]
292        FMLA v23.4s, v17.4s,  v2.s[2]
293        FMLA v25.4s, v17.4s,  v4.s[2]
294        LDP  q14,  q15, [x5], 32
295        FMLA v27.4s, v17.4s,  v6.s[2]
296        FMLA v29.4s, v17.4s,  v8.s[2]
297
298        FMLA v20.4s, v18.4s,  v0.s[3]
299        LDP  q16,  q17, [x5], 32
300        FMLA v22.4s, v18.4s,  v2.s[3]
301        FMLA v24.4s, v18.4s,  v4.s[3]
302        FMLA v26.4s, v18.4s,  v6.s[3]
303        FMLA v28.4s, v18.4s,  v8.s[3]
304        FMLA v21.4s, v19.4s,  v0.s[3]
305        FMLA v23.4s, v19.4s,  v2.s[3]
306        FMLA v25.4s, v19.4s,  v4.s[3]
307        FMLA v27.4s, v19.4s,  v6.s[3]
308        FMLA v29.4s, v19.4s,  v8.s[3]
309        LDP  q18,  q19, [x5], 32
310
311        # Second group of 4 A.  40 FMA.
312        FMLA v20.4s, v12.4s,  v1.s[0]
313        FMLA v22.4s, v12.4s,  v3.s[0]
314        FMLA v24.4s, v12.4s,  v5.s[0]
315        FMLA v26.4s, v12.4s,  v7.s[0]
316        FMLA v28.4s, v12.4s,  v9.s[0]
317        FMLA v21.4s, v13.4s,  v1.s[0]
318        FMLA v23.4s, v13.4s,  v3.s[0]
319        FMLA v25.4s, v13.4s,  v5.s[0]
320        FMLA v27.4s, v13.4s,  v7.s[0]
321        FMLA v29.4s, v13.4s,  v9.s[0]
322
323        FMLA v20.4s, v14.4s,  v1.s[1]
324        FMLA v22.4s, v14.4s,  v3.s[1]
325        FMLA v24.4s, v14.4s,  v5.s[1]
326        FMLA v26.4s, v14.4s,  v7.s[1]
327        FMLA v28.4s, v14.4s,  v9.s[1]
328        FMLA v21.4s, v15.4s,  v1.s[1]
329        FMLA v23.4s, v15.4s,  v3.s[1]
330        FMLA v25.4s, v15.4s,  v5.s[1]
331        FMLA v27.4s, v15.4s,  v7.s[1]
332        FMLA v29.4s, v15.4s,  v9.s[1]
333
334        FMLA v20.4s, v16.4s,  v1.s[2]
335        FMLA v22.4s, v16.4s,  v3.s[2]
336        FMLA v24.4s, v16.4s,  v5.s[2]
337        FMLA v26.4s, v16.4s,  v7.s[2]
338        FMLA v28.4s, v16.4s,  v9.s[2]
339        FMLA v21.4s, v17.4s,  v1.s[2]
340        FMLA v23.4s, v17.4s,  v3.s[2]
341        FMLA v25.4s, v17.4s,  v5.s[2]
342        FMLA v27.4s, v17.4s,  v7.s[2]
343        FMLA v29.4s, v17.4s,  v9.s[2]
        # x0 has only had multiples of 32 subtracted from kc, so its low
        # five bits equal those of kc. A non-zero result means some K bytes
        # are left over; the flags are consumed by "B.NE 4f" after the FMLAs.
344        TST x0, 31
345
346        FMLA v20.4s, v18.4s,  v1.s[3]
347        FMLA v22.4s, v18.4s,  v3.s[3]
348        FMLA v24.4s, v18.4s,  v5.s[3]
349        FMLA v26.4s, v18.4s,  v7.s[3]
350        FMLA v28.4s, v18.4s,  v9.s[3]
351        FMLA v21.4s, v19.4s,  v1.s[3]
352        FMLA v23.4s, v19.4s,  v3.s[3]
353        FMLA v25.4s, v19.4s,  v5.s[3]
354        FMLA v27.4s, v19.4s,  v7.s[3]
355        FMLA v29.4s, v19.4s,  v9.s[3]
356        B.NE 4f
357
        # Also the join point for the remainder paths ("TBZ x0, 2, 3b" and
        # "B 3b"). FMAX against v30 applies the lower bound, FMIN against
        # v31 the upper bound.
358        # Clamp
3593:
359        FMAX v20.4s, v20.4s, v30.4s
        # nc -= 8. These flags feed both "B.LO 7f" and "B.HI 0b" below; the
        # FMAX/FMIN, SUB, ADD and STP in between leave the flags untouched.
361        SUBS x1, x1, 8
362        FMAX v21.4s, v21.4s, v30.4s
363        FMAX v22.4s, v22.4s, v30.4s
364        FMAX v23.4s, v23.4s, v30.4s
365        FMAX v24.4s, v24.4s, v30.4s
366        FMAX v25.4s, v25.4s, v30.4s
367        FMAX v26.4s, v26.4s, v30.4s
368        FMAX v27.4s, v27.4s, v30.4s
369        FMAX v28.4s, v28.4s, v30.4s
370        FMAX v29.4s, v29.4s, v30.4s
371        FMIN v20.4s, v20.4s, v31.4s
372        FMIN v21.4s, v21.4s, v31.4s
373        FMIN v22.4s, v22.4s, v31.4s
374        FMIN v23.4s, v23.4s, v31.4s
375        FMIN v24.4s, v24.4s, v31.4s
376        FMIN v25.4s, v25.4s, v31.4s
377        FMIN v26.4s, v26.4s, v31.4s
378        FMIN v27.4s, v27.4s, v31.4s
379        FMIN v28.4s, v28.4s, v31.4s
380        FMIN v29.4s, v29.4s, v31.4s
381
        # Fewer than 8 columns were left (the SUBS borrowed): take the
        # partial-width store at label 7.
382        # Store full 5 x 8
383        B.LO 7f
384
        # Each A pointer is rewound by kc bytes so the next tile re-reads the
        # same rows; each C pointer is advanced by cn_stride (x14). Rows are
        # stored from c4 down to c0.
385        SUB  x3,  x3, x2 // a0 -= kc
386        STP q28, q29, [x7]
387        ADD x7, x7, x14
388        SUB  x9,  x9, x2 // a1 -= kc
389        STP q26, q27, [x13]
390        ADD x13, x13, x14
391        SUB x10, x10, x2 // a2 -= kc
392        STP q24, q25, [x17]
393        ADD x17, x17, x14
394        SUB x11, x11, x2 // a3 -= kc
395        STP q22, q23, [x16]
396        ADD x16, x16, x14
397        SUB x12, x12, x2 // a4 -= kc
398        STP q20, q21,  [x6]
399        ADD  x6,  x6, x14
400
        # Loop again while columns remain after the "SUBS x1, x1, 8" above.
401        B.HI 0b
402
        # Mirror of the entry sequence; the post-indexed LDP releases the
        # 48 bytes of stack.
403        # Restore d8-d15 from stack
404        LDP d14, d15, [sp, 32]
405        LDP d12, d13, [sp, 16]
406        LDP  d8,  d9, [sp], 48
407        RET
408
409        # K remainder: fewer than 8 floats (32 bytes) of A are left
4104:
        # Reached from "B.LO 4f" when kc < 32 and from the epilogue when
        # kc is not a multiple of 32. Bits 4, 3 and 2 of x0 (same as those of
        # kc) select the 16-, 8- and 4-byte steps in turn.
411        # Is there a remainder?- 4 floats of A (16 bytes)
412        TBZ x0, 4, 5f
413
414        # Remainder- 4 floats of A (16 bytes)
415        # Load A
416        LDR   q0,  [x3], 16
417        LDR   q2,  [x9], 16
418        LDR   q4, [x10], 16
419        LDR   q6, [x11], 16
420        LDR   q8, [x12], 16
421        # Load B
422        LDP  q12,  q13, [x5], 32
423        LDP  q14,  q15, [x5], 32
424        LDP  q16,  q17, [x5], 32
425        LDP  q18,  q19, [x5], 32
426
427        FMLA v20.4s, v12.4s,  v0.s[0]
428        FMLA v22.4s, v12.4s,  v2.s[0]
429        FMLA v24.4s, v12.4s,  v4.s[0]
430        FMLA v26.4s, v12.4s,  v6.s[0]
431        FMLA v28.4s, v12.4s,  v8.s[0]
432        FMLA v21.4s, v13.4s,  v0.s[0]
433        FMLA v23.4s, v13.4s,  v2.s[0]
434        FMLA v25.4s, v13.4s,  v4.s[0]
435        FMLA v27.4s, v13.4s,  v6.s[0]
436        FMLA v29.4s, v13.4s,  v8.s[0]
437
438        FMLA v20.4s, v14.4s,  v0.s[1]
439        FMLA v22.4s, v14.4s,  v2.s[1]
440        FMLA v24.4s, v14.4s,  v4.s[1]
441        FMLA v26.4s, v14.4s,  v6.s[1]
442        FMLA v28.4s, v14.4s,  v8.s[1]
443        FMLA v21.4s, v15.4s,  v0.s[1]
444        FMLA v23.4s, v15.4s,  v2.s[1]
445        FMLA v25.4s, v15.4s,  v4.s[1]
446        FMLA v27.4s, v15.4s,  v6.s[1]
447        FMLA v29.4s, v15.4s,  v8.s[1]
448
449        FMLA v20.4s, v16.4s,  v0.s[2]
450        FMLA v22.4s, v16.4s,  v2.s[2]
451        FMLA v24.4s, v16.4s,  v4.s[2]
452        FMLA v26.4s, v16.4s,  v6.s[2]
453        FMLA v28.4s, v16.4s,  v8.s[2]
454        FMLA v21.4s, v17.4s,  v0.s[2]
455        FMLA v23.4s, v17.4s,  v2.s[2]
456        FMLA v25.4s, v17.4s,  v4.s[2]
457        FMLA v27.4s, v17.4s,  v6.s[2]
458        FMLA v29.4s, v17.4s,  v8.s[2]
459
460        FMLA v20.4s, v18.4s,  v0.s[3]
461        FMLA v22.4s, v18.4s,  v2.s[3]
462        FMLA v24.4s, v18.4s,  v4.s[3]
463        FMLA v26.4s, v18.4s,  v6.s[3]
464        FMLA v28.4s, v18.4s,  v8.s[3]
465        FMLA v21.4s, v19.4s,  v0.s[3]
466        FMLA v23.4s, v19.4s,  v2.s[3]
467        FMLA v25.4s, v19.4s,  v4.s[3]
468        FMLA v27.4s, v19.4s,  v6.s[3]
469        FMLA v29.4s, v19.4s,  v8.s[3]
470
471        # Is there a remainder?- 2 floats of A (8 bytes)
4725:
473        TBZ x0, 3, 6f
474
        # The d-register loads fill only lanes 0 and 1, which are the only
        # lanes the FMLAs of this step read.
475        # Remainder- 2 floats of A (8 bytes)
476        # Load A
477        LDR   d0,  [x3], 8
478        LDR   d2,  [x9], 8
479        LDR   d4, [x10], 8
480        LDR   d6, [x11], 8
481        LDR   d8, [x12], 8
482        # Load B
483        LDP  q12,  q13, [x5], 32
484        LDP  q14,  q15, [x5], 32
485
486        FMLA v20.4s, v12.4s,  v0.s[0]
487        FMLA v22.4s, v12.4s,  v2.s[0]
488        FMLA v24.4s, v12.4s,  v4.s[0]
489        FMLA v26.4s, v12.4s,  v6.s[0]
490        FMLA v28.4s, v12.4s,  v8.s[0]
491        FMLA v21.4s, v13.4s,  v0.s[0]
492        FMLA v23.4s, v13.4s,  v2.s[0]
493        FMLA v25.4s, v13.4s,  v4.s[0]
494        FMLA v27.4s, v13.4s,  v6.s[0]
495        FMLA v29.4s, v13.4s,  v8.s[0]
496
497        FMLA v20.4s, v14.4s,  v0.s[1]
498        FMLA v22.4s, v14.4s,  v2.s[1]
499        FMLA v24.4s, v14.4s,  v4.s[1]
500        FMLA v26.4s, v14.4s,  v6.s[1]
501        FMLA v28.4s, v14.4s,  v8.s[1]
502        FMLA v21.4s, v15.4s,  v0.s[1]
503        FMLA v23.4s, v15.4s,  v2.s[1]
504        FMLA v25.4s, v15.4s,  v4.s[1]
505        FMLA v27.4s, v15.4s,  v6.s[1]
506        FMLA v29.4s, v15.4s,  v8.s[1]
507
508        # Is there a remainder?- 1 float of A (4 bytes)
5096:
        # If bit 2 is clear, K is finished: go back to the clamp at label 3.
510        TBZ x0, 2, 3b
511
        # The s-register loads fill only lane 0, the only lane read below.
512        # Remainder- 1 float of A (4 bytes)
513        # Load A
514        LDR   s0,  [x3], 4
515        LDR   s2,  [x9], 4
516        LDR   s4, [x10], 4
517        LDR   s6, [x11], 4
518        LDR   s8, [x12], 4
519        # Load B
520        LDP  q12,  q13, [x5], 32
521
522        FMLA v20.4s, v12.4s,  v0.s[0]
523        FMLA v22.4s, v12.4s,  v2.s[0]
524        FMLA v24.4s, v12.4s,  v4.s[0]
525        FMLA v26.4s, v12.4s,  v6.s[0]
526        FMLA v28.4s, v12.4s,  v8.s[0]
527        FMLA v21.4s, v13.4s,  v0.s[0]
528        FMLA v23.4s, v13.4s,  v2.s[0]
529        FMLA v25.4s, v13.4s,  v4.s[0]
530        FMLA v27.4s, v13.4s,  v6.s[0]
531        FMLA v29.4s, v13.4s,  v8.s[0]
532        B 3b
533
        # Reached when "SUBS x1, x1, 8" borrowed. Exactly 8 was subtracted,
        # so the low three bits of x1 still equal the number of columns
        # left. Bits 2, 1 and 0 select stores of 4, 2 and 1 floats per row;
        # after each store the not-yet-written part of the row is moved
        # down into the low lanes for the next step.
534        # Store odd width
5357:
536        TBZ x1, 2, 8f
537        STR q28, [x7], 16
538        MOV v28.16b, v29.16b
539        STR q26, [x13], 16
540        MOV v26.16b, v27.16b
541        STR q24, [x17], 16
542        MOV v24.16b, v25.16b
543        STR q22, [x16], 16
544        MOV v22.16b, v23.16b
545        STR q20,  [x6], 16
546        MOV v20.16b, v21.16b
5478:
548        TBZ x1, 1, 9f
549        STR d28, [x7], 8
550        DUP d28, v28.d[1]
551        STR d26, [x13], 8
552        DUP d26, v26.d[1]
553        STR d24, [x17], 8
554        DUP d24, v24.d[1]
555        STR d22, [x16], 8
556        DUP d22, v22.d[1]
557        STR d20,  [x6], 8
558        DUP d20, v20.d[1]
559
5609:
561        TBZ x1, 0, 10f
562        STR s28, [x7]
563        STR s26, [x13]
564        STR s24, [x17]
565        STR s22, [x16]
566        STR s20,  [x6]
56710:
568        # Restore d8-d15 from stack
569        LDP d14, d15, [sp, 32]
570        LDP d12, d13, [sp, 16]
571        LDP  d8,  d9, [sp], 48
572        RET
573
574END_FUNCTION xnn_f32_gemminc_minmax_ukernel_5x8__aarch64_neonfma_cortex_a75
575
576#ifdef __ELF__
577.section ".note.GNU-stack","",%progbits
578#endif
579