• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1// Auto-generated file. Do not edit!
2//   Template: src/f32-igemm/5x8-aarch64-neonfma-cortex-a75.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2019 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10#include <xnnpack/assembly.h>
11
12# void xnn_f32_igemm_minmax_ukernel_5x8__aarch64_neonfma_cortex_a75(
13#     size_t mr,                         x0
14#     size_t nc,                         x1
15#     size_t kc,                         x2 / x0
16#     size_t ks,                         x3 / x9
17#     const float**restrict a,           x4
18#     const void*restrict w,             x5
19#     uint8_t*restrict c,                x6
20#     size_t cm_stride,                  x7
21#     size_t cn_stride,                  [sp] -> x10
22#     size_t a_offset,                   [sp + 8] -> x11
23#     const float* zero,                 [sp + 16] -> x12
24#     const xnn_f32_minmax_params params [sp + 24] -> (x8)
25
26# 5x8 strips the following out of 6x8
27# x23 a5
28#  x7 c5  x13 unused
29# A5  v10 v11
30# C   v30 v31
31
32# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
33
34# A pointers
35# x14 a0
36# x15 a1
37# x20 a2
38# x21 a3
39#  x8 a4
40
41# C pointers
42#  x6 c0
43# x16 c1
44# x17 c2
45# x13 c3
46#  x7 c4
47
48# Vector register usage
49# A0   v0  v1
50# A1   v2  v3
51# A2   v4  v5
52# A3   v6  v7
53# A4   v8  v9
54# B   v12 v13 v14 v15
55# B   v16 v17 v18 v19
56# C   v20 v21
57# C   v22 v23
58# C   v24 v25
59# C   v26 v27
60# C   v28 v29
61# Clamp v30 v31
62
63BEGIN_FUNCTION xnn_f32_igemm_minmax_ukernel_5x8__aarch64_neonfma_cortex_a75
64        # Leaf routine (no calls).  A 64-byte frame holds d8-d9, d12-d15 and x20-x21.
65        # Clamp C row pointers against mr / Save d8-d9, d12-d15 on stack (v10-v11 are not used)
66        STP     d8,  d9, [sp, -64]!
67        CMP     x0, 2                   // if mr < 2
68        ADD     x16, x6, x7             // c1 = c0 + cm_stride
69        CSEL    x16, x6, x16, LO        //   c1 = c0
70
71        STP     d12, d13, [sp, 16]
72        ADD     x17, x16, x7            // c2 = c1 + cm_stride
73                                        // if mr <= 2 (flags still from CMP x0, 2)
74        CSEL    x17, x16, x17, LS       //   c2 = c1
75
76        STP     d14, d15, [sp, 32]
77        CMP     x0, 4                   // if mr < 4
78        ADD     x13, x17, x7            // c3 = c2 + cm_stride
79        CSEL    x13, x17, x13, LO       //   c3 = c2
80
81        # Load zero, params pointer (stack args sit 64 bytes higher after the push above)
82        LDP     x12, x8, [sp, 80]
83        ADD     x7, x13, x7             // c4 = c3 + cm_stride
84                                        // if mr <= 4 (flags still from CMP x0, 4)
85        CSEL    x7, x13, x7, LS         //   c4 = c3
86
87        # Save x20,x21 on stack
88        STP     x20, x21, [sp, 48]
89
90        # Load clamp values: two floats at [x8], each broadcast to 4 lanes (v30 -> FMAX operand, v31 -> FMIN operand)
91        LD2R    {v30.4s, v31.4s}, [x8]
92
93        # Load cn_stride, a_offset (at [sp + 64] and [sp + 72] after the push)
94        LDP     x10, x11, [sp, 64]
95
960:
97        # nc loop head: load initial bias (8 floats) from w; all 5 rows of accumulators start from the same bias
98        LDP     q20, q21, [x5], 32
99        MOV     v22.16b, v20.16b
100        MOV     v23.16b, v21.16b
101        MOV     v24.16b, v20.16b
102        MOV     v25.16b, v21.16b
103        MOV     v26.16b, v20.16b
104        MOV     v27.16b, v21.16b
105        MOV     v28.16b, v20.16b
106        MOV     v29.16b, v21.16b
107
108        MOV     x9, x3                  // p = ks
109
1101:
111        # ks loop head: load next 5 A pointers from a (x4 advances by 40 bytes)
112        LDP     x14, x15, [x4], 16
113        LDP     x20, x21, [x4], 16
114        LDR     x8, [x4], 8
115
116        CMP     x14, x12                // if a0 == zero
117        ADD     x14, x14, x11           // a0 += a_offset
118        CSEL    x14, x12, x14, EQ       //   a0 = zero, else a0 + a_offset
119        CMP     x15, x12                // if a1 == zero
120        ADD     x15, x15, x11           // a1 += a_offset
121        CSEL    x15, x12, x15, EQ       //   a1 = zero, else a1 + a_offset
122        CMP     x20, x12                // if a2 == zero
123        ADD     x20, x20, x11           // a2 += a_offset
124        CSEL    x20, x12, x20, EQ       //   a2 = zero, else a2 + a_offset
125        CMP     x21, x12                // if a3 == zero
126        ADD     x21, x21, x11           // a3 += a_offset
127        CSEL    x21, x12, x21, EQ       //   a3 = zero, else a3 + a_offset
128        CMP     x8, x12                 // if a4 == zero
129        ADD     x8, x8, x11             // a4 += a_offset
130        CSEL    x8, x12, x8, EQ         //   a4 = zero, else a4 + a_offset
131
132        # Are there at least 8 floats (32 bytes) for prologue + epilogue?
133        SUBS    x0, x2, 32              // k = kc - 32
134        B.LO    5f                      // kc < 32 bytes: remainder code only
135
136        # Prologue - loads for main loop of 80 FMA
137        LDR     q0, [x14], 16
138        LDR     q2, [x15], 16
139        LDR     q4, [x20], 16
140        LDR     q6, [x21], 16
141        LDR     q8, [x8], 16
142        LDP     q12, q13, [x5], 32      // Fetch 3 B (4th deferred)
143        LDP     q14, q15, [x5], 32
144        LDP     q16, q17, [x5], 32
145
146        # Are there at least 8 more floats (32 bytes) for the main loop, beyond the 8 consumed by the epilogue?
147        SUBS    x0, x0, 32
148        B.LO    3f
149
150        # Main loop - 8 floats of A (32 bytes)
151        # 80 FMA + 10 LDR A + 8 LDP B
1522:
153        # First group: 4 floats of each A row (v0/v2/v4/v6/v8).  40 FMA.
154        FMLA    v20.4s, v12.4s,  v0.s[0]
155        LDP     q18, q19, [x5], 32        // Load last (4th) B pair, deferred by the preceding loads
156        FMLA    v22.4s, v12.4s,  v2.s[0]
157        FMLA    v24.4s, v12.4s,  v4.s[0]
158        FMLA    v26.4s, v12.4s,  v6.s[0]
159        FMLA    v28.4s, v12.4s,  v8.s[0]
160        FMLA    v21.4s, v13.4s,  v0.s[0]
161        FMLA    v23.4s, v13.4s,  v2.s[0]
162        FMLA    v25.4s, v13.4s,  v4.s[0]
163        FMLA    v27.4s, v13.4s,  v6.s[0]
164        FMLA    v29.4s, v13.4s,  v8.s[0]
165        LDR     q1, [x14], 16            // Load next 5 A (second group: v1/v3/v5/v7/v9)
166
167        FMLA    v20.4s, v14.4s,  v0.s[1]
168        FMLA    v22.4s, v14.4s,  v2.s[1]
169        FMLA    v24.4s, v14.4s,  v4.s[1]
170        LDR     q3, [x15], 16
171        FMLA    v26.4s, v14.4s,  v6.s[1]
172        FMLA    v28.4s, v14.4s,  v8.s[1]
173        FMLA    v21.4s, v15.4s,  v0.s[1]
174        LDR     q5, [x20], 16
175        FMLA    v23.4s, v15.4s,  v2.s[1]
176        FMLA    v25.4s, v15.4s,  v4.s[1]
177        FMLA    v27.4s, v15.4s,  v6.s[1]
178        LDR     q7, [x21], 16
179        FMLA    v29.4s, v15.4s,  v8.s[1]
180
181        FMLA    v20.4s, v16.4s,  v0.s[2]
182        FMLA    v22.4s, v16.4s,  v2.s[2]
183        LDR     q9, [x8], 16
184        FMLA    v24.4s, v16.4s,  v4.s[2]
185        FMLA    v26.4s, v16.4s,  v6.s[2]
186        FMLA    v28.4s, v16.4s,  v8.s[2]
187        LDP     q12, q13, [x5], 32        // Load 4 B pairs for the second group (q12-q19)
188        FMLA    v21.4s, v17.4s,  v0.s[2]
189        FMLA    v23.4s, v17.4s,  v2.s[2]
190        FMLA    v25.4s, v17.4s,  v4.s[2]
191        FMLA    v27.4s, v17.4s,  v6.s[2]
192        FMLA    v29.4s, v17.4s,  v8.s[2]
193
194        FMLA    v20.4s, v18.4s,  v0.s[3]
195        FMLA    v22.4s, v18.4s,  v2.s[3]
196        FMLA    v24.4s, v18.4s,  v4.s[3]
197        FMLA    v26.4s, v18.4s,  v6.s[3]
198        LDP     q14, q15, [x5], 32
199        FMLA    v28.4s, v18.4s,  v8.s[3]
200        FMLA    v21.4s, v19.4s,  v0.s[3]
201        FMLA    v23.4s, v19.4s,  v2.s[3]
202        LDP     q16, q17, [x5], 32
203        FMLA    v25.4s, v19.4s,  v4.s[3]
204        FMLA    v27.4s, v19.4s,  v6.s[3]
205        FMLA    v29.4s, v19.4s,  v8.s[3]
206        LDP     q18, q19, [x5], 32
207
208        # Second group: next 4 floats of each A row (v1/v3/v5/v7/v9).  40 FMA.
209        FMLA    v20.4s, v12.4s,  v1.s[0]
210        FMLA    v22.4s, v12.4s,  v3.s[0]
211        FMLA    v24.4s, v12.4s,  v5.s[0]
212        LDR     q0, [x14], 16            // Preload next 5 A for the following iteration / epilogue
213        FMLA    v26.4s, v12.4s,  v7.s[0]
214        FMLA    v28.4s, v12.4s,  v9.s[0]
215        FMLA    v21.4s, v13.4s,  v1.s[0]
216        LDR     q2, [x15], 16
217        FMLA    v23.4s, v13.4s,  v3.s[0]
218        FMLA    v25.4s, v13.4s,  v5.s[0]
219        FMLA    v27.4s, v13.4s,  v7.s[0]
220        LDR     q4, [x20], 16
221        FMLA    v29.4s, v13.4s,  v9.s[0]
222
223        FMLA    v20.4s, v14.4s,  v1.s[1]
224        FMLA    v22.4s, v14.4s,  v3.s[1]
225        LDR     q6, [x21], 16
226        FMLA    v24.4s, v14.4s,  v5.s[1]
227        FMLA    v26.4s, v14.4s,  v7.s[1]
228        FMLA    v28.4s, v14.4s,  v9.s[1]
229        LDR     q8, [x8], 16
230        FMLA    v21.4s, v15.4s,  v1.s[1]
231        FMLA    v23.4s, v15.4s,  v3.s[1]
232        FMLA    v25.4s, v15.4s,  v5.s[1]
233        LDP     q12, q13, [x5], 32        // Load next 3 B (not last)
234        FMLA    v27.4s, v15.4s,  v7.s[1]
235        FMLA    v29.4s, v15.4s,  v9.s[1]
236
237        FMLA    v20.4s, v16.4s,  v1.s[2]
238        FMLA    v22.4s, v16.4s,  v3.s[2]
239        FMLA    v24.4s, v16.4s,  v5.s[2]
240        FMLA    v26.4s, v16.4s,  v7.s[2]
241        FMLA    v28.4s, v16.4s,  v9.s[2]
242        FMLA    v21.4s, v17.4s,  v1.s[2]
243        FMLA    v23.4s, v17.4s,  v3.s[2]
244        LDP     q14, q15, [x5], 32
245        FMLA    v25.4s, v17.4s,  v5.s[2]
246        FMLA    v27.4s, v17.4s,  v7.s[2]
247        FMLA    v29.4s, v17.4s,  v9.s[2]
248        LDP     q16,  q17, [x5], 32
249
250        FMLA    v20.4s, v18.4s,  v1.s[3]
251        FMLA    v22.4s, v18.4s,  v3.s[3]
252        SUBS    x0, x0, 32              // k -= 32; flags are consumed by B.HS at the end of the loop
253        FMLA    v24.4s, v18.4s,  v5.s[3]
254        FMLA    v26.4s, v18.4s,  v7.s[3]
255        FMLA    v28.4s, v18.4s,  v9.s[3]
256        FMLA    v21.4s, v19.4s,  v1.s[3]
257        FMLA    v23.4s, v19.4s,  v3.s[3]
258        FMLA    v25.4s, v19.4s,  v5.s[3]
259        FMLA    v27.4s, v19.4s,  v7.s[3]
260        FMLA    v29.4s, v19.4s,  v9.s[3]
261        B.HS    2b
262
263        # Epilogue - 8 floats of A (32 bytes)
264        # 80 FMA + 5 LDR A + 5 LDP B
265        # First block same as main loop.  Second block has no preloads.
2663:
267        # First group: 4 floats of each A row (v0/v2/v4/v6/v8).  40 FMA.
268        FMLA    v20.4s, v12.4s,  v0.s[0]
269        LDP     q18, q19, [x5], 32        // Load last (4th) B pair, deferred by the preceding loads
270        FMLA    v22.4s, v12.4s,  v2.s[0]
271        FMLA    v24.4s, v12.4s,  v4.s[0]
272        FMLA    v26.4s, v12.4s,  v6.s[0]
273        FMLA    v28.4s, v12.4s,  v8.s[0]
274        FMLA    v21.4s, v13.4s,  v0.s[0]
275        FMLA    v23.4s, v13.4s,  v2.s[0]
276        FMLA    v25.4s, v13.4s,  v4.s[0]
277        FMLA    v27.4s, v13.4s,  v6.s[0]
278        FMLA    v29.4s, v13.4s,  v8.s[0]
279        LDR     q1, [x14], 16            // Load next 5 A (second group: v1/v3/v5/v7/v9)
280
281        FMLA    v20.4s, v14.4s,  v0.s[1]
282        FMLA    v22.4s, v14.4s,  v2.s[1]
283        FMLA    v24.4s, v14.4s,  v4.s[1]
284        LDR     q3, [x15], 16
285        FMLA    v26.4s, v14.4s,  v6.s[1]
286        FMLA    v28.4s, v14.4s,  v8.s[1]
287        FMLA    v21.4s, v15.4s,  v0.s[1]
288        LDR     q5, [x20], 16
289        FMLA    v23.4s, v15.4s,  v2.s[1]
290        FMLA    v25.4s, v15.4s,  v4.s[1]
291        FMLA    v27.4s, v15.4s,  v6.s[1]
292        LDR     q7, [x21], 16
293        FMLA    v29.4s, v15.4s,  v8.s[1]
294
295        FMLA    v20.4s, v16.4s,  v0.s[2]
296        FMLA    v22.4s, v16.4s,  v2.s[2]
297        LDR     q9, [x8], 16
298        FMLA    v24.4s, v16.4s,  v4.s[2]
299        FMLA    v26.4s, v16.4s,  v6.s[2]
300        FMLA    v28.4s, v16.4s,  v8.s[2]
301        LDP     q12, q13, [x5], 32        // Load 4 B pairs for the second group (q12-q19)
302        FMLA    v21.4s, v17.4s,  v0.s[2]
303        FMLA    v23.4s, v17.4s,  v2.s[2]
304        FMLA    v25.4s, v17.4s,  v4.s[2]
305        FMLA    v27.4s, v17.4s,  v6.s[2]
306        FMLA    v29.4s, v17.4s,  v8.s[2]
307
308        FMLA    v20.4s, v18.4s,  v0.s[3]
309        FMLA    v22.4s, v18.4s,  v2.s[3]
310        FMLA    v24.4s, v18.4s,  v4.s[3]
311        FMLA    v26.4s, v18.4s,  v6.s[3]
312        LDP     q14, q15, [x5], 32
313        FMLA    v28.4s, v18.4s,  v8.s[3]
314        FMLA    v21.4s, v19.4s,  v0.s[3]
315        FMLA    v23.4s, v19.4s,  v2.s[3]
316        LDP     q16, q17, [x5], 32
317        FMLA    v25.4s, v19.4s,  v4.s[3]
318        FMLA    v27.4s, v19.4s,  v6.s[3]
319        FMLA    v29.4s, v19.4s,  v8.s[3]
320        LDP     q18, q19, [x5], 32
321
322        # Second group: next 4 floats of each A row (v1/v3/v5/v7/v9).  40 FMA, no preloads.
323        FMLA    v20.4s, v12.4s,  v1.s[0]
324        FMLA    v22.4s, v12.4s,  v3.s[0]
325        FMLA    v24.4s, v12.4s,  v5.s[0]
326        FMLA    v26.4s, v12.4s,  v7.s[0]
327        FMLA    v28.4s, v12.4s,  v9.s[0]
328        FMLA    v21.4s, v13.4s,  v1.s[0]
329        FMLA    v23.4s, v13.4s,  v3.s[0]
330        FMLA    v25.4s, v13.4s,  v5.s[0]
331        FMLA    v27.4s, v13.4s,  v7.s[0]
332        FMLA    v29.4s, v13.4s,  v9.s[0]
333
334        FMLA    v20.4s, v14.4s,  v1.s[1]
335        FMLA    v22.4s, v14.4s,  v3.s[1]
336        FMLA    v24.4s, v14.4s,  v5.s[1]
337        FMLA    v26.4s, v14.4s,  v7.s[1]
338        FMLA    v28.4s, v14.4s,  v9.s[1]
339        FMLA    v21.4s, v15.4s,  v1.s[1]
340        FMLA    v23.4s, v15.4s,  v3.s[1]
341        FMLA    v25.4s, v15.4s,  v5.s[1]
342        FMLA    v27.4s, v15.4s,  v7.s[1]
343        FMLA    v29.4s, v15.4s,  v9.s[1]
344
345        FMLA    v20.4s, v16.4s,  v1.s[2]
346        FMLA    v22.4s, v16.4s,  v3.s[2]
347        FMLA    v24.4s, v16.4s,  v5.s[2]
348        FMLA    v26.4s, v16.4s,  v7.s[2]
349        FMLA    v28.4s, v16.4s,  v9.s[2]
350        FMLA    v21.4s, v17.4s,  v1.s[2]
351        FMLA    v23.4s, v17.4s,  v3.s[2]
352        FMLA    v25.4s, v17.4s,  v5.s[2]
353        FMLA    v27.4s, v17.4s,  v7.s[2]
354        FMLA    v29.4s, v17.4s,  v9.s[2]
355
356        FMLA    v20.4s, v18.4s,  v1.s[3]
357        FMLA    v22.4s, v18.4s,  v3.s[3]
358        FMLA    v24.4s, v18.4s,  v5.s[3]
359        FMLA    v26.4s, v18.4s,  v7.s[3]
360        FMLA    v28.4s, v18.4s,  v9.s[3]
361        FMLA    v21.4s, v19.4s,  v1.s[3]
362        FMLA    v23.4s, v19.4s,  v3.s[3]
363        FMLA    v25.4s, v19.4s,  v5.s[3]
364        FMLA    v27.4s, v19.4s,  v7.s[3]
365        FMLA    v29.4s, v19.4s,  v9.s[3]
366        # Is there a remainder? - up to 7 floats of A (28 bytes): any of the low 5 bits of k set
367        TST     x0, 31
368        B.NE    5f
369
3704:
371        # ks loop
372        SUBS    x9, x9, 40              // ks -= MR * sizeof(void*)
373        B.HI    1b
374
375        # Clamp: max against v30, then min against v31
376        FMAX    v20.4s, v20.4s, v30.4s
377        FMAX    v21.4s, v21.4s, v30.4s
378        FMAX    v22.4s, v22.4s, v30.4s
379        FMAX    v23.4s, v23.4s, v30.4s
380        FMAX    v24.4s, v24.4s, v30.4s
381        FMAX    v25.4s, v25.4s, v30.4s
382        FMAX    v26.4s, v26.4s, v30.4s
383        FMAX    v27.4s, v27.4s, v30.4s
384        FMAX    v28.4s, v28.4s, v30.4s
385        FMAX    v29.4s, v29.4s, v30.4s
386        FMIN    v20.4s, v20.4s, v31.4s
387        FMIN    v21.4s, v21.4s, v31.4s
388        FMIN    v22.4s, v22.4s, v31.4s
389        FMIN    v23.4s, v23.4s, v31.4s
390        FMIN    v24.4s, v24.4s, v31.4s
391        FMIN    v25.4s, v25.4s, v31.4s
392        FMIN    v26.4s, v26.4s, v31.4s
393        FMIN    v27.4s, v27.4s, v31.4s
394        FMIN    v28.4s, v28.4s, v31.4s
395        FMIN    v29.4s, v29.4s, v31.4s
396
397        # Store full 5 x 8
398        SUBS    x1, x1, 8               // nc -= 8; these flags also feed B.HI of the nc loop below
399        B.LO    8f                      // fewer than 8 columns left: partial-width store
400
401        STP     q28, q29, [x7]
402        ADD     x7, x7, x10
403        STP     q26, q27, [x13]
404        ADD     x13, x13, x10
405        STP     q24, q25, [x17]
406        ADD     x17, x17, x10
407        STP     q22, q23, [x16]
408        ADD     x16, x16, x10
409        STP     q20, q21,  [x6]
410        ADD     x6,  x6, x10
411
412        SUB     x4, x4, x3              // a -= ks (rewind a for the next group of 8 columns)
413
414        # nc loop (flags still from SUBS x1 above; the STP/ADD/SUB in between do not set flags)
415        B.HI    0b
416
417        # Restore x20,x21 from stack
418        LDP     x20, x21, [sp, 48]
419
420        # Restore d8-d9, d12-d15 from stack
421        LDP     d14, d15, [sp, 32]
422        LDP     d12, d13, [sp, 16]
423        LDP     d8,  d9, [sp], 64
424        RET
425
4265:
427        # Is there a remainder?- 4 floats of A (16 bytes): bit 4 of k
428        TBZ     x0, 4, 6f
429
430        # Remainder- 4 floats of A (16 bytes)
431        # Load A
432        LDR     q0, [x14], 16
433        LDR     q2, [x15], 16
434        LDR     q4, [x20], 16
435        LDR     q6, [x21], 16
436        LDR     q8, [x8], 16
437        # Load B
438        LDP     q12, q13, [x5], 32
439        LDP     q14, q15, [x5], 32
440        LDP     q16, q17, [x5], 32
441        LDP     q18, q19, [x5], 32
442
443        FMLA    v20.4s, v12.4s,  v0.s[0]
444        FMLA    v22.4s, v12.4s,  v2.s[0]
445        FMLA    v24.4s, v12.4s,  v4.s[0]
446        FMLA    v26.4s, v12.4s,  v6.s[0]
447        FMLA    v28.4s, v12.4s,  v8.s[0]
448        FMLA    v21.4s, v13.4s,  v0.s[0]
449        FMLA    v23.4s, v13.4s,  v2.s[0]
450        FMLA    v25.4s, v13.4s,  v4.s[0]
451        FMLA    v27.4s, v13.4s,  v6.s[0]
452        FMLA    v29.4s, v13.4s,  v8.s[0]
453
454        FMLA    v20.4s, v14.4s,  v0.s[1]
455        FMLA    v22.4s, v14.4s,  v2.s[1]
456        FMLA    v24.4s, v14.4s,  v4.s[1]
457        FMLA    v26.4s, v14.4s,  v6.s[1]
458        FMLA    v28.4s, v14.4s,  v8.s[1]
459        FMLA    v21.4s, v15.4s,  v0.s[1]
460        FMLA    v23.4s, v15.4s,  v2.s[1]
461        FMLA    v25.4s, v15.4s,  v4.s[1]
462        FMLA    v27.4s, v15.4s,  v6.s[1]
463        FMLA    v29.4s, v15.4s,  v8.s[1]
464
465        FMLA    v20.4s, v16.4s,  v0.s[2]
466        FMLA    v22.4s, v16.4s,  v2.s[2]
467        FMLA    v24.4s, v16.4s,  v4.s[2]
468        FMLA    v26.4s, v16.4s,  v6.s[2]
469        FMLA    v28.4s, v16.4s,  v8.s[2]
470        FMLA    v21.4s, v17.4s,  v0.s[2]
471        FMLA    v23.4s, v17.4s,  v2.s[2]
472        FMLA    v25.4s, v17.4s,  v4.s[2]
473        FMLA    v27.4s, v17.4s,  v6.s[2]
474        FMLA    v29.4s, v17.4s,  v8.s[2]
475
476        FMLA    v20.4s, v18.4s,  v0.s[3]
477        FMLA    v22.4s, v18.4s,  v2.s[3]
478        FMLA    v24.4s, v18.4s,  v4.s[3]
479        FMLA    v26.4s, v18.4s,  v6.s[3]
480        FMLA    v28.4s, v18.4s,  v8.s[3]
481        FMLA    v21.4s, v19.4s,  v0.s[3]
482        FMLA    v23.4s, v19.4s,  v2.s[3]
483        FMLA    v25.4s, v19.4s,  v4.s[3]
484        FMLA    v27.4s, v19.4s,  v6.s[3]
485        FMLA    v29.4s, v19.4s,  v8.s[3]
486
487        # Is there a remainder?- 2 floats of A (8 bytes): bit 3 of k
4886:
489        TBZ     x0, 3, 7f
490
491        # Remainder- 2 floats of A (8 bytes)
492        # Load A
493        LDR     d0, [x14], 8
494        LDR     d2, [x15], 8
495        LDR     d4, [x20], 8
496        LDR     d6, [x21], 8
497        LDR     d8, [x8], 8
498        # Load B
499        LDP     q12, q13, [x5], 32
500        LDP     q14, q15, [x5], 32
501
502        FMLA    v20.4s, v12.4s,  v0.s[0]
503        FMLA    v22.4s, v12.4s,  v2.s[0]
504        FMLA    v24.4s, v12.4s,  v4.s[0]
505        FMLA    v26.4s, v12.4s,  v6.s[0]
506        FMLA    v28.4s, v12.4s,  v8.s[0]
507        FMLA    v21.4s, v13.4s,  v0.s[0]
508        FMLA    v23.4s, v13.4s,  v2.s[0]
509        FMLA    v25.4s, v13.4s,  v4.s[0]
510        FMLA    v27.4s, v13.4s,  v6.s[0]
511        FMLA    v29.4s, v13.4s,  v8.s[0]
512
513        FMLA    v20.4s, v14.4s,  v0.s[1]
514        FMLA    v22.4s, v14.4s,  v2.s[1]
515        FMLA    v24.4s, v14.4s,  v4.s[1]
516        FMLA    v26.4s, v14.4s,  v6.s[1]
517        FMLA    v28.4s, v14.4s,  v8.s[1]
518        FMLA    v21.4s, v15.4s,  v0.s[1]
519        FMLA    v23.4s, v15.4s,  v2.s[1]
520        FMLA    v25.4s, v15.4s,  v4.s[1]
521        FMLA    v27.4s, v15.4s,  v6.s[1]
522        FMLA    v29.4s, v15.4s,  v8.s[1]
523
524        # Is there a remainder?- 1 float of A (4 bytes): bit 2 of k
5257:
526        TBZ     x0, 2, 4b
527
528        # Remainder- 1 float of A (4 bytes)
529        # Load A
530        LDR     s0, [x14], 4
531        LDR     s2, [x15], 4
532        LDR     s4, [x20], 4
533        LDR     s6, [x21], 4
534        LDR     s8, [x8], 4
535        # Load B
536        LDP     q12, q13, [x5], 32
537
538        FMLA    v20.4s, v12.4s,  v0.s[0]
539        FMLA    v22.4s, v12.4s,  v2.s[0]
540        FMLA    v24.4s, v12.4s,  v4.s[0]
541        FMLA    v26.4s, v12.4s,  v6.s[0]
542        FMLA    v28.4s, v12.4s,  v8.s[0]
543        FMLA    v21.4s, v13.4s,  v0.s[0]
544        FMLA    v23.4s, v13.4s,  v2.s[0]
545        FMLA    v25.4s, v13.4s,  v4.s[0]
546        FMLA    v27.4s, v13.4s,  v6.s[0]
547        FMLA    v29.4s, v13.4s,  v8.s[0]
548        B       4b
549
550        # Store odd width: bits 2, 1, 0 of x1 select 4, 2 and 1 remaining column(s); stored values are shifted down after each step
5518:
552        TBZ     x1, 2, 9f
553        STR     q28, [x7], 16
554        MOV     v28.16b, v29.16b
555        STR     q26, [x13], 16
556        MOV     v26.16b, v27.16b
557        STR     q24, [x17], 16
558        MOV     v24.16b, v25.16b
559        STR     q22, [x16], 16
560        MOV     v22.16b, v23.16b
561        STR     q20,  [x6], 16
562        MOV     v20.16b, v21.16b
5639:
564        TBZ     x1, 1, 10f
565        STR     d28, [x7], 8
566        STR     d26, [x13], 8
567        DUP     d28, v28.d[1]
568        DUP     d26, v26.d[1]
569        STR     d24, [x17], 8
570        STR     d22, [x16], 8
571        DUP     d24, v24.d[1]
572        DUP     d22, v22.d[1]
573        STR     d20,  [x6], 8
574        DUP     d20, v20.d[1]
575
57610:
577        TBZ     x1, 0, 11f
578        STR     s28, [x7]
579        STR     s26, [x13]
580        STR     s24, [x17]
581        STR     s22, [x16]
582        STR     s20,  [x6]
58311:
584        # Restore x20,x21 from stack
585        LDP     x20, x21, [sp, 48]
586
587        # Restore d8-d9, d12-d15 from stack
588        LDP     d14, d15, [sp, 32]
589        LDP     d12, d13, [sp, 16]
590        LDP     d8,  d9, [sp], 64
591        RET
592
593END_FUNCTION xnn_f32_igemm_minmax_ukernel_5x8__aarch64_neonfma_cortex_a75
594
595#ifdef __ELF__
596.section ".note.GNU-stack","",%progbits  // empty note section, emitted on ELF targets only; by GNU toolchain convention it marks the object as not needing an executable stack
597#endif
598