• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1// Auto-generated file. Do not edit!
2//   Template: src/f32-gemm/5x8-aarch64-neonfma-cortex-a75.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2019 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10#include <xnnpack/assembly.h>
11
12# void xnn_f32_gemm_minmax_ukernel_5x8__aarch64_neonfma_prfm_cortex_a75(
13#     size_t mr,                x0
14#     size_t nc,                x1
15#     size_t kc,                x2 / x0
16#     const uint8_t*restrict a, x3
17#     size_t a_stride,          x4
18#     const void*restrict w,    x5
19#     uint8_t*restrict c,       x6
20#     size_t cm_stride,         x7
21#     size_t cn_stride,         [sp] -> x14
22#     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> (x8)
23
24# unused compared to 6x8 (this kernel has no sixth row)
25#  x4 a5
26#  x7 c5
27# A5  v10 v11
28# C   v30 v31
29
30# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
31
32# A pointers
33#  x3 a0
34#  x9 a1
35# x10 a2
36# x11 a3
37# x12 a4
38
39# C pointers
40#  x6 c0
41# x16 c1
42# x17 c2
43# x13 c3
44#  x7 c4
45
46# Vector register usage
47# A0   v0  v1
48# A1   v2  v3
49# A2   v4  v5
50# A3   v6  v7
51# A4   v8  v9
52# B   v12 v13 v14 v15
53# B   v16 v17 v18 v19
54# C   v20 v21
55# C   v22 v23
56# C   v24 v25
57# C   v26 v27
58# C   v28 v29
59# Clamp v30 v31
60
61BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_5x8__aarch64_neonfma_prfm_cortex_a75
        # Computes up to 5 rows x 8 columns of C per pass of the outer loop (label 0):
        # the accumulators v20-v29 start from 8 floats read from w, products of A and w
        # are accumulated over kc bytes of each A row, and the results are clamped with
        # the two values read from params before being stored through c0-c4.
        # Rows at or beyond mr reuse the pointers of the previous row (see CSEL below).
62
63        # Load cn_stride, params pointer
64        LDP     x14, x8, [sp]
65
66        # Clamp A and C pointers / Save d8-d9, d12-d15 on stack (48 bytes; d10-d11 are never written)
67        STP     d8,  d9, [sp, -48]!
68        CMP     x0, 2                   // if mr < 2
69        ADD     x9, x3, x4              // a1 = a0 + a_stride
70        ADD     x16, x6, x7             // c1 = c0 + cm_stride
71        CSEL    x9, x3, x9, LO          //   a1 = a0
72        CSEL    x16, x6, x16, LO        //   c1 = c0
73
74        STP     d12, d13, [sp, 16]
75        ADD     x10, x9, x4             // a2 = a1 + a_stride
76        ADD     x17, x16, x7            // c2 = c1 + cm_stride
77                                        // if mr <= 2 (flags still from CMP x0, 2)
78        CSEL    x10, x9, x10, LS        //   a2 = a1
79        CSEL    x17, x16, x17, LS       //   c2 = c1
80
81        STP     d14, d15, [sp, 32]
82        CMP     x0, 4                   // if mr < 4
83        ADD     x11, x10, x4            // a3 = a2 + a_stride
84        ADD     x13, x17, x7            // c3 = c2 + cm_stride
85        CSEL    x11, x10, x11, LO       //   a3 = a2
86        CSEL    x13, x17, x13, LO       //   c3 = c2
87
88        ADD     x12, x11, x4            // a4 = a3 + a_stride
89        ADD     x7, x13, x7             // c4 = c3 + cm_stride
90                                        // if mr <= 4 (flags still from CMP x0, 4)
91        CSEL    x12, x11, x12, LS       //   a4 = a3
92        CSEL    x7, x13, x7, LS         //   c4 = c3
93
94        # Load clamp values: two consecutive floats at x8, each replicated to all 4 lanes
        # (v30 is the FMAX operand, v31 is the FMIN operand in the clamp at label 3)
95        LD2R    {v30.4s, v31.4s}, [x8]
96
        # Outer loop: one pass per 8 output columns (nc is decremented by 8 at label 3)
970:
98        # Load initial bias from w into accumulators
99        LDP     q20, q21, [x5], 32
100        MOV     v22.16b, v20.16b
101        PRFM    PLDL1KEEP, [x5, 0]      // Prefetch B
102        MOV     v23.16b, v21.16b
103        PRFM    PLDL1KEEP, [x5, 64]
104        MOV     v24.16b, v20.16b
105        PRFM    PLDL1KEEP, [x5, 128]
106        MOV     v25.16b, v21.16b
107        PRFM    PLDL1KEEP, [x5, 192]
108        MOV     v26.16b, v20.16b
109        PRFM    PLDL1KEEP,  [x3]        // Prefetch A
110        MOV     v27.16b, v21.16b
111        PRFM    PLDL1KEEP,  [x9]
112        MOV     v28.16b, v20.16b
113        PRFM    PLDL1KEEP, [x10]
114        MOV     v29.16b, v21.16b
115        PRFM    PLDL1KEEP, [x11]
116        PRFM    PLDL1KEEP, [x12]
117
118        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
119        SUBS    x0, x2, 32              // k = kc - 32
120        B.LO    4f
121
122        # Prologue - loads for main loop of 80 FMA
123        LDR     q0,  [x3], 16
124        LDR     q2,  [x9], 16
125        LDR     q4, [x10], 16
126        LDR     q6, [x11], 16
127        LDR     q8, [x12], 16
128        LDP     q12,  q13, [x5], 32     // Fetch 3 B (4th deferred)
129        LDP     q14,  q15, [x5], 32
130        LDP     q16,  q17, [x5], 32
131
132        # Is there at least 8 floats (32 bytes) for main loop?
133        SUBS    x0, x0, 32
134        B.LO    2f
135
136        # Main loop - 8 floats of A (32 bytes)
137        # 80 FMA + 10 LDR A + 8 LDP B
1381:
139        # First group of 4 A.  40 FMA.
140        FMLA    v20.4s, v12.4s,  v0.s[0]
141        LDP     q18,  q19, [x5], 32      // Load last B
142        FMLA    v22.4s, v12.4s,  v2.s[0]
143        FMLA    v24.4s, v12.4s,  v4.s[0]
144        FMLA    v26.4s, v12.4s,  v6.s[0]
145        PRFM    PLDL1KEEP, [x5, 128]      // Prefetch B
146        FMLA    v28.4s, v12.4s,  v8.s[0]
147        FMLA    v21.4s, v13.4s,  v0.s[0]
148        FMLA    v23.4s, v13.4s,  v2.s[0]
149        PRFM    PLDL1KEEP, [x5, 256]
150        FMLA    v25.4s, v13.4s,  v4.s[0]
151        FMLA    v27.4s, v13.4s,  v6.s[0]
152        FMLA    v29.4s, v13.4s,  v8.s[0]
153        LDR     q1,  [x3], 16            // Load next 5 A
154
155        FMLA    v20.4s, v14.4s,  v0.s[1]
156        FMLA    v22.4s, v14.4s,  v2.s[1]
157        FMLA    v24.4s, v14.4s,  v4.s[1]
158        LDR     q3,  [x9], 16
159        FMLA    v26.4s, v14.4s,  v6.s[1]
160        FMLA    v28.4s, v14.4s,  v8.s[1]
161        FMLA    v21.4s, v15.4s,  v0.s[1]
162        LDR     q5, [x10], 16
163        FMLA    v23.4s, v15.4s,  v2.s[1]
164        FMLA    v25.4s, v15.4s,  v4.s[1]
165        FMLA    v27.4s, v15.4s,  v6.s[1]
166        LDR     q7, [x11], 16
167        FMLA    v29.4s, v15.4s,  v8.s[1]
168
169        FMLA    v20.4s, v16.4s,  v0.s[2]
170        FMLA    v22.4s, v16.4s,  v2.s[2]
171        LDR     q9, [x12], 16
172        FMLA    v24.4s, v16.4s,  v4.s[2]
173        FMLA    v26.4s, v16.4s,  v6.s[2]
174        FMLA    v28.4s, v16.4s,  v8.s[2]
175        LDP     q12,  q13, [x5], 32       // Load 4 B
176        FMLA    v21.4s, v17.4s,  v0.s[2]
177        FMLA    v23.4s, v17.4s,  v2.s[2]
178        FMLA    v25.4s, v17.4s,  v4.s[2]
179        LDP     q14,  q15, [x5], 32
180        FMLA    v27.4s, v17.4s,  v6.s[2]
181        FMLA    v29.4s, v17.4s,  v8.s[2]
182
183        FMLA    v20.4s, v18.4s,  v0.s[3]
184        LDP     q16,  q17, [x5], 32
185        FMLA    v22.4s, v18.4s,  v2.s[3]
186        FMLA    v24.4s, v18.4s,  v4.s[3]
187        FMLA    v26.4s, v18.4s,  v6.s[3]
188        FMLA    v28.4s, v18.4s,  v8.s[3]
189        FMLA    v21.4s, v19.4s,  v0.s[3]
190        FMLA    v23.4s, v19.4s,  v2.s[3]
191        FMLA    v25.4s, v19.4s,  v4.s[3]
192        FMLA    v27.4s, v19.4s,  v6.s[3]
193        FMLA    v29.4s, v19.4s,  v8.s[3]
194        LDP     q18,  q19, [x5], 32
195
196        # Second group of 4 A.  40 FMA.
197        FMLA    v20.4s, v12.4s,  v1.s[0]
198        FMLA    v22.4s, v12.4s,  v3.s[0]
199        FMLA    v24.4s, v12.4s,  v5.s[0]
200        LDR     q0,  [x3], 16           // Load next 5 A
201        FMLA    v26.4s, v12.4s,  v7.s[0]
202        FMLA    v28.4s, v12.4s,  v9.s[0]
203        FMLA    v21.4s, v13.4s,  v1.s[0]
204        LDR     q2,  [x9], 16
205        FMLA    v23.4s, v13.4s,  v3.s[0]
206        FMLA    v25.4s, v13.4s,  v5.s[0]
207        FMLA    v27.4s, v13.4s,  v7.s[0]
208        LDR     q4, [x10], 16
209        FMLA    v29.4s, v13.4s,  v9.s[0]
210
211        FMLA    v20.4s, v14.4s,  v1.s[1]
212        FMLA    v22.4s, v14.4s,  v3.s[1]
213        LDR     q6, [x11], 16
214        FMLA    v24.4s, v14.4s,  v5.s[1]
215        FMLA    v26.4s, v14.4s,  v7.s[1]
216        FMLA    v28.4s, v14.4s,  v9.s[1]
217        LDR     q8, [x12], 16
218        FMLA    v21.4s, v15.4s,  v1.s[1]
219        FMLA    v23.4s, v15.4s,  v3.s[1]
220        FMLA    v25.4s, v15.4s,  v5.s[1]
221        LDP     q12,  q13, [x5], 32       // Load next 3 B (not last)
222        FMLA    v27.4s, v15.4s,  v7.s[1]
223        FMLA    v29.4s, v15.4s,  v9.s[1]
224
225        FMLA    v20.4s, v16.4s,  v1.s[2]
226        LDP     q14,  q15, [x5], 32
227        FMLA    v22.4s, v16.4s,  v3.s[2]
228        FMLA    v24.4s, v16.4s,  v5.s[2]
229        FMLA    v26.4s, v16.4s,  v7.s[2]
230        FMLA    v28.4s, v16.4s,  v9.s[2]
231        FMLA    v21.4s, v17.4s,  v1.s[2]
232        FMLA    v23.4s, v17.4s,  v3.s[2]
233        FMLA    v25.4s, v17.4s,  v5.s[2]
234        FMLA    v27.4s, v17.4s,  v7.s[2]
235        FMLA    v29.4s, v17.4s,  v9.s[2]
236        LDP     q16,  q17, [x5], 32
237
238        FMLA    v20.4s, v18.4s,  v1.s[3]
239        FMLA    v22.4s, v18.4s,  v3.s[3]
240        SUBS    x0, x0, 32              // k -= 32; flags are consumed by B.HS below
241        FMLA    v24.4s, v18.4s,  v5.s[3]
242        FMLA    v26.4s, v18.4s,  v7.s[3]
243        FMLA    v28.4s, v18.4s,  v9.s[3]
244        FMLA    v21.4s, v19.4s,  v1.s[3]
245        FMLA    v23.4s, v19.4s,  v3.s[3]
246        FMLA    v25.4s, v19.4s,  v5.s[3]
247        FMLA    v27.4s, v19.4s,  v7.s[3]
248        FMLA    v29.4s, v19.4s,  v9.s[3]
249        B.HS    1b                      // repeat while the subtraction did not borrow
250
251        # Epilogue - 8 floats of A (32 bytes)
252        # 80 FMA + 5 LDR A + 5 LDP B
253        # First block same as main loop.  Second block has no preloads.
2542:
255        # First group of 4 A.  40 FMA.
256        FMLA    v20.4s, v12.4s,  v0.s[0]
257        LDP     q18,  q19, [x5], 32      // Load last B
258        FMLA    v22.4s, v12.4s,  v2.s[0]
259        FMLA    v24.4s, v12.4s,  v4.s[0]
260        FMLA    v26.4s, v12.4s,  v6.s[0]
261        PRFM    PLDL1KEEP, [x5, 128]      // Prefetch B
262        FMLA    v28.4s, v12.4s,  v8.s[0]
263        FMLA    v21.4s, v13.4s,  v0.s[0]
264        FMLA    v23.4s, v13.4s,  v2.s[0]
265        PRFM    PLDL1KEEP, [x5, 256]
266        FMLA    v25.4s, v13.4s,  v4.s[0]
267        FMLA    v27.4s, v13.4s,  v6.s[0]
268        FMLA    v29.4s, v13.4s,  v8.s[0]
269        LDR     q1,  [x3], 16            // Load next 5 A
270
271        FMLA    v20.4s, v14.4s,  v0.s[1]
272        FMLA    v22.4s, v14.4s,  v2.s[1]
273        FMLA    v24.4s, v14.4s,  v4.s[1]
274        LDR     q3,  [x9], 16
275        FMLA    v26.4s, v14.4s,  v6.s[1]
276        FMLA    v28.4s, v14.4s,  v8.s[1]
277        FMLA    v21.4s, v15.4s,  v0.s[1]
278        LDR     q5, [x10], 16
279        FMLA    v23.4s, v15.4s,  v2.s[1]
280        FMLA    v25.4s, v15.4s,  v4.s[1]
281        FMLA    v27.4s, v15.4s,  v6.s[1]
282        LDR     q7, [x11], 16
283        FMLA    v29.4s, v15.4s,  v8.s[1]
284
285        FMLA    v20.4s, v16.4s,  v0.s[2]
286        FMLA    v22.4s, v16.4s,  v2.s[2]
287        LDR     q9, [x12], 16
288        FMLA    v24.4s, v16.4s,  v4.s[2]
289        FMLA    v26.4s, v16.4s,  v6.s[2]
290        FMLA    v28.4s, v16.4s,  v8.s[2]
291        LDP     q12,  q13, [x5], 32       // Load 4 B
292        FMLA    v21.4s, v17.4s,  v0.s[2]
293        FMLA    v23.4s, v17.4s,  v2.s[2]
294        FMLA    v25.4s, v17.4s,  v4.s[2]
295        LDP     q14,  q15, [x5], 32
296        FMLA    v27.4s, v17.4s,  v6.s[2]
297        FMLA    v29.4s, v17.4s,  v8.s[2]
298
299        FMLA    v20.4s, v18.4s,  v0.s[3]
300        LDP     q16,  q17, [x5], 32
301        FMLA    v22.4s, v18.4s,  v2.s[3]
302        FMLA    v24.4s, v18.4s,  v4.s[3]
303        FMLA    v26.4s, v18.4s,  v6.s[3]
304        FMLA    v28.4s, v18.4s,  v8.s[3]
305        FMLA    v21.4s, v19.4s,  v0.s[3]
306        FMLA    v23.4s, v19.4s,  v2.s[3]
307        FMLA    v25.4s, v19.4s,  v4.s[3]
308        FMLA    v27.4s, v19.4s,  v6.s[3]
309        FMLA    v29.4s, v19.4s,  v8.s[3]
310        LDP     q18,  q19, [x5], 32
311
312        # Second group of 4 A.  40 FMA.
313        FMLA    v20.4s, v12.4s,  v1.s[0]
314        FMLA    v22.4s, v12.4s,  v3.s[0]
315        FMLA    v24.4s, v12.4s,  v5.s[0]
316        FMLA    v26.4s, v12.4s,  v7.s[0]
317        FMLA    v28.4s, v12.4s,  v9.s[0]
318        FMLA    v21.4s, v13.4s,  v1.s[0]
319        FMLA    v23.4s, v13.4s,  v3.s[0]
320        FMLA    v25.4s, v13.4s,  v5.s[0]
321        FMLA    v27.4s, v13.4s,  v7.s[0]
322        FMLA    v29.4s, v13.4s,  v9.s[0]
323
324        FMLA    v20.4s, v14.4s,  v1.s[1]
325        FMLA    v22.4s, v14.4s,  v3.s[1]
326        FMLA    v24.4s, v14.4s,  v5.s[1]
327        FMLA    v26.4s, v14.4s,  v7.s[1]
328        FMLA    v28.4s, v14.4s,  v9.s[1]
329        FMLA    v21.4s, v15.4s,  v1.s[1]
330        FMLA    v23.4s, v15.4s,  v3.s[1]
331        FMLA    v25.4s, v15.4s,  v5.s[1]
332        FMLA    v27.4s, v15.4s,  v7.s[1]
333        FMLA    v29.4s, v15.4s,  v9.s[1]
334
335        FMLA    v20.4s, v16.4s,  v1.s[2]
336        FMLA    v22.4s, v16.4s,  v3.s[2]
337        FMLA    v24.4s, v16.4s,  v5.s[2]
338        FMLA    v26.4s, v16.4s,  v7.s[2]
339        FMLA    v28.4s, v16.4s,  v9.s[2]
340        FMLA    v21.4s, v17.4s,  v1.s[2]
341        FMLA    v23.4s, v17.4s,  v3.s[2]
342        FMLA    v25.4s, v17.4s,  v5.s[2]
343        FMLA    v27.4s, v17.4s,  v7.s[2]
344        FMLA    v29.4s, v17.4s,  v9.s[2]
345        TST     x0, 31                  // any K remainder (k & 31)? flags are consumed by B.NE below
346
347        FMLA    v20.4s, v18.4s,  v1.s[3]
348        FMLA    v22.4s, v18.4s,  v3.s[3]
349        FMLA    v24.4s, v18.4s,  v5.s[3]
350        FMLA    v26.4s, v18.4s,  v7.s[3]
351        FMLA    v28.4s, v18.4s,  v9.s[3]
352        FMLA    v21.4s, v19.4s,  v1.s[3]
353        FMLA    v23.4s, v19.4s,  v3.s[3]
354        FMLA    v25.4s, v19.4s,  v5.s[3]
355        FMLA    v27.4s, v19.4s,  v7.s[3]
356        FMLA    v29.4s, v19.4s,  v9.s[3]
357        B.NE    4f                      // remainder bits set: process leftover A before clamping
358
359        # Clamp
3603:
361        FMAX    v20.4s, v20.4s, v30.4s
362        SUBS    x1, x1, 8               // nc -= 8; flags are consumed by B.LO 7f and B.HI 0b below
363        FMAX    v21.4s, v21.4s, v30.4s
364        FMAX    v22.4s, v22.4s, v30.4s
365        FMAX    v23.4s, v23.4s, v30.4s
366        FMAX    v24.4s, v24.4s, v30.4s
367        FMAX    v25.4s, v25.4s, v30.4s
368        FMAX    v26.4s, v26.4s, v30.4s
369        FMAX    v27.4s, v27.4s, v30.4s
370        FMAX    v28.4s, v28.4s, v30.4s
371        FMAX    v29.4s, v29.4s, v30.4s
372        FMIN    v20.4s, v20.4s, v31.4s
373        FMIN    v21.4s, v21.4s, v31.4s
374        FMIN    v22.4s, v22.4s, v31.4s
375        FMIN    v23.4s, v23.4s, v31.4s
376        FMIN    v24.4s, v24.4s, v31.4s
377        FMIN    v25.4s, v25.4s, v31.4s
378        FMIN    v26.4s, v26.4s, v31.4s
379        FMIN    v27.4s, v27.4s, v31.4s
380        FMIN    v28.4s, v28.4s, v31.4s
381        FMIN    v29.4s, v29.4s, v31.4s
382
383        # Store full 5 x 8
384        B.LO    7f                      // fewer than 8 columns were left: partial-width store
385
386        STP     q20, q21,  [x6]
387        ADD     x6,  x6, x14
388        SUB     x3,  x3, x2             // a0 -= kc
389        STP     q22, q23, [x16]
390        ADD     x16, x16, x14
391        SUB     x9,  x9, x2             // a1 -= kc
392        STP     q24, q25, [x17]
393        ADD     x17, x17, x14
394        SUB     x10, x10, x2            // a2 -= kc
395        STP     q26, q27, [x13]
396        ADD     x13, x13, x14
397        SUB     x11, x11, x2            // a3 -= kc
398        STP     q28, q29, [x7]
399        ADD     x7, x7, x14
400        SUB     x12, x12, x2            // a4 -= kc
401
402        B.HI    0b                      // columns remain (nc > 0 after the subtraction)
403
404        # Restore d8-d9, d12-d15 from stack
405        LDP     d14, d15, [sp, 32]
406        LDP     d12, d13, [sp, 16]
407        LDP     d8,  d9, [sp], 48
408        RET
409
410        # K remainder: the low 5 bits of x0 equal kc & 31 (only multiples of 32 were subtracted)
4114:
412        # Is there a remainder?- 4 floats of A (16 bytes)
413        TBZ     x0, 4, 5f
414
415        # Remainder- 4 floats of A (16 bytes)
416        # Load A
417        LDR     q0,  [x3], 16
418        LDR     q2,  [x9], 16
419        LDR     q4, [x10], 16
420        LDR     q6, [x11], 16
421        LDR     q8, [x12], 16
422        # Load B
423        LDP     q12,  q13, [x5], 32
424        LDP     q14,  q15, [x5], 32
425        LDP     q16,  q17, [x5], 32
426        LDP     q18,  q19, [x5], 32
427
428        FMLA    v20.4s, v12.4s,  v0.s[0]
429        FMLA    v22.4s, v12.4s,  v2.s[0]
430        FMLA    v24.4s, v12.4s,  v4.s[0]
431        FMLA    v26.4s, v12.4s,  v6.s[0]
432        FMLA    v28.4s, v12.4s,  v8.s[0]
433        FMLA    v21.4s, v13.4s,  v0.s[0]
434        FMLA    v23.4s, v13.4s,  v2.s[0]
435        FMLA    v25.4s, v13.4s,  v4.s[0]
436        FMLA    v27.4s, v13.4s,  v6.s[0]
437        FMLA    v29.4s, v13.4s,  v8.s[0]
438
439        FMLA    v20.4s, v14.4s,  v0.s[1]
440        FMLA    v22.4s, v14.4s,  v2.s[1]
441        FMLA    v24.4s, v14.4s,  v4.s[1]
442        FMLA    v26.4s, v14.4s,  v6.s[1]
443        FMLA    v28.4s, v14.4s,  v8.s[1]
444        FMLA    v21.4s, v15.4s,  v0.s[1]
445        FMLA    v23.4s, v15.4s,  v2.s[1]
446        FMLA    v25.4s, v15.4s,  v4.s[1]
447        FMLA    v27.4s, v15.4s,  v6.s[1]
448        FMLA    v29.4s, v15.4s,  v8.s[1]
449
450        FMLA    v20.4s, v16.4s,  v0.s[2]
451        FMLA    v22.4s, v16.4s,  v2.s[2]
452        FMLA    v24.4s, v16.4s,  v4.s[2]
453        FMLA    v26.4s, v16.4s,  v6.s[2]
454        FMLA    v28.4s, v16.4s,  v8.s[2]
455        FMLA    v21.4s, v17.4s,  v0.s[2]
456        FMLA    v23.4s, v17.4s,  v2.s[2]
457        FMLA    v25.4s, v17.4s,  v4.s[2]
458        FMLA    v27.4s, v17.4s,  v6.s[2]
459        FMLA    v29.4s, v17.4s,  v8.s[2]
460
461        FMLA    v20.4s, v18.4s,  v0.s[3]
462        FMLA    v22.4s, v18.4s,  v2.s[3]
463        FMLA    v24.4s, v18.4s,  v4.s[3]
464        FMLA    v26.4s, v18.4s,  v6.s[3]
465        FMLA    v28.4s, v18.4s,  v8.s[3]
466        FMLA    v21.4s, v19.4s,  v0.s[3]
467        FMLA    v23.4s, v19.4s,  v2.s[3]
468        FMLA    v25.4s, v19.4s,  v4.s[3]
469        FMLA    v27.4s, v19.4s,  v6.s[3]
470        FMLA    v29.4s, v19.4s,  v8.s[3]
471
472        # Is there a remainder?- 2 floats of A (8 bytes)
4735:
474        TBZ     x0, 3, 6f
475
476        # Remainder- 2 floats of A (8 bytes)
477        # Load A
478        LDR     d0,  [x3], 8
479        LDR     d2,  [x9], 8
480        LDR     d4, [x10], 8
481        LDR     d6, [x11], 8
482        LDR     d8, [x12], 8
483        # Load B
484        LDP     q12,  q13, [x5], 32
485        LDP     q14,  q15, [x5], 32
486
487        FMLA    v20.4s, v12.4s,  v0.s[0]
488        FMLA    v22.4s, v12.4s,  v2.s[0]
489        FMLA    v24.4s, v12.4s,  v4.s[0]
490        FMLA    v26.4s, v12.4s,  v6.s[0]
491        FMLA    v28.4s, v12.4s,  v8.s[0]
492        FMLA    v21.4s, v13.4s,  v0.s[0]
493        FMLA    v23.4s, v13.4s,  v2.s[0]
494        FMLA    v25.4s, v13.4s,  v4.s[0]
495        FMLA    v27.4s, v13.4s,  v6.s[0]
496        FMLA    v29.4s, v13.4s,  v8.s[0]
497
498        FMLA    v20.4s, v14.4s,  v0.s[1]
499        FMLA    v22.4s, v14.4s,  v2.s[1]
500        FMLA    v24.4s, v14.4s,  v4.s[1]
501        FMLA    v26.4s, v14.4s,  v6.s[1]
502        FMLA    v28.4s, v14.4s,  v8.s[1]
503        FMLA    v21.4s, v15.4s,  v0.s[1]
504        FMLA    v23.4s, v15.4s,  v2.s[1]
505        FMLA    v25.4s, v15.4s,  v4.s[1]
506        FMLA    v27.4s, v15.4s,  v6.s[1]
507        FMLA    v29.4s, v15.4s,  v8.s[1]
508
509        # Is there a remainder?- 1 float of A (4 bytes)
5106:
511        TBZ     x0, 2, 3b               // no 4-byte remainder: go to clamp and store
512
513        # Remainder- 1 float of A (4 bytes)
514        # Load A
515        LDR     s0,  [x3], 4
516        LDR     s2,  [x9], 4
517        LDR     s4, [x10], 4
518        LDR     s6, [x11], 4
519        LDR     s8, [x12], 4
520        # Load B
521        LDP     q12,  q13, [x5], 32
522
523        FMLA    v20.4s, v12.4s,  v0.s[0]
524        FMLA    v22.4s, v12.4s,  v2.s[0]
525        FMLA    v24.4s, v12.4s,  v4.s[0]
526        FMLA    v26.4s, v12.4s,  v6.s[0]
527        FMLA    v28.4s, v12.4s,  v8.s[0]
528        FMLA    v21.4s, v13.4s,  v0.s[0]
529        FMLA    v23.4s, v13.4s,  v2.s[0]
530        FMLA    v25.4s, v13.4s,  v4.s[0]
531        FMLA    v27.4s, v13.4s,  v6.s[0]
532        FMLA    v29.4s, v13.4s,  v8.s[0]
533        B       3b
534
535        # Store odd width: x1 holds nc - 8 here, so bits 2, 1, 0 select stores of 4, 2 and 1 columns
5367:
537        TBZ     x1, 2, 8f
538        STR     q20,  [x6], 16
539        MOV     v20.16b, v21.16b        // bring columns 4-7 down for the narrower stores below
540        STR     q22, [x16], 16
541        MOV     v22.16b, v23.16b
542        STR     q24, [x17], 16
543        MOV     v24.16b, v25.16b
544        STR     q26, [x13], 16
545        MOV     v26.16b, v27.16b
546        STR     q28, [x7], 16
547        MOV     v28.16b, v29.16b
5488:
549        TBZ     x1, 1, 9f
550        STR     d20,  [x6], 8
551        STR     d22, [x16], 8
552        DUP     d20, v20.d[1]           // move the upper 2 floats into the low half
553        DUP     d22, v22.d[1]
554        STR     d24, [x17], 8
555        STR     d26, [x13], 8
556        DUP     d24, v24.d[1]
557        DUP     d26, v26.d[1]
558        STR     d28, [x7], 8
559        DUP     d28, v28.d[1]
560
5619:
562        TBZ     x1, 0, 10f
563        STR     s20,  [x6]
564        STR     s22, [x16]
565        STR     s24, [x17]
566        STR     s26, [x13]
567        STR     s28, [x7]
56810:
569        # Restore d8-d9, d12-d15 from stack
570        LDP     d14, d15, [sp, 32]
571        LDP     d12, d13, [sp, 16]
572        LDP     d8,  d9, [sp], 48
573        RET
574
575END_FUNCTION xnn_f32_gemm_minmax_ukernel_5x8__aarch64_neonfma_prfm_cortex_a75
576
577#ifdef __ELF__
578.section ".note.GNU-stack","",%progbits
579#endif
580