• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2019-2020 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22  * IN THE SOFTWARE.
23  */
24 #ifdef __ARM_FEATURE_SVE
25 
26 #include "arm_gemm.hpp"
27 #include "../../utils.hpp"
28 
29 #include <cassert>
30 
31 namespace arm_gemm {
32 
sve_gemv_fp32_mla_8VL(const float * A_ptr,const float * B_ptr,float * output_ptr,size_t N,size_t K,const float * bias,Activation act,bool)33 void sve_gemv_fp32_mla_8VL (
34     const float *A_ptr, const float *B_ptr, float *output_ptr,
35     size_t N, size_t K,
36     const float *bias, Activation act, bool
37 )
38 {
39     struct KernelArgs { // Per-call values gathered in one struct (instance `ka`). The asm below adds %[offset_min] / %[offset_max] to %x[args_ptr] to load the clamp bounds; presumably those are the offsets of minval / maxval in this struct, but the operand list is outside this view -- TODO confirm.
40         float maxval = static_cast<float>(std::numeric_limits<float>::infinity()); // Upper clamp bound: stays +inf unless BoundedReLU replaces it with act.param1.
41         float minval = - static_cast<float>(std::numeric_limits<float>::infinity()); // Lower clamp bound: stays -inf unless ReLU / BoundedReLU sets it to 0.
42         const float *B_ptr = {}; // Receives a copy of the B_ptr argument right after this declaration.
43         size_t output_offset = {}; // Zero-initialised; not written in the visible part of this function.
44         unsigned int input_initial_col = {}; // Zero-initialised; not written in the visible part of this function.
45     } ka;
46 
47     unsigned long flags=0;
48     ka.B_ptr = B_ptr;
49     switch(act.type) {
50         default:
51         case Activation::Type::None:
52             break;
53         case Activation::Type::BoundedReLU:
54             ka.maxval = static_cast<float>(act.param1);
55             /* fall through */
56         case Activation::Type::ReLU:
57             ka.minval = 0;
58             flags |= 0x2;
59             break;
60     }
61     __asm__ __volatile__(
62       "ptrue p2.b\n"
63       "cntw x24\n"
64       "add x23, %x[N], x24\n"
65       "sub x23, x23, #0x1\n"
66       "udiv x23, x23, x24\n"
67       "mov x22, %x[bias]\n"
68       "1:"  // Column loop
69       "cmp x23, #0x8\n"
70       "bge 50f\n"
71       "cmp x23, #0x6\n"
72       "bgt 43f\n"
73       "beq 36f\n"
74       "cmp x23, #0x4\n"
75       "bgt 29f\n"
76       "beq 22f\n"
77       "cmp x23, #0x2\n"
78       "bgt 15f\n"
79       "beq 8f\n"
80       "mov x21, %x[K]\n"
81       "mov x20, %x[A_ptr]\n"
82       "whilelt p1.s, XZR, %x[N]\n"
83       "cbz x22, 2f\n"
84       "ld1w { z24.s }, p2/Z, [x22]\n"
85       "addvl x22, x22, #1\n"
86       "b 3f\n"
87       "2:"  // Width 1: no bias
88       "mov z24.b, #0x0\n"
89       "3:"  // Width 1: setup done
90       "cmp x21, #0x4\n"
91       "ble 5f\n"
92       "4:"  // Width 1: Multiply loop: Main loop head
93       "ld1w { z1.s }, p2/Z, [%x[B_ptr]]\n"
94       "whilelt p0.s, XZR, x21\n"
95       "addvl %x[B_ptr], %x[B_ptr], #8\n"
96       "ld1rqw { z0.s }, p0/Z, [x20]\n"
97       "fmla z24.s, z1.s, z0.s[0]\n"
98       "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
99       "add x20, x20, #0x10\n"
100       "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
101       "sub x21, x21, #0x4\n"
102       "ld1w { z2.s }, p2/Z, [%x[B_ptr]]\n"
103       "fmla z24.s, z2.s, z0.s[1]\n"
104       "addvl %x[B_ptr], %x[B_ptr], #8\n"
105       "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
106       "cmp x21, #0x4\n"
107       "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
108       "ld1w { z3.s }, p2/Z, [%x[B_ptr]]\n"
109       "fmla z24.s, z3.s, z0.s[2]\n"
110       "addvl %x[B_ptr], %x[B_ptr], #8\n"
111       "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
112       "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
113       "ld1w { z4.s }, p2/Z, [%x[B_ptr]]\n"
114       "fmla z24.s, z4.s, z0.s[3]\n"
115       "addvl %x[B_ptr], %x[B_ptr], #8\n"
116       "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
117       "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
118       "prfm pldl1keep, [x20, #0x80]\n"
119       "bgt 4b\n"
120       "5:"  // Width 1: Multiply loop: Single iteration only
121       "ld1w { z5.s }, p2/Z, [%x[B_ptr]]\n"
122       "whilelt p0.s, XZR, x21\n"
123       "addvl %x[B_ptr], %x[B_ptr], #8\n"
124       "ld1rqw { z0.s }, p0/Z, [x20]\n"
125       "fmla z24.s, z5.s, z0.s[0]\n"
126       "add x20, x20, #0x10\n"
127       "subs x21, x21, #0x1\n"
128       "ble 6f\n"
129       "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
130       "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
131       "subs x21, x21, #0x1\n"
132       "ld1w { z6.s }, p2/Z, [%x[B_ptr]]\n"
133       "fmla z24.s, z6.s, z0.s[1]\n"
134       "addvl %x[B_ptr], %x[B_ptr], #8\n"
135       "ble 6f\n"
136       "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
137       "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
138       "subs x21, x21, #0x1\n"
139       "ld1w { z7.s }, p2/Z, [%x[B_ptr]]\n"
140       "fmla z24.s, z7.s, z0.s[2]\n"
141       "addvl %x[B_ptr], %x[B_ptr], #8\n"
142       "ble 6f\n"
143       "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
144       "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
145       "ld1w { z8.s }, p2/Z, [%x[B_ptr]]\n"
146       "fmla z24.s, z8.s, z0.s[3]\n"
147       "addvl %x[B_ptr], %x[B_ptr], #8\n"
148       "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
149       "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
150       "6:"  // Width 1: Multiply loop: multiply skip
151       "prfm pldl1keep, [x20, #0x80]\n"
152       "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
153       "tbz %x[flags], #1, 7f\n"
154       "add x19, %x[args_ptr], %[offset_min]\n"
155       "ld1rw { z17.s }, p2/Z, [x19]\n"
156       "add x19, %x[args_ptr], %[offset_max]\n"
157       "ld1rw { z16.s }, p2/Z, [x19]\n"
158       "fmin z24.s, p2/M, z24.s, z16.s\n"
159       "fmax z24.s, p2/M, z24.s, z17.s\n"
160       "7:"  // Width 1: No activation
161       "st1w { z24.s }, p1, [%x[output_ptr]]\n"
162       "addvl %x[output_ptr], %x[output_ptr], #1\n"
163       "b 57f\n"
164       "8:"  // Width 2
165       "mov x21, %x[K]\n"
166       "mov x20, %x[A_ptr]\n"
167       "sub x19, %x[N], x24\n"
168       "whilelt p1.s, XZR, x19\n"
169       "cbz x22, 9f\n"
170       "ld1w { z24.s }, p2/Z, [x22]\n"
171       "ld1w { z25.s }, p2/Z, [x22, #1, MUL VL]\n"
172       "addvl x22, x22, #2\n"
173       "b 10f\n"
174       "9:"  // Width 2: no bias
175       "mov z24.b, #0x0\n"
176       "mov z25.b, #0x0\n"
177       "10:"  // Width 2: setup done
178       "cmp x21, #0x4\n"
179       "ble 12f\n"
180       "11:"  // Width 2: Multiply loop: Main loop head
181       "ld1w { z1.s }, p2/Z, [%x[B_ptr]]\n"
182       "whilelt p0.s, XZR, x21\n"
183       "ld1w { z2.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
184       "addvl %x[B_ptr], %x[B_ptr], #8\n"
185       "ld1rqw { z0.s }, p0/Z, [x20]\n"
186       "fmla z24.s, z1.s, z0.s[0]\n"
187       "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
188       "add x20, x20, #0x10\n"
189       "fmla z25.s, z2.s, z0.s[0]\n"
190       "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
191       "sub x21, x21, #0x4\n"
192       "ld1w { z3.s }, p2/Z, [%x[B_ptr]]\n"
193       "fmla z24.s, z3.s, z0.s[1]\n"
194       "ld1w { z4.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
195       "addvl %x[B_ptr], %x[B_ptr], #8\n"
196       "fmla z25.s, z4.s, z0.s[1]\n"
197       "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
198       "cmp x21, #0x4\n"
199       "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
200       "ld1w { z5.s }, p2/Z, [%x[B_ptr]]\n"
201       "fmla z24.s, z5.s, z0.s[2]\n"
202       "ld1w { z6.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
203       "addvl %x[B_ptr], %x[B_ptr], #8\n"
204       "fmla z25.s, z6.s, z0.s[2]\n"
205       "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
206       "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
207       "ld1w { z7.s }, p2/Z, [%x[B_ptr]]\n"
208       "fmla z24.s, z7.s, z0.s[3]\n"
209       "ld1w { z8.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
210       "addvl %x[B_ptr], %x[B_ptr], #8\n"
211       "fmla z25.s, z8.s, z0.s[3]\n"
212       "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
213       "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
214       "prfm pldl1keep, [x20, #0x80]\n"
215       "bgt 11b\n"
216       "12:"  // Width 2: Multiply loop: Single iteration only
217       "ld1w { z9.s }, p2/Z, [%x[B_ptr]]\n"
218       "whilelt p0.s, XZR, x21\n"
219       "ld1w { z10.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
220       "addvl %x[B_ptr], %x[B_ptr], #8\n"
221       "ld1rqw { z0.s }, p0/Z, [x20]\n"
222       "fmla z24.s, z9.s, z0.s[0]\n"
223       "add x20, x20, #0x10\n"
224       "fmla z25.s, z10.s, z0.s[0]\n"
225       "subs x21, x21, #0x1\n"
226       "ble 13f\n"
227       "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
228       "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
229       "subs x21, x21, #0x1\n"
230       "ld1w { z11.s }, p2/Z, [%x[B_ptr]]\n"
231       "fmla z24.s, z11.s, z0.s[1]\n"
232       "ld1w { z12.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
233       "addvl %x[B_ptr], %x[B_ptr], #8\n"
234       "fmla z25.s, z12.s, z0.s[1]\n"
235       "ble 13f\n"
236       "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
237       "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
238       "subs x21, x21, #0x1\n"
239       "ld1w { z13.s }, p2/Z, [%x[B_ptr]]\n"
240       "fmla z24.s, z13.s, z0.s[2]\n"
241       "ld1w { z14.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
242       "addvl %x[B_ptr], %x[B_ptr], #8\n"
243       "fmla z25.s, z14.s, z0.s[2]\n"
244       "ble 13f\n"
245       "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
246       "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
247       "ld1w { z15.s }, p2/Z, [%x[B_ptr]]\n"
248       "fmla z24.s, z15.s, z0.s[3]\n"
249       "ld1w { z16.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
250       "addvl %x[B_ptr], %x[B_ptr], #8\n"
251       "fmla z25.s, z16.s, z0.s[3]\n"
252       "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
253       "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
254       "13:"  // Width 2: Multiply loop: multiply skip
255       "prfm pldl1keep, [x20, #0x80]\n"
256       "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
257       "tbz %x[flags], #1, 14f\n"
258       "add x19, %x[args_ptr], %[offset_min]\n"
259       "ld1rw { z17.s }, p2/Z, [x19]\n"
260       "add x19, %x[args_ptr], %[offset_max]\n"
261       "ld1rw { z16.s }, p2/Z, [x19]\n"
262       "fmin z24.s, p2/M, z24.s, z16.s\n"
263       "fmin z25.s, p2/M, z25.s, z16.s\n"
264       "fmax z24.s, p2/M, z24.s, z17.s\n"
265       "fmax z25.s, p2/M, z25.s, z17.s\n"
266       "14:"  // Width 2: No activation
267       "st1w { z24.s }, p2, [%x[output_ptr]]\n"
268       "st1w { z25.s }, p1, [%x[output_ptr], #1, MUL VL]\n"
269       "addvl %x[output_ptr], %x[output_ptr], #2\n"
270       "b 57f\n"
271       "15:"  // Width 3
272       "mov x21, %x[K]\n"
273       "mov x20, %x[A_ptr]\n"
274       "mov x19, #0x2\n"
275       "msub x19, x24, x19, %x[N]\n"
276       "whilelt p1.s, XZR, x19\n"
277       "cbz x22, 16f\n"
278       "ld1w { z24.s }, p2/Z, [x22]\n"
279       "ld1w { z25.s }, p2/Z, [x22, #1, MUL VL]\n"
280       "ld1w { z26.s }, p2/Z, [x22, #2, MUL VL]\n"
281       "addvl x22, x22, #3\n"
282       "b 17f\n"
283       "16:"  // Width 3: no bias
284       "mov z24.b, #0x0\n"
285       "mov z25.b, #0x0\n"
286       "mov z26.b, #0x0\n"
287       "17:"  // Width 3: setup done
288       "cmp x21, #0x4\n"
289       "ble 19f\n"
290       "18:"  // Width 3: Multiply loop: Main loop head
291       "ld1w { z1.s }, p2/Z, [%x[B_ptr]]\n"
292       "whilelt p0.s, XZR, x21\n"
293       "ld1w { z2.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
294       "sub x21, x21, #0x4\n"
295       "ld1rqw { z0.s }, p0/Z, [x20]\n"
296       "fmla z24.s, z1.s, z0.s[0]\n"
297       "ld1w { z3.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
298       "add x20, x20, #0x10\n"
299       "fmla z25.s, z2.s, z0.s[0]\n"
300       "addvl %x[B_ptr], %x[B_ptr], #8\n"
301       "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
302       "fmla z26.s, z3.s, z0.s[0]\n"
303       "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
304       "cmp x21, #0x4\n"
305       "ld1w { z4.s }, p2/Z, [%x[B_ptr]]\n"
306       "fmla z24.s, z4.s, z0.s[1]\n"
307       "ld1w { z5.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
308       "ld1w { z6.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
309       "fmla z25.s, z5.s, z0.s[1]\n"
310       "addvl %x[B_ptr], %x[B_ptr], #8\n"
311       "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
312       "fmla z26.s, z6.s, z0.s[1]\n"
313       "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
314       "ld1w { z7.s }, p2/Z, [%x[B_ptr]]\n"
315       "fmla z24.s, z7.s, z0.s[2]\n"
316       "ld1w { z8.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
317       "ld1w { z9.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
318       "fmla z25.s, z8.s, z0.s[2]\n"
319       "addvl %x[B_ptr], %x[B_ptr], #8\n"
320       "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
321       "fmla z26.s, z9.s, z0.s[2]\n"
322       "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
323       "ld1w { z10.s }, p2/Z, [%x[B_ptr]]\n"
324       "fmla z24.s, z10.s, z0.s[3]\n"
325       "ld1w { z11.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
326       "ld1w { z12.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
327       "fmla z25.s, z11.s, z0.s[3]\n"
328       "addvl %x[B_ptr], %x[B_ptr], #8\n"
329       "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
330       "fmla z26.s, z12.s, z0.s[3]\n"
331       "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
332       "prfm pldl1keep, [x20, #0x80]\n"
333       "bgt 18b\n"
334       "19:"  // Width 3: Multiply loop: Single iteration only
335       "ld1w { z13.s }, p2/Z, [%x[B_ptr]]\n"
336       "whilelt p0.s, XZR, x21\n"
337       "ld1w { z14.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
338       "subs x21, x21, #0x1\n"
339       "ld1rqw { z0.s }, p0/Z, [x20]\n"
340       "fmla z24.s, z13.s, z0.s[0]\n"
341       "ld1w { z15.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
342       "add x20, x20, #0x10\n"
343       "fmla z25.s, z14.s, z0.s[0]\n"
344       "addvl %x[B_ptr], %x[B_ptr], #8\n"
345       "fmla z26.s, z15.s, z0.s[0]\n"
346       "ble 20f\n"
347       "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
348       "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
349       "subs x21, x21, #0x1\n"
350       "ld1w { z16.s }, p2/Z, [%x[B_ptr]]\n"
351       "fmla z24.s, z16.s, z0.s[1]\n"
352       "ld1w { z17.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
353       "ld1w { z18.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
354       "fmla z25.s, z17.s, z0.s[1]\n"
355       "addvl %x[B_ptr], %x[B_ptr], #8\n"
356       "fmla z26.s, z18.s, z0.s[1]\n"
357       "ble 20f\n"
358       "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
359       "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
360       "subs x21, x21, #0x1\n"
361       "ld1w { z19.s }, p2/Z, [%x[B_ptr]]\n"
362       "fmla z24.s, z19.s, z0.s[2]\n"
363       "ld1w { z20.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
364       "ld1w { z21.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
365       "fmla z25.s, z20.s, z0.s[2]\n"
366       "addvl %x[B_ptr], %x[B_ptr], #8\n"
367       "fmla z26.s, z21.s, z0.s[2]\n"
368       "ble 20f\n"
369       "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
370       "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
371       "ld1w { z22.s }, p2/Z, [%x[B_ptr]]\n"
372       "fmla z24.s, z22.s, z0.s[3]\n"
373       "ld1w { z23.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
374       "ld1w { z1.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
375       "fmla z25.s, z23.s, z0.s[3]\n"
376       "addvl %x[B_ptr], %x[B_ptr], #8\n"
377       "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
378       "fmla z26.s, z1.s, z0.s[3]\n"
379       "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
380       "20:"  // Width 3: Multiply loop: multiply skip
381       "prfm pldl1keep, [x20, #0x80]\n"
382       "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
383       "tbz %x[flags], #1, 21f\n"
384       "add x19, %x[args_ptr], %[offset_min]\n"
385       "ld1rw { z17.s }, p2/Z, [x19]\n"
386       "add x19, %x[args_ptr], %[offset_max]\n"
387       "ld1rw { z16.s }, p2/Z, [x19]\n"
388       "fmin z24.s, p2/M, z24.s, z16.s\n"
389       "fmin z25.s, p2/M, z25.s, z16.s\n"
390       "fmin z26.s, p2/M, z26.s, z16.s\n"
391       "fmax z24.s, p2/M, z24.s, z17.s\n"
392       "fmax z25.s, p2/M, z25.s, z17.s\n"
393       "fmax z26.s, p2/M, z26.s, z17.s\n"
394       "21:"  // Width 3: No activation
395       "st1w { z24.s }, p2, [%x[output_ptr]]\n"
396       "st1w { z25.s }, p2, [%x[output_ptr], #1, MUL VL]\n"
397       "st1w { z26.s }, p1, [%x[output_ptr], #2, MUL VL]\n"
398       "addvl %x[output_ptr], %x[output_ptr], #3\n"
399       "b 57f\n"
400       "22:"  // Width 4
401       "mov x21, %x[K]\n"
402       "mov x20, %x[A_ptr]\n"
403       "mov x19, #0x3\n"
404       "msub x19, x24, x19, %x[N]\n"
405       "whilelt p1.s, XZR, x19\n"
406       "cbz x22, 23f\n"
407       "ld1w { z24.s }, p2/Z, [x22]\n"
408       "ld1w { z25.s }, p2/Z, [x22, #1, MUL VL]\n"
409       "ld1w { z26.s }, p2/Z, [x22, #2, MUL VL]\n"
410       "ld1w { z27.s }, p2/Z, [x22, #3, MUL VL]\n"
411       "addvl x22, x22, #4\n"
412       "b 24f\n"
413       "23:"  // Width 4: no bias
414       "mov z24.b, #0x0\n"
415       "mov z25.b, #0x0\n"
416       "mov z26.b, #0x0\n"
417       "mov z27.b, #0x0\n"
418       "24:"  // Width 4: setup done
419       "cmp x21, #0x4\n"
420       "ble 26f\n"
421       "25:"  // Width 4: Multiply loop: Main loop head
422       "ld1w { z1.s }, p2/Z, [%x[B_ptr]]\n"
423       "whilelt p0.s, XZR, x21\n"
424       "ld1w { z2.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
425       "sub x21, x21, #0x4\n"
426       "ld1rqw { z0.s }, p0/Z, [x20]\n"
427       "fmla z24.s, z1.s, z0.s[0]\n"
428       "ld1w { z3.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
429       "add x20, x20, #0x10\n"
430       "fmla z25.s, z2.s, z0.s[0]\n"
431       "ld1w { z4.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
432       "addvl %x[B_ptr], %x[B_ptr], #8\n"
433       "fmla z26.s, z3.s, z0.s[0]\n"
434       "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
435       "cmp x21, #0x4\n"
436       "fmla z27.s, z4.s, z0.s[0]\n"
437       "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
438       "ld1w { z5.s }, p2/Z, [%x[B_ptr]]\n"
439       "fmla z24.s, z5.s, z0.s[1]\n"
440       "ld1w { z6.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
441       "ld1w { z7.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
442       "fmla z25.s, z6.s, z0.s[1]\n"
443       "ld1w { z8.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
444       "addvl %x[B_ptr], %x[B_ptr], #8\n"
445       "fmla z26.s, z7.s, z0.s[1]\n"
446       "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
447       "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
448       "fmla z27.s, z8.s, z0.s[1]\n"
449       "ld1w { z9.s }, p2/Z, [%x[B_ptr]]\n"
450       "ld1w { z10.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
451       "fmla z24.s, z9.s, z0.s[2]\n"
452       "ld1w { z11.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
453       "fmla z25.s, z10.s, z0.s[2]\n"
454       "ld1w { z12.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
455       "addvl %x[B_ptr], %x[B_ptr], #8\n"
456       "fmla z26.s, z11.s, z0.s[2]\n"
457       "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
458       "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
459       "fmla z27.s, z12.s, z0.s[2]\n"
460       "ld1w { z13.s }, p2/Z, [%x[B_ptr]]\n"
461       "fmla z24.s, z13.s, z0.s[3]\n"
462       "ld1w { z14.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
463       "ld1w { z15.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
464       "fmla z25.s, z14.s, z0.s[3]\n"
465       "ld1w { z16.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
466       "addvl %x[B_ptr], %x[B_ptr], #8\n"
467       "fmla z26.s, z15.s, z0.s[3]\n"
468       "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
469       "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
470       "fmla z27.s, z16.s, z0.s[3]\n"
471       "prfm pldl1keep, [x20, #0x80]\n"
472       "bgt 25b\n"
473       "26:"  // Width 4: Multiply loop: Single iteration only
474       "ld1w { z17.s }, p2/Z, [%x[B_ptr]]\n"
475       "whilelt p0.s, XZR, x21\n"
476       "ld1w { z18.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
477       "subs x21, x21, #0x1\n"
478       "ld1rqw { z0.s }, p0/Z, [x20]\n"
479       "fmla z24.s, z17.s, z0.s[0]\n"
480       "ld1w { z19.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
481       "add x20, x20, #0x10\n"
482       "fmla z25.s, z18.s, z0.s[0]\n"
483       "ld1w { z20.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
484       "addvl %x[B_ptr], %x[B_ptr], #8\n"
485       "fmla z26.s, z19.s, z0.s[0]\n"
486       "fmla z27.s, z20.s, z0.s[0]\n"
487       "ble 27f\n"
488       "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
489       "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
490       "subs x21, x21, #0x1\n"
491       "ld1w { z21.s }, p2/Z, [%x[B_ptr]]\n"
492       "fmla z24.s, z21.s, z0.s[1]\n"
493       "ld1w { z22.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
494       "ld1w { z23.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
495       "fmla z25.s, z22.s, z0.s[1]\n"
496       "ld1w { z1.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
497       "addvl %x[B_ptr], %x[B_ptr], #8\n"
498       "fmla z26.s, z23.s, z0.s[1]\n"
499       "fmla z27.s, z1.s, z0.s[1]\n"
500       "ble 27f\n"
501       "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
502       "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
503       "subs x21, x21, #0x1\n"
504       "ld1w { z2.s }, p2/Z, [%x[B_ptr]]\n"
505       "fmla z24.s, z2.s, z0.s[2]\n"
506       "ld1w { z3.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
507       "ld1w { z4.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
508       "fmla z25.s, z3.s, z0.s[2]\n"
509       "ld1w { z5.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
510       "addvl %x[B_ptr], %x[B_ptr], #8\n"
511       "fmla z26.s, z4.s, z0.s[2]\n"
512       "fmla z27.s, z5.s, z0.s[2]\n"
513       "ble 27f\n"
514       "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
515       "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
516       "ld1w { z6.s }, p2/Z, [%x[B_ptr]]\n"
517       "fmla z24.s, z6.s, z0.s[3]\n"
518       "ld1w { z7.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
519       "ld1w { z8.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
520       "fmla z25.s, z7.s, z0.s[3]\n"
521       "ld1w { z9.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
522       "addvl %x[B_ptr], %x[B_ptr], #8\n"
523       "fmla z26.s, z8.s, z0.s[3]\n"
524       "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
525       "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
526       "fmla z27.s, z9.s, z0.s[3]\n"
527       "27:"  // Width 4: Multiply loop: multiply skip
528       "prfm pldl1keep, [x20, #0x80]\n"
529       "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
530       "tbz %x[flags], #1, 28f\n"
531       "add x19, %x[args_ptr], %[offset_min]\n"
532       "ld1rw { z17.s }, p2/Z, [x19]\n"
533       "add x19, %x[args_ptr], %[offset_max]\n"
534       "ld1rw { z16.s }, p2/Z, [x19]\n"
535       "fmin z24.s, p2/M, z24.s, z16.s\n"
536       "fmin z25.s, p2/M, z25.s, z16.s\n"
537       "fmin z26.s, p2/M, z26.s, z16.s\n"
538       "fmin z27.s, p2/M, z27.s, z16.s\n"
539       "fmax z24.s, p2/M, z24.s, z17.s\n"
540       "fmax z25.s, p2/M, z25.s, z17.s\n"
541       "fmax z26.s, p2/M, z26.s, z17.s\n"
542       "fmax z27.s, p2/M, z27.s, z17.s\n"
543       "28:"  // Width 4: No activation
544       "st1w { z24.s }, p2, [%x[output_ptr]]\n"
545       "st1w { z25.s }, p2, [%x[output_ptr], #1, MUL VL]\n"
546       "st1w { z26.s }, p2, [%x[output_ptr], #2, MUL VL]\n"
547       "st1w { z27.s }, p1, [%x[output_ptr], #3, MUL VL]\n"
548       "addvl %x[output_ptr], %x[output_ptr], #4\n"
549       "b 57f\n"
550       "29:"  // Width 5
551       "mov x21, %x[K]\n"
552       "mov x20, %x[A_ptr]\n"
553       "mov x19, #0x4\n"
554       "msub x19, x24, x19, %x[N]\n"
555       "whilelt p1.s, XZR, x19\n"
556       "cbz x22, 30f\n"
557       "ld1w { z24.s }, p2/Z, [x22]\n"
558       "ld1w { z25.s }, p2/Z, [x22, #1, MUL VL]\n"
559       "ld1w { z26.s }, p2/Z, [x22, #2, MUL VL]\n"
560       "ld1w { z27.s }, p2/Z, [x22, #3, MUL VL]\n"
561       "ld1w { z28.s }, p2/Z, [x22, #4, MUL VL]\n"
562       "addvl x22, x22, #5\n"
563       "b 31f\n"
564       "30:"  // Width 5: no bias
565       "mov z24.b, #0x0\n"
566       "mov z25.b, #0x0\n"
567       "mov z26.b, #0x0\n"
568       "mov z27.b, #0x0\n"
569       "mov z28.b, #0x0\n"
570       "31:"  // Width 5: setup done
571       "cmp x21, #0x4\n"
572       "ble 33f\n"
573       "32:"  // Width 5: Multiply loop: Main loop head
574       "ld1w { z1.s }, p2/Z, [%x[B_ptr]]\n"
575       "whilelt p0.s, XZR, x21\n"
576       "ld1w { z2.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
577       "sub x21, x21, #0x4\n"
578       "ld1rqw { z0.s }, p0/Z, [x20]\n"
579       "fmla z24.s, z1.s, z0.s[0]\n"
580       "ld1w { z3.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
581       "add x20, x20, #0x10\n"
582       "fmla z25.s, z2.s, z0.s[0]\n"
583       "ld1w { z4.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
584       "cmp x21, #0x4\n"
585       "fmla z26.s, z3.s, z0.s[0]\n"
586       "ld1w { z5.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
587       "addvl %x[B_ptr], %x[B_ptr], #8\n"
588       "fmla z27.s, z4.s, z0.s[0]\n"
589       "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
590       "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
591       "fmla z28.s, z5.s, z0.s[0]\n"
592       "ld1w { z6.s }, p2/Z, [%x[B_ptr]]\n"
593       "ld1w { z7.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
594       "fmla z24.s, z6.s, z0.s[1]\n"
595       "ld1w { z8.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
596       "fmla z25.s, z7.s, z0.s[1]\n"
597       "ld1w { z9.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
598       "ld1w { z10.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
599       "fmla z26.s, z8.s, z0.s[1]\n"
600       "addvl %x[B_ptr], %x[B_ptr], #8\n"
601       "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
602       "fmla z27.s, z9.s, z0.s[1]\n"
603       "fmla z28.s, z10.s, z0.s[1]\n"
604       "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
605       "ld1w { z11.s }, p2/Z, [%x[B_ptr]]\n"
606       "fmla z24.s, z11.s, z0.s[2]\n"
607       "ld1w { z12.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
608       "ld1w { z13.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
609       "fmla z25.s, z12.s, z0.s[2]\n"
610       "ld1w { z14.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
611       "fmla z26.s, z13.s, z0.s[2]\n"
612       "ld1w { z15.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
613       "addvl %x[B_ptr], %x[B_ptr], #8\n"
614       "fmla z27.s, z14.s, z0.s[2]\n"
615       "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
616       "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
617       "fmla z28.s, z15.s, z0.s[2]\n"
618       "ld1w { z16.s }, p2/Z, [%x[B_ptr]]\n"
619       "fmla z24.s, z16.s, z0.s[3]\n"
620       "ld1w { z17.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
621       "ld1w { z18.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
622       "fmla z25.s, z17.s, z0.s[3]\n"
623       "ld1w { z19.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
624       "fmla z26.s, z18.s, z0.s[3]\n"
625       "ld1w { z20.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
626       "addvl %x[B_ptr], %x[B_ptr], #8\n"
627       "fmla z27.s, z19.s, z0.s[3]\n"
628       "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
629       "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
630       "fmla z28.s, z20.s, z0.s[3]\n"
631       "prfm pldl1keep, [x20, #0x80]\n"
632       "bgt 32b\n"
633       "33:"  // Width 5: Multiply loop: Single iteration only
634       "ld1w { z21.s }, p2/Z, [%x[B_ptr]]\n"
635       "whilelt p0.s, XZR, x21\n"
636       "ld1w { z22.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
637       "subs x21, x21, #0x1\n"
638       "ld1rqw { z0.s }, p0/Z, [x20]\n"
639       "fmla z24.s, z21.s, z0.s[0]\n"
640       "ld1w { z23.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
641       "add x20, x20, #0x10\n"
642       "fmla z25.s, z22.s, z0.s[0]\n"
643       "ld1w { z1.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
644       "ld1w { z2.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
645       "fmla z26.s, z23.s, z0.s[0]\n"
646       "addvl %x[B_ptr], %x[B_ptr], #8\n"
647       "fmla z27.s, z1.s, z0.s[0]\n"
648       "fmla z28.s, z2.s, z0.s[0]\n"
649       "ble 34f\n"
650       "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
651       "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
652       "subs x21, x21, #0x1\n"
653       "ld1w { z3.s }, p2/Z, [%x[B_ptr]]\n"
654       "fmla z24.s, z3.s, z0.s[1]\n"
655       "ld1w { z4.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
656       "ld1w { z5.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
657       "fmla z25.s, z4.s, z0.s[1]\n"
658       "ld1w { z6.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
659       "fmla z26.s, z5.s, z0.s[1]\n"
660       "ld1w { z7.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
661       "addvl %x[B_ptr], %x[B_ptr], #8\n"
662       "fmla z27.s, z6.s, z0.s[1]\n"
663       "fmla z28.s, z7.s, z0.s[1]\n"
664       "ble 34f\n"
665       "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
666       "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
667       "subs x21, x21, #0x1\n"
668       "ld1w { z8.s }, p2/Z, [%x[B_ptr]]\n"
669       "fmla z24.s, z8.s, z0.s[2]\n"
670       "ld1w { z9.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
671       "ld1w { z10.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
672       "fmla z25.s, z9.s, z0.s[2]\n"
673       "ld1w { z11.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
674       "fmla z26.s, z10.s, z0.s[2]\n"
675       "ld1w { z12.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
676       "addvl %x[B_ptr], %x[B_ptr], #8\n"
677       "fmla z27.s, z11.s, z0.s[2]\n"
678       "fmla z28.s, z12.s, z0.s[2]\n"
679       "ble 34f\n"
680       "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
681       "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
682       "ld1w { z13.s }, p2/Z, [%x[B_ptr]]\n"
683       "fmla z24.s, z13.s, z0.s[3]\n"
684       "ld1w { z14.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
685       "ld1w { z15.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
686       "fmla z25.s, z14.s, z0.s[3]\n"
687       "ld1w { z16.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
688       "fmla z26.s, z15.s, z0.s[3]\n"
689       "ld1w { z17.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
690       "addvl %x[B_ptr], %x[B_ptr], #8\n"
691       "fmla z27.s, z16.s, z0.s[3]\n"
692       "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
693       "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
694       "fmla z28.s, z17.s, z0.s[3]\n"
695       "34:"  // Width 5: Multiply loop: multiply skip
696       "prfm pldl1keep, [x20, #0x80]\n"
697       "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
698       "tbz %x[flags], #1, 35f\n"
699       "add x19, %x[args_ptr], %[offset_min]\n"
700       "ld1rw { z17.s }, p2/Z, [x19]\n"
701       "add x19, %x[args_ptr], %[offset_max]\n"
702       "ld1rw { z16.s }, p2/Z, [x19]\n"
703       "fmin z24.s, p2/M, z24.s, z16.s\n"
704       "fmin z25.s, p2/M, z25.s, z16.s\n"
705       "fmin z26.s, p2/M, z26.s, z16.s\n"
706       "fmin z27.s, p2/M, z27.s, z16.s\n"
707       "fmin z28.s, p2/M, z28.s, z16.s\n"
708       "fmax z24.s, p2/M, z24.s, z17.s\n"
709       "fmax z25.s, p2/M, z25.s, z17.s\n"
710       "fmax z26.s, p2/M, z26.s, z17.s\n"
711       "fmax z27.s, p2/M, z27.s, z17.s\n"
712       "fmax z28.s, p2/M, z28.s, z17.s\n"
713       "35:"  // Width 5: No activation
714       "st1w { z24.s }, p2, [%x[output_ptr]]\n"
715       "st1w { z25.s }, p2, [%x[output_ptr], #1, MUL VL]\n"
716       "st1w { z26.s }, p2, [%x[output_ptr], #2, MUL VL]\n"
717       "st1w { z27.s }, p2, [%x[output_ptr], #3, MUL VL]\n"
718       "st1w { z28.s }, p1, [%x[output_ptr], #4, MUL VL]\n"
719       "addvl %x[output_ptr], %x[output_ptr], #5\n"
720       "b 57f\n"
721       "36:"  // Width 6
722       "mov x21, %x[K]\n"
723       "mov x20, %x[A_ptr]\n"
724       "mov x19, #0x5\n"
725       "msub x19, x24, x19, %x[N]\n"
726       "whilelt p1.s, XZR, x19\n"
727       "cbz x22, 37f\n"
728       "ld1w { z24.s }, p2/Z, [x22]\n"
729       "ld1w { z25.s }, p2/Z, [x22, #1, MUL VL]\n"
730       "ld1w { z26.s }, p2/Z, [x22, #2, MUL VL]\n"
731       "ld1w { z27.s }, p2/Z, [x22, #3, MUL VL]\n"
732       "ld1w { z28.s }, p2/Z, [x22, #4, MUL VL]\n"
733       "ld1w { z29.s }, p2/Z, [x22, #5, MUL VL]\n"
734       "addvl x22, x22, #6\n"
735       "b 38f\n"
736       "37:"  // Width 6: no bias
737       "mov z24.b, #0x0\n"
738       "mov z25.b, #0x0\n"
739       "mov z26.b, #0x0\n"
740       "mov z27.b, #0x0\n"
741       "mov z28.b, #0x0\n"
742       "mov z29.b, #0x0\n"
743       "38:"  // Width 6: setup done
744       "cmp x21, #0x4\n"
745       "ble 40f\n"
746       "39:"  // Width 6: Multiply loop: Main loop head
747       "ld1w { z1.s }, p2/Z, [%x[B_ptr]]\n"
748       "whilelt p0.s, XZR, x21\n"
749       "ld1w { z2.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
750       "sub x21, x21, #0x4\n"
751       "ld1rqw { z0.s }, p0/Z, [x20]\n"
752       "fmla z24.s, z1.s, z0.s[0]\n"
753       "ld1w { z3.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
754       "add x20, x20, #0x10\n"
755       "fmla z25.s, z2.s, z0.s[0]\n"
756       "ld1w { z4.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
757       "cmp x21, #0x4\n"
758       "fmla z26.s, z3.s, z0.s[0]\n"
759       "ld1w { z5.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
760       "ld1w { z6.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
761       "fmla z27.s, z4.s, z0.s[0]\n"
762       "addvl %x[B_ptr], %x[B_ptr], #8\n"
763       "fmla z28.s, z5.s, z0.s[0]\n"
764       "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
765       "fmla z29.s, z6.s, z0.s[0]\n"
766       "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
767       "ld1w { z7.s }, p2/Z, [%x[B_ptr]]\n"
768       "fmla z24.s, z7.s, z0.s[1]\n"
769       "ld1w { z8.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
770       "ld1w { z9.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
771       "fmla z25.s, z8.s, z0.s[1]\n"
772       "ld1w { z10.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
773       "fmla z26.s, z9.s, z0.s[1]\n"
774       "ld1w { z11.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
775       "ld1w { z12.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
776       "fmla z27.s, z10.s, z0.s[1]\n"
777       "addvl %x[B_ptr], %x[B_ptr], #8\n"
778       "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
779       "fmla z28.s, z11.s, z0.s[1]\n"
780       "fmla z29.s, z12.s, z0.s[1]\n"
781       "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
782       "ld1w { z13.s }, p2/Z, [%x[B_ptr]]\n"
783       "fmla z24.s, z13.s, z0.s[2]\n"
784       "ld1w { z14.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
785       "ld1w { z15.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
786       "fmla z25.s, z14.s, z0.s[2]\n"
787       "ld1w { z16.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
788       "fmla z26.s, z15.s, z0.s[2]\n"
789       "ld1w { z17.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
790       "ld1w { z18.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
791       "fmla z27.s, z16.s, z0.s[2]\n"
792       "addvl %x[B_ptr], %x[B_ptr], #8\n"
793       "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
794       "fmla z28.s, z17.s, z0.s[2]\n"
795       "fmla z29.s, z18.s, z0.s[2]\n"
796       "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
797       "ld1w { z19.s }, p2/Z, [%x[B_ptr]]\n"
798       "fmla z24.s, z19.s, z0.s[3]\n"
799       "ld1w { z20.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
800       "ld1w { z21.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
801       "fmla z25.s, z20.s, z0.s[3]\n"
802       "ld1w { z22.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
803       "fmla z26.s, z21.s, z0.s[3]\n"
804       "ld1w { z23.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
805       "ld1w { z1.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
806       "fmla z27.s, z22.s, z0.s[3]\n"
807       "addvl %x[B_ptr], %x[B_ptr], #8\n"
808       "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
809       "fmla z28.s, z23.s, z0.s[3]\n"
810       "fmla z29.s, z1.s, z0.s[3]\n"
811       "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
812       "prfm pldl1keep, [x20, #0x80]\n"
813       "bgt 39b\n"
814       "40:"  // Width 6: Multiply loop: Single iteration only
815       "ld1w { z2.s }, p2/Z, [%x[B_ptr]]\n"
816       "whilelt p0.s, XZR, x21\n"
817       "ld1w { z3.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
818       "subs x21, x21, #0x1\n"
819       "ld1rqw { z0.s }, p0/Z, [x20]\n"
820       "fmla z24.s, z2.s, z0.s[0]\n"
821       "ld1w { z4.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
822       "add x20, x20, #0x10\n"
823       "fmla z25.s, z3.s, z0.s[0]\n"
824       "ld1w { z5.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
825       "ld1w { z6.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
826       "fmla z26.s, z4.s, z0.s[0]\n"
827       "ld1w { z7.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
828       "addvl %x[B_ptr], %x[B_ptr], #8\n"
829       "fmla z27.s, z5.s, z0.s[0]\n"
830       "fmla z28.s, z6.s, z0.s[0]\n"
831       "fmla z29.s, z7.s, z0.s[0]\n"
832       "ble 41f\n"
833       "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
834       "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
835       "subs x21, x21, #0x1\n"
836       "ld1w { z8.s }, p2/Z, [%x[B_ptr]]\n"
837       "fmla z24.s, z8.s, z0.s[1]\n"
838       "ld1w { z9.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
839       "ld1w { z10.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
840       "fmla z25.s, z9.s, z0.s[1]\n"
841       "ld1w { z11.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
842       "fmla z26.s, z10.s, z0.s[1]\n"
843       "ld1w { z12.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
844       "ld1w { z13.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
845       "fmla z27.s, z11.s, z0.s[1]\n"
846       "addvl %x[B_ptr], %x[B_ptr], #8\n"
847       "fmla z28.s, z12.s, z0.s[1]\n"
848       "fmla z29.s, z13.s, z0.s[1]\n"
849       "ble 41f\n"
850       "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
851       "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
852       "subs x21, x21, #0x1\n"
853       "ld1w { z14.s }, p2/Z, [%x[B_ptr]]\n"
854       "fmla z24.s, z14.s, z0.s[2]\n"
855       "ld1w { z15.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
856       "ld1w { z16.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
857       "fmla z25.s, z15.s, z0.s[2]\n"
858       "ld1w { z17.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
859       "fmla z26.s, z16.s, z0.s[2]\n"
860       "ld1w { z18.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
861       "ld1w { z19.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
862       "fmla z27.s, z17.s, z0.s[2]\n"
863       "addvl %x[B_ptr], %x[B_ptr], #8\n"
864       "fmla z28.s, z18.s, z0.s[2]\n"
865       "fmla z29.s, z19.s, z0.s[2]\n"
866       "ble 41f\n"
867       "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
868       "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
869       "ld1w { z20.s }, p2/Z, [%x[B_ptr]]\n"
870       "fmla z24.s, z20.s, z0.s[3]\n"
871       "ld1w { z21.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
872       "ld1w { z22.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
873       "fmla z25.s, z21.s, z0.s[3]\n"
874       "ld1w { z23.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
875       "fmla z26.s, z22.s, z0.s[3]\n"
876       "ld1w { z1.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
877       "ld1w { z2.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
878       "fmla z27.s, z23.s, z0.s[3]\n"
879       "addvl %x[B_ptr], %x[B_ptr], #8\n"
880       "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
881       "fmla z28.s, z1.s, z0.s[3]\n"
882       "fmla z29.s, z2.s, z0.s[3]\n"
883       "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
884       "41:"  // Width 6: Multiply loop: multiply skip
885       "prfm pldl1keep, [x20, #0x80]\n"
886       "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
887       "tbz %x[flags], #1, 42f\n"
888       "add x19, %x[args_ptr], %[offset_min]\n"
889       "ld1rw { z17.s }, p2/Z, [x19]\n"
890       "add x19, %x[args_ptr], %[offset_max]\n"
891       "ld1rw { z16.s }, p2/Z, [x19]\n"
892       "fmin z24.s, p2/M, z24.s, z16.s\n"
893       "fmin z25.s, p2/M, z25.s, z16.s\n"
894       "fmin z26.s, p2/M, z26.s, z16.s\n"
895       "fmin z27.s, p2/M, z27.s, z16.s\n"
896       "fmin z28.s, p2/M, z28.s, z16.s\n"
897       "fmax z24.s, p2/M, z24.s, z17.s\n"
898       "fmax z25.s, p2/M, z25.s, z17.s\n"
899       "fmax z26.s, p2/M, z26.s, z17.s\n"
900       "fmax z27.s, p2/M, z27.s, z17.s\n"
901       "fmax z28.s, p2/M, z28.s, z17.s\n"
902       "fmin z29.s, p2/M, z29.s, z16.s\n"
903       "fmax z29.s, p2/M, z29.s, z17.s\n"
904       "42:"  // Width 6: No activation
905       "st1w { z24.s }, p2, [%x[output_ptr]]\n"
906       "st1w { z25.s }, p2, [%x[output_ptr], #1, MUL VL]\n"
907       "st1w { z26.s }, p2, [%x[output_ptr], #2, MUL VL]\n"
908       "st1w { z27.s }, p2, [%x[output_ptr], #3, MUL VL]\n"
909       "st1w { z28.s }, p2, [%x[output_ptr], #4, MUL VL]\n"
910       "st1w { z29.s }, p1, [%x[output_ptr], #5, MUL VL]\n"
911       "addvl %x[output_ptr], %x[output_ptr], #6\n"
912       "b 57f\n"
913       "43:"  // Width 7
914       "mov x21, %x[K]\n"
915       "mov x20, %x[A_ptr]\n"
916       "mov x19, #0x6\n"
917       "msub x19, x24, x19, %x[N]\n"
918       "whilelt p1.s, XZR, x19\n"
919       "cbz x22, 44f\n"
920       "ld1w { z24.s }, p2/Z, [x22]\n"
921       "ld1w { z25.s }, p2/Z, [x22, #1, MUL VL]\n"
922       "ld1w { z26.s }, p2/Z, [x22, #2, MUL VL]\n"
923       "ld1w { z27.s }, p2/Z, [x22, #3, MUL VL]\n"
924       "ld1w { z28.s }, p2/Z, [x22, #4, MUL VL]\n"
925       "ld1w { z29.s }, p2/Z, [x22, #5, MUL VL]\n"
926       "ld1w { z30.s }, p2/Z, [x22, #6, MUL VL]\n"
927       "addvl x22, x22, #7\n"
928       "b 45f\n"
929       "44:"  // Width 7: no bias
930       "mov z24.b, #0x0\n"
931       "mov z25.b, #0x0\n"
932       "mov z26.b, #0x0\n"
933       "mov z27.b, #0x0\n"
934       "mov z28.b, #0x0\n"
935       "mov z29.b, #0x0\n"
936       "mov z30.b, #0x0\n"
937       "45:"  // Width 7: setup done
938       "cmp x21, #0x4\n"
939       "ble 47f\n"
940       "46:"  // Width 7: Multiply loop: Main loop head
941       "ld1w { z1.s }, p2/Z, [%x[B_ptr]]\n"
942       "whilelt p0.s, XZR, x21\n"
943       "ld1w { z2.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
944       "sub x21, x21, #0x4\n"
945       "ld1rqw { z0.s }, p0/Z, [x20]\n"
946       "fmla z24.s, z1.s, z0.s[0]\n"
947       "ld1w { z3.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
948       "add x20, x20, #0x10\n"
949       "fmla z25.s, z2.s, z0.s[0]\n"
950       "ld1w { z4.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
951       "cmp x21, #0x4\n"
952       "fmla z26.s, z3.s, z0.s[0]\n"
953       "ld1w { z5.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
954       "ld1w { z6.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
955       "fmla z27.s, z4.s, z0.s[0]\n"
956       "ld1w { z7.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
957       "fmla z28.s, z5.s, z0.s[0]\n"
958       "addvl %x[B_ptr], %x[B_ptr], #8\n"
959       "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
960       "fmla z29.s, z6.s, z0.s[0]\n"
961       "fmla z30.s, z7.s, z0.s[0]\n"
962       "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
963       "ld1w { z8.s }, p2/Z, [%x[B_ptr]]\n"
964       "fmla z24.s, z8.s, z0.s[1]\n"
965       "ld1w { z9.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
966       "ld1w { z10.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
967       "fmla z25.s, z9.s, z0.s[1]\n"
968       "ld1w { z11.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
969       "fmla z26.s, z10.s, z0.s[1]\n"
970       "ld1w { z12.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
971       "ld1w { z13.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
972       "fmla z27.s, z11.s, z0.s[1]\n"
973       "ld1w { z14.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
974       "addvl %x[B_ptr], %x[B_ptr], #8\n"
975       "fmla z28.s, z12.s, z0.s[1]\n"
976       "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
977       "fmla z29.s, z13.s, z0.s[1]\n"
978       "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
979       "ld1w { z15.s }, p2/Z, [%x[B_ptr]]\n"
980       "fmla z30.s, z14.s, z0.s[1]\n"
981       "ld1w { z16.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
982       "fmla z24.s, z15.s, z0.s[2]\n"
983       "ld1w { z17.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
984       "ld1w { z18.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
985       "fmla z25.s, z16.s, z0.s[2]\n"
986       "ld1w { z19.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
987       "fmla z26.s, z17.s, z0.s[2]\n"
988       "ld1w { z20.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
989       "fmla z27.s, z18.s, z0.s[2]\n"
990       "ld1w { z21.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
991       "addvl %x[B_ptr], %x[B_ptr], #8\n"
992       "fmla z28.s, z19.s, z0.s[2]\n"
993       "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
994       "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
995       "fmla z29.s, z20.s, z0.s[2]\n"
996       "ld1w { z22.s }, p2/Z, [%x[B_ptr]]\n"
997       "fmla z30.s, z21.s, z0.s[2]\n"
998       "ld1w { z23.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
999       "fmla z24.s, z22.s, z0.s[3]\n"
1000       "ld1w { z1.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
1001       "ld1w { z2.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
1002       "fmla z25.s, z23.s, z0.s[3]\n"
1003       "ld1w { z3.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
1004       "fmla z26.s, z1.s, z0.s[3]\n"
1005       "ld1w { z4.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
1006       "fmla z27.s, z2.s, z0.s[3]\n"
1007       "ld1w { z5.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
1008       "addvl %x[B_ptr], %x[B_ptr], #8\n"
1009       "fmla z28.s, z3.s, z0.s[3]\n"
1010       "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
1011       "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
1012       "fmla z29.s, z4.s, z0.s[3]\n"
1013       "prfm pldl1keep, [x20, #0x80]\n"
1014       "fmla z30.s, z5.s, z0.s[3]\n"
1015       "bgt 46b\n"
1016       "47:"  // Width 7: Multiply loop: Single iteration only
1017       "ld1w { z6.s }, p2/Z, [%x[B_ptr]]\n"
1018       "whilelt p0.s, XZR, x21\n"
1019       "ld1w { z7.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
1020       "subs x21, x21, #0x1\n"
1021       "ld1rqw { z0.s }, p0/Z, [x20]\n"
1022       "fmla z24.s, z6.s, z0.s[0]\n"
1023       "ld1w { z8.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
1024       "add x20, x20, #0x10\n"
1025       "fmla z25.s, z7.s, z0.s[0]\n"
1026       "ld1w { z9.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
1027       "ld1w { z10.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
1028       "fmla z26.s, z8.s, z0.s[0]\n"
1029       "ld1w { z11.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
1030       "ld1w { z12.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
1031       "fmla z27.s, z9.s, z0.s[0]\n"
1032       "addvl %x[B_ptr], %x[B_ptr], #8\n"
1033       "fmla z28.s, z10.s, z0.s[0]\n"
1034       "fmla z29.s, z11.s, z0.s[0]\n"
1035       "fmla z30.s, z12.s, z0.s[0]\n"
1036       "ble 48f\n"
1037       "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
1038       "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
1039       "subs x21, x21, #0x1\n"
1040       "ld1w { z13.s }, p2/Z, [%x[B_ptr]]\n"
1041       "fmla z24.s, z13.s, z0.s[1]\n"
1042       "ld1w { z14.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
1043       "ld1w { z15.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
1044       "fmla z25.s, z14.s, z0.s[1]\n"
1045       "ld1w { z16.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
1046       "fmla z26.s, z15.s, z0.s[1]\n"
1047       "ld1w { z17.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
1048       "ld1w { z18.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
1049       "fmla z27.s, z16.s, z0.s[1]\n"
1050       "ld1w { z19.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
1051       "addvl %x[B_ptr], %x[B_ptr], #8\n"
1052       "fmla z28.s, z17.s, z0.s[1]\n"
1053       "fmla z29.s, z18.s, z0.s[1]\n"
1054       "fmla z30.s, z19.s, z0.s[1]\n"
1055       "ble 48f\n"
1056       "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
1057       "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
1058       "subs x21, x21, #0x1\n"
1059       "ld1w { z20.s }, p2/Z, [%x[B_ptr]]\n"
1060       "fmla z24.s, z20.s, z0.s[2]\n"
1061       "ld1w { z21.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
1062       "ld1w { z22.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
1063       "fmla z25.s, z21.s, z0.s[2]\n"
1064       "ld1w { z23.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
1065       "fmla z26.s, z22.s, z0.s[2]\n"
1066       "ld1w { z1.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
1067       "ld1w { z2.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
1068       "fmla z27.s, z23.s, z0.s[2]\n"
1069       "ld1w { z3.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
1070       "addvl %x[B_ptr], %x[B_ptr], #8\n"
1071       "fmla z28.s, z1.s, z0.s[2]\n"
1072       "fmla z29.s, z2.s, z0.s[2]\n"
1073       "fmla z30.s, z3.s, z0.s[2]\n"
1074       "ble 48f\n"
1075       "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
1076       "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
1077       "ld1w { z4.s }, p2/Z, [%x[B_ptr]]\n"
1078       "fmla z24.s, z4.s, z0.s[3]\n"
1079       "ld1w { z5.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
1080       "ld1w { z6.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
1081       "fmla z25.s, z5.s, z0.s[3]\n"
1082       "ld1w { z7.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
1083       "fmla z26.s, z6.s, z0.s[3]\n"
1084       "ld1w { z8.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
1085       "ld1w { z9.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
1086       "fmla z27.s, z7.s, z0.s[3]\n"
1087       "ld1w { z10.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
1088       "addvl %x[B_ptr], %x[B_ptr], #8\n"
1089       "fmla z28.s, z8.s, z0.s[3]\n"
1090       "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
1091       "fmla z29.s, z9.s, z0.s[3]\n"
1092       "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
1093       "fmla z30.s, z10.s, z0.s[3]\n"
1094       "48:"  // Width 7: Multiply loop: multiply skip
1095       "prfm pldl1keep, [x20, #0x80]\n"
1096       "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
1097       "tbz %x[flags], #1, 49f\n"
1098       "add x19, %x[args_ptr], %[offset_min]\n"
1099       "ld1rw { z17.s }, p2/Z, [x19]\n"
1100       "add x19, %x[args_ptr], %[offset_max]\n"
1101       "ld1rw { z16.s }, p2/Z, [x19]\n"
1102       "fmin z24.s, p2/M, z24.s, z16.s\n"
1103       "fmin z25.s, p2/M, z25.s, z16.s\n"
1104       "fmin z26.s, p2/M, z26.s, z16.s\n"
1105       "fmin z27.s, p2/M, z27.s, z16.s\n"
1106       "fmin z28.s, p2/M, z28.s, z16.s\n"
1107       "fmax z24.s, p2/M, z24.s, z17.s\n"
1108       "fmax z25.s, p2/M, z25.s, z17.s\n"
1109       "fmax z26.s, p2/M, z26.s, z17.s\n"
1110       "fmax z27.s, p2/M, z27.s, z17.s\n"
1111       "fmax z28.s, p2/M, z28.s, z17.s\n"
1112       "fmin z29.s, p2/M, z29.s, z16.s\n"
1113       "fmin z30.s, p2/M, z30.s, z16.s\n"
1114       "fmax z29.s, p2/M, z29.s, z17.s\n"
1115       "fmax z30.s, p2/M, z30.s, z17.s\n"
1116       "49:"  // Width 7: No activation
1117       "st1w { z24.s }, p2, [%x[output_ptr]]\n"
1118       "st1w { z25.s }, p2, [%x[output_ptr], #1, MUL VL]\n"
1119       "st1w { z26.s }, p2, [%x[output_ptr], #2, MUL VL]\n"
1120       "st1w { z27.s }, p2, [%x[output_ptr], #3, MUL VL]\n"
1121       "st1w { z28.s }, p2, [%x[output_ptr], #4, MUL VL]\n"
1122       "st1w { z29.s }, p2, [%x[output_ptr], #5, MUL VL]\n"
1123       "st1w { z30.s }, p1, [%x[output_ptr], #6, MUL VL]\n"
1124       "addvl %x[output_ptr], %x[output_ptr], #7\n"
1125       "b 57f\n"
1126       "50:"  // Width 8
1127       "mov x21, %x[K]\n"
1128       "mov x20, %x[A_ptr]\n"
1129       "mov x19, #0x7\n"
1130       "msub x19, x24, x19, %x[N]\n"
1131       "whilelt p1.s, XZR, x19\n"
1132       "cbz x22, 51f\n"
1133       "ld1w { z24.s }, p2/Z, [x22]\n"
1134       "ld1w { z25.s }, p2/Z, [x22, #1, MUL VL]\n"
1135       "ld1w { z26.s }, p2/Z, [x22, #2, MUL VL]\n"
1136       "ld1w { z27.s }, p2/Z, [x22, #3, MUL VL]\n"
1137       "ld1w { z28.s }, p2/Z, [x22, #4, MUL VL]\n"
1138       "ld1w { z29.s }, p2/Z, [x22, #5, MUL VL]\n"
1139       "ld1w { z30.s }, p2/Z, [x22, #6, MUL VL]\n"
1140       "ld1w { z31.s }, p2/Z, [x22, #7, MUL VL]\n"
1141       "addvl x22, x22, #8\n"
1142       "b 52f\n"
1143       "51:"  // Width 8: no bias
1144       "mov z24.b, #0x0\n"
1145       "mov z25.b, #0x0\n"
1146       "mov z26.b, #0x0\n"
1147       "mov z27.b, #0x0\n"
1148       "mov z28.b, #0x0\n"
1149       "mov z29.b, #0x0\n"
1150       "mov z30.b, #0x0\n"
1151       "mov z31.b, #0x0\n"
1152       "52:"  // Width 8: setup done
1153       "cmp x21, #0x4\n"
1154       "ble 54f\n"
1155       "53:"  // Width 8: Multiply loop: Main loop head
1156       "ld1w { z1.s }, p2/Z, [%x[B_ptr]]\n"
1157       "whilelt p0.s, XZR, x21\n"
1158       "ld1w { z2.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
1159       "sub x21, x21, #0x4\n"
1160       "ld1rqw { z0.s }, p0/Z, [x20]\n"
1161       "fmla z24.s, z1.s, z0.s[0]\n"
1162       "ld1w { z3.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
1163       "add x20, x20, #0x10\n"
1164       "fmla z25.s, z2.s, z0.s[0]\n"
1165       "ld1w { z4.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
1166       "cmp x21, #0x4\n"
1167       "fmla z26.s, z3.s, z0.s[0]\n"
1168       "ld1w { z5.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
1169       "ld1w { z6.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
1170       "fmla z27.s, z4.s, z0.s[0]\n"
1171       "ld1w { z7.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
1172       "fmla z28.s, z5.s, z0.s[0]\n"
1173       "ld1w { z8.s }, p2/Z, [%x[B_ptr], #7, MUL VL]\n"
1174       "addvl %x[B_ptr], %x[B_ptr], #8\n"
1175       "fmla z29.s, z6.s, z0.s[0]\n"
1176       "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
1177       "fmla z30.s, z7.s, z0.s[0]\n"
1178       "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
1179       "ld1w { z9.s }, p2/Z, [%x[B_ptr]]\n"
1180       "fmla z31.s, z8.s, z0.s[0]\n"
1181       "ld1w { z10.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
1182       "fmla z24.s, z9.s, z0.s[1]\n"
1183       "ld1w { z11.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
1184       "ld1w { z12.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
1185       "fmla z25.s, z10.s, z0.s[1]\n"
1186       "ld1w { z13.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
1187       "fmla z26.s, z11.s, z0.s[1]\n"
1188       "ld1w { z14.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
1189       "fmla z27.s, z12.s, z0.s[1]\n"
1190       "ld1w { z15.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
1191       "fmla z28.s, z13.s, z0.s[1]\n"
1192       "ld1w { z16.s }, p2/Z, [%x[B_ptr], #7, MUL VL]\n"
1193       "addvl %x[B_ptr], %x[B_ptr], #8\n"
1194       "fmla z29.s, z14.s, z0.s[1]\n"
1195       "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
1196       "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
1197       "fmla z30.s, z15.s, z0.s[1]\n"
1198       "ld1w { z17.s }, p2/Z, [%x[B_ptr]]\n"
1199       "fmla z31.s, z16.s, z0.s[1]\n"
1200       "ld1w { z18.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
1201       "fmla z24.s, z17.s, z0.s[2]\n"
1202       "ld1w { z19.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
1203       "ld1w { z20.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
1204       "fmla z25.s, z18.s, z0.s[2]\n"
1205       "ld1w { z21.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
1206       "fmla z26.s, z19.s, z0.s[2]\n"
1207       "ld1w { z22.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
1208       "fmla z27.s, z20.s, z0.s[2]\n"
1209       "ld1w { z23.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
1210       "fmla z28.s, z21.s, z0.s[2]\n"
1211       "ld1w { z1.s }, p2/Z, [%x[B_ptr], #7, MUL VL]\n"
1212       "addvl %x[B_ptr], %x[B_ptr], #8\n"
1213       "fmla z29.s, z22.s, z0.s[2]\n"
1214       "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
1215       "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
1216       "fmla z30.s, z23.s, z0.s[2]\n"
1217       "ld1w { z2.s }, p2/Z, [%x[B_ptr]]\n"
1218       "fmla z31.s, z1.s, z0.s[2]\n"
1219       "ld1w { z3.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
1220       "fmla z24.s, z2.s, z0.s[3]\n"
1221       "ld1w { z4.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
1222       "ld1w { z5.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
1223       "fmla z25.s, z3.s, z0.s[3]\n"
1224       "ld1w { z6.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
1225       "fmla z26.s, z4.s, z0.s[3]\n"
1226       "ld1w { z7.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
1227       "fmla z27.s, z5.s, z0.s[3]\n"
1228       "ld1w { z8.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
1229       "fmla z28.s, z6.s, z0.s[3]\n"
1230       "ld1w { z9.s }, p2/Z, [%x[B_ptr], #7, MUL VL]\n"
1231       "addvl %x[B_ptr], %x[B_ptr], #8\n"
1232       "fmla z29.s, z7.s, z0.s[3]\n"
1233       "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
1234       "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
1235       "fmla z30.s, z8.s, z0.s[3]\n"
1236       "prfm pldl1keep, [x20, #0x80]\n"
1237       "fmla z31.s, z9.s, z0.s[3]\n"
1238       "bgt 53b\n"
1239       "54:"  // Width 8: Multiply loop: Single iteration only
1240       "ld1w { z10.s }, p2/Z, [%x[B_ptr]]\n"
1241       "whilelt p0.s, XZR, x21\n"
1242       "ld1w { z11.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
1243       "subs x21, x21, #0x1\n"
1244       "ld1rqw { z0.s }, p0/Z, [x20]\n"
1245       "fmla z24.s, z10.s, z0.s[0]\n"
1246       "ld1w { z12.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
1247       "add x20, x20, #0x10\n"
1248       "fmla z25.s, z11.s, z0.s[0]\n"
1249       "ld1w { z13.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
1250       "ld1w { z14.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
1251       "fmla z26.s, z12.s, z0.s[0]\n"
1252       "ld1w { z15.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
1253       "ld1w { z16.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
1254       "fmla z27.s, z13.s, z0.s[0]\n"
1255       "fmla z28.s, z14.s, z0.s[0]\n"
1256       "ld1w { z17.s }, p2/Z, [%x[B_ptr], #7, MUL VL]\n"
1257       "addvl %x[B_ptr], %x[B_ptr], #8\n"
1258       "fmla z29.s, z15.s, z0.s[0]\n"
1259       "fmla z30.s, z16.s, z0.s[0]\n"
1260       "fmla z31.s, z17.s, z0.s[0]\n"
1261       "ble 55f\n"
1262       "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
1263       "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
1264       "subs x21, x21, #0x1\n"
1265       "ld1w { z18.s }, p2/Z, [%x[B_ptr]]\n"
1266       "fmla z24.s, z18.s, z0.s[1]\n"
1267       "ld1w { z19.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
1268       "ld1w { z20.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
1269       "fmla z25.s, z19.s, z0.s[1]\n"
1270       "ld1w { z21.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
1271       "fmla z26.s, z20.s, z0.s[1]\n"
1272       "ld1w { z22.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
1273       "ld1w { z23.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
1274       "fmla z27.s, z21.s, z0.s[1]\n"
1275       "ld1w { z1.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
1276       "ld1w { z2.s }, p2/Z, [%x[B_ptr], #7, MUL VL]\n"
1277       "fmla z28.s, z22.s, z0.s[1]\n"
1278       "addvl %x[B_ptr], %x[B_ptr], #8\n"
1279       "fmla z29.s, z23.s, z0.s[1]\n"
1280       "fmla z30.s, z1.s, z0.s[1]\n"
1281       "fmla z31.s, z2.s, z0.s[1]\n"
1282       "ble 55f\n"
1283       "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
1284       "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
1285       "subs x21, x21, #0x1\n"
1286       "ld1w { z3.s }, p2/Z, [%x[B_ptr]]\n"
1287       "fmla z24.s, z3.s, z0.s[2]\n"
1288       "ld1w { z4.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
1289       "ld1w { z5.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
1290       "fmla z25.s, z4.s, z0.s[2]\n"
1291       "ld1w { z6.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
1292       "fmla z26.s, z5.s, z0.s[2]\n"
1293       "ld1w { z7.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
1294       "ld1w { z8.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
1295       "fmla z27.s, z6.s, z0.s[2]\n"
1296       "ld1w { z9.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
1297       "ld1w { z10.s }, p2/Z, [%x[B_ptr], #7, MUL VL]\n"
1298       "fmla z28.s, z7.s, z0.s[2]\n"
1299       "addvl %x[B_ptr], %x[B_ptr], #8\n"
1300       "fmla z29.s, z8.s, z0.s[2]\n"
1301       "fmla z30.s, z9.s, z0.s[2]\n"
1302       "fmla z31.s, z10.s, z0.s[2]\n"
1303       "ble 55f\n"
1304       "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
1305       "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
1306       "ld1w { z11.s }, p2/Z, [%x[B_ptr]]\n"
1307       "fmla z24.s, z11.s, z0.s[3]\n"
1308       "ld1w { z12.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
1309       "ld1w { z13.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
1310       "fmla z25.s, z12.s, z0.s[3]\n"
1311       "ld1w { z14.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
1312       "fmla z26.s, z13.s, z0.s[3]\n"
1313       "ld1w { z15.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
1314       "ld1w { z16.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
1315       "fmla z27.s, z14.s, z0.s[3]\n"
1316       "ld1w { z17.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
1317       "ld1w { z18.s }, p2/Z, [%x[B_ptr], #7, MUL VL]\n"
1318       "fmla z28.s, z15.s, z0.s[3]\n"
1319       "addvl %x[B_ptr], %x[B_ptr], #8\n"
1320       "fmla z29.s, z16.s, z0.s[3]\n"
1321       "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
1322       "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
1323       "fmla z30.s, z17.s, z0.s[3]\n"
1324       "fmla z31.s, z18.s, z0.s[3]\n"
1325       "55:"  // Width 8: Multiply loop: multiply skip
1326       "prfm pldl1keep, [x20, #0x80]\n"
1327       "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
1328       "tbz %x[flags], #1, 56f\n"
1329       "add x19, %x[args_ptr], %[offset_min]\n"
1330       "ld1rw { z17.s }, p2/Z, [x19]\n"
1331       "add x19, %x[args_ptr], %[offset_max]\n"
1332       "ld1rw { z16.s }, p2/Z, [x19]\n"
1333       "fmin z24.s, p2/M, z24.s, z16.s\n"
1334       "fmin z25.s, p2/M, z25.s, z16.s\n"
1335       "fmin z26.s, p2/M, z26.s, z16.s\n"
1336       "fmin z27.s, p2/M, z27.s, z16.s\n"
1337       "fmin z28.s, p2/M, z28.s, z16.s\n"
1338       "fmax z24.s, p2/M, z24.s, z17.s\n"
1339       "fmax z25.s, p2/M, z25.s, z17.s\n"
1340       "fmax z26.s, p2/M, z26.s, z17.s\n"
1341       "fmax z27.s, p2/M, z27.s, z17.s\n"
1342       "fmax z28.s, p2/M, z28.s, z17.s\n"
1343       "fmin z29.s, p2/M, z29.s, z16.s\n"
1344       "fmin z30.s, p2/M, z30.s, z16.s\n"
1345       "fmin z31.s, p2/M, z31.s, z16.s\n"
1346       "fmax z29.s, p2/M, z29.s, z17.s\n"
1347       "fmax z30.s, p2/M, z30.s, z17.s\n"
1348       "fmax z31.s, p2/M, z31.s, z17.s\n"
1349       "56:"  // Width 8: No activation
1350       "st1w { z24.s }, p2, [%x[output_ptr]]\n"
1351       "subs x23, x23, #0x8\n"
1352       "st1w { z25.s }, p2, [%x[output_ptr], #1, MUL VL]\n"
1353       "sub %x[N], %x[N], x24, LSL #3\n"
1354       "st1w { z26.s }, p2, [%x[output_ptr], #2, MUL VL]\n"
1355       "st1w { z27.s }, p2, [%x[output_ptr], #3, MUL VL]\n"
1356       "st1w { z28.s }, p2, [%x[output_ptr], #4, MUL VL]\n"
1357       "st1w { z29.s }, p2, [%x[output_ptr], #5, MUL VL]\n"
1358       "st1w { z30.s }, p2, [%x[output_ptr], #6, MUL VL]\n"
1359       "st1w { z31.s }, p1, [%x[output_ptr], #7, MUL VL]\n"
1360       "addvl %x[output_ptr], %x[output_ptr], #8\n"
1361       "bgt 1b\n"
1362       "57:"  // Exit
1363 
1364       : [B_ptr] "+r" (B_ptr), [N] "+r" (N), [output_ptr] "+r" (output_ptr)
1365       : [A_ptr] "r" (A_ptr), [K] "r" (K), [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval))
1366       : "cc", "memory", "p0", "p1", "p2", "x19", "x20", "x21", "x22", "x23", "x24", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
1367     );
1368 }
1369 
1370 } // namespace arm_gemm
1371 
1372 #endif
1373