1 /*
2 * Copyright (c) 2019-2020 Arm Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22 * IN THE SOFTWARE.
23 */
24 #ifdef __ARM_FEATURE_SVE
25
26 #include "arm_gemm.hpp"
27 #include "../../utils.hpp"
28
29 #include <cassert>
30
31 namespace arm_gemm {
32
sve_gemv_fp32_mla_8VL(const float * A_ptr,const float * B_ptr,float * output_ptr,size_t N,size_t K,const float * bias,Activation act,bool)33 void sve_gemv_fp32_mla_8VL (
34 const float *A_ptr, const float *B_ptr, float *output_ptr,
35 size_t N, size_t K,
36 const float *bias, Activation act, bool
37 )
38 {
39 struct KernelArgs {
40 float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
41 float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
42 const float *B_ptr = {};
43 size_t output_offset = {};
44 unsigned int input_initial_col = {};
45 } ka;
46
47 unsigned long flags=0;
48 ka.B_ptr = B_ptr;
49 switch(act.type) {
50 default:
51 case Activation::Type::None:
52 break;
53 case Activation::Type::BoundedReLU:
54 ka.maxval = static_cast<float>(act.param1);
55 /* fall through */
56 case Activation::Type::ReLU:
57 ka.minval = 0;
58 flags |= 0x2;
59 break;
60 }
61 __asm__ __volatile__(
62 "ptrue p2.b\n"
63 "cntw x24\n"
64 "add x23, %x[N], x24\n"
65 "sub x23, x23, #0x1\n"
66 "udiv x23, x23, x24\n"
67 "mov x22, %x[bias]\n"
68 "1:" // Column loop
69 "cmp x23, #0x8\n"
70 "bge 50f\n"
71 "cmp x23, #0x6\n"
72 "bgt 43f\n"
73 "beq 36f\n"
74 "cmp x23, #0x4\n"
75 "bgt 29f\n"
76 "beq 22f\n"
77 "cmp x23, #0x2\n"
78 "bgt 15f\n"
79 "beq 8f\n"
80 "mov x21, %x[K]\n"
81 "mov x20, %x[A_ptr]\n"
82 "whilelt p1.s, XZR, %x[N]\n"
83 "cbz x22, 2f\n"
84 "ld1w { z24.s }, p2/Z, [x22]\n"
85 "addvl x22, x22, #1\n"
86 "b 3f\n"
87 "2:" // Width 1: no bias
88 "mov z24.b, #0x0\n"
89 "3:" // Width 1: setup done
90 "cmp x21, #0x4\n"
91 "ble 5f\n"
92 "4:" // Width 1: Multiply loop: Main loop head
93 "ld1w { z1.s }, p2/Z, [%x[B_ptr]]\n"
94 "whilelt p0.s, XZR, x21\n"
95 "addvl %x[B_ptr], %x[B_ptr], #8\n"
96 "ld1rqw { z0.s }, p0/Z, [x20]\n"
97 "fmla z24.s, z1.s, z0.s[0]\n"
98 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
99 "add x20, x20, #0x10\n"
100 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
101 "sub x21, x21, #0x4\n"
102 "ld1w { z2.s }, p2/Z, [%x[B_ptr]]\n"
103 "fmla z24.s, z2.s, z0.s[1]\n"
104 "addvl %x[B_ptr], %x[B_ptr], #8\n"
105 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
106 "cmp x21, #0x4\n"
107 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
108 "ld1w { z3.s }, p2/Z, [%x[B_ptr]]\n"
109 "fmla z24.s, z3.s, z0.s[2]\n"
110 "addvl %x[B_ptr], %x[B_ptr], #8\n"
111 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
112 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
113 "ld1w { z4.s }, p2/Z, [%x[B_ptr]]\n"
114 "fmla z24.s, z4.s, z0.s[3]\n"
115 "addvl %x[B_ptr], %x[B_ptr], #8\n"
116 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
117 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
118 "prfm pldl1keep, [x20, #0x80]\n"
119 "bgt 4b\n"
120 "5:" // Width 1: Multiply loop: Single iteration only
121 "ld1w { z5.s }, p2/Z, [%x[B_ptr]]\n"
122 "whilelt p0.s, XZR, x21\n"
123 "addvl %x[B_ptr], %x[B_ptr], #8\n"
124 "ld1rqw { z0.s }, p0/Z, [x20]\n"
125 "fmla z24.s, z5.s, z0.s[0]\n"
126 "add x20, x20, #0x10\n"
127 "subs x21, x21, #0x1\n"
128 "ble 6f\n"
129 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
130 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
131 "subs x21, x21, #0x1\n"
132 "ld1w { z6.s }, p2/Z, [%x[B_ptr]]\n"
133 "fmla z24.s, z6.s, z0.s[1]\n"
134 "addvl %x[B_ptr], %x[B_ptr], #8\n"
135 "ble 6f\n"
136 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
137 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
138 "subs x21, x21, #0x1\n"
139 "ld1w { z7.s }, p2/Z, [%x[B_ptr]]\n"
140 "fmla z24.s, z7.s, z0.s[2]\n"
141 "addvl %x[B_ptr], %x[B_ptr], #8\n"
142 "ble 6f\n"
143 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
144 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
145 "ld1w { z8.s }, p2/Z, [%x[B_ptr]]\n"
146 "fmla z24.s, z8.s, z0.s[3]\n"
147 "addvl %x[B_ptr], %x[B_ptr], #8\n"
148 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
149 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
150 "6:" // Width 1: Multiply loop: multiply skip
151 "prfm pldl1keep, [x20, #0x80]\n"
152 "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
153 "tbz %x[flags], #1, 7f\n"
154 "add x19, %x[args_ptr], %[offset_min]\n"
155 "ld1rw { z17.s }, p2/Z, [x19]\n"
156 "add x19, %x[args_ptr], %[offset_max]\n"
157 "ld1rw { z16.s }, p2/Z, [x19]\n"
158 "fmin z24.s, p2/M, z24.s, z16.s\n"
159 "fmax z24.s, p2/M, z24.s, z17.s\n"
160 "7:" // Width 1: No activation
161 "st1w { z24.s }, p1, [%x[output_ptr]]\n"
162 "addvl %x[output_ptr], %x[output_ptr], #1\n"
163 "b 57f\n"
164 "8:" // Width 2
165 "mov x21, %x[K]\n"
166 "mov x20, %x[A_ptr]\n"
167 "sub x19, %x[N], x24\n"
168 "whilelt p1.s, XZR, x19\n"
169 "cbz x22, 9f\n"
170 "ld1w { z24.s }, p2/Z, [x22]\n"
171 "ld1w { z25.s }, p2/Z, [x22, #1, MUL VL]\n"
172 "addvl x22, x22, #2\n"
173 "b 10f\n"
174 "9:" // Width 2: no bias
175 "mov z24.b, #0x0\n"
176 "mov z25.b, #0x0\n"
177 "10:" // Width 2: setup done
178 "cmp x21, #0x4\n"
179 "ble 12f\n"
180 "11:" // Width 2: Multiply loop: Main loop head
181 "ld1w { z1.s }, p2/Z, [%x[B_ptr]]\n"
182 "whilelt p0.s, XZR, x21\n"
183 "ld1w { z2.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
184 "addvl %x[B_ptr], %x[B_ptr], #8\n"
185 "ld1rqw { z0.s }, p0/Z, [x20]\n"
186 "fmla z24.s, z1.s, z0.s[0]\n"
187 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
188 "add x20, x20, #0x10\n"
189 "fmla z25.s, z2.s, z0.s[0]\n"
190 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
191 "sub x21, x21, #0x4\n"
192 "ld1w { z3.s }, p2/Z, [%x[B_ptr]]\n"
193 "fmla z24.s, z3.s, z0.s[1]\n"
194 "ld1w { z4.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
195 "addvl %x[B_ptr], %x[B_ptr], #8\n"
196 "fmla z25.s, z4.s, z0.s[1]\n"
197 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
198 "cmp x21, #0x4\n"
199 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
200 "ld1w { z5.s }, p2/Z, [%x[B_ptr]]\n"
201 "fmla z24.s, z5.s, z0.s[2]\n"
202 "ld1w { z6.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
203 "addvl %x[B_ptr], %x[B_ptr], #8\n"
204 "fmla z25.s, z6.s, z0.s[2]\n"
205 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
206 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
207 "ld1w { z7.s }, p2/Z, [%x[B_ptr]]\n"
208 "fmla z24.s, z7.s, z0.s[3]\n"
209 "ld1w { z8.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
210 "addvl %x[B_ptr], %x[B_ptr], #8\n"
211 "fmla z25.s, z8.s, z0.s[3]\n"
212 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
213 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
214 "prfm pldl1keep, [x20, #0x80]\n"
215 "bgt 11b\n"
216 "12:" // Width 2: Multiply loop: Single iteration only
217 "ld1w { z9.s }, p2/Z, [%x[B_ptr]]\n"
218 "whilelt p0.s, XZR, x21\n"
219 "ld1w { z10.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
220 "addvl %x[B_ptr], %x[B_ptr], #8\n"
221 "ld1rqw { z0.s }, p0/Z, [x20]\n"
222 "fmla z24.s, z9.s, z0.s[0]\n"
223 "add x20, x20, #0x10\n"
224 "fmla z25.s, z10.s, z0.s[0]\n"
225 "subs x21, x21, #0x1\n"
226 "ble 13f\n"
227 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
228 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
229 "subs x21, x21, #0x1\n"
230 "ld1w { z11.s }, p2/Z, [%x[B_ptr]]\n"
231 "fmla z24.s, z11.s, z0.s[1]\n"
232 "ld1w { z12.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
233 "addvl %x[B_ptr], %x[B_ptr], #8\n"
234 "fmla z25.s, z12.s, z0.s[1]\n"
235 "ble 13f\n"
236 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
237 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
238 "subs x21, x21, #0x1\n"
239 "ld1w { z13.s }, p2/Z, [%x[B_ptr]]\n"
240 "fmla z24.s, z13.s, z0.s[2]\n"
241 "ld1w { z14.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
242 "addvl %x[B_ptr], %x[B_ptr], #8\n"
243 "fmla z25.s, z14.s, z0.s[2]\n"
244 "ble 13f\n"
245 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
246 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
247 "ld1w { z15.s }, p2/Z, [%x[B_ptr]]\n"
248 "fmla z24.s, z15.s, z0.s[3]\n"
249 "ld1w { z16.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
250 "addvl %x[B_ptr], %x[B_ptr], #8\n"
251 "fmla z25.s, z16.s, z0.s[3]\n"
252 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
253 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
254 "13:" // Width 2: Multiply loop: multiply skip
255 "prfm pldl1keep, [x20, #0x80]\n"
256 "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
257 "tbz %x[flags], #1, 14f\n"
258 "add x19, %x[args_ptr], %[offset_min]\n"
259 "ld1rw { z17.s }, p2/Z, [x19]\n"
260 "add x19, %x[args_ptr], %[offset_max]\n"
261 "ld1rw { z16.s }, p2/Z, [x19]\n"
262 "fmin z24.s, p2/M, z24.s, z16.s\n"
263 "fmin z25.s, p2/M, z25.s, z16.s\n"
264 "fmax z24.s, p2/M, z24.s, z17.s\n"
265 "fmax z25.s, p2/M, z25.s, z17.s\n"
266 "14:" // Width 2: No activation
267 "st1w { z24.s }, p2, [%x[output_ptr]]\n"
268 "st1w { z25.s }, p1, [%x[output_ptr], #1, MUL VL]\n"
269 "addvl %x[output_ptr], %x[output_ptr], #2\n"
270 "b 57f\n"
271 "15:" // Width 3
272 "mov x21, %x[K]\n"
273 "mov x20, %x[A_ptr]\n"
274 "mov x19, #0x2\n"
275 "msub x19, x24, x19, %x[N]\n"
276 "whilelt p1.s, XZR, x19\n"
277 "cbz x22, 16f\n"
278 "ld1w { z24.s }, p2/Z, [x22]\n"
279 "ld1w { z25.s }, p2/Z, [x22, #1, MUL VL]\n"
280 "ld1w { z26.s }, p2/Z, [x22, #2, MUL VL]\n"
281 "addvl x22, x22, #3\n"
282 "b 17f\n"
283 "16:" // Width 3: no bias
284 "mov z24.b, #0x0\n"
285 "mov z25.b, #0x0\n"
286 "mov z26.b, #0x0\n"
287 "17:" // Width 3: setup done
288 "cmp x21, #0x4\n"
289 "ble 19f\n"
290 "18:" // Width 3: Multiply loop: Main loop head
291 "ld1w { z1.s }, p2/Z, [%x[B_ptr]]\n"
292 "whilelt p0.s, XZR, x21\n"
293 "ld1w { z2.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
294 "sub x21, x21, #0x4\n"
295 "ld1rqw { z0.s }, p0/Z, [x20]\n"
296 "fmla z24.s, z1.s, z0.s[0]\n"
297 "ld1w { z3.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
298 "add x20, x20, #0x10\n"
299 "fmla z25.s, z2.s, z0.s[0]\n"
300 "addvl %x[B_ptr], %x[B_ptr], #8\n"
301 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
302 "fmla z26.s, z3.s, z0.s[0]\n"
303 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
304 "cmp x21, #0x4\n"
305 "ld1w { z4.s }, p2/Z, [%x[B_ptr]]\n"
306 "fmla z24.s, z4.s, z0.s[1]\n"
307 "ld1w { z5.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
308 "ld1w { z6.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
309 "fmla z25.s, z5.s, z0.s[1]\n"
310 "addvl %x[B_ptr], %x[B_ptr], #8\n"
311 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
312 "fmla z26.s, z6.s, z0.s[1]\n"
313 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
314 "ld1w { z7.s }, p2/Z, [%x[B_ptr]]\n"
315 "fmla z24.s, z7.s, z0.s[2]\n"
316 "ld1w { z8.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
317 "ld1w { z9.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
318 "fmla z25.s, z8.s, z0.s[2]\n"
319 "addvl %x[B_ptr], %x[B_ptr], #8\n"
320 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
321 "fmla z26.s, z9.s, z0.s[2]\n"
322 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
323 "ld1w { z10.s }, p2/Z, [%x[B_ptr]]\n"
324 "fmla z24.s, z10.s, z0.s[3]\n"
325 "ld1w { z11.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
326 "ld1w { z12.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
327 "fmla z25.s, z11.s, z0.s[3]\n"
328 "addvl %x[B_ptr], %x[B_ptr], #8\n"
329 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
330 "fmla z26.s, z12.s, z0.s[3]\n"
331 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
332 "prfm pldl1keep, [x20, #0x80]\n"
333 "bgt 18b\n"
334 "19:" // Width 3: Multiply loop: Single iteration only
335 "ld1w { z13.s }, p2/Z, [%x[B_ptr]]\n"
336 "whilelt p0.s, XZR, x21\n"
337 "ld1w { z14.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
338 "subs x21, x21, #0x1\n"
339 "ld1rqw { z0.s }, p0/Z, [x20]\n"
340 "fmla z24.s, z13.s, z0.s[0]\n"
341 "ld1w { z15.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
342 "add x20, x20, #0x10\n"
343 "fmla z25.s, z14.s, z0.s[0]\n"
344 "addvl %x[B_ptr], %x[B_ptr], #8\n"
345 "fmla z26.s, z15.s, z0.s[0]\n"
346 "ble 20f\n"
347 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
348 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
349 "subs x21, x21, #0x1\n"
350 "ld1w { z16.s }, p2/Z, [%x[B_ptr]]\n"
351 "fmla z24.s, z16.s, z0.s[1]\n"
352 "ld1w { z17.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
353 "ld1w { z18.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
354 "fmla z25.s, z17.s, z0.s[1]\n"
355 "addvl %x[B_ptr], %x[B_ptr], #8\n"
356 "fmla z26.s, z18.s, z0.s[1]\n"
357 "ble 20f\n"
358 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
359 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
360 "subs x21, x21, #0x1\n"
361 "ld1w { z19.s }, p2/Z, [%x[B_ptr]]\n"
362 "fmla z24.s, z19.s, z0.s[2]\n"
363 "ld1w { z20.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
364 "ld1w { z21.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
365 "fmla z25.s, z20.s, z0.s[2]\n"
366 "addvl %x[B_ptr], %x[B_ptr], #8\n"
367 "fmla z26.s, z21.s, z0.s[2]\n"
368 "ble 20f\n"
369 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
370 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
371 "ld1w { z22.s }, p2/Z, [%x[B_ptr]]\n"
372 "fmla z24.s, z22.s, z0.s[3]\n"
373 "ld1w { z23.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
374 "ld1w { z1.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
375 "fmla z25.s, z23.s, z0.s[3]\n"
376 "addvl %x[B_ptr], %x[B_ptr], #8\n"
377 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
378 "fmla z26.s, z1.s, z0.s[3]\n"
379 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
380 "20:" // Width 3: Multiply loop: multiply skip
381 "prfm pldl1keep, [x20, #0x80]\n"
382 "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
383 "tbz %x[flags], #1, 21f\n"
384 "add x19, %x[args_ptr], %[offset_min]\n"
385 "ld1rw { z17.s }, p2/Z, [x19]\n"
386 "add x19, %x[args_ptr], %[offset_max]\n"
387 "ld1rw { z16.s }, p2/Z, [x19]\n"
388 "fmin z24.s, p2/M, z24.s, z16.s\n"
389 "fmin z25.s, p2/M, z25.s, z16.s\n"
390 "fmin z26.s, p2/M, z26.s, z16.s\n"
391 "fmax z24.s, p2/M, z24.s, z17.s\n"
392 "fmax z25.s, p2/M, z25.s, z17.s\n"
393 "fmax z26.s, p2/M, z26.s, z17.s\n"
394 "21:" // Width 3: No activation
395 "st1w { z24.s }, p2, [%x[output_ptr]]\n"
396 "st1w { z25.s }, p2, [%x[output_ptr], #1, MUL VL]\n"
397 "st1w { z26.s }, p1, [%x[output_ptr], #2, MUL VL]\n"
398 "addvl %x[output_ptr], %x[output_ptr], #3\n"
399 "b 57f\n"
400 "22:" // Width 4
401 "mov x21, %x[K]\n"
402 "mov x20, %x[A_ptr]\n"
403 "mov x19, #0x3\n"
404 "msub x19, x24, x19, %x[N]\n"
405 "whilelt p1.s, XZR, x19\n"
406 "cbz x22, 23f\n"
407 "ld1w { z24.s }, p2/Z, [x22]\n"
408 "ld1w { z25.s }, p2/Z, [x22, #1, MUL VL]\n"
409 "ld1w { z26.s }, p2/Z, [x22, #2, MUL VL]\n"
410 "ld1w { z27.s }, p2/Z, [x22, #3, MUL VL]\n"
411 "addvl x22, x22, #4\n"
412 "b 24f\n"
413 "23:" // Width 4: no bias
414 "mov z24.b, #0x0\n"
415 "mov z25.b, #0x0\n"
416 "mov z26.b, #0x0\n"
417 "mov z27.b, #0x0\n"
418 "24:" // Width 4: setup done
419 "cmp x21, #0x4\n"
420 "ble 26f\n"
421 "25:" // Width 4: Multiply loop: Main loop head
422 "ld1w { z1.s }, p2/Z, [%x[B_ptr]]\n"
423 "whilelt p0.s, XZR, x21\n"
424 "ld1w { z2.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
425 "sub x21, x21, #0x4\n"
426 "ld1rqw { z0.s }, p0/Z, [x20]\n"
427 "fmla z24.s, z1.s, z0.s[0]\n"
428 "ld1w { z3.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
429 "add x20, x20, #0x10\n"
430 "fmla z25.s, z2.s, z0.s[0]\n"
431 "ld1w { z4.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
432 "addvl %x[B_ptr], %x[B_ptr], #8\n"
433 "fmla z26.s, z3.s, z0.s[0]\n"
434 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
435 "cmp x21, #0x4\n"
436 "fmla z27.s, z4.s, z0.s[0]\n"
437 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
438 "ld1w { z5.s }, p2/Z, [%x[B_ptr]]\n"
439 "fmla z24.s, z5.s, z0.s[1]\n"
440 "ld1w { z6.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
441 "ld1w { z7.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
442 "fmla z25.s, z6.s, z0.s[1]\n"
443 "ld1w { z8.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
444 "addvl %x[B_ptr], %x[B_ptr], #8\n"
445 "fmla z26.s, z7.s, z0.s[1]\n"
446 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
447 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
448 "fmla z27.s, z8.s, z0.s[1]\n"
449 "ld1w { z9.s }, p2/Z, [%x[B_ptr]]\n"
450 "ld1w { z10.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
451 "fmla z24.s, z9.s, z0.s[2]\n"
452 "ld1w { z11.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
453 "fmla z25.s, z10.s, z0.s[2]\n"
454 "ld1w { z12.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
455 "addvl %x[B_ptr], %x[B_ptr], #8\n"
456 "fmla z26.s, z11.s, z0.s[2]\n"
457 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
458 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
459 "fmla z27.s, z12.s, z0.s[2]\n"
460 "ld1w { z13.s }, p2/Z, [%x[B_ptr]]\n"
461 "fmla z24.s, z13.s, z0.s[3]\n"
462 "ld1w { z14.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
463 "ld1w { z15.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
464 "fmla z25.s, z14.s, z0.s[3]\n"
465 "ld1w { z16.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
466 "addvl %x[B_ptr], %x[B_ptr], #8\n"
467 "fmla z26.s, z15.s, z0.s[3]\n"
468 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
469 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
470 "fmla z27.s, z16.s, z0.s[3]\n"
471 "prfm pldl1keep, [x20, #0x80]\n"
472 "bgt 25b\n"
473 "26:" // Width 4: Multiply loop: Single iteration only
474 "ld1w { z17.s }, p2/Z, [%x[B_ptr]]\n"
475 "whilelt p0.s, XZR, x21\n"
476 "ld1w { z18.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
477 "subs x21, x21, #0x1\n"
478 "ld1rqw { z0.s }, p0/Z, [x20]\n"
479 "fmla z24.s, z17.s, z0.s[0]\n"
480 "ld1w { z19.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
481 "add x20, x20, #0x10\n"
482 "fmla z25.s, z18.s, z0.s[0]\n"
483 "ld1w { z20.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
484 "addvl %x[B_ptr], %x[B_ptr], #8\n"
485 "fmla z26.s, z19.s, z0.s[0]\n"
486 "fmla z27.s, z20.s, z0.s[0]\n"
487 "ble 27f\n"
488 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
489 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
490 "subs x21, x21, #0x1\n"
491 "ld1w { z21.s }, p2/Z, [%x[B_ptr]]\n"
492 "fmla z24.s, z21.s, z0.s[1]\n"
493 "ld1w { z22.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
494 "ld1w { z23.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
495 "fmla z25.s, z22.s, z0.s[1]\n"
496 "ld1w { z1.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
497 "addvl %x[B_ptr], %x[B_ptr], #8\n"
498 "fmla z26.s, z23.s, z0.s[1]\n"
499 "fmla z27.s, z1.s, z0.s[1]\n"
500 "ble 27f\n"
501 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
502 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
503 "subs x21, x21, #0x1\n"
504 "ld1w { z2.s }, p2/Z, [%x[B_ptr]]\n"
505 "fmla z24.s, z2.s, z0.s[2]\n"
506 "ld1w { z3.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
507 "ld1w { z4.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
508 "fmla z25.s, z3.s, z0.s[2]\n"
509 "ld1w { z5.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
510 "addvl %x[B_ptr], %x[B_ptr], #8\n"
511 "fmla z26.s, z4.s, z0.s[2]\n"
512 "fmla z27.s, z5.s, z0.s[2]\n"
513 "ble 27f\n"
514 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
515 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
516 "ld1w { z6.s }, p2/Z, [%x[B_ptr]]\n"
517 "fmla z24.s, z6.s, z0.s[3]\n"
518 "ld1w { z7.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
519 "ld1w { z8.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
520 "fmla z25.s, z7.s, z0.s[3]\n"
521 "ld1w { z9.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
522 "addvl %x[B_ptr], %x[B_ptr], #8\n"
523 "fmla z26.s, z8.s, z0.s[3]\n"
524 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
525 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
526 "fmla z27.s, z9.s, z0.s[3]\n"
527 "27:" // Width 4: Multiply loop: multiply skip
528 "prfm pldl1keep, [x20, #0x80]\n"
529 "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
530 "tbz %x[flags], #1, 28f\n"
531 "add x19, %x[args_ptr], %[offset_min]\n"
532 "ld1rw { z17.s }, p2/Z, [x19]\n"
533 "add x19, %x[args_ptr], %[offset_max]\n"
534 "ld1rw { z16.s }, p2/Z, [x19]\n"
535 "fmin z24.s, p2/M, z24.s, z16.s\n"
536 "fmin z25.s, p2/M, z25.s, z16.s\n"
537 "fmin z26.s, p2/M, z26.s, z16.s\n"
538 "fmin z27.s, p2/M, z27.s, z16.s\n"
539 "fmax z24.s, p2/M, z24.s, z17.s\n"
540 "fmax z25.s, p2/M, z25.s, z17.s\n"
541 "fmax z26.s, p2/M, z26.s, z17.s\n"
542 "fmax z27.s, p2/M, z27.s, z17.s\n"
543 "28:" // Width 4: No activation
544 "st1w { z24.s }, p2, [%x[output_ptr]]\n"
545 "st1w { z25.s }, p2, [%x[output_ptr], #1, MUL VL]\n"
546 "st1w { z26.s }, p2, [%x[output_ptr], #2, MUL VL]\n"
547 "st1w { z27.s }, p1, [%x[output_ptr], #3, MUL VL]\n"
548 "addvl %x[output_ptr], %x[output_ptr], #4\n"
549 "b 57f\n"
550 "29:" // Width 5
551 "mov x21, %x[K]\n"
552 "mov x20, %x[A_ptr]\n"
553 "mov x19, #0x4\n"
554 "msub x19, x24, x19, %x[N]\n"
555 "whilelt p1.s, XZR, x19\n"
556 "cbz x22, 30f\n"
557 "ld1w { z24.s }, p2/Z, [x22]\n"
558 "ld1w { z25.s }, p2/Z, [x22, #1, MUL VL]\n"
559 "ld1w { z26.s }, p2/Z, [x22, #2, MUL VL]\n"
560 "ld1w { z27.s }, p2/Z, [x22, #3, MUL VL]\n"
561 "ld1w { z28.s }, p2/Z, [x22, #4, MUL VL]\n"
562 "addvl x22, x22, #5\n"
563 "b 31f\n"
564 "30:" // Width 5: no bias
565 "mov z24.b, #0x0\n"
566 "mov z25.b, #0x0\n"
567 "mov z26.b, #0x0\n"
568 "mov z27.b, #0x0\n"
569 "mov z28.b, #0x0\n"
570 "31:" // Width 5: setup done
571 "cmp x21, #0x4\n"
572 "ble 33f\n"
573 "32:" // Width 5: Multiply loop: Main loop head
574 "ld1w { z1.s }, p2/Z, [%x[B_ptr]]\n"
575 "whilelt p0.s, XZR, x21\n"
576 "ld1w { z2.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
577 "sub x21, x21, #0x4\n"
578 "ld1rqw { z0.s }, p0/Z, [x20]\n"
579 "fmla z24.s, z1.s, z0.s[0]\n"
580 "ld1w { z3.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
581 "add x20, x20, #0x10\n"
582 "fmla z25.s, z2.s, z0.s[0]\n"
583 "ld1w { z4.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
584 "cmp x21, #0x4\n"
585 "fmla z26.s, z3.s, z0.s[0]\n"
586 "ld1w { z5.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
587 "addvl %x[B_ptr], %x[B_ptr], #8\n"
588 "fmla z27.s, z4.s, z0.s[0]\n"
589 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
590 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
591 "fmla z28.s, z5.s, z0.s[0]\n"
592 "ld1w { z6.s }, p2/Z, [%x[B_ptr]]\n"
593 "ld1w { z7.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
594 "fmla z24.s, z6.s, z0.s[1]\n"
595 "ld1w { z8.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
596 "fmla z25.s, z7.s, z0.s[1]\n"
597 "ld1w { z9.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
598 "ld1w { z10.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
599 "fmla z26.s, z8.s, z0.s[1]\n"
600 "addvl %x[B_ptr], %x[B_ptr], #8\n"
601 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
602 "fmla z27.s, z9.s, z0.s[1]\n"
603 "fmla z28.s, z10.s, z0.s[1]\n"
604 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
605 "ld1w { z11.s }, p2/Z, [%x[B_ptr]]\n"
606 "fmla z24.s, z11.s, z0.s[2]\n"
607 "ld1w { z12.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
608 "ld1w { z13.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
609 "fmla z25.s, z12.s, z0.s[2]\n"
610 "ld1w { z14.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
611 "fmla z26.s, z13.s, z0.s[2]\n"
612 "ld1w { z15.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
613 "addvl %x[B_ptr], %x[B_ptr], #8\n"
614 "fmla z27.s, z14.s, z0.s[2]\n"
615 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
616 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
617 "fmla z28.s, z15.s, z0.s[2]\n"
618 "ld1w { z16.s }, p2/Z, [%x[B_ptr]]\n"
619 "fmla z24.s, z16.s, z0.s[3]\n"
620 "ld1w { z17.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
621 "ld1w { z18.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
622 "fmla z25.s, z17.s, z0.s[3]\n"
623 "ld1w { z19.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
624 "fmla z26.s, z18.s, z0.s[3]\n"
625 "ld1w { z20.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
626 "addvl %x[B_ptr], %x[B_ptr], #8\n"
627 "fmla z27.s, z19.s, z0.s[3]\n"
628 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
629 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
630 "fmla z28.s, z20.s, z0.s[3]\n"
631 "prfm pldl1keep, [x20, #0x80]\n"
632 "bgt 32b\n"
633 "33:" // Width 5: Multiply loop: Single iteration only
634 "ld1w { z21.s }, p2/Z, [%x[B_ptr]]\n"
635 "whilelt p0.s, XZR, x21\n"
636 "ld1w { z22.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
637 "subs x21, x21, #0x1\n"
638 "ld1rqw { z0.s }, p0/Z, [x20]\n"
639 "fmla z24.s, z21.s, z0.s[0]\n"
640 "ld1w { z23.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
641 "add x20, x20, #0x10\n"
642 "fmla z25.s, z22.s, z0.s[0]\n"
643 "ld1w { z1.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
644 "ld1w { z2.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
645 "fmla z26.s, z23.s, z0.s[0]\n"
646 "addvl %x[B_ptr], %x[B_ptr], #8\n"
647 "fmla z27.s, z1.s, z0.s[0]\n"
648 "fmla z28.s, z2.s, z0.s[0]\n"
649 "ble 34f\n"
650 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
651 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
652 "subs x21, x21, #0x1\n"
653 "ld1w { z3.s }, p2/Z, [%x[B_ptr]]\n"
654 "fmla z24.s, z3.s, z0.s[1]\n"
655 "ld1w { z4.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
656 "ld1w { z5.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
657 "fmla z25.s, z4.s, z0.s[1]\n"
658 "ld1w { z6.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
659 "fmla z26.s, z5.s, z0.s[1]\n"
660 "ld1w { z7.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
661 "addvl %x[B_ptr], %x[B_ptr], #8\n"
662 "fmla z27.s, z6.s, z0.s[1]\n"
663 "fmla z28.s, z7.s, z0.s[1]\n"
664 "ble 34f\n"
665 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
666 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
667 "subs x21, x21, #0x1\n"
668 "ld1w { z8.s }, p2/Z, [%x[B_ptr]]\n"
669 "fmla z24.s, z8.s, z0.s[2]\n"
670 "ld1w { z9.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
671 "ld1w { z10.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
672 "fmla z25.s, z9.s, z0.s[2]\n"
673 "ld1w { z11.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
674 "fmla z26.s, z10.s, z0.s[2]\n"
675 "ld1w { z12.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
676 "addvl %x[B_ptr], %x[B_ptr], #8\n"
677 "fmla z27.s, z11.s, z0.s[2]\n"
678 "fmla z28.s, z12.s, z0.s[2]\n"
679 "ble 34f\n"
680 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
681 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
682 "ld1w { z13.s }, p2/Z, [%x[B_ptr]]\n"
683 "fmla z24.s, z13.s, z0.s[3]\n"
684 "ld1w { z14.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
685 "ld1w { z15.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
686 "fmla z25.s, z14.s, z0.s[3]\n"
687 "ld1w { z16.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
688 "fmla z26.s, z15.s, z0.s[3]\n"
689 "ld1w { z17.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
690 "addvl %x[B_ptr], %x[B_ptr], #8\n"
691 "fmla z27.s, z16.s, z0.s[3]\n"
692 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
693 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
694 "fmla z28.s, z17.s, z0.s[3]\n"
695 "34:" // Width 5: Multiply loop: multiply skip
696 "prfm pldl1keep, [x20, #0x80]\n"
697 "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
698 "tbz %x[flags], #1, 35f\n"
699 "add x19, %x[args_ptr], %[offset_min]\n"
700 "ld1rw { z17.s }, p2/Z, [x19]\n"
701 "add x19, %x[args_ptr], %[offset_max]\n"
702 "ld1rw { z16.s }, p2/Z, [x19]\n"
703 "fmin z24.s, p2/M, z24.s, z16.s\n"
704 "fmin z25.s, p2/M, z25.s, z16.s\n"
705 "fmin z26.s, p2/M, z26.s, z16.s\n"
706 "fmin z27.s, p2/M, z27.s, z16.s\n"
707 "fmin z28.s, p2/M, z28.s, z16.s\n"
708 "fmax z24.s, p2/M, z24.s, z17.s\n"
709 "fmax z25.s, p2/M, z25.s, z17.s\n"
710 "fmax z26.s, p2/M, z26.s, z17.s\n"
711 "fmax z27.s, p2/M, z27.s, z17.s\n"
712 "fmax z28.s, p2/M, z28.s, z17.s\n"
713 "35:" // Width 5: No activation
714 "st1w { z24.s }, p2, [%x[output_ptr]]\n"
715 "st1w { z25.s }, p2, [%x[output_ptr], #1, MUL VL]\n"
716 "st1w { z26.s }, p2, [%x[output_ptr], #2, MUL VL]\n"
717 "st1w { z27.s }, p2, [%x[output_ptr], #3, MUL VL]\n"
718 "st1w { z28.s }, p1, [%x[output_ptr], #4, MUL VL]\n"
719 "addvl %x[output_ptr], %x[output_ptr], #5\n"
720 "b 57f\n"
721 "36:" // Width 6
722 "mov x21, %x[K]\n"
723 "mov x20, %x[A_ptr]\n"
724 "mov x19, #0x5\n"
725 "msub x19, x24, x19, %x[N]\n"
726 "whilelt p1.s, XZR, x19\n"
727 "cbz x22, 37f\n"
728 "ld1w { z24.s }, p2/Z, [x22]\n"
729 "ld1w { z25.s }, p2/Z, [x22, #1, MUL VL]\n"
730 "ld1w { z26.s }, p2/Z, [x22, #2, MUL VL]\n"
731 "ld1w { z27.s }, p2/Z, [x22, #3, MUL VL]\n"
732 "ld1w { z28.s }, p2/Z, [x22, #4, MUL VL]\n"
733 "ld1w { z29.s }, p2/Z, [x22, #5, MUL VL]\n"
734 "addvl x22, x22, #6\n"
735 "b 38f\n"
736 "37:" // Width 6: no bias
737 "mov z24.b, #0x0\n"
738 "mov z25.b, #0x0\n"
739 "mov z26.b, #0x0\n"
740 "mov z27.b, #0x0\n"
741 "mov z28.b, #0x0\n"
742 "mov z29.b, #0x0\n"
743 "38:" // Width 6: setup done
744 "cmp x21, #0x4\n"
745 "ble 40f\n"
746 "39:" // Width 6: Multiply loop: Main loop head
747 "ld1w { z1.s }, p2/Z, [%x[B_ptr]]\n"
748 "whilelt p0.s, XZR, x21\n"
749 "ld1w { z2.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
750 "sub x21, x21, #0x4\n"
751 "ld1rqw { z0.s }, p0/Z, [x20]\n"
752 "fmla z24.s, z1.s, z0.s[0]\n"
753 "ld1w { z3.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
754 "add x20, x20, #0x10\n"
755 "fmla z25.s, z2.s, z0.s[0]\n"
756 "ld1w { z4.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
757 "cmp x21, #0x4\n"
758 "fmla z26.s, z3.s, z0.s[0]\n"
759 "ld1w { z5.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
760 "ld1w { z6.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
761 "fmla z27.s, z4.s, z0.s[0]\n"
762 "addvl %x[B_ptr], %x[B_ptr], #8\n"
763 "fmla z28.s, z5.s, z0.s[0]\n"
764 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
765 "fmla z29.s, z6.s, z0.s[0]\n"
766 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
767 "ld1w { z7.s }, p2/Z, [%x[B_ptr]]\n"
768 "fmla z24.s, z7.s, z0.s[1]\n"
769 "ld1w { z8.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
770 "ld1w { z9.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
771 "fmla z25.s, z8.s, z0.s[1]\n"
772 "ld1w { z10.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
773 "fmla z26.s, z9.s, z0.s[1]\n"
774 "ld1w { z11.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
775 "ld1w { z12.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
776 "fmla z27.s, z10.s, z0.s[1]\n"
777 "addvl %x[B_ptr], %x[B_ptr], #8\n"
778 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
779 "fmla z28.s, z11.s, z0.s[1]\n"
780 "fmla z29.s, z12.s, z0.s[1]\n"
781 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
782 "ld1w { z13.s }, p2/Z, [%x[B_ptr]]\n"
783 "fmla z24.s, z13.s, z0.s[2]\n"
784 "ld1w { z14.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
785 "ld1w { z15.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
786 "fmla z25.s, z14.s, z0.s[2]\n"
787 "ld1w { z16.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
788 "fmla z26.s, z15.s, z0.s[2]\n"
789 "ld1w { z17.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
790 "ld1w { z18.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
791 "fmla z27.s, z16.s, z0.s[2]\n"
792 "addvl %x[B_ptr], %x[B_ptr], #8\n"
793 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
794 "fmla z28.s, z17.s, z0.s[2]\n"
795 "fmla z29.s, z18.s, z0.s[2]\n"
796 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
797 "ld1w { z19.s }, p2/Z, [%x[B_ptr]]\n"
798 "fmla z24.s, z19.s, z0.s[3]\n"
799 "ld1w { z20.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
800 "ld1w { z21.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
801 "fmla z25.s, z20.s, z0.s[3]\n"
802 "ld1w { z22.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
803 "fmla z26.s, z21.s, z0.s[3]\n"
804 "ld1w { z23.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
805 "ld1w { z1.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
806 "fmla z27.s, z22.s, z0.s[3]\n"
807 "addvl %x[B_ptr], %x[B_ptr], #8\n"
808 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
809 "fmla z28.s, z23.s, z0.s[3]\n"
810 "fmla z29.s, z1.s, z0.s[3]\n"
811 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
812 "prfm pldl1keep, [x20, #0x80]\n"
813 "bgt 39b\n"
814 "40:" // Width 6: Multiply loop: Single iteration only
815 "ld1w { z2.s }, p2/Z, [%x[B_ptr]]\n"
816 "whilelt p0.s, XZR, x21\n"
817 "ld1w { z3.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
818 "subs x21, x21, #0x1\n"
819 "ld1rqw { z0.s }, p0/Z, [x20]\n"
820 "fmla z24.s, z2.s, z0.s[0]\n"
821 "ld1w { z4.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
822 "add x20, x20, #0x10\n"
823 "fmla z25.s, z3.s, z0.s[0]\n"
824 "ld1w { z5.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
825 "ld1w { z6.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
826 "fmla z26.s, z4.s, z0.s[0]\n"
827 "ld1w { z7.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
828 "addvl %x[B_ptr], %x[B_ptr], #8\n"
829 "fmla z27.s, z5.s, z0.s[0]\n"
830 "fmla z28.s, z6.s, z0.s[0]\n"
831 "fmla z29.s, z7.s, z0.s[0]\n"
832 "ble 41f\n"
833 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
834 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
835 "subs x21, x21, #0x1\n"
836 "ld1w { z8.s }, p2/Z, [%x[B_ptr]]\n"
837 "fmla z24.s, z8.s, z0.s[1]\n"
838 "ld1w { z9.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
839 "ld1w { z10.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
840 "fmla z25.s, z9.s, z0.s[1]\n"
841 "ld1w { z11.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
842 "fmla z26.s, z10.s, z0.s[1]\n"
843 "ld1w { z12.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
844 "ld1w { z13.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
845 "fmla z27.s, z11.s, z0.s[1]\n"
846 "addvl %x[B_ptr], %x[B_ptr], #8\n"
847 "fmla z28.s, z12.s, z0.s[1]\n"
848 "fmla z29.s, z13.s, z0.s[1]\n"
849 "ble 41f\n"
850 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
851 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
852 "subs x21, x21, #0x1\n"
853 "ld1w { z14.s }, p2/Z, [%x[B_ptr]]\n"
854 "fmla z24.s, z14.s, z0.s[2]\n"
855 "ld1w { z15.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
856 "ld1w { z16.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
857 "fmla z25.s, z15.s, z0.s[2]\n"
858 "ld1w { z17.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
859 "fmla z26.s, z16.s, z0.s[2]\n"
860 "ld1w { z18.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
861 "ld1w { z19.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
862 "fmla z27.s, z17.s, z0.s[2]\n"
863 "addvl %x[B_ptr], %x[B_ptr], #8\n"
864 "fmla z28.s, z18.s, z0.s[2]\n"
865 "fmla z29.s, z19.s, z0.s[2]\n"
866 "ble 41f\n"
867 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
868 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
869 "ld1w { z20.s }, p2/Z, [%x[B_ptr]]\n"
870 "fmla z24.s, z20.s, z0.s[3]\n"
871 "ld1w { z21.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
872 "ld1w { z22.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
873 "fmla z25.s, z21.s, z0.s[3]\n"
874 "ld1w { z23.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
875 "fmla z26.s, z22.s, z0.s[3]\n"
876 "ld1w { z1.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
877 "ld1w { z2.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
878 "fmla z27.s, z23.s, z0.s[3]\n"
879 "addvl %x[B_ptr], %x[B_ptr], #8\n"
880 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
881 "fmla z28.s, z1.s, z0.s[3]\n"
882 "fmla z29.s, z2.s, z0.s[3]\n"
883 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
884 "41:" // Width 6: Multiply loop: multiply skip
885 "prfm pldl1keep, [x20, #0x80]\n"
886 "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
887 "tbz %x[flags], #1, 42f\n"
888 "add x19, %x[args_ptr], %[offset_min]\n"
889 "ld1rw { z17.s }, p2/Z, [x19]\n"
890 "add x19, %x[args_ptr], %[offset_max]\n"
891 "ld1rw { z16.s }, p2/Z, [x19]\n"
892 "fmin z24.s, p2/M, z24.s, z16.s\n"
893 "fmin z25.s, p2/M, z25.s, z16.s\n"
894 "fmin z26.s, p2/M, z26.s, z16.s\n"
895 "fmin z27.s, p2/M, z27.s, z16.s\n"
896 "fmin z28.s, p2/M, z28.s, z16.s\n"
897 "fmax z24.s, p2/M, z24.s, z17.s\n"
898 "fmax z25.s, p2/M, z25.s, z17.s\n"
899 "fmax z26.s, p2/M, z26.s, z17.s\n"
900 "fmax z27.s, p2/M, z27.s, z17.s\n"
901 "fmax z28.s, p2/M, z28.s, z17.s\n"
902 "fmin z29.s, p2/M, z29.s, z16.s\n"
903 "fmax z29.s, p2/M, z29.s, z17.s\n"
904 "42:" // Width 6: No activation
905 "st1w { z24.s }, p2, [%x[output_ptr]]\n"
906 "st1w { z25.s }, p2, [%x[output_ptr], #1, MUL VL]\n"
907 "st1w { z26.s }, p2, [%x[output_ptr], #2, MUL VL]\n"
908 "st1w { z27.s }, p2, [%x[output_ptr], #3, MUL VL]\n"
909 "st1w { z28.s }, p2, [%x[output_ptr], #4, MUL VL]\n"
910 "st1w { z29.s }, p1, [%x[output_ptr], #5, MUL VL]\n"
911 "addvl %x[output_ptr], %x[output_ptr], #6\n"
912 "b 57f\n"
913 "43:" // Width 7
914 "mov x21, %x[K]\n"
915 "mov x20, %x[A_ptr]\n"
916 "mov x19, #0x6\n"
917 "msub x19, x24, x19, %x[N]\n"
918 "whilelt p1.s, XZR, x19\n"
919 "cbz x22, 44f\n"
920 "ld1w { z24.s }, p2/Z, [x22]\n"
921 "ld1w { z25.s }, p2/Z, [x22, #1, MUL VL]\n"
922 "ld1w { z26.s }, p2/Z, [x22, #2, MUL VL]\n"
923 "ld1w { z27.s }, p2/Z, [x22, #3, MUL VL]\n"
924 "ld1w { z28.s }, p2/Z, [x22, #4, MUL VL]\n"
925 "ld1w { z29.s }, p2/Z, [x22, #5, MUL VL]\n"
926 "ld1w { z30.s }, p2/Z, [x22, #6, MUL VL]\n"
927 "addvl x22, x22, #7\n"
928 "b 45f\n"
929 "44:" // Width 7: no bias
930 "mov z24.b, #0x0\n"
931 "mov z25.b, #0x0\n"
932 "mov z26.b, #0x0\n"
933 "mov z27.b, #0x0\n"
934 "mov z28.b, #0x0\n"
935 "mov z29.b, #0x0\n"
936 "mov z30.b, #0x0\n"
937 "45:" // Width 7: setup done
938 "cmp x21, #0x4\n"
939 "ble 47f\n"
940 "46:" // Width 7: Multiply loop: Main loop head
941 "ld1w { z1.s }, p2/Z, [%x[B_ptr]]\n"
942 "whilelt p0.s, XZR, x21\n"
943 "ld1w { z2.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
944 "sub x21, x21, #0x4\n"
945 "ld1rqw { z0.s }, p0/Z, [x20]\n"
946 "fmla z24.s, z1.s, z0.s[0]\n"
947 "ld1w { z3.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
948 "add x20, x20, #0x10\n"
949 "fmla z25.s, z2.s, z0.s[0]\n"
950 "ld1w { z4.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
951 "cmp x21, #0x4\n"
952 "fmla z26.s, z3.s, z0.s[0]\n"
953 "ld1w { z5.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
954 "ld1w { z6.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
955 "fmla z27.s, z4.s, z0.s[0]\n"
956 "ld1w { z7.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
957 "fmla z28.s, z5.s, z0.s[0]\n"
958 "addvl %x[B_ptr], %x[B_ptr], #8\n"
959 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
960 "fmla z29.s, z6.s, z0.s[0]\n"
961 "fmla z30.s, z7.s, z0.s[0]\n"
962 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
963 "ld1w { z8.s }, p2/Z, [%x[B_ptr]]\n"
964 "fmla z24.s, z8.s, z0.s[1]\n"
965 "ld1w { z9.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
966 "ld1w { z10.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
967 "fmla z25.s, z9.s, z0.s[1]\n"
968 "ld1w { z11.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
969 "fmla z26.s, z10.s, z0.s[1]\n"
970 "ld1w { z12.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
971 "ld1w { z13.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
972 "fmla z27.s, z11.s, z0.s[1]\n"
973 "ld1w { z14.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
974 "addvl %x[B_ptr], %x[B_ptr], #8\n"
975 "fmla z28.s, z12.s, z0.s[1]\n"
976 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
977 "fmla z29.s, z13.s, z0.s[1]\n"
978 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
979 "ld1w { z15.s }, p2/Z, [%x[B_ptr]]\n"
980 "fmla z30.s, z14.s, z0.s[1]\n"
981 "ld1w { z16.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
982 "fmla z24.s, z15.s, z0.s[2]\n"
983 "ld1w { z17.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
984 "ld1w { z18.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
985 "fmla z25.s, z16.s, z0.s[2]\n"
986 "ld1w { z19.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
987 "fmla z26.s, z17.s, z0.s[2]\n"
988 "ld1w { z20.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
989 "fmla z27.s, z18.s, z0.s[2]\n"
990 "ld1w { z21.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
991 "addvl %x[B_ptr], %x[B_ptr], #8\n"
992 "fmla z28.s, z19.s, z0.s[2]\n"
993 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
994 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
995 "fmla z29.s, z20.s, z0.s[2]\n"
996 "ld1w { z22.s }, p2/Z, [%x[B_ptr]]\n"
997 "fmla z30.s, z21.s, z0.s[2]\n"
998 "ld1w { z23.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
999 "fmla z24.s, z22.s, z0.s[3]\n"
1000 "ld1w { z1.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
1001 "ld1w { z2.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
1002 "fmla z25.s, z23.s, z0.s[3]\n"
1003 "ld1w { z3.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
1004 "fmla z26.s, z1.s, z0.s[3]\n"
1005 "ld1w { z4.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
1006 "fmla z27.s, z2.s, z0.s[3]\n"
1007 "ld1w { z5.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
1008 "addvl %x[B_ptr], %x[B_ptr], #8\n"
1009 "fmla z28.s, z3.s, z0.s[3]\n"
1010 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
1011 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
1012 "fmla z29.s, z4.s, z0.s[3]\n"
1013 "prfm pldl1keep, [x20, #0x80]\n"
1014 "fmla z30.s, z5.s, z0.s[3]\n"
1015 "bgt 46b\n"
1016 "47:" // Width 7: Multiply loop: Single iteration only
1017 "ld1w { z6.s }, p2/Z, [%x[B_ptr]]\n"
1018 "whilelt p0.s, XZR, x21\n"
1019 "ld1w { z7.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
1020 "subs x21, x21, #0x1\n"
1021 "ld1rqw { z0.s }, p0/Z, [x20]\n"
1022 "fmla z24.s, z6.s, z0.s[0]\n"
1023 "ld1w { z8.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
1024 "add x20, x20, #0x10\n"
1025 "fmla z25.s, z7.s, z0.s[0]\n"
1026 "ld1w { z9.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
1027 "ld1w { z10.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
1028 "fmla z26.s, z8.s, z0.s[0]\n"
1029 "ld1w { z11.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
1030 "ld1w { z12.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
1031 "fmla z27.s, z9.s, z0.s[0]\n"
1032 "addvl %x[B_ptr], %x[B_ptr], #8\n"
1033 "fmla z28.s, z10.s, z0.s[0]\n"
1034 "fmla z29.s, z11.s, z0.s[0]\n"
1035 "fmla z30.s, z12.s, z0.s[0]\n"
1036 "ble 48f\n"
1037 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
1038 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
1039 "subs x21, x21, #0x1\n"
1040 "ld1w { z13.s }, p2/Z, [%x[B_ptr]]\n"
1041 "fmla z24.s, z13.s, z0.s[1]\n"
1042 "ld1w { z14.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
1043 "ld1w { z15.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
1044 "fmla z25.s, z14.s, z0.s[1]\n"
1045 "ld1w { z16.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
1046 "fmla z26.s, z15.s, z0.s[1]\n"
1047 "ld1w { z17.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
1048 "ld1w { z18.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
1049 "fmla z27.s, z16.s, z0.s[1]\n"
1050 "ld1w { z19.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
1051 "addvl %x[B_ptr], %x[B_ptr], #8\n"
1052 "fmla z28.s, z17.s, z0.s[1]\n"
1053 "fmla z29.s, z18.s, z0.s[1]\n"
1054 "fmla z30.s, z19.s, z0.s[1]\n"
1055 "ble 48f\n"
1056 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
1057 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
1058 "subs x21, x21, #0x1\n"
1059 "ld1w { z20.s }, p2/Z, [%x[B_ptr]]\n"
1060 "fmla z24.s, z20.s, z0.s[2]\n"
1061 "ld1w { z21.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
1062 "ld1w { z22.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
1063 "fmla z25.s, z21.s, z0.s[2]\n"
1064 "ld1w { z23.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
1065 "fmla z26.s, z22.s, z0.s[2]\n"
1066 "ld1w { z1.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
1067 "ld1w { z2.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
1068 "fmla z27.s, z23.s, z0.s[2]\n"
1069 "ld1w { z3.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
1070 "addvl %x[B_ptr], %x[B_ptr], #8\n"
1071 "fmla z28.s, z1.s, z0.s[2]\n"
1072 "fmla z29.s, z2.s, z0.s[2]\n"
1073 "fmla z30.s, z3.s, z0.s[2]\n"
1074 "ble 48f\n"
1075 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
1076 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
1077 "ld1w { z4.s }, p2/Z, [%x[B_ptr]]\n"
1078 "fmla z24.s, z4.s, z0.s[3]\n"
1079 "ld1w { z5.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
1080 "ld1w { z6.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
1081 "fmla z25.s, z5.s, z0.s[3]\n"
1082 "ld1w { z7.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
1083 "fmla z26.s, z6.s, z0.s[3]\n"
1084 "ld1w { z8.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
1085 "ld1w { z9.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
1086 "fmla z27.s, z7.s, z0.s[3]\n"
1087 "ld1w { z10.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
1088 "addvl %x[B_ptr], %x[B_ptr], #8\n"
1089 "fmla z28.s, z8.s, z0.s[3]\n"
1090 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
1091 "fmla z29.s, z9.s, z0.s[3]\n"
1092 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
1093 "fmla z30.s, z10.s, z0.s[3]\n"
1094 "48:" // Width 7: Multiply loop: multiply skip
1095 "prfm pldl1keep, [x20, #0x80]\n"
1096 "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
1097 "tbz %x[flags], #1, 49f\n"
1098 "add x19, %x[args_ptr], %[offset_min]\n"
1099 "ld1rw { z17.s }, p2/Z, [x19]\n"
1100 "add x19, %x[args_ptr], %[offset_max]\n"
1101 "ld1rw { z16.s }, p2/Z, [x19]\n"
1102 "fmin z24.s, p2/M, z24.s, z16.s\n"
1103 "fmin z25.s, p2/M, z25.s, z16.s\n"
1104 "fmin z26.s, p2/M, z26.s, z16.s\n"
1105 "fmin z27.s, p2/M, z27.s, z16.s\n"
1106 "fmin z28.s, p2/M, z28.s, z16.s\n"
1107 "fmax z24.s, p2/M, z24.s, z17.s\n"
1108 "fmax z25.s, p2/M, z25.s, z17.s\n"
1109 "fmax z26.s, p2/M, z26.s, z17.s\n"
1110 "fmax z27.s, p2/M, z27.s, z17.s\n"
1111 "fmax z28.s, p2/M, z28.s, z17.s\n"
1112 "fmin z29.s, p2/M, z29.s, z16.s\n"
1113 "fmin z30.s, p2/M, z30.s, z16.s\n"
1114 "fmax z29.s, p2/M, z29.s, z17.s\n"
1115 "fmax z30.s, p2/M, z30.s, z17.s\n"
1116 "49:" // Width 7: No activation
1117 "st1w { z24.s }, p2, [%x[output_ptr]]\n"
1118 "st1w { z25.s }, p2, [%x[output_ptr], #1, MUL VL]\n"
1119 "st1w { z26.s }, p2, [%x[output_ptr], #2, MUL VL]\n"
1120 "st1w { z27.s }, p2, [%x[output_ptr], #3, MUL VL]\n"
1121 "st1w { z28.s }, p2, [%x[output_ptr], #4, MUL VL]\n"
1122 "st1w { z29.s }, p2, [%x[output_ptr], #5, MUL VL]\n"
1123 "st1w { z30.s }, p1, [%x[output_ptr], #6, MUL VL]\n"
1124 "addvl %x[output_ptr], %x[output_ptr], #7\n"
1125 "b 57f\n"
1126 "50:" // Width 8
1127 "mov x21, %x[K]\n"
1128 "mov x20, %x[A_ptr]\n"
1129 "mov x19, #0x7\n"
1130 "msub x19, x24, x19, %x[N]\n"
1131 "whilelt p1.s, XZR, x19\n"
1132 "cbz x22, 51f\n"
1133 "ld1w { z24.s }, p2/Z, [x22]\n"
1134 "ld1w { z25.s }, p2/Z, [x22, #1, MUL VL]\n"
1135 "ld1w { z26.s }, p2/Z, [x22, #2, MUL VL]\n"
1136 "ld1w { z27.s }, p2/Z, [x22, #3, MUL VL]\n"
1137 "ld1w { z28.s }, p2/Z, [x22, #4, MUL VL]\n"
1138 "ld1w { z29.s }, p2/Z, [x22, #5, MUL VL]\n"
1139 "ld1w { z30.s }, p2/Z, [x22, #6, MUL VL]\n"
1140 "ld1w { z31.s }, p2/Z, [x22, #7, MUL VL]\n"
1141 "addvl x22, x22, #8\n"
1142 "b 52f\n"
1143 "51:" // Width 8: no bias
1144 "mov z24.b, #0x0\n"
1145 "mov z25.b, #0x0\n"
1146 "mov z26.b, #0x0\n"
1147 "mov z27.b, #0x0\n"
1148 "mov z28.b, #0x0\n"
1149 "mov z29.b, #0x0\n"
1150 "mov z30.b, #0x0\n"
1151 "mov z31.b, #0x0\n"
1152 "52:" // Width 8: setup done
1153 "cmp x21, #0x4\n"
1154 "ble 54f\n"
1155 "53:" // Width 8: Multiply loop: Main loop head
1156 "ld1w { z1.s }, p2/Z, [%x[B_ptr]]\n"
1157 "whilelt p0.s, XZR, x21\n"
1158 "ld1w { z2.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
1159 "sub x21, x21, #0x4\n"
1160 "ld1rqw { z0.s }, p0/Z, [x20]\n"
1161 "fmla z24.s, z1.s, z0.s[0]\n"
1162 "ld1w { z3.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
1163 "add x20, x20, #0x10\n"
1164 "fmla z25.s, z2.s, z0.s[0]\n"
1165 "ld1w { z4.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
1166 "cmp x21, #0x4\n"
1167 "fmla z26.s, z3.s, z0.s[0]\n"
1168 "ld1w { z5.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
1169 "ld1w { z6.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
1170 "fmla z27.s, z4.s, z0.s[0]\n"
1171 "ld1w { z7.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
1172 "fmla z28.s, z5.s, z0.s[0]\n"
1173 "ld1w { z8.s }, p2/Z, [%x[B_ptr], #7, MUL VL]\n"
1174 "addvl %x[B_ptr], %x[B_ptr], #8\n"
1175 "fmla z29.s, z6.s, z0.s[0]\n"
1176 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
1177 "fmla z30.s, z7.s, z0.s[0]\n"
1178 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
1179 "ld1w { z9.s }, p2/Z, [%x[B_ptr]]\n"
1180 "fmla z31.s, z8.s, z0.s[0]\n"
1181 "ld1w { z10.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
1182 "fmla z24.s, z9.s, z0.s[1]\n"
1183 "ld1w { z11.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
1184 "ld1w { z12.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
1185 "fmla z25.s, z10.s, z0.s[1]\n"
1186 "ld1w { z13.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
1187 "fmla z26.s, z11.s, z0.s[1]\n"
1188 "ld1w { z14.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
1189 "fmla z27.s, z12.s, z0.s[1]\n"
1190 "ld1w { z15.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
1191 "fmla z28.s, z13.s, z0.s[1]\n"
1192 "ld1w { z16.s }, p2/Z, [%x[B_ptr], #7, MUL VL]\n"
1193 "addvl %x[B_ptr], %x[B_ptr], #8\n"
1194 "fmla z29.s, z14.s, z0.s[1]\n"
1195 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
1196 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
1197 "fmla z30.s, z15.s, z0.s[1]\n"
1198 "ld1w { z17.s }, p2/Z, [%x[B_ptr]]\n"
1199 "fmla z31.s, z16.s, z0.s[1]\n"
1200 "ld1w { z18.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
1201 "fmla z24.s, z17.s, z0.s[2]\n"
1202 "ld1w { z19.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
1203 "ld1w { z20.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
1204 "fmla z25.s, z18.s, z0.s[2]\n"
1205 "ld1w { z21.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
1206 "fmla z26.s, z19.s, z0.s[2]\n"
1207 "ld1w { z22.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
1208 "fmla z27.s, z20.s, z0.s[2]\n"
1209 "ld1w { z23.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
1210 "fmla z28.s, z21.s, z0.s[2]\n"
1211 "ld1w { z1.s }, p2/Z, [%x[B_ptr], #7, MUL VL]\n"
1212 "addvl %x[B_ptr], %x[B_ptr], #8\n"
1213 "fmla z29.s, z22.s, z0.s[2]\n"
1214 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
1215 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
1216 "fmla z30.s, z23.s, z0.s[2]\n"
1217 "ld1w { z2.s }, p2/Z, [%x[B_ptr]]\n"
1218 "fmla z31.s, z1.s, z0.s[2]\n"
1219 "ld1w { z3.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
1220 "fmla z24.s, z2.s, z0.s[3]\n"
1221 "ld1w { z4.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
1222 "ld1w { z5.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
1223 "fmla z25.s, z3.s, z0.s[3]\n"
1224 "ld1w { z6.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
1225 "fmla z26.s, z4.s, z0.s[3]\n"
1226 "ld1w { z7.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
1227 "fmla z27.s, z5.s, z0.s[3]\n"
1228 "ld1w { z8.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
1229 "fmla z28.s, z6.s, z0.s[3]\n"
1230 "ld1w { z9.s }, p2/Z, [%x[B_ptr], #7, MUL VL]\n"
1231 "addvl %x[B_ptr], %x[B_ptr], #8\n"
1232 "fmla z29.s, z7.s, z0.s[3]\n"
1233 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
1234 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
1235 "fmla z30.s, z8.s, z0.s[3]\n"
1236 "prfm pldl1keep, [x20, #0x80]\n"
1237 "fmla z31.s, z9.s, z0.s[3]\n"
1238 "bgt 53b\n"
1239 "54:" // Width 8: Multiply loop: Single iteration only
1240 "ld1w { z10.s }, p2/Z, [%x[B_ptr]]\n"
1241 "whilelt p0.s, XZR, x21\n"
1242 "ld1w { z11.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
1243 "subs x21, x21, #0x1\n"
1244 "ld1rqw { z0.s }, p0/Z, [x20]\n"
1245 "fmla z24.s, z10.s, z0.s[0]\n"
1246 "ld1w { z12.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
1247 "add x20, x20, #0x10\n"
1248 "fmla z25.s, z11.s, z0.s[0]\n"
1249 "ld1w { z13.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
1250 "ld1w { z14.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
1251 "fmla z26.s, z12.s, z0.s[0]\n"
1252 "ld1w { z15.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
1253 "ld1w { z16.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
1254 "fmla z27.s, z13.s, z0.s[0]\n"
1255 "fmla z28.s, z14.s, z0.s[0]\n"
1256 "ld1w { z17.s }, p2/Z, [%x[B_ptr], #7, MUL VL]\n"
1257 "addvl %x[B_ptr], %x[B_ptr], #8\n"
1258 "fmla z29.s, z15.s, z0.s[0]\n"
1259 "fmla z30.s, z16.s, z0.s[0]\n"
1260 "fmla z31.s, z17.s, z0.s[0]\n"
1261 "ble 55f\n"
1262 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
1263 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
1264 "subs x21, x21, #0x1\n"
1265 "ld1w { z18.s }, p2/Z, [%x[B_ptr]]\n"
1266 "fmla z24.s, z18.s, z0.s[1]\n"
1267 "ld1w { z19.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
1268 "ld1w { z20.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
1269 "fmla z25.s, z19.s, z0.s[1]\n"
1270 "ld1w { z21.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
1271 "fmla z26.s, z20.s, z0.s[1]\n"
1272 "ld1w { z22.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
1273 "ld1w { z23.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
1274 "fmla z27.s, z21.s, z0.s[1]\n"
1275 "ld1w { z1.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
1276 "ld1w { z2.s }, p2/Z, [%x[B_ptr], #7, MUL VL]\n"
1277 "fmla z28.s, z22.s, z0.s[1]\n"
1278 "addvl %x[B_ptr], %x[B_ptr], #8\n"
1279 "fmla z29.s, z23.s, z0.s[1]\n"
1280 "fmla z30.s, z1.s, z0.s[1]\n"
1281 "fmla z31.s, z2.s, z0.s[1]\n"
1282 "ble 55f\n"
1283 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
1284 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
1285 "subs x21, x21, #0x1\n"
1286 "ld1w { z3.s }, p2/Z, [%x[B_ptr]]\n"
1287 "fmla z24.s, z3.s, z0.s[2]\n"
1288 "ld1w { z4.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
1289 "ld1w { z5.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
1290 "fmla z25.s, z4.s, z0.s[2]\n"
1291 "ld1w { z6.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
1292 "fmla z26.s, z5.s, z0.s[2]\n"
1293 "ld1w { z7.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
1294 "ld1w { z8.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
1295 "fmla z27.s, z6.s, z0.s[2]\n"
1296 "ld1w { z9.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
1297 "ld1w { z10.s }, p2/Z, [%x[B_ptr], #7, MUL VL]\n"
1298 "fmla z28.s, z7.s, z0.s[2]\n"
1299 "addvl %x[B_ptr], %x[B_ptr], #8\n"
1300 "fmla z29.s, z8.s, z0.s[2]\n"
1301 "fmla z30.s, z9.s, z0.s[2]\n"
1302 "fmla z31.s, z10.s, z0.s[2]\n"
1303 "ble 55f\n"
1304 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
1305 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
1306 "ld1w { z11.s }, p2/Z, [%x[B_ptr]]\n"
1307 "fmla z24.s, z11.s, z0.s[3]\n"
1308 "ld1w { z12.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
1309 "ld1w { z13.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
1310 "fmla z25.s, z12.s, z0.s[3]\n"
1311 "ld1w { z14.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
1312 "fmla z26.s, z13.s, z0.s[3]\n"
1313 "ld1w { z15.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
1314 "ld1w { z16.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
1315 "fmla z27.s, z14.s, z0.s[3]\n"
1316 "ld1w { z17.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
1317 "ld1w { z18.s }, p2/Z, [%x[B_ptr], #7, MUL VL]\n"
1318 "fmla z28.s, z15.s, z0.s[3]\n"
1319 "addvl %x[B_ptr], %x[B_ptr], #8\n"
1320 "fmla z29.s, z16.s, z0.s[3]\n"
1321 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
1322 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
1323 "fmla z30.s, z17.s, z0.s[3]\n"
1324 "fmla z31.s, z18.s, z0.s[3]\n"
1325 "55:" // Width 8: Multiply loop: multiply skip
1326 "prfm pldl1keep, [x20, #0x80]\n"
1327 "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
1328 "tbz %x[flags], #1, 56f\n"
1329 "add x19, %x[args_ptr], %[offset_min]\n"
1330 "ld1rw { z17.s }, p2/Z, [x19]\n"
1331 "add x19, %x[args_ptr], %[offset_max]\n"
1332 "ld1rw { z16.s }, p2/Z, [x19]\n"
1333 "fmin z24.s, p2/M, z24.s, z16.s\n"
1334 "fmin z25.s, p2/M, z25.s, z16.s\n"
1335 "fmin z26.s, p2/M, z26.s, z16.s\n"
1336 "fmin z27.s, p2/M, z27.s, z16.s\n"
1337 "fmin z28.s, p2/M, z28.s, z16.s\n"
1338 "fmax z24.s, p2/M, z24.s, z17.s\n"
1339 "fmax z25.s, p2/M, z25.s, z17.s\n"
1340 "fmax z26.s, p2/M, z26.s, z17.s\n"
1341 "fmax z27.s, p2/M, z27.s, z17.s\n"
1342 "fmax z28.s, p2/M, z28.s, z17.s\n"
1343 "fmin z29.s, p2/M, z29.s, z16.s\n"
1344 "fmin z30.s, p2/M, z30.s, z16.s\n"
1345 "fmin z31.s, p2/M, z31.s, z16.s\n"
1346 "fmax z29.s, p2/M, z29.s, z17.s\n"
1347 "fmax z30.s, p2/M, z30.s, z17.s\n"
1348 "fmax z31.s, p2/M, z31.s, z17.s\n"
1349 "56:" // Width 8: No activation
1350 "st1w { z24.s }, p2, [%x[output_ptr]]\n"
1351 "subs x23, x23, #0x8\n"
1352 "st1w { z25.s }, p2, [%x[output_ptr], #1, MUL VL]\n"
1353 "sub %x[N], %x[N], x24, LSL #3\n"
1354 "st1w { z26.s }, p2, [%x[output_ptr], #2, MUL VL]\n"
1355 "st1w { z27.s }, p2, [%x[output_ptr], #3, MUL VL]\n"
1356 "st1w { z28.s }, p2, [%x[output_ptr], #4, MUL VL]\n"
1357 "st1w { z29.s }, p2, [%x[output_ptr], #5, MUL VL]\n"
1358 "st1w { z30.s }, p2, [%x[output_ptr], #6, MUL VL]\n"
1359 "st1w { z31.s }, p1, [%x[output_ptr], #7, MUL VL]\n"
1360 "addvl %x[output_ptr], %x[output_ptr], #8\n"
1361 "bgt 1b\n"
1362 "57:" // Exit
1363
1364 : [B_ptr] "+r" (B_ptr), [N] "+r" (N), [output_ptr] "+r" (output_ptr)
1365 : [A_ptr] "r" (A_ptr), [K] "r" (K), [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval))
1366 : "cc", "memory", "p0", "p1", "p2", "x19", "x20", "x21", "x22", "x23", "x24", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
1367 );
1368 }
1369
1370 } // namespace arm_gemm
1371
1372 #endif
1373