1 /*
2 * Copyright (c) 2019-2020 Arm Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22 * IN THE SOFTWARE.
23 */
24 #ifdef __ARM_FEATURE_SVE
25
26 #include "arm_gemm.hpp"
27 #include "../../utils.hpp"
28
29 #include <cassert>
30
31 namespace arm_gemm {
32
sve_hybrid_fp32_mla_8x1VL(unsigned int num_strings,const unsigned int * string_lengths,IndirectInputArg<float> A_arg,size_t M,size_t N,const float * B_ptr,IndirectOutputArg<float> output_arg,const float * bias,Activation act,bool accumulate)33 void sve_hybrid_fp32_mla_8x1VL (
34 unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<float> A_arg,
35 size_t M, size_t N, const float *B_ptr, IndirectOutputArg<float> output_arg,
36 const float *bias, Activation act, bool accumulate
37 )
38 {
39 struct KernelArgs {
40 float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
41 float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
42 unsigned int num_strings = {};
43 const unsigned int *string_lengths = {};
44 size_t N = {};
45 const float *B_ptr = {};
46 size_t output_offset = {};
47 size_t input_initial_col = {};
48 size_t input_offset = {};
49 } ka;
50
51 unsigned long flags=0;
52 void *output_ptr;
53 void *input_ptr;
54
55 if (output_arg.is_indirect) {
56 output_ptr=(void *)(output_arg.indirect.ptr);
57 ka.output_offset=output_arg.indirect.offset;
58 flags |= 0x4;
59 } else {
60 output_ptr=(void *)(output_arg.direct.base);
61 ka.output_offset=output_arg.direct.stride;
62 }
63
64 if (A_arg.is_indirect) {
65 input_ptr=(void *)(A_arg.indirect.ptr);
66 ka.input_offset=A_arg.indirect.start_row;
67 ka.input_initial_col=A_arg.indirect.start_col;
68 flags |= 0x8;
69 } else {
70 assert(num_strings==1);
71 input_ptr=(void *)(A_arg.direct.base);
72 ka.input_offset=A_arg.direct.stride;
73 }
74 if (accumulate) {
75 flags |= 0x1;
76 }
77 ka.num_strings = num_strings;
78 ka.string_lengths = string_lengths;
79 ka.N = N;
80 ka.B_ptr = B_ptr;
81 switch(act.type) {
82 default:
83 case Activation::Type::None:
84 break;
85 case Activation::Type::BoundedReLU:
86 ka.maxval = static_cast<float>(act.param1);
87 /* fall through */
88 case Activation::Type::ReLU:
89 ka.minval = 0;
90 flags |= 0x2;
91 break;
92 }
93 __asm__ __volatile__(
94 "ptrue p2.b\n"
95 "1:" // Row loop
96 "cmp %x[M], #0x8\n"
97 "bge 99f\n"
98 "cmp %x[M], #0x6\n"
99 "bgt 85f\n"
100 "beq 71f\n"
101 "cmp %x[M], #0x4\n"
102 "bgt 57f\n"
103 "beq 43f\n"
104 "cmp %x[M], #0x2\n"
105 "bgt 29f\n"
106 "beq 15f\n"
107 "ldr x6, [%x[args_ptr], %[offsetof_N]]\n"
108 "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n"
109 "mov x8, %x[bias]\n"
110 "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
111 "tbz %x[flags], #2, 2f\n"
112 "ldr x17, [%x[output_ptr], #0x0]\n"
113 "add x17, x17, x19, LSL #2\n"
114 "b 3f\n"
115 "2:" // Height 1: setup direct output
116 "mov x17, %x[output_ptr]\n"
117 "3:" // Height 1: Column loop
118 "mov x19, #0x0\n"
119 "whilelt p1.s, x19, x6\n"
120 "cbz x8, 4f\n"
121 "ld1w { z24.s }, p2/Z, [x8]\n"
122 "addvl x8, x8, #1\n"
123 "b 6f\n"
124 "4:" // Height 1: no bias
125 "tbz %x[flags], #0, 5f\n"
126 "ld1w { z24.s }, p1/Z, [x17]\n"
127 "b 6f\n"
128 "5:" // Height 1: no accumulate
129 "mov z24.b, #0x0\n"
130 "6:" // Height 1: setup done
131 "mov x16, #0x0\n"
132 "7:" // Height 1: String loop
133 "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
134 "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
135 "ldr w15, [x20, x16, LSL #0x2]\n"
136 "tbz %x[flags], #3, 8f\n"
137 "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n"
138 "add x20, x20, x19, LSL #3\n"
139 "ldr x14, [x20, #0x0]\n"
140 "cbnz x16, 9f\n"
141 "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
142 "add x14, x14, x19, LSL #2\n"
143 "b 9f\n"
144 "8:" // Height 1: setup direct input
145 "mov x14, %x[input_ptr]\n"
146 "9:" // Height 1: input setup done
147 "cmp x15, #0x4\n"
148 "ble 11f\n"
149 "10:" // Height 1: Multiply loop: Main loop head
150 "ld1w { z8.s }, p2/Z, [x7]\n"
151 "whilelt p0.s, XZR, x15\n"
152 "ld1w { z9.s }, p2/Z, [x7, #1, MUL VL]\n"
153 "sub x15, x15, #0x4\n"
154 "ld1rqw { z0.s }, p0/Z, [x14]\n"
155 "fmla z24.s, z8.s, z0.s[0]\n"
156 "ld1w { z10.s }, p2/Z, [x7, #2, MUL VL]\n"
157 "add x14, x14, #0x10\n"
158 "fmla z24.s, z9.s, z0.s[1]\n"
159 "ld1w { z11.s }, p2/Z, [x7, #3, MUL VL]\n"
160 "cmp x15, #0x4\n"
161 "fmla z24.s, z10.s, z0.s[2]\n"
162 "prfm pldl1keep, [x14, #0x80]\n"
163 "addvl x7, x7, #4\n"
164 "fmla z24.s, z11.s, z0.s[3]\n"
165 "bgt 10b\n"
166 "11:" // Height 1: Multiply loop: Single iteration only
167 "ld1w { z12.s }, p2/Z, [x7]\n"
168 "whilelt p0.s, XZR, x15\n"
169 "subs x15, x15, #0x1\n"
170 "ld1rqw { z0.s }, p0/Z, [x14]\n"
171 "fmla z24.s, z12.s, z0.s[0]\n"
172 "add x14, x14, #0x10\n"
173 "addvl x7, x7, #1\n"
174 "ble 12f\n"
175 "ld1w { z13.s }, p2/Z, [x7]\n"
176 "fmla z24.s, z13.s, z0.s[1]\n"
177 "subs x15, x15, #0x1\n"
178 "addvl x7, x7, #1\n"
179 "ble 12f\n"
180 "ld1w { z14.s }, p2/Z, [x7]\n"
181 "fmla z24.s, z14.s, z0.s[2]\n"
182 "subs x15, x15, #0x1\n"
183 "addvl x7, x7, #1\n"
184 "ble 12f\n"
185 "ld1w { z15.s }, p2/Z, [x7]\n"
186 "fmla z24.s, z15.s, z0.s[3]\n"
187 "addvl x7, x7, #1\n"
188 "12:" // Height 1: Multiply loop: multiply skip
189 "prfm pldl1keep, [x14, #0x80]\n"
190 "add x16, x16, #0x1\n"
191 "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
192 "cmp x16, x19\n"
193 "bne 7b\n"
194 "prfm pstl1keep, [x17, #0x0]\n"
195 "tbz %x[flags], #1, 13f\n"
196 "add x19, %x[args_ptr], %[offset_min]\n"
197 "ld1rw { z17.s }, p2/Z, [x19]\n"
198 "add x19, %x[args_ptr], %[offset_max]\n"
199 "ld1rw { z16.s }, p2/Z, [x19]\n"
200 "fmin z24.s, p2/M, z24.s, z16.s\n"
201 "fmax z24.s, p2/M, z24.s, z17.s\n"
202 "13:" // Height 1: No activation
203 "st1w { z24.s }, p1, [x17]\n"
204 "addvl x17, x17, #1\n"
205 "14:" // Height 1: Writeback done
206 "mov x19, #0x0\n"
207 "incw x19\n"
208 "subs x6, x6, x19\n"
209 "bgt 3b\n"
210 "b 114f\n"
211 "15:" // Height 2
212 "ldr x6, [%x[args_ptr], %[offsetof_N]]\n"
213 "mov x8, %x[bias]\n"
214 "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n"
215 "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
216 "tbz %x[flags], #2, 16f\n"
217 "ldr x17, [%x[output_ptr], #0x0]\n"
218 "add x17, x17, x19, LSL #2\n"
219 "ldr x13, [%x[output_ptr], #0x8]\n"
220 "add x13, x13, x19, LSL #2\n"
221 "b 17f\n"
222 "16:" // Height 2: setup direct output
223 "mov x17, %x[output_ptr]\n"
224 "add x13, x17, x19, LSL #2\n"
225 "17:" // Height 2: Column loop
226 "mov x19, #0x0\n"
227 "whilelt p1.s, x19, x6\n"
228 "cbz x8, 18f\n"
229 "ld1w { z24.s }, p2/Z, [x8]\n"
230 "mov z25.d, z24.d\n"
231 "addvl x8, x8, #1\n"
232 "b 20f\n"
233 "18:" // Height 2: no bias
234 "tbz %x[flags], #0, 19f\n"
235 "ld1w { z24.s }, p1/Z, [x17]\n"
236 "ld1w { z25.s }, p1/Z, [x13]\n"
237 "b 20f\n"
238 "19:" // Height 2: no accumulate
239 "mov z24.b, #0x0\n"
240 "mov z25.b, #0x0\n"
241 "20:" // Height 2: setup done
242 "mov x16, #0x0\n"
243 "21:" // Height 2: String loop
244 "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
245 "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
246 "ldr w15, [x20, x16, LSL #0x2]\n"
247 "tbz %x[flags], #3, 22f\n"
248 "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n"
249 "add x20, x20, x19, LSL #3\n"
250 "ldr x14, [x20, #0x0]\n"
251 "ldr x12, [x20, #0x8]\n"
252 "cbnz x16, 23f\n"
253 "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
254 "add x14, x14, x19, LSL #2\n"
255 "add x12, x12, x19, LSL #2\n"
256 "b 23f\n"
257 "22:" // Height 2: setup direct input
258 "mov x14, %x[input_ptr]\n"
259 "add x12, x14, x19, LSL #2\n"
260 "23:" // Height 2: input setup done
261 "cmp x15, #0x4\n"
262 "ble 25f\n"
263 "24:" // Height 2: Multiply loop: Main loop head
264 "ld1w { z8.s }, p2/Z, [x7]\n"
265 "whilelt p0.s, XZR, x15\n"
266 "ld1w { z9.s }, p2/Z, [x7, #1, MUL VL]\n"
267 "sub x15, x15, #0x4\n"
268 "ld1rqw { z0.s }, p0/Z, [x14]\n"
269 "fmla z24.s, z8.s, z0.s[0]\n"
270 "ld1rqw { z1.s }, p0/Z, [x12]\n"
271 "add x14, x14, #0x10\n"
272 "fmla z25.s, z8.s, z1.s[0]\n"
273 "ld1w { z10.s }, p2/Z, [x7, #2, MUL VL]\n"
274 "add x12, x12, #0x10\n"
275 "fmla z24.s, z9.s, z0.s[1]\n"
276 "ld1w { z11.s }, p2/Z, [x7, #3, MUL VL]\n"
277 "cmp x15, #0x4\n"
278 "fmla z25.s, z9.s, z1.s[1]\n"
279 "prfm pldl1keep, [x14, #0x80]\n"
280 "addvl x7, x7, #4\n"
281 "fmla z24.s, z10.s, z0.s[2]\n"
282 "prfm pldl1keep, [x12, #0x80]\n"
283 "fmla z25.s, z10.s, z1.s[2]\n"
284 "fmla z24.s, z11.s, z0.s[3]\n"
285 "fmla z25.s, z11.s, z1.s[3]\n"
286 "bgt 24b\n"
287 "25:" // Height 2: Multiply loop: Single iteration only
288 "ld1w { z12.s }, p2/Z, [x7]\n"
289 "whilelt p0.s, XZR, x15\n"
290 "subs x15, x15, #0x1\n"
291 "ld1rqw { z0.s }, p0/Z, [x14]\n"
292 "fmla z24.s, z12.s, z0.s[0]\n"
293 "ld1rqw { z1.s }, p0/Z, [x12]\n"
294 "add x14, x14, #0x10\n"
295 "fmla z25.s, z12.s, z1.s[0]\n"
296 "add x12, x12, #0x10\n"
297 "addvl x7, x7, #1\n"
298 "ble 26f\n"
299 "ld1w { z13.s }, p2/Z, [x7]\n"
300 "fmla z24.s, z13.s, z0.s[1]\n"
301 "subs x15, x15, #0x1\n"
302 "fmla z25.s, z13.s, z1.s[1]\n"
303 "addvl x7, x7, #1\n"
304 "ble 26f\n"
305 "ld1w { z14.s }, p2/Z, [x7]\n"
306 "fmla z24.s, z14.s, z0.s[2]\n"
307 "subs x15, x15, #0x1\n"
308 "fmla z25.s, z14.s, z1.s[2]\n"
309 "addvl x7, x7, #1\n"
310 "ble 26f\n"
311 "ld1w { z15.s }, p2/Z, [x7]\n"
312 "fmla z24.s, z15.s, z0.s[3]\n"
313 "addvl x7, x7, #1\n"
314 "fmla z25.s, z15.s, z1.s[3]\n"
315 "26:" // Height 2: Multiply loop: multiply skip
316 "prfm pldl1keep, [x14, #0x80]\n"
317 "add x16, x16, #0x1\n"
318 "prfm pldl1keep, [x12, #0x80]\n"
319 "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
320 "cmp x16, x19\n"
321 "bne 21b\n"
322 "prfm pstl1keep, [x17, #0x0]\n"
323 "prfm pstl1keep, [x13, #0x0]\n"
324 "tbz %x[flags], #1, 27f\n"
325 "add x19, %x[args_ptr], %[offset_min]\n"
326 "ld1rw { z17.s }, p2/Z, [x19]\n"
327 "add x19, %x[args_ptr], %[offset_max]\n"
328 "ld1rw { z16.s }, p2/Z, [x19]\n"
329 "fmin z24.s, p2/M, z24.s, z16.s\n"
330 "fmin z25.s, p2/M, z25.s, z16.s\n"
331 "fmax z24.s, p2/M, z24.s, z17.s\n"
332 "fmax z25.s, p2/M, z25.s, z17.s\n"
333 "27:" // Height 2: No activation
334 "st1w { z24.s }, p1, [x17]\n"
335 "addvl x17, x17, #1\n"
336 "st1w { z25.s }, p1, [x13]\n"
337 "addvl x13, x13, #1\n"
338 "28:" // Height 2: Writeback done
339 "mov x19, #0x0\n"
340 "incw x19\n"
341 "subs x6, x6, x19\n"
342 "bgt 17b\n"
343 "b 114f\n"
344 "29:" // Height 3
345 "ldr x6, [%x[args_ptr], %[offsetof_N]]\n"
346 "mov x8, %x[bias]\n"
347 "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n"
348 "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
349 "tbz %x[flags], #2, 30f\n"
350 "ldr x17, [%x[output_ptr], #0x0]\n"
351 "add x17, x17, x19, LSL #2\n"
352 "ldr x13, [%x[output_ptr], #0x8]\n"
353 "ldr x11, [%x[output_ptr], #0x10]\n"
354 "add x13, x13, x19, LSL #2\n"
355 "add x11, x11, x19, LSL #2\n"
356 "b 31f\n"
357 "30:" // Height 3: setup direct output
358 "mov x17, %x[output_ptr]\n"
359 "add x13, x17, x19, LSL #2\n"
360 "add x11, x13, x19, LSL #2\n"
361 "31:" // Height 3: Column loop
362 "mov x19, #0x0\n"
363 "whilelt p1.s, x19, x6\n"
364 "cbz x8, 32f\n"
365 "ld1w { z24.s }, p2/Z, [x8]\n"
366 "mov z25.d, z24.d\n"
367 "addvl x8, x8, #1\n"
368 "mov z26.d, z24.d\n"
369 "b 34f\n"
370 "32:" // Height 3: no bias
371 "tbz %x[flags], #0, 33f\n"
372 "ld1w { z24.s }, p1/Z, [x17]\n"
373 "ld1w { z25.s }, p1/Z, [x13]\n"
374 "ld1w { z26.s }, p1/Z, [x11]\n"
375 "b 34f\n"
376 "33:" // Height 3: no accumulate
377 "mov z24.b, #0x0\n"
378 "mov z25.b, #0x0\n"
379 "mov z26.b, #0x0\n"
380 "34:" // Height 3: setup done
381 "mov x16, #0x0\n"
382 "35:" // Height 3: String loop
383 "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
384 "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
385 "ldr w15, [x20, x16, LSL #0x2]\n"
386 "tbz %x[flags], #3, 36f\n"
387 "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n"
388 "add x20, x20, x19, LSL #3\n"
389 "ldr x14, [x20, #0x0]\n"
390 "ldr x12, [x20, #0x8]\n"
391 "ldr x10, [x20, #0x10]\n"
392 "cbnz x16, 37f\n"
393 "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
394 "add x14, x14, x19, LSL #2\n"
395 "add x12, x12, x19, LSL #2\n"
396 "add x10, x10, x19, LSL #2\n"
397 "b 37f\n"
398 "36:" // Height 3: setup direct input
399 "mov x14, %x[input_ptr]\n"
400 "add x12, x14, x19, LSL #2\n"
401 "add x10, x12, x19, LSL #2\n"
402 "37:" // Height 3: input setup done
403 "cmp x15, #0x4\n"
404 "ble 39f\n"
405 "38:" // Height 3: Multiply loop: Main loop head
406 "ld1w { z8.s }, p2/Z, [x7]\n"
407 "whilelt p0.s, XZR, x15\n"
408 "ld1w { z9.s }, p2/Z, [x7, #1, MUL VL]\n"
409 "sub x15, x15, #0x4\n"
410 "ld1rqw { z0.s }, p0/Z, [x14]\n"
411 "fmla z24.s, z8.s, z0.s[0]\n"
412 "ld1rqw { z1.s }, p0/Z, [x12]\n"
413 "add x14, x14, #0x10\n"
414 "fmla z25.s, z8.s, z1.s[0]\n"
415 "ld1rqw { z2.s }, p0/Z, [x10]\n"
416 "add x12, x12, #0x10\n"
417 "fmla z24.s, z9.s, z0.s[1]\n"
418 "ld1w { z10.s }, p2/Z, [x7, #2, MUL VL]\n"
419 "add x10, x10, #0x10\n"
420 "fmla z26.s, z8.s, z2.s[0]\n"
421 "ld1w { z11.s }, p2/Z, [x7, #3, MUL VL]\n"
422 "cmp x15, #0x4\n"
423 "fmla z25.s, z9.s, z1.s[1]\n"
424 "prfm pldl1keep, [x14, #0x80]\n"
425 "addvl x7, x7, #4\n"
426 "fmla z24.s, z10.s, z0.s[2]\n"
427 "prfm pldl1keep, [x12, #0x80]\n"
428 "prfm pldl1keep, [x10, #0x80]\n"
429 "fmla z26.s, z9.s, z2.s[1]\n"
430 "fmla z25.s, z10.s, z1.s[2]\n"
431 "fmla z24.s, z11.s, z0.s[3]\n"
432 "fmla z26.s, z10.s, z2.s[2]\n"
433 "fmla z25.s, z11.s, z1.s[3]\n"
434 "fmla z26.s, z11.s, z2.s[3]\n"
435 "bgt 38b\n"
436 "39:" // Height 3: Multiply loop: Single iteration only
437 "ld1w { z12.s }, p2/Z, [x7]\n"
438 "whilelt p0.s, XZR, x15\n"
439 "subs x15, x15, #0x1\n"
440 "ld1rqw { z0.s }, p0/Z, [x14]\n"
441 "fmla z24.s, z12.s, z0.s[0]\n"
442 "ld1rqw { z1.s }, p0/Z, [x12]\n"
443 "add x14, x14, #0x10\n"
444 "fmla z25.s, z12.s, z1.s[0]\n"
445 "ld1rqw { z2.s }, p0/Z, [x10]\n"
446 "add x12, x12, #0x10\n"
447 "fmla z26.s, z12.s, z2.s[0]\n"
448 "add x10, x10, #0x10\n"
449 "addvl x7, x7, #1\n"
450 "ble 40f\n"
451 "ld1w { z13.s }, p2/Z, [x7]\n"
452 "fmla z24.s, z13.s, z0.s[1]\n"
453 "subs x15, x15, #0x1\n"
454 "fmla z25.s, z13.s, z1.s[1]\n"
455 "addvl x7, x7, #1\n"
456 "fmla z26.s, z13.s, z2.s[1]\n"
457 "ble 40f\n"
458 "ld1w { z14.s }, p2/Z, [x7]\n"
459 "fmla z24.s, z14.s, z0.s[2]\n"
460 "subs x15, x15, #0x1\n"
461 "fmla z25.s, z14.s, z1.s[2]\n"
462 "addvl x7, x7, #1\n"
463 "fmla z26.s, z14.s, z2.s[2]\n"
464 "ble 40f\n"
465 "ld1w { z15.s }, p2/Z, [x7]\n"
466 "fmla z24.s, z15.s, z0.s[3]\n"
467 "addvl x7, x7, #1\n"
468 "fmla z25.s, z15.s, z1.s[3]\n"
469 "fmla z26.s, z15.s, z2.s[3]\n"
470 "40:" // Height 3: Multiply loop: multiply skip
471 "prfm pldl1keep, [x14, #0x80]\n"
472 "add x16, x16, #0x1\n"
473 "prfm pldl1keep, [x12, #0x80]\n"
474 "prfm pldl1keep, [x10, #0x80]\n"
475 "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
476 "cmp x16, x19\n"
477 "bne 35b\n"
478 "prfm pstl1keep, [x17, #0x0]\n"
479 "prfm pstl1keep, [x13, #0x0]\n"
480 "prfm pstl1keep, [x11, #0x0]\n"
481 "tbz %x[flags], #1, 41f\n"
482 "add x19, %x[args_ptr], %[offset_min]\n"
483 "ld1rw { z17.s }, p2/Z, [x19]\n"
484 "add x19, %x[args_ptr], %[offset_max]\n"
485 "ld1rw { z16.s }, p2/Z, [x19]\n"
486 "fmin z24.s, p2/M, z24.s, z16.s\n"
487 "fmin z25.s, p2/M, z25.s, z16.s\n"
488 "fmin z26.s, p2/M, z26.s, z16.s\n"
489 "fmax z24.s, p2/M, z24.s, z17.s\n"
490 "fmax z25.s, p2/M, z25.s, z17.s\n"
491 "fmax z26.s, p2/M, z26.s, z17.s\n"
492 "41:" // Height 3: No activation
493 "st1w { z24.s }, p1, [x17]\n"
494 "addvl x17, x17, #1\n"
495 "st1w { z25.s }, p1, [x13]\n"
496 "addvl x13, x13, #1\n"
497 "st1w { z26.s }, p1, [x11]\n"
498 "addvl x11, x11, #1\n"
499 "42:" // Height 3: Writeback done
500 "mov x19, #0x0\n"
501 "incw x19\n"
502 "subs x6, x6, x19\n"
503 "bgt 31b\n"
504 "b 114f\n"
505 "43:" // Height 4
506 "ldr x6, [%x[args_ptr], %[offsetof_N]]\n"
507 "mov x8, %x[bias]\n"
508 "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n"
509 "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
510 "tbz %x[flags], #2, 44f\n"
511 "ldr x17, [%x[output_ptr], #0x0]\n"
512 "add x17, x17, x19, LSL #2\n"
513 "ldr x13, [%x[output_ptr], #0x8]\n"
514 "ldr x11, [%x[output_ptr], #0x10]\n"
515 "add x13, x13, x19, LSL #2\n"
516 "ldr x9, [%x[output_ptr], #0x18]\n"
517 "add x11, x11, x19, LSL #2\n"
518 "add x9, x9, x19, LSL #2\n"
519 "b 45f\n"
520 "44:" // Height 4: setup direct output
521 "mov x17, %x[output_ptr]\n"
522 "add x13, x17, x19, LSL #2\n"
523 "add x11, x13, x19, LSL #2\n"
524 "add x9, x11, x19, LSL #2\n"
525 "45:" // Height 4: Column loop
526 "mov x19, #0x0\n"
527 "whilelt p1.s, x19, x6\n"
528 "cbz x8, 46f\n"
529 "ld1w { z24.s }, p2/Z, [x8]\n"
530 "mov z25.d, z24.d\n"
531 "addvl x8, x8, #1\n"
532 "mov z26.d, z24.d\n"
533 "mov z27.d, z24.d\n"
534 "b 48f\n"
535 "46:" // Height 4: no bias
536 "tbz %x[flags], #0, 47f\n"
537 "ld1w { z24.s }, p1/Z, [x17]\n"
538 "ld1w { z25.s }, p1/Z, [x13]\n"
539 "ld1w { z26.s }, p1/Z, [x11]\n"
540 "ld1w { z27.s }, p1/Z, [x9]\n"
541 "b 48f\n"
542 "47:" // Height 4: no accumulate
543 "mov z24.b, #0x0\n"
544 "mov z25.b, #0x0\n"
545 "mov z26.b, #0x0\n"
546 "mov z27.b, #0x0\n"
547 "48:" // Height 4: setup done
548 "mov x16, #0x0\n"
549 "49:" // Height 4: String loop
550 "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
551 "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
552 "ldr w15, [x20, x16, LSL #0x2]\n"
553 "tbz %x[flags], #3, 50f\n"
554 "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n"
555 "add x20, x20, x19, LSL #3\n"
556 "ldr x14, [x20, #0x0]\n"
557 "ldr x12, [x20, #0x8]\n"
558 "ldr x10, [x20, #0x10]\n"
559 "ldr x28, [x20, #0x18]\n"
560 "cbnz x16, 51f\n"
561 "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
562 "add x14, x14, x19, LSL #2\n"
563 "add x12, x12, x19, LSL #2\n"
564 "add x10, x10, x19, LSL #2\n"
565 "add x28, x28, x19, LSL #2\n"
566 "b 51f\n"
567 "50:" // Height 4: setup direct input
568 "mov x14, %x[input_ptr]\n"
569 "add x12, x14, x19, LSL #2\n"
570 "add x10, x12, x19, LSL #2\n"
571 "add x28, x10, x19, LSL #2\n"
572 "51:" // Height 4: input setup done
573 "cmp x15, #0x4\n"
574 "ble 53f\n"
575 "52:" // Height 4: Multiply loop: Main loop head
576 "ld1w { z8.s }, p2/Z, [x7]\n"
577 "whilelt p0.s, XZR, x15\n"
578 "ld1w { z9.s }, p2/Z, [x7, #1, MUL VL]\n"
579 "sub x15, x15, #0x4\n"
580 "ld1rqw { z0.s }, p0/Z, [x14]\n"
581 "fmla z24.s, z8.s, z0.s[0]\n"
582 "ld1rqw { z1.s }, p0/Z, [x12]\n"
583 "add x14, x14, #0x10\n"
584 "fmla z25.s, z8.s, z1.s[0]\n"
585 "ld1rqw { z2.s }, p0/Z, [x10]\n"
586 "add x12, x12, #0x10\n"
587 "fmla z24.s, z9.s, z0.s[1]\n"
588 "ld1rqw { z3.s }, p0/Z, [x28]\n"
589 "add x10, x10, #0x10\n"
590 "fmla z26.s, z8.s, z2.s[0]\n"
591 "ld1w { z10.s }, p2/Z, [x7, #2, MUL VL]\n"
592 "add x28, x28, #0x10\n"
593 "fmla z27.s, z8.s, z3.s[0]\n"
594 "ld1w { z11.s }, p2/Z, [x7, #3, MUL VL]\n"
595 "cmp x15, #0x4\n"
596 "fmla z25.s, z9.s, z1.s[1]\n"
597 "prfm pldl1keep, [x14, #0x80]\n"
598 "addvl x7, x7, #4\n"
599 "fmla z24.s, z10.s, z0.s[2]\n"
600 "prfm pldl1keep, [x12, #0x80]\n"
601 "prfm pldl1keep, [x10, #0x80]\n"
602 "fmla z26.s, z9.s, z2.s[1]\n"
603 "prfm pldl1keep, [x28, #0x80]\n"
604 "fmla z27.s, z9.s, z3.s[1]\n"
605 "fmla z25.s, z10.s, z1.s[2]\n"
606 "fmla z24.s, z11.s, z0.s[3]\n"
607 "fmla z26.s, z10.s, z2.s[2]\n"
608 "fmla z27.s, z10.s, z3.s[2]\n"
609 "fmla z25.s, z11.s, z1.s[3]\n"
610 "fmla z26.s, z11.s, z2.s[3]\n"
611 "fmla z27.s, z11.s, z3.s[3]\n"
612 "bgt 52b\n"
613 "53:" // Height 4: Multiply loop: Single iteration only
614 "ld1w { z12.s }, p2/Z, [x7]\n"
615 "whilelt p0.s, XZR, x15\n"
616 "subs x15, x15, #0x1\n"
617 "ld1rqw { z0.s }, p0/Z, [x14]\n"
618 "fmla z24.s, z12.s, z0.s[0]\n"
619 "ld1rqw { z1.s }, p0/Z, [x12]\n"
620 "add x14, x14, #0x10\n"
621 "fmla z25.s, z12.s, z1.s[0]\n"
622 "ld1rqw { z2.s }, p0/Z, [x10]\n"
623 "add x12, x12, #0x10\n"
624 "fmla z26.s, z12.s, z2.s[0]\n"
625 "ld1rqw { z3.s }, p0/Z, [x28]\n"
626 "add x10, x10, #0x10\n"
627 "fmla z27.s, z12.s, z3.s[0]\n"
628 "add x28, x28, #0x10\n"
629 "addvl x7, x7, #1\n"
630 "ble 54f\n"
631 "ld1w { z13.s }, p2/Z, [x7]\n"
632 "fmla z24.s, z13.s, z0.s[1]\n"
633 "subs x15, x15, #0x1\n"
634 "fmla z25.s, z13.s, z1.s[1]\n"
635 "addvl x7, x7, #1\n"
636 "fmla z26.s, z13.s, z2.s[1]\n"
637 "fmla z27.s, z13.s, z3.s[1]\n"
638 "ble 54f\n"
639 "ld1w { z14.s }, p2/Z, [x7]\n"
640 "fmla z24.s, z14.s, z0.s[2]\n"
641 "subs x15, x15, #0x1\n"
642 "fmla z25.s, z14.s, z1.s[2]\n"
643 "addvl x7, x7, #1\n"
644 "fmla z26.s, z14.s, z2.s[2]\n"
645 "fmla z27.s, z14.s, z3.s[2]\n"
646 "ble 54f\n"
647 "ld1w { z15.s }, p2/Z, [x7]\n"
648 "fmla z24.s, z15.s, z0.s[3]\n"
649 "addvl x7, x7, #1\n"
650 "fmla z25.s, z15.s, z1.s[3]\n"
651 "fmla z26.s, z15.s, z2.s[3]\n"
652 "fmla z27.s, z15.s, z3.s[3]\n"
653 "54:" // Height 4: Multiply loop: multiply skip
654 "prfm pldl1keep, [x14, #0x80]\n"
655 "add x16, x16, #0x1\n"
656 "prfm pldl1keep, [x12, #0x80]\n"
657 "prfm pldl1keep, [x10, #0x80]\n"
658 "prfm pldl1keep, [x28, #0x80]\n"
659 "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
660 "cmp x16, x19\n"
661 "bne 49b\n"
662 "prfm pstl1keep, [x17, #0x0]\n"
663 "prfm pstl1keep, [x13, #0x0]\n"
664 "prfm pstl1keep, [x11, #0x0]\n"
665 "prfm pstl1keep, [x9, #0x0]\n"
666 "tbz %x[flags], #1, 55f\n"
667 "add x19, %x[args_ptr], %[offset_min]\n"
668 "ld1rw { z17.s }, p2/Z, [x19]\n"
669 "add x19, %x[args_ptr], %[offset_max]\n"
670 "ld1rw { z16.s }, p2/Z, [x19]\n"
671 "fmin z24.s, p2/M, z24.s, z16.s\n"
672 "fmin z25.s, p2/M, z25.s, z16.s\n"
673 "fmin z26.s, p2/M, z26.s, z16.s\n"
674 "fmin z27.s, p2/M, z27.s, z16.s\n"
675 "fmax z24.s, p2/M, z24.s, z17.s\n"
676 "fmax z25.s, p2/M, z25.s, z17.s\n"
677 "fmax z26.s, p2/M, z26.s, z17.s\n"
678 "fmax z27.s, p2/M, z27.s, z17.s\n"
679 "55:" // Height 4: No activation
680 "st1w { z24.s }, p1, [x17]\n"
681 "addvl x17, x17, #1\n"
682 "st1w { z25.s }, p1, [x13]\n"
683 "addvl x13, x13, #1\n"
684 "st1w { z26.s }, p1, [x11]\n"
685 "addvl x11, x11, #1\n"
686 "st1w { z27.s }, p1, [x9]\n"
687 "addvl x9, x9, #1\n"
688 "56:" // Height 4: Writeback done
689 "mov x19, #0x0\n"
690 "incw x19\n"
691 "subs x6, x6, x19\n"
692 "bgt 45b\n"
693 "b 114f\n"
694 "57:" // Height 5
695 "ldr x6, [%x[args_ptr], %[offsetof_N]]\n"
696 "mov x8, %x[bias]\n"
697 "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n"
698 "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
699 "tbz %x[flags], #2, 58f\n"
700 "ldr x17, [%x[output_ptr], #0x0]\n"
701 "add x17, x17, x19, LSL #2\n"
702 "ldr x13, [%x[output_ptr], #0x8]\n"
703 "ldr x11, [%x[output_ptr], #0x10]\n"
704 "add x13, x13, x19, LSL #2\n"
705 "ldr x9, [%x[output_ptr], #0x18]\n"
706 "ldr x27, [%x[output_ptr], #0x20]\n"
707 "add x11, x11, x19, LSL #2\n"
708 "add x9, x9, x19, LSL #2\n"
709 "add x27, x27, x19, LSL #2\n"
710 "b 59f\n"
711 "58:" // Height 5: setup direct output
712 "mov x17, %x[output_ptr]\n"
713 "add x13, x17, x19, LSL #2\n"
714 "add x11, x13, x19, LSL #2\n"
715 "add x9, x11, x19, LSL #2\n"
716 "add x27, x9, x19, LSL #2\n"
717 "59:" // Height 5: Column loop
718 "mov x19, #0x0\n"
719 "whilelt p1.s, x19, x6\n"
720 "cbz x8, 60f\n"
721 "ld1w { z24.s }, p2/Z, [x8]\n"
722 "mov z25.d, z24.d\n"
723 "addvl x8, x8, #1\n"
724 "mov z26.d, z24.d\n"
725 "mov z27.d, z24.d\n"
726 "mov z28.d, z24.d\n"
727 "b 62f\n"
728 "60:" // Height 5: no bias
729 "tbz %x[flags], #0, 61f\n"
730 "ld1w { z24.s }, p1/Z, [x17]\n"
731 "ld1w { z25.s }, p1/Z, [x13]\n"
732 "ld1w { z26.s }, p1/Z, [x11]\n"
733 "ld1w { z27.s }, p1/Z, [x9]\n"
734 "ld1w { z28.s }, p1/Z, [x27]\n"
735 "b 62f\n"
736 "61:" // Height 5: no accumulate
737 "mov z24.b, #0x0\n"
738 "mov z25.b, #0x0\n"
739 "mov z26.b, #0x0\n"
740 "mov z27.b, #0x0\n"
741 "mov z28.b, #0x0\n"
742 "62:" // Height 5: setup done
743 "mov x16, #0x0\n"
744 "63:" // Height 5: String loop
745 "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
746 "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
747 "ldr w15, [x20, x16, LSL #0x2]\n"
748 "tbz %x[flags], #3, 64f\n"
749 "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n"
750 "add x20, x20, x19, LSL #3\n"
751 "ldr x14, [x20, #0x0]\n"
752 "ldr x12, [x20, #0x8]\n"
753 "ldr x10, [x20, #0x10]\n"
754 "ldr x28, [x20, #0x18]\n"
755 "ldr x26, [x20, #0x20]\n"
756 "cbnz x16, 65f\n"
757 "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
758 "add x14, x14, x19, LSL #2\n"
759 "add x12, x12, x19, LSL #2\n"
760 "add x10, x10, x19, LSL #2\n"
761 "add x28, x28, x19, LSL #2\n"
762 "add x26, x26, x19, LSL #2\n"
763 "b 65f\n"
764 "64:" // Height 5: setup direct input
765 "mov x14, %x[input_ptr]\n"
766 "add x12, x14, x19, LSL #2\n"
767 "add x10, x12, x19, LSL #2\n"
768 "add x28, x10, x19, LSL #2\n"
769 "add x26, x28, x19, LSL #2\n"
770 "65:" // Height 5: input setup done
771 "cmp x15, #0x4\n"
772 "ble 67f\n"
773 "66:" // Height 5: Multiply loop: Main loop head
774 "ld1w { z8.s }, p2/Z, [x7]\n"
775 "whilelt p0.s, XZR, x15\n"
776 "ld1w { z9.s }, p2/Z, [x7, #1, MUL VL]\n"
777 "sub x15, x15, #0x4\n"
778 "ld1rqw { z0.s }, p0/Z, [x14]\n"
779 "fmla z24.s, z8.s, z0.s[0]\n"
780 "ld1rqw { z1.s }, p0/Z, [x12]\n"
781 "add x14, x14, #0x10\n"
782 "fmla z25.s, z8.s, z1.s[0]\n"
783 "ld1rqw { z2.s }, p0/Z, [x10]\n"
784 "add x12, x12, #0x10\n"
785 "fmla z24.s, z9.s, z0.s[1]\n"
786 "ld1rqw { z3.s }, p0/Z, [x28]\n"
787 "add x10, x10, #0x10\n"
788 "fmla z26.s, z8.s, z2.s[0]\n"
789 "ld1rqw { z4.s }, p0/Z, [x26]\n"
790 "add x28, x28, #0x10\n"
791 "fmla z27.s, z8.s, z3.s[0]\n"
792 "ld1w { z10.s }, p2/Z, [x7, #2, MUL VL]\n"
793 "add x26, x26, #0x10\n"
794 "fmla z25.s, z9.s, z1.s[1]\n"
795 "ld1w { z11.s }, p2/Z, [x7, #3, MUL VL]\n"
796 "cmp x15, #0x4\n"
797 "fmla z28.s, z8.s, z4.s[0]\n"
798 "prfm pldl1keep, [x14, #0x80]\n"
799 "addvl x7, x7, #4\n"
800 "fmla z26.s, z9.s, z2.s[1]\n"
801 "prfm pldl1keep, [x12, #0x80]\n"
802 "fmla z24.s, z10.s, z0.s[2]\n"
803 "prfm pldl1keep, [x10, #0x80]\n"
804 "fmla z27.s, z9.s, z3.s[1]\n"
805 "prfm pldl1keep, [x28, #0x80]\n"
806 "fmla z25.s, z10.s, z1.s[2]\n"
807 "prfm pldl1keep, [x26, #0x80]\n"
808 "fmla z28.s, z9.s, z4.s[1]\n"
809 "fmla z26.s, z10.s, z2.s[2]\n"
810 "fmla z27.s, z10.s, z3.s[2]\n"
811 "fmla z24.s, z11.s, z0.s[3]\n"
812 "fmla z28.s, z10.s, z4.s[2]\n"
813 "fmla z25.s, z11.s, z1.s[3]\n"
814 "fmla z26.s, z11.s, z2.s[3]\n"
815 "fmla z27.s, z11.s, z3.s[3]\n"
816 "fmla z28.s, z11.s, z4.s[3]\n"
817 "bgt 66b\n"
818 "67:" // Height 5: Multiply loop: Single iteration only
819 "ld1w { z12.s }, p2/Z, [x7]\n"
820 "whilelt p0.s, XZR, x15\n"
821 "subs x15, x15, #0x1\n"
822 "ld1rqw { z0.s }, p0/Z, [x14]\n"
823 "fmla z24.s, z12.s, z0.s[0]\n"
824 "ld1rqw { z1.s }, p0/Z, [x12]\n"
825 "add x14, x14, #0x10\n"
826 "fmla z25.s, z12.s, z1.s[0]\n"
827 "ld1rqw { z2.s }, p0/Z, [x10]\n"
828 "add x12, x12, #0x10\n"
829 "fmla z26.s, z12.s, z2.s[0]\n"
830 "ld1rqw { z3.s }, p0/Z, [x28]\n"
831 "add x10, x10, #0x10\n"
832 "fmla z27.s, z12.s, z3.s[0]\n"
833 "ld1rqw { z4.s }, p0/Z, [x26]\n"
834 "add x28, x28, #0x10\n"
835 "fmla z28.s, z12.s, z4.s[0]\n"
836 "add x26, x26, #0x10\n"
837 "addvl x7, x7, #1\n"
838 "ble 68f\n"
839 "ld1w { z13.s }, p2/Z, [x7]\n"
840 "fmla z24.s, z13.s, z0.s[1]\n"
841 "subs x15, x15, #0x1\n"
842 "fmla z25.s, z13.s, z1.s[1]\n"
843 "addvl x7, x7, #1\n"
844 "fmla z26.s, z13.s, z2.s[1]\n"
845 "fmla z27.s, z13.s, z3.s[1]\n"
846 "fmla z28.s, z13.s, z4.s[1]\n"
847 "ble 68f\n"
848 "ld1w { z14.s }, p2/Z, [x7]\n"
849 "fmla z24.s, z14.s, z0.s[2]\n"
850 "subs x15, x15, #0x1\n"
851 "fmla z25.s, z14.s, z1.s[2]\n"
852 "addvl x7, x7, #1\n"
853 "fmla z26.s, z14.s, z2.s[2]\n"
854 "fmla z27.s, z14.s, z3.s[2]\n"
855 "fmla z28.s, z14.s, z4.s[2]\n"
856 "ble 68f\n"
857 "ld1w { z15.s }, p2/Z, [x7]\n"
858 "fmla z24.s, z15.s, z0.s[3]\n"
859 "addvl x7, x7, #1\n"
860 "fmla z25.s, z15.s, z1.s[3]\n"
861 "fmla z26.s, z15.s, z2.s[3]\n"
862 "fmla z27.s, z15.s, z3.s[3]\n"
863 "fmla z28.s, z15.s, z4.s[3]\n"
864 "68:" // Height 5: Multiply loop: multiply skip
865 "prfm pldl1keep, [x14, #0x80]\n"
866 "add x16, x16, #0x1\n"
867 "prfm pldl1keep, [x12, #0x80]\n"
868 "prfm pldl1keep, [x10, #0x80]\n"
869 "prfm pldl1keep, [x28, #0x80]\n"
870 "prfm pldl1keep, [x26, #0x80]\n"
871 "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
872 "cmp x16, x19\n"
873 "bne 63b\n"
874 "prfm pstl1keep, [x17, #0x0]\n"
875 "prfm pstl1keep, [x13, #0x0]\n"
876 "prfm pstl1keep, [x11, #0x0]\n"
877 "prfm pstl1keep, [x9, #0x0]\n"
878 "prfm pstl1keep, [x27, #0x0]\n"
879 "tbz %x[flags], #1, 69f\n"
880 "add x19, %x[args_ptr], %[offset_min]\n"
881 "ld1rw { z17.s }, p2/Z, [x19]\n"
882 "add x19, %x[args_ptr], %[offset_max]\n"
883 "ld1rw { z16.s }, p2/Z, [x19]\n"
884 "fmin z24.s, p2/M, z24.s, z16.s\n"
885 "fmin z25.s, p2/M, z25.s, z16.s\n"
886 "fmin z26.s, p2/M, z26.s, z16.s\n"
887 "fmin z27.s, p2/M, z27.s, z16.s\n"
888 "fmin z28.s, p2/M, z28.s, z16.s\n"
889 "fmax z24.s, p2/M, z24.s, z17.s\n"
890 "fmax z25.s, p2/M, z25.s, z17.s\n"
891 "fmax z26.s, p2/M, z26.s, z17.s\n"
892 "fmax z27.s, p2/M, z27.s, z17.s\n"
893 "fmax z28.s, p2/M, z28.s, z17.s\n"
894 "69:" // Height 5: No activation
895 "st1w { z24.s }, p1, [x17]\n"
896 "addvl x17, x17, #1\n"
897 "st1w { z25.s }, p1, [x13]\n"
898 "addvl x13, x13, #1\n"
899 "st1w { z26.s }, p1, [x11]\n"
900 "addvl x11, x11, #1\n"
901 "st1w { z27.s }, p1, [x9]\n"
902 "addvl x9, x9, #1\n"
903 "st1w { z28.s }, p1, [x27]\n"
904 "addvl x27, x27, #1\n"
905 "70:" // Height 5: Writeback done
906 "mov x19, #0x0\n"
907 "incw x19\n"
908 "subs x6, x6, x19\n"
909 "bgt 59b\n"
910 "b 114f\n"
911 "71:" // Height 6
912 "ldr x6, [%x[args_ptr], %[offsetof_N]]\n"
913 "mov x8, %x[bias]\n"
914 "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n"
915 "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
916 "tbz %x[flags], #2, 72f\n"
917 "ldr x17, [%x[output_ptr], #0x0]\n"
918 "add x17, x17, x19, LSL #2\n"
919 "ldr x13, [%x[output_ptr], #0x8]\n"
920 "ldr x11, [%x[output_ptr], #0x10]\n"
921 "add x13, x13, x19, LSL #2\n"
922 "ldr x9, [%x[output_ptr], #0x18]\n"
923 "ldr x27, [%x[output_ptr], #0x20]\n"
924 "add x11, x11, x19, LSL #2\n"
925 "ldr x25, [%x[output_ptr], #0x28]\n"
926 "add x9, x9, x19, LSL #2\n"
927 "add x27, x27, x19, LSL #2\n"
928 "add x25, x25, x19, LSL #2\n"
929 "b 73f\n"
930 "72:" // Height 6: setup direct output
931 "mov x17, %x[output_ptr]\n"
932 "add x13, x17, x19, LSL #2\n"
933 "add x11, x13, x19, LSL #2\n"
934 "add x9, x11, x19, LSL #2\n"
935 "add x27, x9, x19, LSL #2\n"
936 "add x25, x27, x19, LSL #2\n"
937 "73:" // Height 6: Column loop
938 "mov x19, #0x0\n"
939 "whilelt p1.s, x19, x6\n"
940 "cbz x8, 74f\n"
941 "ld1w { z24.s }, p2/Z, [x8]\n"
942 "mov z25.d, z24.d\n"
943 "addvl x8, x8, #1\n"
944 "mov z26.d, z24.d\n"
945 "mov z27.d, z24.d\n"
946 "mov z28.d, z24.d\n"
947 "mov z29.d, z24.d\n"
948 "b 76f\n"
949 "74:" // Height 6: no bias
950 "tbz %x[flags], #0, 75f\n"
951 "ld1w { z24.s }, p1/Z, [x17]\n"
952 "ld1w { z25.s }, p1/Z, [x13]\n"
953 "ld1w { z26.s }, p1/Z, [x11]\n"
954 "ld1w { z27.s }, p1/Z, [x9]\n"
955 "ld1w { z28.s }, p1/Z, [x27]\n"
956 "ld1w { z29.s }, p1/Z, [x25]\n"
957 "b 76f\n"
958 "75:" // Height 6: no accumulate
959 "mov z24.b, #0x0\n"
960 "mov z25.b, #0x0\n"
961 "mov z26.b, #0x0\n"
962 "mov z27.b, #0x0\n"
963 "mov z28.b, #0x0\n"
964 "mov z29.b, #0x0\n"
965 "76:" // Height 6: setup done
966 "mov x16, #0x0\n"
967 "77:" // Height 6: String loop
968 "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
969 "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
970 "ldr w15, [x20, x16, LSL #0x2]\n"
971 "tbz %x[flags], #3, 78f\n"
972 "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n"
973 "add x20, x20, x19, LSL #3\n"
974 "ldr x14, [x20, #0x0]\n"
975 "ldr x12, [x20, #0x8]\n"
976 "ldr x10, [x20, #0x10]\n"
977 "ldr x28, [x20, #0x18]\n"
978 "ldr x26, [x20, #0x20]\n"
979 "ldr x24, [x20, #0x28]\n"
980 "cbnz x16, 79f\n"
981 "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
982 "add x14, x14, x19, LSL #2\n"
983 "add x12, x12, x19, LSL #2\n"
984 "add x10, x10, x19, LSL #2\n"
985 "add x28, x28, x19, LSL #2\n"
986 "add x26, x26, x19, LSL #2\n"
987 "add x24, x24, x19, LSL #2\n"
988 "b 79f\n"
989 "78:" // Height 6: setup direct input
990 "mov x14, %x[input_ptr]\n"
991 "add x12, x14, x19, LSL #2\n"
992 "add x10, x12, x19, LSL #2\n"
993 "add x28, x10, x19, LSL #2\n"
994 "add x26, x28, x19, LSL #2\n"
995 "add x24, x26, x19, LSL #2\n"
996 "79:" // Height 6: input setup done
997 "cmp x15, #0x4\n"
998 "ble 81f\n"
999 "80:" // Height 6: Multiply loop: Main loop head
1000 "ld1w { z8.s }, p2/Z, [x7]\n"
1001 "whilelt p0.s, XZR, x15\n"
1002 "ld1w { z9.s }, p2/Z, [x7, #1, MUL VL]\n"
1003 "sub x15, x15, #0x4\n"
1004 "ld1rqw { z0.s }, p0/Z, [x14]\n"
1005 "fmla z24.s, z8.s, z0.s[0]\n"
1006 "ld1rqw { z1.s }, p0/Z, [x12]\n"
1007 "add x14, x14, #0x10\n"
1008 "fmla z25.s, z8.s, z1.s[0]\n"
1009 "ld1rqw { z2.s }, p0/Z, [x10]\n"
1010 "add x12, x12, #0x10\n"
1011 "fmla z24.s, z9.s, z0.s[1]\n"
1012 "ld1rqw { z3.s }, p0/Z, [x28]\n"
1013 "add x10, x10, #0x10\n"
1014 "fmla z26.s, z8.s, z2.s[0]\n"
1015 "ld1rqw { z4.s }, p0/Z, [x26]\n"
1016 "add x28, x28, #0x10\n"
1017 "fmla z27.s, z8.s, z3.s[0]\n"
1018 "ld1rqw { z5.s }, p0/Z, [x24]\n"
1019 "add x26, x26, #0x10\n"
1020 "fmla z25.s, z9.s, z1.s[1]\n"
1021 "ld1w { z10.s }, p2/Z, [x7, #2, MUL VL]\n"
1022 "add x24, x24, #0x10\n"
1023 "fmla z28.s, z8.s, z4.s[0]\n"
1024 "ld1w { z11.s }, p2/Z, [x7, #3, MUL VL]\n"
1025 "cmp x15, #0x4\n"
1026 "fmla z29.s, z8.s, z5.s[0]\n"
1027 "prfm pldl1keep, [x14, #0x80]\n"
1028 "addvl x7, x7, #4\n"
1029 "fmla z26.s, z9.s, z2.s[1]\n"
1030 "prfm pldl1keep, [x12, #0x80]\n"
1031 "fmla z27.s, z9.s, z3.s[1]\n"
1032 "prfm pldl1keep, [x10, #0x80]\n"
1033 "fmla z24.s, z10.s, z0.s[2]\n"
1034 "prfm pldl1keep, [x28, #0x80]\n"
1035 "fmla z28.s, z9.s, z4.s[1]\n"
1036 "prfm pldl1keep, [x26, #0x80]\n"
1037 "fmla z29.s, z9.s, z5.s[1]\n"
1038 "prfm pldl1keep, [x24, #0x80]\n"
1039 "fmla z25.s, z10.s, z1.s[2]\n"
1040 "fmla z26.s, z10.s, z2.s[2]\n"
1041 "fmla z27.s, z10.s, z3.s[2]\n"
1042 "fmla z28.s, z10.s, z4.s[2]\n"
1043 "fmla z29.s, z10.s, z5.s[2]\n"
1044 "fmla z24.s, z11.s, z0.s[3]\n"
1045 "fmla z25.s, z11.s, z1.s[3]\n"
1046 "fmla z26.s, z11.s, z2.s[3]\n"
1047 "fmla z27.s, z11.s, z3.s[3]\n"
1048 "fmla z28.s, z11.s, z4.s[3]\n"
1049 "fmla z29.s, z11.s, z5.s[3]\n"
1050 "bgt 80b\n"
1051 "81:" // Height 6: Multiply loop: Single iteration only
1052 "ld1w { z12.s }, p2/Z, [x7]\n"
1053 "whilelt p0.s, XZR, x15\n"
1054 "subs x15, x15, #0x1\n"
1055 "ld1rqw { z0.s }, p0/Z, [x14]\n"
1056 "fmla z24.s, z12.s, z0.s[0]\n"
1057 "ld1rqw { z1.s }, p0/Z, [x12]\n"
1058 "add x14, x14, #0x10\n"
1059 "fmla z25.s, z12.s, z1.s[0]\n"
1060 "ld1rqw { z2.s }, p0/Z, [x10]\n"
1061 "add x12, x12, #0x10\n"
1062 "fmla z26.s, z12.s, z2.s[0]\n"
1063 "ld1rqw { z3.s }, p0/Z, [x28]\n"
1064 "add x10, x10, #0x10\n"
1065 "fmla z27.s, z12.s, z3.s[0]\n"
1066 "ld1rqw { z4.s }, p0/Z, [x26]\n"
1067 "add x28, x28, #0x10\n"
1068 "fmla z28.s, z12.s, z4.s[0]\n"
1069 "ld1rqw { z5.s }, p0/Z, [x24]\n"
1070 "add x26, x26, #0x10\n"
1071 "fmla z29.s, z12.s, z5.s[0]\n"
1072 "add x24, x24, #0x10\n"
1073 "addvl x7, x7, #1\n"
1074 "ble 82f\n"
1075 "ld1w { z13.s }, p2/Z, [x7]\n"
1076 "fmla z24.s, z13.s, z0.s[1]\n"
1077 "subs x15, x15, #0x1\n"
1078 "fmla z25.s, z13.s, z1.s[1]\n"
1079 "addvl x7, x7, #1\n"
1080 "fmla z26.s, z13.s, z2.s[1]\n"
1081 "fmla z27.s, z13.s, z3.s[1]\n"
1082 "fmla z28.s, z13.s, z4.s[1]\n"
1083 "fmla z29.s, z13.s, z5.s[1]\n"
1084 "ble 82f\n"
1085 "ld1w { z14.s }, p2/Z, [x7]\n"
1086 "fmla z24.s, z14.s, z0.s[2]\n"
1087 "subs x15, x15, #0x1\n"
1088 "fmla z25.s, z14.s, z1.s[2]\n"
1089 "addvl x7, x7, #1\n"
1090 "fmla z26.s, z14.s, z2.s[2]\n"
1091 "fmla z27.s, z14.s, z3.s[2]\n"
1092 "fmla z28.s, z14.s, z4.s[2]\n"
1093 "fmla z29.s, z14.s, z5.s[2]\n"
1094 "ble 82f\n"
1095 "ld1w { z15.s }, p2/Z, [x7]\n"
1096 "fmla z24.s, z15.s, z0.s[3]\n"
1097 "addvl x7, x7, #1\n"
1098 "fmla z25.s, z15.s, z1.s[3]\n"
1099 "fmla z26.s, z15.s, z2.s[3]\n"
1100 "fmla z27.s, z15.s, z3.s[3]\n"
1101 "fmla z28.s, z15.s, z4.s[3]\n"
1102 "fmla z29.s, z15.s, z5.s[3]\n"
1103 "82:" // Height 6: Multiply loop: multiply skip
1104 "prfm pldl1keep, [x14, #0x80]\n"
1105 "add x16, x16, #0x1\n"
1106 "prfm pldl1keep, [x12, #0x80]\n"
1107 "prfm pldl1keep, [x10, #0x80]\n"
1108 "prfm pldl1keep, [x28, #0x80]\n"
1109 "prfm pldl1keep, [x26, #0x80]\n"
1110 "prfm pldl1keep, [x24, #0x80]\n"
1111 "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
1112 "cmp x16, x19\n"
1113 "bne 77b\n"
1114 "prfm pstl1keep, [x17, #0x0]\n"
1115 "prfm pstl1keep, [x13, #0x0]\n"
1116 "prfm pstl1keep, [x11, #0x0]\n"
1117 "prfm pstl1keep, [x9, #0x0]\n"
1118 "prfm pstl1keep, [x27, #0x0]\n"
1119 "prfm pstl1keep, [x25, #0x0]\n"
1120 "tbz %x[flags], #1, 83f\n"
1121 "add x19, %x[args_ptr], %[offset_min]\n"
1122 "ld1rw { z17.s }, p2/Z, [x19]\n"
1123 "add x19, %x[args_ptr], %[offset_max]\n"
1124 "ld1rw { z16.s }, p2/Z, [x19]\n"
1125 "fmin z24.s, p2/M, z24.s, z16.s\n"
1126 "fmin z25.s, p2/M, z25.s, z16.s\n"
1127 "fmin z26.s, p2/M, z26.s, z16.s\n"
1128 "fmin z27.s, p2/M, z27.s, z16.s\n"
1129 "fmin z28.s, p2/M, z28.s, z16.s\n"
1130 "fmax z24.s, p2/M, z24.s, z17.s\n"
1131 "fmax z25.s, p2/M, z25.s, z17.s\n"
1132 "fmax z26.s, p2/M, z26.s, z17.s\n"
1133 "fmax z27.s, p2/M, z27.s, z17.s\n"
1134 "fmax z28.s, p2/M, z28.s, z17.s\n"
1135 "fmin z29.s, p2/M, z29.s, z16.s\n"
1136 "fmax z29.s, p2/M, z29.s, z17.s\n"
1137 "83:" // Height 6: No activation
1138 "st1w { z24.s }, p1, [x17]\n"
1139 "addvl x17, x17, #1\n"
1140 "st1w { z25.s }, p1, [x13]\n"
1141 "addvl x13, x13, #1\n"
1142 "st1w { z26.s }, p1, [x11]\n"
1143 "addvl x11, x11, #1\n"
1144 "st1w { z27.s }, p1, [x9]\n"
1145 "addvl x9, x9, #1\n"
1146 "st1w { z28.s }, p1, [x27]\n"
1147 "addvl x27, x27, #1\n"
1148 "st1w { z29.s }, p1, [x25]\n"
1149 "addvl x25, x25, #1\n"
1150 "84:" // Height 6: Writeback done
1151 "mov x19, #0x0\n"
1152 "incw x19\n"
1153 "subs x6, x6, x19\n"
1154 "bgt 73b\n"
1155 "b 114f\n"
1156 "85:" // Height 7
1157 "ldr x6, [%x[args_ptr], %[offsetof_N]]\n"
1158 "mov x8, %x[bias]\n"
1159 "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n"
1160 "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
1161 "tbz %x[flags], #2, 86f\n"
1162 "ldr x17, [%x[output_ptr], #0x0]\n"
1163 "add x17, x17, x19, LSL #2\n"
1164 "ldr x13, [%x[output_ptr], #0x8]\n"
1165 "ldr x11, [%x[output_ptr], #0x10]\n"
1166 "add x13, x13, x19, LSL #2\n"
1167 "ldr x9, [%x[output_ptr], #0x18]\n"
1168 "ldr x27, [%x[output_ptr], #0x20]\n"
1169 "add x11, x11, x19, LSL #2\n"
1170 "ldr x25, [%x[output_ptr], #0x28]\n"
1171 "add x9, x9, x19, LSL #2\n"
1172 "ldr x23, [%x[output_ptr], #0x30]\n"
1173 "add x27, x27, x19, LSL #2\n"
1174 "add x25, x25, x19, LSL #2\n"
1175 "add x23, x23, x19, LSL #2\n"
1176 "b 87f\n"
1177 "86:" // Height 7: setup direct output
1178 "mov x17, %x[output_ptr]\n"
1179 "add x13, x17, x19, LSL #2\n"
1180 "add x11, x13, x19, LSL #2\n"
1181 "add x9, x11, x19, LSL #2\n"
1182 "add x27, x9, x19, LSL #2\n"
1183 "add x25, x27, x19, LSL #2\n"
1184 "add x23, x25, x19, LSL #2\n"
1185 "87:" // Height 7: Column loop
1186 "mov x19, #0x0\n"
1187 "whilelt p1.s, x19, x6\n"
1188 "cbz x8, 88f\n"
1189 "ld1w { z24.s }, p2/Z, [x8]\n"
1190 "mov z25.d, z24.d\n"
1191 "addvl x8, x8, #1\n"
1192 "mov z26.d, z24.d\n"
1193 "mov z27.d, z24.d\n"
1194 "mov z28.d, z24.d\n"
1195 "mov z29.d, z24.d\n"
1196 "mov z30.d, z24.d\n"
1197 "b 90f\n"
1198 "88:" // Height 7: no bias
1199 "tbz %x[flags], #0, 89f\n"
1200 "ld1w { z24.s }, p1/Z, [x17]\n"
1201 "ld1w { z25.s }, p1/Z, [x13]\n"
1202 "ld1w { z26.s }, p1/Z, [x11]\n"
1203 "ld1w { z27.s }, p1/Z, [x9]\n"
1204 "ld1w { z28.s }, p1/Z, [x27]\n"
1205 "ld1w { z29.s }, p1/Z, [x25]\n"
1206 "ld1w { z30.s }, p1/Z, [x23]\n"
1207 "b 90f\n"
1208 "89:" // Height 7: no accumulate
1209 "mov z24.b, #0x0\n"
1210 "mov z25.b, #0x0\n"
1211 "mov z26.b, #0x0\n"
1212 "mov z27.b, #0x0\n"
1213 "mov z28.b, #0x0\n"
1214 "mov z29.b, #0x0\n"
1215 "mov z30.b, #0x0\n"
1216 "90:" // Height 7: setup done
1217 "mov x16, #0x0\n"
1218 "91:" // Height 7: String loop
1219 "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
1220 "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
1221 "ldr w15, [x20, x16, LSL #0x2]\n"
1222 "tbz %x[flags], #3, 92f\n"
1223 "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n"
1224 "add x20, x20, x19, LSL #3\n"
1225 "ldr x14, [x20, #0x0]\n"
1226 "ldr x12, [x20, #0x8]\n"
1227 "ldr x10, [x20, #0x10]\n"
1228 "ldr x28, [x20, #0x18]\n"
1229 "ldr x26, [x20, #0x20]\n"
1230 "ldr x24, [x20, #0x28]\n"
1231 "ldr x22, [x20, #0x30]\n"
1232 "cbnz x16, 93f\n"
1233 "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
1234 "add x14, x14, x19, LSL #2\n"
1235 "add x12, x12, x19, LSL #2\n"
1236 "add x10, x10, x19, LSL #2\n"
1237 "add x28, x28, x19, LSL #2\n"
1238 "add x26, x26, x19, LSL #2\n"
1239 "add x24, x24, x19, LSL #2\n"
1240 "add x22, x22, x19, LSL #2\n"
1241 "b 93f\n"
1242 "92:" // Height 7: setup direct input
1243 "mov x14, %x[input_ptr]\n"
1244 "add x12, x14, x19, LSL #2\n"
1245 "add x10, x12, x19, LSL #2\n"
1246 "add x28, x10, x19, LSL #2\n"
1247 "add x26, x28, x19, LSL #2\n"
1248 "add x24, x26, x19, LSL #2\n"
1249 "add x22, x24, x19, LSL #2\n"
1250 "93:" // Height 7: input setup done
1251 "cmp x15, #0x4\n"
1252 "ble 95f\n"
1253 "94:" // Height 7: Multiply loop: Main loop head
1254 "ld1w { z8.s }, p2/Z, [x7]\n"
1255 "whilelt p0.s, XZR, x15\n"
1256 "ld1w { z9.s }, p2/Z, [x7, #1, MUL VL]\n"
1257 "sub x15, x15, #0x4\n"
1258 "ld1rqw { z0.s }, p0/Z, [x14]\n"
1259 "fmla z24.s, z8.s, z0.s[0]\n"
1260 "ld1rqw { z1.s }, p0/Z, [x12]\n"
1261 "add x14, x14, #0x10\n"
1262 "fmla z25.s, z8.s, z1.s[0]\n"
1263 "ld1rqw { z2.s }, p0/Z, [x10]\n"
1264 "add x12, x12, #0x10\n"
1265 "fmla z24.s, z9.s, z0.s[1]\n"
1266 "ld1rqw { z3.s }, p0/Z, [x28]\n"
1267 "add x10, x10, #0x10\n"
1268 "fmla z26.s, z8.s, z2.s[0]\n"
1269 "ld1rqw { z4.s }, p0/Z, [x26]\n"
1270 "add x28, x28, #0x10\n"
1271 "fmla z27.s, z8.s, z3.s[0]\n"
1272 "ld1rqw { z5.s }, p0/Z, [x24]\n"
1273 "add x26, x26, #0x10\n"
1274 "fmla z25.s, z9.s, z1.s[1]\n"
1275 "ld1rqw { z6.s }, p0/Z, [x22]\n"
1276 "add x24, x24, #0x10\n"
1277 "fmla z28.s, z8.s, z4.s[0]\n"
1278 "ld1w { z10.s }, p2/Z, [x7, #2, MUL VL]\n"
1279 "add x22, x22, #0x10\n"
1280 "fmla z29.s, z8.s, z5.s[0]\n"
1281 "ld1w { z11.s }, p2/Z, [x7, #3, MUL VL]\n"
1282 "cmp x15, #0x4\n"
1283 "fmla z30.s, z8.s, z6.s[0]\n"
1284 "prfm pldl1keep, [x14, #0x80]\n"
1285 "addvl x7, x7, #4\n"
1286 "fmla z26.s, z9.s, z2.s[1]\n"
1287 "prfm pldl1keep, [x12, #0x80]\n"
1288 "fmla z27.s, z9.s, z3.s[1]\n"
1289 "prfm pldl1keep, [x10, #0x80]\n"
1290 "fmla z28.s, z9.s, z4.s[1]\n"
1291 "prfm pldl1keep, [x28, #0x80]\n"
1292 "fmla z29.s, z9.s, z5.s[1]\n"
1293 "prfm pldl1keep, [x26, #0x80]\n"
1294 "fmla z30.s, z9.s, z6.s[1]\n"
1295 "prfm pldl1keep, [x24, #0x80]\n"
1296 "fmla z24.s, z10.s, z0.s[2]\n"
1297 "prfm pldl1keep, [x22, #0x80]\n"
1298 "fmla z25.s, z10.s, z1.s[2]\n"
1299 "fmla z26.s, z10.s, z2.s[2]\n"
1300 "fmla z27.s, z10.s, z3.s[2]\n"
1301 "fmla z28.s, z10.s, z4.s[2]\n"
1302 "fmla z29.s, z10.s, z5.s[2]\n"
1303 "fmla z30.s, z10.s, z6.s[2]\n"
1304 "fmla z24.s, z11.s, z0.s[3]\n"
1305 "fmla z25.s, z11.s, z1.s[3]\n"
1306 "fmla z26.s, z11.s, z2.s[3]\n"
1307 "fmla z27.s, z11.s, z3.s[3]\n"
1308 "fmla z28.s, z11.s, z4.s[3]\n"
1309 "fmla z29.s, z11.s, z5.s[3]\n"
1310 "fmla z30.s, z11.s, z6.s[3]\n"
1311 "bgt 94b\n"
1312 "95:" // Height 7: Multiply loop: Single iteration only
1313 "ld1w { z12.s }, p2/Z, [x7]\n"
1314 "whilelt p0.s, XZR, x15\n"
1315 "subs x15, x15, #0x1\n"
1316 "ld1rqw { z0.s }, p0/Z, [x14]\n"
1317 "fmla z24.s, z12.s, z0.s[0]\n"
1318 "ld1rqw { z1.s }, p0/Z, [x12]\n"
1319 "add x14, x14, #0x10\n"
1320 "fmla z25.s, z12.s, z1.s[0]\n"
1321 "ld1rqw { z2.s }, p0/Z, [x10]\n"
1322 "add x12, x12, #0x10\n"
1323 "fmla z26.s, z12.s, z2.s[0]\n"
1324 "ld1rqw { z3.s }, p0/Z, [x28]\n"
1325 "add x10, x10, #0x10\n"
1326 "fmla z27.s, z12.s, z3.s[0]\n"
1327 "ld1rqw { z4.s }, p0/Z, [x26]\n"
1328 "add x28, x28, #0x10\n"
1329 "fmla z28.s, z12.s, z4.s[0]\n"
1330 "ld1rqw { z5.s }, p0/Z, [x24]\n"
1331 "add x26, x26, #0x10\n"
1332 "fmla z29.s, z12.s, z5.s[0]\n"
1333 "ld1rqw { z6.s }, p0/Z, [x22]\n"
1334 "add x24, x24, #0x10\n"
1335 "fmla z30.s, z12.s, z6.s[0]\n"
1336 "add x22, x22, #0x10\n"
1337 "addvl x7, x7, #1\n"
1338 "ble 96f\n"
1339 "ld1w { z13.s }, p2/Z, [x7]\n"
1340 "fmla z24.s, z13.s, z0.s[1]\n"
1341 "subs x15, x15, #0x1\n"
1342 "fmla z25.s, z13.s, z1.s[1]\n"
1343 "addvl x7, x7, #1\n"
1344 "fmla z26.s, z13.s, z2.s[1]\n"
1345 "fmla z27.s, z13.s, z3.s[1]\n"
1346 "fmla z28.s, z13.s, z4.s[1]\n"
1347 "fmla z29.s, z13.s, z5.s[1]\n"
1348 "fmla z30.s, z13.s, z6.s[1]\n"
1349 "ble 96f\n"
1350 "ld1w { z14.s }, p2/Z, [x7]\n"
1351 "fmla z24.s, z14.s, z0.s[2]\n"
1352 "subs x15, x15, #0x1\n"
1353 "fmla z25.s, z14.s, z1.s[2]\n"
1354 "addvl x7, x7, #1\n"
1355 "fmla z26.s, z14.s, z2.s[2]\n"
1356 "fmla z27.s, z14.s, z3.s[2]\n"
1357 "fmla z28.s, z14.s, z4.s[2]\n"
1358 "fmla z29.s, z14.s, z5.s[2]\n"
1359 "fmla z30.s, z14.s, z6.s[2]\n"
1360 "ble 96f\n"
1361 "ld1w { z15.s }, p2/Z, [x7]\n"
1362 "fmla z24.s, z15.s, z0.s[3]\n"
1363 "addvl x7, x7, #1\n"
1364 "fmla z25.s, z15.s, z1.s[3]\n"
1365 "fmla z26.s, z15.s, z2.s[3]\n"
1366 "fmla z27.s, z15.s, z3.s[3]\n"
1367 "fmla z28.s, z15.s, z4.s[3]\n"
1368 "fmla z29.s, z15.s, z5.s[3]\n"
1369 "fmla z30.s, z15.s, z6.s[3]\n"
1370 "96:" // Height 7: Multiply loop: multiply skip
1371 "prfm pldl1keep, [x14, #0x80]\n"
1372 "add x16, x16, #0x1\n"
1373 "prfm pldl1keep, [x12, #0x80]\n"
1374 "prfm pldl1keep, [x10, #0x80]\n"
1375 "prfm pldl1keep, [x28, #0x80]\n"
1376 "prfm pldl1keep, [x26, #0x80]\n"
1377 "prfm pldl1keep, [x24, #0x80]\n"
1378 "prfm pldl1keep, [x22, #0x80]\n"
1379 "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
1380 "cmp x16, x19\n"
1381 "bne 91b\n"
1382 "prfm pstl1keep, [x17, #0x0]\n"
1383 "prfm pstl1keep, [x13, #0x0]\n"
1384 "prfm pstl1keep, [x11, #0x0]\n"
1385 "prfm pstl1keep, [x9, #0x0]\n"
1386 "prfm pstl1keep, [x27, #0x0]\n"
1387 "prfm pstl1keep, [x25, #0x0]\n"
1388 "prfm pstl1keep, [x23, #0x0]\n"
1389 "tbz %x[flags], #1, 97f\n"
1390 "add x19, %x[args_ptr], %[offset_min]\n"
1391 "ld1rw { z17.s }, p2/Z, [x19]\n"
1392 "add x19, %x[args_ptr], %[offset_max]\n"
1393 "ld1rw { z16.s }, p2/Z, [x19]\n"
1394 "fmin z24.s, p2/M, z24.s, z16.s\n"
1395 "fmin z25.s, p2/M, z25.s, z16.s\n"
1396 "fmin z26.s, p2/M, z26.s, z16.s\n"
1397 "fmin z27.s, p2/M, z27.s, z16.s\n"
1398 "fmin z28.s, p2/M, z28.s, z16.s\n"
1399 "fmax z24.s, p2/M, z24.s, z17.s\n"
1400 "fmax z25.s, p2/M, z25.s, z17.s\n"
1401 "fmax z26.s, p2/M, z26.s, z17.s\n"
1402 "fmax z27.s, p2/M, z27.s, z17.s\n"
1403 "fmax z28.s, p2/M, z28.s, z17.s\n"
1404 "fmin z29.s, p2/M, z29.s, z16.s\n"
1405 "fmin z30.s, p2/M, z30.s, z16.s\n"
1406 "fmax z29.s, p2/M, z29.s, z17.s\n"
1407 "fmax z30.s, p2/M, z30.s, z17.s\n"
1408 "97:" // Height 7: No activation
1409 "st1w { z24.s }, p1, [x17]\n"
1410 "addvl x17, x17, #1\n"
1411 "st1w { z25.s }, p1, [x13]\n"
1412 "addvl x13, x13, #1\n"
1413 "st1w { z26.s }, p1, [x11]\n"
1414 "addvl x11, x11, #1\n"
1415 "st1w { z27.s }, p1, [x9]\n"
1416 "addvl x9, x9, #1\n"
1417 "st1w { z28.s }, p1, [x27]\n"
1418 "addvl x27, x27, #1\n"
1419 "st1w { z29.s }, p1, [x25]\n"
1420 "addvl x25, x25, #1\n"
1421 "st1w { z30.s }, p1, [x23]\n"
1422 "addvl x23, x23, #1\n"
1423 "98:" // Height 7: Writeback done
1424 "mov x19, #0x0\n"
1425 "incw x19\n"
1426 "subs x6, x6, x19\n"
1427 "bgt 87b\n"
1428 "b 114f\n"
1429 "99:" // Height 8
1430 "ldr x6, [%x[args_ptr], %[offsetof_N]]\n"
1431 "mov x8, %x[bias]\n"
1432 "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n"
1433 "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
1434 "tbz %x[flags], #2, 100f\n"
1435 "ldr x17, [%x[output_ptr], #0x0]\n"
1436 "add x17, x17, x19, LSL #2\n"
1437 "ldr x13, [%x[output_ptr], #0x8]\n"
1438 "ldr x11, [%x[output_ptr], #0x10]\n"
1439 "add x13, x13, x19, LSL #2\n"
1440 "ldr x9, [%x[output_ptr], #0x18]\n"
1441 "ldr x27, [%x[output_ptr], #0x20]\n"
1442 "add x11, x11, x19, LSL #2\n"
1443 "ldr x25, [%x[output_ptr], #0x28]\n"
1444 "add x9, x9, x19, LSL #2\n"
1445 "ldr x23, [%x[output_ptr], #0x30]\n"
1446 "ldr x21, [%x[output_ptr], #0x38]\n"
1447 "add x27, x27, x19, LSL #2\n"
1448 "add x25, x25, x19, LSL #2\n"
1449 "add %x[output_ptr], %x[output_ptr], #0x40\n"
1450 "add x23, x23, x19, LSL #2\n"
1451 "add x21, x21, x19, LSL #2\n"
1452 "b 101f\n"
1453 "100:" // Height 8: setup direct output
1454 "mov x17, %x[output_ptr]\n"
1455 "add x13, x17, x19, LSL #2\n"
1456 "add x11, x13, x19, LSL #2\n"
1457 "add x9, x11, x19, LSL #2\n"
1458 "add x27, x9, x19, LSL #2\n"
1459 "add x25, x27, x19, LSL #2\n"
1460 "add x23, x25, x19, LSL #2\n"
1461 "add x21, x23, x19, LSL #2\n"
1462 "add %x[output_ptr], x21, x19, LSL #2\n"
1463 "101:" // Height 8: Column loop
1464 "mov x19, #0x0\n"
1465 "whilelt p1.s, x19, x6\n"
1466 "cbz x8, 102f\n"
1467 "ld1w { z24.s }, p2/Z, [x8]\n"
1468 "mov z25.d, z24.d\n"
1469 "addvl x8, x8, #1\n"
1470 "mov z26.d, z24.d\n"
1471 "mov z27.d, z24.d\n"
1472 "mov z28.d, z24.d\n"
1473 "mov z29.d, z24.d\n"
1474 "mov z30.d, z24.d\n"
1475 "mov z31.d, z24.d\n"
1476 "b 104f\n"
1477 "102:" // Height 8: no bias
1478 "tbz %x[flags], #0, 103f\n"
1479 "ld1w { z24.s }, p1/Z, [x17]\n"
1480 "ld1w { z25.s }, p1/Z, [x13]\n"
1481 "ld1w { z26.s }, p1/Z, [x11]\n"
1482 "ld1w { z27.s }, p1/Z, [x9]\n"
1483 "ld1w { z28.s }, p1/Z, [x27]\n"
1484 "ld1w { z29.s }, p1/Z, [x25]\n"
1485 "ld1w { z30.s }, p1/Z, [x23]\n"
1486 "ld1w { z31.s }, p1/Z, [x21]\n"
1487 "b 104f\n"
1488 "103:" // Height 8: no accumulate
1489 "mov z24.b, #0x0\n"
1490 "mov z25.b, #0x0\n"
1491 "mov z26.b, #0x0\n"
1492 "mov z27.b, #0x0\n"
1493 "mov z28.b, #0x0\n"
1494 "mov z29.b, #0x0\n"
1495 "mov z30.b, #0x0\n"
1496 "mov z31.b, #0x0\n"
1497 "104:" // Height 8: setup done
1498 "mov x16, #0x0\n"
1499 "105:" // Height 8: String loop
1500 "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
1501 "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
1502 "ldr w15, [x20, x16, LSL #0x2]\n"
1503 "tbz %x[flags], #3, 106f\n"
1504 "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n"
1505 "add x20, x20, x19, LSL #3\n"
1506 "ldr x14, [x20, #0x0]\n"
1507 "ldr x12, [x20, #0x8]\n"
1508 "ldr x10, [x20, #0x10]\n"
1509 "ldr x28, [x20, #0x18]\n"
1510 "ldr x26, [x20, #0x20]\n"
1511 "ldr x24, [x20, #0x28]\n"
1512 "ldr x22, [x20, #0x30]\n"
1513 "ldr x20, [x20, #0x38]\n"
1514 "cbnz x16, 107f\n"
1515 "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
1516 "add x14, x14, x19, LSL #2\n"
1517 "add x12, x12, x19, LSL #2\n"
1518 "add x10, x10, x19, LSL #2\n"
1519 "add x28, x28, x19, LSL #2\n"
1520 "add x26, x26, x19, LSL #2\n"
1521 "add x24, x24, x19, LSL #2\n"
1522 "add x22, x22, x19, LSL #2\n"
1523 "add x20, x20, x19, LSL #2\n"
1524 "b 107f\n"
1525 "106:" // Height 8: setup direct input
1526 "mov x14, %x[input_ptr]\n"
1527 "add x12, x14, x19, LSL #2\n"
1528 "add x10, x12, x19, LSL #2\n"
1529 "add x28, x10, x19, LSL #2\n"
1530 "add x26, x28, x19, LSL #2\n"
1531 "add x24, x26, x19, LSL #2\n"
1532 "add x22, x24, x19, LSL #2\n"
1533 "add x20, x22, x19, LSL #2\n"
1534 "107:" // Height 8: input setup done
1535 "cmp x15, #0x4\n"
1536 "ble 109f\n"
1537 "108:" // Height 8: Multiply loop: Main loop head
1538 "ld1w { z8.s }, p2/Z, [x7]\n"
1539 "whilelt p0.s, XZR, x15\n"
1540 "ld1w { z9.s }, p2/Z, [x7, #1, MUL VL]\n"
1541 "sub x15, x15, #0x4\n"
1542 "ld1rqw { z0.s }, p0/Z, [x14]\n"
1543 "fmla z24.s, z8.s, z0.s[0]\n"
1544 "ld1rqw { z1.s }, p0/Z, [x12]\n"
1545 "add x14, x14, #0x10\n"
1546 "fmla z25.s, z8.s, z1.s[0]\n"
1547 "ld1rqw { z2.s }, p0/Z, [x10]\n"
1548 "add x12, x12, #0x10\n"
1549 "fmla z24.s, z9.s, z0.s[1]\n"
1550 "ld1rqw { z3.s }, p0/Z, [x28]\n"
1551 "add x10, x10, #0x10\n"
1552 "fmla z26.s, z8.s, z2.s[0]\n"
1553 "ld1rqw { z4.s }, p0/Z, [x26]\n"
1554 "add x28, x28, #0x10\n"
1555 "fmla z27.s, z8.s, z3.s[0]\n"
1556 "ld1rqw { z5.s }, p0/Z, [x24]\n"
1557 "add x26, x26, #0x10\n"
1558 "fmla z25.s, z9.s, z1.s[1]\n"
1559 "ld1rqw { z6.s }, p0/Z, [x22]\n"
1560 "add x24, x24, #0x10\n"
1561 "fmla z28.s, z8.s, z4.s[0]\n"
1562 "ld1rqw { z7.s }, p0/Z, [x20]\n"
1563 "add x22, x22, #0x10\n"
1564 "fmla z29.s, z8.s, z5.s[0]\n"
1565 "ld1w { z10.s }, p2/Z, [x7, #2, MUL VL]\n"
1566 "add x20, x20, #0x10\n"
1567 "fmla z30.s, z8.s, z6.s[0]\n"
1568 "ld1w { z11.s }, p2/Z, [x7, #3, MUL VL]\n"
1569 "cmp x15, #0x4\n"
1570 "fmla z31.s, z8.s, z7.s[0]\n"
1571 "prfm pldl1keep, [x14, #0x80]\n"
1572 "addvl x7, x7, #4\n"
1573 "fmla z26.s, z9.s, z2.s[1]\n"
1574 "prfm pldl1keep, [x12, #0x80]\n"
1575 "fmla z27.s, z9.s, z3.s[1]\n"
1576 "prfm pldl1keep, [x10, #0x80]\n"
1577 "fmla z28.s, z9.s, z4.s[1]\n"
1578 "prfm pldl1keep, [x28, #0x80]\n"
1579 "fmla z29.s, z9.s, z5.s[1]\n"
1580 "prfm pldl1keep, [x26, #0x80]\n"
1581 "fmla z30.s, z9.s, z6.s[1]\n"
1582 "prfm pldl1keep, [x24, #0x80]\n"
1583 "fmla z31.s, z9.s, z7.s[1]\n"
1584 "prfm pldl1keep, [x22, #0x80]\n"
1585 "fmla z24.s, z10.s, z0.s[2]\n"
1586 "prfm pldl1keep, [x20, #0x80]\n"
1587 "fmla z25.s, z10.s, z1.s[2]\n"
1588 "fmla z26.s, z10.s, z2.s[2]\n"
1589 "fmla z27.s, z10.s, z3.s[2]\n"
1590 "fmla z28.s, z10.s, z4.s[2]\n"
1591 "fmla z29.s, z10.s, z5.s[2]\n"
1592 "fmla z30.s, z10.s, z6.s[2]\n"
1593 "fmla z31.s, z10.s, z7.s[2]\n"
1594 "fmla z24.s, z11.s, z0.s[3]\n"
1595 "fmla z25.s, z11.s, z1.s[3]\n"
1596 "fmla z26.s, z11.s, z2.s[3]\n"
1597 "fmla z27.s, z11.s, z3.s[3]\n"
1598 "fmla z28.s, z11.s, z4.s[3]\n"
1599 "fmla z29.s, z11.s, z5.s[3]\n"
1600 "fmla z30.s, z11.s, z6.s[3]\n"
1601 "fmla z31.s, z11.s, z7.s[3]\n"
1602 "bgt 108b\n"
1603 "109:" // Height 8: Multiply loop: Single iteration only
1604 "ld1w { z12.s }, p2/Z, [x7]\n"
1605 "whilelt p0.s, XZR, x15\n"
1606 "subs x15, x15, #0x1\n"
1607 "ld1rqw { z0.s }, p0/Z, [x14]\n"
1608 "fmla z24.s, z12.s, z0.s[0]\n"
1609 "ld1rqw { z1.s }, p0/Z, [x12]\n"
1610 "add x14, x14, #0x10\n"
1611 "fmla z25.s, z12.s, z1.s[0]\n"
1612 "ld1rqw { z2.s }, p0/Z, [x10]\n"
1613 "add x12, x12, #0x10\n"
1614 "fmla z26.s, z12.s, z2.s[0]\n"
1615 "ld1rqw { z3.s }, p0/Z, [x28]\n"
1616 "add x10, x10, #0x10\n"
1617 "fmla z27.s, z12.s, z3.s[0]\n"
1618 "ld1rqw { z4.s }, p0/Z, [x26]\n"
1619 "add x28, x28, #0x10\n"
1620 "fmla z28.s, z12.s, z4.s[0]\n"
1621 "ld1rqw { z5.s }, p0/Z, [x24]\n"
1622 "add x26, x26, #0x10\n"
1623 "fmla z29.s, z12.s, z5.s[0]\n"
1624 "ld1rqw { z6.s }, p0/Z, [x22]\n"
1625 "add x24, x24, #0x10\n"
1626 "fmla z30.s, z12.s, z6.s[0]\n"
1627 "ld1rqw { z7.s }, p0/Z, [x20]\n"
1628 "add x22, x22, #0x10\n"
1629 "fmla z31.s, z12.s, z7.s[0]\n"
1630 "add x20, x20, #0x10\n"
1631 "addvl x7, x7, #1\n"
1632 "ble 110f\n"
1633 "ld1w { z13.s }, p2/Z, [x7]\n"
1634 "fmla z24.s, z13.s, z0.s[1]\n"
1635 "subs x15, x15, #0x1\n"
1636 "fmla z25.s, z13.s, z1.s[1]\n"
1637 "addvl x7, x7, #1\n"
1638 "fmla z26.s, z13.s, z2.s[1]\n"
1639 "fmla z27.s, z13.s, z3.s[1]\n"
1640 "fmla z28.s, z13.s, z4.s[1]\n"
1641 "fmla z29.s, z13.s, z5.s[1]\n"
1642 "fmla z30.s, z13.s, z6.s[1]\n"
1643 "fmla z31.s, z13.s, z7.s[1]\n"
1644 "ble 110f\n"
1645 "ld1w { z14.s }, p2/Z, [x7]\n"
1646 "fmla z24.s, z14.s, z0.s[2]\n"
1647 "subs x15, x15, #0x1\n"
1648 "fmla z25.s, z14.s, z1.s[2]\n"
1649 "addvl x7, x7, #1\n"
1650 "fmla z26.s, z14.s, z2.s[2]\n"
1651 "fmla z27.s, z14.s, z3.s[2]\n"
1652 "fmla z28.s, z14.s, z4.s[2]\n"
1653 "fmla z29.s, z14.s, z5.s[2]\n"
1654 "fmla z30.s, z14.s, z6.s[2]\n"
1655 "fmla z31.s, z14.s, z7.s[2]\n"
1656 "ble 110f\n"
1657 "ld1w { z15.s }, p2/Z, [x7]\n"
1658 "fmla z24.s, z15.s, z0.s[3]\n"
1659 "addvl x7, x7, #1\n"
1660 "fmla z25.s, z15.s, z1.s[3]\n"
1661 "fmla z26.s, z15.s, z2.s[3]\n"
1662 "fmla z27.s, z15.s, z3.s[3]\n"
1663 "fmla z28.s, z15.s, z4.s[3]\n"
1664 "fmla z29.s, z15.s, z5.s[3]\n"
1665 "fmla z30.s, z15.s, z6.s[3]\n"
1666 "fmla z31.s, z15.s, z7.s[3]\n"
1667 "110:" // Height 8: Multiply loop: multiply skip
1668 "prfm pldl1keep, [x14, #0x80]\n"
1669 "add x16, x16, #0x1\n"
1670 "prfm pldl1keep, [x12, #0x80]\n"
1671 "prfm pldl1keep, [x10, #0x80]\n"
1672 "prfm pldl1keep, [x28, #0x80]\n"
1673 "prfm pldl1keep, [x26, #0x80]\n"
1674 "prfm pldl1keep, [x24, #0x80]\n"
1675 "prfm pldl1keep, [x22, #0x80]\n"
1676 "prfm pldl1keep, [x20, #0x80]\n"
1677 "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
1678 "cmp x16, x19\n"
1679 "bne 105b\n"
1680 "prfm pstl1keep, [x17, #0x0]\n"
1681 "prfm pstl1keep, [x13, #0x0]\n"
1682 "prfm pstl1keep, [x11, #0x0]\n"
1683 "prfm pstl1keep, [x9, #0x0]\n"
1684 "prfm pstl1keep, [x27, #0x0]\n"
1685 "prfm pstl1keep, [x25, #0x0]\n"
1686 "prfm pstl1keep, [x23, #0x0]\n"
1687 "prfm pstl1keep, [x21, #0x0]\n"
1688 "tbz %x[flags], #1, 111f\n"
1689 "add x19, %x[args_ptr], %[offset_min]\n"
1690 "ld1rw { z17.s }, p2/Z, [x19]\n"
1691 "add x19, %x[args_ptr], %[offset_max]\n"
1692 "ld1rw { z16.s }, p2/Z, [x19]\n"
1693 "fmin z24.s, p2/M, z24.s, z16.s\n"
1694 "fmin z25.s, p2/M, z25.s, z16.s\n"
1695 "fmin z26.s, p2/M, z26.s, z16.s\n"
1696 "fmin z27.s, p2/M, z27.s, z16.s\n"
1697 "fmin z28.s, p2/M, z28.s, z16.s\n"
1698 "fmax z24.s, p2/M, z24.s, z17.s\n"
1699 "fmax z25.s, p2/M, z25.s, z17.s\n"
1700 "fmax z26.s, p2/M, z26.s, z17.s\n"
1701 "fmax z27.s, p2/M, z27.s, z17.s\n"
1702 "fmax z28.s, p2/M, z28.s, z17.s\n"
1703 "fmin z29.s, p2/M, z29.s, z16.s\n"
1704 "fmin z30.s, p2/M, z30.s, z16.s\n"
1705 "fmin z31.s, p2/M, z31.s, z16.s\n"
1706 "fmax z29.s, p2/M, z29.s, z17.s\n"
1707 "fmax z30.s, p2/M, z30.s, z17.s\n"
1708 "fmax z31.s, p2/M, z31.s, z17.s\n"
1709 "111:" // Height 8: No activation
1710 "st1w { z24.s }, p1, [x17]\n"
1711 "addvl x17, x17, #1\n"
1712 "st1w { z25.s }, p1, [x13]\n"
1713 "addvl x13, x13, #1\n"
1714 "st1w { z26.s }, p1, [x11]\n"
1715 "addvl x11, x11, #1\n"
1716 "st1w { z27.s }, p1, [x9]\n"
1717 "addvl x9, x9, #1\n"
1718 "st1w { z28.s }, p1, [x27]\n"
1719 "addvl x27, x27, #1\n"
1720 "st1w { z29.s }, p1, [x25]\n"
1721 "addvl x25, x25, #1\n"
1722 "st1w { z30.s }, p1, [x23]\n"
1723 "addvl x23, x23, #1\n"
1724 "st1w { z31.s }, p1, [x21]\n"
1725 "addvl x21, x21, #1\n"
1726 "112:" // Height 8: Writeback done
1727 "mov x19, #0x0\n"
1728 "incw x19\n"
1729 "subs x6, x6, x19\n"
1730 "bgt 101b\n"
1731 "subs %x[M], %x[M], #0x8\n"
1732 "beq 114f\n"
1733 "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
1734 "tbz %x[flags], #3, 113f\n"
1735 "add x20, x20, #0x8\n"
1736 "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
1737 "b 1b\n"
1738 "113:" // Update direct input
1739 "mov x19, #0x20\n"
1740 "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
1741 "b 1b\n"
1742 "114:" // Exit
1743
1744 : [M] "+r" (M), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr)
1745 : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
1746 : "cc", "memory", "p0", "p1", "p2", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
1747 );
1748 }
1749
1750 } // namespace arm_gemm
1751 #endif // __ARM_FEATURE_SVE
1752