• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2019-2020 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22  * IN THE SOFTWARE.
23  */
24 #ifdef __ARM_FEATURE_SVE
25 
26 #include "arm_gemm.hpp"
27 #include "../../utils.hpp"
28 
29 #include <cassert>
30 
31 namespace arm_gemm {
32 
33 void sve_hybrid_fp32_mla_8x1VL (
34     unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<float> A_arg,
35     size_t M, size_t N, const float *B_ptr, IndirectOutputArg<float> output_arg,
36     const float *bias, Activation act, bool accumulate
37 )
38 {
39     struct KernelArgs { // Values the assembly below loads from memory via [args_ptr + offsetof_*/offset_*]; presumably args_ptr is &ka (operand list not visible here - confirm).
40         float maxval = static_cast<float>(std::numeric_limits<float>::infinity()); // Upper clamp; stays +infinity unless BoundedReLU overwrites it with act.param1.
41         float minval = - static_cast<float>(std::numeric_limits<float>::infinity()); // Lower clamp; stays -infinity unless ReLU/BoundedReLU sets it to 0.
42         unsigned int num_strings = {}; // Copy of the num_strings parameter; the asm string loop compares its counter against this.
43         const unsigned int *string_lengths = {}; // Copy of the string_lengths parameter; indexed by the string counter to get each string's length.
44         size_t N = {}; // Copy of the N parameter; loaded once per height block as the remaining column count.
45         const float *B_ptr = {}; // Copy of the B_ptr parameter; loaded once per height block as the start of the B operand.
46         size_t output_offset = {}; // output_arg.indirect.offset when output is indirect, otherwise output_arg.direct.stride.
47         size_t input_initial_col = {}; // A_arg.indirect.start_col for indirect input; left at 0 for direct input.
48         size_t input_offset = {}; // A_arg.indirect.start_row when input is indirect, otherwise A_arg.direct.stride.
49     } ka;
50 
51     unsigned long flags=0;
52     void *output_ptr;
53     void *input_ptr;
54 
55     if (output_arg.is_indirect) {
56         output_ptr=(void *)(output_arg.indirect.ptr);
57         ka.output_offset=output_arg.indirect.offset;
58         flags |= 0x4;
59     } else {
60         output_ptr=(void *)(output_arg.direct.base);
61         ka.output_offset=output_arg.direct.stride;
62     }
63 
64     if (A_arg.is_indirect) {
65         input_ptr=(void *)(A_arg.indirect.ptr);
66         ka.input_offset=A_arg.indirect.start_row;
67         ka.input_initial_col=A_arg.indirect.start_col;
68         flags |= 0x8;
69     } else {
70         assert(num_strings==1);
71         input_ptr=(void *)(A_arg.direct.base);
72         ka.input_offset=A_arg.direct.stride;
73     }
74     if (accumulate) {
75         flags |= 0x1;
76     }
77     ka.num_strings = num_strings;
78     ka.string_lengths = string_lengths;
79     ka.N = N;
80     ka.B_ptr = B_ptr;
81     switch(act.type) {
82         default:
83         case Activation::Type::None:
84             break;
85         case Activation::Type::BoundedReLU:
86             ka.maxval = static_cast<float>(act.param1);
87             /* fall through */
88         case Activation::Type::ReLU:
89             ka.minval = 0;
90             flags |= 0x2;
91             break;
92     }
93     __asm__ __volatile__(
94       "ptrue p2.b\n"
95       "1:"  // Row loop
96       "cmp %x[M], #0x8\n"
97       "bge 99f\n"
98       "cmp %x[M], #0x6\n"
99       "bgt 85f\n"
100       "beq 71f\n"
101       "cmp %x[M], #0x4\n"
102       "bgt 57f\n"
103       "beq 43f\n"
104       "cmp %x[M], #0x2\n"
105       "bgt 29f\n"
106       "beq 15f\n"
107       "ldr x6, [%x[args_ptr], %[offsetof_N]]\n"
108       "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n"
109       "mov x8, %x[bias]\n"
110       "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
111       "tbz %x[flags], #2, 2f\n"
112       "ldr x17, [%x[output_ptr], #0x0]\n"
113       "add x17, x17, x19, LSL #2\n"
114       "b 3f\n"
115       "2:"  // Height 1: setup direct output
116       "mov x17, %x[output_ptr]\n"
117       "3:"  // Height 1: Column loop
118       "mov x19, #0x0\n"
119       "whilelt p1.s, x19, x6\n"
120       "cbz x8, 4f\n"
121       "ld1w { z24.s }, p2/Z, [x8]\n"
122       "addvl x8, x8, #1\n"
123       "b 6f\n"
124       "4:"  // Height 1: no bias
125       "tbz %x[flags], #0, 5f\n"
126       "ld1w { z24.s }, p1/Z, [x17]\n"
127       "b 6f\n"
128       "5:"  // Height 1: no accumulate
129       "mov z24.b, #0x0\n"
130       "6:"  // Height 1: setup done
131       "mov x16, #0x0\n"
132       "7:"  // Height 1: String loop
133       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
134       "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
135       "ldr w15, [x20, x16, LSL #0x2]\n"
136       "tbz %x[flags], #3, 8f\n"
137       "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n"
138       "add x20, x20, x19, LSL #3\n"
139       "ldr x14, [x20, #0x0]\n"
140       "cbnz x16, 9f\n"
141       "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
142       "add x14, x14, x19, LSL #2\n"
143       "b 9f\n"
144       "8:"  // Height 1: setup direct input
145       "mov x14, %x[input_ptr]\n"
146       "9:"  // Height 1: input setup done
147       "cmp x15, #0x4\n"
148       "ble 11f\n"
149       "10:"  // Height 1: Multiply loop: Main loop head
150       "ld1w { z8.s }, p2/Z, [x7]\n"
151       "whilelt p0.s, XZR, x15\n"
152       "ld1w { z9.s }, p2/Z, [x7, #1, MUL VL]\n"
153       "sub x15, x15, #0x4\n"
154       "ld1rqw { z0.s }, p0/Z, [x14]\n"
155       "fmla z24.s, z8.s, z0.s[0]\n"
156       "ld1w { z10.s }, p2/Z, [x7, #2, MUL VL]\n"
157       "add x14, x14, #0x10\n"
158       "fmla z24.s, z9.s, z0.s[1]\n"
159       "ld1w { z11.s }, p2/Z, [x7, #3, MUL VL]\n"
160       "cmp x15, #0x4\n"
161       "fmla z24.s, z10.s, z0.s[2]\n"
162       "prfm pldl1keep, [x14, #0x80]\n"
163       "addvl x7, x7, #4\n"
164       "fmla z24.s, z11.s, z0.s[3]\n"
165       "bgt 10b\n"
166       "11:"  // Height 1: Multiply loop: Single iteration only
167       "ld1w { z12.s }, p2/Z, [x7]\n"
168       "whilelt p0.s, XZR, x15\n"
169       "subs x15, x15, #0x1\n"
170       "ld1rqw { z0.s }, p0/Z, [x14]\n"
171       "fmla z24.s, z12.s, z0.s[0]\n"
172       "add x14, x14, #0x10\n"
173       "addvl x7, x7, #1\n"
174       "ble 12f\n"
175       "ld1w { z13.s }, p2/Z, [x7]\n"
176       "fmla z24.s, z13.s, z0.s[1]\n"
177       "subs x15, x15, #0x1\n"
178       "addvl x7, x7, #1\n"
179       "ble 12f\n"
180       "ld1w { z14.s }, p2/Z, [x7]\n"
181       "fmla z24.s, z14.s, z0.s[2]\n"
182       "subs x15, x15, #0x1\n"
183       "addvl x7, x7, #1\n"
184       "ble 12f\n"
185       "ld1w { z15.s }, p2/Z, [x7]\n"
186       "fmla z24.s, z15.s, z0.s[3]\n"
187       "addvl x7, x7, #1\n"
188       "12:"  // Height 1: Multiply loop: multiply skip
189       "prfm pldl1keep, [x14, #0x80]\n"
190       "add x16, x16, #0x1\n"
191       "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
192       "cmp x16, x19\n"
193       "bne 7b\n"
194       "prfm pstl1keep, [x17, #0x0]\n"
195       "tbz %x[flags], #1, 13f\n"
196       "add x19, %x[args_ptr], %[offset_min]\n"
197       "ld1rw { z17.s }, p2/Z, [x19]\n"
198       "add x19, %x[args_ptr], %[offset_max]\n"
199       "ld1rw { z16.s }, p2/Z, [x19]\n"
200       "fmin z24.s, p2/M, z24.s, z16.s\n"
201       "fmax z24.s, p2/M, z24.s, z17.s\n"
202       "13:"  // Height 1: No activation
203       "st1w { z24.s }, p1, [x17]\n"
204       "addvl x17, x17, #1\n"
205       "14:"  // Height 1: Writeback done
206       "mov x19, #0x0\n"
207       "incw x19\n"
208       "subs x6, x6, x19\n"
209       "bgt 3b\n"
210       "b 114f\n"
211       "15:"  // Height 2
212       "ldr x6, [%x[args_ptr], %[offsetof_N]]\n"
213       "mov x8, %x[bias]\n"
214       "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n"
215       "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
216       "tbz %x[flags], #2, 16f\n"
217       "ldr x17, [%x[output_ptr], #0x0]\n"
218       "add x17, x17, x19, LSL #2\n"
219       "ldr x13, [%x[output_ptr], #0x8]\n"
220       "add x13, x13, x19, LSL #2\n"
221       "b 17f\n"
222       "16:"  // Height 2: setup direct output
223       "mov x17, %x[output_ptr]\n"
224       "add x13, x17, x19, LSL #2\n"
225       "17:"  // Height 2: Column loop
226       "mov x19, #0x0\n"
227       "whilelt p1.s, x19, x6\n"
228       "cbz x8, 18f\n"
229       "ld1w { z24.s }, p2/Z, [x8]\n"
230       "mov z25.d, z24.d\n"
231       "addvl x8, x8, #1\n"
232       "b 20f\n"
233       "18:"  // Height 2: no bias
234       "tbz %x[flags], #0, 19f\n"
235       "ld1w { z24.s }, p1/Z, [x17]\n"
236       "ld1w { z25.s }, p1/Z, [x13]\n"
237       "b 20f\n"
238       "19:"  // Height 2: no accumulate
239       "mov z24.b, #0x0\n"
240       "mov z25.b, #0x0\n"
241       "20:"  // Height 2: setup done
242       "mov x16, #0x0\n"
243       "21:"  // Height 2: String loop
244       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
245       "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
246       "ldr w15, [x20, x16, LSL #0x2]\n"
247       "tbz %x[flags], #3, 22f\n"
248       "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n"
249       "add x20, x20, x19, LSL #3\n"
250       "ldr x14, [x20, #0x0]\n"
251       "ldr x12, [x20, #0x8]\n"
252       "cbnz x16, 23f\n"
253       "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
254       "add x14, x14, x19, LSL #2\n"
255       "add x12, x12, x19, LSL #2\n"
256       "b 23f\n"
257       "22:"  // Height 2: setup direct input
258       "mov x14, %x[input_ptr]\n"
259       "add x12, x14, x19, LSL #2\n"
260       "23:"  // Height 2: input setup done
261       "cmp x15, #0x4\n"
262       "ble 25f\n"
263       "24:"  // Height 2: Multiply loop: Main loop head
264       "ld1w { z8.s }, p2/Z, [x7]\n"
265       "whilelt p0.s, XZR, x15\n"
266       "ld1w { z9.s }, p2/Z, [x7, #1, MUL VL]\n"
267       "sub x15, x15, #0x4\n"
268       "ld1rqw { z0.s }, p0/Z, [x14]\n"
269       "fmla z24.s, z8.s, z0.s[0]\n"
270       "ld1rqw { z1.s }, p0/Z, [x12]\n"
271       "add x14, x14, #0x10\n"
272       "fmla z25.s, z8.s, z1.s[0]\n"
273       "ld1w { z10.s }, p2/Z, [x7, #2, MUL VL]\n"
274       "add x12, x12, #0x10\n"
275       "fmla z24.s, z9.s, z0.s[1]\n"
276       "ld1w { z11.s }, p2/Z, [x7, #3, MUL VL]\n"
277       "cmp x15, #0x4\n"
278       "fmla z25.s, z9.s, z1.s[1]\n"
279       "prfm pldl1keep, [x14, #0x80]\n"
280       "addvl x7, x7, #4\n"
281       "fmla z24.s, z10.s, z0.s[2]\n"
282       "prfm pldl1keep, [x12, #0x80]\n"
283       "fmla z25.s, z10.s, z1.s[2]\n"
284       "fmla z24.s, z11.s, z0.s[3]\n"
285       "fmla z25.s, z11.s, z1.s[3]\n"
286       "bgt 24b\n"
287       "25:"  // Height 2: Multiply loop: Single iteration only
288       "ld1w { z12.s }, p2/Z, [x7]\n"
289       "whilelt p0.s, XZR, x15\n"
290       "subs x15, x15, #0x1\n"
291       "ld1rqw { z0.s }, p0/Z, [x14]\n"
292       "fmla z24.s, z12.s, z0.s[0]\n"
293       "ld1rqw { z1.s }, p0/Z, [x12]\n"
294       "add x14, x14, #0x10\n"
295       "fmla z25.s, z12.s, z1.s[0]\n"
296       "add x12, x12, #0x10\n"
297       "addvl x7, x7, #1\n"
298       "ble 26f\n"
299       "ld1w { z13.s }, p2/Z, [x7]\n"
300       "fmla z24.s, z13.s, z0.s[1]\n"
301       "subs x15, x15, #0x1\n"
302       "fmla z25.s, z13.s, z1.s[1]\n"
303       "addvl x7, x7, #1\n"
304       "ble 26f\n"
305       "ld1w { z14.s }, p2/Z, [x7]\n"
306       "fmla z24.s, z14.s, z0.s[2]\n"
307       "subs x15, x15, #0x1\n"
308       "fmla z25.s, z14.s, z1.s[2]\n"
309       "addvl x7, x7, #1\n"
310       "ble 26f\n"
311       "ld1w { z15.s }, p2/Z, [x7]\n"
312       "fmla z24.s, z15.s, z0.s[3]\n"
313       "addvl x7, x7, #1\n"
314       "fmla z25.s, z15.s, z1.s[3]\n"
315       "26:"  // Height 2: Multiply loop: multiply skip
316       "prfm pldl1keep, [x14, #0x80]\n"
317       "add x16, x16, #0x1\n"
318       "prfm pldl1keep, [x12, #0x80]\n"
319       "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
320       "cmp x16, x19\n"
321       "bne 21b\n"
322       "prfm pstl1keep, [x17, #0x0]\n"
323       "prfm pstl1keep, [x13, #0x0]\n"
324       "tbz %x[flags], #1, 27f\n"
325       "add x19, %x[args_ptr], %[offset_min]\n"
326       "ld1rw { z17.s }, p2/Z, [x19]\n"
327       "add x19, %x[args_ptr], %[offset_max]\n"
328       "ld1rw { z16.s }, p2/Z, [x19]\n"
329       "fmin z24.s, p2/M, z24.s, z16.s\n"
330       "fmin z25.s, p2/M, z25.s, z16.s\n"
331       "fmax z24.s, p2/M, z24.s, z17.s\n"
332       "fmax z25.s, p2/M, z25.s, z17.s\n"
333       "27:"  // Height 2: No activation
334       "st1w { z24.s }, p1, [x17]\n"
335       "addvl x17, x17, #1\n"
336       "st1w { z25.s }, p1, [x13]\n"
337       "addvl x13, x13, #1\n"
338       "28:"  // Height 2: Writeback done
339       "mov x19, #0x0\n"
340       "incw x19\n"
341       "subs x6, x6, x19\n"
342       "bgt 17b\n"
343       "b 114f\n"
344       "29:"  // Height 3
345       "ldr x6, [%x[args_ptr], %[offsetof_N]]\n"
346       "mov x8, %x[bias]\n"
347       "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n"
348       "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
349       "tbz %x[flags], #2, 30f\n"
350       "ldr x17, [%x[output_ptr], #0x0]\n"
351       "add x17, x17, x19, LSL #2\n"
352       "ldr x13, [%x[output_ptr], #0x8]\n"
353       "ldr x11, [%x[output_ptr], #0x10]\n"
354       "add x13, x13, x19, LSL #2\n"
355       "add x11, x11, x19, LSL #2\n"
356       "b 31f\n"
357       "30:"  // Height 3: setup direct output
358       "mov x17, %x[output_ptr]\n"
359       "add x13, x17, x19, LSL #2\n"
360       "add x11, x13, x19, LSL #2\n"
361       "31:"  // Height 3: Column loop
362       "mov x19, #0x0\n"
363       "whilelt p1.s, x19, x6\n"
364       "cbz x8, 32f\n"
365       "ld1w { z24.s }, p2/Z, [x8]\n"
366       "mov z25.d, z24.d\n"
367       "addvl x8, x8, #1\n"
368       "mov z26.d, z24.d\n"
369       "b 34f\n"
370       "32:"  // Height 3: no bias
371       "tbz %x[flags], #0, 33f\n"
372       "ld1w { z24.s }, p1/Z, [x17]\n"
373       "ld1w { z25.s }, p1/Z, [x13]\n"
374       "ld1w { z26.s }, p1/Z, [x11]\n"
375       "b 34f\n"
376       "33:"  // Height 3: no accumulate
377       "mov z24.b, #0x0\n"
378       "mov z25.b, #0x0\n"
379       "mov z26.b, #0x0\n"
380       "34:"  // Height 3: setup done
381       "mov x16, #0x0\n"
382       "35:"  // Height 3: String loop
383       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
384       "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
385       "ldr w15, [x20, x16, LSL #0x2]\n"
386       "tbz %x[flags], #3, 36f\n"
387       "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n"
388       "add x20, x20, x19, LSL #3\n"
389       "ldr x14, [x20, #0x0]\n"
390       "ldr x12, [x20, #0x8]\n"
391       "ldr x10, [x20, #0x10]\n"
392       "cbnz x16, 37f\n"
393       "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
394       "add x14, x14, x19, LSL #2\n"
395       "add x12, x12, x19, LSL #2\n"
396       "add x10, x10, x19, LSL #2\n"
397       "b 37f\n"
398       "36:"  // Height 3: setup direct input
399       "mov x14, %x[input_ptr]\n"
400       "add x12, x14, x19, LSL #2\n"
401       "add x10, x12, x19, LSL #2\n"
402       "37:"  // Height 3: input setup done
403       "cmp x15, #0x4\n"
404       "ble 39f\n"
405       "38:"  // Height 3: Multiply loop: Main loop head
406       "ld1w { z8.s }, p2/Z, [x7]\n"
407       "whilelt p0.s, XZR, x15\n"
408       "ld1w { z9.s }, p2/Z, [x7, #1, MUL VL]\n"
409       "sub x15, x15, #0x4\n"
410       "ld1rqw { z0.s }, p0/Z, [x14]\n"
411       "fmla z24.s, z8.s, z0.s[0]\n"
412       "ld1rqw { z1.s }, p0/Z, [x12]\n"
413       "add x14, x14, #0x10\n"
414       "fmla z25.s, z8.s, z1.s[0]\n"
415       "ld1rqw { z2.s }, p0/Z, [x10]\n"
416       "add x12, x12, #0x10\n"
417       "fmla z24.s, z9.s, z0.s[1]\n"
418       "ld1w { z10.s }, p2/Z, [x7, #2, MUL VL]\n"
419       "add x10, x10, #0x10\n"
420       "fmla z26.s, z8.s, z2.s[0]\n"
421       "ld1w { z11.s }, p2/Z, [x7, #3, MUL VL]\n"
422       "cmp x15, #0x4\n"
423       "fmla z25.s, z9.s, z1.s[1]\n"
424       "prfm pldl1keep, [x14, #0x80]\n"
425       "addvl x7, x7, #4\n"
426       "fmla z24.s, z10.s, z0.s[2]\n"
427       "prfm pldl1keep, [x12, #0x80]\n"
428       "prfm pldl1keep, [x10, #0x80]\n"
429       "fmla z26.s, z9.s, z2.s[1]\n"
430       "fmla z25.s, z10.s, z1.s[2]\n"
431       "fmla z24.s, z11.s, z0.s[3]\n"
432       "fmla z26.s, z10.s, z2.s[2]\n"
433       "fmla z25.s, z11.s, z1.s[3]\n"
434       "fmla z26.s, z11.s, z2.s[3]\n"
435       "bgt 38b\n"
436       "39:"  // Height 3: Multiply loop: Single iteration only
437       "ld1w { z12.s }, p2/Z, [x7]\n"
438       "whilelt p0.s, XZR, x15\n"
439       "subs x15, x15, #0x1\n"
440       "ld1rqw { z0.s }, p0/Z, [x14]\n"
441       "fmla z24.s, z12.s, z0.s[0]\n"
442       "ld1rqw { z1.s }, p0/Z, [x12]\n"
443       "add x14, x14, #0x10\n"
444       "fmla z25.s, z12.s, z1.s[0]\n"
445       "ld1rqw { z2.s }, p0/Z, [x10]\n"
446       "add x12, x12, #0x10\n"
447       "fmla z26.s, z12.s, z2.s[0]\n"
448       "add x10, x10, #0x10\n"
449       "addvl x7, x7, #1\n"
450       "ble 40f\n"
451       "ld1w { z13.s }, p2/Z, [x7]\n"
452       "fmla z24.s, z13.s, z0.s[1]\n"
453       "subs x15, x15, #0x1\n"
454       "fmla z25.s, z13.s, z1.s[1]\n"
455       "addvl x7, x7, #1\n"
456       "fmla z26.s, z13.s, z2.s[1]\n"
457       "ble 40f\n"
458       "ld1w { z14.s }, p2/Z, [x7]\n"
459       "fmla z24.s, z14.s, z0.s[2]\n"
460       "subs x15, x15, #0x1\n"
461       "fmla z25.s, z14.s, z1.s[2]\n"
462       "addvl x7, x7, #1\n"
463       "fmla z26.s, z14.s, z2.s[2]\n"
464       "ble 40f\n"
465       "ld1w { z15.s }, p2/Z, [x7]\n"
466       "fmla z24.s, z15.s, z0.s[3]\n"
467       "addvl x7, x7, #1\n"
468       "fmla z25.s, z15.s, z1.s[3]\n"
469       "fmla z26.s, z15.s, z2.s[3]\n"
470       "40:"  // Height 3: Multiply loop: multiply skip
471       "prfm pldl1keep, [x14, #0x80]\n"
472       "add x16, x16, #0x1\n"
473       "prfm pldl1keep, [x12, #0x80]\n"
474       "prfm pldl1keep, [x10, #0x80]\n"
475       "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
476       "cmp x16, x19\n"
477       "bne 35b\n"
478       "prfm pstl1keep, [x17, #0x0]\n"
479       "prfm pstl1keep, [x13, #0x0]\n"
480       "prfm pstl1keep, [x11, #0x0]\n"
481       "tbz %x[flags], #1, 41f\n"
482       "add x19, %x[args_ptr], %[offset_min]\n"
483       "ld1rw { z17.s }, p2/Z, [x19]\n"
484       "add x19, %x[args_ptr], %[offset_max]\n"
485       "ld1rw { z16.s }, p2/Z, [x19]\n"
486       "fmin z24.s, p2/M, z24.s, z16.s\n"
487       "fmin z25.s, p2/M, z25.s, z16.s\n"
488       "fmin z26.s, p2/M, z26.s, z16.s\n"
489       "fmax z24.s, p2/M, z24.s, z17.s\n"
490       "fmax z25.s, p2/M, z25.s, z17.s\n"
491       "fmax z26.s, p2/M, z26.s, z17.s\n"
492       "41:"  // Height 3: No activation
493       "st1w { z24.s }, p1, [x17]\n"
494       "addvl x17, x17, #1\n"
495       "st1w { z25.s }, p1, [x13]\n"
496       "addvl x13, x13, #1\n"
497       "st1w { z26.s }, p1, [x11]\n"
498       "addvl x11, x11, #1\n"
499       "42:"  // Height 3: Writeback done
500       "mov x19, #0x0\n"
501       "incw x19\n"
502       "subs x6, x6, x19\n"
503       "bgt 31b\n"
504       "b 114f\n"
505       "43:"  // Height 4
506       "ldr x6, [%x[args_ptr], %[offsetof_N]]\n"
507       "mov x8, %x[bias]\n"
508       "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n"
509       "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
510       "tbz %x[flags], #2, 44f\n"
511       "ldr x17, [%x[output_ptr], #0x0]\n"
512       "add x17, x17, x19, LSL #2\n"
513       "ldr x13, [%x[output_ptr], #0x8]\n"
514       "ldr x11, [%x[output_ptr], #0x10]\n"
515       "add x13, x13, x19, LSL #2\n"
516       "ldr x9, [%x[output_ptr], #0x18]\n"
517       "add x11, x11, x19, LSL #2\n"
518       "add x9, x9, x19, LSL #2\n"
519       "b 45f\n"
520       "44:"  // Height 4: setup direct output
521       "mov x17, %x[output_ptr]\n"
522       "add x13, x17, x19, LSL #2\n"
523       "add x11, x13, x19, LSL #2\n"
524       "add x9, x11, x19, LSL #2\n"
525       "45:"  // Height 4: Column loop
526       "mov x19, #0x0\n"
527       "whilelt p1.s, x19, x6\n"
528       "cbz x8, 46f\n"
529       "ld1w { z24.s }, p2/Z, [x8]\n"
530       "mov z25.d, z24.d\n"
531       "addvl x8, x8, #1\n"
532       "mov z26.d, z24.d\n"
533       "mov z27.d, z24.d\n"
534       "b 48f\n"
535       "46:"  // Height 4: no bias
536       "tbz %x[flags], #0, 47f\n"
537       "ld1w { z24.s }, p1/Z, [x17]\n"
538       "ld1w { z25.s }, p1/Z, [x13]\n"
539       "ld1w { z26.s }, p1/Z, [x11]\n"
540       "ld1w { z27.s }, p1/Z, [x9]\n"
541       "b 48f\n"
542       "47:"  // Height 4: no accumulate
543       "mov z24.b, #0x0\n"
544       "mov z25.b, #0x0\n"
545       "mov z26.b, #0x0\n"
546       "mov z27.b, #0x0\n"
547       "48:"  // Height 4: setup done
548       "mov x16, #0x0\n"
549       "49:"  // Height 4: String loop
550       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
551       "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
552       "ldr w15, [x20, x16, LSL #0x2]\n"
553       "tbz %x[flags], #3, 50f\n"
554       "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n"
555       "add x20, x20, x19, LSL #3\n"
556       "ldr x14, [x20, #0x0]\n"
557       "ldr x12, [x20, #0x8]\n"
558       "ldr x10, [x20, #0x10]\n"
559       "ldr x28, [x20, #0x18]\n"
560       "cbnz x16, 51f\n"
561       "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
562       "add x14, x14, x19, LSL #2\n"
563       "add x12, x12, x19, LSL #2\n"
564       "add x10, x10, x19, LSL #2\n"
565       "add x28, x28, x19, LSL #2\n"
566       "b 51f\n"
567       "50:"  // Height 4: setup direct input
568       "mov x14, %x[input_ptr]\n"
569       "add x12, x14, x19, LSL #2\n"
570       "add x10, x12, x19, LSL #2\n"
571       "add x28, x10, x19, LSL #2\n"
572       "51:"  // Height 4: input setup done
573       "cmp x15, #0x4\n"
574       "ble 53f\n"
575       "52:"  // Height 4: Multiply loop: Main loop head
576       "ld1w { z8.s }, p2/Z, [x7]\n"
577       "whilelt p0.s, XZR, x15\n"
578       "ld1w { z9.s }, p2/Z, [x7, #1, MUL VL]\n"
579       "sub x15, x15, #0x4\n"
580       "ld1rqw { z0.s }, p0/Z, [x14]\n"
581       "fmla z24.s, z8.s, z0.s[0]\n"
582       "ld1rqw { z1.s }, p0/Z, [x12]\n"
583       "add x14, x14, #0x10\n"
584       "fmla z25.s, z8.s, z1.s[0]\n"
585       "ld1rqw { z2.s }, p0/Z, [x10]\n"
586       "add x12, x12, #0x10\n"
587       "fmla z24.s, z9.s, z0.s[1]\n"
588       "ld1rqw { z3.s }, p0/Z, [x28]\n"
589       "add x10, x10, #0x10\n"
590       "fmla z26.s, z8.s, z2.s[0]\n"
591       "ld1w { z10.s }, p2/Z, [x7, #2, MUL VL]\n"
592       "add x28, x28, #0x10\n"
593       "fmla z27.s, z8.s, z3.s[0]\n"
594       "ld1w { z11.s }, p2/Z, [x7, #3, MUL VL]\n"
595       "cmp x15, #0x4\n"
596       "fmla z25.s, z9.s, z1.s[1]\n"
597       "prfm pldl1keep, [x14, #0x80]\n"
598       "addvl x7, x7, #4\n"
599       "fmla z24.s, z10.s, z0.s[2]\n"
600       "prfm pldl1keep, [x12, #0x80]\n"
601       "prfm pldl1keep, [x10, #0x80]\n"
602       "fmla z26.s, z9.s, z2.s[1]\n"
603       "prfm pldl1keep, [x28, #0x80]\n"
604       "fmla z27.s, z9.s, z3.s[1]\n"
605       "fmla z25.s, z10.s, z1.s[2]\n"
606       "fmla z24.s, z11.s, z0.s[3]\n"
607       "fmla z26.s, z10.s, z2.s[2]\n"
608       "fmla z27.s, z10.s, z3.s[2]\n"
609       "fmla z25.s, z11.s, z1.s[3]\n"
610       "fmla z26.s, z11.s, z2.s[3]\n"
611       "fmla z27.s, z11.s, z3.s[3]\n"
612       "bgt 52b\n"
613       "53:"  // Height 4: Multiply loop: Single iteration only
614       "ld1w { z12.s }, p2/Z, [x7]\n"
615       "whilelt p0.s, XZR, x15\n"
616       "subs x15, x15, #0x1\n"
617       "ld1rqw { z0.s }, p0/Z, [x14]\n"
618       "fmla z24.s, z12.s, z0.s[0]\n"
619       "ld1rqw { z1.s }, p0/Z, [x12]\n"
620       "add x14, x14, #0x10\n"
621       "fmla z25.s, z12.s, z1.s[0]\n"
622       "ld1rqw { z2.s }, p0/Z, [x10]\n"
623       "add x12, x12, #0x10\n"
624       "fmla z26.s, z12.s, z2.s[0]\n"
625       "ld1rqw { z3.s }, p0/Z, [x28]\n"
626       "add x10, x10, #0x10\n"
627       "fmla z27.s, z12.s, z3.s[0]\n"
628       "add x28, x28, #0x10\n"
629       "addvl x7, x7, #1\n"
630       "ble 54f\n"
631       "ld1w { z13.s }, p2/Z, [x7]\n"
632       "fmla z24.s, z13.s, z0.s[1]\n"
633       "subs x15, x15, #0x1\n"
634       "fmla z25.s, z13.s, z1.s[1]\n"
635       "addvl x7, x7, #1\n"
636       "fmla z26.s, z13.s, z2.s[1]\n"
637       "fmla z27.s, z13.s, z3.s[1]\n"
638       "ble 54f\n"
639       "ld1w { z14.s }, p2/Z, [x7]\n"
640       "fmla z24.s, z14.s, z0.s[2]\n"
641       "subs x15, x15, #0x1\n"
642       "fmla z25.s, z14.s, z1.s[2]\n"
643       "addvl x7, x7, #1\n"
644       "fmla z26.s, z14.s, z2.s[2]\n"
645       "fmla z27.s, z14.s, z3.s[2]\n"
646       "ble 54f\n"
647       "ld1w { z15.s }, p2/Z, [x7]\n"
648       "fmla z24.s, z15.s, z0.s[3]\n"
649       "addvl x7, x7, #1\n"
650       "fmla z25.s, z15.s, z1.s[3]\n"
651       "fmla z26.s, z15.s, z2.s[3]\n"
652       "fmla z27.s, z15.s, z3.s[3]\n"
653       "54:"  // Height 4: Multiply loop: multiply skip
654       "prfm pldl1keep, [x14, #0x80]\n"
655       "add x16, x16, #0x1\n"
656       "prfm pldl1keep, [x12, #0x80]\n"
657       "prfm pldl1keep, [x10, #0x80]\n"
658       "prfm pldl1keep, [x28, #0x80]\n"
659       "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
660       "cmp x16, x19\n"
661       "bne 49b\n"
662       "prfm pstl1keep, [x17, #0x0]\n"
663       "prfm pstl1keep, [x13, #0x0]\n"
664       "prfm pstl1keep, [x11, #0x0]\n"
665       "prfm pstl1keep, [x9, #0x0]\n"
666       "tbz %x[flags], #1, 55f\n"
667       "add x19, %x[args_ptr], %[offset_min]\n"
668       "ld1rw { z17.s }, p2/Z, [x19]\n"
669       "add x19, %x[args_ptr], %[offset_max]\n"
670       "ld1rw { z16.s }, p2/Z, [x19]\n"
671       "fmin z24.s, p2/M, z24.s, z16.s\n"
672       "fmin z25.s, p2/M, z25.s, z16.s\n"
673       "fmin z26.s, p2/M, z26.s, z16.s\n"
674       "fmin z27.s, p2/M, z27.s, z16.s\n"
675       "fmax z24.s, p2/M, z24.s, z17.s\n"
676       "fmax z25.s, p2/M, z25.s, z17.s\n"
677       "fmax z26.s, p2/M, z26.s, z17.s\n"
678       "fmax z27.s, p2/M, z27.s, z17.s\n"
679       "55:"  // Height 4: No activation
680       "st1w { z24.s }, p1, [x17]\n"
681       "addvl x17, x17, #1\n"
682       "st1w { z25.s }, p1, [x13]\n"
683       "addvl x13, x13, #1\n"
684       "st1w { z26.s }, p1, [x11]\n"
685       "addvl x11, x11, #1\n"
686       "st1w { z27.s }, p1, [x9]\n"
687       "addvl x9, x9, #1\n"
688       "56:"  // Height 4: Writeback done
689       "mov x19, #0x0\n"
690       "incw x19\n"
691       "subs x6, x6, x19\n"
692       "bgt 45b\n"
693       "b 114f\n"
694       "57:"  // Height 5
695       "ldr x6, [%x[args_ptr], %[offsetof_N]]\n"
696       "mov x8, %x[bias]\n"
697       "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n"
698       "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
699       "tbz %x[flags], #2, 58f\n"
700       "ldr x17, [%x[output_ptr], #0x0]\n"
701       "add x17, x17, x19, LSL #2\n"
702       "ldr x13, [%x[output_ptr], #0x8]\n"
703       "ldr x11, [%x[output_ptr], #0x10]\n"
704       "add x13, x13, x19, LSL #2\n"
705       "ldr x9, [%x[output_ptr], #0x18]\n"
706       "ldr x27, [%x[output_ptr], #0x20]\n"
707       "add x11, x11, x19, LSL #2\n"
708       "add x9, x9, x19, LSL #2\n"
709       "add x27, x27, x19, LSL #2\n"
710       "b 59f\n"
711       "58:"  // Height 5: setup direct output
712       "mov x17, %x[output_ptr]\n"
713       "add x13, x17, x19, LSL #2\n"
714       "add x11, x13, x19, LSL #2\n"
715       "add x9, x11, x19, LSL #2\n"
716       "add x27, x9, x19, LSL #2\n"
717       "59:"  // Height 5: Column loop
718       "mov x19, #0x0\n"
719       "whilelt p1.s, x19, x6\n"
720       "cbz x8, 60f\n"
721       "ld1w { z24.s }, p2/Z, [x8]\n"
722       "mov z25.d, z24.d\n"
723       "addvl x8, x8, #1\n"
724       "mov z26.d, z24.d\n"
725       "mov z27.d, z24.d\n"
726       "mov z28.d, z24.d\n"
727       "b 62f\n"
728       "60:"  // Height 5: no bias
729       "tbz %x[flags], #0, 61f\n"
730       "ld1w { z24.s }, p1/Z, [x17]\n"
731       "ld1w { z25.s }, p1/Z, [x13]\n"
732       "ld1w { z26.s }, p1/Z, [x11]\n"
733       "ld1w { z27.s }, p1/Z, [x9]\n"
734       "ld1w { z28.s }, p1/Z, [x27]\n"
735       "b 62f\n"
736       "61:"  // Height 5: no accumulate
737       "mov z24.b, #0x0\n"
738       "mov z25.b, #0x0\n"
739       "mov z26.b, #0x0\n"
740       "mov z27.b, #0x0\n"
741       "mov z28.b, #0x0\n"
742       "62:"  // Height 5: setup done
743       "mov x16, #0x0\n"
744       "63:"  // Height 5: String loop
745       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
746       "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
747       "ldr w15, [x20, x16, LSL #0x2]\n"
748       "tbz %x[flags], #3, 64f\n"
749       "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n"
750       "add x20, x20, x19, LSL #3\n"
751       "ldr x14, [x20, #0x0]\n"
752       "ldr x12, [x20, #0x8]\n"
753       "ldr x10, [x20, #0x10]\n"
754       "ldr x28, [x20, #0x18]\n"
755       "ldr x26, [x20, #0x20]\n"
756       "cbnz x16, 65f\n"
757       "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
758       "add x14, x14, x19, LSL #2\n"
759       "add x12, x12, x19, LSL #2\n"
760       "add x10, x10, x19, LSL #2\n"
761       "add x28, x28, x19, LSL #2\n"
762       "add x26, x26, x19, LSL #2\n"
763       "b 65f\n"
764       "64:"  // Height 5: setup direct input
765       "mov x14, %x[input_ptr]\n"
766       "add x12, x14, x19, LSL #2\n"
767       "add x10, x12, x19, LSL #2\n"
768       "add x28, x10, x19, LSL #2\n"
769       "add x26, x28, x19, LSL #2\n"
770       "65:"  // Height 5: input setup done
771       "cmp x15, #0x4\n"
772       "ble 67f\n"
773       "66:"  // Height 5: Multiply loop: Main loop head
774       "ld1w { z8.s }, p2/Z, [x7]\n"
775       "whilelt p0.s, XZR, x15\n"
776       "ld1w { z9.s }, p2/Z, [x7, #1, MUL VL]\n"
777       "sub x15, x15, #0x4\n"
778       "ld1rqw { z0.s }, p0/Z, [x14]\n"
779       "fmla z24.s, z8.s, z0.s[0]\n"
780       "ld1rqw { z1.s }, p0/Z, [x12]\n"
781       "add x14, x14, #0x10\n"
782       "fmla z25.s, z8.s, z1.s[0]\n"
783       "ld1rqw { z2.s }, p0/Z, [x10]\n"
784       "add x12, x12, #0x10\n"
785       "fmla z24.s, z9.s, z0.s[1]\n"
786       "ld1rqw { z3.s }, p0/Z, [x28]\n"
787       "add x10, x10, #0x10\n"
788       "fmla z26.s, z8.s, z2.s[0]\n"
789       "ld1rqw { z4.s }, p0/Z, [x26]\n"
790       "add x28, x28, #0x10\n"
791       "fmla z27.s, z8.s, z3.s[0]\n"
792       "ld1w { z10.s }, p2/Z, [x7, #2, MUL VL]\n"
793       "add x26, x26, #0x10\n"
794       "fmla z25.s, z9.s, z1.s[1]\n"
795       "ld1w { z11.s }, p2/Z, [x7, #3, MUL VL]\n"
796       "cmp x15, #0x4\n"
797       "fmla z28.s, z8.s, z4.s[0]\n"
798       "prfm pldl1keep, [x14, #0x80]\n"
799       "addvl x7, x7, #4\n"
800       "fmla z26.s, z9.s, z2.s[1]\n"
801       "prfm pldl1keep, [x12, #0x80]\n"
802       "fmla z24.s, z10.s, z0.s[2]\n"
803       "prfm pldl1keep, [x10, #0x80]\n"
804       "fmla z27.s, z9.s, z3.s[1]\n"
805       "prfm pldl1keep, [x28, #0x80]\n"
806       "fmla z25.s, z10.s, z1.s[2]\n"
807       "prfm pldl1keep, [x26, #0x80]\n"
808       "fmla z28.s, z9.s, z4.s[1]\n"
809       "fmla z26.s, z10.s, z2.s[2]\n"
810       "fmla z27.s, z10.s, z3.s[2]\n"
811       "fmla z24.s, z11.s, z0.s[3]\n"
812       "fmla z28.s, z10.s, z4.s[2]\n"
813       "fmla z25.s, z11.s, z1.s[3]\n"
814       "fmla z26.s, z11.s, z2.s[3]\n"
815       "fmla z27.s, z11.s, z3.s[3]\n"
816       "fmla z28.s, z11.s, z4.s[3]\n"
817       "bgt 66b\n"
818       "67:"  // Height 5: Multiply loop: Single iteration only
819       "ld1w { z12.s }, p2/Z, [x7]\n"
820       "whilelt p0.s, XZR, x15\n"
821       "subs x15, x15, #0x1\n"
822       "ld1rqw { z0.s }, p0/Z, [x14]\n"
823       "fmla z24.s, z12.s, z0.s[0]\n"
824       "ld1rqw { z1.s }, p0/Z, [x12]\n"
825       "add x14, x14, #0x10\n"
826       "fmla z25.s, z12.s, z1.s[0]\n"
827       "ld1rqw { z2.s }, p0/Z, [x10]\n"
828       "add x12, x12, #0x10\n"
829       "fmla z26.s, z12.s, z2.s[0]\n"
830       "ld1rqw { z3.s }, p0/Z, [x28]\n"
831       "add x10, x10, #0x10\n"
832       "fmla z27.s, z12.s, z3.s[0]\n"
833       "ld1rqw { z4.s }, p0/Z, [x26]\n"
834       "add x28, x28, #0x10\n"
835       "fmla z28.s, z12.s, z4.s[0]\n"
836       "add x26, x26, #0x10\n"
837       "addvl x7, x7, #1\n"
838       "ble 68f\n"
839       "ld1w { z13.s }, p2/Z, [x7]\n"
840       "fmla z24.s, z13.s, z0.s[1]\n"
841       "subs x15, x15, #0x1\n"
842       "fmla z25.s, z13.s, z1.s[1]\n"
843       "addvl x7, x7, #1\n"
844       "fmla z26.s, z13.s, z2.s[1]\n"
845       "fmla z27.s, z13.s, z3.s[1]\n"
846       "fmla z28.s, z13.s, z4.s[1]\n"
847       "ble 68f\n"
848       "ld1w { z14.s }, p2/Z, [x7]\n"
849       "fmla z24.s, z14.s, z0.s[2]\n"
850       "subs x15, x15, #0x1\n"
851       "fmla z25.s, z14.s, z1.s[2]\n"
852       "addvl x7, x7, #1\n"
853       "fmla z26.s, z14.s, z2.s[2]\n"
854       "fmla z27.s, z14.s, z3.s[2]\n"
855       "fmla z28.s, z14.s, z4.s[2]\n"
856       "ble 68f\n"
857       "ld1w { z15.s }, p2/Z, [x7]\n"
858       "fmla z24.s, z15.s, z0.s[3]\n"
859       "addvl x7, x7, #1\n"
860       "fmla z25.s, z15.s, z1.s[3]\n"
861       "fmla z26.s, z15.s, z2.s[3]\n"
862       "fmla z27.s, z15.s, z3.s[3]\n"
863       "fmla z28.s, z15.s, z4.s[3]\n"
864       "68:"  // Height 5: Multiply loop: multiply skip
865       "prfm pldl1keep, [x14, #0x80]\n"
866       "add x16, x16, #0x1\n"
867       "prfm pldl1keep, [x12, #0x80]\n"
868       "prfm pldl1keep, [x10, #0x80]\n"
869       "prfm pldl1keep, [x28, #0x80]\n"
870       "prfm pldl1keep, [x26, #0x80]\n"
871       "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
872       "cmp x16, x19\n"
873       "bne 63b\n"
874       "prfm pstl1keep, [x17, #0x0]\n"
875       "prfm pstl1keep, [x13, #0x0]\n"
876       "prfm pstl1keep, [x11, #0x0]\n"
877       "prfm pstl1keep, [x9, #0x0]\n"
878       "prfm pstl1keep, [x27, #0x0]\n"
879       "tbz %x[flags], #1, 69f\n"
880       "add x19, %x[args_ptr], %[offset_min]\n"
881       "ld1rw { z17.s }, p2/Z, [x19]\n"
882       "add x19, %x[args_ptr], %[offset_max]\n"
883       "ld1rw { z16.s }, p2/Z, [x19]\n"
884       "fmin z24.s, p2/M, z24.s, z16.s\n"
885       "fmin z25.s, p2/M, z25.s, z16.s\n"
886       "fmin z26.s, p2/M, z26.s, z16.s\n"
887       "fmin z27.s, p2/M, z27.s, z16.s\n"
888       "fmin z28.s, p2/M, z28.s, z16.s\n"
889       "fmax z24.s, p2/M, z24.s, z17.s\n"
890       "fmax z25.s, p2/M, z25.s, z17.s\n"
891       "fmax z26.s, p2/M, z26.s, z17.s\n"
892       "fmax z27.s, p2/M, z27.s, z17.s\n"
893       "fmax z28.s, p2/M, z28.s, z17.s\n"
894       "69:"  // Height 5: No activation
895       "st1w { z24.s }, p1, [x17]\n"
896       "addvl x17, x17, #1\n"
897       "st1w { z25.s }, p1, [x13]\n"
898       "addvl x13, x13, #1\n"
899       "st1w { z26.s }, p1, [x11]\n"
900       "addvl x11, x11, #1\n"
901       "st1w { z27.s }, p1, [x9]\n"
902       "addvl x9, x9, #1\n"
903       "st1w { z28.s }, p1, [x27]\n"
904       "addvl x27, x27, #1\n"
905       "70:"  // Height 5: Writeback done
906       "mov x19, #0x0\n"
907       "incw x19\n"
908       "subs x6, x6, x19\n"
909       "bgt 59b\n"
910       "b 114f\n"
911       "71:"  // Height 6
912       "ldr x6, [%x[args_ptr], %[offsetof_N]]\n"
913       "mov x8, %x[bias]\n"
914       "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n"
915       "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
916       "tbz %x[flags], #2, 72f\n"
917       "ldr x17, [%x[output_ptr], #0x0]\n"
918       "add x17, x17, x19, LSL #2\n"
919       "ldr x13, [%x[output_ptr], #0x8]\n"
920       "ldr x11, [%x[output_ptr], #0x10]\n"
921       "add x13, x13, x19, LSL #2\n"
922       "ldr x9, [%x[output_ptr], #0x18]\n"
923       "ldr x27, [%x[output_ptr], #0x20]\n"
924       "add x11, x11, x19, LSL #2\n"
925       "ldr x25, [%x[output_ptr], #0x28]\n"
926       "add x9, x9, x19, LSL #2\n"
927       "add x27, x27, x19, LSL #2\n"
928       "add x25, x25, x19, LSL #2\n"
929       "b 73f\n"
930       "72:"  // Height 6: setup direct output
931       "mov x17, %x[output_ptr]\n"
932       "add x13, x17, x19, LSL #2\n"
933       "add x11, x13, x19, LSL #2\n"
934       "add x9, x11, x19, LSL #2\n"
935       "add x27, x9, x19, LSL #2\n"
936       "add x25, x27, x19, LSL #2\n"
937       "73:"  // Height 6: Column loop
938       "mov x19, #0x0\n"
939       "whilelt p1.s, x19, x6\n"
940       "cbz x8, 74f\n"
941       "ld1w { z24.s }, p2/Z, [x8]\n"
942       "mov z25.d, z24.d\n"
943       "addvl x8, x8, #1\n"
944       "mov z26.d, z24.d\n"
945       "mov z27.d, z24.d\n"
946       "mov z28.d, z24.d\n"
947       "mov z29.d, z24.d\n"
948       "b 76f\n"
949       "74:"  // Height 6: no bias
950       "tbz %x[flags], #0, 75f\n"
951       "ld1w { z24.s }, p1/Z, [x17]\n"
952       "ld1w { z25.s }, p1/Z, [x13]\n"
953       "ld1w { z26.s }, p1/Z, [x11]\n"
954       "ld1w { z27.s }, p1/Z, [x9]\n"
955       "ld1w { z28.s }, p1/Z, [x27]\n"
956       "ld1w { z29.s }, p1/Z, [x25]\n"
957       "b 76f\n"
958       "75:"  // Height 6: no accumulate
959       "mov z24.b, #0x0\n"
960       "mov z25.b, #0x0\n"
961       "mov z26.b, #0x0\n"
962       "mov z27.b, #0x0\n"
963       "mov z28.b, #0x0\n"
964       "mov z29.b, #0x0\n"
965       "76:"  // Height 6: setup done
966       "mov x16, #0x0\n"
967       "77:"  // Height 6: String loop
968       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
969       "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
970       "ldr w15, [x20, x16, LSL #0x2]\n"
971       "tbz %x[flags], #3, 78f\n"
972       "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n"
973       "add x20, x20, x19, LSL #3\n"
974       "ldr x14, [x20, #0x0]\n"
975       "ldr x12, [x20, #0x8]\n"
976       "ldr x10, [x20, #0x10]\n"
977       "ldr x28, [x20, #0x18]\n"
978       "ldr x26, [x20, #0x20]\n"
979       "ldr x24, [x20, #0x28]\n"
980       "cbnz x16, 79f\n"
981       "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
982       "add x14, x14, x19, LSL #2\n"
983       "add x12, x12, x19, LSL #2\n"
984       "add x10, x10, x19, LSL #2\n"
985       "add x28, x28, x19, LSL #2\n"
986       "add x26, x26, x19, LSL #2\n"
987       "add x24, x24, x19, LSL #2\n"
988       "b 79f\n"
989       "78:"  // Height 6: setup direct input
990       "mov x14, %x[input_ptr]\n"
991       "add x12, x14, x19, LSL #2\n"
992       "add x10, x12, x19, LSL #2\n"
993       "add x28, x10, x19, LSL #2\n"
994       "add x26, x28, x19, LSL #2\n"
995       "add x24, x26, x19, LSL #2\n"
996       "79:"  // Height 6: input setup done
997       "cmp x15, #0x4\n"
998       "ble 81f\n"
999       "80:"  // Height 6: Multiply loop: Main loop head
1000       "ld1w { z8.s }, p2/Z, [x7]\n"
1001       "whilelt p0.s, XZR, x15\n"
1002       "ld1w { z9.s }, p2/Z, [x7, #1, MUL VL]\n"
1003       "sub x15, x15, #0x4\n"
1004       "ld1rqw { z0.s }, p0/Z, [x14]\n"
1005       "fmla z24.s, z8.s, z0.s[0]\n"
1006       "ld1rqw { z1.s }, p0/Z, [x12]\n"
1007       "add x14, x14, #0x10\n"
1008       "fmla z25.s, z8.s, z1.s[0]\n"
1009       "ld1rqw { z2.s }, p0/Z, [x10]\n"
1010       "add x12, x12, #0x10\n"
1011       "fmla z24.s, z9.s, z0.s[1]\n"
1012       "ld1rqw { z3.s }, p0/Z, [x28]\n"
1013       "add x10, x10, #0x10\n"
1014       "fmla z26.s, z8.s, z2.s[0]\n"
1015       "ld1rqw { z4.s }, p0/Z, [x26]\n"
1016       "add x28, x28, #0x10\n"
1017       "fmla z27.s, z8.s, z3.s[0]\n"
1018       "ld1rqw { z5.s }, p0/Z, [x24]\n"
1019       "add x26, x26, #0x10\n"
1020       "fmla z25.s, z9.s, z1.s[1]\n"
1021       "ld1w { z10.s }, p2/Z, [x7, #2, MUL VL]\n"
1022       "add x24, x24, #0x10\n"
1023       "fmla z28.s, z8.s, z4.s[0]\n"
1024       "ld1w { z11.s }, p2/Z, [x7, #3, MUL VL]\n"
1025       "cmp x15, #0x4\n"
1026       "fmla z29.s, z8.s, z5.s[0]\n"
1027       "prfm pldl1keep, [x14, #0x80]\n"
1028       "addvl x7, x7, #4\n"
1029       "fmla z26.s, z9.s, z2.s[1]\n"
1030       "prfm pldl1keep, [x12, #0x80]\n"
1031       "fmla z27.s, z9.s, z3.s[1]\n"
1032       "prfm pldl1keep, [x10, #0x80]\n"
1033       "fmla z24.s, z10.s, z0.s[2]\n"
1034       "prfm pldl1keep, [x28, #0x80]\n"
1035       "fmla z28.s, z9.s, z4.s[1]\n"
1036       "prfm pldl1keep, [x26, #0x80]\n"
1037       "fmla z29.s, z9.s, z5.s[1]\n"
1038       "prfm pldl1keep, [x24, #0x80]\n"
1039       "fmla z25.s, z10.s, z1.s[2]\n"
1040       "fmla z26.s, z10.s, z2.s[2]\n"
1041       "fmla z27.s, z10.s, z3.s[2]\n"
1042       "fmla z28.s, z10.s, z4.s[2]\n"
1043       "fmla z29.s, z10.s, z5.s[2]\n"
1044       "fmla z24.s, z11.s, z0.s[3]\n"
1045       "fmla z25.s, z11.s, z1.s[3]\n"
1046       "fmla z26.s, z11.s, z2.s[3]\n"
1047       "fmla z27.s, z11.s, z3.s[3]\n"
1048       "fmla z28.s, z11.s, z4.s[3]\n"
1049       "fmla z29.s, z11.s, z5.s[3]\n"
1050       "bgt 80b\n"
1051       "81:"  // Height 6: Multiply loop: Single iteration only
1052       "ld1w { z12.s }, p2/Z, [x7]\n"
1053       "whilelt p0.s, XZR, x15\n"
1054       "subs x15, x15, #0x1\n"
1055       "ld1rqw { z0.s }, p0/Z, [x14]\n"
1056       "fmla z24.s, z12.s, z0.s[0]\n"
1057       "ld1rqw { z1.s }, p0/Z, [x12]\n"
1058       "add x14, x14, #0x10\n"
1059       "fmla z25.s, z12.s, z1.s[0]\n"
1060       "ld1rqw { z2.s }, p0/Z, [x10]\n"
1061       "add x12, x12, #0x10\n"
1062       "fmla z26.s, z12.s, z2.s[0]\n"
1063       "ld1rqw { z3.s }, p0/Z, [x28]\n"
1064       "add x10, x10, #0x10\n"
1065       "fmla z27.s, z12.s, z3.s[0]\n"
1066       "ld1rqw { z4.s }, p0/Z, [x26]\n"
1067       "add x28, x28, #0x10\n"
1068       "fmla z28.s, z12.s, z4.s[0]\n"
1069       "ld1rqw { z5.s }, p0/Z, [x24]\n"
1070       "add x26, x26, #0x10\n"
1071       "fmla z29.s, z12.s, z5.s[0]\n"
1072       "add x24, x24, #0x10\n"
1073       "addvl x7, x7, #1\n"
1074       "ble 82f\n"
1075       "ld1w { z13.s }, p2/Z, [x7]\n"
1076       "fmla z24.s, z13.s, z0.s[1]\n"
1077       "subs x15, x15, #0x1\n"
1078       "fmla z25.s, z13.s, z1.s[1]\n"
1079       "addvl x7, x7, #1\n"
1080       "fmla z26.s, z13.s, z2.s[1]\n"
1081       "fmla z27.s, z13.s, z3.s[1]\n"
1082       "fmla z28.s, z13.s, z4.s[1]\n"
1083       "fmla z29.s, z13.s, z5.s[1]\n"
1084       "ble 82f\n"
1085       "ld1w { z14.s }, p2/Z, [x7]\n"
1086       "fmla z24.s, z14.s, z0.s[2]\n"
1087       "subs x15, x15, #0x1\n"
1088       "fmla z25.s, z14.s, z1.s[2]\n"
1089       "addvl x7, x7, #1\n"
1090       "fmla z26.s, z14.s, z2.s[2]\n"
1091       "fmla z27.s, z14.s, z3.s[2]\n"
1092       "fmla z28.s, z14.s, z4.s[2]\n"
1093       "fmla z29.s, z14.s, z5.s[2]\n"
1094       "ble 82f\n"
1095       "ld1w { z15.s }, p2/Z, [x7]\n"
1096       "fmla z24.s, z15.s, z0.s[3]\n"
1097       "addvl x7, x7, #1\n"
1098       "fmla z25.s, z15.s, z1.s[3]\n"
1099       "fmla z26.s, z15.s, z2.s[3]\n"
1100       "fmla z27.s, z15.s, z3.s[3]\n"
1101       "fmla z28.s, z15.s, z4.s[3]\n"
1102       "fmla z29.s, z15.s, z5.s[3]\n"
1103       "82:"  // Height 6: Multiply loop: multiply skip
1104       "prfm pldl1keep, [x14, #0x80]\n"
1105       "add x16, x16, #0x1\n"
1106       "prfm pldl1keep, [x12, #0x80]\n"
1107       "prfm pldl1keep, [x10, #0x80]\n"
1108       "prfm pldl1keep, [x28, #0x80]\n"
1109       "prfm pldl1keep, [x26, #0x80]\n"
1110       "prfm pldl1keep, [x24, #0x80]\n"
1111       "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
1112       "cmp x16, x19\n"
1113       "bne 77b\n"
1114       "prfm pstl1keep, [x17, #0x0]\n"
1115       "prfm pstl1keep, [x13, #0x0]\n"
1116       "prfm pstl1keep, [x11, #0x0]\n"
1117       "prfm pstl1keep, [x9, #0x0]\n"
1118       "prfm pstl1keep, [x27, #0x0]\n"
1119       "prfm pstl1keep, [x25, #0x0]\n"
1120       "tbz %x[flags], #1, 83f\n"
1121       "add x19, %x[args_ptr], %[offset_min]\n"
1122       "ld1rw { z17.s }, p2/Z, [x19]\n"
1123       "add x19, %x[args_ptr], %[offset_max]\n"
1124       "ld1rw { z16.s }, p2/Z, [x19]\n"
1125       "fmin z24.s, p2/M, z24.s, z16.s\n"
1126       "fmin z25.s, p2/M, z25.s, z16.s\n"
1127       "fmin z26.s, p2/M, z26.s, z16.s\n"
1128       "fmin z27.s, p2/M, z27.s, z16.s\n"
1129       "fmin z28.s, p2/M, z28.s, z16.s\n"
1130       "fmax z24.s, p2/M, z24.s, z17.s\n"
1131       "fmax z25.s, p2/M, z25.s, z17.s\n"
1132       "fmax z26.s, p2/M, z26.s, z17.s\n"
1133       "fmax z27.s, p2/M, z27.s, z17.s\n"
1134       "fmax z28.s, p2/M, z28.s, z17.s\n"
1135       "fmin z29.s, p2/M, z29.s, z16.s\n"
1136       "fmax z29.s, p2/M, z29.s, z17.s\n"
1137       "83:"  // Height 6: No activation
1138       "st1w { z24.s }, p1, [x17]\n"
1139       "addvl x17, x17, #1\n"
1140       "st1w { z25.s }, p1, [x13]\n"
1141       "addvl x13, x13, #1\n"
1142       "st1w { z26.s }, p1, [x11]\n"
1143       "addvl x11, x11, #1\n"
1144       "st1w { z27.s }, p1, [x9]\n"
1145       "addvl x9, x9, #1\n"
1146       "st1w { z28.s }, p1, [x27]\n"
1147       "addvl x27, x27, #1\n"
1148       "st1w { z29.s }, p1, [x25]\n"
1149       "addvl x25, x25, #1\n"
1150       "84:"  // Height 6: Writeback done
1151       "mov x19, #0x0\n"
1152       "incw x19\n"
1153       "subs x6, x6, x19\n"
1154       "bgt 73b\n"
1155       "b 114f\n"
1156       "85:"  // Height 7
1157       "ldr x6, [%x[args_ptr], %[offsetof_N]]\n"
1158       "mov x8, %x[bias]\n"
1159       "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n"
1160       "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
1161       "tbz %x[flags], #2, 86f\n"
1162       "ldr x17, [%x[output_ptr], #0x0]\n"
1163       "add x17, x17, x19, LSL #2\n"
1164       "ldr x13, [%x[output_ptr], #0x8]\n"
1165       "ldr x11, [%x[output_ptr], #0x10]\n"
1166       "add x13, x13, x19, LSL #2\n"
1167       "ldr x9, [%x[output_ptr], #0x18]\n"
1168       "ldr x27, [%x[output_ptr], #0x20]\n"
1169       "add x11, x11, x19, LSL #2\n"
1170       "ldr x25, [%x[output_ptr], #0x28]\n"
1171       "add x9, x9, x19, LSL #2\n"
1172       "ldr x23, [%x[output_ptr], #0x30]\n"
1173       "add x27, x27, x19, LSL #2\n"
1174       "add x25, x25, x19, LSL #2\n"
1175       "add x23, x23, x19, LSL #2\n"
1176       "b 87f\n"
1177       "86:"  // Height 7: setup direct output
1178       "mov x17, %x[output_ptr]\n"
1179       "add x13, x17, x19, LSL #2\n"
1180       "add x11, x13, x19, LSL #2\n"
1181       "add x9, x11, x19, LSL #2\n"
1182       "add x27, x9, x19, LSL #2\n"
1183       "add x25, x27, x19, LSL #2\n"
1184       "add x23, x25, x19, LSL #2\n"
1185       "87:"  // Height 7: Column loop
1186       "mov x19, #0x0\n"
1187       "whilelt p1.s, x19, x6\n"
1188       "cbz x8, 88f\n"
1189       "ld1w { z24.s }, p2/Z, [x8]\n"
1190       "mov z25.d, z24.d\n"
1191       "addvl x8, x8, #1\n"
1192       "mov z26.d, z24.d\n"
1193       "mov z27.d, z24.d\n"
1194       "mov z28.d, z24.d\n"
1195       "mov z29.d, z24.d\n"
1196       "mov z30.d, z24.d\n"
1197       "b 90f\n"
1198       "88:"  // Height 7: no bias
1199       "tbz %x[flags], #0, 89f\n"
1200       "ld1w { z24.s }, p1/Z, [x17]\n"
1201       "ld1w { z25.s }, p1/Z, [x13]\n"
1202       "ld1w { z26.s }, p1/Z, [x11]\n"
1203       "ld1w { z27.s }, p1/Z, [x9]\n"
1204       "ld1w { z28.s }, p1/Z, [x27]\n"
1205       "ld1w { z29.s }, p1/Z, [x25]\n"
1206       "ld1w { z30.s }, p1/Z, [x23]\n"
1207       "b 90f\n"
1208       "89:"  // Height 7: no accumulate
1209       "mov z24.b, #0x0\n"
1210       "mov z25.b, #0x0\n"
1211       "mov z26.b, #0x0\n"
1212       "mov z27.b, #0x0\n"
1213       "mov z28.b, #0x0\n"
1214       "mov z29.b, #0x0\n"
1215       "mov z30.b, #0x0\n"
1216       "90:"  // Height 7: setup done
1217       "mov x16, #0x0\n"
1218       "91:"  // Height 7: String loop
1219       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
1220       "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
1221       "ldr w15, [x20, x16, LSL #0x2]\n"
1222       "tbz %x[flags], #3, 92f\n"
1223       "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n"
1224       "add x20, x20, x19, LSL #3\n"
1225       "ldr x14, [x20, #0x0]\n"
1226       "ldr x12, [x20, #0x8]\n"
1227       "ldr x10, [x20, #0x10]\n"
1228       "ldr x28, [x20, #0x18]\n"
1229       "ldr x26, [x20, #0x20]\n"
1230       "ldr x24, [x20, #0x28]\n"
1231       "ldr x22, [x20, #0x30]\n"
1232       "cbnz x16, 93f\n"
1233       "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
1234       "add x14, x14, x19, LSL #2\n"
1235       "add x12, x12, x19, LSL #2\n"
1236       "add x10, x10, x19, LSL #2\n"
1237       "add x28, x28, x19, LSL #2\n"
1238       "add x26, x26, x19, LSL #2\n"
1239       "add x24, x24, x19, LSL #2\n"
1240       "add x22, x22, x19, LSL #2\n"
1241       "b 93f\n"
1242       "92:"  // Height 7: setup direct input
1243       "mov x14, %x[input_ptr]\n"
1244       "add x12, x14, x19, LSL #2\n"
1245       "add x10, x12, x19, LSL #2\n"
1246       "add x28, x10, x19, LSL #2\n"
1247       "add x26, x28, x19, LSL #2\n"
1248       "add x24, x26, x19, LSL #2\n"
1249       "add x22, x24, x19, LSL #2\n"
1250       "93:"  // Height 7: input setup done
1251       "cmp x15, #0x4\n"
1252       "ble 95f\n"
1253       "94:"  // Height 7: Multiply loop: Main loop head
1254       "ld1w { z8.s }, p2/Z, [x7]\n"
1255       "whilelt p0.s, XZR, x15\n"
1256       "ld1w { z9.s }, p2/Z, [x7, #1, MUL VL]\n"
1257       "sub x15, x15, #0x4\n"
1258       "ld1rqw { z0.s }, p0/Z, [x14]\n"
1259       "fmla z24.s, z8.s, z0.s[0]\n"
1260       "ld1rqw { z1.s }, p0/Z, [x12]\n"
1261       "add x14, x14, #0x10\n"
1262       "fmla z25.s, z8.s, z1.s[0]\n"
1263       "ld1rqw { z2.s }, p0/Z, [x10]\n"
1264       "add x12, x12, #0x10\n"
1265       "fmla z24.s, z9.s, z0.s[1]\n"
1266       "ld1rqw { z3.s }, p0/Z, [x28]\n"
1267       "add x10, x10, #0x10\n"
1268       "fmla z26.s, z8.s, z2.s[0]\n"
1269       "ld1rqw { z4.s }, p0/Z, [x26]\n"
1270       "add x28, x28, #0x10\n"
1271       "fmla z27.s, z8.s, z3.s[0]\n"
1272       "ld1rqw { z5.s }, p0/Z, [x24]\n"
1273       "add x26, x26, #0x10\n"
1274       "fmla z25.s, z9.s, z1.s[1]\n"
1275       "ld1rqw { z6.s }, p0/Z, [x22]\n"
1276       "add x24, x24, #0x10\n"
1277       "fmla z28.s, z8.s, z4.s[0]\n"
1278       "ld1w { z10.s }, p2/Z, [x7, #2, MUL VL]\n"
1279       "add x22, x22, #0x10\n"
1280       "fmla z29.s, z8.s, z5.s[0]\n"
1281       "ld1w { z11.s }, p2/Z, [x7, #3, MUL VL]\n"
1282       "cmp x15, #0x4\n"
1283       "fmla z30.s, z8.s, z6.s[0]\n"
1284       "prfm pldl1keep, [x14, #0x80]\n"
1285       "addvl x7, x7, #4\n"
1286       "fmla z26.s, z9.s, z2.s[1]\n"
1287       "prfm pldl1keep, [x12, #0x80]\n"
1288       "fmla z27.s, z9.s, z3.s[1]\n"
1289       "prfm pldl1keep, [x10, #0x80]\n"
1290       "fmla z28.s, z9.s, z4.s[1]\n"
1291       "prfm pldl1keep, [x28, #0x80]\n"
1292       "fmla z29.s, z9.s, z5.s[1]\n"
1293       "prfm pldl1keep, [x26, #0x80]\n"
1294       "fmla z30.s, z9.s, z6.s[1]\n"
1295       "prfm pldl1keep, [x24, #0x80]\n"
1296       "fmla z24.s, z10.s, z0.s[2]\n"
1297       "prfm pldl1keep, [x22, #0x80]\n"
1298       "fmla z25.s, z10.s, z1.s[2]\n"
1299       "fmla z26.s, z10.s, z2.s[2]\n"
1300       "fmla z27.s, z10.s, z3.s[2]\n"
1301       "fmla z28.s, z10.s, z4.s[2]\n"
1302       "fmla z29.s, z10.s, z5.s[2]\n"
1303       "fmla z30.s, z10.s, z6.s[2]\n"
1304       "fmla z24.s, z11.s, z0.s[3]\n"
1305       "fmla z25.s, z11.s, z1.s[3]\n"
1306       "fmla z26.s, z11.s, z2.s[3]\n"
1307       "fmla z27.s, z11.s, z3.s[3]\n"
1308       "fmla z28.s, z11.s, z4.s[3]\n"
1309       "fmla z29.s, z11.s, z5.s[3]\n"
1310       "fmla z30.s, z11.s, z6.s[3]\n"
1311       "bgt 94b\n"
1312       "95:"  // Height 7: Multiply loop: Single iteration only
1313       "ld1w { z12.s }, p2/Z, [x7]\n"
1314       "whilelt p0.s, XZR, x15\n"
1315       "subs x15, x15, #0x1\n"
1316       "ld1rqw { z0.s }, p0/Z, [x14]\n"
1317       "fmla z24.s, z12.s, z0.s[0]\n"
1318       "ld1rqw { z1.s }, p0/Z, [x12]\n"
1319       "add x14, x14, #0x10\n"
1320       "fmla z25.s, z12.s, z1.s[0]\n"
1321       "ld1rqw { z2.s }, p0/Z, [x10]\n"
1322       "add x12, x12, #0x10\n"
1323       "fmla z26.s, z12.s, z2.s[0]\n"
1324       "ld1rqw { z3.s }, p0/Z, [x28]\n"
1325       "add x10, x10, #0x10\n"
1326       "fmla z27.s, z12.s, z3.s[0]\n"
1327       "ld1rqw { z4.s }, p0/Z, [x26]\n"
1328       "add x28, x28, #0x10\n"
1329       "fmla z28.s, z12.s, z4.s[0]\n"
1330       "ld1rqw { z5.s }, p0/Z, [x24]\n"
1331       "add x26, x26, #0x10\n"
1332       "fmla z29.s, z12.s, z5.s[0]\n"
1333       "ld1rqw { z6.s }, p0/Z, [x22]\n"
1334       "add x24, x24, #0x10\n"
1335       "fmla z30.s, z12.s, z6.s[0]\n"
1336       "add x22, x22, #0x10\n"
1337       "addvl x7, x7, #1\n"
1338       "ble 96f\n"
1339       "ld1w { z13.s }, p2/Z, [x7]\n"
1340       "fmla z24.s, z13.s, z0.s[1]\n"
1341       "subs x15, x15, #0x1\n"
1342       "fmla z25.s, z13.s, z1.s[1]\n"
1343       "addvl x7, x7, #1\n"
1344       "fmla z26.s, z13.s, z2.s[1]\n"
1345       "fmla z27.s, z13.s, z3.s[1]\n"
1346       "fmla z28.s, z13.s, z4.s[1]\n"
1347       "fmla z29.s, z13.s, z5.s[1]\n"
1348       "fmla z30.s, z13.s, z6.s[1]\n"
1349       "ble 96f\n"
1350       "ld1w { z14.s }, p2/Z, [x7]\n"
1351       "fmla z24.s, z14.s, z0.s[2]\n"
1352       "subs x15, x15, #0x1\n"
1353       "fmla z25.s, z14.s, z1.s[2]\n"
1354       "addvl x7, x7, #1\n"
1355       "fmla z26.s, z14.s, z2.s[2]\n"
1356       "fmla z27.s, z14.s, z3.s[2]\n"
1357       "fmla z28.s, z14.s, z4.s[2]\n"
1358       "fmla z29.s, z14.s, z5.s[2]\n"
1359       "fmla z30.s, z14.s, z6.s[2]\n"
1360       "ble 96f\n"
1361       "ld1w { z15.s }, p2/Z, [x7]\n"
1362       "fmla z24.s, z15.s, z0.s[3]\n"
1363       "addvl x7, x7, #1\n"
1364       "fmla z25.s, z15.s, z1.s[3]\n"
1365       "fmla z26.s, z15.s, z2.s[3]\n"
1366       "fmla z27.s, z15.s, z3.s[3]\n"
1367       "fmla z28.s, z15.s, z4.s[3]\n"
1368       "fmla z29.s, z15.s, z5.s[3]\n"
1369       "fmla z30.s, z15.s, z6.s[3]\n"
1370       "96:"  // Height 7: Multiply loop: multiply skip
1371       "prfm pldl1keep, [x14, #0x80]\n"
1372       "add x16, x16, #0x1\n"
1373       "prfm pldl1keep, [x12, #0x80]\n"
1374       "prfm pldl1keep, [x10, #0x80]\n"
1375       "prfm pldl1keep, [x28, #0x80]\n"
1376       "prfm pldl1keep, [x26, #0x80]\n"
1377       "prfm pldl1keep, [x24, #0x80]\n"
1378       "prfm pldl1keep, [x22, #0x80]\n"
1379       "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
1380       "cmp x16, x19\n"
1381       "bne 91b\n"
1382       "prfm pstl1keep, [x17, #0x0]\n"
1383       "prfm pstl1keep, [x13, #0x0]\n"
1384       "prfm pstl1keep, [x11, #0x0]\n"
1385       "prfm pstl1keep, [x9, #0x0]\n"
1386       "prfm pstl1keep, [x27, #0x0]\n"
1387       "prfm pstl1keep, [x25, #0x0]\n"
1388       "prfm pstl1keep, [x23, #0x0]\n"
1389       "tbz %x[flags], #1, 97f\n"
1390       "add x19, %x[args_ptr], %[offset_min]\n"
1391       "ld1rw { z17.s }, p2/Z, [x19]\n"
1392       "add x19, %x[args_ptr], %[offset_max]\n"
1393       "ld1rw { z16.s }, p2/Z, [x19]\n"
1394       "fmin z24.s, p2/M, z24.s, z16.s\n"
1395       "fmin z25.s, p2/M, z25.s, z16.s\n"
1396       "fmin z26.s, p2/M, z26.s, z16.s\n"
1397       "fmin z27.s, p2/M, z27.s, z16.s\n"
1398       "fmin z28.s, p2/M, z28.s, z16.s\n"
1399       "fmax z24.s, p2/M, z24.s, z17.s\n"
1400       "fmax z25.s, p2/M, z25.s, z17.s\n"
1401       "fmax z26.s, p2/M, z26.s, z17.s\n"
1402       "fmax z27.s, p2/M, z27.s, z17.s\n"
1403       "fmax z28.s, p2/M, z28.s, z17.s\n"
1404       "fmin z29.s, p2/M, z29.s, z16.s\n"
1405       "fmin z30.s, p2/M, z30.s, z16.s\n"
1406       "fmax z29.s, p2/M, z29.s, z17.s\n"
1407       "fmax z30.s, p2/M, z30.s, z17.s\n"
1408       "97:"  // Height 7: No activation
1409       "st1w { z24.s }, p1, [x17]\n"
1410       "addvl x17, x17, #1\n"
1411       "st1w { z25.s }, p1, [x13]\n"
1412       "addvl x13, x13, #1\n"
1413       "st1w { z26.s }, p1, [x11]\n"
1414       "addvl x11, x11, #1\n"
1415       "st1w { z27.s }, p1, [x9]\n"
1416       "addvl x9, x9, #1\n"
1417       "st1w { z28.s }, p1, [x27]\n"
1418       "addvl x27, x27, #1\n"
1419       "st1w { z29.s }, p1, [x25]\n"
1420       "addvl x25, x25, #1\n"
1421       "st1w { z30.s }, p1, [x23]\n"
1422       "addvl x23, x23, #1\n"
1423       "98:"  // Height 7: Writeback done
1424       "mov x19, #0x0\n"
1425       "incw x19\n"
1426       "subs x6, x6, x19\n"
1427       "bgt 87b\n"
1428       "b 114f\n"
1429       "99:"  // Height 8
1430       "ldr x6, [%x[args_ptr], %[offsetof_N]]\n"
1431       "mov x8, %x[bias]\n"
1432       "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n"
1433       "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
1434       "tbz %x[flags], #2, 100f\n"
1435       "ldr x17, [%x[output_ptr], #0x0]\n"
1436       "add x17, x17, x19, LSL #2\n"
1437       "ldr x13, [%x[output_ptr], #0x8]\n"
1438       "ldr x11, [%x[output_ptr], #0x10]\n"
1439       "add x13, x13, x19, LSL #2\n"
1440       "ldr x9, [%x[output_ptr], #0x18]\n"
1441       "ldr x27, [%x[output_ptr], #0x20]\n"
1442       "add x11, x11, x19, LSL #2\n"
1443       "ldr x25, [%x[output_ptr], #0x28]\n"
1444       "add x9, x9, x19, LSL #2\n"
1445       "ldr x23, [%x[output_ptr], #0x30]\n"
1446       "ldr x21, [%x[output_ptr], #0x38]\n"
1447       "add x27, x27, x19, LSL #2\n"
1448       "add x25, x25, x19, LSL #2\n"
1449       "add %x[output_ptr], %x[output_ptr], #0x40\n"
1450       "add x23, x23, x19, LSL #2\n"
1451       "add x21, x21, x19, LSL #2\n"
1452       "b 101f\n"
1453       "100:"  // Height 8: setup direct output
1454       "mov x17, %x[output_ptr]\n"
1455       "add x13, x17, x19, LSL #2\n"
1456       "add x11, x13, x19, LSL #2\n"
1457       "add x9, x11, x19, LSL #2\n"
1458       "add x27, x9, x19, LSL #2\n"
1459       "add x25, x27, x19, LSL #2\n"
1460       "add x23, x25, x19, LSL #2\n"
1461       "add x21, x23, x19, LSL #2\n"
1462       "add %x[output_ptr], x21, x19, LSL #2\n"
1463       "101:"  // Height 8: Column loop
1464       "mov x19, #0x0\n"
1465       "whilelt p1.s, x19, x6\n"
1466       "cbz x8, 102f\n"
1467       "ld1w { z24.s }, p2/Z, [x8]\n"
1468       "mov z25.d, z24.d\n"
1469       "addvl x8, x8, #1\n"
1470       "mov z26.d, z24.d\n"
1471       "mov z27.d, z24.d\n"
1472       "mov z28.d, z24.d\n"
1473       "mov z29.d, z24.d\n"
1474       "mov z30.d, z24.d\n"
1475       "mov z31.d, z24.d\n"
1476       "b 104f\n"
1477       "102:"  // Height 8: no bias
1478       "tbz %x[flags], #0, 103f\n"
1479       "ld1w { z24.s }, p1/Z, [x17]\n"
1480       "ld1w { z25.s }, p1/Z, [x13]\n"
1481       "ld1w { z26.s }, p1/Z, [x11]\n"
1482       "ld1w { z27.s }, p1/Z, [x9]\n"
1483       "ld1w { z28.s }, p1/Z, [x27]\n"
1484       "ld1w { z29.s }, p1/Z, [x25]\n"
1485       "ld1w { z30.s }, p1/Z, [x23]\n"
1486       "ld1w { z31.s }, p1/Z, [x21]\n"
1487       "b 104f\n"
1488       "103:"  // Height 8: no accumulate
1489       "mov z24.b, #0x0\n"
1490       "mov z25.b, #0x0\n"
1491       "mov z26.b, #0x0\n"
1492       "mov z27.b, #0x0\n"
1493       "mov z28.b, #0x0\n"
1494       "mov z29.b, #0x0\n"
1495       "mov z30.b, #0x0\n"
1496       "mov z31.b, #0x0\n"
1497       "104:"  // Height 8: setup done
1498       "mov x16, #0x0\n"
1499       "105:"  // Height 8: String loop
1500       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
1501       "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
1502       "ldr w15, [x20, x16, LSL #0x2]\n"
1503       "tbz %x[flags], #3, 106f\n"
1504       "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n"
1505       "add x20, x20, x19, LSL #3\n"
1506       "ldr x14, [x20, #0x0]\n"
1507       "ldr x12, [x20, #0x8]\n"
1508       "ldr x10, [x20, #0x10]\n"
1509       "ldr x28, [x20, #0x18]\n"
1510       "ldr x26, [x20, #0x20]\n"
1511       "ldr x24, [x20, #0x28]\n"
1512       "ldr x22, [x20, #0x30]\n"
1513       "ldr x20, [x20, #0x38]\n"
1514       "cbnz x16, 107f\n"
1515       "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
1516       "add x14, x14, x19, LSL #2\n"
1517       "add x12, x12, x19, LSL #2\n"
1518       "add x10, x10, x19, LSL #2\n"
1519       "add x28, x28, x19, LSL #2\n"
1520       "add x26, x26, x19, LSL #2\n"
1521       "add x24, x24, x19, LSL #2\n"
1522       "add x22, x22, x19, LSL #2\n"
1523       "add x20, x20, x19, LSL #2\n"
1524       "b 107f\n"
1525       "106:"  // Height 8: setup direct input
1526       "mov x14, %x[input_ptr]\n"
1527       "add x12, x14, x19, LSL #2\n"
1528       "add x10, x12, x19, LSL #2\n"
1529       "add x28, x10, x19, LSL #2\n"
1530       "add x26, x28, x19, LSL #2\n"
1531       "add x24, x26, x19, LSL #2\n"
1532       "add x22, x24, x19, LSL #2\n"
1533       "add x20, x22, x19, LSL #2\n"
1534       "107:"  // Height 8: input setup done
1535       "cmp x15, #0x4\n"
1536       "ble 109f\n"
1537       "108:"  // Height 8: Multiply loop: Main loop head
1538       "ld1w { z8.s }, p2/Z, [x7]\n"
1539       "whilelt p0.s, XZR, x15\n"
1540       "ld1w { z9.s }, p2/Z, [x7, #1, MUL VL]\n"
1541       "sub x15, x15, #0x4\n"
1542       "ld1rqw { z0.s }, p0/Z, [x14]\n"
1543       "fmla z24.s, z8.s, z0.s[0]\n"
1544       "ld1rqw { z1.s }, p0/Z, [x12]\n"
1545       "add x14, x14, #0x10\n"
1546       "fmla z25.s, z8.s, z1.s[0]\n"
1547       "ld1rqw { z2.s }, p0/Z, [x10]\n"
1548       "add x12, x12, #0x10\n"
1549       "fmla z24.s, z9.s, z0.s[1]\n"
1550       "ld1rqw { z3.s }, p0/Z, [x28]\n"
1551       "add x10, x10, #0x10\n"
1552       "fmla z26.s, z8.s, z2.s[0]\n"
1553       "ld1rqw { z4.s }, p0/Z, [x26]\n"
1554       "add x28, x28, #0x10\n"
1555       "fmla z27.s, z8.s, z3.s[0]\n"
1556       "ld1rqw { z5.s }, p0/Z, [x24]\n"
1557       "add x26, x26, #0x10\n"
1558       "fmla z25.s, z9.s, z1.s[1]\n"
1559       "ld1rqw { z6.s }, p0/Z, [x22]\n"
1560       "add x24, x24, #0x10\n"
1561       "fmla z28.s, z8.s, z4.s[0]\n"
1562       "ld1rqw { z7.s }, p0/Z, [x20]\n"
1563       "add x22, x22, #0x10\n"
1564       "fmla z29.s, z8.s, z5.s[0]\n"
1565       "ld1w { z10.s }, p2/Z, [x7, #2, MUL VL]\n"
1566       "add x20, x20, #0x10\n"
1567       "fmla z30.s, z8.s, z6.s[0]\n"
1568       "ld1w { z11.s }, p2/Z, [x7, #3, MUL VL]\n"
1569       "cmp x15, #0x4\n"
1570       "fmla z31.s, z8.s, z7.s[0]\n"
1571       "prfm pldl1keep, [x14, #0x80]\n"
1572       "addvl x7, x7, #4\n"
1573       "fmla z26.s, z9.s, z2.s[1]\n"
1574       "prfm pldl1keep, [x12, #0x80]\n"
1575       "fmla z27.s, z9.s, z3.s[1]\n"
1576       "prfm pldl1keep, [x10, #0x80]\n"
1577       "fmla z28.s, z9.s, z4.s[1]\n"
1578       "prfm pldl1keep, [x28, #0x80]\n"
1579       "fmla z29.s, z9.s, z5.s[1]\n"
1580       "prfm pldl1keep, [x26, #0x80]\n"
1581       "fmla z30.s, z9.s, z6.s[1]\n"
1582       "prfm pldl1keep, [x24, #0x80]\n"
1583       "fmla z31.s, z9.s, z7.s[1]\n"
1584       "prfm pldl1keep, [x22, #0x80]\n"
1585       "fmla z24.s, z10.s, z0.s[2]\n"
1586       "prfm pldl1keep, [x20, #0x80]\n"
1587       "fmla z25.s, z10.s, z1.s[2]\n"
1588       "fmla z26.s, z10.s, z2.s[2]\n"
1589       "fmla z27.s, z10.s, z3.s[2]\n"
1590       "fmla z28.s, z10.s, z4.s[2]\n"
1591       "fmla z29.s, z10.s, z5.s[2]\n"
1592       "fmla z30.s, z10.s, z6.s[2]\n"
1593       "fmla z31.s, z10.s, z7.s[2]\n"
1594       "fmla z24.s, z11.s, z0.s[3]\n"
1595       "fmla z25.s, z11.s, z1.s[3]\n"
1596       "fmla z26.s, z11.s, z2.s[3]\n"
1597       "fmla z27.s, z11.s, z3.s[3]\n"
1598       "fmla z28.s, z11.s, z4.s[3]\n"
1599       "fmla z29.s, z11.s, z5.s[3]\n"
1600       "fmla z30.s, z11.s, z6.s[3]\n"
1601       "fmla z31.s, z11.s, z7.s[3]\n"
1602       "bgt 108b\n"
1603       "109:"  // Height 8: Multiply loop: Single iteration only
1604       "ld1w { z12.s }, p2/Z, [x7]\n"
1605       "whilelt p0.s, XZR, x15\n"
1606       "subs x15, x15, #0x1\n"
1607       "ld1rqw { z0.s }, p0/Z, [x14]\n"
1608       "fmla z24.s, z12.s, z0.s[0]\n"
1609       "ld1rqw { z1.s }, p0/Z, [x12]\n"
1610       "add x14, x14, #0x10\n"
1611       "fmla z25.s, z12.s, z1.s[0]\n"
1612       "ld1rqw { z2.s }, p0/Z, [x10]\n"
1613       "add x12, x12, #0x10\n"
1614       "fmla z26.s, z12.s, z2.s[0]\n"
1615       "ld1rqw { z3.s }, p0/Z, [x28]\n"
1616       "add x10, x10, #0x10\n"
1617       "fmla z27.s, z12.s, z3.s[0]\n"
1618       "ld1rqw { z4.s }, p0/Z, [x26]\n"
1619       "add x28, x28, #0x10\n"
1620       "fmla z28.s, z12.s, z4.s[0]\n"
1621       "ld1rqw { z5.s }, p0/Z, [x24]\n"
1622       "add x26, x26, #0x10\n"
1623       "fmla z29.s, z12.s, z5.s[0]\n"
1624       "ld1rqw { z6.s }, p0/Z, [x22]\n"
1625       "add x24, x24, #0x10\n"
1626       "fmla z30.s, z12.s, z6.s[0]\n"
1627       "ld1rqw { z7.s }, p0/Z, [x20]\n"
1628       "add x22, x22, #0x10\n"
1629       "fmla z31.s, z12.s, z7.s[0]\n"
1630       "add x20, x20, #0x10\n"
1631       "addvl x7, x7, #1\n"
1632       "ble 110f\n"
1633       "ld1w { z13.s }, p2/Z, [x7]\n"
1634       "fmla z24.s, z13.s, z0.s[1]\n"
1635       "subs x15, x15, #0x1\n"
1636       "fmla z25.s, z13.s, z1.s[1]\n"
1637       "addvl x7, x7, #1\n"
1638       "fmla z26.s, z13.s, z2.s[1]\n"
1639       "fmla z27.s, z13.s, z3.s[1]\n"
1640       "fmla z28.s, z13.s, z4.s[1]\n"
1641       "fmla z29.s, z13.s, z5.s[1]\n"
1642       "fmla z30.s, z13.s, z6.s[1]\n"
1643       "fmla z31.s, z13.s, z7.s[1]\n"
1644       "ble 110f\n"
1645       "ld1w { z14.s }, p2/Z, [x7]\n"
1646       "fmla z24.s, z14.s, z0.s[2]\n"
1647       "subs x15, x15, #0x1\n"
1648       "fmla z25.s, z14.s, z1.s[2]\n"
1649       "addvl x7, x7, #1\n"
1650       "fmla z26.s, z14.s, z2.s[2]\n"
1651       "fmla z27.s, z14.s, z3.s[2]\n"
1652       "fmla z28.s, z14.s, z4.s[2]\n"
1653       "fmla z29.s, z14.s, z5.s[2]\n"
1654       "fmla z30.s, z14.s, z6.s[2]\n"
1655       "fmla z31.s, z14.s, z7.s[2]\n"
1656       "ble 110f\n"
1657       "ld1w { z15.s }, p2/Z, [x7]\n"
1658       "fmla z24.s, z15.s, z0.s[3]\n"
1659       "addvl x7, x7, #1\n"
1660       "fmla z25.s, z15.s, z1.s[3]\n"
1661       "fmla z26.s, z15.s, z2.s[3]\n"
1662       "fmla z27.s, z15.s, z3.s[3]\n"
1663       "fmla z28.s, z15.s, z4.s[3]\n"
1664       "fmla z29.s, z15.s, z5.s[3]\n"
1665       "fmla z30.s, z15.s, z6.s[3]\n"
1666       "fmla z31.s, z15.s, z7.s[3]\n"
1667       "110:"  // Height 8: Multiply loop: multiply skip
1668       "prfm pldl1keep, [x14, #0x80]\n"
1669       "add x16, x16, #0x1\n"
1670       "prfm pldl1keep, [x12, #0x80]\n"
1671       "prfm pldl1keep, [x10, #0x80]\n"
1672       "prfm pldl1keep, [x28, #0x80]\n"
1673       "prfm pldl1keep, [x26, #0x80]\n"
1674       "prfm pldl1keep, [x24, #0x80]\n"
1675       "prfm pldl1keep, [x22, #0x80]\n"
1676       "prfm pldl1keep, [x20, #0x80]\n"
1677       "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
1678       "cmp x16, x19\n"
1679       "bne 105b\n"
1680       "prfm pstl1keep, [x17, #0x0]\n"
1681       "prfm pstl1keep, [x13, #0x0]\n"
1682       "prfm pstl1keep, [x11, #0x0]\n"
1683       "prfm pstl1keep, [x9, #0x0]\n"
1684       "prfm pstl1keep, [x27, #0x0]\n"
1685       "prfm pstl1keep, [x25, #0x0]\n"
1686       "prfm pstl1keep, [x23, #0x0]\n"
1687       "prfm pstl1keep, [x21, #0x0]\n"
1688       "tbz %x[flags], #1, 111f\n"
1689       "add x19, %x[args_ptr], %[offset_min]\n"
1690       "ld1rw { z17.s }, p2/Z, [x19]\n"
1691       "add x19, %x[args_ptr], %[offset_max]\n"
1692       "ld1rw { z16.s }, p2/Z, [x19]\n"
1693       "fmin z24.s, p2/M, z24.s, z16.s\n"
1694       "fmin z25.s, p2/M, z25.s, z16.s\n"
1695       "fmin z26.s, p2/M, z26.s, z16.s\n"
1696       "fmin z27.s, p2/M, z27.s, z16.s\n"
1697       "fmin z28.s, p2/M, z28.s, z16.s\n"
1698       "fmax z24.s, p2/M, z24.s, z17.s\n"
1699       "fmax z25.s, p2/M, z25.s, z17.s\n"
1700       "fmax z26.s, p2/M, z26.s, z17.s\n"
1701       "fmax z27.s, p2/M, z27.s, z17.s\n"
1702       "fmax z28.s, p2/M, z28.s, z17.s\n"
1703       "fmin z29.s, p2/M, z29.s, z16.s\n"
1704       "fmin z30.s, p2/M, z30.s, z16.s\n"
1705       "fmin z31.s, p2/M, z31.s, z16.s\n"
1706       "fmax z29.s, p2/M, z29.s, z17.s\n"
1707       "fmax z30.s, p2/M, z30.s, z17.s\n"
1708       "fmax z31.s, p2/M, z31.s, z17.s\n"
1709       "111:"  // Height 8: No activation
1710       "st1w { z24.s }, p1, [x17]\n"
1711       "addvl x17, x17, #1\n"
1712       "st1w { z25.s }, p1, [x13]\n"
1713       "addvl x13, x13, #1\n"
1714       "st1w { z26.s }, p1, [x11]\n"
1715       "addvl x11, x11, #1\n"
1716       "st1w { z27.s }, p1, [x9]\n"
1717       "addvl x9, x9, #1\n"
1718       "st1w { z28.s }, p1, [x27]\n"
1719       "addvl x27, x27, #1\n"
1720       "st1w { z29.s }, p1, [x25]\n"
1721       "addvl x25, x25, #1\n"
1722       "st1w { z30.s }, p1, [x23]\n"
1723       "addvl x23, x23, #1\n"
1724       "st1w { z31.s }, p1, [x21]\n"
1725       "addvl x21, x21, #1\n"
1726       "112:"  // Height 8: Writeback done
1727       "mov x19, #0x0\n"
1728       "incw x19\n"
1729       "subs x6, x6, x19\n"
1730       "bgt 101b\n"
1731       "subs %x[M], %x[M], #0x8\n"
1732       "beq 114f\n"
1733       "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
1734       "tbz %x[flags], #3, 113f\n"
1735       "add x20, x20, #0x8\n"
1736       "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
1737       "b 1b\n"
1738       "113:"  // Update direct input
1739       "mov x19, #0x20\n"
1740       "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
1741       "b 1b\n"
1742       "114:"  // Exit
1743 
1744       : [M] "+r" (M), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr)
1745       : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
1746       : "cc", "memory", "p0", "p1", "p2", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
1747     );
1748 }
1749 
1750 } // namespace arm_gemm
1751 #endif // __ARM_FEATURE_SVE
1752