• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2019-2021 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22  * IN THE SOFTWARE.
23  */
24 #ifdef ARM_COMPUTE_ENABLE_SVE
25 
26 #include "arm_gemm.hpp"
27 #include "../../utils.hpp"
28 
29 #include <cassert>
30 #include <limits>
31 
32 namespace arm_gemm {
33 
34 void sve_hybrid_fp32_mla_8x1VL (
35     unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<float> A_arg,
36     size_t M, size_t N, const float *B_ptr, IndirectOutputArg<float> output_arg,
37     const float *bias, Activation act, bool accumulate
38 )
39 {
40     struct KernelArgs {  // Argument block read by the assembly below via [args_ptr + offsetof_*] loads (args_ptr is presumably &ka; the operand list is outside this view - confirm).
41         float maxval = static_cast<float>(std::numeric_limits<float>::infinity());  // Upper clamp; stays +inf unless the activation is BoundedReLU, which stores act.param1 here.
42         float minval = - static_cast<float>(std::numeric_limits<float>::infinity());  // Lower clamp; stays -inf unless the activation is ReLU or BoundedReLU, which store 0 here.
43         unsigned int num_strings = {};  // Copy of the num_strings argument; the assembly's string loop counter (x9) is compared against it.
44         const unsigned int *string_lengths = {};  // Copy of the string_lengths argument; the assembly loads one 32-bit length per string index.
45         size_t N = {};  // Copy of the N argument; loaded into x13 and decremented by one vector of words (decw) per column-loop pass.
46         const float *B_ptr = {};  // Copy of the B_ptr argument; loaded into x12 and advanced with addvl as operand vectors are consumed.
47         size_t output_offset = {};  // output_arg.indirect.offset or output_arg.direct.stride; the visible assembly scales it by 4 (LSL #2) to step between output row pointers.
48         size_t input_initial_col = {};  // A_arg.indirect.start_col (indirect input only); added, scaled by 4, to each row pointer for string 0 only.
49         size_t input_offset = {};  // A_arg.indirect.start_row (scaled by 8 to index the row-pointer table) or A_arg.direct.stride (scaled by 4 between rows).
50     } ka;
51 
52     unsigned long flags=0;
53     void *output_ptr;
54     void *input_ptr;
55 
56     if (output_arg.is_indirect) {
57         output_ptr=(void *)(output_arg.indirect.ptr);
58         ka.output_offset=output_arg.indirect.offset;
59         flags |= 0x4;
60     } else {
61         output_ptr=(void *)(output_arg.direct.base);
62         ka.output_offset=output_arg.direct.stride;
63     }
64 
65     if (A_arg.is_indirect) {
66         input_ptr=(void *)(A_arg.indirect.ptr);
67         ka.input_offset=A_arg.indirect.start_row;
68         ka.input_initial_col=A_arg.indirect.start_col;
69         flags |= 0x8;
70     } else {
71         assert(num_strings==1);
72         input_ptr=(void *)(A_arg.direct.base);
73         ka.input_offset=A_arg.direct.stride;
74     }
75     if (accumulate) {
76         flags |= 0x1;
77     }
78     ka.num_strings = num_strings;
79     ka.string_lengths = string_lengths;
80     ka.N = N;
81     ka.B_ptr = B_ptr;
82     switch(act.type) {
83         default:
84         case Activation::Type::None:
85             break;
86         case Activation::Type::BoundedReLU:
87             ka.maxval = static_cast<float>(act.param1);
88             /* fall through */
89         case Activation::Type::ReLU:
90             ka.minval = 0;
91             flags |= 0x2;
92             break;
93     }
94     __asm__ __volatile__(
95       "ptrue p2.b\n"
96       "1:"  // Row loop
97       "cmp %x[M], #0x8\n"
98       "bge 92f\n"
99       "cmp %x[M], #0x6\n"
100       "bgt 79f\n"
101       "beq 66f\n"
102       "cmp %x[M], #0x4\n"
103       "bgt 53f\n"
104       "beq 40f\n"
105       "cmp %x[M], #0x2\n"
106       "bgt 27f\n"
107       "beq 14f\n"
108       "ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
109       "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
110       "mov x11, %x[bias]\n"
111       "mov x10, %x[output_ptr]\n"
112       "2:"  // Height 1: Column loop
113       "mov x19, #0x0\n"
114       "whilelt p1.s, x19, x13\n"
115       "cbz x11, 3f\n"
116       "ld1w { z24.s }, p2/Z, [x11]\n"
117       "addvl x11, x11, #1\n"
118       "b 5f\n"
119       "3:"  // Height 1: no bias
120       "tbz %x[flags], #0, 4f\n"
121       "ld1w { z24.s }, p1/Z, [x10]\n"
122       "b 5f\n"
123       "4:"  // Height 1: no accumulate
124       "mov z24.b, #0x0\n"
125       "5:"  // Height 1: setup done
126       "mov x9, #0x0\n"
127       "6:"  // Height 1: String loop
128       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
129       "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
130       "ldr w28, [x20, x9, LSL #0x2]\n"
131       "tbz %x[flags], #3, 7f\n"
132       "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n"
133       "add x20, x20, x19, LSL #3\n"
134       "ldr x27, [x20, #0x0]\n"
135       "cbnz x9, 8f\n"
136       "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
137       "add x27, x27, x19, LSL #2\n"
138       "b 8f\n"
139       "7:"  // Height 1: setup direct input
140       "mov x27, %x[input_ptr]\n"
141       "8:"  // Height 1: input setup done
142       "cmp x28, #0x4\n"
143       "ble 10f\n"
144       "9:"  // Height 1: Multiply loop: Main loop head
145       "ld1w { z8.s }, p2/Z, [x12]\n"
146       "whilelt p0.s, XZR, x28\n"
147       "ld1w { z9.s }, p2/Z, [x12, #1, MUL VL]\n"
148       "sub x28, x28, #0x4\n"
149       "ld1rqw { z0.s }, p0/Z, [x27]\n"
150       "fmla z24.s, z8.s, z0.s[0]\n"
151       "ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n"
152       "cmp x28, #0x4\n"
153       "fmla z24.s, z9.s, z0.s[1]\n"
154       "ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n"
155       "add x27, x27, #0x10\n"
156       "fmla z24.s, z10.s, z0.s[2]\n"
157       "addvl x12, x12, #4\n"
158       "fmla z24.s, z11.s, z0.s[3]\n"
159       "bgt 9b\n"
160       "10:"  // Height 1: Multiply loop: Single iteration only
161       "ld1w { z8.s }, p2/Z, [x12]\n"
162       "whilelt p0.s, XZR, x28\n"
163       "subs x28, x28, #0x1\n"
164       "ld1rqw { z0.s }, p0/Z, [x27]\n"
165       "fmla z24.s, z8.s, z0.s[0]\n"
166       "addvl x12, x12, #1\n"
167       "ble 11f\n"
168       "ld1w { z9.s }, p2/Z, [x12]\n"
169       "fmla z24.s, z9.s, z0.s[1]\n"
170       "subs x28, x28, #0x1\n"
171       "addvl x12, x12, #1\n"
172       "ble 11f\n"
173       "ld1w { z10.s }, p2/Z, [x12]\n"
174       "fmla z24.s, z10.s, z0.s[2]\n"
175       "subs x28, x28, #0x1\n"
176       "addvl x12, x12, #1\n"
177       "ble 11f\n"
178       "ld1w { z11.s }, p2/Z, [x12]\n"
179       "fmla z24.s, z11.s, z0.s[3]\n"
180       "addvl x12, x12, #1\n"
181       "11:"  // Height 1: Multiply loop: multiply skip
182       "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
183       "add x9, x9, #0x1\n"
184       "cmp x9, x19\n"
185       "bne 6b\n"
186       "tbz %x[flags], #1, 12f\n"
187       "add x19, %x[args_ptr], %[offset_min]\n"
188       "ld1rw { z17.s }, p2/Z, [x19]\n"
189       "add x19, %x[args_ptr], %[offset_max]\n"
190       "ld1rw { z16.s }, p2/Z, [x19]\n"
191       "fmin z24.s, p2/M, z24.s, z16.s\n"
192       "fmax z24.s, p2/M, z24.s, z17.s\n"
193       "12:"  // Height 1: No activation
194       "st1w { z24.s }, p1, [x10]\n"
195       "addvl x10, x10, #1\n"
196       "13:"  // Height 1: Writeback done
197       "decw x13\n"
198       "cmp x13, XZR\n"
199       "bgt 2b\n"
200       "b 106f\n"
201       "14:"  // Height 2
202       "ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
203       "mov x11, %x[bias]\n"
204       "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
205       "mov x10, %x[output_ptr]\n"
206       "15:"  // Height 2: Column loop
207       "mov x19, #0x0\n"
208       "whilelt p1.s, x19, x13\n"
209       "cbz x11, 16f\n"
210       "ld1w { z24.s }, p2/Z, [x11]\n"
211       "mov z25.d, z24.d\n"
212       "addvl x11, x11, #1\n"
213       "b 18f\n"
214       "16:"  // Height 2: no bias
215       "tbz %x[flags], #0, 17f\n"
216       "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
217       "ld1w { z24.s }, p1/Z, [x10]\n"
218       "add x26, x10, x19, LSL #2\n"
219       "ld1w { z25.s }, p1/Z, [x26]\n"
220       "b 18f\n"
221       "17:"  // Height 2: no accumulate
222       "mov z24.b, #0x0\n"
223       "mov z25.b, #0x0\n"
224       "18:"  // Height 2: setup done
225       "mov x9, #0x0\n"
226       "19:"  // Height 2: String loop
227       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
228       "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
229       "ldr w28, [x20, x9, LSL #0x2]\n"
230       "tbz %x[flags], #3, 20f\n"
231       "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n"
232       "add x20, x20, x19, LSL #3\n"
233       "ldr x27, [x20, #0x0]\n"
234       "ldr x26, [x20, #0x8]\n"
235       "cbnz x9, 21f\n"
236       "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
237       "add x27, x27, x19, LSL #2\n"
238       "add x26, x26, x19, LSL #2\n"
239       "b 21f\n"
240       "20:"  // Height 2: setup direct input
241       "mov x27, %x[input_ptr]\n"
242       "add x26, x27, x19, LSL #2\n"
243       "21:"  // Height 2: input setup done
244       "cmp x28, #0x4\n"
245       "ble 23f\n"
246       "22:"  // Height 2: Multiply loop: Main loop head
247       "ld1w { z8.s }, p2/Z, [x12]\n"
248       "whilelt p0.s, XZR, x28\n"
249       "ld1w { z9.s }, p2/Z, [x12, #1, MUL VL]\n"
250       "sub x28, x28, #0x4\n"
251       "ld1rqw { z0.s }, p0/Z, [x27]\n"
252       "fmla z24.s, z8.s, z0.s[0]\n"
253       "ld1rqw { z1.s }, p0/Z, [x26]\n"
254       "cmp x28, #0x4\n"
255       "fmla z25.s, z8.s, z1.s[0]\n"
256       "ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n"
257       "add x27, x27, #0x10\n"
258       "fmla z24.s, z9.s, z0.s[1]\n"
259       "ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n"
260       "add x26, x26, #0x10\n"
261       "fmla z25.s, z9.s, z1.s[1]\n"
262       "addvl x12, x12, #4\n"
263       "fmla z24.s, z10.s, z0.s[2]\n"
264       "fmla z25.s, z10.s, z1.s[2]\n"
265       "fmla z24.s, z11.s, z0.s[3]\n"
266       "fmla z25.s, z11.s, z1.s[3]\n"
267       "bgt 22b\n"
268       "23:"  // Height 2: Multiply loop: Single iteration only
269       "ld1w { z8.s }, p2/Z, [x12]\n"
270       "whilelt p0.s, XZR, x28\n"
271       "subs x28, x28, #0x1\n"
272       "ld1rqw { z0.s }, p0/Z, [x27]\n"
273       "fmla z24.s, z8.s, z0.s[0]\n"
274       "ld1rqw { z1.s }, p0/Z, [x26]\n"
275       "addvl x12, x12, #1\n"
276       "fmla z25.s, z8.s, z1.s[0]\n"
277       "ble 24f\n"
278       "ld1w { z9.s }, p2/Z, [x12]\n"
279       "fmla z24.s, z9.s, z0.s[1]\n"
280       "subs x28, x28, #0x1\n"
281       "addvl x12, x12, #1\n"
282       "fmla z25.s, z9.s, z1.s[1]\n"
283       "ble 24f\n"
284       "ld1w { z10.s }, p2/Z, [x12]\n"
285       "fmla z24.s, z10.s, z0.s[2]\n"
286       "subs x28, x28, #0x1\n"
287       "fmla z25.s, z10.s, z1.s[2]\n"
288       "addvl x12, x12, #1\n"
289       "ble 24f\n"
290       "ld1w { z11.s }, p2/Z, [x12]\n"
291       "fmla z24.s, z11.s, z0.s[3]\n"
292       "addvl x12, x12, #1\n"
293       "fmla z25.s, z11.s, z1.s[3]\n"
294       "24:"  // Height 2: Multiply loop: multiply skip
295       "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
296       "add x9, x9, #0x1\n"
297       "cmp x9, x19\n"
298       "bne 19b\n"
299       "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
300       "add x26, x10, x19, LSL #2\n"
301       "tbz %x[flags], #1, 25f\n"
302       "add x19, %x[args_ptr], %[offset_min]\n"
303       "ld1rw { z17.s }, p2/Z, [x19]\n"
304       "add x19, %x[args_ptr], %[offset_max]\n"
305       "ld1rw { z16.s }, p2/Z, [x19]\n"
306       "fmin z24.s, p2/M, z24.s, z16.s\n"
307       "fmin z25.s, p2/M, z25.s, z16.s\n"
308       "fmax z24.s, p2/M, z24.s, z17.s\n"
309       "fmax z25.s, p2/M, z25.s, z17.s\n"
310       "25:"  // Height 2: No activation
311       "st1w { z24.s }, p1, [x10]\n"
312       "addvl x10, x10, #1\n"
313       "st1w { z25.s }, p1, [x26]\n"
314       "26:"  // Height 2: Writeback done
315       "decw x13\n"
316       "cmp x13, XZR\n"
317       "bgt 15b\n"
318       "b 106f\n"
319       "27:"  // Height 3
320       "ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
321       "mov x11, %x[bias]\n"
322       "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
323       "mov x10, %x[output_ptr]\n"
324       "28:"  // Height 3: Column loop
325       "mov x19, #0x0\n"
326       "whilelt p1.s, x19, x13\n"
327       "cbz x11, 29f\n"
328       "ld1w { z24.s }, p2/Z, [x11]\n"
329       "mov z25.d, z24.d\n"
330       "addvl x11, x11, #1\n"
331       "mov z26.d, z24.d\n"
332       "b 31f\n"
333       "29:"  // Height 3: no bias
334       "tbz %x[flags], #0, 30f\n"
335       "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
336       "ld1w { z24.s }, p1/Z, [x10]\n"
337       "add x26, x10, x19, LSL #2\n"
338       "ld1w { z25.s }, p1/Z, [x26]\n"
339       "add x25, x26, x19, LSL #2\n"
340       "ld1w { z26.s }, p1/Z, [x25]\n"
341       "b 31f\n"
342       "30:"  // Height 3: no accumulate
343       "mov z24.b, #0x0\n"
344       "mov z25.b, #0x0\n"
345       "mov z26.b, #0x0\n"
346       "31:"  // Height 3: setup done
347       "mov x9, #0x0\n"
348       "32:"  // Height 3: String loop
349       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
350       "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
351       "ldr w28, [x20, x9, LSL #0x2]\n"
352       "tbz %x[flags], #3, 33f\n"
353       "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n"
354       "add x20, x20, x19, LSL #3\n"
355       "ldr x27, [x20, #0x0]\n"
356       "ldr x26, [x20, #0x8]\n"
357       "ldr x25, [x20, #0x10]\n"
358       "cbnz x9, 34f\n"
359       "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
360       "add x27, x27, x19, LSL #2\n"
361       "add x26, x26, x19, LSL #2\n"
362       "add x25, x25, x19, LSL #2\n"
363       "b 34f\n"
364       "33:"  // Height 3: setup direct input
365       "mov x27, %x[input_ptr]\n"
366       "add x26, x27, x19, LSL #2\n"
367       "add x25, x26, x19, LSL #2\n"
368       "34:"  // Height 3: input setup done
369       "cmp x28, #0x4\n"
370       "ble 36f\n"
371       "35:"  // Height 3: Multiply loop: Main loop head
372       "ld1w { z8.s }, p2/Z, [x12]\n"
373       "whilelt p0.s, XZR, x28\n"
374       "ld1w { z9.s }, p2/Z, [x12, #1, MUL VL]\n"
375       "sub x28, x28, #0x4\n"
376       "ld1rqw { z0.s }, p0/Z, [x27]\n"
377       "fmla z24.s, z8.s, z0.s[0]\n"
378       "ld1rqw { z1.s }, p0/Z, [x26]\n"
379       "cmp x28, #0x4\n"
380       "fmla z25.s, z8.s, z1.s[0]\n"
381       "ld1rqw { z2.s }, p0/Z, [x25]\n"
382       "add x27, x27, #0x10\n"
383       "fmla z24.s, z9.s, z0.s[1]\n"
384       "ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n"
385       "add x26, x26, #0x10\n"
386       "fmla z26.s, z8.s, z2.s[0]\n"
387       "ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n"
388       "add x25, x25, #0x10\n"
389       "fmla z25.s, z9.s, z1.s[1]\n"
390       "addvl x12, x12, #4\n"
391       "fmla z24.s, z10.s, z0.s[2]\n"
392       "fmla z26.s, z9.s, z2.s[1]\n"
393       "fmla z25.s, z10.s, z1.s[2]\n"
394       "fmla z24.s, z11.s, z0.s[3]\n"
395       "fmla z26.s, z10.s, z2.s[2]\n"
396       "fmla z25.s, z11.s, z1.s[3]\n"
397       "fmla z26.s, z11.s, z2.s[3]\n"
398       "bgt 35b\n"
399       "36:"  // Height 3: Multiply loop: Single iteration only
400       "ld1w { z8.s }, p2/Z, [x12]\n"
401       "whilelt p0.s, XZR, x28\n"
402       "subs x28, x28, #0x1\n"
403       "ld1rqw { z0.s }, p0/Z, [x27]\n"
404       "fmla z24.s, z8.s, z0.s[0]\n"
405       "ld1rqw { z1.s }, p0/Z, [x26]\n"
406       "addvl x12, x12, #1\n"
407       "fmla z25.s, z8.s, z1.s[0]\n"
408       "ld1rqw { z2.s }, p0/Z, [x25]\n"
409       "fmla z26.s, z8.s, z2.s[0]\n"
410       "ble 37f\n"
411       "ld1w { z9.s }, p2/Z, [x12]\n"
412       "fmla z24.s, z9.s, z0.s[1]\n"
413       "subs x28, x28, #0x1\n"
414       "fmla z25.s, z9.s, z1.s[1]\n"
415       "addvl x12, x12, #1\n"
416       "fmla z26.s, z9.s, z2.s[1]\n"
417       "ble 37f\n"
418       "ld1w { z10.s }, p2/Z, [x12]\n"
419       "fmla z24.s, z10.s, z0.s[2]\n"
420       "subs x28, x28, #0x1\n"
421       "fmla z25.s, z10.s, z1.s[2]\n"
422       "addvl x12, x12, #1\n"
423       "fmla z26.s, z10.s, z2.s[2]\n"
424       "ble 37f\n"
425       "ld1w { z11.s }, p2/Z, [x12]\n"
426       "fmla z24.s, z11.s, z0.s[3]\n"
427       "addvl x12, x12, #1\n"
428       "fmla z25.s, z11.s, z1.s[3]\n"
429       "fmla z26.s, z11.s, z2.s[3]\n"
430       "37:"  // Height 3: Multiply loop: multiply skip
431       "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
432       "add x9, x9, #0x1\n"
433       "cmp x9, x19\n"
434       "bne 32b\n"
435       "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
436       "add x26, x10, x19, LSL #2\n"
437       "add x25, x26, x19, LSL #2\n"
438       "tbz %x[flags], #1, 38f\n"
439       "add x19, %x[args_ptr], %[offset_min]\n"
440       "ld1rw { z17.s }, p2/Z, [x19]\n"
441       "add x19, %x[args_ptr], %[offset_max]\n"
442       "ld1rw { z16.s }, p2/Z, [x19]\n"
443       "fmin z24.s, p2/M, z24.s, z16.s\n"
444       "fmin z25.s, p2/M, z25.s, z16.s\n"
445       "fmin z26.s, p2/M, z26.s, z16.s\n"
446       "fmax z24.s, p2/M, z24.s, z17.s\n"
447       "fmax z25.s, p2/M, z25.s, z17.s\n"
448       "fmax z26.s, p2/M, z26.s, z17.s\n"
449       "38:"  // Height 3: No activation
450       "st1w { z24.s }, p1, [x10]\n"
451       "addvl x10, x10, #1\n"
452       "st1w { z25.s }, p1, [x26]\n"
453       "st1w { z26.s }, p1, [x25]\n"
454       "39:"  // Height 3: Writeback done
455       "decw x13\n"
456       "cmp x13, XZR\n"
457       "bgt 28b\n"
458       "b 106f\n"
459       "40:"  // Height 4
460       "ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
461       "mov x11, %x[bias]\n"
462       "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
463       "mov x10, %x[output_ptr]\n"
464       "41:"  // Height 4: Column loop
465       "mov x19, #0x0\n"
466       "whilelt p1.s, x19, x13\n"
467       "cbz x11, 42f\n"
468       "ld1w { z24.s }, p2/Z, [x11]\n"
469       "mov z25.d, z24.d\n"
470       "addvl x11, x11, #1\n"
471       "mov z26.d, z24.d\n"
472       "mov z27.d, z24.d\n"
473       "b 44f\n"
474       "42:"  // Height 4: no bias
475       "tbz %x[flags], #0, 43f\n"
476       "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
477       "ld1w { z24.s }, p1/Z, [x10]\n"
478       "add x26, x10, x19, LSL #2\n"
479       "ld1w { z25.s }, p1/Z, [x26]\n"
480       "add x25, x26, x19, LSL #2\n"
481       "ld1w { z26.s }, p1/Z, [x25]\n"
482       "add x24, x25, x19, LSL #2\n"
483       "ld1w { z27.s }, p1/Z, [x24]\n"
484       "b 44f\n"
485       "43:"  // Height 4: no accumulate
486       "mov z24.b, #0x0\n"
487       "mov z25.b, #0x0\n"
488       "mov z26.b, #0x0\n"
489       "mov z27.b, #0x0\n"
490       "44:"  // Height 4: setup done
491       "mov x9, #0x0\n"
492       "45:"  // Height 4: String loop
493       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
494       "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
495       "ldr w28, [x20, x9, LSL #0x2]\n"
496       "tbz %x[flags], #3, 46f\n"
497       "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n"
498       "add x20, x20, x19, LSL #3\n"
499       "ldr x27, [x20, #0x0]\n"
500       "ldr x26, [x20, #0x8]\n"
501       "ldr x25, [x20, #0x10]\n"
502       "ldr x24, [x20, #0x18]\n"
503       "cbnz x9, 47f\n"
504       "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
505       "add x27, x27, x19, LSL #2\n"
506       "add x26, x26, x19, LSL #2\n"
507       "add x25, x25, x19, LSL #2\n"
508       "add x24, x24, x19, LSL #2\n"
509       "b 47f\n"
510       "46:"  // Height 4: setup direct input
511       "mov x27, %x[input_ptr]\n"
512       "add x26, x27, x19, LSL #2\n"
513       "add x25, x26, x19, LSL #2\n"
514       "add x24, x25, x19, LSL #2\n"
515       "47:"  // Height 4: input setup done
516       "cmp x28, #0x4\n"
517       "ble 49f\n"
518       "48:"  // Height 4: Multiply loop: Main loop head
519       "ld1w { z8.s }, p2/Z, [x12]\n"
520       "whilelt p0.s, XZR, x28\n"
521       "ld1w { z9.s }, p2/Z, [x12, #1, MUL VL]\n"
522       "sub x28, x28, #0x4\n"
523       "ld1rqw { z0.s }, p0/Z, [x27]\n"
524       "fmla z24.s, z8.s, z0.s[0]\n"
525       "ld1rqw { z1.s }, p0/Z, [x26]\n"
526       "cmp x28, #0x4\n"
527       "fmla z25.s, z8.s, z1.s[0]\n"
528       "ld1rqw { z2.s }, p0/Z, [x25]\n"
529       "add x27, x27, #0x10\n"
530       "fmla z24.s, z9.s, z0.s[1]\n"
531       "ld1rqw { z3.s }, p0/Z, [x24]\n"
532       "add x26, x26, #0x10\n"
533       "fmla z26.s, z8.s, z2.s[0]\n"
534       "ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n"
535       "add x25, x25, #0x10\n"
536       "fmla z27.s, z8.s, z3.s[0]\n"
537       "ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n"
538       "add x24, x24, #0x10\n"
539       "fmla z25.s, z9.s, z1.s[1]\n"
540       "addvl x12, x12, #4\n"
541       "fmla z24.s, z10.s, z0.s[2]\n"
542       "fmla z26.s, z9.s, z2.s[1]\n"
543       "fmla z27.s, z9.s, z3.s[1]\n"
544       "fmla z25.s, z10.s, z1.s[2]\n"
545       "fmla z24.s, z11.s, z0.s[3]\n"
546       "fmla z26.s, z10.s, z2.s[2]\n"
547       "fmla z27.s, z10.s, z3.s[2]\n"
548       "fmla z25.s, z11.s, z1.s[3]\n"
549       "fmla z26.s, z11.s, z2.s[3]\n"
550       "fmla z27.s, z11.s, z3.s[3]\n"
551       "bgt 48b\n"
552       "49:"  // Height 4: Multiply loop: Single iteration only
553       "ld1w { z8.s }, p2/Z, [x12]\n"
554       "whilelt p0.s, XZR, x28\n"
555       "subs x28, x28, #0x1\n"
556       "ld1rqw { z0.s }, p0/Z, [x27]\n"
557       "fmla z24.s, z8.s, z0.s[0]\n"
558       "ld1rqw { z1.s }, p0/Z, [x26]\n"
559       "addvl x12, x12, #1\n"
560       "fmla z25.s, z8.s, z1.s[0]\n"
561       "ld1rqw { z2.s }, p0/Z, [x25]\n"
562       "ld1rqw { z3.s }, p0/Z, [x24]\n"
563       "fmla z26.s, z8.s, z2.s[0]\n"
564       "fmla z27.s, z8.s, z3.s[0]\n"
565       "ble 50f\n"
566       "ld1w { z9.s }, p2/Z, [x12]\n"
567       "fmla z24.s, z9.s, z0.s[1]\n"
568       "subs x28, x28, #0x1\n"
569       "fmla z25.s, z9.s, z1.s[1]\n"
570       "addvl x12, x12, #1\n"
571       "fmla z26.s, z9.s, z2.s[1]\n"
572       "fmla z27.s, z9.s, z3.s[1]\n"
573       "ble 50f\n"
574       "ld1w { z10.s }, p2/Z, [x12]\n"
575       "fmla z24.s, z10.s, z0.s[2]\n"
576       "subs x28, x28, #0x1\n"
577       "fmla z25.s, z10.s, z1.s[2]\n"
578       "addvl x12, x12, #1\n"
579       "fmla z26.s, z10.s, z2.s[2]\n"
580       "fmla z27.s, z10.s, z3.s[2]\n"
581       "ble 50f\n"
582       "ld1w { z11.s }, p2/Z, [x12]\n"
583       "fmla z24.s, z11.s, z0.s[3]\n"
584       "addvl x12, x12, #1\n"
585       "fmla z25.s, z11.s, z1.s[3]\n"
586       "fmla z26.s, z11.s, z2.s[3]\n"
587       "fmla z27.s, z11.s, z3.s[3]\n"
588       "50:"  // Height 4: Multiply loop: multiply skip
589       "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
590       "add x9, x9, #0x1\n"
591       "cmp x9, x19\n"
592       "bne 45b\n"
593       "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
594       "add x26, x10, x19, LSL #2\n"
595       "add x25, x26, x19, LSL #2\n"
596       "add x24, x25, x19, LSL #2\n"
597       "tbz %x[flags], #1, 51f\n"
598       "add x19, %x[args_ptr], %[offset_min]\n"
599       "ld1rw { z17.s }, p2/Z, [x19]\n"
600       "add x19, %x[args_ptr], %[offset_max]\n"
601       "ld1rw { z16.s }, p2/Z, [x19]\n"
602       "fmin z24.s, p2/M, z24.s, z16.s\n"
603       "fmin z25.s, p2/M, z25.s, z16.s\n"
604       "fmin z26.s, p2/M, z26.s, z16.s\n"
605       "fmin z27.s, p2/M, z27.s, z16.s\n"
606       "fmax z24.s, p2/M, z24.s, z17.s\n"
607       "fmax z25.s, p2/M, z25.s, z17.s\n"
608       "fmax z26.s, p2/M, z26.s, z17.s\n"
609       "fmax z27.s, p2/M, z27.s, z17.s\n"
610       "51:"  // Height 4: No activation
611       "st1w { z24.s }, p1, [x10]\n"
612       "addvl x10, x10, #1\n"
613       "st1w { z25.s }, p1, [x26]\n"
614       "st1w { z26.s }, p1, [x25]\n"
615       "st1w { z27.s }, p1, [x24]\n"
616       "52:"  // Height 4: Writeback done
617       "decw x13\n"
618       "cmp x13, XZR\n"
619       "bgt 41b\n"
620       "b 106f\n"
621       "53:"  // Height 5
622       "ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
623       "mov x11, %x[bias]\n"
624       "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
625       "mov x10, %x[output_ptr]\n"
626       "54:"  // Height 5: Column loop
627       "mov x19, #0x0\n"
628       "whilelt p1.s, x19, x13\n"
629       "cbz x11, 55f\n"
630       "ld1w { z24.s }, p2/Z, [x11]\n"
631       "mov z25.d, z24.d\n"
632       "addvl x11, x11, #1\n"
633       "mov z26.d, z24.d\n"
634       "mov z27.d, z24.d\n"
635       "mov z28.d, z24.d\n"
636       "b 57f\n"
637       "55:"  // Height 5: no bias
638       "tbz %x[flags], #0, 56f\n"
639       "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
640       "ld1w { z24.s }, p1/Z, [x10]\n"
641       "add x26, x10, x19, LSL #2\n"
642       "ld1w { z25.s }, p1/Z, [x26]\n"
643       "add x25, x26, x19, LSL #2\n"
644       "ld1w { z26.s }, p1/Z, [x25]\n"
645       "add x24, x25, x19, LSL #2\n"
646       "ld1w { z27.s }, p1/Z, [x24]\n"
647       "add x23, x24, x19, LSL #2\n"
648       "ld1w { z28.s }, p1/Z, [x23]\n"
649       "b 57f\n"
650       "56:"  // Height 5: no accumulate
651       "mov z24.b, #0x0\n"
652       "mov z25.b, #0x0\n"
653       "mov z26.b, #0x0\n"
654       "mov z27.b, #0x0\n"
655       "mov z28.b, #0x0\n"
656       "57:"  // Height 5: setup done
657       "mov x9, #0x0\n"
658       "58:"  // Height 5: String loop
659       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
660       "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
661       "ldr w28, [x20, x9, LSL #0x2]\n"
662       "tbz %x[flags], #3, 59f\n"
663       "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n"
664       "add x20, x20, x19, LSL #3\n"
665       "ldr x27, [x20, #0x0]\n"
666       "ldr x26, [x20, #0x8]\n"
667       "ldr x25, [x20, #0x10]\n"
668       "ldr x24, [x20, #0x18]\n"
669       "ldr x23, [x20, #0x20]\n"
670       "cbnz x9, 60f\n"
671       "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
672       "add x27, x27, x19, LSL #2\n"
673       "add x26, x26, x19, LSL #2\n"
674       "add x25, x25, x19, LSL #2\n"
675       "add x24, x24, x19, LSL #2\n"
676       "add x23, x23, x19, LSL #2\n"
677       "b 60f\n"
678       "59:"  // Height 5: setup direct input
679       "mov x27, %x[input_ptr]\n"
680       "add x26, x27, x19, LSL #2\n"
681       "add x25, x26, x19, LSL #2\n"
682       "add x24, x25, x19, LSL #2\n"
683       "add x23, x24, x19, LSL #2\n"
684       "60:"  // Height 5: input setup done
685       "cmp x28, #0x4\n"
686       "ble 62f\n"
687       "61:"  // Height 5: Multiply loop: Main loop head
688       "ld1w { z8.s }, p2/Z, [x12]\n"
689       "whilelt p0.s, XZR, x28\n"
690       "ld1w { z9.s }, p2/Z, [x12, #1, MUL VL]\n"
691       "sub x28, x28, #0x4\n"
692       "ld1rqw { z0.s }, p0/Z, [x27]\n"
693       "fmla z24.s, z8.s, z0.s[0]\n"
694       "ld1rqw { z1.s }, p0/Z, [x26]\n"
695       "cmp x28, #0x4\n"
696       "fmla z25.s, z8.s, z1.s[0]\n"
697       "ld1rqw { z2.s }, p0/Z, [x25]\n"
698       "add x27, x27, #0x10\n"
699       "fmla z24.s, z9.s, z0.s[1]\n"
700       "ld1rqw { z3.s }, p0/Z, [x24]\n"
701       "add x26, x26, #0x10\n"
702       "fmla z26.s, z8.s, z2.s[0]\n"
703       "ld1rqw { z4.s }, p0/Z, [x23]\n"
704       "add x25, x25, #0x10\n"
705       "fmla z27.s, z8.s, z3.s[0]\n"
706       "ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n"
707       "add x24, x24, #0x10\n"
708       "fmla z25.s, z9.s, z1.s[1]\n"
709       "ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n"
710       "add x23, x23, #0x10\n"
711       "fmla z28.s, z8.s, z4.s[0]\n"
712       "addvl x12, x12, #4\n"
713       "fmla z26.s, z9.s, z2.s[1]\n"
714       "fmla z24.s, z10.s, z0.s[2]\n"
715       "fmla z27.s, z9.s, z3.s[1]\n"
716       "fmla z25.s, z10.s, z1.s[2]\n"
717       "fmla z28.s, z9.s, z4.s[1]\n"
718       "fmla z26.s, z10.s, z2.s[2]\n"
719       "fmla z27.s, z10.s, z3.s[2]\n"
720       "fmla z24.s, z11.s, z0.s[3]\n"
721       "fmla z28.s, z10.s, z4.s[2]\n"
722       "fmla z25.s, z11.s, z1.s[3]\n"
723       "fmla z26.s, z11.s, z2.s[3]\n"
724       "fmla z27.s, z11.s, z3.s[3]\n"
725       "fmla z28.s, z11.s, z4.s[3]\n"
726       "bgt 61b\n"
727       "62:"  // Height 5: Multiply loop: Single iteration only
728       "ld1w { z8.s }, p2/Z, [x12]\n"
729       "whilelt p0.s, XZR, x28\n"
730       "subs x28, x28, #0x1\n"
731       "ld1rqw { z0.s }, p0/Z, [x27]\n"
732       "fmla z24.s, z8.s, z0.s[0]\n"
733       "ld1rqw { z1.s }, p0/Z, [x26]\n"
734       "addvl x12, x12, #1\n"
735       "fmla z25.s, z8.s, z1.s[0]\n"
736       "ld1rqw { z2.s }, p0/Z, [x25]\n"
737       "ld1rqw { z3.s }, p0/Z, [x24]\n"
738       "fmla z26.s, z8.s, z2.s[0]\n"
739       "ld1rqw { z4.s }, p0/Z, [x23]\n"
740       "fmla z27.s, z8.s, z3.s[0]\n"
741       "fmla z28.s, z8.s, z4.s[0]\n"
742       "ble 63f\n"
743       "ld1w { z9.s }, p2/Z, [x12]\n"
744       "fmla z24.s, z9.s, z0.s[1]\n"
745       "subs x28, x28, #0x1\n"
746       "fmla z25.s, z9.s, z1.s[1]\n"
747       "addvl x12, x12, #1\n"
748       "fmla z26.s, z9.s, z2.s[1]\n"
749       "fmla z27.s, z9.s, z3.s[1]\n"
750       "fmla z28.s, z9.s, z4.s[1]\n"
751       "ble 63f\n"
752       "ld1w { z10.s }, p2/Z, [x12]\n"
753       "fmla z24.s, z10.s, z0.s[2]\n"
754       "subs x28, x28, #0x1\n"
755       "fmla z25.s, z10.s, z1.s[2]\n"
756       "addvl x12, x12, #1\n"
757       "fmla z26.s, z10.s, z2.s[2]\n"
758       "fmla z27.s, z10.s, z3.s[2]\n"
759       "fmla z28.s, z10.s, z4.s[2]\n"
760       "ble 63f\n"
761       "ld1w { z11.s }, p2/Z, [x12]\n"
762       "fmla z24.s, z11.s, z0.s[3]\n"
763       "addvl x12, x12, #1\n"
764       "fmla z25.s, z11.s, z1.s[3]\n"
765       "fmla z26.s, z11.s, z2.s[3]\n"
766       "fmla z27.s, z11.s, z3.s[3]\n"
767       "fmla z28.s, z11.s, z4.s[3]\n"
768       "63:"  // Height 5: Multiply loop: multiply skip
769       "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
770       "add x9, x9, #0x1\n"
771       "cmp x9, x19\n"
772       "bne 58b\n"
773       "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
774       "add x26, x10, x19, LSL #2\n"
775       "add x25, x26, x19, LSL #2\n"
776       "add x24, x25, x19, LSL #2\n"
777       "add x23, x24, x19, LSL #2\n"
778       "tbz %x[flags], #1, 64f\n"
779       "add x19, %x[args_ptr], %[offset_min]\n"
780       "ld1rw { z17.s }, p2/Z, [x19]\n"
781       "add x19, %x[args_ptr], %[offset_max]\n"
782       "ld1rw { z16.s }, p2/Z, [x19]\n"
783       "fmin z24.s, p2/M, z24.s, z16.s\n"
784       "fmin z25.s, p2/M, z25.s, z16.s\n"
785       "fmin z26.s, p2/M, z26.s, z16.s\n"
786       "fmin z27.s, p2/M, z27.s, z16.s\n"
787       "fmin z28.s, p2/M, z28.s, z16.s\n"
788       "fmax z24.s, p2/M, z24.s, z17.s\n"
789       "fmax z25.s, p2/M, z25.s, z17.s\n"
790       "fmax z26.s, p2/M, z26.s, z17.s\n"
791       "fmax z27.s, p2/M, z27.s, z17.s\n"
792       "fmax z28.s, p2/M, z28.s, z17.s\n"
793       "64:"  // Height 5: No activation
794       "st1w { z24.s }, p1, [x10]\n"
795       "addvl x10, x10, #1\n"
796       "st1w { z25.s }, p1, [x26]\n"
797       "st1w { z26.s }, p1, [x25]\n"
798       "st1w { z27.s }, p1, [x24]\n"
799       "st1w { z28.s }, p1, [x23]\n"
800       "65:"  // Height 5: Writeback done
801       "decw x13\n"
802       "cmp x13, XZR\n"
803       "bgt 54b\n"
804       "b 106f\n"
805       "66:"  // Height 6
806       "ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
807       "mov x11, %x[bias]\n"
808       "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
809       "mov x10, %x[output_ptr]\n"
810       "67:"  // Height 6: Column loop
811       "mov x19, #0x0\n"
812       "whilelt p1.s, x19, x13\n"
813       "cbz x11, 68f\n"
814       "ld1w { z24.s }, p2/Z, [x11]\n"
815       "mov z25.d, z24.d\n"
816       "addvl x11, x11, #1\n"
817       "mov z26.d, z24.d\n"
818       "mov z27.d, z24.d\n"
819       "mov z28.d, z24.d\n"
820       "mov z29.d, z24.d\n"
821       "b 70f\n"
822       "68:"  // Height 6: no bias
823       "tbz %x[flags], #0, 69f\n"
824       "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
825       "ld1w { z24.s }, p1/Z, [x10]\n"
826       "add x26, x10, x19, LSL #2\n"
827       "ld1w { z25.s }, p1/Z, [x26]\n"
828       "add x25, x26, x19, LSL #2\n"
829       "ld1w { z26.s }, p1/Z, [x25]\n"
830       "add x24, x25, x19, LSL #2\n"
831       "ld1w { z27.s }, p1/Z, [x24]\n"
832       "add x23, x24, x19, LSL #2\n"
833       "ld1w { z28.s }, p1/Z, [x23]\n"
834       "add x22, x23, x19, LSL #2\n"
835       "ld1w { z29.s }, p1/Z, [x22]\n"
836       "b 70f\n"
837       "69:"  // Height 6: no accumulate
838       "mov z24.b, #0x0\n"
839       "mov z25.b, #0x0\n"
840       "mov z26.b, #0x0\n"
841       "mov z27.b, #0x0\n"
842       "mov z28.b, #0x0\n"
843       "mov z29.b, #0x0\n"
844       "70:"  // Height 6: setup done
845       "mov x9, #0x0\n"
846       "71:"  // Height 6: String loop
847       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
848       "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
849       "ldr w28, [x20, x9, LSL #0x2]\n"
850       "tbz %x[flags], #3, 72f\n"
851       "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n"
852       "add x20, x20, x19, LSL #3\n"
853       "ldr x27, [x20, #0x0]\n"
854       "ldr x26, [x20, #0x8]\n"
855       "ldr x25, [x20, #0x10]\n"
856       "ldr x24, [x20, #0x18]\n"
857       "ldr x23, [x20, #0x20]\n"
858       "ldr x22, [x20, #0x28]\n"
859       "cbnz x9, 73f\n"
860       "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
861       "add x27, x27, x19, LSL #2\n"
862       "add x26, x26, x19, LSL #2\n"
863       "add x25, x25, x19, LSL #2\n"
864       "add x24, x24, x19, LSL #2\n"
865       "add x23, x23, x19, LSL #2\n"
866       "add x22, x22, x19, LSL #2\n"
867       "b 73f\n"
868       "72:"  // Height 6: setup direct input
869       "mov x27, %x[input_ptr]\n"
870       "add x26, x27, x19, LSL #2\n"
871       "add x25, x26, x19, LSL #2\n"
872       "add x24, x25, x19, LSL #2\n"
873       "add x23, x24, x19, LSL #2\n"
874       "add x22, x23, x19, LSL #2\n"
875       "73:"  // Height 6: input setup done
876       "cmp x28, #0x4\n"
877       "ble 75f\n"
878       "74:"  // Height 6: Multiply loop: Main loop head
879       "ld1w { z8.s }, p2/Z, [x12]\n"
880       "whilelt p0.s, XZR, x28\n"
881       "ld1w { z9.s }, p2/Z, [x12, #1, MUL VL]\n"
882       "sub x28, x28, #0x4\n"
883       "ld1rqw { z0.s }, p0/Z, [x27]\n"
884       "fmla z24.s, z8.s, z0.s[0]\n"
885       "ld1rqw { z1.s }, p0/Z, [x26]\n"
886       "cmp x28, #0x4\n"
887       "fmla z25.s, z8.s, z1.s[0]\n"
888       "ld1rqw { z2.s }, p0/Z, [x25]\n"
889       "add x27, x27, #0x10\n"
890       "fmla z24.s, z9.s, z0.s[1]\n"
891       "ld1rqw { z3.s }, p0/Z, [x24]\n"
892       "add x26, x26, #0x10\n"
893       "fmla z26.s, z8.s, z2.s[0]\n"
894       "ld1rqw { z4.s }, p0/Z, [x23]\n"
895       "add x25, x25, #0x10\n"
896       "fmla z27.s, z8.s, z3.s[0]\n"
897       "ld1rqw { z5.s }, p0/Z, [x22]\n"
898       "add x24, x24, #0x10\n"
899       "fmla z25.s, z9.s, z1.s[1]\n"
900       "ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n"
901       "add x23, x23, #0x10\n"
902       "fmla z28.s, z8.s, z4.s[0]\n"
903       "ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n"
904       "add x22, x22, #0x10\n"
905       "fmla z29.s, z8.s, z5.s[0]\n"
906       "addvl x12, x12, #4\n"
907       "fmla z26.s, z9.s, z2.s[1]\n"
908       "fmla z27.s, z9.s, z3.s[1]\n"
909       "fmla z24.s, z10.s, z0.s[2]\n"
910       "fmla z28.s, z9.s, z4.s[1]\n"
911       "fmla z29.s, z9.s, z5.s[1]\n"
912       "fmla z25.s, z10.s, z1.s[2]\n"
913       "fmla z26.s, z10.s, z2.s[2]\n"
914       "fmla z27.s, z10.s, z3.s[2]\n"
915       "fmla z28.s, z10.s, z4.s[2]\n"
916       "fmla z29.s, z10.s, z5.s[2]\n"
917       "fmla z24.s, z11.s, z0.s[3]\n"
918       "fmla z25.s, z11.s, z1.s[3]\n"
919       "fmla z26.s, z11.s, z2.s[3]\n"
920       "fmla z27.s, z11.s, z3.s[3]\n"
921       "fmla z28.s, z11.s, z4.s[3]\n"
922       "fmla z29.s, z11.s, z5.s[3]\n"
923       "bgt 74b\n"
924       "75:"  // Height 6: Multiply loop: Single iteration only
925       "ld1w { z8.s }, p2/Z, [x12]\n"
926       "whilelt p0.s, XZR, x28\n"
927       "subs x28, x28, #0x1\n"
928       "ld1rqw { z0.s }, p0/Z, [x27]\n"
929       "fmla z24.s, z8.s, z0.s[0]\n"
930       "ld1rqw { z1.s }, p0/Z, [x26]\n"
931       "addvl x12, x12, #1\n"
932       "fmla z25.s, z8.s, z1.s[0]\n"
933       "ld1rqw { z2.s }, p0/Z, [x25]\n"
934       "ld1rqw { z3.s }, p0/Z, [x24]\n"
935       "fmla z26.s, z8.s, z2.s[0]\n"
936       "ld1rqw { z4.s }, p0/Z, [x23]\n"
937       "fmla z27.s, z8.s, z3.s[0]\n"
938       "ld1rqw { z5.s }, p0/Z, [x22]\n"
939       "fmla z28.s, z8.s, z4.s[0]\n"
940       "fmla z29.s, z8.s, z5.s[0]\n"
941       "ble 76f\n"
942       "ld1w { z9.s }, p2/Z, [x12]\n"
943       "fmla z24.s, z9.s, z0.s[1]\n"
944       "subs x28, x28, #0x1\n"
945       "fmla z25.s, z9.s, z1.s[1]\n"
946       "addvl x12, x12, #1\n"
947       "fmla z26.s, z9.s, z2.s[1]\n"
948       "fmla z27.s, z9.s, z3.s[1]\n"
949       "fmla z28.s, z9.s, z4.s[1]\n"
950       "fmla z29.s, z9.s, z5.s[1]\n"
951       "ble 76f\n"
952       "ld1w { z10.s }, p2/Z, [x12]\n"
953       "fmla z24.s, z10.s, z0.s[2]\n"
954       "subs x28, x28, #0x1\n"
955       "fmla z25.s, z10.s, z1.s[2]\n"
956       "addvl x12, x12, #1\n"
957       "fmla z26.s, z10.s, z2.s[2]\n"
958       "fmla z27.s, z10.s, z3.s[2]\n"
959       "fmla z28.s, z10.s, z4.s[2]\n"
960       "fmla z29.s, z10.s, z5.s[2]\n"
961       "ble 76f\n"
962       "ld1w { z11.s }, p2/Z, [x12]\n"
963       "fmla z24.s, z11.s, z0.s[3]\n"
964       "addvl x12, x12, #1\n"
965       "fmla z25.s, z11.s, z1.s[3]\n"
966       "fmla z26.s, z11.s, z2.s[3]\n"
967       "fmla z27.s, z11.s, z3.s[3]\n"
968       "fmla z28.s, z11.s, z4.s[3]\n"
969       "fmla z29.s, z11.s, z5.s[3]\n"
970       "76:"  // Height 6: Multiply loop: multiply skip
971       "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
972       "add x9, x9, #0x1\n"
973       "cmp x9, x19\n"
974       "bne 71b\n"
975       "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
976       "add x26, x10, x19, LSL #2\n"
977       "add x25, x26, x19, LSL #2\n"
978       "add x24, x25, x19, LSL #2\n"
979       "add x23, x24, x19, LSL #2\n"
980       "add x22, x23, x19, LSL #2\n"
981       "tbz %x[flags], #1, 77f\n"
982       "add x19, %x[args_ptr], %[offset_min]\n"
983       "ld1rw { z17.s }, p2/Z, [x19]\n"
984       "add x19, %x[args_ptr], %[offset_max]\n"
985       "ld1rw { z16.s }, p2/Z, [x19]\n"
986       "fmin z24.s, p2/M, z24.s, z16.s\n"
987       "fmin z25.s, p2/M, z25.s, z16.s\n"
988       "fmin z26.s, p2/M, z26.s, z16.s\n"
989       "fmin z27.s, p2/M, z27.s, z16.s\n"
990       "fmin z28.s, p2/M, z28.s, z16.s\n"
991       "fmax z24.s, p2/M, z24.s, z17.s\n"
992       "fmax z25.s, p2/M, z25.s, z17.s\n"
993       "fmax z26.s, p2/M, z26.s, z17.s\n"
994       "fmax z27.s, p2/M, z27.s, z17.s\n"
995       "fmax z28.s, p2/M, z28.s, z17.s\n"
996       "fmin z29.s, p2/M, z29.s, z16.s\n"
997       "fmax z29.s, p2/M, z29.s, z17.s\n"
998       "77:"  // Height 6: No activation
999       "st1w { z24.s }, p1, [x10]\n"
1000       "addvl x10, x10, #1\n"
1001       "st1w { z25.s }, p1, [x26]\n"
1002       "st1w { z26.s }, p1, [x25]\n"
1003       "st1w { z27.s }, p1, [x24]\n"
1004       "st1w { z28.s }, p1, [x23]\n"
1005       "st1w { z29.s }, p1, [x22]\n"
1006       "78:"  // Height 6: Writeback done
1007       "decw x13\n"
1008       "cmp x13, XZR\n"
1009       "bgt 67b\n"
1010       "b 106f\n"
1011       "79:"  // Height 7
1012       "ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
1013       "mov x11, %x[bias]\n"
1014       "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
1015       "mov x10, %x[output_ptr]\n"
1016       "80:"  // Height 7: Column loop
1017       "mov x19, #0x0\n"
1018       "whilelt p1.s, x19, x13\n"
1019       "cbz x11, 81f\n"
1020       "ld1w { z24.s }, p2/Z, [x11]\n"
1021       "mov z25.d, z24.d\n"
1022       "addvl x11, x11, #1\n"
1023       "mov z26.d, z24.d\n"
1024       "mov z27.d, z24.d\n"
1025       "mov z28.d, z24.d\n"
1026       "mov z29.d, z24.d\n"
1027       "mov z30.d, z24.d\n"
1028       "b 83f\n"
1029       "81:"  // Height 7: no bias
1030       "tbz %x[flags], #0, 82f\n"
1031       "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
1032       "ld1w { z24.s }, p1/Z, [x10]\n"
1033       "add x26, x10, x19, LSL #2\n"
1034       "ld1w { z25.s }, p1/Z, [x26]\n"
1035       "add x25, x26, x19, LSL #2\n"
1036       "ld1w { z26.s }, p1/Z, [x25]\n"
1037       "add x24, x25, x19, LSL #2\n"
1038       "ld1w { z27.s }, p1/Z, [x24]\n"
1039       "add x23, x24, x19, LSL #2\n"
1040       "ld1w { z28.s }, p1/Z, [x23]\n"
1041       "add x22, x23, x19, LSL #2\n"
1042       "ld1w { z29.s }, p1/Z, [x22]\n"
1043       "add x21, x22, x19, LSL #2\n"
1044       "ld1w { z30.s }, p1/Z, [x21]\n"
1045       "b 83f\n"
1046       "82:"  // Height 7: no accumulate
1047       "mov z24.b, #0x0\n"
1048       "mov z25.b, #0x0\n"
1049       "mov z26.b, #0x0\n"
1050       "mov z27.b, #0x0\n"
1051       "mov z28.b, #0x0\n"
1052       "mov z29.b, #0x0\n"
1053       "mov z30.b, #0x0\n"
1054       "83:"  // Height 7: setup done
1055       "mov x9, #0x0\n"
1056       "84:"  // Height 7: String loop
1057       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
1058       "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
1059       "ldr w28, [x20, x9, LSL #0x2]\n"
1060       "tbz %x[flags], #3, 85f\n"
1061       "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n"
1062       "add x20, x20, x19, LSL #3\n"
1063       "ldr x27, [x20, #0x0]\n"
1064       "ldr x26, [x20, #0x8]\n"
1065       "ldr x25, [x20, #0x10]\n"
1066       "ldr x24, [x20, #0x18]\n"
1067       "ldr x23, [x20, #0x20]\n"
1068       "ldr x22, [x20, #0x28]\n"
1069       "ldr x21, [x20, #0x30]\n"
1070       "cbnz x9, 86f\n"
1071       "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
1072       "add x27, x27, x19, LSL #2\n"
1073       "add x26, x26, x19, LSL #2\n"
1074       "add x25, x25, x19, LSL #2\n"
1075       "add x24, x24, x19, LSL #2\n"
1076       "add x23, x23, x19, LSL #2\n"
1077       "add x22, x22, x19, LSL #2\n"
1078       "add x21, x21, x19, LSL #2\n"
1079       "b 86f\n"
1080       "85:"  // Height 7: setup direct input
1081       "mov x27, %x[input_ptr]\n"
1082       "add x26, x27, x19, LSL #2\n"
1083       "add x25, x26, x19, LSL #2\n"
1084       "add x24, x25, x19, LSL #2\n"
1085       "add x23, x24, x19, LSL #2\n"
1086       "add x22, x23, x19, LSL #2\n"
1087       "add x21, x22, x19, LSL #2\n"
1088       "86:"  // Height 7: input setup done
1089       "cmp x28, #0x4\n"
1090       "ble 88f\n"
1091       "87:"  // Height 7: Multiply loop: Main loop head
1092       "ld1w { z8.s }, p2/Z, [x12]\n"
1093       "whilelt p0.s, XZR, x28\n"
1094       "ld1w { z9.s }, p2/Z, [x12, #1, MUL VL]\n"
1095       "sub x28, x28, #0x4\n"
1096       "ld1rqw { z0.s }, p0/Z, [x27]\n"
1097       "fmla z24.s, z8.s, z0.s[0]\n"
1098       "ld1rqw { z1.s }, p0/Z, [x26]\n"
1099       "cmp x28, #0x4\n"
1100       "fmla z25.s, z8.s, z1.s[0]\n"
1101       "ld1rqw { z2.s }, p0/Z, [x25]\n"
1102       "add x27, x27, #0x10\n"
1103       "fmla z24.s, z9.s, z0.s[1]\n"
1104       "ld1rqw { z3.s }, p0/Z, [x24]\n"
1105       "add x26, x26, #0x10\n"
1106       "fmla z26.s, z8.s, z2.s[0]\n"
1107       "ld1rqw { z4.s }, p0/Z, [x23]\n"
1108       "add x25, x25, #0x10\n"
1109       "fmla z27.s, z8.s, z3.s[0]\n"
1110       "ld1rqw { z5.s }, p0/Z, [x22]\n"
1111       "add x24, x24, #0x10\n"
1112       "fmla z25.s, z9.s, z1.s[1]\n"
1113       "ld1rqw { z6.s }, p0/Z, [x21]\n"
1114       "add x23, x23, #0x10\n"
1115       "fmla z28.s, z8.s, z4.s[0]\n"
1116       "ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n"
1117       "add x22, x22, #0x10\n"
1118       "fmla z29.s, z8.s, z5.s[0]\n"
1119       "ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n"
1120       "add x21, x21, #0x10\n"
1121       "fmla z30.s, z8.s, z6.s[0]\n"
1122       "addvl x12, x12, #4\n"
1123       "fmla z26.s, z9.s, z2.s[1]\n"
1124       "fmla z27.s, z9.s, z3.s[1]\n"
1125       "fmla z28.s, z9.s, z4.s[1]\n"
1126       "fmla z29.s, z9.s, z5.s[1]\n"
1127       "fmla z30.s, z9.s, z6.s[1]\n"
1128       "fmla z24.s, z10.s, z0.s[2]\n"
1129       "fmla z25.s, z10.s, z1.s[2]\n"
1130       "fmla z26.s, z10.s, z2.s[2]\n"
1131       "fmla z27.s, z10.s, z3.s[2]\n"
1132       "fmla z28.s, z10.s, z4.s[2]\n"
1133       "fmla z29.s, z10.s, z5.s[2]\n"
1134       "fmla z30.s, z10.s, z6.s[2]\n"
1135       "fmla z24.s, z11.s, z0.s[3]\n"
1136       "fmla z25.s, z11.s, z1.s[3]\n"
1137       "fmla z26.s, z11.s, z2.s[3]\n"
1138       "fmla z27.s, z11.s, z3.s[3]\n"
1139       "fmla z28.s, z11.s, z4.s[3]\n"
1140       "fmla z29.s, z11.s, z5.s[3]\n"
1141       "fmla z30.s, z11.s, z6.s[3]\n"
1142       "bgt 87b\n"
1143       "88:"  // Height 7: Multiply loop: Single iteration only
1144       "ld1w { z8.s }, p2/Z, [x12]\n"
1145       "whilelt p0.s, XZR, x28\n"
1146       "subs x28, x28, #0x1\n"
1147       "ld1rqw { z0.s }, p0/Z, [x27]\n"
1148       "fmla z24.s, z8.s, z0.s[0]\n"
1149       "ld1rqw { z1.s }, p0/Z, [x26]\n"
1150       "addvl x12, x12, #1\n"
1151       "fmla z25.s, z8.s, z1.s[0]\n"
1152       "ld1rqw { z2.s }, p0/Z, [x25]\n"
1153       "ld1rqw { z3.s }, p0/Z, [x24]\n"
1154       "fmla z26.s, z8.s, z2.s[0]\n"
1155       "ld1rqw { z4.s }, p0/Z, [x23]\n"
1156       "fmla z27.s, z8.s, z3.s[0]\n"
1157       "ld1rqw { z5.s }, p0/Z, [x22]\n"
1158       "ld1rqw { z6.s }, p0/Z, [x21]\n"
1159       "fmla z28.s, z8.s, z4.s[0]\n"
1160       "fmla z29.s, z8.s, z5.s[0]\n"
1161       "fmla z30.s, z8.s, z6.s[0]\n"
1162       "ble 89f\n"
1163       "ld1w { z9.s }, p2/Z, [x12]\n"
1164       "fmla z24.s, z9.s, z0.s[1]\n"
1165       "subs x28, x28, #0x1\n"
1166       "fmla z25.s, z9.s, z1.s[1]\n"
1167       "addvl x12, x12, #1\n"
1168       "fmla z26.s, z9.s, z2.s[1]\n"
1169       "fmla z27.s, z9.s, z3.s[1]\n"
1170       "fmla z28.s, z9.s, z4.s[1]\n"
1171       "fmla z29.s, z9.s, z5.s[1]\n"
1172       "fmla z30.s, z9.s, z6.s[1]\n"
1173       "ble 89f\n"
1174       "ld1w { z10.s }, p2/Z, [x12]\n"
1175       "fmla z24.s, z10.s, z0.s[2]\n"
1176       "subs x28, x28, #0x1\n"
1177       "fmla z25.s, z10.s, z1.s[2]\n"
1178       "addvl x12, x12, #1\n"
1179       "fmla z26.s, z10.s, z2.s[2]\n"
1180       "fmla z27.s, z10.s, z3.s[2]\n"
1181       "fmla z28.s, z10.s, z4.s[2]\n"
1182       "fmla z29.s, z10.s, z5.s[2]\n"
1183       "fmla z30.s, z10.s, z6.s[2]\n"
1184       "ble 89f\n"
1185       "ld1w { z11.s }, p2/Z, [x12]\n"
1186       "fmla z24.s, z11.s, z0.s[3]\n"
1187       "addvl x12, x12, #1\n"
1188       "fmla z25.s, z11.s, z1.s[3]\n"
1189       "fmla z26.s, z11.s, z2.s[3]\n"
1190       "fmla z27.s, z11.s, z3.s[3]\n"
1191       "fmla z28.s, z11.s, z4.s[3]\n"
1192       "fmla z29.s, z11.s, z5.s[3]\n"
1193       "fmla z30.s, z11.s, z6.s[3]\n"
1194       "89:"  // Height 7: Multiply loop: multiply skip
1195       "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
1196       "add x9, x9, #0x1\n"
1197       "cmp x9, x19\n"
1198       "bne 84b\n"
1199       "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
1200       "add x26, x10, x19, LSL #2\n"
1201       "add x25, x26, x19, LSL #2\n"
1202       "add x24, x25, x19, LSL #2\n"
1203       "add x23, x24, x19, LSL #2\n"
1204       "add x22, x23, x19, LSL #2\n"
1205       "add x21, x22, x19, LSL #2\n"
1206       "tbz %x[flags], #1, 90f\n"
1207       "add x19, %x[args_ptr], %[offset_min]\n"
1208       "ld1rw { z17.s }, p2/Z, [x19]\n"
1209       "add x19, %x[args_ptr], %[offset_max]\n"
1210       "ld1rw { z16.s }, p2/Z, [x19]\n"
1211       "fmin z24.s, p2/M, z24.s, z16.s\n"
1212       "fmin z25.s, p2/M, z25.s, z16.s\n"
1213       "fmin z26.s, p2/M, z26.s, z16.s\n"
1214       "fmin z27.s, p2/M, z27.s, z16.s\n"
1215       "fmin z28.s, p2/M, z28.s, z16.s\n"
1216       "fmax z24.s, p2/M, z24.s, z17.s\n"
1217       "fmax z25.s, p2/M, z25.s, z17.s\n"
1218       "fmax z26.s, p2/M, z26.s, z17.s\n"
1219       "fmax z27.s, p2/M, z27.s, z17.s\n"
1220       "fmax z28.s, p2/M, z28.s, z17.s\n"
1221       "fmin z29.s, p2/M, z29.s, z16.s\n"
1222       "fmin z30.s, p2/M, z30.s, z16.s\n"
1223       "fmax z29.s, p2/M, z29.s, z17.s\n"
1224       "fmax z30.s, p2/M, z30.s, z17.s\n"
1225       "90:"  // Height 7: No activation
1226       "st1w { z24.s }, p1, [x10]\n"
1227       "addvl x10, x10, #1\n"
1228       "st1w { z25.s }, p1, [x26]\n"
1229       "st1w { z26.s }, p1, [x25]\n"
1230       "st1w { z27.s }, p1, [x24]\n"
1231       "st1w { z28.s }, p1, [x23]\n"
1232       "st1w { z29.s }, p1, [x22]\n"
1233       "st1w { z30.s }, p1, [x21]\n"
1234       "91:"  // Height 7: Writeback done
1235       "decw x13\n"
1236       "cmp x13, XZR\n"
1237       "bgt 80b\n"
1238       "b 106f\n"
1239       "92:"  // Height 8
1240       "ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
1241       "mov x11, %x[bias]\n"
1242       "ldr x12, [%x[args_ptr], %[offsetof_B_ptr]]\n"
1243       "mov x10, %x[output_ptr]\n"
1244       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
1245       "mov x19, #0x20\n"
1246       "madd %x[output_ptr], x20, x19, %x[output_ptr]\n"
1247       "93:"  // Height 8: Column loop
1248       "mov x19, #0x0\n"
1249       "whilelt p1.s, x19, x13\n"
1250       "cbz x11, 94f\n"
1251       "ld1w { z24.s }, p2/Z, [x11]\n"
1252       "mov z25.d, z24.d\n"
1253       "addvl x11, x11, #1\n"
1254       "mov z26.d, z24.d\n"
1255       "mov z27.d, z24.d\n"
1256       "mov z28.d, z24.d\n"
1257       "mov z29.d, z24.d\n"
1258       "mov z30.d, z24.d\n"
1259       "mov z31.d, z24.d\n"
1260       "b 96f\n"
1261       "94:"  // Height 8: no bias
1262       "tbz %x[flags], #0, 95f\n"
1263       "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
1264       "ld1w { z24.s }, p1/Z, [x10]\n"
1265       "add x26, x10, x19, LSL #2\n"
1266       "ld1w { z25.s }, p1/Z, [x26]\n"
1267       "add x25, x26, x19, LSL #2\n"
1268       "ld1w { z26.s }, p1/Z, [x25]\n"
1269       "add x24, x25, x19, LSL #2\n"
1270       "ld1w { z27.s }, p1/Z, [x24]\n"
1271       "add x23, x24, x19, LSL #2\n"
1272       "ld1w { z28.s }, p1/Z, [x23]\n"
1273       "add x22, x23, x19, LSL #2\n"
1274       "ld1w { z29.s }, p1/Z, [x22]\n"
1275       "add x21, x22, x19, LSL #2\n"
1276       "ld1w { z30.s }, p1/Z, [x21]\n"
1277       "add x20, x21, x19, LSL #2\n"
1278       "ld1w { z31.s }, p1/Z, [x20]\n"
1279       "b 96f\n"
1280       "95:"  // Height 8: no accumulate
1281       "mov z24.b, #0x0\n"
1282       "mov z25.b, #0x0\n"
1283       "mov z26.b, #0x0\n"
1284       "mov z27.b, #0x0\n"
1285       "mov z28.b, #0x0\n"
1286       "mov z29.b, #0x0\n"
1287       "mov z30.b, #0x0\n"
1288       "mov z31.b, #0x0\n"
1289       "96:"  // Height 8: setup done
1290       "mov x9, #0x0\n"
1291       "97:"  // Height 8: String loop
1292       "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
1293       "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
1294       "ldr w28, [x20, x9, LSL #0x2]\n"
1295       "tbz %x[flags], #3, 98f\n"
1296       "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n"
1297       "add x20, x20, x19, LSL #3\n"
1298       "ldr x27, [x20, #0x0]\n"
1299       "ldr x26, [x20, #0x8]\n"
1300       "ldr x25, [x20, #0x10]\n"
1301       "ldr x24, [x20, #0x18]\n"
1302       "ldr x23, [x20, #0x20]\n"
1303       "ldr x22, [x20, #0x28]\n"
1304       "ldr x21, [x20, #0x30]\n"
1305       "ldr x20, [x20, #0x38]\n"
1306       "cbnz x9, 99f\n"
1307       "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
1308       "add x27, x27, x19, LSL #2\n"
1309       "add x26, x26, x19, LSL #2\n"
1310       "add x25, x25, x19, LSL #2\n"
1311       "add x24, x24, x19, LSL #2\n"
1312       "add x23, x23, x19, LSL #2\n"
1313       "add x22, x22, x19, LSL #2\n"
1314       "add x21, x21, x19, LSL #2\n"
1315       "add x20, x20, x19, LSL #2\n"
1316       "b 99f\n"
1317       "98:"  // Height 8: setup direct input
1318       "mov x27, %x[input_ptr]\n"
1319       "add x26, x27, x19, LSL #2\n"
1320       "add x25, x26, x19, LSL #2\n"
1321       "add x24, x25, x19, LSL #2\n"
1322       "add x23, x24, x19, LSL #2\n"
1323       "add x22, x23, x19, LSL #2\n"
1324       "add x21, x22, x19, LSL #2\n"
1325       "add x20, x21, x19, LSL #2\n"
1326       "99:"  // Height 8: input setup done
1327       "cmp x28, #0x4\n"
1328       "ble 101f\n"
1329       "100:"  // Height 8: Multiply loop: Main loop head
1330       "ld1w { z8.s }, p2/Z, [x12]\n"
1331       "whilelt p0.s, XZR, x28\n"
1332       "ld1w { z9.s }, p2/Z, [x12, #1, MUL VL]\n"
1333       "sub x28, x28, #0x4\n"
1334       "ld1rqw { z0.s }, p0/Z, [x27]\n"
1335       "fmla z24.s, z8.s, z0.s[0]\n"
1336       "ld1rqw { z1.s }, p0/Z, [x26]\n"
1337       "cmp x28, #0x4\n"
1338       "fmla z25.s, z8.s, z1.s[0]\n"
1339       "ld1rqw { z2.s }, p0/Z, [x25]\n"
1340       "add x27, x27, #0x10\n"
1341       "fmla z24.s, z9.s, z0.s[1]\n"
1342       "ld1rqw { z3.s }, p0/Z, [x24]\n"
1343       "add x26, x26, #0x10\n"
1344       "fmla z26.s, z8.s, z2.s[0]\n"
1345       "ld1rqw { z4.s }, p0/Z, [x23]\n"
1346       "add x25, x25, #0x10\n"
1347       "fmla z27.s, z8.s, z3.s[0]\n"
1348       "ld1rqw { z5.s }, p0/Z, [x22]\n"
1349       "add x24, x24, #0x10\n"
1350       "fmla z25.s, z9.s, z1.s[1]\n"
1351       "ld1rqw { z6.s }, p0/Z, [x21]\n"
1352       "add x23, x23, #0x10\n"
1353       "fmla z28.s, z8.s, z4.s[0]\n"
1354       "ld1rqw { z7.s }, p0/Z, [x20]\n"
1355       "add x22, x22, #0x10\n"
1356       "fmla z29.s, z8.s, z5.s[0]\n"
1357       "ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n"
1358       "add x21, x21, #0x10\n"
1359       "fmla z30.s, z8.s, z6.s[0]\n"
1360       "ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n"
1361       "add x20, x20, #0x10\n"
1362       "fmla z31.s, z8.s, z7.s[0]\n"
1363       "addvl x12, x12, #4\n"
1364       "fmla z26.s, z9.s, z2.s[1]\n"
1365       "fmla z27.s, z9.s, z3.s[1]\n"
1366       "fmla z28.s, z9.s, z4.s[1]\n"
1367       "fmla z29.s, z9.s, z5.s[1]\n"
1368       "fmla z30.s, z9.s, z6.s[1]\n"
1369       "fmla z31.s, z9.s, z7.s[1]\n"
1370       "fmla z24.s, z10.s, z0.s[2]\n"
1371       "fmla z25.s, z10.s, z1.s[2]\n"
1372       "fmla z26.s, z10.s, z2.s[2]\n"
1373       "fmla z27.s, z10.s, z3.s[2]\n"
1374       "fmla z28.s, z10.s, z4.s[2]\n"
1375       "fmla z29.s, z10.s, z5.s[2]\n"
1376       "fmla z30.s, z10.s, z6.s[2]\n"
1377       "fmla z31.s, z10.s, z7.s[2]\n"
1378       "fmla z24.s, z11.s, z0.s[3]\n"
1379       "fmla z25.s, z11.s, z1.s[3]\n"
1380       "fmla z26.s, z11.s, z2.s[3]\n"
1381       "fmla z27.s, z11.s, z3.s[3]\n"
1382       "fmla z28.s, z11.s, z4.s[3]\n"
1383       "fmla z29.s, z11.s, z5.s[3]\n"
1384       "fmla z30.s, z11.s, z6.s[3]\n"
1385       "fmla z31.s, z11.s, z7.s[3]\n"
1386       "bgt 100b\n"
1387       "101:"  // Height 8: Multiply loop: Single iteration only
1388       "ld1w { z8.s }, p2/Z, [x12]\n"
1389       "whilelt p0.s, XZR, x28\n"
1390       "subs x28, x28, #0x1\n"
1391       "ld1rqw { z0.s }, p0/Z, [x27]\n"
1392       "fmla z24.s, z8.s, z0.s[0]\n"
1393       "ld1rqw { z1.s }, p0/Z, [x26]\n"
1394       "addvl x12, x12, #1\n"
1395       "fmla z25.s, z8.s, z1.s[0]\n"
1396       "ld1rqw { z2.s }, p0/Z, [x25]\n"
1397       "ld1rqw { z3.s }, p0/Z, [x24]\n"
1398       "fmla z26.s, z8.s, z2.s[0]\n"
1399       "ld1rqw { z4.s }, p0/Z, [x23]\n"
1400       "fmla z27.s, z8.s, z3.s[0]\n"
1401       "ld1rqw { z5.s }, p0/Z, [x22]\n"
1402       "ld1rqw { z6.s }, p0/Z, [x21]\n"
1403       "fmla z28.s, z8.s, z4.s[0]\n"
1404       "ld1rqw { z7.s }, p0/Z, [x20]\n"
1405       "fmla z29.s, z8.s, z5.s[0]\n"
1406       "fmla z30.s, z8.s, z6.s[0]\n"
1407       "fmla z31.s, z8.s, z7.s[0]\n"
1408       "ble 102f\n"
1409       "ld1w { z9.s }, p2/Z, [x12]\n"
1410       "fmla z24.s, z9.s, z0.s[1]\n"
1411       "subs x28, x28, #0x1\n"
1412       "fmla z25.s, z9.s, z1.s[1]\n"
1413       "addvl x12, x12, #1\n"
1414       "fmla z26.s, z9.s, z2.s[1]\n"
1415       "fmla z27.s, z9.s, z3.s[1]\n"
1416       "fmla z28.s, z9.s, z4.s[1]\n"
1417       "fmla z29.s, z9.s, z5.s[1]\n"
1418       "fmla z30.s, z9.s, z6.s[1]\n"
1419       "fmla z31.s, z9.s, z7.s[1]\n"
1420       "ble 102f\n"
1421       "ld1w { z10.s }, p2/Z, [x12]\n"
1422       "fmla z24.s, z10.s, z0.s[2]\n"
1423       "subs x28, x28, #0x1\n"
1424       "fmla z25.s, z10.s, z1.s[2]\n"
1425       "addvl x12, x12, #1\n"
1426       "fmla z26.s, z10.s, z2.s[2]\n"
1427       "fmla z27.s, z10.s, z3.s[2]\n"
1428       "fmla z28.s, z10.s, z4.s[2]\n"
1429       "fmla z29.s, z10.s, z5.s[2]\n"
1430       "fmla z30.s, z10.s, z6.s[2]\n"
1431       "fmla z31.s, z10.s, z7.s[2]\n"
1432       "ble 102f\n"
1433       "ld1w { z11.s }, p2/Z, [x12]\n"
1434       "fmla z24.s, z11.s, z0.s[3]\n"
1435       "addvl x12, x12, #1\n"
1436       "fmla z25.s, z11.s, z1.s[3]\n"
1437       "fmla z26.s, z11.s, z2.s[3]\n"
1438       "fmla z27.s, z11.s, z3.s[3]\n"
1439       "fmla z28.s, z11.s, z4.s[3]\n"
1440       "fmla z29.s, z11.s, z5.s[3]\n"
1441       "fmla z30.s, z11.s, z6.s[3]\n"
1442       "fmla z31.s, z11.s, z7.s[3]\n"
1443       "102:"  // Height 8: Multiply loop: multiply skip
1444       "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
1445       "add x9, x9, #0x1\n"
1446       "cmp x9, x19\n"
1447       "bne 97b\n"
1448       "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
1449       "add x26, x10, x19, LSL #2\n"
1450       "add x25, x26, x19, LSL #2\n"
1451       "add x24, x25, x19, LSL #2\n"
1452       "add x23, x24, x19, LSL #2\n"
1453       "add x22, x23, x19, LSL #2\n"
1454       "add x21, x22, x19, LSL #2\n"
1455       "add x20, x21, x19, LSL #2\n"
1456       "tbz %x[flags], #1, 103f\n"
1457       "add x19, %x[args_ptr], %[offset_min]\n"
1458       "ld1rw { z17.s }, p2/Z, [x19]\n"
1459       "add x19, %x[args_ptr], %[offset_max]\n"
1460       "ld1rw { z16.s }, p2/Z, [x19]\n"
1461       "fmin z24.s, p2/M, z24.s, z16.s\n"
1462       "fmin z25.s, p2/M, z25.s, z16.s\n"
1463       "fmin z26.s, p2/M, z26.s, z16.s\n"
1464       "fmin z27.s, p2/M, z27.s, z16.s\n"
1465       "fmin z28.s, p2/M, z28.s, z16.s\n"
1466       "fmax z24.s, p2/M, z24.s, z17.s\n"
1467       "fmax z25.s, p2/M, z25.s, z17.s\n"
1468       "fmax z26.s, p2/M, z26.s, z17.s\n"
1469       "fmax z27.s, p2/M, z27.s, z17.s\n"
1470       "fmax z28.s, p2/M, z28.s, z17.s\n"
1471       "fmin z29.s, p2/M, z29.s, z16.s\n"
1472       "fmin z30.s, p2/M, z30.s, z16.s\n"
1473       "fmin z31.s, p2/M, z31.s, z16.s\n"
1474       "fmax z29.s, p2/M, z29.s, z17.s\n"
1475       "fmax z30.s, p2/M, z30.s, z17.s\n"
1476       "fmax z31.s, p2/M, z31.s, z17.s\n"
1477       "103:"  // Height 8: No activation
1478       "st1w { z24.s }, p1, [x10]\n"
1479       "addvl x10, x10, #1\n"
1480       "st1w { z25.s }, p1, [x26]\n"
1481       "st1w { z26.s }, p1, [x25]\n"
1482       "st1w { z27.s }, p1, [x24]\n"
1483       "st1w { z28.s }, p1, [x23]\n"
1484       "st1w { z29.s }, p1, [x22]\n"
1485       "st1w { z30.s }, p1, [x21]\n"
1486       "st1w { z31.s }, p1, [x20]\n"
1487       "104:"  // Height 8: Writeback done
1488       "decw x13\n"
1489       "cmp x13, XZR\n"
1490       "bgt 93b\n"
1491       "subs %x[M], %x[M], #0x8\n"
1492       "beq 106f\n"
1493       "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
1494       "tbz %x[flags], #3, 105f\n"
1495       "add x20, x20, #0x8\n"
1496       "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
1497       "b 1b\n"
1498       "105:"  // Update direct input
1499       "mov x19, #0x20\n"
1500       "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
1501       "b 1b\n"
1502       "106:"  // Exit
1503 
1504       : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
1505       : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
1506       : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z16", "z17", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
1507     );
1508 }
1509 
1510 } // namespace arm_gemm
1511 #endif // ARM_COMPUTE_ENABLE_SVE
1512