• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2021 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22  * IN THE SOFTWARE.
23  */
24 #ifdef ARM_COMPUTE_ENABLE_SVE
25 
26 #include "arm_gemm.hpp"
27 #include "../../utils.hpp"
28 
29 #include <cassert>
30 #include <limits>
31 
32 namespace arm_gemm {
33 
sve_hybrid_fp32_mla_8x1VL_a64fx(unsigned int num_strings,const unsigned int * string_lengths,IndirectInputArg<float> A_arg,size_t M,size_t N,const float * B_ptr,IndirectOutputArg<float> output_arg,const float * bias,Activation act,bool accumulate)34 void sve_hybrid_fp32_mla_8x1VL_a64fx (
35     unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<float> A_arg,
36     size_t M, size_t N, const float *B_ptr, IndirectOutputArg<float> output_arg,
37     const float *bias, Activation act, bool accumulate
38 )
39 {
40     struct KernelArgs { // Scalar arguments the asm loads relative to %x[args_ptr]; NOTE(review): the operand list binding args_ptr to &ka is outside this excerpt - confirm.
41         float maxval = static_cast<float>(std::numeric_limits<float>::infinity()); // Upper clamp; stays +inf unless BoundedReLU overwrites it with act.param1.
42         float minval = - static_cast<float>(std::numeric_limits<float>::infinity()); // Lower clamp; stays -inf unless ReLU/BoundedReLU set it to 0.
43         unsigned int num_strings = {}; // Copy of the num_strings parameter; the asm compares its string counter (x9) against this to end each string loop.
44         const unsigned int *string_lengths = {}; // Copy of the string_lengths parameter; the asm indexes it by the string counter to get the multiply-loop trip count.
45         size_t N = {}; // Copy of the N parameter; loaded as the column-loop counter and reduced with decw after each column block.
46         const float *B_ptr = {}; // Copy of the B_ptr parameter; loaded once per height case and advanced with addvl after every B vector read.
47         size_t output_offset = {}; // output_arg.indirect.offset or output_arg.direct.stride; the asm scales it by 4 (LSL #2) to step between output rows.
48         size_t input_initial_col = {}; // A_arg.indirect.start_col when A is indirect, else left 0; added (scaled by 4) to the row pointers for the first string only.
49         size_t input_offset = {}; // A_arg.indirect.start_row (scaled by 8 into the pointer table) or A_arg.direct.stride (scaled by 4 between rows).
50     } ka;
51 
52     unsigned long flags=0;
53     void *output_ptr;
54     void *input_ptr;
55 
56     if (output_arg.is_indirect) {
57         output_ptr=(void *)(output_arg.indirect.ptr);
58         ka.output_offset=output_arg.indirect.offset;
59         flags |= 0x4;
60     } else {
61         output_ptr=(void *)(output_arg.direct.base);
62         ka.output_offset=output_arg.direct.stride;
63     }
64 
65     if (A_arg.is_indirect) {
66         input_ptr=(void *)(A_arg.indirect.ptr);
67         ka.input_offset=A_arg.indirect.start_row;
68         ka.input_initial_col=A_arg.indirect.start_col;
69         flags |= 0x8;
70     } else {
71         assert(num_strings==1);
72         input_ptr=(void *)(A_arg.direct.base);
73         ka.input_offset=A_arg.direct.stride;
74     }
75     if (accumulate) {
76         flags |= 0x1;
77     }
78     ka.num_strings = num_strings;
79     ka.string_lengths = string_lengths;
80     ka.N = N;
81     ka.B_ptr = B_ptr;
82     switch(act.type) {
83         default:
84         case Activation::Type::None:
85             break;
86         case Activation::Type::BoundedReLU:
87             ka.maxval = static_cast<float>(act.param1);
88             /* fall through */
89         case Activation::Type::ReLU:
90             ka.minval = 0;
91             flags |= 0x2;
92             break;
93     }
94     __asm__ __volatile__(
95       "ptrue p1.b\n"
96       "1:"  // Row loop
97       "cmp %x[M], #0x8\n"
98       "bge 85f\n"
99       "cmp %x[M], #0x6\n"
100       "bgt 73f\n"
101       "beq 61f\n"
102       "cmp %x[M], #0x4\n"
103       "bgt 49f\n"
104       "beq 37f\n"
105       "cmp %x[M], #0x2\n"
106       "bgt 25f\n"
107       "beq 13f\n"
108       "mov x13, %x[bias]\n"
109       "ldr x12, [%x[args_ptr], %[offsetof_N]]\n"
110       "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n"
111       "mov x10, %x[output_ptr]\n"
112       "2:"  // Height 1: Column loop
113       "mov x19, #0x0\n"
114       "whilelt p0.s, x19, x12\n"
115       "cbz x13, 3f\n"
116       "ld1w { z24.s }, p1/Z, [x13]\n"
117       "addvl x13, x13, #1\n"
118       "b 5f\n"
119       "3:"  // Height 1: no bias
120       "tbz %x[flags], #0, 4f\n"
121       "ld1w { z24.s }, p0/Z, [x10]\n"
122       "b 5f\n"
123       "4:"  // Height 1: no accumulate
124       "mov z24.b, #0x0\n"
125       "5:"  // Height 1: setup done
126       "mov x9, #0x0\n"
127       "6:"  // Height 1: String loop
128       "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
129       "ldr w28, [x19, x9, LSL #0x2]\n"
130       "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
131       "tbz %x[flags], #3, 7f\n"
132       "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n"
133       "add x20, x20, x19, LSL #3\n"
134       "ldr x27, [x20, #0x0]\n"
135       "cbnz x9, 8f\n"
136       "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
137       "add x27, x27, x19, LSL #2\n"
138       "b 8f\n"
139       "7:"  // Height 1: setup direct input
140       "mov x27, %x[input_ptr]\n"
141       "8:"  // Height 1: input setup done
142       "subs x28, x28, #0x1\n"
143       "ld1rw { z0.s }, p1/Z, [x27]\n"
144       "ble 10f\n"
145       "9:"  // Height 1: Multiply loop: Main loop
146       "ld1w { z8.s }, p1/Z, [x11]\n"
147       "add x27, x27, #0x4\n"
148       "subs x28, x28, #0x1\n"
149       "fmla z24.s, p1/M, z8.s, z0.s\n"
150       "addvl x11, x11, #1\n"
151       "ld1rw { z0.s }, p1/Z, [x27]\n"
152       "bgt 9b\n"
153       "10:"  // Height 1: Multiply loop: Main loop skip
154       "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
155       "ld1w { z9.s }, p1/Z, [x11]\n"
156       "add x9, x9, #0x1\n"
157       "cmp x9, x19\n"
158       "fmla z24.s, p1/M, z9.s, z0.s\n"
159       "addvl x11, x11, #1\n"
160       "bne 6b\n"
161       "tbz %x[flags], #1, 11f\n"
162       "add x19, %x[args_ptr], %[offset_max]\n"
163       "ld1rw { z17.s }, p1/Z, [x19]\n"
164       "add x19, %x[args_ptr], %[offset_min]\n"
165       "ld1rw { z16.s }, p1/Z, [x19]\n"
166       "fmin z24.s, p1/M, z24.s, z17.s\n"
167       "fmax z24.s, p1/M, z24.s, z16.s\n"
168       "11:"  // Height 1: No activation
169       "st1w { z24.s }, p0, [x10]\n"
170       "addvl x10, x10, #1\n"
171       "12:"  // Height 1: Writeback done
172       "decw x12\n"
173       "cmp x12, XZR\n"
174       "bgt 2b\n"
175       "b 98f\n"
176       "13:"  // Height 2
177       "mov x13, %x[bias]\n"
178       "ldr x12, [%x[args_ptr], %[offsetof_N]]\n"
179       "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n"
180       "mov x10, %x[output_ptr]\n"
181       "14:"  // Height 2: Column loop
182       "mov x19, #0x0\n"
183       "whilelt p0.s, x19, x12\n"
184       "cbz x13, 15f\n"
185       "ld1w { z24.s }, p1/Z, [x13]\n"
186       "mov z25.d, z24.d\n"
187       "addvl x13, x13, #1\n"
188       "b 17f\n"
189       "15:"  // Height 2: no bias
190       "tbz %x[flags], #0, 16f\n"
191       "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
192       "add x26, x10, x19, LSL #2\n"
193       "ld1w { z24.s }, p0/Z, [x10]\n"
194       "ld1w { z25.s }, p0/Z, [x26]\n"
195       "b 17f\n"
196       "16:"  // Height 2: no accumulate
197       "mov z24.b, #0x0\n"
198       "mov z25.b, #0x0\n"
199       "17:"  // Height 2: setup done
200       "mov x9, #0x0\n"
201       "18:"  // Height 2: String loop
202       "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
203       "ldr w28, [x19, x9, LSL #0x2]\n"
204       "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
205       "tbz %x[flags], #3, 19f\n"
206       "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n"
207       "add x20, x20, x19, LSL #3\n"
208       "ldr x27, [x20, #0x0]\n"
209       "ldr x26, [x20, #0x8]\n"
210       "cbnz x9, 20f\n"
211       "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
212       "add x27, x27, x19, LSL #2\n"
213       "add x26, x26, x19, LSL #2\n"
214       "b 20f\n"
215       "19:"  // Height 2: setup direct input
216       "mov x27, %x[input_ptr]\n"
217       "add x26, x27, x19, LSL #2\n"
218       "20:"  // Height 2: input setup done
219       "subs x28, x28, #0x1\n"
220       "ld1rw { z0.s }, p1/Z, [x27]\n"
221       "ld1rw { z1.s }, p1/Z, [x26]\n"
222       "ble 22f\n"
223       "21:"  // Height 2: Multiply loop: Main loop
224       "ld1w { z8.s }, p1/Z, [x11]\n"
225       "add x27, x27, #0x4\n"
226       "subs x28, x28, #0x1\n"
227       "fmla z24.s, p1/M, z8.s, z0.s\n"
228       "add x26, x26, #0x4\n"
229       "fmla z25.s, p1/M, z8.s, z1.s\n"
230       "addvl x11, x11, #1\n"
231       "ld1rw { z0.s }, p1/Z, [x27]\n"
232       "ld1rw { z1.s }, p1/Z, [x26]\n"
233       "bgt 21b\n"
234       "22:"  // Height 2: Multiply loop: Main loop skip
235       "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
236       "ld1w { z9.s }, p1/Z, [x11]\n"
237       "add x9, x9, #0x1\n"
238       "cmp x9, x19\n"
239       "fmla z24.s, p1/M, z9.s, z0.s\n"
240       "fmla z25.s, p1/M, z9.s, z1.s\n"
241       "addvl x11, x11, #1\n"
242       "bne 18b\n"
243       "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
244       "add x26, x10, x19, LSL #2\n"
245       "tbz %x[flags], #1, 23f\n"
246       "add x19, %x[args_ptr], %[offset_max]\n"
247       "ld1rw { z17.s }, p1/Z, [x19]\n"
248       "add x19, %x[args_ptr], %[offset_min]\n"
249       "ld1rw { z16.s }, p1/Z, [x19]\n"
250       "fmin z24.s, p1/M, z24.s, z17.s\n"
251       "fmin z25.s, p1/M, z25.s, z17.s\n"
252       "fmax z24.s, p1/M, z24.s, z16.s\n"
253       "fmax z25.s, p1/M, z25.s, z16.s\n"
254       "23:"  // Height 2: No activation
255       "st1w { z24.s }, p0, [x10]\n"
256       "addvl x10, x10, #1\n"
257       "st1w { z25.s }, p0, [x26]\n"
258       "24:"  // Height 2: Writeback done
259       "decw x12\n"
260       "cmp x12, XZR\n"
261       "bgt 14b\n"
262       "b 98f\n"
263       "25:"  // Height 3
264       "mov x13, %x[bias]\n"
265       "ldr x12, [%x[args_ptr], %[offsetof_N]]\n"
266       "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n"
267       "mov x10, %x[output_ptr]\n"
268       "26:"  // Height 3: Column loop
269       "mov x19, #0x0\n"
270       "whilelt p0.s, x19, x12\n"
271       "cbz x13, 27f\n"
272       "ld1w { z24.s }, p1/Z, [x13]\n"
273       "mov z25.d, z24.d\n"
274       "mov z26.d, z24.d\n"
275       "addvl x13, x13, #1\n"
276       "b 29f\n"
277       "27:"  // Height 3: no bias
278       "tbz %x[flags], #0, 28f\n"
279       "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
280       "add x26, x10, x19, LSL #2\n"
281       "add x25, x26, x19, LSL #2\n"
282       "ld1w { z24.s }, p0/Z, [x10]\n"
283       "ld1w { z25.s }, p0/Z, [x26]\n"
284       "ld1w { z26.s }, p0/Z, [x25]\n"
285       "b 29f\n"
286       "28:"  // Height 3: no accumulate
287       "mov z24.b, #0x0\n"
288       "mov z25.b, #0x0\n"
289       "mov z26.b, #0x0\n"
290       "29:"  // Height 3: setup done
291       "mov x9, #0x0\n"
292       "30:"  // Height 3: String loop
293       "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
294       "ldr w28, [x19, x9, LSL #0x2]\n"
295       "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
296       "tbz %x[flags], #3, 31f\n"
297       "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n"
298       "add x20, x20, x19, LSL #3\n"
299       "ldr x27, [x20, #0x0]\n"
300       "ldr x26, [x20, #0x8]\n"
301       "ldr x25, [x20, #0x10]\n"
302       "cbnz x9, 32f\n"
303       "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
304       "add x27, x27, x19, LSL #2\n"
305       "add x26, x26, x19, LSL #2\n"
306       "add x25, x25, x19, LSL #2\n"
307       "b 32f\n"
308       "31:"  // Height 3: setup direct input
309       "mov x27, %x[input_ptr]\n"
310       "add x26, x27, x19, LSL #2\n"
311       "add x25, x26, x19, LSL #2\n"
312       "32:"  // Height 3: input setup done
313       "subs x28, x28, #0x1\n"
314       "ld1rw { z0.s }, p1/Z, [x27]\n"
315       "ld1rw { z1.s }, p1/Z, [x26]\n"
316       "ld1rw { z2.s }, p1/Z, [x25]\n"
317       "ble 34f\n"
318       "33:"  // Height 3: Multiply loop: Main loop
319       "ld1w { z8.s }, p1/Z, [x11]\n"
320       "add x27, x27, #0x4\n"
321       "subs x28, x28, #0x1\n"
322       "fmla z24.s, p1/M, z8.s, z0.s\n"
323       "add x26, x26, #0x4\n"
324       "add x25, x25, #0x4\n"
325       "fmla z25.s, p1/M, z8.s, z1.s\n"
326       "fmla z26.s, p1/M, z8.s, z2.s\n"
327       "addvl x11, x11, #1\n"
328       "ld1rw { z0.s }, p1/Z, [x27]\n"
329       "ld1rw { z1.s }, p1/Z, [x26]\n"
330       "ld1rw { z2.s }, p1/Z, [x25]\n"
331       "bgt 33b\n"
332       "34:"  // Height 3: Multiply loop: Main loop skip
333       "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
334       "ld1w { z9.s }, p1/Z, [x11]\n"
335       "add x9, x9, #0x1\n"
336       "cmp x9, x19\n"
337       "fmla z24.s, p1/M, z9.s, z0.s\n"
338       "fmla z25.s, p1/M, z9.s, z1.s\n"
339       "addvl x11, x11, #1\n"
340       "fmla z26.s, p1/M, z9.s, z2.s\n"
341       "bne 30b\n"
342       "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
343       "add x26, x10, x19, LSL #2\n"
344       "add x25, x26, x19, LSL #2\n"
345       "tbz %x[flags], #1, 35f\n"
346       "add x19, %x[args_ptr], %[offset_max]\n"
347       "ld1rw { z17.s }, p1/Z, [x19]\n"
348       "add x19, %x[args_ptr], %[offset_min]\n"
349       "ld1rw { z16.s }, p1/Z, [x19]\n"
350       "fmin z24.s, p1/M, z24.s, z17.s\n"
351       "fmin z25.s, p1/M, z25.s, z17.s\n"
352       "fmin z26.s, p1/M, z26.s, z17.s\n"
353       "fmax z24.s, p1/M, z24.s, z16.s\n"
354       "fmax z25.s, p1/M, z25.s, z16.s\n"
355       "fmax z26.s, p1/M, z26.s, z16.s\n"
356       "35:"  // Height 3: No activation
357       "st1w { z24.s }, p0, [x10]\n"
358       "addvl x10, x10, #1\n"
359       "st1w { z25.s }, p0, [x26]\n"
360       "st1w { z26.s }, p0, [x25]\n"
361       "36:"  // Height 3: Writeback done
362       "decw x12\n"
363       "cmp x12, XZR\n"
364       "bgt 26b\n"
365       "b 98f\n"
366       "37:"  // Height 4
367       "mov x13, %x[bias]\n"
368       "ldr x12, [%x[args_ptr], %[offsetof_N]]\n"
369       "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n"
370       "mov x10, %x[output_ptr]\n"
371       "38:"  // Height 4: Column loop
372       "mov x19, #0x0\n"
373       "whilelt p0.s, x19, x12\n"
374       "cbz x13, 39f\n"
375       "ld1w { z24.s }, p1/Z, [x13]\n"
376       "mov z25.d, z24.d\n"
377       "mov z26.d, z24.d\n"
378       "addvl x13, x13, #1\n"
379       "mov z27.d, z24.d\n"
380       "b 41f\n"
381       "39:"  // Height 4: no bias
382       "tbz %x[flags], #0, 40f\n"
383       "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
384       "add x26, x10, x19, LSL #2\n"
385       "add x25, x26, x19, LSL #2\n"
386       "add x24, x25, x19, LSL #2\n"
387       "ld1w { z24.s }, p0/Z, [x10]\n"
388       "ld1w { z25.s }, p0/Z, [x26]\n"
389       "ld1w { z26.s }, p0/Z, [x25]\n"
390       "ld1w { z27.s }, p0/Z, [x24]\n"
391       "b 41f\n"
392       "40:"  // Height 4: no accumulate
393       "mov z24.b, #0x0\n"
394       "mov z25.b, #0x0\n"
395       "mov z26.b, #0x0\n"
396       "mov z27.b, #0x0\n"
397       "41:"  // Height 4: setup done
398       "mov x9, #0x0\n"
399       "42:"  // Height 4: String loop
400       "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
401       "ldr w28, [x19, x9, LSL #0x2]\n"
402       "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
403       "tbz %x[flags], #3, 43f\n"
404       "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n"
405       "add x20, x20, x19, LSL #3\n"
406       "ldr x27, [x20, #0x0]\n"
407       "ldr x26, [x20, #0x8]\n"
408       "ldr x25, [x20, #0x10]\n"
409       "ldr x24, [x20, #0x18]\n"
410       "cbnz x9, 44f\n"
411       "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
412       "add x27, x27, x19, LSL #2\n"
413       "add x26, x26, x19, LSL #2\n"
414       "add x25, x25, x19, LSL #2\n"
415       "add x24, x24, x19, LSL #2\n"
416       "b 44f\n"
417       "43:"  // Height 4: setup direct input
418       "mov x27, %x[input_ptr]\n"
419       "add x26, x27, x19, LSL #2\n"
420       "add x25, x26, x19, LSL #2\n"
421       "add x24, x25, x19, LSL #2\n"
422       "44:"  // Height 4: input setup done
423       "subs x28, x28, #0x1\n"
424       "ld1rw { z0.s }, p1/Z, [x27]\n"
425       "ld1rw { z1.s }, p1/Z, [x26]\n"
426       "ld1rw { z2.s }, p1/Z, [x25]\n"
427       "ld1rw { z3.s }, p1/Z, [x24]\n"
428       "ble 46f\n"
429       "45:"  // Height 4: Multiply loop: Main loop
430       "ld1w { z8.s }, p1/Z, [x11]\n"
431       "add x27, x27, #0x4\n"
432       "subs x28, x28, #0x1\n"
433       "fmla z24.s, p1/M, z8.s, z0.s\n"
434       "add x26, x26, #0x4\n"
435       "add x25, x25, #0x4\n"
436       "fmla z25.s, p1/M, z8.s, z1.s\n"
437       "fmla z26.s, p1/M, z8.s, z2.s\n"
438       "add x24, x24, #0x4\n"
439       "fmla z27.s, p1/M, z8.s, z3.s\n"
440       "addvl x11, x11, #1\n"
441       "ld1rw { z0.s }, p1/Z, [x27]\n"
442       "ld1rw { z1.s }, p1/Z, [x26]\n"
443       "ld1rw { z2.s }, p1/Z, [x25]\n"
444       "ld1rw { z3.s }, p1/Z, [x24]\n"
445       "bgt 45b\n"
446       "46:"  // Height 4: Multiply loop: Main loop skip
447       "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
448       "ld1w { z9.s }, p1/Z, [x11]\n"
449       "add x9, x9, #0x1\n"
450       "cmp x9, x19\n"
451       "fmla z24.s, p1/M, z9.s, z0.s\n"
452       "fmla z25.s, p1/M, z9.s, z1.s\n"
453       "addvl x11, x11, #1\n"
454       "fmla z26.s, p1/M, z9.s, z2.s\n"
455       "fmla z27.s, p1/M, z9.s, z3.s\n"
456       "bne 42b\n"
457       "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
458       "add x26, x10, x19, LSL #2\n"
459       "add x25, x26, x19, LSL #2\n"
460       "add x24, x25, x19, LSL #2\n"
461       "tbz %x[flags], #1, 47f\n"
462       "add x19, %x[args_ptr], %[offset_max]\n"
463       "ld1rw { z17.s }, p1/Z, [x19]\n"
464       "add x19, %x[args_ptr], %[offset_min]\n"
465       "ld1rw { z16.s }, p1/Z, [x19]\n"
466       "fmin z24.s, p1/M, z24.s, z17.s\n"
467       "fmin z25.s, p1/M, z25.s, z17.s\n"
468       "fmin z26.s, p1/M, z26.s, z17.s\n"
469       "fmin z27.s, p1/M, z27.s, z17.s\n"
470       "fmax z24.s, p1/M, z24.s, z16.s\n"
471       "fmax z25.s, p1/M, z25.s, z16.s\n"
472       "fmax z26.s, p1/M, z26.s, z16.s\n"
473       "fmax z27.s, p1/M, z27.s, z16.s\n"
474       "47:"  // Height 4: No activation
475       "st1w { z24.s }, p0, [x10]\n"
476       "addvl x10, x10, #1\n"
477       "st1w { z25.s }, p0, [x26]\n"
478       "st1w { z26.s }, p0, [x25]\n"
479       "st1w { z27.s }, p0, [x24]\n"
480       "48:"  // Height 4: Writeback done
481       "decw x12\n"
482       "cmp x12, XZR\n"
483       "bgt 38b\n"
484       "b 98f\n"
485       "49:"  // Height 5
486       "mov x13, %x[bias]\n"
487       "ldr x12, [%x[args_ptr], %[offsetof_N]]\n"
488       "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n"
489       "mov x10, %x[output_ptr]\n"
490       "50:"  // Height 5: Column loop
491       "mov x19, #0x0\n"
492       "whilelt p0.s, x19, x12\n"
493       "cbz x13, 51f\n"
494       "ld1w { z24.s }, p1/Z, [x13]\n"
495       "mov z25.d, z24.d\n"
496       "mov z26.d, z24.d\n"
497       "addvl x13, x13, #1\n"
498       "mov z27.d, z24.d\n"
499       "mov z28.d, z24.d\n"
500       "b 53f\n"
501       "51:"  // Height 5: no bias
502       "tbz %x[flags], #0, 52f\n"
503       "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
504       "add x26, x10, x19, LSL #2\n"
505       "add x25, x26, x19, LSL #2\n"
506       "add x24, x25, x19, LSL #2\n"
507       "add x23, x24, x19, LSL #2\n"
508       "ld1w { z24.s }, p0/Z, [x10]\n"
509       "ld1w { z25.s }, p0/Z, [x26]\n"
510       "ld1w { z26.s }, p0/Z, [x25]\n"
511       "ld1w { z27.s }, p0/Z, [x24]\n"
512       "ld1w { z28.s }, p0/Z, [x23]\n"
513       "b 53f\n"
514       "52:"  // Height 5: no accumulate
515       "mov z24.b, #0x0\n"
516       "mov z25.b, #0x0\n"
517       "mov z26.b, #0x0\n"
518       "mov z27.b, #0x0\n"
519       "mov z28.b, #0x0\n"
520       "53:"  // Height 5: setup done
521       "mov x9, #0x0\n"
522       "54:"  // Height 5: String loop
523       "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
524       "ldr w28, [x19, x9, LSL #0x2]\n"
525       "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
526       "tbz %x[flags], #3, 55f\n"
527       "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n"
528       "add x20, x20, x19, LSL #3\n"
529       "ldr x27, [x20, #0x0]\n"
530       "ldr x26, [x20, #0x8]\n"
531       "ldr x25, [x20, #0x10]\n"
532       "ldr x24, [x20, #0x18]\n"
533       "ldr x23, [x20, #0x20]\n"
534       "cbnz x9, 56f\n"
535       "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
536       "add x27, x27, x19, LSL #2\n"
537       "add x26, x26, x19, LSL #2\n"
538       "add x25, x25, x19, LSL #2\n"
539       "add x24, x24, x19, LSL #2\n"
540       "add x23, x23, x19, LSL #2\n"
541       "b 56f\n"
542       "55:"  // Height 5: setup direct input
543       "mov x27, %x[input_ptr]\n"
544       "add x26, x27, x19, LSL #2\n"
545       "add x25, x26, x19, LSL #2\n"
546       "add x24, x25, x19, LSL #2\n"
547       "add x23, x24, x19, LSL #2\n"
548       "56:"  // Height 5: input setup done
549       "subs x28, x28, #0x1\n"
550       "ld1rw { z0.s }, p1/Z, [x27]\n"
551       "ld1rw { z1.s }, p1/Z, [x26]\n"
552       "ld1rw { z2.s }, p1/Z, [x25]\n"
553       "ld1rw { z3.s }, p1/Z, [x24]\n"
554       "ld1rw { z4.s }, p1/Z, [x23]\n"
555       "ble 58f\n"
556       "57:"  // Height 5: Multiply loop: Main loop
557       "ld1w { z8.s }, p1/Z, [x11]\n"
558       "add x27, x27, #0x4\n"
559       "subs x28, x28, #0x1\n"
560       "fmla z24.s, p1/M, z8.s, z0.s\n"
561       "add x26, x26, #0x4\n"
562       "add x25, x25, #0x4\n"
563       "fmla z25.s, p1/M, z8.s, z1.s\n"
564       "fmla z26.s, p1/M, z8.s, z2.s\n"
565       "add x24, x24, #0x4\n"
566       "add x23, x23, #0x4\n"
567       "fmla z27.s, p1/M, z8.s, z3.s\n"
568       "ld1rw { z0.s }, p1/Z, [x27]\n"
569       "addvl x11, x11, #1\n"
570       "fmla z28.s, p1/M, z8.s, z4.s\n"
571       "ld1rw { z1.s }, p1/Z, [x26]\n"
572       "ld1rw { z2.s }, p1/Z, [x25]\n"
573       "ld1rw { z3.s }, p1/Z, [x24]\n"
574       "ld1rw { z4.s }, p1/Z, [x23]\n"
575       "bgt 57b\n"
576       "58:"  // Height 5: Multiply loop: Main loop skip
577       "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
578       "ld1w { z9.s }, p1/Z, [x11]\n"
579       "add x9, x9, #0x1\n"
580       "cmp x9, x19\n"
581       "fmla z24.s, p1/M, z9.s, z0.s\n"
582       "fmla z25.s, p1/M, z9.s, z1.s\n"
583       "addvl x11, x11, #1\n"
584       "fmla z26.s, p1/M, z9.s, z2.s\n"
585       "fmla z27.s, p1/M, z9.s, z3.s\n"
586       "fmla z28.s, p1/M, z9.s, z4.s\n"
587       "bne 54b\n"
588       "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
589       "add x26, x10, x19, LSL #2\n"
590       "add x25, x26, x19, LSL #2\n"
591       "add x24, x25, x19, LSL #2\n"
592       "add x23, x24, x19, LSL #2\n"
593       "tbz %x[flags], #1, 59f\n"
594       "add x19, %x[args_ptr], %[offset_max]\n"
595       "ld1rw { z17.s }, p1/Z, [x19]\n"
596       "add x19, %x[args_ptr], %[offset_min]\n"
597       "ld1rw { z16.s }, p1/Z, [x19]\n"
598       "fmin z24.s, p1/M, z24.s, z17.s\n"
599       "fmin z25.s, p1/M, z25.s, z17.s\n"
600       "fmin z26.s, p1/M, z26.s, z17.s\n"
601       "fmin z27.s, p1/M, z27.s, z17.s\n"
602       "fmin z28.s, p1/M, z28.s, z17.s\n"
603       "fmax z24.s, p1/M, z24.s, z16.s\n"
604       "fmax z25.s, p1/M, z25.s, z16.s\n"
605       "fmax z26.s, p1/M, z26.s, z16.s\n"
606       "fmax z27.s, p1/M, z27.s, z16.s\n"
607       "fmax z28.s, p1/M, z28.s, z16.s\n"
608       "59:"  // Height 5: No activation
609       "st1w { z24.s }, p0, [x10]\n"
610       "addvl x10, x10, #1\n"
611       "st1w { z25.s }, p0, [x26]\n"
612       "st1w { z26.s }, p0, [x25]\n"
613       "st1w { z27.s }, p0, [x24]\n"
614       "st1w { z28.s }, p0, [x23]\n"
615       "60:"  // Height 5: Writeback done
616       "decw x12\n"
617       "cmp x12, XZR\n"
618       "bgt 50b\n"
619       "b 98f\n"
620       "61:"  // Height 6
621       "mov x13, %x[bias]\n"
622       "ldr x12, [%x[args_ptr], %[offsetof_N]]\n"
623       "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n"
624       "mov x10, %x[output_ptr]\n"
625       "62:"  // Height 6: Column loop
626       "mov x19, #0x0\n"
627       "whilelt p0.s, x19, x12\n"
628       "cbz x13, 63f\n"
629       "ld1w { z24.s }, p1/Z, [x13]\n"
630       "mov z25.d, z24.d\n"
631       "mov z26.d, z24.d\n"
632       "addvl x13, x13, #1\n"
633       "mov z27.d, z24.d\n"
634       "mov z28.d, z24.d\n"
635       "mov z29.d, z24.d\n"
636       "b 65f\n"
637       "63:"  // Height 6: no bias
638       "tbz %x[flags], #0, 64f\n"
639       "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
640       "add x26, x10, x19, LSL #2\n"
641       "add x25, x26, x19, LSL #2\n"
642       "add x24, x25, x19, LSL #2\n"
643       "add x23, x24, x19, LSL #2\n"
644       "ld1w { z24.s }, p0/Z, [x10]\n"
645       "ld1w { z25.s }, p0/Z, [x26]\n"
646       "add x22, x23, x19, LSL #2\n"
647       "ld1w { z26.s }, p0/Z, [x25]\n"
648       "ld1w { z27.s }, p0/Z, [x24]\n"
649       "ld1w { z28.s }, p0/Z, [x23]\n"
650       "ld1w { z29.s }, p0/Z, [x22]\n"
651       "b 65f\n"
652       "64:"  // Height 6: no accumulate
653       "mov z24.b, #0x0\n"
654       "mov z25.b, #0x0\n"
655       "mov z26.b, #0x0\n"
656       "mov z27.b, #0x0\n"
657       "mov z28.b, #0x0\n"
658       "mov z29.b, #0x0\n"
659       "65:"  // Height 6: setup done
660       "mov x9, #0x0\n"
661       "66:"  // Height 6: String loop
662       "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
663       "ldr w28, [x19, x9, LSL #0x2]\n"
664       "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
665       "tbz %x[flags], #3, 67f\n"
666       "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n"
667       "add x20, x20, x19, LSL #3\n"
668       "ldr x27, [x20, #0x0]\n"
669       "ldr x26, [x20, #0x8]\n"
670       "ldr x25, [x20, #0x10]\n"
671       "ldr x24, [x20, #0x18]\n"
672       "ldr x23, [x20, #0x20]\n"
673       "ldr x22, [x20, #0x28]\n"
674       "cbnz x9, 68f\n"
675       "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
676       "add x27, x27, x19, LSL #2\n"
677       "add x26, x26, x19, LSL #2\n"
678       "add x25, x25, x19, LSL #2\n"
679       "add x24, x24, x19, LSL #2\n"
680       "add x23, x23, x19, LSL #2\n"
681       "add x22, x22, x19, LSL #2\n"
682       "b 68f\n"
683       "67:"  // Height 6: setup direct input
684       "mov x27, %x[input_ptr]\n"
685       "add x26, x27, x19, LSL #2\n"
686       "add x25, x26, x19, LSL #2\n"
687       "add x24, x25, x19, LSL #2\n"
688       "add x23, x24, x19, LSL #2\n"
689       "add x22, x23, x19, LSL #2\n"
690       "68:"  // Height 6: input setup done
691       "subs x28, x28, #0x1\n"
692       "ld1rw { z0.s }, p1/Z, [x27]\n"
693       "ld1rw { z1.s }, p1/Z, [x26]\n"
694       "ld1rw { z2.s }, p1/Z, [x25]\n"
695       "ld1rw { z3.s }, p1/Z, [x24]\n"
696       "ld1rw { z4.s }, p1/Z, [x23]\n"
697       "ld1rw { z5.s }, p1/Z, [x22]\n"
698       "ble 70f\n"
699       "69:"  // Height 6: Multiply loop: Main loop
700       "ld1w { z8.s }, p1/Z, [x11]\n"
701       "add x27, x27, #0x4\n"
702       "subs x28, x28, #0x1\n"
703       "fmla z24.s, p1/M, z8.s, z0.s\n"
704       "add x26, x26, #0x4\n"
705       "add x25, x25, #0x4\n"
706       "fmla z25.s, p1/M, z8.s, z1.s\n"
707       "fmla z26.s, p1/M, z8.s, z2.s\n"
708       "add x24, x24, #0x4\n"
709       "add x23, x23, #0x4\n"
710       "fmla z27.s, p1/M, z8.s, z3.s\n"
711       "fmla z28.s, p1/M, z8.s, z4.s\n"
712       "add x22, x22, #0x4\n"
713       "addvl x11, x11, #1\n"
714       "fmla z29.s, p1/M, z8.s, z5.s\n"
715       "ld1rw { z0.s }, p1/Z, [x27]\n"
716       "ld1rw { z1.s }, p1/Z, [x26]\n"
717       "ld1rw { z2.s }, p1/Z, [x25]\n"
718       "ld1rw { z3.s }, p1/Z, [x24]\n"
719       "ld1rw { z4.s }, p1/Z, [x23]\n"
720       "ld1rw { z5.s }, p1/Z, [x22]\n"
721       "bgt 69b\n"
722       "70:"  // Height 6: Multiply loop: Main loop skip
723       "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
724       "ld1w { z9.s }, p1/Z, [x11]\n"
725       "add x9, x9, #0x1\n"
726       "cmp x9, x19\n"
727       "fmla z24.s, p1/M, z9.s, z0.s\n"
728       "fmla z25.s, p1/M, z9.s, z1.s\n"
729       "addvl x11, x11, #1\n"
730       "fmla z26.s, p1/M, z9.s, z2.s\n"
731       "fmla z27.s, p1/M, z9.s, z3.s\n"
732       "fmla z28.s, p1/M, z9.s, z4.s\n"
733       "fmla z29.s, p1/M, z9.s, z5.s\n"
734       "bne 66b\n"
735       "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
736       "add x26, x10, x19, LSL #2\n"
737       "add x25, x26, x19, LSL #2\n"
738       "add x24, x25, x19, LSL #2\n"
739       "add x23, x24, x19, LSL #2\n"
740       "add x22, x23, x19, LSL #2\n"
741       "tbz %x[flags], #1, 71f\n"
742       "add x19, %x[args_ptr], %[offset_max]\n"
743       "ld1rw { z17.s }, p1/Z, [x19]\n"
744       "add x19, %x[args_ptr], %[offset_min]\n"
745       "ld1rw { z16.s }, p1/Z, [x19]\n"
746       "fmin z24.s, p1/M, z24.s, z17.s\n"
747       "fmin z25.s, p1/M, z25.s, z17.s\n"
748       "fmin z26.s, p1/M, z26.s, z17.s\n"
749       "fmin z27.s, p1/M, z27.s, z17.s\n"
750       "fmin z28.s, p1/M, z28.s, z17.s\n"
751       "fmin z29.s, p1/M, z29.s, z17.s\n"
752       "fmax z24.s, p1/M, z24.s, z16.s\n"
753       "fmax z25.s, p1/M, z25.s, z16.s\n"
754       "fmax z26.s, p1/M, z26.s, z16.s\n"
755       "fmax z27.s, p1/M, z27.s, z16.s\n"
756       "fmax z28.s, p1/M, z28.s, z16.s\n"
757       "fmax z29.s, p1/M, z29.s, z16.s\n"
758       "71:"  // Height 6: No activation
759       "st1w { z24.s }, p0, [x10]\n"
760       "addvl x10, x10, #1\n"
761       "st1w { z25.s }, p0, [x26]\n"
762       "st1w { z26.s }, p0, [x25]\n"
763       "st1w { z27.s }, p0, [x24]\n"
764       "st1w { z28.s }, p0, [x23]\n"
765       "st1w { z29.s }, p0, [x22]\n"
766       "72:"  // Height 6: Writeback done
767       "decw x12\n"
768       "cmp x12, XZR\n"
769       "bgt 62b\n"
770       "b 98f\n"
771       "73:"  // Height 7
772       "mov x13, %x[bias]\n"
773       "ldr x12, [%x[args_ptr], %[offsetof_N]]\n"
774       "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n"
775       "mov x10, %x[output_ptr]\n"
776       "74:"  // Height 7: Column loop
777       "mov x19, #0x0\n"
778       "whilelt p0.s, x19, x12\n"
779       "cbz x13, 75f\n"
780       "ld1w { z24.s }, p1/Z, [x13]\n"
781       "mov z25.d, z24.d\n"
782       "mov z26.d, z24.d\n"
783       "addvl x13, x13, #1\n"
784       "mov z27.d, z24.d\n"
785       "mov z28.d, z24.d\n"
786       "mov z29.d, z24.d\n"
787       "mov z30.d, z24.d\n"
788       "b 77f\n"
789       "75:"  // Height 7: no bias
790       "tbz %x[flags], #0, 76f\n"
791       "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
792       "add x26, x10, x19, LSL #2\n"
793       "add x25, x26, x19, LSL #2\n"
794       "add x24, x25, x19, LSL #2\n"
795       "add x23, x24, x19, LSL #2\n"
796       "ld1w { z24.s }, p0/Z, [x10]\n"
797       "ld1w { z25.s }, p0/Z, [x26]\n"
798       "add x22, x23, x19, LSL #2\n"
799       "add x21, x22, x19, LSL #2\n"
800       "ld1w { z26.s }, p0/Z, [x25]\n"
801       "ld1w { z27.s }, p0/Z, [x24]\n"
802       "ld1w { z28.s }, p0/Z, [x23]\n"
803       "ld1w { z29.s }, p0/Z, [x22]\n"
804       "ld1w { z30.s }, p0/Z, [x21]\n"
805       "b 77f\n"
806       "76:"  // Height 7: no accumulate
807       "mov z24.b, #0x0\n"
808       "mov z25.b, #0x0\n"
809       "mov z26.b, #0x0\n"
810       "mov z27.b, #0x0\n"
811       "mov z28.b, #0x0\n"
812       "mov z29.b, #0x0\n"
813       "mov z30.b, #0x0\n"
814       "77:"  // Height 7: setup done
815       "mov x9, #0x0\n"
816       "78:"  // Height 7: String loop
817       "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
818       "ldr w28, [x19, x9, LSL #0x2]\n"
819       "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
820       "tbz %x[flags], #3, 79f\n"
821       "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n"
822       "add x20, x20, x19, LSL #3\n"
823       "ldr x27, [x20, #0x0]\n"
824       "ldr x26, [x20, #0x8]\n"
825       "ldr x25, [x20, #0x10]\n"
826       "ldr x24, [x20, #0x18]\n"
827       "ldr x23, [x20, #0x20]\n"
828       "ldr x22, [x20, #0x28]\n"
829       "ldr x21, [x20, #0x30]\n"
830       "cbnz x9, 80f\n"
831       "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
832       "add x27, x27, x19, LSL #2\n"
833       "add x26, x26, x19, LSL #2\n"
834       "add x25, x25, x19, LSL #2\n"
835       "add x24, x24, x19, LSL #2\n"
836       "add x23, x23, x19, LSL #2\n"
837       "add x22, x22, x19, LSL #2\n"
838       "add x21, x21, x19, LSL #2\n"
839       "b 80f\n"
840       "79:"  // Height 7: setup direct input
841       "mov x27, %x[input_ptr]\n"
842       "add x26, x27, x19, LSL #2\n"
843       "add x25, x26, x19, LSL #2\n"
844       "add x24, x25, x19, LSL #2\n"
845       "add x23, x24, x19, LSL #2\n"
846       "add x22, x23, x19, LSL #2\n"
847       "add x21, x22, x19, LSL #2\n"
848       "80:"  // Height 7: input setup done
849       "subs x28, x28, #0x1\n"
850       "ld1rw { z0.s }, p1/Z, [x27]\n"
851       "ld1rw { z1.s }, p1/Z, [x26]\n"
852       "ld1rw { z2.s }, p1/Z, [x25]\n"
853       "ld1rw { z3.s }, p1/Z, [x24]\n"
854       "ld1rw { z4.s }, p1/Z, [x23]\n"
855       "ld1rw { z5.s }, p1/Z, [x22]\n"
856       "ld1rw { z6.s }, p1/Z, [x21]\n"
857       "ble 82f\n"
858       "81:"  // Height 7: Multiply loop: Main loop
859       "ld1w { z8.s }, p1/Z, [x11]\n"
860       "add x27, x27, #0x4\n"
861       "subs x28, x28, #0x1\n"
862       "fmla z24.s, p1/M, z8.s, z0.s\n"
863       "add x26, x26, #0x4\n"
864       "add x25, x25, #0x4\n"
865       "fmla z25.s, p1/M, z8.s, z1.s\n"
866       "fmla z26.s, p1/M, z8.s, z2.s\n"
867       "add x24, x24, #0x4\n"
868       "add x23, x23, #0x4\n"
869       "fmla z27.s, p1/M, z8.s, z3.s\n"
870       "ld1rw { z0.s }, p1/Z, [x27]\n"
871       "add x22, x22, #0x4\n"
872       "add x21, x21, #0x4\n"
873       "fmla z28.s, p1/M, z8.s, z4.s\n"
874       "fmla z29.s, p1/M, z8.s, z5.s\n"
875       "addvl x11, x11, #1\n"
876       "ld1rw { z1.s }, p1/Z, [x26]\n"
877       "fmla z30.s, p1/M, z8.s, z6.s\n"
878       "ld1rw { z2.s }, p1/Z, [x25]\n"
879       "ld1rw { z3.s }, p1/Z, [x24]\n"
880       "ld1rw { z4.s }, p1/Z, [x23]\n"
881       "ld1rw { z5.s }, p1/Z, [x22]\n"
882       "ld1rw { z6.s }, p1/Z, [x21]\n"
883       "bgt 81b\n"
884       "82:"  // Height 7: Multiply loop: Main loop skip
885       "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
886       "ld1w { z9.s }, p1/Z, [x11]\n"
887       "add x9, x9, #0x1\n"
888       "cmp x9, x19\n"
889       "fmla z24.s, p1/M, z9.s, z0.s\n"
890       "fmla z25.s, p1/M, z9.s, z1.s\n"
891       "addvl x11, x11, #1\n"
892       "fmla z26.s, p1/M, z9.s, z2.s\n"
893       "fmla z27.s, p1/M, z9.s, z3.s\n"
894       "fmla z28.s, p1/M, z9.s, z4.s\n"
895       "fmla z29.s, p1/M, z9.s, z5.s\n"
896       "fmla z30.s, p1/M, z9.s, z6.s\n"
897       "bne 78b\n"
898       "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
899       "add x26, x10, x19, LSL #2\n"
900       "add x25, x26, x19, LSL #2\n"
901       "add x24, x25, x19, LSL #2\n"
902       "add x23, x24, x19, LSL #2\n"
903       "add x22, x23, x19, LSL #2\n"
904       "add x21, x22, x19, LSL #2\n"
905       "tbz %x[flags], #1, 83f\n"
906       "add x19, %x[args_ptr], %[offset_max]\n"
907       "ld1rw { z17.s }, p1/Z, [x19]\n"
908       "add x19, %x[args_ptr], %[offset_min]\n"
909       "ld1rw { z16.s }, p1/Z, [x19]\n"
910       "fmin z24.s, p1/M, z24.s, z17.s\n"
911       "fmin z25.s, p1/M, z25.s, z17.s\n"
912       "fmin z26.s, p1/M, z26.s, z17.s\n"
913       "fmin z27.s, p1/M, z27.s, z17.s\n"
914       "fmin z28.s, p1/M, z28.s, z17.s\n"
915       "fmin z29.s, p1/M, z29.s, z17.s\n"
916       "fmin z30.s, p1/M, z30.s, z17.s\n"
917       "fmax z24.s, p1/M, z24.s, z16.s\n"
918       "fmax z25.s, p1/M, z25.s, z16.s\n"
919       "fmax z26.s, p1/M, z26.s, z16.s\n"
920       "fmax z27.s, p1/M, z27.s, z16.s\n"
921       "fmax z28.s, p1/M, z28.s, z16.s\n"
922       "fmax z29.s, p1/M, z29.s, z16.s\n"
923       "fmax z30.s, p1/M, z30.s, z16.s\n"
924       "83:"  // Height 7: No activation
925       "st1w { z24.s }, p0, [x10]\n"
926       "addvl x10, x10, #1\n"
927       "st1w { z25.s }, p0, [x26]\n"
928       "st1w { z26.s }, p0, [x25]\n"
929       "st1w { z27.s }, p0, [x24]\n"
930       "st1w { z28.s }, p0, [x23]\n"
931       "st1w { z29.s }, p0, [x22]\n"
932       "st1w { z30.s }, p0, [x21]\n"
933       "84:"  // Height 7: Writeback done
934       "decw x12\n"
935       "cmp x12, XZR\n"
936       "bgt 74b\n"
937       "b 98f\n"
938       "85:"  // Height 8
939       "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
940       "mov x19, #0x20\n"
941       "mov x13, %x[bias]\n"
942       "ldr x12, [%x[args_ptr], %[offsetof_N]]\n"
943       "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n"
944       "mov x10, %x[output_ptr]\n"
945       "madd %x[output_ptr], x20, x19, %x[output_ptr]\n"
946       "86:"  // Height 8: Column loop
947       "mov x19, #0x0\n"
948       "whilelt p0.s, x19, x12\n"
949       "cbz x13, 87f\n"
950       "ld1w { z24.s }, p1/Z, [x13]\n"
951       "mov z25.d, z24.d\n"
952       "mov z26.d, z24.d\n"
953       "addvl x13, x13, #1\n"
954       "mov z27.d, z24.d\n"
955       "mov z28.d, z24.d\n"
956       "mov z29.d, z24.d\n"
957       "mov z30.d, z24.d\n"
958       "mov z31.d, z24.d\n"
959       "b 89f\n"
960       "87:"  // Height 8: no bias
961       "tbz %x[flags], #0, 88f\n"
962       "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
963       "add x26, x10, x19, LSL #2\n"
964       "add x25, x26, x19, LSL #2\n"
965       "add x24, x25, x19, LSL #2\n"
966       "add x23, x24, x19, LSL #2\n"
967       "ld1w { z24.s }, p0/Z, [x10]\n"
968       "ld1w { z25.s }, p0/Z, [x26]\n"
969       "add x22, x23, x19, LSL #2\n"
970       "add x21, x22, x19, LSL #2\n"
971       "ld1w { z26.s }, p0/Z, [x25]\n"
972       "ld1w { z27.s }, p0/Z, [x24]\n"
973       "add x20, x21, x19, LSL #2\n"
974       "ld1w { z28.s }, p0/Z, [x23]\n"
975       "ld1w { z29.s }, p0/Z, [x22]\n"
976       "ld1w { z30.s }, p0/Z, [x21]\n"
977       "ld1w { z31.s }, p0/Z, [x20]\n"
978       "b 89f\n"
979       "88:"  // Height 8: no accumulate
980       "mov z24.b, #0x0\n"
981       "mov z25.b, #0x0\n"
982       "mov z26.b, #0x0\n"
983       "mov z27.b, #0x0\n"
984       "mov z28.b, #0x0\n"
985       "mov z29.b, #0x0\n"
986       "mov z30.b, #0x0\n"
987       "mov z31.b, #0x0\n"
988       "89:"  // Height 8: setup done
989       "mov x9, #0x0\n"
990       "90:"  // Height 8: String loop
991       "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
992       "ldr w28, [x19, x9, LSL #0x2]\n"
993       "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
994       "tbz %x[flags], #3, 91f\n"
995       "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n"
996       "add x20, x20, x19, LSL #3\n"
997       "ldr x27, [x20, #0x0]\n"
998       "ldr x26, [x20, #0x8]\n"
999       "ldr x25, [x20, #0x10]\n"
1000       "ldr x24, [x20, #0x18]\n"
1001       "ldr x23, [x20, #0x20]\n"
1002       "ldr x22, [x20, #0x28]\n"
1003       "ldr x21, [x20, #0x30]\n"
1004       "ldr x20, [x20, #0x38]\n"
1005       "cbnz x9, 92f\n"
1006       "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
1007       "add x27, x27, x19, LSL #2\n"
1008       "add x26, x26, x19, LSL #2\n"
1009       "add x25, x25, x19, LSL #2\n"
1010       "add x24, x24, x19, LSL #2\n"
1011       "add x23, x23, x19, LSL #2\n"
1012       "add x22, x22, x19, LSL #2\n"
1013       "add x21, x21, x19, LSL #2\n"
1014       "add x20, x20, x19, LSL #2\n"
1015       "b 92f\n"
1016       "91:"  // Height 8: setup direct input
1017       "mov x27, %x[input_ptr]\n"
1018       "add x26, x27, x19, LSL #2\n"
1019       "add x25, x26, x19, LSL #2\n"
1020       "add x24, x25, x19, LSL #2\n"
1021       "add x23, x24, x19, LSL #2\n"
1022       "add x22, x23, x19, LSL #2\n"
1023       "add x21, x22, x19, LSL #2\n"
1024       "add x20, x21, x19, LSL #2\n"
1025       "92:"  // Height 8: input setup done
1026       "subs x28, x28, #0x1\n"
1027       "ld1rw { z0.s }, p1/Z, [x27]\n"
1028       "ld1rw { z1.s }, p1/Z, [x26]\n"
1029       "ld1rw { z2.s }, p1/Z, [x25]\n"
1030       "ld1rw { z3.s }, p1/Z, [x24]\n"
1031       "ld1rw { z4.s }, p1/Z, [x23]\n"
1032       "ld1rw { z5.s }, p1/Z, [x22]\n"
1033       "ld1rw { z6.s }, p1/Z, [x21]\n"
1034       "ld1rw { z7.s }, p1/Z, [x20]\n"
1035       "ble 94f\n"
1036       "93:"  // Height 8: Multiply loop: Main loop
1037       "ld1w { z8.s }, p1/Z, [x11]\n"
1038       "add x27, x27, #0x4\n"
1039       "subs x28, x28, #0x1\n"
1040       "fmla z24.s, p1/M, z8.s, z0.s\n"
1041       "add x26, x26, #0x4\n"
1042       "add x25, x25, #0x4\n"
1043       "fmla z25.s, p1/M, z8.s, z1.s\n"
1044       "fmla z26.s, p1/M, z8.s, z2.s\n"
1045       "add x24, x24, #0x4\n"
1046       "add x23, x23, #0x4\n"
1047       "fmla z27.s, p1/M, z8.s, z3.s\n"
1048       "fmla z28.s, p1/M, z8.s, z4.s\n"
1049       "add x22, x22, #0x4\n"
1050       "add x21, x21, #0x4\n"
1051       "fmla z29.s, p1/M, z8.s, z5.s\n"
1052       "ld1rw { z0.s }, p1/Z, [x27]\n"
1053       "add x20, x20, #0x4\n"
1054       "addvl x11, x11, #1\n"
1055       "ld1rw { z1.s }, p1/Z, [x26]\n"
1056       "fmla z30.s, p1/M, z8.s, z6.s\n"
1057       "fmla z31.s, p1/M, z8.s, z7.s\n"
1058       "ld1rw { z2.s }, p1/Z, [x25]\n"
1059       "ld1rw { z3.s }, p1/Z, [x24]\n"
1060       "ld1rw { z4.s }, p1/Z, [x23]\n"
1061       "ld1rw { z5.s }, p1/Z, [x22]\n"
1062       "ld1rw { z6.s }, p1/Z, [x21]\n"
1063       "ld1rw { z7.s }, p1/Z, [x20]\n"
1064       "bgt 93b\n"
1065       "94:"  // Height 8: Multiply loop: Main loop skip
1066       "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
1067       "ld1w { z9.s }, p1/Z, [x11]\n"
1068       "add x9, x9, #0x1\n"
1069       "cmp x9, x19\n"
1070       "fmla z24.s, p1/M, z9.s, z0.s\n"
1071       "fmla z25.s, p1/M, z9.s, z1.s\n"
1072       "addvl x11, x11, #1\n"
1073       "fmla z26.s, p1/M, z9.s, z2.s\n"
1074       "fmla z27.s, p1/M, z9.s, z3.s\n"
1075       "fmla z28.s, p1/M, z9.s, z4.s\n"
1076       "fmla z29.s, p1/M, z9.s, z5.s\n"
1077       "fmla z30.s, p1/M, z9.s, z6.s\n"
1078       "fmla z31.s, p1/M, z9.s, z7.s\n"
1079       "bne 90b\n"
1080       "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
1081       "add x26, x10, x19, LSL #2\n"
1082       "add x25, x26, x19, LSL #2\n"
1083       "add x24, x25, x19, LSL #2\n"
1084       "add x23, x24, x19, LSL #2\n"
1085       "add x22, x23, x19, LSL #2\n"
1086       "add x21, x22, x19, LSL #2\n"
1087       "add x20, x21, x19, LSL #2\n"
1088       "tbz %x[flags], #1, 95f\n"
1089       "add x19, %x[args_ptr], %[offset_max]\n"
1090       "ld1rw { z17.s }, p1/Z, [x19]\n"
1091       "add x19, %x[args_ptr], %[offset_min]\n"
1092       "ld1rw { z16.s }, p1/Z, [x19]\n"
1093       "fmin z24.s, p1/M, z24.s, z17.s\n"
1094       "fmin z25.s, p1/M, z25.s, z17.s\n"
1095       "fmin z26.s, p1/M, z26.s, z17.s\n"
1096       "fmin z27.s, p1/M, z27.s, z17.s\n"
1097       "fmin z28.s, p1/M, z28.s, z17.s\n"
1098       "fmin z29.s, p1/M, z29.s, z17.s\n"
1099       "fmin z30.s, p1/M, z30.s, z17.s\n"
1100       "fmin z31.s, p1/M, z31.s, z17.s\n"
1101       "fmax z24.s, p1/M, z24.s, z16.s\n"
1102       "fmax z25.s, p1/M, z25.s, z16.s\n"
1103       "fmax z26.s, p1/M, z26.s, z16.s\n"
1104       "fmax z27.s, p1/M, z27.s, z16.s\n"
1105       "fmax z28.s, p1/M, z28.s, z16.s\n"
1106       "fmax z29.s, p1/M, z29.s, z16.s\n"
1107       "fmax z30.s, p1/M, z30.s, z16.s\n"
1108       "fmax z31.s, p1/M, z31.s, z16.s\n"
1109       "95:"  // Height 8: No activation
1110       "st1w { z24.s }, p0, [x10]\n"
1111       "addvl x10, x10, #1\n"
1112       "st1w { z25.s }, p0, [x26]\n"
1113       "st1w { z26.s }, p0, [x25]\n"
1114       "st1w { z27.s }, p0, [x24]\n"
1115       "st1w { z28.s }, p0, [x23]\n"
1116       "st1w { z29.s }, p0, [x22]\n"
1117       "st1w { z30.s }, p0, [x21]\n"
1118       "st1w { z31.s }, p0, [x20]\n"
1119       "96:"  // Height 8: Writeback done
1120       "decw x12\n"
1121       "cmp x12, XZR\n"
1122       "bgt 86b\n"
1123       "subs %x[M], %x[M], #0x8\n"
1124       "beq 98f\n"
1125       "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
1126       "tbz %x[flags], #3, 97f\n"
1127       "add x20, x20, #0x8\n"
1128       "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
1129       "b 1b\n"
1130       "97:"  // Update direct input
1131       "mov x19, #0x20\n"
1132       "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
1133       "b 1b\n"
1134       "98:"  // Exit
1135 
1136       : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
1137       : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
1138       : "cc", "memory", "p0", "p1", "x9", "x10", "x11", "x12", "x13", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z16", "z17", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
1139     );
1140 }
1141 
1142 } // namespace arm_gemm
1143 #endif // ARM_COMPUTE_ENABLE_SVE
1144