1 /*
2  * Copyright (c) 2022 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22  * IN THE SOFTWARE.
23  */
24 #ifdef __aarch64__
25 
26 #include "arm_gemm.hpp"
27 #include "../../utils.hpp"
28 #include "../../bfloat.hpp"
29 
30 #include <cassert>
31 #include <limits>
32 
33 namespace arm_gemm {
34 
a64_ffhybrid_fp32bf16fp32_mmla_4x24(unsigned int num_strings,const unsigned int * string_lengths,IndirectInputArg<float> A_arg,size_t M,size_t N,const bfloat16 * B_ptr,size_t B_stride,IndirectOutputArg<float> output_arg,const float * bias,Activation act,bool accumulate)35 void a64_ffhybrid_fp32bf16fp32_mmla_4x24 (
36     unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<float> A_arg,
37     size_t M, size_t N, const bfloat16 *B_ptr, size_t B_stride, IndirectOutputArg<float> output_arg,
38     const float *bias, Activation act, bool accumulate
39 )
40 {
41     struct KernelArgs {
42         float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
43         float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
44         unsigned int num_strings = {};
45         const unsigned int *string_lengths = {};
46         size_t N = {};
47         const bfloat16 *B_ptr = {};
48         const bfloat16 *cur_B_ptr = {};
49         size_t B_stride = {};
50         size_t output_offset = {};
51         size_t input_initial_col = {};
52         size_t input_offset = {};
53     } ka;
54 
55     unsigned long flags=0;
56     void *output_ptr;
57     void *input_ptr;
58 
59     if (output_arg.is_indirect) {
60         output_ptr=(void *)(output_arg.indirect.ptr);
61         ka.output_offset=output_arg.indirect.offset;
62         flags |= 0x4;
63     } else {
64         output_ptr=(void *)(output_arg.direct.base);
65         ka.output_offset=output_arg.direct.stride;
66     }
67 
68     if (A_arg.is_indirect) {
69         input_ptr=(void *)(A_arg.indirect.ptr);
70         ka.input_offset=A_arg.indirect.start_row;
71         ka.input_initial_col=A_arg.indirect.start_col;
72         flags |= 0x8;
73     } else {
74         assert(num_strings==1);
75         input_ptr=(void *)(A_arg.direct.base);
76         ka.input_offset=A_arg.direct.stride;
77     }
78     if (accumulate) {
79         flags |= 0x1;
80     }
81     ka.num_strings = num_strings;
82     ka.string_lengths = string_lengths;
83     ka.N = N;
84     ka.B_ptr = B_ptr;
85     ka.B_stride = B_stride;
86     switch(act.type) {
87         default:
88         case Activation::Type::None:
89             break;
90         case Activation::Type::BoundedReLU:
91             ka.maxval = static_cast<float>(act.param1);
92             /* fall through */
93         case Activation::Type::ReLU:
94             ka.minval = 0;
95             flags |= 0x2;
96             break;
97     }
98     __asm__ __volatile__(
99       "1:"  // Row loop
100       "cmp %x[M], #0x4\n"
101       "bge 133f\n"
102       "cmp %x[M], #0x2\n"
103       "bgt 89f\n"
104       "beq 45f\n"
105       "ldr x19, [%x[args_ptr], %[offsetof_B_ptr]]\n"
106       "mov x14, %x[bias]\n"
107       "ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
108       "str x19, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
109       "mov x12, %x[output_ptr]\n"
110       "2:"  // Height 1: Column loop
111       "ldr x11, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
112       "ldr x19, [%x[args_ptr], %[offsetof_B_stride]]\n"
113       "add x10, x11, x19, LSL #1\n"
114       "add x9, x10, x19, LSL #1\n"
115       "add x28, x9, x19, LSL #1\n"
116       "add x27, x28, x19, LSL #1\n"
117       "add x26, x27, x19, LSL #1\n"
118       "add x19, x26, x19, LSL #1\n"
119       "str x19, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
120       "cmp x13, #0x14\n"
121       "bgt 3f\n"
122       "cmp x13, #0x10\n"
123       "mov x26, x11\n"
124       "bgt 3f\n"
125       "cmp x13, #0xc\n"
126       "mov x27, x11\n"
127       "bgt 3f\n"
128       "cmp x13, #0x8\n"
129       "mov x28, x11\n"
130       "bgt 3f\n"
131       "cmp x13, #0x4\n"
132       "mov x9, x11\n"
133       "bgt 3f\n"
134       "mov x10, x11\n"
135       "3:"  // Height 1: B setup done
136       "cbz x14, 4f\n"
137       "ldr q8, [x14, #0x0]\n"
138       "ldr q9, [x14, #0x10]\n"
139       "zip2 v14.2d, v8.2d, v8.2d\n"
140       "zip1 v8.2d, v8.2d, v8.2d\n"
141       "ldr q10, [x14, #0x20]\n"
142       "ldr q11, [x14, #0x30]\n"
143       "zip2 v15.2d, v9.2d, v9.2d\n"
144       "zip1 v9.2d, v9.2d, v9.2d\n"
145       "ldr q12, [x14, #0x40]\n"
146       "ldr q13, [x14, #0x50]\n"
147       "zip2 v16.2d, v10.2d, v10.2d\n"
148       "zip1 v10.2d, v10.2d, v10.2d\n"
149       "zip2 v17.2d, v11.2d, v11.2d\n"
150       "zip1 v11.2d, v11.2d, v11.2d\n"
151       "add x14, x14, #0x60\n"
152       "zip2 v18.2d, v12.2d, v12.2d\n"
153       "zip1 v12.2d, v12.2d, v12.2d\n"
154       "zip2 v19.2d, v13.2d, v13.2d\n"
155       "zip1 v13.2d, v13.2d, v13.2d\n"
156       "b 20f\n"
157       "4:"  // Height 1: no bias
158       "tbz %x[flags], #0, 19f\n"
159       "cmp x13, #0x18\n"
160       "bge 17f\n"
161       "tbz x13, #4, 8f\n"
162       "ld1 { v9.4s }, [x12], #0x10\n"
163       "ld1 { v10.4s }, [x12], #0x10\n"
164       "ld1 { v11.4s }, [x12], #0x10\n"
165       "ld1 { v12.4s }, [x12], #0x10\n"
166       "tbz x13, #2, 6f\n"
167       "ld1 { v13.4s }, [x12], #0x10\n"
168       "tbz x13, #1, 5f\n"
169       "ldr d20, [x12], #0x8\n"
170       "mov x19, #0x58\n"
171       "tbz x13, #0, 16f\n"
172       "ld1 { v20.s }[2], [x12]\n"
173       "b 16f\n"
174       "5:"  // Height 1: Partial accumulate: partial_1_20
175       "mov x19, #0x50\n"
176       "tbz x13, #0, 16f\n"
177       "ldr s20, [x12, #0x0]\n"
178       "b 16f\n"
179       "6:"  // Height 1: Partial accumulate: partial_2_16
180       "tbz x13, #1, 7f\n"
181       "ldr d13, [x12], #0x8\n"
182       "mov x19, #0x48\n"
183       "tbz x13, #0, 16f\n"
184       "ld1 { v13.s }[2], [x12]\n"
185       "b 16f\n"
186       "7:"  // Height 1: Partial accumulate: partial_1_16
187       "mov x19, #0x40\n"
188       "tbz x13, #0, 16f\n"
189       "ldr s13, [x12, #0x0]\n"
190       "b 16f\n"
191       "8:"  // Height 1: Partial accumulate: partial_8_0
192       "tbz x13, #3, 12f\n"
193       "ld1 { v9.4s }, [x12], #0x10\n"
194       "ld1 { v10.4s }, [x12], #0x10\n"
195       "tbz x13, #2, 10f\n"
196       "ld1 { v11.4s }, [x12], #0x10\n"
197       "tbz x13, #1, 9f\n"
198       "ldr d12, [x12], #0x8\n"
199       "mov x19, #0x38\n"
200       "tbz x13, #0, 16f\n"
201       "ld1 { v12.s }[2], [x12]\n"
202       "b 16f\n"
203       "9:"  // Height 1: Partial accumulate: partial_1_12
204       "mov x19, #0x30\n"
205       "tbz x13, #0, 16f\n"
206       "ldr s12, [x12, #0x0]\n"
207       "b 16f\n"
208       "10:"  // Height 1: Partial accumulate: partial_2_8
209       "tbz x13, #1, 11f\n"
210       "ldr d11, [x12], #0x8\n"
211       "mov x19, #0x28\n"
212       "tbz x13, #0, 16f\n"
213       "ld1 { v11.s }[2], [x12]\n"
214       "b 16f\n"
215       "11:"  // Height 1: Partial accumulate: partial_1_8
216       "mov x19, #0x20\n"
217       "tbz x13, #0, 16f\n"
218       "ldr s11, [x12, #0x0]\n"
219       "b 16f\n"
220       "12:"  // Height 1: Partial accumulate: partial_4_0
221       "tbz x13, #2, 14f\n"
222       "ld1 { v9.4s }, [x12], #0x10\n"
223       "tbz x13, #1, 13f\n"
224       "ldr d10, [x12], #0x8\n"
225       "mov x19, #0x18\n"
226       "tbz x13, #0, 16f\n"
227       "ld1 { v10.s }[2], [x12]\n"
228       "b 16f\n"
229       "13:"  // Height 1: Partial accumulate: partial_1_4
230       "mov x19, #0x10\n"
231       "tbz x13, #0, 16f\n"
232       "ldr s10, [x12, #0x0]\n"
233       "b 16f\n"
234       "14:"  // Height 1: Partial accumulate: partial_2_0
235       "tbz x13, #1, 15f\n"
236       "ldr d9, [x12], #0x8\n"
237       "mov x19, #0x8\n"
238       "tbz x13, #0, 16f\n"
239       "ld1 { v9.s }[2], [x12]\n"
240       "b 16f\n"
241       "15:"  // Height 1: Partial accumulate: partial_1_0
242       "ldr s9, [x12, #0x0]\n"
243       "mov x19, #0x0\n"
244       "16:"  // Height 1: Partial accumulate: Done
245       "sub x12, x12, x19\n"
246       "b 18f\n"
247       "17:"  // Height 1: full accumulate
248       "ldr q9, [x12, #0x0]\n"
249       "ldr q10, [x12, #0x10]\n"
250       "ldr q11, [x12, #0x20]\n"
251       "ldr q12, [x12, #0x30]\n"
252       "ldr q13, [x12, #0x40]\n"
253       "ldr q20, [x12, #0x50]\n"
254       "18:"  // Height 1: MMLA fixup
255       "zip1 v8.2d, v9.2d, v14.2d\n"
256       "zip2 v14.2d, v9.2d, v14.2d\n"
257       "zip1 v9.2d, v10.2d, v15.2d\n"
258       "zip2 v15.2d, v10.2d, v15.2d\n"
259       "zip1 v10.2d, v11.2d, v16.2d\n"
260       "zip2 v16.2d, v11.2d, v16.2d\n"
261       "zip1 v11.2d, v12.2d, v17.2d\n"
262       "zip2 v17.2d, v12.2d, v17.2d\n"
263       "zip1 v12.2d, v13.2d, v18.2d\n"
264       "zip2 v18.2d, v13.2d, v18.2d\n"
265       "zip1 v13.2d, v20.2d, v19.2d\n"
266       "zip2 v19.2d, v20.2d, v19.2d\n"
267       "b 20f\n"
268       "19:"  // Height 1: no accumulate
269       "movi v8.16b, #0x0\n"
270       "movi v9.16b, #0x0\n"
271       "movi v10.16b, #0x0\n"
272       "movi v11.16b, #0x0\n"
273       "movi v12.16b, #0x0\n"
274       "movi v13.16b, #0x0\n"
275       "movi v14.16b, #0x0\n"
276       "movi v15.16b, #0x0\n"
277       "movi v16.16b, #0x0\n"
278       "movi v17.16b, #0x0\n"
279       "movi v18.16b, #0x0\n"
280       "movi v19.16b, #0x0\n"
281       "20:"  // Height 1: setup done
282       "mov x25, #0x0\n"
283       "21:"  // Height 1: String loop
284       "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
285       "ldr w24, [x19, x25, LSL #0x2]\n"
286       "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
287       "tbz %x[flags], #3, 22f\n"
288       "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n"
289       "add x20, x20, x19, LSL #3\n"
290       "ldr x23, [x20, #0x0]\n"
291       "cbnz x25, 23f\n"
292       "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
293       "add x23, x23, x19, LSL #2\n"
294       "b 23f\n"
295       "22:"  // Height 1: setup direct input
296       "mov x23, %x[input_ptr]\n"
297       "23:"  // Height 1: input setup done
298       "cmp x24, #0x4\n"
299       "blt 26f\n"
300       "ld1 { v0.4s }, [x23], #0x10\n"
301       "ldr q4, [x11, #0x0]\n"
302       "cmp x24, #0x8\n"
303       "ldr q5, [x11, #0x10]\n"
304       "ldr q6, [x10, #0x0]\n"
305       "ldr q7, [x10, #0x10]\n"
306       "blt 25f\n"
307       "24:"  // Height 1: Multiply loop: Main loop head
308       ".inst 0x0ea16800  // bfcvtn v0.4h, v0.4s\n"
309       ".inst 0x6e44ec08  // bfmmla v8.4s, v0.8h, v4.8h\n"
310       "ldr q4, [x9, #0x0]\n"
311       ".inst 0x6e45ec0e  // bfmmla v14.4s, v0.8h, v5.8h\n"
312       "ldr q5, [x9, #0x10]\n"
313       ".inst 0x6e46ec09  // bfmmla v9.4s, v0.8h, v6.8h\n"
314       "ldr q6, [x28, #0x0]\n"
315       ".inst 0x6e47ec0f  // bfmmla v15.4s, v0.8h, v7.8h\n"
316       "ldr q7, [x28, #0x10]\n"
317       ".inst 0x6e44ec0a  // bfmmla v10.4s, v0.8h, v4.8h\n"
318       "ldr q4, [x27, #0x0]\n"
319       ".inst 0x6e45ec10  // bfmmla v16.4s, v0.8h, v5.8h\n"
320       "ldr q5, [x27, #0x10]\n"
321       ".inst 0x6e46ec0b  // bfmmla v11.4s, v0.8h, v6.8h\n"
322       "ldr q6, [x26, #0x0]\n"
323       ".inst 0x6e47ec11  // bfmmla v17.4s, v0.8h, v7.8h\n"
324       "ldr q7, [x26, #0x10]\n"
325       "sub x24, x24, #0x4\n"
326       "cmp x24, #0x8\n"
327       ".inst 0x6e44ec0c  // bfmmla v12.4s, v0.8h, v4.8h\n"
328       "add x11, x11, #0x20\n"
329       "ldr q4, [x11, #0x0]\n"
330       "add x10, x10, #0x20\n"
331       ".inst 0x6e45ec12  // bfmmla v18.4s, v0.8h, v5.8h\n"
332       "ldr q5, [x11, #0x10]\n"
333       ".inst 0x6e46ec0d  // bfmmla v13.4s, v0.8h, v6.8h\n"
334       "ldr q6, [x10, #0x0]\n"
335       ".inst 0x6e47ec13  // bfmmla v19.4s, v0.8h, v7.8h\n"
336       "ld1 { v0.4s }, [x23], #0x10\n"
337       "ldr q7, [x10, #0x10]\n"
338       "add x9, x9, #0x20\n"
339       "add x28, x28, #0x20\n"
340       "add x27, x27, #0x20\n"
341       "add x26, x26, #0x20\n"
342       "bge 24b\n"
343       "25:"  // Height 1: Multiply loop: Single iteration only
344       ".inst 0x0ea16800  // bfcvtn v0.4h, v0.4s\n"
345       ".inst 0x6e44ec08  // bfmmla v8.4s, v0.8h, v4.8h\n"
346       "ldr q4, [x9, #0x0]\n"
347       ".inst 0x6e45ec0e  // bfmmla v14.4s, v0.8h, v5.8h\n"
348       "ldr q5, [x9, #0x10]\n"
349       ".inst 0x6e46ec09  // bfmmla v9.4s, v0.8h, v6.8h\n"
350       "ldr q6, [x28, #0x0]\n"
351       ".inst 0x6e47ec0f  // bfmmla v15.4s, v0.8h, v7.8h\n"
352       "ldr q7, [x28, #0x10]\n"
353       ".inst 0x6e44ec0a  // bfmmla v10.4s, v0.8h, v4.8h\n"
354       "ldr q4, [x27, #0x0]\n"
355       ".inst 0x6e45ec10  // bfmmla v16.4s, v0.8h, v5.8h\n"
356       "ldr q5, [x27, #0x10]\n"
357       ".inst 0x6e46ec0b  // bfmmla v11.4s, v0.8h, v6.8h\n"
358       "ldr q6, [x26, #0x0]\n"
359       ".inst 0x6e47ec11  // bfmmla v17.4s, v0.8h, v7.8h\n"
360       "ldr q7, [x26, #0x10]\n"
361       "sub x24, x24, #0x4\n"
362       ".inst 0x6e44ec0c  // bfmmla v12.4s, v0.8h, v4.8h\n"
363       ".inst 0x6e45ec12  // bfmmla v18.4s, v0.8h, v5.8h\n"
364       "add x11, x11, #0x20\n"
365       "add x10, x10, #0x20\n"
366       ".inst 0x6e46ec0d  // bfmmla v13.4s, v0.8h, v6.8h\n"
367       ".inst 0x6e47ec13  // bfmmla v19.4s, v0.8h, v7.8h\n"
368       "add x9, x9, #0x20\n"
369       "add x28, x28, #0x20\n"
370       "add x27, x27, #0x20\n"
371       "add x26, x26, #0x20\n"
372       "26:"  // Height 1: Multiply loop: Main loop skip
373       "cbz x24, 29f\n"
374       "cbz x24, 29f\n"
375       "tbz x24, #1, 27f\n"
376       "ldr d0, [x23], #0x8\n"
377       "tbz x24, #0, 28f\n"
378       "ld1 { v0.s }[2], [x23]\n"
379       "b 28f\n"
380       "27:"  // Height 1: Multiply loop: Ragged operand read: partial_1_0
381       "ldr s0, [x23, #0x0]\n"
382       "28:"  // Height 1: Multiply loop: Ragged operand read: Done
383       "ldr q4, [x11, #0x0]\n"
384       "ldr q5, [x11, #0x10]\n"
385       ".inst 0x0ea16800  // bfcvtn v0.4h, v0.4s\n"
386       ".inst 0x6e44ec08  // bfmmla v8.4s, v0.8h, v4.8h\n"
387       "ldr q6, [x10, #0x0]\n"
388       "ldr q7, [x10, #0x10]\n"
389       ".inst 0x6e45ec0e  // bfmmla v14.4s, v0.8h, v5.8h\n"
390       ".inst 0x6e46ec09  // bfmmla v9.4s, v0.8h, v6.8h\n"
391       "ldr q4, [x9, #0x0]\n"
392       "ldr q5, [x9, #0x10]\n"
393       ".inst 0x6e47ec0f  // bfmmla v15.4s, v0.8h, v7.8h\n"
394       ".inst 0x6e44ec0a  // bfmmla v10.4s, v0.8h, v4.8h\n"
395       "ldr q6, [x28, #0x0]\n"
396       "ldr q7, [x28, #0x10]\n"
397       ".inst 0x6e45ec10  // bfmmla v16.4s, v0.8h, v5.8h\n"
398       ".inst 0x6e46ec0b  // bfmmla v11.4s, v0.8h, v6.8h\n"
399       "ldr q4, [x27, #0x0]\n"
400       "ldr q5, [x27, #0x10]\n"
401       ".inst 0x6e47ec11  // bfmmla v17.4s, v0.8h, v7.8h\n"
402       ".inst 0x6e44ec0c  // bfmmla v12.4s, v0.8h, v4.8h\n"
403       "ldr q6, [x26, #0x0]\n"
404       "ldr q7, [x26, #0x10]\n"
405       ".inst 0x6e45ec12  // bfmmla v18.4s, v0.8h, v5.8h\n"
406       ".inst 0x6e46ec0d  // bfmmla v13.4s, v0.8h, v6.8h\n"
407       ".inst 0x6e47ec13  // bfmmla v19.4s, v0.8h, v7.8h\n"
408       "add x11, x11, #0x20\n"
409       "add x10, x10, #0x20\n"
410       "add x9, x9, #0x20\n"
411       "add x28, x28, #0x20\n"
412       "add x27, x27, #0x20\n"
413       "add x26, x26, #0x20\n"
414       "29:"  // Height 1: Multiply loop: No odd multiplies
415       "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
416       "add x25, x25, #0x1\n"
417       "cmp x25, x19\n"
418       "bne 21b\n"
419       "uzp1 v8.2d, v8.2d, v14.2d\n"
420       "uzp1 v9.2d, v9.2d, v15.2d\n"
421       "uzp1 v10.2d, v10.2d, v16.2d\n"
422       "uzp1 v11.2d, v11.2d, v17.2d\n"
423       "uzp1 v12.2d, v12.2d, v18.2d\n"
424       "uzp1 v13.2d, v13.2d, v19.2d\n"
425       "tbz %x[flags], #1, 30f\n"
426       "add x19, %x[args_ptr], %[offset_max]\n"
427       "ld1r { v1.4s }, [x19]\n"
428       "add x19, %x[args_ptr], %[offset_min]\n"
429       "ld1r { v0.4s }, [x19]\n"
430       "fmin v8.4s, v8.4s, v1.4s\n"
431       "fmin v9.4s, v9.4s, v1.4s\n"
432       "fmin v10.4s, v10.4s, v1.4s\n"
433       "fmin v11.4s, v11.4s, v1.4s\n"
434       "fmin v12.4s, v12.4s, v1.4s\n"
435       "fmin v13.4s, v13.4s, v1.4s\n"
436       "fmax v8.4s, v8.4s, v0.4s\n"
437       "fmax v9.4s, v9.4s, v0.4s\n"
438       "fmax v10.4s, v10.4s, v0.4s\n"
439       "fmax v11.4s, v11.4s, v0.4s\n"
440       "fmax v12.4s, v12.4s, v0.4s\n"
441       "fmax v13.4s, v13.4s, v0.4s\n"
442       "30:"  // Height 1: No activation
443       "cmp x13, #0x18\n"
444       "bge 43f\n"
445       "tbz x13, #4, 34f\n"
446       "st1 { v8.4s }, [x12], #0x10\n"
447       "st1 { v9.4s }, [x12], #0x10\n"
448       "st1 { v10.4s }, [x12], #0x10\n"
449       "st1 { v11.4s }, [x12], #0x10\n"
450       "tbz x13, #2, 32f\n"
451       "st1 { v12.4s }, [x12], #0x10\n"
452       "tbz x13, #1, 31f\n"
453       "str d13, [x12], #0x8\n"
454       "tbz x13, #0, 42f\n"
455       "st1 { v13.s }[2], [x12]\n"
456       "b 42f\n"
457       "31:"  // Height 1: Partial direct writeback: partial_1_20
458       "tbz x13, #0, 42f\n"
459       "str s13, [x12, #0x0]\n"
460       "b 42f\n"
461       "32:"  // Height 1: Partial direct writeback: partial_2_16
462       "tbz x13, #1, 33f\n"
463       "str d12, [x12], #0x8\n"
464       "tbz x13, #0, 42f\n"
465       "st1 { v12.s }[2], [x12]\n"
466       "b 42f\n"
467       "33:"  // Height 1: Partial direct writeback: partial_1_16
468       "tbz x13, #0, 42f\n"
469       "str s12, [x12, #0x0]\n"
470       "b 42f\n"
471       "34:"  // Height 1: Partial direct writeback: partial_8_0
472       "tbz x13, #3, 38f\n"
473       "st1 { v8.4s }, [x12], #0x10\n"
474       "st1 { v9.4s }, [x12], #0x10\n"
475       "tbz x13, #2, 36f\n"
476       "st1 { v10.4s }, [x12], #0x10\n"
477       "tbz x13, #1, 35f\n"
478       "str d11, [x12], #0x8\n"
479       "tbz x13, #0, 42f\n"
480       "st1 { v11.s }[2], [x12]\n"
481       "b 42f\n"
482       "35:"  // Height 1: Partial direct writeback: partial_1_12
483       "tbz x13, #0, 42f\n"
484       "str s11, [x12, #0x0]\n"
485       "b 42f\n"
486       "36:"  // Height 1: Partial direct writeback: partial_2_8
487       "tbz x13, #1, 37f\n"
488       "str d10, [x12], #0x8\n"
489       "tbz x13, #0, 42f\n"
490       "st1 { v10.s }[2], [x12]\n"
491       "b 42f\n"
492       "37:"  // Height 1: Partial direct writeback: partial_1_8
493       "tbz x13, #0, 42f\n"
494       "str s10, [x12, #0x0]\n"
495       "b 42f\n"
496       "38:"  // Height 1: Partial direct writeback: partial_4_0
497       "tbz x13, #2, 40f\n"
498       "st1 { v8.4s }, [x12], #0x10\n"
499       "tbz x13, #1, 39f\n"
500       "str d9, [x12], #0x8\n"
501       "tbz x13, #0, 42f\n"
502       "st1 { v9.s }[2], [x12]\n"
503       "b 42f\n"
504       "39:"  // Height 1: Partial direct writeback: partial_1_4
505       "tbz x13, #0, 42f\n"
506       "str s9, [x12, #0x0]\n"
507       "b 42f\n"
508       "40:"  // Height 1: Partial direct writeback: partial_2_0
509       "tbz x13, #1, 41f\n"
510       "str d8, [x12], #0x8\n"
511       "tbz x13, #0, 42f\n"
512       "st1 { v8.s }[2], [x12]\n"
513       "b 42f\n"
514       "41:"  // Height 1: Partial direct writeback: partial_1_0
515       "str s8, [x12, #0x0]\n"
516       "42:"  // Height 1: Partial direct writeback: Done
517       "b 44f\n"
518       "43:"  // Height 1: Full writeback
519       "str q8, [x12, #0x0]\n"
520       "str q9, [x12, #0x10]\n"
521       "str q10, [x12, #0x20]\n"
522       "str q11, [x12, #0x30]\n"
523       "str q12, [x12, #0x40]\n"
524       "str q13, [x12, #0x50]\n"
525       "add x12, x12, #0x60\n"
526       "44:"  // Height 1: Writeback done
527       "subs x13, x13, #0x18\n"
528       "bgt 2b\n"
529       "b 178f\n"
530       "45:"  // Height 2
531       "ldr x19, [%x[args_ptr], %[offsetof_B_ptr]]\n"
532       "mov x14, %x[bias]\n"
533       "ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
534       "str x19, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
535       "mov x12, %x[output_ptr]\n"
536       "46:"  // Height 2: Column loop
537       "ldr x11, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
538       "ldr x19, [%x[args_ptr], %[offsetof_B_stride]]\n"
539       "add x10, x11, x19, LSL #1\n"
540       "add x9, x10, x19, LSL #1\n"
541       "add x28, x9, x19, LSL #1\n"
542       "add x27, x28, x19, LSL #1\n"
543       "add x26, x27, x19, LSL #1\n"
544       "add x19, x26, x19, LSL #1\n"
545       "str x19, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
546       "cmp x13, #0x14\n"
547       "bgt 47f\n"
548       "cmp x13, #0x10\n"
549       "mov x26, x11\n"
550       "bgt 47f\n"
551       "cmp x13, #0xc\n"
552       "mov x27, x11\n"
553       "bgt 47f\n"
554       "cmp x13, #0x8\n"
555       "mov x28, x11\n"
556       "bgt 47f\n"
557       "cmp x13, #0x4\n"
558       "mov x9, x11\n"
559       "bgt 47f\n"
560       "mov x10, x11\n"
561       "47:"  // Height 2: B setup done
562       "cbz x14, 48f\n"
563       "ldr q8, [x14, #0x0]\n"
564       "ldr q9, [x14, #0x10]\n"
565       "zip2 v14.2d, v8.2d, v8.2d\n"
566       "zip1 v8.2d, v8.2d, v8.2d\n"
567       "ldr q10, [x14, #0x20]\n"
568       "ldr q11, [x14, #0x30]\n"
569       "zip2 v15.2d, v9.2d, v9.2d\n"
570       "zip1 v9.2d, v9.2d, v9.2d\n"
571       "ldr q12, [x14, #0x40]\n"
572       "ldr q13, [x14, #0x50]\n"
573       "zip2 v16.2d, v10.2d, v10.2d\n"
574       "zip1 v10.2d, v10.2d, v10.2d\n"
575       "zip2 v17.2d, v11.2d, v11.2d\n"
576       "zip1 v11.2d, v11.2d, v11.2d\n"
577       "add x14, x14, #0x60\n"
578       "zip2 v18.2d, v12.2d, v12.2d\n"
579       "zip1 v12.2d, v12.2d, v12.2d\n"
580       "zip2 v19.2d, v13.2d, v13.2d\n"
581       "zip1 v13.2d, v13.2d, v13.2d\n"
582       "b 64f\n"
583       "48:"  // Height 2: no bias
584       "tbz %x[flags], #0, 63f\n"
585       "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
586       "cmp x13, #0x18\n"
587       "add x22, x12, x19, LSL #2\n"
588       "bge 61f\n"
589       "tbz x13, #4, 52f\n"
590       "ld1 { v9.4s }, [x12], #0x10\n"
591       "ld1 { v14.4s }, [x22], #0x10\n"
592       "ld1 { v10.4s }, [x12], #0x10\n"
593       "ld1 { v15.4s }, [x22], #0x10\n"
594       "ld1 { v11.4s }, [x12], #0x10\n"
595       "ld1 { v16.4s }, [x22], #0x10\n"
596       "ld1 { v12.4s }, [x12], #0x10\n"
597       "ld1 { v17.4s }, [x22], #0x10\n"
598       "tbz x13, #2, 50f\n"
599       "ld1 { v13.4s }, [x12], #0x10\n"
600       "ld1 { v18.4s }, [x22], #0x10\n"
601       "tbz x13, #1, 49f\n"
602       "ldr d20, [x12], #0x8\n"
603       "ldr d19, [x22], #0x8\n"
604       "mov x19, #0x58\n"
605       "tbz x13, #0, 60f\n"
606       "ld1 { v20.s }[2], [x12]\n"
607       "ld1 { v19.s }[2], [x22]\n"
608       "b 60f\n"
609       "49:"  // Height 2: Partial accumulate: partial_1_20
610       "mov x19, #0x50\n"
611       "tbz x13, #0, 60f\n"
612       "ldr s20, [x12, #0x0]\n"
613       "ldr s19, [x22, #0x0]\n"
614       "b 60f\n"
615       "50:"  // Height 2: Partial accumulate: partial_2_16
616       "tbz x13, #1, 51f\n"
617       "ldr d13, [x12], #0x8\n"
618       "ldr d18, [x22], #0x8\n"
619       "mov x19, #0x48\n"
620       "tbz x13, #0, 60f\n"
621       "ld1 { v13.s }[2], [x12]\n"
622       "ld1 { v18.s }[2], [x22]\n"
623       "b 60f\n"
624       "51:"  // Height 2: Partial accumulate: partial_1_16
625       "mov x19, #0x40\n"
626       "tbz x13, #0, 60f\n"
627       "ldr s13, [x12, #0x0]\n"
628       "ldr s18, [x22, #0x0]\n"
629       "b 60f\n"
630       "52:"  // Height 2: Partial accumulate: partial_8_0
631       "tbz x13, #3, 56f\n"
632       "ld1 { v9.4s }, [x12], #0x10\n"
633       "ld1 { v14.4s }, [x22], #0x10\n"
634       "ld1 { v10.4s }, [x12], #0x10\n"
635       "ld1 { v15.4s }, [x22], #0x10\n"
636       "tbz x13, #2, 54f\n"
637       "ld1 { v11.4s }, [x12], #0x10\n"
638       "ld1 { v16.4s }, [x22], #0x10\n"
639       "tbz x13, #1, 53f\n"
640       "ldr d12, [x12], #0x8\n"
641       "ldr d17, [x22], #0x8\n"
642       "mov x19, #0x38\n"
643       "tbz x13, #0, 60f\n"
644       "ld1 { v12.s }[2], [x12]\n"
645       "ld1 { v17.s }[2], [x22]\n"
646       "b 60f\n"
647       "53:"  // Height 2: Partial accumulate: partial_1_12
648       "mov x19, #0x30\n"
649       "tbz x13, #0, 60f\n"
650       "ldr s12, [x12, #0x0]\n"
651       "ldr s17, [x22, #0x0]\n"
652       "b 60f\n"
653       "54:"  // Height 2: Partial accumulate: partial_2_8
654       "tbz x13, #1, 55f\n"
655       "ldr d11, [x12], #0x8\n"
656       "ldr d16, [x22], #0x8\n"
657       "mov x19, #0x28\n"
658       "tbz x13, #0, 60f\n"
659       "ld1 { v11.s }[2], [x12]\n"
660       "ld1 { v16.s }[2], [x22]\n"
661       "b 60f\n"
662       "55:"  // Height 2: Partial accumulate: partial_1_8
663       "mov x19, #0x20\n"
664       "tbz x13, #0, 60f\n"
665       "ldr s11, [x12, #0x0]\n"
666       "ldr s16, [x22, #0x0]\n"
667       "b 60f\n"
668       "56:"  // Height 2: Partial accumulate: partial_4_0
669       "tbz x13, #2, 58f\n"
670       "ld1 { v9.4s }, [x12], #0x10\n"
671       "ld1 { v14.4s }, [x22], #0x10\n"
672       "tbz x13, #1, 57f\n"
673       "ldr d10, [x12], #0x8\n"
674       "ldr d15, [x22], #0x8\n"
675       "mov x19, #0x18\n"
676       "tbz x13, #0, 60f\n"
677       "ld1 { v10.s }[2], [x12]\n"
678       "ld1 { v15.s }[2], [x22]\n"
679       "b 60f\n"
680       "57:"  // Height 2: Partial accumulate: partial_1_4
681       "mov x19, #0x10\n"
682       "tbz x13, #0, 60f\n"
683       "ldr s10, [x12, #0x0]\n"
684       "ldr s15, [x22, #0x0]\n"
685       "b 60f\n"
686       "58:"  // Height 2: Partial accumulate: partial_2_0
687       "tbz x13, #1, 59f\n"
688       "ldr d9, [x12], #0x8\n"
689       "ldr d14, [x22], #0x8\n"
690       "mov x19, #0x8\n"
691       "tbz x13, #0, 60f\n"
692       "ld1 { v9.s }[2], [x12]\n"
693       "ld1 { v14.s }[2], [x22]\n"
694       "b 60f\n"
695       "59:"  // Height 2: Partial accumulate: partial_1_0
696       "ldr s9, [x12, #0x0]\n"
697       "ldr s14, [x22, #0x0]\n"
698       "mov x19, #0x0\n"
699       "60:"  // Height 2: Partial accumulate: Done
700       "sub x12, x12, x19\n"
701       "b 62f\n"
702       "61:"  // Height 2: full accumulate
703       "ldr q9, [x12, #0x0]\n"
704       "ldr q10, [x12, #0x10]\n"
705       "ldr q11, [x12, #0x20]\n"
706       "ldr q12, [x12, #0x30]\n"
707       "ldr q13, [x12, #0x40]\n"
708       "ldr q20, [x12, #0x50]\n"
709       "ldr q14, [x22, #0x0]\n"
710       "ldr q15, [x22, #0x10]\n"
711       "ldr q16, [x22, #0x20]\n"
712       "ldr q17, [x22, #0x30]\n"
713       "ldr q18, [x22, #0x40]\n"
714       "ldr q19, [x22, #0x50]\n"
715       "62:"  // Height 2: MMLA fixup
716       "zip1 v8.2d, v9.2d, v14.2d\n"
717       "zip2 v14.2d, v9.2d, v14.2d\n"
718       "zip1 v9.2d, v10.2d, v15.2d\n"
719       "zip2 v15.2d, v10.2d, v15.2d\n"
720       "zip1 v10.2d, v11.2d, v16.2d\n"
721       "zip2 v16.2d, v11.2d, v16.2d\n"
722       "zip1 v11.2d, v12.2d, v17.2d\n"
723       "zip2 v17.2d, v12.2d, v17.2d\n"
724       "zip1 v12.2d, v13.2d, v18.2d\n"
725       "zip2 v18.2d, v13.2d, v18.2d\n"
726       "zip1 v13.2d, v20.2d, v19.2d\n"
727       "zip2 v19.2d, v20.2d, v19.2d\n"
728       "b 64f\n"
729       "63:"  // Height 2: no accumulate
730       "movi v8.16b, #0x0\n"
731       "movi v9.16b, #0x0\n"
732       "movi v10.16b, #0x0\n"
733       "movi v11.16b, #0x0\n"
734       "movi v12.16b, #0x0\n"
735       "movi v13.16b, #0x0\n"
736       "movi v14.16b, #0x0\n"
737       "movi v15.16b, #0x0\n"
738       "movi v16.16b, #0x0\n"
739       "movi v17.16b, #0x0\n"
740       "movi v18.16b, #0x0\n"
741       "movi v19.16b, #0x0\n"
742       "64:"  // Height 2: setup done
743       "mov x25, #0x0\n"
744       "65:"  // Height 2: String loop
745       "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
746       "ldr w24, [x19, x25, LSL #0x2]\n"
747       "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
748       "tbz %x[flags], #3, 66f\n"
749       "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n"
750       "add x20, x20, x19, LSL #3\n"
751       "ldr x23, [x20, #0x0]\n"
752       "ldr x22, [x20, #0x8]\n"
753       "cbnz x25, 67f\n"
754       "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
755       "add x23, x23, x19, LSL #2\n"
756       "add x22, x22, x19, LSL #2\n"
757       "b 67f\n"
758       "66:"  // Height 2: setup direct input
759       "mov x23, %x[input_ptr]\n"
760       "add x22, x23, x19, LSL #2\n"
761       "67:"  // Height 2: input setup done
762       "cmp x24, #0x4\n"
763       "blt 70f\n"
764       "ld1 { v0.4s }, [x23], #0x10\n"
765       "ld1 { v1.4s }, [x22], #0x10\n"
766       "cmp x24, #0x8\n"
767       "ldr q4, [x11, #0x0]\n"
768       "ldr q5, [x11, #0x10]\n"
769       "ldr q6, [x10, #0x0]\n"
770       "ldr q7, [x10, #0x10]\n"
771       "blt 69f\n"
772       "68:"  // Height 2: Multiply loop: Main loop head
773       ".inst 0x0ea16800  // bfcvtn v0.4h, v0.4s\n"
774       ".inst 0x4ea16820  // bfcvtn2 v0.8h, v1.4s\n"
775       "ld1 { v1.4s }, [x22], #0x10\n"
776       ".inst 0x6e44ec08  // bfmmla v8.4s, v0.8h, v4.8h\n"
777       "ldr q4, [x9, #0x0]\n"
778       ".inst 0x6e45ec0e  // bfmmla v14.4s, v0.8h, v5.8h\n"
779       "ldr q5, [x9, #0x10]\n"
780       ".inst 0x6e46ec09  // bfmmla v9.4s, v0.8h, v6.8h\n"
781       "ldr q6, [x28, #0x0]\n"
782       ".inst 0x6e47ec0f  // bfmmla v15.4s, v0.8h, v7.8h\n"
783       "ldr q7, [x28, #0x10]\n"
784       ".inst 0x6e44ec0a  // bfmmla v10.4s, v0.8h, v4.8h\n"
785       "ldr q4, [x27, #0x0]\n"
786       ".inst 0x6e45ec10  // bfmmla v16.4s, v0.8h, v5.8h\n"
787       "ldr q5, [x27, #0x10]\n"
788       ".inst 0x6e46ec0b  // bfmmla v11.4s, v0.8h, v6.8h\n"
789       "ldr q6, [x26, #0x0]\n"
790       ".inst 0x6e47ec11  // bfmmla v17.4s, v0.8h, v7.8h\n"
791       "ldr q7, [x26, #0x10]\n"
792       "sub x24, x24, #0x4\n"
793       "cmp x24, #0x8\n"
794       "add x11, x11, #0x20\n"
795       ".inst 0x6e44ec0c  // bfmmla v12.4s, v0.8h, v4.8h\n"
796       "ldr q4, [x11, #0x0]\n"
797       "add x10, x10, #0x20\n"
798       ".inst 0x6e45ec12  // bfmmla v18.4s, v0.8h, v5.8h\n"
799       "ldr q5, [x11, #0x10]\n"
800       ".inst 0x6e46ec0d  // bfmmla v13.4s, v0.8h, v6.8h\n"
801       "ldr q6, [x10, #0x0]\n"
802       ".inst 0x6e47ec13  // bfmmla v19.4s, v0.8h, v7.8h\n"
803       "ld1 { v0.4s }, [x23], #0x10\n"
804       "add x9, x9, #0x20\n"
805       "ldr q7, [x10, #0x10]\n"
806       "add x28, x28, #0x20\n"
807       "add x27, x27, #0x20\n"
808       "add x26, x26, #0x20\n"
809       "bge 68b\n"
810       "69:"  // Height 2: Multiply loop: Single iteration only
811       ".inst 0x0ea16800  // bfcvtn v0.4h, v0.4s\n"
812       ".inst 0x4ea16820  // bfcvtn2 v0.8h, v1.4s\n"
813       ".inst 0x6e44ec08  // bfmmla v8.4s, v0.8h, v4.8h\n"
814       "ldr q4, [x9, #0x0]\n"
815       ".inst 0x6e45ec0e  // bfmmla v14.4s, v0.8h, v5.8h\n"
816       "ldr q5, [x9, #0x10]\n"
817       ".inst 0x6e46ec09  // bfmmla v9.4s, v0.8h, v6.8h\n"
818       "ldr q6, [x28, #0x0]\n"
819       ".inst 0x6e47ec0f  // bfmmla v15.4s, v0.8h, v7.8h\n"
820       "ldr q7, [x28, #0x10]\n"
821       ".inst 0x6e44ec0a  // bfmmla v10.4s, v0.8h, v4.8h\n"
822       "ldr q4, [x27, #0x0]\n"
823       ".inst 0x6e45ec10  // bfmmla v16.4s, v0.8h, v5.8h\n"
824       "ldr q5, [x27, #0x10]\n"
825       ".inst 0x6e46ec0b  // bfmmla v11.4s, v0.8h, v6.8h\n"
826       "ldr q6, [x26, #0x0]\n"
827       ".inst 0x6e47ec11  // bfmmla v17.4s, v0.8h, v7.8h\n"
828       "ldr q7, [x26, #0x10]\n"
829       "sub x24, x24, #0x4\n"
830       ".inst 0x6e44ec0c  // bfmmla v12.4s, v0.8h, v4.8h\n"
831       ".inst 0x6e45ec12  // bfmmla v18.4s, v0.8h, v5.8h\n"
832       "add x11, x11, #0x20\n"
833       "add x10, x10, #0x20\n"
834       ".inst 0x6e46ec0d  // bfmmla v13.4s, v0.8h, v6.8h\n"
835       ".inst 0x6e47ec13  // bfmmla v19.4s, v0.8h, v7.8h\n"
836       "add x9, x9, #0x20\n"
837       "add x28, x28, #0x20\n"
838       "add x27, x27, #0x20\n"
839       "add x26, x26, #0x20\n"
840       "70:"  // Height 2: Multiply loop: Main loop skip
841       "cbz x24, 73f\n"
842       "cbz x24, 73f\n"
843       "tbz x24, #1, 71f\n"
844       "ldr d0, [x23], #0x8\n"
845       "ldr d1, [x22], #0x8\n"
846       "tbz x24, #0, 72f\n"
847       "ld1 { v0.s }[2], [x23]\n"
848       "ld1 { v1.s }[2], [x22]\n"
849       "b 72f\n"
850       "71:"  // Height 2: Multiply loop: Ragged operand read: partial_1_0
851       "ldr s0, [x23, #0x0]\n"
852       "ldr s1, [x22, #0x0]\n"
853       "72:"  // Height 2: Multiply loop: Ragged operand read: Done
854       "ldr q4, [x11, #0x0]\n"
855       "ldr q5, [x11, #0x10]\n"
856       ".inst 0x0ea16800  // bfcvtn v0.4h, v0.4s\n"
857       ".inst 0x4ea16820  // bfcvtn2 v0.8h, v1.4s\n"
858       "ldr q6, [x10, #0x0]\n"
859       "ldr q7, [x10, #0x10]\n"
860       ".inst 0x6e44ec08  // bfmmla v8.4s, v0.8h, v4.8h\n"
861       ".inst 0x6e45ec0e  // bfmmla v14.4s, v0.8h, v5.8h\n"
862       "ldr q4, [x9, #0x0]\n"
863       "ldr q5, [x9, #0x10]\n"
864       ".inst 0x6e46ec09  // bfmmla v9.4s, v0.8h, v6.8h\n"
865       ".inst 0x6e47ec0f  // bfmmla v15.4s, v0.8h, v7.8h\n"
866       "ldr q6, [x28, #0x0]\n"
867       "ldr q7, [x28, #0x10]\n"
868       ".inst 0x6e44ec0a  // bfmmla v10.4s, v0.8h, v4.8h\n"
869       ".inst 0x6e45ec10  // bfmmla v16.4s, v0.8h, v5.8h\n"
870       "ldr q4, [x27, #0x0]\n"
871       "ldr q5, [x27, #0x10]\n"
872       ".inst 0x6e46ec0b  // bfmmla v11.4s, v0.8h, v6.8h\n"
873       ".inst 0x6e47ec11  // bfmmla v17.4s, v0.8h, v7.8h\n"
874       "ldr q6, [x26, #0x0]\n"
875       "ldr q7, [x26, #0x10]\n"
876       ".inst 0x6e44ec0c  // bfmmla v12.4s, v0.8h, v4.8h\n"
877       ".inst 0x6e45ec12  // bfmmla v18.4s, v0.8h, v5.8h\n"
878       ".inst 0x6e46ec0d  // bfmmla v13.4s, v0.8h, v6.8h\n"
879       ".inst 0x6e47ec13  // bfmmla v19.4s, v0.8h, v7.8h\n"
880       "add x11, x11, #0x20\n"
881       "add x10, x10, #0x20\n"
882       "add x9, x9, #0x20\n"
883       "add x28, x28, #0x20\n"
884       "add x27, x27, #0x20\n"
885       "add x26, x26, #0x20\n"
886       "73:"  // Height 2: Multiply loop: No odd multiplies
887       "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
888       "add x25, x25, #0x1\n"
889       "cmp x25, x19\n"
890       "bne 65b\n"
891       "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
892       "uzp1 v4.2d, v8.2d, v14.2d\n"
893       "uzp2 v8.2d, v8.2d, v14.2d\n"
894       "add x22, x12, x19, LSL #2\n"
895       "uzp1 v14.2d, v9.2d, v15.2d\n"
896       "uzp2 v9.2d, v9.2d, v15.2d\n"
897       "uzp1 v15.2d, v10.2d, v16.2d\n"
898       "uzp2 v10.2d, v10.2d, v16.2d\n"
899       "uzp1 v16.2d, v11.2d, v17.2d\n"
900       "uzp2 v11.2d, v11.2d, v17.2d\n"
901       "uzp1 v17.2d, v12.2d, v18.2d\n"
902       "uzp2 v12.2d, v12.2d, v18.2d\n"
903       "uzp1 v18.2d, v13.2d, v19.2d\n"
904       "uzp2 v13.2d, v13.2d, v19.2d\n"
905       "tbz %x[flags], #1, 74f\n"
906       "add x19, %x[args_ptr], %[offset_max]\n"
907       "ld1r { v1.4s }, [x19]\n"
908       "add x19, %x[args_ptr], %[offset_min]\n"
909       "ld1r { v0.4s }, [x19]\n"
910       "fmin v4.4s, v4.4s, v1.4s\n"
911       "fmin v14.4s, v14.4s, v1.4s\n"
912       "fmin v15.4s, v15.4s, v1.4s\n"
913       "fmin v16.4s, v16.4s, v1.4s\n"
914       "fmin v17.4s, v17.4s, v1.4s\n"
915       "fmin v18.4s, v18.4s, v1.4s\n"
916       "fmin v8.4s, v8.4s, v1.4s\n"
917       "fmin v9.4s, v9.4s, v1.4s\n"
918       "fmin v10.4s, v10.4s, v1.4s\n"
919       "fmin v11.4s, v11.4s, v1.4s\n"
920       "fmin v12.4s, v12.4s, v1.4s\n"
921       "fmin v13.4s, v13.4s, v1.4s\n"
922       "fmax v4.4s, v4.4s, v0.4s\n"
923       "fmax v14.4s, v14.4s, v0.4s\n"
924       "fmax v15.4s, v15.4s, v0.4s\n"
925       "fmax v16.4s, v16.4s, v0.4s\n"
926       "fmax v17.4s, v17.4s, v0.4s\n"
927       "fmax v18.4s, v18.4s, v0.4s\n"
928       "fmax v8.4s, v8.4s, v0.4s\n"
929       "fmax v9.4s, v9.4s, v0.4s\n"
930       "fmax v10.4s, v10.4s, v0.4s\n"
931       "fmax v11.4s, v11.4s, v0.4s\n"
932       "fmax v12.4s, v12.4s, v0.4s\n"
933       "fmax v13.4s, v13.4s, v0.4s\n"
934       "74:"  // Height 2: No activation
935       "cmp x13, #0x18\n"
936       "bge 87f\n"
937       "tbz x13, #4, 78f\n"
938       "st1 { v4.4s }, [x12], #0x10\n"
939       "st1 { v14.4s }, [x12], #0x10\n"
940       "st1 { v15.4s }, [x12], #0x10\n"
941       "st1 { v16.4s }, [x12], #0x10\n"
942       "st1 { v8.4s }, [x22], #0x10\n"
943       "st1 { v9.4s }, [x22], #0x10\n"
944       "st1 { v10.4s }, [x22], #0x10\n"
945       "st1 { v11.4s }, [x22], #0x10\n"
946       "tbz x13, #2, 76f\n"
947       "st1 { v17.4s }, [x12], #0x10\n"
948       "st1 { v12.4s }, [x22], #0x10\n"
949       "tbz x13, #1, 75f\n"
950       "str d18, [x12], #0x8\n"
951       "str d13, [x22], #0x8\n"
952       "tbz x13, #0, 86f\n"
953       "st1 { v18.s }[2], [x12]\n"
954       "st1 { v13.s }[2], [x22]\n"
955       "b 86f\n"
956       "75:"  // Height 2: Partial direct writeback: partial_1_20
957       "tbz x13, #0, 86f\n"
958       "str s18, [x12, #0x0]\n"
959       "str s13, [x22, #0x0]\n"
960       "b 86f\n"
961       "76:"  // Height 2: Partial direct writeback: partial_2_16
962       "tbz x13, #1, 77f\n"
963       "str d17, [x12], #0x8\n"
964       "str d12, [x22], #0x8\n"
965       "tbz x13, #0, 86f\n"
966       "st1 { v17.s }[2], [x12]\n"
967       "st1 { v12.s }[2], [x22]\n"
968       "b 86f\n"
969       "77:"  // Height 2: Partial direct writeback: partial_1_16
970       "tbz x13, #0, 86f\n"
971       "str s17, [x12, #0x0]\n"
972       "str s12, [x22, #0x0]\n"
973       "b 86f\n"
974       "78:"  // Height 2: Partial direct writeback: partial_8_0
975       "tbz x13, #3, 82f\n"
976       "st1 { v4.4s }, [x12], #0x10\n"
977       "st1 { v14.4s }, [x12], #0x10\n"
978       "st1 { v8.4s }, [x22], #0x10\n"
979       "st1 { v9.4s }, [x22], #0x10\n"
980       "tbz x13, #2, 80f\n"
981       "st1 { v15.4s }, [x12], #0x10\n"
982       "st1 { v10.4s }, [x22], #0x10\n"
983       "tbz x13, #1, 79f\n"
984       "str d16, [x12], #0x8\n"
985       "str d11, [x22], #0x8\n"
986       "tbz x13, #0, 86f\n"
987       "st1 { v16.s }[2], [x12]\n"
988       "st1 { v11.s }[2], [x22]\n"
989       "b 86f\n"
990       "79:"  // Height 2: Partial direct writeback: partial_1_12
991       "tbz x13, #0, 86f\n"
992       "str s16, [x12, #0x0]\n"
993       "str s11, [x22, #0x0]\n"
994       "b 86f\n"
995       "80:"  // Height 2: Partial direct writeback: partial_2_8
996       "tbz x13, #1, 81f\n"
997       "str d15, [x12], #0x8\n"
998       "str d10, [x22], #0x8\n"
999       "tbz x13, #0, 86f\n"
1000       "st1 { v15.s }[2], [x12]\n"
1001       "st1 { v10.s }[2], [x22]\n"
1002       "b 86f\n"
1003       "81:"  // Height 2: Partial direct writeback: partial_1_8
1004       "tbz x13, #0, 86f\n"
1005       "str s15, [x12, #0x0]\n"
1006       "str s10, [x22, #0x0]\n"
1007       "b 86f\n"
1008       "82:"  // Height 2: Partial direct writeback: partial_4_0
1009       "tbz x13, #2, 84f\n"
1010       "st1 { v4.4s }, [x12], #0x10\n"
1011       "st1 { v8.4s }, [x22], #0x10\n"
1012       "tbz x13, #1, 83f\n"
1013       "str d14, [x12], #0x8\n"
1014       "str d9, [x22], #0x8\n"
1015       "tbz x13, #0, 86f\n"
1016       "st1 { v14.s }[2], [x12]\n"
1017       "st1 { v9.s }[2], [x22]\n"
1018       "b 86f\n"
1019       "83:"  // Height 2: Partial direct writeback: partial_1_4
1020       "tbz x13, #0, 86f\n"
1021       "str s14, [x12, #0x0]\n"
1022       "str s9, [x22, #0x0]\n"
1023       "b 86f\n"
1024       "84:"  // Height 2: Partial direct writeback: partial_2_0
1025       "tbz x13, #1, 85f\n"
1026       "str d4, [x12], #0x8\n"
1027       "str d8, [x22], #0x8\n"
1028       "tbz x13, #0, 86f\n"
1029       "st1 { v4.s }[2], [x12]\n"
1030       "st1 { v8.s }[2], [x22]\n"
1031       "b 86f\n"
1032       "85:"  // Height 2: Partial direct writeback: partial_1_0
1033       "str s4, [x12, #0x0]\n"
1034       "str s8, [x22, #0x0]\n"
1035       "86:"  // Height 2: Partial direct writeback: Done
1036       "b 88f\n"
1037       "87:"  // Height 2: Full writeback
1038       "str q4, [x12, #0x0]\n"
1039       "str q14, [x12, #0x10]\n"
1040       "str q15, [x12, #0x20]\n"
1041       "str q16, [x12, #0x30]\n"
1042       "str q17, [x12, #0x40]\n"
1043       "str q18, [x12, #0x50]\n"
1044       "add x12, x12, #0x60\n"
1045       "str q8, [x22, #0x0]\n"
1046       "str q9, [x22, #0x10]\n"
1047       "str q10, [x22, #0x20]\n"
1048       "str q11, [x22, #0x30]\n"
1049       "str q12, [x22, #0x40]\n"
1050       "str q13, [x22, #0x50]\n"
1051       "88:"  // Height 2: Writeback done
1052       "subs x13, x13, #0x18\n"
1053       "bgt 46b\n"
1054       "b 178f\n"
1055       "89:"  // Height 3
1056       "ldr x19, [%x[args_ptr], %[offsetof_B_ptr]]\n"
1057       "mov x14, %x[bias]\n"
1058       "ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
1059       "str x19, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
1060       "mov x12, %x[output_ptr]\n"
1061       "90:"  // Height 3: Column loop
1062       "ldr x11, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
1063       "ldr x19, [%x[args_ptr], %[offsetof_B_stride]]\n"
1064       "add x10, x11, x19, LSL #1\n"
1065       "add x9, x10, x19, LSL #1\n"
1066       "add x28, x9, x19, LSL #1\n"
1067       "add x27, x28, x19, LSL #1\n"
1068       "add x26, x27, x19, LSL #1\n"
1069       "add x19, x26, x19, LSL #1\n"
1070       "str x19, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
1071       "cmp x13, #0x14\n"
1072       "bgt 91f\n"
1073       "cmp x13, #0x10\n"
1074       "mov x26, x11\n"
1075       "bgt 91f\n"
1076       "cmp x13, #0xc\n"
1077       "mov x27, x11\n"
1078       "bgt 91f\n"
1079       "cmp x13, #0x8\n"
1080       "mov x28, x11\n"
1081       "bgt 91f\n"
1082       "cmp x13, #0x4\n"
1083       "mov x9, x11\n"
1084       "bgt 91f\n"
1085       "mov x10, x11\n"
1086       "91:"  // Height 3: B setup done
1087       "cbz x14, 92f\n"
1088       "ldr q8, [x14, #0x0]\n"
1089       "ldr q9, [x14, #0x10]\n"
1090       "zip2 v14.2d, v8.2d, v8.2d\n"
1091       "zip1 v8.2d, v8.2d, v8.2d\n"
1092       "ldr q10, [x14, #0x20]\n"
1093       "ldr q11, [x14, #0x30]\n"
1094       "zip2 v15.2d, v9.2d, v9.2d\n"
1095       "zip1 v9.2d, v9.2d, v9.2d\n"
1096       "ldr q12, [x14, #0x40]\n"
1097       "ldr q13, [x14, #0x50]\n"
1098       "zip2 v16.2d, v10.2d, v10.2d\n"
1099       "zip1 v10.2d, v10.2d, v10.2d\n"
1100       "zip2 v17.2d, v11.2d, v11.2d\n"
1101       "zip1 v11.2d, v11.2d, v11.2d\n"
1102       "add x14, x14, #0x60\n"
1103       "zip2 v18.2d, v12.2d, v12.2d\n"
1104       "zip1 v12.2d, v12.2d, v12.2d\n"
1105       "zip2 v19.2d, v13.2d, v13.2d\n"
1106       "zip1 v13.2d, v13.2d, v13.2d\n"
1107       "mov v20.16b, v8.16b\n"
1108       "mov v26.16b, v14.16b\n"
1109       "mov v21.16b, v9.16b\n"
1110       "mov v27.16b, v15.16b\n"
1111       "mov v22.16b, v10.16b\n"
1112       "mov v28.16b, v16.16b\n"
1113       "mov v23.16b, v11.16b\n"
1114       "mov v29.16b, v17.16b\n"
1115       "mov v24.16b, v12.16b\n"
1116       "mov v30.16b, v18.16b\n"
1117       "mov v25.16b, v13.16b\n"
1118       "mov v31.16b, v19.16b\n"
1119       "b 108f\n"
1120       "92:"  // Height 3: no bias
1121       "tbz %x[flags], #0, 107f\n"
1122       "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
1123       "add x22, x12, x19, LSL #2\n"
1124       "cmp x13, #0x18\n"
1125       "add x21, x22, x19, LSL #2\n"
1126       "bge 105f\n"
1127       "tbz x13, #4, 96f\n"
1128       "ld1 { v9.4s }, [x12], #0x10\n"
1129       "ld1 { v14.4s }, [x22], #0x10\n"
1130       "ld1 { v21.4s }, [x21], #0x10\n"
1131       "ld1 { v10.4s }, [x12], #0x10\n"
1132       "ld1 { v15.4s }, [x22], #0x10\n"
1133       "ld1 { v22.4s }, [x21], #0x10\n"
1134       "ld1 { v11.4s }, [x12], #0x10\n"
1135       "ld1 { v16.4s }, [x22], #0x10\n"
1136       "ld1 { v23.4s }, [x21], #0x10\n"
1137       "ld1 { v12.4s }, [x12], #0x10\n"
1138       "ld1 { v17.4s }, [x22], #0x10\n"
1139       "ld1 { v24.4s }, [x21], #0x10\n"
1140       "tbz x13, #2, 94f\n"
1141       "ld1 { v13.4s }, [x12], #0x10\n"
1142       "ld1 { v18.4s }, [x22], #0x10\n"
1143       "ld1 { v25.4s }, [x21], #0x10\n"
1144       "tbz x13, #1, 93f\n"
1145       "ldr d20, [x12], #0x8\n"
1146       "ldr d19, [x22], #0x8\n"
1147       "mov x19, #0x58\n"
1148       "ldr d4, [x21], #0x8\n"
1149       "tbz x13, #0, 104f\n"
1150       "ld1 { v20.s }[2], [x12]\n"
1151       "ld1 { v19.s }[2], [x22]\n"
1152       "ld1 { v4.s }[2], [x21]\n"
1153       "b 104f\n"
1154       "93:"  // Height 3: Partial accumulate: partial_1_20
1155       "mov x19, #0x50\n"
1156       "tbz x13, #0, 104f\n"
1157       "ldr s20, [x12, #0x0]\n"
1158       "ldr s19, [x22, #0x0]\n"
1159       "ldr s4, [x21, #0x0]\n"
1160       "b 104f\n"
1161       "94:"  // Height 3: Partial accumulate: partial_2_16
1162       "tbz x13, #1, 95f\n"
1163       "ldr d13, [x12], #0x8\n"
1164       "ldr d18, [x22], #0x8\n"
1165       "mov x19, #0x48\n"
1166       "ldr d25, [x21], #0x8\n"
1167       "tbz x13, #0, 104f\n"
1168       "ld1 { v13.s }[2], [x12]\n"
1169       "ld1 { v18.s }[2], [x22]\n"
1170       "ld1 { v25.s }[2], [x21]\n"
1171       "b 104f\n"
1172       "95:"  // Height 3: Partial accumulate: partial_1_16
1173       "mov x19, #0x40\n"
1174       "tbz x13, #0, 104f\n"
1175       "ldr s13, [x12, #0x0]\n"
1176       "ldr s18, [x22, #0x0]\n"
1177       "ldr s25, [x21, #0x0]\n"
1178       "b 104f\n"
1179       "96:"  // Height 3: Partial accumulate: partial_8_0
1180       "tbz x13, #3, 100f\n"
1181       "ld1 { v9.4s }, [x12], #0x10\n"
1182       "ld1 { v14.4s }, [x22], #0x10\n"
1183       "ld1 { v21.4s }, [x21], #0x10\n"
1184       "ld1 { v10.4s }, [x12], #0x10\n"
1185       "ld1 { v15.4s }, [x22], #0x10\n"
1186       "ld1 { v22.4s }, [x21], #0x10\n"
1187       "tbz x13, #2, 98f\n"
1188       "ld1 { v11.4s }, [x12], #0x10\n"
1189       "ld1 { v16.4s }, [x22], #0x10\n"
1190       "ld1 { v23.4s }, [x21], #0x10\n"
1191       "tbz x13, #1, 97f\n"
1192       "ldr d12, [x12], #0x8\n"
1193       "ldr d17, [x22], #0x8\n"
1194       "mov x19, #0x38\n"
1195       "ldr d24, [x21], #0x8\n"
1196       "tbz x13, #0, 104f\n"
1197       "ld1 { v12.s }[2], [x12]\n"
1198       "ld1 { v17.s }[2], [x22]\n"
1199       "ld1 { v24.s }[2], [x21]\n"
1200       "b 104f\n"
1201       "97:"  // Height 3: Partial accumulate: partial_1_12
1202       "mov x19, #0x30\n"
1203       "tbz x13, #0, 104f\n"
1204       "ldr s12, [x12, #0x0]\n"
1205       "ldr s17, [x22, #0x0]\n"
1206       "ldr s24, [x21, #0x0]\n"
1207       "b 104f\n"
1208       "98:"  // Height 3: Partial accumulate: partial_2_8
1209       "tbz x13, #1, 99f\n"
1210       "ldr d11, [x12], #0x8\n"
1211       "ldr d16, [x22], #0x8\n"
1212       "mov x19, #0x28\n"
1213       "ldr d23, [x21], #0x8\n"
1214       "tbz x13, #0, 104f\n"
1215       "ld1 { v11.s }[2], [x12]\n"
1216       "ld1 { v16.s }[2], [x22]\n"
1217       "ld1 { v23.s }[2], [x21]\n"
1218       "b 104f\n"
1219       "99:"  // Height 3: Partial accumulate: partial_1_8
1220       "mov x19, #0x20\n"
1221       "tbz x13, #0, 104f\n"
1222       "ldr s11, [x12, #0x0]\n"
1223       "ldr s16, [x22, #0x0]\n"
1224       "ldr s23, [x21, #0x0]\n"
1225       "b 104f\n"
1226       "100:"  // Height 3: Partial accumulate: partial_4_0
1227       "tbz x13, #2, 102f\n"
1228       "ld1 { v9.4s }, [x12], #0x10\n"
1229       "ld1 { v14.4s }, [x22], #0x10\n"
1230       "ld1 { v21.4s }, [x21], #0x10\n"
1231       "tbz x13, #1, 101f\n"
1232       "ldr d10, [x12], #0x8\n"
1233       "ldr d15, [x22], #0x8\n"
1234       "mov x19, #0x18\n"
1235       "ldr d22, [x21], #0x8\n"
1236       "tbz x13, #0, 104f\n"
1237       "ld1 { v10.s }[2], [x12]\n"
1238       "ld1 { v15.s }[2], [x22]\n"
1239       "ld1 { v22.s }[2], [x21]\n"
1240       "b 104f\n"
1241       "101:"  // Height 3: Partial accumulate: partial_1_4
1242       "mov x19, #0x10\n"
1243       "tbz x13, #0, 104f\n"
1244       "ldr s10, [x12, #0x0]\n"
1245       "ldr s15, [x22, #0x0]\n"
1246       "ldr s22, [x21, #0x0]\n"
1247       "b 104f\n"
1248       "102:"  // Height 3: Partial accumulate: partial_2_0
1249       "tbz x13, #1, 103f\n"
1250       "ldr d9, [x12], #0x8\n"
1251       "ldr d14, [x22], #0x8\n"
1252       "mov x19, #0x8\n"
1253       "ldr d21, [x21], #0x8\n"
1254       "tbz x13, #0, 104f\n"
1255       "ld1 { v9.s }[2], [x12]\n"
1256       "ld1 { v14.s }[2], [x22]\n"
1257       "ld1 { v21.s }[2], [x21]\n"
1258       "b 104f\n"
1259       "103:"  // Height 3: Partial accumulate: partial_1_0
1260       "ldr s9, [x12, #0x0]\n"
1261       "ldr s14, [x22, #0x0]\n"
1262       "mov x19, #0x0\n"
1263       "ldr s21, [x21, #0x0]\n"
1264       "104:"  // Height 3: Partial accumulate: Done
1265       "sub x12, x12, x19\n"
1266       "b 106f\n"
1267       "105:"  // Height 3: full accumulate
1268       "ldr q9, [x12, #0x0]\n"
1269       "ldr q10, [x12, #0x10]\n"
1270       "ldr q11, [x12, #0x20]\n"
1271       "ldr q12, [x12, #0x30]\n"
1272       "ldr q13, [x12, #0x40]\n"
1273       "ldr q20, [x12, #0x50]\n"
1274       "ldr q14, [x22, #0x0]\n"
1275       "ldr q15, [x22, #0x10]\n"
1276       "ldr q16, [x22, #0x20]\n"
1277       "ldr q17, [x22, #0x30]\n"
1278       "ldr q18, [x22, #0x40]\n"
1279       "ldr q19, [x22, #0x50]\n"
1280       "ldr q21, [x21, #0x0]\n"
1281       "ldr q22, [x21, #0x10]\n"
1282       "ldr q23, [x21, #0x20]\n"
1283       "ldr q24, [x21, #0x30]\n"
1284       "ldr q25, [x21, #0x40]\n"
1285       "ldr q4, [x21, #0x50]\n"
1286       "106:"  // Height 3: MMLA fixup
1287       "zip1 v8.2d, v9.2d, v14.2d\n"
1288       "zip2 v14.2d, v9.2d, v14.2d\n"
1289       "zip1 v9.2d, v10.2d, v15.2d\n"
1290       "zip2 v15.2d, v10.2d, v15.2d\n"
1291       "zip1 v10.2d, v11.2d, v16.2d\n"
1292       "zip2 v16.2d, v11.2d, v16.2d\n"
1293       "zip1 v11.2d, v12.2d, v17.2d\n"
1294       "zip2 v17.2d, v12.2d, v17.2d\n"
1295       "zip1 v12.2d, v13.2d, v18.2d\n"
1296       "zip2 v18.2d, v13.2d, v18.2d\n"
1297       "zip1 v13.2d, v20.2d, v19.2d\n"
1298       "zip2 v19.2d, v20.2d, v19.2d\n"
1299       "zip1 v20.2d, v21.2d, v26.2d\n"
1300       "zip2 v26.2d, v21.2d, v26.2d\n"
1301       "zip1 v21.2d, v22.2d, v27.2d\n"
1302       "zip2 v27.2d, v22.2d, v27.2d\n"
1303       "zip1 v22.2d, v23.2d, v28.2d\n"
1304       "zip2 v28.2d, v23.2d, v28.2d\n"
1305       "zip1 v23.2d, v24.2d, v29.2d\n"
1306       "zip2 v29.2d, v24.2d, v29.2d\n"
1307       "zip1 v24.2d, v25.2d, v30.2d\n"
1308       "zip2 v30.2d, v25.2d, v30.2d\n"
1309       "zip1 v25.2d, v4.2d, v31.2d\n"
1310       "zip2 v31.2d, v4.2d, v31.2d\n"
1311       "b 108f\n"
1312       "107:"  // Height 3: no accumulate
1313       "movi v8.16b, #0x0\n"
1314       "movi v9.16b, #0x0\n"
1315       "movi v10.16b, #0x0\n"
1316       "movi v11.16b, #0x0\n"
1317       "movi v12.16b, #0x0\n"
1318       "movi v13.16b, #0x0\n"
1319       "movi v14.16b, #0x0\n"
1320       "movi v15.16b, #0x0\n"
1321       "movi v16.16b, #0x0\n"
1322       "movi v17.16b, #0x0\n"
1323       "movi v18.16b, #0x0\n"
1324       "movi v19.16b, #0x0\n"
1325       "movi v20.16b, #0x0\n"
1326       "movi v21.16b, #0x0\n"
1327       "movi v22.16b, #0x0\n"
1328       "movi v23.16b, #0x0\n"
1329       "movi v24.16b, #0x0\n"
1330       "movi v25.16b, #0x0\n"
1331       "movi v26.16b, #0x0\n"
1332       "movi v27.16b, #0x0\n"
1333       "movi v28.16b, #0x0\n"
1334       "movi v29.16b, #0x0\n"
1335       "movi v30.16b, #0x0\n"
1336       "movi v31.16b, #0x0\n"
1337       "108:"  // Height 3: setup done
1338       "mov x25, #0x0\n"
1339       "109:"  // Height 3: String loop
1340       "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
1341       "ldr w24, [x19, x25, LSL #0x2]\n"
1342       "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
1343       "tbz %x[flags], #3, 110f\n"
1344       "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n"
1345       "add x20, x20, x19, LSL #3\n"
1346       "ldr x23, [x20, #0x0]\n"
1347       "ldr x22, [x20, #0x8]\n"
1348       "ldr x21, [x20, #0x10]\n"
1349       "cbnz x25, 111f\n"
1350       "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
1351       "add x23, x23, x19, LSL #2\n"
1352       "add x22, x22, x19, LSL #2\n"
1353       "add x21, x21, x19, LSL #2\n"
1354       "b 111f\n"
1355       "110:"  // Height 3: setup direct input
1356       "mov x23, %x[input_ptr]\n"
1357       "add x22, x23, x19, LSL #2\n"
1358       "add x21, x22, x19, LSL #2\n"
1359       "111:"  // Height 3: input setup done
1360       "cmp x24, #0x4\n"
1361       "blt 114f\n"
1362       "ld1 { v0.4s }, [x23], #0x10\n"
1363       "ld1 { v1.4s }, [x22], #0x10\n"
1364       "cmp x24, #0x8\n"
1365       "ld1 { v2.4s }, [x21], #0x10\n"
1366       "ldr q4, [x11, #0x0]\n"
1367       "ldr q5, [x11, #0x10]\n"
1368       "ldr q6, [x10, #0x0]\n"
1369       "ldr q7, [x10, #0x10]\n"
1370       "blt 113f\n"
1371       "112:"  // Height 3: Multiply loop: Main loop head
1372       ".inst 0x0ea16800  // bfcvtn v0.4h, v0.4s\n"
1373       ".inst 0x4ea16820  // bfcvtn2 v0.8h, v1.4s\n"
1374       "ld1 { v1.4s }, [x22], #0x10\n"
1375       ".inst 0x6e44ec08  // bfmmla v8.4s, v0.8h, v4.8h\n"
1376       ".inst 0x0ea16842  // bfcvtn v2.4h, v2.4s\n"
1377       ".inst 0x6e44ec54  // bfmmla v20.4s, v2.8h, v4.8h\n"
1378       "ldr q4, [x9, #0x0]\n"
1379       ".inst 0x6e45ec0e  // bfmmla v14.4s, v0.8h, v5.8h\n"
1380       ".inst 0x6e45ec5a  // bfmmla v26.4s, v2.8h, v5.8h\n"
1381       "ldr q5, [x9, #0x10]\n"
1382       ".inst 0x6e46ec09  // bfmmla v9.4s, v0.8h, v6.8h\n"
1383       "sub x24, x24, #0x4\n"
1384       ".inst 0x6e46ec55  // bfmmla v21.4s, v2.8h, v6.8h\n"
1385       "ldr q6, [x28, #0x0]\n"
1386       ".inst 0x6e47ec0f  // bfmmla v15.4s, v0.8h, v7.8h\n"
1387       "cmp x24, #0x8\n"
1388       ".inst 0x6e47ec5b  // bfmmla v27.4s, v2.8h, v7.8h\n"
1389       "ldr q7, [x28, #0x10]\n"
1390       ".inst 0x6e44ec0a  // bfmmla v10.4s, v0.8h, v4.8h\n"
1391       "add x11, x11, #0x20\n"
1392       ".inst 0x6e44ec56  // bfmmla v22.4s, v2.8h, v4.8h\n"
1393       "ldr q4, [x27, #0x0]\n"
1394       ".inst 0x6e45ec10  // bfmmla v16.4s, v0.8h, v5.8h\n"
1395       "add x10, x10, #0x20\n"
1396       ".inst 0x6e45ec5c  // bfmmla v28.4s, v2.8h, v5.8h\n"
1397       "ldr q5, [x27, #0x10]\n"
1398       ".inst 0x6e46ec0b  // bfmmla v11.4s, v0.8h, v6.8h\n"
1399       "add x9, x9, #0x20\n"
1400       ".inst 0x6e46ec57  // bfmmla v23.4s, v2.8h, v6.8h\n"
1401       "ldr q6, [x26, #0x0]\n"
1402       ".inst 0x6e47ec11  // bfmmla v17.4s, v0.8h, v7.8h\n"
1403       "add x28, x28, #0x20\n"
1404       ".inst 0x6e47ec5d  // bfmmla v29.4s, v2.8h, v7.8h\n"
1405       "ldr q7, [x26, #0x10]\n"
1406       "add x27, x27, #0x20\n"
1407       ".inst 0x6e44ec0c  // bfmmla v12.4s, v0.8h, v4.8h\n"
1408       "add x26, x26, #0x20\n"
1409       ".inst 0x6e44ec58  // bfmmla v24.4s, v2.8h, v4.8h\n"
1410       "ldr q4, [x11, #0x0]\n"
1411       ".inst 0x6e45ec12  // bfmmla v18.4s, v0.8h, v5.8h\n"
1412       ".inst 0x6e45ec5e  // bfmmla v30.4s, v2.8h, v5.8h\n"
1413       "ldr q5, [x11, #0x10]\n"
1414       ".inst 0x6e46ec0d  // bfmmla v13.4s, v0.8h, v6.8h\n"
1415       ".inst 0x6e46ec59  // bfmmla v25.4s, v2.8h, v6.8h\n"
1416       "ldr q6, [x10, #0x0]\n"
1417       ".inst 0x6e47ec13  // bfmmla v19.4s, v0.8h, v7.8h\n"
1418       "ld1 { v0.4s }, [x23], #0x10\n"
1419       ".inst 0x6e47ec5f  // bfmmla v31.4s, v2.8h, v7.8h\n"
1420       "ld1 { v2.4s }, [x21], #0x10\n"
1421       "ldr q7, [x10, #0x10]\n"
1422       "bge 112b\n"
1423       "113:"  // Height 3: Multiply loop: Single iteration only
1424       ".inst 0x0ea16800  // bfcvtn v0.4h, v0.4s\n"
1425       ".inst 0x4ea16820  // bfcvtn2 v0.8h, v1.4s\n"
1426       ".inst 0x6e44ec08  // bfmmla v8.4s, v0.8h, v4.8h\n"
1427       "sub x24, x24, #0x4\n"
1428       ".inst 0x0ea16842  // bfcvtn v2.4h, v2.4s\n"
1429       ".inst 0x6e44ec54  // bfmmla v20.4s, v2.8h, v4.8h\n"
1430       "ldr q4, [x9, #0x0]\n"
1431       ".inst 0x6e45ec0e  // bfmmla v14.4s, v0.8h, v5.8h\n"
1432       ".inst 0x6e45ec5a  // bfmmla v26.4s, v2.8h, v5.8h\n"
1433       "ldr q5, [x9, #0x10]\n"
1434       ".inst 0x6e46ec09  // bfmmla v9.4s, v0.8h, v6.8h\n"
1435       "add x11, x11, #0x20\n"
1436       ".inst 0x6e46ec55  // bfmmla v21.4s, v2.8h, v6.8h\n"
1437       "ldr q6, [x28, #0x0]\n"
1438       ".inst 0x6e47ec0f  // bfmmla v15.4s, v0.8h, v7.8h\n"
1439       "add x10, x10, #0x20\n"
1440       ".inst 0x6e47ec5b  // bfmmla v27.4s, v2.8h, v7.8h\n"
1441       "ldr q7, [x28, #0x10]\n"
1442       ".inst 0x6e44ec0a  // bfmmla v10.4s, v0.8h, v4.8h\n"
1443       "add x9, x9, #0x20\n"
1444       ".inst 0x6e44ec56  // bfmmla v22.4s, v2.8h, v4.8h\n"
1445       "ldr q4, [x27, #0x0]\n"
1446       ".inst 0x6e45ec10  // bfmmla v16.4s, v0.8h, v5.8h\n"
1447       "add x28, x28, #0x20\n"
1448       ".inst 0x6e45ec5c  // bfmmla v28.4s, v2.8h, v5.8h\n"
1449       "ldr q5, [x27, #0x10]\n"
1450       ".inst 0x6e46ec0b  // bfmmla v11.4s, v0.8h, v6.8h\n"
1451       "add x27, x27, #0x20\n"
1452       ".inst 0x6e46ec57  // bfmmla v23.4s, v2.8h, v6.8h\n"
1453       "ldr q6, [x26, #0x0]\n"
1454       ".inst 0x6e47ec11  // bfmmla v17.4s, v0.8h, v7.8h\n"
1455       ".inst 0x6e47ec5d  // bfmmla v29.4s, v2.8h, v7.8h\n"
1456       "ldr q7, [x26, #0x10]\n"
1457       "add x26, x26, #0x20\n"
1458       ".inst 0x6e44ec0c  // bfmmla v12.4s, v0.8h, v4.8h\n"
1459       ".inst 0x6e44ec58  // bfmmla v24.4s, v2.8h, v4.8h\n"
1460       ".inst 0x6e45ec12  // bfmmla v18.4s, v0.8h, v5.8h\n"
1461       ".inst 0x6e45ec5e  // bfmmla v30.4s, v2.8h, v5.8h\n"
1462       ".inst 0x6e46ec0d  // bfmmla v13.4s, v0.8h, v6.8h\n"
1463       ".inst 0x6e46ec59  // bfmmla v25.4s, v2.8h, v6.8h\n"
1464       ".inst 0x6e47ec13  // bfmmla v19.4s, v0.8h, v7.8h\n"
1465       ".inst 0x6e47ec5f  // bfmmla v31.4s, v2.8h, v7.8h\n"
1466       "114:"  // Height 3: Multiply loop: Main loop skip
1467       "cbz x24, 117f\n"
1468       "cbz x24, 117f\n"
1469       "tbz x24, #1, 115f\n"
1470       "ldr d0, [x23], #0x8\n"
1471       "ldr d1, [x22], #0x8\n"
1472       "ldr d2, [x21], #0x8\n"
1473       "tbz x24, #0, 116f\n"
1474       "ld1 { v0.s }[2], [x23]\n"
1475       "ld1 { v1.s }[2], [x22]\n"
1476       "ld1 { v2.s }[2], [x21]\n"
1477       "b 116f\n"
1478       "115:"  // Height 3: Multiply loop: Ragged operand read: partial_1_0
1479       "ldr s0, [x23, #0x0]\n"
1480       "ldr s1, [x22, #0x0]\n"
1481       "ldr s2, [x21, #0x0]\n"
1482       "116:"  // Height 3: Multiply loop: Ragged operand read: Done
1483       "ldr q4, [x11, #0x0]\n"
1484       "ldr q5, [x11, #0x10]\n"
1485       ".inst 0x0ea16800  // bfcvtn v0.4h, v0.4s\n"
1486       ".inst 0x4ea16820  // bfcvtn2 v0.8h, v1.4s\n"
1487       "ldr q6, [x10, #0x0]\n"
1488       "ldr q7, [x10, #0x10]\n"
1489       ".inst 0x0ea16842  // bfcvtn v2.4h, v2.4s\n"
1490       ".inst 0x6e44ec08  // bfmmla v8.4s, v0.8h, v4.8h\n"
1491       ".inst 0x6e44ec54  // bfmmla v20.4s, v2.8h, v4.8h\n"
1492       "ldr q4, [x9, #0x0]\n"
1493       ".inst 0x6e45ec0e  // bfmmla v14.4s, v0.8h, v5.8h\n"
1494       "add x11, x11, #0x20\n"
1495       ".inst 0x6e45ec5a  // bfmmla v26.4s, v2.8h, v5.8h\n"
1496       "ldr q5, [x9, #0x10]\n"
1497       ".inst 0x6e46ec09  // bfmmla v9.4s, v0.8h, v6.8h\n"
1498       "add x10, x10, #0x20\n"
1499       ".inst 0x6e46ec55  // bfmmla v21.4s, v2.8h, v6.8h\n"
1500       "ldr q6, [x28, #0x0]\n"
1501       ".inst 0x6e47ec0f  // bfmmla v15.4s, v0.8h, v7.8h\n"
1502       "add x9, x9, #0x20\n"
1503       ".inst 0x6e47ec5b  // bfmmla v27.4s, v2.8h, v7.8h\n"
1504       "ldr q7, [x28, #0x10]\n"
1505       ".inst 0x6e44ec0a  // bfmmla v10.4s, v0.8h, v4.8h\n"
1506       "add x28, x28, #0x20\n"
1507       ".inst 0x6e44ec56  // bfmmla v22.4s, v2.8h, v4.8h\n"
1508       "ldr q4, [x27, #0x0]\n"
1509       ".inst 0x6e45ec10  // bfmmla v16.4s, v0.8h, v5.8h\n"
1510       ".inst 0x6e45ec5c  // bfmmla v28.4s, v2.8h, v5.8h\n"
1511       "ldr q5, [x27, #0x10]\n"
1512       ".inst 0x6e46ec0b  // bfmmla v11.4s, v0.8h, v6.8h\n"
1513       "add x27, x27, #0x20\n"
1514       ".inst 0x6e46ec57  // bfmmla v23.4s, v2.8h, v6.8h\n"
1515       "ldr q6, [x26, #0x0]\n"
1516       ".inst 0x6e47ec11  // bfmmla v17.4s, v0.8h, v7.8h\n"
1517       ".inst 0x6e47ec5d  // bfmmla v29.4s, v2.8h, v7.8h\n"
1518       "ldr q7, [x26, #0x10]\n"
1519       ".inst 0x6e44ec0c  // bfmmla v12.4s, v0.8h, v4.8h\n"
1520       "add x26, x26, #0x20\n"
1521       ".inst 0x6e44ec58  // bfmmla v24.4s, v2.8h, v4.8h\n"
1522       ".inst 0x6e45ec12  // bfmmla v18.4s, v0.8h, v5.8h\n"
1523       ".inst 0x6e45ec5e  // bfmmla v30.4s, v2.8h, v5.8h\n"
1524       ".inst 0x6e46ec0d  // bfmmla v13.4s, v0.8h, v6.8h\n"
1525       ".inst 0x6e46ec59  // bfmmla v25.4s, v2.8h, v6.8h\n"
1526       ".inst 0x6e47ec13  // bfmmla v19.4s, v0.8h, v7.8h\n"
1527       ".inst 0x6e47ec5f  // bfmmla v31.4s, v2.8h, v7.8h\n"
1528       "117:"  // Height 3: Multiply loop: No odd multiplies
1529       "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
1530       "add x25, x25, #0x1\n"
1531       "cmp x25, x19\n"
1532       "bne 109b\n"
1533       "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
1534       "add x22, x12, x19, LSL #2\n"
1535       "uzp1 v4.2d, v8.2d, v14.2d\n"
1536       "uzp2 v8.2d, v8.2d, v14.2d\n"
1537       "uzp1 v14.2d, v9.2d, v15.2d\n"
1538       "uzp2 v9.2d, v9.2d, v15.2d\n"
1539       "add x21, x22, x19, LSL #2\n"
1540       "uzp1 v15.2d, v10.2d, v16.2d\n"
1541       "uzp2 v10.2d, v10.2d, v16.2d\n"
1542       "uzp1 v16.2d, v11.2d, v17.2d\n"
1543       "uzp2 v11.2d, v11.2d, v17.2d\n"
1544       "uzp1 v17.2d, v12.2d, v18.2d\n"
1545       "uzp2 v12.2d, v12.2d, v18.2d\n"
1546       "uzp1 v18.2d, v13.2d, v19.2d\n"
1547       "uzp2 v13.2d, v13.2d, v19.2d\n"
1548       "uzp1 v20.2d, v20.2d, v26.2d\n"
1549       "uzp1 v21.2d, v21.2d, v27.2d\n"
1550       "uzp1 v22.2d, v22.2d, v28.2d\n"
1551       "uzp1 v23.2d, v23.2d, v29.2d\n"
1552       "uzp1 v24.2d, v24.2d, v30.2d\n"
1553       "uzp1 v25.2d, v25.2d, v31.2d\n"
1554       "tbz %x[flags], #1, 118f\n"
1555       "add x19, %x[args_ptr], %[offset_max]\n"
1556       "ld1r { v1.4s }, [x19]\n"
1557       "add x19, %x[args_ptr], %[offset_min]\n"
1558       "ld1r { v0.4s }, [x19]\n"
1559       "fmin v4.4s, v4.4s, v1.4s\n"
1560       "fmin v14.4s, v14.4s, v1.4s\n"
1561       "fmin v15.4s, v15.4s, v1.4s\n"
1562       "fmin v16.4s, v16.4s, v1.4s\n"
1563       "fmin v17.4s, v17.4s, v1.4s\n"
1564       "fmin v18.4s, v18.4s, v1.4s\n"
1565       "fmin v8.4s, v8.4s, v1.4s\n"
1566       "fmin v9.4s, v9.4s, v1.4s\n"
1567       "fmin v10.4s, v10.4s, v1.4s\n"
1568       "fmin v11.4s, v11.4s, v1.4s\n"
1569       "fmin v12.4s, v12.4s, v1.4s\n"
1570       "fmin v13.4s, v13.4s, v1.4s\n"
1571       "fmin v20.4s, v20.4s, v1.4s\n"
1572       "fmin v21.4s, v21.4s, v1.4s\n"
1573       "fmin v22.4s, v22.4s, v1.4s\n"
1574       "fmin v23.4s, v23.4s, v1.4s\n"
1575       "fmin v24.4s, v24.4s, v1.4s\n"
1576       "fmin v25.4s, v25.4s, v1.4s\n"
1577       "fmax v4.4s, v4.4s, v0.4s\n"
1578       "fmax v14.4s, v14.4s, v0.4s\n"
1579       "fmax v15.4s, v15.4s, v0.4s\n"
1580       "fmax v16.4s, v16.4s, v0.4s\n"
1581       "fmax v17.4s, v17.4s, v0.4s\n"
1582       "fmax v18.4s, v18.4s, v0.4s\n"
1583       "fmax v8.4s, v8.4s, v0.4s\n"
1584       "fmax v9.4s, v9.4s, v0.4s\n"
1585       "fmax v10.4s, v10.4s, v0.4s\n"
1586       "fmax v11.4s, v11.4s, v0.4s\n"
1587       "fmax v12.4s, v12.4s, v0.4s\n"
1588       "fmax v13.4s, v13.4s, v0.4s\n"
1589       "fmax v20.4s, v20.4s, v0.4s\n"
1590       "fmax v21.4s, v21.4s, v0.4s\n"
1591       "fmax v22.4s, v22.4s, v0.4s\n"
1592       "fmax v23.4s, v23.4s, v0.4s\n"
1593       "fmax v24.4s, v24.4s, v0.4s\n"
1594       "fmax v25.4s, v25.4s, v0.4s\n"
1595       "118:"  // Height 3: No activation
1596       "cmp x13, #0x18\n"
1597       "bge 131f\n"
1598       "tbz x13, #4, 122f\n"
1599       "st1 { v4.4s }, [x12], #0x10\n"
1600       "st1 { v14.4s }, [x12], #0x10\n"
1601       "st1 { v15.4s }, [x12], #0x10\n"
1602       "st1 { v16.4s }, [x12], #0x10\n"
1603       "st1 { v8.4s }, [x22], #0x10\n"
1604       "st1 { v9.4s }, [x22], #0x10\n"
1605       "st1 { v10.4s }, [x22], #0x10\n"
1606       "st1 { v11.4s }, [x22], #0x10\n"
1607       "st1 { v20.4s }, [x21], #0x10\n"
1608       "st1 { v21.4s }, [x21], #0x10\n"
1609       "st1 { v22.4s }, [x21], #0x10\n"
1610       "st1 { v23.4s }, [x21], #0x10\n"
1611       "tbz x13, #2, 120f\n"
1612       "st1 { v17.4s }, [x12], #0x10\n"
1613       "st1 { v12.4s }, [x22], #0x10\n"
1614       "st1 { v24.4s }, [x21], #0x10\n"
1615       "tbz x13, #1, 119f\n"
1616       "str d18, [x12], #0x8\n"
1617       "str d13, [x22], #0x8\n"
1618       "str d25, [x21], #0x8\n"
1619       "tbz x13, #0, 130f\n"
1620       "st1 { v18.s }[2], [x12]\n"
1621       "st1 { v13.s }[2], [x22]\n"
1622       "st1 { v25.s }[2], [x21]\n"
1623       "b 130f\n"
1624       "119:"  // Height 3: Partial direct writeback: partial_1_20
1625       "tbz x13, #0, 130f\n"
1626       "str s18, [x12, #0x0]\n"
1627       "str s13, [x22, #0x0]\n"
1628       "str s25, [x21, #0x0]\n"
1629       "b 130f\n"
1630       "120:"  // Height 3: Partial direct writeback: partial_2_16
1631       "tbz x13, #1, 121f\n"
1632       "str d17, [x12], #0x8\n"
1633       "str d12, [x22], #0x8\n"
1634       "str d24, [x21], #0x8\n"
1635       "tbz x13, #0, 130f\n"
1636       "st1 { v17.s }[2], [x12]\n"
1637       "st1 { v12.s }[2], [x22]\n"
1638       "st1 { v24.s }[2], [x21]\n"
1639       "b 130f\n"
1640       "121:"  // Height 3: Partial direct writeback: partial_1_16
1641       "tbz x13, #0, 130f\n"
1642       "str s17, [x12, #0x0]\n"
1643       "str s12, [x22, #0x0]\n"
1644       "str s24, [x21, #0x0]\n"
1645       "b 130f\n"
1646       "122:"  // Height 3: Partial direct writeback: partial_8_0
1647       "tbz x13, #3, 126f\n"
1648       "st1 { v4.4s }, [x12], #0x10\n"
1649       "st1 { v14.4s }, [x12], #0x10\n"
1650       "st1 { v8.4s }, [x22], #0x10\n"
1651       "st1 { v9.4s }, [x22], #0x10\n"
1652       "st1 { v20.4s }, [x21], #0x10\n"
1653       "st1 { v21.4s }, [x21], #0x10\n"
1654       "tbz x13, #2, 124f\n"
1655       "st1 { v15.4s }, [x12], #0x10\n"
1656       "st1 { v10.4s }, [x22], #0x10\n"
1657       "st1 { v22.4s }, [x21], #0x10\n"
1658       "tbz x13, #1, 123f\n"
1659       "str d16, [x12], #0x8\n"
1660       "str d11, [x22], #0x8\n"
1661       "str d23, [x21], #0x8\n"
1662       "tbz x13, #0, 130f\n"
1663       "st1 { v16.s }[2], [x12]\n"
1664       "st1 { v11.s }[2], [x22]\n"
1665       "st1 { v23.s }[2], [x21]\n"
1666       "b 130f\n"
1667       "123:"  // Height 3: Partial direct writeback: partial_1_12
1668       "tbz x13, #0, 130f\n"
1669       "str s16, [x12, #0x0]\n"
1670       "str s11, [x22, #0x0]\n"
1671       "str s23, [x21, #0x0]\n"
1672       "b 130f\n"
1673       "124:"  // Height 3: Partial direct writeback: partial_2_8
1674       "tbz x13, #1, 125f\n"
1675       "str d15, [x12], #0x8\n"
1676       "str d10, [x22], #0x8\n"
1677       "str d22, [x21], #0x8\n"
1678       "tbz x13, #0, 130f\n"
1679       "st1 { v15.s }[2], [x12]\n"
1680       "st1 { v10.s }[2], [x22]\n"
1681       "st1 { v22.s }[2], [x21]\n"
1682       "b 130f\n"
1683       "125:"  // Height 3: Partial direct writeback: partial_1_8
1684       "tbz x13, #0, 130f\n"
1685       "str s15, [x12, #0x0]\n"
1686       "str s10, [x22, #0x0]\n"
1687       "str s22, [x21, #0x0]\n"
1688       "b 130f\n"
1689       "126:"  // Height 3: Partial direct writeback: partial_4_0
1690       "tbz x13, #2, 128f\n"
1691       "st1 { v4.4s }, [x12], #0x10\n"
1692       "st1 { v8.4s }, [x22], #0x10\n"
1693       "st1 { v20.4s }, [x21], #0x10\n"
1694       "tbz x13, #1, 127f\n"
1695       "str d14, [x12], #0x8\n"
1696       "str d9, [x22], #0x8\n"
1697       "str d21, [x21], #0x8\n"
1698       "tbz x13, #0, 130f\n"
1699       "st1 { v14.s }[2], [x12]\n"
1700       "st1 { v9.s }[2], [x22]\n"
1701       "st1 { v21.s }[2], [x21]\n"
1702       "b 130f\n"
1703       "127:"  // Height 3: Partial direct writeback: partial_1_4
1704       "tbz x13, #0, 130f\n"
1705       "str s14, [x12, #0x0]\n"
1706       "str s9, [x22, #0x0]\n"
1707       "str s21, [x21, #0x0]\n"
1708       "b 130f\n"
1709       "128:"  // Height 3: Partial direct writeback: partial_2_0
1710       "tbz x13, #1, 129f\n"
1711       "str d4, [x12], #0x8\n"
1712       "str d8, [x22], #0x8\n"
1713       "str d20, [x21], #0x8\n"
1714       "tbz x13, #0, 130f\n"
1715       "st1 { v4.s }[2], [x12]\n"
1716       "st1 { v8.s }[2], [x22]\n"
1717       "st1 { v20.s }[2], [x21]\n"
1718       "b 130f\n"
1719       "129:"  // Height 3: Partial direct writeback: partial_1_0
1720       "str s4, [x12, #0x0]\n"
1721       "str s8, [x22, #0x0]\n"
1722       "str s20, [x21, #0x0]\n"
1723       "130:"  // Height 3: Partial direct writeback: Done
1724       "b 132f\n"
1725       "131:"  // Height 3: Full writeback
1726       "str q4, [x12, #0x0]\n"
1727       "str q14, [x12, #0x10]\n"
1728       "str q15, [x12, #0x20]\n"
1729       "str q16, [x12, #0x30]\n"
1730       "str q17, [x12, #0x40]\n"
1731       "str q18, [x12, #0x50]\n"
1732       "add x12, x12, #0x60\n"
1733       "str q8, [x22, #0x0]\n"
1734       "str q9, [x22, #0x10]\n"
1735       "str q10, [x22, #0x20]\n"
1736       "str q11, [x22, #0x30]\n"
1737       "str q12, [x22, #0x40]\n"
1738       "str q13, [x22, #0x50]\n"
1739       "str q20, [x21, #0x0]\n"
1740       "str q21, [x21, #0x10]\n"
1741       "str q22, [x21, #0x20]\n"
1742       "str q23, [x21, #0x30]\n"
1743       "str q24, [x21, #0x40]\n"
1744       "str q25, [x21, #0x50]\n"
1745       "132:"  // Height 3: Writeback done
1746       "subs x13, x13, #0x18\n"
1747       "bgt 90b\n"
1748       "b 178f\n"
1749       "133:"  // Height 4
1750       "ldr x19, [%x[args_ptr], %[offsetof_B_ptr]]\n"
1751       "ldr x13, [%x[args_ptr], %[offsetof_N]]\n"
1752       "str x19, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
1753       "mov x20, #0x10\n"
1754       "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
1755       "mov x14, %x[bias]\n"
1756       "mov x12, %x[output_ptr]\n"
1757       "madd %x[output_ptr], x19, x20, %x[output_ptr]\n"
1758       "134:"  // Height 4: Column loop
1759       "ldr x11, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
1760       "ldr x19, [%x[args_ptr], %[offsetof_B_stride]]\n"
1761       "add x10, x11, x19, LSL #1\n"
1762       "add x9, x10, x19, LSL #1\n"
1763       "add x28, x9, x19, LSL #1\n"
1764       "add x27, x28, x19, LSL #1\n"
1765       "add x26, x27, x19, LSL #1\n"
1766       "add x19, x26, x19, LSL #1\n"
1767       "str x19, [%x[args_ptr], %[offsetof_cur_B_ptr]]\n"
1768       "cmp x13, #0x14\n"
1769       "bgt 135f\n"
1770       "cmp x13, #0x10\n"
1771       "mov x26, x11\n"
1772       "bgt 135f\n"
1773       "cmp x13, #0xc\n"
1774       "mov x27, x11\n"
1775       "bgt 135f\n"
1776       "cmp x13, #0x8\n"
1777       "mov x28, x11\n"
1778       "bgt 135f\n"
1779       "cmp x13, #0x4\n"
1780       "mov x9, x11\n"
1781       "bgt 135f\n"
1782       "mov x10, x11\n"
1783       "135:"  // Height 4: B setup done
1784       "cbz x14, 136f\n"
1785       "ldr q8, [x14, #0x0]\n"
1786       "ldr q9, [x14, #0x10]\n"
1787       "zip2 v14.2d, v8.2d, v8.2d\n"
1788       "zip1 v8.2d, v8.2d, v8.2d\n"
1789       "ldr q10, [x14, #0x20]\n"
1790       "ldr q11, [x14, #0x30]\n"
1791       "zip2 v15.2d, v9.2d, v9.2d\n"
1792       "zip1 v9.2d, v9.2d, v9.2d\n"
1793       "ldr q12, [x14, #0x40]\n"
1794       "ldr q13, [x14, #0x50]\n"
1795       "zip2 v16.2d, v10.2d, v10.2d\n"
1796       "zip1 v10.2d, v10.2d, v10.2d\n"
1797       "zip2 v17.2d, v11.2d, v11.2d\n"
1798       "zip1 v11.2d, v11.2d, v11.2d\n"
1799       "add x14, x14, #0x60\n"
1800       "zip2 v18.2d, v12.2d, v12.2d\n"
1801       "zip1 v12.2d, v12.2d, v12.2d\n"
1802       "zip2 v19.2d, v13.2d, v13.2d\n"
1803       "zip1 v13.2d, v13.2d, v13.2d\n"
1804       "mov v20.16b, v8.16b\n"
1805       "mov v26.16b, v14.16b\n"
1806       "mov v21.16b, v9.16b\n"
1807       "mov v27.16b, v15.16b\n"
1808       "mov v22.16b, v10.16b\n"
1809       "mov v28.16b, v16.16b\n"
1810       "mov v23.16b, v11.16b\n"
1811       "mov v29.16b, v17.16b\n"
1812       "mov v24.16b, v12.16b\n"
1813       "mov v30.16b, v18.16b\n"
1814       "mov v25.16b, v13.16b\n"
1815       "mov v31.16b, v19.16b\n"
1816       "b 152f\n"
1817       "136:"  // Height 4: no bias
1818       "tbz %x[flags], #0, 151f\n"
1819       "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
1820       "add x22, x12, x19, LSL #2\n"
1821       "add x21, x22, x19, LSL #2\n"
1822       "cmp x13, #0x18\n"
1823       "add x20, x21, x19, LSL #2\n"
1824       "bge 149f\n"
1825       "tbz x13, #4, 140f\n"
1826       "ld1 { v9.4s }, [x12], #0x10\n"
1827       "ld1 { v14.4s }, [x22], #0x10\n"
1828       "ld1 { v21.4s }, [x21], #0x10\n"
1829       "ld1 { v26.4s }, [x20], #0x10\n"
1830       "ld1 { v10.4s }, [x12], #0x10\n"
1831       "ld1 { v15.4s }, [x22], #0x10\n"
1832       "ld1 { v22.4s }, [x21], #0x10\n"
1833       "ld1 { v27.4s }, [x20], #0x10\n"
1834       "ld1 { v11.4s }, [x12], #0x10\n"
1835       "ld1 { v16.4s }, [x22], #0x10\n"
1836       "ld1 { v23.4s }, [x21], #0x10\n"
1837       "ld1 { v28.4s }, [x20], #0x10\n"
1838       "ld1 { v12.4s }, [x12], #0x10\n"
1839       "ld1 { v17.4s }, [x22], #0x10\n"
1840       "ld1 { v24.4s }, [x21], #0x10\n"
1841       "ld1 { v29.4s }, [x20], #0x10\n"
1842       "tbz x13, #2, 138f\n"
1843       "ld1 { v13.4s }, [x12], #0x10\n"
1844       "ld1 { v18.4s }, [x22], #0x10\n"
1845       "ld1 { v25.4s }, [x21], #0x10\n"
1846       "ld1 { v30.4s }, [x20], #0x10\n"
1847       "tbz x13, #1, 137f\n"
1848       "ldr d20, [x12], #0x8\n"
1849       "ldr d19, [x22], #0x8\n"
1850       "mov x19, #0x58\n"
1851       "ldr d4, [x21], #0x8\n"
1852       "ldr d31, [x20], #0x8\n"
1853       "tbz x13, #0, 148f\n"
1854       "ld1 { v20.s }[2], [x12]\n"
1855       "ld1 { v19.s }[2], [x22]\n"
1856       "ld1 { v4.s }[2], [x21]\n"
1857       "ld1 { v31.s }[2], [x20]\n"
1858       "b 148f\n"
1859       "137:"  // Height 4: Partial accumulate: partial_1_20
1860       "mov x19, #0x50\n"
1861       "tbz x13, #0, 148f\n"
1862       "ldr s20, [x12, #0x0]\n"
1863       "ldr s19, [x22, #0x0]\n"
1864       "ldr s4, [x21, #0x0]\n"
1865       "ldr s31, [x20, #0x0]\n"
1866       "b 148f\n"
1867       "138:"  // Height 4: Partial accumulate: partial_2_16
1868       "tbz x13, #1, 139f\n"
1869       "ldr d13, [x12], #0x8\n"
1870       "ldr d18, [x22], #0x8\n"
1871       "mov x19, #0x48\n"
1872       "ldr d25, [x21], #0x8\n"
1873       "ldr d30, [x20], #0x8\n"
1874       "tbz x13, #0, 148f\n"
1875       "ld1 { v13.s }[2], [x12]\n"
1876       "ld1 { v18.s }[2], [x22]\n"
1877       "ld1 { v25.s }[2], [x21]\n"
1878       "ld1 { v30.s }[2], [x20]\n"
1879       "b 148f\n"
1880       "139:"  // Height 4: Partial accumulate: partial_1_16
1881       "mov x19, #0x40\n"
1882       "tbz x13, #0, 148f\n"
1883       "ldr s13, [x12, #0x0]\n"
1884       "ldr s18, [x22, #0x0]\n"
1885       "ldr s25, [x21, #0x0]\n"
1886       "ldr s30, [x20, #0x0]\n"
1887       "b 148f\n"
1888       "140:"  // Height 4: Partial accumulate: partial_8_0
1889       "tbz x13, #3, 144f\n"
1890       "ld1 { v9.4s }, [x12], #0x10\n"
1891       "ld1 { v14.4s }, [x22], #0x10\n"
1892       "ld1 { v21.4s }, [x21], #0x10\n"
1893       "ld1 { v26.4s }, [x20], #0x10\n"
1894       "ld1 { v10.4s }, [x12], #0x10\n"
1895       "ld1 { v15.4s }, [x22], #0x10\n"
1896       "ld1 { v22.4s }, [x21], #0x10\n"
1897       "ld1 { v27.4s }, [x20], #0x10\n"
1898       "tbz x13, #2, 142f\n"
1899       "ld1 { v11.4s }, [x12], #0x10\n"
1900       "ld1 { v16.4s }, [x22], #0x10\n"
1901       "ld1 { v23.4s }, [x21], #0x10\n"
1902       "ld1 { v28.4s }, [x20], #0x10\n"
1903       "tbz x13, #1, 141f\n"
1904       "ldr d12, [x12], #0x8\n"
1905       "ldr d17, [x22], #0x8\n"
1906       "mov x19, #0x38\n"
1907       "ldr d24, [x21], #0x8\n"
1908       "ldr d29, [x20], #0x8\n"
1909       "tbz x13, #0, 148f\n"
1910       "ld1 { v12.s }[2], [x12]\n"
1911       "ld1 { v17.s }[2], [x22]\n"
1912       "ld1 { v24.s }[2], [x21]\n"
1913       "ld1 { v29.s }[2], [x20]\n"
1914       "b 148f\n"
1915       "141:"  // Height 4: Partial accumulate: partial_1_12
1916       "mov x19, #0x30\n"
1917       "tbz x13, #0, 148f\n"
1918       "ldr s12, [x12, #0x0]\n"
1919       "ldr s17, [x22, #0x0]\n"
1920       "ldr s24, [x21, #0x0]\n"
1921       "ldr s29, [x20, #0x0]\n"
1922       "b 148f\n"
1923       "142:"  // Height 4: Partial accumulate: partial_2_8
1924       "tbz x13, #1, 143f\n"
1925       "ldr d11, [x12], #0x8\n"
1926       "ldr d16, [x22], #0x8\n"
1927       "mov x19, #0x28\n"
1928       "ldr d23, [x21], #0x8\n"
1929       "ldr d28, [x20], #0x8\n"
1930       "tbz x13, #0, 148f\n"
1931       "ld1 { v11.s }[2], [x12]\n"
1932       "ld1 { v16.s }[2], [x22]\n"
1933       "ld1 { v23.s }[2], [x21]\n"
1934       "ld1 { v28.s }[2], [x20]\n"
1935       "b 148f\n"
1936       "143:"  // Height 4: Partial accumulate: partial_1_8
1937       "mov x19, #0x20\n"
1938       "tbz x13, #0, 148f\n"
1939       "ldr s11, [x12, #0x0]\n"
1940       "ldr s16, [x22, #0x0]\n"
1941       "ldr s23, [x21, #0x0]\n"
1942       "ldr s28, [x20, #0x0]\n"
1943       "b 148f\n"
1944       "144:"  // Height 4: Partial accumulate: partial_4_0
1945       "tbz x13, #2, 146f\n"
1946       "ld1 { v9.4s }, [x12], #0x10\n"
1947       "ld1 { v14.4s }, [x22], #0x10\n"
1948       "ld1 { v21.4s }, [x21], #0x10\n"
1949       "ld1 { v26.4s }, [x20], #0x10\n"
1950       "tbz x13, #1, 145f\n"
1951       "ldr d10, [x12], #0x8\n"
1952       "ldr d15, [x22], #0x8\n"
1953       "mov x19, #0x18\n"
1954       "ldr d22, [x21], #0x8\n"
1955       "ldr d27, [x20], #0x8\n"
1956       "tbz x13, #0, 148f\n"
1957       "ld1 { v10.s }[2], [x12]\n"
1958       "ld1 { v15.s }[2], [x22]\n"
1959       "ld1 { v22.s }[2], [x21]\n"
1960       "ld1 { v27.s }[2], [x20]\n"
1961       "b 148f\n"
1962       "145:"  // Height 4: Partial accumulate: partial_1_4
1963       "mov x19, #0x10\n"
1964       "tbz x13, #0, 148f\n"
1965       "ldr s10, [x12, #0x0]\n"
1966       "ldr s15, [x22, #0x0]\n"
1967       "ldr s22, [x21, #0x0]\n"
1968       "ldr s27, [x20, #0x0]\n"
1969       "b 148f\n"
1970       "146:"  // Height 4: Partial accumulate: partial_2_0
1971       "tbz x13, #1, 147f\n"
1972       "ldr d9, [x12], #0x8\n"
1973       "ldr d14, [x22], #0x8\n"
1974       "mov x19, #0x8\n"
1975       "ldr d21, [x21], #0x8\n"
1976       "ldr d26, [x20], #0x8\n"
1977       "tbz x13, #0, 148f\n"
1978       "ld1 { v9.s }[2], [x12]\n"
1979       "ld1 { v14.s }[2], [x22]\n"
1980       "ld1 { v21.s }[2], [x21]\n"
1981       "ld1 { v26.s }[2], [x20]\n"
1982       "b 148f\n"
1983       "147:"  // Height 4: Partial accumulate: partial_1_0
1984       "ldr s9, [x12, #0x0]\n"
1985       "ldr s14, [x22, #0x0]\n"
1986       "mov x19, #0x0\n"
1987       "ldr s21, [x21, #0x0]\n"
1988       "ldr s26, [x20, #0x0]\n"
1989       "148:"  // Height 4: Partial accumulate: Done
1990       "sub x12, x12, x19\n"
1991       "b 150f\n"
1992       "149:"  // Height 4: full accumulate
1993       "ldr q9, [x12, #0x0]\n"
1994       "ldr q10, [x12, #0x10]\n"
1995       "ldr q11, [x12, #0x20]\n"
1996       "ldr q12, [x12, #0x30]\n"
1997       "ldr q13, [x12, #0x40]\n"
1998       "ldr q20, [x12, #0x50]\n"
1999       "ldr q14, [x22, #0x0]\n"
2000       "ldr q15, [x22, #0x10]\n"
2001       "ldr q16, [x22, #0x20]\n"
2002       "ldr q17, [x22, #0x30]\n"
2003       "ldr q18, [x22, #0x40]\n"
2004       "ldr q19, [x22, #0x50]\n"
2005       "ldr q21, [x21, #0x0]\n"
2006       "ldr q22, [x21, #0x10]\n"
2007       "ldr q23, [x21, #0x20]\n"
2008       "ldr q24, [x21, #0x30]\n"
2009       "ldr q25, [x21, #0x40]\n"
2010       "ldr q4, [x21, #0x50]\n"
2011       "ldr q26, [x20, #0x0]\n"
2012       "ldr q27, [x20, #0x10]\n"
2013       "ldr q28, [x20, #0x20]\n"
2014       "ldr q29, [x20, #0x30]\n"
2015       "ldr q30, [x20, #0x40]\n"
2016       "ldr q31, [x20, #0x50]\n"
2017       "150:"  // Height 4: MMLA fixup
2018       "zip1 v8.2d, v9.2d, v14.2d\n"
2019       "zip2 v14.2d, v9.2d, v14.2d\n"
2020       "zip1 v9.2d, v10.2d, v15.2d\n"
2021       "zip2 v15.2d, v10.2d, v15.2d\n"
2022       "zip1 v10.2d, v11.2d, v16.2d\n"
2023       "zip2 v16.2d, v11.2d, v16.2d\n"
2024       "zip1 v11.2d, v12.2d, v17.2d\n"
2025       "zip2 v17.2d, v12.2d, v17.2d\n"
2026       "zip1 v12.2d, v13.2d, v18.2d\n"
2027       "zip2 v18.2d, v13.2d, v18.2d\n"
2028       "zip1 v13.2d, v20.2d, v19.2d\n"
2029       "zip2 v19.2d, v20.2d, v19.2d\n"
2030       "zip1 v20.2d, v21.2d, v26.2d\n"
2031       "zip2 v26.2d, v21.2d, v26.2d\n"
2032       "zip1 v21.2d, v22.2d, v27.2d\n"
2033       "zip2 v27.2d, v22.2d, v27.2d\n"
2034       "zip1 v22.2d, v23.2d, v28.2d\n"
2035       "zip2 v28.2d, v23.2d, v28.2d\n"
2036       "zip1 v23.2d, v24.2d, v29.2d\n"
2037       "zip2 v29.2d, v24.2d, v29.2d\n"
2038       "zip1 v24.2d, v25.2d, v30.2d\n"
2039       "zip2 v30.2d, v25.2d, v30.2d\n"
2040       "zip1 v25.2d, v4.2d, v31.2d\n"
2041       "zip2 v31.2d, v4.2d, v31.2d\n"
2042       "b 152f\n"
2043       "151:"  // Height 4: no accumulate
2044       "movi v8.16b, #0x0\n"
2045       "movi v9.16b, #0x0\n"
2046       "movi v10.16b, #0x0\n"
2047       "movi v11.16b, #0x0\n"
2048       "movi v12.16b, #0x0\n"
2049       "movi v13.16b, #0x0\n"
2050       "movi v14.16b, #0x0\n"
2051       "movi v15.16b, #0x0\n"
2052       "movi v16.16b, #0x0\n"
2053       "movi v17.16b, #0x0\n"
2054       "movi v18.16b, #0x0\n"
2055       "movi v19.16b, #0x0\n"
2056       "movi v20.16b, #0x0\n"
2057       "movi v21.16b, #0x0\n"
2058       "movi v22.16b, #0x0\n"
2059       "movi v23.16b, #0x0\n"
2060       "movi v24.16b, #0x0\n"
2061       "movi v25.16b, #0x0\n"
2062       "movi v26.16b, #0x0\n"
2063       "movi v27.16b, #0x0\n"
2064       "movi v28.16b, #0x0\n"
2065       "movi v29.16b, #0x0\n"
2066       "movi v30.16b, #0x0\n"
2067       "movi v31.16b, #0x0\n"
2068       "152:"  // Height 4: setup done
2069       "mov x25, #0x0\n"
2070       "153:"  // Height 4: String loop
2071       "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
2072       "ldr w24, [x19, x25, LSL #0x2]\n"
2073       "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
2074       "tbz %x[flags], #3, 154f\n"
2075       "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n"
2076       "add x20, x20, x19, LSL #3\n"
2077       "ldr x23, [x20, #0x0]\n"
2078       "ldr x22, [x20, #0x8]\n"
2079       "ldr x21, [x20, #0x10]\n"
2080       "ldr x20, [x20, #0x18]\n"
2081       "cbnz x25, 155f\n"
2082       "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
2083       "add x23, x23, x19, LSL #2\n"
2084       "add x22, x22, x19, LSL #2\n"
2085       "add x21, x21, x19, LSL #2\n"
2086       "add x20, x20, x19, LSL #2\n"
2087       "b 155f\n"
2088       "154:"  // Height 4: setup direct input
2089       "mov x23, %x[input_ptr]\n"
2090       "add x22, x23, x19, LSL #2\n"
2091       "add x21, x22, x19, LSL #2\n"
2092       "add x20, x21, x19, LSL #2\n"
2093       "155:"  // Height 4: input setup done
2094       "cmp x24, #0x4\n"
2095       "blt 158f\n"
2096       "ld1 { v0.4s }, [x23], #0x10\n"
2097       "ld1 { v2.4s }, [x21], #0x10\n"
2098       "cmp x24, #0x8\n"
2099       "ld1 { v1.4s }, [x22], #0x10\n"
2100       "ld1 { v3.4s }, [x20], #0x10\n"
2101       "ldr q4, [x11, #0x0]\n"
2102       "ldr q5, [x11, #0x10]\n"
2103       "ldr q6, [x10, #0x0]\n"
2104       "ldr q7, [x10, #0x10]\n"
2105       "blt 157f\n"
2106       "156:"  // Height 4: Multiply loop: Main loop head
2107       ".inst 0x0ea16800  // bfcvtn v0.4h, v0.4s\n"
2108       ".inst 0x0ea16842  // bfcvtn v2.4h, v2.4s\n"
2109       "sub x24, x24, #0x4\n"
2110       "cmp x24, #0x8\n"
2111       ".inst 0x4ea16820  // bfcvtn2 v0.8h, v1.4s\n"
2112       "ld1 { v1.4s }, [x22], #0x10\n"
2113       ".inst 0x4ea16862  // bfcvtn2 v2.8h, v3.4s\n"
2114       "ld1 { v3.4s }, [x20], #0x10\n"
2115       ".inst 0x6e44ec08  // bfmmla v8.4s, v0.8h, v4.8h\n"
2116       ".inst 0x6e44ec54  // bfmmla v20.4s, v2.8h, v4.8h\n"
2117       "ldr q4, [x9, #0x0]\n"
2118       "add x11, x11, #0x20\n"
2119       ".inst 0x6e45ec0e  // bfmmla v14.4s, v0.8h, v5.8h\n"
2120       ".inst 0x6e45ec5a  // bfmmla v26.4s, v2.8h, v5.8h\n"
2121       "ldr q5, [x9, #0x10]\n"
2122       "add x10, x10, #0x20\n"
2123       ".inst 0x6e46ec09  // bfmmla v9.4s, v0.8h, v6.8h\n"
2124       ".inst 0x6e46ec55  // bfmmla v21.4s, v2.8h, v6.8h\n"
2125       "ldr q6, [x28, #0x0]\n"
2126       "add x9, x9, #0x20\n"
2127       ".inst 0x6e47ec0f  // bfmmla v15.4s, v0.8h, v7.8h\n"
2128       ".inst 0x6e47ec5b  // bfmmla v27.4s, v2.8h, v7.8h\n"
2129       "ldr q7, [x28, #0x10]\n"
2130       "add x28, x28, #0x20\n"
2131       ".inst 0x6e44ec0a  // bfmmla v10.4s, v0.8h, v4.8h\n"
2132       ".inst 0x6e44ec56  // bfmmla v22.4s, v2.8h, v4.8h\n"
2133       "ldr q4, [x27, #0x0]\n"
2134       ".inst 0x6e45ec10  // bfmmla v16.4s, v0.8h, v5.8h\n"
2135       ".inst 0x6e45ec5c  // bfmmla v28.4s, v2.8h, v5.8h\n"
2136       "ldr q5, [x27, #0x10]\n"
2137       "add x27, x27, #0x20\n"
2138       ".inst 0x6e46ec0b  // bfmmla v11.4s, v0.8h, v6.8h\n"
2139       ".inst 0x6e46ec57  // bfmmla v23.4s, v2.8h, v6.8h\n"
2140       "ldr q6, [x26, #0x0]\n"
2141       ".inst 0x6e47ec11  // bfmmla v17.4s, v0.8h, v7.8h\n"
2142       ".inst 0x6e47ec5d  // bfmmla v29.4s, v2.8h, v7.8h\n"
2143       "ldr q7, [x26, #0x10]\n"
2144       "add x26, x26, #0x20\n"
2145       ".inst 0x6e44ec0c  // bfmmla v12.4s, v0.8h, v4.8h\n"
2146       ".inst 0x6e44ec58  // bfmmla v24.4s, v2.8h, v4.8h\n"
2147       "ldr q4, [x11, #0x0]\n"
2148       ".inst 0x6e45ec12  // bfmmla v18.4s, v0.8h, v5.8h\n"
2149       ".inst 0x6e45ec5e  // bfmmla v30.4s, v2.8h, v5.8h\n"
2150       "ldr q5, [x11, #0x10]\n"
2151       ".inst 0x6e46ec0d  // bfmmla v13.4s, v0.8h, v6.8h\n"
2152       ".inst 0x6e46ec59  // bfmmla v25.4s, v2.8h, v6.8h\n"
2153       "ldr q6, [x10, #0x0]\n"
2154       ".inst 0x6e47ec13  // bfmmla v19.4s, v0.8h, v7.8h\n"
2155       "ld1 { v0.4s }, [x23], #0x10\n"
2156       ".inst 0x6e47ec5f  // bfmmla v31.4s, v2.8h, v7.8h\n"
2157       "ld1 { v2.4s }, [x21], #0x10\n"
2158       "ldr q7, [x10, #0x10]\n"
2159       "bge 156b\n"
2160       "157:"  // Height 4: Multiply loop: Single iteration only
2161       ".inst 0x0ea16800  // bfcvtn v0.4h, v0.4s\n"
2162       ".inst 0x0ea16842  // bfcvtn v2.4h, v2.4s\n"
2163       "sub x24, x24, #0x4\n"
2164       "add x11, x11, #0x20\n"
2165       ".inst 0x4ea16820  // bfcvtn2 v0.8h, v1.4s\n"
2166       ".inst 0x4ea16862  // bfcvtn2 v2.8h, v3.4s\n"
2167       ".inst 0x6e44ec08  // bfmmla v8.4s, v0.8h, v4.8h\n"
2168       "add x10, x10, #0x20\n"
2169       ".inst 0x6e44ec54  // bfmmla v20.4s, v2.8h, v4.8h\n"
2170       "ldr q4, [x9, #0x0]\n"
2171       ".inst 0x6e45ec0e  // bfmmla v14.4s, v0.8h, v5.8h\n"
2172       ".inst 0x6e45ec5a  // bfmmla v26.4s, v2.8h, v5.8h\n"
2173       "ldr q5, [x9, #0x10]\n"
2174       ".inst 0x6e46ec09  // bfmmla v9.4s, v0.8h, v6.8h\n"
2175       "add x9, x9, #0x20\n"
2176       ".inst 0x6e46ec55  // bfmmla v21.4s, v2.8h, v6.8h\n"
2177       "ldr q6, [x28, #0x0]\n"
2178       ".inst 0x6e47ec0f  // bfmmla v15.4s, v0.8h, v7.8h\n"
2179       ".inst 0x6e47ec5b  // bfmmla v27.4s, v2.8h, v7.8h\n"
2180       "ldr q7, [x28, #0x10]\n"
2181       ".inst 0x6e44ec0a  // bfmmla v10.4s, v0.8h, v4.8h\n"
2182       "add x28, x28, #0x20\n"
2183       ".inst 0x6e44ec56  // bfmmla v22.4s, v2.8h, v4.8h\n"
2184       "ldr q4, [x27, #0x0]\n"
2185       ".inst 0x6e45ec10  // bfmmla v16.4s, v0.8h, v5.8h\n"
2186       ".inst 0x6e45ec5c  // bfmmla v28.4s, v2.8h, v5.8h\n"
2187       "ldr q5, [x27, #0x10]\n"
2188       ".inst 0x6e46ec0b  // bfmmla v11.4s, v0.8h, v6.8h\n"
2189       "add x27, x27, #0x20\n"
2190       ".inst 0x6e46ec57  // bfmmla v23.4s, v2.8h, v6.8h\n"
2191       "ldr q6, [x26, #0x0]\n"
2192       ".inst 0x6e47ec11  // bfmmla v17.4s, v0.8h, v7.8h\n"
2193       ".inst 0x6e47ec5d  // bfmmla v29.4s, v2.8h, v7.8h\n"
2194       "ldr q7, [x26, #0x10]\n"
2195       "add x26, x26, #0x20\n"
2196       ".inst 0x6e44ec0c  // bfmmla v12.4s, v0.8h, v4.8h\n"
2197       ".inst 0x6e44ec58  // bfmmla v24.4s, v2.8h, v4.8h\n"
2198       ".inst 0x6e45ec12  // bfmmla v18.4s, v0.8h, v5.8h\n"
2199       ".inst 0x6e45ec5e  // bfmmla v30.4s, v2.8h, v5.8h\n"
2200       ".inst 0x6e46ec0d  // bfmmla v13.4s, v0.8h, v6.8h\n"
2201       ".inst 0x6e46ec59  // bfmmla v25.4s, v2.8h, v6.8h\n"
2202       ".inst 0x6e47ec13  // bfmmla v19.4s, v0.8h, v7.8h\n"
2203       ".inst 0x6e47ec5f  // bfmmla v31.4s, v2.8h, v7.8h\n"
2204       "158:"  // Height 4: Multiply loop: Main loop skip
2205       "cbz x24, 161f\n"
2206       "cbz x24, 161f\n"
2207       "tbz x24, #1, 159f\n"
2208       "ldr d0, [x23], #0x8\n"
2209       "ldr d1, [x22], #0x8\n"
2210       "ldr d2, [x21], #0x8\n"
2211       "ldr d3, [x20], #0x8\n"
2212       "tbz x24, #0, 160f\n"
2213       "ld1 { v0.s }[2], [x23]\n"
2214       "ld1 { v1.s }[2], [x22]\n"
2215       "ld1 { v2.s }[2], [x21]\n"
2216       "ld1 { v3.s }[2], [x20]\n"
2217       "b 160f\n"
2218       "159:"  // Height 4: Multiply loop: Ragged operand read: partial_1_0
2219       "ldr s0, [x23, #0x0]\n"
2220       "ldr s1, [x22, #0x0]\n"
2221       "ldr s2, [x21, #0x0]\n"
2222       "ldr s3, [x20, #0x0]\n"
2223       "160:"  // Height 4: Multiply loop: Ragged operand read: Done
2224       "ldr q4, [x11, #0x0]\n"
2225       "ldr q5, [x11, #0x10]\n"
2226       ".inst 0x0ea16800  // bfcvtn v0.4h, v0.4s\n"
2227       ".inst 0x0ea16842  // bfcvtn v2.4h, v2.4s\n"
2228       "ldr q6, [x10, #0x0]\n"
2229       "ldr q7, [x10, #0x10]\n"
2230       ".inst 0x4ea16820  // bfcvtn2 v0.8h, v1.4s\n"
2231       ".inst 0x4ea16862  // bfcvtn2 v2.8h, v3.4s\n"
2232       ".inst 0x6e44ec08  // bfmmla v8.4s, v0.8h, v4.8h\n"
2233       ".inst 0x6e44ec54  // bfmmla v20.4s, v2.8h, v4.8h\n"
2234       "ldr q4, [x9, #0x0]\n"
2235       "add x11, x11, #0x20\n"
2236       ".inst 0x6e45ec0e  // bfmmla v14.4s, v0.8h, v5.8h\n"
2237       ".inst 0x6e45ec5a  // bfmmla v26.4s, v2.8h, v5.8h\n"
2238       "ldr q5, [x9, #0x10]\n"
2239       "add x10, x10, #0x20\n"
2240       ".inst 0x6e46ec09  // bfmmla v9.4s, v0.8h, v6.8h\n"
2241       ".inst 0x6e46ec55  // bfmmla v21.4s, v2.8h, v6.8h\n"
2242       "ldr q6, [x28, #0x0]\n"
2243       "add x9, x9, #0x20\n"
2244       ".inst 0x6e47ec0f  // bfmmla v15.4s, v0.8h, v7.8h\n"
2245       ".inst 0x6e47ec5b  // bfmmla v27.4s, v2.8h, v7.8h\n"
2246       "ldr q7, [x28, #0x10]\n"
2247       "add x28, x28, #0x20\n"
2248       ".inst 0x6e44ec0a  // bfmmla v10.4s, v0.8h, v4.8h\n"
2249       ".inst 0x6e44ec56  // bfmmla v22.4s, v2.8h, v4.8h\n"
2250       "ldr q4, [x27, #0x0]\n"
2251       ".inst 0x6e45ec10  // bfmmla v16.4s, v0.8h, v5.8h\n"
2252       ".inst 0x6e45ec5c  // bfmmla v28.4s, v2.8h, v5.8h\n"
2253       "ldr q5, [x27, #0x10]\n"
2254       "add x27, x27, #0x20\n"
2255       ".inst 0x6e46ec0b  // bfmmla v11.4s, v0.8h, v6.8h\n"
2256       ".inst 0x6e46ec57  // bfmmla v23.4s, v2.8h, v6.8h\n"
2257       "ldr q6, [x26, #0x0]\n"
2258       ".inst 0x6e47ec11  // bfmmla v17.4s, v0.8h, v7.8h\n"
2259       ".inst 0x6e47ec5d  // bfmmla v29.4s, v2.8h, v7.8h\n"
2260       "ldr q7, [x26, #0x10]\n"
2261       "add x26, x26, #0x20\n"
2262       ".inst 0x6e44ec0c  // bfmmla v12.4s, v0.8h, v4.8h\n"
2263       ".inst 0x6e44ec58  // bfmmla v24.4s, v2.8h, v4.8h\n"
2264       ".inst 0x6e45ec12  // bfmmla v18.4s, v0.8h, v5.8h\n"
2265       ".inst 0x6e45ec5e  // bfmmla v30.4s, v2.8h, v5.8h\n"
2266       ".inst 0x6e46ec0d  // bfmmla v13.4s, v0.8h, v6.8h\n"
2267       ".inst 0x6e46ec59  // bfmmla v25.4s, v2.8h, v6.8h\n"
2268       ".inst 0x6e47ec13  // bfmmla v19.4s, v0.8h, v7.8h\n"
2269       ".inst 0x6e47ec5f  // bfmmla v31.4s, v2.8h, v7.8h\n"
2270       "161:"  // Height 4: Multiply loop: No odd multiplies
2271       "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
2272       "add x25, x25, #0x1\n"
2273       "cmp x25, x19\n"
2274       "bne 153b\n"
2275       "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
2276       "add x22, x12, x19, LSL #2\n"
2277       "add x21, x22, x19, LSL #2\n"
2278       "uzp1 v4.2d, v8.2d, v14.2d\n"
2279       "uzp2 v8.2d, v8.2d, v14.2d\n"
2280       "uzp1 v14.2d, v9.2d, v15.2d\n"
2281       "add x20, x21, x19, LSL #2\n"
2282       "uzp2 v9.2d, v9.2d, v15.2d\n"
2283       "uzp1 v15.2d, v10.2d, v16.2d\n"
2284       "uzp2 v10.2d, v10.2d, v16.2d\n"
2285       "uzp1 v16.2d, v11.2d, v17.2d\n"
2286       "uzp2 v11.2d, v11.2d, v17.2d\n"
2287       "uzp1 v17.2d, v12.2d, v18.2d\n"
2288       "uzp2 v12.2d, v12.2d, v18.2d\n"
2289       "uzp1 v18.2d, v13.2d, v19.2d\n"
2290       "uzp2 v13.2d, v13.2d, v19.2d\n"
2291       "uzp1 v19.2d, v20.2d, v26.2d\n"
2292       "uzp2 v20.2d, v20.2d, v26.2d\n"
2293       "uzp1 v26.2d, v21.2d, v27.2d\n"
2294       "uzp2 v21.2d, v21.2d, v27.2d\n"
2295       "uzp1 v27.2d, v22.2d, v28.2d\n"
2296       "uzp2 v22.2d, v22.2d, v28.2d\n"
2297       "uzp1 v28.2d, v23.2d, v29.2d\n"
2298       "uzp2 v23.2d, v23.2d, v29.2d\n"
2299       "uzp1 v29.2d, v24.2d, v30.2d\n"
2300       "uzp2 v24.2d, v24.2d, v30.2d\n"
2301       "uzp1 v30.2d, v25.2d, v31.2d\n"
2302       "uzp2 v25.2d, v25.2d, v31.2d\n"
2303       "tbz %x[flags], #1, 162f\n"
2304       "add x19, %x[args_ptr], %[offset_max]\n"
2305       "ld1r { v1.4s }, [x19]\n"
2306       "add x19, %x[args_ptr], %[offset_min]\n"
2307       "ld1r { v0.4s }, [x19]\n"
2308       "fmin v4.4s, v4.4s, v1.4s\n"
2309       "fmin v14.4s, v14.4s, v1.4s\n"
2310       "fmin v15.4s, v15.4s, v1.4s\n"
2311       "fmin v16.4s, v16.4s, v1.4s\n"
2312       "fmin v17.4s, v17.4s, v1.4s\n"
2313       "fmin v18.4s, v18.4s, v1.4s\n"
2314       "fmin v8.4s, v8.4s, v1.4s\n"
2315       "fmin v9.4s, v9.4s, v1.4s\n"
2316       "fmin v10.4s, v10.4s, v1.4s\n"
2317       "fmin v11.4s, v11.4s, v1.4s\n"
2318       "fmin v12.4s, v12.4s, v1.4s\n"
2319       "fmin v13.4s, v13.4s, v1.4s\n"
2320       "fmin v19.4s, v19.4s, v1.4s\n"
2321       "fmin v26.4s, v26.4s, v1.4s\n"
2322       "fmin v27.4s, v27.4s, v1.4s\n"
2323       "fmin v28.4s, v28.4s, v1.4s\n"
2324       "fmin v29.4s, v29.4s, v1.4s\n"
2325       "fmin v30.4s, v30.4s, v1.4s\n"
2326       "fmin v20.4s, v20.4s, v1.4s\n"
2327       "fmin v21.4s, v21.4s, v1.4s\n"
2328       "fmin v22.4s, v22.4s, v1.4s\n"
2329       "fmin v23.4s, v23.4s, v1.4s\n"
2330       "fmin v24.4s, v24.4s, v1.4s\n"
2331       "fmin v25.4s, v25.4s, v1.4s\n"
2332       "fmax v4.4s, v4.4s, v0.4s\n"
2333       "fmax v14.4s, v14.4s, v0.4s\n"
2334       "fmax v15.4s, v15.4s, v0.4s\n"
2335       "fmax v16.4s, v16.4s, v0.4s\n"
2336       "fmax v17.4s, v17.4s, v0.4s\n"
2337       "fmax v18.4s, v18.4s, v0.4s\n"
2338       "fmax v8.4s, v8.4s, v0.4s\n"
2339       "fmax v9.4s, v9.4s, v0.4s\n"
2340       "fmax v10.4s, v10.4s, v0.4s\n"
2341       "fmax v11.4s, v11.4s, v0.4s\n"
2342       "fmax v12.4s, v12.4s, v0.4s\n"
2343       "fmax v13.4s, v13.4s, v0.4s\n"
2344       "fmax v19.4s, v19.4s, v0.4s\n"
2345       "fmax v26.4s, v26.4s, v0.4s\n"
2346       "fmax v27.4s, v27.4s, v0.4s\n"
2347       "fmax v28.4s, v28.4s, v0.4s\n"
2348       "fmax v29.4s, v29.4s, v0.4s\n"
2349       "fmax v30.4s, v30.4s, v0.4s\n"
2350       "fmax v20.4s, v20.4s, v0.4s\n"
2351       "fmax v21.4s, v21.4s, v0.4s\n"
2352       "fmax v22.4s, v22.4s, v0.4s\n"
2353       "fmax v23.4s, v23.4s, v0.4s\n"
2354       "fmax v24.4s, v24.4s, v0.4s\n"
2355       "fmax v25.4s, v25.4s, v0.4s\n"
2356       "162:"  // Height 4: No activation
2357       "cmp x13, #0x18\n"
2358       "bge 175f\n"
2359       "tbz x13, #4, 166f\n"
2360       "st1 { v4.4s }, [x12], #0x10\n"
2361       "st1 { v14.4s }, [x12], #0x10\n"
2362       "st1 { v15.4s }, [x12], #0x10\n"
2363       "st1 { v16.4s }, [x12], #0x10\n"
2364       "st1 { v8.4s }, [x22], #0x10\n"
2365       "st1 { v9.4s }, [x22], #0x10\n"
2366       "st1 { v10.4s }, [x22], #0x10\n"
2367       "st1 { v11.4s }, [x22], #0x10\n"
2368       "st1 { v19.4s }, [x21], #0x10\n"
2369       "st1 { v26.4s }, [x21], #0x10\n"
2370       "st1 { v27.4s }, [x21], #0x10\n"
2371       "st1 { v28.4s }, [x21], #0x10\n"
2372       "st1 { v20.4s }, [x20], #0x10\n"
2373       "st1 { v21.4s }, [x20], #0x10\n"
2374       "st1 { v22.4s }, [x20], #0x10\n"
2375       "st1 { v23.4s }, [x20], #0x10\n"
2376       "tbz x13, #2, 164f\n"
2377       "st1 { v17.4s }, [x12], #0x10\n"
2378       "st1 { v12.4s }, [x22], #0x10\n"
2379       "st1 { v29.4s }, [x21], #0x10\n"
2380       "st1 { v24.4s }, [x20], #0x10\n"
2381       "tbz x13, #1, 163f\n"
2382       "str d18, [x12], #0x8\n"
2383       "str d13, [x22], #0x8\n"
2384       "str d30, [x21], #0x8\n"
2385       "str d25, [x20], #0x8\n"
2386       "tbz x13, #0, 174f\n"
2387       "st1 { v18.s }[2], [x12]\n"
2388       "st1 { v13.s }[2], [x22]\n"
2389       "st1 { v30.s }[2], [x21]\n"
2390       "st1 { v25.s }[2], [x20]\n"
2391       "b 174f\n"
2392       "163:"  // Height 4: Partial direct writeback: partial_1_20
2393       "tbz x13, #0, 174f\n"
2394       "str s18, [x12, #0x0]\n"
2395       "str s13, [x22, #0x0]\n"
2396       "str s30, [x21, #0x0]\n"
2397       "str s25, [x20, #0x0]\n"
2398       "b 174f\n"
2399       "164:"  // Height 4: Partial direct writeback: partial_2_16
2400       "tbz x13, #1, 165f\n"
2401       "str d17, [x12], #0x8\n"
2402       "str d12, [x22], #0x8\n"
2403       "str d29, [x21], #0x8\n"
2404       "str d24, [x20], #0x8\n"
2405       "tbz x13, #0, 174f\n"
2406       "st1 { v17.s }[2], [x12]\n"
2407       "st1 { v12.s }[2], [x22]\n"
2408       "st1 { v29.s }[2], [x21]\n"
2409       "st1 { v24.s }[2], [x20]\n"
2410       "b 174f\n"
2411       "165:"  // Height 4: Partial direct writeback: partial_1_16
2412       "tbz x13, #0, 174f\n"
2413       "str s17, [x12, #0x0]\n"
2414       "str s12, [x22, #0x0]\n"
2415       "str s29, [x21, #0x0]\n"
2416       "str s24, [x20, #0x0]\n"
2417       "b 174f\n"
2418       "166:"  // Height 4: Partial direct writeback: partial_8_0
2419       "tbz x13, #3, 170f\n"
2420       "st1 { v4.4s }, [x12], #0x10\n"
2421       "st1 { v14.4s }, [x12], #0x10\n"
2422       "st1 { v8.4s }, [x22], #0x10\n"
2423       "st1 { v9.4s }, [x22], #0x10\n"
2424       "st1 { v19.4s }, [x21], #0x10\n"
2425       "st1 { v26.4s }, [x21], #0x10\n"
2426       "st1 { v20.4s }, [x20], #0x10\n"
2427       "st1 { v21.4s }, [x20], #0x10\n"
2428       "tbz x13, #2, 168f\n"
2429       "st1 { v15.4s }, [x12], #0x10\n"
2430       "st1 { v10.4s }, [x22], #0x10\n"
2431       "st1 { v27.4s }, [x21], #0x10\n"
2432       "st1 { v22.4s }, [x20], #0x10\n"
2433       "tbz x13, #1, 167f\n"
2434       "str d16, [x12], #0x8\n"
2435       "str d11, [x22], #0x8\n"
2436       "str d28, [x21], #0x8\n"
2437       "str d23, [x20], #0x8\n"
2438       "tbz x13, #0, 174f\n"
2439       "st1 { v16.s }[2], [x12]\n"
2440       "st1 { v11.s }[2], [x22]\n"
2441       "st1 { v28.s }[2], [x21]\n"
2442       "st1 { v23.s }[2], [x20]\n"
2443       "b 174f\n"
2444       "167:"  // Height 4: Partial direct writeback: partial_1_12
2445       "tbz x13, #0, 174f\n"
2446       "str s16, [x12, #0x0]\n"
2447       "str s11, [x22, #0x0]\n"
2448       "str s28, [x21, #0x0]\n"
2449       "str s23, [x20, #0x0]\n"
2450       "b 174f\n"
2451       "168:"  // Height 4: Partial direct writeback: partial_2_8
2452       "tbz x13, #1, 169f\n"
2453       "str d15, [x12], #0x8\n"
2454       "str d10, [x22], #0x8\n"
2455       "str d27, [x21], #0x8\n"
2456       "str d22, [x20], #0x8\n"
2457       "tbz x13, #0, 174f\n"
2458       "st1 { v15.s }[2], [x12]\n"
2459       "st1 { v10.s }[2], [x22]\n"
2460       "st1 { v27.s }[2], [x21]\n"
2461       "st1 { v22.s }[2], [x20]\n"
2462       "b 174f\n"
2463       "169:"  // Height 4: Partial direct writeback: partial_1_8
2464       "tbz x13, #0, 174f\n"
2465       "str s15, [x12, #0x0]\n"
2466       "str s10, [x22, #0x0]\n"
2467       "str s27, [x21, #0x0]\n"
2468       "str s22, [x20, #0x0]\n"
2469       "b 174f\n"
2470       "170:"  // Height 4: Partial direct writeback: partial_4_0
2471       "tbz x13, #2, 172f\n"
2472       "st1 { v4.4s }, [x12], #0x10\n"
2473       "st1 { v8.4s }, [x22], #0x10\n"
2474       "st1 { v19.4s }, [x21], #0x10\n"
2475       "st1 { v20.4s }, [x20], #0x10\n"
2476       "tbz x13, #1, 171f\n"
2477       "str d14, [x12], #0x8\n"
2478       "str d9, [x22], #0x8\n"
2479       "str d26, [x21], #0x8\n"
2480       "str d21, [x20], #0x8\n"
2481       "tbz x13, #0, 174f\n"
2482       "st1 { v14.s }[2], [x12]\n"
2483       "st1 { v9.s }[2], [x22]\n"
2484       "st1 { v26.s }[2], [x21]\n"
2485       "st1 { v21.s }[2], [x20]\n"
2486       "b 174f\n"
2487       "171:"  // Height 4: Partial direct writeback: partial_1_4
2488       "tbz x13, #0, 174f\n"
2489       "str s14, [x12, #0x0]\n"
2490       "str s9, [x22, #0x0]\n"
2491       "str s26, [x21, #0x0]\n"
2492       "str s21, [x20, #0x0]\n"
2493       "b 174f\n"
2494       "172:"  // Height 4: Partial direct writeback: partial_2_0
2495       "tbz x13, #1, 173f\n"
2496       "str d4, [x12], #0x8\n"
2497       "str d8, [x22], #0x8\n"
2498       "str d19, [x21], #0x8\n"
2499       "str d20, [x20], #0x8\n"
2500       "tbz x13, #0, 174f\n"
2501       "st1 { v4.s }[2], [x12]\n"
2502       "st1 { v8.s }[2], [x22]\n"
2503       "st1 { v19.s }[2], [x21]\n"
2504       "st1 { v20.s }[2], [x20]\n"
2505       "b 174f\n"
2506       "173:"  // Height 4: Partial direct writeback: partial_1_0
2507       "str s4, [x12, #0x0]\n"
2508       "str s8, [x22, #0x0]\n"
2509       "str s19, [x21, #0x0]\n"
2510       "str s20, [x20, #0x0]\n"
2511       "174:"  // Height 4: Partial direct writeback: Done
2512       "b 176f\n"
2513       "175:"  // Height 4: Full writeback
2514       "str q4, [x12, #0x0]\n"
2515       "str q14, [x12, #0x10]\n"
2516       "str q15, [x12, #0x20]\n"
2517       "str q16, [x12, #0x30]\n"
2518       "str q17, [x12, #0x40]\n"
2519       "str q18, [x12, #0x50]\n"
2520       "add x12, x12, #0x60\n"
2521       "str q8, [x22, #0x0]\n"
2522       "str q9, [x22, #0x10]\n"
2523       "str q10, [x22, #0x20]\n"
2524       "str q11, [x22, #0x30]\n"
2525       "str q12, [x22, #0x40]\n"
2526       "str q13, [x22, #0x50]\n"
2527       "str q19, [x21, #0x0]\n"
2528       "str q26, [x21, #0x10]\n"
2529       "str q27, [x21, #0x20]\n"
2530       "str q28, [x21, #0x30]\n"
2531       "str q29, [x21, #0x40]\n"
2532       "str q30, [x21, #0x50]\n"
2533       "str q20, [x20, #0x0]\n"
2534       "str q21, [x20, #0x10]\n"
2535       "str q22, [x20, #0x20]\n"
2536       "str q23, [x20, #0x30]\n"
2537       "str q24, [x20, #0x40]\n"
2538       "str q25, [x20, #0x50]\n"
2539       "176:"  // Height 4: Writeback done
2540       "subs x13, x13, #0x18\n"
2541       "bgt 134b\n"
2542       "subs %x[M], %x[M], #0x4\n"
2543       "beq 178f\n"
2544       "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
2545       "tbz %x[flags], #3, 177f\n"
2546       "add x20, x20, #0x4\n"
2547       "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
2548       "b 1b\n"
2549       "177:"  // Update direct input
2550       "mov x19, #0x10\n"
2551       "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
2552       "b 1b\n"
2553       "178:"  // Exit
2554       : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
2555       : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_B_stride] "I" (offsetof(KernelArgs, B_stride)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_cur_B_ptr] "I" (offsetof(KernelArgs, cur_B_ptr)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
2556       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
2557     );
2558 }
2559 
2560 } // namespace arm_gemm
2561 #endif // __aarch64__
2562