1 /*
2 * Copyright (c) 2019-2020 Arm Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22 * IN THE SOFTWARE.
23 */
24 #ifdef __ARM_FEATURE_SVE
25
26 #include "arm_gemm.hpp"
27 #include "../../utils.hpp"
28
29 #include <cassert>
30
31 namespace arm_gemm {
32
sve_hybrid_s8s32_dot_6x4VL(unsigned int num_strings,const unsigned int * string_lengths,IndirectInputArg<int8_t> A_arg,size_t M,size_t N,const int8_t * B_ptr,IndirectOutputArg<int32_t> output_arg,const int32_t *,Activation,bool accumulate)33 void sve_hybrid_s8s32_dot_6x4VL (
34 unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<int8_t> A_arg,
35 size_t M, size_t N, const int8_t *B_ptr, IndirectOutputArg<int32_t> output_arg,
36 const int32_t *, Activation, bool accumulate
37 )
38 {
// Bundle of scalar arguments consumed by the inline assembly in this function.
// The assembly reads each field at an %[offsetof_*] displacement from %x[args_ptr].
// NOTE(review): the asm operand bindings are outside this view - presumably args_ptr is &ka; confirm.
// Every member has a `= {}` initialiser, so a field not assigned on a given path stays zero.
39 struct KernelArgs {
// Count of input strings; loaded as a 32-bit value and compared with the string-loop counter to end that loop.
40 unsigned int num_strings = {};
// Array of per-string lengths; the assembly loads one 32-bit entry per string-loop iteration.
41 const unsigned int *string_lengths = {};
// Column count; drives the 32-bit-element predicates and is reduced by four vectors of words per column-loop pass.
42 size_t N = {};
// Base of the B operand data; streamed with byte loads and advanced in vector-length steps.
43 const int8_t *B_ptr = {};
// Indirect output: offset added (scaled by 4) to each row pointer. Direct output: stride (scaled by 4) between rows.
44 size_t output_offset = {};
// Indirect input only: start column, added to the row pointers for the first string (index 0) only.
45 size_t input_initial_col = {};
// Indirect input: start row, used (scaled by 8) to index the per-string pointer table. Direct input: stride between rows.
46 size_t input_offset = {};
47 } ka;
48
49 unsigned long flags=0;
50 void *output_ptr;
51 void *input_ptr;
52
53 if (output_arg.is_indirect) {
54 output_ptr=(void *)(output_arg.indirect.ptr);
55 ka.output_offset=output_arg.indirect.offset;
56 flags |= 0x4;
57 } else {
58 output_ptr=(void *)(output_arg.direct.base);
59 ka.output_offset=output_arg.direct.stride;
60 }
61
62 if (A_arg.is_indirect) {
63 input_ptr=(void *)(A_arg.indirect.ptr);
64 ka.input_offset=A_arg.indirect.start_row;
65 ka.input_initial_col=A_arg.indirect.start_col;
66 flags |= 0x8;
67 } else {
68 assert(num_strings==1);
69 input_ptr=(void *)(A_arg.direct.base);
70 ka.input_offset=A_arg.direct.stride;
71 }
72 if (accumulate) {
73 flags |= 0x1;
74 }
75 ka.num_strings = num_strings;
76 ka.string_lengths = string_lengths;
77 ka.N = N;
78 ka.B_ptr = B_ptr;
79 __asm__ __volatile__(
80 "ptrue p5.b\n"
81 "1:" // Row loop
82 "cmp %x[M], #0x6\n"
83 "bge 61f\n"
84 "cmp %x[M], #0x4\n"
85 "bgt 49f\n"
86 "beq 37f\n"
87 "cmp %x[M], #0x2\n"
88 "bgt 25f\n"
89 "beq 13f\n"
90 "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
91 "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
92 "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
93 "tbz %x[flags], #2, 2f\n"
94 "ldr x13, [%x[output_ptr], #0x0]\n"
95 "add x13, x13, x19, LSL #2\n"
96 "b 3f\n"
97 "2:" // Height 1: setup direct output
98 "mov x13, %x[output_ptr]\n"
99 "3:" // Height 1: Column loop
100 "mov x19, #0x0\n"
101 "whilelt p4.s, x19, x15\n"
102 "incw x19\n"
103 "whilelt p3.s, x19, x15\n"
104 "incw x19\n"
105 "whilelt p2.s, x19, x15\n"
106 "incw x19\n"
107 "whilelt p1.s, x19, x15\n"
108 "tbz %x[flags], #0, 4f\n"
109 "ld1w { z8.s }, p4/Z, [x13]\n"
110 "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
111 "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
112 "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
113 "b 5f\n"
114 "4:" // Height 1: no accumulate
115 "mov z8.s, #0x0\n"
116 "mov z9.s, #0x0\n"
117 "mov z10.s, #0x0\n"
118 "mov z11.s, #0x0\n"
119 "5:" // Height 1: setup done
120 "mov x12, #0x0\n"
121 "6:" // Height 1: String loop
122 "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
123 "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
124 "ldr w11, [x20, x12, LSL #0x2]\n"
125 "tbz %x[flags], #3, 7f\n"
126 "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
127 "add x20, x20, x19, LSL #3\n"
128 "ldr x10, [x20, #0x0]\n"
129 "cbnz x12, 8f\n"
130 "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
131 "add x10, x10, x19\n"
132 "b 8f\n"
133 "7:" // Height 1: setup direct input
134 "mov x10, %x[input_ptr]\n"
135 "8:" // Height 1: input setup done
136 "cmp x11, #0x10\n"
137 "ble 10f\n"
138 "9:" // Height 1: Multiply loop: Main loop head
139 "ld1b { z6.b }, p5/Z, [x14]\n"
140 "whilelt p0.b, XZR, x11\n"
141 "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
142 "sub x11, x11, #0x10\n"
143 "ld1rqb { z0.b }, p0/Z, [x10]\n"
144 "sdot z8.s, z6.b, z0.b[0]\n"
145 "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
146 "add x10, x10, #0x10\n"
147 "sdot z9.s, z7.b, z0.b[0]\n"
148 "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
149 "cmp x11, #0x10\n"
150 "sdot z10.s, z6.b, z0.b[0]\n"
151 "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n"
152 "prfm pldl1keep, [x10, #0x80]\n"
153 "sdot z11.s, z7.b, z0.b[0]\n"
154 "ld1b { z7.b }, p5/Z, [x14, #5, MUL VL]\n"
155 "sdot z8.s, z6.b, z0.b[1]\n"
156 "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n"
157 "sdot z9.s, z7.b, z0.b[1]\n"
158 "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n"
159 "addvl x14, x14, #16\n"
160 "sdot z10.s, z6.b, z0.b[1]\n"
161 "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n"
162 "sdot z11.s, z7.b, z0.b[1]\n"
163 "ld1b { z7.b }, p5/Z, [x14, #-7, MUL VL]\n"
164 "sdot z8.s, z6.b, z0.b[2]\n"
165 "ld1b { z6.b }, p5/Z, [x14, #-6, MUL VL]\n"
166 "sdot z9.s, z7.b, z0.b[2]\n"
167 "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n"
168 "sdot z10.s, z6.b, z0.b[2]\n"
169 "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n"
170 "sdot z11.s, z7.b, z0.b[2]\n"
171 "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n"
172 "sdot z8.s, z6.b, z0.b[3]\n"
173 "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n"
174 "sdot z9.s, z7.b, z0.b[3]\n"
175 "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n"
176 "sdot z10.s, z6.b, z0.b[3]\n"
177 "sdot z11.s, z7.b, z0.b[3]\n"
178 "bgt 9b\n"
179 "10:" // Height 1: Multiply loop: Single iteration only
180 "ld1b { z6.b }, p5/Z, [x14]\n"
181 "whilelt p0.b, XZR, x11\n"
182 "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
183 "subs x11, x11, #0x4\n"
184 "ld1rqb { z0.b }, p0/Z, [x10]\n"
185 "sdot z8.s, z6.b, z0.b[0]\n"
186 "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
187 "add x10, x10, #0x10\n"
188 "sdot z9.s, z7.b, z0.b[0]\n"
189 "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
190 "addvl x14, x14, #4\n"
191 "sdot z10.s, z6.b, z0.b[0]\n"
192 "sdot z11.s, z7.b, z0.b[0]\n"
193 "ble 11f\n"
194 "ld1b { z6.b }, p5/Z, [x14]\n"
195 "sdot z8.s, z6.b, z0.b[1]\n"
196 "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
197 "subs x11, x11, #0x4\n"
198 "sdot z9.s, z7.b, z0.b[1]\n"
199 "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
200 "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
201 "sdot z10.s, z6.b, z0.b[1]\n"
202 "addvl x14, x14, #4\n"
203 "sdot z11.s, z7.b, z0.b[1]\n"
204 "ble 11f\n"
205 "ld1b { z6.b }, p5/Z, [x14]\n"
206 "sdot z8.s, z6.b, z0.b[2]\n"
207 "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
208 "subs x11, x11, #0x4\n"
209 "sdot z9.s, z7.b, z0.b[2]\n"
210 "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
211 "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
212 "sdot z10.s, z6.b, z0.b[2]\n"
213 "addvl x14, x14, #4\n"
214 "sdot z11.s, z7.b, z0.b[2]\n"
215 "ble 11f\n"
216 "ld1b { z6.b }, p5/Z, [x14]\n"
217 "sdot z8.s, z6.b, z0.b[3]\n"
218 "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
219 "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
220 "sdot z9.s, z7.b, z0.b[3]\n"
221 "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
222 "addvl x14, x14, #4\n"
223 "sdot z10.s, z6.b, z0.b[3]\n"
224 "sdot z11.s, z7.b, z0.b[3]\n"
225 "11:" // Height 1: Multiply loop: multiply skip
226 "prfm pldl1keep, [x10, #0x80]\n"
227 "add x12, x12, #0x1\n"
228 "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
229 "cmp x12, x19\n"
230 "bne 6b\n"
231 "prfm pstl1keep, [x13, #0x0]\n"
232 "st1w { z8.s }, p4, [x13]\n"
233 "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
234 "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
235 "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
236 "addvl x13, x13, #4\n"
237 "12:" // Height 1: Writeback done
238 "mov x19, #0x0\n"
239 "incw x19, ALL, MUL #4\n"
240 "subs x15, x15, x19\n"
241 "bgt 3b\n"
242 "b 74f\n"
243 "13:" // Height 2
244 "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
245 "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
246 "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
247 "tbz %x[flags], #2, 14f\n"
248 "ldr x13, [%x[output_ptr], #0x0]\n"
249 "add x13, x13, x19, LSL #2\n"
250 "ldr x9, [%x[output_ptr], #0x8]\n"
251 "add x9, x9, x19, LSL #2\n"
252 "b 15f\n"
253 "14:" // Height 2: setup direct output
254 "mov x13, %x[output_ptr]\n"
255 "add x9, x13, x19, LSL #2\n"
256 "15:" // Height 2: Column loop
257 "mov x19, #0x0\n"
258 "whilelt p4.s, x19, x15\n"
259 "incw x19\n"
260 "whilelt p3.s, x19, x15\n"
261 "incw x19\n"
262 "whilelt p2.s, x19, x15\n"
263 "incw x19\n"
264 "whilelt p1.s, x19, x15\n"
265 "tbz %x[flags], #0, 16f\n"
266 "ld1w { z8.s }, p4/Z, [x13]\n"
267 "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
268 "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
269 "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
270 "ld1w { z12.s }, p4/Z, [x9]\n"
271 "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n"
272 "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n"
273 "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n"
274 "b 17f\n"
275 "16:" // Height 2: no accumulate
276 "mov z8.s, #0x0\n"
277 "mov z9.s, #0x0\n"
278 "mov z10.s, #0x0\n"
279 "mov z11.s, #0x0\n"
280 "mov z12.s, #0x0\n"
281 "mov z13.s, #0x0\n"
282 "mov z14.s, #0x0\n"
283 "mov z15.s, #0x0\n"
284 "17:" // Height 2: setup done
285 "mov x12, #0x0\n"
286 "18:" // Height 2: String loop
287 "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
288 "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
289 "ldr w11, [x20, x12, LSL #0x2]\n"
290 "tbz %x[flags], #3, 19f\n"
291 "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
292 "add x20, x20, x19, LSL #3\n"
293 "ldr x10, [x20, #0x0]\n"
294 "ldr x28, [x20, #0x8]\n"
295 "cbnz x12, 20f\n"
296 "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
297 "add x10, x10, x19\n"
298 "add x28, x28, x19\n"
299 "b 20f\n"
300 "19:" // Height 2: setup direct input
301 "mov x10, %x[input_ptr]\n"
302 "add x28, x10, x19\n"
303 "20:" // Height 2: input setup done
304 "cmp x11, #0x10\n"
305 "ble 22f\n"
306 "21:" // Height 2: Multiply loop: Main loop head
307 "ld1b { z6.b }, p5/Z, [x14]\n"
308 "whilelt p0.b, XZR, x11\n"
309 "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
310 "sub x11, x11, #0x10\n"
311 "ld1rqb { z0.b }, p0/Z, [x10]\n"
312 "sdot z8.s, z6.b, z0.b[0]\n"
313 "ld1rqb { z1.b }, p0/Z, [x28]\n"
314 "add x10, x10, #0x10\n"
315 "sdot z9.s, z7.b, z0.b[0]\n"
316 "prfm pldl1keep, [x10, #0x80]\n"
317 "add x28, x28, #0x10\n"
318 "sdot z12.s, z6.b, z1.b[0]\n"
319 "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
320 "cmp x11, #0x10\n"
321 "sdot z13.s, z7.b, z1.b[0]\n"
322 "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
323 "prfm pldl1keep, [x28, #0x80]\n"
324 "sdot z10.s, z6.b, z0.b[0]\n"
325 "sdot z14.s, z6.b, z1.b[0]\n"
326 "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n"
327 "sdot z11.s, z7.b, z0.b[0]\n"
328 "sdot z15.s, z7.b, z1.b[0]\n"
329 "ld1b { z7.b }, p5/Z, [x14, #5, MUL VL]\n"
330 "sdot z8.s, z6.b, z0.b[1]\n"
331 "sdot z12.s, z6.b, z1.b[1]\n"
332 "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n"
333 "sdot z9.s, z7.b, z0.b[1]\n"
334 "sdot z13.s, z7.b, z1.b[1]\n"
335 "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n"
336 "addvl x14, x14, #16\n"
337 "sdot z10.s, z6.b, z0.b[1]\n"
338 "sdot z14.s, z6.b, z1.b[1]\n"
339 "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n"
340 "sdot z11.s, z7.b, z0.b[1]\n"
341 "sdot z15.s, z7.b, z1.b[1]\n"
342 "ld1b { z7.b }, p5/Z, [x14, #-7, MUL VL]\n"
343 "sdot z8.s, z6.b, z0.b[2]\n"
344 "sdot z12.s, z6.b, z1.b[2]\n"
345 "ld1b { z6.b }, p5/Z, [x14, #-6, MUL VL]\n"
346 "sdot z9.s, z7.b, z0.b[2]\n"
347 "sdot z13.s, z7.b, z1.b[2]\n"
348 "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n"
349 "sdot z10.s, z6.b, z0.b[2]\n"
350 "sdot z14.s, z6.b, z1.b[2]\n"
351 "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n"
352 "sdot z11.s, z7.b, z0.b[2]\n"
353 "sdot z15.s, z7.b, z1.b[2]\n"
354 "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n"
355 "sdot z8.s, z6.b, z0.b[3]\n"
356 "sdot z12.s, z6.b, z1.b[3]\n"
357 "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n"
358 "sdot z9.s, z7.b, z0.b[3]\n"
359 "sdot z13.s, z7.b, z1.b[3]\n"
360 "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n"
361 "sdot z10.s, z6.b, z0.b[3]\n"
362 "sdot z14.s, z6.b, z1.b[3]\n"
363 "sdot z11.s, z7.b, z0.b[3]\n"
364 "sdot z15.s, z7.b, z1.b[3]\n"
365 "bgt 21b\n"
366 "22:" // Height 2: Multiply loop: Single iteration only
367 "ld1b { z6.b }, p5/Z, [x14]\n"
368 "whilelt p0.b, XZR, x11\n"
369 "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
370 "subs x11, x11, #0x4\n"
371 "ld1rqb { z0.b }, p0/Z, [x10]\n"
372 "sdot z8.s, z6.b, z0.b[0]\n"
373 "ld1rqb { z1.b }, p0/Z, [x28]\n"
374 "add x10, x10, #0x10\n"
375 "sdot z9.s, z7.b, z0.b[0]\n"
376 "add x28, x28, #0x10\n"
377 "sdot z12.s, z6.b, z1.b[0]\n"
378 "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
379 "sdot z13.s, z7.b, z1.b[0]\n"
380 "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
381 "addvl x14, x14, #4\n"
382 "sdot z10.s, z6.b, z0.b[0]\n"
383 "sdot z14.s, z6.b, z1.b[0]\n"
384 "sdot z11.s, z7.b, z0.b[0]\n"
385 "sdot z15.s, z7.b, z1.b[0]\n"
386 "ble 23f\n"
387 "ld1b { z6.b }, p5/Z, [x14]\n"
388 "sdot z8.s, z6.b, z0.b[1]\n"
389 "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
390 "subs x11, x11, #0x4\n"
391 "sdot z12.s, z6.b, z1.b[1]\n"
392 "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
393 "sdot z9.s, z7.b, z0.b[1]\n"
394 "sdot z13.s, z7.b, z1.b[1]\n"
395 "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
396 "addvl x14, x14, #4\n"
397 "sdot z10.s, z6.b, z0.b[1]\n"
398 "sdot z14.s, z6.b, z1.b[1]\n"
399 "sdot z11.s, z7.b, z0.b[1]\n"
400 "sdot z15.s, z7.b, z1.b[1]\n"
401 "ble 23f\n"
402 "ld1b { z6.b }, p5/Z, [x14]\n"
403 "sdot z8.s, z6.b, z0.b[2]\n"
404 "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
405 "subs x11, x11, #0x4\n"
406 "sdot z12.s, z6.b, z1.b[2]\n"
407 "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
408 "sdot z9.s, z7.b, z0.b[2]\n"
409 "sdot z13.s, z7.b, z1.b[2]\n"
410 "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
411 "addvl x14, x14, #4\n"
412 "sdot z10.s, z6.b, z0.b[2]\n"
413 "sdot z14.s, z6.b, z1.b[2]\n"
414 "sdot z11.s, z7.b, z0.b[2]\n"
415 "sdot z15.s, z7.b, z1.b[2]\n"
416 "ble 23f\n"
417 "ld1b { z6.b }, p5/Z, [x14]\n"
418 "sdot z8.s, z6.b, z0.b[3]\n"
419 "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
420 "sdot z12.s, z6.b, z1.b[3]\n"
421 "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
422 "sdot z9.s, z7.b, z0.b[3]\n"
423 "sdot z13.s, z7.b, z1.b[3]\n"
424 "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
425 "addvl x14, x14, #4\n"
426 "sdot z10.s, z6.b, z0.b[3]\n"
427 "sdot z14.s, z6.b, z1.b[3]\n"
428 "sdot z11.s, z7.b, z0.b[3]\n"
429 "sdot z15.s, z7.b, z1.b[3]\n"
430 "23:" // Height 2: Multiply loop: multiply skip
431 "prfm pldl1keep, [x10, #0x80]\n"
432 "add x12, x12, #0x1\n"
433 "prfm pldl1keep, [x28, #0x80]\n"
434 "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
435 "cmp x12, x19\n"
436 "bne 18b\n"
437 "prfm pstl1keep, [x13, #0x0]\n"
438 "prfm pstl1keep, [x9, #0x0]\n"
439 "st1w { z8.s }, p4, [x13]\n"
440 "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
441 "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
442 "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
443 "addvl x13, x13, #4\n"
444 "st1w { z12.s }, p4, [x9]\n"
445 "st1w { z13.s }, p3, [x9, #1, MUL VL]\n"
446 "st1w { z14.s }, p2, [x9, #2, MUL VL]\n"
447 "st1w { z15.s }, p1, [x9, #3, MUL VL]\n"
448 "addvl x9, x9, #4\n"
449 "24:" // Height 2: Writeback done
450 "mov x19, #0x0\n"
451 "incw x19, ALL, MUL #4\n"
452 "subs x15, x15, x19\n"
453 "bgt 15b\n"
454 "b 74f\n"
455 "25:" // Height 3
456 "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
457 "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
458 "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
459 "tbz %x[flags], #2, 26f\n"
460 "ldr x13, [%x[output_ptr], #0x0]\n"
461 "add x13, x13, x19, LSL #2\n"
462 "ldr x9, [%x[output_ptr], #0x8]\n"
463 "ldr x27, [%x[output_ptr], #0x10]\n"
464 "add x9, x9, x19, LSL #2\n"
465 "add x27, x27, x19, LSL #2\n"
466 "b 27f\n"
467 "26:" // Height 3: setup direct output
468 "mov x13, %x[output_ptr]\n"
469 "add x9, x13, x19, LSL #2\n"
470 "add x27, x9, x19, LSL #2\n"
471 "27:" // Height 3: Column loop
472 "mov x19, #0x0\n"
473 "whilelt p4.s, x19, x15\n"
474 "incw x19\n"
475 "whilelt p3.s, x19, x15\n"
476 "incw x19\n"
477 "whilelt p2.s, x19, x15\n"
478 "incw x19\n"
479 "whilelt p1.s, x19, x15\n"
480 "tbz %x[flags], #0, 28f\n"
481 "ld1w { z8.s }, p4/Z, [x13]\n"
482 "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
483 "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
484 "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
485 "ld1w { z12.s }, p4/Z, [x9]\n"
486 "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n"
487 "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n"
488 "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n"
489 "ld1w { z16.s }, p4/Z, [x27]\n"
490 "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n"
491 "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n"
492 "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n"
493 "b 29f\n"
494 "28:" // Height 3: no accumulate
495 "mov z8.s, #0x0\n"
496 "mov z9.s, #0x0\n"
497 "mov z10.s, #0x0\n"
498 "mov z11.s, #0x0\n"
499 "mov z12.s, #0x0\n"
500 "mov z13.s, #0x0\n"
501 "mov z14.s, #0x0\n"
502 "mov z15.s, #0x0\n"
503 "mov z16.s, #0x0\n"
504 "mov z17.s, #0x0\n"
505 "mov z18.s, #0x0\n"
506 "mov z19.s, #0x0\n"
507 "29:" // Height 3: setup done
508 "mov x12, #0x0\n"
509 "30:" // Height 3: String loop
510 "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
511 "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
512 "ldr w11, [x20, x12, LSL #0x2]\n"
513 "tbz %x[flags], #3, 31f\n"
514 "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
515 "add x20, x20, x19, LSL #3\n"
516 "ldr x10, [x20, #0x0]\n"
517 "ldr x28, [x20, #0x8]\n"
518 "ldr x26, [x20, #0x10]\n"
519 "cbnz x12, 32f\n"
520 "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
521 "add x10, x10, x19\n"
522 "add x28, x28, x19\n"
523 "add x26, x26, x19\n"
524 "b 32f\n"
525 "31:" // Height 3: setup direct input
526 "mov x10, %x[input_ptr]\n"
527 "add x28, x10, x19\n"
528 "add x26, x28, x19\n"
529 "32:" // Height 3: input setup done
530 "cmp x11, #0x10\n"
531 "ble 34f\n"
532 "33:" // Height 3: Multiply loop: Main loop head
533 "ld1b { z6.b }, p5/Z, [x14]\n"
534 "whilelt p0.b, XZR, x11\n"
535 "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
536 "sub x11, x11, #0x10\n"
537 "ld1rqb { z0.b }, p0/Z, [x10]\n"
538 "sdot z8.s, z6.b, z0.b[0]\n"
539 "ld1rqb { z1.b }, p0/Z, [x28]\n"
540 "add x10, x10, #0x10\n"
541 "sdot z9.s, z7.b, z0.b[0]\n"
542 "ld1rqb { z2.b }, p0/Z, [x26]\n"
543 "add x28, x28, #0x10\n"
544 "sdot z12.s, z6.b, z1.b[0]\n"
545 "prfm pldl1keep, [x10, #0x80]\n"
546 "add x26, x26, #0x10\n"
547 "sdot z16.s, z6.b, z2.b[0]\n"
548 "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
549 "cmp x11, #0x10\n"
550 "sdot z13.s, z7.b, z1.b[0]\n"
551 "prfm pldl1keep, [x28, #0x80]\n"
552 "sdot z17.s, z7.b, z2.b[0]\n"
553 "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
554 "prfm pldl1keep, [x26, #0x80]\n"
555 "sdot z10.s, z6.b, z0.b[0]\n"
556 "sdot z14.s, z6.b, z1.b[0]\n"
557 "sdot z18.s, z6.b, z2.b[0]\n"
558 "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n"
559 "sdot z11.s, z7.b, z0.b[0]\n"
560 "sdot z15.s, z7.b, z1.b[0]\n"
561 "sdot z19.s, z7.b, z2.b[0]\n"
562 "ld1b { z7.b }, p5/Z, [x14, #5, MUL VL]\n"
563 "sdot z8.s, z6.b, z0.b[1]\n"
564 "sdot z12.s, z6.b, z1.b[1]\n"
565 "sdot z16.s, z6.b, z2.b[1]\n"
566 "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n"
567 "sdot z9.s, z7.b, z0.b[1]\n"
568 "sdot z13.s, z7.b, z1.b[1]\n"
569 "sdot z17.s, z7.b, z2.b[1]\n"
570 "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n"
571 "addvl x14, x14, #16\n"
572 "sdot z10.s, z6.b, z0.b[1]\n"
573 "sdot z14.s, z6.b, z1.b[1]\n"
574 "sdot z18.s, z6.b, z2.b[1]\n"
575 "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n"
576 "sdot z11.s, z7.b, z0.b[1]\n"
577 "sdot z15.s, z7.b, z1.b[1]\n"
578 "sdot z19.s, z7.b, z2.b[1]\n"
579 "ld1b { z7.b }, p5/Z, [x14, #-7, MUL VL]\n"
580 "sdot z8.s, z6.b, z0.b[2]\n"
581 "sdot z12.s, z6.b, z1.b[2]\n"
582 "sdot z16.s, z6.b, z2.b[2]\n"
583 "ld1b { z6.b }, p5/Z, [x14, #-6, MUL VL]\n"
584 "sdot z9.s, z7.b, z0.b[2]\n"
585 "sdot z13.s, z7.b, z1.b[2]\n"
586 "sdot z17.s, z7.b, z2.b[2]\n"
587 "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n"
588 "sdot z10.s, z6.b, z0.b[2]\n"
589 "sdot z14.s, z6.b, z1.b[2]\n"
590 "sdot z18.s, z6.b, z2.b[2]\n"
591 "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n"
592 "sdot z11.s, z7.b, z0.b[2]\n"
593 "sdot z15.s, z7.b, z1.b[2]\n"
594 "sdot z19.s, z7.b, z2.b[2]\n"
595 "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n"
596 "sdot z8.s, z6.b, z0.b[3]\n"
597 "sdot z12.s, z6.b, z1.b[3]\n"
598 "sdot z16.s, z6.b, z2.b[3]\n"
599 "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n"
600 "sdot z9.s, z7.b, z0.b[3]\n"
601 "sdot z13.s, z7.b, z1.b[3]\n"
602 "sdot z17.s, z7.b, z2.b[3]\n"
603 "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n"
604 "sdot z10.s, z6.b, z0.b[3]\n"
605 "sdot z14.s, z6.b, z1.b[3]\n"
606 "sdot z18.s, z6.b, z2.b[3]\n"
607 "sdot z11.s, z7.b, z0.b[3]\n"
608 "sdot z15.s, z7.b, z1.b[3]\n"
609 "sdot z19.s, z7.b, z2.b[3]\n"
610 "bgt 33b\n"
611 "34:" // Height 3: Multiply loop: Single iteration only
612 "ld1b { z6.b }, p5/Z, [x14]\n"
613 "whilelt p0.b, XZR, x11\n"
614 "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
615 "subs x11, x11, #0x4\n"
616 "ld1rqb { z0.b }, p0/Z, [x10]\n"
617 "sdot z8.s, z6.b, z0.b[0]\n"
618 "ld1rqb { z1.b }, p0/Z, [x28]\n"
619 "add x10, x10, #0x10\n"
620 "sdot z9.s, z7.b, z0.b[0]\n"
621 "ld1rqb { z2.b }, p0/Z, [x26]\n"
622 "add x28, x28, #0x10\n"
623 "sdot z12.s, z6.b, z1.b[0]\n"
624 "add x26, x26, #0x10\n"
625 "sdot z13.s, z7.b, z1.b[0]\n"
626 "sdot z16.s, z6.b, z2.b[0]\n"
627 "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
628 "sdot z17.s, z7.b, z2.b[0]\n"
629 "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
630 "addvl x14, x14, #4\n"
631 "sdot z10.s, z6.b, z0.b[0]\n"
632 "sdot z14.s, z6.b, z1.b[0]\n"
633 "sdot z18.s, z6.b, z2.b[0]\n"
634 "sdot z11.s, z7.b, z0.b[0]\n"
635 "sdot z15.s, z7.b, z1.b[0]\n"
636 "sdot z19.s, z7.b, z2.b[0]\n"
637 "ble 35f\n"
638 "ld1b { z6.b }, p5/Z, [x14]\n"
639 "sdot z8.s, z6.b, z0.b[1]\n"
640 "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
641 "subs x11, x11, #0x4\n"
642 "sdot z12.s, z6.b, z1.b[1]\n"
643 "sdot z16.s, z6.b, z2.b[1]\n"
644 "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
645 "sdot z9.s, z7.b, z0.b[1]\n"
646 "sdot z13.s, z7.b, z1.b[1]\n"
647 "sdot z17.s, z7.b, z2.b[1]\n"
648 "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
649 "addvl x14, x14, #4\n"
650 "sdot z10.s, z6.b, z0.b[1]\n"
651 "sdot z14.s, z6.b, z1.b[1]\n"
652 "sdot z18.s, z6.b, z2.b[1]\n"
653 "sdot z11.s, z7.b, z0.b[1]\n"
654 "sdot z15.s, z7.b, z1.b[1]\n"
655 "sdot z19.s, z7.b, z2.b[1]\n"
656 "ble 35f\n"
657 "ld1b { z6.b }, p5/Z, [x14]\n"
658 "sdot z8.s, z6.b, z0.b[2]\n"
659 "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
660 "subs x11, x11, #0x4\n"
661 "sdot z12.s, z6.b, z1.b[2]\n"
662 "sdot z16.s, z6.b, z2.b[2]\n"
663 "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
664 "sdot z9.s, z7.b, z0.b[2]\n"
665 "sdot z13.s, z7.b, z1.b[2]\n"
666 "sdot z17.s, z7.b, z2.b[2]\n"
667 "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
668 "addvl x14, x14, #4\n"
669 "sdot z10.s, z6.b, z0.b[2]\n"
670 "sdot z14.s, z6.b, z1.b[2]\n"
671 "sdot z18.s, z6.b, z2.b[2]\n"
672 "sdot z11.s, z7.b, z0.b[2]\n"
673 "sdot z15.s, z7.b, z1.b[2]\n"
674 "sdot z19.s, z7.b, z2.b[2]\n"
675 "ble 35f\n"
676 "ld1b { z6.b }, p5/Z, [x14]\n"
677 "sdot z8.s, z6.b, z0.b[3]\n"
678 "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
679 "sdot z12.s, z6.b, z1.b[3]\n"
680 "sdot z16.s, z6.b, z2.b[3]\n"
681 "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
682 "sdot z9.s, z7.b, z0.b[3]\n"
683 "sdot z13.s, z7.b, z1.b[3]\n"
684 "sdot z17.s, z7.b, z2.b[3]\n"
685 "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
686 "addvl x14, x14, #4\n"
687 "sdot z10.s, z6.b, z0.b[3]\n"
688 "sdot z14.s, z6.b, z1.b[3]\n"
689 "sdot z18.s, z6.b, z2.b[3]\n"
690 "sdot z11.s, z7.b, z0.b[3]\n"
691 "sdot z15.s, z7.b, z1.b[3]\n"
692 "sdot z19.s, z7.b, z2.b[3]\n"
693 "35:" // Height 3: Multiply loop: multiply skip
694 "prfm pldl1keep, [x10, #0x80]\n"
695 "add x12, x12, #0x1\n"
696 "prfm pldl1keep, [x28, #0x80]\n"
697 "prfm pldl1keep, [x26, #0x80]\n"
698 "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
699 "cmp x12, x19\n"
700 "bne 30b\n"
701 "prfm pstl1keep, [x13, #0x0]\n"
702 "prfm pstl1keep, [x9, #0x0]\n"
703 "prfm pstl1keep, [x27, #0x0]\n"
704 "st1w { z8.s }, p4, [x13]\n"
705 "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
706 "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
707 "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
708 "addvl x13, x13, #4\n"
709 "st1w { z12.s }, p4, [x9]\n"
710 "st1w { z13.s }, p3, [x9, #1, MUL VL]\n"
711 "st1w { z14.s }, p2, [x9, #2, MUL VL]\n"
712 "st1w { z15.s }, p1, [x9, #3, MUL VL]\n"
713 "addvl x9, x9, #4\n"
714 "st1w { z16.s }, p4, [x27]\n"
715 "st1w { z17.s }, p3, [x27, #1, MUL VL]\n"
716 "st1w { z18.s }, p2, [x27, #2, MUL VL]\n"
717 "st1w { z19.s }, p1, [x27, #3, MUL VL]\n"
718 "addvl x27, x27, #4\n"
719 "36:" // Height 3: Writeback done
720 "mov x19, #0x0\n"
721 "incw x19, ALL, MUL #4\n"
722 "subs x15, x15, x19\n"
723 "bgt 27b\n"
724 "b 74f\n"
725 "37:" // Height 4
726 "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
727 "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
728 "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
729 "tbz %x[flags], #2, 38f\n"
730 "ldr x13, [%x[output_ptr], #0x0]\n"
731 "add x13, x13, x19, LSL #2\n"
732 "ldr x9, [%x[output_ptr], #0x8]\n"
733 "ldr x27, [%x[output_ptr], #0x10]\n"
734 "add x9, x9, x19, LSL #2\n"
735 "ldr x25, [%x[output_ptr], #0x18]\n"
736 "add x27, x27, x19, LSL #2\n"
737 "add x25, x25, x19, LSL #2\n"
738 "b 39f\n"
739 "38:" // Height 4: setup direct output
740 "mov x13, %x[output_ptr]\n"
741 "add x9, x13, x19, LSL #2\n"
742 "add x27, x9, x19, LSL #2\n"
743 "add x25, x27, x19, LSL #2\n"
744 "39:" // Height 4: Column loop
745 "mov x19, #0x0\n"
746 "whilelt p4.s, x19, x15\n"
747 "incw x19\n"
748 "whilelt p3.s, x19, x15\n"
749 "incw x19\n"
750 "whilelt p2.s, x19, x15\n"
751 "incw x19\n"
752 "whilelt p1.s, x19, x15\n"
753 "tbz %x[flags], #0, 40f\n"
754 "ld1w { z8.s }, p4/Z, [x13]\n"
755 "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
756 "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
757 "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
758 "ld1w { z12.s }, p4/Z, [x9]\n"
759 "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n"
760 "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n"
761 "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n"
762 "ld1w { z16.s }, p4/Z, [x27]\n"
763 "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n"
764 "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n"
765 "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n"
766 "ld1w { z20.s }, p4/Z, [x25]\n"
767 "ld1w { z21.s }, p3/Z, [x25, #1, MUL VL]\n"
768 "ld1w { z22.s }, p2/Z, [x25, #2, MUL VL]\n"
769 "ld1w { z23.s }, p1/Z, [x25, #3, MUL VL]\n"
770 "b 41f\n"
771 "40:" // Height 4: no accumulate
772 "mov z8.s, #0x0\n"
773 "mov z9.s, #0x0\n"
774 "mov z10.s, #0x0\n"
775 "mov z11.s, #0x0\n"
776 "mov z12.s, #0x0\n"
777 "mov z13.s, #0x0\n"
778 "mov z14.s, #0x0\n"
779 "mov z15.s, #0x0\n"
780 "mov z16.s, #0x0\n"
781 "mov z17.s, #0x0\n"
782 "mov z18.s, #0x0\n"
783 "mov z19.s, #0x0\n"
784 "mov z20.s, #0x0\n"
785 "mov z21.s, #0x0\n"
786 "mov z22.s, #0x0\n"
787 "mov z23.s, #0x0\n"
788 "41:" // Height 4: setup done
789 "mov x12, #0x0\n"
790 "42:" // Height 4: String loop
791 "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
792 "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
793 "ldr w11, [x20, x12, LSL #0x2]\n"
794 "tbz %x[flags], #3, 43f\n"
795 "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
796 "add x20, x20, x19, LSL #3\n"
797 "ldr x10, [x20, #0x0]\n"
798 "ldr x28, [x20, #0x8]\n"
799 "ldr x26, [x20, #0x10]\n"
800 "ldr x24, [x20, #0x18]\n"
801 "cbnz x12, 44f\n"
802 "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
803 "add x10, x10, x19\n"
804 "add x28, x28, x19\n"
805 "add x26, x26, x19\n"
806 "add x24, x24, x19\n"
807 "b 44f\n"
808 "43:" // Height 4: setup direct input
809 "mov x10, %x[input_ptr]\n"
810 "add x28, x10, x19\n"
811 "add x26, x28, x19\n"
812 "add x24, x26, x19\n"
813 "44:" // Height 4: input setup done
814 "cmp x11, #0x10\n"
815 "ble 46f\n"
816 "45:" // Height 4: Multiply loop: Main loop head
817 "ld1b { z6.b }, p5/Z, [x14]\n"
818 "whilelt p0.b, XZR, x11\n"
819 "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
820 "sub x11, x11, #0x10\n"
821 "ld1rqb { z0.b }, p0/Z, [x10]\n"
822 "sdot z8.s, z6.b, z0.b[0]\n"
823 "ld1rqb { z1.b }, p0/Z, [x28]\n"
824 "add x10, x10, #0x10\n"
825 "sdot z9.s, z7.b, z0.b[0]\n"
826 "ld1rqb { z2.b }, p0/Z, [x26]\n"
827 "add x28, x28, #0x10\n"
828 "sdot z12.s, z6.b, z1.b[0]\n"
829 "ld1rqb { z3.b }, p0/Z, [x24]\n"
830 "add x26, x26, #0x10\n"
831 "sdot z16.s, z6.b, z2.b[0]\n"
832 "prfm pldl1keep, [x10, #0x80]\n"
833 "add x24, x24, #0x10\n"
834 "sdot z13.s, z7.b, z1.b[0]\n"
835 "prfm pldl1keep, [x28, #0x80]\n"
836 "cmp x11, #0x10\n"
837 "sdot z20.s, z6.b, z3.b[0]\n"
838 "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
839 "sdot z17.s, z7.b, z2.b[0]\n"
840 "prfm pldl1keep, [x26, #0x80]\n"
841 "sdot z21.s, z7.b, z3.b[0]\n"
842 "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
843 "prfm pldl1keep, [x24, #0x80]\n"
844 "sdot z10.s, z6.b, z0.b[0]\n"
845 "sdot z14.s, z6.b, z1.b[0]\n"
846 "sdot z18.s, z6.b, z2.b[0]\n"
847 "sdot z22.s, z6.b, z3.b[0]\n"
848 "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n"
849 "sdot z11.s, z7.b, z0.b[0]\n"
850 "sdot z15.s, z7.b, z1.b[0]\n"
851 "sdot z19.s, z7.b, z2.b[0]\n"
852 "sdot z23.s, z7.b, z3.b[0]\n"
853 "ld1b { z7.b }, p5/Z, [x14, #5, MUL VL]\n"
854 "sdot z8.s, z6.b, z0.b[1]\n"
855 "sdot z12.s, z6.b, z1.b[1]\n"
856 "sdot z16.s, z6.b, z2.b[1]\n"
857 "sdot z20.s, z6.b, z3.b[1]\n"
858 "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n"
859 "sdot z9.s, z7.b, z0.b[1]\n"
860 "sdot z13.s, z7.b, z1.b[1]\n"
861 "sdot z17.s, z7.b, z2.b[1]\n"
862 "sdot z21.s, z7.b, z3.b[1]\n"
863 "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n"
864 "addvl x14, x14, #16\n"
865 "sdot z10.s, z6.b, z0.b[1]\n"
866 "sdot z14.s, z6.b, z1.b[1]\n"
867 "sdot z18.s, z6.b, z2.b[1]\n"
868 "sdot z22.s, z6.b, z3.b[1]\n"
869 "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n"
870 "sdot z11.s, z7.b, z0.b[1]\n"
871 "sdot z15.s, z7.b, z1.b[1]\n"
872 "sdot z19.s, z7.b, z2.b[1]\n"
873 "sdot z23.s, z7.b, z3.b[1]\n"
874 "ld1b { z7.b }, p5/Z, [x14, #-7, MUL VL]\n"
875 "sdot z8.s, z6.b, z0.b[2]\n"
876 "sdot z12.s, z6.b, z1.b[2]\n"
877 "sdot z16.s, z6.b, z2.b[2]\n"
878 "sdot z20.s, z6.b, z3.b[2]\n"
879 "ld1b { z6.b }, p5/Z, [x14, #-6, MUL VL]\n"
880 "sdot z9.s, z7.b, z0.b[2]\n"
881 "sdot z13.s, z7.b, z1.b[2]\n"
882 "sdot z17.s, z7.b, z2.b[2]\n"
883 "sdot z21.s, z7.b, z3.b[2]\n"
884 "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n"
885 "sdot z10.s, z6.b, z0.b[2]\n"
886 "sdot z14.s, z6.b, z1.b[2]\n"
887 "sdot z18.s, z6.b, z2.b[2]\n"
888 "sdot z22.s, z6.b, z3.b[2]\n"
889 "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n"
890 "sdot z11.s, z7.b, z0.b[2]\n"
891 "sdot z15.s, z7.b, z1.b[2]\n"
892 "sdot z19.s, z7.b, z2.b[2]\n"
893 "sdot z23.s, z7.b, z3.b[2]\n"
894 "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n"
895 "sdot z8.s, z6.b, z0.b[3]\n"
896 "sdot z12.s, z6.b, z1.b[3]\n"
897 "sdot z16.s, z6.b, z2.b[3]\n"
898 "sdot z20.s, z6.b, z3.b[3]\n"
899 "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n"
900 "sdot z9.s, z7.b, z0.b[3]\n"
901 "sdot z13.s, z7.b, z1.b[3]\n"
902 "sdot z17.s, z7.b, z2.b[3]\n"
903 "sdot z21.s, z7.b, z3.b[3]\n"
904 "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n"
905 "sdot z10.s, z6.b, z0.b[3]\n"
906 "sdot z14.s, z6.b, z1.b[3]\n"
907 "sdot z18.s, z6.b, z2.b[3]\n"
908 "sdot z22.s, z6.b, z3.b[3]\n"
909 "sdot z11.s, z7.b, z0.b[3]\n"
910 "sdot z15.s, z7.b, z1.b[3]\n"
911 "sdot z19.s, z7.b, z2.b[3]\n"
912 "sdot z23.s, z7.b, z3.b[3]\n"
913 "bgt 45b\n"
914 "46:" // Height 4: Multiply loop: Single iteration only
915 "ld1b { z6.b }, p5/Z, [x14]\n"
916 "whilelt p0.b, XZR, x11\n"
917 "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
918 "subs x11, x11, #0x4\n"
919 "ld1rqb { z0.b }, p0/Z, [x10]\n"
920 "sdot z8.s, z6.b, z0.b[0]\n"
921 "ld1rqb { z1.b }, p0/Z, [x28]\n"
922 "add x10, x10, #0x10\n"
923 "sdot z9.s, z7.b, z0.b[0]\n"
924 "ld1rqb { z2.b }, p0/Z, [x26]\n"
925 "add x28, x28, #0x10\n"
926 "sdot z12.s, z6.b, z1.b[0]\n"
927 "ld1rqb { z3.b }, p0/Z, [x24]\n"
928 "add x26, x26, #0x10\n"
929 "sdot z16.s, z6.b, z2.b[0]\n"
930 "add x24, x24, #0x10\n"
931 "sdot z13.s, z7.b, z1.b[0]\n"
932 "sdot z17.s, z7.b, z2.b[0]\n"
933 "sdot z20.s, z6.b, z3.b[0]\n"
934 "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
935 "sdot z21.s, z7.b, z3.b[0]\n"
936 "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
937 "addvl x14, x14, #4\n"
938 "sdot z10.s, z6.b, z0.b[0]\n"
939 "sdot z14.s, z6.b, z1.b[0]\n"
940 "sdot z18.s, z6.b, z2.b[0]\n"
941 "sdot z22.s, z6.b, z3.b[0]\n"
942 "sdot z11.s, z7.b, z0.b[0]\n"
943 "sdot z15.s, z7.b, z1.b[0]\n"
944 "sdot z19.s, z7.b, z2.b[0]\n"
945 "sdot z23.s, z7.b, z3.b[0]\n"
946 "ble 47f\n"
947 "ld1b { z6.b }, p5/Z, [x14]\n"
948 "sdot z8.s, z6.b, z0.b[1]\n"
949 "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
950 "subs x11, x11, #0x4\n"
951 "sdot z12.s, z6.b, z1.b[1]\n"
952 "sdot z16.s, z6.b, z2.b[1]\n"
953 "sdot z20.s, z6.b, z3.b[1]\n"
954 "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
955 "sdot z9.s, z7.b, z0.b[1]\n"
956 "sdot z13.s, z7.b, z1.b[1]\n"
957 "sdot z17.s, z7.b, z2.b[1]\n"
958 "sdot z21.s, z7.b, z3.b[1]\n"
959 "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
960 "addvl x14, x14, #4\n"
961 "sdot z10.s, z6.b, z0.b[1]\n"
962 "sdot z14.s, z6.b, z1.b[1]\n"
963 "sdot z18.s, z6.b, z2.b[1]\n"
964 "sdot z22.s, z6.b, z3.b[1]\n"
965 "sdot z11.s, z7.b, z0.b[1]\n"
966 "sdot z15.s, z7.b, z1.b[1]\n"
967 "sdot z19.s, z7.b, z2.b[1]\n"
968 "sdot z23.s, z7.b, z3.b[1]\n"
969 "ble 47f\n"
970 "ld1b { z6.b }, p5/Z, [x14]\n"
971 "sdot z8.s, z6.b, z0.b[2]\n"
972 "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
973 "subs x11, x11, #0x4\n"
974 "sdot z12.s, z6.b, z1.b[2]\n"
975 "sdot z16.s, z6.b, z2.b[2]\n"
976 "sdot z20.s, z6.b, z3.b[2]\n"
977 "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
978 "sdot z9.s, z7.b, z0.b[2]\n"
979 "sdot z13.s, z7.b, z1.b[2]\n"
980 "sdot z17.s, z7.b, z2.b[2]\n"
981 "sdot z21.s, z7.b, z3.b[2]\n"
982 "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
983 "addvl x14, x14, #4\n"
984 "sdot z10.s, z6.b, z0.b[2]\n"
985 "sdot z14.s, z6.b, z1.b[2]\n"
986 "sdot z18.s, z6.b, z2.b[2]\n"
987 "sdot z22.s, z6.b, z3.b[2]\n"
988 "sdot z11.s, z7.b, z0.b[2]\n"
989 "sdot z15.s, z7.b, z1.b[2]\n"
990 "sdot z19.s, z7.b, z2.b[2]\n"
991 "sdot z23.s, z7.b, z3.b[2]\n"
992 "ble 47f\n"
993 "ld1b { z6.b }, p5/Z, [x14]\n"
994 "sdot z8.s, z6.b, z0.b[3]\n"
995 "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
996 "sdot z12.s, z6.b, z1.b[3]\n"
997 "sdot z16.s, z6.b, z2.b[3]\n"
998 "sdot z20.s, z6.b, z3.b[3]\n"
999 "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
1000 "sdot z9.s, z7.b, z0.b[3]\n"
1001 "sdot z13.s, z7.b, z1.b[3]\n"
1002 "sdot z17.s, z7.b, z2.b[3]\n"
1003 "sdot z21.s, z7.b, z3.b[3]\n"
1004 "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
1005 "addvl x14, x14, #4\n"
1006 "sdot z10.s, z6.b, z0.b[3]\n"
1007 "sdot z14.s, z6.b, z1.b[3]\n"
1008 "sdot z18.s, z6.b, z2.b[3]\n"
1009 "sdot z22.s, z6.b, z3.b[3]\n"
1010 "sdot z11.s, z7.b, z0.b[3]\n"
1011 "sdot z15.s, z7.b, z1.b[3]\n"
1012 "sdot z19.s, z7.b, z2.b[3]\n"
1013 "sdot z23.s, z7.b, z3.b[3]\n"
1014 "47:" // Height 4: Multiply loop: multiply skip
1015 "prfm pldl1keep, [x10, #0x80]\n"
1016 "add x12, x12, #0x1\n"
1017 "prfm pldl1keep, [x28, #0x80]\n"
1018 "prfm pldl1keep, [x26, #0x80]\n"
1019 "prfm pldl1keep, [x24, #0x80]\n"
1020 "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
1021 "cmp x12, x19\n"
1022 "bne 42b\n"
1023 "prfm pstl1keep, [x13, #0x0]\n"
1024 "prfm pstl1keep, [x9, #0x0]\n"
1025 "prfm pstl1keep, [x27, #0x0]\n"
1026 "prfm pstl1keep, [x25, #0x0]\n"
1027 "st1w { z8.s }, p4, [x13]\n"
1028 "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
1029 "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
1030 "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
1031 "addvl x13, x13, #4\n"
1032 "st1w { z12.s }, p4, [x9]\n"
1033 "st1w { z13.s }, p3, [x9, #1, MUL VL]\n"
1034 "st1w { z14.s }, p2, [x9, #2, MUL VL]\n"
1035 "st1w { z15.s }, p1, [x9, #3, MUL VL]\n"
1036 "addvl x9, x9, #4\n"
1037 "st1w { z16.s }, p4, [x27]\n"
1038 "st1w { z17.s }, p3, [x27, #1, MUL VL]\n"
1039 "st1w { z18.s }, p2, [x27, #2, MUL VL]\n"
1040 "st1w { z19.s }, p1, [x27, #3, MUL VL]\n"
1041 "addvl x27, x27, #4\n"
1042 "st1w { z20.s }, p4, [x25]\n"
1043 "st1w { z21.s }, p3, [x25, #1, MUL VL]\n"
1044 "st1w { z22.s }, p2, [x25, #2, MUL VL]\n"
1045 "st1w { z23.s }, p1, [x25, #3, MUL VL]\n"
1046 "addvl x25, x25, #4\n"
1047 "48:" // Height 4: Writeback done
1048 "mov x19, #0x0\n"
1049 "incw x19, ALL, MUL #4\n"
1050 "subs x15, x15, x19\n"
1051 "bgt 39b\n"
1052 "b 74f\n"
1053 "49:" // Height 5
1054 "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
1055 "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
1056 "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
1057 "tbz %x[flags], #2, 50f\n"
1058 "ldr x13, [%x[output_ptr], #0x0]\n"
1059 "add x13, x13, x19, LSL #2\n"
1060 "ldr x9, [%x[output_ptr], #0x8]\n"
1061 "ldr x27, [%x[output_ptr], #0x10]\n"
1062 "add x9, x9, x19, LSL #2\n"
1063 "ldr x25, [%x[output_ptr], #0x18]\n"
1064 "ldr x23, [%x[output_ptr], #0x20]\n"
1065 "add x27, x27, x19, LSL #2\n"
1066 "add x25, x25, x19, LSL #2\n"
1067 "add x23, x23, x19, LSL #2\n"
1068 "b 51f\n"
1069 "50:" // Height 5: setup direct output
1070 "mov x13, %x[output_ptr]\n"
1071 "add x9, x13, x19, LSL #2\n"
1072 "add x27, x9, x19, LSL #2\n"
1073 "add x25, x27, x19, LSL #2\n"
1074 "add x23, x25, x19, LSL #2\n"
1075 "51:" // Height 5: Column loop
1076 "mov x19, #0x0\n"
1077 "whilelt p4.s, x19, x15\n"
1078 "incw x19\n"
1079 "whilelt p3.s, x19, x15\n"
1080 "incw x19\n"
1081 "whilelt p2.s, x19, x15\n"
1082 "incw x19\n"
1083 "whilelt p1.s, x19, x15\n"
1084 "tbz %x[flags], #0, 52f\n"
1085 "ld1w { z8.s }, p4/Z, [x13]\n"
1086 "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
1087 "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
1088 "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
1089 "ld1w { z12.s }, p4/Z, [x9]\n"
1090 "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n"
1091 "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n"
1092 "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n"
1093 "ld1w { z16.s }, p4/Z, [x27]\n"
1094 "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n"
1095 "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n"
1096 "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n"
1097 "ld1w { z20.s }, p4/Z, [x25]\n"
1098 "ld1w { z21.s }, p3/Z, [x25, #1, MUL VL]\n"
1099 "ld1w { z22.s }, p2/Z, [x25, #2, MUL VL]\n"
1100 "ld1w { z23.s }, p1/Z, [x25, #3, MUL VL]\n"
1101 "ld1w { z24.s }, p4/Z, [x23]\n"
1102 "ld1w { z25.s }, p3/Z, [x23, #1, MUL VL]\n"
1103 "ld1w { z26.s }, p2/Z, [x23, #2, MUL VL]\n"
1104 "ld1w { z27.s }, p1/Z, [x23, #3, MUL VL]\n"
1105 "b 53f\n"
1106 "52:" // Height 5: no accumulate
1107 "mov z8.s, #0x0\n"
1108 "mov z9.s, #0x0\n"
1109 "mov z10.s, #0x0\n"
1110 "mov z11.s, #0x0\n"
1111 "mov z12.s, #0x0\n"
1112 "mov z13.s, #0x0\n"
1113 "mov z14.s, #0x0\n"
1114 "mov z15.s, #0x0\n"
1115 "mov z16.s, #0x0\n"
1116 "mov z17.s, #0x0\n"
1117 "mov z18.s, #0x0\n"
1118 "mov z19.s, #0x0\n"
1119 "mov z20.s, #0x0\n"
1120 "mov z21.s, #0x0\n"
1121 "mov z22.s, #0x0\n"
1122 "mov z23.s, #0x0\n"
1123 "mov z24.s, #0x0\n"
1124 "mov z25.s, #0x0\n"
1125 "mov z26.s, #0x0\n"
1126 "mov z27.s, #0x0\n"
1127 "53:" // Height 5: setup done
1128 "mov x12, #0x0\n"
1129 "54:" // Height 5: String loop
1130 "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
1131 "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
1132 "ldr w11, [x20, x12, LSL #0x2]\n"
1133 "tbz %x[flags], #3, 55f\n"
1134 "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
1135 "add x20, x20, x19, LSL #3\n"
1136 "ldr x10, [x20, #0x0]\n"
1137 "ldr x28, [x20, #0x8]\n"
1138 "ldr x26, [x20, #0x10]\n"
1139 "ldr x24, [x20, #0x18]\n"
1140 "ldr x22, [x20, #0x20]\n"
1141 "cbnz x12, 56f\n"
1142 "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
1143 "add x10, x10, x19\n"
1144 "add x28, x28, x19\n"
1145 "add x26, x26, x19\n"
1146 "add x24, x24, x19\n"
1147 "add x22, x22, x19\n"
1148 "b 56f\n"
1149 "55:" // Height 5: setup direct input
1150 "mov x10, %x[input_ptr]\n"
1151 "add x28, x10, x19\n"
1152 "add x26, x28, x19\n"
1153 "add x24, x26, x19\n"
1154 "add x22, x24, x19\n"
1155 "56:" // Height 5: input setup done
1156 "cmp x11, #0x10\n"
1157 "ble 58f\n"
1158 "57:" // Height 5: Multiply loop: Main loop head
1159 "ld1b { z6.b }, p5/Z, [x14]\n"
1160 "whilelt p0.b, XZR, x11\n"
1161 "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
1162 "sub x11, x11, #0x10\n"
1163 "ld1rqb { z0.b }, p0/Z, [x10]\n"
1164 "sdot z8.s, z6.b, z0.b[0]\n"
1165 "ld1rqb { z1.b }, p0/Z, [x28]\n"
1166 "add x10, x10, #0x10\n"
1167 "sdot z9.s, z7.b, z0.b[0]\n"
1168 "ld1rqb { z2.b }, p0/Z, [x26]\n"
1169 "add x28, x28, #0x10\n"
1170 "sdot z12.s, z6.b, z1.b[0]\n"
1171 "ld1rqb { z3.b }, p0/Z, [x24]\n"
1172 "add x26, x26, #0x10\n"
1173 "sdot z16.s, z6.b, z2.b[0]\n"
1174 "ld1rqb { z4.b }, p0/Z, [x22]\n"
1175 "add x24, x24, #0x10\n"
1176 "sdot z13.s, z7.b, z1.b[0]\n"
1177 "prfm pldl1keep, [x10, #0x80]\n"
1178 "add x22, x22, #0x10\n"
1179 "sdot z20.s, z6.b, z3.b[0]\n"
1180 "prfm pldl1keep, [x28, #0x80]\n"
1181 "cmp x11, #0x10\n"
1182 "sdot z24.s, z6.b, z4.b[0]\n"
1183 "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
1184 "sdot z17.s, z7.b, z2.b[0]\n"
1185 "prfm pldl1keep, [x26, #0x80]\n"
1186 "sdot z21.s, z7.b, z3.b[0]\n"
1187 "prfm pldl1keep, [x24, #0x80]\n"
1188 "sdot z25.s, z7.b, z4.b[0]\n"
1189 "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
1190 "sdot z10.s, z6.b, z0.b[0]\n"
1191 "prfm pldl1keep, [x22, #0x80]\n"
1192 "sdot z14.s, z6.b, z1.b[0]\n"
1193 "sdot z18.s, z6.b, z2.b[0]\n"
1194 "sdot z22.s, z6.b, z3.b[0]\n"
1195 "sdot z26.s, z6.b, z4.b[0]\n"
1196 "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n"
1197 "sdot z11.s, z7.b, z0.b[0]\n"
1198 "sdot z15.s, z7.b, z1.b[0]\n"
1199 "sdot z19.s, z7.b, z2.b[0]\n"
1200 "sdot z23.s, z7.b, z3.b[0]\n"
1201 "sdot z27.s, z7.b, z4.b[0]\n"
1202 "ld1b { z7.b }, p5/Z, [x14, #5, MUL VL]\n"
1203 "sdot z8.s, z6.b, z0.b[1]\n"
1204 "sdot z12.s, z6.b, z1.b[1]\n"
1205 "sdot z16.s, z6.b, z2.b[1]\n"
1206 "sdot z20.s, z6.b, z3.b[1]\n"
1207 "sdot z24.s, z6.b, z4.b[1]\n"
1208 "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n"
1209 "sdot z9.s, z7.b, z0.b[1]\n"
1210 "sdot z13.s, z7.b, z1.b[1]\n"
1211 "sdot z17.s, z7.b, z2.b[1]\n"
1212 "sdot z21.s, z7.b, z3.b[1]\n"
1213 "sdot z25.s, z7.b, z4.b[1]\n"
1214 "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n"
1215 "addvl x14, x14, #16\n"
1216 "sdot z10.s, z6.b, z0.b[1]\n"
1217 "sdot z14.s, z6.b, z1.b[1]\n"
1218 "sdot z18.s, z6.b, z2.b[1]\n"
1219 "sdot z22.s, z6.b, z3.b[1]\n"
1220 "sdot z26.s, z6.b, z4.b[1]\n"
1221 "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n"
1222 "sdot z11.s, z7.b, z0.b[1]\n"
1223 "sdot z15.s, z7.b, z1.b[1]\n"
1224 "sdot z19.s, z7.b, z2.b[1]\n"
1225 "sdot z23.s, z7.b, z3.b[1]\n"
1226 "sdot z27.s, z7.b, z4.b[1]\n"
1227 "ld1b { z7.b }, p5/Z, [x14, #-7, MUL VL]\n"
1228 "sdot z8.s, z6.b, z0.b[2]\n"
1229 "sdot z12.s, z6.b, z1.b[2]\n"
1230 "sdot z16.s, z6.b, z2.b[2]\n"
1231 "sdot z20.s, z6.b, z3.b[2]\n"
1232 "sdot z24.s, z6.b, z4.b[2]\n"
1233 "ld1b { z6.b }, p5/Z, [x14, #-6, MUL VL]\n"
1234 "sdot z9.s, z7.b, z0.b[2]\n"
1235 "sdot z13.s, z7.b, z1.b[2]\n"
1236 "sdot z17.s, z7.b, z2.b[2]\n"
1237 "sdot z21.s, z7.b, z3.b[2]\n"
1238 "sdot z25.s, z7.b, z4.b[2]\n"
1239 "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n"
1240 "sdot z10.s, z6.b, z0.b[2]\n"
1241 "sdot z14.s, z6.b, z1.b[2]\n"
1242 "sdot z18.s, z6.b, z2.b[2]\n"
1243 "sdot z22.s, z6.b, z3.b[2]\n"
1244 "sdot z26.s, z6.b, z4.b[2]\n"
1245 "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n"
1246 "sdot z11.s, z7.b, z0.b[2]\n"
1247 "sdot z15.s, z7.b, z1.b[2]\n"
1248 "sdot z19.s, z7.b, z2.b[2]\n"
1249 "sdot z23.s, z7.b, z3.b[2]\n"
1250 "sdot z27.s, z7.b, z4.b[2]\n"
1251 "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n"
1252 "sdot z8.s, z6.b, z0.b[3]\n"
1253 "sdot z12.s, z6.b, z1.b[3]\n"
1254 "sdot z16.s, z6.b, z2.b[3]\n"
1255 "sdot z20.s, z6.b, z3.b[3]\n"
1256 "sdot z24.s, z6.b, z4.b[3]\n"
1257 "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n"
1258 "sdot z9.s, z7.b, z0.b[3]\n"
1259 "sdot z13.s, z7.b, z1.b[3]\n"
1260 "sdot z17.s, z7.b, z2.b[3]\n"
1261 "sdot z21.s, z7.b, z3.b[3]\n"
1262 "sdot z25.s, z7.b, z4.b[3]\n"
1263 "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n"
1264 "sdot z10.s, z6.b, z0.b[3]\n"
1265 "sdot z14.s, z6.b, z1.b[3]\n"
1266 "sdot z18.s, z6.b, z2.b[3]\n"
1267 "sdot z22.s, z6.b, z3.b[3]\n"
1268 "sdot z26.s, z6.b, z4.b[3]\n"
1269 "sdot z11.s, z7.b, z0.b[3]\n"
1270 "sdot z15.s, z7.b, z1.b[3]\n"
1271 "sdot z19.s, z7.b, z2.b[3]\n"
1272 "sdot z23.s, z7.b, z3.b[3]\n"
1273 "sdot z27.s, z7.b, z4.b[3]\n"
1274 "bgt 57b\n"
1275 "58:" // Height 5: Multiply loop: Single iteration only
1276 "ld1b { z6.b }, p5/Z, [x14]\n"
1277 "whilelt p0.b, XZR, x11\n"
1278 "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
1279 "subs x11, x11, #0x4\n"
1280 "ld1rqb { z0.b }, p0/Z, [x10]\n"
1281 "sdot z8.s, z6.b, z0.b[0]\n"
1282 "ld1rqb { z1.b }, p0/Z, [x28]\n"
1283 "add x10, x10, #0x10\n"
1284 "sdot z9.s, z7.b, z0.b[0]\n"
1285 "ld1rqb { z2.b }, p0/Z, [x26]\n"
1286 "add x28, x28, #0x10\n"
1287 "sdot z12.s, z6.b, z1.b[0]\n"
1288 "ld1rqb { z3.b }, p0/Z, [x24]\n"
1289 "add x26, x26, #0x10\n"
1290 "sdot z16.s, z6.b, z2.b[0]\n"
1291 "ld1rqb { z4.b }, p0/Z, [x22]\n"
1292 "add x24, x24, #0x10\n"
1293 "sdot z13.s, z7.b, z1.b[0]\n"
1294 "add x22, x22, #0x10\n"
1295 "sdot z17.s, z7.b, z2.b[0]\n"
1296 "sdot z20.s, z6.b, z3.b[0]\n"
1297 "sdot z24.s, z6.b, z4.b[0]\n"
1298 "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
1299 "sdot z21.s, z7.b, z3.b[0]\n"
1300 "sdot z25.s, z7.b, z4.b[0]\n"
1301 "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
1302 "addvl x14, x14, #4\n"
1303 "sdot z10.s, z6.b, z0.b[0]\n"
1304 "sdot z14.s, z6.b, z1.b[0]\n"
1305 "sdot z18.s, z6.b, z2.b[0]\n"
1306 "sdot z22.s, z6.b, z3.b[0]\n"
1307 "sdot z26.s, z6.b, z4.b[0]\n"
1308 "sdot z11.s, z7.b, z0.b[0]\n"
1309 "sdot z15.s, z7.b, z1.b[0]\n"
1310 "sdot z19.s, z7.b, z2.b[0]\n"
1311 "sdot z23.s, z7.b, z3.b[0]\n"
1312 "sdot z27.s, z7.b, z4.b[0]\n"
1313 "ble 59f\n"
1314 "ld1b { z6.b }, p5/Z, [x14]\n"
1315 "sdot z8.s, z6.b, z0.b[1]\n"
1316 "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
1317 "subs x11, x11, #0x4\n"
1318 "sdot z12.s, z6.b, z1.b[1]\n"
1319 "sdot z16.s, z6.b, z2.b[1]\n"
1320 "sdot z20.s, z6.b, z3.b[1]\n"
1321 "sdot z24.s, z6.b, z4.b[1]\n"
1322 "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
1323 "sdot z9.s, z7.b, z0.b[1]\n"
1324 "sdot z13.s, z7.b, z1.b[1]\n"
1325 "sdot z17.s, z7.b, z2.b[1]\n"
1326 "sdot z21.s, z7.b, z3.b[1]\n"
1327 "sdot z25.s, z7.b, z4.b[1]\n"
1328 "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
1329 "addvl x14, x14, #4\n"
1330 "sdot z10.s, z6.b, z0.b[1]\n"
1331 "sdot z14.s, z6.b, z1.b[1]\n"
1332 "sdot z18.s, z6.b, z2.b[1]\n"
1333 "sdot z22.s, z6.b, z3.b[1]\n"
1334 "sdot z26.s, z6.b, z4.b[1]\n"
1335 "sdot z11.s, z7.b, z0.b[1]\n"
1336 "sdot z15.s, z7.b, z1.b[1]\n"
1337 "sdot z19.s, z7.b, z2.b[1]\n"
1338 "sdot z23.s, z7.b, z3.b[1]\n"
1339 "sdot z27.s, z7.b, z4.b[1]\n"
1340 "ble 59f\n"
1341 "ld1b { z6.b }, p5/Z, [x14]\n"
1342 "sdot z8.s, z6.b, z0.b[2]\n"
1343 "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
1344 "subs x11, x11, #0x4\n"
1345 "sdot z12.s, z6.b, z1.b[2]\n"
1346 "sdot z16.s, z6.b, z2.b[2]\n"
1347 "sdot z20.s, z6.b, z3.b[2]\n"
1348 "sdot z24.s, z6.b, z4.b[2]\n"
1349 "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
1350 "sdot z9.s, z7.b, z0.b[2]\n"
1351 "sdot z13.s, z7.b, z1.b[2]\n"
1352 "sdot z17.s, z7.b, z2.b[2]\n"
1353 "sdot z21.s, z7.b, z3.b[2]\n"
1354 "sdot z25.s, z7.b, z4.b[2]\n"
1355 "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
1356 "addvl x14, x14, #4\n"
1357 "sdot z10.s, z6.b, z0.b[2]\n"
1358 "sdot z14.s, z6.b, z1.b[2]\n"
1359 "sdot z18.s, z6.b, z2.b[2]\n"
1360 "sdot z22.s, z6.b, z3.b[2]\n"
1361 "sdot z26.s, z6.b, z4.b[2]\n"
1362 "sdot z11.s, z7.b, z0.b[2]\n"
1363 "sdot z15.s, z7.b, z1.b[2]\n"
1364 "sdot z19.s, z7.b, z2.b[2]\n"
1365 "sdot z23.s, z7.b, z3.b[2]\n"
1366 "sdot z27.s, z7.b, z4.b[2]\n"
1367 "ble 59f\n"
1368 "ld1b { z6.b }, p5/Z, [x14]\n"
1369 "sdot z8.s, z6.b, z0.b[3]\n"
1370 "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
1371 "sdot z12.s, z6.b, z1.b[3]\n"
1372 "sdot z16.s, z6.b, z2.b[3]\n"
1373 "sdot z20.s, z6.b, z3.b[3]\n"
1374 "sdot z24.s, z6.b, z4.b[3]\n"
1375 "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
1376 "sdot z9.s, z7.b, z0.b[3]\n"
1377 "sdot z13.s, z7.b, z1.b[3]\n"
1378 "sdot z17.s, z7.b, z2.b[3]\n"
1379 "sdot z21.s, z7.b, z3.b[3]\n"
1380 "sdot z25.s, z7.b, z4.b[3]\n"
1381 "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
1382 "addvl x14, x14, #4\n"
1383 "sdot z10.s, z6.b, z0.b[3]\n"
1384 "sdot z14.s, z6.b, z1.b[3]\n"
1385 "sdot z18.s, z6.b, z2.b[3]\n"
1386 "sdot z22.s, z6.b, z3.b[3]\n"
1387 "sdot z26.s, z6.b, z4.b[3]\n"
1388 "sdot z11.s, z7.b, z0.b[3]\n"
1389 "sdot z15.s, z7.b, z1.b[3]\n"
1390 "sdot z19.s, z7.b, z2.b[3]\n"
1391 "sdot z23.s, z7.b, z3.b[3]\n"
1392 "sdot z27.s, z7.b, z4.b[3]\n"
1393 "59:" // Height 5: Multiply loop: multiply skip
1394 "prfm pldl1keep, [x10, #0x80]\n"
1395 "add x12, x12, #0x1\n"
1396 "prfm pldl1keep, [x28, #0x80]\n"
1397 "prfm pldl1keep, [x26, #0x80]\n"
1398 "prfm pldl1keep, [x24, #0x80]\n"
1399 "prfm pldl1keep, [x22, #0x80]\n"
1400 "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
1401 "cmp x12, x19\n"
1402 "bne 54b\n"
1403 "prfm pstl1keep, [x13, #0x0]\n"
1404 "prfm pstl1keep, [x9, #0x0]\n"
1405 "prfm pstl1keep, [x27, #0x0]\n"
1406 "prfm pstl1keep, [x25, #0x0]\n"
1407 "prfm pstl1keep, [x23, #0x0]\n"
1408 "st1w { z8.s }, p4, [x13]\n"
1409 "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
1410 "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
1411 "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
1412 "addvl x13, x13, #4\n"
1413 "st1w { z12.s }, p4, [x9]\n"
1414 "st1w { z13.s }, p3, [x9, #1, MUL VL]\n"
1415 "st1w { z14.s }, p2, [x9, #2, MUL VL]\n"
1416 "st1w { z15.s }, p1, [x9, #3, MUL VL]\n"
1417 "addvl x9, x9, #4\n"
1418 "st1w { z16.s }, p4, [x27]\n"
1419 "st1w { z17.s }, p3, [x27, #1, MUL VL]\n"
1420 "st1w { z18.s }, p2, [x27, #2, MUL VL]\n"
1421 "st1w { z19.s }, p1, [x27, #3, MUL VL]\n"
1422 "addvl x27, x27, #4\n"
1423 "st1w { z20.s }, p4, [x25]\n"
1424 "st1w { z21.s }, p3, [x25, #1, MUL VL]\n"
1425 "st1w { z22.s }, p2, [x25, #2, MUL VL]\n"
1426 "st1w { z23.s }, p1, [x25, #3, MUL VL]\n"
1427 "addvl x25, x25, #4\n"
1428 "st1w { z24.s }, p4, [x23]\n"
1429 "st1w { z25.s }, p3, [x23, #1, MUL VL]\n"
1430 "st1w { z26.s }, p2, [x23, #2, MUL VL]\n"
1431 "st1w { z27.s }, p1, [x23, #3, MUL VL]\n"
1432 "addvl x23, x23, #4\n"
1433 "60:" // Height 5: Writeback done
1434 "mov x19, #0x0\n"
1435 "incw x19, ALL, MUL #4\n"
1436 "subs x15, x15, x19\n"
1437 "bgt 51b\n"
1438 "b 74f\n"
1439 "61:" // Height 6
1440 "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
1441 "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
1442 "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
1443 "tbz %x[flags], #2, 62f\n"
1444 "ldr x13, [%x[output_ptr], #0x0]\n"
1445 "add x13, x13, x19, LSL #2\n"
1446 "ldr x9, [%x[output_ptr], #0x8]\n"
1447 "ldr x27, [%x[output_ptr], #0x10]\n"
1448 "add x9, x9, x19, LSL #2\n"
1449 "ldr x25, [%x[output_ptr], #0x18]\n"
1450 "ldr x23, [%x[output_ptr], #0x20]\n"
1451 "add x27, x27, x19, LSL #2\n"
1452 "ldr x21, [%x[output_ptr], #0x28]\n"
1453 "add %x[output_ptr], %x[output_ptr], #0x30\n"
1454 "add x25, x25, x19, LSL #2\n"
1455 "add x23, x23, x19, LSL #2\n"
1456 "add x21, x21, x19, LSL #2\n"
1457 "b 63f\n"
1458 "62:" // Height 6: setup direct output
1459 "mov x13, %x[output_ptr]\n"
1460 "add x9, x13, x19, LSL #2\n"
1461 "add x27, x9, x19, LSL #2\n"
1462 "add x25, x27, x19, LSL #2\n"
1463 "add x23, x25, x19, LSL #2\n"
1464 "add x21, x23, x19, LSL #2\n"
1465 "add %x[output_ptr], x21, x19, LSL #2\n"
1466 "63:" // Height 6: Column loop
1467 "mov x19, #0x0\n"
1468 "whilelt p4.s, x19, x15\n"
1469 "incw x19\n"
1470 "whilelt p3.s, x19, x15\n"
1471 "incw x19\n"
1472 "whilelt p2.s, x19, x15\n"
1473 "incw x19\n"
1474 "whilelt p1.s, x19, x15\n"
1475 "tbz %x[flags], #0, 64f\n"
1476 "ld1w { z8.s }, p4/Z, [x13]\n"
1477 "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
1478 "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
1479 "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
1480 "ld1w { z12.s }, p4/Z, [x9]\n"
1481 "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n"
1482 "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n"
1483 "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n"
1484 "ld1w { z16.s }, p4/Z, [x27]\n"
1485 "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n"
1486 "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n"
1487 "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n"
1488 "ld1w { z20.s }, p4/Z, [x25]\n"
1489 "ld1w { z21.s }, p3/Z, [x25, #1, MUL VL]\n"
1490 "ld1w { z22.s }, p2/Z, [x25, #2, MUL VL]\n"
1491 "ld1w { z23.s }, p1/Z, [x25, #3, MUL VL]\n"
1492 "ld1w { z24.s }, p4/Z, [x23]\n"
1493 "ld1w { z25.s }, p3/Z, [x23, #1, MUL VL]\n"
1494 "ld1w { z26.s }, p2/Z, [x23, #2, MUL VL]\n"
1495 "ld1w { z27.s }, p1/Z, [x23, #3, MUL VL]\n"
1496 "ld1w { z28.s }, p4/Z, [x21]\n"
1497 "ld1w { z29.s }, p3/Z, [x21, #1, MUL VL]\n"
1498 "ld1w { z30.s }, p2/Z, [x21, #2, MUL VL]\n"
1499 "ld1w { z31.s }, p1/Z, [x21, #3, MUL VL]\n"
1500 "b 65f\n"
1501 "64:" // Height 6: no accumulate
1502 "mov z8.s, #0x0\n"
1503 "mov z9.s, #0x0\n"
1504 "mov z10.s, #0x0\n"
1505 "mov z11.s, #0x0\n"
1506 "mov z12.s, #0x0\n"
1507 "mov z13.s, #0x0\n"
1508 "mov z14.s, #0x0\n"
1509 "mov z15.s, #0x0\n"
1510 "mov z16.s, #0x0\n"
1511 "mov z17.s, #0x0\n"
1512 "mov z18.s, #0x0\n"
1513 "mov z19.s, #0x0\n"
1514 "mov z20.s, #0x0\n"
1515 "mov z21.s, #0x0\n"
1516 "mov z22.s, #0x0\n"
1517 "mov z23.s, #0x0\n"
1518 "mov z24.s, #0x0\n"
1519 "mov z25.s, #0x0\n"
1520 "mov z26.s, #0x0\n"
1521 "mov z27.s, #0x0\n"
1522 "mov z28.s, #0x0\n"
1523 "mov z29.s, #0x0\n"
1524 "mov z30.s, #0x0\n"
1525 "mov z31.s, #0x0\n"
1526 "65:" // Height 6: setup done
1527 "mov x12, #0x0\n"
1528 "66:" // Height 6: String loop
1529 "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
1530 "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
1531 "ldr w11, [x20, x12, LSL #0x2]\n"
1532 "tbz %x[flags], #3, 67f\n"
1533 "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
1534 "add x20, x20, x19, LSL #3\n"
1535 "ldr x10, [x20, #0x0]\n"
1536 "ldr x28, [x20, #0x8]\n"
1537 "ldr x26, [x20, #0x10]\n"
1538 "ldr x24, [x20, #0x18]\n"
1539 "ldr x22, [x20, #0x20]\n"
1540 "ldr x20, [x20, #0x28]\n"
1541 "cbnz x12, 68f\n"
1542 "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
1543 "add x10, x10, x19\n"
1544 "add x28, x28, x19\n"
1545 "add x26, x26, x19\n"
1546 "add x24, x24, x19\n"
1547 "add x22, x22, x19\n"
1548 "add x20, x20, x19\n"
1549 "b 68f\n"
1550 "67:" // Height 6: setup direct input
1551 "mov x10, %x[input_ptr]\n"
1552 "add x28, x10, x19\n"
1553 "add x26, x28, x19\n"
1554 "add x24, x26, x19\n"
1555 "add x22, x24, x19\n"
1556 "add x20, x22, x19\n"
1557 "68:" // Height 6: input setup done
1558 "cmp x11, #0x10\n"
1559 "ble 70f\n"
1560 "69:" // Height 6: Multiply loop: Main loop head
1561 "ld1b { z6.b }, p5/Z, [x14]\n"
1562 "whilelt p0.b, XZR, x11\n"
1563 "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
1564 "sub x11, x11, #0x10\n"
1565 "ld1rqb { z0.b }, p0/Z, [x10]\n"
1566 "sdot z8.s, z6.b, z0.b[0]\n"
1567 "ld1rqb { z1.b }, p0/Z, [x28]\n"
1568 "add x10, x10, #0x10\n"
1569 "sdot z9.s, z7.b, z0.b[0]\n"
1570 "ld1rqb { z2.b }, p0/Z, [x26]\n"
1571 "add x28, x28, #0x10\n"
1572 "sdot z12.s, z6.b, z1.b[0]\n"
1573 "ld1rqb { z3.b }, p0/Z, [x24]\n"
1574 "add x26, x26, #0x10\n"
1575 "sdot z16.s, z6.b, z2.b[0]\n"
1576 "ld1rqb { z4.b }, p0/Z, [x22]\n"
1577 "add x24, x24, #0x10\n"
1578 "sdot z13.s, z7.b, z1.b[0]\n"
1579 "ld1rqb { z5.b }, p0/Z, [x20]\n"
1580 "add x22, x22, #0x10\n"
1581 "sdot z20.s, z6.b, z3.b[0]\n"
1582 "prfm pldl1keep, [x10, #0x80]\n"
1583 "add x20, x20, #0x10\n"
1584 "sdot z24.s, z6.b, z4.b[0]\n"
1585 "prfm pldl1keep, [x28, #0x80]\n"
1586 "cmp x11, #0x10\n"
1587 "sdot z28.s, z6.b, z5.b[0]\n"
1588 "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
1589 "sdot z17.s, z7.b, z2.b[0]\n"
1590 "prfm pldl1keep, [x26, #0x80]\n"
1591 "sdot z21.s, z7.b, z3.b[0]\n"
1592 "prfm pldl1keep, [x24, #0x80]\n"
1593 "sdot z25.s, z7.b, z4.b[0]\n"
1594 "prfm pldl1keep, [x22, #0x80]\n"
1595 "sdot z29.s, z7.b, z5.b[0]\n"
1596 "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
1597 "sdot z10.s, z6.b, z0.b[0]\n"
1598 "prfm pldl1keep, [x20, #0x80]\n"
1599 "sdot z14.s, z6.b, z1.b[0]\n"
1600 "sdot z18.s, z6.b, z2.b[0]\n"
1601 "sdot z22.s, z6.b, z3.b[0]\n"
1602 "sdot z26.s, z6.b, z4.b[0]\n"
1603 "sdot z30.s, z6.b, z5.b[0]\n"
1604 "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n"
1605 "sdot z11.s, z7.b, z0.b[0]\n"
1606 "sdot z15.s, z7.b, z1.b[0]\n"
1607 "sdot z19.s, z7.b, z2.b[0]\n"
1608 "sdot z23.s, z7.b, z3.b[0]\n"
1609 "sdot z27.s, z7.b, z4.b[0]\n"
1610 "sdot z31.s, z7.b, z5.b[0]\n"
1611 "ld1b { z7.b }, p5/Z, [x14, #5, MUL VL]\n"
1612 "sdot z8.s, z6.b, z0.b[1]\n"
1613 "sdot z12.s, z6.b, z1.b[1]\n"
1614 "sdot z16.s, z6.b, z2.b[1]\n"
1615 "sdot z20.s, z6.b, z3.b[1]\n"
1616 "sdot z24.s, z6.b, z4.b[1]\n"
1617 "sdot z28.s, z6.b, z5.b[1]\n"
1618 "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n"
1619 "sdot z9.s, z7.b, z0.b[1]\n"
1620 "sdot z13.s, z7.b, z1.b[1]\n"
1621 "sdot z17.s, z7.b, z2.b[1]\n"
1622 "sdot z21.s, z7.b, z3.b[1]\n"
1623 "sdot z25.s, z7.b, z4.b[1]\n"
1624 "sdot z29.s, z7.b, z5.b[1]\n"
1625 "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n"
1626 "addvl x14, x14, #16\n"
1627 "sdot z10.s, z6.b, z0.b[1]\n"
1628 "sdot z14.s, z6.b, z1.b[1]\n"
1629 "sdot z18.s, z6.b, z2.b[1]\n"
1630 "sdot z22.s, z6.b, z3.b[1]\n"
1631 "sdot z26.s, z6.b, z4.b[1]\n"
1632 "sdot z30.s, z6.b, z5.b[1]\n"
1633 "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n"
1634 "sdot z11.s, z7.b, z0.b[1]\n"
1635 "sdot z15.s, z7.b, z1.b[1]\n"
1636 "sdot z19.s, z7.b, z2.b[1]\n"
1637 "sdot z23.s, z7.b, z3.b[1]\n"
1638 "sdot z27.s, z7.b, z4.b[1]\n"
1639 "sdot z31.s, z7.b, z5.b[1]\n"
1640 "ld1b { z7.b }, p5/Z, [x14, #-7, MUL VL]\n"
1641 "sdot z8.s, z6.b, z0.b[2]\n"
1642 "sdot z12.s, z6.b, z1.b[2]\n"
1643 "sdot z16.s, z6.b, z2.b[2]\n"
1644 "sdot z20.s, z6.b, z3.b[2]\n"
1645 "sdot z24.s, z6.b, z4.b[2]\n"
1646 "sdot z28.s, z6.b, z5.b[2]\n"
1647 "ld1b { z6.b }, p5/Z, [x14, #-6, MUL VL]\n"
1648 "sdot z9.s, z7.b, z0.b[2]\n"
1649 "sdot z13.s, z7.b, z1.b[2]\n"
1650 "sdot z17.s, z7.b, z2.b[2]\n"
1651 "sdot z21.s, z7.b, z3.b[2]\n"
1652 "sdot z25.s, z7.b, z4.b[2]\n"
1653 "sdot z29.s, z7.b, z5.b[2]\n"
1654 "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n"
1655 "sdot z10.s, z6.b, z0.b[2]\n"
1656 "sdot z14.s, z6.b, z1.b[2]\n"
1657 "sdot z18.s, z6.b, z2.b[2]\n"
1658 "sdot z22.s, z6.b, z3.b[2]\n"
1659 "sdot z26.s, z6.b, z4.b[2]\n"
1660 "sdot z30.s, z6.b, z5.b[2]\n"
1661 "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n"
1662 "sdot z11.s, z7.b, z0.b[2]\n"
1663 "sdot z15.s, z7.b, z1.b[2]\n"
1664 "sdot z19.s, z7.b, z2.b[2]\n"
1665 "sdot z23.s, z7.b, z3.b[2]\n"
1666 "sdot z27.s, z7.b, z4.b[2]\n"
1667 "sdot z31.s, z7.b, z5.b[2]\n"
1668 "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n"
1669 "sdot z8.s, z6.b, z0.b[3]\n"
1670 "sdot z12.s, z6.b, z1.b[3]\n"
1671 "sdot z16.s, z6.b, z2.b[3]\n"
1672 "sdot z20.s, z6.b, z3.b[3]\n"
1673 "sdot z24.s, z6.b, z4.b[3]\n"
1674 "sdot z28.s, z6.b, z5.b[3]\n"
1675 "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n"
1676 "sdot z9.s, z7.b, z0.b[3]\n"
1677 "sdot z13.s, z7.b, z1.b[3]\n"
1678 "sdot z17.s, z7.b, z2.b[3]\n"
1679 "sdot z21.s, z7.b, z3.b[3]\n"
1680 "sdot z25.s, z7.b, z4.b[3]\n"
1681 "sdot z29.s, z7.b, z5.b[3]\n"
1682 "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n"
1683 "sdot z10.s, z6.b, z0.b[3]\n"
1684 "sdot z14.s, z6.b, z1.b[3]\n"
1685 "sdot z18.s, z6.b, z2.b[3]\n"
1686 "sdot z22.s, z6.b, z3.b[3]\n"
1687 "sdot z26.s, z6.b, z4.b[3]\n"
1688 "sdot z30.s, z6.b, z5.b[3]\n"
1689 "sdot z11.s, z7.b, z0.b[3]\n"
1690 "sdot z15.s, z7.b, z1.b[3]\n"
1691 "sdot z19.s, z7.b, z2.b[3]\n"
1692 "sdot z23.s, z7.b, z3.b[3]\n"
1693 "sdot z27.s, z7.b, z4.b[3]\n"
1694 "sdot z31.s, z7.b, z5.b[3]\n"
1695 "bgt 69b\n"
1696 "70:" // Height 6: Multiply loop: Single iteration only
1697 "ld1b { z6.b }, p5/Z, [x14]\n"
1698 "whilelt p0.b, XZR, x11\n"
1699 "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
1700 "subs x11, x11, #0x4\n"
1701 "ld1rqb { z0.b }, p0/Z, [x10]\n"
1702 "sdot z8.s, z6.b, z0.b[0]\n"
1703 "ld1rqb { z1.b }, p0/Z, [x28]\n"
1704 "add x10, x10, #0x10\n"
1705 "sdot z9.s, z7.b, z0.b[0]\n"
1706 "ld1rqb { z2.b }, p0/Z, [x26]\n"
1707 "add x28, x28, #0x10\n"
1708 "sdot z12.s, z6.b, z1.b[0]\n"
1709 "ld1rqb { z3.b }, p0/Z, [x24]\n"
1710 "add x26, x26, #0x10\n"
1711 "sdot z16.s, z6.b, z2.b[0]\n"
1712 "ld1rqb { z4.b }, p0/Z, [x22]\n"
1713 "add x24, x24, #0x10\n"
1714 "sdot z13.s, z7.b, z1.b[0]\n"
1715 "ld1rqb { z5.b }, p0/Z, [x20]\n"
1716 "add x22, x22, #0x10\n"
1717 "sdot z20.s, z6.b, z3.b[0]\n"
1718 "add x20, x20, #0x10\n"
1719 "sdot z17.s, z7.b, z2.b[0]\n"
1720 "sdot z24.s, z6.b, z4.b[0]\n"
1721 "sdot z28.s, z6.b, z5.b[0]\n"
1722 "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
1723 "sdot z21.s, z7.b, z3.b[0]\n"
1724 "sdot z25.s, z7.b, z4.b[0]\n"
1725 "sdot z29.s, z7.b, z5.b[0]\n"
1726 "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
1727 "addvl x14, x14, #4\n"
1728 "sdot z10.s, z6.b, z0.b[0]\n"
1729 "sdot z14.s, z6.b, z1.b[0]\n"
1730 "sdot z18.s, z6.b, z2.b[0]\n"
1731 "sdot z22.s, z6.b, z3.b[0]\n"
1732 "sdot z26.s, z6.b, z4.b[0]\n"
1733 "sdot z30.s, z6.b, z5.b[0]\n"
1734 "sdot z11.s, z7.b, z0.b[0]\n"
1735 "sdot z15.s, z7.b, z1.b[0]\n"
1736 "sdot z19.s, z7.b, z2.b[0]\n"
1737 "sdot z23.s, z7.b, z3.b[0]\n"
1738 "sdot z27.s, z7.b, z4.b[0]\n"
1739 "sdot z31.s, z7.b, z5.b[0]\n"
1740 "ble 71f\n"
1741 "ld1b { z6.b }, p5/Z, [x14]\n"
1742 "sdot z8.s, z6.b, z0.b[1]\n"
1743 "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
1744 "subs x11, x11, #0x4\n"
1745 "sdot z12.s, z6.b, z1.b[1]\n"
1746 "sdot z16.s, z6.b, z2.b[1]\n"
1747 "sdot z20.s, z6.b, z3.b[1]\n"
1748 "sdot z24.s, z6.b, z4.b[1]\n"
1749 "sdot z28.s, z6.b, z5.b[1]\n"
1750 "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
1751 "sdot z9.s, z7.b, z0.b[1]\n"
1752 "sdot z13.s, z7.b, z1.b[1]\n"
1753 "sdot z17.s, z7.b, z2.b[1]\n"
1754 "sdot z21.s, z7.b, z3.b[1]\n"
1755 "sdot z25.s, z7.b, z4.b[1]\n"
1756 "sdot z29.s, z7.b, z5.b[1]\n"
1757 "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
1758 "addvl x14, x14, #4\n"
1759 "sdot z10.s, z6.b, z0.b[1]\n"
1760 "sdot z14.s, z6.b, z1.b[1]\n"
1761 "sdot z18.s, z6.b, z2.b[1]\n"
1762 "sdot z22.s, z6.b, z3.b[1]\n"
1763 "sdot z26.s, z6.b, z4.b[1]\n"
1764 "sdot z30.s, z6.b, z5.b[1]\n"
1765 "sdot z11.s, z7.b, z0.b[1]\n"
1766 "sdot z15.s, z7.b, z1.b[1]\n"
1767 "sdot z19.s, z7.b, z2.b[1]\n"
1768 "sdot z23.s, z7.b, z3.b[1]\n"
1769 "sdot z27.s, z7.b, z4.b[1]\n"
1770 "sdot z31.s, z7.b, z5.b[1]\n"
1771 "ble 71f\n"
1772 "ld1b { z6.b }, p5/Z, [x14]\n"
1773 "sdot z8.s, z6.b, z0.b[2]\n"
1774 "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
1775 "subs x11, x11, #0x4\n"
1776 "sdot z12.s, z6.b, z1.b[2]\n"
1777 "sdot z16.s, z6.b, z2.b[2]\n"
1778 "sdot z20.s, z6.b, z3.b[2]\n"
1779 "sdot z24.s, z6.b, z4.b[2]\n"
1780 "sdot z28.s, z6.b, z5.b[2]\n"
1781 "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
1782 "sdot z9.s, z7.b, z0.b[2]\n"
1783 "sdot z13.s, z7.b, z1.b[2]\n"
1784 "sdot z17.s, z7.b, z2.b[2]\n"
1785 "sdot z21.s, z7.b, z3.b[2]\n"
1786 "sdot z25.s, z7.b, z4.b[2]\n"
1787 "sdot z29.s, z7.b, z5.b[2]\n"
1788 "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
1789 "addvl x14, x14, #4\n"
1790 "sdot z10.s, z6.b, z0.b[2]\n"
1791 "sdot z14.s, z6.b, z1.b[2]\n"
1792 "sdot z18.s, z6.b, z2.b[2]\n"
1793 "sdot z22.s, z6.b, z3.b[2]\n"
1794 "sdot z26.s, z6.b, z4.b[2]\n"
1795 "sdot z30.s, z6.b, z5.b[2]\n"
1796 "sdot z11.s, z7.b, z0.b[2]\n"
1797 "sdot z15.s, z7.b, z1.b[2]\n"
1798 "sdot z19.s, z7.b, z2.b[2]\n"
1799 "sdot z23.s, z7.b, z3.b[2]\n"
1800 "sdot z27.s, z7.b, z4.b[2]\n"
1801 "sdot z31.s, z7.b, z5.b[2]\n"
1802 "ble 71f\n"
1803 "ld1b { z6.b }, p5/Z, [x14]\n"
1804 "sdot z8.s, z6.b, z0.b[3]\n"
1805 "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
1806 "sdot z12.s, z6.b, z1.b[3]\n"
1807 "sdot z16.s, z6.b, z2.b[3]\n"
1808 "sdot z20.s, z6.b, z3.b[3]\n"
1809 "sdot z24.s, z6.b, z4.b[3]\n"
1810 "sdot z28.s, z6.b, z5.b[3]\n"
1811 "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
1812 "sdot z9.s, z7.b, z0.b[3]\n"
1813 "sdot z13.s, z7.b, z1.b[3]\n"
1814 "sdot z17.s, z7.b, z2.b[3]\n"
1815 "sdot z21.s, z7.b, z3.b[3]\n"
1816 "sdot z25.s, z7.b, z4.b[3]\n"
1817 "sdot z29.s, z7.b, z5.b[3]\n"
1818 "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
1819 "addvl x14, x14, #4\n"
1820 "sdot z10.s, z6.b, z0.b[3]\n"
1821 "sdot z14.s, z6.b, z1.b[3]\n"
1822 "sdot z18.s, z6.b, z2.b[3]\n"
1823 "sdot z22.s, z6.b, z3.b[3]\n"
1824 "sdot z26.s, z6.b, z4.b[3]\n"
1825 "sdot z30.s, z6.b, z5.b[3]\n"
1826 "sdot z11.s, z7.b, z0.b[3]\n"
1827 "sdot z15.s, z7.b, z1.b[3]\n"
1828 "sdot z19.s, z7.b, z2.b[3]\n"
1829 "sdot z23.s, z7.b, z3.b[3]\n"
1830 "sdot z27.s, z7.b, z4.b[3]\n"
1831 "sdot z31.s, z7.b, z5.b[3]\n"
1832 "71:" // Height 6: Multiply loop: multiply skip
1833 "prfm pldl1keep, [x10, #0x80]\n"
1834 "add x12, x12, #0x1\n"
1835 "prfm pldl1keep, [x28, #0x80]\n"
1836 "prfm pldl1keep, [x26, #0x80]\n"
1837 "prfm pldl1keep, [x24, #0x80]\n"
1838 "prfm pldl1keep, [x22, #0x80]\n"
1839 "prfm pldl1keep, [x20, #0x80]\n"
1840 "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
1841 "cmp x12, x19\n"
1842 "bne 66b\n"
1843 "prfm pstl1keep, [x13, #0x0]\n"
1844 "prfm pstl1keep, [x9, #0x0]\n"
1845 "prfm pstl1keep, [x27, #0x0]\n"
1846 "prfm pstl1keep, [x25, #0x0]\n"
1847 "prfm pstl1keep, [x23, #0x0]\n"
1848 "prfm pstl1keep, [x21, #0x0]\n"
1849 "st1w { z8.s }, p4, [x13]\n"
1850 "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
1851 "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
1852 "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
1853 "addvl x13, x13, #4\n"
1854 "st1w { z12.s }, p4, [x9]\n"
1855 "st1w { z13.s }, p3, [x9, #1, MUL VL]\n"
1856 "st1w { z14.s }, p2, [x9, #2, MUL VL]\n"
1857 "st1w { z15.s }, p1, [x9, #3, MUL VL]\n"
1858 "addvl x9, x9, #4\n"
1859 "st1w { z16.s }, p4, [x27]\n"
1860 "st1w { z17.s }, p3, [x27, #1, MUL VL]\n"
1861 "st1w { z18.s }, p2, [x27, #2, MUL VL]\n"
1862 "st1w { z19.s }, p1, [x27, #3, MUL VL]\n"
1863 "addvl x27, x27, #4\n"
1864 "st1w { z20.s }, p4, [x25]\n"
1865 "st1w { z21.s }, p3, [x25, #1, MUL VL]\n"
1866 "st1w { z22.s }, p2, [x25, #2, MUL VL]\n"
1867 "st1w { z23.s }, p1, [x25, #3, MUL VL]\n"
1868 "addvl x25, x25, #4\n"
1869 "st1w { z24.s }, p4, [x23]\n"
1870 "st1w { z25.s }, p3, [x23, #1, MUL VL]\n"
1871 "st1w { z26.s }, p2, [x23, #2, MUL VL]\n"
1872 "st1w { z27.s }, p1, [x23, #3, MUL VL]\n"
1873 "addvl x23, x23, #4\n"
1874 "st1w { z28.s }, p4, [x21]\n"
1875 "st1w { z29.s }, p3, [x21, #1, MUL VL]\n"
1876 "st1w { z30.s }, p2, [x21, #2, MUL VL]\n"
1877 "st1w { z31.s }, p1, [x21, #3, MUL VL]\n"
1878 "addvl x21, x21, #4\n"
1879 "72:" // Height 6: Writeback done
1880 "mov x19, #0x0\n"
1881 "incw x19, ALL, MUL #4\n"
1882 "subs x15, x15, x19\n"
1883 "bgt 63b\n"
1884 "subs %x[M], %x[M], #0x6\n"
1885 "beq 74f\n"
1886 "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
1887 "tbz %x[flags], #3, 73f\n"
1888 "add x20, x20, #0x6\n"
1889 "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
1890 "b 1b\n"
1891 "73:" // Update direct input
1892 "mov x19, #0x6\n"
1893 "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
1894 "b 1b\n"
1895 "74:" // Exit
1896
1897 : [M] "+r" (M), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr)
1898 : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
1899 : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
1900 );
1901 }
1902
1903 } // namespace arm_gemm
1904 #endif // __ARM_FEATURE_SVE
1905