1 /*
2 * Copyright (c) 2019-2020 Arm Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22 * IN THE SOFTWARE.
23 */
24 #ifdef __aarch64__
25
26 #include "arm_gemm.hpp"
27 #include "../../utils.hpp"
28
29 #include <cassert>
30
31 namespace arm_gemm {
32
a64_gemv_fp32_mla_32(const float * A_ptr,const float * B_ptr,float * output_ptr,size_t N,size_t K,const float * bias,Activation act,bool)33 void a64_gemv_fp32_mla_32 (
34 const float *A_ptr, const float *B_ptr, float *output_ptr,
35 size_t N, size_t K,
36 const float *bias, Activation act, bool
37 )
38 {
39 struct KernelArgs {
40 float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
41 float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
42 const float *B_ptr = {};
43 size_t output_offset = {};
44 unsigned int input_initial_col = {};
45 } ka;
46
47 unsigned long flags=0;
48 ka.B_ptr = B_ptr;
49 switch(act.type) {
50 default:
51 case Activation::Type::None:
52 break;
53 case Activation::Type::BoundedReLU:
54 ka.maxval = static_cast<float>(act.param1);
55 /* fall through */
56 case Activation::Type::ReLU:
57 ka.minval = 0;
58 flags |= 0x2;
59 break;
60 }
61 __asm__ __volatile__(
62 "add x22, %x[N], #0x3\n"
63 "mov x21, %x[bias]\n"
64 "lsr x22, x22, #0x2\n"
65 "1:" // Column loop
66 "cmp x22, #0x8\n"
67 "bge 85f\n"
68 "cmp x22, #0x6\n"
69 "bgt 73f\n"
70 "beq 61f\n"
71 "cmp x22, #0x4\n"
72 "bgt 49f\n"
73 "beq 37f\n"
74 "cmp x22, #0x2\n"
75 "bgt 25f\n"
76 "beq 13f\n"
77 "mov x20, %x[K]\n"
78 "mov x19, %x[A_ptr]\n"
79 "cbz x21, 2f\n"
80 "ldr q24, [x21, #0x0]\n"
81 "add x21, x21, #0x10\n"
82 "b 3f\n"
83 "2:" // Width 1: no bias
84 "movi v24.16b, #0x0\n"
85 "3:" // Width 1: setup done
86 "cmp x20, #0x4\n"
87 "blt 6f\n"
88 "cmp x20, #0x8\n"
89 "blt 5f\n"
90 "4:" // Width 1: Multiply loop: Main loop head
91 "ldr q0, [x19, #0x0]\n"
92 "ldr q1, [%x[B_ptr], #0x0]\n"
93 "fmla v24.4s, v1.4s, v0.s[0]\n"
94 "add %x[B_ptr], %x[B_ptr], #0x80\n"
95 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
96 "ldr q2, [%x[B_ptr], #0x0]\n"
97 "fmla v24.4s, v2.4s, v0.s[1]\n"
98 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
99 "add %x[B_ptr], %x[B_ptr], #0x80\n"
100 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
101 "ldr q3, [%x[B_ptr], #0x0]\n"
102 "fmla v24.4s, v3.4s, v0.s[2]\n"
103 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
104 "add %x[B_ptr], %x[B_ptr], #0x80\n"
105 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
106 "ldr q4, [%x[B_ptr], #0x0]\n"
107 "fmla v24.4s, v4.4s, v0.s[3]\n"
108 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
109 "add %x[B_ptr], %x[B_ptr], #0x80\n"
110 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
111 "add x19, x19, #0x10\n"
112 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
113 "sub x20, x20, #0x4\n"
114 "prfm pldl1keep, [x19, #0x80]\n"
115 "cmp x20, #0x8\n"
116 "bge 4b\n"
117 "5:" // Width 1: Multiply loop: Single iteration only
118 "sub x20, x20, #0x4\n"
119 "ldr q0, [x19, #0x0]\n"
120 "ldr q5, [%x[B_ptr], #0x0]\n"
121 "fmla v24.4s, v5.4s, v0.s[0]\n"
122 "add %x[B_ptr], %x[B_ptr], #0x80\n"
123 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
124 "ldr q6, [%x[B_ptr], #0x0]\n"
125 "fmla v24.4s, v6.4s, v0.s[1]\n"
126 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
127 "add %x[B_ptr], %x[B_ptr], #0x80\n"
128 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
129 "ldr q7, [%x[B_ptr], #0x0]\n"
130 "fmla v24.4s, v7.4s, v0.s[2]\n"
131 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
132 "add %x[B_ptr], %x[B_ptr], #0x80\n"
133 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
134 "ldr q8, [%x[B_ptr], #0x0]\n"
135 "fmla v24.4s, v8.4s, v0.s[3]\n"
136 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
137 "add %x[B_ptr], %x[B_ptr], #0x80\n"
138 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
139 "add x19, x19, #0x10\n"
140 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
141 "prfm pldl1keep, [x19, #0x80]\n"
142 "6:" // Width 1: Multiply loop: Main loop skip
143 "cbz x20, 8f\n"
144 "7:" // Width 1: Multiply loop: Odd block loop
145 "ldr s0, [x19], #0x4\n"
146 "ldr q9, [%x[B_ptr], #0x0]\n"
147 "fmla v24.4s, v9.4s, v0.s[0]\n"
148 "add %x[B_ptr], %x[B_ptr], #0x80\n"
149 "sub x20, x20, #0x1\n"
150 "cbnz x20, 7b\n"
151 "8:" // Width 1: Multiply loop: No odd multiplies
152 "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
153 "tbz %x[flags], #1, 9f\n"
154 "add x19, %x[args_ptr], %[offset_min]\n"
155 "ld1r { v17.4s }, [x19]\n"
156 "add x19, %x[args_ptr], %[offset_max]\n"
157 "ld1r { v16.4s }, [x19]\n"
158 "fmin v24.4s, v24.4s, v16.4s\n"
159 "fmax v24.4s, v24.4s, v17.4s\n"
160 "9:" // Width 1: No activation
161 "cmp %x[N], #0x4\n"
162 "blt 10f\n"
163 "str q24, [%x[output_ptr], #0x0]\n"
164 "add %x[output_ptr], %x[output_ptr], #0x10\n"
165 "b 12f\n"
166 "10:" // Width 1: Partial writeback
167 "tbz %x[N], #1, 11f\n"
168 "str d24, [%x[output_ptr]], #0x8\n"
169 "tbz %x[N], #0, 12f\n"
170 "st1 { v24.s }[2], [%x[output_ptr]]\n"
171 "b 12f\n"
172 "11:" // Width 1: Partial direct writeback: partial_1_0
173 "str s24, [%x[output_ptr], #0x0]\n"
174 "12:" // Width 1: Writeback done
175 "b 97f\n"
176 "13:" // Width 2
177 "mov x20, %x[K]\n"
178 "mov x19, %x[A_ptr]\n"
179 "cbz x21, 14f\n"
180 "ldr q24, [x21, #0x0]\n"
181 "ldr q25, [x21, #0x10]\n"
182 "add x21, x21, #0x20\n"
183 "b 15f\n"
184 "14:" // Width 2: no bias
185 "movi v24.16b, #0x0\n"
186 "movi v25.16b, #0x0\n"
187 "15:" // Width 2: setup done
188 "cmp x20, #0x4\n"
189 "blt 18f\n"
190 "cmp x20, #0x8\n"
191 "blt 17f\n"
192 "16:" // Width 2: Multiply loop: Main loop head
193 "ldr q0, [x19, #0x0]\n"
194 "ldr q1, [%x[B_ptr], #0x0]\n"
195 "fmla v24.4s, v1.4s, v0.s[0]\n"
196 "ldr q2, [%x[B_ptr], #0x10]\n"
197 "add %x[B_ptr], %x[B_ptr], #0x80\n"
198 "fmla v25.4s, v2.4s, v0.s[0]\n"
199 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
200 "ldr q3, [%x[B_ptr], #0x0]\n"
201 "fmla v24.4s, v3.4s, v0.s[1]\n"
202 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
203 "ldr q4, [%x[B_ptr], #0x10]\n"
204 "fmla v25.4s, v4.4s, v0.s[1]\n"
205 "add %x[B_ptr], %x[B_ptr], #0x80\n"
206 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
207 "ldr q5, [%x[B_ptr], #0x0]\n"
208 "fmla v24.4s, v5.4s, v0.s[2]\n"
209 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
210 "ldr q6, [%x[B_ptr], #0x10]\n"
211 "fmla v25.4s, v6.4s, v0.s[2]\n"
212 "add %x[B_ptr], %x[B_ptr], #0x80\n"
213 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
214 "ldr q7, [%x[B_ptr], #0x0]\n"
215 "fmla v24.4s, v7.4s, v0.s[3]\n"
216 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
217 "ldr q8, [%x[B_ptr], #0x10]\n"
218 "fmla v25.4s, v8.4s, v0.s[3]\n"
219 "add %x[B_ptr], %x[B_ptr], #0x80\n"
220 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
221 "add x19, x19, #0x10\n"
222 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
223 "sub x20, x20, #0x4\n"
224 "prfm pldl1keep, [x19, #0x80]\n"
225 "cmp x20, #0x8\n"
226 "bge 16b\n"
227 "17:" // Width 2: Multiply loop: Single iteration only
228 "sub x20, x20, #0x4\n"
229 "ldr q0, [x19, #0x0]\n"
230 "ldr q9, [%x[B_ptr], #0x0]\n"
231 "fmla v24.4s, v9.4s, v0.s[0]\n"
232 "ldr q10, [%x[B_ptr], #0x10]\n"
233 "add %x[B_ptr], %x[B_ptr], #0x80\n"
234 "fmla v25.4s, v10.4s, v0.s[0]\n"
235 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
236 "ldr q11, [%x[B_ptr], #0x0]\n"
237 "fmla v24.4s, v11.4s, v0.s[1]\n"
238 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
239 "ldr q12, [%x[B_ptr], #0x10]\n"
240 "fmla v25.4s, v12.4s, v0.s[1]\n"
241 "add %x[B_ptr], %x[B_ptr], #0x80\n"
242 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
243 "ldr q13, [%x[B_ptr], #0x0]\n"
244 "fmla v24.4s, v13.4s, v0.s[2]\n"
245 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
246 "ldr q14, [%x[B_ptr], #0x10]\n"
247 "fmla v25.4s, v14.4s, v0.s[2]\n"
248 "add %x[B_ptr], %x[B_ptr], #0x80\n"
249 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
250 "ldr q15, [%x[B_ptr], #0x0]\n"
251 "fmla v24.4s, v15.4s, v0.s[3]\n"
252 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
253 "ldr q16, [%x[B_ptr], #0x10]\n"
254 "fmla v25.4s, v16.4s, v0.s[3]\n"
255 "add %x[B_ptr], %x[B_ptr], #0x80\n"
256 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
257 "add x19, x19, #0x10\n"
258 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
259 "prfm pldl1keep, [x19, #0x80]\n"
260 "18:" // Width 2: Multiply loop: Main loop skip
261 "cbz x20, 20f\n"
262 "19:" // Width 2: Multiply loop: Odd block loop
263 "ldr s0, [x19], #0x4\n"
264 "ldr q17, [%x[B_ptr], #0x0]\n"
265 "fmla v24.4s, v17.4s, v0.s[0]\n"
266 "ldr q18, [%x[B_ptr], #0x10]\n"
267 "add %x[B_ptr], %x[B_ptr], #0x80\n"
268 "fmla v25.4s, v18.4s, v0.s[0]\n"
269 "sub x20, x20, #0x1\n"
270 "cbnz x20, 19b\n"
271 "20:" // Width 2: Multiply loop: No odd multiplies
272 "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
273 "tbz %x[flags], #1, 21f\n"
274 "add x19, %x[args_ptr], %[offset_min]\n"
275 "ld1r { v17.4s }, [x19]\n"
276 "add x19, %x[args_ptr], %[offset_max]\n"
277 "ld1r { v16.4s }, [x19]\n"
278 "fmin v24.4s, v24.4s, v16.4s\n"
279 "fmin v25.4s, v25.4s, v16.4s\n"
280 "fmax v24.4s, v24.4s, v17.4s\n"
281 "fmax v25.4s, v25.4s, v17.4s\n"
282 "21:" // Width 2: No activation
283 "str q24, [%x[output_ptr], #0x0]\n"
284 "cmp %x[N], #0x8\n"
285 "add %x[output_ptr], %x[output_ptr], #0x10\n"
286 "blt 22f\n"
287 "str q25, [%x[output_ptr], #0x0]\n"
288 "add %x[output_ptr], %x[output_ptr], #0x10\n"
289 "b 24f\n"
290 "22:" // Width 2: Partial writeback
291 "tbz %x[N], #1, 23f\n"
292 "str d25, [%x[output_ptr]], #0x8\n"
293 "tbz %x[N], #0, 24f\n"
294 "st1 { v25.s }[2], [%x[output_ptr]]\n"
295 "b 24f\n"
296 "23:" // Width 2: Partial direct writeback: partial_1_4
297 "tbz %x[N], #0, 24f\n"
298 "str s25, [%x[output_ptr], #0x0]\n"
299 "24:" // Width 2: Writeback done
300 "b 97f\n"
301 "25:" // Width 3
302 "mov x20, %x[K]\n"
303 "mov x19, %x[A_ptr]\n"
304 "cbz x21, 26f\n"
305 "ldr q24, [x21, #0x0]\n"
306 "ldr q25, [x21, #0x10]\n"
307 "ldr q26, [x21, #0x20]\n"
308 "add x21, x21, #0x30\n"
309 "b 27f\n"
310 "26:" // Width 3: no bias
311 "movi v24.16b, #0x0\n"
312 "movi v25.16b, #0x0\n"
313 "movi v26.16b, #0x0\n"
314 "27:" // Width 3: setup done
315 "cmp x20, #0x4\n"
316 "blt 30f\n"
317 "cmp x20, #0x8\n"
318 "blt 29f\n"
319 "28:" // Width 3: Multiply loop: Main loop head
320 "ldr q0, [x19, #0x0]\n"
321 "ldr q1, [%x[B_ptr], #0x0]\n"
322 "fmla v24.4s, v1.4s, v0.s[0]\n"
323 "ldr q2, [%x[B_ptr], #0x10]\n"
324 "ldr q3, [%x[B_ptr], #0x20]\n"
325 "fmla v25.4s, v2.4s, v0.s[0]\n"
326 "add %x[B_ptr], %x[B_ptr], #0x80\n"
327 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
328 "fmla v26.4s, v3.4s, v0.s[0]\n"
329 "ldr q4, [%x[B_ptr], #0x0]\n"
330 "fmla v24.4s, v4.4s, v0.s[1]\n"
331 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
332 "ldr q5, [%x[B_ptr], #0x10]\n"
333 "fmla v25.4s, v5.4s, v0.s[1]\n"
334 "ldr q6, [%x[B_ptr], #0x20]\n"
335 "add %x[B_ptr], %x[B_ptr], #0x80\n"
336 "fmla v26.4s, v6.4s, v0.s[1]\n"
337 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
338 "ldr q7, [%x[B_ptr], #0x0]\n"
339 "fmla v24.4s, v7.4s, v0.s[2]\n"
340 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
341 "ldr q8, [%x[B_ptr], #0x10]\n"
342 "fmla v25.4s, v8.4s, v0.s[2]\n"
343 "ldr q9, [%x[B_ptr], #0x20]\n"
344 "add %x[B_ptr], %x[B_ptr], #0x80\n"
345 "fmla v26.4s, v9.4s, v0.s[2]\n"
346 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
347 "ldr q10, [%x[B_ptr], #0x0]\n"
348 "fmla v24.4s, v10.4s, v0.s[3]\n"
349 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
350 "ldr q11, [%x[B_ptr], #0x10]\n"
351 "fmla v25.4s, v11.4s, v0.s[3]\n"
352 "ldr q12, [%x[B_ptr], #0x20]\n"
353 "add %x[B_ptr], %x[B_ptr], #0x80\n"
354 "fmla v26.4s, v12.4s, v0.s[3]\n"
355 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
356 "add x19, x19, #0x10\n"
357 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
358 "sub x20, x20, #0x4\n"
359 "prfm pldl1keep, [x19, #0x80]\n"
360 "cmp x20, #0x8\n"
361 "bge 28b\n"
362 "29:" // Width 3: Multiply loop: Single iteration only
363 "sub x20, x20, #0x4\n"
364 "ldr q0, [x19, #0x0]\n"
365 "ldr q13, [%x[B_ptr], #0x0]\n"
366 "fmla v24.4s, v13.4s, v0.s[0]\n"
367 "ldr q14, [%x[B_ptr], #0x10]\n"
368 "ldr q15, [%x[B_ptr], #0x20]\n"
369 "fmla v25.4s, v14.4s, v0.s[0]\n"
370 "add %x[B_ptr], %x[B_ptr], #0x80\n"
371 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
372 "fmla v26.4s, v15.4s, v0.s[0]\n"
373 "ldr q16, [%x[B_ptr], #0x0]\n"
374 "fmla v24.4s, v16.4s, v0.s[1]\n"
375 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
376 "ldr q17, [%x[B_ptr], #0x10]\n"
377 "fmla v25.4s, v17.4s, v0.s[1]\n"
378 "ldr q18, [%x[B_ptr], #0x20]\n"
379 "add %x[B_ptr], %x[B_ptr], #0x80\n"
380 "fmla v26.4s, v18.4s, v0.s[1]\n"
381 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
382 "ldr q19, [%x[B_ptr], #0x0]\n"
383 "fmla v24.4s, v19.4s, v0.s[2]\n"
384 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
385 "ldr q20, [%x[B_ptr], #0x10]\n"
386 "fmla v25.4s, v20.4s, v0.s[2]\n"
387 "ldr q21, [%x[B_ptr], #0x20]\n"
388 "add %x[B_ptr], %x[B_ptr], #0x80\n"
389 "fmla v26.4s, v21.4s, v0.s[2]\n"
390 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
391 "ldr q22, [%x[B_ptr], #0x0]\n"
392 "fmla v24.4s, v22.4s, v0.s[3]\n"
393 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
394 "ldr q23, [%x[B_ptr], #0x10]\n"
395 "fmla v25.4s, v23.4s, v0.s[3]\n"
396 "ldr q1, [%x[B_ptr], #0x20]\n"
397 "add %x[B_ptr], %x[B_ptr], #0x80\n"
398 "fmla v26.4s, v1.4s, v0.s[3]\n"
399 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
400 "add x19, x19, #0x10\n"
401 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
402 "prfm pldl1keep, [x19, #0x80]\n"
403 "30:" // Width 3: Multiply loop: Main loop skip
404 "cbz x20, 32f\n"
405 "31:" // Width 3: Multiply loop: Odd block loop
406 "ldr s0, [x19], #0x4\n"
407 "ldr q2, [%x[B_ptr], #0x0]\n"
408 "fmla v24.4s, v2.4s, v0.s[0]\n"
409 "ldr q3, [%x[B_ptr], #0x10]\n"
410 "ldr q4, [%x[B_ptr], #0x20]\n"
411 "fmla v25.4s, v3.4s, v0.s[0]\n"
412 "add %x[B_ptr], %x[B_ptr], #0x80\n"
413 "fmla v26.4s, v4.4s, v0.s[0]\n"
414 "sub x20, x20, #0x1\n"
415 "cbnz x20, 31b\n"
416 "32:" // Width 3: Multiply loop: No odd multiplies
417 "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
418 "tbz %x[flags], #1, 33f\n"
419 "add x19, %x[args_ptr], %[offset_min]\n"
420 "ld1r { v17.4s }, [x19]\n"
421 "add x19, %x[args_ptr], %[offset_max]\n"
422 "ld1r { v16.4s }, [x19]\n"
423 "fmin v24.4s, v24.4s, v16.4s\n"
424 "fmin v25.4s, v25.4s, v16.4s\n"
425 "fmin v26.4s, v26.4s, v16.4s\n"
426 "fmax v24.4s, v24.4s, v17.4s\n"
427 "fmax v25.4s, v25.4s, v17.4s\n"
428 "fmax v26.4s, v26.4s, v17.4s\n"
429 "33:" // Width 3: No activation
430 "str q24, [%x[output_ptr], #0x0]\n"
431 "str q25, [%x[output_ptr], #0x10]\n"
432 "cmp %x[N], #0xc\n"
433 "add %x[output_ptr], %x[output_ptr], #0x20\n"
434 "blt 34f\n"
435 "str q26, [%x[output_ptr], #0x0]\n"
436 "add %x[output_ptr], %x[output_ptr], #0x10\n"
437 "b 36f\n"
438 "34:" // Width 3: Partial writeback
439 "tbz %x[N], #1, 35f\n"
440 "str d26, [%x[output_ptr]], #0x8\n"
441 "tbz %x[N], #0, 36f\n"
442 "st1 { v26.s }[2], [%x[output_ptr]]\n"
443 "b 36f\n"
444 "35:" // Width 3: Partial direct writeback: partial_1_8
445 "tbz %x[N], #0, 36f\n"
446 "str s26, [%x[output_ptr], #0x0]\n"
447 "36:" // Width 3: Writeback done
448 "b 97f\n"
449 "37:" // Width 4
450 "mov x20, %x[K]\n"
451 "mov x19, %x[A_ptr]\n"
452 "cbz x21, 38f\n"
453 "ldr q24, [x21, #0x0]\n"
454 "ldr q25, [x21, #0x10]\n"
455 "ldr q26, [x21, #0x20]\n"
456 "ldr q27, [x21, #0x30]\n"
457 "add x21, x21, #0x40\n"
458 "b 39f\n"
459 "38:" // Width 4: no bias
460 "movi v24.16b, #0x0\n"
461 "movi v25.16b, #0x0\n"
462 "movi v26.16b, #0x0\n"
463 "movi v27.16b, #0x0\n"
464 "39:" // Width 4: setup done
465 "cmp x20, #0x4\n"
466 "blt 42f\n"
467 "cmp x20, #0x8\n"
468 "blt 41f\n"
469 "40:" // Width 4: Multiply loop: Main loop head
470 "ldr q0, [x19, #0x0]\n"
471 "ldr q1, [%x[B_ptr], #0x0]\n"
472 "fmla v24.4s, v1.4s, v0.s[0]\n"
473 "ldr q2, [%x[B_ptr], #0x10]\n"
474 "ldr q3, [%x[B_ptr], #0x20]\n"
475 "fmla v25.4s, v2.4s, v0.s[0]\n"
476 "ldr q4, [%x[B_ptr], #0x30]\n"
477 "fmla v26.4s, v3.4s, v0.s[0]\n"
478 "add %x[B_ptr], %x[B_ptr], #0x80\n"
479 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
480 "fmla v27.4s, v4.4s, v0.s[0]\n"
481 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
482 "ldr q5, [%x[B_ptr], #0x0]\n"
483 "fmla v24.4s, v5.4s, v0.s[1]\n"
484 "ldr q6, [%x[B_ptr], #0x10]\n"
485 "ldr q7, [%x[B_ptr], #0x20]\n"
486 "fmla v25.4s, v6.4s, v0.s[1]\n"
487 "ldr q8, [%x[B_ptr], #0x30]\n"
488 "fmla v26.4s, v7.4s, v0.s[1]\n"
489 "add %x[B_ptr], %x[B_ptr], #0x80\n"
490 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
491 "fmla v27.4s, v8.4s, v0.s[1]\n"
492 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
493 "ldr q9, [%x[B_ptr], #0x0]\n"
494 "fmla v24.4s, v9.4s, v0.s[2]\n"
495 "ldr q10, [%x[B_ptr], #0x10]\n"
496 "ldr q11, [%x[B_ptr], #0x20]\n"
497 "fmla v25.4s, v10.4s, v0.s[2]\n"
498 "ldr q12, [%x[B_ptr], #0x30]\n"
499 "fmla v26.4s, v11.4s, v0.s[2]\n"
500 "add %x[B_ptr], %x[B_ptr], #0x80\n"
501 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
502 "fmla v27.4s, v12.4s, v0.s[2]\n"
503 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
504 "ldr q13, [%x[B_ptr], #0x0]\n"
505 "fmla v24.4s, v13.4s, v0.s[3]\n"
506 "ldr q14, [%x[B_ptr], #0x10]\n"
507 "ldr q15, [%x[B_ptr], #0x20]\n"
508 "fmla v25.4s, v14.4s, v0.s[3]\n"
509 "ldr q16, [%x[B_ptr], #0x30]\n"
510 "fmla v26.4s, v15.4s, v0.s[3]\n"
511 "add %x[B_ptr], %x[B_ptr], #0x80\n"
512 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
513 "fmla v27.4s, v16.4s, v0.s[3]\n"
514 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
515 "add x19, x19, #0x10\n"
516 "prfm pldl1keep, [x19, #0x80]\n"
517 "sub x20, x20, #0x4\n"
518 "cmp x20, #0x8\n"
519 "bge 40b\n"
520 "41:" // Width 4: Multiply loop: Single iteration only
521 "sub x20, x20, #0x4\n"
522 "ldr q0, [x19, #0x0]\n"
523 "ldr q17, [%x[B_ptr], #0x0]\n"
524 "fmla v24.4s, v17.4s, v0.s[0]\n"
525 "ldr q18, [%x[B_ptr], #0x10]\n"
526 "ldr q19, [%x[B_ptr], #0x20]\n"
527 "fmla v25.4s, v18.4s, v0.s[0]\n"
528 "ldr q20, [%x[B_ptr], #0x30]\n"
529 "fmla v26.4s, v19.4s, v0.s[0]\n"
530 "add %x[B_ptr], %x[B_ptr], #0x80\n"
531 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
532 "fmla v27.4s, v20.4s, v0.s[0]\n"
533 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
534 "ldr q21, [%x[B_ptr], #0x0]\n"
535 "fmla v24.4s, v21.4s, v0.s[1]\n"
536 "ldr q22, [%x[B_ptr], #0x10]\n"
537 "ldr q23, [%x[B_ptr], #0x20]\n"
538 "fmla v25.4s, v22.4s, v0.s[1]\n"
539 "ldr q1, [%x[B_ptr], #0x30]\n"
540 "fmla v26.4s, v23.4s, v0.s[1]\n"
541 "add %x[B_ptr], %x[B_ptr], #0x80\n"
542 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
543 "fmla v27.4s, v1.4s, v0.s[1]\n"
544 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
545 "ldr q2, [%x[B_ptr], #0x0]\n"
546 "fmla v24.4s, v2.4s, v0.s[2]\n"
547 "ldr q3, [%x[B_ptr], #0x10]\n"
548 "ldr q4, [%x[B_ptr], #0x20]\n"
549 "fmla v25.4s, v3.4s, v0.s[2]\n"
550 "ldr q5, [%x[B_ptr], #0x30]\n"
551 "fmla v26.4s, v4.4s, v0.s[2]\n"
552 "add %x[B_ptr], %x[B_ptr], #0x80\n"
553 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
554 "fmla v27.4s, v5.4s, v0.s[2]\n"
555 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
556 "ldr q6, [%x[B_ptr], #0x0]\n"
557 "fmla v24.4s, v6.4s, v0.s[3]\n"
558 "ldr q7, [%x[B_ptr], #0x10]\n"
559 "ldr q8, [%x[B_ptr], #0x20]\n"
560 "fmla v25.4s, v7.4s, v0.s[3]\n"
561 "ldr q9, [%x[B_ptr], #0x30]\n"
562 "fmla v26.4s, v8.4s, v0.s[3]\n"
563 "add %x[B_ptr], %x[B_ptr], #0x80\n"
564 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
565 "fmla v27.4s, v9.4s, v0.s[3]\n"
566 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
567 "add x19, x19, #0x10\n"
568 "prfm pldl1keep, [x19, #0x80]\n"
569 "42:" // Width 4: Multiply loop: Main loop skip
570 "cbz x20, 44f\n"
571 "43:" // Width 4: Multiply loop: Odd block loop
572 "ldr s0, [x19], #0x4\n"
573 "ldr q10, [%x[B_ptr], #0x0]\n"
574 "fmla v24.4s, v10.4s, v0.s[0]\n"
575 "ldr q11, [%x[B_ptr], #0x10]\n"
576 "ldr q12, [%x[B_ptr], #0x20]\n"
577 "fmla v25.4s, v11.4s, v0.s[0]\n"
578 "ldr q13, [%x[B_ptr], #0x30]\n"
579 "fmla v26.4s, v12.4s, v0.s[0]\n"
580 "add %x[B_ptr], %x[B_ptr], #0x80\n"
581 "sub x20, x20, #0x1\n"
582 "fmla v27.4s, v13.4s, v0.s[0]\n"
583 "cbnz x20, 43b\n"
584 "44:" // Width 4: Multiply loop: No odd multiplies
585 "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
586 "tbz %x[flags], #1, 45f\n"
587 "add x19, %x[args_ptr], %[offset_min]\n"
588 "ld1r { v17.4s }, [x19]\n"
589 "add x19, %x[args_ptr], %[offset_max]\n"
590 "ld1r { v16.4s }, [x19]\n"
591 "fmin v24.4s, v24.4s, v16.4s\n"
592 "fmin v25.4s, v25.4s, v16.4s\n"
593 "fmin v26.4s, v26.4s, v16.4s\n"
594 "fmin v27.4s, v27.4s, v16.4s\n"
595 "fmax v24.4s, v24.4s, v17.4s\n"
596 "fmax v25.4s, v25.4s, v17.4s\n"
597 "fmax v26.4s, v26.4s, v17.4s\n"
598 "fmax v27.4s, v27.4s, v17.4s\n"
599 "45:" // Width 4: No activation
600 "str q24, [%x[output_ptr], #0x0]\n"
601 "str q25, [%x[output_ptr], #0x10]\n"
602 "str q26, [%x[output_ptr], #0x20]\n"
603 "cmp %x[N], #0x10\n"
604 "add %x[output_ptr], %x[output_ptr], #0x30\n"
605 "blt 46f\n"
606 "str q27, [%x[output_ptr], #0x0]\n"
607 "add %x[output_ptr], %x[output_ptr], #0x10\n"
608 "b 48f\n"
609 "46:" // Width 4: Partial writeback
610 "tbz %x[N], #1, 47f\n"
611 "str d27, [%x[output_ptr]], #0x8\n"
612 "tbz %x[N], #0, 48f\n"
613 "st1 { v27.s }[2], [%x[output_ptr]]\n"
614 "b 48f\n"
615 "47:" // Width 4: Partial direct writeback: partial_1_12
616 "tbz %x[N], #0, 48f\n"
617 "str s27, [%x[output_ptr], #0x0]\n"
618 "48:" // Width 4: Writeback done
619 "b 97f\n"
620 "49:" // Width 5
621 "mov x20, %x[K]\n"
622 "mov x19, %x[A_ptr]\n"
623 "cbz x21, 50f\n"
624 "ldr q24, [x21, #0x0]\n"
625 "ldr q25, [x21, #0x10]\n"
626 "ldr q26, [x21, #0x20]\n"
627 "ldr q27, [x21, #0x30]\n"
628 "ldr q28, [x21, #0x40]\n"
629 "add x21, x21, #0x50\n"
630 "b 51f\n"
631 "50:" // Width 5: no bias
632 "movi v24.16b, #0x0\n"
633 "movi v25.16b, #0x0\n"
634 "movi v26.16b, #0x0\n"
635 "movi v27.16b, #0x0\n"
636 "movi v28.16b, #0x0\n"
637 "51:" // Width 5: setup done
638 "cmp x20, #0x4\n"
639 "blt 54f\n"
640 "cmp x20, #0x8\n"
641 "blt 53f\n"
642 "52:" // Width 5: Multiply loop: Main loop head
643 "ldr q0, [x19, #0x0]\n"
644 "ldr q1, [%x[B_ptr], #0x0]\n"
645 "fmla v24.4s, v1.4s, v0.s[0]\n"
646 "ldr q2, [%x[B_ptr], #0x10]\n"
647 "ldr q3, [%x[B_ptr], #0x20]\n"
648 "fmla v25.4s, v2.4s, v0.s[0]\n"
649 "ldr q4, [%x[B_ptr], #0x30]\n"
650 "fmla v26.4s, v3.4s, v0.s[0]\n"
651 "ldr q5, [%x[B_ptr], #0x40]\n"
652 "add %x[B_ptr], %x[B_ptr], #0x80\n"
653 "fmla v27.4s, v4.4s, v0.s[0]\n"
654 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
655 "ldr q6, [%x[B_ptr], #0x0]\n"
656 "fmla v28.4s, v5.4s, v0.s[0]\n"
657 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
658 "ldr q7, [%x[B_ptr], #0x10]\n"
659 "fmla v24.4s, v6.4s, v0.s[1]\n"
660 "ldr q8, [%x[B_ptr], #0x20]\n"
661 "ldr q9, [%x[B_ptr], #0x30]\n"
662 "fmla v25.4s, v7.4s, v0.s[1]\n"
663 "ldr q10, [%x[B_ptr], #0x40]\n"
664 "fmla v26.4s, v8.4s, v0.s[1]\n"
665 "add %x[B_ptr], %x[B_ptr], #0x80\n"
666 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
667 "fmla v27.4s, v9.4s, v0.s[1]\n"
668 "ldr q11, [%x[B_ptr], #0x0]\n"
669 "fmla v28.4s, v10.4s, v0.s[1]\n"
670 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
671 "ldr q12, [%x[B_ptr], #0x10]\n"
672 "fmla v24.4s, v11.4s, v0.s[2]\n"
673 "ldr q13, [%x[B_ptr], #0x20]\n"
674 "ldr q14, [%x[B_ptr], #0x30]\n"
675 "fmla v25.4s, v12.4s, v0.s[2]\n"
676 "ldr q15, [%x[B_ptr], #0x40]\n"
677 "fmla v26.4s, v13.4s, v0.s[2]\n"
678 "add %x[B_ptr], %x[B_ptr], #0x80\n"
679 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
680 "fmla v27.4s, v14.4s, v0.s[2]\n"
681 "ldr q16, [%x[B_ptr], #0x0]\n"
682 "fmla v28.4s, v15.4s, v0.s[2]\n"
683 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
684 "ldr q17, [%x[B_ptr], #0x10]\n"
685 "fmla v24.4s, v16.4s, v0.s[3]\n"
686 "ldr q18, [%x[B_ptr], #0x20]\n"
687 "ldr q19, [%x[B_ptr], #0x30]\n"
688 "fmla v25.4s, v17.4s, v0.s[3]\n"
689 "ldr q20, [%x[B_ptr], #0x40]\n"
690 "fmla v26.4s, v18.4s, v0.s[3]\n"
691 "add %x[B_ptr], %x[B_ptr], #0x80\n"
692 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
693 "fmla v27.4s, v19.4s, v0.s[3]\n"
694 "add x19, x19, #0x10\n"
695 "fmla v28.4s, v20.4s, v0.s[3]\n"
696 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
697 "sub x20, x20, #0x4\n"
698 "prfm pldl1keep, [x19, #0x80]\n"
699 "cmp x20, #0x8\n"
700 "bge 52b\n"
701 "53:" // Width 5: Multiply loop: Single iteration only
702 "sub x20, x20, #0x4\n"
703 "ldr q0, [x19, #0x0]\n"
704 "ldr q21, [%x[B_ptr], #0x0]\n"
705 "fmla v24.4s, v21.4s, v0.s[0]\n"
706 "ldr q22, [%x[B_ptr], #0x10]\n"
707 "ldr q23, [%x[B_ptr], #0x20]\n"
708 "fmla v25.4s, v22.4s, v0.s[0]\n"
709 "ldr q1, [%x[B_ptr], #0x30]\n"
710 "fmla v26.4s, v23.4s, v0.s[0]\n"
711 "ldr q2, [%x[B_ptr], #0x40]\n"
712 "add %x[B_ptr], %x[B_ptr], #0x80\n"
713 "fmla v27.4s, v1.4s, v0.s[0]\n"
714 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
715 "ldr q3, [%x[B_ptr], #0x0]\n"
716 "fmla v28.4s, v2.4s, v0.s[0]\n"
717 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
718 "ldr q4, [%x[B_ptr], #0x10]\n"
719 "fmla v24.4s, v3.4s, v0.s[1]\n"
720 "ldr q5, [%x[B_ptr], #0x20]\n"
721 "ldr q6, [%x[B_ptr], #0x30]\n"
722 "fmla v25.4s, v4.4s, v0.s[1]\n"
723 "ldr q7, [%x[B_ptr], #0x40]\n"
724 "fmla v26.4s, v5.4s, v0.s[1]\n"
725 "add %x[B_ptr], %x[B_ptr], #0x80\n"
726 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
727 "fmla v27.4s, v6.4s, v0.s[1]\n"
728 "ldr q8, [%x[B_ptr], #0x0]\n"
729 "fmla v28.4s, v7.4s, v0.s[1]\n"
730 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
731 "ldr q9, [%x[B_ptr], #0x10]\n"
732 "fmla v24.4s, v8.4s, v0.s[2]\n"
733 "ldr q10, [%x[B_ptr], #0x20]\n"
734 "ldr q11, [%x[B_ptr], #0x30]\n"
735 "fmla v25.4s, v9.4s, v0.s[2]\n"
736 "ldr q12, [%x[B_ptr], #0x40]\n"
737 "fmla v26.4s, v10.4s, v0.s[2]\n"
738 "add %x[B_ptr], %x[B_ptr], #0x80\n"
739 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
740 "fmla v27.4s, v11.4s, v0.s[2]\n"
741 "ldr q13, [%x[B_ptr], #0x0]\n"
742 "fmla v28.4s, v12.4s, v0.s[2]\n"
743 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
744 "ldr q14, [%x[B_ptr], #0x10]\n"
745 "fmla v24.4s, v13.4s, v0.s[3]\n"
746 "ldr q15, [%x[B_ptr], #0x20]\n"
747 "ldr q16, [%x[B_ptr], #0x30]\n"
748 "fmla v25.4s, v14.4s, v0.s[3]\n"
749 "ldr q17, [%x[B_ptr], #0x40]\n"
750 "fmla v26.4s, v15.4s, v0.s[3]\n"
751 "add %x[B_ptr], %x[B_ptr], #0x80\n"
752 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
753 "fmla v27.4s, v16.4s, v0.s[3]\n"
754 "add x19, x19, #0x10\n"
755 "fmla v28.4s, v17.4s, v0.s[3]\n"
756 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
757 "prfm pldl1keep, [x19, #0x80]\n"
758 "54:" // Width 5: Multiply loop: Main loop skip
759 "cbz x20, 56f\n"
760 "55:" // Width 5: Multiply loop: Odd block loop
761 "ldr s0, [x19], #0x4\n"
762 "ldr q18, [%x[B_ptr], #0x0]\n"
763 "fmla v24.4s, v18.4s, v0.s[0]\n"
764 "ldr q19, [%x[B_ptr], #0x10]\n"
765 "ldr q20, [%x[B_ptr], #0x20]\n"
766 "fmla v25.4s, v19.4s, v0.s[0]\n"
767 "ldr q21, [%x[B_ptr], #0x30]\n"
768 "fmla v26.4s, v20.4s, v0.s[0]\n"
769 "ldr q22, [%x[B_ptr], #0x40]\n"
770 "add %x[B_ptr], %x[B_ptr], #0x80\n"
771 "fmla v27.4s, v21.4s, v0.s[0]\n"
772 "sub x20, x20, #0x1\n"
773 "fmla v28.4s, v22.4s, v0.s[0]\n"
774 "cbnz x20, 55b\n"
775 "56:" // Width 5: Multiply loop: No odd multiplies
776 "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
777 "tbz %x[flags], #1, 57f\n"
778 "add x19, %x[args_ptr], %[offset_min]\n"
779 "ld1r { v17.4s }, [x19]\n"
780 "add x19, %x[args_ptr], %[offset_max]\n"
781 "ld1r { v16.4s }, [x19]\n"
782 "fmin v24.4s, v24.4s, v16.4s\n"
783 "fmin v25.4s, v25.4s, v16.4s\n"
784 "fmin v26.4s, v26.4s, v16.4s\n"
785 "fmin v27.4s, v27.4s, v16.4s\n"
786 "fmax v24.4s, v24.4s, v17.4s\n"
787 "fmax v25.4s, v25.4s, v17.4s\n"
788 "fmax v26.4s, v26.4s, v17.4s\n"
789 "fmax v27.4s, v27.4s, v17.4s\n"
790 "fmin v28.4s, v28.4s, v16.4s\n"
791 "fmax v28.4s, v28.4s, v17.4s\n"
792 "57:" // Width 5: No activation
793 "str q24, [%x[output_ptr], #0x0]\n"
794 "str q25, [%x[output_ptr], #0x10]\n"
795 "str q26, [%x[output_ptr], #0x20]\n"
796 "str q27, [%x[output_ptr], #0x30]\n"
797 "cmp %x[N], #0x14\n"
798 "add %x[output_ptr], %x[output_ptr], #0x40\n"
799 "blt 58f\n"
800 "str q28, [%x[output_ptr], #0x0]\n"
801 "add %x[output_ptr], %x[output_ptr], #0x10\n"
802 "b 60f\n"
803 "58:" // Width 5: Partial writeback
804 "tbz %x[N], #1, 59f\n"
805 "str d28, [%x[output_ptr]], #0x8\n"
806 "tbz %x[N], #0, 60f\n"
807 "st1 { v28.s }[2], [%x[output_ptr]]\n"
808 "b 60f\n"
809 "59:" // Width 5: Partial direct writeback: partial_1_16
810 "tbz %x[N], #0, 60f\n"
811 "str s28, [%x[output_ptr], #0x0]\n"
812 "60:" // Width 5: Writeback done
813 "b 97f\n"
814 "61:" // Width 6
815 "mov x20, %x[K]\n"
816 "mov x19, %x[A_ptr]\n"
817 "cbz x21, 62f\n"
818 "ldr q24, [x21, #0x0]\n"
819 "ldr q25, [x21, #0x10]\n"
820 "ldr q26, [x21, #0x20]\n"
821 "ldr q27, [x21, #0x30]\n"
822 "ldr q28, [x21, #0x40]\n"
823 "ldr q29, [x21, #0x50]\n"
824 "add x21, x21, #0x60\n"
825 "b 63f\n"
826 "62:" // Width 6: no bias
827 "movi v24.16b, #0x0\n"
828 "movi v25.16b, #0x0\n"
829 "movi v26.16b, #0x0\n"
830 "movi v27.16b, #0x0\n"
831 "movi v28.16b, #0x0\n"
832 "movi v29.16b, #0x0\n"
833 "63:" // Width 6: setup done
834 "cmp x20, #0x4\n"
835 "blt 66f\n"
836 "cmp x20, #0x8\n"
837 "blt 65f\n"
838 "64:" // Width 6: Multiply loop: Main loop head
839 "ldr q0, [x19, #0x0]\n"
840 "ldr q1, [%x[B_ptr], #0x0]\n"
841 "fmla v24.4s, v1.4s, v0.s[0]\n"
842 "ldr q2, [%x[B_ptr], #0x10]\n"
843 "ldr q3, [%x[B_ptr], #0x20]\n"
844 "fmla v25.4s, v2.4s, v0.s[0]\n"
845 "ldr q4, [%x[B_ptr], #0x30]\n"
846 "fmla v26.4s, v3.4s, v0.s[0]\n"
847 "ldr q5, [%x[B_ptr], #0x40]\n"
848 "ldr q6, [%x[B_ptr], #0x50]\n"
849 "fmla v27.4s, v4.4s, v0.s[0]\n"
850 "add %x[B_ptr], %x[B_ptr], #0x80\n"
851 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
852 "fmla v28.4s, v5.4s, v0.s[0]\n"
853 "ldr q7, [%x[B_ptr], #0x0]\n"
854 "fmla v29.4s, v6.4s, v0.s[0]\n"
855 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
856 "ldr q8, [%x[B_ptr], #0x10]\n"
857 "fmla v24.4s, v7.4s, v0.s[1]\n"
858 "ldr q9, [%x[B_ptr], #0x20]\n"
859 "ldr q10, [%x[B_ptr], #0x30]\n"
860 "fmla v25.4s, v8.4s, v0.s[1]\n"
861 "ldr q11, [%x[B_ptr], #0x40]\n"
862 "fmla v26.4s, v9.4s, v0.s[1]\n"
863 "ldr q12, [%x[B_ptr], #0x50]\n"
864 "fmla v27.4s, v10.4s, v0.s[1]\n"
865 "add %x[B_ptr], %x[B_ptr], #0x80\n"
866 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
867 "fmla v28.4s, v11.4s, v0.s[1]\n"
868 "ldr q13, [%x[B_ptr], #0x0]\n"
869 "fmla v29.4s, v12.4s, v0.s[1]\n"
870 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
871 "ldr q14, [%x[B_ptr], #0x10]\n"
872 "fmla v24.4s, v13.4s, v0.s[2]\n"
873 "ldr q15, [%x[B_ptr], #0x20]\n"
874 "ldr q16, [%x[B_ptr], #0x30]\n"
875 "fmla v25.4s, v14.4s, v0.s[2]\n"
876 "ldr q17, [%x[B_ptr], #0x40]\n"
877 "ldr q18, [%x[B_ptr], #0x50]\n"
878 "fmla v26.4s, v15.4s, v0.s[2]\n"
879 "add %x[B_ptr], %x[B_ptr], #0x80\n"
880 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
881 "fmla v27.4s, v16.4s, v0.s[2]\n"
882 "ldr q19, [%x[B_ptr], #0x0]\n"
883 "fmla v28.4s, v17.4s, v0.s[2]\n"
884 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
885 "ldr q20, [%x[B_ptr], #0x10]\n"
886 "fmla v29.4s, v18.4s, v0.s[2]\n"
887 "ldr q21, [%x[B_ptr], #0x20]\n"
888 "ldr q22, [%x[B_ptr], #0x30]\n"
889 "fmla v24.4s, v19.4s, v0.s[3]\n"
890 "ldr q23, [%x[B_ptr], #0x40]\n"
891 "ldr q1, [%x[B_ptr], #0x50]\n"
892 "fmla v25.4s, v20.4s, v0.s[3]\n"
893 "add %x[B_ptr], %x[B_ptr], #0x80\n"
894 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
895 "fmla v26.4s, v21.4s, v0.s[3]\n"
896 "add x19, x19, #0x10\n"
897 "fmla v27.4s, v22.4s, v0.s[3]\n"
898 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
899 "sub x20, x20, #0x4\n"
900 "fmla v28.4s, v23.4s, v0.s[3]\n"
901 "prfm pldl1keep, [x19, #0x80]\n"
902 "cmp x20, #0x8\n"
903 "fmla v29.4s, v1.4s, v0.s[3]\n"
904 "bge 64b\n"
905 "65:" // Width 6: Multiply loop: Single iteration only
906 "sub x20, x20, #0x4\n"
907 "ldr q0, [x19, #0x0]\n"
908 "ldr q2, [%x[B_ptr], #0x0]\n"
909 "fmla v24.4s, v2.4s, v0.s[0]\n"
910 "ldr q3, [%x[B_ptr], #0x10]\n"
911 "ldr q4, [%x[B_ptr], #0x20]\n"
912 "fmla v25.4s, v3.4s, v0.s[0]\n"
913 "ldr q5, [%x[B_ptr], #0x30]\n"
914 "fmla v26.4s, v4.4s, v0.s[0]\n"
915 "ldr q6, [%x[B_ptr], #0x40]\n"
916 "ldr q7, [%x[B_ptr], #0x50]\n"
917 "fmla v27.4s, v5.4s, v0.s[0]\n"
918 "add %x[B_ptr], %x[B_ptr], #0x80\n"
919 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
920 "fmla v28.4s, v6.4s, v0.s[0]\n"
921 "ldr q8, [%x[B_ptr], #0x0]\n"
922 "fmla v29.4s, v7.4s, v0.s[0]\n"
923 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
924 "ldr q9, [%x[B_ptr], #0x10]\n"
925 "fmla v24.4s, v8.4s, v0.s[1]\n"
926 "ldr q10, [%x[B_ptr], #0x20]\n"
927 "ldr q11, [%x[B_ptr], #0x30]\n"
928 "fmla v25.4s, v9.4s, v0.s[1]\n"
929 "ldr q12, [%x[B_ptr], #0x40]\n"
930 "fmla v26.4s, v10.4s, v0.s[1]\n"
931 "ldr q13, [%x[B_ptr], #0x50]\n"
932 "fmla v27.4s, v11.4s, v0.s[1]\n"
933 "add %x[B_ptr], %x[B_ptr], #0x80\n"
934 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
935 "fmla v28.4s, v12.4s, v0.s[1]\n"
936 "ldr q14, [%x[B_ptr], #0x0]\n"
937 "fmla v29.4s, v13.4s, v0.s[1]\n"
938 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
939 "ldr q15, [%x[B_ptr], #0x10]\n"
940 "fmla v24.4s, v14.4s, v0.s[2]\n"
941 "ldr q16, [%x[B_ptr], #0x20]\n"
942 "ldr q17, [%x[B_ptr], #0x30]\n"
943 "fmla v25.4s, v15.4s, v0.s[2]\n"
944 "ldr q18, [%x[B_ptr], #0x40]\n"
945 "ldr q19, [%x[B_ptr], #0x50]\n"
946 "fmla v26.4s, v16.4s, v0.s[2]\n"
947 "add %x[B_ptr], %x[B_ptr], #0x80\n"
948 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
949 "fmla v27.4s, v17.4s, v0.s[2]\n"
950 "ldr q20, [%x[B_ptr], #0x0]\n"
951 "fmla v28.4s, v18.4s, v0.s[2]\n"
952 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
953 "ldr q21, [%x[B_ptr], #0x10]\n"
954 "fmla v29.4s, v19.4s, v0.s[2]\n"
955 "ldr q22, [%x[B_ptr], #0x20]\n"
956 "ldr q23, [%x[B_ptr], #0x30]\n"
957 "fmla v24.4s, v20.4s, v0.s[3]\n"
958 "ldr q1, [%x[B_ptr], #0x40]\n"
959 "ldr q2, [%x[B_ptr], #0x50]\n"
960 "fmla v25.4s, v21.4s, v0.s[3]\n"
961 "add %x[B_ptr], %x[B_ptr], #0x80\n"
962 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
963 "fmla v26.4s, v22.4s, v0.s[3]\n"
964 "add x19, x19, #0x10\n"
965 "fmla v27.4s, v23.4s, v0.s[3]\n"
966 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
967 "prfm pldl1keep, [x19, #0x80]\n"
968 "fmla v28.4s, v1.4s, v0.s[3]\n"
969 "fmla v29.4s, v2.4s, v0.s[3]\n"
970 "66:" // Width 6: Multiply loop: Main loop skip
971 "cbz x20, 68f\n"
972 "67:" // Width 6: Multiply loop: Odd block loop
973 "ldr s0, [x19], #0x4\n"
974 "ldr q3, [%x[B_ptr], #0x0]\n"
975 "fmla v24.4s, v3.4s, v0.s[0]\n"
976 "ldr q4, [%x[B_ptr], #0x10]\n"
977 "ldr q5, [%x[B_ptr], #0x20]\n"
978 "fmla v25.4s, v4.4s, v0.s[0]\n"
979 "ldr q6, [%x[B_ptr], #0x30]\n"
980 "fmla v26.4s, v5.4s, v0.s[0]\n"
981 "ldr q7, [%x[B_ptr], #0x40]\n"
982 "ldr q8, [%x[B_ptr], #0x50]\n"
983 "fmla v27.4s, v6.4s, v0.s[0]\n"
984 "add %x[B_ptr], %x[B_ptr], #0x80\n"
985 "sub x20, x20, #0x1\n"
986 "fmla v28.4s, v7.4s, v0.s[0]\n"
987 "fmla v29.4s, v8.4s, v0.s[0]\n"
988 "cbnz x20, 67b\n"
989 "68:" // Width 6: Multiply loop: No odd multiplies
990 "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
991 "tbz %x[flags], #1, 69f\n"
992 "add x19, %x[args_ptr], %[offset_min]\n"
993 "ld1r { v17.4s }, [x19]\n"
994 "add x19, %x[args_ptr], %[offset_max]\n"
995 "ld1r { v16.4s }, [x19]\n"
996 "fmin v24.4s, v24.4s, v16.4s\n"
997 "fmin v25.4s, v25.4s, v16.4s\n"
998 "fmin v26.4s, v26.4s, v16.4s\n"
999 "fmin v27.4s, v27.4s, v16.4s\n"
1000 "fmax v24.4s, v24.4s, v17.4s\n"
1001 "fmax v25.4s, v25.4s, v17.4s\n"
1002 "fmax v26.4s, v26.4s, v17.4s\n"
1003 "fmax v27.4s, v27.4s, v17.4s\n"
1004 "fmin v28.4s, v28.4s, v16.4s\n"
1005 "fmin v29.4s, v29.4s, v16.4s\n"
1006 "fmax v28.4s, v28.4s, v17.4s\n"
1007 "fmax v29.4s, v29.4s, v17.4s\n"
1008 "69:" // Width 6: No activation
1009 "str q24, [%x[output_ptr], #0x0]\n"
1010 "str q25, [%x[output_ptr], #0x10]\n"
1011 "str q26, [%x[output_ptr], #0x20]\n"
1012 "str q27, [%x[output_ptr], #0x30]\n"
1013 "str q28, [%x[output_ptr], #0x40]\n"
1014 "cmp %x[N], #0x18\n"
1015 "add %x[output_ptr], %x[output_ptr], #0x50\n"
1016 "blt 70f\n"
1017 "str q29, [%x[output_ptr], #0x0]\n"
1018 "add %x[output_ptr], %x[output_ptr], #0x10\n"
1019 "b 72f\n"
1020 "70:" // Width 6: Partial writeback
1021 "tbz %x[N], #1, 71f\n"
1022 "str d29, [%x[output_ptr]], #0x8\n"
1023 "tbz %x[N], #0, 72f\n"
1024 "st1 { v29.s }[2], [%x[output_ptr]]\n"
1025 "b 72f\n"
1026 "71:" // Width 6: Partial direct writeback: partial_1_20
1027 "tbz %x[N], #0, 72f\n"
1028 "str s29, [%x[output_ptr], #0x0]\n"
1029 "72:" // Width 6: Writeback done
1030 "b 97f\n"
1031 "73:" // Width 7
1032 "mov x20, %x[K]\n"
1033 "mov x19, %x[A_ptr]\n"
1034 "cbz x21, 74f\n"
1035 "ldr q24, [x21, #0x0]\n"
1036 "ldr q25, [x21, #0x10]\n"
1037 "ldr q26, [x21, #0x20]\n"
1038 "ldr q27, [x21, #0x30]\n"
1039 "ldr q28, [x21, #0x40]\n"
1040 "ldr q29, [x21, #0x50]\n"
1041 "ldr q30, [x21, #0x60]\n"
1042 "add x21, x21, #0x70\n"
1043 "b 75f\n"
1044 "74:" // Width 7: no bias
1045 "movi v24.16b, #0x0\n"
1046 "movi v25.16b, #0x0\n"
1047 "movi v26.16b, #0x0\n"
1048 "movi v27.16b, #0x0\n"
1049 "movi v28.16b, #0x0\n"
1050 "movi v29.16b, #0x0\n"
1051 "movi v30.16b, #0x0\n"
1052 "75:" // Width 7: setup done
1053 "cmp x20, #0x4\n"
1054 "blt 78f\n"
1055 "cmp x20, #0x8\n"
1056 "blt 77f\n"
1057 "76:" // Width 7: Multiply loop: Main loop head
1058 "ldr q0, [x19, #0x0]\n"
1059 "ldr q1, [%x[B_ptr], #0x0]\n"
1060 "fmla v24.4s, v1.4s, v0.s[0]\n"
1061 "ldr q2, [%x[B_ptr], #0x10]\n"
1062 "ldr q3, [%x[B_ptr], #0x20]\n"
1063 "fmla v25.4s, v2.4s, v0.s[0]\n"
1064 "ldr q4, [%x[B_ptr], #0x30]\n"
1065 "fmla v26.4s, v3.4s, v0.s[0]\n"
1066 "ldr q5, [%x[B_ptr], #0x40]\n"
1067 "ldr q6, [%x[B_ptr], #0x50]\n"
1068 "fmla v27.4s, v4.4s, v0.s[0]\n"
1069 "ldr q7, [%x[B_ptr], #0x60]\n"
1070 "add %x[B_ptr], %x[B_ptr], #0x80\n"
1071 "fmla v28.4s, v5.4s, v0.s[0]\n"
1072 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
1073 "fmla v29.4s, v6.4s, v0.s[0]\n"
1074 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
1075 "ldr q8, [%x[B_ptr], #0x0]\n"
1076 "fmla v30.4s, v7.4s, v0.s[0]\n"
1077 "ldr q9, [%x[B_ptr], #0x10]\n"
1078 "ldr q10, [%x[B_ptr], #0x20]\n"
1079 "fmla v24.4s, v8.4s, v0.s[1]\n"
1080 "ldr q11, [%x[B_ptr], #0x30]\n"
1081 "ldr q12, [%x[B_ptr], #0x40]\n"
1082 "fmla v25.4s, v9.4s, v0.s[1]\n"
1083 "ldr q13, [%x[B_ptr], #0x50]\n"
1084 "fmla v26.4s, v10.4s, v0.s[1]\n"
1085 "ldr q14, [%x[B_ptr], #0x60]\n"
1086 "add %x[B_ptr], %x[B_ptr], #0x80\n"
1087 "fmla v27.4s, v11.4s, v0.s[1]\n"
1088 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
1089 "fmla v28.4s, v12.4s, v0.s[1]\n"
1090 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
1091 "ldr q15, [%x[B_ptr], #0x0]\n"
1092 "fmla v29.4s, v13.4s, v0.s[1]\n"
1093 "ldr q16, [%x[B_ptr], #0x10]\n"
1094 "ldr q17, [%x[B_ptr], #0x20]\n"
1095 "fmla v30.4s, v14.4s, v0.s[1]\n"
1096 "ldr q18, [%x[B_ptr], #0x30]\n"
1097 "fmla v24.4s, v15.4s, v0.s[2]\n"
1098 "ldr q19, [%x[B_ptr], #0x40]\n"
1099 "ldr q20, [%x[B_ptr], #0x50]\n"
1100 "fmla v25.4s, v16.4s, v0.s[2]\n"
1101 "ldr q21, [%x[B_ptr], #0x60]\n"
1102 "fmla v26.4s, v17.4s, v0.s[2]\n"
1103 "add %x[B_ptr], %x[B_ptr], #0x80\n"
1104 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
1105 "fmla v27.4s, v18.4s, v0.s[2]\n"
1106 "ldr q22, [%x[B_ptr], #0x0]\n"
1107 "fmla v28.4s, v19.4s, v0.s[2]\n"
1108 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
1109 "ldr q23, [%x[B_ptr], #0x10]\n"
1110 "fmla v29.4s, v20.4s, v0.s[2]\n"
1111 "ldr q1, [%x[B_ptr], #0x20]\n"
1112 "ldr q2, [%x[B_ptr], #0x30]\n"
1113 "fmla v30.4s, v21.4s, v0.s[2]\n"
1114 "ldr q3, [%x[B_ptr], #0x40]\n"
1115 "fmla v24.4s, v22.4s, v0.s[3]\n"
1116 "ldr q4, [%x[B_ptr], #0x50]\n"
1117 "ldr q5, [%x[B_ptr], #0x60]\n"
1118 "fmla v25.4s, v23.4s, v0.s[3]\n"
1119 "add %x[B_ptr], %x[B_ptr], #0x80\n"
1120 "fmla v26.4s, v1.4s, v0.s[3]\n"
1121 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
1122 "fmla v27.4s, v2.4s, v0.s[3]\n"
1123 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
1124 "add x19, x19, #0x10\n"
1125 "fmla v28.4s, v3.4s, v0.s[3]\n"
1126 "prfm pldl1keep, [x19, #0x80]\n"
1127 "sub x20, x20, #0x4\n"
1128 "fmla v29.4s, v4.4s, v0.s[3]\n"
1129 "cmp x20, #0x8\n"
1130 "fmla v30.4s, v5.4s, v0.s[3]\n"
1131 "bge 76b\n"
1132 "77:" // Width 7: Multiply loop: Single iteration only
1133 "sub x20, x20, #0x4\n"
1134 "ldr q0, [x19, #0x0]\n"
1135 "ldr q6, [%x[B_ptr], #0x0]\n"
1136 "fmla v24.4s, v6.4s, v0.s[0]\n"
1137 "ldr q7, [%x[B_ptr], #0x10]\n"
1138 "ldr q8, [%x[B_ptr], #0x20]\n"
1139 "fmla v25.4s, v7.4s, v0.s[0]\n"
1140 "ldr q9, [%x[B_ptr], #0x30]\n"
1141 "fmla v26.4s, v8.4s, v0.s[0]\n"
1142 "ldr q10, [%x[B_ptr], #0x40]\n"
1143 "ldr q11, [%x[B_ptr], #0x50]\n"
1144 "fmla v27.4s, v9.4s, v0.s[0]\n"
1145 "ldr q12, [%x[B_ptr], #0x60]\n"
1146 "add %x[B_ptr], %x[B_ptr], #0x80\n"
1147 "fmla v28.4s, v10.4s, v0.s[0]\n"
1148 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
1149 "fmla v29.4s, v11.4s, v0.s[0]\n"
1150 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
1151 "ldr q13, [%x[B_ptr], #0x0]\n"
1152 "fmla v30.4s, v12.4s, v0.s[0]\n"
1153 "ldr q14, [%x[B_ptr], #0x10]\n"
1154 "ldr q15, [%x[B_ptr], #0x20]\n"
1155 "fmla v24.4s, v13.4s, v0.s[1]\n"
1156 "ldr q16, [%x[B_ptr], #0x30]\n"
1157 "ldr q17, [%x[B_ptr], #0x40]\n"
1158 "fmla v25.4s, v14.4s, v0.s[1]\n"
1159 "ldr q18, [%x[B_ptr], #0x50]\n"
1160 "fmla v26.4s, v15.4s, v0.s[1]\n"
1161 "ldr q19, [%x[B_ptr], #0x60]\n"
1162 "add %x[B_ptr], %x[B_ptr], #0x80\n"
1163 "fmla v27.4s, v16.4s, v0.s[1]\n"
1164 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
1165 "fmla v28.4s, v17.4s, v0.s[1]\n"
1166 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
1167 "ldr q20, [%x[B_ptr], #0x0]\n"
1168 "fmla v29.4s, v18.4s, v0.s[1]\n"
1169 "ldr q21, [%x[B_ptr], #0x10]\n"
1170 "ldr q22, [%x[B_ptr], #0x20]\n"
1171 "fmla v30.4s, v19.4s, v0.s[1]\n"
1172 "ldr q23, [%x[B_ptr], #0x30]\n"
1173 "fmla v24.4s, v20.4s, v0.s[2]\n"
1174 "ldr q1, [%x[B_ptr], #0x40]\n"
1175 "ldr q2, [%x[B_ptr], #0x50]\n"
1176 "fmla v25.4s, v21.4s, v0.s[2]\n"
1177 "ldr q3, [%x[B_ptr], #0x60]\n"
1178 "fmla v26.4s, v22.4s, v0.s[2]\n"
1179 "add %x[B_ptr], %x[B_ptr], #0x80\n"
1180 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
1181 "fmla v27.4s, v23.4s, v0.s[2]\n"
1182 "ldr q4, [%x[B_ptr], #0x0]\n"
1183 "fmla v28.4s, v1.4s, v0.s[2]\n"
1184 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
1185 "ldr q5, [%x[B_ptr], #0x10]\n"
1186 "fmla v29.4s, v2.4s, v0.s[2]\n"
1187 "ldr q6, [%x[B_ptr], #0x20]\n"
1188 "ldr q7, [%x[B_ptr], #0x30]\n"
1189 "fmla v30.4s, v3.4s, v0.s[2]\n"
1190 "ldr q8, [%x[B_ptr], #0x40]\n"
1191 "fmla v24.4s, v4.4s, v0.s[3]\n"
1192 "ldr q9, [%x[B_ptr], #0x50]\n"
1193 "ldr q10, [%x[B_ptr], #0x60]\n"
1194 "fmla v25.4s, v5.4s, v0.s[3]\n"
1195 "add %x[B_ptr], %x[B_ptr], #0x80\n"
1196 "fmla v26.4s, v6.4s, v0.s[3]\n"
1197 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
1198 "fmla v27.4s, v7.4s, v0.s[3]\n"
1199 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
1200 "add x19, x19, #0x10\n"
1201 "fmla v28.4s, v8.4s, v0.s[3]\n"
1202 "prfm pldl1keep, [x19, #0x80]\n"
1203 "fmla v29.4s, v9.4s, v0.s[3]\n"
1204 "fmla v30.4s, v10.4s, v0.s[3]\n"
1205 "78:" // Width 7: Multiply loop: Main loop skip
1206 "cbz x20, 80f\n"
1207 "79:" // Width 7: Multiply loop: Odd block loop
1208 "ldr s0, [x19], #0x4\n"
1209 "ldr q11, [%x[B_ptr], #0x0]\n"
1210 "fmla v24.4s, v11.4s, v0.s[0]\n"
1211 "ldr q12, [%x[B_ptr], #0x10]\n"
1212 "ldr q13, [%x[B_ptr], #0x20]\n"
1213 "fmla v25.4s, v12.4s, v0.s[0]\n"
1214 "ldr q14, [%x[B_ptr], #0x30]\n"
1215 "fmla v26.4s, v13.4s, v0.s[0]\n"
1216 "ldr q15, [%x[B_ptr], #0x40]\n"
1217 "ldr q16, [%x[B_ptr], #0x50]\n"
1218 "fmla v27.4s, v14.4s, v0.s[0]\n"
1219 "ldr q17, [%x[B_ptr], #0x60]\n"
1220 "add %x[B_ptr], %x[B_ptr], #0x80\n"
1221 "fmla v28.4s, v15.4s, v0.s[0]\n"
1222 "fmla v29.4s, v16.4s, v0.s[0]\n"
1223 "sub x20, x20, #0x1\n"
1224 "fmla v30.4s, v17.4s, v0.s[0]\n"
1225 "cbnz x20, 79b\n"
1226 "80:" // Width 7: Multiply loop: No odd multiplies
1227 "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
1228 "tbz %x[flags], #1, 81f\n"
1229 "add x19, %x[args_ptr], %[offset_min]\n"
1230 "ld1r { v17.4s }, [x19]\n"
1231 "add x19, %x[args_ptr], %[offset_max]\n"
1232 "ld1r { v16.4s }, [x19]\n"
1233 "fmin v24.4s, v24.4s, v16.4s\n"
1234 "fmin v25.4s, v25.4s, v16.4s\n"
1235 "fmin v26.4s, v26.4s, v16.4s\n"
1236 "fmin v27.4s, v27.4s, v16.4s\n"
1237 "fmax v24.4s, v24.4s, v17.4s\n"
1238 "fmax v25.4s, v25.4s, v17.4s\n"
1239 "fmax v26.4s, v26.4s, v17.4s\n"
1240 "fmax v27.4s, v27.4s, v17.4s\n"
1241 "fmin v28.4s, v28.4s, v16.4s\n"
1242 "fmin v29.4s, v29.4s, v16.4s\n"
1243 "fmin v30.4s, v30.4s, v16.4s\n"
1244 "fmax v28.4s, v28.4s, v17.4s\n"
1245 "fmax v29.4s, v29.4s, v17.4s\n"
1246 "fmax v30.4s, v30.4s, v17.4s\n"
1247 "81:" // Width 7: No activation
1248 "str q24, [%x[output_ptr], #0x0]\n"
1249 "str q25, [%x[output_ptr], #0x10]\n"
1250 "str q26, [%x[output_ptr], #0x20]\n"
1251 "str q27, [%x[output_ptr], #0x30]\n"
1252 "str q28, [%x[output_ptr], #0x40]\n"
1253 "str q29, [%x[output_ptr], #0x50]\n"
1254 "cmp %x[N], #0x1c\n"
1255 "add %x[output_ptr], %x[output_ptr], #0x60\n"
1256 "blt 82f\n"
1257 "str q30, [%x[output_ptr], #0x0]\n"
1258 "add %x[output_ptr], %x[output_ptr], #0x10\n"
1259 "b 84f\n"
1260 "82:" // Width 7: Partial writeback
1261 "tbz %x[N], #1, 83f\n"
1262 "str d30, [%x[output_ptr]], #0x8\n"
1263 "tbz %x[N], #0, 84f\n"
1264 "st1 { v30.s }[2], [%x[output_ptr]]\n"
1265 "b 84f\n"
1266 "83:" // Width 7: Partial direct writeback: partial_1_24
1267 "tbz %x[N], #0, 84f\n"
1268 "str s30, [%x[output_ptr], #0x0]\n"
1269 "84:" // Width 7: Writeback done
1270 "b 97f\n"
1271 "85:" // Width 8
1272 "mov x20, %x[K]\n"
1273 "mov x19, %x[A_ptr]\n"
1274 "cbz x21, 86f\n"
1275 "ldr q24, [x21, #0x0]\n"
1276 "ldr q25, [x21, #0x10]\n"
1277 "ldr q26, [x21, #0x20]\n"
1278 "ldr q27, [x21, #0x30]\n"
1279 "ldr q28, [x21, #0x40]\n"
1280 "ldr q29, [x21, #0x50]\n"
1281 "ldr q30, [x21, #0x60]\n"
1282 "ldr q31, [x21, #0x70]\n"
1283 "add x21, x21, #0x80\n"
1284 "b 87f\n"
1285 "86:" // Width 8: no bias
1286 "movi v24.16b, #0x0\n"
1287 "movi v25.16b, #0x0\n"
1288 "movi v26.16b, #0x0\n"
1289 "movi v27.16b, #0x0\n"
1290 "movi v28.16b, #0x0\n"
1291 "movi v29.16b, #0x0\n"
1292 "movi v30.16b, #0x0\n"
1293 "movi v31.16b, #0x0\n"
1294 "87:" // Width 8: setup done
1295 "cmp x20, #0x4\n"
1296 "blt 90f\n"
1297 "cmp x20, #0x8\n"
1298 "blt 89f\n"
1299 "88:" // Width 8: Multiply loop: Main loop head
1300 "ldr q0, [x19, #0x0]\n"
1301 "ldr q1, [%x[B_ptr], #0x0]\n"
1302 "fmla v24.4s, v1.4s, v0.s[0]\n"
1303 "ldr q2, [%x[B_ptr], #0x10]\n"
1304 "ldr q3, [%x[B_ptr], #0x20]\n"
1305 "fmla v25.4s, v2.4s, v0.s[0]\n"
1306 "ldr q4, [%x[B_ptr], #0x30]\n"
1307 "fmla v26.4s, v3.4s, v0.s[0]\n"
1308 "ldr q5, [%x[B_ptr], #0x40]\n"
1309 "ldr q6, [%x[B_ptr], #0x50]\n"
1310 "fmla v27.4s, v4.4s, v0.s[0]\n"
1311 "ldr q7, [%x[B_ptr], #0x60]\n"
1312 "ldr q8, [%x[B_ptr], #0x70]\n"
1313 "fmla v28.4s, v5.4s, v0.s[0]\n"
1314 "fmla v29.4s, v6.4s, v0.s[0]\n"
1315 "add %x[B_ptr], %x[B_ptr], #0x80\n"
1316 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
1317 "fmla v30.4s, v7.4s, v0.s[0]\n"
1318 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
1319 "ldr q9, [%x[B_ptr], #0x0]\n"
1320 "fmla v31.4s, v8.4s, v0.s[0]\n"
1321 "ldr q10, [%x[B_ptr], #0x10]\n"
1322 "ldr q11, [%x[B_ptr], #0x20]\n"
1323 "fmla v24.4s, v9.4s, v0.s[1]\n"
1324 "ldr q12, [%x[B_ptr], #0x30]\n"
1325 "ldr q13, [%x[B_ptr], #0x40]\n"
1326 "fmla v25.4s, v10.4s, v0.s[1]\n"
1327 "fmla v26.4s, v11.4s, v0.s[1]\n"
1328 "ldr q14, [%x[B_ptr], #0x50]\n"
1329 "ldr q15, [%x[B_ptr], #0x60]\n"
1330 "fmla v27.4s, v12.4s, v0.s[1]\n"
1331 "ldr q16, [%x[B_ptr], #0x70]\n"
1332 "fmla v28.4s, v13.4s, v0.s[1]\n"
1333 "add %x[B_ptr], %x[B_ptr], #0x80\n"
1334 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
1335 "fmla v29.4s, v14.4s, v0.s[1]\n"
1336 "ldr q17, [%x[B_ptr], #0x0]\n"
1337 "fmla v30.4s, v15.4s, v0.s[1]\n"
1338 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
1339 "ldr q18, [%x[B_ptr], #0x10]\n"
1340 "fmla v31.4s, v16.4s, v0.s[1]\n"
1341 "ldr q19, [%x[B_ptr], #0x20]\n"
1342 "ldr q20, [%x[B_ptr], #0x30]\n"
1343 "fmla v24.4s, v17.4s, v0.s[2]\n"
1344 "ldr q21, [%x[B_ptr], #0x40]\n"
1345 "ldr q22, [%x[B_ptr], #0x50]\n"
1346 "fmla v25.4s, v18.4s, v0.s[2]\n"
1347 "ldr q23, [%x[B_ptr], #0x60]\n"
1348 "fmla v26.4s, v19.4s, v0.s[2]\n"
1349 "ldr q1, [%x[B_ptr], #0x70]\n"
1350 "fmla v27.4s, v20.4s, v0.s[2]\n"
1351 "add %x[B_ptr], %x[B_ptr], #0x80\n"
1352 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
1353 "fmla v28.4s, v21.4s, v0.s[2]\n"
1354 "ldr q2, [%x[B_ptr], #0x0]\n"
1355 "fmla v29.4s, v22.4s, v0.s[2]\n"
1356 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
1357 "ldr q3, [%x[B_ptr], #0x10]\n"
1358 "fmla v30.4s, v23.4s, v0.s[2]\n"
1359 "ldr q4, [%x[B_ptr], #0x20]\n"
1360 "ldr q5, [%x[B_ptr], #0x30]\n"
1361 "fmla v31.4s, v1.4s, v0.s[2]\n"
1362 "ldr q6, [%x[B_ptr], #0x40]\n"
1363 "fmla v24.4s, v2.4s, v0.s[3]\n"
1364 "ldr q7, [%x[B_ptr], #0x50]\n"
1365 "ldr q8, [%x[B_ptr], #0x60]\n"
1366 "fmla v25.4s, v3.4s, v0.s[3]\n"
1367 "ldr q9, [%x[B_ptr], #0x70]\n"
1368 "fmla v26.4s, v4.4s, v0.s[3]\n"
1369 "fmla v27.4s, v5.4s, v0.s[3]\n"
1370 "add %x[B_ptr], %x[B_ptr], #0x80\n"
1371 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
1372 "fmla v28.4s, v6.4s, v0.s[3]\n"
1373 "add x19, x19, #0x10\n"
1374 "fmla v29.4s, v7.4s, v0.s[3]\n"
1375 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
1376 "sub x20, x20, #0x4\n"
1377 "fmla v30.4s, v8.4s, v0.s[3]\n"
1378 "prfm pldl1keep, [x19, #0x80]\n"
1379 "cmp x20, #0x8\n"
1380 "fmla v31.4s, v9.4s, v0.s[3]\n"
1381 "bge 88b\n"
1382 "89:" // Width 8: Multiply loop: Single iteration only
1383 "sub x20, x20, #0x4\n"
1384 "ldr q0, [x19, #0x0]\n"
1385 "ldr q10, [%x[B_ptr], #0x0]\n"
1386 "fmla v24.4s, v10.4s, v0.s[0]\n"
1387 "ldr q11, [%x[B_ptr], #0x10]\n"
1388 "ldr q12, [%x[B_ptr], #0x20]\n"
1389 "fmla v25.4s, v11.4s, v0.s[0]\n"
1390 "ldr q13, [%x[B_ptr], #0x30]\n"
1391 "fmla v26.4s, v12.4s, v0.s[0]\n"
1392 "ldr q14, [%x[B_ptr], #0x40]\n"
1393 "ldr q15, [%x[B_ptr], #0x50]\n"
1394 "fmla v27.4s, v13.4s, v0.s[0]\n"
1395 "ldr q16, [%x[B_ptr], #0x60]\n"
1396 "ldr q17, [%x[B_ptr], #0x70]\n"
1397 "fmla v28.4s, v14.4s, v0.s[0]\n"
1398 "fmla v29.4s, v15.4s, v0.s[0]\n"
1399 "add %x[B_ptr], %x[B_ptr], #0x80\n"
1400 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
1401 "fmla v30.4s, v16.4s, v0.s[0]\n"
1402 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
1403 "ldr q18, [%x[B_ptr], #0x0]\n"
1404 "fmla v31.4s, v17.4s, v0.s[0]\n"
1405 "ldr q19, [%x[B_ptr], #0x10]\n"
1406 "ldr q20, [%x[B_ptr], #0x20]\n"
1407 "fmla v24.4s, v18.4s, v0.s[1]\n"
1408 "ldr q21, [%x[B_ptr], #0x30]\n"
1409 "ldr q22, [%x[B_ptr], #0x40]\n"
1410 "fmla v25.4s, v19.4s, v0.s[1]\n"
1411 "fmla v26.4s, v20.4s, v0.s[1]\n"
1412 "ldr q23, [%x[B_ptr], #0x50]\n"
1413 "ldr q1, [%x[B_ptr], #0x60]\n"
1414 "fmla v27.4s, v21.4s, v0.s[1]\n"
1415 "ldr q2, [%x[B_ptr], #0x70]\n"
1416 "fmla v28.4s, v22.4s, v0.s[1]\n"
1417 "add %x[B_ptr], %x[B_ptr], #0x80\n"
1418 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
1419 "fmla v29.4s, v23.4s, v0.s[1]\n"
1420 "ldr q3, [%x[B_ptr], #0x0]\n"
1421 "fmla v30.4s, v1.4s, v0.s[1]\n"
1422 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
1423 "ldr q4, [%x[B_ptr], #0x10]\n"
1424 "fmla v31.4s, v2.4s, v0.s[1]\n"
1425 "ldr q5, [%x[B_ptr], #0x20]\n"
1426 "ldr q6, [%x[B_ptr], #0x30]\n"
1427 "fmla v24.4s, v3.4s, v0.s[2]\n"
1428 "ldr q7, [%x[B_ptr], #0x40]\n"
1429 "ldr q8, [%x[B_ptr], #0x50]\n"
1430 "fmla v25.4s, v4.4s, v0.s[2]\n"
1431 "ldr q9, [%x[B_ptr], #0x60]\n"
1432 "fmla v26.4s, v5.4s, v0.s[2]\n"
1433 "ldr q10, [%x[B_ptr], #0x70]\n"
1434 "fmla v27.4s, v6.4s, v0.s[2]\n"
1435 "add %x[B_ptr], %x[B_ptr], #0x80\n"
1436 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
1437 "fmla v28.4s, v7.4s, v0.s[2]\n"
1438 "ldr q11, [%x[B_ptr], #0x0]\n"
1439 "fmla v29.4s, v8.4s, v0.s[2]\n"
1440 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
1441 "ldr q12, [%x[B_ptr], #0x10]\n"
1442 "fmla v30.4s, v9.4s, v0.s[2]\n"
1443 "ldr q13, [%x[B_ptr], #0x20]\n"
1444 "ldr q14, [%x[B_ptr], #0x30]\n"
1445 "fmla v31.4s, v10.4s, v0.s[2]\n"
1446 "ldr q15, [%x[B_ptr], #0x40]\n"
1447 "fmla v24.4s, v11.4s, v0.s[3]\n"
1448 "ldr q16, [%x[B_ptr], #0x50]\n"
1449 "ldr q17, [%x[B_ptr], #0x60]\n"
1450 "fmla v25.4s, v12.4s, v0.s[3]\n"
1451 "ldr q18, [%x[B_ptr], #0x70]\n"
1452 "fmla v26.4s, v13.4s, v0.s[3]\n"
1453 "fmla v27.4s, v14.4s, v0.s[3]\n"
1454 "add %x[B_ptr], %x[B_ptr], #0x80\n"
1455 "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
1456 "fmla v28.4s, v15.4s, v0.s[3]\n"
1457 "add x19, x19, #0x10\n"
1458 "fmla v29.4s, v16.4s, v0.s[3]\n"
1459 "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
1460 "fmla v30.4s, v17.4s, v0.s[3]\n"
1461 "prfm pldl1keep, [x19, #0x80]\n"
1462 "fmla v31.4s, v18.4s, v0.s[3]\n"
1463 "90:" // Width 8: Multiply loop: Main loop skip
1464 "cbz x20, 92f\n"
1465 "91:" // Width 8: Multiply loop: Odd block loop
1466 "ldr s0, [x19], #0x4\n"
1467 "ldr q19, [%x[B_ptr], #0x0]\n"
1468 "fmla v24.4s, v19.4s, v0.s[0]\n"
1469 "ldr q20, [%x[B_ptr], #0x10]\n"
1470 "ldr q21, [%x[B_ptr], #0x20]\n"
1471 "fmla v25.4s, v20.4s, v0.s[0]\n"
1472 "ldr q22, [%x[B_ptr], #0x30]\n"
1473 "fmla v26.4s, v21.4s, v0.s[0]\n"
1474 "ldr q23, [%x[B_ptr], #0x40]\n"
1475 "ldr q1, [%x[B_ptr], #0x50]\n"
1476 "fmla v27.4s, v22.4s, v0.s[0]\n"
1477 "ldr q2, [%x[B_ptr], #0x60]\n"
1478 "ldr q3, [%x[B_ptr], #0x70]\n"
1479 "fmla v28.4s, v23.4s, v0.s[0]\n"
1480 "fmla v29.4s, v1.4s, v0.s[0]\n"
1481 "add %x[B_ptr], %x[B_ptr], #0x80\n"
1482 "sub x20, x20, #0x1\n"
1483 "fmla v30.4s, v2.4s, v0.s[0]\n"
1484 "fmla v31.4s, v3.4s, v0.s[0]\n"
1485 "cbnz x20, 91b\n"
1486 "92:" // Width 8: Multiply loop: No odd multiplies
1487 "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
1488 "tbz %x[flags], #1, 93f\n"
1489 "add x19, %x[args_ptr], %[offset_min]\n"
1490 "ld1r { v17.4s }, [x19]\n"
1491 "add x19, %x[args_ptr], %[offset_max]\n"
1492 "ld1r { v16.4s }, [x19]\n"
1493 "fmin v24.4s, v24.4s, v16.4s\n"
1494 "fmin v25.4s, v25.4s, v16.4s\n"
1495 "fmin v26.4s, v26.4s, v16.4s\n"
1496 "fmin v27.4s, v27.4s, v16.4s\n"
1497 "fmax v24.4s, v24.4s, v17.4s\n"
1498 "fmax v25.4s, v25.4s, v17.4s\n"
1499 "fmax v26.4s, v26.4s, v17.4s\n"
1500 "fmax v27.4s, v27.4s, v17.4s\n"
1501 "fmin v28.4s, v28.4s, v16.4s\n"
1502 "fmin v29.4s, v29.4s, v16.4s\n"
1503 "fmin v30.4s, v30.4s, v16.4s\n"
1504 "fmax v28.4s, v28.4s, v17.4s\n"
1505 "fmax v29.4s, v29.4s, v17.4s\n"
1506 "fmax v30.4s, v30.4s, v17.4s\n"
1507 "fmin v31.4s, v31.4s, v16.4s\n"
1508 "fmax v31.4s, v31.4s, v17.4s\n"
1509 "93:" // Width 8: No activation
1510 "str q24, [%x[output_ptr], #0x0]\n"
1511 "str q25, [%x[output_ptr], #0x10]\n"
1512 "str q26, [%x[output_ptr], #0x20]\n"
1513 "str q27, [%x[output_ptr], #0x30]\n"
1514 "str q28, [%x[output_ptr], #0x40]\n"
1515 "str q29, [%x[output_ptr], #0x50]\n"
1516 "str q30, [%x[output_ptr], #0x60]\n"
1517 "cmp %x[N], #0x20\n"
1518 "add %x[output_ptr], %x[output_ptr], #0x70\n"
1519 "blt 94f\n"
1520 "str q31, [%x[output_ptr], #0x0]\n"
1521 "add %x[output_ptr], %x[output_ptr], #0x10\n"
1522 "b 96f\n"
1523 "94:" // Width 8: Partial writeback
1524 "tbz %x[N], #1, 95f\n"
1525 "str d31, [%x[output_ptr]], #0x8\n"
1526 "tbz %x[N], #0, 96f\n"
1527 "st1 { v31.s }[2], [%x[output_ptr]]\n"
1528 "b 96f\n"
1529 "95:" // Width 8: Partial direct writeback: partial_1_28
1530 "tbz %x[N], #0, 96f\n"
1531 "str s31, [%x[output_ptr], #0x0]\n"
1532 "96:" // Width 8: Writeback done
1533 "subs x22, x22, #0x8\n"
1534 "sub %x[N], %x[N], #0x20\n"
1535 "bgt 1b\n"
1536 "97:" // Exit
1537
1538 : [B_ptr] "+r" (B_ptr), [N] "+r" (N), [output_ptr] "+r" (output_ptr)
1539 : [A_ptr] "r" (A_ptr), [K] "r" (K), [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval))
1540 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22"
1541 );
1542 }
1543
1544 } // namespace arm_gemm
1545
1546 #endif
1547