1 /*
2 * Copyright (c) 2022 Arm Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SME)
26
27 #include <cstddef>
28
29 namespace arm_conv {
30 namespace winograd {
31 namespace output_transform {
32
sme_fp32_mopa_4x4_3x3(const unsigned int n_channels,const float * inptr,const size_t matrix_stride,const float * bptr,float * const output,const size_t output_row_stride,const size_t output_col_stride,const float output_min,const float output_max)33 void sme_fp32_mopa_4x4_3x3(
34 const unsigned int n_channels,
35 const float* inptr,
36 const size_t matrix_stride,
37 const float* bptr,
38 float* const output,
39 const size_t output_row_stride,
40 const size_t output_col_stride,
41 const float output_min,
42 const float output_max
43 )
44 {
45 // The below assembler uses the Kronecker product and the "vec trick" to
46 // implement the Winograd output transform (y = AT Y A) using the SME
47 // array. This code REQUIRES that the vectors are 512b long (or longer, if
48 // we add some predication).
49 //
50 // The "vec trick" uses the identity $vec(AT Y A) = (AT (x) AT) vec(Y)$ to
51 // convert the chain of matrix multiplications into a matrix-vector
52 // product. We then stack multiple channels of vec(Y) together to allow us
53 // to perform multiple channels of the transformation simultaneously.
54 //
55 // Since the complete matrix (AT (x) AT) is quite big [16 x 36], we compute
56 // it on the fly. To do so, we store two representations of the matrix AT.
57 // The first representation (the outer terms) contains, within each quad,
58 // four coefficients of the matrix AT.
59 const float outer_terms[32] = {
60 1, 1, 1, 1,
61 0, 1, -1, 2,
62 0, 1, 1, 4,
63 0, 1, -1, 8,
64 // The following rows are continuations of the first four rows, and each
65 // contains two columns of padding values which aren't used in the
66 // computation but are there to ensure that the coefficients end up in
67 // the right quads of the vector into which they're read.
68 1, 0, 0, 0,
69 -2, 0, 0, 0,
70 4, 0, 0, 0,
71 -8, 1, 0, 0
72 };
73 // This should be loaded completely into two Z registers.
74 //
75 // We can then use by-element FMUL to construct columns of (AT (x) AT) by
76 // multiplying elements of the outer terms against the following inner
77 // terms (again split into quads, but expected to be loaded replicated such
78 // that each of the six required Z registers contains a repeated quad of
79 // the values).
80 const float inner_terms[24] = {
81 1, 0, 0, 0,
82 1, 1, 1, 1,
83 1, -1, 1, -1,
84 1, 2, 4, 8,
85 1, -2, 4, -8,
86 0, 0, 0, 1
87 };
88
// Bundle of values handed to the inline assembly through a single pointer
// operand. The assembly reads every field at a byte offset supplied by the
// operands named `offsetof_Params_*`, so the field names and types declared
// here have to stay in step with those operands.
struct Params
{
  // Base address of the outer-term coefficient table.
  const float *outer_terms;
  // Base address of the inner-term coefficient table.
  const float *inner_terms;
  // Lower clamp bound: broadcast by the assembly and applied with FMAX.
  float act_min;
  // Upper clamp bound: broadcast by the assembly and applied with FMIN.
  float act_max;

  // Copies the four arguments into the corresponding fields, in order.
  Params(const float *outer, const float *inner, float lower, float upper)
    : outer_terms(outer),
      inner_terms(inner),
      act_min(lower),
      act_max(upper)
  {
  }
};
105
106 Params params(outer_terms, inner_terms, output_min, output_max);
107
108 __asm__ __volatile__(
109 "ldr x20, [%x[params], %[offsetof_Params_outer_terms]]\n"
110 ".inst 0xd503477f // SMSTART ZA\n"
111 "ptrue p5.b\n"
112 "ld1rw { z12.s }, p5/Z, [%x[params], %[offsetof_Params_act_min]]\n"
113 "ld1rw { z10.s }, p5/Z, [%x[params], %[offsetof_Params_act_max]]\n"
114 "pfalse p8.b\n"
115 "ldr x19, [%x[params], %[offsetof_Params_inner_terms]]\n"
116 "ld1w { z6.s }, p5/Z, [x20]\n"
117 "ld1w { z7.s }, p5/Z, [x20, #1, MUL VL]\n"
118 "ld1rqw { z9.s }, p5/Z, [x19]\n"
119 "ld1rqw { z8.s }, p5/Z, [x19, #16]\n"
120 "ld1rqw { z15.s }, p5/Z, [x19, #32]\n"
121 "fmul z11.s, z9.s, z6.s[0]\n"
122 "fmul z5.s, z9.s, z6.s[1]\n"
123 "ld1rqw { z4.s }, p5/Z, [x19, #48]\n"
124 "ld1rqw { z3.s }, p5/Z, [x19, #64]\n"
125 "ld1rqw { z2.s }, p5/Z, [x19, #80]\n"
126 "cbz %x[bptr], 1f\n"
127 "ptrue p8.s\n"
128 "1:" // Set bias predicate: Done
129 ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
130 "fmov z1.s, #1.0\n"
131 "mov x25, #0x0\n"
132 "cntw x24\n"
133 "cntw x23, ALL, MUL #2\n"
134 "cntw x22, ALL, MUL #3\n"
135 "whilelt p4.s, x25, %x[n_channels]\n"
136 "whilelt p3.s, x24, %x[n_channels]\n"
137 "ld1w { z31.s }, p4/Z, [%x[inptr], x25, LSL #2]\n"
138 "ld1w { z30.s }, p3/Z, [%x[inptr], x24, LSL #2]\n"
139 "whilelt p2.s, x23, %x[n_channels]\n"
140 "whilelt p1.s, x22, %x[n_channels]\n"
141 "ld1w { z29.s }, p2/Z, [%x[inptr], x23, LSL #2]\n"
142 "add x21, %x[inptr], %x[matrix_stride], LSL #2\n"
143 "and p0.b, p5/Z, p8.b, p4.b\n"
144 "ld1w { z28.s }, p1/Z, [%x[inptr], x22, LSL #2]\n"
145 "ld1w { z27.s }, p4/Z, [x21, x25, LSL #2]\n"
146 "ld1w { z26.s }, p3/Z, [x21, x24, LSL #2]\n"
147 "ld1w { z25.s }, p2/Z, [x21, x23, LSL #2]\n"
148 "ld1w { z24.s }, p1/Z, [x21, x22, LSL #2]\n"
149 "add x21, x21, %x[matrix_stride], LSL #2\n"
150 "ld1w { z23.s }, p4/Z, [x21, x25, LSL #2]\n"
151 "ld1w { z22.s }, p3/Z, [x21, x24, LSL #2]\n"
152 "ld1w { z21.s }, p2/Z, [x21, x23, LSL #2]\n"
153 "ld1w { z20.s }, p1/Z, [x21, x22, LSL #2]\n"
154 "add x21, x21, %x[matrix_stride], LSL #2\n"
155 "ld1w { z19.s }, p4/Z, [x21, x25, LSL #2]\n"
156 "ld1w { z18.s }, p3/Z, [x21, x24, LSL #2]\n"
157 "ld1w { z17.s }, p2/Z, [x21, x23, LSL #2]\n"
158 "ld1w { z16.s }, p1/Z, [x21, x22, LSL #2]\n"
159 "ld1w { z0.s }, p0/Z, [%x[bptr], x25, LSL #2]\n"
160 "and p0.b, p5/Z, p8.b, p3.b\n"
161 ".inst 0x8080b420 // fmopa za0.s, p5/M, p5/M, z1.s, z0.s\n"
162 "ld1w { z0.s }, p0/Z, [%x[bptr], x24, LSL #2]\n"
163 "and p0.b, p5/Z, p8.b, p2.b\n"
164 ".inst 0x8080b421 // fmopa za1.s, p5/M, p5/M, z1.s, z0.s\n"
165 "ld1w { z0.s }, p0/Z, [%x[bptr], x23, LSL #2]\n"
166 "and p0.b, p5/Z, p8.b, p1.b\n"
167 ".inst 0x8080b422 // fmopa za2.s, p5/M, p5/M, z1.s, z0.s\n"
168 "ld1w { z0.s }, p0/Z, [%x[bptr], x22, LSL #2]\n"
169 ".inst 0x8080b423 // fmopa za3.s, p5/M, p5/M, z1.s, z0.s\n"
170 "2:" // Loop
171 ".inst 0x809fb560 // fmopa za0.s, p5/M, p5/M, z11.s, z31.s\n"
172 "add x21, x21, %x[matrix_stride], LSL #2\n"
173 "mov x14, #0xc\n"
174 ".inst 0x809eb561 // fmopa za1.s, p5/M, p5/M, z11.s, z30.s\n"
175 "ld1w { z31.s }, p4/Z, [x21, x25, LSL #2]\n"
176 "whilelt p0.s, x25, %x[n_channels]\n"
177 "add x20, %x[output], %x[output_col_stride], LSL #2\n"
178 ".inst 0x809db562 // fmopa za2.s, p5/M, p5/M, z11.s, z29.s\n"
179 "ld1w { z30.s }, p3/Z, [x21, x24, LSL #2]\n"
180 "add x19, %x[output], %x[output_row_stride], LSL #2\n"
181 ".inst 0x809cb563 // fmopa za3.s, p5/M, p5/M, z11.s, z28.s\n"
182 "fmul z11.s, z9.s, z6.s[2]\n"
183 "ld1w { z29.s }, p2/Z, [x21, x23, LSL #2]\n"
184 ".inst 0x809bb4a0 // fmopa za0.s, p5/M, p5/M, z5.s, z27.s\n"
185 "ld1w { z28.s }, p1/Z, [x21, x22, LSL #2]\n"
186 "add x21, x21, %x[matrix_stride], LSL #2\n"
187 ".inst 0x809ab4a1 // fmopa za1.s, p5/M, p5/M, z5.s, z26.s\n"
188 "ld1w { z27.s }, p4/Z, [x21, x25, LSL #2]\n"
189 ".inst 0x8099b4a2 // fmopa za2.s, p5/M, p5/M, z5.s, z25.s\n"
190 "ld1w { z26.s }, p3/Z, [x21, x24, LSL #2]\n"
191 ".inst 0x8098b4a3 // fmopa za3.s, p5/M, p5/M, z5.s, z24.s\n"
192 "fmul z5.s, z9.s, z6.s[3]\n"
193 "ld1w { z25.s }, p2/Z, [x21, x23, LSL #2]\n"
194 ".inst 0x8097b560 // fmopa za0.s, p5/M, p5/M, z11.s, z23.s\n"
195 "ld1w { z24.s }, p1/Z, [x21, x22, LSL #2]\n"
196 "add x21, x21, %x[matrix_stride], LSL #2\n"
197 ".inst 0x8096b561 // fmopa za1.s, p5/M, p5/M, z11.s, z22.s\n"
198 "ld1w { z23.s }, p4/Z, [x21, x25, LSL #2]\n"
199 ".inst 0x8095b562 // fmopa za2.s, p5/M, p5/M, z11.s, z21.s\n"
200 "ld1w { z22.s }, p3/Z, [x21, x24, LSL #2]\n"
201 ".inst 0x8094b563 // fmopa za3.s, p5/M, p5/M, z11.s, z20.s\n"
202 "fmul z11.s, z9.s, z7.s[0]\n"
203 "ld1w { z21.s }, p2/Z, [x21, x23, LSL #2]\n"
204 ".inst 0x8093b4a0 // fmopa za0.s, p5/M, p5/M, z5.s, z19.s\n"
205 "ld1w { z20.s }, p1/Z, [x21, x22, LSL #2]\n"
206 "add x21, x21, %x[matrix_stride], LSL #2\n"
207 ".inst 0x8092b4a1 // fmopa za1.s, p5/M, p5/M, z5.s, z18.s\n"
208 "ld1w { z19.s }, p4/Z, [x21, x25, LSL #2]\n"
209 ".inst 0x8091b4a2 // fmopa za2.s, p5/M, p5/M, z5.s, z17.s\n"
210 "ld1w { z18.s }, p3/Z, [x21, x24, LSL #2]\n"
211 ".inst 0x8090b4a3 // fmopa za3.s, p5/M, p5/M, z5.s, z16.s\n"
212 "fmul z5.s, z9.s, z7.s[1]\n"
213 "ld1w { z17.s }, p2/Z, [x21, x23, LSL #2]\n"
214 ".inst 0x809fb560 // fmopa za0.s, p5/M, p5/M, z11.s, z31.s\n"
215 "ld1w { z16.s }, p1/Z, [x21, x22, LSL #2]\n"
216 "add x21, x21, %x[matrix_stride], LSL #2\n"
217 ".inst 0x809eb561 // fmopa za1.s, p5/M, p5/M, z11.s, z30.s\n"
218 "ld1w { z31.s }, p4/Z, [x21, x25, LSL #2]\n"
219 ".inst 0x809db562 // fmopa za2.s, p5/M, p5/M, z11.s, z29.s\n"
220 "ld1w { z30.s }, p3/Z, [x21, x24, LSL #2]\n"
221 ".inst 0x809cb563 // fmopa za3.s, p5/M, p5/M, z11.s, z28.s\n"
222 "fmul z11.s, z8.s, z6.s[0]\n"
223 "ld1w { z29.s }, p2/Z, [x21, x23, LSL #2]\n"
224 ".inst 0x809bb4a0 // fmopa za0.s, p5/M, p5/M, z5.s, z27.s\n"
225 "ld1w { z28.s }, p1/Z, [x21, x22, LSL #2]\n"
226 "add x21, x21, %x[matrix_stride], LSL #2\n"
227 ".inst 0x809ab4a1 // fmopa za1.s, p5/M, p5/M, z5.s, z26.s\n"
228 "ld1w { z27.s }, p4/Z, [x21, x25, LSL #2]\n"
229 ".inst 0x8099b4a2 // fmopa za2.s, p5/M, p5/M, z5.s, z25.s\n"
230 "ld1w { z26.s }, p3/Z, [x21, x24, LSL #2]\n"
231 ".inst 0x8098b4a3 // fmopa za3.s, p5/M, p5/M, z5.s, z24.s\n"
232 "fmul z5.s, z8.s, z6.s[1]\n"
233 "ld1w { z25.s }, p2/Z, [x21, x23, LSL #2]\n"
234 ".inst 0x8097b560 // fmopa za0.s, p5/M, p5/M, z11.s, z23.s\n"
235 "ld1w { z24.s }, p1/Z, [x21, x22, LSL #2]\n"
236 "add x21, x21, %x[matrix_stride], LSL #2\n"
237 ".inst 0x8096b561 // fmopa za1.s, p5/M, p5/M, z11.s, z22.s\n"
238 "ld1w { z23.s }, p4/Z, [x21, x25, LSL #2]\n"
239 ".inst 0x8095b562 // fmopa za2.s, p5/M, p5/M, z11.s, z21.s\n"
240 "ld1w { z22.s }, p3/Z, [x21, x24, LSL #2]\n"
241 ".inst 0x8094b563 // fmopa za3.s, p5/M, p5/M, z11.s, z20.s\n"
242 "fmul z11.s, z8.s, z6.s[2]\n"
243 "ld1w { z21.s }, p2/Z, [x21, x23, LSL #2]\n"
244 ".inst 0x8093b4a0 // fmopa za0.s, p5/M, p5/M, z5.s, z19.s\n"
245 "ld1w { z20.s }, p1/Z, [x21, x22, LSL #2]\n"
246 "add x21, x21, %x[matrix_stride], LSL #2\n"
247 ".inst 0x8092b4a1 // fmopa za1.s, p5/M, p5/M, z5.s, z18.s\n"
248 "ld1w { z19.s }, p4/Z, [x21, x25, LSL #2]\n"
249 ".inst 0x8091b4a2 // fmopa za2.s, p5/M, p5/M, z5.s, z17.s\n"
250 "ld1w { z18.s }, p3/Z, [x21, x24, LSL #2]\n"
251 ".inst 0x8090b4a3 // fmopa za3.s, p5/M, p5/M, z5.s, z16.s\n"
252 "fmul z5.s, z8.s, z6.s[3]\n"
253 "ld1w { z17.s }, p2/Z, [x21, x23, LSL #2]\n"
254 ".inst 0x809fb560 // fmopa za0.s, p5/M, p5/M, z11.s, z31.s\n"
255 "ld1w { z16.s }, p1/Z, [x21, x22, LSL #2]\n"
256 "add x21, x21, %x[matrix_stride], LSL #2\n"
257 ".inst 0x809eb561 // fmopa za1.s, p5/M, p5/M, z11.s, z30.s\n"
258 "ld1w { z31.s }, p4/Z, [x21, x25, LSL #2]\n"
259 ".inst 0x809db562 // fmopa za2.s, p5/M, p5/M, z11.s, z29.s\n"
260 "ld1w { z30.s }, p3/Z, [x21, x24, LSL #2]\n"
261 ".inst 0x809cb563 // fmopa za3.s, p5/M, p5/M, z11.s, z28.s\n"
262 "fmul z11.s, z8.s, z7.s[0]\n"
263 "ld1w { z29.s }, p2/Z, [x21, x23, LSL #2]\n"
264 ".inst 0x809bb4a0 // fmopa za0.s, p5/M, p5/M, z5.s, z27.s\n"
265 "ld1w { z28.s }, p1/Z, [x21, x22, LSL #2]\n"
266 "add x21, x21, %x[matrix_stride], LSL #2\n"
267 ".inst 0x809ab4a1 // fmopa za1.s, p5/M, p5/M, z5.s, z26.s\n"
268 "ld1w { z27.s }, p4/Z, [x21, x25, LSL #2]\n"
269 ".inst 0x8099b4a2 // fmopa za2.s, p5/M, p5/M, z5.s, z25.s\n"
270 "ld1w { z26.s }, p3/Z, [x21, x24, LSL #2]\n"
271 ".inst 0x8098b4a3 // fmopa za3.s, p5/M, p5/M, z5.s, z24.s\n"
272 "fmul z5.s, z8.s, z7.s[1]\n"
273 "ld1w { z25.s }, p2/Z, [x21, x23, LSL #2]\n"
274 ".inst 0x8097b560 // fmopa za0.s, p5/M, p5/M, z11.s, z23.s\n"
275 "ld1w { z24.s }, p1/Z, [x21, x22, LSL #2]\n"
276 "add x21, x21, %x[matrix_stride], LSL #2\n"
277 ".inst 0x8096b561 // fmopa za1.s, p5/M, p5/M, z11.s, z22.s\n"
278 "ld1w { z23.s }, p4/Z, [x21, x25, LSL #2]\n"
279 ".inst 0x8095b562 // fmopa za2.s, p5/M, p5/M, z11.s, z21.s\n"
280 "ld1w { z22.s }, p3/Z, [x21, x24, LSL #2]\n"
281 ".inst 0x8094b563 // fmopa za3.s, p5/M, p5/M, z11.s, z20.s\n"
282 "fmul z11.s, z15.s, z6.s[0]\n"
283 "ld1w { z21.s }, p2/Z, [x21, x23, LSL #2]\n"
284 ".inst 0x8093b4a0 // fmopa za0.s, p5/M, p5/M, z5.s, z19.s\n"
285 "ld1w { z20.s }, p1/Z, [x21, x22, LSL #2]\n"
286 "add x21, x21, %x[matrix_stride], LSL #2\n"
287 ".inst 0x8092b4a1 // fmopa za1.s, p5/M, p5/M, z5.s, z18.s\n"
288 "ld1w { z19.s }, p4/Z, [x21, x25, LSL #2]\n"
289 ".inst 0x8091b4a2 // fmopa za2.s, p5/M, p5/M, z5.s, z17.s\n"
290 "ld1w { z18.s }, p3/Z, [x21, x24, LSL #2]\n"
291 ".inst 0x8090b4a3 // fmopa za3.s, p5/M, p5/M, z5.s, z16.s\n"
292 "fmul z5.s, z15.s, z6.s[1]\n"
293 "ld1w { z17.s }, p2/Z, [x21, x23, LSL #2]\n"
294 ".inst 0x809fb560 // fmopa za0.s, p5/M, p5/M, z11.s, z31.s\n"
295 "ld1w { z16.s }, p1/Z, [x21, x22, LSL #2]\n"
296 "add x21, x21, %x[matrix_stride], LSL #2\n"
297 ".inst 0x809eb561 // fmopa za1.s, p5/M, p5/M, z11.s, z30.s\n"
298 "ld1w { z31.s }, p4/Z, [x21, x25, LSL #2]\n"
299 ".inst 0x809db562 // fmopa za2.s, p5/M, p5/M, z11.s, z29.s\n"
300 "ld1w { z30.s }, p3/Z, [x21, x24, LSL #2]\n"
301 ".inst 0x809cb563 // fmopa za3.s, p5/M, p5/M, z11.s, z28.s\n"
302 "fmul z11.s, z15.s, z6.s[2]\n"
303 "ld1w { z29.s }, p2/Z, [x21, x23, LSL #2]\n"
304 ".inst 0x809bb4a0 // fmopa za0.s, p5/M, p5/M, z5.s, z27.s\n"
305 "ld1w { z28.s }, p1/Z, [x21, x22, LSL #2]\n"
306 "add x21, x21, %x[matrix_stride], LSL #2\n"
307 ".inst 0x809ab4a1 // fmopa za1.s, p5/M, p5/M, z5.s, z26.s\n"
308 "ld1w { z27.s }, p4/Z, [x21, x25, LSL #2]\n"
309 ".inst 0x8099b4a2 // fmopa za2.s, p5/M, p5/M, z5.s, z25.s\n"
310 "ld1w { z26.s }, p3/Z, [x21, x24, LSL #2]\n"
311 ".inst 0x8098b4a3 // fmopa za3.s, p5/M, p5/M, z5.s, z24.s\n"
312 "fmul z5.s, z15.s, z6.s[3]\n"
313 "ld1w { z25.s }, p2/Z, [x21, x23, LSL #2]\n"
314 ".inst 0x8097b560 // fmopa za0.s, p5/M, p5/M, z11.s, z23.s\n"
315 "ld1w { z24.s }, p1/Z, [x21, x22, LSL #2]\n"
316 "add x21, x21, %x[matrix_stride], LSL #2\n"
317 ".inst 0x8096b561 // fmopa za1.s, p5/M, p5/M, z11.s, z22.s\n"
318 "ld1w { z23.s }, p4/Z, [x21, x25, LSL #2]\n"
319 ".inst 0x8095b562 // fmopa za2.s, p5/M, p5/M, z11.s, z21.s\n"
320 "ld1w { z22.s }, p3/Z, [x21, x24, LSL #2]\n"
321 ".inst 0x8094b563 // fmopa za3.s, p5/M, p5/M, z11.s, z20.s\n"
322 "fmul z11.s, z15.s, z7.s[0]\n"
323 "ld1w { z21.s }, p2/Z, [x21, x23, LSL #2]\n"
324 ".inst 0x8093b4a0 // fmopa za0.s, p5/M, p5/M, z5.s, z19.s\n"
325 "ld1w { z20.s }, p1/Z, [x21, x22, LSL #2]\n"
326 "add x21, x21, %x[matrix_stride], LSL #2\n"
327 ".inst 0x8092b4a1 // fmopa za1.s, p5/M, p5/M, z5.s, z18.s\n"
328 "ld1w { z19.s }, p4/Z, [x21, x25, LSL #2]\n"
329 ".inst 0x8091b4a2 // fmopa za2.s, p5/M, p5/M, z5.s, z17.s\n"
330 "ld1w { z18.s }, p3/Z, [x21, x24, LSL #2]\n"
331 ".inst 0x8090b4a3 // fmopa za3.s, p5/M, p5/M, z5.s, z16.s\n"
332 "fmul z5.s, z15.s, z7.s[1]\n"
333 "ld1w { z17.s }, p2/Z, [x21, x23, LSL #2]\n"
334 ".inst 0x809fb560 // fmopa za0.s, p5/M, p5/M, z11.s, z31.s\n"
335 "ld1w { z16.s }, p1/Z, [x21, x22, LSL #2]\n"
336 "add x21, x21, %x[matrix_stride], LSL #2\n"
337 ".inst 0x809eb561 // fmopa za1.s, p5/M, p5/M, z11.s, z30.s\n"
338 "ld1w { z31.s }, p4/Z, [x21, x25, LSL #2]\n"
339 ".inst 0x809db562 // fmopa za2.s, p5/M, p5/M, z11.s, z29.s\n"
340 "ld1w { z30.s }, p3/Z, [x21, x24, LSL #2]\n"
341 ".inst 0x809cb563 // fmopa za3.s, p5/M, p5/M, z11.s, z28.s\n"
342 "fmul z11.s, z4.s, z6.s[0]\n"
343 "ld1w { z29.s }, p2/Z, [x21, x23, LSL #2]\n"
344 ".inst 0x809bb4a0 // fmopa za0.s, p5/M, p5/M, z5.s, z27.s\n"
345 "ld1w { z28.s }, p1/Z, [x21, x22, LSL #2]\n"
346 "add x21, x21, %x[matrix_stride], LSL #2\n"
347 ".inst 0x809ab4a1 // fmopa za1.s, p5/M, p5/M, z5.s, z26.s\n"
348 "ld1w { z27.s }, p4/Z, [x21, x25, LSL #2]\n"
349 ".inst 0x8099b4a2 // fmopa za2.s, p5/M, p5/M, z5.s, z25.s\n"
350 "ld1w { z26.s }, p3/Z, [x21, x24, LSL #2]\n"
351 ".inst 0x8098b4a3 // fmopa za3.s, p5/M, p5/M, z5.s, z24.s\n"
352 "fmul z5.s, z4.s, z6.s[1]\n"
353 "ld1w { z25.s }, p2/Z, [x21, x23, LSL #2]\n"
354 ".inst 0x8097b560 // fmopa za0.s, p5/M, p5/M, z11.s, z23.s\n"
355 "ld1w { z24.s }, p1/Z, [x21, x22, LSL #2]\n"
356 "add x21, x21, %x[matrix_stride], LSL #2\n"
357 ".inst 0x8096b561 // fmopa za1.s, p5/M, p5/M, z11.s, z22.s\n"
358 "ld1w { z23.s }, p4/Z, [x21, x25, LSL #2]\n"
359 ".inst 0x8095b562 // fmopa za2.s, p5/M, p5/M, z11.s, z21.s\n"
360 "ld1w { z22.s }, p3/Z, [x21, x24, LSL #2]\n"
361 ".inst 0x8094b563 // fmopa za3.s, p5/M, p5/M, z11.s, z20.s\n"
362 "fmul z11.s, z4.s, z6.s[2]\n"
363 "ld1w { z21.s }, p2/Z, [x21, x23, LSL #2]\n"
364 ".inst 0x8093b4a0 // fmopa za0.s, p5/M, p5/M, z5.s, z19.s\n"
365 "ld1w { z20.s }, p1/Z, [x21, x22, LSL #2]\n"
366 "add x21, x21, %x[matrix_stride], LSL #2\n"
367 ".inst 0x8092b4a1 // fmopa za1.s, p5/M, p5/M, z5.s, z18.s\n"
368 "ld1w { z19.s }, p4/Z, [x21, x25, LSL #2]\n"
369 ".inst 0x8091b4a2 // fmopa za2.s, p5/M, p5/M, z5.s, z17.s\n"
370 "ld1w { z18.s }, p3/Z, [x21, x24, LSL #2]\n"
371 ".inst 0x8090b4a3 // fmopa za3.s, p5/M, p5/M, z5.s, z16.s\n"
372 "fmul z5.s, z4.s, z6.s[3]\n"
373 "ld1w { z17.s }, p2/Z, [x21, x23, LSL #2]\n"
374 ".inst 0x809fb560 // fmopa za0.s, p5/M, p5/M, z11.s, z31.s\n"
375 "ld1w { z16.s }, p1/Z, [x21, x22, LSL #2]\n"
376 "add x21, x21, %x[matrix_stride], LSL #2\n"
377 ".inst 0x809eb561 // fmopa za1.s, p5/M, p5/M, z11.s, z30.s\n"
378 "ld1w { z31.s }, p4/Z, [x21, x25, LSL #2]\n"
379 ".inst 0x809db562 // fmopa za2.s, p5/M, p5/M, z11.s, z29.s\n"
380 "ld1w { z30.s }, p3/Z, [x21, x24, LSL #2]\n"
381 ".inst 0x809cb563 // fmopa za3.s, p5/M, p5/M, z11.s, z28.s\n"
382 "fmul z11.s, z4.s, z7.s[0]\n"
383 "ld1w { z29.s }, p2/Z, [x21, x23, LSL #2]\n"
384 ".inst 0x809bb4a0 // fmopa za0.s, p5/M, p5/M, z5.s, z27.s\n"
385 "ld1w { z28.s }, p1/Z, [x21, x22, LSL #2]\n"
386 "add x21, x21, %x[matrix_stride], LSL #2\n"
387 ".inst 0x809ab4a1 // fmopa za1.s, p5/M, p5/M, z5.s, z26.s\n"
388 "ld1w { z27.s }, p4/Z, [x21, x25, LSL #2]\n"
389 ".inst 0x8099b4a2 // fmopa za2.s, p5/M, p5/M, z5.s, z25.s\n"
390 "ld1w { z26.s }, p3/Z, [x21, x24, LSL #2]\n"
391 ".inst 0x8098b4a3 // fmopa za3.s, p5/M, p5/M, z5.s, z24.s\n"
392 "fmul z5.s, z4.s, z7.s[1]\n"
393 "ld1w { z25.s }, p2/Z, [x21, x23, LSL #2]\n"
394 ".inst 0x8097b560 // fmopa za0.s, p5/M, p5/M, z11.s, z23.s\n"
395 "ld1w { z24.s }, p1/Z, [x21, x22, LSL #2]\n"
396 "add x21, x21, %x[matrix_stride], LSL #2\n"
397 ".inst 0x8096b561 // fmopa za1.s, p5/M, p5/M, z11.s, z22.s\n"
398 "ld1w { z23.s }, p4/Z, [x21, x25, LSL #2]\n"
399 ".inst 0x8095b562 // fmopa za2.s, p5/M, p5/M, z11.s, z21.s\n"
400 "ld1w { z22.s }, p3/Z, [x21, x24, LSL #2]\n"
401 ".inst 0x8094b563 // fmopa za3.s, p5/M, p5/M, z11.s, z20.s\n"
402 "fmul z11.s, z3.s, z6.s[0]\n"
403 "ld1w { z21.s }, p2/Z, [x21, x23, LSL #2]\n"
404 ".inst 0x8093b4a0 // fmopa za0.s, p5/M, p5/M, z5.s, z19.s\n"
405 "ld1w { z20.s }, p1/Z, [x21, x22, LSL #2]\n"
406 "add x21, x21, %x[matrix_stride], LSL #2\n"
407 ".inst 0x8092b4a1 // fmopa za1.s, p5/M, p5/M, z5.s, z18.s\n"
408 "ld1w { z19.s }, p4/Z, [x21, x25, LSL #2]\n"
409 ".inst 0x8091b4a2 // fmopa za2.s, p5/M, p5/M, z5.s, z17.s\n"
410 "ld1w { z18.s }, p3/Z, [x21, x24, LSL #2]\n"
411 ".inst 0x8090b4a3 // fmopa za3.s, p5/M, p5/M, z5.s, z16.s\n"
412 "fmul z5.s, z3.s, z6.s[1]\n"
413 "ld1w { z17.s }, p2/Z, [x21, x23, LSL #2]\n"
414 ".inst 0x809fb560 // fmopa za0.s, p5/M, p5/M, z11.s, z31.s\n"
415 "ld1w { z16.s }, p1/Z, [x21, x22, LSL #2]\n"
416 "add x21, x21, %x[matrix_stride], LSL #2\n"
417 ".inst 0x809eb561 // fmopa za1.s, p5/M, p5/M, z11.s, z30.s\n"
418 "ld1w { z31.s }, p4/Z, [x21, x25, LSL #2]\n"
419 ".inst 0x809db562 // fmopa za2.s, p5/M, p5/M, z11.s, z29.s\n"
420 "ld1w { z30.s }, p3/Z, [x21, x24, LSL #2]\n"
421 ".inst 0x809cb563 // fmopa za3.s, p5/M, p5/M, z11.s, z28.s\n"
422 "fmul z11.s, z3.s, z6.s[2]\n"
423 "ld1w { z29.s }, p2/Z, [x21, x23, LSL #2]\n"
424 ".inst 0x809bb4a0 // fmopa za0.s, p5/M, p5/M, z5.s, z27.s\n"
425 "ld1w { z28.s }, p1/Z, [x21, x22, LSL #2]\n"
426 "add x21, x21, %x[matrix_stride], LSL #2\n"
427 ".inst 0x809ab4a1 // fmopa za1.s, p5/M, p5/M, z5.s, z26.s\n"
428 "ld1w { z27.s }, p4/Z, [x21, x25, LSL #2]\n"
429 ".inst 0x8099b4a2 // fmopa za2.s, p5/M, p5/M, z5.s, z25.s\n"
430 "ld1w { z26.s }, p3/Z, [x21, x24, LSL #2]\n"
431 ".inst 0x8098b4a3 // fmopa za3.s, p5/M, p5/M, z5.s, z24.s\n"
432 "fmul z5.s, z3.s, z6.s[3]\n"
433 "ld1w { z25.s }, p2/Z, [x21, x23, LSL #2]\n"
434 ".inst 0x8097b560 // fmopa za0.s, p5/M, p5/M, z11.s, z23.s\n"
435 "ld1w { z24.s }, p1/Z, [x21, x22, LSL #2]\n"
436 "add x21, x21, %x[matrix_stride], LSL #2\n"
437 ".inst 0x8096b561 // fmopa za1.s, p5/M, p5/M, z11.s, z22.s\n"
438 "ld1w { z23.s }, p4/Z, [x21, x25, LSL #2]\n"
439 ".inst 0x8095b562 // fmopa za2.s, p5/M, p5/M, z11.s, z21.s\n"
440 "ld1w { z22.s }, p3/Z, [x21, x24, LSL #2]\n"
441 ".inst 0x8094b563 // fmopa za3.s, p5/M, p5/M, z11.s, z20.s\n"
442 "fmul z11.s, z3.s, z7.s[0]\n"
443 "ld1w { z21.s }, p2/Z, [x21, x23, LSL #2]\n"
444 ".inst 0x8093b4a0 // fmopa za0.s, p5/M, p5/M, z5.s, z19.s\n"
445 "ld1w { z20.s }, p1/Z, [x21, x22, LSL #2]\n"
446 "add x21, x21, %x[matrix_stride], LSL #2\n"
447 ".inst 0x8092b4a1 // fmopa za1.s, p5/M, p5/M, z5.s, z18.s\n"
448 "ld1w { z19.s }, p4/Z, [x21, x25, LSL #2]\n"
449 ".inst 0x8091b4a2 // fmopa za2.s, p5/M, p5/M, z5.s, z17.s\n"
450 "ld1w { z18.s }, p3/Z, [x21, x24, LSL #2]\n"
451 ".inst 0x8090b4a3 // fmopa za3.s, p5/M, p5/M, z5.s, z16.s\n"
452 "fmul z5.s, z3.s, z7.s[1]\n"
453 "ld1w { z17.s }, p2/Z, [x21, x23, LSL #2]\n"
454 ".inst 0x809fb560 // fmopa za0.s, p5/M, p5/M, z11.s, z31.s\n"
455 "ld1w { z16.s }, p1/Z, [x21, x22, LSL #2]\n"
456 "add x21, x21, %x[matrix_stride], LSL #2\n"
457 ".inst 0x809eb561 // fmopa za1.s, p5/M, p5/M, z11.s, z30.s\n"
458 "ld1w { z31.s }, p4/Z, [x21, x25, LSL #2]\n"
459 ".inst 0x809db562 // fmopa za2.s, p5/M, p5/M, z11.s, z29.s\n"
460 "ld1w { z30.s }, p3/Z, [x21, x24, LSL #2]\n"
461 ".inst 0x809cb563 // fmopa za3.s, p5/M, p5/M, z11.s, z28.s\n"
462 "fmul z11.s, z2.s, z6.s[0]\n"
463 "ld1w { z29.s }, p2/Z, [x21, x23, LSL #2]\n"
464 ".inst 0x809bb4a0 // fmopa za0.s, p5/M, p5/M, z5.s, z27.s\n"
465 "ld1w { z28.s }, p1/Z, [x21, x22, LSL #2]\n"
466 "add x21, x21, %x[matrix_stride], LSL #2\n"
467 ".inst 0x809ab4a1 // fmopa za1.s, p5/M, p5/M, z5.s, z26.s\n"
468 "ld1w { z27.s }, p4/Z, [x21, x25, LSL #2]\n"
469 ".inst 0x8099b4a2 // fmopa za2.s, p5/M, p5/M, z5.s, z25.s\n"
470 "ld1w { z26.s }, p3/Z, [x21, x24, LSL #2]\n"
471 ".inst 0x8098b4a3 // fmopa za3.s, p5/M, p5/M, z5.s, z24.s\n"
472 "fmul z5.s, z2.s, z6.s[1]\n"
473 "ld1w { z25.s }, p2/Z, [x21, x23, LSL #2]\n"
474 ".inst 0x8097b560 // fmopa za0.s, p5/M, p5/M, z11.s, z23.s\n"
475 "ld1w { z24.s }, p1/Z, [x21, x22, LSL #2]\n"
476 "add x21, x21, %x[matrix_stride], LSL #2\n"
477 ".inst 0x8096b561 // fmopa za1.s, p5/M, p5/M, z11.s, z22.s\n"
478 "ld1w { z23.s }, p4/Z, [x21, x25, LSL #2]\n"
479 ".inst 0x8095b562 // fmopa za2.s, p5/M, p5/M, z11.s, z21.s\n"
480 "ld1w { z22.s }, p3/Z, [x21, x24, LSL #2]\n"
481 ".inst 0x8094b563 // fmopa za3.s, p5/M, p5/M, z11.s, z20.s\n"
482 "fmul z11.s, z2.s, z6.s[2]\n"
483 "ld1w { z21.s }, p2/Z, [x21, x23, LSL #2]\n"
484 ".inst 0x8093b4a0 // fmopa za0.s, p5/M, p5/M, z5.s, z19.s\n"
485 "ld1w { z20.s }, p1/Z, [x21, x22, LSL #2]\n"
486 "add x21, x21, %x[matrix_stride], LSL #2\n"
487 ".inst 0x8092b4a1 // fmopa za1.s, p5/M, p5/M, z5.s, z18.s\n"
488 "ld1w { z19.s }, p4/Z, [x21, x25, LSL #2]\n"
489 ".inst 0x8091b4a2 // fmopa za2.s, p5/M, p5/M, z5.s, z17.s\n"
490 "ld1w { z18.s }, p3/Z, [x21, x24, LSL #2]\n"
491 ".inst 0x8090b4a3 // fmopa za3.s, p5/M, p5/M, z5.s, z16.s\n"
492 "fmul z5.s, z2.s, z6.s[3]\n"
493 "ld1w { z17.s }, p2/Z, [x21, x23, LSL #2]\n"
494 ".inst 0x809fb560 // fmopa za0.s, p5/M, p5/M, z11.s, z31.s\n"
495 "ld1w { z16.s }, p1/Z, [x21, x22, LSL #2]\n"
496 ".inst 0x809eb561 // fmopa za1.s, p5/M, p5/M, z11.s, z30.s\n"
497 ".inst 0x809db562 // fmopa za2.s, p5/M, p5/M, z11.s, z29.s\n"
498 ".inst 0x809cb563 // fmopa za3.s, p5/M, p5/M, z11.s, z28.s\n"
499 "fmul z11.s, z2.s, z7.s[0]\n"
500 ".inst 0x809bb4a0 // fmopa za0.s, p5/M, p5/M, z5.s, z27.s\n"
501 ".inst 0x809ab4a1 // fmopa za1.s, p5/M, p5/M, z5.s, z26.s\n"
502 ".inst 0x8099b4a2 // fmopa za2.s, p5/M, p5/M, z5.s, z25.s\n"
503 ".inst 0x8098b4a3 // fmopa za3.s, p5/M, p5/M, z5.s, z24.s\n"
504 "fmul z5.s, z2.s, z7.s[1]\n"
505 ".inst 0x8097b560 // fmopa za0.s, p5/M, p5/M, z11.s, z23.s\n"
506 ".inst 0x8096b561 // fmopa za1.s, p5/M, p5/M, z11.s, z22.s\n"
507 ".inst 0x8095b562 // fmopa za2.s, p5/M, p5/M, z11.s, z21.s\n"
508 ".inst 0x8094b563 // fmopa za3.s, p5/M, p5/M, z11.s, z20.s\n"
509 "fmul z11.s, z9.s, z6.s[0]\n"
510 ".inst 0x8093b4a0 // fmopa za0.s, p5/M, p5/M, z5.s, z19.s\n"
511 ".inst 0x8092b4a1 // fmopa za1.s, p5/M, p5/M, z5.s, z18.s\n"
512 ".inst 0x8091b4a2 // fmopa za2.s, p5/M, p5/M, z5.s, z17.s\n"
513 ".inst 0x8090b4a3 // fmopa za3.s, p5/M, p5/M, z5.s, z16.s\n"
514 "fmul z5.s, z9.s, z6.s[1]\n"
515 ".inst 0xc082741f // mova z31.s, p5/M, za0h.s[XZR]\n"
516 ".inst 0xc082541c // mova z28.s, p5/M, za0h.s[x14]\n"
517 "fmin z31.s, p5/M, z31.s, z10.s\n"
518 ".inst 0xc082743b // mova z27.s, p5/M, za0h.s[XZR, #1]\n"
519 "fmin z28.s, p5/M, z28.s, z10.s\n"
520 ".inst 0xc0825438 // mova z24.s, p5/M, za0h.s[x14, #1]\n"
521 "fmin z27.s, p5/M, z27.s, z10.s\n"
522 "mov x13, #0x4\n"
523 "mov x12, #0x8\n"
524 ".inst 0xc082341e // mova z30.s, p5/M, za0h.s[x13]\n"
525 "fmin z24.s, p5/M, z24.s, z10.s\n"
526 ".inst 0xc082141d // mova z29.s, p5/M, za0h.s[x12]\n"
527 "fmax z31.s, p5/M, z31.s, z12.s\n"
528 "fmin z30.s, p5/M, z30.s, z10.s\n"
529 ".inst 0xc082343a // mova z26.s, p5/M, za0h.s[x13, #1]\n"
530 "fmin z29.s, p5/M, z29.s, z10.s\n"
531 "fmax z28.s, p5/M, z28.s, z12.s\n"
532 ".inst 0xc0821439 // mova z25.s, p5/M, za0h.s[x12, #1]\n"
533 "fmax z27.s, p5/M, z27.s, z12.s\n"
534 "fmin z26.s, p5/M, z26.s, z10.s\n"
535 ".inst 0xc0827457 // mova z23.s, p5/M, za0h.s[XZR, #2]\n"
536 "fmin z25.s, p5/M, z25.s, z10.s\n"
537 "fmax z24.s, p5/M, z24.s, z12.s\n"
538 ".inst 0xc0823456 // mova z22.s, p5/M, za0h.s[x13, #2]\n"
539 "fmax z30.s, p5/M, z30.s, z12.s\n"
540 "fmin z23.s, p5/M, z23.s, z10.s\n"
541 ".inst 0xc0821455 // mova z21.s, p5/M, za0h.s[x12, #2]\n"
542 "fmax z29.s, p5/M, z29.s, z12.s\n"
543 "fmin z22.s, p5/M, z22.s, z10.s\n"
544 ".inst 0xc0825454 // mova z20.s, p5/M, za0h.s[x14, #2]\n"
545 "fmax z26.s, p5/M, z26.s, z12.s\n"
546 "fmin z21.s, p5/M, z21.s, z10.s\n"
547 ".inst 0xc0827473 // mova z19.s, p5/M, za0h.s[XZR, #3]\n"
548 "fmax z25.s, p5/M, z25.s, z12.s\n"
549 "fmin z20.s, p5/M, z20.s, z10.s\n"
550 ".inst 0xc0823472 // mova z18.s, p5/M, za0h.s[x13, #3]\n"
551 "fmax z23.s, p5/M, z23.s, z12.s\n"
552 "fmin z19.s, p5/M, z19.s, z10.s\n"
553 ".inst 0xc0821471 // mova z17.s, p5/M, za0h.s[x12, #3]\n"
554 "fmax z22.s, p5/M, z22.s, z12.s\n"
555 "fmin z18.s, p5/M, z18.s, z10.s\n"
556 ".inst 0xc0825470 // mova z16.s, p5/M, za0h.s[x14, #3]\n"
557 "fmax z21.s, p5/M, z21.s, z12.s\n"
558 "fmin z17.s, p5/M, z17.s, z10.s\n"
559 "fmax z20.s, p5/M, z20.s, z12.s\n"
560 "fmin z16.s, p5/M, z16.s, z10.s\n"
561 "st1w { z31.s }, p0, [%x[output], x25, LSL #2]\n"
562 "fmax z19.s, p5/M, z19.s, z12.s\n"
563 "st1w { z30.s }, p0, [x20, x25, LSL #2]\n"
564 "add x20, x20, %x[output_col_stride], LSL #2\n"
565 "fmax z18.s, p5/M, z18.s, z12.s\n"
566 "st1w { z29.s }, p0, [x20, x25, LSL #2]\n"
567 "add x20, x20, %x[output_col_stride], LSL #2\n"
568 "fmax z17.s, p5/M, z17.s, z12.s\n"
569 "st1w { z28.s }, p0, [x20, x25, LSL #2]\n"
570 "add x20, x19, %x[output_col_stride], LSL #2\n"
571 "fmax z16.s, p5/M, z16.s, z12.s\n"
572 "st1w { z27.s }, p0, [x19, x25, LSL #2]\n"
573 "add x19, x19, %x[output_row_stride], LSL #2\n"
574 "st1w { z26.s }, p0, [x20, x25, LSL #2]\n"
575 "add x20, x20, %x[output_col_stride], LSL #2\n"
576 "st1w { z25.s }, p0, [x20, x25, LSL #2]\n"
577 "add x20, x20, %x[output_col_stride], LSL #2\n"
578 "st1w { z24.s }, p0, [x20, x25, LSL #2]\n"
579 "add x20, x19, %x[output_col_stride], LSL #2\n"
580 "st1w { z23.s }, p0, [x19, x25, LSL #2]\n"
581 "add x19, x19, %x[output_row_stride], LSL #2\n"
582 "st1w { z22.s }, p0, [x20, x25, LSL #2]\n"
583 "add x20, x20, %x[output_col_stride], LSL #2\n"
584 "st1w { z21.s }, p0, [x20, x25, LSL #2]\n"
585 "add x20, x20, %x[output_col_stride], LSL #2\n"
586 "st1w { z20.s }, p0, [x20, x25, LSL #2]\n"
587 "add x20, x19, %x[output_col_stride], LSL #2\n"
588 "st1w { z19.s }, p0, [x19, x25, LSL #2]\n"
589 "st1w { z18.s }, p0, [x20, x25, LSL #2]\n"
590 "add x20, x20, %x[output_col_stride], LSL #2\n"
591 "st1w { z17.s }, p0, [x20, x25, LSL #2]\n"
592 "add x20, x20, %x[output_col_stride], LSL #2\n"
593 "st1w { z16.s }, p0, [x20, x25, LSL #2]\n"
594 "whilelt p0.s, x24, %x[n_channels]\n"
595 "b.none 3f\n"
596 ".inst 0xc082749f // mova z31.s, p5/M, za1h.s[XZR]\n"
597 ".inst 0xc082349e // mova z30.s, p5/M, za1h.s[x13]\n"
598 "fmin z31.s, p5/M, z31.s, z10.s\n"
599 ".inst 0xc082149d // mova z29.s, p5/M, za1h.s[x12]\n"
600 "fmin z30.s, p5/M, z30.s, z10.s\n"
601 ".inst 0xc082549c // mova z28.s, p5/M, za1h.s[x14]\n"
602 "fmin z29.s, p5/M, z29.s, z10.s\n"
603 ".inst 0xc08274bb // mova z27.s, p5/M, za1h.s[XZR, #1]\n"
604 "fmin z28.s, p5/M, z28.s, z10.s\n"
605 ".inst 0xc08234ba // mova z26.s, p5/M, za1h.s[x13, #1]\n"
606 "fmax z31.s, p5/M, z31.s, z12.s\n"
607 "fmin z27.s, p5/M, z27.s, z10.s\n"
608 ".inst 0xc08214b9 // mova z25.s, p5/M, za1h.s[x12, #1]\n"
609 "fmax z30.s, p5/M, z30.s, z12.s\n"
610 "fmin z26.s, p5/M, z26.s, z10.s\n"
611 ".inst 0xc08254b8 // mova z24.s, p5/M, za1h.s[x14, #1]\n"
612 "fmax z29.s, p5/M, z29.s, z12.s\n"
613 "fmin z25.s, p5/M, z25.s, z10.s\n"
614 ".inst 0xc08274d7 // mova z23.s, p5/M, za1h.s[XZR, #2]\n"
615 "fmax z28.s, p5/M, z28.s, z12.s\n"
616 "fmin z24.s, p5/M, z24.s, z10.s\n"
617 ".inst 0xc08234d6 // mova z22.s, p5/M, za1h.s[x13, #2]\n"
618 "fmax z27.s, p5/M, z27.s, z12.s\n"
619 "fmin z23.s, p5/M, z23.s, z10.s\n"
620 ".inst 0xc08214d5 // mova z21.s, p5/M, za1h.s[x12, #2]\n"
621 "fmax z26.s, p5/M, z26.s, z12.s\n"
622 "fmin z22.s, p5/M, z22.s, z10.s\n"
623 "add x20, %x[output], %x[output_col_stride], LSL #2\n"
624 ".inst 0xc08254d4 // mova z20.s, p5/M, za1h.s[x14, #2]\n"
625 "fmax z25.s, p5/M, z25.s, z12.s\n"
626 "fmin z21.s, p5/M, z21.s, z10.s\n"
627 "add x19, %x[output], %x[output_row_stride], LSL #2\n"
628 ".inst 0xc08274f3 // mova z19.s, p5/M, za1h.s[XZR, #3]\n"
629 "fmax z24.s, p5/M, z24.s, z12.s\n"
630 "fmin z20.s, p5/M, z20.s, z10.s\n"
631 ".inst 0xc08234f2 // mova z18.s, p5/M, za1h.s[x13, #3]\n"
632 "fmax z23.s, p5/M, z23.s, z12.s\n"
633 "fmin z19.s, p5/M, z19.s, z10.s\n"
634 ".inst 0xc08214f1 // mova z17.s, p5/M, za1h.s[x12, #3]\n"
635 "fmax z22.s, p5/M, z22.s, z12.s\n"
636 "fmin z18.s, p5/M, z18.s, z10.s\n"
637 ".inst 0xc08254f0 // mova z16.s, p5/M, za1h.s[x14, #3]\n"
638 "fmax z21.s, p5/M, z21.s, z12.s\n"
639 "fmin z17.s, p5/M, z17.s, z10.s\n"
640 "fmax z20.s, p5/M, z20.s, z12.s\n"
641 "fmin z16.s, p5/M, z16.s, z10.s\n"
642 "st1w { z31.s }, p0, [%x[output], x24, LSL #2]\n"
643 "fmax z19.s, p5/M, z19.s, z12.s\n"
644 "st1w { z30.s }, p0, [x20, x24, LSL #2]\n"
645 "add x20, x20, %x[output_col_stride], LSL #2\n"
646 "fmax z18.s, p5/M, z18.s, z12.s\n"
647 "st1w { z29.s }, p0, [x20, x24, LSL #2]\n"
648 "add x20, x20, %x[output_col_stride], LSL #2\n"
649 "fmax z17.s, p5/M, z17.s, z12.s\n"
650 "st1w { z28.s }, p0, [x20, x24, LSL #2]\n"
651 "add x20, x19, %x[output_col_stride], LSL #2\n"
652 "fmax z16.s, p5/M, z16.s, z12.s\n"
653 "st1w { z27.s }, p0, [x19, x24, LSL #2]\n"
654 "add x19, x19, %x[output_row_stride], LSL #2\n"
655 "st1w { z26.s }, p0, [x20, x24, LSL #2]\n"
656 "add x20, x20, %x[output_col_stride], LSL #2\n"
657 "st1w { z25.s }, p0, [x20, x24, LSL #2]\n"
658 "add x20, x20, %x[output_col_stride], LSL #2\n"
659 "st1w { z24.s }, p0, [x20, x24, LSL #2]\n"
660 "add x20, x19, %x[output_col_stride], LSL #2\n"
661 "st1w { z23.s }, p0, [x19, x24, LSL #2]\n"
662 "add x19, x19, %x[output_row_stride], LSL #2\n"
663 "st1w { z22.s }, p0, [x20, x24, LSL #2]\n"
664 "add x20, x20, %x[output_col_stride], LSL #2\n"
665 "st1w { z21.s }, p0, [x20, x24, LSL #2]\n"
666 "add x20, x20, %x[output_col_stride], LSL #2\n"
667 "st1w { z20.s }, p0, [x20, x24, LSL #2]\n"
668 "add x20, x19, %x[output_col_stride], LSL #2\n"
669 "st1w { z19.s }, p0, [x19, x24, LSL #2]\n"
670 "st1w { z18.s }, p0, [x20, x24, LSL #2]\n"
671 "add x20, x20, %x[output_col_stride], LSL #2\n"
672 "st1w { z17.s }, p0, [x20, x24, LSL #2]\n"
673 "add x20, x20, %x[output_col_stride], LSL #2\n"
674 "st1w { z16.s }, p0, [x20, x24, LSL #2]\n"
675 "whilelt p0.s, x23, %x[n_channels]\n"
676 "b.none 3f\n"
677 ".inst 0xc082751f // mova z31.s, p5/M, za2h.s[XZR]\n"
678 ".inst 0xc082351e // mova z30.s, p5/M, za2h.s[x13]\n"
679 "fmin z31.s, p5/M, z31.s, z10.s\n"
680 ".inst 0xc082151d // mova z29.s, p5/M, za2h.s[x12]\n"
681 "fmin z30.s, p5/M, z30.s, z10.s\n"
682 ".inst 0xc082551c // mova z28.s, p5/M, za2h.s[x14]\n"
683 "fmin z29.s, p5/M, z29.s, z10.s\n"
684 ".inst 0xc082753b // mova z27.s, p5/M, za2h.s[XZR, #1]\n"
685 "fmin z28.s, p5/M, z28.s, z10.s\n"
686 ".inst 0xc082353a // mova z26.s, p5/M, za2h.s[x13, #1]\n"
687 "fmax z31.s, p5/M, z31.s, z12.s\n"
688 "fmin z27.s, p5/M, z27.s, z10.s\n"
689 ".inst 0xc0821539 // mova z25.s, p5/M, za2h.s[x12, #1]\n"
690 "fmax z30.s, p5/M, z30.s, z12.s\n"
691 "fmin z26.s, p5/M, z26.s, z10.s\n"
692 ".inst 0xc0825538 // mova z24.s, p5/M, za2h.s[x14, #1]\n"
693 "fmax z29.s, p5/M, z29.s, z12.s\n"
694 "fmin z25.s, p5/M, z25.s, z10.s\n"
695 ".inst 0xc0827557 // mova z23.s, p5/M, za2h.s[XZR, #2]\n"
696 "fmax z28.s, p5/M, z28.s, z12.s\n"
697 "fmin z24.s, p5/M, z24.s, z10.s\n"
698 ".inst 0xc0823556 // mova z22.s, p5/M, za2h.s[x13, #2]\n"
699 "fmax z27.s, p5/M, z27.s, z12.s\n"
700 "fmin z23.s, p5/M, z23.s, z10.s\n"
701 ".inst 0xc0821555 // mova z21.s, p5/M, za2h.s[x12, #2]\n"
702 "fmax z26.s, p5/M, z26.s, z12.s\n"
703 "fmin z22.s, p5/M, z22.s, z10.s\n"
704 "add x20, %x[output], %x[output_col_stride], LSL #2\n"
705 ".inst 0xc0825554 // mova z20.s, p5/M, za2h.s[x14, #2]\n"
706 "fmax z25.s, p5/M, z25.s, z12.s\n"
707 "fmin z21.s, p5/M, z21.s, z10.s\n"
708 "add x19, %x[output], %x[output_row_stride], LSL #2\n"
709 ".inst 0xc0827573 // mova z19.s, p5/M, za2h.s[XZR, #3]\n"
710 "fmax z24.s, p5/M, z24.s, z12.s\n"
711 "fmin z20.s, p5/M, z20.s, z10.s\n"
712 ".inst 0xc0823572 // mova z18.s, p5/M, za2h.s[x13, #3]\n"
713 "fmax z23.s, p5/M, z23.s, z12.s\n"
714 "fmin z19.s, p5/M, z19.s, z10.s\n"
715 ".inst 0xc0821571 // mova z17.s, p5/M, za2h.s[x12, #3]\n"
716 "fmax z22.s, p5/M, z22.s, z12.s\n"
717 "fmin z18.s, p5/M, z18.s, z10.s\n"
718 ".inst 0xc0825570 // mova z16.s, p5/M, za2h.s[x14, #3]\n"
719 "fmax z21.s, p5/M, z21.s, z12.s\n"
720 "fmin z17.s, p5/M, z17.s, z10.s\n"
721 "fmax z20.s, p5/M, z20.s, z12.s\n"
722 "fmin z16.s, p5/M, z16.s, z10.s\n"
723 "st1w { z31.s }, p0, [%x[output], x23, LSL #2]\n"
724 "fmax z19.s, p5/M, z19.s, z12.s\n"
725 "st1w { z30.s }, p0, [x20, x23, LSL #2]\n"
726 "add x20, x20, %x[output_col_stride], LSL #2\n"
727 "fmax z18.s, p5/M, z18.s, z12.s\n"
728 "st1w { z29.s }, p0, [x20, x23, LSL #2]\n"
729 "add x20, x20, %x[output_col_stride], LSL #2\n"
730 "fmax z17.s, p5/M, z17.s, z12.s\n"
731 "st1w { z28.s }, p0, [x20, x23, LSL #2]\n"
732 "add x20, x19, %x[output_col_stride], LSL #2\n"
733 "fmax z16.s, p5/M, z16.s, z12.s\n"
734 "st1w { z27.s }, p0, [x19, x23, LSL #2]\n"
735 "add x19, x19, %x[output_row_stride], LSL #2\n"
736 "st1w { z26.s }, p0, [x20, x23, LSL #2]\n"
737 "add x20, x20, %x[output_col_stride], LSL #2\n"
738 "st1w { z25.s }, p0, [x20, x23, LSL #2]\n"
739 "add x20, x20, %x[output_col_stride], LSL #2\n"
740 "st1w { z24.s }, p0, [x20, x23, LSL #2]\n"
741 "add x20, x19, %x[output_col_stride], LSL #2\n"
742 "st1w { z23.s }, p0, [x19, x23, LSL #2]\n"
743 "add x19, x19, %x[output_row_stride], LSL #2\n"
744 "st1w { z22.s }, p0, [x20, x23, LSL #2]\n"
745 "add x20, x20, %x[output_col_stride], LSL #2\n"
746 "st1w { z21.s }, p0, [x20, x23, LSL #2]\n"
747 "add x20, x20, %x[output_col_stride], LSL #2\n"
748 "st1w { z20.s }, p0, [x20, x23, LSL #2]\n"
749 "add x20, x19, %x[output_col_stride], LSL #2\n"
750 "st1w { z19.s }, p0, [x19, x23, LSL #2]\n"
751 "st1w { z18.s }, p0, [x20, x23, LSL #2]\n"
752 "add x20, x20, %x[output_col_stride], LSL #2\n"
753 "st1w { z17.s }, p0, [x20, x23, LSL #2]\n"
754 "add x20, x20, %x[output_col_stride], LSL #2\n"
755 "st1w { z16.s }, p0, [x20, x23, LSL #2]\n"
756 "whilelt p0.s, x22, %x[n_channels]\n"
757 "b.none 3f\n"
758 "fmov z1.s, #1.0\n"
759 ".inst 0xc082759f // mova z31.s, p5/M, za3h.s[XZR]\n"
760 ".inst 0xc082359e // mova z30.s, p5/M, za3h.s[x13]\n"
761 "fmin z31.s, p5/M, z31.s, z10.s\n"
762 ".inst 0xc082159d // mova z29.s, p5/M, za3h.s[x12]\n"
763 "fmin z30.s, p5/M, z30.s, z10.s\n"
764 ".inst 0xc082559c // mova z28.s, p5/M, za3h.s[x14]\n"
765 "fmin z29.s, p5/M, z29.s, z10.s\n"
766 ".inst 0xc08275bb // mova z27.s, p5/M, za3h.s[XZR, #1]\n"
767 "fmin z28.s, p5/M, z28.s, z10.s\n"
768 ".inst 0xc08235ba // mova z26.s, p5/M, za3h.s[x13, #1]\n"
769 "fmax z31.s, p5/M, z31.s, z12.s\n"
770 "fmin z27.s, p5/M, z27.s, z10.s\n"
771 ".inst 0xc08215b9 // mova z25.s, p5/M, za3h.s[x12, #1]\n"
772 "fmax z30.s, p5/M, z30.s, z12.s\n"
773 "fmin z26.s, p5/M, z26.s, z10.s\n"
774 ".inst 0xc08255b8 // mova z24.s, p5/M, za3h.s[x14, #1]\n"
775 "fmax z29.s, p5/M, z29.s, z12.s\n"
776 "fmin z25.s, p5/M, z25.s, z10.s\n"
777 ".inst 0xc08275d7 // mova z23.s, p5/M, za3h.s[XZR, #2]\n"
778 "fmax z28.s, p5/M, z28.s, z12.s\n"
779 "fmin z24.s, p5/M, z24.s, z10.s\n"
780 ".inst 0xc08235d6 // mova z22.s, p5/M, za3h.s[x13, #2]\n"
781 "fmax z27.s, p5/M, z27.s, z12.s\n"
782 "fmin z23.s, p5/M, z23.s, z10.s\n"
783 ".inst 0xc08215d5 // mova z21.s, p5/M, za3h.s[x12, #2]\n"
784 "fmax z26.s, p5/M, z26.s, z12.s\n"
785 "fmin z22.s, p5/M, z22.s, z10.s\n"
786 ".inst 0xc08255d4 // mova z20.s, p5/M, za3h.s[x14, #2]\n"
787 "fmax z25.s, p5/M, z25.s, z12.s\n"
788 "fmin z21.s, p5/M, z21.s, z10.s\n"
789 "add x20, %x[output], %x[output_col_stride], LSL #2\n"
790 ".inst 0xc08275f3 // mova z19.s, p5/M, za3h.s[XZR, #3]\n"
791 "fmax z24.s, p5/M, z24.s, z12.s\n"
792 "fmin z20.s, p5/M, z20.s, z10.s\n"
793 "add x19, %x[output], %x[output_row_stride], LSL #2\n"
794 ".inst 0xc08235f2 // mova z18.s, p5/M, za3h.s[x13, #3]\n"
795 "fmax z23.s, p5/M, z23.s, z12.s\n"
796 "fmin z19.s, p5/M, z19.s, z10.s\n"
797 "incw x25, ALL, MUL #4\n"
798 ".inst 0xc08215f1 // mova z17.s, p5/M, za3h.s[x12, #3]\n"
799 "fmax z22.s, p5/M, z22.s, z12.s\n"
800 "fmin z18.s, p5/M, z18.s, z10.s\n"
801 "incw x24, ALL, MUL #4\n"
802 ".inst 0xc08255f0 // mova z16.s, p5/M, za3h.s[x14, #3]\n"
803 "fmax z21.s, p5/M, z21.s, z12.s\n"
804 "fmin z17.s, p5/M, z17.s, z10.s\n"
805 "incw x23, ALL, MUL #4\n"
806 ".inst 0xc00800ff // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
807 "fmax z20.s, p5/M, z20.s, z12.s\n"
808 "fmin z16.s, p5/M, z16.s, z10.s\n"
809 "add x21, %x[inptr], %x[matrix_stride], LSL #2\n"
810 "fmax z19.s, p5/M, z19.s, z12.s\n"
811 "st1w { z31.s }, p0, [%x[output], x22, LSL #2]\n"
812 "fmax z18.s, p5/M, z18.s, z12.s\n"
813 "st1w { z30.s }, p0, [x20, x22, LSL #2]\n"
814 "add x20, x20, %x[output_col_stride], LSL #2\n"
815 "fmax z17.s, p5/M, z17.s, z12.s\n"
816 "st1w { z29.s }, p0, [x20, x22, LSL #2]\n"
817 "add x20, x20, %x[output_col_stride], LSL #2\n"
818 "fmax z16.s, p5/M, z16.s, z12.s\n"
819 "st1w { z28.s }, p0, [x20, x22, LSL #2]\n"
820 "add x20, x19, %x[output_col_stride], LSL #2\n"
821 "st1w { z27.s }, p0, [x19, x22, LSL #2]\n"
822 "add x19, x19, %x[output_row_stride], LSL #2\n"
823 "st1w { z26.s }, p0, [x20, x22, LSL #2]\n"
824 "add x20, x20, %x[output_col_stride], LSL #2\n"
825 "st1w { z25.s }, p0, [x20, x22, LSL #2]\n"
826 "add x20, x20, %x[output_col_stride], LSL #2\n"
827 "st1w { z24.s }, p0, [x20, x22, LSL #2]\n"
828 "add x20, x19, %x[output_col_stride], LSL #2\n"
829 "st1w { z23.s }, p0, [x19, x22, LSL #2]\n"
830 "add x19, x19, %x[output_row_stride], LSL #2\n"
831 "st1w { z22.s }, p0, [x20, x22, LSL #2]\n"
832 "add x20, x20, %x[output_col_stride], LSL #2\n"
833 "st1w { z21.s }, p0, [x20, x22, LSL #2]\n"
834 "add x20, x20, %x[output_col_stride], LSL #2\n"
835 "st1w { z20.s }, p0, [x20, x22, LSL #2]\n"
836 "add x20, x19, %x[output_col_stride], LSL #2\n"
837 "st1w { z19.s }, p0, [x19, x22, LSL #2]\n"
838 "st1w { z18.s }, p0, [x20, x22, LSL #2]\n"
839 "add x20, x20, %x[output_col_stride], LSL #2\n"
840 "st1w { z17.s }, p0, [x20, x22, LSL #2]\n"
841 "add x20, x20, %x[output_col_stride], LSL #2\n"
842 "st1w { z16.s }, p0, [x20, x22, LSL #2]\n"
843 "incw x22, ALL, MUL #4\n"
844 "whilelt p1.s, x22, %x[n_channels]\n"
845 "ld1w { z28.s }, p1/Z, [%x[inptr], x22, LSL #2]\n"
846 "ld1w { z24.s }, p1/Z, [x21, x22, LSL #2]\n"
847 "whilelt p2.s, x23, %x[n_channels]\n"
848 "whilelt p3.s, x24, %x[n_channels]\n"
849 "ld1w { z30.s }, p3/Z, [%x[inptr], x24, LSL #2]\n"
850 "whilelt p4.s, x25, %x[n_channels]\n"
851 "ld1w { z31.s }, p4/Z, [%x[inptr], x25, LSL #2]\n"
852 "and p0.b, p5/Z, p8.b, p4.b\n"
853 "ld1w { z29.s }, p2/Z, [%x[inptr], x23, LSL #2]\n"
854 "ld1w { z27.s }, p4/Z, [x21, x25, LSL #2]\n"
855 "ld1w { z26.s }, p3/Z, [x21, x24, LSL #2]\n"
856 "ld1w { z25.s }, p2/Z, [x21, x23, LSL #2]\n"
857 "add x21, x21, %x[matrix_stride], LSL #2\n"
858 "ld1w { z23.s }, p4/Z, [x21, x25, LSL #2]\n"
859 "ld1w { z22.s }, p3/Z, [x21, x24, LSL #2]\n"
860 "ld1w { z21.s }, p2/Z, [x21, x23, LSL #2]\n"
861 "ld1w { z20.s }, p1/Z, [x21, x22, LSL #2]\n"
862 "add x21, x21, %x[matrix_stride], LSL #2\n"
863 "ld1w { z19.s }, p4/Z, [x21, x25, LSL #2]\n"
864 "ld1w { z18.s }, p3/Z, [x21, x24, LSL #2]\n"
865 "ld1w { z17.s }, p2/Z, [x21, x23, LSL #2]\n"
866 "ld1w { z16.s }, p1/Z, [x21, x22, LSL #2]\n"
867 "ld1w { z0.s }, p0/Z, [%x[bptr], x25, LSL #2]\n"
868 "and p0.b, p5/Z, p8.b, p3.b\n"
869 ".inst 0x8080b420 // fmopa za0.s, p5/M, p5/M, z1.s, z0.s\n"
870 "ld1w { z0.s }, p0/Z, [%x[bptr], x24, LSL #2]\n"
871 "and p0.b, p5/Z, p8.b, p2.b\n"
872 ".inst 0x8080b421 // fmopa za1.s, p5/M, p5/M, z1.s, z0.s\n"
873 "ld1w { z0.s }, p0/Z, [%x[bptr], x23, LSL #2]\n"
874 "and p0.b, p5/Z, p8.b, p1.b\n"
875 ".inst 0x8080b422 // fmopa za2.s, p5/M, p5/M, z1.s, z0.s\n"
876 "ld1w { z0.s }, p0/Z, [%x[bptr], x22, LSL #2]\n"
877 ".inst 0x8080b423 // fmopa za3.s, p5/M, p5/M, z1.s, z0.s\n"
878 "b.any 2b\n"
879 "3:" // End
880 ".inst 0xd503467f // SMSTOP\n"
881 :
882 : [bptr] "r" (bptr), [inptr] "r" (inptr), [matrix_stride] "r" (matrix_stride), [n_channels] "r" (n_channels), [offsetof_Params_act_max] "I" (offsetof(Params, act_max)), [offsetof_Params_act_min] "I" (offsetof(Params, act_min)), [offsetof_Params_inner_terms] "I" (offsetof(Params, inner_terms)), [offsetof_Params_outer_terms] "I" (offsetof(Params, outer_terms)), [output] "r" (output), [output_col_stride] "r" (output_col_stride), [output_row_stride] "r" (output_row_stride), [params] "r" (¶ms)
883 : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p8", "x12", "x13", "x14", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
884 );
885 }
886
887 } // namespace output_transform
888 } // namespace winograd
889 } // namespace arm_conv
890
891 #endif //defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SME)
892
893