• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2022 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 
25 #if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SME)
26 
27 #include <cstddef>
28 
29 namespace arm_conv {
30 namespace winograd {
31 namespace output_transform {
32 
33 void sme_fp32_mopa_4x4_3x3(
34   const unsigned int n_channels,
35   const float* inptr,
36   const size_t matrix_stride,
37   const float* bptr,
38   float* const output,
39   const size_t output_row_stride,
40   const size_t output_col_stride,
41   const float output_min,
42   const float output_max
43 )
44 {
45   // The below assembler uses the Kronecker product and the "vec trick" to
46   // implement the Winograd output transform (y = AT Y A) using the SME
47   // array.  This code REQUIRES that the vectors are 512b long (or longer, if
48   // we add some predication).
49   //
50   // The "vec trick" uses the identity $vec(AT Y A) = (AT (x) AT) vec(Y)$ to
51   // convert the chain of matrix multiplications into a matrix-vector
52   // product.  We then stack multiple channels of vec(Y) together to allow us
53   // to perform multiple channels of the transformation simultaneously.
54   //
55   // Since the complete matrix (AT (x) AT) is quite big [16 x 36], we compute
56   // it on the fly. To do so, we store two representations of the matrix AT.
57   // The first representation (the outer terms) contains, within each quad,
58   // four coefficients of the matrix AT.
59   const float outer_terms[32] = {
60      1, 1,  1, 1,
61      0, 1, -1, 2,
62      0, 1,  1, 4,
63      0, 1, -1, 8,
64     // The following rows are continuations of the first four rows, and each
65     // contains two columns of padding values which aren't used in the
66     // computation but are there to ensure that the coefficients end up in
67     // the right quads of the vector into which they're read.
68      1, 0,  0, 0,
69     -2, 0,  0, 0,
70      4, 0,  0, 0,
71     -8, 1,  0, 0
72   };
73   // This should be loaded completely into two Z registers.
74   //
75   // We can then use by-element FMLA to construct columns of (AT (x) AT) by
76   // multiplying elements of the outer terms against the following inner
77   // terms (again split into quads, but expected to be loaded replicated such
78   // that each of the six required Z registers contains a repeated quad of
79   // the values).
80   const float inner_terms[24] = {
81     1,  0, 0,  0,
82     1,  1, 1,  1,
83     1, -1, 1, -1,
84     1,  2, 4,  8,
85     1, -2, 4, -8,
86     0,  0, 0,  1
87   };
88 
89   struct Params  // Argument bundle for the inline assembly below, which reads each field relative to one base pointer (params + offsetof_Params_*).
90   {
91     const float *outer_terms;  // Outer-term coefficient table; the assembly loads it into z6 and z7.
92     const float *inner_terms;  // Inner-term coefficient table; the assembly loads it as six replicated quads (z9, z8, z15, z4, z3, z2).
93     float act_min;  // Lower clamp bound; broadcast into z12 and applied with fmax.
94     float act_max;  // Upper clamp bound; broadcast into z10 and applied with fmin.
95 
96     Params(const float *outer_terms,  // Constructor: copies the four arguments into the same-named members unchanged.
97            const float *inner_terms,
98            float act_min,
99            float act_max)
100       : outer_terms(outer_terms), inner_terms(inner_terms),
101         act_min(act_min), act_max(act_max)
102     {
103     }
104   };
105 
106   Params params(outer_terms, inner_terms, output_min, output_max);
107 
108   __asm__ __volatile__(
109     "ldr x20, [%x[params], %[offsetof_Params_outer_terms]]\n"
110     ".inst 0xd503477f  // SMSTART ZA\n"
111     "ptrue p5.b\n"
112     "ld1rw { z12.s }, p5/Z, [%x[params], %[offsetof_Params_act_min]]\n"
113     "ld1rw { z10.s }, p5/Z, [%x[params], %[offsetof_Params_act_max]]\n"
114     "pfalse p8.b\n"
115     "ldr x19, [%x[params], %[offsetof_Params_inner_terms]]\n"
116     "ld1w { z6.s }, p5/Z, [x20]\n"
117     "ld1w { z7.s }, p5/Z, [x20, #1, MUL VL]\n"
118     "ld1rqw { z9.s }, p5/Z, [x19]\n"
119     "ld1rqw { z8.s }, p5/Z, [x19, #16]\n"
120     "ld1rqw { z15.s }, p5/Z, [x19, #32]\n"
121     "fmul z11.s, z9.s, z6.s[0]\n"
122     "fmul z5.s, z9.s, z6.s[1]\n"
123     "ld1rqw { z4.s }, p5/Z, [x19, #48]\n"
124     "ld1rqw { z3.s }, p5/Z, [x19, #64]\n"
125     "ld1rqw { z2.s }, p5/Z, [x19, #80]\n"
126     "cbz %x[bptr], 1f\n"
127     "ptrue p8.s\n"
128     "1:"  // Set bias predicate: Done
129     ".inst 0xc00800ff  // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
130     "fmov z1.s, #1.0\n"
131     "mov x25, #0x0\n"
132     "cntw x24\n"
133     "cntw x23, ALL, MUL #2\n"
134     "cntw x22, ALL, MUL #3\n"
135     "whilelt p4.s, x25, %x[n_channels]\n"
136     "whilelt p3.s, x24, %x[n_channels]\n"
137     "ld1w { z31.s }, p4/Z, [%x[inptr], x25, LSL #2]\n"
138     "ld1w { z30.s }, p3/Z, [%x[inptr], x24, LSL #2]\n"
139     "whilelt p2.s, x23, %x[n_channels]\n"
140     "whilelt p1.s, x22, %x[n_channels]\n"
141     "ld1w { z29.s }, p2/Z, [%x[inptr], x23, LSL #2]\n"
142     "add x21, %x[inptr], %x[matrix_stride], LSL #2\n"
143     "and p0.b, p5/Z, p8.b, p4.b\n"
144     "ld1w { z28.s }, p1/Z, [%x[inptr], x22, LSL #2]\n"
145     "ld1w { z27.s }, p4/Z, [x21, x25, LSL #2]\n"
146     "ld1w { z26.s }, p3/Z, [x21, x24, LSL #2]\n"
147     "ld1w { z25.s }, p2/Z, [x21, x23, LSL #2]\n"
148     "ld1w { z24.s }, p1/Z, [x21, x22, LSL #2]\n"
149     "add x21, x21, %x[matrix_stride], LSL #2\n"
150     "ld1w { z23.s }, p4/Z, [x21, x25, LSL #2]\n"
151     "ld1w { z22.s }, p3/Z, [x21, x24, LSL #2]\n"
152     "ld1w { z21.s }, p2/Z, [x21, x23, LSL #2]\n"
153     "ld1w { z20.s }, p1/Z, [x21, x22, LSL #2]\n"
154     "add x21, x21, %x[matrix_stride], LSL #2\n"
155     "ld1w { z19.s }, p4/Z, [x21, x25, LSL #2]\n"
156     "ld1w { z18.s }, p3/Z, [x21, x24, LSL #2]\n"
157     "ld1w { z17.s }, p2/Z, [x21, x23, LSL #2]\n"
158     "ld1w { z16.s }, p1/Z, [x21, x22, LSL #2]\n"
159     "ld1w { z0.s }, p0/Z, [%x[bptr], x25, LSL #2]\n"
160     "and p0.b, p5/Z, p8.b, p3.b\n"
161     ".inst 0x8080b420  // fmopa za0.s, p5/M, p5/M, z1.s, z0.s\n"
162     "ld1w { z0.s }, p0/Z, [%x[bptr], x24, LSL #2]\n"
163     "and p0.b, p5/Z, p8.b, p2.b\n"
164     ".inst 0x8080b421  // fmopa za1.s, p5/M, p5/M, z1.s, z0.s\n"
165     "ld1w { z0.s }, p0/Z, [%x[bptr], x23, LSL #2]\n"
166     "and p0.b, p5/Z, p8.b, p1.b\n"
167     ".inst 0x8080b422  // fmopa za2.s, p5/M, p5/M, z1.s, z0.s\n"
168     "ld1w { z0.s }, p0/Z, [%x[bptr], x22, LSL #2]\n"
169     ".inst 0x8080b423  // fmopa za3.s, p5/M, p5/M, z1.s, z0.s\n"
170     "2:"  // Loop
171     ".inst 0x809fb560  // fmopa za0.s, p5/M, p5/M, z11.s, z31.s\n"
172     "add x21, x21, %x[matrix_stride], LSL #2\n"
173     "mov x14, #0xc\n"
174     ".inst 0x809eb561  // fmopa za1.s, p5/M, p5/M, z11.s, z30.s\n"
175     "ld1w { z31.s }, p4/Z, [x21, x25, LSL #2]\n"
176     "whilelt p0.s, x25, %x[n_channels]\n"
177     "add x20, %x[output], %x[output_col_stride], LSL #2\n"
178     ".inst 0x809db562  // fmopa za2.s, p5/M, p5/M, z11.s, z29.s\n"
179     "ld1w { z30.s }, p3/Z, [x21, x24, LSL #2]\n"
180     "add x19, %x[output], %x[output_row_stride], LSL #2\n"
181     ".inst 0x809cb563  // fmopa za3.s, p5/M, p5/M, z11.s, z28.s\n"
182     "fmul z11.s, z9.s, z6.s[2]\n"
183     "ld1w { z29.s }, p2/Z, [x21, x23, LSL #2]\n"
184     ".inst 0x809bb4a0  // fmopa za0.s, p5/M, p5/M, z5.s, z27.s\n"
185     "ld1w { z28.s }, p1/Z, [x21, x22, LSL #2]\n"
186     "add x21, x21, %x[matrix_stride], LSL #2\n"
187     ".inst 0x809ab4a1  // fmopa za1.s, p5/M, p5/M, z5.s, z26.s\n"
188     "ld1w { z27.s }, p4/Z, [x21, x25, LSL #2]\n"
189     ".inst 0x8099b4a2  // fmopa za2.s, p5/M, p5/M, z5.s, z25.s\n"
190     "ld1w { z26.s }, p3/Z, [x21, x24, LSL #2]\n"
191     ".inst 0x8098b4a3  // fmopa za3.s, p5/M, p5/M, z5.s, z24.s\n"
192     "fmul z5.s, z9.s, z6.s[3]\n"
193     "ld1w { z25.s }, p2/Z, [x21, x23, LSL #2]\n"
194     ".inst 0x8097b560  // fmopa za0.s, p5/M, p5/M, z11.s, z23.s\n"
195     "ld1w { z24.s }, p1/Z, [x21, x22, LSL #2]\n"
196     "add x21, x21, %x[matrix_stride], LSL #2\n"
197     ".inst 0x8096b561  // fmopa za1.s, p5/M, p5/M, z11.s, z22.s\n"
198     "ld1w { z23.s }, p4/Z, [x21, x25, LSL #2]\n"
199     ".inst 0x8095b562  // fmopa za2.s, p5/M, p5/M, z11.s, z21.s\n"
200     "ld1w { z22.s }, p3/Z, [x21, x24, LSL #2]\n"
201     ".inst 0x8094b563  // fmopa za3.s, p5/M, p5/M, z11.s, z20.s\n"
202     "fmul z11.s, z9.s, z7.s[0]\n"
203     "ld1w { z21.s }, p2/Z, [x21, x23, LSL #2]\n"
204     ".inst 0x8093b4a0  // fmopa za0.s, p5/M, p5/M, z5.s, z19.s\n"
205     "ld1w { z20.s }, p1/Z, [x21, x22, LSL #2]\n"
206     "add x21, x21, %x[matrix_stride], LSL #2\n"
207     ".inst 0x8092b4a1  // fmopa za1.s, p5/M, p5/M, z5.s, z18.s\n"
208     "ld1w { z19.s }, p4/Z, [x21, x25, LSL #2]\n"
209     ".inst 0x8091b4a2  // fmopa za2.s, p5/M, p5/M, z5.s, z17.s\n"
210     "ld1w { z18.s }, p3/Z, [x21, x24, LSL #2]\n"
211     ".inst 0x8090b4a3  // fmopa za3.s, p5/M, p5/M, z5.s, z16.s\n"
212     "fmul z5.s, z9.s, z7.s[1]\n"
213     "ld1w { z17.s }, p2/Z, [x21, x23, LSL #2]\n"
214     ".inst 0x809fb560  // fmopa za0.s, p5/M, p5/M, z11.s, z31.s\n"
215     "ld1w { z16.s }, p1/Z, [x21, x22, LSL #2]\n"
216     "add x21, x21, %x[matrix_stride], LSL #2\n"
217     ".inst 0x809eb561  // fmopa za1.s, p5/M, p5/M, z11.s, z30.s\n"
218     "ld1w { z31.s }, p4/Z, [x21, x25, LSL #2]\n"
219     ".inst 0x809db562  // fmopa za2.s, p5/M, p5/M, z11.s, z29.s\n"
220     "ld1w { z30.s }, p3/Z, [x21, x24, LSL #2]\n"
221     ".inst 0x809cb563  // fmopa za3.s, p5/M, p5/M, z11.s, z28.s\n"
222     "fmul z11.s, z8.s, z6.s[0]\n"
223     "ld1w { z29.s }, p2/Z, [x21, x23, LSL #2]\n"
224     ".inst 0x809bb4a0  // fmopa za0.s, p5/M, p5/M, z5.s, z27.s\n"
225     "ld1w { z28.s }, p1/Z, [x21, x22, LSL #2]\n"
226     "add x21, x21, %x[matrix_stride], LSL #2\n"
227     ".inst 0x809ab4a1  // fmopa za1.s, p5/M, p5/M, z5.s, z26.s\n"
228     "ld1w { z27.s }, p4/Z, [x21, x25, LSL #2]\n"
229     ".inst 0x8099b4a2  // fmopa za2.s, p5/M, p5/M, z5.s, z25.s\n"
230     "ld1w { z26.s }, p3/Z, [x21, x24, LSL #2]\n"
231     ".inst 0x8098b4a3  // fmopa za3.s, p5/M, p5/M, z5.s, z24.s\n"
232     "fmul z5.s, z8.s, z6.s[1]\n"
233     "ld1w { z25.s }, p2/Z, [x21, x23, LSL #2]\n"
234     ".inst 0x8097b560  // fmopa za0.s, p5/M, p5/M, z11.s, z23.s\n"
235     "ld1w { z24.s }, p1/Z, [x21, x22, LSL #2]\n"
236     "add x21, x21, %x[matrix_stride], LSL #2\n"
237     ".inst 0x8096b561  // fmopa za1.s, p5/M, p5/M, z11.s, z22.s\n"
238     "ld1w { z23.s }, p4/Z, [x21, x25, LSL #2]\n"
239     ".inst 0x8095b562  // fmopa za2.s, p5/M, p5/M, z11.s, z21.s\n"
240     "ld1w { z22.s }, p3/Z, [x21, x24, LSL #2]\n"
241     ".inst 0x8094b563  // fmopa za3.s, p5/M, p5/M, z11.s, z20.s\n"
242     "fmul z11.s, z8.s, z6.s[2]\n"
243     "ld1w { z21.s }, p2/Z, [x21, x23, LSL #2]\n"
244     ".inst 0x8093b4a0  // fmopa za0.s, p5/M, p5/M, z5.s, z19.s\n"
245     "ld1w { z20.s }, p1/Z, [x21, x22, LSL #2]\n"
246     "add x21, x21, %x[matrix_stride], LSL #2\n"
247     ".inst 0x8092b4a1  // fmopa za1.s, p5/M, p5/M, z5.s, z18.s\n"
248     "ld1w { z19.s }, p4/Z, [x21, x25, LSL #2]\n"
249     ".inst 0x8091b4a2  // fmopa za2.s, p5/M, p5/M, z5.s, z17.s\n"
250     "ld1w { z18.s }, p3/Z, [x21, x24, LSL #2]\n"
251     ".inst 0x8090b4a3  // fmopa za3.s, p5/M, p5/M, z5.s, z16.s\n"
252     "fmul z5.s, z8.s, z6.s[3]\n"
253     "ld1w { z17.s }, p2/Z, [x21, x23, LSL #2]\n"
254     ".inst 0x809fb560  // fmopa za0.s, p5/M, p5/M, z11.s, z31.s\n"
255     "ld1w { z16.s }, p1/Z, [x21, x22, LSL #2]\n"
256     "add x21, x21, %x[matrix_stride], LSL #2\n"
257     ".inst 0x809eb561  // fmopa za1.s, p5/M, p5/M, z11.s, z30.s\n"
258     "ld1w { z31.s }, p4/Z, [x21, x25, LSL #2]\n"
259     ".inst 0x809db562  // fmopa za2.s, p5/M, p5/M, z11.s, z29.s\n"
260     "ld1w { z30.s }, p3/Z, [x21, x24, LSL #2]\n"
261     ".inst 0x809cb563  // fmopa za3.s, p5/M, p5/M, z11.s, z28.s\n"
262     "fmul z11.s, z8.s, z7.s[0]\n"
263     "ld1w { z29.s }, p2/Z, [x21, x23, LSL #2]\n"
264     ".inst 0x809bb4a0  // fmopa za0.s, p5/M, p5/M, z5.s, z27.s\n"
265     "ld1w { z28.s }, p1/Z, [x21, x22, LSL #2]\n"
266     "add x21, x21, %x[matrix_stride], LSL #2\n"
267     ".inst 0x809ab4a1  // fmopa za1.s, p5/M, p5/M, z5.s, z26.s\n"
268     "ld1w { z27.s }, p4/Z, [x21, x25, LSL #2]\n"
269     ".inst 0x8099b4a2  // fmopa za2.s, p5/M, p5/M, z5.s, z25.s\n"
270     "ld1w { z26.s }, p3/Z, [x21, x24, LSL #2]\n"
271     ".inst 0x8098b4a3  // fmopa za3.s, p5/M, p5/M, z5.s, z24.s\n"
272     "fmul z5.s, z8.s, z7.s[1]\n"
273     "ld1w { z25.s }, p2/Z, [x21, x23, LSL #2]\n"
274     ".inst 0x8097b560  // fmopa za0.s, p5/M, p5/M, z11.s, z23.s\n"
275     "ld1w { z24.s }, p1/Z, [x21, x22, LSL #2]\n"
276     "add x21, x21, %x[matrix_stride], LSL #2\n"
277     ".inst 0x8096b561  // fmopa za1.s, p5/M, p5/M, z11.s, z22.s\n"
278     "ld1w { z23.s }, p4/Z, [x21, x25, LSL #2]\n"
279     ".inst 0x8095b562  // fmopa za2.s, p5/M, p5/M, z11.s, z21.s\n"
280     "ld1w { z22.s }, p3/Z, [x21, x24, LSL #2]\n"
281     ".inst 0x8094b563  // fmopa za3.s, p5/M, p5/M, z11.s, z20.s\n"
282     "fmul z11.s, z15.s, z6.s[0]\n"
283     "ld1w { z21.s }, p2/Z, [x21, x23, LSL #2]\n"
284     ".inst 0x8093b4a0  // fmopa za0.s, p5/M, p5/M, z5.s, z19.s\n"
285     "ld1w { z20.s }, p1/Z, [x21, x22, LSL #2]\n"
286     "add x21, x21, %x[matrix_stride], LSL #2\n"
287     ".inst 0x8092b4a1  // fmopa za1.s, p5/M, p5/M, z5.s, z18.s\n"
288     "ld1w { z19.s }, p4/Z, [x21, x25, LSL #2]\n"
289     ".inst 0x8091b4a2  // fmopa za2.s, p5/M, p5/M, z5.s, z17.s\n"
290     "ld1w { z18.s }, p3/Z, [x21, x24, LSL #2]\n"
291     ".inst 0x8090b4a3  // fmopa za3.s, p5/M, p5/M, z5.s, z16.s\n"
292     "fmul z5.s, z15.s, z6.s[1]\n"
293     "ld1w { z17.s }, p2/Z, [x21, x23, LSL #2]\n"
294     ".inst 0x809fb560  // fmopa za0.s, p5/M, p5/M, z11.s, z31.s\n"
295     "ld1w { z16.s }, p1/Z, [x21, x22, LSL #2]\n"
296     "add x21, x21, %x[matrix_stride], LSL #2\n"
297     ".inst 0x809eb561  // fmopa za1.s, p5/M, p5/M, z11.s, z30.s\n"
298     "ld1w { z31.s }, p4/Z, [x21, x25, LSL #2]\n"
299     ".inst 0x809db562  // fmopa za2.s, p5/M, p5/M, z11.s, z29.s\n"
300     "ld1w { z30.s }, p3/Z, [x21, x24, LSL #2]\n"
301     ".inst 0x809cb563  // fmopa za3.s, p5/M, p5/M, z11.s, z28.s\n"
302     "fmul z11.s, z15.s, z6.s[2]\n"
303     "ld1w { z29.s }, p2/Z, [x21, x23, LSL #2]\n"
304     ".inst 0x809bb4a0  // fmopa za0.s, p5/M, p5/M, z5.s, z27.s\n"
305     "ld1w { z28.s }, p1/Z, [x21, x22, LSL #2]\n"
306     "add x21, x21, %x[matrix_stride], LSL #2\n"
307     ".inst 0x809ab4a1  // fmopa za1.s, p5/M, p5/M, z5.s, z26.s\n"
308     "ld1w { z27.s }, p4/Z, [x21, x25, LSL #2]\n"
309     ".inst 0x8099b4a2  // fmopa za2.s, p5/M, p5/M, z5.s, z25.s\n"
310     "ld1w { z26.s }, p3/Z, [x21, x24, LSL #2]\n"
311     ".inst 0x8098b4a3  // fmopa za3.s, p5/M, p5/M, z5.s, z24.s\n"
312     "fmul z5.s, z15.s, z6.s[3]\n"
313     "ld1w { z25.s }, p2/Z, [x21, x23, LSL #2]\n"
314     ".inst 0x8097b560  // fmopa za0.s, p5/M, p5/M, z11.s, z23.s\n"
315     "ld1w { z24.s }, p1/Z, [x21, x22, LSL #2]\n"
316     "add x21, x21, %x[matrix_stride], LSL #2\n"
317     ".inst 0x8096b561  // fmopa za1.s, p5/M, p5/M, z11.s, z22.s\n"
318     "ld1w { z23.s }, p4/Z, [x21, x25, LSL #2]\n"
319     ".inst 0x8095b562  // fmopa za2.s, p5/M, p5/M, z11.s, z21.s\n"
320     "ld1w { z22.s }, p3/Z, [x21, x24, LSL #2]\n"
321     ".inst 0x8094b563  // fmopa za3.s, p5/M, p5/M, z11.s, z20.s\n"
322     "fmul z11.s, z15.s, z7.s[0]\n"
323     "ld1w { z21.s }, p2/Z, [x21, x23, LSL #2]\n"
324     ".inst 0x8093b4a0  // fmopa za0.s, p5/M, p5/M, z5.s, z19.s\n"
325     "ld1w { z20.s }, p1/Z, [x21, x22, LSL #2]\n"
326     "add x21, x21, %x[matrix_stride], LSL #2\n"
327     ".inst 0x8092b4a1  // fmopa za1.s, p5/M, p5/M, z5.s, z18.s\n"
328     "ld1w { z19.s }, p4/Z, [x21, x25, LSL #2]\n"
329     ".inst 0x8091b4a2  // fmopa za2.s, p5/M, p5/M, z5.s, z17.s\n"
330     "ld1w { z18.s }, p3/Z, [x21, x24, LSL #2]\n"
331     ".inst 0x8090b4a3  // fmopa za3.s, p5/M, p5/M, z5.s, z16.s\n"
332     "fmul z5.s, z15.s, z7.s[1]\n"
333     "ld1w { z17.s }, p2/Z, [x21, x23, LSL #2]\n"
334     ".inst 0x809fb560  // fmopa za0.s, p5/M, p5/M, z11.s, z31.s\n"
335     "ld1w { z16.s }, p1/Z, [x21, x22, LSL #2]\n"
336     "add x21, x21, %x[matrix_stride], LSL #2\n"
337     ".inst 0x809eb561  // fmopa za1.s, p5/M, p5/M, z11.s, z30.s\n"
338     "ld1w { z31.s }, p4/Z, [x21, x25, LSL #2]\n"
339     ".inst 0x809db562  // fmopa za2.s, p5/M, p5/M, z11.s, z29.s\n"
340     "ld1w { z30.s }, p3/Z, [x21, x24, LSL #2]\n"
341     ".inst 0x809cb563  // fmopa za3.s, p5/M, p5/M, z11.s, z28.s\n"
342     "fmul z11.s, z4.s, z6.s[0]\n"
343     "ld1w { z29.s }, p2/Z, [x21, x23, LSL #2]\n"
344     ".inst 0x809bb4a0  // fmopa za0.s, p5/M, p5/M, z5.s, z27.s\n"
345     "ld1w { z28.s }, p1/Z, [x21, x22, LSL #2]\n"
346     "add x21, x21, %x[matrix_stride], LSL #2\n"
347     ".inst 0x809ab4a1  // fmopa za1.s, p5/M, p5/M, z5.s, z26.s\n"
348     "ld1w { z27.s }, p4/Z, [x21, x25, LSL #2]\n"
349     ".inst 0x8099b4a2  // fmopa za2.s, p5/M, p5/M, z5.s, z25.s\n"
350     "ld1w { z26.s }, p3/Z, [x21, x24, LSL #2]\n"
351     ".inst 0x8098b4a3  // fmopa za3.s, p5/M, p5/M, z5.s, z24.s\n"
352     "fmul z5.s, z4.s, z6.s[1]\n"
353     "ld1w { z25.s }, p2/Z, [x21, x23, LSL #2]\n"
354     ".inst 0x8097b560  // fmopa za0.s, p5/M, p5/M, z11.s, z23.s\n"
355     "ld1w { z24.s }, p1/Z, [x21, x22, LSL #2]\n"
356     "add x21, x21, %x[matrix_stride], LSL #2\n"
357     ".inst 0x8096b561  // fmopa za1.s, p5/M, p5/M, z11.s, z22.s\n"
358     "ld1w { z23.s }, p4/Z, [x21, x25, LSL #2]\n"
359     ".inst 0x8095b562  // fmopa za2.s, p5/M, p5/M, z11.s, z21.s\n"
360     "ld1w { z22.s }, p3/Z, [x21, x24, LSL #2]\n"
361     ".inst 0x8094b563  // fmopa za3.s, p5/M, p5/M, z11.s, z20.s\n"
362     "fmul z11.s, z4.s, z6.s[2]\n"
363     "ld1w { z21.s }, p2/Z, [x21, x23, LSL #2]\n"
364     ".inst 0x8093b4a0  // fmopa za0.s, p5/M, p5/M, z5.s, z19.s\n"
365     "ld1w { z20.s }, p1/Z, [x21, x22, LSL #2]\n"
366     "add x21, x21, %x[matrix_stride], LSL #2\n"
367     ".inst 0x8092b4a1  // fmopa za1.s, p5/M, p5/M, z5.s, z18.s\n"
368     "ld1w { z19.s }, p4/Z, [x21, x25, LSL #2]\n"
369     ".inst 0x8091b4a2  // fmopa za2.s, p5/M, p5/M, z5.s, z17.s\n"
370     "ld1w { z18.s }, p3/Z, [x21, x24, LSL #2]\n"
371     ".inst 0x8090b4a3  // fmopa za3.s, p5/M, p5/M, z5.s, z16.s\n"
372     "fmul z5.s, z4.s, z6.s[3]\n"
373     "ld1w { z17.s }, p2/Z, [x21, x23, LSL #2]\n"
374     ".inst 0x809fb560  // fmopa za0.s, p5/M, p5/M, z11.s, z31.s\n"
375     "ld1w { z16.s }, p1/Z, [x21, x22, LSL #2]\n"
376     "add x21, x21, %x[matrix_stride], LSL #2\n"
377     ".inst 0x809eb561  // fmopa za1.s, p5/M, p5/M, z11.s, z30.s\n"
378     "ld1w { z31.s }, p4/Z, [x21, x25, LSL #2]\n"
379     ".inst 0x809db562  // fmopa za2.s, p5/M, p5/M, z11.s, z29.s\n"
380     "ld1w { z30.s }, p3/Z, [x21, x24, LSL #2]\n"
381     ".inst 0x809cb563  // fmopa za3.s, p5/M, p5/M, z11.s, z28.s\n"
382     "fmul z11.s, z4.s, z7.s[0]\n"
383     "ld1w { z29.s }, p2/Z, [x21, x23, LSL #2]\n"
384     ".inst 0x809bb4a0  // fmopa za0.s, p5/M, p5/M, z5.s, z27.s\n"
385     "ld1w { z28.s }, p1/Z, [x21, x22, LSL #2]\n"
386     "add x21, x21, %x[matrix_stride], LSL #2\n"
387     ".inst 0x809ab4a1  // fmopa za1.s, p5/M, p5/M, z5.s, z26.s\n"
388     "ld1w { z27.s }, p4/Z, [x21, x25, LSL #2]\n"
389     ".inst 0x8099b4a2  // fmopa za2.s, p5/M, p5/M, z5.s, z25.s\n"
390     "ld1w { z26.s }, p3/Z, [x21, x24, LSL #2]\n"
391     ".inst 0x8098b4a3  // fmopa za3.s, p5/M, p5/M, z5.s, z24.s\n"
392     "fmul z5.s, z4.s, z7.s[1]\n"
393     "ld1w { z25.s }, p2/Z, [x21, x23, LSL #2]\n"
394     ".inst 0x8097b560  // fmopa za0.s, p5/M, p5/M, z11.s, z23.s\n"
395     "ld1w { z24.s }, p1/Z, [x21, x22, LSL #2]\n"
396     "add x21, x21, %x[matrix_stride], LSL #2\n"
397     ".inst 0x8096b561  // fmopa za1.s, p5/M, p5/M, z11.s, z22.s\n"
398     "ld1w { z23.s }, p4/Z, [x21, x25, LSL #2]\n"
399     ".inst 0x8095b562  // fmopa za2.s, p5/M, p5/M, z11.s, z21.s\n"
400     "ld1w { z22.s }, p3/Z, [x21, x24, LSL #2]\n"
401     ".inst 0x8094b563  // fmopa za3.s, p5/M, p5/M, z11.s, z20.s\n"
402     "fmul z11.s, z3.s, z6.s[0]\n"
403     "ld1w { z21.s }, p2/Z, [x21, x23, LSL #2]\n"
404     ".inst 0x8093b4a0  // fmopa za0.s, p5/M, p5/M, z5.s, z19.s\n"
405     "ld1w { z20.s }, p1/Z, [x21, x22, LSL #2]\n"
406     "add x21, x21, %x[matrix_stride], LSL #2\n"
407     ".inst 0x8092b4a1  // fmopa za1.s, p5/M, p5/M, z5.s, z18.s\n"
408     "ld1w { z19.s }, p4/Z, [x21, x25, LSL #2]\n"
409     ".inst 0x8091b4a2  // fmopa za2.s, p5/M, p5/M, z5.s, z17.s\n"
410     "ld1w { z18.s }, p3/Z, [x21, x24, LSL #2]\n"
411     ".inst 0x8090b4a3  // fmopa za3.s, p5/M, p5/M, z5.s, z16.s\n"
412     "fmul z5.s, z3.s, z6.s[1]\n"
413     "ld1w { z17.s }, p2/Z, [x21, x23, LSL #2]\n"
414     ".inst 0x809fb560  // fmopa za0.s, p5/M, p5/M, z11.s, z31.s\n"
415     "ld1w { z16.s }, p1/Z, [x21, x22, LSL #2]\n"
416     "add x21, x21, %x[matrix_stride], LSL #2\n"
417     ".inst 0x809eb561  // fmopa za1.s, p5/M, p5/M, z11.s, z30.s\n"
418     "ld1w { z31.s }, p4/Z, [x21, x25, LSL #2]\n"
419     ".inst 0x809db562  // fmopa za2.s, p5/M, p5/M, z11.s, z29.s\n"
420     "ld1w { z30.s }, p3/Z, [x21, x24, LSL #2]\n"
421     ".inst 0x809cb563  // fmopa za3.s, p5/M, p5/M, z11.s, z28.s\n"
422     "fmul z11.s, z3.s, z6.s[2]\n"
423     "ld1w { z29.s }, p2/Z, [x21, x23, LSL #2]\n"
424     ".inst 0x809bb4a0  // fmopa za0.s, p5/M, p5/M, z5.s, z27.s\n"
425     "ld1w { z28.s }, p1/Z, [x21, x22, LSL #2]\n"
426     "add x21, x21, %x[matrix_stride], LSL #2\n"
427     ".inst 0x809ab4a1  // fmopa za1.s, p5/M, p5/M, z5.s, z26.s\n"
428     "ld1w { z27.s }, p4/Z, [x21, x25, LSL #2]\n"
429     ".inst 0x8099b4a2  // fmopa za2.s, p5/M, p5/M, z5.s, z25.s\n"
430     "ld1w { z26.s }, p3/Z, [x21, x24, LSL #2]\n"
431     ".inst 0x8098b4a3  // fmopa za3.s, p5/M, p5/M, z5.s, z24.s\n"
432     "fmul z5.s, z3.s, z6.s[3]\n"
433     "ld1w { z25.s }, p2/Z, [x21, x23, LSL #2]\n"
434     ".inst 0x8097b560  // fmopa za0.s, p5/M, p5/M, z11.s, z23.s\n"
435     "ld1w { z24.s }, p1/Z, [x21, x22, LSL #2]\n"
436     "add x21, x21, %x[matrix_stride], LSL #2\n"
437     ".inst 0x8096b561  // fmopa za1.s, p5/M, p5/M, z11.s, z22.s\n"
438     "ld1w { z23.s }, p4/Z, [x21, x25, LSL #2]\n"
439     ".inst 0x8095b562  // fmopa za2.s, p5/M, p5/M, z11.s, z21.s\n"
440     "ld1w { z22.s }, p3/Z, [x21, x24, LSL #2]\n"
441     ".inst 0x8094b563  // fmopa za3.s, p5/M, p5/M, z11.s, z20.s\n"
442     "fmul z11.s, z3.s, z7.s[0]\n"
443     "ld1w { z21.s }, p2/Z, [x21, x23, LSL #2]\n"
444     ".inst 0x8093b4a0  // fmopa za0.s, p5/M, p5/M, z5.s, z19.s\n"
445     "ld1w { z20.s }, p1/Z, [x21, x22, LSL #2]\n"
446     "add x21, x21, %x[matrix_stride], LSL #2\n"
447     ".inst 0x8092b4a1  // fmopa za1.s, p5/M, p5/M, z5.s, z18.s\n"
448     "ld1w { z19.s }, p4/Z, [x21, x25, LSL #2]\n"
449     ".inst 0x8091b4a2  // fmopa za2.s, p5/M, p5/M, z5.s, z17.s\n"
450     "ld1w { z18.s }, p3/Z, [x21, x24, LSL #2]\n"
451     ".inst 0x8090b4a3  // fmopa za3.s, p5/M, p5/M, z5.s, z16.s\n"
452     "fmul z5.s, z3.s, z7.s[1]\n"
453     "ld1w { z17.s }, p2/Z, [x21, x23, LSL #2]\n"
454     ".inst 0x809fb560  // fmopa za0.s, p5/M, p5/M, z11.s, z31.s\n"
455     "ld1w { z16.s }, p1/Z, [x21, x22, LSL #2]\n"
456     "add x21, x21, %x[matrix_stride], LSL #2\n"
457     ".inst 0x809eb561  // fmopa za1.s, p5/M, p5/M, z11.s, z30.s\n"
458     "ld1w { z31.s }, p4/Z, [x21, x25, LSL #2]\n"
459     ".inst 0x809db562  // fmopa za2.s, p5/M, p5/M, z11.s, z29.s\n"
460     "ld1w { z30.s }, p3/Z, [x21, x24, LSL #2]\n"
461     ".inst 0x809cb563  // fmopa za3.s, p5/M, p5/M, z11.s, z28.s\n"
462     "fmul z11.s, z2.s, z6.s[0]\n"
463     "ld1w { z29.s }, p2/Z, [x21, x23, LSL #2]\n"
464     ".inst 0x809bb4a0  // fmopa za0.s, p5/M, p5/M, z5.s, z27.s\n"
465     "ld1w { z28.s }, p1/Z, [x21, x22, LSL #2]\n"
466     "add x21, x21, %x[matrix_stride], LSL #2\n"
467     ".inst 0x809ab4a1  // fmopa za1.s, p5/M, p5/M, z5.s, z26.s\n"
468     "ld1w { z27.s }, p4/Z, [x21, x25, LSL #2]\n"
469     ".inst 0x8099b4a2  // fmopa za2.s, p5/M, p5/M, z5.s, z25.s\n"
470     "ld1w { z26.s }, p3/Z, [x21, x24, LSL #2]\n"
471     ".inst 0x8098b4a3  // fmopa za3.s, p5/M, p5/M, z5.s, z24.s\n"
472     "fmul z5.s, z2.s, z6.s[1]\n"
473     "ld1w { z25.s }, p2/Z, [x21, x23, LSL #2]\n"
474     ".inst 0x8097b560  // fmopa za0.s, p5/M, p5/M, z11.s, z23.s\n"
475     "ld1w { z24.s }, p1/Z, [x21, x22, LSL #2]\n"
476     "add x21, x21, %x[matrix_stride], LSL #2\n"
477     ".inst 0x8096b561  // fmopa za1.s, p5/M, p5/M, z11.s, z22.s\n"
478     "ld1w { z23.s }, p4/Z, [x21, x25, LSL #2]\n"
479     ".inst 0x8095b562  // fmopa za2.s, p5/M, p5/M, z11.s, z21.s\n"
480     "ld1w { z22.s }, p3/Z, [x21, x24, LSL #2]\n"
481     ".inst 0x8094b563  // fmopa za3.s, p5/M, p5/M, z11.s, z20.s\n"
482     "fmul z11.s, z2.s, z6.s[2]\n"
483     "ld1w { z21.s }, p2/Z, [x21, x23, LSL #2]\n"
484     ".inst 0x8093b4a0  // fmopa za0.s, p5/M, p5/M, z5.s, z19.s\n"
485     "ld1w { z20.s }, p1/Z, [x21, x22, LSL #2]\n"
486     "add x21, x21, %x[matrix_stride], LSL #2\n"
487     ".inst 0x8092b4a1  // fmopa za1.s, p5/M, p5/M, z5.s, z18.s\n"
488     "ld1w { z19.s }, p4/Z, [x21, x25, LSL #2]\n"
489     ".inst 0x8091b4a2  // fmopa za2.s, p5/M, p5/M, z5.s, z17.s\n"
490     "ld1w { z18.s }, p3/Z, [x21, x24, LSL #2]\n"
491     ".inst 0x8090b4a3  // fmopa za3.s, p5/M, p5/M, z5.s, z16.s\n"
492     "fmul z5.s, z2.s, z6.s[3]\n"
493     "ld1w { z17.s }, p2/Z, [x21, x23, LSL #2]\n"
494     ".inst 0x809fb560  // fmopa za0.s, p5/M, p5/M, z11.s, z31.s\n"
495     "ld1w { z16.s }, p1/Z, [x21, x22, LSL #2]\n"
496     ".inst 0x809eb561  // fmopa za1.s, p5/M, p5/M, z11.s, z30.s\n"
497     ".inst 0x809db562  // fmopa za2.s, p5/M, p5/M, z11.s, z29.s\n"
498     ".inst 0x809cb563  // fmopa za3.s, p5/M, p5/M, z11.s, z28.s\n"
499     "fmul z11.s, z2.s, z7.s[0]\n"
500     ".inst 0x809bb4a0  // fmopa za0.s, p5/M, p5/M, z5.s, z27.s\n"
501     ".inst 0x809ab4a1  // fmopa za1.s, p5/M, p5/M, z5.s, z26.s\n"
502     ".inst 0x8099b4a2  // fmopa za2.s, p5/M, p5/M, z5.s, z25.s\n"
503     ".inst 0x8098b4a3  // fmopa za3.s, p5/M, p5/M, z5.s, z24.s\n"
504     "fmul z5.s, z2.s, z7.s[1]\n"
505     ".inst 0x8097b560  // fmopa za0.s, p5/M, p5/M, z11.s, z23.s\n"
506     ".inst 0x8096b561  // fmopa za1.s, p5/M, p5/M, z11.s, z22.s\n"
507     ".inst 0x8095b562  // fmopa za2.s, p5/M, p5/M, z11.s, z21.s\n"
508     ".inst 0x8094b563  // fmopa za3.s, p5/M, p5/M, z11.s, z20.s\n"
509     "fmul z11.s, z9.s, z6.s[0]\n"
510     ".inst 0x8093b4a0  // fmopa za0.s, p5/M, p5/M, z5.s, z19.s\n"
511     ".inst 0x8092b4a1  // fmopa za1.s, p5/M, p5/M, z5.s, z18.s\n"
512     ".inst 0x8091b4a2  // fmopa za2.s, p5/M, p5/M, z5.s, z17.s\n"
513     ".inst 0x8090b4a3  // fmopa za3.s, p5/M, p5/M, z5.s, z16.s\n"
514     "fmul z5.s, z9.s, z6.s[1]\n"
515     ".inst 0xc082741f  // mova z31.s, p5/M, za0h.s[XZR]\n"
516     ".inst 0xc082541c  // mova z28.s, p5/M, za0h.s[x14]\n"
517     "fmin z31.s, p5/M, z31.s, z10.s\n"
518     ".inst 0xc082743b  // mova z27.s, p5/M, za0h.s[XZR, #1]\n"
519     "fmin z28.s, p5/M, z28.s, z10.s\n"
520     ".inst 0xc0825438  // mova z24.s, p5/M, za0h.s[x14, #1]\n"
521     "fmin z27.s, p5/M, z27.s, z10.s\n"
522     "mov x13, #0x4\n"
523     "mov x12, #0x8\n"
524     ".inst 0xc082341e  // mova z30.s, p5/M, za0h.s[x13]\n"
525     "fmin z24.s, p5/M, z24.s, z10.s\n"
526     ".inst 0xc082141d  // mova z29.s, p5/M, za0h.s[x12]\n"
527     "fmax z31.s, p5/M, z31.s, z12.s\n"
528     "fmin z30.s, p5/M, z30.s, z10.s\n"
529     ".inst 0xc082343a  // mova z26.s, p5/M, za0h.s[x13, #1]\n"
530     "fmin z29.s, p5/M, z29.s, z10.s\n"
531     "fmax z28.s, p5/M, z28.s, z12.s\n"
532     ".inst 0xc0821439  // mova z25.s, p5/M, za0h.s[x12, #1]\n"
533     "fmax z27.s, p5/M, z27.s, z12.s\n"
534     "fmin z26.s, p5/M, z26.s, z10.s\n"
535     ".inst 0xc0827457  // mova z23.s, p5/M, za0h.s[XZR, #2]\n"
536     "fmin z25.s, p5/M, z25.s, z10.s\n"
537     "fmax z24.s, p5/M, z24.s, z12.s\n"
538     ".inst 0xc0823456  // mova z22.s, p5/M, za0h.s[x13, #2]\n"
539     "fmax z30.s, p5/M, z30.s, z12.s\n"
540     "fmin z23.s, p5/M, z23.s, z10.s\n"
541     ".inst 0xc0821455  // mova z21.s, p5/M, za0h.s[x12, #2]\n"
542     "fmax z29.s, p5/M, z29.s, z12.s\n"
543     "fmin z22.s, p5/M, z22.s, z10.s\n"
544     ".inst 0xc0825454  // mova z20.s, p5/M, za0h.s[x14, #2]\n"
545     "fmax z26.s, p5/M, z26.s, z12.s\n"
546     "fmin z21.s, p5/M, z21.s, z10.s\n"
547     ".inst 0xc0827473  // mova z19.s, p5/M, za0h.s[XZR, #3]\n"
548     "fmax z25.s, p5/M, z25.s, z12.s\n"
549     "fmin z20.s, p5/M, z20.s, z10.s\n"
550     ".inst 0xc0823472  // mova z18.s, p5/M, za0h.s[x13, #3]\n"
551     "fmax z23.s, p5/M, z23.s, z12.s\n"
552     "fmin z19.s, p5/M, z19.s, z10.s\n"
553     ".inst 0xc0821471  // mova z17.s, p5/M, za0h.s[x12, #3]\n"
554     "fmax z22.s, p5/M, z22.s, z12.s\n"
555     "fmin z18.s, p5/M, z18.s, z10.s\n"
556     ".inst 0xc0825470  // mova z16.s, p5/M, za0h.s[x14, #3]\n"
557     "fmax z21.s, p5/M, z21.s, z12.s\n"
558     "fmin z17.s, p5/M, z17.s, z10.s\n"
559     "fmax z20.s, p5/M, z20.s, z12.s\n"
560     "fmin z16.s, p5/M, z16.s, z10.s\n"
561     "st1w { z31.s }, p0, [%x[output], x25, LSL #2]\n"
562     "fmax z19.s, p5/M, z19.s, z12.s\n"
563     "st1w { z30.s }, p0, [x20, x25, LSL #2]\n"
564     "add x20, x20, %x[output_col_stride], LSL #2\n"
565     "fmax z18.s, p5/M, z18.s, z12.s\n"
566     "st1w { z29.s }, p0, [x20, x25, LSL #2]\n"
567     "add x20, x20, %x[output_col_stride], LSL #2\n"
568     "fmax z17.s, p5/M, z17.s, z12.s\n"
569     "st1w { z28.s }, p0, [x20, x25, LSL #2]\n"
570     "add x20, x19, %x[output_col_stride], LSL #2\n"
571     "fmax z16.s, p5/M, z16.s, z12.s\n"
572     "st1w { z27.s }, p0, [x19, x25, LSL #2]\n"
573     "add x19, x19, %x[output_row_stride], LSL #2\n"
574     "st1w { z26.s }, p0, [x20, x25, LSL #2]\n"
575     "add x20, x20, %x[output_col_stride], LSL #2\n"
576     "st1w { z25.s }, p0, [x20, x25, LSL #2]\n"
577     "add x20, x20, %x[output_col_stride], LSL #2\n"
578     "st1w { z24.s }, p0, [x20, x25, LSL #2]\n"
579     "add x20, x19, %x[output_col_stride], LSL #2\n"
580     "st1w { z23.s }, p0, [x19, x25, LSL #2]\n"
581     "add x19, x19, %x[output_row_stride], LSL #2\n"
582     "st1w { z22.s }, p0, [x20, x25, LSL #2]\n"
583     "add x20, x20, %x[output_col_stride], LSL #2\n"
584     "st1w { z21.s }, p0, [x20, x25, LSL #2]\n"
585     "add x20, x20, %x[output_col_stride], LSL #2\n"
586     "st1w { z20.s }, p0, [x20, x25, LSL #2]\n"
587     "add x20, x19, %x[output_col_stride], LSL #2\n"
588     "st1w { z19.s }, p0, [x19, x25, LSL #2]\n"
589     "st1w { z18.s }, p0, [x20, x25, LSL #2]\n"
590     "add x20, x20, %x[output_col_stride], LSL #2\n"
591     "st1w { z17.s }, p0, [x20, x25, LSL #2]\n"
592     "add x20, x20, %x[output_col_stride], LSL #2\n"
593     "st1w { z16.s }, p0, [x20, x25, LSL #2]\n"
594     "whilelt p0.s, x24, %x[n_channels]\n"
595     "b.none 3f\n"
596     ".inst 0xc082749f  // mova z31.s, p5/M, za1h.s[XZR]\n"
597     ".inst 0xc082349e  // mova z30.s, p5/M, za1h.s[x13]\n"
598     "fmin z31.s, p5/M, z31.s, z10.s\n"
599     ".inst 0xc082149d  // mova z29.s, p5/M, za1h.s[x12]\n"
600     "fmin z30.s, p5/M, z30.s, z10.s\n"
601     ".inst 0xc082549c  // mova z28.s, p5/M, za1h.s[x14]\n"
602     "fmin z29.s, p5/M, z29.s, z10.s\n"
603     ".inst 0xc08274bb  // mova z27.s, p5/M, za1h.s[XZR, #1]\n"
604     "fmin z28.s, p5/M, z28.s, z10.s\n"
605     ".inst 0xc08234ba  // mova z26.s, p5/M, za1h.s[x13, #1]\n"
606     "fmax z31.s, p5/M, z31.s, z12.s\n"
607     "fmin z27.s, p5/M, z27.s, z10.s\n"
608     ".inst 0xc08214b9  // mova z25.s, p5/M, za1h.s[x12, #1]\n"
609     "fmax z30.s, p5/M, z30.s, z12.s\n"
610     "fmin z26.s, p5/M, z26.s, z10.s\n"
611     ".inst 0xc08254b8  // mova z24.s, p5/M, za1h.s[x14, #1]\n"
612     "fmax z29.s, p5/M, z29.s, z12.s\n"
613     "fmin z25.s, p5/M, z25.s, z10.s\n"
614     ".inst 0xc08274d7  // mova z23.s, p5/M, za1h.s[XZR, #2]\n"
615     "fmax z28.s, p5/M, z28.s, z12.s\n"
616     "fmin z24.s, p5/M, z24.s, z10.s\n"
617     ".inst 0xc08234d6  // mova z22.s, p5/M, za1h.s[x13, #2]\n"
618     "fmax z27.s, p5/M, z27.s, z12.s\n"
619     "fmin z23.s, p5/M, z23.s, z10.s\n"
620     ".inst 0xc08214d5  // mova z21.s, p5/M, za1h.s[x12, #2]\n"
621     "fmax z26.s, p5/M, z26.s, z12.s\n"
622     "fmin z22.s, p5/M, z22.s, z10.s\n"
623     "add x20, %x[output], %x[output_col_stride], LSL #2\n"
624     ".inst 0xc08254d4  // mova z20.s, p5/M, za1h.s[x14, #2]\n"
625     "fmax z25.s, p5/M, z25.s, z12.s\n"
626     "fmin z21.s, p5/M, z21.s, z10.s\n"
627     "add x19, %x[output], %x[output_row_stride], LSL #2\n"
628     ".inst 0xc08274f3  // mova z19.s, p5/M, za1h.s[XZR, #3]\n"
629     "fmax z24.s, p5/M, z24.s, z12.s\n"
630     "fmin z20.s, p5/M, z20.s, z10.s\n"
631     ".inst 0xc08234f2  // mova z18.s, p5/M, za1h.s[x13, #3]\n"
632     "fmax z23.s, p5/M, z23.s, z12.s\n"
633     "fmin z19.s, p5/M, z19.s, z10.s\n"
634     ".inst 0xc08214f1  // mova z17.s, p5/M, za1h.s[x12, #3]\n"
635     "fmax z22.s, p5/M, z22.s, z12.s\n"
636     "fmin z18.s, p5/M, z18.s, z10.s\n"
637     ".inst 0xc08254f0  // mova z16.s, p5/M, za1h.s[x14, #3]\n"
638     "fmax z21.s, p5/M, z21.s, z12.s\n"
639     "fmin z17.s, p5/M, z17.s, z10.s\n"
640     "fmax z20.s, p5/M, z20.s, z12.s\n"
641     "fmin z16.s, p5/M, z16.s, z10.s\n"
642     "st1w { z31.s }, p0, [%x[output], x24, LSL #2]\n"
643     "fmax z19.s, p5/M, z19.s, z12.s\n"
644     "st1w { z30.s }, p0, [x20, x24, LSL #2]\n"
645     "add x20, x20, %x[output_col_stride], LSL #2\n"
646     "fmax z18.s, p5/M, z18.s, z12.s\n"
647     "st1w { z29.s }, p0, [x20, x24, LSL #2]\n"
648     "add x20, x20, %x[output_col_stride], LSL #2\n"
649     "fmax z17.s, p5/M, z17.s, z12.s\n"
650     "st1w { z28.s }, p0, [x20, x24, LSL #2]\n"
651     "add x20, x19, %x[output_col_stride], LSL #2\n"
652     "fmax z16.s, p5/M, z16.s, z12.s\n"
653     "st1w { z27.s }, p0, [x19, x24, LSL #2]\n"
654     "add x19, x19, %x[output_row_stride], LSL #2\n"
655     "st1w { z26.s }, p0, [x20, x24, LSL #2]\n"
656     "add x20, x20, %x[output_col_stride], LSL #2\n"
657     "st1w { z25.s }, p0, [x20, x24, LSL #2]\n"
658     "add x20, x20, %x[output_col_stride], LSL #2\n"
659     "st1w { z24.s }, p0, [x20, x24, LSL #2]\n"
660     "add x20, x19, %x[output_col_stride], LSL #2\n"
661     "st1w { z23.s }, p0, [x19, x24, LSL #2]\n"
662     "add x19, x19, %x[output_row_stride], LSL #2\n"
663     "st1w { z22.s }, p0, [x20, x24, LSL #2]\n"
664     "add x20, x20, %x[output_col_stride], LSL #2\n"
665     "st1w { z21.s }, p0, [x20, x24, LSL #2]\n"
666     "add x20, x20, %x[output_col_stride], LSL #2\n"
667     "st1w { z20.s }, p0, [x20, x24, LSL #2]\n"
668     "add x20, x19, %x[output_col_stride], LSL #2\n"
669     "st1w { z19.s }, p0, [x19, x24, LSL #2]\n"
670     "st1w { z18.s }, p0, [x20, x24, LSL #2]\n"
671     "add x20, x20, %x[output_col_stride], LSL #2\n"
672     "st1w { z17.s }, p0, [x20, x24, LSL #2]\n"
673     "add x20, x20, %x[output_col_stride], LSL #2\n"
674     "st1w { z16.s }, p0, [x20, x24, LSL #2]\n"
675     "whilelt p0.s, x23, %x[n_channels]\n"
676     "b.none 3f\n"
677     ".inst 0xc082751f  // mova z31.s, p5/M, za2h.s[XZR]\n"
678     ".inst 0xc082351e  // mova z30.s, p5/M, za2h.s[x13]\n"
679     "fmin z31.s, p5/M, z31.s, z10.s\n"
680     ".inst 0xc082151d  // mova z29.s, p5/M, za2h.s[x12]\n"
681     "fmin z30.s, p5/M, z30.s, z10.s\n"
682     ".inst 0xc082551c  // mova z28.s, p5/M, za2h.s[x14]\n"
683     "fmin z29.s, p5/M, z29.s, z10.s\n"
684     ".inst 0xc082753b  // mova z27.s, p5/M, za2h.s[XZR, #1]\n"
685     "fmin z28.s, p5/M, z28.s, z10.s\n"
686     ".inst 0xc082353a  // mova z26.s, p5/M, za2h.s[x13, #1]\n"
687     "fmax z31.s, p5/M, z31.s, z12.s\n"
688     "fmin z27.s, p5/M, z27.s, z10.s\n"
689     ".inst 0xc0821539  // mova z25.s, p5/M, za2h.s[x12, #1]\n"
690     "fmax z30.s, p5/M, z30.s, z12.s\n"
691     "fmin z26.s, p5/M, z26.s, z10.s\n"
692     ".inst 0xc0825538  // mova z24.s, p5/M, za2h.s[x14, #1]\n"
693     "fmax z29.s, p5/M, z29.s, z12.s\n"
694     "fmin z25.s, p5/M, z25.s, z10.s\n"
695     ".inst 0xc0827557  // mova z23.s, p5/M, za2h.s[XZR, #2]\n"
696     "fmax z28.s, p5/M, z28.s, z12.s\n"
697     "fmin z24.s, p5/M, z24.s, z10.s\n"
698     ".inst 0xc0823556  // mova z22.s, p5/M, za2h.s[x13, #2]\n"
699     "fmax z27.s, p5/M, z27.s, z12.s\n"
700     "fmin z23.s, p5/M, z23.s, z10.s\n"
701     ".inst 0xc0821555  // mova z21.s, p5/M, za2h.s[x12, #2]\n"
702     "fmax z26.s, p5/M, z26.s, z12.s\n"
703     "fmin z22.s, p5/M, z22.s, z10.s\n"
704     "add x20, %x[output], %x[output_col_stride], LSL #2\n"
705     ".inst 0xc0825554  // mova z20.s, p5/M, za2h.s[x14, #2]\n"
706     "fmax z25.s, p5/M, z25.s, z12.s\n"
707     "fmin z21.s, p5/M, z21.s, z10.s\n"
708     "add x19, %x[output], %x[output_row_stride], LSL #2\n"
709     ".inst 0xc0827573  // mova z19.s, p5/M, za2h.s[XZR, #3]\n"
710     "fmax z24.s, p5/M, z24.s, z12.s\n"
711     "fmin z20.s, p5/M, z20.s, z10.s\n"
712     ".inst 0xc0823572  // mova z18.s, p5/M, za2h.s[x13, #3]\n"
713     "fmax z23.s, p5/M, z23.s, z12.s\n"
714     "fmin z19.s, p5/M, z19.s, z10.s\n"
715     ".inst 0xc0821571  // mova z17.s, p5/M, za2h.s[x12, #3]\n"
716     "fmax z22.s, p5/M, z22.s, z12.s\n"
717     "fmin z18.s, p5/M, z18.s, z10.s\n"
718     ".inst 0xc0825570  // mova z16.s, p5/M, za2h.s[x14, #3]\n"
719     "fmax z21.s, p5/M, z21.s, z12.s\n"
720     "fmin z17.s, p5/M, z17.s, z10.s\n"
721     "fmax z20.s, p5/M, z20.s, z12.s\n"
722     "fmin z16.s, p5/M, z16.s, z10.s\n"
723     "st1w { z31.s }, p0, [%x[output], x23, LSL #2]\n"
724     "fmax z19.s, p5/M, z19.s, z12.s\n"
725     "st1w { z30.s }, p0, [x20, x23, LSL #2]\n"
726     "add x20, x20, %x[output_col_stride], LSL #2\n"
727     "fmax z18.s, p5/M, z18.s, z12.s\n"
728     "st1w { z29.s }, p0, [x20, x23, LSL #2]\n"
729     "add x20, x20, %x[output_col_stride], LSL #2\n"
730     "fmax z17.s, p5/M, z17.s, z12.s\n"
731     "st1w { z28.s }, p0, [x20, x23, LSL #2]\n"
732     "add x20, x19, %x[output_col_stride], LSL #2\n"
733     "fmax z16.s, p5/M, z16.s, z12.s\n"
734     "st1w { z27.s }, p0, [x19, x23, LSL #2]\n"
735     "add x19, x19, %x[output_row_stride], LSL #2\n"
736     "st1w { z26.s }, p0, [x20, x23, LSL #2]\n"
737     "add x20, x20, %x[output_col_stride], LSL #2\n"
738     "st1w { z25.s }, p0, [x20, x23, LSL #2]\n"
739     "add x20, x20, %x[output_col_stride], LSL #2\n"
740     "st1w { z24.s }, p0, [x20, x23, LSL #2]\n"
741     "add x20, x19, %x[output_col_stride], LSL #2\n"
742     "st1w { z23.s }, p0, [x19, x23, LSL #2]\n"
743     "add x19, x19, %x[output_row_stride], LSL #2\n"
744     "st1w { z22.s }, p0, [x20, x23, LSL #2]\n"
745     "add x20, x20, %x[output_col_stride], LSL #2\n"
746     "st1w { z21.s }, p0, [x20, x23, LSL #2]\n"
747     "add x20, x20, %x[output_col_stride], LSL #2\n"
748     "st1w { z20.s }, p0, [x20, x23, LSL #2]\n"
749     "add x20, x19, %x[output_col_stride], LSL #2\n"
750     "st1w { z19.s }, p0, [x19, x23, LSL #2]\n"
751     "st1w { z18.s }, p0, [x20, x23, LSL #2]\n"
752     "add x20, x20, %x[output_col_stride], LSL #2\n"
753     "st1w { z17.s }, p0, [x20, x23, LSL #2]\n"
754     "add x20, x20, %x[output_col_stride], LSL #2\n"
755     "st1w { z16.s }, p0, [x20, x23, LSL #2]\n"
756     "whilelt p0.s, x22, %x[n_channels]\n"
757     "b.none 3f\n"
758     "fmov z1.s, #1.0\n"
759     ".inst 0xc082759f  // mova z31.s, p5/M, za3h.s[XZR]\n"
760     ".inst 0xc082359e  // mova z30.s, p5/M, za3h.s[x13]\n"
761     "fmin z31.s, p5/M, z31.s, z10.s\n"
762     ".inst 0xc082159d  // mova z29.s, p5/M, za3h.s[x12]\n"
763     "fmin z30.s, p5/M, z30.s, z10.s\n"
764     ".inst 0xc082559c  // mova z28.s, p5/M, za3h.s[x14]\n"
765     "fmin z29.s, p5/M, z29.s, z10.s\n"
766     ".inst 0xc08275bb  // mova z27.s, p5/M, za3h.s[XZR, #1]\n"
767     "fmin z28.s, p5/M, z28.s, z10.s\n"
768     ".inst 0xc08235ba  // mova z26.s, p5/M, za3h.s[x13, #1]\n"
769     "fmax z31.s, p5/M, z31.s, z12.s\n"
770     "fmin z27.s, p5/M, z27.s, z10.s\n"
771     ".inst 0xc08215b9  // mova z25.s, p5/M, za3h.s[x12, #1]\n"
772     "fmax z30.s, p5/M, z30.s, z12.s\n"
773     "fmin z26.s, p5/M, z26.s, z10.s\n"
774     ".inst 0xc08255b8  // mova z24.s, p5/M, za3h.s[x14, #1]\n"
775     "fmax z29.s, p5/M, z29.s, z12.s\n"
776     "fmin z25.s, p5/M, z25.s, z10.s\n"
777     ".inst 0xc08275d7  // mova z23.s, p5/M, za3h.s[XZR, #2]\n"
778     "fmax z28.s, p5/M, z28.s, z12.s\n"
779     "fmin z24.s, p5/M, z24.s, z10.s\n"
780     ".inst 0xc08235d6  // mova z22.s, p5/M, za3h.s[x13, #2]\n"
781     "fmax z27.s, p5/M, z27.s, z12.s\n"
782     "fmin z23.s, p5/M, z23.s, z10.s\n"
783     ".inst 0xc08215d5  // mova z21.s, p5/M, za3h.s[x12, #2]\n"
784     "fmax z26.s, p5/M, z26.s, z12.s\n"
785     "fmin z22.s, p5/M, z22.s, z10.s\n"
786     ".inst 0xc08255d4  // mova z20.s, p5/M, za3h.s[x14, #2]\n"
787     "fmax z25.s, p5/M, z25.s, z12.s\n"
788     "fmin z21.s, p5/M, z21.s, z10.s\n"
789     "add x20, %x[output], %x[output_col_stride], LSL #2\n"
790     ".inst 0xc08275f3  // mova z19.s, p5/M, za3h.s[XZR, #3]\n"
791     "fmax z24.s, p5/M, z24.s, z12.s\n"
792     "fmin z20.s, p5/M, z20.s, z10.s\n"
793     "add x19, %x[output], %x[output_row_stride], LSL #2\n"
794     ".inst 0xc08235f2  // mova z18.s, p5/M, za3h.s[x13, #3]\n"
795     "fmax z23.s, p5/M, z23.s, z12.s\n"
796     "fmin z19.s, p5/M, z19.s, z10.s\n"
797     "incw x25, ALL, MUL #4\n"
798     ".inst 0xc08215f1  // mova z17.s, p5/M, za3h.s[x12, #3]\n"
799     "fmax z22.s, p5/M, z22.s, z12.s\n"
800     "fmin z18.s, p5/M, z18.s, z10.s\n"
801     "incw x24, ALL, MUL #4\n"
802     ".inst 0xc08255f0  // mova z16.s, p5/M, za3h.s[x14, #3]\n"
803     "fmax z21.s, p5/M, z21.s, z12.s\n"
804     "fmin z17.s, p5/M, z17.s, z10.s\n"
805     "incw x23, ALL, MUL #4\n"
806     ".inst 0xc00800ff  // zero { zad0, zad1, zad2, zad3, zad4, zad5, zad6, zad7 }\n"
807     "fmax z20.s, p5/M, z20.s, z12.s\n"
808     "fmin z16.s, p5/M, z16.s, z10.s\n"
809     "add x21, %x[inptr], %x[matrix_stride], LSL #2\n"
810     "fmax z19.s, p5/M, z19.s, z12.s\n"
811     "st1w { z31.s }, p0, [%x[output], x22, LSL #2]\n"
812     "fmax z18.s, p5/M, z18.s, z12.s\n"
813     "st1w { z30.s }, p0, [x20, x22, LSL #2]\n"
814     "add x20, x20, %x[output_col_stride], LSL #2\n"
815     "fmax z17.s, p5/M, z17.s, z12.s\n"
816     "st1w { z29.s }, p0, [x20, x22, LSL #2]\n"
817     "add x20, x20, %x[output_col_stride], LSL #2\n"
818     "fmax z16.s, p5/M, z16.s, z12.s\n"
819     "st1w { z28.s }, p0, [x20, x22, LSL #2]\n"
820     "add x20, x19, %x[output_col_stride], LSL #2\n"
821     "st1w { z27.s }, p0, [x19, x22, LSL #2]\n"
822     "add x19, x19, %x[output_row_stride], LSL #2\n"
823     "st1w { z26.s }, p0, [x20, x22, LSL #2]\n"
824     "add x20, x20, %x[output_col_stride], LSL #2\n"
825     "st1w { z25.s }, p0, [x20, x22, LSL #2]\n"
826     "add x20, x20, %x[output_col_stride], LSL #2\n"
827     "st1w { z24.s }, p0, [x20, x22, LSL #2]\n"
828     "add x20, x19, %x[output_col_stride], LSL #2\n"
829     "st1w { z23.s }, p0, [x19, x22, LSL #2]\n"
830     "add x19, x19, %x[output_row_stride], LSL #2\n"
831     "st1w { z22.s }, p0, [x20, x22, LSL #2]\n"
832     "add x20, x20, %x[output_col_stride], LSL #2\n"
833     "st1w { z21.s }, p0, [x20, x22, LSL #2]\n"
834     "add x20, x20, %x[output_col_stride], LSL #2\n"
835     "st1w { z20.s }, p0, [x20, x22, LSL #2]\n"
836     "add x20, x19, %x[output_col_stride], LSL #2\n"
837     "st1w { z19.s }, p0, [x19, x22, LSL #2]\n"
838     "st1w { z18.s }, p0, [x20, x22, LSL #2]\n"
839     "add x20, x20, %x[output_col_stride], LSL #2\n"
840     "st1w { z17.s }, p0, [x20, x22, LSL #2]\n"
841     "add x20, x20, %x[output_col_stride], LSL #2\n"
842     "st1w { z16.s }, p0, [x20, x22, LSL #2]\n"
843     "incw x22, ALL, MUL #4\n"
844     "whilelt p1.s, x22, %x[n_channels]\n"
845     "ld1w { z28.s }, p1/Z, [%x[inptr], x22, LSL #2]\n"
846     "ld1w { z24.s }, p1/Z, [x21, x22, LSL #2]\n"
847     "whilelt p2.s, x23, %x[n_channels]\n"
848     "whilelt p3.s, x24, %x[n_channels]\n"
849     "ld1w { z30.s }, p3/Z, [%x[inptr], x24, LSL #2]\n"
850     "whilelt p4.s, x25, %x[n_channels]\n"
851     "ld1w { z31.s }, p4/Z, [%x[inptr], x25, LSL #2]\n"
852     "and p0.b, p5/Z, p8.b, p4.b\n"
853     "ld1w { z29.s }, p2/Z, [%x[inptr], x23, LSL #2]\n"
854     "ld1w { z27.s }, p4/Z, [x21, x25, LSL #2]\n"
855     "ld1w { z26.s }, p3/Z, [x21, x24, LSL #2]\n"
856     "ld1w { z25.s }, p2/Z, [x21, x23, LSL #2]\n"
857     "add x21, x21, %x[matrix_stride], LSL #2\n"
858     "ld1w { z23.s }, p4/Z, [x21, x25, LSL #2]\n"
859     "ld1w { z22.s }, p3/Z, [x21, x24, LSL #2]\n"
860     "ld1w { z21.s }, p2/Z, [x21, x23, LSL #2]\n"
861     "ld1w { z20.s }, p1/Z, [x21, x22, LSL #2]\n"
862     "add x21, x21, %x[matrix_stride], LSL #2\n"
863     "ld1w { z19.s }, p4/Z, [x21, x25, LSL #2]\n"
864     "ld1w { z18.s }, p3/Z, [x21, x24, LSL #2]\n"
865     "ld1w { z17.s }, p2/Z, [x21, x23, LSL #2]\n"
866     "ld1w { z16.s }, p1/Z, [x21, x22, LSL #2]\n"
867     "ld1w { z0.s }, p0/Z, [%x[bptr], x25, LSL #2]\n"
868     "and p0.b, p5/Z, p8.b, p3.b\n"
869     ".inst 0x8080b420  // fmopa za0.s, p5/M, p5/M, z1.s, z0.s\n"
870     "ld1w { z0.s }, p0/Z, [%x[bptr], x24, LSL #2]\n"
871     "and p0.b, p5/Z, p8.b, p2.b\n"
872     ".inst 0x8080b421  // fmopa za1.s, p5/M, p5/M, z1.s, z0.s\n"
873     "ld1w { z0.s }, p0/Z, [%x[bptr], x23, LSL #2]\n"
874     "and p0.b, p5/Z, p8.b, p1.b\n"
875     ".inst 0x8080b422  // fmopa za2.s, p5/M, p5/M, z1.s, z0.s\n"
876     "ld1w { z0.s }, p0/Z, [%x[bptr], x22, LSL #2]\n"
877     ".inst 0x8080b423  // fmopa za3.s, p5/M, p5/M, z1.s, z0.s\n"
878     "b.any 2b\n"
879     "3:"  // End
880     ".inst 0xd503467f  // SMSTOP\n"
881     :
882     : [bptr] "r" (bptr), [inptr] "r" (inptr), [matrix_stride] "r" (matrix_stride), [n_channels] "r" (n_channels), [offsetof_Params_act_max] "I" (offsetof(Params, act_max)), [offsetof_Params_act_min] "I" (offsetof(Params, act_min)), [offsetof_Params_inner_terms] "I" (offsetof(Params, inner_terms)), [offsetof_Params_outer_terms] "I" (offsetof(Params, outer_terms)), [output] "r" (output), [output_col_stride] "r" (output_col_stride), [output_row_stride] "r" (output_row_stride), [params] "r" (&params)
883     : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p8", "x12", "x13", "x14", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
884   );
885 }
886 
887 }  // namespace output_transform
888 }  // namespace winograd
889 }  // namespace arm_conv
890 
891 #endif //defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SME)
892 
893