• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2016 The Gemmlowp Authors. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //     http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #ifndef GEMMLOWP_META_TRANSFORM_KERNELS_ARM_64_H_
16 #define GEMMLOWP_META_TRANSFORM_KERNELS_ARM_64_H_
17 
18 #ifdef GEMMLOWP_NEON_64
19 
20 #include <cassert>
21 #include <cstdint>
22 
23 namespace gemmlowp {
24 namespace meta {
25 
26 template <>
Transform(const int32_t * input,const Requantize & params,uint8_t * output)27 inline void Transform1DKernel<int32_t, uint8_t, Requantize, 16, 0>::Transform(
28     const int32_t* input, const Requantize& params, uint8_t* output) {
29 #ifdef DEBUG
30 #ifdef DEBUG_METAGEMM_VERBOSE
31   std::cout << __FILE__ << "(" << __LINE__
32             << ") Requantize<int32_t, uint8_t, Requantize, 16, 0>::Transform()"
33             << std::endl
34             << std::flush;
35 #endif
36 #endif
37   int params_count_copy = params.count;
38   asm volatile(
39 
40       // Requantize::Prepare
41       "dup v4.4s, %w[input_range_min]\n"
42       "dup v5.4s, %w[output_range_min]\n"
43       "dup v6.4s, %w[input_range_offset]\n"
44       "dup v7.4s, %w[input_range_scale]\n"
45       "dup v8.4s, %w[one_over_output_range_scale]\n"
46       "fsub v4.4s, v4.4s, v5.4s\n"
47 
48       "1:"
49       "subs %x[count], %x[count], #16\n"
50 
51       // Requantize::Transform
52       "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
53       "prfm pldl1keep, [%x[input], #64]\n"
54       "scvtf v0.4s, v0.4s\n"
55       "scvtf v1.4s, v1.4s\n"
56       "scvtf v2.4s, v2.4s\n"
57       "scvtf v3.4s, v3.4s\n"
58       "fsub v0.4s, v0.4s, v6.4s\n"
59       "fsub v1.4s, v1.4s, v6.4s\n"
60       "fsub v2.4s, v2.4s, v6.4s\n"
61       "fsub v3.4s, v3.4s, v6.4s\n"
62       "fmul v0.4s, v0.4s, v7.4s\n"
63       "fmul v1.4s, v1.4s, v7.4s\n"
64       "fmul v2.4s, v2.4s, v7.4s\n"
65       "fmul v3.4s, v3.4s, v7.4s\n"
66       "fadd v0.4s, v0.4s, v4.4s\n"
67       "fadd v1.4s, v1.4s, v4.4s\n"
68       "fadd v2.4s, v2.4s, v4.4s\n"
69       "fadd v3.4s, v3.4s, v4.4s\n"
70       "fmul v0.4s, v0.4s, v8.4s\n"
71       "fmul v1.4s, v1.4s, v8.4s\n"
72       "fmul v2.4s, v2.4s, v8.4s\n"
73       "fmul v3.4s, v3.4s, v8.4s\n"
74       "fcvtzs v0.4s, v0.4s\n"
75       "fcvtzs v1.4s, v1.4s\n"
76       "fcvtzs v2.4s, v2.4s\n"
77       "fcvtzs v3.4s, v3.4s\n"
78       "sqxtn v0.4h, v0.4s\n"
79       "sqxtn2 v0.8h, v1.4s\n"
80       "sqxtn v2.4h, v2.4s\n"
81       "sqxtn2 v2.8h, v3.4s\n"
82       "sqxtun v0.8b, v0.8h\n"
83       "sqxtun2 v0.16b, v2.8h\n"
84 
85       "st1 {v0.4s}, [%x[output]], #16\n"
86       "prfm pldl1keep, [%x[output]]\n"
87 
88       "bne 1b\n"
89       : [count] "+r"(params_count_copy), [input] "+r"(input),
90         [output] "+r"(output)
91       : [input_range_min] "r"(params.input_range_min),
92         [output_range_min] "r"(params.output_range_min),
93         [input_range_offset] "r"(params.input_range_offset),
94         [one_over_output_range_scale] "r"(params.one_over_output_range_scale),
95         [input_range_scale] "r"(params.input_range_scale)
96       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "cc", "memory");
97 }
98 
99 template <>
Transform(const int32_t * input,const Requantize & params,uint8_t * output)100 inline void Transform1DKernel<int32_t, uint8_t, Requantize, 16, 1>::Transform(
101     const int32_t* input, const Requantize& params, uint8_t* output) {
102 #ifdef DEBUG
103 #ifdef DEBUG_METAGEMM_VERBOSE
104   std::cout << __FILE__ << "(" << __LINE__
105             << ") Requantize<int32_t, uint8_t, Requantize, 16, 1>::Transform()"
106             << std::endl
107             << std::flush;
108 #endif
109 #endif
110   int params_count_copy = params.count;
111   asm volatile(
112 
113       // Requantize::Prepare
114       "dup v4.4s, %w[input_range_min]\n"
115       "dup v5.4s, %w[output_range_min]\n"
116       "dup v6.4s, %w[input_range_offset]\n"
117       "dup v7.4s, %w[input_range_scale]\n"
118       "dup v8.4s, %w[one_over_output_range_scale]\n"
119       "fsub v4.4s, v4.4s, v5.4s\n"
120 
121       // Reduce count by leftovers.
122       "subs %x[count], %x[count], #1\n"
123       "beq 2f\n"
124 
125       "1:"
126       "subs %x[count], %x[count], #16\n"
127 
128       // Requantize::Transform
129       "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
130       "prfm pldl1keep, [%x[input], #64]\n"
131       "scvtf v0.4s, v0.4s\n"
132       "scvtf v1.4s, v1.4s\n"
133       "scvtf v2.4s, v2.4s\n"
134       "scvtf v3.4s, v3.4s\n"
135       "fsub v0.4s, v0.4s, v6.4s\n"
136       "fsub v1.4s, v1.4s, v6.4s\n"
137       "fsub v2.4s, v2.4s, v6.4s\n"
138       "fsub v3.4s, v3.4s, v6.4s\n"
139       "fmul v0.4s, v0.4s, v7.4s\n"
140       "fmul v1.4s, v1.4s, v7.4s\n"
141       "fmul v2.4s, v2.4s, v7.4s\n"
142       "fmul v3.4s, v3.4s, v7.4s\n"
143       "fadd v0.4s, v0.4s, v4.4s\n"
144       "fadd v1.4s, v1.4s, v4.4s\n"
145       "fadd v2.4s, v2.4s, v4.4s\n"
146       "fadd v3.4s, v3.4s, v4.4s\n"
147       "fmul v0.4s, v0.4s, v8.4s\n"
148       "fmul v1.4s, v1.4s, v8.4s\n"
149       "fmul v2.4s, v2.4s, v8.4s\n"
150       "fmul v3.4s, v3.4s, v8.4s\n"
151       "fcvtzs v0.4s, v0.4s\n"
152       "fcvtzs v1.4s, v1.4s\n"
153       "fcvtzs v2.4s, v2.4s\n"
154       "fcvtzs v3.4s, v3.4s\n"
155       "sqxtn v0.4h, v0.4s\n"
156       "sqxtn2 v0.8h, v1.4s\n"
157       "sqxtn v2.4h, v2.4s\n"
158       "sqxtn2 v2.8h, v3.4s\n"
159       "sqxtun v0.8b, v0.8h\n"
160       "sqxtun2 v0.16b, v2.8h\n"
161 
162       "st1 {v0.4s}, [%x[output]], #16\n"
163       "prfm pldl1keep, [%x[output]]\n"
164 
165       "bne 1b\n"
166       "2:"
167 
168       // Handle leftovers.
169 
170       // Requantize::Transform
171       "ld1 {v0.s}[0], [%x[input]], #4\n"
172       "prfm pldl1keep, [%x[input], #64]\n"
173       "scvtf v0.4s, v0.4s\n"
174       "fsub v0.4s, v0.4s, v6.4s\n"
175       "fmul v0.4s, v0.4s, v7.4s\n"
176       "fadd v0.4s, v0.4s, v4.4s\n"
177       "fmul v0.4s, v0.4s, v8.4s\n"
178       "fcvtzs v0.4s, v0.4s\n"
179       "sqxtn v0.4h, v0.4s\n"
180       "sqxtun v0.8b, v0.8h\n"
181 
182       "st1 {v0.b}[0], [%x[output]], #1\n"
183       "prfm pldl1keep, [%x[output]]\n"
184       : [count] "+r"(params_count_copy), [input] "+r"(input),
185         [output] "+r"(output)
186       : [input_range_min] "r"(params.input_range_min),
187         [output_range_min] "r"(params.output_range_min),
188         [input_range_offset] "r"(params.input_range_offset),
189         [one_over_output_range_scale] "r"(params.one_over_output_range_scale),
190         [input_range_scale] "r"(params.input_range_scale)
191       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "cc", "memory");
192 }
193 
194 template <>
Transform(const int32_t * input,const Requantize & params,uint8_t * output)195 inline void Transform1DKernel<int32_t, uint8_t, Requantize, 16, 2>::Transform(
196     const int32_t* input, const Requantize& params, uint8_t* output) {
197 #ifdef DEBUG
198 #ifdef DEBUG_METAGEMM_VERBOSE
199   std::cout << __FILE__ << "(" << __LINE__
200             << ") Requantize<int32_t, uint8_t, Requantize, 16, 2>::Transform()"
201             << std::endl
202             << std::flush;
203 #endif
204 #endif
205   int params_count_copy = params.count;
206   asm volatile(
207 
208       // Requantize::Prepare
209       "dup v4.4s, %w[input_range_min]\n"
210       "dup v5.4s, %w[output_range_min]\n"
211       "dup v6.4s, %w[input_range_offset]\n"
212       "dup v7.4s, %w[input_range_scale]\n"
213       "dup v8.4s, %w[one_over_output_range_scale]\n"
214       "fsub v4.4s, v4.4s, v5.4s\n"
215 
216       // Reduce count by leftovers.
217       "subs %x[count], %x[count], #2\n"
218       "beq 2f\n"
219 
220       "1:"
221       "subs %x[count], %x[count], #16\n"
222 
223       // Requantize::Transform
224       "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
225       "prfm pldl1keep, [%x[input], #64]\n"
226       "scvtf v0.4s, v0.4s\n"
227       "scvtf v1.4s, v1.4s\n"
228       "scvtf v2.4s, v2.4s\n"
229       "scvtf v3.4s, v3.4s\n"
230       "fsub v0.4s, v0.4s, v6.4s\n"
231       "fsub v1.4s, v1.4s, v6.4s\n"
232       "fsub v2.4s, v2.4s, v6.4s\n"
233       "fsub v3.4s, v3.4s, v6.4s\n"
234       "fmul v0.4s, v0.4s, v7.4s\n"
235       "fmul v1.4s, v1.4s, v7.4s\n"
236       "fmul v2.4s, v2.4s, v7.4s\n"
237       "fmul v3.4s, v3.4s, v7.4s\n"
238       "fadd v0.4s, v0.4s, v4.4s\n"
239       "fadd v1.4s, v1.4s, v4.4s\n"
240       "fadd v2.4s, v2.4s, v4.4s\n"
241       "fadd v3.4s, v3.4s, v4.4s\n"
242       "fmul v0.4s, v0.4s, v8.4s\n"
243       "fmul v1.4s, v1.4s, v8.4s\n"
244       "fmul v2.4s, v2.4s, v8.4s\n"
245       "fmul v3.4s, v3.4s, v8.4s\n"
246       "fcvtzs v0.4s, v0.4s\n"
247       "fcvtzs v1.4s, v1.4s\n"
248       "fcvtzs v2.4s, v2.4s\n"
249       "fcvtzs v3.4s, v3.4s\n"
250       "sqxtn v0.4h, v0.4s\n"
251       "sqxtn2 v0.8h, v1.4s\n"
252       "sqxtn v2.4h, v2.4s\n"
253       "sqxtn2 v2.8h, v3.4s\n"
254       "sqxtun v0.8b, v0.8h\n"
255       "sqxtun2 v0.16b, v2.8h\n"
256 
257       "st1 {v0.4s}, [%x[output]], #16\n"
258       "prfm pldl1keep, [%x[output]]\n"
259 
260       "bne 1b\n"
261       "2:"
262 
263       // Handle leftovers.
264 
265       // Requantize::Transform
266       "ld1 {v0.2s}, [%x[input]], #8\n"
267       "prfm pldl1keep, [%x[input], #64]\n"
268       "scvtf v0.4s, v0.4s\n"
269       "fsub v0.4s, v0.4s, v6.4s\n"
270       "fmul v0.4s, v0.4s, v7.4s\n"
271       "fadd v0.4s, v0.4s, v4.4s\n"
272       "fmul v0.4s, v0.4s, v8.4s\n"
273       "fcvtzs v0.4s, v0.4s\n"
274       "sqxtn v0.4h, v0.4s\n"
275       "sqxtun v0.8b, v0.8h\n"
276 
277       "st1 {v0.h}[0], [%x[output]], #2\n"
278       "prfm pldl1keep, [%x[output]]\n"
279       : [count] "+r"(params_count_copy), [input] "+r"(input),
280         [output] "+r"(output)
281       : [input_range_min] "r"(params.input_range_min),
282         [output_range_min] "r"(params.output_range_min),
283         [input_range_offset] "r"(params.input_range_offset),
284         [one_over_output_range_scale] "r"(params.one_over_output_range_scale),
285         [input_range_scale] "r"(params.input_range_scale)
286       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "cc", "memory");
287 }
288 
289 template <>
Transform(const int32_t * input,const Requantize & params,uint8_t * output)290 inline void Transform1DKernel<int32_t, uint8_t, Requantize, 16, 3>::Transform(
291     const int32_t* input, const Requantize& params, uint8_t* output) {
292 #ifdef DEBUG
293 #ifdef DEBUG_METAGEMM_VERBOSE
294   std::cout << __FILE__ << "(" << __LINE__
295             << ") Requantize<int32_t, uint8_t, Requantize, 16, 3>::Transform()"
296             << std::endl
297             << std::flush;
298 #endif
299 #endif
300   int params_count_copy = params.count;
301   asm volatile(
302 
303       // Requantize::Prepare
304       "dup v4.4s, %w[input_range_min]\n"
305       "dup v5.4s, %w[output_range_min]\n"
306       "dup v6.4s, %w[input_range_offset]\n"
307       "dup v7.4s, %w[input_range_scale]\n"
308       "dup v8.4s, %w[one_over_output_range_scale]\n"
309       "fsub v4.4s, v4.4s, v5.4s\n"
310 
311       // Reduce count by leftovers.
312       "subs %x[count], %x[count], #3\n"
313       "beq 2f\n"
314 
315       "1:"
316       "subs %x[count], %x[count], #16\n"
317 
318       // Requantize::Transform
319       "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
320       "prfm pldl1keep, [%x[input], #64]\n"
321       "scvtf v0.4s, v0.4s\n"
322       "scvtf v1.4s, v1.4s\n"
323       "scvtf v2.4s, v2.4s\n"
324       "scvtf v3.4s, v3.4s\n"
325       "fsub v0.4s, v0.4s, v6.4s\n"
326       "fsub v1.4s, v1.4s, v6.4s\n"
327       "fsub v2.4s, v2.4s, v6.4s\n"
328       "fsub v3.4s, v3.4s, v6.4s\n"
329       "fmul v0.4s, v0.4s, v7.4s\n"
330       "fmul v1.4s, v1.4s, v7.4s\n"
331       "fmul v2.4s, v2.4s, v7.4s\n"
332       "fmul v3.4s, v3.4s, v7.4s\n"
333       "fadd v0.4s, v0.4s, v4.4s\n"
334       "fadd v1.4s, v1.4s, v4.4s\n"
335       "fadd v2.4s, v2.4s, v4.4s\n"
336       "fadd v3.4s, v3.4s, v4.4s\n"
337       "fmul v0.4s, v0.4s, v8.4s\n"
338       "fmul v1.4s, v1.4s, v8.4s\n"
339       "fmul v2.4s, v2.4s, v8.4s\n"
340       "fmul v3.4s, v3.4s, v8.4s\n"
341       "fcvtzs v0.4s, v0.4s\n"
342       "fcvtzs v1.4s, v1.4s\n"
343       "fcvtzs v2.4s, v2.4s\n"
344       "fcvtzs v3.4s, v3.4s\n"
345       "sqxtn v0.4h, v0.4s\n"
346       "sqxtn2 v0.8h, v1.4s\n"
347       "sqxtn v2.4h, v2.4s\n"
348       "sqxtn2 v2.8h, v3.4s\n"
349       "sqxtun v0.8b, v0.8h\n"
350       "sqxtun2 v0.16b, v2.8h\n"
351 
352       "st1 {v0.4s}, [%x[output]], #16\n"
353       "prfm pldl1keep, [%x[output]]\n"
354 
355       "bne 1b\n"
356       "2:"
357 
358       // Handle leftovers.
359 
360       // Requantize::Transform
361       "ld1 {v0.2s}, [%x[input]], #8\n"
362       "ld1 {v0.s}[2], [%x[input]], #4\n"
363       "prfm pldl1keep, [%x[input], #64]\n"
364       "scvtf v0.4s, v0.4s\n"
365       "fsub v0.4s, v0.4s, v6.4s\n"
366       "fmul v0.4s, v0.4s, v7.4s\n"
367       "fadd v0.4s, v0.4s, v4.4s\n"
368       "fmul v0.4s, v0.4s, v8.4s\n"
369       "fcvtzs v0.4s, v0.4s\n"
370       "sqxtn v0.4h, v0.4s\n"
371       "sqxtun v0.8b, v0.8h\n"
372 
373       "st1 {v0.h}[0], [%x[output]], #2\n"
374       "st1 {v0.b}[2], [%x[output]], #1\n"
375       "prfm pldl1keep, [%x[output]]\n"
376       : [count] "+r"(params_count_copy), [input] "+r"(input),
377         [output] "+r"(output)
378       : [input_range_min] "r"(params.input_range_min),
379         [output_range_min] "r"(params.output_range_min),
380         [input_range_offset] "r"(params.input_range_offset),
381         [one_over_output_range_scale] "r"(params.one_over_output_range_scale),
382         [input_range_scale] "r"(params.input_range_scale)
383       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "cc", "memory");
384 }
385 
386 template <>
Transform(const int32_t * input,const Requantize & params,uint8_t * output)387 inline void Transform1DKernel<int32_t, uint8_t, Requantize, 16, 4>::Transform(
388     const int32_t* input, const Requantize& params, uint8_t* output) {
389 #ifdef DEBUG
390 #ifdef DEBUG_METAGEMM_VERBOSE
391   std::cout << __FILE__ << "(" << __LINE__
392             << ") Requantize<int32_t, uint8_t, Requantize, 16, 4>::Transform()"
393             << std::endl
394             << std::flush;
395 #endif
396 #endif
397   int params_count_copy = params.count;
398   asm volatile(
399 
400       // Requantize::Prepare
401       "dup v4.4s, %w[input_range_min]\n"
402       "dup v5.4s, %w[output_range_min]\n"
403       "dup v6.4s, %w[input_range_offset]\n"
404       "dup v7.4s, %w[input_range_scale]\n"
405       "dup v8.4s, %w[one_over_output_range_scale]\n"
406       "fsub v4.4s, v4.4s, v5.4s\n"
407 
408       // Reduce count by leftovers.
409       "subs %x[count], %x[count], #4\n"
410       "beq 2f\n"
411 
412       "1:"
413       "subs %x[count], %x[count], #16\n"
414 
415       // Requantize::Transform
416       "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
417       "prfm pldl1keep, [%x[input], #64]\n"
418       "scvtf v0.4s, v0.4s\n"
419       "scvtf v1.4s, v1.4s\n"
420       "scvtf v2.4s, v2.4s\n"
421       "scvtf v3.4s, v3.4s\n"
422       "fsub v0.4s, v0.4s, v6.4s\n"
423       "fsub v1.4s, v1.4s, v6.4s\n"
424       "fsub v2.4s, v2.4s, v6.4s\n"
425       "fsub v3.4s, v3.4s, v6.4s\n"
426       "fmul v0.4s, v0.4s, v7.4s\n"
427       "fmul v1.4s, v1.4s, v7.4s\n"
428       "fmul v2.4s, v2.4s, v7.4s\n"
429       "fmul v3.4s, v3.4s, v7.4s\n"
430       "fadd v0.4s, v0.4s, v4.4s\n"
431       "fadd v1.4s, v1.4s, v4.4s\n"
432       "fadd v2.4s, v2.4s, v4.4s\n"
433       "fadd v3.4s, v3.4s, v4.4s\n"
434       "fmul v0.4s, v0.4s, v8.4s\n"
435       "fmul v1.4s, v1.4s, v8.4s\n"
436       "fmul v2.4s, v2.4s, v8.4s\n"
437       "fmul v3.4s, v3.4s, v8.4s\n"
438       "fcvtzs v0.4s, v0.4s\n"
439       "fcvtzs v1.4s, v1.4s\n"
440       "fcvtzs v2.4s, v2.4s\n"
441       "fcvtzs v3.4s, v3.4s\n"
442       "sqxtn v0.4h, v0.4s\n"
443       "sqxtn2 v0.8h, v1.4s\n"
444       "sqxtn v2.4h, v2.4s\n"
445       "sqxtn2 v2.8h, v3.4s\n"
446       "sqxtun v0.8b, v0.8h\n"
447       "sqxtun2 v0.16b, v2.8h\n"
448 
449       "st1 {v0.4s}, [%x[output]], #16\n"
450       "prfm pldl1keep, [%x[output]]\n"
451 
452       "bne 1b\n"
453       "2:"
454 
455       // Handle leftovers.
456 
457       // Requantize::Transform
458       "ld1 {v0.4s}, [%x[input]], #16\n"
459       "prfm pldl1keep, [%x[input], #64]\n"
460       "scvtf v0.4s, v0.4s\n"
461       "fsub v0.4s, v0.4s, v6.4s\n"
462       "fmul v0.4s, v0.4s, v7.4s\n"
463       "fadd v0.4s, v0.4s, v4.4s\n"
464       "fmul v0.4s, v0.4s, v8.4s\n"
465       "fcvtzs v0.4s, v0.4s\n"
466       "sqxtn v0.4h, v0.4s\n"
467       "sqxtun v0.8b, v0.8h\n"
468 
469       "st1 {v0.s}[0], [%x[output]], #4\n"
470       "prfm pldl1keep, [%x[output]]\n"
471       : [count] "+r"(params_count_copy), [input] "+r"(input),
472         [output] "+r"(output)
473       : [input_range_min] "r"(params.input_range_min),
474         [output_range_min] "r"(params.output_range_min),
475         [input_range_offset] "r"(params.input_range_offset),
476         [one_over_output_range_scale] "r"(params.one_over_output_range_scale),
477         [input_range_scale] "r"(params.input_range_scale)
478       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "cc", "memory");
479 }
480 
481 template <>
Transform(const int32_t * input,const Requantize & params,uint8_t * output)482 inline void Transform1DKernel<int32_t, uint8_t, Requantize, 16, 5>::Transform(
483     const int32_t* input, const Requantize& params, uint8_t* output) {
484 #ifdef DEBUG
485 #ifdef DEBUG_METAGEMM_VERBOSE
486   std::cout << __FILE__ << "(" << __LINE__
487             << ") Requantize<int32_t, uint8_t, Requantize, 16, 5>::Transform()"
488             << std::endl
489             << std::flush;
490 #endif
491 #endif
492   int params_count_copy = params.count;
493   asm volatile(
494 
495       // Requantize::Prepare
496       "dup v4.4s, %w[input_range_min]\n"
497       "dup v5.4s, %w[output_range_min]\n"
498       "dup v6.4s, %w[input_range_offset]\n"
499       "dup v7.4s, %w[input_range_scale]\n"
500       "dup v8.4s, %w[one_over_output_range_scale]\n"
501       "fsub v4.4s, v4.4s, v5.4s\n"
502 
503       // Reduce count by leftovers.
504       "subs %x[count], %x[count], #5\n"
505       "beq 2f\n"
506 
507       "1:"
508       "subs %x[count], %x[count], #16\n"
509 
510       // Requantize::Transform
511       "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
512       "prfm pldl1keep, [%x[input], #64]\n"
513       "scvtf v0.4s, v0.4s\n"
514       "scvtf v1.4s, v1.4s\n"
515       "scvtf v2.4s, v2.4s\n"
516       "scvtf v3.4s, v3.4s\n"
517       "fsub v0.4s, v0.4s, v6.4s\n"
518       "fsub v1.4s, v1.4s, v6.4s\n"
519       "fsub v2.4s, v2.4s, v6.4s\n"
520       "fsub v3.4s, v3.4s, v6.4s\n"
521       "fmul v0.4s, v0.4s, v7.4s\n"
522       "fmul v1.4s, v1.4s, v7.4s\n"
523       "fmul v2.4s, v2.4s, v7.4s\n"
524       "fmul v3.4s, v3.4s, v7.4s\n"
525       "fadd v0.4s, v0.4s, v4.4s\n"
526       "fadd v1.4s, v1.4s, v4.4s\n"
527       "fadd v2.4s, v2.4s, v4.4s\n"
528       "fadd v3.4s, v3.4s, v4.4s\n"
529       "fmul v0.4s, v0.4s, v8.4s\n"
530       "fmul v1.4s, v1.4s, v8.4s\n"
531       "fmul v2.4s, v2.4s, v8.4s\n"
532       "fmul v3.4s, v3.4s, v8.4s\n"
533       "fcvtzs v0.4s, v0.4s\n"
534       "fcvtzs v1.4s, v1.4s\n"
535       "fcvtzs v2.4s, v2.4s\n"
536       "fcvtzs v3.4s, v3.4s\n"
537       "sqxtn v0.4h, v0.4s\n"
538       "sqxtn2 v0.8h, v1.4s\n"
539       "sqxtn v2.4h, v2.4s\n"
540       "sqxtn2 v2.8h, v3.4s\n"
541       "sqxtun v0.8b, v0.8h\n"
542       "sqxtun2 v0.16b, v2.8h\n"
543 
544       "st1 {v0.4s}, [%x[output]], #16\n"
545       "prfm pldl1keep, [%x[output]]\n"
546 
547       "bne 1b\n"
548       "2:"
549 
550       // Handle leftovers.
551 
552       // Requantize::Transform
553       "ld1 {v0.4s}, [%x[input]], #16\n"
554       "ld1 {v1.s}[0], [%x[input]], #4\n"
555       "prfm pldl1keep, [%x[input], #64]\n"
556       "scvtf v0.4s, v0.4s\n"
557       "scvtf v1.4s, v1.4s\n"
558       "fsub v0.4s, v0.4s, v6.4s\n"
559       "fsub v1.4s, v1.4s, v6.4s\n"
560       "fmul v0.4s, v0.4s, v7.4s\n"
561       "fmul v1.4s, v1.4s, v7.4s\n"
562       "fadd v0.4s, v0.4s, v4.4s\n"
563       "fadd v1.4s, v1.4s, v4.4s\n"
564       "fmul v0.4s, v0.4s, v8.4s\n"
565       "fmul v1.4s, v1.4s, v8.4s\n"
566       "fcvtzs v0.4s, v0.4s\n"
567       "fcvtzs v1.4s, v1.4s\n"
568       "sqxtn v0.4h, v0.4s\n"
569       "sqxtn2 v0.8h, v1.4s\n"
570       "sqxtun v0.8b, v0.8h\n"
571 
572       "st1 {v0.s}[0], [%x[output]], #4\n"
573       "st1 {v0.b}[4], [%x[output]], #1\n"
574       "prfm pldl1keep, [%x[output]]\n"
575       : [count] "+r"(params_count_copy), [input] "+r"(input),
576         [output] "+r"(output)
577       : [input_range_min] "r"(params.input_range_min),
578         [output_range_min] "r"(params.output_range_min),
579         [input_range_offset] "r"(params.input_range_offset),
580         [one_over_output_range_scale] "r"(params.one_over_output_range_scale),
581         [input_range_scale] "r"(params.input_range_scale)
582       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "cc", "memory");
583 }
584 
585 template <>
Transform(const int32_t * input,const Requantize & params,uint8_t * output)586 inline void Transform1DKernel<int32_t, uint8_t, Requantize, 16, 6>::Transform(
587     const int32_t* input, const Requantize& params, uint8_t* output) {
588 #ifdef DEBUG
589 #ifdef DEBUG_METAGEMM_VERBOSE
590   std::cout << __FILE__ << "(" << __LINE__
591             << ") Requantize<int32_t, uint8_t, Requantize, 16, 6>::Transform()"
592             << std::endl
593             << std::flush;
594 #endif
595 #endif
596   int params_count_copy = params.count;
597   asm volatile(
598 
599       // Requantize::Prepare
600       "dup v4.4s, %w[input_range_min]\n"
601       "dup v5.4s, %w[output_range_min]\n"
602       "dup v6.4s, %w[input_range_offset]\n"
603       "dup v7.4s, %w[input_range_scale]\n"
604       "dup v8.4s, %w[one_over_output_range_scale]\n"
605       "fsub v4.4s, v4.4s, v5.4s\n"
606 
607       // Reduce count by leftovers.
608       "subs %x[count], %x[count], #6\n"
609       "beq 2f\n"
610 
611       "1:"
612       "subs %x[count], %x[count], #16\n"
613 
614       // Requantize::Transform
615       "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
616       "prfm pldl1keep, [%x[input], #64]\n"
617       "scvtf v0.4s, v0.4s\n"
618       "scvtf v1.4s, v1.4s\n"
619       "scvtf v2.4s, v2.4s\n"
620       "scvtf v3.4s, v3.4s\n"
621       "fsub v0.4s, v0.4s, v6.4s\n"
622       "fsub v1.4s, v1.4s, v6.4s\n"
623       "fsub v2.4s, v2.4s, v6.4s\n"
624       "fsub v3.4s, v3.4s, v6.4s\n"
625       "fmul v0.4s, v0.4s, v7.4s\n"
626       "fmul v1.4s, v1.4s, v7.4s\n"
627       "fmul v2.4s, v2.4s, v7.4s\n"
628       "fmul v3.4s, v3.4s, v7.4s\n"
629       "fadd v0.4s, v0.4s, v4.4s\n"
630       "fadd v1.4s, v1.4s, v4.4s\n"
631       "fadd v2.4s, v2.4s, v4.4s\n"
632       "fadd v3.4s, v3.4s, v4.4s\n"
633       "fmul v0.4s, v0.4s, v8.4s\n"
634       "fmul v1.4s, v1.4s, v8.4s\n"
635       "fmul v2.4s, v2.4s, v8.4s\n"
636       "fmul v3.4s, v3.4s, v8.4s\n"
637       "fcvtzs v0.4s, v0.4s\n"
638       "fcvtzs v1.4s, v1.4s\n"
639       "fcvtzs v2.4s, v2.4s\n"
640       "fcvtzs v3.4s, v3.4s\n"
641       "sqxtn v0.4h, v0.4s\n"
642       "sqxtn2 v0.8h, v1.4s\n"
643       "sqxtn v2.4h, v2.4s\n"
644       "sqxtn2 v2.8h, v3.4s\n"
645       "sqxtun v0.8b, v0.8h\n"
646       "sqxtun2 v0.16b, v2.8h\n"
647 
648       "st1 {v0.4s}, [%x[output]], #16\n"
649       "prfm pldl1keep, [%x[output]]\n"
650 
651       "bne 1b\n"
652       "2:"
653 
654       // Handle leftovers.
655 
656       // Requantize::Transform
657       "ld1 {v0.4s}, [%x[input]], #16\n"
658       "ld1 {v1.2s}, [%x[input]], #8\n"
659       "prfm pldl1keep, [%x[input], #64]\n"
660       "scvtf v0.4s, v0.4s\n"
661       "scvtf v1.4s, v1.4s\n"
662       "fsub v0.4s, v0.4s, v6.4s\n"
663       "fsub v1.4s, v1.4s, v6.4s\n"
664       "fmul v0.4s, v0.4s, v7.4s\n"
665       "fmul v1.4s, v1.4s, v7.4s\n"
666       "fadd v0.4s, v0.4s, v4.4s\n"
667       "fadd v1.4s, v1.4s, v4.4s\n"
668       "fmul v0.4s, v0.4s, v8.4s\n"
669       "fmul v1.4s, v1.4s, v8.4s\n"
670       "fcvtzs v0.4s, v0.4s\n"
671       "fcvtzs v1.4s, v1.4s\n"
672       "sqxtn v0.4h, v0.4s\n"
673       "sqxtn2 v0.8h, v1.4s\n"
674       "sqxtun v0.8b, v0.8h\n"
675 
676       "st1 {v0.s}[0], [%x[output]], #4\n"
677       "st1 {v0.h}[2], [%x[output]], #2\n"
678       "prfm pldl1keep, [%x[output]]\n"
679       : [count] "+r"(params_count_copy), [input] "+r"(input),
680         [output] "+r"(output)
681       : [input_range_min] "r"(params.input_range_min),
682         [output_range_min] "r"(params.output_range_min),
683         [input_range_offset] "r"(params.input_range_offset),
684         [one_over_output_range_scale] "r"(params.one_over_output_range_scale),
685         [input_range_scale] "r"(params.input_range_scale)
686       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "cc", "memory");
687 }
688 
689 template <>
Transform(const int32_t * input,const Requantize & params,uint8_t * output)690 inline void Transform1DKernel<int32_t, uint8_t, Requantize, 16, 7>::Transform(
691     const int32_t* input, const Requantize& params, uint8_t* output) {
692 #ifdef DEBUG
693 #ifdef DEBUG_METAGEMM_VERBOSE
694   std::cout << __FILE__ << "(" << __LINE__
695             << ") Requantize<int32_t, uint8_t, Requantize, 16, 7>::Transform()"
696             << std::endl
697             << std::flush;
698 #endif
699 #endif
700   int params_count_copy = params.count;
701   asm volatile(
702 
703       // Requantize::Prepare
704       "dup v4.4s, %w[input_range_min]\n"
705       "dup v5.4s, %w[output_range_min]\n"
706       "dup v6.4s, %w[input_range_offset]\n"
707       "dup v7.4s, %w[input_range_scale]\n"
708       "dup v8.4s, %w[one_over_output_range_scale]\n"
709       "fsub v4.4s, v4.4s, v5.4s\n"
710 
711       // Reduce count by leftovers.
712       "subs %x[count], %x[count], #7\n"
713       "beq 2f\n"
714 
715       "1:"
716       "subs %x[count], %x[count], #16\n"
717 
718       // Requantize::Transform
719       "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
720       "prfm pldl1keep, [%x[input], #64]\n"
721       "scvtf v0.4s, v0.4s\n"
722       "scvtf v1.4s, v1.4s\n"
723       "scvtf v2.4s, v2.4s\n"
724       "scvtf v3.4s, v3.4s\n"
725       "fsub v0.4s, v0.4s, v6.4s\n"
726       "fsub v1.4s, v1.4s, v6.4s\n"
727       "fsub v2.4s, v2.4s, v6.4s\n"
728       "fsub v3.4s, v3.4s, v6.4s\n"
729       "fmul v0.4s, v0.4s, v7.4s\n"
730       "fmul v1.4s, v1.4s, v7.4s\n"
731       "fmul v2.4s, v2.4s, v7.4s\n"
732       "fmul v3.4s, v3.4s, v7.4s\n"
733       "fadd v0.4s, v0.4s, v4.4s\n"
734       "fadd v1.4s, v1.4s, v4.4s\n"
735       "fadd v2.4s, v2.4s, v4.4s\n"
736       "fadd v3.4s, v3.4s, v4.4s\n"
737       "fmul v0.4s, v0.4s, v8.4s\n"
738       "fmul v1.4s, v1.4s, v8.4s\n"
739       "fmul v2.4s, v2.4s, v8.4s\n"
740       "fmul v3.4s, v3.4s, v8.4s\n"
741       "fcvtzs v0.4s, v0.4s\n"
742       "fcvtzs v1.4s, v1.4s\n"
743       "fcvtzs v2.4s, v2.4s\n"
744       "fcvtzs v3.4s, v3.4s\n"
745       "sqxtn v0.4h, v0.4s\n"
746       "sqxtn2 v0.8h, v1.4s\n"
747       "sqxtn v2.4h, v2.4s\n"
748       "sqxtn2 v2.8h, v3.4s\n"
749       "sqxtun v0.8b, v0.8h\n"
750       "sqxtun2 v0.16b, v2.8h\n"
751 
752       "st1 {v0.4s}, [%x[output]], #16\n"
753       "prfm pldl1keep, [%x[output]]\n"
754 
755       "bne 1b\n"
756       "2:"
757 
758       // Handle leftovers.
759 
760       // Requantize::Transform
761       "ld1 {v0.4s}, [%x[input]], #16\n"
762       "ld1 {v1.2s}, [%x[input]], #8\n"
763       "ld1 {v1.s}[2], [%x[input]], #4\n"
764       "prfm pldl1keep, [%x[input], #64]\n"
765       "scvtf v0.4s, v0.4s\n"
766       "scvtf v1.4s, v1.4s\n"
767       "fsub v0.4s, v0.4s, v6.4s\n"
768       "fsub v1.4s, v1.4s, v6.4s\n"
769       "fmul v0.4s, v0.4s, v7.4s\n"
770       "fmul v1.4s, v1.4s, v7.4s\n"
771       "fadd v0.4s, v0.4s, v4.4s\n"
772       "fadd v1.4s, v1.4s, v4.4s\n"
773       "fmul v0.4s, v0.4s, v8.4s\n"
774       "fmul v1.4s, v1.4s, v8.4s\n"
775       "fcvtzs v0.4s, v0.4s\n"
776       "fcvtzs v1.4s, v1.4s\n"
777       "sqxtn v0.4h, v0.4s\n"
778       "sqxtn2 v0.8h, v1.4s\n"
779       "sqxtun v0.8b, v0.8h\n"
780 
781       "st1 {v0.s}[0], [%x[output]], #4\n"
782       "st1 {v0.h}[2], [%x[output]], #2\n"
783       "st1 {v0.b}[6], [%x[output]], #1\n"
784       "prfm pldl1keep, [%x[output]]\n"
785       : [count] "+r"(params_count_copy), [input] "+r"(input),
786         [output] "+r"(output)
787       : [input_range_min] "r"(params.input_range_min),
788         [output_range_min] "r"(params.output_range_min),
789         [input_range_offset] "r"(params.input_range_offset),
790         [one_over_output_range_scale] "r"(params.one_over_output_range_scale),
791         [input_range_scale] "r"(params.input_range_scale)
792       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "cc", "memory");
793 }
794 
795 template <>
Transform(const int32_t * input,const Requantize & params,uint8_t * output)796 inline void Transform1DKernel<int32_t, uint8_t, Requantize, 16, 8>::Transform(
797     const int32_t* input, const Requantize& params, uint8_t* output) {
798 #ifdef DEBUG
799 #ifdef DEBUG_METAGEMM_VERBOSE
800   std::cout << __FILE__ << "(" << __LINE__
801             << ") Requantize<int32_t, uint8_t, Requantize, 16, 8>::Transform()"
802             << std::endl
803             << std::flush;
804 #endif
805 #endif
806   int params_count_copy = params.count;
807   asm volatile(
808 
809       // Requantize::Prepare
810       "dup v4.4s, %w[input_range_min]\n"
811       "dup v5.4s, %w[output_range_min]\n"
812       "dup v6.4s, %w[input_range_offset]\n"
813       "dup v7.4s, %w[input_range_scale]\n"
814       "dup v8.4s, %w[one_over_output_range_scale]\n"
815       "fsub v4.4s, v4.4s, v5.4s\n"
816 
817       // Reduce count by leftovers.
818       "subs %x[count], %x[count], #8\n"
819       "beq 2f\n"
820 
821       "1:"
822       "subs %x[count], %x[count], #16\n"
823 
824       // Requantize::Transform
825       "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
826       "prfm pldl1keep, [%x[input], #64]\n"
827       "scvtf v0.4s, v0.4s\n"
828       "scvtf v1.4s, v1.4s\n"
829       "scvtf v2.4s, v2.4s\n"
830       "scvtf v3.4s, v3.4s\n"
831       "fsub v0.4s, v0.4s, v6.4s\n"
832       "fsub v1.4s, v1.4s, v6.4s\n"
833       "fsub v2.4s, v2.4s, v6.4s\n"
834       "fsub v3.4s, v3.4s, v6.4s\n"
835       "fmul v0.4s, v0.4s, v7.4s\n"
836       "fmul v1.4s, v1.4s, v7.4s\n"
837       "fmul v2.4s, v2.4s, v7.4s\n"
838       "fmul v3.4s, v3.4s, v7.4s\n"
839       "fadd v0.4s, v0.4s, v4.4s\n"
840       "fadd v1.4s, v1.4s, v4.4s\n"
841       "fadd v2.4s, v2.4s, v4.4s\n"
842       "fadd v3.4s, v3.4s, v4.4s\n"
843       "fmul v0.4s, v0.4s, v8.4s\n"
844       "fmul v1.4s, v1.4s, v8.4s\n"
845       "fmul v2.4s, v2.4s, v8.4s\n"
846       "fmul v3.4s, v3.4s, v8.4s\n"
847       "fcvtzs v0.4s, v0.4s\n"
848       "fcvtzs v1.4s, v1.4s\n"
849       "fcvtzs v2.4s, v2.4s\n"
850       "fcvtzs v3.4s, v3.4s\n"
851       "sqxtn v0.4h, v0.4s\n"
852       "sqxtn2 v0.8h, v1.4s\n"
853       "sqxtn v2.4h, v2.4s\n"
854       "sqxtn2 v2.8h, v3.4s\n"
855       "sqxtun v0.8b, v0.8h\n"
856       "sqxtun2 v0.16b, v2.8h\n"
857 
858       "st1 {v0.4s}, [%x[output]], #16\n"
859       "prfm pldl1keep, [%x[output]]\n"
860 
861       "bne 1b\n"
862       "2:"
863 
864       // Handle leftovers.
865 
866       // Requantize::Transform
867       "ld1 {v0.4s, v1.4s}, [%x[input]], #32\n"
868       "prfm pldl1keep, [%x[input], #64]\n"
869       "scvtf v0.4s, v0.4s\n"
870       "scvtf v1.4s, v1.4s\n"
871       "fsub v0.4s, v0.4s, v6.4s\n"
872       "fsub v1.4s, v1.4s, v6.4s\n"
873       "fmul v0.4s, v0.4s, v7.4s\n"
874       "fmul v1.4s, v1.4s, v7.4s\n"
875       "fadd v0.4s, v0.4s, v4.4s\n"
876       "fadd v1.4s, v1.4s, v4.4s\n"
877       "fmul v0.4s, v0.4s, v8.4s\n"
878       "fmul v1.4s, v1.4s, v8.4s\n"
879       "fcvtzs v0.4s, v0.4s\n"
880       "fcvtzs v1.4s, v1.4s\n"
881       "sqxtn v0.4h, v0.4s\n"
882       "sqxtn2 v0.8h, v1.4s\n"
883       "sqxtun v0.8b, v0.8h\n"
884 
885       "st1 {v0.2s}, [%x[output]], #8\n"
886       "prfm pldl1keep, [%x[output]]\n"
887       : [count] "+r"(params_count_copy), [input] "+r"(input),
888         [output] "+r"(output)
889       : [input_range_min] "r"(params.input_range_min),
890         [output_range_min] "r"(params.output_range_min),
891         [input_range_offset] "r"(params.input_range_offset),
892         [one_over_output_range_scale] "r"(params.one_over_output_range_scale),
893         [input_range_scale] "r"(params.input_range_scale)
894       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "cc", "memory");
895 }
896 
897 template <>
Transform(const int32_t * input,const Requantize & params,uint8_t * output)898 inline void Transform1DKernel<int32_t, uint8_t, Requantize, 16, 9>::Transform(
899     const int32_t* input, const Requantize& params, uint8_t* output) {
900 #ifdef DEBUG
901 #ifdef DEBUG_METAGEMM_VERBOSE
902   std::cout << __FILE__ << "(" << __LINE__
903             << ") Requantize<int32_t, uint8_t, Requantize, 16, 9>::Transform()"
904             << std::endl
905             << std::flush;
906 #endif
907 #endif
908   int params_count_copy = params.count;
909   asm volatile(
910 
911       // Requantize::Prepare
912       "dup v4.4s, %w[input_range_min]\n"
913       "dup v5.4s, %w[output_range_min]\n"
914       "dup v6.4s, %w[input_range_offset]\n"
915       "dup v7.4s, %w[input_range_scale]\n"
916       "dup v8.4s, %w[one_over_output_range_scale]\n"
917       "fsub v4.4s, v4.4s, v5.4s\n"
918 
919       // Reduce count by leftovers.
920       "subs %x[count], %x[count], #9\n"
921       "beq 2f\n"
922 
923       "1:"
924       "subs %x[count], %x[count], #16\n"
925 
926       // Requantize::Transform
927       "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
928       "prfm pldl1keep, [%x[input], #64]\n"
929       "scvtf v0.4s, v0.4s\n"
930       "scvtf v1.4s, v1.4s\n"
931       "scvtf v2.4s, v2.4s\n"
932       "scvtf v3.4s, v3.4s\n"
933       "fsub v0.4s, v0.4s, v6.4s\n"
934       "fsub v1.4s, v1.4s, v6.4s\n"
935       "fsub v2.4s, v2.4s, v6.4s\n"
936       "fsub v3.4s, v3.4s, v6.4s\n"
937       "fmul v0.4s, v0.4s, v7.4s\n"
938       "fmul v1.4s, v1.4s, v7.4s\n"
939       "fmul v2.4s, v2.4s, v7.4s\n"
940       "fmul v3.4s, v3.4s, v7.4s\n"
941       "fadd v0.4s, v0.4s, v4.4s\n"
942       "fadd v1.4s, v1.4s, v4.4s\n"
943       "fadd v2.4s, v2.4s, v4.4s\n"
944       "fadd v3.4s, v3.4s, v4.4s\n"
945       "fmul v0.4s, v0.4s, v8.4s\n"
946       "fmul v1.4s, v1.4s, v8.4s\n"
947       "fmul v2.4s, v2.4s, v8.4s\n"
948       "fmul v3.4s, v3.4s, v8.4s\n"
949       "fcvtzs v0.4s, v0.4s\n"
950       "fcvtzs v1.4s, v1.4s\n"
951       "fcvtzs v2.4s, v2.4s\n"
952       "fcvtzs v3.4s, v3.4s\n"
953       "sqxtn v0.4h, v0.4s\n"
954       "sqxtn2 v0.8h, v1.4s\n"
955       "sqxtn v2.4h, v2.4s\n"
956       "sqxtn2 v2.8h, v3.4s\n"
957       "sqxtun v0.8b, v0.8h\n"
958       "sqxtun2 v0.16b, v2.8h\n"
959 
960       "st1 {v0.4s}, [%x[output]], #16\n"
961       "prfm pldl1keep, [%x[output]]\n"
962 
963       "bne 1b\n"
964       "2:"
965 
966       // Handle leftovers.
967 
968       // Requantize::Transform
969       "ld1 {v0.4s, v1.4s}, [%x[input]], #32\n"
970       "ld1 {v2.s}[0], [%x[input]], #4\n"
971       "prfm pldl1keep, [%x[input], #64]\n"
972       "scvtf v0.4s, v0.4s\n"
973       "scvtf v1.4s, v1.4s\n"
974       "scvtf v2.4s, v2.4s\n"
975       "fsub v0.4s, v0.4s, v6.4s\n"
976       "fsub v1.4s, v1.4s, v6.4s\n"
977       "fsub v2.4s, v2.4s, v6.4s\n"
978       "fmul v0.4s, v0.4s, v7.4s\n"
979       "fmul v1.4s, v1.4s, v7.4s\n"
980       "fmul v2.4s, v2.4s, v7.4s\n"
981       "fadd v0.4s, v0.4s, v4.4s\n"
982       "fadd v1.4s, v1.4s, v4.4s\n"
983       "fadd v2.4s, v2.4s, v4.4s\n"
984       "fmul v0.4s, v0.4s, v8.4s\n"
985       "fmul v1.4s, v1.4s, v8.4s\n"
986       "fmul v2.4s, v2.4s, v8.4s\n"
987       "fcvtzs v0.4s, v0.4s\n"
988       "fcvtzs v1.4s, v1.4s\n"
989       "fcvtzs v2.4s, v2.4s\n"
990       "sqxtn v0.4h, v0.4s\n"
991       "sqxtn2 v0.8h, v1.4s\n"
992       "sqxtn v2.4h, v2.4s\n"
993       "sqxtun v0.8b, v0.8h\n"
994       "sqxtun2 v0.16b, v2.8h\n"
995 
996       "st1 {v0.2s}, [%x[output]], #8\n"
997       "st1 {v0.b}[8], [%x[output]], #1\n"
998       "prfm pldl1keep, [%x[output]]\n"
999       : [count] "+r"(params_count_copy), [input] "+r"(input),
1000         [output] "+r"(output)
1001       : [input_range_min] "r"(params.input_range_min),
1002         [output_range_min] "r"(params.output_range_min),
1003         [input_range_offset] "r"(params.input_range_offset),
1004         [one_over_output_range_scale] "r"(params.one_over_output_range_scale),
1005         [input_range_scale] "r"(params.input_range_scale)
1006       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "cc", "memory");
1007 }
1008 
1009 template <>
Transform(const int32_t * input,const Requantize & params,uint8_t * output)1010 inline void Transform1DKernel<int32_t, uint8_t, Requantize, 16, 10>::Transform(
1011     const int32_t* input, const Requantize& params, uint8_t* output) {
1012 #ifdef DEBUG
1013 #ifdef DEBUG_METAGEMM_VERBOSE
1014   std::cout << __FILE__ << "(" << __LINE__
1015             << ") Requantize<int32_t, uint8_t, Requantize, 16, 10>::Transform()"
1016             << std::endl
1017             << std::flush;
1018 #endif
1019 #endif
1020   int params_count_copy = params.count;
1021   asm volatile(
1022 
1023       // Requantize::Prepare
1024       "dup v4.4s, %w[input_range_min]\n"
1025       "dup v5.4s, %w[output_range_min]\n"
1026       "dup v6.4s, %w[input_range_offset]\n"
1027       "dup v7.4s, %w[input_range_scale]\n"
1028       "dup v8.4s, %w[one_over_output_range_scale]\n"
1029       "fsub v4.4s, v4.4s, v5.4s\n"
1030 
1031       // Reduce count by leftovers.
1032       "subs %x[count], %x[count], #10\n"
1033       "beq 2f\n"
1034 
1035       "1:"
1036       "subs %x[count], %x[count], #16\n"
1037 
1038       // Requantize::Transform
1039       "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
1040       "prfm pldl1keep, [%x[input], #64]\n"
1041       "scvtf v0.4s, v0.4s\n"
1042       "scvtf v1.4s, v1.4s\n"
1043       "scvtf v2.4s, v2.4s\n"
1044       "scvtf v3.4s, v3.4s\n"
1045       "fsub v0.4s, v0.4s, v6.4s\n"
1046       "fsub v1.4s, v1.4s, v6.4s\n"
1047       "fsub v2.4s, v2.4s, v6.4s\n"
1048       "fsub v3.4s, v3.4s, v6.4s\n"
1049       "fmul v0.4s, v0.4s, v7.4s\n"
1050       "fmul v1.4s, v1.4s, v7.4s\n"
1051       "fmul v2.4s, v2.4s, v7.4s\n"
1052       "fmul v3.4s, v3.4s, v7.4s\n"
1053       "fadd v0.4s, v0.4s, v4.4s\n"
1054       "fadd v1.4s, v1.4s, v4.4s\n"
1055       "fadd v2.4s, v2.4s, v4.4s\n"
1056       "fadd v3.4s, v3.4s, v4.4s\n"
1057       "fmul v0.4s, v0.4s, v8.4s\n"
1058       "fmul v1.4s, v1.4s, v8.4s\n"
1059       "fmul v2.4s, v2.4s, v8.4s\n"
1060       "fmul v3.4s, v3.4s, v8.4s\n"
1061       "fcvtzs v0.4s, v0.4s\n"
1062       "fcvtzs v1.4s, v1.4s\n"
1063       "fcvtzs v2.4s, v2.4s\n"
1064       "fcvtzs v3.4s, v3.4s\n"
1065       "sqxtn v0.4h, v0.4s\n"
1066       "sqxtn2 v0.8h, v1.4s\n"
1067       "sqxtn v2.4h, v2.4s\n"
1068       "sqxtn2 v2.8h, v3.4s\n"
1069       "sqxtun v0.8b, v0.8h\n"
1070       "sqxtun2 v0.16b, v2.8h\n"
1071 
1072       "st1 {v0.4s}, [%x[output]], #16\n"
1073       "prfm pldl1keep, [%x[output]]\n"
1074 
1075       "bne 1b\n"
1076       "2:"
1077 
1078       // Handle leftovers.
1079 
1080       // Requantize::Transform
1081       "ld1 {v0.4s, v1.4s}, [%x[input]], #32\n"
1082       "ld1 {v2.2s}, [%x[input]], #8\n"
1083       "prfm pldl1keep, [%x[input], #64]\n"
1084       "scvtf v0.4s, v0.4s\n"
1085       "scvtf v1.4s, v1.4s\n"
1086       "scvtf v2.4s, v2.4s\n"
1087       "fsub v0.4s, v0.4s, v6.4s\n"
1088       "fsub v1.4s, v1.4s, v6.4s\n"
1089       "fsub v2.4s, v2.4s, v6.4s\n"
1090       "fmul v0.4s, v0.4s, v7.4s\n"
1091       "fmul v1.4s, v1.4s, v7.4s\n"
1092       "fmul v2.4s, v2.4s, v7.4s\n"
1093       "fadd v0.4s, v0.4s, v4.4s\n"
1094       "fadd v1.4s, v1.4s, v4.4s\n"
1095       "fadd v2.4s, v2.4s, v4.4s\n"
1096       "fmul v0.4s, v0.4s, v8.4s\n"
1097       "fmul v1.4s, v1.4s, v8.4s\n"
1098       "fmul v2.4s, v2.4s, v8.4s\n"
1099       "fcvtzs v0.4s, v0.4s\n"
1100       "fcvtzs v1.4s, v1.4s\n"
1101       "fcvtzs v2.4s, v2.4s\n"
1102       "sqxtn v0.4h, v0.4s\n"
1103       "sqxtn2 v0.8h, v1.4s\n"
1104       "sqxtn v2.4h, v2.4s\n"
1105       "sqxtun v0.8b, v0.8h\n"
1106       "sqxtun2 v0.16b, v2.8h\n"
1107 
1108       "st1 {v0.2s}, [%x[output]], #8\n"
1109       "st1 {v0.h}[4], [%x[output]], #2\n"
1110       "prfm pldl1keep, [%x[output]]\n"
1111       : [count] "+r"(params_count_copy), [input] "+r"(input),
1112         [output] "+r"(output)
1113       : [input_range_min] "r"(params.input_range_min),
1114         [output_range_min] "r"(params.output_range_min),
1115         [input_range_offset] "r"(params.input_range_offset),
1116         [one_over_output_range_scale] "r"(params.one_over_output_range_scale),
1117         [input_range_scale] "r"(params.input_range_scale)
1118       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "cc", "memory");
1119 }
1120 
1121 template <>
Transform(const int32_t * input,const Requantize & params,uint8_t * output)1122 inline void Transform1DKernel<int32_t, uint8_t, Requantize, 16, 11>::Transform(
1123     const int32_t* input, const Requantize& params, uint8_t* output) {
1124 #ifdef DEBUG
1125 #ifdef DEBUG_METAGEMM_VERBOSE
1126   std::cout << __FILE__ << "(" << __LINE__
1127             << ") Requantize<int32_t, uint8_t, Requantize, 16, 11>::Transform()"
1128             << std::endl
1129             << std::flush;
1130 #endif
1131 #endif
1132   int params_count_copy = params.count;
1133   asm volatile(
1134 
1135       // Requantize::Prepare
1136       "dup v4.4s, %w[input_range_min]\n"
1137       "dup v5.4s, %w[output_range_min]\n"
1138       "dup v6.4s, %w[input_range_offset]\n"
1139       "dup v7.4s, %w[input_range_scale]\n"
1140       "dup v8.4s, %w[one_over_output_range_scale]\n"
1141       "fsub v4.4s, v4.4s, v5.4s\n"
1142 
1143       // Reduce count by leftovers.
1144       "subs %x[count], %x[count], #11\n"
1145       "beq 2f\n"
1146 
1147       "1:"
1148       "subs %x[count], %x[count], #16\n"
1149 
1150       // Requantize::Transform
1151       "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
1152       "prfm pldl1keep, [%x[input], #64]\n"
1153       "scvtf v0.4s, v0.4s\n"
1154       "scvtf v1.4s, v1.4s\n"
1155       "scvtf v2.4s, v2.4s\n"
1156       "scvtf v3.4s, v3.4s\n"
1157       "fsub v0.4s, v0.4s, v6.4s\n"
1158       "fsub v1.4s, v1.4s, v6.4s\n"
1159       "fsub v2.4s, v2.4s, v6.4s\n"
1160       "fsub v3.4s, v3.4s, v6.4s\n"
1161       "fmul v0.4s, v0.4s, v7.4s\n"
1162       "fmul v1.4s, v1.4s, v7.4s\n"
1163       "fmul v2.4s, v2.4s, v7.4s\n"
1164       "fmul v3.4s, v3.4s, v7.4s\n"
1165       "fadd v0.4s, v0.4s, v4.4s\n"
1166       "fadd v1.4s, v1.4s, v4.4s\n"
1167       "fadd v2.4s, v2.4s, v4.4s\n"
1168       "fadd v3.4s, v3.4s, v4.4s\n"
1169       "fmul v0.4s, v0.4s, v8.4s\n"
1170       "fmul v1.4s, v1.4s, v8.4s\n"
1171       "fmul v2.4s, v2.4s, v8.4s\n"
1172       "fmul v3.4s, v3.4s, v8.4s\n"
1173       "fcvtzs v0.4s, v0.4s\n"
1174       "fcvtzs v1.4s, v1.4s\n"
1175       "fcvtzs v2.4s, v2.4s\n"
1176       "fcvtzs v3.4s, v3.4s\n"
1177       "sqxtn v0.4h, v0.4s\n"
1178       "sqxtn2 v0.8h, v1.4s\n"
1179       "sqxtn v2.4h, v2.4s\n"
1180       "sqxtn2 v2.8h, v3.4s\n"
1181       "sqxtun v0.8b, v0.8h\n"
1182       "sqxtun2 v0.16b, v2.8h\n"
1183 
1184       "st1 {v0.4s}, [%x[output]], #16\n"
1185       "prfm pldl1keep, [%x[output]]\n"
1186 
1187       "bne 1b\n"
1188       "2:"
1189 
1190       // Handle leftovers.
1191 
1192       // Requantize::Transform
1193       "ld1 {v0.4s, v1.4s}, [%x[input]], #32\n"
1194       "ld1 {v2.2s}, [%x[input]], #8\n"
1195       "ld1 {v2.s}[2], [%x[input]], #4\n"
1196       "prfm pldl1keep, [%x[input], #64]\n"
1197       "scvtf v0.4s, v0.4s\n"
1198       "scvtf v1.4s, v1.4s\n"
1199       "scvtf v2.4s, v2.4s\n"
1200       "fsub v0.4s, v0.4s, v6.4s\n"
1201       "fsub v1.4s, v1.4s, v6.4s\n"
1202       "fsub v2.4s, v2.4s, v6.4s\n"
1203       "fmul v0.4s, v0.4s, v7.4s\n"
1204       "fmul v1.4s, v1.4s, v7.4s\n"
1205       "fmul v2.4s, v2.4s, v7.4s\n"
1206       "fadd v0.4s, v0.4s, v4.4s\n"
1207       "fadd v1.4s, v1.4s, v4.4s\n"
1208       "fadd v2.4s, v2.4s, v4.4s\n"
1209       "fmul v0.4s, v0.4s, v8.4s\n"
1210       "fmul v1.4s, v1.4s, v8.4s\n"
1211       "fmul v2.4s, v2.4s, v8.4s\n"
1212       "fcvtzs v0.4s, v0.4s\n"
1213       "fcvtzs v1.4s, v1.4s\n"
1214       "fcvtzs v2.4s, v2.4s\n"
1215       "sqxtn v0.4h, v0.4s\n"
1216       "sqxtn2 v0.8h, v1.4s\n"
1217       "sqxtn v2.4h, v2.4s\n"
1218       "sqxtun v0.8b, v0.8h\n"
1219       "sqxtun2 v0.16b, v2.8h\n"
1220 
1221       "st1 {v0.2s}, [%x[output]], #8\n"
1222       "st1 {v0.h}[4], [%x[output]], #2\n"
1223       "st1 {v0.b}[10], [%x[output]], #1\n"
1224       "prfm pldl1keep, [%x[output]]\n"
1225       : [count] "+r"(params_count_copy), [input] "+r"(input),
1226         [output] "+r"(output)
1227       : [input_range_min] "r"(params.input_range_min),
1228         [output_range_min] "r"(params.output_range_min),
1229         [input_range_offset] "r"(params.input_range_offset),
1230         [one_over_output_range_scale] "r"(params.one_over_output_range_scale),
1231         [input_range_scale] "r"(params.input_range_scale)
1232       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "cc", "memory");
1233 }
1234 
1235 template <>
Transform(const int32_t * input,const Requantize & params,uint8_t * output)1236 inline void Transform1DKernel<int32_t, uint8_t, Requantize, 16, 12>::Transform(
1237     const int32_t* input, const Requantize& params, uint8_t* output) {
1238 #ifdef DEBUG
1239 #ifdef DEBUG_METAGEMM_VERBOSE
1240   std::cout << __FILE__ << "(" << __LINE__
1241             << ") Requantize<int32_t, uint8_t, Requantize, 16, 12>::Transform()"
1242             << std::endl
1243             << std::flush;
1244 #endif
1245 #endif
1246   int params_count_copy = params.count;
1247   asm volatile(
1248 
1249       // Requantize::Prepare
1250       "dup v4.4s, %w[input_range_min]\n"
1251       "dup v5.4s, %w[output_range_min]\n"
1252       "dup v6.4s, %w[input_range_offset]\n"
1253       "dup v7.4s, %w[input_range_scale]\n"
1254       "dup v8.4s, %w[one_over_output_range_scale]\n"
1255       "fsub v4.4s, v4.4s, v5.4s\n"
1256 
1257       // Reduce count by leftovers.
1258       "subs %x[count], %x[count], #12\n"
1259       "beq 2f\n"
1260 
1261       "1:"
1262       "subs %x[count], %x[count], #16\n"
1263 
1264       // Requantize::Transform
1265       "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
1266       "prfm pldl1keep, [%x[input], #64]\n"
1267       "scvtf v0.4s, v0.4s\n"
1268       "scvtf v1.4s, v1.4s\n"
1269       "scvtf v2.4s, v2.4s\n"
1270       "scvtf v3.4s, v3.4s\n"
1271       "fsub v0.4s, v0.4s, v6.4s\n"
1272       "fsub v1.4s, v1.4s, v6.4s\n"
1273       "fsub v2.4s, v2.4s, v6.4s\n"
1274       "fsub v3.4s, v3.4s, v6.4s\n"
1275       "fmul v0.4s, v0.4s, v7.4s\n"
1276       "fmul v1.4s, v1.4s, v7.4s\n"
1277       "fmul v2.4s, v2.4s, v7.4s\n"
1278       "fmul v3.4s, v3.4s, v7.4s\n"
1279       "fadd v0.4s, v0.4s, v4.4s\n"
1280       "fadd v1.4s, v1.4s, v4.4s\n"
1281       "fadd v2.4s, v2.4s, v4.4s\n"
1282       "fadd v3.4s, v3.4s, v4.4s\n"
1283       "fmul v0.4s, v0.4s, v8.4s\n"
1284       "fmul v1.4s, v1.4s, v8.4s\n"
1285       "fmul v2.4s, v2.4s, v8.4s\n"
1286       "fmul v3.4s, v3.4s, v8.4s\n"
1287       "fcvtzs v0.4s, v0.4s\n"
1288       "fcvtzs v1.4s, v1.4s\n"
1289       "fcvtzs v2.4s, v2.4s\n"
1290       "fcvtzs v3.4s, v3.4s\n"
1291       "sqxtn v0.4h, v0.4s\n"
1292       "sqxtn2 v0.8h, v1.4s\n"
1293       "sqxtn v2.4h, v2.4s\n"
1294       "sqxtn2 v2.8h, v3.4s\n"
1295       "sqxtun v0.8b, v0.8h\n"
1296       "sqxtun2 v0.16b, v2.8h\n"
1297 
1298       "st1 {v0.4s}, [%x[output]], #16\n"
1299       "prfm pldl1keep, [%x[output]]\n"
1300 
1301       "bne 1b\n"
1302       "2:"
1303 
1304       // Handle leftovers.
1305 
1306       // Requantize::Transform
1307       "ld1 {v0.4s, v1.4s, v2.4s}, [%x[input]], #48\n"
1308       "prfm pldl1keep, [%x[input], #64]\n"
1309       "scvtf v0.4s, v0.4s\n"
1310       "scvtf v1.4s, v1.4s\n"
1311       "scvtf v2.4s, v2.4s\n"
1312       "fsub v0.4s, v0.4s, v6.4s\n"
1313       "fsub v1.4s, v1.4s, v6.4s\n"
1314       "fsub v2.4s, v2.4s, v6.4s\n"
1315       "fmul v0.4s, v0.4s, v7.4s\n"
1316       "fmul v1.4s, v1.4s, v7.4s\n"
1317       "fmul v2.4s, v2.4s, v7.4s\n"
1318       "fadd v0.4s, v0.4s, v4.4s\n"
1319       "fadd v1.4s, v1.4s, v4.4s\n"
1320       "fadd v2.4s, v2.4s, v4.4s\n"
1321       "fmul v0.4s, v0.4s, v8.4s\n"
1322       "fmul v1.4s, v1.4s, v8.4s\n"
1323       "fmul v2.4s, v2.4s, v8.4s\n"
1324       "fcvtzs v0.4s, v0.4s\n"
1325       "fcvtzs v1.4s, v1.4s\n"
1326       "fcvtzs v2.4s, v2.4s\n"
1327       "sqxtn v0.4h, v0.4s\n"
1328       "sqxtn2 v0.8h, v1.4s\n"
1329       "sqxtn v2.4h, v2.4s\n"
1330       "sqxtun v0.8b, v0.8h\n"
1331       "sqxtun2 v0.16b, v2.8h\n"
1332 
1333       "st1 {v0.2s}, [%x[output]], #8\n"
1334       "st1 {v0.s}[2], [%x[output]], #4\n"
1335       "prfm pldl1keep, [%x[output]]\n"
1336       : [count] "+r"(params_count_copy), [input] "+r"(input),
1337         [output] "+r"(output)
1338       : [input_range_min] "r"(params.input_range_min),
1339         [output_range_min] "r"(params.output_range_min),
1340         [input_range_offset] "r"(params.input_range_offset),
1341         [one_over_output_range_scale] "r"(params.one_over_output_range_scale),
1342         [input_range_scale] "r"(params.input_range_scale)
1343       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "cc", "memory");
1344 }
1345 
1346 template <>
Transform(const int32_t * input,const Requantize & params,uint8_t * output)1347 inline void Transform1DKernel<int32_t, uint8_t, Requantize, 16, 13>::Transform(
1348     const int32_t* input, const Requantize& params, uint8_t* output) {
1349 #ifdef DEBUG
1350 #ifdef DEBUG_METAGEMM_VERBOSE
1351   std::cout << __FILE__ << "(" << __LINE__
1352             << ") Requantize<int32_t, uint8_t, Requantize, 16, 13>::Transform()"
1353             << std::endl
1354             << std::flush;
1355 #endif
1356 #endif
1357   int params_count_copy = params.count;
1358   asm volatile(
1359 
1360       // Requantize::Prepare
1361       "dup v4.4s, %w[input_range_min]\n"
1362       "dup v5.4s, %w[output_range_min]\n"
1363       "dup v6.4s, %w[input_range_offset]\n"
1364       "dup v7.4s, %w[input_range_scale]\n"
1365       "dup v8.4s, %w[one_over_output_range_scale]\n"
1366       "fsub v4.4s, v4.4s, v5.4s\n"
1367 
1368       // Reduce count by leftovers.
1369       "subs %x[count], %x[count], #13\n"
1370       "beq 2f\n"
1371 
1372       "1:"
1373       "subs %x[count], %x[count], #16\n"
1374 
1375       // Requantize::Transform
1376       "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
1377       "prfm pldl1keep, [%x[input], #64]\n"
1378       "scvtf v0.4s, v0.4s\n"
1379       "scvtf v1.4s, v1.4s\n"
1380       "scvtf v2.4s, v2.4s\n"
1381       "scvtf v3.4s, v3.4s\n"
1382       "fsub v0.4s, v0.4s, v6.4s\n"
1383       "fsub v1.4s, v1.4s, v6.4s\n"
1384       "fsub v2.4s, v2.4s, v6.4s\n"
1385       "fsub v3.4s, v3.4s, v6.4s\n"
1386       "fmul v0.4s, v0.4s, v7.4s\n"
1387       "fmul v1.4s, v1.4s, v7.4s\n"
1388       "fmul v2.4s, v2.4s, v7.4s\n"
1389       "fmul v3.4s, v3.4s, v7.4s\n"
1390       "fadd v0.4s, v0.4s, v4.4s\n"
1391       "fadd v1.4s, v1.4s, v4.4s\n"
1392       "fadd v2.4s, v2.4s, v4.4s\n"
1393       "fadd v3.4s, v3.4s, v4.4s\n"
1394       "fmul v0.4s, v0.4s, v8.4s\n"
1395       "fmul v1.4s, v1.4s, v8.4s\n"
1396       "fmul v2.4s, v2.4s, v8.4s\n"
1397       "fmul v3.4s, v3.4s, v8.4s\n"
1398       "fcvtzs v0.4s, v0.4s\n"
1399       "fcvtzs v1.4s, v1.4s\n"
1400       "fcvtzs v2.4s, v2.4s\n"
1401       "fcvtzs v3.4s, v3.4s\n"
1402       "sqxtn v0.4h, v0.4s\n"
1403       "sqxtn2 v0.8h, v1.4s\n"
1404       "sqxtn v2.4h, v2.4s\n"
1405       "sqxtn2 v2.8h, v3.4s\n"
1406       "sqxtun v0.8b, v0.8h\n"
1407       "sqxtun2 v0.16b, v2.8h\n"
1408 
1409       "st1 {v0.4s}, [%x[output]], #16\n"
1410       "prfm pldl1keep, [%x[output]]\n"
1411 
1412       "bne 1b\n"
1413       "2:"
1414 
1415       // Handle leftovers.
1416 
1417       // Requantize::Transform
1418       "ld1 {v0.4s, v1.4s, v2.4s}, [%x[input]], #48\n"
1419       "ld1 {v3.s}[0], [%x[input]], #4\n"
1420       "prfm pldl1keep, [%x[input], #64]\n"
1421       "scvtf v0.4s, v0.4s\n"
1422       "scvtf v1.4s, v1.4s\n"
1423       "scvtf v2.4s, v2.4s\n"
1424       "scvtf v3.4s, v3.4s\n"
1425       "fsub v0.4s, v0.4s, v6.4s\n"
1426       "fsub v1.4s, v1.4s, v6.4s\n"
1427       "fsub v2.4s, v2.4s, v6.4s\n"
1428       "fsub v3.4s, v3.4s, v6.4s\n"
1429       "fmul v0.4s, v0.4s, v7.4s\n"
1430       "fmul v1.4s, v1.4s, v7.4s\n"
1431       "fmul v2.4s, v2.4s, v7.4s\n"
1432       "fmul v3.4s, v3.4s, v7.4s\n"
1433       "fadd v0.4s, v0.4s, v4.4s\n"
1434       "fadd v1.4s, v1.4s, v4.4s\n"
1435       "fadd v2.4s, v2.4s, v4.4s\n"
1436       "fadd v3.4s, v3.4s, v4.4s\n"
1437       "fmul v0.4s, v0.4s, v8.4s\n"
1438       "fmul v1.4s, v1.4s, v8.4s\n"
1439       "fmul v2.4s, v2.4s, v8.4s\n"
1440       "fmul v3.4s, v3.4s, v8.4s\n"
1441       "fcvtzs v0.4s, v0.4s\n"
1442       "fcvtzs v1.4s, v1.4s\n"
1443       "fcvtzs v2.4s, v2.4s\n"
1444       "fcvtzs v3.4s, v3.4s\n"
1445       "sqxtn v0.4h, v0.4s\n"
1446       "sqxtn2 v0.8h, v1.4s\n"
1447       "sqxtn v2.4h, v2.4s\n"
1448       "sqxtn2 v2.8h, v3.4s\n"
1449       "sqxtun v0.8b, v0.8h\n"
1450       "sqxtun2 v0.16b, v2.8h\n"
1451 
1452       "st1 {v0.2s}, [%x[output]], #8\n"
1453       "st1 {v0.s}[2], [%x[output]], #4\n"
1454       "st1 {v0.b}[12], [%x[output]], #1\n"
1455       "prfm pldl1keep, [%x[output]]\n"
1456       : [count] "+r"(params_count_copy), [input] "+r"(input),
1457         [output] "+r"(output)
1458       : [input_range_min] "r"(params.input_range_min),
1459         [output_range_min] "r"(params.output_range_min),
1460         [input_range_offset] "r"(params.input_range_offset),
1461         [one_over_output_range_scale] "r"(params.one_over_output_range_scale),
1462         [input_range_scale] "r"(params.input_range_scale)
1463       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "cc", "memory");
1464 }
1465 
1466 template <>
Transform(const int32_t * input,const Requantize & params,uint8_t * output)1467 inline void Transform1DKernel<int32_t, uint8_t, Requantize, 16, 14>::Transform(
1468     const int32_t* input, const Requantize& params, uint8_t* output) {
1469 #ifdef DEBUG
1470 #ifdef DEBUG_METAGEMM_VERBOSE
1471   std::cout << __FILE__ << "(" << __LINE__
1472             << ") Requantize<int32_t, uint8_t, Requantize, 16, 14>::Transform()"
1473             << std::endl
1474             << std::flush;
1475 #endif
1476 #endif
1477   int params_count_copy = params.count;
1478   asm volatile(
1479 
1480       // Requantize::Prepare
1481       "dup v4.4s, %w[input_range_min]\n"
1482       "dup v5.4s, %w[output_range_min]\n"
1483       "dup v6.4s, %w[input_range_offset]\n"
1484       "dup v7.4s, %w[input_range_scale]\n"
1485       "dup v8.4s, %w[one_over_output_range_scale]\n"
1486       "fsub v4.4s, v4.4s, v5.4s\n"
1487 
1488       // Reduce count by leftovers.
1489       "subs %x[count], %x[count], #14\n"
1490       "beq 2f\n"
1491 
1492       "1:"
1493       "subs %x[count], %x[count], #16\n"
1494 
1495       // Requantize::Transform
1496       "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
1497       "prfm pldl1keep, [%x[input], #64]\n"
1498       "scvtf v0.4s, v0.4s\n"
1499       "scvtf v1.4s, v1.4s\n"
1500       "scvtf v2.4s, v2.4s\n"
1501       "scvtf v3.4s, v3.4s\n"
1502       "fsub v0.4s, v0.4s, v6.4s\n"
1503       "fsub v1.4s, v1.4s, v6.4s\n"
1504       "fsub v2.4s, v2.4s, v6.4s\n"
1505       "fsub v3.4s, v3.4s, v6.4s\n"
1506       "fmul v0.4s, v0.4s, v7.4s\n"
1507       "fmul v1.4s, v1.4s, v7.4s\n"
1508       "fmul v2.4s, v2.4s, v7.4s\n"
1509       "fmul v3.4s, v3.4s, v7.4s\n"
1510       "fadd v0.4s, v0.4s, v4.4s\n"
1511       "fadd v1.4s, v1.4s, v4.4s\n"
1512       "fadd v2.4s, v2.4s, v4.4s\n"
1513       "fadd v3.4s, v3.4s, v4.4s\n"
1514       "fmul v0.4s, v0.4s, v8.4s\n"
1515       "fmul v1.4s, v1.4s, v8.4s\n"
1516       "fmul v2.4s, v2.4s, v8.4s\n"
1517       "fmul v3.4s, v3.4s, v8.4s\n"
1518       "fcvtzs v0.4s, v0.4s\n"
1519       "fcvtzs v1.4s, v1.4s\n"
1520       "fcvtzs v2.4s, v2.4s\n"
1521       "fcvtzs v3.4s, v3.4s\n"
1522       "sqxtn v0.4h, v0.4s\n"
1523       "sqxtn2 v0.8h, v1.4s\n"
1524       "sqxtn v2.4h, v2.4s\n"
1525       "sqxtn2 v2.8h, v3.4s\n"
1526       "sqxtun v0.8b, v0.8h\n"
1527       "sqxtun2 v0.16b, v2.8h\n"
1528 
1529       "st1 {v0.4s}, [%x[output]], #16\n"
1530       "prfm pldl1keep, [%x[output]]\n"
1531 
1532       "bne 1b\n"
1533       "2:"
1534 
1535       // Handle leftovers.
1536 
1537       // Requantize::Transform
1538       "ld1 {v0.4s, v1.4s, v2.4s}, [%x[input]], #48\n"
1539       "ld1 {v3.2s}, [%x[input]], #8\n"
1540       "prfm pldl1keep, [%x[input], #64]\n"
1541       "scvtf v0.4s, v0.4s\n"
1542       "scvtf v1.4s, v1.4s\n"
1543       "scvtf v2.4s, v2.4s\n"
1544       "scvtf v3.4s, v3.4s\n"
1545       "fsub v0.4s, v0.4s, v6.4s\n"
1546       "fsub v1.4s, v1.4s, v6.4s\n"
1547       "fsub v2.4s, v2.4s, v6.4s\n"
1548       "fsub v3.4s, v3.4s, v6.4s\n"
1549       "fmul v0.4s, v0.4s, v7.4s\n"
1550       "fmul v1.4s, v1.4s, v7.4s\n"
1551       "fmul v2.4s, v2.4s, v7.4s\n"
1552       "fmul v3.4s, v3.4s, v7.4s\n"
1553       "fadd v0.4s, v0.4s, v4.4s\n"
1554       "fadd v1.4s, v1.4s, v4.4s\n"
1555       "fadd v2.4s, v2.4s, v4.4s\n"
1556       "fadd v3.4s, v3.4s, v4.4s\n"
1557       "fmul v0.4s, v0.4s, v8.4s\n"
1558       "fmul v1.4s, v1.4s, v8.4s\n"
1559       "fmul v2.4s, v2.4s, v8.4s\n"
1560       "fmul v3.4s, v3.4s, v8.4s\n"
1561       "fcvtzs v0.4s, v0.4s\n"
1562       "fcvtzs v1.4s, v1.4s\n"
1563       "fcvtzs v2.4s, v2.4s\n"
1564       "fcvtzs v3.4s, v3.4s\n"
1565       "sqxtn v0.4h, v0.4s\n"
1566       "sqxtn2 v0.8h, v1.4s\n"
1567       "sqxtn v2.4h, v2.4s\n"
1568       "sqxtn2 v2.8h, v3.4s\n"
1569       "sqxtun v0.8b, v0.8h\n"
1570       "sqxtun2 v0.16b, v2.8h\n"
1571 
1572       "st1 {v0.2s}, [%x[output]], #8\n"
1573       "st1 {v0.s}[2], [%x[output]], #4\n"
1574       "st1 {v0.h}[6], [%x[output]], #2\n"
1575       "prfm pldl1keep, [%x[output]]\n"
1576       : [count] "+r"(params_count_copy), [input] "+r"(input),
1577         [output] "+r"(output)
1578       : [input_range_min] "r"(params.input_range_min),
1579         [output_range_min] "r"(params.output_range_min),
1580         [input_range_offset] "r"(params.input_range_offset),
1581         [one_over_output_range_scale] "r"(params.one_over_output_range_scale),
1582         [input_range_scale] "r"(params.input_range_scale)
1583       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "cc", "memory");
1584 }
1585 
1586 template <>
Transform(const int32_t * input,const Requantize & params,uint8_t * output)1587 inline void Transform1DKernel<int32_t, uint8_t, Requantize, 16, 15>::Transform(
1588     const int32_t* input, const Requantize& params, uint8_t* output) {
1589 #ifdef DEBUG
1590 #ifdef DEBUG_METAGEMM_VERBOSE
1591   std::cout << __FILE__ << "(" << __LINE__
1592             << ") Requantize<int32_t, uint8_t, Requantize, 16, 15>::Transform()"
1593             << std::endl
1594             << std::flush;
1595 #endif
1596 #endif
1597   int params_count_copy = params.count;
1598   asm volatile(
1599 
1600       // Requantize::Prepare
1601       "dup v4.4s, %w[input_range_min]\n"
1602       "dup v5.4s, %w[output_range_min]\n"
1603       "dup v6.4s, %w[input_range_offset]\n"
1604       "dup v7.4s, %w[input_range_scale]\n"
1605       "dup v8.4s, %w[one_over_output_range_scale]\n"
1606       "fsub v4.4s, v4.4s, v5.4s\n"
1607 
1608       // Reduce count by leftovers.
1609       "subs %x[count], %x[count], #15\n"
1610       "beq 2f\n"
1611 
1612       "1:"
1613       "subs %x[count], %x[count], #16\n"
1614 
1615       // Requantize::Transform
1616       "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
1617       "prfm pldl1keep, [%x[input], #64]\n"
1618       "scvtf v0.4s, v0.4s\n"
1619       "scvtf v1.4s, v1.4s\n"
1620       "scvtf v2.4s, v2.4s\n"
1621       "scvtf v3.4s, v3.4s\n"
1622       "fsub v0.4s, v0.4s, v6.4s\n"
1623       "fsub v1.4s, v1.4s, v6.4s\n"
1624       "fsub v2.4s, v2.4s, v6.4s\n"
1625       "fsub v3.4s, v3.4s, v6.4s\n"
1626       "fmul v0.4s, v0.4s, v7.4s\n"
1627       "fmul v1.4s, v1.4s, v7.4s\n"
1628       "fmul v2.4s, v2.4s, v7.4s\n"
1629       "fmul v3.4s, v3.4s, v7.4s\n"
1630       "fadd v0.4s, v0.4s, v4.4s\n"
1631       "fadd v1.4s, v1.4s, v4.4s\n"
1632       "fadd v2.4s, v2.4s, v4.4s\n"
1633       "fadd v3.4s, v3.4s, v4.4s\n"
1634       "fmul v0.4s, v0.4s, v8.4s\n"
1635       "fmul v1.4s, v1.4s, v8.4s\n"
1636       "fmul v2.4s, v2.4s, v8.4s\n"
1637       "fmul v3.4s, v3.4s, v8.4s\n"
1638       "fcvtzs v0.4s, v0.4s\n"
1639       "fcvtzs v1.4s, v1.4s\n"
1640       "fcvtzs v2.4s, v2.4s\n"
1641       "fcvtzs v3.4s, v3.4s\n"
1642       "sqxtn v0.4h, v0.4s\n"
1643       "sqxtn2 v0.8h, v1.4s\n"
1644       "sqxtn v2.4h, v2.4s\n"
1645       "sqxtn2 v2.8h, v3.4s\n"
1646       "sqxtun v0.8b, v0.8h\n"
1647       "sqxtun2 v0.16b, v2.8h\n"
1648 
1649       "st1 {v0.4s}, [%x[output]], #16\n"
1650       "prfm pldl1keep, [%x[output]]\n"
1651 
1652       "bne 1b\n"
1653       "2:"
1654 
1655       // Handle leftovers.
1656 
1657       // Requantize::Transform
1658       "ld1 {v0.4s, v1.4s, v2.4s}, [%x[input]], #48\n"
1659       "ld1 {v3.2s}, [%x[input]], #8\n"
1660       "ld1 {v3.s}[2], [%x[input]], #4\n"
1661       "prfm pldl1keep, [%x[input], #64]\n"
1662       "scvtf v0.4s, v0.4s\n"
1663       "scvtf v1.4s, v1.4s\n"
1664       "scvtf v2.4s, v2.4s\n"
1665       "scvtf v3.4s, v3.4s\n"
1666       "fsub v0.4s, v0.4s, v6.4s\n"
1667       "fsub v1.4s, v1.4s, v6.4s\n"
1668       "fsub v2.4s, v2.4s, v6.4s\n"
1669       "fsub v3.4s, v3.4s, v6.4s\n"
1670       "fmul v0.4s, v0.4s, v7.4s\n"
1671       "fmul v1.4s, v1.4s, v7.4s\n"
1672       "fmul v2.4s, v2.4s, v7.4s\n"
1673       "fmul v3.4s, v3.4s, v7.4s\n"
1674       "fadd v0.4s, v0.4s, v4.4s\n"
1675       "fadd v1.4s, v1.4s, v4.4s\n"
1676       "fadd v2.4s, v2.4s, v4.4s\n"
1677       "fadd v3.4s, v3.4s, v4.4s\n"
1678       "fmul v0.4s, v0.4s, v8.4s\n"
1679       "fmul v1.4s, v1.4s, v8.4s\n"
1680       "fmul v2.4s, v2.4s, v8.4s\n"
1681       "fmul v3.4s, v3.4s, v8.4s\n"
1682       "fcvtzs v0.4s, v0.4s\n"
1683       "fcvtzs v1.4s, v1.4s\n"
1684       "fcvtzs v2.4s, v2.4s\n"
1685       "fcvtzs v3.4s, v3.4s\n"
1686       "sqxtn v0.4h, v0.4s\n"
1687       "sqxtn2 v0.8h, v1.4s\n"
1688       "sqxtn v2.4h, v2.4s\n"
1689       "sqxtn2 v2.8h, v3.4s\n"
1690       "sqxtun v0.8b, v0.8h\n"
1691       "sqxtun2 v0.16b, v2.8h\n"
1692 
1693       "st1 {v0.2s}, [%x[output]], #8\n"
1694       "st1 {v0.s}[2], [%x[output]], #4\n"
1695       "st1 {v0.h}[6], [%x[output]], #2\n"
1696       "st1 {v0.b}[14], [%x[output]], #1\n"
1697       "prfm pldl1keep, [%x[output]]\n"
1698       : [count] "+r"(params_count_copy), [input] "+r"(input),
1699         [output] "+r"(output)
1700       : [input_range_min] "r"(params.input_range_min),
1701         [output_range_min] "r"(params.output_range_min),
1702         [input_range_offset] "r"(params.input_range_offset),
1703         [one_over_output_range_scale] "r"(params.one_over_output_range_scale),
1704         [input_range_scale] "r"(params.input_range_scale)
1705       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "cc", "memory");
1706 }
1707 
1708 template <>
Transform(const float * input,const Quantize & params,uint8_t * output)1709 inline void Transform1DKernel<float, uint8_t, Quantize, 16, 0>::Transform(
1710     const float* input, const Quantize& params, uint8_t* output) {
1711 #ifdef DEBUG
1712 #ifdef DEBUG_METAGEMM_VERBOSE
1713   std::cout << __FILE__ << "(" << __LINE__
1714             << ") Quantize<float, uint8_t, Quantize, 16, 0>::Transform()"
1715             << std::endl
1716             << std::flush;
1717 #endif
1718 #endif
1719   int params_count_copy = params.count;
1720   asm volatile(
1721 
1722       // Quantize::Prepare
1723       "dup v4.4s, %w[range_min]\n"
1724       "dup v5.4s, %w[range_offset]\n"
1725       "dup v6.4s, %w[range_scale]\n"
1726 
1727       "1:"
1728       "subs %x[count], %x[count], #16\n"
1729 
1730       // Quantize::Transform
1731       "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
1732       "prfm pldl1keep, [%x[input], #64]\n"
1733       "fsub v0.4s, v0.4s, v4.4s\n"
1734       "fsub v1.4s, v1.4s, v4.4s\n"
1735       "fsub v2.4s, v2.4s, v4.4s\n"
1736       "fsub v3.4s, v3.4s, v4.4s\n"
1737       "fmul v0.4s, v0.4s, v6.4s\n"
1738       "fmul v1.4s, v1.4s, v6.4s\n"
1739       "fmul v2.4s, v2.4s, v6.4s\n"
1740       "fmul v3.4s, v3.4s, v6.4s\n"
1741       "fadd v0.4s, v0.4s, v5.4s\n"
1742       "fadd v1.4s, v1.4s, v5.4s\n"
1743       "fadd v2.4s, v2.4s, v5.4s\n"
1744       "fadd v3.4s, v3.4s, v5.4s\n"
1745       "fcvtzs v0.4s, v0.4s\n"
1746       "fcvtzs v1.4s, v1.4s\n"
1747       "fcvtzs v2.4s, v2.4s\n"
1748       "fcvtzs v3.4s, v3.4s\n"
1749       "sqxtn v0.4h, v0.4s\n"
1750       "sqxtn2 v0.8h, v1.4s\n"
1751       "sqxtn v2.4h, v2.4s\n"
1752       "sqxtn2 v2.8h, v3.4s\n"
1753       "sqxtun v0.8b, v0.8h\n"
1754       "sqxtun2 v0.16b, v2.8h\n"
1755 
1756       "st1 {v0.4s}, [%x[output]], #16\n"
1757       "prfm pldl1keep, [%x[output]]\n"
1758 
1759       "bne 1b\n"
1760       : [count] "+r"(params_count_copy), [input] "+r"(input),
1761         [output] "+r"(output)
1762       : [range_offset] "r"(params.range_offset),
1763         [range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
1764       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
1765 }
1766 
1767 template <>
Transform(const float * input,const Quantize & params,uint8_t * output)1768 inline void Transform1DKernel<float, uint8_t, Quantize, 16, 1>::Transform(
1769     const float* input, const Quantize& params, uint8_t* output) {
1770 #ifdef DEBUG
1771 #ifdef DEBUG_METAGEMM_VERBOSE
1772   std::cout << __FILE__ << "(" << __LINE__
1773             << ") Quantize<float, uint8_t, Quantize, 16, 1>::Transform()"
1774             << std::endl
1775             << std::flush;
1776 #endif
1777 #endif
1778   int params_count_copy = params.count;
1779   asm volatile(
1780 
1781       // Quantize::Prepare
1782       "dup v4.4s, %w[range_min]\n"
1783       "dup v5.4s, %w[range_offset]\n"
1784       "dup v6.4s, %w[range_scale]\n"
1785 
1786       // Reduce count by leftovers.
1787       "subs %x[count], %x[count], #1\n"
1788       "beq 2f\n"
1789 
1790       "1:"
1791       "subs %x[count], %x[count], #16\n"
1792 
1793       // Quantize::Transform
1794       "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
1795       "prfm pldl1keep, [%x[input], #64]\n"
1796       "fsub v0.4s, v0.4s, v4.4s\n"
1797       "fsub v1.4s, v1.4s, v4.4s\n"
1798       "fsub v2.4s, v2.4s, v4.4s\n"
1799       "fsub v3.4s, v3.4s, v4.4s\n"
1800       "fmul v0.4s, v0.4s, v6.4s\n"
1801       "fmul v1.4s, v1.4s, v6.4s\n"
1802       "fmul v2.4s, v2.4s, v6.4s\n"
1803       "fmul v3.4s, v3.4s, v6.4s\n"
1804       "fadd v0.4s, v0.4s, v5.4s\n"
1805       "fadd v1.4s, v1.4s, v5.4s\n"
1806       "fadd v2.4s, v2.4s, v5.4s\n"
1807       "fadd v3.4s, v3.4s, v5.4s\n"
1808       "fcvtzs v0.4s, v0.4s\n"
1809       "fcvtzs v1.4s, v1.4s\n"
1810       "fcvtzs v2.4s, v2.4s\n"
1811       "fcvtzs v3.4s, v3.4s\n"
1812       "sqxtn v0.4h, v0.4s\n"
1813       "sqxtn2 v0.8h, v1.4s\n"
1814       "sqxtn v2.4h, v2.4s\n"
1815       "sqxtn2 v2.8h, v3.4s\n"
1816       "sqxtun v0.8b, v0.8h\n"
1817       "sqxtun2 v0.16b, v2.8h\n"
1818 
1819       "st1 {v0.4s}, [%x[output]], #16\n"
1820       "prfm pldl1keep, [%x[output]]\n"
1821 
1822       "bne 1b\n"
1823       "2:"
1824 
1825       // Handle leftovers.
1826 
1827       // Quantize::Transform
1828       "ld1 {v0.s}[0], [%x[input]], #4\n"
1829       "prfm pldl1keep, [%x[input], #64]\n"
1830       "fsub v0.4s, v0.4s, v4.4s\n"
1831       "fmul v0.4s, v0.4s, v6.4s\n"
1832       "fadd v0.4s, v0.4s, v5.4s\n"
1833       "fcvtzs v0.4s, v0.4s\n"
1834       "sqxtn v0.4h, v0.4s\n"
1835       "sqxtun v0.8b, v0.8h\n"
1836 
1837       "st1 {v0.b}[0], [%x[output]], #1\n"
1838       "prfm pldl1keep, [%x[output]]\n"
1839       : [count] "+r"(params_count_copy), [input] "+r"(input),
1840         [output] "+r"(output)
1841       : [range_offset] "r"(params.range_offset),
1842         [range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
1843       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
1844 }
1845 
1846 template <>
Transform(const float * input,const Quantize & params,uint8_t * output)1847 inline void Transform1DKernel<float, uint8_t, Quantize, 16, 2>::Transform(
1848     const float* input, const Quantize& params, uint8_t* output) {
1849 #ifdef DEBUG
1850 #ifdef DEBUG_METAGEMM_VERBOSE
1851   std::cout << __FILE__ << "(" << __LINE__
1852             << ") Quantize<float, uint8_t, Quantize, 16, 2>::Transform()"
1853             << std::endl
1854             << std::flush;
1855 #endif
1856 #endif
1857   int params_count_copy = params.count;
1858   asm volatile(
1859 
1860       // Quantize::Prepare
1861       "dup v4.4s, %w[range_min]\n"
1862       "dup v5.4s, %w[range_offset]\n"
1863       "dup v6.4s, %w[range_scale]\n"
1864 
1865       // Reduce count by leftovers.
1866       "subs %x[count], %x[count], #2\n"
1867       "beq 2f\n"
1868 
1869       "1:"
1870       "subs %x[count], %x[count], #16\n"
1871 
1872       // Quantize::Transform
1873       "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
1874       "prfm pldl1keep, [%x[input], #64]\n"
1875       "fsub v0.4s, v0.4s, v4.4s\n"
1876       "fsub v1.4s, v1.4s, v4.4s\n"
1877       "fsub v2.4s, v2.4s, v4.4s\n"
1878       "fsub v3.4s, v3.4s, v4.4s\n"
1879       "fmul v0.4s, v0.4s, v6.4s\n"
1880       "fmul v1.4s, v1.4s, v6.4s\n"
1881       "fmul v2.4s, v2.4s, v6.4s\n"
1882       "fmul v3.4s, v3.4s, v6.4s\n"
1883       "fadd v0.4s, v0.4s, v5.4s\n"
1884       "fadd v1.4s, v1.4s, v5.4s\n"
1885       "fadd v2.4s, v2.4s, v5.4s\n"
1886       "fadd v3.4s, v3.4s, v5.4s\n"
1887       "fcvtzs v0.4s, v0.4s\n"
1888       "fcvtzs v1.4s, v1.4s\n"
1889       "fcvtzs v2.4s, v2.4s\n"
1890       "fcvtzs v3.4s, v3.4s\n"
1891       "sqxtn v0.4h, v0.4s\n"
1892       "sqxtn2 v0.8h, v1.4s\n"
1893       "sqxtn v2.4h, v2.4s\n"
1894       "sqxtn2 v2.8h, v3.4s\n"
1895       "sqxtun v0.8b, v0.8h\n"
1896       "sqxtun2 v0.16b, v2.8h\n"
1897 
1898       "st1 {v0.4s}, [%x[output]], #16\n"
1899       "prfm pldl1keep, [%x[output]]\n"
1900 
1901       "bne 1b\n"
1902       "2:"
1903 
1904       // Handle leftovers.
1905 
1906       // Quantize::Transform
1907       "ld1 {v0.2s}, [%x[input]], #8\n"
1908       "prfm pldl1keep, [%x[input], #64]\n"
1909       "fsub v0.4s, v0.4s, v4.4s\n"
1910       "fmul v0.4s, v0.4s, v6.4s\n"
1911       "fadd v0.4s, v0.4s, v5.4s\n"
1912       "fcvtzs v0.4s, v0.4s\n"
1913       "sqxtn v0.4h, v0.4s\n"
1914       "sqxtun v0.8b, v0.8h\n"
1915 
1916       "st1 {v0.h}[0], [%x[output]], #2\n"
1917       "prfm pldl1keep, [%x[output]]\n"
1918       : [count] "+r"(params_count_copy), [input] "+r"(input),
1919         [output] "+r"(output)
1920       : [range_offset] "r"(params.range_offset),
1921         [range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
1922       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
1923 }
1924 
1925 template <>
Transform(const float * input,const Quantize & params,uint8_t * output)1926 inline void Transform1DKernel<float, uint8_t, Quantize, 16, 3>::Transform(
1927     const float* input, const Quantize& params, uint8_t* output) {
1928 #ifdef DEBUG
1929 #ifdef DEBUG_METAGEMM_VERBOSE
1930   std::cout << __FILE__ << "(" << __LINE__
1931             << ") Quantize<float, uint8_t, Quantize, 16, 3>::Transform()"
1932             << std::endl
1933             << std::flush;
1934 #endif
1935 #endif
1936   int params_count_copy = params.count;
1937   asm volatile(
1938 
1939       // Quantize::Prepare
1940       "dup v4.4s, %w[range_min]\n"
1941       "dup v5.4s, %w[range_offset]\n"
1942       "dup v6.4s, %w[range_scale]\n"
1943 
1944       // Reduce count by leftovers.
1945       "subs %x[count], %x[count], #3\n"
1946       "beq 2f\n"
1947 
1948       "1:"
1949       "subs %x[count], %x[count], #16\n"
1950 
1951       // Quantize::Transform
1952       "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
1953       "prfm pldl1keep, [%x[input], #64]\n"
1954       "fsub v0.4s, v0.4s, v4.4s\n"
1955       "fsub v1.4s, v1.4s, v4.4s\n"
1956       "fsub v2.4s, v2.4s, v4.4s\n"
1957       "fsub v3.4s, v3.4s, v4.4s\n"
1958       "fmul v0.4s, v0.4s, v6.4s\n"
1959       "fmul v1.4s, v1.4s, v6.4s\n"
1960       "fmul v2.4s, v2.4s, v6.4s\n"
1961       "fmul v3.4s, v3.4s, v6.4s\n"
1962       "fadd v0.4s, v0.4s, v5.4s\n"
1963       "fadd v1.4s, v1.4s, v5.4s\n"
1964       "fadd v2.4s, v2.4s, v5.4s\n"
1965       "fadd v3.4s, v3.4s, v5.4s\n"
1966       "fcvtzs v0.4s, v0.4s\n"
1967       "fcvtzs v1.4s, v1.4s\n"
1968       "fcvtzs v2.4s, v2.4s\n"
1969       "fcvtzs v3.4s, v3.4s\n"
1970       "sqxtn v0.4h, v0.4s\n"
1971       "sqxtn2 v0.8h, v1.4s\n"
1972       "sqxtn v2.4h, v2.4s\n"
1973       "sqxtn2 v2.8h, v3.4s\n"
1974       "sqxtun v0.8b, v0.8h\n"
1975       "sqxtun2 v0.16b, v2.8h\n"
1976 
1977       "st1 {v0.4s}, [%x[output]], #16\n"
1978       "prfm pldl1keep, [%x[output]]\n"
1979 
1980       "bne 1b\n"
1981       "2:"
1982 
1983       // Handle leftovers.
1984 
1985       // Quantize::Transform
1986       "ld1 {v0.2s}, [%x[input]], #8\n"
1987       "ld1 {v0.s}[2], [%x[input]], #4\n"
1988       "prfm pldl1keep, [%x[input], #64]\n"
1989       "fsub v0.4s, v0.4s, v4.4s\n"
1990       "fmul v0.4s, v0.4s, v6.4s\n"
1991       "fadd v0.4s, v0.4s, v5.4s\n"
1992       "fcvtzs v0.4s, v0.4s\n"
1993       "sqxtn v0.4h, v0.4s\n"
1994       "sqxtun v0.8b, v0.8h\n"
1995 
1996       "st1 {v0.h}[0], [%x[output]], #2\n"
1997       "st1 {v0.b}[2], [%x[output]], #1\n"
1998       "prfm pldl1keep, [%x[output]]\n"
1999       : [count] "+r"(params_count_copy), [input] "+r"(input),
2000         [output] "+r"(output)
2001       : [range_offset] "r"(params.range_offset),
2002         [range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
2003       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
2004 }
2005 
2006 template <>
Transform(const float * input,const Quantize & params,uint8_t * output)2007 inline void Transform1DKernel<float, uint8_t, Quantize, 16, 4>::Transform(
2008     const float* input, const Quantize& params, uint8_t* output) {
2009 #ifdef DEBUG
2010 #ifdef DEBUG_METAGEMM_VERBOSE
2011   std::cout << __FILE__ << "(" << __LINE__
2012             << ") Quantize<float, uint8_t, Quantize, 16, 4>::Transform()"
2013             << std::endl
2014             << std::flush;
2015 #endif
2016 #endif
2017   int params_count_copy = params.count;
2018   asm volatile(
2019 
2020       // Quantize::Prepare
2021       "dup v4.4s, %w[range_min]\n"
2022       "dup v5.4s, %w[range_offset]\n"
2023       "dup v6.4s, %w[range_scale]\n"
2024 
2025       // Reduce count by leftovers.
2026       "subs %x[count], %x[count], #4\n"
2027       "beq 2f\n"
2028 
2029       "1:"
2030       "subs %x[count], %x[count], #16\n"
2031 
2032       // Quantize::Transform
2033       "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
2034       "prfm pldl1keep, [%x[input], #64]\n"
2035       "fsub v0.4s, v0.4s, v4.4s\n"
2036       "fsub v1.4s, v1.4s, v4.4s\n"
2037       "fsub v2.4s, v2.4s, v4.4s\n"
2038       "fsub v3.4s, v3.4s, v4.4s\n"
2039       "fmul v0.4s, v0.4s, v6.4s\n"
2040       "fmul v1.4s, v1.4s, v6.4s\n"
2041       "fmul v2.4s, v2.4s, v6.4s\n"
2042       "fmul v3.4s, v3.4s, v6.4s\n"
2043       "fadd v0.4s, v0.4s, v5.4s\n"
2044       "fadd v1.4s, v1.4s, v5.4s\n"
2045       "fadd v2.4s, v2.4s, v5.4s\n"
2046       "fadd v3.4s, v3.4s, v5.4s\n"
2047       "fcvtzs v0.4s, v0.4s\n"
2048       "fcvtzs v1.4s, v1.4s\n"
2049       "fcvtzs v2.4s, v2.4s\n"
2050       "fcvtzs v3.4s, v3.4s\n"
2051       "sqxtn v0.4h, v0.4s\n"
2052       "sqxtn2 v0.8h, v1.4s\n"
2053       "sqxtn v2.4h, v2.4s\n"
2054       "sqxtn2 v2.8h, v3.4s\n"
2055       "sqxtun v0.8b, v0.8h\n"
2056       "sqxtun2 v0.16b, v2.8h\n"
2057 
2058       "st1 {v0.4s}, [%x[output]], #16\n"
2059       "prfm pldl1keep, [%x[output]]\n"
2060 
2061       "bne 1b\n"
2062       "2:"
2063 
2064       // Handle leftovers.
2065 
2066       // Quantize::Transform
2067       "ld1 {v0.4s}, [%x[input]], #16\n"
2068       "prfm pldl1keep, [%x[input], #64]\n"
2069       "fsub v0.4s, v0.4s, v4.4s\n"
2070       "fmul v0.4s, v0.4s, v6.4s\n"
2071       "fadd v0.4s, v0.4s, v5.4s\n"
2072       "fcvtzs v0.4s, v0.4s\n"
2073       "sqxtn v0.4h, v0.4s\n"
2074       "sqxtun v0.8b, v0.8h\n"
2075 
2076       "st1 {v0.s}[0], [%x[output]], #4\n"
2077       "prfm pldl1keep, [%x[output]]\n"
2078       : [count] "+r"(params_count_copy), [input] "+r"(input),
2079         [output] "+r"(output)
2080       : [range_offset] "r"(params.range_offset),
2081         [range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
2082       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
2083 }
2084 
2085 template <>
Transform(const float * input,const Quantize & params,uint8_t * output)2086 inline void Transform1DKernel<float, uint8_t, Quantize, 16, 5>::Transform(
2087     const float* input, const Quantize& params, uint8_t* output) {
2088 #ifdef DEBUG
2089 #ifdef DEBUG_METAGEMM_VERBOSE
2090   std::cout << __FILE__ << "(" << __LINE__
2091             << ") Quantize<float, uint8_t, Quantize, 16, 5>::Transform()"
2092             << std::endl
2093             << std::flush;
2094 #endif
2095 #endif
2096   int params_count_copy = params.count;
2097   asm volatile(
2098 
2099       // Quantize::Prepare
2100       "dup v4.4s, %w[range_min]\n"
2101       "dup v5.4s, %w[range_offset]\n"
2102       "dup v6.4s, %w[range_scale]\n"
2103 
2104       // Reduce count by leftovers.
2105       "subs %x[count], %x[count], #5\n"
2106       "beq 2f\n"
2107 
2108       "1:"
2109       "subs %x[count], %x[count], #16\n"
2110 
2111       // Quantize::Transform
2112       "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
2113       "prfm pldl1keep, [%x[input], #64]\n"
2114       "fsub v0.4s, v0.4s, v4.4s\n"
2115       "fsub v1.4s, v1.4s, v4.4s\n"
2116       "fsub v2.4s, v2.4s, v4.4s\n"
2117       "fsub v3.4s, v3.4s, v4.4s\n"
2118       "fmul v0.4s, v0.4s, v6.4s\n"
2119       "fmul v1.4s, v1.4s, v6.4s\n"
2120       "fmul v2.4s, v2.4s, v6.4s\n"
2121       "fmul v3.4s, v3.4s, v6.4s\n"
2122       "fadd v0.4s, v0.4s, v5.4s\n"
2123       "fadd v1.4s, v1.4s, v5.4s\n"
2124       "fadd v2.4s, v2.4s, v5.4s\n"
2125       "fadd v3.4s, v3.4s, v5.4s\n"
2126       "fcvtzs v0.4s, v0.4s\n"
2127       "fcvtzs v1.4s, v1.4s\n"
2128       "fcvtzs v2.4s, v2.4s\n"
2129       "fcvtzs v3.4s, v3.4s\n"
2130       "sqxtn v0.4h, v0.4s\n"
2131       "sqxtn2 v0.8h, v1.4s\n"
2132       "sqxtn v2.4h, v2.4s\n"
2133       "sqxtn2 v2.8h, v3.4s\n"
2134       "sqxtun v0.8b, v0.8h\n"
2135       "sqxtun2 v0.16b, v2.8h\n"
2136 
2137       "st1 {v0.4s}, [%x[output]], #16\n"
2138       "prfm pldl1keep, [%x[output]]\n"
2139 
2140       "bne 1b\n"
2141       "2:"
2142 
2143       // Handle leftovers.
2144 
2145       // Quantize::Transform
2146       "ld1 {v0.4s}, [%x[input]], #16\n"
2147       "ld1 {v1.s}[0], [%x[input]], #4\n"
2148       "prfm pldl1keep, [%x[input], #64]\n"
2149       "fsub v0.4s, v0.4s, v4.4s\n"
2150       "fsub v1.4s, v1.4s, v4.4s\n"
2151       "fmul v0.4s, v0.4s, v6.4s\n"
2152       "fmul v1.4s, v1.4s, v6.4s\n"
2153       "fadd v0.4s, v0.4s, v5.4s\n"
2154       "fadd v1.4s, v1.4s, v5.4s\n"
2155       "fcvtzs v0.4s, v0.4s\n"
2156       "fcvtzs v1.4s, v1.4s\n"
2157       "sqxtn v0.4h, v0.4s\n"
2158       "sqxtn2 v0.8h, v1.4s\n"
2159       "sqxtun v0.8b, v0.8h\n"
2160 
2161       "st1 {v0.s}[0], [%x[output]], #4\n"
2162       "st1 {v0.b}[4], [%x[output]], #1\n"
2163       "prfm pldl1keep, [%x[output]]\n"
2164       : [count] "+r"(params_count_copy), [input] "+r"(input),
2165         [output] "+r"(output)
2166       : [range_offset] "r"(params.range_offset),
2167         [range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
2168       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
2169 }
2170 
2171 template <>
Transform(const float * input,const Quantize & params,uint8_t * output)2172 inline void Transform1DKernel<float, uint8_t, Quantize, 16, 6>::Transform(
2173     const float* input, const Quantize& params, uint8_t* output) {
2174 #ifdef DEBUG
2175 #ifdef DEBUG_METAGEMM_VERBOSE
2176   std::cout << __FILE__ << "(" << __LINE__
2177             << ") Quantize<float, uint8_t, Quantize, 16, 6>::Transform()"
2178             << std::endl
2179             << std::flush;
2180 #endif
2181 #endif
2182   int params_count_copy = params.count;
2183   asm volatile(
2184 
2185       // Quantize::Prepare
2186       "dup v4.4s, %w[range_min]\n"
2187       "dup v5.4s, %w[range_offset]\n"
2188       "dup v6.4s, %w[range_scale]\n"
2189 
2190       // Reduce count by leftovers.
2191       "subs %x[count], %x[count], #6\n"
2192       "beq 2f\n"
2193 
2194       "1:"
2195       "subs %x[count], %x[count], #16\n"
2196 
2197       // Quantize::Transform
2198       "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
2199       "prfm pldl1keep, [%x[input], #64]\n"
2200       "fsub v0.4s, v0.4s, v4.4s\n"
2201       "fsub v1.4s, v1.4s, v4.4s\n"
2202       "fsub v2.4s, v2.4s, v4.4s\n"
2203       "fsub v3.4s, v3.4s, v4.4s\n"
2204       "fmul v0.4s, v0.4s, v6.4s\n"
2205       "fmul v1.4s, v1.4s, v6.4s\n"
2206       "fmul v2.4s, v2.4s, v6.4s\n"
2207       "fmul v3.4s, v3.4s, v6.4s\n"
2208       "fadd v0.4s, v0.4s, v5.4s\n"
2209       "fadd v1.4s, v1.4s, v5.4s\n"
2210       "fadd v2.4s, v2.4s, v5.4s\n"
2211       "fadd v3.4s, v3.4s, v5.4s\n"
2212       "fcvtzs v0.4s, v0.4s\n"
2213       "fcvtzs v1.4s, v1.4s\n"
2214       "fcvtzs v2.4s, v2.4s\n"
2215       "fcvtzs v3.4s, v3.4s\n"
2216       "sqxtn v0.4h, v0.4s\n"
2217       "sqxtn2 v0.8h, v1.4s\n"
2218       "sqxtn v2.4h, v2.4s\n"
2219       "sqxtn2 v2.8h, v3.4s\n"
2220       "sqxtun v0.8b, v0.8h\n"
2221       "sqxtun2 v0.16b, v2.8h\n"
2222 
2223       "st1 {v0.4s}, [%x[output]], #16\n"
2224       "prfm pldl1keep, [%x[output]]\n"
2225 
2226       "bne 1b\n"
2227       "2:"
2228 
2229       // Handle leftovers.
2230 
2231       // Quantize::Transform
2232       "ld1 {v0.4s}, [%x[input]], #16\n"
2233       "ld1 {v1.2s}, [%x[input]], #8\n"
2234       "prfm pldl1keep, [%x[input], #64]\n"
2235       "fsub v0.4s, v0.4s, v4.4s\n"
2236       "fsub v1.4s, v1.4s, v4.4s\n"
2237       "fmul v0.4s, v0.4s, v6.4s\n"
2238       "fmul v1.4s, v1.4s, v6.4s\n"
2239       "fadd v0.4s, v0.4s, v5.4s\n"
2240       "fadd v1.4s, v1.4s, v5.4s\n"
2241       "fcvtzs v0.4s, v0.4s\n"
2242       "fcvtzs v1.4s, v1.4s\n"
2243       "sqxtn v0.4h, v0.4s\n"
2244       "sqxtn2 v0.8h, v1.4s\n"
2245       "sqxtun v0.8b, v0.8h\n"
2246 
2247       "st1 {v0.s}[0], [%x[output]], #4\n"
2248       "st1 {v0.h}[2], [%x[output]], #2\n"
2249       "prfm pldl1keep, [%x[output]]\n"
2250       : [count] "+r"(params_count_copy), [input] "+r"(input),
2251         [output] "+r"(output)
2252       : [range_offset] "r"(params.range_offset),
2253         [range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
2254       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
2255 }
2256 
2257 template <>
Transform(const float * input,const Quantize & params,uint8_t * output)2258 inline void Transform1DKernel<float, uint8_t, Quantize, 16, 7>::Transform(
2259     const float* input, const Quantize& params, uint8_t* output) {
2260 #ifdef DEBUG
2261 #ifdef DEBUG_METAGEMM_VERBOSE
2262   std::cout << __FILE__ << "(" << __LINE__
2263             << ") Quantize<float, uint8_t, Quantize, 16, 7>::Transform()"
2264             << std::endl
2265             << std::flush;
2266 #endif
2267 #endif
2268   int params_count_copy = params.count;
2269   asm volatile(
2270 
2271       // Quantize::Prepare
2272       "dup v4.4s, %w[range_min]\n"
2273       "dup v5.4s, %w[range_offset]\n"
2274       "dup v6.4s, %w[range_scale]\n"
2275 
2276       // Reduce count by leftovers.
2277       "subs %x[count], %x[count], #7\n"
2278       "beq 2f\n"
2279 
2280       "1:"
2281       "subs %x[count], %x[count], #16\n"
2282 
2283       // Quantize::Transform
2284       "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
2285       "prfm pldl1keep, [%x[input], #64]\n"
2286       "fsub v0.4s, v0.4s, v4.4s\n"
2287       "fsub v1.4s, v1.4s, v4.4s\n"
2288       "fsub v2.4s, v2.4s, v4.4s\n"
2289       "fsub v3.4s, v3.4s, v4.4s\n"
2290       "fmul v0.4s, v0.4s, v6.4s\n"
2291       "fmul v1.4s, v1.4s, v6.4s\n"
2292       "fmul v2.4s, v2.4s, v6.4s\n"
2293       "fmul v3.4s, v3.4s, v6.4s\n"
2294       "fadd v0.4s, v0.4s, v5.4s\n"
2295       "fadd v1.4s, v1.4s, v5.4s\n"
2296       "fadd v2.4s, v2.4s, v5.4s\n"
2297       "fadd v3.4s, v3.4s, v5.4s\n"
2298       "fcvtzs v0.4s, v0.4s\n"
2299       "fcvtzs v1.4s, v1.4s\n"
2300       "fcvtzs v2.4s, v2.4s\n"
2301       "fcvtzs v3.4s, v3.4s\n"
2302       "sqxtn v0.4h, v0.4s\n"
2303       "sqxtn2 v0.8h, v1.4s\n"
2304       "sqxtn v2.4h, v2.4s\n"
2305       "sqxtn2 v2.8h, v3.4s\n"
2306       "sqxtun v0.8b, v0.8h\n"
2307       "sqxtun2 v0.16b, v2.8h\n"
2308 
2309       "st1 {v0.4s}, [%x[output]], #16\n"
2310       "prfm pldl1keep, [%x[output]]\n"
2311 
2312       "bne 1b\n"
2313       "2:"
2314 
2315       // Handle leftovers.
2316 
2317       // Quantize::Transform
2318       "ld1 {v0.4s}, [%x[input]], #16\n"
2319       "ld1 {v1.2s}, [%x[input]], #8\n"
2320       "ld1 {v1.s}[2], [%x[input]], #4\n"
2321       "prfm pldl1keep, [%x[input], #64]\n"
2322       "fsub v0.4s, v0.4s, v4.4s\n"
2323       "fsub v1.4s, v1.4s, v4.4s\n"
2324       "fmul v0.4s, v0.4s, v6.4s\n"
2325       "fmul v1.4s, v1.4s, v6.4s\n"
2326       "fadd v0.4s, v0.4s, v5.4s\n"
2327       "fadd v1.4s, v1.4s, v5.4s\n"
2328       "fcvtzs v0.4s, v0.4s\n"
2329       "fcvtzs v1.4s, v1.4s\n"
2330       "sqxtn v0.4h, v0.4s\n"
2331       "sqxtn2 v0.8h, v1.4s\n"
2332       "sqxtun v0.8b, v0.8h\n"
2333 
2334       "st1 {v0.s}[0], [%x[output]], #4\n"
2335       "st1 {v0.h}[2], [%x[output]], #2\n"
2336       "st1 {v0.b}[6], [%x[output]], #1\n"
2337       "prfm pldl1keep, [%x[output]]\n"
2338       : [count] "+r"(params_count_copy), [input] "+r"(input),
2339         [output] "+r"(output)
2340       : [range_offset] "r"(params.range_offset),
2341         [range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
2342       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
2343 }
2344 
2345 template <>
Transform(const float * input,const Quantize & params,uint8_t * output)2346 inline void Transform1DKernel<float, uint8_t, Quantize, 16, 8>::Transform(
2347     const float* input, const Quantize& params, uint8_t* output) {
2348 #ifdef DEBUG
2349 #ifdef DEBUG_METAGEMM_VERBOSE
2350   std::cout << __FILE__ << "(" << __LINE__
2351             << ") Quantize<float, uint8_t, Quantize, 16, 8>::Transform()"
2352             << std::endl
2353             << std::flush;
2354 #endif
2355 #endif
2356   int params_count_copy = params.count;
2357   asm volatile(
2358 
2359       // Quantize::Prepare
2360       "dup v4.4s, %w[range_min]\n"
2361       "dup v5.4s, %w[range_offset]\n"
2362       "dup v6.4s, %w[range_scale]\n"
2363 
2364       // Reduce count by leftovers.
2365       "subs %x[count], %x[count], #8\n"
2366       "beq 2f\n"
2367 
2368       "1:"
2369       "subs %x[count], %x[count], #16\n"
2370 
2371       // Quantize::Transform
2372       "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
2373       "prfm pldl1keep, [%x[input], #64]\n"
2374       "fsub v0.4s, v0.4s, v4.4s\n"
2375       "fsub v1.4s, v1.4s, v4.4s\n"
2376       "fsub v2.4s, v2.4s, v4.4s\n"
2377       "fsub v3.4s, v3.4s, v4.4s\n"
2378       "fmul v0.4s, v0.4s, v6.4s\n"
2379       "fmul v1.4s, v1.4s, v6.4s\n"
2380       "fmul v2.4s, v2.4s, v6.4s\n"
2381       "fmul v3.4s, v3.4s, v6.4s\n"
2382       "fadd v0.4s, v0.4s, v5.4s\n"
2383       "fadd v1.4s, v1.4s, v5.4s\n"
2384       "fadd v2.4s, v2.4s, v5.4s\n"
2385       "fadd v3.4s, v3.4s, v5.4s\n"
2386       "fcvtzs v0.4s, v0.4s\n"
2387       "fcvtzs v1.4s, v1.4s\n"
2388       "fcvtzs v2.4s, v2.4s\n"
2389       "fcvtzs v3.4s, v3.4s\n"
2390       "sqxtn v0.4h, v0.4s\n"
2391       "sqxtn2 v0.8h, v1.4s\n"
2392       "sqxtn v2.4h, v2.4s\n"
2393       "sqxtn2 v2.8h, v3.4s\n"
2394       "sqxtun v0.8b, v0.8h\n"
2395       "sqxtun2 v0.16b, v2.8h\n"
2396 
2397       "st1 {v0.4s}, [%x[output]], #16\n"
2398       "prfm pldl1keep, [%x[output]]\n"
2399 
2400       "bne 1b\n"
2401       "2:"
2402 
2403       // Handle leftovers.
2404 
2405       // Quantize::Transform
2406       "ld1 {v0.4s, v1.4s}, [%x[input]], #32\n"
2407       "prfm pldl1keep, [%x[input], #64]\n"
2408       "fsub v0.4s, v0.4s, v4.4s\n"
2409       "fsub v1.4s, v1.4s, v4.4s\n"
2410       "fmul v0.4s, v0.4s, v6.4s\n"
2411       "fmul v1.4s, v1.4s, v6.4s\n"
2412       "fadd v0.4s, v0.4s, v5.4s\n"
2413       "fadd v1.4s, v1.4s, v5.4s\n"
2414       "fcvtzs v0.4s, v0.4s\n"
2415       "fcvtzs v1.4s, v1.4s\n"
2416       "sqxtn v0.4h, v0.4s\n"
2417       "sqxtn2 v0.8h, v1.4s\n"
2418       "sqxtun v0.8b, v0.8h\n"
2419 
2420       "st1 {v0.2s}, [%x[output]], #8\n"
2421       "prfm pldl1keep, [%x[output]]\n"
2422       : [count] "+r"(params_count_copy), [input] "+r"(input),
2423         [output] "+r"(output)
2424       : [range_offset] "r"(params.range_offset),
2425         [range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
2426       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
2427 }
2428 
2429 template <>
Transform(const float * input,const Quantize & params,uint8_t * output)2430 inline void Transform1DKernel<float, uint8_t, Quantize, 16, 9>::Transform(
2431     const float* input, const Quantize& params, uint8_t* output) {
2432 #ifdef DEBUG
2433 #ifdef DEBUG_METAGEMM_VERBOSE
2434   std::cout << __FILE__ << "(" << __LINE__
2435             << ") Quantize<float, uint8_t, Quantize, 16, 9>::Transform()"
2436             << std::endl
2437             << std::flush;
2438 #endif
2439 #endif
2440   int params_count_copy = params.count;
2441   asm volatile(
2442 
2443       // Quantize::Prepare
2444       "dup v4.4s, %w[range_min]\n"
2445       "dup v5.4s, %w[range_offset]\n"
2446       "dup v6.4s, %w[range_scale]\n"
2447 
2448       // Reduce count by leftovers.
2449       "subs %x[count], %x[count], #9\n"
2450       "beq 2f\n"
2451 
2452       "1:"
2453       "subs %x[count], %x[count], #16\n"
2454 
2455       // Quantize::Transform
2456       "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
2457       "prfm pldl1keep, [%x[input], #64]\n"
2458       "fsub v0.4s, v0.4s, v4.4s\n"
2459       "fsub v1.4s, v1.4s, v4.4s\n"
2460       "fsub v2.4s, v2.4s, v4.4s\n"
2461       "fsub v3.4s, v3.4s, v4.4s\n"
2462       "fmul v0.4s, v0.4s, v6.4s\n"
2463       "fmul v1.4s, v1.4s, v6.4s\n"
2464       "fmul v2.4s, v2.4s, v6.4s\n"
2465       "fmul v3.4s, v3.4s, v6.4s\n"
2466       "fadd v0.4s, v0.4s, v5.4s\n"
2467       "fadd v1.4s, v1.4s, v5.4s\n"
2468       "fadd v2.4s, v2.4s, v5.4s\n"
2469       "fadd v3.4s, v3.4s, v5.4s\n"
2470       "fcvtzs v0.4s, v0.4s\n"
2471       "fcvtzs v1.4s, v1.4s\n"
2472       "fcvtzs v2.4s, v2.4s\n"
2473       "fcvtzs v3.4s, v3.4s\n"
2474       "sqxtn v0.4h, v0.4s\n"
2475       "sqxtn2 v0.8h, v1.4s\n"
2476       "sqxtn v2.4h, v2.4s\n"
2477       "sqxtn2 v2.8h, v3.4s\n"
2478       "sqxtun v0.8b, v0.8h\n"
2479       "sqxtun2 v0.16b, v2.8h\n"
2480 
2481       "st1 {v0.4s}, [%x[output]], #16\n"
2482       "prfm pldl1keep, [%x[output]]\n"
2483 
2484       "bne 1b\n"
2485       "2:"
2486 
2487       // Handle leftovers.
2488 
2489       // Quantize::Transform
2490       "ld1 {v0.4s, v1.4s}, [%x[input]], #32\n"
2491       "ld1 {v2.s}[0], [%x[input]], #4\n"
2492       "prfm pldl1keep, [%x[input], #64]\n"
2493       "fsub v0.4s, v0.4s, v4.4s\n"
2494       "fsub v1.4s, v1.4s, v4.4s\n"
2495       "fsub v2.4s, v2.4s, v4.4s\n"
2496       "fmul v0.4s, v0.4s, v6.4s\n"
2497       "fmul v1.4s, v1.4s, v6.4s\n"
2498       "fmul v2.4s, v2.4s, v6.4s\n"
2499       "fadd v0.4s, v0.4s, v5.4s\n"
2500       "fadd v1.4s, v1.4s, v5.4s\n"
2501       "fadd v2.4s, v2.4s, v5.4s\n"
2502       "fcvtzs v0.4s, v0.4s\n"
2503       "fcvtzs v1.4s, v1.4s\n"
2504       "fcvtzs v2.4s, v2.4s\n"
2505       "sqxtn v0.4h, v0.4s\n"
2506       "sqxtn2 v0.8h, v1.4s\n"
2507       "sqxtn v2.4h, v2.4s\n"
2508       "sqxtun v0.8b, v0.8h\n"
2509       "sqxtun2 v0.16b, v2.8h\n"
2510 
2511       "st1 {v0.2s}, [%x[output]], #8\n"
2512       "st1 {v0.b}[8], [%x[output]], #1\n"
2513       "prfm pldl1keep, [%x[output]]\n"
2514       : [count] "+r"(params_count_copy), [input] "+r"(input),
2515         [output] "+r"(output)
2516       : [range_offset] "r"(params.range_offset),
2517         [range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
2518       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
2519 }
2520 
2521 template <>
Transform(const float * input,const Quantize & params,uint8_t * output)2522 inline void Transform1DKernel<float, uint8_t, Quantize, 16, 10>::Transform(
2523     const float* input, const Quantize& params, uint8_t* output) {
2524 #ifdef DEBUG
2525 #ifdef DEBUG_METAGEMM_VERBOSE
2526   std::cout << __FILE__ << "(" << __LINE__
2527             << ") Quantize<float, uint8_t, Quantize, 16, 10>::Transform()"
2528             << std::endl
2529             << std::flush;
2530 #endif
2531 #endif
2532   int params_count_copy = params.count;
2533   asm volatile(
2534 
2535       // Quantize::Prepare
2536       "dup v4.4s, %w[range_min]\n"
2537       "dup v5.4s, %w[range_offset]\n"
2538       "dup v6.4s, %w[range_scale]\n"
2539 
2540       // Reduce count by leftovers.
2541       "subs %x[count], %x[count], #10\n"
2542       "beq 2f\n"
2543 
2544       "1:"
2545       "subs %x[count], %x[count], #16\n"
2546 
2547       // Quantize::Transform
2548       "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
2549       "prfm pldl1keep, [%x[input], #64]\n"
2550       "fsub v0.4s, v0.4s, v4.4s\n"
2551       "fsub v1.4s, v1.4s, v4.4s\n"
2552       "fsub v2.4s, v2.4s, v4.4s\n"
2553       "fsub v3.4s, v3.4s, v4.4s\n"
2554       "fmul v0.4s, v0.4s, v6.4s\n"
2555       "fmul v1.4s, v1.4s, v6.4s\n"
2556       "fmul v2.4s, v2.4s, v6.4s\n"
2557       "fmul v3.4s, v3.4s, v6.4s\n"
2558       "fadd v0.4s, v0.4s, v5.4s\n"
2559       "fadd v1.4s, v1.4s, v5.4s\n"
2560       "fadd v2.4s, v2.4s, v5.4s\n"
2561       "fadd v3.4s, v3.4s, v5.4s\n"
2562       "fcvtzs v0.4s, v0.4s\n"
2563       "fcvtzs v1.4s, v1.4s\n"
2564       "fcvtzs v2.4s, v2.4s\n"
2565       "fcvtzs v3.4s, v3.4s\n"
2566       "sqxtn v0.4h, v0.4s\n"
2567       "sqxtn2 v0.8h, v1.4s\n"
2568       "sqxtn v2.4h, v2.4s\n"
2569       "sqxtn2 v2.8h, v3.4s\n"
2570       "sqxtun v0.8b, v0.8h\n"
2571       "sqxtun2 v0.16b, v2.8h\n"
2572 
2573       "st1 {v0.4s}, [%x[output]], #16\n"
2574       "prfm pldl1keep, [%x[output]]\n"
2575 
2576       "bne 1b\n"
2577       "2:"
2578 
2579       // Handle leftovers.
2580 
2581       // Quantize::Transform
2582       "ld1 {v0.4s, v1.4s}, [%x[input]], #32\n"
2583       "ld1 {v2.2s}, [%x[input]], #8\n"
2584       "prfm pldl1keep, [%x[input], #64]\n"
2585       "fsub v0.4s, v0.4s, v4.4s\n"
2586       "fsub v1.4s, v1.4s, v4.4s\n"
2587       "fsub v2.4s, v2.4s, v4.4s\n"
2588       "fmul v0.4s, v0.4s, v6.4s\n"
2589       "fmul v1.4s, v1.4s, v6.4s\n"
2590       "fmul v2.4s, v2.4s, v6.4s\n"
2591       "fadd v0.4s, v0.4s, v5.4s\n"
2592       "fadd v1.4s, v1.4s, v5.4s\n"
2593       "fadd v2.4s, v2.4s, v5.4s\n"
2594       "fcvtzs v0.4s, v0.4s\n"
2595       "fcvtzs v1.4s, v1.4s\n"
2596       "fcvtzs v2.4s, v2.4s\n"
2597       "sqxtn v0.4h, v0.4s\n"
2598       "sqxtn2 v0.8h, v1.4s\n"
2599       "sqxtn v2.4h, v2.4s\n"
2600       "sqxtun v0.8b, v0.8h\n"
2601       "sqxtun2 v0.16b, v2.8h\n"
2602 
2603       "st1 {v0.2s}, [%x[output]], #8\n"
2604       "st1 {v0.h}[4], [%x[output]], #2\n"
2605       "prfm pldl1keep, [%x[output]]\n"
2606       : [count] "+r"(params_count_copy), [input] "+r"(input),
2607         [output] "+r"(output)
2608       : [range_offset] "r"(params.range_offset),
2609         [range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
2610       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
2611 }
2612 
2613 template <>
Transform(const float * input,const Quantize & params,uint8_t * output)2614 inline void Transform1DKernel<float, uint8_t, Quantize, 16, 11>::Transform(
2615     const float* input, const Quantize& params, uint8_t* output) {
2616 #ifdef DEBUG
2617 #ifdef DEBUG_METAGEMM_VERBOSE
2618   std::cout << __FILE__ << "(" << __LINE__
2619             << ") Quantize<float, uint8_t, Quantize, 16, 11>::Transform()"
2620             << std::endl
2621             << std::flush;
2622 #endif
2623 #endif
2624   int params_count_copy = params.count;
2625   asm volatile(
2626 
2627       // Quantize::Prepare
2628       "dup v4.4s, %w[range_min]\n"
2629       "dup v5.4s, %w[range_offset]\n"
2630       "dup v6.4s, %w[range_scale]\n"
2631 
2632       // Reduce count by leftovers.
2633       "subs %x[count], %x[count], #11\n"
2634       "beq 2f\n"
2635 
2636       "1:"
2637       "subs %x[count], %x[count], #16\n"
2638 
2639       // Quantize::Transform
2640       "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
2641       "prfm pldl1keep, [%x[input], #64]\n"
2642       "fsub v0.4s, v0.4s, v4.4s\n"
2643       "fsub v1.4s, v1.4s, v4.4s\n"
2644       "fsub v2.4s, v2.4s, v4.4s\n"
2645       "fsub v3.4s, v3.4s, v4.4s\n"
2646       "fmul v0.4s, v0.4s, v6.4s\n"
2647       "fmul v1.4s, v1.4s, v6.4s\n"
2648       "fmul v2.4s, v2.4s, v6.4s\n"
2649       "fmul v3.4s, v3.4s, v6.4s\n"
2650       "fadd v0.4s, v0.4s, v5.4s\n"
2651       "fadd v1.4s, v1.4s, v5.4s\n"
2652       "fadd v2.4s, v2.4s, v5.4s\n"
2653       "fadd v3.4s, v3.4s, v5.4s\n"
2654       "fcvtzs v0.4s, v0.4s\n"
2655       "fcvtzs v1.4s, v1.4s\n"
2656       "fcvtzs v2.4s, v2.4s\n"
2657       "fcvtzs v3.4s, v3.4s\n"
2658       "sqxtn v0.4h, v0.4s\n"
2659       "sqxtn2 v0.8h, v1.4s\n"
2660       "sqxtn v2.4h, v2.4s\n"
2661       "sqxtn2 v2.8h, v3.4s\n"
2662       "sqxtun v0.8b, v0.8h\n"
2663       "sqxtun2 v0.16b, v2.8h\n"
2664 
2665       "st1 {v0.4s}, [%x[output]], #16\n"
2666       "prfm pldl1keep, [%x[output]]\n"
2667 
2668       "bne 1b\n"
2669       "2:"
2670 
2671       // Handle leftovers.
2672 
2673       // Quantize::Transform
2674       "ld1 {v0.4s, v1.4s}, [%x[input]], #32\n"
2675       "ld1 {v2.2s}, [%x[input]], #8\n"
2676       "ld1 {v2.s}[2], [%x[input]], #4\n"
2677       "prfm pldl1keep, [%x[input], #64]\n"
2678       "fsub v0.4s, v0.4s, v4.4s\n"
2679       "fsub v1.4s, v1.4s, v4.4s\n"
2680       "fsub v2.4s, v2.4s, v4.4s\n"
2681       "fmul v0.4s, v0.4s, v6.4s\n"
2682       "fmul v1.4s, v1.4s, v6.4s\n"
2683       "fmul v2.4s, v2.4s, v6.4s\n"
2684       "fadd v0.4s, v0.4s, v5.4s\n"
2685       "fadd v1.4s, v1.4s, v5.4s\n"
2686       "fadd v2.4s, v2.4s, v5.4s\n"
2687       "fcvtzs v0.4s, v0.4s\n"
2688       "fcvtzs v1.4s, v1.4s\n"
2689       "fcvtzs v2.4s, v2.4s\n"
2690       "sqxtn v0.4h, v0.4s\n"
2691       "sqxtn2 v0.8h, v1.4s\n"
2692       "sqxtn v2.4h, v2.4s\n"
2693       "sqxtun v0.8b, v0.8h\n"
2694       "sqxtun2 v0.16b, v2.8h\n"
2695 
2696       "st1 {v0.2s}, [%x[output]], #8\n"
2697       "st1 {v0.h}[4], [%x[output]], #2\n"
2698       "st1 {v0.b}[10], [%x[output]], #1\n"
2699       "prfm pldl1keep, [%x[output]]\n"
2700       : [count] "+r"(params_count_copy), [input] "+r"(input),
2701         [output] "+r"(output)
2702       : [range_offset] "r"(params.range_offset),
2703         [range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
2704       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
2705 }
2706 
2707 template <>
Transform(const float * input,const Quantize & params,uint8_t * output)2708 inline void Transform1DKernel<float, uint8_t, Quantize, 16, 12>::Transform(
2709     const float* input, const Quantize& params, uint8_t* output) {
2710 #ifdef DEBUG
2711 #ifdef DEBUG_METAGEMM_VERBOSE
2712   std::cout << __FILE__ << "(" << __LINE__
2713             << ") Quantize<float, uint8_t, Quantize, 16, 12>::Transform()"
2714             << std::endl
2715             << std::flush;
2716 #endif
2717 #endif
2718   int params_count_copy = params.count;
2719   asm volatile(
2720 
2721       // Quantize::Prepare
2722       "dup v4.4s, %w[range_min]\n"
2723       "dup v5.4s, %w[range_offset]\n"
2724       "dup v6.4s, %w[range_scale]\n"
2725 
2726       // Reduce count by leftovers.
2727       "subs %x[count], %x[count], #12\n"
2728       "beq 2f\n"
2729 
2730       "1:"
2731       "subs %x[count], %x[count], #16\n"
2732 
2733       // Quantize::Transform
2734       "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
2735       "prfm pldl1keep, [%x[input], #64]\n"
2736       "fsub v0.4s, v0.4s, v4.4s\n"
2737       "fsub v1.4s, v1.4s, v4.4s\n"
2738       "fsub v2.4s, v2.4s, v4.4s\n"
2739       "fsub v3.4s, v3.4s, v4.4s\n"
2740       "fmul v0.4s, v0.4s, v6.4s\n"
2741       "fmul v1.4s, v1.4s, v6.4s\n"
2742       "fmul v2.4s, v2.4s, v6.4s\n"
2743       "fmul v3.4s, v3.4s, v6.4s\n"
2744       "fadd v0.4s, v0.4s, v5.4s\n"
2745       "fadd v1.4s, v1.4s, v5.4s\n"
2746       "fadd v2.4s, v2.4s, v5.4s\n"
2747       "fadd v3.4s, v3.4s, v5.4s\n"
2748       "fcvtzs v0.4s, v0.4s\n"
2749       "fcvtzs v1.4s, v1.4s\n"
2750       "fcvtzs v2.4s, v2.4s\n"
2751       "fcvtzs v3.4s, v3.4s\n"
2752       "sqxtn v0.4h, v0.4s\n"
2753       "sqxtn2 v0.8h, v1.4s\n"
2754       "sqxtn v2.4h, v2.4s\n"
2755       "sqxtn2 v2.8h, v3.4s\n"
2756       "sqxtun v0.8b, v0.8h\n"
2757       "sqxtun2 v0.16b, v2.8h\n"
2758 
2759       "st1 {v0.4s}, [%x[output]], #16\n"
2760       "prfm pldl1keep, [%x[output]]\n"
2761 
2762       "bne 1b\n"
2763       "2:"
2764 
2765       // Handle leftovers.
2766 
2767       // Quantize::Transform
2768       "ld1 {v0.4s, v1.4s, v2.4s}, [%x[input]], #48\n"
2769       "prfm pldl1keep, [%x[input], #64]\n"
2770       "fsub v0.4s, v0.4s, v4.4s\n"
2771       "fsub v1.4s, v1.4s, v4.4s\n"
2772       "fsub v2.4s, v2.4s, v4.4s\n"
2773       "fmul v0.4s, v0.4s, v6.4s\n"
2774       "fmul v1.4s, v1.4s, v6.4s\n"
2775       "fmul v2.4s, v2.4s, v6.4s\n"
2776       "fadd v0.4s, v0.4s, v5.4s\n"
2777       "fadd v1.4s, v1.4s, v5.4s\n"
2778       "fadd v2.4s, v2.4s, v5.4s\n"
2779       "fcvtzs v0.4s, v0.4s\n"
2780       "fcvtzs v1.4s, v1.4s\n"
2781       "fcvtzs v2.4s, v2.4s\n"
2782       "sqxtn v0.4h, v0.4s\n"
2783       "sqxtn2 v0.8h, v1.4s\n"
2784       "sqxtn v2.4h, v2.4s\n"
2785       "sqxtun v0.8b, v0.8h\n"
2786       "sqxtun2 v0.16b, v2.8h\n"
2787 
2788       "st1 {v0.2s}, [%x[output]], #8\n"
2789       "st1 {v0.s}[2], [%x[output]], #4\n"
2790       "prfm pldl1keep, [%x[output]]\n"
2791       : [count] "+r"(params_count_copy), [input] "+r"(input),
2792         [output] "+r"(output)
2793       : [range_offset] "r"(params.range_offset),
2794         [range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
2795       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
2796 }
2797 
2798 template <>
Transform(const float * input,const Quantize & params,uint8_t * output)2799 inline void Transform1DKernel<float, uint8_t, Quantize, 16, 13>::Transform(
2800     const float* input, const Quantize& params, uint8_t* output) {
2801 #ifdef DEBUG
2802 #ifdef DEBUG_METAGEMM_VERBOSE
2803   std::cout << __FILE__ << "(" << __LINE__
2804             << ") Quantize<float, uint8_t, Quantize, 16, 13>::Transform()"
2805             << std::endl
2806             << std::flush;
2807 #endif
2808 #endif
2809   int params_count_copy = params.count;
2810   asm volatile(
2811 
2812       // Quantize::Prepare
2813       "dup v4.4s, %w[range_min]\n"
2814       "dup v5.4s, %w[range_offset]\n"
2815       "dup v6.4s, %w[range_scale]\n"
2816 
2817       // Reduce count by leftovers.
2818       "subs %x[count], %x[count], #13\n"
2819       "beq 2f\n"
2820 
2821       "1:"
2822       "subs %x[count], %x[count], #16\n"
2823 
2824       // Quantize::Transform
2825       "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
2826       "prfm pldl1keep, [%x[input], #64]\n"
2827       "fsub v0.4s, v0.4s, v4.4s\n"
2828       "fsub v1.4s, v1.4s, v4.4s\n"
2829       "fsub v2.4s, v2.4s, v4.4s\n"
2830       "fsub v3.4s, v3.4s, v4.4s\n"
2831       "fmul v0.4s, v0.4s, v6.4s\n"
2832       "fmul v1.4s, v1.4s, v6.4s\n"
2833       "fmul v2.4s, v2.4s, v6.4s\n"
2834       "fmul v3.4s, v3.4s, v6.4s\n"
2835       "fadd v0.4s, v0.4s, v5.4s\n"
2836       "fadd v1.4s, v1.4s, v5.4s\n"
2837       "fadd v2.4s, v2.4s, v5.4s\n"
2838       "fadd v3.4s, v3.4s, v5.4s\n"
2839       "fcvtzs v0.4s, v0.4s\n"
2840       "fcvtzs v1.4s, v1.4s\n"
2841       "fcvtzs v2.4s, v2.4s\n"
2842       "fcvtzs v3.4s, v3.4s\n"
2843       "sqxtn v0.4h, v0.4s\n"
2844       "sqxtn2 v0.8h, v1.4s\n"
2845       "sqxtn v2.4h, v2.4s\n"
2846       "sqxtn2 v2.8h, v3.4s\n"
2847       "sqxtun v0.8b, v0.8h\n"
2848       "sqxtun2 v0.16b, v2.8h\n"
2849 
2850       "st1 {v0.4s}, [%x[output]], #16\n"
2851       "prfm pldl1keep, [%x[output]]\n"
2852 
2853       "bne 1b\n"
2854       "2:"
2855 
2856       // Handle leftovers.
2857 
2858       // Quantize::Transform
2859       "ld1 {v0.4s, v1.4s, v2.4s}, [%x[input]], #48\n"
2860       "ld1 {v3.s}[0], [%x[input]], #4\n"
2861       "prfm pldl1keep, [%x[input], #64]\n"
2862       "fsub v0.4s, v0.4s, v4.4s\n"
2863       "fsub v1.4s, v1.4s, v4.4s\n"
2864       "fsub v2.4s, v2.4s, v4.4s\n"
2865       "fsub v3.4s, v3.4s, v4.4s\n"
2866       "fmul v0.4s, v0.4s, v6.4s\n"
2867       "fmul v1.4s, v1.4s, v6.4s\n"
2868       "fmul v2.4s, v2.4s, v6.4s\n"
2869       "fmul v3.4s, v3.4s, v6.4s\n"
2870       "fadd v0.4s, v0.4s, v5.4s\n"
2871       "fadd v1.4s, v1.4s, v5.4s\n"
2872       "fadd v2.4s, v2.4s, v5.4s\n"
2873       "fadd v3.4s, v3.4s, v5.4s\n"
2874       "fcvtzs v0.4s, v0.4s\n"
2875       "fcvtzs v1.4s, v1.4s\n"
2876       "fcvtzs v2.4s, v2.4s\n"
2877       "fcvtzs v3.4s, v3.4s\n"
2878       "sqxtn v0.4h, v0.4s\n"
2879       "sqxtn2 v0.8h, v1.4s\n"
2880       "sqxtn v2.4h, v2.4s\n"
2881       "sqxtn2 v2.8h, v3.4s\n"
2882       "sqxtun v0.8b, v0.8h\n"
2883       "sqxtun2 v0.16b, v2.8h\n"
2884 
2885       "st1 {v0.2s}, [%x[output]], #8\n"
2886       "st1 {v0.s}[2], [%x[output]], #4\n"
2887       "st1 {v0.b}[12], [%x[output]], #1\n"
2888       "prfm pldl1keep, [%x[output]]\n"
2889       : [count] "+r"(params_count_copy), [input] "+r"(input),
2890         [output] "+r"(output)
2891       : [range_offset] "r"(params.range_offset),
2892         [range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
2893       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
2894 }
2895 
2896 template <>
Transform(const float * input,const Quantize & params,uint8_t * output)2897 inline void Transform1DKernel<float, uint8_t, Quantize, 16, 14>::Transform(
2898     const float* input, const Quantize& params, uint8_t* output) {
2899 #ifdef DEBUG
2900 #ifdef DEBUG_METAGEMM_VERBOSE
2901   std::cout << __FILE__ << "(" << __LINE__
2902             << ") Quantize<float, uint8_t, Quantize, 16, 14>::Transform()"
2903             << std::endl
2904             << std::flush;
2905 #endif
2906 #endif
2907   int params_count_copy = params.count;
2908   asm volatile(
2909 
2910       // Quantize::Prepare
2911       "dup v4.4s, %w[range_min]\n"
2912       "dup v5.4s, %w[range_offset]\n"
2913       "dup v6.4s, %w[range_scale]\n"
2914 
2915       // Reduce count by leftovers.
2916       "subs %x[count], %x[count], #14\n"
2917       "beq 2f\n"
2918 
2919       "1:"
2920       "subs %x[count], %x[count], #16\n"
2921 
2922       // Quantize::Transform
2923       "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
2924       "prfm pldl1keep, [%x[input], #64]\n"
2925       "fsub v0.4s, v0.4s, v4.4s\n"
2926       "fsub v1.4s, v1.4s, v4.4s\n"
2927       "fsub v2.4s, v2.4s, v4.4s\n"
2928       "fsub v3.4s, v3.4s, v4.4s\n"
2929       "fmul v0.4s, v0.4s, v6.4s\n"
2930       "fmul v1.4s, v1.4s, v6.4s\n"
2931       "fmul v2.4s, v2.4s, v6.4s\n"
2932       "fmul v3.4s, v3.4s, v6.4s\n"
2933       "fadd v0.4s, v0.4s, v5.4s\n"
2934       "fadd v1.4s, v1.4s, v5.4s\n"
2935       "fadd v2.4s, v2.4s, v5.4s\n"
2936       "fadd v3.4s, v3.4s, v5.4s\n"
2937       "fcvtzs v0.4s, v0.4s\n"
2938       "fcvtzs v1.4s, v1.4s\n"
2939       "fcvtzs v2.4s, v2.4s\n"
2940       "fcvtzs v3.4s, v3.4s\n"
2941       "sqxtn v0.4h, v0.4s\n"
2942       "sqxtn2 v0.8h, v1.4s\n"
2943       "sqxtn v2.4h, v2.4s\n"
2944       "sqxtn2 v2.8h, v3.4s\n"
2945       "sqxtun v0.8b, v0.8h\n"
2946       "sqxtun2 v0.16b, v2.8h\n"
2947 
2948       "st1 {v0.4s}, [%x[output]], #16\n"
2949       "prfm pldl1keep, [%x[output]]\n"
2950 
2951       "bne 1b\n"
2952       "2:"
2953 
2954       // Handle leftovers.
2955 
2956       // Quantize::Transform
2957       "ld1 {v0.4s, v1.4s, v2.4s}, [%x[input]], #48\n"
2958       "ld1 {v3.2s}, [%x[input]], #8\n"
2959       "prfm pldl1keep, [%x[input], #64]\n"
2960       "fsub v0.4s, v0.4s, v4.4s\n"
2961       "fsub v1.4s, v1.4s, v4.4s\n"
2962       "fsub v2.4s, v2.4s, v4.4s\n"
2963       "fsub v3.4s, v3.4s, v4.4s\n"
2964       "fmul v0.4s, v0.4s, v6.4s\n"
2965       "fmul v1.4s, v1.4s, v6.4s\n"
2966       "fmul v2.4s, v2.4s, v6.4s\n"
2967       "fmul v3.4s, v3.4s, v6.4s\n"
2968       "fadd v0.4s, v0.4s, v5.4s\n"
2969       "fadd v1.4s, v1.4s, v5.4s\n"
2970       "fadd v2.4s, v2.4s, v5.4s\n"
2971       "fadd v3.4s, v3.4s, v5.4s\n"
2972       "fcvtzs v0.4s, v0.4s\n"
2973       "fcvtzs v1.4s, v1.4s\n"
2974       "fcvtzs v2.4s, v2.4s\n"
2975       "fcvtzs v3.4s, v3.4s\n"
2976       "sqxtn v0.4h, v0.4s\n"
2977       "sqxtn2 v0.8h, v1.4s\n"
2978       "sqxtn v2.4h, v2.4s\n"
2979       "sqxtn2 v2.8h, v3.4s\n"
2980       "sqxtun v0.8b, v0.8h\n"
2981       "sqxtun2 v0.16b, v2.8h\n"
2982 
2983       "st1 {v0.2s}, [%x[output]], #8\n"
2984       "st1 {v0.s}[2], [%x[output]], #4\n"
2985       "st1 {v0.h}[6], [%x[output]], #2\n"
2986       "prfm pldl1keep, [%x[output]]\n"
2987       : [count] "+r"(params_count_copy), [input] "+r"(input),
2988         [output] "+r"(output)
2989       : [range_offset] "r"(params.range_offset),
2990         [range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
2991       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
2992 }
2993 
2994 template <>
Transform(const float * input,const Quantize & params,uint8_t * output)2995 inline void Transform1DKernel<float, uint8_t, Quantize, 16, 15>::Transform(
2996     const float* input, const Quantize& params, uint8_t* output) {
2997 #ifdef DEBUG
2998 #ifdef DEBUG_METAGEMM_VERBOSE
2999   std::cout << __FILE__ << "(" << __LINE__
3000             << ") Quantize<float, uint8_t, Quantize, 16, 15>::Transform()"
3001             << std::endl
3002             << std::flush;
3003 #endif
3004 #endif
3005   int params_count_copy = params.count;
3006   asm volatile(
3007 
3008       // Quantize::Prepare
3009       "dup v4.4s, %w[range_min]\n"
3010       "dup v5.4s, %w[range_offset]\n"
3011       "dup v6.4s, %w[range_scale]\n"
3012 
3013       // Reduce count by leftovers.
3014       "subs %x[count], %x[count], #15\n"
3015       "beq 2f\n"
3016 
3017       "1:"
3018       "subs %x[count], %x[count], #16\n"
3019 
3020       // Quantize::Transform
3021       "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
3022       "prfm pldl1keep, [%x[input], #64]\n"
3023       "fsub v0.4s, v0.4s, v4.4s\n"
3024       "fsub v1.4s, v1.4s, v4.4s\n"
3025       "fsub v2.4s, v2.4s, v4.4s\n"
3026       "fsub v3.4s, v3.4s, v4.4s\n"
3027       "fmul v0.4s, v0.4s, v6.4s\n"
3028       "fmul v1.4s, v1.4s, v6.4s\n"
3029       "fmul v2.4s, v2.4s, v6.4s\n"
3030       "fmul v3.4s, v3.4s, v6.4s\n"
3031       "fadd v0.4s, v0.4s, v5.4s\n"
3032       "fadd v1.4s, v1.4s, v5.4s\n"
3033       "fadd v2.4s, v2.4s, v5.4s\n"
3034       "fadd v3.4s, v3.4s, v5.4s\n"
3035       "fcvtzs v0.4s, v0.4s\n"
3036       "fcvtzs v1.4s, v1.4s\n"
3037       "fcvtzs v2.4s, v2.4s\n"
3038       "fcvtzs v3.4s, v3.4s\n"
3039       "sqxtn v0.4h, v0.4s\n"
3040       "sqxtn2 v0.8h, v1.4s\n"
3041       "sqxtn v2.4h, v2.4s\n"
3042       "sqxtn2 v2.8h, v3.4s\n"
3043       "sqxtun v0.8b, v0.8h\n"
3044       "sqxtun2 v0.16b, v2.8h\n"
3045 
3046       "st1 {v0.4s}, [%x[output]], #16\n"
3047       "prfm pldl1keep, [%x[output]]\n"
3048 
3049       "bne 1b\n"
3050       "2:"
3051 
3052       // Handle leftovers.
3053 
3054       // Quantize::Transform
3055       "ld1 {v0.4s, v1.4s, v2.4s}, [%x[input]], #48\n"
3056       "ld1 {v3.2s}, [%x[input]], #8\n"
3057       "ld1 {v3.s}[2], [%x[input]], #4\n"
3058       "prfm pldl1keep, [%x[input], #64]\n"
3059       "fsub v0.4s, v0.4s, v4.4s\n"
3060       "fsub v1.4s, v1.4s, v4.4s\n"
3061       "fsub v2.4s, v2.4s, v4.4s\n"
3062       "fsub v3.4s, v3.4s, v4.4s\n"
3063       "fmul v0.4s, v0.4s, v6.4s\n"
3064       "fmul v1.4s, v1.4s, v6.4s\n"
3065       "fmul v2.4s, v2.4s, v6.4s\n"
3066       "fmul v3.4s, v3.4s, v6.4s\n"
3067       "fadd v0.4s, v0.4s, v5.4s\n"
3068       "fadd v1.4s, v1.4s, v5.4s\n"
3069       "fadd v2.4s, v2.4s, v5.4s\n"
3070       "fadd v3.4s, v3.4s, v5.4s\n"
3071       "fcvtzs v0.4s, v0.4s\n"
3072       "fcvtzs v1.4s, v1.4s\n"
3073       "fcvtzs v2.4s, v2.4s\n"
3074       "fcvtzs v3.4s, v3.4s\n"
3075       "sqxtn v0.4h, v0.4s\n"
3076       "sqxtn2 v0.8h, v1.4s\n"
3077       "sqxtn v2.4h, v2.4s\n"
3078       "sqxtn2 v2.8h, v3.4s\n"
3079       "sqxtun v0.8b, v0.8h\n"
3080       "sqxtun2 v0.16b, v2.8h\n"
3081 
3082       "st1 {v0.2s}, [%x[output]], #8\n"
3083       "st1 {v0.s}[2], [%x[output]], #4\n"
3084       "st1 {v0.h}[6], [%x[output]], #2\n"
3085       "st1 {v0.b}[14], [%x[output]], #1\n"
3086       "prfm pldl1keep, [%x[output]]\n"
3087       : [count] "+r"(params_count_copy), [input] "+r"(input),
3088         [output] "+r"(output)
3089       : [range_offset] "r"(params.range_offset),
3090         [range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
3091       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
3092 }
3093 
3094 template <>
Transform(const uint8_t * input,const Dequantize & params,float * output)3095 inline void Transform1DKernel<uint8_t, float, Dequantize, 16, 0>::Transform(
3096     const uint8_t* input, const Dequantize& params, float* output) {
3097 #ifdef DEBUG
3098 #ifdef DEBUG_METAGEMM_VERBOSE
3099   std::cout << __FILE__ << "(" << __LINE__
3100             << ") Dequantize<uint8_t, float, Dequantize, 16, 0>::Transform()"
3101             << std::endl
3102             << std::flush;
3103 #endif
3104 #endif
3105   int params_count_copy = params.count;
3106   asm volatile(
3107 
3108       // Dequantize::Prepare
3109       "dup v4.4s, %w[range_min]\n"
3110       "dup v5.4s, %w[range_offset]\n"
3111       "dup v6.4s, %w[range_scale]\n"
3112 
3113       "1:"
3114       "subs %x[count], %x[count], #16\n"
3115 
3116       // Dequantize::Transform
3117       "ld1 {v0.4s}, [%x[input]], #16\n"
3118       "prfm pldl1keep, [%x[input], #32]\n"
3119       "uxtl2 v1.8h, v0.16b\n"
3120       "uxtl v0.8h, v0.8b\n"
3121       "sxtl2 v3.4s, v1.8h\n"
3122       "sxtl v2.4s, v1.4h\n"
3123       "sxtl2 v1.4s, v0.8h\n"
3124       "sxtl v0.4s, v0.4h\n"
3125       "scvtf v0.4s, v0.4s\n"
3126       "scvtf v1.4s, v1.4s\n"
3127       "scvtf v2.4s, v2.4s\n"
3128       "scvtf v3.4s, v3.4s\n"
3129       "fsub v0.4s, v0.4s, v5.4s\n"
3130       "fsub v1.4s, v1.4s, v5.4s\n"
3131       "fsub v2.4s, v2.4s, v5.4s\n"
3132       "fsub v3.4s, v3.4s, v5.4s\n"
3133       "fmul v0.4s, v0.4s, v6.4s\n"
3134       "fmul v1.4s, v1.4s, v6.4s\n"
3135       "fmul v2.4s, v2.4s, v6.4s\n"
3136       "fmul v3.4s, v3.4s, v6.4s\n"
3137       "fadd v0.4s, v0.4s, v4.4s\n"
3138       "fadd v1.4s, v1.4s, v4.4s\n"
3139       "fadd v2.4s, v2.4s, v4.4s\n"
3140       "fadd v3.4s, v3.4s, v4.4s\n"
3141 
3142       "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
3143       "prfm pldl1keep, [%x[output]]\n"
3144 
3145       "bne 1b\n"
3146       : [count] "+r"(params_count_copy), [input] "+r"(input),
3147         [output] "+r"(output)
3148       : [range_offset] "r"(params.range_offset),
3149         [range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
3150       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
3151 }
3152 
3153 template <>
Transform(const uint8_t * input,const Dequantize & params,float * output)3154 inline void Transform1DKernel<uint8_t, float, Dequantize, 16, 1>::Transform(
3155     const uint8_t* input, const Dequantize& params, float* output) {
3156 #ifdef DEBUG
3157 #ifdef DEBUG_METAGEMM_VERBOSE
3158   std::cout << __FILE__ << "(" << __LINE__
3159             << ") Dequantize<uint8_t, float, Dequantize, 16, 1>::Transform()"
3160             << std::endl
3161             << std::flush;
3162 #endif
3163 #endif
3164   int params_count_copy = params.count;
3165   asm volatile(
3166 
3167       // Dequantize::Prepare
3168       "dup v4.4s, %w[range_min]\n"
3169       "dup v5.4s, %w[range_offset]\n"
3170       "dup v6.4s, %w[range_scale]\n"
3171 
3172       // Reduce count by leftovers.
3173       "subs %x[count], %x[count], #1\n"
3174       "beq 2f\n"
3175 
3176       "1:"
3177       "subs %x[count], %x[count], #16\n"
3178 
3179       // Dequantize::Transform
3180       "ld1 {v0.4s}, [%x[input]], #16\n"
3181       "prfm pldl1keep, [%x[input], #32]\n"
3182       "uxtl2 v1.8h, v0.16b\n"
3183       "uxtl v0.8h, v0.8b\n"
3184       "sxtl2 v3.4s, v1.8h\n"
3185       "sxtl v2.4s, v1.4h\n"
3186       "sxtl2 v1.4s, v0.8h\n"
3187       "sxtl v0.4s, v0.4h\n"
3188       "scvtf v0.4s, v0.4s\n"
3189       "scvtf v1.4s, v1.4s\n"
3190       "scvtf v2.4s, v2.4s\n"
3191       "scvtf v3.4s, v3.4s\n"
3192       "fsub v0.4s, v0.4s, v5.4s\n"
3193       "fsub v1.4s, v1.4s, v5.4s\n"
3194       "fsub v2.4s, v2.4s, v5.4s\n"
3195       "fsub v3.4s, v3.4s, v5.4s\n"
3196       "fmul v0.4s, v0.4s, v6.4s\n"
3197       "fmul v1.4s, v1.4s, v6.4s\n"
3198       "fmul v2.4s, v2.4s, v6.4s\n"
3199       "fmul v3.4s, v3.4s, v6.4s\n"
3200       "fadd v0.4s, v0.4s, v4.4s\n"
3201       "fadd v1.4s, v1.4s, v4.4s\n"
3202       "fadd v2.4s, v2.4s, v4.4s\n"
3203       "fadd v3.4s, v3.4s, v4.4s\n"
3204 
3205       "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
3206       "prfm pldl1keep, [%x[output]]\n"
3207 
3208       "bne 1b\n"
3209       "2:"
3210 
3211       // Handle leftovers.
3212 
3213       // Dequantize::Transform
3214       "ld1 {v0.b}[0], [%x[input]], #1\n"
3215       "prfm pldl1keep, [%x[input], #32]\n"
3216       "uxtl v0.8h, v0.8b\n"
3217       "sxtl v0.4s, v0.4h\n"
3218       "scvtf v0.4s, v0.4s\n"
3219       "fsub v0.4s, v0.4s, v5.4s\n"
3220       "fmul v0.4s, v0.4s, v6.4s\n"
3221       "fadd v0.4s, v0.4s, v4.4s\n"
3222 
3223       "st1 {v0.s}[0], [%x[output]], #4\n"
3224       "prfm pldl1keep, [%x[output]]\n"
3225       : [count] "+r"(params_count_copy), [input] "+r"(input),
3226         [output] "+r"(output)
3227       : [range_offset] "r"(params.range_offset),
3228         [range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
3229       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
3230 }
3231 
3232 template <>
Transform(const uint8_t * input,const Dequantize & params,float * output)3233 inline void Transform1DKernel<uint8_t, float, Dequantize, 16, 2>::Transform(
3234     const uint8_t* input, const Dequantize& params, float* output) {
3235 #ifdef DEBUG
3236 #ifdef DEBUG_METAGEMM_VERBOSE
3237   std::cout << __FILE__ << "(" << __LINE__
3238             << ") Dequantize<uint8_t, float, Dequantize, 16, 2>::Transform()"
3239             << std::endl
3240             << std::flush;
3241 #endif
3242 #endif
3243   int params_count_copy = params.count;
3244   asm volatile(
3245 
3246       // Dequantize::Prepare
3247       "dup v4.4s, %w[range_min]\n"
3248       "dup v5.4s, %w[range_offset]\n"
3249       "dup v6.4s, %w[range_scale]\n"
3250 
3251       // Reduce count by leftovers.
3252       "subs %x[count], %x[count], #2\n"
3253       "beq 2f\n"
3254 
3255       "1:"
3256       "subs %x[count], %x[count], #16\n"
3257 
3258       // Dequantize::Transform
3259       "ld1 {v0.4s}, [%x[input]], #16\n"
3260       "prfm pldl1keep, [%x[input], #32]\n"
3261       "uxtl2 v1.8h, v0.16b\n"
3262       "uxtl v0.8h, v0.8b\n"
3263       "sxtl2 v3.4s, v1.8h\n"
3264       "sxtl v2.4s, v1.4h\n"
3265       "sxtl2 v1.4s, v0.8h\n"
3266       "sxtl v0.4s, v0.4h\n"
3267       "scvtf v0.4s, v0.4s\n"
3268       "scvtf v1.4s, v1.4s\n"
3269       "scvtf v2.4s, v2.4s\n"
3270       "scvtf v3.4s, v3.4s\n"
3271       "fsub v0.4s, v0.4s, v5.4s\n"
3272       "fsub v1.4s, v1.4s, v5.4s\n"
3273       "fsub v2.4s, v2.4s, v5.4s\n"
3274       "fsub v3.4s, v3.4s, v5.4s\n"
3275       "fmul v0.4s, v0.4s, v6.4s\n"
3276       "fmul v1.4s, v1.4s, v6.4s\n"
3277       "fmul v2.4s, v2.4s, v6.4s\n"
3278       "fmul v3.4s, v3.4s, v6.4s\n"
3279       "fadd v0.4s, v0.4s, v4.4s\n"
3280       "fadd v1.4s, v1.4s, v4.4s\n"
3281       "fadd v2.4s, v2.4s, v4.4s\n"
3282       "fadd v3.4s, v3.4s, v4.4s\n"
3283 
3284       "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
3285       "prfm pldl1keep, [%x[output]]\n"
3286 
3287       "bne 1b\n"
3288       "2:"
3289 
3290       // Handle leftovers.
3291 
3292       // Dequantize::Transform
3293       "ld1 {v0.h}[0], [%x[input]], #2\n"
3294       "prfm pldl1keep, [%x[input], #32]\n"
3295       "uxtl v0.8h, v0.8b\n"
3296       "sxtl v0.4s, v0.4h\n"
3297       "scvtf v0.4s, v0.4s\n"
3298       "fsub v0.4s, v0.4s, v5.4s\n"
3299       "fmul v0.4s, v0.4s, v6.4s\n"
3300       "fadd v0.4s, v0.4s, v4.4s\n"
3301 
3302       "st1 {v0.2s}, [%x[output]], #8\n"
3303       "prfm pldl1keep, [%x[output]]\n"
3304       : [count] "+r"(params_count_copy), [input] "+r"(input),
3305         [output] "+r"(output)
3306       : [range_offset] "r"(params.range_offset),
3307         [range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
3308       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
3309 }
3310 
3311 template <>
Transform(const uint8_t * input,const Dequantize & params,float * output)3312 inline void Transform1DKernel<uint8_t, float, Dequantize, 16, 3>::Transform(
3313     const uint8_t* input, const Dequantize& params, float* output) {
3314 #ifdef DEBUG
3315 #ifdef DEBUG_METAGEMM_VERBOSE
3316   std::cout << __FILE__ << "(" << __LINE__
3317             << ") Dequantize<uint8_t, float, Dequantize, 16, 3>::Transform()"
3318             << std::endl
3319             << std::flush;
3320 #endif
3321 #endif
3322   int params_count_copy = params.count;
3323   asm volatile(
3324 
3325       // Dequantize::Prepare
3326       "dup v4.4s, %w[range_min]\n"
3327       "dup v5.4s, %w[range_offset]\n"
3328       "dup v6.4s, %w[range_scale]\n"
3329 
3330       // Reduce count by leftovers.
3331       "subs %x[count], %x[count], #3\n"
3332       "beq 2f\n"
3333 
3334       "1:"
3335       "subs %x[count], %x[count], #16\n"
3336 
3337       // Dequantize::Transform
3338       "ld1 {v0.4s}, [%x[input]], #16\n"
3339       "prfm pldl1keep, [%x[input], #32]\n"
3340       "uxtl2 v1.8h, v0.16b\n"
3341       "uxtl v0.8h, v0.8b\n"
3342       "sxtl2 v3.4s, v1.8h\n"
3343       "sxtl v2.4s, v1.4h\n"
3344       "sxtl2 v1.4s, v0.8h\n"
3345       "sxtl v0.4s, v0.4h\n"
3346       "scvtf v0.4s, v0.4s\n"
3347       "scvtf v1.4s, v1.4s\n"
3348       "scvtf v2.4s, v2.4s\n"
3349       "scvtf v3.4s, v3.4s\n"
3350       "fsub v0.4s, v0.4s, v5.4s\n"
3351       "fsub v1.4s, v1.4s, v5.4s\n"
3352       "fsub v2.4s, v2.4s, v5.4s\n"
3353       "fsub v3.4s, v3.4s, v5.4s\n"
3354       "fmul v0.4s, v0.4s, v6.4s\n"
3355       "fmul v1.4s, v1.4s, v6.4s\n"
3356       "fmul v2.4s, v2.4s, v6.4s\n"
3357       "fmul v3.4s, v3.4s, v6.4s\n"
3358       "fadd v0.4s, v0.4s, v4.4s\n"
3359       "fadd v1.4s, v1.4s, v4.4s\n"
3360       "fadd v2.4s, v2.4s, v4.4s\n"
3361       "fadd v3.4s, v3.4s, v4.4s\n"
3362 
3363       "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
3364       "prfm pldl1keep, [%x[output]]\n"
3365 
3366       "bne 1b\n"
3367       "2:"
3368 
3369       // Handle leftovers.
3370 
3371       // Dequantize::Transform
3372       "ld1 {v0.h}[0], [%x[input]], #2\n"
3373       "ld1 {v0.b}[2], [%x[input]], #1\n"
3374       "prfm pldl1keep, [%x[input], #32]\n"
3375       "uxtl v0.8h, v0.8b\n"
3376       "sxtl v0.4s, v0.4h\n"
3377       "scvtf v0.4s, v0.4s\n"
3378       "fsub v0.4s, v0.4s, v5.4s\n"
3379       "fmul v0.4s, v0.4s, v6.4s\n"
3380       "fadd v0.4s, v0.4s, v4.4s\n"
3381 
3382       "st1 {v0.2s}, [%x[output]], #8\n"
3383       "st1 {v0.s}[2], [%x[output]], #4\n"
3384       "prfm pldl1keep, [%x[output]]\n"
3385       : [count] "+r"(params_count_copy), [input] "+r"(input),
3386         [output] "+r"(output)
3387       : [range_offset] "r"(params.range_offset),
3388         [range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
3389       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
3390 }
3391 
3392 template <>
Transform(const uint8_t * input,const Dequantize & params,float * output)3393 inline void Transform1DKernel<uint8_t, float, Dequantize, 16, 4>::Transform(
3394     const uint8_t* input, const Dequantize& params, float* output) {
3395 #ifdef DEBUG
3396 #ifdef DEBUG_METAGEMM_VERBOSE
3397   std::cout << __FILE__ << "(" << __LINE__
3398             << ") Dequantize<uint8_t, float, Dequantize, 16, 4>::Transform()"
3399             << std::endl
3400             << std::flush;
3401 #endif
3402 #endif
3403   int params_count_copy = params.count;
3404   asm volatile(
3405 
3406       // Dequantize::Prepare
3407       "dup v4.4s, %w[range_min]\n"
3408       "dup v5.4s, %w[range_offset]\n"
3409       "dup v6.4s, %w[range_scale]\n"
3410 
3411       // Reduce count by leftovers.
3412       "subs %x[count], %x[count], #4\n"
3413       "beq 2f\n"
3414 
3415       "1:"
3416       "subs %x[count], %x[count], #16\n"
3417 
3418       // Dequantize::Transform
3419       "ld1 {v0.4s}, [%x[input]], #16\n"
3420       "prfm pldl1keep, [%x[input], #32]\n"
3421       "uxtl2 v1.8h, v0.16b\n"
3422       "uxtl v0.8h, v0.8b\n"
3423       "sxtl2 v3.4s, v1.8h\n"
3424       "sxtl v2.4s, v1.4h\n"
3425       "sxtl2 v1.4s, v0.8h\n"
3426       "sxtl v0.4s, v0.4h\n"
3427       "scvtf v0.4s, v0.4s\n"
3428       "scvtf v1.4s, v1.4s\n"
3429       "scvtf v2.4s, v2.4s\n"
3430       "scvtf v3.4s, v3.4s\n"
3431       "fsub v0.4s, v0.4s, v5.4s\n"
3432       "fsub v1.4s, v1.4s, v5.4s\n"
3433       "fsub v2.4s, v2.4s, v5.4s\n"
3434       "fsub v3.4s, v3.4s, v5.4s\n"
3435       "fmul v0.4s, v0.4s, v6.4s\n"
3436       "fmul v1.4s, v1.4s, v6.4s\n"
3437       "fmul v2.4s, v2.4s, v6.4s\n"
3438       "fmul v3.4s, v3.4s, v6.4s\n"
3439       "fadd v0.4s, v0.4s, v4.4s\n"
3440       "fadd v1.4s, v1.4s, v4.4s\n"
3441       "fadd v2.4s, v2.4s, v4.4s\n"
3442       "fadd v3.4s, v3.4s, v4.4s\n"
3443 
3444       "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
3445       "prfm pldl1keep, [%x[output]]\n"
3446 
3447       "bne 1b\n"
3448       "2:"
3449 
3450       // Handle leftovers.
3451 
3452       // Dequantize::Transform
3453       "ld1 {v0.s}[0], [%x[input]], #4\n"
3454       "prfm pldl1keep, [%x[input], #32]\n"
3455       "uxtl v0.8h, v0.8b\n"
3456       "sxtl v0.4s, v0.4h\n"
3457       "scvtf v0.4s, v0.4s\n"
3458       "fsub v0.4s, v0.4s, v5.4s\n"
3459       "fmul v0.4s, v0.4s, v6.4s\n"
3460       "fadd v0.4s, v0.4s, v4.4s\n"
3461 
3462       "st1 {v0.4s}, [%x[output]], #16\n"
3463       "prfm pldl1keep, [%x[output]]\n"
3464       : [count] "+r"(params_count_copy), [input] "+r"(input),
3465         [output] "+r"(output)
3466       : [range_offset] "r"(params.range_offset),
3467         [range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
3468       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
3469 }
3470 
3471 template <>
Transform(const uint8_t * input,const Dequantize & params,float * output)3472 inline void Transform1DKernel<uint8_t, float, Dequantize, 16, 5>::Transform(
3473     const uint8_t* input, const Dequantize& params, float* output) {
3474 #ifdef DEBUG
3475 #ifdef DEBUG_METAGEMM_VERBOSE
3476   std::cout << __FILE__ << "(" << __LINE__
3477             << ") Dequantize<uint8_t, float, Dequantize, 16, 5>::Transform()"
3478             << std::endl
3479             << std::flush;
3480 #endif
3481 #endif
3482   int params_count_copy = params.count;
3483   asm volatile(
3484 
3485       // Dequantize::Prepare
3486       "dup v4.4s, %w[range_min]\n"
3487       "dup v5.4s, %w[range_offset]\n"
3488       "dup v6.4s, %w[range_scale]\n"
3489 
3490       // Reduce count by leftovers.
3491       "subs %x[count], %x[count], #5\n"
3492       "beq 2f\n"
3493 
3494       "1:"
3495       "subs %x[count], %x[count], #16\n"
3496 
3497       // Dequantize::Transform
3498       "ld1 {v0.4s}, [%x[input]], #16\n"
3499       "prfm pldl1keep, [%x[input], #32]\n"
3500       "uxtl2 v1.8h, v0.16b\n"
3501       "uxtl v0.8h, v0.8b\n"
3502       "sxtl2 v3.4s, v1.8h\n"
3503       "sxtl v2.4s, v1.4h\n"
3504       "sxtl2 v1.4s, v0.8h\n"
3505       "sxtl v0.4s, v0.4h\n"
3506       "scvtf v0.4s, v0.4s\n"
3507       "scvtf v1.4s, v1.4s\n"
3508       "scvtf v2.4s, v2.4s\n"
3509       "scvtf v3.4s, v3.4s\n"
3510       "fsub v0.4s, v0.4s, v5.4s\n"
3511       "fsub v1.4s, v1.4s, v5.4s\n"
3512       "fsub v2.4s, v2.4s, v5.4s\n"
3513       "fsub v3.4s, v3.4s, v5.4s\n"
3514       "fmul v0.4s, v0.4s, v6.4s\n"
3515       "fmul v1.4s, v1.4s, v6.4s\n"
3516       "fmul v2.4s, v2.4s, v6.4s\n"
3517       "fmul v3.4s, v3.4s, v6.4s\n"
3518       "fadd v0.4s, v0.4s, v4.4s\n"
3519       "fadd v1.4s, v1.4s, v4.4s\n"
3520       "fadd v2.4s, v2.4s, v4.4s\n"
3521       "fadd v3.4s, v3.4s, v4.4s\n"
3522 
3523       "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
3524       "prfm pldl1keep, [%x[output]]\n"
3525 
3526       "bne 1b\n"
3527       "2:"
3528 
3529       // Handle leftovers.
3530 
3531       // Dequantize::Transform
3532       "ld1 {v0.s}[0], [%x[input]], #4\n"
3533       "ld1 {v0.b}[4], [%x[input]], #1\n"
3534       "prfm pldl1keep, [%x[input], #32]\n"
3535       "uxtl v0.8h, v0.8b\n"
3536       "sxtl2 v1.4s, v0.8h\n"
3537       "sxtl v0.4s, v0.4h\n"
3538       "scvtf v0.4s, v0.4s\n"
3539       "scvtf v1.4s, v1.4s\n"
3540       "fsub v0.4s, v0.4s, v5.4s\n"
3541       "fsub v1.4s, v1.4s, v5.4s\n"
3542       "fmul v0.4s, v0.4s, v6.4s\n"
3543       "fmul v1.4s, v1.4s, v6.4s\n"
3544       "fadd v0.4s, v0.4s, v4.4s\n"
3545       "fadd v1.4s, v1.4s, v4.4s\n"
3546 
3547       "st1 {v0.4s}, [%x[output]], #16\n"
3548       "st1 {v1.s}[0], [%x[output]], #4\n"
3549       "prfm pldl1keep, [%x[output]]\n"
3550       : [count] "+r"(params_count_copy), [input] "+r"(input),
3551         [output] "+r"(output)
3552       : [range_offset] "r"(params.range_offset),
3553         [range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
3554       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
3555 }
3556 
3557 template <>
Transform(const uint8_t * input,const Dequantize & params,float * output)3558 inline void Transform1DKernel<uint8_t, float, Dequantize, 16, 6>::Transform(
3559     const uint8_t* input, const Dequantize& params, float* output) {
3560 #ifdef DEBUG
3561 #ifdef DEBUG_METAGEMM_VERBOSE
3562   std::cout << __FILE__ << "(" << __LINE__
3563             << ") Dequantize<uint8_t, float, Dequantize, 16, 6>::Transform()"
3564             << std::endl
3565             << std::flush;
3566 #endif
3567 #endif
3568   int params_count_copy = params.count;
3569   asm volatile(
3570 
3571       // Dequantize::Prepare
3572       "dup v4.4s, %w[range_min]\n"
3573       "dup v5.4s, %w[range_offset]\n"
3574       "dup v6.4s, %w[range_scale]\n"
3575 
3576       // Reduce count by leftovers.
3577       "subs %x[count], %x[count], #6\n"
3578       "beq 2f\n"
3579 
3580       "1:"
3581       "subs %x[count], %x[count], #16\n"
3582 
3583       // Dequantize::Transform
3584       "ld1 {v0.4s}, [%x[input]], #16\n"
3585       "prfm pldl1keep, [%x[input], #32]\n"
3586       "uxtl2 v1.8h, v0.16b\n"
3587       "uxtl v0.8h, v0.8b\n"
3588       "sxtl2 v3.4s, v1.8h\n"
3589       "sxtl v2.4s, v1.4h\n"
3590       "sxtl2 v1.4s, v0.8h\n"
3591       "sxtl v0.4s, v0.4h\n"
3592       "scvtf v0.4s, v0.4s\n"
3593       "scvtf v1.4s, v1.4s\n"
3594       "scvtf v2.4s, v2.4s\n"
3595       "scvtf v3.4s, v3.4s\n"
3596       "fsub v0.4s, v0.4s, v5.4s\n"
3597       "fsub v1.4s, v1.4s, v5.4s\n"
3598       "fsub v2.4s, v2.4s, v5.4s\n"
3599       "fsub v3.4s, v3.4s, v5.4s\n"
3600       "fmul v0.4s, v0.4s, v6.4s\n"
3601       "fmul v1.4s, v1.4s, v6.4s\n"
3602       "fmul v2.4s, v2.4s, v6.4s\n"
3603       "fmul v3.4s, v3.4s, v6.4s\n"
3604       "fadd v0.4s, v0.4s, v4.4s\n"
3605       "fadd v1.4s, v1.4s, v4.4s\n"
3606       "fadd v2.4s, v2.4s, v4.4s\n"
3607       "fadd v3.4s, v3.4s, v4.4s\n"
3608 
3609       "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
3610       "prfm pldl1keep, [%x[output]]\n"
3611 
3612       "bne 1b\n"
3613       "2:"
3614 
3615       // Handle leftovers.
3616 
3617       // Dequantize::Transform
3618       "ld1 {v0.s}[0], [%x[input]], #4\n"
3619       "ld1 {v0.h}[2], [%x[input]], #2\n"
3620       "prfm pldl1keep, [%x[input], #32]\n"
3621       "uxtl v0.8h, v0.8b\n"
3622       "sxtl2 v1.4s, v0.8h\n"
3623       "sxtl v0.4s, v0.4h\n"
3624       "scvtf v0.4s, v0.4s\n"
3625       "scvtf v1.4s, v1.4s\n"
3626       "fsub v0.4s, v0.4s, v5.4s\n"
3627       "fsub v1.4s, v1.4s, v5.4s\n"
3628       "fmul v0.4s, v0.4s, v6.4s\n"
3629       "fmul v1.4s, v1.4s, v6.4s\n"
3630       "fadd v0.4s, v0.4s, v4.4s\n"
3631       "fadd v1.4s, v1.4s, v4.4s\n"
3632 
3633       "st1 {v0.4s}, [%x[output]], #16\n"
3634       "st1 {v1.2s}, [%x[output]], #8\n"
3635       "prfm pldl1keep, [%x[output]]\n"
3636       : [count] "+r"(params_count_copy), [input] "+r"(input),
3637         [output] "+r"(output)
3638       : [range_offset] "r"(params.range_offset),
3639         [range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
3640       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
3641 }
3642 
3643 template <>
Transform(const uint8_t * input,const Dequantize & params,float * output)3644 inline void Transform1DKernel<uint8_t, float, Dequantize, 16, 7>::Transform(
3645     const uint8_t* input, const Dequantize& params, float* output) {
3646 #ifdef DEBUG
3647 #ifdef DEBUG_METAGEMM_VERBOSE
3648   std::cout << __FILE__ << "(" << __LINE__
3649             << ") Dequantize<uint8_t, float, Dequantize, 16, 7>::Transform()"
3650             << std::endl
3651             << std::flush;
3652 #endif
3653 #endif
3654   int params_count_copy = params.count;
3655   asm volatile(
3656 
3657       // Dequantize::Prepare
3658       "dup v4.4s, %w[range_min]\n"
3659       "dup v5.4s, %w[range_offset]\n"
3660       "dup v6.4s, %w[range_scale]\n"
3661 
3662       // Reduce count by leftovers.
3663       "subs %x[count], %x[count], #7\n"
3664       "beq 2f\n"
3665 
3666       "1:"
3667       "subs %x[count], %x[count], #16\n"
3668 
3669       // Dequantize::Transform
3670       "ld1 {v0.4s}, [%x[input]], #16\n"
3671       "prfm pldl1keep, [%x[input], #32]\n"
3672       "uxtl2 v1.8h, v0.16b\n"
3673       "uxtl v0.8h, v0.8b\n"
3674       "sxtl2 v3.4s, v1.8h\n"
3675       "sxtl v2.4s, v1.4h\n"
3676       "sxtl2 v1.4s, v0.8h\n"
3677       "sxtl v0.4s, v0.4h\n"
3678       "scvtf v0.4s, v0.4s\n"
3679       "scvtf v1.4s, v1.4s\n"
3680       "scvtf v2.4s, v2.4s\n"
3681       "scvtf v3.4s, v3.4s\n"
3682       "fsub v0.4s, v0.4s, v5.4s\n"
3683       "fsub v1.4s, v1.4s, v5.4s\n"
3684       "fsub v2.4s, v2.4s, v5.4s\n"
3685       "fsub v3.4s, v3.4s, v5.4s\n"
3686       "fmul v0.4s, v0.4s, v6.4s\n"
3687       "fmul v1.4s, v1.4s, v6.4s\n"
3688       "fmul v2.4s, v2.4s, v6.4s\n"
3689       "fmul v3.4s, v3.4s, v6.4s\n"
3690       "fadd v0.4s, v0.4s, v4.4s\n"
3691       "fadd v1.4s, v1.4s, v4.4s\n"
3692       "fadd v2.4s, v2.4s, v4.4s\n"
3693       "fadd v3.4s, v3.4s, v4.4s\n"
3694 
3695       "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
3696       "prfm pldl1keep, [%x[output]]\n"
3697 
3698       "bne 1b\n"
3699       "2:"
3700 
3701       // Handle leftovers.
3702 
3703       // Dequantize::Transform
3704       "ld1 {v0.s}[0], [%x[input]], #4\n"
3705       "ld1 {v0.h}[2], [%x[input]], #2\n"
3706       "ld1 {v0.b}[6], [%x[input]], #1\n"
3707       "prfm pldl1keep, [%x[input], #32]\n"
3708       "uxtl v0.8h, v0.8b\n"
3709       "sxtl2 v1.4s, v0.8h\n"
3710       "sxtl v0.4s, v0.4h\n"
3711       "scvtf v0.4s, v0.4s\n"
3712       "scvtf v1.4s, v1.4s\n"
3713       "fsub v0.4s, v0.4s, v5.4s\n"
3714       "fsub v1.4s, v1.4s, v5.4s\n"
3715       "fmul v0.4s, v0.4s, v6.4s\n"
3716       "fmul v1.4s, v1.4s, v6.4s\n"
3717       "fadd v0.4s, v0.4s, v4.4s\n"
3718       "fadd v1.4s, v1.4s, v4.4s\n"
3719 
3720       "st1 {v0.4s}, [%x[output]], #16\n"
3721       "st1 {v1.2s}, [%x[output]], #8\n"
3722       "st1 {v1.s}[2], [%x[output]], #4\n"
3723       "prfm pldl1keep, [%x[output]]\n"
3724       : [count] "+r"(params_count_copy), [input] "+r"(input),
3725         [output] "+r"(output)
3726       : [range_offset] "r"(params.range_offset),
3727         [range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
3728       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
3729 }
3730 
3731 template <>
Transform(const uint8_t * input,const Dequantize & params,float * output)3732 inline void Transform1DKernel<uint8_t, float, Dequantize, 16, 8>::Transform(
3733     const uint8_t* input, const Dequantize& params, float* output) {
3734 #ifdef DEBUG
3735 #ifdef DEBUG_METAGEMM_VERBOSE
3736   std::cout << __FILE__ << "(" << __LINE__
3737             << ") Dequantize<uint8_t, float, Dequantize, 16, 8>::Transform()"
3738             << std::endl
3739             << std::flush;
3740 #endif
3741 #endif
3742   int params_count_copy = params.count;
3743   asm volatile(
3744 
3745       // Dequantize::Prepare
3746       "dup v4.4s, %w[range_min]\n"
3747       "dup v5.4s, %w[range_offset]\n"
3748       "dup v6.4s, %w[range_scale]\n"
3749 
3750       // Reduce count by leftovers.
3751       "subs %x[count], %x[count], #8\n"
3752       "beq 2f\n"
3753 
3754       "1:"
3755       "subs %x[count], %x[count], #16\n"
3756 
3757       // Dequantize::Transform
3758       "ld1 {v0.4s}, [%x[input]], #16\n"
3759       "prfm pldl1keep, [%x[input], #32]\n"
3760       "uxtl2 v1.8h, v0.16b\n"
3761       "uxtl v0.8h, v0.8b\n"
3762       "sxtl2 v3.4s, v1.8h\n"
3763       "sxtl v2.4s, v1.4h\n"
3764       "sxtl2 v1.4s, v0.8h\n"
3765       "sxtl v0.4s, v0.4h\n"
3766       "scvtf v0.4s, v0.4s\n"
3767       "scvtf v1.4s, v1.4s\n"
3768       "scvtf v2.4s, v2.4s\n"
3769       "scvtf v3.4s, v3.4s\n"
3770       "fsub v0.4s, v0.4s, v5.4s\n"
3771       "fsub v1.4s, v1.4s, v5.4s\n"
3772       "fsub v2.4s, v2.4s, v5.4s\n"
3773       "fsub v3.4s, v3.4s, v5.4s\n"
3774       "fmul v0.4s, v0.4s, v6.4s\n"
3775       "fmul v1.4s, v1.4s, v6.4s\n"
3776       "fmul v2.4s, v2.4s, v6.4s\n"
3777       "fmul v3.4s, v3.4s, v6.4s\n"
3778       "fadd v0.4s, v0.4s, v4.4s\n"
3779       "fadd v1.4s, v1.4s, v4.4s\n"
3780       "fadd v2.4s, v2.4s, v4.4s\n"
3781       "fadd v3.4s, v3.4s, v4.4s\n"
3782 
3783       "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
3784       "prfm pldl1keep, [%x[output]]\n"
3785 
3786       "bne 1b\n"
3787       "2:"
3788 
3789       // Handle leftovers.
3790 
3791       // Dequantize::Transform
3792       "ld1 {v0.2s}, [%x[input]], #8\n"
3793       "prfm pldl1keep, [%x[input], #32]\n"
3794       "uxtl v0.8h, v0.8b\n"
3795       "sxtl2 v1.4s, v0.8h\n"
3796       "sxtl v0.4s, v0.4h\n"
3797       "scvtf v0.4s, v0.4s\n"
3798       "scvtf v1.4s, v1.4s\n"
3799       "fsub v0.4s, v0.4s, v5.4s\n"
3800       "fsub v1.4s, v1.4s, v5.4s\n"
3801       "fmul v0.4s, v0.4s, v6.4s\n"
3802       "fmul v1.4s, v1.4s, v6.4s\n"
3803       "fadd v0.4s, v0.4s, v4.4s\n"
3804       "fadd v1.4s, v1.4s, v4.4s\n"
3805 
3806       "st1 {v0.4s, v1.4s}, [%x[output]], #32\n"
3807       "prfm pldl1keep, [%x[output]]\n"
3808       : [count] "+r"(params_count_copy), [input] "+r"(input),
3809         [output] "+r"(output)
3810       : [range_offset] "r"(params.range_offset),
3811         [range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
3812       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
3813 }
3814 
3815 template <>
Transform(const uint8_t * input,const Dequantize & params,float * output)3816 inline void Transform1DKernel<uint8_t, float, Dequantize, 16, 9>::Transform(
3817     const uint8_t* input, const Dequantize& params, float* output) {
3818 #ifdef DEBUG
3819 #ifdef DEBUG_METAGEMM_VERBOSE
3820   std::cout << __FILE__ << "(" << __LINE__
3821             << ") Dequantize<uint8_t, float, Dequantize, 16, 9>::Transform()"
3822             << std::endl
3823             << std::flush;
3824 #endif
3825 #endif
3826   int params_count_copy = params.count;
3827   asm volatile(
3828 
3829       // Dequantize::Prepare
3830       "dup v4.4s, %w[range_min]\n"
3831       "dup v5.4s, %w[range_offset]\n"
3832       "dup v6.4s, %w[range_scale]\n"
3833 
3834       // Reduce count by leftovers.
3835       "subs %x[count], %x[count], #9\n"
3836       "beq 2f\n"
3837 
3838       "1:"
3839       "subs %x[count], %x[count], #16\n"
3840 
3841       // Dequantize::Transform
3842       "ld1 {v0.4s}, [%x[input]], #16\n"
3843       "prfm pldl1keep, [%x[input], #32]\n"
3844       "uxtl2 v1.8h, v0.16b\n"
3845       "uxtl v0.8h, v0.8b\n"
3846       "sxtl2 v3.4s, v1.8h\n"
3847       "sxtl v2.4s, v1.4h\n"
3848       "sxtl2 v1.4s, v0.8h\n"
3849       "sxtl v0.4s, v0.4h\n"
3850       "scvtf v0.4s, v0.4s\n"
3851       "scvtf v1.4s, v1.4s\n"
3852       "scvtf v2.4s, v2.4s\n"
3853       "scvtf v3.4s, v3.4s\n"
3854       "fsub v0.4s, v0.4s, v5.4s\n"
3855       "fsub v1.4s, v1.4s, v5.4s\n"
3856       "fsub v2.4s, v2.4s, v5.4s\n"
3857       "fsub v3.4s, v3.4s, v5.4s\n"
3858       "fmul v0.4s, v0.4s, v6.4s\n"
3859       "fmul v1.4s, v1.4s, v6.4s\n"
3860       "fmul v2.4s, v2.4s, v6.4s\n"
3861       "fmul v3.4s, v3.4s, v6.4s\n"
3862       "fadd v0.4s, v0.4s, v4.4s\n"
3863       "fadd v1.4s, v1.4s, v4.4s\n"
3864       "fadd v2.4s, v2.4s, v4.4s\n"
3865       "fadd v3.4s, v3.4s, v4.4s\n"
3866 
3867       "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
3868       "prfm pldl1keep, [%x[output]]\n"
3869 
3870       "bne 1b\n"
3871       "2:"
3872 
3873       // Handle leftovers.
3874 
3875       // Dequantize::Transform
3876       "ld1 {v0.2s}, [%x[input]], #8\n"
3877       "ld1 {v0.b}[8], [%x[input]], #1\n"
3878       "prfm pldl1keep, [%x[input], #32]\n"
3879       "uxtl2 v1.8h, v0.16b\n"
3880       "uxtl v0.8h, v0.8b\n"
3881       "sxtl v2.4s, v1.4h\n"
3882       "sxtl2 v1.4s, v0.8h\n"
3883       "sxtl v0.4s, v0.4h\n"
3884       "scvtf v0.4s, v0.4s\n"
3885       "scvtf v1.4s, v1.4s\n"
3886       "scvtf v2.4s, v2.4s\n"
3887       "fsub v0.4s, v0.4s, v5.4s\n"
3888       "fsub v1.4s, v1.4s, v5.4s\n"
3889       "fsub v2.4s, v2.4s, v5.4s\n"
3890       "fmul v0.4s, v0.4s, v6.4s\n"
3891       "fmul v1.4s, v1.4s, v6.4s\n"
3892       "fmul v2.4s, v2.4s, v6.4s\n"
3893       "fadd v0.4s, v0.4s, v4.4s\n"
3894       "fadd v1.4s, v1.4s, v4.4s\n"
3895       "fadd v2.4s, v2.4s, v4.4s\n"
3896 
3897       "st1 {v0.4s, v1.4s}, [%x[output]], #32\n"
3898       "st1 {v2.s}[0], [%x[output]], #4\n"
3899       "prfm pldl1keep, [%x[output]]\n"
3900       : [count] "+r"(params_count_copy), [input] "+r"(input),
3901         [output] "+r"(output)
3902       : [range_offset] "r"(params.range_offset),
3903         [range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
3904       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
3905 }
3906 
3907 template <>
Transform(const uint8_t * input,const Dequantize & params,float * output)3908 inline void Transform1DKernel<uint8_t, float, Dequantize, 16, 10>::Transform(
3909     const uint8_t* input, const Dequantize& params, float* output) {
3910 #ifdef DEBUG
3911 #ifdef DEBUG_METAGEMM_VERBOSE
3912   std::cout << __FILE__ << "(" << __LINE__
3913             << ") Dequantize<uint8_t, float, Dequantize, 16, 10>::Transform()"
3914             << std::endl
3915             << std::flush;
3916 #endif
3917 #endif
3918   int params_count_copy = params.count;
3919   asm volatile(
3920 
3921       // Dequantize::Prepare
3922       "dup v4.4s, %w[range_min]\n"
3923       "dup v5.4s, %w[range_offset]\n"
3924       "dup v6.4s, %w[range_scale]\n"
3925 
3926       // Reduce count by leftovers.
3927       "subs %x[count], %x[count], #10\n"
3928       "beq 2f\n"
3929 
3930       "1:"
3931       "subs %x[count], %x[count], #16\n"
3932 
3933       // Dequantize::Transform
3934       "ld1 {v0.4s}, [%x[input]], #16\n"
3935       "prfm pldl1keep, [%x[input], #32]\n"
3936       "uxtl2 v1.8h, v0.16b\n"
3937       "uxtl v0.8h, v0.8b\n"
3938       "sxtl2 v3.4s, v1.8h\n"
3939       "sxtl v2.4s, v1.4h\n"
3940       "sxtl2 v1.4s, v0.8h\n"
3941       "sxtl v0.4s, v0.4h\n"
3942       "scvtf v0.4s, v0.4s\n"
3943       "scvtf v1.4s, v1.4s\n"
3944       "scvtf v2.4s, v2.4s\n"
3945       "scvtf v3.4s, v3.4s\n"
3946       "fsub v0.4s, v0.4s, v5.4s\n"
3947       "fsub v1.4s, v1.4s, v5.4s\n"
3948       "fsub v2.4s, v2.4s, v5.4s\n"
3949       "fsub v3.4s, v3.4s, v5.4s\n"
3950       "fmul v0.4s, v0.4s, v6.4s\n"
3951       "fmul v1.4s, v1.4s, v6.4s\n"
3952       "fmul v2.4s, v2.4s, v6.4s\n"
3953       "fmul v3.4s, v3.4s, v6.4s\n"
3954       "fadd v0.4s, v0.4s, v4.4s\n"
3955       "fadd v1.4s, v1.4s, v4.4s\n"
3956       "fadd v2.4s, v2.4s, v4.4s\n"
3957       "fadd v3.4s, v3.4s, v4.4s\n"
3958 
3959       "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
3960       "prfm pldl1keep, [%x[output]]\n"
3961 
3962       "bne 1b\n"
3963       "2:"
3964 
3965       // Handle leftovers.
3966 
3967       // Dequantize::Transform
3968       "ld1 {v0.2s}, [%x[input]], #8\n"
3969       "ld1 {v0.h}[4], [%x[input]], #2\n"
3970       "prfm pldl1keep, [%x[input], #32]\n"
3971       "uxtl2 v1.8h, v0.16b\n"
3972       "uxtl v0.8h, v0.8b\n"
3973       "sxtl v2.4s, v1.4h\n"
3974       "sxtl2 v1.4s, v0.8h\n"
3975       "sxtl v0.4s, v0.4h\n"
3976       "scvtf v0.4s, v0.4s\n"
3977       "scvtf v1.4s, v1.4s\n"
3978       "scvtf v2.4s, v2.4s\n"
3979       "fsub v0.4s, v0.4s, v5.4s\n"
3980       "fsub v1.4s, v1.4s, v5.4s\n"
3981       "fsub v2.4s, v2.4s, v5.4s\n"
3982       "fmul v0.4s, v0.4s, v6.4s\n"
3983       "fmul v1.4s, v1.4s, v6.4s\n"
3984       "fmul v2.4s, v2.4s, v6.4s\n"
3985       "fadd v0.4s, v0.4s, v4.4s\n"
3986       "fadd v1.4s, v1.4s, v4.4s\n"
3987       "fadd v2.4s, v2.4s, v4.4s\n"
3988 
3989       "st1 {v0.4s, v1.4s}, [%x[output]], #32\n"
3990       "st1 {v2.2s}, [%x[output]], #8\n"
3991       "prfm pldl1keep, [%x[output]]\n"
3992       : [count] "+r"(params_count_copy), [input] "+r"(input),
3993         [output] "+r"(output)
3994       : [range_offset] "r"(params.range_offset),
3995         [range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
3996       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
3997 }
3998 
3999 template <>
Transform(const uint8_t * input,const Dequantize & params,float * output)4000 inline void Transform1DKernel<uint8_t, float, Dequantize, 16, 11>::Transform(
4001     const uint8_t* input, const Dequantize& params, float* output) {
4002 #ifdef DEBUG
4003 #ifdef DEBUG_METAGEMM_VERBOSE
4004   std::cout << __FILE__ << "(" << __LINE__
4005             << ") Dequantize<uint8_t, float, Dequantize, 16, 11>::Transform()"
4006             << std::endl
4007             << std::flush;
4008 #endif
4009 #endif
4010   int params_count_copy = params.count;
4011   asm volatile(
4012 
4013       // Dequantize::Prepare
4014       "dup v4.4s, %w[range_min]\n"
4015       "dup v5.4s, %w[range_offset]\n"
4016       "dup v6.4s, %w[range_scale]\n"
4017 
4018       // Reduce count by leftovers.
4019       "subs %x[count], %x[count], #11\n"
4020       "beq 2f\n"
4021 
4022       "1:"
4023       "subs %x[count], %x[count], #16\n"
4024 
4025       // Dequantize::Transform
4026       "ld1 {v0.4s}, [%x[input]], #16\n"
4027       "prfm pldl1keep, [%x[input], #32]\n"
4028       "uxtl2 v1.8h, v0.16b\n"
4029       "uxtl v0.8h, v0.8b\n"
4030       "sxtl2 v3.4s, v1.8h\n"
4031       "sxtl v2.4s, v1.4h\n"
4032       "sxtl2 v1.4s, v0.8h\n"
4033       "sxtl v0.4s, v0.4h\n"
4034       "scvtf v0.4s, v0.4s\n"
4035       "scvtf v1.4s, v1.4s\n"
4036       "scvtf v2.4s, v2.4s\n"
4037       "scvtf v3.4s, v3.4s\n"
4038       "fsub v0.4s, v0.4s, v5.4s\n"
4039       "fsub v1.4s, v1.4s, v5.4s\n"
4040       "fsub v2.4s, v2.4s, v5.4s\n"
4041       "fsub v3.4s, v3.4s, v5.4s\n"
4042       "fmul v0.4s, v0.4s, v6.4s\n"
4043       "fmul v1.4s, v1.4s, v6.4s\n"
4044       "fmul v2.4s, v2.4s, v6.4s\n"
4045       "fmul v3.4s, v3.4s, v6.4s\n"
4046       "fadd v0.4s, v0.4s, v4.4s\n"
4047       "fadd v1.4s, v1.4s, v4.4s\n"
4048       "fadd v2.4s, v2.4s, v4.4s\n"
4049       "fadd v3.4s, v3.4s, v4.4s\n"
4050 
4051       "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
4052       "prfm pldl1keep, [%x[output]]\n"
4053 
4054       "bne 1b\n"
4055       "2:"
4056 
4057       // Handle leftovers.
4058 
4059       // Dequantize::Transform
4060       "ld1 {v0.2s}, [%x[input]], #8\n"
4061       "ld1 {v0.h}[4], [%x[input]], #2\n"
4062       "ld1 {v0.b}[10], [%x[input]], #1\n"
4063       "prfm pldl1keep, [%x[input], #32]\n"
4064       "uxtl2 v1.8h, v0.16b\n"
4065       "uxtl v0.8h, v0.8b\n"
4066       "sxtl v2.4s, v1.4h\n"
4067       "sxtl2 v1.4s, v0.8h\n"
4068       "sxtl v0.4s, v0.4h\n"
4069       "scvtf v0.4s, v0.4s\n"
4070       "scvtf v1.4s, v1.4s\n"
4071       "scvtf v2.4s, v2.4s\n"
4072       "fsub v0.4s, v0.4s, v5.4s\n"
4073       "fsub v1.4s, v1.4s, v5.4s\n"
4074       "fsub v2.4s, v2.4s, v5.4s\n"
4075       "fmul v0.4s, v0.4s, v6.4s\n"
4076       "fmul v1.4s, v1.4s, v6.4s\n"
4077       "fmul v2.4s, v2.4s, v6.4s\n"
4078       "fadd v0.4s, v0.4s, v4.4s\n"
4079       "fadd v1.4s, v1.4s, v4.4s\n"
4080       "fadd v2.4s, v2.4s, v4.4s\n"
4081 
4082       "st1 {v0.4s, v1.4s}, [%x[output]], #32\n"
4083       "st1 {v2.2s}, [%x[output]], #8\n"
4084       "st1 {v2.s}[2], [%x[output]], #4\n"
4085       "prfm pldl1keep, [%x[output]]\n"
4086       : [count] "+r"(params_count_copy), [input] "+r"(input),
4087         [output] "+r"(output)
4088       : [range_offset] "r"(params.range_offset),
4089         [range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
4090       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
4091 }
4092 
4093 template <>
Transform(const uint8_t * input,const Dequantize & params,float * output)4094 inline void Transform1DKernel<uint8_t, float, Dequantize, 16, 12>::Transform(
4095     const uint8_t* input, const Dequantize& params, float* output) {
4096 #ifdef DEBUG
4097 #ifdef DEBUG_METAGEMM_VERBOSE
4098   std::cout << __FILE__ << "(" << __LINE__
4099             << ") Dequantize<uint8_t, float, Dequantize, 16, 12>::Transform()"
4100             << std::endl
4101             << std::flush;
4102 #endif
4103 #endif
4104   int params_count_copy = params.count;
4105   asm volatile(
4106 
4107       // Dequantize::Prepare
4108       "dup v4.4s, %w[range_min]\n"
4109       "dup v5.4s, %w[range_offset]\n"
4110       "dup v6.4s, %w[range_scale]\n"
4111 
4112       // Reduce count by leftovers.
4113       "subs %x[count], %x[count], #12\n"
4114       "beq 2f\n"
4115 
4116       "1:"
4117       "subs %x[count], %x[count], #16\n"
4118 
4119       // Dequantize::Transform
4120       "ld1 {v0.4s}, [%x[input]], #16\n"
4121       "prfm pldl1keep, [%x[input], #32]\n"
4122       "uxtl2 v1.8h, v0.16b\n"
4123       "uxtl v0.8h, v0.8b\n"
4124       "sxtl2 v3.4s, v1.8h\n"
4125       "sxtl v2.4s, v1.4h\n"
4126       "sxtl2 v1.4s, v0.8h\n"
4127       "sxtl v0.4s, v0.4h\n"
4128       "scvtf v0.4s, v0.4s\n"
4129       "scvtf v1.4s, v1.4s\n"
4130       "scvtf v2.4s, v2.4s\n"
4131       "scvtf v3.4s, v3.4s\n"
4132       "fsub v0.4s, v0.4s, v5.4s\n"
4133       "fsub v1.4s, v1.4s, v5.4s\n"
4134       "fsub v2.4s, v2.4s, v5.4s\n"
4135       "fsub v3.4s, v3.4s, v5.4s\n"
4136       "fmul v0.4s, v0.4s, v6.4s\n"
4137       "fmul v1.4s, v1.4s, v6.4s\n"
4138       "fmul v2.4s, v2.4s, v6.4s\n"
4139       "fmul v3.4s, v3.4s, v6.4s\n"
4140       "fadd v0.4s, v0.4s, v4.4s\n"
4141       "fadd v1.4s, v1.4s, v4.4s\n"
4142       "fadd v2.4s, v2.4s, v4.4s\n"
4143       "fadd v3.4s, v3.4s, v4.4s\n"
4144 
4145       "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
4146       "prfm pldl1keep, [%x[output]]\n"
4147 
4148       "bne 1b\n"
4149       "2:"
4150 
4151       // Handle leftovers.
4152 
4153       // Dequantize::Transform
4154       "ld1 {v0.2s}, [%x[input]], #8\n"
4155       "ld1 {v0.s}[2], [%x[input]], #4\n"
4156       "prfm pldl1keep, [%x[input], #32]\n"
4157       "uxtl2 v1.8h, v0.16b\n"
4158       "uxtl v0.8h, v0.8b\n"
4159       "sxtl v2.4s, v1.4h\n"
4160       "sxtl2 v1.4s, v0.8h\n"
4161       "sxtl v0.4s, v0.4h\n"
4162       "scvtf v0.4s, v0.4s\n"
4163       "scvtf v1.4s, v1.4s\n"
4164       "scvtf v2.4s, v2.4s\n"
4165       "fsub v0.4s, v0.4s, v5.4s\n"
4166       "fsub v1.4s, v1.4s, v5.4s\n"
4167       "fsub v2.4s, v2.4s, v5.4s\n"
4168       "fmul v0.4s, v0.4s, v6.4s\n"
4169       "fmul v1.4s, v1.4s, v6.4s\n"
4170       "fmul v2.4s, v2.4s, v6.4s\n"
4171       "fadd v0.4s, v0.4s, v4.4s\n"
4172       "fadd v1.4s, v1.4s, v4.4s\n"
4173       "fadd v2.4s, v2.4s, v4.4s\n"
4174 
4175       "st1 {v0.4s, v1.4s, v2.4s}, [%x[output]], #48\n"
4176       "prfm pldl1keep, [%x[output]]\n"
4177       : [count] "+r"(params_count_copy), [input] "+r"(input),
4178         [output] "+r"(output)
4179       : [range_offset] "r"(params.range_offset),
4180         [range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
4181       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
4182 }
4183 
4184 template <>
Transform(const uint8_t * input,const Dequantize & params,float * output)4185 inline void Transform1DKernel<uint8_t, float, Dequantize, 16, 13>::Transform(
4186     const uint8_t* input, const Dequantize& params, float* output) {
4187 #ifdef DEBUG
4188 #ifdef DEBUG_METAGEMM_VERBOSE
4189   std::cout << __FILE__ << "(" << __LINE__
4190             << ") Dequantize<uint8_t, float, Dequantize, 16, 13>::Transform()"
4191             << std::endl
4192             << std::flush;
4193 #endif
4194 #endif
4195   int params_count_copy = params.count;
4196   asm volatile(
4197 
4198       // Dequantize::Prepare
4199       "dup v4.4s, %w[range_min]\n"
4200       "dup v5.4s, %w[range_offset]\n"
4201       "dup v6.4s, %w[range_scale]\n"
4202 
4203       // Reduce count by leftovers.
4204       "subs %x[count], %x[count], #13\n"
4205       "beq 2f\n"
4206 
4207       "1:"
4208       "subs %x[count], %x[count], #16\n"
4209 
4210       // Dequantize::Transform
4211       "ld1 {v0.4s}, [%x[input]], #16\n"
4212       "prfm pldl1keep, [%x[input], #32]\n"
4213       "uxtl2 v1.8h, v0.16b\n"
4214       "uxtl v0.8h, v0.8b\n"
4215       "sxtl2 v3.4s, v1.8h\n"
4216       "sxtl v2.4s, v1.4h\n"
4217       "sxtl2 v1.4s, v0.8h\n"
4218       "sxtl v0.4s, v0.4h\n"
4219       "scvtf v0.4s, v0.4s\n"
4220       "scvtf v1.4s, v1.4s\n"
4221       "scvtf v2.4s, v2.4s\n"
4222       "scvtf v3.4s, v3.4s\n"
4223       "fsub v0.4s, v0.4s, v5.4s\n"
4224       "fsub v1.4s, v1.4s, v5.4s\n"
4225       "fsub v2.4s, v2.4s, v5.4s\n"
4226       "fsub v3.4s, v3.4s, v5.4s\n"
4227       "fmul v0.4s, v0.4s, v6.4s\n"
4228       "fmul v1.4s, v1.4s, v6.4s\n"
4229       "fmul v2.4s, v2.4s, v6.4s\n"
4230       "fmul v3.4s, v3.4s, v6.4s\n"
4231       "fadd v0.4s, v0.4s, v4.4s\n"
4232       "fadd v1.4s, v1.4s, v4.4s\n"
4233       "fadd v2.4s, v2.4s, v4.4s\n"
4234       "fadd v3.4s, v3.4s, v4.4s\n"
4235 
4236       "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
4237       "prfm pldl1keep, [%x[output]]\n"
4238 
4239       "bne 1b\n"
4240       "2:"
4241 
4242       // Handle leftovers.
4243 
4244       // Dequantize::Transform
4245       "ld1 {v0.2s}, [%x[input]], #8\n"
4246       "ld1 {v0.s}[2], [%x[input]], #4\n"
4247       "ld1 {v0.b}[12], [%x[input]], #1\n"
4248       "prfm pldl1keep, [%x[input], #32]\n"
4249       "uxtl2 v1.8h, v0.16b\n"
4250       "uxtl v0.8h, v0.8b\n"
4251       "sxtl2 v3.4s, v1.8h\n"
4252       "sxtl v2.4s, v1.4h\n"
4253       "sxtl2 v1.4s, v0.8h\n"
4254       "sxtl v0.4s, v0.4h\n"
4255       "scvtf v0.4s, v0.4s\n"
4256       "scvtf v1.4s, v1.4s\n"
4257       "scvtf v2.4s, v2.4s\n"
4258       "scvtf v3.4s, v3.4s\n"
4259       "fsub v0.4s, v0.4s, v5.4s\n"
4260       "fsub v1.4s, v1.4s, v5.4s\n"
4261       "fsub v2.4s, v2.4s, v5.4s\n"
4262       "fsub v3.4s, v3.4s, v5.4s\n"
4263       "fmul v0.4s, v0.4s, v6.4s\n"
4264       "fmul v1.4s, v1.4s, v6.4s\n"
4265       "fmul v2.4s, v2.4s, v6.4s\n"
4266       "fmul v3.4s, v3.4s, v6.4s\n"
4267       "fadd v0.4s, v0.4s, v4.4s\n"
4268       "fadd v1.4s, v1.4s, v4.4s\n"
4269       "fadd v2.4s, v2.4s, v4.4s\n"
4270       "fadd v3.4s, v3.4s, v4.4s\n"
4271 
4272       "st1 {v0.4s, v1.4s, v2.4s}, [%x[output]], #48\n"
4273       "st1 {v3.s}[0], [%x[output]], #4\n"
4274       "prfm pldl1keep, [%x[output]]\n"
4275       : [count] "+r"(params_count_copy), [input] "+r"(input),
4276         [output] "+r"(output)
4277       : [range_offset] "r"(params.range_offset),
4278         [range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
4279       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
4280 }
4281 
4282 template <>
Transform(const uint8_t * input,const Dequantize & params,float * output)4283 inline void Transform1DKernel<uint8_t, float, Dequantize, 16, 14>::Transform(
4284     const uint8_t* input, const Dequantize& params, float* output) {
4285 #ifdef DEBUG
4286 #ifdef DEBUG_METAGEMM_VERBOSE
4287   std::cout << __FILE__ << "(" << __LINE__
4288             << ") Dequantize<uint8_t, float, Dequantize, 16, 14>::Transform()"
4289             << std::endl
4290             << std::flush;
4291 #endif
4292 #endif
4293   int params_count_copy = params.count;
4294   asm volatile(
4295 
4296       // Dequantize::Prepare
4297       "dup v4.4s, %w[range_min]\n"
4298       "dup v5.4s, %w[range_offset]\n"
4299       "dup v6.4s, %w[range_scale]\n"
4300 
4301       // Reduce count by leftovers.
4302       "subs %x[count], %x[count], #14\n"
4303       "beq 2f\n"
4304 
4305       "1:"
4306       "subs %x[count], %x[count], #16\n"
4307 
4308       // Dequantize::Transform
4309       "ld1 {v0.4s}, [%x[input]], #16\n"
4310       "prfm pldl1keep, [%x[input], #32]\n"
4311       "uxtl2 v1.8h, v0.16b\n"
4312       "uxtl v0.8h, v0.8b\n"
4313       "sxtl2 v3.4s, v1.8h\n"
4314       "sxtl v2.4s, v1.4h\n"
4315       "sxtl2 v1.4s, v0.8h\n"
4316       "sxtl v0.4s, v0.4h\n"
4317       "scvtf v0.4s, v0.4s\n"
4318       "scvtf v1.4s, v1.4s\n"
4319       "scvtf v2.4s, v2.4s\n"
4320       "scvtf v3.4s, v3.4s\n"
4321       "fsub v0.4s, v0.4s, v5.4s\n"
4322       "fsub v1.4s, v1.4s, v5.4s\n"
4323       "fsub v2.4s, v2.4s, v5.4s\n"
4324       "fsub v3.4s, v3.4s, v5.4s\n"
4325       "fmul v0.4s, v0.4s, v6.4s\n"
4326       "fmul v1.4s, v1.4s, v6.4s\n"
4327       "fmul v2.4s, v2.4s, v6.4s\n"
4328       "fmul v3.4s, v3.4s, v6.4s\n"
4329       "fadd v0.4s, v0.4s, v4.4s\n"
4330       "fadd v1.4s, v1.4s, v4.4s\n"
4331       "fadd v2.4s, v2.4s, v4.4s\n"
4332       "fadd v3.4s, v3.4s, v4.4s\n"
4333 
4334       "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
4335       "prfm pldl1keep, [%x[output]]\n"
4336 
4337       "bne 1b\n"
4338       "2:"
4339 
4340       // Handle leftovers.
4341 
4342       // Dequantize::Transform
4343       "ld1 {v0.2s}, [%x[input]], #8\n"
4344       "ld1 {v0.s}[2], [%x[input]], #4\n"
4345       "ld1 {v0.h}[6], [%x[input]], #2\n"
4346       "prfm pldl1keep, [%x[input], #32]\n"
4347       "uxtl2 v1.8h, v0.16b\n"
4348       "uxtl v0.8h, v0.8b\n"
4349       "sxtl2 v3.4s, v1.8h\n"
4350       "sxtl v2.4s, v1.4h\n"
4351       "sxtl2 v1.4s, v0.8h\n"
4352       "sxtl v0.4s, v0.4h\n"
4353       "scvtf v0.4s, v0.4s\n"
4354       "scvtf v1.4s, v1.4s\n"
4355       "scvtf v2.4s, v2.4s\n"
4356       "scvtf v3.4s, v3.4s\n"
4357       "fsub v0.4s, v0.4s, v5.4s\n"
4358       "fsub v1.4s, v1.4s, v5.4s\n"
4359       "fsub v2.4s, v2.4s, v5.4s\n"
4360       "fsub v3.4s, v3.4s, v5.4s\n"
4361       "fmul v0.4s, v0.4s, v6.4s\n"
4362       "fmul v1.4s, v1.4s, v6.4s\n"
4363       "fmul v2.4s, v2.4s, v6.4s\n"
4364       "fmul v3.4s, v3.4s, v6.4s\n"
4365       "fadd v0.4s, v0.4s, v4.4s\n"
4366       "fadd v1.4s, v1.4s, v4.4s\n"
4367       "fadd v2.4s, v2.4s, v4.4s\n"
4368       "fadd v3.4s, v3.4s, v4.4s\n"
4369 
4370       "st1 {v0.4s, v1.4s, v2.4s}, [%x[output]], #48\n"
4371       "st1 {v3.2s}, [%x[output]], #8\n"
4372       "prfm pldl1keep, [%x[output]]\n"
4373       : [count] "+r"(params_count_copy), [input] "+r"(input),
4374         [output] "+r"(output)
4375       : [range_offset] "r"(params.range_offset),
4376         [range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
4377       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
4378 }
4379 
4380 template <>
Transform(const uint8_t * input,const Dequantize & params,float * output)4381 inline void Transform1DKernel<uint8_t, float, Dequantize, 16, 15>::Transform(
4382     const uint8_t* input, const Dequantize& params, float* output) {
4383 #ifdef DEBUG
4384 #ifdef DEBUG_METAGEMM_VERBOSE
4385   std::cout << __FILE__ << "(" << __LINE__
4386             << ") Dequantize<uint8_t, float, Dequantize, 16, 15>::Transform()"
4387             << std::endl
4388             << std::flush;
4389 #endif
4390 #endif
4391   int params_count_copy = params.count;
4392   asm volatile(
4393 
4394       // Dequantize::Prepare
4395       "dup v4.4s, %w[range_min]\n"
4396       "dup v5.4s, %w[range_offset]\n"
4397       "dup v6.4s, %w[range_scale]\n"
4398 
4399       // Reduce count by leftovers.
4400       "subs %x[count], %x[count], #15\n"
4401       "beq 2f\n"
4402 
4403       "1:"
4404       "subs %x[count], %x[count], #16\n"
4405 
4406       // Dequantize::Transform
4407       "ld1 {v0.4s}, [%x[input]], #16\n"
4408       "prfm pldl1keep, [%x[input], #32]\n"
4409       "uxtl2 v1.8h, v0.16b\n"
4410       "uxtl v0.8h, v0.8b\n"
4411       "sxtl2 v3.4s, v1.8h\n"
4412       "sxtl v2.4s, v1.4h\n"
4413       "sxtl2 v1.4s, v0.8h\n"
4414       "sxtl v0.4s, v0.4h\n"
4415       "scvtf v0.4s, v0.4s\n"
4416       "scvtf v1.4s, v1.4s\n"
4417       "scvtf v2.4s, v2.4s\n"
4418       "scvtf v3.4s, v3.4s\n"
4419       "fsub v0.4s, v0.4s, v5.4s\n"
4420       "fsub v1.4s, v1.4s, v5.4s\n"
4421       "fsub v2.4s, v2.4s, v5.4s\n"
4422       "fsub v3.4s, v3.4s, v5.4s\n"
4423       "fmul v0.4s, v0.4s, v6.4s\n"
4424       "fmul v1.4s, v1.4s, v6.4s\n"
4425       "fmul v2.4s, v2.4s, v6.4s\n"
4426       "fmul v3.4s, v3.4s, v6.4s\n"
4427       "fadd v0.4s, v0.4s, v4.4s\n"
4428       "fadd v1.4s, v1.4s, v4.4s\n"
4429       "fadd v2.4s, v2.4s, v4.4s\n"
4430       "fadd v3.4s, v3.4s, v4.4s\n"
4431 
4432       "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
4433       "prfm pldl1keep, [%x[output]]\n"
4434 
4435       "bne 1b\n"
4436       "2:"
4437 
4438       // Handle leftovers.
4439 
4440       // Dequantize::Transform
4441       "ld1 {v0.2s}, [%x[input]], #8\n"
4442       "ld1 {v0.s}[2], [%x[input]], #4\n"
4443       "ld1 {v0.h}[6], [%x[input]], #2\n"
4444       "ld1 {v0.b}[14], [%x[input]], #1\n"
4445       "prfm pldl1keep, [%x[input], #32]\n"
4446       "uxtl2 v1.8h, v0.16b\n"
4447       "uxtl v0.8h, v0.8b\n"
4448       "sxtl2 v3.4s, v1.8h\n"
4449       "sxtl v2.4s, v1.4h\n"
4450       "sxtl2 v1.4s, v0.8h\n"
4451       "sxtl v0.4s, v0.4h\n"
4452       "scvtf v0.4s, v0.4s\n"
4453       "scvtf v1.4s, v1.4s\n"
4454       "scvtf v2.4s, v2.4s\n"
4455       "scvtf v3.4s, v3.4s\n"
4456       "fsub v0.4s, v0.4s, v5.4s\n"
4457       "fsub v1.4s, v1.4s, v5.4s\n"
4458       "fsub v2.4s, v2.4s, v5.4s\n"
4459       "fsub v3.4s, v3.4s, v5.4s\n"
4460       "fmul v0.4s, v0.4s, v6.4s\n"
4461       "fmul v1.4s, v1.4s, v6.4s\n"
4462       "fmul v2.4s, v2.4s, v6.4s\n"
4463       "fmul v3.4s, v3.4s, v6.4s\n"
4464       "fadd v0.4s, v0.4s, v4.4s\n"
4465       "fadd v1.4s, v1.4s, v4.4s\n"
4466       "fadd v2.4s, v2.4s, v4.4s\n"
4467       "fadd v3.4s, v3.4s, v4.4s\n"
4468 
4469       "st1 {v0.4s, v1.4s, v2.4s}, [%x[output]], #48\n"
4470       "st1 {v3.2s}, [%x[output]], #8\n"
4471       "st1 {v3.s}[2], [%x[output]], #4\n"
4472       "prfm pldl1keep, [%x[output]]\n"
4473       : [count] "+r"(params_count_copy), [input] "+r"(input),
4474         [output] "+r"(output)
4475       : [range_offset] "r"(params.range_offset),
4476         [range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
4477       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
4478 }
4479 
4480 template <>
4481 inline void Transform1DKernel<uint8_t, uint8_t, MinMax<uint8_t>, 16,
Transform(const uint8_t * input,const MinMax<uint8_t> & params,uint8_t * output)4482                               0>::Transform(const uint8_t* input,
4483                                             const MinMax<uint8_t>& params,
4484                                             uint8_t* output) {
4485 #ifdef DEBUG
4486 #ifdef DEBUG_METAGEMM_VERBOSE
4487   std::cout << __FILE__ << "(" << __LINE__
4488             << ") MinMax<uint8_t><uint8_t, uint8_t, MinMax<uint8_t>, 16, "
4489                "0>::Transform()"
4490             << std::endl
4491             << std::flush;
4492 #endif
4493 #endif
4494   int params_count_copy = params.count;
4495   asm volatile(
4496 
4497       // MinMax::Prepare
4498       "dup v4.16b, %w[min]\n"
4499       "dup v5.16b, %w[max]\n"
4500 
4501       "1:"
4502       "subs %x[count], %x[count], #16\n"
4503 
4504       // MinMax::Transform
4505       "ld1 {v0.4s}, [%x[input]], #16\n"
4506       "prfm pldl1keep, [%x[input], #16]\n"
4507       "umax v0.16b, v0.16b, v4.16b\n"
4508       "umin v0.16b, v0.16b, v5.16b\n"
4509 
4510       "st1 {v0.4s}, [%x[output]], #16\n"
4511       "prfm pldl1keep, [%x[output]]\n"
4512 
4513       "bne 1b\n"
4514       : [count] "+r"(params_count_copy), [input] "+r"(input),
4515         [output] "+r"(output)
4516       : [max] "r"(params.max), [min] "r"(params.min)
4517       : "v0", "v4", "v5", "cc", "memory");
4518 }
4519 
4520 template <>
4521 inline void Transform1DKernel<uint8_t, uint8_t, MinMax<uint8_t>, 16,
Transform(const uint8_t * input,const MinMax<uint8_t> & params,uint8_t * output)4522                               1>::Transform(const uint8_t* input,
4523                                             const MinMax<uint8_t>& params,
4524                                             uint8_t* output) {
4525 #ifdef DEBUG
4526 #ifdef DEBUG_METAGEMM_VERBOSE
4527   std::cout << __FILE__ << "(" << __LINE__
4528             << ") MinMax<uint8_t><uint8_t, uint8_t, MinMax<uint8_t>, 16, "
4529                "1>::Transform()"
4530             << std::endl
4531             << std::flush;
4532 #endif
4533 #endif
4534   int params_count_copy = params.count;
4535   asm volatile(
4536 
4537       // MinMax::Prepare
4538       "dup v4.16b, %w[min]\n"
4539       "dup v5.16b, %w[max]\n"
4540 
4541       // Reduce count by leftovers.
4542       "subs %x[count], %x[count], #1\n"
4543       "beq 2f\n"
4544 
4545       "1:"
4546       "subs %x[count], %x[count], #16\n"
4547 
4548       // MinMax::Transform
4549       "ld1 {v0.4s}, [%x[input]], #16\n"
4550       "prfm pldl1keep, [%x[input], #16]\n"
4551       "umax v0.16b, v0.16b, v4.16b\n"
4552       "umin v0.16b, v0.16b, v5.16b\n"
4553 
4554       "st1 {v0.4s}, [%x[output]], #16\n"
4555       "prfm pldl1keep, [%x[output]]\n"
4556 
4557       "bne 1b\n"
4558       "2:"
4559 
4560       // Handle leftovers.
4561 
4562       // MinMax::Transform
4563       "ld1 {v0.b}[0], [%x[input]], #1\n"
4564       "prfm pldl1keep, [%x[input], #16]\n"
4565       "umax v0.16b, v0.16b, v4.16b\n"
4566       "umin v0.16b, v0.16b, v5.16b\n"
4567 
4568       "st1 {v0.b}[0], [%x[output]], #1\n"
4569       "prfm pldl1keep, [%x[output]]\n"
4570       : [count] "+r"(params_count_copy), [input] "+r"(input),
4571         [output] "+r"(output)
4572       : [max] "r"(params.max), [min] "r"(params.min)
4573       : "v0", "v4", "v5", "cc", "memory");
4574 }
4575 
4576 template <>
4577 inline void Transform1DKernel<uint8_t, uint8_t, MinMax<uint8_t>, 16,
Transform(const uint8_t * input,const MinMax<uint8_t> & params,uint8_t * output)4578                               2>::Transform(const uint8_t* input,
4579                                             const MinMax<uint8_t>& params,
4580                                             uint8_t* output) {
4581 #ifdef DEBUG
4582 #ifdef DEBUG_METAGEMM_VERBOSE
4583   std::cout << __FILE__ << "(" << __LINE__
4584             << ") MinMax<uint8_t><uint8_t, uint8_t, MinMax<uint8_t>, 16, "
4585                "2>::Transform()"
4586             << std::endl
4587             << std::flush;
4588 #endif
4589 #endif
4590   int params_count_copy = params.count;
4591   asm volatile(
4592 
4593       // MinMax::Prepare
4594       "dup v4.16b, %w[min]\n"
4595       "dup v5.16b, %w[max]\n"
4596 
4597       // Reduce count by leftovers.
4598       "subs %x[count], %x[count], #2\n"
4599       "beq 2f\n"
4600 
4601       "1:"
4602       "subs %x[count], %x[count], #16\n"
4603 
4604       // MinMax::Transform
4605       "ld1 {v0.4s}, [%x[input]], #16\n"
4606       "prfm pldl1keep, [%x[input], #16]\n"
4607       "umax v0.16b, v0.16b, v4.16b\n"
4608       "umin v0.16b, v0.16b, v5.16b\n"
4609 
4610       "st1 {v0.4s}, [%x[output]], #16\n"
4611       "prfm pldl1keep, [%x[output]]\n"
4612 
4613       "bne 1b\n"
4614       "2:"
4615 
4616       // Handle leftovers.
4617 
4618       // MinMax::Transform
4619       "ld1 {v0.h}[0], [%x[input]], #2\n"
4620       "prfm pldl1keep, [%x[input], #16]\n"
4621       "umax v0.16b, v0.16b, v4.16b\n"
4622       "umin v0.16b, v0.16b, v5.16b\n"
4623 
4624       "st1 {v0.h}[0], [%x[output]], #2\n"
4625       "prfm pldl1keep, [%x[output]]\n"
4626       : [count] "+r"(params_count_copy), [input] "+r"(input),
4627         [output] "+r"(output)
4628       : [max] "r"(params.max), [min] "r"(params.min)
4629       : "v0", "v4", "v5", "cc", "memory");
4630 }
4631 
4632 template <>
4633 inline void Transform1DKernel<uint8_t, uint8_t, MinMax<uint8_t>, 16,
Transform(const uint8_t * input,const MinMax<uint8_t> & params,uint8_t * output)4634                               3>::Transform(const uint8_t* input,
4635                                             const MinMax<uint8_t>& params,
4636                                             uint8_t* output) {
4637 #ifdef DEBUG
4638 #ifdef DEBUG_METAGEMM_VERBOSE
4639   std::cout << __FILE__ << "(" << __LINE__
4640             << ") MinMax<uint8_t><uint8_t, uint8_t, MinMax<uint8_t>, 16, "
4641                "3>::Transform()"
4642             << std::endl
4643             << std::flush;
4644 #endif
4645 #endif
4646   int params_count_copy = params.count;
4647   asm volatile(
4648 
4649       // MinMax::Prepare
4650       "dup v4.16b, %w[min]\n"
4651       "dup v5.16b, %w[max]\n"
4652 
4653       // Reduce count by leftovers.
4654       "subs %x[count], %x[count], #3\n"
4655       "beq 2f\n"
4656 
4657       "1:"
4658       "subs %x[count], %x[count], #16\n"
4659 
4660       // MinMax::Transform
4661       "ld1 {v0.4s}, [%x[input]], #16\n"
4662       "prfm pldl1keep, [%x[input], #16]\n"
4663       "umax v0.16b, v0.16b, v4.16b\n"
4664       "umin v0.16b, v0.16b, v5.16b\n"
4665 
4666       "st1 {v0.4s}, [%x[output]], #16\n"
4667       "prfm pldl1keep, [%x[output]]\n"
4668 
4669       "bne 1b\n"
4670       "2:"
4671 
4672       // Handle leftovers.
4673 
4674       // MinMax::Transform
4675       "ld1 {v0.h}[0], [%x[input]], #2\n"
4676       "ld1 {v0.b}[2], [%x[input]], #1\n"
4677       "prfm pldl1keep, [%x[input], #16]\n"
4678       "umax v0.16b, v0.16b, v4.16b\n"
4679       "umin v0.16b, v0.16b, v5.16b\n"
4680 
4681       "st1 {v0.h}[0], [%x[output]], #2\n"
4682       "st1 {v0.b}[2], [%x[output]], #1\n"
4683       "prfm pldl1keep, [%x[output]]\n"
4684       : [count] "+r"(params_count_copy), [input] "+r"(input),
4685         [output] "+r"(output)
4686       : [max] "r"(params.max), [min] "r"(params.min)
4687       : "v0", "v4", "v5", "cc", "memory");
4688 }
4689 
4690 template <>
4691 inline void Transform1DKernel<uint8_t, uint8_t, MinMax<uint8_t>, 16,
Transform(const uint8_t * input,const MinMax<uint8_t> & params,uint8_t * output)4692                               4>::Transform(const uint8_t* input,
4693                                             const MinMax<uint8_t>& params,
4694                                             uint8_t* output) {
4695 #ifdef DEBUG
4696 #ifdef DEBUG_METAGEMM_VERBOSE
4697   std::cout << __FILE__ << "(" << __LINE__
4698             << ") MinMax<uint8_t><uint8_t, uint8_t, MinMax<uint8_t>, 16, "
4699                "4>::Transform()"
4700             << std::endl
4701             << std::flush;
4702 #endif
4703 #endif
4704   int params_count_copy = params.count;
4705   asm volatile(
4706 
4707       // MinMax::Prepare
4708       "dup v4.16b, %w[min]\n"
4709       "dup v5.16b, %w[max]\n"
4710 
4711       // Reduce count by leftovers.
4712       "subs %x[count], %x[count], #4\n"
4713       "beq 2f\n"
4714 
4715       "1:"
4716       "subs %x[count], %x[count], #16\n"
4717 
4718       // MinMax::Transform
4719       "ld1 {v0.4s}, [%x[input]], #16\n"
4720       "prfm pldl1keep, [%x[input], #16]\n"
4721       "umax v0.16b, v0.16b, v4.16b\n"
4722       "umin v0.16b, v0.16b, v5.16b\n"
4723 
4724       "st1 {v0.4s}, [%x[output]], #16\n"
4725       "prfm pldl1keep, [%x[output]]\n"
4726 
4727       "bne 1b\n"
4728       "2:"
4729 
4730       // Handle leftovers.
4731 
4732       // MinMax::Transform
4733       "ld1 {v0.s}[0], [%x[input]], #4\n"
4734       "prfm pldl1keep, [%x[input], #16]\n"
4735       "umax v0.16b, v0.16b, v4.16b\n"
4736       "umin v0.16b, v0.16b, v5.16b\n"
4737 
4738       "st1 {v0.s}[0], [%x[output]], #4\n"
4739       "prfm pldl1keep, [%x[output]]\n"
4740       : [count] "+r"(params_count_copy), [input] "+r"(input),
4741         [output] "+r"(output)
4742       : [max] "r"(params.max), [min] "r"(params.min)
4743       : "v0", "v4", "v5", "cc", "memory");
4744 }
4745 
4746 template <>
4747 inline void Transform1DKernel<uint8_t, uint8_t, MinMax<uint8_t>, 16,
Transform(const uint8_t * input,const MinMax<uint8_t> & params,uint8_t * output)4748                               5>::Transform(const uint8_t* input,
4749                                             const MinMax<uint8_t>& params,
4750                                             uint8_t* output) {
4751 #ifdef DEBUG
4752 #ifdef DEBUG_METAGEMM_VERBOSE
4753   std::cout << __FILE__ << "(" << __LINE__
4754             << ") MinMax<uint8_t><uint8_t, uint8_t, MinMax<uint8_t>, 16, "
4755                "5>::Transform()"
4756             << std::endl
4757             << std::flush;
4758 #endif
4759 #endif
4760   int params_count_copy = params.count;
4761   asm volatile(
4762 
4763       // MinMax::Prepare
4764       "dup v4.16b, %w[min]\n"
4765       "dup v5.16b, %w[max]\n"
4766 
4767       // Reduce count by leftovers.
4768       "subs %x[count], %x[count], #5\n"
4769       "beq 2f\n"
4770 
4771       "1:"
4772       "subs %x[count], %x[count], #16\n"
4773 
4774       // MinMax::Transform
4775       "ld1 {v0.4s}, [%x[input]], #16\n"
4776       "prfm pldl1keep, [%x[input], #16]\n"
4777       "umax v0.16b, v0.16b, v4.16b\n"
4778       "umin v0.16b, v0.16b, v5.16b\n"
4779 
4780       "st1 {v0.4s}, [%x[output]], #16\n"
4781       "prfm pldl1keep, [%x[output]]\n"
4782 
4783       "bne 1b\n"
4784       "2:"
4785 
4786       // Handle leftovers.
4787 
4788       // MinMax::Transform
4789       "ld1 {v0.s}[0], [%x[input]], #4\n"
4790       "ld1 {v0.b}[4], [%x[input]], #1\n"
4791       "prfm pldl1keep, [%x[input], #16]\n"
4792       "umax v0.16b, v0.16b, v4.16b\n"
4793       "umin v0.16b, v0.16b, v5.16b\n"
4794 
4795       "st1 {v0.s}[0], [%x[output]], #4\n"
4796       "st1 {v0.b}[4], [%x[output]], #1\n"
4797       "prfm pldl1keep, [%x[output]]\n"
4798       : [count] "+r"(params_count_copy), [input] "+r"(input),
4799         [output] "+r"(output)
4800       : [max] "r"(params.max), [min] "r"(params.min)
4801       : "v0", "v4", "v5", "cc", "memory");
4802 }
4803 
4804 template <>
4805 inline void Transform1DKernel<uint8_t, uint8_t, MinMax<uint8_t>, 16,
Transform(const uint8_t * input,const MinMax<uint8_t> & params,uint8_t * output)4806                               6>::Transform(const uint8_t* input,
4807                                             const MinMax<uint8_t>& params,
4808                                             uint8_t* output) {
4809 #ifdef DEBUG
4810 #ifdef DEBUG_METAGEMM_VERBOSE
4811   std::cout << __FILE__ << "(" << __LINE__
4812             << ") MinMax<uint8_t><uint8_t, uint8_t, MinMax<uint8_t>, 16, "
4813                "6>::Transform()"
4814             << std::endl
4815             << std::flush;
4816 #endif
4817 #endif
4818   int params_count_copy = params.count;
4819   asm volatile(
4820 
4821       // MinMax::Prepare
4822       "dup v4.16b, %w[min]\n"
4823       "dup v5.16b, %w[max]\n"
4824 
4825       // Reduce count by leftovers.
4826       "subs %x[count], %x[count], #6\n"
4827       "beq 2f\n"
4828 
4829       "1:"
4830       "subs %x[count], %x[count], #16\n"
4831 
4832       // MinMax::Transform
4833       "ld1 {v0.4s}, [%x[input]], #16\n"
4834       "prfm pldl1keep, [%x[input], #16]\n"
4835       "umax v0.16b, v0.16b, v4.16b\n"
4836       "umin v0.16b, v0.16b, v5.16b\n"
4837 
4838       "st1 {v0.4s}, [%x[output]], #16\n"
4839       "prfm pldl1keep, [%x[output]]\n"
4840 
4841       "bne 1b\n"
4842       "2:"
4843 
4844       // Handle leftovers.
4845 
4846       // MinMax::Transform
4847       "ld1 {v0.s}[0], [%x[input]], #4\n"
4848       "ld1 {v0.h}[2], [%x[input]], #2\n"
4849       "prfm pldl1keep, [%x[input], #16]\n"
4850       "umax v0.16b, v0.16b, v4.16b\n"
4851       "umin v0.16b, v0.16b, v5.16b\n"
4852 
4853       "st1 {v0.s}[0], [%x[output]], #4\n"
4854       "st1 {v0.h}[2], [%x[output]], #2\n"
4855       "prfm pldl1keep, [%x[output]]\n"
4856       : [count] "+r"(params_count_copy), [input] "+r"(input),
4857         [output] "+r"(output)
4858       : [max] "r"(params.max), [min] "r"(params.min)
4859       : "v0", "v4", "v5", "cc", "memory");
4860 }
4861 
4862 template <>
4863 inline void Transform1DKernel<uint8_t, uint8_t, MinMax<uint8_t>, 16,
Transform(const uint8_t * input,const MinMax<uint8_t> & params,uint8_t * output)4864                               7>::Transform(const uint8_t* input,
4865                                             const MinMax<uint8_t>& params,
4866                                             uint8_t* output) {
4867 #ifdef DEBUG
4868 #ifdef DEBUG_METAGEMM_VERBOSE
4869   std::cout << __FILE__ << "(" << __LINE__
4870             << ") MinMax<uint8_t><uint8_t, uint8_t, MinMax<uint8_t>, 16, "
4871                "7>::Transform()"
4872             << std::endl
4873             << std::flush;
4874 #endif
4875 #endif
4876   int params_count_copy = params.count;
4877   asm volatile(
4878 
4879       // MinMax::Prepare
4880       "dup v4.16b, %w[min]\n"
4881       "dup v5.16b, %w[max]\n"
4882 
4883       // Reduce count by leftovers.
4884       "subs %x[count], %x[count], #7\n"
4885       "beq 2f\n"
4886 
4887       "1:"
4888       "subs %x[count], %x[count], #16\n"
4889 
4890       // MinMax::Transform
4891       "ld1 {v0.4s}, [%x[input]], #16\n"
4892       "prfm pldl1keep, [%x[input], #16]\n"
4893       "umax v0.16b, v0.16b, v4.16b\n"
4894       "umin v0.16b, v0.16b, v5.16b\n"
4895 
4896       "st1 {v0.4s}, [%x[output]], #16\n"
4897       "prfm pldl1keep, [%x[output]]\n"
4898 
4899       "bne 1b\n"
4900       "2:"
4901 
4902       // Handle leftovers.
4903 
4904       // MinMax::Transform
4905       "ld1 {v0.s}[0], [%x[input]], #4\n"
4906       "ld1 {v0.h}[2], [%x[input]], #2\n"
4907       "ld1 {v0.b}[6], [%x[input]], #1\n"
4908       "prfm pldl1keep, [%x[input], #16]\n"
4909       "umax v0.16b, v0.16b, v4.16b\n"
4910       "umin v0.16b, v0.16b, v5.16b\n"
4911 
4912       "st1 {v0.s}[0], [%x[output]], #4\n"
4913       "st1 {v0.h}[2], [%x[output]], #2\n"
4914       "st1 {v0.b}[6], [%x[output]], #1\n"
4915       "prfm pldl1keep, [%x[output]]\n"
4916       : [count] "+r"(params_count_copy), [input] "+r"(input),
4917         [output] "+r"(output)
4918       : [max] "r"(params.max), [min] "r"(params.min)
4919       : "v0", "v4", "v5", "cc", "memory");
4920 }
4921 
4922 template <>
4923 inline void Transform1DKernel<uint8_t, uint8_t, MinMax<uint8_t>, 16,
Transform(const uint8_t * input,const MinMax<uint8_t> & params,uint8_t * output)4924                               8>::Transform(const uint8_t* input,
4925                                             const MinMax<uint8_t>& params,
4926                                             uint8_t* output) {
4927 #ifdef DEBUG
4928 #ifdef DEBUG_METAGEMM_VERBOSE
4929   std::cout << __FILE__ << "(" << __LINE__
4930             << ") MinMax<uint8_t><uint8_t, uint8_t, MinMax<uint8_t>, 16, "
4931                "8>::Transform()"
4932             << std::endl
4933             << std::flush;
4934 #endif
4935 #endif
4936   int params_count_copy = params.count;
4937   asm volatile(
4938 
4939       // MinMax::Prepare
4940       "dup v4.16b, %w[min]\n"
4941       "dup v5.16b, %w[max]\n"
4942 
4943       // Reduce count by leftovers.
4944       "subs %x[count], %x[count], #8\n"
4945       "beq 2f\n"
4946 
4947       "1:"
4948       "subs %x[count], %x[count], #16\n"
4949 
4950       // MinMax::Transform
4951       "ld1 {v0.4s}, [%x[input]], #16\n"
4952       "prfm pldl1keep, [%x[input], #16]\n"
4953       "umax v0.16b, v0.16b, v4.16b\n"
4954       "umin v0.16b, v0.16b, v5.16b\n"
4955 
4956       "st1 {v0.4s}, [%x[output]], #16\n"
4957       "prfm pldl1keep, [%x[output]]\n"
4958 
4959       "bne 1b\n"
4960       "2:"
4961 
4962       // Handle leftovers.
4963 
4964       // MinMax::Transform
4965       "ld1 {v0.2s}, [%x[input]], #8\n"
4966       "prfm pldl1keep, [%x[input], #16]\n"
4967       "umax v0.16b, v0.16b, v4.16b\n"
4968       "umin v0.16b, v0.16b, v5.16b\n"
4969 
4970       "st1 {v0.2s}, [%x[output]], #8\n"
4971       "prfm pldl1keep, [%x[output]]\n"
4972       : [count] "+r"(params_count_copy), [input] "+r"(input),
4973         [output] "+r"(output)
4974       : [max] "r"(params.max), [min] "r"(params.min)
4975       : "v0", "v4", "v5", "cc", "memory");
4976 }
4977 
4978 template <>
4979 inline void Transform1DKernel<uint8_t, uint8_t, MinMax<uint8_t>, 16,
Transform(const uint8_t * input,const MinMax<uint8_t> & params,uint8_t * output)4980                               9>::Transform(const uint8_t* input,
4981                                             const MinMax<uint8_t>& params,
4982                                             uint8_t* output) {
4983 #ifdef DEBUG
4984 #ifdef DEBUG_METAGEMM_VERBOSE
4985   std::cout << __FILE__ << "(" << __LINE__
4986             << ") MinMax<uint8_t><uint8_t, uint8_t, MinMax<uint8_t>, 16, "
4987                "9>::Transform()"
4988             << std::endl
4989             << std::flush;
4990 #endif
4991 #endif
4992   int params_count_copy = params.count;
4993   asm volatile(
4994 
4995       // MinMax::Prepare
4996       "dup v4.16b, %w[min]\n"
4997       "dup v5.16b, %w[max]\n"
4998 
4999       // Reduce count by leftovers.
5000       "subs %x[count], %x[count], #9\n"
5001       "beq 2f\n"
5002 
5003       "1:"
5004       "subs %x[count], %x[count], #16\n"
5005 
5006       // MinMax::Transform
5007       "ld1 {v0.4s}, [%x[input]], #16\n"
5008       "prfm pldl1keep, [%x[input], #16]\n"
5009       "umax v0.16b, v0.16b, v4.16b\n"
5010       "umin v0.16b, v0.16b, v5.16b\n"
5011 
5012       "st1 {v0.4s}, [%x[output]], #16\n"
5013       "prfm pldl1keep, [%x[output]]\n"
5014 
5015       "bne 1b\n"
5016       "2:"
5017 
5018       // Handle leftovers.
5019 
5020       // MinMax::Transform
5021       "ld1 {v0.2s}, [%x[input]], #8\n"
5022       "ld1 {v0.b}[8], [%x[input]], #1\n"
5023       "prfm pldl1keep, [%x[input], #16]\n"
5024       "umax v0.16b, v0.16b, v4.16b\n"
5025       "umin v0.16b, v0.16b, v5.16b\n"
5026 
5027       "st1 {v0.2s}, [%x[output]], #8\n"
5028       "st1 {v0.b}[8], [%x[output]], #1\n"
5029       "prfm pldl1keep, [%x[output]]\n"
5030       : [count] "+r"(params_count_copy), [input] "+r"(input),
5031         [output] "+r"(output)
5032       : [max] "r"(params.max), [min] "r"(params.min)
5033       : "v0", "v4", "v5", "cc", "memory");
5034 }
5035 
5036 template <>
5037 inline void Transform1DKernel<uint8_t, uint8_t, MinMax<uint8_t>, 16,
Transform(const uint8_t * input,const MinMax<uint8_t> & params,uint8_t * output)5038                               10>::Transform(const uint8_t* input,
5039                                              const MinMax<uint8_t>& params,
5040                                              uint8_t* output) {
5041 #ifdef DEBUG
5042 #ifdef DEBUG_METAGEMM_VERBOSE
5043   std::cout << __FILE__ << "(" << __LINE__
5044             << ") MinMax<uint8_t><uint8_t, uint8_t, MinMax<uint8_t>, 16, "
5045                "10>::Transform()"
5046             << std::endl
5047             << std::flush;
5048 #endif
5049 #endif
5050   int params_count_copy = params.count;
5051   asm volatile(
5052 
5053       // MinMax::Prepare
5054       "dup v4.16b, %w[min]\n"
5055       "dup v5.16b, %w[max]\n"
5056 
5057       // Reduce count by leftovers.
5058       "subs %x[count], %x[count], #10\n"
5059       "beq 2f\n"
5060 
5061       "1:"
5062       "subs %x[count], %x[count], #16\n"
5063 
5064       // MinMax::Transform
5065       "ld1 {v0.4s}, [%x[input]], #16\n"
5066       "prfm pldl1keep, [%x[input], #16]\n"
5067       "umax v0.16b, v0.16b, v4.16b\n"
5068       "umin v0.16b, v0.16b, v5.16b\n"
5069 
5070       "st1 {v0.4s}, [%x[output]], #16\n"
5071       "prfm pldl1keep, [%x[output]]\n"
5072 
5073       "bne 1b\n"
5074       "2:"
5075 
5076       // Handle leftovers.
5077 
5078       // MinMax::Transform
5079       "ld1 {v0.2s}, [%x[input]], #8\n"
5080       "ld1 {v0.h}[4], [%x[input]], #2\n"
5081       "prfm pldl1keep, [%x[input], #16]\n"
5082       "umax v0.16b, v0.16b, v4.16b\n"
5083       "umin v0.16b, v0.16b, v5.16b\n"
5084 
5085       "st1 {v0.2s}, [%x[output]], #8\n"
5086       "st1 {v0.h}[4], [%x[output]], #2\n"
5087       "prfm pldl1keep, [%x[output]]\n"
5088       : [count] "+r"(params_count_copy), [input] "+r"(input),
5089         [output] "+r"(output)
5090       : [max] "r"(params.max), [min] "r"(params.min)
5091       : "v0", "v4", "v5", "cc", "memory");
5092 }
5093 
5094 template <>
5095 inline void Transform1DKernel<uint8_t, uint8_t, MinMax<uint8_t>, 16,
Transform(const uint8_t * input,const MinMax<uint8_t> & params,uint8_t * output)5096                               11>::Transform(const uint8_t* input,
5097                                              const MinMax<uint8_t>& params,
5098                                              uint8_t* output) {
5099 #ifdef DEBUG
5100 #ifdef DEBUG_METAGEMM_VERBOSE
5101   std::cout << __FILE__ << "(" << __LINE__
5102             << ") MinMax<uint8_t><uint8_t, uint8_t, MinMax<uint8_t>, 16, "
5103                "11>::Transform()"
5104             << std::endl
5105             << std::flush;
5106 #endif
5107 #endif
5108   int params_count_copy = params.count;
5109   asm volatile(
5110 
5111       // MinMax::Prepare
5112       "dup v4.16b, %w[min]\n"
5113       "dup v5.16b, %w[max]\n"
5114 
5115       // Reduce count by leftovers.
5116       "subs %x[count], %x[count], #11\n"
5117       "beq 2f\n"
5118 
5119       "1:"
5120       "subs %x[count], %x[count], #16\n"
5121 
5122       // MinMax::Transform
5123       "ld1 {v0.4s}, [%x[input]], #16\n"
5124       "prfm pldl1keep, [%x[input], #16]\n"
5125       "umax v0.16b, v0.16b, v4.16b\n"
5126       "umin v0.16b, v0.16b, v5.16b\n"
5127 
5128       "st1 {v0.4s}, [%x[output]], #16\n"
5129       "prfm pldl1keep, [%x[output]]\n"
5130 
5131       "bne 1b\n"
5132       "2:"
5133 
5134       // Handle leftovers.
5135 
5136       // MinMax::Transform
5137       "ld1 {v0.2s}, [%x[input]], #8\n"
5138       "ld1 {v0.h}[4], [%x[input]], #2\n"
5139       "ld1 {v0.b}[10], [%x[input]], #1\n"
5140       "prfm pldl1keep, [%x[input], #16]\n"
5141       "umax v0.16b, v0.16b, v4.16b\n"
5142       "umin v0.16b, v0.16b, v5.16b\n"
5143 
5144       "st1 {v0.2s}, [%x[output]], #8\n"
5145       "st1 {v0.h}[4], [%x[output]], #2\n"
5146       "st1 {v0.b}[10], [%x[output]], #1\n"
5147       "prfm pldl1keep, [%x[output]]\n"
5148       : [count] "+r"(params_count_copy), [input] "+r"(input),
5149         [output] "+r"(output)
5150       : [max] "r"(params.max), [min] "r"(params.min)
5151       : "v0", "v4", "v5", "cc", "memory");
5152 }
5153 
5154 template <>
5155 inline void Transform1DKernel<uint8_t, uint8_t, MinMax<uint8_t>, 16,
Transform(const uint8_t * input,const MinMax<uint8_t> & params,uint8_t * output)5156                               12>::Transform(const uint8_t* input,
5157                                              const MinMax<uint8_t>& params,
5158                                              uint8_t* output) {
5159 #ifdef DEBUG
5160 #ifdef DEBUG_METAGEMM_VERBOSE
5161   std::cout << __FILE__ << "(" << __LINE__
5162             << ") MinMax<uint8_t><uint8_t, uint8_t, MinMax<uint8_t>, 16, "
5163                "12>::Transform()"
5164             << std::endl
5165             << std::flush;
5166 #endif
5167 #endif
5168   int params_count_copy = params.count;
5169   asm volatile(
5170 
5171       // MinMax::Prepare
5172       "dup v4.16b, %w[min]\n"
5173       "dup v5.16b, %w[max]\n"
5174 
5175       // Reduce count by leftovers.
5176       "subs %x[count], %x[count], #12\n"
5177       "beq 2f\n"
5178 
5179       "1:"
5180       "subs %x[count], %x[count], #16\n"
5181 
5182       // MinMax::Transform
5183       "ld1 {v0.4s}, [%x[input]], #16\n"
5184       "prfm pldl1keep, [%x[input], #16]\n"
5185       "umax v0.16b, v0.16b, v4.16b\n"
5186       "umin v0.16b, v0.16b, v5.16b\n"
5187 
5188       "st1 {v0.4s}, [%x[output]], #16\n"
5189       "prfm pldl1keep, [%x[output]]\n"
5190 
5191       "bne 1b\n"
5192       "2:"
5193 
5194       // Handle leftovers.
5195 
5196       // MinMax::Transform
5197       "ld1 {v0.2s}, [%x[input]], #8\n"
5198       "ld1 {v0.s}[2], [%x[input]], #4\n"
5199       "prfm pldl1keep, [%x[input], #16]\n"
5200       "umax v0.16b, v0.16b, v4.16b\n"
5201       "umin v0.16b, v0.16b, v5.16b\n"
5202 
5203       "st1 {v0.2s}, [%x[output]], #8\n"
5204       "st1 {v0.s}[2], [%x[output]], #4\n"
5205       "prfm pldl1keep, [%x[output]]\n"
5206       : [count] "+r"(params_count_copy), [input] "+r"(input),
5207         [output] "+r"(output)
5208       : [max] "r"(params.max), [min] "r"(params.min)
5209       : "v0", "v4", "v5", "cc", "memory");
5210 }
5211 
5212 template <>
5213 inline void Transform1DKernel<uint8_t, uint8_t, MinMax<uint8_t>, 16,
Transform(const uint8_t * input,const MinMax<uint8_t> & params,uint8_t * output)5214                               13>::Transform(const uint8_t* input,
5215                                              const MinMax<uint8_t>& params,
5216                                              uint8_t* output) {
5217 #ifdef DEBUG
5218 #ifdef DEBUG_METAGEMM_VERBOSE
5219   std::cout << __FILE__ << "(" << __LINE__
5220             << ") MinMax<uint8_t><uint8_t, uint8_t, MinMax<uint8_t>, 16, "
5221                "13>::Transform()"
5222             << std::endl
5223             << std::flush;
5224 #endif
5225 #endif
5226   int params_count_copy = params.count;
5227   asm volatile(
5228 
5229       // MinMax::Prepare
5230       "dup v4.16b, %w[min]\n"
5231       "dup v5.16b, %w[max]\n"
5232 
5233       // Reduce count by leftovers.
5234       "subs %x[count], %x[count], #13\n"
5235       "beq 2f\n"
5236 
5237       "1:"
5238       "subs %x[count], %x[count], #16\n"
5239 
5240       // MinMax::Transform
5241       "ld1 {v0.4s}, [%x[input]], #16\n"
5242       "prfm pldl1keep, [%x[input], #16]\n"
5243       "umax v0.16b, v0.16b, v4.16b\n"
5244       "umin v0.16b, v0.16b, v5.16b\n"
5245 
5246       "st1 {v0.4s}, [%x[output]], #16\n"
5247       "prfm pldl1keep, [%x[output]]\n"
5248 
5249       "bne 1b\n"
5250       "2:"
5251 
5252       // Handle leftovers.
5253 
5254       // MinMax::Transform
5255       "ld1 {v0.2s}, [%x[input]], #8\n"
5256       "ld1 {v0.s}[2], [%x[input]], #4\n"
5257       "ld1 {v0.b}[12], [%x[input]], #1\n"
5258       "prfm pldl1keep, [%x[input], #16]\n"
5259       "umax v0.16b, v0.16b, v4.16b\n"
5260       "umin v0.16b, v0.16b, v5.16b\n"
5261 
5262       "st1 {v0.2s}, [%x[output]], #8\n"
5263       "st1 {v0.s}[2], [%x[output]], #4\n"
5264       "st1 {v0.b}[12], [%x[output]], #1\n"
5265       "prfm pldl1keep, [%x[output]]\n"
5266       : [count] "+r"(params_count_copy), [input] "+r"(input),
5267         [output] "+r"(output)
5268       : [max] "r"(params.max), [min] "r"(params.min)
5269       : "v0", "v4", "v5", "cc", "memory");
5270 }
5271 
5272 template <>
5273 inline void Transform1DKernel<uint8_t, uint8_t, MinMax<uint8_t>, 16,
Transform(const uint8_t * input,const MinMax<uint8_t> & params,uint8_t * output)5274                               14>::Transform(const uint8_t* input,
5275                                              const MinMax<uint8_t>& params,
5276                                              uint8_t* output) {
5277 #ifdef DEBUG
5278 #ifdef DEBUG_METAGEMM_VERBOSE
5279   std::cout << __FILE__ << "(" << __LINE__
5280             << ") MinMax<uint8_t><uint8_t, uint8_t, MinMax<uint8_t>, 16, "
5281                "14>::Transform()"
5282             << std::endl
5283             << std::flush;
5284 #endif
5285 #endif
5286   int params_count_copy = params.count;
5287   asm volatile(
5288 
5289       // MinMax::Prepare
5290       "dup v4.16b, %w[min]\n"
5291       "dup v5.16b, %w[max]\n"
5292 
5293       // Reduce count by leftovers.
5294       "subs %x[count], %x[count], #14\n"
5295       "beq 2f\n"
5296 
5297       "1:"
5298       "subs %x[count], %x[count], #16\n"
5299 
5300       // MinMax::Transform
5301       "ld1 {v0.4s}, [%x[input]], #16\n"
5302       "prfm pldl1keep, [%x[input], #16]\n"
5303       "umax v0.16b, v0.16b, v4.16b\n"
5304       "umin v0.16b, v0.16b, v5.16b\n"
5305 
5306       "st1 {v0.4s}, [%x[output]], #16\n"
5307       "prfm pldl1keep, [%x[output]]\n"
5308 
5309       "bne 1b\n"
5310       "2:"
5311 
5312       // Handle leftovers.
5313 
5314       // MinMax::Transform
5315       "ld1 {v0.2s}, [%x[input]], #8\n"
5316       "ld1 {v0.s}[2], [%x[input]], #4\n"
5317       "ld1 {v0.h}[6], [%x[input]], #2\n"
5318       "prfm pldl1keep, [%x[input], #16]\n"
5319       "umax v0.16b, v0.16b, v4.16b\n"
5320       "umin v0.16b, v0.16b, v5.16b\n"
5321 
5322       "st1 {v0.2s}, [%x[output]], #8\n"
5323       "st1 {v0.s}[2], [%x[output]], #4\n"
5324       "st1 {v0.h}[6], [%x[output]], #2\n"
5325       "prfm pldl1keep, [%x[output]]\n"
5326       : [count] "+r"(params_count_copy), [input] "+r"(input),
5327         [output] "+r"(output)
5328       : [max] "r"(params.max), [min] "r"(params.min)
5329       : "v0", "v4", "v5", "cc", "memory");
5330 }
5331 
5332 template <>
5333 inline void Transform1DKernel<uint8_t, uint8_t, MinMax<uint8_t>, 16,
Transform(const uint8_t * input,const MinMax<uint8_t> & params,uint8_t * output)5334                               15>::Transform(const uint8_t* input,
5335                                              const MinMax<uint8_t>& params,
5336                                              uint8_t* output) {
5337 #ifdef DEBUG
5338 #ifdef DEBUG_METAGEMM_VERBOSE
5339   std::cout << __FILE__ << "(" << __LINE__
5340             << ") MinMax<uint8_t><uint8_t, uint8_t, MinMax<uint8_t>, 16, "
5341                "15>::Transform()"
5342             << std::endl
5343             << std::flush;
5344 #endif
5345 #endif
5346   int params_count_copy = params.count;
5347   asm volatile(
5348 
5349       // MinMax::Prepare
5350       "dup v4.16b, %w[min]\n"
5351       "dup v5.16b, %w[max]\n"
5352 
5353       // Reduce count by leftovers.
5354       "subs %x[count], %x[count], #15\n"
5355       "beq 2f\n"
5356 
5357       "1:"
5358       "subs %x[count], %x[count], #16\n"
5359 
5360       // MinMax::Transform
5361       "ld1 {v0.4s}, [%x[input]], #16\n"
5362       "prfm pldl1keep, [%x[input], #16]\n"
5363       "umax v0.16b, v0.16b, v4.16b\n"
5364       "umin v0.16b, v0.16b, v5.16b\n"
5365 
5366       "st1 {v0.4s}, [%x[output]], #16\n"
5367       "prfm pldl1keep, [%x[output]]\n"
5368 
5369       "bne 1b\n"
5370       "2:"
5371 
5372       // Handle leftovers.
5373 
5374       // MinMax::Transform
5375       "ld1 {v0.2s}, [%x[input]], #8\n"
5376       "ld1 {v0.s}[2], [%x[input]], #4\n"
5377       "ld1 {v0.h}[6], [%x[input]], #2\n"
5378       "ld1 {v0.b}[14], [%x[input]], #1\n"
5379       "prfm pldl1keep, [%x[input], #16]\n"
5380       "umax v0.16b, v0.16b, v4.16b\n"
5381       "umin v0.16b, v0.16b, v5.16b\n"
5382 
5383       "st1 {v0.2s}, [%x[output]], #8\n"
5384       "st1 {v0.s}[2], [%x[output]], #4\n"
5385       "st1 {v0.h}[6], [%x[output]], #2\n"
5386       "st1 {v0.b}[14], [%x[output]], #1\n"
5387       "prfm pldl1keep, [%x[output]]\n"
5388       : [count] "+r"(params_count_copy), [input] "+r"(input),
5389         [output] "+r"(output)
5390       : [max] "r"(params.max), [min] "r"(params.min)
5391       : "v0", "v4", "v5", "cc", "memory");
5392 }
5393 
5394 template <>
5395 inline void Transform1DKernel<uint8_t, int32_t, BiasAdd<uint8_t>, 16,
Transform(const uint8_t * input,const BiasAdd<uint8_t> & params,int32_t * output)5396                               0>::Transform(const uint8_t* input,
5397                                             const BiasAdd<uint8_t>& params,
5398                                             int32_t* output) {
5399 #ifdef DEBUG
5400 #ifdef DEBUG_METAGEMM_VERBOSE
5401   std::cout << __FILE__ << "(" << __LINE__
5402             << ") BiasAdd<uint8_t><uint8_t, int32_t, BiasAdd<uint8_t>, 16, "
5403                "0>::Transform()"
5404             << std::endl
5405             << std::flush;
5406 #endif
5407 #endif
5408   int params_rows_copy = params.rows;
5409   asm volatile(
5410       "ldr w0, %[input_range_min]\n"
5411       "dup v8.4s, w0\n"
5412       "ldr w0, %[input_range_scale]\n"
5413       "dup v9.4s, w0\n"
5414       "ldr w0, %[bias_range_min]\n"
5415       "dup v10.4s, w0\n"
5416       "ldr w0, %[bias_range_scale]\n"
5417       "dup v11.4s, w0\n"
5418       "ldr w0, %[output_range_min]\n"
5419       "dup v12.4s, w0\n"
5420       "ldr w0, %[one_over_output_range_scale]\n"
5421       "dup v13.4s, w0\n"
5422       "ldr w0, %[output_range_offset]\n"
5423       "dup v14.4s, w0\n"
5424       "1:"
5425       "mov x0, %x[count]\n"
5426       "mov x1, %x[bias]\n"
5427       "2:"
5428       "subs x0, x0, #16\n"
5429 
5430       // BiasAdd::Transform
5431       "ld1 {v0.4s}, [%x[input]], #16\n"
5432       "ld1 {v4.4s}, [x1], #16\n"
5433       "prfm pldl1keep, [%x[input], #32]\n"
5434       "uxtl2 v1.8h, v0.16b\n"
5435       "uxtl v0.8h, v0.8b\n"
5436       "uxtl2 v5.8h, v4.16b\n"
5437       "uxtl v4.8h, v4.8b\n"
5438       "sxtl2 v3.4s, v1.8h\n"
5439       "sxtl v2.4s, v1.4h\n"
5440       "sxtl2 v7.4s, v5.8h\n"
5441       "sxtl v6.4s, v5.4h\n"
5442       "sxtl2 v1.4s, v0.8h\n"
5443       "sxtl v0.4s, v0.4h\n"
5444       "sxtl2 v5.4s, v4.8h\n"
5445       "sxtl v4.4s, v4.4h\n"
5446       "scvtf v0.4s, v0.4s\n"
5447       "scvtf v1.4s, v1.4s\n"
5448       "scvtf v2.4s, v2.4s\n"
5449       "scvtf v3.4s, v3.4s\n"
5450       "scvtf v4.4s, v4.4s\n"
5451       "scvtf v5.4s, v5.4s\n"
5452       "scvtf v6.4s, v6.4s\n"
5453       "scvtf v7.4s, v7.4s\n"
5454       "fmul v0.4s, v0.4s, v9.4s\n"
5455       "fmul v1.4s, v1.4s, v9.4s\n"
5456       "fmul v2.4s, v2.4s, v9.4s\n"
5457       "fmul v3.4s, v3.4s, v9.4s\n"
5458       "fmul v4.4s, v4.4s, v11.4s\n"
5459       "fmul v5.4s, v5.4s, v11.4s\n"
5460       "fmul v6.4s, v6.4s, v11.4s\n"
5461       "fmul v7.4s, v7.4s, v11.4s\n"
5462       "fadd v0.4s, v0.4s, v8.4s\n"
5463       "fadd v1.4s, v1.4s, v8.4s\n"
5464       "fadd v2.4s, v2.4s, v8.4s\n"
5465       "fadd v3.4s, v3.4s, v8.4s\n"
5466       "fadd v4.4s, v4.4s, v10.4s\n"
5467       "fadd v5.4s, v5.4s, v10.4s\n"
5468       "fadd v6.4s, v6.4s, v10.4s\n"
5469       "fadd v7.4s, v7.4s, v10.4s\n"
5470       "fadd v0.4s, v0.4s, v4.4s\n"
5471       "fadd v1.4s, v1.4s, v5.4s\n"
5472       "fadd v2.4s, v2.4s, v6.4s\n"
5473       "fadd v3.4s, v3.4s, v7.4s\n"
5474       "fsub v0.4s, v0.4s, v12.4s\n"
5475       "fsub v1.4s, v1.4s, v12.4s\n"
5476       "fsub v2.4s, v2.4s, v12.4s\n"
5477       "fsub v3.4s, v3.4s, v12.4s\n"
5478       "fmul v0.4s, v0.4s, v13.4s\n"
5479       "fmul v1.4s, v1.4s, v13.4s\n"
5480       "fmul v2.4s, v2.4s, v13.4s\n"
5481       "fmul v3.4s, v3.4s, v13.4s\n"
5482       "fadd v0.4s, v0.4s, v14.4s\n"
5483       "fadd v1.4s, v1.4s, v14.4s\n"
5484       "fadd v2.4s, v2.4s, v14.4s\n"
5485       "fadd v3.4s, v3.4s, v14.4s\n"
5486       "fcvtzs v0.4s, v0.4s\n"
5487       "fcvtzs v1.4s, v1.4s\n"
5488       "fcvtzs v2.4s, v2.4s\n"
5489       "fcvtzs v3.4s, v3.4s\n"
5490 
5491       "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
5492       "prfm pldl1keep, [%x[output]]\n"
5493       "bne 2b\n"
5494       "subs %x[rows], %x[rows], #1\n"
5495       "bne 1b\n"
5496       : [input] "+r"(input), [output] "+r"(output)
5497       : [count] "r"(params.count), [rows] "r"(params_rows_copy),
5498         [output_range_offset] "m"(params.output_range_offset),
5499         [input_range_scale] "m"(params.input_range_scale),
5500         [one_over_output_range_scale] "m"(params.one_over_output_range_scale),
5501         [bias_range_min] "m"(params.bias_range_min),
5502         [output_range_min] "m"(params.output_range_min),
5503         [bias_range_scale] "m"(params.bias_range_scale),
5504         [bias] "r"(params.bias), [input_range_min] "m"(params.input_range_min)
5505       : "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
5506         "v10", "v11", "v12", "v13", "v14", "cc", "memory");
5507 }
5508 
5509 template <>
5510 inline void Transform1DKernel<uint8_t, int32_t, BiasAdd<uint8_t>, 16,
Transform(const uint8_t * input,const BiasAdd<uint8_t> & params,int32_t * output)5511                               1>::Transform(const uint8_t* input,
5512                                             const BiasAdd<uint8_t>& params,
5513                                             int32_t* output) {
5514 #ifdef DEBUG
5515 #ifdef DEBUG_METAGEMM_VERBOSE
5516   std::cout << __FILE__ << "(" << __LINE__
5517             << ") BiasAdd<uint8_t><uint8_t, int32_t, BiasAdd<uint8_t>, 16, "
5518                "1>::Transform()"
5519             << std::endl
5520             << std::flush;
5521 #endif
5522 #endif
5523   int params_rows_copy = params.rows;
5524   asm volatile(
5525       "ldr w0, %[input_range_min]\n"
5526       "dup v8.4s, w0\n"
5527       "ldr w0, %[input_range_scale]\n"
5528       "dup v9.4s, w0\n"
5529       "ldr w0, %[bias_range_min]\n"
5530       "dup v10.4s, w0\n"
5531       "ldr w0, %[bias_range_scale]\n"
5532       "dup v11.4s, w0\n"
5533       "ldr w0, %[output_range_min]\n"
5534       "dup v12.4s, w0\n"
5535       "ldr w0, %[one_over_output_range_scale]\n"
5536       "dup v13.4s, w0\n"
5537       "ldr w0, %[output_range_offset]\n"
5538       "dup v14.4s, w0\n"
5539       "1:"
5540       "mov x0, %x[count]\n"
5541       "mov x1, %x[bias]\n"
5542       "subs x0, x0, #1\n"
5543       "beq 3f\n"
5544       "2:"
5545       "subs x0, x0, #16\n"
5546 
5547       // BiasAdd::Transform
5548       "ld1 {v0.4s}, [%x[input]], #16\n"
5549       "ld1 {v4.4s}, [x1], #16\n"
5550       "prfm pldl1keep, [%x[input], #32]\n"
5551       "uxtl2 v1.8h, v0.16b\n"
5552       "uxtl v0.8h, v0.8b\n"
5553       "uxtl2 v5.8h, v4.16b\n"
5554       "uxtl v4.8h, v4.8b\n"
5555       "sxtl2 v3.4s, v1.8h\n"
5556       "sxtl v2.4s, v1.4h\n"
5557       "sxtl2 v7.4s, v5.8h\n"
5558       "sxtl v6.4s, v5.4h\n"
5559       "sxtl2 v1.4s, v0.8h\n"
5560       "sxtl v0.4s, v0.4h\n"
5561       "sxtl2 v5.4s, v4.8h\n"
5562       "sxtl v4.4s, v4.4h\n"
5563       "scvtf v0.4s, v0.4s\n"
5564       "scvtf v1.4s, v1.4s\n"
5565       "scvtf v2.4s, v2.4s\n"
5566       "scvtf v3.4s, v3.4s\n"
5567       "scvtf v4.4s, v4.4s\n"
5568       "scvtf v5.4s, v5.4s\n"
5569       "scvtf v6.4s, v6.4s\n"
5570       "scvtf v7.4s, v7.4s\n"
5571       "fmul v0.4s, v0.4s, v9.4s\n"
5572       "fmul v1.4s, v1.4s, v9.4s\n"
5573       "fmul v2.4s, v2.4s, v9.4s\n"
5574       "fmul v3.4s, v3.4s, v9.4s\n"
5575       "fmul v4.4s, v4.4s, v11.4s\n"
5576       "fmul v5.4s, v5.4s, v11.4s\n"
5577       "fmul v6.4s, v6.4s, v11.4s\n"
5578       "fmul v7.4s, v7.4s, v11.4s\n"
5579       "fadd v0.4s, v0.4s, v8.4s\n"
5580       "fadd v1.4s, v1.4s, v8.4s\n"
5581       "fadd v2.4s, v2.4s, v8.4s\n"
5582       "fadd v3.4s, v3.4s, v8.4s\n"
5583       "fadd v4.4s, v4.4s, v10.4s\n"
5584       "fadd v5.4s, v5.4s, v10.4s\n"
5585       "fadd v6.4s, v6.4s, v10.4s\n"
5586       "fadd v7.4s, v7.4s, v10.4s\n"
5587       "fadd v0.4s, v0.4s, v4.4s\n"
5588       "fadd v1.4s, v1.4s, v5.4s\n"
5589       "fadd v2.4s, v2.4s, v6.4s\n"
5590       "fadd v3.4s, v3.4s, v7.4s\n"
5591       "fsub v0.4s, v0.4s, v12.4s\n"
5592       "fsub v1.4s, v1.4s, v12.4s\n"
5593       "fsub v2.4s, v2.4s, v12.4s\n"
5594       "fsub v3.4s, v3.4s, v12.4s\n"
5595       "fmul v0.4s, v0.4s, v13.4s\n"
5596       "fmul v1.4s, v1.4s, v13.4s\n"
5597       "fmul v2.4s, v2.4s, v13.4s\n"
5598       "fmul v3.4s, v3.4s, v13.4s\n"
5599       "fadd v0.4s, v0.4s, v14.4s\n"
5600       "fadd v1.4s, v1.4s, v14.4s\n"
5601       "fadd v2.4s, v2.4s, v14.4s\n"
5602       "fadd v3.4s, v3.4s, v14.4s\n"
5603       "fcvtzs v0.4s, v0.4s\n"
5604       "fcvtzs v1.4s, v1.4s\n"
5605       "fcvtzs v2.4s, v2.4s\n"
5606       "fcvtzs v3.4s, v3.4s\n"
5607 
5608       "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
5609       "prfm pldl1keep, [%x[output]]\n"
5610       "bne 2b\n"
5611       "3:"
5612 
5613       // BiasAdd::Transform
5614       "ld1 {v0.b}[0], [%x[input]], #1\n"
5615       "ld1 {v1.b}[0], [x1], #1\n"
5616       "prfm pldl1keep, [%x[input], #32]\n"
5617       "uxtl v0.8h, v0.8b\n"
5618       "uxtl v1.8h, v1.8b\n"
5619       "sxtl v0.4s, v0.4h\n"
5620       "sxtl v1.4s, v1.4h\n"
5621       "scvtf v0.4s, v0.4s\n"
5622       "scvtf v1.4s, v1.4s\n"
5623       "fmul v0.4s, v0.4s, v9.4s\n"
5624       "fmul v1.4s, v1.4s, v11.4s\n"
5625       "fadd v0.4s, v0.4s, v8.4s\n"
5626       "fadd v1.4s, v1.4s, v10.4s\n"
5627       "fadd v0.4s, v0.4s, v1.4s\n"
5628       "fsub v0.4s, v0.4s, v12.4s\n"
5629       "fmul v0.4s, v0.4s, v13.4s\n"
5630       "fadd v0.4s, v0.4s, v14.4s\n"
5631       "fcvtzs v0.4s, v0.4s\n"
5632 
5633       "st1 {v0.s}[0], [%x[output]], #4\n"
5634       "prfm pldl1keep, [%x[output]]\n"
5635       "subs %x[rows], %x[rows], #1\n"
5636       "bne 1b\n"
5637       : [input] "+r"(input), [output] "+r"(output)
5638       : [count] "r"(params.count), [rows] "r"(params_rows_copy),
5639         [output_range_offset] "m"(params.output_range_offset),
5640         [input_range_scale] "m"(params.input_range_scale),
5641         [one_over_output_range_scale] "m"(params.one_over_output_range_scale),
5642         [bias_range_min] "m"(params.bias_range_min),
5643         [output_range_min] "m"(params.output_range_min),
5644         [bias_range_scale] "m"(params.bias_range_scale),
5645         [bias] "r"(params.bias), [input_range_min] "m"(params.input_range_min)
5646       : "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
5647         "v10", "v11", "v12", "v13", "v14", "cc", "memory");
5648 }
5649 
5650 template <>
5651 inline void Transform1DKernel<uint8_t, int32_t, BiasAdd<uint8_t>, 16,
Transform(const uint8_t * input,const BiasAdd<uint8_t> & params,int32_t * output)5652                               2>::Transform(const uint8_t* input,
5653                                             const BiasAdd<uint8_t>& params,
5654                                             int32_t* output) {
5655 #ifdef DEBUG
5656 #ifdef DEBUG_METAGEMM_VERBOSE
5657   std::cout << __FILE__ << "(" << __LINE__
5658             << ") BiasAdd<uint8_t><uint8_t, int32_t, BiasAdd<uint8_t>, 16, "
5659                "2>::Transform()"
5660             << std::endl
5661             << std::flush;
5662 #endif
5663 #endif
5664   int params_rows_copy = params.rows;
5665   asm volatile(
5666       "ldr w0, %[input_range_min]\n"
5667       "dup v8.4s, w0\n"
5668       "ldr w0, %[input_range_scale]\n"
5669       "dup v9.4s, w0\n"
5670       "ldr w0, %[bias_range_min]\n"
5671       "dup v10.4s, w0\n"
5672       "ldr w0, %[bias_range_scale]\n"
5673       "dup v11.4s, w0\n"
5674       "ldr w0, %[output_range_min]\n"
5675       "dup v12.4s, w0\n"
5676       "ldr w0, %[one_over_output_range_scale]\n"
5677       "dup v13.4s, w0\n"
5678       "ldr w0, %[output_range_offset]\n"
5679       "dup v14.4s, w0\n"
5680       "1:"
5681       "mov x0, %x[count]\n"
5682       "mov x1, %x[bias]\n"
5683       "subs x0, x0, #2\n"
5684       "beq 3f\n"
5685       "2:"
5686       "subs x0, x0, #16\n"
5687 
5688       // BiasAdd::Transform
5689       "ld1 {v0.4s}, [%x[input]], #16\n"
5690       "ld1 {v4.4s}, [x1], #16\n"
5691       "prfm pldl1keep, [%x[input], #32]\n"
5692       "uxtl2 v1.8h, v0.16b\n"
5693       "uxtl v0.8h, v0.8b\n"
5694       "uxtl2 v5.8h, v4.16b\n"
5695       "uxtl v4.8h, v4.8b\n"
5696       "sxtl2 v3.4s, v1.8h\n"
5697       "sxtl v2.4s, v1.4h\n"
5698       "sxtl2 v7.4s, v5.8h\n"
5699       "sxtl v6.4s, v5.4h\n"
5700       "sxtl2 v1.4s, v0.8h\n"
5701       "sxtl v0.4s, v0.4h\n"
5702       "sxtl2 v5.4s, v4.8h\n"
5703       "sxtl v4.4s, v4.4h\n"
5704       "scvtf v0.4s, v0.4s\n"
5705       "scvtf v1.4s, v1.4s\n"
5706       "scvtf v2.4s, v2.4s\n"
5707       "scvtf v3.4s, v3.4s\n"
5708       "scvtf v4.4s, v4.4s\n"
5709       "scvtf v5.4s, v5.4s\n"
5710       "scvtf v6.4s, v6.4s\n"
5711       "scvtf v7.4s, v7.4s\n"
5712       "fmul v0.4s, v0.4s, v9.4s\n"
5713       "fmul v1.4s, v1.4s, v9.4s\n"
5714       "fmul v2.4s, v2.4s, v9.4s\n"
5715       "fmul v3.4s, v3.4s, v9.4s\n"
5716       "fmul v4.4s, v4.4s, v11.4s\n"
5717       "fmul v5.4s, v5.4s, v11.4s\n"
5718       "fmul v6.4s, v6.4s, v11.4s\n"
5719       "fmul v7.4s, v7.4s, v11.4s\n"
5720       "fadd v0.4s, v0.4s, v8.4s\n"
5721       "fadd v1.4s, v1.4s, v8.4s\n"
5722       "fadd v2.4s, v2.4s, v8.4s\n"
5723       "fadd v3.4s, v3.4s, v8.4s\n"
5724       "fadd v4.4s, v4.4s, v10.4s\n"
5725       "fadd v5.4s, v5.4s, v10.4s\n"
5726       "fadd v6.4s, v6.4s, v10.4s\n"
5727       "fadd v7.4s, v7.4s, v10.4s\n"
5728       "fadd v0.4s, v0.4s, v4.4s\n"
5729       "fadd v1.4s, v1.4s, v5.4s\n"
5730       "fadd v2.4s, v2.4s, v6.4s\n"
5731       "fadd v3.4s, v3.4s, v7.4s\n"
5732       "fsub v0.4s, v0.4s, v12.4s\n"
5733       "fsub v1.4s, v1.4s, v12.4s\n"
5734       "fsub v2.4s, v2.4s, v12.4s\n"
5735       "fsub v3.4s, v3.4s, v12.4s\n"
5736       "fmul v0.4s, v0.4s, v13.4s\n"
5737       "fmul v1.4s, v1.4s, v13.4s\n"
5738       "fmul v2.4s, v2.4s, v13.4s\n"
5739       "fmul v3.4s, v3.4s, v13.4s\n"
5740       "fadd v0.4s, v0.4s, v14.4s\n"
5741       "fadd v1.4s, v1.4s, v14.4s\n"
5742       "fadd v2.4s, v2.4s, v14.4s\n"
5743       "fadd v3.4s, v3.4s, v14.4s\n"
5744       "fcvtzs v0.4s, v0.4s\n"
5745       "fcvtzs v1.4s, v1.4s\n"
5746       "fcvtzs v2.4s, v2.4s\n"
5747       "fcvtzs v3.4s, v3.4s\n"
5748 
5749       "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
5750       "prfm pldl1keep, [%x[output]]\n"
5751       "bne 2b\n"
5752       "3:"
5753 
5754       // BiasAdd::Transform
5755       "ld1 {v0.h}[0], [%x[input]], #2\n"
5756       "ld1 {v1.h}[0], [x1], #2\n"
5757       "prfm pldl1keep, [%x[input], #32]\n"
5758       "uxtl v0.8h, v0.8b\n"
5759       "uxtl v1.8h, v1.8b\n"
5760       "sxtl v0.4s, v0.4h\n"
5761       "sxtl v1.4s, v1.4h\n"
5762       "scvtf v0.4s, v0.4s\n"
5763       "scvtf v1.4s, v1.4s\n"
5764       "fmul v0.4s, v0.4s, v9.4s\n"
5765       "fmul v1.4s, v1.4s, v11.4s\n"
5766       "fadd v0.4s, v0.4s, v8.4s\n"
5767       "fadd v1.4s, v1.4s, v10.4s\n"
5768       "fadd v0.4s, v0.4s, v1.4s\n"
5769       "fsub v0.4s, v0.4s, v12.4s\n"
5770       "fmul v0.4s, v0.4s, v13.4s\n"
5771       "fadd v0.4s, v0.4s, v14.4s\n"
5772       "fcvtzs v0.4s, v0.4s\n"
5773 
5774       "st1 {v0.2s}, [%x[output]], #8\n"
5775       "prfm pldl1keep, [%x[output]]\n"
5776       "subs %x[rows], %x[rows], #1\n"
5777       "bne 1b\n"
5778       : [input] "+r"(input), [output] "+r"(output)
5779       : [count] "r"(params.count), [rows] "r"(params_rows_copy),
5780         [output_range_offset] "m"(params.output_range_offset),
5781         [input_range_scale] "m"(params.input_range_scale),
5782         [one_over_output_range_scale] "m"(params.one_over_output_range_scale),
5783         [bias_range_min] "m"(params.bias_range_min),
5784         [output_range_min] "m"(params.output_range_min),
5785         [bias_range_scale] "m"(params.bias_range_scale),
5786         [bias] "r"(params.bias), [input_range_min] "m"(params.input_range_min)
5787       : "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
5788         "v10", "v11", "v12", "v13", "v14", "cc", "memory");
5789 }
5790 
5791 template <>
5792 inline void Transform1DKernel<uint8_t, int32_t, BiasAdd<uint8_t>, 16,
Transform(const uint8_t * input,const BiasAdd<uint8_t> & params,int32_t * output)5793                               3>::Transform(const uint8_t* input,
5794                                             const BiasAdd<uint8_t>& params,
5795                                             int32_t* output) {
5796 #ifdef DEBUG
5797 #ifdef DEBUG_METAGEMM_VERBOSE
5798   std::cout << __FILE__ << "(" << __LINE__
5799             << ") BiasAdd<uint8_t><uint8_t, int32_t, BiasAdd<uint8_t>, 16, "
5800                "3>::Transform()"
5801             << std::endl
5802             << std::flush;
5803 #endif
5804 #endif
5805   int params_rows_copy = params.rows;
5806   asm volatile(
5807       "ldr w0, %[input_range_min]\n"
5808       "dup v8.4s, w0\n"
5809       "ldr w0, %[input_range_scale]\n"
5810       "dup v9.4s, w0\n"
5811       "ldr w0, %[bias_range_min]\n"
5812       "dup v10.4s, w0\n"
5813       "ldr w0, %[bias_range_scale]\n"
5814       "dup v11.4s, w0\n"
5815       "ldr w0, %[output_range_min]\n"
5816       "dup v12.4s, w0\n"
5817       "ldr w0, %[one_over_output_range_scale]\n"
5818       "dup v13.4s, w0\n"
5819       "ldr w0, %[output_range_offset]\n"
5820       "dup v14.4s, w0\n"
5821       "1:"
5822       "mov x0, %x[count]\n"
5823       "mov x1, %x[bias]\n"
5824       "subs x0, x0, #3\n"
5825       "beq 3f\n"
5826       "2:"
5827       "subs x0, x0, #16\n"
5828 
5829       // BiasAdd::Transform
5830       "ld1 {v0.4s}, [%x[input]], #16\n"
5831       "ld1 {v4.4s}, [x1], #16\n"
5832       "prfm pldl1keep, [%x[input], #32]\n"
5833       "uxtl2 v1.8h, v0.16b\n"
5834       "uxtl v0.8h, v0.8b\n"
5835       "uxtl2 v5.8h, v4.16b\n"
5836       "uxtl v4.8h, v4.8b\n"
5837       "sxtl2 v3.4s, v1.8h\n"
5838       "sxtl v2.4s, v1.4h\n"
5839       "sxtl2 v7.4s, v5.8h\n"
5840       "sxtl v6.4s, v5.4h\n"
5841       "sxtl2 v1.4s, v0.8h\n"
5842       "sxtl v0.4s, v0.4h\n"
5843       "sxtl2 v5.4s, v4.8h\n"
5844       "sxtl v4.4s, v4.4h\n"
5845       "scvtf v0.4s, v0.4s\n"
5846       "scvtf v1.4s, v1.4s\n"
5847       "scvtf v2.4s, v2.4s\n"
5848       "scvtf v3.4s, v3.4s\n"
5849       "scvtf v4.4s, v4.4s\n"
5850       "scvtf v5.4s, v5.4s\n"
5851       "scvtf v6.4s, v6.4s\n"
5852       "scvtf v7.4s, v7.4s\n"
5853       "fmul v0.4s, v0.4s, v9.4s\n"
5854       "fmul v1.4s, v1.4s, v9.4s\n"
5855       "fmul v2.4s, v2.4s, v9.4s\n"
5856       "fmul v3.4s, v3.4s, v9.4s\n"
5857       "fmul v4.4s, v4.4s, v11.4s\n"
5858       "fmul v5.4s, v5.4s, v11.4s\n"
5859       "fmul v6.4s, v6.4s, v11.4s\n"
5860       "fmul v7.4s, v7.4s, v11.4s\n"
5861       "fadd v0.4s, v0.4s, v8.4s\n"
5862       "fadd v1.4s, v1.4s, v8.4s\n"
5863       "fadd v2.4s, v2.4s, v8.4s\n"
5864       "fadd v3.4s, v3.4s, v8.4s\n"
5865       "fadd v4.4s, v4.4s, v10.4s\n"
5866       "fadd v5.4s, v5.4s, v10.4s\n"
5867       "fadd v6.4s, v6.4s, v10.4s\n"
5868       "fadd v7.4s, v7.4s, v10.4s\n"
5869       "fadd v0.4s, v0.4s, v4.4s\n"
5870       "fadd v1.4s, v1.4s, v5.4s\n"
5871       "fadd v2.4s, v2.4s, v6.4s\n"
5872       "fadd v3.4s, v3.4s, v7.4s\n"
5873       "fsub v0.4s, v0.4s, v12.4s\n"
5874       "fsub v1.4s, v1.4s, v12.4s\n"
5875       "fsub v2.4s, v2.4s, v12.4s\n"
5876       "fsub v3.4s, v3.4s, v12.4s\n"
5877       "fmul v0.4s, v0.4s, v13.4s\n"
5878       "fmul v1.4s, v1.4s, v13.4s\n"
5879       "fmul v2.4s, v2.4s, v13.4s\n"
5880       "fmul v3.4s, v3.4s, v13.4s\n"
5881       "fadd v0.4s, v0.4s, v14.4s\n"
5882       "fadd v1.4s, v1.4s, v14.4s\n"
5883       "fadd v2.4s, v2.4s, v14.4s\n"
5884       "fadd v3.4s, v3.4s, v14.4s\n"
5885       "fcvtzs v0.4s, v0.4s\n"
5886       "fcvtzs v1.4s, v1.4s\n"
5887       "fcvtzs v2.4s, v2.4s\n"
5888       "fcvtzs v3.4s, v3.4s\n"
5889 
5890       "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
5891       "prfm pldl1keep, [%x[output]]\n"
5892       "bne 2b\n"
5893       "3:"
5894 
5895       // BiasAdd::Transform
5896       "ld1 {v0.h}[0], [%x[input]], #2\n"
5897       "ld1 {v0.b}[2], [%x[input]], #1\n"
5898       "ld1 {v1.h}[0], [x1], #2\n"
5899       "ld1 {v1.b}[2], [x1], #1\n"
5900       "prfm pldl1keep, [%x[input], #32]\n"
5901       "uxtl v0.8h, v0.8b\n"
5902       "uxtl v1.8h, v1.8b\n"
5903       "sxtl v0.4s, v0.4h\n"
5904       "sxtl v1.4s, v1.4h\n"
5905       "scvtf v0.4s, v0.4s\n"
5906       "scvtf v1.4s, v1.4s\n"
5907       "fmul v0.4s, v0.4s, v9.4s\n"
5908       "fmul v1.4s, v1.4s, v11.4s\n"
5909       "fadd v0.4s, v0.4s, v8.4s\n"
5910       "fadd v1.4s, v1.4s, v10.4s\n"
5911       "fadd v0.4s, v0.4s, v1.4s\n"
5912       "fsub v0.4s, v0.4s, v12.4s\n"
5913       "fmul v0.4s, v0.4s, v13.4s\n"
5914       "fadd v0.4s, v0.4s, v14.4s\n"
5915       "fcvtzs v0.4s, v0.4s\n"
5916 
5917       "st1 {v0.2s}, [%x[output]], #8\n"
5918       "st1 {v0.s}[2], [%x[output]], #4\n"
5919       "prfm pldl1keep, [%x[output]]\n"
5920       "subs %x[rows], %x[rows], #1\n"
5921       "bne 1b\n"
5922       : [input] "+r"(input), [output] "+r"(output)
5923       : [count] "r"(params.count), [rows] "r"(params_rows_copy),
5924         [output_range_offset] "m"(params.output_range_offset),
5925         [input_range_scale] "m"(params.input_range_scale),
5926         [one_over_output_range_scale] "m"(params.one_over_output_range_scale),
5927         [bias_range_min] "m"(params.bias_range_min),
5928         [output_range_min] "m"(params.output_range_min),
5929         [bias_range_scale] "m"(params.bias_range_scale),
5930         [bias] "r"(params.bias), [input_range_min] "m"(params.input_range_min)
5931       : "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
5932         "v10", "v11", "v12", "v13", "v14", "cc", "memory");
5933 }
5934 
5935 template <>
5936 inline void Transform1DKernel<uint8_t, int32_t, BiasAdd<uint8_t>, 16,
Transform(const uint8_t * input,const BiasAdd<uint8_t> & params,int32_t * output)5937                               4>::Transform(const uint8_t* input,
5938                                             const BiasAdd<uint8_t>& params,
5939                                             int32_t* output) {
5940 #ifdef DEBUG
5941 #ifdef DEBUG_METAGEMM_VERBOSE
5942   std::cout << __FILE__ << "(" << __LINE__
5943             << ") BiasAdd<uint8_t><uint8_t, int32_t, BiasAdd<uint8_t>, 16, "
5944                "4>::Transform()"
5945             << std::endl
5946             << std::flush;
5947 #endif
5948 #endif
5949   int params_rows_copy = params.rows;
5950   asm volatile(
5951       "ldr w0, %[input_range_min]\n"
5952       "dup v8.4s, w0\n"
5953       "ldr w0, %[input_range_scale]\n"
5954       "dup v9.4s, w0\n"
5955       "ldr w0, %[bias_range_min]\n"
5956       "dup v10.4s, w0\n"
5957       "ldr w0, %[bias_range_scale]\n"
5958       "dup v11.4s, w0\n"
5959       "ldr w0, %[output_range_min]\n"
5960       "dup v12.4s, w0\n"
5961       "ldr w0, %[one_over_output_range_scale]\n"
5962       "dup v13.4s, w0\n"
5963       "ldr w0, %[output_range_offset]\n"
5964       "dup v14.4s, w0\n"
5965       "1:"
5966       "mov x0, %x[count]\n"
5967       "mov x1, %x[bias]\n"
5968       "subs x0, x0, #4\n"
5969       "beq 3f\n"
5970       "2:"
5971       "subs x0, x0, #16\n"
5972 
5973       // BiasAdd::Transform
5974       "ld1 {v0.4s}, [%x[input]], #16\n"
5975       "ld1 {v4.4s}, [x1], #16\n"
5976       "prfm pldl1keep, [%x[input], #32]\n"
5977       "uxtl2 v1.8h, v0.16b\n"
5978       "uxtl v0.8h, v0.8b\n"
5979       "uxtl2 v5.8h, v4.16b\n"
5980       "uxtl v4.8h, v4.8b\n"
5981       "sxtl2 v3.4s, v1.8h\n"
5982       "sxtl v2.4s, v1.4h\n"
5983       "sxtl2 v7.4s, v5.8h\n"
5984       "sxtl v6.4s, v5.4h\n"
5985       "sxtl2 v1.4s, v0.8h\n"
5986       "sxtl v0.4s, v0.4h\n"
5987       "sxtl2 v5.4s, v4.8h\n"
5988       "sxtl v4.4s, v4.4h\n"
5989       "scvtf v0.4s, v0.4s\n"
5990       "scvtf v1.4s, v1.4s\n"
5991       "scvtf v2.4s, v2.4s\n"
5992       "scvtf v3.4s, v3.4s\n"
5993       "scvtf v4.4s, v4.4s\n"
5994       "scvtf v5.4s, v5.4s\n"
5995       "scvtf v6.4s, v6.4s\n"
5996       "scvtf v7.4s, v7.4s\n"
5997       "fmul v0.4s, v0.4s, v9.4s\n"
5998       "fmul v1.4s, v1.4s, v9.4s\n"
5999       "fmul v2.4s, v2.4s, v9.4s\n"
6000       "fmul v3.4s, v3.4s, v9.4s\n"
6001       "fmul v4.4s, v4.4s, v11.4s\n"
6002       "fmul v5.4s, v5.4s, v11.4s\n"
6003       "fmul v6.4s, v6.4s, v11.4s\n"
6004       "fmul v7.4s, v7.4s, v11.4s\n"
6005       "fadd v0.4s, v0.4s, v8.4s\n"
6006       "fadd v1.4s, v1.4s, v8.4s\n"
6007       "fadd v2.4s, v2.4s, v8.4s\n"
6008       "fadd v3.4s, v3.4s, v8.4s\n"
6009       "fadd v4.4s, v4.4s, v10.4s\n"
6010       "fadd v5.4s, v5.4s, v10.4s\n"
6011       "fadd v6.4s, v6.4s, v10.4s\n"
6012       "fadd v7.4s, v7.4s, v10.4s\n"
6013       "fadd v0.4s, v0.4s, v4.4s\n"
6014       "fadd v1.4s, v1.4s, v5.4s\n"
6015       "fadd v2.4s, v2.4s, v6.4s\n"
6016       "fadd v3.4s, v3.4s, v7.4s\n"
6017       "fsub v0.4s, v0.4s, v12.4s\n"
6018       "fsub v1.4s, v1.4s, v12.4s\n"
6019       "fsub v2.4s, v2.4s, v12.4s\n"
6020       "fsub v3.4s, v3.4s, v12.4s\n"
6021       "fmul v0.4s, v0.4s, v13.4s\n"
6022       "fmul v1.4s, v1.4s, v13.4s\n"
6023       "fmul v2.4s, v2.4s, v13.4s\n"
6024       "fmul v3.4s, v3.4s, v13.4s\n"
6025       "fadd v0.4s, v0.4s, v14.4s\n"
6026       "fadd v1.4s, v1.4s, v14.4s\n"
6027       "fadd v2.4s, v2.4s, v14.4s\n"
6028       "fadd v3.4s, v3.4s, v14.4s\n"
6029       "fcvtzs v0.4s, v0.4s\n"
6030       "fcvtzs v1.4s, v1.4s\n"
6031       "fcvtzs v2.4s, v2.4s\n"
6032       "fcvtzs v3.4s, v3.4s\n"
6033 
6034       "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
6035       "prfm pldl1keep, [%x[output]]\n"
6036       "bne 2b\n"
6037       "3:"
6038 
6039       // BiasAdd::Transform
6040       "ld1 {v0.s}[0], [%x[input]], #4\n"
6041       "ld1 {v1.s}[0], [x1], #4\n"
6042       "prfm pldl1keep, [%x[input], #32]\n"
6043       "uxtl v0.8h, v0.8b\n"
6044       "uxtl v1.8h, v1.8b\n"
6045       "sxtl v0.4s, v0.4h\n"
6046       "sxtl v1.4s, v1.4h\n"
6047       "scvtf v0.4s, v0.4s\n"
6048       "scvtf v1.4s, v1.4s\n"
6049       "fmul v0.4s, v0.4s, v9.4s\n"
6050       "fmul v1.4s, v1.4s, v11.4s\n"
6051       "fadd v0.4s, v0.4s, v8.4s\n"
6052       "fadd v1.4s, v1.4s, v10.4s\n"
6053       "fadd v0.4s, v0.4s, v1.4s\n"
6054       "fsub v0.4s, v0.4s, v12.4s\n"
6055       "fmul v0.4s, v0.4s, v13.4s\n"
6056       "fadd v0.4s, v0.4s, v14.4s\n"
6057       "fcvtzs v0.4s, v0.4s\n"
6058 
6059       "st1 {v0.4s}, [%x[output]], #16\n"
6060       "prfm pldl1keep, [%x[output]]\n"
6061       "subs %x[rows], %x[rows], #1\n"
6062       "bne 1b\n"
6063       : [input] "+r"(input), [output] "+r"(output)
6064       : [count] "r"(params.count), [rows] "r"(params_rows_copy),
6065         [output_range_offset] "m"(params.output_range_offset),
6066         [input_range_scale] "m"(params.input_range_scale),
6067         [one_over_output_range_scale] "m"(params.one_over_output_range_scale),
6068         [bias_range_min] "m"(params.bias_range_min),
6069         [output_range_min] "m"(params.output_range_min),
6070         [bias_range_scale] "m"(params.bias_range_scale),
6071         [bias] "r"(params.bias), [input_range_min] "m"(params.input_range_min)
6072       : "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
6073         "v10", "v11", "v12", "v13", "v14", "cc", "memory");
6074 }
6075 
6076 template <>
6077 inline void Transform1DKernel<uint8_t, int32_t, BiasAdd<uint8_t>, 16,
Transform(const uint8_t * input,const BiasAdd<uint8_t> & params,int32_t * output)6078                               5>::Transform(const uint8_t* input,
6079                                             const BiasAdd<uint8_t>& params,
6080                                             int32_t* output) {
6081 #ifdef DEBUG
6082 #ifdef DEBUG_METAGEMM_VERBOSE
6083   std::cout << __FILE__ << "(" << __LINE__
6084             << ") BiasAdd<uint8_t><uint8_t, int32_t, BiasAdd<uint8_t>, 16, "
6085                "5>::Transform()"
6086             << std::endl
6087             << std::flush;
6088 #endif
6089 #endif
6090   int params_rows_copy = params.rows;
6091   asm volatile(
6092       "ldr w0, %[input_range_min]\n"
6093       "dup v8.4s, w0\n"
6094       "ldr w0, %[input_range_scale]\n"
6095       "dup v9.4s, w0\n"
6096       "ldr w0, %[bias_range_min]\n"
6097       "dup v10.4s, w0\n"
6098       "ldr w0, %[bias_range_scale]\n"
6099       "dup v11.4s, w0\n"
6100       "ldr w0, %[output_range_min]\n"
6101       "dup v12.4s, w0\n"
6102       "ldr w0, %[one_over_output_range_scale]\n"
6103       "dup v13.4s, w0\n"
6104       "ldr w0, %[output_range_offset]\n"
6105       "dup v14.4s, w0\n"
6106       "1:"
6107       "mov x0, %x[count]\n"
6108       "mov x1, %x[bias]\n"
6109       "subs x0, x0, #5\n"
6110       "beq 3f\n"
6111       "2:"
6112       "subs x0, x0, #16\n"
6113 
6114       // BiasAdd::Transform
6115       "ld1 {v0.4s}, [%x[input]], #16\n"
6116       "ld1 {v4.4s}, [x1], #16\n"
6117       "prfm pldl1keep, [%x[input], #32]\n"
6118       "uxtl2 v1.8h, v0.16b\n"
6119       "uxtl v0.8h, v0.8b\n"
6120       "uxtl2 v5.8h, v4.16b\n"
6121       "uxtl v4.8h, v4.8b\n"
6122       "sxtl2 v3.4s, v1.8h\n"
6123       "sxtl v2.4s, v1.4h\n"
6124       "sxtl2 v7.4s, v5.8h\n"
6125       "sxtl v6.4s, v5.4h\n"
6126       "sxtl2 v1.4s, v0.8h\n"
6127       "sxtl v0.4s, v0.4h\n"
6128       "sxtl2 v5.4s, v4.8h\n"
6129       "sxtl v4.4s, v4.4h\n"
6130       "scvtf v0.4s, v0.4s\n"
6131       "scvtf v1.4s, v1.4s\n"
6132       "scvtf v2.4s, v2.4s\n"
6133       "scvtf v3.4s, v3.4s\n"
6134       "scvtf v4.4s, v4.4s\n"
6135       "scvtf v5.4s, v5.4s\n"
6136       "scvtf v6.4s, v6.4s\n"
6137       "scvtf v7.4s, v7.4s\n"
6138       "fmul v0.4s, v0.4s, v9.4s\n"
6139       "fmul v1.4s, v1.4s, v9.4s\n"
6140       "fmul v2.4s, v2.4s, v9.4s\n"
6141       "fmul v3.4s, v3.4s, v9.4s\n"
6142       "fmul v4.4s, v4.4s, v11.4s\n"
6143       "fmul v5.4s, v5.4s, v11.4s\n"
6144       "fmul v6.4s, v6.4s, v11.4s\n"
6145       "fmul v7.4s, v7.4s, v11.4s\n"
6146       "fadd v0.4s, v0.4s, v8.4s\n"
6147       "fadd v1.4s, v1.4s, v8.4s\n"
6148       "fadd v2.4s, v2.4s, v8.4s\n"
6149       "fadd v3.4s, v3.4s, v8.4s\n"
6150       "fadd v4.4s, v4.4s, v10.4s\n"
6151       "fadd v5.4s, v5.4s, v10.4s\n"
6152       "fadd v6.4s, v6.4s, v10.4s\n"
6153       "fadd v7.4s, v7.4s, v10.4s\n"
6154       "fadd v0.4s, v0.4s, v4.4s\n"
6155       "fadd v1.4s, v1.4s, v5.4s\n"
6156       "fadd v2.4s, v2.4s, v6.4s\n"
6157       "fadd v3.4s, v3.4s, v7.4s\n"
6158       "fsub v0.4s, v0.4s, v12.4s\n"
6159       "fsub v1.4s, v1.4s, v12.4s\n"
6160       "fsub v2.4s, v2.4s, v12.4s\n"
6161       "fsub v3.4s, v3.4s, v12.4s\n"
6162       "fmul v0.4s, v0.4s, v13.4s\n"
6163       "fmul v1.4s, v1.4s, v13.4s\n"
6164       "fmul v2.4s, v2.4s, v13.4s\n"
6165       "fmul v3.4s, v3.4s, v13.4s\n"
6166       "fadd v0.4s, v0.4s, v14.4s\n"
6167       "fadd v1.4s, v1.4s, v14.4s\n"
6168       "fadd v2.4s, v2.4s, v14.4s\n"
6169       "fadd v3.4s, v3.4s, v14.4s\n"
6170       "fcvtzs v0.4s, v0.4s\n"
6171       "fcvtzs v1.4s, v1.4s\n"
6172       "fcvtzs v2.4s, v2.4s\n"
6173       "fcvtzs v3.4s, v3.4s\n"
6174 
6175       "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
6176       "prfm pldl1keep, [%x[output]]\n"
6177       "bne 2b\n"
6178       "3:"
6179 
6180       // BiasAdd::Transform
6181       "ld1 {v0.s}[0], [%x[input]], #4\n"
6182       "ld1 {v0.b}[4], [%x[input]], #1\n"
6183       "ld1 {v2.s}[0], [x1], #4\n"
6184       "ld1 {v2.b}[4], [x1], #1\n"
6185       "prfm pldl1keep, [%x[input], #32]\n"
6186       "uxtl v0.8h, v0.8b\n"
6187       "uxtl v2.8h, v2.8b\n"
6188       "sxtl2 v1.4s, v0.8h\n"
6189       "sxtl v0.4s, v0.4h\n"
6190       "sxtl2 v3.4s, v2.8h\n"
6191       "sxtl v2.4s, v2.4h\n"
6192       "scvtf v0.4s, v0.4s\n"
6193       "scvtf v1.4s, v1.4s\n"
6194       "scvtf v2.4s, v2.4s\n"
6195       "scvtf v3.4s, v3.4s\n"
6196       "fmul v0.4s, v0.4s, v9.4s\n"
6197       "fmul v1.4s, v1.4s, v9.4s\n"
6198       "fmul v2.4s, v2.4s, v11.4s\n"
6199       "fmul v3.4s, v3.4s, v11.4s\n"
6200       "fadd v0.4s, v0.4s, v8.4s\n"
6201       "fadd v1.4s, v1.4s, v8.4s\n"
6202       "fadd v2.4s, v2.4s, v10.4s\n"
6203       "fadd v3.4s, v3.4s, v10.4s\n"
6204       "fadd v0.4s, v0.4s, v2.4s\n"
6205       "fadd v1.4s, v1.4s, v3.4s\n"
6206       "fsub v0.4s, v0.4s, v12.4s\n"
6207       "fsub v1.4s, v1.4s, v12.4s\n"
6208       "fmul v0.4s, v0.4s, v13.4s\n"
6209       "fmul v1.4s, v1.4s, v13.4s\n"
6210       "fadd v0.4s, v0.4s, v14.4s\n"
6211       "fadd v1.4s, v1.4s, v14.4s\n"
6212       "fcvtzs v0.4s, v0.4s\n"
6213       "fcvtzs v1.4s, v1.4s\n"
6214 
6215       "st1 {v0.4s}, [%x[output]], #16\n"
6216       "st1 {v1.s}[0], [%x[output]], #4\n"
6217       "prfm pldl1keep, [%x[output]]\n"
6218       "subs %x[rows], %x[rows], #1\n"
6219       "bne 1b\n"
6220       : [input] "+r"(input), [output] "+r"(output)
6221       : [count] "r"(params.count), [rows] "r"(params_rows_copy),
6222         [output_range_offset] "m"(params.output_range_offset),
6223         [input_range_scale] "m"(params.input_range_scale),
6224         [one_over_output_range_scale] "m"(params.one_over_output_range_scale),
6225         [bias_range_min] "m"(params.bias_range_min),
6226         [output_range_min] "m"(params.output_range_min),
6227         [bias_range_scale] "m"(params.bias_range_scale),
6228         [bias] "r"(params.bias), [input_range_min] "m"(params.input_range_min)
6229       : "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
6230         "v10", "v11", "v12", "v13", "v14", "cc", "memory");
6231 }
6232 
6233 template <>
6234 inline void Transform1DKernel<uint8_t, int32_t, BiasAdd<uint8_t>, 16,
Transform(const uint8_t * input,const BiasAdd<uint8_t> & params,int32_t * output)6235                               6>::Transform(const uint8_t* input,
6236                                             const BiasAdd<uint8_t>& params,
6237                                             int32_t* output) {
6238 #ifdef DEBUG
6239 #ifdef DEBUG_METAGEMM_VERBOSE
6240   std::cout << __FILE__ << "(" << __LINE__
6241             << ") BiasAdd<uint8_t><uint8_t, int32_t, BiasAdd<uint8_t>, 16, "
6242                "6>::Transform()"
6243             << std::endl
6244             << std::flush;
6245 #endif
6246 #endif
6247   int params_rows_copy = params.rows;
6248   asm volatile(
6249       "ldr w0, %[input_range_min]\n"
6250       "dup v8.4s, w0\n"
6251       "ldr w0, %[input_range_scale]\n"
6252       "dup v9.4s, w0\n"
6253       "ldr w0, %[bias_range_min]\n"
6254       "dup v10.4s, w0\n"
6255       "ldr w0, %[bias_range_scale]\n"
6256       "dup v11.4s, w0\n"
6257       "ldr w0, %[output_range_min]\n"
6258       "dup v12.4s, w0\n"
6259       "ldr w0, %[one_over_output_range_scale]\n"
6260       "dup v13.4s, w0\n"
6261       "ldr w0, %[output_range_offset]\n"
6262       "dup v14.4s, w0\n"
6263       "1:"
6264       "mov x0, %x[count]\n"
6265       "mov x1, %x[bias]\n"
6266       "subs x0, x0, #6\n"
6267       "beq 3f\n"
6268       "2:"
6269       "subs x0, x0, #16\n"
6270 
6271       // BiasAdd::Transform
6272       "ld1 {v0.4s}, [%x[input]], #16\n"
6273       "ld1 {v4.4s}, [x1], #16\n"
6274       "prfm pldl1keep, [%x[input], #32]\n"
6275       "uxtl2 v1.8h, v0.16b\n"
6276       "uxtl v0.8h, v0.8b\n"
6277       "uxtl2 v5.8h, v4.16b\n"
6278       "uxtl v4.8h, v4.8b\n"
6279       "sxtl2 v3.4s, v1.8h\n"
6280       "sxtl v2.4s, v1.4h\n"
6281       "sxtl2 v7.4s, v5.8h\n"
6282       "sxtl v6.4s, v5.4h\n"
6283       "sxtl2 v1.4s, v0.8h\n"
6284       "sxtl v0.4s, v0.4h\n"
6285       "sxtl2 v5.4s, v4.8h\n"
6286       "sxtl v4.4s, v4.4h\n"
6287       "scvtf v0.4s, v0.4s\n"
6288       "scvtf v1.4s, v1.4s\n"
6289       "scvtf v2.4s, v2.4s\n"
6290       "scvtf v3.4s, v3.4s\n"
6291       "scvtf v4.4s, v4.4s\n"
6292       "scvtf v5.4s, v5.4s\n"
6293       "scvtf v6.4s, v6.4s\n"
6294       "scvtf v7.4s, v7.4s\n"
6295       "fmul v0.4s, v0.4s, v9.4s\n"
6296       "fmul v1.4s, v1.4s, v9.4s\n"
6297       "fmul v2.4s, v2.4s, v9.4s\n"
6298       "fmul v3.4s, v3.4s, v9.4s\n"
6299       "fmul v4.4s, v4.4s, v11.4s\n"
6300       "fmul v5.4s, v5.4s, v11.4s\n"
6301       "fmul v6.4s, v6.4s, v11.4s\n"
6302       "fmul v7.4s, v7.4s, v11.4s\n"
6303       "fadd v0.4s, v0.4s, v8.4s\n"
6304       "fadd v1.4s, v1.4s, v8.4s\n"
6305       "fadd v2.4s, v2.4s, v8.4s\n"
6306       "fadd v3.4s, v3.4s, v8.4s\n"
6307       "fadd v4.4s, v4.4s, v10.4s\n"
6308       "fadd v5.4s, v5.4s, v10.4s\n"
6309       "fadd v6.4s, v6.4s, v10.4s\n"
6310       "fadd v7.4s, v7.4s, v10.4s\n"
6311       "fadd v0.4s, v0.4s, v4.4s\n"
6312       "fadd v1.4s, v1.4s, v5.4s\n"
6313       "fadd v2.4s, v2.4s, v6.4s\n"
6314       "fadd v3.4s, v3.4s, v7.4s\n"
6315       "fsub v0.4s, v0.4s, v12.4s\n"
6316       "fsub v1.4s, v1.4s, v12.4s\n"
6317       "fsub v2.4s, v2.4s, v12.4s\n"
6318       "fsub v3.4s, v3.4s, v12.4s\n"
6319       "fmul v0.4s, v0.4s, v13.4s\n"
6320       "fmul v1.4s, v1.4s, v13.4s\n"
6321       "fmul v2.4s, v2.4s, v13.4s\n"
6322       "fmul v3.4s, v3.4s, v13.4s\n"
6323       "fadd v0.4s, v0.4s, v14.4s\n"
6324       "fadd v1.4s, v1.4s, v14.4s\n"
6325       "fadd v2.4s, v2.4s, v14.4s\n"
6326       "fadd v3.4s, v3.4s, v14.4s\n"
6327       "fcvtzs v0.4s, v0.4s\n"
6328       "fcvtzs v1.4s, v1.4s\n"
6329       "fcvtzs v2.4s, v2.4s\n"
6330       "fcvtzs v3.4s, v3.4s\n"
6331 
6332       "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
6333       "prfm pldl1keep, [%x[output]]\n"
6334       "bne 2b\n"
6335       "3:"
6336 
6337       // BiasAdd::Transform
6338       "ld1 {v0.s}[0], [%x[input]], #4\n"
6339       "ld1 {v0.h}[2], [%x[input]], #2\n"
6340       "ld1 {v2.s}[0], [x1], #4\n"
6341       "ld1 {v2.h}[2], [x1], #2\n"
6342       "prfm pldl1keep, [%x[input], #32]\n"
6343       "uxtl v0.8h, v0.8b\n"
6344       "uxtl v2.8h, v2.8b\n"
6345       "sxtl2 v1.4s, v0.8h\n"
6346       "sxtl v0.4s, v0.4h\n"
6347       "sxtl2 v3.4s, v2.8h\n"
6348       "sxtl v2.4s, v2.4h\n"
6349       "scvtf v0.4s, v0.4s\n"
6350       "scvtf v1.4s, v1.4s\n"
6351       "scvtf v2.4s, v2.4s\n"
6352       "scvtf v3.4s, v3.4s\n"
6353       "fmul v0.4s, v0.4s, v9.4s\n"
6354       "fmul v1.4s, v1.4s, v9.4s\n"
6355       "fmul v2.4s, v2.4s, v11.4s\n"
6356       "fmul v3.4s, v3.4s, v11.4s\n"
6357       "fadd v0.4s, v0.4s, v8.4s\n"
6358       "fadd v1.4s, v1.4s, v8.4s\n"
6359       "fadd v2.4s, v2.4s, v10.4s\n"
6360       "fadd v3.4s, v3.4s, v10.4s\n"
6361       "fadd v0.4s, v0.4s, v2.4s\n"
6362       "fadd v1.4s, v1.4s, v3.4s\n"
6363       "fsub v0.4s, v0.4s, v12.4s\n"
6364       "fsub v1.4s, v1.4s, v12.4s\n"
6365       "fmul v0.4s, v0.4s, v13.4s\n"
6366       "fmul v1.4s, v1.4s, v13.4s\n"
6367       "fadd v0.4s, v0.4s, v14.4s\n"
6368       "fadd v1.4s, v1.4s, v14.4s\n"
6369       "fcvtzs v0.4s, v0.4s\n"
6370       "fcvtzs v1.4s, v1.4s\n"
6371 
6372       "st1 {v0.4s}, [%x[output]], #16\n"
6373       "st1 {v1.2s}, [%x[output]], #8\n"
6374       "prfm pldl1keep, [%x[output]]\n"
6375       "subs %x[rows], %x[rows], #1\n"
6376       "bne 1b\n"
6377       : [input] "+r"(input), [output] "+r"(output)
6378       : [count] "r"(params.count), [rows] "r"(params_rows_copy),
6379         [output_range_offset] "m"(params.output_range_offset),
6380         [input_range_scale] "m"(params.input_range_scale),
6381         [one_over_output_range_scale] "m"(params.one_over_output_range_scale),
6382         [bias_range_min] "m"(params.bias_range_min),
6383         [output_range_min] "m"(params.output_range_min),
6384         [bias_range_scale] "m"(params.bias_range_scale),
6385         [bias] "r"(params.bias), [input_range_min] "m"(params.input_range_min)
6386       : "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
6387         "v10", "v11", "v12", "v13", "v14", "cc", "memory");
6388 }
6389 
6390 template <>
6391 inline void Transform1DKernel<uint8_t, int32_t, BiasAdd<uint8_t>, 16,
Transform(const uint8_t * input,const BiasAdd<uint8_t> & params,int32_t * output)6392                               7>::Transform(const uint8_t* input,
6393                                             const BiasAdd<uint8_t>& params,
6394                                             int32_t* output) {
6395 #ifdef DEBUG
6396 #ifdef DEBUG_METAGEMM_VERBOSE
6397   std::cout << __FILE__ << "(" << __LINE__
6398             << ") BiasAdd<uint8_t><uint8_t, int32_t, BiasAdd<uint8_t>, 16, "
6399                "7>::Transform()"
6400             << std::endl
6401             << std::flush;
6402 #endif
6403 #endif
6404   int params_rows_copy = params.rows;
6405   asm volatile(
6406       "ldr w0, %[input_range_min]\n"
6407       "dup v8.4s, w0\n"
6408       "ldr w0, %[input_range_scale]\n"
6409       "dup v9.4s, w0\n"
6410       "ldr w0, %[bias_range_min]\n"
6411       "dup v10.4s, w0\n"
6412       "ldr w0, %[bias_range_scale]\n"
6413       "dup v11.4s, w0\n"
6414       "ldr w0, %[output_range_min]\n"
6415       "dup v12.4s, w0\n"
6416       "ldr w0, %[one_over_output_range_scale]\n"
6417       "dup v13.4s, w0\n"
6418       "ldr w0, %[output_range_offset]\n"
6419       "dup v14.4s, w0\n"
6420       "1:"
6421       "mov x0, %x[count]\n"
6422       "mov x1, %x[bias]\n"
6423       "subs x0, x0, #7\n"
6424       "beq 3f\n"
6425       "2:"
6426       "subs x0, x0, #16\n"
6427 
6428       // BiasAdd::Transform
6429       "ld1 {v0.4s}, [%x[input]], #16\n"
6430       "ld1 {v4.4s}, [x1], #16\n"
6431       "prfm pldl1keep, [%x[input], #32]\n"
6432       "uxtl2 v1.8h, v0.16b\n"
6433       "uxtl v0.8h, v0.8b\n"
6434       "uxtl2 v5.8h, v4.16b\n"
6435       "uxtl v4.8h, v4.8b\n"
6436       "sxtl2 v3.4s, v1.8h\n"
6437       "sxtl v2.4s, v1.4h\n"
6438       "sxtl2 v7.4s, v5.8h\n"
6439       "sxtl v6.4s, v5.4h\n"
6440       "sxtl2 v1.4s, v0.8h\n"
6441       "sxtl v0.4s, v0.4h\n"
6442       "sxtl2 v5.4s, v4.8h\n"
6443       "sxtl v4.4s, v4.4h\n"
6444       "scvtf v0.4s, v0.4s\n"
6445       "scvtf v1.4s, v1.4s\n"
6446       "scvtf v2.4s, v2.4s\n"
6447       "scvtf v3.4s, v3.4s\n"
6448       "scvtf v4.4s, v4.4s\n"
6449       "scvtf v5.4s, v5.4s\n"
6450       "scvtf v6.4s, v6.4s\n"
6451       "scvtf v7.4s, v7.4s\n"
6452       "fmul v0.4s, v0.4s, v9.4s\n"
6453       "fmul v1.4s, v1.4s, v9.4s\n"
6454       "fmul v2.4s, v2.4s, v9.4s\n"
6455       "fmul v3.4s, v3.4s, v9.4s\n"
6456       "fmul v4.4s, v4.4s, v11.4s\n"
6457       "fmul v5.4s, v5.4s, v11.4s\n"
6458       "fmul v6.4s, v6.4s, v11.4s\n"
6459       "fmul v7.4s, v7.4s, v11.4s\n"
6460       "fadd v0.4s, v0.4s, v8.4s\n"
6461       "fadd v1.4s, v1.4s, v8.4s\n"
6462       "fadd v2.4s, v2.4s, v8.4s\n"
6463       "fadd v3.4s, v3.4s, v8.4s\n"
6464       "fadd v4.4s, v4.4s, v10.4s\n"
6465       "fadd v5.4s, v5.4s, v10.4s\n"
6466       "fadd v6.4s, v6.4s, v10.4s\n"
6467       "fadd v7.4s, v7.4s, v10.4s\n"
6468       "fadd v0.4s, v0.4s, v4.4s\n"
6469       "fadd v1.4s, v1.4s, v5.4s\n"
6470       "fadd v2.4s, v2.4s, v6.4s\n"
6471       "fadd v3.4s, v3.4s, v7.4s\n"
6472       "fsub v0.4s, v0.4s, v12.4s\n"
6473       "fsub v1.4s, v1.4s, v12.4s\n"
6474       "fsub v2.4s, v2.4s, v12.4s\n"
6475       "fsub v3.4s, v3.4s, v12.4s\n"
6476       "fmul v0.4s, v0.4s, v13.4s\n"
6477       "fmul v1.4s, v1.4s, v13.4s\n"
6478       "fmul v2.4s, v2.4s, v13.4s\n"
6479       "fmul v3.4s, v3.4s, v13.4s\n"
6480       "fadd v0.4s, v0.4s, v14.4s\n"
6481       "fadd v1.4s, v1.4s, v14.4s\n"
6482       "fadd v2.4s, v2.4s, v14.4s\n"
6483       "fadd v3.4s, v3.4s, v14.4s\n"
6484       "fcvtzs v0.4s, v0.4s\n"
6485       "fcvtzs v1.4s, v1.4s\n"
6486       "fcvtzs v2.4s, v2.4s\n"
6487       "fcvtzs v3.4s, v3.4s\n"
6488 
6489       "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
6490       "prfm pldl1keep, [%x[output]]\n"
6491       "bne 2b\n"
6492       "3:"
6493 
6494       // BiasAdd::Transform
6495       "ld1 {v0.s}[0], [%x[input]], #4\n"
6496       "ld1 {v0.h}[2], [%x[input]], #2\n"
6497       "ld1 {v0.b}[6], [%x[input]], #1\n"
6498       "ld1 {v2.s}[0], [x1], #4\n"
6499       "ld1 {v2.h}[2], [x1], #2\n"
6500       "ld1 {v2.b}[6], [x1], #1\n"
6501       "prfm pldl1keep, [%x[input], #32]\n"
6502       "uxtl v0.8h, v0.8b\n"
6503       "uxtl v2.8h, v2.8b\n"
6504       "sxtl2 v1.4s, v0.8h\n"
6505       "sxtl v0.4s, v0.4h\n"
6506       "sxtl2 v3.4s, v2.8h\n"
6507       "sxtl v2.4s, v2.4h\n"
6508       "scvtf v0.4s, v0.4s\n"
6509       "scvtf v1.4s, v1.4s\n"
6510       "scvtf v2.4s, v2.4s\n"
6511       "scvtf v3.4s, v3.4s\n"
6512       "fmul v0.4s, v0.4s, v9.4s\n"
6513       "fmul v1.4s, v1.4s, v9.4s\n"
6514       "fmul v2.4s, v2.4s, v11.4s\n"
6515       "fmul v3.4s, v3.4s, v11.4s\n"
6516       "fadd v0.4s, v0.4s, v8.4s\n"
6517       "fadd v1.4s, v1.4s, v8.4s\n"
6518       "fadd v2.4s, v2.4s, v10.4s\n"
6519       "fadd v3.4s, v3.4s, v10.4s\n"
6520       "fadd v0.4s, v0.4s, v2.4s\n"
6521       "fadd v1.4s, v1.4s, v3.4s\n"
6522       "fsub v0.4s, v0.4s, v12.4s\n"
6523       "fsub v1.4s, v1.4s, v12.4s\n"
6524       "fmul v0.4s, v0.4s, v13.4s\n"
6525       "fmul v1.4s, v1.4s, v13.4s\n"
6526       "fadd v0.4s, v0.4s, v14.4s\n"
6527       "fadd v1.4s, v1.4s, v14.4s\n"
6528       "fcvtzs v0.4s, v0.4s\n"
6529       "fcvtzs v1.4s, v1.4s\n"
6530 
6531       "st1 {v0.4s}, [%x[output]], #16\n"
6532       "st1 {v1.2s}, [%x[output]], #8\n"
6533       "st1 {v1.s}[2], [%x[output]], #4\n"
6534       "prfm pldl1keep, [%x[output]]\n"
6535       "subs %x[rows], %x[rows], #1\n"
6536       "bne 1b\n"
6537       : [input] "+r"(input), [output] "+r"(output)
6538       : [count] "r"(params.count), [rows] "r"(params_rows_copy),
6539         [output_range_offset] "m"(params.output_range_offset),
6540         [input_range_scale] "m"(params.input_range_scale),
6541         [one_over_output_range_scale] "m"(params.one_over_output_range_scale),
6542         [bias_range_min] "m"(params.bias_range_min),
6543         [output_range_min] "m"(params.output_range_min),
6544         [bias_range_scale] "m"(params.bias_range_scale),
6545         [bias] "r"(params.bias), [input_range_min] "m"(params.input_range_min)
6546       : "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
6547         "v10", "v11", "v12", "v13", "v14", "cc", "memory");
6548 }
6549 
6550 template <>
6551 inline void Transform1DKernel<uint8_t, int32_t, BiasAdd<uint8_t>, 16,
Transform(const uint8_t * input,const BiasAdd<uint8_t> & params,int32_t * output)6552                               8>::Transform(const uint8_t* input,
6553                                             const BiasAdd<uint8_t>& params,
6554                                             int32_t* output) {
6555 #ifdef DEBUG
6556 #ifdef DEBUG_METAGEMM_VERBOSE
6557   std::cout << __FILE__ << "(" << __LINE__
6558             << ") BiasAdd<uint8_t><uint8_t, int32_t, BiasAdd<uint8_t>, 16, "
6559                "8>::Transform()"
6560             << std::endl
6561             << std::flush;
6562 #endif
6563 #endif
6564   int params_rows_copy = params.rows;
6565   asm volatile(
6566       "ldr w0, %[input_range_min]\n"
6567       "dup v8.4s, w0\n"
6568       "ldr w0, %[input_range_scale]\n"
6569       "dup v9.4s, w0\n"
6570       "ldr w0, %[bias_range_min]\n"
6571       "dup v10.4s, w0\n"
6572       "ldr w0, %[bias_range_scale]\n"
6573       "dup v11.4s, w0\n"
6574       "ldr w0, %[output_range_min]\n"
6575       "dup v12.4s, w0\n"
6576       "ldr w0, %[one_over_output_range_scale]\n"
6577       "dup v13.4s, w0\n"
6578       "ldr w0, %[output_range_offset]\n"
6579       "dup v14.4s, w0\n"
6580       "1:"
6581       "mov x0, %x[count]\n"
6582       "mov x1, %x[bias]\n"
6583       "subs x0, x0, #8\n"
6584       "beq 3f\n"
6585       "2:"
6586       "subs x0, x0, #16\n"
6587 
6588       // BiasAdd::Transform
6589       "ld1 {v0.4s}, [%x[input]], #16\n"
6590       "ld1 {v4.4s}, [x1], #16\n"
6591       "prfm pldl1keep, [%x[input], #32]\n"
6592       "uxtl2 v1.8h, v0.16b\n"
6593       "uxtl v0.8h, v0.8b\n"
6594       "uxtl2 v5.8h, v4.16b\n"
6595       "uxtl v4.8h, v4.8b\n"
6596       "sxtl2 v3.4s, v1.8h\n"
6597       "sxtl v2.4s, v1.4h\n"
6598       "sxtl2 v7.4s, v5.8h\n"
6599       "sxtl v6.4s, v5.4h\n"
6600       "sxtl2 v1.4s, v0.8h\n"
6601       "sxtl v0.4s, v0.4h\n"
6602       "sxtl2 v5.4s, v4.8h\n"
6603       "sxtl v4.4s, v4.4h\n"
6604       "scvtf v0.4s, v0.4s\n"
6605       "scvtf v1.4s, v1.4s\n"
6606       "scvtf v2.4s, v2.4s\n"
6607       "scvtf v3.4s, v3.4s\n"
6608       "scvtf v4.4s, v4.4s\n"
6609       "scvtf v5.4s, v5.4s\n"
6610       "scvtf v6.4s, v6.4s\n"
6611       "scvtf v7.4s, v7.4s\n"
6612       "fmul v0.4s, v0.4s, v9.4s\n"
6613       "fmul v1.4s, v1.4s, v9.4s\n"
6614       "fmul v2.4s, v2.4s, v9.4s\n"
6615       "fmul v3.4s, v3.4s, v9.4s\n"
6616       "fmul v4.4s, v4.4s, v11.4s\n"
6617       "fmul v5.4s, v5.4s, v11.4s\n"
6618       "fmul v6.4s, v6.4s, v11.4s\n"
6619       "fmul v7.4s, v7.4s, v11.4s\n"
6620       "fadd v0.4s, v0.4s, v8.4s\n"
6621       "fadd v1.4s, v1.4s, v8.4s\n"
6622       "fadd v2.4s, v2.4s, v8.4s\n"
6623       "fadd v3.4s, v3.4s, v8.4s\n"
6624       "fadd v4.4s, v4.4s, v10.4s\n"
6625       "fadd v5.4s, v5.4s, v10.4s\n"
6626       "fadd v6.4s, v6.4s, v10.4s\n"
6627       "fadd v7.4s, v7.4s, v10.4s\n"
6628       "fadd v0.4s, v0.4s, v4.4s\n"
6629       "fadd v1.4s, v1.4s, v5.4s\n"
6630       "fadd v2.4s, v2.4s, v6.4s\n"
6631       "fadd v3.4s, v3.4s, v7.4s\n"
6632       "fsub v0.4s, v0.4s, v12.4s\n"
6633       "fsub v1.4s, v1.4s, v12.4s\n"
6634       "fsub v2.4s, v2.4s, v12.4s\n"
6635       "fsub v3.4s, v3.4s, v12.4s\n"
6636       "fmul v0.4s, v0.4s, v13.4s\n"
6637       "fmul v1.4s, v1.4s, v13.4s\n"
6638       "fmul v2.4s, v2.4s, v13.4s\n"
6639       "fmul v3.4s, v3.4s, v13.4s\n"
6640       "fadd v0.4s, v0.4s, v14.4s\n"
6641       "fadd v1.4s, v1.4s, v14.4s\n"
6642       "fadd v2.4s, v2.4s, v14.4s\n"
6643       "fadd v3.4s, v3.4s, v14.4s\n"
6644       "fcvtzs v0.4s, v0.4s\n"
6645       "fcvtzs v1.4s, v1.4s\n"
6646       "fcvtzs v2.4s, v2.4s\n"
6647       "fcvtzs v3.4s, v3.4s\n"
6648 
6649       "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
6650       "prfm pldl1keep, [%x[output]]\n"
6651       "bne 2b\n"
6652       "3:"
6653 
6654       // BiasAdd::Transform
6655       "ld1 {v0.2s}, [%x[input]], #8\n"
6656       "ld1 {v2.2s}, [x1], #8\n"
6657       "prfm pldl1keep, [%x[input], #32]\n"
6658       "uxtl v0.8h, v0.8b\n"
6659       "uxtl v2.8h, v2.8b\n"
6660       "sxtl2 v1.4s, v0.8h\n"
6661       "sxtl v0.4s, v0.4h\n"
6662       "sxtl2 v3.4s, v2.8h\n"
6663       "sxtl v2.4s, v2.4h\n"
6664       "scvtf v0.4s, v0.4s\n"
6665       "scvtf v1.4s, v1.4s\n"
6666       "scvtf v2.4s, v2.4s\n"
6667       "scvtf v3.4s, v3.4s\n"
6668       "fmul v0.4s, v0.4s, v9.4s\n"
6669       "fmul v1.4s, v1.4s, v9.4s\n"
6670       "fmul v2.4s, v2.4s, v11.4s\n"
6671       "fmul v3.4s, v3.4s, v11.4s\n"
6672       "fadd v0.4s, v0.4s, v8.4s\n"
6673       "fadd v1.4s, v1.4s, v8.4s\n"
6674       "fadd v2.4s, v2.4s, v10.4s\n"
6675       "fadd v3.4s, v3.4s, v10.4s\n"
6676       "fadd v0.4s, v0.4s, v2.4s\n"
6677       "fadd v1.4s, v1.4s, v3.4s\n"
6678       "fsub v0.4s, v0.4s, v12.4s\n"
6679       "fsub v1.4s, v1.4s, v12.4s\n"
6680       "fmul v0.4s, v0.4s, v13.4s\n"
6681       "fmul v1.4s, v1.4s, v13.4s\n"
6682       "fadd v0.4s, v0.4s, v14.4s\n"
6683       "fadd v1.4s, v1.4s, v14.4s\n"
6684       "fcvtzs v0.4s, v0.4s\n"
6685       "fcvtzs v1.4s, v1.4s\n"
6686 
6687       "st1 {v0.4s, v1.4s}, [%x[output]], #32\n"
6688       "prfm pldl1keep, [%x[output]]\n"
6689       "subs %x[rows], %x[rows], #1\n"
6690       "bne 1b\n"
6691       : [input] "+r"(input), [output] "+r"(output)
6692       : [count] "r"(params.count), [rows] "r"(params_rows_copy),
6693         [output_range_offset] "m"(params.output_range_offset),
6694         [input_range_scale] "m"(params.input_range_scale),
6695         [one_over_output_range_scale] "m"(params.one_over_output_range_scale),
6696         [bias_range_min] "m"(params.bias_range_min),
6697         [output_range_min] "m"(params.output_range_min),
6698         [bias_range_scale] "m"(params.bias_range_scale),
6699         [bias] "r"(params.bias), [input_range_min] "m"(params.input_range_min)
6700       : "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
6701         "v10", "v11", "v12", "v13", "v14", "cc", "memory");
6702 }
6703 
6704 template <>
6705 inline void Transform1DKernel<uint8_t, int32_t, BiasAdd<uint8_t>, 16,
Transform(const uint8_t * input,const BiasAdd<uint8_t> & params,int32_t * output)6706                               9>::Transform(const uint8_t* input,
6707                                             const BiasAdd<uint8_t>& params,
6708                                             int32_t* output) {
6709 #ifdef DEBUG
6710 #ifdef DEBUG_METAGEMM_VERBOSE
6711   std::cout << __FILE__ << "(" << __LINE__
6712             << ") BiasAdd<uint8_t><uint8_t, int32_t, BiasAdd<uint8_t>, 16, "
6713                "9>::Transform()"
6714             << std::endl
6715             << std::flush;
6716 #endif
6717 #endif
6718   int params_rows_copy = params.rows;
6719   asm volatile(
6720       "ldr w0, %[input_range_min]\n"
6721       "dup v8.4s, w0\n"
6722       "ldr w0, %[input_range_scale]\n"
6723       "dup v9.4s, w0\n"
6724       "ldr w0, %[bias_range_min]\n"
6725       "dup v10.4s, w0\n"
6726       "ldr w0, %[bias_range_scale]\n"
6727       "dup v11.4s, w0\n"
6728       "ldr w0, %[output_range_min]\n"
6729       "dup v12.4s, w0\n"
6730       "ldr w0, %[one_over_output_range_scale]\n"
6731       "dup v13.4s, w0\n"
6732       "ldr w0, %[output_range_offset]\n"
6733       "dup v14.4s, w0\n"
6734       "1:"
6735       "mov x0, %x[count]\n"
6736       "mov x1, %x[bias]\n"
6737       "subs x0, x0, #9\n"
6738       "beq 3f\n"
6739       "2:"
6740       "subs x0, x0, #16\n"
6741 
6742       // BiasAdd::Transform
6743       "ld1 {v0.4s}, [%x[input]], #16\n"
6744       "ld1 {v4.4s}, [x1], #16\n"
6745       "prfm pldl1keep, [%x[input], #32]\n"
6746       "uxtl2 v1.8h, v0.16b\n"
6747       "uxtl v0.8h, v0.8b\n"
6748       "uxtl2 v5.8h, v4.16b\n"
6749       "uxtl v4.8h, v4.8b\n"
6750       "sxtl2 v3.4s, v1.8h\n"
6751       "sxtl v2.4s, v1.4h\n"
6752       "sxtl2 v7.4s, v5.8h\n"
6753       "sxtl v6.4s, v5.4h\n"
6754       "sxtl2 v1.4s, v0.8h\n"
6755       "sxtl v0.4s, v0.4h\n"
6756       "sxtl2 v5.4s, v4.8h\n"
6757       "sxtl v4.4s, v4.4h\n"
6758       "scvtf v0.4s, v0.4s\n"
6759       "scvtf v1.4s, v1.4s\n"
6760       "scvtf v2.4s, v2.4s\n"
6761       "scvtf v3.4s, v3.4s\n"
6762       "scvtf v4.4s, v4.4s\n"
6763       "scvtf v5.4s, v5.4s\n"
6764       "scvtf v6.4s, v6.4s\n"
6765       "scvtf v7.4s, v7.4s\n"
6766       "fmul v0.4s, v0.4s, v9.4s\n"
6767       "fmul v1.4s, v1.4s, v9.4s\n"
6768       "fmul v2.4s, v2.4s, v9.4s\n"
6769       "fmul v3.4s, v3.4s, v9.4s\n"
6770       "fmul v4.4s, v4.4s, v11.4s\n"
6771       "fmul v5.4s, v5.4s, v11.4s\n"
6772       "fmul v6.4s, v6.4s, v11.4s\n"
6773       "fmul v7.4s, v7.4s, v11.4s\n"
6774       "fadd v0.4s, v0.4s, v8.4s\n"
6775       "fadd v1.4s, v1.4s, v8.4s\n"
6776       "fadd v2.4s, v2.4s, v8.4s\n"
6777       "fadd v3.4s, v3.4s, v8.4s\n"
6778       "fadd v4.4s, v4.4s, v10.4s\n"
6779       "fadd v5.4s, v5.4s, v10.4s\n"
6780       "fadd v6.4s, v6.4s, v10.4s\n"
6781       "fadd v7.4s, v7.4s, v10.4s\n"
6782       "fadd v0.4s, v0.4s, v4.4s\n"
6783       "fadd v1.4s, v1.4s, v5.4s\n"
6784       "fadd v2.4s, v2.4s, v6.4s\n"
6785       "fadd v3.4s, v3.4s, v7.4s\n"
6786       "fsub v0.4s, v0.4s, v12.4s\n"
6787       "fsub v1.4s, v1.4s, v12.4s\n"
6788       "fsub v2.4s, v2.4s, v12.4s\n"
6789       "fsub v3.4s, v3.4s, v12.4s\n"
6790       "fmul v0.4s, v0.4s, v13.4s\n"
6791       "fmul v1.4s, v1.4s, v13.4s\n"
6792       "fmul v2.4s, v2.4s, v13.4s\n"
6793       "fmul v3.4s, v3.4s, v13.4s\n"
6794       "fadd v0.4s, v0.4s, v14.4s\n"
6795       "fadd v1.4s, v1.4s, v14.4s\n"
6796       "fadd v2.4s, v2.4s, v14.4s\n"
6797       "fadd v3.4s, v3.4s, v14.4s\n"
6798       "fcvtzs v0.4s, v0.4s\n"
6799       "fcvtzs v1.4s, v1.4s\n"
6800       "fcvtzs v2.4s, v2.4s\n"
6801       "fcvtzs v3.4s, v3.4s\n"
6802 
6803       "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
6804       "prfm pldl1keep, [%x[output]]\n"
6805       "bne 2b\n"
6806       "3:"
6807 
6808       // BiasAdd::Transform
6809       "ld1 {v0.2s}, [%x[input]], #8\n"
6810       "ld1 {v0.b}[8], [%x[input]], #1\n"
6811       "ld1 {v3.2s}, [x1], #8\n"
6812       "ld1 {v3.b}[8], [x1], #1\n"
6813       "prfm pldl1keep, [%x[input], #32]\n"
6814       "uxtl2 v1.8h, v0.16b\n"
6815       "uxtl v0.8h, v0.8b\n"
6816       "uxtl2 v4.8h, v3.16b\n"
6817       "uxtl v3.8h, v3.8b\n"
6818       "sxtl v2.4s, v1.4h\n"
6819       "sxtl v5.4s, v4.4h\n"
6820       "sxtl2 v1.4s, v0.8h\n"
6821       "sxtl v0.4s, v0.4h\n"
6822       "sxtl2 v4.4s, v3.8h\n"
6823       "sxtl v3.4s, v3.4h\n"
6824       "scvtf v0.4s, v0.4s\n"
6825       "scvtf v1.4s, v1.4s\n"
6826       "scvtf v2.4s, v2.4s\n"
6827       "scvtf v3.4s, v3.4s\n"
6828       "scvtf v4.4s, v4.4s\n"
6829       "scvtf v5.4s, v5.4s\n"
6830       "fmul v0.4s, v0.4s, v9.4s\n"
6831       "fmul v1.4s, v1.4s, v9.4s\n"
6832       "fmul v2.4s, v2.4s, v9.4s\n"
6833       "fmul v3.4s, v3.4s, v11.4s\n"
6834       "fmul v4.4s, v4.4s, v11.4s\n"
6835       "fmul v5.4s, v5.4s, v11.4s\n"
6836       "fadd v0.4s, v0.4s, v8.4s\n"
6837       "fadd v1.4s, v1.4s, v8.4s\n"
6838       "fadd v2.4s, v2.4s, v8.4s\n"
6839       "fadd v3.4s, v3.4s, v10.4s\n"
6840       "fadd v4.4s, v4.4s, v10.4s\n"
6841       "fadd v5.4s, v5.4s, v10.4s\n"
6842       "fadd v0.4s, v0.4s, v3.4s\n"
6843       "fadd v1.4s, v1.4s, v4.4s\n"
6844       "fadd v2.4s, v2.4s, v5.4s\n"
6845       "fsub v0.4s, v0.4s, v12.4s\n"
6846       "fsub v1.4s, v1.4s, v12.4s\n"
6847       "fsub v2.4s, v2.4s, v12.4s\n"
6848       "fmul v0.4s, v0.4s, v13.4s\n"
6849       "fmul v1.4s, v1.4s, v13.4s\n"
6850       "fmul v2.4s, v2.4s, v13.4s\n"
6851       "fadd v0.4s, v0.4s, v14.4s\n"
6852       "fadd v1.4s, v1.4s, v14.4s\n"
6853       "fadd v2.4s, v2.4s, v14.4s\n"
6854       "fcvtzs v0.4s, v0.4s\n"
6855       "fcvtzs v1.4s, v1.4s\n"
6856       "fcvtzs v2.4s, v2.4s\n"
6857 
6858       "st1 {v0.4s, v1.4s}, [%x[output]], #32\n"
6859       "st1 {v2.s}[0], [%x[output]], #4\n"
6860       "prfm pldl1keep, [%x[output]]\n"
6861       "subs %x[rows], %x[rows], #1\n"
6862       "bne 1b\n"
6863       : [input] "+r"(input), [output] "+r"(output)
6864       : [count] "r"(params.count), [rows] "r"(params_rows_copy),
6865         [output_range_offset] "m"(params.output_range_offset),
6866         [input_range_scale] "m"(params.input_range_scale),
6867         [one_over_output_range_scale] "m"(params.one_over_output_range_scale),
6868         [bias_range_min] "m"(params.bias_range_min),
6869         [output_range_min] "m"(params.output_range_min),
6870         [bias_range_scale] "m"(params.bias_range_scale),
6871         [bias] "r"(params.bias), [input_range_min] "m"(params.input_range_min)
6872       : "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
6873         "v10", "v11", "v12", "v13", "v14", "cc", "memory");
6874 }
6875 
6876 template <>
6877 inline void Transform1DKernel<uint8_t, int32_t, BiasAdd<uint8_t>, 16,
Transform(const uint8_t * input,const BiasAdd<uint8_t> & params,int32_t * output)6878                               10>::Transform(const uint8_t* input,
6879                                              const BiasAdd<uint8_t>& params,
6880                                              int32_t* output) {
6881 #ifdef DEBUG
6882 #ifdef DEBUG_METAGEMM_VERBOSE
6883   std::cout << __FILE__ << "(" << __LINE__
6884             << ") BiasAdd<uint8_t><uint8_t, int32_t, BiasAdd<uint8_t>, 16, "
6885                "10>::Transform()"
6886             << std::endl
6887             << std::flush;
6888 #endif
6889 #endif
6890   int params_rows_copy = params.rows;
6891   asm volatile(
6892       "ldr w0, %[input_range_min]\n"
6893       "dup v8.4s, w0\n"
6894       "ldr w0, %[input_range_scale]\n"
6895       "dup v9.4s, w0\n"
6896       "ldr w0, %[bias_range_min]\n"
6897       "dup v10.4s, w0\n"
6898       "ldr w0, %[bias_range_scale]\n"
6899       "dup v11.4s, w0\n"
6900       "ldr w0, %[output_range_min]\n"
6901       "dup v12.4s, w0\n"
6902       "ldr w0, %[one_over_output_range_scale]\n"
6903       "dup v13.4s, w0\n"
6904       "ldr w0, %[output_range_offset]\n"
6905       "dup v14.4s, w0\n"
6906       "1:"
6907       "mov x0, %x[count]\n"
6908       "mov x1, %x[bias]\n"
6909       "subs x0, x0, #10\n"
6910       "beq 3f\n"
6911       "2:"
6912       "subs x0, x0, #16\n"
6913 
6914       // BiasAdd::Transform
6915       "ld1 {v0.4s}, [%x[input]], #16\n"
6916       "ld1 {v4.4s}, [x1], #16\n"
6917       "prfm pldl1keep, [%x[input], #32]\n"
6918       "uxtl2 v1.8h, v0.16b\n"
6919       "uxtl v0.8h, v0.8b\n"
6920       "uxtl2 v5.8h, v4.16b\n"
6921       "uxtl v4.8h, v4.8b\n"
6922       "sxtl2 v3.4s, v1.8h\n"
6923       "sxtl v2.4s, v1.4h\n"
6924       "sxtl2 v7.4s, v5.8h\n"
6925       "sxtl v6.4s, v5.4h\n"
6926       "sxtl2 v1.4s, v0.8h\n"
6927       "sxtl v0.4s, v0.4h\n"
6928       "sxtl2 v5.4s, v4.8h\n"
6929       "sxtl v4.4s, v4.4h\n"
6930       "scvtf v0.4s, v0.4s\n"
6931       "scvtf v1.4s, v1.4s\n"
6932       "scvtf v2.4s, v2.4s\n"
6933       "scvtf v3.4s, v3.4s\n"
6934       "scvtf v4.4s, v4.4s\n"
6935       "scvtf v5.4s, v5.4s\n"
6936       "scvtf v6.4s, v6.4s\n"
6937       "scvtf v7.4s, v7.4s\n"
6938       "fmul v0.4s, v0.4s, v9.4s\n"
6939       "fmul v1.4s, v1.4s, v9.4s\n"
6940       "fmul v2.4s, v2.4s, v9.4s\n"
6941       "fmul v3.4s, v3.4s, v9.4s\n"
6942       "fmul v4.4s, v4.4s, v11.4s\n"
6943       "fmul v5.4s, v5.4s, v11.4s\n"
6944       "fmul v6.4s, v6.4s, v11.4s\n"
6945       "fmul v7.4s, v7.4s, v11.4s\n"
6946       "fadd v0.4s, v0.4s, v8.4s\n"
6947       "fadd v1.4s, v1.4s, v8.4s\n"
6948       "fadd v2.4s, v2.4s, v8.4s\n"
6949       "fadd v3.4s, v3.4s, v8.4s\n"
6950       "fadd v4.4s, v4.4s, v10.4s\n"
6951       "fadd v5.4s, v5.4s, v10.4s\n"
6952       "fadd v6.4s, v6.4s, v10.4s\n"
6953       "fadd v7.4s, v7.4s, v10.4s\n"
6954       "fadd v0.4s, v0.4s, v4.4s\n"
6955       "fadd v1.4s, v1.4s, v5.4s\n"
6956       "fadd v2.4s, v2.4s, v6.4s\n"
6957       "fadd v3.4s, v3.4s, v7.4s\n"
6958       "fsub v0.4s, v0.4s, v12.4s\n"
6959       "fsub v1.4s, v1.4s, v12.4s\n"
6960       "fsub v2.4s, v2.4s, v12.4s\n"
6961       "fsub v3.4s, v3.4s, v12.4s\n"
6962       "fmul v0.4s, v0.4s, v13.4s\n"
6963       "fmul v1.4s, v1.4s, v13.4s\n"
6964       "fmul v2.4s, v2.4s, v13.4s\n"
6965       "fmul v3.4s, v3.4s, v13.4s\n"
6966       "fadd v0.4s, v0.4s, v14.4s\n"
6967       "fadd v1.4s, v1.4s, v14.4s\n"
6968       "fadd v2.4s, v2.4s, v14.4s\n"
6969       "fadd v3.4s, v3.4s, v14.4s\n"
6970       "fcvtzs v0.4s, v0.4s\n"
6971       "fcvtzs v1.4s, v1.4s\n"
6972       "fcvtzs v2.4s, v2.4s\n"
6973       "fcvtzs v3.4s, v3.4s\n"
6974 
6975       "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
6976       "prfm pldl1keep, [%x[output]]\n"
6977       "bne 2b\n"
6978       "3:"
6979 
6980       // BiasAdd::Transform
6981       "ld1 {v0.2s}, [%x[input]], #8\n"
6982       "ld1 {v0.h}[4], [%x[input]], #2\n"
6983       "ld1 {v3.2s}, [x1], #8\n"
6984       "ld1 {v3.h}[4], [x1], #2\n"
6985       "prfm pldl1keep, [%x[input], #32]\n"
6986       "uxtl2 v1.8h, v0.16b\n"
6987       "uxtl v0.8h, v0.8b\n"
6988       "uxtl2 v4.8h, v3.16b\n"
6989       "uxtl v3.8h, v3.8b\n"
6990       "sxtl v2.4s, v1.4h\n"
6991       "sxtl v5.4s, v4.4h\n"
6992       "sxtl2 v1.4s, v0.8h\n"
6993       "sxtl v0.4s, v0.4h\n"
6994       "sxtl2 v4.4s, v3.8h\n"
6995       "sxtl v3.4s, v3.4h\n"
6996       "scvtf v0.4s, v0.4s\n"
6997       "scvtf v1.4s, v1.4s\n"
6998       "scvtf v2.4s, v2.4s\n"
6999       "scvtf v3.4s, v3.4s\n"
7000       "scvtf v4.4s, v4.4s\n"
7001       "scvtf v5.4s, v5.4s\n"
7002       "fmul v0.4s, v0.4s, v9.4s\n"
7003       "fmul v1.4s, v1.4s, v9.4s\n"
7004       "fmul v2.4s, v2.4s, v9.4s\n"
7005       "fmul v3.4s, v3.4s, v11.4s\n"
7006       "fmul v4.4s, v4.4s, v11.4s\n"
7007       "fmul v5.4s, v5.4s, v11.4s\n"
7008       "fadd v0.4s, v0.4s, v8.4s\n"
7009       "fadd v1.4s, v1.4s, v8.4s\n"
7010       "fadd v2.4s, v2.4s, v8.4s\n"
7011       "fadd v3.4s, v3.4s, v10.4s\n"
7012       "fadd v4.4s, v4.4s, v10.4s\n"
7013       "fadd v5.4s, v5.4s, v10.4s\n"
7014       "fadd v0.4s, v0.4s, v3.4s\n"
7015       "fadd v1.4s, v1.4s, v4.4s\n"
7016       "fadd v2.4s, v2.4s, v5.4s\n"
7017       "fsub v0.4s, v0.4s, v12.4s\n"
7018       "fsub v1.4s, v1.4s, v12.4s\n"
7019       "fsub v2.4s, v2.4s, v12.4s\n"
7020       "fmul v0.4s, v0.4s, v13.4s\n"
7021       "fmul v1.4s, v1.4s, v13.4s\n"
7022       "fmul v2.4s, v2.4s, v13.4s\n"
7023       "fadd v0.4s, v0.4s, v14.4s\n"
7024       "fadd v1.4s, v1.4s, v14.4s\n"
7025       "fadd v2.4s, v2.4s, v14.4s\n"
7026       "fcvtzs v0.4s, v0.4s\n"
7027       "fcvtzs v1.4s, v1.4s\n"
7028       "fcvtzs v2.4s, v2.4s\n"
7029 
7030       "st1 {v0.4s, v1.4s}, [%x[output]], #32\n"
7031       "st1 {v2.2s}, [%x[output]], #8\n"
7032       "prfm pldl1keep, [%x[output]]\n"
7033       "subs %x[rows], %x[rows], #1\n"
7034       "bne 1b\n"
7035       : [input] "+r"(input), [output] "+r"(output)
7036       : [count] "r"(params.count), [rows] "r"(params_rows_copy),
7037         [output_range_offset] "m"(params.output_range_offset),
7038         [input_range_scale] "m"(params.input_range_scale),
7039         [one_over_output_range_scale] "m"(params.one_over_output_range_scale),
7040         [bias_range_min] "m"(params.bias_range_min),
7041         [output_range_min] "m"(params.output_range_min),
7042         [bias_range_scale] "m"(params.bias_range_scale),
7043         [bias] "r"(params.bias), [input_range_min] "m"(params.input_range_min)
7044       : "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
7045         "v10", "v11", "v12", "v13", "v14", "cc", "memory");
7046 }
7047 
7048 template <>
7049 inline void Transform1DKernel<uint8_t, int32_t, BiasAdd<uint8_t>, 16,
Transform(const uint8_t * input,const BiasAdd<uint8_t> & params,int32_t * output)7050                               11>::Transform(const uint8_t* input,
7051                                              const BiasAdd<uint8_t>& params,
7052                                              int32_t* output) {
7053 #ifdef DEBUG
7054 #ifdef DEBUG_METAGEMM_VERBOSE
7055   std::cout << __FILE__ << "(" << __LINE__
7056             << ") BiasAdd<uint8_t><uint8_t, int32_t, BiasAdd<uint8_t>, 16, "
7057                "11>::Transform()"
7058             << std::endl
7059             << std::flush;
7060 #endif
7061 #endif
7062   int params_rows_copy = params.rows;
7063   asm volatile(
7064       "ldr w0, %[input_range_min]\n"
7065       "dup v8.4s, w0\n"
7066       "ldr w0, %[input_range_scale]\n"
7067       "dup v9.4s, w0\n"
7068       "ldr w0, %[bias_range_min]\n"
7069       "dup v10.4s, w0\n"
7070       "ldr w0, %[bias_range_scale]\n"
7071       "dup v11.4s, w0\n"
7072       "ldr w0, %[output_range_min]\n"
7073       "dup v12.4s, w0\n"
7074       "ldr w0, %[one_over_output_range_scale]\n"
7075       "dup v13.4s, w0\n"
7076       "ldr w0, %[output_range_offset]\n"
7077       "dup v14.4s, w0\n"
7078       "1:"
7079       "mov x0, %x[count]\n"
7080       "mov x1, %x[bias]\n"
7081       "subs x0, x0, #11\n"
7082       "beq 3f\n"
7083       "2:"
7084       "subs x0, x0, #16\n"
7085 
7086       // BiasAdd::Transform
7087       "ld1 {v0.4s}, [%x[input]], #16\n"
7088       "ld1 {v4.4s}, [x1], #16\n"
7089       "prfm pldl1keep, [%x[input], #32]\n"
7090       "uxtl2 v1.8h, v0.16b\n"
7091       "uxtl v0.8h, v0.8b\n"
7092       "uxtl2 v5.8h, v4.16b\n"
7093       "uxtl v4.8h, v4.8b\n"
7094       "sxtl2 v3.4s, v1.8h\n"
7095       "sxtl v2.4s, v1.4h\n"
7096       "sxtl2 v7.4s, v5.8h\n"
7097       "sxtl v6.4s, v5.4h\n"
7098       "sxtl2 v1.4s, v0.8h\n"
7099       "sxtl v0.4s, v0.4h\n"
7100       "sxtl2 v5.4s, v4.8h\n"
7101       "sxtl v4.4s, v4.4h\n"
7102       "scvtf v0.4s, v0.4s\n"
7103       "scvtf v1.4s, v1.4s\n"
7104       "scvtf v2.4s, v2.4s\n"
7105       "scvtf v3.4s, v3.4s\n"
7106       "scvtf v4.4s, v4.4s\n"
7107       "scvtf v5.4s, v5.4s\n"
7108       "scvtf v6.4s, v6.4s\n"
7109       "scvtf v7.4s, v7.4s\n"
7110       "fmul v0.4s, v0.4s, v9.4s\n"
7111       "fmul v1.4s, v1.4s, v9.4s\n"
7112       "fmul v2.4s, v2.4s, v9.4s\n"
7113       "fmul v3.4s, v3.4s, v9.4s\n"
7114       "fmul v4.4s, v4.4s, v11.4s\n"
7115       "fmul v5.4s, v5.4s, v11.4s\n"
7116       "fmul v6.4s, v6.4s, v11.4s\n"
7117       "fmul v7.4s, v7.4s, v11.4s\n"
7118       "fadd v0.4s, v0.4s, v8.4s\n"
7119       "fadd v1.4s, v1.4s, v8.4s\n"
7120       "fadd v2.4s, v2.4s, v8.4s\n"
7121       "fadd v3.4s, v3.4s, v8.4s\n"
7122       "fadd v4.4s, v4.4s, v10.4s\n"
7123       "fadd v5.4s, v5.4s, v10.4s\n"
7124       "fadd v6.4s, v6.4s, v10.4s\n"
7125       "fadd v7.4s, v7.4s, v10.4s\n"
7126       "fadd v0.4s, v0.4s, v4.4s\n"
7127       "fadd v1.4s, v1.4s, v5.4s\n"
7128       "fadd v2.4s, v2.4s, v6.4s\n"
7129       "fadd v3.4s, v3.4s, v7.4s\n"
7130       "fsub v0.4s, v0.4s, v12.4s\n"
7131       "fsub v1.4s, v1.4s, v12.4s\n"
7132       "fsub v2.4s, v2.4s, v12.4s\n"
7133       "fsub v3.4s, v3.4s, v12.4s\n"
7134       "fmul v0.4s, v0.4s, v13.4s\n"
7135       "fmul v1.4s, v1.4s, v13.4s\n"
7136       "fmul v2.4s, v2.4s, v13.4s\n"
7137       "fmul v3.4s, v3.4s, v13.4s\n"
7138       "fadd v0.4s, v0.4s, v14.4s\n"
7139       "fadd v1.4s, v1.4s, v14.4s\n"
7140       "fadd v2.4s, v2.4s, v14.4s\n"
7141       "fadd v3.4s, v3.4s, v14.4s\n"
7142       "fcvtzs v0.4s, v0.4s\n"
7143       "fcvtzs v1.4s, v1.4s\n"
7144       "fcvtzs v2.4s, v2.4s\n"
7145       "fcvtzs v3.4s, v3.4s\n"
7146 
7147       "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
7148       "prfm pldl1keep, [%x[output]]\n"
7149       "bne 2b\n"
7150       "3:"
7151 
7152       // BiasAdd::Transform
7153       "ld1 {v0.2s}, [%x[input]], #8\n"
7154       "ld1 {v0.h}[4], [%x[input]], #2\n"
7155       "ld1 {v0.b}[10], [%x[input]], #1\n"
7156       "ld1 {v3.2s}, [x1], #8\n"
7157       "ld1 {v3.h}[4], [x1], #2\n"
7158       "ld1 {v3.b}[10], [x1], #1\n"
7159       "prfm pldl1keep, [%x[input], #32]\n"
7160       "uxtl2 v1.8h, v0.16b\n"
7161       "uxtl v0.8h, v0.8b\n"
7162       "uxtl2 v4.8h, v3.16b\n"
7163       "uxtl v3.8h, v3.8b\n"
7164       "sxtl v2.4s, v1.4h\n"
7165       "sxtl v5.4s, v4.4h\n"
7166       "sxtl2 v1.4s, v0.8h\n"
7167       "sxtl v0.4s, v0.4h\n"
7168       "sxtl2 v4.4s, v3.8h\n"
7169       "sxtl v3.4s, v3.4h\n"
7170       "scvtf v0.4s, v0.4s\n"
7171       "scvtf v1.4s, v1.4s\n"
7172       "scvtf v2.4s, v2.4s\n"
7173       "scvtf v3.4s, v3.4s\n"
7174       "scvtf v4.4s, v4.4s\n"
7175       "scvtf v5.4s, v5.4s\n"
7176       "fmul v0.4s, v0.4s, v9.4s\n"
7177       "fmul v1.4s, v1.4s, v9.4s\n"
7178       "fmul v2.4s, v2.4s, v9.4s\n"
7179       "fmul v3.4s, v3.4s, v11.4s\n"
7180       "fmul v4.4s, v4.4s, v11.4s\n"
7181       "fmul v5.4s, v5.4s, v11.4s\n"
7182       "fadd v0.4s, v0.4s, v8.4s\n"
7183       "fadd v1.4s, v1.4s, v8.4s\n"
7184       "fadd v2.4s, v2.4s, v8.4s\n"
7185       "fadd v3.4s, v3.4s, v10.4s\n"
7186       "fadd v4.4s, v4.4s, v10.4s\n"
7187       "fadd v5.4s, v5.4s, v10.4s\n"
7188       "fadd v0.4s, v0.4s, v3.4s\n"
7189       "fadd v1.4s, v1.4s, v4.4s\n"
7190       "fadd v2.4s, v2.4s, v5.4s\n"
7191       "fsub v0.4s, v0.4s, v12.4s\n"
7192       "fsub v1.4s, v1.4s, v12.4s\n"
7193       "fsub v2.4s, v2.4s, v12.4s\n"
7194       "fmul v0.4s, v0.4s, v13.4s\n"
7195       "fmul v1.4s, v1.4s, v13.4s\n"
7196       "fmul v2.4s, v2.4s, v13.4s\n"
7197       "fadd v0.4s, v0.4s, v14.4s\n"
7198       "fadd v1.4s, v1.4s, v14.4s\n"
7199       "fadd v2.4s, v2.4s, v14.4s\n"
7200       "fcvtzs v0.4s, v0.4s\n"
7201       "fcvtzs v1.4s, v1.4s\n"
7202       "fcvtzs v2.4s, v2.4s\n"
7203 
7204       "st1 {v0.4s, v1.4s}, [%x[output]], #32\n"
7205       "st1 {v2.2s}, [%x[output]], #8\n"
7206       "st1 {v2.s}[2], [%x[output]], #4\n"
7207       "prfm pldl1keep, [%x[output]]\n"
7208       "subs %x[rows], %x[rows], #1\n"
7209       "bne 1b\n"
7210       : [input] "+r"(input), [output] "+r"(output)
7211       : [count] "r"(params.count), [rows] "r"(params_rows_copy),
7212         [output_range_offset] "m"(params.output_range_offset),
7213         [input_range_scale] "m"(params.input_range_scale),
7214         [one_over_output_range_scale] "m"(params.one_over_output_range_scale),
7215         [bias_range_min] "m"(params.bias_range_min),
7216         [output_range_min] "m"(params.output_range_min),
7217         [bias_range_scale] "m"(params.bias_range_scale),
7218         [bias] "r"(params.bias), [input_range_min] "m"(params.input_range_min)
7219       : "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
7220         "v10", "v11", "v12", "v13", "v14", "cc", "memory");
7221 }
7222 
7223 template <>
7224 inline void Transform1DKernel<uint8_t, int32_t, BiasAdd<uint8_t>, 16,
Transform(const uint8_t * input,const BiasAdd<uint8_t> & params,int32_t * output)7225                               12>::Transform(const uint8_t* input,
7226                                              const BiasAdd<uint8_t>& params,
7227                                              int32_t* output) {
7228 #ifdef DEBUG
7229 #ifdef DEBUG_METAGEMM_VERBOSE
7230   std::cout << __FILE__ << "(" << __LINE__
7231             << ") BiasAdd<uint8_t><uint8_t, int32_t, BiasAdd<uint8_t>, 16, "
7232                "12>::Transform()"
7233             << std::endl
7234             << std::flush;
7235 #endif
7236 #endif
7237   int params_rows_copy = params.rows;
7238   asm volatile(
7239       "ldr w0, %[input_range_min]\n"
7240       "dup v8.4s, w0\n"
7241       "ldr w0, %[input_range_scale]\n"
7242       "dup v9.4s, w0\n"
7243       "ldr w0, %[bias_range_min]\n"
7244       "dup v10.4s, w0\n"
7245       "ldr w0, %[bias_range_scale]\n"
7246       "dup v11.4s, w0\n"
7247       "ldr w0, %[output_range_min]\n"
7248       "dup v12.4s, w0\n"
7249       "ldr w0, %[one_over_output_range_scale]\n"
7250       "dup v13.4s, w0\n"
7251       "ldr w0, %[output_range_offset]\n"
7252       "dup v14.4s, w0\n"
7253       "1:"
7254       "mov x0, %x[count]\n"
7255       "mov x1, %x[bias]\n"
7256       "subs x0, x0, #12\n"
7257       "beq 3f\n"
7258       "2:"
7259       "subs x0, x0, #16\n"
7260 
7261       // BiasAdd::Transform
7262       "ld1 {v0.4s}, [%x[input]], #16\n"
7263       "ld1 {v4.4s}, [x1], #16\n"
7264       "prfm pldl1keep, [%x[input], #32]\n"
7265       "uxtl2 v1.8h, v0.16b\n"
7266       "uxtl v0.8h, v0.8b\n"
7267       "uxtl2 v5.8h, v4.16b\n"
7268       "uxtl v4.8h, v4.8b\n"
7269       "sxtl2 v3.4s, v1.8h\n"
7270       "sxtl v2.4s, v1.4h\n"
7271       "sxtl2 v7.4s, v5.8h\n"
7272       "sxtl v6.4s, v5.4h\n"
7273       "sxtl2 v1.4s, v0.8h\n"
7274       "sxtl v0.4s, v0.4h\n"
7275       "sxtl2 v5.4s, v4.8h\n"
7276       "sxtl v4.4s, v4.4h\n"
7277       "scvtf v0.4s, v0.4s\n"
7278       "scvtf v1.4s, v1.4s\n"
7279       "scvtf v2.4s, v2.4s\n"
7280       "scvtf v3.4s, v3.4s\n"
7281       "scvtf v4.4s, v4.4s\n"
7282       "scvtf v5.4s, v5.4s\n"
7283       "scvtf v6.4s, v6.4s\n"
7284       "scvtf v7.4s, v7.4s\n"
7285       "fmul v0.4s, v0.4s, v9.4s\n"
7286       "fmul v1.4s, v1.4s, v9.4s\n"
7287       "fmul v2.4s, v2.4s, v9.4s\n"
7288       "fmul v3.4s, v3.4s, v9.4s\n"
7289       "fmul v4.4s, v4.4s, v11.4s\n"
7290       "fmul v5.4s, v5.4s, v11.4s\n"
7291       "fmul v6.4s, v6.4s, v11.4s\n"
7292       "fmul v7.4s, v7.4s, v11.4s\n"
7293       "fadd v0.4s, v0.4s, v8.4s\n"
7294       "fadd v1.4s, v1.4s, v8.4s\n"
7295       "fadd v2.4s, v2.4s, v8.4s\n"
7296       "fadd v3.4s, v3.4s, v8.4s\n"
7297       "fadd v4.4s, v4.4s, v10.4s\n"
7298       "fadd v5.4s, v5.4s, v10.4s\n"
7299       "fadd v6.4s, v6.4s, v10.4s\n"
7300       "fadd v7.4s, v7.4s, v10.4s\n"
7301       "fadd v0.4s, v0.4s, v4.4s\n"
7302       "fadd v1.4s, v1.4s, v5.4s\n"
7303       "fadd v2.4s, v2.4s, v6.4s\n"
7304       "fadd v3.4s, v3.4s, v7.4s\n"
7305       "fsub v0.4s, v0.4s, v12.4s\n"
7306       "fsub v1.4s, v1.4s, v12.4s\n"
7307       "fsub v2.4s, v2.4s, v12.4s\n"
7308       "fsub v3.4s, v3.4s, v12.4s\n"
7309       "fmul v0.4s, v0.4s, v13.4s\n"
7310       "fmul v1.4s, v1.4s, v13.4s\n"
7311       "fmul v2.4s, v2.4s, v13.4s\n"
7312       "fmul v3.4s, v3.4s, v13.4s\n"
7313       "fadd v0.4s, v0.4s, v14.4s\n"
7314       "fadd v1.4s, v1.4s, v14.4s\n"
7315       "fadd v2.4s, v2.4s, v14.4s\n"
7316       "fadd v3.4s, v3.4s, v14.4s\n"
7317       "fcvtzs v0.4s, v0.4s\n"
7318       "fcvtzs v1.4s, v1.4s\n"
7319       "fcvtzs v2.4s, v2.4s\n"
7320       "fcvtzs v3.4s, v3.4s\n"
7321 
7322       "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
7323       "prfm pldl1keep, [%x[output]]\n"
7324       "bne 2b\n"
7325       "3:"
7326 
7327       // BiasAdd::Transform
7328       "ld1 {v0.2s}, [%x[input]], #8\n"
7329       "ld1 {v0.s}[2], [%x[input]], #4\n"
7330       "ld1 {v3.2s}, [x1], #8\n"
7331       "ld1 {v3.s}[2], [x1], #4\n"
7332       "prfm pldl1keep, [%x[input], #32]\n"
7333       "uxtl2 v1.8h, v0.16b\n"
7334       "uxtl v0.8h, v0.8b\n"
7335       "uxtl2 v4.8h, v3.16b\n"
7336       "uxtl v3.8h, v3.8b\n"
7337       "sxtl v2.4s, v1.4h\n"
7338       "sxtl v5.4s, v4.4h\n"
7339       "sxtl2 v1.4s, v0.8h\n"
7340       "sxtl v0.4s, v0.4h\n"
7341       "sxtl2 v4.4s, v3.8h\n"
7342       "sxtl v3.4s, v3.4h\n"
7343       "scvtf v0.4s, v0.4s\n"
7344       "scvtf v1.4s, v1.4s\n"
7345       "scvtf v2.4s, v2.4s\n"
7346       "scvtf v3.4s, v3.4s\n"
7347       "scvtf v4.4s, v4.4s\n"
7348       "scvtf v5.4s, v5.4s\n"
7349       "fmul v0.4s, v0.4s, v9.4s\n"
7350       "fmul v1.4s, v1.4s, v9.4s\n"
7351       "fmul v2.4s, v2.4s, v9.4s\n"
7352       "fmul v3.4s, v3.4s, v11.4s\n"
7353       "fmul v4.4s, v4.4s, v11.4s\n"
7354       "fmul v5.4s, v5.4s, v11.4s\n"
7355       "fadd v0.4s, v0.4s, v8.4s\n"
7356       "fadd v1.4s, v1.4s, v8.4s\n"
7357       "fadd v2.4s, v2.4s, v8.4s\n"
7358       "fadd v3.4s, v3.4s, v10.4s\n"
7359       "fadd v4.4s, v4.4s, v10.4s\n"
7360       "fadd v5.4s, v5.4s, v10.4s\n"
7361       "fadd v0.4s, v0.4s, v3.4s\n"
7362       "fadd v1.4s, v1.4s, v4.4s\n"
7363       "fadd v2.4s, v2.4s, v5.4s\n"
7364       "fsub v0.4s, v0.4s, v12.4s\n"
7365       "fsub v1.4s, v1.4s, v12.4s\n"
7366       "fsub v2.4s, v2.4s, v12.4s\n"
7367       "fmul v0.4s, v0.4s, v13.4s\n"
7368       "fmul v1.4s, v1.4s, v13.4s\n"
7369       "fmul v2.4s, v2.4s, v13.4s\n"
7370       "fadd v0.4s, v0.4s, v14.4s\n"
7371       "fadd v1.4s, v1.4s, v14.4s\n"
7372       "fadd v2.4s, v2.4s, v14.4s\n"
7373       "fcvtzs v0.4s, v0.4s\n"
7374       "fcvtzs v1.4s, v1.4s\n"
7375       "fcvtzs v2.4s, v2.4s\n"
7376 
7377       "st1 {v0.4s, v1.4s, v2.4s}, [%x[output]], #48\n"
7378       "prfm pldl1keep, [%x[output]]\n"
7379       "subs %x[rows], %x[rows], #1\n"
7380       "bne 1b\n"
7381       : [input] "+r"(input), [output] "+r"(output)
7382       : [count] "r"(params.count), [rows] "r"(params_rows_copy),
7383         [output_range_offset] "m"(params.output_range_offset),
7384         [input_range_scale] "m"(params.input_range_scale),
7385         [one_over_output_range_scale] "m"(params.one_over_output_range_scale),
7386         [bias_range_min] "m"(params.bias_range_min),
7387         [output_range_min] "m"(params.output_range_min),
7388         [bias_range_scale] "m"(params.bias_range_scale),
7389         [bias] "r"(params.bias), [input_range_min] "m"(params.input_range_min)
7390       : "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
7391         "v10", "v11", "v12", "v13", "v14", "cc", "memory");
7392 }
7393 
7394 template <>
7395 inline void Transform1DKernel<uint8_t, int32_t, BiasAdd<uint8_t>, 16,
Transform(const uint8_t * input,const BiasAdd<uint8_t> & params,int32_t * output)7396                               13>::Transform(const uint8_t* input,
7397                                              const BiasAdd<uint8_t>& params,
7398                                              int32_t* output) {
7399 #ifdef DEBUG
7400 #ifdef DEBUG_METAGEMM_VERBOSE
7401   std::cout << __FILE__ << "(" << __LINE__
7402             << ") BiasAdd<uint8_t><uint8_t, int32_t, BiasAdd<uint8_t>, 16, "
7403                "13>::Transform()"
7404             << std::endl
7405             << std::flush;
7406 #endif
7407 #endif
7408   int params_rows_copy = params.rows;
7409   asm volatile(
7410       "ldr w0, %[input_range_min]\n"
7411       "dup v8.4s, w0\n"
7412       "ldr w0, %[input_range_scale]\n"
7413       "dup v9.4s, w0\n"
7414       "ldr w0, %[bias_range_min]\n"
7415       "dup v10.4s, w0\n"
7416       "ldr w0, %[bias_range_scale]\n"
7417       "dup v11.4s, w0\n"
7418       "ldr w0, %[output_range_min]\n"
7419       "dup v12.4s, w0\n"
7420       "ldr w0, %[one_over_output_range_scale]\n"
7421       "dup v13.4s, w0\n"
7422       "ldr w0, %[output_range_offset]\n"
7423       "dup v14.4s, w0\n"
7424       "1:"
7425       "mov x0, %x[count]\n"
7426       "mov x1, %x[bias]\n"
7427       "subs x0, x0, #13\n"
7428       "beq 3f\n"
7429       "2:"
7430       "subs x0, x0, #16\n"
7431 
7432       // BiasAdd::Transform
7433       "ld1 {v0.4s}, [%x[input]], #16\n"
7434       "ld1 {v4.4s}, [x1], #16\n"
7435       "prfm pldl1keep, [%x[input], #32]\n"
7436       "uxtl2 v1.8h, v0.16b\n"
7437       "uxtl v0.8h, v0.8b\n"
7438       "uxtl2 v5.8h, v4.16b\n"
7439       "uxtl v4.8h, v4.8b\n"
7440       "sxtl2 v3.4s, v1.8h\n"
7441       "sxtl v2.4s, v1.4h\n"
7442       "sxtl2 v7.4s, v5.8h\n"
7443       "sxtl v6.4s, v5.4h\n"
7444       "sxtl2 v1.4s, v0.8h\n"
7445       "sxtl v0.4s, v0.4h\n"
7446       "sxtl2 v5.4s, v4.8h\n"
7447       "sxtl v4.4s, v4.4h\n"
7448       "scvtf v0.4s, v0.4s\n"
7449       "scvtf v1.4s, v1.4s\n"
7450       "scvtf v2.4s, v2.4s\n"
7451       "scvtf v3.4s, v3.4s\n"
7452       "scvtf v4.4s, v4.4s\n"
7453       "scvtf v5.4s, v5.4s\n"
7454       "scvtf v6.4s, v6.4s\n"
7455       "scvtf v7.4s, v7.4s\n"
7456       "fmul v0.4s, v0.4s, v9.4s\n"
7457       "fmul v1.4s, v1.4s, v9.4s\n"
7458       "fmul v2.4s, v2.4s, v9.4s\n"
7459       "fmul v3.4s, v3.4s, v9.4s\n"
7460       "fmul v4.4s, v4.4s, v11.4s\n"
7461       "fmul v5.4s, v5.4s, v11.4s\n"
7462       "fmul v6.4s, v6.4s, v11.4s\n"
7463       "fmul v7.4s, v7.4s, v11.4s\n"
7464       "fadd v0.4s, v0.4s, v8.4s\n"
7465       "fadd v1.4s, v1.4s, v8.4s\n"
7466       "fadd v2.4s, v2.4s, v8.4s\n"
7467       "fadd v3.4s, v3.4s, v8.4s\n"
7468       "fadd v4.4s, v4.4s, v10.4s\n"
7469       "fadd v5.4s, v5.4s, v10.4s\n"
7470       "fadd v6.4s, v6.4s, v10.4s\n"
7471       "fadd v7.4s, v7.4s, v10.4s\n"
7472       "fadd v0.4s, v0.4s, v4.4s\n"
7473       "fadd v1.4s, v1.4s, v5.4s\n"
7474       "fadd v2.4s, v2.4s, v6.4s\n"
7475       "fadd v3.4s, v3.4s, v7.4s\n"
7476       "fsub v0.4s, v0.4s, v12.4s\n"
7477       "fsub v1.4s, v1.4s, v12.4s\n"
7478       "fsub v2.4s, v2.4s, v12.4s\n"
7479       "fsub v3.4s, v3.4s, v12.4s\n"
7480       "fmul v0.4s, v0.4s, v13.4s\n"
7481       "fmul v1.4s, v1.4s, v13.4s\n"
7482       "fmul v2.4s, v2.4s, v13.4s\n"
7483       "fmul v3.4s, v3.4s, v13.4s\n"
7484       "fadd v0.4s, v0.4s, v14.4s\n"
7485       "fadd v1.4s, v1.4s, v14.4s\n"
7486       "fadd v2.4s, v2.4s, v14.4s\n"
7487       "fadd v3.4s, v3.4s, v14.4s\n"
7488       "fcvtzs v0.4s, v0.4s\n"
7489       "fcvtzs v1.4s, v1.4s\n"
7490       "fcvtzs v2.4s, v2.4s\n"
7491       "fcvtzs v3.4s, v3.4s\n"
7492 
7493       "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
7494       "prfm pldl1keep, [%x[output]]\n"
7495       "bne 2b\n"
7496       "3:"
7497 
7498       // BiasAdd::Transform
7499       "ld1 {v0.2s}, [%x[input]], #8\n"
7500       "ld1 {v0.s}[2], [%x[input]], #4\n"
7501       "ld1 {v0.b}[12], [%x[input]], #1\n"
7502       "ld1 {v4.2s}, [x1], #8\n"
7503       "ld1 {v4.s}[2], [x1], #4\n"
7504       "ld1 {v4.b}[12], [x1], #1\n"
7505       "prfm pldl1keep, [%x[input], #32]\n"
7506       "uxtl2 v1.8h, v0.16b\n"
7507       "uxtl v0.8h, v0.8b\n"
7508       "uxtl2 v5.8h, v4.16b\n"
7509       "uxtl v4.8h, v4.8b\n"
7510       "sxtl2 v3.4s, v1.8h\n"
7511       "sxtl v2.4s, v1.4h\n"
7512       "sxtl2 v7.4s, v5.8h\n"
7513       "sxtl v6.4s, v5.4h\n"
7514       "sxtl2 v1.4s, v0.8h\n"
7515       "sxtl v0.4s, v0.4h\n"
7516       "sxtl2 v5.4s, v4.8h\n"
7517       "sxtl v4.4s, v4.4h\n"
7518       "scvtf v0.4s, v0.4s\n"
7519       "scvtf v1.4s, v1.4s\n"
7520       "scvtf v2.4s, v2.4s\n"
7521       "scvtf v3.4s, v3.4s\n"
7522       "scvtf v4.4s, v4.4s\n"
7523       "scvtf v5.4s, v5.4s\n"
7524       "scvtf v6.4s, v6.4s\n"
7525       "scvtf v7.4s, v7.4s\n"
7526       "fmul v0.4s, v0.4s, v9.4s\n"
7527       "fmul v1.4s, v1.4s, v9.4s\n"
7528       "fmul v2.4s, v2.4s, v9.4s\n"
7529       "fmul v3.4s, v3.4s, v9.4s\n"
7530       "fmul v4.4s, v4.4s, v11.4s\n"
7531       "fmul v5.4s, v5.4s, v11.4s\n"
7532       "fmul v6.4s, v6.4s, v11.4s\n"
7533       "fmul v7.4s, v7.4s, v11.4s\n"
7534       "fadd v0.4s, v0.4s, v8.4s\n"
7535       "fadd v1.4s, v1.4s, v8.4s\n"
7536       "fadd v2.4s, v2.4s, v8.4s\n"
7537       "fadd v3.4s, v3.4s, v8.4s\n"
7538       "fadd v4.4s, v4.4s, v10.4s\n"
7539       "fadd v5.4s, v5.4s, v10.4s\n"
7540       "fadd v6.4s, v6.4s, v10.4s\n"
7541       "fadd v7.4s, v7.4s, v10.4s\n"
7542       "fadd v0.4s, v0.4s, v4.4s\n"
7543       "fadd v1.4s, v1.4s, v5.4s\n"
7544       "fadd v2.4s, v2.4s, v6.4s\n"
7545       "fadd v3.4s, v3.4s, v7.4s\n"
7546       "fsub v0.4s, v0.4s, v12.4s\n"
7547       "fsub v1.4s, v1.4s, v12.4s\n"
7548       "fsub v2.4s, v2.4s, v12.4s\n"
7549       "fsub v3.4s, v3.4s, v12.4s\n"
7550       "fmul v0.4s, v0.4s, v13.4s\n"
7551       "fmul v1.4s, v1.4s, v13.4s\n"
7552       "fmul v2.4s, v2.4s, v13.4s\n"
7553       "fmul v3.4s, v3.4s, v13.4s\n"
7554       "fadd v0.4s, v0.4s, v14.4s\n"
7555       "fadd v1.4s, v1.4s, v14.4s\n"
7556       "fadd v2.4s, v2.4s, v14.4s\n"
7557       "fadd v3.4s, v3.4s, v14.4s\n"
7558       "fcvtzs v0.4s, v0.4s\n"
7559       "fcvtzs v1.4s, v1.4s\n"
7560       "fcvtzs v2.4s, v2.4s\n"
7561       "fcvtzs v3.4s, v3.4s\n"
7562 
7563       "st1 {v0.4s, v1.4s, v2.4s}, [%x[output]], #48\n"
7564       "st1 {v3.s}[0], [%x[output]], #4\n"
7565       "prfm pldl1keep, [%x[output]]\n"
7566       "subs %x[rows], %x[rows], #1\n"
7567       "bne 1b\n"
7568       : [input] "+r"(input), [output] "+r"(output)
7569       : [count] "r"(params.count), [rows] "r"(params_rows_copy),
7570         [output_range_offset] "m"(params.output_range_offset),
7571         [input_range_scale] "m"(params.input_range_scale),
7572         [one_over_output_range_scale] "m"(params.one_over_output_range_scale),
7573         [bias_range_min] "m"(params.bias_range_min),
7574         [output_range_min] "m"(params.output_range_min),
7575         [bias_range_scale] "m"(params.bias_range_scale),
7576         [bias] "r"(params.bias), [input_range_min] "m"(params.input_range_min)
7577       : "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
7578         "v10", "v11", "v12", "v13", "v14", "cc", "memory");
7579 }
7580 
7581 template <>
7582 inline void Transform1DKernel<uint8_t, int32_t, BiasAdd<uint8_t>, 16,
Transform(const uint8_t * input,const BiasAdd<uint8_t> & params,int32_t * output)7583                               14>::Transform(const uint8_t* input,
7584                                              const BiasAdd<uint8_t>& params,
7585                                              int32_t* output) {
7586 #ifdef DEBUG
7587 #ifdef DEBUG_METAGEMM_VERBOSE
7588   std::cout << __FILE__ << "(" << __LINE__
7589             << ") BiasAdd<uint8_t><uint8_t, int32_t, BiasAdd<uint8_t>, 16, "
7590                "14>::Transform()"
7591             << std::endl
7592             << std::flush;
7593 #endif
7594 #endif
7595   int params_rows_copy = params.rows;
7596   asm volatile(
7597       "ldr w0, %[input_range_min]\n"
7598       "dup v8.4s, w0\n"
7599       "ldr w0, %[input_range_scale]\n"
7600       "dup v9.4s, w0\n"
7601       "ldr w0, %[bias_range_min]\n"
7602       "dup v10.4s, w0\n"
7603       "ldr w0, %[bias_range_scale]\n"
7604       "dup v11.4s, w0\n"
7605       "ldr w0, %[output_range_min]\n"
7606       "dup v12.4s, w0\n"
7607       "ldr w0, %[one_over_output_range_scale]\n"
7608       "dup v13.4s, w0\n"
7609       "ldr w0, %[output_range_offset]\n"
7610       "dup v14.4s, w0\n"
7611       "1:"
7612       "mov x0, %x[count]\n"
7613       "mov x1, %x[bias]\n"
7614       "subs x0, x0, #14\n"
7615       "beq 3f\n"
7616       "2:"
7617       "subs x0, x0, #16\n"
7618 
7619       // BiasAdd::Transform
7620       "ld1 {v0.4s}, [%x[input]], #16\n"
7621       "ld1 {v4.4s}, [x1], #16\n"
7622       "prfm pldl1keep, [%x[input], #32]\n"
7623       "uxtl2 v1.8h, v0.16b\n"
7624       "uxtl v0.8h, v0.8b\n"
7625       "uxtl2 v5.8h, v4.16b\n"
7626       "uxtl v4.8h, v4.8b\n"
7627       "sxtl2 v3.4s, v1.8h\n"
7628       "sxtl v2.4s, v1.4h\n"
7629       "sxtl2 v7.4s, v5.8h\n"
7630       "sxtl v6.4s, v5.4h\n"
7631       "sxtl2 v1.4s, v0.8h\n"
7632       "sxtl v0.4s, v0.4h\n"
7633       "sxtl2 v5.4s, v4.8h\n"
7634       "sxtl v4.4s, v4.4h\n"
7635       "scvtf v0.4s, v0.4s\n"
7636       "scvtf v1.4s, v1.4s\n"
7637       "scvtf v2.4s, v2.4s\n"
7638       "scvtf v3.4s, v3.4s\n"
7639       "scvtf v4.4s, v4.4s\n"
7640       "scvtf v5.4s, v5.4s\n"
7641       "scvtf v6.4s, v6.4s\n"
7642       "scvtf v7.4s, v7.4s\n"
7643       "fmul v0.4s, v0.4s, v9.4s\n"
7644       "fmul v1.4s, v1.4s, v9.4s\n"
7645       "fmul v2.4s, v2.4s, v9.4s\n"
7646       "fmul v3.4s, v3.4s, v9.4s\n"
7647       "fmul v4.4s, v4.4s, v11.4s\n"
7648       "fmul v5.4s, v5.4s, v11.4s\n"
7649       "fmul v6.4s, v6.4s, v11.4s\n"
7650       "fmul v7.4s, v7.4s, v11.4s\n"
7651       "fadd v0.4s, v0.4s, v8.4s\n"
7652       "fadd v1.4s, v1.4s, v8.4s\n"
7653       "fadd v2.4s, v2.4s, v8.4s\n"
7654       "fadd v3.4s, v3.4s, v8.4s\n"
7655       "fadd v4.4s, v4.4s, v10.4s\n"
7656       "fadd v5.4s, v5.4s, v10.4s\n"
7657       "fadd v6.4s, v6.4s, v10.4s\n"
7658       "fadd v7.4s, v7.4s, v10.4s\n"
7659       "fadd v0.4s, v0.4s, v4.4s\n"
7660       "fadd v1.4s, v1.4s, v5.4s\n"
7661       "fadd v2.4s, v2.4s, v6.4s\n"
7662       "fadd v3.4s, v3.4s, v7.4s\n"
7663       "fsub v0.4s, v0.4s, v12.4s\n"
7664       "fsub v1.4s, v1.4s, v12.4s\n"
7665       "fsub v2.4s, v2.4s, v12.4s\n"
7666       "fsub v3.4s, v3.4s, v12.4s\n"
7667       "fmul v0.4s, v0.4s, v13.4s\n"
7668       "fmul v1.4s, v1.4s, v13.4s\n"
7669       "fmul v2.4s, v2.4s, v13.4s\n"
7670       "fmul v3.4s, v3.4s, v13.4s\n"
7671       "fadd v0.4s, v0.4s, v14.4s\n"
7672       "fadd v1.4s, v1.4s, v14.4s\n"
7673       "fadd v2.4s, v2.4s, v14.4s\n"
7674       "fadd v3.4s, v3.4s, v14.4s\n"
7675       "fcvtzs v0.4s, v0.4s\n"
7676       "fcvtzs v1.4s, v1.4s\n"
7677       "fcvtzs v2.4s, v2.4s\n"
7678       "fcvtzs v3.4s, v3.4s\n"
7679 
7680       "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
7681       "prfm pldl1keep, [%x[output]]\n"
7682       "bne 2b\n"
7683       "3:"
7684 
7685       // BiasAdd::Transform
7686       "ld1 {v0.2s}, [%x[input]], #8\n"
7687       "ld1 {v0.s}[2], [%x[input]], #4\n"
7688       "ld1 {v0.h}[6], [%x[input]], #2\n"
7689       "ld1 {v4.2s}, [x1], #8\n"
7690       "ld1 {v4.s}[2], [x1], #4\n"
7691       "ld1 {v4.h}[6], [x1], #2\n"
7692       "prfm pldl1keep, [%x[input], #32]\n"
7693       "uxtl2 v1.8h, v0.16b\n"
7694       "uxtl v0.8h, v0.8b\n"
7695       "uxtl2 v5.8h, v4.16b\n"
7696       "uxtl v4.8h, v4.8b\n"
7697       "sxtl2 v3.4s, v1.8h\n"
7698       "sxtl v2.4s, v1.4h\n"
7699       "sxtl2 v7.4s, v5.8h\n"
7700       "sxtl v6.4s, v5.4h\n"
7701       "sxtl2 v1.4s, v0.8h\n"
7702       "sxtl v0.4s, v0.4h\n"
7703       "sxtl2 v5.4s, v4.8h\n"
7704       "sxtl v4.4s, v4.4h\n"
7705       "scvtf v0.4s, v0.4s\n"
7706       "scvtf v1.4s, v1.4s\n"
7707       "scvtf v2.4s, v2.4s\n"
7708       "scvtf v3.4s, v3.4s\n"
7709       "scvtf v4.4s, v4.4s\n"
7710       "scvtf v5.4s, v5.4s\n"
7711       "scvtf v6.4s, v6.4s\n"
7712       "scvtf v7.4s, v7.4s\n"
7713       "fmul v0.4s, v0.4s, v9.4s\n"
7714       "fmul v1.4s, v1.4s, v9.4s\n"
7715       "fmul v2.4s, v2.4s, v9.4s\n"
7716       "fmul v3.4s, v3.4s, v9.4s\n"
7717       "fmul v4.4s, v4.4s, v11.4s\n"
7718       "fmul v5.4s, v5.4s, v11.4s\n"
7719       "fmul v6.4s, v6.4s, v11.4s\n"
7720       "fmul v7.4s, v7.4s, v11.4s\n"
7721       "fadd v0.4s, v0.4s, v8.4s\n"
7722       "fadd v1.4s, v1.4s, v8.4s\n"
7723       "fadd v2.4s, v2.4s, v8.4s\n"
7724       "fadd v3.4s, v3.4s, v8.4s\n"
7725       "fadd v4.4s, v4.4s, v10.4s\n"
7726       "fadd v5.4s, v5.4s, v10.4s\n"
7727       "fadd v6.4s, v6.4s, v10.4s\n"
7728       "fadd v7.4s, v7.4s, v10.4s\n"
7729       "fadd v0.4s, v0.4s, v4.4s\n"
7730       "fadd v1.4s, v1.4s, v5.4s\n"
7731       "fadd v2.4s, v2.4s, v6.4s\n"
7732       "fadd v3.4s, v3.4s, v7.4s\n"
7733       "fsub v0.4s, v0.4s, v12.4s\n"
7734       "fsub v1.4s, v1.4s, v12.4s\n"
7735       "fsub v2.4s, v2.4s, v12.4s\n"
7736       "fsub v3.4s, v3.4s, v12.4s\n"
7737       "fmul v0.4s, v0.4s, v13.4s\n"
7738       "fmul v1.4s, v1.4s, v13.4s\n"
7739       "fmul v2.4s, v2.4s, v13.4s\n"
7740       "fmul v3.4s, v3.4s, v13.4s\n"
7741       "fadd v0.4s, v0.4s, v14.4s\n"
7742       "fadd v1.4s, v1.4s, v14.4s\n"
7743       "fadd v2.4s, v2.4s, v14.4s\n"
7744       "fadd v3.4s, v3.4s, v14.4s\n"
7745       "fcvtzs v0.4s, v0.4s\n"
7746       "fcvtzs v1.4s, v1.4s\n"
7747       "fcvtzs v2.4s, v2.4s\n"
7748       "fcvtzs v3.4s, v3.4s\n"
7749 
7750       "st1 {v0.4s, v1.4s, v2.4s}, [%x[output]], #48\n"
7751       "st1 {v3.2s}, [%x[output]], #8\n"
7752       "prfm pldl1keep, [%x[output]]\n"
7753       "subs %x[rows], %x[rows], #1\n"
7754       "bne 1b\n"
7755       : [input] "+r"(input), [output] "+r"(output)
7756       : [count] "r"(params.count), [rows] "r"(params_rows_copy),
7757         [output_range_offset] "m"(params.output_range_offset),
7758         [input_range_scale] "m"(params.input_range_scale),
7759         [one_over_output_range_scale] "m"(params.one_over_output_range_scale),
7760         [bias_range_min] "m"(params.bias_range_min),
7761         [output_range_min] "m"(params.output_range_min),
7762         [bias_range_scale] "m"(params.bias_range_scale),
7763         [bias] "r"(params.bias), [input_range_min] "m"(params.input_range_min)
7764       : "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
7765         "v10", "v11", "v12", "v13", "v14", "cc", "memory");
7766 }
7767 
7768 template <>
7769 inline void Transform1DKernel<uint8_t, int32_t, BiasAdd<uint8_t>, 16,
Transform(const uint8_t * input,const BiasAdd<uint8_t> & params,int32_t * output)7770                               15>::Transform(const uint8_t* input,
7771                                              const BiasAdd<uint8_t>& params,
7772                                              int32_t* output) {
7773 #ifdef DEBUG
7774 #ifdef DEBUG_METAGEMM_VERBOSE
7775   std::cout << __FILE__ << "(" << __LINE__
7776             << ") BiasAdd<uint8_t><uint8_t, int32_t, BiasAdd<uint8_t>, 16, "
7777                "15>::Transform()"
7778             << std::endl
7779             << std::flush;
7780 #endif
7781 #endif
7782   int params_rows_copy = params.rows;
7783   asm volatile(
7784       "ldr w0, %[input_range_min]\n"
7785       "dup v8.4s, w0\n"
7786       "ldr w0, %[input_range_scale]\n"
7787       "dup v9.4s, w0\n"
7788       "ldr w0, %[bias_range_min]\n"
7789       "dup v10.4s, w0\n"
7790       "ldr w0, %[bias_range_scale]\n"
7791       "dup v11.4s, w0\n"
7792       "ldr w0, %[output_range_min]\n"
7793       "dup v12.4s, w0\n"
7794       "ldr w0, %[one_over_output_range_scale]\n"
7795       "dup v13.4s, w0\n"
7796       "ldr w0, %[output_range_offset]\n"
7797       "dup v14.4s, w0\n"
7798       "1:"
7799       "mov x0, %x[count]\n"
7800       "mov x1, %x[bias]\n"
7801       "subs x0, x0, #15\n"
7802       "beq 3f\n"
7803       "2:"
7804       "subs x0, x0, #16\n"
7805 
7806       // BiasAdd::Transform
7807       "ld1 {v0.4s}, [%x[input]], #16\n"
7808       "ld1 {v4.4s}, [x1], #16\n"
7809       "prfm pldl1keep, [%x[input], #32]\n"
7810       "uxtl2 v1.8h, v0.16b\n"
7811       "uxtl v0.8h, v0.8b\n"
7812       "uxtl2 v5.8h, v4.16b\n"
7813       "uxtl v4.8h, v4.8b\n"
7814       "sxtl2 v3.4s, v1.8h\n"
7815       "sxtl v2.4s, v1.4h\n"
7816       "sxtl2 v7.4s, v5.8h\n"
7817       "sxtl v6.4s, v5.4h\n"
7818       "sxtl2 v1.4s, v0.8h\n"
7819       "sxtl v0.4s, v0.4h\n"
7820       "sxtl2 v5.4s, v4.8h\n"
7821       "sxtl v4.4s, v4.4h\n"
7822       "scvtf v0.4s, v0.4s\n"
7823       "scvtf v1.4s, v1.4s\n"
7824       "scvtf v2.4s, v2.4s\n"
7825       "scvtf v3.4s, v3.4s\n"
7826       "scvtf v4.4s, v4.4s\n"
7827       "scvtf v5.4s, v5.4s\n"
7828       "scvtf v6.4s, v6.4s\n"
7829       "scvtf v7.4s, v7.4s\n"
7830       "fmul v0.4s, v0.4s, v9.4s\n"
7831       "fmul v1.4s, v1.4s, v9.4s\n"
7832       "fmul v2.4s, v2.4s, v9.4s\n"
7833       "fmul v3.4s, v3.4s, v9.4s\n"
7834       "fmul v4.4s, v4.4s, v11.4s\n"
7835       "fmul v5.4s, v5.4s, v11.4s\n"
7836       "fmul v6.4s, v6.4s, v11.4s\n"
7837       "fmul v7.4s, v7.4s, v11.4s\n"
7838       "fadd v0.4s, v0.4s, v8.4s\n"
7839       "fadd v1.4s, v1.4s, v8.4s\n"
7840       "fadd v2.4s, v2.4s, v8.4s\n"
7841       "fadd v3.4s, v3.4s, v8.4s\n"
7842       "fadd v4.4s, v4.4s, v10.4s\n"
7843       "fadd v5.4s, v5.4s, v10.4s\n"
7844       "fadd v6.4s, v6.4s, v10.4s\n"
7845       "fadd v7.4s, v7.4s, v10.4s\n"
7846       "fadd v0.4s, v0.4s, v4.4s\n"
7847       "fadd v1.4s, v1.4s, v5.4s\n"
7848       "fadd v2.4s, v2.4s, v6.4s\n"
7849       "fadd v3.4s, v3.4s, v7.4s\n"
7850       "fsub v0.4s, v0.4s, v12.4s\n"
7851       "fsub v1.4s, v1.4s, v12.4s\n"
7852       "fsub v2.4s, v2.4s, v12.4s\n"
7853       "fsub v3.4s, v3.4s, v12.4s\n"
7854       "fmul v0.4s, v0.4s, v13.4s\n"
7855       "fmul v1.4s, v1.4s, v13.4s\n"
7856       "fmul v2.4s, v2.4s, v13.4s\n"
7857       "fmul v3.4s, v3.4s, v13.4s\n"
7858       "fadd v0.4s, v0.4s, v14.4s\n"
7859       "fadd v1.4s, v1.4s, v14.4s\n"
7860       "fadd v2.4s, v2.4s, v14.4s\n"
7861       "fadd v3.4s, v3.4s, v14.4s\n"
7862       "fcvtzs v0.4s, v0.4s\n"
7863       "fcvtzs v1.4s, v1.4s\n"
7864       "fcvtzs v2.4s, v2.4s\n"
7865       "fcvtzs v3.4s, v3.4s\n"
7866 
7867       "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
7868       "prfm pldl1keep, [%x[output]]\n"
7869       "bne 2b\n"
7870       "3:"
7871 
7872       // BiasAdd::Transform
7873       "ld1 {v0.2s}, [%x[input]], #8\n"
7874       "ld1 {v0.s}[2], [%x[input]], #4\n"
7875       "ld1 {v0.h}[6], [%x[input]], #2\n"
7876       "ld1 {v0.b}[14], [%x[input]], #1\n"
7877       "ld1 {v4.2s}, [x1], #8\n"
7878       "ld1 {v4.s}[2], [x1], #4\n"
7879       "ld1 {v4.h}[6], [x1], #2\n"
7880       "ld1 {v4.b}[14], [x1], #1\n"
7881       "prfm pldl1keep, [%x[input], #32]\n"
7882       "uxtl2 v1.8h, v0.16b\n"
7883       "uxtl v0.8h, v0.8b\n"
7884       "uxtl2 v5.8h, v4.16b\n"
7885       "uxtl v4.8h, v4.8b\n"
7886       "sxtl2 v3.4s, v1.8h\n"
7887       "sxtl v2.4s, v1.4h\n"
7888       "sxtl2 v7.4s, v5.8h\n"
7889       "sxtl v6.4s, v5.4h\n"
7890       "sxtl2 v1.4s, v0.8h\n"
7891       "sxtl v0.4s, v0.4h\n"
7892       "sxtl2 v5.4s, v4.8h\n"
7893       "sxtl v4.4s, v4.4h\n"
7894       "scvtf v0.4s, v0.4s\n"
7895       "scvtf v1.4s, v1.4s\n"
7896       "scvtf v2.4s, v2.4s\n"
7897       "scvtf v3.4s, v3.4s\n"
7898       "scvtf v4.4s, v4.4s\n"
7899       "scvtf v5.4s, v5.4s\n"
7900       "scvtf v6.4s, v6.4s\n"
7901       "scvtf v7.4s, v7.4s\n"
7902       "fmul v0.4s, v0.4s, v9.4s\n"
7903       "fmul v1.4s, v1.4s, v9.4s\n"
7904       "fmul v2.4s, v2.4s, v9.4s\n"
7905       "fmul v3.4s, v3.4s, v9.4s\n"
7906       "fmul v4.4s, v4.4s, v11.4s\n"
7907       "fmul v5.4s, v5.4s, v11.4s\n"
7908       "fmul v6.4s, v6.4s, v11.4s\n"
7909       "fmul v7.4s, v7.4s, v11.4s\n"
7910       "fadd v0.4s, v0.4s, v8.4s\n"
7911       "fadd v1.4s, v1.4s, v8.4s\n"
7912       "fadd v2.4s, v2.4s, v8.4s\n"
7913       "fadd v3.4s, v3.4s, v8.4s\n"
7914       "fadd v4.4s, v4.4s, v10.4s\n"
7915       "fadd v5.4s, v5.4s, v10.4s\n"
7916       "fadd v6.4s, v6.4s, v10.4s\n"
7917       "fadd v7.4s, v7.4s, v10.4s\n"
7918       "fadd v0.4s, v0.4s, v4.4s\n"
7919       "fadd v1.4s, v1.4s, v5.4s\n"
7920       "fadd v2.4s, v2.4s, v6.4s\n"
7921       "fadd v3.4s, v3.4s, v7.4s\n"
7922       "fsub v0.4s, v0.4s, v12.4s\n"
7923       "fsub v1.4s, v1.4s, v12.4s\n"
7924       "fsub v2.4s, v2.4s, v12.4s\n"
7925       "fsub v3.4s, v3.4s, v12.4s\n"
7926       "fmul v0.4s, v0.4s, v13.4s\n"
7927       "fmul v1.4s, v1.4s, v13.4s\n"
7928       "fmul v2.4s, v2.4s, v13.4s\n"
7929       "fmul v3.4s, v3.4s, v13.4s\n"
7930       "fadd v0.4s, v0.4s, v14.4s\n"
7931       "fadd v1.4s, v1.4s, v14.4s\n"
7932       "fadd v2.4s, v2.4s, v14.4s\n"
7933       "fadd v3.4s, v3.4s, v14.4s\n"
7934       "fcvtzs v0.4s, v0.4s\n"
7935       "fcvtzs v1.4s, v1.4s\n"
7936       "fcvtzs v2.4s, v2.4s\n"
7937       "fcvtzs v3.4s, v3.4s\n"
7938 
7939       "st1 {v0.4s, v1.4s, v2.4s}, [%x[output]], #48\n"
7940       "st1 {v3.2s}, [%x[output]], #8\n"
7941       "st1 {v3.s}[2], [%x[output]], #4\n"
7942       "prfm pldl1keep, [%x[output]]\n"
7943       "subs %x[rows], %x[rows], #1\n"
7944       "bne 1b\n"
7945       : [input] "+r"(input), [output] "+r"(output)
7946       : [count] "r"(params.count), [rows] "r"(params_rows_copy),
7947         [output_range_offset] "m"(params.output_range_offset),
7948         [input_range_scale] "m"(params.input_range_scale),
7949         [one_over_output_range_scale] "m"(params.one_over_output_range_scale),
7950         [bias_range_min] "m"(params.bias_range_min),
7951         [output_range_min] "m"(params.output_range_min),
7952         [bias_range_scale] "m"(params.bias_range_scale),
7953         [bias] "r"(params.bias), [input_range_min] "m"(params.input_range_min)
7954       : "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
7955         "v10", "v11", "v12", "v13", "v14", "cc", "memory");
7956 }
7957 
7958 }  // namespace meta
7959 }  // namespace gemmlowp
7960 
7961 #else
7962 #warning "Meta gemm for arm64 requires: GEMMLOWP_NEON_64!"
7963 #endif
7964 
7965 #endif  // GEMMLOWP_META_TRANSFORM_KERNELS_ARM_64_H_
7966