1 // Copyright 2016 The Gemmlowp Authors. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #ifndef GEMMLOWP_META_TRANSFORM_KERNELS_ARM_64_H_
16 #define GEMMLOWP_META_TRANSFORM_KERNELS_ARM_64_H_
17
18 #ifdef GEMMLOWP_NEON_64
19
20 #include <cassert>
21 #include <cstdint>
22
23 namespace gemmlowp {
24 namespace meta {
25
26 template <>
Transform(const int32_t * input,const Requantize & params,uint8_t * output)27 inline void Transform1DKernel<int32_t, uint8_t, Requantize, 16, 0>::Transform(
28 const int32_t* input, const Requantize& params, uint8_t* output) {
29 #ifdef DEBUG
30 #ifdef DEBUG_METAGEMM_VERBOSE
31 std::cout << __FILE__ << "(" << __LINE__
32 << ") Requantize<int32_t, uint8_t, Requantize, 16, 0>::Transform()"
33 << std::endl
34 << std::flush;
35 #endif
36 #endif
37 int params_count_copy = params.count;
38 asm volatile(
39
40 // Requantize::Prepare
41 "dup v4.4s, %w[input_range_min]\n"
42 "dup v5.4s, %w[output_range_min]\n"
43 "dup v6.4s, %w[input_range_offset]\n"
44 "dup v7.4s, %w[input_range_scale]\n"
45 "dup v8.4s, %w[one_over_output_range_scale]\n"
46 "fsub v4.4s, v4.4s, v5.4s\n"
47
48 "1:"
49 "subs %x[count], %x[count], #16\n"
50
51 // Requantize::Transform
52 "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
53 "prfm pldl1keep, [%x[input], #64]\n"
54 "scvtf v0.4s, v0.4s\n"
55 "scvtf v1.4s, v1.4s\n"
56 "scvtf v2.4s, v2.4s\n"
57 "scvtf v3.4s, v3.4s\n"
58 "fsub v0.4s, v0.4s, v6.4s\n"
59 "fsub v1.4s, v1.4s, v6.4s\n"
60 "fsub v2.4s, v2.4s, v6.4s\n"
61 "fsub v3.4s, v3.4s, v6.4s\n"
62 "fmul v0.4s, v0.4s, v7.4s\n"
63 "fmul v1.4s, v1.4s, v7.4s\n"
64 "fmul v2.4s, v2.4s, v7.4s\n"
65 "fmul v3.4s, v3.4s, v7.4s\n"
66 "fadd v0.4s, v0.4s, v4.4s\n"
67 "fadd v1.4s, v1.4s, v4.4s\n"
68 "fadd v2.4s, v2.4s, v4.4s\n"
69 "fadd v3.4s, v3.4s, v4.4s\n"
70 "fmul v0.4s, v0.4s, v8.4s\n"
71 "fmul v1.4s, v1.4s, v8.4s\n"
72 "fmul v2.4s, v2.4s, v8.4s\n"
73 "fmul v3.4s, v3.4s, v8.4s\n"
74 "fcvtzs v0.4s, v0.4s\n"
75 "fcvtzs v1.4s, v1.4s\n"
76 "fcvtzs v2.4s, v2.4s\n"
77 "fcvtzs v3.4s, v3.4s\n"
78 "sqxtn v0.4h, v0.4s\n"
79 "sqxtn2 v0.8h, v1.4s\n"
80 "sqxtn v2.4h, v2.4s\n"
81 "sqxtn2 v2.8h, v3.4s\n"
82 "sqxtun v0.8b, v0.8h\n"
83 "sqxtun2 v0.16b, v2.8h\n"
84
85 "st1 {v0.4s}, [%x[output]], #16\n"
86 "prfm pldl1keep, [%x[output]]\n"
87
88 "bne 1b\n"
89 : [count] "+r"(params_count_copy), [input] "+r"(input),
90 [output] "+r"(output)
91 : [input_range_min] "r"(params.input_range_min),
92 [output_range_min] "r"(params.output_range_min),
93 [input_range_offset] "r"(params.input_range_offset),
94 [one_over_output_range_scale] "r"(params.one_over_output_range_scale),
95 [input_range_scale] "r"(params.input_range_scale)
96 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "cc", "memory");
97 }
98
99 template <>
Transform(const int32_t * input,const Requantize & params,uint8_t * output)100 inline void Transform1DKernel<int32_t, uint8_t, Requantize, 16, 1>::Transform(
101 const int32_t* input, const Requantize& params, uint8_t* output) {
102 #ifdef DEBUG
103 #ifdef DEBUG_METAGEMM_VERBOSE
104 std::cout << __FILE__ << "(" << __LINE__
105 << ") Requantize<int32_t, uint8_t, Requantize, 16, 1>::Transform()"
106 << std::endl
107 << std::flush;
108 #endif
109 #endif
110 int params_count_copy = params.count;
111 asm volatile(
112
113 // Requantize::Prepare
114 "dup v4.4s, %w[input_range_min]\n"
115 "dup v5.4s, %w[output_range_min]\n"
116 "dup v6.4s, %w[input_range_offset]\n"
117 "dup v7.4s, %w[input_range_scale]\n"
118 "dup v8.4s, %w[one_over_output_range_scale]\n"
119 "fsub v4.4s, v4.4s, v5.4s\n"
120
121 // Reduce count by leftovers.
122 "subs %x[count], %x[count], #1\n"
123 "beq 2f\n"
124
125 "1:"
126 "subs %x[count], %x[count], #16\n"
127
128 // Requantize::Transform
129 "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
130 "prfm pldl1keep, [%x[input], #64]\n"
131 "scvtf v0.4s, v0.4s\n"
132 "scvtf v1.4s, v1.4s\n"
133 "scvtf v2.4s, v2.4s\n"
134 "scvtf v3.4s, v3.4s\n"
135 "fsub v0.4s, v0.4s, v6.4s\n"
136 "fsub v1.4s, v1.4s, v6.4s\n"
137 "fsub v2.4s, v2.4s, v6.4s\n"
138 "fsub v3.4s, v3.4s, v6.4s\n"
139 "fmul v0.4s, v0.4s, v7.4s\n"
140 "fmul v1.4s, v1.4s, v7.4s\n"
141 "fmul v2.4s, v2.4s, v7.4s\n"
142 "fmul v3.4s, v3.4s, v7.4s\n"
143 "fadd v0.4s, v0.4s, v4.4s\n"
144 "fadd v1.4s, v1.4s, v4.4s\n"
145 "fadd v2.4s, v2.4s, v4.4s\n"
146 "fadd v3.4s, v3.4s, v4.4s\n"
147 "fmul v0.4s, v0.4s, v8.4s\n"
148 "fmul v1.4s, v1.4s, v8.4s\n"
149 "fmul v2.4s, v2.4s, v8.4s\n"
150 "fmul v3.4s, v3.4s, v8.4s\n"
151 "fcvtzs v0.4s, v0.4s\n"
152 "fcvtzs v1.4s, v1.4s\n"
153 "fcvtzs v2.4s, v2.4s\n"
154 "fcvtzs v3.4s, v3.4s\n"
155 "sqxtn v0.4h, v0.4s\n"
156 "sqxtn2 v0.8h, v1.4s\n"
157 "sqxtn v2.4h, v2.4s\n"
158 "sqxtn2 v2.8h, v3.4s\n"
159 "sqxtun v0.8b, v0.8h\n"
160 "sqxtun2 v0.16b, v2.8h\n"
161
162 "st1 {v0.4s}, [%x[output]], #16\n"
163 "prfm pldl1keep, [%x[output]]\n"
164
165 "bne 1b\n"
166 "2:"
167
168 // Handle leftovers.
169
170 // Requantize::Transform
171 "ld1 {v0.s}[0], [%x[input]], #4\n"
172 "prfm pldl1keep, [%x[input], #64]\n"
173 "scvtf v0.4s, v0.4s\n"
174 "fsub v0.4s, v0.4s, v6.4s\n"
175 "fmul v0.4s, v0.4s, v7.4s\n"
176 "fadd v0.4s, v0.4s, v4.4s\n"
177 "fmul v0.4s, v0.4s, v8.4s\n"
178 "fcvtzs v0.4s, v0.4s\n"
179 "sqxtn v0.4h, v0.4s\n"
180 "sqxtun v0.8b, v0.8h\n"
181
182 "st1 {v0.b}[0], [%x[output]], #1\n"
183 "prfm pldl1keep, [%x[output]]\n"
184 : [count] "+r"(params_count_copy), [input] "+r"(input),
185 [output] "+r"(output)
186 : [input_range_min] "r"(params.input_range_min),
187 [output_range_min] "r"(params.output_range_min),
188 [input_range_offset] "r"(params.input_range_offset),
189 [one_over_output_range_scale] "r"(params.one_over_output_range_scale),
190 [input_range_scale] "r"(params.input_range_scale)
191 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "cc", "memory");
192 }
193
194 template <>
Transform(const int32_t * input,const Requantize & params,uint8_t * output)195 inline void Transform1DKernel<int32_t, uint8_t, Requantize, 16, 2>::Transform(
196 const int32_t* input, const Requantize& params, uint8_t* output) {
197 #ifdef DEBUG
198 #ifdef DEBUG_METAGEMM_VERBOSE
199 std::cout << __FILE__ << "(" << __LINE__
200 << ") Requantize<int32_t, uint8_t, Requantize, 16, 2>::Transform()"
201 << std::endl
202 << std::flush;
203 #endif
204 #endif
205 int params_count_copy = params.count;
206 asm volatile(
207
208 // Requantize::Prepare
209 "dup v4.4s, %w[input_range_min]\n"
210 "dup v5.4s, %w[output_range_min]\n"
211 "dup v6.4s, %w[input_range_offset]\n"
212 "dup v7.4s, %w[input_range_scale]\n"
213 "dup v8.4s, %w[one_over_output_range_scale]\n"
214 "fsub v4.4s, v4.4s, v5.4s\n"
215
216 // Reduce count by leftovers.
217 "subs %x[count], %x[count], #2\n"
218 "beq 2f\n"
219
220 "1:"
221 "subs %x[count], %x[count], #16\n"
222
223 // Requantize::Transform
224 "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
225 "prfm pldl1keep, [%x[input], #64]\n"
226 "scvtf v0.4s, v0.4s\n"
227 "scvtf v1.4s, v1.4s\n"
228 "scvtf v2.4s, v2.4s\n"
229 "scvtf v3.4s, v3.4s\n"
230 "fsub v0.4s, v0.4s, v6.4s\n"
231 "fsub v1.4s, v1.4s, v6.4s\n"
232 "fsub v2.4s, v2.4s, v6.4s\n"
233 "fsub v3.4s, v3.4s, v6.4s\n"
234 "fmul v0.4s, v0.4s, v7.4s\n"
235 "fmul v1.4s, v1.4s, v7.4s\n"
236 "fmul v2.4s, v2.4s, v7.4s\n"
237 "fmul v3.4s, v3.4s, v7.4s\n"
238 "fadd v0.4s, v0.4s, v4.4s\n"
239 "fadd v1.4s, v1.4s, v4.4s\n"
240 "fadd v2.4s, v2.4s, v4.4s\n"
241 "fadd v3.4s, v3.4s, v4.4s\n"
242 "fmul v0.4s, v0.4s, v8.4s\n"
243 "fmul v1.4s, v1.4s, v8.4s\n"
244 "fmul v2.4s, v2.4s, v8.4s\n"
245 "fmul v3.4s, v3.4s, v8.4s\n"
246 "fcvtzs v0.4s, v0.4s\n"
247 "fcvtzs v1.4s, v1.4s\n"
248 "fcvtzs v2.4s, v2.4s\n"
249 "fcvtzs v3.4s, v3.4s\n"
250 "sqxtn v0.4h, v0.4s\n"
251 "sqxtn2 v0.8h, v1.4s\n"
252 "sqxtn v2.4h, v2.4s\n"
253 "sqxtn2 v2.8h, v3.4s\n"
254 "sqxtun v0.8b, v0.8h\n"
255 "sqxtun2 v0.16b, v2.8h\n"
256
257 "st1 {v0.4s}, [%x[output]], #16\n"
258 "prfm pldl1keep, [%x[output]]\n"
259
260 "bne 1b\n"
261 "2:"
262
263 // Handle leftovers.
264
265 // Requantize::Transform
266 "ld1 {v0.2s}, [%x[input]], #8\n"
267 "prfm pldl1keep, [%x[input], #64]\n"
268 "scvtf v0.4s, v0.4s\n"
269 "fsub v0.4s, v0.4s, v6.4s\n"
270 "fmul v0.4s, v0.4s, v7.4s\n"
271 "fadd v0.4s, v0.4s, v4.4s\n"
272 "fmul v0.4s, v0.4s, v8.4s\n"
273 "fcvtzs v0.4s, v0.4s\n"
274 "sqxtn v0.4h, v0.4s\n"
275 "sqxtun v0.8b, v0.8h\n"
276
277 "st1 {v0.h}[0], [%x[output]], #2\n"
278 "prfm pldl1keep, [%x[output]]\n"
279 : [count] "+r"(params_count_copy), [input] "+r"(input),
280 [output] "+r"(output)
281 : [input_range_min] "r"(params.input_range_min),
282 [output_range_min] "r"(params.output_range_min),
283 [input_range_offset] "r"(params.input_range_offset),
284 [one_over_output_range_scale] "r"(params.one_over_output_range_scale),
285 [input_range_scale] "r"(params.input_range_scale)
286 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "cc", "memory");
287 }
288
289 template <>
Transform(const int32_t * input,const Requantize & params,uint8_t * output)290 inline void Transform1DKernel<int32_t, uint8_t, Requantize, 16, 3>::Transform(
291 const int32_t* input, const Requantize& params, uint8_t* output) {
292 #ifdef DEBUG
293 #ifdef DEBUG_METAGEMM_VERBOSE
294 std::cout << __FILE__ << "(" << __LINE__
295 << ") Requantize<int32_t, uint8_t, Requantize, 16, 3>::Transform()"
296 << std::endl
297 << std::flush;
298 #endif
299 #endif
300 int params_count_copy = params.count;
301 asm volatile(
302
303 // Requantize::Prepare
304 "dup v4.4s, %w[input_range_min]\n"
305 "dup v5.4s, %w[output_range_min]\n"
306 "dup v6.4s, %w[input_range_offset]\n"
307 "dup v7.4s, %w[input_range_scale]\n"
308 "dup v8.4s, %w[one_over_output_range_scale]\n"
309 "fsub v4.4s, v4.4s, v5.4s\n"
310
311 // Reduce count by leftovers.
312 "subs %x[count], %x[count], #3\n"
313 "beq 2f\n"
314
315 "1:"
316 "subs %x[count], %x[count], #16\n"
317
318 // Requantize::Transform
319 "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
320 "prfm pldl1keep, [%x[input], #64]\n"
321 "scvtf v0.4s, v0.4s\n"
322 "scvtf v1.4s, v1.4s\n"
323 "scvtf v2.4s, v2.4s\n"
324 "scvtf v3.4s, v3.4s\n"
325 "fsub v0.4s, v0.4s, v6.4s\n"
326 "fsub v1.4s, v1.4s, v6.4s\n"
327 "fsub v2.4s, v2.4s, v6.4s\n"
328 "fsub v3.4s, v3.4s, v6.4s\n"
329 "fmul v0.4s, v0.4s, v7.4s\n"
330 "fmul v1.4s, v1.4s, v7.4s\n"
331 "fmul v2.4s, v2.4s, v7.4s\n"
332 "fmul v3.4s, v3.4s, v7.4s\n"
333 "fadd v0.4s, v0.4s, v4.4s\n"
334 "fadd v1.4s, v1.4s, v4.4s\n"
335 "fadd v2.4s, v2.4s, v4.4s\n"
336 "fadd v3.4s, v3.4s, v4.4s\n"
337 "fmul v0.4s, v0.4s, v8.4s\n"
338 "fmul v1.4s, v1.4s, v8.4s\n"
339 "fmul v2.4s, v2.4s, v8.4s\n"
340 "fmul v3.4s, v3.4s, v8.4s\n"
341 "fcvtzs v0.4s, v0.4s\n"
342 "fcvtzs v1.4s, v1.4s\n"
343 "fcvtzs v2.4s, v2.4s\n"
344 "fcvtzs v3.4s, v3.4s\n"
345 "sqxtn v0.4h, v0.4s\n"
346 "sqxtn2 v0.8h, v1.4s\n"
347 "sqxtn v2.4h, v2.4s\n"
348 "sqxtn2 v2.8h, v3.4s\n"
349 "sqxtun v0.8b, v0.8h\n"
350 "sqxtun2 v0.16b, v2.8h\n"
351
352 "st1 {v0.4s}, [%x[output]], #16\n"
353 "prfm pldl1keep, [%x[output]]\n"
354
355 "bne 1b\n"
356 "2:"
357
358 // Handle leftovers.
359
360 // Requantize::Transform
361 "ld1 {v0.2s}, [%x[input]], #8\n"
362 "ld1 {v0.s}[2], [%x[input]], #4\n"
363 "prfm pldl1keep, [%x[input], #64]\n"
364 "scvtf v0.4s, v0.4s\n"
365 "fsub v0.4s, v0.4s, v6.4s\n"
366 "fmul v0.4s, v0.4s, v7.4s\n"
367 "fadd v0.4s, v0.4s, v4.4s\n"
368 "fmul v0.4s, v0.4s, v8.4s\n"
369 "fcvtzs v0.4s, v0.4s\n"
370 "sqxtn v0.4h, v0.4s\n"
371 "sqxtun v0.8b, v0.8h\n"
372
373 "st1 {v0.h}[0], [%x[output]], #2\n"
374 "st1 {v0.b}[2], [%x[output]], #1\n"
375 "prfm pldl1keep, [%x[output]]\n"
376 : [count] "+r"(params_count_copy), [input] "+r"(input),
377 [output] "+r"(output)
378 : [input_range_min] "r"(params.input_range_min),
379 [output_range_min] "r"(params.output_range_min),
380 [input_range_offset] "r"(params.input_range_offset),
381 [one_over_output_range_scale] "r"(params.one_over_output_range_scale),
382 [input_range_scale] "r"(params.input_range_scale)
383 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "cc", "memory");
384 }
385
386 template <>
Transform(const int32_t * input,const Requantize & params,uint8_t * output)387 inline void Transform1DKernel<int32_t, uint8_t, Requantize, 16, 4>::Transform(
388 const int32_t* input, const Requantize& params, uint8_t* output) {
389 #ifdef DEBUG
390 #ifdef DEBUG_METAGEMM_VERBOSE
391 std::cout << __FILE__ << "(" << __LINE__
392 << ") Requantize<int32_t, uint8_t, Requantize, 16, 4>::Transform()"
393 << std::endl
394 << std::flush;
395 #endif
396 #endif
397 int params_count_copy = params.count;
398 asm volatile(
399
400 // Requantize::Prepare
401 "dup v4.4s, %w[input_range_min]\n"
402 "dup v5.4s, %w[output_range_min]\n"
403 "dup v6.4s, %w[input_range_offset]\n"
404 "dup v7.4s, %w[input_range_scale]\n"
405 "dup v8.4s, %w[one_over_output_range_scale]\n"
406 "fsub v4.4s, v4.4s, v5.4s\n"
407
408 // Reduce count by leftovers.
409 "subs %x[count], %x[count], #4\n"
410 "beq 2f\n"
411
412 "1:"
413 "subs %x[count], %x[count], #16\n"
414
415 // Requantize::Transform
416 "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
417 "prfm pldl1keep, [%x[input], #64]\n"
418 "scvtf v0.4s, v0.4s\n"
419 "scvtf v1.4s, v1.4s\n"
420 "scvtf v2.4s, v2.4s\n"
421 "scvtf v3.4s, v3.4s\n"
422 "fsub v0.4s, v0.4s, v6.4s\n"
423 "fsub v1.4s, v1.4s, v6.4s\n"
424 "fsub v2.4s, v2.4s, v6.4s\n"
425 "fsub v3.4s, v3.4s, v6.4s\n"
426 "fmul v0.4s, v0.4s, v7.4s\n"
427 "fmul v1.4s, v1.4s, v7.4s\n"
428 "fmul v2.4s, v2.4s, v7.4s\n"
429 "fmul v3.4s, v3.4s, v7.4s\n"
430 "fadd v0.4s, v0.4s, v4.4s\n"
431 "fadd v1.4s, v1.4s, v4.4s\n"
432 "fadd v2.4s, v2.4s, v4.4s\n"
433 "fadd v3.4s, v3.4s, v4.4s\n"
434 "fmul v0.4s, v0.4s, v8.4s\n"
435 "fmul v1.4s, v1.4s, v8.4s\n"
436 "fmul v2.4s, v2.4s, v8.4s\n"
437 "fmul v3.4s, v3.4s, v8.4s\n"
438 "fcvtzs v0.4s, v0.4s\n"
439 "fcvtzs v1.4s, v1.4s\n"
440 "fcvtzs v2.4s, v2.4s\n"
441 "fcvtzs v3.4s, v3.4s\n"
442 "sqxtn v0.4h, v0.4s\n"
443 "sqxtn2 v0.8h, v1.4s\n"
444 "sqxtn v2.4h, v2.4s\n"
445 "sqxtn2 v2.8h, v3.4s\n"
446 "sqxtun v0.8b, v0.8h\n"
447 "sqxtun2 v0.16b, v2.8h\n"
448
449 "st1 {v0.4s}, [%x[output]], #16\n"
450 "prfm pldl1keep, [%x[output]]\n"
451
452 "bne 1b\n"
453 "2:"
454
455 // Handle leftovers.
456
457 // Requantize::Transform
458 "ld1 {v0.4s}, [%x[input]], #16\n"
459 "prfm pldl1keep, [%x[input], #64]\n"
460 "scvtf v0.4s, v0.4s\n"
461 "fsub v0.4s, v0.4s, v6.4s\n"
462 "fmul v0.4s, v0.4s, v7.4s\n"
463 "fadd v0.4s, v0.4s, v4.4s\n"
464 "fmul v0.4s, v0.4s, v8.4s\n"
465 "fcvtzs v0.4s, v0.4s\n"
466 "sqxtn v0.4h, v0.4s\n"
467 "sqxtun v0.8b, v0.8h\n"
468
469 "st1 {v0.s}[0], [%x[output]], #4\n"
470 "prfm pldl1keep, [%x[output]]\n"
471 : [count] "+r"(params_count_copy), [input] "+r"(input),
472 [output] "+r"(output)
473 : [input_range_min] "r"(params.input_range_min),
474 [output_range_min] "r"(params.output_range_min),
475 [input_range_offset] "r"(params.input_range_offset),
476 [one_over_output_range_scale] "r"(params.one_over_output_range_scale),
477 [input_range_scale] "r"(params.input_range_scale)
478 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "cc", "memory");
479 }
480
481 template <>
Transform(const int32_t * input,const Requantize & params,uint8_t * output)482 inline void Transform1DKernel<int32_t, uint8_t, Requantize, 16, 5>::Transform(
483 const int32_t* input, const Requantize& params, uint8_t* output) {
484 #ifdef DEBUG
485 #ifdef DEBUG_METAGEMM_VERBOSE
486 std::cout << __FILE__ << "(" << __LINE__
487 << ") Requantize<int32_t, uint8_t, Requantize, 16, 5>::Transform()"
488 << std::endl
489 << std::flush;
490 #endif
491 #endif
492 int params_count_copy = params.count;
493 asm volatile(
494
495 // Requantize::Prepare
496 "dup v4.4s, %w[input_range_min]\n"
497 "dup v5.4s, %w[output_range_min]\n"
498 "dup v6.4s, %w[input_range_offset]\n"
499 "dup v7.4s, %w[input_range_scale]\n"
500 "dup v8.4s, %w[one_over_output_range_scale]\n"
501 "fsub v4.4s, v4.4s, v5.4s\n"
502
503 // Reduce count by leftovers.
504 "subs %x[count], %x[count], #5\n"
505 "beq 2f\n"
506
507 "1:"
508 "subs %x[count], %x[count], #16\n"
509
510 // Requantize::Transform
511 "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
512 "prfm pldl1keep, [%x[input], #64]\n"
513 "scvtf v0.4s, v0.4s\n"
514 "scvtf v1.4s, v1.4s\n"
515 "scvtf v2.4s, v2.4s\n"
516 "scvtf v3.4s, v3.4s\n"
517 "fsub v0.4s, v0.4s, v6.4s\n"
518 "fsub v1.4s, v1.4s, v6.4s\n"
519 "fsub v2.4s, v2.4s, v6.4s\n"
520 "fsub v3.4s, v3.4s, v6.4s\n"
521 "fmul v0.4s, v0.4s, v7.4s\n"
522 "fmul v1.4s, v1.4s, v7.4s\n"
523 "fmul v2.4s, v2.4s, v7.4s\n"
524 "fmul v3.4s, v3.4s, v7.4s\n"
525 "fadd v0.4s, v0.4s, v4.4s\n"
526 "fadd v1.4s, v1.4s, v4.4s\n"
527 "fadd v2.4s, v2.4s, v4.4s\n"
528 "fadd v3.4s, v3.4s, v4.4s\n"
529 "fmul v0.4s, v0.4s, v8.4s\n"
530 "fmul v1.4s, v1.4s, v8.4s\n"
531 "fmul v2.4s, v2.4s, v8.4s\n"
532 "fmul v3.4s, v3.4s, v8.4s\n"
533 "fcvtzs v0.4s, v0.4s\n"
534 "fcvtzs v1.4s, v1.4s\n"
535 "fcvtzs v2.4s, v2.4s\n"
536 "fcvtzs v3.4s, v3.4s\n"
537 "sqxtn v0.4h, v0.4s\n"
538 "sqxtn2 v0.8h, v1.4s\n"
539 "sqxtn v2.4h, v2.4s\n"
540 "sqxtn2 v2.8h, v3.4s\n"
541 "sqxtun v0.8b, v0.8h\n"
542 "sqxtun2 v0.16b, v2.8h\n"
543
544 "st1 {v0.4s}, [%x[output]], #16\n"
545 "prfm pldl1keep, [%x[output]]\n"
546
547 "bne 1b\n"
548 "2:"
549
550 // Handle leftovers.
551
552 // Requantize::Transform
553 "ld1 {v0.4s}, [%x[input]], #16\n"
554 "ld1 {v1.s}[0], [%x[input]], #4\n"
555 "prfm pldl1keep, [%x[input], #64]\n"
556 "scvtf v0.4s, v0.4s\n"
557 "scvtf v1.4s, v1.4s\n"
558 "fsub v0.4s, v0.4s, v6.4s\n"
559 "fsub v1.4s, v1.4s, v6.4s\n"
560 "fmul v0.4s, v0.4s, v7.4s\n"
561 "fmul v1.4s, v1.4s, v7.4s\n"
562 "fadd v0.4s, v0.4s, v4.4s\n"
563 "fadd v1.4s, v1.4s, v4.4s\n"
564 "fmul v0.4s, v0.4s, v8.4s\n"
565 "fmul v1.4s, v1.4s, v8.4s\n"
566 "fcvtzs v0.4s, v0.4s\n"
567 "fcvtzs v1.4s, v1.4s\n"
568 "sqxtn v0.4h, v0.4s\n"
569 "sqxtn2 v0.8h, v1.4s\n"
570 "sqxtun v0.8b, v0.8h\n"
571
572 "st1 {v0.s}[0], [%x[output]], #4\n"
573 "st1 {v0.b}[4], [%x[output]], #1\n"
574 "prfm pldl1keep, [%x[output]]\n"
575 : [count] "+r"(params_count_copy), [input] "+r"(input),
576 [output] "+r"(output)
577 : [input_range_min] "r"(params.input_range_min),
578 [output_range_min] "r"(params.output_range_min),
579 [input_range_offset] "r"(params.input_range_offset),
580 [one_over_output_range_scale] "r"(params.one_over_output_range_scale),
581 [input_range_scale] "r"(params.input_range_scale)
582 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "cc", "memory");
583 }
584
585 template <>
Transform(const int32_t * input,const Requantize & params,uint8_t * output)586 inline void Transform1DKernel<int32_t, uint8_t, Requantize, 16, 6>::Transform(
587 const int32_t* input, const Requantize& params, uint8_t* output) {
588 #ifdef DEBUG
589 #ifdef DEBUG_METAGEMM_VERBOSE
590 std::cout << __FILE__ << "(" << __LINE__
591 << ") Requantize<int32_t, uint8_t, Requantize, 16, 6>::Transform()"
592 << std::endl
593 << std::flush;
594 #endif
595 #endif
596 int params_count_copy = params.count;
597 asm volatile(
598
599 // Requantize::Prepare
600 "dup v4.4s, %w[input_range_min]\n"
601 "dup v5.4s, %w[output_range_min]\n"
602 "dup v6.4s, %w[input_range_offset]\n"
603 "dup v7.4s, %w[input_range_scale]\n"
604 "dup v8.4s, %w[one_over_output_range_scale]\n"
605 "fsub v4.4s, v4.4s, v5.4s\n"
606
607 // Reduce count by leftovers.
608 "subs %x[count], %x[count], #6\n"
609 "beq 2f\n"
610
611 "1:"
612 "subs %x[count], %x[count], #16\n"
613
614 // Requantize::Transform
615 "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
616 "prfm pldl1keep, [%x[input], #64]\n"
617 "scvtf v0.4s, v0.4s\n"
618 "scvtf v1.4s, v1.4s\n"
619 "scvtf v2.4s, v2.4s\n"
620 "scvtf v3.4s, v3.4s\n"
621 "fsub v0.4s, v0.4s, v6.4s\n"
622 "fsub v1.4s, v1.4s, v6.4s\n"
623 "fsub v2.4s, v2.4s, v6.4s\n"
624 "fsub v3.4s, v3.4s, v6.4s\n"
625 "fmul v0.4s, v0.4s, v7.4s\n"
626 "fmul v1.4s, v1.4s, v7.4s\n"
627 "fmul v2.4s, v2.4s, v7.4s\n"
628 "fmul v3.4s, v3.4s, v7.4s\n"
629 "fadd v0.4s, v0.4s, v4.4s\n"
630 "fadd v1.4s, v1.4s, v4.4s\n"
631 "fadd v2.4s, v2.4s, v4.4s\n"
632 "fadd v3.4s, v3.4s, v4.4s\n"
633 "fmul v0.4s, v0.4s, v8.4s\n"
634 "fmul v1.4s, v1.4s, v8.4s\n"
635 "fmul v2.4s, v2.4s, v8.4s\n"
636 "fmul v3.4s, v3.4s, v8.4s\n"
637 "fcvtzs v0.4s, v0.4s\n"
638 "fcvtzs v1.4s, v1.4s\n"
639 "fcvtzs v2.4s, v2.4s\n"
640 "fcvtzs v3.4s, v3.4s\n"
641 "sqxtn v0.4h, v0.4s\n"
642 "sqxtn2 v0.8h, v1.4s\n"
643 "sqxtn v2.4h, v2.4s\n"
644 "sqxtn2 v2.8h, v3.4s\n"
645 "sqxtun v0.8b, v0.8h\n"
646 "sqxtun2 v0.16b, v2.8h\n"
647
648 "st1 {v0.4s}, [%x[output]], #16\n"
649 "prfm pldl1keep, [%x[output]]\n"
650
651 "bne 1b\n"
652 "2:"
653
654 // Handle leftovers.
655
656 // Requantize::Transform
657 "ld1 {v0.4s}, [%x[input]], #16\n"
658 "ld1 {v1.2s}, [%x[input]], #8\n"
659 "prfm pldl1keep, [%x[input], #64]\n"
660 "scvtf v0.4s, v0.4s\n"
661 "scvtf v1.4s, v1.4s\n"
662 "fsub v0.4s, v0.4s, v6.4s\n"
663 "fsub v1.4s, v1.4s, v6.4s\n"
664 "fmul v0.4s, v0.4s, v7.4s\n"
665 "fmul v1.4s, v1.4s, v7.4s\n"
666 "fadd v0.4s, v0.4s, v4.4s\n"
667 "fadd v1.4s, v1.4s, v4.4s\n"
668 "fmul v0.4s, v0.4s, v8.4s\n"
669 "fmul v1.4s, v1.4s, v8.4s\n"
670 "fcvtzs v0.4s, v0.4s\n"
671 "fcvtzs v1.4s, v1.4s\n"
672 "sqxtn v0.4h, v0.4s\n"
673 "sqxtn2 v0.8h, v1.4s\n"
674 "sqxtun v0.8b, v0.8h\n"
675
676 "st1 {v0.s}[0], [%x[output]], #4\n"
677 "st1 {v0.h}[2], [%x[output]], #2\n"
678 "prfm pldl1keep, [%x[output]]\n"
679 : [count] "+r"(params_count_copy), [input] "+r"(input),
680 [output] "+r"(output)
681 : [input_range_min] "r"(params.input_range_min),
682 [output_range_min] "r"(params.output_range_min),
683 [input_range_offset] "r"(params.input_range_offset),
684 [one_over_output_range_scale] "r"(params.one_over_output_range_scale),
685 [input_range_scale] "r"(params.input_range_scale)
686 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "cc", "memory");
687 }
688
689 template <>
Transform(const int32_t * input,const Requantize & params,uint8_t * output)690 inline void Transform1DKernel<int32_t, uint8_t, Requantize, 16, 7>::Transform(
691 const int32_t* input, const Requantize& params, uint8_t* output) {
692 #ifdef DEBUG
693 #ifdef DEBUG_METAGEMM_VERBOSE
694 std::cout << __FILE__ << "(" << __LINE__
695 << ") Requantize<int32_t, uint8_t, Requantize, 16, 7>::Transform()"
696 << std::endl
697 << std::flush;
698 #endif
699 #endif
700 int params_count_copy = params.count;
701 asm volatile(
702
703 // Requantize::Prepare
704 "dup v4.4s, %w[input_range_min]\n"
705 "dup v5.4s, %w[output_range_min]\n"
706 "dup v6.4s, %w[input_range_offset]\n"
707 "dup v7.4s, %w[input_range_scale]\n"
708 "dup v8.4s, %w[one_over_output_range_scale]\n"
709 "fsub v4.4s, v4.4s, v5.4s\n"
710
711 // Reduce count by leftovers.
712 "subs %x[count], %x[count], #7\n"
713 "beq 2f\n"
714
715 "1:"
716 "subs %x[count], %x[count], #16\n"
717
718 // Requantize::Transform
719 "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
720 "prfm pldl1keep, [%x[input], #64]\n"
721 "scvtf v0.4s, v0.4s\n"
722 "scvtf v1.4s, v1.4s\n"
723 "scvtf v2.4s, v2.4s\n"
724 "scvtf v3.4s, v3.4s\n"
725 "fsub v0.4s, v0.4s, v6.4s\n"
726 "fsub v1.4s, v1.4s, v6.4s\n"
727 "fsub v2.4s, v2.4s, v6.4s\n"
728 "fsub v3.4s, v3.4s, v6.4s\n"
729 "fmul v0.4s, v0.4s, v7.4s\n"
730 "fmul v1.4s, v1.4s, v7.4s\n"
731 "fmul v2.4s, v2.4s, v7.4s\n"
732 "fmul v3.4s, v3.4s, v7.4s\n"
733 "fadd v0.4s, v0.4s, v4.4s\n"
734 "fadd v1.4s, v1.4s, v4.4s\n"
735 "fadd v2.4s, v2.4s, v4.4s\n"
736 "fadd v3.4s, v3.4s, v4.4s\n"
737 "fmul v0.4s, v0.4s, v8.4s\n"
738 "fmul v1.4s, v1.4s, v8.4s\n"
739 "fmul v2.4s, v2.4s, v8.4s\n"
740 "fmul v3.4s, v3.4s, v8.4s\n"
741 "fcvtzs v0.4s, v0.4s\n"
742 "fcvtzs v1.4s, v1.4s\n"
743 "fcvtzs v2.4s, v2.4s\n"
744 "fcvtzs v3.4s, v3.4s\n"
745 "sqxtn v0.4h, v0.4s\n"
746 "sqxtn2 v0.8h, v1.4s\n"
747 "sqxtn v2.4h, v2.4s\n"
748 "sqxtn2 v2.8h, v3.4s\n"
749 "sqxtun v0.8b, v0.8h\n"
750 "sqxtun2 v0.16b, v2.8h\n"
751
752 "st1 {v0.4s}, [%x[output]], #16\n"
753 "prfm pldl1keep, [%x[output]]\n"
754
755 "bne 1b\n"
756 "2:"
757
758 // Handle leftovers.
759
760 // Requantize::Transform
761 "ld1 {v0.4s}, [%x[input]], #16\n"
762 "ld1 {v1.2s}, [%x[input]], #8\n"
763 "ld1 {v1.s}[2], [%x[input]], #4\n"
764 "prfm pldl1keep, [%x[input], #64]\n"
765 "scvtf v0.4s, v0.4s\n"
766 "scvtf v1.4s, v1.4s\n"
767 "fsub v0.4s, v0.4s, v6.4s\n"
768 "fsub v1.4s, v1.4s, v6.4s\n"
769 "fmul v0.4s, v0.4s, v7.4s\n"
770 "fmul v1.4s, v1.4s, v7.4s\n"
771 "fadd v0.4s, v0.4s, v4.4s\n"
772 "fadd v1.4s, v1.4s, v4.4s\n"
773 "fmul v0.4s, v0.4s, v8.4s\n"
774 "fmul v1.4s, v1.4s, v8.4s\n"
775 "fcvtzs v0.4s, v0.4s\n"
776 "fcvtzs v1.4s, v1.4s\n"
777 "sqxtn v0.4h, v0.4s\n"
778 "sqxtn2 v0.8h, v1.4s\n"
779 "sqxtun v0.8b, v0.8h\n"
780
781 "st1 {v0.s}[0], [%x[output]], #4\n"
782 "st1 {v0.h}[2], [%x[output]], #2\n"
783 "st1 {v0.b}[6], [%x[output]], #1\n"
784 "prfm pldl1keep, [%x[output]]\n"
785 : [count] "+r"(params_count_copy), [input] "+r"(input),
786 [output] "+r"(output)
787 : [input_range_min] "r"(params.input_range_min),
788 [output_range_min] "r"(params.output_range_min),
789 [input_range_offset] "r"(params.input_range_offset),
790 [one_over_output_range_scale] "r"(params.one_over_output_range_scale),
791 [input_range_scale] "r"(params.input_range_scale)
792 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "cc", "memory");
793 }
794
795 template <>
Transform(const int32_t * input,const Requantize & params,uint8_t * output)796 inline void Transform1DKernel<int32_t, uint8_t, Requantize, 16, 8>::Transform(
797 const int32_t* input, const Requantize& params, uint8_t* output) {
798 #ifdef DEBUG
799 #ifdef DEBUG_METAGEMM_VERBOSE
800 std::cout << __FILE__ << "(" << __LINE__
801 << ") Requantize<int32_t, uint8_t, Requantize, 16, 8>::Transform()"
802 << std::endl
803 << std::flush;
804 #endif
805 #endif
806 int params_count_copy = params.count;
807 asm volatile(
808
809 // Requantize::Prepare
810 "dup v4.4s, %w[input_range_min]\n"
811 "dup v5.4s, %w[output_range_min]\n"
812 "dup v6.4s, %w[input_range_offset]\n"
813 "dup v7.4s, %w[input_range_scale]\n"
814 "dup v8.4s, %w[one_over_output_range_scale]\n"
815 "fsub v4.4s, v4.4s, v5.4s\n"
816
817 // Reduce count by leftovers.
818 "subs %x[count], %x[count], #8\n"
819 "beq 2f\n"
820
821 "1:"
822 "subs %x[count], %x[count], #16\n"
823
824 // Requantize::Transform
825 "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
826 "prfm pldl1keep, [%x[input], #64]\n"
827 "scvtf v0.4s, v0.4s\n"
828 "scvtf v1.4s, v1.4s\n"
829 "scvtf v2.4s, v2.4s\n"
830 "scvtf v3.4s, v3.4s\n"
831 "fsub v0.4s, v0.4s, v6.4s\n"
832 "fsub v1.4s, v1.4s, v6.4s\n"
833 "fsub v2.4s, v2.4s, v6.4s\n"
834 "fsub v3.4s, v3.4s, v6.4s\n"
835 "fmul v0.4s, v0.4s, v7.4s\n"
836 "fmul v1.4s, v1.4s, v7.4s\n"
837 "fmul v2.4s, v2.4s, v7.4s\n"
838 "fmul v3.4s, v3.4s, v7.4s\n"
839 "fadd v0.4s, v0.4s, v4.4s\n"
840 "fadd v1.4s, v1.4s, v4.4s\n"
841 "fadd v2.4s, v2.4s, v4.4s\n"
842 "fadd v3.4s, v3.4s, v4.4s\n"
843 "fmul v0.4s, v0.4s, v8.4s\n"
844 "fmul v1.4s, v1.4s, v8.4s\n"
845 "fmul v2.4s, v2.4s, v8.4s\n"
846 "fmul v3.4s, v3.4s, v8.4s\n"
847 "fcvtzs v0.4s, v0.4s\n"
848 "fcvtzs v1.4s, v1.4s\n"
849 "fcvtzs v2.4s, v2.4s\n"
850 "fcvtzs v3.4s, v3.4s\n"
851 "sqxtn v0.4h, v0.4s\n"
852 "sqxtn2 v0.8h, v1.4s\n"
853 "sqxtn v2.4h, v2.4s\n"
854 "sqxtn2 v2.8h, v3.4s\n"
855 "sqxtun v0.8b, v0.8h\n"
856 "sqxtun2 v0.16b, v2.8h\n"
857
858 "st1 {v0.4s}, [%x[output]], #16\n"
859 "prfm pldl1keep, [%x[output]]\n"
860
861 "bne 1b\n"
862 "2:"
863
864 // Handle leftovers.
865
866 // Requantize::Transform
867 "ld1 {v0.4s, v1.4s}, [%x[input]], #32\n"
868 "prfm pldl1keep, [%x[input], #64]\n"
869 "scvtf v0.4s, v0.4s\n"
870 "scvtf v1.4s, v1.4s\n"
871 "fsub v0.4s, v0.4s, v6.4s\n"
872 "fsub v1.4s, v1.4s, v6.4s\n"
873 "fmul v0.4s, v0.4s, v7.4s\n"
874 "fmul v1.4s, v1.4s, v7.4s\n"
875 "fadd v0.4s, v0.4s, v4.4s\n"
876 "fadd v1.4s, v1.4s, v4.4s\n"
877 "fmul v0.4s, v0.4s, v8.4s\n"
878 "fmul v1.4s, v1.4s, v8.4s\n"
879 "fcvtzs v0.4s, v0.4s\n"
880 "fcvtzs v1.4s, v1.4s\n"
881 "sqxtn v0.4h, v0.4s\n"
882 "sqxtn2 v0.8h, v1.4s\n"
883 "sqxtun v0.8b, v0.8h\n"
884
885 "st1 {v0.2s}, [%x[output]], #8\n"
886 "prfm pldl1keep, [%x[output]]\n"
887 : [count] "+r"(params_count_copy), [input] "+r"(input),
888 [output] "+r"(output)
889 : [input_range_min] "r"(params.input_range_min),
890 [output_range_min] "r"(params.output_range_min),
891 [input_range_offset] "r"(params.input_range_offset),
892 [one_over_output_range_scale] "r"(params.one_over_output_range_scale),
893 [input_range_scale] "r"(params.input_range_scale)
894 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "cc", "memory");
895 }
896
897 template <>
Transform(const int32_t * input,const Requantize & params,uint8_t * output)898 inline void Transform1DKernel<int32_t, uint8_t, Requantize, 16, 9>::Transform(
899 const int32_t* input, const Requantize& params, uint8_t* output) {
900 #ifdef DEBUG
901 #ifdef DEBUG_METAGEMM_VERBOSE
902 std::cout << __FILE__ << "(" << __LINE__
903 << ") Requantize<int32_t, uint8_t, Requantize, 16, 9>::Transform()"
904 << std::endl
905 << std::flush;
906 #endif
907 #endif
908 int params_count_copy = params.count;
909 asm volatile(
910
911 // Requantize::Prepare
912 "dup v4.4s, %w[input_range_min]\n"
913 "dup v5.4s, %w[output_range_min]\n"
914 "dup v6.4s, %w[input_range_offset]\n"
915 "dup v7.4s, %w[input_range_scale]\n"
916 "dup v8.4s, %w[one_over_output_range_scale]\n"
917 "fsub v4.4s, v4.4s, v5.4s\n"
918
919 // Reduce count by leftovers.
920 "subs %x[count], %x[count], #9\n"
921 "beq 2f\n"
922
923 "1:"
924 "subs %x[count], %x[count], #16\n"
925
926 // Requantize::Transform
927 "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
928 "prfm pldl1keep, [%x[input], #64]\n"
929 "scvtf v0.4s, v0.4s\n"
930 "scvtf v1.4s, v1.4s\n"
931 "scvtf v2.4s, v2.4s\n"
932 "scvtf v3.4s, v3.4s\n"
933 "fsub v0.4s, v0.4s, v6.4s\n"
934 "fsub v1.4s, v1.4s, v6.4s\n"
935 "fsub v2.4s, v2.4s, v6.4s\n"
936 "fsub v3.4s, v3.4s, v6.4s\n"
937 "fmul v0.4s, v0.4s, v7.4s\n"
938 "fmul v1.4s, v1.4s, v7.4s\n"
939 "fmul v2.4s, v2.4s, v7.4s\n"
940 "fmul v3.4s, v3.4s, v7.4s\n"
941 "fadd v0.4s, v0.4s, v4.4s\n"
942 "fadd v1.4s, v1.4s, v4.4s\n"
943 "fadd v2.4s, v2.4s, v4.4s\n"
944 "fadd v3.4s, v3.4s, v4.4s\n"
945 "fmul v0.4s, v0.4s, v8.4s\n"
946 "fmul v1.4s, v1.4s, v8.4s\n"
947 "fmul v2.4s, v2.4s, v8.4s\n"
948 "fmul v3.4s, v3.4s, v8.4s\n"
949 "fcvtzs v0.4s, v0.4s\n"
950 "fcvtzs v1.4s, v1.4s\n"
951 "fcvtzs v2.4s, v2.4s\n"
952 "fcvtzs v3.4s, v3.4s\n"
953 "sqxtn v0.4h, v0.4s\n"
954 "sqxtn2 v0.8h, v1.4s\n"
955 "sqxtn v2.4h, v2.4s\n"
956 "sqxtn2 v2.8h, v3.4s\n"
957 "sqxtun v0.8b, v0.8h\n"
958 "sqxtun2 v0.16b, v2.8h\n"
959
960 "st1 {v0.4s}, [%x[output]], #16\n"
961 "prfm pldl1keep, [%x[output]]\n"
962
963 "bne 1b\n"
964 "2:"
965
966 // Handle leftovers.
967
968 // Requantize::Transform
969 "ld1 {v0.4s, v1.4s}, [%x[input]], #32\n"
970 "ld1 {v2.s}[0], [%x[input]], #4\n"
971 "prfm pldl1keep, [%x[input], #64]\n"
972 "scvtf v0.4s, v0.4s\n"
973 "scvtf v1.4s, v1.4s\n"
974 "scvtf v2.4s, v2.4s\n"
975 "fsub v0.4s, v0.4s, v6.4s\n"
976 "fsub v1.4s, v1.4s, v6.4s\n"
977 "fsub v2.4s, v2.4s, v6.4s\n"
978 "fmul v0.4s, v0.4s, v7.4s\n"
979 "fmul v1.4s, v1.4s, v7.4s\n"
980 "fmul v2.4s, v2.4s, v7.4s\n"
981 "fadd v0.4s, v0.4s, v4.4s\n"
982 "fadd v1.4s, v1.4s, v4.4s\n"
983 "fadd v2.4s, v2.4s, v4.4s\n"
984 "fmul v0.4s, v0.4s, v8.4s\n"
985 "fmul v1.4s, v1.4s, v8.4s\n"
986 "fmul v2.4s, v2.4s, v8.4s\n"
987 "fcvtzs v0.4s, v0.4s\n"
988 "fcvtzs v1.4s, v1.4s\n"
989 "fcvtzs v2.4s, v2.4s\n"
990 "sqxtn v0.4h, v0.4s\n"
991 "sqxtn2 v0.8h, v1.4s\n"
992 "sqxtn v2.4h, v2.4s\n"
993 "sqxtun v0.8b, v0.8h\n"
994 "sqxtun2 v0.16b, v2.8h\n"
995
996 "st1 {v0.2s}, [%x[output]], #8\n"
997 "st1 {v0.b}[8], [%x[output]], #1\n"
998 "prfm pldl1keep, [%x[output]]\n"
999 : [count] "+r"(params_count_copy), [input] "+r"(input),
1000 [output] "+r"(output)
1001 : [input_range_min] "r"(params.input_range_min),
1002 [output_range_min] "r"(params.output_range_min),
1003 [input_range_offset] "r"(params.input_range_offset),
1004 [one_over_output_range_scale] "r"(params.one_over_output_range_scale),
1005 [input_range_scale] "r"(params.input_range_scale)
1006 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "cc", "memory");
1007 }
1008
1009 template <>
Transform(const int32_t * input,const Requantize & params,uint8_t * output)1010 inline void Transform1DKernel<int32_t, uint8_t, Requantize, 16, 10>::Transform(
1011 const int32_t* input, const Requantize& params, uint8_t* output) {
1012 #ifdef DEBUG
1013 #ifdef DEBUG_METAGEMM_VERBOSE
1014 std::cout << __FILE__ << "(" << __LINE__
1015 << ") Requantize<int32_t, uint8_t, Requantize, 16, 10>::Transform()"
1016 << std::endl
1017 << std::flush;
1018 #endif
1019 #endif
1020 int params_count_copy = params.count;
1021 asm volatile(
1022
1023 // Requantize::Prepare
1024 "dup v4.4s, %w[input_range_min]\n"
1025 "dup v5.4s, %w[output_range_min]\n"
1026 "dup v6.4s, %w[input_range_offset]\n"
1027 "dup v7.4s, %w[input_range_scale]\n"
1028 "dup v8.4s, %w[one_over_output_range_scale]\n"
1029 "fsub v4.4s, v4.4s, v5.4s\n"
1030
1031 // Reduce count by leftovers.
1032 "subs %x[count], %x[count], #10\n"
1033 "beq 2f\n"
1034
1035 "1:"
1036 "subs %x[count], %x[count], #16\n"
1037
1038 // Requantize::Transform
1039 "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
1040 "prfm pldl1keep, [%x[input], #64]\n"
1041 "scvtf v0.4s, v0.4s\n"
1042 "scvtf v1.4s, v1.4s\n"
1043 "scvtf v2.4s, v2.4s\n"
1044 "scvtf v3.4s, v3.4s\n"
1045 "fsub v0.4s, v0.4s, v6.4s\n"
1046 "fsub v1.4s, v1.4s, v6.4s\n"
1047 "fsub v2.4s, v2.4s, v6.4s\n"
1048 "fsub v3.4s, v3.4s, v6.4s\n"
1049 "fmul v0.4s, v0.4s, v7.4s\n"
1050 "fmul v1.4s, v1.4s, v7.4s\n"
1051 "fmul v2.4s, v2.4s, v7.4s\n"
1052 "fmul v3.4s, v3.4s, v7.4s\n"
1053 "fadd v0.4s, v0.4s, v4.4s\n"
1054 "fadd v1.4s, v1.4s, v4.4s\n"
1055 "fadd v2.4s, v2.4s, v4.4s\n"
1056 "fadd v3.4s, v3.4s, v4.4s\n"
1057 "fmul v0.4s, v0.4s, v8.4s\n"
1058 "fmul v1.4s, v1.4s, v8.4s\n"
1059 "fmul v2.4s, v2.4s, v8.4s\n"
1060 "fmul v3.4s, v3.4s, v8.4s\n"
1061 "fcvtzs v0.4s, v0.4s\n"
1062 "fcvtzs v1.4s, v1.4s\n"
1063 "fcvtzs v2.4s, v2.4s\n"
1064 "fcvtzs v3.4s, v3.4s\n"
1065 "sqxtn v0.4h, v0.4s\n"
1066 "sqxtn2 v0.8h, v1.4s\n"
1067 "sqxtn v2.4h, v2.4s\n"
1068 "sqxtn2 v2.8h, v3.4s\n"
1069 "sqxtun v0.8b, v0.8h\n"
1070 "sqxtun2 v0.16b, v2.8h\n"
1071
1072 "st1 {v0.4s}, [%x[output]], #16\n"
1073 "prfm pldl1keep, [%x[output]]\n"
1074
1075 "bne 1b\n"
1076 "2:"
1077
1078 // Handle leftovers.
1079
1080 // Requantize::Transform
1081 "ld1 {v0.4s, v1.4s}, [%x[input]], #32\n"
1082 "ld1 {v2.2s}, [%x[input]], #8\n"
1083 "prfm pldl1keep, [%x[input], #64]\n"
1084 "scvtf v0.4s, v0.4s\n"
1085 "scvtf v1.4s, v1.4s\n"
1086 "scvtf v2.4s, v2.4s\n"
1087 "fsub v0.4s, v0.4s, v6.4s\n"
1088 "fsub v1.4s, v1.4s, v6.4s\n"
1089 "fsub v2.4s, v2.4s, v6.4s\n"
1090 "fmul v0.4s, v0.4s, v7.4s\n"
1091 "fmul v1.4s, v1.4s, v7.4s\n"
1092 "fmul v2.4s, v2.4s, v7.4s\n"
1093 "fadd v0.4s, v0.4s, v4.4s\n"
1094 "fadd v1.4s, v1.4s, v4.4s\n"
1095 "fadd v2.4s, v2.4s, v4.4s\n"
1096 "fmul v0.4s, v0.4s, v8.4s\n"
1097 "fmul v1.4s, v1.4s, v8.4s\n"
1098 "fmul v2.4s, v2.4s, v8.4s\n"
1099 "fcvtzs v0.4s, v0.4s\n"
1100 "fcvtzs v1.4s, v1.4s\n"
1101 "fcvtzs v2.4s, v2.4s\n"
1102 "sqxtn v0.4h, v0.4s\n"
1103 "sqxtn2 v0.8h, v1.4s\n"
1104 "sqxtn v2.4h, v2.4s\n"
1105 "sqxtun v0.8b, v0.8h\n"
1106 "sqxtun2 v0.16b, v2.8h\n"
1107
1108 "st1 {v0.2s}, [%x[output]], #8\n"
1109 "st1 {v0.h}[4], [%x[output]], #2\n"
1110 "prfm pldl1keep, [%x[output]]\n"
1111 : [count] "+r"(params_count_copy), [input] "+r"(input),
1112 [output] "+r"(output)
1113 : [input_range_min] "r"(params.input_range_min),
1114 [output_range_min] "r"(params.output_range_min),
1115 [input_range_offset] "r"(params.input_range_offset),
1116 [one_over_output_range_scale] "r"(params.one_over_output_range_scale),
1117 [input_range_scale] "r"(params.input_range_scale)
1118 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "cc", "memory");
1119 }
1120
1121 template <>
Transform(const int32_t * input,const Requantize & params,uint8_t * output)1122 inline void Transform1DKernel<int32_t, uint8_t, Requantize, 16, 11>::Transform(
1123 const int32_t* input, const Requantize& params, uint8_t* output) {
1124 #ifdef DEBUG
1125 #ifdef DEBUG_METAGEMM_VERBOSE
1126 std::cout << __FILE__ << "(" << __LINE__
1127 << ") Requantize<int32_t, uint8_t, Requantize, 16, 11>::Transform()"
1128 << std::endl
1129 << std::flush;
1130 #endif
1131 #endif
1132 int params_count_copy = params.count;
1133 asm volatile(
1134
1135 // Requantize::Prepare
1136 "dup v4.4s, %w[input_range_min]\n"
1137 "dup v5.4s, %w[output_range_min]\n"
1138 "dup v6.4s, %w[input_range_offset]\n"
1139 "dup v7.4s, %w[input_range_scale]\n"
1140 "dup v8.4s, %w[one_over_output_range_scale]\n"
1141 "fsub v4.4s, v4.4s, v5.4s\n"
1142
1143 // Reduce count by leftovers.
1144 "subs %x[count], %x[count], #11\n"
1145 "beq 2f\n"
1146
1147 "1:"
1148 "subs %x[count], %x[count], #16\n"
1149
1150 // Requantize::Transform
1151 "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
1152 "prfm pldl1keep, [%x[input], #64]\n"
1153 "scvtf v0.4s, v0.4s\n"
1154 "scvtf v1.4s, v1.4s\n"
1155 "scvtf v2.4s, v2.4s\n"
1156 "scvtf v3.4s, v3.4s\n"
1157 "fsub v0.4s, v0.4s, v6.4s\n"
1158 "fsub v1.4s, v1.4s, v6.4s\n"
1159 "fsub v2.4s, v2.4s, v6.4s\n"
1160 "fsub v3.4s, v3.4s, v6.4s\n"
1161 "fmul v0.4s, v0.4s, v7.4s\n"
1162 "fmul v1.4s, v1.4s, v7.4s\n"
1163 "fmul v2.4s, v2.4s, v7.4s\n"
1164 "fmul v3.4s, v3.4s, v7.4s\n"
1165 "fadd v0.4s, v0.4s, v4.4s\n"
1166 "fadd v1.4s, v1.4s, v4.4s\n"
1167 "fadd v2.4s, v2.4s, v4.4s\n"
1168 "fadd v3.4s, v3.4s, v4.4s\n"
1169 "fmul v0.4s, v0.4s, v8.4s\n"
1170 "fmul v1.4s, v1.4s, v8.4s\n"
1171 "fmul v2.4s, v2.4s, v8.4s\n"
1172 "fmul v3.4s, v3.4s, v8.4s\n"
1173 "fcvtzs v0.4s, v0.4s\n"
1174 "fcvtzs v1.4s, v1.4s\n"
1175 "fcvtzs v2.4s, v2.4s\n"
1176 "fcvtzs v3.4s, v3.4s\n"
1177 "sqxtn v0.4h, v0.4s\n"
1178 "sqxtn2 v0.8h, v1.4s\n"
1179 "sqxtn v2.4h, v2.4s\n"
1180 "sqxtn2 v2.8h, v3.4s\n"
1181 "sqxtun v0.8b, v0.8h\n"
1182 "sqxtun2 v0.16b, v2.8h\n"
1183
1184 "st1 {v0.4s}, [%x[output]], #16\n"
1185 "prfm pldl1keep, [%x[output]]\n"
1186
1187 "bne 1b\n"
1188 "2:"
1189
1190 // Handle leftovers.
1191
1192 // Requantize::Transform
1193 "ld1 {v0.4s, v1.4s}, [%x[input]], #32\n"
1194 "ld1 {v2.2s}, [%x[input]], #8\n"
1195 "ld1 {v2.s}[2], [%x[input]], #4\n"
1196 "prfm pldl1keep, [%x[input], #64]\n"
1197 "scvtf v0.4s, v0.4s\n"
1198 "scvtf v1.4s, v1.4s\n"
1199 "scvtf v2.4s, v2.4s\n"
1200 "fsub v0.4s, v0.4s, v6.4s\n"
1201 "fsub v1.4s, v1.4s, v6.4s\n"
1202 "fsub v2.4s, v2.4s, v6.4s\n"
1203 "fmul v0.4s, v0.4s, v7.4s\n"
1204 "fmul v1.4s, v1.4s, v7.4s\n"
1205 "fmul v2.4s, v2.4s, v7.4s\n"
1206 "fadd v0.4s, v0.4s, v4.4s\n"
1207 "fadd v1.4s, v1.4s, v4.4s\n"
1208 "fadd v2.4s, v2.4s, v4.4s\n"
1209 "fmul v0.4s, v0.4s, v8.4s\n"
1210 "fmul v1.4s, v1.4s, v8.4s\n"
1211 "fmul v2.4s, v2.4s, v8.4s\n"
1212 "fcvtzs v0.4s, v0.4s\n"
1213 "fcvtzs v1.4s, v1.4s\n"
1214 "fcvtzs v2.4s, v2.4s\n"
1215 "sqxtn v0.4h, v0.4s\n"
1216 "sqxtn2 v0.8h, v1.4s\n"
1217 "sqxtn v2.4h, v2.4s\n"
1218 "sqxtun v0.8b, v0.8h\n"
1219 "sqxtun2 v0.16b, v2.8h\n"
1220
1221 "st1 {v0.2s}, [%x[output]], #8\n"
1222 "st1 {v0.h}[4], [%x[output]], #2\n"
1223 "st1 {v0.b}[10], [%x[output]], #1\n"
1224 "prfm pldl1keep, [%x[output]]\n"
1225 : [count] "+r"(params_count_copy), [input] "+r"(input),
1226 [output] "+r"(output)
1227 : [input_range_min] "r"(params.input_range_min),
1228 [output_range_min] "r"(params.output_range_min),
1229 [input_range_offset] "r"(params.input_range_offset),
1230 [one_over_output_range_scale] "r"(params.one_over_output_range_scale),
1231 [input_range_scale] "r"(params.input_range_scale)
1232 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "cc", "memory");
1233 }
1234
1235 template <>
Transform(const int32_t * input,const Requantize & params,uint8_t * output)1236 inline void Transform1DKernel<int32_t, uint8_t, Requantize, 16, 12>::Transform(
1237 const int32_t* input, const Requantize& params, uint8_t* output) {
1238 #ifdef DEBUG
1239 #ifdef DEBUG_METAGEMM_VERBOSE
1240 std::cout << __FILE__ << "(" << __LINE__
1241 << ") Requantize<int32_t, uint8_t, Requantize, 16, 12>::Transform()"
1242 << std::endl
1243 << std::flush;
1244 #endif
1245 #endif
1246 int params_count_copy = params.count;
1247 asm volatile(
1248
1249 // Requantize::Prepare
1250 "dup v4.4s, %w[input_range_min]\n"
1251 "dup v5.4s, %w[output_range_min]\n"
1252 "dup v6.4s, %w[input_range_offset]\n"
1253 "dup v7.4s, %w[input_range_scale]\n"
1254 "dup v8.4s, %w[one_over_output_range_scale]\n"
1255 "fsub v4.4s, v4.4s, v5.4s\n"
1256
1257 // Reduce count by leftovers.
1258 "subs %x[count], %x[count], #12\n"
1259 "beq 2f\n"
1260
1261 "1:"
1262 "subs %x[count], %x[count], #16\n"
1263
1264 // Requantize::Transform
1265 "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
1266 "prfm pldl1keep, [%x[input], #64]\n"
1267 "scvtf v0.4s, v0.4s\n"
1268 "scvtf v1.4s, v1.4s\n"
1269 "scvtf v2.4s, v2.4s\n"
1270 "scvtf v3.4s, v3.4s\n"
1271 "fsub v0.4s, v0.4s, v6.4s\n"
1272 "fsub v1.4s, v1.4s, v6.4s\n"
1273 "fsub v2.4s, v2.4s, v6.4s\n"
1274 "fsub v3.4s, v3.4s, v6.4s\n"
1275 "fmul v0.4s, v0.4s, v7.4s\n"
1276 "fmul v1.4s, v1.4s, v7.4s\n"
1277 "fmul v2.4s, v2.4s, v7.4s\n"
1278 "fmul v3.4s, v3.4s, v7.4s\n"
1279 "fadd v0.4s, v0.4s, v4.4s\n"
1280 "fadd v1.4s, v1.4s, v4.4s\n"
1281 "fadd v2.4s, v2.4s, v4.4s\n"
1282 "fadd v3.4s, v3.4s, v4.4s\n"
1283 "fmul v0.4s, v0.4s, v8.4s\n"
1284 "fmul v1.4s, v1.4s, v8.4s\n"
1285 "fmul v2.4s, v2.4s, v8.4s\n"
1286 "fmul v3.4s, v3.4s, v8.4s\n"
1287 "fcvtzs v0.4s, v0.4s\n"
1288 "fcvtzs v1.4s, v1.4s\n"
1289 "fcvtzs v2.4s, v2.4s\n"
1290 "fcvtzs v3.4s, v3.4s\n"
1291 "sqxtn v0.4h, v0.4s\n"
1292 "sqxtn2 v0.8h, v1.4s\n"
1293 "sqxtn v2.4h, v2.4s\n"
1294 "sqxtn2 v2.8h, v3.4s\n"
1295 "sqxtun v0.8b, v0.8h\n"
1296 "sqxtun2 v0.16b, v2.8h\n"
1297
1298 "st1 {v0.4s}, [%x[output]], #16\n"
1299 "prfm pldl1keep, [%x[output]]\n"
1300
1301 "bne 1b\n"
1302 "2:"
1303
1304 // Handle leftovers.
1305
1306 // Requantize::Transform
1307 "ld1 {v0.4s, v1.4s, v2.4s}, [%x[input]], #48\n"
1308 "prfm pldl1keep, [%x[input], #64]\n"
1309 "scvtf v0.4s, v0.4s\n"
1310 "scvtf v1.4s, v1.4s\n"
1311 "scvtf v2.4s, v2.4s\n"
1312 "fsub v0.4s, v0.4s, v6.4s\n"
1313 "fsub v1.4s, v1.4s, v6.4s\n"
1314 "fsub v2.4s, v2.4s, v6.4s\n"
1315 "fmul v0.4s, v0.4s, v7.4s\n"
1316 "fmul v1.4s, v1.4s, v7.4s\n"
1317 "fmul v2.4s, v2.4s, v7.4s\n"
1318 "fadd v0.4s, v0.4s, v4.4s\n"
1319 "fadd v1.4s, v1.4s, v4.4s\n"
1320 "fadd v2.4s, v2.4s, v4.4s\n"
1321 "fmul v0.4s, v0.4s, v8.4s\n"
1322 "fmul v1.4s, v1.4s, v8.4s\n"
1323 "fmul v2.4s, v2.4s, v8.4s\n"
1324 "fcvtzs v0.4s, v0.4s\n"
1325 "fcvtzs v1.4s, v1.4s\n"
1326 "fcvtzs v2.4s, v2.4s\n"
1327 "sqxtn v0.4h, v0.4s\n"
1328 "sqxtn2 v0.8h, v1.4s\n"
1329 "sqxtn v2.4h, v2.4s\n"
1330 "sqxtun v0.8b, v0.8h\n"
1331 "sqxtun2 v0.16b, v2.8h\n"
1332
1333 "st1 {v0.2s}, [%x[output]], #8\n"
1334 "st1 {v0.s}[2], [%x[output]], #4\n"
1335 "prfm pldl1keep, [%x[output]]\n"
1336 : [count] "+r"(params_count_copy), [input] "+r"(input),
1337 [output] "+r"(output)
1338 : [input_range_min] "r"(params.input_range_min),
1339 [output_range_min] "r"(params.output_range_min),
1340 [input_range_offset] "r"(params.input_range_offset),
1341 [one_over_output_range_scale] "r"(params.one_over_output_range_scale),
1342 [input_range_scale] "r"(params.input_range_scale)
1343 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "cc", "memory");
1344 }
1345
1346 template <>
Transform(const int32_t * input,const Requantize & params,uint8_t * output)1347 inline void Transform1DKernel<int32_t, uint8_t, Requantize, 16, 13>::Transform(
1348 const int32_t* input, const Requantize& params, uint8_t* output) {
1349 #ifdef DEBUG
1350 #ifdef DEBUG_METAGEMM_VERBOSE
1351 std::cout << __FILE__ << "(" << __LINE__
1352 << ") Requantize<int32_t, uint8_t, Requantize, 16, 13>::Transform()"
1353 << std::endl
1354 << std::flush;
1355 #endif
1356 #endif
1357 int params_count_copy = params.count;
1358 asm volatile(
1359
1360 // Requantize::Prepare
1361 "dup v4.4s, %w[input_range_min]\n"
1362 "dup v5.4s, %w[output_range_min]\n"
1363 "dup v6.4s, %w[input_range_offset]\n"
1364 "dup v7.4s, %w[input_range_scale]\n"
1365 "dup v8.4s, %w[one_over_output_range_scale]\n"
1366 "fsub v4.4s, v4.4s, v5.4s\n"
1367
1368 // Reduce count by leftovers.
1369 "subs %x[count], %x[count], #13\n"
1370 "beq 2f\n"
1371
1372 "1:"
1373 "subs %x[count], %x[count], #16\n"
1374
1375 // Requantize::Transform
1376 "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
1377 "prfm pldl1keep, [%x[input], #64]\n"
1378 "scvtf v0.4s, v0.4s\n"
1379 "scvtf v1.4s, v1.4s\n"
1380 "scvtf v2.4s, v2.4s\n"
1381 "scvtf v3.4s, v3.4s\n"
1382 "fsub v0.4s, v0.4s, v6.4s\n"
1383 "fsub v1.4s, v1.4s, v6.4s\n"
1384 "fsub v2.4s, v2.4s, v6.4s\n"
1385 "fsub v3.4s, v3.4s, v6.4s\n"
1386 "fmul v0.4s, v0.4s, v7.4s\n"
1387 "fmul v1.4s, v1.4s, v7.4s\n"
1388 "fmul v2.4s, v2.4s, v7.4s\n"
1389 "fmul v3.4s, v3.4s, v7.4s\n"
1390 "fadd v0.4s, v0.4s, v4.4s\n"
1391 "fadd v1.4s, v1.4s, v4.4s\n"
1392 "fadd v2.4s, v2.4s, v4.4s\n"
1393 "fadd v3.4s, v3.4s, v4.4s\n"
1394 "fmul v0.4s, v0.4s, v8.4s\n"
1395 "fmul v1.4s, v1.4s, v8.4s\n"
1396 "fmul v2.4s, v2.4s, v8.4s\n"
1397 "fmul v3.4s, v3.4s, v8.4s\n"
1398 "fcvtzs v0.4s, v0.4s\n"
1399 "fcvtzs v1.4s, v1.4s\n"
1400 "fcvtzs v2.4s, v2.4s\n"
1401 "fcvtzs v3.4s, v3.4s\n"
1402 "sqxtn v0.4h, v0.4s\n"
1403 "sqxtn2 v0.8h, v1.4s\n"
1404 "sqxtn v2.4h, v2.4s\n"
1405 "sqxtn2 v2.8h, v3.4s\n"
1406 "sqxtun v0.8b, v0.8h\n"
1407 "sqxtun2 v0.16b, v2.8h\n"
1408
1409 "st1 {v0.4s}, [%x[output]], #16\n"
1410 "prfm pldl1keep, [%x[output]]\n"
1411
1412 "bne 1b\n"
1413 "2:"
1414
1415 // Handle leftovers.
1416
1417 // Requantize::Transform
1418 "ld1 {v0.4s, v1.4s, v2.4s}, [%x[input]], #48\n"
1419 "ld1 {v3.s}[0], [%x[input]], #4\n"
1420 "prfm pldl1keep, [%x[input], #64]\n"
1421 "scvtf v0.4s, v0.4s\n"
1422 "scvtf v1.4s, v1.4s\n"
1423 "scvtf v2.4s, v2.4s\n"
1424 "scvtf v3.4s, v3.4s\n"
1425 "fsub v0.4s, v0.4s, v6.4s\n"
1426 "fsub v1.4s, v1.4s, v6.4s\n"
1427 "fsub v2.4s, v2.4s, v6.4s\n"
1428 "fsub v3.4s, v3.4s, v6.4s\n"
1429 "fmul v0.4s, v0.4s, v7.4s\n"
1430 "fmul v1.4s, v1.4s, v7.4s\n"
1431 "fmul v2.4s, v2.4s, v7.4s\n"
1432 "fmul v3.4s, v3.4s, v7.4s\n"
1433 "fadd v0.4s, v0.4s, v4.4s\n"
1434 "fadd v1.4s, v1.4s, v4.4s\n"
1435 "fadd v2.4s, v2.4s, v4.4s\n"
1436 "fadd v3.4s, v3.4s, v4.4s\n"
1437 "fmul v0.4s, v0.4s, v8.4s\n"
1438 "fmul v1.4s, v1.4s, v8.4s\n"
1439 "fmul v2.4s, v2.4s, v8.4s\n"
1440 "fmul v3.4s, v3.4s, v8.4s\n"
1441 "fcvtzs v0.4s, v0.4s\n"
1442 "fcvtzs v1.4s, v1.4s\n"
1443 "fcvtzs v2.4s, v2.4s\n"
1444 "fcvtzs v3.4s, v3.4s\n"
1445 "sqxtn v0.4h, v0.4s\n"
1446 "sqxtn2 v0.8h, v1.4s\n"
1447 "sqxtn v2.4h, v2.4s\n"
1448 "sqxtn2 v2.8h, v3.4s\n"
1449 "sqxtun v0.8b, v0.8h\n"
1450 "sqxtun2 v0.16b, v2.8h\n"
1451
1452 "st1 {v0.2s}, [%x[output]], #8\n"
1453 "st1 {v0.s}[2], [%x[output]], #4\n"
1454 "st1 {v0.b}[12], [%x[output]], #1\n"
1455 "prfm pldl1keep, [%x[output]]\n"
1456 : [count] "+r"(params_count_copy), [input] "+r"(input),
1457 [output] "+r"(output)
1458 : [input_range_min] "r"(params.input_range_min),
1459 [output_range_min] "r"(params.output_range_min),
1460 [input_range_offset] "r"(params.input_range_offset),
1461 [one_over_output_range_scale] "r"(params.one_over_output_range_scale),
1462 [input_range_scale] "r"(params.input_range_scale)
1463 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "cc", "memory");
1464 }
1465
1466 template <>
Transform(const int32_t * input,const Requantize & params,uint8_t * output)1467 inline void Transform1DKernel<int32_t, uint8_t, Requantize, 16, 14>::Transform(
1468 const int32_t* input, const Requantize& params, uint8_t* output) {
1469 #ifdef DEBUG
1470 #ifdef DEBUG_METAGEMM_VERBOSE
1471 std::cout << __FILE__ << "(" << __LINE__
1472 << ") Requantize<int32_t, uint8_t, Requantize, 16, 14>::Transform()"
1473 << std::endl
1474 << std::flush;
1475 #endif
1476 #endif
1477 int params_count_copy = params.count;
1478 asm volatile(
1479
1480 // Requantize::Prepare
1481 "dup v4.4s, %w[input_range_min]\n"
1482 "dup v5.4s, %w[output_range_min]\n"
1483 "dup v6.4s, %w[input_range_offset]\n"
1484 "dup v7.4s, %w[input_range_scale]\n"
1485 "dup v8.4s, %w[one_over_output_range_scale]\n"
1486 "fsub v4.4s, v4.4s, v5.4s\n"
1487
1488 // Reduce count by leftovers.
1489 "subs %x[count], %x[count], #14\n"
1490 "beq 2f\n"
1491
1492 "1:"
1493 "subs %x[count], %x[count], #16\n"
1494
1495 // Requantize::Transform
1496 "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
1497 "prfm pldl1keep, [%x[input], #64]\n"
1498 "scvtf v0.4s, v0.4s\n"
1499 "scvtf v1.4s, v1.4s\n"
1500 "scvtf v2.4s, v2.4s\n"
1501 "scvtf v3.4s, v3.4s\n"
1502 "fsub v0.4s, v0.4s, v6.4s\n"
1503 "fsub v1.4s, v1.4s, v6.4s\n"
1504 "fsub v2.4s, v2.4s, v6.4s\n"
1505 "fsub v3.4s, v3.4s, v6.4s\n"
1506 "fmul v0.4s, v0.4s, v7.4s\n"
1507 "fmul v1.4s, v1.4s, v7.4s\n"
1508 "fmul v2.4s, v2.4s, v7.4s\n"
1509 "fmul v3.4s, v3.4s, v7.4s\n"
1510 "fadd v0.4s, v0.4s, v4.4s\n"
1511 "fadd v1.4s, v1.4s, v4.4s\n"
1512 "fadd v2.4s, v2.4s, v4.4s\n"
1513 "fadd v3.4s, v3.4s, v4.4s\n"
1514 "fmul v0.4s, v0.4s, v8.4s\n"
1515 "fmul v1.4s, v1.4s, v8.4s\n"
1516 "fmul v2.4s, v2.4s, v8.4s\n"
1517 "fmul v3.4s, v3.4s, v8.4s\n"
1518 "fcvtzs v0.4s, v0.4s\n"
1519 "fcvtzs v1.4s, v1.4s\n"
1520 "fcvtzs v2.4s, v2.4s\n"
1521 "fcvtzs v3.4s, v3.4s\n"
1522 "sqxtn v0.4h, v0.4s\n"
1523 "sqxtn2 v0.8h, v1.4s\n"
1524 "sqxtn v2.4h, v2.4s\n"
1525 "sqxtn2 v2.8h, v3.4s\n"
1526 "sqxtun v0.8b, v0.8h\n"
1527 "sqxtun2 v0.16b, v2.8h\n"
1528
1529 "st1 {v0.4s}, [%x[output]], #16\n"
1530 "prfm pldl1keep, [%x[output]]\n"
1531
1532 "bne 1b\n"
1533 "2:"
1534
1535 // Handle leftovers.
1536
1537 // Requantize::Transform
1538 "ld1 {v0.4s, v1.4s, v2.4s}, [%x[input]], #48\n"
1539 "ld1 {v3.2s}, [%x[input]], #8\n"
1540 "prfm pldl1keep, [%x[input], #64]\n"
1541 "scvtf v0.4s, v0.4s\n"
1542 "scvtf v1.4s, v1.4s\n"
1543 "scvtf v2.4s, v2.4s\n"
1544 "scvtf v3.4s, v3.4s\n"
1545 "fsub v0.4s, v0.4s, v6.4s\n"
1546 "fsub v1.4s, v1.4s, v6.4s\n"
1547 "fsub v2.4s, v2.4s, v6.4s\n"
1548 "fsub v3.4s, v3.4s, v6.4s\n"
1549 "fmul v0.4s, v0.4s, v7.4s\n"
1550 "fmul v1.4s, v1.4s, v7.4s\n"
1551 "fmul v2.4s, v2.4s, v7.4s\n"
1552 "fmul v3.4s, v3.4s, v7.4s\n"
1553 "fadd v0.4s, v0.4s, v4.4s\n"
1554 "fadd v1.4s, v1.4s, v4.4s\n"
1555 "fadd v2.4s, v2.4s, v4.4s\n"
1556 "fadd v3.4s, v3.4s, v4.4s\n"
1557 "fmul v0.4s, v0.4s, v8.4s\n"
1558 "fmul v1.4s, v1.4s, v8.4s\n"
1559 "fmul v2.4s, v2.4s, v8.4s\n"
1560 "fmul v3.4s, v3.4s, v8.4s\n"
1561 "fcvtzs v0.4s, v0.4s\n"
1562 "fcvtzs v1.4s, v1.4s\n"
1563 "fcvtzs v2.4s, v2.4s\n"
1564 "fcvtzs v3.4s, v3.4s\n"
1565 "sqxtn v0.4h, v0.4s\n"
1566 "sqxtn2 v0.8h, v1.4s\n"
1567 "sqxtn v2.4h, v2.4s\n"
1568 "sqxtn2 v2.8h, v3.4s\n"
1569 "sqxtun v0.8b, v0.8h\n"
1570 "sqxtun2 v0.16b, v2.8h\n"
1571
1572 "st1 {v0.2s}, [%x[output]], #8\n"
1573 "st1 {v0.s}[2], [%x[output]], #4\n"
1574 "st1 {v0.h}[6], [%x[output]], #2\n"
1575 "prfm pldl1keep, [%x[output]]\n"
1576 : [count] "+r"(params_count_copy), [input] "+r"(input),
1577 [output] "+r"(output)
1578 : [input_range_min] "r"(params.input_range_min),
1579 [output_range_min] "r"(params.output_range_min),
1580 [input_range_offset] "r"(params.input_range_offset),
1581 [one_over_output_range_scale] "r"(params.one_over_output_range_scale),
1582 [input_range_scale] "r"(params.input_range_scale)
1583 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "cc", "memory");
1584 }
1585
1586 template <>
Transform(const int32_t * input,const Requantize & params,uint8_t * output)1587 inline void Transform1DKernel<int32_t, uint8_t, Requantize, 16, 15>::Transform(
1588 const int32_t* input, const Requantize& params, uint8_t* output) {
1589 #ifdef DEBUG
1590 #ifdef DEBUG_METAGEMM_VERBOSE
1591 std::cout << __FILE__ << "(" << __LINE__
1592 << ") Requantize<int32_t, uint8_t, Requantize, 16, 15>::Transform()"
1593 << std::endl
1594 << std::flush;
1595 #endif
1596 #endif
1597 int params_count_copy = params.count;
1598 asm volatile(
1599
1600 // Requantize::Prepare
1601 "dup v4.4s, %w[input_range_min]\n"
1602 "dup v5.4s, %w[output_range_min]\n"
1603 "dup v6.4s, %w[input_range_offset]\n"
1604 "dup v7.4s, %w[input_range_scale]\n"
1605 "dup v8.4s, %w[one_over_output_range_scale]\n"
1606 "fsub v4.4s, v4.4s, v5.4s\n"
1607
1608 // Reduce count by leftovers.
1609 "subs %x[count], %x[count], #15\n"
1610 "beq 2f\n"
1611
1612 "1:"
1613 "subs %x[count], %x[count], #16\n"
1614
1615 // Requantize::Transform
1616 "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
1617 "prfm pldl1keep, [%x[input], #64]\n"
1618 "scvtf v0.4s, v0.4s\n"
1619 "scvtf v1.4s, v1.4s\n"
1620 "scvtf v2.4s, v2.4s\n"
1621 "scvtf v3.4s, v3.4s\n"
1622 "fsub v0.4s, v0.4s, v6.4s\n"
1623 "fsub v1.4s, v1.4s, v6.4s\n"
1624 "fsub v2.4s, v2.4s, v6.4s\n"
1625 "fsub v3.4s, v3.4s, v6.4s\n"
1626 "fmul v0.4s, v0.4s, v7.4s\n"
1627 "fmul v1.4s, v1.4s, v7.4s\n"
1628 "fmul v2.4s, v2.4s, v7.4s\n"
1629 "fmul v3.4s, v3.4s, v7.4s\n"
1630 "fadd v0.4s, v0.4s, v4.4s\n"
1631 "fadd v1.4s, v1.4s, v4.4s\n"
1632 "fadd v2.4s, v2.4s, v4.4s\n"
1633 "fadd v3.4s, v3.4s, v4.4s\n"
1634 "fmul v0.4s, v0.4s, v8.4s\n"
1635 "fmul v1.4s, v1.4s, v8.4s\n"
1636 "fmul v2.4s, v2.4s, v8.4s\n"
1637 "fmul v3.4s, v3.4s, v8.4s\n"
1638 "fcvtzs v0.4s, v0.4s\n"
1639 "fcvtzs v1.4s, v1.4s\n"
1640 "fcvtzs v2.4s, v2.4s\n"
1641 "fcvtzs v3.4s, v3.4s\n"
1642 "sqxtn v0.4h, v0.4s\n"
1643 "sqxtn2 v0.8h, v1.4s\n"
1644 "sqxtn v2.4h, v2.4s\n"
1645 "sqxtn2 v2.8h, v3.4s\n"
1646 "sqxtun v0.8b, v0.8h\n"
1647 "sqxtun2 v0.16b, v2.8h\n"
1648
1649 "st1 {v0.4s}, [%x[output]], #16\n"
1650 "prfm pldl1keep, [%x[output]]\n"
1651
1652 "bne 1b\n"
1653 "2:"
1654
1655 // Handle leftovers.
1656
1657 // Requantize::Transform
1658 "ld1 {v0.4s, v1.4s, v2.4s}, [%x[input]], #48\n"
1659 "ld1 {v3.2s}, [%x[input]], #8\n"
1660 "ld1 {v3.s}[2], [%x[input]], #4\n"
1661 "prfm pldl1keep, [%x[input], #64]\n"
1662 "scvtf v0.4s, v0.4s\n"
1663 "scvtf v1.4s, v1.4s\n"
1664 "scvtf v2.4s, v2.4s\n"
1665 "scvtf v3.4s, v3.4s\n"
1666 "fsub v0.4s, v0.4s, v6.4s\n"
1667 "fsub v1.4s, v1.4s, v6.4s\n"
1668 "fsub v2.4s, v2.4s, v6.4s\n"
1669 "fsub v3.4s, v3.4s, v6.4s\n"
1670 "fmul v0.4s, v0.4s, v7.4s\n"
1671 "fmul v1.4s, v1.4s, v7.4s\n"
1672 "fmul v2.4s, v2.4s, v7.4s\n"
1673 "fmul v3.4s, v3.4s, v7.4s\n"
1674 "fadd v0.4s, v0.4s, v4.4s\n"
1675 "fadd v1.4s, v1.4s, v4.4s\n"
1676 "fadd v2.4s, v2.4s, v4.4s\n"
1677 "fadd v3.4s, v3.4s, v4.4s\n"
1678 "fmul v0.4s, v0.4s, v8.4s\n"
1679 "fmul v1.4s, v1.4s, v8.4s\n"
1680 "fmul v2.4s, v2.4s, v8.4s\n"
1681 "fmul v3.4s, v3.4s, v8.4s\n"
1682 "fcvtzs v0.4s, v0.4s\n"
1683 "fcvtzs v1.4s, v1.4s\n"
1684 "fcvtzs v2.4s, v2.4s\n"
1685 "fcvtzs v3.4s, v3.4s\n"
1686 "sqxtn v0.4h, v0.4s\n"
1687 "sqxtn2 v0.8h, v1.4s\n"
1688 "sqxtn v2.4h, v2.4s\n"
1689 "sqxtn2 v2.8h, v3.4s\n"
1690 "sqxtun v0.8b, v0.8h\n"
1691 "sqxtun2 v0.16b, v2.8h\n"
1692
1693 "st1 {v0.2s}, [%x[output]], #8\n"
1694 "st1 {v0.s}[2], [%x[output]], #4\n"
1695 "st1 {v0.h}[6], [%x[output]], #2\n"
1696 "st1 {v0.b}[14], [%x[output]], #1\n"
1697 "prfm pldl1keep, [%x[output]]\n"
1698 : [count] "+r"(params_count_copy), [input] "+r"(input),
1699 [output] "+r"(output)
1700 : [input_range_min] "r"(params.input_range_min),
1701 [output_range_min] "r"(params.output_range_min),
1702 [input_range_offset] "r"(params.input_range_offset),
1703 [one_over_output_range_scale] "r"(params.one_over_output_range_scale),
1704 [input_range_scale] "r"(params.input_range_scale)
1705 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "cc", "memory");
1706 }
1707
1708 template <>
Transform(const float * input,const Quantize & params,uint8_t * output)1709 inline void Transform1DKernel<float, uint8_t, Quantize, 16, 0>::Transform(
1710 const float* input, const Quantize& params, uint8_t* output) {
1711 #ifdef DEBUG
1712 #ifdef DEBUG_METAGEMM_VERBOSE
1713 std::cout << __FILE__ << "(" << __LINE__
1714 << ") Quantize<float, uint8_t, Quantize, 16, 0>::Transform()"
1715 << std::endl
1716 << std::flush;
1717 #endif
1718 #endif
1719 int params_count_copy = params.count;
1720 asm volatile(
1721
1722 // Quantize::Prepare
1723 "dup v4.4s, %w[range_min]\n"
1724 "dup v5.4s, %w[range_offset]\n"
1725 "dup v6.4s, %w[range_scale]\n"
1726
1727 "1:"
1728 "subs %x[count], %x[count], #16\n"
1729
1730 // Quantize::Transform
1731 "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
1732 "prfm pldl1keep, [%x[input], #64]\n"
1733 "fsub v0.4s, v0.4s, v4.4s\n"
1734 "fsub v1.4s, v1.4s, v4.4s\n"
1735 "fsub v2.4s, v2.4s, v4.4s\n"
1736 "fsub v3.4s, v3.4s, v4.4s\n"
1737 "fmul v0.4s, v0.4s, v6.4s\n"
1738 "fmul v1.4s, v1.4s, v6.4s\n"
1739 "fmul v2.4s, v2.4s, v6.4s\n"
1740 "fmul v3.4s, v3.4s, v6.4s\n"
1741 "fadd v0.4s, v0.4s, v5.4s\n"
1742 "fadd v1.4s, v1.4s, v5.4s\n"
1743 "fadd v2.4s, v2.4s, v5.4s\n"
1744 "fadd v3.4s, v3.4s, v5.4s\n"
1745 "fcvtzs v0.4s, v0.4s\n"
1746 "fcvtzs v1.4s, v1.4s\n"
1747 "fcvtzs v2.4s, v2.4s\n"
1748 "fcvtzs v3.4s, v3.4s\n"
1749 "sqxtn v0.4h, v0.4s\n"
1750 "sqxtn2 v0.8h, v1.4s\n"
1751 "sqxtn v2.4h, v2.4s\n"
1752 "sqxtn2 v2.8h, v3.4s\n"
1753 "sqxtun v0.8b, v0.8h\n"
1754 "sqxtun2 v0.16b, v2.8h\n"
1755
1756 "st1 {v0.4s}, [%x[output]], #16\n"
1757 "prfm pldl1keep, [%x[output]]\n"
1758
1759 "bne 1b\n"
1760 : [count] "+r"(params_count_copy), [input] "+r"(input),
1761 [output] "+r"(output)
1762 : [range_offset] "r"(params.range_offset),
1763 [range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
1764 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
1765 }
1766
1767 template <>
Transform(const float * input,const Quantize & params,uint8_t * output)1768 inline void Transform1DKernel<float, uint8_t, Quantize, 16, 1>::Transform(
1769 const float* input, const Quantize& params, uint8_t* output) {
1770 #ifdef DEBUG
1771 #ifdef DEBUG_METAGEMM_VERBOSE
1772 std::cout << __FILE__ << "(" << __LINE__
1773 << ") Quantize<float, uint8_t, Quantize, 16, 1>::Transform()"
1774 << std::endl
1775 << std::flush;
1776 #endif
1777 #endif
1778 int params_count_copy = params.count;
1779 asm volatile(
1780
1781 // Quantize::Prepare
1782 "dup v4.4s, %w[range_min]\n"
1783 "dup v5.4s, %w[range_offset]\n"
1784 "dup v6.4s, %w[range_scale]\n"
1785
1786 // Reduce count by leftovers.
1787 "subs %x[count], %x[count], #1\n"
1788 "beq 2f\n"
1789
1790 "1:"
1791 "subs %x[count], %x[count], #16\n"
1792
1793 // Quantize::Transform
1794 "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
1795 "prfm pldl1keep, [%x[input], #64]\n"
1796 "fsub v0.4s, v0.4s, v4.4s\n"
1797 "fsub v1.4s, v1.4s, v4.4s\n"
1798 "fsub v2.4s, v2.4s, v4.4s\n"
1799 "fsub v3.4s, v3.4s, v4.4s\n"
1800 "fmul v0.4s, v0.4s, v6.4s\n"
1801 "fmul v1.4s, v1.4s, v6.4s\n"
1802 "fmul v2.4s, v2.4s, v6.4s\n"
1803 "fmul v3.4s, v3.4s, v6.4s\n"
1804 "fadd v0.4s, v0.4s, v5.4s\n"
1805 "fadd v1.4s, v1.4s, v5.4s\n"
1806 "fadd v2.4s, v2.4s, v5.4s\n"
1807 "fadd v3.4s, v3.4s, v5.4s\n"
1808 "fcvtzs v0.4s, v0.4s\n"
1809 "fcvtzs v1.4s, v1.4s\n"
1810 "fcvtzs v2.4s, v2.4s\n"
1811 "fcvtzs v3.4s, v3.4s\n"
1812 "sqxtn v0.4h, v0.4s\n"
1813 "sqxtn2 v0.8h, v1.4s\n"
1814 "sqxtn v2.4h, v2.4s\n"
1815 "sqxtn2 v2.8h, v3.4s\n"
1816 "sqxtun v0.8b, v0.8h\n"
1817 "sqxtun2 v0.16b, v2.8h\n"
1818
1819 "st1 {v0.4s}, [%x[output]], #16\n"
1820 "prfm pldl1keep, [%x[output]]\n"
1821
1822 "bne 1b\n"
1823 "2:"
1824
1825 // Handle leftovers.
1826
1827 // Quantize::Transform
1828 "ld1 {v0.s}[0], [%x[input]], #4\n"
1829 "prfm pldl1keep, [%x[input], #64]\n"
1830 "fsub v0.4s, v0.4s, v4.4s\n"
1831 "fmul v0.4s, v0.4s, v6.4s\n"
1832 "fadd v0.4s, v0.4s, v5.4s\n"
1833 "fcvtzs v0.4s, v0.4s\n"
1834 "sqxtn v0.4h, v0.4s\n"
1835 "sqxtun v0.8b, v0.8h\n"
1836
1837 "st1 {v0.b}[0], [%x[output]], #1\n"
1838 "prfm pldl1keep, [%x[output]]\n"
1839 : [count] "+r"(params_count_copy), [input] "+r"(input),
1840 [output] "+r"(output)
1841 : [range_offset] "r"(params.range_offset),
1842 [range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
1843 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
1844 }
1845
1846 template <>
Transform(const float * input,const Quantize & params,uint8_t * output)1847 inline void Transform1DKernel<float, uint8_t, Quantize, 16, 2>::Transform(
1848 const float* input, const Quantize& params, uint8_t* output) {
1849 #ifdef DEBUG
1850 #ifdef DEBUG_METAGEMM_VERBOSE
1851 std::cout << __FILE__ << "(" << __LINE__
1852 << ") Quantize<float, uint8_t, Quantize, 16, 2>::Transform()"
1853 << std::endl
1854 << std::flush;
1855 #endif
1856 #endif
1857 int params_count_copy = params.count;
1858 asm volatile(
1859
1860 // Quantize::Prepare
1861 "dup v4.4s, %w[range_min]\n"
1862 "dup v5.4s, %w[range_offset]\n"
1863 "dup v6.4s, %w[range_scale]\n"
1864
1865 // Reduce count by leftovers.
1866 "subs %x[count], %x[count], #2\n"
1867 "beq 2f\n"
1868
1869 "1:"
1870 "subs %x[count], %x[count], #16\n"
1871
1872 // Quantize::Transform
1873 "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
1874 "prfm pldl1keep, [%x[input], #64]\n"
1875 "fsub v0.4s, v0.4s, v4.4s\n"
1876 "fsub v1.4s, v1.4s, v4.4s\n"
1877 "fsub v2.4s, v2.4s, v4.4s\n"
1878 "fsub v3.4s, v3.4s, v4.4s\n"
1879 "fmul v0.4s, v0.4s, v6.4s\n"
1880 "fmul v1.4s, v1.4s, v6.4s\n"
1881 "fmul v2.4s, v2.4s, v6.4s\n"
1882 "fmul v3.4s, v3.4s, v6.4s\n"
1883 "fadd v0.4s, v0.4s, v5.4s\n"
1884 "fadd v1.4s, v1.4s, v5.4s\n"
1885 "fadd v2.4s, v2.4s, v5.4s\n"
1886 "fadd v3.4s, v3.4s, v5.4s\n"
1887 "fcvtzs v0.4s, v0.4s\n"
1888 "fcvtzs v1.4s, v1.4s\n"
1889 "fcvtzs v2.4s, v2.4s\n"
1890 "fcvtzs v3.4s, v3.4s\n"
1891 "sqxtn v0.4h, v0.4s\n"
1892 "sqxtn2 v0.8h, v1.4s\n"
1893 "sqxtn v2.4h, v2.4s\n"
1894 "sqxtn2 v2.8h, v3.4s\n"
1895 "sqxtun v0.8b, v0.8h\n"
1896 "sqxtun2 v0.16b, v2.8h\n"
1897
1898 "st1 {v0.4s}, [%x[output]], #16\n"
1899 "prfm pldl1keep, [%x[output]]\n"
1900
1901 "bne 1b\n"
1902 "2:"
1903
1904 // Handle leftovers.
1905
1906 // Quantize::Transform
1907 "ld1 {v0.2s}, [%x[input]], #8\n"
1908 "prfm pldl1keep, [%x[input], #64]\n"
1909 "fsub v0.4s, v0.4s, v4.4s\n"
1910 "fmul v0.4s, v0.4s, v6.4s\n"
1911 "fadd v0.4s, v0.4s, v5.4s\n"
1912 "fcvtzs v0.4s, v0.4s\n"
1913 "sqxtn v0.4h, v0.4s\n"
1914 "sqxtun v0.8b, v0.8h\n"
1915
1916 "st1 {v0.h}[0], [%x[output]], #2\n"
1917 "prfm pldl1keep, [%x[output]]\n"
1918 : [count] "+r"(params_count_copy), [input] "+r"(input),
1919 [output] "+r"(output)
1920 : [range_offset] "r"(params.range_offset),
1921 [range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
1922 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
1923 }
1924
1925 template <>
Transform(const float * input,const Quantize & params,uint8_t * output)1926 inline void Transform1DKernel<float, uint8_t, Quantize, 16, 3>::Transform(
1927 const float* input, const Quantize& params, uint8_t* output) {
1928 #ifdef DEBUG
1929 #ifdef DEBUG_METAGEMM_VERBOSE
1930 std::cout << __FILE__ << "(" << __LINE__
1931 << ") Quantize<float, uint8_t, Quantize, 16, 3>::Transform()"
1932 << std::endl
1933 << std::flush;
1934 #endif
1935 #endif
1936 int params_count_copy = params.count;
1937 asm volatile(
1938
1939 // Quantize::Prepare
1940 "dup v4.4s, %w[range_min]\n"
1941 "dup v5.4s, %w[range_offset]\n"
1942 "dup v6.4s, %w[range_scale]\n"
1943
1944 // Reduce count by leftovers.
1945 "subs %x[count], %x[count], #3\n"
1946 "beq 2f\n"
1947
1948 "1:"
1949 "subs %x[count], %x[count], #16\n"
1950
1951 // Quantize::Transform
1952 "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
1953 "prfm pldl1keep, [%x[input], #64]\n"
1954 "fsub v0.4s, v0.4s, v4.4s\n"
1955 "fsub v1.4s, v1.4s, v4.4s\n"
1956 "fsub v2.4s, v2.4s, v4.4s\n"
1957 "fsub v3.4s, v3.4s, v4.4s\n"
1958 "fmul v0.4s, v0.4s, v6.4s\n"
1959 "fmul v1.4s, v1.4s, v6.4s\n"
1960 "fmul v2.4s, v2.4s, v6.4s\n"
1961 "fmul v3.4s, v3.4s, v6.4s\n"
1962 "fadd v0.4s, v0.4s, v5.4s\n"
1963 "fadd v1.4s, v1.4s, v5.4s\n"
1964 "fadd v2.4s, v2.4s, v5.4s\n"
1965 "fadd v3.4s, v3.4s, v5.4s\n"
1966 "fcvtzs v0.4s, v0.4s\n"
1967 "fcvtzs v1.4s, v1.4s\n"
1968 "fcvtzs v2.4s, v2.4s\n"
1969 "fcvtzs v3.4s, v3.4s\n"
1970 "sqxtn v0.4h, v0.4s\n"
1971 "sqxtn2 v0.8h, v1.4s\n"
1972 "sqxtn v2.4h, v2.4s\n"
1973 "sqxtn2 v2.8h, v3.4s\n"
1974 "sqxtun v0.8b, v0.8h\n"
1975 "sqxtun2 v0.16b, v2.8h\n"
1976
1977 "st1 {v0.4s}, [%x[output]], #16\n"
1978 "prfm pldl1keep, [%x[output]]\n"
1979
1980 "bne 1b\n"
1981 "2:"
1982
1983 // Handle leftovers.
1984
1985 // Quantize::Transform
1986 "ld1 {v0.2s}, [%x[input]], #8\n"
1987 "ld1 {v0.s}[2], [%x[input]], #4\n"
1988 "prfm pldl1keep, [%x[input], #64]\n"
1989 "fsub v0.4s, v0.4s, v4.4s\n"
1990 "fmul v0.4s, v0.4s, v6.4s\n"
1991 "fadd v0.4s, v0.4s, v5.4s\n"
1992 "fcvtzs v0.4s, v0.4s\n"
1993 "sqxtn v0.4h, v0.4s\n"
1994 "sqxtun v0.8b, v0.8h\n"
1995
1996 "st1 {v0.h}[0], [%x[output]], #2\n"
1997 "st1 {v0.b}[2], [%x[output]], #1\n"
1998 "prfm pldl1keep, [%x[output]]\n"
1999 : [count] "+r"(params_count_copy), [input] "+r"(input),
2000 [output] "+r"(output)
2001 : [range_offset] "r"(params.range_offset),
2002 [range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
2003 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
2004 }
2005
2006 template <>
Transform(const float * input,const Quantize & params,uint8_t * output)2007 inline void Transform1DKernel<float, uint8_t, Quantize, 16, 4>::Transform(
2008 const float* input, const Quantize& params, uint8_t* output) {
2009 #ifdef DEBUG
2010 #ifdef DEBUG_METAGEMM_VERBOSE
2011 std::cout << __FILE__ << "(" << __LINE__
2012 << ") Quantize<float, uint8_t, Quantize, 16, 4>::Transform()"
2013 << std::endl
2014 << std::flush;
2015 #endif
2016 #endif
2017 int params_count_copy = params.count;
2018 asm volatile(
2019
2020 // Quantize::Prepare
2021 "dup v4.4s, %w[range_min]\n"
2022 "dup v5.4s, %w[range_offset]\n"
2023 "dup v6.4s, %w[range_scale]\n"
2024
2025 // Reduce count by leftovers.
2026 "subs %x[count], %x[count], #4\n"
2027 "beq 2f\n"
2028
2029 "1:"
2030 "subs %x[count], %x[count], #16\n"
2031
2032 // Quantize::Transform
2033 "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
2034 "prfm pldl1keep, [%x[input], #64]\n"
2035 "fsub v0.4s, v0.4s, v4.4s\n"
2036 "fsub v1.4s, v1.4s, v4.4s\n"
2037 "fsub v2.4s, v2.4s, v4.4s\n"
2038 "fsub v3.4s, v3.4s, v4.4s\n"
2039 "fmul v0.4s, v0.4s, v6.4s\n"
2040 "fmul v1.4s, v1.4s, v6.4s\n"
2041 "fmul v2.4s, v2.4s, v6.4s\n"
2042 "fmul v3.4s, v3.4s, v6.4s\n"
2043 "fadd v0.4s, v0.4s, v5.4s\n"
2044 "fadd v1.4s, v1.4s, v5.4s\n"
2045 "fadd v2.4s, v2.4s, v5.4s\n"
2046 "fadd v3.4s, v3.4s, v5.4s\n"
2047 "fcvtzs v0.4s, v0.4s\n"
2048 "fcvtzs v1.4s, v1.4s\n"
2049 "fcvtzs v2.4s, v2.4s\n"
2050 "fcvtzs v3.4s, v3.4s\n"
2051 "sqxtn v0.4h, v0.4s\n"
2052 "sqxtn2 v0.8h, v1.4s\n"
2053 "sqxtn v2.4h, v2.4s\n"
2054 "sqxtn2 v2.8h, v3.4s\n"
2055 "sqxtun v0.8b, v0.8h\n"
2056 "sqxtun2 v0.16b, v2.8h\n"
2057
2058 "st1 {v0.4s}, [%x[output]], #16\n"
2059 "prfm pldl1keep, [%x[output]]\n"
2060
2061 "bne 1b\n"
2062 "2:"
2063
2064 // Handle leftovers.
2065
2066 // Quantize::Transform
2067 "ld1 {v0.4s}, [%x[input]], #16\n"
2068 "prfm pldl1keep, [%x[input], #64]\n"
2069 "fsub v0.4s, v0.4s, v4.4s\n"
2070 "fmul v0.4s, v0.4s, v6.4s\n"
2071 "fadd v0.4s, v0.4s, v5.4s\n"
2072 "fcvtzs v0.4s, v0.4s\n"
2073 "sqxtn v0.4h, v0.4s\n"
2074 "sqxtun v0.8b, v0.8h\n"
2075
2076 "st1 {v0.s}[0], [%x[output]], #4\n"
2077 "prfm pldl1keep, [%x[output]]\n"
2078 : [count] "+r"(params_count_copy), [input] "+r"(input),
2079 [output] "+r"(output)
2080 : [range_offset] "r"(params.range_offset),
2081 [range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
2082 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
2083 }
2084
2085 template <>
Transform(const float * input,const Quantize & params,uint8_t * output)2086 inline void Transform1DKernel<float, uint8_t, Quantize, 16, 5>::Transform(
2087 const float* input, const Quantize& params, uint8_t* output) {
2088 #ifdef DEBUG
2089 #ifdef DEBUG_METAGEMM_VERBOSE
2090 std::cout << __FILE__ << "(" << __LINE__
2091 << ") Quantize<float, uint8_t, Quantize, 16, 5>::Transform()"
2092 << std::endl
2093 << std::flush;
2094 #endif
2095 #endif
2096 int params_count_copy = params.count;
2097 asm volatile(
2098
2099 // Quantize::Prepare
2100 "dup v4.4s, %w[range_min]\n"
2101 "dup v5.4s, %w[range_offset]\n"
2102 "dup v6.4s, %w[range_scale]\n"
2103
2104 // Reduce count by leftovers.
2105 "subs %x[count], %x[count], #5\n"
2106 "beq 2f\n"
2107
2108 "1:"
2109 "subs %x[count], %x[count], #16\n"
2110
2111 // Quantize::Transform
2112 "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
2113 "prfm pldl1keep, [%x[input], #64]\n"
2114 "fsub v0.4s, v0.4s, v4.4s\n"
2115 "fsub v1.4s, v1.4s, v4.4s\n"
2116 "fsub v2.4s, v2.4s, v4.4s\n"
2117 "fsub v3.4s, v3.4s, v4.4s\n"
2118 "fmul v0.4s, v0.4s, v6.4s\n"
2119 "fmul v1.4s, v1.4s, v6.4s\n"
2120 "fmul v2.4s, v2.4s, v6.4s\n"
2121 "fmul v3.4s, v3.4s, v6.4s\n"
2122 "fadd v0.4s, v0.4s, v5.4s\n"
2123 "fadd v1.4s, v1.4s, v5.4s\n"
2124 "fadd v2.4s, v2.4s, v5.4s\n"
2125 "fadd v3.4s, v3.4s, v5.4s\n"
2126 "fcvtzs v0.4s, v0.4s\n"
2127 "fcvtzs v1.4s, v1.4s\n"
2128 "fcvtzs v2.4s, v2.4s\n"
2129 "fcvtzs v3.4s, v3.4s\n"
2130 "sqxtn v0.4h, v0.4s\n"
2131 "sqxtn2 v0.8h, v1.4s\n"
2132 "sqxtn v2.4h, v2.4s\n"
2133 "sqxtn2 v2.8h, v3.4s\n"
2134 "sqxtun v0.8b, v0.8h\n"
2135 "sqxtun2 v0.16b, v2.8h\n"
2136
2137 "st1 {v0.4s}, [%x[output]], #16\n"
2138 "prfm pldl1keep, [%x[output]]\n"
2139
2140 "bne 1b\n"
2141 "2:"
2142
2143 // Handle leftovers.
2144
2145 // Quantize::Transform
2146 "ld1 {v0.4s}, [%x[input]], #16\n"
2147 "ld1 {v1.s}[0], [%x[input]], #4\n"
2148 "prfm pldl1keep, [%x[input], #64]\n"
2149 "fsub v0.4s, v0.4s, v4.4s\n"
2150 "fsub v1.4s, v1.4s, v4.4s\n"
2151 "fmul v0.4s, v0.4s, v6.4s\n"
2152 "fmul v1.4s, v1.4s, v6.4s\n"
2153 "fadd v0.4s, v0.4s, v5.4s\n"
2154 "fadd v1.4s, v1.4s, v5.4s\n"
2155 "fcvtzs v0.4s, v0.4s\n"
2156 "fcvtzs v1.4s, v1.4s\n"
2157 "sqxtn v0.4h, v0.4s\n"
2158 "sqxtn2 v0.8h, v1.4s\n"
2159 "sqxtun v0.8b, v0.8h\n"
2160
2161 "st1 {v0.s}[0], [%x[output]], #4\n"
2162 "st1 {v0.b}[4], [%x[output]], #1\n"
2163 "prfm pldl1keep, [%x[output]]\n"
2164 : [count] "+r"(params_count_copy), [input] "+r"(input),
2165 [output] "+r"(output)
2166 : [range_offset] "r"(params.range_offset),
2167 [range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
2168 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
2169 }
2170
2171 template <>
Transform(const float * input,const Quantize & params,uint8_t * output)2172 inline void Transform1DKernel<float, uint8_t, Quantize, 16, 6>::Transform(
2173 const float* input, const Quantize& params, uint8_t* output) {
2174 #ifdef DEBUG
2175 #ifdef DEBUG_METAGEMM_VERBOSE
2176 std::cout << __FILE__ << "(" << __LINE__
2177 << ") Quantize<float, uint8_t, Quantize, 16, 6>::Transform()"
2178 << std::endl
2179 << std::flush;
2180 #endif
2181 #endif
2182 int params_count_copy = params.count;
2183 asm volatile(
2184
2185 // Quantize::Prepare
2186 "dup v4.4s, %w[range_min]\n"
2187 "dup v5.4s, %w[range_offset]\n"
2188 "dup v6.4s, %w[range_scale]\n"
2189
2190 // Reduce count by leftovers.
2191 "subs %x[count], %x[count], #6\n"
2192 "beq 2f\n"
2193
2194 "1:"
2195 "subs %x[count], %x[count], #16\n"
2196
2197 // Quantize::Transform
2198 "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
2199 "prfm pldl1keep, [%x[input], #64]\n"
2200 "fsub v0.4s, v0.4s, v4.4s\n"
2201 "fsub v1.4s, v1.4s, v4.4s\n"
2202 "fsub v2.4s, v2.4s, v4.4s\n"
2203 "fsub v3.4s, v3.4s, v4.4s\n"
2204 "fmul v0.4s, v0.4s, v6.4s\n"
2205 "fmul v1.4s, v1.4s, v6.4s\n"
2206 "fmul v2.4s, v2.4s, v6.4s\n"
2207 "fmul v3.4s, v3.4s, v6.4s\n"
2208 "fadd v0.4s, v0.4s, v5.4s\n"
2209 "fadd v1.4s, v1.4s, v5.4s\n"
2210 "fadd v2.4s, v2.4s, v5.4s\n"
2211 "fadd v3.4s, v3.4s, v5.4s\n"
2212 "fcvtzs v0.4s, v0.4s\n"
2213 "fcvtzs v1.4s, v1.4s\n"
2214 "fcvtzs v2.4s, v2.4s\n"
2215 "fcvtzs v3.4s, v3.4s\n"
2216 "sqxtn v0.4h, v0.4s\n"
2217 "sqxtn2 v0.8h, v1.4s\n"
2218 "sqxtn v2.4h, v2.4s\n"
2219 "sqxtn2 v2.8h, v3.4s\n"
2220 "sqxtun v0.8b, v0.8h\n"
2221 "sqxtun2 v0.16b, v2.8h\n"
2222
2223 "st1 {v0.4s}, [%x[output]], #16\n"
2224 "prfm pldl1keep, [%x[output]]\n"
2225
2226 "bne 1b\n"
2227 "2:"
2228
2229 // Handle leftovers.
2230
2231 // Quantize::Transform
2232 "ld1 {v0.4s}, [%x[input]], #16\n"
2233 "ld1 {v1.2s}, [%x[input]], #8\n"
2234 "prfm pldl1keep, [%x[input], #64]\n"
2235 "fsub v0.4s, v0.4s, v4.4s\n"
2236 "fsub v1.4s, v1.4s, v4.4s\n"
2237 "fmul v0.4s, v0.4s, v6.4s\n"
2238 "fmul v1.4s, v1.4s, v6.4s\n"
2239 "fadd v0.4s, v0.4s, v5.4s\n"
2240 "fadd v1.4s, v1.4s, v5.4s\n"
2241 "fcvtzs v0.4s, v0.4s\n"
2242 "fcvtzs v1.4s, v1.4s\n"
2243 "sqxtn v0.4h, v0.4s\n"
2244 "sqxtn2 v0.8h, v1.4s\n"
2245 "sqxtun v0.8b, v0.8h\n"
2246
2247 "st1 {v0.s}[0], [%x[output]], #4\n"
2248 "st1 {v0.h}[2], [%x[output]], #2\n"
2249 "prfm pldl1keep, [%x[output]]\n"
2250 : [count] "+r"(params_count_copy), [input] "+r"(input),
2251 [output] "+r"(output)
2252 : [range_offset] "r"(params.range_offset),
2253 [range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
2254 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
2255 }
2256
2257 template <>
Transform(const float * input,const Quantize & params,uint8_t * output)2258 inline void Transform1DKernel<float, uint8_t, Quantize, 16, 7>::Transform(
2259 const float* input, const Quantize& params, uint8_t* output) {
2260 #ifdef DEBUG
2261 #ifdef DEBUG_METAGEMM_VERBOSE
2262 std::cout << __FILE__ << "(" << __LINE__
2263 << ") Quantize<float, uint8_t, Quantize, 16, 7>::Transform()"
2264 << std::endl
2265 << std::flush;
2266 #endif
2267 #endif
2268 int params_count_copy = params.count;
2269 asm volatile(
2270
2271 // Quantize::Prepare
2272 "dup v4.4s, %w[range_min]\n"
2273 "dup v5.4s, %w[range_offset]\n"
2274 "dup v6.4s, %w[range_scale]\n"
2275
2276 // Reduce count by leftovers.
2277 "subs %x[count], %x[count], #7\n"
2278 "beq 2f\n"
2279
2280 "1:"
2281 "subs %x[count], %x[count], #16\n"
2282
2283 // Quantize::Transform
2284 "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
2285 "prfm pldl1keep, [%x[input], #64]\n"
2286 "fsub v0.4s, v0.4s, v4.4s\n"
2287 "fsub v1.4s, v1.4s, v4.4s\n"
2288 "fsub v2.4s, v2.4s, v4.4s\n"
2289 "fsub v3.4s, v3.4s, v4.4s\n"
2290 "fmul v0.4s, v0.4s, v6.4s\n"
2291 "fmul v1.4s, v1.4s, v6.4s\n"
2292 "fmul v2.4s, v2.4s, v6.4s\n"
2293 "fmul v3.4s, v3.4s, v6.4s\n"
2294 "fadd v0.4s, v0.4s, v5.4s\n"
2295 "fadd v1.4s, v1.4s, v5.4s\n"
2296 "fadd v2.4s, v2.4s, v5.4s\n"
2297 "fadd v3.4s, v3.4s, v5.4s\n"
2298 "fcvtzs v0.4s, v0.4s\n"
2299 "fcvtzs v1.4s, v1.4s\n"
2300 "fcvtzs v2.4s, v2.4s\n"
2301 "fcvtzs v3.4s, v3.4s\n"
2302 "sqxtn v0.4h, v0.4s\n"
2303 "sqxtn2 v0.8h, v1.4s\n"
2304 "sqxtn v2.4h, v2.4s\n"
2305 "sqxtn2 v2.8h, v3.4s\n"
2306 "sqxtun v0.8b, v0.8h\n"
2307 "sqxtun2 v0.16b, v2.8h\n"
2308
2309 "st1 {v0.4s}, [%x[output]], #16\n"
2310 "prfm pldl1keep, [%x[output]]\n"
2311
2312 "bne 1b\n"
2313 "2:"
2314
2315 // Handle leftovers.
2316
2317 // Quantize::Transform
2318 "ld1 {v0.4s}, [%x[input]], #16\n"
2319 "ld1 {v1.2s}, [%x[input]], #8\n"
2320 "ld1 {v1.s}[2], [%x[input]], #4\n"
2321 "prfm pldl1keep, [%x[input], #64]\n"
2322 "fsub v0.4s, v0.4s, v4.4s\n"
2323 "fsub v1.4s, v1.4s, v4.4s\n"
2324 "fmul v0.4s, v0.4s, v6.4s\n"
2325 "fmul v1.4s, v1.4s, v6.4s\n"
2326 "fadd v0.4s, v0.4s, v5.4s\n"
2327 "fadd v1.4s, v1.4s, v5.4s\n"
2328 "fcvtzs v0.4s, v0.4s\n"
2329 "fcvtzs v1.4s, v1.4s\n"
2330 "sqxtn v0.4h, v0.4s\n"
2331 "sqxtn2 v0.8h, v1.4s\n"
2332 "sqxtun v0.8b, v0.8h\n"
2333
2334 "st1 {v0.s}[0], [%x[output]], #4\n"
2335 "st1 {v0.h}[2], [%x[output]], #2\n"
2336 "st1 {v0.b}[6], [%x[output]], #1\n"
2337 "prfm pldl1keep, [%x[output]]\n"
2338 : [count] "+r"(params_count_copy), [input] "+r"(input),
2339 [output] "+r"(output)
2340 : [range_offset] "r"(params.range_offset),
2341 [range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
2342 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
2343 }
2344
2345 template <>
Transform(const float * input,const Quantize & params,uint8_t * output)2346 inline void Transform1DKernel<float, uint8_t, Quantize, 16, 8>::Transform(
2347 const float* input, const Quantize& params, uint8_t* output) {
2348 #ifdef DEBUG
2349 #ifdef DEBUG_METAGEMM_VERBOSE
2350 std::cout << __FILE__ << "(" << __LINE__
2351 << ") Quantize<float, uint8_t, Quantize, 16, 8>::Transform()"
2352 << std::endl
2353 << std::flush;
2354 #endif
2355 #endif
2356 int params_count_copy = params.count;
2357 asm volatile(
2358
2359 // Quantize::Prepare
2360 "dup v4.4s, %w[range_min]\n"
2361 "dup v5.4s, %w[range_offset]\n"
2362 "dup v6.4s, %w[range_scale]\n"
2363
2364 // Reduce count by leftovers.
2365 "subs %x[count], %x[count], #8\n"
2366 "beq 2f\n"
2367
2368 "1:"
2369 "subs %x[count], %x[count], #16\n"
2370
2371 // Quantize::Transform
2372 "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
2373 "prfm pldl1keep, [%x[input], #64]\n"
2374 "fsub v0.4s, v0.4s, v4.4s\n"
2375 "fsub v1.4s, v1.4s, v4.4s\n"
2376 "fsub v2.4s, v2.4s, v4.4s\n"
2377 "fsub v3.4s, v3.4s, v4.4s\n"
2378 "fmul v0.4s, v0.4s, v6.4s\n"
2379 "fmul v1.4s, v1.4s, v6.4s\n"
2380 "fmul v2.4s, v2.4s, v6.4s\n"
2381 "fmul v3.4s, v3.4s, v6.4s\n"
2382 "fadd v0.4s, v0.4s, v5.4s\n"
2383 "fadd v1.4s, v1.4s, v5.4s\n"
2384 "fadd v2.4s, v2.4s, v5.4s\n"
2385 "fadd v3.4s, v3.4s, v5.4s\n"
2386 "fcvtzs v0.4s, v0.4s\n"
2387 "fcvtzs v1.4s, v1.4s\n"
2388 "fcvtzs v2.4s, v2.4s\n"
2389 "fcvtzs v3.4s, v3.4s\n"
2390 "sqxtn v0.4h, v0.4s\n"
2391 "sqxtn2 v0.8h, v1.4s\n"
2392 "sqxtn v2.4h, v2.4s\n"
2393 "sqxtn2 v2.8h, v3.4s\n"
2394 "sqxtun v0.8b, v0.8h\n"
2395 "sqxtun2 v0.16b, v2.8h\n"
2396
2397 "st1 {v0.4s}, [%x[output]], #16\n"
2398 "prfm pldl1keep, [%x[output]]\n"
2399
2400 "bne 1b\n"
2401 "2:"
2402
2403 // Handle leftovers.
2404
2405 // Quantize::Transform
2406 "ld1 {v0.4s, v1.4s}, [%x[input]], #32\n"
2407 "prfm pldl1keep, [%x[input], #64]\n"
2408 "fsub v0.4s, v0.4s, v4.4s\n"
2409 "fsub v1.4s, v1.4s, v4.4s\n"
2410 "fmul v0.4s, v0.4s, v6.4s\n"
2411 "fmul v1.4s, v1.4s, v6.4s\n"
2412 "fadd v0.4s, v0.4s, v5.4s\n"
2413 "fadd v1.4s, v1.4s, v5.4s\n"
2414 "fcvtzs v0.4s, v0.4s\n"
2415 "fcvtzs v1.4s, v1.4s\n"
2416 "sqxtn v0.4h, v0.4s\n"
2417 "sqxtn2 v0.8h, v1.4s\n"
2418 "sqxtun v0.8b, v0.8h\n"
2419
2420 "st1 {v0.2s}, [%x[output]], #8\n"
2421 "prfm pldl1keep, [%x[output]]\n"
2422 : [count] "+r"(params_count_copy), [input] "+r"(input),
2423 [output] "+r"(output)
2424 : [range_offset] "r"(params.range_offset),
2425 [range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
2426 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
2427 }
2428
2429 template <>
Transform(const float * input,const Quantize & params,uint8_t * output)2430 inline void Transform1DKernel<float, uint8_t, Quantize, 16, 9>::Transform(
2431 const float* input, const Quantize& params, uint8_t* output) {
2432 #ifdef DEBUG
2433 #ifdef DEBUG_METAGEMM_VERBOSE
2434 std::cout << __FILE__ << "(" << __LINE__
2435 << ") Quantize<float, uint8_t, Quantize, 16, 9>::Transform()"
2436 << std::endl
2437 << std::flush;
2438 #endif
2439 #endif
2440 int params_count_copy = params.count;
2441 asm volatile(
2442
2443 // Quantize::Prepare
2444 "dup v4.4s, %w[range_min]\n"
2445 "dup v5.4s, %w[range_offset]\n"
2446 "dup v6.4s, %w[range_scale]\n"
2447
2448 // Reduce count by leftovers.
2449 "subs %x[count], %x[count], #9\n"
2450 "beq 2f\n"
2451
2452 "1:"
2453 "subs %x[count], %x[count], #16\n"
2454
2455 // Quantize::Transform
2456 "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
2457 "prfm pldl1keep, [%x[input], #64]\n"
2458 "fsub v0.4s, v0.4s, v4.4s\n"
2459 "fsub v1.4s, v1.4s, v4.4s\n"
2460 "fsub v2.4s, v2.4s, v4.4s\n"
2461 "fsub v3.4s, v3.4s, v4.4s\n"
2462 "fmul v0.4s, v0.4s, v6.4s\n"
2463 "fmul v1.4s, v1.4s, v6.4s\n"
2464 "fmul v2.4s, v2.4s, v6.4s\n"
2465 "fmul v3.4s, v3.4s, v6.4s\n"
2466 "fadd v0.4s, v0.4s, v5.4s\n"
2467 "fadd v1.4s, v1.4s, v5.4s\n"
2468 "fadd v2.4s, v2.4s, v5.4s\n"
2469 "fadd v3.4s, v3.4s, v5.4s\n"
2470 "fcvtzs v0.4s, v0.4s\n"
2471 "fcvtzs v1.4s, v1.4s\n"
2472 "fcvtzs v2.4s, v2.4s\n"
2473 "fcvtzs v3.4s, v3.4s\n"
2474 "sqxtn v0.4h, v0.4s\n"
2475 "sqxtn2 v0.8h, v1.4s\n"
2476 "sqxtn v2.4h, v2.4s\n"
2477 "sqxtn2 v2.8h, v3.4s\n"
2478 "sqxtun v0.8b, v0.8h\n"
2479 "sqxtun2 v0.16b, v2.8h\n"
2480
2481 "st1 {v0.4s}, [%x[output]], #16\n"
2482 "prfm pldl1keep, [%x[output]]\n"
2483
2484 "bne 1b\n"
2485 "2:"
2486
2487 // Handle leftovers.
2488
2489 // Quantize::Transform
2490 "ld1 {v0.4s, v1.4s}, [%x[input]], #32\n"
2491 "ld1 {v2.s}[0], [%x[input]], #4\n"
2492 "prfm pldl1keep, [%x[input], #64]\n"
2493 "fsub v0.4s, v0.4s, v4.4s\n"
2494 "fsub v1.4s, v1.4s, v4.4s\n"
2495 "fsub v2.4s, v2.4s, v4.4s\n"
2496 "fmul v0.4s, v0.4s, v6.4s\n"
2497 "fmul v1.4s, v1.4s, v6.4s\n"
2498 "fmul v2.4s, v2.4s, v6.4s\n"
2499 "fadd v0.4s, v0.4s, v5.4s\n"
2500 "fadd v1.4s, v1.4s, v5.4s\n"
2501 "fadd v2.4s, v2.4s, v5.4s\n"
2502 "fcvtzs v0.4s, v0.4s\n"
2503 "fcvtzs v1.4s, v1.4s\n"
2504 "fcvtzs v2.4s, v2.4s\n"
2505 "sqxtn v0.4h, v0.4s\n"
2506 "sqxtn2 v0.8h, v1.4s\n"
2507 "sqxtn v2.4h, v2.4s\n"
2508 "sqxtun v0.8b, v0.8h\n"
2509 "sqxtun2 v0.16b, v2.8h\n"
2510
2511 "st1 {v0.2s}, [%x[output]], #8\n"
2512 "st1 {v0.b}[8], [%x[output]], #1\n"
2513 "prfm pldl1keep, [%x[output]]\n"
2514 : [count] "+r"(params_count_copy), [input] "+r"(input),
2515 [output] "+r"(output)
2516 : [range_offset] "r"(params.range_offset),
2517 [range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
2518 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
2519 }
2520
2521 template <>
Transform(const float * input,const Quantize & params,uint8_t * output)2522 inline void Transform1DKernel<float, uint8_t, Quantize, 16, 10>::Transform(
2523 const float* input, const Quantize& params, uint8_t* output) {
2524 #ifdef DEBUG
2525 #ifdef DEBUG_METAGEMM_VERBOSE
2526 std::cout << __FILE__ << "(" << __LINE__
2527 << ") Quantize<float, uint8_t, Quantize, 16, 10>::Transform()"
2528 << std::endl
2529 << std::flush;
2530 #endif
2531 #endif
2532 int params_count_copy = params.count;
2533 asm volatile(
2534
2535 // Quantize::Prepare
2536 "dup v4.4s, %w[range_min]\n"
2537 "dup v5.4s, %w[range_offset]\n"
2538 "dup v6.4s, %w[range_scale]\n"
2539
2540 // Reduce count by leftovers.
2541 "subs %x[count], %x[count], #10\n"
2542 "beq 2f\n"
2543
2544 "1:"
2545 "subs %x[count], %x[count], #16\n"
2546
2547 // Quantize::Transform
2548 "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
2549 "prfm pldl1keep, [%x[input], #64]\n"
2550 "fsub v0.4s, v0.4s, v4.4s\n"
2551 "fsub v1.4s, v1.4s, v4.4s\n"
2552 "fsub v2.4s, v2.4s, v4.4s\n"
2553 "fsub v3.4s, v3.4s, v4.4s\n"
2554 "fmul v0.4s, v0.4s, v6.4s\n"
2555 "fmul v1.4s, v1.4s, v6.4s\n"
2556 "fmul v2.4s, v2.4s, v6.4s\n"
2557 "fmul v3.4s, v3.4s, v6.4s\n"
2558 "fadd v0.4s, v0.4s, v5.4s\n"
2559 "fadd v1.4s, v1.4s, v5.4s\n"
2560 "fadd v2.4s, v2.4s, v5.4s\n"
2561 "fadd v3.4s, v3.4s, v5.4s\n"
2562 "fcvtzs v0.4s, v0.4s\n"
2563 "fcvtzs v1.4s, v1.4s\n"
2564 "fcvtzs v2.4s, v2.4s\n"
2565 "fcvtzs v3.4s, v3.4s\n"
2566 "sqxtn v0.4h, v0.4s\n"
2567 "sqxtn2 v0.8h, v1.4s\n"
2568 "sqxtn v2.4h, v2.4s\n"
2569 "sqxtn2 v2.8h, v3.4s\n"
2570 "sqxtun v0.8b, v0.8h\n"
2571 "sqxtun2 v0.16b, v2.8h\n"
2572
2573 "st1 {v0.4s}, [%x[output]], #16\n"
2574 "prfm pldl1keep, [%x[output]]\n"
2575
2576 "bne 1b\n"
2577 "2:"
2578
2579 // Handle leftovers.
2580
2581 // Quantize::Transform
2582 "ld1 {v0.4s, v1.4s}, [%x[input]], #32\n"
2583 "ld1 {v2.2s}, [%x[input]], #8\n"
2584 "prfm pldl1keep, [%x[input], #64]\n"
2585 "fsub v0.4s, v0.4s, v4.4s\n"
2586 "fsub v1.4s, v1.4s, v4.4s\n"
2587 "fsub v2.4s, v2.4s, v4.4s\n"
2588 "fmul v0.4s, v0.4s, v6.4s\n"
2589 "fmul v1.4s, v1.4s, v6.4s\n"
2590 "fmul v2.4s, v2.4s, v6.4s\n"
2591 "fadd v0.4s, v0.4s, v5.4s\n"
2592 "fadd v1.4s, v1.4s, v5.4s\n"
2593 "fadd v2.4s, v2.4s, v5.4s\n"
2594 "fcvtzs v0.4s, v0.4s\n"
2595 "fcvtzs v1.4s, v1.4s\n"
2596 "fcvtzs v2.4s, v2.4s\n"
2597 "sqxtn v0.4h, v0.4s\n"
2598 "sqxtn2 v0.8h, v1.4s\n"
2599 "sqxtn v2.4h, v2.4s\n"
2600 "sqxtun v0.8b, v0.8h\n"
2601 "sqxtun2 v0.16b, v2.8h\n"
2602
2603 "st1 {v0.2s}, [%x[output]], #8\n"
2604 "st1 {v0.h}[4], [%x[output]], #2\n"
2605 "prfm pldl1keep, [%x[output]]\n"
2606 : [count] "+r"(params_count_copy), [input] "+r"(input),
2607 [output] "+r"(output)
2608 : [range_offset] "r"(params.range_offset),
2609 [range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
2610 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
2611 }
2612
2613 template <>
Transform(const float * input,const Quantize & params,uint8_t * output)2614 inline void Transform1DKernel<float, uint8_t, Quantize, 16, 11>::Transform(
2615 const float* input, const Quantize& params, uint8_t* output) {
2616 #ifdef DEBUG
2617 #ifdef DEBUG_METAGEMM_VERBOSE
2618 std::cout << __FILE__ << "(" << __LINE__
2619 << ") Quantize<float, uint8_t, Quantize, 16, 11>::Transform()"
2620 << std::endl
2621 << std::flush;
2622 #endif
2623 #endif
2624 int params_count_copy = params.count;
2625 asm volatile(
2626
2627 // Quantize::Prepare
2628 "dup v4.4s, %w[range_min]\n"
2629 "dup v5.4s, %w[range_offset]\n"
2630 "dup v6.4s, %w[range_scale]\n"
2631
2632 // Reduce count by leftovers.
2633 "subs %x[count], %x[count], #11\n"
2634 "beq 2f\n"
2635
2636 "1:"
2637 "subs %x[count], %x[count], #16\n"
2638
2639 // Quantize::Transform
2640 "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
2641 "prfm pldl1keep, [%x[input], #64]\n"
2642 "fsub v0.4s, v0.4s, v4.4s\n"
2643 "fsub v1.4s, v1.4s, v4.4s\n"
2644 "fsub v2.4s, v2.4s, v4.4s\n"
2645 "fsub v3.4s, v3.4s, v4.4s\n"
2646 "fmul v0.4s, v0.4s, v6.4s\n"
2647 "fmul v1.4s, v1.4s, v6.4s\n"
2648 "fmul v2.4s, v2.4s, v6.4s\n"
2649 "fmul v3.4s, v3.4s, v6.4s\n"
2650 "fadd v0.4s, v0.4s, v5.4s\n"
2651 "fadd v1.4s, v1.4s, v5.4s\n"
2652 "fadd v2.4s, v2.4s, v5.4s\n"
2653 "fadd v3.4s, v3.4s, v5.4s\n"
2654 "fcvtzs v0.4s, v0.4s\n"
2655 "fcvtzs v1.4s, v1.4s\n"
2656 "fcvtzs v2.4s, v2.4s\n"
2657 "fcvtzs v3.4s, v3.4s\n"
2658 "sqxtn v0.4h, v0.4s\n"
2659 "sqxtn2 v0.8h, v1.4s\n"
2660 "sqxtn v2.4h, v2.4s\n"
2661 "sqxtn2 v2.8h, v3.4s\n"
2662 "sqxtun v0.8b, v0.8h\n"
2663 "sqxtun2 v0.16b, v2.8h\n"
2664
2665 "st1 {v0.4s}, [%x[output]], #16\n"
2666 "prfm pldl1keep, [%x[output]]\n"
2667
2668 "bne 1b\n"
2669 "2:"
2670
2671 // Handle leftovers.
2672
2673 // Quantize::Transform
2674 "ld1 {v0.4s, v1.4s}, [%x[input]], #32\n"
2675 "ld1 {v2.2s}, [%x[input]], #8\n"
2676 "ld1 {v2.s}[2], [%x[input]], #4\n"
2677 "prfm pldl1keep, [%x[input], #64]\n"
2678 "fsub v0.4s, v0.4s, v4.4s\n"
2679 "fsub v1.4s, v1.4s, v4.4s\n"
2680 "fsub v2.4s, v2.4s, v4.4s\n"
2681 "fmul v0.4s, v0.4s, v6.4s\n"
2682 "fmul v1.4s, v1.4s, v6.4s\n"
2683 "fmul v2.4s, v2.4s, v6.4s\n"
2684 "fadd v0.4s, v0.4s, v5.4s\n"
2685 "fadd v1.4s, v1.4s, v5.4s\n"
2686 "fadd v2.4s, v2.4s, v5.4s\n"
2687 "fcvtzs v0.4s, v0.4s\n"
2688 "fcvtzs v1.4s, v1.4s\n"
2689 "fcvtzs v2.4s, v2.4s\n"
2690 "sqxtn v0.4h, v0.4s\n"
2691 "sqxtn2 v0.8h, v1.4s\n"
2692 "sqxtn v2.4h, v2.4s\n"
2693 "sqxtun v0.8b, v0.8h\n"
2694 "sqxtun2 v0.16b, v2.8h\n"
2695
2696 "st1 {v0.2s}, [%x[output]], #8\n"
2697 "st1 {v0.h}[4], [%x[output]], #2\n"
2698 "st1 {v0.b}[10], [%x[output]], #1\n"
2699 "prfm pldl1keep, [%x[output]]\n"
2700 : [count] "+r"(params_count_copy), [input] "+r"(input),
2701 [output] "+r"(output)
2702 : [range_offset] "r"(params.range_offset),
2703 [range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
2704 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
2705 }
2706
2707 template <>
Transform(const float * input,const Quantize & params,uint8_t * output)2708 inline void Transform1DKernel<float, uint8_t, Quantize, 16, 12>::Transform(
2709 const float* input, const Quantize& params, uint8_t* output) {
2710 #ifdef DEBUG
2711 #ifdef DEBUG_METAGEMM_VERBOSE
2712 std::cout << __FILE__ << "(" << __LINE__
2713 << ") Quantize<float, uint8_t, Quantize, 16, 12>::Transform()"
2714 << std::endl
2715 << std::flush;
2716 #endif
2717 #endif
2718 int params_count_copy = params.count;
2719 asm volatile(
2720
2721 // Quantize::Prepare
2722 "dup v4.4s, %w[range_min]\n"
2723 "dup v5.4s, %w[range_offset]\n"
2724 "dup v6.4s, %w[range_scale]\n"
2725
2726 // Reduce count by leftovers.
2727 "subs %x[count], %x[count], #12\n"
2728 "beq 2f\n"
2729
2730 "1:"
2731 "subs %x[count], %x[count], #16\n"
2732
2733 // Quantize::Transform
2734 "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
2735 "prfm pldl1keep, [%x[input], #64]\n"
2736 "fsub v0.4s, v0.4s, v4.4s\n"
2737 "fsub v1.4s, v1.4s, v4.4s\n"
2738 "fsub v2.4s, v2.4s, v4.4s\n"
2739 "fsub v3.4s, v3.4s, v4.4s\n"
2740 "fmul v0.4s, v0.4s, v6.4s\n"
2741 "fmul v1.4s, v1.4s, v6.4s\n"
2742 "fmul v2.4s, v2.4s, v6.4s\n"
2743 "fmul v3.4s, v3.4s, v6.4s\n"
2744 "fadd v0.4s, v0.4s, v5.4s\n"
2745 "fadd v1.4s, v1.4s, v5.4s\n"
2746 "fadd v2.4s, v2.4s, v5.4s\n"
2747 "fadd v3.4s, v3.4s, v5.4s\n"
2748 "fcvtzs v0.4s, v0.4s\n"
2749 "fcvtzs v1.4s, v1.4s\n"
2750 "fcvtzs v2.4s, v2.4s\n"
2751 "fcvtzs v3.4s, v3.4s\n"
2752 "sqxtn v0.4h, v0.4s\n"
2753 "sqxtn2 v0.8h, v1.4s\n"
2754 "sqxtn v2.4h, v2.4s\n"
2755 "sqxtn2 v2.8h, v3.4s\n"
2756 "sqxtun v0.8b, v0.8h\n"
2757 "sqxtun2 v0.16b, v2.8h\n"
2758
2759 "st1 {v0.4s}, [%x[output]], #16\n"
2760 "prfm pldl1keep, [%x[output]]\n"
2761
2762 "bne 1b\n"
2763 "2:"
2764
2765 // Handle leftovers.
2766
2767 // Quantize::Transform
2768 "ld1 {v0.4s, v1.4s, v2.4s}, [%x[input]], #48\n"
2769 "prfm pldl1keep, [%x[input], #64]\n"
2770 "fsub v0.4s, v0.4s, v4.4s\n"
2771 "fsub v1.4s, v1.4s, v4.4s\n"
2772 "fsub v2.4s, v2.4s, v4.4s\n"
2773 "fmul v0.4s, v0.4s, v6.4s\n"
2774 "fmul v1.4s, v1.4s, v6.4s\n"
2775 "fmul v2.4s, v2.4s, v6.4s\n"
2776 "fadd v0.4s, v0.4s, v5.4s\n"
2777 "fadd v1.4s, v1.4s, v5.4s\n"
2778 "fadd v2.4s, v2.4s, v5.4s\n"
2779 "fcvtzs v0.4s, v0.4s\n"
2780 "fcvtzs v1.4s, v1.4s\n"
2781 "fcvtzs v2.4s, v2.4s\n"
2782 "sqxtn v0.4h, v0.4s\n"
2783 "sqxtn2 v0.8h, v1.4s\n"
2784 "sqxtn v2.4h, v2.4s\n"
2785 "sqxtun v0.8b, v0.8h\n"
2786 "sqxtun2 v0.16b, v2.8h\n"
2787
2788 "st1 {v0.2s}, [%x[output]], #8\n"
2789 "st1 {v0.s}[2], [%x[output]], #4\n"
2790 "prfm pldl1keep, [%x[output]]\n"
2791 : [count] "+r"(params_count_copy), [input] "+r"(input),
2792 [output] "+r"(output)
2793 : [range_offset] "r"(params.range_offset),
2794 [range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
2795 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
2796 }
2797
2798 template <>
Transform(const float * input,const Quantize & params,uint8_t * output)2799 inline void Transform1DKernel<float, uint8_t, Quantize, 16, 13>::Transform(
2800 const float* input, const Quantize& params, uint8_t* output) {
2801 #ifdef DEBUG
2802 #ifdef DEBUG_METAGEMM_VERBOSE
2803 std::cout << __FILE__ << "(" << __LINE__
2804 << ") Quantize<float, uint8_t, Quantize, 16, 13>::Transform()"
2805 << std::endl
2806 << std::flush;
2807 #endif
2808 #endif
2809 int params_count_copy = params.count;
2810 asm volatile(
2811
2812 // Quantize::Prepare
2813 "dup v4.4s, %w[range_min]\n"
2814 "dup v5.4s, %w[range_offset]\n"
2815 "dup v6.4s, %w[range_scale]\n"
2816
2817 // Reduce count by leftovers.
2818 "subs %x[count], %x[count], #13\n"
2819 "beq 2f\n"
2820
2821 "1:"
2822 "subs %x[count], %x[count], #16\n"
2823
2824 // Quantize::Transform
2825 "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
2826 "prfm pldl1keep, [%x[input], #64]\n"
2827 "fsub v0.4s, v0.4s, v4.4s\n"
2828 "fsub v1.4s, v1.4s, v4.4s\n"
2829 "fsub v2.4s, v2.4s, v4.4s\n"
2830 "fsub v3.4s, v3.4s, v4.4s\n"
2831 "fmul v0.4s, v0.4s, v6.4s\n"
2832 "fmul v1.4s, v1.4s, v6.4s\n"
2833 "fmul v2.4s, v2.4s, v6.4s\n"
2834 "fmul v3.4s, v3.4s, v6.4s\n"
2835 "fadd v0.4s, v0.4s, v5.4s\n"
2836 "fadd v1.4s, v1.4s, v5.4s\n"
2837 "fadd v2.4s, v2.4s, v5.4s\n"
2838 "fadd v3.4s, v3.4s, v5.4s\n"
2839 "fcvtzs v0.4s, v0.4s\n"
2840 "fcvtzs v1.4s, v1.4s\n"
2841 "fcvtzs v2.4s, v2.4s\n"
2842 "fcvtzs v3.4s, v3.4s\n"
2843 "sqxtn v0.4h, v0.4s\n"
2844 "sqxtn2 v0.8h, v1.4s\n"
2845 "sqxtn v2.4h, v2.4s\n"
2846 "sqxtn2 v2.8h, v3.4s\n"
2847 "sqxtun v0.8b, v0.8h\n"
2848 "sqxtun2 v0.16b, v2.8h\n"
2849
2850 "st1 {v0.4s}, [%x[output]], #16\n"
2851 "prfm pldl1keep, [%x[output]]\n"
2852
2853 "bne 1b\n"
2854 "2:"
2855
2856 // Handle leftovers.
2857
2858 // Quantize::Transform
2859 "ld1 {v0.4s, v1.4s, v2.4s}, [%x[input]], #48\n"
2860 "ld1 {v3.s}[0], [%x[input]], #4\n"
2861 "prfm pldl1keep, [%x[input], #64]\n"
2862 "fsub v0.4s, v0.4s, v4.4s\n"
2863 "fsub v1.4s, v1.4s, v4.4s\n"
2864 "fsub v2.4s, v2.4s, v4.4s\n"
2865 "fsub v3.4s, v3.4s, v4.4s\n"
2866 "fmul v0.4s, v0.4s, v6.4s\n"
2867 "fmul v1.4s, v1.4s, v6.4s\n"
2868 "fmul v2.4s, v2.4s, v6.4s\n"
2869 "fmul v3.4s, v3.4s, v6.4s\n"
2870 "fadd v0.4s, v0.4s, v5.4s\n"
2871 "fadd v1.4s, v1.4s, v5.4s\n"
2872 "fadd v2.4s, v2.4s, v5.4s\n"
2873 "fadd v3.4s, v3.4s, v5.4s\n"
2874 "fcvtzs v0.4s, v0.4s\n"
2875 "fcvtzs v1.4s, v1.4s\n"
2876 "fcvtzs v2.4s, v2.4s\n"
2877 "fcvtzs v3.4s, v3.4s\n"
2878 "sqxtn v0.4h, v0.4s\n"
2879 "sqxtn2 v0.8h, v1.4s\n"
2880 "sqxtn v2.4h, v2.4s\n"
2881 "sqxtn2 v2.8h, v3.4s\n"
2882 "sqxtun v0.8b, v0.8h\n"
2883 "sqxtun2 v0.16b, v2.8h\n"
2884
2885 "st1 {v0.2s}, [%x[output]], #8\n"
2886 "st1 {v0.s}[2], [%x[output]], #4\n"
2887 "st1 {v0.b}[12], [%x[output]], #1\n"
2888 "prfm pldl1keep, [%x[output]]\n"
2889 : [count] "+r"(params_count_copy), [input] "+r"(input),
2890 [output] "+r"(output)
2891 : [range_offset] "r"(params.range_offset),
2892 [range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
2893 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
2894 }
2895
2896 template <>
Transform(const float * input,const Quantize & params,uint8_t * output)2897 inline void Transform1DKernel<float, uint8_t, Quantize, 16, 14>::Transform(
2898 const float* input, const Quantize& params, uint8_t* output) {
2899 #ifdef DEBUG
2900 #ifdef DEBUG_METAGEMM_VERBOSE
2901 std::cout << __FILE__ << "(" << __LINE__
2902 << ") Quantize<float, uint8_t, Quantize, 16, 14>::Transform()"
2903 << std::endl
2904 << std::flush;
2905 #endif
2906 #endif
2907 int params_count_copy = params.count;
2908 asm volatile(
2909
2910 // Quantize::Prepare
2911 "dup v4.4s, %w[range_min]\n"
2912 "dup v5.4s, %w[range_offset]\n"
2913 "dup v6.4s, %w[range_scale]\n"
2914
2915 // Reduce count by leftovers.
2916 "subs %x[count], %x[count], #14\n"
2917 "beq 2f\n"
2918
2919 "1:"
2920 "subs %x[count], %x[count], #16\n"
2921
2922 // Quantize::Transform
2923 "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
2924 "prfm pldl1keep, [%x[input], #64]\n"
2925 "fsub v0.4s, v0.4s, v4.4s\n"
2926 "fsub v1.4s, v1.4s, v4.4s\n"
2927 "fsub v2.4s, v2.4s, v4.4s\n"
2928 "fsub v3.4s, v3.4s, v4.4s\n"
2929 "fmul v0.4s, v0.4s, v6.4s\n"
2930 "fmul v1.4s, v1.4s, v6.4s\n"
2931 "fmul v2.4s, v2.4s, v6.4s\n"
2932 "fmul v3.4s, v3.4s, v6.4s\n"
2933 "fadd v0.4s, v0.4s, v5.4s\n"
2934 "fadd v1.4s, v1.4s, v5.4s\n"
2935 "fadd v2.4s, v2.4s, v5.4s\n"
2936 "fadd v3.4s, v3.4s, v5.4s\n"
2937 "fcvtzs v0.4s, v0.4s\n"
2938 "fcvtzs v1.4s, v1.4s\n"
2939 "fcvtzs v2.4s, v2.4s\n"
2940 "fcvtzs v3.4s, v3.4s\n"
2941 "sqxtn v0.4h, v0.4s\n"
2942 "sqxtn2 v0.8h, v1.4s\n"
2943 "sqxtn v2.4h, v2.4s\n"
2944 "sqxtn2 v2.8h, v3.4s\n"
2945 "sqxtun v0.8b, v0.8h\n"
2946 "sqxtun2 v0.16b, v2.8h\n"
2947
2948 "st1 {v0.4s}, [%x[output]], #16\n"
2949 "prfm pldl1keep, [%x[output]]\n"
2950
2951 "bne 1b\n"
2952 "2:"
2953
2954 // Handle leftovers.
2955
2956 // Quantize::Transform
2957 "ld1 {v0.4s, v1.4s, v2.4s}, [%x[input]], #48\n"
2958 "ld1 {v3.2s}, [%x[input]], #8\n"
2959 "prfm pldl1keep, [%x[input], #64]\n"
2960 "fsub v0.4s, v0.4s, v4.4s\n"
2961 "fsub v1.4s, v1.4s, v4.4s\n"
2962 "fsub v2.4s, v2.4s, v4.4s\n"
2963 "fsub v3.4s, v3.4s, v4.4s\n"
2964 "fmul v0.4s, v0.4s, v6.4s\n"
2965 "fmul v1.4s, v1.4s, v6.4s\n"
2966 "fmul v2.4s, v2.4s, v6.4s\n"
2967 "fmul v3.4s, v3.4s, v6.4s\n"
2968 "fadd v0.4s, v0.4s, v5.4s\n"
2969 "fadd v1.4s, v1.4s, v5.4s\n"
2970 "fadd v2.4s, v2.4s, v5.4s\n"
2971 "fadd v3.4s, v3.4s, v5.4s\n"
2972 "fcvtzs v0.4s, v0.4s\n"
2973 "fcvtzs v1.4s, v1.4s\n"
2974 "fcvtzs v2.4s, v2.4s\n"
2975 "fcvtzs v3.4s, v3.4s\n"
2976 "sqxtn v0.4h, v0.4s\n"
2977 "sqxtn2 v0.8h, v1.4s\n"
2978 "sqxtn v2.4h, v2.4s\n"
2979 "sqxtn2 v2.8h, v3.4s\n"
2980 "sqxtun v0.8b, v0.8h\n"
2981 "sqxtun2 v0.16b, v2.8h\n"
2982
2983 "st1 {v0.2s}, [%x[output]], #8\n"
2984 "st1 {v0.s}[2], [%x[output]], #4\n"
2985 "st1 {v0.h}[6], [%x[output]], #2\n"
2986 "prfm pldl1keep, [%x[output]]\n"
2987 : [count] "+r"(params_count_copy), [input] "+r"(input),
2988 [output] "+r"(output)
2989 : [range_offset] "r"(params.range_offset),
2990 [range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
2991 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
2992 }
2993
2994 template <>
Transform(const float * input,const Quantize & params,uint8_t * output)2995 inline void Transform1DKernel<float, uint8_t, Quantize, 16, 15>::Transform(
2996 const float* input, const Quantize& params, uint8_t* output) {
2997 #ifdef DEBUG
2998 #ifdef DEBUG_METAGEMM_VERBOSE
2999 std::cout << __FILE__ << "(" << __LINE__
3000 << ") Quantize<float, uint8_t, Quantize, 16, 15>::Transform()"
3001 << std::endl
3002 << std::flush;
3003 #endif
3004 #endif
3005 int params_count_copy = params.count;
3006 asm volatile(
3007
3008 // Quantize::Prepare
3009 "dup v4.4s, %w[range_min]\n"
3010 "dup v5.4s, %w[range_offset]\n"
3011 "dup v6.4s, %w[range_scale]\n"
3012
3013 // Reduce count by leftovers.
3014 "subs %x[count], %x[count], #15\n"
3015 "beq 2f\n"
3016
3017 "1:"
3018 "subs %x[count], %x[count], #16\n"
3019
3020 // Quantize::Transform
3021 "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
3022 "prfm pldl1keep, [%x[input], #64]\n"
3023 "fsub v0.4s, v0.4s, v4.4s\n"
3024 "fsub v1.4s, v1.4s, v4.4s\n"
3025 "fsub v2.4s, v2.4s, v4.4s\n"
3026 "fsub v3.4s, v3.4s, v4.4s\n"
3027 "fmul v0.4s, v0.4s, v6.4s\n"
3028 "fmul v1.4s, v1.4s, v6.4s\n"
3029 "fmul v2.4s, v2.4s, v6.4s\n"
3030 "fmul v3.4s, v3.4s, v6.4s\n"
3031 "fadd v0.4s, v0.4s, v5.4s\n"
3032 "fadd v1.4s, v1.4s, v5.4s\n"
3033 "fadd v2.4s, v2.4s, v5.4s\n"
3034 "fadd v3.4s, v3.4s, v5.4s\n"
3035 "fcvtzs v0.4s, v0.4s\n"
3036 "fcvtzs v1.4s, v1.4s\n"
3037 "fcvtzs v2.4s, v2.4s\n"
3038 "fcvtzs v3.4s, v3.4s\n"
3039 "sqxtn v0.4h, v0.4s\n"
3040 "sqxtn2 v0.8h, v1.4s\n"
3041 "sqxtn v2.4h, v2.4s\n"
3042 "sqxtn2 v2.8h, v3.4s\n"
3043 "sqxtun v0.8b, v0.8h\n"
3044 "sqxtun2 v0.16b, v2.8h\n"
3045
3046 "st1 {v0.4s}, [%x[output]], #16\n"
3047 "prfm pldl1keep, [%x[output]]\n"
3048
3049 "bne 1b\n"
3050 "2:"
3051
3052 // Handle leftovers.
3053
3054 // Quantize::Transform
3055 "ld1 {v0.4s, v1.4s, v2.4s}, [%x[input]], #48\n"
3056 "ld1 {v3.2s}, [%x[input]], #8\n"
3057 "ld1 {v3.s}[2], [%x[input]], #4\n"
3058 "prfm pldl1keep, [%x[input], #64]\n"
3059 "fsub v0.4s, v0.4s, v4.4s\n"
3060 "fsub v1.4s, v1.4s, v4.4s\n"
3061 "fsub v2.4s, v2.4s, v4.4s\n"
3062 "fsub v3.4s, v3.4s, v4.4s\n"
3063 "fmul v0.4s, v0.4s, v6.4s\n"
3064 "fmul v1.4s, v1.4s, v6.4s\n"
3065 "fmul v2.4s, v2.4s, v6.4s\n"
3066 "fmul v3.4s, v3.4s, v6.4s\n"
3067 "fadd v0.4s, v0.4s, v5.4s\n"
3068 "fadd v1.4s, v1.4s, v5.4s\n"
3069 "fadd v2.4s, v2.4s, v5.4s\n"
3070 "fadd v3.4s, v3.4s, v5.4s\n"
3071 "fcvtzs v0.4s, v0.4s\n"
3072 "fcvtzs v1.4s, v1.4s\n"
3073 "fcvtzs v2.4s, v2.4s\n"
3074 "fcvtzs v3.4s, v3.4s\n"
3075 "sqxtn v0.4h, v0.4s\n"
3076 "sqxtn2 v0.8h, v1.4s\n"
3077 "sqxtn v2.4h, v2.4s\n"
3078 "sqxtn2 v2.8h, v3.4s\n"
3079 "sqxtun v0.8b, v0.8h\n"
3080 "sqxtun2 v0.16b, v2.8h\n"
3081
3082 "st1 {v0.2s}, [%x[output]], #8\n"
3083 "st1 {v0.s}[2], [%x[output]], #4\n"
3084 "st1 {v0.h}[6], [%x[output]], #2\n"
3085 "st1 {v0.b}[14], [%x[output]], #1\n"
3086 "prfm pldl1keep, [%x[output]]\n"
3087 : [count] "+r"(params_count_copy), [input] "+r"(input),
3088 [output] "+r"(output)
3089 : [range_offset] "r"(params.range_offset),
3090 [range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
3091 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
3092 }
3093
3094 template <>
Transform(const uint8_t * input,const Dequantize & params,float * output)3095 inline void Transform1DKernel<uint8_t, float, Dequantize, 16, 0>::Transform(
3096 const uint8_t* input, const Dequantize& params, float* output) {
3097 #ifdef DEBUG
3098 #ifdef DEBUG_METAGEMM_VERBOSE
3099 std::cout << __FILE__ << "(" << __LINE__
3100 << ") Dequantize<uint8_t, float, Dequantize, 16, 0>::Transform()"
3101 << std::endl
3102 << std::flush;
3103 #endif
3104 #endif
3105 int params_count_copy = params.count;
3106 asm volatile(
3107
3108 // Dequantize::Prepare
3109 "dup v4.4s, %w[range_min]\n"
3110 "dup v5.4s, %w[range_offset]\n"
3111 "dup v6.4s, %w[range_scale]\n"
3112
3113 "1:"
3114 "subs %x[count], %x[count], #16\n"
3115
3116 // Dequantize::Transform
3117 "ld1 {v0.4s}, [%x[input]], #16\n"
3118 "prfm pldl1keep, [%x[input], #32]\n"
3119 "uxtl2 v1.8h, v0.16b\n"
3120 "uxtl v0.8h, v0.8b\n"
3121 "sxtl2 v3.4s, v1.8h\n"
3122 "sxtl v2.4s, v1.4h\n"
3123 "sxtl2 v1.4s, v0.8h\n"
3124 "sxtl v0.4s, v0.4h\n"
3125 "scvtf v0.4s, v0.4s\n"
3126 "scvtf v1.4s, v1.4s\n"
3127 "scvtf v2.4s, v2.4s\n"
3128 "scvtf v3.4s, v3.4s\n"
3129 "fsub v0.4s, v0.4s, v5.4s\n"
3130 "fsub v1.4s, v1.4s, v5.4s\n"
3131 "fsub v2.4s, v2.4s, v5.4s\n"
3132 "fsub v3.4s, v3.4s, v5.4s\n"
3133 "fmul v0.4s, v0.4s, v6.4s\n"
3134 "fmul v1.4s, v1.4s, v6.4s\n"
3135 "fmul v2.4s, v2.4s, v6.4s\n"
3136 "fmul v3.4s, v3.4s, v6.4s\n"
3137 "fadd v0.4s, v0.4s, v4.4s\n"
3138 "fadd v1.4s, v1.4s, v4.4s\n"
3139 "fadd v2.4s, v2.4s, v4.4s\n"
3140 "fadd v3.4s, v3.4s, v4.4s\n"
3141
3142 "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
3143 "prfm pldl1keep, [%x[output]]\n"
3144
3145 "bne 1b\n"
3146 : [count] "+r"(params_count_copy), [input] "+r"(input),
3147 [output] "+r"(output)
3148 : [range_offset] "r"(params.range_offset),
3149 [range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
3150 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
3151 }
3152
3153 template <>
Transform(const uint8_t * input,const Dequantize & params,float * output)3154 inline void Transform1DKernel<uint8_t, float, Dequantize, 16, 1>::Transform(
3155 const uint8_t* input, const Dequantize& params, float* output) {
3156 #ifdef DEBUG
3157 #ifdef DEBUG_METAGEMM_VERBOSE
3158 std::cout << __FILE__ << "(" << __LINE__
3159 << ") Dequantize<uint8_t, float, Dequantize, 16, 1>::Transform()"
3160 << std::endl
3161 << std::flush;
3162 #endif
3163 #endif
3164 int params_count_copy = params.count;
3165 asm volatile(
3166
3167 // Dequantize::Prepare
3168 "dup v4.4s, %w[range_min]\n"
3169 "dup v5.4s, %w[range_offset]\n"
3170 "dup v6.4s, %w[range_scale]\n"
3171
3172 // Reduce count by leftovers.
3173 "subs %x[count], %x[count], #1\n"
3174 "beq 2f\n"
3175
3176 "1:"
3177 "subs %x[count], %x[count], #16\n"
3178
3179 // Dequantize::Transform
3180 "ld1 {v0.4s}, [%x[input]], #16\n"
3181 "prfm pldl1keep, [%x[input], #32]\n"
3182 "uxtl2 v1.8h, v0.16b\n"
3183 "uxtl v0.8h, v0.8b\n"
3184 "sxtl2 v3.4s, v1.8h\n"
3185 "sxtl v2.4s, v1.4h\n"
3186 "sxtl2 v1.4s, v0.8h\n"
3187 "sxtl v0.4s, v0.4h\n"
3188 "scvtf v0.4s, v0.4s\n"
3189 "scvtf v1.4s, v1.4s\n"
3190 "scvtf v2.4s, v2.4s\n"
3191 "scvtf v3.4s, v3.4s\n"
3192 "fsub v0.4s, v0.4s, v5.4s\n"
3193 "fsub v1.4s, v1.4s, v5.4s\n"
3194 "fsub v2.4s, v2.4s, v5.4s\n"
3195 "fsub v3.4s, v3.4s, v5.4s\n"
3196 "fmul v0.4s, v0.4s, v6.4s\n"
3197 "fmul v1.4s, v1.4s, v6.4s\n"
3198 "fmul v2.4s, v2.4s, v6.4s\n"
3199 "fmul v3.4s, v3.4s, v6.4s\n"
3200 "fadd v0.4s, v0.4s, v4.4s\n"
3201 "fadd v1.4s, v1.4s, v4.4s\n"
3202 "fadd v2.4s, v2.4s, v4.4s\n"
3203 "fadd v3.4s, v3.4s, v4.4s\n"
3204
3205 "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
3206 "prfm pldl1keep, [%x[output]]\n"
3207
3208 "bne 1b\n"
3209 "2:"
3210
3211 // Handle leftovers.
3212
3213 // Dequantize::Transform
3214 "ld1 {v0.b}[0], [%x[input]], #1\n"
3215 "prfm pldl1keep, [%x[input], #32]\n"
3216 "uxtl v0.8h, v0.8b\n"
3217 "sxtl v0.4s, v0.4h\n"
3218 "scvtf v0.4s, v0.4s\n"
3219 "fsub v0.4s, v0.4s, v5.4s\n"
3220 "fmul v0.4s, v0.4s, v6.4s\n"
3221 "fadd v0.4s, v0.4s, v4.4s\n"
3222
3223 "st1 {v0.s}[0], [%x[output]], #4\n"
3224 "prfm pldl1keep, [%x[output]]\n"
3225 : [count] "+r"(params_count_copy), [input] "+r"(input),
3226 [output] "+r"(output)
3227 : [range_offset] "r"(params.range_offset),
3228 [range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
3229 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
3230 }
3231
3232 template <>
Transform(const uint8_t * input,const Dequantize & params,float * output)3233 inline void Transform1DKernel<uint8_t, float, Dequantize, 16, 2>::Transform(
3234 const uint8_t* input, const Dequantize& params, float* output) {
3235 #ifdef DEBUG
3236 #ifdef DEBUG_METAGEMM_VERBOSE
3237 std::cout << __FILE__ << "(" << __LINE__
3238 << ") Dequantize<uint8_t, float, Dequantize, 16, 2>::Transform()"
3239 << std::endl
3240 << std::flush;
3241 #endif
3242 #endif
3243 int params_count_copy = params.count;
3244 asm volatile(
3245
3246 // Dequantize::Prepare
3247 "dup v4.4s, %w[range_min]\n"
3248 "dup v5.4s, %w[range_offset]\n"
3249 "dup v6.4s, %w[range_scale]\n"
3250
3251 // Reduce count by leftovers.
3252 "subs %x[count], %x[count], #2\n"
3253 "beq 2f\n"
3254
3255 "1:"
3256 "subs %x[count], %x[count], #16\n"
3257
3258 // Dequantize::Transform
3259 "ld1 {v0.4s}, [%x[input]], #16\n"
3260 "prfm pldl1keep, [%x[input], #32]\n"
3261 "uxtl2 v1.8h, v0.16b\n"
3262 "uxtl v0.8h, v0.8b\n"
3263 "sxtl2 v3.4s, v1.8h\n"
3264 "sxtl v2.4s, v1.4h\n"
3265 "sxtl2 v1.4s, v0.8h\n"
3266 "sxtl v0.4s, v0.4h\n"
3267 "scvtf v0.4s, v0.4s\n"
3268 "scvtf v1.4s, v1.4s\n"
3269 "scvtf v2.4s, v2.4s\n"
3270 "scvtf v3.4s, v3.4s\n"
3271 "fsub v0.4s, v0.4s, v5.4s\n"
3272 "fsub v1.4s, v1.4s, v5.4s\n"
3273 "fsub v2.4s, v2.4s, v5.4s\n"
3274 "fsub v3.4s, v3.4s, v5.4s\n"
3275 "fmul v0.4s, v0.4s, v6.4s\n"
3276 "fmul v1.4s, v1.4s, v6.4s\n"
3277 "fmul v2.4s, v2.4s, v6.4s\n"
3278 "fmul v3.4s, v3.4s, v6.4s\n"
3279 "fadd v0.4s, v0.4s, v4.4s\n"
3280 "fadd v1.4s, v1.4s, v4.4s\n"
3281 "fadd v2.4s, v2.4s, v4.4s\n"
3282 "fadd v3.4s, v3.4s, v4.4s\n"
3283
3284 "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
3285 "prfm pldl1keep, [%x[output]]\n"
3286
3287 "bne 1b\n"
3288 "2:"
3289
3290 // Handle leftovers.
3291
3292 // Dequantize::Transform
3293 "ld1 {v0.h}[0], [%x[input]], #2\n"
3294 "prfm pldl1keep, [%x[input], #32]\n"
3295 "uxtl v0.8h, v0.8b\n"
3296 "sxtl v0.4s, v0.4h\n"
3297 "scvtf v0.4s, v0.4s\n"
3298 "fsub v0.4s, v0.4s, v5.4s\n"
3299 "fmul v0.4s, v0.4s, v6.4s\n"
3300 "fadd v0.4s, v0.4s, v4.4s\n"
3301
3302 "st1 {v0.2s}, [%x[output]], #8\n"
3303 "prfm pldl1keep, [%x[output]]\n"
3304 : [count] "+r"(params_count_copy), [input] "+r"(input),
3305 [output] "+r"(output)
3306 : [range_offset] "r"(params.range_offset),
3307 [range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
3308 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
3309 }
3310
3311 template <>
Transform(const uint8_t * input,const Dequantize & params,float * output)3312 inline void Transform1DKernel<uint8_t, float, Dequantize, 16, 3>::Transform(
3313 const uint8_t* input, const Dequantize& params, float* output) {
3314 #ifdef DEBUG
3315 #ifdef DEBUG_METAGEMM_VERBOSE
3316 std::cout << __FILE__ << "(" << __LINE__
3317 << ") Dequantize<uint8_t, float, Dequantize, 16, 3>::Transform()"
3318 << std::endl
3319 << std::flush;
3320 #endif
3321 #endif
3322 int params_count_copy = params.count;
3323 asm volatile(
3324
3325 // Dequantize::Prepare
3326 "dup v4.4s, %w[range_min]\n"
3327 "dup v5.4s, %w[range_offset]\n"
3328 "dup v6.4s, %w[range_scale]\n"
3329
3330 // Reduce count by leftovers.
3331 "subs %x[count], %x[count], #3\n"
3332 "beq 2f\n"
3333
3334 "1:"
3335 "subs %x[count], %x[count], #16\n"
3336
3337 // Dequantize::Transform
3338 "ld1 {v0.4s}, [%x[input]], #16\n"
3339 "prfm pldl1keep, [%x[input], #32]\n"
3340 "uxtl2 v1.8h, v0.16b\n"
3341 "uxtl v0.8h, v0.8b\n"
3342 "sxtl2 v3.4s, v1.8h\n"
3343 "sxtl v2.4s, v1.4h\n"
3344 "sxtl2 v1.4s, v0.8h\n"
3345 "sxtl v0.4s, v0.4h\n"
3346 "scvtf v0.4s, v0.4s\n"
3347 "scvtf v1.4s, v1.4s\n"
3348 "scvtf v2.4s, v2.4s\n"
3349 "scvtf v3.4s, v3.4s\n"
3350 "fsub v0.4s, v0.4s, v5.4s\n"
3351 "fsub v1.4s, v1.4s, v5.4s\n"
3352 "fsub v2.4s, v2.4s, v5.4s\n"
3353 "fsub v3.4s, v3.4s, v5.4s\n"
3354 "fmul v0.4s, v0.4s, v6.4s\n"
3355 "fmul v1.4s, v1.4s, v6.4s\n"
3356 "fmul v2.4s, v2.4s, v6.4s\n"
3357 "fmul v3.4s, v3.4s, v6.4s\n"
3358 "fadd v0.4s, v0.4s, v4.4s\n"
3359 "fadd v1.4s, v1.4s, v4.4s\n"
3360 "fadd v2.4s, v2.4s, v4.4s\n"
3361 "fadd v3.4s, v3.4s, v4.4s\n"
3362
3363 "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
3364 "prfm pldl1keep, [%x[output]]\n"
3365
3366 "bne 1b\n"
3367 "2:"
3368
3369 // Handle leftovers.
3370
3371 // Dequantize::Transform
3372 "ld1 {v0.h}[0], [%x[input]], #2\n"
3373 "ld1 {v0.b}[2], [%x[input]], #1\n"
3374 "prfm pldl1keep, [%x[input], #32]\n"
3375 "uxtl v0.8h, v0.8b\n"
3376 "sxtl v0.4s, v0.4h\n"
3377 "scvtf v0.4s, v0.4s\n"
3378 "fsub v0.4s, v0.4s, v5.4s\n"
3379 "fmul v0.4s, v0.4s, v6.4s\n"
3380 "fadd v0.4s, v0.4s, v4.4s\n"
3381
3382 "st1 {v0.2s}, [%x[output]], #8\n"
3383 "st1 {v0.s}[2], [%x[output]], #4\n"
3384 "prfm pldl1keep, [%x[output]]\n"
3385 : [count] "+r"(params_count_copy), [input] "+r"(input),
3386 [output] "+r"(output)
3387 : [range_offset] "r"(params.range_offset),
3388 [range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
3389 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
3390 }
3391
3392 template <>
Transform(const uint8_t * input,const Dequantize & params,float * output)3393 inline void Transform1DKernel<uint8_t, float, Dequantize, 16, 4>::Transform(
3394 const uint8_t* input, const Dequantize& params, float* output) {
3395 #ifdef DEBUG
3396 #ifdef DEBUG_METAGEMM_VERBOSE
3397 std::cout << __FILE__ << "(" << __LINE__
3398 << ") Dequantize<uint8_t, float, Dequantize, 16, 4>::Transform()"
3399 << std::endl
3400 << std::flush;
3401 #endif
3402 #endif
3403 int params_count_copy = params.count;
3404 asm volatile(
3405
3406 // Dequantize::Prepare
3407 "dup v4.4s, %w[range_min]\n"
3408 "dup v5.4s, %w[range_offset]\n"
3409 "dup v6.4s, %w[range_scale]\n"
3410
3411 // Reduce count by leftovers.
3412 "subs %x[count], %x[count], #4\n"
3413 "beq 2f\n"
3414
3415 "1:"
3416 "subs %x[count], %x[count], #16\n"
3417
3418 // Dequantize::Transform
3419 "ld1 {v0.4s}, [%x[input]], #16\n"
3420 "prfm pldl1keep, [%x[input], #32]\n"
3421 "uxtl2 v1.8h, v0.16b\n"
3422 "uxtl v0.8h, v0.8b\n"
3423 "sxtl2 v3.4s, v1.8h\n"
3424 "sxtl v2.4s, v1.4h\n"
3425 "sxtl2 v1.4s, v0.8h\n"
3426 "sxtl v0.4s, v0.4h\n"
3427 "scvtf v0.4s, v0.4s\n"
3428 "scvtf v1.4s, v1.4s\n"
3429 "scvtf v2.4s, v2.4s\n"
3430 "scvtf v3.4s, v3.4s\n"
3431 "fsub v0.4s, v0.4s, v5.4s\n"
3432 "fsub v1.4s, v1.4s, v5.4s\n"
3433 "fsub v2.4s, v2.4s, v5.4s\n"
3434 "fsub v3.4s, v3.4s, v5.4s\n"
3435 "fmul v0.4s, v0.4s, v6.4s\n"
3436 "fmul v1.4s, v1.4s, v6.4s\n"
3437 "fmul v2.4s, v2.4s, v6.4s\n"
3438 "fmul v3.4s, v3.4s, v6.4s\n"
3439 "fadd v0.4s, v0.4s, v4.4s\n"
3440 "fadd v1.4s, v1.4s, v4.4s\n"
3441 "fadd v2.4s, v2.4s, v4.4s\n"
3442 "fadd v3.4s, v3.4s, v4.4s\n"
3443
3444 "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
3445 "prfm pldl1keep, [%x[output]]\n"
3446
3447 "bne 1b\n"
3448 "2:"
3449
3450 // Handle leftovers.
3451
3452 // Dequantize::Transform
3453 "ld1 {v0.s}[0], [%x[input]], #4\n"
3454 "prfm pldl1keep, [%x[input], #32]\n"
3455 "uxtl v0.8h, v0.8b\n"
3456 "sxtl v0.4s, v0.4h\n"
3457 "scvtf v0.4s, v0.4s\n"
3458 "fsub v0.4s, v0.4s, v5.4s\n"
3459 "fmul v0.4s, v0.4s, v6.4s\n"
3460 "fadd v0.4s, v0.4s, v4.4s\n"
3461
3462 "st1 {v0.4s}, [%x[output]], #16\n"
3463 "prfm pldl1keep, [%x[output]]\n"
3464 : [count] "+r"(params_count_copy), [input] "+r"(input),
3465 [output] "+r"(output)
3466 : [range_offset] "r"(params.range_offset),
3467 [range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
3468 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
3469 }
3470
3471 template <>
Transform(const uint8_t * input,const Dequantize & params,float * output)3472 inline void Transform1DKernel<uint8_t, float, Dequantize, 16, 5>::Transform(
3473 const uint8_t* input, const Dequantize& params, float* output) {
3474 #ifdef DEBUG
3475 #ifdef DEBUG_METAGEMM_VERBOSE
3476 std::cout << __FILE__ << "(" << __LINE__
3477 << ") Dequantize<uint8_t, float, Dequantize, 16, 5>::Transform()"
3478 << std::endl
3479 << std::flush;
3480 #endif
3481 #endif
3482 int params_count_copy = params.count;
3483 asm volatile(
3484
3485 // Dequantize::Prepare
3486 "dup v4.4s, %w[range_min]\n"
3487 "dup v5.4s, %w[range_offset]\n"
3488 "dup v6.4s, %w[range_scale]\n"
3489
3490 // Reduce count by leftovers.
3491 "subs %x[count], %x[count], #5\n"
3492 "beq 2f\n"
3493
3494 "1:"
3495 "subs %x[count], %x[count], #16\n"
3496
3497 // Dequantize::Transform
3498 "ld1 {v0.4s}, [%x[input]], #16\n"
3499 "prfm pldl1keep, [%x[input], #32]\n"
3500 "uxtl2 v1.8h, v0.16b\n"
3501 "uxtl v0.8h, v0.8b\n"
3502 "sxtl2 v3.4s, v1.8h\n"
3503 "sxtl v2.4s, v1.4h\n"
3504 "sxtl2 v1.4s, v0.8h\n"
3505 "sxtl v0.4s, v0.4h\n"
3506 "scvtf v0.4s, v0.4s\n"
3507 "scvtf v1.4s, v1.4s\n"
3508 "scvtf v2.4s, v2.4s\n"
3509 "scvtf v3.4s, v3.4s\n"
3510 "fsub v0.4s, v0.4s, v5.4s\n"
3511 "fsub v1.4s, v1.4s, v5.4s\n"
3512 "fsub v2.4s, v2.4s, v5.4s\n"
3513 "fsub v3.4s, v3.4s, v5.4s\n"
3514 "fmul v0.4s, v0.4s, v6.4s\n"
3515 "fmul v1.4s, v1.4s, v6.4s\n"
3516 "fmul v2.4s, v2.4s, v6.4s\n"
3517 "fmul v3.4s, v3.4s, v6.4s\n"
3518 "fadd v0.4s, v0.4s, v4.4s\n"
3519 "fadd v1.4s, v1.4s, v4.4s\n"
3520 "fadd v2.4s, v2.4s, v4.4s\n"
3521 "fadd v3.4s, v3.4s, v4.4s\n"
3522
3523 "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
3524 "prfm pldl1keep, [%x[output]]\n"
3525
3526 "bne 1b\n"
3527 "2:"
3528
3529 // Handle leftovers.
3530
3531 // Dequantize::Transform
3532 "ld1 {v0.s}[0], [%x[input]], #4\n"
3533 "ld1 {v0.b}[4], [%x[input]], #1\n"
3534 "prfm pldl1keep, [%x[input], #32]\n"
3535 "uxtl v0.8h, v0.8b\n"
3536 "sxtl2 v1.4s, v0.8h\n"
3537 "sxtl v0.4s, v0.4h\n"
3538 "scvtf v0.4s, v0.4s\n"
3539 "scvtf v1.4s, v1.4s\n"
3540 "fsub v0.4s, v0.4s, v5.4s\n"
3541 "fsub v1.4s, v1.4s, v5.4s\n"
3542 "fmul v0.4s, v0.4s, v6.4s\n"
3543 "fmul v1.4s, v1.4s, v6.4s\n"
3544 "fadd v0.4s, v0.4s, v4.4s\n"
3545 "fadd v1.4s, v1.4s, v4.4s\n"
3546
3547 "st1 {v0.4s}, [%x[output]], #16\n"
3548 "st1 {v1.s}[0], [%x[output]], #4\n"
3549 "prfm pldl1keep, [%x[output]]\n"
3550 : [count] "+r"(params_count_copy), [input] "+r"(input),
3551 [output] "+r"(output)
3552 : [range_offset] "r"(params.range_offset),
3553 [range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
3554 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
3555 }
3556
3557 template <>
Transform(const uint8_t * input,const Dequantize & params,float * output)3558 inline void Transform1DKernel<uint8_t, float, Dequantize, 16, 6>::Transform(
3559 const uint8_t* input, const Dequantize& params, float* output) {
3560 #ifdef DEBUG
3561 #ifdef DEBUG_METAGEMM_VERBOSE
3562 std::cout << __FILE__ << "(" << __LINE__
3563 << ") Dequantize<uint8_t, float, Dequantize, 16, 6>::Transform()"
3564 << std::endl
3565 << std::flush;
3566 #endif
3567 #endif
3568 int params_count_copy = params.count;
3569 asm volatile(
3570
3571 // Dequantize::Prepare
3572 "dup v4.4s, %w[range_min]\n"
3573 "dup v5.4s, %w[range_offset]\n"
3574 "dup v6.4s, %w[range_scale]\n"
3575
3576 // Reduce count by leftovers.
3577 "subs %x[count], %x[count], #6\n"
3578 "beq 2f\n"
3579
3580 "1:"
3581 "subs %x[count], %x[count], #16\n"
3582
3583 // Dequantize::Transform
3584 "ld1 {v0.4s}, [%x[input]], #16\n"
3585 "prfm pldl1keep, [%x[input], #32]\n"
3586 "uxtl2 v1.8h, v0.16b\n"
3587 "uxtl v0.8h, v0.8b\n"
3588 "sxtl2 v3.4s, v1.8h\n"
3589 "sxtl v2.4s, v1.4h\n"
3590 "sxtl2 v1.4s, v0.8h\n"
3591 "sxtl v0.4s, v0.4h\n"
3592 "scvtf v0.4s, v0.4s\n"
3593 "scvtf v1.4s, v1.4s\n"
3594 "scvtf v2.4s, v2.4s\n"
3595 "scvtf v3.4s, v3.4s\n"
3596 "fsub v0.4s, v0.4s, v5.4s\n"
3597 "fsub v1.4s, v1.4s, v5.4s\n"
3598 "fsub v2.4s, v2.4s, v5.4s\n"
3599 "fsub v3.4s, v3.4s, v5.4s\n"
3600 "fmul v0.4s, v0.4s, v6.4s\n"
3601 "fmul v1.4s, v1.4s, v6.4s\n"
3602 "fmul v2.4s, v2.4s, v6.4s\n"
3603 "fmul v3.4s, v3.4s, v6.4s\n"
3604 "fadd v0.4s, v0.4s, v4.4s\n"
3605 "fadd v1.4s, v1.4s, v4.4s\n"
3606 "fadd v2.4s, v2.4s, v4.4s\n"
3607 "fadd v3.4s, v3.4s, v4.4s\n"
3608
3609 "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
3610 "prfm pldl1keep, [%x[output]]\n"
3611
3612 "bne 1b\n"
3613 "2:"
3614
3615 // Handle leftovers.
3616
3617 // Dequantize::Transform
3618 "ld1 {v0.s}[0], [%x[input]], #4\n"
3619 "ld1 {v0.h}[2], [%x[input]], #2\n"
3620 "prfm pldl1keep, [%x[input], #32]\n"
3621 "uxtl v0.8h, v0.8b\n"
3622 "sxtl2 v1.4s, v0.8h\n"
3623 "sxtl v0.4s, v0.4h\n"
3624 "scvtf v0.4s, v0.4s\n"
3625 "scvtf v1.4s, v1.4s\n"
3626 "fsub v0.4s, v0.4s, v5.4s\n"
3627 "fsub v1.4s, v1.4s, v5.4s\n"
3628 "fmul v0.4s, v0.4s, v6.4s\n"
3629 "fmul v1.4s, v1.4s, v6.4s\n"
3630 "fadd v0.4s, v0.4s, v4.4s\n"
3631 "fadd v1.4s, v1.4s, v4.4s\n"
3632
3633 "st1 {v0.4s}, [%x[output]], #16\n"
3634 "st1 {v1.2s}, [%x[output]], #8\n"
3635 "prfm pldl1keep, [%x[output]]\n"
3636 : [count] "+r"(params_count_copy), [input] "+r"(input),
3637 [output] "+r"(output)
3638 : [range_offset] "r"(params.range_offset),
3639 [range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
3640 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
3641 }
3642
3643 template <>
Transform(const uint8_t * input,const Dequantize & params,float * output)3644 inline void Transform1DKernel<uint8_t, float, Dequantize, 16, 7>::Transform(
3645 const uint8_t* input, const Dequantize& params, float* output) {
3646 #ifdef DEBUG
3647 #ifdef DEBUG_METAGEMM_VERBOSE
3648 std::cout << __FILE__ << "(" << __LINE__
3649 << ") Dequantize<uint8_t, float, Dequantize, 16, 7>::Transform()"
3650 << std::endl
3651 << std::flush;
3652 #endif
3653 #endif
3654 int params_count_copy = params.count;
3655 asm volatile(
3656
3657 // Dequantize::Prepare
3658 "dup v4.4s, %w[range_min]\n"
3659 "dup v5.4s, %w[range_offset]\n"
3660 "dup v6.4s, %w[range_scale]\n"
3661
3662 // Reduce count by leftovers.
3663 "subs %x[count], %x[count], #7\n"
3664 "beq 2f\n"
3665
3666 "1:"
3667 "subs %x[count], %x[count], #16\n"
3668
3669 // Dequantize::Transform
3670 "ld1 {v0.4s}, [%x[input]], #16\n"
3671 "prfm pldl1keep, [%x[input], #32]\n"
3672 "uxtl2 v1.8h, v0.16b\n"
3673 "uxtl v0.8h, v0.8b\n"
3674 "sxtl2 v3.4s, v1.8h\n"
3675 "sxtl v2.4s, v1.4h\n"
3676 "sxtl2 v1.4s, v0.8h\n"
3677 "sxtl v0.4s, v0.4h\n"
3678 "scvtf v0.4s, v0.4s\n"
3679 "scvtf v1.4s, v1.4s\n"
3680 "scvtf v2.4s, v2.4s\n"
3681 "scvtf v3.4s, v3.4s\n"
3682 "fsub v0.4s, v0.4s, v5.4s\n"
3683 "fsub v1.4s, v1.4s, v5.4s\n"
3684 "fsub v2.4s, v2.4s, v5.4s\n"
3685 "fsub v3.4s, v3.4s, v5.4s\n"
3686 "fmul v0.4s, v0.4s, v6.4s\n"
3687 "fmul v1.4s, v1.4s, v6.4s\n"
3688 "fmul v2.4s, v2.4s, v6.4s\n"
3689 "fmul v3.4s, v3.4s, v6.4s\n"
3690 "fadd v0.4s, v0.4s, v4.4s\n"
3691 "fadd v1.4s, v1.4s, v4.4s\n"
3692 "fadd v2.4s, v2.4s, v4.4s\n"
3693 "fadd v3.4s, v3.4s, v4.4s\n"
3694
3695 "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
3696 "prfm pldl1keep, [%x[output]]\n"
3697
3698 "bne 1b\n"
3699 "2:"
3700
3701 // Handle leftovers.
3702
3703 // Dequantize::Transform
3704 "ld1 {v0.s}[0], [%x[input]], #4\n"
3705 "ld1 {v0.h}[2], [%x[input]], #2\n"
3706 "ld1 {v0.b}[6], [%x[input]], #1\n"
3707 "prfm pldl1keep, [%x[input], #32]\n"
3708 "uxtl v0.8h, v0.8b\n"
3709 "sxtl2 v1.4s, v0.8h\n"
3710 "sxtl v0.4s, v0.4h\n"
3711 "scvtf v0.4s, v0.4s\n"
3712 "scvtf v1.4s, v1.4s\n"
3713 "fsub v0.4s, v0.4s, v5.4s\n"
3714 "fsub v1.4s, v1.4s, v5.4s\n"
3715 "fmul v0.4s, v0.4s, v6.4s\n"
3716 "fmul v1.4s, v1.4s, v6.4s\n"
3717 "fadd v0.4s, v0.4s, v4.4s\n"
3718 "fadd v1.4s, v1.4s, v4.4s\n"
3719
3720 "st1 {v0.4s}, [%x[output]], #16\n"
3721 "st1 {v1.2s}, [%x[output]], #8\n"
3722 "st1 {v1.s}[2], [%x[output]], #4\n"
3723 "prfm pldl1keep, [%x[output]]\n"
3724 : [count] "+r"(params_count_copy), [input] "+r"(input),
3725 [output] "+r"(output)
3726 : [range_offset] "r"(params.range_offset),
3727 [range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
3728 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
3729 }
3730
3731 template <>
Transform(const uint8_t * input,const Dequantize & params,float * output)3732 inline void Transform1DKernel<uint8_t, float, Dequantize, 16, 8>::Transform(
3733 const uint8_t* input, const Dequantize& params, float* output) {
3734 #ifdef DEBUG
3735 #ifdef DEBUG_METAGEMM_VERBOSE
3736 std::cout << __FILE__ << "(" << __LINE__
3737 << ") Dequantize<uint8_t, float, Dequantize, 16, 8>::Transform()"
3738 << std::endl
3739 << std::flush;
3740 #endif
3741 #endif
3742 int params_count_copy = params.count;
3743 asm volatile(
3744
3745 // Dequantize::Prepare
3746 "dup v4.4s, %w[range_min]\n"
3747 "dup v5.4s, %w[range_offset]\n"
3748 "dup v6.4s, %w[range_scale]\n"
3749
3750 // Reduce count by leftovers.
3751 "subs %x[count], %x[count], #8\n"
3752 "beq 2f\n"
3753
3754 "1:"
3755 "subs %x[count], %x[count], #16\n"
3756
3757 // Dequantize::Transform
3758 "ld1 {v0.4s}, [%x[input]], #16\n"
3759 "prfm pldl1keep, [%x[input], #32]\n"
3760 "uxtl2 v1.8h, v0.16b\n"
3761 "uxtl v0.8h, v0.8b\n"
3762 "sxtl2 v3.4s, v1.8h\n"
3763 "sxtl v2.4s, v1.4h\n"
3764 "sxtl2 v1.4s, v0.8h\n"
3765 "sxtl v0.4s, v0.4h\n"
3766 "scvtf v0.4s, v0.4s\n"
3767 "scvtf v1.4s, v1.4s\n"
3768 "scvtf v2.4s, v2.4s\n"
3769 "scvtf v3.4s, v3.4s\n"
3770 "fsub v0.4s, v0.4s, v5.4s\n"
3771 "fsub v1.4s, v1.4s, v5.4s\n"
3772 "fsub v2.4s, v2.4s, v5.4s\n"
3773 "fsub v3.4s, v3.4s, v5.4s\n"
3774 "fmul v0.4s, v0.4s, v6.4s\n"
3775 "fmul v1.4s, v1.4s, v6.4s\n"
3776 "fmul v2.4s, v2.4s, v6.4s\n"
3777 "fmul v3.4s, v3.4s, v6.4s\n"
3778 "fadd v0.4s, v0.4s, v4.4s\n"
3779 "fadd v1.4s, v1.4s, v4.4s\n"
3780 "fadd v2.4s, v2.4s, v4.4s\n"
3781 "fadd v3.4s, v3.4s, v4.4s\n"
3782
3783 "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
3784 "prfm pldl1keep, [%x[output]]\n"
3785
3786 "bne 1b\n"
3787 "2:"
3788
3789 // Handle leftovers.
3790
3791 // Dequantize::Transform
3792 "ld1 {v0.2s}, [%x[input]], #8\n"
3793 "prfm pldl1keep, [%x[input], #32]\n"
3794 "uxtl v0.8h, v0.8b\n"
3795 "sxtl2 v1.4s, v0.8h\n"
3796 "sxtl v0.4s, v0.4h\n"
3797 "scvtf v0.4s, v0.4s\n"
3798 "scvtf v1.4s, v1.4s\n"
3799 "fsub v0.4s, v0.4s, v5.4s\n"
3800 "fsub v1.4s, v1.4s, v5.4s\n"
3801 "fmul v0.4s, v0.4s, v6.4s\n"
3802 "fmul v1.4s, v1.4s, v6.4s\n"
3803 "fadd v0.4s, v0.4s, v4.4s\n"
3804 "fadd v1.4s, v1.4s, v4.4s\n"
3805
3806 "st1 {v0.4s, v1.4s}, [%x[output]], #32\n"
3807 "prfm pldl1keep, [%x[output]]\n"
3808 : [count] "+r"(params_count_copy), [input] "+r"(input),
3809 [output] "+r"(output)
3810 : [range_offset] "r"(params.range_offset),
3811 [range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
3812 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
3813 }
3814
3815 template <>
Transform(const uint8_t * input,const Dequantize & params,float * output)3816 inline void Transform1DKernel<uint8_t, float, Dequantize, 16, 9>::Transform(
3817 const uint8_t* input, const Dequantize& params, float* output) {
3818 #ifdef DEBUG
3819 #ifdef DEBUG_METAGEMM_VERBOSE
3820 std::cout << __FILE__ << "(" << __LINE__
3821 << ") Dequantize<uint8_t, float, Dequantize, 16, 9>::Transform()"
3822 << std::endl
3823 << std::flush;
3824 #endif
3825 #endif
3826 int params_count_copy = params.count;
3827 asm volatile(
3828
3829 // Dequantize::Prepare
3830 "dup v4.4s, %w[range_min]\n"
3831 "dup v5.4s, %w[range_offset]\n"
3832 "dup v6.4s, %w[range_scale]\n"
3833
3834 // Reduce count by leftovers.
3835 "subs %x[count], %x[count], #9\n"
3836 "beq 2f\n"
3837
3838 "1:"
3839 "subs %x[count], %x[count], #16\n"
3840
3841 // Dequantize::Transform
3842 "ld1 {v0.4s}, [%x[input]], #16\n"
3843 "prfm pldl1keep, [%x[input], #32]\n"
3844 "uxtl2 v1.8h, v0.16b\n"
3845 "uxtl v0.8h, v0.8b\n"
3846 "sxtl2 v3.4s, v1.8h\n"
3847 "sxtl v2.4s, v1.4h\n"
3848 "sxtl2 v1.4s, v0.8h\n"
3849 "sxtl v0.4s, v0.4h\n"
3850 "scvtf v0.4s, v0.4s\n"
3851 "scvtf v1.4s, v1.4s\n"
3852 "scvtf v2.4s, v2.4s\n"
3853 "scvtf v3.4s, v3.4s\n"
3854 "fsub v0.4s, v0.4s, v5.4s\n"
3855 "fsub v1.4s, v1.4s, v5.4s\n"
3856 "fsub v2.4s, v2.4s, v5.4s\n"
3857 "fsub v3.4s, v3.4s, v5.4s\n"
3858 "fmul v0.4s, v0.4s, v6.4s\n"
3859 "fmul v1.4s, v1.4s, v6.4s\n"
3860 "fmul v2.4s, v2.4s, v6.4s\n"
3861 "fmul v3.4s, v3.4s, v6.4s\n"
3862 "fadd v0.4s, v0.4s, v4.4s\n"
3863 "fadd v1.4s, v1.4s, v4.4s\n"
3864 "fadd v2.4s, v2.4s, v4.4s\n"
3865 "fadd v3.4s, v3.4s, v4.4s\n"
3866
3867 "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
3868 "prfm pldl1keep, [%x[output]]\n"
3869
3870 "bne 1b\n"
3871 "2:"
3872
3873 // Handle leftovers.
3874
3875 // Dequantize::Transform
3876 "ld1 {v0.2s}, [%x[input]], #8\n"
3877 "ld1 {v0.b}[8], [%x[input]], #1\n"
3878 "prfm pldl1keep, [%x[input], #32]\n"
3879 "uxtl2 v1.8h, v0.16b\n"
3880 "uxtl v0.8h, v0.8b\n"
3881 "sxtl v2.4s, v1.4h\n"
3882 "sxtl2 v1.4s, v0.8h\n"
3883 "sxtl v0.4s, v0.4h\n"
3884 "scvtf v0.4s, v0.4s\n"
3885 "scvtf v1.4s, v1.4s\n"
3886 "scvtf v2.4s, v2.4s\n"
3887 "fsub v0.4s, v0.4s, v5.4s\n"
3888 "fsub v1.4s, v1.4s, v5.4s\n"
3889 "fsub v2.4s, v2.4s, v5.4s\n"
3890 "fmul v0.4s, v0.4s, v6.4s\n"
3891 "fmul v1.4s, v1.4s, v6.4s\n"
3892 "fmul v2.4s, v2.4s, v6.4s\n"
3893 "fadd v0.4s, v0.4s, v4.4s\n"
3894 "fadd v1.4s, v1.4s, v4.4s\n"
3895 "fadd v2.4s, v2.4s, v4.4s\n"
3896
3897 "st1 {v0.4s, v1.4s}, [%x[output]], #32\n"
3898 "st1 {v2.s}[0], [%x[output]], #4\n"
3899 "prfm pldl1keep, [%x[output]]\n"
3900 : [count] "+r"(params_count_copy), [input] "+r"(input),
3901 [output] "+r"(output)
3902 : [range_offset] "r"(params.range_offset),
3903 [range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
3904 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
3905 }
3906
3907 template <>
Transform(const uint8_t * input,const Dequantize & params,float * output)3908 inline void Transform1DKernel<uint8_t, float, Dequantize, 16, 10>::Transform(
3909 const uint8_t* input, const Dequantize& params, float* output) {
3910 #ifdef DEBUG
3911 #ifdef DEBUG_METAGEMM_VERBOSE
3912 std::cout << __FILE__ << "(" << __LINE__
3913 << ") Dequantize<uint8_t, float, Dequantize, 16, 10>::Transform()"
3914 << std::endl
3915 << std::flush;
3916 #endif
3917 #endif
3918 int params_count_copy = params.count;
3919 asm volatile(
3920
3921 // Dequantize::Prepare
3922 "dup v4.4s, %w[range_min]\n"
3923 "dup v5.4s, %w[range_offset]\n"
3924 "dup v6.4s, %w[range_scale]\n"
3925
3926 // Reduce count by leftovers.
3927 "subs %x[count], %x[count], #10\n"
3928 "beq 2f\n"
3929
3930 "1:"
3931 "subs %x[count], %x[count], #16\n"
3932
3933 // Dequantize::Transform
3934 "ld1 {v0.4s}, [%x[input]], #16\n"
3935 "prfm pldl1keep, [%x[input], #32]\n"
3936 "uxtl2 v1.8h, v0.16b\n"
3937 "uxtl v0.8h, v0.8b\n"
3938 "sxtl2 v3.4s, v1.8h\n"
3939 "sxtl v2.4s, v1.4h\n"
3940 "sxtl2 v1.4s, v0.8h\n"
3941 "sxtl v0.4s, v0.4h\n"
3942 "scvtf v0.4s, v0.4s\n"
3943 "scvtf v1.4s, v1.4s\n"
3944 "scvtf v2.4s, v2.4s\n"
3945 "scvtf v3.4s, v3.4s\n"
3946 "fsub v0.4s, v0.4s, v5.4s\n"
3947 "fsub v1.4s, v1.4s, v5.4s\n"
3948 "fsub v2.4s, v2.4s, v5.4s\n"
3949 "fsub v3.4s, v3.4s, v5.4s\n"
3950 "fmul v0.4s, v0.4s, v6.4s\n"
3951 "fmul v1.4s, v1.4s, v6.4s\n"
3952 "fmul v2.4s, v2.4s, v6.4s\n"
3953 "fmul v3.4s, v3.4s, v6.4s\n"
3954 "fadd v0.4s, v0.4s, v4.4s\n"
3955 "fadd v1.4s, v1.4s, v4.4s\n"
3956 "fadd v2.4s, v2.4s, v4.4s\n"
3957 "fadd v3.4s, v3.4s, v4.4s\n"
3958
3959 "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
3960 "prfm pldl1keep, [%x[output]]\n"
3961
3962 "bne 1b\n"
3963 "2:"
3964
3965 // Handle leftovers.
3966
3967 // Dequantize::Transform
3968 "ld1 {v0.2s}, [%x[input]], #8\n"
3969 "ld1 {v0.h}[4], [%x[input]], #2\n"
3970 "prfm pldl1keep, [%x[input], #32]\n"
3971 "uxtl2 v1.8h, v0.16b\n"
3972 "uxtl v0.8h, v0.8b\n"
3973 "sxtl v2.4s, v1.4h\n"
3974 "sxtl2 v1.4s, v0.8h\n"
3975 "sxtl v0.4s, v0.4h\n"
3976 "scvtf v0.4s, v0.4s\n"
3977 "scvtf v1.4s, v1.4s\n"
3978 "scvtf v2.4s, v2.4s\n"
3979 "fsub v0.4s, v0.4s, v5.4s\n"
3980 "fsub v1.4s, v1.4s, v5.4s\n"
3981 "fsub v2.4s, v2.4s, v5.4s\n"
3982 "fmul v0.4s, v0.4s, v6.4s\n"
3983 "fmul v1.4s, v1.4s, v6.4s\n"
3984 "fmul v2.4s, v2.4s, v6.4s\n"
3985 "fadd v0.4s, v0.4s, v4.4s\n"
3986 "fadd v1.4s, v1.4s, v4.4s\n"
3987 "fadd v2.4s, v2.4s, v4.4s\n"
3988
3989 "st1 {v0.4s, v1.4s}, [%x[output]], #32\n"
3990 "st1 {v2.2s}, [%x[output]], #8\n"
3991 "prfm pldl1keep, [%x[output]]\n"
3992 : [count] "+r"(params_count_copy), [input] "+r"(input),
3993 [output] "+r"(output)
3994 : [range_offset] "r"(params.range_offset),
3995 [range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
3996 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
3997 }
3998
3999 template <>
Transform(const uint8_t * input,const Dequantize & params,float * output)4000 inline void Transform1DKernel<uint8_t, float, Dequantize, 16, 11>::Transform(
4001 const uint8_t* input, const Dequantize& params, float* output) {
4002 #ifdef DEBUG
4003 #ifdef DEBUG_METAGEMM_VERBOSE
4004 std::cout << __FILE__ << "(" << __LINE__
4005 << ") Dequantize<uint8_t, float, Dequantize, 16, 11>::Transform()"
4006 << std::endl
4007 << std::flush;
4008 #endif
4009 #endif
4010 int params_count_copy = params.count;
4011 asm volatile(
4012
4013 // Dequantize::Prepare
4014 "dup v4.4s, %w[range_min]\n"
4015 "dup v5.4s, %w[range_offset]\n"
4016 "dup v6.4s, %w[range_scale]\n"
4017
4018 // Reduce count by leftovers.
4019 "subs %x[count], %x[count], #11\n"
4020 "beq 2f\n"
4021
4022 "1:"
4023 "subs %x[count], %x[count], #16\n"
4024
4025 // Dequantize::Transform
4026 "ld1 {v0.4s}, [%x[input]], #16\n"
4027 "prfm pldl1keep, [%x[input], #32]\n"
4028 "uxtl2 v1.8h, v0.16b\n"
4029 "uxtl v0.8h, v0.8b\n"
4030 "sxtl2 v3.4s, v1.8h\n"
4031 "sxtl v2.4s, v1.4h\n"
4032 "sxtl2 v1.4s, v0.8h\n"
4033 "sxtl v0.4s, v0.4h\n"
4034 "scvtf v0.4s, v0.4s\n"
4035 "scvtf v1.4s, v1.4s\n"
4036 "scvtf v2.4s, v2.4s\n"
4037 "scvtf v3.4s, v3.4s\n"
4038 "fsub v0.4s, v0.4s, v5.4s\n"
4039 "fsub v1.4s, v1.4s, v5.4s\n"
4040 "fsub v2.4s, v2.4s, v5.4s\n"
4041 "fsub v3.4s, v3.4s, v5.4s\n"
4042 "fmul v0.4s, v0.4s, v6.4s\n"
4043 "fmul v1.4s, v1.4s, v6.4s\n"
4044 "fmul v2.4s, v2.4s, v6.4s\n"
4045 "fmul v3.4s, v3.4s, v6.4s\n"
4046 "fadd v0.4s, v0.4s, v4.4s\n"
4047 "fadd v1.4s, v1.4s, v4.4s\n"
4048 "fadd v2.4s, v2.4s, v4.4s\n"
4049 "fadd v3.4s, v3.4s, v4.4s\n"
4050
4051 "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
4052 "prfm pldl1keep, [%x[output]]\n"
4053
4054 "bne 1b\n"
4055 "2:"
4056
4057 // Handle leftovers.
4058
4059 // Dequantize::Transform
4060 "ld1 {v0.2s}, [%x[input]], #8\n"
4061 "ld1 {v0.h}[4], [%x[input]], #2\n"
4062 "ld1 {v0.b}[10], [%x[input]], #1\n"
4063 "prfm pldl1keep, [%x[input], #32]\n"
4064 "uxtl2 v1.8h, v0.16b\n"
4065 "uxtl v0.8h, v0.8b\n"
4066 "sxtl v2.4s, v1.4h\n"
4067 "sxtl2 v1.4s, v0.8h\n"
4068 "sxtl v0.4s, v0.4h\n"
4069 "scvtf v0.4s, v0.4s\n"
4070 "scvtf v1.4s, v1.4s\n"
4071 "scvtf v2.4s, v2.4s\n"
4072 "fsub v0.4s, v0.4s, v5.4s\n"
4073 "fsub v1.4s, v1.4s, v5.4s\n"
4074 "fsub v2.4s, v2.4s, v5.4s\n"
4075 "fmul v0.4s, v0.4s, v6.4s\n"
4076 "fmul v1.4s, v1.4s, v6.4s\n"
4077 "fmul v2.4s, v2.4s, v6.4s\n"
4078 "fadd v0.4s, v0.4s, v4.4s\n"
4079 "fadd v1.4s, v1.4s, v4.4s\n"
4080 "fadd v2.4s, v2.4s, v4.4s\n"
4081
4082 "st1 {v0.4s, v1.4s}, [%x[output]], #32\n"
4083 "st1 {v2.2s}, [%x[output]], #8\n"
4084 "st1 {v2.s}[2], [%x[output]], #4\n"
4085 "prfm pldl1keep, [%x[output]]\n"
4086 : [count] "+r"(params_count_copy), [input] "+r"(input),
4087 [output] "+r"(output)
4088 : [range_offset] "r"(params.range_offset),
4089 [range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
4090 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
4091 }
4092
4093 template <>
Transform(const uint8_t * input,const Dequantize & params,float * output)4094 inline void Transform1DKernel<uint8_t, float, Dequantize, 16, 12>::Transform(
4095 const uint8_t* input, const Dequantize& params, float* output) {
4096 #ifdef DEBUG
4097 #ifdef DEBUG_METAGEMM_VERBOSE
4098 std::cout << __FILE__ << "(" << __LINE__
4099 << ") Dequantize<uint8_t, float, Dequantize, 16, 12>::Transform()"
4100 << std::endl
4101 << std::flush;
4102 #endif
4103 #endif
4104 int params_count_copy = params.count;
4105 asm volatile(
4106
4107 // Dequantize::Prepare
4108 "dup v4.4s, %w[range_min]\n"
4109 "dup v5.4s, %w[range_offset]\n"
4110 "dup v6.4s, %w[range_scale]\n"
4111
4112 // Reduce count by leftovers.
4113 "subs %x[count], %x[count], #12\n"
4114 "beq 2f\n"
4115
4116 "1:"
4117 "subs %x[count], %x[count], #16\n"
4118
4119 // Dequantize::Transform
4120 "ld1 {v0.4s}, [%x[input]], #16\n"
4121 "prfm pldl1keep, [%x[input], #32]\n"
4122 "uxtl2 v1.8h, v0.16b\n"
4123 "uxtl v0.8h, v0.8b\n"
4124 "sxtl2 v3.4s, v1.8h\n"
4125 "sxtl v2.4s, v1.4h\n"
4126 "sxtl2 v1.4s, v0.8h\n"
4127 "sxtl v0.4s, v0.4h\n"
4128 "scvtf v0.4s, v0.4s\n"
4129 "scvtf v1.4s, v1.4s\n"
4130 "scvtf v2.4s, v2.4s\n"
4131 "scvtf v3.4s, v3.4s\n"
4132 "fsub v0.4s, v0.4s, v5.4s\n"
4133 "fsub v1.4s, v1.4s, v5.4s\n"
4134 "fsub v2.4s, v2.4s, v5.4s\n"
4135 "fsub v3.4s, v3.4s, v5.4s\n"
4136 "fmul v0.4s, v0.4s, v6.4s\n"
4137 "fmul v1.4s, v1.4s, v6.4s\n"
4138 "fmul v2.4s, v2.4s, v6.4s\n"
4139 "fmul v3.4s, v3.4s, v6.4s\n"
4140 "fadd v0.4s, v0.4s, v4.4s\n"
4141 "fadd v1.4s, v1.4s, v4.4s\n"
4142 "fadd v2.4s, v2.4s, v4.4s\n"
4143 "fadd v3.4s, v3.4s, v4.4s\n"
4144
4145 "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
4146 "prfm pldl1keep, [%x[output]]\n"
4147
4148 "bne 1b\n"
4149 "2:"
4150
4151 // Handle leftovers.
4152
4153 // Dequantize::Transform
4154 "ld1 {v0.2s}, [%x[input]], #8\n"
4155 "ld1 {v0.s}[2], [%x[input]], #4\n"
4156 "prfm pldl1keep, [%x[input], #32]\n"
4157 "uxtl2 v1.8h, v0.16b\n"
4158 "uxtl v0.8h, v0.8b\n"
4159 "sxtl v2.4s, v1.4h\n"
4160 "sxtl2 v1.4s, v0.8h\n"
4161 "sxtl v0.4s, v0.4h\n"
4162 "scvtf v0.4s, v0.4s\n"
4163 "scvtf v1.4s, v1.4s\n"
4164 "scvtf v2.4s, v2.4s\n"
4165 "fsub v0.4s, v0.4s, v5.4s\n"
4166 "fsub v1.4s, v1.4s, v5.4s\n"
4167 "fsub v2.4s, v2.4s, v5.4s\n"
4168 "fmul v0.4s, v0.4s, v6.4s\n"
4169 "fmul v1.4s, v1.4s, v6.4s\n"
4170 "fmul v2.4s, v2.4s, v6.4s\n"
4171 "fadd v0.4s, v0.4s, v4.4s\n"
4172 "fadd v1.4s, v1.4s, v4.4s\n"
4173 "fadd v2.4s, v2.4s, v4.4s\n"
4174
4175 "st1 {v0.4s, v1.4s, v2.4s}, [%x[output]], #48\n"
4176 "prfm pldl1keep, [%x[output]]\n"
4177 : [count] "+r"(params_count_copy), [input] "+r"(input),
4178 [output] "+r"(output)
4179 : [range_offset] "r"(params.range_offset),
4180 [range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
4181 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
4182 }
4183
4184 template <>
Transform(const uint8_t * input,const Dequantize & params,float * output)4185 inline void Transform1DKernel<uint8_t, float, Dequantize, 16, 13>::Transform(
4186 const uint8_t* input, const Dequantize& params, float* output) {
4187 #ifdef DEBUG
4188 #ifdef DEBUG_METAGEMM_VERBOSE
4189 std::cout << __FILE__ << "(" << __LINE__
4190 << ") Dequantize<uint8_t, float, Dequantize, 16, 13>::Transform()"
4191 << std::endl
4192 << std::flush;
4193 #endif
4194 #endif
4195 int params_count_copy = params.count;
4196 asm volatile(
4197
4198 // Dequantize::Prepare
4199 "dup v4.4s, %w[range_min]\n"
4200 "dup v5.4s, %w[range_offset]\n"
4201 "dup v6.4s, %w[range_scale]\n"
4202
4203 // Reduce count by leftovers.
4204 "subs %x[count], %x[count], #13\n"
4205 "beq 2f\n"
4206
4207 "1:"
4208 "subs %x[count], %x[count], #16\n"
4209
4210 // Dequantize::Transform
4211 "ld1 {v0.4s}, [%x[input]], #16\n"
4212 "prfm pldl1keep, [%x[input], #32]\n"
4213 "uxtl2 v1.8h, v0.16b\n"
4214 "uxtl v0.8h, v0.8b\n"
4215 "sxtl2 v3.4s, v1.8h\n"
4216 "sxtl v2.4s, v1.4h\n"
4217 "sxtl2 v1.4s, v0.8h\n"
4218 "sxtl v0.4s, v0.4h\n"
4219 "scvtf v0.4s, v0.4s\n"
4220 "scvtf v1.4s, v1.4s\n"
4221 "scvtf v2.4s, v2.4s\n"
4222 "scvtf v3.4s, v3.4s\n"
4223 "fsub v0.4s, v0.4s, v5.4s\n"
4224 "fsub v1.4s, v1.4s, v5.4s\n"
4225 "fsub v2.4s, v2.4s, v5.4s\n"
4226 "fsub v3.4s, v3.4s, v5.4s\n"
4227 "fmul v0.4s, v0.4s, v6.4s\n"
4228 "fmul v1.4s, v1.4s, v6.4s\n"
4229 "fmul v2.4s, v2.4s, v6.4s\n"
4230 "fmul v3.4s, v3.4s, v6.4s\n"
4231 "fadd v0.4s, v0.4s, v4.4s\n"
4232 "fadd v1.4s, v1.4s, v4.4s\n"
4233 "fadd v2.4s, v2.4s, v4.4s\n"
4234 "fadd v3.4s, v3.4s, v4.4s\n"
4235
4236 "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
4237 "prfm pldl1keep, [%x[output]]\n"
4238
4239 "bne 1b\n"
4240 "2:"
4241
4242 // Handle leftovers.
4243
4244 // Dequantize::Transform
4245 "ld1 {v0.2s}, [%x[input]], #8\n"
4246 "ld1 {v0.s}[2], [%x[input]], #4\n"
4247 "ld1 {v0.b}[12], [%x[input]], #1\n"
4248 "prfm pldl1keep, [%x[input], #32]\n"
4249 "uxtl2 v1.8h, v0.16b\n"
4250 "uxtl v0.8h, v0.8b\n"
4251 "sxtl2 v3.4s, v1.8h\n"
4252 "sxtl v2.4s, v1.4h\n"
4253 "sxtl2 v1.4s, v0.8h\n"
4254 "sxtl v0.4s, v0.4h\n"
4255 "scvtf v0.4s, v0.4s\n"
4256 "scvtf v1.4s, v1.4s\n"
4257 "scvtf v2.4s, v2.4s\n"
4258 "scvtf v3.4s, v3.4s\n"
4259 "fsub v0.4s, v0.4s, v5.4s\n"
4260 "fsub v1.4s, v1.4s, v5.4s\n"
4261 "fsub v2.4s, v2.4s, v5.4s\n"
4262 "fsub v3.4s, v3.4s, v5.4s\n"
4263 "fmul v0.4s, v0.4s, v6.4s\n"
4264 "fmul v1.4s, v1.4s, v6.4s\n"
4265 "fmul v2.4s, v2.4s, v6.4s\n"
4266 "fmul v3.4s, v3.4s, v6.4s\n"
4267 "fadd v0.4s, v0.4s, v4.4s\n"
4268 "fadd v1.4s, v1.4s, v4.4s\n"
4269 "fadd v2.4s, v2.4s, v4.4s\n"
4270 "fadd v3.4s, v3.4s, v4.4s\n"
4271
4272 "st1 {v0.4s, v1.4s, v2.4s}, [%x[output]], #48\n"
4273 "st1 {v3.s}[0], [%x[output]], #4\n"
4274 "prfm pldl1keep, [%x[output]]\n"
4275 : [count] "+r"(params_count_copy), [input] "+r"(input),
4276 [output] "+r"(output)
4277 : [range_offset] "r"(params.range_offset),
4278 [range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
4279 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
4280 }
4281
4282 template <>
Transform(const uint8_t * input,const Dequantize & params,float * output)4283 inline void Transform1DKernel<uint8_t, float, Dequantize, 16, 14>::Transform(
4284 const uint8_t* input, const Dequantize& params, float* output) {
4285 #ifdef DEBUG
4286 #ifdef DEBUG_METAGEMM_VERBOSE
4287 std::cout << __FILE__ << "(" << __LINE__
4288 << ") Dequantize<uint8_t, float, Dequantize, 16, 14>::Transform()"
4289 << std::endl
4290 << std::flush;
4291 #endif
4292 #endif
4293 int params_count_copy = params.count;
4294 asm volatile(
4295
4296 // Dequantize::Prepare
4297 "dup v4.4s, %w[range_min]\n"
4298 "dup v5.4s, %w[range_offset]\n"
4299 "dup v6.4s, %w[range_scale]\n"
4300
4301 // Reduce count by leftovers.
4302 "subs %x[count], %x[count], #14\n"
4303 "beq 2f\n"
4304
4305 "1:"
4306 "subs %x[count], %x[count], #16\n"
4307
4308 // Dequantize::Transform
4309 "ld1 {v0.4s}, [%x[input]], #16\n"
4310 "prfm pldl1keep, [%x[input], #32]\n"
4311 "uxtl2 v1.8h, v0.16b\n"
4312 "uxtl v0.8h, v0.8b\n"
4313 "sxtl2 v3.4s, v1.8h\n"
4314 "sxtl v2.4s, v1.4h\n"
4315 "sxtl2 v1.4s, v0.8h\n"
4316 "sxtl v0.4s, v0.4h\n"
4317 "scvtf v0.4s, v0.4s\n"
4318 "scvtf v1.4s, v1.4s\n"
4319 "scvtf v2.4s, v2.4s\n"
4320 "scvtf v3.4s, v3.4s\n"
4321 "fsub v0.4s, v0.4s, v5.4s\n"
4322 "fsub v1.4s, v1.4s, v5.4s\n"
4323 "fsub v2.4s, v2.4s, v5.4s\n"
4324 "fsub v3.4s, v3.4s, v5.4s\n"
4325 "fmul v0.4s, v0.4s, v6.4s\n"
4326 "fmul v1.4s, v1.4s, v6.4s\n"
4327 "fmul v2.4s, v2.4s, v6.4s\n"
4328 "fmul v3.4s, v3.4s, v6.4s\n"
4329 "fadd v0.4s, v0.4s, v4.4s\n"
4330 "fadd v1.4s, v1.4s, v4.4s\n"
4331 "fadd v2.4s, v2.4s, v4.4s\n"
4332 "fadd v3.4s, v3.4s, v4.4s\n"
4333
4334 "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
4335 "prfm pldl1keep, [%x[output]]\n"
4336
4337 "bne 1b\n"
4338 "2:"
4339
4340 // Handle leftovers.
4341
4342 // Dequantize::Transform
4343 "ld1 {v0.2s}, [%x[input]], #8\n"
4344 "ld1 {v0.s}[2], [%x[input]], #4\n"
4345 "ld1 {v0.h}[6], [%x[input]], #2\n"
4346 "prfm pldl1keep, [%x[input], #32]\n"
4347 "uxtl2 v1.8h, v0.16b\n"
4348 "uxtl v0.8h, v0.8b\n"
4349 "sxtl2 v3.4s, v1.8h\n"
4350 "sxtl v2.4s, v1.4h\n"
4351 "sxtl2 v1.4s, v0.8h\n"
4352 "sxtl v0.4s, v0.4h\n"
4353 "scvtf v0.4s, v0.4s\n"
4354 "scvtf v1.4s, v1.4s\n"
4355 "scvtf v2.4s, v2.4s\n"
4356 "scvtf v3.4s, v3.4s\n"
4357 "fsub v0.4s, v0.4s, v5.4s\n"
4358 "fsub v1.4s, v1.4s, v5.4s\n"
4359 "fsub v2.4s, v2.4s, v5.4s\n"
4360 "fsub v3.4s, v3.4s, v5.4s\n"
4361 "fmul v0.4s, v0.4s, v6.4s\n"
4362 "fmul v1.4s, v1.4s, v6.4s\n"
4363 "fmul v2.4s, v2.4s, v6.4s\n"
4364 "fmul v3.4s, v3.4s, v6.4s\n"
4365 "fadd v0.4s, v0.4s, v4.4s\n"
4366 "fadd v1.4s, v1.4s, v4.4s\n"
4367 "fadd v2.4s, v2.4s, v4.4s\n"
4368 "fadd v3.4s, v3.4s, v4.4s\n"
4369
4370 "st1 {v0.4s, v1.4s, v2.4s}, [%x[output]], #48\n"
4371 "st1 {v3.2s}, [%x[output]], #8\n"
4372 "prfm pldl1keep, [%x[output]]\n"
4373 : [count] "+r"(params_count_copy), [input] "+r"(input),
4374 [output] "+r"(output)
4375 : [range_offset] "r"(params.range_offset),
4376 [range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
4377 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
4378 }
4379
4380 template <>
Transform(const uint8_t * input,const Dequantize & params,float * output)4381 inline void Transform1DKernel<uint8_t, float, Dequantize, 16, 15>::Transform(
4382 const uint8_t* input, const Dequantize& params, float* output) {
4383 #ifdef DEBUG
4384 #ifdef DEBUG_METAGEMM_VERBOSE
4385 std::cout << __FILE__ << "(" << __LINE__
4386 << ") Dequantize<uint8_t, float, Dequantize, 16, 15>::Transform()"
4387 << std::endl
4388 << std::flush;
4389 #endif
4390 #endif
4391 int params_count_copy = params.count;
4392 asm volatile(
4393
4394 // Dequantize::Prepare
4395 "dup v4.4s, %w[range_min]\n"
4396 "dup v5.4s, %w[range_offset]\n"
4397 "dup v6.4s, %w[range_scale]\n"
4398
4399 // Reduce count by leftovers.
4400 "subs %x[count], %x[count], #15\n"
4401 "beq 2f\n"
4402
4403 "1:"
4404 "subs %x[count], %x[count], #16\n"
4405
4406 // Dequantize::Transform
4407 "ld1 {v0.4s}, [%x[input]], #16\n"
4408 "prfm pldl1keep, [%x[input], #32]\n"
4409 "uxtl2 v1.8h, v0.16b\n"
4410 "uxtl v0.8h, v0.8b\n"
4411 "sxtl2 v3.4s, v1.8h\n"
4412 "sxtl v2.4s, v1.4h\n"
4413 "sxtl2 v1.4s, v0.8h\n"
4414 "sxtl v0.4s, v0.4h\n"
4415 "scvtf v0.4s, v0.4s\n"
4416 "scvtf v1.4s, v1.4s\n"
4417 "scvtf v2.4s, v2.4s\n"
4418 "scvtf v3.4s, v3.4s\n"
4419 "fsub v0.4s, v0.4s, v5.4s\n"
4420 "fsub v1.4s, v1.4s, v5.4s\n"
4421 "fsub v2.4s, v2.4s, v5.4s\n"
4422 "fsub v3.4s, v3.4s, v5.4s\n"
4423 "fmul v0.4s, v0.4s, v6.4s\n"
4424 "fmul v1.4s, v1.4s, v6.4s\n"
4425 "fmul v2.4s, v2.4s, v6.4s\n"
4426 "fmul v3.4s, v3.4s, v6.4s\n"
4427 "fadd v0.4s, v0.4s, v4.4s\n"
4428 "fadd v1.4s, v1.4s, v4.4s\n"
4429 "fadd v2.4s, v2.4s, v4.4s\n"
4430 "fadd v3.4s, v3.4s, v4.4s\n"
4431
4432 "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
4433 "prfm pldl1keep, [%x[output]]\n"
4434
4435 "bne 1b\n"
4436 "2:"
4437
4438 // Handle leftovers.
4439
4440 // Dequantize::Transform
4441 "ld1 {v0.2s}, [%x[input]], #8\n"
4442 "ld1 {v0.s}[2], [%x[input]], #4\n"
4443 "ld1 {v0.h}[6], [%x[input]], #2\n"
4444 "ld1 {v0.b}[14], [%x[input]], #1\n"
4445 "prfm pldl1keep, [%x[input], #32]\n"
4446 "uxtl2 v1.8h, v0.16b\n"
4447 "uxtl v0.8h, v0.8b\n"
4448 "sxtl2 v3.4s, v1.8h\n"
4449 "sxtl v2.4s, v1.4h\n"
4450 "sxtl2 v1.4s, v0.8h\n"
4451 "sxtl v0.4s, v0.4h\n"
4452 "scvtf v0.4s, v0.4s\n"
4453 "scvtf v1.4s, v1.4s\n"
4454 "scvtf v2.4s, v2.4s\n"
4455 "scvtf v3.4s, v3.4s\n"
4456 "fsub v0.4s, v0.4s, v5.4s\n"
4457 "fsub v1.4s, v1.4s, v5.4s\n"
4458 "fsub v2.4s, v2.4s, v5.4s\n"
4459 "fsub v3.4s, v3.4s, v5.4s\n"
4460 "fmul v0.4s, v0.4s, v6.4s\n"
4461 "fmul v1.4s, v1.4s, v6.4s\n"
4462 "fmul v2.4s, v2.4s, v6.4s\n"
4463 "fmul v3.4s, v3.4s, v6.4s\n"
4464 "fadd v0.4s, v0.4s, v4.4s\n"
4465 "fadd v1.4s, v1.4s, v4.4s\n"
4466 "fadd v2.4s, v2.4s, v4.4s\n"
4467 "fadd v3.4s, v3.4s, v4.4s\n"
4468
4469 "st1 {v0.4s, v1.4s, v2.4s}, [%x[output]], #48\n"
4470 "st1 {v3.2s}, [%x[output]], #8\n"
4471 "st1 {v3.s}[2], [%x[output]], #4\n"
4472 "prfm pldl1keep, [%x[output]]\n"
4473 : [count] "+r"(params_count_copy), [input] "+r"(input),
4474 [output] "+r"(output)
4475 : [range_offset] "r"(params.range_offset),
4476 [range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
4477 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
4478 }
4479
4480 template <>
4481 inline void Transform1DKernel<uint8_t, uint8_t, MinMax<uint8_t>, 16,
Transform(const uint8_t * input,const MinMax<uint8_t> & params,uint8_t * output)4482 0>::Transform(const uint8_t* input,
4483 const MinMax<uint8_t>& params,
4484 uint8_t* output) {
4485 #ifdef DEBUG
4486 #ifdef DEBUG_METAGEMM_VERBOSE
4487 std::cout << __FILE__ << "(" << __LINE__
4488 << ") MinMax<uint8_t><uint8_t, uint8_t, MinMax<uint8_t>, 16, "
4489 "0>::Transform()"
4490 << std::endl
4491 << std::flush;
4492 #endif
4493 #endif
4494 int params_count_copy = params.count;
4495 asm volatile(
4496
4497 // MinMax::Prepare
4498 "dup v4.16b, %w[min]\n"
4499 "dup v5.16b, %w[max]\n"
4500
4501 "1:"
4502 "subs %x[count], %x[count], #16\n"
4503
4504 // MinMax::Transform
4505 "ld1 {v0.4s}, [%x[input]], #16\n"
4506 "prfm pldl1keep, [%x[input], #16]\n"
4507 "umax v0.16b, v0.16b, v4.16b\n"
4508 "umin v0.16b, v0.16b, v5.16b\n"
4509
4510 "st1 {v0.4s}, [%x[output]], #16\n"
4511 "prfm pldl1keep, [%x[output]]\n"
4512
4513 "bne 1b\n"
4514 : [count] "+r"(params_count_copy), [input] "+r"(input),
4515 [output] "+r"(output)
4516 : [max] "r"(params.max), [min] "r"(params.min)
4517 : "v0", "v4", "v5", "cc", "memory");
4518 }
4519
4520 template <>
4521 inline void Transform1DKernel<uint8_t, uint8_t, MinMax<uint8_t>, 16,
Transform(const uint8_t * input,const MinMax<uint8_t> & params,uint8_t * output)4522 1>::Transform(const uint8_t* input,
4523 const MinMax<uint8_t>& params,
4524 uint8_t* output) {
4525 #ifdef DEBUG
4526 #ifdef DEBUG_METAGEMM_VERBOSE
4527 std::cout << __FILE__ << "(" << __LINE__
4528 << ") MinMax<uint8_t><uint8_t, uint8_t, MinMax<uint8_t>, 16, "
4529 "1>::Transform()"
4530 << std::endl
4531 << std::flush;
4532 #endif
4533 #endif
4534 int params_count_copy = params.count;
4535 asm volatile(
4536
4537 // MinMax::Prepare
4538 "dup v4.16b, %w[min]\n"
4539 "dup v5.16b, %w[max]\n"
4540
4541 // Reduce count by leftovers.
4542 "subs %x[count], %x[count], #1\n"
4543 "beq 2f\n"
4544
4545 "1:"
4546 "subs %x[count], %x[count], #16\n"
4547
4548 // MinMax::Transform
4549 "ld1 {v0.4s}, [%x[input]], #16\n"
4550 "prfm pldl1keep, [%x[input], #16]\n"
4551 "umax v0.16b, v0.16b, v4.16b\n"
4552 "umin v0.16b, v0.16b, v5.16b\n"
4553
4554 "st1 {v0.4s}, [%x[output]], #16\n"
4555 "prfm pldl1keep, [%x[output]]\n"
4556
4557 "bne 1b\n"
4558 "2:"
4559
4560 // Handle leftovers.
4561
4562 // MinMax::Transform
4563 "ld1 {v0.b}[0], [%x[input]], #1\n"
4564 "prfm pldl1keep, [%x[input], #16]\n"
4565 "umax v0.16b, v0.16b, v4.16b\n"
4566 "umin v0.16b, v0.16b, v5.16b\n"
4567
4568 "st1 {v0.b}[0], [%x[output]], #1\n"
4569 "prfm pldl1keep, [%x[output]]\n"
4570 : [count] "+r"(params_count_copy), [input] "+r"(input),
4571 [output] "+r"(output)
4572 : [max] "r"(params.max), [min] "r"(params.min)
4573 : "v0", "v4", "v5", "cc", "memory");
4574 }
4575
4576 template <>
4577 inline void Transform1DKernel<uint8_t, uint8_t, MinMax<uint8_t>, 16,
Transform(const uint8_t * input,const MinMax<uint8_t> & params,uint8_t * output)4578 2>::Transform(const uint8_t* input,
4579 const MinMax<uint8_t>& params,
4580 uint8_t* output) {
4581 #ifdef DEBUG
4582 #ifdef DEBUG_METAGEMM_VERBOSE
4583 std::cout << __FILE__ << "(" << __LINE__
4584 << ") MinMax<uint8_t><uint8_t, uint8_t, MinMax<uint8_t>, 16, "
4585 "2>::Transform()"
4586 << std::endl
4587 << std::flush;
4588 #endif
4589 #endif
4590 int params_count_copy = params.count;
4591 asm volatile(
4592
4593 // MinMax::Prepare
4594 "dup v4.16b, %w[min]\n"
4595 "dup v5.16b, %w[max]\n"
4596
4597 // Reduce count by leftovers.
4598 "subs %x[count], %x[count], #2\n"
4599 "beq 2f\n"
4600
4601 "1:"
4602 "subs %x[count], %x[count], #16\n"
4603
4604 // MinMax::Transform
4605 "ld1 {v0.4s}, [%x[input]], #16\n"
4606 "prfm pldl1keep, [%x[input], #16]\n"
4607 "umax v0.16b, v0.16b, v4.16b\n"
4608 "umin v0.16b, v0.16b, v5.16b\n"
4609
4610 "st1 {v0.4s}, [%x[output]], #16\n"
4611 "prfm pldl1keep, [%x[output]]\n"
4612
4613 "bne 1b\n"
4614 "2:"
4615
4616 // Handle leftovers.
4617
4618 // MinMax::Transform
4619 "ld1 {v0.h}[0], [%x[input]], #2\n"
4620 "prfm pldl1keep, [%x[input], #16]\n"
4621 "umax v0.16b, v0.16b, v4.16b\n"
4622 "umin v0.16b, v0.16b, v5.16b\n"
4623
4624 "st1 {v0.h}[0], [%x[output]], #2\n"
4625 "prfm pldl1keep, [%x[output]]\n"
4626 : [count] "+r"(params_count_copy), [input] "+r"(input),
4627 [output] "+r"(output)
4628 : [max] "r"(params.max), [min] "r"(params.min)
4629 : "v0", "v4", "v5", "cc", "memory");
4630 }
4631
4632 template <>
4633 inline void Transform1DKernel<uint8_t, uint8_t, MinMax<uint8_t>, 16,
Transform(const uint8_t * input,const MinMax<uint8_t> & params,uint8_t * output)4634 3>::Transform(const uint8_t* input,
4635 const MinMax<uint8_t>& params,
4636 uint8_t* output) {
4637 #ifdef DEBUG
4638 #ifdef DEBUG_METAGEMM_VERBOSE
4639 std::cout << __FILE__ << "(" << __LINE__
4640 << ") MinMax<uint8_t><uint8_t, uint8_t, MinMax<uint8_t>, 16, "
4641 "3>::Transform()"
4642 << std::endl
4643 << std::flush;
4644 #endif
4645 #endif
4646 int params_count_copy = params.count;
4647 asm volatile(
4648
4649 // MinMax::Prepare
4650 "dup v4.16b, %w[min]\n"
4651 "dup v5.16b, %w[max]\n"
4652
4653 // Reduce count by leftovers.
4654 "subs %x[count], %x[count], #3\n"
4655 "beq 2f\n"
4656
4657 "1:"
4658 "subs %x[count], %x[count], #16\n"
4659
4660 // MinMax::Transform
4661 "ld1 {v0.4s}, [%x[input]], #16\n"
4662 "prfm pldl1keep, [%x[input], #16]\n"
4663 "umax v0.16b, v0.16b, v4.16b\n"
4664 "umin v0.16b, v0.16b, v5.16b\n"
4665
4666 "st1 {v0.4s}, [%x[output]], #16\n"
4667 "prfm pldl1keep, [%x[output]]\n"
4668
4669 "bne 1b\n"
4670 "2:"
4671
4672 // Handle leftovers.
4673
4674 // MinMax::Transform
4675 "ld1 {v0.h}[0], [%x[input]], #2\n"
4676 "ld1 {v0.b}[2], [%x[input]], #1\n"
4677 "prfm pldl1keep, [%x[input], #16]\n"
4678 "umax v0.16b, v0.16b, v4.16b\n"
4679 "umin v0.16b, v0.16b, v5.16b\n"
4680
4681 "st1 {v0.h}[0], [%x[output]], #2\n"
4682 "st1 {v0.b}[2], [%x[output]], #1\n"
4683 "prfm pldl1keep, [%x[output]]\n"
4684 : [count] "+r"(params_count_copy), [input] "+r"(input),
4685 [output] "+r"(output)
4686 : [max] "r"(params.max), [min] "r"(params.min)
4687 : "v0", "v4", "v5", "cc", "memory");
4688 }
4689
4690 template <>
4691 inline void Transform1DKernel<uint8_t, uint8_t, MinMax<uint8_t>, 16,
Transform(const uint8_t * input,const MinMax<uint8_t> & params,uint8_t * output)4692 4>::Transform(const uint8_t* input,
4693 const MinMax<uint8_t>& params,
4694 uint8_t* output) {
4695 #ifdef DEBUG
4696 #ifdef DEBUG_METAGEMM_VERBOSE
4697 std::cout << __FILE__ << "(" << __LINE__
4698 << ") MinMax<uint8_t><uint8_t, uint8_t, MinMax<uint8_t>, 16, "
4699 "4>::Transform()"
4700 << std::endl
4701 << std::flush;
4702 #endif
4703 #endif
4704 int params_count_copy = params.count;
4705 asm volatile(
4706
4707 // MinMax::Prepare
4708 "dup v4.16b, %w[min]\n"
4709 "dup v5.16b, %w[max]\n"
4710
4711 // Reduce count by leftovers.
4712 "subs %x[count], %x[count], #4\n"
4713 "beq 2f\n"
4714
4715 "1:"
4716 "subs %x[count], %x[count], #16\n"
4717
4718 // MinMax::Transform
4719 "ld1 {v0.4s}, [%x[input]], #16\n"
4720 "prfm pldl1keep, [%x[input], #16]\n"
4721 "umax v0.16b, v0.16b, v4.16b\n"
4722 "umin v0.16b, v0.16b, v5.16b\n"
4723
4724 "st1 {v0.4s}, [%x[output]], #16\n"
4725 "prfm pldl1keep, [%x[output]]\n"
4726
4727 "bne 1b\n"
4728 "2:"
4729
4730 // Handle leftovers.
4731
4732 // MinMax::Transform
4733 "ld1 {v0.s}[0], [%x[input]], #4\n"
4734 "prfm pldl1keep, [%x[input], #16]\n"
4735 "umax v0.16b, v0.16b, v4.16b\n"
4736 "umin v0.16b, v0.16b, v5.16b\n"
4737
4738 "st1 {v0.s}[0], [%x[output]], #4\n"
4739 "prfm pldl1keep, [%x[output]]\n"
4740 : [count] "+r"(params_count_copy), [input] "+r"(input),
4741 [output] "+r"(output)
4742 : [max] "r"(params.max), [min] "r"(params.min)
4743 : "v0", "v4", "v5", "cc", "memory");
4744 }
4745
4746 template <>
4747 inline void Transform1DKernel<uint8_t, uint8_t, MinMax<uint8_t>, 16,
Transform(const uint8_t * input,const MinMax<uint8_t> & params,uint8_t * output)4748 5>::Transform(const uint8_t* input,
4749 const MinMax<uint8_t>& params,
4750 uint8_t* output) {
4751 #ifdef DEBUG
4752 #ifdef DEBUG_METAGEMM_VERBOSE
4753 std::cout << __FILE__ << "(" << __LINE__
4754 << ") MinMax<uint8_t><uint8_t, uint8_t, MinMax<uint8_t>, 16, "
4755 "5>::Transform()"
4756 << std::endl
4757 << std::flush;
4758 #endif
4759 #endif
4760 int params_count_copy = params.count;
4761 asm volatile(
4762
4763 // MinMax::Prepare
4764 "dup v4.16b, %w[min]\n"
4765 "dup v5.16b, %w[max]\n"
4766
4767 // Reduce count by leftovers.
4768 "subs %x[count], %x[count], #5\n"
4769 "beq 2f\n"
4770
4771 "1:"
4772 "subs %x[count], %x[count], #16\n"
4773
4774 // MinMax::Transform
4775 "ld1 {v0.4s}, [%x[input]], #16\n"
4776 "prfm pldl1keep, [%x[input], #16]\n"
4777 "umax v0.16b, v0.16b, v4.16b\n"
4778 "umin v0.16b, v0.16b, v5.16b\n"
4779
4780 "st1 {v0.4s}, [%x[output]], #16\n"
4781 "prfm pldl1keep, [%x[output]]\n"
4782
4783 "bne 1b\n"
4784 "2:"
4785
4786 // Handle leftovers.
4787
4788 // MinMax::Transform
4789 "ld1 {v0.s}[0], [%x[input]], #4\n"
4790 "ld1 {v0.b}[4], [%x[input]], #1\n"
4791 "prfm pldl1keep, [%x[input], #16]\n"
4792 "umax v0.16b, v0.16b, v4.16b\n"
4793 "umin v0.16b, v0.16b, v5.16b\n"
4794
4795 "st1 {v0.s}[0], [%x[output]], #4\n"
4796 "st1 {v0.b}[4], [%x[output]], #1\n"
4797 "prfm pldl1keep, [%x[output]]\n"
4798 : [count] "+r"(params_count_copy), [input] "+r"(input),
4799 [output] "+r"(output)
4800 : [max] "r"(params.max), [min] "r"(params.min)
4801 : "v0", "v4", "v5", "cc", "memory");
4802 }
4803
4804 template <>
4805 inline void Transform1DKernel<uint8_t, uint8_t, MinMax<uint8_t>, 16,
Transform(const uint8_t * input,const MinMax<uint8_t> & params,uint8_t * output)4806 6>::Transform(const uint8_t* input,
4807 const MinMax<uint8_t>& params,
4808 uint8_t* output) {
4809 #ifdef DEBUG
4810 #ifdef DEBUG_METAGEMM_VERBOSE
4811 std::cout << __FILE__ << "(" << __LINE__
4812 << ") MinMax<uint8_t><uint8_t, uint8_t, MinMax<uint8_t>, 16, "
4813 "6>::Transform()"
4814 << std::endl
4815 << std::flush;
4816 #endif
4817 #endif
4818 int params_count_copy = params.count;
4819 asm volatile(
4820
4821 // MinMax::Prepare
4822 "dup v4.16b, %w[min]\n"
4823 "dup v5.16b, %w[max]\n"
4824
4825 // Reduce count by leftovers.
4826 "subs %x[count], %x[count], #6\n"
4827 "beq 2f\n"
4828
4829 "1:"
4830 "subs %x[count], %x[count], #16\n"
4831
4832 // MinMax::Transform
4833 "ld1 {v0.4s}, [%x[input]], #16\n"
4834 "prfm pldl1keep, [%x[input], #16]\n"
4835 "umax v0.16b, v0.16b, v4.16b\n"
4836 "umin v0.16b, v0.16b, v5.16b\n"
4837
4838 "st1 {v0.4s}, [%x[output]], #16\n"
4839 "prfm pldl1keep, [%x[output]]\n"
4840
4841 "bne 1b\n"
4842 "2:"
4843
4844 // Handle leftovers.
4845
4846 // MinMax::Transform
4847 "ld1 {v0.s}[0], [%x[input]], #4\n"
4848 "ld1 {v0.h}[2], [%x[input]], #2\n"
4849 "prfm pldl1keep, [%x[input], #16]\n"
4850 "umax v0.16b, v0.16b, v4.16b\n"
4851 "umin v0.16b, v0.16b, v5.16b\n"
4852
4853 "st1 {v0.s}[0], [%x[output]], #4\n"
4854 "st1 {v0.h}[2], [%x[output]], #2\n"
4855 "prfm pldl1keep, [%x[output]]\n"
4856 : [count] "+r"(params_count_copy), [input] "+r"(input),
4857 [output] "+r"(output)
4858 : [max] "r"(params.max), [min] "r"(params.min)
4859 : "v0", "v4", "v5", "cc", "memory");
4860 }
4861
4862 template <>
4863 inline void Transform1DKernel<uint8_t, uint8_t, MinMax<uint8_t>, 16,
Transform(const uint8_t * input,const MinMax<uint8_t> & params,uint8_t * output)4864 7>::Transform(const uint8_t* input,
4865 const MinMax<uint8_t>& params,
4866 uint8_t* output) {
4867 #ifdef DEBUG
4868 #ifdef DEBUG_METAGEMM_VERBOSE
4869 std::cout << __FILE__ << "(" << __LINE__
4870 << ") MinMax<uint8_t><uint8_t, uint8_t, MinMax<uint8_t>, 16, "
4871 "7>::Transform()"
4872 << std::endl
4873 << std::flush;
4874 #endif
4875 #endif
4876 int params_count_copy = params.count;
4877 asm volatile(
4878
4879 // MinMax::Prepare
4880 "dup v4.16b, %w[min]\n"
4881 "dup v5.16b, %w[max]\n"
4882
4883 // Reduce count by leftovers.
4884 "subs %x[count], %x[count], #7\n"
4885 "beq 2f\n"
4886
4887 "1:"
4888 "subs %x[count], %x[count], #16\n"
4889
4890 // MinMax::Transform
4891 "ld1 {v0.4s}, [%x[input]], #16\n"
4892 "prfm pldl1keep, [%x[input], #16]\n"
4893 "umax v0.16b, v0.16b, v4.16b\n"
4894 "umin v0.16b, v0.16b, v5.16b\n"
4895
4896 "st1 {v0.4s}, [%x[output]], #16\n"
4897 "prfm pldl1keep, [%x[output]]\n"
4898
4899 "bne 1b\n"
4900 "2:"
4901
4902 // Handle leftovers.
4903
4904 // MinMax::Transform
4905 "ld1 {v0.s}[0], [%x[input]], #4\n"
4906 "ld1 {v0.h}[2], [%x[input]], #2\n"
4907 "ld1 {v0.b}[6], [%x[input]], #1\n"
4908 "prfm pldl1keep, [%x[input], #16]\n"
4909 "umax v0.16b, v0.16b, v4.16b\n"
4910 "umin v0.16b, v0.16b, v5.16b\n"
4911
4912 "st1 {v0.s}[0], [%x[output]], #4\n"
4913 "st1 {v0.h}[2], [%x[output]], #2\n"
4914 "st1 {v0.b}[6], [%x[output]], #1\n"
4915 "prfm pldl1keep, [%x[output]]\n"
4916 : [count] "+r"(params_count_copy), [input] "+r"(input),
4917 [output] "+r"(output)
4918 : [max] "r"(params.max), [min] "r"(params.min)
4919 : "v0", "v4", "v5", "cc", "memory");
4920 }
4921
4922 template <>
4923 inline void Transform1DKernel<uint8_t, uint8_t, MinMax<uint8_t>, 16,
Transform(const uint8_t * input,const MinMax<uint8_t> & params,uint8_t * output)4924 8>::Transform(const uint8_t* input,
4925 const MinMax<uint8_t>& params,
4926 uint8_t* output) {
4927 #ifdef DEBUG
4928 #ifdef DEBUG_METAGEMM_VERBOSE
4929 std::cout << __FILE__ << "(" << __LINE__
4930 << ") MinMax<uint8_t><uint8_t, uint8_t, MinMax<uint8_t>, 16, "
4931 "8>::Transform()"
4932 << std::endl
4933 << std::flush;
4934 #endif
4935 #endif
4936 int params_count_copy = params.count;
4937 asm volatile(
4938
4939 // MinMax::Prepare
4940 "dup v4.16b, %w[min]\n"
4941 "dup v5.16b, %w[max]\n"
4942
4943 // Reduce count by leftovers.
4944 "subs %x[count], %x[count], #8\n"
4945 "beq 2f\n"
4946
4947 "1:"
4948 "subs %x[count], %x[count], #16\n"
4949
4950 // MinMax::Transform
4951 "ld1 {v0.4s}, [%x[input]], #16\n"
4952 "prfm pldl1keep, [%x[input], #16]\n"
4953 "umax v0.16b, v0.16b, v4.16b\n"
4954 "umin v0.16b, v0.16b, v5.16b\n"
4955
4956 "st1 {v0.4s}, [%x[output]], #16\n"
4957 "prfm pldl1keep, [%x[output]]\n"
4958
4959 "bne 1b\n"
4960 "2:"
4961
4962 // Handle leftovers.
4963
4964 // MinMax::Transform
4965 "ld1 {v0.2s}, [%x[input]], #8\n"
4966 "prfm pldl1keep, [%x[input], #16]\n"
4967 "umax v0.16b, v0.16b, v4.16b\n"
4968 "umin v0.16b, v0.16b, v5.16b\n"
4969
4970 "st1 {v0.2s}, [%x[output]], #8\n"
4971 "prfm pldl1keep, [%x[output]]\n"
4972 : [count] "+r"(params_count_copy), [input] "+r"(input),
4973 [output] "+r"(output)
4974 : [max] "r"(params.max), [min] "r"(params.min)
4975 : "v0", "v4", "v5", "cc", "memory");
4976 }
4977
4978 template <>
4979 inline void Transform1DKernel<uint8_t, uint8_t, MinMax<uint8_t>, 16,
Transform(const uint8_t * input,const MinMax<uint8_t> & params,uint8_t * output)4980 9>::Transform(const uint8_t* input,
4981 const MinMax<uint8_t>& params,
4982 uint8_t* output) {
4983 #ifdef DEBUG
4984 #ifdef DEBUG_METAGEMM_VERBOSE
4985 std::cout << __FILE__ << "(" << __LINE__
4986 << ") MinMax<uint8_t><uint8_t, uint8_t, MinMax<uint8_t>, 16, "
4987 "9>::Transform()"
4988 << std::endl
4989 << std::flush;
4990 #endif
4991 #endif
4992 int params_count_copy = params.count;
4993 asm volatile(
4994
4995 // MinMax::Prepare
4996 "dup v4.16b, %w[min]\n"
4997 "dup v5.16b, %w[max]\n"
4998
4999 // Reduce count by leftovers.
5000 "subs %x[count], %x[count], #9\n"
5001 "beq 2f\n"
5002
5003 "1:"
5004 "subs %x[count], %x[count], #16\n"
5005
5006 // MinMax::Transform
5007 "ld1 {v0.4s}, [%x[input]], #16\n"
5008 "prfm pldl1keep, [%x[input], #16]\n"
5009 "umax v0.16b, v0.16b, v4.16b\n"
5010 "umin v0.16b, v0.16b, v5.16b\n"
5011
5012 "st1 {v0.4s}, [%x[output]], #16\n"
5013 "prfm pldl1keep, [%x[output]]\n"
5014
5015 "bne 1b\n"
5016 "2:"
5017
5018 // Handle leftovers.
5019
5020 // MinMax::Transform
5021 "ld1 {v0.2s}, [%x[input]], #8\n"
5022 "ld1 {v0.b}[8], [%x[input]], #1\n"
5023 "prfm pldl1keep, [%x[input], #16]\n"
5024 "umax v0.16b, v0.16b, v4.16b\n"
5025 "umin v0.16b, v0.16b, v5.16b\n"
5026
5027 "st1 {v0.2s}, [%x[output]], #8\n"
5028 "st1 {v0.b}[8], [%x[output]], #1\n"
5029 "prfm pldl1keep, [%x[output]]\n"
5030 : [count] "+r"(params_count_copy), [input] "+r"(input),
5031 [output] "+r"(output)
5032 : [max] "r"(params.max), [min] "r"(params.min)
5033 : "v0", "v4", "v5", "cc", "memory");
5034 }
5035
5036 template <>
5037 inline void Transform1DKernel<uint8_t, uint8_t, MinMax<uint8_t>, 16,
Transform(const uint8_t * input,const MinMax<uint8_t> & params,uint8_t * output)5038 10>::Transform(const uint8_t* input,
5039 const MinMax<uint8_t>& params,
5040 uint8_t* output) {
5041 #ifdef DEBUG
5042 #ifdef DEBUG_METAGEMM_VERBOSE
5043 std::cout << __FILE__ << "(" << __LINE__
5044 << ") MinMax<uint8_t><uint8_t, uint8_t, MinMax<uint8_t>, 16, "
5045 "10>::Transform()"
5046 << std::endl
5047 << std::flush;
5048 #endif
5049 #endif
5050 int params_count_copy = params.count;
5051 asm volatile(
5052
5053 // MinMax::Prepare
5054 "dup v4.16b, %w[min]\n"
5055 "dup v5.16b, %w[max]\n"
5056
5057 // Reduce count by leftovers.
5058 "subs %x[count], %x[count], #10\n"
5059 "beq 2f\n"
5060
5061 "1:"
5062 "subs %x[count], %x[count], #16\n"
5063
5064 // MinMax::Transform
5065 "ld1 {v0.4s}, [%x[input]], #16\n"
5066 "prfm pldl1keep, [%x[input], #16]\n"
5067 "umax v0.16b, v0.16b, v4.16b\n"
5068 "umin v0.16b, v0.16b, v5.16b\n"
5069
5070 "st1 {v0.4s}, [%x[output]], #16\n"
5071 "prfm pldl1keep, [%x[output]]\n"
5072
5073 "bne 1b\n"
5074 "2:"
5075
5076 // Handle leftovers.
5077
5078 // MinMax::Transform
5079 "ld1 {v0.2s}, [%x[input]], #8\n"
5080 "ld1 {v0.h}[4], [%x[input]], #2\n"
5081 "prfm pldl1keep, [%x[input], #16]\n"
5082 "umax v0.16b, v0.16b, v4.16b\n"
5083 "umin v0.16b, v0.16b, v5.16b\n"
5084
5085 "st1 {v0.2s}, [%x[output]], #8\n"
5086 "st1 {v0.h}[4], [%x[output]], #2\n"
5087 "prfm pldl1keep, [%x[output]]\n"
5088 : [count] "+r"(params_count_copy), [input] "+r"(input),
5089 [output] "+r"(output)
5090 : [max] "r"(params.max), [min] "r"(params.min)
5091 : "v0", "v4", "v5", "cc", "memory");
5092 }
5093
5094 template <>
5095 inline void Transform1DKernel<uint8_t, uint8_t, MinMax<uint8_t>, 16,
Transform(const uint8_t * input,const MinMax<uint8_t> & params,uint8_t * output)5096 11>::Transform(const uint8_t* input,
5097 const MinMax<uint8_t>& params,
5098 uint8_t* output) {
5099 #ifdef DEBUG
5100 #ifdef DEBUG_METAGEMM_VERBOSE
5101 std::cout << __FILE__ << "(" << __LINE__
5102 << ") MinMax<uint8_t><uint8_t, uint8_t, MinMax<uint8_t>, 16, "
5103 "11>::Transform()"
5104 << std::endl
5105 << std::flush;
5106 #endif
5107 #endif
5108 int params_count_copy = params.count;
5109 asm volatile(
5110
5111 // MinMax::Prepare
5112 "dup v4.16b, %w[min]\n"
5113 "dup v5.16b, %w[max]\n"
5114
5115 // Reduce count by leftovers.
5116 "subs %x[count], %x[count], #11\n"
5117 "beq 2f\n"
5118
5119 "1:"
5120 "subs %x[count], %x[count], #16\n"
5121
5122 // MinMax::Transform
5123 "ld1 {v0.4s}, [%x[input]], #16\n"
5124 "prfm pldl1keep, [%x[input], #16]\n"
5125 "umax v0.16b, v0.16b, v4.16b\n"
5126 "umin v0.16b, v0.16b, v5.16b\n"
5127
5128 "st1 {v0.4s}, [%x[output]], #16\n"
5129 "prfm pldl1keep, [%x[output]]\n"
5130
5131 "bne 1b\n"
5132 "2:"
5133
5134 // Handle leftovers.
5135
5136 // MinMax::Transform
5137 "ld1 {v0.2s}, [%x[input]], #8\n"
5138 "ld1 {v0.h}[4], [%x[input]], #2\n"
5139 "ld1 {v0.b}[10], [%x[input]], #1\n"
5140 "prfm pldl1keep, [%x[input], #16]\n"
5141 "umax v0.16b, v0.16b, v4.16b\n"
5142 "umin v0.16b, v0.16b, v5.16b\n"
5143
5144 "st1 {v0.2s}, [%x[output]], #8\n"
5145 "st1 {v0.h}[4], [%x[output]], #2\n"
5146 "st1 {v0.b}[10], [%x[output]], #1\n"
5147 "prfm pldl1keep, [%x[output]]\n"
5148 : [count] "+r"(params_count_copy), [input] "+r"(input),
5149 [output] "+r"(output)
5150 : [max] "r"(params.max), [min] "r"(params.min)
5151 : "v0", "v4", "v5", "cc", "memory");
5152 }
5153
5154 template <>
5155 inline void Transform1DKernel<uint8_t, uint8_t, MinMax<uint8_t>, 16,
Transform(const uint8_t * input,const MinMax<uint8_t> & params,uint8_t * output)5156 12>::Transform(const uint8_t* input,
5157 const MinMax<uint8_t>& params,
5158 uint8_t* output) {
5159 #ifdef DEBUG
5160 #ifdef DEBUG_METAGEMM_VERBOSE
5161 std::cout << __FILE__ << "(" << __LINE__
5162 << ") MinMax<uint8_t><uint8_t, uint8_t, MinMax<uint8_t>, 16, "
5163 "12>::Transform()"
5164 << std::endl
5165 << std::flush;
5166 #endif
5167 #endif
5168 int params_count_copy = params.count;
5169 asm volatile(
5170
5171 // MinMax::Prepare
5172 "dup v4.16b, %w[min]\n"
5173 "dup v5.16b, %w[max]\n"
5174
5175 // Reduce count by leftovers.
5176 "subs %x[count], %x[count], #12\n"
5177 "beq 2f\n"
5178
5179 "1:"
5180 "subs %x[count], %x[count], #16\n"
5181
5182 // MinMax::Transform
5183 "ld1 {v0.4s}, [%x[input]], #16\n"
5184 "prfm pldl1keep, [%x[input], #16]\n"
5185 "umax v0.16b, v0.16b, v4.16b\n"
5186 "umin v0.16b, v0.16b, v5.16b\n"
5187
5188 "st1 {v0.4s}, [%x[output]], #16\n"
5189 "prfm pldl1keep, [%x[output]]\n"
5190
5191 "bne 1b\n"
5192 "2:"
5193
5194 // Handle leftovers.
5195
5196 // MinMax::Transform
5197 "ld1 {v0.2s}, [%x[input]], #8\n"
5198 "ld1 {v0.s}[2], [%x[input]], #4\n"
5199 "prfm pldl1keep, [%x[input], #16]\n"
5200 "umax v0.16b, v0.16b, v4.16b\n"
5201 "umin v0.16b, v0.16b, v5.16b\n"
5202
5203 "st1 {v0.2s}, [%x[output]], #8\n"
5204 "st1 {v0.s}[2], [%x[output]], #4\n"
5205 "prfm pldl1keep, [%x[output]]\n"
5206 : [count] "+r"(params_count_copy), [input] "+r"(input),
5207 [output] "+r"(output)
5208 : [max] "r"(params.max), [min] "r"(params.min)
5209 : "v0", "v4", "v5", "cc", "memory");
5210 }
5211
5212 template <>
5213 inline void Transform1DKernel<uint8_t, uint8_t, MinMax<uint8_t>, 16,
Transform(const uint8_t * input,const MinMax<uint8_t> & params,uint8_t * output)5214 13>::Transform(const uint8_t* input,
5215 const MinMax<uint8_t>& params,
5216 uint8_t* output) {
5217 #ifdef DEBUG
5218 #ifdef DEBUG_METAGEMM_VERBOSE
5219 std::cout << __FILE__ << "(" << __LINE__
5220 << ") MinMax<uint8_t><uint8_t, uint8_t, MinMax<uint8_t>, 16, "
5221 "13>::Transform()"
5222 << std::endl
5223 << std::flush;
5224 #endif
5225 #endif
5226 int params_count_copy = params.count;
5227 asm volatile(
5228
5229 // MinMax::Prepare
5230 "dup v4.16b, %w[min]\n"
5231 "dup v5.16b, %w[max]\n"
5232
5233 // Reduce count by leftovers.
5234 "subs %x[count], %x[count], #13\n"
5235 "beq 2f\n"
5236
5237 "1:"
5238 "subs %x[count], %x[count], #16\n"
5239
5240 // MinMax::Transform
5241 "ld1 {v0.4s}, [%x[input]], #16\n"
5242 "prfm pldl1keep, [%x[input], #16]\n"
5243 "umax v0.16b, v0.16b, v4.16b\n"
5244 "umin v0.16b, v0.16b, v5.16b\n"
5245
5246 "st1 {v0.4s}, [%x[output]], #16\n"
5247 "prfm pldl1keep, [%x[output]]\n"
5248
5249 "bne 1b\n"
5250 "2:"
5251
5252 // Handle leftovers.
5253
5254 // MinMax::Transform
5255 "ld1 {v0.2s}, [%x[input]], #8\n"
5256 "ld1 {v0.s}[2], [%x[input]], #4\n"
5257 "ld1 {v0.b}[12], [%x[input]], #1\n"
5258 "prfm pldl1keep, [%x[input], #16]\n"
5259 "umax v0.16b, v0.16b, v4.16b\n"
5260 "umin v0.16b, v0.16b, v5.16b\n"
5261
5262 "st1 {v0.2s}, [%x[output]], #8\n"
5263 "st1 {v0.s}[2], [%x[output]], #4\n"
5264 "st1 {v0.b}[12], [%x[output]], #1\n"
5265 "prfm pldl1keep, [%x[output]]\n"
5266 : [count] "+r"(params_count_copy), [input] "+r"(input),
5267 [output] "+r"(output)
5268 : [max] "r"(params.max), [min] "r"(params.min)
5269 : "v0", "v4", "v5", "cc", "memory");
5270 }
5271
5272 template <>
5273 inline void Transform1DKernel<uint8_t, uint8_t, MinMax<uint8_t>, 16,
Transform(const uint8_t * input,const MinMax<uint8_t> & params,uint8_t * output)5274 14>::Transform(const uint8_t* input,
5275 const MinMax<uint8_t>& params,
5276 uint8_t* output) {
5277 #ifdef DEBUG
5278 #ifdef DEBUG_METAGEMM_VERBOSE
5279 std::cout << __FILE__ << "(" << __LINE__
5280 << ") MinMax<uint8_t><uint8_t, uint8_t, MinMax<uint8_t>, 16, "
5281 "14>::Transform()"
5282 << std::endl
5283 << std::flush;
5284 #endif
5285 #endif
5286 int params_count_copy = params.count;
5287 asm volatile(
5288
5289 // MinMax::Prepare
5290 "dup v4.16b, %w[min]\n"
5291 "dup v5.16b, %w[max]\n"
5292
5293 // Reduce count by leftovers.
5294 "subs %x[count], %x[count], #14\n"
5295 "beq 2f\n"
5296
5297 "1:"
5298 "subs %x[count], %x[count], #16\n"
5299
5300 // MinMax::Transform
5301 "ld1 {v0.4s}, [%x[input]], #16\n"
5302 "prfm pldl1keep, [%x[input], #16]\n"
5303 "umax v0.16b, v0.16b, v4.16b\n"
5304 "umin v0.16b, v0.16b, v5.16b\n"
5305
5306 "st1 {v0.4s}, [%x[output]], #16\n"
5307 "prfm pldl1keep, [%x[output]]\n"
5308
5309 "bne 1b\n"
5310 "2:"
5311
5312 // Handle leftovers.
5313
5314 // MinMax::Transform
5315 "ld1 {v0.2s}, [%x[input]], #8\n"
5316 "ld1 {v0.s}[2], [%x[input]], #4\n"
5317 "ld1 {v0.h}[6], [%x[input]], #2\n"
5318 "prfm pldl1keep, [%x[input], #16]\n"
5319 "umax v0.16b, v0.16b, v4.16b\n"
5320 "umin v0.16b, v0.16b, v5.16b\n"
5321
5322 "st1 {v0.2s}, [%x[output]], #8\n"
5323 "st1 {v0.s}[2], [%x[output]], #4\n"
5324 "st1 {v0.h}[6], [%x[output]], #2\n"
5325 "prfm pldl1keep, [%x[output]]\n"
5326 : [count] "+r"(params_count_copy), [input] "+r"(input),
5327 [output] "+r"(output)
5328 : [max] "r"(params.max), [min] "r"(params.min)
5329 : "v0", "v4", "v5", "cc", "memory");
5330 }
5331
5332 template <>
5333 inline void Transform1DKernel<uint8_t, uint8_t, MinMax<uint8_t>, 16,
Transform(const uint8_t * input,const MinMax<uint8_t> & params,uint8_t * output)5334 15>::Transform(const uint8_t* input,
5335 const MinMax<uint8_t>& params,
5336 uint8_t* output) {
5337 #ifdef DEBUG
5338 #ifdef DEBUG_METAGEMM_VERBOSE
5339 std::cout << __FILE__ << "(" << __LINE__
5340 << ") MinMax<uint8_t><uint8_t, uint8_t, MinMax<uint8_t>, 16, "
5341 "15>::Transform()"
5342 << std::endl
5343 << std::flush;
5344 #endif
5345 #endif
5346 int params_count_copy = params.count;
5347 asm volatile(
5348
5349 // MinMax::Prepare
5350 "dup v4.16b, %w[min]\n"
5351 "dup v5.16b, %w[max]\n"
5352
5353 // Reduce count by leftovers.
5354 "subs %x[count], %x[count], #15\n"
5355 "beq 2f\n"
5356
5357 "1:"
5358 "subs %x[count], %x[count], #16\n"
5359
5360 // MinMax::Transform
5361 "ld1 {v0.4s}, [%x[input]], #16\n"
5362 "prfm pldl1keep, [%x[input], #16]\n"
5363 "umax v0.16b, v0.16b, v4.16b\n"
5364 "umin v0.16b, v0.16b, v5.16b\n"
5365
5366 "st1 {v0.4s}, [%x[output]], #16\n"
5367 "prfm pldl1keep, [%x[output]]\n"
5368
5369 "bne 1b\n"
5370 "2:"
5371
5372 // Handle leftovers.
5373
5374 // MinMax::Transform
5375 "ld1 {v0.2s}, [%x[input]], #8\n"
5376 "ld1 {v0.s}[2], [%x[input]], #4\n"
5377 "ld1 {v0.h}[6], [%x[input]], #2\n"
5378 "ld1 {v0.b}[14], [%x[input]], #1\n"
5379 "prfm pldl1keep, [%x[input], #16]\n"
5380 "umax v0.16b, v0.16b, v4.16b\n"
5381 "umin v0.16b, v0.16b, v5.16b\n"
5382
5383 "st1 {v0.2s}, [%x[output]], #8\n"
5384 "st1 {v0.s}[2], [%x[output]], #4\n"
5385 "st1 {v0.h}[6], [%x[output]], #2\n"
5386 "st1 {v0.b}[14], [%x[output]], #1\n"
5387 "prfm pldl1keep, [%x[output]]\n"
5388 : [count] "+r"(params_count_copy), [input] "+r"(input),
5389 [output] "+r"(output)
5390 : [max] "r"(params.max), [min] "r"(params.min)
5391 : "v0", "v4", "v5", "cc", "memory");
5392 }
5393
5394 template <>
5395 inline void Transform1DKernel<uint8_t, int32_t, BiasAdd<uint8_t>, 16,
Transform(const uint8_t * input,const BiasAdd<uint8_t> & params,int32_t * output)5396 0>::Transform(const uint8_t* input,
5397 const BiasAdd<uint8_t>& params,
5398 int32_t* output) {
5399 #ifdef DEBUG
5400 #ifdef DEBUG_METAGEMM_VERBOSE
5401 std::cout << __FILE__ << "(" << __LINE__
5402 << ") BiasAdd<uint8_t><uint8_t, int32_t, BiasAdd<uint8_t>, 16, "
5403 "0>::Transform()"
5404 << std::endl
5405 << std::flush;
5406 #endif
5407 #endif
5408 int params_rows_copy = params.rows;
5409 asm volatile(
5410 "ldr w0, %[input_range_min]\n"
5411 "dup v8.4s, w0\n"
5412 "ldr w0, %[input_range_scale]\n"
5413 "dup v9.4s, w0\n"
5414 "ldr w0, %[bias_range_min]\n"
5415 "dup v10.4s, w0\n"
5416 "ldr w0, %[bias_range_scale]\n"
5417 "dup v11.4s, w0\n"
5418 "ldr w0, %[output_range_min]\n"
5419 "dup v12.4s, w0\n"
5420 "ldr w0, %[one_over_output_range_scale]\n"
5421 "dup v13.4s, w0\n"
5422 "ldr w0, %[output_range_offset]\n"
5423 "dup v14.4s, w0\n"
5424 "1:"
5425 "mov x0, %x[count]\n"
5426 "mov x1, %x[bias]\n"
5427 "2:"
5428 "subs x0, x0, #16\n"
5429
5430 // BiasAdd::Transform
5431 "ld1 {v0.4s}, [%x[input]], #16\n"
5432 "ld1 {v4.4s}, [x1], #16\n"
5433 "prfm pldl1keep, [%x[input], #32]\n"
5434 "uxtl2 v1.8h, v0.16b\n"
5435 "uxtl v0.8h, v0.8b\n"
5436 "uxtl2 v5.8h, v4.16b\n"
5437 "uxtl v4.8h, v4.8b\n"
5438 "sxtl2 v3.4s, v1.8h\n"
5439 "sxtl v2.4s, v1.4h\n"
5440 "sxtl2 v7.4s, v5.8h\n"
5441 "sxtl v6.4s, v5.4h\n"
5442 "sxtl2 v1.4s, v0.8h\n"
5443 "sxtl v0.4s, v0.4h\n"
5444 "sxtl2 v5.4s, v4.8h\n"
5445 "sxtl v4.4s, v4.4h\n"
5446 "scvtf v0.4s, v0.4s\n"
5447 "scvtf v1.4s, v1.4s\n"
5448 "scvtf v2.4s, v2.4s\n"
5449 "scvtf v3.4s, v3.4s\n"
5450 "scvtf v4.4s, v4.4s\n"
5451 "scvtf v5.4s, v5.4s\n"
5452 "scvtf v6.4s, v6.4s\n"
5453 "scvtf v7.4s, v7.4s\n"
5454 "fmul v0.4s, v0.4s, v9.4s\n"
5455 "fmul v1.4s, v1.4s, v9.4s\n"
5456 "fmul v2.4s, v2.4s, v9.4s\n"
5457 "fmul v3.4s, v3.4s, v9.4s\n"
5458 "fmul v4.4s, v4.4s, v11.4s\n"
5459 "fmul v5.4s, v5.4s, v11.4s\n"
5460 "fmul v6.4s, v6.4s, v11.4s\n"
5461 "fmul v7.4s, v7.4s, v11.4s\n"
5462 "fadd v0.4s, v0.4s, v8.4s\n"
5463 "fadd v1.4s, v1.4s, v8.4s\n"
5464 "fadd v2.4s, v2.4s, v8.4s\n"
5465 "fadd v3.4s, v3.4s, v8.4s\n"
5466 "fadd v4.4s, v4.4s, v10.4s\n"
5467 "fadd v5.4s, v5.4s, v10.4s\n"
5468 "fadd v6.4s, v6.4s, v10.4s\n"
5469 "fadd v7.4s, v7.4s, v10.4s\n"
5470 "fadd v0.4s, v0.4s, v4.4s\n"
5471 "fadd v1.4s, v1.4s, v5.4s\n"
5472 "fadd v2.4s, v2.4s, v6.4s\n"
5473 "fadd v3.4s, v3.4s, v7.4s\n"
5474 "fsub v0.4s, v0.4s, v12.4s\n"
5475 "fsub v1.4s, v1.4s, v12.4s\n"
5476 "fsub v2.4s, v2.4s, v12.4s\n"
5477 "fsub v3.4s, v3.4s, v12.4s\n"
5478 "fmul v0.4s, v0.4s, v13.4s\n"
5479 "fmul v1.4s, v1.4s, v13.4s\n"
5480 "fmul v2.4s, v2.4s, v13.4s\n"
5481 "fmul v3.4s, v3.4s, v13.4s\n"
5482 "fadd v0.4s, v0.4s, v14.4s\n"
5483 "fadd v1.4s, v1.4s, v14.4s\n"
5484 "fadd v2.4s, v2.4s, v14.4s\n"
5485 "fadd v3.4s, v3.4s, v14.4s\n"
5486 "fcvtzs v0.4s, v0.4s\n"
5487 "fcvtzs v1.4s, v1.4s\n"
5488 "fcvtzs v2.4s, v2.4s\n"
5489 "fcvtzs v3.4s, v3.4s\n"
5490
5491 "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
5492 "prfm pldl1keep, [%x[output]]\n"
5493 "bne 2b\n"
5494 "subs %x[rows], %x[rows], #1\n"
5495 "bne 1b\n"
5496 : [input] "+r"(input), [output] "+r"(output)
5497 : [count] "r"(params.count), [rows] "r"(params_rows_copy),
5498 [output_range_offset] "m"(params.output_range_offset),
5499 [input_range_scale] "m"(params.input_range_scale),
5500 [one_over_output_range_scale] "m"(params.one_over_output_range_scale),
5501 [bias_range_min] "m"(params.bias_range_min),
5502 [output_range_min] "m"(params.output_range_min),
5503 [bias_range_scale] "m"(params.bias_range_scale),
5504 [bias] "r"(params.bias), [input_range_min] "m"(params.input_range_min)
5505 : "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
5506 "v10", "v11", "v12", "v13", "v14", "cc", "memory");
5507 }
5508
5509 template <>
5510 inline void Transform1DKernel<uint8_t, int32_t, BiasAdd<uint8_t>, 16,
Transform(const uint8_t * input,const BiasAdd<uint8_t> & params,int32_t * output)5511 1>::Transform(const uint8_t* input,
5512 const BiasAdd<uint8_t>& params,
5513 int32_t* output) {
5514 #ifdef DEBUG
5515 #ifdef DEBUG_METAGEMM_VERBOSE
5516 std::cout << __FILE__ << "(" << __LINE__
5517 << ") BiasAdd<uint8_t><uint8_t, int32_t, BiasAdd<uint8_t>, 16, "
5518 "1>::Transform()"
5519 << std::endl
5520 << std::flush;
5521 #endif
5522 #endif
5523 int params_rows_copy = params.rows;
5524 asm volatile(
5525 "ldr w0, %[input_range_min]\n"
5526 "dup v8.4s, w0\n"
5527 "ldr w0, %[input_range_scale]\n"
5528 "dup v9.4s, w0\n"
5529 "ldr w0, %[bias_range_min]\n"
5530 "dup v10.4s, w0\n"
5531 "ldr w0, %[bias_range_scale]\n"
5532 "dup v11.4s, w0\n"
5533 "ldr w0, %[output_range_min]\n"
5534 "dup v12.4s, w0\n"
5535 "ldr w0, %[one_over_output_range_scale]\n"
5536 "dup v13.4s, w0\n"
5537 "ldr w0, %[output_range_offset]\n"
5538 "dup v14.4s, w0\n"
5539 "1:"
5540 "mov x0, %x[count]\n"
5541 "mov x1, %x[bias]\n"
5542 "subs x0, x0, #1\n"
5543 "beq 3f\n"
5544 "2:"
5545 "subs x0, x0, #16\n"
5546
5547 // BiasAdd::Transform
5548 "ld1 {v0.4s}, [%x[input]], #16\n"
5549 "ld1 {v4.4s}, [x1], #16\n"
5550 "prfm pldl1keep, [%x[input], #32]\n"
5551 "uxtl2 v1.8h, v0.16b\n"
5552 "uxtl v0.8h, v0.8b\n"
5553 "uxtl2 v5.8h, v4.16b\n"
5554 "uxtl v4.8h, v4.8b\n"
5555 "sxtl2 v3.4s, v1.8h\n"
5556 "sxtl v2.4s, v1.4h\n"
5557 "sxtl2 v7.4s, v5.8h\n"
5558 "sxtl v6.4s, v5.4h\n"
5559 "sxtl2 v1.4s, v0.8h\n"
5560 "sxtl v0.4s, v0.4h\n"
5561 "sxtl2 v5.4s, v4.8h\n"
5562 "sxtl v4.4s, v4.4h\n"
5563 "scvtf v0.4s, v0.4s\n"
5564 "scvtf v1.4s, v1.4s\n"
5565 "scvtf v2.4s, v2.4s\n"
5566 "scvtf v3.4s, v3.4s\n"
5567 "scvtf v4.4s, v4.4s\n"
5568 "scvtf v5.4s, v5.4s\n"
5569 "scvtf v6.4s, v6.4s\n"
5570 "scvtf v7.4s, v7.4s\n"
5571 "fmul v0.4s, v0.4s, v9.4s\n"
5572 "fmul v1.4s, v1.4s, v9.4s\n"
5573 "fmul v2.4s, v2.4s, v9.4s\n"
5574 "fmul v3.4s, v3.4s, v9.4s\n"
5575 "fmul v4.4s, v4.4s, v11.4s\n"
5576 "fmul v5.4s, v5.4s, v11.4s\n"
5577 "fmul v6.4s, v6.4s, v11.4s\n"
5578 "fmul v7.4s, v7.4s, v11.4s\n"
5579 "fadd v0.4s, v0.4s, v8.4s\n"
5580 "fadd v1.4s, v1.4s, v8.4s\n"
5581 "fadd v2.4s, v2.4s, v8.4s\n"
5582 "fadd v3.4s, v3.4s, v8.4s\n"
5583 "fadd v4.4s, v4.4s, v10.4s\n"
5584 "fadd v5.4s, v5.4s, v10.4s\n"
5585 "fadd v6.4s, v6.4s, v10.4s\n"
5586 "fadd v7.4s, v7.4s, v10.4s\n"
5587 "fadd v0.4s, v0.4s, v4.4s\n"
5588 "fadd v1.4s, v1.4s, v5.4s\n"
5589 "fadd v2.4s, v2.4s, v6.4s\n"
5590 "fadd v3.4s, v3.4s, v7.4s\n"
5591 "fsub v0.4s, v0.4s, v12.4s\n"
5592 "fsub v1.4s, v1.4s, v12.4s\n"
5593 "fsub v2.4s, v2.4s, v12.4s\n"
5594 "fsub v3.4s, v3.4s, v12.4s\n"
5595 "fmul v0.4s, v0.4s, v13.4s\n"
5596 "fmul v1.4s, v1.4s, v13.4s\n"
5597 "fmul v2.4s, v2.4s, v13.4s\n"
5598 "fmul v3.4s, v3.4s, v13.4s\n"
5599 "fadd v0.4s, v0.4s, v14.4s\n"
5600 "fadd v1.4s, v1.4s, v14.4s\n"
5601 "fadd v2.4s, v2.4s, v14.4s\n"
5602 "fadd v3.4s, v3.4s, v14.4s\n"
5603 "fcvtzs v0.4s, v0.4s\n"
5604 "fcvtzs v1.4s, v1.4s\n"
5605 "fcvtzs v2.4s, v2.4s\n"
5606 "fcvtzs v3.4s, v3.4s\n"
5607
5608 "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
5609 "prfm pldl1keep, [%x[output]]\n"
5610 "bne 2b\n"
5611 "3:"
5612
5613 // BiasAdd::Transform
5614 "ld1 {v0.b}[0], [%x[input]], #1\n"
5615 "ld1 {v1.b}[0], [x1], #1\n"
5616 "prfm pldl1keep, [%x[input], #32]\n"
5617 "uxtl v0.8h, v0.8b\n"
5618 "uxtl v1.8h, v1.8b\n"
5619 "sxtl v0.4s, v0.4h\n"
5620 "sxtl v1.4s, v1.4h\n"
5621 "scvtf v0.4s, v0.4s\n"
5622 "scvtf v1.4s, v1.4s\n"
5623 "fmul v0.4s, v0.4s, v9.4s\n"
5624 "fmul v1.4s, v1.4s, v11.4s\n"
5625 "fadd v0.4s, v0.4s, v8.4s\n"
5626 "fadd v1.4s, v1.4s, v10.4s\n"
5627 "fadd v0.4s, v0.4s, v1.4s\n"
5628 "fsub v0.4s, v0.4s, v12.4s\n"
5629 "fmul v0.4s, v0.4s, v13.4s\n"
5630 "fadd v0.4s, v0.4s, v14.4s\n"
5631 "fcvtzs v0.4s, v0.4s\n"
5632
5633 "st1 {v0.s}[0], [%x[output]], #4\n"
5634 "prfm pldl1keep, [%x[output]]\n"
5635 "subs %x[rows], %x[rows], #1\n"
5636 "bne 1b\n"
5637 : [input] "+r"(input), [output] "+r"(output)
5638 : [count] "r"(params.count), [rows] "r"(params_rows_copy),
5639 [output_range_offset] "m"(params.output_range_offset),
5640 [input_range_scale] "m"(params.input_range_scale),
5641 [one_over_output_range_scale] "m"(params.one_over_output_range_scale),
5642 [bias_range_min] "m"(params.bias_range_min),
5643 [output_range_min] "m"(params.output_range_min),
5644 [bias_range_scale] "m"(params.bias_range_scale),
5645 [bias] "r"(params.bias), [input_range_min] "m"(params.input_range_min)
5646 : "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
5647 "v10", "v11", "v12", "v13", "v14", "cc", "memory");
5648 }
5649
5650 template <>
5651 inline void Transform1DKernel<uint8_t, int32_t, BiasAdd<uint8_t>, 16,
Transform(const uint8_t * input,const BiasAdd<uint8_t> & params,int32_t * output)5652 2>::Transform(const uint8_t* input,
5653 const BiasAdd<uint8_t>& params,
5654 int32_t* output) {
5655 #ifdef DEBUG
5656 #ifdef DEBUG_METAGEMM_VERBOSE
5657 std::cout << __FILE__ << "(" << __LINE__
5658 << ") BiasAdd<uint8_t><uint8_t, int32_t, BiasAdd<uint8_t>, 16, "
5659 "2>::Transform()"
5660 << std::endl
5661 << std::flush;
5662 #endif
5663 #endif
5664 int params_rows_copy = params.rows;
5665 asm volatile(
5666 "ldr w0, %[input_range_min]\n"
5667 "dup v8.4s, w0\n"
5668 "ldr w0, %[input_range_scale]\n"
5669 "dup v9.4s, w0\n"
5670 "ldr w0, %[bias_range_min]\n"
5671 "dup v10.4s, w0\n"
5672 "ldr w0, %[bias_range_scale]\n"
5673 "dup v11.4s, w0\n"
5674 "ldr w0, %[output_range_min]\n"
5675 "dup v12.4s, w0\n"
5676 "ldr w0, %[one_over_output_range_scale]\n"
5677 "dup v13.4s, w0\n"
5678 "ldr w0, %[output_range_offset]\n"
5679 "dup v14.4s, w0\n"
5680 "1:"
5681 "mov x0, %x[count]\n"
5682 "mov x1, %x[bias]\n"
5683 "subs x0, x0, #2\n"
5684 "beq 3f\n"
5685 "2:"
5686 "subs x0, x0, #16\n"
5687
5688 // BiasAdd::Transform
5689 "ld1 {v0.4s}, [%x[input]], #16\n"
5690 "ld1 {v4.4s}, [x1], #16\n"
5691 "prfm pldl1keep, [%x[input], #32]\n"
5692 "uxtl2 v1.8h, v0.16b\n"
5693 "uxtl v0.8h, v0.8b\n"
5694 "uxtl2 v5.8h, v4.16b\n"
5695 "uxtl v4.8h, v4.8b\n"
5696 "sxtl2 v3.4s, v1.8h\n"
5697 "sxtl v2.4s, v1.4h\n"
5698 "sxtl2 v7.4s, v5.8h\n"
5699 "sxtl v6.4s, v5.4h\n"
5700 "sxtl2 v1.4s, v0.8h\n"
5701 "sxtl v0.4s, v0.4h\n"
5702 "sxtl2 v5.4s, v4.8h\n"
5703 "sxtl v4.4s, v4.4h\n"
5704 "scvtf v0.4s, v0.4s\n"
5705 "scvtf v1.4s, v1.4s\n"
5706 "scvtf v2.4s, v2.4s\n"
5707 "scvtf v3.4s, v3.4s\n"
5708 "scvtf v4.4s, v4.4s\n"
5709 "scvtf v5.4s, v5.4s\n"
5710 "scvtf v6.4s, v6.4s\n"
5711 "scvtf v7.4s, v7.4s\n"
5712 "fmul v0.4s, v0.4s, v9.4s\n"
5713 "fmul v1.4s, v1.4s, v9.4s\n"
5714 "fmul v2.4s, v2.4s, v9.4s\n"
5715 "fmul v3.4s, v3.4s, v9.4s\n"
5716 "fmul v4.4s, v4.4s, v11.4s\n"
5717 "fmul v5.4s, v5.4s, v11.4s\n"
5718 "fmul v6.4s, v6.4s, v11.4s\n"
5719 "fmul v7.4s, v7.4s, v11.4s\n"
5720 "fadd v0.4s, v0.4s, v8.4s\n"
5721 "fadd v1.4s, v1.4s, v8.4s\n"
5722 "fadd v2.4s, v2.4s, v8.4s\n"
5723 "fadd v3.4s, v3.4s, v8.4s\n"
5724 "fadd v4.4s, v4.4s, v10.4s\n"
5725 "fadd v5.4s, v5.4s, v10.4s\n"
5726 "fadd v6.4s, v6.4s, v10.4s\n"
5727 "fadd v7.4s, v7.4s, v10.4s\n"
5728 "fadd v0.4s, v0.4s, v4.4s\n"
5729 "fadd v1.4s, v1.4s, v5.4s\n"
5730 "fadd v2.4s, v2.4s, v6.4s\n"
5731 "fadd v3.4s, v3.4s, v7.4s\n"
5732 "fsub v0.4s, v0.4s, v12.4s\n"
5733 "fsub v1.4s, v1.4s, v12.4s\n"
5734 "fsub v2.4s, v2.4s, v12.4s\n"
5735 "fsub v3.4s, v3.4s, v12.4s\n"
5736 "fmul v0.4s, v0.4s, v13.4s\n"
5737 "fmul v1.4s, v1.4s, v13.4s\n"
5738 "fmul v2.4s, v2.4s, v13.4s\n"
5739 "fmul v3.4s, v3.4s, v13.4s\n"
5740 "fadd v0.4s, v0.4s, v14.4s\n"
5741 "fadd v1.4s, v1.4s, v14.4s\n"
5742 "fadd v2.4s, v2.4s, v14.4s\n"
5743 "fadd v3.4s, v3.4s, v14.4s\n"
5744 "fcvtzs v0.4s, v0.4s\n"
5745 "fcvtzs v1.4s, v1.4s\n"
5746 "fcvtzs v2.4s, v2.4s\n"
5747 "fcvtzs v3.4s, v3.4s\n"
5748
5749 "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
5750 "prfm pldl1keep, [%x[output]]\n"
5751 "bne 2b\n"
5752 "3:"
5753
5754 // BiasAdd::Transform
5755 "ld1 {v0.h}[0], [%x[input]], #2\n"
5756 "ld1 {v1.h}[0], [x1], #2\n"
5757 "prfm pldl1keep, [%x[input], #32]\n"
5758 "uxtl v0.8h, v0.8b\n"
5759 "uxtl v1.8h, v1.8b\n"
5760 "sxtl v0.4s, v0.4h\n"
5761 "sxtl v1.4s, v1.4h\n"
5762 "scvtf v0.4s, v0.4s\n"
5763 "scvtf v1.4s, v1.4s\n"
5764 "fmul v0.4s, v0.4s, v9.4s\n"
5765 "fmul v1.4s, v1.4s, v11.4s\n"
5766 "fadd v0.4s, v0.4s, v8.4s\n"
5767 "fadd v1.4s, v1.4s, v10.4s\n"
5768 "fadd v0.4s, v0.4s, v1.4s\n"
5769 "fsub v0.4s, v0.4s, v12.4s\n"
5770 "fmul v0.4s, v0.4s, v13.4s\n"
5771 "fadd v0.4s, v0.4s, v14.4s\n"
5772 "fcvtzs v0.4s, v0.4s\n"
5773
5774 "st1 {v0.2s}, [%x[output]], #8\n"
5775 "prfm pldl1keep, [%x[output]]\n"
5776 "subs %x[rows], %x[rows], #1\n"
5777 "bne 1b\n"
5778 : [input] "+r"(input), [output] "+r"(output)
5779 : [count] "r"(params.count), [rows] "r"(params_rows_copy),
5780 [output_range_offset] "m"(params.output_range_offset),
5781 [input_range_scale] "m"(params.input_range_scale),
5782 [one_over_output_range_scale] "m"(params.one_over_output_range_scale),
5783 [bias_range_min] "m"(params.bias_range_min),
5784 [output_range_min] "m"(params.output_range_min),
5785 [bias_range_scale] "m"(params.bias_range_scale),
5786 [bias] "r"(params.bias), [input_range_min] "m"(params.input_range_min)
5787 : "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
5788 "v10", "v11", "v12", "v13", "v14", "cc", "memory");
5789 }
5790
5791 template <>
5792 inline void Transform1DKernel<uint8_t, int32_t, BiasAdd<uint8_t>, 16,
Transform(const uint8_t * input,const BiasAdd<uint8_t> & params,int32_t * output)5793 3>::Transform(const uint8_t* input,
5794 const BiasAdd<uint8_t>& params,
5795 int32_t* output) {
5796 #ifdef DEBUG
5797 #ifdef DEBUG_METAGEMM_VERBOSE
5798 std::cout << __FILE__ << "(" << __LINE__
5799 << ") BiasAdd<uint8_t><uint8_t, int32_t, BiasAdd<uint8_t>, 16, "
5800 "3>::Transform()"
5801 << std::endl
5802 << std::flush;
5803 #endif
5804 #endif
5805 int params_rows_copy = params.rows;
5806 asm volatile(
5807 "ldr w0, %[input_range_min]\n"
5808 "dup v8.4s, w0\n"
5809 "ldr w0, %[input_range_scale]\n"
5810 "dup v9.4s, w0\n"
5811 "ldr w0, %[bias_range_min]\n"
5812 "dup v10.4s, w0\n"
5813 "ldr w0, %[bias_range_scale]\n"
5814 "dup v11.4s, w0\n"
5815 "ldr w0, %[output_range_min]\n"
5816 "dup v12.4s, w0\n"
5817 "ldr w0, %[one_over_output_range_scale]\n"
5818 "dup v13.4s, w0\n"
5819 "ldr w0, %[output_range_offset]\n"
5820 "dup v14.4s, w0\n"
5821 "1:"
5822 "mov x0, %x[count]\n"
5823 "mov x1, %x[bias]\n"
5824 "subs x0, x0, #3\n"
5825 "beq 3f\n"
5826 "2:"
5827 "subs x0, x0, #16\n"
5828
5829 // BiasAdd::Transform
5830 "ld1 {v0.4s}, [%x[input]], #16\n"
5831 "ld1 {v4.4s}, [x1], #16\n"
5832 "prfm pldl1keep, [%x[input], #32]\n"
5833 "uxtl2 v1.8h, v0.16b\n"
5834 "uxtl v0.8h, v0.8b\n"
5835 "uxtl2 v5.8h, v4.16b\n"
5836 "uxtl v4.8h, v4.8b\n"
5837 "sxtl2 v3.4s, v1.8h\n"
5838 "sxtl v2.4s, v1.4h\n"
5839 "sxtl2 v7.4s, v5.8h\n"
5840 "sxtl v6.4s, v5.4h\n"
5841 "sxtl2 v1.4s, v0.8h\n"
5842 "sxtl v0.4s, v0.4h\n"
5843 "sxtl2 v5.4s, v4.8h\n"
5844 "sxtl v4.4s, v4.4h\n"
5845 "scvtf v0.4s, v0.4s\n"
5846 "scvtf v1.4s, v1.4s\n"
5847 "scvtf v2.4s, v2.4s\n"
5848 "scvtf v3.4s, v3.4s\n"
5849 "scvtf v4.4s, v4.4s\n"
5850 "scvtf v5.4s, v5.4s\n"
5851 "scvtf v6.4s, v6.4s\n"
5852 "scvtf v7.4s, v7.4s\n"
5853 "fmul v0.4s, v0.4s, v9.4s\n"
5854 "fmul v1.4s, v1.4s, v9.4s\n"
5855 "fmul v2.4s, v2.4s, v9.4s\n"
5856 "fmul v3.4s, v3.4s, v9.4s\n"
5857 "fmul v4.4s, v4.4s, v11.4s\n"
5858 "fmul v5.4s, v5.4s, v11.4s\n"
5859 "fmul v6.4s, v6.4s, v11.4s\n"
5860 "fmul v7.4s, v7.4s, v11.4s\n"
5861 "fadd v0.4s, v0.4s, v8.4s\n"
5862 "fadd v1.4s, v1.4s, v8.4s\n"
5863 "fadd v2.4s, v2.4s, v8.4s\n"
5864 "fadd v3.4s, v3.4s, v8.4s\n"
5865 "fadd v4.4s, v4.4s, v10.4s\n"
5866 "fadd v5.4s, v5.4s, v10.4s\n"
5867 "fadd v6.4s, v6.4s, v10.4s\n"
5868 "fadd v7.4s, v7.4s, v10.4s\n"
5869 "fadd v0.4s, v0.4s, v4.4s\n"
5870 "fadd v1.4s, v1.4s, v5.4s\n"
5871 "fadd v2.4s, v2.4s, v6.4s\n"
5872 "fadd v3.4s, v3.4s, v7.4s\n"
5873 "fsub v0.4s, v0.4s, v12.4s\n"
5874 "fsub v1.4s, v1.4s, v12.4s\n"
5875 "fsub v2.4s, v2.4s, v12.4s\n"
5876 "fsub v3.4s, v3.4s, v12.4s\n"
5877 "fmul v0.4s, v0.4s, v13.4s\n"
5878 "fmul v1.4s, v1.4s, v13.4s\n"
5879 "fmul v2.4s, v2.4s, v13.4s\n"
5880 "fmul v3.4s, v3.4s, v13.4s\n"
5881 "fadd v0.4s, v0.4s, v14.4s\n"
5882 "fadd v1.4s, v1.4s, v14.4s\n"
5883 "fadd v2.4s, v2.4s, v14.4s\n"
5884 "fadd v3.4s, v3.4s, v14.4s\n"
5885 "fcvtzs v0.4s, v0.4s\n"
5886 "fcvtzs v1.4s, v1.4s\n"
5887 "fcvtzs v2.4s, v2.4s\n"
5888 "fcvtzs v3.4s, v3.4s\n"
5889
5890 "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
5891 "prfm pldl1keep, [%x[output]]\n"
5892 "bne 2b\n"
5893 "3:"
5894
5895 // BiasAdd::Transform
5896 "ld1 {v0.h}[0], [%x[input]], #2\n"
5897 "ld1 {v0.b}[2], [%x[input]], #1\n"
5898 "ld1 {v1.h}[0], [x1], #2\n"
5899 "ld1 {v1.b}[2], [x1], #1\n"
5900 "prfm pldl1keep, [%x[input], #32]\n"
5901 "uxtl v0.8h, v0.8b\n"
5902 "uxtl v1.8h, v1.8b\n"
5903 "sxtl v0.4s, v0.4h\n"
5904 "sxtl v1.4s, v1.4h\n"
5905 "scvtf v0.4s, v0.4s\n"
5906 "scvtf v1.4s, v1.4s\n"
5907 "fmul v0.4s, v0.4s, v9.4s\n"
5908 "fmul v1.4s, v1.4s, v11.4s\n"
5909 "fadd v0.4s, v0.4s, v8.4s\n"
5910 "fadd v1.4s, v1.4s, v10.4s\n"
5911 "fadd v0.4s, v0.4s, v1.4s\n"
5912 "fsub v0.4s, v0.4s, v12.4s\n"
5913 "fmul v0.4s, v0.4s, v13.4s\n"
5914 "fadd v0.4s, v0.4s, v14.4s\n"
5915 "fcvtzs v0.4s, v0.4s\n"
5916
5917 "st1 {v0.2s}, [%x[output]], #8\n"
5918 "st1 {v0.s}[2], [%x[output]], #4\n"
5919 "prfm pldl1keep, [%x[output]]\n"
5920 "subs %x[rows], %x[rows], #1\n"
5921 "bne 1b\n"
5922 : [input] "+r"(input), [output] "+r"(output)
5923 : [count] "r"(params.count), [rows] "r"(params_rows_copy),
5924 [output_range_offset] "m"(params.output_range_offset),
5925 [input_range_scale] "m"(params.input_range_scale),
5926 [one_over_output_range_scale] "m"(params.one_over_output_range_scale),
5927 [bias_range_min] "m"(params.bias_range_min),
5928 [output_range_min] "m"(params.output_range_min),
5929 [bias_range_scale] "m"(params.bias_range_scale),
5930 [bias] "r"(params.bias), [input_range_min] "m"(params.input_range_min)
5931 : "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
5932 "v10", "v11", "v12", "v13", "v14", "cc", "memory");
5933 }
5934
5935 template <>
5936 inline void Transform1DKernel<uint8_t, int32_t, BiasAdd<uint8_t>, 16,
Transform(const uint8_t * input,const BiasAdd<uint8_t> & params,int32_t * output)5937 4>::Transform(const uint8_t* input,
5938 const BiasAdd<uint8_t>& params,
5939 int32_t* output) {
5940 #ifdef DEBUG
5941 #ifdef DEBUG_METAGEMM_VERBOSE
5942 std::cout << __FILE__ << "(" << __LINE__
5943 << ") BiasAdd<uint8_t><uint8_t, int32_t, BiasAdd<uint8_t>, 16, "
5944 "4>::Transform()"
5945 << std::endl
5946 << std::flush;
5947 #endif
5948 #endif
5949 int params_rows_copy = params.rows;
5950 asm volatile(
5951 "ldr w0, %[input_range_min]\n"
5952 "dup v8.4s, w0\n"
5953 "ldr w0, %[input_range_scale]\n"
5954 "dup v9.4s, w0\n"
5955 "ldr w0, %[bias_range_min]\n"
5956 "dup v10.4s, w0\n"
5957 "ldr w0, %[bias_range_scale]\n"
5958 "dup v11.4s, w0\n"
5959 "ldr w0, %[output_range_min]\n"
5960 "dup v12.4s, w0\n"
5961 "ldr w0, %[one_over_output_range_scale]\n"
5962 "dup v13.4s, w0\n"
5963 "ldr w0, %[output_range_offset]\n"
5964 "dup v14.4s, w0\n"
5965 "1:"
5966 "mov x0, %x[count]\n"
5967 "mov x1, %x[bias]\n"
5968 "subs x0, x0, #4\n"
5969 "beq 3f\n"
5970 "2:"
5971 "subs x0, x0, #16\n"
5972
5973 // BiasAdd::Transform
5974 "ld1 {v0.4s}, [%x[input]], #16\n"
5975 "ld1 {v4.4s}, [x1], #16\n"
5976 "prfm pldl1keep, [%x[input], #32]\n"
5977 "uxtl2 v1.8h, v0.16b\n"
5978 "uxtl v0.8h, v0.8b\n"
5979 "uxtl2 v5.8h, v4.16b\n"
5980 "uxtl v4.8h, v4.8b\n"
5981 "sxtl2 v3.4s, v1.8h\n"
5982 "sxtl v2.4s, v1.4h\n"
5983 "sxtl2 v7.4s, v5.8h\n"
5984 "sxtl v6.4s, v5.4h\n"
5985 "sxtl2 v1.4s, v0.8h\n"
5986 "sxtl v0.4s, v0.4h\n"
5987 "sxtl2 v5.4s, v4.8h\n"
5988 "sxtl v4.4s, v4.4h\n"
5989 "scvtf v0.4s, v0.4s\n"
5990 "scvtf v1.4s, v1.4s\n"
5991 "scvtf v2.4s, v2.4s\n"
5992 "scvtf v3.4s, v3.4s\n"
5993 "scvtf v4.4s, v4.4s\n"
5994 "scvtf v5.4s, v5.4s\n"
5995 "scvtf v6.4s, v6.4s\n"
5996 "scvtf v7.4s, v7.4s\n"
5997 "fmul v0.4s, v0.4s, v9.4s\n"
5998 "fmul v1.4s, v1.4s, v9.4s\n"
5999 "fmul v2.4s, v2.4s, v9.4s\n"
6000 "fmul v3.4s, v3.4s, v9.4s\n"
6001 "fmul v4.4s, v4.4s, v11.4s\n"
6002 "fmul v5.4s, v5.4s, v11.4s\n"
6003 "fmul v6.4s, v6.4s, v11.4s\n"
6004 "fmul v7.4s, v7.4s, v11.4s\n"
6005 "fadd v0.4s, v0.4s, v8.4s\n"
6006 "fadd v1.4s, v1.4s, v8.4s\n"
6007 "fadd v2.4s, v2.4s, v8.4s\n"
6008 "fadd v3.4s, v3.4s, v8.4s\n"
6009 "fadd v4.4s, v4.4s, v10.4s\n"
6010 "fadd v5.4s, v5.4s, v10.4s\n"
6011 "fadd v6.4s, v6.4s, v10.4s\n"
6012 "fadd v7.4s, v7.4s, v10.4s\n"
6013 "fadd v0.4s, v0.4s, v4.4s\n"
6014 "fadd v1.4s, v1.4s, v5.4s\n"
6015 "fadd v2.4s, v2.4s, v6.4s\n"
6016 "fadd v3.4s, v3.4s, v7.4s\n"
6017 "fsub v0.4s, v0.4s, v12.4s\n"
6018 "fsub v1.4s, v1.4s, v12.4s\n"
6019 "fsub v2.4s, v2.4s, v12.4s\n"
6020 "fsub v3.4s, v3.4s, v12.4s\n"
6021 "fmul v0.4s, v0.4s, v13.4s\n"
6022 "fmul v1.4s, v1.4s, v13.4s\n"
6023 "fmul v2.4s, v2.4s, v13.4s\n"
6024 "fmul v3.4s, v3.4s, v13.4s\n"
6025 "fadd v0.4s, v0.4s, v14.4s\n"
6026 "fadd v1.4s, v1.4s, v14.4s\n"
6027 "fadd v2.4s, v2.4s, v14.4s\n"
6028 "fadd v3.4s, v3.4s, v14.4s\n"
6029 "fcvtzs v0.4s, v0.4s\n"
6030 "fcvtzs v1.4s, v1.4s\n"
6031 "fcvtzs v2.4s, v2.4s\n"
6032 "fcvtzs v3.4s, v3.4s\n"
6033
6034 "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
6035 "prfm pldl1keep, [%x[output]]\n"
6036 "bne 2b\n"
6037 "3:"
6038
6039 // BiasAdd::Transform
6040 "ld1 {v0.s}[0], [%x[input]], #4\n"
6041 "ld1 {v1.s}[0], [x1], #4\n"
6042 "prfm pldl1keep, [%x[input], #32]\n"
6043 "uxtl v0.8h, v0.8b\n"
6044 "uxtl v1.8h, v1.8b\n"
6045 "sxtl v0.4s, v0.4h\n"
6046 "sxtl v1.4s, v1.4h\n"
6047 "scvtf v0.4s, v0.4s\n"
6048 "scvtf v1.4s, v1.4s\n"
6049 "fmul v0.4s, v0.4s, v9.4s\n"
6050 "fmul v1.4s, v1.4s, v11.4s\n"
6051 "fadd v0.4s, v0.4s, v8.4s\n"
6052 "fadd v1.4s, v1.4s, v10.4s\n"
6053 "fadd v0.4s, v0.4s, v1.4s\n"
6054 "fsub v0.4s, v0.4s, v12.4s\n"
6055 "fmul v0.4s, v0.4s, v13.4s\n"
6056 "fadd v0.4s, v0.4s, v14.4s\n"
6057 "fcvtzs v0.4s, v0.4s\n"
6058
6059 "st1 {v0.4s}, [%x[output]], #16\n"
6060 "prfm pldl1keep, [%x[output]]\n"
6061 "subs %x[rows], %x[rows], #1\n"
6062 "bne 1b\n"
6063 : [input] "+r"(input), [output] "+r"(output)
6064 : [count] "r"(params.count), [rows] "r"(params_rows_copy),
6065 [output_range_offset] "m"(params.output_range_offset),
6066 [input_range_scale] "m"(params.input_range_scale),
6067 [one_over_output_range_scale] "m"(params.one_over_output_range_scale),
6068 [bias_range_min] "m"(params.bias_range_min),
6069 [output_range_min] "m"(params.output_range_min),
6070 [bias_range_scale] "m"(params.bias_range_scale),
6071 [bias] "r"(params.bias), [input_range_min] "m"(params.input_range_min)
6072 : "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
6073 "v10", "v11", "v12", "v13", "v14", "cc", "memory");
6074 }
6075
6076 template <>
6077 inline void Transform1DKernel<uint8_t, int32_t, BiasAdd<uint8_t>, 16,
Transform(const uint8_t * input,const BiasAdd<uint8_t> & params,int32_t * output)6078 5>::Transform(const uint8_t* input,
6079 const BiasAdd<uint8_t>& params,
6080 int32_t* output) {
6081 #ifdef DEBUG
6082 #ifdef DEBUG_METAGEMM_VERBOSE
6083 std::cout << __FILE__ << "(" << __LINE__
6084 << ") BiasAdd<uint8_t><uint8_t, int32_t, BiasAdd<uint8_t>, 16, "
6085 "5>::Transform()"
6086 << std::endl
6087 << std::flush;
6088 #endif
6089 #endif
6090 int params_rows_copy = params.rows;
6091 asm volatile(
6092 "ldr w0, %[input_range_min]\n"
6093 "dup v8.4s, w0\n"
6094 "ldr w0, %[input_range_scale]\n"
6095 "dup v9.4s, w0\n"
6096 "ldr w0, %[bias_range_min]\n"
6097 "dup v10.4s, w0\n"
6098 "ldr w0, %[bias_range_scale]\n"
6099 "dup v11.4s, w0\n"
6100 "ldr w0, %[output_range_min]\n"
6101 "dup v12.4s, w0\n"
6102 "ldr w0, %[one_over_output_range_scale]\n"
6103 "dup v13.4s, w0\n"
6104 "ldr w0, %[output_range_offset]\n"
6105 "dup v14.4s, w0\n"
6106 "1:"
6107 "mov x0, %x[count]\n"
6108 "mov x1, %x[bias]\n"
6109 "subs x0, x0, #5\n"
6110 "beq 3f\n"
6111 "2:"
6112 "subs x0, x0, #16\n"
6113
6114 // BiasAdd::Transform
6115 "ld1 {v0.4s}, [%x[input]], #16\n"
6116 "ld1 {v4.4s}, [x1], #16\n"
6117 "prfm pldl1keep, [%x[input], #32]\n"
6118 "uxtl2 v1.8h, v0.16b\n"
6119 "uxtl v0.8h, v0.8b\n"
6120 "uxtl2 v5.8h, v4.16b\n"
6121 "uxtl v4.8h, v4.8b\n"
6122 "sxtl2 v3.4s, v1.8h\n"
6123 "sxtl v2.4s, v1.4h\n"
6124 "sxtl2 v7.4s, v5.8h\n"
6125 "sxtl v6.4s, v5.4h\n"
6126 "sxtl2 v1.4s, v0.8h\n"
6127 "sxtl v0.4s, v0.4h\n"
6128 "sxtl2 v5.4s, v4.8h\n"
6129 "sxtl v4.4s, v4.4h\n"
6130 "scvtf v0.4s, v0.4s\n"
6131 "scvtf v1.4s, v1.4s\n"
6132 "scvtf v2.4s, v2.4s\n"
6133 "scvtf v3.4s, v3.4s\n"
6134 "scvtf v4.4s, v4.4s\n"
6135 "scvtf v5.4s, v5.4s\n"
6136 "scvtf v6.4s, v6.4s\n"
6137 "scvtf v7.4s, v7.4s\n"
6138 "fmul v0.4s, v0.4s, v9.4s\n"
6139 "fmul v1.4s, v1.4s, v9.4s\n"
6140 "fmul v2.4s, v2.4s, v9.4s\n"
6141 "fmul v3.4s, v3.4s, v9.4s\n"
6142 "fmul v4.4s, v4.4s, v11.4s\n"
6143 "fmul v5.4s, v5.4s, v11.4s\n"
6144 "fmul v6.4s, v6.4s, v11.4s\n"
6145 "fmul v7.4s, v7.4s, v11.4s\n"
6146 "fadd v0.4s, v0.4s, v8.4s\n"
6147 "fadd v1.4s, v1.4s, v8.4s\n"
6148 "fadd v2.4s, v2.4s, v8.4s\n"
6149 "fadd v3.4s, v3.4s, v8.4s\n"
6150 "fadd v4.4s, v4.4s, v10.4s\n"
6151 "fadd v5.4s, v5.4s, v10.4s\n"
6152 "fadd v6.4s, v6.4s, v10.4s\n"
6153 "fadd v7.4s, v7.4s, v10.4s\n"
6154 "fadd v0.4s, v0.4s, v4.4s\n"
6155 "fadd v1.4s, v1.4s, v5.4s\n"
6156 "fadd v2.4s, v2.4s, v6.4s\n"
6157 "fadd v3.4s, v3.4s, v7.4s\n"
6158 "fsub v0.4s, v0.4s, v12.4s\n"
6159 "fsub v1.4s, v1.4s, v12.4s\n"
6160 "fsub v2.4s, v2.4s, v12.4s\n"
6161 "fsub v3.4s, v3.4s, v12.4s\n"
6162 "fmul v0.4s, v0.4s, v13.4s\n"
6163 "fmul v1.4s, v1.4s, v13.4s\n"
6164 "fmul v2.4s, v2.4s, v13.4s\n"
6165 "fmul v3.4s, v3.4s, v13.4s\n"
6166 "fadd v0.4s, v0.4s, v14.4s\n"
6167 "fadd v1.4s, v1.4s, v14.4s\n"
6168 "fadd v2.4s, v2.4s, v14.4s\n"
6169 "fadd v3.4s, v3.4s, v14.4s\n"
6170 "fcvtzs v0.4s, v0.4s\n"
6171 "fcvtzs v1.4s, v1.4s\n"
6172 "fcvtzs v2.4s, v2.4s\n"
6173 "fcvtzs v3.4s, v3.4s\n"
6174
6175 "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
6176 "prfm pldl1keep, [%x[output]]\n"
6177 "bne 2b\n"
6178 "3:"
6179
6180 // BiasAdd::Transform
6181 "ld1 {v0.s}[0], [%x[input]], #4\n"
6182 "ld1 {v0.b}[4], [%x[input]], #1\n"
6183 "ld1 {v2.s}[0], [x1], #4\n"
6184 "ld1 {v2.b}[4], [x1], #1\n"
6185 "prfm pldl1keep, [%x[input], #32]\n"
6186 "uxtl v0.8h, v0.8b\n"
6187 "uxtl v2.8h, v2.8b\n"
6188 "sxtl2 v1.4s, v0.8h\n"
6189 "sxtl v0.4s, v0.4h\n"
6190 "sxtl2 v3.4s, v2.8h\n"
6191 "sxtl v2.4s, v2.4h\n"
6192 "scvtf v0.4s, v0.4s\n"
6193 "scvtf v1.4s, v1.4s\n"
6194 "scvtf v2.4s, v2.4s\n"
6195 "scvtf v3.4s, v3.4s\n"
6196 "fmul v0.4s, v0.4s, v9.4s\n"
6197 "fmul v1.4s, v1.4s, v9.4s\n"
6198 "fmul v2.4s, v2.4s, v11.4s\n"
6199 "fmul v3.4s, v3.4s, v11.4s\n"
6200 "fadd v0.4s, v0.4s, v8.4s\n"
6201 "fadd v1.4s, v1.4s, v8.4s\n"
6202 "fadd v2.4s, v2.4s, v10.4s\n"
6203 "fadd v3.4s, v3.4s, v10.4s\n"
6204 "fadd v0.4s, v0.4s, v2.4s\n"
6205 "fadd v1.4s, v1.4s, v3.4s\n"
6206 "fsub v0.4s, v0.4s, v12.4s\n"
6207 "fsub v1.4s, v1.4s, v12.4s\n"
6208 "fmul v0.4s, v0.4s, v13.4s\n"
6209 "fmul v1.4s, v1.4s, v13.4s\n"
6210 "fadd v0.4s, v0.4s, v14.4s\n"
6211 "fadd v1.4s, v1.4s, v14.4s\n"
6212 "fcvtzs v0.4s, v0.4s\n"
6213 "fcvtzs v1.4s, v1.4s\n"
6214
6215 "st1 {v0.4s}, [%x[output]], #16\n"
6216 "st1 {v1.s}[0], [%x[output]], #4\n"
6217 "prfm pldl1keep, [%x[output]]\n"
6218 "subs %x[rows], %x[rows], #1\n"
6219 "bne 1b\n"
6220 : [input] "+r"(input), [output] "+r"(output)
6221 : [count] "r"(params.count), [rows] "r"(params_rows_copy),
6222 [output_range_offset] "m"(params.output_range_offset),
6223 [input_range_scale] "m"(params.input_range_scale),
6224 [one_over_output_range_scale] "m"(params.one_over_output_range_scale),
6225 [bias_range_min] "m"(params.bias_range_min),
6226 [output_range_min] "m"(params.output_range_min),
6227 [bias_range_scale] "m"(params.bias_range_scale),
6228 [bias] "r"(params.bias), [input_range_min] "m"(params.input_range_min)
6229 : "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
6230 "v10", "v11", "v12", "v13", "v14", "cc", "memory");
6231 }
6232
6233 template <>
6234 inline void Transform1DKernel<uint8_t, int32_t, BiasAdd<uint8_t>, 16,
Transform(const uint8_t * input,const BiasAdd<uint8_t> & params,int32_t * output)6235 6>::Transform(const uint8_t* input,
6236 const BiasAdd<uint8_t>& params,
6237 int32_t* output) {
6238 #ifdef DEBUG
6239 #ifdef DEBUG_METAGEMM_VERBOSE
6240 std::cout << __FILE__ << "(" << __LINE__
6241 << ") BiasAdd<uint8_t><uint8_t, int32_t, BiasAdd<uint8_t>, 16, "
6242 "6>::Transform()"
6243 << std::endl
6244 << std::flush;
6245 #endif
6246 #endif
6247 int params_rows_copy = params.rows;
6248 asm volatile(
6249 "ldr w0, %[input_range_min]\n"
6250 "dup v8.4s, w0\n"
6251 "ldr w0, %[input_range_scale]\n"
6252 "dup v9.4s, w0\n"
6253 "ldr w0, %[bias_range_min]\n"
6254 "dup v10.4s, w0\n"
6255 "ldr w0, %[bias_range_scale]\n"
6256 "dup v11.4s, w0\n"
6257 "ldr w0, %[output_range_min]\n"
6258 "dup v12.4s, w0\n"
6259 "ldr w0, %[one_over_output_range_scale]\n"
6260 "dup v13.4s, w0\n"
6261 "ldr w0, %[output_range_offset]\n"
6262 "dup v14.4s, w0\n"
6263 "1:"
6264 "mov x0, %x[count]\n"
6265 "mov x1, %x[bias]\n"
6266 "subs x0, x0, #6\n"
6267 "beq 3f\n"
6268 "2:"
6269 "subs x0, x0, #16\n"
6270
6271 // BiasAdd::Transform
6272 "ld1 {v0.4s}, [%x[input]], #16\n"
6273 "ld1 {v4.4s}, [x1], #16\n"
6274 "prfm pldl1keep, [%x[input], #32]\n"
6275 "uxtl2 v1.8h, v0.16b\n"
6276 "uxtl v0.8h, v0.8b\n"
6277 "uxtl2 v5.8h, v4.16b\n"
6278 "uxtl v4.8h, v4.8b\n"
6279 "sxtl2 v3.4s, v1.8h\n"
6280 "sxtl v2.4s, v1.4h\n"
6281 "sxtl2 v7.4s, v5.8h\n"
6282 "sxtl v6.4s, v5.4h\n"
6283 "sxtl2 v1.4s, v0.8h\n"
6284 "sxtl v0.4s, v0.4h\n"
6285 "sxtl2 v5.4s, v4.8h\n"
6286 "sxtl v4.4s, v4.4h\n"
6287 "scvtf v0.4s, v0.4s\n"
6288 "scvtf v1.4s, v1.4s\n"
6289 "scvtf v2.4s, v2.4s\n"
6290 "scvtf v3.4s, v3.4s\n"
6291 "scvtf v4.4s, v4.4s\n"
6292 "scvtf v5.4s, v5.4s\n"
6293 "scvtf v6.4s, v6.4s\n"
6294 "scvtf v7.4s, v7.4s\n"
6295 "fmul v0.4s, v0.4s, v9.4s\n"
6296 "fmul v1.4s, v1.4s, v9.4s\n"
6297 "fmul v2.4s, v2.4s, v9.4s\n"
6298 "fmul v3.4s, v3.4s, v9.4s\n"
6299 "fmul v4.4s, v4.4s, v11.4s\n"
6300 "fmul v5.4s, v5.4s, v11.4s\n"
6301 "fmul v6.4s, v6.4s, v11.4s\n"
6302 "fmul v7.4s, v7.4s, v11.4s\n"
6303 "fadd v0.4s, v0.4s, v8.4s\n"
6304 "fadd v1.4s, v1.4s, v8.4s\n"
6305 "fadd v2.4s, v2.4s, v8.4s\n"
6306 "fadd v3.4s, v3.4s, v8.4s\n"
6307 "fadd v4.4s, v4.4s, v10.4s\n"
6308 "fadd v5.4s, v5.4s, v10.4s\n"
6309 "fadd v6.4s, v6.4s, v10.4s\n"
6310 "fadd v7.4s, v7.4s, v10.4s\n"
6311 "fadd v0.4s, v0.4s, v4.4s\n"
6312 "fadd v1.4s, v1.4s, v5.4s\n"
6313 "fadd v2.4s, v2.4s, v6.4s\n"
6314 "fadd v3.4s, v3.4s, v7.4s\n"
6315 "fsub v0.4s, v0.4s, v12.4s\n"
6316 "fsub v1.4s, v1.4s, v12.4s\n"
6317 "fsub v2.4s, v2.4s, v12.4s\n"
6318 "fsub v3.4s, v3.4s, v12.4s\n"
6319 "fmul v0.4s, v0.4s, v13.4s\n"
6320 "fmul v1.4s, v1.4s, v13.4s\n"
6321 "fmul v2.4s, v2.4s, v13.4s\n"
6322 "fmul v3.4s, v3.4s, v13.4s\n"
6323 "fadd v0.4s, v0.4s, v14.4s\n"
6324 "fadd v1.4s, v1.4s, v14.4s\n"
6325 "fadd v2.4s, v2.4s, v14.4s\n"
6326 "fadd v3.4s, v3.4s, v14.4s\n"
6327 "fcvtzs v0.4s, v0.4s\n"
6328 "fcvtzs v1.4s, v1.4s\n"
6329 "fcvtzs v2.4s, v2.4s\n"
6330 "fcvtzs v3.4s, v3.4s\n"
6331
6332 "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
6333 "prfm pldl1keep, [%x[output]]\n"
6334 "bne 2b\n"
6335 "3:"
6336
6337 // BiasAdd::Transform
6338 "ld1 {v0.s}[0], [%x[input]], #4\n"
6339 "ld1 {v0.h}[2], [%x[input]], #2\n"
6340 "ld1 {v2.s}[0], [x1], #4\n"
6341 "ld1 {v2.h}[2], [x1], #2\n"
6342 "prfm pldl1keep, [%x[input], #32]\n"
6343 "uxtl v0.8h, v0.8b\n"
6344 "uxtl v2.8h, v2.8b\n"
6345 "sxtl2 v1.4s, v0.8h\n"
6346 "sxtl v0.4s, v0.4h\n"
6347 "sxtl2 v3.4s, v2.8h\n"
6348 "sxtl v2.4s, v2.4h\n"
6349 "scvtf v0.4s, v0.4s\n"
6350 "scvtf v1.4s, v1.4s\n"
6351 "scvtf v2.4s, v2.4s\n"
6352 "scvtf v3.4s, v3.4s\n"
6353 "fmul v0.4s, v0.4s, v9.4s\n"
6354 "fmul v1.4s, v1.4s, v9.4s\n"
6355 "fmul v2.4s, v2.4s, v11.4s\n"
6356 "fmul v3.4s, v3.4s, v11.4s\n"
6357 "fadd v0.4s, v0.4s, v8.4s\n"
6358 "fadd v1.4s, v1.4s, v8.4s\n"
6359 "fadd v2.4s, v2.4s, v10.4s\n"
6360 "fadd v3.4s, v3.4s, v10.4s\n"
6361 "fadd v0.4s, v0.4s, v2.4s\n"
6362 "fadd v1.4s, v1.4s, v3.4s\n"
6363 "fsub v0.4s, v0.4s, v12.4s\n"
6364 "fsub v1.4s, v1.4s, v12.4s\n"
6365 "fmul v0.4s, v0.4s, v13.4s\n"
6366 "fmul v1.4s, v1.4s, v13.4s\n"
6367 "fadd v0.4s, v0.4s, v14.4s\n"
6368 "fadd v1.4s, v1.4s, v14.4s\n"
6369 "fcvtzs v0.4s, v0.4s\n"
6370 "fcvtzs v1.4s, v1.4s\n"
6371
6372 "st1 {v0.4s}, [%x[output]], #16\n"
6373 "st1 {v1.2s}, [%x[output]], #8\n"
6374 "prfm pldl1keep, [%x[output]]\n"
6375 "subs %x[rows], %x[rows], #1\n"
6376 "bne 1b\n"
6377 : [input] "+r"(input), [output] "+r"(output)
6378 : [count] "r"(params.count), [rows] "r"(params_rows_copy),
6379 [output_range_offset] "m"(params.output_range_offset),
6380 [input_range_scale] "m"(params.input_range_scale),
6381 [one_over_output_range_scale] "m"(params.one_over_output_range_scale),
6382 [bias_range_min] "m"(params.bias_range_min),
6383 [output_range_min] "m"(params.output_range_min),
6384 [bias_range_scale] "m"(params.bias_range_scale),
6385 [bias] "r"(params.bias), [input_range_min] "m"(params.input_range_min)
6386 : "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
6387 "v10", "v11", "v12", "v13", "v14", "cc", "memory");
6388 }
6389
6390 template <>
6391 inline void Transform1DKernel<uint8_t, int32_t, BiasAdd<uint8_t>, 16,
Transform(const uint8_t * input,const BiasAdd<uint8_t> & params,int32_t * output)6392 7>::Transform(const uint8_t* input,
6393 const BiasAdd<uint8_t>& params,
6394 int32_t* output) {
6395 #ifdef DEBUG
6396 #ifdef DEBUG_METAGEMM_VERBOSE
6397 std::cout << __FILE__ << "(" << __LINE__
6398 << ") BiasAdd<uint8_t><uint8_t, int32_t, BiasAdd<uint8_t>, 16, "
6399 "7>::Transform()"
6400 << std::endl
6401 << std::flush;
6402 #endif
6403 #endif
6404 int params_rows_copy = params.rows;
6405 asm volatile(
6406 "ldr w0, %[input_range_min]\n"
6407 "dup v8.4s, w0\n"
6408 "ldr w0, %[input_range_scale]\n"
6409 "dup v9.4s, w0\n"
6410 "ldr w0, %[bias_range_min]\n"
6411 "dup v10.4s, w0\n"
6412 "ldr w0, %[bias_range_scale]\n"
6413 "dup v11.4s, w0\n"
6414 "ldr w0, %[output_range_min]\n"
6415 "dup v12.4s, w0\n"
6416 "ldr w0, %[one_over_output_range_scale]\n"
6417 "dup v13.4s, w0\n"
6418 "ldr w0, %[output_range_offset]\n"
6419 "dup v14.4s, w0\n"
6420 "1:"
6421 "mov x0, %x[count]\n"
6422 "mov x1, %x[bias]\n"
6423 "subs x0, x0, #7\n"
6424 "beq 3f\n"
6425 "2:"
6426 "subs x0, x0, #16\n"
6427
6428 // BiasAdd::Transform
6429 "ld1 {v0.4s}, [%x[input]], #16\n"
6430 "ld1 {v4.4s}, [x1], #16\n"
6431 "prfm pldl1keep, [%x[input], #32]\n"
6432 "uxtl2 v1.8h, v0.16b\n"
6433 "uxtl v0.8h, v0.8b\n"
6434 "uxtl2 v5.8h, v4.16b\n"
6435 "uxtl v4.8h, v4.8b\n"
6436 "sxtl2 v3.4s, v1.8h\n"
6437 "sxtl v2.4s, v1.4h\n"
6438 "sxtl2 v7.4s, v5.8h\n"
6439 "sxtl v6.4s, v5.4h\n"
6440 "sxtl2 v1.4s, v0.8h\n"
6441 "sxtl v0.4s, v0.4h\n"
6442 "sxtl2 v5.4s, v4.8h\n"
6443 "sxtl v4.4s, v4.4h\n"
6444 "scvtf v0.4s, v0.4s\n"
6445 "scvtf v1.4s, v1.4s\n"
6446 "scvtf v2.4s, v2.4s\n"
6447 "scvtf v3.4s, v3.4s\n"
6448 "scvtf v4.4s, v4.4s\n"
6449 "scvtf v5.4s, v5.4s\n"
6450 "scvtf v6.4s, v6.4s\n"
6451 "scvtf v7.4s, v7.4s\n"
6452 "fmul v0.4s, v0.4s, v9.4s\n"
6453 "fmul v1.4s, v1.4s, v9.4s\n"
6454 "fmul v2.4s, v2.4s, v9.4s\n"
6455 "fmul v3.4s, v3.4s, v9.4s\n"
6456 "fmul v4.4s, v4.4s, v11.4s\n"
6457 "fmul v5.4s, v5.4s, v11.4s\n"
6458 "fmul v6.4s, v6.4s, v11.4s\n"
6459 "fmul v7.4s, v7.4s, v11.4s\n"
6460 "fadd v0.4s, v0.4s, v8.4s\n"
6461 "fadd v1.4s, v1.4s, v8.4s\n"
6462 "fadd v2.4s, v2.4s, v8.4s\n"
6463 "fadd v3.4s, v3.4s, v8.4s\n"
6464 "fadd v4.4s, v4.4s, v10.4s\n"
6465 "fadd v5.4s, v5.4s, v10.4s\n"
6466 "fadd v6.4s, v6.4s, v10.4s\n"
6467 "fadd v7.4s, v7.4s, v10.4s\n"
6468 "fadd v0.4s, v0.4s, v4.4s\n"
6469 "fadd v1.4s, v1.4s, v5.4s\n"
6470 "fadd v2.4s, v2.4s, v6.4s\n"
6471 "fadd v3.4s, v3.4s, v7.4s\n"
6472 "fsub v0.4s, v0.4s, v12.4s\n"
6473 "fsub v1.4s, v1.4s, v12.4s\n"
6474 "fsub v2.4s, v2.4s, v12.4s\n"
6475 "fsub v3.4s, v3.4s, v12.4s\n"
6476 "fmul v0.4s, v0.4s, v13.4s\n"
6477 "fmul v1.4s, v1.4s, v13.4s\n"
6478 "fmul v2.4s, v2.4s, v13.4s\n"
6479 "fmul v3.4s, v3.4s, v13.4s\n"
6480 "fadd v0.4s, v0.4s, v14.4s\n"
6481 "fadd v1.4s, v1.4s, v14.4s\n"
6482 "fadd v2.4s, v2.4s, v14.4s\n"
6483 "fadd v3.4s, v3.4s, v14.4s\n"
6484 "fcvtzs v0.4s, v0.4s\n"
6485 "fcvtzs v1.4s, v1.4s\n"
6486 "fcvtzs v2.4s, v2.4s\n"
6487 "fcvtzs v3.4s, v3.4s\n"
6488
6489 "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
6490 "prfm pldl1keep, [%x[output]]\n"
6491 "bne 2b\n"
6492 "3:"
6493
6494 // BiasAdd::Transform
6495 "ld1 {v0.s}[0], [%x[input]], #4\n"
6496 "ld1 {v0.h}[2], [%x[input]], #2\n"
6497 "ld1 {v0.b}[6], [%x[input]], #1\n"
6498 "ld1 {v2.s}[0], [x1], #4\n"
6499 "ld1 {v2.h}[2], [x1], #2\n"
6500 "ld1 {v2.b}[6], [x1], #1\n"
6501 "prfm pldl1keep, [%x[input], #32]\n"
6502 "uxtl v0.8h, v0.8b\n"
6503 "uxtl v2.8h, v2.8b\n"
6504 "sxtl2 v1.4s, v0.8h\n"
6505 "sxtl v0.4s, v0.4h\n"
6506 "sxtl2 v3.4s, v2.8h\n"
6507 "sxtl v2.4s, v2.4h\n"
6508 "scvtf v0.4s, v0.4s\n"
6509 "scvtf v1.4s, v1.4s\n"
6510 "scvtf v2.4s, v2.4s\n"
6511 "scvtf v3.4s, v3.4s\n"
6512 "fmul v0.4s, v0.4s, v9.4s\n"
6513 "fmul v1.4s, v1.4s, v9.4s\n"
6514 "fmul v2.4s, v2.4s, v11.4s\n"
6515 "fmul v3.4s, v3.4s, v11.4s\n"
6516 "fadd v0.4s, v0.4s, v8.4s\n"
6517 "fadd v1.4s, v1.4s, v8.4s\n"
6518 "fadd v2.4s, v2.4s, v10.4s\n"
6519 "fadd v3.4s, v3.4s, v10.4s\n"
6520 "fadd v0.4s, v0.4s, v2.4s\n"
6521 "fadd v1.4s, v1.4s, v3.4s\n"
6522 "fsub v0.4s, v0.4s, v12.4s\n"
6523 "fsub v1.4s, v1.4s, v12.4s\n"
6524 "fmul v0.4s, v0.4s, v13.4s\n"
6525 "fmul v1.4s, v1.4s, v13.4s\n"
6526 "fadd v0.4s, v0.4s, v14.4s\n"
6527 "fadd v1.4s, v1.4s, v14.4s\n"
6528 "fcvtzs v0.4s, v0.4s\n"
6529 "fcvtzs v1.4s, v1.4s\n"
6530
6531 "st1 {v0.4s}, [%x[output]], #16\n"
6532 "st1 {v1.2s}, [%x[output]], #8\n"
6533 "st1 {v1.s}[2], [%x[output]], #4\n"
6534 "prfm pldl1keep, [%x[output]]\n"
6535 "subs %x[rows], %x[rows], #1\n"
6536 "bne 1b\n"
6537 : [input] "+r"(input), [output] "+r"(output)
6538 : [count] "r"(params.count), [rows] "r"(params_rows_copy),
6539 [output_range_offset] "m"(params.output_range_offset),
6540 [input_range_scale] "m"(params.input_range_scale),
6541 [one_over_output_range_scale] "m"(params.one_over_output_range_scale),
6542 [bias_range_min] "m"(params.bias_range_min),
6543 [output_range_min] "m"(params.output_range_min),
6544 [bias_range_scale] "m"(params.bias_range_scale),
6545 [bias] "r"(params.bias), [input_range_min] "m"(params.input_range_min)
6546 : "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
6547 "v10", "v11", "v12", "v13", "v14", "cc", "memory");
6548 }
6549
6550 template <>
6551 inline void Transform1DKernel<uint8_t, int32_t, BiasAdd<uint8_t>, 16,
Transform(const uint8_t * input,const BiasAdd<uint8_t> & params,int32_t * output)6552 8>::Transform(const uint8_t* input,
6553 const BiasAdd<uint8_t>& params,
6554 int32_t* output) {
6555 #ifdef DEBUG
6556 #ifdef DEBUG_METAGEMM_VERBOSE
6557 std::cout << __FILE__ << "(" << __LINE__
6558 << ") BiasAdd<uint8_t><uint8_t, int32_t, BiasAdd<uint8_t>, 16, "
6559 "8>::Transform()"
6560 << std::endl
6561 << std::flush;
6562 #endif
6563 #endif
6564 int params_rows_copy = params.rows;
6565 asm volatile(
6566 "ldr w0, %[input_range_min]\n"
6567 "dup v8.4s, w0\n"
6568 "ldr w0, %[input_range_scale]\n"
6569 "dup v9.4s, w0\n"
6570 "ldr w0, %[bias_range_min]\n"
6571 "dup v10.4s, w0\n"
6572 "ldr w0, %[bias_range_scale]\n"
6573 "dup v11.4s, w0\n"
6574 "ldr w0, %[output_range_min]\n"
6575 "dup v12.4s, w0\n"
6576 "ldr w0, %[one_over_output_range_scale]\n"
6577 "dup v13.4s, w0\n"
6578 "ldr w0, %[output_range_offset]\n"
6579 "dup v14.4s, w0\n"
6580 "1:"
6581 "mov x0, %x[count]\n"
6582 "mov x1, %x[bias]\n"
6583 "subs x0, x0, #8\n"
6584 "beq 3f\n"
6585 "2:"
6586 "subs x0, x0, #16\n"
6587
6588 // BiasAdd::Transform
6589 "ld1 {v0.4s}, [%x[input]], #16\n"
6590 "ld1 {v4.4s}, [x1], #16\n"
6591 "prfm pldl1keep, [%x[input], #32]\n"
6592 "uxtl2 v1.8h, v0.16b\n"
6593 "uxtl v0.8h, v0.8b\n"
6594 "uxtl2 v5.8h, v4.16b\n"
6595 "uxtl v4.8h, v4.8b\n"
6596 "sxtl2 v3.4s, v1.8h\n"
6597 "sxtl v2.4s, v1.4h\n"
6598 "sxtl2 v7.4s, v5.8h\n"
6599 "sxtl v6.4s, v5.4h\n"
6600 "sxtl2 v1.4s, v0.8h\n"
6601 "sxtl v0.4s, v0.4h\n"
6602 "sxtl2 v5.4s, v4.8h\n"
6603 "sxtl v4.4s, v4.4h\n"
6604 "scvtf v0.4s, v0.4s\n"
6605 "scvtf v1.4s, v1.4s\n"
6606 "scvtf v2.4s, v2.4s\n"
6607 "scvtf v3.4s, v3.4s\n"
6608 "scvtf v4.4s, v4.4s\n"
6609 "scvtf v5.4s, v5.4s\n"
6610 "scvtf v6.4s, v6.4s\n"
6611 "scvtf v7.4s, v7.4s\n"
6612 "fmul v0.4s, v0.4s, v9.4s\n"
6613 "fmul v1.4s, v1.4s, v9.4s\n"
6614 "fmul v2.4s, v2.4s, v9.4s\n"
6615 "fmul v3.4s, v3.4s, v9.4s\n"
6616 "fmul v4.4s, v4.4s, v11.4s\n"
6617 "fmul v5.4s, v5.4s, v11.4s\n"
6618 "fmul v6.4s, v6.4s, v11.4s\n"
6619 "fmul v7.4s, v7.4s, v11.4s\n"
6620 "fadd v0.4s, v0.4s, v8.4s\n"
6621 "fadd v1.4s, v1.4s, v8.4s\n"
6622 "fadd v2.4s, v2.4s, v8.4s\n"
6623 "fadd v3.4s, v3.4s, v8.4s\n"
6624 "fadd v4.4s, v4.4s, v10.4s\n"
6625 "fadd v5.4s, v5.4s, v10.4s\n"
6626 "fadd v6.4s, v6.4s, v10.4s\n"
6627 "fadd v7.4s, v7.4s, v10.4s\n"
6628 "fadd v0.4s, v0.4s, v4.4s\n"
6629 "fadd v1.4s, v1.4s, v5.4s\n"
6630 "fadd v2.4s, v2.4s, v6.4s\n"
6631 "fadd v3.4s, v3.4s, v7.4s\n"
6632 "fsub v0.4s, v0.4s, v12.4s\n"
6633 "fsub v1.4s, v1.4s, v12.4s\n"
6634 "fsub v2.4s, v2.4s, v12.4s\n"
6635 "fsub v3.4s, v3.4s, v12.4s\n"
6636 "fmul v0.4s, v0.4s, v13.4s\n"
6637 "fmul v1.4s, v1.4s, v13.4s\n"
6638 "fmul v2.4s, v2.4s, v13.4s\n"
6639 "fmul v3.4s, v3.4s, v13.4s\n"
6640 "fadd v0.4s, v0.4s, v14.4s\n"
6641 "fadd v1.4s, v1.4s, v14.4s\n"
6642 "fadd v2.4s, v2.4s, v14.4s\n"
6643 "fadd v3.4s, v3.4s, v14.4s\n"
6644 "fcvtzs v0.4s, v0.4s\n"
6645 "fcvtzs v1.4s, v1.4s\n"
6646 "fcvtzs v2.4s, v2.4s\n"
6647 "fcvtzs v3.4s, v3.4s\n"
6648
6649 "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
6650 "prfm pldl1keep, [%x[output]]\n"
6651 "bne 2b\n"
6652 "3:"
6653
6654 // BiasAdd::Transform
6655 "ld1 {v0.2s}, [%x[input]], #8\n"
6656 "ld1 {v2.2s}, [x1], #8\n"
6657 "prfm pldl1keep, [%x[input], #32]\n"
6658 "uxtl v0.8h, v0.8b\n"
6659 "uxtl v2.8h, v2.8b\n"
6660 "sxtl2 v1.4s, v0.8h\n"
6661 "sxtl v0.4s, v0.4h\n"
6662 "sxtl2 v3.4s, v2.8h\n"
6663 "sxtl v2.4s, v2.4h\n"
6664 "scvtf v0.4s, v0.4s\n"
6665 "scvtf v1.4s, v1.4s\n"
6666 "scvtf v2.4s, v2.4s\n"
6667 "scvtf v3.4s, v3.4s\n"
6668 "fmul v0.4s, v0.4s, v9.4s\n"
6669 "fmul v1.4s, v1.4s, v9.4s\n"
6670 "fmul v2.4s, v2.4s, v11.4s\n"
6671 "fmul v3.4s, v3.4s, v11.4s\n"
6672 "fadd v0.4s, v0.4s, v8.4s\n"
6673 "fadd v1.4s, v1.4s, v8.4s\n"
6674 "fadd v2.4s, v2.4s, v10.4s\n"
6675 "fadd v3.4s, v3.4s, v10.4s\n"
6676 "fadd v0.4s, v0.4s, v2.4s\n"
6677 "fadd v1.4s, v1.4s, v3.4s\n"
6678 "fsub v0.4s, v0.4s, v12.4s\n"
6679 "fsub v1.4s, v1.4s, v12.4s\n"
6680 "fmul v0.4s, v0.4s, v13.4s\n"
6681 "fmul v1.4s, v1.4s, v13.4s\n"
6682 "fadd v0.4s, v0.4s, v14.4s\n"
6683 "fadd v1.4s, v1.4s, v14.4s\n"
6684 "fcvtzs v0.4s, v0.4s\n"
6685 "fcvtzs v1.4s, v1.4s\n"
6686
6687 "st1 {v0.4s, v1.4s}, [%x[output]], #32\n"
6688 "prfm pldl1keep, [%x[output]]\n"
6689 "subs %x[rows], %x[rows], #1\n"
6690 "bne 1b\n"
6691 : [input] "+r"(input), [output] "+r"(output)
6692 : [count] "r"(params.count), [rows] "r"(params_rows_copy),
6693 [output_range_offset] "m"(params.output_range_offset),
6694 [input_range_scale] "m"(params.input_range_scale),
6695 [one_over_output_range_scale] "m"(params.one_over_output_range_scale),
6696 [bias_range_min] "m"(params.bias_range_min),
6697 [output_range_min] "m"(params.output_range_min),
6698 [bias_range_scale] "m"(params.bias_range_scale),
6699 [bias] "r"(params.bias), [input_range_min] "m"(params.input_range_min)
6700 : "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
6701 "v10", "v11", "v12", "v13", "v14", "cc", "memory");
6702 }
6703
6704 template <>
6705 inline void Transform1DKernel<uint8_t, int32_t, BiasAdd<uint8_t>, 16,
Transform(const uint8_t * input,const BiasAdd<uint8_t> & params,int32_t * output)6706 9>::Transform(const uint8_t* input,
6707 const BiasAdd<uint8_t>& params,
6708 int32_t* output) {
6709 #ifdef DEBUG
6710 #ifdef DEBUG_METAGEMM_VERBOSE
6711 std::cout << __FILE__ << "(" << __LINE__
6712 << ") BiasAdd<uint8_t><uint8_t, int32_t, BiasAdd<uint8_t>, 16, "
6713 "9>::Transform()"
6714 << std::endl
6715 << std::flush;
6716 #endif
6717 #endif
6718 int params_rows_copy = params.rows;
6719 asm volatile(
6720 "ldr w0, %[input_range_min]\n"
6721 "dup v8.4s, w0\n"
6722 "ldr w0, %[input_range_scale]\n"
6723 "dup v9.4s, w0\n"
6724 "ldr w0, %[bias_range_min]\n"
6725 "dup v10.4s, w0\n"
6726 "ldr w0, %[bias_range_scale]\n"
6727 "dup v11.4s, w0\n"
6728 "ldr w0, %[output_range_min]\n"
6729 "dup v12.4s, w0\n"
6730 "ldr w0, %[one_over_output_range_scale]\n"
6731 "dup v13.4s, w0\n"
6732 "ldr w0, %[output_range_offset]\n"
6733 "dup v14.4s, w0\n"
6734 "1:"
6735 "mov x0, %x[count]\n"
6736 "mov x1, %x[bias]\n"
6737 "subs x0, x0, #9\n"
6738 "beq 3f\n"
6739 "2:"
6740 "subs x0, x0, #16\n"
6741
6742 // BiasAdd::Transform
6743 "ld1 {v0.4s}, [%x[input]], #16\n"
6744 "ld1 {v4.4s}, [x1], #16\n"
6745 "prfm pldl1keep, [%x[input], #32]\n"
6746 "uxtl2 v1.8h, v0.16b\n"
6747 "uxtl v0.8h, v0.8b\n"
6748 "uxtl2 v5.8h, v4.16b\n"
6749 "uxtl v4.8h, v4.8b\n"
6750 "sxtl2 v3.4s, v1.8h\n"
6751 "sxtl v2.4s, v1.4h\n"
6752 "sxtl2 v7.4s, v5.8h\n"
6753 "sxtl v6.4s, v5.4h\n"
6754 "sxtl2 v1.4s, v0.8h\n"
6755 "sxtl v0.4s, v0.4h\n"
6756 "sxtl2 v5.4s, v4.8h\n"
6757 "sxtl v4.4s, v4.4h\n"
6758 "scvtf v0.4s, v0.4s\n"
6759 "scvtf v1.4s, v1.4s\n"
6760 "scvtf v2.4s, v2.4s\n"
6761 "scvtf v3.4s, v3.4s\n"
6762 "scvtf v4.4s, v4.4s\n"
6763 "scvtf v5.4s, v5.4s\n"
6764 "scvtf v6.4s, v6.4s\n"
6765 "scvtf v7.4s, v7.4s\n"
6766 "fmul v0.4s, v0.4s, v9.4s\n"
6767 "fmul v1.4s, v1.4s, v9.4s\n"
6768 "fmul v2.4s, v2.4s, v9.4s\n"
6769 "fmul v3.4s, v3.4s, v9.4s\n"
6770 "fmul v4.4s, v4.4s, v11.4s\n"
6771 "fmul v5.4s, v5.4s, v11.4s\n"
6772 "fmul v6.4s, v6.4s, v11.4s\n"
6773 "fmul v7.4s, v7.4s, v11.4s\n"
6774 "fadd v0.4s, v0.4s, v8.4s\n"
6775 "fadd v1.4s, v1.4s, v8.4s\n"
6776 "fadd v2.4s, v2.4s, v8.4s\n"
6777 "fadd v3.4s, v3.4s, v8.4s\n"
6778 "fadd v4.4s, v4.4s, v10.4s\n"
6779 "fadd v5.4s, v5.4s, v10.4s\n"
6780 "fadd v6.4s, v6.4s, v10.4s\n"
6781 "fadd v7.4s, v7.4s, v10.4s\n"
6782 "fadd v0.4s, v0.4s, v4.4s\n"
6783 "fadd v1.4s, v1.4s, v5.4s\n"
6784 "fadd v2.4s, v2.4s, v6.4s\n"
6785 "fadd v3.4s, v3.4s, v7.4s\n"
6786 "fsub v0.4s, v0.4s, v12.4s\n"
6787 "fsub v1.4s, v1.4s, v12.4s\n"
6788 "fsub v2.4s, v2.4s, v12.4s\n"
6789 "fsub v3.4s, v3.4s, v12.4s\n"
6790 "fmul v0.4s, v0.4s, v13.4s\n"
6791 "fmul v1.4s, v1.4s, v13.4s\n"
6792 "fmul v2.4s, v2.4s, v13.4s\n"
6793 "fmul v3.4s, v3.4s, v13.4s\n"
6794 "fadd v0.4s, v0.4s, v14.4s\n"
6795 "fadd v1.4s, v1.4s, v14.4s\n"
6796 "fadd v2.4s, v2.4s, v14.4s\n"
6797 "fadd v3.4s, v3.4s, v14.4s\n"
6798 "fcvtzs v0.4s, v0.4s\n"
6799 "fcvtzs v1.4s, v1.4s\n"
6800 "fcvtzs v2.4s, v2.4s\n"
6801 "fcvtzs v3.4s, v3.4s\n"
6802
6803 "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
6804 "prfm pldl1keep, [%x[output]]\n"
6805 "bne 2b\n"
6806 "3:"
6807
6808 // BiasAdd::Transform
6809 "ld1 {v0.2s}, [%x[input]], #8\n"
6810 "ld1 {v0.b}[8], [%x[input]], #1\n"
6811 "ld1 {v3.2s}, [x1], #8\n"
6812 "ld1 {v3.b}[8], [x1], #1\n"
6813 "prfm pldl1keep, [%x[input], #32]\n"
6814 "uxtl2 v1.8h, v0.16b\n"
6815 "uxtl v0.8h, v0.8b\n"
6816 "uxtl2 v4.8h, v3.16b\n"
6817 "uxtl v3.8h, v3.8b\n"
6818 "sxtl v2.4s, v1.4h\n"
6819 "sxtl v5.4s, v4.4h\n"
6820 "sxtl2 v1.4s, v0.8h\n"
6821 "sxtl v0.4s, v0.4h\n"
6822 "sxtl2 v4.4s, v3.8h\n"
6823 "sxtl v3.4s, v3.4h\n"
6824 "scvtf v0.4s, v0.4s\n"
6825 "scvtf v1.4s, v1.4s\n"
6826 "scvtf v2.4s, v2.4s\n"
6827 "scvtf v3.4s, v3.4s\n"
6828 "scvtf v4.4s, v4.4s\n"
6829 "scvtf v5.4s, v5.4s\n"
6830 "fmul v0.4s, v0.4s, v9.4s\n"
6831 "fmul v1.4s, v1.4s, v9.4s\n"
6832 "fmul v2.4s, v2.4s, v9.4s\n"
6833 "fmul v3.4s, v3.4s, v11.4s\n"
6834 "fmul v4.4s, v4.4s, v11.4s\n"
6835 "fmul v5.4s, v5.4s, v11.4s\n"
6836 "fadd v0.4s, v0.4s, v8.4s\n"
6837 "fadd v1.4s, v1.4s, v8.4s\n"
6838 "fadd v2.4s, v2.4s, v8.4s\n"
6839 "fadd v3.4s, v3.4s, v10.4s\n"
6840 "fadd v4.4s, v4.4s, v10.4s\n"
6841 "fadd v5.4s, v5.4s, v10.4s\n"
6842 "fadd v0.4s, v0.4s, v3.4s\n"
6843 "fadd v1.4s, v1.4s, v4.4s\n"
6844 "fadd v2.4s, v2.4s, v5.4s\n"
6845 "fsub v0.4s, v0.4s, v12.4s\n"
6846 "fsub v1.4s, v1.4s, v12.4s\n"
6847 "fsub v2.4s, v2.4s, v12.4s\n"
6848 "fmul v0.4s, v0.4s, v13.4s\n"
6849 "fmul v1.4s, v1.4s, v13.4s\n"
6850 "fmul v2.4s, v2.4s, v13.4s\n"
6851 "fadd v0.4s, v0.4s, v14.4s\n"
6852 "fadd v1.4s, v1.4s, v14.4s\n"
6853 "fadd v2.4s, v2.4s, v14.4s\n"
6854 "fcvtzs v0.4s, v0.4s\n"
6855 "fcvtzs v1.4s, v1.4s\n"
6856 "fcvtzs v2.4s, v2.4s\n"
6857
6858 "st1 {v0.4s, v1.4s}, [%x[output]], #32\n"
6859 "st1 {v2.s}[0], [%x[output]], #4\n"
6860 "prfm pldl1keep, [%x[output]]\n"
6861 "subs %x[rows], %x[rows], #1\n"
6862 "bne 1b\n"
6863 : [input] "+r"(input), [output] "+r"(output)
6864 : [count] "r"(params.count), [rows] "r"(params_rows_copy),
6865 [output_range_offset] "m"(params.output_range_offset),
6866 [input_range_scale] "m"(params.input_range_scale),
6867 [one_over_output_range_scale] "m"(params.one_over_output_range_scale),
6868 [bias_range_min] "m"(params.bias_range_min),
6869 [output_range_min] "m"(params.output_range_min),
6870 [bias_range_scale] "m"(params.bias_range_scale),
6871 [bias] "r"(params.bias), [input_range_min] "m"(params.input_range_min)
6872 : "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
6873 "v10", "v11", "v12", "v13", "v14", "cc", "memory");
6874 }
6875
6876 template <>
6877 inline void Transform1DKernel<uint8_t, int32_t, BiasAdd<uint8_t>, 16,
Transform(const uint8_t * input,const BiasAdd<uint8_t> & params,int32_t * output)6878 10>::Transform(const uint8_t* input,
6879 const BiasAdd<uint8_t>& params,
6880 int32_t* output) {
6881 #ifdef DEBUG
6882 #ifdef DEBUG_METAGEMM_VERBOSE
6883 std::cout << __FILE__ << "(" << __LINE__
6884 << ") BiasAdd<uint8_t><uint8_t, int32_t, BiasAdd<uint8_t>, 16, "
6885 "10>::Transform()"
6886 << std::endl
6887 << std::flush;
6888 #endif
6889 #endif
6890 int params_rows_copy = params.rows;
6891 asm volatile(
6892 "ldr w0, %[input_range_min]\n"
6893 "dup v8.4s, w0\n"
6894 "ldr w0, %[input_range_scale]\n"
6895 "dup v9.4s, w0\n"
6896 "ldr w0, %[bias_range_min]\n"
6897 "dup v10.4s, w0\n"
6898 "ldr w0, %[bias_range_scale]\n"
6899 "dup v11.4s, w0\n"
6900 "ldr w0, %[output_range_min]\n"
6901 "dup v12.4s, w0\n"
6902 "ldr w0, %[one_over_output_range_scale]\n"
6903 "dup v13.4s, w0\n"
6904 "ldr w0, %[output_range_offset]\n"
6905 "dup v14.4s, w0\n"
6906 "1:"
6907 "mov x0, %x[count]\n"
6908 "mov x1, %x[bias]\n"
6909 "subs x0, x0, #10\n"
6910 "beq 3f\n"
6911 "2:"
6912 "subs x0, x0, #16\n"
6913
6914 // BiasAdd::Transform
6915 "ld1 {v0.4s}, [%x[input]], #16\n"
6916 "ld1 {v4.4s}, [x1], #16\n"
6917 "prfm pldl1keep, [%x[input], #32]\n"
6918 "uxtl2 v1.8h, v0.16b\n"
6919 "uxtl v0.8h, v0.8b\n"
6920 "uxtl2 v5.8h, v4.16b\n"
6921 "uxtl v4.8h, v4.8b\n"
6922 "sxtl2 v3.4s, v1.8h\n"
6923 "sxtl v2.4s, v1.4h\n"
6924 "sxtl2 v7.4s, v5.8h\n"
6925 "sxtl v6.4s, v5.4h\n"
6926 "sxtl2 v1.4s, v0.8h\n"
6927 "sxtl v0.4s, v0.4h\n"
6928 "sxtl2 v5.4s, v4.8h\n"
6929 "sxtl v4.4s, v4.4h\n"
6930 "scvtf v0.4s, v0.4s\n"
6931 "scvtf v1.4s, v1.4s\n"
6932 "scvtf v2.4s, v2.4s\n"
6933 "scvtf v3.4s, v3.4s\n"
6934 "scvtf v4.4s, v4.4s\n"
6935 "scvtf v5.4s, v5.4s\n"
6936 "scvtf v6.4s, v6.4s\n"
6937 "scvtf v7.4s, v7.4s\n"
6938 "fmul v0.4s, v0.4s, v9.4s\n"
6939 "fmul v1.4s, v1.4s, v9.4s\n"
6940 "fmul v2.4s, v2.4s, v9.4s\n"
6941 "fmul v3.4s, v3.4s, v9.4s\n"
6942 "fmul v4.4s, v4.4s, v11.4s\n"
6943 "fmul v5.4s, v5.4s, v11.4s\n"
6944 "fmul v6.4s, v6.4s, v11.4s\n"
6945 "fmul v7.4s, v7.4s, v11.4s\n"
6946 "fadd v0.4s, v0.4s, v8.4s\n"
6947 "fadd v1.4s, v1.4s, v8.4s\n"
6948 "fadd v2.4s, v2.4s, v8.4s\n"
6949 "fadd v3.4s, v3.4s, v8.4s\n"
6950 "fadd v4.4s, v4.4s, v10.4s\n"
6951 "fadd v5.4s, v5.4s, v10.4s\n"
6952 "fadd v6.4s, v6.4s, v10.4s\n"
6953 "fadd v7.4s, v7.4s, v10.4s\n"
6954 "fadd v0.4s, v0.4s, v4.4s\n"
6955 "fadd v1.4s, v1.4s, v5.4s\n"
6956 "fadd v2.4s, v2.4s, v6.4s\n"
6957 "fadd v3.4s, v3.4s, v7.4s\n"
6958 "fsub v0.4s, v0.4s, v12.4s\n"
6959 "fsub v1.4s, v1.4s, v12.4s\n"
6960 "fsub v2.4s, v2.4s, v12.4s\n"
6961 "fsub v3.4s, v3.4s, v12.4s\n"
6962 "fmul v0.4s, v0.4s, v13.4s\n"
6963 "fmul v1.4s, v1.4s, v13.4s\n"
6964 "fmul v2.4s, v2.4s, v13.4s\n"
6965 "fmul v3.4s, v3.4s, v13.4s\n"
6966 "fadd v0.4s, v0.4s, v14.4s\n"
6967 "fadd v1.4s, v1.4s, v14.4s\n"
6968 "fadd v2.4s, v2.4s, v14.4s\n"
6969 "fadd v3.4s, v3.4s, v14.4s\n"
6970 "fcvtzs v0.4s, v0.4s\n"
6971 "fcvtzs v1.4s, v1.4s\n"
6972 "fcvtzs v2.4s, v2.4s\n"
6973 "fcvtzs v3.4s, v3.4s\n"
6974
6975 "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
6976 "prfm pldl1keep, [%x[output]]\n"
6977 "bne 2b\n"
6978 "3:"
6979
6980 // BiasAdd::Transform
6981 "ld1 {v0.2s}, [%x[input]], #8\n"
6982 "ld1 {v0.h}[4], [%x[input]], #2\n"
6983 "ld1 {v3.2s}, [x1], #8\n"
6984 "ld1 {v3.h}[4], [x1], #2\n"
6985 "prfm pldl1keep, [%x[input], #32]\n"
6986 "uxtl2 v1.8h, v0.16b\n"
6987 "uxtl v0.8h, v0.8b\n"
6988 "uxtl2 v4.8h, v3.16b\n"
6989 "uxtl v3.8h, v3.8b\n"
6990 "sxtl v2.4s, v1.4h\n"
6991 "sxtl v5.4s, v4.4h\n"
6992 "sxtl2 v1.4s, v0.8h\n"
6993 "sxtl v0.4s, v0.4h\n"
6994 "sxtl2 v4.4s, v3.8h\n"
6995 "sxtl v3.4s, v3.4h\n"
6996 "scvtf v0.4s, v0.4s\n"
6997 "scvtf v1.4s, v1.4s\n"
6998 "scvtf v2.4s, v2.4s\n"
6999 "scvtf v3.4s, v3.4s\n"
7000 "scvtf v4.4s, v4.4s\n"
7001 "scvtf v5.4s, v5.4s\n"
7002 "fmul v0.4s, v0.4s, v9.4s\n"
7003 "fmul v1.4s, v1.4s, v9.4s\n"
7004 "fmul v2.4s, v2.4s, v9.4s\n"
7005 "fmul v3.4s, v3.4s, v11.4s\n"
7006 "fmul v4.4s, v4.4s, v11.4s\n"
7007 "fmul v5.4s, v5.4s, v11.4s\n"
7008 "fadd v0.4s, v0.4s, v8.4s\n"
7009 "fadd v1.4s, v1.4s, v8.4s\n"
7010 "fadd v2.4s, v2.4s, v8.4s\n"
7011 "fadd v3.4s, v3.4s, v10.4s\n"
7012 "fadd v4.4s, v4.4s, v10.4s\n"
7013 "fadd v5.4s, v5.4s, v10.4s\n"
7014 "fadd v0.4s, v0.4s, v3.4s\n"
7015 "fadd v1.4s, v1.4s, v4.4s\n"
7016 "fadd v2.4s, v2.4s, v5.4s\n"
7017 "fsub v0.4s, v0.4s, v12.4s\n"
7018 "fsub v1.4s, v1.4s, v12.4s\n"
7019 "fsub v2.4s, v2.4s, v12.4s\n"
7020 "fmul v0.4s, v0.4s, v13.4s\n"
7021 "fmul v1.4s, v1.4s, v13.4s\n"
7022 "fmul v2.4s, v2.4s, v13.4s\n"
7023 "fadd v0.4s, v0.4s, v14.4s\n"
7024 "fadd v1.4s, v1.4s, v14.4s\n"
7025 "fadd v2.4s, v2.4s, v14.4s\n"
7026 "fcvtzs v0.4s, v0.4s\n"
7027 "fcvtzs v1.4s, v1.4s\n"
7028 "fcvtzs v2.4s, v2.4s\n"
7029
7030 "st1 {v0.4s, v1.4s}, [%x[output]], #32\n"
7031 "st1 {v2.2s}, [%x[output]], #8\n"
7032 "prfm pldl1keep, [%x[output]]\n"
7033 "subs %x[rows], %x[rows], #1\n"
7034 "bne 1b\n"
7035 : [input] "+r"(input), [output] "+r"(output)
7036 : [count] "r"(params.count), [rows] "r"(params_rows_copy),
7037 [output_range_offset] "m"(params.output_range_offset),
7038 [input_range_scale] "m"(params.input_range_scale),
7039 [one_over_output_range_scale] "m"(params.one_over_output_range_scale),
7040 [bias_range_min] "m"(params.bias_range_min),
7041 [output_range_min] "m"(params.output_range_min),
7042 [bias_range_scale] "m"(params.bias_range_scale),
7043 [bias] "r"(params.bias), [input_range_min] "m"(params.input_range_min)
7044 : "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
7045 "v10", "v11", "v12", "v13", "v14", "cc", "memory");
7046 }
7047
7048 template <>
7049 inline void Transform1DKernel<uint8_t, int32_t, BiasAdd<uint8_t>, 16,
Transform(const uint8_t * input,const BiasAdd<uint8_t> & params,int32_t * output)7050 11>::Transform(const uint8_t* input,
7051 const BiasAdd<uint8_t>& params,
7052 int32_t* output) {
7053 #ifdef DEBUG
7054 #ifdef DEBUG_METAGEMM_VERBOSE
7055 std::cout << __FILE__ << "(" << __LINE__
7056 << ") BiasAdd<uint8_t><uint8_t, int32_t, BiasAdd<uint8_t>, 16, "
7057 "11>::Transform()"
7058 << std::endl
7059 << std::flush;
7060 #endif
7061 #endif
7062 int params_rows_copy = params.rows;
7063 asm volatile(
7064 "ldr w0, %[input_range_min]\n"
7065 "dup v8.4s, w0\n"
7066 "ldr w0, %[input_range_scale]\n"
7067 "dup v9.4s, w0\n"
7068 "ldr w0, %[bias_range_min]\n"
7069 "dup v10.4s, w0\n"
7070 "ldr w0, %[bias_range_scale]\n"
7071 "dup v11.4s, w0\n"
7072 "ldr w0, %[output_range_min]\n"
7073 "dup v12.4s, w0\n"
7074 "ldr w0, %[one_over_output_range_scale]\n"
7075 "dup v13.4s, w0\n"
7076 "ldr w0, %[output_range_offset]\n"
7077 "dup v14.4s, w0\n"
7078 "1:"
7079 "mov x0, %x[count]\n"
7080 "mov x1, %x[bias]\n"
7081 "subs x0, x0, #11\n"
7082 "beq 3f\n"
7083 "2:"
7084 "subs x0, x0, #16\n"
7085
7086 // BiasAdd::Transform
7087 "ld1 {v0.4s}, [%x[input]], #16\n"
7088 "ld1 {v4.4s}, [x1], #16\n"
7089 "prfm pldl1keep, [%x[input], #32]\n"
7090 "uxtl2 v1.8h, v0.16b\n"
7091 "uxtl v0.8h, v0.8b\n"
7092 "uxtl2 v5.8h, v4.16b\n"
7093 "uxtl v4.8h, v4.8b\n"
7094 "sxtl2 v3.4s, v1.8h\n"
7095 "sxtl v2.4s, v1.4h\n"
7096 "sxtl2 v7.4s, v5.8h\n"
7097 "sxtl v6.4s, v5.4h\n"
7098 "sxtl2 v1.4s, v0.8h\n"
7099 "sxtl v0.4s, v0.4h\n"
7100 "sxtl2 v5.4s, v4.8h\n"
7101 "sxtl v4.4s, v4.4h\n"
7102 "scvtf v0.4s, v0.4s\n"
7103 "scvtf v1.4s, v1.4s\n"
7104 "scvtf v2.4s, v2.4s\n"
7105 "scvtf v3.4s, v3.4s\n"
7106 "scvtf v4.4s, v4.4s\n"
7107 "scvtf v5.4s, v5.4s\n"
7108 "scvtf v6.4s, v6.4s\n"
7109 "scvtf v7.4s, v7.4s\n"
7110 "fmul v0.4s, v0.4s, v9.4s\n"
7111 "fmul v1.4s, v1.4s, v9.4s\n"
7112 "fmul v2.4s, v2.4s, v9.4s\n"
7113 "fmul v3.4s, v3.4s, v9.4s\n"
7114 "fmul v4.4s, v4.4s, v11.4s\n"
7115 "fmul v5.4s, v5.4s, v11.4s\n"
7116 "fmul v6.4s, v6.4s, v11.4s\n"
7117 "fmul v7.4s, v7.4s, v11.4s\n"
7118 "fadd v0.4s, v0.4s, v8.4s\n"
7119 "fadd v1.4s, v1.4s, v8.4s\n"
7120 "fadd v2.4s, v2.4s, v8.4s\n"
7121 "fadd v3.4s, v3.4s, v8.4s\n"
7122 "fadd v4.4s, v4.4s, v10.4s\n"
7123 "fadd v5.4s, v5.4s, v10.4s\n"
7124 "fadd v6.4s, v6.4s, v10.4s\n"
7125 "fadd v7.4s, v7.4s, v10.4s\n"
7126 "fadd v0.4s, v0.4s, v4.4s\n"
7127 "fadd v1.4s, v1.4s, v5.4s\n"
7128 "fadd v2.4s, v2.4s, v6.4s\n"
7129 "fadd v3.4s, v3.4s, v7.4s\n"
7130 "fsub v0.4s, v0.4s, v12.4s\n"
7131 "fsub v1.4s, v1.4s, v12.4s\n"
7132 "fsub v2.4s, v2.4s, v12.4s\n"
7133 "fsub v3.4s, v3.4s, v12.4s\n"
7134 "fmul v0.4s, v0.4s, v13.4s\n"
7135 "fmul v1.4s, v1.4s, v13.4s\n"
7136 "fmul v2.4s, v2.4s, v13.4s\n"
7137 "fmul v3.4s, v3.4s, v13.4s\n"
7138 "fadd v0.4s, v0.4s, v14.4s\n"
7139 "fadd v1.4s, v1.4s, v14.4s\n"
7140 "fadd v2.4s, v2.4s, v14.4s\n"
7141 "fadd v3.4s, v3.4s, v14.4s\n"
7142 "fcvtzs v0.4s, v0.4s\n"
7143 "fcvtzs v1.4s, v1.4s\n"
7144 "fcvtzs v2.4s, v2.4s\n"
7145 "fcvtzs v3.4s, v3.4s\n"
7146
7147 "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
7148 "prfm pldl1keep, [%x[output]]\n"
7149 "bne 2b\n"
7150 "3:"
7151
7152 // BiasAdd::Transform
7153 "ld1 {v0.2s}, [%x[input]], #8\n"
7154 "ld1 {v0.h}[4], [%x[input]], #2\n"
7155 "ld1 {v0.b}[10], [%x[input]], #1\n"
7156 "ld1 {v3.2s}, [x1], #8\n"
7157 "ld1 {v3.h}[4], [x1], #2\n"
7158 "ld1 {v3.b}[10], [x1], #1\n"
7159 "prfm pldl1keep, [%x[input], #32]\n"
7160 "uxtl2 v1.8h, v0.16b\n"
7161 "uxtl v0.8h, v0.8b\n"
7162 "uxtl2 v4.8h, v3.16b\n"
7163 "uxtl v3.8h, v3.8b\n"
7164 "sxtl v2.4s, v1.4h\n"
7165 "sxtl v5.4s, v4.4h\n"
7166 "sxtl2 v1.4s, v0.8h\n"
7167 "sxtl v0.4s, v0.4h\n"
7168 "sxtl2 v4.4s, v3.8h\n"
7169 "sxtl v3.4s, v3.4h\n"
7170 "scvtf v0.4s, v0.4s\n"
7171 "scvtf v1.4s, v1.4s\n"
7172 "scvtf v2.4s, v2.4s\n"
7173 "scvtf v3.4s, v3.4s\n"
7174 "scvtf v4.4s, v4.4s\n"
7175 "scvtf v5.4s, v5.4s\n"
7176 "fmul v0.4s, v0.4s, v9.4s\n"
7177 "fmul v1.4s, v1.4s, v9.4s\n"
7178 "fmul v2.4s, v2.4s, v9.4s\n"
7179 "fmul v3.4s, v3.4s, v11.4s\n"
7180 "fmul v4.4s, v4.4s, v11.4s\n"
7181 "fmul v5.4s, v5.4s, v11.4s\n"
7182 "fadd v0.4s, v0.4s, v8.4s\n"
7183 "fadd v1.4s, v1.4s, v8.4s\n"
7184 "fadd v2.4s, v2.4s, v8.4s\n"
7185 "fadd v3.4s, v3.4s, v10.4s\n"
7186 "fadd v4.4s, v4.4s, v10.4s\n"
7187 "fadd v5.4s, v5.4s, v10.4s\n"
7188 "fadd v0.4s, v0.4s, v3.4s\n"
7189 "fadd v1.4s, v1.4s, v4.4s\n"
7190 "fadd v2.4s, v2.4s, v5.4s\n"
7191 "fsub v0.4s, v0.4s, v12.4s\n"
7192 "fsub v1.4s, v1.4s, v12.4s\n"
7193 "fsub v2.4s, v2.4s, v12.4s\n"
7194 "fmul v0.4s, v0.4s, v13.4s\n"
7195 "fmul v1.4s, v1.4s, v13.4s\n"
7196 "fmul v2.4s, v2.4s, v13.4s\n"
7197 "fadd v0.4s, v0.4s, v14.4s\n"
7198 "fadd v1.4s, v1.4s, v14.4s\n"
7199 "fadd v2.4s, v2.4s, v14.4s\n"
7200 "fcvtzs v0.4s, v0.4s\n"
7201 "fcvtzs v1.4s, v1.4s\n"
7202 "fcvtzs v2.4s, v2.4s\n"
7203
7204 "st1 {v0.4s, v1.4s}, [%x[output]], #32\n"
7205 "st1 {v2.2s}, [%x[output]], #8\n"
7206 "st1 {v2.s}[2], [%x[output]], #4\n"
7207 "prfm pldl1keep, [%x[output]]\n"
7208 "subs %x[rows], %x[rows], #1\n"
7209 "bne 1b\n"
7210 : [input] "+r"(input), [output] "+r"(output)
7211 : [count] "r"(params.count), [rows] "r"(params_rows_copy),
7212 [output_range_offset] "m"(params.output_range_offset),
7213 [input_range_scale] "m"(params.input_range_scale),
7214 [one_over_output_range_scale] "m"(params.one_over_output_range_scale),
7215 [bias_range_min] "m"(params.bias_range_min),
7216 [output_range_min] "m"(params.output_range_min),
7217 [bias_range_scale] "m"(params.bias_range_scale),
7218 [bias] "r"(params.bias), [input_range_min] "m"(params.input_range_min)
7219 : "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
7220 "v10", "v11", "v12", "v13", "v14", "cc", "memory");
7221 }
7222
7223 template <>
7224 inline void Transform1DKernel<uint8_t, int32_t, BiasAdd<uint8_t>, 16,
Transform(const uint8_t * input,const BiasAdd<uint8_t> & params,int32_t * output)7225 12>::Transform(const uint8_t* input,
7226 const BiasAdd<uint8_t>& params,
7227 int32_t* output) {
7228 #ifdef DEBUG
7229 #ifdef DEBUG_METAGEMM_VERBOSE
7230 std::cout << __FILE__ << "(" << __LINE__
7231 << ") BiasAdd<uint8_t><uint8_t, int32_t, BiasAdd<uint8_t>, 16, "
7232 "12>::Transform()"
7233 << std::endl
7234 << std::flush;
7235 #endif
7236 #endif
7237 int params_rows_copy = params.rows;
7238 asm volatile(
7239 "ldr w0, %[input_range_min]\n"
7240 "dup v8.4s, w0\n"
7241 "ldr w0, %[input_range_scale]\n"
7242 "dup v9.4s, w0\n"
7243 "ldr w0, %[bias_range_min]\n"
7244 "dup v10.4s, w0\n"
7245 "ldr w0, %[bias_range_scale]\n"
7246 "dup v11.4s, w0\n"
7247 "ldr w0, %[output_range_min]\n"
7248 "dup v12.4s, w0\n"
7249 "ldr w0, %[one_over_output_range_scale]\n"
7250 "dup v13.4s, w0\n"
7251 "ldr w0, %[output_range_offset]\n"
7252 "dup v14.4s, w0\n"
7253 "1:"
7254 "mov x0, %x[count]\n"
7255 "mov x1, %x[bias]\n"
7256 "subs x0, x0, #12\n"
7257 "beq 3f\n"
7258 "2:"
7259 "subs x0, x0, #16\n"
7260
7261 // BiasAdd::Transform
7262 "ld1 {v0.4s}, [%x[input]], #16\n"
7263 "ld1 {v4.4s}, [x1], #16\n"
7264 "prfm pldl1keep, [%x[input], #32]\n"
7265 "uxtl2 v1.8h, v0.16b\n"
7266 "uxtl v0.8h, v0.8b\n"
7267 "uxtl2 v5.8h, v4.16b\n"
7268 "uxtl v4.8h, v4.8b\n"
7269 "sxtl2 v3.4s, v1.8h\n"
7270 "sxtl v2.4s, v1.4h\n"
7271 "sxtl2 v7.4s, v5.8h\n"
7272 "sxtl v6.4s, v5.4h\n"
7273 "sxtl2 v1.4s, v0.8h\n"
7274 "sxtl v0.4s, v0.4h\n"
7275 "sxtl2 v5.4s, v4.8h\n"
7276 "sxtl v4.4s, v4.4h\n"
7277 "scvtf v0.4s, v0.4s\n"
7278 "scvtf v1.4s, v1.4s\n"
7279 "scvtf v2.4s, v2.4s\n"
7280 "scvtf v3.4s, v3.4s\n"
7281 "scvtf v4.4s, v4.4s\n"
7282 "scvtf v5.4s, v5.4s\n"
7283 "scvtf v6.4s, v6.4s\n"
7284 "scvtf v7.4s, v7.4s\n"
7285 "fmul v0.4s, v0.4s, v9.4s\n"
7286 "fmul v1.4s, v1.4s, v9.4s\n"
7287 "fmul v2.4s, v2.4s, v9.4s\n"
7288 "fmul v3.4s, v3.4s, v9.4s\n"
7289 "fmul v4.4s, v4.4s, v11.4s\n"
7290 "fmul v5.4s, v5.4s, v11.4s\n"
7291 "fmul v6.4s, v6.4s, v11.4s\n"
7292 "fmul v7.4s, v7.4s, v11.4s\n"
7293 "fadd v0.4s, v0.4s, v8.4s\n"
7294 "fadd v1.4s, v1.4s, v8.4s\n"
7295 "fadd v2.4s, v2.4s, v8.4s\n"
7296 "fadd v3.4s, v3.4s, v8.4s\n"
7297 "fadd v4.4s, v4.4s, v10.4s\n"
7298 "fadd v5.4s, v5.4s, v10.4s\n"
7299 "fadd v6.4s, v6.4s, v10.4s\n"
7300 "fadd v7.4s, v7.4s, v10.4s\n"
7301 "fadd v0.4s, v0.4s, v4.4s\n"
7302 "fadd v1.4s, v1.4s, v5.4s\n"
7303 "fadd v2.4s, v2.4s, v6.4s\n"
7304 "fadd v3.4s, v3.4s, v7.4s\n"
7305 "fsub v0.4s, v0.4s, v12.4s\n"
7306 "fsub v1.4s, v1.4s, v12.4s\n"
7307 "fsub v2.4s, v2.4s, v12.4s\n"
7308 "fsub v3.4s, v3.4s, v12.4s\n"
7309 "fmul v0.4s, v0.4s, v13.4s\n"
7310 "fmul v1.4s, v1.4s, v13.4s\n"
7311 "fmul v2.4s, v2.4s, v13.4s\n"
7312 "fmul v3.4s, v3.4s, v13.4s\n"
7313 "fadd v0.4s, v0.4s, v14.4s\n"
7314 "fadd v1.4s, v1.4s, v14.4s\n"
7315 "fadd v2.4s, v2.4s, v14.4s\n"
7316 "fadd v3.4s, v3.4s, v14.4s\n"
7317 "fcvtzs v0.4s, v0.4s\n"
7318 "fcvtzs v1.4s, v1.4s\n"
7319 "fcvtzs v2.4s, v2.4s\n"
7320 "fcvtzs v3.4s, v3.4s\n"
7321
7322 "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
7323 "prfm pldl1keep, [%x[output]]\n"
7324 "bne 2b\n"
7325 "3:"
7326
7327 // BiasAdd::Transform
7328 "ld1 {v0.2s}, [%x[input]], #8\n"
7329 "ld1 {v0.s}[2], [%x[input]], #4\n"
7330 "ld1 {v3.2s}, [x1], #8\n"
7331 "ld1 {v3.s}[2], [x1], #4\n"
7332 "prfm pldl1keep, [%x[input], #32]\n"
7333 "uxtl2 v1.8h, v0.16b\n"
7334 "uxtl v0.8h, v0.8b\n"
7335 "uxtl2 v4.8h, v3.16b\n"
7336 "uxtl v3.8h, v3.8b\n"
7337 "sxtl v2.4s, v1.4h\n"
7338 "sxtl v5.4s, v4.4h\n"
7339 "sxtl2 v1.4s, v0.8h\n"
7340 "sxtl v0.4s, v0.4h\n"
7341 "sxtl2 v4.4s, v3.8h\n"
7342 "sxtl v3.4s, v3.4h\n"
7343 "scvtf v0.4s, v0.4s\n"
7344 "scvtf v1.4s, v1.4s\n"
7345 "scvtf v2.4s, v2.4s\n"
7346 "scvtf v3.4s, v3.4s\n"
7347 "scvtf v4.4s, v4.4s\n"
7348 "scvtf v5.4s, v5.4s\n"
7349 "fmul v0.4s, v0.4s, v9.4s\n"
7350 "fmul v1.4s, v1.4s, v9.4s\n"
7351 "fmul v2.4s, v2.4s, v9.4s\n"
7352 "fmul v3.4s, v3.4s, v11.4s\n"
7353 "fmul v4.4s, v4.4s, v11.4s\n"
7354 "fmul v5.4s, v5.4s, v11.4s\n"
7355 "fadd v0.4s, v0.4s, v8.4s\n"
7356 "fadd v1.4s, v1.4s, v8.4s\n"
7357 "fadd v2.4s, v2.4s, v8.4s\n"
7358 "fadd v3.4s, v3.4s, v10.4s\n"
7359 "fadd v4.4s, v4.4s, v10.4s\n"
7360 "fadd v5.4s, v5.4s, v10.4s\n"
7361 "fadd v0.4s, v0.4s, v3.4s\n"
7362 "fadd v1.4s, v1.4s, v4.4s\n"
7363 "fadd v2.4s, v2.4s, v5.4s\n"
7364 "fsub v0.4s, v0.4s, v12.4s\n"
7365 "fsub v1.4s, v1.4s, v12.4s\n"
7366 "fsub v2.4s, v2.4s, v12.4s\n"
7367 "fmul v0.4s, v0.4s, v13.4s\n"
7368 "fmul v1.4s, v1.4s, v13.4s\n"
7369 "fmul v2.4s, v2.4s, v13.4s\n"
7370 "fadd v0.4s, v0.4s, v14.4s\n"
7371 "fadd v1.4s, v1.4s, v14.4s\n"
7372 "fadd v2.4s, v2.4s, v14.4s\n"
7373 "fcvtzs v0.4s, v0.4s\n"
7374 "fcvtzs v1.4s, v1.4s\n"
7375 "fcvtzs v2.4s, v2.4s\n"
7376
7377 "st1 {v0.4s, v1.4s, v2.4s}, [%x[output]], #48\n"
7378 "prfm pldl1keep, [%x[output]]\n"
7379 "subs %x[rows], %x[rows], #1\n"
7380 "bne 1b\n"
7381 : [input] "+r"(input), [output] "+r"(output)
7382 : [count] "r"(params.count), [rows] "r"(params_rows_copy),
7383 [output_range_offset] "m"(params.output_range_offset),
7384 [input_range_scale] "m"(params.input_range_scale),
7385 [one_over_output_range_scale] "m"(params.one_over_output_range_scale),
7386 [bias_range_min] "m"(params.bias_range_min),
7387 [output_range_min] "m"(params.output_range_min),
7388 [bias_range_scale] "m"(params.bias_range_scale),
7389 [bias] "r"(params.bias), [input_range_min] "m"(params.input_range_min)
7390 : "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
7391 "v10", "v11", "v12", "v13", "v14", "cc", "memory");
7392 }
7393
7394 template <>
7395 inline void Transform1DKernel<uint8_t, int32_t, BiasAdd<uint8_t>, 16,
Transform(const uint8_t * input,const BiasAdd<uint8_t> & params,int32_t * output)7396 13>::Transform(const uint8_t* input,
7397 const BiasAdd<uint8_t>& params,
7398 int32_t* output) {
7399 #ifdef DEBUG
7400 #ifdef DEBUG_METAGEMM_VERBOSE
7401 std::cout << __FILE__ << "(" << __LINE__
7402 << ") BiasAdd<uint8_t><uint8_t, int32_t, BiasAdd<uint8_t>, 16, "
7403 "13>::Transform()"
7404 << std::endl
7405 << std::flush;
7406 #endif
7407 #endif
7408 int params_rows_copy = params.rows;
7409 asm volatile(
7410 "ldr w0, %[input_range_min]\n"
7411 "dup v8.4s, w0\n"
7412 "ldr w0, %[input_range_scale]\n"
7413 "dup v9.4s, w0\n"
7414 "ldr w0, %[bias_range_min]\n"
7415 "dup v10.4s, w0\n"
7416 "ldr w0, %[bias_range_scale]\n"
7417 "dup v11.4s, w0\n"
7418 "ldr w0, %[output_range_min]\n"
7419 "dup v12.4s, w0\n"
7420 "ldr w0, %[one_over_output_range_scale]\n"
7421 "dup v13.4s, w0\n"
7422 "ldr w0, %[output_range_offset]\n"
7423 "dup v14.4s, w0\n"
7424 "1:"
7425 "mov x0, %x[count]\n"
7426 "mov x1, %x[bias]\n"
7427 "subs x0, x0, #13\n"
7428 "beq 3f\n"
7429 "2:"
7430 "subs x0, x0, #16\n"
7431
7432 // BiasAdd::Transform
7433 "ld1 {v0.4s}, [%x[input]], #16\n"
7434 "ld1 {v4.4s}, [x1], #16\n"
7435 "prfm pldl1keep, [%x[input], #32]\n"
7436 "uxtl2 v1.8h, v0.16b\n"
7437 "uxtl v0.8h, v0.8b\n"
7438 "uxtl2 v5.8h, v4.16b\n"
7439 "uxtl v4.8h, v4.8b\n"
7440 "sxtl2 v3.4s, v1.8h\n"
7441 "sxtl v2.4s, v1.4h\n"
7442 "sxtl2 v7.4s, v5.8h\n"
7443 "sxtl v6.4s, v5.4h\n"
7444 "sxtl2 v1.4s, v0.8h\n"
7445 "sxtl v0.4s, v0.4h\n"
7446 "sxtl2 v5.4s, v4.8h\n"
7447 "sxtl v4.4s, v4.4h\n"
7448 "scvtf v0.4s, v0.4s\n"
7449 "scvtf v1.4s, v1.4s\n"
7450 "scvtf v2.4s, v2.4s\n"
7451 "scvtf v3.4s, v3.4s\n"
7452 "scvtf v4.4s, v4.4s\n"
7453 "scvtf v5.4s, v5.4s\n"
7454 "scvtf v6.4s, v6.4s\n"
7455 "scvtf v7.4s, v7.4s\n"
7456 "fmul v0.4s, v0.4s, v9.4s\n"
7457 "fmul v1.4s, v1.4s, v9.4s\n"
7458 "fmul v2.4s, v2.4s, v9.4s\n"
7459 "fmul v3.4s, v3.4s, v9.4s\n"
7460 "fmul v4.4s, v4.4s, v11.4s\n"
7461 "fmul v5.4s, v5.4s, v11.4s\n"
7462 "fmul v6.4s, v6.4s, v11.4s\n"
7463 "fmul v7.4s, v7.4s, v11.4s\n"
7464 "fadd v0.4s, v0.4s, v8.4s\n"
7465 "fadd v1.4s, v1.4s, v8.4s\n"
7466 "fadd v2.4s, v2.4s, v8.4s\n"
7467 "fadd v3.4s, v3.4s, v8.4s\n"
7468 "fadd v4.4s, v4.4s, v10.4s\n"
7469 "fadd v5.4s, v5.4s, v10.4s\n"
7470 "fadd v6.4s, v6.4s, v10.4s\n"
7471 "fadd v7.4s, v7.4s, v10.4s\n"
7472 "fadd v0.4s, v0.4s, v4.4s\n"
7473 "fadd v1.4s, v1.4s, v5.4s\n"
7474 "fadd v2.4s, v2.4s, v6.4s\n"
7475 "fadd v3.4s, v3.4s, v7.4s\n"
7476 "fsub v0.4s, v0.4s, v12.4s\n"
7477 "fsub v1.4s, v1.4s, v12.4s\n"
7478 "fsub v2.4s, v2.4s, v12.4s\n"
7479 "fsub v3.4s, v3.4s, v12.4s\n"
7480 "fmul v0.4s, v0.4s, v13.4s\n"
7481 "fmul v1.4s, v1.4s, v13.4s\n"
7482 "fmul v2.4s, v2.4s, v13.4s\n"
7483 "fmul v3.4s, v3.4s, v13.4s\n"
7484 "fadd v0.4s, v0.4s, v14.4s\n"
7485 "fadd v1.4s, v1.4s, v14.4s\n"
7486 "fadd v2.4s, v2.4s, v14.4s\n"
7487 "fadd v3.4s, v3.4s, v14.4s\n"
7488 "fcvtzs v0.4s, v0.4s\n"
7489 "fcvtzs v1.4s, v1.4s\n"
7490 "fcvtzs v2.4s, v2.4s\n"
7491 "fcvtzs v3.4s, v3.4s\n"
7492
7493 "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
7494 "prfm pldl1keep, [%x[output]]\n"
7495 "bne 2b\n"
7496 "3:"
7497
7498 // BiasAdd::Transform
7499 "ld1 {v0.2s}, [%x[input]], #8\n"
7500 "ld1 {v0.s}[2], [%x[input]], #4\n"
7501 "ld1 {v0.b}[12], [%x[input]], #1\n"
7502 "ld1 {v4.2s}, [x1], #8\n"
7503 "ld1 {v4.s}[2], [x1], #4\n"
7504 "ld1 {v4.b}[12], [x1], #1\n"
7505 "prfm pldl1keep, [%x[input], #32]\n"
7506 "uxtl2 v1.8h, v0.16b\n"
7507 "uxtl v0.8h, v0.8b\n"
7508 "uxtl2 v5.8h, v4.16b\n"
7509 "uxtl v4.8h, v4.8b\n"
7510 "sxtl2 v3.4s, v1.8h\n"
7511 "sxtl v2.4s, v1.4h\n"
7512 "sxtl2 v7.4s, v5.8h\n"
7513 "sxtl v6.4s, v5.4h\n"
7514 "sxtl2 v1.4s, v0.8h\n"
7515 "sxtl v0.4s, v0.4h\n"
7516 "sxtl2 v5.4s, v4.8h\n"
7517 "sxtl v4.4s, v4.4h\n"
7518 "scvtf v0.4s, v0.4s\n"
7519 "scvtf v1.4s, v1.4s\n"
7520 "scvtf v2.4s, v2.4s\n"
7521 "scvtf v3.4s, v3.4s\n"
7522 "scvtf v4.4s, v4.4s\n"
7523 "scvtf v5.4s, v5.4s\n"
7524 "scvtf v6.4s, v6.4s\n"
7525 "scvtf v7.4s, v7.4s\n"
7526 "fmul v0.4s, v0.4s, v9.4s\n"
7527 "fmul v1.4s, v1.4s, v9.4s\n"
7528 "fmul v2.4s, v2.4s, v9.4s\n"
7529 "fmul v3.4s, v3.4s, v9.4s\n"
7530 "fmul v4.4s, v4.4s, v11.4s\n"
7531 "fmul v5.4s, v5.4s, v11.4s\n"
7532 "fmul v6.4s, v6.4s, v11.4s\n"
7533 "fmul v7.4s, v7.4s, v11.4s\n"
7534 "fadd v0.4s, v0.4s, v8.4s\n"
7535 "fadd v1.4s, v1.4s, v8.4s\n"
7536 "fadd v2.4s, v2.4s, v8.4s\n"
7537 "fadd v3.4s, v3.4s, v8.4s\n"
7538 "fadd v4.4s, v4.4s, v10.4s\n"
7539 "fadd v5.4s, v5.4s, v10.4s\n"
7540 "fadd v6.4s, v6.4s, v10.4s\n"
7541 "fadd v7.4s, v7.4s, v10.4s\n"
7542 "fadd v0.4s, v0.4s, v4.4s\n"
7543 "fadd v1.4s, v1.4s, v5.4s\n"
7544 "fadd v2.4s, v2.4s, v6.4s\n"
7545 "fadd v3.4s, v3.4s, v7.4s\n"
7546 "fsub v0.4s, v0.4s, v12.4s\n"
7547 "fsub v1.4s, v1.4s, v12.4s\n"
7548 "fsub v2.4s, v2.4s, v12.4s\n"
7549 "fsub v3.4s, v3.4s, v12.4s\n"
7550 "fmul v0.4s, v0.4s, v13.4s\n"
7551 "fmul v1.4s, v1.4s, v13.4s\n"
7552 "fmul v2.4s, v2.4s, v13.4s\n"
7553 "fmul v3.4s, v3.4s, v13.4s\n"
7554 "fadd v0.4s, v0.4s, v14.4s\n"
7555 "fadd v1.4s, v1.4s, v14.4s\n"
7556 "fadd v2.4s, v2.4s, v14.4s\n"
7557 "fadd v3.4s, v3.4s, v14.4s\n"
7558 "fcvtzs v0.4s, v0.4s\n"
7559 "fcvtzs v1.4s, v1.4s\n"
7560 "fcvtzs v2.4s, v2.4s\n"
7561 "fcvtzs v3.4s, v3.4s\n"
7562
7563 "st1 {v0.4s, v1.4s, v2.4s}, [%x[output]], #48\n"
7564 "st1 {v3.s}[0], [%x[output]], #4\n"
7565 "prfm pldl1keep, [%x[output]]\n"
7566 "subs %x[rows], %x[rows], #1\n"
7567 "bne 1b\n"
7568 : [input] "+r"(input), [output] "+r"(output)
7569 : [count] "r"(params.count), [rows] "r"(params_rows_copy),
7570 [output_range_offset] "m"(params.output_range_offset),
7571 [input_range_scale] "m"(params.input_range_scale),
7572 [one_over_output_range_scale] "m"(params.one_over_output_range_scale),
7573 [bias_range_min] "m"(params.bias_range_min),
7574 [output_range_min] "m"(params.output_range_min),
7575 [bias_range_scale] "m"(params.bias_range_scale),
7576 [bias] "r"(params.bias), [input_range_min] "m"(params.input_range_min)
7577 : "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
7578 "v10", "v11", "v12", "v13", "v14", "cc", "memory");
7579 }
7580
7581 template <>
7582 inline void Transform1DKernel<uint8_t, int32_t, BiasAdd<uint8_t>, 16,
Transform(const uint8_t * input,const BiasAdd<uint8_t> & params,int32_t * output)7583 14>::Transform(const uint8_t* input,
7584 const BiasAdd<uint8_t>& params,
7585 int32_t* output) {
7586 #ifdef DEBUG
7587 #ifdef DEBUG_METAGEMM_VERBOSE
7588 std::cout << __FILE__ << "(" << __LINE__
7589 << ") BiasAdd<uint8_t><uint8_t, int32_t, BiasAdd<uint8_t>, 16, "
7590 "14>::Transform()"
7591 << std::endl
7592 << std::flush;
7593 #endif
7594 #endif
7595 int params_rows_copy = params.rows;
7596 asm volatile(
7597 "ldr w0, %[input_range_min]\n"
7598 "dup v8.4s, w0\n"
7599 "ldr w0, %[input_range_scale]\n"
7600 "dup v9.4s, w0\n"
7601 "ldr w0, %[bias_range_min]\n"
7602 "dup v10.4s, w0\n"
7603 "ldr w0, %[bias_range_scale]\n"
7604 "dup v11.4s, w0\n"
7605 "ldr w0, %[output_range_min]\n"
7606 "dup v12.4s, w0\n"
7607 "ldr w0, %[one_over_output_range_scale]\n"
7608 "dup v13.4s, w0\n"
7609 "ldr w0, %[output_range_offset]\n"
7610 "dup v14.4s, w0\n"
7611 "1:"
7612 "mov x0, %x[count]\n"
7613 "mov x1, %x[bias]\n"
7614 "subs x0, x0, #14\n"
7615 "beq 3f\n"
7616 "2:"
7617 "subs x0, x0, #16\n"
7618
7619 // BiasAdd::Transform
7620 "ld1 {v0.4s}, [%x[input]], #16\n"
7621 "ld1 {v4.4s}, [x1], #16\n"
7622 "prfm pldl1keep, [%x[input], #32]\n"
7623 "uxtl2 v1.8h, v0.16b\n"
7624 "uxtl v0.8h, v0.8b\n"
7625 "uxtl2 v5.8h, v4.16b\n"
7626 "uxtl v4.8h, v4.8b\n"
7627 "sxtl2 v3.4s, v1.8h\n"
7628 "sxtl v2.4s, v1.4h\n"
7629 "sxtl2 v7.4s, v5.8h\n"
7630 "sxtl v6.4s, v5.4h\n"
7631 "sxtl2 v1.4s, v0.8h\n"
7632 "sxtl v0.4s, v0.4h\n"
7633 "sxtl2 v5.4s, v4.8h\n"
7634 "sxtl v4.4s, v4.4h\n"
7635 "scvtf v0.4s, v0.4s\n"
7636 "scvtf v1.4s, v1.4s\n"
7637 "scvtf v2.4s, v2.4s\n"
7638 "scvtf v3.4s, v3.4s\n"
7639 "scvtf v4.4s, v4.4s\n"
7640 "scvtf v5.4s, v5.4s\n"
7641 "scvtf v6.4s, v6.4s\n"
7642 "scvtf v7.4s, v7.4s\n"
7643 "fmul v0.4s, v0.4s, v9.4s\n"
7644 "fmul v1.4s, v1.4s, v9.4s\n"
7645 "fmul v2.4s, v2.4s, v9.4s\n"
7646 "fmul v3.4s, v3.4s, v9.4s\n"
7647 "fmul v4.4s, v4.4s, v11.4s\n"
7648 "fmul v5.4s, v5.4s, v11.4s\n"
7649 "fmul v6.4s, v6.4s, v11.4s\n"
7650 "fmul v7.4s, v7.4s, v11.4s\n"
7651 "fadd v0.4s, v0.4s, v8.4s\n"
7652 "fadd v1.4s, v1.4s, v8.4s\n"
7653 "fadd v2.4s, v2.4s, v8.4s\n"
7654 "fadd v3.4s, v3.4s, v8.4s\n"
7655 "fadd v4.4s, v4.4s, v10.4s\n"
7656 "fadd v5.4s, v5.4s, v10.4s\n"
7657 "fadd v6.4s, v6.4s, v10.4s\n"
7658 "fadd v7.4s, v7.4s, v10.4s\n"
7659 "fadd v0.4s, v0.4s, v4.4s\n"
7660 "fadd v1.4s, v1.4s, v5.4s\n"
7661 "fadd v2.4s, v2.4s, v6.4s\n"
7662 "fadd v3.4s, v3.4s, v7.4s\n"
7663 "fsub v0.4s, v0.4s, v12.4s\n"
7664 "fsub v1.4s, v1.4s, v12.4s\n"
7665 "fsub v2.4s, v2.4s, v12.4s\n"
7666 "fsub v3.4s, v3.4s, v12.4s\n"
7667 "fmul v0.4s, v0.4s, v13.4s\n"
7668 "fmul v1.4s, v1.4s, v13.4s\n"
7669 "fmul v2.4s, v2.4s, v13.4s\n"
7670 "fmul v3.4s, v3.4s, v13.4s\n"
7671 "fadd v0.4s, v0.4s, v14.4s\n"
7672 "fadd v1.4s, v1.4s, v14.4s\n"
7673 "fadd v2.4s, v2.4s, v14.4s\n"
7674 "fadd v3.4s, v3.4s, v14.4s\n"
7675 "fcvtzs v0.4s, v0.4s\n"
7676 "fcvtzs v1.4s, v1.4s\n"
7677 "fcvtzs v2.4s, v2.4s\n"
7678 "fcvtzs v3.4s, v3.4s\n"
7679
7680 "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
7681 "prfm pldl1keep, [%x[output]]\n"
7682 "bne 2b\n"
7683 "3:"
7684
7685 // BiasAdd::Transform
7686 "ld1 {v0.2s}, [%x[input]], #8\n"
7687 "ld1 {v0.s}[2], [%x[input]], #4\n"
7688 "ld1 {v0.h}[6], [%x[input]], #2\n"
7689 "ld1 {v4.2s}, [x1], #8\n"
7690 "ld1 {v4.s}[2], [x1], #4\n"
7691 "ld1 {v4.h}[6], [x1], #2\n"
7692 "prfm pldl1keep, [%x[input], #32]\n"
7693 "uxtl2 v1.8h, v0.16b\n"
7694 "uxtl v0.8h, v0.8b\n"
7695 "uxtl2 v5.8h, v4.16b\n"
7696 "uxtl v4.8h, v4.8b\n"
7697 "sxtl2 v3.4s, v1.8h\n"
7698 "sxtl v2.4s, v1.4h\n"
7699 "sxtl2 v7.4s, v5.8h\n"
7700 "sxtl v6.4s, v5.4h\n"
7701 "sxtl2 v1.4s, v0.8h\n"
7702 "sxtl v0.4s, v0.4h\n"
7703 "sxtl2 v5.4s, v4.8h\n"
7704 "sxtl v4.4s, v4.4h\n"
7705 "scvtf v0.4s, v0.4s\n"
7706 "scvtf v1.4s, v1.4s\n"
7707 "scvtf v2.4s, v2.4s\n"
7708 "scvtf v3.4s, v3.4s\n"
7709 "scvtf v4.4s, v4.4s\n"
7710 "scvtf v5.4s, v5.4s\n"
7711 "scvtf v6.4s, v6.4s\n"
7712 "scvtf v7.4s, v7.4s\n"
7713 "fmul v0.4s, v0.4s, v9.4s\n"
7714 "fmul v1.4s, v1.4s, v9.4s\n"
7715 "fmul v2.4s, v2.4s, v9.4s\n"
7716 "fmul v3.4s, v3.4s, v9.4s\n"
7717 "fmul v4.4s, v4.4s, v11.4s\n"
7718 "fmul v5.4s, v5.4s, v11.4s\n"
7719 "fmul v6.4s, v6.4s, v11.4s\n"
7720 "fmul v7.4s, v7.4s, v11.4s\n"
7721 "fadd v0.4s, v0.4s, v8.4s\n"
7722 "fadd v1.4s, v1.4s, v8.4s\n"
7723 "fadd v2.4s, v2.4s, v8.4s\n"
7724 "fadd v3.4s, v3.4s, v8.4s\n"
7725 "fadd v4.4s, v4.4s, v10.4s\n"
7726 "fadd v5.4s, v5.4s, v10.4s\n"
7727 "fadd v6.4s, v6.4s, v10.4s\n"
7728 "fadd v7.4s, v7.4s, v10.4s\n"
7729 "fadd v0.4s, v0.4s, v4.4s\n"
7730 "fadd v1.4s, v1.4s, v5.4s\n"
7731 "fadd v2.4s, v2.4s, v6.4s\n"
7732 "fadd v3.4s, v3.4s, v7.4s\n"
7733 "fsub v0.4s, v0.4s, v12.4s\n"
7734 "fsub v1.4s, v1.4s, v12.4s\n"
7735 "fsub v2.4s, v2.4s, v12.4s\n"
7736 "fsub v3.4s, v3.4s, v12.4s\n"
7737 "fmul v0.4s, v0.4s, v13.4s\n"
7738 "fmul v1.4s, v1.4s, v13.4s\n"
7739 "fmul v2.4s, v2.4s, v13.4s\n"
7740 "fmul v3.4s, v3.4s, v13.4s\n"
7741 "fadd v0.4s, v0.4s, v14.4s\n"
7742 "fadd v1.4s, v1.4s, v14.4s\n"
7743 "fadd v2.4s, v2.4s, v14.4s\n"
7744 "fadd v3.4s, v3.4s, v14.4s\n"
7745 "fcvtzs v0.4s, v0.4s\n"
7746 "fcvtzs v1.4s, v1.4s\n"
7747 "fcvtzs v2.4s, v2.4s\n"
7748 "fcvtzs v3.4s, v3.4s\n"
7749
7750 "st1 {v0.4s, v1.4s, v2.4s}, [%x[output]], #48\n"
7751 "st1 {v3.2s}, [%x[output]], #8\n"
7752 "prfm pldl1keep, [%x[output]]\n"
7753 "subs %x[rows], %x[rows], #1\n"
7754 "bne 1b\n"
7755 : [input] "+r"(input), [output] "+r"(output)
7756 : [count] "r"(params.count), [rows] "r"(params_rows_copy),
7757 [output_range_offset] "m"(params.output_range_offset),
7758 [input_range_scale] "m"(params.input_range_scale),
7759 [one_over_output_range_scale] "m"(params.one_over_output_range_scale),
7760 [bias_range_min] "m"(params.bias_range_min),
7761 [output_range_min] "m"(params.output_range_min),
7762 [bias_range_scale] "m"(params.bias_range_scale),
7763 [bias] "r"(params.bias), [input_range_min] "m"(params.input_range_min)
7764 : "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
7765 "v10", "v11", "v12", "v13", "v14", "cc", "memory");
7766 }
7767
7768 template <>
7769 inline void Transform1DKernel<uint8_t, int32_t, BiasAdd<uint8_t>, 16,
Transform(const uint8_t * input,const BiasAdd<uint8_t> & params,int32_t * output)7770 15>::Transform(const uint8_t* input,
7771 const BiasAdd<uint8_t>& params,
7772 int32_t* output) {
7773 #ifdef DEBUG
7774 #ifdef DEBUG_METAGEMM_VERBOSE
7775 std::cout << __FILE__ << "(" << __LINE__
7776 << ") BiasAdd<uint8_t><uint8_t, int32_t, BiasAdd<uint8_t>, 16, "
7777 "15>::Transform()"
7778 << std::endl
7779 << std::flush;
7780 #endif
7781 #endif
7782 int params_rows_copy = params.rows;
7783 asm volatile(
7784 "ldr w0, %[input_range_min]\n"
7785 "dup v8.4s, w0\n"
7786 "ldr w0, %[input_range_scale]\n"
7787 "dup v9.4s, w0\n"
7788 "ldr w0, %[bias_range_min]\n"
7789 "dup v10.4s, w0\n"
7790 "ldr w0, %[bias_range_scale]\n"
7791 "dup v11.4s, w0\n"
7792 "ldr w0, %[output_range_min]\n"
7793 "dup v12.4s, w0\n"
7794 "ldr w0, %[one_over_output_range_scale]\n"
7795 "dup v13.4s, w0\n"
7796 "ldr w0, %[output_range_offset]\n"
7797 "dup v14.4s, w0\n"
7798 "1:"
7799 "mov x0, %x[count]\n"
7800 "mov x1, %x[bias]\n"
7801 "subs x0, x0, #15\n"
7802 "beq 3f\n"
7803 "2:"
7804 "subs x0, x0, #16\n"
7805
7806 // BiasAdd::Transform
7807 "ld1 {v0.4s}, [%x[input]], #16\n"
7808 "ld1 {v4.4s}, [x1], #16\n"
7809 "prfm pldl1keep, [%x[input], #32]\n"
7810 "uxtl2 v1.8h, v0.16b\n"
7811 "uxtl v0.8h, v0.8b\n"
7812 "uxtl2 v5.8h, v4.16b\n"
7813 "uxtl v4.8h, v4.8b\n"
7814 "sxtl2 v3.4s, v1.8h\n"
7815 "sxtl v2.4s, v1.4h\n"
7816 "sxtl2 v7.4s, v5.8h\n"
7817 "sxtl v6.4s, v5.4h\n"
7818 "sxtl2 v1.4s, v0.8h\n"
7819 "sxtl v0.4s, v0.4h\n"
7820 "sxtl2 v5.4s, v4.8h\n"
7821 "sxtl v4.4s, v4.4h\n"
7822 "scvtf v0.4s, v0.4s\n"
7823 "scvtf v1.4s, v1.4s\n"
7824 "scvtf v2.4s, v2.4s\n"
7825 "scvtf v3.4s, v3.4s\n"
7826 "scvtf v4.4s, v4.4s\n"
7827 "scvtf v5.4s, v5.4s\n"
7828 "scvtf v6.4s, v6.4s\n"
7829 "scvtf v7.4s, v7.4s\n"
7830 "fmul v0.4s, v0.4s, v9.4s\n"
7831 "fmul v1.4s, v1.4s, v9.4s\n"
7832 "fmul v2.4s, v2.4s, v9.4s\n"
7833 "fmul v3.4s, v3.4s, v9.4s\n"
7834 "fmul v4.4s, v4.4s, v11.4s\n"
7835 "fmul v5.4s, v5.4s, v11.4s\n"
7836 "fmul v6.4s, v6.4s, v11.4s\n"
7837 "fmul v7.4s, v7.4s, v11.4s\n"
7838 "fadd v0.4s, v0.4s, v8.4s\n"
7839 "fadd v1.4s, v1.4s, v8.4s\n"
7840 "fadd v2.4s, v2.4s, v8.4s\n"
7841 "fadd v3.4s, v3.4s, v8.4s\n"
7842 "fadd v4.4s, v4.4s, v10.4s\n"
7843 "fadd v5.4s, v5.4s, v10.4s\n"
7844 "fadd v6.4s, v6.4s, v10.4s\n"
7845 "fadd v7.4s, v7.4s, v10.4s\n"
7846 "fadd v0.4s, v0.4s, v4.4s\n"
7847 "fadd v1.4s, v1.4s, v5.4s\n"
7848 "fadd v2.4s, v2.4s, v6.4s\n"
7849 "fadd v3.4s, v3.4s, v7.4s\n"
7850 "fsub v0.4s, v0.4s, v12.4s\n"
7851 "fsub v1.4s, v1.4s, v12.4s\n"
7852 "fsub v2.4s, v2.4s, v12.4s\n"
7853 "fsub v3.4s, v3.4s, v12.4s\n"
7854 "fmul v0.4s, v0.4s, v13.4s\n"
7855 "fmul v1.4s, v1.4s, v13.4s\n"
7856 "fmul v2.4s, v2.4s, v13.4s\n"
7857 "fmul v3.4s, v3.4s, v13.4s\n"
7858 "fadd v0.4s, v0.4s, v14.4s\n"
7859 "fadd v1.4s, v1.4s, v14.4s\n"
7860 "fadd v2.4s, v2.4s, v14.4s\n"
7861 "fadd v3.4s, v3.4s, v14.4s\n"
7862 "fcvtzs v0.4s, v0.4s\n"
7863 "fcvtzs v1.4s, v1.4s\n"
7864 "fcvtzs v2.4s, v2.4s\n"
7865 "fcvtzs v3.4s, v3.4s\n"
7866
7867 "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
7868 "prfm pldl1keep, [%x[output]]\n"
7869 "bne 2b\n"
7870 "3:"
7871
7872 // BiasAdd::Transform
7873 "ld1 {v0.2s}, [%x[input]], #8\n"
7874 "ld1 {v0.s}[2], [%x[input]], #4\n"
7875 "ld1 {v0.h}[6], [%x[input]], #2\n"
7876 "ld1 {v0.b}[14], [%x[input]], #1\n"
7877 "ld1 {v4.2s}, [x1], #8\n"
7878 "ld1 {v4.s}[2], [x1], #4\n"
7879 "ld1 {v4.h}[6], [x1], #2\n"
7880 "ld1 {v4.b}[14], [x1], #1\n"
7881 "prfm pldl1keep, [%x[input], #32]\n"
7882 "uxtl2 v1.8h, v0.16b\n"
7883 "uxtl v0.8h, v0.8b\n"
7884 "uxtl2 v5.8h, v4.16b\n"
7885 "uxtl v4.8h, v4.8b\n"
7886 "sxtl2 v3.4s, v1.8h\n"
7887 "sxtl v2.4s, v1.4h\n"
7888 "sxtl2 v7.4s, v5.8h\n"
7889 "sxtl v6.4s, v5.4h\n"
7890 "sxtl2 v1.4s, v0.8h\n"
7891 "sxtl v0.4s, v0.4h\n"
7892 "sxtl2 v5.4s, v4.8h\n"
7893 "sxtl v4.4s, v4.4h\n"
7894 "scvtf v0.4s, v0.4s\n"
7895 "scvtf v1.4s, v1.4s\n"
7896 "scvtf v2.4s, v2.4s\n"
7897 "scvtf v3.4s, v3.4s\n"
7898 "scvtf v4.4s, v4.4s\n"
7899 "scvtf v5.4s, v5.4s\n"
7900 "scvtf v6.4s, v6.4s\n"
7901 "scvtf v7.4s, v7.4s\n"
7902 "fmul v0.4s, v0.4s, v9.4s\n"
7903 "fmul v1.4s, v1.4s, v9.4s\n"
7904 "fmul v2.4s, v2.4s, v9.4s\n"
7905 "fmul v3.4s, v3.4s, v9.4s\n"
7906 "fmul v4.4s, v4.4s, v11.4s\n"
7907 "fmul v5.4s, v5.4s, v11.4s\n"
7908 "fmul v6.4s, v6.4s, v11.4s\n"
7909 "fmul v7.4s, v7.4s, v11.4s\n"
7910 "fadd v0.4s, v0.4s, v8.4s\n"
7911 "fadd v1.4s, v1.4s, v8.4s\n"
7912 "fadd v2.4s, v2.4s, v8.4s\n"
7913 "fadd v3.4s, v3.4s, v8.4s\n"
7914 "fadd v4.4s, v4.4s, v10.4s\n"
7915 "fadd v5.4s, v5.4s, v10.4s\n"
7916 "fadd v6.4s, v6.4s, v10.4s\n"
7917 "fadd v7.4s, v7.4s, v10.4s\n"
7918 "fadd v0.4s, v0.4s, v4.4s\n"
7919 "fadd v1.4s, v1.4s, v5.4s\n"
7920 "fadd v2.4s, v2.4s, v6.4s\n"
7921 "fadd v3.4s, v3.4s, v7.4s\n"
7922 "fsub v0.4s, v0.4s, v12.4s\n"
7923 "fsub v1.4s, v1.4s, v12.4s\n"
7924 "fsub v2.4s, v2.4s, v12.4s\n"
7925 "fsub v3.4s, v3.4s, v12.4s\n"
7926 "fmul v0.4s, v0.4s, v13.4s\n"
7927 "fmul v1.4s, v1.4s, v13.4s\n"
7928 "fmul v2.4s, v2.4s, v13.4s\n"
7929 "fmul v3.4s, v3.4s, v13.4s\n"
7930 "fadd v0.4s, v0.4s, v14.4s\n"
7931 "fadd v1.4s, v1.4s, v14.4s\n"
7932 "fadd v2.4s, v2.4s, v14.4s\n"
7933 "fadd v3.4s, v3.4s, v14.4s\n"
7934 "fcvtzs v0.4s, v0.4s\n"
7935 "fcvtzs v1.4s, v1.4s\n"
7936 "fcvtzs v2.4s, v2.4s\n"
7937 "fcvtzs v3.4s, v3.4s\n"
7938
7939 "st1 {v0.4s, v1.4s, v2.4s}, [%x[output]], #48\n"
7940 "st1 {v3.2s}, [%x[output]], #8\n"
7941 "st1 {v3.s}[2], [%x[output]], #4\n"
7942 "prfm pldl1keep, [%x[output]]\n"
7943 "subs %x[rows], %x[rows], #1\n"
7944 "bne 1b\n"
7945 : [input] "+r"(input), [output] "+r"(output)
7946 : [count] "r"(params.count), [rows] "r"(params_rows_copy),
7947 [output_range_offset] "m"(params.output_range_offset),
7948 [input_range_scale] "m"(params.input_range_scale),
7949 [one_over_output_range_scale] "m"(params.one_over_output_range_scale),
7950 [bias_range_min] "m"(params.bias_range_min),
7951 [output_range_min] "m"(params.output_range_min),
7952 [bias_range_scale] "m"(params.bias_range_scale),
7953 [bias] "r"(params.bias), [input_range_min] "m"(params.input_range_min)
7954 : "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
7955 "v10", "v11", "v12", "v13", "v14", "cc", "memory");
7956 }
7957
7958 } // namespace meta
7959 } // namespace gemmlowp
7960
7961 #else
7962 #warning "Meta gemm for arm64 requires: GEMMLOWP_NEON_64!"
7963 #endif
7964
7965 #endif // GEMMLOWP_META_TRANSFORM_KERNELS_ARM_64_H_
7966