• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2016 The Gemmlowp Authors. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //     http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #ifndef GEMMLOWP_META_QUANTIZED_MUL_KERNELS_ARM_64_H_
16 #define GEMMLOWP_META_QUANTIZED_MUL_KERNELS_ARM_64_H_
17 
18 #ifdef GEMMLOWP_NEON_64
19 
20 #include <cassert>
21 #include <cstdint>
22 
23 namespace gemmlowp {
24 namespace meta {
25 
26 template <>
27 inline void
28 MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 1, 1,
Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessed,RowMajor> & params,uint8_t * result)29           8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
30                        const FusedKernelParams<QuantizedStaticPreprocessed,
31                                                RowMajor>& params,
32                        uint8_t* result) {
33 #ifdef DEBUG
34 #ifdef DEBUG_METAGEMM_VERBOSE
35   std::cout << __FILE__ << "(" << __LINE__
36             << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
37                "QuantizedStaticPreprocessed, RowMajor, 1, 1, 8>::Multiply()"
38             << std::endl
39             << std::flush;
40 #endif
41 #endif
42   asm volatile(
43       "prfm pldl1keep, [%x[lhs]]\n"
44       "prfm pldl1keep, [%x[rhs]]\n"
45 
46       // Clear aggregators.
47       "movi v0.4s, #0\n"
48 
49       // General NxM lanes loop.
50       "1:"
51 
52       // Subtract counter.
53       "subs %x[count], %x[count], #8\n"
54 
55       "ld1 {v1.2s}, [%x[lhs]], #8\n"
56       "ld1 {v2.2s}, [%x[rhs]], #8\n"
57       "prfm pldl1keep, [%x[lhs], #64]\n"
58       "prfm pldl1keep, [%x[rhs], #64]\n"
59       "umull v3.8h, v2.8b, v1.8b\n"
60       "uadalp v0.4s, v3.8h\n"
61 
62       // Loop break.
63       "bgt 1b\n"
64 
65       // StaticQuantization::Prepare
66       "ld1 {v4.4s}, [%x[lhs]], #16\n"
67       "ld1 {v5.4s}, [%x[rhs]], #16\n"
68       "dup v6.4s, %w[multiplicative_offset]\n"
69       "dup v7.4s, %w[rounding_offset]\n"
70       "dup v8.4s, %w[shift]\n"
71       "dup v4.4s, v4.s[0]\n"
72 
73       // RowMajorOutput::Prepare
74 
75       // Reduce aggregators.
76       "addp v0.4s, v0.4s, v0.4s\n"
77       "addp v0.4s, v0.4s, v0.4s\n"
78 
79       // StaticQuantization::Transform
80       "add v0.4s, v0.4s, v4.4s\n"
81       "add v0.4s, v0.4s, v5.4s\n"
82       "mul v0.4s, v0.4s, v6.4s\n"
83       "add v0.4s, v0.4s, v7.4s\n"
84       "sshl v0.4s, v0.4s, v8.4s\n"
85       "sqxtn v0.4h, v0.4s\n"
86       "sqxtun v0.8b, v0.8h\n"
87 
88       // RowMajorOutput::Output
89       "st1 {v0.b}[0], [%x[result]], #1\n"
90       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
91       : [count] "r"(params.kernel.count),
92         [multiplicative_offset] "r"(params.kernel.multiplicative_offset),
93         [shift] "r"(params.kernel.shift),
94         [stride] "r"(params.output_stream.stride),
95         [rounding_offset] "r"(params.kernel.rounding_offset)
96       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "cc", "memory");
97 }
98 
99 template <>
100 inline void
101 MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 1, 2,
Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessed,RowMajor> & params,uint8_t * result)102           8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
103                        const FusedKernelParams<QuantizedStaticPreprocessed,
104                                                RowMajor>& params,
105                        uint8_t* result) {
106 #ifdef DEBUG
107 #ifdef DEBUG_METAGEMM_VERBOSE
108   std::cout << __FILE__ << "(" << __LINE__
109             << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
110                "QuantizedStaticPreprocessed, RowMajor, 1, 2, 8>::Multiply()"
111             << std::endl
112             << std::flush;
113 #endif
114 #endif
115   asm volatile(
116       "prfm pldl1keep, [%x[lhs]]\n"
117       "prfm pldl1keep, [%x[rhs]]\n"
118 
119       // Clear aggregators.
120       "movi v0.4s, #0\n"
121       "movi v1.4s, #0\n"
122 
123       // General NxM lanes loop.
124       "1:"
125 
126       // Subtract counter.
127       "subs %x[count], %x[count], #8\n"
128 
129       "ld1 {v2.2s}, [%x[lhs]], #8\n"
130       "ld1 {v3.2s, v4.2s}, [%x[rhs]], #16\n"
131       "prfm pldl1keep, [%x[lhs], #64]\n"
132       "prfm pldl1keep, [%x[rhs], #64]\n"
133       "umull v5.8h, v3.8b, v2.8b\n"
134       "umull v6.8h, v4.8b, v2.8b\n"
135       "uadalp v0.4s, v5.8h\n"
136       "uadalp v1.4s, v6.8h\n"
137 
138       // Loop break.
139       "bgt 1b\n"
140 
141       // StaticQuantization::Prepare
142       "ld1 {v4.4s}, [%x[lhs]], #16\n"
143       "ld1 {v5.4s}, [%x[rhs]], #16\n"
144       "dup v6.4s, %w[multiplicative_offset]\n"
145       "dup v7.4s, %w[rounding_offset]\n"
146       "dup v8.4s, %w[shift]\n"
147       "dup v4.4s, v4.s[0]\n"
148 
149       // RowMajorOutput::Prepare
150 
151       // Reduce aggregators.
152       "addp v0.4s, v0.4s, v1.4s\n"
153       "addp v0.4s, v0.4s, v0.4s\n"
154 
155       // StaticQuantization::Transform
156       "add v0.4s, v0.4s, v4.4s\n"
157       "add v0.4s, v0.4s, v5.4s\n"
158       "mul v0.4s, v0.4s, v6.4s\n"
159       "add v0.4s, v0.4s, v7.4s\n"
160       "sshl v0.4s, v0.4s, v8.4s\n"
161       "sqxtn v0.4h, v0.4s\n"
162       "sqxtun v0.8b, v0.8h\n"
163 
164       // RowMajorOutput::Output
165       "st1 {v0.h}[0], [%x[result]], #2\n"
166       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
167       : [count] "r"(params.kernel.count),
168         [multiplicative_offset] "r"(params.kernel.multiplicative_offset),
169         [shift] "r"(params.kernel.shift),
170         [stride] "r"(params.output_stream.stride),
171         [rounding_offset] "r"(params.kernel.rounding_offset)
172       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "cc", "memory");
173 }
174 
175 template <>
176 inline void
177 MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 1, 3,
Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessed,RowMajor> & params,uint8_t * result)178           8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
179                        const FusedKernelParams<QuantizedStaticPreprocessed,
180                                                RowMajor>& params,
181                        uint8_t* result) {
182 #ifdef DEBUG
183 #ifdef DEBUG_METAGEMM_VERBOSE
184   std::cout << __FILE__ << "(" << __LINE__
185             << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
186                "QuantizedStaticPreprocessed, RowMajor, 1, 3, 8>::Multiply()"
187             << std::endl
188             << std::flush;
189 #endif
190 #endif
191   asm volatile(
192       "prfm pldl1keep, [%x[lhs]]\n"
193       "prfm pldl1keep, [%x[rhs]]\n"
194 
195       // Clear aggregators.
196       "movi v0.4s, #0\n"
197       "movi v1.4s, #0\n"
198       "movi v2.4s, #0\n"
199 
200       // General NxM lanes loop.
201       "1:"
202 
203       // Subtract counter.
204       "subs %x[count], %x[count], #8\n"
205 
206       "ld1 {v3.2s}, [%x[lhs]], #8\n"
207       "ld1 {v4.2s, v5.2s, v6.2s}, [%x[rhs]], #24\n"
208       "prfm pldl1keep, [%x[lhs], #64]\n"
209       "prfm pldl1keep, [%x[rhs], #64]\n"
210       "umull v7.8h, v4.8b, v3.8b\n"
211       "umull v8.8h, v5.8b, v3.8b\n"
212       "umull v9.8h, v6.8b, v3.8b\n"
213       "uadalp v0.4s, v7.8h\n"
214       "uadalp v1.4s, v8.8h\n"
215       "uadalp v2.4s, v9.8h\n"
216 
217       // Loop break.
218       "bgt 1b\n"
219 
220       // StaticQuantization::Prepare
221       "ld1 {v4.4s}, [%x[lhs]], #16\n"
222       "ld1 {v5.4s}, [%x[rhs]], #16\n"
223       "dup v6.4s, %w[multiplicative_offset]\n"
224       "dup v7.4s, %w[rounding_offset]\n"
225       "dup v8.4s, %w[shift]\n"
226       "dup v4.4s, v4.s[0]\n"
227 
228       // RowMajorOutput::Prepare
229 
230       // Reduce aggregators.
231       "addp v0.4s, v0.4s, v1.4s\n"
232       "addp v2.4s, v2.4s, v2.4s\n"
233       "addp v0.4s, v0.4s, v2.4s\n"
234 
235       // StaticQuantization::Transform
236       "add v0.4s, v0.4s, v4.4s\n"
237       "add v0.4s, v0.4s, v5.4s\n"
238       "mul v0.4s, v0.4s, v6.4s\n"
239       "add v0.4s, v0.4s, v7.4s\n"
240       "sshl v0.4s, v0.4s, v8.4s\n"
241       "sqxtn v0.4h, v0.4s\n"
242       "sqxtun v0.8b, v0.8h\n"
243 
244       // RowMajorOutput::Output
245       "st1 {v0.h}[0], [%x[result]], #2\n"
246       "st1 {v0.b}[2], [%x[result]], #1\n"
247       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
248       : [count] "r"(params.kernel.count),
249         [multiplicative_offset] "r"(params.kernel.multiplicative_offset),
250         [shift] "r"(params.kernel.shift),
251         [stride] "r"(params.output_stream.stride),
252         [rounding_offset] "r"(params.kernel.rounding_offset)
253       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "cc",
254         "memory");
255 }
256 
257 template <>
258 inline void
259 MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 1, 4,
Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessed,RowMajor> & params,uint8_t * result)260           8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
261                        const FusedKernelParams<QuantizedStaticPreprocessed,
262                                                RowMajor>& params,
263                        uint8_t* result) {
264 #ifdef DEBUG
265 #ifdef DEBUG_METAGEMM_VERBOSE
266   std::cout << __FILE__ << "(" << __LINE__
267             << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
268                "QuantizedStaticPreprocessed, RowMajor, 1, 4, 8>::Multiply()"
269             << std::endl
270             << std::flush;
271 #endif
272 #endif
273   asm volatile(
274       "prfm pldl1keep, [%x[lhs]]\n"
275       "prfm pldl1keep, [%x[rhs]]\n"
276 
277       // Clear aggregators.
278       "movi v0.4s, #0\n"
279       "movi v1.4s, #0\n"
280       "movi v2.4s, #0\n"
281       "mov v3.16b, v0.16b\n"
282 
283       // General NxM lanes loop.
284       "1:"
285 
286       // Subtract counter.
287       "subs %x[count], %x[count], #8\n"
288 
289       "ld1 {v4.2s}, [%x[lhs]], #8\n"
290       "ld1 {v5.2s, v6.2s, v7.2s, v8.2s}, [%x[rhs]], #32\n"
291       "prfm pldl1keep, [%x[lhs], #64]\n"
292       "prfm pldl1keep, [%x[rhs], #64]\n"
293       "umull v9.8h, v5.8b, v4.8b\n"
294       "umull v10.8h, v6.8b, v4.8b\n"
295       "umull v11.8h, v7.8b, v4.8b\n"
296       "umull v12.8h, v8.8b, v4.8b\n"
297       "uadalp v0.4s, v9.8h\n"
298       "uadalp v1.4s, v10.8h\n"
299       "uadalp v2.4s, v11.8h\n"
300       "uadalp v3.4s, v12.8h\n"
301 
302       // Loop break.
303       "bgt 1b\n"
304 
305       // StaticQuantization::Prepare
306       "ld1 {v4.4s}, [%x[lhs]], #16\n"
307       "ld1 {v5.4s}, [%x[rhs]], #16\n"
308       "dup v6.4s, %w[multiplicative_offset]\n"
309       "dup v7.4s, %w[rounding_offset]\n"
310       "dup v8.4s, %w[shift]\n"
311       "dup v4.4s, v4.s[0]\n"
312 
313       // RowMajorOutput::Prepare
314 
315       // Reduce aggregators.
316       "addp v0.4s, v0.4s, v1.4s\n"
317       "addp v2.4s, v2.4s, v3.4s\n"
318       "addp v0.4s, v0.4s, v2.4s\n"
319 
320       // StaticQuantization::Transform
321       "add v0.4s, v0.4s, v4.4s\n"
322       "add v0.4s, v0.4s, v5.4s\n"
323       "mul v0.4s, v0.4s, v6.4s\n"
324       "add v0.4s, v0.4s, v7.4s\n"
325       "sshl v0.4s, v0.4s, v8.4s\n"
326       "sqxtn v0.4h, v0.4s\n"
327       "sqxtun v0.8b, v0.8h\n"
328 
329       // RowMajorOutput::Output
330       "st1 {v0.s}[0], [%x[result]], #4\n"
331       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
332       : [count] "r"(params.kernel.count),
333         [multiplicative_offset] "r"(params.kernel.multiplicative_offset),
334         [shift] "r"(params.kernel.shift),
335         [stride] "r"(params.output_stream.stride),
336         [rounding_offset] "r"(params.kernel.rounding_offset)
337       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
338         "v11", "v12", "cc", "memory");
339 }
340 
341 template <>
342 inline void
343 MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 1, 5,
Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessed,RowMajor> & params,uint8_t * result)344           8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
345                        const FusedKernelParams<QuantizedStaticPreprocessed,
346                                                RowMajor>& params,
347                        uint8_t* result) {
348 #ifdef DEBUG
349 #ifdef DEBUG_METAGEMM_VERBOSE
350   std::cout << __FILE__ << "(" << __LINE__
351             << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
352                "QuantizedStaticPreprocessed, RowMajor, 1, 5, 8>::Multiply()"
353             << std::endl
354             << std::flush;
355 #endif
356 #endif
357   asm volatile(
358       "prfm pldl1keep, [%x[lhs]]\n"
359       "prfm pldl1keep, [%x[rhs]]\n"
360 
361       // Clear aggregators.
362       "movi v0.4s, #0\n"
363       "movi v1.4s, #0\n"
364       "movi v2.4s, #0\n"
365       "mov v3.16b, v0.16b\n"
366       "mov v4.16b, v1.16b\n"
367 
368       // General 1xM lanes loop.
369       "1:"
370 
371       // Subtract counter.
372       "subs %x[count], %x[count], #8\n"
373 
374       "ld1 {v5.2s, v6.2s, v7.2s, v8.2s}, [%x[rhs]], #32\n"
375       "ld1 {v9.2s}, [%x[lhs]], #8\n"
376       "prfm pldl1keep, [%x[lhs], #64]\n"
377       "umull v10.8h, v5.8b, v9.8b\n"
378       "umull v11.8h, v6.8b, v9.8b\n"
379       "umull v12.8h, v7.8b, v9.8b\n"
380       "umull v13.8h, v8.8b, v9.8b\n"
381       "ld1 {v5.2s}, [%x[rhs]], #8\n"
382       "prfm pldl1keep, [%x[rhs], #128]\n"
383       "uadalp v0.4s, v10.8h\n"
384       "uadalp v1.4s, v11.8h\n"
385       "uadalp v2.4s, v12.8h\n"
386       "uadalp v3.4s, v13.8h\n"
387       "umull v10.8h, v5.8b, v9.8b\n"
388       "uadalp v4.4s, v10.8h\n"
389 
390       // Loop break.
391       "bgt 1b\n"
392 
393       // StaticQuantization::Prepare
394       "ld1 {v5.4s}, [%x[lhs]], #16\n"
395       "ld1 {v6.4s, v7.4s}, [%x[rhs]], #32\n"
396       "dup v8.4s, %w[multiplicative_offset]\n"
397       "dup v9.4s, %w[rounding_offset]\n"
398       "dup v10.4s, %w[shift]\n"
399       "dup v5.4s, v5.s[0]\n"
400 
401       // RowMajorOutput::Prepare
402 
403       // Reduce aggregators.
404       "addp v0.4s, v0.4s, v1.4s\n"
405       "addp v2.4s, v2.4s, v3.4s\n"
406       "addp v4.4s, v4.4s, v4.4s\n"
407       "addp v0.4s, v0.4s, v2.4s\n"
408       "addp v1.4s, v4.4s, v4.4s\n"
409 
410       // StaticQuantization::Transform
411       "add v0.4s, v0.4s, v5.4s\n"
412       "add v1.4s, v1.4s, v5.4s\n"
413       "add v0.4s, v0.4s, v6.4s\n"
414       "add v1.4s, v1.4s, v7.4s\n"
415       "mul v0.4s, v0.4s, v8.4s\n"
416       "mul v1.4s, v1.4s, v8.4s\n"
417       "add v0.4s, v0.4s, v9.4s\n"
418       "add v1.4s, v1.4s, v9.4s\n"
419       "sshl v0.4s, v0.4s, v10.4s\n"
420       "sshl v1.4s, v1.4s, v10.4s\n"
421       "sqxtn v0.4h, v0.4s\n"
422       "sqxtn2 v0.8h, v1.4s\n"
423       "sqxtun v0.8b, v0.8h\n"
424 
425       // RowMajorOutput::Output
426       "st1 {v0.s}[0], [%x[result]], #4\n"
427       "st1 {v0.b}[4], [%x[result]], #1\n"
428       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
429       : [count] "r"(params.kernel.count),
430         [multiplicative_offset] "r"(params.kernel.multiplicative_offset),
431         [shift] "r"(params.kernel.shift),
432         [stride] "r"(params.output_stream.stride),
433         [rounding_offset] "r"(params.kernel.rounding_offset)
434       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
435         "v11", "v12", "v13", "cc", "memory");
436 }
437 
438 template <>
439 inline void
440 MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 1, 6,
Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessed,RowMajor> & params,uint8_t * result)441           8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
442                        const FusedKernelParams<QuantizedStaticPreprocessed,
443                                                RowMajor>& params,
444                        uint8_t* result) {
445 #ifdef DEBUG
446 #ifdef DEBUG_METAGEMM_VERBOSE
447   std::cout << __FILE__ << "(" << __LINE__
448             << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
449                "QuantizedStaticPreprocessed, RowMajor, 1, 6, 8>::Multiply()"
450             << std::endl
451             << std::flush;
452 #endif
453 #endif
454   asm volatile(
455       "prfm pldl1keep, [%x[lhs]]\n"
456       "prfm pldl1keep, [%x[rhs]]\n"
457 
458       // Clear aggregators.
459       "movi v0.4s, #0\n"
460       "movi v1.4s, #0\n"
461       "movi v2.4s, #0\n"
462       "mov v3.16b, v0.16b\n"
463       "mov v4.16b, v1.16b\n"
464       "mov v5.16b, v2.16b\n"
465 
466       // General 1xM lanes loop.
467       "1:"
468 
469       // Subtract counter.
470       "subs %x[count], %x[count], #8\n"
471 
472       "ld1 {v6.2s, v7.2s, v8.2s, v9.2s}, [%x[rhs]], #32\n"
473       "ld1 {v10.2s}, [%x[lhs]], #8\n"
474       "prfm pldl1keep, [%x[lhs], #64]\n"
475       "umull v11.8h, v6.8b, v10.8b\n"
476       "umull v12.8h, v7.8b, v10.8b\n"
477       "umull v13.8h, v8.8b, v10.8b\n"
478       "umull v14.8h, v9.8b, v10.8b\n"
479       "ld1 {v6.2s, v7.2s}, [%x[rhs]], #16\n"
480       "prfm pldl1keep, [%x[rhs], #128]\n"
481       "uadalp v0.4s, v11.8h\n"
482       "uadalp v1.4s, v12.8h\n"
483       "uadalp v2.4s, v13.8h\n"
484       "uadalp v3.4s, v14.8h\n"
485       "umull v11.8h, v6.8b, v10.8b\n"
486       "umull v12.8h, v7.8b, v10.8b\n"
487       "uadalp v4.4s, v11.8h\n"
488       "uadalp v5.4s, v12.8h\n"
489 
490       // Loop break.
491       "bgt 1b\n"
492 
493       // StaticQuantization::Prepare
494       "ld1 {v6.4s}, [%x[lhs]], #16\n"
495       "ld1 {v7.4s, v8.4s}, [%x[rhs]], #32\n"
496       "dup v9.4s, %w[multiplicative_offset]\n"
497       "dup v10.4s, %w[rounding_offset]\n"
498       "dup v11.4s, %w[shift]\n"
499       "dup v6.4s, v6.s[0]\n"
500 
501       // RowMajorOutput::Prepare
502 
503       // Reduce aggregators.
504       "addp v0.4s, v0.4s, v1.4s\n"
505       "addp v2.4s, v2.4s, v3.4s\n"
506       "addp v4.4s, v4.4s, v5.4s\n"
507       "addp v0.4s, v0.4s, v2.4s\n"
508       "addp v1.4s, v4.4s, v4.4s\n"
509 
510       // StaticQuantization::Transform
511       "add v0.4s, v0.4s, v6.4s\n"
512       "add v1.4s, v1.4s, v6.4s\n"
513       "add v0.4s, v0.4s, v7.4s\n"
514       "add v1.4s, v1.4s, v8.4s\n"
515       "mul v0.4s, v0.4s, v9.4s\n"
516       "mul v1.4s, v1.4s, v9.4s\n"
517       "add v0.4s, v0.4s, v10.4s\n"
518       "add v1.4s, v1.4s, v10.4s\n"
519       "sshl v0.4s, v0.4s, v11.4s\n"
520       "sshl v1.4s, v1.4s, v11.4s\n"
521       "sqxtn v0.4h, v0.4s\n"
522       "sqxtn2 v0.8h, v1.4s\n"
523       "sqxtun v0.8b, v0.8h\n"
524 
525       // RowMajorOutput::Output
526       "st1 {v0.s}[0], [%x[result]], #4\n"
527       "st1 {v0.h}[2], [%x[result]], #2\n"
528       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
529       : [count] "r"(params.kernel.count),
530         [multiplicative_offset] "r"(params.kernel.multiplicative_offset),
531         [shift] "r"(params.kernel.shift),
532         [stride] "r"(params.output_stream.stride),
533         [rounding_offset] "r"(params.kernel.rounding_offset)
534       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
535         "v11", "v12", "v13", "v14", "cc", "memory");
536 }
537 
538 template <>
539 inline void
540 MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 1, 7,
Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessed,RowMajor> & params,uint8_t * result)541           8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
542                        const FusedKernelParams<QuantizedStaticPreprocessed,
543                                                RowMajor>& params,
544                        uint8_t* result) {
545 #ifdef DEBUG
546 #ifdef DEBUG_METAGEMM_VERBOSE
547   std::cout << __FILE__ << "(" << __LINE__
548             << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
549                "QuantizedStaticPreprocessed, RowMajor, 1, 7, 8>::Multiply()"
550             << std::endl
551             << std::flush;
552 #endif
553 #endif
554   asm volatile(
555       "prfm pldl1keep, [%x[lhs]]\n"
556       "prfm pldl1keep, [%x[rhs]]\n"
557 
558       // Clear aggregators.
559       "movi v0.4s, #0\n"
560       "movi v1.4s, #0\n"
561       "movi v2.4s, #0\n"
562       "mov v3.16b, v0.16b\n"
563       "mov v4.16b, v1.16b\n"
564       "mov v5.16b, v2.16b\n"
565       "mov v6.16b, v3.16b\n"
566 
567       // General 1xM lanes loop.
568       "1:"
569 
570       // Subtract counter.
571       "subs %x[count], %x[count], #8\n"
572 
573       "ld1 {v7.2s, v8.2s, v9.2s, v10.2s}, [%x[rhs]], #32\n"
574       "ld1 {v11.2s}, [%x[lhs]], #8\n"
575       "prfm pldl1keep, [%x[lhs], #64]\n"
576       "umull v12.8h, v7.8b, v11.8b\n"
577       "umull v13.8h, v8.8b, v11.8b\n"
578       "umull v14.8h, v9.8b, v11.8b\n"
579       "umull v15.8h, v10.8b, v11.8b\n"
580       "ld1 {v7.2s, v8.2s, v9.2s}, [%x[rhs]], #24\n"
581       "prfm pldl1keep, [%x[rhs], #128]\n"
582       "uadalp v0.4s, v12.8h\n"
583       "uadalp v1.4s, v13.8h\n"
584       "uadalp v2.4s, v14.8h\n"
585       "uadalp v3.4s, v15.8h\n"
586       "umull v12.8h, v7.8b, v11.8b\n"
587       "umull v13.8h, v8.8b, v11.8b\n"
588       "umull v14.8h, v9.8b, v11.8b\n"
589       "uadalp v4.4s, v12.8h\n"
590       "uadalp v5.4s, v13.8h\n"
591       "uadalp v6.4s, v14.8h\n"
592 
593       // Loop break.
594       "bgt 1b\n"
595 
596       // StaticQuantization::Prepare
597       "ld1 {v7.4s}, [%x[lhs]], #16\n"
598       "ld1 {v8.4s, v9.4s}, [%x[rhs]], #32\n"
599       "dup v10.4s, %w[multiplicative_offset]\n"
600       "dup v11.4s, %w[rounding_offset]\n"
601       "dup v12.4s, %w[shift]\n"
602       "dup v7.4s, v7.s[0]\n"
603 
604       // RowMajorOutput::Prepare
605 
606       // Reduce aggregators.
607       "addp v0.4s, v0.4s, v1.4s\n"
608       "addp v2.4s, v2.4s, v3.4s\n"
609       "addp v4.4s, v4.4s, v5.4s\n"
610       "addp v6.4s, v6.4s, v6.4s\n"
611       "addp v0.4s, v0.4s, v2.4s\n"
612       "addp v1.4s, v4.4s, v6.4s\n"
613 
614       // StaticQuantization::Transform
615       "add v0.4s, v0.4s, v7.4s\n"
616       "add v1.4s, v1.4s, v7.4s\n"
617       "add v0.4s, v0.4s, v8.4s\n"
618       "add v1.4s, v1.4s, v9.4s\n"
619       "mul v0.4s, v0.4s, v10.4s\n"
620       "mul v1.4s, v1.4s, v10.4s\n"
621       "add v0.4s, v0.4s, v11.4s\n"
622       "add v1.4s, v1.4s, v11.4s\n"
623       "sshl v0.4s, v0.4s, v12.4s\n"
624       "sshl v1.4s, v1.4s, v12.4s\n"
625       "sqxtn v0.4h, v0.4s\n"
626       "sqxtn2 v0.8h, v1.4s\n"
627       "sqxtun v0.8b, v0.8h\n"
628 
629       // RowMajorOutput::Output
630       "st1 {v0.s}[0], [%x[result]], #4\n"
631       "st1 {v0.h}[2], [%x[result]], #2\n"
632       "st1 {v0.b}[6], [%x[result]], #1\n"
633       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
634       : [count] "r"(params.kernel.count),
635         [multiplicative_offset] "r"(params.kernel.multiplicative_offset),
636         [shift] "r"(params.kernel.shift),
637         [stride] "r"(params.output_stream.stride),
638         [rounding_offset] "r"(params.kernel.rounding_offset)
639       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
640         "v11", "v12", "v13", "v14", "v15", "cc", "memory");
641 }
642 
643 template <>
644 inline void
645 MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 1, 8,
Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessed,RowMajor> & params,uint8_t * result)646           8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
647                        const FusedKernelParams<QuantizedStaticPreprocessed,
648                                                RowMajor>& params,
649                        uint8_t* result) {
650 #ifdef DEBUG
651 #ifdef DEBUG_METAGEMM_VERBOSE
652   std::cout << __FILE__ << "(" << __LINE__
653             << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
654                "QuantizedStaticPreprocessed, RowMajor, 1, 8, 8>::Multiply()"
655             << std::endl
656             << std::flush;
657 #endif
658 #endif
659   asm volatile(
660       "prfm pldl1keep, [%x[lhs]]\n"
661       "prfm pldl1keep, [%x[rhs]]\n"
662 
663       // Clear aggregators.
664       "movi v0.4s, #0\n"
665       "movi v1.4s, #0\n"
666       "movi v2.4s, #0\n"
667       "mov v3.16b, v0.16b\n"
668       "mov v4.16b, v1.16b\n"
669       "mov v5.16b, v2.16b\n"
670       "mov v6.16b, v3.16b\n"
671       "mov v7.16b, v4.16b\n"
672 
673       // 1x8 lanes loop.
674       "1:"
675 
676       "ld1 {v9.2s, v10.2s, v11.2s, v12.2s}, [%x[rhs]], #32\n"
677       "ld1 {v8.2s}, [%x[lhs]], #8\n"
678       "umull v13.8h, v8.8b, v9.8b\n"
679       "umull v14.8h, v8.8b, v10.8b\n"
680       "umull v15.8h, v8.8b, v11.8b\n"
681       "umull v16.8h, v8.8b, v12.8b\n"
682       "ld1 {v9.2s, v10.2s, v11.2s, v12.2s}, [%x[rhs]], #32\n"
683       "uadalp v0.4s, v13.8h\n"
684       "uadalp v1.4s, v14.8h\n"
685       "uadalp v2.4s, v15.8h\n"
686       "uadalp v3.4s, v16.8h\n"
687       "prfm pldl1keep, [%x[rhs], #256]\n"
688       "umull v17.8h, v8.8b, v9.8b\n"
689       "umull v13.8h, v8.8b, v10.8b\n"
690       "umull v14.8h, v8.8b, v11.8b\n"
691       "umull v15.8h, v8.8b, v12.8b\n"
692       "prfm pldl1keep, [%x[lhs], #32]\n"
693 
694       // Subtract counter.
695       "subs %x[count], %x[count], #8\n"
696 
697       "uadalp v4.4s, v17.8h\n"
698       "uadalp v5.4s, v13.8h\n"
699       "uadalp v6.4s, v14.8h\n"
700       "uadalp v7.4s, v15.8h\n"
701 
702       // Loop break.
703       "bgt 1b\n"
704 
705       // StaticQuantization::Prepare
706       "ld1 {v8.4s}, [%x[lhs]], #16\n"
707       "ld1 {v9.4s, v10.4s}, [%x[rhs]], #32\n"
708       "dup v11.4s, %w[multiplicative_offset]\n"
709       "dup v12.4s, %w[rounding_offset]\n"
710       "dup v13.4s, %w[shift]\n"
711       "dup v8.4s, v8.s[0]\n"
712 
713       // RowMajorOutput::Prepare
714 
715       // Reduce aggregators.
716       "addp v0.4s, v0.4s, v1.4s\n"
717       "addp v2.4s, v2.4s, v3.4s\n"
718       "addp v4.4s, v4.4s, v5.4s\n"
719       "addp v6.4s, v6.4s, v7.4s\n"
720       "addp v0.4s, v0.4s, v2.4s\n"
721       "addp v1.4s, v4.4s, v6.4s\n"
722 
723       // StaticQuantization::Transform
724       "add v0.4s, v0.4s, v8.4s\n"
725       "add v1.4s, v1.4s, v8.4s\n"
726       "add v0.4s, v0.4s, v9.4s\n"
727       "add v1.4s, v1.4s, v10.4s\n"
728       "mul v0.4s, v0.4s, v11.4s\n"
729       "mul v1.4s, v1.4s, v11.4s\n"
730       "add v0.4s, v0.4s, v12.4s\n"
731       "add v1.4s, v1.4s, v12.4s\n"
732       "sshl v0.4s, v0.4s, v13.4s\n"
733       "sshl v1.4s, v1.4s, v13.4s\n"
734       "sqxtn v0.4h, v0.4s\n"
735       "sqxtn2 v0.8h, v1.4s\n"
736       "sqxtun v0.8b, v0.8h\n"
737 
738       // RowMajorOutput::Output
739       "st1 {v0.2s}, [%x[result]], #8\n"
740       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
741       : [count] "r"(params.kernel.count),
742         [multiplicative_offset] "r"(params.kernel.multiplicative_offset),
743         [shift] "r"(params.kernel.shift),
744         [stride] "r"(params.output_stream.stride),
745         [rounding_offset] "r"(params.kernel.rounding_offset)
746       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
747         "v11", "v12", "v13", "v14", "v15", "v16", "v17", "cc", "memory");
748 }
749 
750 template <>
751 inline void
752 MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 2, 1,
Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessed,RowMajor> & params,uint8_t * result)753           8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
754                        const FusedKernelParams<QuantizedStaticPreprocessed,
755                                                RowMajor>& params,
756                        uint8_t* result) {
757 #ifdef DEBUG
758 #ifdef DEBUG_METAGEMM_VERBOSE
759   std::cout << __FILE__ << "(" << __LINE__
760             << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
761                "QuantizedStaticPreprocessed, RowMajor, 2, 1, 8>::Multiply()"
762             << std::endl
763             << std::flush;
764 #endif
765 #endif
766   asm volatile(
767       "prfm pldl1keep, [%x[lhs]]\n"
768       "prfm pldl1keep, [%x[rhs]]\n"
769 
770       // Clear aggregators.
771       "movi v0.4s, #0\n"
772       "movi v1.4s, #0\n"
773 
774       // General NxM lanes loop.
775       "1:"
776 
777       // Subtract counter.
778       "subs %x[count], %x[count], #8\n"
779 
780       "ld1 {v2.2s, v3.2s}, [%x[lhs]], #16\n"
781       "ld1 {v4.2s}, [%x[rhs]], #8\n"
782       "prfm pldl1keep, [%x[lhs], #64]\n"
783       "prfm pldl1keep, [%x[rhs], #64]\n"
784       "umull v5.8h, v4.8b, v2.8b\n"
785       "umull v6.8h, v4.8b, v3.8b\n"
786       "uadalp v0.4s, v5.8h\n"
787       "uadalp v1.4s, v6.8h\n"
788 
789       // Loop break.
790       "bgt 1b\n"
791 
792       // StaticQuantization::Prepare
793       "ld1 {v4.4s}, [%x[lhs]], #16\n"
794       "ld1 {v5.4s}, [%x[rhs]], #16\n"
795       "dup v6.4s, %w[multiplicative_offset]\n"
796       "dup v7.4s, %w[rounding_offset]\n"
797       "dup v8.4s, %w[shift]\n"
798       "dup v2.4s, v4.s[0]\n"
799       "dup v4.4s, v4.s[1]\n"
800 
801       // RowMajorOutput::Prepare
802       "add x0, %x[result], %x[stride]\n"
803 
804       // Reduce aggregators.
805       "addp v0.4s, v0.4s, v0.4s\n"
806       "addp v0.4s, v0.4s, v0.4s\n"
807       "addp v1.4s, v1.4s, v1.4s\n"
808       "addp v1.4s, v1.4s, v1.4s\n"
809 
810       // StaticQuantization::Transform
811       "add v0.4s, v0.4s, v2.4s\n"
812       "add v1.4s, v1.4s, v4.4s\n"
813       "add v0.4s, v0.4s, v5.4s\n"
814       "add v1.4s, v1.4s, v5.4s\n"
815       "mul v0.4s, v0.4s, v6.4s\n"
816       "mul v1.4s, v1.4s, v6.4s\n"
817       "add v0.4s, v0.4s, v7.4s\n"
818       "add v1.4s, v1.4s, v7.4s\n"
819       "sshl v0.4s, v0.4s, v8.4s\n"
820       "sshl v1.4s, v1.4s, v8.4s\n"
821       "sqxtn v0.4h, v0.4s\n"
822       "sqxtn v1.4h, v1.4s\n"
823       "sqxtun v0.8b, v0.8h\n"
824       "sqxtun v1.8b, v1.8h\n"
825 
826       // RowMajorOutput::Output
827       "st1 {v0.b}[0], [%x[result]], #1\n"
828       "st1 {v1.b}[0], [x0], #1\n"
829       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
830       : [count] "r"(params.kernel.count),
831         [multiplicative_offset] "r"(params.kernel.multiplicative_offset),
832         [shift] "r"(params.kernel.shift),
833         [stride] "r"(params.output_stream.stride),
834         [rounding_offset] "r"(params.kernel.rounding_offset)
835       : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "cc",
836         "memory");
837 }
838 
839 template <>
840 inline void
841 MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 2, 2,
Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessed,RowMajor> & params,uint8_t * result)842           8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
843                        const FusedKernelParams<QuantizedStaticPreprocessed,
844                                                RowMajor>& params,
845                        uint8_t* result) {
846 #ifdef DEBUG
847 #ifdef DEBUG_METAGEMM_VERBOSE
848   std::cout << __FILE__ << "(" << __LINE__
849             << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
850                "QuantizedStaticPreprocessed, RowMajor, 2, 2, 8>::Multiply()"
851             << std::endl
852             << std::flush;
853 #endif
854 #endif
855   asm volatile(
856       "prfm pldl1keep, [%x[lhs]]\n"
857       "prfm pldl1keep, [%x[rhs]]\n"
858 
859       // Clear aggregators.
860       "movi v0.4s, #0\n"
861       "movi v1.4s, #0\n"
862       "movi v2.4s, #0\n"
863       "mov v3.16b, v0.16b\n"
864 
865       // General NxM lanes loop.
866       "1:"
867 
868       // Subtract counter.
869       "subs %x[count], %x[count], #8\n"
870 
871       "ld1 {v4.2s, v5.2s}, [%x[lhs]], #16\n"
872       "ld1 {v6.2s, v7.2s}, [%x[rhs]], #16\n"
873       "prfm pldl1keep, [%x[lhs], #64]\n"
874       "prfm pldl1keep, [%x[rhs], #64]\n"
875       "umull v8.8h, v6.8b, v4.8b\n"
876       "umull v9.8h, v7.8b, v4.8b\n"
877       "umull v10.8h, v6.8b, v5.8b\n"
878       "umull v11.8h, v7.8b, v5.8b\n"
879       "uadalp v0.4s, v8.8h\n"
880       "uadalp v1.4s, v9.8h\n"
881       "uadalp v2.4s, v10.8h\n"
882       "uadalp v3.4s, v11.8h\n"
883 
884       // Loop break.
885       "bgt 1b\n"
886 
887       // StaticQuantization::Prepare
888       "ld1 {v4.4s}, [%x[lhs]], #16\n"
889       "ld1 {v5.4s}, [%x[rhs]], #16\n"
890       "dup v6.4s, %w[multiplicative_offset]\n"
891       "dup v7.4s, %w[rounding_offset]\n"
892       "dup v8.4s, %w[shift]\n"
893       "dup v9.4s, v4.s[0]\n"
894       "dup v4.4s, v4.s[1]\n"
895 
896       // RowMajorOutput::Prepare
897       "add x0, %x[result], %x[stride]\n"
898 
899       // Reduce aggregators.
900       "addp v0.4s, v0.4s, v1.4s\n"
901       "addp v0.4s, v0.4s, v0.4s\n"
902       "addp v2.4s, v2.4s, v3.4s\n"
903       "addp v2.4s, v2.4s, v2.4s\n"
904 
905       // StaticQuantization::Transform
906       "add v0.4s, v0.4s, v9.4s\n"
907       "add v2.4s, v2.4s, v4.4s\n"
908       "add v0.4s, v0.4s, v5.4s\n"
909       "add v2.4s, v2.4s, v5.4s\n"
910       "mul v0.4s, v0.4s, v6.4s\n"
911       "mul v2.4s, v2.4s, v6.4s\n"
912       "add v0.4s, v0.4s, v7.4s\n"
913       "add v2.4s, v2.4s, v7.4s\n"
914       "sshl v0.4s, v0.4s, v8.4s\n"
915       "sshl v2.4s, v2.4s, v8.4s\n"
916       "sqxtn v0.4h, v0.4s\n"
917       "sqxtn v2.4h, v2.4s\n"
918       "sqxtun v0.8b, v0.8h\n"
919       "sqxtun v2.8b, v2.8h\n"
920 
921       // RowMajorOutput::Output
922       "st1 {v0.h}[0], [%x[result]], #2\n"
923       "st1 {v2.h}[0], [x0], #2\n"
924       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
925       : [count] "r"(params.kernel.count),
926         [multiplicative_offset] "r"(params.kernel.multiplicative_offset),
927         [shift] "r"(params.kernel.shift),
928         [stride] "r"(params.output_stream.stride),
929         [rounding_offset] "r"(params.kernel.rounding_offset)
930       : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
931         "v11", "cc", "memory");
932 }
933 
934 template <>
935 inline void
936 MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 2, 3,
Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessed,RowMajor> & params,uint8_t * result)937           8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
938                        const FusedKernelParams<QuantizedStaticPreprocessed,
939                                                RowMajor>& params,
940                        uint8_t* result) {
941 #ifdef DEBUG
942 #ifdef DEBUG_METAGEMM_VERBOSE
943   std::cout << __FILE__ << "(" << __LINE__
944             << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
945                "QuantizedStaticPreprocessed, RowMajor, 2, 3, 8>::Multiply()"
946             << std::endl
947             << std::flush;
948 #endif
949 #endif
950   asm volatile(
951       "prfm pldl1keep, [%x[lhs]]\n"
952       "prfm pldl1keep, [%x[rhs]]\n"
953 
954       // Clear aggregators.
955       "movi v0.4s, #0\n"
956       "movi v1.4s, #0\n"
957       "movi v2.4s, #0\n"
958       "mov v3.16b, v0.16b\n"
959       "mov v4.16b, v1.16b\n"
960       "mov v5.16b, v2.16b\n"
961 
962       // General NxM lanes loop.
963       "1:"
964 
965       // Subtract counter.
966       "subs %x[count], %x[count], #8\n"
967 
968       "ld1 {v6.2s, v7.2s}, [%x[lhs]], #16\n"
969       "ld1 {v8.2s, v9.2s, v10.2s}, [%x[rhs]], #24\n"
970       "prfm pldl1keep, [%x[lhs], #64]\n"
971       "prfm pldl1keep, [%x[rhs], #64]\n"
972       "umull v11.8h, v8.8b, v6.8b\n"
973       "umull v12.8h, v9.8b, v6.8b\n"
974       "umull v13.8h, v10.8b, v6.8b\n"
975       "umull v14.8h, v8.8b, v7.8b\n"
976       "umull v15.8h, v9.8b, v7.8b\n"
977       "umull v16.8h, v10.8b, v7.8b\n"
978       "uadalp v0.4s, v11.8h\n"
979       "uadalp v1.4s, v12.8h\n"
980       "uadalp v2.4s, v13.8h\n"
981       "uadalp v3.4s, v14.8h\n"
982       "uadalp v4.4s, v15.8h\n"
983       "uadalp v5.4s, v16.8h\n"
984 
985       // Loop break.
986       "bgt 1b\n"
987 
988       // StaticQuantization::Prepare
989       "ld1 {v6.4s}, [%x[lhs]], #16\n"
990       "ld1 {v7.4s}, [%x[rhs]], #16\n"
991       "dup v8.4s, %w[multiplicative_offset]\n"
992       "dup v9.4s, %w[rounding_offset]\n"
993       "dup v10.4s, %w[shift]\n"
994       "dup v11.4s, v6.s[0]\n"
995       "dup v6.4s, v6.s[1]\n"
996 
997       // RowMajorOutput::Prepare
998       "add x0, %x[result], %x[stride]\n"
999 
1000       // Reduce aggregators.
1001       "addp v0.4s, v0.4s, v1.4s\n"
1002       "addp v2.4s, v2.4s, v2.4s\n"
1003       "addp v0.4s, v0.4s, v2.4s\n"
1004       "addp v3.4s, v3.4s, v4.4s\n"
1005       "addp v5.4s, v5.4s, v5.4s\n"
1006       "addp v3.4s, v3.4s, v5.4s\n"
1007 
1008       // StaticQuantization::Transform
1009       "add v0.4s, v0.4s, v11.4s\n"
1010       "add v3.4s, v3.4s, v6.4s\n"
1011       "add v0.4s, v0.4s, v7.4s\n"
1012       "add v3.4s, v3.4s, v7.4s\n"
1013       "mul v0.4s, v0.4s, v8.4s\n"
1014       "mul v3.4s, v3.4s, v8.4s\n"
1015       "add v0.4s, v0.4s, v9.4s\n"
1016       "add v3.4s, v3.4s, v9.4s\n"
1017       "sshl v0.4s, v0.4s, v10.4s\n"
1018       "sshl v3.4s, v3.4s, v10.4s\n"
1019       "sqxtn v0.4h, v0.4s\n"
1020       "sqxtn v3.4h, v3.4s\n"
1021       "sqxtun v0.8b, v0.8h\n"
1022       "sqxtun v3.8b, v3.8h\n"
1023 
1024       // RowMajorOutput::Output
1025       "st1 {v0.h}[0], [%x[result]], #2\n"
1026       "st1 {v0.b}[2], [%x[result]], #1\n"
1027       "st1 {v3.h}[0], [x0], #2\n"
1028       "st1 {v3.b}[2], [x0], #1\n"
1029       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
1030       : [count] "r"(params.kernel.count),
1031         [multiplicative_offset] "r"(params.kernel.multiplicative_offset),
1032         [shift] "r"(params.kernel.shift),
1033         [stride] "r"(params.output_stream.stride),
1034         [rounding_offset] "r"(params.kernel.rounding_offset)
1035       : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
1036         "v11", "v12", "v13", "v14", "v15", "v16", "cc", "memory");
1037 }
1038 
1039 template <>
1040 inline void
1041 MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 2, 4,
Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessed,RowMajor> & params,uint8_t * result)1042           8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
1043                        const FusedKernelParams<QuantizedStaticPreprocessed,
1044                                                RowMajor>& params,
1045                        uint8_t* result) {
1046 #ifdef DEBUG
1047 #ifdef DEBUG_METAGEMM_VERBOSE
1048   std::cout << __FILE__ << "(" << __LINE__
1049             << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
1050                "QuantizedStaticPreprocessed, RowMajor, 2, 4, 8>::Multiply()"
1051             << std::endl
1052             << std::flush;
1053 #endif
1054 #endif
1055   asm volatile(
1056       "prfm pldl1keep, [%x[lhs]]\n"
1057       "prfm pldl1keep, [%x[rhs]]\n"
1058 
1059       // Clear aggregators.
1060       "movi v0.4s, #0\n"
1061       "movi v1.4s, #0\n"
1062       "movi v2.4s, #0\n"
1063       "mov v3.16b, v0.16b\n"
1064       "mov v4.16b, v1.16b\n"
1065       "mov v5.16b, v2.16b\n"
1066       "mov v6.16b, v3.16b\n"
1067       "mov v7.16b, v4.16b\n"
1068 
1069       // 2x4 lanes loop.
1070       "1:"
1071 
1072       "ld1 {v10.8b, v11.8b, v12.8b, v13.8b}, [%x[rhs]], #32\n"
1073       "ld1 {v8.8b}, [%x[lhs]], #8\n"
1074       "umull v14.8h, v8.8b, v10.8b\n"
1075       "ld1 {v9.8b}, [%x[lhs]], #8\n"
1076       "umull v15.8h, v8.8b, v11.8b\n"
1077       "prfm pldl1keep, [%x[rhs], #64]\n"
1078       "umull v16.8h, v8.8b, v12.8b\n"
1079       "prfm pldl1keep, [%x[lhs], #64]\n"
1080       "umull v17.8h, v8.8b, v13.8b\n"
1081       "umull v18.8h, v9.8b, v10.8b\n"
1082       "uadalp v0.4s, v14.8h\n"
1083       "uadalp v1.4s, v15.8h\n"
1084       "uadalp v2.4s, v16.8h\n"
1085       "umull v14.8h, v9.8b, v11.8b\n"
1086       "umull v15.8h, v9.8b, v12.8b\n"
1087       "umull v16.8h, v9.8b, v13.8b\n"
1088 
1089       // Subtract counter.
1090       "subs %x[count], %x[count], #8\n"
1091 
1092       "uadalp v3.4s, v17.8h\n"
1093       "uadalp v4.4s, v18.8h\n"
1094       "uadalp v5.4s, v14.8h\n"
1095       "uadalp v6.4s, v15.8h\n"
1096       "uadalp v7.4s, v16.8h\n"
1097 
1098       // Loop break.
1099       "bgt 1b\n"
1100 
1101       // StaticQuantization::Prepare
1102       "ld1 {v8.4s}, [%x[lhs]], #16\n"
1103       "ld1 {v9.4s}, [%x[rhs]], #16\n"
1104       "dup v10.4s, %w[multiplicative_offset]\n"
1105       "dup v11.4s, %w[rounding_offset]\n"
1106       "dup v12.4s, %w[shift]\n"
1107       "dup v13.4s, v8.s[0]\n"
1108       "dup v8.4s, v8.s[1]\n"
1109 
1110       // RowMajorOutput::Prepare
1111       "add x0, %x[result], %x[stride]\n"
1112 
1113       // Reduce aggregators.
1114       "addp v0.4s, v0.4s, v1.4s\n"
1115       "addp v2.4s, v2.4s, v3.4s\n"
1116       "addp v0.4s, v0.4s, v2.4s\n"
1117       "addp v4.4s, v4.4s, v5.4s\n"
1118       "addp v6.4s, v6.4s, v7.4s\n"
1119       "addp v4.4s, v4.4s, v6.4s\n"
1120 
1121       // StaticQuantization::Transform
1122       "add v0.4s, v0.4s, v13.4s\n"
1123       "add v4.4s, v4.4s, v8.4s\n"
1124       "add v0.4s, v0.4s, v9.4s\n"
1125       "add v4.4s, v4.4s, v9.4s\n"
1126       "mul v0.4s, v0.4s, v10.4s\n"
1127       "mul v4.4s, v4.4s, v10.4s\n"
1128       "add v0.4s, v0.4s, v11.4s\n"
1129       "add v4.4s, v4.4s, v11.4s\n"
1130       "sshl v0.4s, v0.4s, v12.4s\n"
1131       "sshl v4.4s, v4.4s, v12.4s\n"
1132       "sqxtn v0.4h, v0.4s\n"
1133       "sqxtn v4.4h, v4.4s\n"
1134       "sqxtun v0.8b, v0.8h\n"
1135       "sqxtun v4.8b, v4.8h\n"
1136 
1137       // RowMajorOutput::Output
1138       "st1 {v0.s}[0], [%x[result]], #4\n"
1139       "st1 {v4.s}[0], [x0], #4\n"
1140       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
1141       : [count] "r"(params.kernel.count),
1142         [multiplicative_offset] "r"(params.kernel.multiplicative_offset),
1143         [shift] "r"(params.kernel.shift),
1144         [stride] "r"(params.output_stream.stride),
1145         [rounding_offset] "r"(params.kernel.rounding_offset)
1146       : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
1147         "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "cc", "memory");
1148 }
1149 
1150 template <>
1151 inline void
1152 MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 3, 1,
Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessed,RowMajor> & params,uint8_t * result)1153           8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
1154                        const FusedKernelParams<QuantizedStaticPreprocessed,
1155                                                RowMajor>& params,
1156                        uint8_t* result) {
1157 #ifdef DEBUG
1158 #ifdef DEBUG_METAGEMM_VERBOSE
1159   std::cout << __FILE__ << "(" << __LINE__
1160             << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
1161                "QuantizedStaticPreprocessed, RowMajor, 3, 1, 8>::Multiply()"
1162             << std::endl
1163             << std::flush;
1164 #endif
1165 #endif
1166   asm volatile(
1167       "prfm pldl1keep, [%x[lhs]]\n"
1168       "prfm pldl1keep, [%x[rhs]]\n"
1169 
1170       // Clear aggregators.
1171       "movi v0.4s, #0\n"
1172       "movi v1.4s, #0\n"
1173       "movi v2.4s, #0\n"
1174 
1175       // General NxM lanes loop.
1176       "1:"
1177 
1178       // Subtract counter.
1179       "subs %x[count], %x[count], #8\n"
1180 
1181       "ld1 {v3.2s, v4.2s, v5.2s}, [%x[lhs]], #24\n"
1182       "ld1 {v6.2s}, [%x[rhs]], #8\n"
1183       "prfm pldl1keep, [%x[lhs], #64]\n"
1184       "prfm pldl1keep, [%x[rhs], #64]\n"
1185       "umull v7.8h, v6.8b, v3.8b\n"
1186       "umull v8.8h, v6.8b, v4.8b\n"
1187       "umull v9.8h, v6.8b, v5.8b\n"
1188       "uadalp v0.4s, v7.8h\n"
1189       "uadalp v1.4s, v8.8h\n"
1190       "uadalp v2.4s, v9.8h\n"
1191 
1192       // Loop break.
1193       "bgt 1b\n"
1194 
1195       // StaticQuantization::Prepare
1196       "ld1 {v4.4s}, [%x[lhs]], #16\n"
1197       "ld1 {v5.4s}, [%x[rhs]], #16\n"
1198       "dup v6.4s, %w[multiplicative_offset]\n"
1199       "dup v7.4s, %w[rounding_offset]\n"
1200       "dup v8.4s, %w[shift]\n"
1201       "dup v3.4s, v4.s[0]\n"
1202       "dup v9.4s, v4.s[1]\n"
1203       "dup v4.4s, v4.s[2]\n"
1204 
1205       // RowMajorOutput::Prepare
1206       "add x0, %x[result], %x[stride]\n"
1207       "add x1, x0, %x[stride]\n"
1208 
1209       // Reduce aggregators.
1210       "addp v0.4s, v0.4s, v0.4s\n"
1211       "addp v0.4s, v0.4s, v0.4s\n"
1212       "addp v1.4s, v1.4s, v1.4s\n"
1213       "addp v1.4s, v1.4s, v1.4s\n"
1214       "addp v2.4s, v2.4s, v2.4s\n"
1215       "addp v2.4s, v2.4s, v2.4s\n"
1216 
1217       // StaticQuantization::Transform
1218       "add v0.4s, v0.4s, v3.4s\n"
1219       "add v1.4s, v1.4s, v9.4s\n"
1220       "add v2.4s, v2.4s, v4.4s\n"
1221       "add v0.4s, v0.4s, v5.4s\n"
1222       "add v1.4s, v1.4s, v5.4s\n"
1223       "add v2.4s, v2.4s, v5.4s\n"
1224       "mul v0.4s, v0.4s, v6.4s\n"
1225       "mul v1.4s, v1.4s, v6.4s\n"
1226       "mul v2.4s, v2.4s, v6.4s\n"
1227       "add v0.4s, v0.4s, v7.4s\n"
1228       "add v1.4s, v1.4s, v7.4s\n"
1229       "add v2.4s, v2.4s, v7.4s\n"
1230       "sshl v0.4s, v0.4s, v8.4s\n"
1231       "sshl v1.4s, v1.4s, v8.4s\n"
1232       "sshl v2.4s, v2.4s, v8.4s\n"
1233       "sqxtn v0.4h, v0.4s\n"
1234       "sqxtn v1.4h, v1.4s\n"
1235       "sqxtn v2.4h, v2.4s\n"
1236       "sqxtun v0.8b, v0.8h\n"
1237       "sqxtun v1.8b, v1.8h\n"
1238       "sqxtun v2.8b, v2.8h\n"
1239 
1240       // RowMajorOutput::Output
1241       "st1 {v0.b}[0], [%x[result]], #1\n"
1242       "st1 {v1.b}[0], [x0], #1\n"
1243       "st1 {v2.b}[0], [x1], #1\n"
1244       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
1245       : [count] "r"(params.kernel.count),
1246         [multiplicative_offset] "r"(params.kernel.multiplicative_offset),
1247         [shift] "r"(params.kernel.shift),
1248         [stride] "r"(params.output_stream.stride),
1249         [rounding_offset] "r"(params.kernel.rounding_offset)
1250       : "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
1251         "cc", "memory");
1252 }
1253 
1254 template <>
1255 inline void
1256 MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 3, 2,
Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessed,RowMajor> & params,uint8_t * result)1257           8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
1258                        const FusedKernelParams<QuantizedStaticPreprocessed,
1259                                                RowMajor>& params,
1260                        uint8_t* result) {
1261 #ifdef DEBUG
1262 #ifdef DEBUG_METAGEMM_VERBOSE
1263   std::cout << __FILE__ << "(" << __LINE__
1264             << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
1265                "QuantizedStaticPreprocessed, RowMajor, 3, 2, 8>::Multiply()"
1266             << std::endl
1267             << std::flush;
1268 #endif
1269 #endif
1270   asm volatile(
1271       "prfm pldl1keep, [%x[lhs]]\n"
1272       "prfm pldl1keep, [%x[rhs]]\n"
1273 
1274       // Clear aggregators.
1275       "movi v0.4s, #0\n"
1276       "movi v1.4s, #0\n"
1277       "movi v2.4s, #0\n"
1278       "mov v3.16b, v0.16b\n"
1279       "mov v4.16b, v1.16b\n"
1280       "mov v5.16b, v2.16b\n"
1281 
1282       // General NxM lanes loop.
1283       "1:"
1284 
1285       // Subtract counter.
1286       "subs %x[count], %x[count], #8\n"
1287 
1288       "ld1 {v6.2s, v7.2s, v8.2s}, [%x[lhs]], #24\n"
1289       "ld1 {v9.2s, v10.2s}, [%x[rhs]], #16\n"
1290       "prfm pldl1keep, [%x[lhs], #64]\n"
1291       "prfm pldl1keep, [%x[rhs], #64]\n"
1292       "umull v11.8h, v9.8b, v6.8b\n"
1293       "umull v12.8h, v10.8b, v6.8b\n"
1294       "umull v13.8h, v9.8b, v7.8b\n"
1295       "umull v14.8h, v10.8b, v7.8b\n"
1296       "umull v15.8h, v9.8b, v8.8b\n"
1297       "umull v16.8h, v10.8b, v8.8b\n"
1298       "uadalp v0.4s, v11.8h\n"
1299       "uadalp v1.4s, v12.8h\n"
1300       "uadalp v2.4s, v13.8h\n"
1301       "uadalp v3.4s, v14.8h\n"
1302       "uadalp v4.4s, v15.8h\n"
1303       "uadalp v5.4s, v16.8h\n"
1304 
1305       // Loop break.
1306       "bgt 1b\n"
1307 
1308       // StaticQuantization::Prepare
1309       "ld1 {v6.4s}, [%x[lhs]], #16\n"
1310       "ld1 {v7.4s}, [%x[rhs]], #16\n"
1311       "dup v8.4s, %w[multiplicative_offset]\n"
1312       "dup v9.4s, %w[rounding_offset]\n"
1313       "dup v10.4s, %w[shift]\n"
1314       "dup v11.4s, v6.s[0]\n"
1315       "dup v12.4s, v6.s[1]\n"
1316       "dup v6.4s, v6.s[2]\n"
1317 
1318       // RowMajorOutput::Prepare
1319       "add x0, %x[result], %x[stride]\n"
1320       "add x1, x0, %x[stride]\n"
1321 
1322       // Reduce aggregators.
1323       "addp v0.4s, v0.4s, v1.4s\n"
1324       "addp v0.4s, v0.4s, v0.4s\n"
1325       "addp v2.4s, v2.4s, v3.4s\n"
1326       "addp v2.4s, v2.4s, v2.4s\n"
1327       "addp v4.4s, v4.4s, v5.4s\n"
1328       "addp v4.4s, v4.4s, v4.4s\n"
1329 
1330       // StaticQuantization::Transform
1331       "add v0.4s, v0.4s, v11.4s\n"
1332       "add v2.4s, v2.4s, v12.4s\n"
1333       "add v4.4s, v4.4s, v6.4s\n"
1334       "add v0.4s, v0.4s, v7.4s\n"
1335       "add v2.4s, v2.4s, v7.4s\n"
1336       "add v4.4s, v4.4s, v7.4s\n"
1337       "mul v0.4s, v0.4s, v8.4s\n"
1338       "mul v2.4s, v2.4s, v8.4s\n"
1339       "mul v4.4s, v4.4s, v8.4s\n"
1340       "add v0.4s, v0.4s, v9.4s\n"
1341       "add v2.4s, v2.4s, v9.4s\n"
1342       "add v4.4s, v4.4s, v9.4s\n"
1343       "sshl v0.4s, v0.4s, v10.4s\n"
1344       "sshl v2.4s, v2.4s, v10.4s\n"
1345       "sshl v4.4s, v4.4s, v10.4s\n"
1346       "sqxtn v0.4h, v0.4s\n"
1347       "sqxtn v2.4h, v2.4s\n"
1348       "sqxtn v4.4h, v4.4s\n"
1349       "sqxtun v0.8b, v0.8h\n"
1350       "sqxtun v2.8b, v2.8h\n"
1351       "sqxtun v4.8b, v4.8h\n"
1352 
1353       // RowMajorOutput::Output
1354       "st1 {v0.h}[0], [%x[result]], #2\n"
1355       "st1 {v2.h}[0], [x0], #2\n"
1356       "st1 {v4.h}[0], [x1], #2\n"
1357       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
1358       : [count] "r"(params.kernel.count),
1359         [multiplicative_offset] "r"(params.kernel.multiplicative_offset),
1360         [shift] "r"(params.kernel.shift),
1361         [stride] "r"(params.output_stream.stride),
1362         [rounding_offset] "r"(params.kernel.rounding_offset)
1363       : "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
1364         "v10", "v11", "v12", "v13", "v14", "v15", "v16", "cc", "memory");
1365 }
1366 
1367 template <>
1368 inline void
1369 MulKernel<uint8_t, uint8_t, QuantizedStaticPreprocessed, RowMajor, 3, 3,
Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessed,RowMajor> & params,uint8_t * result)1370           8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
1371                        const FusedKernelParams<QuantizedStaticPreprocessed,
1372                                                RowMajor>& params,
1373                        uint8_t* result) {
1374 #ifdef DEBUG
1375 #ifdef DEBUG_METAGEMM_VERBOSE
1376   std::cout << __FILE__ << "(" << __LINE__
1377             << ") QuantizedStaticPreprocessedRowMajor<uint8_t, uint8_t, "
1378                "QuantizedStaticPreprocessed, RowMajor, 3, 3, 8>::Multiply()"
1379             << std::endl
1380             << std::flush;
1381 #endif
1382 #endif
1383   asm volatile(
1384       "prfm pldl1keep, [%x[lhs]]\n"
1385       "prfm pldl1keep, [%x[rhs]]\n"
1386 
1387       // Clear aggregators.
1388       "movi v0.4s, #0\n"
1389       "movi v1.4s, #0\n"
1390       "movi v2.4s, #0\n"
1391       "mov v3.16b, v0.16b\n"
1392       "mov v4.16b, v1.16b\n"
1393       "mov v5.16b, v2.16b\n"
1394       "mov v6.16b, v3.16b\n"
1395       "mov v7.16b, v4.16b\n"
1396       "mov v8.16b, v5.16b\n"
1397 
1398       // 3x3 lanes loop.
1399       "1:"
1400 
1401       "ld1 {v12.8b, v13.8b, v14.8b}, [%x[rhs]], #24\n"
1402       "ld1 {v9.8b}, [%x[lhs]], #8\n"
1403       "umull v15.8h, v9.8b, v12.8b\n"
1404       "ld1 {v10.8b}, [%x[lhs]], #8\n"
1405       "umull v16.8h, v9.8b, v13.8b\n"
1406       "ld1 {v11.8b}, [%x[lhs]], #8\n"
1407       "umull v17.8h, v9.8b, v14.8b\n"
1408       "prfm pldl1keep, [%x[lhs], #64]\n"
1409       "umull v18.8h, v10.8b, v12.8b\n"
1410       "prfm pldl1keep, [%x[rhs], #64]\n"
1411       "uadalp v0.4s, v15.8h\n"
1412       "uadalp v1.4s, v16.8h\n"
1413       "uadalp v2.4s, v17.8h\n"
1414       "uadalp v3.4s, v18.8h\n"
1415       "umull v15.8h, v10.8b, v13.8b\n"
1416       "umull v16.8h, v10.8b, v14.8b\n"
1417       "umull v17.8h, v11.8b, v12.8b\n"
1418       "umull v18.8h, v11.8b, v13.8b\n"
1419 
1420       // Subtract counter.
1421       "subs %x[count], %x[count], #8\n"
1422 
1423       "umull v9.8h, v11.8b, v14.8b\n"
1424       "uadalp v4.4s, v15.8h\n"
1425       "uadalp v5.4s, v16.8h\n"
1426       "uadalp v6.4s, v17.8h\n"
1427       "uadalp v7.4s, v18.8h\n"
1428       "uadalp v8.4s, v9.8h\n"
1429 
1430       // Loop break.
1431       "bgt 1b\n"
1432 
1433       // StaticQuantization::Prepare
1434       "ld1 {v9.4s}, [%x[lhs]], #16\n"
1435       "ld1 {v10.4s}, [%x[rhs]], #16\n"
1436       "dup v11.4s, %w[multiplicative_offset]\n"
1437       "dup v12.4s, %w[rounding_offset]\n"
1438       "dup v13.4s, %w[shift]\n"
1439       "dup v14.4s, v9.s[0]\n"
1440       "dup v15.4s, v9.s[1]\n"
1441       "dup v9.4s, v9.s[2]\n"
1442 
1443       // RowMajorOutput::Prepare
1444       "add x0, %x[result], %x[stride]\n"
1445       "add x1, x0, %x[stride]\n"
1446 
1447       // Reduce aggregators.
1448       "addp v0.4s, v0.4s, v1.4s\n"
1449       "addp v2.4s, v2.4s, v2.4s\n"
1450       "addp v0.4s, v0.4s, v2.4s\n"
1451       "addp v3.4s, v3.4s, v4.4s\n"
1452       "addp v5.4s, v5.4s, v5.4s\n"
1453       "addp v3.4s, v3.4s, v5.4s\n"
1454       "addp v6.4s, v6.4s, v7.4s\n"
1455       "addp v8.4s, v8.4s, v8.4s\n"
1456       "addp v6.4s, v6.4s, v8.4s\n"
1457 
1458       // StaticQuantization::Transform
1459       "add v0.4s, v0.4s, v14.4s\n"
1460       "add v3.4s, v3.4s, v15.4s\n"
1461       "add v6.4s, v6.4s, v9.4s\n"
1462       "add v0.4s, v0.4s, v10.4s\n"
1463       "add v3.4s, v3.4s, v10.4s\n"
1464       "add v6.4s, v6.4s, v10.4s\n"
1465       "mul v0.4s, v0.4s, v11.4s\n"
1466       "mul v3.4s, v3.4s, v11.4s\n"
1467       "mul v6.4s, v6.4s, v11.4s\n"
1468       "add v0.4s, v0.4s, v12.4s\n"
1469       "add v3.4s, v3.4s, v12.4s\n"
1470       "add v6.4s, v6.4s, v12.4s\n"
1471       "sshl v0.4s, v0.4s, v13.4s\n"
1472       "sshl v3.4s, v3.4s, v13.4s\n"
1473       "sshl v6.4s, v6.4s, v13.4s\n"
1474       "sqxtn v0.4h, v0.4s\n"
1475       "sqxtn v3.4h, v3.4s\n"
1476       "sqxtn v6.4h, v6.4s\n"
1477       "sqxtun v0.8b, v0.8h\n"
1478       "sqxtun v3.8b, v3.8h\n"
1479       "sqxtun v6.8b, v6.8h\n"
1480 
1481       // RowMajorOutput::Output
1482       "st1 {v0.h}[0], [%x[result]], #2\n"
1483       "st1 {v0.b}[2], [%x[result]], #1\n"
1484       "st1 {v3.h}[0], [x0], #2\n"
1485       "st1 {v3.b}[2], [x0], #1\n"
1486       "st1 {v6.h}[0], [x1], #2\n"
1487       "st1 {v6.b}[2], [x1], #1\n"
1488       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
1489       : [count] "r"(params.kernel.count),
1490         [multiplicative_offset] "r"(params.kernel.multiplicative_offset),
1491         [shift] "r"(params.kernel.shift),
1492         [stride] "r"(params.output_stream.stride),
1493         [rounding_offset] "r"(params.kernel.rounding_offset)
1494       : "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
1495         "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "cc",
1496         "memory");
1497 }
1498 
1499 template <>
1500 inline void MulKernel<
1501     uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 1,
Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,RowMajor> & params,int32_t * result)1502     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
1503                  const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
1504                                          RowMajor>& params,
1505                  int32_t* result) {
1506 #ifdef DEBUG
1507 #ifdef DEBUG_METAGEMM_VERBOSE
1508   std::cout << __FILE__ << "(" << __LINE__
1509             << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
1510                "QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 1, "
1511                "8>::Multiply()"
1512             << std::endl
1513             << std::flush;
1514 #endif
1515 #endif
1516   asm volatile(
1517       "prfm pldl1keep, [%x[lhs]]\n"
1518       "prfm pldl1keep, [%x[rhs]]\n"
1519 
1520       // Clear aggregators.
1521       "movi v0.4s, #0\n"
1522 
1523       // General NxM lanes loop.
1524       "1:"
1525 
1526       // Subtract counter.
1527       "subs %x[count], %x[count], #8\n"
1528 
1529       "ld1 {v1.2s}, [%x[lhs]], #8\n"
1530       "ld1 {v2.2s}, [%x[rhs]], #8\n"
1531       "prfm pldl1keep, [%x[lhs], #64]\n"
1532       "prfm pldl1keep, [%x[rhs], #64]\n"
1533       "umull v3.8h, v2.8b, v1.8b\n"
1534       "uadalp v0.4s, v3.8h\n"
1535 
1536       // Loop break.
1537       "bgt 1b\n"
1538 
1539       // StaticQuantizationInt32::Prepare
1540       "ld1 {v4.4s}, [%x[lhs]], #16\n"
1541       "ld1 {v5.4s}, [%x[rhs]], #16\n"
1542       "dup v4.4s, v4.s[0]\n"
1543 
1544       // RowMajorOutput::Prepare
1545 
1546       // Reduce aggregators.
1547       "addp v0.4s, v0.4s, v0.4s\n"
1548       "addp v0.4s, v0.4s, v0.4s\n"
1549 
1550       // StaticQuantizationInt32::Transform
1551       "add v0.4s, v0.4s, v4.4s\n"
1552       "add v0.4s, v0.4s, v5.4s\n"
1553 
1554       // RowMajorOutput::Output
1555       "st1 {v0.s}[0], [%x[result]], #4\n"
1556       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
1557       : [count] "r"(params.kernel.count),
1558         [stride] "r"(params.output_stream.stride)
1559       : "v0", "v1", "v2", "v3", "v4", "v5", "cc", "memory");
1560 }
1561 
1562 template <>
1563 inline void MulKernel<
1564     uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 2,
Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,RowMajor> & params,int32_t * result)1565     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
1566                  const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
1567                                          RowMajor>& params,
1568                  int32_t* result) {
1569 #ifdef DEBUG
1570 #ifdef DEBUG_METAGEMM_VERBOSE
1571   std::cout << __FILE__ << "(" << __LINE__
1572             << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
1573                "QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 2, "
1574                "8>::Multiply()"
1575             << std::endl
1576             << std::flush;
1577 #endif
1578 #endif
1579   asm volatile(
1580       "prfm pldl1keep, [%x[lhs]]\n"
1581       "prfm pldl1keep, [%x[rhs]]\n"
1582 
1583       // Clear aggregators.
1584       "movi v0.4s, #0\n"
1585       "movi v1.4s, #0\n"
1586 
1587       // General NxM lanes loop.
1588       "1:"
1589 
1590       // Subtract counter.
1591       "subs %x[count], %x[count], #8\n"
1592 
1593       "ld1 {v2.2s}, [%x[lhs]], #8\n"
1594       "ld1 {v3.2s, v4.2s}, [%x[rhs]], #16\n"
1595       "prfm pldl1keep, [%x[lhs], #64]\n"
1596       "prfm pldl1keep, [%x[rhs], #64]\n"
1597       "umull v5.8h, v3.8b, v2.8b\n"
1598       "umull v6.8h, v4.8b, v2.8b\n"
1599       "uadalp v0.4s, v5.8h\n"
1600       "uadalp v1.4s, v6.8h\n"
1601 
1602       // Loop break.
1603       "bgt 1b\n"
1604 
1605       // StaticQuantizationInt32::Prepare
1606       "ld1 {v4.4s}, [%x[lhs]], #16\n"
1607       "ld1 {v5.4s}, [%x[rhs]], #16\n"
1608       "dup v4.4s, v4.s[0]\n"
1609 
1610       // RowMajorOutput::Prepare
1611 
1612       // Reduce aggregators.
1613       "addp v0.4s, v0.4s, v1.4s\n"
1614       "addp v0.4s, v0.4s, v0.4s\n"
1615 
1616       // StaticQuantizationInt32::Transform
1617       "add v0.4s, v0.4s, v4.4s\n"
1618       "add v0.4s, v0.4s, v5.4s\n"
1619 
1620       // RowMajorOutput::Output
1621       "st1 {v0.2s}, [%x[result]], #8\n"
1622       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
1623       : [count] "r"(params.kernel.count),
1624         [stride] "r"(params.output_stream.stride)
1625       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
1626 }
1627 
1628 template <>
1629 inline void MulKernel<
1630     uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 3,
Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,RowMajor> & params,int32_t * result)1631     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
1632                  const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
1633                                          RowMajor>& params,
1634                  int32_t* result) {
1635 #ifdef DEBUG
1636 #ifdef DEBUG_METAGEMM_VERBOSE
1637   std::cout << __FILE__ << "(" << __LINE__
1638             << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
1639                "QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 3, "
1640                "8>::Multiply()"
1641             << std::endl
1642             << std::flush;
1643 #endif
1644 #endif
1645   asm volatile(
1646       "prfm pldl1keep, [%x[lhs]]\n"
1647       "prfm pldl1keep, [%x[rhs]]\n"
1648 
1649       // Clear aggregators.
1650       "movi v0.4s, #0\n"
1651       "movi v1.4s, #0\n"
1652       "movi v2.4s, #0\n"
1653 
1654       // General NxM lanes loop.
1655       "1:"
1656 
1657       // Subtract counter.
1658       "subs %x[count], %x[count], #8\n"
1659 
1660       "ld1 {v3.2s}, [%x[lhs]], #8\n"
1661       "ld1 {v4.2s, v5.2s, v6.2s}, [%x[rhs]], #24\n"
1662       "prfm pldl1keep, [%x[lhs], #64]\n"
1663       "prfm pldl1keep, [%x[rhs], #64]\n"
1664       "umull v7.8h, v4.8b, v3.8b\n"
1665       "umull v8.8h, v5.8b, v3.8b\n"
1666       "umull v9.8h, v6.8b, v3.8b\n"
1667       "uadalp v0.4s, v7.8h\n"
1668       "uadalp v1.4s, v8.8h\n"
1669       "uadalp v2.4s, v9.8h\n"
1670 
1671       // Loop break.
1672       "bgt 1b\n"
1673 
1674       // StaticQuantizationInt32::Prepare
1675       "ld1 {v4.4s}, [%x[lhs]], #16\n"
1676       "ld1 {v5.4s}, [%x[rhs]], #16\n"
1677       "dup v4.4s, v4.s[0]\n"
1678 
1679       // RowMajorOutput::Prepare
1680 
1681       // Reduce aggregators.
1682       "addp v0.4s, v0.4s, v1.4s\n"
1683       "addp v2.4s, v2.4s, v2.4s\n"
1684       "addp v0.4s, v0.4s, v2.4s\n"
1685 
1686       // StaticQuantizationInt32::Transform
1687       "add v0.4s, v0.4s, v4.4s\n"
1688       "add v0.4s, v0.4s, v5.4s\n"
1689 
1690       // RowMajorOutput::Output
1691       "st1 {v0.2s}, [%x[result]], #8\n"
1692       "st1 {v0.s}[2], [%x[result]], #4\n"
1693       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
1694       : [count] "r"(params.kernel.count),
1695         [stride] "r"(params.output_stream.stride)
1696       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "cc",
1697         "memory");
1698 }
1699 
1700 template <>
1701 inline void MulKernel<
1702     uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 4,
Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,RowMajor> & params,int32_t * result)1703     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
1704                  const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
1705                                          RowMajor>& params,
1706                  int32_t* result) {
1707 #ifdef DEBUG
1708 #ifdef DEBUG_METAGEMM_VERBOSE
1709   std::cout << __FILE__ << "(" << __LINE__
1710             << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
1711                "QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 4, "
1712                "8>::Multiply()"
1713             << std::endl
1714             << std::flush;
1715 #endif
1716 #endif
1717   asm volatile(
1718       "prfm pldl1keep, [%x[lhs]]\n"
1719       "prfm pldl1keep, [%x[rhs]]\n"
1720 
1721       // Clear aggregators.
1722       "movi v0.4s, #0\n"
1723       "movi v1.4s, #0\n"
1724       "movi v2.4s, #0\n"
1725       "mov v3.16b, v0.16b\n"
1726 
1727       // General NxM lanes loop.
1728       "1:"
1729 
1730       // Subtract counter.
1731       "subs %x[count], %x[count], #8\n"
1732 
1733       "ld1 {v4.2s}, [%x[lhs]], #8\n"
1734       "ld1 {v5.2s, v6.2s, v7.2s, v8.2s}, [%x[rhs]], #32\n"
1735       "prfm pldl1keep, [%x[lhs], #64]\n"
1736       "prfm pldl1keep, [%x[rhs], #64]\n"
1737       "umull v9.8h, v5.8b, v4.8b\n"
1738       "umull v10.8h, v6.8b, v4.8b\n"
1739       "umull v11.8h, v7.8b, v4.8b\n"
1740       "umull v12.8h, v8.8b, v4.8b\n"
1741       "uadalp v0.4s, v9.8h\n"
1742       "uadalp v1.4s, v10.8h\n"
1743       "uadalp v2.4s, v11.8h\n"
1744       "uadalp v3.4s, v12.8h\n"
1745 
1746       // Loop break.
1747       "bgt 1b\n"
1748 
1749       // StaticQuantizationInt32::Prepare
1750       "ld1 {v4.4s}, [%x[lhs]], #16\n"
1751       "ld1 {v5.4s}, [%x[rhs]], #16\n"
1752       "dup v4.4s, v4.s[0]\n"
1753 
1754       // RowMajorOutput::Prepare
1755 
1756       // Reduce aggregators.
1757       "addp v0.4s, v0.4s, v1.4s\n"
1758       "addp v2.4s, v2.4s, v3.4s\n"
1759       "addp v0.4s, v0.4s, v2.4s\n"
1760 
1761       // StaticQuantizationInt32::Transform
1762       "add v0.4s, v0.4s, v4.4s\n"
1763       "add v0.4s, v0.4s, v5.4s\n"
1764 
1765       // RowMajorOutput::Output
1766       "st1 {v0.4s}, [%x[result]], #16\n"
1767       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
1768       : [count] "r"(params.kernel.count),
1769         [stride] "r"(params.output_stream.stride)
1770       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
1771         "v11", "v12", "cc", "memory");
1772 }
1773 
1774 template <>
1775 inline void MulKernel<
1776     uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 5,
Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,RowMajor> & params,int32_t * result)1777     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
1778                  const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
1779                                          RowMajor>& params,
1780                  int32_t* result) {
1781 #ifdef DEBUG
1782 #ifdef DEBUG_METAGEMM_VERBOSE
1783   std::cout << __FILE__ << "(" << __LINE__
1784             << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
1785                "QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 5, "
1786                "8>::Multiply()"
1787             << std::endl
1788             << std::flush;
1789 #endif
1790 #endif
1791   asm volatile(
1792       "prfm pldl1keep, [%x[lhs]]\n"
1793       "prfm pldl1keep, [%x[rhs]]\n"
1794 
1795       // Clear aggregators.
1796       "movi v0.4s, #0\n"
1797       "movi v1.4s, #0\n"
1798       "movi v2.4s, #0\n"
1799       "mov v3.16b, v0.16b\n"
1800       "mov v4.16b, v1.16b\n"
1801 
1802       // General 1xM lanes loop.
1803       "1:"
1804 
1805       // Subtract counter.
1806       "subs %x[count], %x[count], #8\n"
1807 
1808       "ld1 {v5.2s, v6.2s, v7.2s, v8.2s}, [%x[rhs]], #32\n"
1809       "ld1 {v9.2s}, [%x[lhs]], #8\n"
1810       "prfm pldl1keep, [%x[lhs], #64]\n"
1811       "umull v10.8h, v5.8b, v9.8b\n"
1812       "umull v11.8h, v6.8b, v9.8b\n"
1813       "umull v12.8h, v7.8b, v9.8b\n"
1814       "umull v13.8h, v8.8b, v9.8b\n"
1815       "ld1 {v5.2s}, [%x[rhs]], #8\n"
1816       "prfm pldl1keep, [%x[rhs], #128]\n"
1817       "uadalp v0.4s, v10.8h\n"
1818       "uadalp v1.4s, v11.8h\n"
1819       "uadalp v2.4s, v12.8h\n"
1820       "uadalp v3.4s, v13.8h\n"
1821       "umull v10.8h, v5.8b, v9.8b\n"
1822       "uadalp v4.4s, v10.8h\n"
1823 
1824       // Loop break.
1825       "bgt 1b\n"
1826 
1827       // StaticQuantizationInt32::Prepare
1828       "ld1 {v5.4s}, [%x[lhs]], #16\n"
1829       "ld1 {v6.4s, v7.4s}, [%x[rhs]], #32\n"
1830       "dup v5.4s, v5.s[0]\n"
1831 
1832       // RowMajorOutput::Prepare
1833 
1834       // Reduce aggregators.
1835       "addp v0.4s, v0.4s, v1.4s\n"
1836       "addp v2.4s, v2.4s, v3.4s\n"
1837       "addp v4.4s, v4.4s, v4.4s\n"
1838       "addp v0.4s, v0.4s, v2.4s\n"
1839       "addp v1.4s, v4.4s, v4.4s\n"
1840 
1841       // StaticQuantizationInt32::Transform
1842       "add v0.4s, v0.4s, v5.4s\n"
1843       "add v1.4s, v1.4s, v5.4s\n"
1844       "add v0.4s, v0.4s, v6.4s\n"
1845       "add v1.4s, v1.4s, v7.4s\n"
1846 
1847       // RowMajorOutput::Output
1848       "st1 {v0.4s}, [%x[result]], #16\n"
1849       "st1 {v1.s}[0], [%x[result]], #4\n"
1850       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
1851       : [count] "r"(params.kernel.count),
1852         [stride] "r"(params.output_stream.stride)
1853       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
1854         "v11", "v12", "v13", "cc", "memory");
1855 }
1856 
1857 template <>
1858 inline void MulKernel<
1859     uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 6,
Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,RowMajor> & params,int32_t * result)1860     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
1861                  const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
1862                                          RowMajor>& params,
1863                  int32_t* result) {
1864 #ifdef DEBUG
1865 #ifdef DEBUG_METAGEMM_VERBOSE
1866   std::cout << __FILE__ << "(" << __LINE__
1867             << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
1868                "QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 6, "
1869                "8>::Multiply()"
1870             << std::endl
1871             << std::flush;
1872 #endif
1873 #endif
1874   asm volatile(
1875       "prfm pldl1keep, [%x[lhs]]\n"
1876       "prfm pldl1keep, [%x[rhs]]\n"
1877 
1878       // Clear aggregators.
1879       "movi v0.4s, #0\n"
1880       "movi v1.4s, #0\n"
1881       "movi v2.4s, #0\n"
1882       "mov v3.16b, v0.16b\n"
1883       "mov v4.16b, v1.16b\n"
1884       "mov v5.16b, v2.16b\n"
1885 
1886       // General 1xM lanes loop.
1887       "1:"
1888 
1889       // Subtract counter.
1890       "subs %x[count], %x[count], #8\n"
1891 
1892       "ld1 {v6.2s, v7.2s, v8.2s, v9.2s}, [%x[rhs]], #32\n"
1893       "ld1 {v10.2s}, [%x[lhs]], #8\n"
1894       "prfm pldl1keep, [%x[lhs], #64]\n"
1895       "umull v11.8h, v6.8b, v10.8b\n"
1896       "umull v12.8h, v7.8b, v10.8b\n"
1897       "umull v13.8h, v8.8b, v10.8b\n"
1898       "umull v14.8h, v9.8b, v10.8b\n"
1899       "ld1 {v6.2s, v7.2s}, [%x[rhs]], #16\n"
1900       "prfm pldl1keep, [%x[rhs], #128]\n"
1901       "uadalp v0.4s, v11.8h\n"
1902       "uadalp v1.4s, v12.8h\n"
1903       "uadalp v2.4s, v13.8h\n"
1904       "uadalp v3.4s, v14.8h\n"
1905       "umull v11.8h, v6.8b, v10.8b\n"
1906       "umull v12.8h, v7.8b, v10.8b\n"
1907       "uadalp v4.4s, v11.8h\n"
1908       "uadalp v5.4s, v12.8h\n"
1909 
1910       // Loop break.
1911       "bgt 1b\n"
1912 
1913       // StaticQuantizationInt32::Prepare
1914       "ld1 {v6.4s}, [%x[lhs]], #16\n"
1915       "ld1 {v7.4s, v8.4s}, [%x[rhs]], #32\n"
1916       "dup v6.4s, v6.s[0]\n"
1917 
1918       // RowMajorOutput::Prepare
1919 
1920       // Reduce aggregators.
1921       "addp v0.4s, v0.4s, v1.4s\n"
1922       "addp v2.4s, v2.4s, v3.4s\n"
1923       "addp v4.4s, v4.4s, v5.4s\n"
1924       "addp v0.4s, v0.4s, v2.4s\n"
1925       "addp v1.4s, v4.4s, v4.4s\n"
1926 
1927       // StaticQuantizationInt32::Transform
1928       "add v0.4s, v0.4s, v6.4s\n"
1929       "add v1.4s, v1.4s, v6.4s\n"
1930       "add v0.4s, v0.4s, v7.4s\n"
1931       "add v1.4s, v1.4s, v8.4s\n"
1932 
1933       // RowMajorOutput::Output
1934       "st1 {v0.4s}, [%x[result]], #16\n"
1935       "st1 {v1.2s}, [%x[result]], #8\n"
1936       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
1937       : [count] "r"(params.kernel.count),
1938         [stride] "r"(params.output_stream.stride)
1939       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
1940         "v11", "v12", "v13", "v14", "cc", "memory");
1941 }
1942 
1943 template <>
1944 inline void MulKernel<
1945     uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 7,
Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,RowMajor> & params,int32_t * result)1946     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
1947                  const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
1948                                          RowMajor>& params,
1949                  int32_t* result) {
1950 #ifdef DEBUG
1951 #ifdef DEBUG_METAGEMM_VERBOSE
1952   std::cout << __FILE__ << "(" << __LINE__
1953             << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
1954                "QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 7, "
1955                "8>::Multiply()"
1956             << std::endl
1957             << std::flush;
1958 #endif
1959 #endif
1960   asm volatile(
1961       "prfm pldl1keep, [%x[lhs]]\n"
1962       "prfm pldl1keep, [%x[rhs]]\n"
1963 
1964       // Clear aggregators.
1965       "movi v0.4s, #0\n"
1966       "movi v1.4s, #0\n"
1967       "movi v2.4s, #0\n"
1968       "mov v3.16b, v0.16b\n"
1969       "mov v4.16b, v1.16b\n"
1970       "mov v5.16b, v2.16b\n"
1971       "mov v6.16b, v3.16b\n"
1972 
1973       // General 1xM lanes loop.
1974       "1:"
1975 
1976       // Subtract counter.
1977       "subs %x[count], %x[count], #8\n"
1978 
1979       "ld1 {v7.2s, v8.2s, v9.2s, v10.2s}, [%x[rhs]], #32\n"
1980       "ld1 {v11.2s}, [%x[lhs]], #8\n"
1981       "prfm pldl1keep, [%x[lhs], #64]\n"
1982       "umull v12.8h, v7.8b, v11.8b\n"
1983       "umull v13.8h, v8.8b, v11.8b\n"
1984       "umull v14.8h, v9.8b, v11.8b\n"
1985       "umull v15.8h, v10.8b, v11.8b\n"
1986       "ld1 {v7.2s, v8.2s, v9.2s}, [%x[rhs]], #24\n"
1987       "prfm pldl1keep, [%x[rhs], #128]\n"
1988       "uadalp v0.4s, v12.8h\n"
1989       "uadalp v1.4s, v13.8h\n"
1990       "uadalp v2.4s, v14.8h\n"
1991       "uadalp v3.4s, v15.8h\n"
1992       "umull v12.8h, v7.8b, v11.8b\n"
1993       "umull v13.8h, v8.8b, v11.8b\n"
1994       "umull v14.8h, v9.8b, v11.8b\n"
1995       "uadalp v4.4s, v12.8h\n"
1996       "uadalp v5.4s, v13.8h\n"
1997       "uadalp v6.4s, v14.8h\n"
1998 
1999       // Loop break.
2000       "bgt 1b\n"
2001 
2002       // StaticQuantizationInt32::Prepare
2003       "ld1 {v7.4s}, [%x[lhs]], #16\n"
2004       "ld1 {v8.4s, v9.4s}, [%x[rhs]], #32\n"
2005       "dup v7.4s, v7.s[0]\n"
2006 
2007       // RowMajorOutput::Prepare
2008 
2009       // Reduce aggregators.
2010       "addp v0.4s, v0.4s, v1.4s\n"
2011       "addp v2.4s, v2.4s, v3.4s\n"
2012       "addp v4.4s, v4.4s, v5.4s\n"
2013       "addp v6.4s, v6.4s, v6.4s\n"
2014       "addp v0.4s, v0.4s, v2.4s\n"
2015       "addp v1.4s, v4.4s, v6.4s\n"
2016 
2017       // StaticQuantizationInt32::Transform
2018       "add v0.4s, v0.4s, v7.4s\n"
2019       "add v1.4s, v1.4s, v7.4s\n"
2020       "add v0.4s, v0.4s, v8.4s\n"
2021       "add v1.4s, v1.4s, v9.4s\n"
2022 
2023       // RowMajorOutput::Output
2024       "st1 {v0.4s}, [%x[result]], #16\n"
2025       "st1 {v1.2s}, [%x[result]], #8\n"
2026       "st1 {v1.s}[2], [%x[result]], #4\n"
2027       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
2028       : [count] "r"(params.kernel.count),
2029         [stride] "r"(params.output_stream.stride)
2030       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
2031         "v11", "v12", "v13", "v14", "v15", "cc", "memory");
2032 }
2033 
2034 template <>
2035 inline void MulKernel<
2036     uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 8,
Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,RowMajor> & params,int32_t * result)2037     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
2038                  const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
2039                                          RowMajor>& params,
2040                  int32_t* result) {
2041 #ifdef DEBUG
2042 #ifdef DEBUG_METAGEMM_VERBOSE
2043   std::cout << __FILE__ << "(" << __LINE__
2044             << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
2045                "QuantizedStaticPreprocessedAsInt32, RowMajor, 1, 8, "
2046                "8>::Multiply()"
2047             << std::endl
2048             << std::flush;
2049 #endif
2050 #endif
2051   asm volatile(
2052       "prfm pldl1keep, [%x[lhs]]\n"
2053       "prfm pldl1keep, [%x[rhs]]\n"
2054 
2055       // Clear aggregators.
2056       "movi v0.4s, #0\n"
2057       "movi v1.4s, #0\n"
2058       "movi v2.4s, #0\n"
2059       "mov v3.16b, v0.16b\n"
2060       "mov v4.16b, v1.16b\n"
2061       "mov v5.16b, v2.16b\n"
2062       "mov v6.16b, v3.16b\n"
2063       "mov v7.16b, v4.16b\n"
2064 
2065       // 1x8 lanes loop.
2066       "1:"
2067 
2068       "ld1 {v9.2s, v10.2s, v11.2s, v12.2s}, [%x[rhs]], #32\n"
2069       "ld1 {v8.2s}, [%x[lhs]], #8\n"
2070       "umull v13.8h, v8.8b, v9.8b\n"
2071       "umull v14.8h, v8.8b, v10.8b\n"
2072       "umull v15.8h, v8.8b, v11.8b\n"
2073       "umull v16.8h, v8.8b, v12.8b\n"
2074       "ld1 {v9.2s, v10.2s, v11.2s, v12.2s}, [%x[rhs]], #32\n"
2075       "uadalp v0.4s, v13.8h\n"
2076       "uadalp v1.4s, v14.8h\n"
2077       "uadalp v2.4s, v15.8h\n"
2078       "uadalp v3.4s, v16.8h\n"
2079       "prfm pldl1keep, [%x[rhs], #256]\n"
2080       "umull v17.8h, v8.8b, v9.8b\n"
2081       "umull v13.8h, v8.8b, v10.8b\n"
2082       "umull v14.8h, v8.8b, v11.8b\n"
2083       "umull v15.8h, v8.8b, v12.8b\n"
2084       "prfm pldl1keep, [%x[lhs], #32]\n"
2085 
2086       // Subtract counter.
2087       "subs %x[count], %x[count], #8\n"
2088 
2089       "uadalp v4.4s, v17.8h\n"
2090       "uadalp v5.4s, v13.8h\n"
2091       "uadalp v6.4s, v14.8h\n"
2092       "uadalp v7.4s, v15.8h\n"
2093 
2094       // Loop break.
2095       "bgt 1b\n"
2096 
2097       // StaticQuantizationInt32::Prepare
2098       "ld1 {v8.4s}, [%x[lhs]], #16\n"
2099       "ld1 {v9.4s, v10.4s}, [%x[rhs]], #32\n"
2100       "dup v8.4s, v8.s[0]\n"
2101 
2102       // RowMajorOutput::Prepare
2103 
2104       // Reduce aggregators.
2105       "addp v0.4s, v0.4s, v1.4s\n"
2106       "addp v2.4s, v2.4s, v3.4s\n"
2107       "addp v4.4s, v4.4s, v5.4s\n"
2108       "addp v6.4s, v6.4s, v7.4s\n"
2109       "addp v0.4s, v0.4s, v2.4s\n"
2110       "addp v1.4s, v4.4s, v6.4s\n"
2111 
2112       // StaticQuantizationInt32::Transform
2113       "add v0.4s, v0.4s, v8.4s\n"
2114       "add v1.4s, v1.4s, v8.4s\n"
2115       "add v0.4s, v0.4s, v9.4s\n"
2116       "add v1.4s, v1.4s, v10.4s\n"
2117 
2118       // RowMajorOutput::Output
2119       "st1 {v0.4s, v1.4s}, [%x[result]], #32\n"
2120       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
2121       : [count] "r"(params.kernel.count),
2122         [stride] "r"(params.output_stream.stride)
2123       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
2124         "v11", "v12", "v13", "v14", "v15", "v16", "v17", "cc", "memory");
2125 }
2126 
2127 template <>
2128 inline void MulKernel<
2129     uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 2, 1,
Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,RowMajor> & params,int32_t * result)2130     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
2131                  const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
2132                                          RowMajor>& params,
2133                  int32_t* result) {
2134 #ifdef DEBUG
2135 #ifdef DEBUG_METAGEMM_VERBOSE
2136   std::cout << __FILE__ << "(" << __LINE__
2137             << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
2138                "QuantizedStaticPreprocessedAsInt32, RowMajor, 2, 1, "
2139                "8>::Multiply()"
2140             << std::endl
2141             << std::flush;
2142 #endif
2143 #endif
2144   asm volatile(
2145       "prfm pldl1keep, [%x[lhs]]\n"
2146       "prfm pldl1keep, [%x[rhs]]\n"
2147 
2148       // Clear aggregators.
2149       "movi v0.4s, #0\n"
2150       "movi v1.4s, #0\n"
2151 
2152       // General NxM lanes loop.
2153       "1:"
2154 
2155       // Subtract counter.
2156       "subs %x[count], %x[count], #8\n"
2157 
2158       "ld1 {v2.2s, v3.2s}, [%x[lhs]], #16\n"
2159       "ld1 {v4.2s}, [%x[rhs]], #8\n"
2160       "prfm pldl1keep, [%x[lhs], #64]\n"
2161       "prfm pldl1keep, [%x[rhs], #64]\n"
2162       "umull v5.8h, v4.8b, v2.8b\n"
2163       "umull v6.8h, v4.8b, v3.8b\n"
2164       "uadalp v0.4s, v5.8h\n"
2165       "uadalp v1.4s, v6.8h\n"
2166 
2167       // Loop break.
2168       "bgt 1b\n"
2169 
2170       // StaticQuantizationInt32::Prepare
2171       "ld1 {v4.4s}, [%x[lhs]], #16\n"
2172       "ld1 {v5.4s}, [%x[rhs]], #16\n"
2173       "dup v2.4s, v4.s[0]\n"
2174       "dup v4.4s, v4.s[1]\n"
2175 
2176       // RowMajorOutput::Prepare
2177       "add x0, %x[result], %x[stride]\n"
2178 
2179       // Reduce aggregators.
2180       "addp v0.4s, v0.4s, v0.4s\n"
2181       "addp v0.4s, v0.4s, v0.4s\n"
2182       "addp v1.4s, v1.4s, v1.4s\n"
2183       "addp v1.4s, v1.4s, v1.4s\n"
2184 
2185       // StaticQuantizationInt32::Transform
2186       "add v0.4s, v0.4s, v2.4s\n"
2187       "add v1.4s, v1.4s, v4.4s\n"
2188       "add v0.4s, v0.4s, v5.4s\n"
2189       "add v1.4s, v1.4s, v5.4s\n"
2190 
2191       // RowMajorOutput::Output
2192       "st1 {v0.s}[0], [%x[result]], #4\n"
2193       "st1 {v1.s}[0], [x0], #4\n"
2194       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
2195       : [count] "r"(params.kernel.count),
2196         [stride] "r"(params.output_stream.stride)
2197       : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
2198 }
2199 
2200 template <>
2201 inline void MulKernel<
2202     uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 2, 2,
Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,RowMajor> & params,int32_t * result)2203     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
2204                  const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
2205                                          RowMajor>& params,
2206                  int32_t* result) {
2207 #ifdef DEBUG
2208 #ifdef DEBUG_METAGEMM_VERBOSE
2209   std::cout << __FILE__ << "(" << __LINE__
2210             << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
2211                "QuantizedStaticPreprocessedAsInt32, RowMajor, 2, 2, "
2212                "8>::Multiply()"
2213             << std::endl
2214             << std::flush;
2215 #endif
2216 #endif
2217   asm volatile(
2218       "prfm pldl1keep, [%x[lhs]]\n"
2219       "prfm pldl1keep, [%x[rhs]]\n"
2220 
2221       // Clear aggregators.
2222       "movi v0.4s, #0\n"
2223       "movi v1.4s, #0\n"
2224       "movi v2.4s, #0\n"
2225       "mov v3.16b, v0.16b\n"
2226 
2227       // General NxM lanes loop.
2228       "1:"
2229 
2230       // Subtract counter.
2231       "subs %x[count], %x[count], #8\n"
2232 
2233       "ld1 {v4.2s, v5.2s}, [%x[lhs]], #16\n"
2234       "ld1 {v6.2s, v7.2s}, [%x[rhs]], #16\n"
2235       "prfm pldl1keep, [%x[lhs], #64]\n"
2236       "prfm pldl1keep, [%x[rhs], #64]\n"
2237       "umull v8.8h, v6.8b, v4.8b\n"
2238       "umull v9.8h, v7.8b, v4.8b\n"
2239       "umull v10.8h, v6.8b, v5.8b\n"
2240       "umull v11.8h, v7.8b, v5.8b\n"
2241       "uadalp v0.4s, v8.8h\n"
2242       "uadalp v1.4s, v9.8h\n"
2243       "uadalp v2.4s, v10.8h\n"
2244       "uadalp v3.4s, v11.8h\n"
2245 
2246       // Loop break.
2247       "bgt 1b\n"
2248 
2249       // StaticQuantizationInt32::Prepare
2250       "ld1 {v4.4s}, [%x[lhs]], #16\n"
2251       "ld1 {v5.4s}, [%x[rhs]], #16\n"
2252       "dup v6.4s, v4.s[0]\n"
2253       "dup v4.4s, v4.s[1]\n"
2254 
2255       // RowMajorOutput::Prepare
2256       "add x0, %x[result], %x[stride]\n"
2257 
2258       // Reduce aggregators.
2259       "addp v0.4s, v0.4s, v1.4s\n"
2260       "addp v0.4s, v0.4s, v0.4s\n"
2261       "addp v2.4s, v2.4s, v3.4s\n"
2262       "addp v2.4s, v2.4s, v2.4s\n"
2263 
2264       // StaticQuantizationInt32::Transform
2265       "add v0.4s, v0.4s, v6.4s\n"
2266       "add v2.4s, v2.4s, v4.4s\n"
2267       "add v0.4s, v0.4s, v5.4s\n"
2268       "add v2.4s, v2.4s, v5.4s\n"
2269 
2270       // RowMajorOutput::Output
2271       "st1 {v0.2s}, [%x[result]], #8\n"
2272       "st1 {v2.2s}, [x0], #8\n"
2273       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
2274       : [count] "r"(params.kernel.count),
2275         [stride] "r"(params.output_stream.stride)
2276       : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
2277         "v11", "cc", "memory");
2278 }
2279 
2280 template <>
2281 inline void MulKernel<
2282     uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 2, 3,
Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,RowMajor> & params,int32_t * result)2283     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
2284                  const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
2285                                          RowMajor>& params,
2286                  int32_t* result) {
2287 #ifdef DEBUG
2288 #ifdef DEBUG_METAGEMM_VERBOSE
2289   std::cout << __FILE__ << "(" << __LINE__
2290             << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
2291                "QuantizedStaticPreprocessedAsInt32, RowMajor, 2, 3, "
2292                "8>::Multiply()"
2293             << std::endl
2294             << std::flush;
2295 #endif
2296 #endif
2297   asm volatile(
2298       "prfm pldl1keep, [%x[lhs]]\n"
2299       "prfm pldl1keep, [%x[rhs]]\n"
2300 
2301       // Clear aggregators.
2302       "movi v0.4s, #0\n"
2303       "movi v1.4s, #0\n"
2304       "movi v2.4s, #0\n"
2305       "mov v3.16b, v0.16b\n"
2306       "mov v4.16b, v1.16b\n"
2307       "mov v5.16b, v2.16b\n"
2308 
2309       // General NxM lanes loop.
2310       "1:"
2311 
2312       // Subtract counter.
2313       "subs %x[count], %x[count], #8\n"
2314 
2315       "ld1 {v6.2s, v7.2s}, [%x[lhs]], #16\n"
2316       "ld1 {v8.2s, v9.2s, v10.2s}, [%x[rhs]], #24\n"
2317       "prfm pldl1keep, [%x[lhs], #64]\n"
2318       "prfm pldl1keep, [%x[rhs], #64]\n"
2319       "umull v11.8h, v8.8b, v6.8b\n"
2320       "umull v12.8h, v9.8b, v6.8b\n"
2321       "umull v13.8h, v10.8b, v6.8b\n"
2322       "umull v14.8h, v8.8b, v7.8b\n"
2323       "umull v15.8h, v9.8b, v7.8b\n"
2324       "umull v16.8h, v10.8b, v7.8b\n"
2325       "uadalp v0.4s, v11.8h\n"
2326       "uadalp v1.4s, v12.8h\n"
2327       "uadalp v2.4s, v13.8h\n"
2328       "uadalp v3.4s, v14.8h\n"
2329       "uadalp v4.4s, v15.8h\n"
2330       "uadalp v5.4s, v16.8h\n"
2331 
2332       // Loop break.
2333       "bgt 1b\n"
2334 
2335       // StaticQuantizationInt32::Prepare
2336       "ld1 {v6.4s}, [%x[lhs]], #16\n"
2337       "ld1 {v7.4s}, [%x[rhs]], #16\n"
2338       "dup v8.4s, v6.s[0]\n"
2339       "dup v6.4s, v6.s[1]\n"
2340 
2341       // RowMajorOutput::Prepare
2342       "add x0, %x[result], %x[stride]\n"
2343 
2344       // Reduce aggregators.
2345       "addp v0.4s, v0.4s, v1.4s\n"
2346       "addp v2.4s, v2.4s, v2.4s\n"
2347       "addp v0.4s, v0.4s, v2.4s\n"
2348       "addp v3.4s, v3.4s, v4.4s\n"
2349       "addp v5.4s, v5.4s, v5.4s\n"
2350       "addp v3.4s, v3.4s, v5.4s\n"
2351 
2352       // StaticQuantizationInt32::Transform
2353       "add v0.4s, v0.4s, v8.4s\n"
2354       "add v3.4s, v3.4s, v6.4s\n"
2355       "add v0.4s, v0.4s, v7.4s\n"
2356       "add v3.4s, v3.4s, v7.4s\n"
2357 
2358       // RowMajorOutput::Output
2359       "st1 {v0.2s}, [%x[result]], #8\n"
2360       "st1 {v0.s}[2], [%x[result]], #4\n"
2361       "st1 {v3.2s}, [x0], #8\n"
2362       "st1 {v3.s}[2], [x0], #4\n"
2363       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
2364       : [count] "r"(params.kernel.count),
2365         [stride] "r"(params.output_stream.stride)
2366       : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
2367         "v11", "v12", "v13", "v14", "v15", "v16", "cc", "memory");
2368 }
2369 
2370 template <>
2371 inline void MulKernel<
2372     uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 2, 4,
Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,RowMajor> & params,int32_t * result)2373     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
2374                  const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
2375                                          RowMajor>& params,
2376                  int32_t* result) {
2377 #ifdef DEBUG
2378 #ifdef DEBUG_METAGEMM_VERBOSE
2379   std::cout << __FILE__ << "(" << __LINE__
2380             << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
2381                "QuantizedStaticPreprocessedAsInt32, RowMajor, 2, 4, "
2382                "8>::Multiply()"
2383             << std::endl
2384             << std::flush;
2385 #endif
2386 #endif
2387   asm volatile(
2388       "prfm pldl1keep, [%x[lhs]]\n"
2389       "prfm pldl1keep, [%x[rhs]]\n"
2390 
2391       // Clear aggregators.
2392       "movi v0.4s, #0\n"
2393       "movi v1.4s, #0\n"
2394       "movi v2.4s, #0\n"
2395       "mov v3.16b, v0.16b\n"
2396       "mov v4.16b, v1.16b\n"
2397       "mov v5.16b, v2.16b\n"
2398       "mov v6.16b, v3.16b\n"
2399       "mov v7.16b, v4.16b\n"
2400 
2401       // 2x4 lanes loop.
2402       "1:"
2403 
2404       "ld1 {v10.8b, v11.8b, v12.8b, v13.8b}, [%x[rhs]], #32\n"
2405       "ld1 {v8.8b}, [%x[lhs]], #8\n"
2406       "umull v14.8h, v8.8b, v10.8b\n"
2407       "ld1 {v9.8b}, [%x[lhs]], #8\n"
2408       "umull v15.8h, v8.8b, v11.8b\n"
2409       "prfm pldl1keep, [%x[rhs], #64]\n"
2410       "umull v16.8h, v8.8b, v12.8b\n"
2411       "prfm pldl1keep, [%x[lhs], #64]\n"
2412       "umull v17.8h, v8.8b, v13.8b\n"
2413       "umull v18.8h, v9.8b, v10.8b\n"
2414       "uadalp v0.4s, v14.8h\n"
2415       "uadalp v1.4s, v15.8h\n"
2416       "uadalp v2.4s, v16.8h\n"
2417       "umull v14.8h, v9.8b, v11.8b\n"
2418       "umull v15.8h, v9.8b, v12.8b\n"
2419       "umull v16.8h, v9.8b, v13.8b\n"
2420 
2421       // Subtract counter.
2422       "subs %x[count], %x[count], #8\n"
2423 
2424       "uadalp v3.4s, v17.8h\n"
2425       "uadalp v4.4s, v18.8h\n"
2426       "uadalp v5.4s, v14.8h\n"
2427       "uadalp v6.4s, v15.8h\n"
2428       "uadalp v7.4s, v16.8h\n"
2429 
2430       // Loop break.
2431       "bgt 1b\n"
2432 
2433       // StaticQuantizationInt32::Prepare
2434       "ld1 {v8.4s}, [%x[lhs]], #16\n"
2435       "ld1 {v9.4s}, [%x[rhs]], #16\n"
2436       "dup v10.4s, v8.s[0]\n"
2437       "dup v8.4s, v8.s[1]\n"
2438 
2439       // RowMajorOutput::Prepare
2440       "add x0, %x[result], %x[stride]\n"
2441 
2442       // Reduce aggregators.
2443       "addp v0.4s, v0.4s, v1.4s\n"
2444       "addp v2.4s, v2.4s, v3.4s\n"
2445       "addp v0.4s, v0.4s, v2.4s\n"
2446       "addp v4.4s, v4.4s, v5.4s\n"
2447       "addp v6.4s, v6.4s, v7.4s\n"
2448       "addp v4.4s, v4.4s, v6.4s\n"
2449 
2450       // StaticQuantizationInt32::Transform
2451       "add v0.4s, v0.4s, v10.4s\n"
2452       "add v4.4s, v4.4s, v8.4s\n"
2453       "add v0.4s, v0.4s, v9.4s\n"
2454       "add v4.4s, v4.4s, v9.4s\n"
2455 
2456       // RowMajorOutput::Output
2457       "st1 {v0.4s}, [%x[result]], #16\n"
2458       "st1 {v4.4s}, [x0], #16\n"
2459       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
2460       : [count] "r"(params.kernel.count),
2461         [stride] "r"(params.output_stream.stride)
2462       : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
2463         "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "cc", "memory");
2464 }
2465 
2466 template <>
2467 inline void MulKernel<
2468     uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 3, 1,
Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,RowMajor> & params,int32_t * result)2469     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
2470                  const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
2471                                          RowMajor>& params,
2472                  int32_t* result) {
2473 #ifdef DEBUG
2474 #ifdef DEBUG_METAGEMM_VERBOSE
2475   std::cout << __FILE__ << "(" << __LINE__
2476             << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
2477                "QuantizedStaticPreprocessedAsInt32, RowMajor, 3, 1, "
2478                "8>::Multiply()"
2479             << std::endl
2480             << std::flush;
2481 #endif
2482 #endif
2483   asm volatile(
2484       "prfm pldl1keep, [%x[lhs]]\n"
2485       "prfm pldl1keep, [%x[rhs]]\n"
2486 
2487       // Clear aggregators.
2488       "movi v0.4s, #0\n"
2489       "movi v1.4s, #0\n"
2490       "movi v2.4s, #0\n"
2491 
2492       // General NxM lanes loop.
2493       "1:"
2494 
2495       // Subtract counter.
2496       "subs %x[count], %x[count], #8\n"
2497 
2498       "ld1 {v3.2s, v4.2s, v5.2s}, [%x[lhs]], #24\n"
2499       "ld1 {v6.2s}, [%x[rhs]], #8\n"
2500       "prfm pldl1keep, [%x[lhs], #64]\n"
2501       "prfm pldl1keep, [%x[rhs], #64]\n"
2502       "umull v7.8h, v6.8b, v3.8b\n"
2503       "umull v8.8h, v6.8b, v4.8b\n"
2504       "umull v9.8h, v6.8b, v5.8b\n"
2505       "uadalp v0.4s, v7.8h\n"
2506       "uadalp v1.4s, v8.8h\n"
2507       "uadalp v2.4s, v9.8h\n"
2508 
2509       // Loop break.
2510       "bgt 1b\n"
2511 
2512       // StaticQuantizationInt32::Prepare
2513       "ld1 {v4.4s}, [%x[lhs]], #16\n"
2514       "ld1 {v5.4s}, [%x[rhs]], #16\n"
2515       "dup v3.4s, v4.s[0]\n"
2516       "dup v6.4s, v4.s[1]\n"
2517       "dup v4.4s, v4.s[2]\n"
2518 
2519       // RowMajorOutput::Prepare
2520       "add x0, %x[result], %x[stride]\n"
2521       "add x1, x0, %x[stride]\n"
2522 
2523       // Reduce aggregators.
2524       "addp v0.4s, v0.4s, v0.4s\n"
2525       "addp v0.4s, v0.4s, v0.4s\n"
2526       "addp v1.4s, v1.4s, v1.4s\n"
2527       "addp v1.4s, v1.4s, v1.4s\n"
2528       "addp v2.4s, v2.4s, v2.4s\n"
2529       "addp v2.4s, v2.4s, v2.4s\n"
2530 
2531       // StaticQuantizationInt32::Transform
2532       "add v0.4s, v0.4s, v3.4s\n"
2533       "add v1.4s, v1.4s, v6.4s\n"
2534       "add v2.4s, v2.4s, v4.4s\n"
2535       "add v0.4s, v0.4s, v5.4s\n"
2536       "add v1.4s, v1.4s, v5.4s\n"
2537       "add v2.4s, v2.4s, v5.4s\n"
2538 
2539       // RowMajorOutput::Output
2540       "st1 {v0.s}[0], [%x[result]], #4\n"
2541       "st1 {v1.s}[0], [x0], #4\n"
2542       "st1 {v2.s}[0], [x1], #4\n"
2543       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
2544       : [count] "r"(params.kernel.count),
2545         [stride] "r"(params.output_stream.stride)
2546       : "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
2547         "cc", "memory");
2548 }
2549 
2550 template <>
2551 inline void MulKernel<
2552     uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 3, 2,
Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,RowMajor> & params,int32_t * result)2553     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
2554                  const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
2555                                          RowMajor>& params,
2556                  int32_t* result) {
2557 #ifdef DEBUG
2558 #ifdef DEBUG_METAGEMM_VERBOSE
2559   std::cout << __FILE__ << "(" << __LINE__
2560             << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
2561                "QuantizedStaticPreprocessedAsInt32, RowMajor, 3, 2, "
2562                "8>::Multiply()"
2563             << std::endl
2564             << std::flush;
2565 #endif
2566 #endif
2567   asm volatile(
2568       "prfm pldl1keep, [%x[lhs]]\n"
2569       "prfm pldl1keep, [%x[rhs]]\n"
2570 
2571       // Clear aggregators.
2572       "movi v0.4s, #0\n"
2573       "movi v1.4s, #0\n"
2574       "movi v2.4s, #0\n"
2575       "mov v3.16b, v0.16b\n"
2576       "mov v4.16b, v1.16b\n"
2577       "mov v5.16b, v2.16b\n"
2578 
2579       // General NxM lanes loop.
2580       "1:"
2581 
2582       // Subtract counter.
2583       "subs %x[count], %x[count], #8\n"
2584 
2585       "ld1 {v6.2s, v7.2s, v8.2s}, [%x[lhs]], #24\n"
2586       "ld1 {v9.2s, v10.2s}, [%x[rhs]], #16\n"
2587       "prfm pldl1keep, [%x[lhs], #64]\n"
2588       "prfm pldl1keep, [%x[rhs], #64]\n"
2589       "umull v11.8h, v9.8b, v6.8b\n"
2590       "umull v12.8h, v10.8b, v6.8b\n"
2591       "umull v13.8h, v9.8b, v7.8b\n"
2592       "umull v14.8h, v10.8b, v7.8b\n"
2593       "umull v15.8h, v9.8b, v8.8b\n"
2594       "umull v16.8h, v10.8b, v8.8b\n"
2595       "uadalp v0.4s, v11.8h\n"
2596       "uadalp v1.4s, v12.8h\n"
2597       "uadalp v2.4s, v13.8h\n"
2598       "uadalp v3.4s, v14.8h\n"
2599       "uadalp v4.4s, v15.8h\n"
2600       "uadalp v5.4s, v16.8h\n"
2601 
2602       // Loop break.
2603       "bgt 1b\n"
2604 
2605       // StaticQuantizationInt32::Prepare
2606       "ld1 {v6.4s}, [%x[lhs]], #16\n"
2607       "ld1 {v7.4s}, [%x[rhs]], #16\n"
2608       "dup v8.4s, v6.s[0]\n"
2609       "dup v9.4s, v6.s[1]\n"
2610       "dup v6.4s, v6.s[2]\n"
2611 
2612       // RowMajorOutput::Prepare
2613       "add x0, %x[result], %x[stride]\n"
2614       "add x1, x0, %x[stride]\n"
2615 
2616       // Reduce aggregators.
2617       "addp v0.4s, v0.4s, v1.4s\n"
2618       "addp v0.4s, v0.4s, v0.4s\n"
2619       "addp v2.4s, v2.4s, v3.4s\n"
2620       "addp v2.4s, v2.4s, v2.4s\n"
2621       "addp v4.4s, v4.4s, v5.4s\n"
2622       "addp v4.4s, v4.4s, v4.4s\n"
2623 
2624       // StaticQuantizationInt32::Transform
2625       "add v0.4s, v0.4s, v8.4s\n"
2626       "add v2.4s, v2.4s, v9.4s\n"
2627       "add v4.4s, v4.4s, v6.4s\n"
2628       "add v0.4s, v0.4s, v7.4s\n"
2629       "add v2.4s, v2.4s, v7.4s\n"
2630       "add v4.4s, v4.4s, v7.4s\n"
2631 
2632       // RowMajorOutput::Output
2633       "st1 {v0.2s}, [%x[result]], #8\n"
2634       "st1 {v2.2s}, [x0], #8\n"
2635       "st1 {v4.2s}, [x1], #8\n"
2636       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
2637       : [count] "r"(params.kernel.count),
2638         [stride] "r"(params.output_stream.stride)
2639       : "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
2640         "v10", "v11", "v12", "v13", "v14", "v15", "v16", "cc", "memory");
2641 }
2642 
2643 template <>
2644 inline void MulKernel<
2645     uint8_t, int32_t, QuantizedStaticPreprocessedAsInt32, RowMajor, 3, 3,
Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,RowMajor> & params,int32_t * result)2646     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
2647                  const FusedKernelParams<QuantizedStaticPreprocessedAsInt32,
2648                                          RowMajor>& params,
2649                  int32_t* result) {
2650 #ifdef DEBUG
2651 #ifdef DEBUG_METAGEMM_VERBOSE
2652   std::cout << __FILE__ << "(" << __LINE__
2653             << ") QuantizedStaticPreprocessedAsInt32RowMajor<uint8_t, int32_t, "
2654                "QuantizedStaticPreprocessedAsInt32, RowMajor, 3, 3, "
2655                "8>::Multiply()"
2656             << std::endl
2657             << std::flush;
2658 #endif
2659 #endif
2660   asm volatile(
2661       "prfm pldl1keep, [%x[lhs]]\n"
2662       "prfm pldl1keep, [%x[rhs]]\n"
2663 
2664       // Clear aggregators.
2665       "movi v0.4s, #0\n"
2666       "movi v1.4s, #0\n"
2667       "movi v2.4s, #0\n"
2668       "mov v3.16b, v0.16b\n"
2669       "mov v4.16b, v1.16b\n"
2670       "mov v5.16b, v2.16b\n"
2671       "mov v6.16b, v3.16b\n"
2672       "mov v7.16b, v4.16b\n"
2673       "mov v8.16b, v5.16b\n"
2674 
2675       // 3x3 lanes loop.
2676       "1:"
2677 
2678       "ld1 {v12.8b, v13.8b, v14.8b}, [%x[rhs]], #24\n"
2679       "ld1 {v9.8b}, [%x[lhs]], #8\n"
2680       "umull v15.8h, v9.8b, v12.8b\n"
2681       "ld1 {v10.8b}, [%x[lhs]], #8\n"
2682       "umull v16.8h, v9.8b, v13.8b\n"
2683       "ld1 {v11.8b}, [%x[lhs]], #8\n"
2684       "umull v17.8h, v9.8b, v14.8b\n"
2685       "prfm pldl1keep, [%x[lhs], #64]\n"
2686       "umull v18.8h, v10.8b, v12.8b\n"
2687       "prfm pldl1keep, [%x[rhs], #64]\n"
2688       "uadalp v0.4s, v15.8h\n"
2689       "uadalp v1.4s, v16.8h\n"
2690       "uadalp v2.4s, v17.8h\n"
2691       "uadalp v3.4s, v18.8h\n"
2692       "umull v15.8h, v10.8b, v13.8b\n"
2693       "umull v16.8h, v10.8b, v14.8b\n"
2694       "umull v17.8h, v11.8b, v12.8b\n"
2695       "umull v18.8h, v11.8b, v13.8b\n"
2696 
2697       // Subtract counter.
2698       "subs %x[count], %x[count], #8\n"
2699 
2700       "umull v9.8h, v11.8b, v14.8b\n"
2701       "uadalp v4.4s, v15.8h\n"
2702       "uadalp v5.4s, v16.8h\n"
2703       "uadalp v6.4s, v17.8h\n"
2704       "uadalp v7.4s, v18.8h\n"
2705       "uadalp v8.4s, v9.8h\n"
2706 
2707       // Loop break.
2708       "bgt 1b\n"
2709 
2710       // StaticQuantizationInt32::Prepare
2711       "ld1 {v9.4s}, [%x[lhs]], #16\n"
2712       "ld1 {v10.4s}, [%x[rhs]], #16\n"
2713       "dup v11.4s, v9.s[0]\n"
2714       "dup v12.4s, v9.s[1]\n"
2715       "dup v9.4s, v9.s[2]\n"
2716 
2717       // RowMajorOutput::Prepare
2718       "add x0, %x[result], %x[stride]\n"
2719       "add x1, x0, %x[stride]\n"
2720 
2721       // Reduce aggregators.
2722       "addp v0.4s, v0.4s, v1.4s\n"
2723       "addp v2.4s, v2.4s, v2.4s\n"
2724       "addp v0.4s, v0.4s, v2.4s\n"
2725       "addp v3.4s, v3.4s, v4.4s\n"
2726       "addp v5.4s, v5.4s, v5.4s\n"
2727       "addp v3.4s, v3.4s, v5.4s\n"
2728       "addp v6.4s, v6.4s, v7.4s\n"
2729       "addp v8.4s, v8.4s, v8.4s\n"
2730       "addp v6.4s, v6.4s, v8.4s\n"
2731 
2732       // StaticQuantizationInt32::Transform
2733       "add v0.4s, v0.4s, v11.4s\n"
2734       "add v3.4s, v3.4s, v12.4s\n"
2735       "add v6.4s, v6.4s, v9.4s\n"
2736       "add v0.4s, v0.4s, v10.4s\n"
2737       "add v3.4s, v3.4s, v10.4s\n"
2738       "add v6.4s, v6.4s, v10.4s\n"
2739 
2740       // RowMajorOutput::Output
2741       "st1 {v0.2s}, [%x[result]], #8\n"
2742       "st1 {v0.s}[2], [%x[result]], #4\n"
2743       "st1 {v3.2s}, [x0], #8\n"
2744       "st1 {v3.s}[2], [x0], #4\n"
2745       "st1 {v6.2s}, [x1], #8\n"
2746       "st1 {v6.s}[2], [x1], #4\n"
2747       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
2748       : [count] "r"(params.kernel.count),
2749         [stride] "r"(params.output_stream.stride)
2750       : "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
2751         "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "cc",
2752         "memory");
2753 }
2754 
2755 template <>
2756 inline void MulKernel<
2757     uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 1,
Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,RowMajor> & params,float * result)2758     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
2759                  const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
2760                                          RowMajor>& params,
2761                  float* result) {
2762 #ifdef DEBUG
2763 #ifdef DEBUG_METAGEMM_VERBOSE
2764   std::cout << __FILE__ << "(" << __LINE__
2765             << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
2766                "QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 1, "
2767                "8>::Multiply()"
2768             << std::endl
2769             << std::flush;
2770 #endif
2771 #endif
2772   asm volatile(
2773       "prfm pldl1keep, [%x[lhs]]\n"
2774       "prfm pldl1keep, [%x[rhs]]\n"
2775 
2776       // Clear aggregators.
2777       "movi v0.4s, #0\n"
2778 
2779       // General NxM lanes loop.
2780       "1:"
2781 
2782       // Subtract counter.
2783       "subs %x[count], %x[count], #8\n"
2784 
2785       "ld1 {v1.2s}, [%x[lhs]], #8\n"
2786       "ld1 {v2.2s}, [%x[rhs]], #8\n"
2787       "prfm pldl1keep, [%x[lhs], #64]\n"
2788       "prfm pldl1keep, [%x[rhs], #64]\n"
2789       "umull v3.8h, v2.8b, v1.8b\n"
2790       "uadalp v0.4s, v3.8h\n"
2791 
2792       // Loop break.
2793       "bgt 1b\n"
2794 
2795       // StaticQuantizationFloat::Prepare
2796       "ld1 {v4.4s}, [%x[lhs]], #16\n"
2797       "ld1 {v5.4s}, [%x[rhs]], #16\n"
2798       "dup v6.4s, %w[scale]\n"
2799       "dup v4.4s, v4.s[0]\n"
2800 
2801       // RowMajorOutput::Prepare
2802 
2803       // Reduce aggregators.
2804       "addp v0.4s, v0.4s, v0.4s\n"
2805       "addp v0.4s, v0.4s, v0.4s\n"
2806 
2807       // StaticQuantizationFloat::Transform
2808       "add v0.4s, v0.4s, v4.4s\n"
2809       "add v0.4s, v0.4s, v5.4s\n"
2810       "scvtf v0.4s, v0.4s\n"
2811       "fmul v0.4s, v0.4s, v6.4s\n"
2812 
2813       // RowMajorOutput::Output
2814       "st1 {v0.s}[0], [%x[result]], #4\n"
2815       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
2816       : [count] "r"(params.kernel.count),
2817         [stride] "r"(params.output_stream.stride),
2818         [scale] "r"(params.kernel.scale)
2819       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
2820 }
2821 
2822 template <>
2823 inline void MulKernel<
2824     uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 2,
Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,RowMajor> & params,float * result)2825     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
2826                  const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
2827                                          RowMajor>& params,
2828                  float* result) {
2829 #ifdef DEBUG
2830 #ifdef DEBUG_METAGEMM_VERBOSE
2831   std::cout << __FILE__ << "(" << __LINE__
2832             << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
2833                "QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 2, "
2834                "8>::Multiply()"
2835             << std::endl
2836             << std::flush;
2837 #endif
2838 #endif
2839   asm volatile(
2840       "prfm pldl1keep, [%x[lhs]]\n"
2841       "prfm pldl1keep, [%x[rhs]]\n"
2842 
2843       // Clear aggregators.
2844       "movi v0.4s, #0\n"
2845       "movi v1.4s, #0\n"
2846 
2847       // General NxM lanes loop.
2848       "1:"
2849 
2850       // Subtract counter.
2851       "subs %x[count], %x[count], #8\n"
2852 
2853       "ld1 {v2.2s}, [%x[lhs]], #8\n"
2854       "ld1 {v3.2s, v4.2s}, [%x[rhs]], #16\n"
2855       "prfm pldl1keep, [%x[lhs], #64]\n"
2856       "prfm pldl1keep, [%x[rhs], #64]\n"
2857       "umull v5.8h, v3.8b, v2.8b\n"
2858       "umull v6.8h, v4.8b, v2.8b\n"
2859       "uadalp v0.4s, v5.8h\n"
2860       "uadalp v1.4s, v6.8h\n"
2861 
2862       // Loop break.
2863       "bgt 1b\n"
2864 
2865       // StaticQuantizationFloat::Prepare
2866       "ld1 {v4.4s}, [%x[lhs]], #16\n"
2867       "ld1 {v5.4s}, [%x[rhs]], #16\n"
2868       "dup v6.4s, %w[scale]\n"
2869       "dup v4.4s, v4.s[0]\n"
2870 
2871       // RowMajorOutput::Prepare
2872 
2873       // Reduce aggregators.
2874       "addp v0.4s, v0.4s, v1.4s\n"
2875       "addp v0.4s, v0.4s, v0.4s\n"
2876 
2877       // StaticQuantizationFloat::Transform
2878       "add v0.4s, v0.4s, v4.4s\n"
2879       "add v0.4s, v0.4s, v5.4s\n"
2880       "scvtf v0.4s, v0.4s\n"
2881       "fmul v0.4s, v0.4s, v6.4s\n"
2882 
2883       // RowMajorOutput::Output
2884       "st1 {v0.2s}, [%x[result]], #8\n"
2885       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
2886       : [count] "r"(params.kernel.count),
2887         [stride] "r"(params.output_stream.stride),
2888         [scale] "r"(params.kernel.scale)
2889       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
2890 }
2891 
2892 template <>
2893 inline void MulKernel<
2894     uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 3,
Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,RowMajor> & params,float * result)2895     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
2896                  const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
2897                                          RowMajor>& params,
2898                  float* result) {
2899 #ifdef DEBUG
2900 #ifdef DEBUG_METAGEMM_VERBOSE
2901   std::cout << __FILE__ << "(" << __LINE__
2902             << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
2903                "QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 3, "
2904                "8>::Multiply()"
2905             << std::endl
2906             << std::flush;
2907 #endif
2908 #endif
2909   asm volatile(
2910       "prfm pldl1keep, [%x[lhs]]\n"
2911       "prfm pldl1keep, [%x[rhs]]\n"
2912 
2913       // Clear aggregators.
2914       "movi v0.4s, #0\n"
2915       "movi v1.4s, #0\n"
2916       "movi v2.4s, #0\n"
2917 
2918       // General NxM lanes loop.
2919       "1:"
2920 
2921       // Subtract counter.
2922       "subs %x[count], %x[count], #8\n"
2923 
2924       "ld1 {v3.2s}, [%x[lhs]], #8\n"
2925       "ld1 {v4.2s, v5.2s, v6.2s}, [%x[rhs]], #24\n"
2926       "prfm pldl1keep, [%x[lhs], #64]\n"
2927       "prfm pldl1keep, [%x[rhs], #64]\n"
2928       "umull v7.8h, v4.8b, v3.8b\n"
2929       "umull v8.8h, v5.8b, v3.8b\n"
2930       "umull v9.8h, v6.8b, v3.8b\n"
2931       "uadalp v0.4s, v7.8h\n"
2932       "uadalp v1.4s, v8.8h\n"
2933       "uadalp v2.4s, v9.8h\n"
2934 
2935       // Loop break.
2936       "bgt 1b\n"
2937 
2938       // StaticQuantizationFloat::Prepare
2939       "ld1 {v4.4s}, [%x[lhs]], #16\n"
2940       "ld1 {v5.4s}, [%x[rhs]], #16\n"
2941       "dup v6.4s, %w[scale]\n"
2942       "dup v4.4s, v4.s[0]\n"
2943 
2944       // RowMajorOutput::Prepare
2945 
2946       // Reduce aggregators.
2947       "addp v0.4s, v0.4s, v1.4s\n"
2948       "addp v2.4s, v2.4s, v2.4s\n"
2949       "addp v0.4s, v0.4s, v2.4s\n"
2950 
2951       // StaticQuantizationFloat::Transform
2952       "add v0.4s, v0.4s, v4.4s\n"
2953       "add v0.4s, v0.4s, v5.4s\n"
2954       "scvtf v0.4s, v0.4s\n"
2955       "fmul v0.4s, v0.4s, v6.4s\n"
2956 
2957       // RowMajorOutput::Output
2958       "st1 {v0.2s}, [%x[result]], #8\n"
2959       "st1 {v0.s}[2], [%x[result]], #4\n"
2960       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
2961       : [count] "r"(params.kernel.count),
2962         [stride] "r"(params.output_stream.stride),
2963         [scale] "r"(params.kernel.scale)
2964       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "cc",
2965         "memory");
2966 }
2967 
2968 template <>
2969 inline void MulKernel<
2970     uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 4,
Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,RowMajor> & params,float * result)2971     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
2972                  const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
2973                                          RowMajor>& params,
2974                  float* result) {
2975 #ifdef DEBUG
2976 #ifdef DEBUG_METAGEMM_VERBOSE
2977   std::cout << __FILE__ << "(" << __LINE__
2978             << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
2979                "QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 4, "
2980                "8>::Multiply()"
2981             << std::endl
2982             << std::flush;
2983 #endif
2984 #endif
2985   asm volatile(
2986       "prfm pldl1keep, [%x[lhs]]\n"
2987       "prfm pldl1keep, [%x[rhs]]\n"
2988 
2989       // Clear aggregators.
2990       "movi v0.4s, #0\n"
2991       "movi v1.4s, #0\n"
2992       "movi v2.4s, #0\n"
2993       "mov v3.16b, v0.16b\n"
2994 
2995       // General NxM lanes loop.
2996       "1:"
2997 
2998       // Subtract counter.
2999       "subs %x[count], %x[count], #8\n"
3000 
3001       "ld1 {v4.2s}, [%x[lhs]], #8\n"
3002       "ld1 {v5.2s, v6.2s, v7.2s, v8.2s}, [%x[rhs]], #32\n"
3003       "prfm pldl1keep, [%x[lhs], #64]\n"
3004       "prfm pldl1keep, [%x[rhs], #64]\n"
3005       "umull v9.8h, v5.8b, v4.8b\n"
3006       "umull v10.8h, v6.8b, v4.8b\n"
3007       "umull v11.8h, v7.8b, v4.8b\n"
3008       "umull v12.8h, v8.8b, v4.8b\n"
3009       "uadalp v0.4s, v9.8h\n"
3010       "uadalp v1.4s, v10.8h\n"
3011       "uadalp v2.4s, v11.8h\n"
3012       "uadalp v3.4s, v12.8h\n"
3013 
3014       // Loop break.
3015       "bgt 1b\n"
3016 
3017       // StaticQuantizationFloat::Prepare
3018       "ld1 {v4.4s}, [%x[lhs]], #16\n"
3019       "ld1 {v5.4s}, [%x[rhs]], #16\n"
3020       "dup v6.4s, %w[scale]\n"
3021       "dup v4.4s, v4.s[0]\n"
3022 
3023       // RowMajorOutput::Prepare
3024 
3025       // Reduce aggregators.
3026       "addp v0.4s, v0.4s, v1.4s\n"
3027       "addp v2.4s, v2.4s, v3.4s\n"
3028       "addp v0.4s, v0.4s, v2.4s\n"
3029 
3030       // StaticQuantizationFloat::Transform
3031       "add v0.4s, v0.4s, v4.4s\n"
3032       "add v0.4s, v0.4s, v5.4s\n"
3033       "scvtf v0.4s, v0.4s\n"
3034       "fmul v0.4s, v0.4s, v6.4s\n"
3035 
3036       // RowMajorOutput::Output
3037       "st1 {v0.4s}, [%x[result]], #16\n"
3038       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
3039       : [count] "r"(params.kernel.count),
3040         [stride] "r"(params.output_stream.stride),
3041         [scale] "r"(params.kernel.scale)
3042       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
3043         "v11", "v12", "cc", "memory");
3044 }
3045 
3046 template <>
3047 inline void MulKernel<
3048     uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 5,
Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,RowMajor> & params,float * result)3049     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
3050                  const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
3051                                          RowMajor>& params,
3052                  float* result) {
3053 #ifdef DEBUG
3054 #ifdef DEBUG_METAGEMM_VERBOSE
3055   std::cout << __FILE__ << "(" << __LINE__
3056             << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
3057                "QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 5, "
3058                "8>::Multiply()"
3059             << std::endl
3060             << std::flush;
3061 #endif
3062 #endif
3063   asm volatile(
3064       "prfm pldl1keep, [%x[lhs]]\n"
3065       "prfm pldl1keep, [%x[rhs]]\n"
3066 
3067       // Clear aggregators.
3068       "movi v0.4s, #0\n"
3069       "movi v1.4s, #0\n"
3070       "movi v2.4s, #0\n"
3071       "mov v3.16b, v0.16b\n"
3072       "mov v4.16b, v1.16b\n"
3073 
3074       // General 1xM lanes loop.
3075       "1:"
3076 
3077       // Subtract counter.
3078       "subs %x[count], %x[count], #8\n"
3079 
3080       "ld1 {v5.2s, v6.2s, v7.2s, v8.2s}, [%x[rhs]], #32\n"
3081       "ld1 {v9.2s}, [%x[lhs]], #8\n"
3082       "prfm pldl1keep, [%x[lhs], #64]\n"
3083       "umull v10.8h, v5.8b, v9.8b\n"
3084       "umull v11.8h, v6.8b, v9.8b\n"
3085       "umull v12.8h, v7.8b, v9.8b\n"
3086       "umull v13.8h, v8.8b, v9.8b\n"
3087       "ld1 {v5.2s}, [%x[rhs]], #8\n"
3088       "prfm pldl1keep, [%x[rhs], #128]\n"
3089       "uadalp v0.4s, v10.8h\n"
3090       "uadalp v1.4s, v11.8h\n"
3091       "uadalp v2.4s, v12.8h\n"
3092       "uadalp v3.4s, v13.8h\n"
3093       "umull v10.8h, v5.8b, v9.8b\n"
3094       "uadalp v4.4s, v10.8h\n"
3095 
3096       // Loop break.
3097       "bgt 1b\n"
3098 
3099       // StaticQuantizationFloat::Prepare
3100       "ld1 {v5.4s}, [%x[lhs]], #16\n"
3101       "ld1 {v6.4s, v7.4s}, [%x[rhs]], #32\n"
3102       "dup v8.4s, %w[scale]\n"
3103       "dup v5.4s, v5.s[0]\n"
3104 
3105       // RowMajorOutput::Prepare
3106 
3107       // Reduce aggregators.
3108       "addp v0.4s, v0.4s, v1.4s\n"
3109       "addp v2.4s, v2.4s, v3.4s\n"
3110       "addp v4.4s, v4.4s, v4.4s\n"
3111       "addp v0.4s, v0.4s, v2.4s\n"
3112       "addp v1.4s, v4.4s, v4.4s\n"
3113 
3114       // StaticQuantizationFloat::Transform
3115       "add v0.4s, v0.4s, v5.4s\n"
3116       "add v1.4s, v1.4s, v5.4s\n"
3117       "add v0.4s, v0.4s, v6.4s\n"
3118       "add v1.4s, v1.4s, v7.4s\n"
3119       "scvtf v0.4s, v0.4s\n"
3120       "scvtf v1.4s, v1.4s\n"
3121       "fmul v0.4s, v0.4s, v8.4s\n"
3122       "fmul v1.4s, v1.4s, v8.4s\n"
3123 
3124       // RowMajorOutput::Output
3125       "st1 {v0.4s}, [%x[result]], #16\n"
3126       "st1 {v1.s}[0], [%x[result]], #4\n"
3127       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
3128       : [count] "r"(params.kernel.count),
3129         [stride] "r"(params.output_stream.stride),
3130         [scale] "r"(params.kernel.scale)
3131       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
3132         "v11", "v12", "v13", "cc", "memory");
3133 }
3134 
3135 template <>
3136 inline void MulKernel<
3137     uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 6,
Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,RowMajor> & params,float * result)3138     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
3139                  const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
3140                                          RowMajor>& params,
3141                  float* result) {
3142 #ifdef DEBUG
3143 #ifdef DEBUG_METAGEMM_VERBOSE
3144   std::cout << __FILE__ << "(" << __LINE__
3145             << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
3146                "QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 6, "
3147                "8>::Multiply()"
3148             << std::endl
3149             << std::flush;
3150 #endif
3151 #endif
3152   asm volatile(
3153       "prfm pldl1keep, [%x[lhs]]\n"
3154       "prfm pldl1keep, [%x[rhs]]\n"
3155 
3156       // Clear aggregators.
3157       "movi v0.4s, #0\n"
3158       "movi v1.4s, #0\n"
3159       "movi v2.4s, #0\n"
3160       "mov v3.16b, v0.16b\n"
3161       "mov v4.16b, v1.16b\n"
3162       "mov v5.16b, v2.16b\n"
3163 
3164       // General 1xM lanes loop.
3165       "1:"
3166 
3167       // Subtract counter.
3168       "subs %x[count], %x[count], #8\n"
3169 
3170       "ld1 {v6.2s, v7.2s, v8.2s, v9.2s}, [%x[rhs]], #32\n"
3171       "ld1 {v10.2s}, [%x[lhs]], #8\n"
3172       "prfm pldl1keep, [%x[lhs], #64]\n"
3173       "umull v11.8h, v6.8b, v10.8b\n"
3174       "umull v12.8h, v7.8b, v10.8b\n"
3175       "umull v13.8h, v8.8b, v10.8b\n"
3176       "umull v14.8h, v9.8b, v10.8b\n"
3177       "ld1 {v6.2s, v7.2s}, [%x[rhs]], #16\n"
3178       "prfm pldl1keep, [%x[rhs], #128]\n"
3179       "uadalp v0.4s, v11.8h\n"
3180       "uadalp v1.4s, v12.8h\n"
3181       "uadalp v2.4s, v13.8h\n"
3182       "uadalp v3.4s, v14.8h\n"
3183       "umull v11.8h, v6.8b, v10.8b\n"
3184       "umull v12.8h, v7.8b, v10.8b\n"
3185       "uadalp v4.4s, v11.8h\n"
3186       "uadalp v5.4s, v12.8h\n"
3187 
3188       // Loop break.
3189       "bgt 1b\n"
3190 
3191       // StaticQuantizationFloat::Prepare
3192       "ld1 {v6.4s}, [%x[lhs]], #16\n"
3193       "ld1 {v7.4s, v8.4s}, [%x[rhs]], #32\n"
3194       "dup v9.4s, %w[scale]\n"
3195       "dup v6.4s, v6.s[0]\n"
3196 
3197       // RowMajorOutput::Prepare
3198 
3199       // Reduce aggregators.
3200       "addp v0.4s, v0.4s, v1.4s\n"
3201       "addp v2.4s, v2.4s, v3.4s\n"
3202       "addp v4.4s, v4.4s, v5.4s\n"
3203       "addp v0.4s, v0.4s, v2.4s\n"
3204       "addp v1.4s, v4.4s, v4.4s\n"
3205 
3206       // StaticQuantizationFloat::Transform
3207       "add v0.4s, v0.4s, v6.4s\n"
3208       "add v1.4s, v1.4s, v6.4s\n"
3209       "add v0.4s, v0.4s, v7.4s\n"
3210       "add v1.4s, v1.4s, v8.4s\n"
3211       "scvtf v0.4s, v0.4s\n"
3212       "scvtf v1.4s, v1.4s\n"
3213       "fmul v0.4s, v0.4s, v9.4s\n"
3214       "fmul v1.4s, v1.4s, v9.4s\n"
3215 
3216       // RowMajorOutput::Output
3217       "st1 {v0.4s}, [%x[result]], #16\n"
3218       "st1 {v1.2s}, [%x[result]], #8\n"
3219       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
3220       : [count] "r"(params.kernel.count),
3221         [stride] "r"(params.output_stream.stride),
3222         [scale] "r"(params.kernel.scale)
3223       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
3224         "v11", "v12", "v13", "v14", "cc", "memory");
3225 }
3226 
3227 template <>
3228 inline void MulKernel<
3229     uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 7,
Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,RowMajor> & params,float * result)3230     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
3231                  const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
3232                                          RowMajor>& params,
3233                  float* result) {
3234 #ifdef DEBUG
3235 #ifdef DEBUG_METAGEMM_VERBOSE
3236   std::cout << __FILE__ << "(" << __LINE__
3237             << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
3238                "QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 7, "
3239                "8>::Multiply()"
3240             << std::endl
3241             << std::flush;
3242 #endif
3243 #endif
3244   asm volatile(
3245       "prfm pldl1keep, [%x[lhs]]\n"
3246       "prfm pldl1keep, [%x[rhs]]\n"
3247 
3248       // Clear aggregators.
3249       "movi v0.4s, #0\n"
3250       "movi v1.4s, #0\n"
3251       "movi v2.4s, #0\n"
3252       "mov v3.16b, v0.16b\n"
3253       "mov v4.16b, v1.16b\n"
3254       "mov v5.16b, v2.16b\n"
3255       "mov v6.16b, v3.16b\n"
3256 
3257       // General 1xM lanes loop.
3258       "1:"
3259 
3260       // Subtract counter.
3261       "subs %x[count], %x[count], #8\n"
3262 
3263       "ld1 {v7.2s, v8.2s, v9.2s, v10.2s}, [%x[rhs]], #32\n"
3264       "ld1 {v11.2s}, [%x[lhs]], #8\n"
3265       "prfm pldl1keep, [%x[lhs], #64]\n"
3266       "umull v12.8h, v7.8b, v11.8b\n"
3267       "umull v13.8h, v8.8b, v11.8b\n"
3268       "umull v14.8h, v9.8b, v11.8b\n"
3269       "umull v15.8h, v10.8b, v11.8b\n"
3270       "ld1 {v7.2s, v8.2s, v9.2s}, [%x[rhs]], #24\n"
3271       "prfm pldl1keep, [%x[rhs], #128]\n"
3272       "uadalp v0.4s, v12.8h\n"
3273       "uadalp v1.4s, v13.8h\n"
3274       "uadalp v2.4s, v14.8h\n"
3275       "uadalp v3.4s, v15.8h\n"
3276       "umull v12.8h, v7.8b, v11.8b\n"
3277       "umull v13.8h, v8.8b, v11.8b\n"
3278       "umull v14.8h, v9.8b, v11.8b\n"
3279       "uadalp v4.4s, v12.8h\n"
3280       "uadalp v5.4s, v13.8h\n"
3281       "uadalp v6.4s, v14.8h\n"
3282 
3283       // Loop break.
3284       "bgt 1b\n"
3285 
3286       // StaticQuantizationFloat::Prepare
3287       "ld1 {v7.4s}, [%x[lhs]], #16\n"
3288       "ld1 {v8.4s, v9.4s}, [%x[rhs]], #32\n"
3289       "dup v10.4s, %w[scale]\n"
3290       "dup v7.4s, v7.s[0]\n"
3291 
3292       // RowMajorOutput::Prepare
3293 
3294       // Reduce aggregators.
3295       "addp v0.4s, v0.4s, v1.4s\n"
3296       "addp v2.4s, v2.4s, v3.4s\n"
3297       "addp v4.4s, v4.4s, v5.4s\n"
3298       "addp v6.4s, v6.4s, v6.4s\n"
3299       "addp v0.4s, v0.4s, v2.4s\n"
3300       "addp v1.4s, v4.4s, v6.4s\n"
3301 
3302       // StaticQuantizationFloat::Transform
3303       "add v0.4s, v0.4s, v7.4s\n"
3304       "add v1.4s, v1.4s, v7.4s\n"
3305       "add v0.4s, v0.4s, v8.4s\n"
3306       "add v1.4s, v1.4s, v9.4s\n"
3307       "scvtf v0.4s, v0.4s\n"
3308       "scvtf v1.4s, v1.4s\n"
3309       "fmul v0.4s, v0.4s, v10.4s\n"
3310       "fmul v1.4s, v1.4s, v10.4s\n"
3311 
3312       // RowMajorOutput::Output
3313       "st1 {v0.4s}, [%x[result]], #16\n"
3314       "st1 {v1.2s}, [%x[result]], #8\n"
3315       "st1 {v1.s}[2], [%x[result]], #4\n"
3316       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
3317       : [count] "r"(params.kernel.count),
3318         [stride] "r"(params.output_stream.stride),
3319         [scale] "r"(params.kernel.scale)
3320       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
3321         "v11", "v12", "v13", "v14", "v15", "cc", "memory");
3322 }
3323 
3324 template <>
3325 inline void MulKernel<
3326     uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 8,
Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,RowMajor> & params,float * result)3327     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
3328                  const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
3329                                          RowMajor>& params,
3330                  float* result) {
3331 #ifdef DEBUG
3332 #ifdef DEBUG_METAGEMM_VERBOSE
3333   std::cout << __FILE__ << "(" << __LINE__
3334             << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
3335                "QuantizedStaticPreprocessedAsFloat, RowMajor, 1, 8, "
3336                "8>::Multiply()"
3337             << std::endl
3338             << std::flush;
3339 #endif
3340 #endif
3341   asm volatile(
3342       "prfm pldl1keep, [%x[lhs]]\n"
3343       "prfm pldl1keep, [%x[rhs]]\n"
3344 
3345       // Clear aggregators.
3346       "movi v0.4s, #0\n"
3347       "movi v1.4s, #0\n"
3348       "movi v2.4s, #0\n"
3349       "mov v3.16b, v0.16b\n"
3350       "mov v4.16b, v1.16b\n"
3351       "mov v5.16b, v2.16b\n"
3352       "mov v6.16b, v3.16b\n"
3353       "mov v7.16b, v4.16b\n"
3354 
3355       // 1x8 lanes loop.
3356       "1:"
3357 
3358       "ld1 {v9.2s, v10.2s, v11.2s, v12.2s}, [%x[rhs]], #32\n"
3359       "ld1 {v8.2s}, [%x[lhs]], #8\n"
3360       "umull v13.8h, v8.8b, v9.8b\n"
3361       "umull v14.8h, v8.8b, v10.8b\n"
3362       "umull v15.8h, v8.8b, v11.8b\n"
3363       "umull v16.8h, v8.8b, v12.8b\n"
3364       "ld1 {v9.2s, v10.2s, v11.2s, v12.2s}, [%x[rhs]], #32\n"
3365       "uadalp v0.4s, v13.8h\n"
3366       "uadalp v1.4s, v14.8h\n"
3367       "uadalp v2.4s, v15.8h\n"
3368       "uadalp v3.4s, v16.8h\n"
3369       "prfm pldl1keep, [%x[rhs], #256]\n"
3370       "umull v17.8h, v8.8b, v9.8b\n"
3371       "umull v13.8h, v8.8b, v10.8b\n"
3372       "umull v14.8h, v8.8b, v11.8b\n"
3373       "umull v15.8h, v8.8b, v12.8b\n"
3374       "prfm pldl1keep, [%x[lhs], #32]\n"
3375 
3376       // Subtract counter.
3377       "subs %x[count], %x[count], #8\n"
3378 
3379       "uadalp v4.4s, v17.8h\n"
3380       "uadalp v5.4s, v13.8h\n"
3381       "uadalp v6.4s, v14.8h\n"
3382       "uadalp v7.4s, v15.8h\n"
3383 
3384       // Loop break.
3385       "bgt 1b\n"
3386 
3387       // StaticQuantizationFloat::Prepare
3388       "ld1 {v8.4s}, [%x[lhs]], #16\n"
3389       "ld1 {v9.4s, v10.4s}, [%x[rhs]], #32\n"
3390       "dup v11.4s, %w[scale]\n"
3391       "dup v8.4s, v8.s[0]\n"
3392 
3393       // RowMajorOutput::Prepare
3394 
3395       // Reduce aggregators.
3396       "addp v0.4s, v0.4s, v1.4s\n"
3397       "addp v2.4s, v2.4s, v3.4s\n"
3398       "addp v4.4s, v4.4s, v5.4s\n"
3399       "addp v6.4s, v6.4s, v7.4s\n"
3400       "addp v0.4s, v0.4s, v2.4s\n"
3401       "addp v1.4s, v4.4s, v6.4s\n"
3402 
3403       // StaticQuantizationFloat::Transform
3404       "add v0.4s, v0.4s, v8.4s\n"
3405       "add v1.4s, v1.4s, v8.4s\n"
3406       "add v0.4s, v0.4s, v9.4s\n"
3407       "add v1.4s, v1.4s, v10.4s\n"
3408       "scvtf v0.4s, v0.4s\n"
3409       "scvtf v1.4s, v1.4s\n"
3410       "fmul v0.4s, v0.4s, v11.4s\n"
3411       "fmul v1.4s, v1.4s, v11.4s\n"
3412 
3413       // RowMajorOutput::Output
3414       "st1 {v0.4s, v1.4s}, [%x[result]], #32\n"
3415       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
3416       : [count] "r"(params.kernel.count),
3417         [stride] "r"(params.output_stream.stride),
3418         [scale] "r"(params.kernel.scale)
3419       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
3420         "v11", "v12", "v13", "v14", "v15", "v16", "v17", "cc", "memory");
3421 }
3422 
3423 template <>
3424 inline void MulKernel<
3425     uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 2, 1,
Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,RowMajor> & params,float * result)3426     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
3427                  const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
3428                                          RowMajor>& params,
3429                  float* result) {
3430 #ifdef DEBUG
3431 #ifdef DEBUG_METAGEMM_VERBOSE
3432   std::cout << __FILE__ << "(" << __LINE__
3433             << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
3434                "QuantizedStaticPreprocessedAsFloat, RowMajor, 2, 1, "
3435                "8>::Multiply()"
3436             << std::endl
3437             << std::flush;
3438 #endif
3439 #endif
3440   asm volatile(
3441       "prfm pldl1keep, [%x[lhs]]\n"
3442       "prfm pldl1keep, [%x[rhs]]\n"
3443 
3444       // Clear aggregators.
3445       "movi v0.4s, #0\n"
3446       "movi v1.4s, #0\n"
3447 
3448       // General NxM lanes loop.
3449       "1:"
3450 
3451       // Subtract counter.
3452       "subs %x[count], %x[count], #8\n"
3453 
3454       "ld1 {v2.2s, v3.2s}, [%x[lhs]], #16\n"
3455       "ld1 {v4.2s}, [%x[rhs]], #8\n"
3456       "prfm pldl1keep, [%x[lhs], #64]\n"
3457       "prfm pldl1keep, [%x[rhs], #64]\n"
3458       "umull v5.8h, v4.8b, v2.8b\n"
3459       "umull v6.8h, v4.8b, v3.8b\n"
3460       "uadalp v0.4s, v5.8h\n"
3461       "uadalp v1.4s, v6.8h\n"
3462 
3463       // Loop break.
3464       "bgt 1b\n"
3465 
3466       // StaticQuantizationFloat::Prepare
3467       "ld1 {v4.4s}, [%x[lhs]], #16\n"
3468       "ld1 {v5.4s}, [%x[rhs]], #16\n"
3469       "dup v6.4s, %w[scale]\n"
3470       "dup v2.4s, v4.s[0]\n"
3471       "dup v4.4s, v4.s[1]\n"
3472 
3473       // RowMajorOutput::Prepare
3474       "add x0, %x[result], %x[stride]\n"
3475 
3476       // Reduce aggregators.
3477       "addp v0.4s, v0.4s, v0.4s\n"
3478       "addp v0.4s, v0.4s, v0.4s\n"
3479       "addp v1.4s, v1.4s, v1.4s\n"
3480       "addp v1.4s, v1.4s, v1.4s\n"
3481 
3482       // StaticQuantizationFloat::Transform
3483       "add v0.4s, v0.4s, v2.4s\n"
3484       "add v1.4s, v1.4s, v4.4s\n"
3485       "add v0.4s, v0.4s, v5.4s\n"
3486       "add v1.4s, v1.4s, v5.4s\n"
3487       "scvtf v0.4s, v0.4s\n"
3488       "scvtf v1.4s, v1.4s\n"
3489       "fmul v0.4s, v0.4s, v6.4s\n"
3490       "fmul v1.4s, v1.4s, v6.4s\n"
3491 
3492       // RowMajorOutput::Output
3493       "st1 {v0.s}[0], [%x[result]], #4\n"
3494       "st1 {v1.s}[0], [x0], #4\n"
3495       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
3496       : [count] "r"(params.kernel.count),
3497         [stride] "r"(params.output_stream.stride),
3498         [scale] "r"(params.kernel.scale)
3499       : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
3500 }
3501 
3502 template <>
3503 inline void MulKernel<
3504     uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 2, 2,
Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,RowMajor> & params,float * result)3505     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
3506                  const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
3507                                          RowMajor>& params,
3508                  float* result) {
3509 #ifdef DEBUG
3510 #ifdef DEBUG_METAGEMM_VERBOSE
3511   std::cout << __FILE__ << "(" << __LINE__
3512             << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
3513                "QuantizedStaticPreprocessedAsFloat, RowMajor, 2, 2, "
3514                "8>::Multiply()"
3515             << std::endl
3516             << std::flush;
3517 #endif
3518 #endif
3519   asm volatile(
3520       "prfm pldl1keep, [%x[lhs]]\n"
3521       "prfm pldl1keep, [%x[rhs]]\n"
3522 
3523       // Clear aggregators.
3524       "movi v0.4s, #0\n"
3525       "movi v1.4s, #0\n"
3526       "movi v2.4s, #0\n"
3527       "mov v3.16b, v0.16b\n"
3528 
3529       // General NxM lanes loop.
3530       "1:"
3531 
3532       // Subtract counter.
3533       "subs %x[count], %x[count], #8\n"
3534 
3535       "ld1 {v4.2s, v5.2s}, [%x[lhs]], #16\n"
3536       "ld1 {v6.2s, v7.2s}, [%x[rhs]], #16\n"
3537       "prfm pldl1keep, [%x[lhs], #64]\n"
3538       "prfm pldl1keep, [%x[rhs], #64]\n"
3539       "umull v8.8h, v6.8b, v4.8b\n"
3540       "umull v9.8h, v7.8b, v4.8b\n"
3541       "umull v10.8h, v6.8b, v5.8b\n"
3542       "umull v11.8h, v7.8b, v5.8b\n"
3543       "uadalp v0.4s, v8.8h\n"
3544       "uadalp v1.4s, v9.8h\n"
3545       "uadalp v2.4s, v10.8h\n"
3546       "uadalp v3.4s, v11.8h\n"
3547 
3548       // Loop break.
3549       "bgt 1b\n"
3550 
3551       // StaticQuantizationFloat::Prepare
3552       "ld1 {v4.4s}, [%x[lhs]], #16\n"
3553       "ld1 {v5.4s}, [%x[rhs]], #16\n"
3554       "dup v6.4s, %w[scale]\n"
3555       "dup v7.4s, v4.s[0]\n"
3556       "dup v4.4s, v4.s[1]\n"
3557 
3558       // RowMajorOutput::Prepare
3559       "add x0, %x[result], %x[stride]\n"
3560 
3561       // Reduce aggregators.
3562       "addp v0.4s, v0.4s, v1.4s\n"
3563       "addp v0.4s, v0.4s, v0.4s\n"
3564       "addp v2.4s, v2.4s, v3.4s\n"
3565       "addp v2.4s, v2.4s, v2.4s\n"
3566 
3567       // StaticQuantizationFloat::Transform
3568       "add v0.4s, v0.4s, v7.4s\n"
3569       "add v2.4s, v2.4s, v4.4s\n"
3570       "add v0.4s, v0.4s, v5.4s\n"
3571       "add v2.4s, v2.4s, v5.4s\n"
3572       "scvtf v0.4s, v0.4s\n"
3573       "scvtf v2.4s, v2.4s\n"
3574       "fmul v0.4s, v0.4s, v6.4s\n"
3575       "fmul v2.4s, v2.4s, v6.4s\n"
3576 
3577       // RowMajorOutput::Output
3578       "st1 {v0.2s}, [%x[result]], #8\n"
3579       "st1 {v2.2s}, [x0], #8\n"
3580       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
3581       : [count] "r"(params.kernel.count),
3582         [stride] "r"(params.output_stream.stride),
3583         [scale] "r"(params.kernel.scale)
3584       : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
3585         "v11", "cc", "memory");
3586 }
3587 
3588 template <>
3589 inline void MulKernel<
3590     uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 2, 3,
Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,RowMajor> & params,float * result)3591     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
3592                  const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
3593                                          RowMajor>& params,
3594                  float* result) {
3595 #ifdef DEBUG
3596 #ifdef DEBUG_METAGEMM_VERBOSE
3597   std::cout << __FILE__ << "(" << __LINE__
3598             << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
3599                "QuantizedStaticPreprocessedAsFloat, RowMajor, 2, 3, "
3600                "8>::Multiply()"
3601             << std::endl
3602             << std::flush;
3603 #endif
3604 #endif
3605   asm volatile(
3606       "prfm pldl1keep, [%x[lhs]]\n"
3607       "prfm pldl1keep, [%x[rhs]]\n"
3608 
3609       // Clear aggregators.
3610       "movi v0.4s, #0\n"
3611       "movi v1.4s, #0\n"
3612       "movi v2.4s, #0\n"
3613       "mov v3.16b, v0.16b\n"
3614       "mov v4.16b, v1.16b\n"
3615       "mov v5.16b, v2.16b\n"
3616 
3617       // General NxM lanes loop.
3618       "1:"
3619 
3620       // Subtract counter.
3621       "subs %x[count], %x[count], #8\n"
3622 
3623       "ld1 {v6.2s, v7.2s}, [%x[lhs]], #16\n"
3624       "ld1 {v8.2s, v9.2s, v10.2s}, [%x[rhs]], #24\n"
3625       "prfm pldl1keep, [%x[lhs], #64]\n"
3626       "prfm pldl1keep, [%x[rhs], #64]\n"
3627       "umull v11.8h, v8.8b, v6.8b\n"
3628       "umull v12.8h, v9.8b, v6.8b\n"
3629       "umull v13.8h, v10.8b, v6.8b\n"
3630       "umull v14.8h, v8.8b, v7.8b\n"
3631       "umull v15.8h, v9.8b, v7.8b\n"
3632       "umull v16.8h, v10.8b, v7.8b\n"
3633       "uadalp v0.4s, v11.8h\n"
3634       "uadalp v1.4s, v12.8h\n"
3635       "uadalp v2.4s, v13.8h\n"
3636       "uadalp v3.4s, v14.8h\n"
3637       "uadalp v4.4s, v15.8h\n"
3638       "uadalp v5.4s, v16.8h\n"
3639 
3640       // Loop break.
3641       "bgt 1b\n"
3642 
3643       // StaticQuantizationFloat::Prepare
3644       "ld1 {v6.4s}, [%x[lhs]], #16\n"
3645       "ld1 {v7.4s}, [%x[rhs]], #16\n"
3646       "dup v8.4s, %w[scale]\n"
3647       "dup v9.4s, v6.s[0]\n"
3648       "dup v6.4s, v6.s[1]\n"
3649 
3650       // RowMajorOutput::Prepare
3651       "add x0, %x[result], %x[stride]\n"
3652 
3653       // Reduce aggregators.
3654       "addp v0.4s, v0.4s, v1.4s\n"
3655       "addp v2.4s, v2.4s, v2.4s\n"
3656       "addp v0.4s, v0.4s, v2.4s\n"
3657       "addp v3.4s, v3.4s, v4.4s\n"
3658       "addp v5.4s, v5.4s, v5.4s\n"
3659       "addp v3.4s, v3.4s, v5.4s\n"
3660 
3661       // StaticQuantizationFloat::Transform
3662       "add v0.4s, v0.4s, v9.4s\n"
3663       "add v3.4s, v3.4s, v6.4s\n"
3664       "add v0.4s, v0.4s, v7.4s\n"
3665       "add v3.4s, v3.4s, v7.4s\n"
3666       "scvtf v0.4s, v0.4s\n"
3667       "scvtf v3.4s, v3.4s\n"
3668       "fmul v0.4s, v0.4s, v8.4s\n"
3669       "fmul v3.4s, v3.4s, v8.4s\n"
3670 
3671       // RowMajorOutput::Output
3672       "st1 {v0.2s}, [%x[result]], #8\n"
3673       "st1 {v0.s}[2], [%x[result]], #4\n"
3674       "st1 {v3.2s}, [x0], #8\n"
3675       "st1 {v3.s}[2], [x0], #4\n"
3676       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
3677       : [count] "r"(params.kernel.count),
3678         [stride] "r"(params.output_stream.stride),
3679         [scale] "r"(params.kernel.scale)
3680       : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
3681         "v11", "v12", "v13", "v14", "v15", "v16", "cc", "memory");
3682 }
3683 
3684 template <>
3685 inline void MulKernel<
3686     uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 2, 4,
Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,RowMajor> & params,float * result)3687     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
3688                  const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
3689                                          RowMajor>& params,
3690                  float* result) {
3691 #ifdef DEBUG
3692 #ifdef DEBUG_METAGEMM_VERBOSE
3693   std::cout << __FILE__ << "(" << __LINE__
3694             << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
3695                "QuantizedStaticPreprocessedAsFloat, RowMajor, 2, 4, "
3696                "8>::Multiply()"
3697             << std::endl
3698             << std::flush;
3699 #endif
3700 #endif
3701   asm volatile(
3702       "prfm pldl1keep, [%x[lhs]]\n"
3703       "prfm pldl1keep, [%x[rhs]]\n"
3704 
3705       // Clear aggregators.
3706       "movi v0.4s, #0\n"
3707       "movi v1.4s, #0\n"
3708       "movi v2.4s, #0\n"
3709       "mov v3.16b, v0.16b\n"
3710       "mov v4.16b, v1.16b\n"
3711       "mov v5.16b, v2.16b\n"
3712       "mov v6.16b, v3.16b\n"
3713       "mov v7.16b, v4.16b\n"
3714 
3715       // 2x4 lanes loop.
3716       "1:"
3717 
3718       "ld1 {v10.8b, v11.8b, v12.8b, v13.8b}, [%x[rhs]], #32\n"
3719       "ld1 {v8.8b}, [%x[lhs]], #8\n"
3720       "umull v14.8h, v8.8b, v10.8b\n"
3721       "ld1 {v9.8b}, [%x[lhs]], #8\n"
3722       "umull v15.8h, v8.8b, v11.8b\n"
3723       "prfm pldl1keep, [%x[rhs], #64]\n"
3724       "umull v16.8h, v8.8b, v12.8b\n"
3725       "prfm pldl1keep, [%x[lhs], #64]\n"
3726       "umull v17.8h, v8.8b, v13.8b\n"
3727       "umull v18.8h, v9.8b, v10.8b\n"
3728       "uadalp v0.4s, v14.8h\n"
3729       "uadalp v1.4s, v15.8h\n"
3730       "uadalp v2.4s, v16.8h\n"
3731       "umull v14.8h, v9.8b, v11.8b\n"
3732       "umull v15.8h, v9.8b, v12.8b\n"
3733       "umull v16.8h, v9.8b, v13.8b\n"
3734 
3735       // Subtract counter.
3736       "subs %x[count], %x[count], #8\n"
3737 
3738       "uadalp v3.4s, v17.8h\n"
3739       "uadalp v4.4s, v18.8h\n"
3740       "uadalp v5.4s, v14.8h\n"
3741       "uadalp v6.4s, v15.8h\n"
3742       "uadalp v7.4s, v16.8h\n"
3743 
3744       // Loop break.
3745       "bgt 1b\n"
3746 
3747       // StaticQuantizationFloat::Prepare
3748       "ld1 {v8.4s}, [%x[lhs]], #16\n"
3749       "ld1 {v9.4s}, [%x[rhs]], #16\n"
3750       "dup v10.4s, %w[scale]\n"
3751       "dup v11.4s, v8.s[0]\n"
3752       "dup v8.4s, v8.s[1]\n"
3753 
3754       // RowMajorOutput::Prepare
3755       "add x0, %x[result], %x[stride]\n"
3756 
3757       // Reduce aggregators.
3758       "addp v0.4s, v0.4s, v1.4s\n"
3759       "addp v2.4s, v2.4s, v3.4s\n"
3760       "addp v0.4s, v0.4s, v2.4s\n"
3761       "addp v4.4s, v4.4s, v5.4s\n"
3762       "addp v6.4s, v6.4s, v7.4s\n"
3763       "addp v4.4s, v4.4s, v6.4s\n"
3764 
3765       // StaticQuantizationFloat::Transform
3766       "add v0.4s, v0.4s, v11.4s\n"
3767       "add v4.4s, v4.4s, v8.4s\n"
3768       "add v0.4s, v0.4s, v9.4s\n"
3769       "add v4.4s, v4.4s, v9.4s\n"
3770       "scvtf v0.4s, v0.4s\n"
3771       "scvtf v4.4s, v4.4s\n"
3772       "fmul v0.4s, v0.4s, v10.4s\n"
3773       "fmul v4.4s, v4.4s, v10.4s\n"
3774 
3775       // RowMajorOutput::Output
3776       "st1 {v0.4s}, [%x[result]], #16\n"
3777       "st1 {v4.4s}, [x0], #16\n"
3778       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
3779       : [count] "r"(params.kernel.count),
3780         [stride] "r"(params.output_stream.stride),
3781         [scale] "r"(params.kernel.scale)
3782       : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
3783         "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "cc", "memory");
3784 }
3785 
3786 template <>
3787 inline void MulKernel<
3788     uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 3, 1,
Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,RowMajor> & params,float * result)3789     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
3790                  const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
3791                                          RowMajor>& params,
3792                  float* result) {
3793 #ifdef DEBUG
3794 #ifdef DEBUG_METAGEMM_VERBOSE
3795   std::cout << __FILE__ << "(" << __LINE__
3796             << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
3797                "QuantizedStaticPreprocessedAsFloat, RowMajor, 3, 1, "
3798                "8>::Multiply()"
3799             << std::endl
3800             << std::flush;
3801 #endif
3802 #endif
3803   asm volatile(
3804       "prfm pldl1keep, [%x[lhs]]\n"
3805       "prfm pldl1keep, [%x[rhs]]\n"
3806 
3807       // Clear aggregators.
3808       "movi v0.4s, #0\n"
3809       "movi v1.4s, #0\n"
3810       "movi v2.4s, #0\n"
3811 
3812       // General NxM lanes loop.
3813       "1:"
3814 
3815       // Subtract counter.
3816       "subs %x[count], %x[count], #8\n"
3817 
3818       "ld1 {v3.2s, v4.2s, v5.2s}, [%x[lhs]], #24\n"
3819       "ld1 {v6.2s}, [%x[rhs]], #8\n"
3820       "prfm pldl1keep, [%x[lhs], #64]\n"
3821       "prfm pldl1keep, [%x[rhs], #64]\n"
3822       "umull v7.8h, v6.8b, v3.8b\n"
3823       "umull v8.8h, v6.8b, v4.8b\n"
3824       "umull v9.8h, v6.8b, v5.8b\n"
3825       "uadalp v0.4s, v7.8h\n"
3826       "uadalp v1.4s, v8.8h\n"
3827       "uadalp v2.4s, v9.8h\n"
3828 
3829       // Loop break.
3830       "bgt 1b\n"
3831 
3832       // StaticQuantizationFloat::Prepare
3833       "ld1 {v4.4s}, [%x[lhs]], #16\n"
3834       "ld1 {v5.4s}, [%x[rhs]], #16\n"
3835       "dup v6.4s, %w[scale]\n"
3836       "dup v3.4s, v4.s[0]\n"
3837       "dup v7.4s, v4.s[1]\n"
3838       "dup v4.4s, v4.s[2]\n"
3839 
3840       // RowMajorOutput::Prepare
3841       "add x0, %x[result], %x[stride]\n"
3842       "add x1, x0, %x[stride]\n"
3843 
3844       // Reduce aggregators.
3845       "addp v0.4s, v0.4s, v0.4s\n"
3846       "addp v0.4s, v0.4s, v0.4s\n"
3847       "addp v1.4s, v1.4s, v1.4s\n"
3848       "addp v1.4s, v1.4s, v1.4s\n"
3849       "addp v2.4s, v2.4s, v2.4s\n"
3850       "addp v2.4s, v2.4s, v2.4s\n"
3851 
3852       // StaticQuantizationFloat::Transform
3853       "add v0.4s, v0.4s, v3.4s\n"
3854       "add v1.4s, v1.4s, v7.4s\n"
3855       "add v2.4s, v2.4s, v4.4s\n"
3856       "add v0.4s, v0.4s, v5.4s\n"
3857       "add v1.4s, v1.4s, v5.4s\n"
3858       "add v2.4s, v2.4s, v5.4s\n"
3859       "scvtf v0.4s, v0.4s\n"
3860       "scvtf v1.4s, v1.4s\n"
3861       "scvtf v2.4s, v2.4s\n"
3862       "fmul v0.4s, v0.4s, v6.4s\n"
3863       "fmul v1.4s, v1.4s, v6.4s\n"
3864       "fmul v2.4s, v2.4s, v6.4s\n"
3865 
3866       // RowMajorOutput::Output
3867       "st1 {v0.s}[0], [%x[result]], #4\n"
3868       "st1 {v1.s}[0], [x0], #4\n"
3869       "st1 {v2.s}[0], [x1], #4\n"
3870       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
3871       : [count] "r"(params.kernel.count),
3872         [stride] "r"(params.output_stream.stride),
3873         [scale] "r"(params.kernel.scale)
3874       : "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
3875         "cc", "memory");
3876 }
3877 
3878 template <>
3879 inline void MulKernel<
3880     uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 3, 2,
Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,RowMajor> & params,float * result)3881     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
3882                  const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
3883                                          RowMajor>& params,
3884                  float* result) {
3885 #ifdef DEBUG
3886 #ifdef DEBUG_METAGEMM_VERBOSE
3887   std::cout << __FILE__ << "(" << __LINE__
3888             << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
3889                "QuantizedStaticPreprocessedAsFloat, RowMajor, 3, 2, "
3890                "8>::Multiply()"
3891             << std::endl
3892             << std::flush;
3893 #endif
3894 #endif
3895   asm volatile(
3896       "prfm pldl1keep, [%x[lhs]]\n"
3897       "prfm pldl1keep, [%x[rhs]]\n"
3898 
3899       // Clear aggregators.
3900       "movi v0.4s, #0\n"
3901       "movi v1.4s, #0\n"
3902       "movi v2.4s, #0\n"
3903       "mov v3.16b, v0.16b\n"
3904       "mov v4.16b, v1.16b\n"
3905       "mov v5.16b, v2.16b\n"
3906 
3907       // General NxM lanes loop.
3908       "1:"
3909 
3910       // Subtract counter.
3911       "subs %x[count], %x[count], #8\n"
3912 
3913       "ld1 {v6.2s, v7.2s, v8.2s}, [%x[lhs]], #24\n"
3914       "ld1 {v9.2s, v10.2s}, [%x[rhs]], #16\n"
3915       "prfm pldl1keep, [%x[lhs], #64]\n"
3916       "prfm pldl1keep, [%x[rhs], #64]\n"
3917       "umull v11.8h, v9.8b, v6.8b\n"
3918       "umull v12.8h, v10.8b, v6.8b\n"
3919       "umull v13.8h, v9.8b, v7.8b\n"
3920       "umull v14.8h, v10.8b, v7.8b\n"
3921       "umull v15.8h, v9.8b, v8.8b\n"
3922       "umull v16.8h, v10.8b, v8.8b\n"
3923       "uadalp v0.4s, v11.8h\n"
3924       "uadalp v1.4s, v12.8h\n"
3925       "uadalp v2.4s, v13.8h\n"
3926       "uadalp v3.4s, v14.8h\n"
3927       "uadalp v4.4s, v15.8h\n"
3928       "uadalp v5.4s, v16.8h\n"
3929 
3930       // Loop break.
3931       "bgt 1b\n"
3932 
3933       // StaticQuantizationFloat::Prepare
3934       "ld1 {v6.4s}, [%x[lhs]], #16\n"
3935       "ld1 {v7.4s}, [%x[rhs]], #16\n"
3936       "dup v8.4s, %w[scale]\n"
3937       "dup v9.4s, v6.s[0]\n"
3938       "dup v10.4s, v6.s[1]\n"
3939       "dup v6.4s, v6.s[2]\n"
3940 
3941       // RowMajorOutput::Prepare
3942       "add x0, %x[result], %x[stride]\n"
3943       "add x1, x0, %x[stride]\n"
3944 
3945       // Reduce aggregators.
3946       "addp v0.4s, v0.4s, v1.4s\n"
3947       "addp v0.4s, v0.4s, v0.4s\n"
3948       "addp v2.4s, v2.4s, v3.4s\n"
3949       "addp v2.4s, v2.4s, v2.4s\n"
3950       "addp v4.4s, v4.4s, v5.4s\n"
3951       "addp v4.4s, v4.4s, v4.4s\n"
3952 
3953       // StaticQuantizationFloat::Transform
3954       "add v0.4s, v0.4s, v9.4s\n"
3955       "add v2.4s, v2.4s, v10.4s\n"
3956       "add v4.4s, v4.4s, v6.4s\n"
3957       "add v0.4s, v0.4s, v7.4s\n"
3958       "add v2.4s, v2.4s, v7.4s\n"
3959       "add v4.4s, v4.4s, v7.4s\n"
3960       "scvtf v0.4s, v0.4s\n"
3961       "scvtf v2.4s, v2.4s\n"
3962       "scvtf v4.4s, v4.4s\n"
3963       "fmul v0.4s, v0.4s, v8.4s\n"
3964       "fmul v2.4s, v2.4s, v8.4s\n"
3965       "fmul v4.4s, v4.4s, v8.4s\n"
3966 
3967       // RowMajorOutput::Output
3968       "st1 {v0.2s}, [%x[result]], #8\n"
3969       "st1 {v2.2s}, [x0], #8\n"
3970       "st1 {v4.2s}, [x1], #8\n"
3971       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
3972       : [count] "r"(params.kernel.count),
3973         [stride] "r"(params.output_stream.stride),
3974         [scale] "r"(params.kernel.scale)
3975       : "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
3976         "v10", "v11", "v12", "v13", "v14", "v15", "v16", "cc", "memory");
3977 }
3978 
3979 template <>
3980 inline void MulKernel<
3981     uint8_t, float, QuantizedStaticPreprocessedAsFloat, RowMajor, 3, 3,
Multiply(const uint8_t * lhs,const uint8_t * rhs,const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,RowMajor> & params,float * result)3982     8>::Multiply(const uint8_t* lhs, const uint8_t* rhs,
3983                  const FusedKernelParams<QuantizedStaticPreprocessedAsFloat,
3984                                          RowMajor>& params,
3985                  float* result) {
3986 #ifdef DEBUG
3987 #ifdef DEBUG_METAGEMM_VERBOSE
3988   std::cout << __FILE__ << "(" << __LINE__
3989             << ") QuantizedStaticPreprocessedAsFloatRowMajor<uint8_t, float, "
3990                "QuantizedStaticPreprocessedAsFloat, RowMajor, 3, 3, "
3991                "8>::Multiply()"
3992             << std::endl
3993             << std::flush;
3994 #endif
3995 #endif
3996   asm volatile(
3997       "prfm pldl1keep, [%x[lhs]]\n"
3998       "prfm pldl1keep, [%x[rhs]]\n"
3999 
4000       // Clear aggregators.
4001       "movi v0.4s, #0\n"
4002       "movi v1.4s, #0\n"
4003       "movi v2.4s, #0\n"
4004       "mov v3.16b, v0.16b\n"
4005       "mov v4.16b, v1.16b\n"
4006       "mov v5.16b, v2.16b\n"
4007       "mov v6.16b, v3.16b\n"
4008       "mov v7.16b, v4.16b\n"
4009       "mov v8.16b, v5.16b\n"
4010 
4011       // 3x3 lanes loop.
4012       "1:"
4013 
4014       "ld1 {v12.8b, v13.8b, v14.8b}, [%x[rhs]], #24\n"
4015       "ld1 {v9.8b}, [%x[lhs]], #8\n"
4016       "umull v15.8h, v9.8b, v12.8b\n"
4017       "ld1 {v10.8b}, [%x[lhs]], #8\n"
4018       "umull v16.8h, v9.8b, v13.8b\n"
4019       "ld1 {v11.8b}, [%x[lhs]], #8\n"
4020       "umull v17.8h, v9.8b, v14.8b\n"
4021       "prfm pldl1keep, [%x[lhs], #64]\n"
4022       "umull v18.8h, v10.8b, v12.8b\n"
4023       "prfm pldl1keep, [%x[rhs], #64]\n"
4024       "uadalp v0.4s, v15.8h\n"
4025       "uadalp v1.4s, v16.8h\n"
4026       "uadalp v2.4s, v17.8h\n"
4027       "uadalp v3.4s, v18.8h\n"
4028       "umull v15.8h, v10.8b, v13.8b\n"
4029       "umull v16.8h, v10.8b, v14.8b\n"
4030       "umull v17.8h, v11.8b, v12.8b\n"
4031       "umull v18.8h, v11.8b, v13.8b\n"
4032 
4033       // Subtract counter.
4034       "subs %x[count], %x[count], #8\n"
4035 
4036       "umull v9.8h, v11.8b, v14.8b\n"
4037       "uadalp v4.4s, v15.8h\n"
4038       "uadalp v5.4s, v16.8h\n"
4039       "uadalp v6.4s, v17.8h\n"
4040       "uadalp v7.4s, v18.8h\n"
4041       "uadalp v8.4s, v9.8h\n"
4042 
4043       // Loop break.
4044       "bgt 1b\n"
4045 
4046       // StaticQuantizationFloat::Prepare
4047       "ld1 {v9.4s}, [%x[lhs]], #16\n"
4048       "ld1 {v10.4s}, [%x[rhs]], #16\n"
4049       "dup v11.4s, %w[scale]\n"
4050       "dup v12.4s, v9.s[0]\n"
4051       "dup v13.4s, v9.s[1]\n"
4052       "dup v9.4s, v9.s[2]\n"
4053 
4054       // RowMajorOutput::Prepare
4055       "add x0, %x[result], %x[stride]\n"
4056       "add x1, x0, %x[stride]\n"
4057 
4058       // Reduce aggregators.
4059       "addp v0.4s, v0.4s, v1.4s\n"
4060       "addp v2.4s, v2.4s, v2.4s\n"
4061       "addp v0.4s, v0.4s, v2.4s\n"
4062       "addp v3.4s, v3.4s, v4.4s\n"
4063       "addp v5.4s, v5.4s, v5.4s\n"
4064       "addp v3.4s, v3.4s, v5.4s\n"
4065       "addp v6.4s, v6.4s, v7.4s\n"
4066       "addp v8.4s, v8.4s, v8.4s\n"
4067       "addp v6.4s, v6.4s, v8.4s\n"
4068 
4069       // StaticQuantizationFloat::Transform
4070       "add v0.4s, v0.4s, v12.4s\n"
4071       "add v3.4s, v3.4s, v13.4s\n"
4072       "add v6.4s, v6.4s, v9.4s\n"
4073       "add v0.4s, v0.4s, v10.4s\n"
4074       "add v3.4s, v3.4s, v10.4s\n"
4075       "add v6.4s, v6.4s, v10.4s\n"
4076       "scvtf v0.4s, v0.4s\n"
4077       "scvtf v3.4s, v3.4s\n"
4078       "scvtf v6.4s, v6.4s\n"
4079       "fmul v0.4s, v0.4s, v11.4s\n"
4080       "fmul v3.4s, v3.4s, v11.4s\n"
4081       "fmul v6.4s, v6.4s, v11.4s\n"
4082 
4083       // RowMajorOutput::Output
4084       "st1 {v0.2s}, [%x[result]], #8\n"
4085       "st1 {v0.s}[2], [%x[result]], #4\n"
4086       "st1 {v3.2s}, [x0], #8\n"
4087       "st1 {v3.s}[2], [x0], #4\n"
4088       "st1 {v6.2s}, [x1], #8\n"
4089       "st1 {v6.s}[2], [x1], #4\n"
4090       : [rhs] "+r"(rhs), [lhs] "+r"(lhs), [result] "+r"(result)
4091       : [count] "r"(params.kernel.count),
4092         [stride] "r"(params.output_stream.stride),
4093         [scale] "r"(params.kernel.scale)
4094       : "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
4095         "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "cc",
4096         "memory");
4097 }
4098 
4099 }  // namespace meta
4100 }  // namespace gemmlowp
4101 
4102 #else
4103 #warning "Meta gemm for arm64 requires: GEMMLOWP_NEON_64!"
4104 #endif
4105 
4106 #endif  // GEMMLOWP_META_QUANTIZED_MUL_KERNELS_ARM_64_H_
4107