• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2016 The Gemmlowp Authors. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //     http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #ifndef GEMMLOWP_META_STREAMS_ARM_64_H_
16 #define GEMMLOWP_META_STREAMS_ARM_64_H_
17 
18 #ifdef GEMMLOWP_NEON_64
19 
20 #include <cassert>
21 #include <cstdint>
22 
23 namespace gemmlowp {
24 namespace meta {
25 
26 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)27 inline void Stream<uint8_t, 1, 8, 0, RowMajorWithSum>::Pack(
28     const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
29 #ifdef DEBUG
30 #ifdef DEBUG_METAGEMM_VERBOSE
31   std::cout << __FILE__ << "(" << __LINE__
32             << ") RowMajorWithSum<uint8_t, 1, 8, 0, RowMajorWithSum>::Pack()"
33             << std::endl
34             << std::flush;
35 #endif
36 #endif
37   int params_count_copy = params.count;
38   asm volatile(
39       "movi v8.8h, #0\n"
40 
41       "1:"
42       "subs %x[count], %x[count], #8\n"
43 
44       // Load Aggregate Store: 1x8.
45       "ld1 {v0.2s}, [%x[in]], #8\n"
46       "uaddw v8.8h, v8.8h, v0.8b\n"
47       "st1 {v0.2s}, [%x[out]], #8\n"
48 
49       "bne 1b\n"
50 
51       // Aggregator Reduction.
52       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
53       "dup v1.4s, %w[additive_sum_offset]\n"
54       "uaddlp v8.4s, v8.8h\n"
55       "addp v8.4s, v8.4s, v8.4s\n"
56       "addp v8.4s, v8.4s, v8.4s\n"
57       "mul v8.4s, v8.4s, v0.s[0]\n"
58       "add v8.4s, v8.4s, v1.4s\n"
59       "st1 {v8.4s}, [%x[out]]\n"
60       : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
61       : [stride] "r"(params.stride),
62         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
63         [additive_sum_offset] "r"(params.additive_sum_offset)
64       : "v8", "v0", "v1", "cc", "memory");
65 }
66 
67 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)68 inline void Stream<uint8_t, 1, 8, 1, RowMajorWithSum>::Pack(
69     const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
70 #ifdef DEBUG
71 #ifdef DEBUG_METAGEMM_VERBOSE
72   std::cout << __FILE__ << "(" << __LINE__
73             << ") RowMajorWithSum<uint8_t, 1, 8, 1, RowMajorWithSum>::Pack()"
74             << std::endl
75             << std::flush;
76 #endif
77 #endif
78   int params_count_copy = params.count;
79   asm volatile(
80       "movi v8.8h, #0\n"
81 
82       // Reduce count by leftovers.
83       "subs %x[count], %x[count], #1\n"
84       "beq 2f\n"
85 
86       "1:"
87       "subs %x[count], %x[count], #8\n"
88 
89       // Load Aggregate Store: 1x8.
90       "ld1 {v0.2s}, [%x[in]], #8\n"
91       "uaddw v8.8h, v8.8h, v0.8b\n"
92       "st1 {v0.2s}, [%x[out]], #8\n"
93 
94       "bne 1b\n"
95 
96       "2:"
97 
98       // Load Aggregate Store: 1x1.
99       "movi v0.8b, #0\n"
100       "ld1 {v0.b}[0], [%x[in]], #1\n"
101       "uaddw v8.8h, v8.8h, v0.8b\n"
102       "st1 {v0.2s}, [%x[out]], #8\n"
103 
104       // Aggregator Reduction.
105       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
106       "dup v1.4s, %w[additive_sum_offset]\n"
107       "uaddlp v8.4s, v8.8h\n"
108       "addp v8.4s, v8.4s, v8.4s\n"
109       "addp v8.4s, v8.4s, v8.4s\n"
110       "mul v8.4s, v8.4s, v0.s[0]\n"
111       "add v8.4s, v8.4s, v1.4s\n"
112       "st1 {v8.4s}, [%x[out]]\n"
113       : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
114       : [stride] "r"(params.stride),
115         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
116         [additive_sum_offset] "r"(params.additive_sum_offset)
117       : "v8", "v0", "v1", "cc", "memory");
118 }
119 
120 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)121 inline void Stream<uint8_t, 1, 8, 2, RowMajorWithSum>::Pack(
122     const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
123 #ifdef DEBUG
124 #ifdef DEBUG_METAGEMM_VERBOSE
125   std::cout << __FILE__ << "(" << __LINE__
126             << ") RowMajorWithSum<uint8_t, 1, 8, 2, RowMajorWithSum>::Pack()"
127             << std::endl
128             << std::flush;
129 #endif
130 #endif
131   int params_count_copy = params.count;
132   asm volatile(
133       "movi v8.8h, #0\n"
134 
135       // Reduce count by leftovers.
136       "subs %x[count], %x[count], #2\n"
137       "beq 2f\n"
138 
139       "1:"
140       "subs %x[count], %x[count], #8\n"
141 
142       // Load Aggregate Store: 1x8.
143       "ld1 {v0.2s}, [%x[in]], #8\n"
144       "uaddw v8.8h, v8.8h, v0.8b\n"
145       "st1 {v0.2s}, [%x[out]], #8\n"
146 
147       "bne 1b\n"
148 
149       "2:"
150 
151       // Load Aggregate Store: 1x2.
152       "movi v0.8b, #0\n"
153       "ld1 {v0.h}[0], [%x[in]], #2\n"
154       "uaddw v8.8h, v8.8h, v0.8b\n"
155       "st1 {v0.2s}, [%x[out]], #8\n"
156 
157       // Aggregator Reduction.
158       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
159       "dup v1.4s, %w[additive_sum_offset]\n"
160       "uaddlp v8.4s, v8.8h\n"
161       "addp v8.4s, v8.4s, v8.4s\n"
162       "addp v8.4s, v8.4s, v8.4s\n"
163       "mul v8.4s, v8.4s, v0.s[0]\n"
164       "add v8.4s, v8.4s, v1.4s\n"
165       "st1 {v8.4s}, [%x[out]]\n"
166       : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
167       : [stride] "r"(params.stride),
168         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
169         [additive_sum_offset] "r"(params.additive_sum_offset)
170       : "v8", "v0", "v1", "cc", "memory");
171 }
172 
173 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)174 inline void Stream<uint8_t, 1, 8, 3, RowMajorWithSum>::Pack(
175     const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
176 #ifdef DEBUG
177 #ifdef DEBUG_METAGEMM_VERBOSE
178   std::cout << __FILE__ << "(" << __LINE__
179             << ") RowMajorWithSum<uint8_t, 1, 8, 3, RowMajorWithSum>::Pack()"
180             << std::endl
181             << std::flush;
182 #endif
183 #endif
184   int params_count_copy = params.count;
185   asm volatile(
186       "movi v8.8h, #0\n"
187 
188       // Reduce count by leftovers.
189       "subs %x[count], %x[count], #3\n"
190       "beq 2f\n"
191 
192       "1:"
193       "subs %x[count], %x[count], #8\n"
194 
195       // Load Aggregate Store: 1x8.
196       "ld1 {v0.2s}, [%x[in]], #8\n"
197       "uaddw v8.8h, v8.8h, v0.8b\n"
198       "st1 {v0.2s}, [%x[out]], #8\n"
199 
200       "bne 1b\n"
201 
202       "2:"
203 
204       // Load Aggregate Store: 1x3.
205       "movi v0.8b, #0\n"
206       "ld1 {v0.h}[0], [%x[in]], #2\n"
207       "ld1 {v0.b}[2], [%x[in]], #1\n"
208       "uaddw v8.8h, v8.8h, v0.8b\n"
209       "st1 {v0.2s}, [%x[out]], #8\n"
210 
211       // Aggregator Reduction.
212       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
213       "dup v1.4s, %w[additive_sum_offset]\n"
214       "uaddlp v8.4s, v8.8h\n"
215       "addp v8.4s, v8.4s, v8.4s\n"
216       "addp v8.4s, v8.4s, v8.4s\n"
217       "mul v8.4s, v8.4s, v0.s[0]\n"
218       "add v8.4s, v8.4s, v1.4s\n"
219       "st1 {v8.4s}, [%x[out]]\n"
220       : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
221       : [stride] "r"(params.stride),
222         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
223         [additive_sum_offset] "r"(params.additive_sum_offset)
224       : "v8", "v0", "v1", "cc", "memory");
225 }
226 
227 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)228 inline void Stream<uint8_t, 1, 8, 4, RowMajorWithSum>::Pack(
229     const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
230 #ifdef DEBUG
231 #ifdef DEBUG_METAGEMM_VERBOSE
232   std::cout << __FILE__ << "(" << __LINE__
233             << ") RowMajorWithSum<uint8_t, 1, 8, 4, RowMajorWithSum>::Pack()"
234             << std::endl
235             << std::flush;
236 #endif
237 #endif
238   int params_count_copy = params.count;
239   asm volatile(
240       "movi v8.8h, #0\n"
241 
242       // Reduce count by leftovers.
243       "subs %x[count], %x[count], #4\n"
244       "beq 2f\n"
245 
246       "1:"
247       "subs %x[count], %x[count], #8\n"
248 
249       // Load Aggregate Store: 1x8.
250       "ld1 {v0.2s}, [%x[in]], #8\n"
251       "uaddw v8.8h, v8.8h, v0.8b\n"
252       "st1 {v0.2s}, [%x[out]], #8\n"
253 
254       "bne 1b\n"
255 
256       "2:"
257 
258       // Load Aggregate Store: 1x4.
259       "movi v0.8b, #0\n"
260       "ld1 {v0.s}[0], [%x[in]], #4\n"
261       "uaddw v8.8h, v8.8h, v0.8b\n"
262       "st1 {v0.2s}, [%x[out]], #8\n"
263 
264       // Aggregator Reduction.
265       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
266       "dup v1.4s, %w[additive_sum_offset]\n"
267       "uaddlp v8.4s, v8.8h\n"
268       "addp v8.4s, v8.4s, v8.4s\n"
269       "addp v8.4s, v8.4s, v8.4s\n"
270       "mul v8.4s, v8.4s, v0.s[0]\n"
271       "add v8.4s, v8.4s, v1.4s\n"
272       "st1 {v8.4s}, [%x[out]]\n"
273       : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
274       : [stride] "r"(params.stride),
275         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
276         [additive_sum_offset] "r"(params.additive_sum_offset)
277       : "v8", "v0", "v1", "cc", "memory");
278 }
279 
280 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)281 inline void Stream<uint8_t, 1, 8, 5, RowMajorWithSum>::Pack(
282     const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
283 #ifdef DEBUG
284 #ifdef DEBUG_METAGEMM_VERBOSE
285   std::cout << __FILE__ << "(" << __LINE__
286             << ") RowMajorWithSum<uint8_t, 1, 8, 5, RowMajorWithSum>::Pack()"
287             << std::endl
288             << std::flush;
289 #endif
290 #endif
291   int params_count_copy = params.count;
292   asm volatile(
293       "movi v8.8h, #0\n"
294 
295       // Reduce count by leftovers.
296       "subs %x[count], %x[count], #5\n"
297       "beq 2f\n"
298 
299       "1:"
300       "subs %x[count], %x[count], #8\n"
301 
302       // Load Aggregate Store: 1x8.
303       "ld1 {v0.2s}, [%x[in]], #8\n"
304       "uaddw v8.8h, v8.8h, v0.8b\n"
305       "st1 {v0.2s}, [%x[out]], #8\n"
306 
307       "bne 1b\n"
308 
309       "2:"
310 
311       // Load Aggregate Store: 1x5.
312       "movi v0.8b, #0\n"
313       "ld1 {v0.s}[0], [%x[in]], #4\n"
314       "ld1 {v0.b}[4], [%x[in]], #1\n"
315       "uaddw v8.8h, v8.8h, v0.8b\n"
316       "st1 {v0.2s}, [%x[out]], #8\n"
317 
318       // Aggregator Reduction.
319       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
320       "dup v1.4s, %w[additive_sum_offset]\n"
321       "uaddlp v8.4s, v8.8h\n"
322       "addp v8.4s, v8.4s, v8.4s\n"
323       "addp v8.4s, v8.4s, v8.4s\n"
324       "mul v8.4s, v8.4s, v0.s[0]\n"
325       "add v8.4s, v8.4s, v1.4s\n"
326       "st1 {v8.4s}, [%x[out]]\n"
327       : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
328       : [stride] "r"(params.stride),
329         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
330         [additive_sum_offset] "r"(params.additive_sum_offset)
331       : "v8", "v0", "v1", "cc", "memory");
332 }
333 
334 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)335 inline void Stream<uint8_t, 1, 8, 6, RowMajorWithSum>::Pack(
336     const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
337 #ifdef DEBUG
338 #ifdef DEBUG_METAGEMM_VERBOSE
339   std::cout << __FILE__ << "(" << __LINE__
340             << ") RowMajorWithSum<uint8_t, 1, 8, 6, RowMajorWithSum>::Pack()"
341             << std::endl
342             << std::flush;
343 #endif
344 #endif
345   int params_count_copy = params.count;
346   asm volatile(
347       "movi v8.8h, #0\n"
348 
349       // Reduce count by leftovers.
350       "subs %x[count], %x[count], #6\n"
351       "beq 2f\n"
352 
353       "1:"
354       "subs %x[count], %x[count], #8\n"
355 
356       // Load Aggregate Store: 1x8.
357       "ld1 {v0.2s}, [%x[in]], #8\n"
358       "uaddw v8.8h, v8.8h, v0.8b\n"
359       "st1 {v0.2s}, [%x[out]], #8\n"
360 
361       "bne 1b\n"
362 
363       "2:"
364 
365       // Load Aggregate Store: 1x6.
366       "movi v0.8b, #0\n"
367       "ld1 {v0.s}[0], [%x[in]], #4\n"
368       "ld1 {v0.h}[2], [%x[in]], #2\n"
369       "uaddw v8.8h, v8.8h, v0.8b\n"
370       "st1 {v0.2s}, [%x[out]], #8\n"
371 
372       // Aggregator Reduction.
373       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
374       "dup v1.4s, %w[additive_sum_offset]\n"
375       "uaddlp v8.4s, v8.8h\n"
376       "addp v8.4s, v8.4s, v8.4s\n"
377       "addp v8.4s, v8.4s, v8.4s\n"
378       "mul v8.4s, v8.4s, v0.s[0]\n"
379       "add v8.4s, v8.4s, v1.4s\n"
380       "st1 {v8.4s}, [%x[out]]\n"
381       : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
382       : [stride] "r"(params.stride),
383         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
384         [additive_sum_offset] "r"(params.additive_sum_offset)
385       : "v8", "v0", "v1", "cc", "memory");
386 }
387 
388 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)389 inline void Stream<uint8_t, 1, 8, 7, RowMajorWithSum>::Pack(
390     const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
391 #ifdef DEBUG
392 #ifdef DEBUG_METAGEMM_VERBOSE
393   std::cout << __FILE__ << "(" << __LINE__
394             << ") RowMajorWithSum<uint8_t, 1, 8, 7, RowMajorWithSum>::Pack()"
395             << std::endl
396             << std::flush;
397 #endif
398 #endif
399   int params_count_copy = params.count;
400   asm volatile(
401       "movi v8.8h, #0\n"
402 
403       // Reduce count by leftovers.
404       "subs %x[count], %x[count], #7\n"
405       "beq 2f\n"
406 
407       "1:"
408       "subs %x[count], %x[count], #8\n"
409 
410       // Load Aggregate Store: 1x8.
411       "ld1 {v0.2s}, [%x[in]], #8\n"
412       "uaddw v8.8h, v8.8h, v0.8b\n"
413       "st1 {v0.2s}, [%x[out]], #8\n"
414 
415       "bne 1b\n"
416 
417       "2:"
418 
419       // Load Aggregate Store: 1x7.
420       "movi v0.8b, #0\n"
421       "ld1 {v0.s}[0], [%x[in]], #4\n"
422       "ld1 {v0.h}[2], [%x[in]], #2\n"
423       "ld1 {v0.b}[6], [%x[in]], #1\n"
424       "uaddw v8.8h, v8.8h, v0.8b\n"
425       "st1 {v0.2s}, [%x[out]], #8\n"
426 
427       // Aggregator Reduction.
428       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
429       "dup v1.4s, %w[additive_sum_offset]\n"
430       "uaddlp v8.4s, v8.8h\n"
431       "addp v8.4s, v8.4s, v8.4s\n"
432       "addp v8.4s, v8.4s, v8.4s\n"
433       "mul v8.4s, v8.4s, v0.s[0]\n"
434       "add v8.4s, v8.4s, v1.4s\n"
435       "st1 {v8.4s}, [%x[out]]\n"
436       : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
437       : [stride] "r"(params.stride),
438         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
439         [additive_sum_offset] "r"(params.additive_sum_offset)
440       : "v8", "v0", "v1", "cc", "memory");
441 }
442 
443 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)444 inline void Stream<uint8_t, 2, 8, 0, RowMajorWithSum>::Pack(
445     const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
446 #ifdef DEBUG
447 #ifdef DEBUG_METAGEMM_VERBOSE
448   std::cout << __FILE__ << "(" << __LINE__
449             << ") RowMajorWithSum<uint8_t, 2, 8, 0, RowMajorWithSum>::Pack()"
450             << std::endl
451             << std::flush;
452 #endif
453 #endif
454   int params_count_copy = params.count;
455   asm volatile(
456       "add x0, %x[in], %x[stride]\n"
457       "movi v8.8h, #0\n"
458       "movi v9.8h, #0\n"
459 
460       "1:"
461       "subs %x[count], %x[count], #8\n"
462 
463       // Load Aggregate Store: 2x8.
464       "ld1 {v0.2s}, [%x[in]], #8\n"
465       "ld1 {v1.2s}, [x0], #8\n"
466       "uaddw v8.8h, v8.8h, v0.8b\n"
467       "uaddw v9.8h, v9.8h, v1.8b\n"
468       "st1 {v0.2s, v1.2s}, [%x[out]], #16\n"
469 
470       "bne 1b\n"
471 
472       // Aggregator Reduction.
473       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
474       "dup v1.4s, %w[additive_sum_offset]\n"
475       "uaddlp v8.4s, v8.8h\n"
476       "uaddlp v9.4s, v9.8h\n"
477       "addp v8.4s, v8.4s, v9.4s\n"
478       "addp v8.4s, v8.4s, v8.4s\n"
479       "mul v8.4s, v8.4s, v0.s[0]\n"
480       "add v8.4s, v8.4s, v1.4s\n"
481       "st1 {v8.4s}, [%x[out]]\n"
482       : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
483       : [stride] "r"(params.stride),
484         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
485         [additive_sum_offset] "r"(params.additive_sum_offset)
486       : "x0", "v8", "v9", "v0", "v1", "cc", "memory");
487 }
488 
489 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)490 inline void Stream<uint8_t, 2, 8, 1, RowMajorWithSum>::Pack(
491     const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
492 #ifdef DEBUG
493 #ifdef DEBUG_METAGEMM_VERBOSE
494   std::cout << __FILE__ << "(" << __LINE__
495             << ") RowMajorWithSum<uint8_t, 2, 8, 1, RowMajorWithSum>::Pack()"
496             << std::endl
497             << std::flush;
498 #endif
499 #endif
500   int params_count_copy = params.count;
501   asm volatile(
502       "add x0, %x[in], %x[stride]\n"
503       "movi v8.8h, #0\n"
504       "movi v9.8h, #0\n"
505 
506       // Reduce count by leftovers.
507       "subs %x[count], %x[count], #1\n"
508       "beq 2f\n"
509 
510       "1:"
511       "subs %x[count], %x[count], #8\n"
512 
513       // Load Aggregate Store: 2x8.
514       "ld1 {v0.2s}, [%x[in]], #8\n"
515       "ld1 {v1.2s}, [x0], #8\n"
516       "uaddw v8.8h, v8.8h, v0.8b\n"
517       "uaddw v9.8h, v9.8h, v1.8b\n"
518       "st1 {v0.2s, v1.2s}, [%x[out]], #16\n"
519 
520       "bne 1b\n"
521 
522       "2:"
523 
524       // Load Aggregate Store: 2x1.
525       "movi v0.8b, #0\n"
526       "movi v1.8b, #0\n"
527       "ld1 {v0.b}[0], [%x[in]], #1\n"
528       "ld1 {v1.b}[0], [x0], #1\n"
529       "uaddw v8.8h, v8.8h, v0.8b\n"
530       "uaddw v9.8h, v9.8h, v1.8b\n"
531       "st1 {v0.2s, v1.2s}, [%x[out]], #16\n"
532 
533       // Aggregator Reduction.
534       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
535       "dup v1.4s, %w[additive_sum_offset]\n"
536       "uaddlp v8.4s, v8.8h\n"
537       "uaddlp v9.4s, v9.8h\n"
538       "addp v8.4s, v8.4s, v9.4s\n"
539       "addp v8.4s, v8.4s, v8.4s\n"
540       "mul v8.4s, v8.4s, v0.s[0]\n"
541       "add v8.4s, v8.4s, v1.4s\n"
542       "st1 {v8.4s}, [%x[out]]\n"
543       : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
544       : [stride] "r"(params.stride),
545         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
546         [additive_sum_offset] "r"(params.additive_sum_offset)
547       : "x0", "v8", "v9", "v0", "v1", "cc", "memory");
548 }
549 
550 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)551 inline void Stream<uint8_t, 2, 8, 2, RowMajorWithSum>::Pack(
552     const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
553 #ifdef DEBUG
554 #ifdef DEBUG_METAGEMM_VERBOSE
555   std::cout << __FILE__ << "(" << __LINE__
556             << ") RowMajorWithSum<uint8_t, 2, 8, 2, RowMajorWithSum>::Pack()"
557             << std::endl
558             << std::flush;
559 #endif
560 #endif
561   int params_count_copy = params.count;
562   asm volatile(
563       "add x0, %x[in], %x[stride]\n"
564       "movi v8.8h, #0\n"
565       "movi v9.8h, #0\n"
566 
567       // Reduce count by leftovers.
568       "subs %x[count], %x[count], #2\n"
569       "beq 2f\n"
570 
571       "1:"
572       "subs %x[count], %x[count], #8\n"
573 
574       // Load Aggregate Store: 2x8.
575       "ld1 {v0.2s}, [%x[in]], #8\n"
576       "ld1 {v1.2s}, [x0], #8\n"
577       "uaddw v8.8h, v8.8h, v0.8b\n"
578       "uaddw v9.8h, v9.8h, v1.8b\n"
579       "st1 {v0.2s, v1.2s}, [%x[out]], #16\n"
580 
581       "bne 1b\n"
582 
583       "2:"
584 
585       // Load Aggregate Store: 2x2.
586       "movi v0.8b, #0\n"
587       "movi v1.8b, #0\n"
588       "ld1 {v0.h}[0], [%x[in]], #2\n"
589       "ld1 {v1.h}[0], [x0], #2\n"
590       "uaddw v8.8h, v8.8h, v0.8b\n"
591       "uaddw v9.8h, v9.8h, v1.8b\n"
592       "st1 {v0.2s, v1.2s}, [%x[out]], #16\n"
593 
594       // Aggregator Reduction.
595       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
596       "dup v1.4s, %w[additive_sum_offset]\n"
597       "uaddlp v8.4s, v8.8h\n"
598       "uaddlp v9.4s, v9.8h\n"
599       "addp v8.4s, v8.4s, v9.4s\n"
600       "addp v8.4s, v8.4s, v8.4s\n"
601       "mul v8.4s, v8.4s, v0.s[0]\n"
602       "add v8.4s, v8.4s, v1.4s\n"
603       "st1 {v8.4s}, [%x[out]]\n"
604       : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
605       : [stride] "r"(params.stride),
606         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
607         [additive_sum_offset] "r"(params.additive_sum_offset)
608       : "x0", "v8", "v9", "v0", "v1", "cc", "memory");
609 }
610 
611 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)612 inline void Stream<uint8_t, 2, 8, 3, RowMajorWithSum>::Pack(
613     const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
614 #ifdef DEBUG
615 #ifdef DEBUG_METAGEMM_VERBOSE
616   std::cout << __FILE__ << "(" << __LINE__
617             << ") RowMajorWithSum<uint8_t, 2, 8, 3, RowMajorWithSum>::Pack()"
618             << std::endl
619             << std::flush;
620 #endif
621 #endif
622   int params_count_copy = params.count;
623   asm volatile(
624       "add x0, %x[in], %x[stride]\n"
625       "movi v8.8h, #0\n"
626       "movi v9.8h, #0\n"
627 
628       // Reduce count by leftovers.
629       "subs %x[count], %x[count], #3\n"
630       "beq 2f\n"
631 
632       "1:"
633       "subs %x[count], %x[count], #8\n"
634 
635       // Load Aggregate Store: 2x8.
636       "ld1 {v0.2s}, [%x[in]], #8\n"
637       "ld1 {v1.2s}, [x0], #8\n"
638       "uaddw v8.8h, v8.8h, v0.8b\n"
639       "uaddw v9.8h, v9.8h, v1.8b\n"
640       "st1 {v0.2s, v1.2s}, [%x[out]], #16\n"
641 
642       "bne 1b\n"
643 
644       "2:"
645 
646       // Load Aggregate Store: 2x3.
647       "movi v0.8b, #0\n"
648       "movi v1.8b, #0\n"
649       "ld1 {v0.h}[0], [%x[in]], #2\n"
650       "ld1 {v0.b}[2], [%x[in]], #1\n"
651       "ld1 {v1.h}[0], [x0], #2\n"
652       "ld1 {v1.b}[2], [x0], #1\n"
653       "uaddw v8.8h, v8.8h, v0.8b\n"
654       "uaddw v9.8h, v9.8h, v1.8b\n"
655       "st1 {v0.2s, v1.2s}, [%x[out]], #16\n"
656 
657       // Aggregator Reduction.
658       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
659       "dup v1.4s, %w[additive_sum_offset]\n"
660       "uaddlp v8.4s, v8.8h\n"
661       "uaddlp v9.4s, v9.8h\n"
662       "addp v8.4s, v8.4s, v9.4s\n"
663       "addp v8.4s, v8.4s, v8.4s\n"
664       "mul v8.4s, v8.4s, v0.s[0]\n"
665       "add v8.4s, v8.4s, v1.4s\n"
666       "st1 {v8.4s}, [%x[out]]\n"
667       : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
668       : [stride] "r"(params.stride),
669         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
670         [additive_sum_offset] "r"(params.additive_sum_offset)
671       : "x0", "v8", "v9", "v0", "v1", "cc", "memory");
672 }
673 
674 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)675 inline void Stream<uint8_t, 2, 8, 4, RowMajorWithSum>::Pack(
676     const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
677 #ifdef DEBUG
678 #ifdef DEBUG_METAGEMM_VERBOSE
679   std::cout << __FILE__ << "(" << __LINE__
680             << ") RowMajorWithSum<uint8_t, 2, 8, 4, RowMajorWithSum>::Pack()"
681             << std::endl
682             << std::flush;
683 #endif
684 #endif
685   int params_count_copy = params.count;
686   asm volatile(
687       "add x0, %x[in], %x[stride]\n"
688       "movi v8.8h, #0\n"
689       "movi v9.8h, #0\n"
690 
691       // Reduce count by leftovers.
692       "subs %x[count], %x[count], #4\n"
693       "beq 2f\n"
694 
695       "1:"
696       "subs %x[count], %x[count], #8\n"
697 
698       // Load Aggregate Store: 2x8.
699       "ld1 {v0.2s}, [%x[in]], #8\n"
700       "ld1 {v1.2s}, [x0], #8\n"
701       "uaddw v8.8h, v8.8h, v0.8b\n"
702       "uaddw v9.8h, v9.8h, v1.8b\n"
703       "st1 {v0.2s, v1.2s}, [%x[out]], #16\n"
704 
705       "bne 1b\n"
706 
707       "2:"
708 
709       // Load Aggregate Store: 2x4.
710       "movi v0.8b, #0\n"
711       "movi v1.8b, #0\n"
712       "ld1 {v0.s}[0], [%x[in]], #4\n"
713       "ld1 {v1.s}[0], [x0], #4\n"
714       "uaddw v8.8h, v8.8h, v0.8b\n"
715       "uaddw v9.8h, v9.8h, v1.8b\n"
716       "st1 {v0.2s, v1.2s}, [%x[out]], #16\n"
717 
718       // Aggregator Reduction.
719       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
720       "dup v1.4s, %w[additive_sum_offset]\n"
721       "uaddlp v8.4s, v8.8h\n"
722       "uaddlp v9.4s, v9.8h\n"
723       "addp v8.4s, v8.4s, v9.4s\n"
724       "addp v8.4s, v8.4s, v8.4s\n"
725       "mul v8.4s, v8.4s, v0.s[0]\n"
726       "add v8.4s, v8.4s, v1.4s\n"
727       "st1 {v8.4s}, [%x[out]]\n"
728       : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
729       : [stride] "r"(params.stride),
730         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
731         [additive_sum_offset] "r"(params.additive_sum_offset)
732       : "x0", "v8", "v9", "v0", "v1", "cc", "memory");
733 }
734 
735 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)736 inline void Stream<uint8_t, 2, 8, 5, RowMajorWithSum>::Pack(
737     const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
738 #ifdef DEBUG
739 #ifdef DEBUG_METAGEMM_VERBOSE
740   std::cout << __FILE__ << "(" << __LINE__
741             << ") RowMajorWithSum<uint8_t, 2, 8, 5, RowMajorWithSum>::Pack()"
742             << std::endl
743             << std::flush;
744 #endif
745 #endif
746   int params_count_copy = params.count;
747   asm volatile(
748       "add x0, %x[in], %x[stride]\n"
749       "movi v8.8h, #0\n"
750       "movi v9.8h, #0\n"
751 
752       // Reduce count by leftovers.
753       "subs %x[count], %x[count], #5\n"
754       "beq 2f\n"
755 
756       "1:"
757       "subs %x[count], %x[count], #8\n"
758 
759       // Load Aggregate Store: 2x8.
760       "ld1 {v0.2s}, [%x[in]], #8\n"
761       "ld1 {v1.2s}, [x0], #8\n"
762       "uaddw v8.8h, v8.8h, v0.8b\n"
763       "uaddw v9.8h, v9.8h, v1.8b\n"
764       "st1 {v0.2s, v1.2s}, [%x[out]], #16\n"
765 
766       "bne 1b\n"
767 
768       "2:"
769 
770       // Load Aggregate Store: 2x5.
771       "movi v0.8b, #0\n"
772       "movi v1.8b, #0\n"
773       "ld1 {v0.s}[0], [%x[in]], #4\n"
774       "ld1 {v0.b}[4], [%x[in]], #1\n"
775       "ld1 {v1.s}[0], [x0], #4\n"
776       "ld1 {v1.b}[4], [x0], #1\n"
777       "uaddw v8.8h, v8.8h, v0.8b\n"
778       "uaddw v9.8h, v9.8h, v1.8b\n"
779       "st1 {v0.2s, v1.2s}, [%x[out]], #16\n"
780 
781       // Aggregator Reduction.
782       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
783       "dup v1.4s, %w[additive_sum_offset]\n"
784       "uaddlp v8.4s, v8.8h\n"
785       "uaddlp v9.4s, v9.8h\n"
786       "addp v8.4s, v8.4s, v9.4s\n"
787       "addp v8.4s, v8.4s, v8.4s\n"
788       "mul v8.4s, v8.4s, v0.s[0]\n"
789       "add v8.4s, v8.4s, v1.4s\n"
790       "st1 {v8.4s}, [%x[out]]\n"
791       : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
792       : [stride] "r"(params.stride),
793         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
794         [additive_sum_offset] "r"(params.additive_sum_offset)
795       : "x0", "v8", "v9", "v0", "v1", "cc", "memory");
796 }
797 
798 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)799 inline void Stream<uint8_t, 2, 8, 6, RowMajorWithSum>::Pack(
800     const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
801 #ifdef DEBUG
802 #ifdef DEBUG_METAGEMM_VERBOSE
803   std::cout << __FILE__ << "(" << __LINE__
804             << ") RowMajorWithSum<uint8_t, 2, 8, 6, RowMajorWithSum>::Pack()"
805             << std::endl
806             << std::flush;
807 #endif
808 #endif
809   int params_count_copy = params.count;
810   asm volatile(
811       "add x0, %x[in], %x[stride]\n"
812       "movi v8.8h, #0\n"
813       "movi v9.8h, #0\n"
814 
815       // Reduce count by leftovers.
816       "subs %x[count], %x[count], #6\n"
817       "beq 2f\n"
818 
819       "1:"
820       "subs %x[count], %x[count], #8\n"
821 
822       // Load Aggregate Store: 2x8.
823       "ld1 {v0.2s}, [%x[in]], #8\n"
824       "ld1 {v1.2s}, [x0], #8\n"
825       "uaddw v8.8h, v8.8h, v0.8b\n"
826       "uaddw v9.8h, v9.8h, v1.8b\n"
827       "st1 {v0.2s, v1.2s}, [%x[out]], #16\n"
828 
829       "bne 1b\n"
830 
831       "2:"
832 
833       // Load Aggregate Store: 2x6.
834       "movi v0.8b, #0\n"
835       "movi v1.8b, #0\n"
836       "ld1 {v0.s}[0], [%x[in]], #4\n"
837       "ld1 {v0.h}[2], [%x[in]], #2\n"
838       "ld1 {v1.s}[0], [x0], #4\n"
839       "ld1 {v1.h}[2], [x0], #2\n"
840       "uaddw v8.8h, v8.8h, v0.8b\n"
841       "uaddw v9.8h, v9.8h, v1.8b\n"
842       "st1 {v0.2s, v1.2s}, [%x[out]], #16\n"
843 
844       // Aggregator Reduction.
845       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
846       "dup v1.4s, %w[additive_sum_offset]\n"
847       "uaddlp v8.4s, v8.8h\n"
848       "uaddlp v9.4s, v9.8h\n"
849       "addp v8.4s, v8.4s, v9.4s\n"
850       "addp v8.4s, v8.4s, v8.4s\n"
851       "mul v8.4s, v8.4s, v0.s[0]\n"
852       "add v8.4s, v8.4s, v1.4s\n"
853       "st1 {v8.4s}, [%x[out]]\n"
854       : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
855       : [stride] "r"(params.stride),
856         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
857         [additive_sum_offset] "r"(params.additive_sum_offset)
858       : "x0", "v8", "v9", "v0", "v1", "cc", "memory");
859 }
860 
861 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)862 inline void Stream<uint8_t, 2, 8, 7, RowMajorWithSum>::Pack(
863     const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
864 #ifdef DEBUG
865 #ifdef DEBUG_METAGEMM_VERBOSE
866   std::cout << __FILE__ << "(" << __LINE__
867             << ") RowMajorWithSum<uint8_t, 2, 8, 7, RowMajorWithSum>::Pack()"
868             << std::endl
869             << std::flush;
870 #endif
871 #endif
872   int params_count_copy = params.count;
873   asm volatile(
874       "add x0, %x[in], %x[stride]\n"
875       "movi v8.8h, #0\n"
876       "movi v9.8h, #0\n"
877 
878       // Reduce count by leftovers.
879       "subs %x[count], %x[count], #7\n"
880       "beq 2f\n"
881 
882       "1:"
883       "subs %x[count], %x[count], #8\n"
884 
885       // Load Aggregate Store: 2x8.
886       "ld1 {v0.2s}, [%x[in]], #8\n"
887       "ld1 {v1.2s}, [x0], #8\n"
888       "uaddw v8.8h, v8.8h, v0.8b\n"
889       "uaddw v9.8h, v9.8h, v1.8b\n"
890       "st1 {v0.2s, v1.2s}, [%x[out]], #16\n"
891 
892       "bne 1b\n"
893 
894       "2:"
895 
896       // Load Aggregate Store: 2x7.
897       "movi v0.8b, #0\n"
898       "movi v1.8b, #0\n"
899       "ld1 {v0.s}[0], [%x[in]], #4\n"
900       "ld1 {v0.h}[2], [%x[in]], #2\n"
901       "ld1 {v0.b}[6], [%x[in]], #1\n"
902       "ld1 {v1.s}[0], [x0], #4\n"
903       "ld1 {v1.h}[2], [x0], #2\n"
904       "ld1 {v1.b}[6], [x0], #1\n"
905       "uaddw v8.8h, v8.8h, v0.8b\n"
906       "uaddw v9.8h, v9.8h, v1.8b\n"
907       "st1 {v0.2s, v1.2s}, [%x[out]], #16\n"
908 
909       // Aggregator Reduction.
910       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
911       "dup v1.4s, %w[additive_sum_offset]\n"
912       "uaddlp v8.4s, v8.8h\n"
913       "uaddlp v9.4s, v9.8h\n"
914       "addp v8.4s, v8.4s, v9.4s\n"
915       "addp v8.4s, v8.4s, v8.4s\n"
916       "mul v8.4s, v8.4s, v0.s[0]\n"
917       "add v8.4s, v8.4s, v1.4s\n"
918       "st1 {v8.4s}, [%x[out]]\n"
919       : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
920       : [stride] "r"(params.stride),
921         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
922         [additive_sum_offset] "r"(params.additive_sum_offset)
923       : "x0", "v8", "v9", "v0", "v1", "cc", "memory");
924 }
925 
926 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)927 inline void Stream<uint8_t, 3, 8, 0, RowMajorWithSum>::Pack(
928     const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
929 #ifdef DEBUG
930 #ifdef DEBUG_METAGEMM_VERBOSE
931   std::cout << __FILE__ << "(" << __LINE__
932             << ") RowMajorWithSum<uint8_t, 3, 8, 0, RowMajorWithSum>::Pack()"
933             << std::endl
934             << std::flush;
935 #endif
936 #endif
937   int params_count_copy = params.count;
938   asm volatile(
939       "add x0, %x[in], %x[stride]\n"
940       "add x1, x0, %x[stride]\n"
941       "movi v8.8h, #0\n"
942       "movi v9.8h, #0\n"
943       "movi v10.8h, #0\n"
944 
945       "1:"
946       "subs %x[count], %x[count], #8\n"
947 
948       // Load Aggregate Store: 3x8.
949       "ld1 {v0.2s}, [%x[in]], #8\n"
950       "ld1 {v1.2s}, [x0], #8\n"
951       "ld1 {v2.2s}, [x1], #8\n"
952       "uaddw v8.8h, v8.8h, v0.8b\n"
953       "uaddw v9.8h, v9.8h, v1.8b\n"
954       "uaddw v10.8h, v10.8h, v2.8b\n"
955       "st1 {v0.2s, v1.2s, v2.2s}, [%x[out]], #24\n"
956 
957       "bne 1b\n"
958 
959       // Aggregator Reduction.
960       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
961       "dup v1.4s, %w[additive_sum_offset]\n"
962       "uaddlp v8.4s, v8.8h\n"
963       "uaddlp v9.4s, v9.8h\n"
964       "uaddlp v10.4s, v10.8h\n"
965       "addp v8.4s, v8.4s, v9.4s\n"
966       "addp v10.4s, v10.4s, v10.4s\n"
967       "addp v8.4s, v8.4s, v10.4s\n"
968       "mul v8.4s, v8.4s, v0.s[0]\n"
969       "add v8.4s, v8.4s, v1.4s\n"
970       "st1 {v8.4s}, [%x[out]]\n"
971       : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
972       : [stride] "r"(params.stride),
973         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
974         [additive_sum_offset] "r"(params.additive_sum_offset)
975       : "x0", "x1", "v0", "v1", "v2", "v8", "v9", "v10", "cc", "memory");
976 }
977 
978 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)979 inline void Stream<uint8_t, 3, 8, 1, RowMajorWithSum>::Pack(
980     const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
981 #ifdef DEBUG
982 #ifdef DEBUG_METAGEMM_VERBOSE
983   std::cout << __FILE__ << "(" << __LINE__
984             << ") RowMajorWithSum<uint8_t, 3, 8, 1, RowMajorWithSum>::Pack()"
985             << std::endl
986             << std::flush;
987 #endif
988 #endif
989   int params_count_copy = params.count;
990   asm volatile(
991       "add x0, %x[in], %x[stride]\n"
992       "add x1, x0, %x[stride]\n"
993       "movi v8.8h, #0\n"
994       "movi v9.8h, #0\n"
995       "movi v10.8h, #0\n"
996 
997       // Reduce count by leftovers.
998       "subs %x[count], %x[count], #1\n"
999       "beq 2f\n"
1000 
1001       "1:"
1002       "subs %x[count], %x[count], #8\n"
1003 
1004       // Load Aggregate Store: 3x8.
1005       "ld1 {v0.2s}, [%x[in]], #8\n"
1006       "ld1 {v1.2s}, [x0], #8\n"
1007       "ld1 {v2.2s}, [x1], #8\n"
1008       "uaddw v8.8h, v8.8h, v0.8b\n"
1009       "uaddw v9.8h, v9.8h, v1.8b\n"
1010       "uaddw v10.8h, v10.8h, v2.8b\n"
1011       "st1 {v0.2s, v1.2s, v2.2s}, [%x[out]], #24\n"
1012 
1013       "bne 1b\n"
1014 
1015       "2:"
1016 
1017       // Load Aggregate Store: 3x1.
1018       "movi v0.8b, #0\n"
1019       "movi v1.8b, #0\n"
1020       "movi v2.8b, #0\n"
1021       "ld1 {v0.b}[0], [%x[in]], #1\n"
1022       "ld1 {v1.b}[0], [x0], #1\n"
1023       "ld1 {v2.b}[0], [x1], #1\n"
1024       "uaddw v8.8h, v8.8h, v0.8b\n"
1025       "uaddw v9.8h, v9.8h, v1.8b\n"
1026       "uaddw v10.8h, v10.8h, v2.8b\n"
1027       "st1 {v0.2s, v1.2s, v2.2s}, [%x[out]], #24\n"
1028 
1029       // Aggregator Reduction.
1030       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
1031       "dup v1.4s, %w[additive_sum_offset]\n"
1032       "uaddlp v8.4s, v8.8h\n"
1033       "uaddlp v9.4s, v9.8h\n"
1034       "uaddlp v10.4s, v10.8h\n"
1035       "addp v8.4s, v8.4s, v9.4s\n"
1036       "addp v10.4s, v10.4s, v10.4s\n"
1037       "addp v8.4s, v8.4s, v10.4s\n"
1038       "mul v8.4s, v8.4s, v0.s[0]\n"
1039       "add v8.4s, v8.4s, v1.4s\n"
1040       "st1 {v8.4s}, [%x[out]]\n"
1041       : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
1042       : [stride] "r"(params.stride),
1043         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
1044         [additive_sum_offset] "r"(params.additive_sum_offset)
1045       : "x0", "x1", "v0", "v1", "v2", "v8", "v9", "v10", "cc", "memory");
1046 }
1047 
1048 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)1049 inline void Stream<uint8_t, 3, 8, 2, RowMajorWithSum>::Pack(
1050     const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
1051 #ifdef DEBUG
1052 #ifdef DEBUG_METAGEMM_VERBOSE
1053   std::cout << __FILE__ << "(" << __LINE__
1054             << ") RowMajorWithSum<uint8_t, 3, 8, 2, RowMajorWithSum>::Pack()"
1055             << std::endl
1056             << std::flush;
1057 #endif
1058 #endif
1059   int params_count_copy = params.count;
1060   asm volatile(
1061       "add x0, %x[in], %x[stride]\n"
1062       "add x1, x0, %x[stride]\n"
1063       "movi v8.8h, #0\n"
1064       "movi v9.8h, #0\n"
1065       "movi v10.8h, #0\n"
1066 
1067       // Reduce count by leftovers.
1068       "subs %x[count], %x[count], #2\n"
1069       "beq 2f\n"
1070 
1071       "1:"
1072       "subs %x[count], %x[count], #8\n"
1073 
1074       // Load Aggregate Store: 3x8.
1075       "ld1 {v0.2s}, [%x[in]], #8\n"
1076       "ld1 {v1.2s}, [x0], #8\n"
1077       "ld1 {v2.2s}, [x1], #8\n"
1078       "uaddw v8.8h, v8.8h, v0.8b\n"
1079       "uaddw v9.8h, v9.8h, v1.8b\n"
1080       "uaddw v10.8h, v10.8h, v2.8b\n"
1081       "st1 {v0.2s, v1.2s, v2.2s}, [%x[out]], #24\n"
1082 
1083       "bne 1b\n"
1084 
1085       "2:"
1086 
1087       // Load Aggregate Store: 3x2.
1088       "movi v0.8b, #0\n"
1089       "movi v1.8b, #0\n"
1090       "movi v2.8b, #0\n"
1091       "ld1 {v0.h}[0], [%x[in]], #2\n"
1092       "ld1 {v1.h}[0], [x0], #2\n"
1093       "ld1 {v2.h}[0], [x1], #2\n"
1094       "uaddw v8.8h, v8.8h, v0.8b\n"
1095       "uaddw v9.8h, v9.8h, v1.8b\n"
1096       "uaddw v10.8h, v10.8h, v2.8b\n"
1097       "st1 {v0.2s, v1.2s, v2.2s}, [%x[out]], #24\n"
1098 
1099       // Aggregator Reduction.
1100       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
1101       "dup v1.4s, %w[additive_sum_offset]\n"
1102       "uaddlp v8.4s, v8.8h\n"
1103       "uaddlp v9.4s, v9.8h\n"
1104       "uaddlp v10.4s, v10.8h\n"
1105       "addp v8.4s, v8.4s, v9.4s\n"
1106       "addp v10.4s, v10.4s, v10.4s\n"
1107       "addp v8.4s, v8.4s, v10.4s\n"
1108       "mul v8.4s, v8.4s, v0.s[0]\n"
1109       "add v8.4s, v8.4s, v1.4s\n"
1110       "st1 {v8.4s}, [%x[out]]\n"
1111       : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
1112       : [stride] "r"(params.stride),
1113         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
1114         [additive_sum_offset] "r"(params.additive_sum_offset)
1115       : "x0", "x1", "v0", "v1", "v2", "v8", "v9", "v10", "cc", "memory");
1116 }
1117 
1118 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)1119 inline void Stream<uint8_t, 3, 8, 3, RowMajorWithSum>::Pack(
1120     const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
1121 #ifdef DEBUG
1122 #ifdef DEBUG_METAGEMM_VERBOSE
1123   std::cout << __FILE__ << "(" << __LINE__
1124             << ") RowMajorWithSum<uint8_t, 3, 8, 3, RowMajorWithSum>::Pack()"
1125             << std::endl
1126             << std::flush;
1127 #endif
1128 #endif
1129   int params_count_copy = params.count;
1130   asm volatile(
1131       "add x0, %x[in], %x[stride]\n"
1132       "add x1, x0, %x[stride]\n"
1133       "movi v8.8h, #0\n"
1134       "movi v9.8h, #0\n"
1135       "movi v10.8h, #0\n"
1136 
1137       // Reduce count by leftovers.
1138       "subs %x[count], %x[count], #3\n"
1139       "beq 2f\n"
1140 
1141       "1:"
1142       "subs %x[count], %x[count], #8\n"
1143 
1144       // Load Aggregate Store: 3x8.
1145       "ld1 {v0.2s}, [%x[in]], #8\n"
1146       "ld1 {v1.2s}, [x0], #8\n"
1147       "ld1 {v2.2s}, [x1], #8\n"
1148       "uaddw v8.8h, v8.8h, v0.8b\n"
1149       "uaddw v9.8h, v9.8h, v1.8b\n"
1150       "uaddw v10.8h, v10.8h, v2.8b\n"
1151       "st1 {v0.2s, v1.2s, v2.2s}, [%x[out]], #24\n"
1152 
1153       "bne 1b\n"
1154 
1155       "2:"
1156 
1157       // Load Aggregate Store: 3x3.
1158       "movi v0.8b, #0\n"
1159       "movi v1.8b, #0\n"
1160       "movi v2.8b, #0\n"
1161       "ld1 {v0.h}[0], [%x[in]], #2\n"
1162       "ld1 {v0.b}[2], [%x[in]], #1\n"
1163       "ld1 {v1.h}[0], [x0], #2\n"
1164       "ld1 {v1.b}[2], [x0], #1\n"
1165       "ld1 {v2.h}[0], [x1], #2\n"
1166       "ld1 {v2.b}[2], [x1], #1\n"
1167       "uaddw v8.8h, v8.8h, v0.8b\n"
1168       "uaddw v9.8h, v9.8h, v1.8b\n"
1169       "uaddw v10.8h, v10.8h, v2.8b\n"
1170       "st1 {v0.2s, v1.2s, v2.2s}, [%x[out]], #24\n"
1171 
1172       // Aggregator Reduction.
1173       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
1174       "dup v1.4s, %w[additive_sum_offset]\n"
1175       "uaddlp v8.4s, v8.8h\n"
1176       "uaddlp v9.4s, v9.8h\n"
1177       "uaddlp v10.4s, v10.8h\n"
1178       "addp v8.4s, v8.4s, v9.4s\n"
1179       "addp v10.4s, v10.4s, v10.4s\n"
1180       "addp v8.4s, v8.4s, v10.4s\n"
1181       "mul v8.4s, v8.4s, v0.s[0]\n"
1182       "add v8.4s, v8.4s, v1.4s\n"
1183       "st1 {v8.4s}, [%x[out]]\n"
1184       : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
1185       : [stride] "r"(params.stride),
1186         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
1187         [additive_sum_offset] "r"(params.additive_sum_offset)
1188       : "x0", "x1", "v0", "v1", "v2", "v8", "v9", "v10", "cc", "memory");
1189 }
1190 
1191 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)1192 inline void Stream<uint8_t, 3, 8, 4, RowMajorWithSum>::Pack(
1193     const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
1194 #ifdef DEBUG
1195 #ifdef DEBUG_METAGEMM_VERBOSE
1196   std::cout << __FILE__ << "(" << __LINE__
1197             << ") RowMajorWithSum<uint8_t, 3, 8, 4, RowMajorWithSum>::Pack()"
1198             << std::endl
1199             << std::flush;
1200 #endif
1201 #endif
1202   int params_count_copy = params.count;
1203   asm volatile(
1204       "add x0, %x[in], %x[stride]\n"
1205       "add x1, x0, %x[stride]\n"
1206       "movi v8.8h, #0\n"
1207       "movi v9.8h, #0\n"
1208       "movi v10.8h, #0\n"
1209 
1210       // Reduce count by leftovers.
1211       "subs %x[count], %x[count], #4\n"
1212       "beq 2f\n"
1213 
1214       "1:"
1215       "subs %x[count], %x[count], #8\n"
1216 
1217       // Load Aggregate Store: 3x8.
1218       "ld1 {v0.2s}, [%x[in]], #8\n"
1219       "ld1 {v1.2s}, [x0], #8\n"
1220       "ld1 {v2.2s}, [x1], #8\n"
1221       "uaddw v8.8h, v8.8h, v0.8b\n"
1222       "uaddw v9.8h, v9.8h, v1.8b\n"
1223       "uaddw v10.8h, v10.8h, v2.8b\n"
1224       "st1 {v0.2s, v1.2s, v2.2s}, [%x[out]], #24\n"
1225 
1226       "bne 1b\n"
1227 
1228       "2:"
1229 
1230       // Load Aggregate Store: 3x4.
1231       "movi v0.8b, #0\n"
1232       "movi v1.8b, #0\n"
1233       "movi v2.8b, #0\n"
1234       "ld1 {v0.s}[0], [%x[in]], #4\n"
1235       "ld1 {v1.s}[0], [x0], #4\n"
1236       "ld1 {v2.s}[0], [x1], #4\n"
1237       "uaddw v8.8h, v8.8h, v0.8b\n"
1238       "uaddw v9.8h, v9.8h, v1.8b\n"
1239       "uaddw v10.8h, v10.8h, v2.8b\n"
1240       "st1 {v0.2s, v1.2s, v2.2s}, [%x[out]], #24\n"
1241 
1242       // Aggregator Reduction.
1243       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
1244       "dup v1.4s, %w[additive_sum_offset]\n"
1245       "uaddlp v8.4s, v8.8h\n"
1246       "uaddlp v9.4s, v9.8h\n"
1247       "uaddlp v10.4s, v10.8h\n"
1248       "addp v8.4s, v8.4s, v9.4s\n"
1249       "addp v10.4s, v10.4s, v10.4s\n"
1250       "addp v8.4s, v8.4s, v10.4s\n"
1251       "mul v8.4s, v8.4s, v0.s[0]\n"
1252       "add v8.4s, v8.4s, v1.4s\n"
1253       "st1 {v8.4s}, [%x[out]]\n"
1254       : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
1255       : [stride] "r"(params.stride),
1256         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
1257         [additive_sum_offset] "r"(params.additive_sum_offset)
1258       : "x0", "x1", "v0", "v1", "v2", "v8", "v9", "v10", "cc", "memory");
1259 }
1260 
1261 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)1262 inline void Stream<uint8_t, 3, 8, 5, RowMajorWithSum>::Pack(
1263     const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
1264 #ifdef DEBUG
1265 #ifdef DEBUG_METAGEMM_VERBOSE
1266   std::cout << __FILE__ << "(" << __LINE__
1267             << ") RowMajorWithSum<uint8_t, 3, 8, 5, RowMajorWithSum>::Pack()"
1268             << std::endl
1269             << std::flush;
1270 #endif
1271 #endif
1272   int params_count_copy = params.count;
1273   asm volatile(
1274       "add x0, %x[in], %x[stride]\n"
1275       "add x1, x0, %x[stride]\n"
1276       "movi v8.8h, #0\n"
1277       "movi v9.8h, #0\n"
1278       "movi v10.8h, #0\n"
1279 
1280       // Reduce count by leftovers.
1281       "subs %x[count], %x[count], #5\n"
1282       "beq 2f\n"
1283 
1284       "1:"
1285       "subs %x[count], %x[count], #8\n"
1286 
1287       // Load Aggregate Store: 3x8.
1288       "ld1 {v0.2s}, [%x[in]], #8\n"
1289       "ld1 {v1.2s}, [x0], #8\n"
1290       "ld1 {v2.2s}, [x1], #8\n"
1291       "uaddw v8.8h, v8.8h, v0.8b\n"
1292       "uaddw v9.8h, v9.8h, v1.8b\n"
1293       "uaddw v10.8h, v10.8h, v2.8b\n"
1294       "st1 {v0.2s, v1.2s, v2.2s}, [%x[out]], #24\n"
1295 
1296       "bne 1b\n"
1297 
1298       "2:"
1299 
1300       // Load Aggregate Store: 3x5.
1301       "movi v0.8b, #0\n"
1302       "movi v1.8b, #0\n"
1303       "movi v2.8b, #0\n"
1304       "ld1 {v0.s}[0], [%x[in]], #4\n"
1305       "ld1 {v0.b}[4], [%x[in]], #1\n"
1306       "ld1 {v1.s}[0], [x0], #4\n"
1307       "ld1 {v1.b}[4], [x0], #1\n"
1308       "ld1 {v2.s}[0], [x1], #4\n"
1309       "ld1 {v2.b}[4], [x1], #1\n"
1310       "uaddw v8.8h, v8.8h, v0.8b\n"
1311       "uaddw v9.8h, v9.8h, v1.8b\n"
1312       "uaddw v10.8h, v10.8h, v2.8b\n"
1313       "st1 {v0.2s, v1.2s, v2.2s}, [%x[out]], #24\n"
1314 
1315       // Aggregator Reduction.
1316       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
1317       "dup v1.4s, %w[additive_sum_offset]\n"
1318       "uaddlp v8.4s, v8.8h\n"
1319       "uaddlp v9.4s, v9.8h\n"
1320       "uaddlp v10.4s, v10.8h\n"
1321       "addp v8.4s, v8.4s, v9.4s\n"
1322       "addp v10.4s, v10.4s, v10.4s\n"
1323       "addp v8.4s, v8.4s, v10.4s\n"
1324       "mul v8.4s, v8.4s, v0.s[0]\n"
1325       "add v8.4s, v8.4s, v1.4s\n"
1326       "st1 {v8.4s}, [%x[out]]\n"
1327       : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
1328       : [stride] "r"(params.stride),
1329         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
1330         [additive_sum_offset] "r"(params.additive_sum_offset)
1331       : "x0", "x1", "v0", "v1", "v2", "v8", "v9", "v10", "cc", "memory");
1332 }
1333 
1334 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)1335 inline void Stream<uint8_t, 3, 8, 6, RowMajorWithSum>::Pack(
1336     const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
1337 #ifdef DEBUG
1338 #ifdef DEBUG_METAGEMM_VERBOSE
1339   std::cout << __FILE__ << "(" << __LINE__
1340             << ") RowMajorWithSum<uint8_t, 3, 8, 6, RowMajorWithSum>::Pack()"
1341             << std::endl
1342             << std::flush;
1343 #endif
1344 #endif
1345   int params_count_copy = params.count;
1346   asm volatile(
1347       "add x0, %x[in], %x[stride]\n"
1348       "add x1, x0, %x[stride]\n"
1349       "movi v8.8h, #0\n"
1350       "movi v9.8h, #0\n"
1351       "movi v10.8h, #0\n"
1352 
1353       // Reduce count by leftovers.
1354       "subs %x[count], %x[count], #6\n"
1355       "beq 2f\n"
1356 
1357       "1:"
1358       "subs %x[count], %x[count], #8\n"
1359 
1360       // Load Aggregate Store: 3x8.
1361       "ld1 {v0.2s}, [%x[in]], #8\n"
1362       "ld1 {v1.2s}, [x0], #8\n"
1363       "ld1 {v2.2s}, [x1], #8\n"
1364       "uaddw v8.8h, v8.8h, v0.8b\n"
1365       "uaddw v9.8h, v9.8h, v1.8b\n"
1366       "uaddw v10.8h, v10.8h, v2.8b\n"
1367       "st1 {v0.2s, v1.2s, v2.2s}, [%x[out]], #24\n"
1368 
1369       "bne 1b\n"
1370 
1371       "2:"
1372 
1373       // Load Aggregate Store: 3x6.
1374       "movi v0.8b, #0\n"
1375       "movi v1.8b, #0\n"
1376       "movi v2.8b, #0\n"
1377       "ld1 {v0.s}[0], [%x[in]], #4\n"
1378       "ld1 {v0.h}[2], [%x[in]], #2\n"
1379       "ld1 {v1.s}[0], [x0], #4\n"
1380       "ld1 {v1.h}[2], [x0], #2\n"
1381       "ld1 {v2.s}[0], [x1], #4\n"
1382       "ld1 {v2.h}[2], [x1], #2\n"
1383       "uaddw v8.8h, v8.8h, v0.8b\n"
1384       "uaddw v9.8h, v9.8h, v1.8b\n"
1385       "uaddw v10.8h, v10.8h, v2.8b\n"
1386       "st1 {v0.2s, v1.2s, v2.2s}, [%x[out]], #24\n"
1387 
1388       // Aggregator Reduction.
1389       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
1390       "dup v1.4s, %w[additive_sum_offset]\n"
1391       "uaddlp v8.4s, v8.8h\n"
1392       "uaddlp v9.4s, v9.8h\n"
1393       "uaddlp v10.4s, v10.8h\n"
1394       "addp v8.4s, v8.4s, v9.4s\n"
1395       "addp v10.4s, v10.4s, v10.4s\n"
1396       "addp v8.4s, v8.4s, v10.4s\n"
1397       "mul v8.4s, v8.4s, v0.s[0]\n"
1398       "add v8.4s, v8.4s, v1.4s\n"
1399       "st1 {v8.4s}, [%x[out]]\n"
1400       : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
1401       : [stride] "r"(params.stride),
1402         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
1403         [additive_sum_offset] "r"(params.additive_sum_offset)
1404       : "x0", "x1", "v0", "v1", "v2", "v8", "v9", "v10", "cc", "memory");
1405 }
1406 
1407 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)1408 inline void Stream<uint8_t, 3, 8, 7, RowMajorWithSum>::Pack(
1409     const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
1410 #ifdef DEBUG
1411 #ifdef DEBUG_METAGEMM_VERBOSE
1412   std::cout << __FILE__ << "(" << __LINE__
1413             << ") RowMajorWithSum<uint8_t, 3, 8, 7, RowMajorWithSum>::Pack()"
1414             << std::endl
1415             << std::flush;
1416 #endif
1417 #endif
1418   int params_count_copy = params.count;
1419   asm volatile(
1420       "add x0, %x[in], %x[stride]\n"
1421       "add x1, x0, %x[stride]\n"
1422       "movi v8.8h, #0\n"
1423       "movi v9.8h, #0\n"
1424       "movi v10.8h, #0\n"
1425 
1426       // Reduce count by leftovers.
1427       "subs %x[count], %x[count], #7\n"
1428       "beq 2f\n"
1429 
1430       "1:"
1431       "subs %x[count], %x[count], #8\n"
1432 
1433       // Load Aggregate Store: 3x8.
1434       "ld1 {v0.2s}, [%x[in]], #8\n"
1435       "ld1 {v1.2s}, [x0], #8\n"
1436       "ld1 {v2.2s}, [x1], #8\n"
1437       "uaddw v8.8h, v8.8h, v0.8b\n"
1438       "uaddw v9.8h, v9.8h, v1.8b\n"
1439       "uaddw v10.8h, v10.8h, v2.8b\n"
1440       "st1 {v0.2s, v1.2s, v2.2s}, [%x[out]], #24\n"
1441 
1442       "bne 1b\n"
1443 
1444       "2:"
1445 
1446       // Load Aggregate Store: 3x7.
1447       "movi v0.8b, #0\n"
1448       "movi v1.8b, #0\n"
1449       "movi v2.8b, #0\n"
1450       "ld1 {v0.s}[0], [%x[in]], #4\n"
1451       "ld1 {v0.h}[2], [%x[in]], #2\n"
1452       "ld1 {v0.b}[6], [%x[in]], #1\n"
1453       "ld1 {v1.s}[0], [x0], #4\n"
1454       "ld1 {v1.h}[2], [x0], #2\n"
1455       "ld1 {v1.b}[6], [x0], #1\n"
1456       "ld1 {v2.s}[0], [x1], #4\n"
1457       "ld1 {v2.h}[2], [x1], #2\n"
1458       "ld1 {v2.b}[6], [x1], #1\n"
1459       "uaddw v8.8h, v8.8h, v0.8b\n"
1460       "uaddw v9.8h, v9.8h, v1.8b\n"
1461       "uaddw v10.8h, v10.8h, v2.8b\n"
1462       "st1 {v0.2s, v1.2s, v2.2s}, [%x[out]], #24\n"
1463 
1464       // Aggregator Reduction.
1465       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
1466       "dup v1.4s, %w[additive_sum_offset]\n"
1467       "uaddlp v8.4s, v8.8h\n"
1468       "uaddlp v9.4s, v9.8h\n"
1469       "uaddlp v10.4s, v10.8h\n"
1470       "addp v8.4s, v8.4s, v9.4s\n"
1471       "addp v10.4s, v10.4s, v10.4s\n"
1472       "addp v8.4s, v8.4s, v10.4s\n"
1473       "mul v8.4s, v8.4s, v0.s[0]\n"
1474       "add v8.4s, v8.4s, v1.4s\n"
1475       "st1 {v8.4s}, [%x[out]]\n"
1476       : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
1477       : [stride] "r"(params.stride),
1478         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
1479         [additive_sum_offset] "r"(params.additive_sum_offset)
1480       : "x0", "x1", "v0", "v1", "v2", "v8", "v9", "v10", "cc", "memory");
1481 }
1482 
1483 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)1484 inline void Stream<uint8_t, 4, 8, 0, RowMajorWithSum>::Pack(
1485     const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
1486 #ifdef DEBUG
1487 #ifdef DEBUG_METAGEMM_VERBOSE
1488   std::cout << __FILE__ << "(" << __LINE__
1489             << ") RowMajorWithSum<uint8_t, 4, 8, 0, RowMajorWithSum>::Pack()"
1490             << std::endl
1491             << std::flush;
1492 #endif
1493 #endif
1494   int params_count_copy = params.count;
1495   asm volatile(
1496       "add x0, %x[in], %x[stride]\n"
1497       "add x1, x0, %x[stride]\n"
1498       "add x2, x1, %x[stride]\n"
1499       "movi v8.8h, #0\n"
1500       "movi v9.8h, #0\n"
1501       "movi v10.8h, #0\n"
1502       "movi v11.8h, #0\n"
1503 
1504       "1:"
1505       "subs %x[count], %x[count], #8\n"
1506 
1507       // Load Aggregate Store: 4x8.
1508       "ld1 {v0.2s}, [%x[in]], #8\n"
1509       "ld1 {v1.2s}, [x0], #8\n"
1510       "ld1 {v2.2s}, [x1], #8\n"
1511       "ld1 {v3.2s}, [x2], #8\n"
1512       "uaddw v8.8h, v8.8h, v0.8b\n"
1513       "uaddw v9.8h, v9.8h, v1.8b\n"
1514       "uaddw v10.8h, v10.8h, v2.8b\n"
1515       "uaddw v11.8h, v11.8h, v3.8b\n"
1516       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
1517 
1518       "bne 1b\n"
1519 
1520       // Aggregator Reduction.
1521       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
1522       "dup v1.4s, %w[additive_sum_offset]\n"
1523       "uaddlp v8.4s, v8.8h\n"
1524       "uaddlp v9.4s, v9.8h\n"
1525       "uaddlp v10.4s, v10.8h\n"
1526       "uaddlp v11.4s, v11.8h\n"
1527       "addp v8.4s, v8.4s, v9.4s\n"
1528       "addp v10.4s, v10.4s, v11.4s\n"
1529       "addp v8.4s, v8.4s, v10.4s\n"
1530       "mul v8.4s, v8.4s, v0.s[0]\n"
1531       "add v8.4s, v8.4s, v1.4s\n"
1532       "st1 {v8.4s}, [%x[out]]\n"
1533       : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
1534       : [stride] "r"(params.stride),
1535         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
1536         [additive_sum_offset] "r"(params.additive_sum_offset)
1537       : "x0", "x1", "x2", "v0", "v1", "v2", "v3", "v8", "v9", "v10", "v11",
1538         "cc", "memory");
1539 }
1540 
1541 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)1542 inline void Stream<uint8_t, 4, 8, 1, RowMajorWithSum>::Pack(
1543     const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
1544 #ifdef DEBUG
1545 #ifdef DEBUG_METAGEMM_VERBOSE
1546   std::cout << __FILE__ << "(" << __LINE__
1547             << ") RowMajorWithSum<uint8_t, 4, 8, 1, RowMajorWithSum>::Pack()"
1548             << std::endl
1549             << std::flush;
1550 #endif
1551 #endif
1552   int params_count_copy = params.count;
1553   asm volatile(
1554       "add x0, %x[in], %x[stride]\n"
1555       "add x1, x0, %x[stride]\n"
1556       "add x2, x1, %x[stride]\n"
1557       "movi v8.8h, #0\n"
1558       "movi v9.8h, #0\n"
1559       "movi v10.8h, #0\n"
1560       "movi v11.8h, #0\n"
1561 
1562       // Reduce count by leftovers.
1563       "subs %x[count], %x[count], #1\n"
1564       "beq 2f\n"
1565 
1566       "1:"
1567       "subs %x[count], %x[count], #8\n"
1568 
1569       // Load Aggregate Store: 4x8.
1570       "ld1 {v0.2s}, [%x[in]], #8\n"
1571       "ld1 {v1.2s}, [x0], #8\n"
1572       "ld1 {v2.2s}, [x1], #8\n"
1573       "ld1 {v3.2s}, [x2], #8\n"
1574       "uaddw v8.8h, v8.8h, v0.8b\n"
1575       "uaddw v9.8h, v9.8h, v1.8b\n"
1576       "uaddw v10.8h, v10.8h, v2.8b\n"
1577       "uaddw v11.8h, v11.8h, v3.8b\n"
1578       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
1579 
1580       "bne 1b\n"
1581 
1582       "2:"
1583 
1584       // Load Aggregate Store: 4x1.
1585       "movi v0.8b, #0\n"
1586       "movi v1.8b, #0\n"
1587       "movi v2.8b, #0\n"
1588       "movi v3.8b, #0\n"
1589       "ld1 {v0.b}[0], [%x[in]], #1\n"
1590       "ld1 {v1.b}[0], [x0], #1\n"
1591       "ld1 {v2.b}[0], [x1], #1\n"
1592       "ld1 {v3.b}[0], [x2], #1\n"
1593       "uaddw v8.8h, v8.8h, v0.8b\n"
1594       "uaddw v9.8h, v9.8h, v1.8b\n"
1595       "uaddw v10.8h, v10.8h, v2.8b\n"
1596       "uaddw v11.8h, v11.8h, v3.8b\n"
1597       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
1598 
1599       // Aggregator Reduction.
1600       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
1601       "dup v1.4s, %w[additive_sum_offset]\n"
1602       "uaddlp v8.4s, v8.8h\n"
1603       "uaddlp v9.4s, v9.8h\n"
1604       "uaddlp v10.4s, v10.8h\n"
1605       "uaddlp v11.4s, v11.8h\n"
1606       "addp v8.4s, v8.4s, v9.4s\n"
1607       "addp v10.4s, v10.4s, v11.4s\n"
1608       "addp v8.4s, v8.4s, v10.4s\n"
1609       "mul v8.4s, v8.4s, v0.s[0]\n"
1610       "add v8.4s, v8.4s, v1.4s\n"
1611       "st1 {v8.4s}, [%x[out]]\n"
1612       : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
1613       : [stride] "r"(params.stride),
1614         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
1615         [additive_sum_offset] "r"(params.additive_sum_offset)
1616       : "x0", "x1", "x2", "v0", "v1", "v2", "v3", "v8", "v9", "v10", "v11",
1617         "cc", "memory");
1618 }
1619 
1620 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)1621 inline void Stream<uint8_t, 4, 8, 2, RowMajorWithSum>::Pack(
1622     const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
1623 #ifdef DEBUG
1624 #ifdef DEBUG_METAGEMM_VERBOSE
1625   std::cout << __FILE__ << "(" << __LINE__
1626             << ") RowMajorWithSum<uint8_t, 4, 8, 2, RowMajorWithSum>::Pack()"
1627             << std::endl
1628             << std::flush;
1629 #endif
1630 #endif
1631   int params_count_copy = params.count;
1632   asm volatile(
1633       "add x0, %x[in], %x[stride]\n"
1634       "add x1, x0, %x[stride]\n"
1635       "add x2, x1, %x[stride]\n"
1636       "movi v8.8h, #0\n"
1637       "movi v9.8h, #0\n"
1638       "movi v10.8h, #0\n"
1639       "movi v11.8h, #0\n"
1640 
1641       // Reduce count by leftovers.
1642       "subs %x[count], %x[count], #2\n"
1643       "beq 2f\n"
1644 
1645       "1:"
1646       "subs %x[count], %x[count], #8\n"
1647 
1648       // Load Aggregate Store: 4x8.
1649       "ld1 {v0.2s}, [%x[in]], #8\n"
1650       "ld1 {v1.2s}, [x0], #8\n"
1651       "ld1 {v2.2s}, [x1], #8\n"
1652       "ld1 {v3.2s}, [x2], #8\n"
1653       "uaddw v8.8h, v8.8h, v0.8b\n"
1654       "uaddw v9.8h, v9.8h, v1.8b\n"
1655       "uaddw v10.8h, v10.8h, v2.8b\n"
1656       "uaddw v11.8h, v11.8h, v3.8b\n"
1657       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
1658 
1659       "bne 1b\n"
1660 
1661       "2:"
1662 
1663       // Load Aggregate Store: 4x2.
1664       "movi v0.8b, #0\n"
1665       "movi v1.8b, #0\n"
1666       "movi v2.8b, #0\n"
1667       "movi v3.8b, #0\n"
1668       "ld1 {v0.h}[0], [%x[in]], #2\n"
1669       "ld1 {v1.h}[0], [x0], #2\n"
1670       "ld1 {v2.h}[0], [x1], #2\n"
1671       "ld1 {v3.h}[0], [x2], #2\n"
1672       "uaddw v8.8h, v8.8h, v0.8b\n"
1673       "uaddw v9.8h, v9.8h, v1.8b\n"
1674       "uaddw v10.8h, v10.8h, v2.8b\n"
1675       "uaddw v11.8h, v11.8h, v3.8b\n"
1676       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
1677 
1678       // Aggregator Reduction.
1679       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
1680       "dup v1.4s, %w[additive_sum_offset]\n"
1681       "uaddlp v8.4s, v8.8h\n"
1682       "uaddlp v9.4s, v9.8h\n"
1683       "uaddlp v10.4s, v10.8h\n"
1684       "uaddlp v11.4s, v11.8h\n"
1685       "addp v8.4s, v8.4s, v9.4s\n"
1686       "addp v10.4s, v10.4s, v11.4s\n"
1687       "addp v8.4s, v8.4s, v10.4s\n"
1688       "mul v8.4s, v8.4s, v0.s[0]\n"
1689       "add v8.4s, v8.4s, v1.4s\n"
1690       "st1 {v8.4s}, [%x[out]]\n"
1691       : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
1692       : [stride] "r"(params.stride),
1693         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
1694         [additive_sum_offset] "r"(params.additive_sum_offset)
1695       : "x0", "x1", "x2", "v0", "v1", "v2", "v3", "v8", "v9", "v10", "v11",
1696         "cc", "memory");
1697 }
1698 
1699 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)1700 inline void Stream<uint8_t, 4, 8, 3, RowMajorWithSum>::Pack(
1701     const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
1702 #ifdef DEBUG
1703 #ifdef DEBUG_METAGEMM_VERBOSE
1704   std::cout << __FILE__ << "(" << __LINE__
1705             << ") RowMajorWithSum<uint8_t, 4, 8, 3, RowMajorWithSum>::Pack()"
1706             << std::endl
1707             << std::flush;
1708 #endif
1709 #endif
1710   int params_count_copy = params.count;
1711   asm volatile(
1712       "add x0, %x[in], %x[stride]\n"
1713       "add x1, x0, %x[stride]\n"
1714       "add x2, x1, %x[stride]\n"
1715       "movi v8.8h, #0\n"
1716       "movi v9.8h, #0\n"
1717       "movi v10.8h, #0\n"
1718       "movi v11.8h, #0\n"
1719 
1720       // Reduce count by leftovers.
1721       "subs %x[count], %x[count], #3\n"
1722       "beq 2f\n"
1723 
1724       "1:"
1725       "subs %x[count], %x[count], #8\n"
1726 
1727       // Load Aggregate Store: 4x8.
1728       "ld1 {v0.2s}, [%x[in]], #8\n"
1729       "ld1 {v1.2s}, [x0], #8\n"
1730       "ld1 {v2.2s}, [x1], #8\n"
1731       "ld1 {v3.2s}, [x2], #8\n"
1732       "uaddw v8.8h, v8.8h, v0.8b\n"
1733       "uaddw v9.8h, v9.8h, v1.8b\n"
1734       "uaddw v10.8h, v10.8h, v2.8b\n"
1735       "uaddw v11.8h, v11.8h, v3.8b\n"
1736       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
1737 
1738       "bne 1b\n"
1739 
1740       "2:"
1741 
1742       // Load Aggregate Store: 4x3.
1743       "movi v0.8b, #0\n"
1744       "movi v1.8b, #0\n"
1745       "movi v2.8b, #0\n"
1746       "movi v3.8b, #0\n"
1747       "ld1 {v0.h}[0], [%x[in]], #2\n"
1748       "ld1 {v0.b}[2], [%x[in]], #1\n"
1749       "ld1 {v1.h}[0], [x0], #2\n"
1750       "ld1 {v1.b}[2], [x0], #1\n"
1751       "ld1 {v2.h}[0], [x1], #2\n"
1752       "ld1 {v2.b}[2], [x1], #1\n"
1753       "ld1 {v3.h}[0], [x2], #2\n"
1754       "ld1 {v3.b}[2], [x2], #1\n"
1755       "uaddw v8.8h, v8.8h, v0.8b\n"
1756       "uaddw v9.8h, v9.8h, v1.8b\n"
1757       "uaddw v10.8h, v10.8h, v2.8b\n"
1758       "uaddw v11.8h, v11.8h, v3.8b\n"
1759       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
1760 
1761       // Aggregator Reduction.
1762       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
1763       "dup v1.4s, %w[additive_sum_offset]\n"
1764       "uaddlp v8.4s, v8.8h\n"
1765       "uaddlp v9.4s, v9.8h\n"
1766       "uaddlp v10.4s, v10.8h\n"
1767       "uaddlp v11.4s, v11.8h\n"
1768       "addp v8.4s, v8.4s, v9.4s\n"
1769       "addp v10.4s, v10.4s, v11.4s\n"
1770       "addp v8.4s, v8.4s, v10.4s\n"
1771       "mul v8.4s, v8.4s, v0.s[0]\n"
1772       "add v8.4s, v8.4s, v1.4s\n"
1773       "st1 {v8.4s}, [%x[out]]\n"
1774       : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
1775       : [stride] "r"(params.stride),
1776         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
1777         [additive_sum_offset] "r"(params.additive_sum_offset)
1778       : "x0", "x1", "x2", "v0", "v1", "v2", "v3", "v8", "v9", "v10", "v11",
1779         "cc", "memory");
1780 }
1781 
1782 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)1783 inline void Stream<uint8_t, 4, 8, 4, RowMajorWithSum>::Pack(
1784     const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
1785 #ifdef DEBUG
1786 #ifdef DEBUG_METAGEMM_VERBOSE
1787   std::cout << __FILE__ << "(" << __LINE__
1788             << ") RowMajorWithSum<uint8_t, 4, 8, 4, RowMajorWithSum>::Pack()"
1789             << std::endl
1790             << std::flush;
1791 #endif
1792 #endif
1793   int params_count_copy = params.count;
1794   asm volatile(
1795       "add x0, %x[in], %x[stride]\n"
1796       "add x1, x0, %x[stride]\n"
1797       "add x2, x1, %x[stride]\n"
1798       "movi v8.8h, #0\n"
1799       "movi v9.8h, #0\n"
1800       "movi v10.8h, #0\n"
1801       "movi v11.8h, #0\n"
1802 
1803       // Reduce count by leftovers.
1804       "subs %x[count], %x[count], #4\n"
1805       "beq 2f\n"
1806 
1807       "1:"
1808       "subs %x[count], %x[count], #8\n"
1809 
1810       // Load Aggregate Store: 4x8.
1811       "ld1 {v0.2s}, [%x[in]], #8\n"
1812       "ld1 {v1.2s}, [x0], #8\n"
1813       "ld1 {v2.2s}, [x1], #8\n"
1814       "ld1 {v3.2s}, [x2], #8\n"
1815       "uaddw v8.8h, v8.8h, v0.8b\n"
1816       "uaddw v9.8h, v9.8h, v1.8b\n"
1817       "uaddw v10.8h, v10.8h, v2.8b\n"
1818       "uaddw v11.8h, v11.8h, v3.8b\n"
1819       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
1820 
1821       "bne 1b\n"
1822 
1823       "2:"
1824 
1825       // Load Aggregate Store: 4x4.
1826       "movi v0.8b, #0\n"
1827       "movi v1.8b, #0\n"
1828       "movi v2.8b, #0\n"
1829       "movi v3.8b, #0\n"
1830       "ld1 {v0.s}[0], [%x[in]], #4\n"
1831       "ld1 {v1.s}[0], [x0], #4\n"
1832       "ld1 {v2.s}[0], [x1], #4\n"
1833       "ld1 {v3.s}[0], [x2], #4\n"
1834       "uaddw v8.8h, v8.8h, v0.8b\n"
1835       "uaddw v9.8h, v9.8h, v1.8b\n"
1836       "uaddw v10.8h, v10.8h, v2.8b\n"
1837       "uaddw v11.8h, v11.8h, v3.8b\n"
1838       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
1839 
1840       // Aggregator Reduction.
1841       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
1842       "dup v1.4s, %w[additive_sum_offset]\n"
1843       "uaddlp v8.4s, v8.8h\n"
1844       "uaddlp v9.4s, v9.8h\n"
1845       "uaddlp v10.4s, v10.8h\n"
1846       "uaddlp v11.4s, v11.8h\n"
1847       "addp v8.4s, v8.4s, v9.4s\n"
1848       "addp v10.4s, v10.4s, v11.4s\n"
1849       "addp v8.4s, v8.4s, v10.4s\n"
1850       "mul v8.4s, v8.4s, v0.s[0]\n"
1851       "add v8.4s, v8.4s, v1.4s\n"
1852       "st1 {v8.4s}, [%x[out]]\n"
1853       : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
1854       : [stride] "r"(params.stride),
1855         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
1856         [additive_sum_offset] "r"(params.additive_sum_offset)
1857       : "x0", "x1", "x2", "v0", "v1", "v2", "v3", "v8", "v9", "v10", "v11",
1858         "cc", "memory");
1859 }
1860 
1861 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)1862 inline void Stream<uint8_t, 4, 8, 5, RowMajorWithSum>::Pack(
1863     const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
1864 #ifdef DEBUG
1865 #ifdef DEBUG_METAGEMM_VERBOSE
1866   std::cout << __FILE__ << "(" << __LINE__
1867             << ") RowMajorWithSum<uint8_t, 4, 8, 5, RowMajorWithSum>::Pack()"
1868             << std::endl
1869             << std::flush;
1870 #endif
1871 #endif
1872   int params_count_copy = params.count;
1873   asm volatile(
1874       "add x0, %x[in], %x[stride]\n"
1875       "add x1, x0, %x[stride]\n"
1876       "add x2, x1, %x[stride]\n"
1877       "movi v8.8h, #0\n"
1878       "movi v9.8h, #0\n"
1879       "movi v10.8h, #0\n"
1880       "movi v11.8h, #0\n"
1881 
1882       // Reduce count by leftovers.
1883       "subs %x[count], %x[count], #5\n"
1884       "beq 2f\n"
1885 
1886       "1:"
1887       "subs %x[count], %x[count], #8\n"
1888 
1889       // Load Aggregate Store: 4x8.
1890       "ld1 {v0.2s}, [%x[in]], #8\n"
1891       "ld1 {v1.2s}, [x0], #8\n"
1892       "ld1 {v2.2s}, [x1], #8\n"
1893       "ld1 {v3.2s}, [x2], #8\n"
1894       "uaddw v8.8h, v8.8h, v0.8b\n"
1895       "uaddw v9.8h, v9.8h, v1.8b\n"
1896       "uaddw v10.8h, v10.8h, v2.8b\n"
1897       "uaddw v11.8h, v11.8h, v3.8b\n"
1898       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
1899 
1900       "bne 1b\n"
1901 
1902       "2:"
1903 
1904       // Load Aggregate Store: 4x5.
1905       "movi v0.8b, #0\n"
1906       "movi v1.8b, #0\n"
1907       "movi v2.8b, #0\n"
1908       "movi v3.8b, #0\n"
1909       "ld1 {v0.s}[0], [%x[in]], #4\n"
1910       "ld1 {v0.b}[4], [%x[in]], #1\n"
1911       "ld1 {v1.s}[0], [x0], #4\n"
1912       "ld1 {v1.b}[4], [x0], #1\n"
1913       "ld1 {v2.s}[0], [x1], #4\n"
1914       "ld1 {v2.b}[4], [x1], #1\n"
1915       "ld1 {v3.s}[0], [x2], #4\n"
1916       "ld1 {v3.b}[4], [x2], #1\n"
1917       "uaddw v8.8h, v8.8h, v0.8b\n"
1918       "uaddw v9.8h, v9.8h, v1.8b\n"
1919       "uaddw v10.8h, v10.8h, v2.8b\n"
1920       "uaddw v11.8h, v11.8h, v3.8b\n"
1921       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
1922 
1923       // Aggregator Reduction.
1924       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
1925       "dup v1.4s, %w[additive_sum_offset]\n"
1926       "uaddlp v8.4s, v8.8h\n"
1927       "uaddlp v9.4s, v9.8h\n"
1928       "uaddlp v10.4s, v10.8h\n"
1929       "uaddlp v11.4s, v11.8h\n"
1930       "addp v8.4s, v8.4s, v9.4s\n"
1931       "addp v10.4s, v10.4s, v11.4s\n"
1932       "addp v8.4s, v8.4s, v10.4s\n"
1933       "mul v8.4s, v8.4s, v0.s[0]\n"
1934       "add v8.4s, v8.4s, v1.4s\n"
1935       "st1 {v8.4s}, [%x[out]]\n"
1936       : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
1937       : [stride] "r"(params.stride),
1938         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
1939         [additive_sum_offset] "r"(params.additive_sum_offset)
1940       : "x0", "x1", "x2", "v0", "v1", "v2", "v3", "v8", "v9", "v10", "v11",
1941         "cc", "memory");
1942 }
1943 
1944 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)1945 inline void Stream<uint8_t, 4, 8, 6, RowMajorWithSum>::Pack(
1946     const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
1947 #ifdef DEBUG
1948 #ifdef DEBUG_METAGEMM_VERBOSE
1949   std::cout << __FILE__ << "(" << __LINE__
1950             << ") RowMajorWithSum<uint8_t, 4, 8, 6, RowMajorWithSum>::Pack()"
1951             << std::endl
1952             << std::flush;
1953 #endif
1954 #endif
1955   int params_count_copy = params.count;
1956   asm volatile(
1957       "add x0, %x[in], %x[stride]\n"
1958       "add x1, x0, %x[stride]\n"
1959       "add x2, x1, %x[stride]\n"
1960       "movi v8.8h, #0\n"
1961       "movi v9.8h, #0\n"
1962       "movi v10.8h, #0\n"
1963       "movi v11.8h, #0\n"
1964 
1965       // Reduce count by leftovers.
1966       "subs %x[count], %x[count], #6\n"
1967       "beq 2f\n"
1968 
1969       "1:"
1970       "subs %x[count], %x[count], #8\n"
1971 
1972       // Load Aggregate Store: 4x8.
1973       "ld1 {v0.2s}, [%x[in]], #8\n"
1974       "ld1 {v1.2s}, [x0], #8\n"
1975       "ld1 {v2.2s}, [x1], #8\n"
1976       "ld1 {v3.2s}, [x2], #8\n"
1977       "uaddw v8.8h, v8.8h, v0.8b\n"
1978       "uaddw v9.8h, v9.8h, v1.8b\n"
1979       "uaddw v10.8h, v10.8h, v2.8b\n"
1980       "uaddw v11.8h, v11.8h, v3.8b\n"
1981       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
1982 
1983       "bne 1b\n"
1984 
1985       "2:"
1986 
1987       // Load Aggregate Store: 4x6.
1988       "movi v0.8b, #0\n"
1989       "movi v1.8b, #0\n"
1990       "movi v2.8b, #0\n"
1991       "movi v3.8b, #0\n"
1992       "ld1 {v0.s}[0], [%x[in]], #4\n"
1993       "ld1 {v0.h}[2], [%x[in]], #2\n"
1994       "ld1 {v1.s}[0], [x0], #4\n"
1995       "ld1 {v1.h}[2], [x0], #2\n"
1996       "ld1 {v2.s}[0], [x1], #4\n"
1997       "ld1 {v2.h}[2], [x1], #2\n"
1998       "ld1 {v3.s}[0], [x2], #4\n"
1999       "ld1 {v3.h}[2], [x2], #2\n"
2000       "uaddw v8.8h, v8.8h, v0.8b\n"
2001       "uaddw v9.8h, v9.8h, v1.8b\n"
2002       "uaddw v10.8h, v10.8h, v2.8b\n"
2003       "uaddw v11.8h, v11.8h, v3.8b\n"
2004       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
2005 
2006       // Aggregator Reduction.
2007       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
2008       "dup v1.4s, %w[additive_sum_offset]\n"
2009       "uaddlp v8.4s, v8.8h\n"
2010       "uaddlp v9.4s, v9.8h\n"
2011       "uaddlp v10.4s, v10.8h\n"
2012       "uaddlp v11.4s, v11.8h\n"
2013       "addp v8.4s, v8.4s, v9.4s\n"
2014       "addp v10.4s, v10.4s, v11.4s\n"
2015       "addp v8.4s, v8.4s, v10.4s\n"
2016       "mul v8.4s, v8.4s, v0.s[0]\n"
2017       "add v8.4s, v8.4s, v1.4s\n"
2018       "st1 {v8.4s}, [%x[out]]\n"
2019       : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
2020       : [stride] "r"(params.stride),
2021         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
2022         [additive_sum_offset] "r"(params.additive_sum_offset)
2023       : "x0", "x1", "x2", "v0", "v1", "v2", "v3", "v8", "v9", "v10", "v11",
2024         "cc", "memory");
2025 }
2026 
2027 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)2028 inline void Stream<uint8_t, 4, 8, 7, RowMajorWithSum>::Pack(
2029     const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
2030 #ifdef DEBUG
2031 #ifdef DEBUG_METAGEMM_VERBOSE
2032   std::cout << __FILE__ << "(" << __LINE__
2033             << ") RowMajorWithSum<uint8_t, 4, 8, 7, RowMajorWithSum>::Pack()"
2034             << std::endl
2035             << std::flush;
2036 #endif
2037 #endif
2038   int params_count_copy = params.count;
2039   asm volatile(
2040       "add x0, %x[in], %x[stride]\n"
2041       "add x1, x0, %x[stride]\n"
2042       "add x2, x1, %x[stride]\n"
2043       "movi v8.8h, #0\n"
2044       "movi v9.8h, #0\n"
2045       "movi v10.8h, #0\n"
2046       "movi v11.8h, #0\n"
2047 
2048       // Reduce count by leftovers.
2049       "subs %x[count], %x[count], #7\n"
2050       "beq 2f\n"
2051 
2052       "1:"
2053       "subs %x[count], %x[count], #8\n"
2054 
2055       // Load Aggregate Store: 4x8.
2056       "ld1 {v0.2s}, [%x[in]], #8\n"
2057       "ld1 {v1.2s}, [x0], #8\n"
2058       "ld1 {v2.2s}, [x1], #8\n"
2059       "ld1 {v3.2s}, [x2], #8\n"
2060       "uaddw v8.8h, v8.8h, v0.8b\n"
2061       "uaddw v9.8h, v9.8h, v1.8b\n"
2062       "uaddw v10.8h, v10.8h, v2.8b\n"
2063       "uaddw v11.8h, v11.8h, v3.8b\n"
2064       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
2065 
2066       "bne 1b\n"
2067 
2068       "2:"
2069 
2070       // Load Aggregate Store: 4x7.
2071       "movi v0.8b, #0\n"
2072       "movi v1.8b, #0\n"
2073       "movi v2.8b, #0\n"
2074       "movi v3.8b, #0\n"
2075       "ld1 {v0.s}[0], [%x[in]], #4\n"
2076       "ld1 {v0.h}[2], [%x[in]], #2\n"
2077       "ld1 {v0.b}[6], [%x[in]], #1\n"
2078       "ld1 {v1.s}[0], [x0], #4\n"
2079       "ld1 {v1.h}[2], [x0], #2\n"
2080       "ld1 {v1.b}[6], [x0], #1\n"
2081       "ld1 {v2.s}[0], [x1], #4\n"
2082       "ld1 {v2.h}[2], [x1], #2\n"
2083       "ld1 {v2.b}[6], [x1], #1\n"
2084       "ld1 {v3.s}[0], [x2], #4\n"
2085       "ld1 {v3.h}[2], [x2], #2\n"
2086       "ld1 {v3.b}[6], [x2], #1\n"
2087       "uaddw v8.8h, v8.8h, v0.8b\n"
2088       "uaddw v9.8h, v9.8h, v1.8b\n"
2089       "uaddw v10.8h, v10.8h, v2.8b\n"
2090       "uaddw v11.8h, v11.8h, v3.8b\n"
2091       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
2092 
2093       // Aggregator Reduction.
2094       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
2095       "dup v1.4s, %w[additive_sum_offset]\n"
2096       "uaddlp v8.4s, v8.8h\n"
2097       "uaddlp v9.4s, v9.8h\n"
2098       "uaddlp v10.4s, v10.8h\n"
2099       "uaddlp v11.4s, v11.8h\n"
2100       "addp v8.4s, v8.4s, v9.4s\n"
2101       "addp v10.4s, v10.4s, v11.4s\n"
2102       "addp v8.4s, v8.4s, v10.4s\n"
2103       "mul v8.4s, v8.4s, v0.s[0]\n"
2104       "add v8.4s, v8.4s, v1.4s\n"
2105       "st1 {v8.4s}, [%x[out]]\n"
2106       : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
2107       : [stride] "r"(params.stride),
2108         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
2109         [additive_sum_offset] "r"(params.additive_sum_offset)
2110       : "x0", "x1", "x2", "v0", "v1", "v2", "v3", "v8", "v9", "v10", "v11",
2111         "cc", "memory");
2112 }
2113 
2114 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)2115 inline void Stream<uint8_t, 5, 8, 0, RowMajorWithSum>::Pack(
2116     const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
2117 #ifdef DEBUG
2118 #ifdef DEBUG_METAGEMM_VERBOSE
2119   std::cout << __FILE__ << "(" << __LINE__
2120             << ") RowMajorWithSum<uint8_t, 5, 8, 0, RowMajorWithSum>::Pack()"
2121             << std::endl
2122             << std::flush;
2123 #endif
2124 #endif
2125   int params_count_copy = params.count;
2126   asm volatile(
2127       "add x0, %x[in], %x[stride]\n"
2128       "add x1, x0, %x[stride]\n"
2129       "add x2, x1, %x[stride]\n"
2130       "add x3, x2, %x[stride]\n"
2131       "movi v8.8h, #0\n"
2132       "movi v9.8h, #0\n"
2133       "movi v10.8h, #0\n"
2134       "movi v11.8h, #0\n"
2135       "movi v12.8h, #0\n"
2136 
2137       "1:"
2138       "subs %x[count], %x[count], #8\n"
2139 
2140       // Load Aggregate Store: 5x8.
2141       "ld1 {v0.2s}, [%x[in]], #8\n"
2142       "ld1 {v1.2s}, [x0], #8\n"
2143       "ld1 {v2.2s}, [x1], #8\n"
2144       "ld1 {v3.2s}, [x2], #8\n"
2145       "ld1 {v4.2s}, [x3], #8\n"
2146       "uaddw v8.8h, v8.8h, v0.8b\n"
2147       "uaddw v9.8h, v9.8h, v1.8b\n"
2148       "uaddw v10.8h, v10.8h, v2.8b\n"
2149       "uaddw v11.8h, v11.8h, v3.8b\n"
2150       "uaddw v12.8h, v12.8h, v4.8b\n"
2151       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
2152       "st1 {v4.2s}, [%x[out]], #8\n"
2153 
2154       "bne 1b\n"
2155 
2156       // Aggregator Reduction.
2157       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
2158       "dup v1.4s, %w[additive_sum_offset]\n"
2159       "uaddlp v8.4s, v8.8h\n"
2160       "uaddlp v9.4s, v9.8h\n"
2161       "uaddlp v10.4s, v10.8h\n"
2162       "uaddlp v11.4s, v11.8h\n"
2163       "uaddlp v12.4s, v12.8h\n"
2164       "addp v8.4s, v8.4s, v9.4s\n"
2165       "addp v10.4s, v10.4s, v11.4s\n"
2166       "addp v12.4s, v12.4s, v12.4s\n"
2167       "addp v8.4s, v8.4s, v10.4s\n"
2168       "addp v9.4s, v12.4s, v12.4s\n"
2169       "mul v8.4s, v8.4s, v0.s[0]\n"
2170       "mul v9.4s, v9.4s, v0.s[0]\n"
2171       "add v8.4s, v8.4s, v1.4s\n"
2172       "add v9.4s, v9.4s, v1.4s\n"
2173       "st1 {v8.4s, v9.4s}, [%x[out]]\n"
2174       : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
2175       : [stride] "r"(params.stride),
2176         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
2177         [additive_sum_offset] "r"(params.additive_sum_offset)
2178       : "x0", "x1", "x2", "x3", "v0", "v1", "v2", "v3", "v4", "v8", "v9", "v10",
2179         "v11", "v12", "cc", "memory");
2180 }
2181 
2182 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)2183 inline void Stream<uint8_t, 5, 8, 1, RowMajorWithSum>::Pack(
2184     const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
2185 #ifdef DEBUG
2186 #ifdef DEBUG_METAGEMM_VERBOSE
2187   std::cout << __FILE__ << "(" << __LINE__
2188             << ") RowMajorWithSum<uint8_t, 5, 8, 1, RowMajorWithSum>::Pack()"
2189             << std::endl
2190             << std::flush;
2191 #endif
2192 #endif
2193   int params_count_copy = params.count;
2194   asm volatile(
2195       "add x0, %x[in], %x[stride]\n"
2196       "add x1, x0, %x[stride]\n"
2197       "add x2, x1, %x[stride]\n"
2198       "add x3, x2, %x[stride]\n"
2199       "movi v8.8h, #0\n"
2200       "movi v9.8h, #0\n"
2201       "movi v10.8h, #0\n"
2202       "movi v11.8h, #0\n"
2203       "movi v12.8h, #0\n"
2204 
2205       // Reduce count by leftovers.
2206       "subs %x[count], %x[count], #1\n"
2207       "beq 2f\n"
2208 
2209       "1:"
2210       "subs %x[count], %x[count], #8\n"
2211 
2212       // Load Aggregate Store: 5x8.
2213       "ld1 {v0.2s}, [%x[in]], #8\n"
2214       "ld1 {v1.2s}, [x0], #8\n"
2215       "ld1 {v2.2s}, [x1], #8\n"
2216       "ld1 {v3.2s}, [x2], #8\n"
2217       "ld1 {v4.2s}, [x3], #8\n"
2218       "uaddw v8.8h, v8.8h, v0.8b\n"
2219       "uaddw v9.8h, v9.8h, v1.8b\n"
2220       "uaddw v10.8h, v10.8h, v2.8b\n"
2221       "uaddw v11.8h, v11.8h, v3.8b\n"
2222       "uaddw v12.8h, v12.8h, v4.8b\n"
2223       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
2224       "st1 {v4.2s}, [%x[out]], #8\n"
2225 
2226       "bne 1b\n"
2227 
2228       "2:"
2229 
2230       // Load Aggregate Store: 5x1.
2231       "movi v0.8b, #0\n"
2232       "movi v1.8b, #0\n"
2233       "movi v2.8b, #0\n"
2234       "movi v3.8b, #0\n"
2235       "movi v4.8b, #0\n"
2236       "ld1 {v0.b}[0], [%x[in]], #1\n"
2237       "ld1 {v1.b}[0], [x0], #1\n"
2238       "ld1 {v2.b}[0], [x1], #1\n"
2239       "ld1 {v3.b}[0], [x2], #1\n"
2240       "ld1 {v4.b}[0], [x3], #1\n"
2241       "uaddw v8.8h, v8.8h, v0.8b\n"
2242       "uaddw v9.8h, v9.8h, v1.8b\n"
2243       "uaddw v10.8h, v10.8h, v2.8b\n"
2244       "uaddw v11.8h, v11.8h, v3.8b\n"
2245       "uaddw v12.8h, v12.8h, v4.8b\n"
2246       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
2247       "st1 {v4.2s}, [%x[out]], #8\n"
2248 
2249       // Aggregator Reduction.
2250       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
2251       "dup v1.4s, %w[additive_sum_offset]\n"
2252       "uaddlp v8.4s, v8.8h\n"
2253       "uaddlp v9.4s, v9.8h\n"
2254       "uaddlp v10.4s, v10.8h\n"
2255       "uaddlp v11.4s, v11.8h\n"
2256       "uaddlp v12.4s, v12.8h\n"
2257       "addp v8.4s, v8.4s, v9.4s\n"
2258       "addp v10.4s, v10.4s, v11.4s\n"
2259       "addp v12.4s, v12.4s, v12.4s\n"
2260       "addp v8.4s, v8.4s, v10.4s\n"
2261       "addp v9.4s, v12.4s, v12.4s\n"
2262       "mul v8.4s, v8.4s, v0.s[0]\n"
2263       "mul v9.4s, v9.4s, v0.s[0]\n"
2264       "add v8.4s, v8.4s, v1.4s\n"
2265       "add v9.4s, v9.4s, v1.4s\n"
2266       "st1 {v8.4s, v9.4s}, [%x[out]]\n"
2267       : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
2268       : [stride] "r"(params.stride),
2269         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
2270         [additive_sum_offset] "r"(params.additive_sum_offset)
2271       : "x0", "x1", "x2", "x3", "v0", "v1", "v2", "v3", "v4", "v8", "v9", "v10",
2272         "v11", "v12", "cc", "memory");
2273 }
2274 
2275 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)2276 inline void Stream<uint8_t, 5, 8, 2, RowMajorWithSum>::Pack(
2277     const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
2278 #ifdef DEBUG
2279 #ifdef DEBUG_METAGEMM_VERBOSE
2280   std::cout << __FILE__ << "(" << __LINE__
2281             << ") RowMajorWithSum<uint8_t, 5, 8, 2, RowMajorWithSum>::Pack()"
2282             << std::endl
2283             << std::flush;
2284 #endif
2285 #endif
2286   int params_count_copy = params.count;
2287   asm volatile(
2288       "add x0, %x[in], %x[stride]\n"
2289       "add x1, x0, %x[stride]\n"
2290       "add x2, x1, %x[stride]\n"
2291       "add x3, x2, %x[stride]\n"
2292       "movi v8.8h, #0\n"
2293       "movi v9.8h, #0\n"
2294       "movi v10.8h, #0\n"
2295       "movi v11.8h, #0\n"
2296       "movi v12.8h, #0\n"
2297 
2298       // Reduce count by leftovers.
2299       "subs %x[count], %x[count], #2\n"
2300       "beq 2f\n"
2301 
2302       "1:"
2303       "subs %x[count], %x[count], #8\n"
2304 
2305       // Load Aggregate Store: 5x8.
2306       "ld1 {v0.2s}, [%x[in]], #8\n"
2307       "ld1 {v1.2s}, [x0], #8\n"
2308       "ld1 {v2.2s}, [x1], #8\n"
2309       "ld1 {v3.2s}, [x2], #8\n"
2310       "ld1 {v4.2s}, [x3], #8\n"
2311       "uaddw v8.8h, v8.8h, v0.8b\n"
2312       "uaddw v9.8h, v9.8h, v1.8b\n"
2313       "uaddw v10.8h, v10.8h, v2.8b\n"
2314       "uaddw v11.8h, v11.8h, v3.8b\n"
2315       "uaddw v12.8h, v12.8h, v4.8b\n"
2316       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
2317       "st1 {v4.2s}, [%x[out]], #8\n"
2318 
2319       "bne 1b\n"
2320 
2321       "2:"
2322 
2323       // Load Aggregate Store: 5x2.
2324       "movi v0.8b, #0\n"
2325       "movi v1.8b, #0\n"
2326       "movi v2.8b, #0\n"
2327       "movi v3.8b, #0\n"
2328       "movi v4.8b, #0\n"
2329       "ld1 {v0.h}[0], [%x[in]], #2\n"
2330       "ld1 {v1.h}[0], [x0], #2\n"
2331       "ld1 {v2.h}[0], [x1], #2\n"
2332       "ld1 {v3.h}[0], [x2], #2\n"
2333       "ld1 {v4.h}[0], [x3], #2\n"
2334       "uaddw v8.8h, v8.8h, v0.8b\n"
2335       "uaddw v9.8h, v9.8h, v1.8b\n"
2336       "uaddw v10.8h, v10.8h, v2.8b\n"
2337       "uaddw v11.8h, v11.8h, v3.8b\n"
2338       "uaddw v12.8h, v12.8h, v4.8b\n"
2339       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
2340       "st1 {v4.2s}, [%x[out]], #8\n"
2341 
2342       // Aggregator Reduction.
2343       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
2344       "dup v1.4s, %w[additive_sum_offset]\n"
2345       "uaddlp v8.4s, v8.8h\n"
2346       "uaddlp v9.4s, v9.8h\n"
2347       "uaddlp v10.4s, v10.8h\n"
2348       "uaddlp v11.4s, v11.8h\n"
2349       "uaddlp v12.4s, v12.8h\n"
2350       "addp v8.4s, v8.4s, v9.4s\n"
2351       "addp v10.4s, v10.4s, v11.4s\n"
2352       "addp v12.4s, v12.4s, v12.4s\n"
2353       "addp v8.4s, v8.4s, v10.4s\n"
2354       "addp v9.4s, v12.4s, v12.4s\n"
2355       "mul v8.4s, v8.4s, v0.s[0]\n"
2356       "mul v9.4s, v9.4s, v0.s[0]\n"
2357       "add v8.4s, v8.4s, v1.4s\n"
2358       "add v9.4s, v9.4s, v1.4s\n"
2359       "st1 {v8.4s, v9.4s}, [%x[out]]\n"
2360       : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
2361       : [stride] "r"(params.stride),
2362         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
2363         [additive_sum_offset] "r"(params.additive_sum_offset)
2364       : "x0", "x1", "x2", "x3", "v0", "v1", "v2", "v3", "v4", "v8", "v9", "v10",
2365         "v11", "v12", "cc", "memory");
2366 }
2367 
2368 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)2369 inline void Stream<uint8_t, 5, 8, 3, RowMajorWithSum>::Pack(
2370     const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
2371 #ifdef DEBUG
2372 #ifdef DEBUG_METAGEMM_VERBOSE
2373   std::cout << __FILE__ << "(" << __LINE__
2374             << ") RowMajorWithSum<uint8_t, 5, 8, 3, RowMajorWithSum>::Pack()"
2375             << std::endl
2376             << std::flush;
2377 #endif
2378 #endif
2379   int params_count_copy = params.count;
2380   asm volatile(
2381       "add x0, %x[in], %x[stride]\n"
2382       "add x1, x0, %x[stride]\n"
2383       "add x2, x1, %x[stride]\n"
2384       "add x3, x2, %x[stride]\n"
2385       "movi v8.8h, #0\n"
2386       "movi v9.8h, #0\n"
2387       "movi v10.8h, #0\n"
2388       "movi v11.8h, #0\n"
2389       "movi v12.8h, #0\n"
2390 
2391       // Reduce count by leftovers.
2392       "subs %x[count], %x[count], #3\n"
2393       "beq 2f\n"
2394 
2395       "1:"
2396       "subs %x[count], %x[count], #8\n"
2397 
2398       // Load Aggregate Store: 5x8.
2399       "ld1 {v0.2s}, [%x[in]], #8\n"
2400       "ld1 {v1.2s}, [x0], #8\n"
2401       "ld1 {v2.2s}, [x1], #8\n"
2402       "ld1 {v3.2s}, [x2], #8\n"
2403       "ld1 {v4.2s}, [x3], #8\n"
2404       "uaddw v8.8h, v8.8h, v0.8b\n"
2405       "uaddw v9.8h, v9.8h, v1.8b\n"
2406       "uaddw v10.8h, v10.8h, v2.8b\n"
2407       "uaddw v11.8h, v11.8h, v3.8b\n"
2408       "uaddw v12.8h, v12.8h, v4.8b\n"
2409       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
2410       "st1 {v4.2s}, [%x[out]], #8\n"
2411 
2412       "bne 1b\n"
2413 
2414       "2:"
2415 
2416       // Load Aggregate Store: 5x3.
2417       "movi v0.8b, #0\n"
2418       "movi v1.8b, #0\n"
2419       "movi v2.8b, #0\n"
2420       "movi v3.8b, #0\n"
2421       "movi v4.8b, #0\n"
2422       "ld1 {v0.h}[0], [%x[in]], #2\n"
2423       "ld1 {v0.b}[2], [%x[in]], #1\n"
2424       "ld1 {v1.h}[0], [x0], #2\n"
2425       "ld1 {v1.b}[2], [x0], #1\n"
2426       "ld1 {v2.h}[0], [x1], #2\n"
2427       "ld1 {v2.b}[2], [x1], #1\n"
2428       "ld1 {v3.h}[0], [x2], #2\n"
2429       "ld1 {v3.b}[2], [x2], #1\n"
2430       "ld1 {v4.h}[0], [x3], #2\n"
2431       "ld1 {v4.b}[2], [x3], #1\n"
2432       "uaddw v8.8h, v8.8h, v0.8b\n"
2433       "uaddw v9.8h, v9.8h, v1.8b\n"
2434       "uaddw v10.8h, v10.8h, v2.8b\n"
2435       "uaddw v11.8h, v11.8h, v3.8b\n"
2436       "uaddw v12.8h, v12.8h, v4.8b\n"
2437       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
2438       "st1 {v4.2s}, [%x[out]], #8\n"
2439 
2440       // Aggregator Reduction.
2441       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
2442       "dup v1.4s, %w[additive_sum_offset]\n"
2443       "uaddlp v8.4s, v8.8h\n"
2444       "uaddlp v9.4s, v9.8h\n"
2445       "uaddlp v10.4s, v10.8h\n"
2446       "uaddlp v11.4s, v11.8h\n"
2447       "uaddlp v12.4s, v12.8h\n"
2448       "addp v8.4s, v8.4s, v9.4s\n"
2449       "addp v10.4s, v10.4s, v11.4s\n"
2450       "addp v12.4s, v12.4s, v12.4s\n"
2451       "addp v8.4s, v8.4s, v10.4s\n"
2452       "addp v9.4s, v12.4s, v12.4s\n"
2453       "mul v8.4s, v8.4s, v0.s[0]\n"
2454       "mul v9.4s, v9.4s, v0.s[0]\n"
2455       "add v8.4s, v8.4s, v1.4s\n"
2456       "add v9.4s, v9.4s, v1.4s\n"
2457       "st1 {v8.4s, v9.4s}, [%x[out]]\n"
2458       : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
2459       : [stride] "r"(params.stride),
2460         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
2461         [additive_sum_offset] "r"(params.additive_sum_offset)
2462       : "x0", "x1", "x2", "x3", "v0", "v1", "v2", "v3", "v4", "v8", "v9", "v10",
2463         "v11", "v12", "cc", "memory");
2464 }
2465 
2466 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)2467 inline void Stream<uint8_t, 5, 8, 4, RowMajorWithSum>::Pack(
2468     const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
2469 #ifdef DEBUG
2470 #ifdef DEBUG_METAGEMM_VERBOSE
2471   std::cout << __FILE__ << "(" << __LINE__
2472             << ") RowMajorWithSum<uint8_t, 5, 8, 4, RowMajorWithSum>::Pack()"
2473             << std::endl
2474             << std::flush;
2475 #endif
2476 #endif
2477   int params_count_copy = params.count;
2478   asm volatile(
2479       "add x0, %x[in], %x[stride]\n"
2480       "add x1, x0, %x[stride]\n"
2481       "add x2, x1, %x[stride]\n"
2482       "add x3, x2, %x[stride]\n"
2483       "movi v8.8h, #0\n"
2484       "movi v9.8h, #0\n"
2485       "movi v10.8h, #0\n"
2486       "movi v11.8h, #0\n"
2487       "movi v12.8h, #0\n"
2488 
2489       // Reduce count by leftovers.
2490       "subs %x[count], %x[count], #4\n"
2491       "beq 2f\n"
2492 
2493       "1:"
2494       "subs %x[count], %x[count], #8\n"
2495 
2496       // Load Aggregate Store: 5x8.
2497       "ld1 {v0.2s}, [%x[in]], #8\n"
2498       "ld1 {v1.2s}, [x0], #8\n"
2499       "ld1 {v2.2s}, [x1], #8\n"
2500       "ld1 {v3.2s}, [x2], #8\n"
2501       "ld1 {v4.2s}, [x3], #8\n"
2502       "uaddw v8.8h, v8.8h, v0.8b\n"
2503       "uaddw v9.8h, v9.8h, v1.8b\n"
2504       "uaddw v10.8h, v10.8h, v2.8b\n"
2505       "uaddw v11.8h, v11.8h, v3.8b\n"
2506       "uaddw v12.8h, v12.8h, v4.8b\n"
2507       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
2508       "st1 {v4.2s}, [%x[out]], #8\n"
2509 
2510       "bne 1b\n"
2511 
2512       "2:"
2513 
2514       // Load Aggregate Store: 5x4.
2515       "movi v0.8b, #0\n"
2516       "movi v1.8b, #0\n"
2517       "movi v2.8b, #0\n"
2518       "movi v3.8b, #0\n"
2519       "movi v4.8b, #0\n"
2520       "ld1 {v0.s}[0], [%x[in]], #4\n"
2521       "ld1 {v1.s}[0], [x0], #4\n"
2522       "ld1 {v2.s}[0], [x1], #4\n"
2523       "ld1 {v3.s}[0], [x2], #4\n"
2524       "ld1 {v4.s}[0], [x3], #4\n"
2525       "uaddw v8.8h, v8.8h, v0.8b\n"
2526       "uaddw v9.8h, v9.8h, v1.8b\n"
2527       "uaddw v10.8h, v10.8h, v2.8b\n"
2528       "uaddw v11.8h, v11.8h, v3.8b\n"
2529       "uaddw v12.8h, v12.8h, v4.8b\n"
2530       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
2531       "st1 {v4.2s}, [%x[out]], #8\n"
2532 
2533       // Aggregator Reduction.
2534       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
2535       "dup v1.4s, %w[additive_sum_offset]\n"
2536       "uaddlp v8.4s, v8.8h\n"
2537       "uaddlp v9.4s, v9.8h\n"
2538       "uaddlp v10.4s, v10.8h\n"
2539       "uaddlp v11.4s, v11.8h\n"
2540       "uaddlp v12.4s, v12.8h\n"
2541       "addp v8.4s, v8.4s, v9.4s\n"
2542       "addp v10.4s, v10.4s, v11.4s\n"
2543       "addp v12.4s, v12.4s, v12.4s\n"
2544       "addp v8.4s, v8.4s, v10.4s\n"
2545       "addp v9.4s, v12.4s, v12.4s\n"
2546       "mul v8.4s, v8.4s, v0.s[0]\n"
2547       "mul v9.4s, v9.4s, v0.s[0]\n"
2548       "add v8.4s, v8.4s, v1.4s\n"
2549       "add v9.4s, v9.4s, v1.4s\n"
2550       "st1 {v8.4s, v9.4s}, [%x[out]]\n"
2551       : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
2552       : [stride] "r"(params.stride),
2553         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
2554         [additive_sum_offset] "r"(params.additive_sum_offset)
2555       : "x0", "x1", "x2", "x3", "v0", "v1", "v2", "v3", "v4", "v8", "v9", "v10",
2556         "v11", "v12", "cc", "memory");
2557 }
2558 
// Stream<uint8_t, 5, 8, 5, RowMajorWithSum>::Pack
//
// Copies 5 rows of bytes into `out`, 8 columns at a time, and appends one
// int32 per row derived from that row's byte sum.
//
//   in      First row. Rows 1..4 are read from in + k * params.stride.
//   params  Only count, stride, multiplicative_sum_offset and
//           additive_sum_offset are read here.
//   out     Receives 40-byte groups (8 bytes of row 0, then row 1, ... row 4)
//           for each full 8-column step, one more 40-byte group holding the
//           last 5 columns zero-padded to 8, and finally eight int32 values:
//           sum * multiplicative_sum_offset + additive_sum_offset, with rows
//           0..4 in lanes 0..4 and the row 4 value repeated in lanes 5..7.
//
// The 8-column loop stops only when the remaining count is exactly zero, so
// count is presumably required to satisfy count % 8 == 5 -- TODO confirm with
// the code that selects this specialization.
// NOTE(review): params_count_copy is an int, yet the asm addresses it through
// the 64-bit %x register view; confirm the upper 32 bits are always zero.
2559 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)2560 inline void Stream<uint8_t, 5, 8, 5, RowMajorWithSum>::Pack(
2561     const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
2562 #ifdef DEBUG
2563 #ifdef DEBUG_METAGEMM_VERBOSE
2564   std::cout << __FILE__ << "(" << __LINE__
2565             << ") RowMajorWithSum<uint8_t, 5, 8, 5, RowMajorWithSum>::Pack()"
2566             << std::endl
2567             << std::flush;
2568 #endif
2569 #endif
2570   int params_count_copy = params.count;  // Local copy: the asm decrements the count in place.
2571   asm volatile(
2572       "add x0, %x[in], %x[stride]\n"  // x0..x3: rows 1..4; row 0 is read through `in`.
2573       "add x1, x0, %x[stride]\n"
2574       "add x2, x1, %x[stride]\n"
2575       "add x3, x2, %x[stride]\n"
2576       "movi v8.8h, #0\n"  // v8..v12: one accumulator per row, 8 x 16-bit lanes.
2577       "movi v9.8h, #0\n"
2578       "movi v10.8h, #0\n"
2579       "movi v11.8h, #0\n"
2580       "movi v12.8h, #0\n"
2581 
2582       // Reduce count by leftovers. If that leaves zero, skip the 8-wide loop.
2583       "subs %x[count], %x[count], #5\n"
2584       "beq 2f\n"
2585 
2586       "1:"
2587       "subs %x[count], %x[count], #8\n"  // Flags from this subs drive "bne 1b".
2588 
2589       // Load Aggregate Store: 5x8.
2590       "ld1 {v0.2s}, [%x[in]], #8\n"  // 8 bytes per row, pointers post-incremented.
2591       "ld1 {v1.2s}, [x0], #8\n"
2592       "ld1 {v2.2s}, [x1], #8\n"
2593       "ld1 {v3.2s}, [x2], #8\n"
2594       "ld1 {v4.2s}, [x3], #8\n"
2595       "uaddw v8.8h, v8.8h, v0.8b\n"  // Widen bytes and add into the row's 16-bit lanes.
2596       "uaddw v9.8h, v9.8h, v1.8b\n"
2597       "uaddw v10.8h, v10.8h, v2.8b\n"
2598       "uaddw v11.8h, v11.8h, v3.8b\n"
2599       "uaddw v12.8h, v12.8h, v4.8b\n"
2600       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"  // 5 rows x 8 bytes = 40 bytes to out.
2601       "st1 {v4.2s}, [%x[out]], #8\n"
2602 
2603       "bne 1b\n"
2604 
2605       "2:"
2606 
2607       // Load Aggregate Store: 5x5.
2608       "movi v0.8b, #0\n"  // Zero first: lanes not loaded below act as padding.
2609       "movi v1.8b, #0\n"
2610       "movi v2.8b, #0\n"
2611       "movi v3.8b, #0\n"
2612       "movi v4.8b, #0\n"
2613       "ld1 {v0.s}[0], [%x[in]], #4\n"  // Row 0, bytes 0-3.
2614       "ld1 {v0.b}[4], [%x[in]], #1\n"  // Row 0, byte 4.
2615       "ld1 {v1.s}[0], [x0], #4\n"
2616       "ld1 {v1.b}[4], [x0], #1\n"
2617       "ld1 {v2.s}[0], [x1], #4\n"
2618       "ld1 {v2.b}[4], [x1], #1\n"
2619       "ld1 {v3.s}[0], [x2], #4\n"
2620       "ld1 {v3.b}[4], [x2], #1\n"
2621       "ld1 {v4.s}[0], [x3], #4\n"
2622       "ld1 {v4.b}[4], [x3], #1\n"
2623       "uaddw v8.8h, v8.8h, v0.8b\n"
2624       "uaddw v9.8h, v9.8h, v1.8b\n"
2625       "uaddw v10.8h, v10.8h, v2.8b\n"
2626       "uaddw v11.8h, v11.8h, v3.8b\n"
2627       "uaddw v12.8h, v12.8h, v4.8b\n"
2628       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"  // Still a full 40-byte group; the padding lanes are zero.
2629       "st1 {v4.2s}, [%x[out]], #8\n"
2630 
2631       // Aggregator Reduction.
2632       "mov v0.s[0], %w[multiplicative_sum_offset]\n"  // v0/v1 are reused as constants; their row data is already stored.
2633       "dup v1.4s, %w[additive_sum_offset]\n"
2634       "uaddlp v8.4s, v8.8h\n"  // Pairwise add: 8 x u16 -> 4 x u32.
2635       "uaddlp v9.4s, v9.8h\n"
2636       "uaddlp v10.4s, v10.8h\n"
2637       "uaddlp v11.4s, v11.8h\n"
2638       "uaddlp v12.4s, v12.8h\n"
2639       "addp v8.4s, v8.4s, v9.4s\n"
2640       "addp v10.4s, v10.4s, v11.4s\n"
2641       "addp v12.4s, v12.4s, v12.4s\n"
2642       "addp v8.4s, v8.4s, v10.4s\n"  // v8 = {sum0, sum1, sum2, sum3}.
2643       "addp v9.4s, v12.4s, v12.4s\n"  // v9 = sum4 in every lane.
2644       "mul v8.4s, v8.4s, v0.s[0]\n"  // sum * multiplicative_sum_offset (32-bit lanes).
2645       "mul v9.4s, v9.4s, v0.s[0]\n"
2646       "add v8.4s, v8.4s, v1.4s\n"  // ... + additive_sum_offset.
2647       "add v9.4s, v9.4s, v1.4s\n"
2648       "st1 {v8.4s, v9.4s}, [%x[out]]\n"  // Eight int32 values written right after the packed bytes.
2649       : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)  // Read-write operands (modified by the asm).
2650       : [stride] "r"(params.stride),
2651         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
2652         [additive_sum_offset] "r"(params.additive_sum_offset)
2653       : "x0", "x1", "x2", "x3", "v0", "v1", "v2", "v3", "v4", "v8", "v9", "v10",  // Scratch registers used above.
2654         "v11", "v12", "cc", "memory");
2655 }
2656 
// Stream<uint8_t, 5, 8, 6, RowMajorWithSum>::Pack
//
// Copies 5 rows of bytes into `out`, 8 columns at a time, and appends one
// int32 per row derived from that row's byte sum.
//
//   in      First row. Rows 1..4 are read from in + k * params.stride.
//   params  Only count, stride, multiplicative_sum_offset and
//           additive_sum_offset are read here.
//   out     Receives 40-byte groups (8 bytes of row 0, then row 1, ... row 4)
//           for each full 8-column step, one more 40-byte group holding the
//           last 6 columns zero-padded to 8, and finally eight int32 values:
//           sum * multiplicative_sum_offset + additive_sum_offset, with rows
//           0..4 in lanes 0..4 and the row 4 value repeated in lanes 5..7.
//
// The 8-column loop stops only when the remaining count is exactly zero, so
// count is presumably required to satisfy count % 8 == 6 -- TODO confirm with
// the code that selects this specialization.
// NOTE(review): params_count_copy is an int, yet the asm addresses it through
// the 64-bit %x register view; confirm the upper 32 bits are always zero.
2657 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)2658 inline void Stream<uint8_t, 5, 8, 6, RowMajorWithSum>::Pack(
2659     const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
2660 #ifdef DEBUG
2661 #ifdef DEBUG_METAGEMM_VERBOSE
2662   std::cout << __FILE__ << "(" << __LINE__
2663             << ") RowMajorWithSum<uint8_t, 5, 8, 6, RowMajorWithSum>::Pack()"
2664             << std::endl
2665             << std::flush;
2666 #endif
2667 #endif
2668   int params_count_copy = params.count;  // Local copy: the asm decrements the count in place.
2669   asm volatile(
2670       "add x0, %x[in], %x[stride]\n"  // x0..x3: rows 1..4; row 0 is read through `in`.
2671       "add x1, x0, %x[stride]\n"
2672       "add x2, x1, %x[stride]\n"
2673       "add x3, x2, %x[stride]\n"
2674       "movi v8.8h, #0\n"  // v8..v12: one accumulator per row, 8 x 16-bit lanes.
2675       "movi v9.8h, #0\n"
2676       "movi v10.8h, #0\n"
2677       "movi v11.8h, #0\n"
2678       "movi v12.8h, #0\n"
2679 
2680       // Reduce count by leftovers. If that leaves zero, skip the 8-wide loop.
2681       "subs %x[count], %x[count], #6\n"
2682       "beq 2f\n"
2683 
2684       "1:"
2685       "subs %x[count], %x[count], #8\n"  // Flags from this subs drive "bne 1b".
2686 
2687       // Load Aggregate Store: 5x8.
2688       "ld1 {v0.2s}, [%x[in]], #8\n"  // 8 bytes per row, pointers post-incremented.
2689       "ld1 {v1.2s}, [x0], #8\n"
2690       "ld1 {v2.2s}, [x1], #8\n"
2691       "ld1 {v3.2s}, [x2], #8\n"
2692       "ld1 {v4.2s}, [x3], #8\n"
2693       "uaddw v8.8h, v8.8h, v0.8b\n"  // Widen bytes and add into the row's 16-bit lanes.
2694       "uaddw v9.8h, v9.8h, v1.8b\n"
2695       "uaddw v10.8h, v10.8h, v2.8b\n"
2696       "uaddw v11.8h, v11.8h, v3.8b\n"
2697       "uaddw v12.8h, v12.8h, v4.8b\n"
2698       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"  // 5 rows x 8 bytes = 40 bytes to out.
2699       "st1 {v4.2s}, [%x[out]], #8\n"
2700 
2701       "bne 1b\n"
2702 
2703       "2:"
2704 
2705       // Load Aggregate Store: 5x6.
2706       "movi v0.8b, #0\n"  // Zero first: lanes not loaded below act as padding.
2707       "movi v1.8b, #0\n"
2708       "movi v2.8b, #0\n"
2709       "movi v3.8b, #0\n"
2710       "movi v4.8b, #0\n"
2711       "ld1 {v0.s}[0], [%x[in]], #4\n"  // Row 0, bytes 0-3.
2712       "ld1 {v0.h}[2], [%x[in]], #2\n"  // Row 0, bytes 4-5.
2713       "ld1 {v1.s}[0], [x0], #4\n"
2714       "ld1 {v1.h}[2], [x0], #2\n"
2715       "ld1 {v2.s}[0], [x1], #4\n"
2716       "ld1 {v2.h}[2], [x1], #2\n"
2717       "ld1 {v3.s}[0], [x2], #4\n"
2718       "ld1 {v3.h}[2], [x2], #2\n"
2719       "ld1 {v4.s}[0], [x3], #4\n"
2720       "ld1 {v4.h}[2], [x3], #2\n"
2721       "uaddw v8.8h, v8.8h, v0.8b\n"
2722       "uaddw v9.8h, v9.8h, v1.8b\n"
2723       "uaddw v10.8h, v10.8h, v2.8b\n"
2724       "uaddw v11.8h, v11.8h, v3.8b\n"
2725       "uaddw v12.8h, v12.8h, v4.8b\n"
2726       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"  // Still a full 40-byte group; the padding lanes are zero.
2727       "st1 {v4.2s}, [%x[out]], #8\n"
2728 
2729       // Aggregator Reduction.
2730       "mov v0.s[0], %w[multiplicative_sum_offset]\n"  // v0/v1 are reused as constants; their row data is already stored.
2731       "dup v1.4s, %w[additive_sum_offset]\n"
2732       "uaddlp v8.4s, v8.8h\n"  // Pairwise add: 8 x u16 -> 4 x u32.
2733       "uaddlp v9.4s, v9.8h\n"
2734       "uaddlp v10.4s, v10.8h\n"
2735       "uaddlp v11.4s, v11.8h\n"
2736       "uaddlp v12.4s, v12.8h\n"
2737       "addp v8.4s, v8.4s, v9.4s\n"
2738       "addp v10.4s, v10.4s, v11.4s\n"
2739       "addp v12.4s, v12.4s, v12.4s\n"
2740       "addp v8.4s, v8.4s, v10.4s\n"  // v8 = {sum0, sum1, sum2, sum3}.
2741       "addp v9.4s, v12.4s, v12.4s\n"  // v9 = sum4 in every lane.
2742       "mul v8.4s, v8.4s, v0.s[0]\n"  // sum * multiplicative_sum_offset (32-bit lanes).
2743       "mul v9.4s, v9.4s, v0.s[0]\n"
2744       "add v8.4s, v8.4s, v1.4s\n"  // ... + additive_sum_offset.
2745       "add v9.4s, v9.4s, v1.4s\n"
2746       "st1 {v8.4s, v9.4s}, [%x[out]]\n"  // Eight int32 values written right after the packed bytes.
2747       : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)  // Read-write operands (modified by the asm).
2748       : [stride] "r"(params.stride),
2749         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
2750         [additive_sum_offset] "r"(params.additive_sum_offset)
2751       : "x0", "x1", "x2", "x3", "v0", "v1", "v2", "v3", "v4", "v8", "v9", "v10",  // Scratch registers used above.
2752         "v11", "v12", "cc", "memory");
2753 }
2754 
// Stream<uint8_t, 5, 8, 7, RowMajorWithSum>::Pack
//
// Copies 5 rows of bytes into `out`, 8 columns at a time, and appends one
// int32 per row derived from that row's byte sum.
//
//   in      First row. Rows 1..4 are read from in + k * params.stride.
//   params  Only count, stride, multiplicative_sum_offset and
//           additive_sum_offset are read here.
//   out     Receives 40-byte groups (8 bytes of row 0, then row 1, ... row 4)
//           for each full 8-column step, one more 40-byte group holding the
//           last 7 columns zero-padded to 8, and finally eight int32 values:
//           sum * multiplicative_sum_offset + additive_sum_offset, with rows
//           0..4 in lanes 0..4 and the row 4 value repeated in lanes 5..7.
//
// The 8-column loop stops only when the remaining count is exactly zero, so
// count is presumably required to satisfy count % 8 == 7 -- TODO confirm with
// the code that selects this specialization.
// NOTE(review): params_count_copy is an int, yet the asm addresses it through
// the 64-bit %x register view; confirm the upper 32 bits are always zero.
2755 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)2756 inline void Stream<uint8_t, 5, 8, 7, RowMajorWithSum>::Pack(
2757     const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
2758 #ifdef DEBUG
2759 #ifdef DEBUG_METAGEMM_VERBOSE
2760   std::cout << __FILE__ << "(" << __LINE__
2761             << ") RowMajorWithSum<uint8_t, 5, 8, 7, RowMajorWithSum>::Pack()"
2762             << std::endl
2763             << std::flush;
2764 #endif
2765 #endif
2766   int params_count_copy = params.count;  // Local copy: the asm decrements the count in place.
2767   asm volatile(
2768       "add x0, %x[in], %x[stride]\n"  // x0..x3: rows 1..4; row 0 is read through `in`.
2769       "add x1, x0, %x[stride]\n"
2770       "add x2, x1, %x[stride]\n"
2771       "add x3, x2, %x[stride]\n"
2772       "movi v8.8h, #0\n"  // v8..v12: one accumulator per row, 8 x 16-bit lanes.
2773       "movi v9.8h, #0\n"
2774       "movi v10.8h, #0\n"
2775       "movi v11.8h, #0\n"
2776       "movi v12.8h, #0\n"
2777 
2778       // Reduce count by leftovers. If that leaves zero, skip the 8-wide loop.
2779       "subs %x[count], %x[count], #7\n"
2780       "beq 2f\n"
2781 
2782       "1:"
2783       "subs %x[count], %x[count], #8\n"  // Flags from this subs drive "bne 1b".
2784 
2785       // Load Aggregate Store: 5x8.
2786       "ld1 {v0.2s}, [%x[in]], #8\n"  // 8 bytes per row, pointers post-incremented.
2787       "ld1 {v1.2s}, [x0], #8\n"
2788       "ld1 {v2.2s}, [x1], #8\n"
2789       "ld1 {v3.2s}, [x2], #8\n"
2790       "ld1 {v4.2s}, [x3], #8\n"
2791       "uaddw v8.8h, v8.8h, v0.8b\n"  // Widen bytes and add into the row's 16-bit lanes.
2792       "uaddw v9.8h, v9.8h, v1.8b\n"
2793       "uaddw v10.8h, v10.8h, v2.8b\n"
2794       "uaddw v11.8h, v11.8h, v3.8b\n"
2795       "uaddw v12.8h, v12.8h, v4.8b\n"
2796       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"  // 5 rows x 8 bytes = 40 bytes to out.
2797       "st1 {v4.2s}, [%x[out]], #8\n"
2798 
2799       "bne 1b\n"
2800 
2801       "2:"
2802 
2803       // Load Aggregate Store: 5x7.
2804       "movi v0.8b, #0\n"  // Zero first: lanes not loaded below act as padding.
2805       "movi v1.8b, #0\n"
2806       "movi v2.8b, #0\n"
2807       "movi v3.8b, #0\n"
2808       "movi v4.8b, #0\n"
2809       "ld1 {v0.s}[0], [%x[in]], #4\n"  // Row 0, bytes 0-3.
2810       "ld1 {v0.h}[2], [%x[in]], #2\n"  // Row 0, bytes 4-5.
2811       "ld1 {v0.b}[6], [%x[in]], #1\n"  // Row 0, byte 6.
2812       "ld1 {v1.s}[0], [x0], #4\n"
2813       "ld1 {v1.h}[2], [x0], #2\n"
2814       "ld1 {v1.b}[6], [x0], #1\n"
2815       "ld1 {v2.s}[0], [x1], #4\n"
2816       "ld1 {v2.h}[2], [x1], #2\n"
2817       "ld1 {v2.b}[6], [x1], #1\n"
2818       "ld1 {v3.s}[0], [x2], #4\n"
2819       "ld1 {v3.h}[2], [x2], #2\n"
2820       "ld1 {v3.b}[6], [x2], #1\n"
2821       "ld1 {v4.s}[0], [x3], #4\n"
2822       "ld1 {v4.h}[2], [x3], #2\n"
2823       "ld1 {v4.b}[6], [x3], #1\n"
2824       "uaddw v8.8h, v8.8h, v0.8b\n"
2825       "uaddw v9.8h, v9.8h, v1.8b\n"
2826       "uaddw v10.8h, v10.8h, v2.8b\n"
2827       "uaddw v11.8h, v11.8h, v3.8b\n"
2828       "uaddw v12.8h, v12.8h, v4.8b\n"
2829       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"  // Still a full 40-byte group; the padding lane is zero.
2830       "st1 {v4.2s}, [%x[out]], #8\n"
2831 
2832       // Aggregator Reduction.
2833       "mov v0.s[0], %w[multiplicative_sum_offset]\n"  // v0/v1 are reused as constants; their row data is already stored.
2834       "dup v1.4s, %w[additive_sum_offset]\n"
2835       "uaddlp v8.4s, v8.8h\n"  // Pairwise add: 8 x u16 -> 4 x u32.
2836       "uaddlp v9.4s, v9.8h\n"
2837       "uaddlp v10.4s, v10.8h\n"
2838       "uaddlp v11.4s, v11.8h\n"
2839       "uaddlp v12.4s, v12.8h\n"
2840       "addp v8.4s, v8.4s, v9.4s\n"
2841       "addp v10.4s, v10.4s, v11.4s\n"
2842       "addp v12.4s, v12.4s, v12.4s\n"
2843       "addp v8.4s, v8.4s, v10.4s\n"  // v8 = {sum0, sum1, sum2, sum3}.
2844       "addp v9.4s, v12.4s, v12.4s\n"  // v9 = sum4 in every lane.
2845       "mul v8.4s, v8.4s, v0.s[0]\n"  // sum * multiplicative_sum_offset (32-bit lanes).
2846       "mul v9.4s, v9.4s, v0.s[0]\n"
2847       "add v8.4s, v8.4s, v1.4s\n"  // ... + additive_sum_offset.
2848       "add v9.4s, v9.4s, v1.4s\n"
2849       "st1 {v8.4s, v9.4s}, [%x[out]]\n"  // Eight int32 values written right after the packed bytes.
2850       : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)  // Read-write operands (modified by the asm).
2851       : [stride] "r"(params.stride),
2852         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
2853         [additive_sum_offset] "r"(params.additive_sum_offset)
2854       : "x0", "x1", "x2", "x3", "v0", "v1", "v2", "v3", "v4", "v8", "v9", "v10",  // Scratch registers used above.
2855         "v11", "v12", "cc", "memory");
2856 }
2857 
// Stream<uint8_t, 6, 8, 0, RowMajorWithSum>::Pack
//
// Copies 6 rows of bytes into `out`, 8 columns at a time, and appends one
// int32 per row derived from that row's byte sum. This variant has no
// leftover tail.
//
//   in      First row. Rows 1..5 are read from in + k * params.stride.
//   params  Only count, stride, multiplicative_sum_offset and
//           additive_sum_offset are read here.
//   out     Receives one 48-byte group (8 bytes of row 0, then row 1, ...
//           row 5) per 8-column step, followed by eight int32 values:
//           sum * multiplicative_sum_offset + additive_sum_offset, with rows
//           0..5 in lanes 0..5 and the row 4 / row 5 values repeated in
//           lanes 6 / 7.
//
// The loop body runs before any zero check and stops only when the remaining
// count is exactly zero, so count is presumably a positive multiple of 8 --
// TODO confirm with the code that selects this specialization.
// NOTE(review): params_count_copy is an int, yet the asm addresses it through
// the 64-bit %x register view; confirm the upper 32 bits are always zero.
2858 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)2859 inline void Stream<uint8_t, 6, 8, 0, RowMajorWithSum>::Pack(
2860     const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
2861 #ifdef DEBUG
2862 #ifdef DEBUG_METAGEMM_VERBOSE
2863   std::cout << __FILE__ << "(" << __LINE__
2864             << ") RowMajorWithSum<uint8_t, 6, 8, 0, RowMajorWithSum>::Pack()"
2865             << std::endl
2866             << std::flush;
2867 #endif
2868 #endif
2869   int params_count_copy = params.count;  // Local copy: the asm decrements the count in place.
2870   asm volatile(
2871       "add x0, %x[in], %x[stride]\n"  // x0..x4: rows 1..5; row 0 is read through `in`.
2872       "add x1, x0, %x[stride]\n"
2873       "add x2, x1, %x[stride]\n"
2874       "add x3, x2, %x[stride]\n"
2875       "add x4, x3, %x[stride]\n"
2876       "movi v8.8h, #0\n"  // v8..v13: one accumulator per row, 8 x 16-bit lanes.
2877       "movi v9.8h, #0\n"
2878       "movi v10.8h, #0\n"
2879       "movi v11.8h, #0\n"
2880       "movi v12.8h, #0\n"
2881       "movi v13.8h, #0\n"
2882 
2883       "1:"
2884       "subs %x[count], %x[count], #8\n"  // Flags from this subs drive "bne 1b".
2885 
2886       // Load Aggregate Store: 6x8.
2887       "ld1 {v0.2s}, [%x[in]], #8\n"  // 8 bytes per row, pointers post-incremented.
2888       "ld1 {v1.2s}, [x0], #8\n"
2889       "ld1 {v2.2s}, [x1], #8\n"
2890       "ld1 {v3.2s}, [x2], #8\n"
2891       "ld1 {v4.2s}, [x3], #8\n"
2892       "ld1 {v5.2s}, [x4], #8\n"
2893       "uaddw v8.8h, v8.8h, v0.8b\n"  // Widen bytes and add into the row's 16-bit lanes.
2894       "uaddw v9.8h, v9.8h, v1.8b\n"
2895       "uaddw v10.8h, v10.8h, v2.8b\n"
2896       "uaddw v11.8h, v11.8h, v3.8b\n"
2897       "uaddw v12.8h, v12.8h, v4.8b\n"
2898       "uaddw v13.8h, v13.8h, v5.8b\n"
2899       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"  // 6 rows x 8 bytes = 48 bytes to out.
2900       "st1 {v4.2s, v5.2s}, [%x[out]], #16\n"
2901 
2902       "bne 1b\n"
2903 
2904       // Aggregator Reduction.
2905       "mov v0.s[0], %w[multiplicative_sum_offset]\n"  // v0/v1 are reused as constants; their row data is already stored.
2906       "dup v1.4s, %w[additive_sum_offset]\n"
2907       "uaddlp v8.4s, v8.8h\n"  // Pairwise add: 8 x u16 -> 4 x u32.
2908       "uaddlp v9.4s, v9.8h\n"
2909       "uaddlp v10.4s, v10.8h\n"
2910       "uaddlp v11.4s, v11.8h\n"
2911       "uaddlp v12.4s, v12.8h\n"
2912       "uaddlp v13.4s, v13.8h\n"
2913       "addp v8.4s, v8.4s, v9.4s\n"
2914       "addp v10.4s, v10.4s, v11.4s\n"
2915       "addp v12.4s, v12.4s, v13.4s\n"
2916       "addp v8.4s, v8.4s, v10.4s\n"  // v8 = {sum0, sum1, sum2, sum3}.
2917       "addp v9.4s, v12.4s, v12.4s\n"  // v9 = {sum4, sum5, sum4, sum5}.
2918       "mul v8.4s, v8.4s, v0.s[0]\n"  // sum * multiplicative_sum_offset (32-bit lanes).
2919       "mul v9.4s, v9.4s, v0.s[0]\n"
2920       "add v8.4s, v8.4s, v1.4s\n"  // ... + additive_sum_offset.
2921       "add v9.4s, v9.4s, v1.4s\n"
2922       "st1 {v8.4s, v9.4s}, [%x[out]]\n"  // Eight int32 values written right after the packed bytes.
2923       : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)  // Read-write operands (modified by the asm).
2924       : [stride] "r"(params.stride),
2925         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
2926         [additive_sum_offset] "r"(params.additive_sum_offset)
2927       : "x0", "x1", "x2", "x3", "x4", "v0", "v1", "v2", "v3", "v4", "v5", "v8",  // Scratch registers used above.
2928         "v9", "v10", "v11", "v12", "v13", "cc", "memory");
2929 }
2930 
// Stream<uint8_t, 6, 8, 1, RowMajorWithSum>::Pack
//
// Copies 6 rows of bytes into `out`, 8 columns at a time, and appends one
// int32 per row derived from that row's byte sum.
//
//   in      First row. Rows 1..5 are read from in + k * params.stride.
//   params  Only count, stride, multiplicative_sum_offset and
//           additive_sum_offset are read here.
//   out     Receives 48-byte groups (8 bytes of row 0, then row 1, ... row 5)
//           for each full 8-column step, one more 48-byte group holding the
//           last column zero-padded to 8, and finally eight int32 values:
//           sum * multiplicative_sum_offset + additive_sum_offset, with rows
//           0..5 in lanes 0..5 and the row 4 / row 5 values repeated in
//           lanes 6 / 7.
//
// The 8-column loop stops only when the remaining count is exactly zero, so
// count is presumably required to satisfy count % 8 == 1 -- TODO confirm with
// the code that selects this specialization.
// NOTE(review): params_count_copy is an int, yet the asm addresses it through
// the 64-bit %x register view; confirm the upper 32 bits are always zero.
2931 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)2932 inline void Stream<uint8_t, 6, 8, 1, RowMajorWithSum>::Pack(
2933     const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
2934 #ifdef DEBUG
2935 #ifdef DEBUG_METAGEMM_VERBOSE
2936   std::cout << __FILE__ << "(" << __LINE__
2937             << ") RowMajorWithSum<uint8_t, 6, 8, 1, RowMajorWithSum>::Pack()"
2938             << std::endl
2939             << std::flush;
2940 #endif
2941 #endif
2942   int params_count_copy = params.count;  // Local copy: the asm decrements the count in place.
2943   asm volatile(
2944       "add x0, %x[in], %x[stride]\n"  // x0..x4: rows 1..5; row 0 is read through `in`.
2945       "add x1, x0, %x[stride]\n"
2946       "add x2, x1, %x[stride]\n"
2947       "add x3, x2, %x[stride]\n"
2948       "add x4, x3, %x[stride]\n"
2949       "movi v8.8h, #0\n"  // v8..v13: one accumulator per row, 8 x 16-bit lanes.
2950       "movi v9.8h, #0\n"
2951       "movi v10.8h, #0\n"
2952       "movi v11.8h, #0\n"
2953       "movi v12.8h, #0\n"
2954       "movi v13.8h, #0\n"
2955 
2956       // Reduce count by leftovers. If that leaves zero, skip the 8-wide loop.
2957       "subs %x[count], %x[count], #1\n"
2958       "beq 2f\n"
2959 
2960       "1:"
2961       "subs %x[count], %x[count], #8\n"  // Flags from this subs drive "bne 1b".
2962 
2963       // Load Aggregate Store: 6x8.
2964       "ld1 {v0.2s}, [%x[in]], #8\n"  // 8 bytes per row, pointers post-incremented.
2965       "ld1 {v1.2s}, [x0], #8\n"
2966       "ld1 {v2.2s}, [x1], #8\n"
2967       "ld1 {v3.2s}, [x2], #8\n"
2968       "ld1 {v4.2s}, [x3], #8\n"
2969       "ld1 {v5.2s}, [x4], #8\n"
2970       "uaddw v8.8h, v8.8h, v0.8b\n"  // Widen bytes and add into the row's 16-bit lanes.
2971       "uaddw v9.8h, v9.8h, v1.8b\n"
2972       "uaddw v10.8h, v10.8h, v2.8b\n"
2973       "uaddw v11.8h, v11.8h, v3.8b\n"
2974       "uaddw v12.8h, v12.8h, v4.8b\n"
2975       "uaddw v13.8h, v13.8h, v5.8b\n"
2976       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"  // 6 rows x 8 bytes = 48 bytes to out.
2977       "st1 {v4.2s, v5.2s}, [%x[out]], #16\n"
2978 
2979       "bne 1b\n"
2980 
2981       "2:"
2982 
2983       // Load Aggregate Store: 6x1.
2984       "movi v0.8b, #0\n"  // Zero first: lanes not loaded below act as padding.
2985       "movi v1.8b, #0\n"
2986       "movi v2.8b, #0\n"
2987       "movi v3.8b, #0\n"
2988       "movi v4.8b, #0\n"
2989       "movi v5.8b, #0\n"
2990       "ld1 {v0.b}[0], [%x[in]], #1\n"  // Row 0, byte 0.
2991       "ld1 {v1.b}[0], [x0], #1\n"
2992       "ld1 {v2.b}[0], [x1], #1\n"
2993       "ld1 {v3.b}[0], [x2], #1\n"
2994       "ld1 {v4.b}[0], [x3], #1\n"
2995       "ld1 {v5.b}[0], [x4], #1\n"
2996       "uaddw v8.8h, v8.8h, v0.8b\n"
2997       "uaddw v9.8h, v9.8h, v1.8b\n"
2998       "uaddw v10.8h, v10.8h, v2.8b\n"
2999       "uaddw v11.8h, v11.8h, v3.8b\n"
3000       "uaddw v12.8h, v12.8h, v4.8b\n"
3001       "uaddw v13.8h, v13.8h, v5.8b\n"
3002       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"  // Still a full 48-byte group; the padding lanes are zero.
3003       "st1 {v4.2s, v5.2s}, [%x[out]], #16\n"
3004 
3005       // Aggregator Reduction.
3006       "mov v0.s[0], %w[multiplicative_sum_offset]\n"  // v0/v1 are reused as constants; their row data is already stored.
3007       "dup v1.4s, %w[additive_sum_offset]\n"
3008       "uaddlp v8.4s, v8.8h\n"  // Pairwise add: 8 x u16 -> 4 x u32.
3009       "uaddlp v9.4s, v9.8h\n"
3010       "uaddlp v10.4s, v10.8h\n"
3011       "uaddlp v11.4s, v11.8h\n"
3012       "uaddlp v12.4s, v12.8h\n"
3013       "uaddlp v13.4s, v13.8h\n"
3014       "addp v8.4s, v8.4s, v9.4s\n"
3015       "addp v10.4s, v10.4s, v11.4s\n"
3016       "addp v12.4s, v12.4s, v13.4s\n"
3017       "addp v8.4s, v8.4s, v10.4s\n"  // v8 = {sum0, sum1, sum2, sum3}.
3018       "addp v9.4s, v12.4s, v12.4s\n"  // v9 = {sum4, sum5, sum4, sum5}.
3019       "mul v8.4s, v8.4s, v0.s[0]\n"  // sum * multiplicative_sum_offset (32-bit lanes).
3020       "mul v9.4s, v9.4s, v0.s[0]\n"
3021       "add v8.4s, v8.4s, v1.4s\n"  // ... + additive_sum_offset.
3022       "add v9.4s, v9.4s, v1.4s\n"
3023       "st1 {v8.4s, v9.4s}, [%x[out]]\n"  // Eight int32 values written right after the packed bytes.
3024       : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)  // Read-write operands (modified by the asm).
3025       : [stride] "r"(params.stride),
3026         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
3027         [additive_sum_offset] "r"(params.additive_sum_offset)
3028       : "x0", "x1", "x2", "x3", "x4", "v0", "v1", "v2", "v3", "v4", "v5", "v8",  // Scratch registers used above.
3029         "v9", "v10", "v11", "v12", "v13", "cc", "memory");
3030 }
3031 
// Stream<uint8_t, 6, 8, 2, RowMajorWithSum>::Pack
//
// Copies 6 rows of bytes into `out`, 8 columns at a time, and appends one
// int32 per row derived from that row's byte sum.
//
//   in      First row. Rows 1..5 are read from in + k * params.stride.
//   params  Only count, stride, multiplicative_sum_offset and
//           additive_sum_offset are read here.
//   out     Receives 48-byte groups (8 bytes of row 0, then row 1, ... row 5)
//           for each full 8-column step, one more 48-byte group holding the
//           last 2 columns zero-padded to 8, and finally eight int32 values:
//           sum * multiplicative_sum_offset + additive_sum_offset, with rows
//           0..5 in lanes 0..5 and the row 4 / row 5 values repeated in
//           lanes 6 / 7.
//
// The 8-column loop stops only when the remaining count is exactly zero, so
// count is presumably required to satisfy count % 8 == 2 -- TODO confirm with
// the code that selects this specialization.
// NOTE(review): params_count_copy is an int, yet the asm addresses it through
// the 64-bit %x register view; confirm the upper 32 bits are always zero.
3032 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)3033 inline void Stream<uint8_t, 6, 8, 2, RowMajorWithSum>::Pack(
3034     const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
3035 #ifdef DEBUG
3036 #ifdef DEBUG_METAGEMM_VERBOSE
3037   std::cout << __FILE__ << "(" << __LINE__
3038             << ") RowMajorWithSum<uint8_t, 6, 8, 2, RowMajorWithSum>::Pack()"
3039             << std::endl
3040             << std::flush;
3041 #endif
3042 #endif
3043   int params_count_copy = params.count;  // Local copy: the asm decrements the count in place.
3044   asm volatile(
3045       "add x0, %x[in], %x[stride]\n"  // x0..x4: rows 1..5; row 0 is read through `in`.
3046       "add x1, x0, %x[stride]\n"
3047       "add x2, x1, %x[stride]\n"
3048       "add x3, x2, %x[stride]\n"
3049       "add x4, x3, %x[stride]\n"
3050       "movi v8.8h, #0\n"  // v8..v13: one accumulator per row, 8 x 16-bit lanes.
3051       "movi v9.8h, #0\n"
3052       "movi v10.8h, #0\n"
3053       "movi v11.8h, #0\n"
3054       "movi v12.8h, #0\n"
3055       "movi v13.8h, #0\n"
3056 
3057       // Reduce count by leftovers. If that leaves zero, skip the 8-wide loop.
3058       "subs %x[count], %x[count], #2\n"
3059       "beq 2f\n"
3060 
3061       "1:"
3062       "subs %x[count], %x[count], #8\n"  // Flags from this subs drive "bne 1b".
3063 
3064       // Load Aggregate Store: 6x8.
3065       "ld1 {v0.2s}, [%x[in]], #8\n"  // 8 bytes per row, pointers post-incremented.
3066       "ld1 {v1.2s}, [x0], #8\n"
3067       "ld1 {v2.2s}, [x1], #8\n"
3068       "ld1 {v3.2s}, [x2], #8\n"
3069       "ld1 {v4.2s}, [x3], #8\n"
3070       "ld1 {v5.2s}, [x4], #8\n"
3071       "uaddw v8.8h, v8.8h, v0.8b\n"  // Widen bytes and add into the row's 16-bit lanes.
3072       "uaddw v9.8h, v9.8h, v1.8b\n"
3073       "uaddw v10.8h, v10.8h, v2.8b\n"
3074       "uaddw v11.8h, v11.8h, v3.8b\n"
3075       "uaddw v12.8h, v12.8h, v4.8b\n"
3076       "uaddw v13.8h, v13.8h, v5.8b\n"
3077       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"  // 6 rows x 8 bytes = 48 bytes to out.
3078       "st1 {v4.2s, v5.2s}, [%x[out]], #16\n"
3079 
3080       "bne 1b\n"
3081 
3082       "2:"
3083 
3084       // Load Aggregate Store: 6x2.
3085       "movi v0.8b, #0\n"  // Zero first: lanes not loaded below act as padding.
3086       "movi v1.8b, #0\n"
3087       "movi v2.8b, #0\n"
3088       "movi v3.8b, #0\n"
3089       "movi v4.8b, #0\n"
3090       "movi v5.8b, #0\n"
3091       "ld1 {v0.h}[0], [%x[in]], #2\n"  // Row 0, bytes 0-1.
3092       "ld1 {v1.h}[0], [x0], #2\n"
3093       "ld1 {v2.h}[0], [x1], #2\n"
3094       "ld1 {v3.h}[0], [x2], #2\n"
3095       "ld1 {v4.h}[0], [x3], #2\n"
3096       "ld1 {v5.h}[0], [x4], #2\n"
3097       "uaddw v8.8h, v8.8h, v0.8b\n"
3098       "uaddw v9.8h, v9.8h, v1.8b\n"
3099       "uaddw v10.8h, v10.8h, v2.8b\n"
3100       "uaddw v11.8h, v11.8h, v3.8b\n"
3101       "uaddw v12.8h, v12.8h, v4.8b\n"
3102       "uaddw v13.8h, v13.8h, v5.8b\n"
3103       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"  // Still a full 48-byte group; the padding lanes are zero.
3104       "st1 {v4.2s, v5.2s}, [%x[out]], #16\n"
3105 
3106       // Aggregator Reduction.
3107       "mov v0.s[0], %w[multiplicative_sum_offset]\n"  // v0/v1 are reused as constants; their row data is already stored.
3108       "dup v1.4s, %w[additive_sum_offset]\n"
3109       "uaddlp v8.4s, v8.8h\n"  // Pairwise add: 8 x u16 -> 4 x u32.
3110       "uaddlp v9.4s, v9.8h\n"
3111       "uaddlp v10.4s, v10.8h\n"
3112       "uaddlp v11.4s, v11.8h\n"
3113       "uaddlp v12.4s, v12.8h\n"
3114       "uaddlp v13.4s, v13.8h\n"
3115       "addp v8.4s, v8.4s, v9.4s\n"
3116       "addp v10.4s, v10.4s, v11.4s\n"
3117       "addp v12.4s, v12.4s, v13.4s\n"
3118       "addp v8.4s, v8.4s, v10.4s\n"  // v8 = {sum0, sum1, sum2, sum3}.
3119       "addp v9.4s, v12.4s, v12.4s\n"  // v9 = {sum4, sum5, sum4, sum5}.
3120       "mul v8.4s, v8.4s, v0.s[0]\n"  // sum * multiplicative_sum_offset (32-bit lanes).
3121       "mul v9.4s, v9.4s, v0.s[0]\n"
3122       "add v8.4s, v8.4s, v1.4s\n"  // ... + additive_sum_offset.
3123       "add v9.4s, v9.4s, v1.4s\n"
3124       "st1 {v8.4s, v9.4s}, [%x[out]]\n"  // Eight int32 values written right after the packed bytes.
3125       : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)  // Read-write operands (modified by the asm).
3126       : [stride] "r"(params.stride),
3127         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
3128         [additive_sum_offset] "r"(params.additive_sum_offset)
3129       : "x0", "x1", "x2", "x3", "x4", "v0", "v1", "v2", "v3", "v4", "v5", "v8",  // Scratch registers used above.
3130         "v9", "v10", "v11", "v12", "v13", "cc", "memory");
3131 }
3132 
// Stream<uint8_t, 6, 8, 3, RowMajorWithSum>::Pack
//
// Copies 6 rows of bytes into `out`, 8 columns at a time, and appends one
// int32 per row derived from that row's byte sum.
//
//   in      First row. Rows 1..5 are read from in + k * params.stride.
//   params  Only count, stride, multiplicative_sum_offset and
//           additive_sum_offset are read here.
//   out     Receives 48-byte groups (8 bytes of row 0, then row 1, ... row 5)
//           for each full 8-column step, one more 48-byte group holding the
//           last 3 columns zero-padded to 8, and finally eight int32 values:
//           sum * multiplicative_sum_offset + additive_sum_offset, with rows
//           0..5 in lanes 0..5 and the row 4 / row 5 values repeated in
//           lanes 6 / 7.
//
// The 8-column loop stops only when the remaining count is exactly zero, so
// count is presumably required to satisfy count % 8 == 3 -- TODO confirm with
// the code that selects this specialization.
// NOTE(review): params_count_copy is an int, yet the asm addresses it through
// the 64-bit %x register view; confirm the upper 32 bits are always zero.
3133 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)3134 inline void Stream<uint8_t, 6, 8, 3, RowMajorWithSum>::Pack(
3135     const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
3136 #ifdef DEBUG
3137 #ifdef DEBUG_METAGEMM_VERBOSE
3138   std::cout << __FILE__ << "(" << __LINE__
3139             << ") RowMajorWithSum<uint8_t, 6, 8, 3, RowMajorWithSum>::Pack()"
3140             << std::endl
3141             << std::flush;
3142 #endif
3143 #endif
3144   int params_count_copy = params.count;  // Local copy: the asm decrements the count in place.
3145   asm volatile(
3146       "add x0, %x[in], %x[stride]\n"  // x0..x4: rows 1..5; row 0 is read through `in`.
3147       "add x1, x0, %x[stride]\n"
3148       "add x2, x1, %x[stride]\n"
3149       "add x3, x2, %x[stride]\n"
3150       "add x4, x3, %x[stride]\n"
3151       "movi v8.8h, #0\n"  // v8..v13: one accumulator per row, 8 x 16-bit lanes.
3152       "movi v9.8h, #0\n"
3153       "movi v10.8h, #0\n"
3154       "movi v11.8h, #0\n"
3155       "movi v12.8h, #0\n"
3156       "movi v13.8h, #0\n"
3157 
3158       // Reduce count by leftovers. If that leaves zero, skip the 8-wide loop.
3159       "subs %x[count], %x[count], #3\n"
3160       "beq 2f\n"
3161 
3162       "1:"
3163       "subs %x[count], %x[count], #8\n"  // Flags from this subs drive "bne 1b".
3164 
3165       // Load Aggregate Store: 6x8.
3166       "ld1 {v0.2s}, [%x[in]], #8\n"  // 8 bytes per row, pointers post-incremented.
3167       "ld1 {v1.2s}, [x0], #8\n"
3168       "ld1 {v2.2s}, [x1], #8\n"
3169       "ld1 {v3.2s}, [x2], #8\n"
3170       "ld1 {v4.2s}, [x3], #8\n"
3171       "ld1 {v5.2s}, [x4], #8\n"
3172       "uaddw v8.8h, v8.8h, v0.8b\n"  // Widen bytes and add into the row's 16-bit lanes.
3173       "uaddw v9.8h, v9.8h, v1.8b\n"
3174       "uaddw v10.8h, v10.8h, v2.8b\n"
3175       "uaddw v11.8h, v11.8h, v3.8b\n"
3176       "uaddw v12.8h, v12.8h, v4.8b\n"
3177       "uaddw v13.8h, v13.8h, v5.8b\n"
3178       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"  // 6 rows x 8 bytes = 48 bytes to out.
3179       "st1 {v4.2s, v5.2s}, [%x[out]], #16\n"
3180 
3181       "bne 1b\n"
3182 
3183       "2:"
3184 
3185       // Load Aggregate Store: 6x3.
3186       "movi v0.8b, #0\n"  // Zero first: lanes not loaded below act as padding.
3187       "movi v1.8b, #0\n"
3188       "movi v2.8b, #0\n"
3189       "movi v3.8b, #0\n"
3190       "movi v4.8b, #0\n"
3191       "movi v5.8b, #0\n"
3192       "ld1 {v0.h}[0], [%x[in]], #2\n"  // Row 0, bytes 0-1.
3193       "ld1 {v0.b}[2], [%x[in]], #1\n"  // Row 0, byte 2.
3194       "ld1 {v1.h}[0], [x0], #2\n"
3195       "ld1 {v1.b}[2], [x0], #1\n"
3196       "ld1 {v2.h}[0], [x1], #2\n"
3197       "ld1 {v2.b}[2], [x1], #1\n"
3198       "ld1 {v3.h}[0], [x2], #2\n"
3199       "ld1 {v3.b}[2], [x2], #1\n"
3200       "ld1 {v4.h}[0], [x3], #2\n"
3201       "ld1 {v4.b}[2], [x3], #1\n"
3202       "ld1 {v5.h}[0], [x4], #2\n"
3203       "ld1 {v5.b}[2], [x4], #1\n"
3204       "uaddw v8.8h, v8.8h, v0.8b\n"
3205       "uaddw v9.8h, v9.8h, v1.8b\n"
3206       "uaddw v10.8h, v10.8h, v2.8b\n"
3207       "uaddw v11.8h, v11.8h, v3.8b\n"
3208       "uaddw v12.8h, v12.8h, v4.8b\n"
3209       "uaddw v13.8h, v13.8h, v5.8b\n"
3210       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"  // Still a full 48-byte group; the padding lanes are zero.
3211       "st1 {v4.2s, v5.2s}, [%x[out]], #16\n"
3212 
3213       // Aggregator Reduction.
3214       "mov v0.s[0], %w[multiplicative_sum_offset]\n"  // v0/v1 are reused as constants; their row data is already stored.
3215       "dup v1.4s, %w[additive_sum_offset]\n"
3216       "uaddlp v8.4s, v8.8h\n"  // Pairwise add: 8 x u16 -> 4 x u32.
3217       "uaddlp v9.4s, v9.8h\n"
3218       "uaddlp v10.4s, v10.8h\n"
3219       "uaddlp v11.4s, v11.8h\n"
3220       "uaddlp v12.4s, v12.8h\n"
3221       "uaddlp v13.4s, v13.8h\n"
3222       "addp v8.4s, v8.4s, v9.4s\n"
3223       "addp v10.4s, v10.4s, v11.4s\n"
3224       "addp v12.4s, v12.4s, v13.4s\n"
3225       "addp v8.4s, v8.4s, v10.4s\n"  // v8 = {sum0, sum1, sum2, sum3}.
3226       "addp v9.4s, v12.4s, v12.4s\n"  // v9 = {sum4, sum5, sum4, sum5}.
3227       "mul v8.4s, v8.4s, v0.s[0]\n"  // sum * multiplicative_sum_offset (32-bit lanes).
3228       "mul v9.4s, v9.4s, v0.s[0]\n"
3229       "add v8.4s, v8.4s, v1.4s\n"  // ... + additive_sum_offset.
3230       "add v9.4s, v9.4s, v1.4s\n"
3231       "st1 {v8.4s, v9.4s}, [%x[out]]\n"  // Eight int32 values written right after the packed bytes.
3232       : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)  // Read-write operands (modified by the asm).
3233       : [stride] "r"(params.stride),
3234         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
3235         [additive_sum_offset] "r"(params.additive_sum_offset)
3236       : "x0", "x1", "x2", "x3", "x4", "v0", "v1", "v2", "v3", "v4", "v5", "v8",  // Scratch registers used above.
3237         "v9", "v10", "v11", "v12", "v13", "cc", "memory");
3238 }
3239 
// Stream<uint8_t, 6, 8, 4, RowMajorWithSum>::Pack
//
// Copies 6 rows of bytes into `out`, 8 columns at a time, and appends one
// int32 per row derived from that row's byte sum.
//
//   in      First row. Rows 1..5 are read from in + k * params.stride.
//   params  Only count, stride, multiplicative_sum_offset and
//           additive_sum_offset are read here.
//   out     Receives 48-byte groups (8 bytes of row 0, then row 1, ... row 5)
//           for each full 8-column step, one more 48-byte group holding the
//           last 4 columns zero-padded to 8, and finally eight int32 values:
//           sum * multiplicative_sum_offset + additive_sum_offset, with rows
//           0..5 in lanes 0..5 and the row 4 / row 5 values repeated in
//           lanes 6 / 7.
//
// The 8-column loop stops only when the remaining count is exactly zero, so
// count is presumably required to satisfy count % 8 == 4 -- TODO confirm with
// the code that selects this specialization.
// NOTE(review): params_count_copy is an int, yet the asm addresses it through
// the 64-bit %x register view; confirm the upper 32 bits are always zero.
3240 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)3241 inline void Stream<uint8_t, 6, 8, 4, RowMajorWithSum>::Pack(
3242     const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
3243 #ifdef DEBUG
3244 #ifdef DEBUG_METAGEMM_VERBOSE
3245   std::cout << __FILE__ << "(" << __LINE__
3246             << ") RowMajorWithSum<uint8_t, 6, 8, 4, RowMajorWithSum>::Pack()"
3247             << std::endl
3248             << std::flush;
3249 #endif
3250 #endif
3251   int params_count_copy = params.count;  // Local copy: the asm decrements the count in place.
3252   asm volatile(
3253       "add x0, %x[in], %x[stride]\n"  // x0..x4: rows 1..5; row 0 is read through `in`.
3254       "add x1, x0, %x[stride]\n"
3255       "add x2, x1, %x[stride]\n"
3256       "add x3, x2, %x[stride]\n"
3257       "add x4, x3, %x[stride]\n"
3258       "movi v8.8h, #0\n"  // v8..v13: one accumulator per row, 8 x 16-bit lanes.
3259       "movi v9.8h, #0\n"
3260       "movi v10.8h, #0\n"
3261       "movi v11.8h, #0\n"
3262       "movi v12.8h, #0\n"
3263       "movi v13.8h, #0\n"
3264 
3265       // Reduce count by leftovers. If that leaves zero, skip the 8-wide loop.
3266       "subs %x[count], %x[count], #4\n"
3267       "beq 2f\n"
3268 
3269       "1:"
3270       "subs %x[count], %x[count], #8\n"  // Flags from this subs drive "bne 1b".
3271 
3272       // Load Aggregate Store: 6x8.
3273       "ld1 {v0.2s}, [%x[in]], #8\n"  // 8 bytes per row, pointers post-incremented.
3274       "ld1 {v1.2s}, [x0], #8\n"
3275       "ld1 {v2.2s}, [x1], #8\n"
3276       "ld1 {v3.2s}, [x2], #8\n"
3277       "ld1 {v4.2s}, [x3], #8\n"
3278       "ld1 {v5.2s}, [x4], #8\n"
3279       "uaddw v8.8h, v8.8h, v0.8b\n"  // Widen bytes and add into the row's 16-bit lanes.
3280       "uaddw v9.8h, v9.8h, v1.8b\n"
3281       "uaddw v10.8h, v10.8h, v2.8b\n"
3282       "uaddw v11.8h, v11.8h, v3.8b\n"
3283       "uaddw v12.8h, v12.8h, v4.8b\n"
3284       "uaddw v13.8h, v13.8h, v5.8b\n"
3285       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"  // 6 rows x 8 bytes = 48 bytes to out.
3286       "st1 {v4.2s, v5.2s}, [%x[out]], #16\n"
3287 
3288       "bne 1b\n"
3289 
3290       "2:"
3291 
3292       // Load Aggregate Store: 6x4.
3293       "movi v0.8b, #0\n"  // Zero first: lanes not loaded below act as padding.
3294       "movi v1.8b, #0\n"
3295       "movi v2.8b, #0\n"
3296       "movi v3.8b, #0\n"
3297       "movi v4.8b, #0\n"
3298       "movi v5.8b, #0\n"
3299       "ld1 {v0.s}[0], [%x[in]], #4\n"  // Row 0, bytes 0-3.
3300       "ld1 {v1.s}[0], [x0], #4\n"
3301       "ld1 {v2.s}[0], [x1], #4\n"
3302       "ld1 {v3.s}[0], [x2], #4\n"
3303       "ld1 {v4.s}[0], [x3], #4\n"
3304       "ld1 {v5.s}[0], [x4], #4\n"
3305       "uaddw v8.8h, v8.8h, v0.8b\n"
3306       "uaddw v9.8h, v9.8h, v1.8b\n"
3307       "uaddw v10.8h, v10.8h, v2.8b\n"
3308       "uaddw v11.8h, v11.8h, v3.8b\n"
3309       "uaddw v12.8h, v12.8h, v4.8b\n"
3310       "uaddw v13.8h, v13.8h, v5.8b\n"
3311       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"  // Still a full 48-byte group; the padding lanes are zero.
3312       "st1 {v4.2s, v5.2s}, [%x[out]], #16\n"
3313 
3314       // Aggregator Reduction.
3315       "mov v0.s[0], %w[multiplicative_sum_offset]\n"  // v0/v1 are reused as constants; their row data is already stored.
3316       "dup v1.4s, %w[additive_sum_offset]\n"
3317       "uaddlp v8.4s, v8.8h\n"  // Pairwise add: 8 x u16 -> 4 x u32.
3318       "uaddlp v9.4s, v9.8h\n"
3319       "uaddlp v10.4s, v10.8h\n"
3320       "uaddlp v11.4s, v11.8h\n"
3321       "uaddlp v12.4s, v12.8h\n"
3322       "uaddlp v13.4s, v13.8h\n"
3323       "addp v8.4s, v8.4s, v9.4s\n"
3324       "addp v10.4s, v10.4s, v11.4s\n"
3325       "addp v12.4s, v12.4s, v13.4s\n"
3326       "addp v8.4s, v8.4s, v10.4s\n"  // v8 = {sum0, sum1, sum2, sum3}.
3327       "addp v9.4s, v12.4s, v12.4s\n"  // v9 = {sum4, sum5, sum4, sum5}.
3328       "mul v8.4s, v8.4s, v0.s[0]\n"  // sum * multiplicative_sum_offset (32-bit lanes).
3329       "mul v9.4s, v9.4s, v0.s[0]\n"
3330       "add v8.4s, v8.4s, v1.4s\n"  // ... + additive_sum_offset.
3331       "add v9.4s, v9.4s, v1.4s\n"
3332       "st1 {v8.4s, v9.4s}, [%x[out]]\n"  // Eight int32 values written right after the packed bytes.
3333       : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)  // Read-write operands (modified by the asm).
3334       : [stride] "r"(params.stride),
3335         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
3336         [additive_sum_offset] "r"(params.additive_sum_offset)
3337       : "x0", "x1", "x2", "x3", "x4", "v0", "v1", "v2", "v3", "v4", "v5", "v8",  // Scratch registers used above.
3338         "v9", "v10", "v11", "v12", "v13", "cc", "memory");
3339 }
3340 
// Stream<uint8_t, 6, 8, 5, RowMajorWithSum>::Pack
//
// Copies 6 rows of bytes into `out`, 8 columns at a time, and appends one
// int32 per row derived from that row's byte sum.
//
//   in      First row. Rows 1..5 are read from in + k * params.stride.
//   params  Only count, stride, multiplicative_sum_offset and
//           additive_sum_offset are read here.
//   out     Receives 48-byte groups (8 bytes of row 0, then row 1, ... row 5)
//           for each full 8-column step, one more 48-byte group holding the
//           last 5 columns zero-padded to 8, and finally eight int32 values:
//           sum * multiplicative_sum_offset + additive_sum_offset, with rows
//           0..5 in lanes 0..5 and the row 4 / row 5 values repeated in
//           lanes 6 / 7.
//
// The 8-column loop stops only when the remaining count is exactly zero, so
// count is presumably required to satisfy count % 8 == 5 -- TODO confirm with
// the code that selects this specialization.
// NOTE(review): params_count_copy is an int, yet the asm addresses it through
// the 64-bit %x register view; confirm the upper 32 bits are always zero.
3341 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)3342 inline void Stream<uint8_t, 6, 8, 5, RowMajorWithSum>::Pack(
3343     const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
3344 #ifdef DEBUG
3345 #ifdef DEBUG_METAGEMM_VERBOSE
3346   std::cout << __FILE__ << "(" << __LINE__
3347             << ") RowMajorWithSum<uint8_t, 6, 8, 5, RowMajorWithSum>::Pack()"
3348             << std::endl
3349             << std::flush;
3350 #endif
3351 #endif
3352   int params_count_copy = params.count;  // Local copy: the asm decrements the count in place.
3353   asm volatile(
3354       "add x0, %x[in], %x[stride]\n"  // x0..x4: rows 1..5; row 0 is read through `in`.
3355       "add x1, x0, %x[stride]\n"
3356       "add x2, x1, %x[stride]\n"
3357       "add x3, x2, %x[stride]\n"
3358       "add x4, x3, %x[stride]\n"
3359       "movi v8.8h, #0\n"  // v8..v13: one accumulator per row, 8 x 16-bit lanes.
3360       "movi v9.8h, #0\n"
3361       "movi v10.8h, #0\n"
3362       "movi v11.8h, #0\n"
3363       "movi v12.8h, #0\n"
3364       "movi v13.8h, #0\n"
3365 
3366       // Reduce count by leftovers. If that leaves zero, skip the 8-wide loop.
3367       "subs %x[count], %x[count], #5\n"
3368       "beq 2f\n"
3369 
3370       "1:"
3371       "subs %x[count], %x[count], #8\n"  // Flags from this subs drive "bne 1b".
3372 
3373       // Load Aggregate Store: 6x8.
3374       "ld1 {v0.2s}, [%x[in]], #8\n"  // 8 bytes per row, pointers post-incremented.
3375       "ld1 {v1.2s}, [x0], #8\n"
3376       "ld1 {v2.2s}, [x1], #8\n"
3377       "ld1 {v3.2s}, [x2], #8\n"
3378       "ld1 {v4.2s}, [x3], #8\n"
3379       "ld1 {v5.2s}, [x4], #8\n"
3380       "uaddw v8.8h, v8.8h, v0.8b\n"  // Widen bytes and add into the row's 16-bit lanes.
3381       "uaddw v9.8h, v9.8h, v1.8b\n"
3382       "uaddw v10.8h, v10.8h, v2.8b\n"
3383       "uaddw v11.8h, v11.8h, v3.8b\n"
3384       "uaddw v12.8h, v12.8h, v4.8b\n"
3385       "uaddw v13.8h, v13.8h, v5.8b\n"
3386       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"  // 6 rows x 8 bytes = 48 bytes to out.
3387       "st1 {v4.2s, v5.2s}, [%x[out]], #16\n"
3388 
3389       "bne 1b\n"
3390 
3391       "2:"
3392 
3393       // Load Aggregate Store: 6x5.
3394       "movi v0.8b, #0\n"  // Zero first: lanes not loaded below act as padding.
3395       "movi v1.8b, #0\n"
3396       "movi v2.8b, #0\n"
3397       "movi v3.8b, #0\n"
3398       "movi v4.8b, #0\n"
3399       "movi v5.8b, #0\n"
3400       "ld1 {v0.s}[0], [%x[in]], #4\n"  // Row 0, bytes 0-3.
3401       "ld1 {v0.b}[4], [%x[in]], #1\n"  // Row 0, byte 4.
3402       "ld1 {v1.s}[0], [x0], #4\n"
3403       "ld1 {v1.b}[4], [x0], #1\n"
3404       "ld1 {v2.s}[0], [x1], #4\n"
3405       "ld1 {v2.b}[4], [x1], #1\n"
3406       "ld1 {v3.s}[0], [x2], #4\n"
3407       "ld1 {v3.b}[4], [x2], #1\n"
3408       "ld1 {v4.s}[0], [x3], #4\n"
3409       "ld1 {v4.b}[4], [x3], #1\n"
3410       "ld1 {v5.s}[0], [x4], #4\n"
3411       "ld1 {v5.b}[4], [x4], #1\n"
3412       "uaddw v8.8h, v8.8h, v0.8b\n"
3413       "uaddw v9.8h, v9.8h, v1.8b\n"
3414       "uaddw v10.8h, v10.8h, v2.8b\n"
3415       "uaddw v11.8h, v11.8h, v3.8b\n"
3416       "uaddw v12.8h, v12.8h, v4.8b\n"
3417       "uaddw v13.8h, v13.8h, v5.8b\n"
3418       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"  // Still a full 48-byte group; the padding lanes are zero.
3419       "st1 {v4.2s, v5.2s}, [%x[out]], #16\n"
3420 
3421       // Aggregator Reduction.
3422       "mov v0.s[0], %w[multiplicative_sum_offset]\n"  // v0/v1 are reused as constants; their row data is already stored.
3423       "dup v1.4s, %w[additive_sum_offset]\n"
3424       "uaddlp v8.4s, v8.8h\n"  // Pairwise add: 8 x u16 -> 4 x u32.
3425       "uaddlp v9.4s, v9.8h\n"
3426       "uaddlp v10.4s, v10.8h\n"
3427       "uaddlp v11.4s, v11.8h\n"
3428       "uaddlp v12.4s, v12.8h\n"
3429       "uaddlp v13.4s, v13.8h\n"
3430       "addp v8.4s, v8.4s, v9.4s\n"
3431       "addp v10.4s, v10.4s, v11.4s\n"
3432       "addp v12.4s, v12.4s, v13.4s\n"
3433       "addp v8.4s, v8.4s, v10.4s\n"  // v8 = {sum0, sum1, sum2, sum3}.
3434       "addp v9.4s, v12.4s, v12.4s\n"  // v9 = {sum4, sum5, sum4, sum5}.
3435       "mul v8.4s, v8.4s, v0.s[0]\n"  // sum * multiplicative_sum_offset (32-bit lanes).
3436       "mul v9.4s, v9.4s, v0.s[0]\n"
3437       "add v8.4s, v8.4s, v1.4s\n"  // ... + additive_sum_offset.
3438       "add v9.4s, v9.4s, v1.4s\n"
3439       "st1 {v8.4s, v9.4s}, [%x[out]]\n"  // Eight int32 values written right after the packed bytes.
3440       : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)  // Read-write operands (modified by the asm).
3441       : [stride] "r"(params.stride),
3442         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
3443         [additive_sum_offset] "r"(params.additive_sum_offset)
3444       : "x0", "x1", "x2", "x3", "x4", "v0", "v1", "v2", "v3", "v4", "v5", "v8",  // Scratch registers used above.
3445         "v9", "v10", "v11", "v12", "v13", "cc", "memory");
3446 }
3447 
// Pack specialization: 6 rows, 8 columns per step, 6 leftover columns.
//
// Reads 6 input rows located at in, in + stride, ..., in + 5 * stride (the
// stride is added to the raw address, i.e. it is in bytes) and writes them to
// `out` in chunks of 6 x 8 bytes: 8 bytes of row 0, then 8 bytes of row 1, and
// so on. The final chunk holds only 6 real columns per row; the remaining two
// bytes of each row slot are zero. While copying, every row is summed; after
// the data, eight 32-bit values are stored at `out`:
//   sum(row) * params.multiplicative_sum_offset + params.additive_sum_offset
// for rows 0..5, followed by a repeat of the values for rows 4 and 5.
//
// Presumably the caller guarantees params.count % 8 == 6 (TODO confirm): the
// main loop only terminates when the remaining count hits exactly zero.
3448 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)3449 inline void Stream<uint8_t, 6, 8, 6, RowMajorWithSum>::Pack(
3450     const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
// Optional trace, compiled only when both DEBUG and DEBUG_METAGEMM_VERBOSE
// are defined. NOTE(review): the text says "RowMajorWithSum<...>" although
// this is Stream<...>::Pack.
3451 #ifdef DEBUG
3452 #ifdef DEBUG_METAGEMM_VERBOSE
3453   std::cout << __FILE__ << "(" << __LINE__
3454             << ") RowMajorWithSum<uint8_t, 6, 8, 6, RowMajorWithSum>::Pack()"
3455             << std::endl
3456             << std::flush;
3457 #endif
3458 #endif
  // Writable copy of the column count: the asm decrements it in place.
  // NOTE(review): this is a 32-bit int but the asm refers to it as %x[count]
  // (the 64-bit register name) -- confirm the upper 32 bits are guaranteed to
  // be clean, or whether %w[count] was intended.
3459   int params_count_copy = params.count;
3460   asm volatile(
      // x0..x4 = start addresses of rows 1..5 (row 0 is %x[in] itself).
3461       "add x0, %x[in], %x[stride]\n"
3462       "add x1, x0, %x[stride]\n"
3463       "add x2, x1, %x[stride]\n"
3464       "add x3, x2, %x[stride]\n"
3465       "add x4, x3, %x[stride]\n"
      // v8..v13 = per-row running sums, eight 16-bit lanes each.
      // NOTE(review): each lane gains one byte per 8-column step, so lanes can
      // wrap on very long rows -- presumably callers bound count; confirm.
3466       "movi v8.8h, #0\n"
3467       "movi v9.8h, #0\n"
3468       "movi v10.8h, #0\n"
3469       "movi v11.8h, #0\n"
3470       "movi v12.8h, #0\n"
3471       "movi v13.8h, #0\n"
3472 
3473       // Reduce count by leftovers.
      // If nothing but the 6 leftover columns exists, skip the main loop.
3474       "subs %x[count], %x[count], #6\n"
3475       "beq 2f\n"
3476 
      // Main loop: one 6x8 tile per iteration. The flags set by this subs are
      // consumed by the "bne 1b" at the bottom (nothing in between sets flags).
3477       "1:"
3478       "subs %x[count], %x[count], #8\n"
3479 
3480       // Load Aggregate Store: 6x8.
      // Load 8 bytes per row (post-incrementing each row pointer), widen-add
      // them into the row accumulators, then store the 48 bytes row after row.
3481       "ld1 {v0.2s}, [%x[in]], #8\n"
3482       "ld1 {v1.2s}, [x0], #8\n"
3483       "ld1 {v2.2s}, [x1], #8\n"
3484       "ld1 {v3.2s}, [x2], #8\n"
3485       "ld1 {v4.2s}, [x3], #8\n"
3486       "ld1 {v5.2s}, [x4], #8\n"
3487       "uaddw v8.8h, v8.8h, v0.8b\n"
3488       "uaddw v9.8h, v9.8h, v1.8b\n"
3489       "uaddw v10.8h, v10.8h, v2.8b\n"
3490       "uaddw v11.8h, v11.8h, v3.8b\n"
3491       "uaddw v12.8h, v12.8h, v4.8b\n"
3492       "uaddw v13.8h, v13.8h, v5.8b\n"
3493       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
3494       "st1 {v4.2s, v5.2s}, [%x[out]], #16\n"
3495 
3496       "bne 1b\n"
3497 
3498       "2:"
3499 
3500       // Load Aggregate Store: 6x6.
      // Zero the staging registers first so the two bytes that are not loaded
      // are stored as zero and add nothing to the sums. Each row then reads
      // exactly 6 bytes: 4 into bytes 0-3 and 2 into bytes 4-5.
3501       "movi v0.8b, #0\n"
3502       "movi v1.8b, #0\n"
3503       "movi v2.8b, #0\n"
3504       "movi v3.8b, #0\n"
3505       "movi v4.8b, #0\n"
3506       "movi v5.8b, #0\n"
3507       "ld1 {v0.s}[0], [%x[in]], #4\n"
3508       "ld1 {v0.h}[2], [%x[in]], #2\n"
3509       "ld1 {v1.s}[0], [x0], #4\n"
3510       "ld1 {v1.h}[2], [x0], #2\n"
3511       "ld1 {v2.s}[0], [x1], #4\n"
3512       "ld1 {v2.h}[2], [x1], #2\n"
3513       "ld1 {v3.s}[0], [x2], #4\n"
3514       "ld1 {v3.h}[2], [x2], #2\n"
3515       "ld1 {v4.s}[0], [x3], #4\n"
3516       "ld1 {v4.h}[2], [x3], #2\n"
3517       "ld1 {v5.s}[0], [x4], #4\n"
3518       "ld1 {v5.h}[2], [x4], #2\n"
3519       "uaddw v8.8h, v8.8h, v0.8b\n"
3520       "uaddw v9.8h, v9.8h, v1.8b\n"
3521       "uaddw v10.8h, v10.8h, v2.8b\n"
3522       "uaddw v11.8h, v11.8h, v3.8b\n"
3523       "uaddw v12.8h, v12.8h, v4.8b\n"
3524       "uaddw v13.8h, v13.8h, v5.8b\n"
3525       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
3526       "st1 {v4.2s, v5.2s}, [%x[out]], #16\n"
3527 
3528       // Aggregator Reduction.
      // v0 / v1 are free again: v0.s[0] = multiplier, v1 = additive offset in
      // all four lanes. uaddlp folds 8x16-bit into 4x32-bit per row; the addp
      // cascade leaves v8 = sums of rows 0,1,2,3 and v9 = sums of rows
      // 4,5,4,5. Both are scaled, offset and stored (32 bytes) at out.
3529       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
3530       "dup v1.4s, %w[additive_sum_offset]\n"
3531       "uaddlp v8.4s, v8.8h\n"
3532       "uaddlp v9.4s, v9.8h\n"
3533       "uaddlp v10.4s, v10.8h\n"
3534       "uaddlp v11.4s, v11.8h\n"
3535       "uaddlp v12.4s, v12.8h\n"
3536       "uaddlp v13.4s, v13.8h\n"
3537       "addp v8.4s, v8.4s, v9.4s\n"
3538       "addp v10.4s, v10.4s, v11.4s\n"
3539       "addp v12.4s, v12.4s, v13.4s\n"
3540       "addp v8.4s, v8.4s, v10.4s\n"
3541       "addp v9.4s, v12.4s, v12.4s\n"
3542       "mul v8.4s, v8.4s, v0.s[0]\n"
3543       "mul v9.4s, v9.4s, v0.s[0]\n"
3544       "add v8.4s, v8.4s, v1.4s\n"
3545       "add v9.4s, v9.4s, v1.4s\n"
3546       "st1 {v8.4s, v9.4s}, [%x[out]]\n"
      // Read-write operands: all three are modified by the asm.
3547       : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
      // Read-only operands, each passed in a general-purpose register.
3548       : [stride] "r"(params.stride),
3549         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
3550         [additive_sum_offset] "r"(params.additive_sum_offset)
      // Clobbers: every scratch register named above, the condition flags
      // (subs) and memory (the stores through out).
3551       : "x0", "x1", "x2", "x3", "x4", "v0", "v1", "v2", "v3", "v4", "v5", "v8",
3552         "v9", "v10", "v11", "v12", "v13", "cc", "memory");
3553 }
3554 
// Pack specialization: 6 rows, 8 columns per step, 7 leftover columns.
//
// Reads 6 input rows located at in, in + stride, ..., in + 5 * stride (the
// stride is added to the raw address, i.e. it is in bytes) and writes them to
// `out` in chunks of 6 x 8 bytes: 8 bytes of row 0, then 8 bytes of row 1, and
// so on. The final chunk holds only 7 real columns per row; the last byte of
// each row slot is zero. While copying, every row is summed; after the data,
// eight 32-bit values are stored at `out`:
//   sum(row) * params.multiplicative_sum_offset + params.additive_sum_offset
// for rows 0..5, followed by a repeat of the values for rows 4 and 5.
//
// Presumably the caller guarantees params.count % 8 == 7 (TODO confirm): the
// main loop only terminates when the remaining count hits exactly zero.
3555 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)3556 inline void Stream<uint8_t, 6, 8, 7, RowMajorWithSum>::Pack(
3557     const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
// Optional trace, compiled only when both DEBUG and DEBUG_METAGEMM_VERBOSE
// are defined. NOTE(review): the text says "RowMajorWithSum<...>" although
// this is Stream<...>::Pack.
3558 #ifdef DEBUG
3559 #ifdef DEBUG_METAGEMM_VERBOSE
3560   std::cout << __FILE__ << "(" << __LINE__
3561             << ") RowMajorWithSum<uint8_t, 6, 8, 7, RowMajorWithSum>::Pack()"
3562             << std::endl
3563             << std::flush;
3564 #endif
3565 #endif
  // Writable copy of the column count: the asm decrements it in place.
  // NOTE(review): this is a 32-bit int but the asm refers to it as %x[count]
  // (the 64-bit register name) -- confirm the upper 32 bits are guaranteed to
  // be clean, or whether %w[count] was intended.
3566   int params_count_copy = params.count;
3567   asm volatile(
      // x0..x4 = start addresses of rows 1..5 (row 0 is %x[in] itself).
3568       "add x0, %x[in], %x[stride]\n"
3569       "add x1, x0, %x[stride]\n"
3570       "add x2, x1, %x[stride]\n"
3571       "add x3, x2, %x[stride]\n"
3572       "add x4, x3, %x[stride]\n"
      // v8..v13 = per-row running sums, eight 16-bit lanes each.
      // NOTE(review): each lane gains one byte per 8-column step, so lanes can
      // wrap on very long rows -- presumably callers bound count; confirm.
3573       "movi v8.8h, #0\n"
3574       "movi v9.8h, #0\n"
3575       "movi v10.8h, #0\n"
3576       "movi v11.8h, #0\n"
3577       "movi v12.8h, #0\n"
3578       "movi v13.8h, #0\n"
3579 
3580       // Reduce count by leftovers.
      // If nothing but the 7 leftover columns exists, skip the main loop.
3581       "subs %x[count], %x[count], #7\n"
3582       "beq 2f\n"
3583 
      // Main loop: one 6x8 tile per iteration. The flags set by this subs are
      // consumed by the "bne 1b" at the bottom (nothing in between sets flags).
3584       "1:"
3585       "subs %x[count], %x[count], #8\n"
3586 
3587       // Load Aggregate Store: 6x8.
      // Load 8 bytes per row (post-incrementing each row pointer), widen-add
      // them into the row accumulators, then store the 48 bytes row after row.
3588       "ld1 {v0.2s}, [%x[in]], #8\n"
3589       "ld1 {v1.2s}, [x0], #8\n"
3590       "ld1 {v2.2s}, [x1], #8\n"
3591       "ld1 {v3.2s}, [x2], #8\n"
3592       "ld1 {v4.2s}, [x3], #8\n"
3593       "ld1 {v5.2s}, [x4], #8\n"
3594       "uaddw v8.8h, v8.8h, v0.8b\n"
3595       "uaddw v9.8h, v9.8h, v1.8b\n"
3596       "uaddw v10.8h, v10.8h, v2.8b\n"
3597       "uaddw v11.8h, v11.8h, v3.8b\n"
3598       "uaddw v12.8h, v12.8h, v4.8b\n"
3599       "uaddw v13.8h, v13.8h, v5.8b\n"
3600       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
3601       "st1 {v4.2s, v5.2s}, [%x[out]], #16\n"
3602 
3603       "bne 1b\n"
3604 
3605       "2:"
3606 
3607       // Load Aggregate Store: 6x7.
      // Zero the staging registers first so the byte that is not loaded is
      // stored as zero and adds nothing to the sums. Each row then reads
      // exactly 7 bytes: 4 into bytes 0-3, 2 into bytes 4-5, 1 into byte 6.
3608       "movi v0.8b, #0\n"
3609       "movi v1.8b, #0\n"
3610       "movi v2.8b, #0\n"
3611       "movi v3.8b, #0\n"
3612       "movi v4.8b, #0\n"
3613       "movi v5.8b, #0\n"
3614       "ld1 {v0.s}[0], [%x[in]], #4\n"
3615       "ld1 {v0.h}[2], [%x[in]], #2\n"
3616       "ld1 {v0.b}[6], [%x[in]], #1\n"
3617       "ld1 {v1.s}[0], [x0], #4\n"
3618       "ld1 {v1.h}[2], [x0], #2\n"
3619       "ld1 {v1.b}[6], [x0], #1\n"
3620       "ld1 {v2.s}[0], [x1], #4\n"
3621       "ld1 {v2.h}[2], [x1], #2\n"
3622       "ld1 {v2.b}[6], [x1], #1\n"
3623       "ld1 {v3.s}[0], [x2], #4\n"
3624       "ld1 {v3.h}[2], [x2], #2\n"
3625       "ld1 {v3.b}[6], [x2], #1\n"
3626       "ld1 {v4.s}[0], [x3], #4\n"
3627       "ld1 {v4.h}[2], [x3], #2\n"
3628       "ld1 {v4.b}[6], [x3], #1\n"
3629       "ld1 {v5.s}[0], [x4], #4\n"
3630       "ld1 {v5.h}[2], [x4], #2\n"
3631       "ld1 {v5.b}[6], [x4], #1\n"
3632       "uaddw v8.8h, v8.8h, v0.8b\n"
3633       "uaddw v9.8h, v9.8h, v1.8b\n"
3634       "uaddw v10.8h, v10.8h, v2.8b\n"
3635       "uaddw v11.8h, v11.8h, v3.8b\n"
3636       "uaddw v12.8h, v12.8h, v4.8b\n"
3637       "uaddw v13.8h, v13.8h, v5.8b\n"
3638       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
3639       "st1 {v4.2s, v5.2s}, [%x[out]], #16\n"
3640 
3641       // Aggregator Reduction.
      // v0 / v1 are free again: v0.s[0] = multiplier, v1 = additive offset in
      // all four lanes. uaddlp folds 8x16-bit into 4x32-bit per row; the addp
      // cascade leaves v8 = sums of rows 0,1,2,3 and v9 = sums of rows
      // 4,5,4,5. Both are scaled, offset and stored (32 bytes) at out.
3642       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
3643       "dup v1.4s, %w[additive_sum_offset]\n"
3644       "uaddlp v8.4s, v8.8h\n"
3645       "uaddlp v9.4s, v9.8h\n"
3646       "uaddlp v10.4s, v10.8h\n"
3647       "uaddlp v11.4s, v11.8h\n"
3648       "uaddlp v12.4s, v12.8h\n"
3649       "uaddlp v13.4s, v13.8h\n"
3650       "addp v8.4s, v8.4s, v9.4s\n"
3651       "addp v10.4s, v10.4s, v11.4s\n"
3652       "addp v12.4s, v12.4s, v13.4s\n"
3653       "addp v8.4s, v8.4s, v10.4s\n"
3654       "addp v9.4s, v12.4s, v12.4s\n"
3655       "mul v8.4s, v8.4s, v0.s[0]\n"
3656       "mul v9.4s, v9.4s, v0.s[0]\n"
3657       "add v8.4s, v8.4s, v1.4s\n"
3658       "add v9.4s, v9.4s, v1.4s\n"
3659       "st1 {v8.4s, v9.4s}, [%x[out]]\n"
      // Read-write operands: all three are modified by the asm.
3660       : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
      // Read-only operands, each passed in a general-purpose register.
3661       : [stride] "r"(params.stride),
3662         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
3663         [additive_sum_offset] "r"(params.additive_sum_offset)
      // Clobbers: every scratch register named above, the condition flags
      // (subs) and memory (the stores through out).
3664       : "x0", "x1", "x2", "x3", "x4", "v0", "v1", "v2", "v3", "v4", "v5", "v8",
3665         "v9", "v10", "v11", "v12", "v13", "cc", "memory");
3666 }
3667 
// Pack specialization: 7 rows, 8 columns per step, no leftover columns.
//
// Reads 7 input rows located at in, in + stride, ..., in + 6 * stride (the
// stride is added to the raw address, i.e. it is in bytes) and writes them to
// `out` in chunks of 7 x 8 bytes: 8 bytes of row 0, then 8 bytes of row 1, and
// so on. While copying, every row is summed; after the data, eight 32-bit
// values are stored at `out`:
//   sum(row) * params.multiplicative_sum_offset + params.additive_sum_offset
// for rows 0..6, followed by a repeat of the value for row 6.
//
// The loop body always runs at least once and only terminates when the
// remaining count hits exactly zero, so presumably the caller guarantees
// params.count is a positive multiple of 8 (TODO confirm).
3668 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)3669 inline void Stream<uint8_t, 7, 8, 0, RowMajorWithSum>::Pack(
3670     const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
// Optional trace, compiled only when both DEBUG and DEBUG_METAGEMM_VERBOSE
// are defined. NOTE(review): the text says "RowMajorWithSum<...>" although
// this is Stream<...>::Pack.
3671 #ifdef DEBUG
3672 #ifdef DEBUG_METAGEMM_VERBOSE
3673   std::cout << __FILE__ << "(" << __LINE__
3674             << ") RowMajorWithSum<uint8_t, 7, 8, 0, RowMajorWithSum>::Pack()"
3675             << std::endl
3676             << std::flush;
3677 #endif
3678 #endif
  // Writable copy of the column count: the asm decrements it in place.
  // NOTE(review): this is a 32-bit int but the asm refers to it as %x[count]
  // (the 64-bit register name) -- confirm the upper 32 bits are guaranteed to
  // be clean, or whether %w[count] was intended.
3679   int params_count_copy = params.count;
3680   asm volatile(
      // x0..x5 = start addresses of rows 1..6 (row 0 is %x[in] itself).
3681       "add x0, %x[in], %x[stride]\n"
3682       "add x1, x0, %x[stride]\n"
3683       "add x2, x1, %x[stride]\n"
3684       "add x3, x2, %x[stride]\n"
3685       "add x4, x3, %x[stride]\n"
3686       "add x5, x4, %x[stride]\n"
      // v8..v14 = per-row running sums, eight 16-bit lanes each.
      // NOTE(review): each lane gains one byte per 8-column step, so lanes can
      // wrap on very long rows -- presumably callers bound count; confirm.
3687       "movi v8.8h, #0\n"
3688       "movi v9.8h, #0\n"
3689       "movi v10.8h, #0\n"
3690       "movi v11.8h, #0\n"
3691       "movi v12.8h, #0\n"
3692       "movi v13.8h, #0\n"
3693       "movi v14.8h, #0\n"
3694 
      // Main loop: one 7x8 tile per iteration. The flags set by this subs are
      // consumed by the "bne 1b" at the bottom (nothing in between sets flags).
3695       "1:"
3696       "subs %x[count], %x[count], #8\n"
3697 
3698       // Load Aggregate Store: 7x8.
      // Load 8 bytes per row (post-incrementing each row pointer), widen-add
      // them into the row accumulators, then store the 56 bytes row after row.
3699       "ld1 {v0.2s}, [%x[in]], #8\n"
3700       "ld1 {v1.2s}, [x0], #8\n"
3701       "ld1 {v2.2s}, [x1], #8\n"
3702       "ld1 {v3.2s}, [x2], #8\n"
3703       "ld1 {v4.2s}, [x3], #8\n"
3704       "ld1 {v5.2s}, [x4], #8\n"
3705       "ld1 {v6.2s}, [x5], #8\n"
3706       "uaddw v8.8h, v8.8h, v0.8b\n"
3707       "uaddw v9.8h, v9.8h, v1.8b\n"
3708       "uaddw v10.8h, v10.8h, v2.8b\n"
3709       "uaddw v11.8h, v11.8h, v3.8b\n"
3710       "uaddw v12.8h, v12.8h, v4.8b\n"
3711       "uaddw v13.8h, v13.8h, v5.8b\n"
3712       "uaddw v14.8h, v14.8h, v6.8b\n"
3713       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
3714       "st1 {v4.2s, v5.2s, v6.2s}, [%x[out]], #24\n"
3715 
3716       "bne 1b\n"
3717 
3718       // Aggregator Reduction.
      // The two offsets are memory operands here; they are loaded into w0/w1,
      // which is safe because the row pointers x0/x1 are not read after this
      // point. v0.s[0] = multiplier, v1 = additive offset in all four lanes.
      // uaddlp folds 8x16-bit into 4x32-bit per row; the addp cascade leaves
      // v8 = sums of rows 0,1,2,3 and v9 = sums of rows 4,5,6,6. Both are
      // scaled, offset and stored (32 bytes) at out.
3719       "ldr w0, %[multiplicative_sum_offset]\n"
3720       "ldr w1, %[additive_sum_offset]\n"
3721       "mov v0.s[0], w0\n"
3722       "dup v1.4s, w1\n"
3723       "uaddlp v8.4s, v8.8h\n"
3724       "uaddlp v9.4s, v9.8h\n"
3725       "uaddlp v10.4s, v10.8h\n"
3726       "uaddlp v11.4s, v11.8h\n"
3727       "uaddlp v12.4s, v12.8h\n"
3728       "uaddlp v13.4s, v13.8h\n"
3729       "uaddlp v14.4s, v14.8h\n"
3730       "addp v8.4s, v8.4s, v9.4s\n"
3731       "addp v10.4s, v10.4s, v11.4s\n"
3732       "addp v12.4s, v12.4s, v13.4s\n"
3733       "addp v14.4s, v14.4s, v14.4s\n"
3734       "addp v8.4s, v8.4s, v10.4s\n"
3735       "addp v9.4s, v12.4s, v14.4s\n"
3736       "mul v8.4s, v8.4s, v0.s[0]\n"
3737       "mul v9.4s, v9.4s, v0.s[0]\n"
3738       "add v8.4s, v8.4s, v1.4s\n"
3739       "add v9.4s, v9.4s, v1.4s\n"
3740       "st1 {v8.4s, v9.4s}, [%x[out]]\n"
      // Read-write operands: all three are modified by the asm.
3741       : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
      // Read-only operands: stride in a register, the two offsets in memory
      // ("m") -- presumably to save general-purpose registers; confirm.
3742       : [stride] "r"(params.stride),
3743         [multiplicative_sum_offset] "m"(params.multiplicative_sum_offset),
3744         [additive_sum_offset] "m"(params.additive_sum_offset)
      // Clobbers: every scratch register named above, the condition flags
      // (subs) and memory (the stores through out).
3745       : "x0", "x1", "x2", "x3", "x4", "x5", "v0", "v1", "v2", "v3", "v4", "v5",
3746         "v6", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "cc", "memory");
3747 }
3748 
// Pack specialization: 7 rows, 8 columns per step, 1 leftover column.
//
// Reads 7 input rows located at in, in + stride, ..., in + 6 * stride (the
// stride is added to the raw address, i.e. it is in bytes) and writes them to
// `out` in chunks of 7 x 8 bytes: 8 bytes of row 0, then 8 bytes of row 1, and
// so on. The final chunk holds only 1 real column per row; the remaining seven
// bytes of each row slot are zero. While copying, every row is summed; after
// the data, eight 32-bit values are stored at `out`:
//   sum(row) * params.multiplicative_sum_offset + params.additive_sum_offset
// for rows 0..6, followed by a repeat of the value for row 6.
//
// Presumably the caller guarantees params.count % 8 == 1 (TODO confirm): the
// main loop only terminates when the remaining count hits exactly zero.
3749 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)3750 inline void Stream<uint8_t, 7, 8, 1, RowMajorWithSum>::Pack(
3751     const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
// Optional trace, compiled only when both DEBUG and DEBUG_METAGEMM_VERBOSE
// are defined. NOTE(review): the text says "RowMajorWithSum<...>" although
// this is Stream<...>::Pack.
3752 #ifdef DEBUG
3753 #ifdef DEBUG_METAGEMM_VERBOSE
3754   std::cout << __FILE__ << "(" << __LINE__
3755             << ") RowMajorWithSum<uint8_t, 7, 8, 1, RowMajorWithSum>::Pack()"
3756             << std::endl
3757             << std::flush;
3758 #endif
3759 #endif
  // Writable copy of the column count: the asm decrements it in place.
  // NOTE(review): this is a 32-bit int but the asm refers to it as %x[count]
  // (the 64-bit register name) -- confirm the upper 32 bits are guaranteed to
  // be clean, or whether %w[count] was intended.
3760   int params_count_copy = params.count;
3761   asm volatile(
      // x0..x5 = start addresses of rows 1..6 (row 0 is %x[in] itself).
3762       "add x0, %x[in], %x[stride]\n"
3763       "add x1, x0, %x[stride]\n"
3764       "add x2, x1, %x[stride]\n"
3765       "add x3, x2, %x[stride]\n"
3766       "add x4, x3, %x[stride]\n"
3767       "add x5, x4, %x[stride]\n"
      // v8..v14 = per-row running sums, eight 16-bit lanes each.
      // NOTE(review): each lane gains one byte per 8-column step, so lanes can
      // wrap on very long rows -- presumably callers bound count; confirm.
3768       "movi v8.8h, #0\n"
3769       "movi v9.8h, #0\n"
3770       "movi v10.8h, #0\n"
3771       "movi v11.8h, #0\n"
3772       "movi v12.8h, #0\n"
3773       "movi v13.8h, #0\n"
3774       "movi v14.8h, #0\n"
3775 
3776       // Reduce count by leftovers.
      // If nothing but the 1 leftover column exists, skip the main loop.
3777       "subs %x[count], %x[count], #1\n"
3778       "beq 2f\n"
3779 
      // Main loop: one 7x8 tile per iteration. The flags set by this subs are
      // consumed by the "bne 1b" at the bottom (nothing in between sets flags).
3780       "1:"
3781       "subs %x[count], %x[count], #8\n"
3782 
3783       // Load Aggregate Store: 7x8.
      // Load 8 bytes per row (post-incrementing each row pointer), widen-add
      // them into the row accumulators, then store the 56 bytes row after row.
3784       "ld1 {v0.2s}, [%x[in]], #8\n"
3785       "ld1 {v1.2s}, [x0], #8\n"
3786       "ld1 {v2.2s}, [x1], #8\n"
3787       "ld1 {v3.2s}, [x2], #8\n"
3788       "ld1 {v4.2s}, [x3], #8\n"
3789       "ld1 {v5.2s}, [x4], #8\n"
3790       "ld1 {v6.2s}, [x5], #8\n"
3791       "uaddw v8.8h, v8.8h, v0.8b\n"
3792       "uaddw v9.8h, v9.8h, v1.8b\n"
3793       "uaddw v10.8h, v10.8h, v2.8b\n"
3794       "uaddw v11.8h, v11.8h, v3.8b\n"
3795       "uaddw v12.8h, v12.8h, v4.8b\n"
3796       "uaddw v13.8h, v13.8h, v5.8b\n"
3797       "uaddw v14.8h, v14.8h, v6.8b\n"
3798       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
3799       "st1 {v4.2s, v5.2s, v6.2s}, [%x[out]], #24\n"
3800 
3801       "bne 1b\n"
3802 
3803       "2:"
3804 
3805       // Load Aggregate Store: 7x1.
      // Zero the staging registers first so the bytes that are not loaded are
      // stored as zero and add nothing to the sums. Each row then reads
      // exactly 1 byte into byte 0.
3806       "movi v0.8b, #0\n"
3807       "movi v1.8b, #0\n"
3808       "movi v2.8b, #0\n"
3809       "movi v3.8b, #0\n"
3810       "movi v4.8b, #0\n"
3811       "movi v5.8b, #0\n"
3812       "movi v6.8b, #0\n"
3813       "ld1 {v0.b}[0], [%x[in]], #1\n"
3814       "ld1 {v1.b}[0], [x0], #1\n"
3815       "ld1 {v2.b}[0], [x1], #1\n"
3816       "ld1 {v3.b}[0], [x2], #1\n"
3817       "ld1 {v4.b}[0], [x3], #1\n"
3818       "ld1 {v5.b}[0], [x4], #1\n"
3819       "ld1 {v6.b}[0], [x5], #1\n"
3820       "uaddw v8.8h, v8.8h, v0.8b\n"
3821       "uaddw v9.8h, v9.8h, v1.8b\n"
3822       "uaddw v10.8h, v10.8h, v2.8b\n"
3823       "uaddw v11.8h, v11.8h, v3.8b\n"
3824       "uaddw v12.8h, v12.8h, v4.8b\n"
3825       "uaddw v13.8h, v13.8h, v5.8b\n"
3826       "uaddw v14.8h, v14.8h, v6.8b\n"
3827       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
3828       "st1 {v4.2s, v5.2s, v6.2s}, [%x[out]], #24\n"
3829 
3830       // Aggregator Reduction.
      // The two offsets are memory operands here; they are loaded into w0/w1,
      // which is safe because the row pointers x0/x1 are not read after this
      // point. v0.s[0] = multiplier, v1 = additive offset in all four lanes.
      // uaddlp folds 8x16-bit into 4x32-bit per row; the addp cascade leaves
      // v8 = sums of rows 0,1,2,3 and v9 = sums of rows 4,5,6,6. Both are
      // scaled, offset and stored (32 bytes) at out.
3831       "ldr w0, %[multiplicative_sum_offset]\n"
3832       "ldr w1, %[additive_sum_offset]\n"
3833       "mov v0.s[0], w0\n"
3834       "dup v1.4s, w1\n"
3835       "uaddlp v8.4s, v8.8h\n"
3836       "uaddlp v9.4s, v9.8h\n"
3837       "uaddlp v10.4s, v10.8h\n"
3838       "uaddlp v11.4s, v11.8h\n"
3839       "uaddlp v12.4s, v12.8h\n"
3840       "uaddlp v13.4s, v13.8h\n"
3841       "uaddlp v14.4s, v14.8h\n"
3842       "addp v8.4s, v8.4s, v9.4s\n"
3843       "addp v10.4s, v10.4s, v11.4s\n"
3844       "addp v12.4s, v12.4s, v13.4s\n"
3845       "addp v14.4s, v14.4s, v14.4s\n"
3846       "addp v8.4s, v8.4s, v10.4s\n"
3847       "addp v9.4s, v12.4s, v14.4s\n"
3848       "mul v8.4s, v8.4s, v0.s[0]\n"
3849       "mul v9.4s, v9.4s, v0.s[0]\n"
3850       "add v8.4s, v8.4s, v1.4s\n"
3851       "add v9.4s, v9.4s, v1.4s\n"
3852       "st1 {v8.4s, v9.4s}, [%x[out]]\n"
      // Read-write operands: all three are modified by the asm.
3853       : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
      // Read-only operands: stride in a register, the two offsets in memory
      // ("m") -- presumably to save general-purpose registers; confirm.
3854       : [stride] "r"(params.stride),
3855         [multiplicative_sum_offset] "m"(params.multiplicative_sum_offset),
3856         [additive_sum_offset] "m"(params.additive_sum_offset)
      // Clobbers: every scratch register named above, the condition flags
      // (subs) and memory (the stores through out).
3857       : "x0", "x1", "x2", "x3", "x4", "x5", "v0", "v1", "v2", "v3", "v4", "v5",
3858         "v6", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "cc", "memory");
3859 }
3860 
// Pack specialization: 7 rows, 8 columns per step, 2 leftover columns.
//
// Reads 7 input rows located at in, in + stride, ..., in + 6 * stride (the
// stride is added to the raw address, i.e. it is in bytes) and writes them to
// `out` in chunks of 7 x 8 bytes: 8 bytes of row 0, then 8 bytes of row 1, and
// so on. The final chunk holds only 2 real columns per row; the remaining six
// bytes of each row slot are zero. While copying, every row is summed; after
// the data, eight 32-bit values are stored at `out`:
//   sum(row) * params.multiplicative_sum_offset + params.additive_sum_offset
// for rows 0..6, followed by a repeat of the value for row 6.
//
// Presumably the caller guarantees params.count % 8 == 2 (TODO confirm): the
// main loop only terminates when the remaining count hits exactly zero.
3861 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)3862 inline void Stream<uint8_t, 7, 8, 2, RowMajorWithSum>::Pack(
3863     const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
// Optional trace, compiled only when both DEBUG and DEBUG_METAGEMM_VERBOSE
// are defined. NOTE(review): the text says "RowMajorWithSum<...>" although
// this is Stream<...>::Pack.
3864 #ifdef DEBUG
3865 #ifdef DEBUG_METAGEMM_VERBOSE
3866   std::cout << __FILE__ << "(" << __LINE__
3867             << ") RowMajorWithSum<uint8_t, 7, 8, 2, RowMajorWithSum>::Pack()"
3868             << std::endl
3869             << std::flush;
3870 #endif
3871 #endif
  // Writable copy of the column count: the asm decrements it in place.
  // NOTE(review): this is a 32-bit int but the asm refers to it as %x[count]
  // (the 64-bit register name) -- confirm the upper 32 bits are guaranteed to
  // be clean, or whether %w[count] was intended.
3872   int params_count_copy = params.count;
3873   asm volatile(
      // x0..x5 = start addresses of rows 1..6 (row 0 is %x[in] itself).
3874       "add x0, %x[in], %x[stride]\n"
3875       "add x1, x0, %x[stride]\n"
3876       "add x2, x1, %x[stride]\n"
3877       "add x3, x2, %x[stride]\n"
3878       "add x4, x3, %x[stride]\n"
3879       "add x5, x4, %x[stride]\n"
      // v8..v14 = per-row running sums, eight 16-bit lanes each.
      // NOTE(review): each lane gains one byte per 8-column step, so lanes can
      // wrap on very long rows -- presumably callers bound count; confirm.
3880       "movi v8.8h, #0\n"
3881       "movi v9.8h, #0\n"
3882       "movi v10.8h, #0\n"
3883       "movi v11.8h, #0\n"
3884       "movi v12.8h, #0\n"
3885       "movi v13.8h, #0\n"
3886       "movi v14.8h, #0\n"
3887 
3888       // Reduce count by leftovers.
      // If nothing but the 2 leftover columns exists, skip the main loop.
3889       "subs %x[count], %x[count], #2\n"
3890       "beq 2f\n"
3891 
      // Main loop: one 7x8 tile per iteration. The flags set by this subs are
      // consumed by the "bne 1b" at the bottom (nothing in between sets flags).
3892       "1:"
3893       "subs %x[count], %x[count], #8\n"
3894 
3895       // Load Aggregate Store: 7x8.
      // Load 8 bytes per row (post-incrementing each row pointer), widen-add
      // them into the row accumulators, then store the 56 bytes row after row.
3896       "ld1 {v0.2s}, [%x[in]], #8\n"
3897       "ld1 {v1.2s}, [x0], #8\n"
3898       "ld1 {v2.2s}, [x1], #8\n"
3899       "ld1 {v3.2s}, [x2], #8\n"
3900       "ld1 {v4.2s}, [x3], #8\n"
3901       "ld1 {v5.2s}, [x4], #8\n"
3902       "ld1 {v6.2s}, [x5], #8\n"
3903       "uaddw v8.8h, v8.8h, v0.8b\n"
3904       "uaddw v9.8h, v9.8h, v1.8b\n"
3905       "uaddw v10.8h, v10.8h, v2.8b\n"
3906       "uaddw v11.8h, v11.8h, v3.8b\n"
3907       "uaddw v12.8h, v12.8h, v4.8b\n"
3908       "uaddw v13.8h, v13.8h, v5.8b\n"
3909       "uaddw v14.8h, v14.8h, v6.8b\n"
3910       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
3911       "st1 {v4.2s, v5.2s, v6.2s}, [%x[out]], #24\n"
3912 
3913       "bne 1b\n"
3914 
3915       "2:"
3916 
3917       // Load Aggregate Store: 7x2.
      // Zero the staging registers first so the bytes that are not loaded are
      // stored as zero and add nothing to the sums. Each row then reads
      // exactly 2 bytes into bytes 0-1.
3918       "movi v0.8b, #0\n"
3919       "movi v1.8b, #0\n"
3920       "movi v2.8b, #0\n"
3921       "movi v3.8b, #0\n"
3922       "movi v4.8b, #0\n"
3923       "movi v5.8b, #0\n"
3924       "movi v6.8b, #0\n"
3925       "ld1 {v0.h}[0], [%x[in]], #2\n"
3926       "ld1 {v1.h}[0], [x0], #2\n"
3927       "ld1 {v2.h}[0], [x1], #2\n"
3928       "ld1 {v3.h}[0], [x2], #2\n"
3929       "ld1 {v4.h}[0], [x3], #2\n"
3930       "ld1 {v5.h}[0], [x4], #2\n"
3931       "ld1 {v6.h}[0], [x5], #2\n"
3932       "uaddw v8.8h, v8.8h, v0.8b\n"
3933       "uaddw v9.8h, v9.8h, v1.8b\n"
3934       "uaddw v10.8h, v10.8h, v2.8b\n"
3935       "uaddw v11.8h, v11.8h, v3.8b\n"
3936       "uaddw v12.8h, v12.8h, v4.8b\n"
3937       "uaddw v13.8h, v13.8h, v5.8b\n"
3938       "uaddw v14.8h, v14.8h, v6.8b\n"
3939       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
3940       "st1 {v4.2s, v5.2s, v6.2s}, [%x[out]], #24\n"
3941 
3942       // Aggregator Reduction.
      // The two offsets are memory operands here; they are loaded into w0/w1,
      // which is safe because the row pointers x0/x1 are not read after this
      // point. v0.s[0] = multiplier, v1 = additive offset in all four lanes.
      // uaddlp folds 8x16-bit into 4x32-bit per row; the addp cascade leaves
      // v8 = sums of rows 0,1,2,3 and v9 = sums of rows 4,5,6,6. Both are
      // scaled, offset and stored (32 bytes) at out.
3943       "ldr w0, %[multiplicative_sum_offset]\n"
3944       "ldr w1, %[additive_sum_offset]\n"
3945       "mov v0.s[0], w0\n"
3946       "dup v1.4s, w1\n"
3947       "uaddlp v8.4s, v8.8h\n"
3948       "uaddlp v9.4s, v9.8h\n"
3949       "uaddlp v10.4s, v10.8h\n"
3950       "uaddlp v11.4s, v11.8h\n"
3951       "uaddlp v12.4s, v12.8h\n"
3952       "uaddlp v13.4s, v13.8h\n"
3953       "uaddlp v14.4s, v14.8h\n"
3954       "addp v8.4s, v8.4s, v9.4s\n"
3955       "addp v10.4s, v10.4s, v11.4s\n"
3956       "addp v12.4s, v12.4s, v13.4s\n"
3957       "addp v14.4s, v14.4s, v14.4s\n"
3958       "addp v8.4s, v8.4s, v10.4s\n"
3959       "addp v9.4s, v12.4s, v14.4s\n"
3960       "mul v8.4s, v8.4s, v0.s[0]\n"
3961       "mul v9.4s, v9.4s, v0.s[0]\n"
3962       "add v8.4s, v8.4s, v1.4s\n"
3963       "add v9.4s, v9.4s, v1.4s\n"
3964       "st1 {v8.4s, v9.4s}, [%x[out]]\n"
      // Read-write operands: all three are modified by the asm.
3965       : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
      // Read-only operands: stride in a register, the two offsets in memory
      // ("m") -- presumably to save general-purpose registers; confirm.
3966       : [stride] "r"(params.stride),
3967         [multiplicative_sum_offset] "m"(params.multiplicative_sum_offset),
3968         [additive_sum_offset] "m"(params.additive_sum_offset)
      // Clobbers: every scratch register named above, the condition flags
      // (subs) and memory (the stores through out).
3969       : "x0", "x1", "x2", "x3", "x4", "x5", "v0", "v1", "v2", "v3", "v4", "v5",
3970         "v6", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "cc", "memory");
3971 }
3972 
// Pack specialization: 7 rows, 8 columns per step, 3 leftover columns.
//
// Reads 7 input rows located at in, in + stride, ..., in + 6 * stride (the
// stride is added to the raw address, i.e. it is in bytes) and writes them to
// `out` in chunks of 7 x 8 bytes: 8 bytes of row 0, then 8 bytes of row 1, and
// so on. The final chunk holds only 3 real columns per row; the remaining five
// bytes of each row slot are zero. While copying, every row is summed; after
// the data, eight 32-bit values are stored at `out`:
//   sum(row) * params.multiplicative_sum_offset + params.additive_sum_offset
// for rows 0..6, followed by a repeat of the value for row 6.
//
// Presumably the caller guarantees params.count % 8 == 3 (TODO confirm): the
// main loop only terminates when the remaining count hits exactly zero.
3973 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)3974 inline void Stream<uint8_t, 7, 8, 3, RowMajorWithSum>::Pack(
3975     const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
// Optional trace, compiled only when both DEBUG and DEBUG_METAGEMM_VERBOSE
// are defined. NOTE(review): the text says "RowMajorWithSum<...>" although
// this is Stream<...>::Pack.
3976 #ifdef DEBUG
3977 #ifdef DEBUG_METAGEMM_VERBOSE
3978   std::cout << __FILE__ << "(" << __LINE__
3979             << ") RowMajorWithSum<uint8_t, 7, 8, 3, RowMajorWithSum>::Pack()"
3980             << std::endl
3981             << std::flush;
3982 #endif
3983 #endif
  // Writable copy of the column count: the asm decrements it in place.
  // NOTE(review): this is a 32-bit int but the asm refers to it as %x[count]
  // (the 64-bit register name) -- confirm the upper 32 bits are guaranteed to
  // be clean, or whether %w[count] was intended.
3984   int params_count_copy = params.count;
3985   asm volatile(
      // x0..x5 = start addresses of rows 1..6 (row 0 is %x[in] itself).
3986       "add x0, %x[in], %x[stride]\n"
3987       "add x1, x0, %x[stride]\n"
3988       "add x2, x1, %x[stride]\n"
3989       "add x3, x2, %x[stride]\n"
3990       "add x4, x3, %x[stride]\n"
3991       "add x5, x4, %x[stride]\n"
      // v8..v14 = per-row running sums, eight 16-bit lanes each.
      // NOTE(review): each lane gains one byte per 8-column step, so lanes can
      // wrap on very long rows -- presumably callers bound count; confirm.
3992       "movi v8.8h, #0\n"
3993       "movi v9.8h, #0\n"
3994       "movi v10.8h, #0\n"
3995       "movi v11.8h, #0\n"
3996       "movi v12.8h, #0\n"
3997       "movi v13.8h, #0\n"
3998       "movi v14.8h, #0\n"
3999 
4000       // Reduce count by leftovers.
      // If nothing but the 3 leftover columns exists, skip the main loop.
4001       "subs %x[count], %x[count], #3\n"
4002       "beq 2f\n"
4003 
      // Main loop: one 7x8 tile per iteration. The flags set by this subs are
      // consumed by the "bne 1b" at the bottom (nothing in between sets flags).
4004       "1:"
4005       "subs %x[count], %x[count], #8\n"
4006 
4007       // Load Aggregate Store: 7x8.
      // Load 8 bytes per row (post-incrementing each row pointer), widen-add
      // them into the row accumulators, then store the 56 bytes row after row.
4008       "ld1 {v0.2s}, [%x[in]], #8\n"
4009       "ld1 {v1.2s}, [x0], #8\n"
4010       "ld1 {v2.2s}, [x1], #8\n"
4011       "ld1 {v3.2s}, [x2], #8\n"
4012       "ld1 {v4.2s}, [x3], #8\n"
4013       "ld1 {v5.2s}, [x4], #8\n"
4014       "ld1 {v6.2s}, [x5], #8\n"
4015       "uaddw v8.8h, v8.8h, v0.8b\n"
4016       "uaddw v9.8h, v9.8h, v1.8b\n"
4017       "uaddw v10.8h, v10.8h, v2.8b\n"
4018       "uaddw v11.8h, v11.8h, v3.8b\n"
4019       "uaddw v12.8h, v12.8h, v4.8b\n"
4020       "uaddw v13.8h, v13.8h, v5.8b\n"
4021       "uaddw v14.8h, v14.8h, v6.8b\n"
4022       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
4023       "st1 {v4.2s, v5.2s, v6.2s}, [%x[out]], #24\n"
4024 
4025       "bne 1b\n"
4026 
4027       "2:"
4028 
4029       // Load Aggregate Store: 7x3.
      // Zero the staging registers first so the bytes that are not loaded are
      // stored as zero and add nothing to the sums. Each row then reads
      // exactly 3 bytes: 2 into bytes 0-1 and 1 into byte 2.
4030       "movi v0.8b, #0\n"
4031       "movi v1.8b, #0\n"
4032       "movi v2.8b, #0\n"
4033       "movi v3.8b, #0\n"
4034       "movi v4.8b, #0\n"
4035       "movi v5.8b, #0\n"
4036       "movi v6.8b, #0\n"
4037       "ld1 {v0.h}[0], [%x[in]], #2\n"
4038       "ld1 {v0.b}[2], [%x[in]], #1\n"
4039       "ld1 {v1.h}[0], [x0], #2\n"
4040       "ld1 {v1.b}[2], [x0], #1\n"
4041       "ld1 {v2.h}[0], [x1], #2\n"
4042       "ld1 {v2.b}[2], [x1], #1\n"
4043       "ld1 {v3.h}[0], [x2], #2\n"
4044       "ld1 {v3.b}[2], [x2], #1\n"
4045       "ld1 {v4.h}[0], [x3], #2\n"
4046       "ld1 {v4.b}[2], [x3], #1\n"
4047       "ld1 {v5.h}[0], [x4], #2\n"
4048       "ld1 {v5.b}[2], [x4], #1\n"
4049       "ld1 {v6.h}[0], [x5], #2\n"
4050       "ld1 {v6.b}[2], [x5], #1\n"
4051       "uaddw v8.8h, v8.8h, v0.8b\n"
4052       "uaddw v9.8h, v9.8h, v1.8b\n"
4053       "uaddw v10.8h, v10.8h, v2.8b\n"
4054       "uaddw v11.8h, v11.8h, v3.8b\n"
4055       "uaddw v12.8h, v12.8h, v4.8b\n"
4056       "uaddw v13.8h, v13.8h, v5.8b\n"
4057       "uaddw v14.8h, v14.8h, v6.8b\n"
4058       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
4059       "st1 {v4.2s, v5.2s, v6.2s}, [%x[out]], #24\n"
4060 
4061       // Aggregator Reduction.
      // The two offsets are memory operands here; they are loaded into w0/w1,
      // which is safe because the row pointers x0/x1 are not read after this
      // point. v0.s[0] = multiplier, v1 = additive offset in all four lanes.
      // uaddlp folds 8x16-bit into 4x32-bit per row; the addp cascade leaves
      // v8 = sums of rows 0,1,2,3 and v9 = sums of rows 4,5,6,6. Both are
      // scaled, offset and stored (32 bytes) at out.
4062       "ldr w0, %[multiplicative_sum_offset]\n"
4063       "ldr w1, %[additive_sum_offset]\n"
4064       "mov v0.s[0], w0\n"
4065       "dup v1.4s, w1\n"
4066       "uaddlp v8.4s, v8.8h\n"
4067       "uaddlp v9.4s, v9.8h\n"
4068       "uaddlp v10.4s, v10.8h\n"
4069       "uaddlp v11.4s, v11.8h\n"
4070       "uaddlp v12.4s, v12.8h\n"
4071       "uaddlp v13.4s, v13.8h\n"
4072       "uaddlp v14.4s, v14.8h\n"
4073       "addp v8.4s, v8.4s, v9.4s\n"
4074       "addp v10.4s, v10.4s, v11.4s\n"
4075       "addp v12.4s, v12.4s, v13.4s\n"
4076       "addp v14.4s, v14.4s, v14.4s\n"
4077       "addp v8.4s, v8.4s, v10.4s\n"
4078       "addp v9.4s, v12.4s, v14.4s\n"
4079       "mul v8.4s, v8.4s, v0.s[0]\n"
4080       "mul v9.4s, v9.4s, v0.s[0]\n"
4081       "add v8.4s, v8.4s, v1.4s\n"
4082       "add v9.4s, v9.4s, v1.4s\n"
4083       "st1 {v8.4s, v9.4s}, [%x[out]]\n"
      // Read-write operands: all three are modified by the asm.
4084       : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
      // Read-only operands: stride in a register, the two offsets in memory
      // ("m") -- presumably to save general-purpose registers; confirm.
4085       : [stride] "r"(params.stride),
4086         [multiplicative_sum_offset] "m"(params.multiplicative_sum_offset),
4087         [additive_sum_offset] "m"(params.additive_sum_offset)
      // Clobbers: every scratch register named above, the condition flags
      // (subs) and memory (the stores through out).
4088       : "x0", "x1", "x2", "x3", "x4", "x5", "v0", "v1", "v2", "v3", "v4", "v5",
4089         "v6", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "cc", "memory");
4090 }
4091 
// Pack specialization: 7 rows, 8 columns per step, 4 leftover columns.
//
// Reads 7 input rows located at in, in + stride, ..., in + 6 * stride (the
// stride is added to the raw address, i.e. it is in bytes) and writes them to
// `out` in chunks of 7 x 8 bytes: 8 bytes of row 0, then 8 bytes of row 1, and
// so on. The final chunk holds only 4 real columns per row; the remaining four
// bytes of each row slot are zero. While copying, every row is summed; after
// the data, eight 32-bit values are stored at `out`:
//   sum(row) * params.multiplicative_sum_offset + params.additive_sum_offset
// for rows 0..6, followed by a repeat of the value for row 6.
//
// Presumably the caller guarantees params.count % 8 == 4 (TODO confirm): the
// main loop only terminates when the remaining count hits exactly zero.
4092 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)4093 inline void Stream<uint8_t, 7, 8, 4, RowMajorWithSum>::Pack(
4094     const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
// Optional trace, compiled only when both DEBUG and DEBUG_METAGEMM_VERBOSE
// are defined. NOTE(review): the text says "RowMajorWithSum<...>" although
// this is Stream<...>::Pack.
4095 #ifdef DEBUG
4096 #ifdef DEBUG_METAGEMM_VERBOSE
4097   std::cout << __FILE__ << "(" << __LINE__
4098             << ") RowMajorWithSum<uint8_t, 7, 8, 4, RowMajorWithSum>::Pack()"
4099             << std::endl
4100             << std::flush;
4101 #endif
4102 #endif
  // Writable copy of the column count: the asm decrements it in place.
  // NOTE(review): this is a 32-bit int but the asm refers to it as %x[count]
  // (the 64-bit register name) -- confirm the upper 32 bits are guaranteed to
  // be clean, or whether %w[count] was intended.
4103   int params_count_copy = params.count;
4104   asm volatile(
      // x0..x5 = start addresses of rows 1..6 (row 0 is %x[in] itself).
4105       "add x0, %x[in], %x[stride]\n"
4106       "add x1, x0, %x[stride]\n"
4107       "add x2, x1, %x[stride]\n"
4108       "add x3, x2, %x[stride]\n"
4109       "add x4, x3, %x[stride]\n"
4110       "add x5, x4, %x[stride]\n"
      // v8..v14 = per-row running sums, eight 16-bit lanes each.
      // NOTE(review): each lane gains one byte per 8-column step, so lanes can
      // wrap on very long rows -- presumably callers bound count; confirm.
4111       "movi v8.8h, #0\n"
4112       "movi v9.8h, #0\n"
4113       "movi v10.8h, #0\n"
4114       "movi v11.8h, #0\n"
4115       "movi v12.8h, #0\n"
4116       "movi v13.8h, #0\n"
4117       "movi v14.8h, #0\n"
4118 
4119       // Reduce count by leftovers.
      // If nothing but the 4 leftover columns exists, skip the main loop.
4120       "subs %x[count], %x[count], #4\n"
4121       "beq 2f\n"
4122 
      // Main loop: one 7x8 tile per iteration. The flags set by this subs are
      // consumed by the "bne 1b" at the bottom (nothing in between sets flags).
4123       "1:"
4124       "subs %x[count], %x[count], #8\n"
4125 
4126       // Load Aggregate Store: 7x8.
      // Load 8 bytes per row (post-incrementing each row pointer), widen-add
      // them into the row accumulators, then store the 56 bytes row after row.
4127       "ld1 {v0.2s}, [%x[in]], #8\n"
4128       "ld1 {v1.2s}, [x0], #8\n"
4129       "ld1 {v2.2s}, [x1], #8\n"
4130       "ld1 {v3.2s}, [x2], #8\n"
4131       "ld1 {v4.2s}, [x3], #8\n"
4132       "ld1 {v5.2s}, [x4], #8\n"
4133       "ld1 {v6.2s}, [x5], #8\n"
4134       "uaddw v8.8h, v8.8h, v0.8b\n"
4135       "uaddw v9.8h, v9.8h, v1.8b\n"
4136       "uaddw v10.8h, v10.8h, v2.8b\n"
4137       "uaddw v11.8h, v11.8h, v3.8b\n"
4138       "uaddw v12.8h, v12.8h, v4.8b\n"
4139       "uaddw v13.8h, v13.8h, v5.8b\n"
4140       "uaddw v14.8h, v14.8h, v6.8b\n"
4141       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
4142       "st1 {v4.2s, v5.2s, v6.2s}, [%x[out]], #24\n"
4143 
4144       "bne 1b\n"
4145 
4146       "2:"
4147 
4148       // Load Aggregate Store: 7x4.
      // Zero the staging registers first so the bytes that are not loaded are
      // stored as zero and add nothing to the sums. Each row then reads
      // exactly 4 bytes into bytes 0-3.
4149       "movi v0.8b, #0\n"
4150       "movi v1.8b, #0\n"
4151       "movi v2.8b, #0\n"
4152       "movi v3.8b, #0\n"
4153       "movi v4.8b, #0\n"
4154       "movi v5.8b, #0\n"
4155       "movi v6.8b, #0\n"
4156       "ld1 {v0.s}[0], [%x[in]], #4\n"
4157       "ld1 {v1.s}[0], [x0], #4\n"
4158       "ld1 {v2.s}[0], [x1], #4\n"
4159       "ld1 {v3.s}[0], [x2], #4\n"
4160       "ld1 {v4.s}[0], [x3], #4\n"
4161       "ld1 {v5.s}[0], [x4], #4\n"
4162       "ld1 {v6.s}[0], [x5], #4\n"
4163       "uaddw v8.8h, v8.8h, v0.8b\n"
4164       "uaddw v9.8h, v9.8h, v1.8b\n"
4165       "uaddw v10.8h, v10.8h, v2.8b\n"
4166       "uaddw v11.8h, v11.8h, v3.8b\n"
4167       "uaddw v12.8h, v12.8h, v4.8b\n"
4168       "uaddw v13.8h, v13.8h, v5.8b\n"
4169       "uaddw v14.8h, v14.8h, v6.8b\n"
4170       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
4171       "st1 {v4.2s, v5.2s, v6.2s}, [%x[out]], #24\n"
4172 
4173       // Aggregator Reduction.
      // The two offsets are memory operands here; they are loaded into w0/w1,
      // which is safe because the row pointers x0/x1 are not read after this
      // point. v0.s[0] = multiplier, v1 = additive offset in all four lanes.
      // uaddlp folds 8x16-bit into 4x32-bit per row; the addp cascade leaves
      // v8 = sums of rows 0,1,2,3 and v9 = sums of rows 4,5,6,6. Both are
      // scaled, offset and stored (32 bytes) at out.
4174       "ldr w0, %[multiplicative_sum_offset]\n"
4175       "ldr w1, %[additive_sum_offset]\n"
4176       "mov v0.s[0], w0\n"
4177       "dup v1.4s, w1\n"
4178       "uaddlp v8.4s, v8.8h\n"
4179       "uaddlp v9.4s, v9.8h\n"
4180       "uaddlp v10.4s, v10.8h\n"
4181       "uaddlp v11.4s, v11.8h\n"
4182       "uaddlp v12.4s, v12.8h\n"
4183       "uaddlp v13.4s, v13.8h\n"
4184       "uaddlp v14.4s, v14.8h\n"
4185       "addp v8.4s, v8.4s, v9.4s\n"
4186       "addp v10.4s, v10.4s, v11.4s\n"
4187       "addp v12.4s, v12.4s, v13.4s\n"
4188       "addp v14.4s, v14.4s, v14.4s\n"
4189       "addp v8.4s, v8.4s, v10.4s\n"
4190       "addp v9.4s, v12.4s, v14.4s\n"
4191       "mul v8.4s, v8.4s, v0.s[0]\n"
4192       "mul v9.4s, v9.4s, v0.s[0]\n"
4193       "add v8.4s, v8.4s, v1.4s\n"
4194       "add v9.4s, v9.4s, v1.4s\n"
4195       "st1 {v8.4s, v9.4s}, [%x[out]]\n"
      // Read-write operands: all three are modified by the asm.
4196       : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
      // Read-only operands: stride in a register, the two offsets in memory
      // ("m") -- presumably to save general-purpose registers; confirm.
4197       : [stride] "r"(params.stride),
4198         [multiplicative_sum_offset] "m"(params.multiplicative_sum_offset),
4199         [additive_sum_offset] "m"(params.additive_sum_offset)
      // Clobbers: every scratch register named above, the condition flags
      // (subs) and memory (the stores through out).
4200       : "x0", "x1", "x2", "x3", "x4", "x5", "v0", "v1", "v2", "v3", "v4", "v5",
4201         "v6", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "cc", "memory");
4202 }
4203 
// Pack specialization: 7 rows, 8 columns per step, 5 leftover columns.
//
// Reads 7 input rows located at in, in + stride, ..., in + 6 * stride (the
// stride is added to the raw address, i.e. it is in bytes) and writes them to
// `out` in chunks of 7 x 8 bytes: 8 bytes of row 0, then 8 bytes of row 1, and
// so on. The final chunk holds only 5 real columns per row; the remaining
// three bytes of each row slot are zero. While copying, every row is summed;
// after the data, eight 32-bit values are stored at `out`:
//   sum(row) * params.multiplicative_sum_offset + params.additive_sum_offset
// for rows 0..6, followed by a repeat of the value for row 6.
//
// Presumably the caller guarantees params.count % 8 == 5 (TODO confirm): the
// main loop only terminates when the remaining count hits exactly zero.
4204 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)4205 inline void Stream<uint8_t, 7, 8, 5, RowMajorWithSum>::Pack(
4206     const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
// Optional trace, compiled only when both DEBUG and DEBUG_METAGEMM_VERBOSE
// are defined. NOTE(review): the text says "RowMajorWithSum<...>" although
// this is Stream<...>::Pack.
4207 #ifdef DEBUG
4208 #ifdef DEBUG_METAGEMM_VERBOSE
4209   std::cout << __FILE__ << "(" << __LINE__
4210             << ") RowMajorWithSum<uint8_t, 7, 8, 5, RowMajorWithSum>::Pack()"
4211             << std::endl
4212             << std::flush;
4213 #endif
4214 #endif
  // Writable copy of the column count: the asm decrements it in place.
  // NOTE(review): this is a 32-bit int but the asm refers to it as %x[count]
  // (the 64-bit register name) -- confirm the upper 32 bits are guaranteed to
  // be clean, or whether %w[count] was intended.
4215   int params_count_copy = params.count;
4216   asm volatile(
      // x0..x5 = start addresses of rows 1..6 (row 0 is %x[in] itself).
4217       "add x0, %x[in], %x[stride]\n"
4218       "add x1, x0, %x[stride]\n"
4219       "add x2, x1, %x[stride]\n"
4220       "add x3, x2, %x[stride]\n"
4221       "add x4, x3, %x[stride]\n"
4222       "add x5, x4, %x[stride]\n"
      // v8..v14 = per-row running sums, eight 16-bit lanes each.
      // NOTE(review): each lane gains one byte per 8-column step, so lanes can
      // wrap on very long rows -- presumably callers bound count; confirm.
4223       "movi v8.8h, #0\n"
4224       "movi v9.8h, #0\n"
4225       "movi v10.8h, #0\n"
4226       "movi v11.8h, #0\n"
4227       "movi v12.8h, #0\n"
4228       "movi v13.8h, #0\n"
4229       "movi v14.8h, #0\n"
4230 
4231       // Reduce count by leftovers.
      // If nothing but the 5 leftover columns exists, skip the main loop.
4232       "subs %x[count], %x[count], #5\n"
4233       "beq 2f\n"
4234 
      // Main loop: one 7x8 tile per iteration. The flags set by this subs are
      // consumed by the "bne 1b" at the bottom (nothing in between sets flags).
4235       "1:"
4236       "subs %x[count], %x[count], #8\n"
4237 
4238       // Load Aggregate Store: 7x8.
      // Load 8 bytes per row (post-incrementing each row pointer), widen-add
      // them into the row accumulators, then store the 56 bytes row after row.
4239       "ld1 {v0.2s}, [%x[in]], #8\n"
4240       "ld1 {v1.2s}, [x0], #8\n"
4241       "ld1 {v2.2s}, [x1], #8\n"
4242       "ld1 {v3.2s}, [x2], #8\n"
4243       "ld1 {v4.2s}, [x3], #8\n"
4244       "ld1 {v5.2s}, [x4], #8\n"
4245       "ld1 {v6.2s}, [x5], #8\n"
4246       "uaddw v8.8h, v8.8h, v0.8b\n"
4247       "uaddw v9.8h, v9.8h, v1.8b\n"
4248       "uaddw v10.8h, v10.8h, v2.8b\n"
4249       "uaddw v11.8h, v11.8h, v3.8b\n"
4250       "uaddw v12.8h, v12.8h, v4.8b\n"
4251       "uaddw v13.8h, v13.8h, v5.8b\n"
4252       "uaddw v14.8h, v14.8h, v6.8b\n"
4253       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
4254       "st1 {v4.2s, v5.2s, v6.2s}, [%x[out]], #24\n"
4255 
4256       "bne 1b\n"
4257 
4258       "2:"
4259 
4260       // Load Aggregate Store: 7x5.
      // Zero the staging registers first so the bytes that are not loaded are
      // stored as zero and add nothing to the sums. Each row then reads
      // exactly 5 bytes: 4 into bytes 0-3 and 1 into byte 4.
4261       "movi v0.8b, #0\n"
4262       "movi v1.8b, #0\n"
4263       "movi v2.8b, #0\n"
4264       "movi v3.8b, #0\n"
4265       "movi v4.8b, #0\n"
4266       "movi v5.8b, #0\n"
4267       "movi v6.8b, #0\n"
4268       "ld1 {v0.s}[0], [%x[in]], #4\n"
4269       "ld1 {v0.b}[4], [%x[in]], #1\n"
4270       "ld1 {v1.s}[0], [x0], #4\n"
4271       "ld1 {v1.b}[4], [x0], #1\n"
4272       "ld1 {v2.s}[0], [x1], #4\n"
4273       "ld1 {v2.b}[4], [x1], #1\n"
4274       "ld1 {v3.s}[0], [x2], #4\n"
4275       "ld1 {v3.b}[4], [x2], #1\n"
4276       "ld1 {v4.s}[0], [x3], #4\n"
4277       "ld1 {v4.b}[4], [x3], #1\n"
4278       "ld1 {v5.s}[0], [x4], #4\n"
4279       "ld1 {v5.b}[4], [x4], #1\n"
4280       "ld1 {v6.s}[0], [x5], #4\n"
4281       "ld1 {v6.b}[4], [x5], #1\n"
4282       "uaddw v8.8h, v8.8h, v0.8b\n"
4283       "uaddw v9.8h, v9.8h, v1.8b\n"
4284       "uaddw v10.8h, v10.8h, v2.8b\n"
4285       "uaddw v11.8h, v11.8h, v3.8b\n"
4286       "uaddw v12.8h, v12.8h, v4.8b\n"
4287       "uaddw v13.8h, v13.8h, v5.8b\n"
4288       "uaddw v14.8h, v14.8h, v6.8b\n"
4289       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
4290       "st1 {v4.2s, v5.2s, v6.2s}, [%x[out]], #24\n"
4291 
4292       // Aggregator Reduction.
      // The two offsets are memory operands here; they are loaded into w0/w1,
      // which is safe because the row pointers x0/x1 are not read after this
      // point. v0.s[0] = multiplier, v1 = additive offset in all four lanes.
      // uaddlp folds 8x16-bit into 4x32-bit per row; the addp cascade leaves
      // v8 = sums of rows 0,1,2,3 and v9 = sums of rows 4,5,6,6. Both are
      // scaled, offset and stored (32 bytes) at out.
4293       "ldr w0, %[multiplicative_sum_offset]\n"
4294       "ldr w1, %[additive_sum_offset]\n"
4295       "mov v0.s[0], w0\n"
4296       "dup v1.4s, w1\n"
4297       "uaddlp v8.4s, v8.8h\n"
4298       "uaddlp v9.4s, v9.8h\n"
4299       "uaddlp v10.4s, v10.8h\n"
4300       "uaddlp v11.4s, v11.8h\n"
4301       "uaddlp v12.4s, v12.8h\n"
4302       "uaddlp v13.4s, v13.8h\n"
4303       "uaddlp v14.4s, v14.8h\n"
4304       "addp v8.4s, v8.4s, v9.4s\n"
4305       "addp v10.4s, v10.4s, v11.4s\n"
4306       "addp v12.4s, v12.4s, v13.4s\n"
4307       "addp v14.4s, v14.4s, v14.4s\n"
4308       "addp v8.4s, v8.4s, v10.4s\n"
4309       "addp v9.4s, v12.4s, v14.4s\n"
4310       "mul v8.4s, v8.4s, v0.s[0]\n"
4311       "mul v9.4s, v9.4s, v0.s[0]\n"
4312       "add v8.4s, v8.4s, v1.4s\n"
4313       "add v9.4s, v9.4s, v1.4s\n"
4314       "st1 {v8.4s, v9.4s}, [%x[out]]\n"
      // Read-write operands: all three are modified by the asm.
4315       : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
      // Read-only operands: stride in a register, the two offsets in memory
      // ("m") -- presumably to save general-purpose registers; confirm.
4316       : [stride] "r"(params.stride),
4317         [multiplicative_sum_offset] "m"(params.multiplicative_sum_offset),
4318         [additive_sum_offset] "m"(params.additive_sum_offset)
      // Clobbers: every scratch register named above, the condition flags
      // (subs) and memory (the stores through out).
4319       : "x0", "x1", "x2", "x3", "x4", "x5", "v0", "v1", "v2", "v3", "v4", "v5",
4320         "v6", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "cc", "memory");
4321 }
4322 
4323 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)4324 inline void Stream<uint8_t, 7, 8, 6, RowMajorWithSum>::Pack(
4325     const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
4326 #ifdef DEBUG
4327 #ifdef DEBUG_METAGEMM_VERBOSE
4328   std::cout << __FILE__ << "(" << __LINE__
4329             << ") RowMajorWithSum<uint8_t, 7, 8, 6, RowMajorWithSum>::Pack()"
4330             << std::endl
4331             << std::flush;
4332 #endif
4333 #endif
4334   int params_count_copy = params.count;
4335   asm volatile(
4336       "add x0, %x[in], %x[stride]\n"
4337       "add x1, x0, %x[stride]\n"
4338       "add x2, x1, %x[stride]\n"
4339       "add x3, x2, %x[stride]\n"
4340       "add x4, x3, %x[stride]\n"
4341       "add x5, x4, %x[stride]\n"
4342       "movi v8.8h, #0\n"
4343       "movi v9.8h, #0\n"
4344       "movi v10.8h, #0\n"
4345       "movi v11.8h, #0\n"
4346       "movi v12.8h, #0\n"
4347       "movi v13.8h, #0\n"
4348       "movi v14.8h, #0\n"
4349 
4350       // Reduce count by leftovers.
4351       "subs %x[count], %x[count], #6\n"
4352       "beq 2f\n"
4353 
4354       "1:"
4355       "subs %x[count], %x[count], #8\n"
4356 
4357       // Load Aggregate Store: 7x8.
4358       "ld1 {v0.2s}, [%x[in]], #8\n"
4359       "ld1 {v1.2s}, [x0], #8\n"
4360       "ld1 {v2.2s}, [x1], #8\n"
4361       "ld1 {v3.2s}, [x2], #8\n"
4362       "ld1 {v4.2s}, [x3], #8\n"
4363       "ld1 {v5.2s}, [x4], #8\n"
4364       "ld1 {v6.2s}, [x5], #8\n"
4365       "uaddw v8.8h, v8.8h, v0.8b\n"
4366       "uaddw v9.8h, v9.8h, v1.8b\n"
4367       "uaddw v10.8h, v10.8h, v2.8b\n"
4368       "uaddw v11.8h, v11.8h, v3.8b\n"
4369       "uaddw v12.8h, v12.8h, v4.8b\n"
4370       "uaddw v13.8h, v13.8h, v5.8b\n"
4371       "uaddw v14.8h, v14.8h, v6.8b\n"
4372       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
4373       "st1 {v4.2s, v5.2s, v6.2s}, [%x[out]], #24\n"
4374 
4375       "bne 1b\n"
4376 
4377       "2:"
4378 
4379       // Load Aggregate Store: 7x6.
4380       "movi v0.8b, #0\n"
4381       "movi v1.8b, #0\n"
4382       "movi v2.8b, #0\n"
4383       "movi v3.8b, #0\n"
4384       "movi v4.8b, #0\n"
4385       "movi v5.8b, #0\n"
4386       "movi v6.8b, #0\n"
4387       "ld1 {v0.s}[0], [%x[in]], #4\n"
4388       "ld1 {v0.h}[2], [%x[in]], #2\n"
4389       "ld1 {v1.s}[0], [x0], #4\n"
4390       "ld1 {v1.h}[2], [x0], #2\n"
4391       "ld1 {v2.s}[0], [x1], #4\n"
4392       "ld1 {v2.h}[2], [x1], #2\n"
4393       "ld1 {v3.s}[0], [x2], #4\n"
4394       "ld1 {v3.h}[2], [x2], #2\n"
4395       "ld1 {v4.s}[0], [x3], #4\n"
4396       "ld1 {v4.h}[2], [x3], #2\n"
4397       "ld1 {v5.s}[0], [x4], #4\n"
4398       "ld1 {v5.h}[2], [x4], #2\n"
4399       "ld1 {v6.s}[0], [x5], #4\n"
4400       "ld1 {v6.h}[2], [x5], #2\n"
4401       "uaddw v8.8h, v8.8h, v0.8b\n"
4402       "uaddw v9.8h, v9.8h, v1.8b\n"
4403       "uaddw v10.8h, v10.8h, v2.8b\n"
4404       "uaddw v11.8h, v11.8h, v3.8b\n"
4405       "uaddw v12.8h, v12.8h, v4.8b\n"
4406       "uaddw v13.8h, v13.8h, v5.8b\n"
4407       "uaddw v14.8h, v14.8h, v6.8b\n"
4408       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
4409       "st1 {v4.2s, v5.2s, v6.2s}, [%x[out]], #24\n"
4410 
4411       // Aggregator Reduction.
4412       "ldr w0, %[multiplicative_sum_offset]\n"
4413       "ldr w1, %[additive_sum_offset]\n"
4414       "mov v0.s[0], w0\n"
4415       "dup v1.4s, w1\n"
4416       "uaddlp v8.4s, v8.8h\n"
4417       "uaddlp v9.4s, v9.8h\n"
4418       "uaddlp v10.4s, v10.8h\n"
4419       "uaddlp v11.4s, v11.8h\n"
4420       "uaddlp v12.4s, v12.8h\n"
4421       "uaddlp v13.4s, v13.8h\n"
4422       "uaddlp v14.4s, v14.8h\n"
4423       "addp v8.4s, v8.4s, v9.4s\n"
4424       "addp v10.4s, v10.4s, v11.4s\n"
4425       "addp v12.4s, v12.4s, v13.4s\n"
4426       "addp v14.4s, v14.4s, v14.4s\n"
4427       "addp v8.4s, v8.4s, v10.4s\n"
4428       "addp v9.4s, v12.4s, v14.4s\n"
4429       "mul v8.4s, v8.4s, v0.s[0]\n"
4430       "mul v9.4s, v9.4s, v0.s[0]\n"
4431       "add v8.4s, v8.4s, v1.4s\n"
4432       "add v9.4s, v9.4s, v1.4s\n"
4433       "st1 {v8.4s, v9.4s}, [%x[out]]\n"
4434       : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
4435       : [stride] "r"(params.stride),
4436         [multiplicative_sum_offset] "m"(params.multiplicative_sum_offset),
4437         [additive_sum_offset] "m"(params.additive_sum_offset)
4438       : "x0", "x1", "x2", "x3", "x4", "x5", "v0", "v1", "v2", "v3", "v4", "v5",
4439         "v6", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "cc", "memory");
4440 }
4441 
4442 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)4443 inline void Stream<uint8_t, 7, 8, 7, RowMajorWithSum>::Pack(
4444     const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
4445 #ifdef DEBUG
4446 #ifdef DEBUG_METAGEMM_VERBOSE
4447   std::cout << __FILE__ << "(" << __LINE__
4448             << ") RowMajorWithSum<uint8_t, 7, 8, 7, RowMajorWithSum>::Pack()"
4449             << std::endl
4450             << std::flush;
4451 #endif
4452 #endif
4453   int params_count_copy = params.count;
4454   asm volatile(
4455       "add x0, %x[in], %x[stride]\n"
4456       "add x1, x0, %x[stride]\n"
4457       "add x2, x1, %x[stride]\n"
4458       "add x3, x2, %x[stride]\n"
4459       "add x4, x3, %x[stride]\n"
4460       "add x5, x4, %x[stride]\n"
4461       "movi v8.8h, #0\n"
4462       "movi v9.8h, #0\n"
4463       "movi v10.8h, #0\n"
4464       "movi v11.8h, #0\n"
4465       "movi v12.8h, #0\n"
4466       "movi v13.8h, #0\n"
4467       "movi v14.8h, #0\n"
4468 
4469       // Reduce count by leftovers.
4470       "subs %x[count], %x[count], #7\n"
4471       "beq 2f\n"
4472 
4473       "1:"
4474       "subs %x[count], %x[count], #8\n"
4475 
4476       // Load Aggregate Store: 7x8.
4477       "ld1 {v0.2s}, [%x[in]], #8\n"
4478       "ld1 {v1.2s}, [x0], #8\n"
4479       "ld1 {v2.2s}, [x1], #8\n"
4480       "ld1 {v3.2s}, [x2], #8\n"
4481       "ld1 {v4.2s}, [x3], #8\n"
4482       "ld1 {v5.2s}, [x4], #8\n"
4483       "ld1 {v6.2s}, [x5], #8\n"
4484       "uaddw v8.8h, v8.8h, v0.8b\n"
4485       "uaddw v9.8h, v9.8h, v1.8b\n"
4486       "uaddw v10.8h, v10.8h, v2.8b\n"
4487       "uaddw v11.8h, v11.8h, v3.8b\n"
4488       "uaddw v12.8h, v12.8h, v4.8b\n"
4489       "uaddw v13.8h, v13.8h, v5.8b\n"
4490       "uaddw v14.8h, v14.8h, v6.8b\n"
4491       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
4492       "st1 {v4.2s, v5.2s, v6.2s}, [%x[out]], #24\n"
4493 
4494       "bne 1b\n"
4495 
4496       "2:"
4497 
4498       // Load Aggregate Store: 7x7.
4499       "movi v0.8b, #0\n"
4500       "movi v1.8b, #0\n"
4501       "movi v2.8b, #0\n"
4502       "movi v3.8b, #0\n"
4503       "movi v4.8b, #0\n"
4504       "movi v5.8b, #0\n"
4505       "movi v6.8b, #0\n"
4506       "ld1 {v0.s}[0], [%x[in]], #4\n"
4507       "ld1 {v0.h}[2], [%x[in]], #2\n"
4508       "ld1 {v0.b}[6], [%x[in]], #1\n"
4509       "ld1 {v1.s}[0], [x0], #4\n"
4510       "ld1 {v1.h}[2], [x0], #2\n"
4511       "ld1 {v1.b}[6], [x0], #1\n"
4512       "ld1 {v2.s}[0], [x1], #4\n"
4513       "ld1 {v2.h}[2], [x1], #2\n"
4514       "ld1 {v2.b}[6], [x1], #1\n"
4515       "ld1 {v3.s}[0], [x2], #4\n"
4516       "ld1 {v3.h}[2], [x2], #2\n"
4517       "ld1 {v3.b}[6], [x2], #1\n"
4518       "ld1 {v4.s}[0], [x3], #4\n"
4519       "ld1 {v4.h}[2], [x3], #2\n"
4520       "ld1 {v4.b}[6], [x3], #1\n"
4521       "ld1 {v5.s}[0], [x4], #4\n"
4522       "ld1 {v5.h}[2], [x4], #2\n"
4523       "ld1 {v5.b}[6], [x4], #1\n"
4524       "ld1 {v6.s}[0], [x5], #4\n"
4525       "ld1 {v6.h}[2], [x5], #2\n"
4526       "ld1 {v6.b}[6], [x5], #1\n"
4527       "uaddw v8.8h, v8.8h, v0.8b\n"
4528       "uaddw v9.8h, v9.8h, v1.8b\n"
4529       "uaddw v10.8h, v10.8h, v2.8b\n"
4530       "uaddw v11.8h, v11.8h, v3.8b\n"
4531       "uaddw v12.8h, v12.8h, v4.8b\n"
4532       "uaddw v13.8h, v13.8h, v5.8b\n"
4533       "uaddw v14.8h, v14.8h, v6.8b\n"
4534       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
4535       "st1 {v4.2s, v5.2s, v6.2s}, [%x[out]], #24\n"
4536 
4537       // Aggregator Reduction.
4538       "ldr w0, %[multiplicative_sum_offset]\n"
4539       "ldr w1, %[additive_sum_offset]\n"
4540       "mov v0.s[0], w0\n"
4541       "dup v1.4s, w1\n"
4542       "uaddlp v8.4s, v8.8h\n"
4543       "uaddlp v9.4s, v9.8h\n"
4544       "uaddlp v10.4s, v10.8h\n"
4545       "uaddlp v11.4s, v11.8h\n"
4546       "uaddlp v12.4s, v12.8h\n"
4547       "uaddlp v13.4s, v13.8h\n"
4548       "uaddlp v14.4s, v14.8h\n"
4549       "addp v8.4s, v8.4s, v9.4s\n"
4550       "addp v10.4s, v10.4s, v11.4s\n"
4551       "addp v12.4s, v12.4s, v13.4s\n"
4552       "addp v14.4s, v14.4s, v14.4s\n"
4553       "addp v8.4s, v8.4s, v10.4s\n"
4554       "addp v9.4s, v12.4s, v14.4s\n"
4555       "mul v8.4s, v8.4s, v0.s[0]\n"
4556       "mul v9.4s, v9.4s, v0.s[0]\n"
4557       "add v8.4s, v8.4s, v1.4s\n"
4558       "add v9.4s, v9.4s, v1.4s\n"
4559       "st1 {v8.4s, v9.4s}, [%x[out]]\n"
4560       : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
4561       : [stride] "r"(params.stride),
4562         [multiplicative_sum_offset] "m"(params.multiplicative_sum_offset),
4563         [additive_sum_offset] "m"(params.additive_sum_offset)
4564       : "x0", "x1", "x2", "x3", "x4", "x5", "v0", "v1", "v2", "v3", "v4", "v5",
4565         "v6", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "cc", "memory");
4566 }
4567 
4568 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)4569 inline void Stream<uint8_t, 8, 8, 0, RowMajorWithSum>::Pack(
4570     const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
4571 #ifdef DEBUG
4572 #ifdef DEBUG_METAGEMM_VERBOSE
4573   std::cout << __FILE__ << "(" << __LINE__
4574             << ") RowMajorWithSum<uint8_t, 8, 8, 0, RowMajorWithSum>::Pack()"
4575             << std::endl
4576             << std::flush;
4577 #endif
4578 #endif
4579   int params_count_copy = params.count;
4580   asm volatile(
4581       "add x0, %x[in], %x[stride]\n"
4582       "add x1, x0, %x[stride]\n"
4583       "add x2, x1, %x[stride]\n"
4584       "add x3, x2, %x[stride]\n"
4585       "add x4, x3, %x[stride]\n"
4586       "add x5, x4, %x[stride]\n"
4587       "add x6, x5, %x[stride]\n"
4588       "movi v8.8h, #0\n"
4589       "movi v9.8h, #0\n"
4590       "movi v10.8h, #0\n"
4591       "movi v11.8h, #0\n"
4592       "movi v12.8h, #0\n"
4593       "movi v13.8h, #0\n"
4594       "movi v14.8h, #0\n"
4595       "movi v15.8h, #0\n"
4596 
4597       "1:"
4598       "subs %x[count], %x[count], #8\n"
4599 
4600       // Load Aggregate Store: 8x8.
4601       "ld1 {v0.2s}, [%x[in]], #8\n"
4602       "ld1 {v1.2s}, [x0], #8\n"
4603       "ld1 {v2.2s}, [x1], #8\n"
4604       "ld1 {v3.2s}, [x2], #8\n"
4605       "ld1 {v4.2s}, [x3], #8\n"
4606       "ld1 {v5.2s}, [x4], #8\n"
4607       "ld1 {v6.2s}, [x5], #8\n"
4608       "ld1 {v7.2s}, [x6], #8\n"
4609       "uaddw v8.8h, v8.8h, v0.8b\n"
4610       "uaddw v9.8h, v9.8h, v1.8b\n"
4611       "uaddw v10.8h, v10.8h, v2.8b\n"
4612       "uaddw v11.8h, v11.8h, v3.8b\n"
4613       "uaddw v12.8h, v12.8h, v4.8b\n"
4614       "uaddw v13.8h, v13.8h, v5.8b\n"
4615       "uaddw v14.8h, v14.8h, v6.8b\n"
4616       "uaddw v15.8h, v15.8h, v7.8b\n"
4617       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
4618       "st1 {v4.2s, v5.2s, v6.2s, v7.2s}, [%x[out]], #32\n"
4619 
4620       "bne 1b\n"
4621 
4622       // Aggregator Reduction.
4623       "ldr w0, %[multiplicative_sum_offset]\n"
4624       "ldr w1, %[additive_sum_offset]\n"
4625       "mov v0.s[0], w0\n"
4626       "dup v1.4s, w1\n"
4627       "uaddlp v8.4s, v8.8h\n"
4628       "uaddlp v9.4s, v9.8h\n"
4629       "uaddlp v10.4s, v10.8h\n"
4630       "uaddlp v11.4s, v11.8h\n"
4631       "uaddlp v12.4s, v12.8h\n"
4632       "uaddlp v13.4s, v13.8h\n"
4633       "uaddlp v14.4s, v14.8h\n"
4634       "uaddlp v15.4s, v15.8h\n"
4635       "addp v8.4s, v8.4s, v9.4s\n"
4636       "addp v10.4s, v10.4s, v11.4s\n"
4637       "addp v12.4s, v12.4s, v13.4s\n"
4638       "addp v14.4s, v14.4s, v15.4s\n"
4639       "addp v8.4s, v8.4s, v10.4s\n"
4640       "addp v9.4s, v12.4s, v14.4s\n"
4641       "mul v8.4s, v8.4s, v0.s[0]\n"
4642       "mul v9.4s, v9.4s, v0.s[0]\n"
4643       "add v8.4s, v8.4s, v1.4s\n"
4644       "add v9.4s, v9.4s, v1.4s\n"
4645       "st1 {v8.4s, v9.4s}, [%x[out]]\n"
4646       : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
4647       : [stride] "r"(params.stride),
4648         [multiplicative_sum_offset] "m"(params.multiplicative_sum_offset),
4649         [additive_sum_offset] "m"(params.additive_sum_offset)
4650       : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "v0", "v1", "v2", "v3", "v4",
4651         "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15",
4652         "cc", "memory");
4653 }
4654 
4655 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)4656 inline void Stream<uint8_t, 8, 8, 1, RowMajorWithSum>::Pack(
4657     const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
4658 #ifdef DEBUG
4659 #ifdef DEBUG_METAGEMM_VERBOSE
4660   std::cout << __FILE__ << "(" << __LINE__
4661             << ") RowMajorWithSum<uint8_t, 8, 8, 1, RowMajorWithSum>::Pack()"
4662             << std::endl
4663             << std::flush;
4664 #endif
4665 #endif
4666   int params_count_copy = params.count;
4667   asm volatile(
4668       "add x0, %x[in], %x[stride]\n"
4669       "add x1, x0, %x[stride]\n"
4670       "add x2, x1, %x[stride]\n"
4671       "add x3, x2, %x[stride]\n"
4672       "add x4, x3, %x[stride]\n"
4673       "add x5, x4, %x[stride]\n"
4674       "add x6, x5, %x[stride]\n"
4675       "movi v8.8h, #0\n"
4676       "movi v9.8h, #0\n"
4677       "movi v10.8h, #0\n"
4678       "movi v11.8h, #0\n"
4679       "movi v12.8h, #0\n"
4680       "movi v13.8h, #0\n"
4681       "movi v14.8h, #0\n"
4682       "movi v15.8h, #0\n"
4683 
4684       // Reduce count by leftovers.
4685       "subs %x[count], %x[count], #1\n"
4686       "beq 2f\n"
4687 
4688       "1:"
4689       "subs %x[count], %x[count], #8\n"
4690 
4691       // Load Aggregate Store: 8x8.
4692       "ld1 {v0.2s}, [%x[in]], #8\n"
4693       "ld1 {v1.2s}, [x0], #8\n"
4694       "ld1 {v2.2s}, [x1], #8\n"
4695       "ld1 {v3.2s}, [x2], #8\n"
4696       "ld1 {v4.2s}, [x3], #8\n"
4697       "ld1 {v5.2s}, [x4], #8\n"
4698       "ld1 {v6.2s}, [x5], #8\n"
4699       "ld1 {v7.2s}, [x6], #8\n"
4700       "uaddw v8.8h, v8.8h, v0.8b\n"
4701       "uaddw v9.8h, v9.8h, v1.8b\n"
4702       "uaddw v10.8h, v10.8h, v2.8b\n"
4703       "uaddw v11.8h, v11.8h, v3.8b\n"
4704       "uaddw v12.8h, v12.8h, v4.8b\n"
4705       "uaddw v13.8h, v13.8h, v5.8b\n"
4706       "uaddw v14.8h, v14.8h, v6.8b\n"
4707       "uaddw v15.8h, v15.8h, v7.8b\n"
4708       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
4709       "st1 {v4.2s, v5.2s, v6.2s, v7.2s}, [%x[out]], #32\n"
4710 
4711       "bne 1b\n"
4712 
4713       "2:"
4714 
4715       // Load Aggregate Store: 8x1.
4716       "movi v0.8b, #0\n"
4717       "movi v1.8b, #0\n"
4718       "movi v2.8b, #0\n"
4719       "movi v3.8b, #0\n"
4720       "movi v4.8b, #0\n"
4721       "movi v5.8b, #0\n"
4722       "movi v6.8b, #0\n"
4723       "movi v7.8b, #0\n"
4724       "ld1 {v0.b}[0], [%x[in]], #1\n"
4725       "ld1 {v1.b}[0], [x0], #1\n"
4726       "ld1 {v2.b}[0], [x1], #1\n"
4727       "ld1 {v3.b}[0], [x2], #1\n"
4728       "ld1 {v4.b}[0], [x3], #1\n"
4729       "ld1 {v5.b}[0], [x4], #1\n"
4730       "ld1 {v6.b}[0], [x5], #1\n"
4731       "ld1 {v7.b}[0], [x6], #1\n"
4732       "uaddw v8.8h, v8.8h, v0.8b\n"
4733       "uaddw v9.8h, v9.8h, v1.8b\n"
4734       "uaddw v10.8h, v10.8h, v2.8b\n"
4735       "uaddw v11.8h, v11.8h, v3.8b\n"
4736       "uaddw v12.8h, v12.8h, v4.8b\n"
4737       "uaddw v13.8h, v13.8h, v5.8b\n"
4738       "uaddw v14.8h, v14.8h, v6.8b\n"
4739       "uaddw v15.8h, v15.8h, v7.8b\n"
4740       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
4741       "st1 {v4.2s, v5.2s, v6.2s, v7.2s}, [%x[out]], #32\n"
4742 
4743       // Aggregator Reduction.
4744       "ldr w0, %[multiplicative_sum_offset]\n"
4745       "ldr w1, %[additive_sum_offset]\n"
4746       "mov v0.s[0], w0\n"
4747       "dup v1.4s, w1\n"
4748       "uaddlp v8.4s, v8.8h\n"
4749       "uaddlp v9.4s, v9.8h\n"
4750       "uaddlp v10.4s, v10.8h\n"
4751       "uaddlp v11.4s, v11.8h\n"
4752       "uaddlp v12.4s, v12.8h\n"
4753       "uaddlp v13.4s, v13.8h\n"
4754       "uaddlp v14.4s, v14.8h\n"
4755       "uaddlp v15.4s, v15.8h\n"
4756       "addp v8.4s, v8.4s, v9.4s\n"
4757       "addp v10.4s, v10.4s, v11.4s\n"
4758       "addp v12.4s, v12.4s, v13.4s\n"
4759       "addp v14.4s, v14.4s, v15.4s\n"
4760       "addp v8.4s, v8.4s, v10.4s\n"
4761       "addp v9.4s, v12.4s, v14.4s\n"
4762       "mul v8.4s, v8.4s, v0.s[0]\n"
4763       "mul v9.4s, v9.4s, v0.s[0]\n"
4764       "add v8.4s, v8.4s, v1.4s\n"
4765       "add v9.4s, v9.4s, v1.4s\n"
4766       "st1 {v8.4s, v9.4s}, [%x[out]]\n"
4767       : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
4768       : [stride] "r"(params.stride),
4769         [multiplicative_sum_offset] "m"(params.multiplicative_sum_offset),
4770         [additive_sum_offset] "m"(params.additive_sum_offset)
4771       : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "v0", "v1", "v2", "v3", "v4",
4772         "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15",
4773         "cc", "memory");
4774 }
4775 
4776 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)4777 inline void Stream<uint8_t, 8, 8, 2, RowMajorWithSum>::Pack(
4778     const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
4779 #ifdef DEBUG
4780 #ifdef DEBUG_METAGEMM_VERBOSE
4781   std::cout << __FILE__ << "(" << __LINE__
4782             << ") RowMajorWithSum<uint8_t, 8, 8, 2, RowMajorWithSum>::Pack()"
4783             << std::endl
4784             << std::flush;
4785 #endif
4786 #endif
4787   int params_count_copy = params.count;
4788   asm volatile(
4789       "add x0, %x[in], %x[stride]\n"
4790       "add x1, x0, %x[stride]\n"
4791       "add x2, x1, %x[stride]\n"
4792       "add x3, x2, %x[stride]\n"
4793       "add x4, x3, %x[stride]\n"
4794       "add x5, x4, %x[stride]\n"
4795       "add x6, x5, %x[stride]\n"
4796       "movi v8.8h, #0\n"
4797       "movi v9.8h, #0\n"
4798       "movi v10.8h, #0\n"
4799       "movi v11.8h, #0\n"
4800       "movi v12.8h, #0\n"
4801       "movi v13.8h, #0\n"
4802       "movi v14.8h, #0\n"
4803       "movi v15.8h, #0\n"
4804 
4805       // Reduce count by leftovers.
4806       "subs %x[count], %x[count], #2\n"
4807       "beq 2f\n"
4808 
4809       "1:"
4810       "subs %x[count], %x[count], #8\n"
4811 
4812       // Load Aggregate Store: 8x8.
4813       "ld1 {v0.2s}, [%x[in]], #8\n"
4814       "ld1 {v1.2s}, [x0], #8\n"
4815       "ld1 {v2.2s}, [x1], #8\n"
4816       "ld1 {v3.2s}, [x2], #8\n"
4817       "ld1 {v4.2s}, [x3], #8\n"
4818       "ld1 {v5.2s}, [x4], #8\n"
4819       "ld1 {v6.2s}, [x5], #8\n"
4820       "ld1 {v7.2s}, [x6], #8\n"
4821       "uaddw v8.8h, v8.8h, v0.8b\n"
4822       "uaddw v9.8h, v9.8h, v1.8b\n"
4823       "uaddw v10.8h, v10.8h, v2.8b\n"
4824       "uaddw v11.8h, v11.8h, v3.8b\n"
4825       "uaddw v12.8h, v12.8h, v4.8b\n"
4826       "uaddw v13.8h, v13.8h, v5.8b\n"
4827       "uaddw v14.8h, v14.8h, v6.8b\n"
4828       "uaddw v15.8h, v15.8h, v7.8b\n"
4829       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
4830       "st1 {v4.2s, v5.2s, v6.2s, v7.2s}, [%x[out]], #32\n"
4831 
4832       "bne 1b\n"
4833 
4834       "2:"
4835 
4836       // Load Aggregate Store: 8x2.
4837       "movi v0.8b, #0\n"
4838       "movi v1.8b, #0\n"
4839       "movi v2.8b, #0\n"
4840       "movi v3.8b, #0\n"
4841       "movi v4.8b, #0\n"
4842       "movi v5.8b, #0\n"
4843       "movi v6.8b, #0\n"
4844       "movi v7.8b, #0\n"
4845       "ld1 {v0.h}[0], [%x[in]], #2\n"
4846       "ld1 {v1.h}[0], [x0], #2\n"
4847       "ld1 {v2.h}[0], [x1], #2\n"
4848       "ld1 {v3.h}[0], [x2], #2\n"
4849       "ld1 {v4.h}[0], [x3], #2\n"
4850       "ld1 {v5.h}[0], [x4], #2\n"
4851       "ld1 {v6.h}[0], [x5], #2\n"
4852       "ld1 {v7.h}[0], [x6], #2\n"
4853       "uaddw v8.8h, v8.8h, v0.8b\n"
4854       "uaddw v9.8h, v9.8h, v1.8b\n"
4855       "uaddw v10.8h, v10.8h, v2.8b\n"
4856       "uaddw v11.8h, v11.8h, v3.8b\n"
4857       "uaddw v12.8h, v12.8h, v4.8b\n"
4858       "uaddw v13.8h, v13.8h, v5.8b\n"
4859       "uaddw v14.8h, v14.8h, v6.8b\n"
4860       "uaddw v15.8h, v15.8h, v7.8b\n"
4861       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
4862       "st1 {v4.2s, v5.2s, v6.2s, v7.2s}, [%x[out]], #32\n"
4863 
4864       // Aggregator Reduction.
4865       "ldr w0, %[multiplicative_sum_offset]\n"
4866       "ldr w1, %[additive_sum_offset]\n"
4867       "mov v0.s[0], w0\n"
4868       "dup v1.4s, w1\n"
4869       "uaddlp v8.4s, v8.8h\n"
4870       "uaddlp v9.4s, v9.8h\n"
4871       "uaddlp v10.4s, v10.8h\n"
4872       "uaddlp v11.4s, v11.8h\n"
4873       "uaddlp v12.4s, v12.8h\n"
4874       "uaddlp v13.4s, v13.8h\n"
4875       "uaddlp v14.4s, v14.8h\n"
4876       "uaddlp v15.4s, v15.8h\n"
4877       "addp v8.4s, v8.4s, v9.4s\n"
4878       "addp v10.4s, v10.4s, v11.4s\n"
4879       "addp v12.4s, v12.4s, v13.4s\n"
4880       "addp v14.4s, v14.4s, v15.4s\n"
4881       "addp v8.4s, v8.4s, v10.4s\n"
4882       "addp v9.4s, v12.4s, v14.4s\n"
4883       "mul v8.4s, v8.4s, v0.s[0]\n"
4884       "mul v9.4s, v9.4s, v0.s[0]\n"
4885       "add v8.4s, v8.4s, v1.4s\n"
4886       "add v9.4s, v9.4s, v1.4s\n"
4887       "st1 {v8.4s, v9.4s}, [%x[out]]\n"
4888       : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
4889       : [stride] "r"(params.stride),
4890         [multiplicative_sum_offset] "m"(params.multiplicative_sum_offset),
4891         [additive_sum_offset] "m"(params.additive_sum_offset)
4892       : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "v0", "v1", "v2", "v3", "v4",
4893         "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15",
4894         "cc", "memory");
4895 }
4896 
4897 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)4898 inline void Stream<uint8_t, 8, 8, 3, RowMajorWithSum>::Pack(
4899     const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
4900 #ifdef DEBUG
4901 #ifdef DEBUG_METAGEMM_VERBOSE
4902   std::cout << __FILE__ << "(" << __LINE__
4903             << ") RowMajorWithSum<uint8_t, 8, 8, 3, RowMajorWithSum>::Pack()"
4904             << std::endl
4905             << std::flush;
4906 #endif
4907 #endif
4908   int params_count_copy = params.count;
4909   asm volatile(
4910       "add x0, %x[in], %x[stride]\n"
4911       "add x1, x0, %x[stride]\n"
4912       "add x2, x1, %x[stride]\n"
4913       "add x3, x2, %x[stride]\n"
4914       "add x4, x3, %x[stride]\n"
4915       "add x5, x4, %x[stride]\n"
4916       "add x6, x5, %x[stride]\n"
4917       "movi v8.8h, #0\n"
4918       "movi v9.8h, #0\n"
4919       "movi v10.8h, #0\n"
4920       "movi v11.8h, #0\n"
4921       "movi v12.8h, #0\n"
4922       "movi v13.8h, #0\n"
4923       "movi v14.8h, #0\n"
4924       "movi v15.8h, #0\n"
4925 
4926       // Reduce count by leftovers.
4927       "subs %x[count], %x[count], #3\n"
4928       "beq 2f\n"
4929 
4930       "1:"
4931       "subs %x[count], %x[count], #8\n"
4932 
4933       // Load Aggregate Store: 8x8.
4934       "ld1 {v0.2s}, [%x[in]], #8\n"
4935       "ld1 {v1.2s}, [x0], #8\n"
4936       "ld1 {v2.2s}, [x1], #8\n"
4937       "ld1 {v3.2s}, [x2], #8\n"
4938       "ld1 {v4.2s}, [x3], #8\n"
4939       "ld1 {v5.2s}, [x4], #8\n"
4940       "ld1 {v6.2s}, [x5], #8\n"
4941       "ld1 {v7.2s}, [x6], #8\n"
4942       "uaddw v8.8h, v8.8h, v0.8b\n"
4943       "uaddw v9.8h, v9.8h, v1.8b\n"
4944       "uaddw v10.8h, v10.8h, v2.8b\n"
4945       "uaddw v11.8h, v11.8h, v3.8b\n"
4946       "uaddw v12.8h, v12.8h, v4.8b\n"
4947       "uaddw v13.8h, v13.8h, v5.8b\n"
4948       "uaddw v14.8h, v14.8h, v6.8b\n"
4949       "uaddw v15.8h, v15.8h, v7.8b\n"
4950       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
4951       "st1 {v4.2s, v5.2s, v6.2s, v7.2s}, [%x[out]], #32\n"
4952 
4953       "bne 1b\n"
4954 
4955       "2:"
4956 
4957       // Load Aggregate Store: 8x3.
4958       "movi v0.8b, #0\n"
4959       "movi v1.8b, #0\n"
4960       "movi v2.8b, #0\n"
4961       "movi v3.8b, #0\n"
4962       "movi v4.8b, #0\n"
4963       "movi v5.8b, #0\n"
4964       "movi v6.8b, #0\n"
4965       "movi v7.8b, #0\n"
4966       "ld1 {v0.h}[0], [%x[in]], #2\n"
4967       "ld1 {v0.b}[2], [%x[in]], #1\n"
4968       "ld1 {v1.h}[0], [x0], #2\n"
4969       "ld1 {v1.b}[2], [x0], #1\n"
4970       "ld1 {v2.h}[0], [x1], #2\n"
4971       "ld1 {v2.b}[2], [x1], #1\n"
4972       "ld1 {v3.h}[0], [x2], #2\n"
4973       "ld1 {v3.b}[2], [x2], #1\n"
4974       "ld1 {v4.h}[0], [x3], #2\n"
4975       "ld1 {v4.b}[2], [x3], #1\n"
4976       "ld1 {v5.h}[0], [x4], #2\n"
4977       "ld1 {v5.b}[2], [x4], #1\n"
4978       "ld1 {v6.h}[0], [x5], #2\n"
4979       "ld1 {v6.b}[2], [x5], #1\n"
4980       "ld1 {v7.h}[0], [x6], #2\n"
4981       "ld1 {v7.b}[2], [x6], #1\n"
4982       "uaddw v8.8h, v8.8h, v0.8b\n"
4983       "uaddw v9.8h, v9.8h, v1.8b\n"
4984       "uaddw v10.8h, v10.8h, v2.8b\n"
4985       "uaddw v11.8h, v11.8h, v3.8b\n"
4986       "uaddw v12.8h, v12.8h, v4.8b\n"
4987       "uaddw v13.8h, v13.8h, v5.8b\n"
4988       "uaddw v14.8h, v14.8h, v6.8b\n"
4989       "uaddw v15.8h, v15.8h, v7.8b\n"
4990       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
4991       "st1 {v4.2s, v5.2s, v6.2s, v7.2s}, [%x[out]], #32\n"
4992 
4993       // Aggregator Reduction.
4994       "ldr w0, %[multiplicative_sum_offset]\n"
4995       "ldr w1, %[additive_sum_offset]\n"
4996       "mov v0.s[0], w0\n"
4997       "dup v1.4s, w1\n"
4998       "uaddlp v8.4s, v8.8h\n"
4999       "uaddlp v9.4s, v9.8h\n"
5000       "uaddlp v10.4s, v10.8h\n"
5001       "uaddlp v11.4s, v11.8h\n"
5002       "uaddlp v12.4s, v12.8h\n"
5003       "uaddlp v13.4s, v13.8h\n"
5004       "uaddlp v14.4s, v14.8h\n"
5005       "uaddlp v15.4s, v15.8h\n"
5006       "addp v8.4s, v8.4s, v9.4s\n"
5007       "addp v10.4s, v10.4s, v11.4s\n"
5008       "addp v12.4s, v12.4s, v13.4s\n"
5009       "addp v14.4s, v14.4s, v15.4s\n"
5010       "addp v8.4s, v8.4s, v10.4s\n"
5011       "addp v9.4s, v12.4s, v14.4s\n"
5012       "mul v8.4s, v8.4s, v0.s[0]\n"
5013       "mul v9.4s, v9.4s, v0.s[0]\n"
5014       "add v8.4s, v8.4s, v1.4s\n"
5015       "add v9.4s, v9.4s, v1.4s\n"
5016       "st1 {v8.4s, v9.4s}, [%x[out]]\n"
5017       : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
5018       : [stride] "r"(params.stride),
5019         [multiplicative_sum_offset] "m"(params.multiplicative_sum_offset),
5020         [additive_sum_offset] "m"(params.additive_sum_offset)
5021       : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "v0", "v1", "v2", "v3", "v4",
5022         "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15",
5023         "cc", "memory");
5024 }
5025 
5026 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)5027 inline void Stream<uint8_t, 8, 8, 4, RowMajorWithSum>::Pack(
5028     const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
5029 #ifdef DEBUG
5030 #ifdef DEBUG_METAGEMM_VERBOSE
5031   std::cout << __FILE__ << "(" << __LINE__
5032             << ") RowMajorWithSum<uint8_t, 8, 8, 4, RowMajorWithSum>::Pack()"
5033             << std::endl
5034             << std::flush;
5035 #endif
5036 #endif
5037   int params_count_copy = params.count;
5038   asm volatile(
5039       "add x0, %x[in], %x[stride]\n"
5040       "add x1, x0, %x[stride]\n"
5041       "add x2, x1, %x[stride]\n"
5042       "add x3, x2, %x[stride]\n"
5043       "add x4, x3, %x[stride]\n"
5044       "add x5, x4, %x[stride]\n"
5045       "add x6, x5, %x[stride]\n"
5046       "movi v8.8h, #0\n"
5047       "movi v9.8h, #0\n"
5048       "movi v10.8h, #0\n"
5049       "movi v11.8h, #0\n"
5050       "movi v12.8h, #0\n"
5051       "movi v13.8h, #0\n"
5052       "movi v14.8h, #0\n"
5053       "movi v15.8h, #0\n"
5054 
5055       // Reduce count by leftovers.
5056       "subs %x[count], %x[count], #4\n"
5057       "beq 2f\n"
5058 
5059       "1:"
5060       "subs %x[count], %x[count], #8\n"
5061 
5062       // Load Aggregate Store: 8x8.
5063       "ld1 {v0.2s}, [%x[in]], #8\n"
5064       "ld1 {v1.2s}, [x0], #8\n"
5065       "ld1 {v2.2s}, [x1], #8\n"
5066       "ld1 {v3.2s}, [x2], #8\n"
5067       "ld1 {v4.2s}, [x3], #8\n"
5068       "ld1 {v5.2s}, [x4], #8\n"
5069       "ld1 {v6.2s}, [x5], #8\n"
5070       "ld1 {v7.2s}, [x6], #8\n"
5071       "uaddw v8.8h, v8.8h, v0.8b\n"
5072       "uaddw v9.8h, v9.8h, v1.8b\n"
5073       "uaddw v10.8h, v10.8h, v2.8b\n"
5074       "uaddw v11.8h, v11.8h, v3.8b\n"
5075       "uaddw v12.8h, v12.8h, v4.8b\n"
5076       "uaddw v13.8h, v13.8h, v5.8b\n"
5077       "uaddw v14.8h, v14.8h, v6.8b\n"
5078       "uaddw v15.8h, v15.8h, v7.8b\n"
5079       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
5080       "st1 {v4.2s, v5.2s, v6.2s, v7.2s}, [%x[out]], #32\n"
5081 
5082       "bne 1b\n"
5083 
5084       "2:"
5085 
5086       // Load Aggregate Store: 8x4.
5087       "movi v0.8b, #0\n"
5088       "movi v1.8b, #0\n"
5089       "movi v2.8b, #0\n"
5090       "movi v3.8b, #0\n"
5091       "movi v4.8b, #0\n"
5092       "movi v5.8b, #0\n"
5093       "movi v6.8b, #0\n"
5094       "movi v7.8b, #0\n"
5095       "ld1 {v0.s}[0], [%x[in]], #4\n"
5096       "ld1 {v1.s}[0], [x0], #4\n"
5097       "ld1 {v2.s}[0], [x1], #4\n"
5098       "ld1 {v3.s}[0], [x2], #4\n"
5099       "ld1 {v4.s}[0], [x3], #4\n"
5100       "ld1 {v5.s}[0], [x4], #4\n"
5101       "ld1 {v6.s}[0], [x5], #4\n"
5102       "ld1 {v7.s}[0], [x6], #4\n"
5103       "uaddw v8.8h, v8.8h, v0.8b\n"
5104       "uaddw v9.8h, v9.8h, v1.8b\n"
5105       "uaddw v10.8h, v10.8h, v2.8b\n"
5106       "uaddw v11.8h, v11.8h, v3.8b\n"
5107       "uaddw v12.8h, v12.8h, v4.8b\n"
5108       "uaddw v13.8h, v13.8h, v5.8b\n"
5109       "uaddw v14.8h, v14.8h, v6.8b\n"
5110       "uaddw v15.8h, v15.8h, v7.8b\n"
5111       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
5112       "st1 {v4.2s, v5.2s, v6.2s, v7.2s}, [%x[out]], #32\n"
5113 
5114       // Aggregator Reduction.
5115       "ldr w0, %[multiplicative_sum_offset]\n"
5116       "ldr w1, %[additive_sum_offset]\n"
5117       "mov v0.s[0], w0\n"
5118       "dup v1.4s, w1\n"
5119       "uaddlp v8.4s, v8.8h\n"
5120       "uaddlp v9.4s, v9.8h\n"
5121       "uaddlp v10.4s, v10.8h\n"
5122       "uaddlp v11.4s, v11.8h\n"
5123       "uaddlp v12.4s, v12.8h\n"
5124       "uaddlp v13.4s, v13.8h\n"
5125       "uaddlp v14.4s, v14.8h\n"
5126       "uaddlp v15.4s, v15.8h\n"
5127       "addp v8.4s, v8.4s, v9.4s\n"
5128       "addp v10.4s, v10.4s, v11.4s\n"
5129       "addp v12.4s, v12.4s, v13.4s\n"
5130       "addp v14.4s, v14.4s, v15.4s\n"
5131       "addp v8.4s, v8.4s, v10.4s\n"
5132       "addp v9.4s, v12.4s, v14.4s\n"
5133       "mul v8.4s, v8.4s, v0.s[0]\n"
5134       "mul v9.4s, v9.4s, v0.s[0]\n"
5135       "add v8.4s, v8.4s, v1.4s\n"
5136       "add v9.4s, v9.4s, v1.4s\n"
5137       "st1 {v8.4s, v9.4s}, [%x[out]]\n"
5138       : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
5139       : [stride] "r"(params.stride),
5140         [multiplicative_sum_offset] "m"(params.multiplicative_sum_offset),
5141         [additive_sum_offset] "m"(params.additive_sum_offset)
5142       : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "v0", "v1", "v2", "v3", "v4",
5143         "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15",
5144         "cc", "memory");
5145 }
5146 
5147 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)5148 inline void Stream<uint8_t, 8, 8, 5, RowMajorWithSum>::Pack(
5149     const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
5150 #ifdef DEBUG
5151 #ifdef DEBUG_METAGEMM_VERBOSE
5152   std::cout << __FILE__ << "(" << __LINE__
5153             << ") RowMajorWithSum<uint8_t, 8, 8, 5, RowMajorWithSum>::Pack()"
5154             << std::endl
5155             << std::flush;
5156 #endif
5157 #endif
5158   int params_count_copy = params.count;
5159   asm volatile(
5160       "add x0, %x[in], %x[stride]\n"
5161       "add x1, x0, %x[stride]\n"
5162       "add x2, x1, %x[stride]\n"
5163       "add x3, x2, %x[stride]\n"
5164       "add x4, x3, %x[stride]\n"
5165       "add x5, x4, %x[stride]\n"
5166       "add x6, x5, %x[stride]\n"
5167       "movi v8.8h, #0\n"
5168       "movi v9.8h, #0\n"
5169       "movi v10.8h, #0\n"
5170       "movi v11.8h, #0\n"
5171       "movi v12.8h, #0\n"
5172       "movi v13.8h, #0\n"
5173       "movi v14.8h, #0\n"
5174       "movi v15.8h, #0\n"
5175 
5176       // Reduce count by leftovers.
5177       "subs %x[count], %x[count], #5\n"
5178       "beq 2f\n"
5179 
5180       "1:"
5181       "subs %x[count], %x[count], #8\n"
5182 
5183       // Load Aggregate Store: 8x8.
5184       "ld1 {v0.2s}, [%x[in]], #8\n"
5185       "ld1 {v1.2s}, [x0], #8\n"
5186       "ld1 {v2.2s}, [x1], #8\n"
5187       "ld1 {v3.2s}, [x2], #8\n"
5188       "ld1 {v4.2s}, [x3], #8\n"
5189       "ld1 {v5.2s}, [x4], #8\n"
5190       "ld1 {v6.2s}, [x5], #8\n"
5191       "ld1 {v7.2s}, [x6], #8\n"
5192       "uaddw v8.8h, v8.8h, v0.8b\n"
5193       "uaddw v9.8h, v9.8h, v1.8b\n"
5194       "uaddw v10.8h, v10.8h, v2.8b\n"
5195       "uaddw v11.8h, v11.8h, v3.8b\n"
5196       "uaddw v12.8h, v12.8h, v4.8b\n"
5197       "uaddw v13.8h, v13.8h, v5.8b\n"
5198       "uaddw v14.8h, v14.8h, v6.8b\n"
5199       "uaddw v15.8h, v15.8h, v7.8b\n"
5200       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
5201       "st1 {v4.2s, v5.2s, v6.2s, v7.2s}, [%x[out]], #32\n"
5202 
5203       "bne 1b\n"
5204 
5205       "2:"
5206 
5207       // Load Aggregate Store: 8x5.
5208       "movi v0.8b, #0\n"
5209       "movi v1.8b, #0\n"
5210       "movi v2.8b, #0\n"
5211       "movi v3.8b, #0\n"
5212       "movi v4.8b, #0\n"
5213       "movi v5.8b, #0\n"
5214       "movi v6.8b, #0\n"
5215       "movi v7.8b, #0\n"
5216       "ld1 {v0.s}[0], [%x[in]], #4\n"
5217       "ld1 {v0.b}[4], [%x[in]], #1\n"
5218       "ld1 {v1.s}[0], [x0], #4\n"
5219       "ld1 {v1.b}[4], [x0], #1\n"
5220       "ld1 {v2.s}[0], [x1], #4\n"
5221       "ld1 {v2.b}[4], [x1], #1\n"
5222       "ld1 {v3.s}[0], [x2], #4\n"
5223       "ld1 {v3.b}[4], [x2], #1\n"
5224       "ld1 {v4.s}[0], [x3], #4\n"
5225       "ld1 {v4.b}[4], [x3], #1\n"
5226       "ld1 {v5.s}[0], [x4], #4\n"
5227       "ld1 {v5.b}[4], [x4], #1\n"
5228       "ld1 {v6.s}[0], [x5], #4\n"
5229       "ld1 {v6.b}[4], [x5], #1\n"
5230       "ld1 {v7.s}[0], [x6], #4\n"
5231       "ld1 {v7.b}[4], [x6], #1\n"
5232       "uaddw v8.8h, v8.8h, v0.8b\n"
5233       "uaddw v9.8h, v9.8h, v1.8b\n"
5234       "uaddw v10.8h, v10.8h, v2.8b\n"
5235       "uaddw v11.8h, v11.8h, v3.8b\n"
5236       "uaddw v12.8h, v12.8h, v4.8b\n"
5237       "uaddw v13.8h, v13.8h, v5.8b\n"
5238       "uaddw v14.8h, v14.8h, v6.8b\n"
5239       "uaddw v15.8h, v15.8h, v7.8b\n"
5240       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
5241       "st1 {v4.2s, v5.2s, v6.2s, v7.2s}, [%x[out]], #32\n"
5242 
5243       // Aggregator Reduction.
5244       "ldr w0, %[multiplicative_sum_offset]\n"
5245       "ldr w1, %[additive_sum_offset]\n"
5246       "mov v0.s[0], w0\n"
5247       "dup v1.4s, w1\n"
5248       "uaddlp v8.4s, v8.8h\n"
5249       "uaddlp v9.4s, v9.8h\n"
5250       "uaddlp v10.4s, v10.8h\n"
5251       "uaddlp v11.4s, v11.8h\n"
5252       "uaddlp v12.4s, v12.8h\n"
5253       "uaddlp v13.4s, v13.8h\n"
5254       "uaddlp v14.4s, v14.8h\n"
5255       "uaddlp v15.4s, v15.8h\n"
5256       "addp v8.4s, v8.4s, v9.4s\n"
5257       "addp v10.4s, v10.4s, v11.4s\n"
5258       "addp v12.4s, v12.4s, v13.4s\n"
5259       "addp v14.4s, v14.4s, v15.4s\n"
5260       "addp v8.4s, v8.4s, v10.4s\n"
5261       "addp v9.4s, v12.4s, v14.4s\n"
5262       "mul v8.4s, v8.4s, v0.s[0]\n"
5263       "mul v9.4s, v9.4s, v0.s[0]\n"
5264       "add v8.4s, v8.4s, v1.4s\n"
5265       "add v9.4s, v9.4s, v1.4s\n"
5266       "st1 {v8.4s, v9.4s}, [%x[out]]\n"
5267       : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
5268       : [stride] "r"(params.stride),
5269         [multiplicative_sum_offset] "m"(params.multiplicative_sum_offset),
5270         [additive_sum_offset] "m"(params.additive_sum_offset)
5271       : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "v0", "v1", "v2", "v3", "v4",
5272         "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15",
5273         "cc", "memory");
5274 }
5275 
5276 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)5277 inline void Stream<uint8_t, 8, 8, 6, RowMajorWithSum>::Pack(
5278     const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
5279 #ifdef DEBUG
5280 #ifdef DEBUG_METAGEMM_VERBOSE
5281   std::cout << __FILE__ << "(" << __LINE__
5282             << ") RowMajorWithSum<uint8_t, 8, 8, 6, RowMajorWithSum>::Pack()"
5283             << std::endl
5284             << std::flush;
5285 #endif
5286 #endif
5287   int params_count_copy = params.count;
5288   asm volatile(
5289       "add x0, %x[in], %x[stride]\n"
5290       "add x1, x0, %x[stride]\n"
5291       "add x2, x1, %x[stride]\n"
5292       "add x3, x2, %x[stride]\n"
5293       "add x4, x3, %x[stride]\n"
5294       "add x5, x4, %x[stride]\n"
5295       "add x6, x5, %x[stride]\n"
5296       "movi v8.8h, #0\n"
5297       "movi v9.8h, #0\n"
5298       "movi v10.8h, #0\n"
5299       "movi v11.8h, #0\n"
5300       "movi v12.8h, #0\n"
5301       "movi v13.8h, #0\n"
5302       "movi v14.8h, #0\n"
5303       "movi v15.8h, #0\n"
5304 
5305       // Reduce count by leftovers.
5306       "subs %x[count], %x[count], #6\n"
5307       "beq 2f\n"
5308 
5309       "1:"
5310       "subs %x[count], %x[count], #8\n"
5311 
5312       // Load Aggregate Store: 8x8.
5313       "ld1 {v0.2s}, [%x[in]], #8\n"
5314       "ld1 {v1.2s}, [x0], #8\n"
5315       "ld1 {v2.2s}, [x1], #8\n"
5316       "ld1 {v3.2s}, [x2], #8\n"
5317       "ld1 {v4.2s}, [x3], #8\n"
5318       "ld1 {v5.2s}, [x4], #8\n"
5319       "ld1 {v6.2s}, [x5], #8\n"
5320       "ld1 {v7.2s}, [x6], #8\n"
5321       "uaddw v8.8h, v8.8h, v0.8b\n"
5322       "uaddw v9.8h, v9.8h, v1.8b\n"
5323       "uaddw v10.8h, v10.8h, v2.8b\n"
5324       "uaddw v11.8h, v11.8h, v3.8b\n"
5325       "uaddw v12.8h, v12.8h, v4.8b\n"
5326       "uaddw v13.8h, v13.8h, v5.8b\n"
5327       "uaddw v14.8h, v14.8h, v6.8b\n"
5328       "uaddw v15.8h, v15.8h, v7.8b\n"
5329       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
5330       "st1 {v4.2s, v5.2s, v6.2s, v7.2s}, [%x[out]], #32\n"
5331 
5332       "bne 1b\n"
5333 
5334       "2:"
5335 
5336       // Load Aggregate Store: 8x6.
5337       "movi v0.8b, #0\n"
5338       "movi v1.8b, #0\n"
5339       "movi v2.8b, #0\n"
5340       "movi v3.8b, #0\n"
5341       "movi v4.8b, #0\n"
5342       "movi v5.8b, #0\n"
5343       "movi v6.8b, #0\n"
5344       "movi v7.8b, #0\n"
5345       "ld1 {v0.s}[0], [%x[in]], #4\n"
5346       "ld1 {v0.h}[2], [%x[in]], #2\n"
5347       "ld1 {v1.s}[0], [x0], #4\n"
5348       "ld1 {v1.h}[2], [x0], #2\n"
5349       "ld1 {v2.s}[0], [x1], #4\n"
5350       "ld1 {v2.h}[2], [x1], #2\n"
5351       "ld1 {v3.s}[0], [x2], #4\n"
5352       "ld1 {v3.h}[2], [x2], #2\n"
5353       "ld1 {v4.s}[0], [x3], #4\n"
5354       "ld1 {v4.h}[2], [x3], #2\n"
5355       "ld1 {v5.s}[0], [x4], #4\n"
5356       "ld1 {v5.h}[2], [x4], #2\n"
5357       "ld1 {v6.s}[0], [x5], #4\n"
5358       "ld1 {v6.h}[2], [x5], #2\n"
5359       "ld1 {v7.s}[0], [x6], #4\n"
5360       "ld1 {v7.h}[2], [x6], #2\n"
5361       "uaddw v8.8h, v8.8h, v0.8b\n"
5362       "uaddw v9.8h, v9.8h, v1.8b\n"
5363       "uaddw v10.8h, v10.8h, v2.8b\n"
5364       "uaddw v11.8h, v11.8h, v3.8b\n"
5365       "uaddw v12.8h, v12.8h, v4.8b\n"
5366       "uaddw v13.8h, v13.8h, v5.8b\n"
5367       "uaddw v14.8h, v14.8h, v6.8b\n"
5368       "uaddw v15.8h, v15.8h, v7.8b\n"
5369       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
5370       "st1 {v4.2s, v5.2s, v6.2s, v7.2s}, [%x[out]], #32\n"
5371 
5372       // Aggregator Reduction.
5373       "ldr w0, %[multiplicative_sum_offset]\n"
5374       "ldr w1, %[additive_sum_offset]\n"
5375       "mov v0.s[0], w0\n"
5376       "dup v1.4s, w1\n"
5377       "uaddlp v8.4s, v8.8h\n"
5378       "uaddlp v9.4s, v9.8h\n"
5379       "uaddlp v10.4s, v10.8h\n"
5380       "uaddlp v11.4s, v11.8h\n"
5381       "uaddlp v12.4s, v12.8h\n"
5382       "uaddlp v13.4s, v13.8h\n"
5383       "uaddlp v14.4s, v14.8h\n"
5384       "uaddlp v15.4s, v15.8h\n"
5385       "addp v8.4s, v8.4s, v9.4s\n"
5386       "addp v10.4s, v10.4s, v11.4s\n"
5387       "addp v12.4s, v12.4s, v13.4s\n"
5388       "addp v14.4s, v14.4s, v15.4s\n"
5389       "addp v8.4s, v8.4s, v10.4s\n"
5390       "addp v9.4s, v12.4s, v14.4s\n"
5391       "mul v8.4s, v8.4s, v0.s[0]\n"
5392       "mul v9.4s, v9.4s, v0.s[0]\n"
5393       "add v8.4s, v8.4s, v1.4s\n"
5394       "add v9.4s, v9.4s, v1.4s\n"
5395       "st1 {v8.4s, v9.4s}, [%x[out]]\n"
5396       : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
5397       : [stride] "r"(params.stride),
5398         [multiplicative_sum_offset] "m"(params.multiplicative_sum_offset),
5399         [additive_sum_offset] "m"(params.additive_sum_offset)
5400       : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "v0", "v1", "v2", "v3", "v4",
5401         "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15",
5402         "cc", "memory");
5403 }
5404 
5405 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)5406 inline void Stream<uint8_t, 8, 8, 7, RowMajorWithSum>::Pack(
5407     const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
5408 #ifdef DEBUG
5409 #ifdef DEBUG_METAGEMM_VERBOSE
5410   std::cout << __FILE__ << "(" << __LINE__
5411             << ") RowMajorWithSum<uint8_t, 8, 8, 7, RowMajorWithSum>::Pack()"
5412             << std::endl
5413             << std::flush;
5414 #endif
5415 #endif
5416   int params_count_copy = params.count;
5417   asm volatile(
5418       "add x0, %x[in], %x[stride]\n"
5419       "add x1, x0, %x[stride]\n"
5420       "add x2, x1, %x[stride]\n"
5421       "add x3, x2, %x[stride]\n"
5422       "add x4, x3, %x[stride]\n"
5423       "add x5, x4, %x[stride]\n"
5424       "add x6, x5, %x[stride]\n"
5425       "movi v8.8h, #0\n"
5426       "movi v9.8h, #0\n"
5427       "movi v10.8h, #0\n"
5428       "movi v11.8h, #0\n"
5429       "movi v12.8h, #0\n"
5430       "movi v13.8h, #0\n"
5431       "movi v14.8h, #0\n"
5432       "movi v15.8h, #0\n"
5433 
5434       // Reduce count by leftovers.
5435       "subs %x[count], %x[count], #7\n"
5436       "beq 2f\n"
5437 
5438       "1:"
5439       "subs %x[count], %x[count], #8\n"
5440 
5441       // Load Aggregate Store: 8x8.
5442       "ld1 {v0.2s}, [%x[in]], #8\n"
5443       "ld1 {v1.2s}, [x0], #8\n"
5444       "ld1 {v2.2s}, [x1], #8\n"
5445       "ld1 {v3.2s}, [x2], #8\n"
5446       "ld1 {v4.2s}, [x3], #8\n"
5447       "ld1 {v5.2s}, [x4], #8\n"
5448       "ld1 {v6.2s}, [x5], #8\n"
5449       "ld1 {v7.2s}, [x6], #8\n"
5450       "uaddw v8.8h, v8.8h, v0.8b\n"
5451       "uaddw v9.8h, v9.8h, v1.8b\n"
5452       "uaddw v10.8h, v10.8h, v2.8b\n"
5453       "uaddw v11.8h, v11.8h, v3.8b\n"
5454       "uaddw v12.8h, v12.8h, v4.8b\n"
5455       "uaddw v13.8h, v13.8h, v5.8b\n"
5456       "uaddw v14.8h, v14.8h, v6.8b\n"
5457       "uaddw v15.8h, v15.8h, v7.8b\n"
5458       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
5459       "st1 {v4.2s, v5.2s, v6.2s, v7.2s}, [%x[out]], #32\n"
5460 
5461       "bne 1b\n"
5462 
5463       "2:"
5464 
5465       // Load Aggregate Store: 8x7.
5466       "movi v0.8b, #0\n"
5467       "movi v1.8b, #0\n"
5468       "movi v2.8b, #0\n"
5469       "movi v3.8b, #0\n"
5470       "movi v4.8b, #0\n"
5471       "movi v5.8b, #0\n"
5472       "movi v6.8b, #0\n"
5473       "movi v7.8b, #0\n"
5474       "ld1 {v0.s}[0], [%x[in]], #4\n"
5475       "ld1 {v0.h}[2], [%x[in]], #2\n"
5476       "ld1 {v0.b}[6], [%x[in]], #1\n"
5477       "ld1 {v1.s}[0], [x0], #4\n"
5478       "ld1 {v1.h}[2], [x0], #2\n"
5479       "ld1 {v1.b}[6], [x0], #1\n"
5480       "ld1 {v2.s}[0], [x1], #4\n"
5481       "ld1 {v2.h}[2], [x1], #2\n"
5482       "ld1 {v2.b}[6], [x1], #1\n"
5483       "ld1 {v3.s}[0], [x2], #4\n"
5484       "ld1 {v3.h}[2], [x2], #2\n"
5485       "ld1 {v3.b}[6], [x2], #1\n"
5486       "ld1 {v4.s}[0], [x3], #4\n"
5487       "ld1 {v4.h}[2], [x3], #2\n"
5488       "ld1 {v4.b}[6], [x3], #1\n"
5489       "ld1 {v5.s}[0], [x4], #4\n"
5490       "ld1 {v5.h}[2], [x4], #2\n"
5491       "ld1 {v5.b}[6], [x4], #1\n"
5492       "ld1 {v6.s}[0], [x5], #4\n"
5493       "ld1 {v6.h}[2], [x5], #2\n"
5494       "ld1 {v6.b}[6], [x5], #1\n"
5495       "ld1 {v7.s}[0], [x6], #4\n"
5496       "ld1 {v7.h}[2], [x6], #2\n"
5497       "ld1 {v7.b}[6], [x6], #1\n"
5498       "uaddw v8.8h, v8.8h, v0.8b\n"
5499       "uaddw v9.8h, v9.8h, v1.8b\n"
5500       "uaddw v10.8h, v10.8h, v2.8b\n"
5501       "uaddw v11.8h, v11.8h, v3.8b\n"
5502       "uaddw v12.8h, v12.8h, v4.8b\n"
5503       "uaddw v13.8h, v13.8h, v5.8b\n"
5504       "uaddw v14.8h, v14.8h, v6.8b\n"
5505       "uaddw v15.8h, v15.8h, v7.8b\n"
5506       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
5507       "st1 {v4.2s, v5.2s, v6.2s, v7.2s}, [%x[out]], #32\n"
5508 
5509       // Aggregator Reduction.
5510       "ldr w0, %[multiplicative_sum_offset]\n"
5511       "ldr w1, %[additive_sum_offset]\n"
5512       "mov v0.s[0], w0\n"
5513       "dup v1.4s, w1\n"
5514       "uaddlp v8.4s, v8.8h\n"
5515       "uaddlp v9.4s, v9.8h\n"
5516       "uaddlp v10.4s, v10.8h\n"
5517       "uaddlp v11.4s, v11.8h\n"
5518       "uaddlp v12.4s, v12.8h\n"
5519       "uaddlp v13.4s, v13.8h\n"
5520       "uaddlp v14.4s, v14.8h\n"
5521       "uaddlp v15.4s, v15.8h\n"
5522       "addp v8.4s, v8.4s, v9.4s\n"
5523       "addp v10.4s, v10.4s, v11.4s\n"
5524       "addp v12.4s, v12.4s, v13.4s\n"
5525       "addp v14.4s, v14.4s, v15.4s\n"
5526       "addp v8.4s, v8.4s, v10.4s\n"
5527       "addp v9.4s, v12.4s, v14.4s\n"
5528       "mul v8.4s, v8.4s, v0.s[0]\n"
5529       "mul v9.4s, v9.4s, v0.s[0]\n"
5530       "add v8.4s, v8.4s, v1.4s\n"
5531       "add v9.4s, v9.4s, v1.4s\n"
5532       "st1 {v8.4s, v9.4s}, [%x[out]]\n"
5533       : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
5534       : [stride] "r"(params.stride),
5535         [multiplicative_sum_offset] "m"(params.multiplicative_sum_offset),
5536         [additive_sum_offset] "m"(params.additive_sum_offset)
5537       : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "v0", "v1", "v2", "v3", "v4",
5538         "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15",
5539         "cc", "memory");
5540 }
5541 
5542 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)5543 inline void Stream<uint8_t, 1, 8, 0, ColumnMajorWithSum>::Pack(
5544     const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
5545 #ifdef DEBUG
5546 #ifdef DEBUG_METAGEMM_VERBOSE
5547   std::cout
5548       << __FILE__ << "(" << __LINE__
5549       << ") ColumnMajorWithSum<uint8_t, 1, 8, 0, ColumnMajorWithSum>::Pack()"
5550       << std::endl
5551       << std::flush;
5552 #endif
5553 #endif
5554   int params_count_copy = params.count;
5555   int params_stride_copy = params.stride;
5556   asm volatile(
5557       "movi v8.8h, #0\n"
5558 
5559       "1:"
5560       "subs %x[count], %x[count], #8\n"
5561 
5562       // Load Aggregate Store - column major 1x8
5563       "ld1 {v0.b}[0], [%x[in]], %x[stride]\n"
5564       "ld1 {v0.b}[1], [%x[in]], %x[stride]\n"
5565       "ld1 {v0.b}[2], [%x[in]], %x[stride]\n"
5566       "ld1 {v0.b}[3], [%x[in]], %x[stride]\n"
5567       "ld1 {v0.b}[4], [%x[in]], %x[stride]\n"
5568       "ld1 {v0.b}[5], [%x[in]], %x[stride]\n"
5569       "ld1 {v0.b}[6], [%x[in]], %x[stride]\n"
5570       "ld1 {v0.b}[7], [%x[in]], %x[stride]\n"
5571       "prfm pldl1keep, [%x[in]]\n"
5572       "uaddw v8.8h, v8.8h, v0.8b\n"
5573       "st1 {v0.2s}, [%x[out]], #8\n"
5574 
5575       "bne 1b\n"
5576 
5577       // Aggregator Reduction.
5578       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
5579       "dup v1.4s, %w[additive_sum_offset]\n"
5580       "uaddlp v8.4s, v8.8h\n"
5581       "addp v8.4s, v8.4s, v8.4s\n"
5582       "addp v8.4s, v8.4s, v8.4s\n"
5583       "mul v8.4s, v8.4s, v0.s[0]\n"
5584       "add v8.4s, v8.4s, v1.4s\n"
5585       "st1 {v8.4s}, [%x[out]]\n"
5586       : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
5587         [out] "+r"(out), [in] "+r"(in)
5588       : [additive_sum_offset] "r"(params.additive_sum_offset),
5589         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
5590       : "v8", "v0", "v1", "cc", "memory");
5591 }
5592 
5593 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)5594 inline void Stream<uint8_t, 1, 8, 1, ColumnMajorWithSum>::Pack(
5595     const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
5596 #ifdef DEBUG
5597 #ifdef DEBUG_METAGEMM_VERBOSE
5598   std::cout
5599       << __FILE__ << "(" << __LINE__
5600       << ") ColumnMajorWithSum<uint8_t, 1, 8, 1, ColumnMajorWithSum>::Pack()"
5601       << std::endl
5602       << std::flush;
5603 #endif
5604 #endif
5605   int params_count_copy = params.count;
5606   int params_stride_copy = params.stride;
5607   asm volatile(
5608       "movi v8.8h, #0\n"
5609 
5610       // Reduce count by leftovers.
5611       "subs %x[count], %x[count], #1\n"
5612       "beq 2f\n"
5613 
5614       "1:"
5615       "subs %x[count], %x[count], #8\n"
5616 
5617       // Load Aggregate Store - column major 1x8
5618       "ld1 {v0.b}[0], [%x[in]], %x[stride]\n"
5619       "ld1 {v0.b}[1], [%x[in]], %x[stride]\n"
5620       "ld1 {v0.b}[2], [%x[in]], %x[stride]\n"
5621       "ld1 {v0.b}[3], [%x[in]], %x[stride]\n"
5622       "ld1 {v0.b}[4], [%x[in]], %x[stride]\n"
5623       "ld1 {v0.b}[5], [%x[in]], %x[stride]\n"
5624       "ld1 {v0.b}[6], [%x[in]], %x[stride]\n"
5625       "ld1 {v0.b}[7], [%x[in]], %x[stride]\n"
5626       "prfm pldl1keep, [%x[in]]\n"
5627       "uaddw v8.8h, v8.8h, v0.8b\n"
5628       "st1 {v0.2s}, [%x[out]], #8\n"
5629 
5630       "bne 1b\n"
5631 
5632       "2:"
5633 
5634       // Load Aggregate Store - column major 1x1
5635       "movi v0.8b, #0\n"
5636       "ld1 {v0.b}[0], [%x[in]], %x[stride]\n"
5637       "prfm pldl1keep, [%x[in]]\n"
5638       "uaddw v8.8h, v8.8h, v0.8b\n"
5639       "st1 {v0.2s}, [%x[out]], #8\n"
5640 
5641       // Aggregator Reduction.
5642       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
5643       "dup v1.4s, %w[additive_sum_offset]\n"
5644       "uaddlp v8.4s, v8.8h\n"
5645       "addp v8.4s, v8.4s, v8.4s\n"
5646       "addp v8.4s, v8.4s, v8.4s\n"
5647       "mul v8.4s, v8.4s, v0.s[0]\n"
5648       "add v8.4s, v8.4s, v1.4s\n"
5649       "st1 {v8.4s}, [%x[out]]\n"
5650       : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
5651         [out] "+r"(out), [in] "+r"(in)
5652       : [additive_sum_offset] "r"(params.additive_sum_offset),
5653         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
5654       : "v8", "v0", "v1", "cc", "memory");
5655 }
5656 
5657 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)5658 inline void Stream<uint8_t, 1, 8, 2, ColumnMajorWithSum>::Pack(
5659     const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
5660 #ifdef DEBUG
5661 #ifdef DEBUG_METAGEMM_VERBOSE
5662   std::cout
5663       << __FILE__ << "(" << __LINE__
5664       << ") ColumnMajorWithSum<uint8_t, 1, 8, 2, ColumnMajorWithSum>::Pack()"
5665       << std::endl
5666       << std::flush;
5667 #endif
5668 #endif
5669   int params_count_copy = params.count;
5670   int params_stride_copy = params.stride;
5671   asm volatile(
5672       "movi v8.8h, #0\n"
5673 
5674       // Reduce count by leftovers.
5675       "subs %x[count], %x[count], #2\n"
5676       "beq 2f\n"
5677 
5678       "1:"
5679       "subs %x[count], %x[count], #8\n"
5680 
5681       // Load Aggregate Store - column major 1x8
5682       "ld1 {v0.b}[0], [%x[in]], %x[stride]\n"
5683       "ld1 {v0.b}[1], [%x[in]], %x[stride]\n"
5684       "ld1 {v0.b}[2], [%x[in]], %x[stride]\n"
5685       "ld1 {v0.b}[3], [%x[in]], %x[stride]\n"
5686       "ld1 {v0.b}[4], [%x[in]], %x[stride]\n"
5687       "ld1 {v0.b}[5], [%x[in]], %x[stride]\n"
5688       "ld1 {v0.b}[6], [%x[in]], %x[stride]\n"
5689       "ld1 {v0.b}[7], [%x[in]], %x[stride]\n"
5690       "prfm pldl1keep, [%x[in]]\n"
5691       "uaddw v8.8h, v8.8h, v0.8b\n"
5692       "st1 {v0.2s}, [%x[out]], #8\n"
5693 
5694       "bne 1b\n"
5695 
5696       "2:"
5697 
5698       // Load Aggregate Store - column major 1x2
5699       "movi v0.8b, #0\n"
5700       "ld1 {v0.b}[0], [%x[in]], %x[stride]\n"
5701       "ld1 {v0.b}[1], [%x[in]], %x[stride]\n"
5702       "prfm pldl1keep, [%x[in]]\n"
5703       "uaddw v8.8h, v8.8h, v0.8b\n"
5704       "st1 {v0.2s}, [%x[out]], #8\n"
5705 
5706       // Aggregator Reduction.
5707       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
5708       "dup v1.4s, %w[additive_sum_offset]\n"
5709       "uaddlp v8.4s, v8.8h\n"
5710       "addp v8.4s, v8.4s, v8.4s\n"
5711       "addp v8.4s, v8.4s, v8.4s\n"
5712       "mul v8.4s, v8.4s, v0.s[0]\n"
5713       "add v8.4s, v8.4s, v1.4s\n"
5714       "st1 {v8.4s}, [%x[out]]\n"
5715       : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
5716         [out] "+r"(out), [in] "+r"(in)
5717       : [additive_sum_offset] "r"(params.additive_sum_offset),
5718         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
5719       : "v8", "v0", "v1", "cc", "memory");
5720 }
5721 
5722 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)5723 inline void Stream<uint8_t, 1, 8, 3, ColumnMajorWithSum>::Pack(
5724     const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
5725 #ifdef DEBUG
5726 #ifdef DEBUG_METAGEMM_VERBOSE
5727   std::cout
5728       << __FILE__ << "(" << __LINE__
5729       << ") ColumnMajorWithSum<uint8_t, 1, 8, 3, ColumnMajorWithSum>::Pack()"
5730       << std::endl
5731       << std::flush;
5732 #endif
5733 #endif
5734   int params_count_copy = params.count;
5735   int params_stride_copy = params.stride;
5736   asm volatile(
5737       "movi v8.8h, #0\n"
5738 
5739       // Reduce count by leftovers.
5740       "subs %x[count], %x[count], #3\n"
5741       "beq 2f\n"
5742 
5743       "1:"
5744       "subs %x[count], %x[count], #8\n"
5745 
5746       // Load Aggregate Store - column major 1x8
5747       "ld1 {v0.b}[0], [%x[in]], %x[stride]\n"
5748       "ld1 {v0.b}[1], [%x[in]], %x[stride]\n"
5749       "ld1 {v0.b}[2], [%x[in]], %x[stride]\n"
5750       "ld1 {v0.b}[3], [%x[in]], %x[stride]\n"
5751       "ld1 {v0.b}[4], [%x[in]], %x[stride]\n"
5752       "ld1 {v0.b}[5], [%x[in]], %x[stride]\n"
5753       "ld1 {v0.b}[6], [%x[in]], %x[stride]\n"
5754       "ld1 {v0.b}[7], [%x[in]], %x[stride]\n"
5755       "prfm pldl1keep, [%x[in]]\n"
5756       "uaddw v8.8h, v8.8h, v0.8b\n"
5757       "st1 {v0.2s}, [%x[out]], #8\n"
5758 
5759       "bne 1b\n"
5760 
5761       "2:"
5762 
5763       // Load Aggregate Store - column major 1x3
5764       "movi v0.8b, #0\n"
5765       "ld1 {v0.b}[0], [%x[in]], %x[stride]\n"
5766       "ld1 {v0.b}[1], [%x[in]], %x[stride]\n"
5767       "ld1 {v0.b}[2], [%x[in]], %x[stride]\n"
5768       "prfm pldl1keep, [%x[in]]\n"
5769       "uaddw v8.8h, v8.8h, v0.8b\n"
5770       "st1 {v0.2s}, [%x[out]], #8\n"
5771 
5772       // Aggregator Reduction.
5773       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
5774       "dup v1.4s, %w[additive_sum_offset]\n"
5775       "uaddlp v8.4s, v8.8h\n"
5776       "addp v8.4s, v8.4s, v8.4s\n"
5777       "addp v8.4s, v8.4s, v8.4s\n"
5778       "mul v8.4s, v8.4s, v0.s[0]\n"
5779       "add v8.4s, v8.4s, v1.4s\n"
5780       "st1 {v8.4s}, [%x[out]]\n"
5781       : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
5782         [out] "+r"(out), [in] "+r"(in)
5783       : [additive_sum_offset] "r"(params.additive_sum_offset),
5784         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
5785       : "v8", "v0", "v1", "cc", "memory");
5786 }
5787 
5788 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)5789 inline void Stream<uint8_t, 1, 8, 4, ColumnMajorWithSum>::Pack(
5790     const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
5791 #ifdef DEBUG
5792 #ifdef DEBUG_METAGEMM_VERBOSE
5793   std::cout
5794       << __FILE__ << "(" << __LINE__
5795       << ") ColumnMajorWithSum<uint8_t, 1, 8, 4, ColumnMajorWithSum>::Pack()"
5796       << std::endl
5797       << std::flush;
5798 #endif
5799 #endif
5800   int params_count_copy = params.count;
5801   int params_stride_copy = params.stride;
5802   asm volatile(
5803       "movi v8.8h, #0\n"
5804 
5805       // Reduce count by leftovers.
5806       "subs %x[count], %x[count], #4\n"
5807       "beq 2f\n"
5808 
5809       "1:"
5810       "subs %x[count], %x[count], #8\n"
5811 
5812       // Load Aggregate Store - column major 1x8
5813       "ld1 {v0.b}[0], [%x[in]], %x[stride]\n"
5814       "ld1 {v0.b}[1], [%x[in]], %x[stride]\n"
5815       "ld1 {v0.b}[2], [%x[in]], %x[stride]\n"
5816       "ld1 {v0.b}[3], [%x[in]], %x[stride]\n"
5817       "ld1 {v0.b}[4], [%x[in]], %x[stride]\n"
5818       "ld1 {v0.b}[5], [%x[in]], %x[stride]\n"
5819       "ld1 {v0.b}[6], [%x[in]], %x[stride]\n"
5820       "ld1 {v0.b}[7], [%x[in]], %x[stride]\n"
5821       "prfm pldl1keep, [%x[in]]\n"
5822       "uaddw v8.8h, v8.8h, v0.8b\n"
5823       "st1 {v0.2s}, [%x[out]], #8\n"
5824 
5825       "bne 1b\n"
5826 
5827       "2:"
5828 
5829       // Load Aggregate Store - column major 1x4
5830       "movi v0.8b, #0\n"
5831       "ld1 {v0.b}[0], [%x[in]], %x[stride]\n"
5832       "ld1 {v0.b}[1], [%x[in]], %x[stride]\n"
5833       "ld1 {v0.b}[2], [%x[in]], %x[stride]\n"
5834       "ld1 {v0.b}[3], [%x[in]], %x[stride]\n"
5835       "prfm pldl1keep, [%x[in]]\n"
5836       "uaddw v8.8h, v8.8h, v0.8b\n"
5837       "st1 {v0.2s}, [%x[out]], #8\n"
5838 
5839       // Aggregator Reduction.
5840       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
5841       "dup v1.4s, %w[additive_sum_offset]\n"
5842       "uaddlp v8.4s, v8.8h\n"
5843       "addp v8.4s, v8.4s, v8.4s\n"
5844       "addp v8.4s, v8.4s, v8.4s\n"
5845       "mul v8.4s, v8.4s, v0.s[0]\n"
5846       "add v8.4s, v8.4s, v1.4s\n"
5847       "st1 {v8.4s}, [%x[out]]\n"
5848       : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
5849         [out] "+r"(out), [in] "+r"(in)
5850       : [additive_sum_offset] "r"(params.additive_sum_offset),
5851         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
5852       : "v8", "v0", "v1", "cc", "memory");
5853 }
5854 
5855 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)5856 inline void Stream<uint8_t, 1, 8, 5, ColumnMajorWithSum>::Pack(
5857     const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
5858 #ifdef DEBUG
5859 #ifdef DEBUG_METAGEMM_VERBOSE
5860   std::cout
5861       << __FILE__ << "(" << __LINE__
5862       << ") ColumnMajorWithSum<uint8_t, 1, 8, 5, ColumnMajorWithSum>::Pack()"
5863       << std::endl
5864       << std::flush;
5865 #endif
5866 #endif
5867   int params_count_copy = params.count;
5868   int params_stride_copy = params.stride;
5869   asm volatile(
5870       "movi v8.8h, #0\n"
5871 
5872       // Reduce count by leftovers.
5873       "subs %x[count], %x[count], #5\n"
5874       "beq 2f\n"
5875 
5876       "1:"
5877       "subs %x[count], %x[count], #8\n"
5878 
5879       // Load Aggregate Store - column major 1x8
5880       "ld1 {v0.b}[0], [%x[in]], %x[stride]\n"
5881       "ld1 {v0.b}[1], [%x[in]], %x[stride]\n"
5882       "ld1 {v0.b}[2], [%x[in]], %x[stride]\n"
5883       "ld1 {v0.b}[3], [%x[in]], %x[stride]\n"
5884       "ld1 {v0.b}[4], [%x[in]], %x[stride]\n"
5885       "ld1 {v0.b}[5], [%x[in]], %x[stride]\n"
5886       "ld1 {v0.b}[6], [%x[in]], %x[stride]\n"
5887       "ld1 {v0.b}[7], [%x[in]], %x[stride]\n"
5888       "prfm pldl1keep, [%x[in]]\n"
5889       "uaddw v8.8h, v8.8h, v0.8b\n"
5890       "st1 {v0.2s}, [%x[out]], #8\n"
5891 
5892       "bne 1b\n"
5893 
5894       "2:"
5895 
5896       // Load Aggregate Store - column major 1x5
5897       "movi v0.8b, #0\n"
5898       "ld1 {v0.b}[0], [%x[in]], %x[stride]\n"
5899       "ld1 {v0.b}[1], [%x[in]], %x[stride]\n"
5900       "ld1 {v0.b}[2], [%x[in]], %x[stride]\n"
5901       "ld1 {v0.b}[3], [%x[in]], %x[stride]\n"
5902       "ld1 {v0.b}[4], [%x[in]], %x[stride]\n"
5903       "prfm pldl1keep, [%x[in]]\n"
5904       "uaddw v8.8h, v8.8h, v0.8b\n"
5905       "st1 {v0.2s}, [%x[out]], #8\n"
5906 
5907       // Aggregator Reduction.
5908       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
5909       "dup v1.4s, %w[additive_sum_offset]\n"
5910       "uaddlp v8.4s, v8.8h\n"
5911       "addp v8.4s, v8.4s, v8.4s\n"
5912       "addp v8.4s, v8.4s, v8.4s\n"
5913       "mul v8.4s, v8.4s, v0.s[0]\n"
5914       "add v8.4s, v8.4s, v1.4s\n"
5915       "st1 {v8.4s}, [%x[out]]\n"
5916       : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
5917         [out] "+r"(out), [in] "+r"(in)
5918       : [additive_sum_offset] "r"(params.additive_sum_offset),
5919         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
5920       : "v8", "v0", "v1", "cc", "memory");
5921 }
5922 
5923 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)5924 inline void Stream<uint8_t, 1, 8, 6, ColumnMajorWithSum>::Pack(
5925     const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
5926 #ifdef DEBUG
5927 #ifdef DEBUG_METAGEMM_VERBOSE
5928   std::cout
5929       << __FILE__ << "(" << __LINE__
5930       << ") ColumnMajorWithSum<uint8_t, 1, 8, 6, ColumnMajorWithSum>::Pack()"
5931       << std::endl
5932       << std::flush;
5933 #endif
5934 #endif
5935   int params_count_copy = params.count;
5936   int params_stride_copy = params.stride;
5937   asm volatile(
5938       "movi v8.8h, #0\n"
5939 
5940       // Reduce count by leftovers.
5941       "subs %x[count], %x[count], #6\n"
5942       "beq 2f\n"
5943 
5944       "1:"
5945       "subs %x[count], %x[count], #8\n"
5946 
5947       // Load Aggregate Store - column major 1x8
5948       "ld1 {v0.b}[0], [%x[in]], %x[stride]\n"
5949       "ld1 {v0.b}[1], [%x[in]], %x[stride]\n"
5950       "ld1 {v0.b}[2], [%x[in]], %x[stride]\n"
5951       "ld1 {v0.b}[3], [%x[in]], %x[stride]\n"
5952       "ld1 {v0.b}[4], [%x[in]], %x[stride]\n"
5953       "ld1 {v0.b}[5], [%x[in]], %x[stride]\n"
5954       "ld1 {v0.b}[6], [%x[in]], %x[stride]\n"
5955       "ld1 {v0.b}[7], [%x[in]], %x[stride]\n"
5956       "prfm pldl1keep, [%x[in]]\n"
5957       "uaddw v8.8h, v8.8h, v0.8b\n"
5958       "st1 {v0.2s}, [%x[out]], #8\n"
5959 
5960       "bne 1b\n"
5961 
5962       "2:"
5963 
5964       // Load Aggregate Store - column major 1x6
5965       "movi v0.8b, #0\n"
5966       "ld1 {v0.b}[0], [%x[in]], %x[stride]\n"
5967       "ld1 {v0.b}[1], [%x[in]], %x[stride]\n"
5968       "ld1 {v0.b}[2], [%x[in]], %x[stride]\n"
5969       "ld1 {v0.b}[3], [%x[in]], %x[stride]\n"
5970       "ld1 {v0.b}[4], [%x[in]], %x[stride]\n"
5971       "ld1 {v0.b}[5], [%x[in]], %x[stride]\n"
5972       "prfm pldl1keep, [%x[in]]\n"
5973       "uaddw v8.8h, v8.8h, v0.8b\n"
5974       "st1 {v0.2s}, [%x[out]], #8\n"
5975 
5976       // Aggregator Reduction.
5977       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
5978       "dup v1.4s, %w[additive_sum_offset]\n"
5979       "uaddlp v8.4s, v8.8h\n"
5980       "addp v8.4s, v8.4s, v8.4s\n"
5981       "addp v8.4s, v8.4s, v8.4s\n"
5982       "mul v8.4s, v8.4s, v0.s[0]\n"
5983       "add v8.4s, v8.4s, v1.4s\n"
5984       "st1 {v8.4s}, [%x[out]]\n"
5985       : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
5986         [out] "+r"(out), [in] "+r"(in)
5987       : [additive_sum_offset] "r"(params.additive_sum_offset),
5988         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
5989       : "v8", "v0", "v1", "cc", "memory");
5990 }
5991 
5992 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)5993 inline void Stream<uint8_t, 1, 8, 7, ColumnMajorWithSum>::Pack(
5994     const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
5995 #ifdef DEBUG
5996 #ifdef DEBUG_METAGEMM_VERBOSE
5997   std::cout
5998       << __FILE__ << "(" << __LINE__
5999       << ") ColumnMajorWithSum<uint8_t, 1, 8, 7, ColumnMajorWithSum>::Pack()"
6000       << std::endl
6001       << std::flush;
6002 #endif
6003 #endif
6004   int params_count_copy = params.count;
6005   int params_stride_copy = params.stride;
6006   asm volatile(
6007       "movi v8.8h, #0\n"
6008 
6009       // Reduce count by leftovers.
6010       "subs %x[count], %x[count], #7\n"
6011       "beq 2f\n"
6012 
6013       "1:"
6014       "subs %x[count], %x[count], #8\n"
6015 
6016       // Load Aggregate Store - column major 1x8
6017       "ld1 {v0.b}[0], [%x[in]], %x[stride]\n"
6018       "ld1 {v0.b}[1], [%x[in]], %x[stride]\n"
6019       "ld1 {v0.b}[2], [%x[in]], %x[stride]\n"
6020       "ld1 {v0.b}[3], [%x[in]], %x[stride]\n"
6021       "ld1 {v0.b}[4], [%x[in]], %x[stride]\n"
6022       "ld1 {v0.b}[5], [%x[in]], %x[stride]\n"
6023       "ld1 {v0.b}[6], [%x[in]], %x[stride]\n"
6024       "ld1 {v0.b}[7], [%x[in]], %x[stride]\n"
6025       "prfm pldl1keep, [%x[in]]\n"
6026       "uaddw v8.8h, v8.8h, v0.8b\n"
6027       "st1 {v0.2s}, [%x[out]], #8\n"
6028 
6029       "bne 1b\n"
6030 
6031       "2:"
6032 
6033       // Load Aggregate Store - column major 1x7
6034       "movi v0.8b, #0\n"
6035       "ld1 {v0.b}[0], [%x[in]], %x[stride]\n"
6036       "ld1 {v0.b}[1], [%x[in]], %x[stride]\n"
6037       "ld1 {v0.b}[2], [%x[in]], %x[stride]\n"
6038       "ld1 {v0.b}[3], [%x[in]], %x[stride]\n"
6039       "ld1 {v0.b}[4], [%x[in]], %x[stride]\n"
6040       "ld1 {v0.b}[5], [%x[in]], %x[stride]\n"
6041       "ld1 {v0.b}[6], [%x[in]], %x[stride]\n"
6042       "prfm pldl1keep, [%x[in]]\n"
6043       "uaddw v8.8h, v8.8h, v0.8b\n"
6044       "st1 {v0.2s}, [%x[out]], #8\n"
6045 
6046       // Aggregator Reduction.
6047       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
6048       "dup v1.4s, %w[additive_sum_offset]\n"
6049       "uaddlp v8.4s, v8.8h\n"
6050       "addp v8.4s, v8.4s, v8.4s\n"
6051       "addp v8.4s, v8.4s, v8.4s\n"
6052       "mul v8.4s, v8.4s, v0.s[0]\n"
6053       "add v8.4s, v8.4s, v1.4s\n"
6054       "st1 {v8.4s}, [%x[out]]\n"
6055       : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
6056         [out] "+r"(out), [in] "+r"(in)
6057       : [additive_sum_offset] "r"(params.additive_sum_offset),
6058         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
6059       : "v8", "v0", "v1", "cc", "memory");
6060 }
6061 
6062 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)6063 inline void Stream<uint8_t, 2, 8, 0, ColumnMajorWithSum>::Pack(
6064     const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
6065 #ifdef DEBUG
6066 #ifdef DEBUG_METAGEMM_VERBOSE
6067   std::cout
6068       << __FILE__ << "(" << __LINE__
6069       << ") ColumnMajorWithSum<uint8_t, 2, 8, 0, ColumnMajorWithSum>::Pack()"
6070       << std::endl
6071       << std::flush;
6072 #endif
6073 #endif
6074   int params_count_copy = params.count;
6075   int params_stride_copy = params.stride;
6076   asm volatile(
6077       "movi v8.8h, #0\n"
6078       "movi v9.8h, #0\n"
6079 
6080       "1:"
6081       "subs %x[count], %x[count], #8\n"
6082 
6083       // Load Aggregate Store - column major 2x8
6084       "ld1 {v0.h}[0], [%x[in]], %x[stride]\n"
6085       "ld1 {v0.h}[1], [%x[in]], %x[stride]\n"
6086       "ld1 {v0.h}[2], [%x[in]], %x[stride]\n"
6087       "ld1 {v0.h}[3], [%x[in]], %x[stride]\n"
6088       "ld1 {v1.h}[0], [%x[in]], %x[stride]\n"
6089       "ld1 {v1.h}[1], [%x[in]], %x[stride]\n"
6090       "ld1 {v1.h}[2], [%x[in]], %x[stride]\n"
6091       "ld1 {v1.h}[3], [%x[in]], %x[stride]\n"
6092       "prfm pldl1keep, [%x[in]]\n"
6093       "uzp1 v2.8b, v0.8b, v1.8b\n"
6094       "uzp2 v3.8b, v0.8b, v1.8b\n"
6095       "uaddw v8.8h, v8.8h, v2.8b\n"
6096       "uaddw v9.8h, v9.8h, v3.8b\n"
6097       "st1 {v2.2s, v3.2s}, [%x[out]], #16\n"
6098 
6099       "bne 1b\n"
6100 
6101       // Aggregator Reduction.
6102       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
6103       "dup v1.4s, %w[additive_sum_offset]\n"
6104       "uaddlp v8.4s, v8.8h\n"
6105       "uaddlp v9.4s, v9.8h\n"
6106       "addp v8.4s, v8.4s, v9.4s\n"
6107       "addp v8.4s, v8.4s, v8.4s\n"
6108       "mul v8.4s, v8.4s, v0.s[0]\n"
6109       "add v8.4s, v8.4s, v1.4s\n"
6110       "st1 {v8.4s}, [%x[out]]\n"
6111       : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
6112         [out] "+r"(out), [in] "+r"(in)
6113       : [additive_sum_offset] "r"(params.additive_sum_offset),
6114         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
6115       : "v0", "v1", "v2", "v3", "v8", "v9", "cc", "memory");
6116 }
6117 
6118 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)6119 inline void Stream<uint8_t, 2, 8, 1, ColumnMajorWithSum>::Pack(
6120     const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
6121 #ifdef DEBUG
6122 #ifdef DEBUG_METAGEMM_VERBOSE
6123   std::cout
6124       << __FILE__ << "(" << __LINE__
6125       << ") ColumnMajorWithSum<uint8_t, 2, 8, 1, ColumnMajorWithSum>::Pack()"
6126       << std::endl
6127       << std::flush;
6128 #endif
6129 #endif
6130   int params_count_copy = params.count;
6131   int params_stride_copy = params.stride;
6132   asm volatile(
6133       "movi v8.8h, #0\n"
6134       "movi v9.8h, #0\n"
6135 
6136       // Reduce count by leftovers.
6137       "subs %x[count], %x[count], #1\n"
6138       "beq 2f\n"
6139 
6140       "1:"
6141       "subs %x[count], %x[count], #8\n"
6142 
6143       // Load Aggregate Store - column major 2x8
6144       "ld1 {v0.h}[0], [%x[in]], %x[stride]\n"
6145       "ld1 {v0.h}[1], [%x[in]], %x[stride]\n"
6146       "ld1 {v0.h}[2], [%x[in]], %x[stride]\n"
6147       "ld1 {v0.h}[3], [%x[in]], %x[stride]\n"
6148       "ld1 {v1.h}[0], [%x[in]], %x[stride]\n"
6149       "ld1 {v1.h}[1], [%x[in]], %x[stride]\n"
6150       "ld1 {v1.h}[2], [%x[in]], %x[stride]\n"
6151       "ld1 {v1.h}[3], [%x[in]], %x[stride]\n"
6152       "prfm pldl1keep, [%x[in]]\n"
6153       "uzp1 v2.8b, v0.8b, v1.8b\n"
6154       "uzp2 v3.8b, v0.8b, v1.8b\n"
6155       "uaddw v8.8h, v8.8h, v2.8b\n"
6156       "uaddw v9.8h, v9.8h, v3.8b\n"
6157       "st1 {v2.2s, v3.2s}, [%x[out]], #16\n"
6158 
6159       "bne 1b\n"
6160 
6161       "2:"
6162 
6163       // Load Aggregate Store - column major 2x1
6164       "movi v0.8b, #0\n"
6165       "movi v1.8b, #0\n"
6166       "ld1 {v0.h}[0], [%x[in]], %x[stride]\n"
6167       "prfm pldl1keep, [%x[in]]\n"
6168       "uzp1 v2.8b, v0.8b, v1.8b\n"
6169       "uzp2 v3.8b, v0.8b, v1.8b\n"
6170       "uaddw v8.8h, v8.8h, v2.8b\n"
6171       "uaddw v9.8h, v9.8h, v3.8b\n"
6172       "st1 {v2.2s, v3.2s}, [%x[out]], #16\n"
6173 
6174       // Aggregator Reduction.
6175       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
6176       "dup v1.4s, %w[additive_sum_offset]\n"
6177       "uaddlp v8.4s, v8.8h\n"
6178       "uaddlp v9.4s, v9.8h\n"
6179       "addp v8.4s, v8.4s, v9.4s\n"
6180       "addp v8.4s, v8.4s, v8.4s\n"
6181       "mul v8.4s, v8.4s, v0.s[0]\n"
6182       "add v8.4s, v8.4s, v1.4s\n"
6183       "st1 {v8.4s}, [%x[out]]\n"
6184       : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
6185         [out] "+r"(out), [in] "+r"(in)
6186       : [additive_sum_offset] "r"(params.additive_sum_offset),
6187         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
6188       : "v0", "v1", "v2", "v3", "v8", "v9", "cc", "memory");
6189 }
6190 
6191 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)6192 inline void Stream<uint8_t, 2, 8, 2, ColumnMajorWithSum>::Pack(
6193     const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
6194 #ifdef DEBUG
6195 #ifdef DEBUG_METAGEMM_VERBOSE
6196   std::cout
6197       << __FILE__ << "(" << __LINE__
6198       << ") ColumnMajorWithSum<uint8_t, 2, 8, 2, ColumnMajorWithSum>::Pack()"
6199       << std::endl
6200       << std::flush;
6201 #endif
6202 #endif
6203   int params_count_copy = params.count;
6204   int params_stride_copy = params.stride;
6205   asm volatile(
6206       "movi v8.8h, #0\n"
6207       "movi v9.8h, #0\n"
6208 
6209       // Reduce count by leftovers.
6210       "subs %x[count], %x[count], #2\n"
6211       "beq 2f\n"
6212 
6213       "1:"
6214       "subs %x[count], %x[count], #8\n"
6215 
6216       // Load Aggregate Store - column major 2x8
6217       "ld1 {v0.h}[0], [%x[in]], %x[stride]\n"
6218       "ld1 {v0.h}[1], [%x[in]], %x[stride]\n"
6219       "ld1 {v0.h}[2], [%x[in]], %x[stride]\n"
6220       "ld1 {v0.h}[3], [%x[in]], %x[stride]\n"
6221       "ld1 {v1.h}[0], [%x[in]], %x[stride]\n"
6222       "ld1 {v1.h}[1], [%x[in]], %x[stride]\n"
6223       "ld1 {v1.h}[2], [%x[in]], %x[stride]\n"
6224       "ld1 {v1.h}[3], [%x[in]], %x[stride]\n"
6225       "prfm pldl1keep, [%x[in]]\n"
6226       "uzp1 v2.8b, v0.8b, v1.8b\n"
6227       "uzp2 v3.8b, v0.8b, v1.8b\n"
6228       "uaddw v8.8h, v8.8h, v2.8b\n"
6229       "uaddw v9.8h, v9.8h, v3.8b\n"
6230       "st1 {v2.2s, v3.2s}, [%x[out]], #16\n"
6231 
6232       "bne 1b\n"
6233 
6234       "2:"
6235 
6236       // Load Aggregate Store - column major 2x2
6237       "movi v0.8b, #0\n"
6238       "movi v1.8b, #0\n"
6239       "ld1 {v0.h}[0], [%x[in]], %x[stride]\n"
6240       "ld1 {v0.h}[1], [%x[in]], %x[stride]\n"
6241       "prfm pldl1keep, [%x[in]]\n"
6242       "uzp1 v2.8b, v0.8b, v1.8b\n"
6243       "uzp2 v3.8b, v0.8b, v1.8b\n"
6244       "uaddw v8.8h, v8.8h, v2.8b\n"
6245       "uaddw v9.8h, v9.8h, v3.8b\n"
6246       "st1 {v2.2s, v3.2s}, [%x[out]], #16\n"
6247 
6248       // Aggregator Reduction.
6249       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
6250       "dup v1.4s, %w[additive_sum_offset]\n"
6251       "uaddlp v8.4s, v8.8h\n"
6252       "uaddlp v9.4s, v9.8h\n"
6253       "addp v8.4s, v8.4s, v9.4s\n"
6254       "addp v8.4s, v8.4s, v8.4s\n"
6255       "mul v8.4s, v8.4s, v0.s[0]\n"
6256       "add v8.4s, v8.4s, v1.4s\n"
6257       "st1 {v8.4s}, [%x[out]]\n"
6258       : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
6259         [out] "+r"(out), [in] "+r"(in)
6260       : [additive_sum_offset] "r"(params.additive_sum_offset),
6261         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
6262       : "v0", "v1", "v2", "v3", "v8", "v9", "cc", "memory");
6263 }
6264 
6265 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)6266 inline void Stream<uint8_t, 2, 8, 3, ColumnMajorWithSum>::Pack(
6267     const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
6268 #ifdef DEBUG
6269 #ifdef DEBUG_METAGEMM_VERBOSE
6270   std::cout
6271       << __FILE__ << "(" << __LINE__
6272       << ") ColumnMajorWithSum<uint8_t, 2, 8, 3, ColumnMajorWithSum>::Pack()"
6273       << std::endl
6274       << std::flush;
6275 #endif
6276 #endif
6277   int params_count_copy = params.count;
6278   int params_stride_copy = params.stride;
6279   asm volatile(
6280       "movi v8.8h, #0\n"
6281       "movi v9.8h, #0\n"
6282 
6283       // Reduce count by leftovers.
6284       "subs %x[count], %x[count], #3\n"
6285       "beq 2f\n"
6286 
6287       "1:"
6288       "subs %x[count], %x[count], #8\n"
6289 
6290       // Load Aggregate Store - column major 2x8
6291       "ld1 {v0.h}[0], [%x[in]], %x[stride]\n"
6292       "ld1 {v0.h}[1], [%x[in]], %x[stride]\n"
6293       "ld1 {v0.h}[2], [%x[in]], %x[stride]\n"
6294       "ld1 {v0.h}[3], [%x[in]], %x[stride]\n"
6295       "ld1 {v1.h}[0], [%x[in]], %x[stride]\n"
6296       "ld1 {v1.h}[1], [%x[in]], %x[stride]\n"
6297       "ld1 {v1.h}[2], [%x[in]], %x[stride]\n"
6298       "ld1 {v1.h}[3], [%x[in]], %x[stride]\n"
6299       "prfm pldl1keep, [%x[in]]\n"
6300       "uzp1 v2.8b, v0.8b, v1.8b\n"
6301       "uzp2 v3.8b, v0.8b, v1.8b\n"
6302       "uaddw v8.8h, v8.8h, v2.8b\n"
6303       "uaddw v9.8h, v9.8h, v3.8b\n"
6304       "st1 {v2.2s, v3.2s}, [%x[out]], #16\n"
6305 
6306       "bne 1b\n"
6307 
6308       "2:"
6309 
6310       // Load Aggregate Store - column major 2x3
6311       "movi v0.8b, #0\n"
6312       "movi v1.8b, #0\n"
6313       "ld1 {v0.h}[0], [%x[in]], %x[stride]\n"
6314       "ld1 {v0.h}[1], [%x[in]], %x[stride]\n"
6315       "ld1 {v0.h}[2], [%x[in]], %x[stride]\n"
6316       "prfm pldl1keep, [%x[in]]\n"
6317       "uzp1 v2.8b, v0.8b, v1.8b\n"
6318       "uzp2 v3.8b, v0.8b, v1.8b\n"
6319       "uaddw v8.8h, v8.8h, v2.8b\n"
6320       "uaddw v9.8h, v9.8h, v3.8b\n"
6321       "st1 {v2.2s, v3.2s}, [%x[out]], #16\n"
6322 
6323       // Aggregator Reduction.
6324       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
6325       "dup v1.4s, %w[additive_sum_offset]\n"
6326       "uaddlp v8.4s, v8.8h\n"
6327       "uaddlp v9.4s, v9.8h\n"
6328       "addp v8.4s, v8.4s, v9.4s\n"
6329       "addp v8.4s, v8.4s, v8.4s\n"
6330       "mul v8.4s, v8.4s, v0.s[0]\n"
6331       "add v8.4s, v8.4s, v1.4s\n"
6332       "st1 {v8.4s}, [%x[out]]\n"
6333       : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
6334         [out] "+r"(out), [in] "+r"(in)
6335       : [additive_sum_offset] "r"(params.additive_sum_offset),
6336         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
6337       : "v0", "v1", "v2", "v3", "v8", "v9", "cc", "memory");
6338 }
6339 
6340 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)6341 inline void Stream<uint8_t, 2, 8, 4, ColumnMajorWithSum>::Pack(
6342     const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
6343 #ifdef DEBUG
6344 #ifdef DEBUG_METAGEMM_VERBOSE
6345   std::cout
6346       << __FILE__ << "(" << __LINE__
6347       << ") ColumnMajorWithSum<uint8_t, 2, 8, 4, ColumnMajorWithSum>::Pack()"
6348       << std::endl
6349       << std::flush;
6350 #endif
6351 #endif
6352   int params_count_copy = params.count;
6353   int params_stride_copy = params.stride;
6354   asm volatile(
6355       "movi v8.8h, #0\n"
6356       "movi v9.8h, #0\n"
6357 
6358       // Reduce count by leftovers.
6359       "subs %x[count], %x[count], #4\n"
6360       "beq 2f\n"
6361 
6362       "1:"
6363       "subs %x[count], %x[count], #8\n"
6364 
6365       // Load Aggregate Store - column major 2x8
6366       "ld1 {v0.h}[0], [%x[in]], %x[stride]\n"
6367       "ld1 {v0.h}[1], [%x[in]], %x[stride]\n"
6368       "ld1 {v0.h}[2], [%x[in]], %x[stride]\n"
6369       "ld1 {v0.h}[3], [%x[in]], %x[stride]\n"
6370       "ld1 {v1.h}[0], [%x[in]], %x[stride]\n"
6371       "ld1 {v1.h}[1], [%x[in]], %x[stride]\n"
6372       "ld1 {v1.h}[2], [%x[in]], %x[stride]\n"
6373       "ld1 {v1.h}[3], [%x[in]], %x[stride]\n"
6374       "prfm pldl1keep, [%x[in]]\n"
6375       "uzp1 v2.8b, v0.8b, v1.8b\n"
6376       "uzp2 v3.8b, v0.8b, v1.8b\n"
6377       "uaddw v8.8h, v8.8h, v2.8b\n"
6378       "uaddw v9.8h, v9.8h, v3.8b\n"
6379       "st1 {v2.2s, v3.2s}, [%x[out]], #16\n"
6380 
6381       "bne 1b\n"
6382 
6383       "2:"
6384 
6385       // Load Aggregate Store - column major 2x4
6386       "movi v0.8b, #0\n"
6387       "movi v1.8b, #0\n"
6388       "ld1 {v0.h}[0], [%x[in]], %x[stride]\n"
6389       "ld1 {v0.h}[1], [%x[in]], %x[stride]\n"
6390       "ld1 {v0.h}[2], [%x[in]], %x[stride]\n"
6391       "ld1 {v0.h}[3], [%x[in]], %x[stride]\n"
6392       "prfm pldl1keep, [%x[in]]\n"
6393       "uzp1 v2.8b, v0.8b, v1.8b\n"
6394       "uzp2 v3.8b, v0.8b, v1.8b\n"
6395       "uaddw v8.8h, v8.8h, v2.8b\n"
6396       "uaddw v9.8h, v9.8h, v3.8b\n"
6397       "st1 {v2.2s, v3.2s}, [%x[out]], #16\n"
6398 
6399       // Aggregator Reduction.
6400       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
6401       "dup v1.4s, %w[additive_sum_offset]\n"
6402       "uaddlp v8.4s, v8.8h\n"
6403       "uaddlp v9.4s, v9.8h\n"
6404       "addp v8.4s, v8.4s, v9.4s\n"
6405       "addp v8.4s, v8.4s, v8.4s\n"
6406       "mul v8.4s, v8.4s, v0.s[0]\n"
6407       "add v8.4s, v8.4s, v1.4s\n"
6408       "st1 {v8.4s}, [%x[out]]\n"
6409       : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
6410         [out] "+r"(out), [in] "+r"(in)
6411       : [additive_sum_offset] "r"(params.additive_sum_offset),
6412         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
6413       : "v0", "v1", "v2", "v3", "v8", "v9", "cc", "memory");
6414 }
6415 
6416 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)6417 inline void Stream<uint8_t, 2, 8, 5, ColumnMajorWithSum>::Pack(
6418     const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
6419 #ifdef DEBUG
6420 #ifdef DEBUG_METAGEMM_VERBOSE
6421   std::cout
6422       << __FILE__ << "(" << __LINE__
6423       << ") ColumnMajorWithSum<uint8_t, 2, 8, 5, ColumnMajorWithSum>::Pack()"
6424       << std::endl
6425       << std::flush;
6426 #endif
6427 #endif
6428   int params_count_copy = params.count;
6429   int params_stride_copy = params.stride;
6430   asm volatile(
6431       "movi v8.8h, #0\n"
6432       "movi v9.8h, #0\n"
6433 
6434       // Reduce count by leftovers.
6435       "subs %x[count], %x[count], #5\n"
6436       "beq 2f\n"
6437 
6438       "1:"
6439       "subs %x[count], %x[count], #8\n"
6440 
6441       // Load Aggregate Store - column major 2x8
6442       "ld1 {v0.h}[0], [%x[in]], %x[stride]\n"
6443       "ld1 {v0.h}[1], [%x[in]], %x[stride]\n"
6444       "ld1 {v0.h}[2], [%x[in]], %x[stride]\n"
6445       "ld1 {v0.h}[3], [%x[in]], %x[stride]\n"
6446       "ld1 {v1.h}[0], [%x[in]], %x[stride]\n"
6447       "ld1 {v1.h}[1], [%x[in]], %x[stride]\n"
6448       "ld1 {v1.h}[2], [%x[in]], %x[stride]\n"
6449       "ld1 {v1.h}[3], [%x[in]], %x[stride]\n"
6450       "prfm pldl1keep, [%x[in]]\n"
6451       "uzp1 v2.8b, v0.8b, v1.8b\n"
6452       "uzp2 v3.8b, v0.8b, v1.8b\n"
6453       "uaddw v8.8h, v8.8h, v2.8b\n"
6454       "uaddw v9.8h, v9.8h, v3.8b\n"
6455       "st1 {v2.2s, v3.2s}, [%x[out]], #16\n"
6456 
6457       "bne 1b\n"
6458 
6459       "2:"
6460 
6461       // Load Aggregate Store - column major 2x5
6462       "movi v0.8b, #0\n"
6463       "movi v1.8b, #0\n"
6464       "ld1 {v0.h}[0], [%x[in]], %x[stride]\n"
6465       "ld1 {v0.h}[1], [%x[in]], %x[stride]\n"
6466       "ld1 {v0.h}[2], [%x[in]], %x[stride]\n"
6467       "ld1 {v0.h}[3], [%x[in]], %x[stride]\n"
6468       "ld1 {v1.h}[0], [%x[in]], %x[stride]\n"
6469       "prfm pldl1keep, [%x[in]]\n"
6470       "uzp1 v2.8b, v0.8b, v1.8b\n"
6471       "uzp2 v3.8b, v0.8b, v1.8b\n"
6472       "uaddw v8.8h, v8.8h, v2.8b\n"
6473       "uaddw v9.8h, v9.8h, v3.8b\n"
6474       "st1 {v2.2s, v3.2s}, [%x[out]], #16\n"
6475 
6476       // Aggregator Reduction.
6477       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
6478       "dup v1.4s, %w[additive_sum_offset]\n"
6479       "uaddlp v8.4s, v8.8h\n"
6480       "uaddlp v9.4s, v9.8h\n"
6481       "addp v8.4s, v8.4s, v9.4s\n"
6482       "addp v8.4s, v8.4s, v8.4s\n"
6483       "mul v8.4s, v8.4s, v0.s[0]\n"
6484       "add v8.4s, v8.4s, v1.4s\n"
6485       "st1 {v8.4s}, [%x[out]]\n"
6486       : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
6487         [out] "+r"(out), [in] "+r"(in)
6488       : [additive_sum_offset] "r"(params.additive_sum_offset),
6489         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
6490       : "v0", "v1", "v2", "v3", "v8", "v9", "cc", "memory");
6491 }
6492 
6493 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)6494 inline void Stream<uint8_t, 2, 8, 6, ColumnMajorWithSum>::Pack(
6495     const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
6496 #ifdef DEBUG
6497 #ifdef DEBUG_METAGEMM_VERBOSE
6498   std::cout
6499       << __FILE__ << "(" << __LINE__
6500       << ") ColumnMajorWithSum<uint8_t, 2, 8, 6, ColumnMajorWithSum>::Pack()"
6501       << std::endl
6502       << std::flush;
6503 #endif
6504 #endif
6505   int params_count_copy = params.count;
6506   int params_stride_copy = params.stride;
6507   asm volatile(
6508       "movi v8.8h, #0\n"
6509       "movi v9.8h, #0\n"
6510 
6511       // Reduce count by leftovers.
6512       "subs %x[count], %x[count], #6\n"
6513       "beq 2f\n"
6514 
6515       "1:"
6516       "subs %x[count], %x[count], #8\n"
6517 
6518       // Load Aggregate Store - column major 2x8
6519       "ld1 {v0.h}[0], [%x[in]], %x[stride]\n"
6520       "ld1 {v0.h}[1], [%x[in]], %x[stride]\n"
6521       "ld1 {v0.h}[2], [%x[in]], %x[stride]\n"
6522       "ld1 {v0.h}[3], [%x[in]], %x[stride]\n"
6523       "ld1 {v1.h}[0], [%x[in]], %x[stride]\n"
6524       "ld1 {v1.h}[1], [%x[in]], %x[stride]\n"
6525       "ld1 {v1.h}[2], [%x[in]], %x[stride]\n"
6526       "ld1 {v1.h}[3], [%x[in]], %x[stride]\n"
6527       "prfm pldl1keep, [%x[in]]\n"
6528       "uzp1 v2.8b, v0.8b, v1.8b\n"
6529       "uzp2 v3.8b, v0.8b, v1.8b\n"
6530       "uaddw v8.8h, v8.8h, v2.8b\n"
6531       "uaddw v9.8h, v9.8h, v3.8b\n"
6532       "st1 {v2.2s, v3.2s}, [%x[out]], #16\n"
6533 
6534       "bne 1b\n"
6535 
6536       "2:"
6537 
6538       // Load Aggregate Store - column major 2x6
6539       "movi v0.8b, #0\n"
6540       "movi v1.8b, #0\n"
6541       "ld1 {v0.h}[0], [%x[in]], %x[stride]\n"
6542       "ld1 {v0.h}[1], [%x[in]], %x[stride]\n"
6543       "ld1 {v0.h}[2], [%x[in]], %x[stride]\n"
6544       "ld1 {v0.h}[3], [%x[in]], %x[stride]\n"
6545       "ld1 {v1.h}[0], [%x[in]], %x[stride]\n"
6546       "ld1 {v1.h}[1], [%x[in]], %x[stride]\n"
6547       "prfm pldl1keep, [%x[in]]\n"
6548       "uzp1 v2.8b, v0.8b, v1.8b\n"
6549       "uzp2 v3.8b, v0.8b, v1.8b\n"
6550       "uaddw v8.8h, v8.8h, v2.8b\n"
6551       "uaddw v9.8h, v9.8h, v3.8b\n"
6552       "st1 {v2.2s, v3.2s}, [%x[out]], #16\n"
6553 
6554       // Aggregator Reduction.
6555       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
6556       "dup v1.4s, %w[additive_sum_offset]\n"
6557       "uaddlp v8.4s, v8.8h\n"
6558       "uaddlp v9.4s, v9.8h\n"
6559       "addp v8.4s, v8.4s, v9.4s\n"
6560       "addp v8.4s, v8.4s, v8.4s\n"
6561       "mul v8.4s, v8.4s, v0.s[0]\n"
6562       "add v8.4s, v8.4s, v1.4s\n"
6563       "st1 {v8.4s}, [%x[out]]\n"
6564       : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
6565         [out] "+r"(out), [in] "+r"(in)
6566       : [additive_sum_offset] "r"(params.additive_sum_offset),
6567         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
6568       : "v0", "v1", "v2", "v3", "v8", "v9", "cc", "memory");
6569 }
6570 
6571 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)6572 inline void Stream<uint8_t, 2, 8, 7, ColumnMajorWithSum>::Pack(
6573     const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
6574 #ifdef DEBUG
6575 #ifdef DEBUG_METAGEMM_VERBOSE
6576   std::cout
6577       << __FILE__ << "(" << __LINE__
6578       << ") ColumnMajorWithSum<uint8_t, 2, 8, 7, ColumnMajorWithSum>::Pack()"
6579       << std::endl
6580       << std::flush;
6581 #endif
6582 #endif
6583   int params_count_copy = params.count;
6584   int params_stride_copy = params.stride;
6585   asm volatile(
6586       "movi v8.8h, #0\n"
6587       "movi v9.8h, #0\n"
6588 
6589       // Reduce count by leftovers.
6590       "subs %x[count], %x[count], #7\n"
6591       "beq 2f\n"
6592 
6593       "1:"
6594       "subs %x[count], %x[count], #8\n"
6595 
6596       // Load Aggregate Store - column major 2x8
6597       "ld1 {v0.h}[0], [%x[in]], %x[stride]\n"
6598       "ld1 {v0.h}[1], [%x[in]], %x[stride]\n"
6599       "ld1 {v0.h}[2], [%x[in]], %x[stride]\n"
6600       "ld1 {v0.h}[3], [%x[in]], %x[stride]\n"
6601       "ld1 {v1.h}[0], [%x[in]], %x[stride]\n"
6602       "ld1 {v1.h}[1], [%x[in]], %x[stride]\n"
6603       "ld1 {v1.h}[2], [%x[in]], %x[stride]\n"
6604       "ld1 {v1.h}[3], [%x[in]], %x[stride]\n"
6605       "prfm pldl1keep, [%x[in]]\n"
6606       "uzp1 v2.8b, v0.8b, v1.8b\n"
6607       "uzp2 v3.8b, v0.8b, v1.8b\n"
6608       "uaddw v8.8h, v8.8h, v2.8b\n"
6609       "uaddw v9.8h, v9.8h, v3.8b\n"
6610       "st1 {v2.2s, v3.2s}, [%x[out]], #16\n"
6611 
6612       "bne 1b\n"
6613 
6614       "2:"
6615 
6616       // Load Aggregate Store - column major 2x7
6617       "movi v0.8b, #0\n"
6618       "movi v1.8b, #0\n"
6619       "ld1 {v0.h}[0], [%x[in]], %x[stride]\n"
6620       "ld1 {v0.h}[1], [%x[in]], %x[stride]\n"
6621       "ld1 {v0.h}[2], [%x[in]], %x[stride]\n"
6622       "ld1 {v0.h}[3], [%x[in]], %x[stride]\n"
6623       "ld1 {v1.h}[0], [%x[in]], %x[stride]\n"
6624       "ld1 {v1.h}[1], [%x[in]], %x[stride]\n"
6625       "ld1 {v1.h}[2], [%x[in]], %x[stride]\n"
6626       "prfm pldl1keep, [%x[in]]\n"
6627       "uzp1 v2.8b, v0.8b, v1.8b\n"
6628       "uzp2 v3.8b, v0.8b, v1.8b\n"
6629       "uaddw v8.8h, v8.8h, v2.8b\n"
6630       "uaddw v9.8h, v9.8h, v3.8b\n"
6631       "st1 {v2.2s, v3.2s}, [%x[out]], #16\n"
6632 
6633       // Aggregator Reduction.
6634       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
6635       "dup v1.4s, %w[additive_sum_offset]\n"
6636       "uaddlp v8.4s, v8.8h\n"
6637       "uaddlp v9.4s, v9.8h\n"
6638       "addp v8.4s, v8.4s, v9.4s\n"
6639       "addp v8.4s, v8.4s, v8.4s\n"
6640       "mul v8.4s, v8.4s, v0.s[0]\n"
6641       "add v8.4s, v8.4s, v1.4s\n"
6642       "st1 {v8.4s}, [%x[out]]\n"
6643       : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
6644         [out] "+r"(out), [in] "+r"(in)
6645       : [additive_sum_offset] "r"(params.additive_sum_offset),
6646         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
6647       : "v0", "v1", "v2", "v3", "v8", "v9", "cc", "memory");
6648 }
6649 
6650 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)6651 inline void Stream<uint8_t, 3, 8, 0, ColumnMajorWithSum>::Pack(
6652     const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
6653 #ifdef DEBUG
6654 #ifdef DEBUG_METAGEMM_VERBOSE
6655   std::cout
6656       << __FILE__ << "(" << __LINE__
6657       << ") ColumnMajorWithSum<uint8_t, 3, 8, 0, ColumnMajorWithSum>::Pack()"
6658       << std::endl
6659       << std::flush;
6660 #endif
6661 #endif
6662   int params_count_copy = params.count;
6663   int params_stride_copy = params.stride;
6664   asm volatile(
6665       "movi v8.8h, #0\n"
6666       "movi v9.8h, #0\n"
6667       "movi v10.8h, #0\n"
6668 
6669       "1:"
6670       "subs %x[count], %x[count], #8\n"
6671 
6672       // Load Aggregate Store - column major 3x8
6673       "ld3 {v0.b, v1.b, v2.b}[0], [%x[in]], %x[stride]\n"
6674       "ld3 {v0.b, v1.b, v2.b}[1], [%x[in]], %x[stride]\n"
6675       "ld3 {v0.b, v1.b, v2.b}[2], [%x[in]], %x[stride]\n"
6676       "ld3 {v0.b, v1.b, v2.b}[3], [%x[in]], %x[stride]\n"
6677       "ld3 {v0.b, v1.b, v2.b}[4], [%x[in]], %x[stride]\n"
6678       "ld3 {v0.b, v1.b, v2.b}[5], [%x[in]], %x[stride]\n"
6679       "ld3 {v0.b, v1.b, v2.b}[6], [%x[in]], %x[stride]\n"
6680       "ld3 {v0.b, v1.b, v2.b}[7], [%x[in]], %x[stride]\n"
6681       "prfm pldl1keep, [%x[in]]\n"
6682       "uaddw v8.8h, v8.8h, v0.8b\n"
6683       "uaddw v9.8h, v9.8h, v1.8b\n"
6684       "uaddw v10.8h, v10.8h, v2.8b\n"
6685       "st1 {v0.2s, v1.2s, v2.2s}, [%x[out]], #24\n"
6686 
6687       "bne 1b\n"
6688 
6689       // Aggregator Reduction.
6690       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
6691       "dup v1.4s, %w[additive_sum_offset]\n"
6692       "uaddlp v8.4s, v8.8h\n"
6693       "uaddlp v9.4s, v9.8h\n"
6694       "uaddlp v10.4s, v10.8h\n"
6695       "addp v8.4s, v8.4s, v9.4s\n"
6696       "addp v10.4s, v10.4s, v10.4s\n"
6697       "addp v8.4s, v8.4s, v10.4s\n"
6698       "mul v8.4s, v8.4s, v0.s[0]\n"
6699       "add v8.4s, v8.4s, v1.4s\n"
6700       "st1 {v8.4s}, [%x[out]]\n"
6701       : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
6702         [out] "+r"(out), [in] "+r"(in)
6703       : [additive_sum_offset] "r"(params.additive_sum_offset),
6704         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
6705       : "v0", "v1", "v2", "v8", "v9", "v10", "cc", "memory");
6706 }
6707 
6708 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)6709 inline void Stream<uint8_t, 3, 8, 1, ColumnMajorWithSum>::Pack(
6710     const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
6711 #ifdef DEBUG
6712 #ifdef DEBUG_METAGEMM_VERBOSE
6713   std::cout
6714       << __FILE__ << "(" << __LINE__
6715       << ") ColumnMajorWithSum<uint8_t, 3, 8, 1, ColumnMajorWithSum>::Pack()"
6716       << std::endl
6717       << std::flush;
6718 #endif
6719 #endif
6720   int params_count_copy = params.count;
6721   int params_stride_copy = params.stride;
6722   asm volatile(
6723       "movi v8.8h, #0\n"
6724       "movi v9.8h, #0\n"
6725       "movi v10.8h, #0\n"
6726 
6727       // Reduce count by leftovers.
6728       "subs %x[count], %x[count], #1\n"
6729       "beq 2f\n"
6730 
6731       "1:"
6732       "subs %x[count], %x[count], #8\n"
6733 
6734       // Load Aggregate Store - column major 3x8
6735       "ld3 {v0.b, v1.b, v2.b}[0], [%x[in]], %x[stride]\n"
6736       "ld3 {v0.b, v1.b, v2.b}[1], [%x[in]], %x[stride]\n"
6737       "ld3 {v0.b, v1.b, v2.b}[2], [%x[in]], %x[stride]\n"
6738       "ld3 {v0.b, v1.b, v2.b}[3], [%x[in]], %x[stride]\n"
6739       "ld3 {v0.b, v1.b, v2.b}[4], [%x[in]], %x[stride]\n"
6740       "ld3 {v0.b, v1.b, v2.b}[5], [%x[in]], %x[stride]\n"
6741       "ld3 {v0.b, v1.b, v2.b}[6], [%x[in]], %x[stride]\n"
6742       "ld3 {v0.b, v1.b, v2.b}[7], [%x[in]], %x[stride]\n"
6743       "prfm pldl1keep, [%x[in]]\n"
6744       "uaddw v8.8h, v8.8h, v0.8b\n"
6745       "uaddw v9.8h, v9.8h, v1.8b\n"
6746       "uaddw v10.8h, v10.8h, v2.8b\n"
6747       "st1 {v0.2s, v1.2s, v2.2s}, [%x[out]], #24\n"
6748 
6749       "bne 1b\n"
6750 
6751       "2:"
6752 
6753       // Load Aggregate Store - column major 3x1
6754       "movi v0.8b, #0\n"
6755       "movi v1.8b, #0\n"
6756       "movi v2.8b, #0\n"
6757       "ld3 {v0.b, v1.b, v2.b}[0], [%x[in]], %x[stride]\n"
6758       "prfm pldl1keep, [%x[in]]\n"
6759       "uaddw v8.8h, v8.8h, v0.8b\n"
6760       "uaddw v9.8h, v9.8h, v1.8b\n"
6761       "uaddw v10.8h, v10.8h, v2.8b\n"
6762       "st1 {v0.2s, v1.2s, v2.2s}, [%x[out]], #24\n"
6763 
6764       // Aggregator Reduction.
6765       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
6766       "dup v1.4s, %w[additive_sum_offset]\n"
6767       "uaddlp v8.4s, v8.8h\n"
6768       "uaddlp v9.4s, v9.8h\n"
6769       "uaddlp v10.4s, v10.8h\n"
6770       "addp v8.4s, v8.4s, v9.4s\n"
6771       "addp v10.4s, v10.4s, v10.4s\n"
6772       "addp v8.4s, v8.4s, v10.4s\n"
6773       "mul v8.4s, v8.4s, v0.s[0]\n"
6774       "add v8.4s, v8.4s, v1.4s\n"
6775       "st1 {v8.4s}, [%x[out]]\n"
6776       : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
6777         [out] "+r"(out), [in] "+r"(in)
6778       : [additive_sum_offset] "r"(params.additive_sum_offset),
6779         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
6780       : "v0", "v1", "v2", "v8", "v9", "v10", "cc", "memory");
6781 }
6782 
6783 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)6784 inline void Stream<uint8_t, 3, 8, 2, ColumnMajorWithSum>::Pack(
6785     const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
6786 #ifdef DEBUG
6787 #ifdef DEBUG_METAGEMM_VERBOSE
6788   std::cout
6789       << __FILE__ << "(" << __LINE__
6790       << ") ColumnMajorWithSum<uint8_t, 3, 8, 2, ColumnMajorWithSum>::Pack()"
6791       << std::endl
6792       << std::flush;
6793 #endif
6794 #endif
6795   int params_count_copy = params.count;
6796   int params_stride_copy = params.stride;
6797   asm volatile(
6798       "movi v8.8h, #0\n"
6799       "movi v9.8h, #0\n"
6800       "movi v10.8h, #0\n"
6801 
6802       // Reduce count by leftovers.
6803       "subs %x[count], %x[count], #2\n"
6804       "beq 2f\n"
6805 
6806       "1:"
6807       "subs %x[count], %x[count], #8\n"
6808 
6809       // Load Aggregate Store - column major 3x8
6810       "ld3 {v0.b, v1.b, v2.b}[0], [%x[in]], %x[stride]\n"
6811       "ld3 {v0.b, v1.b, v2.b}[1], [%x[in]], %x[stride]\n"
6812       "ld3 {v0.b, v1.b, v2.b}[2], [%x[in]], %x[stride]\n"
6813       "ld3 {v0.b, v1.b, v2.b}[3], [%x[in]], %x[stride]\n"
6814       "ld3 {v0.b, v1.b, v2.b}[4], [%x[in]], %x[stride]\n"
6815       "ld3 {v0.b, v1.b, v2.b}[5], [%x[in]], %x[stride]\n"
6816       "ld3 {v0.b, v1.b, v2.b}[6], [%x[in]], %x[stride]\n"
6817       "ld3 {v0.b, v1.b, v2.b}[7], [%x[in]], %x[stride]\n"
6818       "prfm pldl1keep, [%x[in]]\n"
6819       "uaddw v8.8h, v8.8h, v0.8b\n"
6820       "uaddw v9.8h, v9.8h, v1.8b\n"
6821       "uaddw v10.8h, v10.8h, v2.8b\n"
6822       "st1 {v0.2s, v1.2s, v2.2s}, [%x[out]], #24\n"
6823 
6824       "bne 1b\n"
6825 
6826       "2:"
6827 
6828       // Load Aggregate Store - column major 3x2
6829       "movi v0.8b, #0\n"
6830       "movi v1.8b, #0\n"
6831       "movi v2.8b, #0\n"
6832       "ld3 {v0.b, v1.b, v2.b}[0], [%x[in]], %x[stride]\n"
6833       "ld3 {v0.b, v1.b, v2.b}[1], [%x[in]], %x[stride]\n"
6834       "prfm pldl1keep, [%x[in]]\n"
6835       "uaddw v8.8h, v8.8h, v0.8b\n"
6836       "uaddw v9.8h, v9.8h, v1.8b\n"
6837       "uaddw v10.8h, v10.8h, v2.8b\n"
6838       "st1 {v0.2s, v1.2s, v2.2s}, [%x[out]], #24\n"
6839 
6840       // Aggregator Reduction.
6841       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
6842       "dup v1.4s, %w[additive_sum_offset]\n"
6843       "uaddlp v8.4s, v8.8h\n"
6844       "uaddlp v9.4s, v9.8h\n"
6845       "uaddlp v10.4s, v10.8h\n"
6846       "addp v8.4s, v8.4s, v9.4s\n"
6847       "addp v10.4s, v10.4s, v10.4s\n"
6848       "addp v8.4s, v8.4s, v10.4s\n"
6849       "mul v8.4s, v8.4s, v0.s[0]\n"
6850       "add v8.4s, v8.4s, v1.4s\n"
6851       "st1 {v8.4s}, [%x[out]]\n"
6852       : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
6853         [out] "+r"(out), [in] "+r"(in)
6854       : [additive_sum_offset] "r"(params.additive_sum_offset),
6855         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
6856       : "v0", "v1", "v2", "v8", "v9", "v10", "cc", "memory");
6857 }
6858 
6859 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)6860 inline void Stream<uint8_t, 3, 8, 3, ColumnMajorWithSum>::Pack(
6861     const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
6862 #ifdef DEBUG
6863 #ifdef DEBUG_METAGEMM_VERBOSE
6864   std::cout
6865       << __FILE__ << "(" << __LINE__
6866       << ") ColumnMajorWithSum<uint8_t, 3, 8, 3, ColumnMajorWithSum>::Pack()"
6867       << std::endl
6868       << std::flush;
6869 #endif
6870 #endif
6871   int params_count_copy = params.count;
6872   int params_stride_copy = params.stride;
6873   asm volatile(
6874       "movi v8.8h, #0\n"
6875       "movi v9.8h, #0\n"
6876       "movi v10.8h, #0\n"
6877 
6878       // Reduce count by leftovers.
6879       "subs %x[count], %x[count], #3\n"
6880       "beq 2f\n"
6881 
6882       "1:"
6883       "subs %x[count], %x[count], #8\n"
6884 
6885       // Load Aggregate Store - column major 3x8
6886       "ld3 {v0.b, v1.b, v2.b}[0], [%x[in]], %x[stride]\n"
6887       "ld3 {v0.b, v1.b, v2.b}[1], [%x[in]], %x[stride]\n"
6888       "ld3 {v0.b, v1.b, v2.b}[2], [%x[in]], %x[stride]\n"
6889       "ld3 {v0.b, v1.b, v2.b}[3], [%x[in]], %x[stride]\n"
6890       "ld3 {v0.b, v1.b, v2.b}[4], [%x[in]], %x[stride]\n"
6891       "ld3 {v0.b, v1.b, v2.b}[5], [%x[in]], %x[stride]\n"
6892       "ld3 {v0.b, v1.b, v2.b}[6], [%x[in]], %x[stride]\n"
6893       "ld3 {v0.b, v1.b, v2.b}[7], [%x[in]], %x[stride]\n"
6894       "prfm pldl1keep, [%x[in]]\n"
6895       "uaddw v8.8h, v8.8h, v0.8b\n"
6896       "uaddw v9.8h, v9.8h, v1.8b\n"
6897       "uaddw v10.8h, v10.8h, v2.8b\n"
6898       "st1 {v0.2s, v1.2s, v2.2s}, [%x[out]], #24\n"
6899 
6900       "bne 1b\n"
6901 
6902       "2:"
6903 
6904       // Load Aggregate Store - column major 3x3
6905       "movi v0.8b, #0\n"
6906       "movi v1.8b, #0\n"
6907       "movi v2.8b, #0\n"
6908       "ld3 {v0.b, v1.b, v2.b}[0], [%x[in]], %x[stride]\n"
6909       "ld3 {v0.b, v1.b, v2.b}[1], [%x[in]], %x[stride]\n"
6910       "ld3 {v0.b, v1.b, v2.b}[2], [%x[in]], %x[stride]\n"
6911       "prfm pldl1keep, [%x[in]]\n"
6912       "uaddw v8.8h, v8.8h, v0.8b\n"
6913       "uaddw v9.8h, v9.8h, v1.8b\n"
6914       "uaddw v10.8h, v10.8h, v2.8b\n"
6915       "st1 {v0.2s, v1.2s, v2.2s}, [%x[out]], #24\n"
6916 
6917       // Aggregator Reduction.
6918       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
6919       "dup v1.4s, %w[additive_sum_offset]\n"
6920       "uaddlp v8.4s, v8.8h\n"
6921       "uaddlp v9.4s, v9.8h\n"
6922       "uaddlp v10.4s, v10.8h\n"
6923       "addp v8.4s, v8.4s, v9.4s\n"
6924       "addp v10.4s, v10.4s, v10.4s\n"
6925       "addp v8.4s, v8.4s, v10.4s\n"
6926       "mul v8.4s, v8.4s, v0.s[0]\n"
6927       "add v8.4s, v8.4s, v1.4s\n"
6928       "st1 {v8.4s}, [%x[out]]\n"
6929       : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
6930         [out] "+r"(out), [in] "+r"(in)
6931       : [additive_sum_offset] "r"(params.additive_sum_offset),
6932         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
6933       : "v0", "v1", "v2", "v8", "v9", "v10", "cc", "memory");
6934 }
6935 
6936 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)6937 inline void Stream<uint8_t, 3, 8, 4, ColumnMajorWithSum>::Pack(
6938     const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
6939 #ifdef DEBUG
6940 #ifdef DEBUG_METAGEMM_VERBOSE
6941   std::cout
6942       << __FILE__ << "(" << __LINE__
6943       << ") ColumnMajorWithSum<uint8_t, 3, 8, 4, ColumnMajorWithSum>::Pack()"
6944       << std::endl
6945       << std::flush;
6946 #endif
6947 #endif
6948   int params_count_copy = params.count;
6949   int params_stride_copy = params.stride;
6950   asm volatile(
6951       "movi v8.8h, #0\n"
6952       "movi v9.8h, #0\n"
6953       "movi v10.8h, #0\n"
6954 
6955       // Reduce count by leftovers.
6956       "subs %x[count], %x[count], #4\n"
6957       "beq 2f\n"
6958 
6959       "1:"
6960       "subs %x[count], %x[count], #8\n"
6961 
6962       // Load Aggregate Store - column major 3x8
6963       "ld3 {v0.b, v1.b, v2.b}[0], [%x[in]], %x[stride]\n"
6964       "ld3 {v0.b, v1.b, v2.b}[1], [%x[in]], %x[stride]\n"
6965       "ld3 {v0.b, v1.b, v2.b}[2], [%x[in]], %x[stride]\n"
6966       "ld3 {v0.b, v1.b, v2.b}[3], [%x[in]], %x[stride]\n"
6967       "ld3 {v0.b, v1.b, v2.b}[4], [%x[in]], %x[stride]\n"
6968       "ld3 {v0.b, v1.b, v2.b}[5], [%x[in]], %x[stride]\n"
6969       "ld3 {v0.b, v1.b, v2.b}[6], [%x[in]], %x[stride]\n"
6970       "ld3 {v0.b, v1.b, v2.b}[7], [%x[in]], %x[stride]\n"
6971       "prfm pldl1keep, [%x[in]]\n"
6972       "uaddw v8.8h, v8.8h, v0.8b\n"
6973       "uaddw v9.8h, v9.8h, v1.8b\n"
6974       "uaddw v10.8h, v10.8h, v2.8b\n"
6975       "st1 {v0.2s, v1.2s, v2.2s}, [%x[out]], #24\n"
6976 
6977       "bne 1b\n"
6978 
6979       "2:"
6980 
6981       // Load Aggregate Store - column major 3x4
6982       "movi v0.8b, #0\n"
6983       "movi v1.8b, #0\n"
6984       "movi v2.8b, #0\n"
6985       "ld3 {v0.b, v1.b, v2.b}[0], [%x[in]], %x[stride]\n"
6986       "ld3 {v0.b, v1.b, v2.b}[1], [%x[in]], %x[stride]\n"
6987       "ld3 {v0.b, v1.b, v2.b}[2], [%x[in]], %x[stride]\n"
6988       "ld3 {v0.b, v1.b, v2.b}[3], [%x[in]], %x[stride]\n"
6989       "prfm pldl1keep, [%x[in]]\n"
6990       "uaddw v8.8h, v8.8h, v0.8b\n"
6991       "uaddw v9.8h, v9.8h, v1.8b\n"
6992       "uaddw v10.8h, v10.8h, v2.8b\n"
6993       "st1 {v0.2s, v1.2s, v2.2s}, [%x[out]], #24\n"
6994 
6995       // Aggregator Reduction.
6996       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
6997       "dup v1.4s, %w[additive_sum_offset]\n"
6998       "uaddlp v8.4s, v8.8h\n"
6999       "uaddlp v9.4s, v9.8h\n"
7000       "uaddlp v10.4s, v10.8h\n"
7001       "addp v8.4s, v8.4s, v9.4s\n"
7002       "addp v10.4s, v10.4s, v10.4s\n"
7003       "addp v8.4s, v8.4s, v10.4s\n"
7004       "mul v8.4s, v8.4s, v0.s[0]\n"
7005       "add v8.4s, v8.4s, v1.4s\n"
7006       "st1 {v8.4s}, [%x[out]]\n"
7007       : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
7008         [out] "+r"(out), [in] "+r"(in)
7009       : [additive_sum_offset] "r"(params.additive_sum_offset),
7010         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
7011       : "v0", "v1", "v2", "v8", "v9", "v10", "cc", "memory");
7012 }
7013 
7014 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)7015 inline void Stream<uint8_t, 3, 8, 5, ColumnMajorWithSum>::Pack(
7016     const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
7017 #ifdef DEBUG
7018 #ifdef DEBUG_METAGEMM_VERBOSE
7019   std::cout
7020       << __FILE__ << "(" << __LINE__
7021       << ") ColumnMajorWithSum<uint8_t, 3, 8, 5, ColumnMajorWithSum>::Pack()"
7022       << std::endl
7023       << std::flush;
7024 #endif
7025 #endif
7026   int params_count_copy = params.count;
7027   int params_stride_copy = params.stride;
7028   asm volatile(
7029       "movi v8.8h, #0\n"
7030       "movi v9.8h, #0\n"
7031       "movi v10.8h, #0\n"
7032 
7033       // Reduce count by leftovers.
7034       "subs %x[count], %x[count], #5\n"
7035       "beq 2f\n"
7036 
7037       "1:"
7038       "subs %x[count], %x[count], #8\n"
7039 
7040       // Load Aggregate Store - column major 3x8
7041       "ld3 {v0.b, v1.b, v2.b}[0], [%x[in]], %x[stride]\n"
7042       "ld3 {v0.b, v1.b, v2.b}[1], [%x[in]], %x[stride]\n"
7043       "ld3 {v0.b, v1.b, v2.b}[2], [%x[in]], %x[stride]\n"
7044       "ld3 {v0.b, v1.b, v2.b}[3], [%x[in]], %x[stride]\n"
7045       "ld3 {v0.b, v1.b, v2.b}[4], [%x[in]], %x[stride]\n"
7046       "ld3 {v0.b, v1.b, v2.b}[5], [%x[in]], %x[stride]\n"
7047       "ld3 {v0.b, v1.b, v2.b}[6], [%x[in]], %x[stride]\n"
7048       "ld3 {v0.b, v1.b, v2.b}[7], [%x[in]], %x[stride]\n"
7049       "prfm pldl1keep, [%x[in]]\n"
7050       "uaddw v8.8h, v8.8h, v0.8b\n"
7051       "uaddw v9.8h, v9.8h, v1.8b\n"
7052       "uaddw v10.8h, v10.8h, v2.8b\n"
7053       "st1 {v0.2s, v1.2s, v2.2s}, [%x[out]], #24\n"
7054 
7055       "bne 1b\n"
7056 
7057       "2:"
7058 
7059       // Load Aggregate Store - column major 3x5
7060       "movi v0.8b, #0\n"
7061       "movi v1.8b, #0\n"
7062       "movi v2.8b, #0\n"
7063       "ld3 {v0.b, v1.b, v2.b}[0], [%x[in]], %x[stride]\n"
7064       "ld3 {v0.b, v1.b, v2.b}[1], [%x[in]], %x[stride]\n"
7065       "ld3 {v0.b, v1.b, v2.b}[2], [%x[in]], %x[stride]\n"
7066       "ld3 {v0.b, v1.b, v2.b}[3], [%x[in]], %x[stride]\n"
7067       "ld3 {v0.b, v1.b, v2.b}[4], [%x[in]], %x[stride]\n"
7068       "prfm pldl1keep, [%x[in]]\n"
7069       "uaddw v8.8h, v8.8h, v0.8b\n"
7070       "uaddw v9.8h, v9.8h, v1.8b\n"
7071       "uaddw v10.8h, v10.8h, v2.8b\n"
7072       "st1 {v0.2s, v1.2s, v2.2s}, [%x[out]], #24\n"
7073 
7074       // Aggregator Reduction.
7075       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
7076       "dup v1.4s, %w[additive_sum_offset]\n"
7077       "uaddlp v8.4s, v8.8h\n"
7078       "uaddlp v9.4s, v9.8h\n"
7079       "uaddlp v10.4s, v10.8h\n"
7080       "addp v8.4s, v8.4s, v9.4s\n"
7081       "addp v10.4s, v10.4s, v10.4s\n"
7082       "addp v8.4s, v8.4s, v10.4s\n"
7083       "mul v8.4s, v8.4s, v0.s[0]\n"
7084       "add v8.4s, v8.4s, v1.4s\n"
7085       "st1 {v8.4s}, [%x[out]]\n"
7086       : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
7087         [out] "+r"(out), [in] "+r"(in)
7088       : [additive_sum_offset] "r"(params.additive_sum_offset),
7089         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
7090       : "v0", "v1", "v2", "v8", "v9", "v10", "cc", "memory");
7091 }
7092 
7093 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)7094 inline void Stream<uint8_t, 3, 8, 6, ColumnMajorWithSum>::Pack(
7095     const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
7096 #ifdef DEBUG
7097 #ifdef DEBUG_METAGEMM_VERBOSE
7098   std::cout
7099       << __FILE__ << "(" << __LINE__
7100       << ") ColumnMajorWithSum<uint8_t, 3, 8, 6, ColumnMajorWithSum>::Pack()"
7101       << std::endl
7102       << std::flush;
7103 #endif
7104 #endif
7105   int params_count_copy = params.count;
7106   int params_stride_copy = params.stride;
7107   asm volatile(
7108       "movi v8.8h, #0\n"
7109       "movi v9.8h, #0\n"
7110       "movi v10.8h, #0\n"
7111 
7112       // Reduce count by leftovers.
7113       "subs %x[count], %x[count], #6\n"
7114       "beq 2f\n"
7115 
7116       "1:"
7117       "subs %x[count], %x[count], #8\n"
7118 
7119       // Load Aggregate Store - column major 3x8
7120       "ld3 {v0.b, v1.b, v2.b}[0], [%x[in]], %x[stride]\n"
7121       "ld3 {v0.b, v1.b, v2.b}[1], [%x[in]], %x[stride]\n"
7122       "ld3 {v0.b, v1.b, v2.b}[2], [%x[in]], %x[stride]\n"
7123       "ld3 {v0.b, v1.b, v2.b}[3], [%x[in]], %x[stride]\n"
7124       "ld3 {v0.b, v1.b, v2.b}[4], [%x[in]], %x[stride]\n"
7125       "ld3 {v0.b, v1.b, v2.b}[5], [%x[in]], %x[stride]\n"
7126       "ld3 {v0.b, v1.b, v2.b}[6], [%x[in]], %x[stride]\n"
7127       "ld3 {v0.b, v1.b, v2.b}[7], [%x[in]], %x[stride]\n"
7128       "prfm pldl1keep, [%x[in]]\n"
7129       "uaddw v8.8h, v8.8h, v0.8b\n"
7130       "uaddw v9.8h, v9.8h, v1.8b\n"
7131       "uaddw v10.8h, v10.8h, v2.8b\n"
7132       "st1 {v0.2s, v1.2s, v2.2s}, [%x[out]], #24\n"
7133 
7134       "bne 1b\n"
7135 
7136       "2:"
7137 
7138       // Load Aggregate Store - column major 3x6
7139       "movi v0.8b, #0\n"
7140       "movi v1.8b, #0\n"
7141       "movi v2.8b, #0\n"
7142       "ld3 {v0.b, v1.b, v2.b}[0], [%x[in]], %x[stride]\n"
7143       "ld3 {v0.b, v1.b, v2.b}[1], [%x[in]], %x[stride]\n"
7144       "ld3 {v0.b, v1.b, v2.b}[2], [%x[in]], %x[stride]\n"
7145       "ld3 {v0.b, v1.b, v2.b}[3], [%x[in]], %x[stride]\n"
7146       "ld3 {v0.b, v1.b, v2.b}[4], [%x[in]], %x[stride]\n"
7147       "ld3 {v0.b, v1.b, v2.b}[5], [%x[in]], %x[stride]\n"
7148       "prfm pldl1keep, [%x[in]]\n"
7149       "uaddw v8.8h, v8.8h, v0.8b\n"
7150       "uaddw v9.8h, v9.8h, v1.8b\n"
7151       "uaddw v10.8h, v10.8h, v2.8b\n"
7152       "st1 {v0.2s, v1.2s, v2.2s}, [%x[out]], #24\n"
7153 
7154       // Aggregator Reduction.
7155       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
7156       "dup v1.4s, %w[additive_sum_offset]\n"
7157       "uaddlp v8.4s, v8.8h\n"
7158       "uaddlp v9.4s, v9.8h\n"
7159       "uaddlp v10.4s, v10.8h\n"
7160       "addp v8.4s, v8.4s, v9.4s\n"
7161       "addp v10.4s, v10.4s, v10.4s\n"
7162       "addp v8.4s, v8.4s, v10.4s\n"
7163       "mul v8.4s, v8.4s, v0.s[0]\n"
7164       "add v8.4s, v8.4s, v1.4s\n"
7165       "st1 {v8.4s}, [%x[out]]\n"
7166       : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
7167         [out] "+r"(out), [in] "+r"(in)
7168       : [additive_sum_offset] "r"(params.additive_sum_offset),
7169         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
7170       : "v0", "v1", "v2", "v8", "v9", "v10", "cc", "memory");
7171 }
7172 
7173 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)7174 inline void Stream<uint8_t, 3, 8, 7, ColumnMajorWithSum>::Pack(
7175     const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
7176 #ifdef DEBUG
7177 #ifdef DEBUG_METAGEMM_VERBOSE
7178   std::cout
7179       << __FILE__ << "(" << __LINE__
7180       << ") ColumnMajorWithSum<uint8_t, 3, 8, 7, ColumnMajorWithSum>::Pack()"
7181       << std::endl
7182       << std::flush;
7183 #endif
7184 #endif
7185   int params_count_copy = params.count;
7186   int params_stride_copy = params.stride;
7187   asm volatile(
7188       "movi v8.8h, #0\n"
7189       "movi v9.8h, #0\n"
7190       "movi v10.8h, #0\n"
7191 
7192       // Reduce count by leftovers.
7193       "subs %x[count], %x[count], #7\n"
7194       "beq 2f\n"
7195 
7196       "1:"
7197       "subs %x[count], %x[count], #8\n"
7198 
7199       // Load Aggregate Store - column major 3x8
7200       "ld3 {v0.b, v1.b, v2.b}[0], [%x[in]], %x[stride]\n"
7201       "ld3 {v0.b, v1.b, v2.b}[1], [%x[in]], %x[stride]\n"
7202       "ld3 {v0.b, v1.b, v2.b}[2], [%x[in]], %x[stride]\n"
7203       "ld3 {v0.b, v1.b, v2.b}[3], [%x[in]], %x[stride]\n"
7204       "ld3 {v0.b, v1.b, v2.b}[4], [%x[in]], %x[stride]\n"
7205       "ld3 {v0.b, v1.b, v2.b}[5], [%x[in]], %x[stride]\n"
7206       "ld3 {v0.b, v1.b, v2.b}[6], [%x[in]], %x[stride]\n"
7207       "ld3 {v0.b, v1.b, v2.b}[7], [%x[in]], %x[stride]\n"
7208       "prfm pldl1keep, [%x[in]]\n"
7209       "uaddw v8.8h, v8.8h, v0.8b\n"
7210       "uaddw v9.8h, v9.8h, v1.8b\n"
7211       "uaddw v10.8h, v10.8h, v2.8b\n"
7212       "st1 {v0.2s, v1.2s, v2.2s}, [%x[out]], #24\n"
7213 
7214       "bne 1b\n"
7215 
7216       "2:"
7217 
7218       // Load Aggregate Store - column major 3x7
7219       "movi v0.8b, #0\n"
7220       "movi v1.8b, #0\n"
7221       "movi v2.8b, #0\n"
7222       "ld3 {v0.b, v1.b, v2.b}[0], [%x[in]], %x[stride]\n"
7223       "ld3 {v0.b, v1.b, v2.b}[1], [%x[in]], %x[stride]\n"
7224       "ld3 {v0.b, v1.b, v2.b}[2], [%x[in]], %x[stride]\n"
7225       "ld3 {v0.b, v1.b, v2.b}[3], [%x[in]], %x[stride]\n"
7226       "ld3 {v0.b, v1.b, v2.b}[4], [%x[in]], %x[stride]\n"
7227       "ld3 {v0.b, v1.b, v2.b}[5], [%x[in]], %x[stride]\n"
7228       "ld3 {v0.b, v1.b, v2.b}[6], [%x[in]], %x[stride]\n"
7229       "prfm pldl1keep, [%x[in]]\n"
7230       "uaddw v8.8h, v8.8h, v0.8b\n"
7231       "uaddw v9.8h, v9.8h, v1.8b\n"
7232       "uaddw v10.8h, v10.8h, v2.8b\n"
7233       "st1 {v0.2s, v1.2s, v2.2s}, [%x[out]], #24\n"
7234 
7235       // Aggregator Reduction.
7236       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
7237       "dup v1.4s, %w[additive_sum_offset]\n"
7238       "uaddlp v8.4s, v8.8h\n"
7239       "uaddlp v9.4s, v9.8h\n"
7240       "uaddlp v10.4s, v10.8h\n"
7241       "addp v8.4s, v8.4s, v9.4s\n"
7242       "addp v10.4s, v10.4s, v10.4s\n"
7243       "addp v8.4s, v8.4s, v10.4s\n"
7244       "mul v8.4s, v8.4s, v0.s[0]\n"
7245       "add v8.4s, v8.4s, v1.4s\n"
7246       "st1 {v8.4s}, [%x[out]]\n"
7247       : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
7248         [out] "+r"(out), [in] "+r"(in)
7249       : [additive_sum_offset] "r"(params.additive_sum_offset),
7250         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
7251       : "v0", "v1", "v2", "v8", "v9", "v10", "cc", "memory");
7252 }
7253 
7254 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)7255 inline void Stream<uint8_t, 4, 8, 0, ColumnMajorWithSum>::Pack(
7256     const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
7257 #ifdef DEBUG
7258 #ifdef DEBUG_METAGEMM_VERBOSE
7259   std::cout
7260       << __FILE__ << "(" << __LINE__
7261       << ") ColumnMajorWithSum<uint8_t, 4, 8, 0, ColumnMajorWithSum>::Pack()"
7262       << std::endl
7263       << std::flush;
7264 #endif
7265 #endif
7266   int params_count_copy = params.count;
7267   int params_stride_copy = params.stride;
7268   asm volatile(
7269       "movi v8.8h, #0\n"
7270       "movi v9.8h, #0\n"
7271       "movi v10.8h, #0\n"
7272       "movi v11.8h, #0\n"
7273 
7274       "1:"
7275       "subs %x[count], %x[count], #8\n"
7276 
7277       // Load Aggregate Store - column major 4x8
7278       "ld1 {v0.s}[0], [%x[in]], %x[stride]\n"
7279       "ld1 {v1.s}[0], [%x[in]], %x[stride]\n"
7280       "ld1 {v2.s}[0], [%x[in]], %x[stride]\n"
7281       "ld1 {v3.s}[0], [%x[in]], %x[stride]\n"
7282       "ld1 {v0.s}[1], [%x[in]], %x[stride]\n"
7283       "ld1 {v1.s}[1], [%x[in]], %x[stride]\n"
7284       "ld1 {v2.s}[1], [%x[in]], %x[stride]\n"
7285       "ld1 {v3.s}[1], [%x[in]], %x[stride]\n"
7286       "prfm pldl1keep, [%x[in]]\n"
7287       "trn1 v4.4h, v0.4h, v2.4h\n"
7288       "trn2 v6.4h, v0.4h, v2.4h\n"
7289       "trn1 v5.4h, v1.4h, v3.4h\n"
7290       "trn2 v7.4h, v1.4h, v3.4h\n"
7291       "trn1 v0.8b, v4.8b, v5.8b\n"
7292       "trn2 v1.8b, v4.8b, v5.8b\n"
7293       "trn1 v2.8b, v6.8b, v7.8b\n"
7294       "trn2 v3.8b, v6.8b, v7.8b\n"
7295       "uaddw v8.8h, v8.8h, v0.8b\n"
7296       "uaddw v9.8h, v9.8h, v1.8b\n"
7297       "uaddw v10.8h, v10.8h, v2.8b\n"
7298       "uaddw v11.8h, v11.8h, v3.8b\n"
7299       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
7300 
7301       "bne 1b\n"
7302 
7303       // Aggregator Reduction.
7304       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
7305       "dup v1.4s, %w[additive_sum_offset]\n"
7306       "uaddlp v8.4s, v8.8h\n"
7307       "uaddlp v9.4s, v9.8h\n"
7308       "uaddlp v10.4s, v10.8h\n"
7309       "uaddlp v11.4s, v11.8h\n"
7310       "addp v8.4s, v8.4s, v9.4s\n"
7311       "addp v10.4s, v10.4s, v11.4s\n"
7312       "addp v8.4s, v8.4s, v10.4s\n"
7313       "mul v8.4s, v8.4s, v0.s[0]\n"
7314       "add v8.4s, v8.4s, v1.4s\n"
7315       "st1 {v8.4s}, [%x[out]]\n"
7316       : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
7317         [out] "+r"(out), [in] "+r"(in)
7318       : [additive_sum_offset] "r"(params.additive_sum_offset),
7319         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
7320       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
7321         "v11", "cc", "memory");
7322 }
7323 
7324 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)7325 inline void Stream<uint8_t, 4, 8, 1, ColumnMajorWithSum>::Pack(
7326     const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
7327 #ifdef DEBUG
7328 #ifdef DEBUG_METAGEMM_VERBOSE
7329   std::cout
7330       << __FILE__ << "(" << __LINE__
7331       << ") ColumnMajorWithSum<uint8_t, 4, 8, 1, ColumnMajorWithSum>::Pack()"
7332       << std::endl
7333       << std::flush;
7334 #endif
7335 #endif
7336   int params_count_copy = params.count;
7337   int params_stride_copy = params.stride;
7338   asm volatile(
7339       "movi v8.8h, #0\n"
7340       "movi v9.8h, #0\n"
7341       "movi v10.8h, #0\n"
7342       "movi v11.8h, #0\n"
7343 
7344       // Reduce count by leftovers.
7345       "subs %x[count], %x[count], #1\n"
7346       "beq 2f\n"
7347 
7348       "1:"
7349       "subs %x[count], %x[count], #8\n"
7350 
7351       // Load Aggregate Store - column major 4x8
7352       "ld1 {v0.s}[0], [%x[in]], %x[stride]\n"
7353       "ld1 {v1.s}[0], [%x[in]], %x[stride]\n"
7354       "ld1 {v2.s}[0], [%x[in]], %x[stride]\n"
7355       "ld1 {v3.s}[0], [%x[in]], %x[stride]\n"
7356       "ld1 {v0.s}[1], [%x[in]], %x[stride]\n"
7357       "ld1 {v1.s}[1], [%x[in]], %x[stride]\n"
7358       "ld1 {v2.s}[1], [%x[in]], %x[stride]\n"
7359       "ld1 {v3.s}[1], [%x[in]], %x[stride]\n"
7360       "prfm pldl1keep, [%x[in]]\n"
7361       "trn1 v4.4h, v0.4h, v2.4h\n"
7362       "trn2 v6.4h, v0.4h, v2.4h\n"
7363       "trn1 v5.4h, v1.4h, v3.4h\n"
7364       "trn2 v7.4h, v1.4h, v3.4h\n"
7365       "trn1 v0.8b, v4.8b, v5.8b\n"
7366       "trn2 v1.8b, v4.8b, v5.8b\n"
7367       "trn1 v2.8b, v6.8b, v7.8b\n"
7368       "trn2 v3.8b, v6.8b, v7.8b\n"
7369       "uaddw v8.8h, v8.8h, v0.8b\n"
7370       "uaddw v9.8h, v9.8h, v1.8b\n"
7371       "uaddw v10.8h, v10.8h, v2.8b\n"
7372       "uaddw v11.8h, v11.8h, v3.8b\n"
7373       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
7374 
7375       "bne 1b\n"
7376 
7377       "2:"
7378 
7379       // Load Aggregate Store - column major 4x1
7380       "movi v0.8b, #0\n"
7381       "movi v1.8b, #0\n"
7382       "movi v2.8b, #0\n"
7383       "movi v3.8b, #0\n"
7384       "ld1 {v0.s}[0], [%x[in]], %x[stride]\n"
7385       "prfm pldl1keep, [%x[in]]\n"
7386       "trn1 v4.4h, v0.4h, v2.4h\n"
7387       "trn2 v6.4h, v0.4h, v2.4h\n"
7388       "trn1 v5.4h, v1.4h, v3.4h\n"
7389       "trn2 v7.4h, v1.4h, v3.4h\n"
7390       "trn1 v0.8b, v4.8b, v5.8b\n"
7391       "trn2 v1.8b, v4.8b, v5.8b\n"
7392       "trn1 v2.8b, v6.8b, v7.8b\n"
7393       "trn2 v3.8b, v6.8b, v7.8b\n"
7394       "uaddw v8.8h, v8.8h, v0.8b\n"
7395       "uaddw v9.8h, v9.8h, v1.8b\n"
7396       "uaddw v10.8h, v10.8h, v2.8b\n"
7397       "uaddw v11.8h, v11.8h, v3.8b\n"
7398       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
7399 
7400       // Aggregator Reduction.
7401       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
7402       "dup v1.4s, %w[additive_sum_offset]\n"
7403       "uaddlp v8.4s, v8.8h\n"
7404       "uaddlp v9.4s, v9.8h\n"
7405       "uaddlp v10.4s, v10.8h\n"
7406       "uaddlp v11.4s, v11.8h\n"
7407       "addp v8.4s, v8.4s, v9.4s\n"
7408       "addp v10.4s, v10.4s, v11.4s\n"
7409       "addp v8.4s, v8.4s, v10.4s\n"
7410       "mul v8.4s, v8.4s, v0.s[0]\n"
7411       "add v8.4s, v8.4s, v1.4s\n"
7412       "st1 {v8.4s}, [%x[out]]\n"
7413       : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
7414         [out] "+r"(out), [in] "+r"(in)
7415       : [additive_sum_offset] "r"(params.additive_sum_offset),
7416         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
7417       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
7418         "v11", "cc", "memory");
7419 }
7420 
7421 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)7422 inline void Stream<uint8_t, 4, 8, 2, ColumnMajorWithSum>::Pack(
7423     const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
7424 #ifdef DEBUG
7425 #ifdef DEBUG_METAGEMM_VERBOSE
7426   std::cout
7427       << __FILE__ << "(" << __LINE__
7428       << ") ColumnMajorWithSum<uint8_t, 4, 8, 2, ColumnMajorWithSum>::Pack()"
7429       << std::endl
7430       << std::flush;
7431 #endif
7432 #endif
7433   int params_count_copy = params.count;
7434   int params_stride_copy = params.stride;
7435   asm volatile(
7436       "movi v8.8h, #0\n"
7437       "movi v9.8h, #0\n"
7438       "movi v10.8h, #0\n"
7439       "movi v11.8h, #0\n"
7440 
7441       // Reduce count by leftovers.
7442       "subs %x[count], %x[count], #2\n"
7443       "beq 2f\n"
7444 
7445       "1:"
7446       "subs %x[count], %x[count], #8\n"
7447 
7448       // Load Aggregate Store - column major 4x8
7449       "ld1 {v0.s}[0], [%x[in]], %x[stride]\n"
7450       "ld1 {v1.s}[0], [%x[in]], %x[stride]\n"
7451       "ld1 {v2.s}[0], [%x[in]], %x[stride]\n"
7452       "ld1 {v3.s}[0], [%x[in]], %x[stride]\n"
7453       "ld1 {v0.s}[1], [%x[in]], %x[stride]\n"
7454       "ld1 {v1.s}[1], [%x[in]], %x[stride]\n"
7455       "ld1 {v2.s}[1], [%x[in]], %x[stride]\n"
7456       "ld1 {v3.s}[1], [%x[in]], %x[stride]\n"
7457       "prfm pldl1keep, [%x[in]]\n"
7458       "trn1 v4.4h, v0.4h, v2.4h\n"
7459       "trn2 v6.4h, v0.4h, v2.4h\n"
7460       "trn1 v5.4h, v1.4h, v3.4h\n"
7461       "trn2 v7.4h, v1.4h, v3.4h\n"
7462       "trn1 v0.8b, v4.8b, v5.8b\n"
7463       "trn2 v1.8b, v4.8b, v5.8b\n"
7464       "trn1 v2.8b, v6.8b, v7.8b\n"
7465       "trn2 v3.8b, v6.8b, v7.8b\n"
7466       "uaddw v8.8h, v8.8h, v0.8b\n"
7467       "uaddw v9.8h, v9.8h, v1.8b\n"
7468       "uaddw v10.8h, v10.8h, v2.8b\n"
7469       "uaddw v11.8h, v11.8h, v3.8b\n"
7470       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
7471 
7472       "bne 1b\n"
7473 
7474       "2:"
7475 
7476       // Load Aggregate Store - column major 4x2
7477       "movi v0.8b, #0\n"
7478       "movi v1.8b, #0\n"
7479       "movi v2.8b, #0\n"
7480       "movi v3.8b, #0\n"
7481       "ld1 {v0.s}[0], [%x[in]], %x[stride]\n"
7482       "ld1 {v1.s}[0], [%x[in]], %x[stride]\n"
7483       "prfm pldl1keep, [%x[in]]\n"
7484       "trn1 v4.4h, v0.4h, v2.4h\n"
7485       "trn2 v6.4h, v0.4h, v2.4h\n"
7486       "trn1 v5.4h, v1.4h, v3.4h\n"
7487       "trn2 v7.4h, v1.4h, v3.4h\n"
7488       "trn1 v0.8b, v4.8b, v5.8b\n"
7489       "trn2 v1.8b, v4.8b, v5.8b\n"
7490       "trn1 v2.8b, v6.8b, v7.8b\n"
7491       "trn2 v3.8b, v6.8b, v7.8b\n"
7492       "uaddw v8.8h, v8.8h, v0.8b\n"
7493       "uaddw v9.8h, v9.8h, v1.8b\n"
7494       "uaddw v10.8h, v10.8h, v2.8b\n"
7495       "uaddw v11.8h, v11.8h, v3.8b\n"
7496       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
7497 
7498       // Aggregator Reduction.
7499       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
7500       "dup v1.4s, %w[additive_sum_offset]\n"
7501       "uaddlp v8.4s, v8.8h\n"
7502       "uaddlp v9.4s, v9.8h\n"
7503       "uaddlp v10.4s, v10.8h\n"
7504       "uaddlp v11.4s, v11.8h\n"
7505       "addp v8.4s, v8.4s, v9.4s\n"
7506       "addp v10.4s, v10.4s, v11.4s\n"
7507       "addp v8.4s, v8.4s, v10.4s\n"
7508       "mul v8.4s, v8.4s, v0.s[0]\n"
7509       "add v8.4s, v8.4s, v1.4s\n"
7510       "st1 {v8.4s}, [%x[out]]\n"
7511       : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
7512         [out] "+r"(out), [in] "+r"(in)
7513       : [additive_sum_offset] "r"(params.additive_sum_offset),
7514         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
7515       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
7516         "v11", "cc", "memory");
7517 }
7518 
7519 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)7520 inline void Stream<uint8_t, 4, 8, 3, ColumnMajorWithSum>::Pack(
7521     const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
7522 #ifdef DEBUG
7523 #ifdef DEBUG_METAGEMM_VERBOSE
7524   std::cout
7525       << __FILE__ << "(" << __LINE__
7526       << ") ColumnMajorWithSum<uint8_t, 4, 8, 3, ColumnMajorWithSum>::Pack()"
7527       << std::endl
7528       << std::flush;
7529 #endif
7530 #endif
7531   int params_count_copy = params.count;
7532   int params_stride_copy = params.stride;
7533   asm volatile(
7534       "movi v8.8h, #0\n"
7535       "movi v9.8h, #0\n"
7536       "movi v10.8h, #0\n"
7537       "movi v11.8h, #0\n"
7538 
7539       // Reduce count by leftovers.
7540       "subs %x[count], %x[count], #3\n"
7541       "beq 2f\n"
7542 
7543       "1:"
7544       "subs %x[count], %x[count], #8\n"
7545 
7546       // Load Aggregate Store - column major 4x8
7547       "ld1 {v0.s}[0], [%x[in]], %x[stride]\n"
7548       "ld1 {v1.s}[0], [%x[in]], %x[stride]\n"
7549       "ld1 {v2.s}[0], [%x[in]], %x[stride]\n"
7550       "ld1 {v3.s}[0], [%x[in]], %x[stride]\n"
7551       "ld1 {v0.s}[1], [%x[in]], %x[stride]\n"
7552       "ld1 {v1.s}[1], [%x[in]], %x[stride]\n"
7553       "ld1 {v2.s}[1], [%x[in]], %x[stride]\n"
7554       "ld1 {v3.s}[1], [%x[in]], %x[stride]\n"
7555       "prfm pldl1keep, [%x[in]]\n"
7556       "trn1 v4.4h, v0.4h, v2.4h\n"
7557       "trn2 v6.4h, v0.4h, v2.4h\n"
7558       "trn1 v5.4h, v1.4h, v3.4h\n"
7559       "trn2 v7.4h, v1.4h, v3.4h\n"
7560       "trn1 v0.8b, v4.8b, v5.8b\n"
7561       "trn2 v1.8b, v4.8b, v5.8b\n"
7562       "trn1 v2.8b, v6.8b, v7.8b\n"
7563       "trn2 v3.8b, v6.8b, v7.8b\n"
7564       "uaddw v8.8h, v8.8h, v0.8b\n"
7565       "uaddw v9.8h, v9.8h, v1.8b\n"
7566       "uaddw v10.8h, v10.8h, v2.8b\n"
7567       "uaddw v11.8h, v11.8h, v3.8b\n"
7568       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
7569 
7570       "bne 1b\n"
7571 
7572       "2:"
7573 
7574       // Load Aggregate Store - column major 4x3
7575       "movi v0.8b, #0\n"
7576       "movi v1.8b, #0\n"
7577       "movi v2.8b, #0\n"
7578       "movi v3.8b, #0\n"
7579       "ld1 {v0.s}[0], [%x[in]], %x[stride]\n"
7580       "ld1 {v1.s}[0], [%x[in]], %x[stride]\n"
7581       "ld1 {v2.s}[0], [%x[in]], %x[stride]\n"
7582       "prfm pldl1keep, [%x[in]]\n"
7583       "trn1 v4.4h, v0.4h, v2.4h\n"
7584       "trn2 v6.4h, v0.4h, v2.4h\n"
7585       "trn1 v5.4h, v1.4h, v3.4h\n"
7586       "trn2 v7.4h, v1.4h, v3.4h\n"
7587       "trn1 v0.8b, v4.8b, v5.8b\n"
7588       "trn2 v1.8b, v4.8b, v5.8b\n"
7589       "trn1 v2.8b, v6.8b, v7.8b\n"
7590       "trn2 v3.8b, v6.8b, v7.8b\n"
7591       "uaddw v8.8h, v8.8h, v0.8b\n"
7592       "uaddw v9.8h, v9.8h, v1.8b\n"
7593       "uaddw v10.8h, v10.8h, v2.8b\n"
7594       "uaddw v11.8h, v11.8h, v3.8b\n"
7595       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
7596 
7597       // Aggregator Reduction.
7598       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
7599       "dup v1.4s, %w[additive_sum_offset]\n"
7600       "uaddlp v8.4s, v8.8h\n"
7601       "uaddlp v9.4s, v9.8h\n"
7602       "uaddlp v10.4s, v10.8h\n"
7603       "uaddlp v11.4s, v11.8h\n"
7604       "addp v8.4s, v8.4s, v9.4s\n"
7605       "addp v10.4s, v10.4s, v11.4s\n"
7606       "addp v8.4s, v8.4s, v10.4s\n"
7607       "mul v8.4s, v8.4s, v0.s[0]\n"
7608       "add v8.4s, v8.4s, v1.4s\n"
7609       "st1 {v8.4s}, [%x[out]]\n"
7610       : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
7611         [out] "+r"(out), [in] "+r"(in)
7612       : [additive_sum_offset] "r"(params.additive_sum_offset),
7613         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
7614       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
7615         "v11", "cc", "memory");
7616 }
7617 
7618 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)7619 inline void Stream<uint8_t, 4, 8, 4, ColumnMajorWithSum>::Pack(
7620     const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
7621 #ifdef DEBUG
7622 #ifdef DEBUG_METAGEMM_VERBOSE
7623   std::cout
7624       << __FILE__ << "(" << __LINE__
7625       << ") ColumnMajorWithSum<uint8_t, 4, 8, 4, ColumnMajorWithSum>::Pack()"
7626       << std::endl
7627       << std::flush;
7628 #endif
7629 #endif
7630   int params_count_copy = params.count;
7631   int params_stride_copy = params.stride;
7632   asm volatile(
7633       "movi v8.8h, #0\n"
7634       "movi v9.8h, #0\n"
7635       "movi v10.8h, #0\n"
7636       "movi v11.8h, #0\n"
7637 
7638       // Reduce count by leftovers.
7639       "subs %x[count], %x[count], #4\n"
7640       "beq 2f\n"
7641 
7642       "1:"
7643       "subs %x[count], %x[count], #8\n"
7644 
7645       // Load Aggregate Store - column major 4x8
7646       "ld1 {v0.s}[0], [%x[in]], %x[stride]\n"
7647       "ld1 {v1.s}[0], [%x[in]], %x[stride]\n"
7648       "ld1 {v2.s}[0], [%x[in]], %x[stride]\n"
7649       "ld1 {v3.s}[0], [%x[in]], %x[stride]\n"
7650       "ld1 {v0.s}[1], [%x[in]], %x[stride]\n"
7651       "ld1 {v1.s}[1], [%x[in]], %x[stride]\n"
7652       "ld1 {v2.s}[1], [%x[in]], %x[stride]\n"
7653       "ld1 {v3.s}[1], [%x[in]], %x[stride]\n"
7654       "prfm pldl1keep, [%x[in]]\n"
7655       "trn1 v4.4h, v0.4h, v2.4h\n"
7656       "trn2 v6.4h, v0.4h, v2.4h\n"
7657       "trn1 v5.4h, v1.4h, v3.4h\n"
7658       "trn2 v7.4h, v1.4h, v3.4h\n"
7659       "trn1 v0.8b, v4.8b, v5.8b\n"
7660       "trn2 v1.8b, v4.8b, v5.8b\n"
7661       "trn1 v2.8b, v6.8b, v7.8b\n"
7662       "trn2 v3.8b, v6.8b, v7.8b\n"
7663       "uaddw v8.8h, v8.8h, v0.8b\n"
7664       "uaddw v9.8h, v9.8h, v1.8b\n"
7665       "uaddw v10.8h, v10.8h, v2.8b\n"
7666       "uaddw v11.8h, v11.8h, v3.8b\n"
7667       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
7668 
7669       "bne 1b\n"
7670 
7671       "2:"
7672 
7673       // Load Aggregate Store - column major 4x4
7674       "movi v0.8b, #0\n"
7675       "movi v1.8b, #0\n"
7676       "movi v2.8b, #0\n"
7677       "movi v3.8b, #0\n"
7678       "ld1 {v0.s}[0], [%x[in]], %x[stride]\n"
7679       "ld1 {v1.s}[0], [%x[in]], %x[stride]\n"
7680       "ld1 {v2.s}[0], [%x[in]], %x[stride]\n"
7681       "ld1 {v3.s}[0], [%x[in]], %x[stride]\n"
7682       "prfm pldl1keep, [%x[in]]\n"
7683       "trn1 v4.4h, v0.4h, v2.4h\n"
7684       "trn2 v6.4h, v0.4h, v2.4h\n"
7685       "trn1 v5.4h, v1.4h, v3.4h\n"
7686       "trn2 v7.4h, v1.4h, v3.4h\n"
7687       "trn1 v0.8b, v4.8b, v5.8b\n"
7688       "trn2 v1.8b, v4.8b, v5.8b\n"
7689       "trn1 v2.8b, v6.8b, v7.8b\n"
7690       "trn2 v3.8b, v6.8b, v7.8b\n"
7691       "uaddw v8.8h, v8.8h, v0.8b\n"
7692       "uaddw v9.8h, v9.8h, v1.8b\n"
7693       "uaddw v10.8h, v10.8h, v2.8b\n"
7694       "uaddw v11.8h, v11.8h, v3.8b\n"
7695       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
7696 
7697       // Aggregator Reduction.
7698       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
7699       "dup v1.4s, %w[additive_sum_offset]\n"
7700       "uaddlp v8.4s, v8.8h\n"
7701       "uaddlp v9.4s, v9.8h\n"
7702       "uaddlp v10.4s, v10.8h\n"
7703       "uaddlp v11.4s, v11.8h\n"
7704       "addp v8.4s, v8.4s, v9.4s\n"
7705       "addp v10.4s, v10.4s, v11.4s\n"
7706       "addp v8.4s, v8.4s, v10.4s\n"
7707       "mul v8.4s, v8.4s, v0.s[0]\n"
7708       "add v8.4s, v8.4s, v1.4s\n"
7709       "st1 {v8.4s}, [%x[out]]\n"
7710       : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
7711         [out] "+r"(out), [in] "+r"(in)
7712       : [additive_sum_offset] "r"(params.additive_sum_offset),
7713         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
7714       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
7715         "v11", "cc", "memory");
7716 }
7717 
7718 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)7719 inline void Stream<uint8_t, 4, 8, 5, ColumnMajorWithSum>::Pack(
7720     const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
7721 #ifdef DEBUG
7722 #ifdef DEBUG_METAGEMM_VERBOSE
7723   std::cout
7724       << __FILE__ << "(" << __LINE__
7725       << ") ColumnMajorWithSum<uint8_t, 4, 8, 5, ColumnMajorWithSum>::Pack()"
7726       << std::endl
7727       << std::flush;
7728 #endif
7729 #endif
7730   int params_count_copy = params.count;
7731   int params_stride_copy = params.stride;
7732   asm volatile(
7733       "movi v8.8h, #0\n"
7734       "movi v9.8h, #0\n"
7735       "movi v10.8h, #0\n"
7736       "movi v11.8h, #0\n"
7737 
7738       // Reduce count by leftovers.
7739       "subs %x[count], %x[count], #5\n"
7740       "beq 2f\n"
7741 
7742       "1:"
7743       "subs %x[count], %x[count], #8\n"
7744 
7745       // Load Aggregate Store - column major 4x8
7746       "ld1 {v0.s}[0], [%x[in]], %x[stride]\n"
7747       "ld1 {v1.s}[0], [%x[in]], %x[stride]\n"
7748       "ld1 {v2.s}[0], [%x[in]], %x[stride]\n"
7749       "ld1 {v3.s}[0], [%x[in]], %x[stride]\n"
7750       "ld1 {v0.s}[1], [%x[in]], %x[stride]\n"
7751       "ld1 {v1.s}[1], [%x[in]], %x[stride]\n"
7752       "ld1 {v2.s}[1], [%x[in]], %x[stride]\n"
7753       "ld1 {v3.s}[1], [%x[in]], %x[stride]\n"
7754       "prfm pldl1keep, [%x[in]]\n"
7755       "trn1 v4.4h, v0.4h, v2.4h\n"
7756       "trn2 v6.4h, v0.4h, v2.4h\n"
7757       "trn1 v5.4h, v1.4h, v3.4h\n"
7758       "trn2 v7.4h, v1.4h, v3.4h\n"
7759       "trn1 v0.8b, v4.8b, v5.8b\n"
7760       "trn2 v1.8b, v4.8b, v5.8b\n"
7761       "trn1 v2.8b, v6.8b, v7.8b\n"
7762       "trn2 v3.8b, v6.8b, v7.8b\n"
7763       "uaddw v8.8h, v8.8h, v0.8b\n"
7764       "uaddw v9.8h, v9.8h, v1.8b\n"
7765       "uaddw v10.8h, v10.8h, v2.8b\n"
7766       "uaddw v11.8h, v11.8h, v3.8b\n"
7767       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
7768 
7769       "bne 1b\n"
7770 
7771       "2:"
7772 
7773       // Load Aggregate Store - column major 4x5
7774       "movi v0.8b, #0\n"
7775       "movi v1.8b, #0\n"
7776       "movi v2.8b, #0\n"
7777       "movi v3.8b, #0\n"
7778       "ld1 {v0.s}[0], [%x[in]], %x[stride]\n"
7779       "ld1 {v1.s}[0], [%x[in]], %x[stride]\n"
7780       "ld1 {v2.s}[0], [%x[in]], %x[stride]\n"
7781       "ld1 {v3.s}[0], [%x[in]], %x[stride]\n"
7782       "ld1 {v0.s}[1], [%x[in]], %x[stride]\n"
7783       "prfm pldl1keep, [%x[in]]\n"
7784       "trn1 v4.4h, v0.4h, v2.4h\n"
7785       "trn2 v6.4h, v0.4h, v2.4h\n"
7786       "trn1 v5.4h, v1.4h, v3.4h\n"
7787       "trn2 v7.4h, v1.4h, v3.4h\n"
7788       "trn1 v0.8b, v4.8b, v5.8b\n"
7789       "trn2 v1.8b, v4.8b, v5.8b\n"
7790       "trn1 v2.8b, v6.8b, v7.8b\n"
7791       "trn2 v3.8b, v6.8b, v7.8b\n"
7792       "uaddw v8.8h, v8.8h, v0.8b\n"
7793       "uaddw v9.8h, v9.8h, v1.8b\n"
7794       "uaddw v10.8h, v10.8h, v2.8b\n"
7795       "uaddw v11.8h, v11.8h, v3.8b\n"
7796       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
7797 
7798       // Aggregator Reduction.
7799       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
7800       "dup v1.4s, %w[additive_sum_offset]\n"
7801       "uaddlp v8.4s, v8.8h\n"
7802       "uaddlp v9.4s, v9.8h\n"
7803       "uaddlp v10.4s, v10.8h\n"
7804       "uaddlp v11.4s, v11.8h\n"
7805       "addp v8.4s, v8.4s, v9.4s\n"
7806       "addp v10.4s, v10.4s, v11.4s\n"
7807       "addp v8.4s, v8.4s, v10.4s\n"
7808       "mul v8.4s, v8.4s, v0.s[0]\n"
7809       "add v8.4s, v8.4s, v1.4s\n"
7810       "st1 {v8.4s}, [%x[out]]\n"
7811       : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
7812         [out] "+r"(out), [in] "+r"(in)
7813       : [additive_sum_offset] "r"(params.additive_sum_offset),
7814         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
7815       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
7816         "v11", "cc", "memory");
7817 }
7818 
7819 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)7820 inline void Stream<uint8_t, 4, 8, 6, ColumnMajorWithSum>::Pack(
7821     const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
7822 #ifdef DEBUG
7823 #ifdef DEBUG_METAGEMM_VERBOSE
7824   std::cout
7825       << __FILE__ << "(" << __LINE__
7826       << ") ColumnMajorWithSum<uint8_t, 4, 8, 6, ColumnMajorWithSum>::Pack()"
7827       << std::endl
7828       << std::flush;
7829 #endif
7830 #endif
7831   int params_count_copy = params.count;
7832   int params_stride_copy = params.stride;
7833   asm volatile(
7834       "movi v8.8h, #0\n"
7835       "movi v9.8h, #0\n"
7836       "movi v10.8h, #0\n"
7837       "movi v11.8h, #0\n"
7838 
7839       // Reduce count by leftovers.
7840       "subs %x[count], %x[count], #6\n"
7841       "beq 2f\n"
7842 
7843       "1:"
7844       "subs %x[count], %x[count], #8\n"
7845 
7846       // Load Aggregate Store - column major 4x8
7847       "ld1 {v0.s}[0], [%x[in]], %x[stride]\n"
7848       "ld1 {v1.s}[0], [%x[in]], %x[stride]\n"
7849       "ld1 {v2.s}[0], [%x[in]], %x[stride]\n"
7850       "ld1 {v3.s}[0], [%x[in]], %x[stride]\n"
7851       "ld1 {v0.s}[1], [%x[in]], %x[stride]\n"
7852       "ld1 {v1.s}[1], [%x[in]], %x[stride]\n"
7853       "ld1 {v2.s}[1], [%x[in]], %x[stride]\n"
7854       "ld1 {v3.s}[1], [%x[in]], %x[stride]\n"
7855       "prfm pldl1keep, [%x[in]]\n"
7856       "trn1 v4.4h, v0.4h, v2.4h\n"
7857       "trn2 v6.4h, v0.4h, v2.4h\n"
7858       "trn1 v5.4h, v1.4h, v3.4h\n"
7859       "trn2 v7.4h, v1.4h, v3.4h\n"
7860       "trn1 v0.8b, v4.8b, v5.8b\n"
7861       "trn2 v1.8b, v4.8b, v5.8b\n"
7862       "trn1 v2.8b, v6.8b, v7.8b\n"
7863       "trn2 v3.8b, v6.8b, v7.8b\n"
7864       "uaddw v8.8h, v8.8h, v0.8b\n"
7865       "uaddw v9.8h, v9.8h, v1.8b\n"
7866       "uaddw v10.8h, v10.8h, v2.8b\n"
7867       "uaddw v11.8h, v11.8h, v3.8b\n"
7868       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
7869 
7870       "bne 1b\n"
7871 
7872       "2:"
7873 
7874       // Load Aggregate Store - column major 4x6
7875       "movi v0.8b, #0\n"
7876       "movi v1.8b, #0\n"
7877       "movi v2.8b, #0\n"
7878       "movi v3.8b, #0\n"
7879       "ld1 {v0.s}[0], [%x[in]], %x[stride]\n"
7880       "ld1 {v1.s}[0], [%x[in]], %x[stride]\n"
7881       "ld1 {v2.s}[0], [%x[in]], %x[stride]\n"
7882       "ld1 {v3.s}[0], [%x[in]], %x[stride]\n"
7883       "ld1 {v0.s}[1], [%x[in]], %x[stride]\n"
7884       "ld1 {v1.s}[1], [%x[in]], %x[stride]\n"
7885       "prfm pldl1keep, [%x[in]]\n"
7886       "trn1 v4.4h, v0.4h, v2.4h\n"
7887       "trn2 v6.4h, v0.4h, v2.4h\n"
7888       "trn1 v5.4h, v1.4h, v3.4h\n"
7889       "trn2 v7.4h, v1.4h, v3.4h\n"
7890       "trn1 v0.8b, v4.8b, v5.8b\n"
7891       "trn2 v1.8b, v4.8b, v5.8b\n"
7892       "trn1 v2.8b, v6.8b, v7.8b\n"
7893       "trn2 v3.8b, v6.8b, v7.8b\n"
7894       "uaddw v8.8h, v8.8h, v0.8b\n"
7895       "uaddw v9.8h, v9.8h, v1.8b\n"
7896       "uaddw v10.8h, v10.8h, v2.8b\n"
7897       "uaddw v11.8h, v11.8h, v3.8b\n"
7898       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
7899 
7900       // Aggregator Reduction.
7901       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
7902       "dup v1.4s, %w[additive_sum_offset]\n"
7903       "uaddlp v8.4s, v8.8h\n"
7904       "uaddlp v9.4s, v9.8h\n"
7905       "uaddlp v10.4s, v10.8h\n"
7906       "uaddlp v11.4s, v11.8h\n"
7907       "addp v8.4s, v8.4s, v9.4s\n"
7908       "addp v10.4s, v10.4s, v11.4s\n"
7909       "addp v8.4s, v8.4s, v10.4s\n"
7910       "mul v8.4s, v8.4s, v0.s[0]\n"
7911       "add v8.4s, v8.4s, v1.4s\n"
7912       "st1 {v8.4s}, [%x[out]]\n"
7913       : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
7914         [out] "+r"(out), [in] "+r"(in)
7915       : [additive_sum_offset] "r"(params.additive_sum_offset),
7916         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
7917       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
7918         "v11", "cc", "memory");
7919 }
7920 
7921 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)7922 inline void Stream<uint8_t, 4, 8, 7, ColumnMajorWithSum>::Pack(
7923     const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
7924 #ifdef DEBUG
7925 #ifdef DEBUG_METAGEMM_VERBOSE
7926   std::cout
7927       << __FILE__ << "(" << __LINE__
7928       << ") ColumnMajorWithSum<uint8_t, 4, 8, 7, ColumnMajorWithSum>::Pack()"
7929       << std::endl
7930       << std::flush;
7931 #endif
7932 #endif
7933   int params_count_copy = params.count;
7934   int params_stride_copy = params.stride;
7935   asm volatile(
7936       "movi v8.8h, #0\n"
7937       "movi v9.8h, #0\n"
7938       "movi v10.8h, #0\n"
7939       "movi v11.8h, #0\n"
7940 
7941       // Reduce count by leftovers.
7942       "subs %x[count], %x[count], #7\n"
7943       "beq 2f\n"
7944 
7945       "1:"
7946       "subs %x[count], %x[count], #8\n"
7947 
7948       // Load Aggregate Store - column major 4x8
7949       "ld1 {v0.s}[0], [%x[in]], %x[stride]\n"
7950       "ld1 {v1.s}[0], [%x[in]], %x[stride]\n"
7951       "ld1 {v2.s}[0], [%x[in]], %x[stride]\n"
7952       "ld1 {v3.s}[0], [%x[in]], %x[stride]\n"
7953       "ld1 {v0.s}[1], [%x[in]], %x[stride]\n"
7954       "ld1 {v1.s}[1], [%x[in]], %x[stride]\n"
7955       "ld1 {v2.s}[1], [%x[in]], %x[stride]\n"
7956       "ld1 {v3.s}[1], [%x[in]], %x[stride]\n"
7957       "prfm pldl1keep, [%x[in]]\n"
7958       "trn1 v4.4h, v0.4h, v2.4h\n"
7959       "trn2 v6.4h, v0.4h, v2.4h\n"
7960       "trn1 v5.4h, v1.4h, v3.4h\n"
7961       "trn2 v7.4h, v1.4h, v3.4h\n"
7962       "trn1 v0.8b, v4.8b, v5.8b\n"
7963       "trn2 v1.8b, v4.8b, v5.8b\n"
7964       "trn1 v2.8b, v6.8b, v7.8b\n"
7965       "trn2 v3.8b, v6.8b, v7.8b\n"
7966       "uaddw v8.8h, v8.8h, v0.8b\n"
7967       "uaddw v9.8h, v9.8h, v1.8b\n"
7968       "uaddw v10.8h, v10.8h, v2.8b\n"
7969       "uaddw v11.8h, v11.8h, v3.8b\n"
7970       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
7971 
7972       "bne 1b\n"
7973 
7974       "2:"
7975 
7976       // Load Aggregate Store - column major 4x7
7977       "movi v0.8b, #0\n"
7978       "movi v1.8b, #0\n"
7979       "movi v2.8b, #0\n"
7980       "movi v3.8b, #0\n"
7981       "ld1 {v0.s}[0], [%x[in]], %x[stride]\n"
7982       "ld1 {v1.s}[0], [%x[in]], %x[stride]\n"
7983       "ld1 {v2.s}[0], [%x[in]], %x[stride]\n"
7984       "ld1 {v3.s}[0], [%x[in]], %x[stride]\n"
7985       "ld1 {v0.s}[1], [%x[in]], %x[stride]\n"
7986       "ld1 {v1.s}[1], [%x[in]], %x[stride]\n"
7987       "ld1 {v2.s}[1], [%x[in]], %x[stride]\n"
7988       "prfm pldl1keep, [%x[in]]\n"
7989       "trn1 v4.4h, v0.4h, v2.4h\n"
7990       "trn2 v6.4h, v0.4h, v2.4h\n"
7991       "trn1 v5.4h, v1.4h, v3.4h\n"
7992       "trn2 v7.4h, v1.4h, v3.4h\n"
7993       "trn1 v0.8b, v4.8b, v5.8b\n"
7994       "trn2 v1.8b, v4.8b, v5.8b\n"
7995       "trn1 v2.8b, v6.8b, v7.8b\n"
7996       "trn2 v3.8b, v6.8b, v7.8b\n"
7997       "uaddw v8.8h, v8.8h, v0.8b\n"
7998       "uaddw v9.8h, v9.8h, v1.8b\n"
7999       "uaddw v10.8h, v10.8h, v2.8b\n"
8000       "uaddw v11.8h, v11.8h, v3.8b\n"
8001       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
8002 
8003       // Aggregator Reduction.
8004       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
8005       "dup v1.4s, %w[additive_sum_offset]\n"
8006       "uaddlp v8.4s, v8.8h\n"
8007       "uaddlp v9.4s, v9.8h\n"
8008       "uaddlp v10.4s, v10.8h\n"
8009       "uaddlp v11.4s, v11.8h\n"
8010       "addp v8.4s, v8.4s, v9.4s\n"
8011       "addp v10.4s, v10.4s, v11.4s\n"
8012       "addp v8.4s, v8.4s, v10.4s\n"
8013       "mul v8.4s, v8.4s, v0.s[0]\n"
8014       "add v8.4s, v8.4s, v1.4s\n"
8015       "st1 {v8.4s}, [%x[out]]\n"
8016       : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
8017         [out] "+r"(out), [in] "+r"(in)
8018       : [additive_sum_offset] "r"(params.additive_sum_offset),
8019         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
8020       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
8021         "v11", "cc", "memory");
8022 }
8023 
8024 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)8025 inline void Stream<uint8_t, 5, 8, 0, ColumnMajorWithSum>::Pack(
8026     const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
8027 #ifdef DEBUG
8028 #ifdef DEBUG_METAGEMM_VERBOSE
8029   std::cout
8030       << __FILE__ << "(" << __LINE__
8031       << ") ColumnMajorWithSum<uint8_t, 5, 8, 0, ColumnMajorWithSum>::Pack()"
8032       << std::endl
8033       << std::flush;
8034 #endif
8035 #endif
8036   int params_count_copy = params.count;
8037   int params_stride_copy = params.stride;
8038   asm volatile(
8039       "sub %x[stride], %x[stride], #4\n"
8040       "movi v8.8h, #0\n"
8041       "movi v9.8h, #0\n"
8042       "movi v10.8h, #0\n"
8043       "movi v11.8h, #0\n"
8044       "movi v12.8h, #0\n"
8045 
8046       "1:"
8047       "subs %x[count], %x[count], #8\n"
8048 
8049       // Load Aggregate Store - column major 5x8
8050       "ld1 {v0.s}[0], [%x[in]], #4\n"
8051       "ld1 {v4.b}[0], [%x[in]], %x[stride]\n"
8052       "ld1 {v1.s}[0], [%x[in]], #4\n"
8053       "ld1 {v4.b}[1], [%x[in]], %x[stride]\n"
8054       "ld1 {v2.s}[0], [%x[in]], #4\n"
8055       "ld1 {v4.b}[2], [%x[in]], %x[stride]\n"
8056       "ld1 {v3.s}[0], [%x[in]], #4\n"
8057       "ld1 {v4.b}[3], [%x[in]], %x[stride]\n"
8058       "ld1 {v0.s}[1], [%x[in]], #4\n"
8059       "ld1 {v4.b}[4], [%x[in]], %x[stride]\n"
8060       "ld1 {v1.s}[1], [%x[in]], #4\n"
8061       "ld1 {v4.b}[5], [%x[in]], %x[stride]\n"
8062       "ld1 {v2.s}[1], [%x[in]], #4\n"
8063       "ld1 {v4.b}[6], [%x[in]], %x[stride]\n"
8064       "ld1 {v3.s}[1], [%x[in]], #4\n"
8065       "ld1 {v4.b}[7], [%x[in]], %x[stride]\n"
8066       "prfm pldl1keep, [%x[in]]\n"
8067       "trn1 v5.4h, v0.4h, v2.4h\n"
8068       "trn2 v7.4h, v0.4h, v2.4h\n"
8069       "trn1 v6.4h, v1.4h, v3.4h\n"
8070       "trn2 v13.4h, v1.4h, v3.4h\n"
8071       "trn1 v0.8b, v5.8b, v6.8b\n"
8072       "trn2 v1.8b, v5.8b, v6.8b\n"
8073       "trn1 v2.8b, v7.8b, v13.8b\n"
8074       "trn2 v3.8b, v7.8b, v13.8b\n"
8075       "uaddw v8.8h, v8.8h, v0.8b\n"
8076       "uaddw v9.8h, v9.8h, v1.8b\n"
8077       "uaddw v10.8h, v10.8h, v2.8b\n"
8078       "uaddw v11.8h, v11.8h, v3.8b\n"
8079       "uaddw v12.8h, v12.8h, v4.8b\n"
8080       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
8081       "st1 {v4.2s}, [%x[out]], #8\n"
8082 
8083       "bne 1b\n"
8084 
8085       // Aggregator Reduction.
8086       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
8087       "dup v1.4s, %w[additive_sum_offset]\n"
8088       "uaddlp v8.4s, v8.8h\n"
8089       "uaddlp v9.4s, v9.8h\n"
8090       "uaddlp v10.4s, v10.8h\n"
8091       "uaddlp v11.4s, v11.8h\n"
8092       "uaddlp v12.4s, v12.8h\n"
8093       "addp v8.4s, v8.4s, v9.4s\n"
8094       "addp v10.4s, v10.4s, v11.4s\n"
8095       "addp v12.4s, v12.4s, v12.4s\n"
8096       "addp v8.4s, v8.4s, v10.4s\n"
8097       "addp v9.4s, v12.4s, v12.4s\n"
8098       "mul v8.4s, v8.4s, v0.s[0]\n"
8099       "mul v9.4s, v9.4s, v0.s[0]\n"
8100       "add v8.4s, v8.4s, v1.4s\n"
8101       "add v9.4s, v9.4s, v1.4s\n"
8102       "st1 {v8.4s, v9.4s}, [%x[out]]\n"
8103       : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
8104         [out] "+r"(out), [in] "+r"(in)
8105       : [additive_sum_offset] "r"(params.additive_sum_offset),
8106         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
8107       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
8108         "v11", "v12", "v13", "cc", "memory");
8109 }
8110 
8111 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)8112 inline void Stream<uint8_t, 5, 8, 1, ColumnMajorWithSum>::Pack(
8113     const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
8114 #ifdef DEBUG
8115 #ifdef DEBUG_METAGEMM_VERBOSE
8116   std::cout
8117       << __FILE__ << "(" << __LINE__
8118       << ") ColumnMajorWithSum<uint8_t, 5, 8, 1, ColumnMajorWithSum>::Pack()"
8119       << std::endl
8120       << std::flush;
8121 #endif
8122 #endif
8123   int params_count_copy = params.count;
8124   int params_stride_copy = params.stride;
8125   asm volatile(
8126       "sub %x[stride], %x[stride], #4\n"
8127       "movi v8.8h, #0\n"
8128       "movi v9.8h, #0\n"
8129       "movi v10.8h, #0\n"
8130       "movi v11.8h, #0\n"
8131       "movi v12.8h, #0\n"
8132 
8133       // Reduce count by leftovers.
8134       "subs %x[count], %x[count], #1\n"
8135       "beq 2f\n"
8136 
8137       "1:"
8138       "subs %x[count], %x[count], #8\n"
8139 
8140       // Load Aggregate Store - column major 5x8
8141       "ld1 {v0.s}[0], [%x[in]], #4\n"
8142       "ld1 {v4.b}[0], [%x[in]], %x[stride]\n"
8143       "ld1 {v1.s}[0], [%x[in]], #4\n"
8144       "ld1 {v4.b}[1], [%x[in]], %x[stride]\n"
8145       "ld1 {v2.s}[0], [%x[in]], #4\n"
8146       "ld1 {v4.b}[2], [%x[in]], %x[stride]\n"
8147       "ld1 {v3.s}[0], [%x[in]], #4\n"
8148       "ld1 {v4.b}[3], [%x[in]], %x[stride]\n"
8149       "ld1 {v0.s}[1], [%x[in]], #4\n"
8150       "ld1 {v4.b}[4], [%x[in]], %x[stride]\n"
8151       "ld1 {v1.s}[1], [%x[in]], #4\n"
8152       "ld1 {v4.b}[5], [%x[in]], %x[stride]\n"
8153       "ld1 {v2.s}[1], [%x[in]], #4\n"
8154       "ld1 {v4.b}[6], [%x[in]], %x[stride]\n"
8155       "ld1 {v3.s}[1], [%x[in]], #4\n"
8156       "ld1 {v4.b}[7], [%x[in]], %x[stride]\n"
8157       "prfm pldl1keep, [%x[in]]\n"
8158       "trn1 v5.4h, v0.4h, v2.4h\n"
8159       "trn2 v7.4h, v0.4h, v2.4h\n"
8160       "trn1 v6.4h, v1.4h, v3.4h\n"
8161       "trn2 v13.4h, v1.4h, v3.4h\n"
8162       "trn1 v0.8b, v5.8b, v6.8b\n"
8163       "trn2 v1.8b, v5.8b, v6.8b\n"
8164       "trn1 v2.8b, v7.8b, v13.8b\n"
8165       "trn2 v3.8b, v7.8b, v13.8b\n"
8166       "uaddw v8.8h, v8.8h, v0.8b\n"
8167       "uaddw v9.8h, v9.8h, v1.8b\n"
8168       "uaddw v10.8h, v10.8h, v2.8b\n"
8169       "uaddw v11.8h, v11.8h, v3.8b\n"
8170       "uaddw v12.8h, v12.8h, v4.8b\n"
8171       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
8172       "st1 {v4.2s}, [%x[out]], #8\n"
8173 
8174       "bne 1b\n"
8175 
8176       "2:"
8177 
8178       // Load Aggregate Store - column major 5x1
8179       "movi v0.8b, #0\n"
8180       "movi v1.8b, #0\n"
8181       "movi v2.8b, #0\n"
8182       "movi v3.8b, #0\n"
8183       "movi v4.8b, #0\n"
8184       "ld1 {v0.s}[0], [%x[in]], #4\n"
8185       "ld1 {v4.b}[0], [%x[in]], %x[stride]\n"
8186       "prfm pldl1keep, [%x[in]]\n"
8187       "trn1 v5.4h, v0.4h, v2.4h\n"
8188       "trn2 v7.4h, v0.4h, v2.4h\n"
8189       "trn1 v6.4h, v1.4h, v3.4h\n"
8190       "trn2 v13.4h, v1.4h, v3.4h\n"
8191       "trn1 v0.8b, v5.8b, v6.8b\n"
8192       "trn2 v1.8b, v5.8b, v6.8b\n"
8193       "trn1 v2.8b, v7.8b, v13.8b\n"
8194       "trn2 v3.8b, v7.8b, v13.8b\n"
8195       "uaddw v8.8h, v8.8h, v0.8b\n"
8196       "uaddw v9.8h, v9.8h, v1.8b\n"
8197       "uaddw v10.8h, v10.8h, v2.8b\n"
8198       "uaddw v11.8h, v11.8h, v3.8b\n"
8199       "uaddw v12.8h, v12.8h, v4.8b\n"
8200       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
8201       "st1 {v4.2s}, [%x[out]], #8\n"
8202 
8203       // Aggregator Reduction.
8204       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
8205       "dup v1.4s, %w[additive_sum_offset]\n"
8206       "uaddlp v8.4s, v8.8h\n"
8207       "uaddlp v9.4s, v9.8h\n"
8208       "uaddlp v10.4s, v10.8h\n"
8209       "uaddlp v11.4s, v11.8h\n"
8210       "uaddlp v12.4s, v12.8h\n"
8211       "addp v8.4s, v8.4s, v9.4s\n"
8212       "addp v10.4s, v10.4s, v11.4s\n"
8213       "addp v12.4s, v12.4s, v12.4s\n"
8214       "addp v8.4s, v8.4s, v10.4s\n"
8215       "addp v9.4s, v12.4s, v12.4s\n"
8216       "mul v8.4s, v8.4s, v0.s[0]\n"
8217       "mul v9.4s, v9.4s, v0.s[0]\n"
8218       "add v8.4s, v8.4s, v1.4s\n"
8219       "add v9.4s, v9.4s, v1.4s\n"
8220       "st1 {v8.4s, v9.4s}, [%x[out]]\n"
8221       : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
8222         [out] "+r"(out), [in] "+r"(in)
8223       : [additive_sum_offset] "r"(params.additive_sum_offset),
8224         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
8225       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
8226         "v11", "v12", "v13", "cc", "memory");
8227 }
8228 
8229 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)8230 inline void Stream<uint8_t, 5, 8, 2, ColumnMajorWithSum>::Pack(
8231     const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
8232 #ifdef DEBUG
8233 #ifdef DEBUG_METAGEMM_VERBOSE
8234   std::cout
8235       << __FILE__ << "(" << __LINE__
8236       << ") ColumnMajorWithSum<uint8_t, 5, 8, 2, ColumnMajorWithSum>::Pack()"
8237       << std::endl
8238       << std::flush;
8239 #endif
8240 #endif
8241   int params_count_copy = params.count;
8242   int params_stride_copy = params.stride;
8243   asm volatile(
8244       "sub %x[stride], %x[stride], #4\n"
8245       "movi v8.8h, #0\n"
8246       "movi v9.8h, #0\n"
8247       "movi v10.8h, #0\n"
8248       "movi v11.8h, #0\n"
8249       "movi v12.8h, #0\n"
8250 
8251       // Reduce count by leftovers.
8252       "subs %x[count], %x[count], #2\n"
8253       "beq 2f\n"
8254 
8255       "1:"
8256       "subs %x[count], %x[count], #8\n"
8257 
8258       // Load Aggregate Store - column major 5x8
8259       "ld1 {v0.s}[0], [%x[in]], #4\n"
8260       "ld1 {v4.b}[0], [%x[in]], %x[stride]\n"
8261       "ld1 {v1.s}[0], [%x[in]], #4\n"
8262       "ld1 {v4.b}[1], [%x[in]], %x[stride]\n"
8263       "ld1 {v2.s}[0], [%x[in]], #4\n"
8264       "ld1 {v4.b}[2], [%x[in]], %x[stride]\n"
8265       "ld1 {v3.s}[0], [%x[in]], #4\n"
8266       "ld1 {v4.b}[3], [%x[in]], %x[stride]\n"
8267       "ld1 {v0.s}[1], [%x[in]], #4\n"
8268       "ld1 {v4.b}[4], [%x[in]], %x[stride]\n"
8269       "ld1 {v1.s}[1], [%x[in]], #4\n"
8270       "ld1 {v4.b}[5], [%x[in]], %x[stride]\n"
8271       "ld1 {v2.s}[1], [%x[in]], #4\n"
8272       "ld1 {v4.b}[6], [%x[in]], %x[stride]\n"
8273       "ld1 {v3.s}[1], [%x[in]], #4\n"
8274       "ld1 {v4.b}[7], [%x[in]], %x[stride]\n"
8275       "prfm pldl1keep, [%x[in]]\n"
8276       "trn1 v5.4h, v0.4h, v2.4h\n"
8277       "trn2 v7.4h, v0.4h, v2.4h\n"
8278       "trn1 v6.4h, v1.4h, v3.4h\n"
8279       "trn2 v13.4h, v1.4h, v3.4h\n"
8280       "trn1 v0.8b, v5.8b, v6.8b\n"
8281       "trn2 v1.8b, v5.8b, v6.8b\n"
8282       "trn1 v2.8b, v7.8b, v13.8b\n"
8283       "trn2 v3.8b, v7.8b, v13.8b\n"
8284       "uaddw v8.8h, v8.8h, v0.8b\n"
8285       "uaddw v9.8h, v9.8h, v1.8b\n"
8286       "uaddw v10.8h, v10.8h, v2.8b\n"
8287       "uaddw v11.8h, v11.8h, v3.8b\n"
8288       "uaddw v12.8h, v12.8h, v4.8b\n"
8289       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
8290       "st1 {v4.2s}, [%x[out]], #8\n"
8291 
8292       "bne 1b\n"
8293 
8294       "2:"
8295 
8296       // Load Aggregate Store - column major 5x2
8297       "movi v0.8b, #0\n"
8298       "movi v1.8b, #0\n"
8299       "movi v2.8b, #0\n"
8300       "movi v3.8b, #0\n"
8301       "movi v4.8b, #0\n"
8302       "ld1 {v0.s}[0], [%x[in]], #4\n"
8303       "ld1 {v4.b}[0], [%x[in]], %x[stride]\n"
8304       "ld1 {v1.s}[0], [%x[in]], #4\n"
8305       "ld1 {v4.b}[1], [%x[in]], %x[stride]\n"
8306       "prfm pldl1keep, [%x[in]]\n"
8307       "trn1 v5.4h, v0.4h, v2.4h\n"
8308       "trn2 v7.4h, v0.4h, v2.4h\n"
8309       "trn1 v6.4h, v1.4h, v3.4h\n"
8310       "trn2 v13.4h, v1.4h, v3.4h\n"
8311       "trn1 v0.8b, v5.8b, v6.8b\n"
8312       "trn2 v1.8b, v5.8b, v6.8b\n"
8313       "trn1 v2.8b, v7.8b, v13.8b\n"
8314       "trn2 v3.8b, v7.8b, v13.8b\n"
8315       "uaddw v8.8h, v8.8h, v0.8b\n"
8316       "uaddw v9.8h, v9.8h, v1.8b\n"
8317       "uaddw v10.8h, v10.8h, v2.8b\n"
8318       "uaddw v11.8h, v11.8h, v3.8b\n"
8319       "uaddw v12.8h, v12.8h, v4.8b\n"
8320       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
8321       "st1 {v4.2s}, [%x[out]], #8\n"
8322 
8323       // Aggregator Reduction.
8324       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
8325       "dup v1.4s, %w[additive_sum_offset]\n"
8326       "uaddlp v8.4s, v8.8h\n"
8327       "uaddlp v9.4s, v9.8h\n"
8328       "uaddlp v10.4s, v10.8h\n"
8329       "uaddlp v11.4s, v11.8h\n"
8330       "uaddlp v12.4s, v12.8h\n"
8331       "addp v8.4s, v8.4s, v9.4s\n"
8332       "addp v10.4s, v10.4s, v11.4s\n"
8333       "addp v12.4s, v12.4s, v12.4s\n"
8334       "addp v8.4s, v8.4s, v10.4s\n"
8335       "addp v9.4s, v12.4s, v12.4s\n"
8336       "mul v8.4s, v8.4s, v0.s[0]\n"
8337       "mul v9.4s, v9.4s, v0.s[0]\n"
8338       "add v8.4s, v8.4s, v1.4s\n"
8339       "add v9.4s, v9.4s, v1.4s\n"
8340       "st1 {v8.4s, v9.4s}, [%x[out]]\n"
8341       : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
8342         [out] "+r"(out), [in] "+r"(in)
8343       : [additive_sum_offset] "r"(params.additive_sum_offset),
8344         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
8345       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
8346         "v11", "v12", "v13", "cc", "memory");
8347 }
8348 
8349 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)8350 inline void Stream<uint8_t, 5, 8, 3, ColumnMajorWithSum>::Pack(
8351     const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
8352 #ifdef DEBUG
8353 #ifdef DEBUG_METAGEMM_VERBOSE
8354   std::cout
8355       << __FILE__ << "(" << __LINE__
8356       << ") ColumnMajorWithSum<uint8_t, 5, 8, 3, ColumnMajorWithSum>::Pack()"
8357       << std::endl
8358       << std::flush;
8359 #endif
8360 #endif
8361   int params_count_copy = params.count;
8362   int params_stride_copy = params.stride;
8363   asm volatile(
8364       "sub %x[stride], %x[stride], #4\n"
8365       "movi v8.8h, #0\n"
8366       "movi v9.8h, #0\n"
8367       "movi v10.8h, #0\n"
8368       "movi v11.8h, #0\n"
8369       "movi v12.8h, #0\n"
8370 
8371       // Reduce count by leftovers.
8372       "subs %x[count], %x[count], #3\n"
8373       "beq 2f\n"
8374 
8375       "1:"
8376       "subs %x[count], %x[count], #8\n"
8377 
8378       // Load Aggregate Store - column major 5x8
8379       "ld1 {v0.s}[0], [%x[in]], #4\n"
8380       "ld1 {v4.b}[0], [%x[in]], %x[stride]\n"
8381       "ld1 {v1.s}[0], [%x[in]], #4\n"
8382       "ld1 {v4.b}[1], [%x[in]], %x[stride]\n"
8383       "ld1 {v2.s}[0], [%x[in]], #4\n"
8384       "ld1 {v4.b}[2], [%x[in]], %x[stride]\n"
8385       "ld1 {v3.s}[0], [%x[in]], #4\n"
8386       "ld1 {v4.b}[3], [%x[in]], %x[stride]\n"
8387       "ld1 {v0.s}[1], [%x[in]], #4\n"
8388       "ld1 {v4.b}[4], [%x[in]], %x[stride]\n"
8389       "ld1 {v1.s}[1], [%x[in]], #4\n"
8390       "ld1 {v4.b}[5], [%x[in]], %x[stride]\n"
8391       "ld1 {v2.s}[1], [%x[in]], #4\n"
8392       "ld1 {v4.b}[6], [%x[in]], %x[stride]\n"
8393       "ld1 {v3.s}[1], [%x[in]], #4\n"
8394       "ld1 {v4.b}[7], [%x[in]], %x[stride]\n"
8395       "prfm pldl1keep, [%x[in]]\n"
8396       "trn1 v5.4h, v0.4h, v2.4h\n"
8397       "trn2 v7.4h, v0.4h, v2.4h\n"
8398       "trn1 v6.4h, v1.4h, v3.4h\n"
8399       "trn2 v13.4h, v1.4h, v3.4h\n"
8400       "trn1 v0.8b, v5.8b, v6.8b\n"
8401       "trn2 v1.8b, v5.8b, v6.8b\n"
8402       "trn1 v2.8b, v7.8b, v13.8b\n"
8403       "trn2 v3.8b, v7.8b, v13.8b\n"
8404       "uaddw v8.8h, v8.8h, v0.8b\n"
8405       "uaddw v9.8h, v9.8h, v1.8b\n"
8406       "uaddw v10.8h, v10.8h, v2.8b\n"
8407       "uaddw v11.8h, v11.8h, v3.8b\n"
8408       "uaddw v12.8h, v12.8h, v4.8b\n"
8409       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
8410       "st1 {v4.2s}, [%x[out]], #8\n"
8411 
8412       "bne 1b\n"
8413 
8414       "2:"
8415 
8416       // Load Aggregate Store - column major 5x3
8417       "movi v0.8b, #0\n"
8418       "movi v1.8b, #0\n"
8419       "movi v2.8b, #0\n"
8420       "movi v3.8b, #0\n"
8421       "movi v4.8b, #0\n"
8422       "ld1 {v0.s}[0], [%x[in]], #4\n"
8423       "ld1 {v4.b}[0], [%x[in]], %x[stride]\n"
8424       "ld1 {v1.s}[0], [%x[in]], #4\n"
8425       "ld1 {v4.b}[1], [%x[in]], %x[stride]\n"
8426       "ld1 {v2.s}[0], [%x[in]], #4\n"
8427       "ld1 {v4.b}[2], [%x[in]], %x[stride]\n"
8428       "prfm pldl1keep, [%x[in]]\n"
8429       "trn1 v5.4h, v0.4h, v2.4h\n"
8430       "trn2 v7.4h, v0.4h, v2.4h\n"
8431       "trn1 v6.4h, v1.4h, v3.4h\n"
8432       "trn2 v13.4h, v1.4h, v3.4h\n"
8433       "trn1 v0.8b, v5.8b, v6.8b\n"
8434       "trn2 v1.8b, v5.8b, v6.8b\n"
8435       "trn1 v2.8b, v7.8b, v13.8b\n"
8436       "trn2 v3.8b, v7.8b, v13.8b\n"
8437       "uaddw v8.8h, v8.8h, v0.8b\n"
8438       "uaddw v9.8h, v9.8h, v1.8b\n"
8439       "uaddw v10.8h, v10.8h, v2.8b\n"
8440       "uaddw v11.8h, v11.8h, v3.8b\n"
8441       "uaddw v12.8h, v12.8h, v4.8b\n"
8442       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
8443       "st1 {v4.2s}, [%x[out]], #8\n"
8444 
8445       // Aggregator Reduction.
8446       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
8447       "dup v1.4s, %w[additive_sum_offset]\n"
8448       "uaddlp v8.4s, v8.8h\n"
8449       "uaddlp v9.4s, v9.8h\n"
8450       "uaddlp v10.4s, v10.8h\n"
8451       "uaddlp v11.4s, v11.8h\n"
8452       "uaddlp v12.4s, v12.8h\n"
8453       "addp v8.4s, v8.4s, v9.4s\n"
8454       "addp v10.4s, v10.4s, v11.4s\n"
8455       "addp v12.4s, v12.4s, v12.4s\n"
8456       "addp v8.4s, v8.4s, v10.4s\n"
8457       "addp v9.4s, v12.4s, v12.4s\n"
8458       "mul v8.4s, v8.4s, v0.s[0]\n"
8459       "mul v9.4s, v9.4s, v0.s[0]\n"
8460       "add v8.4s, v8.4s, v1.4s\n"
8461       "add v9.4s, v9.4s, v1.4s\n"
8462       "st1 {v8.4s, v9.4s}, [%x[out]]\n"
8463       : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
8464         [out] "+r"(out), [in] "+r"(in)
8465       : [additive_sum_offset] "r"(params.additive_sum_offset),
8466         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
8467       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
8468         "v11", "v12", "v13", "cc", "memory");
8469 }
8470 
8471 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)8472 inline void Stream<uint8_t, 5, 8, 4, ColumnMajorWithSum>::Pack(
8473     const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
8474 #ifdef DEBUG
8475 #ifdef DEBUG_METAGEMM_VERBOSE
8476   std::cout
8477       << __FILE__ << "(" << __LINE__
8478       << ") ColumnMajorWithSum<uint8_t, 5, 8, 4, ColumnMajorWithSum>::Pack()"
8479       << std::endl
8480       << std::flush;
8481 #endif
8482 #endif
8483   int params_count_copy = params.count;
8484   int params_stride_copy = params.stride;
8485   asm volatile(
8486       "sub %x[stride], %x[stride], #4\n"
8487       "movi v8.8h, #0\n"
8488       "movi v9.8h, #0\n"
8489       "movi v10.8h, #0\n"
8490       "movi v11.8h, #0\n"
8491       "movi v12.8h, #0\n"
8492 
8493       // Reduce count by leftovers.
8494       "subs %x[count], %x[count], #4\n"
8495       "beq 2f\n"
8496 
8497       "1:"
8498       "subs %x[count], %x[count], #8\n"
8499 
8500       // Load Aggregate Store - column major 5x8
8501       "ld1 {v0.s}[0], [%x[in]], #4\n"
8502       "ld1 {v4.b}[0], [%x[in]], %x[stride]\n"
8503       "ld1 {v1.s}[0], [%x[in]], #4\n"
8504       "ld1 {v4.b}[1], [%x[in]], %x[stride]\n"
8505       "ld1 {v2.s}[0], [%x[in]], #4\n"
8506       "ld1 {v4.b}[2], [%x[in]], %x[stride]\n"
8507       "ld1 {v3.s}[0], [%x[in]], #4\n"
8508       "ld1 {v4.b}[3], [%x[in]], %x[stride]\n"
8509       "ld1 {v0.s}[1], [%x[in]], #4\n"
8510       "ld1 {v4.b}[4], [%x[in]], %x[stride]\n"
8511       "ld1 {v1.s}[1], [%x[in]], #4\n"
8512       "ld1 {v4.b}[5], [%x[in]], %x[stride]\n"
8513       "ld1 {v2.s}[1], [%x[in]], #4\n"
8514       "ld1 {v4.b}[6], [%x[in]], %x[stride]\n"
8515       "ld1 {v3.s}[1], [%x[in]], #4\n"
8516       "ld1 {v4.b}[7], [%x[in]], %x[stride]\n"
8517       "prfm pldl1keep, [%x[in]]\n"
8518       "trn1 v5.4h, v0.4h, v2.4h\n"
8519       "trn2 v7.4h, v0.4h, v2.4h\n"
8520       "trn1 v6.4h, v1.4h, v3.4h\n"
8521       "trn2 v13.4h, v1.4h, v3.4h\n"
8522       "trn1 v0.8b, v5.8b, v6.8b\n"
8523       "trn2 v1.8b, v5.8b, v6.8b\n"
8524       "trn1 v2.8b, v7.8b, v13.8b\n"
8525       "trn2 v3.8b, v7.8b, v13.8b\n"
8526       "uaddw v8.8h, v8.8h, v0.8b\n"
8527       "uaddw v9.8h, v9.8h, v1.8b\n"
8528       "uaddw v10.8h, v10.8h, v2.8b\n"
8529       "uaddw v11.8h, v11.8h, v3.8b\n"
8530       "uaddw v12.8h, v12.8h, v4.8b\n"
8531       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
8532       "st1 {v4.2s}, [%x[out]], #8\n"
8533 
8534       "bne 1b\n"
8535 
8536       "2:"
8537 
8538       // Load Aggregate Store - column major 5x4
8539       "movi v0.8b, #0\n"
8540       "movi v1.8b, #0\n"
8541       "movi v2.8b, #0\n"
8542       "movi v3.8b, #0\n"
8543       "movi v4.8b, #0\n"
8544       "ld1 {v0.s}[0], [%x[in]], #4\n"
8545       "ld1 {v4.b}[0], [%x[in]], %x[stride]\n"
8546       "ld1 {v1.s}[0], [%x[in]], #4\n"
8547       "ld1 {v4.b}[1], [%x[in]], %x[stride]\n"
8548       "ld1 {v2.s}[0], [%x[in]], #4\n"
8549       "ld1 {v4.b}[2], [%x[in]], %x[stride]\n"
8550       "ld1 {v3.s}[0], [%x[in]], #4\n"
8551       "ld1 {v4.b}[3], [%x[in]], %x[stride]\n"
8552       "prfm pldl1keep, [%x[in]]\n"
8553       "trn1 v5.4h, v0.4h, v2.4h\n"
8554       "trn2 v7.4h, v0.4h, v2.4h\n"
8555       "trn1 v6.4h, v1.4h, v3.4h\n"
8556       "trn2 v13.4h, v1.4h, v3.4h\n"
8557       "trn1 v0.8b, v5.8b, v6.8b\n"
8558       "trn2 v1.8b, v5.8b, v6.8b\n"
8559       "trn1 v2.8b, v7.8b, v13.8b\n"
8560       "trn2 v3.8b, v7.8b, v13.8b\n"
8561       "uaddw v8.8h, v8.8h, v0.8b\n"
8562       "uaddw v9.8h, v9.8h, v1.8b\n"
8563       "uaddw v10.8h, v10.8h, v2.8b\n"
8564       "uaddw v11.8h, v11.8h, v3.8b\n"
8565       "uaddw v12.8h, v12.8h, v4.8b\n"
8566       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
8567       "st1 {v4.2s}, [%x[out]], #8\n"
8568 
8569       // Aggregator Reduction.
8570       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
8571       "dup v1.4s, %w[additive_sum_offset]\n"
8572       "uaddlp v8.4s, v8.8h\n"
8573       "uaddlp v9.4s, v9.8h\n"
8574       "uaddlp v10.4s, v10.8h\n"
8575       "uaddlp v11.4s, v11.8h\n"
8576       "uaddlp v12.4s, v12.8h\n"
8577       "addp v8.4s, v8.4s, v9.4s\n"
8578       "addp v10.4s, v10.4s, v11.4s\n"
8579       "addp v12.4s, v12.4s, v12.4s\n"
8580       "addp v8.4s, v8.4s, v10.4s\n"
8581       "addp v9.4s, v12.4s, v12.4s\n"
8582       "mul v8.4s, v8.4s, v0.s[0]\n"
8583       "mul v9.4s, v9.4s, v0.s[0]\n"
8584       "add v8.4s, v8.4s, v1.4s\n"
8585       "add v9.4s, v9.4s, v1.4s\n"
8586       "st1 {v8.4s, v9.4s}, [%x[out]]\n"
8587       : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
8588         [out] "+r"(out), [in] "+r"(in)
8589       : [additive_sum_offset] "r"(params.additive_sum_offset),
8590         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
8591       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
8592         "v11", "v12", "v13", "cc", "memory");
8593 }
8594 
8595 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)8596 inline void Stream<uint8_t, 5, 8, 5, ColumnMajorWithSum>::Pack(
8597     const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
8598 #ifdef DEBUG
8599 #ifdef DEBUG_METAGEMM_VERBOSE
8600   std::cout
8601       << __FILE__ << "(" << __LINE__
8602       << ") ColumnMajorWithSum<uint8_t, 5, 8, 5, ColumnMajorWithSum>::Pack()"
8603       << std::endl
8604       << std::flush;
8605 #endif
8606 #endif
8607   int params_count_copy = params.count;
8608   int params_stride_copy = params.stride;
8609   asm volatile(
8610       "sub %x[stride], %x[stride], #4\n"
8611       "movi v8.8h, #0\n"
8612       "movi v9.8h, #0\n"
8613       "movi v10.8h, #0\n"
8614       "movi v11.8h, #0\n"
8615       "movi v12.8h, #0\n"
8616 
8617       // Reduce count by leftovers.
8618       "subs %x[count], %x[count], #5\n"
8619       "beq 2f\n"
8620 
8621       "1:"
8622       "subs %x[count], %x[count], #8\n"
8623 
8624       // Load Aggregate Store - column major 5x8
8625       "ld1 {v0.s}[0], [%x[in]], #4\n"
8626       "ld1 {v4.b}[0], [%x[in]], %x[stride]\n"
8627       "ld1 {v1.s}[0], [%x[in]], #4\n"
8628       "ld1 {v4.b}[1], [%x[in]], %x[stride]\n"
8629       "ld1 {v2.s}[0], [%x[in]], #4\n"
8630       "ld1 {v4.b}[2], [%x[in]], %x[stride]\n"
8631       "ld1 {v3.s}[0], [%x[in]], #4\n"
8632       "ld1 {v4.b}[3], [%x[in]], %x[stride]\n"
8633       "ld1 {v0.s}[1], [%x[in]], #4\n"
8634       "ld1 {v4.b}[4], [%x[in]], %x[stride]\n"
8635       "ld1 {v1.s}[1], [%x[in]], #4\n"
8636       "ld1 {v4.b}[5], [%x[in]], %x[stride]\n"
8637       "ld1 {v2.s}[1], [%x[in]], #4\n"
8638       "ld1 {v4.b}[6], [%x[in]], %x[stride]\n"
8639       "ld1 {v3.s}[1], [%x[in]], #4\n"
8640       "ld1 {v4.b}[7], [%x[in]], %x[stride]\n"
8641       "prfm pldl1keep, [%x[in]]\n"
8642       "trn1 v5.4h, v0.4h, v2.4h\n"
8643       "trn2 v7.4h, v0.4h, v2.4h\n"
8644       "trn1 v6.4h, v1.4h, v3.4h\n"
8645       "trn2 v13.4h, v1.4h, v3.4h\n"
8646       "trn1 v0.8b, v5.8b, v6.8b\n"
8647       "trn2 v1.8b, v5.8b, v6.8b\n"
8648       "trn1 v2.8b, v7.8b, v13.8b\n"
8649       "trn2 v3.8b, v7.8b, v13.8b\n"
8650       "uaddw v8.8h, v8.8h, v0.8b\n"
8651       "uaddw v9.8h, v9.8h, v1.8b\n"
8652       "uaddw v10.8h, v10.8h, v2.8b\n"
8653       "uaddw v11.8h, v11.8h, v3.8b\n"
8654       "uaddw v12.8h, v12.8h, v4.8b\n"
8655       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
8656       "st1 {v4.2s}, [%x[out]], #8\n"
8657 
8658       "bne 1b\n"
8659 
8660       "2:"
8661 
8662       // Load Aggregate Store - column major 5x5
8663       "movi v0.8b, #0\n"
8664       "movi v1.8b, #0\n"
8665       "movi v2.8b, #0\n"
8666       "movi v3.8b, #0\n"
8667       "movi v4.8b, #0\n"
8668       "ld1 {v0.s}[0], [%x[in]], #4\n"
8669       "ld1 {v4.b}[0], [%x[in]], %x[stride]\n"
8670       "ld1 {v1.s}[0], [%x[in]], #4\n"
8671       "ld1 {v4.b}[1], [%x[in]], %x[stride]\n"
8672       "ld1 {v2.s}[0], [%x[in]], #4\n"
8673       "ld1 {v4.b}[2], [%x[in]], %x[stride]\n"
8674       "ld1 {v3.s}[0], [%x[in]], #4\n"
8675       "ld1 {v4.b}[3], [%x[in]], %x[stride]\n"
8676       "ld1 {v0.s}[1], [%x[in]], #4\n"
8677       "ld1 {v4.b}[4], [%x[in]], %x[stride]\n"
8678       "prfm pldl1keep, [%x[in]]\n"
8679       "trn1 v5.4h, v0.4h, v2.4h\n"
8680       "trn2 v7.4h, v0.4h, v2.4h\n"
8681       "trn1 v6.4h, v1.4h, v3.4h\n"
8682       "trn2 v13.4h, v1.4h, v3.4h\n"
8683       "trn1 v0.8b, v5.8b, v6.8b\n"
8684       "trn2 v1.8b, v5.8b, v6.8b\n"
8685       "trn1 v2.8b, v7.8b, v13.8b\n"
8686       "trn2 v3.8b, v7.8b, v13.8b\n"
8687       "uaddw v8.8h, v8.8h, v0.8b\n"
8688       "uaddw v9.8h, v9.8h, v1.8b\n"
8689       "uaddw v10.8h, v10.8h, v2.8b\n"
8690       "uaddw v11.8h, v11.8h, v3.8b\n"
8691       "uaddw v12.8h, v12.8h, v4.8b\n"
8692       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
8693       "st1 {v4.2s}, [%x[out]], #8\n"
8694 
8695       // Aggregator Reduction.
8696       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
8697       "dup v1.4s, %w[additive_sum_offset]\n"
8698       "uaddlp v8.4s, v8.8h\n"
8699       "uaddlp v9.4s, v9.8h\n"
8700       "uaddlp v10.4s, v10.8h\n"
8701       "uaddlp v11.4s, v11.8h\n"
8702       "uaddlp v12.4s, v12.8h\n"
8703       "addp v8.4s, v8.4s, v9.4s\n"
8704       "addp v10.4s, v10.4s, v11.4s\n"
8705       "addp v12.4s, v12.4s, v12.4s\n"
8706       "addp v8.4s, v8.4s, v10.4s\n"
8707       "addp v9.4s, v12.4s, v12.4s\n"
8708       "mul v8.4s, v8.4s, v0.s[0]\n"
8709       "mul v9.4s, v9.4s, v0.s[0]\n"
8710       "add v8.4s, v8.4s, v1.4s\n"
8711       "add v9.4s, v9.4s, v1.4s\n"
8712       "st1 {v8.4s, v9.4s}, [%x[out]]\n"
8713       : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
8714         [out] "+r"(out), [in] "+r"(in)
8715       : [additive_sum_offset] "r"(params.additive_sum_offset),
8716         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
8717       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
8718         "v11", "v12", "v13", "cc", "memory");
8719 }
8720 
8721 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)8722 inline void Stream<uint8_t, 5, 8, 6, ColumnMajorWithSum>::Pack(
8723     const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
8724 #ifdef DEBUG
8725 #ifdef DEBUG_METAGEMM_VERBOSE
8726   std::cout
8727       << __FILE__ << "(" << __LINE__
8728       << ") ColumnMajorWithSum<uint8_t, 5, 8, 6, ColumnMajorWithSum>::Pack()"
8729       << std::endl
8730       << std::flush;
8731 #endif
8732 #endif
8733   int params_count_copy = params.count;
8734   int params_stride_copy = params.stride;
8735   asm volatile(
8736       "sub %x[stride], %x[stride], #4\n"
8737       "movi v8.8h, #0\n"
8738       "movi v9.8h, #0\n"
8739       "movi v10.8h, #0\n"
8740       "movi v11.8h, #0\n"
8741       "movi v12.8h, #0\n"
8742 
8743       // Reduce count by leftovers.
8744       "subs %x[count], %x[count], #6\n"
8745       "beq 2f\n"
8746 
8747       "1:"
8748       "subs %x[count], %x[count], #8\n"
8749 
8750       // Load Aggregate Store - column major 5x8
8751       "ld1 {v0.s}[0], [%x[in]], #4\n"
8752       "ld1 {v4.b}[0], [%x[in]], %x[stride]\n"
8753       "ld1 {v1.s}[0], [%x[in]], #4\n"
8754       "ld1 {v4.b}[1], [%x[in]], %x[stride]\n"
8755       "ld1 {v2.s}[0], [%x[in]], #4\n"
8756       "ld1 {v4.b}[2], [%x[in]], %x[stride]\n"
8757       "ld1 {v3.s}[0], [%x[in]], #4\n"
8758       "ld1 {v4.b}[3], [%x[in]], %x[stride]\n"
8759       "ld1 {v0.s}[1], [%x[in]], #4\n"
8760       "ld1 {v4.b}[4], [%x[in]], %x[stride]\n"
8761       "ld1 {v1.s}[1], [%x[in]], #4\n"
8762       "ld1 {v4.b}[5], [%x[in]], %x[stride]\n"
8763       "ld1 {v2.s}[1], [%x[in]], #4\n"
8764       "ld1 {v4.b}[6], [%x[in]], %x[stride]\n"
8765       "ld1 {v3.s}[1], [%x[in]], #4\n"
8766       "ld1 {v4.b}[7], [%x[in]], %x[stride]\n"
8767       "prfm pldl1keep, [%x[in]]\n"
8768       "trn1 v5.4h, v0.4h, v2.4h\n"
8769       "trn2 v7.4h, v0.4h, v2.4h\n"
8770       "trn1 v6.4h, v1.4h, v3.4h\n"
8771       "trn2 v13.4h, v1.4h, v3.4h\n"
8772       "trn1 v0.8b, v5.8b, v6.8b\n"
8773       "trn2 v1.8b, v5.8b, v6.8b\n"
8774       "trn1 v2.8b, v7.8b, v13.8b\n"
8775       "trn2 v3.8b, v7.8b, v13.8b\n"
8776       "uaddw v8.8h, v8.8h, v0.8b\n"
8777       "uaddw v9.8h, v9.8h, v1.8b\n"
8778       "uaddw v10.8h, v10.8h, v2.8b\n"
8779       "uaddw v11.8h, v11.8h, v3.8b\n"
8780       "uaddw v12.8h, v12.8h, v4.8b\n"
8781       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
8782       "st1 {v4.2s}, [%x[out]], #8\n"
8783 
8784       "bne 1b\n"
8785 
8786       "2:"
8787 
8788       // Load Aggregate Store - column major 5x6
8789       "movi v0.8b, #0\n"
8790       "movi v1.8b, #0\n"
8791       "movi v2.8b, #0\n"
8792       "movi v3.8b, #0\n"
8793       "movi v4.8b, #0\n"
8794       "ld1 {v0.s}[0], [%x[in]], #4\n"
8795       "ld1 {v4.b}[0], [%x[in]], %x[stride]\n"
8796       "ld1 {v1.s}[0], [%x[in]], #4\n"
8797       "ld1 {v4.b}[1], [%x[in]], %x[stride]\n"
8798       "ld1 {v2.s}[0], [%x[in]], #4\n"
8799       "ld1 {v4.b}[2], [%x[in]], %x[stride]\n"
8800       "ld1 {v3.s}[0], [%x[in]], #4\n"
8801       "ld1 {v4.b}[3], [%x[in]], %x[stride]\n"
8802       "ld1 {v0.s}[1], [%x[in]], #4\n"
8803       "ld1 {v4.b}[4], [%x[in]], %x[stride]\n"
8804       "ld1 {v1.s}[1], [%x[in]], #4\n"
8805       "ld1 {v4.b}[5], [%x[in]], %x[stride]\n"
8806       "prfm pldl1keep, [%x[in]]\n"
8807       "trn1 v5.4h, v0.4h, v2.4h\n"
8808       "trn2 v7.4h, v0.4h, v2.4h\n"
8809       "trn1 v6.4h, v1.4h, v3.4h\n"
8810       "trn2 v13.4h, v1.4h, v3.4h\n"
8811       "trn1 v0.8b, v5.8b, v6.8b\n"
8812       "trn2 v1.8b, v5.8b, v6.8b\n"
8813       "trn1 v2.8b, v7.8b, v13.8b\n"
8814       "trn2 v3.8b, v7.8b, v13.8b\n"
8815       "uaddw v8.8h, v8.8h, v0.8b\n"
8816       "uaddw v9.8h, v9.8h, v1.8b\n"
8817       "uaddw v10.8h, v10.8h, v2.8b\n"
8818       "uaddw v11.8h, v11.8h, v3.8b\n"
8819       "uaddw v12.8h, v12.8h, v4.8b\n"
8820       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
8821       "st1 {v4.2s}, [%x[out]], #8\n"
8822 
8823       // Aggregator Reduction.
8824       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
8825       "dup v1.4s, %w[additive_sum_offset]\n"
8826       "uaddlp v8.4s, v8.8h\n"
8827       "uaddlp v9.4s, v9.8h\n"
8828       "uaddlp v10.4s, v10.8h\n"
8829       "uaddlp v11.4s, v11.8h\n"
8830       "uaddlp v12.4s, v12.8h\n"
8831       "addp v8.4s, v8.4s, v9.4s\n"
8832       "addp v10.4s, v10.4s, v11.4s\n"
8833       "addp v12.4s, v12.4s, v12.4s\n"
8834       "addp v8.4s, v8.4s, v10.4s\n"
8835       "addp v9.4s, v12.4s, v12.4s\n"
8836       "mul v8.4s, v8.4s, v0.s[0]\n"
8837       "mul v9.4s, v9.4s, v0.s[0]\n"
8838       "add v8.4s, v8.4s, v1.4s\n"
8839       "add v9.4s, v9.4s, v1.4s\n"
8840       "st1 {v8.4s, v9.4s}, [%x[out]]\n"
8841       : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
8842         [out] "+r"(out), [in] "+r"(in)
8843       : [additive_sum_offset] "r"(params.additive_sum_offset),
8844         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
8845       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
8846         "v11", "v12", "v13", "cc", "memory");
8847 }
8848 
8849 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)8850 inline void Stream<uint8_t, 5, 8, 7, ColumnMajorWithSum>::Pack(
8851     const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
8852 #ifdef DEBUG
8853 #ifdef DEBUG_METAGEMM_VERBOSE
8854   std::cout
8855       << __FILE__ << "(" << __LINE__
8856       << ") ColumnMajorWithSum<uint8_t, 5, 8, 7, ColumnMajorWithSum>::Pack()"
8857       << std::endl
8858       << std::flush;
8859 #endif
8860 #endif
8861   int params_count_copy = params.count;
8862   int params_stride_copy = params.stride;
8863   asm volatile(
8864       "sub %x[stride], %x[stride], #4\n"
8865       "movi v8.8h, #0\n"
8866       "movi v9.8h, #0\n"
8867       "movi v10.8h, #0\n"
8868       "movi v11.8h, #0\n"
8869       "movi v12.8h, #0\n"
8870 
8871       // Reduce count by leftovers.
8872       "subs %x[count], %x[count], #7\n"
8873       "beq 2f\n"
8874 
8875       "1:"
8876       "subs %x[count], %x[count], #8\n"
8877 
8878       // Load Aggregate Store - column major 5x8
8879       "ld1 {v0.s}[0], [%x[in]], #4\n"
8880       "ld1 {v4.b}[0], [%x[in]], %x[stride]\n"
8881       "ld1 {v1.s}[0], [%x[in]], #4\n"
8882       "ld1 {v4.b}[1], [%x[in]], %x[stride]\n"
8883       "ld1 {v2.s}[0], [%x[in]], #4\n"
8884       "ld1 {v4.b}[2], [%x[in]], %x[stride]\n"
8885       "ld1 {v3.s}[0], [%x[in]], #4\n"
8886       "ld1 {v4.b}[3], [%x[in]], %x[stride]\n"
8887       "ld1 {v0.s}[1], [%x[in]], #4\n"
8888       "ld1 {v4.b}[4], [%x[in]], %x[stride]\n"
8889       "ld1 {v1.s}[1], [%x[in]], #4\n"
8890       "ld1 {v4.b}[5], [%x[in]], %x[stride]\n"
8891       "ld1 {v2.s}[1], [%x[in]], #4\n"
8892       "ld1 {v4.b}[6], [%x[in]], %x[stride]\n"
8893       "ld1 {v3.s}[1], [%x[in]], #4\n"
8894       "ld1 {v4.b}[7], [%x[in]], %x[stride]\n"
8895       "prfm pldl1keep, [%x[in]]\n"
8896       "trn1 v5.4h, v0.4h, v2.4h\n"
8897       "trn2 v7.4h, v0.4h, v2.4h\n"
8898       "trn1 v6.4h, v1.4h, v3.4h\n"
8899       "trn2 v13.4h, v1.4h, v3.4h\n"
8900       "trn1 v0.8b, v5.8b, v6.8b\n"
8901       "trn2 v1.8b, v5.8b, v6.8b\n"
8902       "trn1 v2.8b, v7.8b, v13.8b\n"
8903       "trn2 v3.8b, v7.8b, v13.8b\n"
8904       "uaddw v8.8h, v8.8h, v0.8b\n"
8905       "uaddw v9.8h, v9.8h, v1.8b\n"
8906       "uaddw v10.8h, v10.8h, v2.8b\n"
8907       "uaddw v11.8h, v11.8h, v3.8b\n"
8908       "uaddw v12.8h, v12.8h, v4.8b\n"
8909       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
8910       "st1 {v4.2s}, [%x[out]], #8\n"
8911 
8912       "bne 1b\n"
8913 
8914       "2:"
8915 
8916       // Load Aggregate Store - column major 5x7
8917       "movi v0.8b, #0\n"
8918       "movi v1.8b, #0\n"
8919       "movi v2.8b, #0\n"
8920       "movi v3.8b, #0\n"
8921       "movi v4.8b, #0\n"
8922       "ld1 {v0.s}[0], [%x[in]], #4\n"
8923       "ld1 {v4.b}[0], [%x[in]], %x[stride]\n"
8924       "ld1 {v1.s}[0], [%x[in]], #4\n"
8925       "ld1 {v4.b}[1], [%x[in]], %x[stride]\n"
8926       "ld1 {v2.s}[0], [%x[in]], #4\n"
8927       "ld1 {v4.b}[2], [%x[in]], %x[stride]\n"
8928       "ld1 {v3.s}[0], [%x[in]], #4\n"
8929       "ld1 {v4.b}[3], [%x[in]], %x[stride]\n"
8930       "ld1 {v0.s}[1], [%x[in]], #4\n"
8931       "ld1 {v4.b}[4], [%x[in]], %x[stride]\n"
8932       "ld1 {v1.s}[1], [%x[in]], #4\n"
8933       "ld1 {v4.b}[5], [%x[in]], %x[stride]\n"
8934       "ld1 {v2.s}[1], [%x[in]], #4\n"
8935       "ld1 {v4.b}[6], [%x[in]], %x[stride]\n"
8936       "prfm pldl1keep, [%x[in]]\n"
8937       "trn1 v5.4h, v0.4h, v2.4h\n"
8938       "trn2 v7.4h, v0.4h, v2.4h\n"
8939       "trn1 v6.4h, v1.4h, v3.4h\n"
8940       "trn2 v13.4h, v1.4h, v3.4h\n"
8941       "trn1 v0.8b, v5.8b, v6.8b\n"
8942       "trn2 v1.8b, v5.8b, v6.8b\n"
8943       "trn1 v2.8b, v7.8b, v13.8b\n"
8944       "trn2 v3.8b, v7.8b, v13.8b\n"
8945       "uaddw v8.8h, v8.8h, v0.8b\n"
8946       "uaddw v9.8h, v9.8h, v1.8b\n"
8947       "uaddw v10.8h, v10.8h, v2.8b\n"
8948       "uaddw v11.8h, v11.8h, v3.8b\n"
8949       "uaddw v12.8h, v12.8h, v4.8b\n"
8950       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
8951       "st1 {v4.2s}, [%x[out]], #8\n"
8952 
8953       // Aggregator Reduction.
8954       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
8955       "dup v1.4s, %w[additive_sum_offset]\n"
8956       "uaddlp v8.4s, v8.8h\n"
8957       "uaddlp v9.4s, v9.8h\n"
8958       "uaddlp v10.4s, v10.8h\n"
8959       "uaddlp v11.4s, v11.8h\n"
8960       "uaddlp v12.4s, v12.8h\n"
8961       "addp v8.4s, v8.4s, v9.4s\n"
8962       "addp v10.4s, v10.4s, v11.4s\n"
8963       "addp v12.4s, v12.4s, v12.4s\n"
8964       "addp v8.4s, v8.4s, v10.4s\n"
8965       "addp v9.4s, v12.4s, v12.4s\n"
8966       "mul v8.4s, v8.4s, v0.s[0]\n"
8967       "mul v9.4s, v9.4s, v0.s[0]\n"
8968       "add v8.4s, v8.4s, v1.4s\n"
8969       "add v9.4s, v9.4s, v1.4s\n"
8970       "st1 {v8.4s, v9.4s}, [%x[out]]\n"
8971       : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
8972         [out] "+r"(out), [in] "+r"(in)
8973       : [additive_sum_offset] "r"(params.additive_sum_offset),
8974         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
8975       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
8976         "v11", "v12", "v13", "cc", "memory");
8977 }
8978 
8979 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)8980 inline void Stream<uint8_t, 6, 8, 0, ColumnMajorWithSum>::Pack(
8981     const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
8982 #ifdef DEBUG
8983 #ifdef DEBUG_METAGEMM_VERBOSE
8984   std::cout
8985       << __FILE__ << "(" << __LINE__
8986       << ") ColumnMajorWithSum<uint8_t, 6, 8, 0, ColumnMajorWithSum>::Pack()"
8987       << std::endl
8988       << std::flush;
8989 #endif
8990 #endif
8991   int params_count_copy = params.count;
8992   int params_stride_copy = params.stride;
8993   asm volatile(
8994       "sub %x[stride], %x[stride], #4\n"
8995       "movi v8.8h, #0\n"
8996       "movi v9.8h, #0\n"
8997       "movi v10.8h, #0\n"
8998       "movi v11.8h, #0\n"
8999       "movi v12.8h, #0\n"
9000       "movi v13.8h, #0\n"
9001 
9002       "1:"
9003       "subs %x[count], %x[count], #8\n"
9004 
9005       // Load Aggregate Store - column major 6x8
9006       "ld1 {v0.s}[0], [%x[in]], #4\n"
9007       "ld1 {v4.h}[0], [%x[in]], %x[stride]\n"
9008       "ld1 {v1.s}[0], [%x[in]], #4\n"
9009       "ld1 {v4.h}[1], [%x[in]], %x[stride]\n"
9010       "ld1 {v2.s}[0], [%x[in]], #4\n"
9011       "ld1 {v4.h}[2], [%x[in]], %x[stride]\n"
9012       "ld1 {v3.s}[0], [%x[in]], #4\n"
9013       "ld1 {v4.h}[3], [%x[in]], %x[stride]\n"
9014       "ld1 {v0.s}[1], [%x[in]], #4\n"
9015       "ld1 {v5.h}[0], [%x[in]], %x[stride]\n"
9016       "ld1 {v1.s}[1], [%x[in]], #4\n"
9017       "ld1 {v5.h}[1], [%x[in]], %x[stride]\n"
9018       "ld1 {v2.s}[1], [%x[in]], #4\n"
9019       "ld1 {v5.h}[2], [%x[in]], %x[stride]\n"
9020       "ld1 {v3.s}[1], [%x[in]], #4\n"
9021       "ld1 {v5.h}[3], [%x[in]], %x[stride]\n"
9022       "prfm pldl1keep, [%x[in]]\n"
9023       "trn1 v6.4h, v0.4h, v2.4h\n"
9024       "trn2 v14.4h, v0.4h, v2.4h\n"
9025       "trn1 v7.4h, v1.4h, v3.4h\n"
9026       "trn2 v15.4h, v1.4h, v3.4h\n"
9027       "uzp1 v16.8b, v4.8b, v5.8b\n"
9028       "uzp2 v17.8b, v4.8b, v5.8b\n"
9029       "trn1 v0.8b, v6.8b, v7.8b\n"
9030       "trn2 v1.8b, v6.8b, v7.8b\n"
9031       "trn1 v2.8b, v14.8b, v15.8b\n"
9032       "trn2 v3.8b, v14.8b, v15.8b\n"
9033       "uaddw v8.8h, v8.8h, v0.8b\n"
9034       "uaddw v9.8h, v9.8h, v1.8b\n"
9035       "uaddw v10.8h, v10.8h, v2.8b\n"
9036       "uaddw v11.8h, v11.8h, v3.8b\n"
9037       "uaddw v12.8h, v12.8h, v16.8b\n"
9038       "uaddw v13.8h, v13.8h, v17.8b\n"
9039       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
9040       "st1 {v16.2s, v17.2s}, [%x[out]], #16\n"
9041 
9042       "bne 1b\n"
9043 
9044       // Aggregator Reduction.
9045       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
9046       "dup v1.4s, %w[additive_sum_offset]\n"
9047       "uaddlp v8.4s, v8.8h\n"
9048       "uaddlp v9.4s, v9.8h\n"
9049       "uaddlp v10.4s, v10.8h\n"
9050       "uaddlp v11.4s, v11.8h\n"
9051       "uaddlp v12.4s, v12.8h\n"
9052       "uaddlp v13.4s, v13.8h\n"
9053       "addp v8.4s, v8.4s, v9.4s\n"
9054       "addp v10.4s, v10.4s, v11.4s\n"
9055       "addp v12.4s, v12.4s, v13.4s\n"
9056       "addp v8.4s, v8.4s, v10.4s\n"
9057       "addp v9.4s, v12.4s, v12.4s\n"
9058       "mul v8.4s, v8.4s, v0.s[0]\n"
9059       "mul v9.4s, v9.4s, v0.s[0]\n"
9060       "add v8.4s, v8.4s, v1.4s\n"
9061       "add v9.4s, v9.4s, v1.4s\n"
9062       "st1 {v8.4s, v9.4s}, [%x[out]]\n"
9063       : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
9064         [out] "+r"(out), [in] "+r"(in)
9065       : [additive_sum_offset] "r"(params.additive_sum_offset),
9066         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
9067       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
9068         "v11", "v12", "v13", "v14", "v15", "v16", "v17", "cc", "memory");
9069 }
9070 
9071 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)9072 inline void Stream<uint8_t, 6, 8, 1, ColumnMajorWithSum>::Pack(
9073     const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
9074 #ifdef DEBUG
9075 #ifdef DEBUG_METAGEMM_VERBOSE
9076   std::cout
9077       << __FILE__ << "(" << __LINE__
9078       << ") ColumnMajorWithSum<uint8_t, 6, 8, 1, ColumnMajorWithSum>::Pack()"
9079       << std::endl
9080       << std::flush;
9081 #endif
9082 #endif
9083   int params_count_copy = params.count;
9084   int params_stride_copy = params.stride;
9085   asm volatile(
9086       "sub %x[stride], %x[stride], #4\n"
9087       "movi v8.8h, #0\n"
9088       "movi v9.8h, #0\n"
9089       "movi v10.8h, #0\n"
9090       "movi v11.8h, #0\n"
9091       "movi v12.8h, #0\n"
9092       "movi v13.8h, #0\n"
9093 
9094       // Reduce count by leftovers.
9095       "subs %x[count], %x[count], #1\n"
9096       "beq 2f\n"
9097 
9098       "1:"
9099       "subs %x[count], %x[count], #8\n"
9100 
9101       // Load Aggregate Store - column major 6x8
9102       "ld1 {v0.s}[0], [%x[in]], #4\n"
9103       "ld1 {v4.h}[0], [%x[in]], %x[stride]\n"
9104       "ld1 {v1.s}[0], [%x[in]], #4\n"
9105       "ld1 {v4.h}[1], [%x[in]], %x[stride]\n"
9106       "ld1 {v2.s}[0], [%x[in]], #4\n"
9107       "ld1 {v4.h}[2], [%x[in]], %x[stride]\n"
9108       "ld1 {v3.s}[0], [%x[in]], #4\n"
9109       "ld1 {v4.h}[3], [%x[in]], %x[stride]\n"
9110       "ld1 {v0.s}[1], [%x[in]], #4\n"
9111       "ld1 {v5.h}[0], [%x[in]], %x[stride]\n"
9112       "ld1 {v1.s}[1], [%x[in]], #4\n"
9113       "ld1 {v5.h}[1], [%x[in]], %x[stride]\n"
9114       "ld1 {v2.s}[1], [%x[in]], #4\n"
9115       "ld1 {v5.h}[2], [%x[in]], %x[stride]\n"
9116       "ld1 {v3.s}[1], [%x[in]], #4\n"
9117       "ld1 {v5.h}[3], [%x[in]], %x[stride]\n"
9118       "prfm pldl1keep, [%x[in]]\n"
9119       "trn1 v6.4h, v0.4h, v2.4h\n"
9120       "trn2 v14.4h, v0.4h, v2.4h\n"
9121       "trn1 v7.4h, v1.4h, v3.4h\n"
9122       "trn2 v15.4h, v1.4h, v3.4h\n"
9123       "uzp1 v16.8b, v4.8b, v5.8b\n"
9124       "uzp2 v17.8b, v4.8b, v5.8b\n"
9125       "trn1 v0.8b, v6.8b, v7.8b\n"
9126       "trn2 v1.8b, v6.8b, v7.8b\n"
9127       "trn1 v2.8b, v14.8b, v15.8b\n"
9128       "trn2 v3.8b, v14.8b, v15.8b\n"
9129       "uaddw v8.8h, v8.8h, v0.8b\n"
9130       "uaddw v9.8h, v9.8h, v1.8b\n"
9131       "uaddw v10.8h, v10.8h, v2.8b\n"
9132       "uaddw v11.8h, v11.8h, v3.8b\n"
9133       "uaddw v12.8h, v12.8h, v16.8b\n"
9134       "uaddw v13.8h, v13.8h, v17.8b\n"
9135       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
9136       "st1 {v16.2s, v17.2s}, [%x[out]], #16\n"
9137 
9138       "bne 1b\n"
9139 
9140       "2:"
9141 
9142       // Load Aggregate Store - column major 6x1
9143       "movi v0.8b, #0\n"
9144       "movi v1.8b, #0\n"
9145       "movi v2.8b, #0\n"
9146       "movi v3.8b, #0\n"
9147       "movi v4.8b, #0\n"
9148       "movi v5.8b, #0\n"
9149       "ld1 {v0.s}[0], [%x[in]], #4\n"
9150       "ld1 {v4.h}[0], [%x[in]], %x[stride]\n"
9151       "prfm pldl1keep, [%x[in]]\n"
9152       "trn1 v6.4h, v0.4h, v2.4h\n"
9153       "trn2 v14.4h, v0.4h, v2.4h\n"
9154       "trn1 v7.4h, v1.4h, v3.4h\n"
9155       "trn2 v15.4h, v1.4h, v3.4h\n"
9156       "uzp1 v16.8b, v4.8b, v5.8b\n"
9157       "uzp2 v17.8b, v4.8b, v5.8b\n"
9158       "trn1 v0.8b, v6.8b, v7.8b\n"
9159       "trn2 v1.8b, v6.8b, v7.8b\n"
9160       "trn1 v2.8b, v14.8b, v15.8b\n"
9161       "trn2 v3.8b, v14.8b, v15.8b\n"
9162       "uaddw v8.8h, v8.8h, v0.8b\n"
9163       "uaddw v9.8h, v9.8h, v1.8b\n"
9164       "uaddw v10.8h, v10.8h, v2.8b\n"
9165       "uaddw v11.8h, v11.8h, v3.8b\n"
9166       "uaddw v12.8h, v12.8h, v16.8b\n"
9167       "uaddw v13.8h, v13.8h, v17.8b\n"
9168       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
9169       "st1 {v16.2s, v17.2s}, [%x[out]], #16\n"
9170 
9171       // Aggregator Reduction.
9172       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
9173       "dup v1.4s, %w[additive_sum_offset]\n"
9174       "uaddlp v8.4s, v8.8h\n"
9175       "uaddlp v9.4s, v9.8h\n"
9176       "uaddlp v10.4s, v10.8h\n"
9177       "uaddlp v11.4s, v11.8h\n"
9178       "uaddlp v12.4s, v12.8h\n"
9179       "uaddlp v13.4s, v13.8h\n"
9180       "addp v8.4s, v8.4s, v9.4s\n"
9181       "addp v10.4s, v10.4s, v11.4s\n"
9182       "addp v12.4s, v12.4s, v13.4s\n"
9183       "addp v8.4s, v8.4s, v10.4s\n"
9184       "addp v9.4s, v12.4s, v12.4s\n"
9185       "mul v8.4s, v8.4s, v0.s[0]\n"
9186       "mul v9.4s, v9.4s, v0.s[0]\n"
9187       "add v8.4s, v8.4s, v1.4s\n"
9188       "add v9.4s, v9.4s, v1.4s\n"
9189       "st1 {v8.4s, v9.4s}, [%x[out]]\n"
9190       : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
9191         [out] "+r"(out), [in] "+r"(in)
9192       : [additive_sum_offset] "r"(params.additive_sum_offset),
9193         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
9194       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
9195         "v11", "v12", "v13", "v14", "v15", "v16", "v17", "cc", "memory");
9196 }
9197 
9198 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)9199 inline void Stream<uint8_t, 6, 8, 2, ColumnMajorWithSum>::Pack(
9200     const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
9201 #ifdef DEBUG
9202 #ifdef DEBUG_METAGEMM_VERBOSE
9203   std::cout
9204       << __FILE__ << "(" << __LINE__
9205       << ") ColumnMajorWithSum<uint8_t, 6, 8, 2, ColumnMajorWithSum>::Pack()"
9206       << std::endl
9207       << std::flush;
9208 #endif
9209 #endif
9210   int params_count_copy = params.count;
9211   int params_stride_copy = params.stride;
9212   asm volatile(
9213       "sub %x[stride], %x[stride], #4\n"
9214       "movi v8.8h, #0\n"
9215       "movi v9.8h, #0\n"
9216       "movi v10.8h, #0\n"
9217       "movi v11.8h, #0\n"
9218       "movi v12.8h, #0\n"
9219       "movi v13.8h, #0\n"
9220 
9221       // Reduce count by leftovers.
9222       "subs %x[count], %x[count], #2\n"
9223       "beq 2f\n"
9224 
9225       "1:"
9226       "subs %x[count], %x[count], #8\n"
9227 
9228       // Load Aggregate Store - column major 6x8
9229       "ld1 {v0.s}[0], [%x[in]], #4\n"
9230       "ld1 {v4.h}[0], [%x[in]], %x[stride]\n"
9231       "ld1 {v1.s}[0], [%x[in]], #4\n"
9232       "ld1 {v4.h}[1], [%x[in]], %x[stride]\n"
9233       "ld1 {v2.s}[0], [%x[in]], #4\n"
9234       "ld1 {v4.h}[2], [%x[in]], %x[stride]\n"
9235       "ld1 {v3.s}[0], [%x[in]], #4\n"
9236       "ld1 {v4.h}[3], [%x[in]], %x[stride]\n"
9237       "ld1 {v0.s}[1], [%x[in]], #4\n"
9238       "ld1 {v5.h}[0], [%x[in]], %x[stride]\n"
9239       "ld1 {v1.s}[1], [%x[in]], #4\n"
9240       "ld1 {v5.h}[1], [%x[in]], %x[stride]\n"
9241       "ld1 {v2.s}[1], [%x[in]], #4\n"
9242       "ld1 {v5.h}[2], [%x[in]], %x[stride]\n"
9243       "ld1 {v3.s}[1], [%x[in]], #4\n"
9244       "ld1 {v5.h}[3], [%x[in]], %x[stride]\n"
9245       "prfm pldl1keep, [%x[in]]\n"
9246       "trn1 v6.4h, v0.4h, v2.4h\n"
9247       "trn2 v14.4h, v0.4h, v2.4h\n"
9248       "trn1 v7.4h, v1.4h, v3.4h\n"
9249       "trn2 v15.4h, v1.4h, v3.4h\n"
9250       "uzp1 v16.8b, v4.8b, v5.8b\n"
9251       "uzp2 v17.8b, v4.8b, v5.8b\n"
9252       "trn1 v0.8b, v6.8b, v7.8b\n"
9253       "trn2 v1.8b, v6.8b, v7.8b\n"
9254       "trn1 v2.8b, v14.8b, v15.8b\n"
9255       "trn2 v3.8b, v14.8b, v15.8b\n"
9256       "uaddw v8.8h, v8.8h, v0.8b\n"
9257       "uaddw v9.8h, v9.8h, v1.8b\n"
9258       "uaddw v10.8h, v10.8h, v2.8b\n"
9259       "uaddw v11.8h, v11.8h, v3.8b\n"
9260       "uaddw v12.8h, v12.8h, v16.8b\n"
9261       "uaddw v13.8h, v13.8h, v17.8b\n"
9262       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
9263       "st1 {v16.2s, v17.2s}, [%x[out]], #16\n"
9264 
9265       "bne 1b\n"
9266 
9267       "2:"
9268 
9269       // Load Aggregate Store - column major 6x2
9270       "movi v0.8b, #0\n"
9271       "movi v1.8b, #0\n"
9272       "movi v2.8b, #0\n"
9273       "movi v3.8b, #0\n"
9274       "movi v4.8b, #0\n"
9275       "movi v5.8b, #0\n"
9276       "ld1 {v0.s}[0], [%x[in]], #4\n"
9277       "ld1 {v4.h}[0], [%x[in]], %x[stride]\n"
9278       "ld1 {v1.s}[0], [%x[in]], #4\n"
9279       "ld1 {v4.h}[1], [%x[in]], %x[stride]\n"
9280       "prfm pldl1keep, [%x[in]]\n"
9281       "trn1 v6.4h, v0.4h, v2.4h\n"
9282       "trn2 v14.4h, v0.4h, v2.4h\n"
9283       "trn1 v7.4h, v1.4h, v3.4h\n"
9284       "trn2 v15.4h, v1.4h, v3.4h\n"
9285       "uzp1 v16.8b, v4.8b, v5.8b\n"
9286       "uzp2 v17.8b, v4.8b, v5.8b\n"
9287       "trn1 v0.8b, v6.8b, v7.8b\n"
9288       "trn2 v1.8b, v6.8b, v7.8b\n"
9289       "trn1 v2.8b, v14.8b, v15.8b\n"
9290       "trn2 v3.8b, v14.8b, v15.8b\n"
9291       "uaddw v8.8h, v8.8h, v0.8b\n"
9292       "uaddw v9.8h, v9.8h, v1.8b\n"
9293       "uaddw v10.8h, v10.8h, v2.8b\n"
9294       "uaddw v11.8h, v11.8h, v3.8b\n"
9295       "uaddw v12.8h, v12.8h, v16.8b\n"
9296       "uaddw v13.8h, v13.8h, v17.8b\n"
9297       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
9298       "st1 {v16.2s, v17.2s}, [%x[out]], #16\n"
9299 
9300       // Aggregator Reduction.
9301       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
9302       "dup v1.4s, %w[additive_sum_offset]\n"
9303       "uaddlp v8.4s, v8.8h\n"
9304       "uaddlp v9.4s, v9.8h\n"
9305       "uaddlp v10.4s, v10.8h\n"
9306       "uaddlp v11.4s, v11.8h\n"
9307       "uaddlp v12.4s, v12.8h\n"
9308       "uaddlp v13.4s, v13.8h\n"
9309       "addp v8.4s, v8.4s, v9.4s\n"
9310       "addp v10.4s, v10.4s, v11.4s\n"
9311       "addp v12.4s, v12.4s, v13.4s\n"
9312       "addp v8.4s, v8.4s, v10.4s\n"
9313       "addp v9.4s, v12.4s, v12.4s\n"
9314       "mul v8.4s, v8.4s, v0.s[0]\n"
9315       "mul v9.4s, v9.4s, v0.s[0]\n"
9316       "add v8.4s, v8.4s, v1.4s\n"
9317       "add v9.4s, v9.4s, v1.4s\n"
9318       "st1 {v8.4s, v9.4s}, [%x[out]]\n"
9319       : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
9320         [out] "+r"(out), [in] "+r"(in)
9321       : [additive_sum_offset] "r"(params.additive_sum_offset),
9322         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
9323       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
9324         "v11", "v12", "v13", "v14", "v15", "v16", "v17", "cc", "memory");
9325 }
9326 
9327 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)9328 inline void Stream<uint8_t, 6, 8, 3, ColumnMajorWithSum>::Pack(
9329     const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
9330 #ifdef DEBUG
9331 #ifdef DEBUG_METAGEMM_VERBOSE
9332   std::cout
9333       << __FILE__ << "(" << __LINE__
9334       << ") ColumnMajorWithSum<uint8_t, 6, 8, 3, ColumnMajorWithSum>::Pack()"
9335       << std::endl
9336       << std::flush;
9337 #endif
9338 #endif
9339   int params_count_copy = params.count;
9340   int params_stride_copy = params.stride;
9341   asm volatile(
9342       "sub %x[stride], %x[stride], #4\n"
9343       "movi v8.8h, #0\n"
9344       "movi v9.8h, #0\n"
9345       "movi v10.8h, #0\n"
9346       "movi v11.8h, #0\n"
9347       "movi v12.8h, #0\n"
9348       "movi v13.8h, #0\n"
9349 
9350       // Reduce count by leftovers.
9351       "subs %x[count], %x[count], #3\n"
9352       "beq 2f\n"
9353 
9354       "1:"
9355       "subs %x[count], %x[count], #8\n"
9356 
9357       // Load Aggregate Store - column major 6x8
9358       "ld1 {v0.s}[0], [%x[in]], #4\n"
9359       "ld1 {v4.h}[0], [%x[in]], %x[stride]\n"
9360       "ld1 {v1.s}[0], [%x[in]], #4\n"
9361       "ld1 {v4.h}[1], [%x[in]], %x[stride]\n"
9362       "ld1 {v2.s}[0], [%x[in]], #4\n"
9363       "ld1 {v4.h}[2], [%x[in]], %x[stride]\n"
9364       "ld1 {v3.s}[0], [%x[in]], #4\n"
9365       "ld1 {v4.h}[3], [%x[in]], %x[stride]\n"
9366       "ld1 {v0.s}[1], [%x[in]], #4\n"
9367       "ld1 {v5.h}[0], [%x[in]], %x[stride]\n"
9368       "ld1 {v1.s}[1], [%x[in]], #4\n"
9369       "ld1 {v5.h}[1], [%x[in]], %x[stride]\n"
9370       "ld1 {v2.s}[1], [%x[in]], #4\n"
9371       "ld1 {v5.h}[2], [%x[in]], %x[stride]\n"
9372       "ld1 {v3.s}[1], [%x[in]], #4\n"
9373       "ld1 {v5.h}[3], [%x[in]], %x[stride]\n"
9374       "prfm pldl1keep, [%x[in]]\n"
9375       "trn1 v6.4h, v0.4h, v2.4h\n"
9376       "trn2 v14.4h, v0.4h, v2.4h\n"
9377       "trn1 v7.4h, v1.4h, v3.4h\n"
9378       "trn2 v15.4h, v1.4h, v3.4h\n"
9379       "uzp1 v16.8b, v4.8b, v5.8b\n"
9380       "uzp2 v17.8b, v4.8b, v5.8b\n"
9381       "trn1 v0.8b, v6.8b, v7.8b\n"
9382       "trn2 v1.8b, v6.8b, v7.8b\n"
9383       "trn1 v2.8b, v14.8b, v15.8b\n"
9384       "trn2 v3.8b, v14.8b, v15.8b\n"
9385       "uaddw v8.8h, v8.8h, v0.8b\n"
9386       "uaddw v9.8h, v9.8h, v1.8b\n"
9387       "uaddw v10.8h, v10.8h, v2.8b\n"
9388       "uaddw v11.8h, v11.8h, v3.8b\n"
9389       "uaddw v12.8h, v12.8h, v16.8b\n"
9390       "uaddw v13.8h, v13.8h, v17.8b\n"
9391       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
9392       "st1 {v16.2s, v17.2s}, [%x[out]], #16\n"
9393 
9394       "bne 1b\n"
9395 
9396       "2:"
9397 
9398       // Load Aggregate Store - column major 6x3
9399       "movi v0.8b, #0\n"
9400       "movi v1.8b, #0\n"
9401       "movi v2.8b, #0\n"
9402       "movi v3.8b, #0\n"
9403       "movi v4.8b, #0\n"
9404       "movi v5.8b, #0\n"
9405       "ld1 {v0.s}[0], [%x[in]], #4\n"
9406       "ld1 {v4.h}[0], [%x[in]], %x[stride]\n"
9407       "ld1 {v1.s}[0], [%x[in]], #4\n"
9408       "ld1 {v4.h}[1], [%x[in]], %x[stride]\n"
9409       "ld1 {v2.s}[0], [%x[in]], #4\n"
9410       "ld1 {v4.h}[2], [%x[in]], %x[stride]\n"
9411       "prfm pldl1keep, [%x[in]]\n"
9412       "trn1 v6.4h, v0.4h, v2.4h\n"
9413       "trn2 v14.4h, v0.4h, v2.4h\n"
9414       "trn1 v7.4h, v1.4h, v3.4h\n"
9415       "trn2 v15.4h, v1.4h, v3.4h\n"
9416       "uzp1 v16.8b, v4.8b, v5.8b\n"
9417       "uzp2 v17.8b, v4.8b, v5.8b\n"
9418       "trn1 v0.8b, v6.8b, v7.8b\n"
9419       "trn2 v1.8b, v6.8b, v7.8b\n"
9420       "trn1 v2.8b, v14.8b, v15.8b\n"
9421       "trn2 v3.8b, v14.8b, v15.8b\n"
9422       "uaddw v8.8h, v8.8h, v0.8b\n"
9423       "uaddw v9.8h, v9.8h, v1.8b\n"
9424       "uaddw v10.8h, v10.8h, v2.8b\n"
9425       "uaddw v11.8h, v11.8h, v3.8b\n"
9426       "uaddw v12.8h, v12.8h, v16.8b\n"
9427       "uaddw v13.8h, v13.8h, v17.8b\n"
9428       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
9429       "st1 {v16.2s, v17.2s}, [%x[out]], #16\n"
9430 
9431       // Aggregator Reduction.
9432       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
9433       "dup v1.4s, %w[additive_sum_offset]\n"
9434       "uaddlp v8.4s, v8.8h\n"
9435       "uaddlp v9.4s, v9.8h\n"
9436       "uaddlp v10.4s, v10.8h\n"
9437       "uaddlp v11.4s, v11.8h\n"
9438       "uaddlp v12.4s, v12.8h\n"
9439       "uaddlp v13.4s, v13.8h\n"
9440       "addp v8.4s, v8.4s, v9.4s\n"
9441       "addp v10.4s, v10.4s, v11.4s\n"
9442       "addp v12.4s, v12.4s, v13.4s\n"
9443       "addp v8.4s, v8.4s, v10.4s\n"
9444       "addp v9.4s, v12.4s, v12.4s\n"
9445       "mul v8.4s, v8.4s, v0.s[0]\n"
9446       "mul v9.4s, v9.4s, v0.s[0]\n"
9447       "add v8.4s, v8.4s, v1.4s\n"
9448       "add v9.4s, v9.4s, v1.4s\n"
9449       "st1 {v8.4s, v9.4s}, [%x[out]]\n"
9450       : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
9451         [out] "+r"(out), [in] "+r"(in)
9452       : [additive_sum_offset] "r"(params.additive_sum_offset),
9453         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
9454       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
9455         "v11", "v12", "v13", "v14", "v15", "v16", "v17", "cc", "memory");
9456 }
9457 
9458 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)9459 inline void Stream<uint8_t, 6, 8, 4, ColumnMajorWithSum>::Pack(
9460     const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
9461 #ifdef DEBUG
9462 #ifdef DEBUG_METAGEMM_VERBOSE
9463   std::cout
9464       << __FILE__ << "(" << __LINE__
9465       << ") ColumnMajorWithSum<uint8_t, 6, 8, 4, ColumnMajorWithSum>::Pack()"
9466       << std::endl
9467       << std::flush;
9468 #endif
9469 #endif
9470   int params_count_copy = params.count;
9471   int params_stride_copy = params.stride;
9472   asm volatile(
9473       "sub %x[stride], %x[stride], #4\n"
9474       "movi v8.8h, #0\n"
9475       "movi v9.8h, #0\n"
9476       "movi v10.8h, #0\n"
9477       "movi v11.8h, #0\n"
9478       "movi v12.8h, #0\n"
9479       "movi v13.8h, #0\n"
9480 
9481       // Reduce count by leftovers.
9482       "subs %x[count], %x[count], #4\n"
9483       "beq 2f\n"
9484 
9485       "1:"
9486       "subs %x[count], %x[count], #8\n"
9487 
9488       // Load Aggregate Store - column major 6x8
9489       "ld1 {v0.s}[0], [%x[in]], #4\n"
9490       "ld1 {v4.h}[0], [%x[in]], %x[stride]\n"
9491       "ld1 {v1.s}[0], [%x[in]], #4\n"
9492       "ld1 {v4.h}[1], [%x[in]], %x[stride]\n"
9493       "ld1 {v2.s}[0], [%x[in]], #4\n"
9494       "ld1 {v4.h}[2], [%x[in]], %x[stride]\n"
9495       "ld1 {v3.s}[0], [%x[in]], #4\n"
9496       "ld1 {v4.h}[3], [%x[in]], %x[stride]\n"
9497       "ld1 {v0.s}[1], [%x[in]], #4\n"
9498       "ld1 {v5.h}[0], [%x[in]], %x[stride]\n"
9499       "ld1 {v1.s}[1], [%x[in]], #4\n"
9500       "ld1 {v5.h}[1], [%x[in]], %x[stride]\n"
9501       "ld1 {v2.s}[1], [%x[in]], #4\n"
9502       "ld1 {v5.h}[2], [%x[in]], %x[stride]\n"
9503       "ld1 {v3.s}[1], [%x[in]], #4\n"
9504       "ld1 {v5.h}[3], [%x[in]], %x[stride]\n"
9505       "prfm pldl1keep, [%x[in]]\n"
9506       "trn1 v6.4h, v0.4h, v2.4h\n"
9507       "trn2 v14.4h, v0.4h, v2.4h\n"
9508       "trn1 v7.4h, v1.4h, v3.4h\n"
9509       "trn2 v15.4h, v1.4h, v3.4h\n"
9510       "uzp1 v16.8b, v4.8b, v5.8b\n"
9511       "uzp2 v17.8b, v4.8b, v5.8b\n"
9512       "trn1 v0.8b, v6.8b, v7.8b\n"
9513       "trn2 v1.8b, v6.8b, v7.8b\n"
9514       "trn1 v2.8b, v14.8b, v15.8b\n"
9515       "trn2 v3.8b, v14.8b, v15.8b\n"
9516       "uaddw v8.8h, v8.8h, v0.8b\n"
9517       "uaddw v9.8h, v9.8h, v1.8b\n"
9518       "uaddw v10.8h, v10.8h, v2.8b\n"
9519       "uaddw v11.8h, v11.8h, v3.8b\n"
9520       "uaddw v12.8h, v12.8h, v16.8b\n"
9521       "uaddw v13.8h, v13.8h, v17.8b\n"
9522       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
9523       "st1 {v16.2s, v17.2s}, [%x[out]], #16\n"
9524 
9525       "bne 1b\n"
9526 
9527       "2:"
9528 
9529       // Load Aggregate Store - column major 6x4
9530       "movi v0.8b, #0\n"
9531       "movi v1.8b, #0\n"
9532       "movi v2.8b, #0\n"
9533       "movi v3.8b, #0\n"
9534       "movi v4.8b, #0\n"
9535       "movi v5.8b, #0\n"
9536       "ld1 {v0.s}[0], [%x[in]], #4\n"
9537       "ld1 {v4.h}[0], [%x[in]], %x[stride]\n"
9538       "ld1 {v1.s}[0], [%x[in]], #4\n"
9539       "ld1 {v4.h}[1], [%x[in]], %x[stride]\n"
9540       "ld1 {v2.s}[0], [%x[in]], #4\n"
9541       "ld1 {v4.h}[2], [%x[in]], %x[stride]\n"
9542       "ld1 {v3.s}[0], [%x[in]], #4\n"
9543       "ld1 {v4.h}[3], [%x[in]], %x[stride]\n"
9544       "prfm pldl1keep, [%x[in]]\n"
9545       "trn1 v6.4h, v0.4h, v2.4h\n"
9546       "trn2 v14.4h, v0.4h, v2.4h\n"
9547       "trn1 v7.4h, v1.4h, v3.4h\n"
9548       "trn2 v15.4h, v1.4h, v3.4h\n"
9549       "uzp1 v16.8b, v4.8b, v5.8b\n"
9550       "uzp2 v17.8b, v4.8b, v5.8b\n"
9551       "trn1 v0.8b, v6.8b, v7.8b\n"
9552       "trn2 v1.8b, v6.8b, v7.8b\n"
9553       "trn1 v2.8b, v14.8b, v15.8b\n"
9554       "trn2 v3.8b, v14.8b, v15.8b\n"
9555       "uaddw v8.8h, v8.8h, v0.8b\n"
9556       "uaddw v9.8h, v9.8h, v1.8b\n"
9557       "uaddw v10.8h, v10.8h, v2.8b\n"
9558       "uaddw v11.8h, v11.8h, v3.8b\n"
9559       "uaddw v12.8h, v12.8h, v16.8b\n"
9560       "uaddw v13.8h, v13.8h, v17.8b\n"
9561       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
9562       "st1 {v16.2s, v17.2s}, [%x[out]], #16\n"
9563 
9564       // Aggregator Reduction.
9565       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
9566       "dup v1.4s, %w[additive_sum_offset]\n"
9567       "uaddlp v8.4s, v8.8h\n"
9568       "uaddlp v9.4s, v9.8h\n"
9569       "uaddlp v10.4s, v10.8h\n"
9570       "uaddlp v11.4s, v11.8h\n"
9571       "uaddlp v12.4s, v12.8h\n"
9572       "uaddlp v13.4s, v13.8h\n"
9573       "addp v8.4s, v8.4s, v9.4s\n"
9574       "addp v10.4s, v10.4s, v11.4s\n"
9575       "addp v12.4s, v12.4s, v13.4s\n"
9576       "addp v8.4s, v8.4s, v10.4s\n"
9577       "addp v9.4s, v12.4s, v12.4s\n"
9578       "mul v8.4s, v8.4s, v0.s[0]\n"
9579       "mul v9.4s, v9.4s, v0.s[0]\n"
9580       "add v8.4s, v8.4s, v1.4s\n"
9581       "add v9.4s, v9.4s, v1.4s\n"
9582       "st1 {v8.4s, v9.4s}, [%x[out]]\n"
9583       : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
9584         [out] "+r"(out), [in] "+r"(in)
9585       : [additive_sum_offset] "r"(params.additive_sum_offset),
9586         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
9587       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
9588         "v11", "v12", "v13", "v14", "v15", "v16", "v17", "cc", "memory");
9589 }
9590 
9591 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)9592 inline void Stream<uint8_t, 6, 8, 5, ColumnMajorWithSum>::Pack(
9593     const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
9594 #ifdef DEBUG
9595 #ifdef DEBUG_METAGEMM_VERBOSE
9596   std::cout
9597       << __FILE__ << "(" << __LINE__
9598       << ") ColumnMajorWithSum<uint8_t, 6, 8, 5, ColumnMajorWithSum>::Pack()"
9599       << std::endl
9600       << std::flush;
9601 #endif
9602 #endif
9603   int params_count_copy = params.count;
9604   int params_stride_copy = params.stride;
9605   asm volatile(
9606       "sub %x[stride], %x[stride], #4\n"
9607       "movi v8.8h, #0\n"
9608       "movi v9.8h, #0\n"
9609       "movi v10.8h, #0\n"
9610       "movi v11.8h, #0\n"
9611       "movi v12.8h, #0\n"
9612       "movi v13.8h, #0\n"
9613 
9614       // Reduce count by leftovers.
9615       "subs %x[count], %x[count], #5\n"
9616       "beq 2f\n"
9617 
9618       "1:"
9619       "subs %x[count], %x[count], #8\n"
9620 
9621       // Load Aggregate Store - column major 6x8
9622       "ld1 {v0.s}[0], [%x[in]], #4\n"
9623       "ld1 {v4.h}[0], [%x[in]], %x[stride]\n"
9624       "ld1 {v1.s}[0], [%x[in]], #4\n"
9625       "ld1 {v4.h}[1], [%x[in]], %x[stride]\n"
9626       "ld1 {v2.s}[0], [%x[in]], #4\n"
9627       "ld1 {v4.h}[2], [%x[in]], %x[stride]\n"
9628       "ld1 {v3.s}[0], [%x[in]], #4\n"
9629       "ld1 {v4.h}[3], [%x[in]], %x[stride]\n"
9630       "ld1 {v0.s}[1], [%x[in]], #4\n"
9631       "ld1 {v5.h}[0], [%x[in]], %x[stride]\n"
9632       "ld1 {v1.s}[1], [%x[in]], #4\n"
9633       "ld1 {v5.h}[1], [%x[in]], %x[stride]\n"
9634       "ld1 {v2.s}[1], [%x[in]], #4\n"
9635       "ld1 {v5.h}[2], [%x[in]], %x[stride]\n"
9636       "ld1 {v3.s}[1], [%x[in]], #4\n"
9637       "ld1 {v5.h}[3], [%x[in]], %x[stride]\n"
9638       "prfm pldl1keep, [%x[in]]\n"
9639       "trn1 v6.4h, v0.4h, v2.4h\n"
9640       "trn2 v14.4h, v0.4h, v2.4h\n"
9641       "trn1 v7.4h, v1.4h, v3.4h\n"
9642       "trn2 v15.4h, v1.4h, v3.4h\n"
9643       "uzp1 v16.8b, v4.8b, v5.8b\n"
9644       "uzp2 v17.8b, v4.8b, v5.8b\n"
9645       "trn1 v0.8b, v6.8b, v7.8b\n"
9646       "trn2 v1.8b, v6.8b, v7.8b\n"
9647       "trn1 v2.8b, v14.8b, v15.8b\n"
9648       "trn2 v3.8b, v14.8b, v15.8b\n"
9649       "uaddw v8.8h, v8.8h, v0.8b\n"
9650       "uaddw v9.8h, v9.8h, v1.8b\n"
9651       "uaddw v10.8h, v10.8h, v2.8b\n"
9652       "uaddw v11.8h, v11.8h, v3.8b\n"
9653       "uaddw v12.8h, v12.8h, v16.8b\n"
9654       "uaddw v13.8h, v13.8h, v17.8b\n"
9655       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
9656       "st1 {v16.2s, v17.2s}, [%x[out]], #16\n"
9657 
9658       "bne 1b\n"
9659 
9660       "2:"
9661 
9662       // Load Aggregate Store - column major 6x5
9663       "movi v0.8b, #0\n"
9664       "movi v1.8b, #0\n"
9665       "movi v2.8b, #0\n"
9666       "movi v3.8b, #0\n"
9667       "movi v4.8b, #0\n"
9668       "movi v5.8b, #0\n"
9669       "ld1 {v0.s}[0], [%x[in]], #4\n"
9670       "ld1 {v4.h}[0], [%x[in]], %x[stride]\n"
9671       "ld1 {v1.s}[0], [%x[in]], #4\n"
9672       "ld1 {v4.h}[1], [%x[in]], %x[stride]\n"
9673       "ld1 {v2.s}[0], [%x[in]], #4\n"
9674       "ld1 {v4.h}[2], [%x[in]], %x[stride]\n"
9675       "ld1 {v3.s}[0], [%x[in]], #4\n"
9676       "ld1 {v4.h}[3], [%x[in]], %x[stride]\n"
9677       "ld1 {v0.s}[1], [%x[in]], #4\n"
9678       "ld1 {v5.h}[0], [%x[in]], %x[stride]\n"
9679       "prfm pldl1keep, [%x[in]]\n"
9680       "trn1 v6.4h, v0.4h, v2.4h\n"
9681       "trn2 v14.4h, v0.4h, v2.4h\n"
9682       "trn1 v7.4h, v1.4h, v3.4h\n"
9683       "trn2 v15.4h, v1.4h, v3.4h\n"
9684       "uzp1 v16.8b, v4.8b, v5.8b\n"
9685       "uzp2 v17.8b, v4.8b, v5.8b\n"
9686       "trn1 v0.8b, v6.8b, v7.8b\n"
9687       "trn2 v1.8b, v6.8b, v7.8b\n"
9688       "trn1 v2.8b, v14.8b, v15.8b\n"
9689       "trn2 v3.8b, v14.8b, v15.8b\n"
9690       "uaddw v8.8h, v8.8h, v0.8b\n"
9691       "uaddw v9.8h, v9.8h, v1.8b\n"
9692       "uaddw v10.8h, v10.8h, v2.8b\n"
9693       "uaddw v11.8h, v11.8h, v3.8b\n"
9694       "uaddw v12.8h, v12.8h, v16.8b\n"
9695       "uaddw v13.8h, v13.8h, v17.8b\n"
9696       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
9697       "st1 {v16.2s, v17.2s}, [%x[out]], #16\n"
9698 
9699       // Aggregator Reduction.
9700       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
9701       "dup v1.4s, %w[additive_sum_offset]\n"
9702       "uaddlp v8.4s, v8.8h\n"
9703       "uaddlp v9.4s, v9.8h\n"
9704       "uaddlp v10.4s, v10.8h\n"
9705       "uaddlp v11.4s, v11.8h\n"
9706       "uaddlp v12.4s, v12.8h\n"
9707       "uaddlp v13.4s, v13.8h\n"
9708       "addp v8.4s, v8.4s, v9.4s\n"
9709       "addp v10.4s, v10.4s, v11.4s\n"
9710       "addp v12.4s, v12.4s, v13.4s\n"
9711       "addp v8.4s, v8.4s, v10.4s\n"
9712       "addp v9.4s, v12.4s, v12.4s\n"
9713       "mul v8.4s, v8.4s, v0.s[0]\n"
9714       "mul v9.4s, v9.4s, v0.s[0]\n"
9715       "add v8.4s, v8.4s, v1.4s\n"
9716       "add v9.4s, v9.4s, v1.4s\n"
9717       "st1 {v8.4s, v9.4s}, [%x[out]]\n"
9718       : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
9719         [out] "+r"(out), [in] "+r"(in)
9720       : [additive_sum_offset] "r"(params.additive_sum_offset),
9721         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
9722       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
9723         "v11", "v12", "v13", "v14", "v15", "v16", "v17", "cc", "memory");
9724 }
9725 
9726 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)9727 inline void Stream<uint8_t, 6, 8, 6, ColumnMajorWithSum>::Pack(
9728     const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
9729 #ifdef DEBUG
9730 #ifdef DEBUG_METAGEMM_VERBOSE
9731   std::cout
9732       << __FILE__ << "(" << __LINE__
9733       << ") ColumnMajorWithSum<uint8_t, 6, 8, 6, ColumnMajorWithSum>::Pack()"
9734       << std::endl
9735       << std::flush;
9736 #endif
9737 #endif
9738   int params_count_copy = params.count;
9739   int params_stride_copy = params.stride;
9740   asm volatile(
9741       "sub %x[stride], %x[stride], #4\n"
9742       "movi v8.8h, #0\n"
9743       "movi v9.8h, #0\n"
9744       "movi v10.8h, #0\n"
9745       "movi v11.8h, #0\n"
9746       "movi v12.8h, #0\n"
9747       "movi v13.8h, #0\n"
9748 
9749       // Reduce count by leftovers.
9750       "subs %x[count], %x[count], #6\n"
9751       "beq 2f\n"
9752 
9753       "1:"
9754       "subs %x[count], %x[count], #8\n"
9755 
9756       // Load Aggregate Store - column major 6x8
9757       "ld1 {v0.s}[0], [%x[in]], #4\n"
9758       "ld1 {v4.h}[0], [%x[in]], %x[stride]\n"
9759       "ld1 {v1.s}[0], [%x[in]], #4\n"
9760       "ld1 {v4.h}[1], [%x[in]], %x[stride]\n"
9761       "ld1 {v2.s}[0], [%x[in]], #4\n"
9762       "ld1 {v4.h}[2], [%x[in]], %x[stride]\n"
9763       "ld1 {v3.s}[0], [%x[in]], #4\n"
9764       "ld1 {v4.h}[3], [%x[in]], %x[stride]\n"
9765       "ld1 {v0.s}[1], [%x[in]], #4\n"
9766       "ld1 {v5.h}[0], [%x[in]], %x[stride]\n"
9767       "ld1 {v1.s}[1], [%x[in]], #4\n"
9768       "ld1 {v5.h}[1], [%x[in]], %x[stride]\n"
9769       "ld1 {v2.s}[1], [%x[in]], #4\n"
9770       "ld1 {v5.h}[2], [%x[in]], %x[stride]\n"
9771       "ld1 {v3.s}[1], [%x[in]], #4\n"
9772       "ld1 {v5.h}[3], [%x[in]], %x[stride]\n"
9773       "prfm pldl1keep, [%x[in]]\n"
9774       "trn1 v6.4h, v0.4h, v2.4h\n"
9775       "trn2 v14.4h, v0.4h, v2.4h\n"
9776       "trn1 v7.4h, v1.4h, v3.4h\n"
9777       "trn2 v15.4h, v1.4h, v3.4h\n"
9778       "uzp1 v16.8b, v4.8b, v5.8b\n"
9779       "uzp2 v17.8b, v4.8b, v5.8b\n"
9780       "trn1 v0.8b, v6.8b, v7.8b\n"
9781       "trn2 v1.8b, v6.8b, v7.8b\n"
9782       "trn1 v2.8b, v14.8b, v15.8b\n"
9783       "trn2 v3.8b, v14.8b, v15.8b\n"
9784       "uaddw v8.8h, v8.8h, v0.8b\n"
9785       "uaddw v9.8h, v9.8h, v1.8b\n"
9786       "uaddw v10.8h, v10.8h, v2.8b\n"
9787       "uaddw v11.8h, v11.8h, v3.8b\n"
9788       "uaddw v12.8h, v12.8h, v16.8b\n"
9789       "uaddw v13.8h, v13.8h, v17.8b\n"
9790       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
9791       "st1 {v16.2s, v17.2s}, [%x[out]], #16\n"
9792 
9793       "bne 1b\n"
9794 
9795       "2:"
9796 
9797       // Load Aggregate Store - column major 6x6
9798       "movi v0.8b, #0\n"
9799       "movi v1.8b, #0\n"
9800       "movi v2.8b, #0\n"
9801       "movi v3.8b, #0\n"
9802       "movi v4.8b, #0\n"
9803       "movi v5.8b, #0\n"
9804       "ld1 {v0.s}[0], [%x[in]], #4\n"
9805       "ld1 {v4.h}[0], [%x[in]], %x[stride]\n"
9806       "ld1 {v1.s}[0], [%x[in]], #4\n"
9807       "ld1 {v4.h}[1], [%x[in]], %x[stride]\n"
9808       "ld1 {v2.s}[0], [%x[in]], #4\n"
9809       "ld1 {v4.h}[2], [%x[in]], %x[stride]\n"
9810       "ld1 {v3.s}[0], [%x[in]], #4\n"
9811       "ld1 {v4.h}[3], [%x[in]], %x[stride]\n"
9812       "ld1 {v0.s}[1], [%x[in]], #4\n"
9813       "ld1 {v5.h}[0], [%x[in]], %x[stride]\n"
9814       "ld1 {v1.s}[1], [%x[in]], #4\n"
9815       "ld1 {v5.h}[1], [%x[in]], %x[stride]\n"
9816       "prfm pldl1keep, [%x[in]]\n"
9817       "trn1 v6.4h, v0.4h, v2.4h\n"
9818       "trn2 v14.4h, v0.4h, v2.4h\n"
9819       "trn1 v7.4h, v1.4h, v3.4h\n"
9820       "trn2 v15.4h, v1.4h, v3.4h\n"
9821       "uzp1 v16.8b, v4.8b, v5.8b\n"
9822       "uzp2 v17.8b, v4.8b, v5.8b\n"
9823       "trn1 v0.8b, v6.8b, v7.8b\n"
9824       "trn2 v1.8b, v6.8b, v7.8b\n"
9825       "trn1 v2.8b, v14.8b, v15.8b\n"
9826       "trn2 v3.8b, v14.8b, v15.8b\n"
9827       "uaddw v8.8h, v8.8h, v0.8b\n"
9828       "uaddw v9.8h, v9.8h, v1.8b\n"
9829       "uaddw v10.8h, v10.8h, v2.8b\n"
9830       "uaddw v11.8h, v11.8h, v3.8b\n"
9831       "uaddw v12.8h, v12.8h, v16.8b\n"
9832       "uaddw v13.8h, v13.8h, v17.8b\n"
9833       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
9834       "st1 {v16.2s, v17.2s}, [%x[out]], #16\n"
9835 
9836       // Aggregator Reduction.
9837       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
9838       "dup v1.4s, %w[additive_sum_offset]\n"
9839       "uaddlp v8.4s, v8.8h\n"
9840       "uaddlp v9.4s, v9.8h\n"
9841       "uaddlp v10.4s, v10.8h\n"
9842       "uaddlp v11.4s, v11.8h\n"
9843       "uaddlp v12.4s, v12.8h\n"
9844       "uaddlp v13.4s, v13.8h\n"
9845       "addp v8.4s, v8.4s, v9.4s\n"
9846       "addp v10.4s, v10.4s, v11.4s\n"
9847       "addp v12.4s, v12.4s, v13.4s\n"
9848       "addp v8.4s, v8.4s, v10.4s\n"
9849       "addp v9.4s, v12.4s, v12.4s\n"
9850       "mul v8.4s, v8.4s, v0.s[0]\n"
9851       "mul v9.4s, v9.4s, v0.s[0]\n"
9852       "add v8.4s, v8.4s, v1.4s\n"
9853       "add v9.4s, v9.4s, v1.4s\n"
9854       "st1 {v8.4s, v9.4s}, [%x[out]]\n"
9855       : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
9856         [out] "+r"(out), [in] "+r"(in)
9857       : [additive_sum_offset] "r"(params.additive_sum_offset),
9858         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
9859       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
9860         "v11", "v12", "v13", "v14", "v15", "v16", "v17", "cc", "memory");
9861 }
9862 
9863 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)9864 inline void Stream<uint8_t, 6, 8, 7, ColumnMajorWithSum>::Pack(
9865     const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
9866 #ifdef DEBUG
9867 #ifdef DEBUG_METAGEMM_VERBOSE
9868   std::cout
9869       << __FILE__ << "(" << __LINE__
9870       << ") ColumnMajorWithSum<uint8_t, 6, 8, 7, ColumnMajorWithSum>::Pack()"
9871       << std::endl
9872       << std::flush;
9873 #endif
9874 #endif
9875   int params_count_copy = params.count;
9876   int params_stride_copy = params.stride;
9877   asm volatile(
9878       "sub %x[stride], %x[stride], #4\n"
9879       "movi v8.8h, #0\n"
9880       "movi v9.8h, #0\n"
9881       "movi v10.8h, #0\n"
9882       "movi v11.8h, #0\n"
9883       "movi v12.8h, #0\n"
9884       "movi v13.8h, #0\n"
9885 
9886       // Reduce count by leftovers.
9887       "subs %x[count], %x[count], #7\n"
9888       "beq 2f\n"
9889 
9890       "1:"
9891       "subs %x[count], %x[count], #8\n"
9892 
9893       // Load Aggregate Store - column major 6x8
9894       "ld1 {v0.s}[0], [%x[in]], #4\n"
9895       "ld1 {v4.h}[0], [%x[in]], %x[stride]\n"
9896       "ld1 {v1.s}[0], [%x[in]], #4\n"
9897       "ld1 {v4.h}[1], [%x[in]], %x[stride]\n"
9898       "ld1 {v2.s}[0], [%x[in]], #4\n"
9899       "ld1 {v4.h}[2], [%x[in]], %x[stride]\n"
9900       "ld1 {v3.s}[0], [%x[in]], #4\n"
9901       "ld1 {v4.h}[3], [%x[in]], %x[stride]\n"
9902       "ld1 {v0.s}[1], [%x[in]], #4\n"
9903       "ld1 {v5.h}[0], [%x[in]], %x[stride]\n"
9904       "ld1 {v1.s}[1], [%x[in]], #4\n"
9905       "ld1 {v5.h}[1], [%x[in]], %x[stride]\n"
9906       "ld1 {v2.s}[1], [%x[in]], #4\n"
9907       "ld1 {v5.h}[2], [%x[in]], %x[stride]\n"
9908       "ld1 {v3.s}[1], [%x[in]], #4\n"
9909       "ld1 {v5.h}[3], [%x[in]], %x[stride]\n"
9910       "prfm pldl1keep, [%x[in]]\n"
9911       "trn1 v6.4h, v0.4h, v2.4h\n"
9912       "trn2 v14.4h, v0.4h, v2.4h\n"
9913       "trn1 v7.4h, v1.4h, v3.4h\n"
9914       "trn2 v15.4h, v1.4h, v3.4h\n"
9915       "uzp1 v16.8b, v4.8b, v5.8b\n"
9916       "uzp2 v17.8b, v4.8b, v5.8b\n"
9917       "trn1 v0.8b, v6.8b, v7.8b\n"
9918       "trn2 v1.8b, v6.8b, v7.8b\n"
9919       "trn1 v2.8b, v14.8b, v15.8b\n"
9920       "trn2 v3.8b, v14.8b, v15.8b\n"
9921       "uaddw v8.8h, v8.8h, v0.8b\n"
9922       "uaddw v9.8h, v9.8h, v1.8b\n"
9923       "uaddw v10.8h, v10.8h, v2.8b\n"
9924       "uaddw v11.8h, v11.8h, v3.8b\n"
9925       "uaddw v12.8h, v12.8h, v16.8b\n"
9926       "uaddw v13.8h, v13.8h, v17.8b\n"
9927       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
9928       "st1 {v16.2s, v17.2s}, [%x[out]], #16\n"
9929 
9930       "bne 1b\n"
9931 
9932       "2:"
9933 
9934       // Load Aggregate Store - column major 6x7
9935       "movi v0.8b, #0\n"
9936       "movi v1.8b, #0\n"
9937       "movi v2.8b, #0\n"
9938       "movi v3.8b, #0\n"
9939       "movi v4.8b, #0\n"
9940       "movi v5.8b, #0\n"
9941       "ld1 {v0.s}[0], [%x[in]], #4\n"
9942       "ld1 {v4.h}[0], [%x[in]], %x[stride]\n"
9943       "ld1 {v1.s}[0], [%x[in]], #4\n"
9944       "ld1 {v4.h}[1], [%x[in]], %x[stride]\n"
9945       "ld1 {v2.s}[0], [%x[in]], #4\n"
9946       "ld1 {v4.h}[2], [%x[in]], %x[stride]\n"
9947       "ld1 {v3.s}[0], [%x[in]], #4\n"
9948       "ld1 {v4.h}[3], [%x[in]], %x[stride]\n"
9949       "ld1 {v0.s}[1], [%x[in]], #4\n"
9950       "ld1 {v5.h}[0], [%x[in]], %x[stride]\n"
9951       "ld1 {v1.s}[1], [%x[in]], #4\n"
9952       "ld1 {v5.h}[1], [%x[in]], %x[stride]\n"
9953       "ld1 {v2.s}[1], [%x[in]], #4\n"
9954       "ld1 {v5.h}[2], [%x[in]], %x[stride]\n"
9955       "prfm pldl1keep, [%x[in]]\n"
9956       "trn1 v6.4h, v0.4h, v2.4h\n"
9957       "trn2 v14.4h, v0.4h, v2.4h\n"
9958       "trn1 v7.4h, v1.4h, v3.4h\n"
9959       "trn2 v15.4h, v1.4h, v3.4h\n"
9960       "uzp1 v16.8b, v4.8b, v5.8b\n"
9961       "uzp2 v17.8b, v4.8b, v5.8b\n"
9962       "trn1 v0.8b, v6.8b, v7.8b\n"
9963       "trn2 v1.8b, v6.8b, v7.8b\n"
9964       "trn1 v2.8b, v14.8b, v15.8b\n"
9965       "trn2 v3.8b, v14.8b, v15.8b\n"
9966       "uaddw v8.8h, v8.8h, v0.8b\n"
9967       "uaddw v9.8h, v9.8h, v1.8b\n"
9968       "uaddw v10.8h, v10.8h, v2.8b\n"
9969       "uaddw v11.8h, v11.8h, v3.8b\n"
9970       "uaddw v12.8h, v12.8h, v16.8b\n"
9971       "uaddw v13.8h, v13.8h, v17.8b\n"
9972       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
9973       "st1 {v16.2s, v17.2s}, [%x[out]], #16\n"
9974 
9975       // Aggregator Reduction.
9976       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
9977       "dup v1.4s, %w[additive_sum_offset]\n"
9978       "uaddlp v8.4s, v8.8h\n"
9979       "uaddlp v9.4s, v9.8h\n"
9980       "uaddlp v10.4s, v10.8h\n"
9981       "uaddlp v11.4s, v11.8h\n"
9982       "uaddlp v12.4s, v12.8h\n"
9983       "uaddlp v13.4s, v13.8h\n"
9984       "addp v8.4s, v8.4s, v9.4s\n"
9985       "addp v10.4s, v10.4s, v11.4s\n"
9986       "addp v12.4s, v12.4s, v13.4s\n"
9987       "addp v8.4s, v8.4s, v10.4s\n"
9988       "addp v9.4s, v12.4s, v12.4s\n"
9989       "mul v8.4s, v8.4s, v0.s[0]\n"
9990       "mul v9.4s, v9.4s, v0.s[0]\n"
9991       "add v8.4s, v8.4s, v1.4s\n"
9992       "add v9.4s, v9.4s, v1.4s\n"
9993       "st1 {v8.4s, v9.4s}, [%x[out]]\n"
9994       : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
9995         [out] "+r"(out), [in] "+r"(in)
9996       : [additive_sum_offset] "r"(params.additive_sum_offset),
9997         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
9998       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
9999         "v11", "v12", "v13", "v14", "v15", "v16", "v17", "cc", "memory");
10000 }
10001 
// Stream<uint8_t, 7, 8, 0, ColumnMajorWithSum>::Pack (AArch64 NEON inline asm).
//
// Repeatedly reads 7 consecutive bytes from `in` and then advances `in` by
// params.stride. Every 8 such reads are transposed into seven 8-byte vectors
// (vector k holds byte k of each of the 8 reads) and written to `out` as one
// 56-byte block. Seven running sums, one per byte position, are kept while
// packing. After the last block, eight int32 values of the form
//   sum * params.multiplicative_sum_offset + params.additive_sum_offset
// are stored at `out`; the eighth value is computed from the seventh sum
// again (it fills the second 4-lane register).
//
// in:     source bytes (the local pointer is advanced by the asm).
// params: count, stride, multiplicative_sum_offset and additive_sum_offset
//         are read.
// out:    destination; receives one 56-byte block per 8 counted, followed by
//         32 bytes of int32 sums.
//
// NOTE(review): the loop body executes before the count is tested for zero,
// so this presumably requires params.count to be a positive multiple of 8
// (the "0" template argument looks like the leftover count) - confirm
// against the caller.
// NOTE(review): the running sums are accumulated in 16-bit lanes (uaddw into
// .8h registers); presumably callers bound count so they cannot wrap -
// confirm.
// NOTE(review): count and stride are `int` locals but are accessed through
// the 64-bit %x register view - confirm the upper 32 bits are well defined
// for the supported toolchains.
10002 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)10003 inline void Stream<uint8_t, 7, 8, 0, ColumnMajorWithSum>::Pack(
10004     const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
10005 #ifdef DEBUG
10006 #ifdef DEBUG_METAGEMM_VERBOSE
10007   std::cout
10008       << __FILE__ << "(" << __LINE__
10009       << ") ColumnMajorWithSum<uint8_t, 7, 8, 0, ColumnMajorWithSum>::Pack()"
10010       << std::endl
10011       << std::flush;
10012 #endif
10013 #endif
      // Local copies: the asm modifies both values ("+r" operands).
10014   int params_count_copy = params.count;
10015   int params_stride_copy = params.stride;
10016   asm volatile(
            // Each read post-increments `in` by 4 (ld1) and then by the
            // stride register (ld3); pre-subtracting 4 makes the net advance
            // per read equal to params.stride.
10017       "sub %x[stride], %x[stride], #4\n"
            // Clear the seven sum accumulators (8 x 16-bit lanes each).
10018       "movi v8.8h, #0\n"
10019       "movi v9.8h, #0\n"
10020       "movi v10.8h, #0\n"
10021       "movi v11.8h, #0\n"
10022       "movi v12.8h, #0\n"
10023       "movi v13.8h, #0\n"
10024       "movi v14.8h, #0\n"
10025 
10026       "1:"
            // Flags set here are consumed by the "bne 1b" at the bottom of
            // the loop; nothing in between updates the condition flags.
10027       "subs %x[count], %x[count], #8\n"
10028 
10029       // Load Aggregate Store - column major 7x8
            // Per read: bytes 0-3 go into one 32-bit lane of v0-v3, bytes 4-6
            // are de-interleaved into byte lane i of v4, v5 and v6.
10030       "ld1 {v0.s}[0], [%x[in]], #4\n"
10031       "ld3 {v4.b, v5.b, v6.b}[0], [%x[in]], %x[stride]\n"
10032       "ld1 {v1.s}[0], [%x[in]], #4\n"
10033       "ld3 {v4.b, v5.b, v6.b}[1], [%x[in]], %x[stride]\n"
10034       "ld1 {v2.s}[0], [%x[in]], #4\n"
10035       "ld3 {v4.b, v5.b, v6.b}[2], [%x[in]], %x[stride]\n"
10036       "ld1 {v3.s}[0], [%x[in]], #4\n"
10037       "ld3 {v4.b, v5.b, v6.b}[3], [%x[in]], %x[stride]\n"
10038       "ld1 {v0.s}[1], [%x[in]], #4\n"
10039       "ld3 {v4.b, v5.b, v6.b}[4], [%x[in]], %x[stride]\n"
10040       "ld1 {v1.s}[1], [%x[in]], #4\n"
10041       "ld3 {v4.b, v5.b, v6.b}[5], [%x[in]], %x[stride]\n"
10042       "ld1 {v2.s}[1], [%x[in]], #4\n"
10043       "ld3 {v4.b, v5.b, v6.b}[6], [%x[in]], %x[stride]\n"
10044       "ld1 {v3.s}[1], [%x[in]], #4\n"
10045       "ld3 {v4.b, v5.b, v6.b}[7], [%x[in]], %x[stride]\n"
10046       "prfm pldl1keep, [%x[in]]\n"
            // Two-stage transpose (16-bit, then 8-bit) of the 4-byte pieces:
            // afterwards v0..v3 hold byte 0..3 of each of the 8 reads, in
            // read order. v7, v15, v16 and v17 are scratch.
10047       "trn1 v7.4h, v0.4h, v2.4h\n"
10048       "trn2 v16.4h, v0.4h, v2.4h\n"
10049       "trn1 v15.4h, v1.4h, v3.4h\n"
10050       "trn2 v17.4h, v1.4h, v3.4h\n"
10051       "trn1 v0.8b, v7.8b, v15.8b\n"
10052       "trn2 v1.8b, v7.8b, v15.8b\n"
10053       "trn1 v2.8b, v16.8b, v17.8b\n"
10054       "trn2 v3.8b, v16.8b, v17.8b\n"
            // Widen each byte vector and add it to its 16-bit accumulator.
10055       "uaddw v8.8h, v8.8h, v0.8b\n"
10056       "uaddw v9.8h, v9.8h, v1.8b\n"
10057       "uaddw v10.8h, v10.8h, v2.8b\n"
10058       "uaddw v11.8h, v11.8h, v3.8b\n"
10059       "uaddw v12.8h, v12.8h, v4.8b\n"
10060       "uaddw v13.8h, v13.8h, v5.8b\n"
10061       "uaddw v14.8h, v14.8h, v6.8b\n"
            // Emit the packed block: 32 + 24 = 56 bytes.
10062       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
10063       "st1 {v4.2s, v5.2s, v6.2s}, [%x[out]], #24\n"
10064 
10065       "bne 1b\n"
10066 
10067       // Aggregator Reduction.
            // v0.s[0] = multiplier, v1 = additive offset in all four lanes.
10068       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
10069       "dup v1.4s, %w[additive_sum_offset]\n"
            // Pairwise-widen the 16-bit lanes to 32 bits, then pairwise-add
            // down to one total per accumulator:
            //   v8 = {sum v8, sum v9, sum v10, sum v11}
            //   v9 = {sum v12, sum v13, sum v14, sum v14}
10070       "uaddlp v8.4s, v8.8h\n"
10071       "uaddlp v9.4s, v9.8h\n"
10072       "uaddlp v10.4s, v10.8h\n"
10073       "uaddlp v11.4s, v11.8h\n"
10074       "uaddlp v12.4s, v12.8h\n"
10075       "uaddlp v13.4s, v13.8h\n"
10076       "uaddlp v14.4s, v14.8h\n"
10077       "addp v8.4s, v8.4s, v9.4s\n"
10078       "addp v10.4s, v10.4s, v11.4s\n"
10079       "addp v12.4s, v12.4s, v13.4s\n"
10080       "addp v14.4s, v14.4s, v14.4s\n"
10081       "addp v8.4s, v8.4s, v10.4s\n"
10082       "addp v9.4s, v12.4s, v14.4s\n"
            // sum * multiplicative_sum_offset + additive_sum_offset, then
            // store the eight int32 results right after the packed data.
10083       "mul v8.4s, v8.4s, v0.s[0]\n"
10084       "mul v9.4s, v9.4s, v0.s[0]\n"
10085       "add v8.4s, v8.4s, v1.4s\n"
10086       "add v9.4s, v9.4s, v1.4s\n"
10087       "st1 {v8.4s, v9.4s}, [%x[out]]\n"
10088       : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
10089         [out] "+r"(out), [in] "+r"(in)
10090       : [additive_sum_offset] "r"(params.additive_sum_offset),
10091         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
10092       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
10093         "v11", "v12", "v13", "v14", "v15", "v16", "v17", "cc", "memory");
10094 }
10095 
// Stream<uint8_t, 7, 8, 1, ColumnMajorWithSum>::Pack (AArch64 NEON inline asm).
//
// Repeatedly reads 7 consecutive bytes from `in` and then advances `in` by
// params.stride. Every 8 such reads are transposed into seven 8-byte vectors
// (vector k holds byte k of each of the 8 reads) and written to `out` as one
// 56-byte block. After the full blocks, 1 further read is packed into a
// final 56-byte block whose unused byte lanes are zero. Seven running sums,
// one per byte position, are kept while packing; finally eight int32 values
//   sum * params.multiplicative_sum_offset + params.additive_sum_offset
// are stored at `out`; the eighth value is computed from the seventh sum
// again (it fills the second 4-lane register).
//
// in:     source bytes (the local pointer is advanced by the asm).
// params: count, stride, multiplicative_sum_offset and additive_sum_offset
//         are read.
// out:    destination; receives the 56-byte blocks followed by 32 bytes of
//         int32 sums.
//
// NOTE(review): count is first reduced by 1 and the main loop then runs
// until the remainder reaches exactly zero, so this presumably requires
// params.count % 8 == 1 - confirm against the caller.
// NOTE(review): the running sums are accumulated in 16-bit lanes (uaddw into
// .8h registers); presumably callers bound count so they cannot wrap -
// confirm.
// NOTE(review): count and stride are `int` locals but are accessed through
// the 64-bit %x register view - confirm the upper 32 bits are well defined
// for the supported toolchains.
10096 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)10097 inline void Stream<uint8_t, 7, 8, 1, ColumnMajorWithSum>::Pack(
10098     const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
10099 #ifdef DEBUG
10100 #ifdef DEBUG_METAGEMM_VERBOSE
10101   std::cout
10102       << __FILE__ << "(" << __LINE__
10103       << ") ColumnMajorWithSum<uint8_t, 7, 8, 1, ColumnMajorWithSum>::Pack()"
10104       << std::endl
10105       << std::flush;
10106 #endif
10107 #endif
      // Local copies: the asm modifies both values ("+r" operands).
10108   int params_count_copy = params.count;
10109   int params_stride_copy = params.stride;
10110   asm volatile(
            // Each read post-increments `in` by 4 (ld1) and then by the
            // stride register (ld3); pre-subtracting 4 makes the net advance
            // per read equal to params.stride.
10111       "sub %x[stride], %x[stride], #4\n"
            // Clear the seven sum accumulators (8 x 16-bit lanes each).
10112       "movi v8.8h, #0\n"
10113       "movi v9.8h, #0\n"
10114       "movi v10.8h, #0\n"
10115       "movi v11.8h, #0\n"
10116       "movi v12.8h, #0\n"
10117       "movi v13.8h, #0\n"
10118       "movi v14.8h, #0\n"
10119 
10120       // Reduce count by leftovers.
            // If nothing but the leftover read remains, skip the main loop.
10121       "subs %x[count], %x[count], #1\n"
10122       "beq 2f\n"
10123 
10124       "1:"
            // Flags set here are consumed by the "bne 1b" at the bottom of
            // the loop; nothing in between updates the condition flags.
10125       "subs %x[count], %x[count], #8\n"
10126 
10127       // Load Aggregate Store - column major 7x8
            // Per read: bytes 0-3 go into one 32-bit lane of v0-v3, bytes 4-6
            // are de-interleaved into byte lane i of v4, v5 and v6.
10128       "ld1 {v0.s}[0], [%x[in]], #4\n"
10129       "ld3 {v4.b, v5.b, v6.b}[0], [%x[in]], %x[stride]\n"
10130       "ld1 {v1.s}[0], [%x[in]], #4\n"
10131       "ld3 {v4.b, v5.b, v6.b}[1], [%x[in]], %x[stride]\n"
10132       "ld1 {v2.s}[0], [%x[in]], #4\n"
10133       "ld3 {v4.b, v5.b, v6.b}[2], [%x[in]], %x[stride]\n"
10134       "ld1 {v3.s}[0], [%x[in]], #4\n"
10135       "ld3 {v4.b, v5.b, v6.b}[3], [%x[in]], %x[stride]\n"
10136       "ld1 {v0.s}[1], [%x[in]], #4\n"
10137       "ld3 {v4.b, v5.b, v6.b}[4], [%x[in]], %x[stride]\n"
10138       "ld1 {v1.s}[1], [%x[in]], #4\n"
10139       "ld3 {v4.b, v5.b, v6.b}[5], [%x[in]], %x[stride]\n"
10140       "ld1 {v2.s}[1], [%x[in]], #4\n"
10141       "ld3 {v4.b, v5.b, v6.b}[6], [%x[in]], %x[stride]\n"
10142       "ld1 {v3.s}[1], [%x[in]], #4\n"
10143       "ld3 {v4.b, v5.b, v6.b}[7], [%x[in]], %x[stride]\n"
10144       "prfm pldl1keep, [%x[in]]\n"
            // Two-stage transpose (16-bit, then 8-bit) of the 4-byte pieces:
            // afterwards v0..v3 hold byte 0..3 of each of the 8 reads, in
            // read order. v7, v15, v16 and v17 are scratch.
10145       "trn1 v7.4h, v0.4h, v2.4h\n"
10146       "trn2 v16.4h, v0.4h, v2.4h\n"
10147       "trn1 v15.4h, v1.4h, v3.4h\n"
10148       "trn2 v17.4h, v1.4h, v3.4h\n"
10149       "trn1 v0.8b, v7.8b, v15.8b\n"
10150       "trn2 v1.8b, v7.8b, v15.8b\n"
10151       "trn1 v2.8b, v16.8b, v17.8b\n"
10152       "trn2 v3.8b, v16.8b, v17.8b\n"
            // Widen each byte vector and add it to its 16-bit accumulator.
10153       "uaddw v8.8h, v8.8h, v0.8b\n"
10154       "uaddw v9.8h, v9.8h, v1.8b\n"
10155       "uaddw v10.8h, v10.8h, v2.8b\n"
10156       "uaddw v11.8h, v11.8h, v3.8b\n"
10157       "uaddw v12.8h, v12.8h, v4.8b\n"
10158       "uaddw v13.8h, v13.8h, v5.8b\n"
10159       "uaddw v14.8h, v14.8h, v6.8b\n"
            // Emit the packed block: 32 + 24 = 56 bytes.
10160       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
10161       "st1 {v4.2s, v5.2s, v6.2s}, [%x[out]], #24\n"
10162 
10163       "bne 1b\n"
10164 
10165       "2:"
10166 
10167       // Load Aggregate Store - column major 7x1
            // Zero v0-v6 first so the lanes of the 7 missing reads contribute
            // 0 to both the stored block and the sums.
10168       "movi v0.8b, #0\n"
10169       "movi v1.8b, #0\n"
10170       "movi v2.8b, #0\n"
10171       "movi v3.8b, #0\n"
10172       "movi v4.8b, #0\n"
10173       "movi v5.8b, #0\n"
10174       "movi v6.8b, #0\n"
            // Only 1 read is loaded; the transpose, accumulate and store
            // below are the same as in the main loop (a full 56-byte block
            // is still written).
10175       "ld1 {v0.s}[0], [%x[in]], #4\n"
10176       "ld3 {v4.b, v5.b, v6.b}[0], [%x[in]], %x[stride]\n"
10177       "prfm pldl1keep, [%x[in]]\n"
10178       "trn1 v7.4h, v0.4h, v2.4h\n"
10179       "trn2 v16.4h, v0.4h, v2.4h\n"
10180       "trn1 v15.4h, v1.4h, v3.4h\n"
10181       "trn2 v17.4h, v1.4h, v3.4h\n"
10182       "trn1 v0.8b, v7.8b, v15.8b\n"
10183       "trn2 v1.8b, v7.8b, v15.8b\n"
10184       "trn1 v2.8b, v16.8b, v17.8b\n"
10185       "trn2 v3.8b, v16.8b, v17.8b\n"
10186       "uaddw v8.8h, v8.8h, v0.8b\n"
10187       "uaddw v9.8h, v9.8h, v1.8b\n"
10188       "uaddw v10.8h, v10.8h, v2.8b\n"
10189       "uaddw v11.8h, v11.8h, v3.8b\n"
10190       "uaddw v12.8h, v12.8h, v4.8b\n"
10191       "uaddw v13.8h, v13.8h, v5.8b\n"
10192       "uaddw v14.8h, v14.8h, v6.8b\n"
10193       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
10194       "st1 {v4.2s, v5.2s, v6.2s}, [%x[out]], #24\n"
10195 
10196       // Aggregator Reduction.
            // v0.s[0] = multiplier, v1 = additive offset in all four lanes.
10197       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
10198       "dup v1.4s, %w[additive_sum_offset]\n"
            // Pairwise-widen the 16-bit lanes to 32 bits, then pairwise-add
            // down to one total per accumulator:
            //   v8 = {sum v8, sum v9, sum v10, sum v11}
            //   v9 = {sum v12, sum v13, sum v14, sum v14}
10199       "uaddlp v8.4s, v8.8h\n"
10200       "uaddlp v9.4s, v9.8h\n"
10201       "uaddlp v10.4s, v10.8h\n"
10202       "uaddlp v11.4s, v11.8h\n"
10203       "uaddlp v12.4s, v12.8h\n"
10204       "uaddlp v13.4s, v13.8h\n"
10205       "uaddlp v14.4s, v14.8h\n"
10206       "addp v8.4s, v8.4s, v9.4s\n"
10207       "addp v10.4s, v10.4s, v11.4s\n"
10208       "addp v12.4s, v12.4s, v13.4s\n"
10209       "addp v14.4s, v14.4s, v14.4s\n"
10210       "addp v8.4s, v8.4s, v10.4s\n"
10211       "addp v9.4s, v12.4s, v14.4s\n"
            // sum * multiplicative_sum_offset + additive_sum_offset, then
            // store the eight int32 results right after the packed data.
10212       "mul v8.4s, v8.4s, v0.s[0]\n"
10213       "mul v9.4s, v9.4s, v0.s[0]\n"
10214       "add v8.4s, v8.4s, v1.4s\n"
10215       "add v9.4s, v9.4s, v1.4s\n"
10216       "st1 {v8.4s, v9.4s}, [%x[out]]\n"
10217       : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
10218         [out] "+r"(out), [in] "+r"(in)
10219       : [additive_sum_offset] "r"(params.additive_sum_offset),
10220         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
10221       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
10222         "v11", "v12", "v13", "v14", "v15", "v16", "v17", "cc", "memory");
10223 }
10224 
// Stream<uint8_t, 7, 8, 2, ColumnMajorWithSum>::Pack (AArch64 NEON inline asm).
//
// Repeatedly reads 7 consecutive bytes from `in` and then advances `in` by
// params.stride. Every 8 such reads are transposed into seven 8-byte vectors
// (vector k holds byte k of each of the 8 reads) and written to `out` as one
// 56-byte block. After the full blocks, 2 further reads are packed into a
// final 56-byte block whose unused byte lanes are zero. Seven running sums,
// one per byte position, are kept while packing; finally eight int32 values
//   sum * params.multiplicative_sum_offset + params.additive_sum_offset
// are stored at `out`; the eighth value is computed from the seventh sum
// again (it fills the second 4-lane register).
//
// in:     source bytes (the local pointer is advanced by the asm).
// params: count, stride, multiplicative_sum_offset and additive_sum_offset
//         are read.
// out:    destination; receives the 56-byte blocks followed by 32 bytes of
//         int32 sums.
//
// NOTE(review): count is first reduced by 2 and the main loop then runs
// until the remainder reaches exactly zero, so this presumably requires
// params.count % 8 == 2 - confirm against the caller.
// NOTE(review): the running sums are accumulated in 16-bit lanes (uaddw into
// .8h registers); presumably callers bound count so they cannot wrap -
// confirm.
// NOTE(review): count and stride are `int` locals but are accessed through
// the 64-bit %x register view - confirm the upper 32 bits are well defined
// for the supported toolchains.
10225 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)10226 inline void Stream<uint8_t, 7, 8, 2, ColumnMajorWithSum>::Pack(
10227     const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
10228 #ifdef DEBUG
10229 #ifdef DEBUG_METAGEMM_VERBOSE
10230   std::cout
10231       << __FILE__ << "(" << __LINE__
10232       << ") ColumnMajorWithSum<uint8_t, 7, 8, 2, ColumnMajorWithSum>::Pack()"
10233       << std::endl
10234       << std::flush;
10235 #endif
10236 #endif
      // Local copies: the asm modifies both values ("+r" operands).
10237   int params_count_copy = params.count;
10238   int params_stride_copy = params.stride;
10239   asm volatile(
            // Each read post-increments `in` by 4 (ld1) and then by the
            // stride register (ld3); pre-subtracting 4 makes the net advance
            // per read equal to params.stride.
10240       "sub %x[stride], %x[stride], #4\n"
            // Clear the seven sum accumulators (8 x 16-bit lanes each).
10241       "movi v8.8h, #0\n"
10242       "movi v9.8h, #0\n"
10243       "movi v10.8h, #0\n"
10244       "movi v11.8h, #0\n"
10245       "movi v12.8h, #0\n"
10246       "movi v13.8h, #0\n"
10247       "movi v14.8h, #0\n"
10248 
10249       // Reduce count by leftovers.
            // If nothing but the leftover reads remain, skip the main loop.
10250       "subs %x[count], %x[count], #2\n"
10251       "beq 2f\n"
10252 
10253       "1:"
            // Flags set here are consumed by the "bne 1b" at the bottom of
            // the loop; nothing in between updates the condition flags.
10254       "subs %x[count], %x[count], #8\n"
10255 
10256       // Load Aggregate Store - column major 7x8
            // Per read: bytes 0-3 go into one 32-bit lane of v0-v3, bytes 4-6
            // are de-interleaved into byte lane i of v4, v5 and v6.
10257       "ld1 {v0.s}[0], [%x[in]], #4\n"
10258       "ld3 {v4.b, v5.b, v6.b}[0], [%x[in]], %x[stride]\n"
10259       "ld1 {v1.s}[0], [%x[in]], #4\n"
10260       "ld3 {v4.b, v5.b, v6.b}[1], [%x[in]], %x[stride]\n"
10261       "ld1 {v2.s}[0], [%x[in]], #4\n"
10262       "ld3 {v4.b, v5.b, v6.b}[2], [%x[in]], %x[stride]\n"
10263       "ld1 {v3.s}[0], [%x[in]], #4\n"
10264       "ld3 {v4.b, v5.b, v6.b}[3], [%x[in]], %x[stride]\n"
10265       "ld1 {v0.s}[1], [%x[in]], #4\n"
10266       "ld3 {v4.b, v5.b, v6.b}[4], [%x[in]], %x[stride]\n"
10267       "ld1 {v1.s}[1], [%x[in]], #4\n"
10268       "ld3 {v4.b, v5.b, v6.b}[5], [%x[in]], %x[stride]\n"
10269       "ld1 {v2.s}[1], [%x[in]], #4\n"
10270       "ld3 {v4.b, v5.b, v6.b}[6], [%x[in]], %x[stride]\n"
10271       "ld1 {v3.s}[1], [%x[in]], #4\n"
10272       "ld3 {v4.b, v5.b, v6.b}[7], [%x[in]], %x[stride]\n"
10273       "prfm pldl1keep, [%x[in]]\n"
            // Two-stage transpose (16-bit, then 8-bit) of the 4-byte pieces:
            // afterwards v0..v3 hold byte 0..3 of each of the 8 reads, in
            // read order. v7, v15, v16 and v17 are scratch.
10274       "trn1 v7.4h, v0.4h, v2.4h\n"
10275       "trn2 v16.4h, v0.4h, v2.4h\n"
10276       "trn1 v15.4h, v1.4h, v3.4h\n"
10277       "trn2 v17.4h, v1.4h, v3.4h\n"
10278       "trn1 v0.8b, v7.8b, v15.8b\n"
10279       "trn2 v1.8b, v7.8b, v15.8b\n"
10280       "trn1 v2.8b, v16.8b, v17.8b\n"
10281       "trn2 v3.8b, v16.8b, v17.8b\n"
            // Widen each byte vector and add it to its 16-bit accumulator.
10282       "uaddw v8.8h, v8.8h, v0.8b\n"
10283       "uaddw v9.8h, v9.8h, v1.8b\n"
10284       "uaddw v10.8h, v10.8h, v2.8b\n"
10285       "uaddw v11.8h, v11.8h, v3.8b\n"
10286       "uaddw v12.8h, v12.8h, v4.8b\n"
10287       "uaddw v13.8h, v13.8h, v5.8b\n"
10288       "uaddw v14.8h, v14.8h, v6.8b\n"
            // Emit the packed block: 32 + 24 = 56 bytes.
10289       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
10290       "st1 {v4.2s, v5.2s, v6.2s}, [%x[out]], #24\n"
10291 
10292       "bne 1b\n"
10293 
10294       "2:"
10295 
10296       // Load Aggregate Store - column major 7x2
            // Zero v0-v6 first so the lanes of the 6 missing reads contribute
            // 0 to both the stored block and the sums.
10297       "movi v0.8b, #0\n"
10298       "movi v1.8b, #0\n"
10299       "movi v2.8b, #0\n"
10300       "movi v3.8b, #0\n"
10301       "movi v4.8b, #0\n"
10302       "movi v5.8b, #0\n"
10303       "movi v6.8b, #0\n"
            // Only 2 reads are loaded; the transpose, accumulate and store
            // below are the same as in the main loop (a full 56-byte block
            // is still written).
10304       "ld1 {v0.s}[0], [%x[in]], #4\n"
10305       "ld3 {v4.b, v5.b, v6.b}[0], [%x[in]], %x[stride]\n"
10306       "ld1 {v1.s}[0], [%x[in]], #4\n"
10307       "ld3 {v4.b, v5.b, v6.b}[1], [%x[in]], %x[stride]\n"
10308       "prfm pldl1keep, [%x[in]]\n"
10309       "trn1 v7.4h, v0.4h, v2.4h\n"
10310       "trn2 v16.4h, v0.4h, v2.4h\n"
10311       "trn1 v15.4h, v1.4h, v3.4h\n"
10312       "trn2 v17.4h, v1.4h, v3.4h\n"
10313       "trn1 v0.8b, v7.8b, v15.8b\n"
10314       "trn2 v1.8b, v7.8b, v15.8b\n"
10315       "trn1 v2.8b, v16.8b, v17.8b\n"
10316       "trn2 v3.8b, v16.8b, v17.8b\n"
10317       "uaddw v8.8h, v8.8h, v0.8b\n"
10318       "uaddw v9.8h, v9.8h, v1.8b\n"
10319       "uaddw v10.8h, v10.8h, v2.8b\n"
10320       "uaddw v11.8h, v11.8h, v3.8b\n"
10321       "uaddw v12.8h, v12.8h, v4.8b\n"
10322       "uaddw v13.8h, v13.8h, v5.8b\n"
10323       "uaddw v14.8h, v14.8h, v6.8b\n"
10324       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
10325       "st1 {v4.2s, v5.2s, v6.2s}, [%x[out]], #24\n"
10326 
10327       // Aggregator Reduction.
            // v0.s[0] = multiplier, v1 = additive offset in all four lanes.
10328       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
10329       "dup v1.4s, %w[additive_sum_offset]\n"
            // Pairwise-widen the 16-bit lanes to 32 bits, then pairwise-add
            // down to one total per accumulator:
            //   v8 = {sum v8, sum v9, sum v10, sum v11}
            //   v9 = {sum v12, sum v13, sum v14, sum v14}
10330       "uaddlp v8.4s, v8.8h\n"
10331       "uaddlp v9.4s, v9.8h\n"
10332       "uaddlp v10.4s, v10.8h\n"
10333       "uaddlp v11.4s, v11.8h\n"
10334       "uaddlp v12.4s, v12.8h\n"
10335       "uaddlp v13.4s, v13.8h\n"
10336       "uaddlp v14.4s, v14.8h\n"
10337       "addp v8.4s, v8.4s, v9.4s\n"
10338       "addp v10.4s, v10.4s, v11.4s\n"
10339       "addp v12.4s, v12.4s, v13.4s\n"
10340       "addp v14.4s, v14.4s, v14.4s\n"
10341       "addp v8.4s, v8.4s, v10.4s\n"
10342       "addp v9.4s, v12.4s, v14.4s\n"
            // sum * multiplicative_sum_offset + additive_sum_offset, then
            // store the eight int32 results right after the packed data.
10343       "mul v8.4s, v8.4s, v0.s[0]\n"
10344       "mul v9.4s, v9.4s, v0.s[0]\n"
10345       "add v8.4s, v8.4s, v1.4s\n"
10346       "add v9.4s, v9.4s, v1.4s\n"
10347       "st1 {v8.4s, v9.4s}, [%x[out]]\n"
10348       : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
10349         [out] "+r"(out), [in] "+r"(in)
10350       : [additive_sum_offset] "r"(params.additive_sum_offset),
10351         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
10352       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
10353         "v11", "v12", "v13", "v14", "v15", "v16", "v17", "cc", "memory");
10354 }
10355 
// Stream<uint8_t, 7, 8, 3, ColumnMajorWithSum>::Pack (AArch64 NEON inline asm).
//
// Repeatedly reads 7 consecutive bytes from `in` and then advances `in` by
// params.stride. Every 8 such reads are transposed into seven 8-byte vectors
// (vector k holds byte k of each of the 8 reads) and written to `out` as one
// 56-byte block. After the full blocks, 3 further reads are packed into a
// final 56-byte block whose unused byte lanes are zero. Seven running sums,
// one per byte position, are kept while packing; finally eight int32 values
//   sum * params.multiplicative_sum_offset + params.additive_sum_offset
// are stored at `out`; the eighth value is computed from the seventh sum
// again (it fills the second 4-lane register).
//
// in:     source bytes (the local pointer is advanced by the asm).
// params: count, stride, multiplicative_sum_offset and additive_sum_offset
//         are read.
// out:    destination; receives the 56-byte blocks followed by 32 bytes of
//         int32 sums.
//
// NOTE(review): count is first reduced by 3 and the main loop then runs
// until the remainder reaches exactly zero, so this presumably requires
// params.count % 8 == 3 - confirm against the caller.
// NOTE(review): the running sums are accumulated in 16-bit lanes (uaddw into
// .8h registers); presumably callers bound count so they cannot wrap -
// confirm.
// NOTE(review): count and stride are `int` locals but are accessed through
// the 64-bit %x register view - confirm the upper 32 bits are well defined
// for the supported toolchains.
10356 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)10357 inline void Stream<uint8_t, 7, 8, 3, ColumnMajorWithSum>::Pack(
10358     const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
10359 #ifdef DEBUG
10360 #ifdef DEBUG_METAGEMM_VERBOSE
10361   std::cout
10362       << __FILE__ << "(" << __LINE__
10363       << ") ColumnMajorWithSum<uint8_t, 7, 8, 3, ColumnMajorWithSum>::Pack()"
10364       << std::endl
10365       << std::flush;
10366 #endif
10367 #endif
      // Local copies: the asm modifies both values ("+r" operands).
10368   int params_count_copy = params.count;
10369   int params_stride_copy = params.stride;
10370   asm volatile(
            // Each read post-increments `in` by 4 (ld1) and then by the
            // stride register (ld3); pre-subtracting 4 makes the net advance
            // per read equal to params.stride.
10371       "sub %x[stride], %x[stride], #4\n"
            // Clear the seven sum accumulators (8 x 16-bit lanes each).
10372       "movi v8.8h, #0\n"
10373       "movi v9.8h, #0\n"
10374       "movi v10.8h, #0\n"
10375       "movi v11.8h, #0\n"
10376       "movi v12.8h, #0\n"
10377       "movi v13.8h, #0\n"
10378       "movi v14.8h, #0\n"
10379 
10380       // Reduce count by leftovers.
            // If nothing but the leftover reads remain, skip the main loop.
10381       "subs %x[count], %x[count], #3\n"
10382       "beq 2f\n"
10383 
10384       "1:"
            // Flags set here are consumed by the "bne 1b" at the bottom of
            // the loop; nothing in between updates the condition flags.
10385       "subs %x[count], %x[count], #8\n"
10386 
10387       // Load Aggregate Store - column major 7x8
            // Per read: bytes 0-3 go into one 32-bit lane of v0-v3, bytes 4-6
            // are de-interleaved into byte lane i of v4, v5 and v6.
10388       "ld1 {v0.s}[0], [%x[in]], #4\n"
10389       "ld3 {v4.b, v5.b, v6.b}[0], [%x[in]], %x[stride]\n"
10390       "ld1 {v1.s}[0], [%x[in]], #4\n"
10391       "ld3 {v4.b, v5.b, v6.b}[1], [%x[in]], %x[stride]\n"
10392       "ld1 {v2.s}[0], [%x[in]], #4\n"
10393       "ld3 {v4.b, v5.b, v6.b}[2], [%x[in]], %x[stride]\n"
10394       "ld1 {v3.s}[0], [%x[in]], #4\n"
10395       "ld3 {v4.b, v5.b, v6.b}[3], [%x[in]], %x[stride]\n"
10396       "ld1 {v0.s}[1], [%x[in]], #4\n"
10397       "ld3 {v4.b, v5.b, v6.b}[4], [%x[in]], %x[stride]\n"
10398       "ld1 {v1.s}[1], [%x[in]], #4\n"
10399       "ld3 {v4.b, v5.b, v6.b}[5], [%x[in]], %x[stride]\n"
10400       "ld1 {v2.s}[1], [%x[in]], #4\n"
10401       "ld3 {v4.b, v5.b, v6.b}[6], [%x[in]], %x[stride]\n"
10402       "ld1 {v3.s}[1], [%x[in]], #4\n"
10403       "ld3 {v4.b, v5.b, v6.b}[7], [%x[in]], %x[stride]\n"
10404       "prfm pldl1keep, [%x[in]]\n"
            // Two-stage transpose (16-bit, then 8-bit) of the 4-byte pieces:
            // afterwards v0..v3 hold byte 0..3 of each of the 8 reads, in
            // read order. v7, v15, v16 and v17 are scratch.
10405       "trn1 v7.4h, v0.4h, v2.4h\n"
10406       "trn2 v16.4h, v0.4h, v2.4h\n"
10407       "trn1 v15.4h, v1.4h, v3.4h\n"
10408       "trn2 v17.4h, v1.4h, v3.4h\n"
10409       "trn1 v0.8b, v7.8b, v15.8b\n"
10410       "trn2 v1.8b, v7.8b, v15.8b\n"
10411       "trn1 v2.8b, v16.8b, v17.8b\n"
10412       "trn2 v3.8b, v16.8b, v17.8b\n"
            // Widen each byte vector and add it to its 16-bit accumulator.
10413       "uaddw v8.8h, v8.8h, v0.8b\n"
10414       "uaddw v9.8h, v9.8h, v1.8b\n"
10415       "uaddw v10.8h, v10.8h, v2.8b\n"
10416       "uaddw v11.8h, v11.8h, v3.8b\n"
10417       "uaddw v12.8h, v12.8h, v4.8b\n"
10418       "uaddw v13.8h, v13.8h, v5.8b\n"
10419       "uaddw v14.8h, v14.8h, v6.8b\n"
            // Emit the packed block: 32 + 24 = 56 bytes.
10420       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
10421       "st1 {v4.2s, v5.2s, v6.2s}, [%x[out]], #24\n"
10422 
10423       "bne 1b\n"
10424 
10425       "2:"
10426 
10427       // Load Aggregate Store - column major 7x3
            // Zero v0-v6 first so the lanes of the 5 missing reads contribute
            // 0 to both the stored block and the sums.
10428       "movi v0.8b, #0\n"
10429       "movi v1.8b, #0\n"
10430       "movi v2.8b, #0\n"
10431       "movi v3.8b, #0\n"
10432       "movi v4.8b, #0\n"
10433       "movi v5.8b, #0\n"
10434       "movi v6.8b, #0\n"
            // Only 3 reads are loaded; the transpose, accumulate and store
            // below are the same as in the main loop (a full 56-byte block
            // is still written).
10435       "ld1 {v0.s}[0], [%x[in]], #4\n"
10436       "ld3 {v4.b, v5.b, v6.b}[0], [%x[in]], %x[stride]\n"
10437       "ld1 {v1.s}[0], [%x[in]], #4\n"
10438       "ld3 {v4.b, v5.b, v6.b}[1], [%x[in]], %x[stride]\n"
10439       "ld1 {v2.s}[0], [%x[in]], #4\n"
10440       "ld3 {v4.b, v5.b, v6.b}[2], [%x[in]], %x[stride]\n"
10441       "prfm pldl1keep, [%x[in]]\n"
10442       "trn1 v7.4h, v0.4h, v2.4h\n"
10443       "trn2 v16.4h, v0.4h, v2.4h\n"
10444       "trn1 v15.4h, v1.4h, v3.4h\n"
10445       "trn2 v17.4h, v1.4h, v3.4h\n"
10446       "trn1 v0.8b, v7.8b, v15.8b\n"
10447       "trn2 v1.8b, v7.8b, v15.8b\n"
10448       "trn1 v2.8b, v16.8b, v17.8b\n"
10449       "trn2 v3.8b, v16.8b, v17.8b\n"
10450       "uaddw v8.8h, v8.8h, v0.8b\n"
10451       "uaddw v9.8h, v9.8h, v1.8b\n"
10452       "uaddw v10.8h, v10.8h, v2.8b\n"
10453       "uaddw v11.8h, v11.8h, v3.8b\n"
10454       "uaddw v12.8h, v12.8h, v4.8b\n"
10455       "uaddw v13.8h, v13.8h, v5.8b\n"
10456       "uaddw v14.8h, v14.8h, v6.8b\n"
10457       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
10458       "st1 {v4.2s, v5.2s, v6.2s}, [%x[out]], #24\n"
10459 
10460       // Aggregator Reduction.
            // v0.s[0] = multiplier, v1 = additive offset in all four lanes.
10461       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
10462       "dup v1.4s, %w[additive_sum_offset]\n"
            // Pairwise-widen the 16-bit lanes to 32 bits, then pairwise-add
            // down to one total per accumulator:
            //   v8 = {sum v8, sum v9, sum v10, sum v11}
            //   v9 = {sum v12, sum v13, sum v14, sum v14}
10463       "uaddlp v8.4s, v8.8h\n"
10464       "uaddlp v9.4s, v9.8h\n"
10465       "uaddlp v10.4s, v10.8h\n"
10466       "uaddlp v11.4s, v11.8h\n"
10467       "uaddlp v12.4s, v12.8h\n"
10468       "uaddlp v13.4s, v13.8h\n"
10469       "uaddlp v14.4s, v14.8h\n"
10470       "addp v8.4s, v8.4s, v9.4s\n"
10471       "addp v10.4s, v10.4s, v11.4s\n"
10472       "addp v12.4s, v12.4s, v13.4s\n"
10473       "addp v14.4s, v14.4s, v14.4s\n"
10474       "addp v8.4s, v8.4s, v10.4s\n"
10475       "addp v9.4s, v12.4s, v14.4s\n"
            // sum * multiplicative_sum_offset + additive_sum_offset, then
            // store the eight int32 results right after the packed data.
10476       "mul v8.4s, v8.4s, v0.s[0]\n"
10477       "mul v9.4s, v9.4s, v0.s[0]\n"
10478       "add v8.4s, v8.4s, v1.4s\n"
10479       "add v9.4s, v9.4s, v1.4s\n"
10480       "st1 {v8.4s, v9.4s}, [%x[out]]\n"
10481       : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
10482         [out] "+r"(out), [in] "+r"(in)
10483       : [additive_sum_offset] "r"(params.additive_sum_offset),
10484         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
10485       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
10486         "v11", "v12", "v13", "v14", "v15", "v16", "v17", "cc", "memory");
10487 }
10488 
// Stream<uint8_t, 7, 8, 4, ColumnMajorWithSum>::Pack (AArch64 NEON inline asm).
//
// Repeatedly reads 7 consecutive bytes from `in` and then advances `in` by
// params.stride. Every 8 such reads are transposed into seven 8-byte vectors
// (vector k holds byte k of each of the 8 reads) and written to `out` as one
// 56-byte block. After the full blocks, 4 further reads are packed into a
// final 56-byte block whose unused byte lanes are zero. Seven running sums,
// one per byte position, are kept while packing; finally eight int32 values
//   sum * params.multiplicative_sum_offset + params.additive_sum_offset
// are stored at `out`; the eighth value is computed from the seventh sum
// again (it fills the second 4-lane register).
//
// in:     source bytes (the local pointer is advanced by the asm).
// params: count, stride, multiplicative_sum_offset and additive_sum_offset
//         are read.
// out:    destination; receives the 56-byte blocks followed by 32 bytes of
//         int32 sums.
//
// NOTE(review): count is first reduced by 4 and the main loop then runs
// until the remainder reaches exactly zero, so this presumably requires
// params.count % 8 == 4 - confirm against the caller.
// NOTE(review): the running sums are accumulated in 16-bit lanes (uaddw into
// .8h registers); presumably callers bound count so they cannot wrap -
// confirm.
// NOTE(review): count and stride are `int` locals but are accessed through
// the 64-bit %x register view - confirm the upper 32 bits are well defined
// for the supported toolchains.
10489 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)10490 inline void Stream<uint8_t, 7, 8, 4, ColumnMajorWithSum>::Pack(
10491     const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
10492 #ifdef DEBUG
10493 #ifdef DEBUG_METAGEMM_VERBOSE
10494   std::cout
10495       << __FILE__ << "(" << __LINE__
10496       << ") ColumnMajorWithSum<uint8_t, 7, 8, 4, ColumnMajorWithSum>::Pack()"
10497       << std::endl
10498       << std::flush;
10499 #endif
10500 #endif
      // Local copies: the asm modifies both values ("+r" operands).
10501   int params_count_copy = params.count;
10502   int params_stride_copy = params.stride;
10503   asm volatile(
            // Each read post-increments `in` by 4 (ld1) and then by the
            // stride register (ld3); pre-subtracting 4 makes the net advance
            // per read equal to params.stride.
10504       "sub %x[stride], %x[stride], #4\n"
            // Clear the seven sum accumulators (8 x 16-bit lanes each).
10505       "movi v8.8h, #0\n"
10506       "movi v9.8h, #0\n"
10507       "movi v10.8h, #0\n"
10508       "movi v11.8h, #0\n"
10509       "movi v12.8h, #0\n"
10510       "movi v13.8h, #0\n"
10511       "movi v14.8h, #0\n"
10512 
10513       // Reduce count by leftovers.
            // If nothing but the leftover reads remain, skip the main loop.
10514       "subs %x[count], %x[count], #4\n"
10515       "beq 2f\n"
10516 
10517       "1:"
            // Flags set here are consumed by the "bne 1b" at the bottom of
            // the loop; nothing in between updates the condition flags.
10518       "subs %x[count], %x[count], #8\n"
10519 
10520       // Load Aggregate Store - column major 7x8
            // Per read: bytes 0-3 go into one 32-bit lane of v0-v3, bytes 4-6
            // are de-interleaved into byte lane i of v4, v5 and v6.
10521       "ld1 {v0.s}[0], [%x[in]], #4\n"
10522       "ld3 {v4.b, v5.b, v6.b}[0], [%x[in]], %x[stride]\n"
10523       "ld1 {v1.s}[0], [%x[in]], #4\n"
10524       "ld3 {v4.b, v5.b, v6.b}[1], [%x[in]], %x[stride]\n"
10525       "ld1 {v2.s}[0], [%x[in]], #4\n"
10526       "ld3 {v4.b, v5.b, v6.b}[2], [%x[in]], %x[stride]\n"
10527       "ld1 {v3.s}[0], [%x[in]], #4\n"
10528       "ld3 {v4.b, v5.b, v6.b}[3], [%x[in]], %x[stride]\n"
10529       "ld1 {v0.s}[1], [%x[in]], #4\n"
10530       "ld3 {v4.b, v5.b, v6.b}[4], [%x[in]], %x[stride]\n"
10531       "ld1 {v1.s}[1], [%x[in]], #4\n"
10532       "ld3 {v4.b, v5.b, v6.b}[5], [%x[in]], %x[stride]\n"
10533       "ld1 {v2.s}[1], [%x[in]], #4\n"
10534       "ld3 {v4.b, v5.b, v6.b}[6], [%x[in]], %x[stride]\n"
10535       "ld1 {v3.s}[1], [%x[in]], #4\n"
10536       "ld3 {v4.b, v5.b, v6.b}[7], [%x[in]], %x[stride]\n"
10537       "prfm pldl1keep, [%x[in]]\n"
            // Two-stage transpose (16-bit, then 8-bit) of the 4-byte pieces:
            // afterwards v0..v3 hold byte 0..3 of each of the 8 reads, in
            // read order. v7, v15, v16 and v17 are scratch.
10538       "trn1 v7.4h, v0.4h, v2.4h\n"
10539       "trn2 v16.4h, v0.4h, v2.4h\n"
10540       "trn1 v15.4h, v1.4h, v3.4h\n"
10541       "trn2 v17.4h, v1.4h, v3.4h\n"
10542       "trn1 v0.8b, v7.8b, v15.8b\n"
10543       "trn2 v1.8b, v7.8b, v15.8b\n"
10544       "trn1 v2.8b, v16.8b, v17.8b\n"
10545       "trn2 v3.8b, v16.8b, v17.8b\n"
            // Widen each byte vector and add it to its 16-bit accumulator.
10546       "uaddw v8.8h, v8.8h, v0.8b\n"
10547       "uaddw v9.8h, v9.8h, v1.8b\n"
10548       "uaddw v10.8h, v10.8h, v2.8b\n"
10549       "uaddw v11.8h, v11.8h, v3.8b\n"
10550       "uaddw v12.8h, v12.8h, v4.8b\n"
10551       "uaddw v13.8h, v13.8h, v5.8b\n"
10552       "uaddw v14.8h, v14.8h, v6.8b\n"
            // Emit the packed block: 32 + 24 = 56 bytes.
10553       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
10554       "st1 {v4.2s, v5.2s, v6.2s}, [%x[out]], #24\n"
10555 
10556       "bne 1b\n"
10557 
10558       "2:"
10559 
10560       // Load Aggregate Store - column major 7x4
            // Zero v0-v6 first so the lanes of the 4 missing reads contribute
            // 0 to both the stored block and the sums.
10561       "movi v0.8b, #0\n"
10562       "movi v1.8b, #0\n"
10563       "movi v2.8b, #0\n"
10564       "movi v3.8b, #0\n"
10565       "movi v4.8b, #0\n"
10566       "movi v5.8b, #0\n"
10567       "movi v6.8b, #0\n"
            // Only 4 reads are loaded; the transpose, accumulate and store
            // below are the same as in the main loop (a full 56-byte block
            // is still written).
10568       "ld1 {v0.s}[0], [%x[in]], #4\n"
10569       "ld3 {v4.b, v5.b, v6.b}[0], [%x[in]], %x[stride]\n"
10570       "ld1 {v1.s}[0], [%x[in]], #4\n"
10571       "ld3 {v4.b, v5.b, v6.b}[1], [%x[in]], %x[stride]\n"
10572       "ld1 {v2.s}[0], [%x[in]], #4\n"
10573       "ld3 {v4.b, v5.b, v6.b}[2], [%x[in]], %x[stride]\n"
10574       "ld1 {v3.s}[0], [%x[in]], #4\n"
10575       "ld3 {v4.b, v5.b, v6.b}[3], [%x[in]], %x[stride]\n"
10576       "prfm pldl1keep, [%x[in]]\n"
10577       "trn1 v7.4h, v0.4h, v2.4h\n"
10578       "trn2 v16.4h, v0.4h, v2.4h\n"
10579       "trn1 v15.4h, v1.4h, v3.4h\n"
10580       "trn2 v17.4h, v1.4h, v3.4h\n"
10581       "trn1 v0.8b, v7.8b, v15.8b\n"
10582       "trn2 v1.8b, v7.8b, v15.8b\n"
10583       "trn1 v2.8b, v16.8b, v17.8b\n"
10584       "trn2 v3.8b, v16.8b, v17.8b\n"
10585       "uaddw v8.8h, v8.8h, v0.8b\n"
10586       "uaddw v9.8h, v9.8h, v1.8b\n"
10587       "uaddw v10.8h, v10.8h, v2.8b\n"
10588       "uaddw v11.8h, v11.8h, v3.8b\n"
10589       "uaddw v12.8h, v12.8h, v4.8b\n"
10590       "uaddw v13.8h, v13.8h, v5.8b\n"
10591       "uaddw v14.8h, v14.8h, v6.8b\n"
10592       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
10593       "st1 {v4.2s, v5.2s, v6.2s}, [%x[out]], #24\n"
10594 
10595       // Aggregator Reduction.
            // v0.s[0] = multiplier, v1 = additive offset in all four lanes.
10596       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
10597       "dup v1.4s, %w[additive_sum_offset]\n"
            // Pairwise-widen the 16-bit lanes to 32 bits, then pairwise-add
            // down to one total per accumulator:
            //   v8 = {sum v8, sum v9, sum v10, sum v11}
            //   v9 = {sum v12, sum v13, sum v14, sum v14}
10598       "uaddlp v8.4s, v8.8h\n"
10599       "uaddlp v9.4s, v9.8h\n"
10600       "uaddlp v10.4s, v10.8h\n"
10601       "uaddlp v11.4s, v11.8h\n"
10602       "uaddlp v12.4s, v12.8h\n"
10603       "uaddlp v13.4s, v13.8h\n"
10604       "uaddlp v14.4s, v14.8h\n"
10605       "addp v8.4s, v8.4s, v9.4s\n"
10606       "addp v10.4s, v10.4s, v11.4s\n"
10607       "addp v12.4s, v12.4s, v13.4s\n"
10608       "addp v14.4s, v14.4s, v14.4s\n"
10609       "addp v8.4s, v8.4s, v10.4s\n"
10610       "addp v9.4s, v12.4s, v14.4s\n"
            // sum * multiplicative_sum_offset + additive_sum_offset, then
            // store the eight int32 results right after the packed data.
10611       "mul v8.4s, v8.4s, v0.s[0]\n"
10612       "mul v9.4s, v9.4s, v0.s[0]\n"
10613       "add v8.4s, v8.4s, v1.4s\n"
10614       "add v9.4s, v9.4s, v1.4s\n"
10615       "st1 {v8.4s, v9.4s}, [%x[out]]\n"
10616       : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
10617         [out] "+r"(out), [in] "+r"(in)
10618       : [additive_sum_offset] "r"(params.additive_sum_offset),
10619         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
10620       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
10621         "v11", "v12", "v13", "v14", "v15", "v16", "v17", "cc", "memory");
10622 }
10623 
10624 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)10625 inline void Stream<uint8_t, 7, 8, 5, ColumnMajorWithSum>::Pack(
10626     const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
10627 #ifdef DEBUG
10628 #ifdef DEBUG_METAGEMM_VERBOSE
10629   std::cout
10630       << __FILE__ << "(" << __LINE__
10631       << ") ColumnMajorWithSum<uint8_t, 7, 8, 5, ColumnMajorWithSum>::Pack()"
10632       << std::endl
10633       << std::flush;
10634 #endif
10635 #endif
10636   int params_count_copy = params.count;
10637   int params_stride_copy = params.stride;
10638   asm volatile(
10639       "sub %x[stride], %x[stride], #4\n"
10640       "movi v8.8h, #0\n"
10641       "movi v9.8h, #0\n"
10642       "movi v10.8h, #0\n"
10643       "movi v11.8h, #0\n"
10644       "movi v12.8h, #0\n"
10645       "movi v13.8h, #0\n"
10646       "movi v14.8h, #0\n"
10647 
10648       // Reduce count by leftovers.
10649       "subs %x[count], %x[count], #5\n"
10650       "beq 2f\n"
10651 
10652       "1:"
10653       "subs %x[count], %x[count], #8\n"
10654 
10655       // Load Aggregate Store - column major 7x8
10656       "ld1 {v0.s}[0], [%x[in]], #4\n"
10657       "ld3 {v4.b, v5.b, v6.b}[0], [%x[in]], %x[stride]\n"
10658       "ld1 {v1.s}[0], [%x[in]], #4\n"
10659       "ld3 {v4.b, v5.b, v6.b}[1], [%x[in]], %x[stride]\n"
10660       "ld1 {v2.s}[0], [%x[in]], #4\n"
10661       "ld3 {v4.b, v5.b, v6.b}[2], [%x[in]], %x[stride]\n"
10662       "ld1 {v3.s}[0], [%x[in]], #4\n"
10663       "ld3 {v4.b, v5.b, v6.b}[3], [%x[in]], %x[stride]\n"
10664       "ld1 {v0.s}[1], [%x[in]], #4\n"
10665       "ld3 {v4.b, v5.b, v6.b}[4], [%x[in]], %x[stride]\n"
10666       "ld1 {v1.s}[1], [%x[in]], #4\n"
10667       "ld3 {v4.b, v5.b, v6.b}[5], [%x[in]], %x[stride]\n"
10668       "ld1 {v2.s}[1], [%x[in]], #4\n"
10669       "ld3 {v4.b, v5.b, v6.b}[6], [%x[in]], %x[stride]\n"
10670       "ld1 {v3.s}[1], [%x[in]], #4\n"
10671       "ld3 {v4.b, v5.b, v6.b}[7], [%x[in]], %x[stride]\n"
10672       "prfm pldl1keep, [%x[in]]\n"
10673       "trn1 v7.4h, v0.4h, v2.4h\n"
10674       "trn2 v16.4h, v0.4h, v2.4h\n"
10675       "trn1 v15.4h, v1.4h, v3.4h\n"
10676       "trn2 v17.4h, v1.4h, v3.4h\n"
10677       "trn1 v0.8b, v7.8b, v15.8b\n"
10678       "trn2 v1.8b, v7.8b, v15.8b\n"
10679       "trn1 v2.8b, v16.8b, v17.8b\n"
10680       "trn2 v3.8b, v16.8b, v17.8b\n"
10681       "uaddw v8.8h, v8.8h, v0.8b\n"
10682       "uaddw v9.8h, v9.8h, v1.8b\n"
10683       "uaddw v10.8h, v10.8h, v2.8b\n"
10684       "uaddw v11.8h, v11.8h, v3.8b\n"
10685       "uaddw v12.8h, v12.8h, v4.8b\n"
10686       "uaddw v13.8h, v13.8h, v5.8b\n"
10687       "uaddw v14.8h, v14.8h, v6.8b\n"
10688       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
10689       "st1 {v4.2s, v5.2s, v6.2s}, [%x[out]], #24\n"
10690 
10691       "bne 1b\n"
10692 
10693       "2:"
10694 
10695       // Load Aggregate Store - column major 7x5
10696       "movi v0.8b, #0\n"
10697       "movi v1.8b, #0\n"
10698       "movi v2.8b, #0\n"
10699       "movi v3.8b, #0\n"
10700       "movi v4.8b, #0\n"
10701       "movi v5.8b, #0\n"
10702       "movi v6.8b, #0\n"
10703       "ld1 {v0.s}[0], [%x[in]], #4\n"
10704       "ld3 {v4.b, v5.b, v6.b}[0], [%x[in]], %x[stride]\n"
10705       "ld1 {v1.s}[0], [%x[in]], #4\n"
10706       "ld3 {v4.b, v5.b, v6.b}[1], [%x[in]], %x[stride]\n"
10707       "ld1 {v2.s}[0], [%x[in]], #4\n"
10708       "ld3 {v4.b, v5.b, v6.b}[2], [%x[in]], %x[stride]\n"
10709       "ld1 {v3.s}[0], [%x[in]], #4\n"
10710       "ld3 {v4.b, v5.b, v6.b}[3], [%x[in]], %x[stride]\n"
10711       "ld1 {v0.s}[1], [%x[in]], #4\n"
10712       "ld3 {v4.b, v5.b, v6.b}[4], [%x[in]], %x[stride]\n"
10713       "prfm pldl1keep, [%x[in]]\n"
10714       "trn1 v7.4h, v0.4h, v2.4h\n"
10715       "trn2 v16.4h, v0.4h, v2.4h\n"
10716       "trn1 v15.4h, v1.4h, v3.4h\n"
10717       "trn2 v17.4h, v1.4h, v3.4h\n"
10718       "trn1 v0.8b, v7.8b, v15.8b\n"
10719       "trn2 v1.8b, v7.8b, v15.8b\n"
10720       "trn1 v2.8b, v16.8b, v17.8b\n"
10721       "trn2 v3.8b, v16.8b, v17.8b\n"
10722       "uaddw v8.8h, v8.8h, v0.8b\n"
10723       "uaddw v9.8h, v9.8h, v1.8b\n"
10724       "uaddw v10.8h, v10.8h, v2.8b\n"
10725       "uaddw v11.8h, v11.8h, v3.8b\n"
10726       "uaddw v12.8h, v12.8h, v4.8b\n"
10727       "uaddw v13.8h, v13.8h, v5.8b\n"
10728       "uaddw v14.8h, v14.8h, v6.8b\n"
10729       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
10730       "st1 {v4.2s, v5.2s, v6.2s}, [%x[out]], #24\n"
10731 
10732       // Aggregator Reduction.
10733       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
10734       "dup v1.4s, %w[additive_sum_offset]\n"
10735       "uaddlp v8.4s, v8.8h\n"
10736       "uaddlp v9.4s, v9.8h\n"
10737       "uaddlp v10.4s, v10.8h\n"
10738       "uaddlp v11.4s, v11.8h\n"
10739       "uaddlp v12.4s, v12.8h\n"
10740       "uaddlp v13.4s, v13.8h\n"
10741       "uaddlp v14.4s, v14.8h\n"
10742       "addp v8.4s, v8.4s, v9.4s\n"
10743       "addp v10.4s, v10.4s, v11.4s\n"
10744       "addp v12.4s, v12.4s, v13.4s\n"
10745       "addp v14.4s, v14.4s, v14.4s\n"
10746       "addp v8.4s, v8.4s, v10.4s\n"
10747       "addp v9.4s, v12.4s, v14.4s\n"
10748       "mul v8.4s, v8.4s, v0.s[0]\n"
10749       "mul v9.4s, v9.4s, v0.s[0]\n"
10750       "add v8.4s, v8.4s, v1.4s\n"
10751       "add v9.4s, v9.4s, v1.4s\n"
10752       "st1 {v8.4s, v9.4s}, [%x[out]]\n"
10753       : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
10754         [out] "+r"(out), [in] "+r"(in)
10755       : [additive_sum_offset] "r"(params.additive_sum_offset),
10756         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
10757       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
10758         "v11", "v12", "v13", "v14", "v15", "v16", "v17", "cc", "memory");
10759 }
10760 
10761 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)10762 inline void Stream<uint8_t, 7, 8, 6, ColumnMajorWithSum>::Pack(
10763     const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
10764 #ifdef DEBUG
10765 #ifdef DEBUG_METAGEMM_VERBOSE
10766   std::cout
10767       << __FILE__ << "(" << __LINE__
10768       << ") ColumnMajorWithSum<uint8_t, 7, 8, 6, ColumnMajorWithSum>::Pack()"
10769       << std::endl
10770       << std::flush;
10771 #endif
10772 #endif
10773   int params_count_copy = params.count;
10774   int params_stride_copy = params.stride;
10775   asm volatile(
10776       "sub %x[stride], %x[stride], #4\n"
10777       "movi v8.8h, #0\n"
10778       "movi v9.8h, #0\n"
10779       "movi v10.8h, #0\n"
10780       "movi v11.8h, #0\n"
10781       "movi v12.8h, #0\n"
10782       "movi v13.8h, #0\n"
10783       "movi v14.8h, #0\n"
10784 
10785       // Reduce count by leftovers.
10786       "subs %x[count], %x[count], #6\n"
10787       "beq 2f\n"
10788 
10789       "1:"
10790       "subs %x[count], %x[count], #8\n"
10791 
10792       // Load Aggregate Store - column major 7x8
10793       "ld1 {v0.s}[0], [%x[in]], #4\n"
10794       "ld3 {v4.b, v5.b, v6.b}[0], [%x[in]], %x[stride]\n"
10795       "ld1 {v1.s}[0], [%x[in]], #4\n"
10796       "ld3 {v4.b, v5.b, v6.b}[1], [%x[in]], %x[stride]\n"
10797       "ld1 {v2.s}[0], [%x[in]], #4\n"
10798       "ld3 {v4.b, v5.b, v6.b}[2], [%x[in]], %x[stride]\n"
10799       "ld1 {v3.s}[0], [%x[in]], #4\n"
10800       "ld3 {v4.b, v5.b, v6.b}[3], [%x[in]], %x[stride]\n"
10801       "ld1 {v0.s}[1], [%x[in]], #4\n"
10802       "ld3 {v4.b, v5.b, v6.b}[4], [%x[in]], %x[stride]\n"
10803       "ld1 {v1.s}[1], [%x[in]], #4\n"
10804       "ld3 {v4.b, v5.b, v6.b}[5], [%x[in]], %x[stride]\n"
10805       "ld1 {v2.s}[1], [%x[in]], #4\n"
10806       "ld3 {v4.b, v5.b, v6.b}[6], [%x[in]], %x[stride]\n"
10807       "ld1 {v3.s}[1], [%x[in]], #4\n"
10808       "ld3 {v4.b, v5.b, v6.b}[7], [%x[in]], %x[stride]\n"
10809       "prfm pldl1keep, [%x[in]]\n"
10810       "trn1 v7.4h, v0.4h, v2.4h\n"
10811       "trn2 v16.4h, v0.4h, v2.4h\n"
10812       "trn1 v15.4h, v1.4h, v3.4h\n"
10813       "trn2 v17.4h, v1.4h, v3.4h\n"
10814       "trn1 v0.8b, v7.8b, v15.8b\n"
10815       "trn2 v1.8b, v7.8b, v15.8b\n"
10816       "trn1 v2.8b, v16.8b, v17.8b\n"
10817       "trn2 v3.8b, v16.8b, v17.8b\n"
10818       "uaddw v8.8h, v8.8h, v0.8b\n"
10819       "uaddw v9.8h, v9.8h, v1.8b\n"
10820       "uaddw v10.8h, v10.8h, v2.8b\n"
10821       "uaddw v11.8h, v11.8h, v3.8b\n"
10822       "uaddw v12.8h, v12.8h, v4.8b\n"
10823       "uaddw v13.8h, v13.8h, v5.8b\n"
10824       "uaddw v14.8h, v14.8h, v6.8b\n"
10825       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
10826       "st1 {v4.2s, v5.2s, v6.2s}, [%x[out]], #24\n"
10827 
10828       "bne 1b\n"
10829 
10830       "2:"
10831 
10832       // Load Aggregate Store - column major 7x6
10833       "movi v0.8b, #0\n"
10834       "movi v1.8b, #0\n"
10835       "movi v2.8b, #0\n"
10836       "movi v3.8b, #0\n"
10837       "movi v4.8b, #0\n"
10838       "movi v5.8b, #0\n"
10839       "movi v6.8b, #0\n"
10840       "ld1 {v0.s}[0], [%x[in]], #4\n"
10841       "ld3 {v4.b, v5.b, v6.b}[0], [%x[in]], %x[stride]\n"
10842       "ld1 {v1.s}[0], [%x[in]], #4\n"
10843       "ld3 {v4.b, v5.b, v6.b}[1], [%x[in]], %x[stride]\n"
10844       "ld1 {v2.s}[0], [%x[in]], #4\n"
10845       "ld3 {v4.b, v5.b, v6.b}[2], [%x[in]], %x[stride]\n"
10846       "ld1 {v3.s}[0], [%x[in]], #4\n"
10847       "ld3 {v4.b, v5.b, v6.b}[3], [%x[in]], %x[stride]\n"
10848       "ld1 {v0.s}[1], [%x[in]], #4\n"
10849       "ld3 {v4.b, v5.b, v6.b}[4], [%x[in]], %x[stride]\n"
10850       "ld1 {v1.s}[1], [%x[in]], #4\n"
10851       "ld3 {v4.b, v5.b, v6.b}[5], [%x[in]], %x[stride]\n"
10852       "prfm pldl1keep, [%x[in]]\n"
10853       "trn1 v7.4h, v0.4h, v2.4h\n"
10854       "trn2 v16.4h, v0.4h, v2.4h\n"
10855       "trn1 v15.4h, v1.4h, v3.4h\n"
10856       "trn2 v17.4h, v1.4h, v3.4h\n"
10857       "trn1 v0.8b, v7.8b, v15.8b\n"
10858       "trn2 v1.8b, v7.8b, v15.8b\n"
10859       "trn1 v2.8b, v16.8b, v17.8b\n"
10860       "trn2 v3.8b, v16.8b, v17.8b\n"
10861       "uaddw v8.8h, v8.8h, v0.8b\n"
10862       "uaddw v9.8h, v9.8h, v1.8b\n"
10863       "uaddw v10.8h, v10.8h, v2.8b\n"
10864       "uaddw v11.8h, v11.8h, v3.8b\n"
10865       "uaddw v12.8h, v12.8h, v4.8b\n"
10866       "uaddw v13.8h, v13.8h, v5.8b\n"
10867       "uaddw v14.8h, v14.8h, v6.8b\n"
10868       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
10869       "st1 {v4.2s, v5.2s, v6.2s}, [%x[out]], #24\n"
10870 
10871       // Aggregator Reduction.
10872       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
10873       "dup v1.4s, %w[additive_sum_offset]\n"
10874       "uaddlp v8.4s, v8.8h\n"
10875       "uaddlp v9.4s, v9.8h\n"
10876       "uaddlp v10.4s, v10.8h\n"
10877       "uaddlp v11.4s, v11.8h\n"
10878       "uaddlp v12.4s, v12.8h\n"
10879       "uaddlp v13.4s, v13.8h\n"
10880       "uaddlp v14.4s, v14.8h\n"
10881       "addp v8.4s, v8.4s, v9.4s\n"
10882       "addp v10.4s, v10.4s, v11.4s\n"
10883       "addp v12.4s, v12.4s, v13.4s\n"
10884       "addp v14.4s, v14.4s, v14.4s\n"
10885       "addp v8.4s, v8.4s, v10.4s\n"
10886       "addp v9.4s, v12.4s, v14.4s\n"
10887       "mul v8.4s, v8.4s, v0.s[0]\n"
10888       "mul v9.4s, v9.4s, v0.s[0]\n"
10889       "add v8.4s, v8.4s, v1.4s\n"
10890       "add v9.4s, v9.4s, v1.4s\n"
10891       "st1 {v8.4s, v9.4s}, [%x[out]]\n"
10892       : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
10893         [out] "+r"(out), [in] "+r"(in)
10894       : [additive_sum_offset] "r"(params.additive_sum_offset),
10895         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
10896       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
10897         "v11", "v12", "v13", "v14", "v15", "v16", "v17", "cc", "memory");
10898 }
10899 
10900 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)10901 inline void Stream<uint8_t, 7, 8, 7, ColumnMajorWithSum>::Pack(
10902     const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
10903 #ifdef DEBUG
10904 #ifdef DEBUG_METAGEMM_VERBOSE
10905   std::cout
10906       << __FILE__ << "(" << __LINE__
10907       << ") ColumnMajorWithSum<uint8_t, 7, 8, 7, ColumnMajorWithSum>::Pack()"
10908       << std::endl
10909       << std::flush;
10910 #endif
10911 #endif
10912   int params_count_copy = params.count;
10913   int params_stride_copy = params.stride;
10914   asm volatile(
10915       "sub %x[stride], %x[stride], #4\n"
10916       "movi v8.8h, #0\n"
10917       "movi v9.8h, #0\n"
10918       "movi v10.8h, #0\n"
10919       "movi v11.8h, #0\n"
10920       "movi v12.8h, #0\n"
10921       "movi v13.8h, #0\n"
10922       "movi v14.8h, #0\n"
10923 
10924       // Reduce count by leftovers.
10925       "subs %x[count], %x[count], #7\n"
10926       "beq 2f\n"
10927 
10928       "1:"
10929       "subs %x[count], %x[count], #8\n"
10930 
10931       // Load Aggregate Store - column major 7x8
10932       "ld1 {v0.s}[0], [%x[in]], #4\n"
10933       "ld3 {v4.b, v5.b, v6.b}[0], [%x[in]], %x[stride]\n"
10934       "ld1 {v1.s}[0], [%x[in]], #4\n"
10935       "ld3 {v4.b, v5.b, v6.b}[1], [%x[in]], %x[stride]\n"
10936       "ld1 {v2.s}[0], [%x[in]], #4\n"
10937       "ld3 {v4.b, v5.b, v6.b}[2], [%x[in]], %x[stride]\n"
10938       "ld1 {v3.s}[0], [%x[in]], #4\n"
10939       "ld3 {v4.b, v5.b, v6.b}[3], [%x[in]], %x[stride]\n"
10940       "ld1 {v0.s}[1], [%x[in]], #4\n"
10941       "ld3 {v4.b, v5.b, v6.b}[4], [%x[in]], %x[stride]\n"
10942       "ld1 {v1.s}[1], [%x[in]], #4\n"
10943       "ld3 {v4.b, v5.b, v6.b}[5], [%x[in]], %x[stride]\n"
10944       "ld1 {v2.s}[1], [%x[in]], #4\n"
10945       "ld3 {v4.b, v5.b, v6.b}[6], [%x[in]], %x[stride]\n"
10946       "ld1 {v3.s}[1], [%x[in]], #4\n"
10947       "ld3 {v4.b, v5.b, v6.b}[7], [%x[in]], %x[stride]\n"
10948       "prfm pldl1keep, [%x[in]]\n"
10949       "trn1 v7.4h, v0.4h, v2.4h\n"
10950       "trn2 v16.4h, v0.4h, v2.4h\n"
10951       "trn1 v15.4h, v1.4h, v3.4h\n"
10952       "trn2 v17.4h, v1.4h, v3.4h\n"
10953       "trn1 v0.8b, v7.8b, v15.8b\n"
10954       "trn2 v1.8b, v7.8b, v15.8b\n"
10955       "trn1 v2.8b, v16.8b, v17.8b\n"
10956       "trn2 v3.8b, v16.8b, v17.8b\n"
10957       "uaddw v8.8h, v8.8h, v0.8b\n"
10958       "uaddw v9.8h, v9.8h, v1.8b\n"
10959       "uaddw v10.8h, v10.8h, v2.8b\n"
10960       "uaddw v11.8h, v11.8h, v3.8b\n"
10961       "uaddw v12.8h, v12.8h, v4.8b\n"
10962       "uaddw v13.8h, v13.8h, v5.8b\n"
10963       "uaddw v14.8h, v14.8h, v6.8b\n"
10964       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
10965       "st1 {v4.2s, v5.2s, v6.2s}, [%x[out]], #24\n"
10966 
10967       "bne 1b\n"
10968 
10969       "2:"
10970 
10971       // Load Aggregate Store - column major 7x7
10972       "movi v0.8b, #0\n"
10973       "movi v1.8b, #0\n"
10974       "movi v2.8b, #0\n"
10975       "movi v3.8b, #0\n"
10976       "movi v4.8b, #0\n"
10977       "movi v5.8b, #0\n"
10978       "movi v6.8b, #0\n"
10979       "ld1 {v0.s}[0], [%x[in]], #4\n"
10980       "ld3 {v4.b, v5.b, v6.b}[0], [%x[in]], %x[stride]\n"
10981       "ld1 {v1.s}[0], [%x[in]], #4\n"
10982       "ld3 {v4.b, v5.b, v6.b}[1], [%x[in]], %x[stride]\n"
10983       "ld1 {v2.s}[0], [%x[in]], #4\n"
10984       "ld3 {v4.b, v5.b, v6.b}[2], [%x[in]], %x[stride]\n"
10985       "ld1 {v3.s}[0], [%x[in]], #4\n"
10986       "ld3 {v4.b, v5.b, v6.b}[3], [%x[in]], %x[stride]\n"
10987       "ld1 {v0.s}[1], [%x[in]], #4\n"
10988       "ld3 {v4.b, v5.b, v6.b}[4], [%x[in]], %x[stride]\n"
10989       "ld1 {v1.s}[1], [%x[in]], #4\n"
10990       "ld3 {v4.b, v5.b, v6.b}[5], [%x[in]], %x[stride]\n"
10991       "ld1 {v2.s}[1], [%x[in]], #4\n"
10992       "ld3 {v4.b, v5.b, v6.b}[6], [%x[in]], %x[stride]\n"
10993       "prfm pldl1keep, [%x[in]]\n"
10994       "trn1 v7.4h, v0.4h, v2.4h\n"
10995       "trn2 v16.4h, v0.4h, v2.4h\n"
10996       "trn1 v15.4h, v1.4h, v3.4h\n"
10997       "trn2 v17.4h, v1.4h, v3.4h\n"
10998       "trn1 v0.8b, v7.8b, v15.8b\n"
10999       "trn2 v1.8b, v7.8b, v15.8b\n"
11000       "trn1 v2.8b, v16.8b, v17.8b\n"
11001       "trn2 v3.8b, v16.8b, v17.8b\n"
11002       "uaddw v8.8h, v8.8h, v0.8b\n"
11003       "uaddw v9.8h, v9.8h, v1.8b\n"
11004       "uaddw v10.8h, v10.8h, v2.8b\n"
11005       "uaddw v11.8h, v11.8h, v3.8b\n"
11006       "uaddw v12.8h, v12.8h, v4.8b\n"
11007       "uaddw v13.8h, v13.8h, v5.8b\n"
11008       "uaddw v14.8h, v14.8h, v6.8b\n"
11009       "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
11010       "st1 {v4.2s, v5.2s, v6.2s}, [%x[out]], #24\n"
11011 
11012       // Aggregator Reduction.
11013       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
11014       "dup v1.4s, %w[additive_sum_offset]\n"
11015       "uaddlp v8.4s, v8.8h\n"
11016       "uaddlp v9.4s, v9.8h\n"
11017       "uaddlp v10.4s, v10.8h\n"
11018       "uaddlp v11.4s, v11.8h\n"
11019       "uaddlp v12.4s, v12.8h\n"
11020       "uaddlp v13.4s, v13.8h\n"
11021       "uaddlp v14.4s, v14.8h\n"
11022       "addp v8.4s, v8.4s, v9.4s\n"
11023       "addp v10.4s, v10.4s, v11.4s\n"
11024       "addp v12.4s, v12.4s, v13.4s\n"
11025       "addp v14.4s, v14.4s, v14.4s\n"
11026       "addp v8.4s, v8.4s, v10.4s\n"
11027       "addp v9.4s, v12.4s, v14.4s\n"
11028       "mul v8.4s, v8.4s, v0.s[0]\n"
11029       "mul v9.4s, v9.4s, v0.s[0]\n"
11030       "add v8.4s, v8.4s, v1.4s\n"
11031       "add v9.4s, v9.4s, v1.4s\n"
11032       "st1 {v8.4s, v9.4s}, [%x[out]]\n"
11033       : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
11034         [out] "+r"(out), [in] "+r"(in)
11035       : [additive_sum_offset] "r"(params.additive_sum_offset),
11036         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
11037       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
11038         "v11", "v12", "v13", "v14", "v15", "v16", "v17", "cc", "memory");
11039 }
11040 
11041 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)11042 inline void Stream<uint8_t, 8, 8, 0, ColumnMajorWithSum>::Pack(
11043     const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
11044 #ifdef DEBUG
11045 #ifdef DEBUG_METAGEMM_VERBOSE
11046   std::cout
11047       << __FILE__ << "(" << __LINE__
11048       << ") ColumnMajorWithSum<uint8_t, 8, 8, 0, ColumnMajorWithSum>::Pack()"
11049       << std::endl
11050       << std::flush;
11051 #endif
11052 #endif
11053   int params_count_copy = params.count;
11054   int params_stride_copy = params.stride;
11055   asm volatile(
11056       "movi v8.8h, #0\n"
11057       "movi v9.8h, #0\n"
11058       "movi v10.8h, #0\n"
11059       "movi v11.8h, #0\n"
11060       "movi v12.8h, #0\n"
11061       "movi v13.8h, #0\n"
11062       "movi v14.8h, #0\n"
11063       "movi v15.8h, #0\n"
11064 
11065       "1:"
11066       "subs %x[count], %x[count], #8\n"
11067 
11068       // Load Aggregate Store - column major 8x8
11069       "ld1 {v0.2s}, [%x[in]], %x[stride]\n"
11070       "ld1 {v1.2s}, [%x[in]], %x[stride]\n"
11071       "ld1 {v2.2s}, [%x[in]], %x[stride]\n"
11072       "ld1 {v3.2s}, [%x[in]], %x[stride]\n"
11073       "ld1 {v4.2s}, [%x[in]], %x[stride]\n"
11074       "ld1 {v5.2s}, [%x[in]], %x[stride]\n"
11075       "ld1 {v6.2s}, [%x[in]], %x[stride]\n"
11076       "ld1 {v7.2s}, [%x[in]], %x[stride]\n"
11077       "prfm pldl1keep, [%x[in]]\n"
11078       "trn1 v16.8b, v0.8b, v1.8b\n"
11079       "trn2 v17.8b, v0.8b, v1.8b\n"
11080       "trn1 v18.8b, v2.8b, v3.8b\n"
11081       "trn2 v19.8b, v2.8b, v3.8b\n"
11082       "trn1 v20.8b, v4.8b, v5.8b\n"
11083       "trn2 v21.8b, v4.8b, v5.8b\n"
11084       "trn1 v22.8b, v6.8b, v7.8b\n"
11085       "trn2 v23.8b, v6.8b, v7.8b\n"
11086       "trn1 v0.4h, v16.4h, v18.4h\n"
11087       "trn2 v2.4h, v16.4h, v18.4h\n"
11088       "trn1 v1.4h, v17.4h, v19.4h\n"
11089       "trn2 v3.4h, v17.4h, v19.4h\n"
11090       "trn1 v4.4h, v20.4h, v22.4h\n"
11091       "trn2 v6.4h, v20.4h, v22.4h\n"
11092       "trn1 v5.4h, v21.4h, v23.4h\n"
11093       "trn2 v7.4h, v21.4h, v23.4h\n"
11094       "trn1 v16.2s, v0.2s, v4.2s\n"
11095       "trn2 v20.2s, v0.2s, v4.2s\n"
11096       "trn1 v17.2s, v1.2s, v5.2s\n"
11097       "trn2 v21.2s, v1.2s, v5.2s\n"
11098       "trn1 v18.2s, v2.2s, v6.2s\n"
11099       "trn2 v22.2s, v2.2s, v6.2s\n"
11100       "trn1 v19.2s, v3.2s, v7.2s\n"
11101       "trn2 v23.2s, v3.2s, v7.2s\n"
11102       "uaddw v8.8h, v8.8h, v16.8b\n"
11103       "uaddw v9.8h, v9.8h, v17.8b\n"
11104       "uaddw v10.8h, v10.8h, v18.8b\n"
11105       "uaddw v11.8h, v11.8h, v19.8b\n"
11106       "uaddw v12.8h, v12.8h, v20.8b\n"
11107       "uaddw v13.8h, v13.8h, v21.8b\n"
11108       "uaddw v14.8h, v14.8h, v22.8b\n"
11109       "uaddw v15.8h, v15.8h, v23.8b\n"
11110       "st1 {v16.2s, v17.2s, v18.2s, v19.2s}, [%x[out]], #32\n"
11111       "st1 {v20.2s, v21.2s, v22.2s, v23.2s}, [%x[out]], #32\n"
11112 
11113       "bne 1b\n"
11114 
11115       // Aggregator Reduction.
11116       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
11117       "dup v1.4s, %w[additive_sum_offset]\n"
11118       "uaddlp v8.4s, v8.8h\n"
11119       "uaddlp v9.4s, v9.8h\n"
11120       "uaddlp v10.4s, v10.8h\n"
11121       "uaddlp v11.4s, v11.8h\n"
11122       "uaddlp v12.4s, v12.8h\n"
11123       "uaddlp v13.4s, v13.8h\n"
11124       "uaddlp v14.4s, v14.8h\n"
11125       "uaddlp v15.4s, v15.8h\n"
11126       "addp v8.4s, v8.4s, v9.4s\n"
11127       "addp v10.4s, v10.4s, v11.4s\n"
11128       "addp v12.4s, v12.4s, v13.4s\n"
11129       "addp v14.4s, v14.4s, v15.4s\n"
11130       "addp v8.4s, v8.4s, v10.4s\n"
11131       "addp v9.4s, v12.4s, v14.4s\n"
11132       "mul v8.4s, v8.4s, v0.s[0]\n"
11133       "mul v9.4s, v9.4s, v0.s[0]\n"
11134       "add v8.4s, v8.4s, v1.4s\n"
11135       "add v9.4s, v9.4s, v1.4s\n"
11136       "st1 {v8.4s, v9.4s}, [%x[out]]\n"
11137       : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
11138         [out] "+r"(out), [in] "+r"(in)
11139       : [additive_sum_offset] "r"(params.additive_sum_offset),
11140         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
11141       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
11142         "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20",
11143         "v21", "v22", "v23", "cc", "memory");
11144 }
11145 
11146 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)11147 inline void Stream<uint8_t, 8, 8, 1, ColumnMajorWithSum>::Pack(
11148     const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
11149 #ifdef DEBUG
11150 #ifdef DEBUG_METAGEMM_VERBOSE
11151   std::cout
11152       << __FILE__ << "(" << __LINE__
11153       << ") ColumnMajorWithSum<uint8_t, 8, 8, 1, ColumnMajorWithSum>::Pack()"
11154       << std::endl
11155       << std::flush;
11156 #endif
11157 #endif
11158   int params_count_copy = params.count;
11159   int params_stride_copy = params.stride;
11160   asm volatile(
11161       "movi v8.8h, #0\n"
11162       "movi v9.8h, #0\n"
11163       "movi v10.8h, #0\n"
11164       "movi v11.8h, #0\n"
11165       "movi v12.8h, #0\n"
11166       "movi v13.8h, #0\n"
11167       "movi v14.8h, #0\n"
11168       "movi v15.8h, #0\n"
11169 
11170       // Reduce count by leftovers.
11171       "subs %x[count], %x[count], #1\n"
11172       "beq 2f\n"
11173 
11174       "1:"
11175       "subs %x[count], %x[count], #8\n"
11176 
11177       // Load Aggregate Store - column major 8x8
11178       "ld1 {v0.2s}, [%x[in]], %x[stride]\n"
11179       "ld1 {v1.2s}, [%x[in]], %x[stride]\n"
11180       "ld1 {v2.2s}, [%x[in]], %x[stride]\n"
11181       "ld1 {v3.2s}, [%x[in]], %x[stride]\n"
11182       "ld1 {v4.2s}, [%x[in]], %x[stride]\n"
11183       "ld1 {v5.2s}, [%x[in]], %x[stride]\n"
11184       "ld1 {v6.2s}, [%x[in]], %x[stride]\n"
11185       "ld1 {v7.2s}, [%x[in]], %x[stride]\n"
11186       "prfm pldl1keep, [%x[in]]\n"
11187       "trn1 v16.8b, v0.8b, v1.8b\n"
11188       "trn2 v17.8b, v0.8b, v1.8b\n"
11189       "trn1 v18.8b, v2.8b, v3.8b\n"
11190       "trn2 v19.8b, v2.8b, v3.8b\n"
11191       "trn1 v20.8b, v4.8b, v5.8b\n"
11192       "trn2 v21.8b, v4.8b, v5.8b\n"
11193       "trn1 v22.8b, v6.8b, v7.8b\n"
11194       "trn2 v23.8b, v6.8b, v7.8b\n"
11195       "trn1 v0.4h, v16.4h, v18.4h\n"
11196       "trn2 v2.4h, v16.4h, v18.4h\n"
11197       "trn1 v1.4h, v17.4h, v19.4h\n"
11198       "trn2 v3.4h, v17.4h, v19.4h\n"
11199       "trn1 v4.4h, v20.4h, v22.4h\n"
11200       "trn2 v6.4h, v20.4h, v22.4h\n"
11201       "trn1 v5.4h, v21.4h, v23.4h\n"
11202       "trn2 v7.4h, v21.4h, v23.4h\n"
11203       "trn1 v16.2s, v0.2s, v4.2s\n"
11204       "trn2 v20.2s, v0.2s, v4.2s\n"
11205       "trn1 v17.2s, v1.2s, v5.2s\n"
11206       "trn2 v21.2s, v1.2s, v5.2s\n"
11207       "trn1 v18.2s, v2.2s, v6.2s\n"
11208       "trn2 v22.2s, v2.2s, v6.2s\n"
11209       "trn1 v19.2s, v3.2s, v7.2s\n"
11210       "trn2 v23.2s, v3.2s, v7.2s\n"
11211       "uaddw v8.8h, v8.8h, v16.8b\n"
11212       "uaddw v9.8h, v9.8h, v17.8b\n"
11213       "uaddw v10.8h, v10.8h, v18.8b\n"
11214       "uaddw v11.8h, v11.8h, v19.8b\n"
11215       "uaddw v12.8h, v12.8h, v20.8b\n"
11216       "uaddw v13.8h, v13.8h, v21.8b\n"
11217       "uaddw v14.8h, v14.8h, v22.8b\n"
11218       "uaddw v15.8h, v15.8h, v23.8b\n"
11219       "st1 {v16.2s, v17.2s, v18.2s, v19.2s}, [%x[out]], #32\n"
11220       "st1 {v20.2s, v21.2s, v22.2s, v23.2s}, [%x[out]], #32\n"
11221 
11222       "bne 1b\n"
11223 
11224       "2:"
11225 
11226       // Load Aggregate Store - column major 8x1
11227       "movi v0.8b, #0\n"
11228       "movi v1.8b, #0\n"
11229       "movi v2.8b, #0\n"
11230       "movi v3.8b, #0\n"
11231       "movi v4.8b, #0\n"
11232       "movi v5.8b, #0\n"
11233       "movi v6.8b, #0\n"
11234       "movi v7.8b, #0\n"
11235       "ld1 {v0.2s}, [%x[in]], %x[stride]\n"
11236       "prfm pldl1keep, [%x[in]]\n"
11237       "trn1 v16.8b, v0.8b, v1.8b\n"
11238       "trn2 v17.8b, v0.8b, v1.8b\n"
11239       "trn1 v18.8b, v2.8b, v3.8b\n"
11240       "trn2 v19.8b, v2.8b, v3.8b\n"
11241       "trn1 v20.8b, v4.8b, v5.8b\n"
11242       "trn2 v21.8b, v4.8b, v5.8b\n"
11243       "trn1 v22.8b, v6.8b, v7.8b\n"
11244       "trn2 v23.8b, v6.8b, v7.8b\n"
11245       "trn1 v0.4h, v16.4h, v18.4h\n"
11246       "trn2 v2.4h, v16.4h, v18.4h\n"
11247       "trn1 v1.4h, v17.4h, v19.4h\n"
11248       "trn2 v3.4h, v17.4h, v19.4h\n"
11249       "trn1 v4.4h, v20.4h, v22.4h\n"
11250       "trn2 v6.4h, v20.4h, v22.4h\n"
11251       "trn1 v5.4h, v21.4h, v23.4h\n"
11252       "trn2 v7.4h, v21.4h, v23.4h\n"
11253       "trn1 v16.2s, v0.2s, v4.2s\n"
11254       "trn2 v20.2s, v0.2s, v4.2s\n"
11255       "trn1 v17.2s, v1.2s, v5.2s\n"
11256       "trn2 v21.2s, v1.2s, v5.2s\n"
11257       "trn1 v18.2s, v2.2s, v6.2s\n"
11258       "trn2 v22.2s, v2.2s, v6.2s\n"
11259       "trn1 v19.2s, v3.2s, v7.2s\n"
11260       "trn2 v23.2s, v3.2s, v7.2s\n"
11261       "uaddw v8.8h, v8.8h, v16.8b\n"
11262       "uaddw v9.8h, v9.8h, v17.8b\n"
11263       "uaddw v10.8h, v10.8h, v18.8b\n"
11264       "uaddw v11.8h, v11.8h, v19.8b\n"
11265       "uaddw v12.8h, v12.8h, v20.8b\n"
11266       "uaddw v13.8h, v13.8h, v21.8b\n"
11267       "uaddw v14.8h, v14.8h, v22.8b\n"
11268       "uaddw v15.8h, v15.8h, v23.8b\n"
11269       "st1 {v16.2s, v17.2s, v18.2s, v19.2s}, [%x[out]], #32\n"
11270       "st1 {v20.2s, v21.2s, v22.2s, v23.2s}, [%x[out]], #32\n"
11271 
11272       // Aggregator Reduction.
11273       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
11274       "dup v1.4s, %w[additive_sum_offset]\n"
11275       "uaddlp v8.4s, v8.8h\n"
11276       "uaddlp v9.4s, v9.8h\n"
11277       "uaddlp v10.4s, v10.8h\n"
11278       "uaddlp v11.4s, v11.8h\n"
11279       "uaddlp v12.4s, v12.8h\n"
11280       "uaddlp v13.4s, v13.8h\n"
11281       "uaddlp v14.4s, v14.8h\n"
11282       "uaddlp v15.4s, v15.8h\n"
11283       "addp v8.4s, v8.4s, v9.4s\n"
11284       "addp v10.4s, v10.4s, v11.4s\n"
11285       "addp v12.4s, v12.4s, v13.4s\n"
11286       "addp v14.4s, v14.4s, v15.4s\n"
11287       "addp v8.4s, v8.4s, v10.4s\n"
11288       "addp v9.4s, v12.4s, v14.4s\n"
11289       "mul v8.4s, v8.4s, v0.s[0]\n"
11290       "mul v9.4s, v9.4s, v0.s[0]\n"
11291       "add v8.4s, v8.4s, v1.4s\n"
11292       "add v9.4s, v9.4s, v1.4s\n"
11293       "st1 {v8.4s, v9.4s}, [%x[out]]\n"
11294       : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
11295         [out] "+r"(out), [in] "+r"(in)
11296       : [additive_sum_offset] "r"(params.additive_sum_offset),
11297         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
11298       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
11299         "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20",
11300         "v21", "v22", "v23", "cc", "memory");
11301 }
11302 
11303 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)11304 inline void Stream<uint8_t, 8, 8, 2, ColumnMajorWithSum>::Pack(
11305     const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
11306 #ifdef DEBUG
11307 #ifdef DEBUG_METAGEMM_VERBOSE
11308   std::cout
11309       << __FILE__ << "(" << __LINE__
11310       << ") ColumnMajorWithSum<uint8_t, 8, 8, 2, ColumnMajorWithSum>::Pack()"
11311       << std::endl
11312       << std::flush;
11313 #endif
11314 #endif
11315   int params_count_copy = params.count;
11316   int params_stride_copy = params.stride;
11317   asm volatile(
11318       "movi v8.8h, #0\n"
11319       "movi v9.8h, #0\n"
11320       "movi v10.8h, #0\n"
11321       "movi v11.8h, #0\n"
11322       "movi v12.8h, #0\n"
11323       "movi v13.8h, #0\n"
11324       "movi v14.8h, #0\n"
11325       "movi v15.8h, #0\n"
11326 
11327       // Reduce count by leftovers.
11328       "subs %x[count], %x[count], #2\n"
11329       "beq 2f\n"
11330 
11331       "1:"
11332       "subs %x[count], %x[count], #8\n"
11333 
11334       // Load Aggregate Store - column major 8x8
11335       "ld1 {v0.2s}, [%x[in]], %x[stride]\n"
11336       "ld1 {v1.2s}, [%x[in]], %x[stride]\n"
11337       "ld1 {v2.2s}, [%x[in]], %x[stride]\n"
11338       "ld1 {v3.2s}, [%x[in]], %x[stride]\n"
11339       "ld1 {v4.2s}, [%x[in]], %x[stride]\n"
11340       "ld1 {v5.2s}, [%x[in]], %x[stride]\n"
11341       "ld1 {v6.2s}, [%x[in]], %x[stride]\n"
11342       "ld1 {v7.2s}, [%x[in]], %x[stride]\n"
11343       "prfm pldl1keep, [%x[in]]\n"
11344       "trn1 v16.8b, v0.8b, v1.8b\n"
11345       "trn2 v17.8b, v0.8b, v1.8b\n"
11346       "trn1 v18.8b, v2.8b, v3.8b\n"
11347       "trn2 v19.8b, v2.8b, v3.8b\n"
11348       "trn1 v20.8b, v4.8b, v5.8b\n"
11349       "trn2 v21.8b, v4.8b, v5.8b\n"
11350       "trn1 v22.8b, v6.8b, v7.8b\n"
11351       "trn2 v23.8b, v6.8b, v7.8b\n"
11352       "trn1 v0.4h, v16.4h, v18.4h\n"
11353       "trn2 v2.4h, v16.4h, v18.4h\n"
11354       "trn1 v1.4h, v17.4h, v19.4h\n"
11355       "trn2 v3.4h, v17.4h, v19.4h\n"
11356       "trn1 v4.4h, v20.4h, v22.4h\n"
11357       "trn2 v6.4h, v20.4h, v22.4h\n"
11358       "trn1 v5.4h, v21.4h, v23.4h\n"
11359       "trn2 v7.4h, v21.4h, v23.4h\n"
11360       "trn1 v16.2s, v0.2s, v4.2s\n"
11361       "trn2 v20.2s, v0.2s, v4.2s\n"
11362       "trn1 v17.2s, v1.2s, v5.2s\n"
11363       "trn2 v21.2s, v1.2s, v5.2s\n"
11364       "trn1 v18.2s, v2.2s, v6.2s\n"
11365       "trn2 v22.2s, v2.2s, v6.2s\n"
11366       "trn1 v19.2s, v3.2s, v7.2s\n"
11367       "trn2 v23.2s, v3.2s, v7.2s\n"
11368       "uaddw v8.8h, v8.8h, v16.8b\n"
11369       "uaddw v9.8h, v9.8h, v17.8b\n"
11370       "uaddw v10.8h, v10.8h, v18.8b\n"
11371       "uaddw v11.8h, v11.8h, v19.8b\n"
11372       "uaddw v12.8h, v12.8h, v20.8b\n"
11373       "uaddw v13.8h, v13.8h, v21.8b\n"
11374       "uaddw v14.8h, v14.8h, v22.8b\n"
11375       "uaddw v15.8h, v15.8h, v23.8b\n"
11376       "st1 {v16.2s, v17.2s, v18.2s, v19.2s}, [%x[out]], #32\n"
11377       "st1 {v20.2s, v21.2s, v22.2s, v23.2s}, [%x[out]], #32\n"
11378 
11379       "bne 1b\n"
11380 
11381       "2:"
11382 
11383       // Load Aggregate Store - column major 8x2
11384       "movi v0.8b, #0\n"
11385       "movi v1.8b, #0\n"
11386       "movi v2.8b, #0\n"
11387       "movi v3.8b, #0\n"
11388       "movi v4.8b, #0\n"
11389       "movi v5.8b, #0\n"
11390       "movi v6.8b, #0\n"
11391       "movi v7.8b, #0\n"
11392       "ld1 {v0.2s}, [%x[in]], %x[stride]\n"
11393       "ld1 {v1.2s}, [%x[in]], %x[stride]\n"
11394       "prfm pldl1keep, [%x[in]]\n"
11395       "trn1 v16.8b, v0.8b, v1.8b\n"
11396       "trn2 v17.8b, v0.8b, v1.8b\n"
11397       "trn1 v18.8b, v2.8b, v3.8b\n"
11398       "trn2 v19.8b, v2.8b, v3.8b\n"
11399       "trn1 v20.8b, v4.8b, v5.8b\n"
11400       "trn2 v21.8b, v4.8b, v5.8b\n"
11401       "trn1 v22.8b, v6.8b, v7.8b\n"
11402       "trn2 v23.8b, v6.8b, v7.8b\n"
11403       "trn1 v0.4h, v16.4h, v18.4h\n"
11404       "trn2 v2.4h, v16.4h, v18.4h\n"
11405       "trn1 v1.4h, v17.4h, v19.4h\n"
11406       "trn2 v3.4h, v17.4h, v19.4h\n"
11407       "trn1 v4.4h, v20.4h, v22.4h\n"
11408       "trn2 v6.4h, v20.4h, v22.4h\n"
11409       "trn1 v5.4h, v21.4h, v23.4h\n"
11410       "trn2 v7.4h, v21.4h, v23.4h\n"
11411       "trn1 v16.2s, v0.2s, v4.2s\n"
11412       "trn2 v20.2s, v0.2s, v4.2s\n"
11413       "trn1 v17.2s, v1.2s, v5.2s\n"
11414       "trn2 v21.2s, v1.2s, v5.2s\n"
11415       "trn1 v18.2s, v2.2s, v6.2s\n"
11416       "trn2 v22.2s, v2.2s, v6.2s\n"
11417       "trn1 v19.2s, v3.2s, v7.2s\n"
11418       "trn2 v23.2s, v3.2s, v7.2s\n"
11419       "uaddw v8.8h, v8.8h, v16.8b\n"
11420       "uaddw v9.8h, v9.8h, v17.8b\n"
11421       "uaddw v10.8h, v10.8h, v18.8b\n"
11422       "uaddw v11.8h, v11.8h, v19.8b\n"
11423       "uaddw v12.8h, v12.8h, v20.8b\n"
11424       "uaddw v13.8h, v13.8h, v21.8b\n"
11425       "uaddw v14.8h, v14.8h, v22.8b\n"
11426       "uaddw v15.8h, v15.8h, v23.8b\n"
11427       "st1 {v16.2s, v17.2s, v18.2s, v19.2s}, [%x[out]], #32\n"
11428       "st1 {v20.2s, v21.2s, v22.2s, v23.2s}, [%x[out]], #32\n"
11429 
11430       // Aggregator Reduction.
11431       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
11432       "dup v1.4s, %w[additive_sum_offset]\n"
11433       "uaddlp v8.4s, v8.8h\n"
11434       "uaddlp v9.4s, v9.8h\n"
11435       "uaddlp v10.4s, v10.8h\n"
11436       "uaddlp v11.4s, v11.8h\n"
11437       "uaddlp v12.4s, v12.8h\n"
11438       "uaddlp v13.4s, v13.8h\n"
11439       "uaddlp v14.4s, v14.8h\n"
11440       "uaddlp v15.4s, v15.8h\n"
11441       "addp v8.4s, v8.4s, v9.4s\n"
11442       "addp v10.4s, v10.4s, v11.4s\n"
11443       "addp v12.4s, v12.4s, v13.4s\n"
11444       "addp v14.4s, v14.4s, v15.4s\n"
11445       "addp v8.4s, v8.4s, v10.4s\n"
11446       "addp v9.4s, v12.4s, v14.4s\n"
11447       "mul v8.4s, v8.4s, v0.s[0]\n"
11448       "mul v9.4s, v9.4s, v0.s[0]\n"
11449       "add v8.4s, v8.4s, v1.4s\n"
11450       "add v9.4s, v9.4s, v1.4s\n"
11451       "st1 {v8.4s, v9.4s}, [%x[out]]\n"
11452       : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
11453         [out] "+r"(out), [in] "+r"(in)
11454       : [additive_sum_offset] "r"(params.additive_sum_offset),
11455         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
11456       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
11457         "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20",
11458         "v21", "v22", "v23", "cc", "memory");
11459 }
11460 
// Stream<uint8_t, 8, 8, 3, ColumnMajorWithSum>::Pack
//
// Reads 8-byte vectors from `in` (stepping `in` by params.stride bytes after
// every load), transposes them in groups of eight into 8x8 byte tiles, and
// writes each tile as 64 contiguous bytes to `out`. The last 3 vectors are
// padded with zero vectors to form one final tile. Behind the tiles it stores
// eight 32-bit values: for each byte position i (0..7) of the loaded vectors,
//   sum_i * params.multiplicative_sum_offset + params.additive_sum_offset
// where sum_i adds up byte i of every vector that was loaded.
//
//   in     - source bytes; 8 bytes are read at each stride step.
//   params - supplies count, stride and the two sum offsets used above.
//   out    - receives 64 bytes per tile (main-loop tiles plus the final padded
//            tile) followed by 32 bytes of sums.
//
// NOTE(review): the main loop only exits when the counter hits exactly zero,
// so this appears to require count >= 3 with (count - 3) divisible by 8 —
// confirm against the callers.
// NOTE(review): count and stride are `int` copies but are used through 64-bit
// (%x) register views; confirm the upper 32 bits are well defined here.
// NOTE(review): sums are accumulated in 16-bit lanes before being widened, so
// a large enough count could wrap them — confirm the expected bound on count.
11461 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)11462 inline void Stream<uint8_t, 8, 8, 3, ColumnMajorWithSum>::Pack(
11463     const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
11464 #ifdef DEBUG
11465 #ifdef DEBUG_METAGEMM_VERBOSE
11466   std::cout
11467       << __FILE__ << "(" << __LINE__
11468       << ") ColumnMajorWithSum<uint8_t, 8, 8, 3, ColumnMajorWithSum>::Pack()"
11469       << std::endl
11470       << std::flush;
11471 #endif
11472 #endif
  // Mutable local copies: both are bound to the asm block as read-write ("+r")
  // operands, and the counter is decremented in place.
11473   int params_count_copy = params.count;
11474   int params_stride_copy = params.stride;
11475   asm volatile(
      // v8-v15 are zeroed here and used below as 16-bit-lane running sums.
11476       "movi v8.8h, #0\n"
11477       "movi v9.8h, #0\n"
11478       "movi v10.8h, #0\n"
11479       "movi v11.8h, #0\n"
11480       "movi v12.8h, #0\n"
11481       "movi v13.8h, #0\n"
11482       "movi v14.8h, #0\n"
11483       "movi v15.8h, #0\n"
11484 
11485       // Reduce count by leftovers.
11486       "subs %x[count], %x[count], #3\n"
      // If only the leftover vectors remain, skip the main loop entirely.
11487       "beq 2f\n"
11488 
11489       "1:"
      // The flags produced by this subs are the ones tested by the "bne 1b"
      // at the bottom of the loop.
11490       "subs %x[count], %x[count], #8\n"
11491 
11492       // Load Aggregate Store - column major 8x8
      // Eight 8-byte loads; each one post-increments `in` by stride.
11493       "ld1 {v0.2s}, [%x[in]], %x[stride]\n"
11494       "ld1 {v1.2s}, [%x[in]], %x[stride]\n"
11495       "ld1 {v2.2s}, [%x[in]], %x[stride]\n"
11496       "ld1 {v3.2s}, [%x[in]], %x[stride]\n"
11497       "ld1 {v4.2s}, [%x[in]], %x[stride]\n"
11498       "ld1 {v5.2s}, [%x[in]], %x[stride]\n"
11499       "ld1 {v6.2s}, [%x[in]], %x[stride]\n"
11500       "ld1 {v7.2s}, [%x[in]], %x[stride]\n"
11501       "prfm pldl1keep, [%x[in]]\n"
      // Three trn1/trn2 rounds (bytes, halfwords, words) transpose the 8x8
      // byte tile: afterwards v16+i holds byte i of each loaded vector.
11502       "trn1 v16.8b, v0.8b, v1.8b\n"
11503       "trn2 v17.8b, v0.8b, v1.8b\n"
11504       "trn1 v18.8b, v2.8b, v3.8b\n"
11505       "trn2 v19.8b, v2.8b, v3.8b\n"
11506       "trn1 v20.8b, v4.8b, v5.8b\n"
11507       "trn2 v21.8b, v4.8b, v5.8b\n"
11508       "trn1 v22.8b, v6.8b, v7.8b\n"
11509       "trn2 v23.8b, v6.8b, v7.8b\n"
11510       "trn1 v0.4h, v16.4h, v18.4h\n"
11511       "trn2 v2.4h, v16.4h, v18.4h\n"
11512       "trn1 v1.4h, v17.4h, v19.4h\n"
11513       "trn2 v3.4h, v17.4h, v19.4h\n"
11514       "trn1 v4.4h, v20.4h, v22.4h\n"
11515       "trn2 v6.4h, v20.4h, v22.4h\n"
11516       "trn1 v5.4h, v21.4h, v23.4h\n"
11517       "trn2 v7.4h, v21.4h, v23.4h\n"
11518       "trn1 v16.2s, v0.2s, v4.2s\n"
11519       "trn2 v20.2s, v0.2s, v4.2s\n"
11520       "trn1 v17.2s, v1.2s, v5.2s\n"
11521       "trn2 v21.2s, v1.2s, v5.2s\n"
11522       "trn1 v18.2s, v2.2s, v6.2s\n"
11523       "trn2 v22.2s, v2.2s, v6.2s\n"
11524       "trn1 v19.2s, v3.2s, v7.2s\n"
11525       "trn2 v23.2s, v3.2s, v7.2s\n"
      // Widen each byte to 16 bits and add it into the matching accumulator.
11526       "uaddw v8.8h, v8.8h, v16.8b\n"
11527       "uaddw v9.8h, v9.8h, v17.8b\n"
11528       "uaddw v10.8h, v10.8h, v18.8b\n"
11529       "uaddw v11.8h, v11.8h, v19.8b\n"
11530       "uaddw v12.8h, v12.8h, v20.8b\n"
11531       "uaddw v13.8h, v13.8h, v21.8b\n"
11532       "uaddw v14.8h, v14.8h, v22.8b\n"
11533       "uaddw v15.8h, v15.8h, v23.8b\n"
      // Write the transposed tile: 2 x 32 bytes, advancing out.
11534       "st1 {v16.2s, v17.2s, v18.2s, v19.2s}, [%x[out]], #32\n"
11535       "st1 {v20.2s, v21.2s, v22.2s, v23.2s}, [%x[out]], #32\n"
11536 
11537       "bne 1b\n"
11538 
11539       "2:"
11540 
11541       // Load Aggregate Store - column major 8x3
      // Clear v0-v7 first so the registers not loaded below act as zero
      // padding in both the stored tile and the sums.
11542       "movi v0.8b, #0\n"
11543       "movi v1.8b, #0\n"
11544       "movi v2.8b, #0\n"
11545       "movi v3.8b, #0\n"
11546       "movi v4.8b, #0\n"
11547       "movi v5.8b, #0\n"
11548       "movi v6.8b, #0\n"
11549       "movi v7.8b, #0\n"
11550       "ld1 {v0.2s}, [%x[in]], %x[stride]\n"
11551       "ld1 {v1.2s}, [%x[in]], %x[stride]\n"
11552       "ld1 {v2.2s}, [%x[in]], %x[stride]\n"
11553       "prfm pldl1keep, [%x[in]]\n"
      // Same transpose / accumulate / store sequence as in the main loop.
11554       "trn1 v16.8b, v0.8b, v1.8b\n"
11555       "trn2 v17.8b, v0.8b, v1.8b\n"
11556       "trn1 v18.8b, v2.8b, v3.8b\n"
11557       "trn2 v19.8b, v2.8b, v3.8b\n"
11558       "trn1 v20.8b, v4.8b, v5.8b\n"
11559       "trn2 v21.8b, v4.8b, v5.8b\n"
11560       "trn1 v22.8b, v6.8b, v7.8b\n"
11561       "trn2 v23.8b, v6.8b, v7.8b\n"
11562       "trn1 v0.4h, v16.4h, v18.4h\n"
11563       "trn2 v2.4h, v16.4h, v18.4h\n"
11564       "trn1 v1.4h, v17.4h, v19.4h\n"
11565       "trn2 v3.4h, v17.4h, v19.4h\n"
11566       "trn1 v4.4h, v20.4h, v22.4h\n"
11567       "trn2 v6.4h, v20.4h, v22.4h\n"
11568       "trn1 v5.4h, v21.4h, v23.4h\n"
11569       "trn2 v7.4h, v21.4h, v23.4h\n"
11570       "trn1 v16.2s, v0.2s, v4.2s\n"
11571       "trn2 v20.2s, v0.2s, v4.2s\n"
11572       "trn1 v17.2s, v1.2s, v5.2s\n"
11573       "trn2 v21.2s, v1.2s, v5.2s\n"
11574       "trn1 v18.2s, v2.2s, v6.2s\n"
11575       "trn2 v22.2s, v2.2s, v6.2s\n"
11576       "trn1 v19.2s, v3.2s, v7.2s\n"
11577       "trn2 v23.2s, v3.2s, v7.2s\n"
11578       "uaddw v8.8h, v8.8h, v16.8b\n"
11579       "uaddw v9.8h, v9.8h, v17.8b\n"
11580       "uaddw v10.8h, v10.8h, v18.8b\n"
11581       "uaddw v11.8h, v11.8h, v19.8b\n"
11582       "uaddw v12.8h, v12.8h, v20.8b\n"
11583       "uaddw v13.8h, v13.8h, v21.8b\n"
11584       "uaddw v14.8h, v14.8h, v22.8b\n"
11585       "uaddw v15.8h, v15.8h, v23.8b\n"
11586       "st1 {v16.2s, v17.2s, v18.2s, v19.2s}, [%x[out]], #32\n"
11587       "st1 {v20.2s, v21.2s, v22.2s, v23.2s}, [%x[out]], #32\n"
11588 
11589       // Aggregator Reduction.
11590       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
11591       "dup v1.4s, %w[additive_sum_offset]\n"
      // uaddlp widens to 32-bit lanes; the addp steps then fold every
      // accumulator to one total: v8 = totals of v8-v11, v9 = totals of
      // v12-v15.
11592       "uaddlp v8.4s, v8.8h\n"
11593       "uaddlp v9.4s, v9.8h\n"
11594       "uaddlp v10.4s, v10.8h\n"
11595       "uaddlp v11.4s, v11.8h\n"
11596       "uaddlp v12.4s, v12.8h\n"
11597       "uaddlp v13.4s, v13.8h\n"
11598       "uaddlp v14.4s, v14.8h\n"
11599       "uaddlp v15.4s, v15.8h\n"
11600       "addp v8.4s, v8.4s, v9.4s\n"
11601       "addp v10.4s, v10.4s, v11.4s\n"
11602       "addp v12.4s, v12.4s, v13.4s\n"
11603       "addp v14.4s, v14.4s, v15.4s\n"
11604       "addp v8.4s, v8.4s, v10.4s\n"
11605       "addp v9.4s, v12.4s, v14.4s\n"
      // total * multiplicative_sum_offset + additive_sum_offset, per lane.
11606       "mul v8.4s, v8.4s, v0.s[0]\n"
11607       "mul v9.4s, v9.4s, v0.s[0]\n"
11608       "add v8.4s, v8.4s, v1.4s\n"
11609       "add v9.4s, v9.4s, v1.4s\n"
      // Eight 32-bit results go right after the last tile; out is not
      // advanced by this store.
11610       "st1 {v8.4s, v9.4s}, [%x[out]]\n"
11611       : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
11612         [out] "+r"(out), [in] "+r"(in)
11613       : [additive_sum_offset] "r"(params.additive_sum_offset),
11614         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
11615       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
11616         "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20",
11617         "v21", "v22", "v23", "cc", "memory");
11618 }
11619 
// Stream<uint8_t, 8, 8, 4, ColumnMajorWithSum>::Pack
//
// Reads 8-byte vectors from `in` (stepping `in` by params.stride bytes after
// every load), transposes them in groups of eight into 8x8 byte tiles, and
// writes each tile as 64 contiguous bytes to `out`. The last 4 vectors are
// padded with zero vectors to form one final tile. Behind the tiles it stores
// eight 32-bit values: for each byte position i (0..7) of the loaded vectors,
//   sum_i * params.multiplicative_sum_offset + params.additive_sum_offset
// where sum_i adds up byte i of every vector that was loaded.
//
//   in     - source bytes; 8 bytes are read at each stride step.
//   params - supplies count, stride and the two sum offsets used above.
//   out    - receives 64 bytes per tile (main-loop tiles plus the final padded
//            tile) followed by 32 bytes of sums.
//
// NOTE(review): the main loop only exits when the counter hits exactly zero,
// so this appears to require count >= 4 with (count - 4) divisible by 8 —
// confirm against the callers.
// NOTE(review): count and stride are `int` copies but are used through 64-bit
// (%x) register views; confirm the upper 32 bits are well defined here.
// NOTE(review): sums are accumulated in 16-bit lanes before being widened, so
// a large enough count could wrap them — confirm the expected bound on count.
11620 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)11621 inline void Stream<uint8_t, 8, 8, 4, ColumnMajorWithSum>::Pack(
11622     const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
11623 #ifdef DEBUG
11624 #ifdef DEBUG_METAGEMM_VERBOSE
11625   std::cout
11626       << __FILE__ << "(" << __LINE__
11627       << ") ColumnMajorWithSum<uint8_t, 8, 8, 4, ColumnMajorWithSum>::Pack()"
11628       << std::endl
11629       << std::flush;
11630 #endif
11631 #endif
  // Mutable local copies: both are bound to the asm block as read-write ("+r")
  // operands, and the counter is decremented in place.
11632   int params_count_copy = params.count;
11633   int params_stride_copy = params.stride;
11634   asm volatile(
      // v8-v15 are zeroed here and used below as 16-bit-lane running sums.
11635       "movi v8.8h, #0\n"
11636       "movi v9.8h, #0\n"
11637       "movi v10.8h, #0\n"
11638       "movi v11.8h, #0\n"
11639       "movi v12.8h, #0\n"
11640       "movi v13.8h, #0\n"
11641       "movi v14.8h, #0\n"
11642       "movi v15.8h, #0\n"
11643 
11644       // Reduce count by leftovers.
11645       "subs %x[count], %x[count], #4\n"
      // If only the leftover vectors remain, skip the main loop entirely.
11646       "beq 2f\n"
11647 
11648       "1:"
      // The flags produced by this subs are the ones tested by the "bne 1b"
      // at the bottom of the loop.
11649       "subs %x[count], %x[count], #8\n"
11650 
11651       // Load Aggregate Store - column major 8x8
      // Eight 8-byte loads; each one post-increments `in` by stride.
11652       "ld1 {v0.2s}, [%x[in]], %x[stride]\n"
11653       "ld1 {v1.2s}, [%x[in]], %x[stride]\n"
11654       "ld1 {v2.2s}, [%x[in]], %x[stride]\n"
11655       "ld1 {v3.2s}, [%x[in]], %x[stride]\n"
11656       "ld1 {v4.2s}, [%x[in]], %x[stride]\n"
11657       "ld1 {v5.2s}, [%x[in]], %x[stride]\n"
11658       "ld1 {v6.2s}, [%x[in]], %x[stride]\n"
11659       "ld1 {v7.2s}, [%x[in]], %x[stride]\n"
11660       "prfm pldl1keep, [%x[in]]\n"
      // Three trn1/trn2 rounds (bytes, halfwords, words) transpose the 8x8
      // byte tile: afterwards v16+i holds byte i of each loaded vector.
11661       "trn1 v16.8b, v0.8b, v1.8b\n"
11662       "trn2 v17.8b, v0.8b, v1.8b\n"
11663       "trn1 v18.8b, v2.8b, v3.8b\n"
11664       "trn2 v19.8b, v2.8b, v3.8b\n"
11665       "trn1 v20.8b, v4.8b, v5.8b\n"
11666       "trn2 v21.8b, v4.8b, v5.8b\n"
11667       "trn1 v22.8b, v6.8b, v7.8b\n"
11668       "trn2 v23.8b, v6.8b, v7.8b\n"
11669       "trn1 v0.4h, v16.4h, v18.4h\n"
11670       "trn2 v2.4h, v16.4h, v18.4h\n"
11671       "trn1 v1.4h, v17.4h, v19.4h\n"
11672       "trn2 v3.4h, v17.4h, v19.4h\n"
11673       "trn1 v4.4h, v20.4h, v22.4h\n"
11674       "trn2 v6.4h, v20.4h, v22.4h\n"
11675       "trn1 v5.4h, v21.4h, v23.4h\n"
11676       "trn2 v7.4h, v21.4h, v23.4h\n"
11677       "trn1 v16.2s, v0.2s, v4.2s\n"
11678       "trn2 v20.2s, v0.2s, v4.2s\n"
11679       "trn1 v17.2s, v1.2s, v5.2s\n"
11680       "trn2 v21.2s, v1.2s, v5.2s\n"
11681       "trn1 v18.2s, v2.2s, v6.2s\n"
11682       "trn2 v22.2s, v2.2s, v6.2s\n"
11683       "trn1 v19.2s, v3.2s, v7.2s\n"
11684       "trn2 v23.2s, v3.2s, v7.2s\n"
      // Widen each byte to 16 bits and add it into the matching accumulator.
11685       "uaddw v8.8h, v8.8h, v16.8b\n"
11686       "uaddw v9.8h, v9.8h, v17.8b\n"
11687       "uaddw v10.8h, v10.8h, v18.8b\n"
11688       "uaddw v11.8h, v11.8h, v19.8b\n"
11689       "uaddw v12.8h, v12.8h, v20.8b\n"
11690       "uaddw v13.8h, v13.8h, v21.8b\n"
11691       "uaddw v14.8h, v14.8h, v22.8b\n"
11692       "uaddw v15.8h, v15.8h, v23.8b\n"
      // Write the transposed tile: 2 x 32 bytes, advancing out.
11693       "st1 {v16.2s, v17.2s, v18.2s, v19.2s}, [%x[out]], #32\n"
11694       "st1 {v20.2s, v21.2s, v22.2s, v23.2s}, [%x[out]], #32\n"
11695 
11696       "bne 1b\n"
11697 
11698       "2:"
11699 
11700       // Load Aggregate Store - column major 8x4
      // Clear v0-v7 first so the registers not loaded below act as zero
      // padding in both the stored tile and the sums.
11701       "movi v0.8b, #0\n"
11702       "movi v1.8b, #0\n"
11703       "movi v2.8b, #0\n"
11704       "movi v3.8b, #0\n"
11705       "movi v4.8b, #0\n"
11706       "movi v5.8b, #0\n"
11707       "movi v6.8b, #0\n"
11708       "movi v7.8b, #0\n"
11709       "ld1 {v0.2s}, [%x[in]], %x[stride]\n"
11710       "ld1 {v1.2s}, [%x[in]], %x[stride]\n"
11711       "ld1 {v2.2s}, [%x[in]], %x[stride]\n"
11712       "ld1 {v3.2s}, [%x[in]], %x[stride]\n"
11713       "prfm pldl1keep, [%x[in]]\n"
      // Same transpose / accumulate / store sequence as in the main loop.
11714       "trn1 v16.8b, v0.8b, v1.8b\n"
11715       "trn2 v17.8b, v0.8b, v1.8b\n"
11716       "trn1 v18.8b, v2.8b, v3.8b\n"
11717       "trn2 v19.8b, v2.8b, v3.8b\n"
11718       "trn1 v20.8b, v4.8b, v5.8b\n"
11719       "trn2 v21.8b, v4.8b, v5.8b\n"
11720       "trn1 v22.8b, v6.8b, v7.8b\n"
11721       "trn2 v23.8b, v6.8b, v7.8b\n"
11722       "trn1 v0.4h, v16.4h, v18.4h\n"
11723       "trn2 v2.4h, v16.4h, v18.4h\n"
11724       "trn1 v1.4h, v17.4h, v19.4h\n"
11725       "trn2 v3.4h, v17.4h, v19.4h\n"
11726       "trn1 v4.4h, v20.4h, v22.4h\n"
11727       "trn2 v6.4h, v20.4h, v22.4h\n"
11728       "trn1 v5.4h, v21.4h, v23.4h\n"
11729       "trn2 v7.4h, v21.4h, v23.4h\n"
11730       "trn1 v16.2s, v0.2s, v4.2s\n"
11731       "trn2 v20.2s, v0.2s, v4.2s\n"
11732       "trn1 v17.2s, v1.2s, v5.2s\n"
11733       "trn2 v21.2s, v1.2s, v5.2s\n"
11734       "trn1 v18.2s, v2.2s, v6.2s\n"
11735       "trn2 v22.2s, v2.2s, v6.2s\n"
11736       "trn1 v19.2s, v3.2s, v7.2s\n"
11737       "trn2 v23.2s, v3.2s, v7.2s\n"
11738       "uaddw v8.8h, v8.8h, v16.8b\n"
11739       "uaddw v9.8h, v9.8h, v17.8b\n"
11740       "uaddw v10.8h, v10.8h, v18.8b\n"
11741       "uaddw v11.8h, v11.8h, v19.8b\n"
11742       "uaddw v12.8h, v12.8h, v20.8b\n"
11743       "uaddw v13.8h, v13.8h, v21.8b\n"
11744       "uaddw v14.8h, v14.8h, v22.8b\n"
11745       "uaddw v15.8h, v15.8h, v23.8b\n"
11746       "st1 {v16.2s, v17.2s, v18.2s, v19.2s}, [%x[out]], #32\n"
11747       "st1 {v20.2s, v21.2s, v22.2s, v23.2s}, [%x[out]], #32\n"
11748 
11749       // Aggregator Reduction.
11750       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
11751       "dup v1.4s, %w[additive_sum_offset]\n"
      // uaddlp widens to 32-bit lanes; the addp steps then fold every
      // accumulator to one total: v8 = totals of v8-v11, v9 = totals of
      // v12-v15.
11752       "uaddlp v8.4s, v8.8h\n"
11753       "uaddlp v9.4s, v9.8h\n"
11754       "uaddlp v10.4s, v10.8h\n"
11755       "uaddlp v11.4s, v11.8h\n"
11756       "uaddlp v12.4s, v12.8h\n"
11757       "uaddlp v13.4s, v13.8h\n"
11758       "uaddlp v14.4s, v14.8h\n"
11759       "uaddlp v15.4s, v15.8h\n"
11760       "addp v8.4s, v8.4s, v9.4s\n"
11761       "addp v10.4s, v10.4s, v11.4s\n"
11762       "addp v12.4s, v12.4s, v13.4s\n"
11763       "addp v14.4s, v14.4s, v15.4s\n"
11764       "addp v8.4s, v8.4s, v10.4s\n"
11765       "addp v9.4s, v12.4s, v14.4s\n"
      // total * multiplicative_sum_offset + additive_sum_offset, per lane.
11766       "mul v8.4s, v8.4s, v0.s[0]\n"
11767       "mul v9.4s, v9.4s, v0.s[0]\n"
11768       "add v8.4s, v8.4s, v1.4s\n"
11769       "add v9.4s, v9.4s, v1.4s\n"
      // Eight 32-bit results go right after the last tile; out is not
      // advanced by this store.
11770       "st1 {v8.4s, v9.4s}, [%x[out]]\n"
11771       : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
11772         [out] "+r"(out), [in] "+r"(in)
11773       : [additive_sum_offset] "r"(params.additive_sum_offset),
11774         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
11775       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
11776         "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20",
11777         "v21", "v22", "v23", "cc", "memory");
11778 }
11779 
// Stream<uint8_t, 8, 8, 5, ColumnMajorWithSum>::Pack
//
// Reads 8-byte vectors from `in` (stepping `in` by params.stride bytes after
// every load), transposes them in groups of eight into 8x8 byte tiles, and
// writes each tile as 64 contiguous bytes to `out`. The last 5 vectors are
// padded with zero vectors to form one final tile. Behind the tiles it stores
// eight 32-bit values: for each byte position i (0..7) of the loaded vectors,
//   sum_i * params.multiplicative_sum_offset + params.additive_sum_offset
// where sum_i adds up byte i of every vector that was loaded.
//
//   in     - source bytes; 8 bytes are read at each stride step.
//   params - supplies count, stride and the two sum offsets used above.
//   out    - receives 64 bytes per tile (main-loop tiles plus the final padded
//            tile) followed by 32 bytes of sums.
//
// NOTE(review): the main loop only exits when the counter hits exactly zero,
// so this appears to require count >= 5 with (count - 5) divisible by 8 —
// confirm against the callers.
// NOTE(review): count and stride are `int` copies but are used through 64-bit
// (%x) register views; confirm the upper 32 bits are well defined here.
// NOTE(review): sums are accumulated in 16-bit lanes before being widened, so
// a large enough count could wrap them — confirm the expected bound on count.
11780 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)11781 inline void Stream<uint8_t, 8, 8, 5, ColumnMajorWithSum>::Pack(
11782     const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
11783 #ifdef DEBUG
11784 #ifdef DEBUG_METAGEMM_VERBOSE
11785   std::cout
11786       << __FILE__ << "(" << __LINE__
11787       << ") ColumnMajorWithSum<uint8_t, 8, 8, 5, ColumnMajorWithSum>::Pack()"
11788       << std::endl
11789       << std::flush;
11790 #endif
11791 #endif
  // Mutable local copies: both are bound to the asm block as read-write ("+r")
  // operands, and the counter is decremented in place.
11792   int params_count_copy = params.count;
11793   int params_stride_copy = params.stride;
11794   asm volatile(
      // v8-v15 are zeroed here and used below as 16-bit-lane running sums.
11795       "movi v8.8h, #0\n"
11796       "movi v9.8h, #0\n"
11797       "movi v10.8h, #0\n"
11798       "movi v11.8h, #0\n"
11799       "movi v12.8h, #0\n"
11800       "movi v13.8h, #0\n"
11801       "movi v14.8h, #0\n"
11802       "movi v15.8h, #0\n"
11803 
11804       // Reduce count by leftovers.
11805       "subs %x[count], %x[count], #5\n"
      // If only the leftover vectors remain, skip the main loop entirely.
11806       "beq 2f\n"
11807 
11808       "1:"
      // The flags produced by this subs are the ones tested by the "bne 1b"
      // at the bottom of the loop.
11809       "subs %x[count], %x[count], #8\n"
11810 
11811       // Load Aggregate Store - column major 8x8
      // Eight 8-byte loads; each one post-increments `in` by stride.
11812       "ld1 {v0.2s}, [%x[in]], %x[stride]\n"
11813       "ld1 {v1.2s}, [%x[in]], %x[stride]\n"
11814       "ld1 {v2.2s}, [%x[in]], %x[stride]\n"
11815       "ld1 {v3.2s}, [%x[in]], %x[stride]\n"
11816       "ld1 {v4.2s}, [%x[in]], %x[stride]\n"
11817       "ld1 {v5.2s}, [%x[in]], %x[stride]\n"
11818       "ld1 {v6.2s}, [%x[in]], %x[stride]\n"
11819       "ld1 {v7.2s}, [%x[in]], %x[stride]\n"
11820       "prfm pldl1keep, [%x[in]]\n"
      // Three trn1/trn2 rounds (bytes, halfwords, words) transpose the 8x8
      // byte tile: afterwards v16+i holds byte i of each loaded vector.
11821       "trn1 v16.8b, v0.8b, v1.8b\n"
11822       "trn2 v17.8b, v0.8b, v1.8b\n"
11823       "trn1 v18.8b, v2.8b, v3.8b\n"
11824       "trn2 v19.8b, v2.8b, v3.8b\n"
11825       "trn1 v20.8b, v4.8b, v5.8b\n"
11826       "trn2 v21.8b, v4.8b, v5.8b\n"
11827       "trn1 v22.8b, v6.8b, v7.8b\n"
11828       "trn2 v23.8b, v6.8b, v7.8b\n"
11829       "trn1 v0.4h, v16.4h, v18.4h\n"
11830       "trn2 v2.4h, v16.4h, v18.4h\n"
11831       "trn1 v1.4h, v17.4h, v19.4h\n"
11832       "trn2 v3.4h, v17.4h, v19.4h\n"
11833       "trn1 v4.4h, v20.4h, v22.4h\n"
11834       "trn2 v6.4h, v20.4h, v22.4h\n"
11835       "trn1 v5.4h, v21.4h, v23.4h\n"
11836       "trn2 v7.4h, v21.4h, v23.4h\n"
11837       "trn1 v16.2s, v0.2s, v4.2s\n"
11838       "trn2 v20.2s, v0.2s, v4.2s\n"
11839       "trn1 v17.2s, v1.2s, v5.2s\n"
11840       "trn2 v21.2s, v1.2s, v5.2s\n"
11841       "trn1 v18.2s, v2.2s, v6.2s\n"
11842       "trn2 v22.2s, v2.2s, v6.2s\n"
11843       "trn1 v19.2s, v3.2s, v7.2s\n"
11844       "trn2 v23.2s, v3.2s, v7.2s\n"
      // Widen each byte to 16 bits and add it into the matching accumulator.
11845       "uaddw v8.8h, v8.8h, v16.8b\n"
11846       "uaddw v9.8h, v9.8h, v17.8b\n"
11847       "uaddw v10.8h, v10.8h, v18.8b\n"
11848       "uaddw v11.8h, v11.8h, v19.8b\n"
11849       "uaddw v12.8h, v12.8h, v20.8b\n"
11850       "uaddw v13.8h, v13.8h, v21.8b\n"
11851       "uaddw v14.8h, v14.8h, v22.8b\n"
11852       "uaddw v15.8h, v15.8h, v23.8b\n"
      // Write the transposed tile: 2 x 32 bytes, advancing out.
11853       "st1 {v16.2s, v17.2s, v18.2s, v19.2s}, [%x[out]], #32\n"
11854       "st1 {v20.2s, v21.2s, v22.2s, v23.2s}, [%x[out]], #32\n"
11855 
11856       "bne 1b\n"
11857 
11858       "2:"
11859 
11860       // Load Aggregate Store - column major 8x5
      // Clear v0-v7 first so the registers not loaded below act as zero
      // padding in both the stored tile and the sums.
11861       "movi v0.8b, #0\n"
11862       "movi v1.8b, #0\n"
11863       "movi v2.8b, #0\n"
11864       "movi v3.8b, #0\n"
11865       "movi v4.8b, #0\n"
11866       "movi v5.8b, #0\n"
11867       "movi v6.8b, #0\n"
11868       "movi v7.8b, #0\n"
11869       "ld1 {v0.2s}, [%x[in]], %x[stride]\n"
11870       "ld1 {v1.2s}, [%x[in]], %x[stride]\n"
11871       "ld1 {v2.2s}, [%x[in]], %x[stride]\n"
11872       "ld1 {v3.2s}, [%x[in]], %x[stride]\n"
11873       "ld1 {v4.2s}, [%x[in]], %x[stride]\n"
11874       "prfm pldl1keep, [%x[in]]\n"
      // Same transpose / accumulate / store sequence as in the main loop.
11875       "trn1 v16.8b, v0.8b, v1.8b\n"
11876       "trn2 v17.8b, v0.8b, v1.8b\n"
11877       "trn1 v18.8b, v2.8b, v3.8b\n"
11878       "trn2 v19.8b, v2.8b, v3.8b\n"
11879       "trn1 v20.8b, v4.8b, v5.8b\n"
11880       "trn2 v21.8b, v4.8b, v5.8b\n"
11881       "trn1 v22.8b, v6.8b, v7.8b\n"
11882       "trn2 v23.8b, v6.8b, v7.8b\n"
11883       "trn1 v0.4h, v16.4h, v18.4h\n"
11884       "trn2 v2.4h, v16.4h, v18.4h\n"
11885       "trn1 v1.4h, v17.4h, v19.4h\n"
11886       "trn2 v3.4h, v17.4h, v19.4h\n"
11887       "trn1 v4.4h, v20.4h, v22.4h\n"
11888       "trn2 v6.4h, v20.4h, v22.4h\n"
11889       "trn1 v5.4h, v21.4h, v23.4h\n"
11890       "trn2 v7.4h, v21.4h, v23.4h\n"
11891       "trn1 v16.2s, v0.2s, v4.2s\n"
11892       "trn2 v20.2s, v0.2s, v4.2s\n"
11893       "trn1 v17.2s, v1.2s, v5.2s\n"
11894       "trn2 v21.2s, v1.2s, v5.2s\n"
11895       "trn1 v18.2s, v2.2s, v6.2s\n"
11896       "trn2 v22.2s, v2.2s, v6.2s\n"
11897       "trn1 v19.2s, v3.2s, v7.2s\n"
11898       "trn2 v23.2s, v3.2s, v7.2s\n"
11899       "uaddw v8.8h, v8.8h, v16.8b\n"
11900       "uaddw v9.8h, v9.8h, v17.8b\n"
11901       "uaddw v10.8h, v10.8h, v18.8b\n"
11902       "uaddw v11.8h, v11.8h, v19.8b\n"
11903       "uaddw v12.8h, v12.8h, v20.8b\n"
11904       "uaddw v13.8h, v13.8h, v21.8b\n"
11905       "uaddw v14.8h, v14.8h, v22.8b\n"
11906       "uaddw v15.8h, v15.8h, v23.8b\n"
11907       "st1 {v16.2s, v17.2s, v18.2s, v19.2s}, [%x[out]], #32\n"
11908       "st1 {v20.2s, v21.2s, v22.2s, v23.2s}, [%x[out]], #32\n"
11909 
11910       // Aggregator Reduction.
11911       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
11912       "dup v1.4s, %w[additive_sum_offset]\n"
      // uaddlp widens to 32-bit lanes; the addp steps then fold every
      // accumulator to one total: v8 = totals of v8-v11, v9 = totals of
      // v12-v15.
11913       "uaddlp v8.4s, v8.8h\n"
11914       "uaddlp v9.4s, v9.8h\n"
11915       "uaddlp v10.4s, v10.8h\n"
11916       "uaddlp v11.4s, v11.8h\n"
11917       "uaddlp v12.4s, v12.8h\n"
11918       "uaddlp v13.4s, v13.8h\n"
11919       "uaddlp v14.4s, v14.8h\n"
11920       "uaddlp v15.4s, v15.8h\n"
11921       "addp v8.4s, v8.4s, v9.4s\n"
11922       "addp v10.4s, v10.4s, v11.4s\n"
11923       "addp v12.4s, v12.4s, v13.4s\n"
11924       "addp v14.4s, v14.4s, v15.4s\n"
11925       "addp v8.4s, v8.4s, v10.4s\n"
11926       "addp v9.4s, v12.4s, v14.4s\n"
      // total * multiplicative_sum_offset + additive_sum_offset, per lane.
11927       "mul v8.4s, v8.4s, v0.s[0]\n"
11928       "mul v9.4s, v9.4s, v0.s[0]\n"
11929       "add v8.4s, v8.4s, v1.4s\n"
11930       "add v9.4s, v9.4s, v1.4s\n"
      // Eight 32-bit results go right after the last tile; out is not
      // advanced by this store.
11931       "st1 {v8.4s, v9.4s}, [%x[out]]\n"
11932       : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
11933         [out] "+r"(out), [in] "+r"(in)
11934       : [additive_sum_offset] "r"(params.additive_sum_offset),
11935         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
11936       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
11937         "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20",
11938         "v21", "v22", "v23", "cc", "memory");
11939 }
11940 
// Stream<uint8_t, 8, 8, 6, ColumnMajorWithSum>::Pack
//
// Reads 8-byte vectors from `in` (stepping `in` by params.stride bytes after
// every load), transposes them in groups of eight into 8x8 byte tiles, and
// writes each tile as 64 contiguous bytes to `out`. The last 6 vectors are
// padded with zero vectors to form one final tile. Behind the tiles it stores
// eight 32-bit values: for each byte position i (0..7) of the loaded vectors,
//   sum_i * params.multiplicative_sum_offset + params.additive_sum_offset
// where sum_i adds up byte i of every vector that was loaded.
//
//   in     - source bytes; 8 bytes are read at each stride step.
//   params - supplies count, stride and the two sum offsets used above.
//   out    - receives 64 bytes per tile (main-loop tiles plus the final padded
//            tile) followed by 32 bytes of sums.
//
// NOTE(review): the main loop only exits when the counter hits exactly zero,
// so this appears to require count >= 6 with (count - 6) divisible by 8 —
// confirm against the callers.
// NOTE(review): count and stride are `int` copies but are used through 64-bit
// (%x) register views; confirm the upper 32 bits are well defined here.
// NOTE(review): sums are accumulated in 16-bit lanes before being widened, so
// a large enough count could wrap them — confirm the expected bound on count.
11941 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)11942 inline void Stream<uint8_t, 8, 8, 6, ColumnMajorWithSum>::Pack(
11943     const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
11944 #ifdef DEBUG
11945 #ifdef DEBUG_METAGEMM_VERBOSE
11946   std::cout
11947       << __FILE__ << "(" << __LINE__
11948       << ") ColumnMajorWithSum<uint8_t, 8, 8, 6, ColumnMajorWithSum>::Pack()"
11949       << std::endl
11950       << std::flush;
11951 #endif
11952 #endif
  // Mutable local copies: both are bound to the asm block as read-write ("+r")
  // operands, and the counter is decremented in place.
11953   int params_count_copy = params.count;
11954   int params_stride_copy = params.stride;
11955   asm volatile(
      // v8-v15 are zeroed here and used below as 16-bit-lane running sums.
11956       "movi v8.8h, #0\n"
11957       "movi v9.8h, #0\n"
11958       "movi v10.8h, #0\n"
11959       "movi v11.8h, #0\n"
11960       "movi v12.8h, #0\n"
11961       "movi v13.8h, #0\n"
11962       "movi v14.8h, #0\n"
11963       "movi v15.8h, #0\n"
11964 
11965       // Reduce count by leftovers.
11966       "subs %x[count], %x[count], #6\n"
      // If only the leftover vectors remain, skip the main loop entirely.
11967       "beq 2f\n"
11968 
11969       "1:"
      // The flags produced by this subs are the ones tested by the "bne 1b"
      // at the bottom of the loop.
11970       "subs %x[count], %x[count], #8\n"
11971 
11972       // Load Aggregate Store - column major 8x8
      // Eight 8-byte loads; each one post-increments `in` by stride.
11973       "ld1 {v0.2s}, [%x[in]], %x[stride]\n"
11974       "ld1 {v1.2s}, [%x[in]], %x[stride]\n"
11975       "ld1 {v2.2s}, [%x[in]], %x[stride]\n"
11976       "ld1 {v3.2s}, [%x[in]], %x[stride]\n"
11977       "ld1 {v4.2s}, [%x[in]], %x[stride]\n"
11978       "ld1 {v5.2s}, [%x[in]], %x[stride]\n"
11979       "ld1 {v6.2s}, [%x[in]], %x[stride]\n"
11980       "ld1 {v7.2s}, [%x[in]], %x[stride]\n"
11981       "prfm pldl1keep, [%x[in]]\n"
      // Three trn1/trn2 rounds (bytes, halfwords, words) transpose the 8x8
      // byte tile: afterwards v16+i holds byte i of each loaded vector.
11982       "trn1 v16.8b, v0.8b, v1.8b\n"
11983       "trn2 v17.8b, v0.8b, v1.8b\n"
11984       "trn1 v18.8b, v2.8b, v3.8b\n"
11985       "trn2 v19.8b, v2.8b, v3.8b\n"
11986       "trn1 v20.8b, v4.8b, v5.8b\n"
11987       "trn2 v21.8b, v4.8b, v5.8b\n"
11988       "trn1 v22.8b, v6.8b, v7.8b\n"
11989       "trn2 v23.8b, v6.8b, v7.8b\n"
11990       "trn1 v0.4h, v16.4h, v18.4h\n"
11991       "trn2 v2.4h, v16.4h, v18.4h\n"
11992       "trn1 v1.4h, v17.4h, v19.4h\n"
11993       "trn2 v3.4h, v17.4h, v19.4h\n"
11994       "trn1 v4.4h, v20.4h, v22.4h\n"
11995       "trn2 v6.4h, v20.4h, v22.4h\n"
11996       "trn1 v5.4h, v21.4h, v23.4h\n"
11997       "trn2 v7.4h, v21.4h, v23.4h\n"
11998       "trn1 v16.2s, v0.2s, v4.2s\n"
11999       "trn2 v20.2s, v0.2s, v4.2s\n"
12000       "trn1 v17.2s, v1.2s, v5.2s\n"
12001       "trn2 v21.2s, v1.2s, v5.2s\n"
12002       "trn1 v18.2s, v2.2s, v6.2s\n"
12003       "trn2 v22.2s, v2.2s, v6.2s\n"
12004       "trn1 v19.2s, v3.2s, v7.2s\n"
12005       "trn2 v23.2s, v3.2s, v7.2s\n"
      // Widen each byte to 16 bits and add it into the matching accumulator.
12006       "uaddw v8.8h, v8.8h, v16.8b\n"
12007       "uaddw v9.8h, v9.8h, v17.8b\n"
12008       "uaddw v10.8h, v10.8h, v18.8b\n"
12009       "uaddw v11.8h, v11.8h, v19.8b\n"
12010       "uaddw v12.8h, v12.8h, v20.8b\n"
12011       "uaddw v13.8h, v13.8h, v21.8b\n"
12012       "uaddw v14.8h, v14.8h, v22.8b\n"
12013       "uaddw v15.8h, v15.8h, v23.8b\n"
      // Write the transposed tile: 2 x 32 bytes, advancing out.
12014       "st1 {v16.2s, v17.2s, v18.2s, v19.2s}, [%x[out]], #32\n"
12015       "st1 {v20.2s, v21.2s, v22.2s, v23.2s}, [%x[out]], #32\n"
12016 
12017       "bne 1b\n"
12018 
12019       "2:"
12020 
12021       // Load Aggregate Store - column major 8x6
      // Clear v0-v7 first so the registers not loaded below act as zero
      // padding in both the stored tile and the sums.
12022       "movi v0.8b, #0\n"
12023       "movi v1.8b, #0\n"
12024       "movi v2.8b, #0\n"
12025       "movi v3.8b, #0\n"
12026       "movi v4.8b, #0\n"
12027       "movi v5.8b, #0\n"
12028       "movi v6.8b, #0\n"
12029       "movi v7.8b, #0\n"
12030       "ld1 {v0.2s}, [%x[in]], %x[stride]\n"
12031       "ld1 {v1.2s}, [%x[in]], %x[stride]\n"
12032       "ld1 {v2.2s}, [%x[in]], %x[stride]\n"
12033       "ld1 {v3.2s}, [%x[in]], %x[stride]\n"
12034       "ld1 {v4.2s}, [%x[in]], %x[stride]\n"
12035       "ld1 {v5.2s}, [%x[in]], %x[stride]\n"
12036       "prfm pldl1keep, [%x[in]]\n"
      // Same transpose / accumulate / store sequence as in the main loop.
12037       "trn1 v16.8b, v0.8b, v1.8b\n"
12038       "trn2 v17.8b, v0.8b, v1.8b\n"
12039       "trn1 v18.8b, v2.8b, v3.8b\n"
12040       "trn2 v19.8b, v2.8b, v3.8b\n"
12041       "trn1 v20.8b, v4.8b, v5.8b\n"
12042       "trn2 v21.8b, v4.8b, v5.8b\n"
12043       "trn1 v22.8b, v6.8b, v7.8b\n"
12044       "trn2 v23.8b, v6.8b, v7.8b\n"
12045       "trn1 v0.4h, v16.4h, v18.4h\n"
12046       "trn2 v2.4h, v16.4h, v18.4h\n"
12047       "trn1 v1.4h, v17.4h, v19.4h\n"
12048       "trn2 v3.4h, v17.4h, v19.4h\n"
12049       "trn1 v4.4h, v20.4h, v22.4h\n"
12050       "trn2 v6.4h, v20.4h, v22.4h\n"
12051       "trn1 v5.4h, v21.4h, v23.4h\n"
12052       "trn2 v7.4h, v21.4h, v23.4h\n"
12053       "trn1 v16.2s, v0.2s, v4.2s\n"
12054       "trn2 v20.2s, v0.2s, v4.2s\n"
12055       "trn1 v17.2s, v1.2s, v5.2s\n"
12056       "trn2 v21.2s, v1.2s, v5.2s\n"
12057       "trn1 v18.2s, v2.2s, v6.2s\n"
12058       "trn2 v22.2s, v2.2s, v6.2s\n"
12059       "trn1 v19.2s, v3.2s, v7.2s\n"
12060       "trn2 v23.2s, v3.2s, v7.2s\n"
12061       "uaddw v8.8h, v8.8h, v16.8b\n"
12062       "uaddw v9.8h, v9.8h, v17.8b\n"
12063       "uaddw v10.8h, v10.8h, v18.8b\n"
12064       "uaddw v11.8h, v11.8h, v19.8b\n"
12065       "uaddw v12.8h, v12.8h, v20.8b\n"
12066       "uaddw v13.8h, v13.8h, v21.8b\n"
12067       "uaddw v14.8h, v14.8h, v22.8b\n"
12068       "uaddw v15.8h, v15.8h, v23.8b\n"
12069       "st1 {v16.2s, v17.2s, v18.2s, v19.2s}, [%x[out]], #32\n"
12070       "st1 {v20.2s, v21.2s, v22.2s, v23.2s}, [%x[out]], #32\n"
12071 
12072       // Aggregator Reduction.
12073       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
12074       "dup v1.4s, %w[additive_sum_offset]\n"
      // uaddlp widens to 32-bit lanes; the addp steps then fold every
      // accumulator to one total: v8 = totals of v8-v11, v9 = totals of
      // v12-v15.
12075       "uaddlp v8.4s, v8.8h\n"
12076       "uaddlp v9.4s, v9.8h\n"
12077       "uaddlp v10.4s, v10.8h\n"
12078       "uaddlp v11.4s, v11.8h\n"
12079       "uaddlp v12.4s, v12.8h\n"
12080       "uaddlp v13.4s, v13.8h\n"
12081       "uaddlp v14.4s, v14.8h\n"
12082       "uaddlp v15.4s, v15.8h\n"
12083       "addp v8.4s, v8.4s, v9.4s\n"
12084       "addp v10.4s, v10.4s, v11.4s\n"
12085       "addp v12.4s, v12.4s, v13.4s\n"
12086       "addp v14.4s, v14.4s, v15.4s\n"
12087       "addp v8.4s, v8.4s, v10.4s\n"
12088       "addp v9.4s, v12.4s, v14.4s\n"
      // total * multiplicative_sum_offset + additive_sum_offset, per lane.
12089       "mul v8.4s, v8.4s, v0.s[0]\n"
12090       "mul v9.4s, v9.4s, v0.s[0]\n"
12091       "add v8.4s, v8.4s, v1.4s\n"
12092       "add v9.4s, v9.4s, v1.4s\n"
      // Eight 32-bit results go right after the last tile; out is not
      // advanced by this store.
12093       "st1 {v8.4s, v9.4s}, [%x[out]]\n"
12094       : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
12095         [out] "+r"(out), [in] "+r"(in)
12096       : [additive_sum_offset] "r"(params.additive_sum_offset),
12097         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
12098       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
12099         "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20",
12100         "v21", "v22", "v23", "cc", "memory");
12101 }
12102 
// Stream<uint8_t, 8, 8, 7, ColumnMajorWithSum>::Pack
//
// Reads 8-byte vectors from `in` (stepping `in` by params.stride bytes after
// every load), transposes them in groups of eight into 8x8 byte tiles, and
// writes each tile as 64 contiguous bytes to `out`. The last 7 vectors are
// padded with one zero vector to form one final tile. Behind the tiles it
// stores eight 32-bit values: for each byte position i (0..7) of the loaded
// vectors,
//   sum_i * params.multiplicative_sum_offset + params.additive_sum_offset
// where sum_i adds up byte i of every vector that was loaded.
//
//   in     - source bytes; 8 bytes are read at each stride step.
//   params - supplies count, stride and the two sum offsets used above.
//   out    - receives 64 bytes per tile (main-loop tiles plus the final padded
//            tile) followed by 32 bytes of sums.
//
// NOTE(review): the main loop only exits when the counter hits exactly zero,
// so this appears to require count >= 7 with (count - 7) divisible by 8 —
// confirm against the callers.
// NOTE(review): count and stride are `int` copies but are used through 64-bit
// (%x) register views; confirm the upper 32 bits are well defined here.
// NOTE(review): sums are accumulated in 16-bit lanes before being widened, so
// a large enough count could wrap them — confirm the expected bound on count.
12103 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)12104 inline void Stream<uint8_t, 8, 8, 7, ColumnMajorWithSum>::Pack(
12105     const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
12106 #ifdef DEBUG
12107 #ifdef DEBUG_METAGEMM_VERBOSE
12108   std::cout
12109       << __FILE__ << "(" << __LINE__
12110       << ") ColumnMajorWithSum<uint8_t, 8, 8, 7, ColumnMajorWithSum>::Pack()"
12111       << std::endl
12112       << std::flush;
12113 #endif
12114 #endif
  // Mutable local copies: both are bound to the asm block as read-write ("+r")
  // operands, and the counter is decremented in place.
12115   int params_count_copy = params.count;
12116   int params_stride_copy = params.stride;
12117   asm volatile(
      // v8-v15 are zeroed here and used below as 16-bit-lane running sums.
12118       "movi v8.8h, #0\n"
12119       "movi v9.8h, #0\n"
12120       "movi v10.8h, #0\n"
12121       "movi v11.8h, #0\n"
12122       "movi v12.8h, #0\n"
12123       "movi v13.8h, #0\n"
12124       "movi v14.8h, #0\n"
12125       "movi v15.8h, #0\n"
12126 
12127       // Reduce count by leftovers.
12128       "subs %x[count], %x[count], #7\n"
      // If only the leftover vectors remain, skip the main loop entirely.
12129       "beq 2f\n"
12130 
12131       "1:"
      // The flags produced by this subs are the ones tested by the "bne 1b"
      // at the bottom of the loop.
12132       "subs %x[count], %x[count], #8\n"
12133 
12134       // Load Aggregate Store - column major 8x8
      // Eight 8-byte loads; each one post-increments `in` by stride.
12135       "ld1 {v0.2s}, [%x[in]], %x[stride]\n"
12136       "ld1 {v1.2s}, [%x[in]], %x[stride]\n"
12137       "ld1 {v2.2s}, [%x[in]], %x[stride]\n"
12138       "ld1 {v3.2s}, [%x[in]], %x[stride]\n"
12139       "ld1 {v4.2s}, [%x[in]], %x[stride]\n"
12140       "ld1 {v5.2s}, [%x[in]], %x[stride]\n"
12141       "ld1 {v6.2s}, [%x[in]], %x[stride]\n"
12142       "ld1 {v7.2s}, [%x[in]], %x[stride]\n"
12143       "prfm pldl1keep, [%x[in]]\n"
      // Three trn1/trn2 rounds (bytes, halfwords, words) transpose the 8x8
      // byte tile: afterwards v16+i holds byte i of each loaded vector.
12144       "trn1 v16.8b, v0.8b, v1.8b\n"
12145       "trn2 v17.8b, v0.8b, v1.8b\n"
12146       "trn1 v18.8b, v2.8b, v3.8b\n"
12147       "trn2 v19.8b, v2.8b, v3.8b\n"
12148       "trn1 v20.8b, v4.8b, v5.8b\n"
12149       "trn2 v21.8b, v4.8b, v5.8b\n"
12150       "trn1 v22.8b, v6.8b, v7.8b\n"
12151       "trn2 v23.8b, v6.8b, v7.8b\n"
12152       "trn1 v0.4h, v16.4h, v18.4h\n"
12153       "trn2 v2.4h, v16.4h, v18.4h\n"
12154       "trn1 v1.4h, v17.4h, v19.4h\n"
12155       "trn2 v3.4h, v17.4h, v19.4h\n"
12156       "trn1 v4.4h, v20.4h, v22.4h\n"
12157       "trn2 v6.4h, v20.4h, v22.4h\n"
12158       "trn1 v5.4h, v21.4h, v23.4h\n"
12159       "trn2 v7.4h, v21.4h, v23.4h\n"
12160       "trn1 v16.2s, v0.2s, v4.2s\n"
12161       "trn2 v20.2s, v0.2s, v4.2s\n"
12162       "trn1 v17.2s, v1.2s, v5.2s\n"
12163       "trn2 v21.2s, v1.2s, v5.2s\n"
12164       "trn1 v18.2s, v2.2s, v6.2s\n"
12165       "trn2 v22.2s, v2.2s, v6.2s\n"
12166       "trn1 v19.2s, v3.2s, v7.2s\n"
12167       "trn2 v23.2s, v3.2s, v7.2s\n"
      // Widen each byte to 16 bits and add it into the matching accumulator.
12168       "uaddw v8.8h, v8.8h, v16.8b\n"
12169       "uaddw v9.8h, v9.8h, v17.8b\n"
12170       "uaddw v10.8h, v10.8h, v18.8b\n"
12171       "uaddw v11.8h, v11.8h, v19.8b\n"
12172       "uaddw v12.8h, v12.8h, v20.8b\n"
12173       "uaddw v13.8h, v13.8h, v21.8b\n"
12174       "uaddw v14.8h, v14.8h, v22.8b\n"
12175       "uaddw v15.8h, v15.8h, v23.8b\n"
      // Write the transposed tile: 2 x 32 bytes, advancing out.
12176       "st1 {v16.2s, v17.2s, v18.2s, v19.2s}, [%x[out]], #32\n"
12177       "st1 {v20.2s, v21.2s, v22.2s, v23.2s}, [%x[out]], #32\n"
12178 
12179       "bne 1b\n"
12180 
12181       "2:"
12182 
12183       // Load Aggregate Store - column major 8x7
      // Clear v0-v7 first so the register not loaded below acts as zero
      // padding in both the stored tile and the sums.
12184       "movi v0.8b, #0\n"
12185       "movi v1.8b, #0\n"
12186       "movi v2.8b, #0\n"
12187       "movi v3.8b, #0\n"
12188       "movi v4.8b, #0\n"
12189       "movi v5.8b, #0\n"
12190       "movi v6.8b, #0\n"
12191       "movi v7.8b, #0\n"
12192       "ld1 {v0.2s}, [%x[in]], %x[stride]\n"
12193       "ld1 {v1.2s}, [%x[in]], %x[stride]\n"
12194       "ld1 {v2.2s}, [%x[in]], %x[stride]\n"
12195       "ld1 {v3.2s}, [%x[in]], %x[stride]\n"
12196       "ld1 {v4.2s}, [%x[in]], %x[stride]\n"
12197       "ld1 {v5.2s}, [%x[in]], %x[stride]\n"
12198       "ld1 {v6.2s}, [%x[in]], %x[stride]\n"
12199       "prfm pldl1keep, [%x[in]]\n"
      // Same transpose / accumulate / store sequence as in the main loop.
12200       "trn1 v16.8b, v0.8b, v1.8b\n"
12201       "trn2 v17.8b, v0.8b, v1.8b\n"
12202       "trn1 v18.8b, v2.8b, v3.8b\n"
12203       "trn2 v19.8b, v2.8b, v3.8b\n"
12204       "trn1 v20.8b, v4.8b, v5.8b\n"
12205       "trn2 v21.8b, v4.8b, v5.8b\n"
12206       "trn1 v22.8b, v6.8b, v7.8b\n"
12207       "trn2 v23.8b, v6.8b, v7.8b\n"
12208       "trn1 v0.4h, v16.4h, v18.4h\n"
12209       "trn2 v2.4h, v16.4h, v18.4h\n"
12210       "trn1 v1.4h, v17.4h, v19.4h\n"
12211       "trn2 v3.4h, v17.4h, v19.4h\n"
12212       "trn1 v4.4h, v20.4h, v22.4h\n"
12213       "trn2 v6.4h, v20.4h, v22.4h\n"
12214       "trn1 v5.4h, v21.4h, v23.4h\n"
12215       "trn2 v7.4h, v21.4h, v23.4h\n"
12216       "trn1 v16.2s, v0.2s, v4.2s\n"
12217       "trn2 v20.2s, v0.2s, v4.2s\n"
12218       "trn1 v17.2s, v1.2s, v5.2s\n"
12219       "trn2 v21.2s, v1.2s, v5.2s\n"
12220       "trn1 v18.2s, v2.2s, v6.2s\n"
12221       "trn2 v22.2s, v2.2s, v6.2s\n"
12222       "trn1 v19.2s, v3.2s, v7.2s\n"
12223       "trn2 v23.2s, v3.2s, v7.2s\n"
12224       "uaddw v8.8h, v8.8h, v16.8b\n"
12225       "uaddw v9.8h, v9.8h, v17.8b\n"
12226       "uaddw v10.8h, v10.8h, v18.8b\n"
12227       "uaddw v11.8h, v11.8h, v19.8b\n"
12228       "uaddw v12.8h, v12.8h, v20.8b\n"
12229       "uaddw v13.8h, v13.8h, v21.8b\n"
12230       "uaddw v14.8h, v14.8h, v22.8b\n"
12231       "uaddw v15.8h, v15.8h, v23.8b\n"
12232       "st1 {v16.2s, v17.2s, v18.2s, v19.2s}, [%x[out]], #32\n"
12233       "st1 {v20.2s, v21.2s, v22.2s, v23.2s}, [%x[out]], #32\n"
12234 
12235       // Aggregator Reduction.
12236       "mov v0.s[0], %w[multiplicative_sum_offset]\n"
12237       "dup v1.4s, %w[additive_sum_offset]\n"
      // uaddlp widens to 32-bit lanes; the addp steps then fold every
      // accumulator to one total: v8 = totals of v8-v11, v9 = totals of
      // v12-v15.
12238       "uaddlp v8.4s, v8.8h\n"
12239       "uaddlp v9.4s, v9.8h\n"
12240       "uaddlp v10.4s, v10.8h\n"
12241       "uaddlp v11.4s, v11.8h\n"
12242       "uaddlp v12.4s, v12.8h\n"
12243       "uaddlp v13.4s, v13.8h\n"
12244       "uaddlp v14.4s, v14.8h\n"
12245       "uaddlp v15.4s, v15.8h\n"
12246       "addp v8.4s, v8.4s, v9.4s\n"
12247       "addp v10.4s, v10.4s, v11.4s\n"
12248       "addp v12.4s, v12.4s, v13.4s\n"
12249       "addp v14.4s, v14.4s, v15.4s\n"
12250       "addp v8.4s, v8.4s, v10.4s\n"
12251       "addp v9.4s, v12.4s, v14.4s\n"
      // total * multiplicative_sum_offset + additive_sum_offset, per lane.
12252       "mul v8.4s, v8.4s, v0.s[0]\n"
12253       "mul v9.4s, v9.4s, v0.s[0]\n"
12254       "add v8.4s, v8.4s, v1.4s\n"
12255       "add v9.4s, v9.4s, v1.4s\n"
      // Eight 32-bit results go right after the last tile; out is not
      // advanced by this store.
12256       "st1 {v8.4s, v9.4s}, [%x[out]]\n"
12257       : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
12258         [out] "+r"(out), [in] "+r"(in)
12259       : [additive_sum_offset] "r"(params.additive_sum_offset),
12260         [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
12261       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
12262         "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20",
12263         "v21", "v22", "v23", "cc", "memory");
12264 }
12265 
12266 }  // namespace meta
12267 }  // namespace gemmlowp
12268 
12269 #else
12270 #warning "Meta gemm for arm64 requires: GEMMLOWP_NEON_64!"
12271 #endif
12272 
12273 #endif  // GEMMLOWP_META_STREAMS_ARM_64_H_
12274