1 // Copyright 2016 The Gemmlowp Authors. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #ifndef GEMMLOWP_META_STREAMS_ARM_64_H_
16 #define GEMMLOWP_META_STREAMS_ARM_64_H_
17
18 #ifdef GEMMLOWP_NEON_64
19
20 #include <cassert>
21 #include <cstdint>
22
23 namespace gemmlowp {
24 namespace meta {
25
26 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)27 inline void Stream<uint8_t, 1, 8, 0, RowMajorWithSum>::Pack(
28 const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
29 #ifdef DEBUG
30 #ifdef DEBUG_METAGEMM_VERBOSE
31 std::cout << __FILE__ << "(" << __LINE__
32 << ") RowMajorWithSum<uint8_t, 1, 8, 0, RowMajorWithSum>::Pack()"
33 << std::endl
34 << std::flush;
35 #endif
36 #endif
37 int params_count_copy = params.count;
38 asm volatile(
39 "movi v8.8h, #0\n"
40
41 "1:"
42 "subs %x[count], %x[count], #8\n"
43
44 // Load Aggregate Store: 1x8.
45 "ld1 {v0.2s}, [%x[in]], #8\n"
46 "uaddw v8.8h, v8.8h, v0.8b\n"
47 "st1 {v0.2s}, [%x[out]], #8\n"
48
49 "bne 1b\n"
50
51 // Aggregator Reduction.
52 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
53 "dup v1.4s, %w[additive_sum_offset]\n"
54 "uaddlp v8.4s, v8.8h\n"
55 "addp v8.4s, v8.4s, v8.4s\n"
56 "addp v8.4s, v8.4s, v8.4s\n"
57 "mul v8.4s, v8.4s, v0.s[0]\n"
58 "add v8.4s, v8.4s, v1.4s\n"
59 "st1 {v8.4s}, [%x[out]]\n"
60 : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
61 : [stride] "r"(params.stride),
62 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
63 [additive_sum_offset] "r"(params.additive_sum_offset)
64 : "v8", "v0", "v1", "cc", "memory");
65 }
66
67 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)68 inline void Stream<uint8_t, 1, 8, 1, RowMajorWithSum>::Pack(
69 const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
70 #ifdef DEBUG
71 #ifdef DEBUG_METAGEMM_VERBOSE
72 std::cout << __FILE__ << "(" << __LINE__
73 << ") RowMajorWithSum<uint8_t, 1, 8, 1, RowMajorWithSum>::Pack()"
74 << std::endl
75 << std::flush;
76 #endif
77 #endif
78 int params_count_copy = params.count;
79 asm volatile(
80 "movi v8.8h, #0\n"
81
82 // Reduce count by leftovers.
83 "subs %x[count], %x[count], #1\n"
84 "beq 2f\n"
85
86 "1:"
87 "subs %x[count], %x[count], #8\n"
88
89 // Load Aggregate Store: 1x8.
90 "ld1 {v0.2s}, [%x[in]], #8\n"
91 "uaddw v8.8h, v8.8h, v0.8b\n"
92 "st1 {v0.2s}, [%x[out]], #8\n"
93
94 "bne 1b\n"
95
96 "2:"
97
98 // Load Aggregate Store: 1x1.
99 "movi v0.8b, #0\n"
100 "ld1 {v0.b}[0], [%x[in]], #1\n"
101 "uaddw v8.8h, v8.8h, v0.8b\n"
102 "st1 {v0.2s}, [%x[out]], #8\n"
103
104 // Aggregator Reduction.
105 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
106 "dup v1.4s, %w[additive_sum_offset]\n"
107 "uaddlp v8.4s, v8.8h\n"
108 "addp v8.4s, v8.4s, v8.4s\n"
109 "addp v8.4s, v8.4s, v8.4s\n"
110 "mul v8.4s, v8.4s, v0.s[0]\n"
111 "add v8.4s, v8.4s, v1.4s\n"
112 "st1 {v8.4s}, [%x[out]]\n"
113 : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
114 : [stride] "r"(params.stride),
115 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
116 [additive_sum_offset] "r"(params.additive_sum_offset)
117 : "v8", "v0", "v1", "cc", "memory");
118 }
119
120 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)121 inline void Stream<uint8_t, 1, 8, 2, RowMajorWithSum>::Pack(
122 const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
123 #ifdef DEBUG
124 #ifdef DEBUG_METAGEMM_VERBOSE
125 std::cout << __FILE__ << "(" << __LINE__
126 << ") RowMajorWithSum<uint8_t, 1, 8, 2, RowMajorWithSum>::Pack()"
127 << std::endl
128 << std::flush;
129 #endif
130 #endif
131 int params_count_copy = params.count;
132 asm volatile(
133 "movi v8.8h, #0\n"
134
135 // Reduce count by leftovers.
136 "subs %x[count], %x[count], #2\n"
137 "beq 2f\n"
138
139 "1:"
140 "subs %x[count], %x[count], #8\n"
141
142 // Load Aggregate Store: 1x8.
143 "ld1 {v0.2s}, [%x[in]], #8\n"
144 "uaddw v8.8h, v8.8h, v0.8b\n"
145 "st1 {v0.2s}, [%x[out]], #8\n"
146
147 "bne 1b\n"
148
149 "2:"
150
151 // Load Aggregate Store: 1x2.
152 "movi v0.8b, #0\n"
153 "ld1 {v0.h}[0], [%x[in]], #2\n"
154 "uaddw v8.8h, v8.8h, v0.8b\n"
155 "st1 {v0.2s}, [%x[out]], #8\n"
156
157 // Aggregator Reduction.
158 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
159 "dup v1.4s, %w[additive_sum_offset]\n"
160 "uaddlp v8.4s, v8.8h\n"
161 "addp v8.4s, v8.4s, v8.4s\n"
162 "addp v8.4s, v8.4s, v8.4s\n"
163 "mul v8.4s, v8.4s, v0.s[0]\n"
164 "add v8.4s, v8.4s, v1.4s\n"
165 "st1 {v8.4s}, [%x[out]]\n"
166 : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
167 : [stride] "r"(params.stride),
168 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
169 [additive_sum_offset] "r"(params.additive_sum_offset)
170 : "v8", "v0", "v1", "cc", "memory");
171 }
172
173 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)174 inline void Stream<uint8_t, 1, 8, 3, RowMajorWithSum>::Pack(
175 const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
176 #ifdef DEBUG
177 #ifdef DEBUG_METAGEMM_VERBOSE
178 std::cout << __FILE__ << "(" << __LINE__
179 << ") RowMajorWithSum<uint8_t, 1, 8, 3, RowMajorWithSum>::Pack()"
180 << std::endl
181 << std::flush;
182 #endif
183 #endif
184 int params_count_copy = params.count;
185 asm volatile(
186 "movi v8.8h, #0\n"
187
188 // Reduce count by leftovers.
189 "subs %x[count], %x[count], #3\n"
190 "beq 2f\n"
191
192 "1:"
193 "subs %x[count], %x[count], #8\n"
194
195 // Load Aggregate Store: 1x8.
196 "ld1 {v0.2s}, [%x[in]], #8\n"
197 "uaddw v8.8h, v8.8h, v0.8b\n"
198 "st1 {v0.2s}, [%x[out]], #8\n"
199
200 "bne 1b\n"
201
202 "2:"
203
204 // Load Aggregate Store: 1x3.
205 "movi v0.8b, #0\n"
206 "ld1 {v0.h}[0], [%x[in]], #2\n"
207 "ld1 {v0.b}[2], [%x[in]], #1\n"
208 "uaddw v8.8h, v8.8h, v0.8b\n"
209 "st1 {v0.2s}, [%x[out]], #8\n"
210
211 // Aggregator Reduction.
212 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
213 "dup v1.4s, %w[additive_sum_offset]\n"
214 "uaddlp v8.4s, v8.8h\n"
215 "addp v8.4s, v8.4s, v8.4s\n"
216 "addp v8.4s, v8.4s, v8.4s\n"
217 "mul v8.4s, v8.4s, v0.s[0]\n"
218 "add v8.4s, v8.4s, v1.4s\n"
219 "st1 {v8.4s}, [%x[out]]\n"
220 : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
221 : [stride] "r"(params.stride),
222 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
223 [additive_sum_offset] "r"(params.additive_sum_offset)
224 : "v8", "v0", "v1", "cc", "memory");
225 }
226
227 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)228 inline void Stream<uint8_t, 1, 8, 4, RowMajorWithSum>::Pack(
229 const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
230 #ifdef DEBUG
231 #ifdef DEBUG_METAGEMM_VERBOSE
232 std::cout << __FILE__ << "(" << __LINE__
233 << ") RowMajorWithSum<uint8_t, 1, 8, 4, RowMajorWithSum>::Pack()"
234 << std::endl
235 << std::flush;
236 #endif
237 #endif
238 int params_count_copy = params.count;
239 asm volatile(
240 "movi v8.8h, #0\n"
241
242 // Reduce count by leftovers.
243 "subs %x[count], %x[count], #4\n"
244 "beq 2f\n"
245
246 "1:"
247 "subs %x[count], %x[count], #8\n"
248
249 // Load Aggregate Store: 1x8.
250 "ld1 {v0.2s}, [%x[in]], #8\n"
251 "uaddw v8.8h, v8.8h, v0.8b\n"
252 "st1 {v0.2s}, [%x[out]], #8\n"
253
254 "bne 1b\n"
255
256 "2:"
257
258 // Load Aggregate Store: 1x4.
259 "movi v0.8b, #0\n"
260 "ld1 {v0.s}[0], [%x[in]], #4\n"
261 "uaddw v8.8h, v8.8h, v0.8b\n"
262 "st1 {v0.2s}, [%x[out]], #8\n"
263
264 // Aggregator Reduction.
265 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
266 "dup v1.4s, %w[additive_sum_offset]\n"
267 "uaddlp v8.4s, v8.8h\n"
268 "addp v8.4s, v8.4s, v8.4s\n"
269 "addp v8.4s, v8.4s, v8.4s\n"
270 "mul v8.4s, v8.4s, v0.s[0]\n"
271 "add v8.4s, v8.4s, v1.4s\n"
272 "st1 {v8.4s}, [%x[out]]\n"
273 : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
274 : [stride] "r"(params.stride),
275 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
276 [additive_sum_offset] "r"(params.additive_sum_offset)
277 : "v8", "v0", "v1", "cc", "memory");
278 }
279
280 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)281 inline void Stream<uint8_t, 1, 8, 5, RowMajorWithSum>::Pack(
282 const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
283 #ifdef DEBUG
284 #ifdef DEBUG_METAGEMM_VERBOSE
285 std::cout << __FILE__ << "(" << __LINE__
286 << ") RowMajorWithSum<uint8_t, 1, 8, 5, RowMajorWithSum>::Pack()"
287 << std::endl
288 << std::flush;
289 #endif
290 #endif
291 int params_count_copy = params.count;
292 asm volatile(
293 "movi v8.8h, #0\n"
294
295 // Reduce count by leftovers.
296 "subs %x[count], %x[count], #5\n"
297 "beq 2f\n"
298
299 "1:"
300 "subs %x[count], %x[count], #8\n"
301
302 // Load Aggregate Store: 1x8.
303 "ld1 {v0.2s}, [%x[in]], #8\n"
304 "uaddw v8.8h, v8.8h, v0.8b\n"
305 "st1 {v0.2s}, [%x[out]], #8\n"
306
307 "bne 1b\n"
308
309 "2:"
310
311 // Load Aggregate Store: 1x5.
312 "movi v0.8b, #0\n"
313 "ld1 {v0.s}[0], [%x[in]], #4\n"
314 "ld1 {v0.b}[4], [%x[in]], #1\n"
315 "uaddw v8.8h, v8.8h, v0.8b\n"
316 "st1 {v0.2s}, [%x[out]], #8\n"
317
318 // Aggregator Reduction.
319 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
320 "dup v1.4s, %w[additive_sum_offset]\n"
321 "uaddlp v8.4s, v8.8h\n"
322 "addp v8.4s, v8.4s, v8.4s\n"
323 "addp v8.4s, v8.4s, v8.4s\n"
324 "mul v8.4s, v8.4s, v0.s[0]\n"
325 "add v8.4s, v8.4s, v1.4s\n"
326 "st1 {v8.4s}, [%x[out]]\n"
327 : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
328 : [stride] "r"(params.stride),
329 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
330 [additive_sum_offset] "r"(params.additive_sum_offset)
331 : "v8", "v0", "v1", "cc", "memory");
332 }
333
334 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)335 inline void Stream<uint8_t, 1, 8, 6, RowMajorWithSum>::Pack(
336 const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
337 #ifdef DEBUG
338 #ifdef DEBUG_METAGEMM_VERBOSE
339 std::cout << __FILE__ << "(" << __LINE__
340 << ") RowMajorWithSum<uint8_t, 1, 8, 6, RowMajorWithSum>::Pack()"
341 << std::endl
342 << std::flush;
343 #endif
344 #endif
345 int params_count_copy = params.count;
346 asm volatile(
347 "movi v8.8h, #0\n"
348
349 // Reduce count by leftovers.
350 "subs %x[count], %x[count], #6\n"
351 "beq 2f\n"
352
353 "1:"
354 "subs %x[count], %x[count], #8\n"
355
356 // Load Aggregate Store: 1x8.
357 "ld1 {v0.2s}, [%x[in]], #8\n"
358 "uaddw v8.8h, v8.8h, v0.8b\n"
359 "st1 {v0.2s}, [%x[out]], #8\n"
360
361 "bne 1b\n"
362
363 "2:"
364
365 // Load Aggregate Store: 1x6.
366 "movi v0.8b, #0\n"
367 "ld1 {v0.s}[0], [%x[in]], #4\n"
368 "ld1 {v0.h}[2], [%x[in]], #2\n"
369 "uaddw v8.8h, v8.8h, v0.8b\n"
370 "st1 {v0.2s}, [%x[out]], #8\n"
371
372 // Aggregator Reduction.
373 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
374 "dup v1.4s, %w[additive_sum_offset]\n"
375 "uaddlp v8.4s, v8.8h\n"
376 "addp v8.4s, v8.4s, v8.4s\n"
377 "addp v8.4s, v8.4s, v8.4s\n"
378 "mul v8.4s, v8.4s, v0.s[0]\n"
379 "add v8.4s, v8.4s, v1.4s\n"
380 "st1 {v8.4s}, [%x[out]]\n"
381 : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
382 : [stride] "r"(params.stride),
383 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
384 [additive_sum_offset] "r"(params.additive_sum_offset)
385 : "v8", "v0", "v1", "cc", "memory");
386 }
387
388 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)389 inline void Stream<uint8_t, 1, 8, 7, RowMajorWithSum>::Pack(
390 const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
391 #ifdef DEBUG
392 #ifdef DEBUG_METAGEMM_VERBOSE
393 std::cout << __FILE__ << "(" << __LINE__
394 << ") RowMajorWithSum<uint8_t, 1, 8, 7, RowMajorWithSum>::Pack()"
395 << std::endl
396 << std::flush;
397 #endif
398 #endif
399 int params_count_copy = params.count;
400 asm volatile(
401 "movi v8.8h, #0\n"
402
403 // Reduce count by leftovers.
404 "subs %x[count], %x[count], #7\n"
405 "beq 2f\n"
406
407 "1:"
408 "subs %x[count], %x[count], #8\n"
409
410 // Load Aggregate Store: 1x8.
411 "ld1 {v0.2s}, [%x[in]], #8\n"
412 "uaddw v8.8h, v8.8h, v0.8b\n"
413 "st1 {v0.2s}, [%x[out]], #8\n"
414
415 "bne 1b\n"
416
417 "2:"
418
419 // Load Aggregate Store: 1x7.
420 "movi v0.8b, #0\n"
421 "ld1 {v0.s}[0], [%x[in]], #4\n"
422 "ld1 {v0.h}[2], [%x[in]], #2\n"
423 "ld1 {v0.b}[6], [%x[in]], #1\n"
424 "uaddw v8.8h, v8.8h, v0.8b\n"
425 "st1 {v0.2s}, [%x[out]], #8\n"
426
427 // Aggregator Reduction.
428 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
429 "dup v1.4s, %w[additive_sum_offset]\n"
430 "uaddlp v8.4s, v8.8h\n"
431 "addp v8.4s, v8.4s, v8.4s\n"
432 "addp v8.4s, v8.4s, v8.4s\n"
433 "mul v8.4s, v8.4s, v0.s[0]\n"
434 "add v8.4s, v8.4s, v1.4s\n"
435 "st1 {v8.4s}, [%x[out]]\n"
436 : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
437 : [stride] "r"(params.stride),
438 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
439 [additive_sum_offset] "r"(params.additive_sum_offset)
440 : "v8", "v0", "v1", "cc", "memory");
441 }
442
443 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)444 inline void Stream<uint8_t, 2, 8, 0, RowMajorWithSum>::Pack(
445 const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
446 #ifdef DEBUG
447 #ifdef DEBUG_METAGEMM_VERBOSE
448 std::cout << __FILE__ << "(" << __LINE__
449 << ") RowMajorWithSum<uint8_t, 2, 8, 0, RowMajorWithSum>::Pack()"
450 << std::endl
451 << std::flush;
452 #endif
453 #endif
454 int params_count_copy = params.count;
455 asm volatile(
456 "add x0, %x[in], %x[stride]\n"
457 "movi v8.8h, #0\n"
458 "movi v9.8h, #0\n"
459
460 "1:"
461 "subs %x[count], %x[count], #8\n"
462
463 // Load Aggregate Store: 2x8.
464 "ld1 {v0.2s}, [%x[in]], #8\n"
465 "ld1 {v1.2s}, [x0], #8\n"
466 "uaddw v8.8h, v8.8h, v0.8b\n"
467 "uaddw v9.8h, v9.8h, v1.8b\n"
468 "st1 {v0.2s, v1.2s}, [%x[out]], #16\n"
469
470 "bne 1b\n"
471
472 // Aggregator Reduction.
473 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
474 "dup v1.4s, %w[additive_sum_offset]\n"
475 "uaddlp v8.4s, v8.8h\n"
476 "uaddlp v9.4s, v9.8h\n"
477 "addp v8.4s, v8.4s, v9.4s\n"
478 "addp v8.4s, v8.4s, v8.4s\n"
479 "mul v8.4s, v8.4s, v0.s[0]\n"
480 "add v8.4s, v8.4s, v1.4s\n"
481 "st1 {v8.4s}, [%x[out]]\n"
482 : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
483 : [stride] "r"(params.stride),
484 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
485 [additive_sum_offset] "r"(params.additive_sum_offset)
486 : "x0", "v8", "v9", "v0", "v1", "cc", "memory");
487 }
488
489 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)490 inline void Stream<uint8_t, 2, 8, 1, RowMajorWithSum>::Pack(
491 const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
492 #ifdef DEBUG
493 #ifdef DEBUG_METAGEMM_VERBOSE
494 std::cout << __FILE__ << "(" << __LINE__
495 << ") RowMajorWithSum<uint8_t, 2, 8, 1, RowMajorWithSum>::Pack()"
496 << std::endl
497 << std::flush;
498 #endif
499 #endif
500 int params_count_copy = params.count;
501 asm volatile(
502 "add x0, %x[in], %x[stride]\n"
503 "movi v8.8h, #0\n"
504 "movi v9.8h, #0\n"
505
506 // Reduce count by leftovers.
507 "subs %x[count], %x[count], #1\n"
508 "beq 2f\n"
509
510 "1:"
511 "subs %x[count], %x[count], #8\n"
512
513 // Load Aggregate Store: 2x8.
514 "ld1 {v0.2s}, [%x[in]], #8\n"
515 "ld1 {v1.2s}, [x0], #8\n"
516 "uaddw v8.8h, v8.8h, v0.8b\n"
517 "uaddw v9.8h, v9.8h, v1.8b\n"
518 "st1 {v0.2s, v1.2s}, [%x[out]], #16\n"
519
520 "bne 1b\n"
521
522 "2:"
523
524 // Load Aggregate Store: 2x1.
525 "movi v0.8b, #0\n"
526 "movi v1.8b, #0\n"
527 "ld1 {v0.b}[0], [%x[in]], #1\n"
528 "ld1 {v1.b}[0], [x0], #1\n"
529 "uaddw v8.8h, v8.8h, v0.8b\n"
530 "uaddw v9.8h, v9.8h, v1.8b\n"
531 "st1 {v0.2s, v1.2s}, [%x[out]], #16\n"
532
533 // Aggregator Reduction.
534 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
535 "dup v1.4s, %w[additive_sum_offset]\n"
536 "uaddlp v8.4s, v8.8h\n"
537 "uaddlp v9.4s, v9.8h\n"
538 "addp v8.4s, v8.4s, v9.4s\n"
539 "addp v8.4s, v8.4s, v8.4s\n"
540 "mul v8.4s, v8.4s, v0.s[0]\n"
541 "add v8.4s, v8.4s, v1.4s\n"
542 "st1 {v8.4s}, [%x[out]]\n"
543 : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
544 : [stride] "r"(params.stride),
545 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
546 [additive_sum_offset] "r"(params.additive_sum_offset)
547 : "x0", "v8", "v9", "v0", "v1", "cc", "memory");
548 }
549
550 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)551 inline void Stream<uint8_t, 2, 8, 2, RowMajorWithSum>::Pack(
552 const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
553 #ifdef DEBUG
554 #ifdef DEBUG_METAGEMM_VERBOSE
555 std::cout << __FILE__ << "(" << __LINE__
556 << ") RowMajorWithSum<uint8_t, 2, 8, 2, RowMajorWithSum>::Pack()"
557 << std::endl
558 << std::flush;
559 #endif
560 #endif
561 int params_count_copy = params.count;
562 asm volatile(
563 "add x0, %x[in], %x[stride]\n"
564 "movi v8.8h, #0\n"
565 "movi v9.8h, #0\n"
566
567 // Reduce count by leftovers.
568 "subs %x[count], %x[count], #2\n"
569 "beq 2f\n"
570
571 "1:"
572 "subs %x[count], %x[count], #8\n"
573
574 // Load Aggregate Store: 2x8.
575 "ld1 {v0.2s}, [%x[in]], #8\n"
576 "ld1 {v1.2s}, [x0], #8\n"
577 "uaddw v8.8h, v8.8h, v0.8b\n"
578 "uaddw v9.8h, v9.8h, v1.8b\n"
579 "st1 {v0.2s, v1.2s}, [%x[out]], #16\n"
580
581 "bne 1b\n"
582
583 "2:"
584
585 // Load Aggregate Store: 2x2.
586 "movi v0.8b, #0\n"
587 "movi v1.8b, #0\n"
588 "ld1 {v0.h}[0], [%x[in]], #2\n"
589 "ld1 {v1.h}[0], [x0], #2\n"
590 "uaddw v8.8h, v8.8h, v0.8b\n"
591 "uaddw v9.8h, v9.8h, v1.8b\n"
592 "st1 {v0.2s, v1.2s}, [%x[out]], #16\n"
593
594 // Aggregator Reduction.
595 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
596 "dup v1.4s, %w[additive_sum_offset]\n"
597 "uaddlp v8.4s, v8.8h\n"
598 "uaddlp v9.4s, v9.8h\n"
599 "addp v8.4s, v8.4s, v9.4s\n"
600 "addp v8.4s, v8.4s, v8.4s\n"
601 "mul v8.4s, v8.4s, v0.s[0]\n"
602 "add v8.4s, v8.4s, v1.4s\n"
603 "st1 {v8.4s}, [%x[out]]\n"
604 : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
605 : [stride] "r"(params.stride),
606 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
607 [additive_sum_offset] "r"(params.additive_sum_offset)
608 : "x0", "v8", "v9", "v0", "v1", "cc", "memory");
609 }
610
611 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)612 inline void Stream<uint8_t, 2, 8, 3, RowMajorWithSum>::Pack(
613 const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
614 #ifdef DEBUG
615 #ifdef DEBUG_METAGEMM_VERBOSE
616 std::cout << __FILE__ << "(" << __LINE__
617 << ") RowMajorWithSum<uint8_t, 2, 8, 3, RowMajorWithSum>::Pack()"
618 << std::endl
619 << std::flush;
620 #endif
621 #endif
622 int params_count_copy = params.count;
623 asm volatile(
624 "add x0, %x[in], %x[stride]\n"
625 "movi v8.8h, #0\n"
626 "movi v9.8h, #0\n"
627
628 // Reduce count by leftovers.
629 "subs %x[count], %x[count], #3\n"
630 "beq 2f\n"
631
632 "1:"
633 "subs %x[count], %x[count], #8\n"
634
635 // Load Aggregate Store: 2x8.
636 "ld1 {v0.2s}, [%x[in]], #8\n"
637 "ld1 {v1.2s}, [x0], #8\n"
638 "uaddw v8.8h, v8.8h, v0.8b\n"
639 "uaddw v9.8h, v9.8h, v1.8b\n"
640 "st1 {v0.2s, v1.2s}, [%x[out]], #16\n"
641
642 "bne 1b\n"
643
644 "2:"
645
646 // Load Aggregate Store: 2x3.
647 "movi v0.8b, #0\n"
648 "movi v1.8b, #0\n"
649 "ld1 {v0.h}[0], [%x[in]], #2\n"
650 "ld1 {v0.b}[2], [%x[in]], #1\n"
651 "ld1 {v1.h}[0], [x0], #2\n"
652 "ld1 {v1.b}[2], [x0], #1\n"
653 "uaddw v8.8h, v8.8h, v0.8b\n"
654 "uaddw v9.8h, v9.8h, v1.8b\n"
655 "st1 {v0.2s, v1.2s}, [%x[out]], #16\n"
656
657 // Aggregator Reduction.
658 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
659 "dup v1.4s, %w[additive_sum_offset]\n"
660 "uaddlp v8.4s, v8.8h\n"
661 "uaddlp v9.4s, v9.8h\n"
662 "addp v8.4s, v8.4s, v9.4s\n"
663 "addp v8.4s, v8.4s, v8.4s\n"
664 "mul v8.4s, v8.4s, v0.s[0]\n"
665 "add v8.4s, v8.4s, v1.4s\n"
666 "st1 {v8.4s}, [%x[out]]\n"
667 : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
668 : [stride] "r"(params.stride),
669 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
670 [additive_sum_offset] "r"(params.additive_sum_offset)
671 : "x0", "v8", "v9", "v0", "v1", "cc", "memory");
672 }
673
674 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)675 inline void Stream<uint8_t, 2, 8, 4, RowMajorWithSum>::Pack(
676 const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
677 #ifdef DEBUG
678 #ifdef DEBUG_METAGEMM_VERBOSE
679 std::cout << __FILE__ << "(" << __LINE__
680 << ") RowMajorWithSum<uint8_t, 2, 8, 4, RowMajorWithSum>::Pack()"
681 << std::endl
682 << std::flush;
683 #endif
684 #endif
685 int params_count_copy = params.count;
686 asm volatile(
687 "add x0, %x[in], %x[stride]\n"
688 "movi v8.8h, #0\n"
689 "movi v9.8h, #0\n"
690
691 // Reduce count by leftovers.
692 "subs %x[count], %x[count], #4\n"
693 "beq 2f\n"
694
695 "1:"
696 "subs %x[count], %x[count], #8\n"
697
698 // Load Aggregate Store: 2x8.
699 "ld1 {v0.2s}, [%x[in]], #8\n"
700 "ld1 {v1.2s}, [x0], #8\n"
701 "uaddw v8.8h, v8.8h, v0.8b\n"
702 "uaddw v9.8h, v9.8h, v1.8b\n"
703 "st1 {v0.2s, v1.2s}, [%x[out]], #16\n"
704
705 "bne 1b\n"
706
707 "2:"
708
709 // Load Aggregate Store: 2x4.
710 "movi v0.8b, #0\n"
711 "movi v1.8b, #0\n"
712 "ld1 {v0.s}[0], [%x[in]], #4\n"
713 "ld1 {v1.s}[0], [x0], #4\n"
714 "uaddw v8.8h, v8.8h, v0.8b\n"
715 "uaddw v9.8h, v9.8h, v1.8b\n"
716 "st1 {v0.2s, v1.2s}, [%x[out]], #16\n"
717
718 // Aggregator Reduction.
719 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
720 "dup v1.4s, %w[additive_sum_offset]\n"
721 "uaddlp v8.4s, v8.8h\n"
722 "uaddlp v9.4s, v9.8h\n"
723 "addp v8.4s, v8.4s, v9.4s\n"
724 "addp v8.4s, v8.4s, v8.4s\n"
725 "mul v8.4s, v8.4s, v0.s[0]\n"
726 "add v8.4s, v8.4s, v1.4s\n"
727 "st1 {v8.4s}, [%x[out]]\n"
728 : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
729 : [stride] "r"(params.stride),
730 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
731 [additive_sum_offset] "r"(params.additive_sum_offset)
732 : "x0", "v8", "v9", "v0", "v1", "cc", "memory");
733 }
734
735 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)736 inline void Stream<uint8_t, 2, 8, 5, RowMajorWithSum>::Pack(
737 const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
738 #ifdef DEBUG
739 #ifdef DEBUG_METAGEMM_VERBOSE
740 std::cout << __FILE__ << "(" << __LINE__
741 << ") RowMajorWithSum<uint8_t, 2, 8, 5, RowMajorWithSum>::Pack()"
742 << std::endl
743 << std::flush;
744 #endif
745 #endif
746 int params_count_copy = params.count;
747 asm volatile(
748 "add x0, %x[in], %x[stride]\n"
749 "movi v8.8h, #0\n"
750 "movi v9.8h, #0\n"
751
752 // Reduce count by leftovers.
753 "subs %x[count], %x[count], #5\n"
754 "beq 2f\n"
755
756 "1:"
757 "subs %x[count], %x[count], #8\n"
758
759 // Load Aggregate Store: 2x8.
760 "ld1 {v0.2s}, [%x[in]], #8\n"
761 "ld1 {v1.2s}, [x0], #8\n"
762 "uaddw v8.8h, v8.8h, v0.8b\n"
763 "uaddw v9.8h, v9.8h, v1.8b\n"
764 "st1 {v0.2s, v1.2s}, [%x[out]], #16\n"
765
766 "bne 1b\n"
767
768 "2:"
769
770 // Load Aggregate Store: 2x5.
771 "movi v0.8b, #0\n"
772 "movi v1.8b, #0\n"
773 "ld1 {v0.s}[0], [%x[in]], #4\n"
774 "ld1 {v0.b}[4], [%x[in]], #1\n"
775 "ld1 {v1.s}[0], [x0], #4\n"
776 "ld1 {v1.b}[4], [x0], #1\n"
777 "uaddw v8.8h, v8.8h, v0.8b\n"
778 "uaddw v9.8h, v9.8h, v1.8b\n"
779 "st1 {v0.2s, v1.2s}, [%x[out]], #16\n"
780
781 // Aggregator Reduction.
782 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
783 "dup v1.4s, %w[additive_sum_offset]\n"
784 "uaddlp v8.4s, v8.8h\n"
785 "uaddlp v9.4s, v9.8h\n"
786 "addp v8.4s, v8.4s, v9.4s\n"
787 "addp v8.4s, v8.4s, v8.4s\n"
788 "mul v8.4s, v8.4s, v0.s[0]\n"
789 "add v8.4s, v8.4s, v1.4s\n"
790 "st1 {v8.4s}, [%x[out]]\n"
791 : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
792 : [stride] "r"(params.stride),
793 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
794 [additive_sum_offset] "r"(params.additive_sum_offset)
795 : "x0", "v8", "v9", "v0", "v1", "cc", "memory");
796 }
797
798 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)799 inline void Stream<uint8_t, 2, 8, 6, RowMajorWithSum>::Pack(
800 const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
801 #ifdef DEBUG
802 #ifdef DEBUG_METAGEMM_VERBOSE
803 std::cout << __FILE__ << "(" << __LINE__
804 << ") RowMajorWithSum<uint8_t, 2, 8, 6, RowMajorWithSum>::Pack()"
805 << std::endl
806 << std::flush;
807 #endif
808 #endif
809 int params_count_copy = params.count;
810 asm volatile(
811 "add x0, %x[in], %x[stride]\n"
812 "movi v8.8h, #0\n"
813 "movi v9.8h, #0\n"
814
815 // Reduce count by leftovers.
816 "subs %x[count], %x[count], #6\n"
817 "beq 2f\n"
818
819 "1:"
820 "subs %x[count], %x[count], #8\n"
821
822 // Load Aggregate Store: 2x8.
823 "ld1 {v0.2s}, [%x[in]], #8\n"
824 "ld1 {v1.2s}, [x0], #8\n"
825 "uaddw v8.8h, v8.8h, v0.8b\n"
826 "uaddw v9.8h, v9.8h, v1.8b\n"
827 "st1 {v0.2s, v1.2s}, [%x[out]], #16\n"
828
829 "bne 1b\n"
830
831 "2:"
832
833 // Load Aggregate Store: 2x6.
834 "movi v0.8b, #0\n"
835 "movi v1.8b, #0\n"
836 "ld1 {v0.s}[0], [%x[in]], #4\n"
837 "ld1 {v0.h}[2], [%x[in]], #2\n"
838 "ld1 {v1.s}[0], [x0], #4\n"
839 "ld1 {v1.h}[2], [x0], #2\n"
840 "uaddw v8.8h, v8.8h, v0.8b\n"
841 "uaddw v9.8h, v9.8h, v1.8b\n"
842 "st1 {v0.2s, v1.2s}, [%x[out]], #16\n"
843
844 // Aggregator Reduction.
845 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
846 "dup v1.4s, %w[additive_sum_offset]\n"
847 "uaddlp v8.4s, v8.8h\n"
848 "uaddlp v9.4s, v9.8h\n"
849 "addp v8.4s, v8.4s, v9.4s\n"
850 "addp v8.4s, v8.4s, v8.4s\n"
851 "mul v8.4s, v8.4s, v0.s[0]\n"
852 "add v8.4s, v8.4s, v1.4s\n"
853 "st1 {v8.4s}, [%x[out]]\n"
854 : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
855 : [stride] "r"(params.stride),
856 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
857 [additive_sum_offset] "r"(params.additive_sum_offset)
858 : "x0", "v8", "v9", "v0", "v1", "cc", "memory");
859 }
860
861 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)862 inline void Stream<uint8_t, 2, 8, 7, RowMajorWithSum>::Pack(
863 const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
864 #ifdef DEBUG
865 #ifdef DEBUG_METAGEMM_VERBOSE
866 std::cout << __FILE__ << "(" << __LINE__
867 << ") RowMajorWithSum<uint8_t, 2, 8, 7, RowMajorWithSum>::Pack()"
868 << std::endl
869 << std::flush;
870 #endif
871 #endif
872 int params_count_copy = params.count;
873 asm volatile(
874 "add x0, %x[in], %x[stride]\n"
875 "movi v8.8h, #0\n"
876 "movi v9.8h, #0\n"
877
878 // Reduce count by leftovers.
879 "subs %x[count], %x[count], #7\n"
880 "beq 2f\n"
881
882 "1:"
883 "subs %x[count], %x[count], #8\n"
884
885 // Load Aggregate Store: 2x8.
886 "ld1 {v0.2s}, [%x[in]], #8\n"
887 "ld1 {v1.2s}, [x0], #8\n"
888 "uaddw v8.8h, v8.8h, v0.8b\n"
889 "uaddw v9.8h, v9.8h, v1.8b\n"
890 "st1 {v0.2s, v1.2s}, [%x[out]], #16\n"
891
892 "bne 1b\n"
893
894 "2:"
895
896 // Load Aggregate Store: 2x7.
897 "movi v0.8b, #0\n"
898 "movi v1.8b, #0\n"
899 "ld1 {v0.s}[0], [%x[in]], #4\n"
900 "ld1 {v0.h}[2], [%x[in]], #2\n"
901 "ld1 {v0.b}[6], [%x[in]], #1\n"
902 "ld1 {v1.s}[0], [x0], #4\n"
903 "ld1 {v1.h}[2], [x0], #2\n"
904 "ld1 {v1.b}[6], [x0], #1\n"
905 "uaddw v8.8h, v8.8h, v0.8b\n"
906 "uaddw v9.8h, v9.8h, v1.8b\n"
907 "st1 {v0.2s, v1.2s}, [%x[out]], #16\n"
908
909 // Aggregator Reduction.
910 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
911 "dup v1.4s, %w[additive_sum_offset]\n"
912 "uaddlp v8.4s, v8.8h\n"
913 "uaddlp v9.4s, v9.8h\n"
914 "addp v8.4s, v8.4s, v9.4s\n"
915 "addp v8.4s, v8.4s, v8.4s\n"
916 "mul v8.4s, v8.4s, v0.s[0]\n"
917 "add v8.4s, v8.4s, v1.4s\n"
918 "st1 {v8.4s}, [%x[out]]\n"
919 : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
920 : [stride] "r"(params.stride),
921 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
922 [additive_sum_offset] "r"(params.additive_sum_offset)
923 : "x0", "v8", "v9", "v0", "v1", "cc", "memory");
924 }
925
926 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)927 inline void Stream<uint8_t, 3, 8, 0, RowMajorWithSum>::Pack(
928 const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
929 #ifdef DEBUG
930 #ifdef DEBUG_METAGEMM_VERBOSE
931 std::cout << __FILE__ << "(" << __LINE__
932 << ") RowMajorWithSum<uint8_t, 3, 8, 0, RowMajorWithSum>::Pack()"
933 << std::endl
934 << std::flush;
935 #endif
936 #endif
937 int params_count_copy = params.count;
938 asm volatile(
939 "add x0, %x[in], %x[stride]\n"
940 "add x1, x0, %x[stride]\n"
941 "movi v8.8h, #0\n"
942 "movi v9.8h, #0\n"
943 "movi v10.8h, #0\n"
944
945 "1:"
946 "subs %x[count], %x[count], #8\n"
947
948 // Load Aggregate Store: 3x8.
949 "ld1 {v0.2s}, [%x[in]], #8\n"
950 "ld1 {v1.2s}, [x0], #8\n"
951 "ld1 {v2.2s}, [x1], #8\n"
952 "uaddw v8.8h, v8.8h, v0.8b\n"
953 "uaddw v9.8h, v9.8h, v1.8b\n"
954 "uaddw v10.8h, v10.8h, v2.8b\n"
955 "st1 {v0.2s, v1.2s, v2.2s}, [%x[out]], #24\n"
956
957 "bne 1b\n"
958
959 // Aggregator Reduction.
960 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
961 "dup v1.4s, %w[additive_sum_offset]\n"
962 "uaddlp v8.4s, v8.8h\n"
963 "uaddlp v9.4s, v9.8h\n"
964 "uaddlp v10.4s, v10.8h\n"
965 "addp v8.4s, v8.4s, v9.4s\n"
966 "addp v10.4s, v10.4s, v10.4s\n"
967 "addp v8.4s, v8.4s, v10.4s\n"
968 "mul v8.4s, v8.4s, v0.s[0]\n"
969 "add v8.4s, v8.4s, v1.4s\n"
970 "st1 {v8.4s}, [%x[out]]\n"
971 : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
972 : [stride] "r"(params.stride),
973 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
974 [additive_sum_offset] "r"(params.additive_sum_offset)
975 : "x0", "x1", "v0", "v1", "v2", "v8", "v9", "v10", "cc", "memory");
976 }
977
978 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)979 inline void Stream<uint8_t, 3, 8, 1, RowMajorWithSum>::Pack(
980 const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
981 #ifdef DEBUG
982 #ifdef DEBUG_METAGEMM_VERBOSE
983 std::cout << __FILE__ << "(" << __LINE__
984 << ") RowMajorWithSum<uint8_t, 3, 8, 1, RowMajorWithSum>::Pack()"
985 << std::endl
986 << std::flush;
987 #endif
988 #endif
989 int params_count_copy = params.count;
990 asm volatile(
991 "add x0, %x[in], %x[stride]\n"
992 "add x1, x0, %x[stride]\n"
993 "movi v8.8h, #0\n"
994 "movi v9.8h, #0\n"
995 "movi v10.8h, #0\n"
996
997 // Reduce count by leftovers.
998 "subs %x[count], %x[count], #1\n"
999 "beq 2f\n"
1000
1001 "1:"
1002 "subs %x[count], %x[count], #8\n"
1003
1004 // Load Aggregate Store: 3x8.
1005 "ld1 {v0.2s}, [%x[in]], #8\n"
1006 "ld1 {v1.2s}, [x0], #8\n"
1007 "ld1 {v2.2s}, [x1], #8\n"
1008 "uaddw v8.8h, v8.8h, v0.8b\n"
1009 "uaddw v9.8h, v9.8h, v1.8b\n"
1010 "uaddw v10.8h, v10.8h, v2.8b\n"
1011 "st1 {v0.2s, v1.2s, v2.2s}, [%x[out]], #24\n"
1012
1013 "bne 1b\n"
1014
1015 "2:"
1016
1017 // Load Aggregate Store: 3x1.
1018 "movi v0.8b, #0\n"
1019 "movi v1.8b, #0\n"
1020 "movi v2.8b, #0\n"
1021 "ld1 {v0.b}[0], [%x[in]], #1\n"
1022 "ld1 {v1.b}[0], [x0], #1\n"
1023 "ld1 {v2.b}[0], [x1], #1\n"
1024 "uaddw v8.8h, v8.8h, v0.8b\n"
1025 "uaddw v9.8h, v9.8h, v1.8b\n"
1026 "uaddw v10.8h, v10.8h, v2.8b\n"
1027 "st1 {v0.2s, v1.2s, v2.2s}, [%x[out]], #24\n"
1028
1029 // Aggregator Reduction.
1030 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
1031 "dup v1.4s, %w[additive_sum_offset]\n"
1032 "uaddlp v8.4s, v8.8h\n"
1033 "uaddlp v9.4s, v9.8h\n"
1034 "uaddlp v10.4s, v10.8h\n"
1035 "addp v8.4s, v8.4s, v9.4s\n"
1036 "addp v10.4s, v10.4s, v10.4s\n"
1037 "addp v8.4s, v8.4s, v10.4s\n"
1038 "mul v8.4s, v8.4s, v0.s[0]\n"
1039 "add v8.4s, v8.4s, v1.4s\n"
1040 "st1 {v8.4s}, [%x[out]]\n"
1041 : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
1042 : [stride] "r"(params.stride),
1043 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
1044 [additive_sum_offset] "r"(params.additive_sum_offset)
1045 : "x0", "x1", "v0", "v1", "v2", "v8", "v9", "v10", "cc", "memory");
1046 }
1047
1048 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)1049 inline void Stream<uint8_t, 3, 8, 2, RowMajorWithSum>::Pack(
1050 const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
1051 #ifdef DEBUG
1052 #ifdef DEBUG_METAGEMM_VERBOSE
1053 std::cout << __FILE__ << "(" << __LINE__
1054 << ") RowMajorWithSum<uint8_t, 3, 8, 2, RowMajorWithSum>::Pack()"
1055 << std::endl
1056 << std::flush;
1057 #endif
1058 #endif
1059 int params_count_copy = params.count;
1060 asm volatile(
1061 "add x0, %x[in], %x[stride]\n"
1062 "add x1, x0, %x[stride]\n"
1063 "movi v8.8h, #0\n"
1064 "movi v9.8h, #0\n"
1065 "movi v10.8h, #0\n"
1066
1067 // Reduce count by leftovers.
1068 "subs %x[count], %x[count], #2\n"
1069 "beq 2f\n"
1070
1071 "1:"
1072 "subs %x[count], %x[count], #8\n"
1073
1074 // Load Aggregate Store: 3x8.
1075 "ld1 {v0.2s}, [%x[in]], #8\n"
1076 "ld1 {v1.2s}, [x0], #8\n"
1077 "ld1 {v2.2s}, [x1], #8\n"
1078 "uaddw v8.8h, v8.8h, v0.8b\n"
1079 "uaddw v9.8h, v9.8h, v1.8b\n"
1080 "uaddw v10.8h, v10.8h, v2.8b\n"
1081 "st1 {v0.2s, v1.2s, v2.2s}, [%x[out]], #24\n"
1082
1083 "bne 1b\n"
1084
1085 "2:"
1086
1087 // Load Aggregate Store: 3x2.
1088 "movi v0.8b, #0\n"
1089 "movi v1.8b, #0\n"
1090 "movi v2.8b, #0\n"
1091 "ld1 {v0.h}[0], [%x[in]], #2\n"
1092 "ld1 {v1.h}[0], [x0], #2\n"
1093 "ld1 {v2.h}[0], [x1], #2\n"
1094 "uaddw v8.8h, v8.8h, v0.8b\n"
1095 "uaddw v9.8h, v9.8h, v1.8b\n"
1096 "uaddw v10.8h, v10.8h, v2.8b\n"
1097 "st1 {v0.2s, v1.2s, v2.2s}, [%x[out]], #24\n"
1098
1099 // Aggregator Reduction.
1100 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
1101 "dup v1.4s, %w[additive_sum_offset]\n"
1102 "uaddlp v8.4s, v8.8h\n"
1103 "uaddlp v9.4s, v9.8h\n"
1104 "uaddlp v10.4s, v10.8h\n"
1105 "addp v8.4s, v8.4s, v9.4s\n"
1106 "addp v10.4s, v10.4s, v10.4s\n"
1107 "addp v8.4s, v8.4s, v10.4s\n"
1108 "mul v8.4s, v8.4s, v0.s[0]\n"
1109 "add v8.4s, v8.4s, v1.4s\n"
1110 "st1 {v8.4s}, [%x[out]]\n"
1111 : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
1112 : [stride] "r"(params.stride),
1113 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
1114 [additive_sum_offset] "r"(params.additive_sum_offset)
1115 : "x0", "x1", "v0", "v1", "v2", "v8", "v9", "v10", "cc", "memory");
1116 }
1117
1118 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)1119 inline void Stream<uint8_t, 3, 8, 3, RowMajorWithSum>::Pack(
1120 const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
1121 #ifdef DEBUG
1122 #ifdef DEBUG_METAGEMM_VERBOSE
1123 std::cout << __FILE__ << "(" << __LINE__
1124 << ") RowMajorWithSum<uint8_t, 3, 8, 3, RowMajorWithSum>::Pack()"
1125 << std::endl
1126 << std::flush;
1127 #endif
1128 #endif
1129 int params_count_copy = params.count;
1130 asm volatile(
1131 "add x0, %x[in], %x[stride]\n"
1132 "add x1, x0, %x[stride]\n"
1133 "movi v8.8h, #0\n"
1134 "movi v9.8h, #0\n"
1135 "movi v10.8h, #0\n"
1136
1137 // Reduce count by leftovers.
1138 "subs %x[count], %x[count], #3\n"
1139 "beq 2f\n"
1140
1141 "1:"
1142 "subs %x[count], %x[count], #8\n"
1143
1144 // Load Aggregate Store: 3x8.
1145 "ld1 {v0.2s}, [%x[in]], #8\n"
1146 "ld1 {v1.2s}, [x0], #8\n"
1147 "ld1 {v2.2s}, [x1], #8\n"
1148 "uaddw v8.8h, v8.8h, v0.8b\n"
1149 "uaddw v9.8h, v9.8h, v1.8b\n"
1150 "uaddw v10.8h, v10.8h, v2.8b\n"
1151 "st1 {v0.2s, v1.2s, v2.2s}, [%x[out]], #24\n"
1152
1153 "bne 1b\n"
1154
1155 "2:"
1156
1157 // Load Aggregate Store: 3x3.
1158 "movi v0.8b, #0\n"
1159 "movi v1.8b, #0\n"
1160 "movi v2.8b, #0\n"
1161 "ld1 {v0.h}[0], [%x[in]], #2\n"
1162 "ld1 {v0.b}[2], [%x[in]], #1\n"
1163 "ld1 {v1.h}[0], [x0], #2\n"
1164 "ld1 {v1.b}[2], [x0], #1\n"
1165 "ld1 {v2.h}[0], [x1], #2\n"
1166 "ld1 {v2.b}[2], [x1], #1\n"
1167 "uaddw v8.8h, v8.8h, v0.8b\n"
1168 "uaddw v9.8h, v9.8h, v1.8b\n"
1169 "uaddw v10.8h, v10.8h, v2.8b\n"
1170 "st1 {v0.2s, v1.2s, v2.2s}, [%x[out]], #24\n"
1171
1172 // Aggregator Reduction.
1173 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
1174 "dup v1.4s, %w[additive_sum_offset]\n"
1175 "uaddlp v8.4s, v8.8h\n"
1176 "uaddlp v9.4s, v9.8h\n"
1177 "uaddlp v10.4s, v10.8h\n"
1178 "addp v8.4s, v8.4s, v9.4s\n"
1179 "addp v10.4s, v10.4s, v10.4s\n"
1180 "addp v8.4s, v8.4s, v10.4s\n"
1181 "mul v8.4s, v8.4s, v0.s[0]\n"
1182 "add v8.4s, v8.4s, v1.4s\n"
1183 "st1 {v8.4s}, [%x[out]]\n"
1184 : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
1185 : [stride] "r"(params.stride),
1186 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
1187 [additive_sum_offset] "r"(params.additive_sum_offset)
1188 : "x0", "x1", "v0", "v1", "v2", "v8", "v9", "v10", "cc", "memory");
1189 }
1190
1191 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)1192 inline void Stream<uint8_t, 3, 8, 4, RowMajorWithSum>::Pack(
1193 const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
1194 #ifdef DEBUG
1195 #ifdef DEBUG_METAGEMM_VERBOSE
1196 std::cout << __FILE__ << "(" << __LINE__
1197 << ") RowMajorWithSum<uint8_t, 3, 8, 4, RowMajorWithSum>::Pack()"
1198 << std::endl
1199 << std::flush;
1200 #endif
1201 #endif
1202 int params_count_copy = params.count;
1203 asm volatile(
1204 "add x0, %x[in], %x[stride]\n"
1205 "add x1, x0, %x[stride]\n"
1206 "movi v8.8h, #0\n"
1207 "movi v9.8h, #0\n"
1208 "movi v10.8h, #0\n"
1209
1210 // Reduce count by leftovers.
1211 "subs %x[count], %x[count], #4\n"
1212 "beq 2f\n"
1213
1214 "1:"
1215 "subs %x[count], %x[count], #8\n"
1216
1217 // Load Aggregate Store: 3x8.
1218 "ld1 {v0.2s}, [%x[in]], #8\n"
1219 "ld1 {v1.2s}, [x0], #8\n"
1220 "ld1 {v2.2s}, [x1], #8\n"
1221 "uaddw v8.8h, v8.8h, v0.8b\n"
1222 "uaddw v9.8h, v9.8h, v1.8b\n"
1223 "uaddw v10.8h, v10.8h, v2.8b\n"
1224 "st1 {v0.2s, v1.2s, v2.2s}, [%x[out]], #24\n"
1225
1226 "bne 1b\n"
1227
1228 "2:"
1229
1230 // Load Aggregate Store: 3x4.
1231 "movi v0.8b, #0\n"
1232 "movi v1.8b, #0\n"
1233 "movi v2.8b, #0\n"
1234 "ld1 {v0.s}[0], [%x[in]], #4\n"
1235 "ld1 {v1.s}[0], [x0], #4\n"
1236 "ld1 {v2.s}[0], [x1], #4\n"
1237 "uaddw v8.8h, v8.8h, v0.8b\n"
1238 "uaddw v9.8h, v9.8h, v1.8b\n"
1239 "uaddw v10.8h, v10.8h, v2.8b\n"
1240 "st1 {v0.2s, v1.2s, v2.2s}, [%x[out]], #24\n"
1241
1242 // Aggregator Reduction.
1243 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
1244 "dup v1.4s, %w[additive_sum_offset]\n"
1245 "uaddlp v8.4s, v8.8h\n"
1246 "uaddlp v9.4s, v9.8h\n"
1247 "uaddlp v10.4s, v10.8h\n"
1248 "addp v8.4s, v8.4s, v9.4s\n"
1249 "addp v10.4s, v10.4s, v10.4s\n"
1250 "addp v8.4s, v8.4s, v10.4s\n"
1251 "mul v8.4s, v8.4s, v0.s[0]\n"
1252 "add v8.4s, v8.4s, v1.4s\n"
1253 "st1 {v8.4s}, [%x[out]]\n"
1254 : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
1255 : [stride] "r"(params.stride),
1256 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
1257 [additive_sum_offset] "r"(params.additive_sum_offset)
1258 : "x0", "x1", "v0", "v1", "v2", "v8", "v9", "v10", "cc", "memory");
1259 }
1260
1261 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)1262 inline void Stream<uint8_t, 3, 8, 5, RowMajorWithSum>::Pack(
1263 const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
1264 #ifdef DEBUG
1265 #ifdef DEBUG_METAGEMM_VERBOSE
1266 std::cout << __FILE__ << "(" << __LINE__
1267 << ") RowMajorWithSum<uint8_t, 3, 8, 5, RowMajorWithSum>::Pack()"
1268 << std::endl
1269 << std::flush;
1270 #endif
1271 #endif
1272 int params_count_copy = params.count;
1273 asm volatile(
1274 "add x0, %x[in], %x[stride]\n"
1275 "add x1, x0, %x[stride]\n"
1276 "movi v8.8h, #0\n"
1277 "movi v9.8h, #0\n"
1278 "movi v10.8h, #0\n"
1279
1280 // Reduce count by leftovers.
1281 "subs %x[count], %x[count], #5\n"
1282 "beq 2f\n"
1283
1284 "1:"
1285 "subs %x[count], %x[count], #8\n"
1286
1287 // Load Aggregate Store: 3x8.
1288 "ld1 {v0.2s}, [%x[in]], #8\n"
1289 "ld1 {v1.2s}, [x0], #8\n"
1290 "ld1 {v2.2s}, [x1], #8\n"
1291 "uaddw v8.8h, v8.8h, v0.8b\n"
1292 "uaddw v9.8h, v9.8h, v1.8b\n"
1293 "uaddw v10.8h, v10.8h, v2.8b\n"
1294 "st1 {v0.2s, v1.2s, v2.2s}, [%x[out]], #24\n"
1295
1296 "bne 1b\n"
1297
1298 "2:"
1299
1300 // Load Aggregate Store: 3x5.
1301 "movi v0.8b, #0\n"
1302 "movi v1.8b, #0\n"
1303 "movi v2.8b, #0\n"
1304 "ld1 {v0.s}[0], [%x[in]], #4\n"
1305 "ld1 {v0.b}[4], [%x[in]], #1\n"
1306 "ld1 {v1.s}[0], [x0], #4\n"
1307 "ld1 {v1.b}[4], [x0], #1\n"
1308 "ld1 {v2.s}[0], [x1], #4\n"
1309 "ld1 {v2.b}[4], [x1], #1\n"
1310 "uaddw v8.8h, v8.8h, v0.8b\n"
1311 "uaddw v9.8h, v9.8h, v1.8b\n"
1312 "uaddw v10.8h, v10.8h, v2.8b\n"
1313 "st1 {v0.2s, v1.2s, v2.2s}, [%x[out]], #24\n"
1314
1315 // Aggregator Reduction.
1316 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
1317 "dup v1.4s, %w[additive_sum_offset]\n"
1318 "uaddlp v8.4s, v8.8h\n"
1319 "uaddlp v9.4s, v9.8h\n"
1320 "uaddlp v10.4s, v10.8h\n"
1321 "addp v8.4s, v8.4s, v9.4s\n"
1322 "addp v10.4s, v10.4s, v10.4s\n"
1323 "addp v8.4s, v8.4s, v10.4s\n"
1324 "mul v8.4s, v8.4s, v0.s[0]\n"
1325 "add v8.4s, v8.4s, v1.4s\n"
1326 "st1 {v8.4s}, [%x[out]]\n"
1327 : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
1328 : [stride] "r"(params.stride),
1329 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
1330 [additive_sum_offset] "r"(params.additive_sum_offset)
1331 : "x0", "x1", "v0", "v1", "v2", "v8", "v9", "v10", "cc", "memory");
1332 }
1333
1334 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)1335 inline void Stream<uint8_t, 3, 8, 6, RowMajorWithSum>::Pack(
1336 const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
1337 #ifdef DEBUG
1338 #ifdef DEBUG_METAGEMM_VERBOSE
1339 std::cout << __FILE__ << "(" << __LINE__
1340 << ") RowMajorWithSum<uint8_t, 3, 8, 6, RowMajorWithSum>::Pack()"
1341 << std::endl
1342 << std::flush;
1343 #endif
1344 #endif
1345 int params_count_copy = params.count;
1346 asm volatile(
1347 "add x0, %x[in], %x[stride]\n"
1348 "add x1, x0, %x[stride]\n"
1349 "movi v8.8h, #0\n"
1350 "movi v9.8h, #0\n"
1351 "movi v10.8h, #0\n"
1352
1353 // Reduce count by leftovers.
1354 "subs %x[count], %x[count], #6\n"
1355 "beq 2f\n"
1356
1357 "1:"
1358 "subs %x[count], %x[count], #8\n"
1359
1360 // Load Aggregate Store: 3x8.
1361 "ld1 {v0.2s}, [%x[in]], #8\n"
1362 "ld1 {v1.2s}, [x0], #8\n"
1363 "ld1 {v2.2s}, [x1], #8\n"
1364 "uaddw v8.8h, v8.8h, v0.8b\n"
1365 "uaddw v9.8h, v9.8h, v1.8b\n"
1366 "uaddw v10.8h, v10.8h, v2.8b\n"
1367 "st1 {v0.2s, v1.2s, v2.2s}, [%x[out]], #24\n"
1368
1369 "bne 1b\n"
1370
1371 "2:"
1372
1373 // Load Aggregate Store: 3x6.
1374 "movi v0.8b, #0\n"
1375 "movi v1.8b, #0\n"
1376 "movi v2.8b, #0\n"
1377 "ld1 {v0.s}[0], [%x[in]], #4\n"
1378 "ld1 {v0.h}[2], [%x[in]], #2\n"
1379 "ld1 {v1.s}[0], [x0], #4\n"
1380 "ld1 {v1.h}[2], [x0], #2\n"
1381 "ld1 {v2.s}[0], [x1], #4\n"
1382 "ld1 {v2.h}[2], [x1], #2\n"
1383 "uaddw v8.8h, v8.8h, v0.8b\n"
1384 "uaddw v9.8h, v9.8h, v1.8b\n"
1385 "uaddw v10.8h, v10.8h, v2.8b\n"
1386 "st1 {v0.2s, v1.2s, v2.2s}, [%x[out]], #24\n"
1387
1388 // Aggregator Reduction.
1389 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
1390 "dup v1.4s, %w[additive_sum_offset]\n"
1391 "uaddlp v8.4s, v8.8h\n"
1392 "uaddlp v9.4s, v9.8h\n"
1393 "uaddlp v10.4s, v10.8h\n"
1394 "addp v8.4s, v8.4s, v9.4s\n"
1395 "addp v10.4s, v10.4s, v10.4s\n"
1396 "addp v8.4s, v8.4s, v10.4s\n"
1397 "mul v8.4s, v8.4s, v0.s[0]\n"
1398 "add v8.4s, v8.4s, v1.4s\n"
1399 "st1 {v8.4s}, [%x[out]]\n"
1400 : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
1401 : [stride] "r"(params.stride),
1402 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
1403 [additive_sum_offset] "r"(params.additive_sum_offset)
1404 : "x0", "x1", "v0", "v1", "v2", "v8", "v9", "v10", "cc", "memory");
1405 }
1406
1407 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)1408 inline void Stream<uint8_t, 3, 8, 7, RowMajorWithSum>::Pack(
1409 const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
1410 #ifdef DEBUG
1411 #ifdef DEBUG_METAGEMM_VERBOSE
1412 std::cout << __FILE__ << "(" << __LINE__
1413 << ") RowMajorWithSum<uint8_t, 3, 8, 7, RowMajorWithSum>::Pack()"
1414 << std::endl
1415 << std::flush;
1416 #endif
1417 #endif
1418 int params_count_copy = params.count;
1419 asm volatile(
1420 "add x0, %x[in], %x[stride]\n"
1421 "add x1, x0, %x[stride]\n"
1422 "movi v8.8h, #0\n"
1423 "movi v9.8h, #0\n"
1424 "movi v10.8h, #0\n"
1425
1426 // Reduce count by leftovers.
1427 "subs %x[count], %x[count], #7\n"
1428 "beq 2f\n"
1429
1430 "1:"
1431 "subs %x[count], %x[count], #8\n"
1432
1433 // Load Aggregate Store: 3x8.
1434 "ld1 {v0.2s}, [%x[in]], #8\n"
1435 "ld1 {v1.2s}, [x0], #8\n"
1436 "ld1 {v2.2s}, [x1], #8\n"
1437 "uaddw v8.8h, v8.8h, v0.8b\n"
1438 "uaddw v9.8h, v9.8h, v1.8b\n"
1439 "uaddw v10.8h, v10.8h, v2.8b\n"
1440 "st1 {v0.2s, v1.2s, v2.2s}, [%x[out]], #24\n"
1441
1442 "bne 1b\n"
1443
1444 "2:"
1445
1446 // Load Aggregate Store: 3x7.
1447 "movi v0.8b, #0\n"
1448 "movi v1.8b, #0\n"
1449 "movi v2.8b, #0\n"
1450 "ld1 {v0.s}[0], [%x[in]], #4\n"
1451 "ld1 {v0.h}[2], [%x[in]], #2\n"
1452 "ld1 {v0.b}[6], [%x[in]], #1\n"
1453 "ld1 {v1.s}[0], [x0], #4\n"
1454 "ld1 {v1.h}[2], [x0], #2\n"
1455 "ld1 {v1.b}[6], [x0], #1\n"
1456 "ld1 {v2.s}[0], [x1], #4\n"
1457 "ld1 {v2.h}[2], [x1], #2\n"
1458 "ld1 {v2.b}[6], [x1], #1\n"
1459 "uaddw v8.8h, v8.8h, v0.8b\n"
1460 "uaddw v9.8h, v9.8h, v1.8b\n"
1461 "uaddw v10.8h, v10.8h, v2.8b\n"
1462 "st1 {v0.2s, v1.2s, v2.2s}, [%x[out]], #24\n"
1463
1464 // Aggregator Reduction.
1465 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
1466 "dup v1.4s, %w[additive_sum_offset]\n"
1467 "uaddlp v8.4s, v8.8h\n"
1468 "uaddlp v9.4s, v9.8h\n"
1469 "uaddlp v10.4s, v10.8h\n"
1470 "addp v8.4s, v8.4s, v9.4s\n"
1471 "addp v10.4s, v10.4s, v10.4s\n"
1472 "addp v8.4s, v8.4s, v10.4s\n"
1473 "mul v8.4s, v8.4s, v0.s[0]\n"
1474 "add v8.4s, v8.4s, v1.4s\n"
1475 "st1 {v8.4s}, [%x[out]]\n"
1476 : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
1477 : [stride] "r"(params.stride),
1478 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
1479 [additive_sum_offset] "r"(params.additive_sum_offset)
1480 : "x0", "x1", "v0", "v1", "v2", "v8", "v9", "v10", "cc", "memory");
1481 }
1482
1483 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)1484 inline void Stream<uint8_t, 4, 8, 0, RowMajorWithSum>::Pack(
1485 const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
1486 #ifdef DEBUG
1487 #ifdef DEBUG_METAGEMM_VERBOSE
1488 std::cout << __FILE__ << "(" << __LINE__
1489 << ") RowMajorWithSum<uint8_t, 4, 8, 0, RowMajorWithSum>::Pack()"
1490 << std::endl
1491 << std::flush;
1492 #endif
1493 #endif
1494 int params_count_copy = params.count;
1495 asm volatile(
1496 "add x0, %x[in], %x[stride]\n"
1497 "add x1, x0, %x[stride]\n"
1498 "add x2, x1, %x[stride]\n"
1499 "movi v8.8h, #0\n"
1500 "movi v9.8h, #0\n"
1501 "movi v10.8h, #0\n"
1502 "movi v11.8h, #0\n"
1503
1504 "1:"
1505 "subs %x[count], %x[count], #8\n"
1506
1507 // Load Aggregate Store: 4x8.
1508 "ld1 {v0.2s}, [%x[in]], #8\n"
1509 "ld1 {v1.2s}, [x0], #8\n"
1510 "ld1 {v2.2s}, [x1], #8\n"
1511 "ld1 {v3.2s}, [x2], #8\n"
1512 "uaddw v8.8h, v8.8h, v0.8b\n"
1513 "uaddw v9.8h, v9.8h, v1.8b\n"
1514 "uaddw v10.8h, v10.8h, v2.8b\n"
1515 "uaddw v11.8h, v11.8h, v3.8b\n"
1516 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
1517
1518 "bne 1b\n"
1519
1520 // Aggregator Reduction.
1521 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
1522 "dup v1.4s, %w[additive_sum_offset]\n"
1523 "uaddlp v8.4s, v8.8h\n"
1524 "uaddlp v9.4s, v9.8h\n"
1525 "uaddlp v10.4s, v10.8h\n"
1526 "uaddlp v11.4s, v11.8h\n"
1527 "addp v8.4s, v8.4s, v9.4s\n"
1528 "addp v10.4s, v10.4s, v11.4s\n"
1529 "addp v8.4s, v8.4s, v10.4s\n"
1530 "mul v8.4s, v8.4s, v0.s[0]\n"
1531 "add v8.4s, v8.4s, v1.4s\n"
1532 "st1 {v8.4s}, [%x[out]]\n"
1533 : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
1534 : [stride] "r"(params.stride),
1535 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
1536 [additive_sum_offset] "r"(params.additive_sum_offset)
1537 : "x0", "x1", "x2", "v0", "v1", "v2", "v3", "v8", "v9", "v10", "v11",
1538 "cc", "memory");
1539 }
1540
1541 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)1542 inline void Stream<uint8_t, 4, 8, 1, RowMajorWithSum>::Pack(
1543 const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
1544 #ifdef DEBUG
1545 #ifdef DEBUG_METAGEMM_VERBOSE
1546 std::cout << __FILE__ << "(" << __LINE__
1547 << ") RowMajorWithSum<uint8_t, 4, 8, 1, RowMajorWithSum>::Pack()"
1548 << std::endl
1549 << std::flush;
1550 #endif
1551 #endif
1552 int params_count_copy = params.count;
1553 asm volatile(
1554 "add x0, %x[in], %x[stride]\n"
1555 "add x1, x0, %x[stride]\n"
1556 "add x2, x1, %x[stride]\n"
1557 "movi v8.8h, #0\n"
1558 "movi v9.8h, #0\n"
1559 "movi v10.8h, #0\n"
1560 "movi v11.8h, #0\n"
1561
1562 // Reduce count by leftovers.
1563 "subs %x[count], %x[count], #1\n"
1564 "beq 2f\n"
1565
1566 "1:"
1567 "subs %x[count], %x[count], #8\n"
1568
1569 // Load Aggregate Store: 4x8.
1570 "ld1 {v0.2s}, [%x[in]], #8\n"
1571 "ld1 {v1.2s}, [x0], #8\n"
1572 "ld1 {v2.2s}, [x1], #8\n"
1573 "ld1 {v3.2s}, [x2], #8\n"
1574 "uaddw v8.8h, v8.8h, v0.8b\n"
1575 "uaddw v9.8h, v9.8h, v1.8b\n"
1576 "uaddw v10.8h, v10.8h, v2.8b\n"
1577 "uaddw v11.8h, v11.8h, v3.8b\n"
1578 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
1579
1580 "bne 1b\n"
1581
1582 "2:"
1583
1584 // Load Aggregate Store: 4x1.
1585 "movi v0.8b, #0\n"
1586 "movi v1.8b, #0\n"
1587 "movi v2.8b, #0\n"
1588 "movi v3.8b, #0\n"
1589 "ld1 {v0.b}[0], [%x[in]], #1\n"
1590 "ld1 {v1.b}[0], [x0], #1\n"
1591 "ld1 {v2.b}[0], [x1], #1\n"
1592 "ld1 {v3.b}[0], [x2], #1\n"
1593 "uaddw v8.8h, v8.8h, v0.8b\n"
1594 "uaddw v9.8h, v9.8h, v1.8b\n"
1595 "uaddw v10.8h, v10.8h, v2.8b\n"
1596 "uaddw v11.8h, v11.8h, v3.8b\n"
1597 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
1598
1599 // Aggregator Reduction.
1600 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
1601 "dup v1.4s, %w[additive_sum_offset]\n"
1602 "uaddlp v8.4s, v8.8h\n"
1603 "uaddlp v9.4s, v9.8h\n"
1604 "uaddlp v10.4s, v10.8h\n"
1605 "uaddlp v11.4s, v11.8h\n"
1606 "addp v8.4s, v8.4s, v9.4s\n"
1607 "addp v10.4s, v10.4s, v11.4s\n"
1608 "addp v8.4s, v8.4s, v10.4s\n"
1609 "mul v8.4s, v8.4s, v0.s[0]\n"
1610 "add v8.4s, v8.4s, v1.4s\n"
1611 "st1 {v8.4s}, [%x[out]]\n"
1612 : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
1613 : [stride] "r"(params.stride),
1614 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
1615 [additive_sum_offset] "r"(params.additive_sum_offset)
1616 : "x0", "x1", "x2", "v0", "v1", "v2", "v3", "v8", "v9", "v10", "v11",
1617 "cc", "memory");
1618 }
1619
1620 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)1621 inline void Stream<uint8_t, 4, 8, 2, RowMajorWithSum>::Pack(
1622 const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
1623 #ifdef DEBUG
1624 #ifdef DEBUG_METAGEMM_VERBOSE
1625 std::cout << __FILE__ << "(" << __LINE__
1626 << ") RowMajorWithSum<uint8_t, 4, 8, 2, RowMajorWithSum>::Pack()"
1627 << std::endl
1628 << std::flush;
1629 #endif
1630 #endif
1631 int params_count_copy = params.count;
1632 asm volatile(
1633 "add x0, %x[in], %x[stride]\n"
1634 "add x1, x0, %x[stride]\n"
1635 "add x2, x1, %x[stride]\n"
1636 "movi v8.8h, #0\n"
1637 "movi v9.8h, #0\n"
1638 "movi v10.8h, #0\n"
1639 "movi v11.8h, #0\n"
1640
1641 // Reduce count by leftovers.
1642 "subs %x[count], %x[count], #2\n"
1643 "beq 2f\n"
1644
1645 "1:"
1646 "subs %x[count], %x[count], #8\n"
1647
1648 // Load Aggregate Store: 4x8.
1649 "ld1 {v0.2s}, [%x[in]], #8\n"
1650 "ld1 {v1.2s}, [x0], #8\n"
1651 "ld1 {v2.2s}, [x1], #8\n"
1652 "ld1 {v3.2s}, [x2], #8\n"
1653 "uaddw v8.8h, v8.8h, v0.8b\n"
1654 "uaddw v9.8h, v9.8h, v1.8b\n"
1655 "uaddw v10.8h, v10.8h, v2.8b\n"
1656 "uaddw v11.8h, v11.8h, v3.8b\n"
1657 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
1658
1659 "bne 1b\n"
1660
1661 "2:"
1662
1663 // Load Aggregate Store: 4x2.
1664 "movi v0.8b, #0\n"
1665 "movi v1.8b, #0\n"
1666 "movi v2.8b, #0\n"
1667 "movi v3.8b, #0\n"
1668 "ld1 {v0.h}[0], [%x[in]], #2\n"
1669 "ld1 {v1.h}[0], [x0], #2\n"
1670 "ld1 {v2.h}[0], [x1], #2\n"
1671 "ld1 {v3.h}[0], [x2], #2\n"
1672 "uaddw v8.8h, v8.8h, v0.8b\n"
1673 "uaddw v9.8h, v9.8h, v1.8b\n"
1674 "uaddw v10.8h, v10.8h, v2.8b\n"
1675 "uaddw v11.8h, v11.8h, v3.8b\n"
1676 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
1677
1678 // Aggregator Reduction.
1679 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
1680 "dup v1.4s, %w[additive_sum_offset]\n"
1681 "uaddlp v8.4s, v8.8h\n"
1682 "uaddlp v9.4s, v9.8h\n"
1683 "uaddlp v10.4s, v10.8h\n"
1684 "uaddlp v11.4s, v11.8h\n"
1685 "addp v8.4s, v8.4s, v9.4s\n"
1686 "addp v10.4s, v10.4s, v11.4s\n"
1687 "addp v8.4s, v8.4s, v10.4s\n"
1688 "mul v8.4s, v8.4s, v0.s[0]\n"
1689 "add v8.4s, v8.4s, v1.4s\n"
1690 "st1 {v8.4s}, [%x[out]]\n"
1691 : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
1692 : [stride] "r"(params.stride),
1693 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
1694 [additive_sum_offset] "r"(params.additive_sum_offset)
1695 : "x0", "x1", "x2", "v0", "v1", "v2", "v3", "v8", "v9", "v10", "v11",
1696 "cc", "memory");
1697 }
1698
1699 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)1700 inline void Stream<uint8_t, 4, 8, 3, RowMajorWithSum>::Pack(
1701 const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
1702 #ifdef DEBUG
1703 #ifdef DEBUG_METAGEMM_VERBOSE
1704 std::cout << __FILE__ << "(" << __LINE__
1705 << ") RowMajorWithSum<uint8_t, 4, 8, 3, RowMajorWithSum>::Pack()"
1706 << std::endl
1707 << std::flush;
1708 #endif
1709 #endif
1710 int params_count_copy = params.count;
1711 asm volatile(
1712 "add x0, %x[in], %x[stride]\n"
1713 "add x1, x0, %x[stride]\n"
1714 "add x2, x1, %x[stride]\n"
1715 "movi v8.8h, #0\n"
1716 "movi v9.8h, #0\n"
1717 "movi v10.8h, #0\n"
1718 "movi v11.8h, #0\n"
1719
1720 // Reduce count by leftovers.
1721 "subs %x[count], %x[count], #3\n"
1722 "beq 2f\n"
1723
1724 "1:"
1725 "subs %x[count], %x[count], #8\n"
1726
1727 // Load Aggregate Store: 4x8.
1728 "ld1 {v0.2s}, [%x[in]], #8\n"
1729 "ld1 {v1.2s}, [x0], #8\n"
1730 "ld1 {v2.2s}, [x1], #8\n"
1731 "ld1 {v3.2s}, [x2], #8\n"
1732 "uaddw v8.8h, v8.8h, v0.8b\n"
1733 "uaddw v9.8h, v9.8h, v1.8b\n"
1734 "uaddw v10.8h, v10.8h, v2.8b\n"
1735 "uaddw v11.8h, v11.8h, v3.8b\n"
1736 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
1737
1738 "bne 1b\n"
1739
1740 "2:"
1741
1742 // Load Aggregate Store: 4x3.
1743 "movi v0.8b, #0\n"
1744 "movi v1.8b, #0\n"
1745 "movi v2.8b, #0\n"
1746 "movi v3.8b, #0\n"
1747 "ld1 {v0.h}[0], [%x[in]], #2\n"
1748 "ld1 {v0.b}[2], [%x[in]], #1\n"
1749 "ld1 {v1.h}[0], [x0], #2\n"
1750 "ld1 {v1.b}[2], [x0], #1\n"
1751 "ld1 {v2.h}[0], [x1], #2\n"
1752 "ld1 {v2.b}[2], [x1], #1\n"
1753 "ld1 {v3.h}[0], [x2], #2\n"
1754 "ld1 {v3.b}[2], [x2], #1\n"
1755 "uaddw v8.8h, v8.8h, v0.8b\n"
1756 "uaddw v9.8h, v9.8h, v1.8b\n"
1757 "uaddw v10.8h, v10.8h, v2.8b\n"
1758 "uaddw v11.8h, v11.8h, v3.8b\n"
1759 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
1760
1761 // Aggregator Reduction.
1762 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
1763 "dup v1.4s, %w[additive_sum_offset]\n"
1764 "uaddlp v8.4s, v8.8h\n"
1765 "uaddlp v9.4s, v9.8h\n"
1766 "uaddlp v10.4s, v10.8h\n"
1767 "uaddlp v11.4s, v11.8h\n"
1768 "addp v8.4s, v8.4s, v9.4s\n"
1769 "addp v10.4s, v10.4s, v11.4s\n"
1770 "addp v8.4s, v8.4s, v10.4s\n"
1771 "mul v8.4s, v8.4s, v0.s[0]\n"
1772 "add v8.4s, v8.4s, v1.4s\n"
1773 "st1 {v8.4s}, [%x[out]]\n"
1774 : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
1775 : [stride] "r"(params.stride),
1776 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
1777 [additive_sum_offset] "r"(params.additive_sum_offset)
1778 : "x0", "x1", "x2", "v0", "v1", "v2", "v3", "v8", "v9", "v10", "v11",
1779 "cc", "memory");
1780 }
1781
1782 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)1783 inline void Stream<uint8_t, 4, 8, 4, RowMajorWithSum>::Pack(
1784 const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
1785 #ifdef DEBUG
1786 #ifdef DEBUG_METAGEMM_VERBOSE
1787 std::cout << __FILE__ << "(" << __LINE__
1788 << ") RowMajorWithSum<uint8_t, 4, 8, 4, RowMajorWithSum>::Pack()"
1789 << std::endl
1790 << std::flush;
1791 #endif
1792 #endif
1793 int params_count_copy = params.count;
1794 asm volatile(
1795 "add x0, %x[in], %x[stride]\n"
1796 "add x1, x0, %x[stride]\n"
1797 "add x2, x1, %x[stride]\n"
1798 "movi v8.8h, #0\n"
1799 "movi v9.8h, #0\n"
1800 "movi v10.8h, #0\n"
1801 "movi v11.8h, #0\n"
1802
1803 // Reduce count by leftovers.
1804 "subs %x[count], %x[count], #4\n"
1805 "beq 2f\n"
1806
1807 "1:"
1808 "subs %x[count], %x[count], #8\n"
1809
1810 // Load Aggregate Store: 4x8.
1811 "ld1 {v0.2s}, [%x[in]], #8\n"
1812 "ld1 {v1.2s}, [x0], #8\n"
1813 "ld1 {v2.2s}, [x1], #8\n"
1814 "ld1 {v3.2s}, [x2], #8\n"
1815 "uaddw v8.8h, v8.8h, v0.8b\n"
1816 "uaddw v9.8h, v9.8h, v1.8b\n"
1817 "uaddw v10.8h, v10.8h, v2.8b\n"
1818 "uaddw v11.8h, v11.8h, v3.8b\n"
1819 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
1820
1821 "bne 1b\n"
1822
1823 "2:"
1824
1825 // Load Aggregate Store: 4x4.
1826 "movi v0.8b, #0\n"
1827 "movi v1.8b, #0\n"
1828 "movi v2.8b, #0\n"
1829 "movi v3.8b, #0\n"
1830 "ld1 {v0.s}[0], [%x[in]], #4\n"
1831 "ld1 {v1.s}[0], [x0], #4\n"
1832 "ld1 {v2.s}[0], [x1], #4\n"
1833 "ld1 {v3.s}[0], [x2], #4\n"
1834 "uaddw v8.8h, v8.8h, v0.8b\n"
1835 "uaddw v9.8h, v9.8h, v1.8b\n"
1836 "uaddw v10.8h, v10.8h, v2.8b\n"
1837 "uaddw v11.8h, v11.8h, v3.8b\n"
1838 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
1839
1840 // Aggregator Reduction.
1841 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
1842 "dup v1.4s, %w[additive_sum_offset]\n"
1843 "uaddlp v8.4s, v8.8h\n"
1844 "uaddlp v9.4s, v9.8h\n"
1845 "uaddlp v10.4s, v10.8h\n"
1846 "uaddlp v11.4s, v11.8h\n"
1847 "addp v8.4s, v8.4s, v9.4s\n"
1848 "addp v10.4s, v10.4s, v11.4s\n"
1849 "addp v8.4s, v8.4s, v10.4s\n"
1850 "mul v8.4s, v8.4s, v0.s[0]\n"
1851 "add v8.4s, v8.4s, v1.4s\n"
1852 "st1 {v8.4s}, [%x[out]]\n"
1853 : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
1854 : [stride] "r"(params.stride),
1855 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
1856 [additive_sum_offset] "r"(params.additive_sum_offset)
1857 : "x0", "x1", "x2", "v0", "v1", "v2", "v3", "v8", "v9", "v10", "v11",
1858 "cc", "memory");
1859 }
1860
1861 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)1862 inline void Stream<uint8_t, 4, 8, 5, RowMajorWithSum>::Pack(
1863 const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
1864 #ifdef DEBUG
1865 #ifdef DEBUG_METAGEMM_VERBOSE
1866 std::cout << __FILE__ << "(" << __LINE__
1867 << ") RowMajorWithSum<uint8_t, 4, 8, 5, RowMajorWithSum>::Pack()"
1868 << std::endl
1869 << std::flush;
1870 #endif
1871 #endif
1872 int params_count_copy = params.count;
1873 asm volatile(
1874 "add x0, %x[in], %x[stride]\n"
1875 "add x1, x0, %x[stride]\n"
1876 "add x2, x1, %x[stride]\n"
1877 "movi v8.8h, #0\n"
1878 "movi v9.8h, #0\n"
1879 "movi v10.8h, #0\n"
1880 "movi v11.8h, #0\n"
1881
1882 // Reduce count by leftovers.
1883 "subs %x[count], %x[count], #5\n"
1884 "beq 2f\n"
1885
1886 "1:"
1887 "subs %x[count], %x[count], #8\n"
1888
1889 // Load Aggregate Store: 4x8.
1890 "ld1 {v0.2s}, [%x[in]], #8\n"
1891 "ld1 {v1.2s}, [x0], #8\n"
1892 "ld1 {v2.2s}, [x1], #8\n"
1893 "ld1 {v3.2s}, [x2], #8\n"
1894 "uaddw v8.8h, v8.8h, v0.8b\n"
1895 "uaddw v9.8h, v9.8h, v1.8b\n"
1896 "uaddw v10.8h, v10.8h, v2.8b\n"
1897 "uaddw v11.8h, v11.8h, v3.8b\n"
1898 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
1899
1900 "bne 1b\n"
1901
1902 "2:"
1903
1904 // Load Aggregate Store: 4x5.
1905 "movi v0.8b, #0\n"
1906 "movi v1.8b, #0\n"
1907 "movi v2.8b, #0\n"
1908 "movi v3.8b, #0\n"
1909 "ld1 {v0.s}[0], [%x[in]], #4\n"
1910 "ld1 {v0.b}[4], [%x[in]], #1\n"
1911 "ld1 {v1.s}[0], [x0], #4\n"
1912 "ld1 {v1.b}[4], [x0], #1\n"
1913 "ld1 {v2.s}[0], [x1], #4\n"
1914 "ld1 {v2.b}[4], [x1], #1\n"
1915 "ld1 {v3.s}[0], [x2], #4\n"
1916 "ld1 {v3.b}[4], [x2], #1\n"
1917 "uaddw v8.8h, v8.8h, v0.8b\n"
1918 "uaddw v9.8h, v9.8h, v1.8b\n"
1919 "uaddw v10.8h, v10.8h, v2.8b\n"
1920 "uaddw v11.8h, v11.8h, v3.8b\n"
1921 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
1922
1923 // Aggregator Reduction.
1924 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
1925 "dup v1.4s, %w[additive_sum_offset]\n"
1926 "uaddlp v8.4s, v8.8h\n"
1927 "uaddlp v9.4s, v9.8h\n"
1928 "uaddlp v10.4s, v10.8h\n"
1929 "uaddlp v11.4s, v11.8h\n"
1930 "addp v8.4s, v8.4s, v9.4s\n"
1931 "addp v10.4s, v10.4s, v11.4s\n"
1932 "addp v8.4s, v8.4s, v10.4s\n"
1933 "mul v8.4s, v8.4s, v0.s[0]\n"
1934 "add v8.4s, v8.4s, v1.4s\n"
1935 "st1 {v8.4s}, [%x[out]]\n"
1936 : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
1937 : [stride] "r"(params.stride),
1938 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
1939 [additive_sum_offset] "r"(params.additive_sum_offset)
1940 : "x0", "x1", "x2", "v0", "v1", "v2", "v3", "v8", "v9", "v10", "v11",
1941 "cc", "memory");
1942 }
1943
1944 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)1945 inline void Stream<uint8_t, 4, 8, 6, RowMajorWithSum>::Pack(
1946 const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
1947 #ifdef DEBUG
1948 #ifdef DEBUG_METAGEMM_VERBOSE
1949 std::cout << __FILE__ << "(" << __LINE__
1950 << ") RowMajorWithSum<uint8_t, 4, 8, 6, RowMajorWithSum>::Pack()"
1951 << std::endl
1952 << std::flush;
1953 #endif
1954 #endif
1955 int params_count_copy = params.count;
1956 asm volatile(
1957 "add x0, %x[in], %x[stride]\n"
1958 "add x1, x0, %x[stride]\n"
1959 "add x2, x1, %x[stride]\n"
1960 "movi v8.8h, #0\n"
1961 "movi v9.8h, #0\n"
1962 "movi v10.8h, #0\n"
1963 "movi v11.8h, #0\n"
1964
1965 // Reduce count by leftovers.
1966 "subs %x[count], %x[count], #6\n"
1967 "beq 2f\n"
1968
1969 "1:"
1970 "subs %x[count], %x[count], #8\n"
1971
1972 // Load Aggregate Store: 4x8.
1973 "ld1 {v0.2s}, [%x[in]], #8\n"
1974 "ld1 {v1.2s}, [x0], #8\n"
1975 "ld1 {v2.2s}, [x1], #8\n"
1976 "ld1 {v3.2s}, [x2], #8\n"
1977 "uaddw v8.8h, v8.8h, v0.8b\n"
1978 "uaddw v9.8h, v9.8h, v1.8b\n"
1979 "uaddw v10.8h, v10.8h, v2.8b\n"
1980 "uaddw v11.8h, v11.8h, v3.8b\n"
1981 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
1982
1983 "bne 1b\n"
1984
1985 "2:"
1986
1987 // Load Aggregate Store: 4x6.
1988 "movi v0.8b, #0\n"
1989 "movi v1.8b, #0\n"
1990 "movi v2.8b, #0\n"
1991 "movi v3.8b, #0\n"
1992 "ld1 {v0.s}[0], [%x[in]], #4\n"
1993 "ld1 {v0.h}[2], [%x[in]], #2\n"
1994 "ld1 {v1.s}[0], [x0], #4\n"
1995 "ld1 {v1.h}[2], [x0], #2\n"
1996 "ld1 {v2.s}[0], [x1], #4\n"
1997 "ld1 {v2.h}[2], [x1], #2\n"
1998 "ld1 {v3.s}[0], [x2], #4\n"
1999 "ld1 {v3.h}[2], [x2], #2\n"
2000 "uaddw v8.8h, v8.8h, v0.8b\n"
2001 "uaddw v9.8h, v9.8h, v1.8b\n"
2002 "uaddw v10.8h, v10.8h, v2.8b\n"
2003 "uaddw v11.8h, v11.8h, v3.8b\n"
2004 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
2005
2006 // Aggregator Reduction.
2007 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
2008 "dup v1.4s, %w[additive_sum_offset]\n"
2009 "uaddlp v8.4s, v8.8h\n"
2010 "uaddlp v9.4s, v9.8h\n"
2011 "uaddlp v10.4s, v10.8h\n"
2012 "uaddlp v11.4s, v11.8h\n"
2013 "addp v8.4s, v8.4s, v9.4s\n"
2014 "addp v10.4s, v10.4s, v11.4s\n"
2015 "addp v8.4s, v8.4s, v10.4s\n"
2016 "mul v8.4s, v8.4s, v0.s[0]\n"
2017 "add v8.4s, v8.4s, v1.4s\n"
2018 "st1 {v8.4s}, [%x[out]]\n"
2019 : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
2020 : [stride] "r"(params.stride),
2021 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
2022 [additive_sum_offset] "r"(params.additive_sum_offset)
2023 : "x0", "x1", "x2", "v0", "v1", "v2", "v3", "v8", "v9", "v10", "v11",
2024 "cc", "memory");
2025 }
2026
2027 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)2028 inline void Stream<uint8_t, 4, 8, 7, RowMajorWithSum>::Pack(
2029 const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
2030 #ifdef DEBUG
2031 #ifdef DEBUG_METAGEMM_VERBOSE
2032 std::cout << __FILE__ << "(" << __LINE__
2033 << ") RowMajorWithSum<uint8_t, 4, 8, 7, RowMajorWithSum>::Pack()"
2034 << std::endl
2035 << std::flush;
2036 #endif
2037 #endif
2038 int params_count_copy = params.count;
2039 asm volatile(
2040 "add x0, %x[in], %x[stride]\n"
2041 "add x1, x0, %x[stride]\n"
2042 "add x2, x1, %x[stride]\n"
2043 "movi v8.8h, #0\n"
2044 "movi v9.8h, #0\n"
2045 "movi v10.8h, #0\n"
2046 "movi v11.8h, #0\n"
2047
2048 // Reduce count by leftovers.
2049 "subs %x[count], %x[count], #7\n"
2050 "beq 2f\n"
2051
2052 "1:"
2053 "subs %x[count], %x[count], #8\n"
2054
2055 // Load Aggregate Store: 4x8.
2056 "ld1 {v0.2s}, [%x[in]], #8\n"
2057 "ld1 {v1.2s}, [x0], #8\n"
2058 "ld1 {v2.2s}, [x1], #8\n"
2059 "ld1 {v3.2s}, [x2], #8\n"
2060 "uaddw v8.8h, v8.8h, v0.8b\n"
2061 "uaddw v9.8h, v9.8h, v1.8b\n"
2062 "uaddw v10.8h, v10.8h, v2.8b\n"
2063 "uaddw v11.8h, v11.8h, v3.8b\n"
2064 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
2065
2066 "bne 1b\n"
2067
2068 "2:"
2069
2070 // Load Aggregate Store: 4x7.
2071 "movi v0.8b, #0\n"
2072 "movi v1.8b, #0\n"
2073 "movi v2.8b, #0\n"
2074 "movi v3.8b, #0\n"
2075 "ld1 {v0.s}[0], [%x[in]], #4\n"
2076 "ld1 {v0.h}[2], [%x[in]], #2\n"
2077 "ld1 {v0.b}[6], [%x[in]], #1\n"
2078 "ld1 {v1.s}[0], [x0], #4\n"
2079 "ld1 {v1.h}[2], [x0], #2\n"
2080 "ld1 {v1.b}[6], [x0], #1\n"
2081 "ld1 {v2.s}[0], [x1], #4\n"
2082 "ld1 {v2.h}[2], [x1], #2\n"
2083 "ld1 {v2.b}[6], [x1], #1\n"
2084 "ld1 {v3.s}[0], [x2], #4\n"
2085 "ld1 {v3.h}[2], [x2], #2\n"
2086 "ld1 {v3.b}[6], [x2], #1\n"
2087 "uaddw v8.8h, v8.8h, v0.8b\n"
2088 "uaddw v9.8h, v9.8h, v1.8b\n"
2089 "uaddw v10.8h, v10.8h, v2.8b\n"
2090 "uaddw v11.8h, v11.8h, v3.8b\n"
2091 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
2092
2093 // Aggregator Reduction.
2094 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
2095 "dup v1.4s, %w[additive_sum_offset]\n"
2096 "uaddlp v8.4s, v8.8h\n"
2097 "uaddlp v9.4s, v9.8h\n"
2098 "uaddlp v10.4s, v10.8h\n"
2099 "uaddlp v11.4s, v11.8h\n"
2100 "addp v8.4s, v8.4s, v9.4s\n"
2101 "addp v10.4s, v10.4s, v11.4s\n"
2102 "addp v8.4s, v8.4s, v10.4s\n"
2103 "mul v8.4s, v8.4s, v0.s[0]\n"
2104 "add v8.4s, v8.4s, v1.4s\n"
2105 "st1 {v8.4s}, [%x[out]]\n"
2106 : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
2107 : [stride] "r"(params.stride),
2108 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
2109 [additive_sum_offset] "r"(params.additive_sum_offset)
2110 : "x0", "x1", "x2", "v0", "v1", "v2", "v3", "v8", "v9", "v10", "v11",
2111 "cc", "memory");
2112 }
2113
2114 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)2115 inline void Stream<uint8_t, 5, 8, 0, RowMajorWithSum>::Pack(
2116 const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
2117 #ifdef DEBUG
2118 #ifdef DEBUG_METAGEMM_VERBOSE
2119 std::cout << __FILE__ << "(" << __LINE__
2120 << ") RowMajorWithSum<uint8_t, 5, 8, 0, RowMajorWithSum>::Pack()"
2121 << std::endl
2122 << std::flush;
2123 #endif
2124 #endif
2125 int params_count_copy = params.count;
2126 asm volatile(
2127 "add x0, %x[in], %x[stride]\n"
2128 "add x1, x0, %x[stride]\n"
2129 "add x2, x1, %x[stride]\n"
2130 "add x3, x2, %x[stride]\n"
2131 "movi v8.8h, #0\n"
2132 "movi v9.8h, #0\n"
2133 "movi v10.8h, #0\n"
2134 "movi v11.8h, #0\n"
2135 "movi v12.8h, #0\n"
2136
2137 "1:"
2138 "subs %x[count], %x[count], #8\n"
2139
2140 // Load Aggregate Store: 5x8.
2141 "ld1 {v0.2s}, [%x[in]], #8\n"
2142 "ld1 {v1.2s}, [x0], #8\n"
2143 "ld1 {v2.2s}, [x1], #8\n"
2144 "ld1 {v3.2s}, [x2], #8\n"
2145 "ld1 {v4.2s}, [x3], #8\n"
2146 "uaddw v8.8h, v8.8h, v0.8b\n"
2147 "uaddw v9.8h, v9.8h, v1.8b\n"
2148 "uaddw v10.8h, v10.8h, v2.8b\n"
2149 "uaddw v11.8h, v11.8h, v3.8b\n"
2150 "uaddw v12.8h, v12.8h, v4.8b\n"
2151 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
2152 "st1 {v4.2s}, [%x[out]], #8\n"
2153
2154 "bne 1b\n"
2155
2156 // Aggregator Reduction.
2157 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
2158 "dup v1.4s, %w[additive_sum_offset]\n"
2159 "uaddlp v8.4s, v8.8h\n"
2160 "uaddlp v9.4s, v9.8h\n"
2161 "uaddlp v10.4s, v10.8h\n"
2162 "uaddlp v11.4s, v11.8h\n"
2163 "uaddlp v12.4s, v12.8h\n"
2164 "addp v8.4s, v8.4s, v9.4s\n"
2165 "addp v10.4s, v10.4s, v11.4s\n"
2166 "addp v12.4s, v12.4s, v12.4s\n"
2167 "addp v8.4s, v8.4s, v10.4s\n"
2168 "addp v9.4s, v12.4s, v12.4s\n"
2169 "mul v8.4s, v8.4s, v0.s[0]\n"
2170 "mul v9.4s, v9.4s, v0.s[0]\n"
2171 "add v8.4s, v8.4s, v1.4s\n"
2172 "add v9.4s, v9.4s, v1.4s\n"
2173 "st1 {v8.4s, v9.4s}, [%x[out]]\n"
2174 : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
2175 : [stride] "r"(params.stride),
2176 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
2177 [additive_sum_offset] "r"(params.additive_sum_offset)
2178 : "x0", "x1", "x2", "x3", "v0", "v1", "v2", "v3", "v4", "v8", "v9", "v10",
2179 "v11", "v12", "cc", "memory");
2180 }
2181
2182 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)2183 inline void Stream<uint8_t, 5, 8, 1, RowMajorWithSum>::Pack(
2184 const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
2185 #ifdef DEBUG
2186 #ifdef DEBUG_METAGEMM_VERBOSE
2187 std::cout << __FILE__ << "(" << __LINE__
2188 << ") RowMajorWithSum<uint8_t, 5, 8, 1, RowMajorWithSum>::Pack()"
2189 << std::endl
2190 << std::flush;
2191 #endif
2192 #endif
2193 int params_count_copy = params.count;
2194 asm volatile(
2195 "add x0, %x[in], %x[stride]\n"
2196 "add x1, x0, %x[stride]\n"
2197 "add x2, x1, %x[stride]\n"
2198 "add x3, x2, %x[stride]\n"
2199 "movi v8.8h, #0\n"
2200 "movi v9.8h, #0\n"
2201 "movi v10.8h, #0\n"
2202 "movi v11.8h, #0\n"
2203 "movi v12.8h, #0\n"
2204
2205 // Reduce count by leftovers.
2206 "subs %x[count], %x[count], #1\n"
2207 "beq 2f\n"
2208
2209 "1:"
2210 "subs %x[count], %x[count], #8\n"
2211
2212 // Load Aggregate Store: 5x8.
2213 "ld1 {v0.2s}, [%x[in]], #8\n"
2214 "ld1 {v1.2s}, [x0], #8\n"
2215 "ld1 {v2.2s}, [x1], #8\n"
2216 "ld1 {v3.2s}, [x2], #8\n"
2217 "ld1 {v4.2s}, [x3], #8\n"
2218 "uaddw v8.8h, v8.8h, v0.8b\n"
2219 "uaddw v9.8h, v9.8h, v1.8b\n"
2220 "uaddw v10.8h, v10.8h, v2.8b\n"
2221 "uaddw v11.8h, v11.8h, v3.8b\n"
2222 "uaddw v12.8h, v12.8h, v4.8b\n"
2223 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
2224 "st1 {v4.2s}, [%x[out]], #8\n"
2225
2226 "bne 1b\n"
2227
2228 "2:"
2229
2230 // Load Aggregate Store: 5x1.
2231 "movi v0.8b, #0\n"
2232 "movi v1.8b, #0\n"
2233 "movi v2.8b, #0\n"
2234 "movi v3.8b, #0\n"
2235 "movi v4.8b, #0\n"
2236 "ld1 {v0.b}[0], [%x[in]], #1\n"
2237 "ld1 {v1.b}[0], [x0], #1\n"
2238 "ld1 {v2.b}[0], [x1], #1\n"
2239 "ld1 {v3.b}[0], [x2], #1\n"
2240 "ld1 {v4.b}[0], [x3], #1\n"
2241 "uaddw v8.8h, v8.8h, v0.8b\n"
2242 "uaddw v9.8h, v9.8h, v1.8b\n"
2243 "uaddw v10.8h, v10.8h, v2.8b\n"
2244 "uaddw v11.8h, v11.8h, v3.8b\n"
2245 "uaddw v12.8h, v12.8h, v4.8b\n"
2246 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
2247 "st1 {v4.2s}, [%x[out]], #8\n"
2248
2249 // Aggregator Reduction.
2250 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
2251 "dup v1.4s, %w[additive_sum_offset]\n"
2252 "uaddlp v8.4s, v8.8h\n"
2253 "uaddlp v9.4s, v9.8h\n"
2254 "uaddlp v10.4s, v10.8h\n"
2255 "uaddlp v11.4s, v11.8h\n"
2256 "uaddlp v12.4s, v12.8h\n"
2257 "addp v8.4s, v8.4s, v9.4s\n"
2258 "addp v10.4s, v10.4s, v11.4s\n"
2259 "addp v12.4s, v12.4s, v12.4s\n"
2260 "addp v8.4s, v8.4s, v10.4s\n"
2261 "addp v9.4s, v12.4s, v12.4s\n"
2262 "mul v8.4s, v8.4s, v0.s[0]\n"
2263 "mul v9.4s, v9.4s, v0.s[0]\n"
2264 "add v8.4s, v8.4s, v1.4s\n"
2265 "add v9.4s, v9.4s, v1.4s\n"
2266 "st1 {v8.4s, v9.4s}, [%x[out]]\n"
2267 : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
2268 : [stride] "r"(params.stride),
2269 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
2270 [additive_sum_offset] "r"(params.additive_sum_offset)
2271 : "x0", "x1", "x2", "x3", "v0", "v1", "v2", "v3", "v4", "v8", "v9", "v10",
2272 "v11", "v12", "cc", "memory");
2273 }
2274
2275 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)2276 inline void Stream<uint8_t, 5, 8, 2, RowMajorWithSum>::Pack(
2277 const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
2278 #ifdef DEBUG
2279 #ifdef DEBUG_METAGEMM_VERBOSE
2280 std::cout << __FILE__ << "(" << __LINE__
2281 << ") RowMajorWithSum<uint8_t, 5, 8, 2, RowMajorWithSum>::Pack()"
2282 << std::endl
2283 << std::flush;
2284 #endif
2285 #endif
2286 int params_count_copy = params.count;
2287 asm volatile(
2288 "add x0, %x[in], %x[stride]\n"
2289 "add x1, x0, %x[stride]\n"
2290 "add x2, x1, %x[stride]\n"
2291 "add x3, x2, %x[stride]\n"
2292 "movi v8.8h, #0\n"
2293 "movi v9.8h, #0\n"
2294 "movi v10.8h, #0\n"
2295 "movi v11.8h, #0\n"
2296 "movi v12.8h, #0\n"
2297
2298 // Reduce count by leftovers.
2299 "subs %x[count], %x[count], #2\n"
2300 "beq 2f\n"
2301
2302 "1:"
2303 "subs %x[count], %x[count], #8\n"
2304
2305 // Load Aggregate Store: 5x8.
2306 "ld1 {v0.2s}, [%x[in]], #8\n"
2307 "ld1 {v1.2s}, [x0], #8\n"
2308 "ld1 {v2.2s}, [x1], #8\n"
2309 "ld1 {v3.2s}, [x2], #8\n"
2310 "ld1 {v4.2s}, [x3], #8\n"
2311 "uaddw v8.8h, v8.8h, v0.8b\n"
2312 "uaddw v9.8h, v9.8h, v1.8b\n"
2313 "uaddw v10.8h, v10.8h, v2.8b\n"
2314 "uaddw v11.8h, v11.8h, v3.8b\n"
2315 "uaddw v12.8h, v12.8h, v4.8b\n"
2316 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
2317 "st1 {v4.2s}, [%x[out]], #8\n"
2318
2319 "bne 1b\n"
2320
2321 "2:"
2322
2323 // Load Aggregate Store: 5x2.
2324 "movi v0.8b, #0\n"
2325 "movi v1.8b, #0\n"
2326 "movi v2.8b, #0\n"
2327 "movi v3.8b, #0\n"
2328 "movi v4.8b, #0\n"
2329 "ld1 {v0.h}[0], [%x[in]], #2\n"
2330 "ld1 {v1.h}[0], [x0], #2\n"
2331 "ld1 {v2.h}[0], [x1], #2\n"
2332 "ld1 {v3.h}[0], [x2], #2\n"
2333 "ld1 {v4.h}[0], [x3], #2\n"
2334 "uaddw v8.8h, v8.8h, v0.8b\n"
2335 "uaddw v9.8h, v9.8h, v1.8b\n"
2336 "uaddw v10.8h, v10.8h, v2.8b\n"
2337 "uaddw v11.8h, v11.8h, v3.8b\n"
2338 "uaddw v12.8h, v12.8h, v4.8b\n"
2339 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
2340 "st1 {v4.2s}, [%x[out]], #8\n"
2341
2342 // Aggregator Reduction.
2343 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
2344 "dup v1.4s, %w[additive_sum_offset]\n"
2345 "uaddlp v8.4s, v8.8h\n"
2346 "uaddlp v9.4s, v9.8h\n"
2347 "uaddlp v10.4s, v10.8h\n"
2348 "uaddlp v11.4s, v11.8h\n"
2349 "uaddlp v12.4s, v12.8h\n"
2350 "addp v8.4s, v8.4s, v9.4s\n"
2351 "addp v10.4s, v10.4s, v11.4s\n"
2352 "addp v12.4s, v12.4s, v12.4s\n"
2353 "addp v8.4s, v8.4s, v10.4s\n"
2354 "addp v9.4s, v12.4s, v12.4s\n"
2355 "mul v8.4s, v8.4s, v0.s[0]\n"
2356 "mul v9.4s, v9.4s, v0.s[0]\n"
2357 "add v8.4s, v8.4s, v1.4s\n"
2358 "add v9.4s, v9.4s, v1.4s\n"
2359 "st1 {v8.4s, v9.4s}, [%x[out]]\n"
2360 : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
2361 : [stride] "r"(params.stride),
2362 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
2363 [additive_sum_offset] "r"(params.additive_sum_offset)
2364 : "x0", "x1", "x2", "x3", "v0", "v1", "v2", "v3", "v4", "v8", "v9", "v10",
2365 "v11", "v12", "cc", "memory");
2366 }
2367
2368 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)2369 inline void Stream<uint8_t, 5, 8, 3, RowMajorWithSum>::Pack(
2370 const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
2371 #ifdef DEBUG
2372 #ifdef DEBUG_METAGEMM_VERBOSE
2373 std::cout << __FILE__ << "(" << __LINE__
2374 << ") RowMajorWithSum<uint8_t, 5, 8, 3, RowMajorWithSum>::Pack()"
2375 << std::endl
2376 << std::flush;
2377 #endif
2378 #endif
2379 int params_count_copy = params.count;
2380 asm volatile(
2381 "add x0, %x[in], %x[stride]\n"
2382 "add x1, x0, %x[stride]\n"
2383 "add x2, x1, %x[stride]\n"
2384 "add x3, x2, %x[stride]\n"
2385 "movi v8.8h, #0\n"
2386 "movi v9.8h, #0\n"
2387 "movi v10.8h, #0\n"
2388 "movi v11.8h, #0\n"
2389 "movi v12.8h, #0\n"
2390
2391 // Reduce count by leftovers.
2392 "subs %x[count], %x[count], #3\n"
2393 "beq 2f\n"
2394
2395 "1:"
2396 "subs %x[count], %x[count], #8\n"
2397
2398 // Load Aggregate Store: 5x8.
2399 "ld1 {v0.2s}, [%x[in]], #8\n"
2400 "ld1 {v1.2s}, [x0], #8\n"
2401 "ld1 {v2.2s}, [x1], #8\n"
2402 "ld1 {v3.2s}, [x2], #8\n"
2403 "ld1 {v4.2s}, [x3], #8\n"
2404 "uaddw v8.8h, v8.8h, v0.8b\n"
2405 "uaddw v9.8h, v9.8h, v1.8b\n"
2406 "uaddw v10.8h, v10.8h, v2.8b\n"
2407 "uaddw v11.8h, v11.8h, v3.8b\n"
2408 "uaddw v12.8h, v12.8h, v4.8b\n"
2409 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
2410 "st1 {v4.2s}, [%x[out]], #8\n"
2411
2412 "bne 1b\n"
2413
2414 "2:"
2415
2416 // Load Aggregate Store: 5x3.
2417 "movi v0.8b, #0\n"
2418 "movi v1.8b, #0\n"
2419 "movi v2.8b, #0\n"
2420 "movi v3.8b, #0\n"
2421 "movi v4.8b, #0\n"
2422 "ld1 {v0.h}[0], [%x[in]], #2\n"
2423 "ld1 {v0.b}[2], [%x[in]], #1\n"
2424 "ld1 {v1.h}[0], [x0], #2\n"
2425 "ld1 {v1.b}[2], [x0], #1\n"
2426 "ld1 {v2.h}[0], [x1], #2\n"
2427 "ld1 {v2.b}[2], [x1], #1\n"
2428 "ld1 {v3.h}[0], [x2], #2\n"
2429 "ld1 {v3.b}[2], [x2], #1\n"
2430 "ld1 {v4.h}[0], [x3], #2\n"
2431 "ld1 {v4.b}[2], [x3], #1\n"
2432 "uaddw v8.8h, v8.8h, v0.8b\n"
2433 "uaddw v9.8h, v9.8h, v1.8b\n"
2434 "uaddw v10.8h, v10.8h, v2.8b\n"
2435 "uaddw v11.8h, v11.8h, v3.8b\n"
2436 "uaddw v12.8h, v12.8h, v4.8b\n"
2437 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
2438 "st1 {v4.2s}, [%x[out]], #8\n"
2439
2440 // Aggregator Reduction.
2441 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
2442 "dup v1.4s, %w[additive_sum_offset]\n"
2443 "uaddlp v8.4s, v8.8h\n"
2444 "uaddlp v9.4s, v9.8h\n"
2445 "uaddlp v10.4s, v10.8h\n"
2446 "uaddlp v11.4s, v11.8h\n"
2447 "uaddlp v12.4s, v12.8h\n"
2448 "addp v8.4s, v8.4s, v9.4s\n"
2449 "addp v10.4s, v10.4s, v11.4s\n"
2450 "addp v12.4s, v12.4s, v12.4s\n"
2451 "addp v8.4s, v8.4s, v10.4s\n"
2452 "addp v9.4s, v12.4s, v12.4s\n"
2453 "mul v8.4s, v8.4s, v0.s[0]\n"
2454 "mul v9.4s, v9.4s, v0.s[0]\n"
2455 "add v8.4s, v8.4s, v1.4s\n"
2456 "add v9.4s, v9.4s, v1.4s\n"
2457 "st1 {v8.4s, v9.4s}, [%x[out]]\n"
2458 : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
2459 : [stride] "r"(params.stride),
2460 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
2461 [additive_sum_offset] "r"(params.additive_sum_offset)
2462 : "x0", "x1", "x2", "x3", "v0", "v1", "v2", "v3", "v4", "v8", "v9", "v10",
2463 "v11", "v12", "cc", "memory");
2464 }
2465
2466 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)2467 inline void Stream<uint8_t, 5, 8, 4, RowMajorWithSum>::Pack(
2468 const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
2469 #ifdef DEBUG
2470 #ifdef DEBUG_METAGEMM_VERBOSE
2471 std::cout << __FILE__ << "(" << __LINE__
2472 << ") RowMajorWithSum<uint8_t, 5, 8, 4, RowMajorWithSum>::Pack()"
2473 << std::endl
2474 << std::flush;
2475 #endif
2476 #endif
2477 int params_count_copy = params.count;
2478 asm volatile(
2479 "add x0, %x[in], %x[stride]\n"
2480 "add x1, x0, %x[stride]\n"
2481 "add x2, x1, %x[stride]\n"
2482 "add x3, x2, %x[stride]\n"
2483 "movi v8.8h, #0\n"
2484 "movi v9.8h, #0\n"
2485 "movi v10.8h, #0\n"
2486 "movi v11.8h, #0\n"
2487 "movi v12.8h, #0\n"
2488
2489 // Reduce count by leftovers.
2490 "subs %x[count], %x[count], #4\n"
2491 "beq 2f\n"
2492
2493 "1:"
2494 "subs %x[count], %x[count], #8\n"
2495
2496 // Load Aggregate Store: 5x8.
2497 "ld1 {v0.2s}, [%x[in]], #8\n"
2498 "ld1 {v1.2s}, [x0], #8\n"
2499 "ld1 {v2.2s}, [x1], #8\n"
2500 "ld1 {v3.2s}, [x2], #8\n"
2501 "ld1 {v4.2s}, [x3], #8\n"
2502 "uaddw v8.8h, v8.8h, v0.8b\n"
2503 "uaddw v9.8h, v9.8h, v1.8b\n"
2504 "uaddw v10.8h, v10.8h, v2.8b\n"
2505 "uaddw v11.8h, v11.8h, v3.8b\n"
2506 "uaddw v12.8h, v12.8h, v4.8b\n"
2507 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
2508 "st1 {v4.2s}, [%x[out]], #8\n"
2509
2510 "bne 1b\n"
2511
2512 "2:"
2513
2514 // Load Aggregate Store: 5x4.
2515 "movi v0.8b, #0\n"
2516 "movi v1.8b, #0\n"
2517 "movi v2.8b, #0\n"
2518 "movi v3.8b, #0\n"
2519 "movi v4.8b, #0\n"
2520 "ld1 {v0.s}[0], [%x[in]], #4\n"
2521 "ld1 {v1.s}[0], [x0], #4\n"
2522 "ld1 {v2.s}[0], [x1], #4\n"
2523 "ld1 {v3.s}[0], [x2], #4\n"
2524 "ld1 {v4.s}[0], [x3], #4\n"
2525 "uaddw v8.8h, v8.8h, v0.8b\n"
2526 "uaddw v9.8h, v9.8h, v1.8b\n"
2527 "uaddw v10.8h, v10.8h, v2.8b\n"
2528 "uaddw v11.8h, v11.8h, v3.8b\n"
2529 "uaddw v12.8h, v12.8h, v4.8b\n"
2530 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
2531 "st1 {v4.2s}, [%x[out]], #8\n"
2532
2533 // Aggregator Reduction.
2534 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
2535 "dup v1.4s, %w[additive_sum_offset]\n"
2536 "uaddlp v8.4s, v8.8h\n"
2537 "uaddlp v9.4s, v9.8h\n"
2538 "uaddlp v10.4s, v10.8h\n"
2539 "uaddlp v11.4s, v11.8h\n"
2540 "uaddlp v12.4s, v12.8h\n"
2541 "addp v8.4s, v8.4s, v9.4s\n"
2542 "addp v10.4s, v10.4s, v11.4s\n"
2543 "addp v12.4s, v12.4s, v12.4s\n"
2544 "addp v8.4s, v8.4s, v10.4s\n"
2545 "addp v9.4s, v12.4s, v12.4s\n"
2546 "mul v8.4s, v8.4s, v0.s[0]\n"
2547 "mul v9.4s, v9.4s, v0.s[0]\n"
2548 "add v8.4s, v8.4s, v1.4s\n"
2549 "add v9.4s, v9.4s, v1.4s\n"
2550 "st1 {v8.4s, v9.4s}, [%x[out]]\n"
2551 : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
2552 : [stride] "r"(params.stride),
2553 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
2554 [additive_sum_offset] "r"(params.additive_sum_offset)
2555 : "x0", "x1", "x2", "x3", "v0", "v1", "v2", "v3", "v4", "v8", "v9", "v10",
2556 "v11", "v12", "cc", "memory");
2557 }
2558
2559 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)2560 inline void Stream<uint8_t, 5, 8, 5, RowMajorWithSum>::Pack(
2561 const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
2562 #ifdef DEBUG
2563 #ifdef DEBUG_METAGEMM_VERBOSE
2564 std::cout << __FILE__ << "(" << __LINE__
2565 << ") RowMajorWithSum<uint8_t, 5, 8, 5, RowMajorWithSum>::Pack()"
2566 << std::endl
2567 << std::flush;
2568 #endif
2569 #endif
2570 int params_count_copy = params.count;
2571 asm volatile(
2572 "add x0, %x[in], %x[stride]\n"
2573 "add x1, x0, %x[stride]\n"
2574 "add x2, x1, %x[stride]\n"
2575 "add x3, x2, %x[stride]\n"
2576 "movi v8.8h, #0\n"
2577 "movi v9.8h, #0\n"
2578 "movi v10.8h, #0\n"
2579 "movi v11.8h, #0\n"
2580 "movi v12.8h, #0\n"
2581
2582 // Reduce count by leftovers.
2583 "subs %x[count], %x[count], #5\n"
2584 "beq 2f\n"
2585
2586 "1:"
2587 "subs %x[count], %x[count], #8\n"
2588
2589 // Load Aggregate Store: 5x8.
2590 "ld1 {v0.2s}, [%x[in]], #8\n"
2591 "ld1 {v1.2s}, [x0], #8\n"
2592 "ld1 {v2.2s}, [x1], #8\n"
2593 "ld1 {v3.2s}, [x2], #8\n"
2594 "ld1 {v4.2s}, [x3], #8\n"
2595 "uaddw v8.8h, v8.8h, v0.8b\n"
2596 "uaddw v9.8h, v9.8h, v1.8b\n"
2597 "uaddw v10.8h, v10.8h, v2.8b\n"
2598 "uaddw v11.8h, v11.8h, v3.8b\n"
2599 "uaddw v12.8h, v12.8h, v4.8b\n"
2600 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
2601 "st1 {v4.2s}, [%x[out]], #8\n"
2602
2603 "bne 1b\n"
2604
2605 "2:"
2606
2607 // Load Aggregate Store: 5x5.
2608 "movi v0.8b, #0\n"
2609 "movi v1.8b, #0\n"
2610 "movi v2.8b, #0\n"
2611 "movi v3.8b, #0\n"
2612 "movi v4.8b, #0\n"
2613 "ld1 {v0.s}[0], [%x[in]], #4\n"
2614 "ld1 {v0.b}[4], [%x[in]], #1\n"
2615 "ld1 {v1.s}[0], [x0], #4\n"
2616 "ld1 {v1.b}[4], [x0], #1\n"
2617 "ld1 {v2.s}[0], [x1], #4\n"
2618 "ld1 {v2.b}[4], [x1], #1\n"
2619 "ld1 {v3.s}[0], [x2], #4\n"
2620 "ld1 {v3.b}[4], [x2], #1\n"
2621 "ld1 {v4.s}[0], [x3], #4\n"
2622 "ld1 {v4.b}[4], [x3], #1\n"
2623 "uaddw v8.8h, v8.8h, v0.8b\n"
2624 "uaddw v9.8h, v9.8h, v1.8b\n"
2625 "uaddw v10.8h, v10.8h, v2.8b\n"
2626 "uaddw v11.8h, v11.8h, v3.8b\n"
2627 "uaddw v12.8h, v12.8h, v4.8b\n"
2628 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
2629 "st1 {v4.2s}, [%x[out]], #8\n"
2630
2631 // Aggregator Reduction.
2632 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
2633 "dup v1.4s, %w[additive_sum_offset]\n"
2634 "uaddlp v8.4s, v8.8h\n"
2635 "uaddlp v9.4s, v9.8h\n"
2636 "uaddlp v10.4s, v10.8h\n"
2637 "uaddlp v11.4s, v11.8h\n"
2638 "uaddlp v12.4s, v12.8h\n"
2639 "addp v8.4s, v8.4s, v9.4s\n"
2640 "addp v10.4s, v10.4s, v11.4s\n"
2641 "addp v12.4s, v12.4s, v12.4s\n"
2642 "addp v8.4s, v8.4s, v10.4s\n"
2643 "addp v9.4s, v12.4s, v12.4s\n"
2644 "mul v8.4s, v8.4s, v0.s[0]\n"
2645 "mul v9.4s, v9.4s, v0.s[0]\n"
2646 "add v8.4s, v8.4s, v1.4s\n"
2647 "add v9.4s, v9.4s, v1.4s\n"
2648 "st1 {v8.4s, v9.4s}, [%x[out]]\n"
2649 : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
2650 : [stride] "r"(params.stride),
2651 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
2652 [additive_sum_offset] "r"(params.additive_sum_offset)
2653 : "x0", "x1", "x2", "x3", "v0", "v1", "v2", "v3", "v4", "v8", "v9", "v10",
2654 "v11", "v12", "cc", "memory");
2655 }
2656
2657 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)2658 inline void Stream<uint8_t, 5, 8, 6, RowMajorWithSum>::Pack(
2659 const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
2660 #ifdef DEBUG
2661 #ifdef DEBUG_METAGEMM_VERBOSE
2662 std::cout << __FILE__ << "(" << __LINE__
2663 << ") RowMajorWithSum<uint8_t, 5, 8, 6, RowMajorWithSum>::Pack()"
2664 << std::endl
2665 << std::flush;
2666 #endif
2667 #endif
2668 int params_count_copy = params.count;
2669 asm volatile(
2670 "add x0, %x[in], %x[stride]\n"
2671 "add x1, x0, %x[stride]\n"
2672 "add x2, x1, %x[stride]\n"
2673 "add x3, x2, %x[stride]\n"
2674 "movi v8.8h, #0\n"
2675 "movi v9.8h, #0\n"
2676 "movi v10.8h, #0\n"
2677 "movi v11.8h, #0\n"
2678 "movi v12.8h, #0\n"
2679
2680 // Reduce count by leftovers.
2681 "subs %x[count], %x[count], #6\n"
2682 "beq 2f\n"
2683
2684 "1:"
2685 "subs %x[count], %x[count], #8\n"
2686
2687 // Load Aggregate Store: 5x8.
2688 "ld1 {v0.2s}, [%x[in]], #8\n"
2689 "ld1 {v1.2s}, [x0], #8\n"
2690 "ld1 {v2.2s}, [x1], #8\n"
2691 "ld1 {v3.2s}, [x2], #8\n"
2692 "ld1 {v4.2s}, [x3], #8\n"
2693 "uaddw v8.8h, v8.8h, v0.8b\n"
2694 "uaddw v9.8h, v9.8h, v1.8b\n"
2695 "uaddw v10.8h, v10.8h, v2.8b\n"
2696 "uaddw v11.8h, v11.8h, v3.8b\n"
2697 "uaddw v12.8h, v12.8h, v4.8b\n"
2698 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
2699 "st1 {v4.2s}, [%x[out]], #8\n"
2700
2701 "bne 1b\n"
2702
2703 "2:"
2704
2705 // Load Aggregate Store: 5x6.
2706 "movi v0.8b, #0\n"
2707 "movi v1.8b, #0\n"
2708 "movi v2.8b, #0\n"
2709 "movi v3.8b, #0\n"
2710 "movi v4.8b, #0\n"
2711 "ld1 {v0.s}[0], [%x[in]], #4\n"
2712 "ld1 {v0.h}[2], [%x[in]], #2\n"
2713 "ld1 {v1.s}[0], [x0], #4\n"
2714 "ld1 {v1.h}[2], [x0], #2\n"
2715 "ld1 {v2.s}[0], [x1], #4\n"
2716 "ld1 {v2.h}[2], [x1], #2\n"
2717 "ld1 {v3.s}[0], [x2], #4\n"
2718 "ld1 {v3.h}[2], [x2], #2\n"
2719 "ld1 {v4.s}[0], [x3], #4\n"
2720 "ld1 {v4.h}[2], [x3], #2\n"
2721 "uaddw v8.8h, v8.8h, v0.8b\n"
2722 "uaddw v9.8h, v9.8h, v1.8b\n"
2723 "uaddw v10.8h, v10.8h, v2.8b\n"
2724 "uaddw v11.8h, v11.8h, v3.8b\n"
2725 "uaddw v12.8h, v12.8h, v4.8b\n"
2726 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
2727 "st1 {v4.2s}, [%x[out]], #8\n"
2728
2729 // Aggregator Reduction.
2730 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
2731 "dup v1.4s, %w[additive_sum_offset]\n"
2732 "uaddlp v8.4s, v8.8h\n"
2733 "uaddlp v9.4s, v9.8h\n"
2734 "uaddlp v10.4s, v10.8h\n"
2735 "uaddlp v11.4s, v11.8h\n"
2736 "uaddlp v12.4s, v12.8h\n"
2737 "addp v8.4s, v8.4s, v9.4s\n"
2738 "addp v10.4s, v10.4s, v11.4s\n"
2739 "addp v12.4s, v12.4s, v12.4s\n"
2740 "addp v8.4s, v8.4s, v10.4s\n"
2741 "addp v9.4s, v12.4s, v12.4s\n"
2742 "mul v8.4s, v8.4s, v0.s[0]\n"
2743 "mul v9.4s, v9.4s, v0.s[0]\n"
2744 "add v8.4s, v8.4s, v1.4s\n"
2745 "add v9.4s, v9.4s, v1.4s\n"
2746 "st1 {v8.4s, v9.4s}, [%x[out]]\n"
2747 : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
2748 : [stride] "r"(params.stride),
2749 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
2750 [additive_sum_offset] "r"(params.additive_sum_offset)
2751 : "x0", "x1", "x2", "x3", "v0", "v1", "v2", "v3", "v4", "v8", "v9", "v10",
2752 "v11", "v12", "cc", "memory");
2753 }
2754
2755 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)2756 inline void Stream<uint8_t, 5, 8, 7, RowMajorWithSum>::Pack(
2757 const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
2758 #ifdef DEBUG
2759 #ifdef DEBUG_METAGEMM_VERBOSE
2760 std::cout << __FILE__ << "(" << __LINE__
2761 << ") RowMajorWithSum<uint8_t, 5, 8, 7, RowMajorWithSum>::Pack()"
2762 << std::endl
2763 << std::flush;
2764 #endif
2765 #endif
2766 int params_count_copy = params.count;
2767 asm volatile(
2768 "add x0, %x[in], %x[stride]\n"
2769 "add x1, x0, %x[stride]\n"
2770 "add x2, x1, %x[stride]\n"
2771 "add x3, x2, %x[stride]\n"
2772 "movi v8.8h, #0\n"
2773 "movi v9.8h, #0\n"
2774 "movi v10.8h, #0\n"
2775 "movi v11.8h, #0\n"
2776 "movi v12.8h, #0\n"
2777
2778 // Reduce count by leftovers.
2779 "subs %x[count], %x[count], #7\n"
2780 "beq 2f\n"
2781
2782 "1:"
2783 "subs %x[count], %x[count], #8\n"
2784
2785 // Load Aggregate Store: 5x8.
2786 "ld1 {v0.2s}, [%x[in]], #8\n"
2787 "ld1 {v1.2s}, [x0], #8\n"
2788 "ld1 {v2.2s}, [x1], #8\n"
2789 "ld1 {v3.2s}, [x2], #8\n"
2790 "ld1 {v4.2s}, [x3], #8\n"
2791 "uaddw v8.8h, v8.8h, v0.8b\n"
2792 "uaddw v9.8h, v9.8h, v1.8b\n"
2793 "uaddw v10.8h, v10.8h, v2.8b\n"
2794 "uaddw v11.8h, v11.8h, v3.8b\n"
2795 "uaddw v12.8h, v12.8h, v4.8b\n"
2796 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
2797 "st1 {v4.2s}, [%x[out]], #8\n"
2798
2799 "bne 1b\n"
2800
2801 "2:"
2802
2803 // Load Aggregate Store: 5x7.
2804 "movi v0.8b, #0\n"
2805 "movi v1.8b, #0\n"
2806 "movi v2.8b, #0\n"
2807 "movi v3.8b, #0\n"
2808 "movi v4.8b, #0\n"
2809 "ld1 {v0.s}[0], [%x[in]], #4\n"
2810 "ld1 {v0.h}[2], [%x[in]], #2\n"
2811 "ld1 {v0.b}[6], [%x[in]], #1\n"
2812 "ld1 {v1.s}[0], [x0], #4\n"
2813 "ld1 {v1.h}[2], [x0], #2\n"
2814 "ld1 {v1.b}[6], [x0], #1\n"
2815 "ld1 {v2.s}[0], [x1], #4\n"
2816 "ld1 {v2.h}[2], [x1], #2\n"
2817 "ld1 {v2.b}[6], [x1], #1\n"
2818 "ld1 {v3.s}[0], [x2], #4\n"
2819 "ld1 {v3.h}[2], [x2], #2\n"
2820 "ld1 {v3.b}[6], [x2], #1\n"
2821 "ld1 {v4.s}[0], [x3], #4\n"
2822 "ld1 {v4.h}[2], [x3], #2\n"
2823 "ld1 {v4.b}[6], [x3], #1\n"
2824 "uaddw v8.8h, v8.8h, v0.8b\n"
2825 "uaddw v9.8h, v9.8h, v1.8b\n"
2826 "uaddw v10.8h, v10.8h, v2.8b\n"
2827 "uaddw v11.8h, v11.8h, v3.8b\n"
2828 "uaddw v12.8h, v12.8h, v4.8b\n"
2829 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
2830 "st1 {v4.2s}, [%x[out]], #8\n"
2831
2832 // Aggregator Reduction.
2833 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
2834 "dup v1.4s, %w[additive_sum_offset]\n"
2835 "uaddlp v8.4s, v8.8h\n"
2836 "uaddlp v9.4s, v9.8h\n"
2837 "uaddlp v10.4s, v10.8h\n"
2838 "uaddlp v11.4s, v11.8h\n"
2839 "uaddlp v12.4s, v12.8h\n"
2840 "addp v8.4s, v8.4s, v9.4s\n"
2841 "addp v10.4s, v10.4s, v11.4s\n"
2842 "addp v12.4s, v12.4s, v12.4s\n"
2843 "addp v8.4s, v8.4s, v10.4s\n"
2844 "addp v9.4s, v12.4s, v12.4s\n"
2845 "mul v8.4s, v8.4s, v0.s[0]\n"
2846 "mul v9.4s, v9.4s, v0.s[0]\n"
2847 "add v8.4s, v8.4s, v1.4s\n"
2848 "add v9.4s, v9.4s, v1.4s\n"
2849 "st1 {v8.4s, v9.4s}, [%x[out]]\n"
2850 : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
2851 : [stride] "r"(params.stride),
2852 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
2853 [additive_sum_offset] "r"(params.additive_sum_offset)
2854 : "x0", "x1", "x2", "x3", "v0", "v1", "v2", "v3", "v4", "v8", "v9", "v10",
2855 "v11", "v12", "cc", "memory");
2856 }
2857
2858 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)2859 inline void Stream<uint8_t, 6, 8, 0, RowMajorWithSum>::Pack(
2860 const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
2861 #ifdef DEBUG
2862 #ifdef DEBUG_METAGEMM_VERBOSE
2863 std::cout << __FILE__ << "(" << __LINE__
2864 << ") RowMajorWithSum<uint8_t, 6, 8, 0, RowMajorWithSum>::Pack()"
2865 << std::endl
2866 << std::flush;
2867 #endif
2868 #endif
2869 int params_count_copy = params.count;
2870 asm volatile(
2871 "add x0, %x[in], %x[stride]\n"
2872 "add x1, x0, %x[stride]\n"
2873 "add x2, x1, %x[stride]\n"
2874 "add x3, x2, %x[stride]\n"
2875 "add x4, x3, %x[stride]\n"
2876 "movi v8.8h, #0\n"
2877 "movi v9.8h, #0\n"
2878 "movi v10.8h, #0\n"
2879 "movi v11.8h, #0\n"
2880 "movi v12.8h, #0\n"
2881 "movi v13.8h, #0\n"
2882
2883 "1:"
2884 "subs %x[count], %x[count], #8\n"
2885
2886 // Load Aggregate Store: 6x8.
2887 "ld1 {v0.2s}, [%x[in]], #8\n"
2888 "ld1 {v1.2s}, [x0], #8\n"
2889 "ld1 {v2.2s}, [x1], #8\n"
2890 "ld1 {v3.2s}, [x2], #8\n"
2891 "ld1 {v4.2s}, [x3], #8\n"
2892 "ld1 {v5.2s}, [x4], #8\n"
2893 "uaddw v8.8h, v8.8h, v0.8b\n"
2894 "uaddw v9.8h, v9.8h, v1.8b\n"
2895 "uaddw v10.8h, v10.8h, v2.8b\n"
2896 "uaddw v11.8h, v11.8h, v3.8b\n"
2897 "uaddw v12.8h, v12.8h, v4.8b\n"
2898 "uaddw v13.8h, v13.8h, v5.8b\n"
2899 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
2900 "st1 {v4.2s, v5.2s}, [%x[out]], #16\n"
2901
2902 "bne 1b\n"
2903
2904 // Aggregator Reduction.
2905 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
2906 "dup v1.4s, %w[additive_sum_offset]\n"
2907 "uaddlp v8.4s, v8.8h\n"
2908 "uaddlp v9.4s, v9.8h\n"
2909 "uaddlp v10.4s, v10.8h\n"
2910 "uaddlp v11.4s, v11.8h\n"
2911 "uaddlp v12.4s, v12.8h\n"
2912 "uaddlp v13.4s, v13.8h\n"
2913 "addp v8.4s, v8.4s, v9.4s\n"
2914 "addp v10.4s, v10.4s, v11.4s\n"
2915 "addp v12.4s, v12.4s, v13.4s\n"
2916 "addp v8.4s, v8.4s, v10.4s\n"
2917 "addp v9.4s, v12.4s, v12.4s\n"
2918 "mul v8.4s, v8.4s, v0.s[0]\n"
2919 "mul v9.4s, v9.4s, v0.s[0]\n"
2920 "add v8.4s, v8.4s, v1.4s\n"
2921 "add v9.4s, v9.4s, v1.4s\n"
2922 "st1 {v8.4s, v9.4s}, [%x[out]]\n"
2923 : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
2924 : [stride] "r"(params.stride),
2925 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
2926 [additive_sum_offset] "r"(params.additive_sum_offset)
2927 : "x0", "x1", "x2", "x3", "x4", "v0", "v1", "v2", "v3", "v4", "v5", "v8",
2928 "v9", "v10", "v11", "v12", "v13", "cc", "memory");
2929 }
2930
2931 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)2932 inline void Stream<uint8_t, 6, 8, 1, RowMajorWithSum>::Pack(
2933 const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
2934 #ifdef DEBUG
2935 #ifdef DEBUG_METAGEMM_VERBOSE
2936 std::cout << __FILE__ << "(" << __LINE__
2937 << ") RowMajorWithSum<uint8_t, 6, 8, 1, RowMajorWithSum>::Pack()"
2938 << std::endl
2939 << std::flush;
2940 #endif
2941 #endif
2942 int params_count_copy = params.count;
2943 asm volatile(
2944 "add x0, %x[in], %x[stride]\n"
2945 "add x1, x0, %x[stride]\n"
2946 "add x2, x1, %x[stride]\n"
2947 "add x3, x2, %x[stride]\n"
2948 "add x4, x3, %x[stride]\n"
2949 "movi v8.8h, #0\n"
2950 "movi v9.8h, #0\n"
2951 "movi v10.8h, #0\n"
2952 "movi v11.8h, #0\n"
2953 "movi v12.8h, #0\n"
2954 "movi v13.8h, #0\n"
2955
2956 // Reduce count by leftovers.
2957 "subs %x[count], %x[count], #1\n"
2958 "beq 2f\n"
2959
2960 "1:"
2961 "subs %x[count], %x[count], #8\n"
2962
2963 // Load Aggregate Store: 6x8.
2964 "ld1 {v0.2s}, [%x[in]], #8\n"
2965 "ld1 {v1.2s}, [x0], #8\n"
2966 "ld1 {v2.2s}, [x1], #8\n"
2967 "ld1 {v3.2s}, [x2], #8\n"
2968 "ld1 {v4.2s}, [x3], #8\n"
2969 "ld1 {v5.2s}, [x4], #8\n"
2970 "uaddw v8.8h, v8.8h, v0.8b\n"
2971 "uaddw v9.8h, v9.8h, v1.8b\n"
2972 "uaddw v10.8h, v10.8h, v2.8b\n"
2973 "uaddw v11.8h, v11.8h, v3.8b\n"
2974 "uaddw v12.8h, v12.8h, v4.8b\n"
2975 "uaddw v13.8h, v13.8h, v5.8b\n"
2976 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
2977 "st1 {v4.2s, v5.2s}, [%x[out]], #16\n"
2978
2979 "bne 1b\n"
2980
2981 "2:"
2982
2983 // Load Aggregate Store: 6x1.
2984 "movi v0.8b, #0\n"
2985 "movi v1.8b, #0\n"
2986 "movi v2.8b, #0\n"
2987 "movi v3.8b, #0\n"
2988 "movi v4.8b, #0\n"
2989 "movi v5.8b, #0\n"
2990 "ld1 {v0.b}[0], [%x[in]], #1\n"
2991 "ld1 {v1.b}[0], [x0], #1\n"
2992 "ld1 {v2.b}[0], [x1], #1\n"
2993 "ld1 {v3.b}[0], [x2], #1\n"
2994 "ld1 {v4.b}[0], [x3], #1\n"
2995 "ld1 {v5.b}[0], [x4], #1\n"
2996 "uaddw v8.8h, v8.8h, v0.8b\n"
2997 "uaddw v9.8h, v9.8h, v1.8b\n"
2998 "uaddw v10.8h, v10.8h, v2.8b\n"
2999 "uaddw v11.8h, v11.8h, v3.8b\n"
3000 "uaddw v12.8h, v12.8h, v4.8b\n"
3001 "uaddw v13.8h, v13.8h, v5.8b\n"
3002 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
3003 "st1 {v4.2s, v5.2s}, [%x[out]], #16\n"
3004
3005 // Aggregator Reduction.
3006 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
3007 "dup v1.4s, %w[additive_sum_offset]\n"
3008 "uaddlp v8.4s, v8.8h\n"
3009 "uaddlp v9.4s, v9.8h\n"
3010 "uaddlp v10.4s, v10.8h\n"
3011 "uaddlp v11.4s, v11.8h\n"
3012 "uaddlp v12.4s, v12.8h\n"
3013 "uaddlp v13.4s, v13.8h\n"
3014 "addp v8.4s, v8.4s, v9.4s\n"
3015 "addp v10.4s, v10.4s, v11.4s\n"
3016 "addp v12.4s, v12.4s, v13.4s\n"
3017 "addp v8.4s, v8.4s, v10.4s\n"
3018 "addp v9.4s, v12.4s, v12.4s\n"
3019 "mul v8.4s, v8.4s, v0.s[0]\n"
3020 "mul v9.4s, v9.4s, v0.s[0]\n"
3021 "add v8.4s, v8.4s, v1.4s\n"
3022 "add v9.4s, v9.4s, v1.4s\n"
3023 "st1 {v8.4s, v9.4s}, [%x[out]]\n"
3024 : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
3025 : [stride] "r"(params.stride),
3026 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
3027 [additive_sum_offset] "r"(params.additive_sum_offset)
3028 : "x0", "x1", "x2", "x3", "x4", "v0", "v1", "v2", "v3", "v4", "v5", "v8",
3029 "v9", "v10", "v11", "v12", "v13", "cc", "memory");
3030 }
3031
3032 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)3033 inline void Stream<uint8_t, 6, 8, 2, RowMajorWithSum>::Pack(
3034 const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
3035 #ifdef DEBUG
3036 #ifdef DEBUG_METAGEMM_VERBOSE
3037 std::cout << __FILE__ << "(" << __LINE__
3038 << ") RowMajorWithSum<uint8_t, 6, 8, 2, RowMajorWithSum>::Pack()"
3039 << std::endl
3040 << std::flush;
3041 #endif
3042 #endif
3043 int params_count_copy = params.count;
3044 asm volatile(
3045 "add x0, %x[in], %x[stride]\n"
3046 "add x1, x0, %x[stride]\n"
3047 "add x2, x1, %x[stride]\n"
3048 "add x3, x2, %x[stride]\n"
3049 "add x4, x3, %x[stride]\n"
3050 "movi v8.8h, #0\n"
3051 "movi v9.8h, #0\n"
3052 "movi v10.8h, #0\n"
3053 "movi v11.8h, #0\n"
3054 "movi v12.8h, #0\n"
3055 "movi v13.8h, #0\n"
3056
3057 // Reduce count by leftovers.
3058 "subs %x[count], %x[count], #2\n"
3059 "beq 2f\n"
3060
3061 "1:"
3062 "subs %x[count], %x[count], #8\n"
3063
3064 // Load Aggregate Store: 6x8.
3065 "ld1 {v0.2s}, [%x[in]], #8\n"
3066 "ld1 {v1.2s}, [x0], #8\n"
3067 "ld1 {v2.2s}, [x1], #8\n"
3068 "ld1 {v3.2s}, [x2], #8\n"
3069 "ld1 {v4.2s}, [x3], #8\n"
3070 "ld1 {v5.2s}, [x4], #8\n"
3071 "uaddw v8.8h, v8.8h, v0.8b\n"
3072 "uaddw v9.8h, v9.8h, v1.8b\n"
3073 "uaddw v10.8h, v10.8h, v2.8b\n"
3074 "uaddw v11.8h, v11.8h, v3.8b\n"
3075 "uaddw v12.8h, v12.8h, v4.8b\n"
3076 "uaddw v13.8h, v13.8h, v5.8b\n"
3077 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
3078 "st1 {v4.2s, v5.2s}, [%x[out]], #16\n"
3079
3080 "bne 1b\n"
3081
3082 "2:"
3083
3084 // Load Aggregate Store: 6x2.
3085 "movi v0.8b, #0\n"
3086 "movi v1.8b, #0\n"
3087 "movi v2.8b, #0\n"
3088 "movi v3.8b, #0\n"
3089 "movi v4.8b, #0\n"
3090 "movi v5.8b, #0\n"
3091 "ld1 {v0.h}[0], [%x[in]], #2\n"
3092 "ld1 {v1.h}[0], [x0], #2\n"
3093 "ld1 {v2.h}[0], [x1], #2\n"
3094 "ld1 {v3.h}[0], [x2], #2\n"
3095 "ld1 {v4.h}[0], [x3], #2\n"
3096 "ld1 {v5.h}[0], [x4], #2\n"
3097 "uaddw v8.8h, v8.8h, v0.8b\n"
3098 "uaddw v9.8h, v9.8h, v1.8b\n"
3099 "uaddw v10.8h, v10.8h, v2.8b\n"
3100 "uaddw v11.8h, v11.8h, v3.8b\n"
3101 "uaddw v12.8h, v12.8h, v4.8b\n"
3102 "uaddw v13.8h, v13.8h, v5.8b\n"
3103 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
3104 "st1 {v4.2s, v5.2s}, [%x[out]], #16\n"
3105
3106 // Aggregator Reduction.
3107 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
3108 "dup v1.4s, %w[additive_sum_offset]\n"
3109 "uaddlp v8.4s, v8.8h\n"
3110 "uaddlp v9.4s, v9.8h\n"
3111 "uaddlp v10.4s, v10.8h\n"
3112 "uaddlp v11.4s, v11.8h\n"
3113 "uaddlp v12.4s, v12.8h\n"
3114 "uaddlp v13.4s, v13.8h\n"
3115 "addp v8.4s, v8.4s, v9.4s\n"
3116 "addp v10.4s, v10.4s, v11.4s\n"
3117 "addp v12.4s, v12.4s, v13.4s\n"
3118 "addp v8.4s, v8.4s, v10.4s\n"
3119 "addp v9.4s, v12.4s, v12.4s\n"
3120 "mul v8.4s, v8.4s, v0.s[0]\n"
3121 "mul v9.4s, v9.4s, v0.s[0]\n"
3122 "add v8.4s, v8.4s, v1.4s\n"
3123 "add v9.4s, v9.4s, v1.4s\n"
3124 "st1 {v8.4s, v9.4s}, [%x[out]]\n"
3125 : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
3126 : [stride] "r"(params.stride),
3127 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
3128 [additive_sum_offset] "r"(params.additive_sum_offset)
3129 : "x0", "x1", "x2", "x3", "x4", "v0", "v1", "v2", "v3", "v4", "v5", "v8",
3130 "v9", "v10", "v11", "v12", "v13", "cc", "memory");
3131 }
3132
3133 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)3134 inline void Stream<uint8_t, 6, 8, 3, RowMajorWithSum>::Pack(
3135 const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
3136 #ifdef DEBUG
3137 #ifdef DEBUG_METAGEMM_VERBOSE
3138 std::cout << __FILE__ << "(" << __LINE__
3139 << ") RowMajorWithSum<uint8_t, 6, 8, 3, RowMajorWithSum>::Pack()"
3140 << std::endl
3141 << std::flush;
3142 #endif
3143 #endif
3144 int params_count_copy = params.count;
3145 asm volatile(
3146 "add x0, %x[in], %x[stride]\n"
3147 "add x1, x0, %x[stride]\n"
3148 "add x2, x1, %x[stride]\n"
3149 "add x3, x2, %x[stride]\n"
3150 "add x4, x3, %x[stride]\n"
3151 "movi v8.8h, #0\n"
3152 "movi v9.8h, #0\n"
3153 "movi v10.8h, #0\n"
3154 "movi v11.8h, #0\n"
3155 "movi v12.8h, #0\n"
3156 "movi v13.8h, #0\n"
3157
3158 // Reduce count by leftovers.
3159 "subs %x[count], %x[count], #3\n"
3160 "beq 2f\n"
3161
3162 "1:"
3163 "subs %x[count], %x[count], #8\n"
3164
3165 // Load Aggregate Store: 6x8.
3166 "ld1 {v0.2s}, [%x[in]], #8\n"
3167 "ld1 {v1.2s}, [x0], #8\n"
3168 "ld1 {v2.2s}, [x1], #8\n"
3169 "ld1 {v3.2s}, [x2], #8\n"
3170 "ld1 {v4.2s}, [x3], #8\n"
3171 "ld1 {v5.2s}, [x4], #8\n"
3172 "uaddw v8.8h, v8.8h, v0.8b\n"
3173 "uaddw v9.8h, v9.8h, v1.8b\n"
3174 "uaddw v10.8h, v10.8h, v2.8b\n"
3175 "uaddw v11.8h, v11.8h, v3.8b\n"
3176 "uaddw v12.8h, v12.8h, v4.8b\n"
3177 "uaddw v13.8h, v13.8h, v5.8b\n"
3178 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
3179 "st1 {v4.2s, v5.2s}, [%x[out]], #16\n"
3180
3181 "bne 1b\n"
3182
3183 "2:"
3184
3185 // Load Aggregate Store: 6x3.
3186 "movi v0.8b, #0\n"
3187 "movi v1.8b, #0\n"
3188 "movi v2.8b, #0\n"
3189 "movi v3.8b, #0\n"
3190 "movi v4.8b, #0\n"
3191 "movi v5.8b, #0\n"
3192 "ld1 {v0.h}[0], [%x[in]], #2\n"
3193 "ld1 {v0.b}[2], [%x[in]], #1\n"
3194 "ld1 {v1.h}[0], [x0], #2\n"
3195 "ld1 {v1.b}[2], [x0], #1\n"
3196 "ld1 {v2.h}[0], [x1], #2\n"
3197 "ld1 {v2.b}[2], [x1], #1\n"
3198 "ld1 {v3.h}[0], [x2], #2\n"
3199 "ld1 {v3.b}[2], [x2], #1\n"
3200 "ld1 {v4.h}[0], [x3], #2\n"
3201 "ld1 {v4.b}[2], [x3], #1\n"
3202 "ld1 {v5.h}[0], [x4], #2\n"
3203 "ld1 {v5.b}[2], [x4], #1\n"
3204 "uaddw v8.8h, v8.8h, v0.8b\n"
3205 "uaddw v9.8h, v9.8h, v1.8b\n"
3206 "uaddw v10.8h, v10.8h, v2.8b\n"
3207 "uaddw v11.8h, v11.8h, v3.8b\n"
3208 "uaddw v12.8h, v12.8h, v4.8b\n"
3209 "uaddw v13.8h, v13.8h, v5.8b\n"
3210 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
3211 "st1 {v4.2s, v5.2s}, [%x[out]], #16\n"
3212
3213 // Aggregator Reduction.
3214 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
3215 "dup v1.4s, %w[additive_sum_offset]\n"
3216 "uaddlp v8.4s, v8.8h\n"
3217 "uaddlp v9.4s, v9.8h\n"
3218 "uaddlp v10.4s, v10.8h\n"
3219 "uaddlp v11.4s, v11.8h\n"
3220 "uaddlp v12.4s, v12.8h\n"
3221 "uaddlp v13.4s, v13.8h\n"
3222 "addp v8.4s, v8.4s, v9.4s\n"
3223 "addp v10.4s, v10.4s, v11.4s\n"
3224 "addp v12.4s, v12.4s, v13.4s\n"
3225 "addp v8.4s, v8.4s, v10.4s\n"
3226 "addp v9.4s, v12.4s, v12.4s\n"
3227 "mul v8.4s, v8.4s, v0.s[0]\n"
3228 "mul v9.4s, v9.4s, v0.s[0]\n"
3229 "add v8.4s, v8.4s, v1.4s\n"
3230 "add v9.4s, v9.4s, v1.4s\n"
3231 "st1 {v8.4s, v9.4s}, [%x[out]]\n"
3232 : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
3233 : [stride] "r"(params.stride),
3234 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
3235 [additive_sum_offset] "r"(params.additive_sum_offset)
3236 : "x0", "x1", "x2", "x3", "x4", "v0", "v1", "v2", "v3", "v4", "v5", "v8",
3237 "v9", "v10", "v11", "v12", "v13", "cc", "memory");
3238 }
3239
3240 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)3241 inline void Stream<uint8_t, 6, 8, 4, RowMajorWithSum>::Pack(
3242 const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
3243 #ifdef DEBUG
3244 #ifdef DEBUG_METAGEMM_VERBOSE
3245 std::cout << __FILE__ << "(" << __LINE__
3246 << ") RowMajorWithSum<uint8_t, 6, 8, 4, RowMajorWithSum>::Pack()"
3247 << std::endl
3248 << std::flush;
3249 #endif
3250 #endif
3251 int params_count_copy = params.count;
3252 asm volatile(
3253 "add x0, %x[in], %x[stride]\n"
3254 "add x1, x0, %x[stride]\n"
3255 "add x2, x1, %x[stride]\n"
3256 "add x3, x2, %x[stride]\n"
3257 "add x4, x3, %x[stride]\n"
3258 "movi v8.8h, #0\n"
3259 "movi v9.8h, #0\n"
3260 "movi v10.8h, #0\n"
3261 "movi v11.8h, #0\n"
3262 "movi v12.8h, #0\n"
3263 "movi v13.8h, #0\n"
3264
3265 // Reduce count by leftovers.
3266 "subs %x[count], %x[count], #4\n"
3267 "beq 2f\n"
3268
3269 "1:"
3270 "subs %x[count], %x[count], #8\n"
3271
3272 // Load Aggregate Store: 6x8.
3273 "ld1 {v0.2s}, [%x[in]], #8\n"
3274 "ld1 {v1.2s}, [x0], #8\n"
3275 "ld1 {v2.2s}, [x1], #8\n"
3276 "ld1 {v3.2s}, [x2], #8\n"
3277 "ld1 {v4.2s}, [x3], #8\n"
3278 "ld1 {v5.2s}, [x4], #8\n"
3279 "uaddw v8.8h, v8.8h, v0.8b\n"
3280 "uaddw v9.8h, v9.8h, v1.8b\n"
3281 "uaddw v10.8h, v10.8h, v2.8b\n"
3282 "uaddw v11.8h, v11.8h, v3.8b\n"
3283 "uaddw v12.8h, v12.8h, v4.8b\n"
3284 "uaddw v13.8h, v13.8h, v5.8b\n"
3285 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
3286 "st1 {v4.2s, v5.2s}, [%x[out]], #16\n"
3287
3288 "bne 1b\n"
3289
3290 "2:"
3291
3292 // Load Aggregate Store: 6x4.
3293 "movi v0.8b, #0\n"
3294 "movi v1.8b, #0\n"
3295 "movi v2.8b, #0\n"
3296 "movi v3.8b, #0\n"
3297 "movi v4.8b, #0\n"
3298 "movi v5.8b, #0\n"
3299 "ld1 {v0.s}[0], [%x[in]], #4\n"
3300 "ld1 {v1.s}[0], [x0], #4\n"
3301 "ld1 {v2.s}[0], [x1], #4\n"
3302 "ld1 {v3.s}[0], [x2], #4\n"
3303 "ld1 {v4.s}[0], [x3], #4\n"
3304 "ld1 {v5.s}[0], [x4], #4\n"
3305 "uaddw v8.8h, v8.8h, v0.8b\n"
3306 "uaddw v9.8h, v9.8h, v1.8b\n"
3307 "uaddw v10.8h, v10.8h, v2.8b\n"
3308 "uaddw v11.8h, v11.8h, v3.8b\n"
3309 "uaddw v12.8h, v12.8h, v4.8b\n"
3310 "uaddw v13.8h, v13.8h, v5.8b\n"
3311 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
3312 "st1 {v4.2s, v5.2s}, [%x[out]], #16\n"
3313
3314 // Aggregator Reduction.
3315 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
3316 "dup v1.4s, %w[additive_sum_offset]\n"
3317 "uaddlp v8.4s, v8.8h\n"
3318 "uaddlp v9.4s, v9.8h\n"
3319 "uaddlp v10.4s, v10.8h\n"
3320 "uaddlp v11.4s, v11.8h\n"
3321 "uaddlp v12.4s, v12.8h\n"
3322 "uaddlp v13.4s, v13.8h\n"
3323 "addp v8.4s, v8.4s, v9.4s\n"
3324 "addp v10.4s, v10.4s, v11.4s\n"
3325 "addp v12.4s, v12.4s, v13.4s\n"
3326 "addp v8.4s, v8.4s, v10.4s\n"
3327 "addp v9.4s, v12.4s, v12.4s\n"
3328 "mul v8.4s, v8.4s, v0.s[0]\n"
3329 "mul v9.4s, v9.4s, v0.s[0]\n"
3330 "add v8.4s, v8.4s, v1.4s\n"
3331 "add v9.4s, v9.4s, v1.4s\n"
3332 "st1 {v8.4s, v9.4s}, [%x[out]]\n"
3333 : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
3334 : [stride] "r"(params.stride),
3335 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
3336 [additive_sum_offset] "r"(params.additive_sum_offset)
3337 : "x0", "x1", "x2", "x3", "x4", "v0", "v1", "v2", "v3", "v4", "v5", "v8",
3338 "v9", "v10", "v11", "v12", "v13", "cc", "memory");
3339 }
3340
3341 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)3342 inline void Stream<uint8_t, 6, 8, 5, RowMajorWithSum>::Pack(
3343 const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
3344 #ifdef DEBUG
3345 #ifdef DEBUG_METAGEMM_VERBOSE
3346 std::cout << __FILE__ << "(" << __LINE__
3347 << ") RowMajorWithSum<uint8_t, 6, 8, 5, RowMajorWithSum>::Pack()"
3348 << std::endl
3349 << std::flush;
3350 #endif
3351 #endif
3352 int params_count_copy = params.count;
3353 asm volatile(
3354 "add x0, %x[in], %x[stride]\n"
3355 "add x1, x0, %x[stride]\n"
3356 "add x2, x1, %x[stride]\n"
3357 "add x3, x2, %x[stride]\n"
3358 "add x4, x3, %x[stride]\n"
3359 "movi v8.8h, #0\n"
3360 "movi v9.8h, #0\n"
3361 "movi v10.8h, #0\n"
3362 "movi v11.8h, #0\n"
3363 "movi v12.8h, #0\n"
3364 "movi v13.8h, #0\n"
3365
3366 // Reduce count by leftovers.
3367 "subs %x[count], %x[count], #5\n"
3368 "beq 2f\n"
3369
3370 "1:"
3371 "subs %x[count], %x[count], #8\n"
3372
3373 // Load Aggregate Store: 6x8.
3374 "ld1 {v0.2s}, [%x[in]], #8\n"
3375 "ld1 {v1.2s}, [x0], #8\n"
3376 "ld1 {v2.2s}, [x1], #8\n"
3377 "ld1 {v3.2s}, [x2], #8\n"
3378 "ld1 {v4.2s}, [x3], #8\n"
3379 "ld1 {v5.2s}, [x4], #8\n"
3380 "uaddw v8.8h, v8.8h, v0.8b\n"
3381 "uaddw v9.8h, v9.8h, v1.8b\n"
3382 "uaddw v10.8h, v10.8h, v2.8b\n"
3383 "uaddw v11.8h, v11.8h, v3.8b\n"
3384 "uaddw v12.8h, v12.8h, v4.8b\n"
3385 "uaddw v13.8h, v13.8h, v5.8b\n"
3386 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
3387 "st1 {v4.2s, v5.2s}, [%x[out]], #16\n"
3388
3389 "bne 1b\n"
3390
3391 "2:"
3392
3393 // Load Aggregate Store: 6x5.
3394 "movi v0.8b, #0\n"
3395 "movi v1.8b, #0\n"
3396 "movi v2.8b, #0\n"
3397 "movi v3.8b, #0\n"
3398 "movi v4.8b, #0\n"
3399 "movi v5.8b, #0\n"
3400 "ld1 {v0.s}[0], [%x[in]], #4\n"
3401 "ld1 {v0.b}[4], [%x[in]], #1\n"
3402 "ld1 {v1.s}[0], [x0], #4\n"
3403 "ld1 {v1.b}[4], [x0], #1\n"
3404 "ld1 {v2.s}[0], [x1], #4\n"
3405 "ld1 {v2.b}[4], [x1], #1\n"
3406 "ld1 {v3.s}[0], [x2], #4\n"
3407 "ld1 {v3.b}[4], [x2], #1\n"
3408 "ld1 {v4.s}[0], [x3], #4\n"
3409 "ld1 {v4.b}[4], [x3], #1\n"
3410 "ld1 {v5.s}[0], [x4], #4\n"
3411 "ld1 {v5.b}[4], [x4], #1\n"
3412 "uaddw v8.8h, v8.8h, v0.8b\n"
3413 "uaddw v9.8h, v9.8h, v1.8b\n"
3414 "uaddw v10.8h, v10.8h, v2.8b\n"
3415 "uaddw v11.8h, v11.8h, v3.8b\n"
3416 "uaddw v12.8h, v12.8h, v4.8b\n"
3417 "uaddw v13.8h, v13.8h, v5.8b\n"
3418 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
3419 "st1 {v4.2s, v5.2s}, [%x[out]], #16\n"
3420
3421 // Aggregator Reduction.
3422 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
3423 "dup v1.4s, %w[additive_sum_offset]\n"
3424 "uaddlp v8.4s, v8.8h\n"
3425 "uaddlp v9.4s, v9.8h\n"
3426 "uaddlp v10.4s, v10.8h\n"
3427 "uaddlp v11.4s, v11.8h\n"
3428 "uaddlp v12.4s, v12.8h\n"
3429 "uaddlp v13.4s, v13.8h\n"
3430 "addp v8.4s, v8.4s, v9.4s\n"
3431 "addp v10.4s, v10.4s, v11.4s\n"
3432 "addp v12.4s, v12.4s, v13.4s\n"
3433 "addp v8.4s, v8.4s, v10.4s\n"
3434 "addp v9.4s, v12.4s, v12.4s\n"
3435 "mul v8.4s, v8.4s, v0.s[0]\n"
3436 "mul v9.4s, v9.4s, v0.s[0]\n"
3437 "add v8.4s, v8.4s, v1.4s\n"
3438 "add v9.4s, v9.4s, v1.4s\n"
3439 "st1 {v8.4s, v9.4s}, [%x[out]]\n"
3440 : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
3441 : [stride] "r"(params.stride),
3442 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
3443 [additive_sum_offset] "r"(params.additive_sum_offset)
3444 : "x0", "x1", "x2", "x3", "x4", "v0", "v1", "v2", "v3", "v4", "v5", "v8",
3445 "v9", "v10", "v11", "v12", "v13", "cc", "memory");
3446 }
3447
3448 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)3449 inline void Stream<uint8_t, 6, 8, 6, RowMajorWithSum>::Pack(
3450 const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
3451 #ifdef DEBUG
3452 #ifdef DEBUG_METAGEMM_VERBOSE
3453 std::cout << __FILE__ << "(" << __LINE__
3454 << ") RowMajorWithSum<uint8_t, 6, 8, 6, RowMajorWithSum>::Pack()"
3455 << std::endl
3456 << std::flush;
3457 #endif
3458 #endif
3459 int params_count_copy = params.count;
3460 asm volatile(
3461 "add x0, %x[in], %x[stride]\n"
3462 "add x1, x0, %x[stride]\n"
3463 "add x2, x1, %x[stride]\n"
3464 "add x3, x2, %x[stride]\n"
3465 "add x4, x3, %x[stride]\n"
3466 "movi v8.8h, #0\n"
3467 "movi v9.8h, #0\n"
3468 "movi v10.8h, #0\n"
3469 "movi v11.8h, #0\n"
3470 "movi v12.8h, #0\n"
3471 "movi v13.8h, #0\n"
3472
3473 // Reduce count by leftovers.
3474 "subs %x[count], %x[count], #6\n"
3475 "beq 2f\n"
3476
3477 "1:"
3478 "subs %x[count], %x[count], #8\n"
3479
3480 // Load Aggregate Store: 6x8.
3481 "ld1 {v0.2s}, [%x[in]], #8\n"
3482 "ld1 {v1.2s}, [x0], #8\n"
3483 "ld1 {v2.2s}, [x1], #8\n"
3484 "ld1 {v3.2s}, [x2], #8\n"
3485 "ld1 {v4.2s}, [x3], #8\n"
3486 "ld1 {v5.2s}, [x4], #8\n"
3487 "uaddw v8.8h, v8.8h, v0.8b\n"
3488 "uaddw v9.8h, v9.8h, v1.8b\n"
3489 "uaddw v10.8h, v10.8h, v2.8b\n"
3490 "uaddw v11.8h, v11.8h, v3.8b\n"
3491 "uaddw v12.8h, v12.8h, v4.8b\n"
3492 "uaddw v13.8h, v13.8h, v5.8b\n"
3493 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
3494 "st1 {v4.2s, v5.2s}, [%x[out]], #16\n"
3495
3496 "bne 1b\n"
3497
3498 "2:"
3499
3500 // Load Aggregate Store: 6x6.
3501 "movi v0.8b, #0\n"
3502 "movi v1.8b, #0\n"
3503 "movi v2.8b, #0\n"
3504 "movi v3.8b, #0\n"
3505 "movi v4.8b, #0\n"
3506 "movi v5.8b, #0\n"
3507 "ld1 {v0.s}[0], [%x[in]], #4\n"
3508 "ld1 {v0.h}[2], [%x[in]], #2\n"
3509 "ld1 {v1.s}[0], [x0], #4\n"
3510 "ld1 {v1.h}[2], [x0], #2\n"
3511 "ld1 {v2.s}[0], [x1], #4\n"
3512 "ld1 {v2.h}[2], [x1], #2\n"
3513 "ld1 {v3.s}[0], [x2], #4\n"
3514 "ld1 {v3.h}[2], [x2], #2\n"
3515 "ld1 {v4.s}[0], [x3], #4\n"
3516 "ld1 {v4.h}[2], [x3], #2\n"
3517 "ld1 {v5.s}[0], [x4], #4\n"
3518 "ld1 {v5.h}[2], [x4], #2\n"
3519 "uaddw v8.8h, v8.8h, v0.8b\n"
3520 "uaddw v9.8h, v9.8h, v1.8b\n"
3521 "uaddw v10.8h, v10.8h, v2.8b\n"
3522 "uaddw v11.8h, v11.8h, v3.8b\n"
3523 "uaddw v12.8h, v12.8h, v4.8b\n"
3524 "uaddw v13.8h, v13.8h, v5.8b\n"
3525 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
3526 "st1 {v4.2s, v5.2s}, [%x[out]], #16\n"
3527
3528 // Aggregator Reduction.
3529 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
3530 "dup v1.4s, %w[additive_sum_offset]\n"
3531 "uaddlp v8.4s, v8.8h\n"
3532 "uaddlp v9.4s, v9.8h\n"
3533 "uaddlp v10.4s, v10.8h\n"
3534 "uaddlp v11.4s, v11.8h\n"
3535 "uaddlp v12.4s, v12.8h\n"
3536 "uaddlp v13.4s, v13.8h\n"
3537 "addp v8.4s, v8.4s, v9.4s\n"
3538 "addp v10.4s, v10.4s, v11.4s\n"
3539 "addp v12.4s, v12.4s, v13.4s\n"
3540 "addp v8.4s, v8.4s, v10.4s\n"
3541 "addp v9.4s, v12.4s, v12.4s\n"
3542 "mul v8.4s, v8.4s, v0.s[0]\n"
3543 "mul v9.4s, v9.4s, v0.s[0]\n"
3544 "add v8.4s, v8.4s, v1.4s\n"
3545 "add v9.4s, v9.4s, v1.4s\n"
3546 "st1 {v8.4s, v9.4s}, [%x[out]]\n"
3547 : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
3548 : [stride] "r"(params.stride),
3549 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
3550 [additive_sum_offset] "r"(params.additive_sum_offset)
3551 : "x0", "x1", "x2", "x3", "x4", "v0", "v1", "v2", "v3", "v4", "v5", "v8",
3552 "v9", "v10", "v11", "v12", "v13", "cc", "memory");
3553 }
3554
3555 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)3556 inline void Stream<uint8_t, 6, 8, 7, RowMajorWithSum>::Pack(
3557 const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
3558 #ifdef DEBUG
3559 #ifdef DEBUG_METAGEMM_VERBOSE
3560 std::cout << __FILE__ << "(" << __LINE__
3561 << ") RowMajorWithSum<uint8_t, 6, 8, 7, RowMajorWithSum>::Pack()"
3562 << std::endl
3563 << std::flush;
3564 #endif
3565 #endif
3566 int params_count_copy = params.count;
3567 asm volatile(
3568 "add x0, %x[in], %x[stride]\n"
3569 "add x1, x0, %x[stride]\n"
3570 "add x2, x1, %x[stride]\n"
3571 "add x3, x2, %x[stride]\n"
3572 "add x4, x3, %x[stride]\n"
3573 "movi v8.8h, #0\n"
3574 "movi v9.8h, #0\n"
3575 "movi v10.8h, #0\n"
3576 "movi v11.8h, #0\n"
3577 "movi v12.8h, #0\n"
3578 "movi v13.8h, #0\n"
3579
3580 // Reduce count by leftovers.
3581 "subs %x[count], %x[count], #7\n"
3582 "beq 2f\n"
3583
3584 "1:"
3585 "subs %x[count], %x[count], #8\n"
3586
3587 // Load Aggregate Store: 6x8.
3588 "ld1 {v0.2s}, [%x[in]], #8\n"
3589 "ld1 {v1.2s}, [x0], #8\n"
3590 "ld1 {v2.2s}, [x1], #8\n"
3591 "ld1 {v3.2s}, [x2], #8\n"
3592 "ld1 {v4.2s}, [x3], #8\n"
3593 "ld1 {v5.2s}, [x4], #8\n"
3594 "uaddw v8.8h, v8.8h, v0.8b\n"
3595 "uaddw v9.8h, v9.8h, v1.8b\n"
3596 "uaddw v10.8h, v10.8h, v2.8b\n"
3597 "uaddw v11.8h, v11.8h, v3.8b\n"
3598 "uaddw v12.8h, v12.8h, v4.8b\n"
3599 "uaddw v13.8h, v13.8h, v5.8b\n"
3600 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
3601 "st1 {v4.2s, v5.2s}, [%x[out]], #16\n"
3602
3603 "bne 1b\n"
3604
3605 "2:"
3606
3607 // Load Aggregate Store: 6x7.
3608 "movi v0.8b, #0\n"
3609 "movi v1.8b, #0\n"
3610 "movi v2.8b, #0\n"
3611 "movi v3.8b, #0\n"
3612 "movi v4.8b, #0\n"
3613 "movi v5.8b, #0\n"
3614 "ld1 {v0.s}[0], [%x[in]], #4\n"
3615 "ld1 {v0.h}[2], [%x[in]], #2\n"
3616 "ld1 {v0.b}[6], [%x[in]], #1\n"
3617 "ld1 {v1.s}[0], [x0], #4\n"
3618 "ld1 {v1.h}[2], [x0], #2\n"
3619 "ld1 {v1.b}[6], [x0], #1\n"
3620 "ld1 {v2.s}[0], [x1], #4\n"
3621 "ld1 {v2.h}[2], [x1], #2\n"
3622 "ld1 {v2.b}[6], [x1], #1\n"
3623 "ld1 {v3.s}[0], [x2], #4\n"
3624 "ld1 {v3.h}[2], [x2], #2\n"
3625 "ld1 {v3.b}[6], [x2], #1\n"
3626 "ld1 {v4.s}[0], [x3], #4\n"
3627 "ld1 {v4.h}[2], [x3], #2\n"
3628 "ld1 {v4.b}[6], [x3], #1\n"
3629 "ld1 {v5.s}[0], [x4], #4\n"
3630 "ld1 {v5.h}[2], [x4], #2\n"
3631 "ld1 {v5.b}[6], [x4], #1\n"
3632 "uaddw v8.8h, v8.8h, v0.8b\n"
3633 "uaddw v9.8h, v9.8h, v1.8b\n"
3634 "uaddw v10.8h, v10.8h, v2.8b\n"
3635 "uaddw v11.8h, v11.8h, v3.8b\n"
3636 "uaddw v12.8h, v12.8h, v4.8b\n"
3637 "uaddw v13.8h, v13.8h, v5.8b\n"
3638 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
3639 "st1 {v4.2s, v5.2s}, [%x[out]], #16\n"
3640
3641 // Aggregator Reduction.
3642 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
3643 "dup v1.4s, %w[additive_sum_offset]\n"
3644 "uaddlp v8.4s, v8.8h\n"
3645 "uaddlp v9.4s, v9.8h\n"
3646 "uaddlp v10.4s, v10.8h\n"
3647 "uaddlp v11.4s, v11.8h\n"
3648 "uaddlp v12.4s, v12.8h\n"
3649 "uaddlp v13.4s, v13.8h\n"
3650 "addp v8.4s, v8.4s, v9.4s\n"
3651 "addp v10.4s, v10.4s, v11.4s\n"
3652 "addp v12.4s, v12.4s, v13.4s\n"
3653 "addp v8.4s, v8.4s, v10.4s\n"
3654 "addp v9.4s, v12.4s, v12.4s\n"
3655 "mul v8.4s, v8.4s, v0.s[0]\n"
3656 "mul v9.4s, v9.4s, v0.s[0]\n"
3657 "add v8.4s, v8.4s, v1.4s\n"
3658 "add v9.4s, v9.4s, v1.4s\n"
3659 "st1 {v8.4s, v9.4s}, [%x[out]]\n"
3660 : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
3661 : [stride] "r"(params.stride),
3662 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
3663 [additive_sum_offset] "r"(params.additive_sum_offset)
3664 : "x0", "x1", "x2", "x3", "x4", "v0", "v1", "v2", "v3", "v4", "v5", "v8",
3665 "v9", "v10", "v11", "v12", "v13", "cc", "memory");
3666 }
3667
3668 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)3669 inline void Stream<uint8_t, 7, 8, 0, RowMajorWithSum>::Pack(
3670 const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
3671 #ifdef DEBUG
3672 #ifdef DEBUG_METAGEMM_VERBOSE
3673 std::cout << __FILE__ << "(" << __LINE__
3674 << ") RowMajorWithSum<uint8_t, 7, 8, 0, RowMajorWithSum>::Pack()"
3675 << std::endl
3676 << std::flush;
3677 #endif
3678 #endif
3679 int params_count_copy = params.count;
3680 asm volatile(
3681 "add x0, %x[in], %x[stride]\n"
3682 "add x1, x0, %x[stride]\n"
3683 "add x2, x1, %x[stride]\n"
3684 "add x3, x2, %x[stride]\n"
3685 "add x4, x3, %x[stride]\n"
3686 "add x5, x4, %x[stride]\n"
3687 "movi v8.8h, #0\n"
3688 "movi v9.8h, #0\n"
3689 "movi v10.8h, #0\n"
3690 "movi v11.8h, #0\n"
3691 "movi v12.8h, #0\n"
3692 "movi v13.8h, #0\n"
3693 "movi v14.8h, #0\n"
3694
3695 "1:"
3696 "subs %x[count], %x[count], #8\n"
3697
3698 // Load Aggregate Store: 7x8.
3699 "ld1 {v0.2s}, [%x[in]], #8\n"
3700 "ld1 {v1.2s}, [x0], #8\n"
3701 "ld1 {v2.2s}, [x1], #8\n"
3702 "ld1 {v3.2s}, [x2], #8\n"
3703 "ld1 {v4.2s}, [x3], #8\n"
3704 "ld1 {v5.2s}, [x4], #8\n"
3705 "ld1 {v6.2s}, [x5], #8\n"
3706 "uaddw v8.8h, v8.8h, v0.8b\n"
3707 "uaddw v9.8h, v9.8h, v1.8b\n"
3708 "uaddw v10.8h, v10.8h, v2.8b\n"
3709 "uaddw v11.8h, v11.8h, v3.8b\n"
3710 "uaddw v12.8h, v12.8h, v4.8b\n"
3711 "uaddw v13.8h, v13.8h, v5.8b\n"
3712 "uaddw v14.8h, v14.8h, v6.8b\n"
3713 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
3714 "st1 {v4.2s, v5.2s, v6.2s}, [%x[out]], #24\n"
3715
3716 "bne 1b\n"
3717
3718 // Aggregator Reduction.
3719 "ldr w0, %[multiplicative_sum_offset]\n"
3720 "ldr w1, %[additive_sum_offset]\n"
3721 "mov v0.s[0], w0\n"
3722 "dup v1.4s, w1\n"
3723 "uaddlp v8.4s, v8.8h\n"
3724 "uaddlp v9.4s, v9.8h\n"
3725 "uaddlp v10.4s, v10.8h\n"
3726 "uaddlp v11.4s, v11.8h\n"
3727 "uaddlp v12.4s, v12.8h\n"
3728 "uaddlp v13.4s, v13.8h\n"
3729 "uaddlp v14.4s, v14.8h\n"
3730 "addp v8.4s, v8.4s, v9.4s\n"
3731 "addp v10.4s, v10.4s, v11.4s\n"
3732 "addp v12.4s, v12.4s, v13.4s\n"
3733 "addp v14.4s, v14.4s, v14.4s\n"
3734 "addp v8.4s, v8.4s, v10.4s\n"
3735 "addp v9.4s, v12.4s, v14.4s\n"
3736 "mul v8.4s, v8.4s, v0.s[0]\n"
3737 "mul v9.4s, v9.4s, v0.s[0]\n"
3738 "add v8.4s, v8.4s, v1.4s\n"
3739 "add v9.4s, v9.4s, v1.4s\n"
3740 "st1 {v8.4s, v9.4s}, [%x[out]]\n"
3741 : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
3742 : [stride] "r"(params.stride),
3743 [multiplicative_sum_offset] "m"(params.multiplicative_sum_offset),
3744 [additive_sum_offset] "m"(params.additive_sum_offset)
3745 : "x0", "x1", "x2", "x3", "x4", "x5", "v0", "v1", "v2", "v3", "v4", "v5",
3746 "v6", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "cc", "memory");
3747 }
3748
3749 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)3750 inline void Stream<uint8_t, 7, 8, 1, RowMajorWithSum>::Pack(
3751 const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
3752 #ifdef DEBUG
3753 #ifdef DEBUG_METAGEMM_VERBOSE
3754 std::cout << __FILE__ << "(" << __LINE__
3755 << ") RowMajorWithSum<uint8_t, 7, 8, 1, RowMajorWithSum>::Pack()"
3756 << std::endl
3757 << std::flush;
3758 #endif
3759 #endif
3760 int params_count_copy = params.count;
3761 asm volatile(
3762 "add x0, %x[in], %x[stride]\n"
3763 "add x1, x0, %x[stride]\n"
3764 "add x2, x1, %x[stride]\n"
3765 "add x3, x2, %x[stride]\n"
3766 "add x4, x3, %x[stride]\n"
3767 "add x5, x4, %x[stride]\n"
3768 "movi v8.8h, #0\n"
3769 "movi v9.8h, #0\n"
3770 "movi v10.8h, #0\n"
3771 "movi v11.8h, #0\n"
3772 "movi v12.8h, #0\n"
3773 "movi v13.8h, #0\n"
3774 "movi v14.8h, #0\n"
3775
3776 // Reduce count by leftovers.
3777 "subs %x[count], %x[count], #1\n"
3778 "beq 2f\n"
3779
3780 "1:"
3781 "subs %x[count], %x[count], #8\n"
3782
3783 // Load Aggregate Store: 7x8.
3784 "ld1 {v0.2s}, [%x[in]], #8\n"
3785 "ld1 {v1.2s}, [x0], #8\n"
3786 "ld1 {v2.2s}, [x1], #8\n"
3787 "ld1 {v3.2s}, [x2], #8\n"
3788 "ld1 {v4.2s}, [x3], #8\n"
3789 "ld1 {v5.2s}, [x4], #8\n"
3790 "ld1 {v6.2s}, [x5], #8\n"
3791 "uaddw v8.8h, v8.8h, v0.8b\n"
3792 "uaddw v9.8h, v9.8h, v1.8b\n"
3793 "uaddw v10.8h, v10.8h, v2.8b\n"
3794 "uaddw v11.8h, v11.8h, v3.8b\n"
3795 "uaddw v12.8h, v12.8h, v4.8b\n"
3796 "uaddw v13.8h, v13.8h, v5.8b\n"
3797 "uaddw v14.8h, v14.8h, v6.8b\n"
3798 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
3799 "st1 {v4.2s, v5.2s, v6.2s}, [%x[out]], #24\n"
3800
3801 "bne 1b\n"
3802
3803 "2:"
3804
3805 // Load Aggregate Store: 7x1.
3806 "movi v0.8b, #0\n"
3807 "movi v1.8b, #0\n"
3808 "movi v2.8b, #0\n"
3809 "movi v3.8b, #0\n"
3810 "movi v4.8b, #0\n"
3811 "movi v5.8b, #0\n"
3812 "movi v6.8b, #0\n"
3813 "ld1 {v0.b}[0], [%x[in]], #1\n"
3814 "ld1 {v1.b}[0], [x0], #1\n"
3815 "ld1 {v2.b}[0], [x1], #1\n"
3816 "ld1 {v3.b}[0], [x2], #1\n"
3817 "ld1 {v4.b}[0], [x3], #1\n"
3818 "ld1 {v5.b}[0], [x4], #1\n"
3819 "ld1 {v6.b}[0], [x5], #1\n"
3820 "uaddw v8.8h, v8.8h, v0.8b\n"
3821 "uaddw v9.8h, v9.8h, v1.8b\n"
3822 "uaddw v10.8h, v10.8h, v2.8b\n"
3823 "uaddw v11.8h, v11.8h, v3.8b\n"
3824 "uaddw v12.8h, v12.8h, v4.8b\n"
3825 "uaddw v13.8h, v13.8h, v5.8b\n"
3826 "uaddw v14.8h, v14.8h, v6.8b\n"
3827 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
3828 "st1 {v4.2s, v5.2s, v6.2s}, [%x[out]], #24\n"
3829
3830 // Aggregator Reduction.
3831 "ldr w0, %[multiplicative_sum_offset]\n"
3832 "ldr w1, %[additive_sum_offset]\n"
3833 "mov v0.s[0], w0\n"
3834 "dup v1.4s, w1\n"
3835 "uaddlp v8.4s, v8.8h\n"
3836 "uaddlp v9.4s, v9.8h\n"
3837 "uaddlp v10.4s, v10.8h\n"
3838 "uaddlp v11.4s, v11.8h\n"
3839 "uaddlp v12.4s, v12.8h\n"
3840 "uaddlp v13.4s, v13.8h\n"
3841 "uaddlp v14.4s, v14.8h\n"
3842 "addp v8.4s, v8.4s, v9.4s\n"
3843 "addp v10.4s, v10.4s, v11.4s\n"
3844 "addp v12.4s, v12.4s, v13.4s\n"
3845 "addp v14.4s, v14.4s, v14.4s\n"
3846 "addp v8.4s, v8.4s, v10.4s\n"
3847 "addp v9.4s, v12.4s, v14.4s\n"
3848 "mul v8.4s, v8.4s, v0.s[0]\n"
3849 "mul v9.4s, v9.4s, v0.s[0]\n"
3850 "add v8.4s, v8.4s, v1.4s\n"
3851 "add v9.4s, v9.4s, v1.4s\n"
3852 "st1 {v8.4s, v9.4s}, [%x[out]]\n"
3853 : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
3854 : [stride] "r"(params.stride),
3855 [multiplicative_sum_offset] "m"(params.multiplicative_sum_offset),
3856 [additive_sum_offset] "m"(params.additive_sum_offset)
3857 : "x0", "x1", "x2", "x3", "x4", "x5", "v0", "v1", "v2", "v3", "v4", "v5",
3858 "v6", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "cc", "memory");
3859 }
3860
3861 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)3862 inline void Stream<uint8_t, 7, 8, 2, RowMajorWithSum>::Pack(
3863 const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
3864 #ifdef DEBUG
3865 #ifdef DEBUG_METAGEMM_VERBOSE
3866 std::cout << __FILE__ << "(" << __LINE__
3867 << ") RowMajorWithSum<uint8_t, 7, 8, 2, RowMajorWithSum>::Pack()"
3868 << std::endl
3869 << std::flush;
3870 #endif
3871 #endif
3872 int params_count_copy = params.count;
3873 asm volatile(
3874 "add x0, %x[in], %x[stride]\n"
3875 "add x1, x0, %x[stride]\n"
3876 "add x2, x1, %x[stride]\n"
3877 "add x3, x2, %x[stride]\n"
3878 "add x4, x3, %x[stride]\n"
3879 "add x5, x4, %x[stride]\n"
3880 "movi v8.8h, #0\n"
3881 "movi v9.8h, #0\n"
3882 "movi v10.8h, #0\n"
3883 "movi v11.8h, #0\n"
3884 "movi v12.8h, #0\n"
3885 "movi v13.8h, #0\n"
3886 "movi v14.8h, #0\n"
3887
3888 // Reduce count by leftovers.
3889 "subs %x[count], %x[count], #2\n"
3890 "beq 2f\n"
3891
3892 "1:"
3893 "subs %x[count], %x[count], #8\n"
3894
3895 // Load Aggregate Store: 7x8.
3896 "ld1 {v0.2s}, [%x[in]], #8\n"
3897 "ld1 {v1.2s}, [x0], #8\n"
3898 "ld1 {v2.2s}, [x1], #8\n"
3899 "ld1 {v3.2s}, [x2], #8\n"
3900 "ld1 {v4.2s}, [x3], #8\n"
3901 "ld1 {v5.2s}, [x4], #8\n"
3902 "ld1 {v6.2s}, [x5], #8\n"
3903 "uaddw v8.8h, v8.8h, v0.8b\n"
3904 "uaddw v9.8h, v9.8h, v1.8b\n"
3905 "uaddw v10.8h, v10.8h, v2.8b\n"
3906 "uaddw v11.8h, v11.8h, v3.8b\n"
3907 "uaddw v12.8h, v12.8h, v4.8b\n"
3908 "uaddw v13.8h, v13.8h, v5.8b\n"
3909 "uaddw v14.8h, v14.8h, v6.8b\n"
3910 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
3911 "st1 {v4.2s, v5.2s, v6.2s}, [%x[out]], #24\n"
3912
3913 "bne 1b\n"
3914
3915 "2:"
3916
3917 // Load Aggregate Store: 7x2.
3918 "movi v0.8b, #0\n"
3919 "movi v1.8b, #0\n"
3920 "movi v2.8b, #0\n"
3921 "movi v3.8b, #0\n"
3922 "movi v4.8b, #0\n"
3923 "movi v5.8b, #0\n"
3924 "movi v6.8b, #0\n"
3925 "ld1 {v0.h}[0], [%x[in]], #2\n"
3926 "ld1 {v1.h}[0], [x0], #2\n"
3927 "ld1 {v2.h}[0], [x1], #2\n"
3928 "ld1 {v3.h}[0], [x2], #2\n"
3929 "ld1 {v4.h}[0], [x3], #2\n"
3930 "ld1 {v5.h}[0], [x4], #2\n"
3931 "ld1 {v6.h}[0], [x5], #2\n"
3932 "uaddw v8.8h, v8.8h, v0.8b\n"
3933 "uaddw v9.8h, v9.8h, v1.8b\n"
3934 "uaddw v10.8h, v10.8h, v2.8b\n"
3935 "uaddw v11.8h, v11.8h, v3.8b\n"
3936 "uaddw v12.8h, v12.8h, v4.8b\n"
3937 "uaddw v13.8h, v13.8h, v5.8b\n"
3938 "uaddw v14.8h, v14.8h, v6.8b\n"
3939 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
3940 "st1 {v4.2s, v5.2s, v6.2s}, [%x[out]], #24\n"
3941
3942 // Aggregator Reduction.
3943 "ldr w0, %[multiplicative_sum_offset]\n"
3944 "ldr w1, %[additive_sum_offset]\n"
3945 "mov v0.s[0], w0\n"
3946 "dup v1.4s, w1\n"
3947 "uaddlp v8.4s, v8.8h\n"
3948 "uaddlp v9.4s, v9.8h\n"
3949 "uaddlp v10.4s, v10.8h\n"
3950 "uaddlp v11.4s, v11.8h\n"
3951 "uaddlp v12.4s, v12.8h\n"
3952 "uaddlp v13.4s, v13.8h\n"
3953 "uaddlp v14.4s, v14.8h\n"
3954 "addp v8.4s, v8.4s, v9.4s\n"
3955 "addp v10.4s, v10.4s, v11.4s\n"
3956 "addp v12.4s, v12.4s, v13.4s\n"
3957 "addp v14.4s, v14.4s, v14.4s\n"
3958 "addp v8.4s, v8.4s, v10.4s\n"
3959 "addp v9.4s, v12.4s, v14.4s\n"
3960 "mul v8.4s, v8.4s, v0.s[0]\n"
3961 "mul v9.4s, v9.4s, v0.s[0]\n"
3962 "add v8.4s, v8.4s, v1.4s\n"
3963 "add v9.4s, v9.4s, v1.4s\n"
3964 "st1 {v8.4s, v9.4s}, [%x[out]]\n"
3965 : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
3966 : [stride] "r"(params.stride),
3967 [multiplicative_sum_offset] "m"(params.multiplicative_sum_offset),
3968 [additive_sum_offset] "m"(params.additive_sum_offset)
3969 : "x0", "x1", "x2", "x3", "x4", "x5", "v0", "v1", "v2", "v3", "v4", "v5",
3970 "v6", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "cc", "memory");
3971 }
3972
// Pack() specialization for seven input rows whose column count leaves a
// 3-column tail after the whole 8-column chunks.
//
// in:     start of row 0; each of rows 1..6 starts params.stride bytes after
//         the previous row.
// params: count is the number of columns to pack. The control flow below only
//         terminates as intended when (count - 3) is zero or a positive
//         multiple of 8 -- presumably guaranteed by the dispatching code.
//         multiplicative_sum_offset and additive_sum_offset scale and bias
//         the per-row sums.
// out:    receives 56 bytes per chunk (8 bytes of row 0, then row 1, ...,
//         row 6); the 3-column tail is stored as one more chunk, zero-padded
//         to 8 columns. Eight 32-bit values follow: the sum of each row's
//         bytes times multiplicative_sum_offset plus additive_sum_offset
//         (the eighth value repeats the row 6 result).
3973 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)3974 inline void Stream<uint8_t, 7, 8, 3, RowMajorWithSum>::Pack(
3975 const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
// Optional trace output, compiled in only when both macros are defined.
3976 #ifdef DEBUG
3977 #ifdef DEBUG_METAGEMM_VERBOSE
3978 std::cout << __FILE__ << "(" << __LINE__
3979 << ") RowMajorWithSum<uint8_t, 7, 8, 3, RowMajorWithSum>::Pack()"
3980 << std::endl
3981 << std::flush;
3982 #endif
3983 #endif
// The asm decrements the counter in place, so it works on a local copy.
// NOTE(review): this is a 32-bit int accessed through the 64-bit %x operand
// modifier -- confirm the upper register half is always clean.
3984 int params_count_copy = params.count;
3985 asm volatile(
// x0..x5 = start of rows 1..6; row 0 is read through %x[in] itself.
3986 "add x0, %x[in], %x[stride]\n"
3987 "add x1, x0, %x[stride]\n"
3988 "add x2, x1, %x[stride]\n"
3989 "add x3, x2, %x[stride]\n"
3990 "add x4, x3, %x[stride]\n"
3991 "add x5, x4, %x[stride]\n"
// v8..v14: eight 16-bit running column sums for each of the seven rows.
// NOTE(review): a 16-bit lane wraps after more than 257 additions of 0xFF;
// confirm that count is bounded accordingly upstream.
3992 "movi v8.8h, #0\n"
3993 "movi v9.8h, #0\n"
3994 "movi v10.8h, #0\n"
3995 "movi v11.8h, #0\n"
3996 "movi v12.8h, #0\n"
3997 "movi v13.8h, #0\n"
3998 "movi v14.8h, #0\n"
3999
4000 // Reduce count by leftovers.
4001 "subs %x[count], %x[count], #3\n"
// Nothing but the tail to process: skip the main loop.
4002 "beq 2f\n"
4003
// Main loop, one 8-column chunk per pass. The flags produced by this subs
// are consumed by the bne at the bottom; nothing in between alters them.
4004 "1:"
4005 "subs %x[count], %x[count], #8\n"
4006
4007 // Load Aggregate Store: 7x8.
4008 "ld1 {v0.2s}, [%x[in]], #8\n"
4009 "ld1 {v1.2s}, [x0], #8\n"
4010 "ld1 {v2.2s}, [x1], #8\n"
4011 "ld1 {v3.2s}, [x2], #8\n"
4012 "ld1 {v4.2s}, [x3], #8\n"
4013 "ld1 {v5.2s}, [x4], #8\n"
4014 "ld1 {v6.2s}, [x5], #8\n"
// Widen every byte to 16 bits and add it to its row's running sums.
4015 "uaddw v8.8h, v8.8h, v0.8b\n"
4016 "uaddw v9.8h, v9.8h, v1.8b\n"
4017 "uaddw v10.8h, v10.8h, v2.8b\n"
4018 "uaddw v11.8h, v11.8h, v3.8b\n"
4019 "uaddw v12.8h, v12.8h, v4.8b\n"
4020 "uaddw v13.8h, v13.8h, v5.8b\n"
4021 "uaddw v14.8h, v14.8h, v6.8b\n"
// Emit the chunk: 7 rows x 8 bytes = 56 contiguous bytes.
4022 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
4023 "st1 {v4.2s, v5.2s, v6.2s}, [%x[out]], #24\n"
4024
4025 "bne 1b\n"
4026
4027 "2:"
4028
4029 // Load Aggregate Store: 7x3.
// Zero the staging registers so that the lanes not loaded below are stored
// as padding and contribute nothing to the sums.
4030 "movi v0.8b, #0\n"
4031 "movi v1.8b, #0\n"
4032 "movi v2.8b, #0\n"
4033 "movi v3.8b, #0\n"
4034 "movi v4.8b, #0\n"
4035 "movi v5.8b, #0\n"
4036 "movi v6.8b, #0\n"
// Fetch 3 bytes per row: a halfword into bytes 0-1, then a byte into lane 2.
4037 "ld1 {v0.h}[0], [%x[in]], #2\n"
4038 "ld1 {v0.b}[2], [%x[in]], #1\n"
4039 "ld1 {v1.h}[0], [x0], #2\n"
4040 "ld1 {v1.b}[2], [x0], #1\n"
4041 "ld1 {v2.h}[0], [x1], #2\n"
4042 "ld1 {v2.b}[2], [x1], #1\n"
4043 "ld1 {v3.h}[0], [x2], #2\n"
4044 "ld1 {v3.b}[2], [x2], #1\n"
4045 "ld1 {v4.h}[0], [x3], #2\n"
4046 "ld1 {v4.b}[2], [x3], #1\n"
4047 "ld1 {v5.h}[0], [x4], #2\n"
4048 "ld1 {v5.b}[2], [x4], #1\n"
4049 "ld1 {v6.h}[0], [x5], #2\n"
4050 "ld1 {v6.b}[2], [x5], #1\n"
4051 "uaddw v8.8h, v8.8h, v0.8b\n"
4052 "uaddw v9.8h, v9.8h, v1.8b\n"
4053 "uaddw v10.8h, v10.8h, v2.8b\n"
4054 "uaddw v11.8h, v11.8h, v3.8b\n"
4055 "uaddw v12.8h, v12.8h, v4.8b\n"
4056 "uaddw v13.8h, v13.8h, v5.8b\n"
4057 "uaddw v14.8h, v14.8h, v6.8b\n"
// The tail is written as a full 56-byte chunk, padding included.
4058 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
4059 "st1 {v4.2s, v5.2s, v6.2s}, [%x[out]], #24\n"
4060
4061 // Aggregator Reduction.
// x0/x1 are no longer used as row pointers; w0/w1 now carry the two scalars.
// v0.s[0] = multiplier, v1 = addend broadcast to all four lanes.
4062 "ldr w0, %[multiplicative_sum_offset]\n"
4063 "ldr w1, %[additive_sum_offset]\n"
4064 "mov v0.s[0], w0\n"
4065 "dup v1.4s, w1\n"
// Fold each row's eight 16-bit sums into four 32-bit sums.
4066 "uaddlp v8.4s, v8.8h\n"
4067 "uaddlp v9.4s, v9.8h\n"
4068 "uaddlp v10.4s, v10.8h\n"
4069 "uaddlp v11.4s, v11.8h\n"
4070 "uaddlp v12.4s, v12.8h\n"
4071 "uaddlp v13.4s, v13.8h\n"
4072 "uaddlp v14.4s, v14.8h\n"
// Two rounds of pairwise adds leave v8 = {row0, row1, row2, row3} and
// v9 = {row4, row5, row6, row6}; v14 is paired with itself, hence the repeat.
4073 "addp v8.4s, v8.4s, v9.4s\n"
4074 "addp v10.4s, v10.4s, v11.4s\n"
4075 "addp v12.4s, v12.4s, v13.4s\n"
4076 "addp v14.4s, v14.4s, v14.4s\n"
4077 "addp v8.4s, v8.4s, v10.4s\n"
4078 "addp v9.4s, v12.4s, v14.4s\n"
// result = row_sum * multiplicative_sum_offset + additive_sum_offset.
4079 "mul v8.4s, v8.4s, v0.s[0]\n"
4080 "mul v9.4s, v9.4s, v0.s[0]\n"
4081 "add v8.4s, v8.4s, v1.4s\n"
4082 "add v9.4s, v9.4s, v1.4s\n"
// Append the eight 32-bit results directly after the packed bytes.
4083 "st1 {v8.4s, v9.4s}, [%x[out]]\n"
// Read-write operands: the counter and both pointers are modified by the asm.
4084 : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
// Inputs: the stride lives in a register, the two offsets are read from memory.
4085 : [stride] "r"(params.stride),
4086 [multiplicative_sum_offset] "m"(params.multiplicative_sum_offset),
4087 [additive_sum_offset] "m"(params.additive_sum_offset)
// Clobbers: scratch row pointers, every vector register written above, the
// condition flags and memory.
4088 : "x0", "x1", "x2", "x3", "x4", "x5", "v0", "v1", "v2", "v3", "v4", "v5",
4089 "v6", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "cc", "memory");
4090 }
4091
// Pack() specialization for seven input rows whose column count leaves a
// 4-column tail after the whole 8-column chunks.
//
// in:     start of row 0; each of rows 1..6 starts params.stride bytes after
//         the previous row.
// params: count is the number of columns to pack. The control flow below only
//         terminates as intended when (count - 4) is zero or a positive
//         multiple of 8 -- presumably guaranteed by the dispatching code.
//         multiplicative_sum_offset and additive_sum_offset scale and bias
//         the per-row sums.
// out:    receives 56 bytes per chunk (8 bytes of row 0, then row 1, ...,
//         row 6); the 4-column tail is stored as one more chunk, zero-padded
//         to 8 columns. Eight 32-bit values follow: the sum of each row's
//         bytes times multiplicative_sum_offset plus additive_sum_offset
//         (the eighth value repeats the row 6 result).
4092 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)4093 inline void Stream<uint8_t, 7, 8, 4, RowMajorWithSum>::Pack(
4094 const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
// Optional trace output, compiled in only when both macros are defined.
4095 #ifdef DEBUG
4096 #ifdef DEBUG_METAGEMM_VERBOSE
4097 std::cout << __FILE__ << "(" << __LINE__
4098 << ") RowMajorWithSum<uint8_t, 7, 8, 4, RowMajorWithSum>::Pack()"
4099 << std::endl
4100 << std::flush;
4101 #endif
4102 #endif
// The asm decrements the counter in place, so it works on a local copy.
// NOTE(review): this is a 32-bit int accessed through the 64-bit %x operand
// modifier -- confirm the upper register half is always clean.
4103 int params_count_copy = params.count;
4104 asm volatile(
// x0..x5 = start of rows 1..6; row 0 is read through %x[in] itself.
4105 "add x0, %x[in], %x[stride]\n"
4106 "add x1, x0, %x[stride]\n"
4107 "add x2, x1, %x[stride]\n"
4108 "add x3, x2, %x[stride]\n"
4109 "add x4, x3, %x[stride]\n"
4110 "add x5, x4, %x[stride]\n"
// v8..v14: eight 16-bit running column sums for each of the seven rows.
// NOTE(review): a 16-bit lane wraps after more than 257 additions of 0xFF;
// confirm that count is bounded accordingly upstream.
4111 "movi v8.8h, #0\n"
4112 "movi v9.8h, #0\n"
4113 "movi v10.8h, #0\n"
4114 "movi v11.8h, #0\n"
4115 "movi v12.8h, #0\n"
4116 "movi v13.8h, #0\n"
4117 "movi v14.8h, #0\n"
4118
4119 // Reduce count by leftovers.
4120 "subs %x[count], %x[count], #4\n"
// Nothing but the tail to process: skip the main loop.
4121 "beq 2f\n"
4122
// Main loop, one 8-column chunk per pass. The flags produced by this subs
// are consumed by the bne at the bottom; nothing in between alters them.
4123 "1:"
4124 "subs %x[count], %x[count], #8\n"
4125
4126 // Load Aggregate Store: 7x8.
4127 "ld1 {v0.2s}, [%x[in]], #8\n"
4128 "ld1 {v1.2s}, [x0], #8\n"
4129 "ld1 {v2.2s}, [x1], #8\n"
4130 "ld1 {v3.2s}, [x2], #8\n"
4131 "ld1 {v4.2s}, [x3], #8\n"
4132 "ld1 {v5.2s}, [x4], #8\n"
4133 "ld1 {v6.2s}, [x5], #8\n"
// Widen every byte to 16 bits and add it to its row's running sums.
4134 "uaddw v8.8h, v8.8h, v0.8b\n"
4135 "uaddw v9.8h, v9.8h, v1.8b\n"
4136 "uaddw v10.8h, v10.8h, v2.8b\n"
4137 "uaddw v11.8h, v11.8h, v3.8b\n"
4138 "uaddw v12.8h, v12.8h, v4.8b\n"
4139 "uaddw v13.8h, v13.8h, v5.8b\n"
4140 "uaddw v14.8h, v14.8h, v6.8b\n"
// Emit the chunk: 7 rows x 8 bytes = 56 contiguous bytes.
4141 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
4142 "st1 {v4.2s, v5.2s, v6.2s}, [%x[out]], #24\n"
4143
4144 "bne 1b\n"
4145
4146 "2:"
4147
4148 // Load Aggregate Store: 7x4.
// Zero the staging registers so that the lanes not loaded below are stored
// as padding and contribute nothing to the sums.
4149 "movi v0.8b, #0\n"
4150 "movi v1.8b, #0\n"
4151 "movi v2.8b, #0\n"
4152 "movi v3.8b, #0\n"
4153 "movi v4.8b, #0\n"
4154 "movi v5.8b, #0\n"
4155 "movi v6.8b, #0\n"
// Fetch 4 bytes per row: one 32-bit word into bytes 0-3.
4156 "ld1 {v0.s}[0], [%x[in]], #4\n"
4157 "ld1 {v1.s}[0], [x0], #4\n"
4158 "ld1 {v2.s}[0], [x1], #4\n"
4159 "ld1 {v3.s}[0], [x2], #4\n"
4160 "ld1 {v4.s}[0], [x3], #4\n"
4161 "ld1 {v5.s}[0], [x4], #4\n"
4162 "ld1 {v6.s}[0], [x5], #4\n"
4163 "uaddw v8.8h, v8.8h, v0.8b\n"
4164 "uaddw v9.8h, v9.8h, v1.8b\n"
4165 "uaddw v10.8h, v10.8h, v2.8b\n"
4166 "uaddw v11.8h, v11.8h, v3.8b\n"
4167 "uaddw v12.8h, v12.8h, v4.8b\n"
4168 "uaddw v13.8h, v13.8h, v5.8b\n"
4169 "uaddw v14.8h, v14.8h, v6.8b\n"
// The tail is written as a full 56-byte chunk, padding included.
4170 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
4171 "st1 {v4.2s, v5.2s, v6.2s}, [%x[out]], #24\n"
4172
4173 // Aggregator Reduction.
// x0/x1 are no longer used as row pointers; w0/w1 now carry the two scalars.
// v0.s[0] = multiplier, v1 = addend broadcast to all four lanes.
4174 "ldr w0, %[multiplicative_sum_offset]\n"
4175 "ldr w1, %[additive_sum_offset]\n"
4176 "mov v0.s[0], w0\n"
4177 "dup v1.4s, w1\n"
// Fold each row's eight 16-bit sums into four 32-bit sums.
4178 "uaddlp v8.4s, v8.8h\n"
4179 "uaddlp v9.4s, v9.8h\n"
4180 "uaddlp v10.4s, v10.8h\n"
4181 "uaddlp v11.4s, v11.8h\n"
4182 "uaddlp v12.4s, v12.8h\n"
4183 "uaddlp v13.4s, v13.8h\n"
4184 "uaddlp v14.4s, v14.8h\n"
// Two rounds of pairwise adds leave v8 = {row0, row1, row2, row3} and
// v9 = {row4, row5, row6, row6}; v14 is paired with itself, hence the repeat.
4185 "addp v8.4s, v8.4s, v9.4s\n"
4186 "addp v10.4s, v10.4s, v11.4s\n"
4187 "addp v12.4s, v12.4s, v13.4s\n"
4188 "addp v14.4s, v14.4s, v14.4s\n"
4189 "addp v8.4s, v8.4s, v10.4s\n"
4190 "addp v9.4s, v12.4s, v14.4s\n"
// result = row_sum * multiplicative_sum_offset + additive_sum_offset.
4191 "mul v8.4s, v8.4s, v0.s[0]\n"
4192 "mul v9.4s, v9.4s, v0.s[0]\n"
4193 "add v8.4s, v8.4s, v1.4s\n"
4194 "add v9.4s, v9.4s, v1.4s\n"
// Append the eight 32-bit results directly after the packed bytes.
4195 "st1 {v8.4s, v9.4s}, [%x[out]]\n"
// Read-write operands: the counter and both pointers are modified by the asm.
4196 : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
// Inputs: the stride lives in a register, the two offsets are read from memory.
4197 : [stride] "r"(params.stride),
4198 [multiplicative_sum_offset] "m"(params.multiplicative_sum_offset),
4199 [additive_sum_offset] "m"(params.additive_sum_offset)
// Clobbers: scratch row pointers, every vector register written above, the
// condition flags and memory.
4200 : "x0", "x1", "x2", "x3", "x4", "x5", "v0", "v1", "v2", "v3", "v4", "v5",
4201 "v6", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "cc", "memory");
4202 }
4203
// Pack() specialization for seven input rows whose column count leaves a
// 5-column tail after the whole 8-column chunks.
//
// in:     start of row 0; each of rows 1..6 starts params.stride bytes after
//         the previous row.
// params: count is the number of columns to pack. The control flow below only
//         terminates as intended when (count - 5) is zero or a positive
//         multiple of 8 -- presumably guaranteed by the dispatching code.
//         multiplicative_sum_offset and additive_sum_offset scale and bias
//         the per-row sums.
// out:    receives 56 bytes per chunk (8 bytes of row 0, then row 1, ...,
//         row 6); the 5-column tail is stored as one more chunk, zero-padded
//         to 8 columns. Eight 32-bit values follow: the sum of each row's
//         bytes times multiplicative_sum_offset plus additive_sum_offset
//         (the eighth value repeats the row 6 result).
4204 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)4205 inline void Stream<uint8_t, 7, 8, 5, RowMajorWithSum>::Pack(
4206 const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
// Optional trace output, compiled in only when both macros are defined.
4207 #ifdef DEBUG
4208 #ifdef DEBUG_METAGEMM_VERBOSE
4209 std::cout << __FILE__ << "(" << __LINE__
4210 << ") RowMajorWithSum<uint8_t, 7, 8, 5, RowMajorWithSum>::Pack()"
4211 << std::endl
4212 << std::flush;
4213 #endif
4214 #endif
// The asm decrements the counter in place, so it works on a local copy.
// NOTE(review): this is a 32-bit int accessed through the 64-bit %x operand
// modifier -- confirm the upper register half is always clean.
4215 int params_count_copy = params.count;
4216 asm volatile(
// x0..x5 = start of rows 1..6; row 0 is read through %x[in] itself.
4217 "add x0, %x[in], %x[stride]\n"
4218 "add x1, x0, %x[stride]\n"
4219 "add x2, x1, %x[stride]\n"
4220 "add x3, x2, %x[stride]\n"
4221 "add x4, x3, %x[stride]\n"
4222 "add x5, x4, %x[stride]\n"
// v8..v14: eight 16-bit running column sums for each of the seven rows.
// NOTE(review): a 16-bit lane wraps after more than 257 additions of 0xFF;
// confirm that count is bounded accordingly upstream.
4223 "movi v8.8h, #0\n"
4224 "movi v9.8h, #0\n"
4225 "movi v10.8h, #0\n"
4226 "movi v11.8h, #0\n"
4227 "movi v12.8h, #0\n"
4228 "movi v13.8h, #0\n"
4229 "movi v14.8h, #0\n"
4230
4231 // Reduce count by leftovers.
4232 "subs %x[count], %x[count], #5\n"
// Nothing but the tail to process: skip the main loop.
4233 "beq 2f\n"
4234
// Main loop, one 8-column chunk per pass. The flags produced by this subs
// are consumed by the bne at the bottom; nothing in between alters them.
4235 "1:"
4236 "subs %x[count], %x[count], #8\n"
4237
4238 // Load Aggregate Store: 7x8.
4239 "ld1 {v0.2s}, [%x[in]], #8\n"
4240 "ld1 {v1.2s}, [x0], #8\n"
4241 "ld1 {v2.2s}, [x1], #8\n"
4242 "ld1 {v3.2s}, [x2], #8\n"
4243 "ld1 {v4.2s}, [x3], #8\n"
4244 "ld1 {v5.2s}, [x4], #8\n"
4245 "ld1 {v6.2s}, [x5], #8\n"
// Widen every byte to 16 bits and add it to its row's running sums.
4246 "uaddw v8.8h, v8.8h, v0.8b\n"
4247 "uaddw v9.8h, v9.8h, v1.8b\n"
4248 "uaddw v10.8h, v10.8h, v2.8b\n"
4249 "uaddw v11.8h, v11.8h, v3.8b\n"
4250 "uaddw v12.8h, v12.8h, v4.8b\n"
4251 "uaddw v13.8h, v13.8h, v5.8b\n"
4252 "uaddw v14.8h, v14.8h, v6.8b\n"
// Emit the chunk: 7 rows x 8 bytes = 56 contiguous bytes.
4253 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
4254 "st1 {v4.2s, v5.2s, v6.2s}, [%x[out]], #24\n"
4255
4256 "bne 1b\n"
4257
4258 "2:"
4259
4260 // Load Aggregate Store: 7x5.
// Zero the staging registers so that the lanes not loaded below are stored
// as padding and contribute nothing to the sums.
4261 "movi v0.8b, #0\n"
4262 "movi v1.8b, #0\n"
4263 "movi v2.8b, #0\n"
4264 "movi v3.8b, #0\n"
4265 "movi v4.8b, #0\n"
4266 "movi v5.8b, #0\n"
4267 "movi v6.8b, #0\n"
// Fetch 5 bytes per row: a 32-bit word into bytes 0-3, then a byte into
// lane 4.
4268 "ld1 {v0.s}[0], [%x[in]], #4\n"
4269 "ld1 {v0.b}[4], [%x[in]], #1\n"
4270 "ld1 {v1.s}[0], [x0], #4\n"
4271 "ld1 {v1.b}[4], [x0], #1\n"
4272 "ld1 {v2.s}[0], [x1], #4\n"
4273 "ld1 {v2.b}[4], [x1], #1\n"
4274 "ld1 {v3.s}[0], [x2], #4\n"
4275 "ld1 {v3.b}[4], [x2], #1\n"
4276 "ld1 {v4.s}[0], [x3], #4\n"
4277 "ld1 {v4.b}[4], [x3], #1\n"
4278 "ld1 {v5.s}[0], [x4], #4\n"
4279 "ld1 {v5.b}[4], [x4], #1\n"
4280 "ld1 {v6.s}[0], [x5], #4\n"
4281 "ld1 {v6.b}[4], [x5], #1\n"
4282 "uaddw v8.8h, v8.8h, v0.8b\n"
4283 "uaddw v9.8h, v9.8h, v1.8b\n"
4284 "uaddw v10.8h, v10.8h, v2.8b\n"
4285 "uaddw v11.8h, v11.8h, v3.8b\n"
4286 "uaddw v12.8h, v12.8h, v4.8b\n"
4287 "uaddw v13.8h, v13.8h, v5.8b\n"
4288 "uaddw v14.8h, v14.8h, v6.8b\n"
// The tail is written as a full 56-byte chunk, padding included.
4289 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
4290 "st1 {v4.2s, v5.2s, v6.2s}, [%x[out]], #24\n"
4291
4292 // Aggregator Reduction.
// x0/x1 are no longer used as row pointers; w0/w1 now carry the two scalars.
// v0.s[0] = multiplier, v1 = addend broadcast to all four lanes.
4293 "ldr w0, %[multiplicative_sum_offset]\n"
4294 "ldr w1, %[additive_sum_offset]\n"
4295 "mov v0.s[0], w0\n"
4296 "dup v1.4s, w1\n"
// Fold each row's eight 16-bit sums into four 32-bit sums.
4297 "uaddlp v8.4s, v8.8h\n"
4298 "uaddlp v9.4s, v9.8h\n"
4299 "uaddlp v10.4s, v10.8h\n"
4300 "uaddlp v11.4s, v11.8h\n"
4301 "uaddlp v12.4s, v12.8h\n"
4302 "uaddlp v13.4s, v13.8h\n"
4303 "uaddlp v14.4s, v14.8h\n"
// Two rounds of pairwise adds leave v8 = {row0, row1, row2, row3} and
// v9 = {row4, row5, row6, row6}; v14 is paired with itself, hence the repeat.
4304 "addp v8.4s, v8.4s, v9.4s\n"
4305 "addp v10.4s, v10.4s, v11.4s\n"
4306 "addp v12.4s, v12.4s, v13.4s\n"
4307 "addp v14.4s, v14.4s, v14.4s\n"
4308 "addp v8.4s, v8.4s, v10.4s\n"
4309 "addp v9.4s, v12.4s, v14.4s\n"
// result = row_sum * multiplicative_sum_offset + additive_sum_offset.
4310 "mul v8.4s, v8.4s, v0.s[0]\n"
4311 "mul v9.4s, v9.4s, v0.s[0]\n"
4312 "add v8.4s, v8.4s, v1.4s\n"
4313 "add v9.4s, v9.4s, v1.4s\n"
// Append the eight 32-bit results directly after the packed bytes.
4314 "st1 {v8.4s, v9.4s}, [%x[out]]\n"
// Read-write operands: the counter and both pointers are modified by the asm.
4315 : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
// Inputs: the stride lives in a register, the two offsets are read from memory.
4316 : [stride] "r"(params.stride),
4317 [multiplicative_sum_offset] "m"(params.multiplicative_sum_offset),
4318 [additive_sum_offset] "m"(params.additive_sum_offset)
// Clobbers: scratch row pointers, every vector register written above, the
// condition flags and memory.
4319 : "x0", "x1", "x2", "x3", "x4", "x5", "v0", "v1", "v2", "v3", "v4", "v5",
4320 "v6", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "cc", "memory");
4321 }
4322
// Pack() specialization for seven input rows whose column count leaves a
// 6-column tail after the whole 8-column chunks.
//
// in:     start of row 0; each of rows 1..6 starts params.stride bytes after
//         the previous row.
// params: count is the number of columns to pack. The control flow below only
//         terminates as intended when (count - 6) is zero or a positive
//         multiple of 8 -- presumably guaranteed by the dispatching code.
//         multiplicative_sum_offset and additive_sum_offset scale and bias
//         the per-row sums.
// out:    receives 56 bytes per chunk (8 bytes of row 0, then row 1, ...,
//         row 6); the 6-column tail is stored as one more chunk, zero-padded
//         to 8 columns. Eight 32-bit values follow: the sum of each row's
//         bytes times multiplicative_sum_offset plus additive_sum_offset
//         (the eighth value repeats the row 6 result).
4323 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)4324 inline void Stream<uint8_t, 7, 8, 6, RowMajorWithSum>::Pack(
4325 const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
// Optional trace output, compiled in only when both macros are defined.
4326 #ifdef DEBUG
4327 #ifdef DEBUG_METAGEMM_VERBOSE
4328 std::cout << __FILE__ << "(" << __LINE__
4329 << ") RowMajorWithSum<uint8_t, 7, 8, 6, RowMajorWithSum>::Pack()"
4330 << std::endl
4331 << std::flush;
4332 #endif
4333 #endif
// The asm decrements the counter in place, so it works on a local copy.
// NOTE(review): this is a 32-bit int accessed through the 64-bit %x operand
// modifier -- confirm the upper register half is always clean.
4334 int params_count_copy = params.count;
4335 asm volatile(
// x0..x5 = start of rows 1..6; row 0 is read through %x[in] itself.
4336 "add x0, %x[in], %x[stride]\n"
4337 "add x1, x0, %x[stride]\n"
4338 "add x2, x1, %x[stride]\n"
4339 "add x3, x2, %x[stride]\n"
4340 "add x4, x3, %x[stride]\n"
4341 "add x5, x4, %x[stride]\n"
// v8..v14: eight 16-bit running column sums for each of the seven rows.
// NOTE(review): a 16-bit lane wraps after more than 257 additions of 0xFF;
// confirm that count is bounded accordingly upstream.
4342 "movi v8.8h, #0\n"
4343 "movi v9.8h, #0\n"
4344 "movi v10.8h, #0\n"
4345 "movi v11.8h, #0\n"
4346 "movi v12.8h, #0\n"
4347 "movi v13.8h, #0\n"
4348 "movi v14.8h, #0\n"
4349
4350 // Reduce count by leftovers.
4351 "subs %x[count], %x[count], #6\n"
// Nothing but the tail to process: skip the main loop.
4352 "beq 2f\n"
4353
// Main loop, one 8-column chunk per pass. The flags produced by this subs
// are consumed by the bne at the bottom; nothing in between alters them.
4354 "1:"
4355 "subs %x[count], %x[count], #8\n"
4356
4357 // Load Aggregate Store: 7x8.
4358 "ld1 {v0.2s}, [%x[in]], #8\n"
4359 "ld1 {v1.2s}, [x0], #8\n"
4360 "ld1 {v2.2s}, [x1], #8\n"
4361 "ld1 {v3.2s}, [x2], #8\n"
4362 "ld1 {v4.2s}, [x3], #8\n"
4363 "ld1 {v5.2s}, [x4], #8\n"
4364 "ld1 {v6.2s}, [x5], #8\n"
// Widen every byte to 16 bits and add it to its row's running sums.
4365 "uaddw v8.8h, v8.8h, v0.8b\n"
4366 "uaddw v9.8h, v9.8h, v1.8b\n"
4367 "uaddw v10.8h, v10.8h, v2.8b\n"
4368 "uaddw v11.8h, v11.8h, v3.8b\n"
4369 "uaddw v12.8h, v12.8h, v4.8b\n"
4370 "uaddw v13.8h, v13.8h, v5.8b\n"
4371 "uaddw v14.8h, v14.8h, v6.8b\n"
// Emit the chunk: 7 rows x 8 bytes = 56 contiguous bytes.
4372 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
4373 "st1 {v4.2s, v5.2s, v6.2s}, [%x[out]], #24\n"
4374
4375 "bne 1b\n"
4376
4377 "2:"
4378
4379 // Load Aggregate Store: 7x6.
// Zero the staging registers so that the lanes not loaded below are stored
// as padding and contribute nothing to the sums.
4380 "movi v0.8b, #0\n"
4381 "movi v1.8b, #0\n"
4382 "movi v2.8b, #0\n"
4383 "movi v3.8b, #0\n"
4384 "movi v4.8b, #0\n"
4385 "movi v5.8b, #0\n"
4386 "movi v6.8b, #0\n"
// Fetch 6 bytes per row: a 32-bit word into bytes 0-3, then a halfword into
// bytes 4-5.
4387 "ld1 {v0.s}[0], [%x[in]], #4\n"
4388 "ld1 {v0.h}[2], [%x[in]], #2\n"
4389 "ld1 {v1.s}[0], [x0], #4\n"
4390 "ld1 {v1.h}[2], [x0], #2\n"
4391 "ld1 {v2.s}[0], [x1], #4\n"
4392 "ld1 {v2.h}[2], [x1], #2\n"
4393 "ld1 {v3.s}[0], [x2], #4\n"
4394 "ld1 {v3.h}[2], [x2], #2\n"
4395 "ld1 {v4.s}[0], [x3], #4\n"
4396 "ld1 {v4.h}[2], [x3], #2\n"
4397 "ld1 {v5.s}[0], [x4], #4\n"
4398 "ld1 {v5.h}[2], [x4], #2\n"
4399 "ld1 {v6.s}[0], [x5], #4\n"
4400 "ld1 {v6.h}[2], [x5], #2\n"
4401 "uaddw v8.8h, v8.8h, v0.8b\n"
4402 "uaddw v9.8h, v9.8h, v1.8b\n"
4403 "uaddw v10.8h, v10.8h, v2.8b\n"
4404 "uaddw v11.8h, v11.8h, v3.8b\n"
4405 "uaddw v12.8h, v12.8h, v4.8b\n"
4406 "uaddw v13.8h, v13.8h, v5.8b\n"
4407 "uaddw v14.8h, v14.8h, v6.8b\n"
// The tail is written as a full 56-byte chunk, padding included.
4408 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
4409 "st1 {v4.2s, v5.2s, v6.2s}, [%x[out]], #24\n"
4410
4411 // Aggregator Reduction.
// x0/x1 are no longer used as row pointers; w0/w1 now carry the two scalars.
// v0.s[0] = multiplier, v1 = addend broadcast to all four lanes.
4412 "ldr w0, %[multiplicative_sum_offset]\n"
4413 "ldr w1, %[additive_sum_offset]\n"
4414 "mov v0.s[0], w0\n"
4415 "dup v1.4s, w1\n"
// Fold each row's eight 16-bit sums into four 32-bit sums.
4416 "uaddlp v8.4s, v8.8h\n"
4417 "uaddlp v9.4s, v9.8h\n"
4418 "uaddlp v10.4s, v10.8h\n"
4419 "uaddlp v11.4s, v11.8h\n"
4420 "uaddlp v12.4s, v12.8h\n"
4421 "uaddlp v13.4s, v13.8h\n"
4422 "uaddlp v14.4s, v14.8h\n"
// Two rounds of pairwise adds leave v8 = {row0, row1, row2, row3} and
// v9 = {row4, row5, row6, row6}; v14 is paired with itself, hence the repeat.
4423 "addp v8.4s, v8.4s, v9.4s\n"
4424 "addp v10.4s, v10.4s, v11.4s\n"
4425 "addp v12.4s, v12.4s, v13.4s\n"
4426 "addp v14.4s, v14.4s, v14.4s\n"
4427 "addp v8.4s, v8.4s, v10.4s\n"
4428 "addp v9.4s, v12.4s, v14.4s\n"
// result = row_sum * multiplicative_sum_offset + additive_sum_offset.
4429 "mul v8.4s, v8.4s, v0.s[0]\n"
4430 "mul v9.4s, v9.4s, v0.s[0]\n"
4431 "add v8.4s, v8.4s, v1.4s\n"
4432 "add v9.4s, v9.4s, v1.4s\n"
// Append the eight 32-bit results directly after the packed bytes.
4433 "st1 {v8.4s, v9.4s}, [%x[out]]\n"
// Read-write operands: the counter and both pointers are modified by the asm.
4434 : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
// Inputs: the stride lives in a register, the two offsets are read from memory.
4435 : [stride] "r"(params.stride),
4436 [multiplicative_sum_offset] "m"(params.multiplicative_sum_offset),
4437 [additive_sum_offset] "m"(params.additive_sum_offset)
// Clobbers: scratch row pointers, every vector register written above, the
// condition flags and memory.
4438 : "x0", "x1", "x2", "x3", "x4", "x5", "v0", "v1", "v2", "v3", "v4", "v5",
4439 "v6", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "cc", "memory");
4440 }
4441
// Pack() specialization for seven input rows whose column count leaves a
// 7-column tail after the whole 8-column chunks.
//
// in:     start of row 0; each of rows 1..6 starts params.stride bytes after
//         the previous row.
// params: count is the number of columns to pack. The control flow below only
//         terminates as intended when (count - 7) is zero or a positive
//         multiple of 8 -- presumably guaranteed by the dispatching code.
//         multiplicative_sum_offset and additive_sum_offset scale and bias
//         the per-row sums.
// out:    receives 56 bytes per chunk (8 bytes of row 0, then row 1, ...,
//         row 6); the 7-column tail is stored as one more chunk, zero-padded
//         to 8 columns. Eight 32-bit values follow: the sum of each row's
//         bytes times multiplicative_sum_offset plus additive_sum_offset
//         (the eighth value repeats the row 6 result).
4442 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)4443 inline void Stream<uint8_t, 7, 8, 7, RowMajorWithSum>::Pack(
4444 const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
// Optional trace output, compiled in only when both macros are defined.
4445 #ifdef DEBUG
4446 #ifdef DEBUG_METAGEMM_VERBOSE
4447 std::cout << __FILE__ << "(" << __LINE__
4448 << ") RowMajorWithSum<uint8_t, 7, 8, 7, RowMajorWithSum>::Pack()"
4449 << std::endl
4450 << std::flush;
4451 #endif
4452 #endif
// The asm decrements the counter in place, so it works on a local copy.
// NOTE(review): this is a 32-bit int accessed through the 64-bit %x operand
// modifier -- confirm the upper register half is always clean.
4453 int params_count_copy = params.count;
4454 asm volatile(
// x0..x5 = start of rows 1..6; row 0 is read through %x[in] itself.
4455 "add x0, %x[in], %x[stride]\n"
4456 "add x1, x0, %x[stride]\n"
4457 "add x2, x1, %x[stride]\n"
4458 "add x3, x2, %x[stride]\n"
4459 "add x4, x3, %x[stride]\n"
4460 "add x5, x4, %x[stride]\n"
// v8..v14: eight 16-bit running column sums for each of the seven rows.
// NOTE(review): a 16-bit lane wraps after more than 257 additions of 0xFF;
// confirm that count is bounded accordingly upstream.
4461 "movi v8.8h, #0\n"
4462 "movi v9.8h, #0\n"
4463 "movi v10.8h, #0\n"
4464 "movi v11.8h, #0\n"
4465 "movi v12.8h, #0\n"
4466 "movi v13.8h, #0\n"
4467 "movi v14.8h, #0\n"
4468
4469 // Reduce count by leftovers.
4470 "subs %x[count], %x[count], #7\n"
// Nothing but the tail to process: skip the main loop.
4471 "beq 2f\n"
4472
// Main loop, one 8-column chunk per pass. The flags produced by this subs
// are consumed by the bne at the bottom; nothing in between alters them.
4473 "1:"
4474 "subs %x[count], %x[count], #8\n"
4475
4476 // Load Aggregate Store: 7x8.
4477 "ld1 {v0.2s}, [%x[in]], #8\n"
4478 "ld1 {v1.2s}, [x0], #8\n"
4479 "ld1 {v2.2s}, [x1], #8\n"
4480 "ld1 {v3.2s}, [x2], #8\n"
4481 "ld1 {v4.2s}, [x3], #8\n"
4482 "ld1 {v5.2s}, [x4], #8\n"
4483 "ld1 {v6.2s}, [x5], #8\n"
// Widen every byte to 16 bits and add it to its row's running sums.
4484 "uaddw v8.8h, v8.8h, v0.8b\n"
4485 "uaddw v9.8h, v9.8h, v1.8b\n"
4486 "uaddw v10.8h, v10.8h, v2.8b\n"
4487 "uaddw v11.8h, v11.8h, v3.8b\n"
4488 "uaddw v12.8h, v12.8h, v4.8b\n"
4489 "uaddw v13.8h, v13.8h, v5.8b\n"
4490 "uaddw v14.8h, v14.8h, v6.8b\n"
// Emit the chunk: 7 rows x 8 bytes = 56 contiguous bytes.
4491 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
4492 "st1 {v4.2s, v5.2s, v6.2s}, [%x[out]], #24\n"
4493
4494 "bne 1b\n"
4495
4496 "2:"
4497
4498 // Load Aggregate Store: 7x7.
// Zero the staging registers so that the lane not loaded below is stored as
// padding and contributes nothing to the sums.
4499 "movi v0.8b, #0\n"
4500 "movi v1.8b, #0\n"
4501 "movi v2.8b, #0\n"
4502 "movi v3.8b, #0\n"
4503 "movi v4.8b, #0\n"
4504 "movi v5.8b, #0\n"
4505 "movi v6.8b, #0\n"
// Fetch 7 bytes per row: a 32-bit word into bytes 0-3, a halfword into
// bytes 4-5, then a byte into lane 6.
4506 "ld1 {v0.s}[0], [%x[in]], #4\n"
4507 "ld1 {v0.h}[2], [%x[in]], #2\n"
4508 "ld1 {v0.b}[6], [%x[in]], #1\n"
4509 "ld1 {v1.s}[0], [x0], #4\n"
4510 "ld1 {v1.h}[2], [x0], #2\n"
4511 "ld1 {v1.b}[6], [x0], #1\n"
4512 "ld1 {v2.s}[0], [x1], #4\n"
4513 "ld1 {v2.h}[2], [x1], #2\n"
4514 "ld1 {v2.b}[6], [x1], #1\n"
4515 "ld1 {v3.s}[0], [x2], #4\n"
4516 "ld1 {v3.h}[2], [x2], #2\n"
4517 "ld1 {v3.b}[6], [x2], #1\n"
4518 "ld1 {v4.s}[0], [x3], #4\n"
4519 "ld1 {v4.h}[2], [x3], #2\n"
4520 "ld1 {v4.b}[6], [x3], #1\n"
4521 "ld1 {v5.s}[0], [x4], #4\n"
4522 "ld1 {v5.h}[2], [x4], #2\n"
4523 "ld1 {v5.b}[6], [x4], #1\n"
4524 "ld1 {v6.s}[0], [x5], #4\n"
4525 "ld1 {v6.h}[2], [x5], #2\n"
4526 "ld1 {v6.b}[6], [x5], #1\n"
4527 "uaddw v8.8h, v8.8h, v0.8b\n"
4528 "uaddw v9.8h, v9.8h, v1.8b\n"
4529 "uaddw v10.8h, v10.8h, v2.8b\n"
4530 "uaddw v11.8h, v11.8h, v3.8b\n"
4531 "uaddw v12.8h, v12.8h, v4.8b\n"
4532 "uaddw v13.8h, v13.8h, v5.8b\n"
4533 "uaddw v14.8h, v14.8h, v6.8b\n"
// The tail is written as a full 56-byte chunk, padding included.
4534 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
4535 "st1 {v4.2s, v5.2s, v6.2s}, [%x[out]], #24\n"
4536
4537 // Aggregator Reduction.
// x0/x1 are no longer used as row pointers; w0/w1 now carry the two scalars.
// v0.s[0] = multiplier, v1 = addend broadcast to all four lanes.
4538 "ldr w0, %[multiplicative_sum_offset]\n"
4539 "ldr w1, %[additive_sum_offset]\n"
4540 "mov v0.s[0], w0\n"
4541 "dup v1.4s, w1\n"
// Fold each row's eight 16-bit sums into four 32-bit sums.
4542 "uaddlp v8.4s, v8.8h\n"
4543 "uaddlp v9.4s, v9.8h\n"
4544 "uaddlp v10.4s, v10.8h\n"
4545 "uaddlp v11.4s, v11.8h\n"
4546 "uaddlp v12.4s, v12.8h\n"
4547 "uaddlp v13.4s, v13.8h\n"
4548 "uaddlp v14.4s, v14.8h\n"
// Two rounds of pairwise adds leave v8 = {row0, row1, row2, row3} and
// v9 = {row4, row5, row6, row6}; v14 is paired with itself, hence the repeat.
4549 "addp v8.4s, v8.4s, v9.4s\n"
4550 "addp v10.4s, v10.4s, v11.4s\n"
4551 "addp v12.4s, v12.4s, v13.4s\n"
4552 "addp v14.4s, v14.4s, v14.4s\n"
4553 "addp v8.4s, v8.4s, v10.4s\n"
4554 "addp v9.4s, v12.4s, v14.4s\n"
// result = row_sum * multiplicative_sum_offset + additive_sum_offset.
4555 "mul v8.4s, v8.4s, v0.s[0]\n"
4556 "mul v9.4s, v9.4s, v0.s[0]\n"
4557 "add v8.4s, v8.4s, v1.4s\n"
4558 "add v9.4s, v9.4s, v1.4s\n"
// Append the eight 32-bit results directly after the packed bytes.
4559 "st1 {v8.4s, v9.4s}, [%x[out]]\n"
// Read-write operands: the counter and both pointers are modified by the asm.
4560 : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
// Inputs: the stride lives in a register, the two offsets are read from memory.
4561 : [stride] "r"(params.stride),
4562 [multiplicative_sum_offset] "m"(params.multiplicative_sum_offset),
4563 [additive_sum_offset] "m"(params.additive_sum_offset)
// Clobbers: scratch row pointers, every vector register written above, the
// condition flags and memory.
4564 : "x0", "x1", "x2", "x3", "x4", "x5", "v0", "v1", "v2", "v3", "v4", "v5",
4565 "v6", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "cc", "memory");
4566 }
4567
// Pack() specialization for eight input rows whose column count is made of
// whole 8-column chunks only (no tail handling).
//
// in:     start of row 0; each of rows 1..7 starts params.stride bytes after
//         the previous row.
// params: count is the number of columns to pack. The loop body runs before
//         the first exit test and only exits when the counter reaches exactly
//         zero, so count is expected to be a positive multiple of 8 --
//         presumably guaranteed by the dispatching code.
//         multiplicative_sum_offset and additive_sum_offset scale and bias
//         the per-row sums.
// out:    receives 64 bytes per chunk (8 bytes of row 0, then row 1, ...,
//         row 7), followed by eight 32-bit values, one per row: the sum of the
//         row's bytes times multiplicative_sum_offset plus additive_sum_offset.
4568 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)4569 inline void Stream<uint8_t, 8, 8, 0, RowMajorWithSum>::Pack(
4570 const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
// Optional trace output, compiled in only when both macros are defined.
4571 #ifdef DEBUG
4572 #ifdef DEBUG_METAGEMM_VERBOSE
4573 std::cout << __FILE__ << "(" << __LINE__
4574 << ") RowMajorWithSum<uint8_t, 8, 8, 0, RowMajorWithSum>::Pack()"
4575 << std::endl
4576 << std::flush;
4577 #endif
4578 #endif
// The asm decrements the counter in place, so it works on a local copy.
// NOTE(review): this is a 32-bit int accessed through the 64-bit %x operand
// modifier -- confirm the upper register half is always clean.
4579 int params_count_copy = params.count;
4580 asm volatile(
// x0..x6 = start of rows 1..7; row 0 is read through %x[in] itself.
4581 "add x0, %x[in], %x[stride]\n"
4582 "add x1, x0, %x[stride]\n"
4583 "add x2, x1, %x[stride]\n"
4584 "add x3, x2, %x[stride]\n"
4585 "add x4, x3, %x[stride]\n"
4586 "add x5, x4, %x[stride]\n"
4587 "add x6, x5, %x[stride]\n"
// v8..v15: eight 16-bit running column sums for each of the eight rows.
// NOTE(review): a 16-bit lane wraps after more than 257 additions of 0xFF;
// confirm that count is bounded accordingly upstream.
4588 "movi v8.8h, #0\n"
4589 "movi v9.8h, #0\n"
4590 "movi v10.8h, #0\n"
4591 "movi v11.8h, #0\n"
4592 "movi v12.8h, #0\n"
4593 "movi v13.8h, #0\n"
4594 "movi v14.8h, #0\n"
4595 "movi v15.8h, #0\n"
4596
// Main loop, one 8-column chunk per pass. The flags produced by this subs
// are consumed by the bne at the bottom; nothing in between alters them.
4597 "1:"
4598 "subs %x[count], %x[count], #8\n"
4599
4600 // Load Aggregate Store: 8x8.
4601 "ld1 {v0.2s}, [%x[in]], #8\n"
4602 "ld1 {v1.2s}, [x0], #8\n"
4603 "ld1 {v2.2s}, [x1], #8\n"
4604 "ld1 {v3.2s}, [x2], #8\n"
4605 "ld1 {v4.2s}, [x3], #8\n"
4606 "ld1 {v5.2s}, [x4], #8\n"
4607 "ld1 {v6.2s}, [x5], #8\n"
4608 "ld1 {v7.2s}, [x6], #8\n"
// Widen every byte to 16 bits and add it to its row's running sums.
4609 "uaddw v8.8h, v8.8h, v0.8b\n"
4610 "uaddw v9.8h, v9.8h, v1.8b\n"
4611 "uaddw v10.8h, v10.8h, v2.8b\n"
4612 "uaddw v11.8h, v11.8h, v3.8b\n"
4613 "uaddw v12.8h, v12.8h, v4.8b\n"
4614 "uaddw v13.8h, v13.8h, v5.8b\n"
4615 "uaddw v14.8h, v14.8h, v6.8b\n"
4616 "uaddw v15.8h, v15.8h, v7.8b\n"
// Emit the chunk: 8 rows x 8 bytes = 64 contiguous bytes.
4617 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
4618 "st1 {v4.2s, v5.2s, v6.2s, v7.2s}, [%x[out]], #32\n"
4619
4620 "bne 1b\n"
4621
4622 // Aggregator Reduction.
// x0/x1 are no longer used as row pointers; w0/w1 now carry the two scalars.
// v0.s[0] = multiplier, v1 = addend broadcast to all four lanes.
4623 "ldr w0, %[multiplicative_sum_offset]\n"
4624 "ldr w1, %[additive_sum_offset]\n"
4625 "mov v0.s[0], w0\n"
4626 "dup v1.4s, w1\n"
// Fold each row's eight 16-bit sums into four 32-bit sums.
4627 "uaddlp v8.4s, v8.8h\n"
4628 "uaddlp v9.4s, v9.8h\n"
4629 "uaddlp v10.4s, v10.8h\n"
4630 "uaddlp v11.4s, v11.8h\n"
4631 "uaddlp v12.4s, v12.8h\n"
4632 "uaddlp v13.4s, v13.8h\n"
4633 "uaddlp v14.4s, v14.8h\n"
4634 "uaddlp v15.4s, v15.8h\n"
// Two rounds of pairwise adds leave v8 = {row0, row1, row2, row3} and
// v9 = {row4, row5, row6, row7}.
4635 "addp v8.4s, v8.4s, v9.4s\n"
4636 "addp v10.4s, v10.4s, v11.4s\n"
4637 "addp v12.4s, v12.4s, v13.4s\n"
4638 "addp v14.4s, v14.4s, v15.4s\n"
4639 "addp v8.4s, v8.4s, v10.4s\n"
4640 "addp v9.4s, v12.4s, v14.4s\n"
// result = row_sum * multiplicative_sum_offset + additive_sum_offset.
4641 "mul v8.4s, v8.4s, v0.s[0]\n"
4642 "mul v9.4s, v9.4s, v0.s[0]\n"
4643 "add v8.4s, v8.4s, v1.4s\n"
4644 "add v9.4s, v9.4s, v1.4s\n"
// Append the eight 32-bit results directly after the packed bytes.
4645 "st1 {v8.4s, v9.4s}, [%x[out]]\n"
// Read-write operands: the counter and both pointers are modified by the asm.
4646 : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
// Inputs: the stride lives in a register, the two offsets are read from memory.
4647 : [stride] "r"(params.stride),
4648 [multiplicative_sum_offset] "m"(params.multiplicative_sum_offset),
4649 [additive_sum_offset] "m"(params.additive_sum_offset)
// Clobbers: scratch row pointers, every vector register written above, the
// condition flags and memory.
4650 : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "v0", "v1", "v2", "v3", "v4",
4651 "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15",
4652 "cc", "memory");
4653 }
4654
// Pack() specialization for eight input rows whose column count leaves a
// 1-column tail after the whole 8-column chunks.
//
// in:     start of row 0; each of rows 1..7 starts params.stride bytes after
//         the previous row.
// params: count is the number of columns to pack. The control flow below only
//         terminates as intended when (count - 1) is zero or a positive
//         multiple of 8 -- presumably guaranteed by the dispatching code.
//         multiplicative_sum_offset and additive_sum_offset scale and bias
//         the per-row sums.
// out:    receives 64 bytes per chunk (8 bytes of row 0, then row 1, ...,
//         row 7); the 1-column tail is stored as one more chunk, zero-padded
//         to 8 columns. Eight 32-bit values follow, one per row: the sum of
//         the row's bytes times multiplicative_sum_offset plus
//         additive_sum_offset.
4655 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)4656 inline void Stream<uint8_t, 8, 8, 1, RowMajorWithSum>::Pack(
4657 const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
// Optional trace output, compiled in only when both macros are defined.
4658 #ifdef DEBUG
4659 #ifdef DEBUG_METAGEMM_VERBOSE
4660 std::cout << __FILE__ << "(" << __LINE__
4661 << ") RowMajorWithSum<uint8_t, 8, 8, 1, RowMajorWithSum>::Pack()"
4662 << std::endl
4663 << std::flush;
4664 #endif
4665 #endif
// The asm decrements the counter in place, so it works on a local copy.
// NOTE(review): this is a 32-bit int accessed through the 64-bit %x operand
// modifier -- confirm the upper register half is always clean.
4666 int params_count_copy = params.count;
4667 asm volatile(
// x0..x6 = start of rows 1..7; row 0 is read through %x[in] itself.
4668 "add x0, %x[in], %x[stride]\n"
4669 "add x1, x0, %x[stride]\n"
4670 "add x2, x1, %x[stride]\n"
4671 "add x3, x2, %x[stride]\n"
4672 "add x4, x3, %x[stride]\n"
4673 "add x5, x4, %x[stride]\n"
4674 "add x6, x5, %x[stride]\n"
// v8..v15: eight 16-bit running column sums for each of the eight rows.
// NOTE(review): a 16-bit lane wraps after more than 257 additions of 0xFF;
// confirm that count is bounded accordingly upstream.
4675 "movi v8.8h, #0\n"
4676 "movi v9.8h, #0\n"
4677 "movi v10.8h, #0\n"
4678 "movi v11.8h, #0\n"
4679 "movi v12.8h, #0\n"
4680 "movi v13.8h, #0\n"
4681 "movi v14.8h, #0\n"
4682 "movi v15.8h, #0\n"
4683
4684 // Reduce count by leftovers.
4685 "subs %x[count], %x[count], #1\n"
// Nothing but the tail to process: skip the main loop.
4686 "beq 2f\n"
4687
// Main loop, one 8-column chunk per pass. The flags produced by this subs
// are consumed by the bne at the bottom; nothing in between alters them.
4688 "1:"
4689 "subs %x[count], %x[count], #8\n"
4690
4691 // Load Aggregate Store: 8x8.
4692 "ld1 {v0.2s}, [%x[in]], #8\n"
4693 "ld1 {v1.2s}, [x0], #8\n"
4694 "ld1 {v2.2s}, [x1], #8\n"
4695 "ld1 {v3.2s}, [x2], #8\n"
4696 "ld1 {v4.2s}, [x3], #8\n"
4697 "ld1 {v5.2s}, [x4], #8\n"
4698 "ld1 {v6.2s}, [x5], #8\n"
4699 "ld1 {v7.2s}, [x6], #8\n"
// Widen every byte to 16 bits and add it to its row's running sums.
4700 "uaddw v8.8h, v8.8h, v0.8b\n"
4701 "uaddw v9.8h, v9.8h, v1.8b\n"
4702 "uaddw v10.8h, v10.8h, v2.8b\n"
4703 "uaddw v11.8h, v11.8h, v3.8b\n"
4704 "uaddw v12.8h, v12.8h, v4.8b\n"
4705 "uaddw v13.8h, v13.8h, v5.8b\n"
4706 "uaddw v14.8h, v14.8h, v6.8b\n"
4707 "uaddw v15.8h, v15.8h, v7.8b\n"
// Emit the chunk: 8 rows x 8 bytes = 64 contiguous bytes.
4708 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
4709 "st1 {v4.2s, v5.2s, v6.2s, v7.2s}, [%x[out]], #32\n"
4710
4711 "bne 1b\n"
4712
4713 "2:"
4714
4715 // Load Aggregate Store: 8x1.
// Zero the staging registers so that the lanes not loaded below are stored
// as padding and contribute nothing to the sums.
4716 "movi v0.8b, #0\n"
4717 "movi v1.8b, #0\n"
4718 "movi v2.8b, #0\n"
4719 "movi v3.8b, #0\n"
4720 "movi v4.8b, #0\n"
4721 "movi v5.8b, #0\n"
4722 "movi v6.8b, #0\n"
4723 "movi v7.8b, #0\n"
// Fetch 1 byte per row into lane 0.
4724 "ld1 {v0.b}[0], [%x[in]], #1\n"
4725 "ld1 {v1.b}[0], [x0], #1\n"
4726 "ld1 {v2.b}[0], [x1], #1\n"
4727 "ld1 {v3.b}[0], [x2], #1\n"
4728 "ld1 {v4.b}[0], [x3], #1\n"
4729 "ld1 {v5.b}[0], [x4], #1\n"
4730 "ld1 {v6.b}[0], [x5], #1\n"
4731 "ld1 {v7.b}[0], [x6], #1\n"
4732 "uaddw v8.8h, v8.8h, v0.8b\n"
4733 "uaddw v9.8h, v9.8h, v1.8b\n"
4734 "uaddw v10.8h, v10.8h, v2.8b\n"
4735 "uaddw v11.8h, v11.8h, v3.8b\n"
4736 "uaddw v12.8h, v12.8h, v4.8b\n"
4737 "uaddw v13.8h, v13.8h, v5.8b\n"
4738 "uaddw v14.8h, v14.8h, v6.8b\n"
4739 "uaddw v15.8h, v15.8h, v7.8b\n"
// The tail is written as a full 64-byte chunk, padding included.
4740 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
4741 "st1 {v4.2s, v5.2s, v6.2s, v7.2s}, [%x[out]], #32\n"
4742
4743 // Aggregator Reduction.
// x0/x1 are no longer used as row pointers; w0/w1 now carry the two scalars.
// v0.s[0] = multiplier, v1 = addend broadcast to all four lanes.
4744 "ldr w0, %[multiplicative_sum_offset]\n"
4745 "ldr w1, %[additive_sum_offset]\n"
4746 "mov v0.s[0], w0\n"
4747 "dup v1.4s, w1\n"
// Fold each row's eight 16-bit sums into four 32-bit sums.
4748 "uaddlp v8.4s, v8.8h\n"
4749 "uaddlp v9.4s, v9.8h\n"
4750 "uaddlp v10.4s, v10.8h\n"
4751 "uaddlp v11.4s, v11.8h\n"
4752 "uaddlp v12.4s, v12.8h\n"
4753 "uaddlp v13.4s, v13.8h\n"
4754 "uaddlp v14.4s, v14.8h\n"
4755 "uaddlp v15.4s, v15.8h\n"
// Two rounds of pairwise adds leave v8 = {row0, row1, row2, row3} and
// v9 = {row4, row5, row6, row7}.
4756 "addp v8.4s, v8.4s, v9.4s\n"
4757 "addp v10.4s, v10.4s, v11.4s\n"
4758 "addp v12.4s, v12.4s, v13.4s\n"
4759 "addp v14.4s, v14.4s, v15.4s\n"
4760 "addp v8.4s, v8.4s, v10.4s\n"
4761 "addp v9.4s, v12.4s, v14.4s\n"
// result = row_sum * multiplicative_sum_offset + additive_sum_offset.
4762 "mul v8.4s, v8.4s, v0.s[0]\n"
4763 "mul v9.4s, v9.4s, v0.s[0]\n"
4764 "add v8.4s, v8.4s, v1.4s\n"
4765 "add v9.4s, v9.4s, v1.4s\n"
// Append the eight 32-bit results directly after the packed bytes.
4766 "st1 {v8.4s, v9.4s}, [%x[out]]\n"
// Read-write operands: the counter and both pointers are modified by the asm.
4767 : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
// Inputs: the stride lives in a register, the two offsets are read from memory.
4768 : [stride] "r"(params.stride),
4769 [multiplicative_sum_offset] "m"(params.multiplicative_sum_offset),
4770 [additive_sum_offset] "m"(params.additive_sum_offset)
// Clobbers: scratch row pointers, every vector register written above, the
// condition flags and memory.
4771 : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "v0", "v1", "v2", "v3", "v4",
4772 "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15",
4773 "cc", "memory");
4774 }
4775
// Pack() specialization for eight input rows whose column count leaves a
// 2-column tail after the whole 8-column chunks.
//
// in:     start of row 0; each of rows 1..7 starts params.stride bytes after
//         the previous row.
// params: count is the number of columns to pack. The control flow below only
//         terminates as intended when (count - 2) is zero or a positive
//         multiple of 8 -- presumably guaranteed by the dispatching code.
//         multiplicative_sum_offset and additive_sum_offset scale and bias
//         the per-row sums.
// out:    receives 64 bytes per chunk (8 bytes of row 0, then row 1, ...,
//         row 7); the 2-column tail is stored as one more chunk, zero-padded
//         to 8 columns. Eight 32-bit values follow, one per row: the sum of
//         the row's bytes times multiplicative_sum_offset plus
//         additive_sum_offset.
4776 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)4777 inline void Stream<uint8_t, 8, 8, 2, RowMajorWithSum>::Pack(
4778 const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
// Optional trace output, compiled in only when both macros are defined.
4779 #ifdef DEBUG
4780 #ifdef DEBUG_METAGEMM_VERBOSE
4781 std::cout << __FILE__ << "(" << __LINE__
4782 << ") RowMajorWithSum<uint8_t, 8, 8, 2, RowMajorWithSum>::Pack()"
4783 << std::endl
4784 << std::flush;
4785 #endif
4786 #endif
// The asm decrements the counter in place, so it works on a local copy.
// NOTE(review): this is a 32-bit int accessed through the 64-bit %x operand
// modifier -- confirm the upper register half is always clean.
4787 int params_count_copy = params.count;
4788 asm volatile(
// x0..x6 = start of rows 1..7; row 0 is read through %x[in] itself.
4789 "add x0, %x[in], %x[stride]\n"
4790 "add x1, x0, %x[stride]\n"
4791 "add x2, x1, %x[stride]\n"
4792 "add x3, x2, %x[stride]\n"
4793 "add x4, x3, %x[stride]\n"
4794 "add x5, x4, %x[stride]\n"
4795 "add x6, x5, %x[stride]\n"
// v8..v15: eight 16-bit running column sums for each of the eight rows.
// NOTE(review): a 16-bit lane wraps after more than 257 additions of 0xFF;
// confirm that count is bounded accordingly upstream.
4796 "movi v8.8h, #0\n"
4797 "movi v9.8h, #0\n"
4798 "movi v10.8h, #0\n"
4799 "movi v11.8h, #0\n"
4800 "movi v12.8h, #0\n"
4801 "movi v13.8h, #0\n"
4802 "movi v14.8h, #0\n"
4803 "movi v15.8h, #0\n"
4804
4805 // Reduce count by leftovers.
4806 "subs %x[count], %x[count], #2\n"
// Nothing but the tail to process: skip the main loop.
4807 "beq 2f\n"
4808
// Main loop, one 8-column chunk per pass. The flags produced by this subs
// are consumed by the bne at the bottom; nothing in between alters them.
4809 "1:"
4810 "subs %x[count], %x[count], #8\n"
4811
4812 // Load Aggregate Store: 8x8.
4813 "ld1 {v0.2s}, [%x[in]], #8\n"
4814 "ld1 {v1.2s}, [x0], #8\n"
4815 "ld1 {v2.2s}, [x1], #8\n"
4816 "ld1 {v3.2s}, [x2], #8\n"
4817 "ld1 {v4.2s}, [x3], #8\n"
4818 "ld1 {v5.2s}, [x4], #8\n"
4819 "ld1 {v6.2s}, [x5], #8\n"
4820 "ld1 {v7.2s}, [x6], #8\n"
// Widen every byte to 16 bits and add it to its row's running sums.
4821 "uaddw v8.8h, v8.8h, v0.8b\n"
4822 "uaddw v9.8h, v9.8h, v1.8b\n"
4823 "uaddw v10.8h, v10.8h, v2.8b\n"
4824 "uaddw v11.8h, v11.8h, v3.8b\n"
4825 "uaddw v12.8h, v12.8h, v4.8b\n"
4826 "uaddw v13.8h, v13.8h, v5.8b\n"
4827 "uaddw v14.8h, v14.8h, v6.8b\n"
4828 "uaddw v15.8h, v15.8h, v7.8b\n"
// Emit the chunk: 8 rows x 8 bytes = 64 contiguous bytes.
4829 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
4830 "st1 {v4.2s, v5.2s, v6.2s, v7.2s}, [%x[out]], #32\n"
4831
4832 "bne 1b\n"
4833
4834 "2:"
4835
4836 // Load Aggregate Store: 8x2.
// Zero the staging registers so that the lanes not loaded below are stored
// as padding and contribute nothing to the sums.
4837 "movi v0.8b, #0\n"
4838 "movi v1.8b, #0\n"
4839 "movi v2.8b, #0\n"
4840 "movi v3.8b, #0\n"
4841 "movi v4.8b, #0\n"
4842 "movi v5.8b, #0\n"
4843 "movi v6.8b, #0\n"
4844 "movi v7.8b, #0\n"
// Fetch 2 bytes per row: one halfword into bytes 0-1.
4845 "ld1 {v0.h}[0], [%x[in]], #2\n"
4846 "ld1 {v1.h}[0], [x0], #2\n"
4847 "ld1 {v2.h}[0], [x1], #2\n"
4848 "ld1 {v3.h}[0], [x2], #2\n"
4849 "ld1 {v4.h}[0], [x3], #2\n"
4850 "ld1 {v5.h}[0], [x4], #2\n"
4851 "ld1 {v6.h}[0], [x5], #2\n"
4852 "ld1 {v7.h}[0], [x6], #2\n"
4853 "uaddw v8.8h, v8.8h, v0.8b\n"
4854 "uaddw v9.8h, v9.8h, v1.8b\n"
4855 "uaddw v10.8h, v10.8h, v2.8b\n"
4856 "uaddw v11.8h, v11.8h, v3.8b\n"
4857 "uaddw v12.8h, v12.8h, v4.8b\n"
4858 "uaddw v13.8h, v13.8h, v5.8b\n"
4859 "uaddw v14.8h, v14.8h, v6.8b\n"
4860 "uaddw v15.8h, v15.8h, v7.8b\n"
// The tail is written as a full 64-byte chunk, padding included.
4861 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
4862 "st1 {v4.2s, v5.2s, v6.2s, v7.2s}, [%x[out]], #32\n"
4863
4864 // Aggregator Reduction.
// x0/x1 are no longer used as row pointers; w0/w1 now carry the two scalars.
// v0.s[0] = multiplier, v1 = addend broadcast to all four lanes.
4865 "ldr w0, %[multiplicative_sum_offset]\n"
4866 "ldr w1, %[additive_sum_offset]\n"
4867 "mov v0.s[0], w0\n"
4868 "dup v1.4s, w1\n"
// Fold each row's eight 16-bit sums into four 32-bit sums.
4869 "uaddlp v8.4s, v8.8h\n"
4870 "uaddlp v9.4s, v9.8h\n"
4871 "uaddlp v10.4s, v10.8h\n"
4872 "uaddlp v11.4s, v11.8h\n"
4873 "uaddlp v12.4s, v12.8h\n"
4874 "uaddlp v13.4s, v13.8h\n"
4875 "uaddlp v14.4s, v14.8h\n"
4876 "uaddlp v15.4s, v15.8h\n"
// Two rounds of pairwise adds leave v8 = {row0, row1, row2, row3} and
// v9 = {row4, row5, row6, row7}.
4877 "addp v8.4s, v8.4s, v9.4s\n"
4878 "addp v10.4s, v10.4s, v11.4s\n"
4879 "addp v12.4s, v12.4s, v13.4s\n"
4880 "addp v14.4s, v14.4s, v15.4s\n"
4881 "addp v8.4s, v8.4s, v10.4s\n"
4882 "addp v9.4s, v12.4s, v14.4s\n"
// result = row_sum * multiplicative_sum_offset + additive_sum_offset.
4883 "mul v8.4s, v8.4s, v0.s[0]\n"
4884 "mul v9.4s, v9.4s, v0.s[0]\n"
4885 "add v8.4s, v8.4s, v1.4s\n"
4886 "add v9.4s, v9.4s, v1.4s\n"
// Append the eight 32-bit results directly after the packed bytes.
4887 "st1 {v8.4s, v9.4s}, [%x[out]]\n"
// Read-write operands: the counter and both pointers are modified by the asm.
4888 : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
// Inputs: the stride lives in a register, the two offsets are read from memory.
4889 : [stride] "r"(params.stride),
4890 [multiplicative_sum_offset] "m"(params.multiplicative_sum_offset),
4891 [additive_sum_offset] "m"(params.additive_sum_offset)
// Clobbers: scratch row pointers, every vector register written above, the
// condition flags and memory.
4892 : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "v0", "v1", "v2", "v3", "v4",
4893 "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15",
4894 "cc", "memory");
4895 }
4896
// Specialization of Stream::Pack for 8 rows read row-major, with 3 leftover
// columns.
//
// Rows are read from in, in + stride, ..., in + 7 * stride. Three is
// subtracted from a copy of params.count; while the remainder is non-zero
// the main loop takes 8 bytes from every row and writes the eight chunks
// back to back (64 bytes) to `out`. The tail then loads the last 3 bytes of
// every row into zeroed registers and writes one more 64-byte group, so each
// row's final chunk is zero padded to 8 bytes. Finally each row's byte total
// is multiplied by params.multiplicative_sum_offset, increased by
// params.additive_sum_offset, and the eight 32-bit results are stored
// directly after the packed data.
//
// NOTE(review): the loop only terminates when the counter hits exactly zero,
// so params.count is presumably always 8 * k + 3 -- confirm against callers.
// NOTE(review): row totals are accumulated in 16-bit lanes before being
// widened; presumably callers keep count small enough that they cannot
// wrap -- verify.
// NOTE(review): params_count_copy is an int, yet it is referenced through
// the 64-bit %x register view (as is params.stride, whose declared type is
// not visible here) -- confirm the upper register halves are well defined.
4897 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)4898 inline void Stream<uint8_t, 8, 8, 3, RowMajorWithSum>::Pack(
4899 const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
// Optional trace line; only compiled when DEBUG and DEBUG_METAGEMM_VERBOSE
// are both defined.
4900 #ifdef DEBUG
4901 #ifdef DEBUG_METAGEMM_VERBOSE
4902 std::cout << __FILE__ << "(" << __LINE__
4903 << ") RowMajorWithSum<uint8_t, 8, 8, 3, RowMajorWithSum>::Pack()"
4904 << std::endl
4905 << std::flush;
4906 #endif
4907 #endif
// Local copy because the asm decrements the counter in place ("+r" operand).
4908 int params_count_copy = params.count;
4909 asm volatile(
// x0..x6 <- start of rows 1..7; row 0 is read through `in` itself.
4910 "add x0, %x[in], %x[stride]\n"
4911 "add x1, x0, %x[stride]\n"
4912 "add x2, x1, %x[stride]\n"
4913 "add x3, x2, %x[stride]\n"
4914 "add x4, x3, %x[stride]\n"
4915 "add x5, x4, %x[stride]\n"
4916 "add x6, x5, %x[stride]\n"
// v8..v15 <- 0: one accumulator per row, eight 16-bit lanes each.
4917 "movi v8.8h, #0\n"
4918 "movi v9.8h, #0\n"
4919 "movi v10.8h, #0\n"
4920 "movi v11.8h, #0\n"
4921 "movi v12.8h, #0\n"
4922 "movi v13.8h, #0\n"
4923 "movi v14.8h, #0\n"
4924 "movi v15.8h, #0\n"
4925
4926 // Reduce count by leftovers.
4927 "subs %x[count], %x[count], #3\n"
// Only the leftovers remain: skip the full-width loop.
4928 "beq 2f\n"
4929
4930 "1:"
4931 "subs %x[count], %x[count], #8\n"
4932
4933 // Load Aggregate Store: 8x8.
// Post-incrementing loads: 8 bytes from each of the 8 rows.
4934 "ld1 {v0.2s}, [%x[in]], #8\n"
4935 "ld1 {v1.2s}, [x0], #8\n"
4936 "ld1 {v2.2s}, [x1], #8\n"
4937 "ld1 {v3.2s}, [x2], #8\n"
4938 "ld1 {v4.2s}, [x3], #8\n"
4939 "ld1 {v5.2s}, [x4], #8\n"
4940 "ld1 {v6.2s}, [x5], #8\n"
4941 "ld1 {v7.2s}, [x6], #8\n"
// Widen every byte to 16 bits and add it to its row's accumulator.
4942 "uaddw v8.8h, v8.8h, v0.8b\n"
4943 "uaddw v9.8h, v9.8h, v1.8b\n"
4944 "uaddw v10.8h, v10.8h, v2.8b\n"
4945 "uaddw v11.8h, v11.8h, v3.8b\n"
4946 "uaddw v12.8h, v12.8h, v4.8b\n"
4947 "uaddw v13.8h, v13.8h, v5.8b\n"
4948 "uaddw v14.8h, v14.8h, v6.8b\n"
4949 "uaddw v15.8h, v15.8h, v7.8b\n"
// Emit the eight chunks contiguously (2 x 32 bytes), row 0 first.
4950 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
4951 "st1 {v4.2s, v5.2s, v6.2s, v7.2s}, [%x[out]], #32\n"
4952
// Tests the flags set by the subs at the top of the loop; the loads, adds
// and stores in between do not modify them.
4953 "bne 1b\n"
4954
4955 "2:"
4956
4957 // Load Aggregate Store: 8x3.
// Clear v0..v7 so that byte lanes not loaded below stay zero.
4958 "movi v0.8b, #0\n"
4959 "movi v1.8b, #0\n"
4960 "movi v2.8b, #0\n"
4961 "movi v3.8b, #0\n"
4962 "movi v4.8b, #0\n"
4963 "movi v5.8b, #0\n"
4964 "movi v6.8b, #0\n"
4965 "movi v7.8b, #0\n"
// 3 bytes per row: a halfword into byte lanes 0-1, then a byte into lane 2.
4966 "ld1 {v0.h}[0], [%x[in]], #2\n"
4967 "ld1 {v0.b}[2], [%x[in]], #1\n"
4968 "ld1 {v1.h}[0], [x0], #2\n"
4969 "ld1 {v1.b}[2], [x0], #1\n"
4970 "ld1 {v2.h}[0], [x1], #2\n"
4971 "ld1 {v2.b}[2], [x1], #1\n"
4972 "ld1 {v3.h}[0], [x2], #2\n"
4973 "ld1 {v3.b}[2], [x2], #1\n"
4974 "ld1 {v4.h}[0], [x3], #2\n"
4975 "ld1 {v4.b}[2], [x3], #1\n"
4976 "ld1 {v5.h}[0], [x4], #2\n"
4977 "ld1 {v5.b}[2], [x4], #1\n"
4978 "ld1 {v6.h}[0], [x5], #2\n"
4979 "ld1 {v6.b}[2], [x5], #1\n"
4980 "ld1 {v7.h}[0], [x6], #2\n"
4981 "ld1 {v7.b}[2], [x6], #1\n"
// The zeroed lanes contribute nothing to the row totals.
4982 "uaddw v8.8h, v8.8h, v0.8b\n"
4983 "uaddw v9.8h, v9.8h, v1.8b\n"
4984 "uaddw v10.8h, v10.8h, v2.8b\n"
4985 "uaddw v11.8h, v11.8h, v3.8b\n"
4986 "uaddw v12.8h, v12.8h, v4.8b\n"
4987 "uaddw v13.8h, v13.8h, v5.8b\n"
4988 "uaddw v14.8h, v14.8h, v6.8b\n"
4989 "uaddw v15.8h, v15.8h, v7.8b\n"
// Emit the final, zero-padded 64-byte group.
4990 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
4991 "st1 {v4.2s, v5.2s, v6.2s, v7.2s}, [%x[out]], #32\n"
4992
4993 // Aggregator Reduction.
// The row pointers in x0/x1 are no longer needed, so w0/w1 are reused to
// fetch the two offsets from their memory ("m") operands.
4994 "ldr w0, %[multiplicative_sum_offset]\n"
4995 "ldr w1, %[additive_sum_offset]\n"
// v0.s[0] <- multiplier (by-element operand of mul below);
// v1 <- additive offset replicated in all four lanes.
4996 "mov v0.s[0], w0\n"
4997 "dup v1.4s, w1\n"
// Pairwise-add and widen: 8 x 16-bit partial sums -> 4 x 32-bit per row.
4998 "uaddlp v8.4s, v8.8h\n"
4999 "uaddlp v9.4s, v9.8h\n"
5000 "uaddlp v10.4s, v10.8h\n"
5001 "uaddlp v11.4s, v11.8h\n"
5002 "uaddlp v12.4s, v12.8h\n"
5003 "uaddlp v13.4s, v13.8h\n"
5004 "uaddlp v14.4s, v14.8h\n"
5005 "uaddlp v15.4s, v15.8h\n"
// Two rounds of pairwise adds leave the totals of rows 0..3 in v8 and of
// rows 4..7 in v9, one 32-bit lane per row.
5006 "addp v8.4s, v8.4s, v9.4s\n"
5007 "addp v10.4s, v10.4s, v11.4s\n"
5008 "addp v12.4s, v12.4s, v13.4s\n"
5009 "addp v14.4s, v14.4s, v15.4s\n"
5010 "addp v8.4s, v8.4s, v10.4s\n"
5011 "addp v9.4s, v12.4s, v14.4s\n"
// result = total * multiplicative_sum_offset + additive_sum_offset.
5012 "mul v8.4s, v8.4s, v0.s[0]\n"
5013 "mul v9.4s, v9.4s, v0.s[0]\n"
5014 "add v8.4s, v8.4s, v1.4s\n"
5015 "add v9.4s, v9.4s, v1.4s\n"
// Store the eight results right after the packed data (no post-increment).
5016 "st1 {v8.4s, v9.4s}, [%x[out]]\n"
// Read-write operands: the counter and both pointers are modified by the asm.
5017 : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
// Inputs: stride in a register, the two offsets as memory operands.
5018 : [stride] "r"(params.stride),
5019 [multiplicative_sum_offset] "m"(params.multiplicative_sum_offset),
5020 [additive_sum_offset] "m"(params.additive_sum_offset)
// Clobbers: scratch row pointers x0-x6, vector registers v0-v15, the
// condition flags and memory.
5021 : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "v0", "v1", "v2", "v3", "v4",
5022 "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15",
5023 "cc", "memory");
5024 }
5025
// Specialization of Stream::Pack for 8 rows read row-major, with 4 leftover
// columns.
//
// Rows are read from in, in + stride, ..., in + 7 * stride. Four is
// subtracted from a copy of params.count; while the remainder is non-zero
// the main loop takes 8 bytes from every row and writes the eight chunks
// back to back (64 bytes) to `out`. The tail then loads the last 4 bytes of
// every row into zeroed registers and writes one more 64-byte group, so each
// row's final chunk is zero padded to 8 bytes. Finally each row's byte total
// is multiplied by params.multiplicative_sum_offset, increased by
// params.additive_sum_offset, and the eight 32-bit results are stored
// directly after the packed data.
//
// NOTE(review): the loop only terminates when the counter hits exactly zero,
// so params.count is presumably always 8 * k + 4 -- confirm against callers.
// NOTE(review): row totals are accumulated in 16-bit lanes before being
// widened; presumably callers keep count small enough that they cannot
// wrap -- verify.
// NOTE(review): params_count_copy is an int, yet it is referenced through
// the 64-bit %x register view (as is params.stride, whose declared type is
// not visible here) -- confirm the upper register halves are well defined.
5026 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)5027 inline void Stream<uint8_t, 8, 8, 4, RowMajorWithSum>::Pack(
5028 const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
// Optional trace line; only compiled when DEBUG and DEBUG_METAGEMM_VERBOSE
// are both defined.
5029 #ifdef DEBUG
5030 #ifdef DEBUG_METAGEMM_VERBOSE
5031 std::cout << __FILE__ << "(" << __LINE__
5032 << ") RowMajorWithSum<uint8_t, 8, 8, 4, RowMajorWithSum>::Pack()"
5033 << std::endl
5034 << std::flush;
5035 #endif
5036 #endif
// Local copy because the asm decrements the counter in place ("+r" operand).
5037 int params_count_copy = params.count;
5038 asm volatile(
// x0..x6 <- start of rows 1..7; row 0 is read through `in` itself.
5039 "add x0, %x[in], %x[stride]\n"
5040 "add x1, x0, %x[stride]\n"
5041 "add x2, x1, %x[stride]\n"
5042 "add x3, x2, %x[stride]\n"
5043 "add x4, x3, %x[stride]\n"
5044 "add x5, x4, %x[stride]\n"
5045 "add x6, x5, %x[stride]\n"
// v8..v15 <- 0: one accumulator per row, eight 16-bit lanes each.
5046 "movi v8.8h, #0\n"
5047 "movi v9.8h, #0\n"
5048 "movi v10.8h, #0\n"
5049 "movi v11.8h, #0\n"
5050 "movi v12.8h, #0\n"
5051 "movi v13.8h, #0\n"
5052 "movi v14.8h, #0\n"
5053 "movi v15.8h, #0\n"
5054
5055 // Reduce count by leftovers.
5056 "subs %x[count], %x[count], #4\n"
// Only the leftovers remain: skip the full-width loop.
5057 "beq 2f\n"
5058
5059 "1:"
5060 "subs %x[count], %x[count], #8\n"
5061
5062 // Load Aggregate Store: 8x8.
// Post-incrementing loads: 8 bytes from each of the 8 rows.
5063 "ld1 {v0.2s}, [%x[in]], #8\n"
5064 "ld1 {v1.2s}, [x0], #8\n"
5065 "ld1 {v2.2s}, [x1], #8\n"
5066 "ld1 {v3.2s}, [x2], #8\n"
5067 "ld1 {v4.2s}, [x3], #8\n"
5068 "ld1 {v5.2s}, [x4], #8\n"
5069 "ld1 {v6.2s}, [x5], #8\n"
5070 "ld1 {v7.2s}, [x6], #8\n"
// Widen every byte to 16 bits and add it to its row's accumulator.
5071 "uaddw v8.8h, v8.8h, v0.8b\n"
5072 "uaddw v9.8h, v9.8h, v1.8b\n"
5073 "uaddw v10.8h, v10.8h, v2.8b\n"
5074 "uaddw v11.8h, v11.8h, v3.8b\n"
5075 "uaddw v12.8h, v12.8h, v4.8b\n"
5076 "uaddw v13.8h, v13.8h, v5.8b\n"
5077 "uaddw v14.8h, v14.8h, v6.8b\n"
5078 "uaddw v15.8h, v15.8h, v7.8b\n"
// Emit the eight chunks contiguously (2 x 32 bytes), row 0 first.
5079 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
5080 "st1 {v4.2s, v5.2s, v6.2s, v7.2s}, [%x[out]], #32\n"
5081
// Tests the flags set by the subs at the top of the loop; the loads, adds
// and stores in between do not modify them.
5082 "bne 1b\n"
5083
5084 "2:"
5085
5086 // Load Aggregate Store: 8x4.
// Clear v0..v7 so that byte lanes not loaded below stay zero.
5087 "movi v0.8b, #0\n"
5088 "movi v1.8b, #0\n"
5089 "movi v2.8b, #0\n"
5090 "movi v3.8b, #0\n"
5091 "movi v4.8b, #0\n"
5092 "movi v5.8b, #0\n"
5093 "movi v6.8b, #0\n"
5094 "movi v7.8b, #0\n"
// 4 bytes per row, loaded as one 32-bit word into byte lanes 0-3.
5095 "ld1 {v0.s}[0], [%x[in]], #4\n"
5096 "ld1 {v1.s}[0], [x0], #4\n"
5097 "ld1 {v2.s}[0], [x1], #4\n"
5098 "ld1 {v3.s}[0], [x2], #4\n"
5099 "ld1 {v4.s}[0], [x3], #4\n"
5100 "ld1 {v5.s}[0], [x4], #4\n"
5101 "ld1 {v6.s}[0], [x5], #4\n"
5102 "ld1 {v7.s}[0], [x6], #4\n"
// The zeroed lanes contribute nothing to the row totals.
5103 "uaddw v8.8h, v8.8h, v0.8b\n"
5104 "uaddw v9.8h, v9.8h, v1.8b\n"
5105 "uaddw v10.8h, v10.8h, v2.8b\n"
5106 "uaddw v11.8h, v11.8h, v3.8b\n"
5107 "uaddw v12.8h, v12.8h, v4.8b\n"
5108 "uaddw v13.8h, v13.8h, v5.8b\n"
5109 "uaddw v14.8h, v14.8h, v6.8b\n"
5110 "uaddw v15.8h, v15.8h, v7.8b\n"
// Emit the final, zero-padded 64-byte group.
5111 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
5112 "st1 {v4.2s, v5.2s, v6.2s, v7.2s}, [%x[out]], #32\n"
5113
5114 // Aggregator Reduction.
// The row pointers in x0/x1 are no longer needed, so w0/w1 are reused to
// fetch the two offsets from their memory ("m") operands.
5115 "ldr w0, %[multiplicative_sum_offset]\n"
5116 "ldr w1, %[additive_sum_offset]\n"
// v0.s[0] <- multiplier (by-element operand of mul below);
// v1 <- additive offset replicated in all four lanes.
5117 "mov v0.s[0], w0\n"
5118 "dup v1.4s, w1\n"
// Pairwise-add and widen: 8 x 16-bit partial sums -> 4 x 32-bit per row.
5119 "uaddlp v8.4s, v8.8h\n"
5120 "uaddlp v9.4s, v9.8h\n"
5121 "uaddlp v10.4s, v10.8h\n"
5122 "uaddlp v11.4s, v11.8h\n"
5123 "uaddlp v12.4s, v12.8h\n"
5124 "uaddlp v13.4s, v13.8h\n"
5125 "uaddlp v14.4s, v14.8h\n"
5126 "uaddlp v15.4s, v15.8h\n"
// Two rounds of pairwise adds leave the totals of rows 0..3 in v8 and of
// rows 4..7 in v9, one 32-bit lane per row.
5127 "addp v8.4s, v8.4s, v9.4s\n"
5128 "addp v10.4s, v10.4s, v11.4s\n"
5129 "addp v12.4s, v12.4s, v13.4s\n"
5130 "addp v14.4s, v14.4s, v15.4s\n"
5131 "addp v8.4s, v8.4s, v10.4s\n"
5132 "addp v9.4s, v12.4s, v14.4s\n"
// result = total * multiplicative_sum_offset + additive_sum_offset.
5133 "mul v8.4s, v8.4s, v0.s[0]\n"
5134 "mul v9.4s, v9.4s, v0.s[0]\n"
5135 "add v8.4s, v8.4s, v1.4s\n"
5136 "add v9.4s, v9.4s, v1.4s\n"
// Store the eight results right after the packed data (no post-increment).
5137 "st1 {v8.4s, v9.4s}, [%x[out]]\n"
// Read-write operands: the counter and both pointers are modified by the asm.
5138 : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
// Inputs: stride in a register, the two offsets as memory operands.
5139 : [stride] "r"(params.stride),
5140 [multiplicative_sum_offset] "m"(params.multiplicative_sum_offset),
5141 [additive_sum_offset] "m"(params.additive_sum_offset)
// Clobbers: scratch row pointers x0-x6, vector registers v0-v15, the
// condition flags and memory.
5142 : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "v0", "v1", "v2", "v3", "v4",
5143 "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15",
5144 "cc", "memory");
5145 }
5146
// Specialization of Stream::Pack for 8 rows read row-major, with 5 leftover
// columns.
//
// Rows are read from in, in + stride, ..., in + 7 * stride. Five is
// subtracted from a copy of params.count; while the remainder is non-zero
// the main loop takes 8 bytes from every row and writes the eight chunks
// back to back (64 bytes) to `out`. The tail then loads the last 5 bytes of
// every row into zeroed registers and writes one more 64-byte group, so each
// row's final chunk is zero padded to 8 bytes. Finally each row's byte total
// is multiplied by params.multiplicative_sum_offset, increased by
// params.additive_sum_offset, and the eight 32-bit results are stored
// directly after the packed data.
//
// NOTE(review): the loop only terminates when the counter hits exactly zero,
// so params.count is presumably always 8 * k + 5 -- confirm against callers.
// NOTE(review): row totals are accumulated in 16-bit lanes before being
// widened; presumably callers keep count small enough that they cannot
// wrap -- verify.
// NOTE(review): params_count_copy is an int, yet it is referenced through
// the 64-bit %x register view (as is params.stride, whose declared type is
// not visible here) -- confirm the upper register halves are well defined.
5147 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)5148 inline void Stream<uint8_t, 8, 8, 5, RowMajorWithSum>::Pack(
5149 const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
// Optional trace line; only compiled when DEBUG and DEBUG_METAGEMM_VERBOSE
// are both defined.
5150 #ifdef DEBUG
5151 #ifdef DEBUG_METAGEMM_VERBOSE
5152 std::cout << __FILE__ << "(" << __LINE__
5153 << ") RowMajorWithSum<uint8_t, 8, 8, 5, RowMajorWithSum>::Pack()"
5154 << std::endl
5155 << std::flush;
5156 #endif
5157 #endif
// Local copy because the asm decrements the counter in place ("+r" operand).
5158 int params_count_copy = params.count;
5159 asm volatile(
// x0..x6 <- start of rows 1..7; row 0 is read through `in` itself.
5160 "add x0, %x[in], %x[stride]\n"
5161 "add x1, x0, %x[stride]\n"
5162 "add x2, x1, %x[stride]\n"
5163 "add x3, x2, %x[stride]\n"
5164 "add x4, x3, %x[stride]\n"
5165 "add x5, x4, %x[stride]\n"
5166 "add x6, x5, %x[stride]\n"
// v8..v15 <- 0: one accumulator per row, eight 16-bit lanes each.
5167 "movi v8.8h, #0\n"
5168 "movi v9.8h, #0\n"
5169 "movi v10.8h, #0\n"
5170 "movi v11.8h, #0\n"
5171 "movi v12.8h, #0\n"
5172 "movi v13.8h, #0\n"
5173 "movi v14.8h, #0\n"
5174 "movi v15.8h, #0\n"
5175
5176 // Reduce count by leftovers.
5177 "subs %x[count], %x[count], #5\n"
// Only the leftovers remain: skip the full-width loop.
5178 "beq 2f\n"
5179
5180 "1:"
5181 "subs %x[count], %x[count], #8\n"
5182
5183 // Load Aggregate Store: 8x8.
// Post-incrementing loads: 8 bytes from each of the 8 rows.
5184 "ld1 {v0.2s}, [%x[in]], #8\n"
5185 "ld1 {v1.2s}, [x0], #8\n"
5186 "ld1 {v2.2s}, [x1], #8\n"
5187 "ld1 {v3.2s}, [x2], #8\n"
5188 "ld1 {v4.2s}, [x3], #8\n"
5189 "ld1 {v5.2s}, [x4], #8\n"
5190 "ld1 {v6.2s}, [x5], #8\n"
5191 "ld1 {v7.2s}, [x6], #8\n"
// Widen every byte to 16 bits and add it to its row's accumulator.
5192 "uaddw v8.8h, v8.8h, v0.8b\n"
5193 "uaddw v9.8h, v9.8h, v1.8b\n"
5194 "uaddw v10.8h, v10.8h, v2.8b\n"
5195 "uaddw v11.8h, v11.8h, v3.8b\n"
5196 "uaddw v12.8h, v12.8h, v4.8b\n"
5197 "uaddw v13.8h, v13.8h, v5.8b\n"
5198 "uaddw v14.8h, v14.8h, v6.8b\n"
5199 "uaddw v15.8h, v15.8h, v7.8b\n"
// Emit the eight chunks contiguously (2 x 32 bytes), row 0 first.
5200 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
5201 "st1 {v4.2s, v5.2s, v6.2s, v7.2s}, [%x[out]], #32\n"
5202
// Tests the flags set by the subs at the top of the loop; the loads, adds
// and stores in between do not modify them.
5203 "bne 1b\n"
5204
5205 "2:"
5206
5207 // Load Aggregate Store: 8x5.
// Clear v0..v7 so that byte lanes not loaded below stay zero.
5208 "movi v0.8b, #0\n"
5209 "movi v1.8b, #0\n"
5210 "movi v2.8b, #0\n"
5211 "movi v3.8b, #0\n"
5212 "movi v4.8b, #0\n"
5213 "movi v5.8b, #0\n"
5214 "movi v6.8b, #0\n"
5215 "movi v7.8b, #0\n"
// 5 bytes per row: a 32-bit word into byte lanes 0-3, then a byte into
// lane 4.
5216 "ld1 {v0.s}[0], [%x[in]], #4\n"
5217 "ld1 {v0.b}[4], [%x[in]], #1\n"
5218 "ld1 {v1.s}[0], [x0], #4\n"
5219 "ld1 {v1.b}[4], [x0], #1\n"
5220 "ld1 {v2.s}[0], [x1], #4\n"
5221 "ld1 {v2.b}[4], [x1], #1\n"
5222 "ld1 {v3.s}[0], [x2], #4\n"
5223 "ld1 {v3.b}[4], [x2], #1\n"
5224 "ld1 {v4.s}[0], [x3], #4\n"
5225 "ld1 {v4.b}[4], [x3], #1\n"
5226 "ld1 {v5.s}[0], [x4], #4\n"
5227 "ld1 {v5.b}[4], [x4], #1\n"
5228 "ld1 {v6.s}[0], [x5], #4\n"
5229 "ld1 {v6.b}[4], [x5], #1\n"
5230 "ld1 {v7.s}[0], [x6], #4\n"
5231 "ld1 {v7.b}[4], [x6], #1\n"
// The zeroed lanes contribute nothing to the row totals.
5232 "uaddw v8.8h, v8.8h, v0.8b\n"
5233 "uaddw v9.8h, v9.8h, v1.8b\n"
5234 "uaddw v10.8h, v10.8h, v2.8b\n"
5235 "uaddw v11.8h, v11.8h, v3.8b\n"
5236 "uaddw v12.8h, v12.8h, v4.8b\n"
5237 "uaddw v13.8h, v13.8h, v5.8b\n"
5238 "uaddw v14.8h, v14.8h, v6.8b\n"
5239 "uaddw v15.8h, v15.8h, v7.8b\n"
// Emit the final, zero-padded 64-byte group.
5240 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
5241 "st1 {v4.2s, v5.2s, v6.2s, v7.2s}, [%x[out]], #32\n"
5242
5243 // Aggregator Reduction.
// The row pointers in x0/x1 are no longer needed, so w0/w1 are reused to
// fetch the two offsets from their memory ("m") operands.
5244 "ldr w0, %[multiplicative_sum_offset]\n"
5245 "ldr w1, %[additive_sum_offset]\n"
// v0.s[0] <- multiplier (by-element operand of mul below);
// v1 <- additive offset replicated in all four lanes.
5246 "mov v0.s[0], w0\n"
5247 "dup v1.4s, w1\n"
// Pairwise-add and widen: 8 x 16-bit partial sums -> 4 x 32-bit per row.
5248 "uaddlp v8.4s, v8.8h\n"
5249 "uaddlp v9.4s, v9.8h\n"
5250 "uaddlp v10.4s, v10.8h\n"
5251 "uaddlp v11.4s, v11.8h\n"
5252 "uaddlp v12.4s, v12.8h\n"
5253 "uaddlp v13.4s, v13.8h\n"
5254 "uaddlp v14.4s, v14.8h\n"
5255 "uaddlp v15.4s, v15.8h\n"
// Two rounds of pairwise adds leave the totals of rows 0..3 in v8 and of
// rows 4..7 in v9, one 32-bit lane per row.
5256 "addp v8.4s, v8.4s, v9.4s\n"
5257 "addp v10.4s, v10.4s, v11.4s\n"
5258 "addp v12.4s, v12.4s, v13.4s\n"
5259 "addp v14.4s, v14.4s, v15.4s\n"
5260 "addp v8.4s, v8.4s, v10.4s\n"
5261 "addp v9.4s, v12.4s, v14.4s\n"
// result = total * multiplicative_sum_offset + additive_sum_offset.
5262 "mul v8.4s, v8.4s, v0.s[0]\n"
5263 "mul v9.4s, v9.4s, v0.s[0]\n"
5264 "add v8.4s, v8.4s, v1.4s\n"
5265 "add v9.4s, v9.4s, v1.4s\n"
// Store the eight results right after the packed data (no post-increment).
5266 "st1 {v8.4s, v9.4s}, [%x[out]]\n"
// Read-write operands: the counter and both pointers are modified by the asm.
5267 : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
// Inputs: stride in a register, the two offsets as memory operands.
5268 : [stride] "r"(params.stride),
5269 [multiplicative_sum_offset] "m"(params.multiplicative_sum_offset),
5270 [additive_sum_offset] "m"(params.additive_sum_offset)
// Clobbers: scratch row pointers x0-x6, vector registers v0-v15, the
// condition flags and memory.
5271 : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "v0", "v1", "v2", "v3", "v4",
5272 "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15",
5273 "cc", "memory");
5274 }
5275
// Specialization of Stream::Pack for 8 rows read row-major, with 6 leftover
// columns.
//
// Rows are read from in, in + stride, ..., in + 7 * stride. Six is
// subtracted from a copy of params.count; while the remainder is non-zero
// the main loop takes 8 bytes from every row and writes the eight chunks
// back to back (64 bytes) to `out`. The tail then loads the last 6 bytes of
// every row into zeroed registers and writes one more 64-byte group, so each
// row's final chunk is zero padded to 8 bytes. Finally each row's byte total
// is multiplied by params.multiplicative_sum_offset, increased by
// params.additive_sum_offset, and the eight 32-bit results are stored
// directly after the packed data.
//
// NOTE(review): the loop only terminates when the counter hits exactly zero,
// so params.count is presumably always 8 * k + 6 -- confirm against callers.
// NOTE(review): row totals are accumulated in 16-bit lanes before being
// widened; presumably callers keep count small enough that they cannot
// wrap -- verify.
// NOTE(review): params_count_copy is an int, yet it is referenced through
// the 64-bit %x register view (as is params.stride, whose declared type is
// not visible here) -- confirm the upper register halves are well defined.
5276 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)5277 inline void Stream<uint8_t, 8, 8, 6, RowMajorWithSum>::Pack(
5278 const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
// Optional trace line; only compiled when DEBUG and DEBUG_METAGEMM_VERBOSE
// are both defined.
5279 #ifdef DEBUG
5280 #ifdef DEBUG_METAGEMM_VERBOSE
5281 std::cout << __FILE__ << "(" << __LINE__
5282 << ") RowMajorWithSum<uint8_t, 8, 8, 6, RowMajorWithSum>::Pack()"
5283 << std::endl
5284 << std::flush;
5285 #endif
5286 #endif
// Local copy because the asm decrements the counter in place ("+r" operand).
5287 int params_count_copy = params.count;
5288 asm volatile(
// x0..x6 <- start of rows 1..7; row 0 is read through `in` itself.
5289 "add x0, %x[in], %x[stride]\n"
5290 "add x1, x0, %x[stride]\n"
5291 "add x2, x1, %x[stride]\n"
5292 "add x3, x2, %x[stride]\n"
5293 "add x4, x3, %x[stride]\n"
5294 "add x5, x4, %x[stride]\n"
5295 "add x6, x5, %x[stride]\n"
// v8..v15 <- 0: one accumulator per row, eight 16-bit lanes each.
5296 "movi v8.8h, #0\n"
5297 "movi v9.8h, #0\n"
5298 "movi v10.8h, #0\n"
5299 "movi v11.8h, #0\n"
5300 "movi v12.8h, #0\n"
5301 "movi v13.8h, #0\n"
5302 "movi v14.8h, #0\n"
5303 "movi v15.8h, #0\n"
5304
5305 // Reduce count by leftovers.
5306 "subs %x[count], %x[count], #6\n"
// Only the leftovers remain: skip the full-width loop.
5307 "beq 2f\n"
5308
5309 "1:"
5310 "subs %x[count], %x[count], #8\n"
5311
5312 // Load Aggregate Store: 8x8.
// Post-incrementing loads: 8 bytes from each of the 8 rows.
5313 "ld1 {v0.2s}, [%x[in]], #8\n"
5314 "ld1 {v1.2s}, [x0], #8\n"
5315 "ld1 {v2.2s}, [x1], #8\n"
5316 "ld1 {v3.2s}, [x2], #8\n"
5317 "ld1 {v4.2s}, [x3], #8\n"
5318 "ld1 {v5.2s}, [x4], #8\n"
5319 "ld1 {v6.2s}, [x5], #8\n"
5320 "ld1 {v7.2s}, [x6], #8\n"
// Widen every byte to 16 bits and add it to its row's accumulator.
5321 "uaddw v8.8h, v8.8h, v0.8b\n"
5322 "uaddw v9.8h, v9.8h, v1.8b\n"
5323 "uaddw v10.8h, v10.8h, v2.8b\n"
5324 "uaddw v11.8h, v11.8h, v3.8b\n"
5325 "uaddw v12.8h, v12.8h, v4.8b\n"
5326 "uaddw v13.8h, v13.8h, v5.8b\n"
5327 "uaddw v14.8h, v14.8h, v6.8b\n"
5328 "uaddw v15.8h, v15.8h, v7.8b\n"
// Emit the eight chunks contiguously (2 x 32 bytes), row 0 first.
5329 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
5330 "st1 {v4.2s, v5.2s, v6.2s, v7.2s}, [%x[out]], #32\n"
5331
// Tests the flags set by the subs at the top of the loop; the loads, adds
// and stores in between do not modify them.
5332 "bne 1b\n"
5333
5334 "2:"
5335
5336 // Load Aggregate Store: 8x6.
// Clear v0..v7 so that byte lanes not loaded below stay zero.
5337 "movi v0.8b, #0\n"
5338 "movi v1.8b, #0\n"
5339 "movi v2.8b, #0\n"
5340 "movi v3.8b, #0\n"
5341 "movi v4.8b, #0\n"
5342 "movi v5.8b, #0\n"
5343 "movi v6.8b, #0\n"
5344 "movi v7.8b, #0\n"
// 6 bytes per row: a 32-bit word into byte lanes 0-3, then a halfword
// (h lane 2) into byte lanes 4-5.
5345 "ld1 {v0.s}[0], [%x[in]], #4\n"
5346 "ld1 {v0.h}[2], [%x[in]], #2\n"
5347 "ld1 {v1.s}[0], [x0], #4\n"
5348 "ld1 {v1.h}[2], [x0], #2\n"
5349 "ld1 {v2.s}[0], [x1], #4\n"
5350 "ld1 {v2.h}[2], [x1], #2\n"
5351 "ld1 {v3.s}[0], [x2], #4\n"
5352 "ld1 {v3.h}[2], [x2], #2\n"
5353 "ld1 {v4.s}[0], [x3], #4\n"
5354 "ld1 {v4.h}[2], [x3], #2\n"
5355 "ld1 {v5.s}[0], [x4], #4\n"
5356 "ld1 {v5.h}[2], [x4], #2\n"
5357 "ld1 {v6.s}[0], [x5], #4\n"
5358 "ld1 {v6.h}[2], [x5], #2\n"
5359 "ld1 {v7.s}[0], [x6], #4\n"
5360 "ld1 {v7.h}[2], [x6], #2\n"
// The zeroed lanes contribute nothing to the row totals.
5361 "uaddw v8.8h, v8.8h, v0.8b\n"
5362 "uaddw v9.8h, v9.8h, v1.8b\n"
5363 "uaddw v10.8h, v10.8h, v2.8b\n"
5364 "uaddw v11.8h, v11.8h, v3.8b\n"
5365 "uaddw v12.8h, v12.8h, v4.8b\n"
5366 "uaddw v13.8h, v13.8h, v5.8b\n"
5367 "uaddw v14.8h, v14.8h, v6.8b\n"
5368 "uaddw v15.8h, v15.8h, v7.8b\n"
// Emit the final, zero-padded 64-byte group.
5369 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
5370 "st1 {v4.2s, v5.2s, v6.2s, v7.2s}, [%x[out]], #32\n"
5371
5372 // Aggregator Reduction.
// The row pointers in x0/x1 are no longer needed, so w0/w1 are reused to
// fetch the two offsets from their memory ("m") operands.
5373 "ldr w0, %[multiplicative_sum_offset]\n"
5374 "ldr w1, %[additive_sum_offset]\n"
// v0.s[0] <- multiplier (by-element operand of mul below);
// v1 <- additive offset replicated in all four lanes.
5375 "mov v0.s[0], w0\n"
5376 "dup v1.4s, w1\n"
// Pairwise-add and widen: 8 x 16-bit partial sums -> 4 x 32-bit per row.
5377 "uaddlp v8.4s, v8.8h\n"
5378 "uaddlp v9.4s, v9.8h\n"
5379 "uaddlp v10.4s, v10.8h\n"
5380 "uaddlp v11.4s, v11.8h\n"
5381 "uaddlp v12.4s, v12.8h\n"
5382 "uaddlp v13.4s, v13.8h\n"
5383 "uaddlp v14.4s, v14.8h\n"
5384 "uaddlp v15.4s, v15.8h\n"
// Two rounds of pairwise adds leave the totals of rows 0..3 in v8 and of
// rows 4..7 in v9, one 32-bit lane per row.
5385 "addp v8.4s, v8.4s, v9.4s\n"
5386 "addp v10.4s, v10.4s, v11.4s\n"
5387 "addp v12.4s, v12.4s, v13.4s\n"
5388 "addp v14.4s, v14.4s, v15.4s\n"
5389 "addp v8.4s, v8.4s, v10.4s\n"
5390 "addp v9.4s, v12.4s, v14.4s\n"
// result = total * multiplicative_sum_offset + additive_sum_offset.
5391 "mul v8.4s, v8.4s, v0.s[0]\n"
5392 "mul v9.4s, v9.4s, v0.s[0]\n"
5393 "add v8.4s, v8.4s, v1.4s\n"
5394 "add v9.4s, v9.4s, v1.4s\n"
// Store the eight results right after the packed data (no post-increment).
5395 "st1 {v8.4s, v9.4s}, [%x[out]]\n"
// Read-write operands: the counter and both pointers are modified by the asm.
5396 : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
// Inputs: stride in a register, the two offsets as memory operands.
5397 : [stride] "r"(params.stride),
5398 [multiplicative_sum_offset] "m"(params.multiplicative_sum_offset),
5399 [additive_sum_offset] "m"(params.additive_sum_offset)
// Clobbers: scratch row pointers x0-x6, vector registers v0-v15, the
// condition flags and memory.
5400 : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "v0", "v1", "v2", "v3", "v4",
5401 "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15",
5402 "cc", "memory");
5403 }
5404
// Specialization of Stream::Pack for 8 rows read row-major, with 7 leftover
// columns.
//
// Rows are read from in, in + stride, ..., in + 7 * stride. Seven is
// subtracted from a copy of params.count; while the remainder is non-zero
// the main loop takes 8 bytes from every row and writes the eight chunks
// back to back (64 bytes) to `out`. The tail then loads the last 7 bytes of
// every row into zeroed registers and writes one more 64-byte group, so each
// row's final chunk is zero padded to 8 bytes. Finally each row's byte total
// is multiplied by params.multiplicative_sum_offset, increased by
// params.additive_sum_offset, and the eight 32-bit results are stored
// directly after the packed data.
//
// NOTE(review): the loop only terminates when the counter hits exactly zero,
// so params.count is presumably always 8 * k + 7 -- confirm against callers.
// NOTE(review): row totals are accumulated in 16-bit lanes before being
// widened; presumably callers keep count small enough that they cannot
// wrap -- verify.
// NOTE(review): params_count_copy is an int, yet it is referenced through
// the 64-bit %x register view (as is params.stride, whose declared type is
// not visible here) -- confirm the upper register halves are well defined.
5405 template <>
Pack(const uint8_t * in,const RowMajorWithSum & params,uint8_t * out)5406 inline void Stream<uint8_t, 8, 8, 7, RowMajorWithSum>::Pack(
5407 const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
// Optional trace line; only compiled when DEBUG and DEBUG_METAGEMM_VERBOSE
// are both defined.
5408 #ifdef DEBUG
5409 #ifdef DEBUG_METAGEMM_VERBOSE
5410 std::cout << __FILE__ << "(" << __LINE__
5411 << ") RowMajorWithSum<uint8_t, 8, 8, 7, RowMajorWithSum>::Pack()"
5412 << std::endl
5413 << std::flush;
5414 #endif
5415 #endif
// Local copy because the asm decrements the counter in place ("+r" operand).
5416 int params_count_copy = params.count;
5417 asm volatile(
// x0..x6 <- start of rows 1..7; row 0 is read through `in` itself.
5418 "add x0, %x[in], %x[stride]\n"
5419 "add x1, x0, %x[stride]\n"
5420 "add x2, x1, %x[stride]\n"
5421 "add x3, x2, %x[stride]\n"
5422 "add x4, x3, %x[stride]\n"
5423 "add x5, x4, %x[stride]\n"
5424 "add x6, x5, %x[stride]\n"
// v8..v15 <- 0: one accumulator per row, eight 16-bit lanes each.
5425 "movi v8.8h, #0\n"
5426 "movi v9.8h, #0\n"
5427 "movi v10.8h, #0\n"
5428 "movi v11.8h, #0\n"
5429 "movi v12.8h, #0\n"
5430 "movi v13.8h, #0\n"
5431 "movi v14.8h, #0\n"
5432 "movi v15.8h, #0\n"
5433
5434 // Reduce count by leftovers.
5435 "subs %x[count], %x[count], #7\n"
// Only the leftovers remain: skip the full-width loop.
5436 "beq 2f\n"
5437
5438 "1:"
5439 "subs %x[count], %x[count], #8\n"
5440
5441 // Load Aggregate Store: 8x8.
// Post-incrementing loads: 8 bytes from each of the 8 rows.
5442 "ld1 {v0.2s}, [%x[in]], #8\n"
5443 "ld1 {v1.2s}, [x0], #8\n"
5444 "ld1 {v2.2s}, [x1], #8\n"
5445 "ld1 {v3.2s}, [x2], #8\n"
5446 "ld1 {v4.2s}, [x3], #8\n"
5447 "ld1 {v5.2s}, [x4], #8\n"
5448 "ld1 {v6.2s}, [x5], #8\n"
5449 "ld1 {v7.2s}, [x6], #8\n"
// Widen every byte to 16 bits and add it to its row's accumulator.
5450 "uaddw v8.8h, v8.8h, v0.8b\n"
5451 "uaddw v9.8h, v9.8h, v1.8b\n"
5452 "uaddw v10.8h, v10.8h, v2.8b\n"
5453 "uaddw v11.8h, v11.8h, v3.8b\n"
5454 "uaddw v12.8h, v12.8h, v4.8b\n"
5455 "uaddw v13.8h, v13.8h, v5.8b\n"
5456 "uaddw v14.8h, v14.8h, v6.8b\n"
5457 "uaddw v15.8h, v15.8h, v7.8b\n"
// Emit the eight chunks contiguously (2 x 32 bytes), row 0 first.
5458 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
5459 "st1 {v4.2s, v5.2s, v6.2s, v7.2s}, [%x[out]], #32\n"
5460
// Tests the flags set by the subs at the top of the loop; the loads, adds
// and stores in between do not modify them.
5461 "bne 1b\n"
5462
5463 "2:"
5464
5465 // Load Aggregate Store: 8x7.
// Clear v0..v7 so that byte lanes not loaded below stay zero.
5466 "movi v0.8b, #0\n"
5467 "movi v1.8b, #0\n"
5468 "movi v2.8b, #0\n"
5469 "movi v3.8b, #0\n"
5470 "movi v4.8b, #0\n"
5471 "movi v5.8b, #0\n"
5472 "movi v6.8b, #0\n"
5473 "movi v7.8b, #0\n"
// 7 bytes per row: a 32-bit word into byte lanes 0-3, a halfword (h lane 2)
// into byte lanes 4-5, then a byte into lane 6.
5474 "ld1 {v0.s}[0], [%x[in]], #4\n"
5475 "ld1 {v0.h}[2], [%x[in]], #2\n"
5476 "ld1 {v0.b}[6], [%x[in]], #1\n"
5477 "ld1 {v1.s}[0], [x0], #4\n"
5478 "ld1 {v1.h}[2], [x0], #2\n"
5479 "ld1 {v1.b}[6], [x0], #1\n"
5480 "ld1 {v2.s}[0], [x1], #4\n"
5481 "ld1 {v2.h}[2], [x1], #2\n"
5482 "ld1 {v2.b}[6], [x1], #1\n"
5483 "ld1 {v3.s}[0], [x2], #4\n"
5484 "ld1 {v3.h}[2], [x2], #2\n"
5485 "ld1 {v3.b}[6], [x2], #1\n"
5486 "ld1 {v4.s}[0], [x3], #4\n"
5487 "ld1 {v4.h}[2], [x3], #2\n"
5488 "ld1 {v4.b}[6], [x3], #1\n"
5489 "ld1 {v5.s}[0], [x4], #4\n"
5490 "ld1 {v5.h}[2], [x4], #2\n"
5491 "ld1 {v5.b}[6], [x4], #1\n"
5492 "ld1 {v6.s}[0], [x5], #4\n"
5493 "ld1 {v6.h}[2], [x5], #2\n"
5494 "ld1 {v6.b}[6], [x5], #1\n"
5495 "ld1 {v7.s}[0], [x6], #4\n"
5496 "ld1 {v7.h}[2], [x6], #2\n"
5497 "ld1 {v7.b}[6], [x6], #1\n"
// The zeroed lanes contribute nothing to the row totals.
5498 "uaddw v8.8h, v8.8h, v0.8b\n"
5499 "uaddw v9.8h, v9.8h, v1.8b\n"
5500 "uaddw v10.8h, v10.8h, v2.8b\n"
5501 "uaddw v11.8h, v11.8h, v3.8b\n"
5502 "uaddw v12.8h, v12.8h, v4.8b\n"
5503 "uaddw v13.8h, v13.8h, v5.8b\n"
5504 "uaddw v14.8h, v14.8h, v6.8b\n"
5505 "uaddw v15.8h, v15.8h, v7.8b\n"
// Emit the final, zero-padded 64-byte group.
5506 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
5507 "st1 {v4.2s, v5.2s, v6.2s, v7.2s}, [%x[out]], #32\n"
5508
5509 // Aggregator Reduction.
// The row pointers in x0/x1 are no longer needed, so w0/w1 are reused to
// fetch the two offsets from their memory ("m") operands.
5510 "ldr w0, %[multiplicative_sum_offset]\n"
5511 "ldr w1, %[additive_sum_offset]\n"
// v0.s[0] <- multiplier (by-element operand of mul below);
// v1 <- additive offset replicated in all four lanes.
5512 "mov v0.s[0], w0\n"
5513 "dup v1.4s, w1\n"
// Pairwise-add and widen: 8 x 16-bit partial sums -> 4 x 32-bit per row.
5514 "uaddlp v8.4s, v8.8h\n"
5515 "uaddlp v9.4s, v9.8h\n"
5516 "uaddlp v10.4s, v10.8h\n"
5517 "uaddlp v11.4s, v11.8h\n"
5518 "uaddlp v12.4s, v12.8h\n"
5519 "uaddlp v13.4s, v13.8h\n"
5520 "uaddlp v14.4s, v14.8h\n"
5521 "uaddlp v15.4s, v15.8h\n"
// Two rounds of pairwise adds leave the totals of rows 0..3 in v8 and of
// rows 4..7 in v9, one 32-bit lane per row.
5522 "addp v8.4s, v8.4s, v9.4s\n"
5523 "addp v10.4s, v10.4s, v11.4s\n"
5524 "addp v12.4s, v12.4s, v13.4s\n"
5525 "addp v14.4s, v14.4s, v15.4s\n"
5526 "addp v8.4s, v8.4s, v10.4s\n"
5527 "addp v9.4s, v12.4s, v14.4s\n"
// result = total * multiplicative_sum_offset + additive_sum_offset.
5528 "mul v8.4s, v8.4s, v0.s[0]\n"
5529 "mul v9.4s, v9.4s, v0.s[0]\n"
5530 "add v8.4s, v8.4s, v1.4s\n"
5531 "add v9.4s, v9.4s, v1.4s\n"
// Store the eight results right after the packed data (no post-increment).
5532 "st1 {v8.4s, v9.4s}, [%x[out]]\n"
// Read-write operands: the counter and both pointers are modified by the asm.
5533 : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
// Inputs: stride in a register, the two offsets as memory operands.
5534 : [stride] "r"(params.stride),
5535 [multiplicative_sum_offset] "m"(params.multiplicative_sum_offset),
5536 [additive_sum_offset] "m"(params.additive_sum_offset)
// Clobbers: scratch row pointers x0-x6, vector registers v0-v15, the
// condition flags and memory.
5537 : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "v0", "v1", "v2", "v3", "v4",
5538 "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15",
5539 "cc", "memory");
5540 }
5541
// Specialization of Stream::Pack for a single lane gathered with a byte
// stride, with no leftover elements.
//
// Every loop iteration gathers 8 bytes, one every `stride` bytes starting at
// `in`, stores them as one contiguous 8-byte chunk in `out`, and adds them
// to a running total. After the loop the total is multiplied by
// params.multiplicative_sum_offset, increased by params.additive_sum_offset,
// and written as four identical 32-bit values (16 bytes) directly after the
// packed data.
//
// NOTE(review): the loop body runs before the counter is tested and only
// exits when it hits exactly zero, so params.count is presumably a positive
// multiple of 8 -- confirm against callers.
// NOTE(review): the total is accumulated in 16-bit lanes before being
// widened; presumably callers keep count small enough that it cannot
// wrap -- verify.
// NOTE(review): params_count_copy and params_stride_copy are ints but are
// referenced through the 64-bit %x register view -- confirm the upper
// register halves are well defined (a negative stride would need sign
// extension).
5542 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)5543 inline void Stream<uint8_t, 1, 8, 0, ColumnMajorWithSum>::Pack(
5544 const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
// Optional trace line; only compiled when DEBUG and DEBUG_METAGEMM_VERBOSE
// are both defined.
5545 #ifdef DEBUG
5546 #ifdef DEBUG_METAGEMM_VERBOSE
5547 std::cout
5548 << __FILE__ << "(" << __LINE__
5549 << ") ColumnMajorWithSum<uint8_t, 1, 8, 0, ColumnMajorWithSum>::Pack()"
5550 << std::endl
5551 << std::flush;
5552 #endif
5553 #endif
// Local copies bound as read-write ("+r") operands; the asm decrements the
// counter, while no instruction in the template writes the stride.
5554 int params_count_copy = params.count;
5555 int params_stride_copy = params.stride;
5556 asm volatile(
// v8 <- 0: running total, eight 16-bit lanes.
5557 "movi v8.8h, #0\n"
5558
5559 "1:"
5560 "subs %x[count], %x[count], #8\n"
5561
5562 // Load Aggregate Store - column major 1x8
// Gather one byte per lane; `in` advances by stride after every load.
5563 "ld1 {v0.b}[0], [%x[in]], %x[stride]\n"
5564 "ld1 {v0.b}[1], [%x[in]], %x[stride]\n"
5565 "ld1 {v0.b}[2], [%x[in]], %x[stride]\n"
5566 "ld1 {v0.b}[3], [%x[in]], %x[stride]\n"
5567 "ld1 {v0.b}[4], [%x[in]], %x[stride]\n"
5568 "ld1 {v0.b}[5], [%x[in]], %x[stride]\n"
5569 "ld1 {v0.b}[6], [%x[in]], %x[stride]\n"
5570 "ld1 {v0.b}[7], [%x[in]], %x[stride]\n"
// Prefetch hint for the address the next gather would start from.
5571 "prfm pldl1keep, [%x[in]]\n"
// Widen the 8 bytes to 16 bits, accumulate, then emit them contiguously.
5572 "uaddw v8.8h, v8.8h, v0.8b\n"
5573 "st1 {v0.2s}, [%x[out]], #8\n"
5574
// Tests the flags set by the subs at the top of the loop; nothing in
// between modifies them.
5575 "bne 1b\n"
5576
5577 // Aggregator Reduction.
// v0.s[0] <- multiplier, v1 <- additive offset in all four lanes.
5578 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
5579 "dup v1.4s, %w[additive_sum_offset]\n"
// Widen to 4 x 32-bit, then two pairwise adds leave the full total in every
// lane of v8.
5580 "uaddlp v8.4s, v8.8h\n"
5581 "addp v8.4s, v8.4s, v8.4s\n"
5582 "addp v8.4s, v8.4s, v8.4s\n"
// result = total * multiplicative_sum_offset + additive_sum_offset.
5583 "mul v8.4s, v8.4s, v0.s[0]\n"
5584 "add v8.4s, v8.4s, v1.4s\n"
// Store all four (identical) lanes right after the packed data.
5585 "st1 {v8.4s}, [%x[out]]\n"
// Read-write operands.
5586 : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
5587 [out] "+r"(out), [in] "+r"(in)
// Inputs: both offsets are passed in registers.
5588 : [additive_sum_offset] "r"(params.additive_sum_offset),
5589 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
// Clobbers: the three vector registers used, the condition flags, memory.
5590 : "v8", "v0", "v1", "cc", "memory");
5591 }
5592
// Specialization of Stream::Pack for a single lane gathered with a byte
// stride, with 1 leftover element.
//
// One is subtracted from a copy of params.count; while the remainder is
// non-zero the main loop gathers 8 bytes, one every `stride` bytes starting
// at `in`, stores them as one contiguous 8-byte chunk in `out`, and adds
// them to a running total. The tail gathers the last byte into a zeroed
// register and stores one more 8-byte chunk, zero padded. The total is then
// multiplied by params.multiplicative_sum_offset, increased by
// params.additive_sum_offset, and written as four identical 32-bit values
// (16 bytes) directly after the packed data.
//
// NOTE(review): the loop only terminates when the counter hits exactly zero,
// so params.count is presumably always 8 * k + 1 -- confirm against callers.
// NOTE(review): the total is accumulated in 16-bit lanes before being
// widened; presumably callers keep count small enough that it cannot
// wrap -- verify.
// NOTE(review): params_count_copy and params_stride_copy are ints but are
// referenced through the 64-bit %x register view -- confirm the upper
// register halves are well defined (a negative stride would need sign
// extension).
5593 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)5594 inline void Stream<uint8_t, 1, 8, 1, ColumnMajorWithSum>::Pack(
5595 const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
// Optional trace line; only compiled when DEBUG and DEBUG_METAGEMM_VERBOSE
// are both defined.
5596 #ifdef DEBUG
5597 #ifdef DEBUG_METAGEMM_VERBOSE
5598 std::cout
5599 << __FILE__ << "(" << __LINE__
5600 << ") ColumnMajorWithSum<uint8_t, 1, 8, 1, ColumnMajorWithSum>::Pack()"
5601 << std::endl
5602 << std::flush;
5603 #endif
5604 #endif
// Local copies bound as read-write ("+r") operands; the asm decrements the
// counter, while no instruction in the template writes the stride.
5605 int params_count_copy = params.count;
5606 int params_stride_copy = params.stride;
5607 asm volatile(
// v8 <- 0: running total, eight 16-bit lanes.
5608 "movi v8.8h, #0\n"
5609
5610 // Reduce count by leftovers.
5611 "subs %x[count], %x[count], #1\n"
// Only the leftover remains: skip the full-width loop.
5612 "beq 2f\n"
5613
5614 "1:"
5615 "subs %x[count], %x[count], #8\n"
5616
5617 // Load Aggregate Store - column major 1x8
// Gather one byte per lane; `in` advances by stride after every load.
5618 "ld1 {v0.b}[0], [%x[in]], %x[stride]\n"
5619 "ld1 {v0.b}[1], [%x[in]], %x[stride]\n"
5620 "ld1 {v0.b}[2], [%x[in]], %x[stride]\n"
5621 "ld1 {v0.b}[3], [%x[in]], %x[stride]\n"
5622 "ld1 {v0.b}[4], [%x[in]], %x[stride]\n"
5623 "ld1 {v0.b}[5], [%x[in]], %x[stride]\n"
5624 "ld1 {v0.b}[6], [%x[in]], %x[stride]\n"
5625 "ld1 {v0.b}[7], [%x[in]], %x[stride]\n"
// Prefetch hint for the address the next gather would start from.
5626 "prfm pldl1keep, [%x[in]]\n"
// Widen the 8 bytes to 16 bits, accumulate, then emit them contiguously.
5627 "uaddw v8.8h, v8.8h, v0.8b\n"
5628 "st1 {v0.2s}, [%x[out]], #8\n"
5629
// Tests the flags set by the subs at the top of the loop; nothing in
// between modifies them.
5630 "bne 1b\n"
5631
5632 "2:"
5633
5634 // Load Aggregate Store - column major 1x1
// Clear v0 first so lanes 1-7 are stored (and summed) as zero.
5635 "movi v0.8b, #0\n"
5636 "ld1 {v0.b}[0], [%x[in]], %x[stride]\n"
5637 "prfm pldl1keep, [%x[in]]\n"
5638 "uaddw v8.8h, v8.8h, v0.8b\n"
// Emit the final, zero-padded 8-byte chunk.
5639 "st1 {v0.2s}, [%x[out]], #8\n"
5640
5641 // Aggregator Reduction.
// v0.s[0] <- multiplier, v1 <- additive offset in all four lanes.
5642 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
5643 "dup v1.4s, %w[additive_sum_offset]\n"
// Widen to 4 x 32-bit, then two pairwise adds leave the full total in every
// lane of v8.
5644 "uaddlp v8.4s, v8.8h\n"
5645 "addp v8.4s, v8.4s, v8.4s\n"
5646 "addp v8.4s, v8.4s, v8.4s\n"
// result = total * multiplicative_sum_offset + additive_sum_offset.
5647 "mul v8.4s, v8.4s, v0.s[0]\n"
5648 "add v8.4s, v8.4s, v1.4s\n"
// Store all four (identical) lanes right after the packed data.
5649 "st1 {v8.4s}, [%x[out]]\n"
// Read-write operands.
5650 : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
5651 [out] "+r"(out), [in] "+r"(in)
// Inputs: both offsets are passed in registers.
5652 : [additive_sum_offset] "r"(params.additive_sum_offset),
5653 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
// Clobbers: the three vector registers used, the condition flags, memory.
5654 : "v8", "v0", "v1", "cc", "memory");
5655 }
5656
// Specialization of Stream::Pack for a single lane gathered with a byte
// stride, with 2 leftover elements.
//
// Two is subtracted from a copy of params.count; while the remainder is
// non-zero the main loop gathers 8 bytes, one every `stride` bytes starting
// at `in`, stores them as one contiguous 8-byte chunk in `out`, and adds
// them to a running total. The tail gathers the last 2 bytes into a zeroed
// register and stores one more 8-byte chunk, zero padded. The total is then
// multiplied by params.multiplicative_sum_offset, increased by
// params.additive_sum_offset, and written as four identical 32-bit values
// (16 bytes) directly after the packed data.
//
// NOTE(review): the loop only terminates when the counter hits exactly zero,
// so params.count is presumably always 8 * k + 2 -- confirm against callers.
// NOTE(review): the total is accumulated in 16-bit lanes before being
// widened; presumably callers keep count small enough that it cannot
// wrap -- verify.
// NOTE(review): params_count_copy and params_stride_copy are ints but are
// referenced through the 64-bit %x register view -- confirm the upper
// register halves are well defined (a negative stride would need sign
// extension).
5657 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)5658 inline void Stream<uint8_t, 1, 8, 2, ColumnMajorWithSum>::Pack(
5659 const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
// Optional trace line; only compiled when DEBUG and DEBUG_METAGEMM_VERBOSE
// are both defined.
5660 #ifdef DEBUG
5661 #ifdef DEBUG_METAGEMM_VERBOSE
5662 std::cout
5663 << __FILE__ << "(" << __LINE__
5664 << ") ColumnMajorWithSum<uint8_t, 1, 8, 2, ColumnMajorWithSum>::Pack()"
5665 << std::endl
5666 << std::flush;
5667 #endif
5668 #endif
// Local copies bound as read-write ("+r") operands; the asm decrements the
// counter, while no instruction in the template writes the stride.
5669 int params_count_copy = params.count;
5670 int params_stride_copy = params.stride;
5671 asm volatile(
// v8 <- 0: running total, eight 16-bit lanes.
5672 "movi v8.8h, #0\n"
5673
5674 // Reduce count by leftovers.
5675 "subs %x[count], %x[count], #2\n"
// Only the leftovers remain: skip the full-width loop.
5676 "beq 2f\n"
5677
5678 "1:"
5679 "subs %x[count], %x[count], #8\n"
5680
5681 // Load Aggregate Store - column major 1x8
// Gather one byte per lane; `in` advances by stride after every load.
5682 "ld1 {v0.b}[0], [%x[in]], %x[stride]\n"
5683 "ld1 {v0.b}[1], [%x[in]], %x[stride]\n"
5684 "ld1 {v0.b}[2], [%x[in]], %x[stride]\n"
5685 "ld1 {v0.b}[3], [%x[in]], %x[stride]\n"
5686 "ld1 {v0.b}[4], [%x[in]], %x[stride]\n"
5687 "ld1 {v0.b}[5], [%x[in]], %x[stride]\n"
5688 "ld1 {v0.b}[6], [%x[in]], %x[stride]\n"
5689 "ld1 {v0.b}[7], [%x[in]], %x[stride]\n"
// Prefetch hint for the address the next gather would start from.
5690 "prfm pldl1keep, [%x[in]]\n"
// Widen the 8 bytes to 16 bits, accumulate, then emit them contiguously.
5691 "uaddw v8.8h, v8.8h, v0.8b\n"
5692 "st1 {v0.2s}, [%x[out]], #8\n"
5693
// Tests the flags set by the subs at the top of the loop; nothing in
// between modifies them.
5694 "bne 1b\n"
5695
5696 "2:"
5697
5698 // Load Aggregate Store - column major 1x2
// Clear v0 first so lanes 2-7 are stored (and summed) as zero.
5699 "movi v0.8b, #0\n"
5700 "ld1 {v0.b}[0], [%x[in]], %x[stride]\n"
5701 "ld1 {v0.b}[1], [%x[in]], %x[stride]\n"
5702 "prfm pldl1keep, [%x[in]]\n"
5703 "uaddw v8.8h, v8.8h, v0.8b\n"
// Emit the final, zero-padded 8-byte chunk.
5704 "st1 {v0.2s}, [%x[out]], #8\n"
5705
5706 // Aggregator Reduction.
// v0.s[0] <- multiplier, v1 <- additive offset in all four lanes.
5707 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
5708 "dup v1.4s, %w[additive_sum_offset]\n"
// Widen to 4 x 32-bit, then two pairwise adds leave the full total in every
// lane of v8.
5709 "uaddlp v8.4s, v8.8h\n"
5710 "addp v8.4s, v8.4s, v8.4s\n"
5711 "addp v8.4s, v8.4s, v8.4s\n"
// result = total * multiplicative_sum_offset + additive_sum_offset.
5712 "mul v8.4s, v8.4s, v0.s[0]\n"
5713 "add v8.4s, v8.4s, v1.4s\n"
// Store all four (identical) lanes right after the packed data.
5714 "st1 {v8.4s}, [%x[out]]\n"
// Read-write operands.
5715 : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
5716 [out] "+r"(out), [in] "+r"(in)
// Inputs: both offsets are passed in registers.
5717 : [additive_sum_offset] "r"(params.additive_sum_offset),
5718 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
// Clobbers: the three vector registers used, the condition flags, memory.
5719 : "v8", "v0", "v1", "cc", "memory");
5720 }
5721
// Specialization of Stream::Pack for a single lane gathered with a byte
// stride, with 3 leftover elements.
//
// Three is subtracted from a copy of params.count; while the remainder is
// non-zero the main loop gathers 8 bytes, one every `stride` bytes starting
// at `in`, stores them as one contiguous 8-byte chunk in `out`, and adds
// them to a running total. The tail gathers the last 3 bytes into a zeroed
// register and stores one more 8-byte chunk, zero padded. The total is then
// multiplied by params.multiplicative_sum_offset, increased by
// params.additive_sum_offset, and written as four identical 32-bit values
// (16 bytes) directly after the packed data.
//
// NOTE(review): the loop only terminates when the counter hits exactly zero,
// so params.count is presumably always 8 * k + 3 -- confirm against callers.
// NOTE(review): the total is accumulated in 16-bit lanes before being
// widened; presumably callers keep count small enough that it cannot
// wrap -- verify.
// NOTE(review): params_count_copy and params_stride_copy are ints but are
// referenced through the 64-bit %x register view -- confirm the upper
// register halves are well defined (a negative stride would need sign
// extension).
5722 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)5723 inline void Stream<uint8_t, 1, 8, 3, ColumnMajorWithSum>::Pack(
5724 const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
// Optional trace line; only compiled when DEBUG and DEBUG_METAGEMM_VERBOSE
// are both defined.
5725 #ifdef DEBUG
5726 #ifdef DEBUG_METAGEMM_VERBOSE
5727 std::cout
5728 << __FILE__ << "(" << __LINE__
5729 << ") ColumnMajorWithSum<uint8_t, 1, 8, 3, ColumnMajorWithSum>::Pack()"
5730 << std::endl
5731 << std::flush;
5732 #endif
5733 #endif
// Local copies bound as read-write ("+r") operands; the asm decrements the
// counter, while no instruction in the template writes the stride.
5734 int params_count_copy = params.count;
5735 int params_stride_copy = params.stride;
5736 asm volatile(
// v8 <- 0: running total, eight 16-bit lanes.
5737 "movi v8.8h, #0\n"
5738
5739 // Reduce count by leftovers.
5740 "subs %x[count], %x[count], #3\n"
// Only the leftovers remain: skip the full-width loop.
5741 "beq 2f\n"
5742
5743 "1:"
5744 "subs %x[count], %x[count], #8\n"
5745
5746 // Load Aggregate Store - column major 1x8
// Gather one byte per lane; `in` advances by stride after every load.
5747 "ld1 {v0.b}[0], [%x[in]], %x[stride]\n"
5748 "ld1 {v0.b}[1], [%x[in]], %x[stride]\n"
5749 "ld1 {v0.b}[2], [%x[in]], %x[stride]\n"
5750 "ld1 {v0.b}[3], [%x[in]], %x[stride]\n"
5751 "ld1 {v0.b}[4], [%x[in]], %x[stride]\n"
5752 "ld1 {v0.b}[5], [%x[in]], %x[stride]\n"
5753 "ld1 {v0.b}[6], [%x[in]], %x[stride]\n"
5754 "ld1 {v0.b}[7], [%x[in]], %x[stride]\n"
// Prefetch hint for the address the next gather would start from.
5755 "prfm pldl1keep, [%x[in]]\n"
// Widen the 8 bytes to 16 bits, accumulate, then emit them contiguously.
5756 "uaddw v8.8h, v8.8h, v0.8b\n"
5757 "st1 {v0.2s}, [%x[out]], #8\n"
5758
// Tests the flags set by the subs at the top of the loop; nothing in
// between modifies them.
5759 "bne 1b\n"
5760
5761 "2:"
5762
5763 // Load Aggregate Store - column major 1x3
// Clear v0 first so lanes 3-7 are stored (and summed) as zero.
5764 "movi v0.8b, #0\n"
5765 "ld1 {v0.b}[0], [%x[in]], %x[stride]\n"
5766 "ld1 {v0.b}[1], [%x[in]], %x[stride]\n"
5767 "ld1 {v0.b}[2], [%x[in]], %x[stride]\n"
5768 "prfm pldl1keep, [%x[in]]\n"
5769 "uaddw v8.8h, v8.8h, v0.8b\n"
// Emit the final, zero-padded 8-byte chunk.
5770 "st1 {v0.2s}, [%x[out]], #8\n"
5771
5772 // Aggregator Reduction.
// v0.s[0] <- multiplier, v1 <- additive offset in all four lanes.
5773 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
5774 "dup v1.4s, %w[additive_sum_offset]\n"
// Widen to 4 x 32-bit, then two pairwise adds leave the full total in every
// lane of v8.
5775 "uaddlp v8.4s, v8.8h\n"
5776 "addp v8.4s, v8.4s, v8.4s\n"
5777 "addp v8.4s, v8.4s, v8.4s\n"
// result = total * multiplicative_sum_offset + additive_sum_offset.
5778 "mul v8.4s, v8.4s, v0.s[0]\n"
5779 "add v8.4s, v8.4s, v1.4s\n"
// Store all four (identical) lanes right after the packed data.
5780 "st1 {v8.4s}, [%x[out]]\n"
// Read-write operands.
5781 : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
5782 [out] "+r"(out), [in] "+r"(in)
// Inputs: both offsets are passed in registers.
5783 : [additive_sum_offset] "r"(params.additive_sum_offset),
5784 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
// Clobbers: the three vector registers used, the condition flags, memory.
5785 : "v8", "v0", "v1", "cc", "memory");
5786 }
5787
// Specialization of Stream::Pack for a single lane gathered with a byte
// stride, with 4 leftover elements.
//
// Four is subtracted from a copy of params.count; while the remainder is
// non-zero the main loop gathers 8 bytes, one every `stride` bytes starting
// at `in`, stores them as one contiguous 8-byte chunk in `out`, and adds
// them to a running total. The tail gathers the last 4 bytes into a zeroed
// register and stores one more 8-byte chunk, zero padded. The total is then
// multiplied by params.multiplicative_sum_offset, increased by
// params.additive_sum_offset, and written as four identical 32-bit values
// (16 bytes) directly after the packed data.
//
// NOTE(review): the loop only terminates when the counter hits exactly zero,
// so params.count is presumably always 8 * k + 4 -- confirm against callers.
// NOTE(review): the total is accumulated in 16-bit lanes before being
// widened; presumably callers keep count small enough that it cannot
// wrap -- verify.
// NOTE(review): params_count_copy and params_stride_copy are ints but are
// referenced through the 64-bit %x register view -- confirm the upper
// register halves are well defined (a negative stride would need sign
// extension).
5788 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)5789 inline void Stream<uint8_t, 1, 8, 4, ColumnMajorWithSum>::Pack(
5790 const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
// Optional trace line; only compiled when DEBUG and DEBUG_METAGEMM_VERBOSE
// are both defined.
5791 #ifdef DEBUG
5792 #ifdef DEBUG_METAGEMM_VERBOSE
5793 std::cout
5794 << __FILE__ << "(" << __LINE__
5795 << ") ColumnMajorWithSum<uint8_t, 1, 8, 4, ColumnMajorWithSum>::Pack()"
5796 << std::endl
5797 << std::flush;
5798 #endif
5799 #endif
// Local copies bound as read-write ("+r") operands; the asm decrements the
// counter, while no instruction in the template writes the stride.
5800 int params_count_copy = params.count;
5801 int params_stride_copy = params.stride;
5802 asm volatile(
// v8 <- 0: running total, eight 16-bit lanes.
5803 "movi v8.8h, #0\n"
5804
5805 // Reduce count by leftovers.
5806 "subs %x[count], %x[count], #4\n"
// Only the leftovers remain: skip the full-width loop.
5807 "beq 2f\n"
5808
5809 "1:"
5810 "subs %x[count], %x[count], #8\n"
5811
5812 // Load Aggregate Store - column major 1x8
// Gather one byte per lane; `in` advances by stride after every load.
5813 "ld1 {v0.b}[0], [%x[in]], %x[stride]\n"
5814 "ld1 {v0.b}[1], [%x[in]], %x[stride]\n"
5815 "ld1 {v0.b}[2], [%x[in]], %x[stride]\n"
5816 "ld1 {v0.b}[3], [%x[in]], %x[stride]\n"
5817 "ld1 {v0.b}[4], [%x[in]], %x[stride]\n"
5818 "ld1 {v0.b}[5], [%x[in]], %x[stride]\n"
5819 "ld1 {v0.b}[6], [%x[in]], %x[stride]\n"
5820 "ld1 {v0.b}[7], [%x[in]], %x[stride]\n"
// Prefetch hint for the address the next gather would start from.
5821 "prfm pldl1keep, [%x[in]]\n"
// Widen the 8 bytes to 16 bits, accumulate, then emit them contiguously.
5822 "uaddw v8.8h, v8.8h, v0.8b\n"
5823 "st1 {v0.2s}, [%x[out]], #8\n"
5824
// Tests the flags set by the subs at the top of the loop; nothing in
// between modifies them.
5825 "bne 1b\n"
5826
5827 "2:"
5828
5829 // Load Aggregate Store - column major 1x4
// Clear v0 first so lanes 4-7 are stored (and summed) as zero.
5830 "movi v0.8b, #0\n"
5831 "ld1 {v0.b}[0], [%x[in]], %x[stride]\n"
5832 "ld1 {v0.b}[1], [%x[in]], %x[stride]\n"
5833 "ld1 {v0.b}[2], [%x[in]], %x[stride]\n"
5834 "ld1 {v0.b}[3], [%x[in]], %x[stride]\n"
5835 "prfm pldl1keep, [%x[in]]\n"
5836 "uaddw v8.8h, v8.8h, v0.8b\n"
// Emit the final, zero-padded 8-byte chunk.
5837 "st1 {v0.2s}, [%x[out]], #8\n"
5838
5839 // Aggregator Reduction.
// v0.s[0] <- multiplier, v1 <- additive offset in all four lanes.
5840 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
5841 "dup v1.4s, %w[additive_sum_offset]\n"
// Widen to 4 x 32-bit, then two pairwise adds leave the full total in every
// lane of v8.
5842 "uaddlp v8.4s, v8.8h\n"
5843 "addp v8.4s, v8.4s, v8.4s\n"
5844 "addp v8.4s, v8.4s, v8.4s\n"
// result = total * multiplicative_sum_offset + additive_sum_offset.
5845 "mul v8.4s, v8.4s, v0.s[0]\n"
5846 "add v8.4s, v8.4s, v1.4s\n"
// Store all four (identical) lanes right after the packed data.
5847 "st1 {v8.4s}, [%x[out]]\n"
// Read-write operands.
5848 : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
5849 [out] "+r"(out), [in] "+r"(in)
// Inputs: both offsets are passed in registers.
5850 : [additive_sum_offset] "r"(params.additive_sum_offset),
5851 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
// Clobbers: the three vector registers used, the condition flags, memory.
5852 : "v8", "v0", "v1", "cc", "memory");
5853 }
5854
// Specialization of Stream::Pack for a single lane gathered with a byte
// stride, with 5 leftover elements.
//
// Five is subtracted from a copy of params.count; while the remainder is
// non-zero the main loop gathers 8 bytes, one every `stride` bytes starting
// at `in`, stores them as one contiguous 8-byte chunk in `out`, and adds
// them to a running total. The tail gathers the last 5 bytes into a zeroed
// register and stores one more 8-byte chunk, zero padded. The total is then
// multiplied by params.multiplicative_sum_offset, increased by
// params.additive_sum_offset, and written as four identical 32-bit values
// (16 bytes) directly after the packed data.
//
// NOTE(review): the loop only terminates when the counter hits exactly zero,
// so params.count is presumably always 8 * k + 5 -- confirm against callers.
// NOTE(review): the total is accumulated in 16-bit lanes before being
// widened; presumably callers keep count small enough that it cannot
// wrap -- verify.
// NOTE(review): params_count_copy and params_stride_copy are ints but are
// referenced through the 64-bit %x register view -- confirm the upper
// register halves are well defined (a negative stride would need sign
// extension).
5855 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)5856 inline void Stream<uint8_t, 1, 8, 5, ColumnMajorWithSum>::Pack(
5857 const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
// Optional trace line; only compiled when DEBUG and DEBUG_METAGEMM_VERBOSE
// are both defined.
5858 #ifdef DEBUG
5859 #ifdef DEBUG_METAGEMM_VERBOSE
5860 std::cout
5861 << __FILE__ << "(" << __LINE__
5862 << ") ColumnMajorWithSum<uint8_t, 1, 8, 5, ColumnMajorWithSum>::Pack()"
5863 << std::endl
5864 << std::flush;
5865 #endif
5866 #endif
// Local copies bound as read-write ("+r") operands; the asm decrements the
// counter, while no instruction in the template writes the stride.
5867 int params_count_copy = params.count;
5868 int params_stride_copy = params.stride;
5869 asm volatile(
// v8 <- 0: running total, eight 16-bit lanes.
5870 "movi v8.8h, #0\n"
5871
5872 // Reduce count by leftovers.
5873 "subs %x[count], %x[count], #5\n"
// Only the leftovers remain: skip the full-width loop.
5874 "beq 2f\n"
5875
5876 "1:"
5877 "subs %x[count], %x[count], #8\n"
5878
5879 // Load Aggregate Store - column major 1x8
// Gather one byte per lane; `in` advances by stride after every load.
5880 "ld1 {v0.b}[0], [%x[in]], %x[stride]\n"
5881 "ld1 {v0.b}[1], [%x[in]], %x[stride]\n"
5882 "ld1 {v0.b}[2], [%x[in]], %x[stride]\n"
5883 "ld1 {v0.b}[3], [%x[in]], %x[stride]\n"
5884 "ld1 {v0.b}[4], [%x[in]], %x[stride]\n"
5885 "ld1 {v0.b}[5], [%x[in]], %x[stride]\n"
5886 "ld1 {v0.b}[6], [%x[in]], %x[stride]\n"
5887 "ld1 {v0.b}[7], [%x[in]], %x[stride]\n"
// Prefetch hint for the address the next gather would start from.
5888 "prfm pldl1keep, [%x[in]]\n"
// Widen the 8 bytes to 16 bits, accumulate, then emit them contiguously.
5889 "uaddw v8.8h, v8.8h, v0.8b\n"
5890 "st1 {v0.2s}, [%x[out]], #8\n"
5891
// Tests the flags set by the subs at the top of the loop; nothing in
// between modifies them.
5892 "bne 1b\n"
5893
5894 "2:"
5895
5896 // Load Aggregate Store - column major 1x5
// Clear v0 first so lanes 5-7 are stored (and summed) as zero.
5897 "movi v0.8b, #0\n"
5898 "ld1 {v0.b}[0], [%x[in]], %x[stride]\n"
5899 "ld1 {v0.b}[1], [%x[in]], %x[stride]\n"
5900 "ld1 {v0.b}[2], [%x[in]], %x[stride]\n"
5901 "ld1 {v0.b}[3], [%x[in]], %x[stride]\n"
5902 "ld1 {v0.b}[4], [%x[in]], %x[stride]\n"
5903 "prfm pldl1keep, [%x[in]]\n"
5904 "uaddw v8.8h, v8.8h, v0.8b\n"
// Emit the final, zero-padded 8-byte chunk.
5905 "st1 {v0.2s}, [%x[out]], #8\n"
5906
5907 // Aggregator Reduction.
// v0.s[0] <- multiplier, v1 <- additive offset in all four lanes.
5908 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
5909 "dup v1.4s, %w[additive_sum_offset]\n"
// Widen to 4 x 32-bit, then two pairwise adds leave the full total in every
// lane of v8.
5910 "uaddlp v8.4s, v8.8h\n"
5911 "addp v8.4s, v8.4s, v8.4s\n"
5912 "addp v8.4s, v8.4s, v8.4s\n"
// result = total * multiplicative_sum_offset + additive_sum_offset.
5913 "mul v8.4s, v8.4s, v0.s[0]\n"
5914 "add v8.4s, v8.4s, v1.4s\n"
// Store all four (identical) lanes right after the packed data.
5915 "st1 {v8.4s}, [%x[out]]\n"
// Read-write operands.
5916 : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
5917 [out] "+r"(out), [in] "+r"(in)
// Inputs: both offsets are passed in registers.
5918 : [additive_sum_offset] "r"(params.additive_sum_offset),
5919 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
// Clobbers: the three vector registers used, the condition flags, memory.
5920 : "v8", "v0", "v1", "cc", "memory");
5921 }
5922
5923 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)5924 inline void Stream<uint8_t, 1, 8, 6, ColumnMajorWithSum>::Pack(
5925 const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
5926 #ifdef DEBUG
5927 #ifdef DEBUG_METAGEMM_VERBOSE
5928 std::cout
5929 << __FILE__ << "(" << __LINE__
5930 << ") ColumnMajorWithSum<uint8_t, 1, 8, 6, ColumnMajorWithSum>::Pack()"
5931 << std::endl
5932 << std::flush;
5933 #endif
5934 #endif
5935 int params_count_copy = params.count;
5936 int params_stride_copy = params.stride;
5937 asm volatile(
5938 "movi v8.8h, #0\n"
5939
5940 // Reduce count by leftovers.
5941 "subs %x[count], %x[count], #6\n"
5942 "beq 2f\n"
5943
5944 "1:"
5945 "subs %x[count], %x[count], #8\n"
5946
5947 // Load Aggregate Store - column major 1x8
5948 "ld1 {v0.b}[0], [%x[in]], %x[stride]\n"
5949 "ld1 {v0.b}[1], [%x[in]], %x[stride]\n"
5950 "ld1 {v0.b}[2], [%x[in]], %x[stride]\n"
5951 "ld1 {v0.b}[3], [%x[in]], %x[stride]\n"
5952 "ld1 {v0.b}[4], [%x[in]], %x[stride]\n"
5953 "ld1 {v0.b}[5], [%x[in]], %x[stride]\n"
5954 "ld1 {v0.b}[6], [%x[in]], %x[stride]\n"
5955 "ld1 {v0.b}[7], [%x[in]], %x[stride]\n"
5956 "prfm pldl1keep, [%x[in]]\n"
5957 "uaddw v8.8h, v8.8h, v0.8b\n"
5958 "st1 {v0.2s}, [%x[out]], #8\n"
5959
5960 "bne 1b\n"
5961
5962 "2:"
5963
5964 // Load Aggregate Store - column major 1x6
5965 "movi v0.8b, #0\n"
5966 "ld1 {v0.b}[0], [%x[in]], %x[stride]\n"
5967 "ld1 {v0.b}[1], [%x[in]], %x[stride]\n"
5968 "ld1 {v0.b}[2], [%x[in]], %x[stride]\n"
5969 "ld1 {v0.b}[3], [%x[in]], %x[stride]\n"
5970 "ld1 {v0.b}[4], [%x[in]], %x[stride]\n"
5971 "ld1 {v0.b}[5], [%x[in]], %x[stride]\n"
5972 "prfm pldl1keep, [%x[in]]\n"
5973 "uaddw v8.8h, v8.8h, v0.8b\n"
5974 "st1 {v0.2s}, [%x[out]], #8\n"
5975
5976 // Aggregator Reduction.
5977 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
5978 "dup v1.4s, %w[additive_sum_offset]\n"
5979 "uaddlp v8.4s, v8.8h\n"
5980 "addp v8.4s, v8.4s, v8.4s\n"
5981 "addp v8.4s, v8.4s, v8.4s\n"
5982 "mul v8.4s, v8.4s, v0.s[0]\n"
5983 "add v8.4s, v8.4s, v1.4s\n"
5984 "st1 {v8.4s}, [%x[out]]\n"
5985 : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
5986 [out] "+r"(out), [in] "+r"(in)
5987 : [additive_sum_offset] "r"(params.additive_sum_offset),
5988 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
5989 : "v8", "v0", "v1", "cc", "memory");
5990 }
5991
5992 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)5993 inline void Stream<uint8_t, 1, 8, 7, ColumnMajorWithSum>::Pack(
5994 const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
5995 #ifdef DEBUG
5996 #ifdef DEBUG_METAGEMM_VERBOSE
5997 std::cout
5998 << __FILE__ << "(" << __LINE__
5999 << ") ColumnMajorWithSum<uint8_t, 1, 8, 7, ColumnMajorWithSum>::Pack()"
6000 << std::endl
6001 << std::flush;
6002 #endif
6003 #endif
6004 int params_count_copy = params.count;
6005 int params_stride_copy = params.stride;
6006 asm volatile(
6007 "movi v8.8h, #0\n"
6008
6009 // Reduce count by leftovers.
6010 "subs %x[count], %x[count], #7\n"
6011 "beq 2f\n"
6012
6013 "1:"
6014 "subs %x[count], %x[count], #8\n"
6015
6016 // Load Aggregate Store - column major 1x8
6017 "ld1 {v0.b}[0], [%x[in]], %x[stride]\n"
6018 "ld1 {v0.b}[1], [%x[in]], %x[stride]\n"
6019 "ld1 {v0.b}[2], [%x[in]], %x[stride]\n"
6020 "ld1 {v0.b}[3], [%x[in]], %x[stride]\n"
6021 "ld1 {v0.b}[4], [%x[in]], %x[stride]\n"
6022 "ld1 {v0.b}[5], [%x[in]], %x[stride]\n"
6023 "ld1 {v0.b}[6], [%x[in]], %x[stride]\n"
6024 "ld1 {v0.b}[7], [%x[in]], %x[stride]\n"
6025 "prfm pldl1keep, [%x[in]]\n"
6026 "uaddw v8.8h, v8.8h, v0.8b\n"
6027 "st1 {v0.2s}, [%x[out]], #8\n"
6028
6029 "bne 1b\n"
6030
6031 "2:"
6032
6033 // Load Aggregate Store - column major 1x7
6034 "movi v0.8b, #0\n"
6035 "ld1 {v0.b}[0], [%x[in]], %x[stride]\n"
6036 "ld1 {v0.b}[1], [%x[in]], %x[stride]\n"
6037 "ld1 {v0.b}[2], [%x[in]], %x[stride]\n"
6038 "ld1 {v0.b}[3], [%x[in]], %x[stride]\n"
6039 "ld1 {v0.b}[4], [%x[in]], %x[stride]\n"
6040 "ld1 {v0.b}[5], [%x[in]], %x[stride]\n"
6041 "ld1 {v0.b}[6], [%x[in]], %x[stride]\n"
6042 "prfm pldl1keep, [%x[in]]\n"
6043 "uaddw v8.8h, v8.8h, v0.8b\n"
6044 "st1 {v0.2s}, [%x[out]], #8\n"
6045
6046 // Aggregator Reduction.
6047 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
6048 "dup v1.4s, %w[additive_sum_offset]\n"
6049 "uaddlp v8.4s, v8.8h\n"
6050 "addp v8.4s, v8.4s, v8.4s\n"
6051 "addp v8.4s, v8.4s, v8.4s\n"
6052 "mul v8.4s, v8.4s, v0.s[0]\n"
6053 "add v8.4s, v8.4s, v1.4s\n"
6054 "st1 {v8.4s}, [%x[out]]\n"
6055 : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
6056 [out] "+r"(out), [in] "+r"(in)
6057 : [additive_sum_offset] "r"(params.additive_sum_offset),
6058 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
6059 : "v8", "v0", "v1", "cc", "memory");
6060 }
6061
6062 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)6063 inline void Stream<uint8_t, 2, 8, 0, ColumnMajorWithSum>::Pack(
6064 const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
6065 #ifdef DEBUG
6066 #ifdef DEBUG_METAGEMM_VERBOSE
6067 std::cout
6068 << __FILE__ << "(" << __LINE__
6069 << ") ColumnMajorWithSum<uint8_t, 2, 8, 0, ColumnMajorWithSum>::Pack()"
6070 << std::endl
6071 << std::flush;
6072 #endif
6073 #endif
6074 int params_count_copy = params.count;
6075 int params_stride_copy = params.stride;
6076 asm volatile(
6077 "movi v8.8h, #0\n"
6078 "movi v9.8h, #0\n"
6079
6080 "1:"
6081 "subs %x[count], %x[count], #8\n"
6082
6083 // Load Aggregate Store - column major 2x8
6084 "ld1 {v0.h}[0], [%x[in]], %x[stride]\n"
6085 "ld1 {v0.h}[1], [%x[in]], %x[stride]\n"
6086 "ld1 {v0.h}[2], [%x[in]], %x[stride]\n"
6087 "ld1 {v0.h}[3], [%x[in]], %x[stride]\n"
6088 "ld1 {v1.h}[0], [%x[in]], %x[stride]\n"
6089 "ld1 {v1.h}[1], [%x[in]], %x[stride]\n"
6090 "ld1 {v1.h}[2], [%x[in]], %x[stride]\n"
6091 "ld1 {v1.h}[3], [%x[in]], %x[stride]\n"
6092 "prfm pldl1keep, [%x[in]]\n"
6093 "uzp1 v2.8b, v0.8b, v1.8b\n"
6094 "uzp2 v3.8b, v0.8b, v1.8b\n"
6095 "uaddw v8.8h, v8.8h, v2.8b\n"
6096 "uaddw v9.8h, v9.8h, v3.8b\n"
6097 "st1 {v2.2s, v3.2s}, [%x[out]], #16\n"
6098
6099 "bne 1b\n"
6100
6101 // Aggregator Reduction.
6102 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
6103 "dup v1.4s, %w[additive_sum_offset]\n"
6104 "uaddlp v8.4s, v8.8h\n"
6105 "uaddlp v9.4s, v9.8h\n"
6106 "addp v8.4s, v8.4s, v9.4s\n"
6107 "addp v8.4s, v8.4s, v8.4s\n"
6108 "mul v8.4s, v8.4s, v0.s[0]\n"
6109 "add v8.4s, v8.4s, v1.4s\n"
6110 "st1 {v8.4s}, [%x[out]]\n"
6111 : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
6112 [out] "+r"(out), [in] "+r"(in)
6113 : [additive_sum_offset] "r"(params.additive_sum_offset),
6114 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
6115 : "v0", "v1", "v2", "v3", "v8", "v9", "cc", "memory");
6116 }
6117
6118 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)6119 inline void Stream<uint8_t, 2, 8, 1, ColumnMajorWithSum>::Pack(
6120 const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
6121 #ifdef DEBUG
6122 #ifdef DEBUG_METAGEMM_VERBOSE
6123 std::cout
6124 << __FILE__ << "(" << __LINE__
6125 << ") ColumnMajorWithSum<uint8_t, 2, 8, 1, ColumnMajorWithSum>::Pack()"
6126 << std::endl
6127 << std::flush;
6128 #endif
6129 #endif
6130 int params_count_copy = params.count;
6131 int params_stride_copy = params.stride;
6132 asm volatile(
6133 "movi v8.8h, #0\n"
6134 "movi v9.8h, #0\n"
6135
6136 // Reduce count by leftovers.
6137 "subs %x[count], %x[count], #1\n"
6138 "beq 2f\n"
6139
6140 "1:"
6141 "subs %x[count], %x[count], #8\n"
6142
6143 // Load Aggregate Store - column major 2x8
6144 "ld1 {v0.h}[0], [%x[in]], %x[stride]\n"
6145 "ld1 {v0.h}[1], [%x[in]], %x[stride]\n"
6146 "ld1 {v0.h}[2], [%x[in]], %x[stride]\n"
6147 "ld1 {v0.h}[3], [%x[in]], %x[stride]\n"
6148 "ld1 {v1.h}[0], [%x[in]], %x[stride]\n"
6149 "ld1 {v1.h}[1], [%x[in]], %x[stride]\n"
6150 "ld1 {v1.h}[2], [%x[in]], %x[stride]\n"
6151 "ld1 {v1.h}[3], [%x[in]], %x[stride]\n"
6152 "prfm pldl1keep, [%x[in]]\n"
6153 "uzp1 v2.8b, v0.8b, v1.8b\n"
6154 "uzp2 v3.8b, v0.8b, v1.8b\n"
6155 "uaddw v8.8h, v8.8h, v2.8b\n"
6156 "uaddw v9.8h, v9.8h, v3.8b\n"
6157 "st1 {v2.2s, v3.2s}, [%x[out]], #16\n"
6158
6159 "bne 1b\n"
6160
6161 "2:"
6162
6163 // Load Aggregate Store - column major 2x1
6164 "movi v0.8b, #0\n"
6165 "movi v1.8b, #0\n"
6166 "ld1 {v0.h}[0], [%x[in]], %x[stride]\n"
6167 "prfm pldl1keep, [%x[in]]\n"
6168 "uzp1 v2.8b, v0.8b, v1.8b\n"
6169 "uzp2 v3.8b, v0.8b, v1.8b\n"
6170 "uaddw v8.8h, v8.8h, v2.8b\n"
6171 "uaddw v9.8h, v9.8h, v3.8b\n"
6172 "st1 {v2.2s, v3.2s}, [%x[out]], #16\n"
6173
6174 // Aggregator Reduction.
6175 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
6176 "dup v1.4s, %w[additive_sum_offset]\n"
6177 "uaddlp v8.4s, v8.8h\n"
6178 "uaddlp v9.4s, v9.8h\n"
6179 "addp v8.4s, v8.4s, v9.4s\n"
6180 "addp v8.4s, v8.4s, v8.4s\n"
6181 "mul v8.4s, v8.4s, v0.s[0]\n"
6182 "add v8.4s, v8.4s, v1.4s\n"
6183 "st1 {v8.4s}, [%x[out]]\n"
6184 : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
6185 [out] "+r"(out), [in] "+r"(in)
6186 : [additive_sum_offset] "r"(params.additive_sum_offset),
6187 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
6188 : "v0", "v1", "v2", "v3", "v8", "v9", "cc", "memory");
6189 }
6190
6191 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)6192 inline void Stream<uint8_t, 2, 8, 2, ColumnMajorWithSum>::Pack(
6193 const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
6194 #ifdef DEBUG
6195 #ifdef DEBUG_METAGEMM_VERBOSE
6196 std::cout
6197 << __FILE__ << "(" << __LINE__
6198 << ") ColumnMajorWithSum<uint8_t, 2, 8, 2, ColumnMajorWithSum>::Pack()"
6199 << std::endl
6200 << std::flush;
6201 #endif
6202 #endif
6203 int params_count_copy = params.count;
6204 int params_stride_copy = params.stride;
6205 asm volatile(
6206 "movi v8.8h, #0\n"
6207 "movi v9.8h, #0\n"
6208
6209 // Reduce count by leftovers.
6210 "subs %x[count], %x[count], #2\n"
6211 "beq 2f\n"
6212
6213 "1:"
6214 "subs %x[count], %x[count], #8\n"
6215
6216 // Load Aggregate Store - column major 2x8
6217 "ld1 {v0.h}[0], [%x[in]], %x[stride]\n"
6218 "ld1 {v0.h}[1], [%x[in]], %x[stride]\n"
6219 "ld1 {v0.h}[2], [%x[in]], %x[stride]\n"
6220 "ld1 {v0.h}[3], [%x[in]], %x[stride]\n"
6221 "ld1 {v1.h}[0], [%x[in]], %x[stride]\n"
6222 "ld1 {v1.h}[1], [%x[in]], %x[stride]\n"
6223 "ld1 {v1.h}[2], [%x[in]], %x[stride]\n"
6224 "ld1 {v1.h}[3], [%x[in]], %x[stride]\n"
6225 "prfm pldl1keep, [%x[in]]\n"
6226 "uzp1 v2.8b, v0.8b, v1.8b\n"
6227 "uzp2 v3.8b, v0.8b, v1.8b\n"
6228 "uaddw v8.8h, v8.8h, v2.8b\n"
6229 "uaddw v9.8h, v9.8h, v3.8b\n"
6230 "st1 {v2.2s, v3.2s}, [%x[out]], #16\n"
6231
6232 "bne 1b\n"
6233
6234 "2:"
6235
6236 // Load Aggregate Store - column major 2x2
6237 "movi v0.8b, #0\n"
6238 "movi v1.8b, #0\n"
6239 "ld1 {v0.h}[0], [%x[in]], %x[stride]\n"
6240 "ld1 {v0.h}[1], [%x[in]], %x[stride]\n"
6241 "prfm pldl1keep, [%x[in]]\n"
6242 "uzp1 v2.8b, v0.8b, v1.8b\n"
6243 "uzp2 v3.8b, v0.8b, v1.8b\n"
6244 "uaddw v8.8h, v8.8h, v2.8b\n"
6245 "uaddw v9.8h, v9.8h, v3.8b\n"
6246 "st1 {v2.2s, v3.2s}, [%x[out]], #16\n"
6247
6248 // Aggregator Reduction.
6249 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
6250 "dup v1.4s, %w[additive_sum_offset]\n"
6251 "uaddlp v8.4s, v8.8h\n"
6252 "uaddlp v9.4s, v9.8h\n"
6253 "addp v8.4s, v8.4s, v9.4s\n"
6254 "addp v8.4s, v8.4s, v8.4s\n"
6255 "mul v8.4s, v8.4s, v0.s[0]\n"
6256 "add v8.4s, v8.4s, v1.4s\n"
6257 "st1 {v8.4s}, [%x[out]]\n"
6258 : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
6259 [out] "+r"(out), [in] "+r"(in)
6260 : [additive_sum_offset] "r"(params.additive_sum_offset),
6261 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
6262 : "v0", "v1", "v2", "v3", "v8", "v9", "cc", "memory");
6263 }
6264
6265 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)6266 inline void Stream<uint8_t, 2, 8, 3, ColumnMajorWithSum>::Pack(
6267 const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
6268 #ifdef DEBUG
6269 #ifdef DEBUG_METAGEMM_VERBOSE
6270 std::cout
6271 << __FILE__ << "(" << __LINE__
6272 << ") ColumnMajorWithSum<uint8_t, 2, 8, 3, ColumnMajorWithSum>::Pack()"
6273 << std::endl
6274 << std::flush;
6275 #endif
6276 #endif
6277 int params_count_copy = params.count;
6278 int params_stride_copy = params.stride;
6279 asm volatile(
6280 "movi v8.8h, #0\n"
6281 "movi v9.8h, #0\n"
6282
6283 // Reduce count by leftovers.
6284 "subs %x[count], %x[count], #3\n"
6285 "beq 2f\n"
6286
6287 "1:"
6288 "subs %x[count], %x[count], #8\n"
6289
6290 // Load Aggregate Store - column major 2x8
6291 "ld1 {v0.h}[0], [%x[in]], %x[stride]\n"
6292 "ld1 {v0.h}[1], [%x[in]], %x[stride]\n"
6293 "ld1 {v0.h}[2], [%x[in]], %x[stride]\n"
6294 "ld1 {v0.h}[3], [%x[in]], %x[stride]\n"
6295 "ld1 {v1.h}[0], [%x[in]], %x[stride]\n"
6296 "ld1 {v1.h}[1], [%x[in]], %x[stride]\n"
6297 "ld1 {v1.h}[2], [%x[in]], %x[stride]\n"
6298 "ld1 {v1.h}[3], [%x[in]], %x[stride]\n"
6299 "prfm pldl1keep, [%x[in]]\n"
6300 "uzp1 v2.8b, v0.8b, v1.8b\n"
6301 "uzp2 v3.8b, v0.8b, v1.8b\n"
6302 "uaddw v8.8h, v8.8h, v2.8b\n"
6303 "uaddw v9.8h, v9.8h, v3.8b\n"
6304 "st1 {v2.2s, v3.2s}, [%x[out]], #16\n"
6305
6306 "bne 1b\n"
6307
6308 "2:"
6309
6310 // Load Aggregate Store - column major 2x3
6311 "movi v0.8b, #0\n"
6312 "movi v1.8b, #0\n"
6313 "ld1 {v0.h}[0], [%x[in]], %x[stride]\n"
6314 "ld1 {v0.h}[1], [%x[in]], %x[stride]\n"
6315 "ld1 {v0.h}[2], [%x[in]], %x[stride]\n"
6316 "prfm pldl1keep, [%x[in]]\n"
6317 "uzp1 v2.8b, v0.8b, v1.8b\n"
6318 "uzp2 v3.8b, v0.8b, v1.8b\n"
6319 "uaddw v8.8h, v8.8h, v2.8b\n"
6320 "uaddw v9.8h, v9.8h, v3.8b\n"
6321 "st1 {v2.2s, v3.2s}, [%x[out]], #16\n"
6322
6323 // Aggregator Reduction.
6324 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
6325 "dup v1.4s, %w[additive_sum_offset]\n"
6326 "uaddlp v8.4s, v8.8h\n"
6327 "uaddlp v9.4s, v9.8h\n"
6328 "addp v8.4s, v8.4s, v9.4s\n"
6329 "addp v8.4s, v8.4s, v8.4s\n"
6330 "mul v8.4s, v8.4s, v0.s[0]\n"
6331 "add v8.4s, v8.4s, v1.4s\n"
6332 "st1 {v8.4s}, [%x[out]]\n"
6333 : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
6334 [out] "+r"(out), [in] "+r"(in)
6335 : [additive_sum_offset] "r"(params.additive_sum_offset),
6336 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
6337 : "v0", "v1", "v2", "v3", "v8", "v9", "cc", "memory");
6338 }
6339
6340 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)6341 inline void Stream<uint8_t, 2, 8, 4, ColumnMajorWithSum>::Pack(
6342 const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
6343 #ifdef DEBUG
6344 #ifdef DEBUG_METAGEMM_VERBOSE
6345 std::cout
6346 << __FILE__ << "(" << __LINE__
6347 << ") ColumnMajorWithSum<uint8_t, 2, 8, 4, ColumnMajorWithSum>::Pack()"
6348 << std::endl
6349 << std::flush;
6350 #endif
6351 #endif
6352 int params_count_copy = params.count;
6353 int params_stride_copy = params.stride;
6354 asm volatile(
6355 "movi v8.8h, #0\n"
6356 "movi v9.8h, #0\n"
6357
6358 // Reduce count by leftovers.
6359 "subs %x[count], %x[count], #4\n"
6360 "beq 2f\n"
6361
6362 "1:"
6363 "subs %x[count], %x[count], #8\n"
6364
6365 // Load Aggregate Store - column major 2x8
6366 "ld1 {v0.h}[0], [%x[in]], %x[stride]\n"
6367 "ld1 {v0.h}[1], [%x[in]], %x[stride]\n"
6368 "ld1 {v0.h}[2], [%x[in]], %x[stride]\n"
6369 "ld1 {v0.h}[3], [%x[in]], %x[stride]\n"
6370 "ld1 {v1.h}[0], [%x[in]], %x[stride]\n"
6371 "ld1 {v1.h}[1], [%x[in]], %x[stride]\n"
6372 "ld1 {v1.h}[2], [%x[in]], %x[stride]\n"
6373 "ld1 {v1.h}[3], [%x[in]], %x[stride]\n"
6374 "prfm pldl1keep, [%x[in]]\n"
6375 "uzp1 v2.8b, v0.8b, v1.8b\n"
6376 "uzp2 v3.8b, v0.8b, v1.8b\n"
6377 "uaddw v8.8h, v8.8h, v2.8b\n"
6378 "uaddw v9.8h, v9.8h, v3.8b\n"
6379 "st1 {v2.2s, v3.2s}, [%x[out]], #16\n"
6380
6381 "bne 1b\n"
6382
6383 "2:"
6384
6385 // Load Aggregate Store - column major 2x4
6386 "movi v0.8b, #0\n"
6387 "movi v1.8b, #0\n"
6388 "ld1 {v0.h}[0], [%x[in]], %x[stride]\n"
6389 "ld1 {v0.h}[1], [%x[in]], %x[stride]\n"
6390 "ld1 {v0.h}[2], [%x[in]], %x[stride]\n"
6391 "ld1 {v0.h}[3], [%x[in]], %x[stride]\n"
6392 "prfm pldl1keep, [%x[in]]\n"
6393 "uzp1 v2.8b, v0.8b, v1.8b\n"
6394 "uzp2 v3.8b, v0.8b, v1.8b\n"
6395 "uaddw v8.8h, v8.8h, v2.8b\n"
6396 "uaddw v9.8h, v9.8h, v3.8b\n"
6397 "st1 {v2.2s, v3.2s}, [%x[out]], #16\n"
6398
6399 // Aggregator Reduction.
6400 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
6401 "dup v1.4s, %w[additive_sum_offset]\n"
6402 "uaddlp v8.4s, v8.8h\n"
6403 "uaddlp v9.4s, v9.8h\n"
6404 "addp v8.4s, v8.4s, v9.4s\n"
6405 "addp v8.4s, v8.4s, v8.4s\n"
6406 "mul v8.4s, v8.4s, v0.s[0]\n"
6407 "add v8.4s, v8.4s, v1.4s\n"
6408 "st1 {v8.4s}, [%x[out]]\n"
6409 : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
6410 [out] "+r"(out), [in] "+r"(in)
6411 : [additive_sum_offset] "r"(params.additive_sum_offset),
6412 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
6413 : "v0", "v1", "v2", "v3", "v8", "v9", "cc", "memory");
6414 }
6415
6416 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)6417 inline void Stream<uint8_t, 2, 8, 5, ColumnMajorWithSum>::Pack(
6418 const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
6419 #ifdef DEBUG
6420 #ifdef DEBUG_METAGEMM_VERBOSE
6421 std::cout
6422 << __FILE__ << "(" << __LINE__
6423 << ") ColumnMajorWithSum<uint8_t, 2, 8, 5, ColumnMajorWithSum>::Pack()"
6424 << std::endl
6425 << std::flush;
6426 #endif
6427 #endif
6428 int params_count_copy = params.count;
6429 int params_stride_copy = params.stride;
6430 asm volatile(
6431 "movi v8.8h, #0\n"
6432 "movi v9.8h, #0\n"
6433
6434 // Reduce count by leftovers.
6435 "subs %x[count], %x[count], #5\n"
6436 "beq 2f\n"
6437
6438 "1:"
6439 "subs %x[count], %x[count], #8\n"
6440
6441 // Load Aggregate Store - column major 2x8
6442 "ld1 {v0.h}[0], [%x[in]], %x[stride]\n"
6443 "ld1 {v0.h}[1], [%x[in]], %x[stride]\n"
6444 "ld1 {v0.h}[2], [%x[in]], %x[stride]\n"
6445 "ld1 {v0.h}[3], [%x[in]], %x[stride]\n"
6446 "ld1 {v1.h}[0], [%x[in]], %x[stride]\n"
6447 "ld1 {v1.h}[1], [%x[in]], %x[stride]\n"
6448 "ld1 {v1.h}[2], [%x[in]], %x[stride]\n"
6449 "ld1 {v1.h}[3], [%x[in]], %x[stride]\n"
6450 "prfm pldl1keep, [%x[in]]\n"
6451 "uzp1 v2.8b, v0.8b, v1.8b\n"
6452 "uzp2 v3.8b, v0.8b, v1.8b\n"
6453 "uaddw v8.8h, v8.8h, v2.8b\n"
6454 "uaddw v9.8h, v9.8h, v3.8b\n"
6455 "st1 {v2.2s, v3.2s}, [%x[out]], #16\n"
6456
6457 "bne 1b\n"
6458
6459 "2:"
6460
6461 // Load Aggregate Store - column major 2x5
6462 "movi v0.8b, #0\n"
6463 "movi v1.8b, #0\n"
6464 "ld1 {v0.h}[0], [%x[in]], %x[stride]\n"
6465 "ld1 {v0.h}[1], [%x[in]], %x[stride]\n"
6466 "ld1 {v0.h}[2], [%x[in]], %x[stride]\n"
6467 "ld1 {v0.h}[3], [%x[in]], %x[stride]\n"
6468 "ld1 {v1.h}[0], [%x[in]], %x[stride]\n"
6469 "prfm pldl1keep, [%x[in]]\n"
6470 "uzp1 v2.8b, v0.8b, v1.8b\n"
6471 "uzp2 v3.8b, v0.8b, v1.8b\n"
6472 "uaddw v8.8h, v8.8h, v2.8b\n"
6473 "uaddw v9.8h, v9.8h, v3.8b\n"
6474 "st1 {v2.2s, v3.2s}, [%x[out]], #16\n"
6475
6476 // Aggregator Reduction.
6477 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
6478 "dup v1.4s, %w[additive_sum_offset]\n"
6479 "uaddlp v8.4s, v8.8h\n"
6480 "uaddlp v9.4s, v9.8h\n"
6481 "addp v8.4s, v8.4s, v9.4s\n"
6482 "addp v8.4s, v8.4s, v8.4s\n"
6483 "mul v8.4s, v8.4s, v0.s[0]\n"
6484 "add v8.4s, v8.4s, v1.4s\n"
6485 "st1 {v8.4s}, [%x[out]]\n"
6486 : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
6487 [out] "+r"(out), [in] "+r"(in)
6488 : [additive_sum_offset] "r"(params.additive_sum_offset),
6489 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
6490 : "v0", "v1", "v2", "v3", "v8", "v9", "cc", "memory");
6491 }
6492
6493 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)6494 inline void Stream<uint8_t, 2, 8, 6, ColumnMajorWithSum>::Pack(
6495 const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
6496 #ifdef DEBUG
6497 #ifdef DEBUG_METAGEMM_VERBOSE
6498 std::cout
6499 << __FILE__ << "(" << __LINE__
6500 << ") ColumnMajorWithSum<uint8_t, 2, 8, 6, ColumnMajorWithSum>::Pack()"
6501 << std::endl
6502 << std::flush;
6503 #endif
6504 #endif
6505 int params_count_copy = params.count;
6506 int params_stride_copy = params.stride;
6507 asm volatile(
6508 "movi v8.8h, #0\n"
6509 "movi v9.8h, #0\n"
6510
6511 // Reduce count by leftovers.
6512 "subs %x[count], %x[count], #6\n"
6513 "beq 2f\n"
6514
6515 "1:"
6516 "subs %x[count], %x[count], #8\n"
6517
6518 // Load Aggregate Store - column major 2x8
6519 "ld1 {v0.h}[0], [%x[in]], %x[stride]\n"
6520 "ld1 {v0.h}[1], [%x[in]], %x[stride]\n"
6521 "ld1 {v0.h}[2], [%x[in]], %x[stride]\n"
6522 "ld1 {v0.h}[3], [%x[in]], %x[stride]\n"
6523 "ld1 {v1.h}[0], [%x[in]], %x[stride]\n"
6524 "ld1 {v1.h}[1], [%x[in]], %x[stride]\n"
6525 "ld1 {v1.h}[2], [%x[in]], %x[stride]\n"
6526 "ld1 {v1.h}[3], [%x[in]], %x[stride]\n"
6527 "prfm pldl1keep, [%x[in]]\n"
6528 "uzp1 v2.8b, v0.8b, v1.8b\n"
6529 "uzp2 v3.8b, v0.8b, v1.8b\n"
6530 "uaddw v8.8h, v8.8h, v2.8b\n"
6531 "uaddw v9.8h, v9.8h, v3.8b\n"
6532 "st1 {v2.2s, v3.2s}, [%x[out]], #16\n"
6533
6534 "bne 1b\n"
6535
6536 "2:"
6537
6538 // Load Aggregate Store - column major 2x6
6539 "movi v0.8b, #0\n"
6540 "movi v1.8b, #0\n"
6541 "ld1 {v0.h}[0], [%x[in]], %x[stride]\n"
6542 "ld1 {v0.h}[1], [%x[in]], %x[stride]\n"
6543 "ld1 {v0.h}[2], [%x[in]], %x[stride]\n"
6544 "ld1 {v0.h}[3], [%x[in]], %x[stride]\n"
6545 "ld1 {v1.h}[0], [%x[in]], %x[stride]\n"
6546 "ld1 {v1.h}[1], [%x[in]], %x[stride]\n"
6547 "prfm pldl1keep, [%x[in]]\n"
6548 "uzp1 v2.8b, v0.8b, v1.8b\n"
6549 "uzp2 v3.8b, v0.8b, v1.8b\n"
6550 "uaddw v8.8h, v8.8h, v2.8b\n"
6551 "uaddw v9.8h, v9.8h, v3.8b\n"
6552 "st1 {v2.2s, v3.2s}, [%x[out]], #16\n"
6553
6554 // Aggregator Reduction.
6555 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
6556 "dup v1.4s, %w[additive_sum_offset]\n"
6557 "uaddlp v8.4s, v8.8h\n"
6558 "uaddlp v9.4s, v9.8h\n"
6559 "addp v8.4s, v8.4s, v9.4s\n"
6560 "addp v8.4s, v8.4s, v8.4s\n"
6561 "mul v8.4s, v8.4s, v0.s[0]\n"
6562 "add v8.4s, v8.4s, v1.4s\n"
6563 "st1 {v8.4s}, [%x[out]]\n"
6564 : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
6565 [out] "+r"(out), [in] "+r"(in)
6566 : [additive_sum_offset] "r"(params.additive_sum_offset),
6567 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
6568 : "v0", "v1", "v2", "v3", "v8", "v9", "cc", "memory");
6569 }
6570
6571 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)6572 inline void Stream<uint8_t, 2, 8, 7, ColumnMajorWithSum>::Pack(
6573 const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
6574 #ifdef DEBUG
6575 #ifdef DEBUG_METAGEMM_VERBOSE
6576 std::cout
6577 << __FILE__ << "(" << __LINE__
6578 << ") ColumnMajorWithSum<uint8_t, 2, 8, 7, ColumnMajorWithSum>::Pack()"
6579 << std::endl
6580 << std::flush;
6581 #endif
6582 #endif
6583 int params_count_copy = params.count;
6584 int params_stride_copy = params.stride;
6585 asm volatile(
6586 "movi v8.8h, #0\n"
6587 "movi v9.8h, #0\n"
6588
6589 // Reduce count by leftovers.
6590 "subs %x[count], %x[count], #7\n"
6591 "beq 2f\n"
6592
6593 "1:"
6594 "subs %x[count], %x[count], #8\n"
6595
6596 // Load Aggregate Store - column major 2x8
6597 "ld1 {v0.h}[0], [%x[in]], %x[stride]\n"
6598 "ld1 {v0.h}[1], [%x[in]], %x[stride]\n"
6599 "ld1 {v0.h}[2], [%x[in]], %x[stride]\n"
6600 "ld1 {v0.h}[3], [%x[in]], %x[stride]\n"
6601 "ld1 {v1.h}[0], [%x[in]], %x[stride]\n"
6602 "ld1 {v1.h}[1], [%x[in]], %x[stride]\n"
6603 "ld1 {v1.h}[2], [%x[in]], %x[stride]\n"
6604 "ld1 {v1.h}[3], [%x[in]], %x[stride]\n"
6605 "prfm pldl1keep, [%x[in]]\n"
6606 "uzp1 v2.8b, v0.8b, v1.8b\n"
6607 "uzp2 v3.8b, v0.8b, v1.8b\n"
6608 "uaddw v8.8h, v8.8h, v2.8b\n"
6609 "uaddw v9.8h, v9.8h, v3.8b\n"
6610 "st1 {v2.2s, v3.2s}, [%x[out]], #16\n"
6611
6612 "bne 1b\n"
6613
6614 "2:"
6615
6616 // Load Aggregate Store - column major 2x7
6617 "movi v0.8b, #0\n"
6618 "movi v1.8b, #0\n"
6619 "ld1 {v0.h}[0], [%x[in]], %x[stride]\n"
6620 "ld1 {v0.h}[1], [%x[in]], %x[stride]\n"
6621 "ld1 {v0.h}[2], [%x[in]], %x[stride]\n"
6622 "ld1 {v0.h}[3], [%x[in]], %x[stride]\n"
6623 "ld1 {v1.h}[0], [%x[in]], %x[stride]\n"
6624 "ld1 {v1.h}[1], [%x[in]], %x[stride]\n"
6625 "ld1 {v1.h}[2], [%x[in]], %x[stride]\n"
6626 "prfm pldl1keep, [%x[in]]\n"
6627 "uzp1 v2.8b, v0.8b, v1.8b\n"
6628 "uzp2 v3.8b, v0.8b, v1.8b\n"
6629 "uaddw v8.8h, v8.8h, v2.8b\n"
6630 "uaddw v9.8h, v9.8h, v3.8b\n"
6631 "st1 {v2.2s, v3.2s}, [%x[out]], #16\n"
6632
6633 // Aggregator Reduction.
6634 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
6635 "dup v1.4s, %w[additive_sum_offset]\n"
6636 "uaddlp v8.4s, v8.8h\n"
6637 "uaddlp v9.4s, v9.8h\n"
6638 "addp v8.4s, v8.4s, v9.4s\n"
6639 "addp v8.4s, v8.4s, v8.4s\n"
6640 "mul v8.4s, v8.4s, v0.s[0]\n"
6641 "add v8.4s, v8.4s, v1.4s\n"
6642 "st1 {v8.4s}, [%x[out]]\n"
6643 : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
6644 [out] "+r"(out), [in] "+r"(in)
6645 : [additive_sum_offset] "r"(params.additive_sum_offset),
6646 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
6647 : "v0", "v1", "v2", "v3", "v8", "v9", "cc", "memory");
6648 }
6649
6650 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)6651 inline void Stream<uint8_t, 3, 8, 0, ColumnMajorWithSum>::Pack(
6652 const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
6653 #ifdef DEBUG
6654 #ifdef DEBUG_METAGEMM_VERBOSE
6655 std::cout
6656 << __FILE__ << "(" << __LINE__
6657 << ") ColumnMajorWithSum<uint8_t, 3, 8, 0, ColumnMajorWithSum>::Pack()"
6658 << std::endl
6659 << std::flush;
6660 #endif
6661 #endif
6662 int params_count_copy = params.count;
6663 int params_stride_copy = params.stride;
6664 asm volatile(
6665 "movi v8.8h, #0\n"
6666 "movi v9.8h, #0\n"
6667 "movi v10.8h, #0\n"
6668
6669 "1:"
6670 "subs %x[count], %x[count], #8\n"
6671
6672 // Load Aggregate Store - column major 3x8
6673 "ld3 {v0.b, v1.b, v2.b}[0], [%x[in]], %x[stride]\n"
6674 "ld3 {v0.b, v1.b, v2.b}[1], [%x[in]], %x[stride]\n"
6675 "ld3 {v0.b, v1.b, v2.b}[2], [%x[in]], %x[stride]\n"
6676 "ld3 {v0.b, v1.b, v2.b}[3], [%x[in]], %x[stride]\n"
6677 "ld3 {v0.b, v1.b, v2.b}[4], [%x[in]], %x[stride]\n"
6678 "ld3 {v0.b, v1.b, v2.b}[5], [%x[in]], %x[stride]\n"
6679 "ld3 {v0.b, v1.b, v2.b}[6], [%x[in]], %x[stride]\n"
6680 "ld3 {v0.b, v1.b, v2.b}[7], [%x[in]], %x[stride]\n"
6681 "prfm pldl1keep, [%x[in]]\n"
6682 "uaddw v8.8h, v8.8h, v0.8b\n"
6683 "uaddw v9.8h, v9.8h, v1.8b\n"
6684 "uaddw v10.8h, v10.8h, v2.8b\n"
6685 "st1 {v0.2s, v1.2s, v2.2s}, [%x[out]], #24\n"
6686
6687 "bne 1b\n"
6688
6689 // Aggregator Reduction.
6690 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
6691 "dup v1.4s, %w[additive_sum_offset]\n"
6692 "uaddlp v8.4s, v8.8h\n"
6693 "uaddlp v9.4s, v9.8h\n"
6694 "uaddlp v10.4s, v10.8h\n"
6695 "addp v8.4s, v8.4s, v9.4s\n"
6696 "addp v10.4s, v10.4s, v10.4s\n"
6697 "addp v8.4s, v8.4s, v10.4s\n"
6698 "mul v8.4s, v8.4s, v0.s[0]\n"
6699 "add v8.4s, v8.4s, v1.4s\n"
6700 "st1 {v8.4s}, [%x[out]]\n"
6701 : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
6702 [out] "+r"(out), [in] "+r"(in)
6703 : [additive_sum_offset] "r"(params.additive_sum_offset),
6704 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
6705 : "v0", "v1", "v2", "v8", "v9", "v10", "cc", "memory");
6706 }
6707
6708 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)6709 inline void Stream<uint8_t, 3, 8, 1, ColumnMajorWithSum>::Pack(
6710 const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
6711 #ifdef DEBUG
6712 #ifdef DEBUG_METAGEMM_VERBOSE
6713 std::cout
6714 << __FILE__ << "(" << __LINE__
6715 << ") ColumnMajorWithSum<uint8_t, 3, 8, 1, ColumnMajorWithSum>::Pack()"
6716 << std::endl
6717 << std::flush;
6718 #endif
6719 #endif
6720 int params_count_copy = params.count;
6721 int params_stride_copy = params.stride;
6722 asm volatile(
6723 "movi v8.8h, #0\n"
6724 "movi v9.8h, #0\n"
6725 "movi v10.8h, #0\n"
6726
6727 // Reduce count by leftovers.
6728 "subs %x[count], %x[count], #1\n"
6729 "beq 2f\n"
6730
6731 "1:"
6732 "subs %x[count], %x[count], #8\n"
6733
6734 // Load Aggregate Store - column major 3x8
6735 "ld3 {v0.b, v1.b, v2.b}[0], [%x[in]], %x[stride]\n"
6736 "ld3 {v0.b, v1.b, v2.b}[1], [%x[in]], %x[stride]\n"
6737 "ld3 {v0.b, v1.b, v2.b}[2], [%x[in]], %x[stride]\n"
6738 "ld3 {v0.b, v1.b, v2.b}[3], [%x[in]], %x[stride]\n"
6739 "ld3 {v0.b, v1.b, v2.b}[4], [%x[in]], %x[stride]\n"
6740 "ld3 {v0.b, v1.b, v2.b}[5], [%x[in]], %x[stride]\n"
6741 "ld3 {v0.b, v1.b, v2.b}[6], [%x[in]], %x[stride]\n"
6742 "ld3 {v0.b, v1.b, v2.b}[7], [%x[in]], %x[stride]\n"
6743 "prfm pldl1keep, [%x[in]]\n"
6744 "uaddw v8.8h, v8.8h, v0.8b\n"
6745 "uaddw v9.8h, v9.8h, v1.8b\n"
6746 "uaddw v10.8h, v10.8h, v2.8b\n"
6747 "st1 {v0.2s, v1.2s, v2.2s}, [%x[out]], #24\n"
6748
6749 "bne 1b\n"
6750
6751 "2:"
6752
6753 // Load Aggregate Store - column major 3x1
6754 "movi v0.8b, #0\n"
6755 "movi v1.8b, #0\n"
6756 "movi v2.8b, #0\n"
6757 "ld3 {v0.b, v1.b, v2.b}[0], [%x[in]], %x[stride]\n"
6758 "prfm pldl1keep, [%x[in]]\n"
6759 "uaddw v8.8h, v8.8h, v0.8b\n"
6760 "uaddw v9.8h, v9.8h, v1.8b\n"
6761 "uaddw v10.8h, v10.8h, v2.8b\n"
6762 "st1 {v0.2s, v1.2s, v2.2s}, [%x[out]], #24\n"
6763
6764 // Aggregator Reduction.
6765 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
6766 "dup v1.4s, %w[additive_sum_offset]\n"
6767 "uaddlp v8.4s, v8.8h\n"
6768 "uaddlp v9.4s, v9.8h\n"
6769 "uaddlp v10.4s, v10.8h\n"
6770 "addp v8.4s, v8.4s, v9.4s\n"
6771 "addp v10.4s, v10.4s, v10.4s\n"
6772 "addp v8.4s, v8.4s, v10.4s\n"
6773 "mul v8.4s, v8.4s, v0.s[0]\n"
6774 "add v8.4s, v8.4s, v1.4s\n"
6775 "st1 {v8.4s}, [%x[out]]\n"
6776 : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
6777 [out] "+r"(out), [in] "+r"(in)
6778 : [additive_sum_offset] "r"(params.additive_sum_offset),
6779 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
6780 : "v0", "v1", "v2", "v8", "v9", "v10", "cc", "memory");
6781 }
6782
6783 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)6784 inline void Stream<uint8_t, 3, 8, 2, ColumnMajorWithSum>::Pack(
6785 const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
6786 #ifdef DEBUG
6787 #ifdef DEBUG_METAGEMM_VERBOSE
6788 std::cout
6789 << __FILE__ << "(" << __LINE__
6790 << ") ColumnMajorWithSum<uint8_t, 3, 8, 2, ColumnMajorWithSum>::Pack()"
6791 << std::endl
6792 << std::flush;
6793 #endif
6794 #endif
6795 int params_count_copy = params.count;
6796 int params_stride_copy = params.stride;
6797 asm volatile(
6798 "movi v8.8h, #0\n"
6799 "movi v9.8h, #0\n"
6800 "movi v10.8h, #0\n"
6801
6802 // Reduce count by leftovers.
6803 "subs %x[count], %x[count], #2\n"
6804 "beq 2f\n"
6805
6806 "1:"
6807 "subs %x[count], %x[count], #8\n"
6808
6809 // Load Aggregate Store - column major 3x8
6810 "ld3 {v0.b, v1.b, v2.b}[0], [%x[in]], %x[stride]\n"
6811 "ld3 {v0.b, v1.b, v2.b}[1], [%x[in]], %x[stride]\n"
6812 "ld3 {v0.b, v1.b, v2.b}[2], [%x[in]], %x[stride]\n"
6813 "ld3 {v0.b, v1.b, v2.b}[3], [%x[in]], %x[stride]\n"
6814 "ld3 {v0.b, v1.b, v2.b}[4], [%x[in]], %x[stride]\n"
6815 "ld3 {v0.b, v1.b, v2.b}[5], [%x[in]], %x[stride]\n"
6816 "ld3 {v0.b, v1.b, v2.b}[6], [%x[in]], %x[stride]\n"
6817 "ld3 {v0.b, v1.b, v2.b}[7], [%x[in]], %x[stride]\n"
6818 "prfm pldl1keep, [%x[in]]\n"
6819 "uaddw v8.8h, v8.8h, v0.8b\n"
6820 "uaddw v9.8h, v9.8h, v1.8b\n"
6821 "uaddw v10.8h, v10.8h, v2.8b\n"
6822 "st1 {v0.2s, v1.2s, v2.2s}, [%x[out]], #24\n"
6823
6824 "bne 1b\n"
6825
6826 "2:"
6827
6828 // Load Aggregate Store - column major 3x2
6829 "movi v0.8b, #0\n"
6830 "movi v1.8b, #0\n"
6831 "movi v2.8b, #0\n"
6832 "ld3 {v0.b, v1.b, v2.b}[0], [%x[in]], %x[stride]\n"
6833 "ld3 {v0.b, v1.b, v2.b}[1], [%x[in]], %x[stride]\n"
6834 "prfm pldl1keep, [%x[in]]\n"
6835 "uaddw v8.8h, v8.8h, v0.8b\n"
6836 "uaddw v9.8h, v9.8h, v1.8b\n"
6837 "uaddw v10.8h, v10.8h, v2.8b\n"
6838 "st1 {v0.2s, v1.2s, v2.2s}, [%x[out]], #24\n"
6839
6840 // Aggregator Reduction.
6841 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
6842 "dup v1.4s, %w[additive_sum_offset]\n"
6843 "uaddlp v8.4s, v8.8h\n"
6844 "uaddlp v9.4s, v9.8h\n"
6845 "uaddlp v10.4s, v10.8h\n"
6846 "addp v8.4s, v8.4s, v9.4s\n"
6847 "addp v10.4s, v10.4s, v10.4s\n"
6848 "addp v8.4s, v8.4s, v10.4s\n"
6849 "mul v8.4s, v8.4s, v0.s[0]\n"
6850 "add v8.4s, v8.4s, v1.4s\n"
6851 "st1 {v8.4s}, [%x[out]]\n"
6852 : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
6853 [out] "+r"(out), [in] "+r"(in)
6854 : [additive_sum_offset] "r"(params.additive_sum_offset),
6855 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
6856 : "v0", "v1", "v2", "v8", "v9", "v10", "cc", "memory");
6857 }
6858
6859 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)6860 inline void Stream<uint8_t, 3, 8, 3, ColumnMajorWithSum>::Pack(
6861 const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
6862 #ifdef DEBUG
6863 #ifdef DEBUG_METAGEMM_VERBOSE
6864 std::cout
6865 << __FILE__ << "(" << __LINE__
6866 << ") ColumnMajorWithSum<uint8_t, 3, 8, 3, ColumnMajorWithSum>::Pack()"
6867 << std::endl
6868 << std::flush;
6869 #endif
6870 #endif
6871 int params_count_copy = params.count;
6872 int params_stride_copy = params.stride;
6873 asm volatile(
6874 "movi v8.8h, #0\n"
6875 "movi v9.8h, #0\n"
6876 "movi v10.8h, #0\n"
6877
6878 // Reduce count by leftovers.
6879 "subs %x[count], %x[count], #3\n"
6880 "beq 2f\n"
6881
6882 "1:"
6883 "subs %x[count], %x[count], #8\n"
6884
6885 // Load Aggregate Store - column major 3x8
6886 "ld3 {v0.b, v1.b, v2.b}[0], [%x[in]], %x[stride]\n"
6887 "ld3 {v0.b, v1.b, v2.b}[1], [%x[in]], %x[stride]\n"
6888 "ld3 {v0.b, v1.b, v2.b}[2], [%x[in]], %x[stride]\n"
6889 "ld3 {v0.b, v1.b, v2.b}[3], [%x[in]], %x[stride]\n"
6890 "ld3 {v0.b, v1.b, v2.b}[4], [%x[in]], %x[stride]\n"
6891 "ld3 {v0.b, v1.b, v2.b}[5], [%x[in]], %x[stride]\n"
6892 "ld3 {v0.b, v1.b, v2.b}[6], [%x[in]], %x[stride]\n"
6893 "ld3 {v0.b, v1.b, v2.b}[7], [%x[in]], %x[stride]\n"
6894 "prfm pldl1keep, [%x[in]]\n"
6895 "uaddw v8.8h, v8.8h, v0.8b\n"
6896 "uaddw v9.8h, v9.8h, v1.8b\n"
6897 "uaddw v10.8h, v10.8h, v2.8b\n"
6898 "st1 {v0.2s, v1.2s, v2.2s}, [%x[out]], #24\n"
6899
6900 "bne 1b\n"
6901
6902 "2:"
6903
6904 // Load Aggregate Store - column major 3x3
6905 "movi v0.8b, #0\n"
6906 "movi v1.8b, #0\n"
6907 "movi v2.8b, #0\n"
6908 "ld3 {v0.b, v1.b, v2.b}[0], [%x[in]], %x[stride]\n"
6909 "ld3 {v0.b, v1.b, v2.b}[1], [%x[in]], %x[stride]\n"
6910 "ld3 {v0.b, v1.b, v2.b}[2], [%x[in]], %x[stride]\n"
6911 "prfm pldl1keep, [%x[in]]\n"
6912 "uaddw v8.8h, v8.8h, v0.8b\n"
6913 "uaddw v9.8h, v9.8h, v1.8b\n"
6914 "uaddw v10.8h, v10.8h, v2.8b\n"
6915 "st1 {v0.2s, v1.2s, v2.2s}, [%x[out]], #24\n"
6916
6917 // Aggregator Reduction.
6918 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
6919 "dup v1.4s, %w[additive_sum_offset]\n"
6920 "uaddlp v8.4s, v8.8h\n"
6921 "uaddlp v9.4s, v9.8h\n"
6922 "uaddlp v10.4s, v10.8h\n"
6923 "addp v8.4s, v8.4s, v9.4s\n"
6924 "addp v10.4s, v10.4s, v10.4s\n"
6925 "addp v8.4s, v8.4s, v10.4s\n"
6926 "mul v8.4s, v8.4s, v0.s[0]\n"
6927 "add v8.4s, v8.4s, v1.4s\n"
6928 "st1 {v8.4s}, [%x[out]]\n"
6929 : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
6930 [out] "+r"(out), [in] "+r"(in)
6931 : [additive_sum_offset] "r"(params.additive_sum_offset),
6932 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
6933 : "v0", "v1", "v2", "v8", "v9", "v10", "cc", "memory");
6934 }
6935
6936 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)6937 inline void Stream<uint8_t, 3, 8, 4, ColumnMajorWithSum>::Pack(
6938 const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
6939 #ifdef DEBUG
6940 #ifdef DEBUG_METAGEMM_VERBOSE
6941 std::cout
6942 << __FILE__ << "(" << __LINE__
6943 << ") ColumnMajorWithSum<uint8_t, 3, 8, 4, ColumnMajorWithSum>::Pack()"
6944 << std::endl
6945 << std::flush;
6946 #endif
6947 #endif
6948 int params_count_copy = params.count;
6949 int params_stride_copy = params.stride;
6950 asm volatile(
6951 "movi v8.8h, #0\n"
6952 "movi v9.8h, #0\n"
6953 "movi v10.8h, #0\n"
6954
6955 // Reduce count by leftovers.
6956 "subs %x[count], %x[count], #4\n"
6957 "beq 2f\n"
6958
6959 "1:"
6960 "subs %x[count], %x[count], #8\n"
6961
6962 // Load Aggregate Store - column major 3x8
6963 "ld3 {v0.b, v1.b, v2.b}[0], [%x[in]], %x[stride]\n"
6964 "ld3 {v0.b, v1.b, v2.b}[1], [%x[in]], %x[stride]\n"
6965 "ld3 {v0.b, v1.b, v2.b}[2], [%x[in]], %x[stride]\n"
6966 "ld3 {v0.b, v1.b, v2.b}[3], [%x[in]], %x[stride]\n"
6967 "ld3 {v0.b, v1.b, v2.b}[4], [%x[in]], %x[stride]\n"
6968 "ld3 {v0.b, v1.b, v2.b}[5], [%x[in]], %x[stride]\n"
6969 "ld3 {v0.b, v1.b, v2.b}[6], [%x[in]], %x[stride]\n"
6970 "ld3 {v0.b, v1.b, v2.b}[7], [%x[in]], %x[stride]\n"
6971 "prfm pldl1keep, [%x[in]]\n"
6972 "uaddw v8.8h, v8.8h, v0.8b\n"
6973 "uaddw v9.8h, v9.8h, v1.8b\n"
6974 "uaddw v10.8h, v10.8h, v2.8b\n"
6975 "st1 {v0.2s, v1.2s, v2.2s}, [%x[out]], #24\n"
6976
6977 "bne 1b\n"
6978
6979 "2:"
6980
6981 // Load Aggregate Store - column major 3x4
6982 "movi v0.8b, #0\n"
6983 "movi v1.8b, #0\n"
6984 "movi v2.8b, #0\n"
6985 "ld3 {v0.b, v1.b, v2.b}[0], [%x[in]], %x[stride]\n"
6986 "ld3 {v0.b, v1.b, v2.b}[1], [%x[in]], %x[stride]\n"
6987 "ld3 {v0.b, v1.b, v2.b}[2], [%x[in]], %x[stride]\n"
6988 "ld3 {v0.b, v1.b, v2.b}[3], [%x[in]], %x[stride]\n"
6989 "prfm pldl1keep, [%x[in]]\n"
6990 "uaddw v8.8h, v8.8h, v0.8b\n"
6991 "uaddw v9.8h, v9.8h, v1.8b\n"
6992 "uaddw v10.8h, v10.8h, v2.8b\n"
6993 "st1 {v0.2s, v1.2s, v2.2s}, [%x[out]], #24\n"
6994
6995 // Aggregator Reduction.
6996 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
6997 "dup v1.4s, %w[additive_sum_offset]\n"
6998 "uaddlp v8.4s, v8.8h\n"
6999 "uaddlp v9.4s, v9.8h\n"
7000 "uaddlp v10.4s, v10.8h\n"
7001 "addp v8.4s, v8.4s, v9.4s\n"
7002 "addp v10.4s, v10.4s, v10.4s\n"
7003 "addp v8.4s, v8.4s, v10.4s\n"
7004 "mul v8.4s, v8.4s, v0.s[0]\n"
7005 "add v8.4s, v8.4s, v1.4s\n"
7006 "st1 {v8.4s}, [%x[out]]\n"
7007 : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
7008 [out] "+r"(out), [in] "+r"(in)
7009 : [additive_sum_offset] "r"(params.additive_sum_offset),
7010 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
7011 : "v0", "v1", "v2", "v8", "v9", "v10", "cc", "memory");
7012 }
7013
7014 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)7015 inline void Stream<uint8_t, 3, 8, 5, ColumnMajorWithSum>::Pack(
7016 const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
7017 #ifdef DEBUG
7018 #ifdef DEBUG_METAGEMM_VERBOSE
7019 std::cout
7020 << __FILE__ << "(" << __LINE__
7021 << ") ColumnMajorWithSum<uint8_t, 3, 8, 5, ColumnMajorWithSum>::Pack()"
7022 << std::endl
7023 << std::flush;
7024 #endif
7025 #endif
7026 int params_count_copy = params.count;
7027 int params_stride_copy = params.stride;
7028 asm volatile(
7029 "movi v8.8h, #0\n"
7030 "movi v9.8h, #0\n"
7031 "movi v10.8h, #0\n"
7032
7033 // Reduce count by leftovers.
7034 "subs %x[count], %x[count], #5\n"
7035 "beq 2f\n"
7036
7037 "1:"
7038 "subs %x[count], %x[count], #8\n"
7039
7040 // Load Aggregate Store - column major 3x8
7041 "ld3 {v0.b, v1.b, v2.b}[0], [%x[in]], %x[stride]\n"
7042 "ld3 {v0.b, v1.b, v2.b}[1], [%x[in]], %x[stride]\n"
7043 "ld3 {v0.b, v1.b, v2.b}[2], [%x[in]], %x[stride]\n"
7044 "ld3 {v0.b, v1.b, v2.b}[3], [%x[in]], %x[stride]\n"
7045 "ld3 {v0.b, v1.b, v2.b}[4], [%x[in]], %x[stride]\n"
7046 "ld3 {v0.b, v1.b, v2.b}[5], [%x[in]], %x[stride]\n"
7047 "ld3 {v0.b, v1.b, v2.b}[6], [%x[in]], %x[stride]\n"
7048 "ld3 {v0.b, v1.b, v2.b}[7], [%x[in]], %x[stride]\n"
7049 "prfm pldl1keep, [%x[in]]\n"
7050 "uaddw v8.8h, v8.8h, v0.8b\n"
7051 "uaddw v9.8h, v9.8h, v1.8b\n"
7052 "uaddw v10.8h, v10.8h, v2.8b\n"
7053 "st1 {v0.2s, v1.2s, v2.2s}, [%x[out]], #24\n"
7054
7055 "bne 1b\n"
7056
7057 "2:"
7058
7059 // Load Aggregate Store - column major 3x5
7060 "movi v0.8b, #0\n"
7061 "movi v1.8b, #0\n"
7062 "movi v2.8b, #0\n"
7063 "ld3 {v0.b, v1.b, v2.b}[0], [%x[in]], %x[stride]\n"
7064 "ld3 {v0.b, v1.b, v2.b}[1], [%x[in]], %x[stride]\n"
7065 "ld3 {v0.b, v1.b, v2.b}[2], [%x[in]], %x[stride]\n"
7066 "ld3 {v0.b, v1.b, v2.b}[3], [%x[in]], %x[stride]\n"
7067 "ld3 {v0.b, v1.b, v2.b}[4], [%x[in]], %x[stride]\n"
7068 "prfm pldl1keep, [%x[in]]\n"
7069 "uaddw v8.8h, v8.8h, v0.8b\n"
7070 "uaddw v9.8h, v9.8h, v1.8b\n"
7071 "uaddw v10.8h, v10.8h, v2.8b\n"
7072 "st1 {v0.2s, v1.2s, v2.2s}, [%x[out]], #24\n"
7073
7074 // Aggregator Reduction.
7075 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
7076 "dup v1.4s, %w[additive_sum_offset]\n"
7077 "uaddlp v8.4s, v8.8h\n"
7078 "uaddlp v9.4s, v9.8h\n"
7079 "uaddlp v10.4s, v10.8h\n"
7080 "addp v8.4s, v8.4s, v9.4s\n"
7081 "addp v10.4s, v10.4s, v10.4s\n"
7082 "addp v8.4s, v8.4s, v10.4s\n"
7083 "mul v8.4s, v8.4s, v0.s[0]\n"
7084 "add v8.4s, v8.4s, v1.4s\n"
7085 "st1 {v8.4s}, [%x[out]]\n"
7086 : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
7087 [out] "+r"(out), [in] "+r"(in)
7088 : [additive_sum_offset] "r"(params.additive_sum_offset),
7089 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
7090 : "v0", "v1", "v2", "v8", "v9", "v10", "cc", "memory");
7091 }
7092
7093 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)7094 inline void Stream<uint8_t, 3, 8, 6, ColumnMajorWithSum>::Pack(
7095 const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
7096 #ifdef DEBUG
7097 #ifdef DEBUG_METAGEMM_VERBOSE
7098 std::cout
7099 << __FILE__ << "(" << __LINE__
7100 << ") ColumnMajorWithSum<uint8_t, 3, 8, 6, ColumnMajorWithSum>::Pack()"
7101 << std::endl
7102 << std::flush;
7103 #endif
7104 #endif
7105 int params_count_copy = params.count;
7106 int params_stride_copy = params.stride;
7107 asm volatile(
7108 "movi v8.8h, #0\n"
7109 "movi v9.8h, #0\n"
7110 "movi v10.8h, #0\n"
7111
7112 // Reduce count by leftovers.
7113 "subs %x[count], %x[count], #6\n"
7114 "beq 2f\n"
7115
7116 "1:"
7117 "subs %x[count], %x[count], #8\n"
7118
7119 // Load Aggregate Store - column major 3x8
7120 "ld3 {v0.b, v1.b, v2.b}[0], [%x[in]], %x[stride]\n"
7121 "ld3 {v0.b, v1.b, v2.b}[1], [%x[in]], %x[stride]\n"
7122 "ld3 {v0.b, v1.b, v2.b}[2], [%x[in]], %x[stride]\n"
7123 "ld3 {v0.b, v1.b, v2.b}[3], [%x[in]], %x[stride]\n"
7124 "ld3 {v0.b, v1.b, v2.b}[4], [%x[in]], %x[stride]\n"
7125 "ld3 {v0.b, v1.b, v2.b}[5], [%x[in]], %x[stride]\n"
7126 "ld3 {v0.b, v1.b, v2.b}[6], [%x[in]], %x[stride]\n"
7127 "ld3 {v0.b, v1.b, v2.b}[7], [%x[in]], %x[stride]\n"
7128 "prfm pldl1keep, [%x[in]]\n"
7129 "uaddw v8.8h, v8.8h, v0.8b\n"
7130 "uaddw v9.8h, v9.8h, v1.8b\n"
7131 "uaddw v10.8h, v10.8h, v2.8b\n"
7132 "st1 {v0.2s, v1.2s, v2.2s}, [%x[out]], #24\n"
7133
7134 "bne 1b\n"
7135
7136 "2:"
7137
7138 // Load Aggregate Store - column major 3x6
7139 "movi v0.8b, #0\n"
7140 "movi v1.8b, #0\n"
7141 "movi v2.8b, #0\n"
7142 "ld3 {v0.b, v1.b, v2.b}[0], [%x[in]], %x[stride]\n"
7143 "ld3 {v0.b, v1.b, v2.b}[1], [%x[in]], %x[stride]\n"
7144 "ld3 {v0.b, v1.b, v2.b}[2], [%x[in]], %x[stride]\n"
7145 "ld3 {v0.b, v1.b, v2.b}[3], [%x[in]], %x[stride]\n"
7146 "ld3 {v0.b, v1.b, v2.b}[4], [%x[in]], %x[stride]\n"
7147 "ld3 {v0.b, v1.b, v2.b}[5], [%x[in]], %x[stride]\n"
7148 "prfm pldl1keep, [%x[in]]\n"
7149 "uaddw v8.8h, v8.8h, v0.8b\n"
7150 "uaddw v9.8h, v9.8h, v1.8b\n"
7151 "uaddw v10.8h, v10.8h, v2.8b\n"
7152 "st1 {v0.2s, v1.2s, v2.2s}, [%x[out]], #24\n"
7153
7154 // Aggregator Reduction.
7155 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
7156 "dup v1.4s, %w[additive_sum_offset]\n"
7157 "uaddlp v8.4s, v8.8h\n"
7158 "uaddlp v9.4s, v9.8h\n"
7159 "uaddlp v10.4s, v10.8h\n"
7160 "addp v8.4s, v8.4s, v9.4s\n"
7161 "addp v10.4s, v10.4s, v10.4s\n"
7162 "addp v8.4s, v8.4s, v10.4s\n"
7163 "mul v8.4s, v8.4s, v0.s[0]\n"
7164 "add v8.4s, v8.4s, v1.4s\n"
7165 "st1 {v8.4s}, [%x[out]]\n"
7166 : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
7167 [out] "+r"(out), [in] "+r"(in)
7168 : [additive_sum_offset] "r"(params.additive_sum_offset),
7169 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
7170 : "v0", "v1", "v2", "v8", "v9", "v10", "cc", "memory");
7171 }
7172
7173 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)7174 inline void Stream<uint8_t, 3, 8, 7, ColumnMajorWithSum>::Pack(
7175 const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
7176 #ifdef DEBUG
7177 #ifdef DEBUG_METAGEMM_VERBOSE
7178 std::cout
7179 << __FILE__ << "(" << __LINE__
7180 << ") ColumnMajorWithSum<uint8_t, 3, 8, 7, ColumnMajorWithSum>::Pack()"
7181 << std::endl
7182 << std::flush;
7183 #endif
7184 #endif
7185 int params_count_copy = params.count;
7186 int params_stride_copy = params.stride;
7187 asm volatile(
7188 "movi v8.8h, #0\n"
7189 "movi v9.8h, #0\n"
7190 "movi v10.8h, #0\n"
7191
7192 // Reduce count by leftovers.
7193 "subs %x[count], %x[count], #7\n"
7194 "beq 2f\n"
7195
7196 "1:"
7197 "subs %x[count], %x[count], #8\n"
7198
7199 // Load Aggregate Store - column major 3x8
7200 "ld3 {v0.b, v1.b, v2.b}[0], [%x[in]], %x[stride]\n"
7201 "ld3 {v0.b, v1.b, v2.b}[1], [%x[in]], %x[stride]\n"
7202 "ld3 {v0.b, v1.b, v2.b}[2], [%x[in]], %x[stride]\n"
7203 "ld3 {v0.b, v1.b, v2.b}[3], [%x[in]], %x[stride]\n"
7204 "ld3 {v0.b, v1.b, v2.b}[4], [%x[in]], %x[stride]\n"
7205 "ld3 {v0.b, v1.b, v2.b}[5], [%x[in]], %x[stride]\n"
7206 "ld3 {v0.b, v1.b, v2.b}[6], [%x[in]], %x[stride]\n"
7207 "ld3 {v0.b, v1.b, v2.b}[7], [%x[in]], %x[stride]\n"
7208 "prfm pldl1keep, [%x[in]]\n"
7209 "uaddw v8.8h, v8.8h, v0.8b\n"
7210 "uaddw v9.8h, v9.8h, v1.8b\n"
7211 "uaddw v10.8h, v10.8h, v2.8b\n"
7212 "st1 {v0.2s, v1.2s, v2.2s}, [%x[out]], #24\n"
7213
7214 "bne 1b\n"
7215
7216 "2:"
7217
7218 // Load Aggregate Store - column major 3x7
7219 "movi v0.8b, #0\n"
7220 "movi v1.8b, #0\n"
7221 "movi v2.8b, #0\n"
7222 "ld3 {v0.b, v1.b, v2.b}[0], [%x[in]], %x[stride]\n"
7223 "ld3 {v0.b, v1.b, v2.b}[1], [%x[in]], %x[stride]\n"
7224 "ld3 {v0.b, v1.b, v2.b}[2], [%x[in]], %x[stride]\n"
7225 "ld3 {v0.b, v1.b, v2.b}[3], [%x[in]], %x[stride]\n"
7226 "ld3 {v0.b, v1.b, v2.b}[4], [%x[in]], %x[stride]\n"
7227 "ld3 {v0.b, v1.b, v2.b}[5], [%x[in]], %x[stride]\n"
7228 "ld3 {v0.b, v1.b, v2.b}[6], [%x[in]], %x[stride]\n"
7229 "prfm pldl1keep, [%x[in]]\n"
7230 "uaddw v8.8h, v8.8h, v0.8b\n"
7231 "uaddw v9.8h, v9.8h, v1.8b\n"
7232 "uaddw v10.8h, v10.8h, v2.8b\n"
7233 "st1 {v0.2s, v1.2s, v2.2s}, [%x[out]], #24\n"
7234
7235 // Aggregator Reduction.
7236 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
7237 "dup v1.4s, %w[additive_sum_offset]\n"
7238 "uaddlp v8.4s, v8.8h\n"
7239 "uaddlp v9.4s, v9.8h\n"
7240 "uaddlp v10.4s, v10.8h\n"
7241 "addp v8.4s, v8.4s, v9.4s\n"
7242 "addp v10.4s, v10.4s, v10.4s\n"
7243 "addp v8.4s, v8.4s, v10.4s\n"
7244 "mul v8.4s, v8.4s, v0.s[0]\n"
7245 "add v8.4s, v8.4s, v1.4s\n"
7246 "st1 {v8.4s}, [%x[out]]\n"
7247 : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
7248 [out] "+r"(out), [in] "+r"(in)
7249 : [additive_sum_offset] "r"(params.additive_sum_offset),
7250 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
7251 : "v0", "v1", "v2", "v8", "v9", "v10", "cc", "memory");
7252 }
7253
7254 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)7255 inline void Stream<uint8_t, 4, 8, 0, ColumnMajorWithSum>::Pack(
7256 const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
7257 #ifdef DEBUG
7258 #ifdef DEBUG_METAGEMM_VERBOSE
7259 std::cout
7260 << __FILE__ << "(" << __LINE__
7261 << ") ColumnMajorWithSum<uint8_t, 4, 8, 0, ColumnMajorWithSum>::Pack()"
7262 << std::endl
7263 << std::flush;
7264 #endif
7265 #endif
7266 int params_count_copy = params.count;
7267 int params_stride_copy = params.stride;
7268 asm volatile(
7269 "movi v8.8h, #0\n"
7270 "movi v9.8h, #0\n"
7271 "movi v10.8h, #0\n"
7272 "movi v11.8h, #0\n"
7273
7274 "1:"
7275 "subs %x[count], %x[count], #8\n"
7276
7277 // Load Aggregate Store - column major 4x8
7278 "ld1 {v0.s}[0], [%x[in]], %x[stride]\n"
7279 "ld1 {v1.s}[0], [%x[in]], %x[stride]\n"
7280 "ld1 {v2.s}[0], [%x[in]], %x[stride]\n"
7281 "ld1 {v3.s}[0], [%x[in]], %x[stride]\n"
7282 "ld1 {v0.s}[1], [%x[in]], %x[stride]\n"
7283 "ld1 {v1.s}[1], [%x[in]], %x[stride]\n"
7284 "ld1 {v2.s}[1], [%x[in]], %x[stride]\n"
7285 "ld1 {v3.s}[1], [%x[in]], %x[stride]\n"
7286 "prfm pldl1keep, [%x[in]]\n"
7287 "trn1 v4.4h, v0.4h, v2.4h\n"
7288 "trn2 v6.4h, v0.4h, v2.4h\n"
7289 "trn1 v5.4h, v1.4h, v3.4h\n"
7290 "trn2 v7.4h, v1.4h, v3.4h\n"
7291 "trn1 v0.8b, v4.8b, v5.8b\n"
7292 "trn2 v1.8b, v4.8b, v5.8b\n"
7293 "trn1 v2.8b, v6.8b, v7.8b\n"
7294 "trn2 v3.8b, v6.8b, v7.8b\n"
7295 "uaddw v8.8h, v8.8h, v0.8b\n"
7296 "uaddw v9.8h, v9.8h, v1.8b\n"
7297 "uaddw v10.8h, v10.8h, v2.8b\n"
7298 "uaddw v11.8h, v11.8h, v3.8b\n"
7299 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
7300
7301 "bne 1b\n"
7302
7303 // Aggregator Reduction.
7304 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
7305 "dup v1.4s, %w[additive_sum_offset]\n"
7306 "uaddlp v8.4s, v8.8h\n"
7307 "uaddlp v9.4s, v9.8h\n"
7308 "uaddlp v10.4s, v10.8h\n"
7309 "uaddlp v11.4s, v11.8h\n"
7310 "addp v8.4s, v8.4s, v9.4s\n"
7311 "addp v10.4s, v10.4s, v11.4s\n"
7312 "addp v8.4s, v8.4s, v10.4s\n"
7313 "mul v8.4s, v8.4s, v0.s[0]\n"
7314 "add v8.4s, v8.4s, v1.4s\n"
7315 "st1 {v8.4s}, [%x[out]]\n"
7316 : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
7317 [out] "+r"(out), [in] "+r"(in)
7318 : [additive_sum_offset] "r"(params.additive_sum_offset),
7319 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
7320 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
7321 "v11", "cc", "memory");
7322 }
7323
7324 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)7325 inline void Stream<uint8_t, 4, 8, 1, ColumnMajorWithSum>::Pack(
7326 const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
7327 #ifdef DEBUG
7328 #ifdef DEBUG_METAGEMM_VERBOSE
7329 std::cout
7330 << __FILE__ << "(" << __LINE__
7331 << ") ColumnMajorWithSum<uint8_t, 4, 8, 1, ColumnMajorWithSum>::Pack()"
7332 << std::endl
7333 << std::flush;
7334 #endif
7335 #endif
7336 int params_count_copy = params.count;
7337 int params_stride_copy = params.stride;
7338 asm volatile(
7339 "movi v8.8h, #0\n"
7340 "movi v9.8h, #0\n"
7341 "movi v10.8h, #0\n"
7342 "movi v11.8h, #0\n"
7343
7344 // Reduce count by leftovers.
7345 "subs %x[count], %x[count], #1\n"
7346 "beq 2f\n"
7347
7348 "1:"
7349 "subs %x[count], %x[count], #8\n"
7350
7351 // Load Aggregate Store - column major 4x8
7352 "ld1 {v0.s}[0], [%x[in]], %x[stride]\n"
7353 "ld1 {v1.s}[0], [%x[in]], %x[stride]\n"
7354 "ld1 {v2.s}[0], [%x[in]], %x[stride]\n"
7355 "ld1 {v3.s}[0], [%x[in]], %x[stride]\n"
7356 "ld1 {v0.s}[1], [%x[in]], %x[stride]\n"
7357 "ld1 {v1.s}[1], [%x[in]], %x[stride]\n"
7358 "ld1 {v2.s}[1], [%x[in]], %x[stride]\n"
7359 "ld1 {v3.s}[1], [%x[in]], %x[stride]\n"
7360 "prfm pldl1keep, [%x[in]]\n"
7361 "trn1 v4.4h, v0.4h, v2.4h\n"
7362 "trn2 v6.4h, v0.4h, v2.4h\n"
7363 "trn1 v5.4h, v1.4h, v3.4h\n"
7364 "trn2 v7.4h, v1.4h, v3.4h\n"
7365 "trn1 v0.8b, v4.8b, v5.8b\n"
7366 "trn2 v1.8b, v4.8b, v5.8b\n"
7367 "trn1 v2.8b, v6.8b, v7.8b\n"
7368 "trn2 v3.8b, v6.8b, v7.8b\n"
7369 "uaddw v8.8h, v8.8h, v0.8b\n"
7370 "uaddw v9.8h, v9.8h, v1.8b\n"
7371 "uaddw v10.8h, v10.8h, v2.8b\n"
7372 "uaddw v11.8h, v11.8h, v3.8b\n"
7373 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
7374
7375 "bne 1b\n"
7376
7377 "2:"
7378
7379 // Load Aggregate Store - column major 4x1
7380 "movi v0.8b, #0\n"
7381 "movi v1.8b, #0\n"
7382 "movi v2.8b, #0\n"
7383 "movi v3.8b, #0\n"
7384 "ld1 {v0.s}[0], [%x[in]], %x[stride]\n"
7385 "prfm pldl1keep, [%x[in]]\n"
7386 "trn1 v4.4h, v0.4h, v2.4h\n"
7387 "trn2 v6.4h, v0.4h, v2.4h\n"
7388 "trn1 v5.4h, v1.4h, v3.4h\n"
7389 "trn2 v7.4h, v1.4h, v3.4h\n"
7390 "trn1 v0.8b, v4.8b, v5.8b\n"
7391 "trn2 v1.8b, v4.8b, v5.8b\n"
7392 "trn1 v2.8b, v6.8b, v7.8b\n"
7393 "trn2 v3.8b, v6.8b, v7.8b\n"
7394 "uaddw v8.8h, v8.8h, v0.8b\n"
7395 "uaddw v9.8h, v9.8h, v1.8b\n"
7396 "uaddw v10.8h, v10.8h, v2.8b\n"
7397 "uaddw v11.8h, v11.8h, v3.8b\n"
7398 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
7399
7400 // Aggregator Reduction.
7401 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
7402 "dup v1.4s, %w[additive_sum_offset]\n"
7403 "uaddlp v8.4s, v8.8h\n"
7404 "uaddlp v9.4s, v9.8h\n"
7405 "uaddlp v10.4s, v10.8h\n"
7406 "uaddlp v11.4s, v11.8h\n"
7407 "addp v8.4s, v8.4s, v9.4s\n"
7408 "addp v10.4s, v10.4s, v11.4s\n"
7409 "addp v8.4s, v8.4s, v10.4s\n"
7410 "mul v8.4s, v8.4s, v0.s[0]\n"
7411 "add v8.4s, v8.4s, v1.4s\n"
7412 "st1 {v8.4s}, [%x[out]]\n"
7413 : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
7414 [out] "+r"(out), [in] "+r"(in)
7415 : [additive_sum_offset] "r"(params.additive_sum_offset),
7416 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
7417 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
7418 "v11", "cc", "memory");
7419 }
7420
7421 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)7422 inline void Stream<uint8_t, 4, 8, 2, ColumnMajorWithSum>::Pack(
7423 const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
7424 #ifdef DEBUG
7425 #ifdef DEBUG_METAGEMM_VERBOSE
7426 std::cout
7427 << __FILE__ << "(" << __LINE__
7428 << ") ColumnMajorWithSum<uint8_t, 4, 8, 2, ColumnMajorWithSum>::Pack()"
7429 << std::endl
7430 << std::flush;
7431 #endif
7432 #endif
7433 int params_count_copy = params.count;
7434 int params_stride_copy = params.stride;
7435 asm volatile(
7436 "movi v8.8h, #0\n"
7437 "movi v9.8h, #0\n"
7438 "movi v10.8h, #0\n"
7439 "movi v11.8h, #0\n"
7440
7441 // Reduce count by leftovers.
7442 "subs %x[count], %x[count], #2\n"
7443 "beq 2f\n"
7444
7445 "1:"
7446 "subs %x[count], %x[count], #8\n"
7447
7448 // Load Aggregate Store - column major 4x8
7449 "ld1 {v0.s}[0], [%x[in]], %x[stride]\n"
7450 "ld1 {v1.s}[0], [%x[in]], %x[stride]\n"
7451 "ld1 {v2.s}[0], [%x[in]], %x[stride]\n"
7452 "ld1 {v3.s}[0], [%x[in]], %x[stride]\n"
7453 "ld1 {v0.s}[1], [%x[in]], %x[stride]\n"
7454 "ld1 {v1.s}[1], [%x[in]], %x[stride]\n"
7455 "ld1 {v2.s}[1], [%x[in]], %x[stride]\n"
7456 "ld1 {v3.s}[1], [%x[in]], %x[stride]\n"
7457 "prfm pldl1keep, [%x[in]]\n"
7458 "trn1 v4.4h, v0.4h, v2.4h\n"
7459 "trn2 v6.4h, v0.4h, v2.4h\n"
7460 "trn1 v5.4h, v1.4h, v3.4h\n"
7461 "trn2 v7.4h, v1.4h, v3.4h\n"
7462 "trn1 v0.8b, v4.8b, v5.8b\n"
7463 "trn2 v1.8b, v4.8b, v5.8b\n"
7464 "trn1 v2.8b, v6.8b, v7.8b\n"
7465 "trn2 v3.8b, v6.8b, v7.8b\n"
7466 "uaddw v8.8h, v8.8h, v0.8b\n"
7467 "uaddw v9.8h, v9.8h, v1.8b\n"
7468 "uaddw v10.8h, v10.8h, v2.8b\n"
7469 "uaddw v11.8h, v11.8h, v3.8b\n"
7470 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
7471
7472 "bne 1b\n"
7473
7474 "2:"
7475
7476 // Load Aggregate Store - column major 4x2
7477 "movi v0.8b, #0\n"
7478 "movi v1.8b, #0\n"
7479 "movi v2.8b, #0\n"
7480 "movi v3.8b, #0\n"
7481 "ld1 {v0.s}[0], [%x[in]], %x[stride]\n"
7482 "ld1 {v1.s}[0], [%x[in]], %x[stride]\n"
7483 "prfm pldl1keep, [%x[in]]\n"
7484 "trn1 v4.4h, v0.4h, v2.4h\n"
7485 "trn2 v6.4h, v0.4h, v2.4h\n"
7486 "trn1 v5.4h, v1.4h, v3.4h\n"
7487 "trn2 v7.4h, v1.4h, v3.4h\n"
7488 "trn1 v0.8b, v4.8b, v5.8b\n"
7489 "trn2 v1.8b, v4.8b, v5.8b\n"
7490 "trn1 v2.8b, v6.8b, v7.8b\n"
7491 "trn2 v3.8b, v6.8b, v7.8b\n"
7492 "uaddw v8.8h, v8.8h, v0.8b\n"
7493 "uaddw v9.8h, v9.8h, v1.8b\n"
7494 "uaddw v10.8h, v10.8h, v2.8b\n"
7495 "uaddw v11.8h, v11.8h, v3.8b\n"
7496 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
7497
7498 // Aggregator Reduction.
7499 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
7500 "dup v1.4s, %w[additive_sum_offset]\n"
7501 "uaddlp v8.4s, v8.8h\n"
7502 "uaddlp v9.4s, v9.8h\n"
7503 "uaddlp v10.4s, v10.8h\n"
7504 "uaddlp v11.4s, v11.8h\n"
7505 "addp v8.4s, v8.4s, v9.4s\n"
7506 "addp v10.4s, v10.4s, v11.4s\n"
7507 "addp v8.4s, v8.4s, v10.4s\n"
7508 "mul v8.4s, v8.4s, v0.s[0]\n"
7509 "add v8.4s, v8.4s, v1.4s\n"
7510 "st1 {v8.4s}, [%x[out]]\n"
7511 : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
7512 [out] "+r"(out), [in] "+r"(in)
7513 : [additive_sum_offset] "r"(params.additive_sum_offset),
7514 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
7515 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
7516 "v11", "cc", "memory");
7517 }
7518
7519 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)7520 inline void Stream<uint8_t, 4, 8, 3, ColumnMajorWithSum>::Pack(
7521 const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
7522 #ifdef DEBUG
7523 #ifdef DEBUG_METAGEMM_VERBOSE
7524 std::cout
7525 << __FILE__ << "(" << __LINE__
7526 << ") ColumnMajorWithSum<uint8_t, 4, 8, 3, ColumnMajorWithSum>::Pack()"
7527 << std::endl
7528 << std::flush;
7529 #endif
7530 #endif
7531 int params_count_copy = params.count;
7532 int params_stride_copy = params.stride;
7533 asm volatile(
7534 "movi v8.8h, #0\n"
7535 "movi v9.8h, #0\n"
7536 "movi v10.8h, #0\n"
7537 "movi v11.8h, #0\n"
7538
7539 // Reduce count by leftovers.
7540 "subs %x[count], %x[count], #3\n"
7541 "beq 2f\n"
7542
7543 "1:"
7544 "subs %x[count], %x[count], #8\n"
7545
7546 // Load Aggregate Store - column major 4x8
7547 "ld1 {v0.s}[0], [%x[in]], %x[stride]\n"
7548 "ld1 {v1.s}[0], [%x[in]], %x[stride]\n"
7549 "ld1 {v2.s}[0], [%x[in]], %x[stride]\n"
7550 "ld1 {v3.s}[0], [%x[in]], %x[stride]\n"
7551 "ld1 {v0.s}[1], [%x[in]], %x[stride]\n"
7552 "ld1 {v1.s}[1], [%x[in]], %x[stride]\n"
7553 "ld1 {v2.s}[1], [%x[in]], %x[stride]\n"
7554 "ld1 {v3.s}[1], [%x[in]], %x[stride]\n"
7555 "prfm pldl1keep, [%x[in]]\n"
7556 "trn1 v4.4h, v0.4h, v2.4h\n"
7557 "trn2 v6.4h, v0.4h, v2.4h\n"
7558 "trn1 v5.4h, v1.4h, v3.4h\n"
7559 "trn2 v7.4h, v1.4h, v3.4h\n"
7560 "trn1 v0.8b, v4.8b, v5.8b\n"
7561 "trn2 v1.8b, v4.8b, v5.8b\n"
7562 "trn1 v2.8b, v6.8b, v7.8b\n"
7563 "trn2 v3.8b, v6.8b, v7.8b\n"
7564 "uaddw v8.8h, v8.8h, v0.8b\n"
7565 "uaddw v9.8h, v9.8h, v1.8b\n"
7566 "uaddw v10.8h, v10.8h, v2.8b\n"
7567 "uaddw v11.8h, v11.8h, v3.8b\n"
7568 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
7569
7570 "bne 1b\n"
7571
7572 "2:"
7573
7574 // Load Aggregate Store - column major 4x3
7575 "movi v0.8b, #0\n"
7576 "movi v1.8b, #0\n"
7577 "movi v2.8b, #0\n"
7578 "movi v3.8b, #0\n"
7579 "ld1 {v0.s}[0], [%x[in]], %x[stride]\n"
7580 "ld1 {v1.s}[0], [%x[in]], %x[stride]\n"
7581 "ld1 {v2.s}[0], [%x[in]], %x[stride]\n"
7582 "prfm pldl1keep, [%x[in]]\n"
7583 "trn1 v4.4h, v0.4h, v2.4h\n"
7584 "trn2 v6.4h, v0.4h, v2.4h\n"
7585 "trn1 v5.4h, v1.4h, v3.4h\n"
7586 "trn2 v7.4h, v1.4h, v3.4h\n"
7587 "trn1 v0.8b, v4.8b, v5.8b\n"
7588 "trn2 v1.8b, v4.8b, v5.8b\n"
7589 "trn1 v2.8b, v6.8b, v7.8b\n"
7590 "trn2 v3.8b, v6.8b, v7.8b\n"
7591 "uaddw v8.8h, v8.8h, v0.8b\n"
7592 "uaddw v9.8h, v9.8h, v1.8b\n"
7593 "uaddw v10.8h, v10.8h, v2.8b\n"
7594 "uaddw v11.8h, v11.8h, v3.8b\n"
7595 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
7596
7597 // Aggregator Reduction.
7598 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
7599 "dup v1.4s, %w[additive_sum_offset]\n"
7600 "uaddlp v8.4s, v8.8h\n"
7601 "uaddlp v9.4s, v9.8h\n"
7602 "uaddlp v10.4s, v10.8h\n"
7603 "uaddlp v11.4s, v11.8h\n"
7604 "addp v8.4s, v8.4s, v9.4s\n"
7605 "addp v10.4s, v10.4s, v11.4s\n"
7606 "addp v8.4s, v8.4s, v10.4s\n"
7607 "mul v8.4s, v8.4s, v0.s[0]\n"
7608 "add v8.4s, v8.4s, v1.4s\n"
7609 "st1 {v8.4s}, [%x[out]]\n"
7610 : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
7611 [out] "+r"(out), [in] "+r"(in)
7612 : [additive_sum_offset] "r"(params.additive_sum_offset),
7613 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
7614 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
7615 "v11", "cc", "memory");
7616 }
7617
7618 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)7619 inline void Stream<uint8_t, 4, 8, 4, ColumnMajorWithSum>::Pack(
7620 const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
7621 #ifdef DEBUG
7622 #ifdef DEBUG_METAGEMM_VERBOSE
7623 std::cout
7624 << __FILE__ << "(" << __LINE__
7625 << ") ColumnMajorWithSum<uint8_t, 4, 8, 4, ColumnMajorWithSum>::Pack()"
7626 << std::endl
7627 << std::flush;
7628 #endif
7629 #endif
7630 int params_count_copy = params.count;
7631 int params_stride_copy = params.stride;
7632 asm volatile(
7633 "movi v8.8h, #0\n"
7634 "movi v9.8h, #0\n"
7635 "movi v10.8h, #0\n"
7636 "movi v11.8h, #0\n"
7637
7638 // Reduce count by leftovers.
7639 "subs %x[count], %x[count], #4\n"
7640 "beq 2f\n"
7641
7642 "1:"
7643 "subs %x[count], %x[count], #8\n"
7644
7645 // Load Aggregate Store - column major 4x8
7646 "ld1 {v0.s}[0], [%x[in]], %x[stride]\n"
7647 "ld1 {v1.s}[0], [%x[in]], %x[stride]\n"
7648 "ld1 {v2.s}[0], [%x[in]], %x[stride]\n"
7649 "ld1 {v3.s}[0], [%x[in]], %x[stride]\n"
7650 "ld1 {v0.s}[1], [%x[in]], %x[stride]\n"
7651 "ld1 {v1.s}[1], [%x[in]], %x[stride]\n"
7652 "ld1 {v2.s}[1], [%x[in]], %x[stride]\n"
7653 "ld1 {v3.s}[1], [%x[in]], %x[stride]\n"
7654 "prfm pldl1keep, [%x[in]]\n"
7655 "trn1 v4.4h, v0.4h, v2.4h\n"
7656 "trn2 v6.4h, v0.4h, v2.4h\n"
7657 "trn1 v5.4h, v1.4h, v3.4h\n"
7658 "trn2 v7.4h, v1.4h, v3.4h\n"
7659 "trn1 v0.8b, v4.8b, v5.8b\n"
7660 "trn2 v1.8b, v4.8b, v5.8b\n"
7661 "trn1 v2.8b, v6.8b, v7.8b\n"
7662 "trn2 v3.8b, v6.8b, v7.8b\n"
7663 "uaddw v8.8h, v8.8h, v0.8b\n"
7664 "uaddw v9.8h, v9.8h, v1.8b\n"
7665 "uaddw v10.8h, v10.8h, v2.8b\n"
7666 "uaddw v11.8h, v11.8h, v3.8b\n"
7667 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
7668
7669 "bne 1b\n"
7670
7671 "2:"
7672
7673 // Load Aggregate Store - column major 4x4
7674 "movi v0.8b, #0\n"
7675 "movi v1.8b, #0\n"
7676 "movi v2.8b, #0\n"
7677 "movi v3.8b, #0\n"
7678 "ld1 {v0.s}[0], [%x[in]], %x[stride]\n"
7679 "ld1 {v1.s}[0], [%x[in]], %x[stride]\n"
7680 "ld1 {v2.s}[0], [%x[in]], %x[stride]\n"
7681 "ld1 {v3.s}[0], [%x[in]], %x[stride]\n"
7682 "prfm pldl1keep, [%x[in]]\n"
7683 "trn1 v4.4h, v0.4h, v2.4h\n"
7684 "trn2 v6.4h, v0.4h, v2.4h\n"
7685 "trn1 v5.4h, v1.4h, v3.4h\n"
7686 "trn2 v7.4h, v1.4h, v3.4h\n"
7687 "trn1 v0.8b, v4.8b, v5.8b\n"
7688 "trn2 v1.8b, v4.8b, v5.8b\n"
7689 "trn1 v2.8b, v6.8b, v7.8b\n"
7690 "trn2 v3.8b, v6.8b, v7.8b\n"
7691 "uaddw v8.8h, v8.8h, v0.8b\n"
7692 "uaddw v9.8h, v9.8h, v1.8b\n"
7693 "uaddw v10.8h, v10.8h, v2.8b\n"
7694 "uaddw v11.8h, v11.8h, v3.8b\n"
7695 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
7696
7697 // Aggregator Reduction.
7698 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
7699 "dup v1.4s, %w[additive_sum_offset]\n"
7700 "uaddlp v8.4s, v8.8h\n"
7701 "uaddlp v9.4s, v9.8h\n"
7702 "uaddlp v10.4s, v10.8h\n"
7703 "uaddlp v11.4s, v11.8h\n"
7704 "addp v8.4s, v8.4s, v9.4s\n"
7705 "addp v10.4s, v10.4s, v11.4s\n"
7706 "addp v8.4s, v8.4s, v10.4s\n"
7707 "mul v8.4s, v8.4s, v0.s[0]\n"
7708 "add v8.4s, v8.4s, v1.4s\n"
7709 "st1 {v8.4s}, [%x[out]]\n"
7710 : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
7711 [out] "+r"(out), [in] "+r"(in)
7712 : [additive_sum_offset] "r"(params.additive_sum_offset),
7713 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
7714 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
7715 "v11", "cc", "memory");
7716 }
7717
7718 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)7719 inline void Stream<uint8_t, 4, 8, 5, ColumnMajorWithSum>::Pack(
7720 const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
7721 #ifdef DEBUG
7722 #ifdef DEBUG_METAGEMM_VERBOSE
7723 std::cout
7724 << __FILE__ << "(" << __LINE__
7725 << ") ColumnMajorWithSum<uint8_t, 4, 8, 5, ColumnMajorWithSum>::Pack()"
7726 << std::endl
7727 << std::flush;
7728 #endif
7729 #endif
7730 int params_count_copy = params.count;
7731 int params_stride_copy = params.stride;
7732 asm volatile(
7733 "movi v8.8h, #0\n"
7734 "movi v9.8h, #0\n"
7735 "movi v10.8h, #0\n"
7736 "movi v11.8h, #0\n"
7737
7738 // Reduce count by leftovers.
7739 "subs %x[count], %x[count], #5\n"
7740 "beq 2f\n"
7741
7742 "1:"
7743 "subs %x[count], %x[count], #8\n"
7744
7745 // Load Aggregate Store - column major 4x8
7746 "ld1 {v0.s}[0], [%x[in]], %x[stride]\n"
7747 "ld1 {v1.s}[0], [%x[in]], %x[stride]\n"
7748 "ld1 {v2.s}[0], [%x[in]], %x[stride]\n"
7749 "ld1 {v3.s}[0], [%x[in]], %x[stride]\n"
7750 "ld1 {v0.s}[1], [%x[in]], %x[stride]\n"
7751 "ld1 {v1.s}[1], [%x[in]], %x[stride]\n"
7752 "ld1 {v2.s}[1], [%x[in]], %x[stride]\n"
7753 "ld1 {v3.s}[1], [%x[in]], %x[stride]\n"
7754 "prfm pldl1keep, [%x[in]]\n"
7755 "trn1 v4.4h, v0.4h, v2.4h\n"
7756 "trn2 v6.4h, v0.4h, v2.4h\n"
7757 "trn1 v5.4h, v1.4h, v3.4h\n"
7758 "trn2 v7.4h, v1.4h, v3.4h\n"
7759 "trn1 v0.8b, v4.8b, v5.8b\n"
7760 "trn2 v1.8b, v4.8b, v5.8b\n"
7761 "trn1 v2.8b, v6.8b, v7.8b\n"
7762 "trn2 v3.8b, v6.8b, v7.8b\n"
7763 "uaddw v8.8h, v8.8h, v0.8b\n"
7764 "uaddw v9.8h, v9.8h, v1.8b\n"
7765 "uaddw v10.8h, v10.8h, v2.8b\n"
7766 "uaddw v11.8h, v11.8h, v3.8b\n"
7767 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
7768
7769 "bne 1b\n"
7770
7771 "2:"
7772
7773 // Load Aggregate Store - column major 4x5
7774 "movi v0.8b, #0\n"
7775 "movi v1.8b, #0\n"
7776 "movi v2.8b, #0\n"
7777 "movi v3.8b, #0\n"
7778 "ld1 {v0.s}[0], [%x[in]], %x[stride]\n"
7779 "ld1 {v1.s}[0], [%x[in]], %x[stride]\n"
7780 "ld1 {v2.s}[0], [%x[in]], %x[stride]\n"
7781 "ld1 {v3.s}[0], [%x[in]], %x[stride]\n"
7782 "ld1 {v0.s}[1], [%x[in]], %x[stride]\n"
7783 "prfm pldl1keep, [%x[in]]\n"
7784 "trn1 v4.4h, v0.4h, v2.4h\n"
7785 "trn2 v6.4h, v0.4h, v2.4h\n"
7786 "trn1 v5.4h, v1.4h, v3.4h\n"
7787 "trn2 v7.4h, v1.4h, v3.4h\n"
7788 "trn1 v0.8b, v4.8b, v5.8b\n"
7789 "trn2 v1.8b, v4.8b, v5.8b\n"
7790 "trn1 v2.8b, v6.8b, v7.8b\n"
7791 "trn2 v3.8b, v6.8b, v7.8b\n"
7792 "uaddw v8.8h, v8.8h, v0.8b\n"
7793 "uaddw v9.8h, v9.8h, v1.8b\n"
7794 "uaddw v10.8h, v10.8h, v2.8b\n"
7795 "uaddw v11.8h, v11.8h, v3.8b\n"
7796 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
7797
7798 // Aggregator Reduction.
7799 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
7800 "dup v1.4s, %w[additive_sum_offset]\n"
7801 "uaddlp v8.4s, v8.8h\n"
7802 "uaddlp v9.4s, v9.8h\n"
7803 "uaddlp v10.4s, v10.8h\n"
7804 "uaddlp v11.4s, v11.8h\n"
7805 "addp v8.4s, v8.4s, v9.4s\n"
7806 "addp v10.4s, v10.4s, v11.4s\n"
7807 "addp v8.4s, v8.4s, v10.4s\n"
7808 "mul v8.4s, v8.4s, v0.s[0]\n"
7809 "add v8.4s, v8.4s, v1.4s\n"
7810 "st1 {v8.4s}, [%x[out]]\n"
7811 : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
7812 [out] "+r"(out), [in] "+r"(in)
7813 : [additive_sum_offset] "r"(params.additive_sum_offset),
7814 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
7815 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
7816 "v11", "cc", "memory");
7817 }
7818
7819 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)7820 inline void Stream<uint8_t, 4, 8, 6, ColumnMajorWithSum>::Pack(
7821 const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
7822 #ifdef DEBUG
7823 #ifdef DEBUG_METAGEMM_VERBOSE
7824 std::cout
7825 << __FILE__ << "(" << __LINE__
7826 << ") ColumnMajorWithSum<uint8_t, 4, 8, 6, ColumnMajorWithSum>::Pack()"
7827 << std::endl
7828 << std::flush;
7829 #endif
7830 #endif
7831 int params_count_copy = params.count;
7832 int params_stride_copy = params.stride;
7833 asm volatile(
7834 "movi v8.8h, #0\n"
7835 "movi v9.8h, #0\n"
7836 "movi v10.8h, #0\n"
7837 "movi v11.8h, #0\n"
7838
7839 // Reduce count by leftovers.
7840 "subs %x[count], %x[count], #6\n"
7841 "beq 2f\n"
7842
7843 "1:"
7844 "subs %x[count], %x[count], #8\n"
7845
7846 // Load Aggregate Store - column major 4x8
7847 "ld1 {v0.s}[0], [%x[in]], %x[stride]\n"
7848 "ld1 {v1.s}[0], [%x[in]], %x[stride]\n"
7849 "ld1 {v2.s}[0], [%x[in]], %x[stride]\n"
7850 "ld1 {v3.s}[0], [%x[in]], %x[stride]\n"
7851 "ld1 {v0.s}[1], [%x[in]], %x[stride]\n"
7852 "ld1 {v1.s}[1], [%x[in]], %x[stride]\n"
7853 "ld1 {v2.s}[1], [%x[in]], %x[stride]\n"
7854 "ld1 {v3.s}[1], [%x[in]], %x[stride]\n"
7855 "prfm pldl1keep, [%x[in]]\n"
7856 "trn1 v4.4h, v0.4h, v2.4h\n"
7857 "trn2 v6.4h, v0.4h, v2.4h\n"
7858 "trn1 v5.4h, v1.4h, v3.4h\n"
7859 "trn2 v7.4h, v1.4h, v3.4h\n"
7860 "trn1 v0.8b, v4.8b, v5.8b\n"
7861 "trn2 v1.8b, v4.8b, v5.8b\n"
7862 "trn1 v2.8b, v6.8b, v7.8b\n"
7863 "trn2 v3.8b, v6.8b, v7.8b\n"
7864 "uaddw v8.8h, v8.8h, v0.8b\n"
7865 "uaddw v9.8h, v9.8h, v1.8b\n"
7866 "uaddw v10.8h, v10.8h, v2.8b\n"
7867 "uaddw v11.8h, v11.8h, v3.8b\n"
7868 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
7869
7870 "bne 1b\n"
7871
7872 "2:"
7873
7874 // Load Aggregate Store - column major 4x6
7875 "movi v0.8b, #0\n"
7876 "movi v1.8b, #0\n"
7877 "movi v2.8b, #0\n"
7878 "movi v3.8b, #0\n"
7879 "ld1 {v0.s}[0], [%x[in]], %x[stride]\n"
7880 "ld1 {v1.s}[0], [%x[in]], %x[stride]\n"
7881 "ld1 {v2.s}[0], [%x[in]], %x[stride]\n"
7882 "ld1 {v3.s}[0], [%x[in]], %x[stride]\n"
7883 "ld1 {v0.s}[1], [%x[in]], %x[stride]\n"
7884 "ld1 {v1.s}[1], [%x[in]], %x[stride]\n"
7885 "prfm pldl1keep, [%x[in]]\n"
7886 "trn1 v4.4h, v0.4h, v2.4h\n"
7887 "trn2 v6.4h, v0.4h, v2.4h\n"
7888 "trn1 v5.4h, v1.4h, v3.4h\n"
7889 "trn2 v7.4h, v1.4h, v3.4h\n"
7890 "trn1 v0.8b, v4.8b, v5.8b\n"
7891 "trn2 v1.8b, v4.8b, v5.8b\n"
7892 "trn1 v2.8b, v6.8b, v7.8b\n"
7893 "trn2 v3.8b, v6.8b, v7.8b\n"
7894 "uaddw v8.8h, v8.8h, v0.8b\n"
7895 "uaddw v9.8h, v9.8h, v1.8b\n"
7896 "uaddw v10.8h, v10.8h, v2.8b\n"
7897 "uaddw v11.8h, v11.8h, v3.8b\n"
7898 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
7899
7900 // Aggregator Reduction.
7901 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
7902 "dup v1.4s, %w[additive_sum_offset]\n"
7903 "uaddlp v8.4s, v8.8h\n"
7904 "uaddlp v9.4s, v9.8h\n"
7905 "uaddlp v10.4s, v10.8h\n"
7906 "uaddlp v11.4s, v11.8h\n"
7907 "addp v8.4s, v8.4s, v9.4s\n"
7908 "addp v10.4s, v10.4s, v11.4s\n"
7909 "addp v8.4s, v8.4s, v10.4s\n"
7910 "mul v8.4s, v8.4s, v0.s[0]\n"
7911 "add v8.4s, v8.4s, v1.4s\n"
7912 "st1 {v8.4s}, [%x[out]]\n"
7913 : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
7914 [out] "+r"(out), [in] "+r"(in)
7915 : [additive_sum_offset] "r"(params.additive_sum_offset),
7916 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
7917 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
7918 "v11", "cc", "memory");
7919 }
7920
7921 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)7922 inline void Stream<uint8_t, 4, 8, 7, ColumnMajorWithSum>::Pack(
7923 const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
7924 #ifdef DEBUG
7925 #ifdef DEBUG_METAGEMM_VERBOSE
7926 std::cout
7927 << __FILE__ << "(" << __LINE__
7928 << ") ColumnMajorWithSum<uint8_t, 4, 8, 7, ColumnMajorWithSum>::Pack()"
7929 << std::endl
7930 << std::flush;
7931 #endif
7932 #endif
7933 int params_count_copy = params.count;
7934 int params_stride_copy = params.stride;
7935 asm volatile(
7936 "movi v8.8h, #0\n"
7937 "movi v9.8h, #0\n"
7938 "movi v10.8h, #0\n"
7939 "movi v11.8h, #0\n"
7940
7941 // Reduce count by leftovers.
7942 "subs %x[count], %x[count], #7\n"
7943 "beq 2f\n"
7944
7945 "1:"
7946 "subs %x[count], %x[count], #8\n"
7947
7948 // Load Aggregate Store - column major 4x8
7949 "ld1 {v0.s}[0], [%x[in]], %x[stride]\n"
7950 "ld1 {v1.s}[0], [%x[in]], %x[stride]\n"
7951 "ld1 {v2.s}[0], [%x[in]], %x[stride]\n"
7952 "ld1 {v3.s}[0], [%x[in]], %x[stride]\n"
7953 "ld1 {v0.s}[1], [%x[in]], %x[stride]\n"
7954 "ld1 {v1.s}[1], [%x[in]], %x[stride]\n"
7955 "ld1 {v2.s}[1], [%x[in]], %x[stride]\n"
7956 "ld1 {v3.s}[1], [%x[in]], %x[stride]\n"
7957 "prfm pldl1keep, [%x[in]]\n"
7958 "trn1 v4.4h, v0.4h, v2.4h\n"
7959 "trn2 v6.4h, v0.4h, v2.4h\n"
7960 "trn1 v5.4h, v1.4h, v3.4h\n"
7961 "trn2 v7.4h, v1.4h, v3.4h\n"
7962 "trn1 v0.8b, v4.8b, v5.8b\n"
7963 "trn2 v1.8b, v4.8b, v5.8b\n"
7964 "trn1 v2.8b, v6.8b, v7.8b\n"
7965 "trn2 v3.8b, v6.8b, v7.8b\n"
7966 "uaddw v8.8h, v8.8h, v0.8b\n"
7967 "uaddw v9.8h, v9.8h, v1.8b\n"
7968 "uaddw v10.8h, v10.8h, v2.8b\n"
7969 "uaddw v11.8h, v11.8h, v3.8b\n"
7970 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
7971
7972 "bne 1b\n"
7973
7974 "2:"
7975
7976 // Load Aggregate Store - column major 4x7
7977 "movi v0.8b, #0\n"
7978 "movi v1.8b, #0\n"
7979 "movi v2.8b, #0\n"
7980 "movi v3.8b, #0\n"
7981 "ld1 {v0.s}[0], [%x[in]], %x[stride]\n"
7982 "ld1 {v1.s}[0], [%x[in]], %x[stride]\n"
7983 "ld1 {v2.s}[0], [%x[in]], %x[stride]\n"
7984 "ld1 {v3.s}[0], [%x[in]], %x[stride]\n"
7985 "ld1 {v0.s}[1], [%x[in]], %x[stride]\n"
7986 "ld1 {v1.s}[1], [%x[in]], %x[stride]\n"
7987 "ld1 {v2.s}[1], [%x[in]], %x[stride]\n"
7988 "prfm pldl1keep, [%x[in]]\n"
7989 "trn1 v4.4h, v0.4h, v2.4h\n"
7990 "trn2 v6.4h, v0.4h, v2.4h\n"
7991 "trn1 v5.4h, v1.4h, v3.4h\n"
7992 "trn2 v7.4h, v1.4h, v3.4h\n"
7993 "trn1 v0.8b, v4.8b, v5.8b\n"
7994 "trn2 v1.8b, v4.8b, v5.8b\n"
7995 "trn1 v2.8b, v6.8b, v7.8b\n"
7996 "trn2 v3.8b, v6.8b, v7.8b\n"
7997 "uaddw v8.8h, v8.8h, v0.8b\n"
7998 "uaddw v9.8h, v9.8h, v1.8b\n"
7999 "uaddw v10.8h, v10.8h, v2.8b\n"
8000 "uaddw v11.8h, v11.8h, v3.8b\n"
8001 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
8002
8003 // Aggregator Reduction.
8004 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
8005 "dup v1.4s, %w[additive_sum_offset]\n"
8006 "uaddlp v8.4s, v8.8h\n"
8007 "uaddlp v9.4s, v9.8h\n"
8008 "uaddlp v10.4s, v10.8h\n"
8009 "uaddlp v11.4s, v11.8h\n"
8010 "addp v8.4s, v8.4s, v9.4s\n"
8011 "addp v10.4s, v10.4s, v11.4s\n"
8012 "addp v8.4s, v8.4s, v10.4s\n"
8013 "mul v8.4s, v8.4s, v0.s[0]\n"
8014 "add v8.4s, v8.4s, v1.4s\n"
8015 "st1 {v8.4s}, [%x[out]]\n"
8016 : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
8017 [out] "+r"(out), [in] "+r"(in)
8018 : [additive_sum_offset] "r"(params.additive_sum_offset),
8019 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
8020 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
8021 "v11", "cc", "memory");
8022 }
8023
8024 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)8025 inline void Stream<uint8_t, 5, 8, 0, ColumnMajorWithSum>::Pack(
8026 const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
8027 #ifdef DEBUG
8028 #ifdef DEBUG_METAGEMM_VERBOSE
8029 std::cout
8030 << __FILE__ << "(" << __LINE__
8031 << ") ColumnMajorWithSum<uint8_t, 5, 8, 0, ColumnMajorWithSum>::Pack()"
8032 << std::endl
8033 << std::flush;
8034 #endif
8035 #endif
8036 int params_count_copy = params.count;
8037 int params_stride_copy = params.stride;
8038 asm volatile(
8039 "sub %x[stride], %x[stride], #4\n"
8040 "movi v8.8h, #0\n"
8041 "movi v9.8h, #0\n"
8042 "movi v10.8h, #0\n"
8043 "movi v11.8h, #0\n"
8044 "movi v12.8h, #0\n"
8045
8046 "1:"
8047 "subs %x[count], %x[count], #8\n"
8048
8049 // Load Aggregate Store - column major 5x8
8050 "ld1 {v0.s}[0], [%x[in]], #4\n"
8051 "ld1 {v4.b}[0], [%x[in]], %x[stride]\n"
8052 "ld1 {v1.s}[0], [%x[in]], #4\n"
8053 "ld1 {v4.b}[1], [%x[in]], %x[stride]\n"
8054 "ld1 {v2.s}[0], [%x[in]], #4\n"
8055 "ld1 {v4.b}[2], [%x[in]], %x[stride]\n"
8056 "ld1 {v3.s}[0], [%x[in]], #4\n"
8057 "ld1 {v4.b}[3], [%x[in]], %x[stride]\n"
8058 "ld1 {v0.s}[1], [%x[in]], #4\n"
8059 "ld1 {v4.b}[4], [%x[in]], %x[stride]\n"
8060 "ld1 {v1.s}[1], [%x[in]], #4\n"
8061 "ld1 {v4.b}[5], [%x[in]], %x[stride]\n"
8062 "ld1 {v2.s}[1], [%x[in]], #4\n"
8063 "ld1 {v4.b}[6], [%x[in]], %x[stride]\n"
8064 "ld1 {v3.s}[1], [%x[in]], #4\n"
8065 "ld1 {v4.b}[7], [%x[in]], %x[stride]\n"
8066 "prfm pldl1keep, [%x[in]]\n"
8067 "trn1 v5.4h, v0.4h, v2.4h\n"
8068 "trn2 v7.4h, v0.4h, v2.4h\n"
8069 "trn1 v6.4h, v1.4h, v3.4h\n"
8070 "trn2 v13.4h, v1.4h, v3.4h\n"
8071 "trn1 v0.8b, v5.8b, v6.8b\n"
8072 "trn2 v1.8b, v5.8b, v6.8b\n"
8073 "trn1 v2.8b, v7.8b, v13.8b\n"
8074 "trn2 v3.8b, v7.8b, v13.8b\n"
8075 "uaddw v8.8h, v8.8h, v0.8b\n"
8076 "uaddw v9.8h, v9.8h, v1.8b\n"
8077 "uaddw v10.8h, v10.8h, v2.8b\n"
8078 "uaddw v11.8h, v11.8h, v3.8b\n"
8079 "uaddw v12.8h, v12.8h, v4.8b\n"
8080 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
8081 "st1 {v4.2s}, [%x[out]], #8\n"
8082
8083 "bne 1b\n"
8084
8085 // Aggregator Reduction.
8086 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
8087 "dup v1.4s, %w[additive_sum_offset]\n"
8088 "uaddlp v8.4s, v8.8h\n"
8089 "uaddlp v9.4s, v9.8h\n"
8090 "uaddlp v10.4s, v10.8h\n"
8091 "uaddlp v11.4s, v11.8h\n"
8092 "uaddlp v12.4s, v12.8h\n"
8093 "addp v8.4s, v8.4s, v9.4s\n"
8094 "addp v10.4s, v10.4s, v11.4s\n"
8095 "addp v12.4s, v12.4s, v12.4s\n"
8096 "addp v8.4s, v8.4s, v10.4s\n"
8097 "addp v9.4s, v12.4s, v12.4s\n"
8098 "mul v8.4s, v8.4s, v0.s[0]\n"
8099 "mul v9.4s, v9.4s, v0.s[0]\n"
8100 "add v8.4s, v8.4s, v1.4s\n"
8101 "add v9.4s, v9.4s, v1.4s\n"
8102 "st1 {v8.4s, v9.4s}, [%x[out]]\n"
8103 : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
8104 [out] "+r"(out), [in] "+r"(in)
8105 : [additive_sum_offset] "r"(params.additive_sum_offset),
8106 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
8107 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
8108 "v11", "v12", "v13", "cc", "memory");
8109 }
8110
8111 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)8112 inline void Stream<uint8_t, 5, 8, 1, ColumnMajorWithSum>::Pack(
8113 const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
8114 #ifdef DEBUG
8115 #ifdef DEBUG_METAGEMM_VERBOSE
8116 std::cout
8117 << __FILE__ << "(" << __LINE__
8118 << ") ColumnMajorWithSum<uint8_t, 5, 8, 1, ColumnMajorWithSum>::Pack()"
8119 << std::endl
8120 << std::flush;
8121 #endif
8122 #endif
8123 int params_count_copy = params.count;
8124 int params_stride_copy = params.stride;
8125 asm volatile(
8126 "sub %x[stride], %x[stride], #4\n"
8127 "movi v8.8h, #0\n"
8128 "movi v9.8h, #0\n"
8129 "movi v10.8h, #0\n"
8130 "movi v11.8h, #0\n"
8131 "movi v12.8h, #0\n"
8132
8133 // Reduce count by leftovers.
8134 "subs %x[count], %x[count], #1\n"
8135 "beq 2f\n"
8136
8137 "1:"
8138 "subs %x[count], %x[count], #8\n"
8139
8140 // Load Aggregate Store - column major 5x8
8141 "ld1 {v0.s}[0], [%x[in]], #4\n"
8142 "ld1 {v4.b}[0], [%x[in]], %x[stride]\n"
8143 "ld1 {v1.s}[0], [%x[in]], #4\n"
8144 "ld1 {v4.b}[1], [%x[in]], %x[stride]\n"
8145 "ld1 {v2.s}[0], [%x[in]], #4\n"
8146 "ld1 {v4.b}[2], [%x[in]], %x[stride]\n"
8147 "ld1 {v3.s}[0], [%x[in]], #4\n"
8148 "ld1 {v4.b}[3], [%x[in]], %x[stride]\n"
8149 "ld1 {v0.s}[1], [%x[in]], #4\n"
8150 "ld1 {v4.b}[4], [%x[in]], %x[stride]\n"
8151 "ld1 {v1.s}[1], [%x[in]], #4\n"
8152 "ld1 {v4.b}[5], [%x[in]], %x[stride]\n"
8153 "ld1 {v2.s}[1], [%x[in]], #4\n"
8154 "ld1 {v4.b}[6], [%x[in]], %x[stride]\n"
8155 "ld1 {v3.s}[1], [%x[in]], #4\n"
8156 "ld1 {v4.b}[7], [%x[in]], %x[stride]\n"
8157 "prfm pldl1keep, [%x[in]]\n"
8158 "trn1 v5.4h, v0.4h, v2.4h\n"
8159 "trn2 v7.4h, v0.4h, v2.4h\n"
8160 "trn1 v6.4h, v1.4h, v3.4h\n"
8161 "trn2 v13.4h, v1.4h, v3.4h\n"
8162 "trn1 v0.8b, v5.8b, v6.8b\n"
8163 "trn2 v1.8b, v5.8b, v6.8b\n"
8164 "trn1 v2.8b, v7.8b, v13.8b\n"
8165 "trn2 v3.8b, v7.8b, v13.8b\n"
8166 "uaddw v8.8h, v8.8h, v0.8b\n"
8167 "uaddw v9.8h, v9.8h, v1.8b\n"
8168 "uaddw v10.8h, v10.8h, v2.8b\n"
8169 "uaddw v11.8h, v11.8h, v3.8b\n"
8170 "uaddw v12.8h, v12.8h, v4.8b\n"
8171 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
8172 "st1 {v4.2s}, [%x[out]], #8\n"
8173
8174 "bne 1b\n"
8175
8176 "2:"
8177
8178 // Load Aggregate Store - column major 5x1
8179 "movi v0.8b, #0\n"
8180 "movi v1.8b, #0\n"
8181 "movi v2.8b, #0\n"
8182 "movi v3.8b, #0\n"
8183 "movi v4.8b, #0\n"
8184 "ld1 {v0.s}[0], [%x[in]], #4\n"
8185 "ld1 {v4.b}[0], [%x[in]], %x[stride]\n"
8186 "prfm pldl1keep, [%x[in]]\n"
8187 "trn1 v5.4h, v0.4h, v2.4h\n"
8188 "trn2 v7.4h, v0.4h, v2.4h\n"
8189 "trn1 v6.4h, v1.4h, v3.4h\n"
8190 "trn2 v13.4h, v1.4h, v3.4h\n"
8191 "trn1 v0.8b, v5.8b, v6.8b\n"
8192 "trn2 v1.8b, v5.8b, v6.8b\n"
8193 "trn1 v2.8b, v7.8b, v13.8b\n"
8194 "trn2 v3.8b, v7.8b, v13.8b\n"
8195 "uaddw v8.8h, v8.8h, v0.8b\n"
8196 "uaddw v9.8h, v9.8h, v1.8b\n"
8197 "uaddw v10.8h, v10.8h, v2.8b\n"
8198 "uaddw v11.8h, v11.8h, v3.8b\n"
8199 "uaddw v12.8h, v12.8h, v4.8b\n"
8200 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
8201 "st1 {v4.2s}, [%x[out]], #8\n"
8202
8203 // Aggregator Reduction.
8204 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
8205 "dup v1.4s, %w[additive_sum_offset]\n"
8206 "uaddlp v8.4s, v8.8h\n"
8207 "uaddlp v9.4s, v9.8h\n"
8208 "uaddlp v10.4s, v10.8h\n"
8209 "uaddlp v11.4s, v11.8h\n"
8210 "uaddlp v12.4s, v12.8h\n"
8211 "addp v8.4s, v8.4s, v9.4s\n"
8212 "addp v10.4s, v10.4s, v11.4s\n"
8213 "addp v12.4s, v12.4s, v12.4s\n"
8214 "addp v8.4s, v8.4s, v10.4s\n"
8215 "addp v9.4s, v12.4s, v12.4s\n"
8216 "mul v8.4s, v8.4s, v0.s[0]\n"
8217 "mul v9.4s, v9.4s, v0.s[0]\n"
8218 "add v8.4s, v8.4s, v1.4s\n"
8219 "add v9.4s, v9.4s, v1.4s\n"
8220 "st1 {v8.4s, v9.4s}, [%x[out]]\n"
8221 : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
8222 [out] "+r"(out), [in] "+r"(in)
8223 : [additive_sum_offset] "r"(params.additive_sum_offset),
8224 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
8225 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
8226 "v11", "v12", "v13", "cc", "memory");
8227 }
8228
8229 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)8230 inline void Stream<uint8_t, 5, 8, 2, ColumnMajorWithSum>::Pack(
8231 const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
8232 #ifdef DEBUG
8233 #ifdef DEBUG_METAGEMM_VERBOSE
8234 std::cout
8235 << __FILE__ << "(" << __LINE__
8236 << ") ColumnMajorWithSum<uint8_t, 5, 8, 2, ColumnMajorWithSum>::Pack()"
8237 << std::endl
8238 << std::flush;
8239 #endif
8240 #endif
8241 int params_count_copy = params.count;
8242 int params_stride_copy = params.stride;
8243 asm volatile(
8244 "sub %x[stride], %x[stride], #4\n"
8245 "movi v8.8h, #0\n"
8246 "movi v9.8h, #0\n"
8247 "movi v10.8h, #0\n"
8248 "movi v11.8h, #0\n"
8249 "movi v12.8h, #0\n"
8250
8251 // Reduce count by leftovers.
8252 "subs %x[count], %x[count], #2\n"
8253 "beq 2f\n"
8254
8255 "1:"
8256 "subs %x[count], %x[count], #8\n"
8257
8258 // Load Aggregate Store - column major 5x8
8259 "ld1 {v0.s}[0], [%x[in]], #4\n"
8260 "ld1 {v4.b}[0], [%x[in]], %x[stride]\n"
8261 "ld1 {v1.s}[0], [%x[in]], #4\n"
8262 "ld1 {v4.b}[1], [%x[in]], %x[stride]\n"
8263 "ld1 {v2.s}[0], [%x[in]], #4\n"
8264 "ld1 {v4.b}[2], [%x[in]], %x[stride]\n"
8265 "ld1 {v3.s}[0], [%x[in]], #4\n"
8266 "ld1 {v4.b}[3], [%x[in]], %x[stride]\n"
8267 "ld1 {v0.s}[1], [%x[in]], #4\n"
8268 "ld1 {v4.b}[4], [%x[in]], %x[stride]\n"
8269 "ld1 {v1.s}[1], [%x[in]], #4\n"
8270 "ld1 {v4.b}[5], [%x[in]], %x[stride]\n"
8271 "ld1 {v2.s}[1], [%x[in]], #4\n"
8272 "ld1 {v4.b}[6], [%x[in]], %x[stride]\n"
8273 "ld1 {v3.s}[1], [%x[in]], #4\n"
8274 "ld1 {v4.b}[7], [%x[in]], %x[stride]\n"
8275 "prfm pldl1keep, [%x[in]]\n"
8276 "trn1 v5.4h, v0.4h, v2.4h\n"
8277 "trn2 v7.4h, v0.4h, v2.4h\n"
8278 "trn1 v6.4h, v1.4h, v3.4h\n"
8279 "trn2 v13.4h, v1.4h, v3.4h\n"
8280 "trn1 v0.8b, v5.8b, v6.8b\n"
8281 "trn2 v1.8b, v5.8b, v6.8b\n"
8282 "trn1 v2.8b, v7.8b, v13.8b\n"
8283 "trn2 v3.8b, v7.8b, v13.8b\n"
8284 "uaddw v8.8h, v8.8h, v0.8b\n"
8285 "uaddw v9.8h, v9.8h, v1.8b\n"
8286 "uaddw v10.8h, v10.8h, v2.8b\n"
8287 "uaddw v11.8h, v11.8h, v3.8b\n"
8288 "uaddw v12.8h, v12.8h, v4.8b\n"
8289 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
8290 "st1 {v4.2s}, [%x[out]], #8\n"
8291
8292 "bne 1b\n"
8293
8294 "2:"
8295
8296 // Load Aggregate Store - column major 5x2
8297 "movi v0.8b, #0\n"
8298 "movi v1.8b, #0\n"
8299 "movi v2.8b, #0\n"
8300 "movi v3.8b, #0\n"
8301 "movi v4.8b, #0\n"
8302 "ld1 {v0.s}[0], [%x[in]], #4\n"
8303 "ld1 {v4.b}[0], [%x[in]], %x[stride]\n"
8304 "ld1 {v1.s}[0], [%x[in]], #4\n"
8305 "ld1 {v4.b}[1], [%x[in]], %x[stride]\n"
8306 "prfm pldl1keep, [%x[in]]\n"
8307 "trn1 v5.4h, v0.4h, v2.4h\n"
8308 "trn2 v7.4h, v0.4h, v2.4h\n"
8309 "trn1 v6.4h, v1.4h, v3.4h\n"
8310 "trn2 v13.4h, v1.4h, v3.4h\n"
8311 "trn1 v0.8b, v5.8b, v6.8b\n"
8312 "trn2 v1.8b, v5.8b, v6.8b\n"
8313 "trn1 v2.8b, v7.8b, v13.8b\n"
8314 "trn2 v3.8b, v7.8b, v13.8b\n"
8315 "uaddw v8.8h, v8.8h, v0.8b\n"
8316 "uaddw v9.8h, v9.8h, v1.8b\n"
8317 "uaddw v10.8h, v10.8h, v2.8b\n"
8318 "uaddw v11.8h, v11.8h, v3.8b\n"
8319 "uaddw v12.8h, v12.8h, v4.8b\n"
8320 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
8321 "st1 {v4.2s}, [%x[out]], #8\n"
8322
8323 // Aggregator Reduction.
8324 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
8325 "dup v1.4s, %w[additive_sum_offset]\n"
8326 "uaddlp v8.4s, v8.8h\n"
8327 "uaddlp v9.4s, v9.8h\n"
8328 "uaddlp v10.4s, v10.8h\n"
8329 "uaddlp v11.4s, v11.8h\n"
8330 "uaddlp v12.4s, v12.8h\n"
8331 "addp v8.4s, v8.4s, v9.4s\n"
8332 "addp v10.4s, v10.4s, v11.4s\n"
8333 "addp v12.4s, v12.4s, v12.4s\n"
8334 "addp v8.4s, v8.4s, v10.4s\n"
8335 "addp v9.4s, v12.4s, v12.4s\n"
8336 "mul v8.4s, v8.4s, v0.s[0]\n"
8337 "mul v9.4s, v9.4s, v0.s[0]\n"
8338 "add v8.4s, v8.4s, v1.4s\n"
8339 "add v9.4s, v9.4s, v1.4s\n"
8340 "st1 {v8.4s, v9.4s}, [%x[out]]\n"
8341 : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
8342 [out] "+r"(out), [in] "+r"(in)
8343 : [additive_sum_offset] "r"(params.additive_sum_offset),
8344 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
8345 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
8346 "v11", "v12", "v13", "cc", "memory");
8347 }
8348
8349 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)8350 inline void Stream<uint8_t, 5, 8, 3, ColumnMajorWithSum>::Pack(
8351 const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
8352 #ifdef DEBUG
8353 #ifdef DEBUG_METAGEMM_VERBOSE
8354 std::cout
8355 << __FILE__ << "(" << __LINE__
8356 << ") ColumnMajorWithSum<uint8_t, 5, 8, 3, ColumnMajorWithSum>::Pack()"
8357 << std::endl
8358 << std::flush;
8359 #endif
8360 #endif
8361 int params_count_copy = params.count;
8362 int params_stride_copy = params.stride;
8363 asm volatile(
8364 "sub %x[stride], %x[stride], #4\n"
8365 "movi v8.8h, #0\n"
8366 "movi v9.8h, #0\n"
8367 "movi v10.8h, #0\n"
8368 "movi v11.8h, #0\n"
8369 "movi v12.8h, #0\n"
8370
8371 // Reduce count by leftovers.
8372 "subs %x[count], %x[count], #3\n"
8373 "beq 2f\n"
8374
8375 "1:"
8376 "subs %x[count], %x[count], #8\n"
8377
8378 // Load Aggregate Store - column major 5x8
8379 "ld1 {v0.s}[0], [%x[in]], #4\n"
8380 "ld1 {v4.b}[0], [%x[in]], %x[stride]\n"
8381 "ld1 {v1.s}[0], [%x[in]], #4\n"
8382 "ld1 {v4.b}[1], [%x[in]], %x[stride]\n"
8383 "ld1 {v2.s}[0], [%x[in]], #4\n"
8384 "ld1 {v4.b}[2], [%x[in]], %x[stride]\n"
8385 "ld1 {v3.s}[0], [%x[in]], #4\n"
8386 "ld1 {v4.b}[3], [%x[in]], %x[stride]\n"
8387 "ld1 {v0.s}[1], [%x[in]], #4\n"
8388 "ld1 {v4.b}[4], [%x[in]], %x[stride]\n"
8389 "ld1 {v1.s}[1], [%x[in]], #4\n"
8390 "ld1 {v4.b}[5], [%x[in]], %x[stride]\n"
8391 "ld1 {v2.s}[1], [%x[in]], #4\n"
8392 "ld1 {v4.b}[6], [%x[in]], %x[stride]\n"
8393 "ld1 {v3.s}[1], [%x[in]], #4\n"
8394 "ld1 {v4.b}[7], [%x[in]], %x[stride]\n"
8395 "prfm pldl1keep, [%x[in]]\n"
8396 "trn1 v5.4h, v0.4h, v2.4h\n"
8397 "trn2 v7.4h, v0.4h, v2.4h\n"
8398 "trn1 v6.4h, v1.4h, v3.4h\n"
8399 "trn2 v13.4h, v1.4h, v3.4h\n"
8400 "trn1 v0.8b, v5.8b, v6.8b\n"
8401 "trn2 v1.8b, v5.8b, v6.8b\n"
8402 "trn1 v2.8b, v7.8b, v13.8b\n"
8403 "trn2 v3.8b, v7.8b, v13.8b\n"
8404 "uaddw v8.8h, v8.8h, v0.8b\n"
8405 "uaddw v9.8h, v9.8h, v1.8b\n"
8406 "uaddw v10.8h, v10.8h, v2.8b\n"
8407 "uaddw v11.8h, v11.8h, v3.8b\n"
8408 "uaddw v12.8h, v12.8h, v4.8b\n"
8409 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
8410 "st1 {v4.2s}, [%x[out]], #8\n"
8411
8412 "bne 1b\n"
8413
8414 "2:"
8415
8416 // Load Aggregate Store - column major 5x3
8417 "movi v0.8b, #0\n"
8418 "movi v1.8b, #0\n"
8419 "movi v2.8b, #0\n"
8420 "movi v3.8b, #0\n"
8421 "movi v4.8b, #0\n"
8422 "ld1 {v0.s}[0], [%x[in]], #4\n"
8423 "ld1 {v4.b}[0], [%x[in]], %x[stride]\n"
8424 "ld1 {v1.s}[0], [%x[in]], #4\n"
8425 "ld1 {v4.b}[1], [%x[in]], %x[stride]\n"
8426 "ld1 {v2.s}[0], [%x[in]], #4\n"
8427 "ld1 {v4.b}[2], [%x[in]], %x[stride]\n"
8428 "prfm pldl1keep, [%x[in]]\n"
8429 "trn1 v5.4h, v0.4h, v2.4h\n"
8430 "trn2 v7.4h, v0.4h, v2.4h\n"
8431 "trn1 v6.4h, v1.4h, v3.4h\n"
8432 "trn2 v13.4h, v1.4h, v3.4h\n"
8433 "trn1 v0.8b, v5.8b, v6.8b\n"
8434 "trn2 v1.8b, v5.8b, v6.8b\n"
8435 "trn1 v2.8b, v7.8b, v13.8b\n"
8436 "trn2 v3.8b, v7.8b, v13.8b\n"
8437 "uaddw v8.8h, v8.8h, v0.8b\n"
8438 "uaddw v9.8h, v9.8h, v1.8b\n"
8439 "uaddw v10.8h, v10.8h, v2.8b\n"
8440 "uaddw v11.8h, v11.8h, v3.8b\n"
8441 "uaddw v12.8h, v12.8h, v4.8b\n"
8442 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
8443 "st1 {v4.2s}, [%x[out]], #8\n"
8444
8445 // Aggregator Reduction.
8446 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
8447 "dup v1.4s, %w[additive_sum_offset]\n"
8448 "uaddlp v8.4s, v8.8h\n"
8449 "uaddlp v9.4s, v9.8h\n"
8450 "uaddlp v10.4s, v10.8h\n"
8451 "uaddlp v11.4s, v11.8h\n"
8452 "uaddlp v12.4s, v12.8h\n"
8453 "addp v8.4s, v8.4s, v9.4s\n"
8454 "addp v10.4s, v10.4s, v11.4s\n"
8455 "addp v12.4s, v12.4s, v12.4s\n"
8456 "addp v8.4s, v8.4s, v10.4s\n"
8457 "addp v9.4s, v12.4s, v12.4s\n"
8458 "mul v8.4s, v8.4s, v0.s[0]\n"
8459 "mul v9.4s, v9.4s, v0.s[0]\n"
8460 "add v8.4s, v8.4s, v1.4s\n"
8461 "add v9.4s, v9.4s, v1.4s\n"
8462 "st1 {v8.4s, v9.4s}, [%x[out]]\n"
8463 : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
8464 [out] "+r"(out), [in] "+r"(in)
8465 : [additive_sum_offset] "r"(params.additive_sum_offset),
8466 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
8467 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
8468 "v11", "v12", "v13", "cc", "memory");
8469 }
8470
8471 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)8472 inline void Stream<uint8_t, 5, 8, 4, ColumnMajorWithSum>::Pack(
8473 const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
8474 #ifdef DEBUG
8475 #ifdef DEBUG_METAGEMM_VERBOSE
8476 std::cout
8477 << __FILE__ << "(" << __LINE__
8478 << ") ColumnMajorWithSum<uint8_t, 5, 8, 4, ColumnMajorWithSum>::Pack()"
8479 << std::endl
8480 << std::flush;
8481 #endif
8482 #endif
8483 int params_count_copy = params.count;
8484 int params_stride_copy = params.stride;
8485 asm volatile(
8486 "sub %x[stride], %x[stride], #4\n"
8487 "movi v8.8h, #0\n"
8488 "movi v9.8h, #0\n"
8489 "movi v10.8h, #0\n"
8490 "movi v11.8h, #0\n"
8491 "movi v12.8h, #0\n"
8492
8493 // Reduce count by leftovers.
8494 "subs %x[count], %x[count], #4\n"
8495 "beq 2f\n"
8496
8497 "1:"
8498 "subs %x[count], %x[count], #8\n"
8499
8500 // Load Aggregate Store - column major 5x8
8501 "ld1 {v0.s}[0], [%x[in]], #4\n"
8502 "ld1 {v4.b}[0], [%x[in]], %x[stride]\n"
8503 "ld1 {v1.s}[0], [%x[in]], #4\n"
8504 "ld1 {v4.b}[1], [%x[in]], %x[stride]\n"
8505 "ld1 {v2.s}[0], [%x[in]], #4\n"
8506 "ld1 {v4.b}[2], [%x[in]], %x[stride]\n"
8507 "ld1 {v3.s}[0], [%x[in]], #4\n"
8508 "ld1 {v4.b}[3], [%x[in]], %x[stride]\n"
8509 "ld1 {v0.s}[1], [%x[in]], #4\n"
8510 "ld1 {v4.b}[4], [%x[in]], %x[stride]\n"
8511 "ld1 {v1.s}[1], [%x[in]], #4\n"
8512 "ld1 {v4.b}[5], [%x[in]], %x[stride]\n"
8513 "ld1 {v2.s}[1], [%x[in]], #4\n"
8514 "ld1 {v4.b}[6], [%x[in]], %x[stride]\n"
8515 "ld1 {v3.s}[1], [%x[in]], #4\n"
8516 "ld1 {v4.b}[7], [%x[in]], %x[stride]\n"
8517 "prfm pldl1keep, [%x[in]]\n"
8518 "trn1 v5.4h, v0.4h, v2.4h\n"
8519 "trn2 v7.4h, v0.4h, v2.4h\n"
8520 "trn1 v6.4h, v1.4h, v3.4h\n"
8521 "trn2 v13.4h, v1.4h, v3.4h\n"
8522 "trn1 v0.8b, v5.8b, v6.8b\n"
8523 "trn2 v1.8b, v5.8b, v6.8b\n"
8524 "trn1 v2.8b, v7.8b, v13.8b\n"
8525 "trn2 v3.8b, v7.8b, v13.8b\n"
8526 "uaddw v8.8h, v8.8h, v0.8b\n"
8527 "uaddw v9.8h, v9.8h, v1.8b\n"
8528 "uaddw v10.8h, v10.8h, v2.8b\n"
8529 "uaddw v11.8h, v11.8h, v3.8b\n"
8530 "uaddw v12.8h, v12.8h, v4.8b\n"
8531 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
8532 "st1 {v4.2s}, [%x[out]], #8\n"
8533
8534 "bne 1b\n"
8535
8536 "2:"
8537
8538 // Load Aggregate Store - column major 5x4
8539 "movi v0.8b, #0\n"
8540 "movi v1.8b, #0\n"
8541 "movi v2.8b, #0\n"
8542 "movi v3.8b, #0\n"
8543 "movi v4.8b, #0\n"
8544 "ld1 {v0.s}[0], [%x[in]], #4\n"
8545 "ld1 {v4.b}[0], [%x[in]], %x[stride]\n"
8546 "ld1 {v1.s}[0], [%x[in]], #4\n"
8547 "ld1 {v4.b}[1], [%x[in]], %x[stride]\n"
8548 "ld1 {v2.s}[0], [%x[in]], #4\n"
8549 "ld1 {v4.b}[2], [%x[in]], %x[stride]\n"
8550 "ld1 {v3.s}[0], [%x[in]], #4\n"
8551 "ld1 {v4.b}[3], [%x[in]], %x[stride]\n"
8552 "prfm pldl1keep, [%x[in]]\n"
8553 "trn1 v5.4h, v0.4h, v2.4h\n"
8554 "trn2 v7.4h, v0.4h, v2.4h\n"
8555 "trn1 v6.4h, v1.4h, v3.4h\n"
8556 "trn2 v13.4h, v1.4h, v3.4h\n"
8557 "trn1 v0.8b, v5.8b, v6.8b\n"
8558 "trn2 v1.8b, v5.8b, v6.8b\n"
8559 "trn1 v2.8b, v7.8b, v13.8b\n"
8560 "trn2 v3.8b, v7.8b, v13.8b\n"
8561 "uaddw v8.8h, v8.8h, v0.8b\n"
8562 "uaddw v9.8h, v9.8h, v1.8b\n"
8563 "uaddw v10.8h, v10.8h, v2.8b\n"
8564 "uaddw v11.8h, v11.8h, v3.8b\n"
8565 "uaddw v12.8h, v12.8h, v4.8b\n"
8566 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
8567 "st1 {v4.2s}, [%x[out]], #8\n"
8568
8569 // Aggregator Reduction.
8570 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
8571 "dup v1.4s, %w[additive_sum_offset]\n"
8572 "uaddlp v8.4s, v8.8h\n"
8573 "uaddlp v9.4s, v9.8h\n"
8574 "uaddlp v10.4s, v10.8h\n"
8575 "uaddlp v11.4s, v11.8h\n"
8576 "uaddlp v12.4s, v12.8h\n"
8577 "addp v8.4s, v8.4s, v9.4s\n"
8578 "addp v10.4s, v10.4s, v11.4s\n"
8579 "addp v12.4s, v12.4s, v12.4s\n"
8580 "addp v8.4s, v8.4s, v10.4s\n"
8581 "addp v9.4s, v12.4s, v12.4s\n"
8582 "mul v8.4s, v8.4s, v0.s[0]\n"
8583 "mul v9.4s, v9.4s, v0.s[0]\n"
8584 "add v8.4s, v8.4s, v1.4s\n"
8585 "add v9.4s, v9.4s, v1.4s\n"
8586 "st1 {v8.4s, v9.4s}, [%x[out]]\n"
8587 : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
8588 [out] "+r"(out), [in] "+r"(in)
8589 : [additive_sum_offset] "r"(params.additive_sum_offset),
8590 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
8591 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
8592 "v11", "v12", "v13", "cc", "memory");
8593 }
8594
8595 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)8596 inline void Stream<uint8_t, 5, 8, 5, ColumnMajorWithSum>::Pack(
8597 const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
8598 #ifdef DEBUG
8599 #ifdef DEBUG_METAGEMM_VERBOSE
8600 std::cout
8601 << __FILE__ << "(" << __LINE__
8602 << ") ColumnMajorWithSum<uint8_t, 5, 8, 5, ColumnMajorWithSum>::Pack()"
8603 << std::endl
8604 << std::flush;
8605 #endif
8606 #endif
8607 int params_count_copy = params.count;
8608 int params_stride_copy = params.stride;
8609 asm volatile(
8610 "sub %x[stride], %x[stride], #4\n"
8611 "movi v8.8h, #0\n"
8612 "movi v9.8h, #0\n"
8613 "movi v10.8h, #0\n"
8614 "movi v11.8h, #0\n"
8615 "movi v12.8h, #0\n"
8616
8617 // Reduce count by leftovers.
8618 "subs %x[count], %x[count], #5\n"
8619 "beq 2f\n"
8620
8621 "1:"
8622 "subs %x[count], %x[count], #8\n"
8623
8624 // Load Aggregate Store - column major 5x8
8625 "ld1 {v0.s}[0], [%x[in]], #4\n"
8626 "ld1 {v4.b}[0], [%x[in]], %x[stride]\n"
8627 "ld1 {v1.s}[0], [%x[in]], #4\n"
8628 "ld1 {v4.b}[1], [%x[in]], %x[stride]\n"
8629 "ld1 {v2.s}[0], [%x[in]], #4\n"
8630 "ld1 {v4.b}[2], [%x[in]], %x[stride]\n"
8631 "ld1 {v3.s}[0], [%x[in]], #4\n"
8632 "ld1 {v4.b}[3], [%x[in]], %x[stride]\n"
8633 "ld1 {v0.s}[1], [%x[in]], #4\n"
8634 "ld1 {v4.b}[4], [%x[in]], %x[stride]\n"
8635 "ld1 {v1.s}[1], [%x[in]], #4\n"
8636 "ld1 {v4.b}[5], [%x[in]], %x[stride]\n"
8637 "ld1 {v2.s}[1], [%x[in]], #4\n"
8638 "ld1 {v4.b}[6], [%x[in]], %x[stride]\n"
8639 "ld1 {v3.s}[1], [%x[in]], #4\n"
8640 "ld1 {v4.b}[7], [%x[in]], %x[stride]\n"
8641 "prfm pldl1keep, [%x[in]]\n"
8642 "trn1 v5.4h, v0.4h, v2.4h\n"
8643 "trn2 v7.4h, v0.4h, v2.4h\n"
8644 "trn1 v6.4h, v1.4h, v3.4h\n"
8645 "trn2 v13.4h, v1.4h, v3.4h\n"
8646 "trn1 v0.8b, v5.8b, v6.8b\n"
8647 "trn2 v1.8b, v5.8b, v6.8b\n"
8648 "trn1 v2.8b, v7.8b, v13.8b\n"
8649 "trn2 v3.8b, v7.8b, v13.8b\n"
8650 "uaddw v8.8h, v8.8h, v0.8b\n"
8651 "uaddw v9.8h, v9.8h, v1.8b\n"
8652 "uaddw v10.8h, v10.8h, v2.8b\n"
8653 "uaddw v11.8h, v11.8h, v3.8b\n"
8654 "uaddw v12.8h, v12.8h, v4.8b\n"
8655 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
8656 "st1 {v4.2s}, [%x[out]], #8\n"
8657
8658 "bne 1b\n"
8659
8660 "2:"
8661
8662 // Load Aggregate Store - column major 5x5
8663 "movi v0.8b, #0\n"
8664 "movi v1.8b, #0\n"
8665 "movi v2.8b, #0\n"
8666 "movi v3.8b, #0\n"
8667 "movi v4.8b, #0\n"
8668 "ld1 {v0.s}[0], [%x[in]], #4\n"
8669 "ld1 {v4.b}[0], [%x[in]], %x[stride]\n"
8670 "ld1 {v1.s}[0], [%x[in]], #4\n"
8671 "ld1 {v4.b}[1], [%x[in]], %x[stride]\n"
8672 "ld1 {v2.s}[0], [%x[in]], #4\n"
8673 "ld1 {v4.b}[2], [%x[in]], %x[stride]\n"
8674 "ld1 {v3.s}[0], [%x[in]], #4\n"
8675 "ld1 {v4.b}[3], [%x[in]], %x[stride]\n"
8676 "ld1 {v0.s}[1], [%x[in]], #4\n"
8677 "ld1 {v4.b}[4], [%x[in]], %x[stride]\n"
8678 "prfm pldl1keep, [%x[in]]\n"
8679 "trn1 v5.4h, v0.4h, v2.4h\n"
8680 "trn2 v7.4h, v0.4h, v2.4h\n"
8681 "trn1 v6.4h, v1.4h, v3.4h\n"
8682 "trn2 v13.4h, v1.4h, v3.4h\n"
8683 "trn1 v0.8b, v5.8b, v6.8b\n"
8684 "trn2 v1.8b, v5.8b, v6.8b\n"
8685 "trn1 v2.8b, v7.8b, v13.8b\n"
8686 "trn2 v3.8b, v7.8b, v13.8b\n"
8687 "uaddw v8.8h, v8.8h, v0.8b\n"
8688 "uaddw v9.8h, v9.8h, v1.8b\n"
8689 "uaddw v10.8h, v10.8h, v2.8b\n"
8690 "uaddw v11.8h, v11.8h, v3.8b\n"
8691 "uaddw v12.8h, v12.8h, v4.8b\n"
8692 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
8693 "st1 {v4.2s}, [%x[out]], #8\n"
8694
8695 // Aggregator Reduction.
8696 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
8697 "dup v1.4s, %w[additive_sum_offset]\n"
8698 "uaddlp v8.4s, v8.8h\n"
8699 "uaddlp v9.4s, v9.8h\n"
8700 "uaddlp v10.4s, v10.8h\n"
8701 "uaddlp v11.4s, v11.8h\n"
8702 "uaddlp v12.4s, v12.8h\n"
8703 "addp v8.4s, v8.4s, v9.4s\n"
8704 "addp v10.4s, v10.4s, v11.4s\n"
8705 "addp v12.4s, v12.4s, v12.4s\n"
8706 "addp v8.4s, v8.4s, v10.4s\n"
8707 "addp v9.4s, v12.4s, v12.4s\n"
8708 "mul v8.4s, v8.4s, v0.s[0]\n"
8709 "mul v9.4s, v9.4s, v0.s[0]\n"
8710 "add v8.4s, v8.4s, v1.4s\n"
8711 "add v9.4s, v9.4s, v1.4s\n"
8712 "st1 {v8.4s, v9.4s}, [%x[out]]\n"
8713 : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
8714 [out] "+r"(out), [in] "+r"(in)
8715 : [additive_sum_offset] "r"(params.additive_sum_offset),
8716 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
8717 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
8718 "v11", "v12", "v13", "cc", "memory");
8719 }
8720
8721 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)8722 inline void Stream<uint8_t, 5, 8, 6, ColumnMajorWithSum>::Pack(
8723 const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
8724 #ifdef DEBUG
8725 #ifdef DEBUG_METAGEMM_VERBOSE
8726 std::cout
8727 << __FILE__ << "(" << __LINE__
8728 << ") ColumnMajorWithSum<uint8_t, 5, 8, 6, ColumnMajorWithSum>::Pack()"
8729 << std::endl
8730 << std::flush;
8731 #endif
8732 #endif
8733 int params_count_copy = params.count;
8734 int params_stride_copy = params.stride;
8735 asm volatile(
8736 "sub %x[stride], %x[stride], #4\n"
8737 "movi v8.8h, #0\n"
8738 "movi v9.8h, #0\n"
8739 "movi v10.8h, #0\n"
8740 "movi v11.8h, #0\n"
8741 "movi v12.8h, #0\n"
8742
8743 // Reduce count by leftovers.
8744 "subs %x[count], %x[count], #6\n"
8745 "beq 2f\n"
8746
8747 "1:"
8748 "subs %x[count], %x[count], #8\n"
8749
8750 // Load Aggregate Store - column major 5x8
8751 "ld1 {v0.s}[0], [%x[in]], #4\n"
8752 "ld1 {v4.b}[0], [%x[in]], %x[stride]\n"
8753 "ld1 {v1.s}[0], [%x[in]], #4\n"
8754 "ld1 {v4.b}[1], [%x[in]], %x[stride]\n"
8755 "ld1 {v2.s}[0], [%x[in]], #4\n"
8756 "ld1 {v4.b}[2], [%x[in]], %x[stride]\n"
8757 "ld1 {v3.s}[0], [%x[in]], #4\n"
8758 "ld1 {v4.b}[3], [%x[in]], %x[stride]\n"
8759 "ld1 {v0.s}[1], [%x[in]], #4\n"
8760 "ld1 {v4.b}[4], [%x[in]], %x[stride]\n"
8761 "ld1 {v1.s}[1], [%x[in]], #4\n"
8762 "ld1 {v4.b}[5], [%x[in]], %x[stride]\n"
8763 "ld1 {v2.s}[1], [%x[in]], #4\n"
8764 "ld1 {v4.b}[6], [%x[in]], %x[stride]\n"
8765 "ld1 {v3.s}[1], [%x[in]], #4\n"
8766 "ld1 {v4.b}[7], [%x[in]], %x[stride]\n"
8767 "prfm pldl1keep, [%x[in]]\n"
8768 "trn1 v5.4h, v0.4h, v2.4h\n"
8769 "trn2 v7.4h, v0.4h, v2.4h\n"
8770 "trn1 v6.4h, v1.4h, v3.4h\n"
8771 "trn2 v13.4h, v1.4h, v3.4h\n"
8772 "trn1 v0.8b, v5.8b, v6.8b\n"
8773 "trn2 v1.8b, v5.8b, v6.8b\n"
8774 "trn1 v2.8b, v7.8b, v13.8b\n"
8775 "trn2 v3.8b, v7.8b, v13.8b\n"
8776 "uaddw v8.8h, v8.8h, v0.8b\n"
8777 "uaddw v9.8h, v9.8h, v1.8b\n"
8778 "uaddw v10.8h, v10.8h, v2.8b\n"
8779 "uaddw v11.8h, v11.8h, v3.8b\n"
8780 "uaddw v12.8h, v12.8h, v4.8b\n"
8781 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
8782 "st1 {v4.2s}, [%x[out]], #8\n"
8783
8784 "bne 1b\n"
8785
8786 "2:"
8787
8788 // Load Aggregate Store - column major 5x6
8789 "movi v0.8b, #0\n"
8790 "movi v1.8b, #0\n"
8791 "movi v2.8b, #0\n"
8792 "movi v3.8b, #0\n"
8793 "movi v4.8b, #0\n"
8794 "ld1 {v0.s}[0], [%x[in]], #4\n"
8795 "ld1 {v4.b}[0], [%x[in]], %x[stride]\n"
8796 "ld1 {v1.s}[0], [%x[in]], #4\n"
8797 "ld1 {v4.b}[1], [%x[in]], %x[stride]\n"
8798 "ld1 {v2.s}[0], [%x[in]], #4\n"
8799 "ld1 {v4.b}[2], [%x[in]], %x[stride]\n"
8800 "ld1 {v3.s}[0], [%x[in]], #4\n"
8801 "ld1 {v4.b}[3], [%x[in]], %x[stride]\n"
8802 "ld1 {v0.s}[1], [%x[in]], #4\n"
8803 "ld1 {v4.b}[4], [%x[in]], %x[stride]\n"
8804 "ld1 {v1.s}[1], [%x[in]], #4\n"
8805 "ld1 {v4.b}[5], [%x[in]], %x[stride]\n"
8806 "prfm pldl1keep, [%x[in]]\n"
8807 "trn1 v5.4h, v0.4h, v2.4h\n"
8808 "trn2 v7.4h, v0.4h, v2.4h\n"
8809 "trn1 v6.4h, v1.4h, v3.4h\n"
8810 "trn2 v13.4h, v1.4h, v3.4h\n"
8811 "trn1 v0.8b, v5.8b, v6.8b\n"
8812 "trn2 v1.8b, v5.8b, v6.8b\n"
8813 "trn1 v2.8b, v7.8b, v13.8b\n"
8814 "trn2 v3.8b, v7.8b, v13.8b\n"
8815 "uaddw v8.8h, v8.8h, v0.8b\n"
8816 "uaddw v9.8h, v9.8h, v1.8b\n"
8817 "uaddw v10.8h, v10.8h, v2.8b\n"
8818 "uaddw v11.8h, v11.8h, v3.8b\n"
8819 "uaddw v12.8h, v12.8h, v4.8b\n"
8820 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
8821 "st1 {v4.2s}, [%x[out]], #8\n"
8822
8823 // Aggregator Reduction.
8824 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
8825 "dup v1.4s, %w[additive_sum_offset]\n"
8826 "uaddlp v8.4s, v8.8h\n"
8827 "uaddlp v9.4s, v9.8h\n"
8828 "uaddlp v10.4s, v10.8h\n"
8829 "uaddlp v11.4s, v11.8h\n"
8830 "uaddlp v12.4s, v12.8h\n"
8831 "addp v8.4s, v8.4s, v9.4s\n"
8832 "addp v10.4s, v10.4s, v11.4s\n"
8833 "addp v12.4s, v12.4s, v12.4s\n"
8834 "addp v8.4s, v8.4s, v10.4s\n"
8835 "addp v9.4s, v12.4s, v12.4s\n"
8836 "mul v8.4s, v8.4s, v0.s[0]\n"
8837 "mul v9.4s, v9.4s, v0.s[0]\n"
8838 "add v8.4s, v8.4s, v1.4s\n"
8839 "add v9.4s, v9.4s, v1.4s\n"
8840 "st1 {v8.4s, v9.4s}, [%x[out]]\n"
8841 : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
8842 [out] "+r"(out), [in] "+r"(in)
8843 : [additive_sum_offset] "r"(params.additive_sum_offset),
8844 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
8845 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
8846 "v11", "v12", "v13", "cc", "memory");
8847 }
8848
8849 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)8850 inline void Stream<uint8_t, 5, 8, 7, ColumnMajorWithSum>::Pack(
8851 const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
8852 #ifdef DEBUG
8853 #ifdef DEBUG_METAGEMM_VERBOSE
8854 std::cout
8855 << __FILE__ << "(" << __LINE__
8856 << ") ColumnMajorWithSum<uint8_t, 5, 8, 7, ColumnMajorWithSum>::Pack()"
8857 << std::endl
8858 << std::flush;
8859 #endif
8860 #endif
8861 int params_count_copy = params.count;
8862 int params_stride_copy = params.stride;
8863 asm volatile(
8864 "sub %x[stride], %x[stride], #4\n"
8865 "movi v8.8h, #0\n"
8866 "movi v9.8h, #0\n"
8867 "movi v10.8h, #0\n"
8868 "movi v11.8h, #0\n"
8869 "movi v12.8h, #0\n"
8870
8871 // Reduce count by leftovers.
8872 "subs %x[count], %x[count], #7\n"
8873 "beq 2f\n"
8874
8875 "1:"
8876 "subs %x[count], %x[count], #8\n"
8877
8878 // Load Aggregate Store - column major 5x8
8879 "ld1 {v0.s}[0], [%x[in]], #4\n"
8880 "ld1 {v4.b}[0], [%x[in]], %x[stride]\n"
8881 "ld1 {v1.s}[0], [%x[in]], #4\n"
8882 "ld1 {v4.b}[1], [%x[in]], %x[stride]\n"
8883 "ld1 {v2.s}[0], [%x[in]], #4\n"
8884 "ld1 {v4.b}[2], [%x[in]], %x[stride]\n"
8885 "ld1 {v3.s}[0], [%x[in]], #4\n"
8886 "ld1 {v4.b}[3], [%x[in]], %x[stride]\n"
8887 "ld1 {v0.s}[1], [%x[in]], #4\n"
8888 "ld1 {v4.b}[4], [%x[in]], %x[stride]\n"
8889 "ld1 {v1.s}[1], [%x[in]], #4\n"
8890 "ld1 {v4.b}[5], [%x[in]], %x[stride]\n"
8891 "ld1 {v2.s}[1], [%x[in]], #4\n"
8892 "ld1 {v4.b}[6], [%x[in]], %x[stride]\n"
8893 "ld1 {v3.s}[1], [%x[in]], #4\n"
8894 "ld1 {v4.b}[7], [%x[in]], %x[stride]\n"
8895 "prfm pldl1keep, [%x[in]]\n"
8896 "trn1 v5.4h, v0.4h, v2.4h\n"
8897 "trn2 v7.4h, v0.4h, v2.4h\n"
8898 "trn1 v6.4h, v1.4h, v3.4h\n"
8899 "trn2 v13.4h, v1.4h, v3.4h\n"
8900 "trn1 v0.8b, v5.8b, v6.8b\n"
8901 "trn2 v1.8b, v5.8b, v6.8b\n"
8902 "trn1 v2.8b, v7.8b, v13.8b\n"
8903 "trn2 v3.8b, v7.8b, v13.8b\n"
8904 "uaddw v8.8h, v8.8h, v0.8b\n"
8905 "uaddw v9.8h, v9.8h, v1.8b\n"
8906 "uaddw v10.8h, v10.8h, v2.8b\n"
8907 "uaddw v11.8h, v11.8h, v3.8b\n"
8908 "uaddw v12.8h, v12.8h, v4.8b\n"
8909 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
8910 "st1 {v4.2s}, [%x[out]], #8\n"
8911
8912 "bne 1b\n"
8913
8914 "2:"
8915
8916 // Load Aggregate Store - column major 5x7
8917 "movi v0.8b, #0\n"
8918 "movi v1.8b, #0\n"
8919 "movi v2.8b, #0\n"
8920 "movi v3.8b, #0\n"
8921 "movi v4.8b, #0\n"
8922 "ld1 {v0.s}[0], [%x[in]], #4\n"
8923 "ld1 {v4.b}[0], [%x[in]], %x[stride]\n"
8924 "ld1 {v1.s}[0], [%x[in]], #4\n"
8925 "ld1 {v4.b}[1], [%x[in]], %x[stride]\n"
8926 "ld1 {v2.s}[0], [%x[in]], #4\n"
8927 "ld1 {v4.b}[2], [%x[in]], %x[stride]\n"
8928 "ld1 {v3.s}[0], [%x[in]], #4\n"
8929 "ld1 {v4.b}[3], [%x[in]], %x[stride]\n"
8930 "ld1 {v0.s}[1], [%x[in]], #4\n"
8931 "ld1 {v4.b}[4], [%x[in]], %x[stride]\n"
8932 "ld1 {v1.s}[1], [%x[in]], #4\n"
8933 "ld1 {v4.b}[5], [%x[in]], %x[stride]\n"
8934 "ld1 {v2.s}[1], [%x[in]], #4\n"
8935 "ld1 {v4.b}[6], [%x[in]], %x[stride]\n"
8936 "prfm pldl1keep, [%x[in]]\n"
8937 "trn1 v5.4h, v0.4h, v2.4h\n"
8938 "trn2 v7.4h, v0.4h, v2.4h\n"
8939 "trn1 v6.4h, v1.4h, v3.4h\n"
8940 "trn2 v13.4h, v1.4h, v3.4h\n"
8941 "trn1 v0.8b, v5.8b, v6.8b\n"
8942 "trn2 v1.8b, v5.8b, v6.8b\n"
8943 "trn1 v2.8b, v7.8b, v13.8b\n"
8944 "trn2 v3.8b, v7.8b, v13.8b\n"
8945 "uaddw v8.8h, v8.8h, v0.8b\n"
8946 "uaddw v9.8h, v9.8h, v1.8b\n"
8947 "uaddw v10.8h, v10.8h, v2.8b\n"
8948 "uaddw v11.8h, v11.8h, v3.8b\n"
8949 "uaddw v12.8h, v12.8h, v4.8b\n"
8950 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
8951 "st1 {v4.2s}, [%x[out]], #8\n"
8952
8953 // Aggregator Reduction.
8954 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
8955 "dup v1.4s, %w[additive_sum_offset]\n"
8956 "uaddlp v8.4s, v8.8h\n"
8957 "uaddlp v9.4s, v9.8h\n"
8958 "uaddlp v10.4s, v10.8h\n"
8959 "uaddlp v11.4s, v11.8h\n"
8960 "uaddlp v12.4s, v12.8h\n"
8961 "addp v8.4s, v8.4s, v9.4s\n"
8962 "addp v10.4s, v10.4s, v11.4s\n"
8963 "addp v12.4s, v12.4s, v12.4s\n"
8964 "addp v8.4s, v8.4s, v10.4s\n"
8965 "addp v9.4s, v12.4s, v12.4s\n"
8966 "mul v8.4s, v8.4s, v0.s[0]\n"
8967 "mul v9.4s, v9.4s, v0.s[0]\n"
8968 "add v8.4s, v8.4s, v1.4s\n"
8969 "add v9.4s, v9.4s, v1.4s\n"
8970 "st1 {v8.4s, v9.4s}, [%x[out]]\n"
8971 : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
8972 [out] "+r"(out), [in] "+r"(in)
8973 : [additive_sum_offset] "r"(params.additive_sum_offset),
8974 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
8975 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
8976 "v11", "v12", "v13", "cc", "memory");
8977 }
8978
8979 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)8980 inline void Stream<uint8_t, 6, 8, 0, ColumnMajorWithSum>::Pack(
8981 const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
8982 #ifdef DEBUG
8983 #ifdef DEBUG_METAGEMM_VERBOSE
8984 std::cout
8985 << __FILE__ << "(" << __LINE__
8986 << ") ColumnMajorWithSum<uint8_t, 6, 8, 0, ColumnMajorWithSum>::Pack()"
8987 << std::endl
8988 << std::flush;
8989 #endif
8990 #endif
8991 int params_count_copy = params.count;
8992 int params_stride_copy = params.stride;
8993 asm volatile(
8994 "sub %x[stride], %x[stride], #4\n"
8995 "movi v8.8h, #0\n"
8996 "movi v9.8h, #0\n"
8997 "movi v10.8h, #0\n"
8998 "movi v11.8h, #0\n"
8999 "movi v12.8h, #0\n"
9000 "movi v13.8h, #0\n"
9001
9002 "1:"
9003 "subs %x[count], %x[count], #8\n"
9004
9005 // Load Aggregate Store - column major 6x8
9006 "ld1 {v0.s}[0], [%x[in]], #4\n"
9007 "ld1 {v4.h}[0], [%x[in]], %x[stride]\n"
9008 "ld1 {v1.s}[0], [%x[in]], #4\n"
9009 "ld1 {v4.h}[1], [%x[in]], %x[stride]\n"
9010 "ld1 {v2.s}[0], [%x[in]], #4\n"
9011 "ld1 {v4.h}[2], [%x[in]], %x[stride]\n"
9012 "ld1 {v3.s}[0], [%x[in]], #4\n"
9013 "ld1 {v4.h}[3], [%x[in]], %x[stride]\n"
9014 "ld1 {v0.s}[1], [%x[in]], #4\n"
9015 "ld1 {v5.h}[0], [%x[in]], %x[stride]\n"
9016 "ld1 {v1.s}[1], [%x[in]], #4\n"
9017 "ld1 {v5.h}[1], [%x[in]], %x[stride]\n"
9018 "ld1 {v2.s}[1], [%x[in]], #4\n"
9019 "ld1 {v5.h}[2], [%x[in]], %x[stride]\n"
9020 "ld1 {v3.s}[1], [%x[in]], #4\n"
9021 "ld1 {v5.h}[3], [%x[in]], %x[stride]\n"
9022 "prfm pldl1keep, [%x[in]]\n"
9023 "trn1 v6.4h, v0.4h, v2.4h\n"
9024 "trn2 v14.4h, v0.4h, v2.4h\n"
9025 "trn1 v7.4h, v1.4h, v3.4h\n"
9026 "trn2 v15.4h, v1.4h, v3.4h\n"
9027 "uzp1 v16.8b, v4.8b, v5.8b\n"
9028 "uzp2 v17.8b, v4.8b, v5.8b\n"
9029 "trn1 v0.8b, v6.8b, v7.8b\n"
9030 "trn2 v1.8b, v6.8b, v7.8b\n"
9031 "trn1 v2.8b, v14.8b, v15.8b\n"
9032 "trn2 v3.8b, v14.8b, v15.8b\n"
9033 "uaddw v8.8h, v8.8h, v0.8b\n"
9034 "uaddw v9.8h, v9.8h, v1.8b\n"
9035 "uaddw v10.8h, v10.8h, v2.8b\n"
9036 "uaddw v11.8h, v11.8h, v3.8b\n"
9037 "uaddw v12.8h, v12.8h, v16.8b\n"
9038 "uaddw v13.8h, v13.8h, v17.8b\n"
9039 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
9040 "st1 {v16.2s, v17.2s}, [%x[out]], #16\n"
9041
9042 "bne 1b\n"
9043
9044 // Aggregator Reduction.
9045 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
9046 "dup v1.4s, %w[additive_sum_offset]\n"
9047 "uaddlp v8.4s, v8.8h\n"
9048 "uaddlp v9.4s, v9.8h\n"
9049 "uaddlp v10.4s, v10.8h\n"
9050 "uaddlp v11.4s, v11.8h\n"
9051 "uaddlp v12.4s, v12.8h\n"
9052 "uaddlp v13.4s, v13.8h\n"
9053 "addp v8.4s, v8.4s, v9.4s\n"
9054 "addp v10.4s, v10.4s, v11.4s\n"
9055 "addp v12.4s, v12.4s, v13.4s\n"
9056 "addp v8.4s, v8.4s, v10.4s\n"
9057 "addp v9.4s, v12.4s, v12.4s\n"
9058 "mul v8.4s, v8.4s, v0.s[0]\n"
9059 "mul v9.4s, v9.4s, v0.s[0]\n"
9060 "add v8.4s, v8.4s, v1.4s\n"
9061 "add v9.4s, v9.4s, v1.4s\n"
9062 "st1 {v8.4s, v9.4s}, [%x[out]]\n"
9063 : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
9064 [out] "+r"(out), [in] "+r"(in)
9065 : [additive_sum_offset] "r"(params.additive_sum_offset),
9066 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
9067 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
9068 "v11", "v12", "v13", "v14", "v15", "v16", "v17", "cc", "memory");
9069 }
9070
9071 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)9072 inline void Stream<uint8_t, 6, 8, 1, ColumnMajorWithSum>::Pack(
9073 const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
9074 #ifdef DEBUG
9075 #ifdef DEBUG_METAGEMM_VERBOSE
9076 std::cout
9077 << __FILE__ << "(" << __LINE__
9078 << ") ColumnMajorWithSum<uint8_t, 6, 8, 1, ColumnMajorWithSum>::Pack()"
9079 << std::endl
9080 << std::flush;
9081 #endif
9082 #endif
9083 int params_count_copy = params.count;
9084 int params_stride_copy = params.stride;
9085 asm volatile(
9086 "sub %x[stride], %x[stride], #4\n"
9087 "movi v8.8h, #0\n"
9088 "movi v9.8h, #0\n"
9089 "movi v10.8h, #0\n"
9090 "movi v11.8h, #0\n"
9091 "movi v12.8h, #0\n"
9092 "movi v13.8h, #0\n"
9093
9094 // Reduce count by leftovers.
9095 "subs %x[count], %x[count], #1\n"
9096 "beq 2f\n"
9097
9098 "1:"
9099 "subs %x[count], %x[count], #8\n"
9100
9101 // Load Aggregate Store - column major 6x8
9102 "ld1 {v0.s}[0], [%x[in]], #4\n"
9103 "ld1 {v4.h}[0], [%x[in]], %x[stride]\n"
9104 "ld1 {v1.s}[0], [%x[in]], #4\n"
9105 "ld1 {v4.h}[1], [%x[in]], %x[stride]\n"
9106 "ld1 {v2.s}[0], [%x[in]], #4\n"
9107 "ld1 {v4.h}[2], [%x[in]], %x[stride]\n"
9108 "ld1 {v3.s}[0], [%x[in]], #4\n"
9109 "ld1 {v4.h}[3], [%x[in]], %x[stride]\n"
9110 "ld1 {v0.s}[1], [%x[in]], #4\n"
9111 "ld1 {v5.h}[0], [%x[in]], %x[stride]\n"
9112 "ld1 {v1.s}[1], [%x[in]], #4\n"
9113 "ld1 {v5.h}[1], [%x[in]], %x[stride]\n"
9114 "ld1 {v2.s}[1], [%x[in]], #4\n"
9115 "ld1 {v5.h}[2], [%x[in]], %x[stride]\n"
9116 "ld1 {v3.s}[1], [%x[in]], #4\n"
9117 "ld1 {v5.h}[3], [%x[in]], %x[stride]\n"
9118 "prfm pldl1keep, [%x[in]]\n"
9119 "trn1 v6.4h, v0.4h, v2.4h\n"
9120 "trn2 v14.4h, v0.4h, v2.4h\n"
9121 "trn1 v7.4h, v1.4h, v3.4h\n"
9122 "trn2 v15.4h, v1.4h, v3.4h\n"
9123 "uzp1 v16.8b, v4.8b, v5.8b\n"
9124 "uzp2 v17.8b, v4.8b, v5.8b\n"
9125 "trn1 v0.8b, v6.8b, v7.8b\n"
9126 "trn2 v1.8b, v6.8b, v7.8b\n"
9127 "trn1 v2.8b, v14.8b, v15.8b\n"
9128 "trn2 v3.8b, v14.8b, v15.8b\n"
9129 "uaddw v8.8h, v8.8h, v0.8b\n"
9130 "uaddw v9.8h, v9.8h, v1.8b\n"
9131 "uaddw v10.8h, v10.8h, v2.8b\n"
9132 "uaddw v11.8h, v11.8h, v3.8b\n"
9133 "uaddw v12.8h, v12.8h, v16.8b\n"
9134 "uaddw v13.8h, v13.8h, v17.8b\n"
9135 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
9136 "st1 {v16.2s, v17.2s}, [%x[out]], #16\n"
9137
9138 "bne 1b\n"
9139
9140 "2:"
9141
9142 // Load Aggregate Store - column major 6x1
9143 "movi v0.8b, #0\n"
9144 "movi v1.8b, #0\n"
9145 "movi v2.8b, #0\n"
9146 "movi v3.8b, #0\n"
9147 "movi v4.8b, #0\n"
9148 "movi v5.8b, #0\n"
9149 "ld1 {v0.s}[0], [%x[in]], #4\n"
9150 "ld1 {v4.h}[0], [%x[in]], %x[stride]\n"
9151 "prfm pldl1keep, [%x[in]]\n"
9152 "trn1 v6.4h, v0.4h, v2.4h\n"
9153 "trn2 v14.4h, v0.4h, v2.4h\n"
9154 "trn1 v7.4h, v1.4h, v3.4h\n"
9155 "trn2 v15.4h, v1.4h, v3.4h\n"
9156 "uzp1 v16.8b, v4.8b, v5.8b\n"
9157 "uzp2 v17.8b, v4.8b, v5.8b\n"
9158 "trn1 v0.8b, v6.8b, v7.8b\n"
9159 "trn2 v1.8b, v6.8b, v7.8b\n"
9160 "trn1 v2.8b, v14.8b, v15.8b\n"
9161 "trn2 v3.8b, v14.8b, v15.8b\n"
9162 "uaddw v8.8h, v8.8h, v0.8b\n"
9163 "uaddw v9.8h, v9.8h, v1.8b\n"
9164 "uaddw v10.8h, v10.8h, v2.8b\n"
9165 "uaddw v11.8h, v11.8h, v3.8b\n"
9166 "uaddw v12.8h, v12.8h, v16.8b\n"
9167 "uaddw v13.8h, v13.8h, v17.8b\n"
9168 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
9169 "st1 {v16.2s, v17.2s}, [%x[out]], #16\n"
9170
9171 // Aggregator Reduction.
9172 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
9173 "dup v1.4s, %w[additive_sum_offset]\n"
9174 "uaddlp v8.4s, v8.8h\n"
9175 "uaddlp v9.4s, v9.8h\n"
9176 "uaddlp v10.4s, v10.8h\n"
9177 "uaddlp v11.4s, v11.8h\n"
9178 "uaddlp v12.4s, v12.8h\n"
9179 "uaddlp v13.4s, v13.8h\n"
9180 "addp v8.4s, v8.4s, v9.4s\n"
9181 "addp v10.4s, v10.4s, v11.4s\n"
9182 "addp v12.4s, v12.4s, v13.4s\n"
9183 "addp v8.4s, v8.4s, v10.4s\n"
9184 "addp v9.4s, v12.4s, v12.4s\n"
9185 "mul v8.4s, v8.4s, v0.s[0]\n"
9186 "mul v9.4s, v9.4s, v0.s[0]\n"
9187 "add v8.4s, v8.4s, v1.4s\n"
9188 "add v9.4s, v9.4s, v1.4s\n"
9189 "st1 {v8.4s, v9.4s}, [%x[out]]\n"
9190 : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
9191 [out] "+r"(out), [in] "+r"(in)
9192 : [additive_sum_offset] "r"(params.additive_sum_offset),
9193 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
9194 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
9195 "v11", "v12", "v13", "v14", "v15", "v16", "v17", "cc", "memory");
9196 }
9197
9198 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)9199 inline void Stream<uint8_t, 6, 8, 2, ColumnMajorWithSum>::Pack(
9200 const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
9201 #ifdef DEBUG
9202 #ifdef DEBUG_METAGEMM_VERBOSE
9203 std::cout
9204 << __FILE__ << "(" << __LINE__
9205 << ") ColumnMajorWithSum<uint8_t, 6, 8, 2, ColumnMajorWithSum>::Pack()"
9206 << std::endl
9207 << std::flush;
9208 #endif
9209 #endif
9210 int params_count_copy = params.count;
9211 int params_stride_copy = params.stride;
9212 asm volatile(
9213 "sub %x[stride], %x[stride], #4\n"
9214 "movi v8.8h, #0\n"
9215 "movi v9.8h, #0\n"
9216 "movi v10.8h, #0\n"
9217 "movi v11.8h, #0\n"
9218 "movi v12.8h, #0\n"
9219 "movi v13.8h, #0\n"
9220
9221 // Reduce count by leftovers.
9222 "subs %x[count], %x[count], #2\n"
9223 "beq 2f\n"
9224
9225 "1:"
9226 "subs %x[count], %x[count], #8\n"
9227
9228 // Load Aggregate Store - column major 6x8
9229 "ld1 {v0.s}[0], [%x[in]], #4\n"
9230 "ld1 {v4.h}[0], [%x[in]], %x[stride]\n"
9231 "ld1 {v1.s}[0], [%x[in]], #4\n"
9232 "ld1 {v4.h}[1], [%x[in]], %x[stride]\n"
9233 "ld1 {v2.s}[0], [%x[in]], #4\n"
9234 "ld1 {v4.h}[2], [%x[in]], %x[stride]\n"
9235 "ld1 {v3.s}[0], [%x[in]], #4\n"
9236 "ld1 {v4.h}[3], [%x[in]], %x[stride]\n"
9237 "ld1 {v0.s}[1], [%x[in]], #4\n"
9238 "ld1 {v5.h}[0], [%x[in]], %x[stride]\n"
9239 "ld1 {v1.s}[1], [%x[in]], #4\n"
9240 "ld1 {v5.h}[1], [%x[in]], %x[stride]\n"
9241 "ld1 {v2.s}[1], [%x[in]], #4\n"
9242 "ld1 {v5.h}[2], [%x[in]], %x[stride]\n"
9243 "ld1 {v3.s}[1], [%x[in]], #4\n"
9244 "ld1 {v5.h}[3], [%x[in]], %x[stride]\n"
9245 "prfm pldl1keep, [%x[in]]\n"
9246 "trn1 v6.4h, v0.4h, v2.4h\n"
9247 "trn2 v14.4h, v0.4h, v2.4h\n"
9248 "trn1 v7.4h, v1.4h, v3.4h\n"
9249 "trn2 v15.4h, v1.4h, v3.4h\n"
9250 "uzp1 v16.8b, v4.8b, v5.8b\n"
9251 "uzp2 v17.8b, v4.8b, v5.8b\n"
9252 "trn1 v0.8b, v6.8b, v7.8b\n"
9253 "trn2 v1.8b, v6.8b, v7.8b\n"
9254 "trn1 v2.8b, v14.8b, v15.8b\n"
9255 "trn2 v3.8b, v14.8b, v15.8b\n"
9256 "uaddw v8.8h, v8.8h, v0.8b\n"
9257 "uaddw v9.8h, v9.8h, v1.8b\n"
9258 "uaddw v10.8h, v10.8h, v2.8b\n"
9259 "uaddw v11.8h, v11.8h, v3.8b\n"
9260 "uaddw v12.8h, v12.8h, v16.8b\n"
9261 "uaddw v13.8h, v13.8h, v17.8b\n"
9262 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
9263 "st1 {v16.2s, v17.2s}, [%x[out]], #16\n"
9264
9265 "bne 1b\n"
9266
9267 "2:"
9268
9269 // Load Aggregate Store - column major 6x2
9270 "movi v0.8b, #0\n"
9271 "movi v1.8b, #0\n"
9272 "movi v2.8b, #0\n"
9273 "movi v3.8b, #0\n"
9274 "movi v4.8b, #0\n"
9275 "movi v5.8b, #0\n"
9276 "ld1 {v0.s}[0], [%x[in]], #4\n"
9277 "ld1 {v4.h}[0], [%x[in]], %x[stride]\n"
9278 "ld1 {v1.s}[0], [%x[in]], #4\n"
9279 "ld1 {v4.h}[1], [%x[in]], %x[stride]\n"
9280 "prfm pldl1keep, [%x[in]]\n"
9281 "trn1 v6.4h, v0.4h, v2.4h\n"
9282 "trn2 v14.4h, v0.4h, v2.4h\n"
9283 "trn1 v7.4h, v1.4h, v3.4h\n"
9284 "trn2 v15.4h, v1.4h, v3.4h\n"
9285 "uzp1 v16.8b, v4.8b, v5.8b\n"
9286 "uzp2 v17.8b, v4.8b, v5.8b\n"
9287 "trn1 v0.8b, v6.8b, v7.8b\n"
9288 "trn2 v1.8b, v6.8b, v7.8b\n"
9289 "trn1 v2.8b, v14.8b, v15.8b\n"
9290 "trn2 v3.8b, v14.8b, v15.8b\n"
9291 "uaddw v8.8h, v8.8h, v0.8b\n"
9292 "uaddw v9.8h, v9.8h, v1.8b\n"
9293 "uaddw v10.8h, v10.8h, v2.8b\n"
9294 "uaddw v11.8h, v11.8h, v3.8b\n"
9295 "uaddw v12.8h, v12.8h, v16.8b\n"
9296 "uaddw v13.8h, v13.8h, v17.8b\n"
9297 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
9298 "st1 {v16.2s, v17.2s}, [%x[out]], #16\n"
9299
9300 // Aggregator Reduction.
9301 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
9302 "dup v1.4s, %w[additive_sum_offset]\n"
9303 "uaddlp v8.4s, v8.8h\n"
9304 "uaddlp v9.4s, v9.8h\n"
9305 "uaddlp v10.4s, v10.8h\n"
9306 "uaddlp v11.4s, v11.8h\n"
9307 "uaddlp v12.4s, v12.8h\n"
9308 "uaddlp v13.4s, v13.8h\n"
9309 "addp v8.4s, v8.4s, v9.4s\n"
9310 "addp v10.4s, v10.4s, v11.4s\n"
9311 "addp v12.4s, v12.4s, v13.4s\n"
9312 "addp v8.4s, v8.4s, v10.4s\n"
9313 "addp v9.4s, v12.4s, v12.4s\n"
9314 "mul v8.4s, v8.4s, v0.s[0]\n"
9315 "mul v9.4s, v9.4s, v0.s[0]\n"
9316 "add v8.4s, v8.4s, v1.4s\n"
9317 "add v9.4s, v9.4s, v1.4s\n"
9318 "st1 {v8.4s, v9.4s}, [%x[out]]\n"
9319 : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
9320 [out] "+r"(out), [in] "+r"(in)
9321 : [additive_sum_offset] "r"(params.additive_sum_offset),
9322 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
9323 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
9324 "v11", "v12", "v13", "v14", "v15", "v16", "v17", "cc", "memory");
9325 }
9326
9327 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)9328 inline void Stream<uint8_t, 6, 8, 3, ColumnMajorWithSum>::Pack(
9329 const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
9330 #ifdef DEBUG
9331 #ifdef DEBUG_METAGEMM_VERBOSE
9332 std::cout
9333 << __FILE__ << "(" << __LINE__
9334 << ") ColumnMajorWithSum<uint8_t, 6, 8, 3, ColumnMajorWithSum>::Pack()"
9335 << std::endl
9336 << std::flush;
9337 #endif
9338 #endif
9339 int params_count_copy = params.count;
9340 int params_stride_copy = params.stride;
9341 asm volatile(
9342 "sub %x[stride], %x[stride], #4\n"
9343 "movi v8.8h, #0\n"
9344 "movi v9.8h, #0\n"
9345 "movi v10.8h, #0\n"
9346 "movi v11.8h, #0\n"
9347 "movi v12.8h, #0\n"
9348 "movi v13.8h, #0\n"
9349
9350 // Reduce count by leftovers.
9351 "subs %x[count], %x[count], #3\n"
9352 "beq 2f\n"
9353
9354 "1:"
9355 "subs %x[count], %x[count], #8\n"
9356
9357 // Load Aggregate Store - column major 6x8
9358 "ld1 {v0.s}[0], [%x[in]], #4\n"
9359 "ld1 {v4.h}[0], [%x[in]], %x[stride]\n"
9360 "ld1 {v1.s}[0], [%x[in]], #4\n"
9361 "ld1 {v4.h}[1], [%x[in]], %x[stride]\n"
9362 "ld1 {v2.s}[0], [%x[in]], #4\n"
9363 "ld1 {v4.h}[2], [%x[in]], %x[stride]\n"
9364 "ld1 {v3.s}[0], [%x[in]], #4\n"
9365 "ld1 {v4.h}[3], [%x[in]], %x[stride]\n"
9366 "ld1 {v0.s}[1], [%x[in]], #4\n"
9367 "ld1 {v5.h}[0], [%x[in]], %x[stride]\n"
9368 "ld1 {v1.s}[1], [%x[in]], #4\n"
9369 "ld1 {v5.h}[1], [%x[in]], %x[stride]\n"
9370 "ld1 {v2.s}[1], [%x[in]], #4\n"
9371 "ld1 {v5.h}[2], [%x[in]], %x[stride]\n"
9372 "ld1 {v3.s}[1], [%x[in]], #4\n"
9373 "ld1 {v5.h}[3], [%x[in]], %x[stride]\n"
9374 "prfm pldl1keep, [%x[in]]\n"
9375 "trn1 v6.4h, v0.4h, v2.4h\n"
9376 "trn2 v14.4h, v0.4h, v2.4h\n"
9377 "trn1 v7.4h, v1.4h, v3.4h\n"
9378 "trn2 v15.4h, v1.4h, v3.4h\n"
9379 "uzp1 v16.8b, v4.8b, v5.8b\n"
9380 "uzp2 v17.8b, v4.8b, v5.8b\n"
9381 "trn1 v0.8b, v6.8b, v7.8b\n"
9382 "trn2 v1.8b, v6.8b, v7.8b\n"
9383 "trn1 v2.8b, v14.8b, v15.8b\n"
9384 "trn2 v3.8b, v14.8b, v15.8b\n"
9385 "uaddw v8.8h, v8.8h, v0.8b\n"
9386 "uaddw v9.8h, v9.8h, v1.8b\n"
9387 "uaddw v10.8h, v10.8h, v2.8b\n"
9388 "uaddw v11.8h, v11.8h, v3.8b\n"
9389 "uaddw v12.8h, v12.8h, v16.8b\n"
9390 "uaddw v13.8h, v13.8h, v17.8b\n"
9391 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
9392 "st1 {v16.2s, v17.2s}, [%x[out]], #16\n"
9393
9394 "bne 1b\n"
9395
9396 "2:"
9397
9398 // Load Aggregate Store - column major 6x3
9399 "movi v0.8b, #0\n"
9400 "movi v1.8b, #0\n"
9401 "movi v2.8b, #0\n"
9402 "movi v3.8b, #0\n"
9403 "movi v4.8b, #0\n"
9404 "movi v5.8b, #0\n"
9405 "ld1 {v0.s}[0], [%x[in]], #4\n"
9406 "ld1 {v4.h}[0], [%x[in]], %x[stride]\n"
9407 "ld1 {v1.s}[0], [%x[in]], #4\n"
9408 "ld1 {v4.h}[1], [%x[in]], %x[stride]\n"
9409 "ld1 {v2.s}[0], [%x[in]], #4\n"
9410 "ld1 {v4.h}[2], [%x[in]], %x[stride]\n"
9411 "prfm pldl1keep, [%x[in]]\n"
9412 "trn1 v6.4h, v0.4h, v2.4h\n"
9413 "trn2 v14.4h, v0.4h, v2.4h\n"
9414 "trn1 v7.4h, v1.4h, v3.4h\n"
9415 "trn2 v15.4h, v1.4h, v3.4h\n"
9416 "uzp1 v16.8b, v4.8b, v5.8b\n"
9417 "uzp2 v17.8b, v4.8b, v5.8b\n"
9418 "trn1 v0.8b, v6.8b, v7.8b\n"
9419 "trn2 v1.8b, v6.8b, v7.8b\n"
9420 "trn1 v2.8b, v14.8b, v15.8b\n"
9421 "trn2 v3.8b, v14.8b, v15.8b\n"
9422 "uaddw v8.8h, v8.8h, v0.8b\n"
9423 "uaddw v9.8h, v9.8h, v1.8b\n"
9424 "uaddw v10.8h, v10.8h, v2.8b\n"
9425 "uaddw v11.8h, v11.8h, v3.8b\n"
9426 "uaddw v12.8h, v12.8h, v16.8b\n"
9427 "uaddw v13.8h, v13.8h, v17.8b\n"
9428 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
9429 "st1 {v16.2s, v17.2s}, [%x[out]], #16\n"
9430
9431 // Aggregator Reduction.
9432 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
9433 "dup v1.4s, %w[additive_sum_offset]\n"
9434 "uaddlp v8.4s, v8.8h\n"
9435 "uaddlp v9.4s, v9.8h\n"
9436 "uaddlp v10.4s, v10.8h\n"
9437 "uaddlp v11.4s, v11.8h\n"
9438 "uaddlp v12.4s, v12.8h\n"
9439 "uaddlp v13.4s, v13.8h\n"
9440 "addp v8.4s, v8.4s, v9.4s\n"
9441 "addp v10.4s, v10.4s, v11.4s\n"
9442 "addp v12.4s, v12.4s, v13.4s\n"
9443 "addp v8.4s, v8.4s, v10.4s\n"
9444 "addp v9.4s, v12.4s, v12.4s\n"
9445 "mul v8.4s, v8.4s, v0.s[0]\n"
9446 "mul v9.4s, v9.4s, v0.s[0]\n"
9447 "add v8.4s, v8.4s, v1.4s\n"
9448 "add v9.4s, v9.4s, v1.4s\n"
9449 "st1 {v8.4s, v9.4s}, [%x[out]]\n"
9450 : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
9451 [out] "+r"(out), [in] "+r"(in)
9452 : [additive_sum_offset] "r"(params.additive_sum_offset),
9453 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
9454 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
9455 "v11", "v12", "v13", "v14", "v15", "v16", "v17", "cc", "memory");
9456 }
9457
9458 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)9459 inline void Stream<uint8_t, 6, 8, 4, ColumnMajorWithSum>::Pack(
9460 const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
9461 #ifdef DEBUG
9462 #ifdef DEBUG_METAGEMM_VERBOSE
9463 std::cout
9464 << __FILE__ << "(" << __LINE__
9465 << ") ColumnMajorWithSum<uint8_t, 6, 8, 4, ColumnMajorWithSum>::Pack()"
9466 << std::endl
9467 << std::flush;
9468 #endif
9469 #endif
9470 int params_count_copy = params.count;
9471 int params_stride_copy = params.stride;
9472 asm volatile(
9473 "sub %x[stride], %x[stride], #4\n"
9474 "movi v8.8h, #0\n"
9475 "movi v9.8h, #0\n"
9476 "movi v10.8h, #0\n"
9477 "movi v11.8h, #0\n"
9478 "movi v12.8h, #0\n"
9479 "movi v13.8h, #0\n"
9480
9481 // Reduce count by leftovers.
9482 "subs %x[count], %x[count], #4\n"
9483 "beq 2f\n"
9484
9485 "1:"
9486 "subs %x[count], %x[count], #8\n"
9487
9488 // Load Aggregate Store - column major 6x8
9489 "ld1 {v0.s}[0], [%x[in]], #4\n"
9490 "ld1 {v4.h}[0], [%x[in]], %x[stride]\n"
9491 "ld1 {v1.s}[0], [%x[in]], #4\n"
9492 "ld1 {v4.h}[1], [%x[in]], %x[stride]\n"
9493 "ld1 {v2.s}[0], [%x[in]], #4\n"
9494 "ld1 {v4.h}[2], [%x[in]], %x[stride]\n"
9495 "ld1 {v3.s}[0], [%x[in]], #4\n"
9496 "ld1 {v4.h}[3], [%x[in]], %x[stride]\n"
9497 "ld1 {v0.s}[1], [%x[in]], #4\n"
9498 "ld1 {v5.h}[0], [%x[in]], %x[stride]\n"
9499 "ld1 {v1.s}[1], [%x[in]], #4\n"
9500 "ld1 {v5.h}[1], [%x[in]], %x[stride]\n"
9501 "ld1 {v2.s}[1], [%x[in]], #4\n"
9502 "ld1 {v5.h}[2], [%x[in]], %x[stride]\n"
9503 "ld1 {v3.s}[1], [%x[in]], #4\n"
9504 "ld1 {v5.h}[3], [%x[in]], %x[stride]\n"
9505 "prfm pldl1keep, [%x[in]]\n"
9506 "trn1 v6.4h, v0.4h, v2.4h\n"
9507 "trn2 v14.4h, v0.4h, v2.4h\n"
9508 "trn1 v7.4h, v1.4h, v3.4h\n"
9509 "trn2 v15.4h, v1.4h, v3.4h\n"
9510 "uzp1 v16.8b, v4.8b, v5.8b\n"
9511 "uzp2 v17.8b, v4.8b, v5.8b\n"
9512 "trn1 v0.8b, v6.8b, v7.8b\n"
9513 "trn2 v1.8b, v6.8b, v7.8b\n"
9514 "trn1 v2.8b, v14.8b, v15.8b\n"
9515 "trn2 v3.8b, v14.8b, v15.8b\n"
9516 "uaddw v8.8h, v8.8h, v0.8b\n"
9517 "uaddw v9.8h, v9.8h, v1.8b\n"
9518 "uaddw v10.8h, v10.8h, v2.8b\n"
9519 "uaddw v11.8h, v11.8h, v3.8b\n"
9520 "uaddw v12.8h, v12.8h, v16.8b\n"
9521 "uaddw v13.8h, v13.8h, v17.8b\n"
9522 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
9523 "st1 {v16.2s, v17.2s}, [%x[out]], #16\n"
9524
9525 "bne 1b\n"
9526
9527 "2:"
9528
9529 // Load Aggregate Store - column major 6x4
9530 "movi v0.8b, #0\n"
9531 "movi v1.8b, #0\n"
9532 "movi v2.8b, #0\n"
9533 "movi v3.8b, #0\n"
9534 "movi v4.8b, #0\n"
9535 "movi v5.8b, #0\n"
9536 "ld1 {v0.s}[0], [%x[in]], #4\n"
9537 "ld1 {v4.h}[0], [%x[in]], %x[stride]\n"
9538 "ld1 {v1.s}[0], [%x[in]], #4\n"
9539 "ld1 {v4.h}[1], [%x[in]], %x[stride]\n"
9540 "ld1 {v2.s}[0], [%x[in]], #4\n"
9541 "ld1 {v4.h}[2], [%x[in]], %x[stride]\n"
9542 "ld1 {v3.s}[0], [%x[in]], #4\n"
9543 "ld1 {v4.h}[3], [%x[in]], %x[stride]\n"
9544 "prfm pldl1keep, [%x[in]]\n"
9545 "trn1 v6.4h, v0.4h, v2.4h\n"
9546 "trn2 v14.4h, v0.4h, v2.4h\n"
9547 "trn1 v7.4h, v1.4h, v3.4h\n"
9548 "trn2 v15.4h, v1.4h, v3.4h\n"
9549 "uzp1 v16.8b, v4.8b, v5.8b\n"
9550 "uzp2 v17.8b, v4.8b, v5.8b\n"
9551 "trn1 v0.8b, v6.8b, v7.8b\n"
9552 "trn2 v1.8b, v6.8b, v7.8b\n"
9553 "trn1 v2.8b, v14.8b, v15.8b\n"
9554 "trn2 v3.8b, v14.8b, v15.8b\n"
9555 "uaddw v8.8h, v8.8h, v0.8b\n"
9556 "uaddw v9.8h, v9.8h, v1.8b\n"
9557 "uaddw v10.8h, v10.8h, v2.8b\n"
9558 "uaddw v11.8h, v11.8h, v3.8b\n"
9559 "uaddw v12.8h, v12.8h, v16.8b\n"
9560 "uaddw v13.8h, v13.8h, v17.8b\n"
9561 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
9562 "st1 {v16.2s, v17.2s}, [%x[out]], #16\n"
9563
9564 // Aggregator Reduction.
9565 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
9566 "dup v1.4s, %w[additive_sum_offset]\n"
9567 "uaddlp v8.4s, v8.8h\n"
9568 "uaddlp v9.4s, v9.8h\n"
9569 "uaddlp v10.4s, v10.8h\n"
9570 "uaddlp v11.4s, v11.8h\n"
9571 "uaddlp v12.4s, v12.8h\n"
9572 "uaddlp v13.4s, v13.8h\n"
9573 "addp v8.4s, v8.4s, v9.4s\n"
9574 "addp v10.4s, v10.4s, v11.4s\n"
9575 "addp v12.4s, v12.4s, v13.4s\n"
9576 "addp v8.4s, v8.4s, v10.4s\n"
9577 "addp v9.4s, v12.4s, v12.4s\n"
9578 "mul v8.4s, v8.4s, v0.s[0]\n"
9579 "mul v9.4s, v9.4s, v0.s[0]\n"
9580 "add v8.4s, v8.4s, v1.4s\n"
9581 "add v9.4s, v9.4s, v1.4s\n"
9582 "st1 {v8.4s, v9.4s}, [%x[out]]\n"
9583 : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
9584 [out] "+r"(out), [in] "+r"(in)
9585 : [additive_sum_offset] "r"(params.additive_sum_offset),
9586 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
9587 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
9588 "v11", "v12", "v13", "v14", "v15", "v16", "v17", "cc", "memory");
9589 }
9590
// Packs a column-major uint8 source into 6 lanes of 8-byte groups and appends
// per-lane sums. This specialization runs the 8-step main loop and then a
// final block of 5 leftover steps.
//
// Each step reads 6 consecutive bytes at `in` (a 4-byte load followed by a
// 2-byte load) and advances `in` by params.stride in total. Eight steps are
// transposed into six 8-byte vectors, one per byte offset 0..5, and written
// to `out` as 48 bytes. The leftover block uses the same layout with the three
// unused step slots left as zero.
//
// After the packed data, eight 32-bit words are stored at `out`: for each of
// the 6 byte offsets, (sum of its bytes) * params.multiplicative_sum_offset +
// params.additive_sum_offset. Words 6 and 7 repeat words 4 and 5.
//
// Requires params.count - 5 to be a non-negative multiple of 8: the main loop
// only exits when the remaining count reaches exactly zero.
// NOTE(review): sums are accumulated in 16-bit lanes before being widened, so
// very large counts could wrap - confirm the bound callers guarantee.
// NOTE(review): count/stride are `int` copies but are accessed through 64-bit
// %x register names; this assumes their upper 32 bits are well-defined -
// confirm.
9591 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)9592 inline void Stream<uint8_t, 6, 8, 5, ColumnMajorWithSum>::Pack(
9593 const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
9594 #ifdef DEBUG
9595 #ifdef DEBUG_METAGEMM_VERBOSE
9596 std::cout
9597 << __FILE__ << "(" << __LINE__
9598 << ") ColumnMajorWithSum<uint8_t, 6, 8, 5, ColumnMajorWithSum>::Pack()"
9599 << std::endl
9600 << std::flush;
9601 #endif
9602 #endif
// Local copies: the asm below modifies both values (they are "+r" operands).
9603 int params_count_copy = params.count;
9604 int params_stride_copy = params.stride;
9605 asm volatile(
// Every step post-increments `in` by 4 and then by this register, so
// pre-subtracting 4 makes the net advance per step equal to the stride.
9606 "sub %x[stride], %x[stride], #4\n"
// v8..v13: one running-sum accumulator (8 x 16-bit) per output lane.
9607 "movi v8.8h, #0\n"
9608 "movi v9.8h, #0\n"
9609 "movi v10.8h, #0\n"
9610 "movi v11.8h, #0\n"
9611 "movi v12.8h, #0\n"
9612 "movi v13.8h, #0\n"
9613
9614 // Reduce count by leftovers.
9615 "subs %x[count], %x[count], #5\n"
// Nothing left for the main loop: go straight to the leftover block.
9616 "beq 2f\n"
9617
9618 "1:"
9619 "subs %x[count], %x[count], #8\n"
9620
9621 // Load Aggregate Store - column major 6x8
// Step k puts its first 4 bytes in a 32-bit lane of v0..v3 and its last
// 2 bytes in a 16-bit lane of v4 (steps 0..3) or v5 (steps 4..7).
9622 "ld1 {v0.s}[0], [%x[in]], #4\n"
9623 "ld1 {v4.h}[0], [%x[in]], %x[stride]\n"
9624 "ld1 {v1.s}[0], [%x[in]], #4\n"
9625 "ld1 {v4.h}[1], [%x[in]], %x[stride]\n"
9626 "ld1 {v2.s}[0], [%x[in]], #4\n"
9627 "ld1 {v4.h}[2], [%x[in]], %x[stride]\n"
9628 "ld1 {v3.s}[0], [%x[in]], #4\n"
9629 "ld1 {v4.h}[3], [%x[in]], %x[stride]\n"
9630 "ld1 {v0.s}[1], [%x[in]], #4\n"
9631 "ld1 {v5.h}[0], [%x[in]], %x[stride]\n"
9632 "ld1 {v1.s}[1], [%x[in]], #4\n"
9633 "ld1 {v5.h}[1], [%x[in]], %x[stride]\n"
9634 "ld1 {v2.s}[1], [%x[in]], #4\n"
9635 "ld1 {v5.h}[2], [%x[in]], %x[stride]\n"
9636 "ld1 {v3.s}[1], [%x[in]], #4\n"
9637 "ld1 {v5.h}[3], [%x[in]], %x[stride]\n"
9638 "prfm pldl1keep, [%x[in]]\n"
// Transpose: the 16-bit then 8-bit trn1/trn2 passes leave v0..v3 holding
// byte 0..3 of the eight steps (in step order); uzp1/uzp2 split the 2-byte
// tails into byte 4 (v16) and byte 5 (v17).
9639 "trn1 v6.4h, v0.4h, v2.4h\n"
9640 "trn2 v14.4h, v0.4h, v2.4h\n"
9641 "trn1 v7.4h, v1.4h, v3.4h\n"
9642 "trn2 v15.4h, v1.4h, v3.4h\n"
9643 "uzp1 v16.8b, v4.8b, v5.8b\n"
9644 "uzp2 v17.8b, v4.8b, v5.8b\n"
9645 "trn1 v0.8b, v6.8b, v7.8b\n"
9646 "trn2 v1.8b, v6.8b, v7.8b\n"
9647 "trn1 v2.8b, v14.8b, v15.8b\n"
9648 "trn2 v3.8b, v14.8b, v15.8b\n"
// Widen each byte to 16 bits and add it to the matching lane accumulator.
9649 "uaddw v8.8h, v8.8h, v0.8b\n"
9650 "uaddw v9.8h, v9.8h, v1.8b\n"
9651 "uaddw v10.8h, v10.8h, v2.8b\n"
9652 "uaddw v11.8h, v11.8h, v3.8b\n"
9653 "uaddw v12.8h, v12.8h, v16.8b\n"
9654 "uaddw v13.8h, v13.8h, v17.8b\n"
// 6 lanes x 8 bytes = 48 bytes of packed output per block.
9655 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
9656 "st1 {v16.2s, v17.2s}, [%x[out]], #16\n"
9657
// Branches on the flags from the `subs` at the top of the loop; no
// instruction in between writes the condition flags.
9658 "bne 1b\n"
9659
9660 "2:"
9661
9662 // Load Aggregate Store - column major 6x5
// Clear the staging registers so the three step slots that are not loaded
// contribute zeros to both the packed output and the sums.
9663 "movi v0.8b, #0\n"
9664 "movi v1.8b, #0\n"
9665 "movi v2.8b, #0\n"
9666 "movi v3.8b, #0\n"
9667 "movi v4.8b, #0\n"
9668 "movi v5.8b, #0\n"
9669 "ld1 {v0.s}[0], [%x[in]], #4\n"
9670 "ld1 {v4.h}[0], [%x[in]], %x[stride]\n"
9671 "ld1 {v1.s}[0], [%x[in]], #4\n"
9672 "ld1 {v4.h}[1], [%x[in]], %x[stride]\n"
9673 "ld1 {v2.s}[0], [%x[in]], #4\n"
9674 "ld1 {v4.h}[2], [%x[in]], %x[stride]\n"
9675 "ld1 {v3.s}[0], [%x[in]], #4\n"
9676 "ld1 {v4.h}[3], [%x[in]], %x[stride]\n"
9677 "ld1 {v0.s}[1], [%x[in]], #4\n"
9678 "ld1 {v5.h}[0], [%x[in]], %x[stride]\n"
9679 "prfm pldl1keep, [%x[in]]\n"
9680 "trn1 v6.4h, v0.4h, v2.4h\n"
9681 "trn2 v14.4h, v0.4h, v2.4h\n"
9682 "trn1 v7.4h, v1.4h, v3.4h\n"
9683 "trn2 v15.4h, v1.4h, v3.4h\n"
9684 "uzp1 v16.8b, v4.8b, v5.8b\n"
9685 "uzp2 v17.8b, v4.8b, v5.8b\n"
9686 "trn1 v0.8b, v6.8b, v7.8b\n"
9687 "trn2 v1.8b, v6.8b, v7.8b\n"
9688 "trn1 v2.8b, v14.8b, v15.8b\n"
9689 "trn2 v3.8b, v14.8b, v15.8b\n"
9690 "uaddw v8.8h, v8.8h, v0.8b\n"
9691 "uaddw v9.8h, v9.8h, v1.8b\n"
9692 "uaddw v10.8h, v10.8h, v2.8b\n"
9693 "uaddw v11.8h, v11.8h, v3.8b\n"
9694 "uaddw v12.8h, v12.8h, v16.8b\n"
9695 "uaddw v13.8h, v13.8h, v17.8b\n"
9696 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
9697 "st1 {v16.2s, v17.2s}, [%x[out]], #16\n"
9698
9699 // Aggregator Reduction.
// v0.s[0] holds the multiplier and v1 the broadcast addend. uaddlp widens
// each accumulator to 32-bit pair sums; the addp tree then leaves
// v8 = {sum0, sum1, sum2, sum3} and v9 = {sum4, sum5, sum4, sum5}.
9700 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
9701 "dup v1.4s, %w[additive_sum_offset]\n"
9702 "uaddlp v8.4s, v8.8h\n"
9703 "uaddlp v9.4s, v9.8h\n"
9704 "uaddlp v10.4s, v10.8h\n"
9705 "uaddlp v11.4s, v11.8h\n"
9706 "uaddlp v12.4s, v12.8h\n"
9707 "uaddlp v13.4s, v13.8h\n"
9708 "addp v8.4s, v8.4s, v9.4s\n"
9709 "addp v10.4s, v10.4s, v11.4s\n"
9710 "addp v12.4s, v12.4s, v13.4s\n"
9711 "addp v8.4s, v8.4s, v10.4s\n"
9712 "addp v9.4s, v12.4s, v12.4s\n"
9713 "mul v8.4s, v8.4s, v0.s[0]\n"
9714 "mul v9.4s, v9.4s, v0.s[0]\n"
9715 "add v8.4s, v8.4s, v1.4s\n"
9716 "add v9.4s, v9.4s, v1.4s\n"
// The 32 bytes of sums land directly after the packed data; `out` is not
// advanced past them.
9717 "st1 {v8.4s, v9.4s}, [%x[out]]\n"
9718 : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
9719 [out] "+r"(out), [in] "+r"(in)
9720 : [additive_sum_offset] "r"(params.additive_sum_offset),
9721 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
9722 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
9723 "v11", "v12", "v13", "v14", "v15", "v16", "v17", "cc", "memory");
9724 }
9725
// Packs a column-major uint8 source into 6 lanes of 8-byte groups and appends
// per-lane sums. This specialization runs the 8-step main loop and then a
// final block of 6 leftover steps.
//
// Each step reads 6 consecutive bytes at `in` (a 4-byte load followed by a
// 2-byte load) and advances `in` by params.stride in total. Eight steps are
// transposed into six 8-byte vectors, one per byte offset 0..5, and written
// to `out` as 48 bytes. The leftover block uses the same layout with the two
// unused step slots left as zero.
//
// After the packed data, eight 32-bit words are stored at `out`: for each of
// the 6 byte offsets, (sum of its bytes) * params.multiplicative_sum_offset +
// params.additive_sum_offset. Words 6 and 7 repeat words 4 and 5.
//
// Requires params.count - 6 to be a non-negative multiple of 8: the main loop
// only exits when the remaining count reaches exactly zero.
// NOTE(review): sums are accumulated in 16-bit lanes before being widened, so
// very large counts could wrap - confirm the bound callers guarantee.
// NOTE(review): count/stride are `int` copies but are accessed through 64-bit
// %x register names; this assumes their upper 32 bits are well-defined -
// confirm.
9726 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)9727 inline void Stream<uint8_t, 6, 8, 6, ColumnMajorWithSum>::Pack(
9728 const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
9729 #ifdef DEBUG
9730 #ifdef DEBUG_METAGEMM_VERBOSE
9731 std::cout
9732 << __FILE__ << "(" << __LINE__
9733 << ") ColumnMajorWithSum<uint8_t, 6, 8, 6, ColumnMajorWithSum>::Pack()"
9734 << std::endl
9735 << std::flush;
9736 #endif
9737 #endif
// Local copies: the asm below modifies both values (they are "+r" operands).
9738 int params_count_copy = params.count;
9739 int params_stride_copy = params.stride;
9740 asm volatile(
// Every step post-increments `in` by 4 and then by this register, so
// pre-subtracting 4 makes the net advance per step equal to the stride.
9741 "sub %x[stride], %x[stride], #4\n"
// v8..v13: one running-sum accumulator (8 x 16-bit) per output lane.
9742 "movi v8.8h, #0\n"
9743 "movi v9.8h, #0\n"
9744 "movi v10.8h, #0\n"
9745 "movi v11.8h, #0\n"
9746 "movi v12.8h, #0\n"
9747 "movi v13.8h, #0\n"
9748
9749 // Reduce count by leftovers.
9750 "subs %x[count], %x[count], #6\n"
// Nothing left for the main loop: go straight to the leftover block.
9751 "beq 2f\n"
9752
9753 "1:"
9754 "subs %x[count], %x[count], #8\n"
9755
9756 // Load Aggregate Store - column major 6x8
// Step k puts its first 4 bytes in a 32-bit lane of v0..v3 and its last
// 2 bytes in a 16-bit lane of v4 (steps 0..3) or v5 (steps 4..7).
9757 "ld1 {v0.s}[0], [%x[in]], #4\n"
9758 "ld1 {v4.h}[0], [%x[in]], %x[stride]\n"
9759 "ld1 {v1.s}[0], [%x[in]], #4\n"
9760 "ld1 {v4.h}[1], [%x[in]], %x[stride]\n"
9761 "ld1 {v2.s}[0], [%x[in]], #4\n"
9762 "ld1 {v4.h}[2], [%x[in]], %x[stride]\n"
9763 "ld1 {v3.s}[0], [%x[in]], #4\n"
9764 "ld1 {v4.h}[3], [%x[in]], %x[stride]\n"
9765 "ld1 {v0.s}[1], [%x[in]], #4\n"
9766 "ld1 {v5.h}[0], [%x[in]], %x[stride]\n"
9767 "ld1 {v1.s}[1], [%x[in]], #4\n"
9768 "ld1 {v5.h}[1], [%x[in]], %x[stride]\n"
9769 "ld1 {v2.s}[1], [%x[in]], #4\n"
9770 "ld1 {v5.h}[2], [%x[in]], %x[stride]\n"
9771 "ld1 {v3.s}[1], [%x[in]], #4\n"
9772 "ld1 {v5.h}[3], [%x[in]], %x[stride]\n"
9773 "prfm pldl1keep, [%x[in]]\n"
// Transpose: the 16-bit then 8-bit trn1/trn2 passes leave v0..v3 holding
// byte 0..3 of the eight steps (in step order); uzp1/uzp2 split the 2-byte
// tails into byte 4 (v16) and byte 5 (v17).
9774 "trn1 v6.4h, v0.4h, v2.4h\n"
9775 "trn2 v14.4h, v0.4h, v2.4h\n"
9776 "trn1 v7.4h, v1.4h, v3.4h\n"
9777 "trn2 v15.4h, v1.4h, v3.4h\n"
9778 "uzp1 v16.8b, v4.8b, v5.8b\n"
9779 "uzp2 v17.8b, v4.8b, v5.8b\n"
9780 "trn1 v0.8b, v6.8b, v7.8b\n"
9781 "trn2 v1.8b, v6.8b, v7.8b\n"
9782 "trn1 v2.8b, v14.8b, v15.8b\n"
9783 "trn2 v3.8b, v14.8b, v15.8b\n"
// Widen each byte to 16 bits and add it to the matching lane accumulator.
9784 "uaddw v8.8h, v8.8h, v0.8b\n"
9785 "uaddw v9.8h, v9.8h, v1.8b\n"
9786 "uaddw v10.8h, v10.8h, v2.8b\n"
9787 "uaddw v11.8h, v11.8h, v3.8b\n"
9788 "uaddw v12.8h, v12.8h, v16.8b\n"
9789 "uaddw v13.8h, v13.8h, v17.8b\n"
// 6 lanes x 8 bytes = 48 bytes of packed output per block.
9790 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
9791 "st1 {v16.2s, v17.2s}, [%x[out]], #16\n"
9792
// Branches on the flags from the `subs` at the top of the loop; no
// instruction in between writes the condition flags.
9793 "bne 1b\n"
9794
9795 "2:"
9796
9797 // Load Aggregate Store - column major 6x6
// Clear the staging registers so the two step slots that are not loaded
// contribute zeros to both the packed output and the sums.
9798 "movi v0.8b, #0\n"
9799 "movi v1.8b, #0\n"
9800 "movi v2.8b, #0\n"
9801 "movi v3.8b, #0\n"
9802 "movi v4.8b, #0\n"
9803 "movi v5.8b, #0\n"
9804 "ld1 {v0.s}[0], [%x[in]], #4\n"
9805 "ld1 {v4.h}[0], [%x[in]], %x[stride]\n"
9806 "ld1 {v1.s}[0], [%x[in]], #4\n"
9807 "ld1 {v4.h}[1], [%x[in]], %x[stride]\n"
9808 "ld1 {v2.s}[0], [%x[in]], #4\n"
9809 "ld1 {v4.h}[2], [%x[in]], %x[stride]\n"
9810 "ld1 {v3.s}[0], [%x[in]], #4\n"
9811 "ld1 {v4.h}[3], [%x[in]], %x[stride]\n"
9812 "ld1 {v0.s}[1], [%x[in]], #4\n"
9813 "ld1 {v5.h}[0], [%x[in]], %x[stride]\n"
9814 "ld1 {v1.s}[1], [%x[in]], #4\n"
9815 "ld1 {v5.h}[1], [%x[in]], %x[stride]\n"
9816 "prfm pldl1keep, [%x[in]]\n"
9817 "trn1 v6.4h, v0.4h, v2.4h\n"
9818 "trn2 v14.4h, v0.4h, v2.4h\n"
9819 "trn1 v7.4h, v1.4h, v3.4h\n"
9820 "trn2 v15.4h, v1.4h, v3.4h\n"
9821 "uzp1 v16.8b, v4.8b, v5.8b\n"
9822 "uzp2 v17.8b, v4.8b, v5.8b\n"
9823 "trn1 v0.8b, v6.8b, v7.8b\n"
9824 "trn2 v1.8b, v6.8b, v7.8b\n"
9825 "trn1 v2.8b, v14.8b, v15.8b\n"
9826 "trn2 v3.8b, v14.8b, v15.8b\n"
9827 "uaddw v8.8h, v8.8h, v0.8b\n"
9828 "uaddw v9.8h, v9.8h, v1.8b\n"
9829 "uaddw v10.8h, v10.8h, v2.8b\n"
9830 "uaddw v11.8h, v11.8h, v3.8b\n"
9831 "uaddw v12.8h, v12.8h, v16.8b\n"
9832 "uaddw v13.8h, v13.8h, v17.8b\n"
9833 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
9834 "st1 {v16.2s, v17.2s}, [%x[out]], #16\n"
9835
9836 // Aggregator Reduction.
// v0.s[0] holds the multiplier and v1 the broadcast addend. uaddlp widens
// each accumulator to 32-bit pair sums; the addp tree then leaves
// v8 = {sum0, sum1, sum2, sum3} and v9 = {sum4, sum5, sum4, sum5}.
9837 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
9838 "dup v1.4s, %w[additive_sum_offset]\n"
9839 "uaddlp v8.4s, v8.8h\n"
9840 "uaddlp v9.4s, v9.8h\n"
9841 "uaddlp v10.4s, v10.8h\n"
9842 "uaddlp v11.4s, v11.8h\n"
9843 "uaddlp v12.4s, v12.8h\n"
9844 "uaddlp v13.4s, v13.8h\n"
9845 "addp v8.4s, v8.4s, v9.4s\n"
9846 "addp v10.4s, v10.4s, v11.4s\n"
9847 "addp v12.4s, v12.4s, v13.4s\n"
9848 "addp v8.4s, v8.4s, v10.4s\n"
9849 "addp v9.4s, v12.4s, v12.4s\n"
9850 "mul v8.4s, v8.4s, v0.s[0]\n"
9851 "mul v9.4s, v9.4s, v0.s[0]\n"
9852 "add v8.4s, v8.4s, v1.4s\n"
9853 "add v9.4s, v9.4s, v1.4s\n"
// The 32 bytes of sums land directly after the packed data; `out` is not
// advanced past them.
9854 "st1 {v8.4s, v9.4s}, [%x[out]]\n"
9855 : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
9856 [out] "+r"(out), [in] "+r"(in)
9857 : [additive_sum_offset] "r"(params.additive_sum_offset),
9858 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
9859 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
9860 "v11", "v12", "v13", "v14", "v15", "v16", "v17", "cc", "memory");
9861 }
9862
// Packs a column-major uint8 source into 6 lanes of 8-byte groups and appends
// per-lane sums. This specialization runs the 8-step main loop and then a
// final block of 7 leftover steps.
//
// Each step reads 6 consecutive bytes at `in` (a 4-byte load followed by a
// 2-byte load) and advances `in` by params.stride in total. Eight steps are
// transposed into six 8-byte vectors, one per byte offset 0..5, and written
// to `out` as 48 bytes. The leftover block uses the same layout with the one
// unused step slot left as zero.
//
// After the packed data, eight 32-bit words are stored at `out`: for each of
// the 6 byte offsets, (sum of its bytes) * params.multiplicative_sum_offset +
// params.additive_sum_offset. Words 6 and 7 repeat words 4 and 5.
//
// Requires params.count - 7 to be a non-negative multiple of 8: the main loop
// only exits when the remaining count reaches exactly zero.
// NOTE(review): sums are accumulated in 16-bit lanes before being widened, so
// very large counts could wrap - confirm the bound callers guarantee.
// NOTE(review): count/stride are `int` copies but are accessed through 64-bit
// %x register names; this assumes their upper 32 bits are well-defined -
// confirm.
9863 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)9864 inline void Stream<uint8_t, 6, 8, 7, ColumnMajorWithSum>::Pack(
9865 const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
9866 #ifdef DEBUG
9867 #ifdef DEBUG_METAGEMM_VERBOSE
9868 std::cout
9869 << __FILE__ << "(" << __LINE__
9870 << ") ColumnMajorWithSum<uint8_t, 6, 8, 7, ColumnMajorWithSum>::Pack()"
9871 << std::endl
9872 << std::flush;
9873 #endif
9874 #endif
// Local copies: the asm below modifies both values (they are "+r" operands).
9875 int params_count_copy = params.count;
9876 int params_stride_copy = params.stride;
9877 asm volatile(
// Every step post-increments `in` by 4 and then by this register, so
// pre-subtracting 4 makes the net advance per step equal to the stride.
9878 "sub %x[stride], %x[stride], #4\n"
// v8..v13: one running-sum accumulator (8 x 16-bit) per output lane.
9879 "movi v8.8h, #0\n"
9880 "movi v9.8h, #0\n"
9881 "movi v10.8h, #0\n"
9882 "movi v11.8h, #0\n"
9883 "movi v12.8h, #0\n"
9884 "movi v13.8h, #0\n"
9885
9886 // Reduce count by leftovers.
9887 "subs %x[count], %x[count], #7\n"
// Nothing left for the main loop: go straight to the leftover block.
9888 "beq 2f\n"
9889
9890 "1:"
9891 "subs %x[count], %x[count], #8\n"
9892
9893 // Load Aggregate Store - column major 6x8
// Step k puts its first 4 bytes in a 32-bit lane of v0..v3 and its last
// 2 bytes in a 16-bit lane of v4 (steps 0..3) or v5 (steps 4..7).
9894 "ld1 {v0.s}[0], [%x[in]], #4\n"
9895 "ld1 {v4.h}[0], [%x[in]], %x[stride]\n"
9896 "ld1 {v1.s}[0], [%x[in]], #4\n"
9897 "ld1 {v4.h}[1], [%x[in]], %x[stride]\n"
9898 "ld1 {v2.s}[0], [%x[in]], #4\n"
9899 "ld1 {v4.h}[2], [%x[in]], %x[stride]\n"
9900 "ld1 {v3.s}[0], [%x[in]], #4\n"
9901 "ld1 {v4.h}[3], [%x[in]], %x[stride]\n"
9902 "ld1 {v0.s}[1], [%x[in]], #4\n"
9903 "ld1 {v5.h}[0], [%x[in]], %x[stride]\n"
9904 "ld1 {v1.s}[1], [%x[in]], #4\n"
9905 "ld1 {v5.h}[1], [%x[in]], %x[stride]\n"
9906 "ld1 {v2.s}[1], [%x[in]], #4\n"
9907 "ld1 {v5.h}[2], [%x[in]], %x[stride]\n"
9908 "ld1 {v3.s}[1], [%x[in]], #4\n"
9909 "ld1 {v5.h}[3], [%x[in]], %x[stride]\n"
9910 "prfm pldl1keep, [%x[in]]\n"
// Transpose: the 16-bit then 8-bit trn1/trn2 passes leave v0..v3 holding
// byte 0..3 of the eight steps (in step order); uzp1/uzp2 split the 2-byte
// tails into byte 4 (v16) and byte 5 (v17).
9911 "trn1 v6.4h, v0.4h, v2.4h\n"
9912 "trn2 v14.4h, v0.4h, v2.4h\n"
9913 "trn1 v7.4h, v1.4h, v3.4h\n"
9914 "trn2 v15.4h, v1.4h, v3.4h\n"
9915 "uzp1 v16.8b, v4.8b, v5.8b\n"
9916 "uzp2 v17.8b, v4.8b, v5.8b\n"
9917 "trn1 v0.8b, v6.8b, v7.8b\n"
9918 "trn2 v1.8b, v6.8b, v7.8b\n"
9919 "trn1 v2.8b, v14.8b, v15.8b\n"
9920 "trn2 v3.8b, v14.8b, v15.8b\n"
// Widen each byte to 16 bits and add it to the matching lane accumulator.
9921 "uaddw v8.8h, v8.8h, v0.8b\n"
9922 "uaddw v9.8h, v9.8h, v1.8b\n"
9923 "uaddw v10.8h, v10.8h, v2.8b\n"
9924 "uaddw v11.8h, v11.8h, v3.8b\n"
9925 "uaddw v12.8h, v12.8h, v16.8b\n"
9926 "uaddw v13.8h, v13.8h, v17.8b\n"
// 6 lanes x 8 bytes = 48 bytes of packed output per block.
9927 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
9928 "st1 {v16.2s, v17.2s}, [%x[out]], #16\n"
9929
// Branches on the flags from the `subs` at the top of the loop; no
// instruction in between writes the condition flags.
9930 "bne 1b\n"
9931
9932 "2:"
9933
9934 // Load Aggregate Store - column major 6x7
// Clear the staging registers so the one step slot that is not loaded
// contributes zeros to both the packed output and the sums.
9935 "movi v0.8b, #0\n"
9936 "movi v1.8b, #0\n"
9937 "movi v2.8b, #0\n"
9938 "movi v3.8b, #0\n"
9939 "movi v4.8b, #0\n"
9940 "movi v5.8b, #0\n"
9941 "ld1 {v0.s}[0], [%x[in]], #4\n"
9942 "ld1 {v4.h}[0], [%x[in]], %x[stride]\n"
9943 "ld1 {v1.s}[0], [%x[in]], #4\n"
9944 "ld1 {v4.h}[1], [%x[in]], %x[stride]\n"
9945 "ld1 {v2.s}[0], [%x[in]], #4\n"
9946 "ld1 {v4.h}[2], [%x[in]], %x[stride]\n"
9947 "ld1 {v3.s}[0], [%x[in]], #4\n"
9948 "ld1 {v4.h}[3], [%x[in]], %x[stride]\n"
9949 "ld1 {v0.s}[1], [%x[in]], #4\n"
9950 "ld1 {v5.h}[0], [%x[in]], %x[stride]\n"
9951 "ld1 {v1.s}[1], [%x[in]], #4\n"
9952 "ld1 {v5.h}[1], [%x[in]], %x[stride]\n"
9953 "ld1 {v2.s}[1], [%x[in]], #4\n"
9954 "ld1 {v5.h}[2], [%x[in]], %x[stride]\n"
9955 "prfm pldl1keep, [%x[in]]\n"
9956 "trn1 v6.4h, v0.4h, v2.4h\n"
9957 "trn2 v14.4h, v0.4h, v2.4h\n"
9958 "trn1 v7.4h, v1.4h, v3.4h\n"
9959 "trn2 v15.4h, v1.4h, v3.4h\n"
9960 "uzp1 v16.8b, v4.8b, v5.8b\n"
9961 "uzp2 v17.8b, v4.8b, v5.8b\n"
9962 "trn1 v0.8b, v6.8b, v7.8b\n"
9963 "trn2 v1.8b, v6.8b, v7.8b\n"
9964 "trn1 v2.8b, v14.8b, v15.8b\n"
9965 "trn2 v3.8b, v14.8b, v15.8b\n"
9966 "uaddw v8.8h, v8.8h, v0.8b\n"
9967 "uaddw v9.8h, v9.8h, v1.8b\n"
9968 "uaddw v10.8h, v10.8h, v2.8b\n"
9969 "uaddw v11.8h, v11.8h, v3.8b\n"
9970 "uaddw v12.8h, v12.8h, v16.8b\n"
9971 "uaddw v13.8h, v13.8h, v17.8b\n"
9972 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
9973 "st1 {v16.2s, v17.2s}, [%x[out]], #16\n"
9974
9975 // Aggregator Reduction.
// v0.s[0] holds the multiplier and v1 the broadcast addend. uaddlp widens
// each accumulator to 32-bit pair sums; the addp tree then leaves
// v8 = {sum0, sum1, sum2, sum3} and v9 = {sum4, sum5, sum4, sum5}.
9976 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
9977 "dup v1.4s, %w[additive_sum_offset]\n"
9978 "uaddlp v8.4s, v8.8h\n"
9979 "uaddlp v9.4s, v9.8h\n"
9980 "uaddlp v10.4s, v10.8h\n"
9981 "uaddlp v11.4s, v11.8h\n"
9982 "uaddlp v12.4s, v12.8h\n"
9983 "uaddlp v13.4s, v13.8h\n"
9984 "addp v8.4s, v8.4s, v9.4s\n"
9985 "addp v10.4s, v10.4s, v11.4s\n"
9986 "addp v12.4s, v12.4s, v13.4s\n"
9987 "addp v8.4s, v8.4s, v10.4s\n"
9988 "addp v9.4s, v12.4s, v12.4s\n"
9989 "mul v8.4s, v8.4s, v0.s[0]\n"
9990 "mul v9.4s, v9.4s, v0.s[0]\n"
9991 "add v8.4s, v8.4s, v1.4s\n"
9992 "add v9.4s, v9.4s, v1.4s\n"
// The 32 bytes of sums land directly after the packed data; `out` is not
// advanced past them.
9993 "st1 {v8.4s, v9.4s}, [%x[out]]\n"
9994 : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
9995 [out] "+r"(out), [in] "+r"(in)
9996 : [additive_sum_offset] "r"(params.additive_sum_offset),
9997 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
9998 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
9999 "v11", "v12", "v13", "v14", "v15", "v16", "v17", "cc", "memory");
10000 }
10001
// Packs a column-major uint8 source into 7 lanes of 8-byte groups and appends
// per-lane sums. This specialization has no leftover block: it only runs the
// 8-step main loop.
//
// Each step reads 7 consecutive bytes at `in` (a 4-byte load followed by a
// 3-byte ld3 lane load) and advances `in` by params.stride in total. Eight
// steps are rearranged into seven 8-byte vectors, one per byte offset 0..6,
// and written to `out` as 56 bytes.
//
// After the packed data, eight 32-bit words are stored at `out`: for each of
// the 7 byte offsets, (sum of its bytes) * params.multiplicative_sum_offset +
// params.additive_sum_offset. Word 7 repeats word 6.
//
// Requires params.count to be a positive multiple of 8: the loop body always
// runs at least once and only exits when the remaining count reaches exactly
// zero.
// NOTE(review): sums are accumulated in 16-bit lanes before being widened, so
// very large counts could wrap - confirm the bound callers guarantee.
// NOTE(review): count/stride are `int` copies but are accessed through 64-bit
// %x register names; this assumes their upper 32 bits are well-defined -
// confirm.
10002 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)10003 inline void Stream<uint8_t, 7, 8, 0, ColumnMajorWithSum>::Pack(
10004 const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
10005 #ifdef DEBUG
10006 #ifdef DEBUG_METAGEMM_VERBOSE
10007 std::cout
10008 << __FILE__ << "(" << __LINE__
10009 << ") ColumnMajorWithSum<uint8_t, 7, 8, 0, ColumnMajorWithSum>::Pack()"
10010 << std::endl
10011 << std::flush;
10012 #endif
10013 #endif
// Local copies: the asm below modifies both values (they are "+r" operands).
10014 int params_count_copy = params.count;
10015 int params_stride_copy = params.stride;
10016 asm volatile(
// Every step post-increments `in` by 4 and then by this register, so
// pre-subtracting 4 makes the net advance per step equal to the stride.
10017 "sub %x[stride], %x[stride], #4\n"
// v8..v14: one running-sum accumulator (8 x 16-bit) per output lane.
10018 "movi v8.8h, #0\n"
10019 "movi v9.8h, #0\n"
10020 "movi v10.8h, #0\n"
10021 "movi v11.8h, #0\n"
10022 "movi v12.8h, #0\n"
10023 "movi v13.8h, #0\n"
10024 "movi v14.8h, #0\n"
10025
10026 "1:"
10027 "subs %x[count], %x[count], #8\n"
10028
10029 // Load Aggregate Store - column major 7x8
// Step k puts its first 4 bytes in a 32-bit lane of v0..v3; ld3 then drops
// bytes 4, 5 and 6 of the step into byte lane k of v4, v5 and v6, so those
// three vectors need no further shuffling.
10030 "ld1 {v0.s}[0], [%x[in]], #4\n"
10031 "ld3 {v4.b, v5.b, v6.b}[0], [%x[in]], %x[stride]\n"
10032 "ld1 {v1.s}[0], [%x[in]], #4\n"
10033 "ld3 {v4.b, v5.b, v6.b}[1], [%x[in]], %x[stride]\n"
10034 "ld1 {v2.s}[0], [%x[in]], #4\n"
10035 "ld3 {v4.b, v5.b, v6.b}[2], [%x[in]], %x[stride]\n"
10036 "ld1 {v3.s}[0], [%x[in]], #4\n"
10037 "ld3 {v4.b, v5.b, v6.b}[3], [%x[in]], %x[stride]\n"
10038 "ld1 {v0.s}[1], [%x[in]], #4\n"
10039 "ld3 {v4.b, v5.b, v6.b}[4], [%x[in]], %x[stride]\n"
10040 "ld1 {v1.s}[1], [%x[in]], #4\n"
10041 "ld3 {v4.b, v5.b, v6.b}[5], [%x[in]], %x[stride]\n"
10042 "ld1 {v2.s}[1], [%x[in]], #4\n"
10043 "ld3 {v4.b, v5.b, v6.b}[6], [%x[in]], %x[stride]\n"
10044 "ld1 {v3.s}[1], [%x[in]], #4\n"
10045 "ld3 {v4.b, v5.b, v6.b}[7], [%x[in]], %x[stride]\n"
10046 "prfm pldl1keep, [%x[in]]\n"
// Transpose: the 16-bit then 8-bit trn1/trn2 passes leave v0..v3 holding
// byte 0..3 of the eight steps (in step order).
10047 "trn1 v7.4h, v0.4h, v2.4h\n"
10048 "trn2 v16.4h, v0.4h, v2.4h\n"
10049 "trn1 v15.4h, v1.4h, v3.4h\n"
10050 "trn2 v17.4h, v1.4h, v3.4h\n"
10051 "trn1 v0.8b, v7.8b, v15.8b\n"
10052 "trn2 v1.8b, v7.8b, v15.8b\n"
10053 "trn1 v2.8b, v16.8b, v17.8b\n"
10054 "trn2 v3.8b, v16.8b, v17.8b\n"
// Widen each byte to 16 bits and add it to the matching lane accumulator.
10055 "uaddw v8.8h, v8.8h, v0.8b\n"
10056 "uaddw v9.8h, v9.8h, v1.8b\n"
10057 "uaddw v10.8h, v10.8h, v2.8b\n"
10058 "uaddw v11.8h, v11.8h, v3.8b\n"
10059 "uaddw v12.8h, v12.8h, v4.8b\n"
10060 "uaddw v13.8h, v13.8h, v5.8b\n"
10061 "uaddw v14.8h, v14.8h, v6.8b\n"
// 7 lanes x 8 bytes = 56 bytes of packed output per block.
10062 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
10063 "st1 {v4.2s, v5.2s, v6.2s}, [%x[out]], #24\n"
10064
// Branches on the flags from the `subs` at the top of the loop; no
// instruction in between writes the condition flags.
10065 "bne 1b\n"
10066
10067 // Aggregator Reduction.
// v0.s[0] holds the multiplier and v1 the broadcast addend. uaddlp widens
// each accumulator to 32-bit pair sums; the addp tree then leaves
// v8 = {sum0, sum1, sum2, sum3} and v9 = {sum4, sum5, sum6, sum6}.
10068 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
10069 "dup v1.4s, %w[additive_sum_offset]\n"
10070 "uaddlp v8.4s, v8.8h\n"
10071 "uaddlp v9.4s, v9.8h\n"
10072 "uaddlp v10.4s, v10.8h\n"
10073 "uaddlp v11.4s, v11.8h\n"
10074 "uaddlp v12.4s, v12.8h\n"
10075 "uaddlp v13.4s, v13.8h\n"
10076 "uaddlp v14.4s, v14.8h\n"
10077 "addp v8.4s, v8.4s, v9.4s\n"
10078 "addp v10.4s, v10.4s, v11.4s\n"
10079 "addp v12.4s, v12.4s, v13.4s\n"
10080 "addp v14.4s, v14.4s, v14.4s\n"
10081 "addp v8.4s, v8.4s, v10.4s\n"
10082 "addp v9.4s, v12.4s, v14.4s\n"
10083 "mul v8.4s, v8.4s, v0.s[0]\n"
10084 "mul v9.4s, v9.4s, v0.s[0]\n"
10085 "add v8.4s, v8.4s, v1.4s\n"
10086 "add v9.4s, v9.4s, v1.4s\n"
// The 32 bytes of sums land directly after the packed data; `out` is not
// advanced past them.
10087 "st1 {v8.4s, v9.4s}, [%x[out]]\n"
10088 : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
10089 [out] "+r"(out), [in] "+r"(in)
10090 : [additive_sum_offset] "r"(params.additive_sum_offset),
10091 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
10092 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
10093 "v11", "v12", "v13", "v14", "v15", "v16", "v17", "cc", "memory");
10094 }
10095
// Packs a column-major uint8 source into 7 lanes of 8-byte groups and appends
// per-lane sums. This specialization runs the 8-step main loop and then a
// final block holding 1 leftover step.
//
// Each step reads 7 consecutive bytes at `in` (a 4-byte load followed by a
// 3-byte ld3 lane load) and advances `in` by params.stride in total. Eight
// steps are rearranged into seven 8-byte vectors, one per byte offset 0..6,
// and written to `out` as 56 bytes. The leftover block uses the same layout
// with the seven unused step slots left as zero.
//
// After the packed data, eight 32-bit words are stored at `out`: for each of
// the 7 byte offsets, (sum of its bytes) * params.multiplicative_sum_offset +
// params.additive_sum_offset. Word 7 repeats word 6.
//
// Requires params.count - 1 to be a non-negative multiple of 8: the main loop
// only exits when the remaining count reaches exactly zero.
// NOTE(review): sums are accumulated in 16-bit lanes before being widened, so
// very large counts could wrap - confirm the bound callers guarantee.
// NOTE(review): count/stride are `int` copies but are accessed through 64-bit
// %x register names; this assumes their upper 32 bits are well-defined -
// confirm.
10096 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)10097 inline void Stream<uint8_t, 7, 8, 1, ColumnMajorWithSum>::Pack(
10098 const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
10099 #ifdef DEBUG
10100 #ifdef DEBUG_METAGEMM_VERBOSE
10101 std::cout
10102 << __FILE__ << "(" << __LINE__
10103 << ") ColumnMajorWithSum<uint8_t, 7, 8, 1, ColumnMajorWithSum>::Pack()"
10104 << std::endl
10105 << std::flush;
10106 #endif
10107 #endif
// Local copies: the asm below modifies both values (they are "+r" operands).
10108 int params_count_copy = params.count;
10109 int params_stride_copy = params.stride;
10110 asm volatile(
// Every step post-increments `in` by 4 and then by this register, so
// pre-subtracting 4 makes the net advance per step equal to the stride.
10111 "sub %x[stride], %x[stride], #4\n"
// v8..v14: one running-sum accumulator (8 x 16-bit) per output lane.
10112 "movi v8.8h, #0\n"
10113 "movi v9.8h, #0\n"
10114 "movi v10.8h, #0\n"
10115 "movi v11.8h, #0\n"
10116 "movi v12.8h, #0\n"
10117 "movi v13.8h, #0\n"
10118 "movi v14.8h, #0\n"
10119
10120 // Reduce count by leftovers.
10121 "subs %x[count], %x[count], #1\n"
// Nothing left for the main loop: go straight to the leftover block.
10122 "beq 2f\n"
10123
10124 "1:"
10125 "subs %x[count], %x[count], #8\n"
10126
10127 // Load Aggregate Store - column major 7x8
// Step k puts its first 4 bytes in a 32-bit lane of v0..v3; ld3 then drops
// bytes 4, 5 and 6 of the step into byte lane k of v4, v5 and v6, so those
// three vectors need no further shuffling.
10128 "ld1 {v0.s}[0], [%x[in]], #4\n"
10129 "ld3 {v4.b, v5.b, v6.b}[0], [%x[in]], %x[stride]\n"
10130 "ld1 {v1.s}[0], [%x[in]], #4\n"
10131 "ld3 {v4.b, v5.b, v6.b}[1], [%x[in]], %x[stride]\n"
10132 "ld1 {v2.s}[0], [%x[in]], #4\n"
10133 "ld3 {v4.b, v5.b, v6.b}[2], [%x[in]], %x[stride]\n"
10134 "ld1 {v3.s}[0], [%x[in]], #4\n"
10135 "ld3 {v4.b, v5.b, v6.b}[3], [%x[in]], %x[stride]\n"
10136 "ld1 {v0.s}[1], [%x[in]], #4\n"
10137 "ld3 {v4.b, v5.b, v6.b}[4], [%x[in]], %x[stride]\n"
10138 "ld1 {v1.s}[1], [%x[in]], #4\n"
10139 "ld3 {v4.b, v5.b, v6.b}[5], [%x[in]], %x[stride]\n"
10140 "ld1 {v2.s}[1], [%x[in]], #4\n"
10141 "ld3 {v4.b, v5.b, v6.b}[6], [%x[in]], %x[stride]\n"
10142 "ld1 {v3.s}[1], [%x[in]], #4\n"
10143 "ld3 {v4.b, v5.b, v6.b}[7], [%x[in]], %x[stride]\n"
10144 "prfm pldl1keep, [%x[in]]\n"
// Transpose: the 16-bit then 8-bit trn1/trn2 passes leave v0..v3 holding
// byte 0..3 of the eight steps (in step order).
10145 "trn1 v7.4h, v0.4h, v2.4h\n"
10146 "trn2 v16.4h, v0.4h, v2.4h\n"
10147 "trn1 v15.4h, v1.4h, v3.4h\n"
10148 "trn2 v17.4h, v1.4h, v3.4h\n"
10149 "trn1 v0.8b, v7.8b, v15.8b\n"
10150 "trn2 v1.8b, v7.8b, v15.8b\n"
10151 "trn1 v2.8b, v16.8b, v17.8b\n"
10152 "trn2 v3.8b, v16.8b, v17.8b\n"
// Widen each byte to 16 bits and add it to the matching lane accumulator.
10153 "uaddw v8.8h, v8.8h, v0.8b\n"
10154 "uaddw v9.8h, v9.8h, v1.8b\n"
10155 "uaddw v10.8h, v10.8h, v2.8b\n"
10156 "uaddw v11.8h, v11.8h, v3.8b\n"
10157 "uaddw v12.8h, v12.8h, v4.8b\n"
10158 "uaddw v13.8h, v13.8h, v5.8b\n"
10159 "uaddw v14.8h, v14.8h, v6.8b\n"
// 7 lanes x 8 bytes = 56 bytes of packed output per block.
10160 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
10161 "st1 {v4.2s, v5.2s, v6.2s}, [%x[out]], #24\n"
10162
// Branches on the flags from the `subs` at the top of the loop; no
// instruction in between writes the condition flags.
10163 "bne 1b\n"
10164
10165 "2:"
10166
10167 // Load Aggregate Store - column major 7x1
// Clear the staging registers so the seven step slots that are not loaded
// contribute zeros to both the packed output and the sums.
10168 "movi v0.8b, #0\n"
10169 "movi v1.8b, #0\n"
10170 "movi v2.8b, #0\n"
10171 "movi v3.8b, #0\n"
10172 "movi v4.8b, #0\n"
10173 "movi v5.8b, #0\n"
10174 "movi v6.8b, #0\n"
10175 "ld1 {v0.s}[0], [%x[in]], #4\n"
10176 "ld3 {v4.b, v5.b, v6.b}[0], [%x[in]], %x[stride]\n"
10177 "prfm pldl1keep, [%x[in]]\n"
10178 "trn1 v7.4h, v0.4h, v2.4h\n"
10179 "trn2 v16.4h, v0.4h, v2.4h\n"
10180 "trn1 v15.4h, v1.4h, v3.4h\n"
10181 "trn2 v17.4h, v1.4h, v3.4h\n"
10182 "trn1 v0.8b, v7.8b, v15.8b\n"
10183 "trn2 v1.8b, v7.8b, v15.8b\n"
10184 "trn1 v2.8b, v16.8b, v17.8b\n"
10185 "trn2 v3.8b, v16.8b, v17.8b\n"
10186 "uaddw v8.8h, v8.8h, v0.8b\n"
10187 "uaddw v9.8h, v9.8h, v1.8b\n"
10188 "uaddw v10.8h, v10.8h, v2.8b\n"
10189 "uaddw v11.8h, v11.8h, v3.8b\n"
10190 "uaddw v12.8h, v12.8h, v4.8b\n"
10191 "uaddw v13.8h, v13.8h, v5.8b\n"
10192 "uaddw v14.8h, v14.8h, v6.8b\n"
10193 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
10194 "st1 {v4.2s, v5.2s, v6.2s}, [%x[out]], #24\n"
10195
10196 // Aggregator Reduction.
// v0.s[0] holds the multiplier and v1 the broadcast addend. uaddlp widens
// each accumulator to 32-bit pair sums; the addp tree then leaves
// v8 = {sum0, sum1, sum2, sum3} and v9 = {sum4, sum5, sum6, sum6}.
10197 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
10198 "dup v1.4s, %w[additive_sum_offset]\n"
10199 "uaddlp v8.4s, v8.8h\n"
10200 "uaddlp v9.4s, v9.8h\n"
10201 "uaddlp v10.4s, v10.8h\n"
10202 "uaddlp v11.4s, v11.8h\n"
10203 "uaddlp v12.4s, v12.8h\n"
10204 "uaddlp v13.4s, v13.8h\n"
10205 "uaddlp v14.4s, v14.8h\n"
10206 "addp v8.4s, v8.4s, v9.4s\n"
10207 "addp v10.4s, v10.4s, v11.4s\n"
10208 "addp v12.4s, v12.4s, v13.4s\n"
10209 "addp v14.4s, v14.4s, v14.4s\n"
10210 "addp v8.4s, v8.4s, v10.4s\n"
10211 "addp v9.4s, v12.4s, v14.4s\n"
10212 "mul v8.4s, v8.4s, v0.s[0]\n"
10213 "mul v9.4s, v9.4s, v0.s[0]\n"
10214 "add v8.4s, v8.4s, v1.4s\n"
10215 "add v9.4s, v9.4s, v1.4s\n"
// The 32 bytes of sums land directly after the packed data; `out` is not
// advanced past them.
10216 "st1 {v8.4s, v9.4s}, [%x[out]]\n"
10217 : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
10218 [out] "+r"(out), [in] "+r"(in)
10219 : [additive_sum_offset] "r"(params.additive_sum_offset),
10220 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
10221 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
10222 "v11", "v12", "v13", "v14", "v15", "v16", "v17", "cc", "memory");
10223 }
10224
// Packs a column-major uint8 source into 7 lanes of 8-byte groups and appends
// per-lane sums. This specialization runs the 8-step main loop and then a
// final block holding 2 leftover steps.
//
// Each step reads 7 consecutive bytes at `in` (a 4-byte load followed by a
// 3-byte ld3 lane load) and advances `in` by params.stride in total. Eight
// steps are rearranged into seven 8-byte vectors, one per byte offset 0..6,
// and written to `out` as 56 bytes. The leftover block uses the same layout
// with the six unused step slots left as zero.
//
// After the packed data, eight 32-bit words are stored at `out`: for each of
// the 7 byte offsets, (sum of its bytes) * params.multiplicative_sum_offset +
// params.additive_sum_offset. Word 7 repeats word 6.
//
// Requires params.count - 2 to be a non-negative multiple of 8: the main loop
// only exits when the remaining count reaches exactly zero.
// NOTE(review): sums are accumulated in 16-bit lanes before being widened, so
// very large counts could wrap - confirm the bound callers guarantee.
// NOTE(review): count/stride are `int` copies but are accessed through 64-bit
// %x register names; this assumes their upper 32 bits are well-defined -
// confirm.
10225 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)10226 inline void Stream<uint8_t, 7, 8, 2, ColumnMajorWithSum>::Pack(
10227 const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
10228 #ifdef DEBUG
10229 #ifdef DEBUG_METAGEMM_VERBOSE
10230 std::cout
10231 << __FILE__ << "(" << __LINE__
10232 << ") ColumnMajorWithSum<uint8_t, 7, 8, 2, ColumnMajorWithSum>::Pack()"
10233 << std::endl
10234 << std::flush;
10235 #endif
10236 #endif
// Local copies: the asm below modifies both values (they are "+r" operands).
10237 int params_count_copy = params.count;
10238 int params_stride_copy = params.stride;
10239 asm volatile(
// Every step post-increments `in` by 4 and then by this register, so
// pre-subtracting 4 makes the net advance per step equal to the stride.
10240 "sub %x[stride], %x[stride], #4\n"
// v8..v14: one running-sum accumulator (8 x 16-bit) per output lane.
10241 "movi v8.8h, #0\n"
10242 "movi v9.8h, #0\n"
10243 "movi v10.8h, #0\n"
10244 "movi v11.8h, #0\n"
10245 "movi v12.8h, #0\n"
10246 "movi v13.8h, #0\n"
10247 "movi v14.8h, #0\n"
10248
10249 // Reduce count by leftovers.
10250 "subs %x[count], %x[count], #2\n"
// Nothing left for the main loop: go straight to the leftover block.
10251 "beq 2f\n"
10252
10253 "1:"
10254 "subs %x[count], %x[count], #8\n"
10255
10256 // Load Aggregate Store - column major 7x8
// Step k puts its first 4 bytes in a 32-bit lane of v0..v3; ld3 then drops
// bytes 4, 5 and 6 of the step into byte lane k of v4, v5 and v6, so those
// three vectors need no further shuffling.
10257 "ld1 {v0.s}[0], [%x[in]], #4\n"
10258 "ld3 {v4.b, v5.b, v6.b}[0], [%x[in]], %x[stride]\n"
10259 "ld1 {v1.s}[0], [%x[in]], #4\n"
10260 "ld3 {v4.b, v5.b, v6.b}[1], [%x[in]], %x[stride]\n"
10261 "ld1 {v2.s}[0], [%x[in]], #4\n"
10262 "ld3 {v4.b, v5.b, v6.b}[2], [%x[in]], %x[stride]\n"
10263 "ld1 {v3.s}[0], [%x[in]], #4\n"
10264 "ld3 {v4.b, v5.b, v6.b}[3], [%x[in]], %x[stride]\n"
10265 "ld1 {v0.s}[1], [%x[in]], #4\n"
10266 "ld3 {v4.b, v5.b, v6.b}[4], [%x[in]], %x[stride]\n"
10267 "ld1 {v1.s}[1], [%x[in]], #4\n"
10268 "ld3 {v4.b, v5.b, v6.b}[5], [%x[in]], %x[stride]\n"
10269 "ld1 {v2.s}[1], [%x[in]], #4\n"
10270 "ld3 {v4.b, v5.b, v6.b}[6], [%x[in]], %x[stride]\n"
10271 "ld1 {v3.s}[1], [%x[in]], #4\n"
10272 "ld3 {v4.b, v5.b, v6.b}[7], [%x[in]], %x[stride]\n"
10273 "prfm pldl1keep, [%x[in]]\n"
// Transpose: the 16-bit then 8-bit trn1/trn2 passes leave v0..v3 holding
// byte 0..3 of the eight steps (in step order).
10274 "trn1 v7.4h, v0.4h, v2.4h\n"
10275 "trn2 v16.4h, v0.4h, v2.4h\n"
10276 "trn1 v15.4h, v1.4h, v3.4h\n"
10277 "trn2 v17.4h, v1.4h, v3.4h\n"
10278 "trn1 v0.8b, v7.8b, v15.8b\n"
10279 "trn2 v1.8b, v7.8b, v15.8b\n"
10280 "trn1 v2.8b, v16.8b, v17.8b\n"
10281 "trn2 v3.8b, v16.8b, v17.8b\n"
// Widen each byte to 16 bits and add it to the matching lane accumulator.
10282 "uaddw v8.8h, v8.8h, v0.8b\n"
10283 "uaddw v9.8h, v9.8h, v1.8b\n"
10284 "uaddw v10.8h, v10.8h, v2.8b\n"
10285 "uaddw v11.8h, v11.8h, v3.8b\n"
10286 "uaddw v12.8h, v12.8h, v4.8b\n"
10287 "uaddw v13.8h, v13.8h, v5.8b\n"
10288 "uaddw v14.8h, v14.8h, v6.8b\n"
// 7 lanes x 8 bytes = 56 bytes of packed output per block.
10289 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
10290 "st1 {v4.2s, v5.2s, v6.2s}, [%x[out]], #24\n"
10291
// Branches on the flags from the `subs` at the top of the loop; no
// instruction in between writes the condition flags.
10292 "bne 1b\n"
10293
10294 "2:"
10295
10296 // Load Aggregate Store - column major 7x2
// Clear the staging registers so the six step slots that are not loaded
// contribute zeros to both the packed output and the sums.
10297 "movi v0.8b, #0\n"
10298 "movi v1.8b, #0\n"
10299 "movi v2.8b, #0\n"
10300 "movi v3.8b, #0\n"
10301 "movi v4.8b, #0\n"
10302 "movi v5.8b, #0\n"
10303 "movi v6.8b, #0\n"
10304 "ld1 {v0.s}[0], [%x[in]], #4\n"
10305 "ld3 {v4.b, v5.b, v6.b}[0], [%x[in]], %x[stride]\n"
10306 "ld1 {v1.s}[0], [%x[in]], #4\n"
10307 "ld3 {v4.b, v5.b, v6.b}[1], [%x[in]], %x[stride]\n"
10308 "prfm pldl1keep, [%x[in]]\n"
10309 "trn1 v7.4h, v0.4h, v2.4h\n"
10310 "trn2 v16.4h, v0.4h, v2.4h\n"
10311 "trn1 v15.4h, v1.4h, v3.4h\n"
10312 "trn2 v17.4h, v1.4h, v3.4h\n"
10313 "trn1 v0.8b, v7.8b, v15.8b\n"
10314 "trn2 v1.8b, v7.8b, v15.8b\n"
10315 "trn1 v2.8b, v16.8b, v17.8b\n"
10316 "trn2 v3.8b, v16.8b, v17.8b\n"
10317 "uaddw v8.8h, v8.8h, v0.8b\n"
10318 "uaddw v9.8h, v9.8h, v1.8b\n"
10319 "uaddw v10.8h, v10.8h, v2.8b\n"
10320 "uaddw v11.8h, v11.8h, v3.8b\n"
10321 "uaddw v12.8h, v12.8h, v4.8b\n"
10322 "uaddw v13.8h, v13.8h, v5.8b\n"
10323 "uaddw v14.8h, v14.8h, v6.8b\n"
10324 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
10325 "st1 {v4.2s, v5.2s, v6.2s}, [%x[out]], #24\n"
10326
10327 // Aggregator Reduction.
// v0.s[0] holds the multiplier and v1 the broadcast addend. uaddlp widens
// each accumulator to 32-bit pair sums; the addp tree then leaves
// v8 = {sum0, sum1, sum2, sum3} and v9 = {sum4, sum5, sum6, sum6}.
10328 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
10329 "dup v1.4s, %w[additive_sum_offset]\n"
10330 "uaddlp v8.4s, v8.8h\n"
10331 "uaddlp v9.4s, v9.8h\n"
10332 "uaddlp v10.4s, v10.8h\n"
10333 "uaddlp v11.4s, v11.8h\n"
10334 "uaddlp v12.4s, v12.8h\n"
10335 "uaddlp v13.4s, v13.8h\n"
10336 "uaddlp v14.4s, v14.8h\n"
10337 "addp v8.4s, v8.4s, v9.4s\n"
10338 "addp v10.4s, v10.4s, v11.4s\n"
10339 "addp v12.4s, v12.4s, v13.4s\n"
10340 "addp v14.4s, v14.4s, v14.4s\n"
10341 "addp v8.4s, v8.4s, v10.4s\n"
10342 "addp v9.4s, v12.4s, v14.4s\n"
10343 "mul v8.4s, v8.4s, v0.s[0]\n"
10344 "mul v9.4s, v9.4s, v0.s[0]\n"
10345 "add v8.4s, v8.4s, v1.4s\n"
10346 "add v9.4s, v9.4s, v1.4s\n"
// The 32 bytes of sums land directly after the packed data; `out` is not
// advanced past them.
10347 "st1 {v8.4s, v9.4s}, [%x[out]]\n"
10348 : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
10349 [out] "+r"(out), [in] "+r"(in)
10350 : [additive_sum_offset] "r"(params.additive_sum_offset),
10351 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
10352 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
10353 "v11", "v12", "v13", "v14", "v15", "v16", "v17", "cc", "memory");
10354 }
10355
// Packs a column-major uint8 source into 7 lanes of 8-byte groups and appends
// per-lane sums. This specialization runs the 8-step main loop and then a
// final block holding 3 leftover steps.
//
// Each step reads 7 consecutive bytes at `in` (a 4-byte load followed by a
// 3-byte ld3 lane load) and advances `in` by params.stride in total. Eight
// steps are rearranged into seven 8-byte vectors, one per byte offset 0..6,
// and written to `out` as 56 bytes. The leftover block uses the same layout
// with the five unused step slots left as zero.
//
// After the packed data, eight 32-bit words are stored at `out`: for each of
// the 7 byte offsets, (sum of its bytes) * params.multiplicative_sum_offset +
// params.additive_sum_offset. Word 7 repeats word 6.
//
// Requires params.count - 3 to be a non-negative multiple of 8: the main loop
// only exits when the remaining count reaches exactly zero.
// NOTE(review): sums are accumulated in 16-bit lanes before being widened, so
// very large counts could wrap - confirm the bound callers guarantee.
// NOTE(review): count/stride are `int` copies but are accessed through 64-bit
// %x register names; this assumes their upper 32 bits are well-defined -
// confirm.
10356 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)10357 inline void Stream<uint8_t, 7, 8, 3, ColumnMajorWithSum>::Pack(
10358 const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
10359 #ifdef DEBUG
10360 #ifdef DEBUG_METAGEMM_VERBOSE
10361 std::cout
10362 << __FILE__ << "(" << __LINE__
10363 << ") ColumnMajorWithSum<uint8_t, 7, 8, 3, ColumnMajorWithSum>::Pack()"
10364 << std::endl
10365 << std::flush;
10366 #endif
10367 #endif
// Local copies: the asm below modifies both values (they are "+r" operands).
10368 int params_count_copy = params.count;
10369 int params_stride_copy = params.stride;
10370 asm volatile(
// Every step post-increments `in` by 4 and then by this register, so
// pre-subtracting 4 makes the net advance per step equal to the stride.
10371 "sub %x[stride], %x[stride], #4\n"
// v8..v14: one running-sum accumulator (8 x 16-bit) per output lane.
10372 "movi v8.8h, #0\n"
10373 "movi v9.8h, #0\n"
10374 "movi v10.8h, #0\n"
10375 "movi v11.8h, #0\n"
10376 "movi v12.8h, #0\n"
10377 "movi v13.8h, #0\n"
10378 "movi v14.8h, #0\n"
10379
10380 // Reduce count by leftovers.
10381 "subs %x[count], %x[count], #3\n"
// Nothing left for the main loop: go straight to the leftover block.
10382 "beq 2f\n"
10383
10384 "1:"
10385 "subs %x[count], %x[count], #8\n"
10386
10387 // Load Aggregate Store - column major 7x8
// Step k puts its first 4 bytes in a 32-bit lane of v0..v3; ld3 then drops
// bytes 4, 5 and 6 of the step into byte lane k of v4, v5 and v6, so those
// three vectors need no further shuffling.
10388 "ld1 {v0.s}[0], [%x[in]], #4\n"
10389 "ld3 {v4.b, v5.b, v6.b}[0], [%x[in]], %x[stride]\n"
10390 "ld1 {v1.s}[0], [%x[in]], #4\n"
10391 "ld3 {v4.b, v5.b, v6.b}[1], [%x[in]], %x[stride]\n"
10392 "ld1 {v2.s}[0], [%x[in]], #4\n"
10393 "ld3 {v4.b, v5.b, v6.b}[2], [%x[in]], %x[stride]\n"
10394 "ld1 {v3.s}[0], [%x[in]], #4\n"
10395 "ld3 {v4.b, v5.b, v6.b}[3], [%x[in]], %x[stride]\n"
10396 "ld1 {v0.s}[1], [%x[in]], #4\n"
10397 "ld3 {v4.b, v5.b, v6.b}[4], [%x[in]], %x[stride]\n"
10398 "ld1 {v1.s}[1], [%x[in]], #4\n"
10399 "ld3 {v4.b, v5.b, v6.b}[5], [%x[in]], %x[stride]\n"
10400 "ld1 {v2.s}[1], [%x[in]], #4\n"
10401 "ld3 {v4.b, v5.b, v6.b}[6], [%x[in]], %x[stride]\n"
10402 "ld1 {v3.s}[1], [%x[in]], #4\n"
10403 "ld3 {v4.b, v5.b, v6.b}[7], [%x[in]], %x[stride]\n"
10404 "prfm pldl1keep, [%x[in]]\n"
// Transpose: the 16-bit then 8-bit trn1/trn2 passes leave v0..v3 holding
// byte 0..3 of the eight steps (in step order).
10405 "trn1 v7.4h, v0.4h, v2.4h\n"
10406 "trn2 v16.4h, v0.4h, v2.4h\n"
10407 "trn1 v15.4h, v1.4h, v3.4h\n"
10408 "trn2 v17.4h, v1.4h, v3.4h\n"
10409 "trn1 v0.8b, v7.8b, v15.8b\n"
10410 "trn2 v1.8b, v7.8b, v15.8b\n"
10411 "trn1 v2.8b, v16.8b, v17.8b\n"
10412 "trn2 v3.8b, v16.8b, v17.8b\n"
// Widen each byte to 16 bits and add it to the matching lane accumulator.
10413 "uaddw v8.8h, v8.8h, v0.8b\n"
10414 "uaddw v9.8h, v9.8h, v1.8b\n"
10415 "uaddw v10.8h, v10.8h, v2.8b\n"
10416 "uaddw v11.8h, v11.8h, v3.8b\n"
10417 "uaddw v12.8h, v12.8h, v4.8b\n"
10418 "uaddw v13.8h, v13.8h, v5.8b\n"
10419 "uaddw v14.8h, v14.8h, v6.8b\n"
// 7 lanes x 8 bytes = 56 bytes of packed output per block.
10420 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
10421 "st1 {v4.2s, v5.2s, v6.2s}, [%x[out]], #24\n"
10422
// Branches on the flags from the `subs` at the top of the loop; no
// instruction in between writes the condition flags.
10423 "bne 1b\n"
10424
10425 "2:"
10426
10427 // Load Aggregate Store - column major 7x3
// Clear the staging registers so the five step slots that are not loaded
// contribute zeros to both the packed output and the sums.
10428 "movi v0.8b, #0\n"
10429 "movi v1.8b, #0\n"
10430 "movi v2.8b, #0\n"
10431 "movi v3.8b, #0\n"
10432 "movi v4.8b, #0\n"
10433 "movi v5.8b, #0\n"
10434 "movi v6.8b, #0\n"
10435 "ld1 {v0.s}[0], [%x[in]], #4\n"
10436 "ld3 {v4.b, v5.b, v6.b}[0], [%x[in]], %x[stride]\n"
10437 "ld1 {v1.s}[0], [%x[in]], #4\n"
10438 "ld3 {v4.b, v5.b, v6.b}[1], [%x[in]], %x[stride]\n"
10439 "ld1 {v2.s}[0], [%x[in]], #4\n"
10440 "ld3 {v4.b, v5.b, v6.b}[2], [%x[in]], %x[stride]\n"
10441 "prfm pldl1keep, [%x[in]]\n"
10442 "trn1 v7.4h, v0.4h, v2.4h\n"
10443 "trn2 v16.4h, v0.4h, v2.4h\n"
10444 "trn1 v15.4h, v1.4h, v3.4h\n"
10445 "trn2 v17.4h, v1.4h, v3.4h\n"
10446 "trn1 v0.8b, v7.8b, v15.8b\n"
10447 "trn2 v1.8b, v7.8b, v15.8b\n"
10448 "trn1 v2.8b, v16.8b, v17.8b\n"
10449 "trn2 v3.8b, v16.8b, v17.8b\n"
10450 "uaddw v8.8h, v8.8h, v0.8b\n"
10451 "uaddw v9.8h, v9.8h, v1.8b\n"
10452 "uaddw v10.8h, v10.8h, v2.8b\n"
10453 "uaddw v11.8h, v11.8h, v3.8b\n"
10454 "uaddw v12.8h, v12.8h, v4.8b\n"
10455 "uaddw v13.8h, v13.8h, v5.8b\n"
10456 "uaddw v14.8h, v14.8h, v6.8b\n"
10457 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
10458 "st1 {v4.2s, v5.2s, v6.2s}, [%x[out]], #24\n"
10459
10460 // Aggregator Reduction.
// v0.s[0] holds the multiplier and v1 the broadcast addend. uaddlp widens
// each accumulator to 32-bit pair sums; the addp tree then leaves
// v8 = {sum0, sum1, sum2, sum3} and v9 = {sum4, sum5, sum6, sum6}.
10461 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
10462 "dup v1.4s, %w[additive_sum_offset]\n"
10463 "uaddlp v8.4s, v8.8h\n"
10464 "uaddlp v9.4s, v9.8h\n"
10465 "uaddlp v10.4s, v10.8h\n"
10466 "uaddlp v11.4s, v11.8h\n"
10467 "uaddlp v12.4s, v12.8h\n"
10468 "uaddlp v13.4s, v13.8h\n"
10469 "uaddlp v14.4s, v14.8h\n"
10470 "addp v8.4s, v8.4s, v9.4s\n"
10471 "addp v10.4s, v10.4s, v11.4s\n"
10472 "addp v12.4s, v12.4s, v13.4s\n"
10473 "addp v14.4s, v14.4s, v14.4s\n"
10474 "addp v8.4s, v8.4s, v10.4s\n"
10475 "addp v9.4s, v12.4s, v14.4s\n"
10476 "mul v8.4s, v8.4s, v0.s[0]\n"
10477 "mul v9.4s, v9.4s, v0.s[0]\n"
10478 "add v8.4s, v8.4s, v1.4s\n"
10479 "add v9.4s, v9.4s, v1.4s\n"
// The 32 bytes of sums land directly after the packed data; `out` is not
// advanced past them.
10480 "st1 {v8.4s, v9.4s}, [%x[out]]\n"
10481 : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
10482 [out] "+r"(out), [in] "+r"(in)
10483 : [additive_sum_offset] "r"(params.additive_sum_offset),
10484 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
10485 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
10486 "v11", "v12", "v13", "v14", "v15", "v16", "v17", "cc", "memory");
10487 }
10488
10489 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)10490 inline void Stream<uint8_t, 7, 8, 4, ColumnMajorWithSum>::Pack(
10491 const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
10492 #ifdef DEBUG
10493 #ifdef DEBUG_METAGEMM_VERBOSE
10494 std::cout
10495 << __FILE__ << "(" << __LINE__
10496 << ") ColumnMajorWithSum<uint8_t, 7, 8, 4, ColumnMajorWithSum>::Pack()"
10497 << std::endl
10498 << std::flush;
10499 #endif
10500 #endif
10501 int params_count_copy = params.count;
10502 int params_stride_copy = params.stride;
10503 asm volatile(
10504 "sub %x[stride], %x[stride], #4\n"
10505 "movi v8.8h, #0\n"
10506 "movi v9.8h, #0\n"
10507 "movi v10.8h, #0\n"
10508 "movi v11.8h, #0\n"
10509 "movi v12.8h, #0\n"
10510 "movi v13.8h, #0\n"
10511 "movi v14.8h, #0\n"
10512
10513 // Reduce count by leftovers.
10514 "subs %x[count], %x[count], #4\n"
10515 "beq 2f\n"
10516
10517 "1:"
10518 "subs %x[count], %x[count], #8\n"
10519
10520 // Load Aggregate Store - column major 7x8
10521 "ld1 {v0.s}[0], [%x[in]], #4\n"
10522 "ld3 {v4.b, v5.b, v6.b}[0], [%x[in]], %x[stride]\n"
10523 "ld1 {v1.s}[0], [%x[in]], #4\n"
10524 "ld3 {v4.b, v5.b, v6.b}[1], [%x[in]], %x[stride]\n"
10525 "ld1 {v2.s}[0], [%x[in]], #4\n"
10526 "ld3 {v4.b, v5.b, v6.b}[2], [%x[in]], %x[stride]\n"
10527 "ld1 {v3.s}[0], [%x[in]], #4\n"
10528 "ld3 {v4.b, v5.b, v6.b}[3], [%x[in]], %x[stride]\n"
10529 "ld1 {v0.s}[1], [%x[in]], #4\n"
10530 "ld3 {v4.b, v5.b, v6.b}[4], [%x[in]], %x[stride]\n"
10531 "ld1 {v1.s}[1], [%x[in]], #4\n"
10532 "ld3 {v4.b, v5.b, v6.b}[5], [%x[in]], %x[stride]\n"
10533 "ld1 {v2.s}[1], [%x[in]], #4\n"
10534 "ld3 {v4.b, v5.b, v6.b}[6], [%x[in]], %x[stride]\n"
10535 "ld1 {v3.s}[1], [%x[in]], #4\n"
10536 "ld3 {v4.b, v5.b, v6.b}[7], [%x[in]], %x[stride]\n"
10537 "prfm pldl1keep, [%x[in]]\n"
10538 "trn1 v7.4h, v0.4h, v2.4h\n"
10539 "trn2 v16.4h, v0.4h, v2.4h\n"
10540 "trn1 v15.4h, v1.4h, v3.4h\n"
10541 "trn2 v17.4h, v1.4h, v3.4h\n"
10542 "trn1 v0.8b, v7.8b, v15.8b\n"
10543 "trn2 v1.8b, v7.8b, v15.8b\n"
10544 "trn1 v2.8b, v16.8b, v17.8b\n"
10545 "trn2 v3.8b, v16.8b, v17.8b\n"
10546 "uaddw v8.8h, v8.8h, v0.8b\n"
10547 "uaddw v9.8h, v9.8h, v1.8b\n"
10548 "uaddw v10.8h, v10.8h, v2.8b\n"
10549 "uaddw v11.8h, v11.8h, v3.8b\n"
10550 "uaddw v12.8h, v12.8h, v4.8b\n"
10551 "uaddw v13.8h, v13.8h, v5.8b\n"
10552 "uaddw v14.8h, v14.8h, v6.8b\n"
10553 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
10554 "st1 {v4.2s, v5.2s, v6.2s}, [%x[out]], #24\n"
10555
10556 "bne 1b\n"
10557
10558 "2:"
10559
10560 // Load Aggregate Store - column major 7x4
10561 "movi v0.8b, #0\n"
10562 "movi v1.8b, #0\n"
10563 "movi v2.8b, #0\n"
10564 "movi v3.8b, #0\n"
10565 "movi v4.8b, #0\n"
10566 "movi v5.8b, #0\n"
10567 "movi v6.8b, #0\n"
10568 "ld1 {v0.s}[0], [%x[in]], #4\n"
10569 "ld3 {v4.b, v5.b, v6.b}[0], [%x[in]], %x[stride]\n"
10570 "ld1 {v1.s}[0], [%x[in]], #4\n"
10571 "ld3 {v4.b, v5.b, v6.b}[1], [%x[in]], %x[stride]\n"
10572 "ld1 {v2.s}[0], [%x[in]], #4\n"
10573 "ld3 {v4.b, v5.b, v6.b}[2], [%x[in]], %x[stride]\n"
10574 "ld1 {v3.s}[0], [%x[in]], #4\n"
10575 "ld3 {v4.b, v5.b, v6.b}[3], [%x[in]], %x[stride]\n"
10576 "prfm pldl1keep, [%x[in]]\n"
10577 "trn1 v7.4h, v0.4h, v2.4h\n"
10578 "trn2 v16.4h, v0.4h, v2.4h\n"
10579 "trn1 v15.4h, v1.4h, v3.4h\n"
10580 "trn2 v17.4h, v1.4h, v3.4h\n"
10581 "trn1 v0.8b, v7.8b, v15.8b\n"
10582 "trn2 v1.8b, v7.8b, v15.8b\n"
10583 "trn1 v2.8b, v16.8b, v17.8b\n"
10584 "trn2 v3.8b, v16.8b, v17.8b\n"
10585 "uaddw v8.8h, v8.8h, v0.8b\n"
10586 "uaddw v9.8h, v9.8h, v1.8b\n"
10587 "uaddw v10.8h, v10.8h, v2.8b\n"
10588 "uaddw v11.8h, v11.8h, v3.8b\n"
10589 "uaddw v12.8h, v12.8h, v4.8b\n"
10590 "uaddw v13.8h, v13.8h, v5.8b\n"
10591 "uaddw v14.8h, v14.8h, v6.8b\n"
10592 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
10593 "st1 {v4.2s, v5.2s, v6.2s}, [%x[out]], #24\n"
10594
10595 // Aggregator Reduction.
10596 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
10597 "dup v1.4s, %w[additive_sum_offset]\n"
10598 "uaddlp v8.4s, v8.8h\n"
10599 "uaddlp v9.4s, v9.8h\n"
10600 "uaddlp v10.4s, v10.8h\n"
10601 "uaddlp v11.4s, v11.8h\n"
10602 "uaddlp v12.4s, v12.8h\n"
10603 "uaddlp v13.4s, v13.8h\n"
10604 "uaddlp v14.4s, v14.8h\n"
10605 "addp v8.4s, v8.4s, v9.4s\n"
10606 "addp v10.4s, v10.4s, v11.4s\n"
10607 "addp v12.4s, v12.4s, v13.4s\n"
10608 "addp v14.4s, v14.4s, v14.4s\n"
10609 "addp v8.4s, v8.4s, v10.4s\n"
10610 "addp v9.4s, v12.4s, v14.4s\n"
10611 "mul v8.4s, v8.4s, v0.s[0]\n"
10612 "mul v9.4s, v9.4s, v0.s[0]\n"
10613 "add v8.4s, v8.4s, v1.4s\n"
10614 "add v9.4s, v9.4s, v1.4s\n"
10615 "st1 {v8.4s, v9.4s}, [%x[out]]\n"
10616 : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
10617 [out] "+r"(out), [in] "+r"(in)
10618 : [additive_sum_offset] "r"(params.additive_sum_offset),
10619 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
10620 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
10621 "v11", "v12", "v13", "v14", "v15", "v16", "v17", "cc", "memory");
10622 }
10623
10624 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)10625 inline void Stream<uint8_t, 7, 8, 5, ColumnMajorWithSum>::Pack(
10626 const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
10627 #ifdef DEBUG
10628 #ifdef DEBUG_METAGEMM_VERBOSE
10629 std::cout
10630 << __FILE__ << "(" << __LINE__
10631 << ") ColumnMajorWithSum<uint8_t, 7, 8, 5, ColumnMajorWithSum>::Pack()"
10632 << std::endl
10633 << std::flush;
10634 #endif
10635 #endif
10636 int params_count_copy = params.count;
10637 int params_stride_copy = params.stride;
10638 asm volatile(
10639 "sub %x[stride], %x[stride], #4\n"
10640 "movi v8.8h, #0\n"
10641 "movi v9.8h, #0\n"
10642 "movi v10.8h, #0\n"
10643 "movi v11.8h, #0\n"
10644 "movi v12.8h, #0\n"
10645 "movi v13.8h, #0\n"
10646 "movi v14.8h, #0\n"
10647
10648 // Reduce count by leftovers.
10649 "subs %x[count], %x[count], #5\n"
10650 "beq 2f\n"
10651
10652 "1:"
10653 "subs %x[count], %x[count], #8\n"
10654
10655 // Load Aggregate Store - column major 7x8
10656 "ld1 {v0.s}[0], [%x[in]], #4\n"
10657 "ld3 {v4.b, v5.b, v6.b}[0], [%x[in]], %x[stride]\n"
10658 "ld1 {v1.s}[0], [%x[in]], #4\n"
10659 "ld3 {v4.b, v5.b, v6.b}[1], [%x[in]], %x[stride]\n"
10660 "ld1 {v2.s}[0], [%x[in]], #4\n"
10661 "ld3 {v4.b, v5.b, v6.b}[2], [%x[in]], %x[stride]\n"
10662 "ld1 {v3.s}[0], [%x[in]], #4\n"
10663 "ld3 {v4.b, v5.b, v6.b}[3], [%x[in]], %x[stride]\n"
10664 "ld1 {v0.s}[1], [%x[in]], #4\n"
10665 "ld3 {v4.b, v5.b, v6.b}[4], [%x[in]], %x[stride]\n"
10666 "ld1 {v1.s}[1], [%x[in]], #4\n"
10667 "ld3 {v4.b, v5.b, v6.b}[5], [%x[in]], %x[stride]\n"
10668 "ld1 {v2.s}[1], [%x[in]], #4\n"
10669 "ld3 {v4.b, v5.b, v6.b}[6], [%x[in]], %x[stride]\n"
10670 "ld1 {v3.s}[1], [%x[in]], #4\n"
10671 "ld3 {v4.b, v5.b, v6.b}[7], [%x[in]], %x[stride]\n"
10672 "prfm pldl1keep, [%x[in]]\n"
10673 "trn1 v7.4h, v0.4h, v2.4h\n"
10674 "trn2 v16.4h, v0.4h, v2.4h\n"
10675 "trn1 v15.4h, v1.4h, v3.4h\n"
10676 "trn2 v17.4h, v1.4h, v3.4h\n"
10677 "trn1 v0.8b, v7.8b, v15.8b\n"
10678 "trn2 v1.8b, v7.8b, v15.8b\n"
10679 "trn1 v2.8b, v16.8b, v17.8b\n"
10680 "trn2 v3.8b, v16.8b, v17.8b\n"
10681 "uaddw v8.8h, v8.8h, v0.8b\n"
10682 "uaddw v9.8h, v9.8h, v1.8b\n"
10683 "uaddw v10.8h, v10.8h, v2.8b\n"
10684 "uaddw v11.8h, v11.8h, v3.8b\n"
10685 "uaddw v12.8h, v12.8h, v4.8b\n"
10686 "uaddw v13.8h, v13.8h, v5.8b\n"
10687 "uaddw v14.8h, v14.8h, v6.8b\n"
10688 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
10689 "st1 {v4.2s, v5.2s, v6.2s}, [%x[out]], #24\n"
10690
10691 "bne 1b\n"
10692
10693 "2:"
10694
10695 // Load Aggregate Store - column major 7x5
10696 "movi v0.8b, #0\n"
10697 "movi v1.8b, #0\n"
10698 "movi v2.8b, #0\n"
10699 "movi v3.8b, #0\n"
10700 "movi v4.8b, #0\n"
10701 "movi v5.8b, #0\n"
10702 "movi v6.8b, #0\n"
10703 "ld1 {v0.s}[0], [%x[in]], #4\n"
10704 "ld3 {v4.b, v5.b, v6.b}[0], [%x[in]], %x[stride]\n"
10705 "ld1 {v1.s}[0], [%x[in]], #4\n"
10706 "ld3 {v4.b, v5.b, v6.b}[1], [%x[in]], %x[stride]\n"
10707 "ld1 {v2.s}[0], [%x[in]], #4\n"
10708 "ld3 {v4.b, v5.b, v6.b}[2], [%x[in]], %x[stride]\n"
10709 "ld1 {v3.s}[0], [%x[in]], #4\n"
10710 "ld3 {v4.b, v5.b, v6.b}[3], [%x[in]], %x[stride]\n"
10711 "ld1 {v0.s}[1], [%x[in]], #4\n"
10712 "ld3 {v4.b, v5.b, v6.b}[4], [%x[in]], %x[stride]\n"
10713 "prfm pldl1keep, [%x[in]]\n"
10714 "trn1 v7.4h, v0.4h, v2.4h\n"
10715 "trn2 v16.4h, v0.4h, v2.4h\n"
10716 "trn1 v15.4h, v1.4h, v3.4h\n"
10717 "trn2 v17.4h, v1.4h, v3.4h\n"
10718 "trn1 v0.8b, v7.8b, v15.8b\n"
10719 "trn2 v1.8b, v7.8b, v15.8b\n"
10720 "trn1 v2.8b, v16.8b, v17.8b\n"
10721 "trn2 v3.8b, v16.8b, v17.8b\n"
10722 "uaddw v8.8h, v8.8h, v0.8b\n"
10723 "uaddw v9.8h, v9.8h, v1.8b\n"
10724 "uaddw v10.8h, v10.8h, v2.8b\n"
10725 "uaddw v11.8h, v11.8h, v3.8b\n"
10726 "uaddw v12.8h, v12.8h, v4.8b\n"
10727 "uaddw v13.8h, v13.8h, v5.8b\n"
10728 "uaddw v14.8h, v14.8h, v6.8b\n"
10729 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
10730 "st1 {v4.2s, v5.2s, v6.2s}, [%x[out]], #24\n"
10731
10732 // Aggregator Reduction.
10733 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
10734 "dup v1.4s, %w[additive_sum_offset]\n"
10735 "uaddlp v8.4s, v8.8h\n"
10736 "uaddlp v9.4s, v9.8h\n"
10737 "uaddlp v10.4s, v10.8h\n"
10738 "uaddlp v11.4s, v11.8h\n"
10739 "uaddlp v12.4s, v12.8h\n"
10740 "uaddlp v13.4s, v13.8h\n"
10741 "uaddlp v14.4s, v14.8h\n"
10742 "addp v8.4s, v8.4s, v9.4s\n"
10743 "addp v10.4s, v10.4s, v11.4s\n"
10744 "addp v12.4s, v12.4s, v13.4s\n"
10745 "addp v14.4s, v14.4s, v14.4s\n"
10746 "addp v8.4s, v8.4s, v10.4s\n"
10747 "addp v9.4s, v12.4s, v14.4s\n"
10748 "mul v8.4s, v8.4s, v0.s[0]\n"
10749 "mul v9.4s, v9.4s, v0.s[0]\n"
10750 "add v8.4s, v8.4s, v1.4s\n"
10751 "add v9.4s, v9.4s, v1.4s\n"
10752 "st1 {v8.4s, v9.4s}, [%x[out]]\n"
10753 : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
10754 [out] "+r"(out), [in] "+r"(in)
10755 : [additive_sum_offset] "r"(params.additive_sum_offset),
10756 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
10757 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
10758 "v11", "v12", "v13", "v14", "v15", "v16", "v17", "cc", "memory");
10759 }
10760
10761 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)10762 inline void Stream<uint8_t, 7, 8, 6, ColumnMajorWithSum>::Pack(
10763 const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
10764 #ifdef DEBUG
10765 #ifdef DEBUG_METAGEMM_VERBOSE
10766 std::cout
10767 << __FILE__ << "(" << __LINE__
10768 << ") ColumnMajorWithSum<uint8_t, 7, 8, 6, ColumnMajorWithSum>::Pack()"
10769 << std::endl
10770 << std::flush;
10771 #endif
10772 #endif
10773 int params_count_copy = params.count;
10774 int params_stride_copy = params.stride;
10775 asm volatile(
10776 "sub %x[stride], %x[stride], #4\n"
10777 "movi v8.8h, #0\n"
10778 "movi v9.8h, #0\n"
10779 "movi v10.8h, #0\n"
10780 "movi v11.8h, #0\n"
10781 "movi v12.8h, #0\n"
10782 "movi v13.8h, #0\n"
10783 "movi v14.8h, #0\n"
10784
10785 // Reduce count by leftovers.
10786 "subs %x[count], %x[count], #6\n"
10787 "beq 2f\n"
10788
10789 "1:"
10790 "subs %x[count], %x[count], #8\n"
10791
10792 // Load Aggregate Store - column major 7x8
10793 "ld1 {v0.s}[0], [%x[in]], #4\n"
10794 "ld3 {v4.b, v5.b, v6.b}[0], [%x[in]], %x[stride]\n"
10795 "ld1 {v1.s}[0], [%x[in]], #4\n"
10796 "ld3 {v4.b, v5.b, v6.b}[1], [%x[in]], %x[stride]\n"
10797 "ld1 {v2.s}[0], [%x[in]], #4\n"
10798 "ld3 {v4.b, v5.b, v6.b}[2], [%x[in]], %x[stride]\n"
10799 "ld1 {v3.s}[0], [%x[in]], #4\n"
10800 "ld3 {v4.b, v5.b, v6.b}[3], [%x[in]], %x[stride]\n"
10801 "ld1 {v0.s}[1], [%x[in]], #4\n"
10802 "ld3 {v4.b, v5.b, v6.b}[4], [%x[in]], %x[stride]\n"
10803 "ld1 {v1.s}[1], [%x[in]], #4\n"
10804 "ld3 {v4.b, v5.b, v6.b}[5], [%x[in]], %x[stride]\n"
10805 "ld1 {v2.s}[1], [%x[in]], #4\n"
10806 "ld3 {v4.b, v5.b, v6.b}[6], [%x[in]], %x[stride]\n"
10807 "ld1 {v3.s}[1], [%x[in]], #4\n"
10808 "ld3 {v4.b, v5.b, v6.b}[7], [%x[in]], %x[stride]\n"
10809 "prfm pldl1keep, [%x[in]]\n"
10810 "trn1 v7.4h, v0.4h, v2.4h\n"
10811 "trn2 v16.4h, v0.4h, v2.4h\n"
10812 "trn1 v15.4h, v1.4h, v3.4h\n"
10813 "trn2 v17.4h, v1.4h, v3.4h\n"
10814 "trn1 v0.8b, v7.8b, v15.8b\n"
10815 "trn2 v1.8b, v7.8b, v15.8b\n"
10816 "trn1 v2.8b, v16.8b, v17.8b\n"
10817 "trn2 v3.8b, v16.8b, v17.8b\n"
10818 "uaddw v8.8h, v8.8h, v0.8b\n"
10819 "uaddw v9.8h, v9.8h, v1.8b\n"
10820 "uaddw v10.8h, v10.8h, v2.8b\n"
10821 "uaddw v11.8h, v11.8h, v3.8b\n"
10822 "uaddw v12.8h, v12.8h, v4.8b\n"
10823 "uaddw v13.8h, v13.8h, v5.8b\n"
10824 "uaddw v14.8h, v14.8h, v6.8b\n"
10825 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
10826 "st1 {v4.2s, v5.2s, v6.2s}, [%x[out]], #24\n"
10827
10828 "bne 1b\n"
10829
10830 "2:"
10831
10832 // Load Aggregate Store - column major 7x6
10833 "movi v0.8b, #0\n"
10834 "movi v1.8b, #0\n"
10835 "movi v2.8b, #0\n"
10836 "movi v3.8b, #0\n"
10837 "movi v4.8b, #0\n"
10838 "movi v5.8b, #0\n"
10839 "movi v6.8b, #0\n"
10840 "ld1 {v0.s}[0], [%x[in]], #4\n"
10841 "ld3 {v4.b, v5.b, v6.b}[0], [%x[in]], %x[stride]\n"
10842 "ld1 {v1.s}[0], [%x[in]], #4\n"
10843 "ld3 {v4.b, v5.b, v6.b}[1], [%x[in]], %x[stride]\n"
10844 "ld1 {v2.s}[0], [%x[in]], #4\n"
10845 "ld3 {v4.b, v5.b, v6.b}[2], [%x[in]], %x[stride]\n"
10846 "ld1 {v3.s}[0], [%x[in]], #4\n"
10847 "ld3 {v4.b, v5.b, v6.b}[3], [%x[in]], %x[stride]\n"
10848 "ld1 {v0.s}[1], [%x[in]], #4\n"
10849 "ld3 {v4.b, v5.b, v6.b}[4], [%x[in]], %x[stride]\n"
10850 "ld1 {v1.s}[1], [%x[in]], #4\n"
10851 "ld3 {v4.b, v5.b, v6.b}[5], [%x[in]], %x[stride]\n"
10852 "prfm pldl1keep, [%x[in]]\n"
10853 "trn1 v7.4h, v0.4h, v2.4h\n"
10854 "trn2 v16.4h, v0.4h, v2.4h\n"
10855 "trn1 v15.4h, v1.4h, v3.4h\n"
10856 "trn2 v17.4h, v1.4h, v3.4h\n"
10857 "trn1 v0.8b, v7.8b, v15.8b\n"
10858 "trn2 v1.8b, v7.8b, v15.8b\n"
10859 "trn1 v2.8b, v16.8b, v17.8b\n"
10860 "trn2 v3.8b, v16.8b, v17.8b\n"
10861 "uaddw v8.8h, v8.8h, v0.8b\n"
10862 "uaddw v9.8h, v9.8h, v1.8b\n"
10863 "uaddw v10.8h, v10.8h, v2.8b\n"
10864 "uaddw v11.8h, v11.8h, v3.8b\n"
10865 "uaddw v12.8h, v12.8h, v4.8b\n"
10866 "uaddw v13.8h, v13.8h, v5.8b\n"
10867 "uaddw v14.8h, v14.8h, v6.8b\n"
10868 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
10869 "st1 {v4.2s, v5.2s, v6.2s}, [%x[out]], #24\n"
10870
10871 // Aggregator Reduction.
10872 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
10873 "dup v1.4s, %w[additive_sum_offset]\n"
10874 "uaddlp v8.4s, v8.8h\n"
10875 "uaddlp v9.4s, v9.8h\n"
10876 "uaddlp v10.4s, v10.8h\n"
10877 "uaddlp v11.4s, v11.8h\n"
10878 "uaddlp v12.4s, v12.8h\n"
10879 "uaddlp v13.4s, v13.8h\n"
10880 "uaddlp v14.4s, v14.8h\n"
10881 "addp v8.4s, v8.4s, v9.4s\n"
10882 "addp v10.4s, v10.4s, v11.4s\n"
10883 "addp v12.4s, v12.4s, v13.4s\n"
10884 "addp v14.4s, v14.4s, v14.4s\n"
10885 "addp v8.4s, v8.4s, v10.4s\n"
10886 "addp v9.4s, v12.4s, v14.4s\n"
10887 "mul v8.4s, v8.4s, v0.s[0]\n"
10888 "mul v9.4s, v9.4s, v0.s[0]\n"
10889 "add v8.4s, v8.4s, v1.4s\n"
10890 "add v9.4s, v9.4s, v1.4s\n"
10891 "st1 {v8.4s, v9.4s}, [%x[out]]\n"
10892 : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
10893 [out] "+r"(out), [in] "+r"(in)
10894 : [additive_sum_offset] "r"(params.additive_sum_offset),
10895 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
10896 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
10897 "v11", "v12", "v13", "v14", "v15", "v16", "v17", "cc", "memory");
10898 }
10899
10900 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)10901 inline void Stream<uint8_t, 7, 8, 7, ColumnMajorWithSum>::Pack(
10902 const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
10903 #ifdef DEBUG
10904 #ifdef DEBUG_METAGEMM_VERBOSE
10905 std::cout
10906 << __FILE__ << "(" << __LINE__
10907 << ") ColumnMajorWithSum<uint8_t, 7, 8, 7, ColumnMajorWithSum>::Pack()"
10908 << std::endl
10909 << std::flush;
10910 #endif
10911 #endif
10912 int params_count_copy = params.count;
10913 int params_stride_copy = params.stride;
10914 asm volatile(
10915 "sub %x[stride], %x[stride], #4\n"
10916 "movi v8.8h, #0\n"
10917 "movi v9.8h, #0\n"
10918 "movi v10.8h, #0\n"
10919 "movi v11.8h, #0\n"
10920 "movi v12.8h, #0\n"
10921 "movi v13.8h, #0\n"
10922 "movi v14.8h, #0\n"
10923
10924 // Reduce count by leftovers.
10925 "subs %x[count], %x[count], #7\n"
10926 "beq 2f\n"
10927
10928 "1:"
10929 "subs %x[count], %x[count], #8\n"
10930
10931 // Load Aggregate Store - column major 7x8
10932 "ld1 {v0.s}[0], [%x[in]], #4\n"
10933 "ld3 {v4.b, v5.b, v6.b}[0], [%x[in]], %x[stride]\n"
10934 "ld1 {v1.s}[0], [%x[in]], #4\n"
10935 "ld3 {v4.b, v5.b, v6.b}[1], [%x[in]], %x[stride]\n"
10936 "ld1 {v2.s}[0], [%x[in]], #4\n"
10937 "ld3 {v4.b, v5.b, v6.b}[2], [%x[in]], %x[stride]\n"
10938 "ld1 {v3.s}[0], [%x[in]], #4\n"
10939 "ld3 {v4.b, v5.b, v6.b}[3], [%x[in]], %x[stride]\n"
10940 "ld1 {v0.s}[1], [%x[in]], #4\n"
10941 "ld3 {v4.b, v5.b, v6.b}[4], [%x[in]], %x[stride]\n"
10942 "ld1 {v1.s}[1], [%x[in]], #4\n"
10943 "ld3 {v4.b, v5.b, v6.b}[5], [%x[in]], %x[stride]\n"
10944 "ld1 {v2.s}[1], [%x[in]], #4\n"
10945 "ld3 {v4.b, v5.b, v6.b}[6], [%x[in]], %x[stride]\n"
10946 "ld1 {v3.s}[1], [%x[in]], #4\n"
10947 "ld3 {v4.b, v5.b, v6.b}[7], [%x[in]], %x[stride]\n"
10948 "prfm pldl1keep, [%x[in]]\n"
10949 "trn1 v7.4h, v0.4h, v2.4h\n"
10950 "trn2 v16.4h, v0.4h, v2.4h\n"
10951 "trn1 v15.4h, v1.4h, v3.4h\n"
10952 "trn2 v17.4h, v1.4h, v3.4h\n"
10953 "trn1 v0.8b, v7.8b, v15.8b\n"
10954 "trn2 v1.8b, v7.8b, v15.8b\n"
10955 "trn1 v2.8b, v16.8b, v17.8b\n"
10956 "trn2 v3.8b, v16.8b, v17.8b\n"
10957 "uaddw v8.8h, v8.8h, v0.8b\n"
10958 "uaddw v9.8h, v9.8h, v1.8b\n"
10959 "uaddw v10.8h, v10.8h, v2.8b\n"
10960 "uaddw v11.8h, v11.8h, v3.8b\n"
10961 "uaddw v12.8h, v12.8h, v4.8b\n"
10962 "uaddw v13.8h, v13.8h, v5.8b\n"
10963 "uaddw v14.8h, v14.8h, v6.8b\n"
10964 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
10965 "st1 {v4.2s, v5.2s, v6.2s}, [%x[out]], #24\n"
10966
10967 "bne 1b\n"
10968
10969 "2:"
10970
10971 // Load Aggregate Store - column major 7x7
10972 "movi v0.8b, #0\n"
10973 "movi v1.8b, #0\n"
10974 "movi v2.8b, #0\n"
10975 "movi v3.8b, #0\n"
10976 "movi v4.8b, #0\n"
10977 "movi v5.8b, #0\n"
10978 "movi v6.8b, #0\n"
10979 "ld1 {v0.s}[0], [%x[in]], #4\n"
10980 "ld3 {v4.b, v5.b, v6.b}[0], [%x[in]], %x[stride]\n"
10981 "ld1 {v1.s}[0], [%x[in]], #4\n"
10982 "ld3 {v4.b, v5.b, v6.b}[1], [%x[in]], %x[stride]\n"
10983 "ld1 {v2.s}[0], [%x[in]], #4\n"
10984 "ld3 {v4.b, v5.b, v6.b}[2], [%x[in]], %x[stride]\n"
10985 "ld1 {v3.s}[0], [%x[in]], #4\n"
10986 "ld3 {v4.b, v5.b, v6.b}[3], [%x[in]], %x[stride]\n"
10987 "ld1 {v0.s}[1], [%x[in]], #4\n"
10988 "ld3 {v4.b, v5.b, v6.b}[4], [%x[in]], %x[stride]\n"
10989 "ld1 {v1.s}[1], [%x[in]], #4\n"
10990 "ld3 {v4.b, v5.b, v6.b}[5], [%x[in]], %x[stride]\n"
10991 "ld1 {v2.s}[1], [%x[in]], #4\n"
10992 "ld3 {v4.b, v5.b, v6.b}[6], [%x[in]], %x[stride]\n"
10993 "prfm pldl1keep, [%x[in]]\n"
10994 "trn1 v7.4h, v0.4h, v2.4h\n"
10995 "trn2 v16.4h, v0.4h, v2.4h\n"
10996 "trn1 v15.4h, v1.4h, v3.4h\n"
10997 "trn2 v17.4h, v1.4h, v3.4h\n"
10998 "trn1 v0.8b, v7.8b, v15.8b\n"
10999 "trn2 v1.8b, v7.8b, v15.8b\n"
11000 "trn1 v2.8b, v16.8b, v17.8b\n"
11001 "trn2 v3.8b, v16.8b, v17.8b\n"
11002 "uaddw v8.8h, v8.8h, v0.8b\n"
11003 "uaddw v9.8h, v9.8h, v1.8b\n"
11004 "uaddw v10.8h, v10.8h, v2.8b\n"
11005 "uaddw v11.8h, v11.8h, v3.8b\n"
11006 "uaddw v12.8h, v12.8h, v4.8b\n"
11007 "uaddw v13.8h, v13.8h, v5.8b\n"
11008 "uaddw v14.8h, v14.8h, v6.8b\n"
11009 "st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [%x[out]], #32\n"
11010 "st1 {v4.2s, v5.2s, v6.2s}, [%x[out]], #24\n"
11011
11012 // Aggregator Reduction.
11013 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
11014 "dup v1.4s, %w[additive_sum_offset]\n"
11015 "uaddlp v8.4s, v8.8h\n"
11016 "uaddlp v9.4s, v9.8h\n"
11017 "uaddlp v10.4s, v10.8h\n"
11018 "uaddlp v11.4s, v11.8h\n"
11019 "uaddlp v12.4s, v12.8h\n"
11020 "uaddlp v13.4s, v13.8h\n"
11021 "uaddlp v14.4s, v14.8h\n"
11022 "addp v8.4s, v8.4s, v9.4s\n"
11023 "addp v10.4s, v10.4s, v11.4s\n"
11024 "addp v12.4s, v12.4s, v13.4s\n"
11025 "addp v14.4s, v14.4s, v14.4s\n"
11026 "addp v8.4s, v8.4s, v10.4s\n"
11027 "addp v9.4s, v12.4s, v14.4s\n"
11028 "mul v8.4s, v8.4s, v0.s[0]\n"
11029 "mul v9.4s, v9.4s, v0.s[0]\n"
11030 "add v8.4s, v8.4s, v1.4s\n"
11031 "add v9.4s, v9.4s, v1.4s\n"
11032 "st1 {v8.4s, v9.4s}, [%x[out]]\n"
11033 : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
11034 [out] "+r"(out), [in] "+r"(in)
11035 : [additive_sum_offset] "r"(params.additive_sum_offset),
11036 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
11037 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
11038 "v11", "v12", "v13", "v14", "v15", "v16", "v17", "cc", "memory");
11039 }
11040
11041 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)11042 inline void Stream<uint8_t, 8, 8, 0, ColumnMajorWithSum>::Pack(
11043 const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
11044 #ifdef DEBUG
11045 #ifdef DEBUG_METAGEMM_VERBOSE
11046 std::cout
11047 << __FILE__ << "(" << __LINE__
11048 << ") ColumnMajorWithSum<uint8_t, 8, 8, 0, ColumnMajorWithSum>::Pack()"
11049 << std::endl
11050 << std::flush;
11051 #endif
11052 #endif
11053 int params_count_copy = params.count;
11054 int params_stride_copy = params.stride;
11055 asm volatile(
11056 "movi v8.8h, #0\n"
11057 "movi v9.8h, #0\n"
11058 "movi v10.8h, #0\n"
11059 "movi v11.8h, #0\n"
11060 "movi v12.8h, #0\n"
11061 "movi v13.8h, #0\n"
11062 "movi v14.8h, #0\n"
11063 "movi v15.8h, #0\n"
11064
11065 "1:"
11066 "subs %x[count], %x[count], #8\n"
11067
11068 // Load Aggregate Store - column major 8x8
11069 "ld1 {v0.2s}, [%x[in]], %x[stride]\n"
11070 "ld1 {v1.2s}, [%x[in]], %x[stride]\n"
11071 "ld1 {v2.2s}, [%x[in]], %x[stride]\n"
11072 "ld1 {v3.2s}, [%x[in]], %x[stride]\n"
11073 "ld1 {v4.2s}, [%x[in]], %x[stride]\n"
11074 "ld1 {v5.2s}, [%x[in]], %x[stride]\n"
11075 "ld1 {v6.2s}, [%x[in]], %x[stride]\n"
11076 "ld1 {v7.2s}, [%x[in]], %x[stride]\n"
11077 "prfm pldl1keep, [%x[in]]\n"
11078 "trn1 v16.8b, v0.8b, v1.8b\n"
11079 "trn2 v17.8b, v0.8b, v1.8b\n"
11080 "trn1 v18.8b, v2.8b, v3.8b\n"
11081 "trn2 v19.8b, v2.8b, v3.8b\n"
11082 "trn1 v20.8b, v4.8b, v5.8b\n"
11083 "trn2 v21.8b, v4.8b, v5.8b\n"
11084 "trn1 v22.8b, v6.8b, v7.8b\n"
11085 "trn2 v23.8b, v6.8b, v7.8b\n"
11086 "trn1 v0.4h, v16.4h, v18.4h\n"
11087 "trn2 v2.4h, v16.4h, v18.4h\n"
11088 "trn1 v1.4h, v17.4h, v19.4h\n"
11089 "trn2 v3.4h, v17.4h, v19.4h\n"
11090 "trn1 v4.4h, v20.4h, v22.4h\n"
11091 "trn2 v6.4h, v20.4h, v22.4h\n"
11092 "trn1 v5.4h, v21.4h, v23.4h\n"
11093 "trn2 v7.4h, v21.4h, v23.4h\n"
11094 "trn1 v16.2s, v0.2s, v4.2s\n"
11095 "trn2 v20.2s, v0.2s, v4.2s\n"
11096 "trn1 v17.2s, v1.2s, v5.2s\n"
11097 "trn2 v21.2s, v1.2s, v5.2s\n"
11098 "trn1 v18.2s, v2.2s, v6.2s\n"
11099 "trn2 v22.2s, v2.2s, v6.2s\n"
11100 "trn1 v19.2s, v3.2s, v7.2s\n"
11101 "trn2 v23.2s, v3.2s, v7.2s\n"
11102 "uaddw v8.8h, v8.8h, v16.8b\n"
11103 "uaddw v9.8h, v9.8h, v17.8b\n"
11104 "uaddw v10.8h, v10.8h, v18.8b\n"
11105 "uaddw v11.8h, v11.8h, v19.8b\n"
11106 "uaddw v12.8h, v12.8h, v20.8b\n"
11107 "uaddw v13.8h, v13.8h, v21.8b\n"
11108 "uaddw v14.8h, v14.8h, v22.8b\n"
11109 "uaddw v15.8h, v15.8h, v23.8b\n"
11110 "st1 {v16.2s, v17.2s, v18.2s, v19.2s}, [%x[out]], #32\n"
11111 "st1 {v20.2s, v21.2s, v22.2s, v23.2s}, [%x[out]], #32\n"
11112
11113 "bne 1b\n"
11114
11115 // Aggregator Reduction.
11116 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
11117 "dup v1.4s, %w[additive_sum_offset]\n"
11118 "uaddlp v8.4s, v8.8h\n"
11119 "uaddlp v9.4s, v9.8h\n"
11120 "uaddlp v10.4s, v10.8h\n"
11121 "uaddlp v11.4s, v11.8h\n"
11122 "uaddlp v12.4s, v12.8h\n"
11123 "uaddlp v13.4s, v13.8h\n"
11124 "uaddlp v14.4s, v14.8h\n"
11125 "uaddlp v15.4s, v15.8h\n"
11126 "addp v8.4s, v8.4s, v9.4s\n"
11127 "addp v10.4s, v10.4s, v11.4s\n"
11128 "addp v12.4s, v12.4s, v13.4s\n"
11129 "addp v14.4s, v14.4s, v15.4s\n"
11130 "addp v8.4s, v8.4s, v10.4s\n"
11131 "addp v9.4s, v12.4s, v14.4s\n"
11132 "mul v8.4s, v8.4s, v0.s[0]\n"
11133 "mul v9.4s, v9.4s, v0.s[0]\n"
11134 "add v8.4s, v8.4s, v1.4s\n"
11135 "add v9.4s, v9.4s, v1.4s\n"
11136 "st1 {v8.4s, v9.4s}, [%x[out]]\n"
11137 : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
11138 [out] "+r"(out), [in] "+r"(in)
11139 : [additive_sum_offset] "r"(params.additive_sum_offset),
11140 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
11141 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
11142 "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20",
11143 "v21", "v22", "v23", "cc", "memory");
11144 }
11145
11146 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)11147 inline void Stream<uint8_t, 8, 8, 1, ColumnMajorWithSum>::Pack(
11148 const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
11149 #ifdef DEBUG
11150 #ifdef DEBUG_METAGEMM_VERBOSE
11151 std::cout
11152 << __FILE__ << "(" << __LINE__
11153 << ") ColumnMajorWithSum<uint8_t, 8, 8, 1, ColumnMajorWithSum>::Pack()"
11154 << std::endl
11155 << std::flush;
11156 #endif
11157 #endif
11158 int params_count_copy = params.count;
11159 int params_stride_copy = params.stride;
11160 asm volatile(
11161 "movi v8.8h, #0\n"
11162 "movi v9.8h, #0\n"
11163 "movi v10.8h, #0\n"
11164 "movi v11.8h, #0\n"
11165 "movi v12.8h, #0\n"
11166 "movi v13.8h, #0\n"
11167 "movi v14.8h, #0\n"
11168 "movi v15.8h, #0\n"
11169
11170 // Reduce count by leftovers.
11171 "subs %x[count], %x[count], #1\n"
11172 "beq 2f\n"
11173
11174 "1:"
11175 "subs %x[count], %x[count], #8\n"
11176
11177 // Load Aggregate Store - column major 8x8
11178 "ld1 {v0.2s}, [%x[in]], %x[stride]\n"
11179 "ld1 {v1.2s}, [%x[in]], %x[stride]\n"
11180 "ld1 {v2.2s}, [%x[in]], %x[stride]\n"
11181 "ld1 {v3.2s}, [%x[in]], %x[stride]\n"
11182 "ld1 {v4.2s}, [%x[in]], %x[stride]\n"
11183 "ld1 {v5.2s}, [%x[in]], %x[stride]\n"
11184 "ld1 {v6.2s}, [%x[in]], %x[stride]\n"
11185 "ld1 {v7.2s}, [%x[in]], %x[stride]\n"
11186 "prfm pldl1keep, [%x[in]]\n"
11187 "trn1 v16.8b, v0.8b, v1.8b\n"
11188 "trn2 v17.8b, v0.8b, v1.8b\n"
11189 "trn1 v18.8b, v2.8b, v3.8b\n"
11190 "trn2 v19.8b, v2.8b, v3.8b\n"
11191 "trn1 v20.8b, v4.8b, v5.8b\n"
11192 "trn2 v21.8b, v4.8b, v5.8b\n"
11193 "trn1 v22.8b, v6.8b, v7.8b\n"
11194 "trn2 v23.8b, v6.8b, v7.8b\n"
11195 "trn1 v0.4h, v16.4h, v18.4h\n"
11196 "trn2 v2.4h, v16.4h, v18.4h\n"
11197 "trn1 v1.4h, v17.4h, v19.4h\n"
11198 "trn2 v3.4h, v17.4h, v19.4h\n"
11199 "trn1 v4.4h, v20.4h, v22.4h\n"
11200 "trn2 v6.4h, v20.4h, v22.4h\n"
11201 "trn1 v5.4h, v21.4h, v23.4h\n"
11202 "trn2 v7.4h, v21.4h, v23.4h\n"
11203 "trn1 v16.2s, v0.2s, v4.2s\n"
11204 "trn2 v20.2s, v0.2s, v4.2s\n"
11205 "trn1 v17.2s, v1.2s, v5.2s\n"
11206 "trn2 v21.2s, v1.2s, v5.2s\n"
11207 "trn1 v18.2s, v2.2s, v6.2s\n"
11208 "trn2 v22.2s, v2.2s, v6.2s\n"
11209 "trn1 v19.2s, v3.2s, v7.2s\n"
11210 "trn2 v23.2s, v3.2s, v7.2s\n"
11211 "uaddw v8.8h, v8.8h, v16.8b\n"
11212 "uaddw v9.8h, v9.8h, v17.8b\n"
11213 "uaddw v10.8h, v10.8h, v18.8b\n"
11214 "uaddw v11.8h, v11.8h, v19.8b\n"
11215 "uaddw v12.8h, v12.8h, v20.8b\n"
11216 "uaddw v13.8h, v13.8h, v21.8b\n"
11217 "uaddw v14.8h, v14.8h, v22.8b\n"
11218 "uaddw v15.8h, v15.8h, v23.8b\n"
11219 "st1 {v16.2s, v17.2s, v18.2s, v19.2s}, [%x[out]], #32\n"
11220 "st1 {v20.2s, v21.2s, v22.2s, v23.2s}, [%x[out]], #32\n"
11221
11222 "bne 1b\n"
11223
11224 "2:"
11225
11226 // Load Aggregate Store - column major 8x1
11227 "movi v0.8b, #0\n"
11228 "movi v1.8b, #0\n"
11229 "movi v2.8b, #0\n"
11230 "movi v3.8b, #0\n"
11231 "movi v4.8b, #0\n"
11232 "movi v5.8b, #0\n"
11233 "movi v6.8b, #0\n"
11234 "movi v7.8b, #0\n"
11235 "ld1 {v0.2s}, [%x[in]], %x[stride]\n"
11236 "prfm pldl1keep, [%x[in]]\n"
11237 "trn1 v16.8b, v0.8b, v1.8b\n"
11238 "trn2 v17.8b, v0.8b, v1.8b\n"
11239 "trn1 v18.8b, v2.8b, v3.8b\n"
11240 "trn2 v19.8b, v2.8b, v3.8b\n"
11241 "trn1 v20.8b, v4.8b, v5.8b\n"
11242 "trn2 v21.8b, v4.8b, v5.8b\n"
11243 "trn1 v22.8b, v6.8b, v7.8b\n"
11244 "trn2 v23.8b, v6.8b, v7.8b\n"
11245 "trn1 v0.4h, v16.4h, v18.4h\n"
11246 "trn2 v2.4h, v16.4h, v18.4h\n"
11247 "trn1 v1.4h, v17.4h, v19.4h\n"
11248 "trn2 v3.4h, v17.4h, v19.4h\n"
11249 "trn1 v4.4h, v20.4h, v22.4h\n"
11250 "trn2 v6.4h, v20.4h, v22.4h\n"
11251 "trn1 v5.4h, v21.4h, v23.4h\n"
11252 "trn2 v7.4h, v21.4h, v23.4h\n"
11253 "trn1 v16.2s, v0.2s, v4.2s\n"
11254 "trn2 v20.2s, v0.2s, v4.2s\n"
11255 "trn1 v17.2s, v1.2s, v5.2s\n"
11256 "trn2 v21.2s, v1.2s, v5.2s\n"
11257 "trn1 v18.2s, v2.2s, v6.2s\n"
11258 "trn2 v22.2s, v2.2s, v6.2s\n"
11259 "trn1 v19.2s, v3.2s, v7.2s\n"
11260 "trn2 v23.2s, v3.2s, v7.2s\n"
11261 "uaddw v8.8h, v8.8h, v16.8b\n"
11262 "uaddw v9.8h, v9.8h, v17.8b\n"
11263 "uaddw v10.8h, v10.8h, v18.8b\n"
11264 "uaddw v11.8h, v11.8h, v19.8b\n"
11265 "uaddw v12.8h, v12.8h, v20.8b\n"
11266 "uaddw v13.8h, v13.8h, v21.8b\n"
11267 "uaddw v14.8h, v14.8h, v22.8b\n"
11268 "uaddw v15.8h, v15.8h, v23.8b\n"
11269 "st1 {v16.2s, v17.2s, v18.2s, v19.2s}, [%x[out]], #32\n"
11270 "st1 {v20.2s, v21.2s, v22.2s, v23.2s}, [%x[out]], #32\n"
11271
11272 // Aggregator Reduction.
11273 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
11274 "dup v1.4s, %w[additive_sum_offset]\n"
11275 "uaddlp v8.4s, v8.8h\n"
11276 "uaddlp v9.4s, v9.8h\n"
11277 "uaddlp v10.4s, v10.8h\n"
11278 "uaddlp v11.4s, v11.8h\n"
11279 "uaddlp v12.4s, v12.8h\n"
11280 "uaddlp v13.4s, v13.8h\n"
11281 "uaddlp v14.4s, v14.8h\n"
11282 "uaddlp v15.4s, v15.8h\n"
11283 "addp v8.4s, v8.4s, v9.4s\n"
11284 "addp v10.4s, v10.4s, v11.4s\n"
11285 "addp v12.4s, v12.4s, v13.4s\n"
11286 "addp v14.4s, v14.4s, v15.4s\n"
11287 "addp v8.4s, v8.4s, v10.4s\n"
11288 "addp v9.4s, v12.4s, v14.4s\n"
11289 "mul v8.4s, v8.4s, v0.s[0]\n"
11290 "mul v9.4s, v9.4s, v0.s[0]\n"
11291 "add v8.4s, v8.4s, v1.4s\n"
11292 "add v9.4s, v9.4s, v1.4s\n"
11293 "st1 {v8.4s, v9.4s}, [%x[out]]\n"
11294 : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
11295 [out] "+r"(out), [in] "+r"(in)
11296 : [additive_sum_offset] "r"(params.additive_sum_offset),
11297 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
11298 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
11299 "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20",
11300 "v21", "v22", "v23", "cc", "memory");
11301 }
11302
11303 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)11304 inline void Stream<uint8_t, 8, 8, 2, ColumnMajorWithSum>::Pack(
11305 const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
11306 #ifdef DEBUG
11307 #ifdef DEBUG_METAGEMM_VERBOSE
11308 std::cout
11309 << __FILE__ << "(" << __LINE__
11310 << ") ColumnMajorWithSum<uint8_t, 8, 8, 2, ColumnMajorWithSum>::Pack()"
11311 << std::endl
11312 << std::flush;
11313 #endif
11314 #endif
11315 int params_count_copy = params.count;
11316 int params_stride_copy = params.stride;
11317 asm volatile(
11318 "movi v8.8h, #0\n"
11319 "movi v9.8h, #0\n"
11320 "movi v10.8h, #0\n"
11321 "movi v11.8h, #0\n"
11322 "movi v12.8h, #0\n"
11323 "movi v13.8h, #0\n"
11324 "movi v14.8h, #0\n"
11325 "movi v15.8h, #0\n"
11326
11327 // Reduce count by leftovers.
11328 "subs %x[count], %x[count], #2\n"
11329 "beq 2f\n"
11330
11331 "1:"
11332 "subs %x[count], %x[count], #8\n"
11333
11334 // Load Aggregate Store - column major 8x8
11335 "ld1 {v0.2s}, [%x[in]], %x[stride]\n"
11336 "ld1 {v1.2s}, [%x[in]], %x[stride]\n"
11337 "ld1 {v2.2s}, [%x[in]], %x[stride]\n"
11338 "ld1 {v3.2s}, [%x[in]], %x[stride]\n"
11339 "ld1 {v4.2s}, [%x[in]], %x[stride]\n"
11340 "ld1 {v5.2s}, [%x[in]], %x[stride]\n"
11341 "ld1 {v6.2s}, [%x[in]], %x[stride]\n"
11342 "ld1 {v7.2s}, [%x[in]], %x[stride]\n"
11343 "prfm pldl1keep, [%x[in]]\n"
11344 "trn1 v16.8b, v0.8b, v1.8b\n"
11345 "trn2 v17.8b, v0.8b, v1.8b\n"
11346 "trn1 v18.8b, v2.8b, v3.8b\n"
11347 "trn2 v19.8b, v2.8b, v3.8b\n"
11348 "trn1 v20.8b, v4.8b, v5.8b\n"
11349 "trn2 v21.8b, v4.8b, v5.8b\n"
11350 "trn1 v22.8b, v6.8b, v7.8b\n"
11351 "trn2 v23.8b, v6.8b, v7.8b\n"
11352 "trn1 v0.4h, v16.4h, v18.4h\n"
11353 "trn2 v2.4h, v16.4h, v18.4h\n"
11354 "trn1 v1.4h, v17.4h, v19.4h\n"
11355 "trn2 v3.4h, v17.4h, v19.4h\n"
11356 "trn1 v4.4h, v20.4h, v22.4h\n"
11357 "trn2 v6.4h, v20.4h, v22.4h\n"
11358 "trn1 v5.4h, v21.4h, v23.4h\n"
11359 "trn2 v7.4h, v21.4h, v23.4h\n"
11360 "trn1 v16.2s, v0.2s, v4.2s\n"
11361 "trn2 v20.2s, v0.2s, v4.2s\n"
11362 "trn1 v17.2s, v1.2s, v5.2s\n"
11363 "trn2 v21.2s, v1.2s, v5.2s\n"
11364 "trn1 v18.2s, v2.2s, v6.2s\n"
11365 "trn2 v22.2s, v2.2s, v6.2s\n"
11366 "trn1 v19.2s, v3.2s, v7.2s\n"
11367 "trn2 v23.2s, v3.2s, v7.2s\n"
11368 "uaddw v8.8h, v8.8h, v16.8b\n"
11369 "uaddw v9.8h, v9.8h, v17.8b\n"
11370 "uaddw v10.8h, v10.8h, v18.8b\n"
11371 "uaddw v11.8h, v11.8h, v19.8b\n"
11372 "uaddw v12.8h, v12.8h, v20.8b\n"
11373 "uaddw v13.8h, v13.8h, v21.8b\n"
11374 "uaddw v14.8h, v14.8h, v22.8b\n"
11375 "uaddw v15.8h, v15.8h, v23.8b\n"
11376 "st1 {v16.2s, v17.2s, v18.2s, v19.2s}, [%x[out]], #32\n"
11377 "st1 {v20.2s, v21.2s, v22.2s, v23.2s}, [%x[out]], #32\n"
11378
11379 "bne 1b\n"
11380
11381 "2:"
11382
11383 // Load Aggregate Store - column major 8x2
11384 "movi v0.8b, #0\n"
11385 "movi v1.8b, #0\n"
11386 "movi v2.8b, #0\n"
11387 "movi v3.8b, #0\n"
11388 "movi v4.8b, #0\n"
11389 "movi v5.8b, #0\n"
11390 "movi v6.8b, #0\n"
11391 "movi v7.8b, #0\n"
11392 "ld1 {v0.2s}, [%x[in]], %x[stride]\n"
11393 "ld1 {v1.2s}, [%x[in]], %x[stride]\n"
11394 "prfm pldl1keep, [%x[in]]\n"
11395 "trn1 v16.8b, v0.8b, v1.8b\n"
11396 "trn2 v17.8b, v0.8b, v1.8b\n"
11397 "trn1 v18.8b, v2.8b, v3.8b\n"
11398 "trn2 v19.8b, v2.8b, v3.8b\n"
11399 "trn1 v20.8b, v4.8b, v5.8b\n"
11400 "trn2 v21.8b, v4.8b, v5.8b\n"
11401 "trn1 v22.8b, v6.8b, v7.8b\n"
11402 "trn2 v23.8b, v6.8b, v7.8b\n"
11403 "trn1 v0.4h, v16.4h, v18.4h\n"
11404 "trn2 v2.4h, v16.4h, v18.4h\n"
11405 "trn1 v1.4h, v17.4h, v19.4h\n"
11406 "trn2 v3.4h, v17.4h, v19.4h\n"
11407 "trn1 v4.4h, v20.4h, v22.4h\n"
11408 "trn2 v6.4h, v20.4h, v22.4h\n"
11409 "trn1 v5.4h, v21.4h, v23.4h\n"
11410 "trn2 v7.4h, v21.4h, v23.4h\n"
11411 "trn1 v16.2s, v0.2s, v4.2s\n"
11412 "trn2 v20.2s, v0.2s, v4.2s\n"
11413 "trn1 v17.2s, v1.2s, v5.2s\n"
11414 "trn2 v21.2s, v1.2s, v5.2s\n"
11415 "trn1 v18.2s, v2.2s, v6.2s\n"
11416 "trn2 v22.2s, v2.2s, v6.2s\n"
11417 "trn1 v19.2s, v3.2s, v7.2s\n"
11418 "trn2 v23.2s, v3.2s, v7.2s\n"
11419 "uaddw v8.8h, v8.8h, v16.8b\n"
11420 "uaddw v9.8h, v9.8h, v17.8b\n"
11421 "uaddw v10.8h, v10.8h, v18.8b\n"
11422 "uaddw v11.8h, v11.8h, v19.8b\n"
11423 "uaddw v12.8h, v12.8h, v20.8b\n"
11424 "uaddw v13.8h, v13.8h, v21.8b\n"
11425 "uaddw v14.8h, v14.8h, v22.8b\n"
11426 "uaddw v15.8h, v15.8h, v23.8b\n"
11427 "st1 {v16.2s, v17.2s, v18.2s, v19.2s}, [%x[out]], #32\n"
11428 "st1 {v20.2s, v21.2s, v22.2s, v23.2s}, [%x[out]], #32\n"
11429
11430 // Aggregator Reduction.
11431 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
11432 "dup v1.4s, %w[additive_sum_offset]\n"
11433 "uaddlp v8.4s, v8.8h\n"
11434 "uaddlp v9.4s, v9.8h\n"
11435 "uaddlp v10.4s, v10.8h\n"
11436 "uaddlp v11.4s, v11.8h\n"
11437 "uaddlp v12.4s, v12.8h\n"
11438 "uaddlp v13.4s, v13.8h\n"
11439 "uaddlp v14.4s, v14.8h\n"
11440 "uaddlp v15.4s, v15.8h\n"
11441 "addp v8.4s, v8.4s, v9.4s\n"
11442 "addp v10.4s, v10.4s, v11.4s\n"
11443 "addp v12.4s, v12.4s, v13.4s\n"
11444 "addp v14.4s, v14.4s, v15.4s\n"
11445 "addp v8.4s, v8.4s, v10.4s\n"
11446 "addp v9.4s, v12.4s, v14.4s\n"
11447 "mul v8.4s, v8.4s, v0.s[0]\n"
11448 "mul v9.4s, v9.4s, v0.s[0]\n"
11449 "add v8.4s, v8.4s, v1.4s\n"
11450 "add v9.4s, v9.4s, v1.4s\n"
11451 "st1 {v8.4s, v9.4s}, [%x[out]]\n"
11452 : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
11453 [out] "+r"(out), [in] "+r"(in)
11454 : [additive_sum_offset] "r"(params.additive_sum_offset),
11455 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
11456 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
11457 "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20",
11458 "v21", "v22", "v23", "cc", "memory");
11459 }
11460
11461 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)11462 inline void Stream<uint8_t, 8, 8, 3, ColumnMajorWithSum>::Pack(
11463 const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
11464 #ifdef DEBUG
11465 #ifdef DEBUG_METAGEMM_VERBOSE
11466 std::cout
11467 << __FILE__ << "(" << __LINE__
11468 << ") ColumnMajorWithSum<uint8_t, 8, 8, 3, ColumnMajorWithSum>::Pack()"
11469 << std::endl
11470 << std::flush;
11471 #endif
11472 #endif
11473 int params_count_copy = params.count;
11474 int params_stride_copy = params.stride;
11475 asm volatile(
11476 "movi v8.8h, #0\n"
11477 "movi v9.8h, #0\n"
11478 "movi v10.8h, #0\n"
11479 "movi v11.8h, #0\n"
11480 "movi v12.8h, #0\n"
11481 "movi v13.8h, #0\n"
11482 "movi v14.8h, #0\n"
11483 "movi v15.8h, #0\n"
11484
11485 // Reduce count by leftovers.
11486 "subs %x[count], %x[count], #3\n"
11487 "beq 2f\n"
11488
11489 "1:"
11490 "subs %x[count], %x[count], #8\n"
11491
11492 // Load Aggregate Store - column major 8x8
11493 "ld1 {v0.2s}, [%x[in]], %x[stride]\n"
11494 "ld1 {v1.2s}, [%x[in]], %x[stride]\n"
11495 "ld1 {v2.2s}, [%x[in]], %x[stride]\n"
11496 "ld1 {v3.2s}, [%x[in]], %x[stride]\n"
11497 "ld1 {v4.2s}, [%x[in]], %x[stride]\n"
11498 "ld1 {v5.2s}, [%x[in]], %x[stride]\n"
11499 "ld1 {v6.2s}, [%x[in]], %x[stride]\n"
11500 "ld1 {v7.2s}, [%x[in]], %x[stride]\n"
11501 "prfm pldl1keep, [%x[in]]\n"
11502 "trn1 v16.8b, v0.8b, v1.8b\n"
11503 "trn2 v17.8b, v0.8b, v1.8b\n"
11504 "trn1 v18.8b, v2.8b, v3.8b\n"
11505 "trn2 v19.8b, v2.8b, v3.8b\n"
11506 "trn1 v20.8b, v4.8b, v5.8b\n"
11507 "trn2 v21.8b, v4.8b, v5.8b\n"
11508 "trn1 v22.8b, v6.8b, v7.8b\n"
11509 "trn2 v23.8b, v6.8b, v7.8b\n"
11510 "trn1 v0.4h, v16.4h, v18.4h\n"
11511 "trn2 v2.4h, v16.4h, v18.4h\n"
11512 "trn1 v1.4h, v17.4h, v19.4h\n"
11513 "trn2 v3.4h, v17.4h, v19.4h\n"
11514 "trn1 v4.4h, v20.4h, v22.4h\n"
11515 "trn2 v6.4h, v20.4h, v22.4h\n"
11516 "trn1 v5.4h, v21.4h, v23.4h\n"
11517 "trn2 v7.4h, v21.4h, v23.4h\n"
11518 "trn1 v16.2s, v0.2s, v4.2s\n"
11519 "trn2 v20.2s, v0.2s, v4.2s\n"
11520 "trn1 v17.2s, v1.2s, v5.2s\n"
11521 "trn2 v21.2s, v1.2s, v5.2s\n"
11522 "trn1 v18.2s, v2.2s, v6.2s\n"
11523 "trn2 v22.2s, v2.2s, v6.2s\n"
11524 "trn1 v19.2s, v3.2s, v7.2s\n"
11525 "trn2 v23.2s, v3.2s, v7.2s\n"
11526 "uaddw v8.8h, v8.8h, v16.8b\n"
11527 "uaddw v9.8h, v9.8h, v17.8b\n"
11528 "uaddw v10.8h, v10.8h, v18.8b\n"
11529 "uaddw v11.8h, v11.8h, v19.8b\n"
11530 "uaddw v12.8h, v12.8h, v20.8b\n"
11531 "uaddw v13.8h, v13.8h, v21.8b\n"
11532 "uaddw v14.8h, v14.8h, v22.8b\n"
11533 "uaddw v15.8h, v15.8h, v23.8b\n"
11534 "st1 {v16.2s, v17.2s, v18.2s, v19.2s}, [%x[out]], #32\n"
11535 "st1 {v20.2s, v21.2s, v22.2s, v23.2s}, [%x[out]], #32\n"
11536
11537 "bne 1b\n"
11538
11539 "2:"
11540
11541 // Load Aggregate Store - column major 8x3
11542 "movi v0.8b, #0\n"
11543 "movi v1.8b, #0\n"
11544 "movi v2.8b, #0\n"
11545 "movi v3.8b, #0\n"
11546 "movi v4.8b, #0\n"
11547 "movi v5.8b, #0\n"
11548 "movi v6.8b, #0\n"
11549 "movi v7.8b, #0\n"
11550 "ld1 {v0.2s}, [%x[in]], %x[stride]\n"
11551 "ld1 {v1.2s}, [%x[in]], %x[stride]\n"
11552 "ld1 {v2.2s}, [%x[in]], %x[stride]\n"
11553 "prfm pldl1keep, [%x[in]]\n"
11554 "trn1 v16.8b, v0.8b, v1.8b\n"
11555 "trn2 v17.8b, v0.8b, v1.8b\n"
11556 "trn1 v18.8b, v2.8b, v3.8b\n"
11557 "trn2 v19.8b, v2.8b, v3.8b\n"
11558 "trn1 v20.8b, v4.8b, v5.8b\n"
11559 "trn2 v21.8b, v4.8b, v5.8b\n"
11560 "trn1 v22.8b, v6.8b, v7.8b\n"
11561 "trn2 v23.8b, v6.8b, v7.8b\n"
11562 "trn1 v0.4h, v16.4h, v18.4h\n"
11563 "trn2 v2.4h, v16.4h, v18.4h\n"
11564 "trn1 v1.4h, v17.4h, v19.4h\n"
11565 "trn2 v3.4h, v17.4h, v19.4h\n"
11566 "trn1 v4.4h, v20.4h, v22.4h\n"
11567 "trn2 v6.4h, v20.4h, v22.4h\n"
11568 "trn1 v5.4h, v21.4h, v23.4h\n"
11569 "trn2 v7.4h, v21.4h, v23.4h\n"
11570 "trn1 v16.2s, v0.2s, v4.2s\n"
11571 "trn2 v20.2s, v0.2s, v4.2s\n"
11572 "trn1 v17.2s, v1.2s, v5.2s\n"
11573 "trn2 v21.2s, v1.2s, v5.2s\n"
11574 "trn1 v18.2s, v2.2s, v6.2s\n"
11575 "trn2 v22.2s, v2.2s, v6.2s\n"
11576 "trn1 v19.2s, v3.2s, v7.2s\n"
11577 "trn2 v23.2s, v3.2s, v7.2s\n"
11578 "uaddw v8.8h, v8.8h, v16.8b\n"
11579 "uaddw v9.8h, v9.8h, v17.8b\n"
11580 "uaddw v10.8h, v10.8h, v18.8b\n"
11581 "uaddw v11.8h, v11.8h, v19.8b\n"
11582 "uaddw v12.8h, v12.8h, v20.8b\n"
11583 "uaddw v13.8h, v13.8h, v21.8b\n"
11584 "uaddw v14.8h, v14.8h, v22.8b\n"
11585 "uaddw v15.8h, v15.8h, v23.8b\n"
11586 "st1 {v16.2s, v17.2s, v18.2s, v19.2s}, [%x[out]], #32\n"
11587 "st1 {v20.2s, v21.2s, v22.2s, v23.2s}, [%x[out]], #32\n"
11588
11589 // Aggregator Reduction.
11590 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
11591 "dup v1.4s, %w[additive_sum_offset]\n"
11592 "uaddlp v8.4s, v8.8h\n"
11593 "uaddlp v9.4s, v9.8h\n"
11594 "uaddlp v10.4s, v10.8h\n"
11595 "uaddlp v11.4s, v11.8h\n"
11596 "uaddlp v12.4s, v12.8h\n"
11597 "uaddlp v13.4s, v13.8h\n"
11598 "uaddlp v14.4s, v14.8h\n"
11599 "uaddlp v15.4s, v15.8h\n"
11600 "addp v8.4s, v8.4s, v9.4s\n"
11601 "addp v10.4s, v10.4s, v11.4s\n"
11602 "addp v12.4s, v12.4s, v13.4s\n"
11603 "addp v14.4s, v14.4s, v15.4s\n"
11604 "addp v8.4s, v8.4s, v10.4s\n"
11605 "addp v9.4s, v12.4s, v14.4s\n"
11606 "mul v8.4s, v8.4s, v0.s[0]\n"
11607 "mul v9.4s, v9.4s, v0.s[0]\n"
11608 "add v8.4s, v8.4s, v1.4s\n"
11609 "add v9.4s, v9.4s, v1.4s\n"
11610 "st1 {v8.4s, v9.4s}, [%x[out]]\n"
11611 : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
11612 [out] "+r"(out), [in] "+r"(in)
11613 : [additive_sum_offset] "r"(params.additive_sum_offset),
11614 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
11615 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
11616 "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20",
11617 "v21", "v22", "v23", "cc", "memory");
11618 }
11619
11620 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)11621 inline void Stream<uint8_t, 8, 8, 4, ColumnMajorWithSum>::Pack(
11622 const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
11623 #ifdef DEBUG
11624 #ifdef DEBUG_METAGEMM_VERBOSE
11625 std::cout
11626 << __FILE__ << "(" << __LINE__
11627 << ") ColumnMajorWithSum<uint8_t, 8, 8, 4, ColumnMajorWithSum>::Pack()"
11628 << std::endl
11629 << std::flush;
11630 #endif
11631 #endif
11632 int params_count_copy = params.count;
11633 int params_stride_copy = params.stride;
11634 asm volatile(
11635 "movi v8.8h, #0\n"
11636 "movi v9.8h, #0\n"
11637 "movi v10.8h, #0\n"
11638 "movi v11.8h, #0\n"
11639 "movi v12.8h, #0\n"
11640 "movi v13.8h, #0\n"
11641 "movi v14.8h, #0\n"
11642 "movi v15.8h, #0\n"
11643
11644 // Reduce count by leftovers.
11645 "subs %x[count], %x[count], #4\n"
11646 "beq 2f\n"
11647
11648 "1:"
11649 "subs %x[count], %x[count], #8\n"
11650
11651 // Load Aggregate Store - column major 8x8
11652 "ld1 {v0.2s}, [%x[in]], %x[stride]\n"
11653 "ld1 {v1.2s}, [%x[in]], %x[stride]\n"
11654 "ld1 {v2.2s}, [%x[in]], %x[stride]\n"
11655 "ld1 {v3.2s}, [%x[in]], %x[stride]\n"
11656 "ld1 {v4.2s}, [%x[in]], %x[stride]\n"
11657 "ld1 {v5.2s}, [%x[in]], %x[stride]\n"
11658 "ld1 {v6.2s}, [%x[in]], %x[stride]\n"
11659 "ld1 {v7.2s}, [%x[in]], %x[stride]\n"
11660 "prfm pldl1keep, [%x[in]]\n"
11661 "trn1 v16.8b, v0.8b, v1.8b\n"
11662 "trn2 v17.8b, v0.8b, v1.8b\n"
11663 "trn1 v18.8b, v2.8b, v3.8b\n"
11664 "trn2 v19.8b, v2.8b, v3.8b\n"
11665 "trn1 v20.8b, v4.8b, v5.8b\n"
11666 "trn2 v21.8b, v4.8b, v5.8b\n"
11667 "trn1 v22.8b, v6.8b, v7.8b\n"
11668 "trn2 v23.8b, v6.8b, v7.8b\n"
11669 "trn1 v0.4h, v16.4h, v18.4h\n"
11670 "trn2 v2.4h, v16.4h, v18.4h\n"
11671 "trn1 v1.4h, v17.4h, v19.4h\n"
11672 "trn2 v3.4h, v17.4h, v19.4h\n"
11673 "trn1 v4.4h, v20.4h, v22.4h\n"
11674 "trn2 v6.4h, v20.4h, v22.4h\n"
11675 "trn1 v5.4h, v21.4h, v23.4h\n"
11676 "trn2 v7.4h, v21.4h, v23.4h\n"
11677 "trn1 v16.2s, v0.2s, v4.2s\n"
11678 "trn2 v20.2s, v0.2s, v4.2s\n"
11679 "trn1 v17.2s, v1.2s, v5.2s\n"
11680 "trn2 v21.2s, v1.2s, v5.2s\n"
11681 "trn1 v18.2s, v2.2s, v6.2s\n"
11682 "trn2 v22.2s, v2.2s, v6.2s\n"
11683 "trn1 v19.2s, v3.2s, v7.2s\n"
11684 "trn2 v23.2s, v3.2s, v7.2s\n"
11685 "uaddw v8.8h, v8.8h, v16.8b\n"
11686 "uaddw v9.8h, v9.8h, v17.8b\n"
11687 "uaddw v10.8h, v10.8h, v18.8b\n"
11688 "uaddw v11.8h, v11.8h, v19.8b\n"
11689 "uaddw v12.8h, v12.8h, v20.8b\n"
11690 "uaddw v13.8h, v13.8h, v21.8b\n"
11691 "uaddw v14.8h, v14.8h, v22.8b\n"
11692 "uaddw v15.8h, v15.8h, v23.8b\n"
11693 "st1 {v16.2s, v17.2s, v18.2s, v19.2s}, [%x[out]], #32\n"
11694 "st1 {v20.2s, v21.2s, v22.2s, v23.2s}, [%x[out]], #32\n"
11695
11696 "bne 1b\n"
11697
11698 "2:"
11699
11700 // Load Aggregate Store - column major 8x4
11701 "movi v0.8b, #0\n"
11702 "movi v1.8b, #0\n"
11703 "movi v2.8b, #0\n"
11704 "movi v3.8b, #0\n"
11705 "movi v4.8b, #0\n"
11706 "movi v5.8b, #0\n"
11707 "movi v6.8b, #0\n"
11708 "movi v7.8b, #0\n"
11709 "ld1 {v0.2s}, [%x[in]], %x[stride]\n"
11710 "ld1 {v1.2s}, [%x[in]], %x[stride]\n"
11711 "ld1 {v2.2s}, [%x[in]], %x[stride]\n"
11712 "ld1 {v3.2s}, [%x[in]], %x[stride]\n"
11713 "prfm pldl1keep, [%x[in]]\n"
11714 "trn1 v16.8b, v0.8b, v1.8b\n"
11715 "trn2 v17.8b, v0.8b, v1.8b\n"
11716 "trn1 v18.8b, v2.8b, v3.8b\n"
11717 "trn2 v19.8b, v2.8b, v3.8b\n"
11718 "trn1 v20.8b, v4.8b, v5.8b\n"
11719 "trn2 v21.8b, v4.8b, v5.8b\n"
11720 "trn1 v22.8b, v6.8b, v7.8b\n"
11721 "trn2 v23.8b, v6.8b, v7.8b\n"
11722 "trn1 v0.4h, v16.4h, v18.4h\n"
11723 "trn2 v2.4h, v16.4h, v18.4h\n"
11724 "trn1 v1.4h, v17.4h, v19.4h\n"
11725 "trn2 v3.4h, v17.4h, v19.4h\n"
11726 "trn1 v4.4h, v20.4h, v22.4h\n"
11727 "trn2 v6.4h, v20.4h, v22.4h\n"
11728 "trn1 v5.4h, v21.4h, v23.4h\n"
11729 "trn2 v7.4h, v21.4h, v23.4h\n"
11730 "trn1 v16.2s, v0.2s, v4.2s\n"
11731 "trn2 v20.2s, v0.2s, v4.2s\n"
11732 "trn1 v17.2s, v1.2s, v5.2s\n"
11733 "trn2 v21.2s, v1.2s, v5.2s\n"
11734 "trn1 v18.2s, v2.2s, v6.2s\n"
11735 "trn2 v22.2s, v2.2s, v6.2s\n"
11736 "trn1 v19.2s, v3.2s, v7.2s\n"
11737 "trn2 v23.2s, v3.2s, v7.2s\n"
11738 "uaddw v8.8h, v8.8h, v16.8b\n"
11739 "uaddw v9.8h, v9.8h, v17.8b\n"
11740 "uaddw v10.8h, v10.8h, v18.8b\n"
11741 "uaddw v11.8h, v11.8h, v19.8b\n"
11742 "uaddw v12.8h, v12.8h, v20.8b\n"
11743 "uaddw v13.8h, v13.8h, v21.8b\n"
11744 "uaddw v14.8h, v14.8h, v22.8b\n"
11745 "uaddw v15.8h, v15.8h, v23.8b\n"
11746 "st1 {v16.2s, v17.2s, v18.2s, v19.2s}, [%x[out]], #32\n"
11747 "st1 {v20.2s, v21.2s, v22.2s, v23.2s}, [%x[out]], #32\n"
11748
11749 // Aggregator Reduction.
11750 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
11751 "dup v1.4s, %w[additive_sum_offset]\n"
11752 "uaddlp v8.4s, v8.8h\n"
11753 "uaddlp v9.4s, v9.8h\n"
11754 "uaddlp v10.4s, v10.8h\n"
11755 "uaddlp v11.4s, v11.8h\n"
11756 "uaddlp v12.4s, v12.8h\n"
11757 "uaddlp v13.4s, v13.8h\n"
11758 "uaddlp v14.4s, v14.8h\n"
11759 "uaddlp v15.4s, v15.8h\n"
11760 "addp v8.4s, v8.4s, v9.4s\n"
11761 "addp v10.4s, v10.4s, v11.4s\n"
11762 "addp v12.4s, v12.4s, v13.4s\n"
11763 "addp v14.4s, v14.4s, v15.4s\n"
11764 "addp v8.4s, v8.4s, v10.4s\n"
11765 "addp v9.4s, v12.4s, v14.4s\n"
11766 "mul v8.4s, v8.4s, v0.s[0]\n"
11767 "mul v9.4s, v9.4s, v0.s[0]\n"
11768 "add v8.4s, v8.4s, v1.4s\n"
11769 "add v9.4s, v9.4s, v1.4s\n"
11770 "st1 {v8.4s, v9.4s}, [%x[out]]\n"
11771 : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
11772 [out] "+r"(out), [in] "+r"(in)
11773 : [additive_sum_offset] "r"(params.additive_sum_offset),
11774 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
11775 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
11776 "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20",
11777 "v21", "v22", "v23", "cc", "memory");
11778 }
11779
11780 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)11781 inline void Stream<uint8_t, 8, 8, 5, ColumnMajorWithSum>::Pack(
11782 const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
11783 #ifdef DEBUG
11784 #ifdef DEBUG_METAGEMM_VERBOSE
11785 std::cout
11786 << __FILE__ << "(" << __LINE__
11787 << ") ColumnMajorWithSum<uint8_t, 8, 8, 5, ColumnMajorWithSum>::Pack()"
11788 << std::endl
11789 << std::flush;
11790 #endif
11791 #endif
11792 int params_count_copy = params.count;
11793 int params_stride_copy = params.stride;
11794 asm volatile(
11795 "movi v8.8h, #0\n"
11796 "movi v9.8h, #0\n"
11797 "movi v10.8h, #0\n"
11798 "movi v11.8h, #0\n"
11799 "movi v12.8h, #0\n"
11800 "movi v13.8h, #0\n"
11801 "movi v14.8h, #0\n"
11802 "movi v15.8h, #0\n"
11803
11804 // Reduce count by leftovers.
11805 "subs %x[count], %x[count], #5\n"
11806 "beq 2f\n"
11807
11808 "1:"
11809 "subs %x[count], %x[count], #8\n"
11810
11811 // Load Aggregate Store - column major 8x8
11812 "ld1 {v0.2s}, [%x[in]], %x[stride]\n"
11813 "ld1 {v1.2s}, [%x[in]], %x[stride]\n"
11814 "ld1 {v2.2s}, [%x[in]], %x[stride]\n"
11815 "ld1 {v3.2s}, [%x[in]], %x[stride]\n"
11816 "ld1 {v4.2s}, [%x[in]], %x[stride]\n"
11817 "ld1 {v5.2s}, [%x[in]], %x[stride]\n"
11818 "ld1 {v6.2s}, [%x[in]], %x[stride]\n"
11819 "ld1 {v7.2s}, [%x[in]], %x[stride]\n"
11820 "prfm pldl1keep, [%x[in]]\n"
11821 "trn1 v16.8b, v0.8b, v1.8b\n"
11822 "trn2 v17.8b, v0.8b, v1.8b\n"
11823 "trn1 v18.8b, v2.8b, v3.8b\n"
11824 "trn2 v19.8b, v2.8b, v3.8b\n"
11825 "trn1 v20.8b, v4.8b, v5.8b\n"
11826 "trn2 v21.8b, v4.8b, v5.8b\n"
11827 "trn1 v22.8b, v6.8b, v7.8b\n"
11828 "trn2 v23.8b, v6.8b, v7.8b\n"
11829 "trn1 v0.4h, v16.4h, v18.4h\n"
11830 "trn2 v2.4h, v16.4h, v18.4h\n"
11831 "trn1 v1.4h, v17.4h, v19.4h\n"
11832 "trn2 v3.4h, v17.4h, v19.4h\n"
11833 "trn1 v4.4h, v20.4h, v22.4h\n"
11834 "trn2 v6.4h, v20.4h, v22.4h\n"
11835 "trn1 v5.4h, v21.4h, v23.4h\n"
11836 "trn2 v7.4h, v21.4h, v23.4h\n"
11837 "trn1 v16.2s, v0.2s, v4.2s\n"
11838 "trn2 v20.2s, v0.2s, v4.2s\n"
11839 "trn1 v17.2s, v1.2s, v5.2s\n"
11840 "trn2 v21.2s, v1.2s, v5.2s\n"
11841 "trn1 v18.2s, v2.2s, v6.2s\n"
11842 "trn2 v22.2s, v2.2s, v6.2s\n"
11843 "trn1 v19.2s, v3.2s, v7.2s\n"
11844 "trn2 v23.2s, v3.2s, v7.2s\n"
11845 "uaddw v8.8h, v8.8h, v16.8b\n"
11846 "uaddw v9.8h, v9.8h, v17.8b\n"
11847 "uaddw v10.8h, v10.8h, v18.8b\n"
11848 "uaddw v11.8h, v11.8h, v19.8b\n"
11849 "uaddw v12.8h, v12.8h, v20.8b\n"
11850 "uaddw v13.8h, v13.8h, v21.8b\n"
11851 "uaddw v14.8h, v14.8h, v22.8b\n"
11852 "uaddw v15.8h, v15.8h, v23.8b\n"
11853 "st1 {v16.2s, v17.2s, v18.2s, v19.2s}, [%x[out]], #32\n"
11854 "st1 {v20.2s, v21.2s, v22.2s, v23.2s}, [%x[out]], #32\n"
11855
11856 "bne 1b\n"
11857
11858 "2:"
11859
11860 // Load Aggregate Store - column major 8x5
11861 "movi v0.8b, #0\n"
11862 "movi v1.8b, #0\n"
11863 "movi v2.8b, #0\n"
11864 "movi v3.8b, #0\n"
11865 "movi v4.8b, #0\n"
11866 "movi v5.8b, #0\n"
11867 "movi v6.8b, #0\n"
11868 "movi v7.8b, #0\n"
11869 "ld1 {v0.2s}, [%x[in]], %x[stride]\n"
11870 "ld1 {v1.2s}, [%x[in]], %x[stride]\n"
11871 "ld1 {v2.2s}, [%x[in]], %x[stride]\n"
11872 "ld1 {v3.2s}, [%x[in]], %x[stride]\n"
11873 "ld1 {v4.2s}, [%x[in]], %x[stride]\n"
11874 "prfm pldl1keep, [%x[in]]\n"
11875 "trn1 v16.8b, v0.8b, v1.8b\n"
11876 "trn2 v17.8b, v0.8b, v1.8b\n"
11877 "trn1 v18.8b, v2.8b, v3.8b\n"
11878 "trn2 v19.8b, v2.8b, v3.8b\n"
11879 "trn1 v20.8b, v4.8b, v5.8b\n"
11880 "trn2 v21.8b, v4.8b, v5.8b\n"
11881 "trn1 v22.8b, v6.8b, v7.8b\n"
11882 "trn2 v23.8b, v6.8b, v7.8b\n"
11883 "trn1 v0.4h, v16.4h, v18.4h\n"
11884 "trn2 v2.4h, v16.4h, v18.4h\n"
11885 "trn1 v1.4h, v17.4h, v19.4h\n"
11886 "trn2 v3.4h, v17.4h, v19.4h\n"
11887 "trn1 v4.4h, v20.4h, v22.4h\n"
11888 "trn2 v6.4h, v20.4h, v22.4h\n"
11889 "trn1 v5.4h, v21.4h, v23.4h\n"
11890 "trn2 v7.4h, v21.4h, v23.4h\n"
11891 "trn1 v16.2s, v0.2s, v4.2s\n"
11892 "trn2 v20.2s, v0.2s, v4.2s\n"
11893 "trn1 v17.2s, v1.2s, v5.2s\n"
11894 "trn2 v21.2s, v1.2s, v5.2s\n"
11895 "trn1 v18.2s, v2.2s, v6.2s\n"
11896 "trn2 v22.2s, v2.2s, v6.2s\n"
11897 "trn1 v19.2s, v3.2s, v7.2s\n"
11898 "trn2 v23.2s, v3.2s, v7.2s\n"
11899 "uaddw v8.8h, v8.8h, v16.8b\n"
11900 "uaddw v9.8h, v9.8h, v17.8b\n"
11901 "uaddw v10.8h, v10.8h, v18.8b\n"
11902 "uaddw v11.8h, v11.8h, v19.8b\n"
11903 "uaddw v12.8h, v12.8h, v20.8b\n"
11904 "uaddw v13.8h, v13.8h, v21.8b\n"
11905 "uaddw v14.8h, v14.8h, v22.8b\n"
11906 "uaddw v15.8h, v15.8h, v23.8b\n"
11907 "st1 {v16.2s, v17.2s, v18.2s, v19.2s}, [%x[out]], #32\n"
11908 "st1 {v20.2s, v21.2s, v22.2s, v23.2s}, [%x[out]], #32\n"
11909
11910 // Aggregator Reduction.
11911 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
11912 "dup v1.4s, %w[additive_sum_offset]\n"
11913 "uaddlp v8.4s, v8.8h\n"
11914 "uaddlp v9.4s, v9.8h\n"
11915 "uaddlp v10.4s, v10.8h\n"
11916 "uaddlp v11.4s, v11.8h\n"
11917 "uaddlp v12.4s, v12.8h\n"
11918 "uaddlp v13.4s, v13.8h\n"
11919 "uaddlp v14.4s, v14.8h\n"
11920 "uaddlp v15.4s, v15.8h\n"
11921 "addp v8.4s, v8.4s, v9.4s\n"
11922 "addp v10.4s, v10.4s, v11.4s\n"
11923 "addp v12.4s, v12.4s, v13.4s\n"
11924 "addp v14.4s, v14.4s, v15.4s\n"
11925 "addp v8.4s, v8.4s, v10.4s\n"
11926 "addp v9.4s, v12.4s, v14.4s\n"
11927 "mul v8.4s, v8.4s, v0.s[0]\n"
11928 "mul v9.4s, v9.4s, v0.s[0]\n"
11929 "add v8.4s, v8.4s, v1.4s\n"
11930 "add v9.4s, v9.4s, v1.4s\n"
11931 "st1 {v8.4s, v9.4s}, [%x[out]]\n"
11932 : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
11933 [out] "+r"(out), [in] "+r"(in)
11934 : [additive_sum_offset] "r"(params.additive_sum_offset),
11935 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
11936 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
11937 "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20",
11938 "v21", "v22", "v23", "cc", "memory");
11939 }
11940
11941 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)11942 inline void Stream<uint8_t, 8, 8, 6, ColumnMajorWithSum>::Pack(
11943 const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
11944 #ifdef DEBUG
11945 #ifdef DEBUG_METAGEMM_VERBOSE
11946 std::cout
11947 << __FILE__ << "(" << __LINE__
11948 << ") ColumnMajorWithSum<uint8_t, 8, 8, 6, ColumnMajorWithSum>::Pack()"
11949 << std::endl
11950 << std::flush;
11951 #endif
11952 #endif
11953 int params_count_copy = params.count;
11954 int params_stride_copy = params.stride;
11955 asm volatile(
11956 "movi v8.8h, #0\n"
11957 "movi v9.8h, #0\n"
11958 "movi v10.8h, #0\n"
11959 "movi v11.8h, #0\n"
11960 "movi v12.8h, #0\n"
11961 "movi v13.8h, #0\n"
11962 "movi v14.8h, #0\n"
11963 "movi v15.8h, #0\n"
11964
11965 // Reduce count by leftovers.
11966 "subs %x[count], %x[count], #6\n"
11967 "beq 2f\n"
11968
11969 "1:"
11970 "subs %x[count], %x[count], #8\n"
11971
11972 // Load Aggregate Store - column major 8x8
11973 "ld1 {v0.2s}, [%x[in]], %x[stride]\n"
11974 "ld1 {v1.2s}, [%x[in]], %x[stride]\n"
11975 "ld1 {v2.2s}, [%x[in]], %x[stride]\n"
11976 "ld1 {v3.2s}, [%x[in]], %x[stride]\n"
11977 "ld1 {v4.2s}, [%x[in]], %x[stride]\n"
11978 "ld1 {v5.2s}, [%x[in]], %x[stride]\n"
11979 "ld1 {v6.2s}, [%x[in]], %x[stride]\n"
11980 "ld1 {v7.2s}, [%x[in]], %x[stride]\n"
11981 "prfm pldl1keep, [%x[in]]\n"
11982 "trn1 v16.8b, v0.8b, v1.8b\n"
11983 "trn2 v17.8b, v0.8b, v1.8b\n"
11984 "trn1 v18.8b, v2.8b, v3.8b\n"
11985 "trn2 v19.8b, v2.8b, v3.8b\n"
11986 "trn1 v20.8b, v4.8b, v5.8b\n"
11987 "trn2 v21.8b, v4.8b, v5.8b\n"
11988 "trn1 v22.8b, v6.8b, v7.8b\n"
11989 "trn2 v23.8b, v6.8b, v7.8b\n"
11990 "trn1 v0.4h, v16.4h, v18.4h\n"
11991 "trn2 v2.4h, v16.4h, v18.4h\n"
11992 "trn1 v1.4h, v17.4h, v19.4h\n"
11993 "trn2 v3.4h, v17.4h, v19.4h\n"
11994 "trn1 v4.4h, v20.4h, v22.4h\n"
11995 "trn2 v6.4h, v20.4h, v22.4h\n"
11996 "trn1 v5.4h, v21.4h, v23.4h\n"
11997 "trn2 v7.4h, v21.4h, v23.4h\n"
11998 "trn1 v16.2s, v0.2s, v4.2s\n"
11999 "trn2 v20.2s, v0.2s, v4.2s\n"
12000 "trn1 v17.2s, v1.2s, v5.2s\n"
12001 "trn2 v21.2s, v1.2s, v5.2s\n"
12002 "trn1 v18.2s, v2.2s, v6.2s\n"
12003 "trn2 v22.2s, v2.2s, v6.2s\n"
12004 "trn1 v19.2s, v3.2s, v7.2s\n"
12005 "trn2 v23.2s, v3.2s, v7.2s\n"
12006 "uaddw v8.8h, v8.8h, v16.8b\n"
12007 "uaddw v9.8h, v9.8h, v17.8b\n"
12008 "uaddw v10.8h, v10.8h, v18.8b\n"
12009 "uaddw v11.8h, v11.8h, v19.8b\n"
12010 "uaddw v12.8h, v12.8h, v20.8b\n"
12011 "uaddw v13.8h, v13.8h, v21.8b\n"
12012 "uaddw v14.8h, v14.8h, v22.8b\n"
12013 "uaddw v15.8h, v15.8h, v23.8b\n"
12014 "st1 {v16.2s, v17.2s, v18.2s, v19.2s}, [%x[out]], #32\n"
12015 "st1 {v20.2s, v21.2s, v22.2s, v23.2s}, [%x[out]], #32\n"
12016
12017 "bne 1b\n"
12018
12019 "2:"
12020
12021 // Load Aggregate Store - column major 8x6
12022 "movi v0.8b, #0\n"
12023 "movi v1.8b, #0\n"
12024 "movi v2.8b, #0\n"
12025 "movi v3.8b, #0\n"
12026 "movi v4.8b, #0\n"
12027 "movi v5.8b, #0\n"
12028 "movi v6.8b, #0\n"
12029 "movi v7.8b, #0\n"
12030 "ld1 {v0.2s}, [%x[in]], %x[stride]\n"
12031 "ld1 {v1.2s}, [%x[in]], %x[stride]\n"
12032 "ld1 {v2.2s}, [%x[in]], %x[stride]\n"
12033 "ld1 {v3.2s}, [%x[in]], %x[stride]\n"
12034 "ld1 {v4.2s}, [%x[in]], %x[stride]\n"
12035 "ld1 {v5.2s}, [%x[in]], %x[stride]\n"
12036 "prfm pldl1keep, [%x[in]]\n"
12037 "trn1 v16.8b, v0.8b, v1.8b\n"
12038 "trn2 v17.8b, v0.8b, v1.8b\n"
12039 "trn1 v18.8b, v2.8b, v3.8b\n"
12040 "trn2 v19.8b, v2.8b, v3.8b\n"
12041 "trn1 v20.8b, v4.8b, v5.8b\n"
12042 "trn2 v21.8b, v4.8b, v5.8b\n"
12043 "trn1 v22.8b, v6.8b, v7.8b\n"
12044 "trn2 v23.8b, v6.8b, v7.8b\n"
12045 "trn1 v0.4h, v16.4h, v18.4h\n"
12046 "trn2 v2.4h, v16.4h, v18.4h\n"
12047 "trn1 v1.4h, v17.4h, v19.4h\n"
12048 "trn2 v3.4h, v17.4h, v19.4h\n"
12049 "trn1 v4.4h, v20.4h, v22.4h\n"
12050 "trn2 v6.4h, v20.4h, v22.4h\n"
12051 "trn1 v5.4h, v21.4h, v23.4h\n"
12052 "trn2 v7.4h, v21.4h, v23.4h\n"
12053 "trn1 v16.2s, v0.2s, v4.2s\n"
12054 "trn2 v20.2s, v0.2s, v4.2s\n"
12055 "trn1 v17.2s, v1.2s, v5.2s\n"
12056 "trn2 v21.2s, v1.2s, v5.2s\n"
12057 "trn1 v18.2s, v2.2s, v6.2s\n"
12058 "trn2 v22.2s, v2.2s, v6.2s\n"
12059 "trn1 v19.2s, v3.2s, v7.2s\n"
12060 "trn2 v23.2s, v3.2s, v7.2s\n"
12061 "uaddw v8.8h, v8.8h, v16.8b\n"
12062 "uaddw v9.8h, v9.8h, v17.8b\n"
12063 "uaddw v10.8h, v10.8h, v18.8b\n"
12064 "uaddw v11.8h, v11.8h, v19.8b\n"
12065 "uaddw v12.8h, v12.8h, v20.8b\n"
12066 "uaddw v13.8h, v13.8h, v21.8b\n"
12067 "uaddw v14.8h, v14.8h, v22.8b\n"
12068 "uaddw v15.8h, v15.8h, v23.8b\n"
12069 "st1 {v16.2s, v17.2s, v18.2s, v19.2s}, [%x[out]], #32\n"
12070 "st1 {v20.2s, v21.2s, v22.2s, v23.2s}, [%x[out]], #32\n"
12071
12072 // Aggregator Reduction.
12073 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
12074 "dup v1.4s, %w[additive_sum_offset]\n"
12075 "uaddlp v8.4s, v8.8h\n"
12076 "uaddlp v9.4s, v9.8h\n"
12077 "uaddlp v10.4s, v10.8h\n"
12078 "uaddlp v11.4s, v11.8h\n"
12079 "uaddlp v12.4s, v12.8h\n"
12080 "uaddlp v13.4s, v13.8h\n"
12081 "uaddlp v14.4s, v14.8h\n"
12082 "uaddlp v15.4s, v15.8h\n"
12083 "addp v8.4s, v8.4s, v9.4s\n"
12084 "addp v10.4s, v10.4s, v11.4s\n"
12085 "addp v12.4s, v12.4s, v13.4s\n"
12086 "addp v14.4s, v14.4s, v15.4s\n"
12087 "addp v8.4s, v8.4s, v10.4s\n"
12088 "addp v9.4s, v12.4s, v14.4s\n"
12089 "mul v8.4s, v8.4s, v0.s[0]\n"
12090 "mul v9.4s, v9.4s, v0.s[0]\n"
12091 "add v8.4s, v8.4s, v1.4s\n"
12092 "add v9.4s, v9.4s, v1.4s\n"
12093 "st1 {v8.4s, v9.4s}, [%x[out]]\n"
12094 : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
12095 [out] "+r"(out), [in] "+r"(in)
12096 : [additive_sum_offset] "r"(params.additive_sum_offset),
12097 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
12098 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
12099 "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20",
12100 "v21", "v22", "v23", "cc", "memory");
12101 }
12102
12103 template <>
Pack(const uint8_t * in,const ColumnMajorWithSum & params,uint8_t * out)12104 inline void Stream<uint8_t, 8, 8, 7, ColumnMajorWithSum>::Pack(
12105 const uint8_t* in, const ColumnMajorWithSum& params, uint8_t* out) {
12106 #ifdef DEBUG
12107 #ifdef DEBUG_METAGEMM_VERBOSE
12108 std::cout
12109 << __FILE__ << "(" << __LINE__
12110 << ") ColumnMajorWithSum<uint8_t, 8, 8, 7, ColumnMajorWithSum>::Pack()"
12111 << std::endl
12112 << std::flush;
12113 #endif
12114 #endif
12115 int params_count_copy = params.count;
12116 int params_stride_copy = params.stride;
12117 asm volatile(
12118 "movi v8.8h, #0\n"
12119 "movi v9.8h, #0\n"
12120 "movi v10.8h, #0\n"
12121 "movi v11.8h, #0\n"
12122 "movi v12.8h, #0\n"
12123 "movi v13.8h, #0\n"
12124 "movi v14.8h, #0\n"
12125 "movi v15.8h, #0\n"
12126
12127 // Reduce count by leftovers.
12128 "subs %x[count], %x[count], #7\n"
12129 "beq 2f\n"
12130
12131 "1:"
12132 "subs %x[count], %x[count], #8\n"
12133
12134 // Load Aggregate Store - column major 8x8
12135 "ld1 {v0.2s}, [%x[in]], %x[stride]\n"
12136 "ld1 {v1.2s}, [%x[in]], %x[stride]\n"
12137 "ld1 {v2.2s}, [%x[in]], %x[stride]\n"
12138 "ld1 {v3.2s}, [%x[in]], %x[stride]\n"
12139 "ld1 {v4.2s}, [%x[in]], %x[stride]\n"
12140 "ld1 {v5.2s}, [%x[in]], %x[stride]\n"
12141 "ld1 {v6.2s}, [%x[in]], %x[stride]\n"
12142 "ld1 {v7.2s}, [%x[in]], %x[stride]\n"
12143 "prfm pldl1keep, [%x[in]]\n"
12144 "trn1 v16.8b, v0.8b, v1.8b\n"
12145 "trn2 v17.8b, v0.8b, v1.8b\n"
12146 "trn1 v18.8b, v2.8b, v3.8b\n"
12147 "trn2 v19.8b, v2.8b, v3.8b\n"
12148 "trn1 v20.8b, v4.8b, v5.8b\n"
12149 "trn2 v21.8b, v4.8b, v5.8b\n"
12150 "trn1 v22.8b, v6.8b, v7.8b\n"
12151 "trn2 v23.8b, v6.8b, v7.8b\n"
12152 "trn1 v0.4h, v16.4h, v18.4h\n"
12153 "trn2 v2.4h, v16.4h, v18.4h\n"
12154 "trn1 v1.4h, v17.4h, v19.4h\n"
12155 "trn2 v3.4h, v17.4h, v19.4h\n"
12156 "trn1 v4.4h, v20.4h, v22.4h\n"
12157 "trn2 v6.4h, v20.4h, v22.4h\n"
12158 "trn1 v5.4h, v21.4h, v23.4h\n"
12159 "trn2 v7.4h, v21.4h, v23.4h\n"
12160 "trn1 v16.2s, v0.2s, v4.2s\n"
12161 "trn2 v20.2s, v0.2s, v4.2s\n"
12162 "trn1 v17.2s, v1.2s, v5.2s\n"
12163 "trn2 v21.2s, v1.2s, v5.2s\n"
12164 "trn1 v18.2s, v2.2s, v6.2s\n"
12165 "trn2 v22.2s, v2.2s, v6.2s\n"
12166 "trn1 v19.2s, v3.2s, v7.2s\n"
12167 "trn2 v23.2s, v3.2s, v7.2s\n"
12168 "uaddw v8.8h, v8.8h, v16.8b\n"
12169 "uaddw v9.8h, v9.8h, v17.8b\n"
12170 "uaddw v10.8h, v10.8h, v18.8b\n"
12171 "uaddw v11.8h, v11.8h, v19.8b\n"
12172 "uaddw v12.8h, v12.8h, v20.8b\n"
12173 "uaddw v13.8h, v13.8h, v21.8b\n"
12174 "uaddw v14.8h, v14.8h, v22.8b\n"
12175 "uaddw v15.8h, v15.8h, v23.8b\n"
12176 "st1 {v16.2s, v17.2s, v18.2s, v19.2s}, [%x[out]], #32\n"
12177 "st1 {v20.2s, v21.2s, v22.2s, v23.2s}, [%x[out]], #32\n"
12178
12179 "bne 1b\n"
12180
12181 "2:"
12182
12183 // Load Aggregate Store - column major 8x7
12184 "movi v0.8b, #0\n"
12185 "movi v1.8b, #0\n"
12186 "movi v2.8b, #0\n"
12187 "movi v3.8b, #0\n"
12188 "movi v4.8b, #0\n"
12189 "movi v5.8b, #0\n"
12190 "movi v6.8b, #0\n"
12191 "movi v7.8b, #0\n"
12192 "ld1 {v0.2s}, [%x[in]], %x[stride]\n"
12193 "ld1 {v1.2s}, [%x[in]], %x[stride]\n"
12194 "ld1 {v2.2s}, [%x[in]], %x[stride]\n"
12195 "ld1 {v3.2s}, [%x[in]], %x[stride]\n"
12196 "ld1 {v4.2s}, [%x[in]], %x[stride]\n"
12197 "ld1 {v5.2s}, [%x[in]], %x[stride]\n"
12198 "ld1 {v6.2s}, [%x[in]], %x[stride]\n"
12199 "prfm pldl1keep, [%x[in]]\n"
12200 "trn1 v16.8b, v0.8b, v1.8b\n"
12201 "trn2 v17.8b, v0.8b, v1.8b\n"
12202 "trn1 v18.8b, v2.8b, v3.8b\n"
12203 "trn2 v19.8b, v2.8b, v3.8b\n"
12204 "trn1 v20.8b, v4.8b, v5.8b\n"
12205 "trn2 v21.8b, v4.8b, v5.8b\n"
12206 "trn1 v22.8b, v6.8b, v7.8b\n"
12207 "trn2 v23.8b, v6.8b, v7.8b\n"
12208 "trn1 v0.4h, v16.4h, v18.4h\n"
12209 "trn2 v2.4h, v16.4h, v18.4h\n"
12210 "trn1 v1.4h, v17.4h, v19.4h\n"
12211 "trn2 v3.4h, v17.4h, v19.4h\n"
12212 "trn1 v4.4h, v20.4h, v22.4h\n"
12213 "trn2 v6.4h, v20.4h, v22.4h\n"
12214 "trn1 v5.4h, v21.4h, v23.4h\n"
12215 "trn2 v7.4h, v21.4h, v23.4h\n"
12216 "trn1 v16.2s, v0.2s, v4.2s\n"
12217 "trn2 v20.2s, v0.2s, v4.2s\n"
12218 "trn1 v17.2s, v1.2s, v5.2s\n"
12219 "trn2 v21.2s, v1.2s, v5.2s\n"
12220 "trn1 v18.2s, v2.2s, v6.2s\n"
12221 "trn2 v22.2s, v2.2s, v6.2s\n"
12222 "trn1 v19.2s, v3.2s, v7.2s\n"
12223 "trn2 v23.2s, v3.2s, v7.2s\n"
12224 "uaddw v8.8h, v8.8h, v16.8b\n"
12225 "uaddw v9.8h, v9.8h, v17.8b\n"
12226 "uaddw v10.8h, v10.8h, v18.8b\n"
12227 "uaddw v11.8h, v11.8h, v19.8b\n"
12228 "uaddw v12.8h, v12.8h, v20.8b\n"
12229 "uaddw v13.8h, v13.8h, v21.8b\n"
12230 "uaddw v14.8h, v14.8h, v22.8b\n"
12231 "uaddw v15.8h, v15.8h, v23.8b\n"
12232 "st1 {v16.2s, v17.2s, v18.2s, v19.2s}, [%x[out]], #32\n"
12233 "st1 {v20.2s, v21.2s, v22.2s, v23.2s}, [%x[out]], #32\n"
12234
12235 // Aggregator Reduction.
12236 "mov v0.s[0], %w[multiplicative_sum_offset]\n"
12237 "dup v1.4s, %w[additive_sum_offset]\n"
12238 "uaddlp v8.4s, v8.8h\n"
12239 "uaddlp v9.4s, v9.8h\n"
12240 "uaddlp v10.4s, v10.8h\n"
12241 "uaddlp v11.4s, v11.8h\n"
12242 "uaddlp v12.4s, v12.8h\n"
12243 "uaddlp v13.4s, v13.8h\n"
12244 "uaddlp v14.4s, v14.8h\n"
12245 "uaddlp v15.4s, v15.8h\n"
12246 "addp v8.4s, v8.4s, v9.4s\n"
12247 "addp v10.4s, v10.4s, v11.4s\n"
12248 "addp v12.4s, v12.4s, v13.4s\n"
12249 "addp v14.4s, v14.4s, v15.4s\n"
12250 "addp v8.4s, v8.4s, v10.4s\n"
12251 "addp v9.4s, v12.4s, v14.4s\n"
12252 "mul v8.4s, v8.4s, v0.s[0]\n"
12253 "mul v9.4s, v9.4s, v0.s[0]\n"
12254 "add v8.4s, v8.4s, v1.4s\n"
12255 "add v9.4s, v9.4s, v1.4s\n"
12256 "st1 {v8.4s, v9.4s}, [%x[out]]\n"
12257 : [count] "+r"(params_count_copy), [stride] "+r"(params_stride_copy),
12258 [out] "+r"(out), [in] "+r"(in)
12259 : [additive_sum_offset] "r"(params.additive_sum_offset),
12260 [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset)
12261 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
12262 "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20",
12263 "v21", "v22", "v23", "cc", "memory");
12264 }
12265
12266 } // namespace meta
12267 } // namespace gemmlowp
12268
12269 #else
12270 #warning "Meta gemm for arm64 requires: GEMMLOWP_NEON_64!"
12271 #endif
12272
12273 #endif // GEMMLOWP_META_STREAMS_ARM_64_H_
12274