• Home
  • Raw
  • Download

Lines Matching full:n

39       "vmov.i16 q8, #0\n"  in Pack()
42 "subs %[count], %[count], #8\n" in Pack()
45 "vld1.32 {d0}, [%[in]]!\n" in Pack()
46 "vaddw.u8 q8, q8, d0\n" in Pack()
47 "vst1.32 {d0}, [%[out]:64]!\n" in Pack()
49 "bne 1b\n" in Pack()
52 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
53 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
54 "vpaddl.u16 q8, q8\n" in Pack()
55 "vpadd.u32 d16, d16, d17\n" in Pack()
56 "vpadd.u32 d16, d16, d16\n" in Pack()
57 "vmul.i32 q8, q8, d0[0]\n" in Pack()
58 "vadd.i32 q8, q8, q1\n" in Pack()
59 "vst1.32 {d16, d17}, [%[out]:64]\n" in Pack()
80 "vmov.i16 q8, #0\n" in Pack()
83 "subs %[count], %[count], #1\n" in Pack()
84 "beq 2f\n" in Pack()
87 "subs %[count], %[count], #8\n" in Pack()
90 "vld1.32 {d0}, [%[in]]!\n" in Pack()
91 "vaddw.u8 q8, q8, d0\n" in Pack()
92 "vst1.32 {d0}, [%[out]:64]!\n" in Pack()
94 "bne 1b\n" in Pack()
99 "vmov.i8 d0, #0\n" in Pack()
100 "vld1.8 {d0[0]}, [%[in]]!\n" in Pack()
101 "vaddw.u8 q8, q8, d0\n" in Pack()
102 "vst1.32 {d0}, [%[out]:64]!\n" in Pack()
105 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
106 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
107 "vpaddl.u16 q8, q8\n" in Pack()
108 "vpadd.u32 d16, d16, d17\n" in Pack()
109 "vpadd.u32 d16, d16, d16\n" in Pack()
110 "vmul.i32 q8, q8, d0[0]\n" in Pack()
111 "vadd.i32 q8, q8, q1\n" in Pack()
112 "vst1.32 {d16, d17}, [%[out]:64]\n" in Pack()
133 "vmov.i16 q8, #0\n" in Pack()
136 "subs %[count], %[count], #2\n" in Pack()
137 "beq 2f\n" in Pack()
140 "subs %[count], %[count], #8\n" in Pack()
143 "vld1.32 {d0}, [%[in]]!\n" in Pack()
144 "vaddw.u8 q8, q8, d0\n" in Pack()
145 "vst1.32 {d0}, [%[out]:64]!\n" in Pack()
147 "bne 1b\n" in Pack()
152 "vmov.i8 d0, #0\n" in Pack()
153 "vld1.16 {d0[0]}, [%[in]]!\n" in Pack()
154 "vaddw.u8 q8, q8, d0\n" in Pack()
155 "vst1.32 {d0}, [%[out]:64]!\n" in Pack()
158 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
159 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
160 "vpaddl.u16 q8, q8\n" in Pack()
161 "vpadd.u32 d16, d16, d17\n" in Pack()
162 "vpadd.u32 d16, d16, d16\n" in Pack()
163 "vmul.i32 q8, q8, d0[0]\n" in Pack()
164 "vadd.i32 q8, q8, q1\n" in Pack()
165 "vst1.32 {d16, d17}, [%[out]:64]\n" in Pack()
186 "vmov.i16 q8, #0\n" in Pack()
189 "subs %[count], %[count], #3\n" in Pack()
190 "beq 2f\n" in Pack()
193 "subs %[count], %[count], #8\n" in Pack()
196 "vld1.32 {d0}, [%[in]]!\n" in Pack()
197 "vaddw.u8 q8, q8, d0\n" in Pack()
198 "vst1.32 {d0}, [%[out]:64]!\n" in Pack()
200 "bne 1b\n" in Pack()
205 "vmov.i8 d0, #0\n" in Pack()
206 "vld1.16 {d0[0]}, [%[in]]!\n" in Pack()
207 "vld1.8 {d0[2]}, [%[in]]!\n" in Pack()
208 "vaddw.u8 q8, q8, d0\n" in Pack()
209 "vst1.32 {d0}, [%[out]:64]!\n" in Pack()
212 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
213 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
214 "vpaddl.u16 q8, q8\n" in Pack()
215 "vpadd.u32 d16, d16, d17\n" in Pack()
216 "vpadd.u32 d16, d16, d16\n" in Pack()
217 "vmul.i32 q8, q8, d0[0]\n" in Pack()
218 "vadd.i32 q8, q8, q1\n" in Pack()
219 "vst1.32 {d16, d17}, [%[out]:64]\n" in Pack()
240 "vmov.i16 q8, #0\n" in Pack()
243 "subs %[count], %[count], #4\n" in Pack()
244 "beq 2f\n" in Pack()
247 "subs %[count], %[count], #8\n" in Pack()
250 "vld1.32 {d0}, [%[in]]!\n" in Pack()
251 "vaddw.u8 q8, q8, d0\n" in Pack()
252 "vst1.32 {d0}, [%[out]:64]!\n" in Pack()
254 "bne 1b\n" in Pack()
259 "vmov.i8 d0, #0\n" in Pack()
260 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
261 "vaddw.u8 q8, q8, d0\n" in Pack()
262 "vst1.32 {d0}, [%[out]:64]!\n" in Pack()
265 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
266 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
267 "vpaddl.u16 q8, q8\n" in Pack()
268 "vpadd.u32 d16, d16, d17\n" in Pack()
269 "vpadd.u32 d16, d16, d16\n" in Pack()
270 "vmul.i32 q8, q8, d0[0]\n" in Pack()
271 "vadd.i32 q8, q8, q1\n" in Pack()
272 "vst1.32 {d16, d17}, [%[out]:64]\n" in Pack()
293 "vmov.i16 q8, #0\n" in Pack()
296 "subs %[count], %[count], #5\n" in Pack()
297 "beq 2f\n" in Pack()
300 "subs %[count], %[count], #8\n" in Pack()
303 "vld1.32 {d0}, [%[in]]!\n" in Pack()
304 "vaddw.u8 q8, q8, d0\n" in Pack()
305 "vst1.32 {d0}, [%[out]:64]!\n" in Pack()
307 "bne 1b\n" in Pack()
312 "vmov.i8 d0, #0\n" in Pack()
313 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
314 "vld1.8 {d0[4]}, [%[in]]!\n" in Pack()
315 "vaddw.u8 q8, q8, d0\n" in Pack()
316 "vst1.32 {d0}, [%[out]:64]!\n" in Pack()
319 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
320 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
321 "vpaddl.u16 q8, q8\n" in Pack()
322 "vpadd.u32 d16, d16, d17\n" in Pack()
323 "vpadd.u32 d16, d16, d16\n" in Pack()
324 "vmul.i32 q8, q8, d0[0]\n" in Pack()
325 "vadd.i32 q8, q8, q1\n" in Pack()
326 "vst1.32 {d16, d17}, [%[out]:64]\n" in Pack()
347 "vmov.i16 q8, #0\n" in Pack()
350 "subs %[count], %[count], #6\n" in Pack()
351 "beq 2f\n" in Pack()
354 "subs %[count], %[count], #8\n" in Pack()
357 "vld1.32 {d0}, [%[in]]!\n" in Pack()
358 "vaddw.u8 q8, q8, d0\n" in Pack()
359 "vst1.32 {d0}, [%[out]:64]!\n" in Pack()
361 "bne 1b\n" in Pack()
366 "vmov.i8 d0, #0\n" in Pack()
367 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
368 "vld1.16 {d0[2]}, [%[in]]!\n" in Pack()
369 "vaddw.u8 q8, q8, d0\n" in Pack()
370 "vst1.32 {d0}, [%[out]:64]!\n" in Pack()
373 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
374 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
375 "vpaddl.u16 q8, q8\n" in Pack()
376 "vpadd.u32 d16, d16, d17\n" in Pack()
377 "vpadd.u32 d16, d16, d16\n" in Pack()
378 "vmul.i32 q8, q8, d0[0]\n" in Pack()
379 "vadd.i32 q8, q8, q1\n" in Pack()
380 "vst1.32 {d16, d17}, [%[out]:64]\n" in Pack()
401 "vmov.i16 q8, #0\n" in Pack()
404 "subs %[count], %[count], #7\n" in Pack()
405 "beq 2f\n" in Pack()
408 "subs %[count], %[count], #8\n" in Pack()
411 "vld1.32 {d0}, [%[in]]!\n" in Pack()
412 "vaddw.u8 q8, q8, d0\n" in Pack()
413 "vst1.32 {d0}, [%[out]:64]!\n" in Pack()
415 "bne 1b\n" in Pack()
420 "vmov.i8 d0, #0\n" in Pack()
421 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
422 "vld1.16 {d0[2]}, [%[in]]!\n" in Pack()
423 "vld1.8 {d0[6]}, [%[in]]!\n" in Pack()
424 "vaddw.u8 q8, q8, d0\n" in Pack()
425 "vst1.32 {d0}, [%[out]:64]!\n" in Pack()
428 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
429 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
430 "vpaddl.u16 q8, q8\n" in Pack()
431 "vpadd.u32 d16, d16, d17\n" in Pack()
432 "vpadd.u32 d16, d16, d16\n" in Pack()
433 "vmul.i32 q8, q8, d0[0]\n" in Pack()
434 "vadd.i32 q8, q8, q1\n" in Pack()
435 "vst1.32 {d16, d17}, [%[out]:64]\n" in Pack()
456 "add r0, %[in], %[stride]\n" in Pack()
457 "vmov.i16 q8, #0\n" in Pack()
458 "vmov.i16 q9, #0\n" in Pack()
461 "subs %[count], %[count], #8\n" in Pack()
464 "vld1.32 {d0}, [%[in]]!\n" in Pack()
465 "vld1.32 {d1}, [r0]!\n" in Pack()
466 "vaddw.u8 q8, q8, d0\n" in Pack()
467 "vaddw.u8 q9, q9, d1\n" in Pack()
468 "vst1.32 {d0, d1}, [%[out]:128]!\n" in Pack()
470 "bne 1b\n" in Pack()
473 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
474 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
475 "vpaddl.u16 q8, q8\n" in Pack()
476 "vpaddl.u16 q9, q9\n" in Pack()
477 "vpadd.u32 d16, d16, d17\n" in Pack()
478 "vpadd.u32 d18, d18, d19\n" in Pack()
479 "vpadd.u32 d16, d16, d18\n" in Pack()
480 "vmul.i32 q8, q8, d0[0]\n" in Pack()
481 "vadd.i32 q8, q8, q1\n" in Pack()
482 "vst1.32 {d16, d17}, [%[out]:128]\n" in Pack()
504 "add r0, %[in], %[stride]\n" in Pack()
505 "vmov.i16 q8, #0\n" in Pack()
506 "vmov.i16 q9, #0\n" in Pack()
509 "subs %[count], %[count], #1\n" in Pack()
510 "beq 2f\n" in Pack()
513 "subs %[count], %[count], #8\n" in Pack()
516 "vld1.32 {d0}, [%[in]]!\n" in Pack()
517 "vld1.32 {d1}, [r0]!\n" in Pack()
518 "vaddw.u8 q8, q8, d0\n" in Pack()
519 "vaddw.u8 q9, q9, d1\n" in Pack()
520 "vst1.32 {d0, d1}, [%[out]:128]!\n" in Pack()
522 "bne 1b\n" in Pack()
527 "vmov.i8 d0, #0\n" in Pack()
528 "vmov.i8 d1, #0\n" in Pack()
529 "vld1.8 {d0[0]}, [%[in]]!\n" in Pack()
530 "vld1.8 {d1[0]}, [r0]!\n" in Pack()
531 "vaddw.u8 q8, q8, d0\n" in Pack()
532 "vaddw.u8 q9, q9, d1\n" in Pack()
533 "vst1.32 {d0, d1}, [%[out]:128]!\n" in Pack()
536 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
537 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
538 "vpaddl.u16 q8, q8\n" in Pack()
539 "vpaddl.u16 q9, q9\n" in Pack()
540 "vpadd.u32 d16, d16, d17\n" in Pack()
541 "vpadd.u32 d18, d18, d19\n" in Pack()
542 "vpadd.u32 d16, d16, d18\n" in Pack()
543 "vmul.i32 q8, q8, d0[0]\n" in Pack()
544 "vadd.i32 q8, q8, q1\n" in Pack()
545 "vst1.32 {d16, d17}, [%[out]:128]\n" in Pack()
567 "add r0, %[in], %[stride]\n" in Pack()
568 "vmov.i16 q8, #0\n" in Pack()
569 "vmov.i16 q9, #0\n" in Pack()
572 "subs %[count], %[count], #2\n" in Pack()
573 "beq 2f\n" in Pack()
576 "subs %[count], %[count], #8\n" in Pack()
579 "vld1.32 {d0}, [%[in]]!\n" in Pack()
580 "vld1.32 {d1}, [r0]!\n" in Pack()
581 "vaddw.u8 q8, q8, d0\n" in Pack()
582 "vaddw.u8 q9, q9, d1\n" in Pack()
583 "vst1.32 {d0, d1}, [%[out]:128]!\n" in Pack()
585 "bne 1b\n" in Pack()
590 "vmov.i8 d0, #0\n" in Pack()
591 "vmov.i8 d1, #0\n" in Pack()
592 "vld1.16 {d0[0]}, [%[in]]!\n" in Pack()
593 "vld1.16 {d1[0]}, [r0]!\n" in Pack()
594 "vaddw.u8 q8, q8, d0\n" in Pack()
595 "vaddw.u8 q9, q9, d1\n" in Pack()
596 "vst1.32 {d0, d1}, [%[out]:128]!\n" in Pack()
599 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
600 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
601 "vpaddl.u16 q8, q8\n" in Pack()
602 "vpaddl.u16 q9, q9\n" in Pack()
603 "vpadd.u32 d16, d16, d17\n" in Pack()
604 "vpadd.u32 d18, d18, d19\n" in Pack()
605 "vpadd.u32 d16, d16, d18\n" in Pack()
606 "vmul.i32 q8, q8, d0[0]\n" in Pack()
607 "vadd.i32 q8, q8, q1\n" in Pack()
608 "vst1.32 {d16, d17}, [%[out]:128]\n" in Pack()
630 "add r0, %[in], %[stride]\n" in Pack()
631 "vmov.i16 q8, #0\n" in Pack()
632 "vmov.i16 q9, #0\n" in Pack()
635 "subs %[count], %[count], #3\n" in Pack()
636 "beq 2f\n" in Pack()
639 "subs %[count], %[count], #8\n" in Pack()
642 "vld1.32 {d0}, [%[in]]!\n" in Pack()
643 "vld1.32 {d1}, [r0]!\n" in Pack()
644 "vaddw.u8 q8, q8, d0\n" in Pack()
645 "vaddw.u8 q9, q9, d1\n" in Pack()
646 "vst1.32 {d0, d1}, [%[out]:128]!\n" in Pack()
648 "bne 1b\n" in Pack()
653 "vmov.i8 d0, #0\n" in Pack()
654 "vmov.i8 d1, #0\n" in Pack()
655 "vld1.16 {d0[0]}, [%[in]]!\n" in Pack()
656 "vld1.8 {d0[2]}, [%[in]]!\n" in Pack()
657 "vld1.16 {d1[0]}, [r0]!\n" in Pack()
658 "vld1.8 {d1[2]}, [r0]!\n" in Pack()
659 "vaddw.u8 q8, q8, d0\n" in Pack()
660 "vaddw.u8 q9, q9, d1\n" in Pack()
661 "vst1.32 {d0, d1}, [%[out]:128]!\n" in Pack()
664 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
665 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
666 "vpaddl.u16 q8, q8\n" in Pack()
667 "vpaddl.u16 q9, q9\n" in Pack()
668 "vpadd.u32 d16, d16, d17\n" in Pack()
669 "vpadd.u32 d18, d18, d19\n" in Pack()
670 "vpadd.u32 d16, d16, d18\n" in Pack()
671 "vmul.i32 q8, q8, d0[0]\n" in Pack()
672 "vadd.i32 q8, q8, q1\n" in Pack()
673 "vst1.32 {d16, d17}, [%[out]:128]\n" in Pack()
695 "add r0, %[in], %[stride]\n" in Pack()
696 "vmov.i16 q8, #0\n" in Pack()
697 "vmov.i16 q9, #0\n" in Pack()
700 "subs %[count], %[count], #4\n" in Pack()
701 "beq 2f\n" in Pack()
704 "subs %[count], %[count], #8\n" in Pack()
707 "vld1.32 {d0}, [%[in]]!\n" in Pack()
708 "vld1.32 {d1}, [r0]!\n" in Pack()
709 "vaddw.u8 q8, q8, d0\n" in Pack()
710 "vaddw.u8 q9, q9, d1\n" in Pack()
711 "vst1.32 {d0, d1}, [%[out]:128]!\n" in Pack()
713 "bne 1b\n" in Pack()
718 "vmov.i8 d0, #0\n" in Pack()
719 "vmov.i8 d1, #0\n" in Pack()
720 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
721 "vld1.32 {d1[0]}, [r0]!\n" in Pack()
722 "vaddw.u8 q8, q8, d0\n" in Pack()
723 "vaddw.u8 q9, q9, d1\n" in Pack()
724 "vst1.32 {d0, d1}, [%[out]:128]!\n" in Pack()
727 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
728 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
729 "vpaddl.u16 q8, q8\n" in Pack()
730 "vpaddl.u16 q9, q9\n" in Pack()
731 "vpadd.u32 d16, d16, d17\n" in Pack()
732 "vpadd.u32 d18, d18, d19\n" in Pack()
733 "vpadd.u32 d16, d16, d18\n" in Pack()
734 "vmul.i32 q8, q8, d0[0]\n" in Pack()
735 "vadd.i32 q8, q8, q1\n" in Pack()
736 "vst1.32 {d16, d17}, [%[out]:128]\n" in Pack()
758 "add r0, %[in], %[stride]\n" in Pack()
759 "vmov.i16 q8, #0\n" in Pack()
760 "vmov.i16 q9, #0\n" in Pack()
763 "subs %[count], %[count], #5\n" in Pack()
764 "beq 2f\n" in Pack()
767 "subs %[count], %[count], #8\n" in Pack()
770 "vld1.32 {d0}, [%[in]]!\n" in Pack()
771 "vld1.32 {d1}, [r0]!\n" in Pack()
772 "vaddw.u8 q8, q8, d0\n" in Pack()
773 "vaddw.u8 q9, q9, d1\n" in Pack()
774 "vst1.32 {d0, d1}, [%[out]:128]!\n" in Pack()
776 "bne 1b\n" in Pack()
781 "vmov.i8 d0, #0\n" in Pack()
782 "vmov.i8 d1, #0\n" in Pack()
783 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
784 "vld1.8 {d0[4]}, [%[in]]!\n" in Pack()
785 "vld1.32 {d1[0]}, [r0]!\n" in Pack()
786 "vld1.8 {d1[4]}, [r0]!\n" in Pack()
787 "vaddw.u8 q8, q8, d0\n" in Pack()
788 "vaddw.u8 q9, q9, d1\n" in Pack()
789 "vst1.32 {d0, d1}, [%[out]:128]!\n" in Pack()
792 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
793 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
794 "vpaddl.u16 q8, q8\n" in Pack()
795 "vpaddl.u16 q9, q9\n" in Pack()
796 "vpadd.u32 d16, d16, d17\n" in Pack()
797 "vpadd.u32 d18, d18, d19\n" in Pack()
798 "vpadd.u32 d16, d16, d18\n" in Pack()
799 "vmul.i32 q8, q8, d0[0]\n" in Pack()
800 "vadd.i32 q8, q8, q1\n" in Pack()
801 "vst1.32 {d16, d17}, [%[out]:128]\n" in Pack()
823 "add r0, %[in], %[stride]\n" in Pack()
824 "vmov.i16 q8, #0\n" in Pack()
825 "vmov.i16 q9, #0\n" in Pack()
828 "subs %[count], %[count], #6\n" in Pack()
829 "beq 2f\n" in Pack()
832 "subs %[count], %[count], #8\n" in Pack()
835 "vld1.32 {d0}, [%[in]]!\n" in Pack()
836 "vld1.32 {d1}, [r0]!\n" in Pack()
837 "vaddw.u8 q8, q8, d0\n" in Pack()
838 "vaddw.u8 q9, q9, d1\n" in Pack()
839 "vst1.32 {d0, d1}, [%[out]:128]!\n" in Pack()
841 "bne 1b\n" in Pack()
846 "vmov.i8 d0, #0\n" in Pack()
847 "vmov.i8 d1, #0\n" in Pack()
848 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
849 "vld1.16 {d0[2]}, [%[in]]!\n" in Pack()
850 "vld1.32 {d1[0]}, [r0]!\n" in Pack()
851 "vld1.16 {d1[2]}, [r0]!\n" in Pack()
852 "vaddw.u8 q8, q8, d0\n" in Pack()
853 "vaddw.u8 q9, q9, d1\n" in Pack()
854 "vst1.32 {d0, d1}, [%[out]:128]!\n" in Pack()
857 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
858 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
859 "vpaddl.u16 q8, q8\n" in Pack()
860 "vpaddl.u16 q9, q9\n" in Pack()
861 "vpadd.u32 d16, d16, d17\n" in Pack()
862 "vpadd.u32 d18, d18, d19\n" in Pack()
863 "vpadd.u32 d16, d16, d18\n" in Pack()
864 "vmul.i32 q8, q8, d0[0]\n" in Pack()
865 "vadd.i32 q8, q8, q1\n" in Pack()
866 "vst1.32 {d16, d17}, [%[out]:128]\n" in Pack()
888 "add r0, %[in], %[stride]\n" in Pack()
889 "vmov.i16 q8, #0\n" in Pack()
890 "vmov.i16 q9, #0\n" in Pack()
893 "subs %[count], %[count], #7\n" in Pack()
894 "beq 2f\n" in Pack()
897 "subs %[count], %[count], #8\n" in Pack()
900 "vld1.32 {d0}, [%[in]]!\n" in Pack()
901 "vld1.32 {d1}, [r0]!\n" in Pack()
902 "vaddw.u8 q8, q8, d0\n" in Pack()
903 "vaddw.u8 q9, q9, d1\n" in Pack()
904 "vst1.32 {d0, d1}, [%[out]:128]!\n" in Pack()
906 "bne 1b\n" in Pack()
911 "vmov.i8 d0, #0\n" in Pack()
912 "vmov.i8 d1, #0\n" in Pack()
913 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
914 "vld1.16 {d0[2]}, [%[in]]!\n" in Pack()
915 "vld1.8 {d0[6]}, [%[in]]!\n" in Pack()
916 "vld1.32 {d1[0]}, [r0]!\n" in Pack()
917 "vld1.16 {d1[2]}, [r0]!\n" in Pack()
918 "vld1.8 {d1[6]}, [r0]!\n" in Pack()
919 "vaddw.u8 q8, q8, d0\n" in Pack()
920 "vaddw.u8 q9, q9, d1\n" in Pack()
921 "vst1.32 {d0, d1}, [%[out]:128]!\n" in Pack()
924 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
925 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
926 "vpaddl.u16 q8, q8\n" in Pack()
927 "vpaddl.u16 q9, q9\n" in Pack()
928 "vpadd.u32 d16, d16, d17\n" in Pack()
929 "vpadd.u32 d18, d18, d19\n" in Pack()
930 "vpadd.u32 d16, d16, d18\n" in Pack()
931 "vmul.i32 q8, q8, d0[0]\n" in Pack()
932 "vadd.i32 q8, q8, q1\n" in Pack()
933 "vst1.32 {d16, d17}, [%[out]:128]\n" in Pack()
955 "add r0, %[in], %[stride]\n" in Pack()
956 "add r1, r0, %[stride]\n" in Pack()
957 "vmov.i16 q8, #0\n" in Pack()
958 "vmov.i16 q9, #0\n" in Pack()
959 "vmov.i16 q10, #0\n" in Pack()
962 "subs %[count], %[count], #8\n" in Pack()
965 "vld1.32 {d0}, [%[in]]!\n" in Pack()
966 "vld1.32 {d1}, [r0]!\n" in Pack()
967 "vld1.32 {d2}, [r1]!\n" in Pack()
968 "vaddw.u8 q8, q8, d0\n" in Pack()
969 "vaddw.u8 q9, q9, d1\n" in Pack()
970 "vaddw.u8 q10, q10, d2\n" in Pack()
971 "vst1.32 {d0, d1, d2}, [%[out]:64]!\n" in Pack()
973 "bne 1b\n" in Pack()
976 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
977 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
978 "vpaddl.u16 q8, q8\n" in Pack()
979 "vpaddl.u16 q9, q9\n" in Pack()
980 "vpaddl.u16 q10, q10\n" in Pack()
981 "vpadd.u32 d16, d16, d17\n" in Pack()
982 "vpadd.u32 d18, d18, d19\n" in Pack()
983 "vpadd.u32 d20, d20, d21\n" in Pack()
984 "vpadd.u32 d16, d16, d18\n" in Pack()
985 "vpadd.u32 d17, d20, d20\n" in Pack()
986 "vmul.i32 q8, q8, d0[0]\n" in Pack()
987 "vadd.i32 q8, q8, q1\n" in Pack()
988 "vst1.32 {d16, d17}, [%[out]:64]\n" in Pack()
1010 "add r0, %[in], %[stride]\n" in Pack()
1011 "add r1, r0, %[stride]\n" in Pack()
1012 "vmov.i16 q8, #0\n" in Pack()
1013 "vmov.i16 q9, #0\n" in Pack()
1014 "vmov.i16 q10, #0\n" in Pack()
1017 "subs %[count], %[count], #1\n" in Pack()
1018 "beq 2f\n" in Pack()
1021 "subs %[count], %[count], #8\n" in Pack()
1024 "vld1.32 {d0}, [%[in]]!\n" in Pack()
1025 "vld1.32 {d1}, [r0]!\n" in Pack()
1026 "vld1.32 {d2}, [r1]!\n" in Pack()
1027 "vaddw.u8 q8, q8, d0\n" in Pack()
1028 "vaddw.u8 q9, q9, d1\n" in Pack()
1029 "vaddw.u8 q10, q10, d2\n" in Pack()
1030 "vst1.32 {d0, d1, d2}, [%[out]:64]!\n" in Pack()
1032 "bne 1b\n" in Pack()
1037 "vmov.i8 d0, #0\n" in Pack()
1038 "vmov.i8 d1, #0\n" in Pack()
1039 "vmov.i8 d2, #0\n" in Pack()
1040 "vld1.8 {d0[0]}, [%[in]]!\n" in Pack()
1041 "vld1.8 {d1[0]}, [r0]!\n" in Pack()
1042 "vld1.8 {d2[0]}, [r1]!\n" in Pack()
1043 "vaddw.u8 q8, q8, d0\n" in Pack()
1044 "vaddw.u8 q9, q9, d1\n" in Pack()
1045 "vaddw.u8 q10, q10, d2\n" in Pack()
1046 "vst1.32 {d0, d1, d2}, [%[out]:64]!\n" in Pack()
1049 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
1050 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
1051 "vpaddl.u16 q8, q8\n" in Pack()
1052 "vpaddl.u16 q9, q9\n" in Pack()
1053 "vpaddl.u16 q10, q10\n" in Pack()
1054 "vpadd.u32 d16, d16, d17\n" in Pack()
1055 "vpadd.u32 d18, d18, d19\n" in Pack()
1056 "vpadd.u32 d20, d20, d21\n" in Pack()
1057 "vpadd.u32 d16, d16, d18\n" in Pack()
1058 "vpadd.u32 d17, d20, d20\n" in Pack()
1059 "vmul.i32 q8, q8, d0[0]\n" in Pack()
1060 "vadd.i32 q8, q8, q1\n" in Pack()
1061 "vst1.32 {d16, d17}, [%[out]:64]\n" in Pack()
1083 "add r0, %[in], %[stride]\n" in Pack()
1084 "add r1, r0, %[stride]\n" in Pack()
1085 "vmov.i16 q8, #0\n" in Pack()
1086 "vmov.i16 q9, #0\n" in Pack()
1087 "vmov.i16 q10, #0\n" in Pack()
1090 "subs %[count], %[count], #2\n" in Pack()
1091 "beq 2f\n" in Pack()
1094 "subs %[count], %[count], #8\n" in Pack()
1097 "vld1.32 {d0}, [%[in]]!\n" in Pack()
1098 "vld1.32 {d1}, [r0]!\n" in Pack()
1099 "vld1.32 {d2}, [r1]!\n" in Pack()
1100 "vaddw.u8 q8, q8, d0\n" in Pack()
1101 "vaddw.u8 q9, q9, d1\n" in Pack()
1102 "vaddw.u8 q10, q10, d2\n" in Pack()
1103 "vst1.32 {d0, d1, d2}, [%[out]:64]!\n" in Pack()
1105 "bne 1b\n" in Pack()
1110 "vmov.i8 d0, #0\n" in Pack()
1111 "vmov.i8 d1, #0\n" in Pack()
1112 "vmov.i8 d2, #0\n" in Pack()
1113 "vld1.16 {d0[0]}, [%[in]]!\n" in Pack()
1114 "vld1.16 {d1[0]}, [r0]!\n" in Pack()
1115 "vld1.16 {d2[0]}, [r1]!\n" in Pack()
1116 "vaddw.u8 q8, q8, d0\n" in Pack()
1117 "vaddw.u8 q9, q9, d1\n" in Pack()
1118 "vaddw.u8 q10, q10, d2\n" in Pack()
1119 "vst1.32 {d0, d1, d2}, [%[out]:64]!\n" in Pack()
1122 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
1123 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
1124 "vpaddl.u16 q8, q8\n" in Pack()
1125 "vpaddl.u16 q9, q9\n" in Pack()
1126 "vpaddl.u16 q10, q10\n" in Pack()
1127 "vpadd.u32 d16, d16, d17\n" in Pack()
1128 "vpadd.u32 d18, d18, d19\n" in Pack()
1129 "vpadd.u32 d20, d20, d21\n" in Pack()
1130 "vpadd.u32 d16, d16, d18\n" in Pack()
1131 "vpadd.u32 d17, d20, d20\n" in Pack()
1132 "vmul.i32 q8, q8, d0[0]\n" in Pack()
1133 "vadd.i32 q8, q8, q1\n" in Pack()
1134 "vst1.32 {d16, d17}, [%[out]:64]\n" in Pack()
1156 "add r0, %[in], %[stride]\n" in Pack()
1157 "add r1, r0, %[stride]\n" in Pack()
1158 "vmov.i16 q8, #0\n" in Pack()
1159 "vmov.i16 q9, #0\n" in Pack()
1160 "vmov.i16 q10, #0\n" in Pack()
1163 "subs %[count], %[count], #3\n" in Pack()
1164 "beq 2f\n" in Pack()
1167 "subs %[count], %[count], #8\n" in Pack()
1170 "vld1.32 {d0}, [%[in]]!\n" in Pack()
1171 "vld1.32 {d1}, [r0]!\n" in Pack()
1172 "vld1.32 {d2}, [r1]!\n" in Pack()
1173 "vaddw.u8 q8, q8, d0\n" in Pack()
1174 "vaddw.u8 q9, q9, d1\n" in Pack()
1175 "vaddw.u8 q10, q10, d2\n" in Pack()
1176 "vst1.32 {d0, d1, d2}, [%[out]:64]!\n" in Pack()
1178 "bne 1b\n" in Pack()
1183 "vmov.i8 d0, #0\n" in Pack()
1184 "vmov.i8 d1, #0\n" in Pack()
1185 "vmov.i8 d2, #0\n" in Pack()
1186 "vld1.16 {d0[0]}, [%[in]]!\n" in Pack()
1187 "vld1.8 {d0[2]}, [%[in]]!\n" in Pack()
1188 "vld1.16 {d1[0]}, [r0]!\n" in Pack()
1189 "vld1.8 {d1[2]}, [r0]!\n" in Pack()
1190 "vld1.16 {d2[0]}, [r1]!\n" in Pack()
1191 "vld1.8 {d2[2]}, [r1]!\n" in Pack()
1192 "vaddw.u8 q8, q8, d0\n" in Pack()
1193 "vaddw.u8 q9, q9, d1\n" in Pack()
1194 "vaddw.u8 q10, q10, d2\n" in Pack()
1195 "vst1.32 {d0, d1, d2}, [%[out]:64]!\n" in Pack()
1198 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
1199 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
1200 "vpaddl.u16 q8, q8\n" in Pack()
1201 "vpaddl.u16 q9, q9\n" in Pack()
1202 "vpaddl.u16 q10, q10\n" in Pack()
1203 "vpadd.u32 d16, d16, d17\n" in Pack()
1204 "vpadd.u32 d18, d18, d19\n" in Pack()
1205 "vpadd.u32 d20, d20, d21\n" in Pack()
1206 "vpadd.u32 d16, d16, d18\n" in Pack()
1207 "vpadd.u32 d17, d20, d20\n" in Pack()
1208 "vmul.i32 q8, q8, d0[0]\n" in Pack()
1209 "vadd.i32 q8, q8, q1\n" in Pack()
1210 "vst1.32 {d16, d17}, [%[out]:64]\n" in Pack()
1232 "add r0, %[in], %[stride]\n" in Pack()
1233 "add r1, r0, %[stride]\n" in Pack()
1234 "vmov.i16 q8, #0\n" in Pack()
1235 "vmov.i16 q9, #0\n" in Pack()
1236 "vmov.i16 q10, #0\n" in Pack()
1239 "subs %[count], %[count], #4\n" in Pack()
1240 "beq 2f\n" in Pack()
1243 "subs %[count], %[count], #8\n" in Pack()
1246 "vld1.32 {d0}, [%[in]]!\n" in Pack()
1247 "vld1.32 {d1}, [r0]!\n" in Pack()
1248 "vld1.32 {d2}, [r1]!\n" in Pack()
1249 "vaddw.u8 q8, q8, d0\n" in Pack()
1250 "vaddw.u8 q9, q9, d1\n" in Pack()
1251 "vaddw.u8 q10, q10, d2\n" in Pack()
1252 "vst1.32 {d0, d1, d2}, [%[out]:64]!\n" in Pack()
1254 "bne 1b\n" in Pack()
1259 "vmov.i8 d0, #0\n" in Pack()
1260 "vmov.i8 d1, #0\n" in Pack()
1261 "vmov.i8 d2, #0\n" in Pack()
1262 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
1263 "vld1.32 {d1[0]}, [r0]!\n" in Pack()
1264 "vld1.32 {d2[0]}, [r1]!\n" in Pack()
1265 "vaddw.u8 q8, q8, d0\n" in Pack()
1266 "vaddw.u8 q9, q9, d1\n" in Pack()
1267 "vaddw.u8 q10, q10, d2\n" in Pack()
1268 "vst1.32 {d0, d1, d2}, [%[out]:64]!\n" in Pack()
1271 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
1272 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
1273 "vpaddl.u16 q8, q8\n" in Pack()
1274 "vpaddl.u16 q9, q9\n" in Pack()
1275 "vpaddl.u16 q10, q10\n" in Pack()
1276 "vpadd.u32 d16, d16, d17\n" in Pack()
1277 "vpadd.u32 d18, d18, d19\n" in Pack()
1278 "vpadd.u32 d20, d20, d21\n" in Pack()
1279 "vpadd.u32 d16, d16, d18\n" in Pack()
1280 "vpadd.u32 d17, d20, d20\n" in Pack()
1281 "vmul.i32 q8, q8, d0[0]\n" in Pack()
1282 "vadd.i32 q8, q8, q1\n" in Pack()
1283 "vst1.32 {d16, d17}, [%[out]:64]\n" in Pack()
1305 "add r0, %[in], %[stride]\n" in Pack()
1306 "add r1, r0, %[stride]\n" in Pack()
1307 "vmov.i16 q8, #0\n" in Pack()
1308 "vmov.i16 q9, #0\n" in Pack()
1309 "vmov.i16 q10, #0\n" in Pack()
1312 "subs %[count], %[count], #5\n" in Pack()
1313 "beq 2f\n" in Pack()
1316 "subs %[count], %[count], #8\n" in Pack()
1319 "vld1.32 {d0}, [%[in]]!\n" in Pack()
1320 "vld1.32 {d1}, [r0]!\n" in Pack()
1321 "vld1.32 {d2}, [r1]!\n" in Pack()
1322 "vaddw.u8 q8, q8, d0\n" in Pack()
1323 "vaddw.u8 q9, q9, d1\n" in Pack()
1324 "vaddw.u8 q10, q10, d2\n" in Pack()
1325 "vst1.32 {d0, d1, d2}, [%[out]:64]!\n" in Pack()
1327 "bne 1b\n" in Pack()
1332 "vmov.i8 d0, #0\n" in Pack()
1333 "vmov.i8 d1, #0\n" in Pack()
1334 "vmov.i8 d2, #0\n" in Pack()
1335 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
1336 "vld1.8 {d0[4]}, [%[in]]!\n" in Pack()
1337 "vld1.32 {d1[0]}, [r0]!\n" in Pack()
1338 "vld1.8 {d1[4]}, [r0]!\n" in Pack()
1339 "vld1.32 {d2[0]}, [r1]!\n" in Pack()
1340 "vld1.8 {d2[4]}, [r1]!\n" in Pack()
1341 "vaddw.u8 q8, q8, d0\n" in Pack()
1342 "vaddw.u8 q9, q9, d1\n" in Pack()
1343 "vaddw.u8 q10, q10, d2\n" in Pack()
1344 "vst1.32 {d0, d1, d2}, [%[out]:64]!\n" in Pack()
1347 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
1348 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
1349 "vpaddl.u16 q8, q8\n" in Pack()
1350 "vpaddl.u16 q9, q9\n" in Pack()
1351 "vpaddl.u16 q10, q10\n" in Pack()
1352 "vpadd.u32 d16, d16, d17\n" in Pack()
1353 "vpadd.u32 d18, d18, d19\n" in Pack()
1354 "vpadd.u32 d20, d20, d21\n" in Pack()
1355 "vpadd.u32 d16, d16, d18\n" in Pack()
1356 "vpadd.u32 d17, d20, d20\n" in Pack()
1357 "vmul.i32 q8, q8, d0[0]\n" in Pack()
1358 "vadd.i32 q8, q8, q1\n" in Pack()
1359 "vst1.32 {d16, d17}, [%[out]:64]\n" in Pack()
1381 "add r0, %[in], %[stride]\n" in Pack()
1382 "add r1, r0, %[stride]\n" in Pack()
1383 "vmov.i16 q8, #0\n" in Pack()
1384 "vmov.i16 q9, #0\n" in Pack()
1385 "vmov.i16 q10, #0\n" in Pack()
1388 "subs %[count], %[count], #6\n" in Pack()
1389 "beq 2f\n" in Pack()
1392 "subs %[count], %[count], #8\n" in Pack()
1395 "vld1.32 {d0}, [%[in]]!\n" in Pack()
1396 "vld1.32 {d1}, [r0]!\n" in Pack()
1397 "vld1.32 {d2}, [r1]!\n" in Pack()
1398 "vaddw.u8 q8, q8, d0\n" in Pack()
1399 "vaddw.u8 q9, q9, d1\n" in Pack()
1400 "vaddw.u8 q10, q10, d2\n" in Pack()
1401 "vst1.32 {d0, d1, d2}, [%[out]:64]!\n" in Pack()
1403 "bne 1b\n" in Pack()
1408 "vmov.i8 d0, #0\n" in Pack()
1409 "vmov.i8 d1, #0\n" in Pack()
1410 "vmov.i8 d2, #0\n" in Pack()
1411 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
1412 "vld1.16 {d0[2]}, [%[in]]!\n" in Pack()
1413 "vld1.32 {d1[0]}, [r0]!\n" in Pack()
1414 "vld1.16 {d1[2]}, [r0]!\n" in Pack()
1415 "vld1.32 {d2[0]}, [r1]!\n" in Pack()
1416 "vld1.16 {d2[2]}, [r1]!\n" in Pack()
1417 "vaddw.u8 q8, q8, d0\n" in Pack()
1418 "vaddw.u8 q9, q9, d1\n" in Pack()
1419 "vaddw.u8 q10, q10, d2\n" in Pack()
1420 "vst1.32 {d0, d1, d2}, [%[out]:64]!\n" in Pack()
1423 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
1424 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
1425 "vpaddl.u16 q8, q8\n" in Pack()
1426 "vpaddl.u16 q9, q9\n" in Pack()
1427 "vpaddl.u16 q10, q10\n" in Pack()
1428 "vpadd.u32 d16, d16, d17\n" in Pack()
1429 "vpadd.u32 d18, d18, d19\n" in Pack()
1430 "vpadd.u32 d20, d20, d21\n" in Pack()
1431 "vpadd.u32 d16, d16, d18\n" in Pack()
1432 "vpadd.u32 d17, d20, d20\n" in Pack()
1433 "vmul.i32 q8, q8, d0[0]\n" in Pack()
1434 "vadd.i32 q8, q8, q1\n" in Pack()
1435 "vst1.32 {d16, d17}, [%[out]:64]\n" in Pack()
1457 "add r0, %[in], %[stride]\n" in Pack()
1458 "add r1, r0, %[stride]\n" in Pack()
1459 "vmov.i16 q8, #0\n" in Pack()
1460 "vmov.i16 q9, #0\n" in Pack()
1461 "vmov.i16 q10, #0\n" in Pack()
1464 "subs %[count], %[count], #7\n" in Pack()
1465 "beq 2f\n" in Pack()
1468 "subs %[count], %[count], #8\n" in Pack()
1471 "vld1.32 {d0}, [%[in]]!\n" in Pack()
1472 "vld1.32 {d1}, [r0]!\n" in Pack()
1473 "vld1.32 {d2}, [r1]!\n" in Pack()
1474 "vaddw.u8 q8, q8, d0\n" in Pack()
1475 "vaddw.u8 q9, q9, d1\n" in Pack()
1476 "vaddw.u8 q10, q10, d2\n" in Pack()
1477 "vst1.32 {d0, d1, d2}, [%[out]:64]!\n" in Pack()
1479 "bne 1b\n" in Pack()
1484 "vmov.i8 d0, #0\n" in Pack()
1485 "vmov.i8 d1, #0\n" in Pack()
1486 "vmov.i8 d2, #0\n" in Pack()
1487 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
1488 "vld1.16 {d0[2]}, [%[in]]!\n" in Pack()
1489 "vld1.8 {d0[6]}, [%[in]]!\n" in Pack()
1490 "vld1.32 {d1[0]}, [r0]!\n" in Pack()
1491 "vld1.16 {d1[2]}, [r0]!\n" in Pack()
1492 "vld1.8 {d1[6]}, [r0]!\n" in Pack()
1493 "vld1.32 {d2[0]}, [r1]!\n" in Pack()
1494 "vld1.16 {d2[2]}, [r1]!\n" in Pack()
1495 "vld1.8 {d2[6]}, [r1]!\n" in Pack()
1496 "vaddw.u8 q8, q8, d0\n" in Pack()
1497 "vaddw.u8 q9, q9, d1\n" in Pack()
1498 "vaddw.u8 q10, q10, d2\n" in Pack()
1499 "vst1.32 {d0, d1, d2}, [%[out]:64]!\n" in Pack()
1502 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
1503 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
1504 "vpaddl.u16 q8, q8\n" in Pack()
1505 "vpaddl.u16 q9, q9\n" in Pack()
1506 "vpaddl.u16 q10, q10\n" in Pack()
1507 "vpadd.u32 d16, d16, d17\n" in Pack()
1508 "vpadd.u32 d18, d18, d19\n" in Pack()
1509 "vpadd.u32 d20, d20, d21\n" in Pack()
1510 "vpadd.u32 d16, d16, d18\n" in Pack()
1511 "vpadd.u32 d17, d20, d20\n" in Pack()
1512 "vmul.i32 q8, q8, d0[0]\n" in Pack()
1513 "vadd.i32 q8, q8, q1\n" in Pack()
1514 "vst1.32 {d16, d17}, [%[out]:64]\n" in Pack()
1536 "add r0, %[in], %[stride]\n" in Pack()
1537 "add r1, r0, %[stride]\n" in Pack()
1538 "add r2, r1, %[stride]\n" in Pack()
1539 "vmov.i16 q8, #0\n" in Pack()
1540 "vmov.i16 q9, #0\n" in Pack()
1541 "vmov.i16 q10, #0\n" in Pack()
1542 "vmov.i16 q11, #0\n" in Pack()
1545 "subs %[count], %[count], #8\n" in Pack()
1548 "vld1.32 {d0}, [%[in]]!\n" in Pack()
1549 "vld1.32 {d1}, [r0]!\n" in Pack()
1550 "vld1.32 {d2}, [r1]!\n" in Pack()
1551 "vld1.32 {d3}, [r2]!\n" in Pack()
1552 "vaddw.u8 q8, q8, d0\n" in Pack()
1553 "vaddw.u8 q9, q9, d1\n" in Pack()
1554 "vaddw.u8 q10, q10, d2\n" in Pack()
1555 "vaddw.u8 q11, q11, d3\n" in Pack()
1556 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
1558 "bne 1b\n" in Pack()
1561 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
1562 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
1563 "vpaddl.u16 q8, q8\n" in Pack()
1564 "vpaddl.u16 q9, q9\n" in Pack()
1565 "vpaddl.u16 q10, q10\n" in Pack()
1566 "vpaddl.u16 q11, q11\n" in Pack()
1567 "vpadd.u32 d16, d16, d17\n" in Pack()
1568 "vpadd.u32 d18, d18, d19\n" in Pack()
1569 "vpadd.u32 d20, d20, d21\n" in Pack()
1570 "vpadd.u32 d22, d22, d23\n" in Pack()
1571 "vpadd.u32 d16, d16, d18\n" in Pack()
1572 "vpadd.u32 d17, d20, d22\n" in Pack()
1573 "vmul.i32 q8, q8, d0[0]\n" in Pack()
1574 "vadd.i32 q8, q8, q1\n" in Pack()
1575 "vst1.32 {d16, d17}, [%[out]:128]\n" in Pack()
1597 "add r0, %[in], %[stride]\n" in Pack()
1598 "add r1, r0, %[stride]\n" in Pack()
1599 "add r2, r1, %[stride]\n" in Pack()
1600 "vmov.i16 q8, #0\n" in Pack()
1601 "vmov.i16 q9, #0\n" in Pack()
1602 "vmov.i16 q10, #0\n" in Pack()
1603 "vmov.i16 q11, #0\n" in Pack()
1606 "subs %[count], %[count], #1\n" in Pack()
1607 "beq 2f\n" in Pack()
1610 "subs %[count], %[count], #8\n" in Pack()
1613 "vld1.32 {d0}, [%[in]]!\n" in Pack()
1614 "vld1.32 {d1}, [r0]!\n" in Pack()
1615 "vld1.32 {d2}, [r1]!\n" in Pack()
1616 "vld1.32 {d3}, [r2]!\n" in Pack()
1617 "vaddw.u8 q8, q8, d0\n" in Pack()
1618 "vaddw.u8 q9, q9, d1\n" in Pack()
1619 "vaddw.u8 q10, q10, d2\n" in Pack()
1620 "vaddw.u8 q11, q11, d3\n" in Pack()
1621 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
1623 "bne 1b\n" in Pack()
1628 "vmov.i8 d0, #0\n" in Pack()
1629 "vmov.i8 d1, #0\n" in Pack()
1630 "vmov.i8 d2, #0\n" in Pack()
1631 "vmov.i8 d3, #0\n" in Pack()
1632 "vld1.8 {d0[0]}, [%[in]]!\n" in Pack()
1633 "vld1.8 {d1[0]}, [r0]!\n" in Pack()
1634 "vld1.8 {d2[0]}, [r1]!\n" in Pack()
1635 "vld1.8 {d3[0]}, [r2]!\n" in Pack()
1636 "vaddw.u8 q8, q8, d0\n" in Pack()
1637 "vaddw.u8 q9, q9, d1\n" in Pack()
1638 "vaddw.u8 q10, q10, d2\n" in Pack()
1639 "vaddw.u8 q11, q11, d3\n" in Pack()
1640 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
1643 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
1644 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
1645 "vpaddl.u16 q8, q8\n" in Pack()
1646 "vpaddl.u16 q9, q9\n" in Pack()
1647 "vpaddl.u16 q10, q10\n" in Pack()
1648 "vpaddl.u16 q11, q11\n" in Pack()
1649 "vpadd.u32 d16, d16, d17\n" in Pack()
1650 "vpadd.u32 d18, d18, d19\n" in Pack()
1651 "vpadd.u32 d20, d20, d21\n" in Pack()
1652 "vpadd.u32 d22, d22, d23\n" in Pack()
1653 "vpadd.u32 d16, d16, d18\n" in Pack()
1654 "vpadd.u32 d17, d20, d22\n" in Pack()
1655 "vmul.i32 q8, q8, d0[0]\n" in Pack()
1656 "vadd.i32 q8, q8, q1\n" in Pack()
1657 "vst1.32 {d16, d17}, [%[out]:128]\n" in Pack()
1679 "add r0, %[in], %[stride]\n" in Pack()
1680 "add r1, r0, %[stride]\n" in Pack()
1681 "add r2, r1, %[stride]\n" in Pack()
1682 "vmov.i16 q8, #0\n" in Pack()
1683 "vmov.i16 q9, #0\n" in Pack()
1684 "vmov.i16 q10, #0\n" in Pack()
1685 "vmov.i16 q11, #0\n" in Pack()
1688 "subs %[count], %[count], #2\n" in Pack()
1689 "beq 2f\n" in Pack()
1692 "subs %[count], %[count], #8\n" in Pack()
1695 "vld1.32 {d0}, [%[in]]!\n" in Pack()
1696 "vld1.32 {d1}, [r0]!\n" in Pack()
1697 "vld1.32 {d2}, [r1]!\n" in Pack()
1698 "vld1.32 {d3}, [r2]!\n" in Pack()
1699 "vaddw.u8 q8, q8, d0\n" in Pack()
1700 "vaddw.u8 q9, q9, d1\n" in Pack()
1701 "vaddw.u8 q10, q10, d2\n" in Pack()
1702 "vaddw.u8 q11, q11, d3\n" in Pack()
1703 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
1705 "bne 1b\n" in Pack()
1710 "vmov.i8 d0, #0\n" in Pack()
1711 "vmov.i8 d1, #0\n" in Pack()
1712 "vmov.i8 d2, #0\n" in Pack()
1713 "vmov.i8 d3, #0\n" in Pack()
1714 "vld1.16 {d0[0]}, [%[in]]!\n" in Pack()
1715 "vld1.16 {d1[0]}, [r0]!\n" in Pack()
1716 "vld1.16 {d2[0]}, [r1]!\n" in Pack()
1717 "vld1.16 {d3[0]}, [r2]!\n" in Pack()
1718 "vaddw.u8 q8, q8, d0\n" in Pack()
1719 "vaddw.u8 q9, q9, d1\n" in Pack()
1720 "vaddw.u8 q10, q10, d2\n" in Pack()
1721 "vaddw.u8 q11, q11, d3\n" in Pack()
1722 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
1725 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
1726 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
1727 "vpaddl.u16 q8, q8\n" in Pack()
1728 "vpaddl.u16 q9, q9\n" in Pack()
1729 "vpaddl.u16 q10, q10\n" in Pack()
1730 "vpaddl.u16 q11, q11\n" in Pack()
1731 "vpadd.u32 d16, d16, d17\n" in Pack()
1732 "vpadd.u32 d18, d18, d19\n" in Pack()
1733 "vpadd.u32 d20, d20, d21\n" in Pack()
1734 "vpadd.u32 d22, d22, d23\n" in Pack()
1735 "vpadd.u32 d16, d16, d18\n" in Pack()
1736 "vpadd.u32 d17, d20, d22\n" in Pack()
1737 "vmul.i32 q8, q8, d0[0]\n" in Pack()
1738 "vadd.i32 q8, q8, q1\n" in Pack()
1739 "vst1.32 {d16, d17}, [%[out]:128]\n" in Pack()
1761 "add r0, %[in], %[stride]\n" in Pack()
1762 "add r1, r0, %[stride]\n" in Pack()
1763 "add r2, r1, %[stride]\n" in Pack()
1764 "vmov.i16 q8, #0\n" in Pack()
1765 "vmov.i16 q9, #0\n" in Pack()
1766 "vmov.i16 q10, #0\n" in Pack()
1767 "vmov.i16 q11, #0\n" in Pack()
1770 "subs %[count], %[count], #3\n" in Pack()
1771 "beq 2f\n" in Pack()
1774 "subs %[count], %[count], #8\n" in Pack()
1777 "vld1.32 {d0}, [%[in]]!\n" in Pack()
1778 "vld1.32 {d1}, [r0]!\n" in Pack()
1779 "vld1.32 {d2}, [r1]!\n" in Pack()
1780 "vld1.32 {d3}, [r2]!\n" in Pack()
1781 "vaddw.u8 q8, q8, d0\n" in Pack()
1782 "vaddw.u8 q9, q9, d1\n" in Pack()
1783 "vaddw.u8 q10, q10, d2\n" in Pack()
1784 "vaddw.u8 q11, q11, d3\n" in Pack()
1785 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
1787 "bne 1b\n" in Pack()
1792 "vmov.i8 d0, #0\n" in Pack()
1793 "vmov.i8 d1, #0\n" in Pack()
1794 "vmov.i8 d2, #0\n" in Pack()
1795 "vmov.i8 d3, #0\n" in Pack()
1796 "vld1.16 {d0[0]}, [%[in]]!\n" in Pack()
1797 "vld1.8 {d0[2]}, [%[in]]!\n" in Pack()
1798 "vld1.16 {d1[0]}, [r0]!\n" in Pack()
1799 "vld1.8 {d1[2]}, [r0]!\n" in Pack()
1800 "vld1.16 {d2[0]}, [r1]!\n" in Pack()
1801 "vld1.8 {d2[2]}, [r1]!\n" in Pack()
1802 "vld1.16 {d3[0]}, [r2]!\n" in Pack()
1803 "vld1.8 {d3[2]}, [r2]!\n" in Pack()
1804 "vaddw.u8 q8, q8, d0\n" in Pack()
1805 "vaddw.u8 q9, q9, d1\n" in Pack()
1806 "vaddw.u8 q10, q10, d2\n" in Pack()
1807 "vaddw.u8 q11, q11, d3\n" in Pack()
1808 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
1811 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
1812 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
1813 "vpaddl.u16 q8, q8\n" in Pack()
1814 "vpaddl.u16 q9, q9\n" in Pack()
1815 "vpaddl.u16 q10, q10\n" in Pack()
1816 "vpaddl.u16 q11, q11\n" in Pack()
1817 "vpadd.u32 d16, d16, d17\n" in Pack()
1818 "vpadd.u32 d18, d18, d19\n" in Pack()
1819 "vpadd.u32 d20, d20, d21\n" in Pack()
1820 "vpadd.u32 d22, d22, d23\n" in Pack()
1821 "vpadd.u32 d16, d16, d18\n" in Pack()
1822 "vpadd.u32 d17, d20, d22\n" in Pack()
1823 "vmul.i32 q8, q8, d0[0]\n" in Pack()
1824 "vadd.i32 q8, q8, q1\n" in Pack()
1825 "vst1.32 {d16, d17}, [%[out]:128]\n" in Pack()
1847 "add r0, %[in], %[stride]\n" in Pack()
1848 "add r1, r0, %[stride]\n" in Pack()
1849 "add r2, r1, %[stride]\n" in Pack()
1850 "vmov.i16 q8, #0\n" in Pack()
1851 "vmov.i16 q9, #0\n" in Pack()
1852 "vmov.i16 q10, #0\n" in Pack()
1853 "vmov.i16 q11, #0\n" in Pack()
1856 "subs %[count], %[count], #4\n" in Pack()
1857 "beq 2f\n" in Pack()
1860 "subs %[count], %[count], #8\n" in Pack()
1863 "vld1.32 {d0}, [%[in]]!\n" in Pack()
1864 "vld1.32 {d1}, [r0]!\n" in Pack()
1865 "vld1.32 {d2}, [r1]!\n" in Pack()
1866 "vld1.32 {d3}, [r2]!\n" in Pack()
1867 "vaddw.u8 q8, q8, d0\n" in Pack()
1868 "vaddw.u8 q9, q9, d1\n" in Pack()
1869 "vaddw.u8 q10, q10, d2\n" in Pack()
1870 "vaddw.u8 q11, q11, d3\n" in Pack()
1871 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
1873 "bne 1b\n" in Pack()
1878 "vmov.i8 d0, #0\n" in Pack()
1879 "vmov.i8 d1, #0\n" in Pack()
1880 "vmov.i8 d2, #0\n" in Pack()
1881 "vmov.i8 d3, #0\n" in Pack()
1882 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
1883 "vld1.32 {d1[0]}, [r0]!\n" in Pack()
1884 "vld1.32 {d2[0]}, [r1]!\n" in Pack()
1885 "vld1.32 {d3[0]}, [r2]!\n" in Pack()
1886 "vaddw.u8 q8, q8, d0\n" in Pack()
1887 "vaddw.u8 q9, q9, d1\n" in Pack()
1888 "vaddw.u8 q10, q10, d2\n" in Pack()
1889 "vaddw.u8 q11, q11, d3\n" in Pack()
1890 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
1893 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
1894 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
1895 "vpaddl.u16 q8, q8\n" in Pack()
1896 "vpaddl.u16 q9, q9\n" in Pack()
1897 "vpaddl.u16 q10, q10\n" in Pack()
1898 "vpaddl.u16 q11, q11\n" in Pack()
1899 "vpadd.u32 d16, d16, d17\n" in Pack()
1900 "vpadd.u32 d18, d18, d19\n" in Pack()
1901 "vpadd.u32 d20, d20, d21\n" in Pack()
1902 "vpadd.u32 d22, d22, d23\n" in Pack()
1903 "vpadd.u32 d16, d16, d18\n" in Pack()
1904 "vpadd.u32 d17, d20, d22\n" in Pack()
1905 "vmul.i32 q8, q8, d0[0]\n" in Pack()
1906 "vadd.i32 q8, q8, q1\n" in Pack()
1907 "vst1.32 {d16, d17}, [%[out]:128]\n" in Pack()
1929 "add r0, %[in], %[stride]\n" in Pack()
1930 "add r1, r0, %[stride]\n" in Pack()
1931 "add r2, r1, %[stride]\n" in Pack()
1932 "vmov.i16 q8, #0\n" in Pack()
1933 "vmov.i16 q9, #0\n" in Pack()
1934 "vmov.i16 q10, #0\n" in Pack()
1935 "vmov.i16 q11, #0\n" in Pack()
1938 "subs %[count], %[count], #5\n" in Pack()
1939 "beq 2f\n" in Pack()
1942 "subs %[count], %[count], #8\n" in Pack()
1945 "vld1.32 {d0}, [%[in]]!\n" in Pack()
1946 "vld1.32 {d1}, [r0]!\n" in Pack()
1947 "vld1.32 {d2}, [r1]!\n" in Pack()
1948 "vld1.32 {d3}, [r2]!\n" in Pack()
1949 "vaddw.u8 q8, q8, d0\n" in Pack()
1950 "vaddw.u8 q9, q9, d1\n" in Pack()
1951 "vaddw.u8 q10, q10, d2\n" in Pack()
1952 "vaddw.u8 q11, q11, d3\n" in Pack()
1953 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
1955 "bne 1b\n" in Pack()
1960 "vmov.i8 d0, #0\n" in Pack()
1961 "vmov.i8 d1, #0\n" in Pack()
1962 "vmov.i8 d2, #0\n" in Pack()
1963 "vmov.i8 d3, #0\n" in Pack()
1964 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
1965 "vld1.8 {d0[4]}, [%[in]]!\n" in Pack()
1966 "vld1.32 {d1[0]}, [r0]!\n" in Pack()
1967 "vld1.8 {d1[4]}, [r0]!\n" in Pack()
1968 "vld1.32 {d2[0]}, [r1]!\n" in Pack()
1969 "vld1.8 {d2[4]}, [r1]!\n" in Pack()
1970 "vld1.32 {d3[0]}, [r2]!\n" in Pack()
1971 "vld1.8 {d3[4]}, [r2]!\n" in Pack()
1972 "vaddw.u8 q8, q8, d0\n" in Pack()
1973 "vaddw.u8 q9, q9, d1\n" in Pack()
1974 "vaddw.u8 q10, q10, d2\n" in Pack()
1975 "vaddw.u8 q11, q11, d3\n" in Pack()
1976 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
1979 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
1980 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
1981 "vpaddl.u16 q8, q8\n" in Pack()
1982 "vpaddl.u16 q9, q9\n" in Pack()
1983 "vpaddl.u16 q10, q10\n" in Pack()
1984 "vpaddl.u16 q11, q11\n" in Pack()
1985 "vpadd.u32 d16, d16, d17\n" in Pack()
1986 "vpadd.u32 d18, d18, d19\n" in Pack()
1987 "vpadd.u32 d20, d20, d21\n" in Pack()
1988 "vpadd.u32 d22, d22, d23\n" in Pack()
1989 "vpadd.u32 d16, d16, d18\n" in Pack()
1990 "vpadd.u32 d17, d20, d22\n" in Pack()
1991 "vmul.i32 q8, q8, d0[0]\n" in Pack()
1992 "vadd.i32 q8, q8, q1\n" in Pack()
1993 "vst1.32 {d16, d17}, [%[out]:128]\n" in Pack()
2015 "add r0, %[in], %[stride]\n" in Pack()
2016 "add r1, r0, %[stride]\n" in Pack()
2017 "add r2, r1, %[stride]\n" in Pack()
2018 "vmov.i16 q8, #0\n" in Pack()
2019 "vmov.i16 q9, #0\n" in Pack()
2020 "vmov.i16 q10, #0\n" in Pack()
2021 "vmov.i16 q11, #0\n" in Pack()
2024 "subs %[count], %[count], #6\n" in Pack()
2025 "beq 2f\n" in Pack()
2028 "subs %[count], %[count], #8\n" in Pack()
2031 "vld1.32 {d0}, [%[in]]!\n" in Pack()
2032 "vld1.32 {d1}, [r0]!\n" in Pack()
2033 "vld1.32 {d2}, [r1]!\n" in Pack()
2034 "vld1.32 {d3}, [r2]!\n" in Pack()
2035 "vaddw.u8 q8, q8, d0\n" in Pack()
2036 "vaddw.u8 q9, q9, d1\n" in Pack()
2037 "vaddw.u8 q10, q10, d2\n" in Pack()
2038 "vaddw.u8 q11, q11, d3\n" in Pack()
2039 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
2041 "bne 1b\n" in Pack()
2046 "vmov.i8 d0, #0\n" in Pack()
2047 "vmov.i8 d1, #0\n" in Pack()
2048 "vmov.i8 d2, #0\n" in Pack()
2049 "vmov.i8 d3, #0\n" in Pack()
2050 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
2051 "vld1.16 {d0[2]}, [%[in]]!\n" in Pack()
2052 "vld1.32 {d1[0]}, [r0]!\n" in Pack()
2053 "vld1.16 {d1[2]}, [r0]!\n" in Pack()
2054 "vld1.32 {d2[0]}, [r1]!\n" in Pack()
2055 "vld1.16 {d2[2]}, [r1]!\n" in Pack()
2056 "vld1.32 {d3[0]}, [r2]!\n" in Pack()
2057 "vld1.16 {d3[2]}, [r2]!\n" in Pack()
2058 "vaddw.u8 q8, q8, d0\n" in Pack()
2059 "vaddw.u8 q9, q9, d1\n" in Pack()
2060 "vaddw.u8 q10, q10, d2\n" in Pack()
2061 "vaddw.u8 q11, q11, d3\n" in Pack()
2062 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
2065 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
2066 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
2067 "vpaddl.u16 q8, q8\n" in Pack()
2068 "vpaddl.u16 q9, q9\n" in Pack()
2069 "vpaddl.u16 q10, q10\n" in Pack()
2070 "vpaddl.u16 q11, q11\n" in Pack()
2071 "vpadd.u32 d16, d16, d17\n" in Pack()
2072 "vpadd.u32 d18, d18, d19\n" in Pack()
2073 "vpadd.u32 d20, d20, d21\n" in Pack()
2074 "vpadd.u32 d22, d22, d23\n" in Pack()
2075 "vpadd.u32 d16, d16, d18\n" in Pack()
2076 "vpadd.u32 d17, d20, d22\n" in Pack()
2077 "vmul.i32 q8, q8, d0[0]\n" in Pack()
2078 "vadd.i32 q8, q8, q1\n" in Pack()
2079 "vst1.32 {d16, d17}, [%[out]:128]\n" in Pack()
2101 "add r0, %[in], %[stride]\n" in Pack()
2102 "add r1, r0, %[stride]\n" in Pack()
2103 "add r2, r1, %[stride]\n" in Pack()
2104 "vmov.i16 q8, #0\n" in Pack()
2105 "vmov.i16 q9, #0\n" in Pack()
2106 "vmov.i16 q10, #0\n" in Pack()
2107 "vmov.i16 q11, #0\n" in Pack()
2110 "subs %[count], %[count], #7\n" in Pack()
2111 "beq 2f\n" in Pack()
2114 "subs %[count], %[count], #8\n" in Pack()
2117 "vld1.32 {d0}, [%[in]]!\n" in Pack()
2118 "vld1.32 {d1}, [r0]!\n" in Pack()
2119 "vld1.32 {d2}, [r1]!\n" in Pack()
2120 "vld1.32 {d3}, [r2]!\n" in Pack()
2121 "vaddw.u8 q8, q8, d0\n" in Pack()
2122 "vaddw.u8 q9, q9, d1\n" in Pack()
2123 "vaddw.u8 q10, q10, d2\n" in Pack()
2124 "vaddw.u8 q11, q11, d3\n" in Pack()
2125 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
2127 "bne 1b\n" in Pack()
2132 "vmov.i8 d0, #0\n" in Pack()
2133 "vmov.i8 d1, #0\n" in Pack()
2134 "vmov.i8 d2, #0\n" in Pack()
2135 "vmov.i8 d3, #0\n" in Pack()
2136 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
2137 "vld1.16 {d0[2]}, [%[in]]!\n" in Pack()
2138 "vld1.8 {d0[6]}, [%[in]]!\n" in Pack()
2139 "vld1.32 {d1[0]}, [r0]!\n" in Pack()
2140 "vld1.16 {d1[2]}, [r0]!\n" in Pack()
2141 "vld1.8 {d1[6]}, [r0]!\n" in Pack()
2142 "vld1.32 {d2[0]}, [r1]!\n" in Pack()
2143 "vld1.16 {d2[2]}, [r1]!\n" in Pack()
2144 "vld1.8 {d2[6]}, [r1]!\n" in Pack()
2145 "vld1.32 {d3[0]}, [r2]!\n" in Pack()
2146 "vld1.16 {d3[2]}, [r2]!\n" in Pack()
2147 "vld1.8 {d3[6]}, [r2]!\n" in Pack()
2148 "vaddw.u8 q8, q8, d0\n" in Pack()
2149 "vaddw.u8 q9, q9, d1\n" in Pack()
2150 "vaddw.u8 q10, q10, d2\n" in Pack()
2151 "vaddw.u8 q11, q11, d3\n" in Pack()
2152 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
2155 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
2156 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
2157 "vpaddl.u16 q8, q8\n" in Pack()
2158 "vpaddl.u16 q9, q9\n" in Pack()
2159 "vpaddl.u16 q10, q10\n" in Pack()
2160 "vpaddl.u16 q11, q11\n" in Pack()
2161 "vpadd.u32 d16, d16, d17\n" in Pack()
2162 "vpadd.u32 d18, d18, d19\n" in Pack()
2163 "vpadd.u32 d20, d20, d21\n" in Pack()
2164 "vpadd.u32 d22, d22, d23\n" in Pack()
2165 "vpadd.u32 d16, d16, d18\n" in Pack()
2166 "vpadd.u32 d17, d20, d22\n" in Pack()
2167 "vmul.i32 q8, q8, d0[0]\n" in Pack()
2168 "vadd.i32 q8, q8, q1\n" in Pack()
2169 "vst1.32 {d16, d17}, [%[out]:128]\n" in Pack()
2191 "add r0, %[in], %[stride]\n" in Pack()
2192 "add r1, r0, %[stride]\n" in Pack()
2193 "add r2, r1, %[stride]\n" in Pack()
2194 "add r3, r2, %[stride]\n" in Pack()
2195 "vmov.i16 q8, #0\n" in Pack()
2196 "vmov.i16 q9, #0\n" in Pack()
2197 "vmov.i16 q10, #0\n" in Pack()
2198 "vmov.i16 q11, #0\n" in Pack()
2199 "vmov.i16 q12, #0\n" in Pack()
2202 "subs %[count], %[count], #8\n" in Pack()
2205 "vld1.32 {d0}, [%[in]]!\n" in Pack()
2206 "vld1.32 {d1}, [r0]!\n" in Pack()
2207 "vld1.32 {d2}, [r1]!\n" in Pack()
2208 "vld1.32 {d3}, [r2]!\n" in Pack()
2209 "vld1.32 {d4}, [r3]!\n" in Pack()
2210 "vaddw.u8 q8, q8, d0\n" in Pack()
2211 "vaddw.u8 q9, q9, d1\n" in Pack()
2212 "vaddw.u8 q10, q10, d2\n" in Pack()
2213 "vaddw.u8 q11, q11, d3\n" in Pack()
2214 "vaddw.u8 q12, q12, d4\n" in Pack()
2215 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
2216 "vst1.32 {d4}, [%[out]:64]!\n" in Pack()
2218 "bne 1b\n" in Pack()
2221 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
2222 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
2223 "vpaddl.u16 q8, q8\n" in Pack()
2224 "vpaddl.u16 q9, q9\n" in Pack()
2225 "vpaddl.u16 q10, q10\n" in Pack()
2226 "vpaddl.u16 q11, q11\n" in Pack()
2227 "vpaddl.u16 q12, q12\n" in Pack()
2228 "vpadd.u32 d16, d16, d17\n" in Pack()
2229 "vpadd.u32 d18, d18, d19\n" in Pack()
2230 "vpadd.u32 d20, d20, d21\n" in Pack()
2231 "vpadd.u32 d22, d22, d23\n" in Pack()
2232 "vpadd.u32 d24, d24, d25\n" in Pack()
2233 "vpadd.u32 d16, d16, d18\n" in Pack()
2234 "vpadd.u32 d17, d20, d22\n" in Pack()
2235 "vpadd.u32 d18, d24, d24\n" in Pack()
2236 "vmul.i32 q8, q8, d0[0]\n" in Pack()
2237 "vmul.i32 q9, q9, d0[0]\n" in Pack()
2238 "vadd.i32 q8, q8, q1\n" in Pack()
2239 "vadd.i32 q9, q9, q1\n" in Pack()
2240 "vst1.32 {d16, d17, d18, d19}, [%[out]:64]\n" in Pack()
2262 "add r0, %[in], %[stride]\n" in Pack()
2263 "add r1, r0, %[stride]\n" in Pack()
2264 "add r2, r1, %[stride]\n" in Pack()
2265 "add r3, r2, %[stride]\n" in Pack()
2266 "vmov.i16 q8, #0\n" in Pack()
2267 "vmov.i16 q9, #0\n" in Pack()
2268 "vmov.i16 q10, #0\n" in Pack()
2269 "vmov.i16 q11, #0\n" in Pack()
2270 "vmov.i16 q12, #0\n" in Pack()
2273 "subs %[count], %[count], #1\n" in Pack()
2274 "beq 2f\n" in Pack()
2277 "subs %[count], %[count], #8\n" in Pack()
2280 "vld1.32 {d0}, [%[in]]!\n" in Pack()
2281 "vld1.32 {d1}, [r0]!\n" in Pack()
2282 "vld1.32 {d2}, [r1]!\n" in Pack()
2283 "vld1.32 {d3}, [r2]!\n" in Pack()
2284 "vld1.32 {d4}, [r3]!\n" in Pack()
2285 "vaddw.u8 q8, q8, d0\n" in Pack()
2286 "vaddw.u8 q9, q9, d1\n" in Pack()
2287 "vaddw.u8 q10, q10, d2\n" in Pack()
2288 "vaddw.u8 q11, q11, d3\n" in Pack()
2289 "vaddw.u8 q12, q12, d4\n" in Pack()
2290 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
2291 "vst1.32 {d4}, [%[out]:64]!\n" in Pack()
2293 "bne 1b\n" in Pack()
2298 "vmov.i8 d0, #0\n" in Pack()
2299 "vmov.i8 d1, #0\n" in Pack()
2300 "vmov.i8 d2, #0\n" in Pack()
2301 "vmov.i8 d3, #0\n" in Pack()
2302 "vmov.i8 d4, #0\n" in Pack()
2303 "vld1.8 {d0[0]}, [%[in]]!\n" in Pack()
2304 "vld1.8 {d1[0]}, [r0]!\n" in Pack()
2305 "vld1.8 {d2[0]}, [r1]!\n" in Pack()
2306 "vld1.8 {d3[0]}, [r2]!\n" in Pack()
2307 "vld1.8 {d4[0]}, [r3]!\n" in Pack()
2308 "vaddw.u8 q8, q8, d0\n" in Pack()
2309 "vaddw.u8 q9, q9, d1\n" in Pack()
2310 "vaddw.u8 q10, q10, d2\n" in Pack()
2311 "vaddw.u8 q11, q11, d3\n" in Pack()
2312 "vaddw.u8 q12, q12, d4\n" in Pack()
2313 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
2314 "vst1.32 {d4}, [%[out]:64]!\n" in Pack()
2317 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
2318 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
2319 "vpaddl.u16 q8, q8\n" in Pack()
2320 "vpaddl.u16 q9, q9\n" in Pack()
2321 "vpaddl.u16 q10, q10\n" in Pack()
2322 "vpaddl.u16 q11, q11\n" in Pack()
2323 "vpaddl.u16 q12, q12\n" in Pack()
2324 "vpadd.u32 d16, d16, d17\n" in Pack()
2325 "vpadd.u32 d18, d18, d19\n" in Pack()
2326 "vpadd.u32 d20, d20, d21\n" in Pack()
2327 "vpadd.u32 d22, d22, d23\n" in Pack()
2328 "vpadd.u32 d24, d24, d25\n" in Pack()
2329 "vpadd.u32 d16, d16, d18\n" in Pack()
2330 "vpadd.u32 d17, d20, d22\n" in Pack()
2331 "vpadd.u32 d18, d24, d24\n" in Pack()
2332 "vmul.i32 q8, q8, d0[0]\n" in Pack()
2333 "vmul.i32 q9, q9, d0[0]\n" in Pack()
2334 "vadd.i32 q8, q8, q1\n" in Pack()
2335 "vadd.i32 q9, q9, q1\n" in Pack()
2336 "vst1.32 {d16, d17, d18, d19}, [%[out]:64]\n" in Pack()
2358 "add r0, %[in], %[stride]\n" in Pack()
2359 "add r1, r0, %[stride]\n" in Pack()
2360 "add r2, r1, %[stride]\n" in Pack()
2361 "add r3, r2, %[stride]\n" in Pack()
2362 "vmov.i16 q8, #0\n" in Pack()
2363 "vmov.i16 q9, #0\n" in Pack()
2364 "vmov.i16 q10, #0\n" in Pack()
2365 "vmov.i16 q11, #0\n" in Pack()
2366 "vmov.i16 q12, #0\n" in Pack()
2369 "subs %[count], %[count], #2\n" in Pack()
2370 "beq 2f\n" in Pack()
2373 "subs %[count], %[count], #8\n" in Pack()
2376 "vld1.32 {d0}, [%[in]]!\n" in Pack()
2377 "vld1.32 {d1}, [r0]!\n" in Pack()
2378 "vld1.32 {d2}, [r1]!\n" in Pack()
2379 "vld1.32 {d3}, [r2]!\n" in Pack()
2380 "vld1.32 {d4}, [r3]!\n" in Pack()
2381 "vaddw.u8 q8, q8, d0\n" in Pack()
2382 "vaddw.u8 q9, q9, d1\n" in Pack()
2383 "vaddw.u8 q10, q10, d2\n" in Pack()
2384 "vaddw.u8 q11, q11, d3\n" in Pack()
2385 "vaddw.u8 q12, q12, d4\n" in Pack()
2386 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
2387 "vst1.32 {d4}, [%[out]:64]!\n" in Pack()
2389 "bne 1b\n" in Pack()
2394 "vmov.i8 d0, #0\n" in Pack()
2395 "vmov.i8 d1, #0\n" in Pack()
2396 "vmov.i8 d2, #0\n" in Pack()
2397 "vmov.i8 d3, #0\n" in Pack()
2398 "vmov.i8 d4, #0\n" in Pack()
2399 "vld1.16 {d0[0]}, [%[in]]!\n" in Pack()
2400 "vld1.16 {d1[0]}, [r0]!\n" in Pack()
2401 "vld1.16 {d2[0]}, [r1]!\n" in Pack()
2402 "vld1.16 {d3[0]}, [r2]!\n" in Pack()
2403 "vld1.16 {d4[0]}, [r3]!\n" in Pack()
2404 "vaddw.u8 q8, q8, d0\n" in Pack()
2405 "vaddw.u8 q9, q9, d1\n" in Pack()
2406 "vaddw.u8 q10, q10, d2\n" in Pack()
2407 "vaddw.u8 q11, q11, d3\n" in Pack()
2408 "vaddw.u8 q12, q12, d4\n" in Pack()
2409 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
2410 "vst1.32 {d4}, [%[out]:64]!\n" in Pack()
2413 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
2414 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
2415 "vpaddl.u16 q8, q8\n" in Pack()
2416 "vpaddl.u16 q9, q9\n" in Pack()
2417 "vpaddl.u16 q10, q10\n" in Pack()
2418 "vpaddl.u16 q11, q11\n" in Pack()
2419 "vpaddl.u16 q12, q12\n" in Pack()
2420 "vpadd.u32 d16, d16, d17\n" in Pack()
2421 "vpadd.u32 d18, d18, d19\n" in Pack()
2422 "vpadd.u32 d20, d20, d21\n" in Pack()
2423 "vpadd.u32 d22, d22, d23\n" in Pack()
2424 "vpadd.u32 d24, d24, d25\n" in Pack()
2425 "vpadd.u32 d16, d16, d18\n" in Pack()
2426 "vpadd.u32 d17, d20, d22\n" in Pack()
2427 "vpadd.u32 d18, d24, d24\n" in Pack()
2428 "vmul.i32 q8, q8, d0[0]\n" in Pack()
2429 "vmul.i32 q9, q9, d0[0]\n" in Pack()
2430 "vadd.i32 q8, q8, q1\n" in Pack()
2431 "vadd.i32 q9, q9, q1\n" in Pack()
2432 "vst1.32 {d16, d17, d18, d19}, [%[out]:64]\n" in Pack()
2454 "add r0, %[in], %[stride]\n" in Pack()
2455 "add r1, r0, %[stride]\n" in Pack()
2456 "add r2, r1, %[stride]\n" in Pack()
2457 "add r3, r2, %[stride]\n" in Pack()
2458 "vmov.i16 q8, #0\n" in Pack()
2459 "vmov.i16 q9, #0\n" in Pack()
2460 "vmov.i16 q10, #0\n" in Pack()
2461 "vmov.i16 q11, #0\n" in Pack()
2462 "vmov.i16 q12, #0\n" in Pack()
2465 "subs %[count], %[count], #3\n" in Pack()
2466 "beq 2f\n" in Pack()
2469 "subs %[count], %[count], #8\n" in Pack()
2472 "vld1.32 {d0}, [%[in]]!\n" in Pack()
2473 "vld1.32 {d1}, [r0]!\n" in Pack()
2474 "vld1.32 {d2}, [r1]!\n" in Pack()
2475 "vld1.32 {d3}, [r2]!\n" in Pack()
2476 "vld1.32 {d4}, [r3]!\n" in Pack()
2477 "vaddw.u8 q8, q8, d0\n" in Pack()
2478 "vaddw.u8 q9, q9, d1\n" in Pack()
2479 "vaddw.u8 q10, q10, d2\n" in Pack()
2480 "vaddw.u8 q11, q11, d3\n" in Pack()
2481 "vaddw.u8 q12, q12, d4\n" in Pack()
2482 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
2483 "vst1.32 {d4}, [%[out]:64]!\n" in Pack()
2485 "bne 1b\n" in Pack()
2490 "vmov.i8 d0, #0\n" in Pack()
2491 "vmov.i8 d1, #0\n" in Pack()
2492 "vmov.i8 d2, #0\n" in Pack()
2493 "vmov.i8 d3, #0\n" in Pack()
2494 "vmov.i8 d4, #0\n" in Pack()
2495 "vld1.16 {d0[0]}, [%[in]]!\n" in Pack()
2496 "vld1.8 {d0[2]}, [%[in]]!\n" in Pack()
2497 "vld1.16 {d1[0]}, [r0]!\n" in Pack()
2498 "vld1.8 {d1[2]}, [r0]!\n" in Pack()
2499 "vld1.16 {d2[0]}, [r1]!\n" in Pack()
2500 "vld1.8 {d2[2]}, [r1]!\n" in Pack()
2501 "vld1.16 {d3[0]}, [r2]!\n" in Pack()
2502 "vld1.8 {d3[2]}, [r2]!\n" in Pack()
2503 "vld1.16 {d4[0]}, [r3]!\n" in Pack()
2504 "vld1.8 {d4[2]}, [r3]!\n" in Pack()
2505 "vaddw.u8 q8, q8, d0\n" in Pack()
2506 "vaddw.u8 q9, q9, d1\n" in Pack()
2507 "vaddw.u8 q10, q10, d2\n" in Pack()
2508 "vaddw.u8 q11, q11, d3\n" in Pack()
2509 "vaddw.u8 q12, q12, d4\n" in Pack()
2510 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
2511 "vst1.32 {d4}, [%[out]:64]!\n" in Pack()
2514 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
2515 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
2516 "vpaddl.u16 q8, q8\n" in Pack()
2517 "vpaddl.u16 q9, q9\n" in Pack()
2518 "vpaddl.u16 q10, q10\n" in Pack()
2519 "vpaddl.u16 q11, q11\n" in Pack()
2520 "vpaddl.u16 q12, q12\n" in Pack()
2521 "vpadd.u32 d16, d16, d17\n" in Pack()
2522 "vpadd.u32 d18, d18, d19\n" in Pack()
2523 "vpadd.u32 d20, d20, d21\n" in Pack()
2524 "vpadd.u32 d22, d22, d23\n" in Pack()
2525 "vpadd.u32 d24, d24, d25\n" in Pack()
2526 "vpadd.u32 d16, d16, d18\n" in Pack()
2527 "vpadd.u32 d17, d20, d22\n" in Pack()
2528 "vpadd.u32 d18, d24, d24\n" in Pack()
2529 "vmul.i32 q8, q8, d0[0]\n" in Pack()
2530 "vmul.i32 q9, q9, d0[0]\n" in Pack()
2531 "vadd.i32 q8, q8, q1\n" in Pack()
2532 "vadd.i32 q9, q9, q1\n" in Pack()
2533 "vst1.32 {d16, d17, d18, d19}, [%[out]:64]\n" in Pack()
2555 "add r0, %[in], %[stride]\n" in Pack()
2556 "add r1, r0, %[stride]\n" in Pack()
2557 "add r2, r1, %[stride]\n" in Pack()
2558 "add r3, r2, %[stride]\n" in Pack()
2559 "vmov.i16 q8, #0\n" in Pack()
2560 "vmov.i16 q9, #0\n" in Pack()
2561 "vmov.i16 q10, #0\n" in Pack()
2562 "vmov.i16 q11, #0\n" in Pack()
2563 "vmov.i16 q12, #0\n" in Pack()
2566 "subs %[count], %[count], #4\n" in Pack()
2567 "beq 2f\n" in Pack()
2570 "subs %[count], %[count], #8\n" in Pack()
2573 "vld1.32 {d0}, [%[in]]!\n" in Pack()
2574 "vld1.32 {d1}, [r0]!\n" in Pack()
2575 "vld1.32 {d2}, [r1]!\n" in Pack()
2576 "vld1.32 {d3}, [r2]!\n" in Pack()
2577 "vld1.32 {d4}, [r3]!\n" in Pack()
2578 "vaddw.u8 q8, q8, d0\n" in Pack()
2579 "vaddw.u8 q9, q9, d1\n" in Pack()
2580 "vaddw.u8 q10, q10, d2\n" in Pack()
2581 "vaddw.u8 q11, q11, d3\n" in Pack()
2582 "vaddw.u8 q12, q12, d4\n" in Pack()
2583 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
2584 "vst1.32 {d4}, [%[out]:64]!\n" in Pack()
2586 "bne 1b\n" in Pack()
2591 "vmov.i8 d0, #0\n" in Pack()
2592 "vmov.i8 d1, #0\n" in Pack()
2593 "vmov.i8 d2, #0\n" in Pack()
2594 "vmov.i8 d3, #0\n" in Pack()
2595 "vmov.i8 d4, #0\n" in Pack()
2596 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
2597 "vld1.32 {d1[0]}, [r0]!\n" in Pack()
2598 "vld1.32 {d2[0]}, [r1]!\n" in Pack()
2599 "vld1.32 {d3[0]}, [r2]!\n" in Pack()
2600 "vld1.32 {d4[0]}, [r3]!\n" in Pack()
2601 "vaddw.u8 q8, q8, d0\n" in Pack()
2602 "vaddw.u8 q9, q9, d1\n" in Pack()
2603 "vaddw.u8 q10, q10, d2\n" in Pack()
2604 "vaddw.u8 q11, q11, d3\n" in Pack()
2605 "vaddw.u8 q12, q12, d4\n" in Pack()
2606 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
2607 "vst1.32 {d4}, [%[out]:64]!\n" in Pack()
2610 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
2611 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
2612 "vpaddl.u16 q8, q8\n" in Pack()
2613 "vpaddl.u16 q9, q9\n" in Pack()
2614 "vpaddl.u16 q10, q10\n" in Pack()
2615 "vpaddl.u16 q11, q11\n" in Pack()
2616 "vpaddl.u16 q12, q12\n" in Pack()
2617 "vpadd.u32 d16, d16, d17\n" in Pack()
2618 "vpadd.u32 d18, d18, d19\n" in Pack()
2619 "vpadd.u32 d20, d20, d21\n" in Pack()
2620 "vpadd.u32 d22, d22, d23\n" in Pack()
2621 "vpadd.u32 d24, d24, d25\n" in Pack()
2622 "vpadd.u32 d16, d16, d18\n" in Pack()
2623 "vpadd.u32 d17, d20, d22\n" in Pack()
2624 "vpadd.u32 d18, d24, d24\n" in Pack()
2625 "vmul.i32 q8, q8, d0[0]\n" in Pack()
2626 "vmul.i32 q9, q9, d0[0]\n" in Pack()
2627 "vadd.i32 q8, q8, q1\n" in Pack()
2628 "vadd.i32 q9, q9, q1\n" in Pack()
2629 "vst1.32 {d16, d17, d18, d19}, [%[out]:64]\n" in Pack()
2651 "add r0, %[in], %[stride]\n" in Pack()
2652 "add r1, r0, %[stride]\n" in Pack()
2653 "add r2, r1, %[stride]\n" in Pack()
2654 "add r3, r2, %[stride]\n" in Pack()
2655 "vmov.i16 q8, #0\n" in Pack()
2656 "vmov.i16 q9, #0\n" in Pack()
2657 "vmov.i16 q10, #0\n" in Pack()
2658 "vmov.i16 q11, #0\n" in Pack()
2659 "vmov.i16 q12, #0\n" in Pack()
2662 "subs %[count], %[count], #5\n" in Pack()
2663 "beq 2f\n" in Pack()
2666 "subs %[count], %[count], #8\n" in Pack()
2669 "vld1.32 {d0}, [%[in]]!\n" in Pack()
2670 "vld1.32 {d1}, [r0]!\n" in Pack()
2671 "vld1.32 {d2}, [r1]!\n" in Pack()
2672 "vld1.32 {d3}, [r2]!\n" in Pack()
2673 "vld1.32 {d4}, [r3]!\n" in Pack()
2674 "vaddw.u8 q8, q8, d0\n" in Pack()
2675 "vaddw.u8 q9, q9, d1\n" in Pack()
2676 "vaddw.u8 q10, q10, d2\n" in Pack()
2677 "vaddw.u8 q11, q11, d3\n" in Pack()
2678 "vaddw.u8 q12, q12, d4\n" in Pack()
2679 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
2680 "vst1.32 {d4}, [%[out]:64]!\n" in Pack()
2682 "bne 1b\n" in Pack()
2687 "vmov.i8 d0, #0\n" in Pack()
2688 "vmov.i8 d1, #0\n" in Pack()
2689 "vmov.i8 d2, #0\n" in Pack()
2690 "vmov.i8 d3, #0\n" in Pack()
2691 "vmov.i8 d4, #0\n" in Pack()
2692 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
2693 "vld1.8 {d0[4]}, [%[in]]!\n" in Pack()
2694 "vld1.32 {d1[0]}, [r0]!\n" in Pack()
2695 "vld1.8 {d1[4]}, [r0]!\n" in Pack()
2696 "vld1.32 {d2[0]}, [r1]!\n" in Pack()
2697 "vld1.8 {d2[4]}, [r1]!\n" in Pack()
2698 "vld1.32 {d3[0]}, [r2]!\n" in Pack()
2699 "vld1.8 {d3[4]}, [r2]!\n" in Pack()
2700 "vld1.32 {d4[0]}, [r3]!\n" in Pack()
2701 "vld1.8 {d4[4]}, [r3]!\n" in Pack()
2702 "vaddw.u8 q8, q8, d0\n" in Pack()
2703 "vaddw.u8 q9, q9, d1\n" in Pack()
2704 "vaddw.u8 q10, q10, d2\n" in Pack()
2705 "vaddw.u8 q11, q11, d3\n" in Pack()
2706 "vaddw.u8 q12, q12, d4\n" in Pack()
2707 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
2708 "vst1.32 {d4}, [%[out]:64]!\n" in Pack()
2711 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
2712 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
2713 "vpaddl.u16 q8, q8\n" in Pack()
2714 "vpaddl.u16 q9, q9\n" in Pack()
2715 "vpaddl.u16 q10, q10\n" in Pack()
2716 "vpaddl.u16 q11, q11\n" in Pack()
2717 "vpaddl.u16 q12, q12\n" in Pack()
2718 "vpadd.u32 d16, d16, d17\n" in Pack()
2719 "vpadd.u32 d18, d18, d19\n" in Pack()
2720 "vpadd.u32 d20, d20, d21\n" in Pack()
2721 "vpadd.u32 d22, d22, d23\n" in Pack()
2722 "vpadd.u32 d24, d24, d25\n" in Pack()
2723 "vpadd.u32 d16, d16, d18\n" in Pack()
2724 "vpadd.u32 d17, d20, d22\n" in Pack()
2725 "vpadd.u32 d18, d24, d24\n" in Pack()
2726 "vmul.i32 q8, q8, d0[0]\n" in Pack()
2727 "vmul.i32 q9, q9, d0[0]\n" in Pack()
2728 "vadd.i32 q8, q8, q1\n" in Pack()
2729 "vadd.i32 q9, q9, q1\n" in Pack()
2730 "vst1.32 {d16, d17, d18, d19}, [%[out]:64]\n" in Pack()
2752 "add r0, %[in], %[stride]\n" in Pack()
2753 "add r1, r0, %[stride]\n" in Pack()
2754 "add r2, r1, %[stride]\n" in Pack()
2755 "add r3, r2, %[stride]\n" in Pack()
2756 "vmov.i16 q8, #0\n" in Pack()
2757 "vmov.i16 q9, #0\n" in Pack()
2758 "vmov.i16 q10, #0\n" in Pack()
2759 "vmov.i16 q11, #0\n" in Pack()
2760 "vmov.i16 q12, #0\n" in Pack()
2763 "subs %[count], %[count], #6\n" in Pack()
2764 "beq 2f\n" in Pack()
2767 "subs %[count], %[count], #8\n" in Pack()
2770 "vld1.32 {d0}, [%[in]]!\n" in Pack()
2771 "vld1.32 {d1}, [r0]!\n" in Pack()
2772 "vld1.32 {d2}, [r1]!\n" in Pack()
2773 "vld1.32 {d3}, [r2]!\n" in Pack()
2774 "vld1.32 {d4}, [r3]!\n" in Pack()
2775 "vaddw.u8 q8, q8, d0\n" in Pack()
2776 "vaddw.u8 q9, q9, d1\n" in Pack()
2777 "vaddw.u8 q10, q10, d2\n" in Pack()
2778 "vaddw.u8 q11, q11, d3\n" in Pack()
2779 "vaddw.u8 q12, q12, d4\n" in Pack()
2780 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
2781 "vst1.32 {d4}, [%[out]:64]!\n" in Pack()
2783 "bne 1b\n" in Pack()
2788 "vmov.i8 d0, #0\n" in Pack()
2789 "vmov.i8 d1, #0\n" in Pack()
2790 "vmov.i8 d2, #0\n" in Pack()
2791 "vmov.i8 d3, #0\n" in Pack()
2792 "vmov.i8 d4, #0\n" in Pack()
2793 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
2794 "vld1.16 {d0[2]}, [%[in]]!\n" in Pack()
2795 "vld1.32 {d1[0]}, [r0]!\n" in Pack()
2796 "vld1.16 {d1[2]}, [r0]!\n" in Pack()
2797 "vld1.32 {d2[0]}, [r1]!\n" in Pack()
2798 "vld1.16 {d2[2]}, [r1]!\n" in Pack()
2799 "vld1.32 {d3[0]}, [r2]!\n" in Pack()
2800 "vld1.16 {d3[2]}, [r2]!\n" in Pack()
2801 "vld1.32 {d4[0]}, [r3]!\n" in Pack()
2802 "vld1.16 {d4[2]}, [r3]!\n" in Pack()
2803 "vaddw.u8 q8, q8, d0\n" in Pack()
2804 "vaddw.u8 q9, q9, d1\n" in Pack()
2805 "vaddw.u8 q10, q10, d2\n" in Pack()
2806 "vaddw.u8 q11, q11, d3\n" in Pack()
2807 "vaddw.u8 q12, q12, d4\n" in Pack()
2808 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
2809 "vst1.32 {d4}, [%[out]:64]!\n" in Pack()
2812 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
2813 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
2814 "vpaddl.u16 q8, q8\n" in Pack()
2815 "vpaddl.u16 q9, q9\n" in Pack()
2816 "vpaddl.u16 q10, q10\n" in Pack()
2817 "vpaddl.u16 q11, q11\n" in Pack()
2818 "vpaddl.u16 q12, q12\n" in Pack()
2819 "vpadd.u32 d16, d16, d17\n" in Pack()
2820 "vpadd.u32 d18, d18, d19\n" in Pack()
2821 "vpadd.u32 d20, d20, d21\n" in Pack()
2822 "vpadd.u32 d22, d22, d23\n" in Pack()
2823 "vpadd.u32 d24, d24, d25\n" in Pack()
2824 "vpadd.u32 d16, d16, d18\n" in Pack()
2825 "vpadd.u32 d17, d20, d22\n" in Pack()
2826 "vpadd.u32 d18, d24, d24\n" in Pack()
2827 "vmul.i32 q8, q8, d0[0]\n" in Pack()
2828 "vmul.i32 q9, q9, d0[0]\n" in Pack()
2829 "vadd.i32 q8, q8, q1\n" in Pack()
2830 "vadd.i32 q9, q9, q1\n" in Pack()
2831 "vst1.32 {d16, d17, d18, d19}, [%[out]:64]\n" in Pack()
2853 "add r0, %[in], %[stride]\n" in Pack()
2854 "add r1, r0, %[stride]\n" in Pack()
2855 "add r2, r1, %[stride]\n" in Pack()
2856 "add r3, r2, %[stride]\n" in Pack()
2857 "vmov.i16 q8, #0\n" in Pack()
2858 "vmov.i16 q9, #0\n" in Pack()
2859 "vmov.i16 q10, #0\n" in Pack()
2860 "vmov.i16 q11, #0\n" in Pack()
2861 "vmov.i16 q12, #0\n" in Pack()
2864 "subs %[count], %[count], #7\n" in Pack()
2865 "beq 2f\n" in Pack()
2868 "subs %[count], %[count], #8\n" in Pack()
2871 "vld1.32 {d0}, [%[in]]!\n" in Pack()
2872 "vld1.32 {d1}, [r0]!\n" in Pack()
2873 "vld1.32 {d2}, [r1]!\n" in Pack()
2874 "vld1.32 {d3}, [r2]!\n" in Pack()
2875 "vld1.32 {d4}, [r3]!\n" in Pack()
2876 "vaddw.u8 q8, q8, d0\n" in Pack()
2877 "vaddw.u8 q9, q9, d1\n" in Pack()
2878 "vaddw.u8 q10, q10, d2\n" in Pack()
2879 "vaddw.u8 q11, q11, d3\n" in Pack()
2880 "vaddw.u8 q12, q12, d4\n" in Pack()
2881 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
2882 "vst1.32 {d4}, [%[out]:64]!\n" in Pack()
2884 "bne 1b\n" in Pack()
2889 "vmov.i8 d0, #0\n" in Pack()
2890 "vmov.i8 d1, #0\n" in Pack()
2891 "vmov.i8 d2, #0\n" in Pack()
2892 "vmov.i8 d3, #0\n" in Pack()
2893 "vmov.i8 d4, #0\n" in Pack()
2894 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
2895 "vld1.16 {d0[2]}, [%[in]]!\n" in Pack()
2896 "vld1.8 {d0[6]}, [%[in]]!\n" in Pack()
2897 "vld1.32 {d1[0]}, [r0]!\n" in Pack()
2898 "vld1.16 {d1[2]}, [r0]!\n" in Pack()
2899 "vld1.8 {d1[6]}, [r0]!\n" in Pack()
2900 "vld1.32 {d2[0]}, [r1]!\n" in Pack()
2901 "vld1.16 {d2[2]}, [r1]!\n" in Pack()
2902 "vld1.8 {d2[6]}, [r1]!\n" in Pack()
2903 "vld1.32 {d3[0]}, [r2]!\n" in Pack()
2904 "vld1.16 {d3[2]}, [r2]!\n" in Pack()
2905 "vld1.8 {d3[6]}, [r2]!\n" in Pack()
2906 "vld1.32 {d4[0]}, [r3]!\n" in Pack()
2907 "vld1.16 {d4[2]}, [r3]!\n" in Pack()
2908 "vld1.8 {d4[6]}, [r3]!\n" in Pack()
2909 "vaddw.u8 q8, q8, d0\n" in Pack()
2910 "vaddw.u8 q9, q9, d1\n" in Pack()
2911 "vaddw.u8 q10, q10, d2\n" in Pack()
2912 "vaddw.u8 q11, q11, d3\n" in Pack()
2913 "vaddw.u8 q12, q12, d4\n" in Pack()
2914 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
2915 "vst1.32 {d4}, [%[out]:64]!\n" in Pack()
2918 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
2919 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
2920 "vpaddl.u16 q8, q8\n" in Pack()
2921 "vpaddl.u16 q9, q9\n" in Pack()
2922 "vpaddl.u16 q10, q10\n" in Pack()
2923 "vpaddl.u16 q11, q11\n" in Pack()
2924 "vpaddl.u16 q12, q12\n" in Pack()
2925 "vpadd.u32 d16, d16, d17\n" in Pack()
2926 "vpadd.u32 d18, d18, d19\n" in Pack()
2927 "vpadd.u32 d20, d20, d21\n" in Pack()
2928 "vpadd.u32 d22, d22, d23\n" in Pack()
2929 "vpadd.u32 d24, d24, d25\n" in Pack()
2930 "vpadd.u32 d16, d16, d18\n" in Pack()
2931 "vpadd.u32 d17, d20, d22\n" in Pack()
2932 "vpadd.u32 d18, d24, d24\n" in Pack()
2933 "vmul.i32 q8, q8, d0[0]\n" in Pack()
2934 "vmul.i32 q9, q9, d0[0]\n" in Pack()
2935 "vadd.i32 q8, q8, q1\n" in Pack()
2936 "vadd.i32 q9, q9, q1\n" in Pack()
2937 "vst1.32 {d16, d17, d18, d19}, [%[out]:64]\n" in Pack()
2959 "add r0, %[in], %[stride]\n" in Pack()
2960 "add r1, r0, %[stride]\n" in Pack()
2961 "add r2, r1, %[stride]\n" in Pack()
2962 "add r3, r2, %[stride]\n" in Pack()
2963 "add r4, r3, %[stride]\n" in Pack()
2964 "vmov.i16 q8, #0\n" in Pack()
2965 "vmov.i16 q9, #0\n" in Pack()
2966 "vmov.i16 q10, #0\n" in Pack()
2967 "vmov.i16 q11, #0\n" in Pack()
2968 "vmov.i16 q12, #0\n" in Pack()
2969 "vmov.i16 q13, #0\n" in Pack()
2972 "subs %[count], %[count], #8\n" in Pack()
2975 "vld1.32 {d0}, [%[in]]!\n" in Pack()
2976 "vld1.32 {d1}, [r0]!\n" in Pack()
2977 "vld1.32 {d2}, [r1]!\n" in Pack()
2978 "vld1.32 {d3}, [r2]!\n" in Pack()
2979 "vld1.32 {d4}, [r3]!\n" in Pack()
2980 "vld1.32 {d5}, [r4]!\n" in Pack()
2981 "vaddw.u8 q8, q8, d0\n" in Pack()
2982 "vaddw.u8 q9, q9, d1\n" in Pack()
2983 "vaddw.u8 q10, q10, d2\n" in Pack()
2984 "vaddw.u8 q11, q11, d3\n" in Pack()
2985 "vaddw.u8 q12, q12, d4\n" in Pack()
2986 "vaddw.u8 q13, q13, d5\n" in Pack()
2987 "vst1.32 {d0, d1, d2, d3}, [%[out]:128]!\n" in Pack()
2988 "vst1.32 {d4, d5}, [%[out]:128]!\n" in Pack()
2990 "bne 1b\n" in Pack()
2993 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
2994 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
2995 "vpaddl.u16 q8, q8\n" in Pack()
2996 "vpaddl.u16 q9, q9\n" in Pack()
2997 "vpaddl.u16 q10, q10\n" in Pack()
2998 "vpaddl.u16 q11, q11\n" in Pack()
2999 "vpaddl.u16 q12, q12\n" in Pack()
3000 "vpaddl.u16 q13, q13\n" in Pack()
3001 "vpadd.u32 d16, d16, d17\n" in Pack()
3002 "vpadd.u32 d18, d18, d19\n" in Pack()
3003 "vpadd.u32 d20, d20, d21\n" in Pack()
3004 "vpadd.u32 d22, d22, d23\n" in Pack()
3005 "vpadd.u32 d24, d24, d25\n" in Pack()
3006 "vpadd.u32 d26, d26, d27\n" in Pack()
3007 "vpadd.u32 d16, d16, d18\n" in Pack()
3008 "vpadd.u32 d17, d20, d22\n" in Pack()
3009 "vpadd.u32 d18, d24, d26\n" in Pack()
3010 "vmul.i32 q8, q8, d0[0]\n" in Pack()
3011 "vmul.i32 q9, q9, d0[0]\n" in Pack()
3012 "vadd.i32 q8, q8, q1\n" in Pack()
3013 "vadd.i32 q9, q9, q1\n" in Pack()
3014 "vst1.32 {d16, d17, d18, d19}, [%[out]:128]\n" in Pack()
3037 "add r0, %[in], %[stride]\n" in Pack()
3038 "add r1, r0, %[stride]\n" in Pack()
3039 "add r2, r1, %[stride]\n" in Pack()
3040 "add r3, r2, %[stride]\n" in Pack()
3041 "add r4, r3, %[stride]\n" in Pack()
3042 "vmov.i16 q8, #0\n" in Pack()
3043 "vmov.i16 q9, #0\n" in Pack()
3044 "vmov.i16 q10, #0\n" in Pack()
3045 "vmov.i16 q11, #0\n" in Pack()
3046 "vmov.i16 q12, #0\n" in Pack()
3047 "vmov.i16 q13, #0\n" in Pack()
3050 "subs %[count], %[count], #1\n" in Pack()
3051 "beq 2f\n" in Pack()
3054 "subs %[count], %[count], #8\n" in Pack()
3057 "vld1.32 {d0}, [%[in]]!\n" in Pack()
3058 "vld1.32 {d1}, [r0]!\n" in Pack()
3059 "vld1.32 {d2}, [r1]!\n" in Pack()
3060 "vld1.32 {d3}, [r2]!\n" in Pack()
3061 "vld1.32 {d4}, [r3]!\n" in Pack()
3062 "vld1.32 {d5}, [r4]!\n" in Pack()
3063 "vaddw.u8 q8, q8, d0\n" in Pack()
3064 "vaddw.u8 q9, q9, d1\n" in Pack()
3065 "vaddw.u8 q10, q10, d2\n" in Pack()
3066 "vaddw.u8 q11, q11, d3\n" in Pack()
3067 "vaddw.u8 q12, q12, d4\n" in Pack()
3068 "vaddw.u8 q13, q13, d5\n" in Pack()
3069 "vst1.32 {d0, d1, d2, d3}, [%[out]:128]!\n" in Pack()
3070 "vst1.32 {d4, d5}, [%[out]:128]!\n" in Pack()
3072 "bne 1b\n" in Pack()
3077 "vmov.i8 d0, #0\n" in Pack()
3078 "vmov.i8 d1, #0\n" in Pack()
3079 "vmov.i8 d2, #0\n" in Pack()
3080 "vmov.i8 d3, #0\n" in Pack()
3081 "vmov.i8 d4, #0\n" in Pack()
3082 "vmov.i8 d5, #0\n" in Pack()
3083 "vld1.8 {d0[0]}, [%[in]]!\n" in Pack()
3084 "vld1.8 {d1[0]}, [r0]!\n" in Pack()
3085 "vld1.8 {d2[0]}, [r1]!\n" in Pack()
3086 "vld1.8 {d3[0]}, [r2]!\n" in Pack()
3087 "vld1.8 {d4[0]}, [r3]!\n" in Pack()
3088 "vld1.8 {d5[0]}, [r4]!\n" in Pack()
3089 "vaddw.u8 q8, q8, d0\n" in Pack()
3090 "vaddw.u8 q9, q9, d1\n" in Pack()
3091 "vaddw.u8 q10, q10, d2\n" in Pack()
3092 "vaddw.u8 q11, q11, d3\n" in Pack()
3093 "vaddw.u8 q12, q12, d4\n" in Pack()
3094 "vaddw.u8 q13, q13, d5\n" in Pack()
3095 "vst1.32 {d0, d1, d2, d3}, [%[out]:128]!\n" in Pack()
3096 "vst1.32 {d4, d5}, [%[out]:128]!\n" in Pack()
3099 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
3100 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
3101 "vpaddl.u16 q8, q8\n" in Pack()
3102 "vpaddl.u16 q9, q9\n" in Pack()
3103 "vpaddl.u16 q10, q10\n" in Pack()
3104 "vpaddl.u16 q11, q11\n" in Pack()
3105 "vpaddl.u16 q12, q12\n" in Pack()
3106 "vpaddl.u16 q13, q13\n" in Pack()
3107 "vpadd.u32 d16, d16, d17\n" in Pack()
3108 "vpadd.u32 d18, d18, d19\n" in Pack()
3109 "vpadd.u32 d20, d20, d21\n" in Pack()
3110 "vpadd.u32 d22, d22, d23\n" in Pack()
3111 "vpadd.u32 d24, d24, d25\n" in Pack()
3112 "vpadd.u32 d26, d26, d27\n" in Pack()
3113 "vpadd.u32 d16, d16, d18\n" in Pack()
3114 "vpadd.u32 d17, d20, d22\n" in Pack()
3115 "vpadd.u32 d18, d24, d26\n" in Pack()
3116 "vmul.i32 q8, q8, d0[0]\n" in Pack()
3117 "vmul.i32 q9, q9, d0[0]\n" in Pack()
3118 "vadd.i32 q8, q8, q1\n" in Pack()
3119 "vadd.i32 q9, q9, q1\n" in Pack()
3120 "vst1.32 {d16, d17, d18, d19}, [%[out]:128]\n" in Pack()
3143 "add r0, %[in], %[stride]\n" in Pack()
3144 "add r1, r0, %[stride]\n" in Pack()
3145 "add r2, r1, %[stride]\n" in Pack()
3146 "add r3, r2, %[stride]\n" in Pack()
3147 "add r4, r3, %[stride]\n" in Pack()
3148 "vmov.i16 q8, #0\n" in Pack()
3149 "vmov.i16 q9, #0\n" in Pack()
3150 "vmov.i16 q10, #0\n" in Pack()
3151 "vmov.i16 q11, #0\n" in Pack()
3152 "vmov.i16 q12, #0\n" in Pack()
3153 "vmov.i16 q13, #0\n" in Pack()
3156 "subs %[count], %[count], #2\n" in Pack()
3157 "beq 2f\n" in Pack()
3160 "subs %[count], %[count], #8\n" in Pack()
3163 "vld1.32 {d0}, [%[in]]!\n" in Pack()
3164 "vld1.32 {d1}, [r0]!\n" in Pack()
3165 "vld1.32 {d2}, [r1]!\n" in Pack()
3166 "vld1.32 {d3}, [r2]!\n" in Pack()
3167 "vld1.32 {d4}, [r3]!\n" in Pack()
3168 "vld1.32 {d5}, [r4]!\n" in Pack()
3169 "vaddw.u8 q8, q8, d0\n" in Pack()
3170 "vaddw.u8 q9, q9, d1\n" in Pack()
3171 "vaddw.u8 q10, q10, d2\n" in Pack()
3172 "vaddw.u8 q11, q11, d3\n" in Pack()
3173 "vaddw.u8 q12, q12, d4\n" in Pack()
3174 "vaddw.u8 q13, q13, d5\n" in Pack()
3175 "vst1.32 {d0, d1, d2, d3}, [%[out]:128]!\n" in Pack()
3176 "vst1.32 {d4, d5}, [%[out]:128]!\n" in Pack()
3178 "bne 1b\n" in Pack()
3183 "vmov.i8 d0, #0\n" in Pack()
3184 "vmov.i8 d1, #0\n" in Pack()
3185 "vmov.i8 d2, #0\n" in Pack()
3186 "vmov.i8 d3, #0\n" in Pack()
3187 "vmov.i8 d4, #0\n" in Pack()
3188 "vmov.i8 d5, #0\n" in Pack()
3189 "vld1.16 {d0[0]}, [%[in]]!\n" in Pack()
3190 "vld1.16 {d1[0]}, [r0]!\n" in Pack()
3191 "vld1.16 {d2[0]}, [r1]!\n" in Pack()
3192 "vld1.16 {d3[0]}, [r2]!\n" in Pack()
3193 "vld1.16 {d4[0]}, [r3]!\n" in Pack()
3194 "vld1.16 {d5[0]}, [r4]!\n" in Pack()
3195 "vaddw.u8 q8, q8, d0\n" in Pack()
3196 "vaddw.u8 q9, q9, d1\n" in Pack()
3197 "vaddw.u8 q10, q10, d2\n" in Pack()
3198 "vaddw.u8 q11, q11, d3\n" in Pack()
3199 "vaddw.u8 q12, q12, d4\n" in Pack()
3200 "vaddw.u8 q13, q13, d5\n" in Pack()
3201 "vst1.32 {d0, d1, d2, d3}, [%[out]:128]!\n" in Pack()
3202 "vst1.32 {d4, d5}, [%[out]:128]!\n" in Pack()
3205 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
3206 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
3207 "vpaddl.u16 q8, q8\n" in Pack()
3208 "vpaddl.u16 q9, q9\n" in Pack()
3209 "vpaddl.u16 q10, q10\n" in Pack()
3210 "vpaddl.u16 q11, q11\n" in Pack()
3211 "vpaddl.u16 q12, q12\n" in Pack()
3212 "vpaddl.u16 q13, q13\n" in Pack()
3213 "vpadd.u32 d16, d16, d17\n" in Pack()
3214 "vpadd.u32 d18, d18, d19\n" in Pack()
3215 "vpadd.u32 d20, d20, d21\n" in Pack()
3216 "vpadd.u32 d22, d22, d23\n" in Pack()
3217 "vpadd.u32 d24, d24, d25\n" in Pack()
3218 "vpadd.u32 d26, d26, d27\n" in Pack()
3219 "vpadd.u32 d16, d16, d18\n" in Pack()
3220 "vpadd.u32 d17, d20, d22\n" in Pack()
3221 "vpadd.u32 d18, d24, d26\n" in Pack()
3222 "vmul.i32 q8, q8, d0[0]\n" in Pack()
3223 "vmul.i32 q9, q9, d0[0]\n" in Pack()
3224 "vadd.i32 q8, q8, q1\n" in Pack()
3225 "vadd.i32 q9, q9, q1\n" in Pack()
3226 "vst1.32 {d16, d17, d18, d19}, [%[out]:128]\n" in Pack()
3249 "add r0, %[in], %[stride]\n" in Pack()
3250 "add r1, r0, %[stride]\n" in Pack()
3251 "add r2, r1, %[stride]\n" in Pack()
3252 "add r3, r2, %[stride]\n" in Pack()
3253 "add r4, r3, %[stride]\n" in Pack()
3254 "vmov.i16 q8, #0\n" in Pack()
3255 "vmov.i16 q9, #0\n" in Pack()
3256 "vmov.i16 q10, #0\n" in Pack()
3257 "vmov.i16 q11, #0\n" in Pack()
3258 "vmov.i16 q12, #0\n" in Pack()
3259 "vmov.i16 q13, #0\n" in Pack()
3262 "subs %[count], %[count], #3\n" in Pack()
3263 "beq 2f\n" in Pack()
3266 "subs %[count], %[count], #8\n" in Pack()
3269 "vld1.32 {d0}, [%[in]]!\n" in Pack()
3270 "vld1.32 {d1}, [r0]!\n" in Pack()
3271 "vld1.32 {d2}, [r1]!\n" in Pack()
3272 "vld1.32 {d3}, [r2]!\n" in Pack()
3273 "vld1.32 {d4}, [r3]!\n" in Pack()
3274 "vld1.32 {d5}, [r4]!\n" in Pack()
3275 "vaddw.u8 q8, q8, d0\n" in Pack()
3276 "vaddw.u8 q9, q9, d1\n" in Pack()
3277 "vaddw.u8 q10, q10, d2\n" in Pack()
3278 "vaddw.u8 q11, q11, d3\n" in Pack()
3279 "vaddw.u8 q12, q12, d4\n" in Pack()
3280 "vaddw.u8 q13, q13, d5\n" in Pack()
3281 "vst1.32 {d0, d1, d2, d3}, [%[out]:128]!\n" in Pack()
3282 "vst1.32 {d4, d5}, [%[out]:128]!\n" in Pack()
3284 "bne 1b\n" in Pack()
3289 "vmov.i8 d0, #0\n" in Pack()
3290 "vmov.i8 d1, #0\n" in Pack()
3291 "vmov.i8 d2, #0\n" in Pack()
3292 "vmov.i8 d3, #0\n" in Pack()
3293 "vmov.i8 d4, #0\n" in Pack()
3294 "vmov.i8 d5, #0\n" in Pack()
3295 "vld1.16 {d0[0]}, [%[in]]!\n" in Pack()
3296 "vld1.8 {d0[2]}, [%[in]]!\n" in Pack()
3297 "vld1.16 {d1[0]}, [r0]!\n" in Pack()
3298 "vld1.8 {d1[2]}, [r0]!\n" in Pack()
3299 "vld1.16 {d2[0]}, [r1]!\n" in Pack()
3300 "vld1.8 {d2[2]}, [r1]!\n" in Pack()
3301 "vld1.16 {d3[0]}, [r2]!\n" in Pack()
3302 "vld1.8 {d3[2]}, [r2]!\n" in Pack()
3303 "vld1.16 {d4[0]}, [r3]!\n" in Pack()
3304 "vld1.8 {d4[2]}, [r3]!\n" in Pack()
3305 "vld1.16 {d5[0]}, [r4]!\n" in Pack()
3306 "vld1.8 {d5[2]}, [r4]!\n" in Pack()
3307 "vaddw.u8 q8, q8, d0\n" in Pack()
3308 "vaddw.u8 q9, q9, d1\n" in Pack()
3309 "vaddw.u8 q10, q10, d2\n" in Pack()
3310 "vaddw.u8 q11, q11, d3\n" in Pack()
3311 "vaddw.u8 q12, q12, d4\n" in Pack()
3312 "vaddw.u8 q13, q13, d5\n" in Pack()
3313 "vst1.32 {d0, d1, d2, d3}, [%[out]:128]!\n" in Pack()
3314 "vst1.32 {d4, d5}, [%[out]:128]!\n" in Pack()
3317 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
3318 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
3319 "vpaddl.u16 q8, q8\n" in Pack()
3320 "vpaddl.u16 q9, q9\n" in Pack()
3321 "vpaddl.u16 q10, q10\n" in Pack()
3322 "vpaddl.u16 q11, q11\n" in Pack()
3323 "vpaddl.u16 q12, q12\n" in Pack()
3324 "vpaddl.u16 q13, q13\n" in Pack()
3325 "vpadd.u32 d16, d16, d17\n" in Pack()
3326 "vpadd.u32 d18, d18, d19\n" in Pack()
3327 "vpadd.u32 d20, d20, d21\n" in Pack()
3328 "vpadd.u32 d22, d22, d23\n" in Pack()
3329 "vpadd.u32 d24, d24, d25\n" in Pack()
3330 "vpadd.u32 d26, d26, d27\n" in Pack()
3331 "vpadd.u32 d16, d16, d18\n" in Pack()
3332 "vpadd.u32 d17, d20, d22\n" in Pack()
3333 "vpadd.u32 d18, d24, d26\n" in Pack()
3334 "vmul.i32 q8, q8, d0[0]\n" in Pack()
3335 "vmul.i32 q9, q9, d0[0]\n" in Pack()
3336 "vadd.i32 q8, q8, q1\n" in Pack()
3337 "vadd.i32 q9, q9, q1\n" in Pack()
3338 "vst1.32 {d16, d17, d18, d19}, [%[out]:128]\n" in Pack()
3361 "add r0, %[in], %[stride]\n" in Pack()
3362 "add r1, r0, %[stride]\n" in Pack()
3363 "add r2, r1, %[stride]\n" in Pack()
3364 "add r3, r2, %[stride]\n" in Pack()
3365 "add r4, r3, %[stride]\n" in Pack()
3366 "vmov.i16 q8, #0\n" in Pack()
3367 "vmov.i16 q9, #0\n" in Pack()
3368 "vmov.i16 q10, #0\n" in Pack()
3369 "vmov.i16 q11, #0\n" in Pack()
3370 "vmov.i16 q12, #0\n" in Pack()
3371 "vmov.i16 q13, #0\n" in Pack()
3374 "subs %[count], %[count], #4\n" in Pack()
3375 "beq 2f\n" in Pack()
3378 "subs %[count], %[count], #8\n" in Pack()
3381 "vld1.32 {d0}, [%[in]]!\n" in Pack()
3382 "vld1.32 {d1}, [r0]!\n" in Pack()
3383 "vld1.32 {d2}, [r1]!\n" in Pack()
3384 "vld1.32 {d3}, [r2]!\n" in Pack()
3385 "vld1.32 {d4}, [r3]!\n" in Pack()
3386 "vld1.32 {d5}, [r4]!\n" in Pack()
3387 "vaddw.u8 q8, q8, d0\n" in Pack()
3388 "vaddw.u8 q9, q9, d1\n" in Pack()
3389 "vaddw.u8 q10, q10, d2\n" in Pack()
3390 "vaddw.u8 q11, q11, d3\n" in Pack()
3391 "vaddw.u8 q12, q12, d4\n" in Pack()
3392 "vaddw.u8 q13, q13, d5\n" in Pack()
3393 "vst1.32 {d0, d1, d2, d3}, [%[out]:128]!\n" in Pack()
3394 "vst1.32 {d4, d5}, [%[out]:128]!\n" in Pack()
3396 "bne 1b\n" in Pack()
3401 "vmov.i8 d0, #0\n" in Pack()
3402 "vmov.i8 d1, #0\n" in Pack()
3403 "vmov.i8 d2, #0\n" in Pack()
3404 "vmov.i8 d3, #0\n" in Pack()
3405 "vmov.i8 d4, #0\n" in Pack()
3406 "vmov.i8 d5, #0\n" in Pack()
3407 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
3408 "vld1.32 {d1[0]}, [r0]!\n" in Pack()
3409 "vld1.32 {d2[0]}, [r1]!\n" in Pack()
3410 "vld1.32 {d3[0]}, [r2]!\n" in Pack()
3411 "vld1.32 {d4[0]}, [r3]!\n" in Pack()
3412 "vld1.32 {d5[0]}, [r4]!\n" in Pack()
3413 "vaddw.u8 q8, q8, d0\n" in Pack()
3414 "vaddw.u8 q9, q9, d1\n" in Pack()
3415 "vaddw.u8 q10, q10, d2\n" in Pack()
3416 "vaddw.u8 q11, q11, d3\n" in Pack()
3417 "vaddw.u8 q12, q12, d4\n" in Pack()
3418 "vaddw.u8 q13, q13, d5\n" in Pack()
3419 "vst1.32 {d0, d1, d2, d3}, [%[out]:128]!\n" in Pack()
3420 "vst1.32 {d4, d5}, [%[out]:128]!\n" in Pack()
3423 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
3424 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
3425 "vpaddl.u16 q8, q8\n" in Pack()
3426 "vpaddl.u16 q9, q9\n" in Pack()
3427 "vpaddl.u16 q10, q10\n" in Pack()
3428 "vpaddl.u16 q11, q11\n" in Pack()
3429 "vpaddl.u16 q12, q12\n" in Pack()
3430 "vpaddl.u16 q13, q13\n" in Pack()
3431 "vpadd.u32 d16, d16, d17\n" in Pack()
3432 "vpadd.u32 d18, d18, d19\n" in Pack()
3433 "vpadd.u32 d20, d20, d21\n" in Pack()
3434 "vpadd.u32 d22, d22, d23\n" in Pack()
3435 "vpadd.u32 d24, d24, d25\n" in Pack()
3436 "vpadd.u32 d26, d26, d27\n" in Pack()
3437 "vpadd.u32 d16, d16, d18\n" in Pack()
3438 "vpadd.u32 d17, d20, d22\n" in Pack()
3439 "vpadd.u32 d18, d24, d26\n" in Pack()
3440 "vmul.i32 q8, q8, d0[0]\n" in Pack()
3441 "vmul.i32 q9, q9, d0[0]\n" in Pack()
3442 "vadd.i32 q8, q8, q1\n" in Pack()
3443 "vadd.i32 q9, q9, q1\n" in Pack()
3444 "vst1.32 {d16, d17, d18, d19}, [%[out]:128]\n" in Pack()
3467 "add r0, %[in], %[stride]\n" in Pack()
3468 "add r1, r0, %[stride]\n" in Pack()
3469 "add r2, r1, %[stride]\n" in Pack()
3470 "add r3, r2, %[stride]\n" in Pack()
3471 "add r4, r3, %[stride]\n" in Pack()
3472 "vmov.i16 q8, #0\n" in Pack()
3473 "vmov.i16 q9, #0\n" in Pack()
3474 "vmov.i16 q10, #0\n" in Pack()
3475 "vmov.i16 q11, #0\n" in Pack()
3476 "vmov.i16 q12, #0\n" in Pack()
3477 "vmov.i16 q13, #0\n" in Pack()
3480 "subs %[count], %[count], #5\n" in Pack()
3481 "beq 2f\n" in Pack()
3484 "subs %[count], %[count], #8\n" in Pack()
3487 "vld1.32 {d0}, [%[in]]!\n" in Pack()
3488 "vld1.32 {d1}, [r0]!\n" in Pack()
3489 "vld1.32 {d2}, [r1]!\n" in Pack()
3490 "vld1.32 {d3}, [r2]!\n" in Pack()
3491 "vld1.32 {d4}, [r3]!\n" in Pack()
3492 "vld1.32 {d5}, [r4]!\n" in Pack()
3493 "vaddw.u8 q8, q8, d0\n" in Pack()
3494 "vaddw.u8 q9, q9, d1\n" in Pack()
3495 "vaddw.u8 q10, q10, d2\n" in Pack()
3496 "vaddw.u8 q11, q11, d3\n" in Pack()
3497 "vaddw.u8 q12, q12, d4\n" in Pack()
3498 "vaddw.u8 q13, q13, d5\n" in Pack()
3499 "vst1.32 {d0, d1, d2, d3}, [%[out]:128]!\n" in Pack()
3500 "vst1.32 {d4, d5}, [%[out]:128]!\n" in Pack()
3502 "bne 1b\n" in Pack()
3507 "vmov.i8 d0, #0\n" in Pack()
3508 "vmov.i8 d1, #0\n" in Pack()
3509 "vmov.i8 d2, #0\n" in Pack()
3510 "vmov.i8 d3, #0\n" in Pack()
3511 "vmov.i8 d4, #0\n" in Pack()
3512 "vmov.i8 d5, #0\n" in Pack()
3513 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
3514 "vld1.8 {d0[4]}, [%[in]]!\n" in Pack()
3515 "vld1.32 {d1[0]}, [r0]!\n" in Pack()
3516 "vld1.8 {d1[4]}, [r0]!\n" in Pack()
3517 "vld1.32 {d2[0]}, [r1]!\n" in Pack()
3518 "vld1.8 {d2[4]}, [r1]!\n" in Pack()
3519 "vld1.32 {d3[0]}, [r2]!\n" in Pack()
3520 "vld1.8 {d3[4]}, [r2]!\n" in Pack()
3521 "vld1.32 {d4[0]}, [r3]!\n" in Pack()
3522 "vld1.8 {d4[4]}, [r3]!\n" in Pack()
3523 "vld1.32 {d5[0]}, [r4]!\n" in Pack()
3524 "vld1.8 {d5[4]}, [r4]!\n" in Pack()
3525 "vaddw.u8 q8, q8, d0\n" in Pack()
3526 "vaddw.u8 q9, q9, d1\n" in Pack()
3527 "vaddw.u8 q10, q10, d2\n" in Pack()
3528 "vaddw.u8 q11, q11, d3\n" in Pack()
3529 "vaddw.u8 q12, q12, d4\n" in Pack()
3530 "vaddw.u8 q13, q13, d5\n" in Pack()
3531 "vst1.32 {d0, d1, d2, d3}, [%[out]:128]!\n" in Pack()
3532 "vst1.32 {d4, d5}, [%[out]:128]!\n" in Pack()
3535 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
3536 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
3537 "vpaddl.u16 q8, q8\n" in Pack()
3538 "vpaddl.u16 q9, q9\n" in Pack()
3539 "vpaddl.u16 q10, q10\n" in Pack()
3540 "vpaddl.u16 q11, q11\n" in Pack()
3541 "vpaddl.u16 q12, q12\n" in Pack()
3542 "vpaddl.u16 q13, q13\n" in Pack()
3543 "vpadd.u32 d16, d16, d17\n" in Pack()
3544 "vpadd.u32 d18, d18, d19\n" in Pack()
3545 "vpadd.u32 d20, d20, d21\n" in Pack()
3546 "vpadd.u32 d22, d22, d23\n" in Pack()
3547 "vpadd.u32 d24, d24, d25\n" in Pack()
3548 "vpadd.u32 d26, d26, d27\n" in Pack()
3549 "vpadd.u32 d16, d16, d18\n" in Pack()
3550 "vpadd.u32 d17, d20, d22\n" in Pack()
3551 "vpadd.u32 d18, d24, d26\n" in Pack()
3552 "vmul.i32 q8, q8, d0[0]\n" in Pack()
3553 "vmul.i32 q9, q9, d0[0]\n" in Pack()
3554 "vadd.i32 q8, q8, q1\n" in Pack()
3555 "vadd.i32 q9, q9, q1\n" in Pack()
3556 "vst1.32 {d16, d17, d18, d19}, [%[out]:128]\n" in Pack()
3579 "add r0, %[in], %[stride]\n" in Pack()
3580 "add r1, r0, %[stride]\n" in Pack()
3581 "add r2, r1, %[stride]\n" in Pack()
3582 "add r3, r2, %[stride]\n" in Pack()
3583 "add r4, r3, %[stride]\n" in Pack()
3584 "vmov.i16 q8, #0\n" in Pack()
3585 "vmov.i16 q9, #0\n" in Pack()
3586 "vmov.i16 q10, #0\n" in Pack()
3587 "vmov.i16 q11, #0\n" in Pack()
3588 "vmov.i16 q12, #0\n" in Pack()
3589 "vmov.i16 q13, #0\n" in Pack()
3592 "subs %[count], %[count], #6\n" in Pack()
3593 "beq 2f\n" in Pack()
3596 "subs %[count], %[count], #8\n" in Pack()
3599 "vld1.32 {d0}, [%[in]]!\n" in Pack()
3600 "vld1.32 {d1}, [r0]!\n" in Pack()
3601 "vld1.32 {d2}, [r1]!\n" in Pack()
3602 "vld1.32 {d3}, [r2]!\n" in Pack()
3603 "vld1.32 {d4}, [r3]!\n" in Pack()
3604 "vld1.32 {d5}, [r4]!\n" in Pack()
3605 "vaddw.u8 q8, q8, d0\n" in Pack()
3606 "vaddw.u8 q9, q9, d1\n" in Pack()
3607 "vaddw.u8 q10, q10, d2\n" in Pack()
3608 "vaddw.u8 q11, q11, d3\n" in Pack()
3609 "vaddw.u8 q12, q12, d4\n" in Pack()
3610 "vaddw.u8 q13, q13, d5\n" in Pack()
3611 "vst1.32 {d0, d1, d2, d3}, [%[out]:128]!\n" in Pack()
3612 "vst1.32 {d4, d5}, [%[out]:128]!\n" in Pack()
3614 "bne 1b\n" in Pack()
3619 "vmov.i8 d0, #0\n" in Pack()
3620 "vmov.i8 d1, #0\n" in Pack()
3621 "vmov.i8 d2, #0\n" in Pack()
3622 "vmov.i8 d3, #0\n" in Pack()
3623 "vmov.i8 d4, #0\n" in Pack()
3624 "vmov.i8 d5, #0\n" in Pack()
3625 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
3626 "vld1.16 {d0[2]}, [%[in]]!\n" in Pack()
3627 "vld1.32 {d1[0]}, [r0]!\n" in Pack()
3628 "vld1.16 {d1[2]}, [r0]!\n" in Pack()
3629 "vld1.32 {d2[0]}, [r1]!\n" in Pack()
3630 "vld1.16 {d2[2]}, [r1]!\n" in Pack()
3631 "vld1.32 {d3[0]}, [r2]!\n" in Pack()
3632 "vld1.16 {d3[2]}, [r2]!\n" in Pack()
3633 "vld1.32 {d4[0]}, [r3]!\n" in Pack()
3634 "vld1.16 {d4[2]}, [r3]!\n" in Pack()
3635 "vld1.32 {d5[0]}, [r4]!\n" in Pack()
3636 "vld1.16 {d5[2]}, [r4]!\n" in Pack()
3637 "vaddw.u8 q8, q8, d0\n" in Pack()
3638 "vaddw.u8 q9, q9, d1\n" in Pack()
3639 "vaddw.u8 q10, q10, d2\n" in Pack()
3640 "vaddw.u8 q11, q11, d3\n" in Pack()
3641 "vaddw.u8 q12, q12, d4\n" in Pack()
3642 "vaddw.u8 q13, q13, d5\n" in Pack()
3643 "vst1.32 {d0, d1, d2, d3}, [%[out]:128]!\n" in Pack()
3644 "vst1.32 {d4, d5}, [%[out]:128]!\n" in Pack()
3647 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
3648 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
3649 "vpaddl.u16 q8, q8\n" in Pack()
3650 "vpaddl.u16 q9, q9\n" in Pack()
3651 "vpaddl.u16 q10, q10\n" in Pack()
3652 "vpaddl.u16 q11, q11\n" in Pack()
3653 "vpaddl.u16 q12, q12\n" in Pack()
3654 "vpaddl.u16 q13, q13\n" in Pack()
3655 "vpadd.u32 d16, d16, d17\n" in Pack()
3656 "vpadd.u32 d18, d18, d19\n" in Pack()
3657 "vpadd.u32 d20, d20, d21\n" in Pack()
3658 "vpadd.u32 d22, d22, d23\n" in Pack()
3659 "vpadd.u32 d24, d24, d25\n" in Pack()
3660 "vpadd.u32 d26, d26, d27\n" in Pack()
3661 "vpadd.u32 d16, d16, d18\n" in Pack()
3662 "vpadd.u32 d17, d20, d22\n" in Pack()
3663 "vpadd.u32 d18, d24, d26\n" in Pack()
3664 "vmul.i32 q8, q8, d0[0]\n" in Pack()
3665 "vmul.i32 q9, q9, d0[0]\n" in Pack()
3666 "vadd.i32 q8, q8, q1\n" in Pack()
3667 "vadd.i32 q9, q9, q1\n" in Pack()
3668 "vst1.32 {d16, d17, d18, d19}, [%[out]:128]\n" in Pack()
3691 "add r0, %[in], %[stride]\n" in Pack()
3692 "add r1, r0, %[stride]\n" in Pack()
3693 "add r2, r1, %[stride]\n" in Pack()
3694 "add r3, r2, %[stride]\n" in Pack()
3695 "add r4, r3, %[stride]\n" in Pack()
3696 "vmov.i16 q8, #0\n" in Pack()
3697 "vmov.i16 q9, #0\n" in Pack()
3698 "vmov.i16 q10, #0\n" in Pack()
3699 "vmov.i16 q11, #0\n" in Pack()
3700 "vmov.i16 q12, #0\n" in Pack()
3701 "vmov.i16 q13, #0\n" in Pack()
3704 "subs %[count], %[count], #7\n" in Pack()
3705 "beq 2f\n" in Pack()
3708 "subs %[count], %[count], #8\n" in Pack()
3711 "vld1.32 {d0}, [%[in]]!\n" in Pack()
3712 "vld1.32 {d1}, [r0]!\n" in Pack()
3713 "vld1.32 {d2}, [r1]!\n" in Pack()
3714 "vld1.32 {d3}, [r2]!\n" in Pack()
3715 "vld1.32 {d4}, [r3]!\n" in Pack()
3716 "vld1.32 {d5}, [r4]!\n" in Pack()
3717 "vaddw.u8 q8, q8, d0\n" in Pack()
3718 "vaddw.u8 q9, q9, d1\n" in Pack()
3719 "vaddw.u8 q10, q10, d2\n" in Pack()
3720 "vaddw.u8 q11, q11, d3\n" in Pack()
3721 "vaddw.u8 q12, q12, d4\n" in Pack()
3722 "vaddw.u8 q13, q13, d5\n" in Pack()
3723 "vst1.32 {d0, d1, d2, d3}, [%[out]:128]!\n" in Pack()
3724 "vst1.32 {d4, d5}, [%[out]:128]!\n" in Pack()
3726 "bne 1b\n" in Pack()
3731 "vmov.i8 d0, #0\n" in Pack()
3732 "vmov.i8 d1, #0\n" in Pack()
3733 "vmov.i8 d2, #0\n" in Pack()
3734 "vmov.i8 d3, #0\n" in Pack()
3735 "vmov.i8 d4, #0\n" in Pack()
3736 "vmov.i8 d5, #0\n" in Pack()
3737 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
3738 "vld1.16 {d0[2]}, [%[in]]!\n" in Pack()
3739 "vld1.8 {d0[6]}, [%[in]]!\n" in Pack()
3740 "vld1.32 {d1[0]}, [r0]!\n" in Pack()
3741 "vld1.16 {d1[2]}, [r0]!\n" in Pack()
3742 "vld1.8 {d1[6]}, [r0]!\n" in Pack()
3743 "vld1.32 {d2[0]}, [r1]!\n" in Pack()
3744 "vld1.16 {d2[2]}, [r1]!\n" in Pack()
3745 "vld1.8 {d2[6]}, [r1]!\n" in Pack()
3746 "vld1.32 {d3[0]}, [r2]!\n" in Pack()
3747 "vld1.16 {d3[2]}, [r2]!\n" in Pack()
3748 "vld1.8 {d3[6]}, [r2]!\n" in Pack()
3749 "vld1.32 {d4[0]}, [r3]!\n" in Pack()
3750 "vld1.16 {d4[2]}, [r3]!\n" in Pack()
3751 "vld1.8 {d4[6]}, [r3]!\n" in Pack()
3752 "vld1.32 {d5[0]}, [r4]!\n" in Pack()
3753 "vld1.16 {d5[2]}, [r4]!\n" in Pack()
3754 "vld1.8 {d5[6]}, [r4]!\n" in Pack()
3755 "vaddw.u8 q8, q8, d0\n" in Pack()
3756 "vaddw.u8 q9, q9, d1\n" in Pack()
3757 "vaddw.u8 q10, q10, d2\n" in Pack()
3758 "vaddw.u8 q11, q11, d3\n" in Pack()
3759 "vaddw.u8 q12, q12, d4\n" in Pack()
3760 "vaddw.u8 q13, q13, d5\n" in Pack()
3761 "vst1.32 {d0, d1, d2, d3}, [%[out]:128]!\n" in Pack()
3762 "vst1.32 {d4, d5}, [%[out]:128]!\n" in Pack()
3765 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
3766 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
3767 "vpaddl.u16 q8, q8\n" in Pack()
3768 "vpaddl.u16 q9, q9\n" in Pack()
3769 "vpaddl.u16 q10, q10\n" in Pack()
3770 "vpaddl.u16 q11, q11\n" in Pack()
3771 "vpaddl.u16 q12, q12\n" in Pack()
3772 "vpaddl.u16 q13, q13\n" in Pack()
3773 "vpadd.u32 d16, d16, d17\n" in Pack()
3774 "vpadd.u32 d18, d18, d19\n" in Pack()
3775 "vpadd.u32 d20, d20, d21\n" in Pack()
3776 "vpadd.u32 d22, d22, d23\n" in Pack()
3777 "vpadd.u32 d24, d24, d25\n" in Pack()
3778 "vpadd.u32 d26, d26, d27\n" in Pack()
3779 "vpadd.u32 d16, d16, d18\n" in Pack()
3780 "vpadd.u32 d17, d20, d22\n" in Pack()
3781 "vpadd.u32 d18, d24, d26\n" in Pack()
3782 "vmul.i32 q8, q8, d0[0]\n" in Pack()
3783 "vmul.i32 q9, q9, d0[0]\n" in Pack()
3784 "vadd.i32 q8, q8, q1\n" in Pack()
3785 "vadd.i32 q9, q9, q1\n" in Pack()
3786 "vst1.32 {d16, d17, d18, d19}, [%[out]:128]\n" in Pack()
3809 "add r0, %[in], %[stride]\n" in Pack()
3810 "add r1, r0, %[stride]\n" in Pack()
3811 "add r2, r1, %[stride]\n" in Pack()
3812 "add r3, r2, %[stride]\n" in Pack()
3813 "add r4, r3, %[stride]\n" in Pack()
3814 "add r5, r4, %[stride]\n" in Pack()
3815 "vmov.i16 q8, #0\n" in Pack()
3816 "vmov.i16 q9, #0\n" in Pack()
3817 "vmov.i16 q10, #0\n" in Pack()
3818 "vmov.i16 q11, #0\n" in Pack()
3819 "vmov.i16 q12, #0\n" in Pack()
3820 "vmov.i16 q13, #0\n" in Pack()
3821 "vmov.i16 q14, #0\n" in Pack()
3824 "subs %[count], %[count], #8\n" in Pack()
3827 "vld1.32 {d0}, [%[in]]!\n" in Pack()
3828 "vld1.32 {d1}, [r0]!\n" in Pack()
3829 "vld1.32 {d2}, [r1]!\n" in Pack()
3830 "vld1.32 {d3}, [r2]!\n" in Pack()
3831 "vld1.32 {d4}, [r3]!\n" in Pack()
3832 "vld1.32 {d5}, [r4]!\n" in Pack()
3833 "vld1.32 {d6}, [r5]!\n" in Pack()
3834 "vaddw.u8 q8, q8, d0\n" in Pack()
3835 "vaddw.u8 q9, q9, d1\n" in Pack()
3836 "vaddw.u8 q10, q10, d2\n" in Pack()
3837 "vaddw.u8 q11, q11, d3\n" in Pack()
3838 "vaddw.u8 q12, q12, d4\n" in Pack()
3839 "vaddw.u8 q13, q13, d5\n" in Pack()
3840 "vaddw.u8 q14, q14, d6\n" in Pack()
3841 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
3842 "vst1.32 {d4, d5, d6}, [%[out]:64]!\n" in Pack()
3844 "bne 1b\n" in Pack()
3847 "ldr r0, %[multiplicative_sum_offset]\n" in Pack()
3848 "ldr r1, %[additive_sum_offset]\n" in Pack()
3849 "vmov.32 d0[0], r0\n" in Pack()
3850 "vdup.32 q1, r1\n" in Pack()
3851 "vpaddl.u16 q8, q8\n" in Pack()
3852 "vpaddl.u16 q9, q9\n" in Pack()
3853 "vpaddl.u16 q10, q10\n" in Pack()
3854 "vpaddl.u16 q11, q11\n" in Pack()
3855 "vpaddl.u16 q12, q12\n" in Pack()
3856 "vpaddl.u16 q13, q13\n" in Pack()
3857 "vpaddl.u16 q14, q14\n" in Pack()
3858 "vpadd.u32 d16, d16, d17\n" in Pack()
3859 "vpadd.u32 d18, d18, d19\n" in Pack()
3860 "vpadd.u32 d20, d20, d21\n" in Pack()
3861 "vpadd.u32 d22, d22, d23\n" in Pack()
3862 "vpadd.u32 d24, d24, d25\n" in Pack()
3863 "vpadd.u32 d26, d26, d27\n" in Pack()
3864 "vpadd.u32 d28, d28, d29\n" in Pack()
3865 "vpadd.u32 d16, d16, d18\n" in Pack()
3866 "vpadd.u32 d17, d20, d22\n" in Pack()
3867 "vpadd.u32 d18, d24, d26\n" in Pack()
3868 "vpadd.u32 d19, d28, d28\n" in Pack()
3869 "vmul.i32 q8, q8, d0[0]\n" in Pack()
3870 "vmul.i32 q9, q9, d0[0]\n" in Pack()
3871 "vadd.i32 q8, q8, q1\n" in Pack()
3872 "vadd.i32 q9, q9, q1\n" in Pack()
3873 "vst1.32 {d16, d17, d18, d19}, [%[out]:64]\n" in Pack()
3896 "add r0, %[in], %[stride]\n" in Pack()
3897 "add r1, r0, %[stride]\n" in Pack()
3898 "add r2, r1, %[stride]\n" in Pack()
3899 "add r3, r2, %[stride]\n" in Pack()
3900 "add r4, r3, %[stride]\n" in Pack()
3901 "add r5, r4, %[stride]\n" in Pack()
3902 "vmov.i16 q8, #0\n" in Pack()
3903 "vmov.i16 q9, #0\n" in Pack()
3904 "vmov.i16 q10, #0\n" in Pack()
3905 "vmov.i16 q11, #0\n" in Pack()
3906 "vmov.i16 q12, #0\n" in Pack()
3907 "vmov.i16 q13, #0\n" in Pack()
3908 "vmov.i16 q14, #0\n" in Pack()
3911 "subs %[count], %[count], #1\n" in Pack()
3912 "beq 2f\n" in Pack()
3915 "subs %[count], %[count], #8\n" in Pack()
3918 "vld1.32 {d0}, [%[in]]!\n" in Pack()
3919 "vld1.32 {d1}, [r0]!\n" in Pack()
3920 "vld1.32 {d2}, [r1]!\n" in Pack()
3921 "vld1.32 {d3}, [r2]!\n" in Pack()
3922 "vld1.32 {d4}, [r3]!\n" in Pack()
3923 "vld1.32 {d5}, [r4]!\n" in Pack()
3924 "vld1.32 {d6}, [r5]!\n" in Pack()
3925 "vaddw.u8 q8, q8, d0\n" in Pack()
3926 "vaddw.u8 q9, q9, d1\n" in Pack()
3927 "vaddw.u8 q10, q10, d2\n" in Pack()
3928 "vaddw.u8 q11, q11, d3\n" in Pack()
3929 "vaddw.u8 q12, q12, d4\n" in Pack()
3930 "vaddw.u8 q13, q13, d5\n" in Pack()
3931 "vaddw.u8 q14, q14, d6\n" in Pack()
3932 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
3933 "vst1.32 {d4, d5, d6}, [%[out]:64]!\n" in Pack()
3935 "bne 1b\n" in Pack()
3940 "vmov.i8 d0, #0\n" in Pack()
3941 "vmov.i8 d1, #0\n" in Pack()
3942 "vmov.i8 d2, #0\n" in Pack()
3943 "vmov.i8 d3, #0\n" in Pack()
3944 "vmov.i8 d4, #0\n" in Pack()
3945 "vmov.i8 d5, #0\n" in Pack()
3946 "vmov.i8 d6, #0\n" in Pack()
3947 "vld1.8 {d0[0]}, [%[in]]!\n" in Pack()
3948 "vld1.8 {d1[0]}, [r0]!\n" in Pack()
3949 "vld1.8 {d2[0]}, [r1]!\n" in Pack()
3950 "vld1.8 {d3[0]}, [r2]!\n" in Pack()
3951 "vld1.8 {d4[0]}, [r3]!\n" in Pack()
3952 "vld1.8 {d5[0]}, [r4]!\n" in Pack()
3953 "vld1.8 {d6[0]}, [r5]!\n" in Pack()
3954 "vaddw.u8 q8, q8, d0\n" in Pack()
3955 "vaddw.u8 q9, q9, d1\n" in Pack()
3956 "vaddw.u8 q10, q10, d2\n" in Pack()
3957 "vaddw.u8 q11, q11, d3\n" in Pack()
3958 "vaddw.u8 q12, q12, d4\n" in Pack()
3959 "vaddw.u8 q13, q13, d5\n" in Pack()
3960 "vaddw.u8 q14, q14, d6\n" in Pack()
3961 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
3962 "vst1.32 {d4, d5, d6}, [%[out]:64]!\n" in Pack()
3965 "ldr r0, %[multiplicative_sum_offset]\n" in Pack()
3966 "ldr r1, %[additive_sum_offset]\n" in Pack()
3967 "vmov.32 d0[0], r0\n" in Pack()
3968 "vdup.32 q1, r1\n" in Pack()
3969 "vpaddl.u16 q8, q8\n" in Pack()
3970 "vpaddl.u16 q9, q9\n" in Pack()
3971 "vpaddl.u16 q10, q10\n" in Pack()
3972 "vpaddl.u16 q11, q11\n" in Pack()
3973 "vpaddl.u16 q12, q12\n" in Pack()
3974 "vpaddl.u16 q13, q13\n" in Pack()
3975 "vpaddl.u16 q14, q14\n" in Pack()
3976 "vpadd.u32 d16, d16, d17\n" in Pack()
3977 "vpadd.u32 d18, d18, d19\n" in Pack()
3978 "vpadd.u32 d20, d20, d21\n" in Pack()
3979 "vpadd.u32 d22, d22, d23\n" in Pack()
3980 "vpadd.u32 d24, d24, d25\n" in Pack()
3981 "vpadd.u32 d26, d26, d27\n" in Pack()
3982 "vpadd.u32 d28, d28, d29\n" in Pack()
3983 "vpadd.u32 d16, d16, d18\n" in Pack()
3984 "vpadd.u32 d17, d20, d22\n" in Pack()
3985 "vpadd.u32 d18, d24, d26\n" in Pack()
3986 "vpadd.u32 d19, d28, d28\n" in Pack()
3987 "vmul.i32 q8, q8, d0[0]\n" in Pack()
3988 "vmul.i32 q9, q9, d0[0]\n" in Pack()
3989 "vadd.i32 q8, q8, q1\n" in Pack()
3990 "vadd.i32 q9, q9, q1\n" in Pack()
3991 "vst1.32 {d16, d17, d18, d19}, [%[out]:64]\n" in Pack()
4014 "add r0, %[in], %[stride]\n" in Pack()
4015 "add r1, r0, %[stride]\n" in Pack()
4016 "add r2, r1, %[stride]\n" in Pack()
4017 "add r3, r2, %[stride]\n" in Pack()
4018 "add r4, r3, %[stride]\n" in Pack()
4019 "add r5, r4, %[stride]\n" in Pack()
4020 "vmov.i16 q8, #0\n" in Pack()
4021 "vmov.i16 q9, #0\n" in Pack()
4022 "vmov.i16 q10, #0\n" in Pack()
4023 "vmov.i16 q11, #0\n" in Pack()
4024 "vmov.i16 q12, #0\n" in Pack()
4025 "vmov.i16 q13, #0\n" in Pack()
4026 "vmov.i16 q14, #0\n" in Pack()
4029 "subs %[count], %[count], #2\n" in Pack()
4030 "beq 2f\n" in Pack()
4033 "subs %[count], %[count], #8\n" in Pack()
4036 "vld1.32 {d0}, [%[in]]!\n" in Pack()
4037 "vld1.32 {d1}, [r0]!\n" in Pack()
4038 "vld1.32 {d2}, [r1]!\n" in Pack()
4039 "vld1.32 {d3}, [r2]!\n" in Pack()
4040 "vld1.32 {d4}, [r3]!\n" in Pack()
4041 "vld1.32 {d5}, [r4]!\n" in Pack()
4042 "vld1.32 {d6}, [r5]!\n" in Pack()
4043 "vaddw.u8 q8, q8, d0\n" in Pack()
4044 "vaddw.u8 q9, q9, d1\n" in Pack()
4045 "vaddw.u8 q10, q10, d2\n" in Pack()
4046 "vaddw.u8 q11, q11, d3\n" in Pack()
4047 "vaddw.u8 q12, q12, d4\n" in Pack()
4048 "vaddw.u8 q13, q13, d5\n" in Pack()
4049 "vaddw.u8 q14, q14, d6\n" in Pack()
4050 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
4051 "vst1.32 {d4, d5, d6}, [%[out]:64]!\n" in Pack()
4053 "bne 1b\n" in Pack()
4058 "vmov.i8 d0, #0\n" in Pack()
4059 "vmov.i8 d1, #0\n" in Pack()
4060 "vmov.i8 d2, #0\n" in Pack()
4061 "vmov.i8 d3, #0\n" in Pack()
4062 "vmov.i8 d4, #0\n" in Pack()
4063 "vmov.i8 d5, #0\n" in Pack()
4064 "vmov.i8 d6, #0\n" in Pack()
4065 "vld1.16 {d0[0]}, [%[in]]!\n" in Pack()
4066 "vld1.16 {d1[0]}, [r0]!\n" in Pack()
4067 "vld1.16 {d2[0]}, [r1]!\n" in Pack()
4068 "vld1.16 {d3[0]}, [r2]!\n" in Pack()
4069 "vld1.16 {d4[0]}, [r3]!\n" in Pack()
4070 "vld1.16 {d5[0]}, [r4]!\n" in Pack()
4071 "vld1.16 {d6[0]}, [r5]!\n" in Pack()
4072 "vaddw.u8 q8, q8, d0\n" in Pack()
4073 "vaddw.u8 q9, q9, d1\n" in Pack()
4074 "vaddw.u8 q10, q10, d2\n" in Pack()
4075 "vaddw.u8 q11, q11, d3\n" in Pack()
4076 "vaddw.u8 q12, q12, d4\n" in Pack()
4077 "vaddw.u8 q13, q13, d5\n" in Pack()
4078 "vaddw.u8 q14, q14, d6\n" in Pack()
4079 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
4080 "vst1.32 {d4, d5, d6}, [%[out]:64]!\n" in Pack()
4083 "ldr r0, %[multiplicative_sum_offset]\n" in Pack()
4084 "ldr r1, %[additive_sum_offset]\n" in Pack()
4085 "vmov.32 d0[0], r0\n" in Pack()
4086 "vdup.32 q1, r1\n" in Pack()
4087 "vpaddl.u16 q8, q8\n" in Pack()
4088 "vpaddl.u16 q9, q9\n" in Pack()
4089 "vpaddl.u16 q10, q10\n" in Pack()
4090 "vpaddl.u16 q11, q11\n" in Pack()
4091 "vpaddl.u16 q12, q12\n" in Pack()
4092 "vpaddl.u16 q13, q13\n" in Pack()
4093 "vpaddl.u16 q14, q14\n" in Pack()
4094 "vpadd.u32 d16, d16, d17\n" in Pack()
4095 "vpadd.u32 d18, d18, d19\n" in Pack()
4096 "vpadd.u32 d20, d20, d21\n" in Pack()
4097 "vpadd.u32 d22, d22, d23\n" in Pack()
4098 "vpadd.u32 d24, d24, d25\n" in Pack()
4099 "vpadd.u32 d26, d26, d27\n" in Pack()
4100 "vpadd.u32 d28, d28, d29\n" in Pack()
4101 "vpadd.u32 d16, d16, d18\n" in Pack()
4102 "vpadd.u32 d17, d20, d22\n" in Pack()
4103 "vpadd.u32 d18, d24, d26\n" in Pack()
4104 "vpadd.u32 d19, d28, d28\n" in Pack()
4105 "vmul.i32 q8, q8, d0[0]\n" in Pack()
4106 "vmul.i32 q9, q9, d0[0]\n" in Pack()
4107 "vadd.i32 q8, q8, q1\n" in Pack()
4108 "vadd.i32 q9, q9, q1\n" in Pack()
4109 "vst1.32 {d16, d17, d18, d19}, [%[out]:64]\n" in Pack()
4132 "add r0, %[in], %[stride]\n" in Pack()
4133 "add r1, r0, %[stride]\n" in Pack()
4134 "add r2, r1, %[stride]\n" in Pack()
4135 "add r3, r2, %[stride]\n" in Pack()
4136 "add r4, r3, %[stride]\n" in Pack()
4137 "add r5, r4, %[stride]\n" in Pack()
4138 "vmov.i16 q8, #0\n" in Pack()
4139 "vmov.i16 q9, #0\n" in Pack()
4140 "vmov.i16 q10, #0\n" in Pack()
4141 "vmov.i16 q11, #0\n" in Pack()
4142 "vmov.i16 q12, #0\n" in Pack()
4143 "vmov.i16 q13, #0\n" in Pack()
4144 "vmov.i16 q14, #0\n" in Pack()
4147 "subs %[count], %[count], #3\n" in Pack()
4148 "beq 2f\n" in Pack()
4151 "subs %[count], %[count], #8\n" in Pack()
4154 "vld1.32 {d0}, [%[in]]!\n" in Pack()
4155 "vld1.32 {d1}, [r0]!\n" in Pack()
4156 "vld1.32 {d2}, [r1]!\n" in Pack()
4157 "vld1.32 {d3}, [r2]!\n" in Pack()
4158 "vld1.32 {d4}, [r3]!\n" in Pack()
4159 "vld1.32 {d5}, [r4]!\n" in Pack()
4160 "vld1.32 {d6}, [r5]!\n" in Pack()
4161 "vaddw.u8 q8, q8, d0\n" in Pack()
4162 "vaddw.u8 q9, q9, d1\n" in Pack()
4163 "vaddw.u8 q10, q10, d2\n" in Pack()
4164 "vaddw.u8 q11, q11, d3\n" in Pack()
4165 "vaddw.u8 q12, q12, d4\n" in Pack()
4166 "vaddw.u8 q13, q13, d5\n" in Pack()
4167 "vaddw.u8 q14, q14, d6\n" in Pack()
4168 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
4169 "vst1.32 {d4, d5, d6}, [%[out]:64]!\n" in Pack()
4171 "bne 1b\n" in Pack()
4176 "vmov.i8 d0, #0\n" in Pack()
4177 "vmov.i8 d1, #0\n" in Pack()
4178 "vmov.i8 d2, #0\n" in Pack()
4179 "vmov.i8 d3, #0\n" in Pack()
4180 "vmov.i8 d4, #0\n" in Pack()
4181 "vmov.i8 d5, #0\n" in Pack()
4182 "vmov.i8 d6, #0\n" in Pack()
4183 "vld1.16 {d0[0]}, [%[in]]!\n" in Pack()
4184 "vld1.8 {d0[2]}, [%[in]]!\n" in Pack()
4185 "vld1.16 {d1[0]}, [r0]!\n" in Pack()
4186 "vld1.8 {d1[2]}, [r0]!\n" in Pack()
4187 "vld1.16 {d2[0]}, [r1]!\n" in Pack()
4188 "vld1.8 {d2[2]}, [r1]!\n" in Pack()
4189 "vld1.16 {d3[0]}, [r2]!\n" in Pack()
4190 "vld1.8 {d3[2]}, [r2]!\n" in Pack()
4191 "vld1.16 {d4[0]}, [r3]!\n" in Pack()
4192 "vld1.8 {d4[2]}, [r3]!\n" in Pack()
4193 "vld1.16 {d5[0]}, [r4]!\n" in Pack()
4194 "vld1.8 {d5[2]}, [r4]!\n" in Pack()
4195 "vld1.16 {d6[0]}, [r5]!\n" in Pack()
4196 "vld1.8 {d6[2]}, [r5]!\n" in Pack()
4197 "vaddw.u8 q8, q8, d0\n" in Pack()
4198 "vaddw.u8 q9, q9, d1\n" in Pack()
4199 "vaddw.u8 q10, q10, d2\n" in Pack()
4200 "vaddw.u8 q11, q11, d3\n" in Pack()
4201 "vaddw.u8 q12, q12, d4\n" in Pack()
4202 "vaddw.u8 q13, q13, d5\n" in Pack()
4203 "vaddw.u8 q14, q14, d6\n" in Pack()
4204 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
4205 "vst1.32 {d4, d5, d6}, [%[out]:64]!\n" in Pack()
4208 "ldr r0, %[multiplicative_sum_offset]\n" in Pack()
4209 "ldr r1, %[additive_sum_offset]\n" in Pack()
4210 "vmov.32 d0[0], r0\n" in Pack()
4211 "vdup.32 q1, r1\n" in Pack()
4212 "vpaddl.u16 q8, q8\n" in Pack()
4213 "vpaddl.u16 q9, q9\n" in Pack()
4214 "vpaddl.u16 q10, q10\n" in Pack()
4215 "vpaddl.u16 q11, q11\n" in Pack()
4216 "vpaddl.u16 q12, q12\n" in Pack()
4217 "vpaddl.u16 q13, q13\n" in Pack()
4218 "vpaddl.u16 q14, q14\n" in Pack()
4219 "vpadd.u32 d16, d16, d17\n" in Pack()
4220 "vpadd.u32 d18, d18, d19\n" in Pack()
4221 "vpadd.u32 d20, d20, d21\n" in Pack()
4222 "vpadd.u32 d22, d22, d23\n" in Pack()
4223 "vpadd.u32 d24, d24, d25\n" in Pack()
4224 "vpadd.u32 d26, d26, d27\n" in Pack()
4225 "vpadd.u32 d28, d28, d29\n" in Pack()
4226 "vpadd.u32 d16, d16, d18\n" in Pack()
4227 "vpadd.u32 d17, d20, d22\n" in Pack()
4228 "vpadd.u32 d18, d24, d26\n" in Pack()
4229 "vpadd.u32 d19, d28, d28\n" in Pack()
4230 "vmul.i32 q8, q8, d0[0]\n" in Pack()
4231 "vmul.i32 q9, q9, d0[0]\n" in Pack()
4232 "vadd.i32 q8, q8, q1\n" in Pack()
4233 "vadd.i32 q9, q9, q1\n" in Pack()
4234 "vst1.32 {d16, d17, d18, d19}, [%[out]:64]\n" in Pack()
4257 "add r0, %[in], %[stride]\n" in Pack()
4258 "add r1, r0, %[stride]\n" in Pack()
4259 "add r2, r1, %[stride]\n" in Pack()
4260 "add r3, r2, %[stride]\n" in Pack()
4261 "add r4, r3, %[stride]\n" in Pack()
4262 "add r5, r4, %[stride]\n" in Pack()
4263 "vmov.i16 q8, #0\n" in Pack()
4264 "vmov.i16 q9, #0\n" in Pack()
4265 "vmov.i16 q10, #0\n" in Pack()
4266 "vmov.i16 q11, #0\n" in Pack()
4267 "vmov.i16 q12, #0\n" in Pack()
4268 "vmov.i16 q13, #0\n" in Pack()
4269 "vmov.i16 q14, #0\n" in Pack()
4272 "subs %[count], %[count], #4\n" in Pack()
4273 "beq 2f\n" in Pack()
4276 "subs %[count], %[count], #8\n" in Pack()
4279 "vld1.32 {d0}, [%[in]]!\n" in Pack()
4280 "vld1.32 {d1}, [r0]!\n" in Pack()
4281 "vld1.32 {d2}, [r1]!\n" in Pack()
4282 "vld1.32 {d3}, [r2]!\n" in Pack()
4283 "vld1.32 {d4}, [r3]!\n" in Pack()
4284 "vld1.32 {d5}, [r4]!\n" in Pack()
4285 "vld1.32 {d6}, [r5]!\n" in Pack()
4286 "vaddw.u8 q8, q8, d0\n" in Pack()
4287 "vaddw.u8 q9, q9, d1\n" in Pack()
4288 "vaddw.u8 q10, q10, d2\n" in Pack()
4289 "vaddw.u8 q11, q11, d3\n" in Pack()
4290 "vaddw.u8 q12, q12, d4\n" in Pack()
4291 "vaddw.u8 q13, q13, d5\n" in Pack()
4292 "vaddw.u8 q14, q14, d6\n" in Pack()
4293 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
4294 "vst1.32 {d4, d5, d6}, [%[out]:64]!\n" in Pack()
4296 "bne 1b\n" in Pack()
4301 "vmov.i8 d0, #0\n" in Pack()
4302 "vmov.i8 d1, #0\n" in Pack()
4303 "vmov.i8 d2, #0\n" in Pack()
4304 "vmov.i8 d3, #0\n" in Pack()
4305 "vmov.i8 d4, #0\n" in Pack()
4306 "vmov.i8 d5, #0\n" in Pack()
4307 "vmov.i8 d6, #0\n" in Pack()
4308 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
4309 "vld1.32 {d1[0]}, [r0]!\n" in Pack()
4310 "vld1.32 {d2[0]}, [r1]!\n" in Pack()
4311 "vld1.32 {d3[0]}, [r2]!\n" in Pack()
4312 "vld1.32 {d4[0]}, [r3]!\n" in Pack()
4313 "vld1.32 {d5[0]}, [r4]!\n" in Pack()
4314 "vld1.32 {d6[0]}, [r5]!\n" in Pack()
4315 "vaddw.u8 q8, q8, d0\n" in Pack()
4316 "vaddw.u8 q9, q9, d1\n" in Pack()
4317 "vaddw.u8 q10, q10, d2\n" in Pack()
4318 "vaddw.u8 q11, q11, d3\n" in Pack()
4319 "vaddw.u8 q12, q12, d4\n" in Pack()
4320 "vaddw.u8 q13, q13, d5\n" in Pack()
4321 "vaddw.u8 q14, q14, d6\n" in Pack()
4322 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
4323 "vst1.32 {d4, d5, d6}, [%[out]:64]!\n" in Pack()
4326 "ldr r0, %[multiplicative_sum_offset]\n" in Pack()
4327 "ldr r1, %[additive_sum_offset]\n" in Pack()
4328 "vmov.32 d0[0], r0\n" in Pack()
4329 "vdup.32 q1, r1\n" in Pack()
4330 "vpaddl.u16 q8, q8\n" in Pack()
4331 "vpaddl.u16 q9, q9\n" in Pack()
4332 "vpaddl.u16 q10, q10\n" in Pack()
4333 "vpaddl.u16 q11, q11\n" in Pack()
4334 "vpaddl.u16 q12, q12\n" in Pack()
4335 "vpaddl.u16 q13, q13\n" in Pack()
4336 "vpaddl.u16 q14, q14\n" in Pack()
4337 "vpadd.u32 d16, d16, d17\n" in Pack()
4338 "vpadd.u32 d18, d18, d19\n" in Pack()
4339 "vpadd.u32 d20, d20, d21\n" in Pack()
4340 "vpadd.u32 d22, d22, d23\n" in Pack()
4341 "vpadd.u32 d24, d24, d25\n" in Pack()
4342 "vpadd.u32 d26, d26, d27\n" in Pack()
4343 "vpadd.u32 d28, d28, d29\n" in Pack()
4344 "vpadd.u32 d16, d16, d18\n" in Pack()
4345 "vpadd.u32 d17, d20, d22\n" in Pack()
4346 "vpadd.u32 d18, d24, d26\n" in Pack()
4347 "vpadd.u32 d19, d28, d28\n" in Pack()
4348 "vmul.i32 q8, q8, d0[0]\n" in Pack()
4349 "vmul.i32 q9, q9, d0[0]\n" in Pack()
4350 "vadd.i32 q8, q8, q1\n" in Pack()
4351 "vadd.i32 q9, q9, q1\n" in Pack()
4352 "vst1.32 {d16, d17, d18, d19}, [%[out]:64]\n" in Pack()
4375 "add r0, %[in], %[stride]\n" in Pack()
4376 "add r1, r0, %[stride]\n" in Pack()
4377 "add r2, r1, %[stride]\n" in Pack()
4378 "add r3, r2, %[stride]\n" in Pack()
4379 "add r4, r3, %[stride]\n" in Pack()
4380 "add r5, r4, %[stride]\n" in Pack()
4381 "vmov.i16 q8, #0\n" in Pack()
4382 "vmov.i16 q9, #0\n" in Pack()
4383 "vmov.i16 q10, #0\n" in Pack()
4384 "vmov.i16 q11, #0\n" in Pack()
4385 "vmov.i16 q12, #0\n" in Pack()
4386 "vmov.i16 q13, #0\n" in Pack()
4387 "vmov.i16 q14, #0\n" in Pack()
4390 "subs %[count], %[count], #5\n" in Pack()
4391 "beq 2f\n" in Pack()
4394 "subs %[count], %[count], #8\n" in Pack()
4397 "vld1.32 {d0}, [%[in]]!\n" in Pack()
4398 "vld1.32 {d1}, [r0]!\n" in Pack()
4399 "vld1.32 {d2}, [r1]!\n" in Pack()
4400 "vld1.32 {d3}, [r2]!\n" in Pack()
4401 "vld1.32 {d4}, [r3]!\n" in Pack()
4402 "vld1.32 {d5}, [r4]!\n" in Pack()
4403 "vld1.32 {d6}, [r5]!\n" in Pack()
4404 "vaddw.u8 q8, q8, d0\n" in Pack()
4405 "vaddw.u8 q9, q9, d1\n" in Pack()
4406 "vaddw.u8 q10, q10, d2\n" in Pack()
4407 "vaddw.u8 q11, q11, d3\n" in Pack()
4408 "vaddw.u8 q12, q12, d4\n" in Pack()
4409 "vaddw.u8 q13, q13, d5\n" in Pack()
4410 "vaddw.u8 q14, q14, d6\n" in Pack()
4411 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
4412 "vst1.32 {d4, d5, d6}, [%[out]:64]!\n" in Pack()
4414 "bne 1b\n" in Pack()
4419 "vmov.i8 d0, #0\n" in Pack()
4420 "vmov.i8 d1, #0\n" in Pack()
4421 "vmov.i8 d2, #0\n" in Pack()
4422 "vmov.i8 d3, #0\n" in Pack()
4423 "vmov.i8 d4, #0\n" in Pack()
4424 "vmov.i8 d5, #0\n" in Pack()
4425 "vmov.i8 d6, #0\n" in Pack()
4426 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
4427 "vld1.8 {d0[4]}, [%[in]]!\n" in Pack()
4428 "vld1.32 {d1[0]}, [r0]!\n" in Pack()
4429 "vld1.8 {d1[4]}, [r0]!\n" in Pack()
4430 "vld1.32 {d2[0]}, [r1]!\n" in Pack()
4431 "vld1.8 {d2[4]}, [r1]!\n" in Pack()
4432 "vld1.32 {d3[0]}, [r2]!\n" in Pack()
4433 "vld1.8 {d3[4]}, [r2]!\n" in Pack()
4434 "vld1.32 {d4[0]}, [r3]!\n" in Pack()
4435 "vld1.8 {d4[4]}, [r3]!\n" in Pack()
4436 "vld1.32 {d5[0]}, [r4]!\n" in Pack()
4437 "vld1.8 {d5[4]}, [r4]!\n" in Pack()
4438 "vld1.32 {d6[0]}, [r5]!\n" in Pack()
4439 "vld1.8 {d6[4]}, [r5]!\n" in Pack()
4440 "vaddw.u8 q8, q8, d0\n" in Pack()
4441 "vaddw.u8 q9, q9, d1\n" in Pack()
4442 "vaddw.u8 q10, q10, d2\n" in Pack()
4443 "vaddw.u8 q11, q11, d3\n" in Pack()
4444 "vaddw.u8 q12, q12, d4\n" in Pack()
4445 "vaddw.u8 q13, q13, d5\n" in Pack()
4446 "vaddw.u8 q14, q14, d6\n" in Pack()
4447 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
4448 "vst1.32 {d4, d5, d6}, [%[out]:64]!\n" in Pack()
4451 "ldr r0, %[multiplicative_sum_offset]\n" in Pack()
4452 "ldr r1, %[additive_sum_offset]\n" in Pack()
4453 "vmov.32 d0[0], r0\n" in Pack()
4454 "vdup.32 q1, r1\n" in Pack()
4455 "vpaddl.u16 q8, q8\n" in Pack()
4456 "vpaddl.u16 q9, q9\n" in Pack()
4457 "vpaddl.u16 q10, q10\n" in Pack()
4458 "vpaddl.u16 q11, q11\n" in Pack()
4459 "vpaddl.u16 q12, q12\n" in Pack()
4460 "vpaddl.u16 q13, q13\n" in Pack()
4461 "vpaddl.u16 q14, q14\n" in Pack()
4462 "vpadd.u32 d16, d16, d17\n" in Pack()
4463 "vpadd.u32 d18, d18, d19\n" in Pack()
4464 "vpadd.u32 d20, d20, d21\n" in Pack()
4465 "vpadd.u32 d22, d22, d23\n" in Pack()
4466 "vpadd.u32 d24, d24, d25\n" in Pack()
4467 "vpadd.u32 d26, d26, d27\n" in Pack()
4468 "vpadd.u32 d28, d28, d29\n" in Pack()
4469 "vpadd.u32 d16, d16, d18\n" in Pack()
4470 "vpadd.u32 d17, d20, d22\n" in Pack()
4471 "vpadd.u32 d18, d24, d26\n" in Pack()
4472 "vpadd.u32 d19, d28, d28\n" in Pack()
4473 "vmul.i32 q8, q8, d0[0]\n" in Pack()
4474 "vmul.i32 q9, q9, d0[0]\n" in Pack()
4475 "vadd.i32 q8, q8, q1\n" in Pack()
4476 "vadd.i32 q9, q9, q1\n" in Pack()
4477 "vst1.32 {d16, d17, d18, d19}, [%[out]:64]\n" in Pack()
4500 "add r0, %[in], %[stride]\n" in Pack()
4501 "add r1, r0, %[stride]\n" in Pack()
4502 "add r2, r1, %[stride]\n" in Pack()
4503 "add r3, r2, %[stride]\n" in Pack()
4504 "add r4, r3, %[stride]\n" in Pack()
4505 "add r5, r4, %[stride]\n" in Pack()
4506 "vmov.i16 q8, #0\n" in Pack()
4507 "vmov.i16 q9, #0\n" in Pack()
4508 "vmov.i16 q10, #0\n" in Pack()
4509 "vmov.i16 q11, #0\n" in Pack()
4510 "vmov.i16 q12, #0\n" in Pack()
4511 "vmov.i16 q13, #0\n" in Pack()
4512 "vmov.i16 q14, #0\n" in Pack()
4515 "subs %[count], %[count], #6\n" in Pack()
4516 "beq 2f\n" in Pack()
4519 "subs %[count], %[count], #8\n" in Pack()
4522 "vld1.32 {d0}, [%[in]]!\n" in Pack()
4523 "vld1.32 {d1}, [r0]!\n" in Pack()
4524 "vld1.32 {d2}, [r1]!\n" in Pack()
4525 "vld1.32 {d3}, [r2]!\n" in Pack()
4526 "vld1.32 {d4}, [r3]!\n" in Pack()
4527 "vld1.32 {d5}, [r4]!\n" in Pack()
4528 "vld1.32 {d6}, [r5]!\n" in Pack()
4529 "vaddw.u8 q8, q8, d0\n" in Pack()
4530 "vaddw.u8 q9, q9, d1\n" in Pack()
4531 "vaddw.u8 q10, q10, d2\n" in Pack()
4532 "vaddw.u8 q11, q11, d3\n" in Pack()
4533 "vaddw.u8 q12, q12, d4\n" in Pack()
4534 "vaddw.u8 q13, q13, d5\n" in Pack()
4535 "vaddw.u8 q14, q14, d6\n" in Pack()
4536 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
4537 "vst1.32 {d4, d5, d6}, [%[out]:64]!\n" in Pack()
4539 "bne 1b\n" in Pack()
4544 "vmov.i8 d0, #0\n" in Pack()
4545 "vmov.i8 d1, #0\n" in Pack()
4546 "vmov.i8 d2, #0\n" in Pack()
4547 "vmov.i8 d3, #0\n" in Pack()
4548 "vmov.i8 d4, #0\n" in Pack()
4549 "vmov.i8 d5, #0\n" in Pack()
4550 "vmov.i8 d6, #0\n" in Pack()
4551 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
4552 "vld1.16 {d0[2]}, [%[in]]!\n" in Pack()
4553 "vld1.32 {d1[0]}, [r0]!\n" in Pack()
4554 "vld1.16 {d1[2]}, [r0]!\n" in Pack()
4555 "vld1.32 {d2[0]}, [r1]!\n" in Pack()
4556 "vld1.16 {d2[2]}, [r1]!\n" in Pack()
4557 "vld1.32 {d3[0]}, [r2]!\n" in Pack()
4558 "vld1.16 {d3[2]}, [r2]!\n" in Pack()
4559 "vld1.32 {d4[0]}, [r3]!\n" in Pack()
4560 "vld1.16 {d4[2]}, [r3]!\n" in Pack()
4561 "vld1.32 {d5[0]}, [r4]!\n" in Pack()
4562 "vld1.16 {d5[2]}, [r4]!\n" in Pack()
4563 "vld1.32 {d6[0]}, [r5]!\n" in Pack()
4564 "vld1.16 {d6[2]}, [r5]!\n" in Pack()
4565 "vaddw.u8 q8, q8, d0\n" in Pack()
4566 "vaddw.u8 q9, q9, d1\n" in Pack()
4567 "vaddw.u8 q10, q10, d2\n" in Pack()
4568 "vaddw.u8 q11, q11, d3\n" in Pack()
4569 "vaddw.u8 q12, q12, d4\n" in Pack()
4570 "vaddw.u8 q13, q13, d5\n" in Pack()
4571 "vaddw.u8 q14, q14, d6\n" in Pack()
4572 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
4573 "vst1.32 {d4, d5, d6}, [%[out]:64]!\n" in Pack()
4576 "ldr r0, %[multiplicative_sum_offset]\n" in Pack()
4577 "ldr r1, %[additive_sum_offset]\n" in Pack()
4578 "vmov.32 d0[0], r0\n" in Pack()
4579 "vdup.32 q1, r1\n" in Pack()
4580 "vpaddl.u16 q8, q8\n" in Pack()
4581 "vpaddl.u16 q9, q9\n" in Pack()
4582 "vpaddl.u16 q10, q10\n" in Pack()
4583 "vpaddl.u16 q11, q11\n" in Pack()
4584 "vpaddl.u16 q12, q12\n" in Pack()
4585 "vpaddl.u16 q13, q13\n" in Pack()
4586 "vpaddl.u16 q14, q14\n" in Pack()
4587 "vpadd.u32 d16, d16, d17\n" in Pack()
4588 "vpadd.u32 d18, d18, d19\n" in Pack()
4589 "vpadd.u32 d20, d20, d21\n" in Pack()
4590 "vpadd.u32 d22, d22, d23\n" in Pack()
4591 "vpadd.u32 d24, d24, d25\n" in Pack()
4592 "vpadd.u32 d26, d26, d27\n" in Pack()
4593 "vpadd.u32 d28, d28, d29\n" in Pack()
4594 "vpadd.u32 d16, d16, d18\n" in Pack()
4595 "vpadd.u32 d17, d20, d22\n" in Pack()
4596 "vpadd.u32 d18, d24, d26\n" in Pack()
4597 "vpadd.u32 d19, d28, d28\n" in Pack()
4598 "vmul.i32 q8, q8, d0[0]\n" in Pack()
4599 "vmul.i32 q9, q9, d0[0]\n" in Pack()
4600 "vadd.i32 q8, q8, q1\n" in Pack()
4601 "vadd.i32 q9, q9, q1\n" in Pack()
4602 "vst1.32 {d16, d17, d18, d19}, [%[out]:64]\n" in Pack()
4625 "add r0, %[in], %[stride]\n" in Pack()
4626 "add r1, r0, %[stride]\n" in Pack()
4627 "add r2, r1, %[stride]\n" in Pack()
4628 "add r3, r2, %[stride]\n" in Pack()
4629 "add r4, r3, %[stride]\n" in Pack()
4630 "add r5, r4, %[stride]\n" in Pack()
4631 "vmov.i16 q8, #0\n" in Pack()
4632 "vmov.i16 q9, #0\n" in Pack()
4633 "vmov.i16 q10, #0\n" in Pack()
4634 "vmov.i16 q11, #0\n" in Pack()
4635 "vmov.i16 q12, #0\n" in Pack()
4636 "vmov.i16 q13, #0\n" in Pack()
4637 "vmov.i16 q14, #0\n" in Pack()
4640 "subs %[count], %[count], #7\n" in Pack()
4641 "beq 2f\n" in Pack()
4644 "subs %[count], %[count], #8\n" in Pack()
4647 "vld1.32 {d0}, [%[in]]!\n" in Pack()
4648 "vld1.32 {d1}, [r0]!\n" in Pack()
4649 "vld1.32 {d2}, [r1]!\n" in Pack()
4650 "vld1.32 {d3}, [r2]!\n" in Pack()
4651 "vld1.32 {d4}, [r3]!\n" in Pack()
4652 "vld1.32 {d5}, [r4]!\n" in Pack()
4653 "vld1.32 {d6}, [r5]!\n" in Pack()
4654 "vaddw.u8 q8, q8, d0\n" in Pack()
4655 "vaddw.u8 q9, q9, d1\n" in Pack()
4656 "vaddw.u8 q10, q10, d2\n" in Pack()
4657 "vaddw.u8 q11, q11, d3\n" in Pack()
4658 "vaddw.u8 q12, q12, d4\n" in Pack()
4659 "vaddw.u8 q13, q13, d5\n" in Pack()
4660 "vaddw.u8 q14, q14, d6\n" in Pack()
4661 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
4662 "vst1.32 {d4, d5, d6}, [%[out]:64]!\n" in Pack()
4664 "bne 1b\n" in Pack()
4669 "vmov.i8 d0, #0\n" in Pack()
4670 "vmov.i8 d1, #0\n" in Pack()
4671 "vmov.i8 d2, #0\n" in Pack()
4672 "vmov.i8 d3, #0\n" in Pack()
4673 "vmov.i8 d4, #0\n" in Pack()
4674 "vmov.i8 d5, #0\n" in Pack()
4675 "vmov.i8 d6, #0\n" in Pack()
4676 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
4677 "vld1.16 {d0[2]}, [%[in]]!\n" in Pack()
4678 "vld1.8 {d0[6]}, [%[in]]!\n" in Pack()
4679 "vld1.32 {d1[0]}, [r0]!\n" in Pack()
4680 "vld1.16 {d1[2]}, [r0]!\n" in Pack()
4681 "vld1.8 {d1[6]}, [r0]!\n" in Pack()
4682 "vld1.32 {d2[0]}, [r1]!\n" in Pack()
4683 "vld1.16 {d2[2]}, [r1]!\n" in Pack()
4684 "vld1.8 {d2[6]}, [r1]!\n" in Pack()
4685 "vld1.32 {d3[0]}, [r2]!\n" in Pack()
4686 "vld1.16 {d3[2]}, [r2]!\n" in Pack()
4687 "vld1.8 {d3[6]}, [r2]!\n" in Pack()
4688 "vld1.32 {d4[0]}, [r3]!\n" in Pack()
4689 "vld1.16 {d4[2]}, [r3]!\n" in Pack()
4690 "vld1.8 {d4[6]}, [r3]!\n" in Pack()
4691 "vld1.32 {d5[0]}, [r4]!\n" in Pack()
4692 "vld1.16 {d5[2]}, [r4]!\n" in Pack()
4693 "vld1.8 {d5[6]}, [r4]!\n" in Pack()
4694 "vld1.32 {d6[0]}, [r5]!\n" in Pack()
4695 "vld1.16 {d6[2]}, [r5]!\n" in Pack()
4696 "vld1.8 {d6[6]}, [r5]!\n" in Pack()
4697 "vaddw.u8 q8, q8, d0\n" in Pack()
4698 "vaddw.u8 q9, q9, d1\n" in Pack()
4699 "vaddw.u8 q10, q10, d2\n" in Pack()
4700 "vaddw.u8 q11, q11, d3\n" in Pack()
4701 "vaddw.u8 q12, q12, d4\n" in Pack()
4702 "vaddw.u8 q13, q13, d5\n" in Pack()
4703 "vaddw.u8 q14, q14, d6\n" in Pack()
4704 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
4705 "vst1.32 {d4, d5, d6}, [%[out]:64]!\n" in Pack()
4708 "ldr r0, %[multiplicative_sum_offset]\n" in Pack()
4709 "ldr r1, %[additive_sum_offset]\n" in Pack()
4710 "vmov.32 d0[0], r0\n" in Pack()
4711 "vdup.32 q1, r1\n" in Pack()
4712 "vpaddl.u16 q8, q8\n" in Pack()
4713 "vpaddl.u16 q9, q9\n" in Pack()
4714 "vpaddl.u16 q10, q10\n" in Pack()
4715 "vpaddl.u16 q11, q11\n" in Pack()
4716 "vpaddl.u16 q12, q12\n" in Pack()
4717 "vpaddl.u16 q13, q13\n" in Pack()
4718 "vpaddl.u16 q14, q14\n" in Pack()
4719 "vpadd.u32 d16, d16, d17\n" in Pack()
4720 "vpadd.u32 d18, d18, d19\n" in Pack()
4721 "vpadd.u32 d20, d20, d21\n" in Pack()
4722 "vpadd.u32 d22, d22, d23\n" in Pack()
4723 "vpadd.u32 d24, d24, d25\n" in Pack()
4724 "vpadd.u32 d26, d26, d27\n" in Pack()
4725 "vpadd.u32 d28, d28, d29\n" in Pack()
4726 "vpadd.u32 d16, d16, d18\n" in Pack()
4727 "vpadd.u32 d17, d20, d22\n" in Pack()
4728 "vpadd.u32 d18, d24, d26\n" in Pack()
4729 "vpadd.u32 d19, d28, d28\n" in Pack()
4730 "vmul.i32 q8, q8, d0[0]\n" in Pack()
4731 "vmul.i32 q9, q9, d0[0]\n" in Pack()
4732 "vadd.i32 q8, q8, q1\n" in Pack()
4733 "vadd.i32 q9, q9, q1\n" in Pack()
4734 "vst1.32 {d16, d17, d18, d19}, [%[out]:64]\n" in Pack()
4757 "add r0, %[in], %[stride]\n" in Pack()
4758 "add r1, r0, %[stride]\n" in Pack()
4759 "add r2, r1, %[stride]\n" in Pack()
4760 "add r3, r2, %[stride]\n" in Pack()
4761 "add r4, r3, %[stride]\n" in Pack()
4762 "add r5, r4, %[stride]\n" in Pack()
4763 "add r6, r5, %[stride]\n" in Pack()
4764 "vmov.i16 q8, #0\n" in Pack()
4765 "vmov.i16 q9, #0\n" in Pack()
4766 "vmov.i16 q10, #0\n" in Pack()
4767 "vmov.i16 q11, #0\n" in Pack()
4768 "vmov.i16 q12, #0\n" in Pack()
4769 "vmov.i16 q13, #0\n" in Pack()
4770 "vmov.i16 q14, #0\n" in Pack()
4771 "vmov.i16 q15, #0\n" in Pack()
4774 "subs %[count], %[count], #8\n" in Pack()
4777 "vld1.32 {d0}, [%[in]]!\n" in Pack()
4778 "vld1.32 {d1}, [r0]!\n" in Pack()
4779 "vld1.32 {d2}, [r1]!\n" in Pack()
4780 "vld1.32 {d3}, [r2]!\n" in Pack()
4781 "vld1.32 {d4}, [r3]!\n" in Pack()
4782 "vld1.32 {d5}, [r4]!\n" in Pack()
4783 "vld1.32 {d6}, [r5]!\n" in Pack()
4784 "vld1.32 {d7}, [r6]!\n" in Pack()
4785 "vaddw.u8 q8, q8, d0\n" in Pack()
4786 "vaddw.u8 q9, q9, d1\n" in Pack()
4787 "vaddw.u8 q10, q10, d2\n" in Pack()
4788 "vaddw.u8 q11, q11, d3\n" in Pack()
4789 "vaddw.u8 q12, q12, d4\n" in Pack()
4790 "vaddw.u8 q13, q13, d5\n" in Pack()
4791 "vaddw.u8 q14, q14, d6\n" in Pack()
4792 "vaddw.u8 q15, q15, d7\n" in Pack()
4793 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
4794 "vst1.32 {d4, d5, d6, d7}, [%[out]:256]!\n" in Pack()
4796 "bne 1b\n" in Pack()
4799 "ldr r0, %[multiplicative_sum_offset]\n" in Pack()
4800 "ldr r1, %[additive_sum_offset]\n" in Pack()
4801 "vmov.32 d0[0], r0\n" in Pack()
4802 "vdup.32 q1, r1\n" in Pack()
4803 "vpaddl.u16 q8, q8\n" in Pack()
4804 "vpaddl.u16 q9, q9\n" in Pack()
4805 "vpaddl.u16 q10, q10\n" in Pack()
4806 "vpaddl.u16 q11, q11\n" in Pack()
4807 "vpaddl.u16 q12, q12\n" in Pack()
4808 "vpaddl.u16 q13, q13\n" in Pack()
4809 "vpaddl.u16 q14, q14\n" in Pack()
4810 "vpaddl.u16 q15, q15\n" in Pack()
4811 "vpadd.u32 d16, d16, d17\n" in Pack()
4812 "vpadd.u32 d18, d18, d19\n" in Pack()
4813 "vpadd.u32 d20, d20, d21\n" in Pack()
4814 "vpadd.u32 d22, d22, d23\n" in Pack()
4815 "vpadd.u32 d24, d24, d25\n" in Pack()
4816 "vpadd.u32 d26, d26, d27\n" in Pack()
4817 "vpadd.u32 d28, d28, d29\n" in Pack()
4818 "vpadd.u32 d30, d30, d31\n" in Pack()
4819 "vpadd.u32 d16, d16, d18\n" in Pack()
4820 "vpadd.u32 d17, d20, d22\n" in Pack()
4821 "vpadd.u32 d18, d24, d26\n" in Pack()
4822 "vpadd.u32 d19, d28, d30\n" in Pack()
4823 "vmul.i32 q8, q8, d0[0]\n" in Pack()
4824 "vmul.i32 q9, q9, d0[0]\n" in Pack()
4825 "vadd.i32 q8, q8, q1\n" in Pack()
4826 "vadd.i32 q9, q9, q1\n" in Pack()
4827 "vst1.32 {d16, d17, d18, d19}, [%[out]:256]\n" in Pack()
4851 "add r0, %[in], %[stride]\n" in Pack()
4852 "add r1, r0, %[stride]\n" in Pack()
4853 "add r2, r1, %[stride]\n" in Pack()
4854 "add r3, r2, %[stride]\n" in Pack()
4855 "add r4, r3, %[stride]\n" in Pack()
4856 "add r5, r4, %[stride]\n" in Pack()
4857 "add r6, r5, %[stride]\n" in Pack()
4858 "vmov.i16 q8, #0\n" in Pack()
4859 "vmov.i16 q9, #0\n" in Pack()
4860 "vmov.i16 q10, #0\n" in Pack()
4861 "vmov.i16 q11, #0\n" in Pack()
4862 "vmov.i16 q12, #0\n" in Pack()
4863 "vmov.i16 q13, #0\n" in Pack()
4864 "vmov.i16 q14, #0\n" in Pack()
4865 "vmov.i16 q15, #0\n" in Pack()
4868 "subs %[count], %[count], #1\n" in Pack()
4869 "beq 2f\n" in Pack()
4872 "subs %[count], %[count], #8\n" in Pack()
4875 "vld1.32 {d0}, [%[in]]!\n" in Pack()
4876 "vld1.32 {d1}, [r0]!\n" in Pack()
4877 "vld1.32 {d2}, [r1]!\n" in Pack()
4878 "vld1.32 {d3}, [r2]!\n" in Pack()
4879 "vld1.32 {d4}, [r3]!\n" in Pack()
4880 "vld1.32 {d5}, [r4]!\n" in Pack()
4881 "vld1.32 {d6}, [r5]!\n" in Pack()
4882 "vld1.32 {d7}, [r6]!\n" in Pack()
4883 "vaddw.u8 q8, q8, d0\n" in Pack()
4884 "vaddw.u8 q9, q9, d1\n" in Pack()
4885 "vaddw.u8 q10, q10, d2\n" in Pack()
4886 "vaddw.u8 q11, q11, d3\n" in Pack()
4887 "vaddw.u8 q12, q12, d4\n" in Pack()
4888 "vaddw.u8 q13, q13, d5\n" in Pack()
4889 "vaddw.u8 q14, q14, d6\n" in Pack()
4890 "vaddw.u8 q15, q15, d7\n" in Pack()
4891 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
4892 "vst1.32 {d4, d5, d6, d7}, [%[out]:256]!\n" in Pack()
4894 "bne 1b\n" in Pack()
4899 "vmov.i8 d0, #0\n" in Pack()
4900 "vmov.i8 d1, #0\n" in Pack()
4901 "vmov.i8 d2, #0\n" in Pack()
4902 "vmov.i8 d3, #0\n" in Pack()
4903 "vmov.i8 d4, #0\n" in Pack()
4904 "vmov.i8 d5, #0\n" in Pack()
4905 "vmov.i8 d6, #0\n" in Pack()
4906 "vmov.i8 d7, #0\n" in Pack()
4907 "vld1.8 {d0[0]}, [%[in]]!\n" in Pack()
4908 "vld1.8 {d1[0]}, [r0]!\n" in Pack()
4909 "vld1.8 {d2[0]}, [r1]!\n" in Pack()
4910 "vld1.8 {d3[0]}, [r2]!\n" in Pack()
4911 "vld1.8 {d4[0]}, [r3]!\n" in Pack()
4912 "vld1.8 {d5[0]}, [r4]!\n" in Pack()
4913 "vld1.8 {d6[0]}, [r5]!\n" in Pack()
4914 "vld1.8 {d7[0]}, [r6]!\n" in Pack()
4915 "vaddw.u8 q8, q8, d0\n" in Pack()
4916 "vaddw.u8 q9, q9, d1\n" in Pack()
4917 "vaddw.u8 q10, q10, d2\n" in Pack()
4918 "vaddw.u8 q11, q11, d3\n" in Pack()
4919 "vaddw.u8 q12, q12, d4\n" in Pack()
4920 "vaddw.u8 q13, q13, d5\n" in Pack()
4921 "vaddw.u8 q14, q14, d6\n" in Pack()
4922 "vaddw.u8 q15, q15, d7\n" in Pack()
4923 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
4924 "vst1.32 {d4, d5, d6, d7}, [%[out]:256]!\n" in Pack()
4927 "ldr r0, %[multiplicative_sum_offset]\n" in Pack()
4928 "ldr r1, %[additive_sum_offset]\n" in Pack()
4929 "vmov.32 d0[0], r0\n" in Pack()
4930 "vdup.32 q1, r1\n" in Pack()
4931 "vpaddl.u16 q8, q8\n" in Pack()
4932 "vpaddl.u16 q9, q9\n" in Pack()
4933 "vpaddl.u16 q10, q10\n" in Pack()
4934 "vpaddl.u16 q11, q11\n" in Pack()
4935 "vpaddl.u16 q12, q12\n" in Pack()
4936 "vpaddl.u16 q13, q13\n" in Pack()
4937 "vpaddl.u16 q14, q14\n" in Pack()
4938 "vpaddl.u16 q15, q15\n" in Pack()
4939 "vpadd.u32 d16, d16, d17\n" in Pack()
4940 "vpadd.u32 d18, d18, d19\n" in Pack()
4941 "vpadd.u32 d20, d20, d21\n" in Pack()
4942 "vpadd.u32 d22, d22, d23\n" in Pack()
4943 "vpadd.u32 d24, d24, d25\n" in Pack()
4944 "vpadd.u32 d26, d26, d27\n" in Pack()
4945 "vpadd.u32 d28, d28, d29\n" in Pack()
4946 "vpadd.u32 d30, d30, d31\n" in Pack()
4947 "vpadd.u32 d16, d16, d18\n" in Pack()
4948 "vpadd.u32 d17, d20, d22\n" in Pack()
4949 "vpadd.u32 d18, d24, d26\n" in Pack()
4950 "vpadd.u32 d19, d28, d30\n" in Pack()
4951 "vmul.i32 q8, q8, d0[0]\n" in Pack()
4952 "vmul.i32 q9, q9, d0[0]\n" in Pack()
4953 "vadd.i32 q8, q8, q1\n" in Pack()
4954 "vadd.i32 q9, q9, q1\n" in Pack()
4955 "vst1.32 {d16, d17, d18, d19}, [%[out]:256]\n" in Pack()
4979 "add r0, %[in], %[stride]\n" in Pack()
4980 "add r1, r0, %[stride]\n" in Pack()
4981 "add r2, r1, %[stride]\n" in Pack()
4982 "add r3, r2, %[stride]\n" in Pack()
4983 "add r4, r3, %[stride]\n" in Pack()
4984 "add r5, r4, %[stride]\n" in Pack()
4985 "add r6, r5, %[stride]\n" in Pack()
4986 "vmov.i16 q8, #0\n" in Pack()
4987 "vmov.i16 q9, #0\n" in Pack()
4988 "vmov.i16 q10, #0\n" in Pack()
4989 "vmov.i16 q11, #0\n" in Pack()
4990 "vmov.i16 q12, #0\n" in Pack()
4991 "vmov.i16 q13, #0\n" in Pack()
4992 "vmov.i16 q14, #0\n" in Pack()
4993 "vmov.i16 q15, #0\n" in Pack()
4996 "subs %[count], %[count], #2\n" in Pack()
4997 "beq 2f\n" in Pack()
5000 "subs %[count], %[count], #8\n" in Pack()
5003 "vld1.32 {d0}, [%[in]]!\n" in Pack()
5004 "vld1.32 {d1}, [r0]!\n" in Pack()
5005 "vld1.32 {d2}, [r1]!\n" in Pack()
5006 "vld1.32 {d3}, [r2]!\n" in Pack()
5007 "vld1.32 {d4}, [r3]!\n" in Pack()
5008 "vld1.32 {d5}, [r4]!\n" in Pack()
5009 "vld1.32 {d6}, [r5]!\n" in Pack()
5010 "vld1.32 {d7}, [r6]!\n" in Pack()
5011 "vaddw.u8 q8, q8, d0\n" in Pack()
5012 "vaddw.u8 q9, q9, d1\n" in Pack()
5013 "vaddw.u8 q10, q10, d2\n" in Pack()
5014 "vaddw.u8 q11, q11, d3\n" in Pack()
5015 "vaddw.u8 q12, q12, d4\n" in Pack()
5016 "vaddw.u8 q13, q13, d5\n" in Pack()
5017 "vaddw.u8 q14, q14, d6\n" in Pack()
5018 "vaddw.u8 q15, q15, d7\n" in Pack()
5019 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
5020 "vst1.32 {d4, d5, d6, d7}, [%[out]:256]!\n" in Pack()
5022 "bne 1b\n" in Pack()
5027 "vmov.i8 d0, #0\n" in Pack()
5028 "vmov.i8 d1, #0\n" in Pack()
5029 "vmov.i8 d2, #0\n" in Pack()
5030 "vmov.i8 d3, #0\n" in Pack()
5031 "vmov.i8 d4, #0\n" in Pack()
5032 "vmov.i8 d5, #0\n" in Pack()
5033 "vmov.i8 d6, #0\n" in Pack()
5034 "vmov.i8 d7, #0\n" in Pack()
5035 "vld1.16 {d0[0]}, [%[in]]!\n" in Pack()
5036 "vld1.16 {d1[0]}, [r0]!\n" in Pack()
5037 "vld1.16 {d2[0]}, [r1]!\n" in Pack()
5038 "vld1.16 {d3[0]}, [r2]!\n" in Pack()
5039 "vld1.16 {d4[0]}, [r3]!\n" in Pack()
5040 "vld1.16 {d5[0]}, [r4]!\n" in Pack()
5041 "vld1.16 {d6[0]}, [r5]!\n" in Pack()
5042 "vld1.16 {d7[0]}, [r6]!\n" in Pack()
5043 "vaddw.u8 q8, q8, d0\n" in Pack()
5044 "vaddw.u8 q9, q9, d1\n" in Pack()
5045 "vaddw.u8 q10, q10, d2\n" in Pack()
5046 "vaddw.u8 q11, q11, d3\n" in Pack()
5047 "vaddw.u8 q12, q12, d4\n" in Pack()
5048 "vaddw.u8 q13, q13, d5\n" in Pack()
5049 "vaddw.u8 q14, q14, d6\n" in Pack()
5050 "vaddw.u8 q15, q15, d7\n" in Pack()
5051 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
5052 "vst1.32 {d4, d5, d6, d7}, [%[out]:256]!\n" in Pack()
5055 "ldr r0, %[multiplicative_sum_offset]\n" in Pack()
5056 "ldr r1, %[additive_sum_offset]\n" in Pack()
5057 "vmov.32 d0[0], r0\n" in Pack()
5058 "vdup.32 q1, r1\n" in Pack()
5059 "vpaddl.u16 q8, q8\n" in Pack()
5060 "vpaddl.u16 q9, q9\n" in Pack()
5061 "vpaddl.u16 q10, q10\n" in Pack()
5062 "vpaddl.u16 q11, q11\n" in Pack()
5063 "vpaddl.u16 q12, q12\n" in Pack()
5064 "vpaddl.u16 q13, q13\n" in Pack()
5065 "vpaddl.u16 q14, q14\n" in Pack()
5066 "vpaddl.u16 q15, q15\n" in Pack()
5067 "vpadd.u32 d16, d16, d17\n" in Pack()
5068 "vpadd.u32 d18, d18, d19\n" in Pack()
5069 "vpadd.u32 d20, d20, d21\n" in Pack()
5070 "vpadd.u32 d22, d22, d23\n" in Pack()
5071 "vpadd.u32 d24, d24, d25\n" in Pack()
5072 "vpadd.u32 d26, d26, d27\n" in Pack()
5073 "vpadd.u32 d28, d28, d29\n" in Pack()
5074 "vpadd.u32 d30, d30, d31\n" in Pack()
5075 "vpadd.u32 d16, d16, d18\n" in Pack()
5076 "vpadd.u32 d17, d20, d22\n" in Pack()
5077 "vpadd.u32 d18, d24, d26\n" in Pack()
5078 "vpadd.u32 d19, d28, d30\n" in Pack()
5079 "vmul.i32 q8, q8, d0[0]\n" in Pack()
5080 "vmul.i32 q9, q9, d0[0]\n" in Pack()
5081 "vadd.i32 q8, q8, q1\n" in Pack()
5082 "vadd.i32 q9, q9, q1\n" in Pack()
5083 "vst1.32 {d16, d17, d18, d19}, [%[out]:256]\n" in Pack()
5107 "add r0, %[in], %[stride]\n" in Pack()
5108 "add r1, r0, %[stride]\n" in Pack()
5109 "add r2, r1, %[stride]\n" in Pack()
5110 "add r3, r2, %[stride]\n" in Pack()
5111 "add r4, r3, %[stride]\n" in Pack()
5112 "add r5, r4, %[stride]\n" in Pack()
5113 "add r6, r5, %[stride]\n" in Pack()
5114 "vmov.i16 q8, #0\n" in Pack()
5115 "vmov.i16 q9, #0\n" in Pack()
5116 "vmov.i16 q10, #0\n" in Pack()
5117 "vmov.i16 q11, #0\n" in Pack()
5118 "vmov.i16 q12, #0\n" in Pack()
5119 "vmov.i16 q13, #0\n" in Pack()
5120 "vmov.i16 q14, #0\n" in Pack()
5121 "vmov.i16 q15, #0\n" in Pack()
5124 "subs %[count], %[count], #3\n" in Pack()
5125 "beq 2f\n" in Pack()
5128 "subs %[count], %[count], #8\n" in Pack()
5131 "vld1.32 {d0}, [%[in]]!\n" in Pack()
5132 "vld1.32 {d1}, [r0]!\n" in Pack()
5133 "vld1.32 {d2}, [r1]!\n" in Pack()
5134 "vld1.32 {d3}, [r2]!\n" in Pack()
5135 "vld1.32 {d4}, [r3]!\n" in Pack()
5136 "vld1.32 {d5}, [r4]!\n" in Pack()
5137 "vld1.32 {d6}, [r5]!\n" in Pack()
5138 "vld1.32 {d7}, [r6]!\n" in Pack()
5139 "vaddw.u8 q8, q8, d0\n" in Pack()
5140 "vaddw.u8 q9, q9, d1\n" in Pack()
5141 "vaddw.u8 q10, q10, d2\n" in Pack()
5142 "vaddw.u8 q11, q11, d3\n" in Pack()
5143 "vaddw.u8 q12, q12, d4\n" in Pack()
5144 "vaddw.u8 q13, q13, d5\n" in Pack()
5145 "vaddw.u8 q14, q14, d6\n" in Pack()
5146 "vaddw.u8 q15, q15, d7\n" in Pack()
5147 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
5148 "vst1.32 {d4, d5, d6, d7}, [%[out]:256]!\n" in Pack()
5150 "bne 1b\n" in Pack()
5155 "vmov.i8 d0, #0\n" in Pack()
5156 "vmov.i8 d1, #0\n" in Pack()
5157 "vmov.i8 d2, #0\n" in Pack()
5158 "vmov.i8 d3, #0\n" in Pack()
5159 "vmov.i8 d4, #0\n" in Pack()
5160 "vmov.i8 d5, #0\n" in Pack()
5161 "vmov.i8 d6, #0\n" in Pack()
5162 "vmov.i8 d7, #0\n" in Pack()
5163 "vld1.16 {d0[0]}, [%[in]]!\n" in Pack()
5164 "vld1.8 {d0[2]}, [%[in]]!\n" in Pack()
5165 "vld1.16 {d1[0]}, [r0]!\n" in Pack()
5166 "vld1.8 {d1[2]}, [r0]!\n" in Pack()
5167 "vld1.16 {d2[0]}, [r1]!\n" in Pack()
5168 "vld1.8 {d2[2]}, [r1]!\n" in Pack()
5169 "vld1.16 {d3[0]}, [r2]!\n" in Pack()
5170 "vld1.8 {d3[2]}, [r2]!\n" in Pack()
5171 "vld1.16 {d4[0]}, [r3]!\n" in Pack()
5172 "vld1.8 {d4[2]}, [r3]!\n" in Pack()
5173 "vld1.16 {d5[0]}, [r4]!\n" in Pack()
5174 "vld1.8 {d5[2]}, [r4]!\n" in Pack()
5175 "vld1.16 {d6[0]}, [r5]!\n" in Pack()
5176 "vld1.8 {d6[2]}, [r5]!\n" in Pack()
5177 "vld1.16 {d7[0]}, [r6]!\n" in Pack()
5178 "vld1.8 {d7[2]}, [r6]!\n" in Pack()
5179 "vaddw.u8 q8, q8, d0\n" in Pack()
5180 "vaddw.u8 q9, q9, d1\n" in Pack()
5181 "vaddw.u8 q10, q10, d2\n" in Pack()
5182 "vaddw.u8 q11, q11, d3\n" in Pack()
5183 "vaddw.u8 q12, q12, d4\n" in Pack()
5184 "vaddw.u8 q13, q13, d5\n" in Pack()
5185 "vaddw.u8 q14, q14, d6\n" in Pack()
5186 "vaddw.u8 q15, q15, d7\n" in Pack()
5187 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
5188 "vst1.32 {d4, d5, d6, d7}, [%[out]:256]!\n" in Pack()
5191 "ldr r0, %[multiplicative_sum_offset]\n" in Pack()
5192 "ldr r1, %[additive_sum_offset]\n" in Pack()
5193 "vmov.32 d0[0], r0\n" in Pack()
5194 "vdup.32 q1, r1\n" in Pack()
5195 "vpaddl.u16 q8, q8\n" in Pack()
5196 "vpaddl.u16 q9, q9\n" in Pack()
5197 "vpaddl.u16 q10, q10\n" in Pack()
5198 "vpaddl.u16 q11, q11\n" in Pack()
5199 "vpaddl.u16 q12, q12\n" in Pack()
5200 "vpaddl.u16 q13, q13\n" in Pack()
5201 "vpaddl.u16 q14, q14\n" in Pack()
5202 "vpaddl.u16 q15, q15\n" in Pack()
5203 "vpadd.u32 d16, d16, d17\n" in Pack()
5204 "vpadd.u32 d18, d18, d19\n" in Pack()
5205 "vpadd.u32 d20, d20, d21\n" in Pack()
5206 "vpadd.u32 d22, d22, d23\n" in Pack()
5207 "vpadd.u32 d24, d24, d25\n" in Pack()
5208 "vpadd.u32 d26, d26, d27\n" in Pack()
5209 "vpadd.u32 d28, d28, d29\n" in Pack()
5210 "vpadd.u32 d30, d30, d31\n" in Pack()
5211 "vpadd.u32 d16, d16, d18\n" in Pack()
5212 "vpadd.u32 d17, d20, d22\n" in Pack()
5213 "vpadd.u32 d18, d24, d26\n" in Pack()
5214 "vpadd.u32 d19, d28, d30\n" in Pack()
5215 "vmul.i32 q8, q8, d0[0]\n" in Pack()
5216 "vmul.i32 q9, q9, d0[0]\n" in Pack()
5217 "vadd.i32 q8, q8, q1\n" in Pack()
5218 "vadd.i32 q9, q9, q1\n" in Pack()
5219 "vst1.32 {d16, d17, d18, d19}, [%[out]:256]\n" in Pack()
5243 "add r0, %[in], %[stride]\n" in Pack()
5244 "add r1, r0, %[stride]\n" in Pack()
5245 "add r2, r1, %[stride]\n" in Pack()
5246 "add r3, r2, %[stride]\n" in Pack()
5247 "add r4, r3, %[stride]\n" in Pack()
5248 "add r5, r4, %[stride]\n" in Pack()
5249 "add r6, r5, %[stride]\n" in Pack()
5250 "vmov.i16 q8, #0\n" in Pack()
5251 "vmov.i16 q9, #0\n" in Pack()
5252 "vmov.i16 q10, #0\n" in Pack()
5253 "vmov.i16 q11, #0\n" in Pack()
5254 "vmov.i16 q12, #0\n" in Pack()
5255 "vmov.i16 q13, #0\n" in Pack()
5256 "vmov.i16 q14, #0\n" in Pack()
5257 "vmov.i16 q15, #0\n" in Pack()
5260 "subs %[count], %[count], #4\n" in Pack()
5261 "beq 2f\n" in Pack()
5264 "subs %[count], %[count], #8\n" in Pack()
5267 "vld1.32 {d0}, [%[in]]!\n" in Pack()
5268 "vld1.32 {d1}, [r0]!\n" in Pack()
5269 "vld1.32 {d2}, [r1]!\n" in Pack()
5270 "vld1.32 {d3}, [r2]!\n" in Pack()
5271 "vld1.32 {d4}, [r3]!\n" in Pack()
5272 "vld1.32 {d5}, [r4]!\n" in Pack()
5273 "vld1.32 {d6}, [r5]!\n" in Pack()
5274 "vld1.32 {d7}, [r6]!\n" in Pack()
5275 "vaddw.u8 q8, q8, d0\n" in Pack()
5276 "vaddw.u8 q9, q9, d1\n" in Pack()
5277 "vaddw.u8 q10, q10, d2\n" in Pack()
5278 "vaddw.u8 q11, q11, d3\n" in Pack()
5279 "vaddw.u8 q12, q12, d4\n" in Pack()
5280 "vaddw.u8 q13, q13, d5\n" in Pack()
5281 "vaddw.u8 q14, q14, d6\n" in Pack()
5282 "vaddw.u8 q15, q15, d7\n" in Pack()
5283 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
5284 "vst1.32 {d4, d5, d6, d7}, [%[out]:256]!\n" in Pack()
5286 "bne 1b\n" in Pack()
5291 "vmov.i8 d0, #0\n" in Pack()
5292 "vmov.i8 d1, #0\n" in Pack()
5293 "vmov.i8 d2, #0\n" in Pack()
5294 "vmov.i8 d3, #0\n" in Pack()
5295 "vmov.i8 d4, #0\n" in Pack()
5296 "vmov.i8 d5, #0\n" in Pack()
5297 "vmov.i8 d6, #0\n" in Pack()
5298 "vmov.i8 d7, #0\n" in Pack()
5299 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
5300 "vld1.32 {d1[0]}, [r0]!\n" in Pack()
5301 "vld1.32 {d2[0]}, [r1]!\n" in Pack()
5302 "vld1.32 {d3[0]}, [r2]!\n" in Pack()
5303 "vld1.32 {d4[0]}, [r3]!\n" in Pack()
5304 "vld1.32 {d5[0]}, [r4]!\n" in Pack()
5305 "vld1.32 {d6[0]}, [r5]!\n" in Pack()
5306 "vld1.32 {d7[0]}, [r6]!\n" in Pack()
5307 "vaddw.u8 q8, q8, d0\n" in Pack()
5308 "vaddw.u8 q9, q9, d1\n" in Pack()
5309 "vaddw.u8 q10, q10, d2\n" in Pack()
5310 "vaddw.u8 q11, q11, d3\n" in Pack()
5311 "vaddw.u8 q12, q12, d4\n" in Pack()
5312 "vaddw.u8 q13, q13, d5\n" in Pack()
5313 "vaddw.u8 q14, q14, d6\n" in Pack()
5314 "vaddw.u8 q15, q15, d7\n" in Pack()
5315 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
5316 "vst1.32 {d4, d5, d6, d7}, [%[out]:256]!\n" in Pack()
5319 "ldr r0, %[multiplicative_sum_offset]\n" in Pack()
5320 "ldr r1, %[additive_sum_offset]\n" in Pack()
5321 "vmov.32 d0[0], r0\n" in Pack()
5322 "vdup.32 q1, r1\n" in Pack()
5323 "vpaddl.u16 q8, q8\n" in Pack()
5324 "vpaddl.u16 q9, q9\n" in Pack()
5325 "vpaddl.u16 q10, q10\n" in Pack()
5326 "vpaddl.u16 q11, q11\n" in Pack()
5327 "vpaddl.u16 q12, q12\n" in Pack()
5328 "vpaddl.u16 q13, q13\n" in Pack()
5329 "vpaddl.u16 q14, q14\n" in Pack()
5330 "vpaddl.u16 q15, q15\n" in Pack()
5331 "vpadd.u32 d16, d16, d17\n" in Pack()
5332 "vpadd.u32 d18, d18, d19\n" in Pack()
5333 "vpadd.u32 d20, d20, d21\n" in Pack()
5334 "vpadd.u32 d22, d22, d23\n" in Pack()
5335 "vpadd.u32 d24, d24, d25\n" in Pack()
5336 "vpadd.u32 d26, d26, d27\n" in Pack()
5337 "vpadd.u32 d28, d28, d29\n" in Pack()
5338 "vpadd.u32 d30, d30, d31\n" in Pack()
5339 "vpadd.u32 d16, d16, d18\n" in Pack()
5340 "vpadd.u32 d17, d20, d22\n" in Pack()
5341 "vpadd.u32 d18, d24, d26\n" in Pack()
5342 "vpadd.u32 d19, d28, d30\n" in Pack()
5343 "vmul.i32 q8, q8, d0[0]\n" in Pack()
5344 "vmul.i32 q9, q9, d0[0]\n" in Pack()
5345 "vadd.i32 q8, q8, q1\n" in Pack()
5346 "vadd.i32 q9, q9, q1\n" in Pack()
5347 "vst1.32 {d16, d17, d18, d19}, [%[out]:256]\n" in Pack()
5371 "add r0, %[in], %[stride]\n" in Pack()
5372 "add r1, r0, %[stride]\n" in Pack()
5373 "add r2, r1, %[stride]\n" in Pack()
5374 "add r3, r2, %[stride]\n" in Pack()
5375 "add r4, r3, %[stride]\n" in Pack()
5376 "add r5, r4, %[stride]\n" in Pack()
5377 "add r6, r5, %[stride]\n" in Pack()
5378 "vmov.i16 q8, #0\n" in Pack()
5379 "vmov.i16 q9, #0\n" in Pack()
5380 "vmov.i16 q10, #0\n" in Pack()
5381 "vmov.i16 q11, #0\n" in Pack()
5382 "vmov.i16 q12, #0\n" in Pack()
5383 "vmov.i16 q13, #0\n" in Pack()
5384 "vmov.i16 q14, #0\n" in Pack()
5385 "vmov.i16 q15, #0\n" in Pack()
5388 "subs %[count], %[count], #5\n" in Pack()
5389 "beq 2f\n" in Pack()
5392 "subs %[count], %[count], #8\n" in Pack()
5395 "vld1.32 {d0}, [%[in]]!\n" in Pack()
5396 "vld1.32 {d1}, [r0]!\n" in Pack()
5397 "vld1.32 {d2}, [r1]!\n" in Pack()
5398 "vld1.32 {d3}, [r2]!\n" in Pack()
5399 "vld1.32 {d4}, [r3]!\n" in Pack()
5400 "vld1.32 {d5}, [r4]!\n" in Pack()
5401 "vld1.32 {d6}, [r5]!\n" in Pack()
5402 "vld1.32 {d7}, [r6]!\n" in Pack()
5403 "vaddw.u8 q8, q8, d0\n" in Pack()
5404 "vaddw.u8 q9, q9, d1\n" in Pack()
5405 "vaddw.u8 q10, q10, d2\n" in Pack()
5406 "vaddw.u8 q11, q11, d3\n" in Pack()
5407 "vaddw.u8 q12, q12, d4\n" in Pack()
5408 "vaddw.u8 q13, q13, d5\n" in Pack()
5409 "vaddw.u8 q14, q14, d6\n" in Pack()
5410 "vaddw.u8 q15, q15, d7\n" in Pack()
5411 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
5412 "vst1.32 {d4, d5, d6, d7}, [%[out]:256]!\n" in Pack()
5414 "bne 1b\n" in Pack()
5419 "vmov.i8 d0, #0\n" in Pack()
5420 "vmov.i8 d1, #0\n" in Pack()
5421 "vmov.i8 d2, #0\n" in Pack()
5422 "vmov.i8 d3, #0\n" in Pack()
5423 "vmov.i8 d4, #0\n" in Pack()
5424 "vmov.i8 d5, #0\n" in Pack()
5425 "vmov.i8 d6, #0\n" in Pack()
5426 "vmov.i8 d7, #0\n" in Pack()
5427 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
5428 "vld1.8 {d0[4]}, [%[in]]!\n" in Pack()
5429 "vld1.32 {d1[0]}, [r0]!\n" in Pack()
5430 "vld1.8 {d1[4]}, [r0]!\n" in Pack()
5431 "vld1.32 {d2[0]}, [r1]!\n" in Pack()
5432 "vld1.8 {d2[4]}, [r1]!\n" in Pack()
5433 "vld1.32 {d3[0]}, [r2]!\n" in Pack()
5434 "vld1.8 {d3[4]}, [r2]!\n" in Pack()
5435 "vld1.32 {d4[0]}, [r3]!\n" in Pack()
5436 "vld1.8 {d4[4]}, [r3]!\n" in Pack()
5437 "vld1.32 {d5[0]}, [r4]!\n" in Pack()
5438 "vld1.8 {d5[4]}, [r4]!\n" in Pack()
5439 "vld1.32 {d6[0]}, [r5]!\n" in Pack()
5440 "vld1.8 {d6[4]}, [r5]!\n" in Pack()
5441 "vld1.32 {d7[0]}, [r6]!\n" in Pack()
5442 "vld1.8 {d7[4]}, [r6]!\n" in Pack()
5443 "vaddw.u8 q8, q8, d0\n" in Pack()
5444 "vaddw.u8 q9, q9, d1\n" in Pack()
5445 "vaddw.u8 q10, q10, d2\n" in Pack()
5446 "vaddw.u8 q11, q11, d3\n" in Pack()
5447 "vaddw.u8 q12, q12, d4\n" in Pack()
5448 "vaddw.u8 q13, q13, d5\n" in Pack()
5449 "vaddw.u8 q14, q14, d6\n" in Pack()
5450 "vaddw.u8 q15, q15, d7\n" in Pack()
5451 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
5452 "vst1.32 {d4, d5, d6, d7}, [%[out]:256]!\n" in Pack()
5455 "ldr r0, %[multiplicative_sum_offset]\n" in Pack()
5456 "ldr r1, %[additive_sum_offset]\n" in Pack()
5457 "vmov.32 d0[0], r0\n" in Pack()
5458 "vdup.32 q1, r1\n" in Pack()
5459 "vpaddl.u16 q8, q8\n" in Pack()
5460 "vpaddl.u16 q9, q9\n" in Pack()
5461 "vpaddl.u16 q10, q10\n" in Pack()
5462 "vpaddl.u16 q11, q11\n" in Pack()
5463 "vpaddl.u16 q12, q12\n" in Pack()
5464 "vpaddl.u16 q13, q13\n" in Pack()
5465 "vpaddl.u16 q14, q14\n" in Pack()
5466 "vpaddl.u16 q15, q15\n" in Pack()
5467 "vpadd.u32 d16, d16, d17\n" in Pack()
5468 "vpadd.u32 d18, d18, d19\n" in Pack()
5469 "vpadd.u32 d20, d20, d21\n" in Pack()
5470 "vpadd.u32 d22, d22, d23\n" in Pack()
5471 "vpadd.u32 d24, d24, d25\n" in Pack()
5472 "vpadd.u32 d26, d26, d27\n" in Pack()
5473 "vpadd.u32 d28, d28, d29\n" in Pack()
5474 "vpadd.u32 d30, d30, d31\n" in Pack()
5475 "vpadd.u32 d16, d16, d18\n" in Pack()
5476 "vpadd.u32 d17, d20, d22\n" in Pack()
5477 "vpadd.u32 d18, d24, d26\n" in Pack()
5478 "vpadd.u32 d19, d28, d30\n" in Pack()
5479 "vmul.i32 q8, q8, d0[0]\n" in Pack()
5480 "vmul.i32 q9, q9, d0[0]\n" in Pack()
5481 "vadd.i32 q8, q8, q1\n" in Pack()
5482 "vadd.i32 q9, q9, q1\n" in Pack()
5483 "vst1.32 {d16, d17, d18, d19}, [%[out]:256]\n" in Pack()
5507 "add r0, %[in], %[stride]\n" in Pack()
5508 "add r1, r0, %[stride]\n" in Pack()
5509 "add r2, r1, %[stride]\n" in Pack()
5510 "add r3, r2, %[stride]\n" in Pack()
5511 "add r4, r3, %[stride]\n" in Pack()
5512 "add r5, r4, %[stride]\n" in Pack()
5513 "add r6, r5, %[stride]\n" in Pack()
5514 "vmov.i16 q8, #0\n" in Pack()
5515 "vmov.i16 q9, #0\n" in Pack()
5516 "vmov.i16 q10, #0\n" in Pack()
5517 "vmov.i16 q11, #0\n" in Pack()
5518 "vmov.i16 q12, #0\n" in Pack()
5519 "vmov.i16 q13, #0\n" in Pack()
5520 "vmov.i16 q14, #0\n" in Pack()
5521 "vmov.i16 q15, #0\n" in Pack()
5524 "subs %[count], %[count], #6\n" in Pack()
5525 "beq 2f\n" in Pack()
5528 "subs %[count], %[count], #8\n" in Pack()
5531 "vld1.32 {d0}, [%[in]]!\n" in Pack()
5532 "vld1.32 {d1}, [r0]!\n" in Pack()
5533 "vld1.32 {d2}, [r1]!\n" in Pack()
5534 "vld1.32 {d3}, [r2]!\n" in Pack()
5535 "vld1.32 {d4}, [r3]!\n" in Pack()
5536 "vld1.32 {d5}, [r4]!\n" in Pack()
5537 "vld1.32 {d6}, [r5]!\n" in Pack()
5538 "vld1.32 {d7}, [r6]!\n" in Pack()
5539 "vaddw.u8 q8, q8, d0\n" in Pack()
5540 "vaddw.u8 q9, q9, d1\n" in Pack()
5541 "vaddw.u8 q10, q10, d2\n" in Pack()
5542 "vaddw.u8 q11, q11, d3\n" in Pack()
5543 "vaddw.u8 q12, q12, d4\n" in Pack()
5544 "vaddw.u8 q13, q13, d5\n" in Pack()
5545 "vaddw.u8 q14, q14, d6\n" in Pack()
5546 "vaddw.u8 q15, q15, d7\n" in Pack()
5547 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
5548 "vst1.32 {d4, d5, d6, d7}, [%[out]:256]!\n" in Pack()
5550 "bne 1b\n" in Pack()
5555 "vmov.i8 d0, #0\n" in Pack()
5556 "vmov.i8 d1, #0\n" in Pack()
5557 "vmov.i8 d2, #0\n" in Pack()
5558 "vmov.i8 d3, #0\n" in Pack()
5559 "vmov.i8 d4, #0\n" in Pack()
5560 "vmov.i8 d5, #0\n" in Pack()
5561 "vmov.i8 d6, #0\n" in Pack()
5562 "vmov.i8 d7, #0\n" in Pack()
5563 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
5564 "vld1.16 {d0[2]}, [%[in]]!\n" in Pack()
5565 "vld1.32 {d1[0]}, [r0]!\n" in Pack()
5566 "vld1.16 {d1[2]}, [r0]!\n" in Pack()
5567 "vld1.32 {d2[0]}, [r1]!\n" in Pack()
5568 "vld1.16 {d2[2]}, [r1]!\n" in Pack()
5569 "vld1.32 {d3[0]}, [r2]!\n" in Pack()
5570 "vld1.16 {d3[2]}, [r2]!\n" in Pack()
5571 "vld1.32 {d4[0]}, [r3]!\n" in Pack()
5572 "vld1.16 {d4[2]}, [r3]!\n" in Pack()
5573 "vld1.32 {d5[0]}, [r4]!\n" in Pack()
5574 "vld1.16 {d5[2]}, [r4]!\n" in Pack()
5575 "vld1.32 {d6[0]}, [r5]!\n" in Pack()
5576 "vld1.16 {d6[2]}, [r5]!\n" in Pack()
5577 "vld1.32 {d7[0]}, [r6]!\n" in Pack()
5578 "vld1.16 {d7[2]}, [r6]!\n" in Pack()
5579 "vaddw.u8 q8, q8, d0\n" in Pack()
5580 "vaddw.u8 q9, q9, d1\n" in Pack()
5581 "vaddw.u8 q10, q10, d2\n" in Pack()
5582 "vaddw.u8 q11, q11, d3\n" in Pack()
5583 "vaddw.u8 q12, q12, d4\n" in Pack()
5584 "vaddw.u8 q13, q13, d5\n" in Pack()
5585 "vaddw.u8 q14, q14, d6\n" in Pack()
5586 "vaddw.u8 q15, q15, d7\n" in Pack()
5587 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
5588 "vst1.32 {d4, d5, d6, d7}, [%[out]:256]!\n" in Pack()
5591 "ldr r0, %[multiplicative_sum_offset]\n" in Pack()
5592 "ldr r1, %[additive_sum_offset]\n" in Pack()
5593 "vmov.32 d0[0], r0\n" in Pack()
5594 "vdup.32 q1, r1\n" in Pack()
5595 "vpaddl.u16 q8, q8\n" in Pack()
5596 "vpaddl.u16 q9, q9\n" in Pack()
5597 "vpaddl.u16 q10, q10\n" in Pack()
5598 "vpaddl.u16 q11, q11\n" in Pack()
5599 "vpaddl.u16 q12, q12\n" in Pack()
5600 "vpaddl.u16 q13, q13\n" in Pack()
5601 "vpaddl.u16 q14, q14\n" in Pack()
5602 "vpaddl.u16 q15, q15\n" in Pack()
5603 "vpadd.u32 d16, d16, d17\n" in Pack()
5604 "vpadd.u32 d18, d18, d19\n" in Pack()
5605 "vpadd.u32 d20, d20, d21\n" in Pack()
5606 "vpadd.u32 d22, d22, d23\n" in Pack()
5607 "vpadd.u32 d24, d24, d25\n" in Pack()
5608 "vpadd.u32 d26, d26, d27\n" in Pack()
5609 "vpadd.u32 d28, d28, d29\n" in Pack()
5610 "vpadd.u32 d30, d30, d31\n" in Pack()
5611 "vpadd.u32 d16, d16, d18\n" in Pack()
5612 "vpadd.u32 d17, d20, d22\n" in Pack()
5613 "vpadd.u32 d18, d24, d26\n" in Pack()
5614 "vpadd.u32 d19, d28, d30\n" in Pack()
5615 "vmul.i32 q8, q8, d0[0]\n" in Pack()
5616 "vmul.i32 q9, q9, d0[0]\n" in Pack()
5617 "vadd.i32 q8, q8, q1\n" in Pack()
5618 "vadd.i32 q9, q9, q1\n" in Pack()
5619 "vst1.32 {d16, d17, d18, d19}, [%[out]:256]\n" in Pack()
5643 "add r0, %[in], %[stride]\n" in Pack()
5644 "add r1, r0, %[stride]\n" in Pack()
5645 "add r2, r1, %[stride]\n" in Pack()
5646 "add r3, r2, %[stride]\n" in Pack()
5647 "add r4, r3, %[stride]\n" in Pack()
5648 "add r5, r4, %[stride]\n" in Pack()
5649 "add r6, r5, %[stride]\n" in Pack()
5650 "vmov.i16 q8, #0\n" in Pack()
5651 "vmov.i16 q9, #0\n" in Pack()
5652 "vmov.i16 q10, #0\n" in Pack()
5653 "vmov.i16 q11, #0\n" in Pack()
5654 "vmov.i16 q12, #0\n" in Pack()
5655 "vmov.i16 q13, #0\n" in Pack()
5656 "vmov.i16 q14, #0\n" in Pack()
5657 "vmov.i16 q15, #0\n" in Pack()
5660 "subs %[count], %[count], #7\n" in Pack()
5661 "beq 2f\n" in Pack()
5664 "subs %[count], %[count], #8\n" in Pack()
5667 "vld1.32 {d0}, [%[in]]!\n" in Pack()
5668 "vld1.32 {d1}, [r0]!\n" in Pack()
5669 "vld1.32 {d2}, [r1]!\n" in Pack()
5670 "vld1.32 {d3}, [r2]!\n" in Pack()
5671 "vld1.32 {d4}, [r3]!\n" in Pack()
5672 "vld1.32 {d5}, [r4]!\n" in Pack()
5673 "vld1.32 {d6}, [r5]!\n" in Pack()
5674 "vld1.32 {d7}, [r6]!\n" in Pack()
5675 "vaddw.u8 q8, q8, d0\n" in Pack()
5676 "vaddw.u8 q9, q9, d1\n" in Pack()
5677 "vaddw.u8 q10, q10, d2\n" in Pack()
5678 "vaddw.u8 q11, q11, d3\n" in Pack()
5679 "vaddw.u8 q12, q12, d4\n" in Pack()
5680 "vaddw.u8 q13, q13, d5\n" in Pack()
5681 "vaddw.u8 q14, q14, d6\n" in Pack()
5682 "vaddw.u8 q15, q15, d7\n" in Pack()
5683 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
5684 "vst1.32 {d4, d5, d6, d7}, [%[out]:256]!\n" in Pack()
5686 "bne 1b\n" in Pack()
5691 "vmov.i8 d0, #0\n" in Pack()
5692 "vmov.i8 d1, #0\n" in Pack()
5693 "vmov.i8 d2, #0\n" in Pack()
5694 "vmov.i8 d3, #0\n" in Pack()
5695 "vmov.i8 d4, #0\n" in Pack()
5696 "vmov.i8 d5, #0\n" in Pack()
5697 "vmov.i8 d6, #0\n" in Pack()
5698 "vmov.i8 d7, #0\n" in Pack()
5699 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
5700 "vld1.16 {d0[2]}, [%[in]]!\n" in Pack()
5701 "vld1.8 {d0[6]}, [%[in]]!\n" in Pack()
5702 "vld1.32 {d1[0]}, [r0]!\n" in Pack()
5703 "vld1.16 {d1[2]}, [r0]!\n" in Pack()
5704 "vld1.8 {d1[6]}, [r0]!\n" in Pack()
5705 "vld1.32 {d2[0]}, [r1]!\n" in Pack()
5706 "vld1.16 {d2[2]}, [r1]!\n" in Pack()
5707 "vld1.8 {d2[6]}, [r1]!\n" in Pack()
5708 "vld1.32 {d3[0]}, [r2]!\n" in Pack()
5709 "vld1.16 {d3[2]}, [r2]!\n" in Pack()
5710 "vld1.8 {d3[6]}, [r2]!\n" in Pack()
5711 "vld1.32 {d4[0]}, [r3]!\n" in Pack()
5712 "vld1.16 {d4[2]}, [r3]!\n" in Pack()
5713 "vld1.8 {d4[6]}, [r3]!\n" in Pack()
5714 "vld1.32 {d5[0]}, [r4]!\n" in Pack()
5715 "vld1.16 {d5[2]}, [r4]!\n" in Pack()
5716 "vld1.8 {d5[6]}, [r4]!\n" in Pack()
5717 "vld1.32 {d6[0]}, [r5]!\n" in Pack()
5718 "vld1.16 {d6[2]}, [r5]!\n" in Pack()
5719 "vld1.8 {d6[6]}, [r5]!\n" in Pack()
5720 "vld1.32 {d7[0]}, [r6]!\n" in Pack()
5721 "vld1.16 {d7[2]}, [r6]!\n" in Pack()
5722 "vld1.8 {d7[6]}, [r6]!\n" in Pack()
5723 "vaddw.u8 q8, q8, d0\n" in Pack()
5724 "vaddw.u8 q9, q9, d1\n" in Pack()
5725 "vaddw.u8 q10, q10, d2\n" in Pack()
5726 "vaddw.u8 q11, q11, d3\n" in Pack()
5727 "vaddw.u8 q12, q12, d4\n" in Pack()
5728 "vaddw.u8 q13, q13, d5\n" in Pack()
5729 "vaddw.u8 q14, q14, d6\n" in Pack()
5730 "vaddw.u8 q15, q15, d7\n" in Pack()
5731 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
5732 "vst1.32 {d4, d5, d6, d7}, [%[out]:256]!\n" in Pack()
5735 "ldr r0, %[multiplicative_sum_offset]\n" in Pack()
5736 "ldr r1, %[additive_sum_offset]\n" in Pack()
5737 "vmov.32 d0[0], r0\n" in Pack()
5738 "vdup.32 q1, r1\n" in Pack()
5739 "vpaddl.u16 q8, q8\n" in Pack()
5740 "vpaddl.u16 q9, q9\n" in Pack()
5741 "vpaddl.u16 q10, q10\n" in Pack()
5742 "vpaddl.u16 q11, q11\n" in Pack()
5743 "vpaddl.u16 q12, q12\n" in Pack()
5744 "vpaddl.u16 q13, q13\n" in Pack()
5745 "vpaddl.u16 q14, q14\n" in Pack()
5746 "vpaddl.u16 q15, q15\n" in Pack()
5747 "vpadd.u32 d16, d16, d17\n" in Pack()
5748 "vpadd.u32 d18, d18, d19\n" in Pack()
5749 "vpadd.u32 d20, d20, d21\n" in Pack()
5750 "vpadd.u32 d22, d22, d23\n" in Pack()
5751 "vpadd.u32 d24, d24, d25\n" in Pack()
5752 "vpadd.u32 d26, d26, d27\n" in Pack()
5753 "vpadd.u32 d28, d28, d29\n" in Pack()
5754 "vpadd.u32 d30, d30, d31\n" in Pack()
5755 "vpadd.u32 d16, d16, d18\n" in Pack()
5756 "vpadd.u32 d17, d20, d22\n" in Pack()
5757 "vpadd.u32 d18, d24, d26\n" in Pack()
5758 "vpadd.u32 d19, d28, d30\n" in Pack()
5759 "vmul.i32 q8, q8, d0[0]\n" in Pack()
5760 "vmul.i32 q9, q9, d0[0]\n" in Pack()
5761 "vadd.i32 q8, q8, q1\n" in Pack()
5762 "vadd.i32 q9, q9, q1\n" in Pack()
5763 "vst1.32 {d16, d17, d18, d19}, [%[out]:256]\n" in Pack()
5789 "vmov.i16 q8, #0\n" in Pack()
5792 "subs %[count], %[count], #8\n" in Pack()
5795 "vld1.8 {d0[0]}, [%[in]], %[stride]\n" in Pack()
5796 "vld1.8 {d0[1]}, [%[in]], %[stride]\n" in Pack()
5797 "vld1.8 {d0[2]}, [%[in]], %[stride]\n" in Pack()
5798 "vld1.8 {d0[3]}, [%[in]], %[stride]\n" in Pack()
5799 "vld1.8 {d0[4]}, [%[in]], %[stride]\n" in Pack()
5800 "vld1.8 {d0[5]}, [%[in]], %[stride]\n" in Pack()
5801 "vld1.8 {d0[6]}, [%[in]], %[stride]\n" in Pack()
5802 "vld1.8 {d0[7]}, [%[in]], %[stride]\n" in Pack()
5803 "pld [%[in]]\n" in Pack()
5804 "vaddw.u8 q8, q8, d0\n" in Pack()
5805 "vst1.32 {d0}, [%[out]:64]!\n" in Pack()
5807 "bne 1b\n" in Pack()
5810 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
5811 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
5812 "vpaddl.u16 q8, q8\n" in Pack()
5813 "vpadd.u32 d16, d16, d17\n" in Pack()
5814 "vpadd.u32 d16, d16, d16\n" in Pack()
5815 "vmul.i32 q8, q8, d0[0]\n" in Pack()
5816 "vadd.i32 q8, q8, q1\n" in Pack()
5817 "vst1.32 {d16, d17}, [%[out]:64]\n" in Pack()
5840 "vmov.i16 q8, #0\n" in Pack()
5843 "subs %[count], %[count], #1\n" in Pack()
5844 "beq 2f\n" in Pack()
5847 "subs %[count], %[count], #8\n" in Pack()
5850 "vld1.8 {d0[0]}, [%[in]], %[stride]\n" in Pack()
5851 "vld1.8 {d0[1]}, [%[in]], %[stride]\n" in Pack()
5852 "vld1.8 {d0[2]}, [%[in]], %[stride]\n" in Pack()
5853 "vld1.8 {d0[3]}, [%[in]], %[stride]\n" in Pack()
5854 "vld1.8 {d0[4]}, [%[in]], %[stride]\n" in Pack()
5855 "vld1.8 {d0[5]}, [%[in]], %[stride]\n" in Pack()
5856 "vld1.8 {d0[6]}, [%[in]], %[stride]\n" in Pack()
5857 "vld1.8 {d0[7]}, [%[in]], %[stride]\n" in Pack()
5858 "pld [%[in]]\n" in Pack()
5859 "vaddw.u8 q8, q8, d0\n" in Pack()
5860 "vst1.32 {d0}, [%[out]:64]!\n" in Pack()
5862 "bne 1b\n" in Pack()
5867 "vmov.i8 d0, #0\n" in Pack()
5868 "vld1.8 {d0[0]}, [%[in]], %[stride]\n" in Pack()
5869 "pld [%[in]]\n" in Pack()
5870 "vaddw.u8 q8, q8, d0\n" in Pack()
5871 "vst1.32 {d0}, [%[out]:64]!\n" in Pack()
5874 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
5875 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
5876 "vpaddl.u16 q8, q8\n" in Pack()
5877 "vpadd.u32 d16, d16, d17\n" in Pack()
5878 "vpadd.u32 d16, d16, d16\n" in Pack()
5879 "vmul.i32 q8, q8, d0[0]\n" in Pack()
5880 "vadd.i32 q8, q8, q1\n" in Pack()
5881 "vst1.32 {d16, d17}, [%[out]:64]\n" in Pack()
5904 "vmov.i16 q8, #0\n" in Pack()
5907 "subs %[count], %[count], #2\n" in Pack()
5908 "beq 2f\n" in Pack()
5911 "subs %[count], %[count], #8\n" in Pack()
5914 "vld1.8 {d0[0]}, [%[in]], %[stride]\n" in Pack()
5915 "vld1.8 {d0[1]}, [%[in]], %[stride]\n" in Pack()
5916 "vld1.8 {d0[2]}, [%[in]], %[stride]\n" in Pack()
5917 "vld1.8 {d0[3]}, [%[in]], %[stride]\n" in Pack()
5918 "vld1.8 {d0[4]}, [%[in]], %[stride]\n" in Pack()
5919 "vld1.8 {d0[5]}, [%[in]], %[stride]\n" in Pack()
5920 "vld1.8 {d0[6]}, [%[in]], %[stride]\n" in Pack()
5921 "vld1.8 {d0[7]}, [%[in]], %[stride]\n" in Pack()
5922 "pld [%[in]]\n" in Pack()
5923 "vaddw.u8 q8, q8, d0\n" in Pack()
5924 "vst1.32 {d0}, [%[out]:64]!\n" in Pack()
5926 "bne 1b\n" in Pack()
5931 "vmov.i8 d0, #0\n" in Pack()
5932 "vld1.8 {d0[0]}, [%[in]], %[stride]\n" in Pack()
5933 "vld1.8 {d0[1]}, [%[in]], %[stride]\n" in Pack()
5934 "pld [%[in]]\n" in Pack()
5935 "vaddw.u8 q8, q8, d0\n" in Pack()
5936 "vst1.32 {d0}, [%[out]:64]!\n" in Pack()
5939 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
5940 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
5941 "vpaddl.u16 q8, q8\n" in Pack()
5942 "vpadd.u32 d16, d16, d17\n" in Pack()
5943 "vpadd.u32 d16, d16, d16\n" in Pack()
5944 "vmul.i32 q8, q8, d0[0]\n" in Pack()
5945 "vadd.i32 q8, q8, q1\n" in Pack()
5946 "vst1.32 {d16, d17}, [%[out]:64]\n" in Pack()
5969 "vmov.i16 q8, #0\n" in Pack()
5972 "subs %[count], %[count], #3\n" in Pack()
5973 "beq 2f\n" in Pack()
5976 "subs %[count], %[count], #8\n" in Pack()
5979 "vld1.8 {d0[0]}, [%[in]], %[stride]\n" in Pack()
5980 "vld1.8 {d0[1]}, [%[in]], %[stride]\n" in Pack()
5981 "vld1.8 {d0[2]}, [%[in]], %[stride]\n" in Pack()
5982 "vld1.8 {d0[3]}, [%[in]], %[stride]\n" in Pack()
5983 "vld1.8 {d0[4]}, [%[in]], %[stride]\n" in Pack()
5984 "vld1.8 {d0[5]}, [%[in]], %[stride]\n" in Pack()
5985 "vld1.8 {d0[6]}, [%[in]], %[stride]\n" in Pack()
5986 "vld1.8 {d0[7]}, [%[in]], %[stride]\n" in Pack()
5987 "pld [%[in]]\n" in Pack()
5988 "vaddw.u8 q8, q8, d0\n" in Pack()
5989 "vst1.32 {d0}, [%[out]:64]!\n" in Pack()
5991 "bne 1b\n" in Pack()
5996 "vmov.i8 d0, #0\n" in Pack()
5997 "vld1.8 {d0[0]}, [%[in]], %[stride]\n" in Pack()
5998 "vld1.8 {d0[1]}, [%[in]], %[stride]\n" in Pack()
5999 "vld1.8 {d0[2]}, [%[in]], %[stride]\n" in Pack()
6000 "pld [%[in]]\n" in Pack()
6001 "vaddw.u8 q8, q8, d0\n" in Pack()
6002 "vst1.32 {d0}, [%[out]:64]!\n" in Pack()
6005 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
6006 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
6007 "vpaddl.u16 q8, q8\n" in Pack()
6008 "vpadd.u32 d16, d16, d17\n" in Pack()
6009 "vpadd.u32 d16, d16, d16\n" in Pack()
6010 "vmul.i32 q8, q8, d0[0]\n" in Pack()
6011 "vadd.i32 q8, q8, q1\n" in Pack()
6012 "vst1.32 {d16, d17}, [%[out]:64]\n" in Pack()
6035 "vmov.i16 q8, #0\n" in Pack()
6038 "subs %[count], %[count], #4\n" in Pack()
6039 "beq 2f\n" in Pack()
6042 "subs %[count], %[count], #8\n" in Pack()
6045 "vld1.8 {d0[0]}, [%[in]], %[stride]\n" in Pack()
6046 "vld1.8 {d0[1]}, [%[in]], %[stride]\n" in Pack()
6047 "vld1.8 {d0[2]}, [%[in]], %[stride]\n" in Pack()
6048 "vld1.8 {d0[3]}, [%[in]], %[stride]\n" in Pack()
6049 "vld1.8 {d0[4]}, [%[in]], %[stride]\n" in Pack()
6050 "vld1.8 {d0[5]}, [%[in]], %[stride]\n" in Pack()
6051 "vld1.8 {d0[6]}, [%[in]], %[stride]\n" in Pack()
6052 "vld1.8 {d0[7]}, [%[in]], %[stride]\n" in Pack()
6053 "pld [%[in]]\n" in Pack()
6054 "vaddw.u8 q8, q8, d0\n" in Pack()
6055 "vst1.32 {d0}, [%[out]:64]!\n" in Pack()
6057 "bne 1b\n" in Pack()
6062 "vmov.i8 d0, #0\n" in Pack()
6063 "vld1.8 {d0[0]}, [%[in]], %[stride]\n" in Pack()
6064 "vld1.8 {d0[1]}, [%[in]], %[stride]\n" in Pack()
6065 "vld1.8 {d0[2]}, [%[in]], %[stride]\n" in Pack()
6066 "vld1.8 {d0[3]}, [%[in]], %[stride]\n" in Pack()
6067 "pld [%[in]]\n" in Pack()
6068 "vaddw.u8 q8, q8, d0\n" in Pack()
6069 "vst1.32 {d0}, [%[out]:64]!\n" in Pack()
6072 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
6073 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
6074 "vpaddl.u16 q8, q8\n" in Pack()
6075 "vpadd.u32 d16, d16, d17\n" in Pack()
6076 "vpadd.u32 d16, d16, d16\n" in Pack()
6077 "vmul.i32 q8, q8, d0[0]\n" in Pack()
6078 "vadd.i32 q8, q8, q1\n" in Pack()
6079 "vst1.32 {d16, d17}, [%[out]:64]\n" in Pack()
6102 "vmov.i16 q8, #0\n" in Pack()
6105 "subs %[count], %[count], #5\n" in Pack()
6106 "beq 2f\n" in Pack()
6109 "subs %[count], %[count], #8\n" in Pack()
6112 "vld1.8 {d0[0]}, [%[in]], %[stride]\n" in Pack()
6113 "vld1.8 {d0[1]}, [%[in]], %[stride]\n" in Pack()
6114 "vld1.8 {d0[2]}, [%[in]], %[stride]\n" in Pack()
6115 "vld1.8 {d0[3]}, [%[in]], %[stride]\n" in Pack()
6116 "vld1.8 {d0[4]}, [%[in]], %[stride]\n" in Pack()
6117 "vld1.8 {d0[5]}, [%[in]], %[stride]\n" in Pack()
6118 "vld1.8 {d0[6]}, [%[in]], %[stride]\n" in Pack()
6119 "vld1.8 {d0[7]}, [%[in]], %[stride]\n" in Pack()
6120 "pld [%[in]]\n" in Pack()
6121 "vaddw.u8 q8, q8, d0\n" in Pack()
6122 "vst1.32 {d0}, [%[out]:64]!\n" in Pack()
6124 "bne 1b\n" in Pack()
6129 "vmov.i8 d0, #0\n" in Pack()
6130 "vld1.8 {d0[0]}, [%[in]], %[stride]\n" in Pack()
6131 "vld1.8 {d0[1]}, [%[in]], %[stride]\n" in Pack()
6132 "vld1.8 {d0[2]}, [%[in]], %[stride]\n" in Pack()
6133 "vld1.8 {d0[3]}, [%[in]], %[stride]\n" in Pack()
6134 "vld1.8 {d0[4]}, [%[in]], %[stride]\n" in Pack()
6135 "pld [%[in]]\n" in Pack()
6136 "vaddw.u8 q8, q8, d0\n" in Pack()
6137 "vst1.32 {d0}, [%[out]:64]!\n" in Pack()
6140 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
6141 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
6142 "vpaddl.u16 q8, q8\n" in Pack()
6143 "vpadd.u32 d16, d16, d17\n" in Pack()
6144 "vpadd.u32 d16, d16, d16\n" in Pack()
6145 "vmul.i32 q8, q8, d0[0]\n" in Pack()
6146 "vadd.i32 q8, q8, q1\n" in Pack()
6147 "vst1.32 {d16, d17}, [%[out]:64]\n" in Pack()
6170 "vmov.i16 q8, #0\n" in Pack()
6173 "subs %[count], %[count], #6\n" in Pack()
6174 "beq 2f\n" in Pack()
6177 "subs %[count], %[count], #8\n" in Pack()
6180 "vld1.8 {d0[0]}, [%[in]], %[stride]\n" in Pack()
6181 "vld1.8 {d0[1]}, [%[in]], %[stride]\n" in Pack()
6182 "vld1.8 {d0[2]}, [%[in]], %[stride]\n" in Pack()
6183 "vld1.8 {d0[3]}, [%[in]], %[stride]\n" in Pack()
6184 "vld1.8 {d0[4]}, [%[in]], %[stride]\n" in Pack()
6185 "vld1.8 {d0[5]}, [%[in]], %[stride]\n" in Pack()
6186 "vld1.8 {d0[6]}, [%[in]], %[stride]\n" in Pack()
6187 "vld1.8 {d0[7]}, [%[in]], %[stride]\n" in Pack()
6188 "pld [%[in]]\n" in Pack()
6189 "vaddw.u8 q8, q8, d0\n" in Pack()
6190 "vst1.32 {d0}, [%[out]:64]!\n" in Pack()
6192 "bne 1b\n" in Pack()
6197 "vmov.i8 d0, #0\n" in Pack()
6198 "vld1.8 {d0[0]}, [%[in]], %[stride]\n" in Pack()
6199 "vld1.8 {d0[1]}, [%[in]], %[stride]\n" in Pack()
6200 "vld1.8 {d0[2]}, [%[in]], %[stride]\n" in Pack()
6201 "vld1.8 {d0[3]}, [%[in]], %[stride]\n" in Pack()
6202 "vld1.8 {d0[4]}, [%[in]], %[stride]\n" in Pack()
6203 "vld1.8 {d0[5]}, [%[in]], %[stride]\n" in Pack()
6204 "pld [%[in]]\n" in Pack()
6205 "vaddw.u8 q8, q8, d0\n" in Pack()
6206 "vst1.32 {d0}, [%[out]:64]!\n" in Pack()
6209 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
6210 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
6211 "vpaddl.u16 q8, q8\n" in Pack()
6212 "vpadd.u32 d16, d16, d17\n" in Pack()
6213 "vpadd.u32 d16, d16, d16\n" in Pack()
6214 "vmul.i32 q8, q8, d0[0]\n" in Pack()
6215 "vadd.i32 q8, q8, q1\n" in Pack()
6216 "vst1.32 {d16, d17}, [%[out]:64]\n" in Pack()
6239 "vmov.i16 q8, #0\n" in Pack()
6242 "subs %[count], %[count], #7\n" in Pack()
6243 "beq 2f\n" in Pack()
6246 "subs %[count], %[count], #8\n" in Pack()
6249 "vld1.8 {d0[0]}, [%[in]], %[stride]\n" in Pack()
6250 "vld1.8 {d0[1]}, [%[in]], %[stride]\n" in Pack()
6251 "vld1.8 {d0[2]}, [%[in]], %[stride]\n" in Pack()
6252 "vld1.8 {d0[3]}, [%[in]], %[stride]\n" in Pack()
6253 "vld1.8 {d0[4]}, [%[in]], %[stride]\n" in Pack()
6254 "vld1.8 {d0[5]}, [%[in]], %[stride]\n" in Pack()
6255 "vld1.8 {d0[6]}, [%[in]], %[stride]\n" in Pack()
6256 "vld1.8 {d0[7]}, [%[in]], %[stride]\n" in Pack()
6257 "pld [%[in]]\n" in Pack()
6258 "vaddw.u8 q8, q8, d0\n" in Pack()
6259 "vst1.32 {d0}, [%[out]:64]!\n" in Pack()
6261 "bne 1b\n" in Pack()
6266 "vmov.i8 d0, #0\n" in Pack()
6267 "vld1.8 {d0[0]}, [%[in]], %[stride]\n" in Pack()
6268 "vld1.8 {d0[1]}, [%[in]], %[stride]\n" in Pack()
6269 "vld1.8 {d0[2]}, [%[in]], %[stride]\n" in Pack()
6270 "vld1.8 {d0[3]}, [%[in]], %[stride]\n" in Pack()
6271 "vld1.8 {d0[4]}, [%[in]], %[stride]\n" in Pack()
6272 "vld1.8 {d0[5]}, [%[in]], %[stride]\n" in Pack()
6273 "vld1.8 {d0[6]}, [%[in]], %[stride]\n" in Pack()
6274 "pld [%[in]]\n" in Pack()
6275 "vaddw.u8 q8, q8, d0\n" in Pack()
6276 "vst1.32 {d0}, [%[out]:64]!\n" in Pack()
6279 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
6280 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
6281 "vpaddl.u16 q8, q8\n" in Pack()
6282 "vpadd.u32 d16, d16, d17\n" in Pack()
6283 "vpadd.u32 d16, d16, d16\n" in Pack()
6284 "vmul.i32 q8, q8, d0[0]\n" in Pack()
6285 "vadd.i32 q8, q8, q1\n" in Pack()
6286 "vst1.32 {d16, d17}, [%[out]:64]\n" in Pack()
6309 "vmov.i16 q8, #0\n" in Pack()
6310 "vmov.i16 q9, #0\n" in Pack()
6313 "subs %[count], %[count], #8\n" in Pack()
6316 "vld1.16 {d0[0]}, [%[in]], %[stride]\n" in Pack()
6317 "vld1.16 {d0[1]}, [%[in]], %[stride]\n" in Pack()
6318 "vld1.16 {d0[2]}, [%[in]], %[stride]\n" in Pack()
6319 "vld1.16 {d0[3]}, [%[in]], %[stride]\n" in Pack()
6320 "vld1.16 {d1[0]}, [%[in]], %[stride]\n" in Pack()
6321 "vld1.16 {d1[1]}, [%[in]], %[stride]\n" in Pack()
6322 "vld1.16 {d1[2]}, [%[in]], %[stride]\n" in Pack()
6323 "vld1.16 {d1[3]}, [%[in]], %[stride]\n" in Pack()
6324 "pld [%[in]]\n" in Pack()
6325 "vuzp.8 d0, d1\n" in Pack()
6326 "vaddw.u8 q8, q8, d0\n" in Pack()
6327 "vaddw.u8 q9, q9, d1\n" in Pack()
6328 "vst1.32 {d0, d1}, [%[out]:128]!\n" in Pack()
6330 "bne 1b\n" in Pack()
6333 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
6334 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
6335 "vpaddl.u16 q8, q8\n" in Pack()
6336 "vpaddl.u16 q9, q9\n" in Pack()
6337 "vpadd.u32 d16, d16, d17\n" in Pack()
6338 "vpadd.u32 d18, d18, d19\n" in Pack()
6339 "vpadd.u32 d16, d16, d18\n" in Pack()
6340 "vmul.i32 q8, q8, d0[0]\n" in Pack()
6341 "vadd.i32 q8, q8, q1\n" in Pack()
6342 "vst1.32 {d16, d17}, [%[out]:128]\n" in Pack()
6365 "vmov.i16 q8, #0\n" in Pack()
6366 "vmov.i16 q9, #0\n" in Pack()
6369 "subs %[count], %[count], #1\n" in Pack()
6370 "beq 2f\n" in Pack()
6373 "subs %[count], %[count], #8\n" in Pack()
6376 "vld1.16 {d0[0]}, [%[in]], %[stride]\n" in Pack()
6377 "vld1.16 {d0[1]}, [%[in]], %[stride]\n" in Pack()
6378 "vld1.16 {d0[2]}, [%[in]], %[stride]\n" in Pack()
6379 "vld1.16 {d0[3]}, [%[in]], %[stride]\n" in Pack()
6380 "vld1.16 {d1[0]}, [%[in]], %[stride]\n" in Pack()
6381 "vld1.16 {d1[1]}, [%[in]], %[stride]\n" in Pack()
6382 "vld1.16 {d1[2]}, [%[in]], %[stride]\n" in Pack()
6383 "vld1.16 {d1[3]}, [%[in]], %[stride]\n" in Pack()
6384 "pld [%[in]]\n" in Pack()
6385 "vuzp.8 d0, d1\n" in Pack()
6386 "vaddw.u8 q8, q8, d0\n" in Pack()
6387 "vaddw.u8 q9, q9, d1\n" in Pack()
6388 "vst1.32 {d0, d1}, [%[out]:128]!\n" in Pack()
6390 "bne 1b\n" in Pack()
6395 "vmov.i8 d0, #0\n" in Pack()
6396 "vmov.i8 d1, #0\n" in Pack()
6397 "vld1.16 {d0[0]}, [%[in]], %[stride]\n" in Pack()
6398 "pld [%[in]]\n" in Pack()
6399 "vuzp.8 d0, d1\n" in Pack()
6400 "vaddw.u8 q8, q8, d0\n" in Pack()
6401 "vaddw.u8 q9, q9, d1\n" in Pack()
6402 "vst1.32 {d0, d1}, [%[out]:128]!\n" in Pack()
6405 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
6406 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
6407 "vpaddl.u16 q8, q8\n" in Pack()
6408 "vpaddl.u16 q9, q9\n" in Pack()
6409 "vpadd.u32 d16, d16, d17\n" in Pack()
6410 "vpadd.u32 d18, d18, d19\n" in Pack()
6411 "vpadd.u32 d16, d16, d18\n" in Pack()
6412 "vmul.i32 q8, q8, d0[0]\n" in Pack()
6413 "vadd.i32 q8, q8, q1\n" in Pack()
6414 "vst1.32 {d16, d17}, [%[out]:128]\n" in Pack()
6437 "vmov.i16 q8, #0\n" in Pack()
6438 "vmov.i16 q9, #0\n" in Pack()
6441 "subs %[count], %[count], #2\n" in Pack()
6442 "beq 2f\n" in Pack()
6445 "subs %[count], %[count], #8\n" in Pack()
6448 "vld1.16 {d0[0]}, [%[in]], %[stride]\n" in Pack()
6449 "vld1.16 {d0[1]}, [%[in]], %[stride]\n" in Pack()
6450 "vld1.16 {d0[2]}, [%[in]], %[stride]\n" in Pack()
6451 "vld1.16 {d0[3]}, [%[in]], %[stride]\n" in Pack()
6452 "vld1.16 {d1[0]}, [%[in]], %[stride]\n" in Pack()
6453 "vld1.16 {d1[1]}, [%[in]], %[stride]\n" in Pack()
6454 "vld1.16 {d1[2]}, [%[in]], %[stride]\n" in Pack()
6455 "vld1.16 {d1[3]}, [%[in]], %[stride]\n" in Pack()
6456 "pld [%[in]]\n" in Pack()
6457 "vuzp.8 d0, d1\n" in Pack()
6458 "vaddw.u8 q8, q8, d0\n" in Pack()
6459 "vaddw.u8 q9, q9, d1\n" in Pack()
6460 "vst1.32 {d0, d1}, [%[out]:128]!\n" in Pack()
6462 "bne 1b\n" in Pack()
6467 "vmov.i8 d0, #0\n" in Pack()
6468 "vmov.i8 d1, #0\n" in Pack()
6469 "vld1.16 {d0[0]}, [%[in]], %[stride]\n" in Pack()
6470 "vld1.16 {d0[1]}, [%[in]], %[stride]\n" in Pack()
6471 "pld [%[in]]\n" in Pack()
6472 "vuzp.8 d0, d1\n" in Pack()
6473 "vaddw.u8 q8, q8, d0\n" in Pack()
6474 "vaddw.u8 q9, q9, d1\n" in Pack()
6475 "vst1.32 {d0, d1}, [%[out]:128]!\n" in Pack()
6478 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
6479 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
6480 "vpaddl.u16 q8, q8\n" in Pack()
6481 "vpaddl.u16 q9, q9\n" in Pack()
6482 "vpadd.u32 d16, d16, d17\n" in Pack()
6483 "vpadd.u32 d18, d18, d19\n" in Pack()
6484 "vpadd.u32 d16, d16, d18\n" in Pack()
6485 "vmul.i32 q8, q8, d0[0]\n" in Pack()
6486 "vadd.i32 q8, q8, q1\n" in Pack()
6487 "vst1.32 {d16, d17}, [%[out]:128]\n" in Pack()
6510 "vmov.i16 q8, #0\n" in Pack()
6511 "vmov.i16 q9, #0\n" in Pack()
6514 "subs %[count], %[count], #3\n" in Pack()
6515 "beq 2f\n" in Pack()
6518 "subs %[count], %[count], #8\n" in Pack()
6521 "vld1.16 {d0[0]}, [%[in]], %[stride]\n" in Pack()
6522 "vld1.16 {d0[1]}, [%[in]], %[stride]\n" in Pack()
6523 "vld1.16 {d0[2]}, [%[in]], %[stride]\n" in Pack()
6524 "vld1.16 {d0[3]}, [%[in]], %[stride]\n" in Pack()
6525 "vld1.16 {d1[0]}, [%[in]], %[stride]\n" in Pack()
6526 "vld1.16 {d1[1]}, [%[in]], %[stride]\n" in Pack()
6527 "vld1.16 {d1[2]}, [%[in]], %[stride]\n" in Pack()
6528 "vld1.16 {d1[3]}, [%[in]], %[stride]\n" in Pack()
6529 "pld [%[in]]\n" in Pack()
6530 "vuzp.8 d0, d1\n" in Pack()
6531 "vaddw.u8 q8, q8, d0\n" in Pack()
6532 "vaddw.u8 q9, q9, d1\n" in Pack()
6533 "vst1.32 {d0, d1}, [%[out]:128]!\n" in Pack()
6535 "bne 1b\n" in Pack()
6540 "vmov.i8 d0, #0\n" in Pack()
6541 "vmov.i8 d1, #0\n" in Pack()
6542 "vld1.16 {d0[0]}, [%[in]], %[stride]\n" in Pack()
6543 "vld1.16 {d0[1]}, [%[in]], %[stride]\n" in Pack()
6544 "vld1.16 {d0[2]}, [%[in]], %[stride]\n" in Pack()
6545 "pld [%[in]]\n" in Pack()
6546 "vuzp.8 d0, d1\n" in Pack()
6547 "vaddw.u8 q8, q8, d0\n" in Pack()
6548 "vaddw.u8 q9, q9, d1\n" in Pack()
6549 "vst1.32 {d0, d1}, [%[out]:128]!\n" in Pack()
6552 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
6553 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
6554 "vpaddl.u16 q8, q8\n" in Pack()
6555 "vpaddl.u16 q9, q9\n" in Pack()
6556 "vpadd.u32 d16, d16, d17\n" in Pack()
6557 "vpadd.u32 d18, d18, d19\n" in Pack()
6558 "vpadd.u32 d16, d16, d18\n" in Pack()
6559 "vmul.i32 q8, q8, d0[0]\n" in Pack()
6560 "vadd.i32 q8, q8, q1\n" in Pack()
6561 "vst1.32 {d16, d17}, [%[out]:128]\n" in Pack()
6584 "vmov.i16 q8, #0\n" in Pack()
6585 "vmov.i16 q9, #0\n" in Pack()
6588 "subs %[count], %[count], #4\n" in Pack()
6589 "beq 2f\n" in Pack()
6592 "subs %[count], %[count], #8\n" in Pack()
6595 "vld1.16 {d0[0]}, [%[in]], %[stride]\n" in Pack()
6596 "vld1.16 {d0[1]}, [%[in]], %[stride]\n" in Pack()
6597 "vld1.16 {d0[2]}, [%[in]], %[stride]\n" in Pack()
6598 "vld1.16 {d0[3]}, [%[in]], %[stride]\n" in Pack()
6599 "vld1.16 {d1[0]}, [%[in]], %[stride]\n" in Pack()
6600 "vld1.16 {d1[1]}, [%[in]], %[stride]\n" in Pack()
6601 "vld1.16 {d1[2]}, [%[in]], %[stride]\n" in Pack()
6602 "vld1.16 {d1[3]}, [%[in]], %[stride]\n" in Pack()
6603 "pld [%[in]]\n" in Pack()
6604 "vuzp.8 d0, d1\n" in Pack()
6605 "vaddw.u8 q8, q8, d0\n" in Pack()
6606 "vaddw.u8 q9, q9, d1\n" in Pack()
6607 "vst1.32 {d0, d1}, [%[out]:128]!\n" in Pack()
6609 "bne 1b\n" in Pack()
6614 "vmov.i8 d0, #0\n" in Pack()
6615 "vmov.i8 d1, #0\n" in Pack()
6616 "vld1.16 {d0[0]}, [%[in]], %[stride]\n" in Pack()
6617 "vld1.16 {d0[1]}, [%[in]], %[stride]\n" in Pack()
6618 "vld1.16 {d0[2]}, [%[in]], %[stride]\n" in Pack()
6619 "vld1.16 {d0[3]}, [%[in]], %[stride]\n" in Pack()
6620 "pld [%[in]]\n" in Pack()
6621 "vuzp.8 d0, d1\n" in Pack()
6622 "vaddw.u8 q8, q8, d0\n" in Pack()
6623 "vaddw.u8 q9, q9, d1\n" in Pack()
6624 "vst1.32 {d0, d1}, [%[out]:128]!\n" in Pack()
6627 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
6628 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
6629 "vpaddl.u16 q8, q8\n" in Pack()
6630 "vpaddl.u16 q9, q9\n" in Pack()
6631 "vpadd.u32 d16, d16, d17\n" in Pack()
6632 "vpadd.u32 d18, d18, d19\n" in Pack()
6633 "vpadd.u32 d16, d16, d18\n" in Pack()
6634 "vmul.i32 q8, q8, d0[0]\n" in Pack()
6635 "vadd.i32 q8, q8, q1\n" in Pack()
6636 "vst1.32 {d16, d17}, [%[out]:128]\n" in Pack()
6659 "vmov.i16 q8, #0\n" in Pack()
6660 "vmov.i16 q9, #0\n" in Pack()
6663 "subs %[count], %[count], #5\n" in Pack()
6664 "beq 2f\n" in Pack()
6667 "subs %[count], %[count], #8\n" in Pack()
6670 "vld1.16 {d0[0]}, [%[in]], %[stride]\n" in Pack()
6671 "vld1.16 {d0[1]}, [%[in]], %[stride]\n" in Pack()
6672 "vld1.16 {d0[2]}, [%[in]], %[stride]\n" in Pack()
6673 "vld1.16 {d0[3]}, [%[in]], %[stride]\n" in Pack()
6674 "vld1.16 {d1[0]}, [%[in]], %[stride]\n" in Pack()
6675 "vld1.16 {d1[1]}, [%[in]], %[stride]\n" in Pack()
6676 "vld1.16 {d1[2]}, [%[in]], %[stride]\n" in Pack()
6677 "vld1.16 {d1[3]}, [%[in]], %[stride]\n" in Pack()
6678 "pld [%[in]]\n" in Pack()
6679 "vuzp.8 d0, d1\n" in Pack()
6680 "vaddw.u8 q8, q8, d0\n" in Pack()
6681 "vaddw.u8 q9, q9, d1\n" in Pack()
6682 "vst1.32 {d0, d1}, [%[out]:128]!\n" in Pack()
6684 "bne 1b\n" in Pack()
6689 "vmov.i8 d0, #0\n" in Pack()
6690 "vmov.i8 d1, #0\n" in Pack()
6691 "vld1.16 {d0[0]}, [%[in]], %[stride]\n" in Pack()
6692 "vld1.16 {d0[1]}, [%[in]], %[stride]\n" in Pack()
6693 "vld1.16 {d0[2]}, [%[in]], %[stride]\n" in Pack()
6694 "vld1.16 {d0[3]}, [%[in]], %[stride]\n" in Pack()
6695 "vld1.16 {d1[0]}, [%[in]], %[stride]\n" in Pack()
6696 "pld [%[in]]\n" in Pack()
6697 "vuzp.8 d0, d1\n" in Pack()
6698 "vaddw.u8 q8, q8, d0\n" in Pack()
6699 "vaddw.u8 q9, q9, d1\n" in Pack()
6700 "vst1.32 {d0, d1}, [%[out]:128]!\n" in Pack()
6703 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
6704 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
6705 "vpaddl.u16 q8, q8\n" in Pack()
6706 "vpaddl.u16 q9, q9\n" in Pack()
6707 "vpadd.u32 d16, d16, d17\n" in Pack()
6708 "vpadd.u32 d18, d18, d19\n" in Pack()
6709 "vpadd.u32 d16, d16, d18\n" in Pack()
6710 "vmul.i32 q8, q8, d0[0]\n" in Pack()
6711 "vadd.i32 q8, q8, q1\n" in Pack()
6712 "vst1.32 {d16, d17}, [%[out]:128]\n" in Pack()
6735 "vmov.i16 q8, #0\n" in Pack()
6736 "vmov.i16 q9, #0\n" in Pack()
6739 "subs %[count], %[count], #6\n" in Pack()
6740 "beq 2f\n" in Pack()
6743 "subs %[count], %[count], #8\n" in Pack()
6746 "vld1.16 {d0[0]}, [%[in]], %[stride]\n" in Pack()
6747 "vld1.16 {d0[1]}, [%[in]], %[stride]\n" in Pack()
6748 "vld1.16 {d0[2]}, [%[in]], %[stride]\n" in Pack()
6749 "vld1.16 {d0[3]}, [%[in]], %[stride]\n" in Pack()
6750 "vld1.16 {d1[0]}, [%[in]], %[stride]\n" in Pack()
6751 "vld1.16 {d1[1]}, [%[in]], %[stride]\n" in Pack()
6752 "vld1.16 {d1[2]}, [%[in]], %[stride]\n" in Pack()
6753 "vld1.16 {d1[3]}, [%[in]], %[stride]\n" in Pack()
6754 "pld [%[in]]\n" in Pack()
6755 "vuzp.8 d0, d1\n" in Pack()
6756 "vaddw.u8 q8, q8, d0\n" in Pack()
6757 "vaddw.u8 q9, q9, d1\n" in Pack()
6758 "vst1.32 {d0, d1}, [%[out]:128]!\n" in Pack()
6760 "bne 1b\n" in Pack()
6765 "vmov.i8 d0, #0\n" in Pack()
6766 "vmov.i8 d1, #0\n" in Pack()
6767 "vld1.16 {d0[0]}, [%[in]], %[stride]\n" in Pack()
6768 "vld1.16 {d0[1]}, [%[in]], %[stride]\n" in Pack()
6769 "vld1.16 {d0[2]}, [%[in]], %[stride]\n" in Pack()
6770 "vld1.16 {d0[3]}, [%[in]], %[stride]\n" in Pack()
6771 "vld1.16 {d1[0]}, [%[in]], %[stride]\n" in Pack()
6772 "vld1.16 {d1[1]}, [%[in]], %[stride]\n" in Pack()
6773 "pld [%[in]]\n" in Pack()
6774 "vuzp.8 d0, d1\n" in Pack()
6775 "vaddw.u8 q8, q8, d0\n" in Pack()
6776 "vaddw.u8 q9, q9, d1\n" in Pack()
6777 "vst1.32 {d0, d1}, [%[out]:128]!\n" in Pack()
6780 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
6781 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
6782 "vpaddl.u16 q8, q8\n" in Pack()
6783 "vpaddl.u16 q9, q9\n" in Pack()
6784 "vpadd.u32 d16, d16, d17\n" in Pack()
6785 "vpadd.u32 d18, d18, d19\n" in Pack()
6786 "vpadd.u32 d16, d16, d18\n" in Pack()
6787 "vmul.i32 q8, q8, d0[0]\n" in Pack()
6788 "vadd.i32 q8, q8, q1\n" in Pack()
6789 "vst1.32 {d16, d17}, [%[out]:128]\n" in Pack()
6812 "vmov.i16 q8, #0\n" in Pack()
6813 "vmov.i16 q9, #0\n" in Pack()
6816 "subs %[count], %[count], #7\n" in Pack()
6817 "beq 2f\n" in Pack()
6820 "subs %[count], %[count], #8\n" in Pack()
6823 "vld1.16 {d0[0]}, [%[in]], %[stride]\n" in Pack()
6824 "vld1.16 {d0[1]}, [%[in]], %[stride]\n" in Pack()
6825 "vld1.16 {d0[2]}, [%[in]], %[stride]\n" in Pack()
6826 "vld1.16 {d0[3]}, [%[in]], %[stride]\n" in Pack()
6827 "vld1.16 {d1[0]}, [%[in]], %[stride]\n" in Pack()
6828 "vld1.16 {d1[1]}, [%[in]], %[stride]\n" in Pack()
6829 "vld1.16 {d1[2]}, [%[in]], %[stride]\n" in Pack()
6830 "vld1.16 {d1[3]}, [%[in]], %[stride]\n" in Pack()
6831 "pld [%[in]]\n" in Pack()
6832 "vuzp.8 d0, d1\n" in Pack()
6833 "vaddw.u8 q8, q8, d0\n" in Pack()
6834 "vaddw.u8 q9, q9, d1\n" in Pack()
6835 "vst1.32 {d0, d1}, [%[out]:128]!\n" in Pack()
6837 "bne 1b\n" in Pack()
6842 "vmov.i8 d0, #0\n" in Pack()
6843 "vmov.i8 d1, #0\n" in Pack()
6844 "vld1.16 {d0[0]}, [%[in]], %[stride]\n" in Pack()
6845 "vld1.16 {d0[1]}, [%[in]], %[stride]\n" in Pack()
6846 "vld1.16 {d0[2]}, [%[in]], %[stride]\n" in Pack()
6847 "vld1.16 {d0[3]}, [%[in]], %[stride]\n" in Pack()
6848 "vld1.16 {d1[0]}, [%[in]], %[stride]\n" in Pack()
6849 "vld1.16 {d1[1]}, [%[in]], %[stride]\n" in Pack()
6850 "vld1.16 {d1[2]}, [%[in]], %[stride]\n" in Pack()
6851 "pld [%[in]]\n" in Pack()
6852 "vuzp.8 d0, d1\n" in Pack()
6853 "vaddw.u8 q8, q8, d0\n" in Pack()
6854 "vaddw.u8 q9, q9, d1\n" in Pack()
6855 "vst1.32 {d0, d1}, [%[out]:128]!\n" in Pack()
6858 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
6859 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
6860 "vpaddl.u16 q8, q8\n" in Pack()
6861 "vpaddl.u16 q9, q9\n" in Pack()
6862 "vpadd.u32 d16, d16, d17\n" in Pack()
6863 "vpadd.u32 d18, d18, d19\n" in Pack()
6864 "vpadd.u32 d16, d16, d18\n" in Pack()
6865 "vmul.i32 q8, q8, d0[0]\n" in Pack()
6866 "vadd.i32 q8, q8, q1\n" in Pack()
6867 "vst1.32 {d16, d17}, [%[out]:128]\n" in Pack()
6890 "vmov.i16 q8, #0\n" in Pack()
6891 "vmov.i16 q9, #0\n" in Pack()
6892 "vmov.i16 q10, #0\n" in Pack()
6895 "subs %[count], %[count], #8\n" in Pack()
6898 "vld3.8 {d0[0], d1[0], d2[0]}, [%[in]], %[stride]\n" in Pack()
6899 "vld3.8 {d0[1], d1[1], d2[1]}, [%[in]], %[stride]\n" in Pack()
6900 "vld3.8 {d0[2], d1[2], d2[2]}, [%[in]], %[stride]\n" in Pack()
6901 "vld3.8 {d0[3], d1[3], d2[3]}, [%[in]], %[stride]\n" in Pack()
6902 "vld3.8 {d0[4], d1[4], d2[4]}, [%[in]], %[stride]\n" in Pack()
6903 "vld3.8 {d0[5], d1[5], d2[5]}, [%[in]], %[stride]\n" in Pack()
6904 "vld3.8 {d0[6], d1[6], d2[6]}, [%[in]], %[stride]\n" in Pack()
6905 "vld3.8 {d0[7], d1[7], d2[7]}, [%[in]], %[stride]\n" in Pack()
6906 "vaddw.u8 q8, q8, d0\n" in Pack()
6907 "vaddw.u8 q9, q9, d1\n" in Pack()
6908 "vaddw.u8 q10, q10, d2\n" in Pack()
6909 "vst1.32 {d0, d1, d2}, [%[out]:64]!\n" in Pack()
6911 "bne 1b\n" in Pack()
6914 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
6915 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
6916 "vpaddl.u16 q8, q8\n" in Pack()
6917 "vpaddl.u16 q9, q9\n" in Pack()
6918 "vpaddl.u16 q10, q10\n" in Pack()
6919 "vpadd.u32 d16, d16, d17\n" in Pack()
6920 "vpadd.u32 d18, d18, d19\n" in Pack()
6921 "vpadd.u32 d20, d20, d21\n" in Pack()
6922 "vpadd.u32 d16, d16, d18\n" in Pack()
6923 "vpadd.u32 d17, d20, d20\n" in Pack()
6924 "vmul.i32 q8, q8, d0[0]\n" in Pack()
6925 "vadd.i32 q8, q8, q1\n" in Pack()
6926 "vst1.32 {d16, d17}, [%[out]:64]\n" in Pack()
6950 "vmov.i16 q8, #0\n" in Pack()
6951 "vmov.i16 q9, #0\n" in Pack()
6952 "vmov.i16 q10, #0\n" in Pack()
6955 "subs %[count], %[count], #1\n" in Pack()
6956 "beq 2f\n" in Pack()
6959 "subs %[count], %[count], #8\n" in Pack()
6962 "vld3.8 {d0[0], d1[0], d2[0]}, [%[in]], %[stride]\n" in Pack()
6963 "vld3.8 {d0[1], d1[1], d2[1]}, [%[in]], %[stride]\n" in Pack()
6964 "vld3.8 {d0[2], d1[2], d2[2]}, [%[in]], %[stride]\n" in Pack()
6965 "vld3.8 {d0[3], d1[3], d2[3]}, [%[in]], %[stride]\n" in Pack()
6966 "vld3.8 {d0[4], d1[4], d2[4]}, [%[in]], %[stride]\n" in Pack()
6967 "vld3.8 {d0[5], d1[5], d2[5]}, [%[in]], %[stride]\n" in Pack()
6968 "vld3.8 {d0[6], d1[6], d2[6]}, [%[in]], %[stride]\n" in Pack()
6969 "vld3.8 {d0[7], d1[7], d2[7]}, [%[in]], %[stride]\n" in Pack()
6970 "vaddw.u8 q8, q8, d0\n" in Pack()
6971 "vaddw.u8 q9, q9, d1\n" in Pack()
6972 "vaddw.u8 q10, q10, d2\n" in Pack()
6973 "vst1.32 {d0, d1, d2}, [%[out]:64]!\n" in Pack()
6975 "bne 1b\n" in Pack()
6980 "vmov.i8 d0, #0\n" in Pack()
6981 "vmov.i8 d1, #0\n" in Pack()
6982 "vmov.i8 d2, #0\n" in Pack()
6983 "vld3.8 {d0[0], d1[0], d2[0]}, [%[in]], %[stride]\n" in Pack()
6984 "vaddw.u8 q8, q8, d0\n" in Pack()
6985 "vaddw.u8 q9, q9, d1\n" in Pack()
6986 "vaddw.u8 q10, q10, d2\n" in Pack()
6987 "vst1.32 {d0, d1, d2}, [%[out]:64]!\n" in Pack()
6990 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
6991 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
6992 "vpaddl.u16 q8, q8\n" in Pack()
6993 "vpaddl.u16 q9, q9\n" in Pack()
6994 "vpaddl.u16 q10, q10\n" in Pack()
6995 "vpadd.u32 d16, d16, d17\n" in Pack()
6996 "vpadd.u32 d18, d18, d19\n" in Pack()
6997 "vpadd.u32 d20, d20, d21\n" in Pack()
6998 "vpadd.u32 d16, d16, d18\n" in Pack()
6999 "vpadd.u32 d17, d20, d20\n" in Pack()
7000 "vmul.i32 q8, q8, d0[0]\n" in Pack()
7001 "vadd.i32 q8, q8, q1\n" in Pack()
7002 "vst1.32 {d16, d17}, [%[out]:64]\n" in Pack()
7026 "vmov.i16 q8, #0\n" in Pack()
7027 "vmov.i16 q9, #0\n" in Pack()
7028 "vmov.i16 q10, #0\n" in Pack()
7031 "subs %[count], %[count], #2\n" in Pack()
7032 "beq 2f\n" in Pack()
7035 "subs %[count], %[count], #8\n" in Pack()
7038 "vld3.8 {d0[0], d1[0], d2[0]}, [%[in]], %[stride]\n" in Pack()
7039 "vld3.8 {d0[1], d1[1], d2[1]}, [%[in]], %[stride]\n" in Pack()
7040 "vld3.8 {d0[2], d1[2], d2[2]}, [%[in]], %[stride]\n" in Pack()
7041 "vld3.8 {d0[3], d1[3], d2[3]}, [%[in]], %[stride]\n" in Pack()
7042 "vld3.8 {d0[4], d1[4], d2[4]}, [%[in]], %[stride]\n" in Pack()
7043 "vld3.8 {d0[5], d1[5], d2[5]}, [%[in]], %[stride]\n" in Pack()
7044 "vld3.8 {d0[6], d1[6], d2[6]}, [%[in]], %[stride]\n" in Pack()
7045 "vld3.8 {d0[7], d1[7], d2[7]}, [%[in]], %[stride]\n" in Pack()
7046 "vaddw.u8 q8, q8, d0\n" in Pack()
7047 "vaddw.u8 q9, q9, d1\n" in Pack()
7048 "vaddw.u8 q10, q10, d2\n" in Pack()
7049 "vst1.32 {d0, d1, d2}, [%[out]:64]!\n" in Pack()
7051 "bne 1b\n" in Pack()
7056 "vmov.i8 d0, #0\n" in Pack()
7057 "vmov.i8 d1, #0\n" in Pack()
7058 "vmov.i8 d2, #0\n" in Pack()
7059 "vld3.8 {d0[0], d1[0], d2[0]}, [%[in]], %[stride]\n" in Pack()
7060 "vld3.8 {d0[1], d1[1], d2[1]}, [%[in]], %[stride]\n" in Pack()
7061 "vaddw.u8 q8, q8, d0\n" in Pack()
7062 "vaddw.u8 q9, q9, d1\n" in Pack()
7063 "vaddw.u8 q10, q10, d2\n" in Pack()
7064 "vst1.32 {d0, d1, d2}, [%[out]:64]!\n" in Pack()
7067 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
7068 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
7069 "vpaddl.u16 q8, q8\n" in Pack()
7070 "vpaddl.u16 q9, q9\n" in Pack()
7071 "vpaddl.u16 q10, q10\n" in Pack()
7072 "vpadd.u32 d16, d16, d17\n" in Pack()
7073 "vpadd.u32 d18, d18, d19\n" in Pack()
7074 "vpadd.u32 d20, d20, d21\n" in Pack()
7075 "vpadd.u32 d16, d16, d18\n" in Pack()
7076 "vpadd.u32 d17, d20, d20\n" in Pack()
7077 "vmul.i32 q8, q8, d0[0]\n" in Pack()
7078 "vadd.i32 q8, q8, q1\n" in Pack()
7079 "vst1.32 {d16, d17}, [%[out]:64]\n" in Pack()
7103 "vmov.i16 q8, #0\n" in Pack()
7104 "vmov.i16 q9, #0\n" in Pack()
7105 "vmov.i16 q10, #0\n" in Pack()
7108 "subs %[count], %[count], #3\n" in Pack()
7109 "beq 2f\n" in Pack()
7112 "subs %[count], %[count], #8\n" in Pack()
7115 "vld3.8 {d0[0], d1[0], d2[0]}, [%[in]], %[stride]\n" in Pack()
7116 "vld3.8 {d0[1], d1[1], d2[1]}, [%[in]], %[stride]\n" in Pack()
7117 "vld3.8 {d0[2], d1[2], d2[2]}, [%[in]], %[stride]\n" in Pack()
7118 "vld3.8 {d0[3], d1[3], d2[3]}, [%[in]], %[stride]\n" in Pack()
7119 "vld3.8 {d0[4], d1[4], d2[4]}, [%[in]], %[stride]\n" in Pack()
7120 "vld3.8 {d0[5], d1[5], d2[5]}, [%[in]], %[stride]\n" in Pack()
7121 "vld3.8 {d0[6], d1[6], d2[6]}, [%[in]], %[stride]\n" in Pack()
7122 "vld3.8 {d0[7], d1[7], d2[7]}, [%[in]], %[stride]\n" in Pack()
7123 "vaddw.u8 q8, q8, d0\n" in Pack()
7124 "vaddw.u8 q9, q9, d1\n" in Pack()
7125 "vaddw.u8 q10, q10, d2\n" in Pack()
7126 "vst1.32 {d0, d1, d2}, [%[out]:64]!\n" in Pack()
7128 "bne 1b\n" in Pack()
7133 "vmov.i8 d0, #0\n" in Pack()
7134 "vmov.i8 d1, #0\n" in Pack()
7135 "vmov.i8 d2, #0\n" in Pack()
7136 "vld3.8 {d0[0], d1[0], d2[0]}, [%[in]], %[stride]\n" in Pack()
7137 "vld3.8 {d0[1], d1[1], d2[1]}, [%[in]], %[stride]\n" in Pack()
7138 "vld3.8 {d0[2], d1[2], d2[2]}, [%[in]], %[stride]\n" in Pack()
7139 "vaddw.u8 q8, q8, d0\n" in Pack()
7140 "vaddw.u8 q9, q9, d1\n" in Pack()
7141 "vaddw.u8 q10, q10, d2\n" in Pack()
7142 "vst1.32 {d0, d1, d2}, [%[out]:64]!\n" in Pack()
7145 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
7146 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
7147 "vpaddl.u16 q8, q8\n" in Pack()
7148 "vpaddl.u16 q9, q9\n" in Pack()
7149 "vpaddl.u16 q10, q10\n" in Pack()
7150 "vpadd.u32 d16, d16, d17\n" in Pack()
7151 "vpadd.u32 d18, d18, d19\n" in Pack()
7152 "vpadd.u32 d20, d20, d21\n" in Pack()
7153 "vpadd.u32 d16, d16, d18\n" in Pack()
7154 "vpadd.u32 d17, d20, d20\n" in Pack()
7155 "vmul.i32 q8, q8, d0[0]\n" in Pack()
7156 "vadd.i32 q8, q8, q1\n" in Pack()
7157 "vst1.32 {d16, d17}, [%[out]:64]\n" in Pack()
7181 "vmov.i16 q8, #0\n" in Pack()
7182 "vmov.i16 q9, #0\n" in Pack()
7183 "vmov.i16 q10, #0\n" in Pack()
7186 "subs %[count], %[count], #4\n" in Pack()
7187 "beq 2f\n" in Pack()
7190 "subs %[count], %[count], #8\n" in Pack()
7193 "vld3.8 {d0[0], d1[0], d2[0]}, [%[in]], %[stride]\n" in Pack()
7194 "vld3.8 {d0[1], d1[1], d2[1]}, [%[in]], %[stride]\n" in Pack()
7195 "vld3.8 {d0[2], d1[2], d2[2]}, [%[in]], %[stride]\n" in Pack()
7196 "vld3.8 {d0[3], d1[3], d2[3]}, [%[in]], %[stride]\n" in Pack()
7197 "vld3.8 {d0[4], d1[4], d2[4]}, [%[in]], %[stride]\n" in Pack()
7198 "vld3.8 {d0[5], d1[5], d2[5]}, [%[in]], %[stride]\n" in Pack()
7199 "vld3.8 {d0[6], d1[6], d2[6]}, [%[in]], %[stride]\n" in Pack()
7200 "vld3.8 {d0[7], d1[7], d2[7]}, [%[in]], %[stride]\n" in Pack()
7201 "vaddw.u8 q8, q8, d0\n" in Pack()
7202 "vaddw.u8 q9, q9, d1\n" in Pack()
7203 "vaddw.u8 q10, q10, d2\n" in Pack()
7204 "vst1.32 {d0, d1, d2}, [%[out]:64]!\n" in Pack()
7206 "bne 1b\n" in Pack()
7211 "vmov.i8 d0, #0\n" in Pack()
7212 "vmov.i8 d1, #0\n" in Pack()
7213 "vmov.i8 d2, #0\n" in Pack()
7214 "vld3.8 {d0[0], d1[0], d2[0]}, [%[in]], %[stride]\n" in Pack()
7215 "vld3.8 {d0[1], d1[1], d2[1]}, [%[in]], %[stride]\n" in Pack()
7216 "vld3.8 {d0[2], d1[2], d2[2]}, [%[in]], %[stride]\n" in Pack()
7217 "vld3.8 {d0[3], d1[3], d2[3]}, [%[in]], %[stride]\n" in Pack()
7218 "vaddw.u8 q8, q8, d0\n" in Pack()
7219 "vaddw.u8 q9, q9, d1\n" in Pack()
7220 "vaddw.u8 q10, q10, d2\n" in Pack()
7221 "vst1.32 {d0, d1, d2}, [%[out]:64]!\n" in Pack()
7224 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
7225 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
7226 "vpaddl.u16 q8, q8\n" in Pack()
7227 "vpaddl.u16 q9, q9\n" in Pack()
7228 "vpaddl.u16 q10, q10\n" in Pack()
7229 "vpadd.u32 d16, d16, d17\n" in Pack()
7230 "vpadd.u32 d18, d18, d19\n" in Pack()
7231 "vpadd.u32 d20, d20, d21\n" in Pack()
7232 "vpadd.u32 d16, d16, d18\n" in Pack()
7233 "vpadd.u32 d17, d20, d20\n" in Pack()
7234 "vmul.i32 q8, q8, d0[0]\n" in Pack()
7235 "vadd.i32 q8, q8, q1\n" in Pack()
7236 "vst1.32 {d16, d17}, [%[out]:64]\n" in Pack()
7260 "vmov.i16 q8, #0\n" in Pack()
7261 "vmov.i16 q9, #0\n" in Pack()
7262 "vmov.i16 q10, #0\n" in Pack()
7265 "subs %[count], %[count], #5\n" in Pack()
7266 "beq 2f\n" in Pack()
7269 "subs %[count], %[count], #8\n" in Pack()
7272 "vld3.8 {d0[0], d1[0], d2[0]}, [%[in]], %[stride]\n" in Pack()
7273 "vld3.8 {d0[1], d1[1], d2[1]}, [%[in]], %[stride]\n" in Pack()
7274 "vld3.8 {d0[2], d1[2], d2[2]}, [%[in]], %[stride]\n" in Pack()
7275 "vld3.8 {d0[3], d1[3], d2[3]}, [%[in]], %[stride]\n" in Pack()
7276 "vld3.8 {d0[4], d1[4], d2[4]}, [%[in]], %[stride]\n" in Pack()
7277 "vld3.8 {d0[5], d1[5], d2[5]}, [%[in]], %[stride]\n" in Pack()
7278 "vld3.8 {d0[6], d1[6], d2[6]}, [%[in]], %[stride]\n" in Pack()
7279 "vld3.8 {d0[7], d1[7], d2[7]}, [%[in]], %[stride]\n" in Pack()
7280 "vaddw.u8 q8, q8, d0\n" in Pack()
7281 "vaddw.u8 q9, q9, d1\n" in Pack()
7282 "vaddw.u8 q10, q10, d2\n" in Pack()
7283 "vst1.32 {d0, d1, d2}, [%[out]:64]!\n" in Pack()
7285 "bne 1b\n" in Pack()
7290 "vmov.i8 d0, #0\n" in Pack()
7291 "vmov.i8 d1, #0\n" in Pack()
7292 "vmov.i8 d2, #0\n" in Pack()
7293 "vld3.8 {d0[0], d1[0], d2[0]}, [%[in]], %[stride]\n" in Pack()
7294 "vld3.8 {d0[1], d1[1], d2[1]}, [%[in]], %[stride]\n" in Pack()
7295 "vld3.8 {d0[2], d1[2], d2[2]}, [%[in]], %[stride]\n" in Pack()
7296 "vld3.8 {d0[3], d1[3], d2[3]}, [%[in]], %[stride]\n" in Pack()
7297 "vld3.8 {d0[4], d1[4], d2[4]}, [%[in]], %[stride]\n" in Pack()
7298 "vaddw.u8 q8, q8, d0\n" in Pack()
7299 "vaddw.u8 q9, q9, d1\n" in Pack()
7300 "vaddw.u8 q10, q10, d2\n" in Pack()
7301 "vst1.32 {d0, d1, d2}, [%[out]:64]!\n" in Pack()
7304 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
7305 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
7306 "vpaddl.u16 q8, q8\n" in Pack()
7307 "vpaddl.u16 q9, q9\n" in Pack()
7308 "vpaddl.u16 q10, q10\n" in Pack()
7309 "vpadd.u32 d16, d16, d17\n" in Pack()
7310 "vpadd.u32 d18, d18, d19\n" in Pack()
7311 "vpadd.u32 d20, d20, d21\n" in Pack()
7312 "vpadd.u32 d16, d16, d18\n" in Pack()
7313 "vpadd.u32 d17, d20, d20\n" in Pack()
7314 "vmul.i32 q8, q8, d0[0]\n" in Pack()
7315 "vadd.i32 q8, q8, q1\n" in Pack()
7316 "vst1.32 {d16, d17}, [%[out]:64]\n" in Pack()
7340 "vmov.i16 q8, #0\n" in Pack()
7341 "vmov.i16 q9, #0\n" in Pack()
7342 "vmov.i16 q10, #0\n" in Pack()
7345 "subs %[count], %[count], #6\n" in Pack()
7346 "beq 2f\n" in Pack()
7349 "subs %[count], %[count], #8\n" in Pack()
7352 "vld3.8 {d0[0], d1[0], d2[0]}, [%[in]], %[stride]\n" in Pack()
7353 "vld3.8 {d0[1], d1[1], d2[1]}, [%[in]], %[stride]\n" in Pack()
7354 "vld3.8 {d0[2], d1[2], d2[2]}, [%[in]], %[stride]\n" in Pack()
7355 "vld3.8 {d0[3], d1[3], d2[3]}, [%[in]], %[stride]\n" in Pack()
7356 "vld3.8 {d0[4], d1[4], d2[4]}, [%[in]], %[stride]\n" in Pack()
7357 "vld3.8 {d0[5], d1[5], d2[5]}, [%[in]], %[stride]\n" in Pack()
7358 "vld3.8 {d0[6], d1[6], d2[6]}, [%[in]], %[stride]\n" in Pack()
7359 "vld3.8 {d0[7], d1[7], d2[7]}, [%[in]], %[stride]\n" in Pack()
7360 "vaddw.u8 q8, q8, d0\n" in Pack()
7361 "vaddw.u8 q9, q9, d1\n" in Pack()
7362 "vaddw.u8 q10, q10, d2\n" in Pack()
7363 "vst1.32 {d0, d1, d2}, [%[out]:64]!\n" in Pack()
7365 "bne 1b\n" in Pack()
7370 "vmov.i8 d0, #0\n" in Pack()
7371 "vmov.i8 d1, #0\n" in Pack()
7372 "vmov.i8 d2, #0\n" in Pack()
7373 "vld3.8 {d0[0], d1[0], d2[0]}, [%[in]], %[stride]\n" in Pack()
7374 "vld3.8 {d0[1], d1[1], d2[1]}, [%[in]], %[stride]\n" in Pack()
7375 "vld3.8 {d0[2], d1[2], d2[2]}, [%[in]], %[stride]\n" in Pack()
7376 "vld3.8 {d0[3], d1[3], d2[3]}, [%[in]], %[stride]\n" in Pack()
7377 "vld3.8 {d0[4], d1[4], d2[4]}, [%[in]], %[stride]\n" in Pack()
7378 "vld3.8 {d0[5], d1[5], d2[5]}, [%[in]], %[stride]\n" in Pack()
7379 "vaddw.u8 q8, q8, d0\n" in Pack()
7380 "vaddw.u8 q9, q9, d1\n" in Pack()
7381 "vaddw.u8 q10, q10, d2\n" in Pack()
7382 "vst1.32 {d0, d1, d2}, [%[out]:64]!\n" in Pack()
7385 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
7386 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
7387 "vpaddl.u16 q8, q8\n" in Pack()
7388 "vpaddl.u16 q9, q9\n" in Pack()
7389 "vpaddl.u16 q10, q10\n" in Pack()
7390 "vpadd.u32 d16, d16, d17\n" in Pack()
7391 "vpadd.u32 d18, d18, d19\n" in Pack()
7392 "vpadd.u32 d20, d20, d21\n" in Pack()
7393 "vpadd.u32 d16, d16, d18\n" in Pack()
7394 "vpadd.u32 d17, d20, d20\n" in Pack()
7395 "vmul.i32 q8, q8, d0[0]\n" in Pack()
7396 "vadd.i32 q8, q8, q1\n" in Pack()
7397 "vst1.32 {d16, d17}, [%[out]:64]\n" in Pack()
7421 "vmov.i16 q8, #0\n" in Pack()
7422 "vmov.i16 q9, #0\n" in Pack()
7423 "vmov.i16 q10, #0\n" in Pack()
7426 "subs %[count], %[count], #7\n" in Pack()
7427 "beq 2f\n" in Pack()
7430 "subs %[count], %[count], #8\n" in Pack()
7433 "vld3.8 {d0[0], d1[0], d2[0]}, [%[in]], %[stride]\n" in Pack()
7434 "vld3.8 {d0[1], d1[1], d2[1]}, [%[in]], %[stride]\n" in Pack()
7435 "vld3.8 {d0[2], d1[2], d2[2]}, [%[in]], %[stride]\n" in Pack()
7436 "vld3.8 {d0[3], d1[3], d2[3]}, [%[in]], %[stride]\n" in Pack()
7437 "vld3.8 {d0[4], d1[4], d2[4]}, [%[in]], %[stride]\n" in Pack()
7438 "vld3.8 {d0[5], d1[5], d2[5]}, [%[in]], %[stride]\n" in Pack()
7439 "vld3.8 {d0[6], d1[6], d2[6]}, [%[in]], %[stride]\n" in Pack()
7440 "vld3.8 {d0[7], d1[7], d2[7]}, [%[in]], %[stride]\n" in Pack()
7441 "vaddw.u8 q8, q8, d0\n" in Pack()
7442 "vaddw.u8 q9, q9, d1\n" in Pack()
7443 "vaddw.u8 q10, q10, d2\n" in Pack()
7444 "vst1.32 {d0, d1, d2}, [%[out]:64]!\n" in Pack()
7446 "bne 1b\n" in Pack()
7451 "vmov.i8 d0, #0\n" in Pack()
7452 "vmov.i8 d1, #0\n" in Pack()
7453 "vmov.i8 d2, #0\n" in Pack()
7454 "vld3.8 {d0[0], d1[0], d2[0]}, [%[in]], %[stride]\n" in Pack()
7455 "vld3.8 {d0[1], d1[1], d2[1]}, [%[in]], %[stride]\n" in Pack()
7456 "vld3.8 {d0[2], d1[2], d2[2]}, [%[in]], %[stride]\n" in Pack()
7457 "vld3.8 {d0[3], d1[3], d2[3]}, [%[in]], %[stride]\n" in Pack()
7458 "vld3.8 {d0[4], d1[4], d2[4]}, [%[in]], %[stride]\n" in Pack()
7459 "vld3.8 {d0[5], d1[5], d2[5]}, [%[in]], %[stride]\n" in Pack()
7460 "vld3.8 {d0[6], d1[6], d2[6]}, [%[in]], %[stride]\n" in Pack()
7461 "vaddw.u8 q8, q8, d0\n" in Pack()
7462 "vaddw.u8 q9, q9, d1\n" in Pack()
7463 "vaddw.u8 q10, q10, d2\n" in Pack()
7464 "vst1.32 {d0, d1, d2}, [%[out]:64]!\n" in Pack()
7467 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
7468 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
7469 "vpaddl.u16 q8, q8\n" in Pack()
7470 "vpaddl.u16 q9, q9\n" in Pack()
7471 "vpaddl.u16 q10, q10\n" in Pack()
7472 "vpadd.u32 d16, d16, d17\n" in Pack()
7473 "vpadd.u32 d18, d18, d19\n" in Pack()
7474 "vpadd.u32 d20, d20, d21\n" in Pack()
7475 "vpadd.u32 d16, d16, d18\n" in Pack()
7476 "vpadd.u32 d17, d20, d20\n" in Pack()
7477 "vmul.i32 q8, q8, d0[0]\n" in Pack()
7478 "vadd.i32 q8, q8, q1\n" in Pack()
7479 "vst1.32 {d16, d17}, [%[out]:64]\n" in Pack()
7503 "vmov.i16 q8, #0\n" in Pack()
7504 "vmov.i16 q9, #0\n" in Pack()
7505 "vmov.i16 q10, #0\n" in Pack()
7506 "vmov.i16 q11, #0\n" in Pack()
7509 "subs %[count], %[count], #8\n" in Pack()
7512 "vld1.32 {d0[0]}, [%[in]], %[stride]\n" in Pack()
7513 "vld1.32 {d1[0]}, [%[in]], %[stride]\n" in Pack()
7514 "vld1.32 {d2[0]}, [%[in]], %[stride]\n" in Pack()
7515 "vld1.32 {d3[0]}, [%[in]], %[stride]\n" in Pack()
7516 "vld1.32 {d0[1]}, [%[in]], %[stride]\n" in Pack()
7517 "vld1.32 {d1[1]}, [%[in]], %[stride]\n" in Pack()
7518 "vld1.32 {d2[1]}, [%[in]], %[stride]\n" in Pack()
7519 "vld1.32 {d3[1]}, [%[in]], %[stride]\n" in Pack()
7520 "pld [%[in]]\n" in Pack()
7521 "vtrn.16 d0, d2\n" in Pack()
7522 "vtrn.16 d1, d3\n" in Pack()
7523 "vtrn.8 d0, d1\n" in Pack()
7524 "vtrn.8 d2, d3\n" in Pack()
7525 "vaddw.u8 q8, q8, d0\n" in Pack()
7526 "vaddw.u8 q9, q9, d1\n" in Pack()
7527 "vaddw.u8 q10, q10, d2\n" in Pack()
7528 "vaddw.u8 q11, q11, d3\n" in Pack()
7529 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
7531 "bne 1b\n" in Pack()
7534 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
7535 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
7536 "vpaddl.u16 q8, q8\n" in Pack()
7537 "vpaddl.u16 q9, q9\n" in Pack()
7538 "vpaddl.u16 q10, q10\n" in Pack()
7539 "vpaddl.u16 q11, q11\n" in Pack()
7540 "vpadd.u32 d16, d16, d17\n" in Pack()
7541 "vpadd.u32 d18, d18, d19\n" in Pack()
7542 "vpadd.u32 d20, d20, d21\n" in Pack()
7543 "vpadd.u32 d22, d22, d23\n" in Pack()
7544 "vpadd.u32 d16, d16, d18\n" in Pack()
7545 "vpadd.u32 d17, d20, d22\n" in Pack()
7546 "vmul.i32 q8, q8, d0[0]\n" in Pack()
7547 "vadd.i32 q8, q8, q1\n" in Pack()
7548 "vst1.32 {d16, d17}, [%[out]:128]\n" in Pack()
7572 "vmov.i16 q8, #0\n" in Pack()
7573 "vmov.i16 q9, #0\n" in Pack()
7574 "vmov.i16 q10, #0\n" in Pack()
7575 "vmov.i16 q11, #0\n" in Pack()
7578 "subs %[count], %[count], #1\n" in Pack()
7579 "beq 2f\n" in Pack()
7582 "subs %[count], %[count], #8\n" in Pack()
7585 "vld1.32 {d0[0]}, [%[in]], %[stride]\n" in Pack()
7586 "vld1.32 {d1[0]}, [%[in]], %[stride]\n" in Pack()
7587 "vld1.32 {d2[0]}, [%[in]], %[stride]\n" in Pack()
7588 "vld1.32 {d3[0]}, [%[in]], %[stride]\n" in Pack()
7589 "vld1.32 {d0[1]}, [%[in]], %[stride]\n" in Pack()
7590 "vld1.32 {d1[1]}, [%[in]], %[stride]\n" in Pack()
7591 "vld1.32 {d2[1]}, [%[in]], %[stride]\n" in Pack()
7592 "vld1.32 {d3[1]}, [%[in]], %[stride]\n" in Pack()
7593 "pld [%[in]]\n" in Pack()
7594 "vtrn.16 d0, d2\n" in Pack()
7595 "vtrn.16 d1, d3\n" in Pack()
7596 "vtrn.8 d0, d1\n" in Pack()
7597 "vtrn.8 d2, d3\n" in Pack()
7598 "vaddw.u8 q8, q8, d0\n" in Pack()
7599 "vaddw.u8 q9, q9, d1\n" in Pack()
7600 "vaddw.u8 q10, q10, d2\n" in Pack()
7601 "vaddw.u8 q11, q11, d3\n" in Pack()
7602 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
7604 "bne 1b\n" in Pack()
7609 "vmov.i8 d0, #0\n" in Pack()
7610 "vmov.i8 d1, #0\n" in Pack()
7611 "vmov.i8 d2, #0\n" in Pack()
7612 "vmov.i8 d3, #0\n" in Pack()
7613 "vld1.32 {d0[0]}, [%[in]], %[stride]\n" in Pack()
7614 "pld [%[in]]\n" in Pack()
7615 "vtrn.16 d0, d2\n" in Pack()
7616 "vtrn.16 d1, d3\n" in Pack()
7617 "vtrn.8 d0, d1\n" in Pack()
7618 "vtrn.8 d2, d3\n" in Pack()
7619 "vaddw.u8 q8, q8, d0\n" in Pack()
7620 "vaddw.u8 q9, q9, d1\n" in Pack()
7621 "vaddw.u8 q10, q10, d2\n" in Pack()
7622 "vaddw.u8 q11, q11, d3\n" in Pack()
7623 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
7626 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
7627 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
7628 "vpaddl.u16 q8, q8\n" in Pack()
7629 "vpaddl.u16 q9, q9\n" in Pack()
7630 "vpaddl.u16 q10, q10\n" in Pack()
7631 "vpaddl.u16 q11, q11\n" in Pack()
7632 "vpadd.u32 d16, d16, d17\n" in Pack()
7633 "vpadd.u32 d18, d18, d19\n" in Pack()
7634 "vpadd.u32 d20, d20, d21\n" in Pack()
7635 "vpadd.u32 d22, d22, d23\n" in Pack()
7636 "vpadd.u32 d16, d16, d18\n" in Pack()
7637 "vpadd.u32 d17, d20, d22\n" in Pack()
7638 "vmul.i32 q8, q8, d0[0]\n" in Pack()
7639 "vadd.i32 q8, q8, q1\n" in Pack()
7640 "vst1.32 {d16, d17}, [%[out]:128]\n" in Pack()
7664 "vmov.i16 q8, #0\n" in Pack()
7665 "vmov.i16 q9, #0\n" in Pack()
7666 "vmov.i16 q10, #0\n" in Pack()
7667 "vmov.i16 q11, #0\n" in Pack()
7670 "subs %[count], %[count], #2\n" in Pack()
7671 "beq 2f\n" in Pack()
7674 "subs %[count], %[count], #8\n" in Pack()
7677 "vld1.32 {d0[0]}, [%[in]], %[stride]\n" in Pack()
7678 "vld1.32 {d1[0]}, [%[in]], %[stride]\n" in Pack()
7679 "vld1.32 {d2[0]}, [%[in]], %[stride]\n" in Pack()
7680 "vld1.32 {d3[0]}, [%[in]], %[stride]\n" in Pack()
7681 "vld1.32 {d0[1]}, [%[in]], %[stride]\n" in Pack()
7682 "vld1.32 {d1[1]}, [%[in]], %[stride]\n" in Pack()
7683 "vld1.32 {d2[1]}, [%[in]], %[stride]\n" in Pack()
7684 "vld1.32 {d3[1]}, [%[in]], %[stride]\n" in Pack()
7685 "pld [%[in]]\n" in Pack()
7686 "vtrn.16 d0, d2\n" in Pack()
7687 "vtrn.16 d1, d3\n" in Pack()
7688 "vtrn.8 d0, d1\n" in Pack()
7689 "vtrn.8 d2, d3\n" in Pack()
7690 "vaddw.u8 q8, q8, d0\n" in Pack()
7691 "vaddw.u8 q9, q9, d1\n" in Pack()
7692 "vaddw.u8 q10, q10, d2\n" in Pack()
7693 "vaddw.u8 q11, q11, d3\n" in Pack()
7694 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
7696 "bne 1b\n" in Pack()
7701 "vmov.i8 d0, #0\n" in Pack()
7702 "vmov.i8 d1, #0\n" in Pack()
7703 "vmov.i8 d2, #0\n" in Pack()
7704 "vmov.i8 d3, #0\n" in Pack()
7705 "vld1.32 {d0[0]}, [%[in]], %[stride]\n" in Pack()
7706 "vld1.32 {d1[0]}, [%[in]], %[stride]\n" in Pack()
7707 "pld [%[in]]\n" in Pack()
7708 "vtrn.16 d0, d2\n" in Pack()
7709 "vtrn.16 d1, d3\n" in Pack()
7710 "vtrn.8 d0, d1\n" in Pack()
7711 "vtrn.8 d2, d3\n" in Pack()
7712 "vaddw.u8 q8, q8, d0\n" in Pack()
7713 "vaddw.u8 q9, q9, d1\n" in Pack()
7714 "vaddw.u8 q10, q10, d2\n" in Pack()
7715 "vaddw.u8 q11, q11, d3\n" in Pack()
7716 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
7719 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
7720 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
7721 "vpaddl.u16 q8, q8\n" in Pack()
7722 "vpaddl.u16 q9, q9\n" in Pack()
7723 "vpaddl.u16 q10, q10\n" in Pack()
7724 "vpaddl.u16 q11, q11\n" in Pack()
7725 "vpadd.u32 d16, d16, d17\n" in Pack()
7726 "vpadd.u32 d18, d18, d19\n" in Pack()
7727 "vpadd.u32 d20, d20, d21\n" in Pack()
7728 "vpadd.u32 d22, d22, d23\n" in Pack()
7729 "vpadd.u32 d16, d16, d18\n" in Pack()
7730 "vpadd.u32 d17, d20, d22\n" in Pack()
7731 "vmul.i32 q8, q8, d0[0]\n" in Pack()
7732 "vadd.i32 q8, q8, q1\n" in Pack()
7733 "vst1.32 {d16, d17}, [%[out]:128]\n" in Pack()
7757 "vmov.i16 q8, #0\n" in Pack()
7758 "vmov.i16 q9, #0\n" in Pack()
7759 "vmov.i16 q10, #0\n" in Pack()
7760 "vmov.i16 q11, #0\n" in Pack()
7763 "subs %[count], %[count], #3\n" in Pack()
7764 "beq 2f\n" in Pack()
7767 "subs %[count], %[count], #8\n" in Pack()
7770 "vld1.32 {d0[0]}, [%[in]], %[stride]\n" in Pack()
7771 "vld1.32 {d1[0]}, [%[in]], %[stride]\n" in Pack()
7772 "vld1.32 {d2[0]}, [%[in]], %[stride]\n" in Pack()
7773 "vld1.32 {d3[0]}, [%[in]], %[stride]\n" in Pack()
7774 "vld1.32 {d0[1]}, [%[in]], %[stride]\n" in Pack()
7775 "vld1.32 {d1[1]}, [%[in]], %[stride]\n" in Pack()
7776 "vld1.32 {d2[1]}, [%[in]], %[stride]\n" in Pack()
7777 "vld1.32 {d3[1]}, [%[in]], %[stride]\n" in Pack()
7778 "pld [%[in]]\n" in Pack()
7779 "vtrn.16 d0, d2\n" in Pack()
7780 "vtrn.16 d1, d3\n" in Pack()
7781 "vtrn.8 d0, d1\n" in Pack()
7782 "vtrn.8 d2, d3\n" in Pack()
7783 "vaddw.u8 q8, q8, d0\n" in Pack()
7784 "vaddw.u8 q9, q9, d1\n" in Pack()
7785 "vaddw.u8 q10, q10, d2\n" in Pack()
7786 "vaddw.u8 q11, q11, d3\n" in Pack()
7787 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
7789 "bne 1b\n" in Pack()
7794 "vmov.i8 d0, #0\n" in Pack()
7795 "vmov.i8 d1, #0\n" in Pack()
7796 "vmov.i8 d2, #0\n" in Pack()
7797 "vmov.i8 d3, #0\n" in Pack()
7798 "vld1.32 {d0[0]}, [%[in]], %[stride]\n" in Pack()
7799 "vld1.32 {d1[0]}, [%[in]], %[stride]\n" in Pack()
7800 "vld1.32 {d2[0]}, [%[in]], %[stride]\n" in Pack()
7801 "pld [%[in]]\n" in Pack()
7802 "vtrn.16 d0, d2\n" in Pack()
7803 "vtrn.16 d1, d3\n" in Pack()
7804 "vtrn.8 d0, d1\n" in Pack()
7805 "vtrn.8 d2, d3\n" in Pack()
7806 "vaddw.u8 q8, q8, d0\n" in Pack()
7807 "vaddw.u8 q9, q9, d1\n" in Pack()
7808 "vaddw.u8 q10, q10, d2\n" in Pack()
7809 "vaddw.u8 q11, q11, d3\n" in Pack()
7810 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
7813 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
7814 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
7815 "vpaddl.u16 q8, q8\n" in Pack()
7816 "vpaddl.u16 q9, q9\n" in Pack()
7817 "vpaddl.u16 q10, q10\n" in Pack()
7818 "vpaddl.u16 q11, q11\n" in Pack()
7819 "vpadd.u32 d16, d16, d17\n" in Pack()
7820 "vpadd.u32 d18, d18, d19\n" in Pack()
7821 "vpadd.u32 d20, d20, d21\n" in Pack()
7822 "vpadd.u32 d22, d22, d23\n" in Pack()
7823 "vpadd.u32 d16, d16, d18\n" in Pack()
7824 "vpadd.u32 d17, d20, d22\n" in Pack()
7825 "vmul.i32 q8, q8, d0[0]\n" in Pack()
7826 "vadd.i32 q8, q8, q1\n" in Pack()
7827 "vst1.32 {d16, d17}, [%[out]:128]\n" in Pack()
7851 "vmov.i16 q8, #0\n" in Pack()
7852 "vmov.i16 q9, #0\n" in Pack()
7853 "vmov.i16 q10, #0\n" in Pack()
7854 "vmov.i16 q11, #0\n" in Pack()
7857 "subs %[count], %[count], #4\n" in Pack()
7858 "beq 2f\n" in Pack()
7861 "subs %[count], %[count], #8\n" in Pack()
7864 "vld1.32 {d0[0]}, [%[in]], %[stride]\n" in Pack()
7865 "vld1.32 {d1[0]}, [%[in]], %[stride]\n" in Pack()
7866 "vld1.32 {d2[0]}, [%[in]], %[stride]\n" in Pack()
7867 "vld1.32 {d3[0]}, [%[in]], %[stride]\n" in Pack()
7868 "vld1.32 {d0[1]}, [%[in]], %[stride]\n" in Pack()
7869 "vld1.32 {d1[1]}, [%[in]], %[stride]\n" in Pack()
7870 "vld1.32 {d2[1]}, [%[in]], %[stride]\n" in Pack()
7871 "vld1.32 {d3[1]}, [%[in]], %[stride]\n" in Pack()
7872 "pld [%[in]]\n" in Pack()
7873 "vtrn.16 d0, d2\n" in Pack()
7874 "vtrn.16 d1, d3\n" in Pack()
7875 "vtrn.8 d0, d1\n" in Pack()
7876 "vtrn.8 d2, d3\n" in Pack()
7877 "vaddw.u8 q8, q8, d0\n" in Pack()
7878 "vaddw.u8 q9, q9, d1\n" in Pack()
7879 "vaddw.u8 q10, q10, d2\n" in Pack()
7880 "vaddw.u8 q11, q11, d3\n" in Pack()
7881 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
7883 "bne 1b\n" in Pack()
7888 "vmov.i8 d0, #0\n" in Pack()
7889 "vmov.i8 d1, #0\n" in Pack()
7890 "vmov.i8 d2, #0\n" in Pack()
7891 "vmov.i8 d3, #0\n" in Pack()
7892 "vld1.32 {d0[0]}, [%[in]], %[stride]\n" in Pack()
7893 "vld1.32 {d1[0]}, [%[in]], %[stride]\n" in Pack()
7894 "vld1.32 {d2[0]}, [%[in]], %[stride]\n" in Pack()
7895 "vld1.32 {d3[0]}, [%[in]], %[stride]\n" in Pack()
7896 "pld [%[in]]\n" in Pack()
7897 "vtrn.16 d0, d2\n" in Pack()
7898 "vtrn.16 d1, d3\n" in Pack()
7899 "vtrn.8 d0, d1\n" in Pack()
7900 "vtrn.8 d2, d3\n" in Pack()
7901 "vaddw.u8 q8, q8, d0\n" in Pack()
7902 "vaddw.u8 q9, q9, d1\n" in Pack()
7903 "vaddw.u8 q10, q10, d2\n" in Pack()
7904 "vaddw.u8 q11, q11, d3\n" in Pack()
7905 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
7908 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
7909 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
7910 "vpaddl.u16 q8, q8\n" in Pack()
7911 "vpaddl.u16 q9, q9\n" in Pack()
7912 "vpaddl.u16 q10, q10\n" in Pack()
7913 "vpaddl.u16 q11, q11\n" in Pack()
7914 "vpadd.u32 d16, d16, d17\n" in Pack()
7915 "vpadd.u32 d18, d18, d19\n" in Pack()
7916 "vpadd.u32 d20, d20, d21\n" in Pack()
7917 "vpadd.u32 d22, d22, d23\n" in Pack()
7918 "vpadd.u32 d16, d16, d18\n" in Pack()
7919 "vpadd.u32 d17, d20, d22\n" in Pack()
7920 "vmul.i32 q8, q8, d0[0]\n" in Pack()
7921 "vadd.i32 q8, q8, q1\n" in Pack()
7922 "vst1.32 {d16, d17}, [%[out]:128]\n" in Pack()
7946 "vmov.i16 q8, #0\n" in Pack()
7947 "vmov.i16 q9, #0\n" in Pack()
7948 "vmov.i16 q10, #0\n" in Pack()
7949 "vmov.i16 q11, #0\n" in Pack()
7952 "subs %[count], %[count], #5\n" in Pack()
7953 "beq 2f\n" in Pack()
7956 "subs %[count], %[count], #8\n" in Pack()
7959 "vld1.32 {d0[0]}, [%[in]], %[stride]\n" in Pack()
7960 "vld1.32 {d1[0]}, [%[in]], %[stride]\n" in Pack()
7961 "vld1.32 {d2[0]}, [%[in]], %[stride]\n" in Pack()
7962 "vld1.32 {d3[0]}, [%[in]], %[stride]\n" in Pack()
7963 "vld1.32 {d0[1]}, [%[in]], %[stride]\n" in Pack()
7964 "vld1.32 {d1[1]}, [%[in]], %[stride]\n" in Pack()
7965 "vld1.32 {d2[1]}, [%[in]], %[stride]\n" in Pack()
7966 "vld1.32 {d3[1]}, [%[in]], %[stride]\n" in Pack()
7967 "pld [%[in]]\n" in Pack()
7968 "vtrn.16 d0, d2\n" in Pack()
7969 "vtrn.16 d1, d3\n" in Pack()
7970 "vtrn.8 d0, d1\n" in Pack()
7971 "vtrn.8 d2, d3\n" in Pack()
7972 "vaddw.u8 q8, q8, d0\n" in Pack()
7973 "vaddw.u8 q9, q9, d1\n" in Pack()
7974 "vaddw.u8 q10, q10, d2\n" in Pack()
7975 "vaddw.u8 q11, q11, d3\n" in Pack()
7976 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
7978 "bne 1b\n" in Pack()
7983 "vmov.i8 d0, #0\n" in Pack()
7984 "vmov.i8 d1, #0\n" in Pack()
7985 "vmov.i8 d2, #0\n" in Pack()
7986 "vmov.i8 d3, #0\n" in Pack()
7987 "vld1.32 {d0[0]}, [%[in]], %[stride]\n" in Pack()
7988 "vld1.32 {d1[0]}, [%[in]], %[stride]\n" in Pack()
7989 "vld1.32 {d2[0]}, [%[in]], %[stride]\n" in Pack()
7990 "vld1.32 {d3[0]}, [%[in]], %[stride]\n" in Pack()
7991 "vld1.32 {d0[1]}, [%[in]], %[stride]\n" in Pack()
7992 "pld [%[in]]\n" in Pack()
7993 "vtrn.16 d0, d2\n" in Pack()
7994 "vtrn.16 d1, d3\n" in Pack()
7995 "vtrn.8 d0, d1\n" in Pack()
7996 "vtrn.8 d2, d3\n" in Pack()
7997 "vaddw.u8 q8, q8, d0\n" in Pack()
7998 "vaddw.u8 q9, q9, d1\n" in Pack()
7999 "vaddw.u8 q10, q10, d2\n" in Pack()
8000 "vaddw.u8 q11, q11, d3\n" in Pack()
8001 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
8004 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
8005 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
8006 "vpaddl.u16 q8, q8\n" in Pack()
8007 "vpaddl.u16 q9, q9\n" in Pack()
8008 "vpaddl.u16 q10, q10\n" in Pack()
8009 "vpaddl.u16 q11, q11\n" in Pack()
8010 "vpadd.u32 d16, d16, d17\n" in Pack()
8011 "vpadd.u32 d18, d18, d19\n" in Pack()
8012 "vpadd.u32 d20, d20, d21\n" in Pack()
8013 "vpadd.u32 d22, d22, d23\n" in Pack()
8014 "vpadd.u32 d16, d16, d18\n" in Pack()
8015 "vpadd.u32 d17, d20, d22\n" in Pack()
8016 "vmul.i32 q8, q8, d0[0]\n" in Pack()
8017 "vadd.i32 q8, q8, q1\n" in Pack()
8018 "vst1.32 {d16, d17}, [%[out]:128]\n" in Pack()
8042 "vmov.i16 q8, #0\n" in Pack()
8043 "vmov.i16 q9, #0\n" in Pack()
8044 "vmov.i16 q10, #0\n" in Pack()
8045 "vmov.i16 q11, #0\n" in Pack()
8048 "subs %[count], %[count], #6\n" in Pack()
8049 "beq 2f\n" in Pack()
8052 "subs %[count], %[count], #8\n" in Pack()
8055 "vld1.32 {d0[0]}, [%[in]], %[stride]\n" in Pack()
8056 "vld1.32 {d1[0]}, [%[in]], %[stride]\n" in Pack()
8057 "vld1.32 {d2[0]}, [%[in]], %[stride]\n" in Pack()
8058 "vld1.32 {d3[0]}, [%[in]], %[stride]\n" in Pack()
8059 "vld1.32 {d0[1]}, [%[in]], %[stride]\n" in Pack()
8060 "vld1.32 {d1[1]}, [%[in]], %[stride]\n" in Pack()
8061 "vld1.32 {d2[1]}, [%[in]], %[stride]\n" in Pack()
8062 "vld1.32 {d3[1]}, [%[in]], %[stride]\n" in Pack()
8063 "pld [%[in]]\n" in Pack()
8064 "vtrn.16 d0, d2\n" in Pack()
8065 "vtrn.16 d1, d3\n" in Pack()
8066 "vtrn.8 d0, d1\n" in Pack()
8067 "vtrn.8 d2, d3\n" in Pack()
8068 "vaddw.u8 q8, q8, d0\n" in Pack()
8069 "vaddw.u8 q9, q9, d1\n" in Pack()
8070 "vaddw.u8 q10, q10, d2\n" in Pack()
8071 "vaddw.u8 q11, q11, d3\n" in Pack()
8072 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
8074 "bne 1b\n" in Pack()
8079 "vmov.i8 d0, #0\n" in Pack()
8080 "vmov.i8 d1, #0\n" in Pack()
8081 "vmov.i8 d2, #0\n" in Pack()
8082 "vmov.i8 d3, #0\n" in Pack()
8083 "vld1.32 {d0[0]}, [%[in]], %[stride]\n" in Pack()
8084 "vld1.32 {d1[0]}, [%[in]], %[stride]\n" in Pack()
8085 "vld1.32 {d2[0]}, [%[in]], %[stride]\n" in Pack()
8086 "vld1.32 {d3[0]}, [%[in]], %[stride]\n" in Pack()
8087 "vld1.32 {d0[1]}, [%[in]], %[stride]\n" in Pack()
8088 "vld1.32 {d1[1]}, [%[in]], %[stride]\n" in Pack()
8089 "pld [%[in]]\n" in Pack()
8090 "vtrn.16 d0, d2\n" in Pack()
8091 "vtrn.16 d1, d3\n" in Pack()
8092 "vtrn.8 d0, d1\n" in Pack()
8093 "vtrn.8 d2, d3\n" in Pack()
8094 "vaddw.u8 q8, q8, d0\n" in Pack()
8095 "vaddw.u8 q9, q9, d1\n" in Pack()
8096 "vaddw.u8 q10, q10, d2\n" in Pack()
8097 "vaddw.u8 q11, q11, d3\n" in Pack()
8098 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
8101 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
8102 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
8103 "vpaddl.u16 q8, q8\n" in Pack()
8104 "vpaddl.u16 q9, q9\n" in Pack()
8105 "vpaddl.u16 q10, q10\n" in Pack()
8106 "vpaddl.u16 q11, q11\n" in Pack()
8107 "vpadd.u32 d16, d16, d17\n" in Pack()
8108 "vpadd.u32 d18, d18, d19\n" in Pack()
8109 "vpadd.u32 d20, d20, d21\n" in Pack()
8110 "vpadd.u32 d22, d22, d23\n" in Pack()
8111 "vpadd.u32 d16, d16, d18\n" in Pack()
8112 "vpadd.u32 d17, d20, d22\n" in Pack()
8113 "vmul.i32 q8, q8, d0[0]\n" in Pack()
8114 "vadd.i32 q8, q8, q1\n" in Pack()
8115 "vst1.32 {d16, d17}, [%[out]:128]\n" in Pack()
8139 "vmov.i16 q8, #0\n" in Pack()
8140 "vmov.i16 q9, #0\n" in Pack()
8141 "vmov.i16 q10, #0\n" in Pack()
8142 "vmov.i16 q11, #0\n" in Pack()
8145 "subs %[count], %[count], #7\n" in Pack()
8146 "beq 2f\n" in Pack()
8149 "subs %[count], %[count], #8\n" in Pack()
8152 "vld1.32 {d0[0]}, [%[in]], %[stride]\n" in Pack()
8153 "vld1.32 {d1[0]}, [%[in]], %[stride]\n" in Pack()
8154 "vld1.32 {d2[0]}, [%[in]], %[stride]\n" in Pack()
8155 "vld1.32 {d3[0]}, [%[in]], %[stride]\n" in Pack()
8156 "vld1.32 {d0[1]}, [%[in]], %[stride]\n" in Pack()
8157 "vld1.32 {d1[1]}, [%[in]], %[stride]\n" in Pack()
8158 "vld1.32 {d2[1]}, [%[in]], %[stride]\n" in Pack()
8159 "vld1.32 {d3[1]}, [%[in]], %[stride]\n" in Pack()
8160 "pld [%[in]]\n" in Pack()
8161 "vtrn.16 d0, d2\n" in Pack()
8162 "vtrn.16 d1, d3\n" in Pack()
8163 "vtrn.8 d0, d1\n" in Pack()
8164 "vtrn.8 d2, d3\n" in Pack()
8165 "vaddw.u8 q8, q8, d0\n" in Pack()
8166 "vaddw.u8 q9, q9, d1\n" in Pack()
8167 "vaddw.u8 q10, q10, d2\n" in Pack()
8168 "vaddw.u8 q11, q11, d3\n" in Pack()
8169 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
8171 "bne 1b\n" in Pack()
8176 "vmov.i8 d0, #0\n" in Pack()
8177 "vmov.i8 d1, #0\n" in Pack()
8178 "vmov.i8 d2, #0\n" in Pack()
8179 "vmov.i8 d3, #0\n" in Pack()
8180 "vld1.32 {d0[0]}, [%[in]], %[stride]\n" in Pack()
8181 "vld1.32 {d1[0]}, [%[in]], %[stride]\n" in Pack()
8182 "vld1.32 {d2[0]}, [%[in]], %[stride]\n" in Pack()
8183 "vld1.32 {d3[0]}, [%[in]], %[stride]\n" in Pack()
8184 "vld1.32 {d0[1]}, [%[in]], %[stride]\n" in Pack()
8185 "vld1.32 {d1[1]}, [%[in]], %[stride]\n" in Pack()
8186 "vld1.32 {d2[1]}, [%[in]], %[stride]\n" in Pack()
8187 "pld [%[in]]\n" in Pack()
8188 "vtrn.16 d0, d2\n" in Pack()
8189 "vtrn.16 d1, d3\n" in Pack()
8190 "vtrn.8 d0, d1\n" in Pack()
8191 "vtrn.8 d2, d3\n" in Pack()
8192 "vaddw.u8 q8, q8, d0\n" in Pack()
8193 "vaddw.u8 q9, q9, d1\n" in Pack()
8194 "vaddw.u8 q10, q10, d2\n" in Pack()
8195 "vaddw.u8 q11, q11, d3\n" in Pack()
8196 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
8199 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
8200 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
8201 "vpaddl.u16 q8, q8\n" in Pack()
8202 "vpaddl.u16 q9, q9\n" in Pack()
8203 "vpaddl.u16 q10, q10\n" in Pack()
8204 "vpaddl.u16 q11, q11\n" in Pack()
8205 "vpadd.u32 d16, d16, d17\n" in Pack()
8206 "vpadd.u32 d18, d18, d19\n" in Pack()
8207 "vpadd.u32 d20, d20, d21\n" in Pack()
8208 "vpadd.u32 d22, d22, d23\n" in Pack()
8209 "vpadd.u32 d16, d16, d18\n" in Pack()
8210 "vpadd.u32 d17, d20, d22\n" in Pack()
8211 "vmul.i32 q8, q8, d0[0]\n" in Pack()
8212 "vadd.i32 q8, q8, q1\n" in Pack()
8213 "vst1.32 {d16, d17}, [%[out]:128]\n" in Pack()
8237 "sub %[stride], %[stride], #4\n" in Pack()
8238 "vmov.i16 q8, #0\n" in Pack()
8239 "vmov.i16 q9, #0\n" in Pack()
8240 "vmov.i16 q10, #0\n" in Pack()
8241 "vmov.i16 q11, #0\n" in Pack()
8242 "vmov.i16 q12, #0\n" in Pack()
8245 "subs %[count], %[count], #8\n" in Pack()
8248 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
8249 "vld1.8 {d4[0]}, [%[in]], %[stride]\n" in Pack()
8250 "vld1.32 {d1[0]}, [%[in]]!\n" in Pack()
8251 "vld1.8 {d4[1]}, [%[in]], %[stride]\n" in Pack()
8252 "vld1.32 {d2[0]}, [%[in]]!\n" in Pack()
8253 "vld1.8 {d4[2]}, [%[in]], %[stride]\n" in Pack()
8254 "vld1.32 {d3[0]}, [%[in]]!\n" in Pack()
8255 "vld1.8 {d4[3]}, [%[in]], %[stride]\n" in Pack()
8256 "vld1.32 {d0[1]}, [%[in]]!\n" in Pack()
8257 "vld1.8 {d4[4]}, [%[in]], %[stride]\n" in Pack()
8258 "vld1.32 {d1[1]}, [%[in]]!\n" in Pack()
8259 "vld1.8 {d4[5]}, [%[in]], %[stride]\n" in Pack()
8260 "vld1.32 {d2[1]}, [%[in]]!\n" in Pack()
8261 "vld1.8 {d4[6]}, [%[in]], %[stride]\n" in Pack()
8262 "vld1.32 {d3[1]}, [%[in]]!\n" in Pack()
8263 "vld1.8 {d4[7]}, [%[in]], %[stride]\n" in Pack()
8264 "pld [%[in]]\n" in Pack()
8265 "vtrn.16 d0, d2\n" in Pack()
8266 "vtrn.16 d1, d3\n" in Pack()
8267 "vtrn.8 d0, d1\n" in Pack()
8268 "vtrn.8 d2, d3\n" in Pack()
8269 "vaddw.u8 q8, q8, d0\n" in Pack()
8270 "vaddw.u8 q9, q9, d1\n" in Pack()
8271 "vaddw.u8 q10, q10, d2\n" in Pack()
8272 "vaddw.u8 q11, q11, d3\n" in Pack()
8273 "vaddw.u8 q12, q12, d4\n" in Pack()
8274 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
8275 "vst1.32 {d4}, [%[out]:64]!\n" in Pack()
8277 "bne 1b\n" in Pack()
8280 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
8281 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
8282 "vpaddl.u16 q8, q8\n" in Pack()
8283 "vpaddl.u16 q9, q9\n" in Pack()
8284 "vpaddl.u16 q10, q10\n" in Pack()
8285 "vpaddl.u16 q11, q11\n" in Pack()
8286 "vpaddl.u16 q12, q12\n" in Pack()
8287 "vpadd.u32 d16, d16, d17\n" in Pack()
8288 "vpadd.u32 d18, d18, d19\n" in Pack()
8289 "vpadd.u32 d20, d20, d21\n" in Pack()
8290 "vpadd.u32 d22, d22, d23\n" in Pack()
8291 "vpadd.u32 d24, d24, d25\n" in Pack()
8292 "vpadd.u32 d16, d16, d18\n" in Pack()
8293 "vpadd.u32 d17, d20, d22\n" in Pack()
8294 "vpadd.u32 d18, d24, d24\n" in Pack()
8295 "vmul.i32 q8, q8, d0[0]\n" in Pack()
8296 "vmul.i32 q9, q9, d0[0]\n" in Pack()
8297 "vadd.i32 q8, q8, q1\n" in Pack()
8298 "vadd.i32 q9, q9, q1\n" in Pack()
8299 "vst1.32 {d16, d17, d18, d19}, [%[out]:64]\n" in Pack()
8323 "sub %[stride], %[stride], #4\n" in Pack()
8324 "vmov.i16 q8, #0\n" in Pack()
8325 "vmov.i16 q9, #0\n" in Pack()
8326 "vmov.i16 q10, #0\n" in Pack()
8327 "vmov.i16 q11, #0\n" in Pack()
8328 "vmov.i16 q12, #0\n" in Pack()
8331 "subs %[count], %[count], #1\n" in Pack()
8332 "beq 2f\n" in Pack()
8335 "subs %[count], %[count], #8\n" in Pack()
8338 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
8339 "vld1.8 {d4[0]}, [%[in]], %[stride]\n" in Pack()
8340 "vld1.32 {d1[0]}, [%[in]]!\n" in Pack()
8341 "vld1.8 {d4[1]}, [%[in]], %[stride]\n" in Pack()
8342 "vld1.32 {d2[0]}, [%[in]]!\n" in Pack()
8343 "vld1.8 {d4[2]}, [%[in]], %[stride]\n" in Pack()
8344 "vld1.32 {d3[0]}, [%[in]]!\n" in Pack()
8345 "vld1.8 {d4[3]}, [%[in]], %[stride]\n" in Pack()
8346 "vld1.32 {d0[1]}, [%[in]]!\n" in Pack()
8347 "vld1.8 {d4[4]}, [%[in]], %[stride]\n" in Pack()
8348 "vld1.32 {d1[1]}, [%[in]]!\n" in Pack()
8349 "vld1.8 {d4[5]}, [%[in]], %[stride]\n" in Pack()
8350 "vld1.32 {d2[1]}, [%[in]]!\n" in Pack()
8351 "vld1.8 {d4[6]}, [%[in]], %[stride]\n" in Pack()
8352 "vld1.32 {d3[1]}, [%[in]]!\n" in Pack()
8353 "vld1.8 {d4[7]}, [%[in]], %[stride]\n" in Pack()
8354 "pld [%[in]]\n" in Pack()
8355 "vtrn.16 d0, d2\n" in Pack()
8356 "vtrn.16 d1, d3\n" in Pack()
8357 "vtrn.8 d0, d1\n" in Pack()
8358 "vtrn.8 d2, d3\n" in Pack()
8359 "vaddw.u8 q8, q8, d0\n" in Pack()
8360 "vaddw.u8 q9, q9, d1\n" in Pack()
8361 "vaddw.u8 q10, q10, d2\n" in Pack()
8362 "vaddw.u8 q11, q11, d3\n" in Pack()
8363 "vaddw.u8 q12, q12, d4\n" in Pack()
8364 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
8365 "vst1.32 {d4}, [%[out]:64]!\n" in Pack()
8367 "bne 1b\n" in Pack()
8372 "vmov.i8 d0, #0\n" in Pack()
8373 "vmov.i8 d1, #0\n" in Pack()
8374 "vmov.i8 d2, #0\n" in Pack()
8375 "vmov.i8 d3, #0\n" in Pack()
8376 "vmov.i8 d4, #0\n" in Pack()
8377 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
8378 "vld1.8 {d4[0]}, [%[in]], %[stride]\n" in Pack()
8379 "pld [%[in]]\n" in Pack()
8380 "vtrn.16 d0, d2\n" in Pack()
8381 "vtrn.16 d1, d3\n" in Pack()
8382 "vtrn.8 d0, d1\n" in Pack()
8383 "vtrn.8 d2, d3\n" in Pack()
8384 "vaddw.u8 q8, q8, d0\n" in Pack()
8385 "vaddw.u8 q9, q9, d1\n" in Pack()
8386 "vaddw.u8 q10, q10, d2\n" in Pack()
8387 "vaddw.u8 q11, q11, d3\n" in Pack()
8388 "vaddw.u8 q12, q12, d4\n" in Pack()
8389 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
8390 "vst1.32 {d4}, [%[out]:64]!\n" in Pack()
8393 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
8394 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
8395 "vpaddl.u16 q8, q8\n" in Pack()
8396 "vpaddl.u16 q9, q9\n" in Pack()
8397 "vpaddl.u16 q10, q10\n" in Pack()
8398 "vpaddl.u16 q11, q11\n" in Pack()
8399 "vpaddl.u16 q12, q12\n" in Pack()
8400 "vpadd.u32 d16, d16, d17\n" in Pack()
8401 "vpadd.u32 d18, d18, d19\n" in Pack()
8402 "vpadd.u32 d20, d20, d21\n" in Pack()
8403 "vpadd.u32 d22, d22, d23\n" in Pack()
8404 "vpadd.u32 d24, d24, d25\n" in Pack()
8405 "vpadd.u32 d16, d16, d18\n" in Pack()
8406 "vpadd.u32 d17, d20, d22\n" in Pack()
8407 "vpadd.u32 d18, d24, d24\n" in Pack()
8408 "vmul.i32 q8, q8, d0[0]\n" in Pack()
8409 "vmul.i32 q9, q9, d0[0]\n" in Pack()
8410 "vadd.i32 q8, q8, q1\n" in Pack()
8411 "vadd.i32 q9, q9, q1\n" in Pack()
8412 "vst1.32 {d16, d17, d18, d19}, [%[out]:64]\n" in Pack()
8436 "sub %[stride], %[stride], #4\n" in Pack()
8437 "vmov.i16 q8, #0\n" in Pack()
8438 "vmov.i16 q9, #0\n" in Pack()
8439 "vmov.i16 q10, #0\n" in Pack()
8440 "vmov.i16 q11, #0\n" in Pack()
8441 "vmov.i16 q12, #0\n" in Pack()
8444 "subs %[count], %[count], #2\n" in Pack()
8445 "beq 2f\n" in Pack()
8448 "subs %[count], %[count], #8\n" in Pack()
8451 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
8452 "vld1.8 {d4[0]}, [%[in]], %[stride]\n" in Pack()
8453 "vld1.32 {d1[0]}, [%[in]]!\n" in Pack()
8454 "vld1.8 {d4[1]}, [%[in]], %[stride]\n" in Pack()
8455 "vld1.32 {d2[0]}, [%[in]]!\n" in Pack()
8456 "vld1.8 {d4[2]}, [%[in]], %[stride]\n" in Pack()
8457 "vld1.32 {d3[0]}, [%[in]]!\n" in Pack()
8458 "vld1.8 {d4[3]}, [%[in]], %[stride]\n" in Pack()
8459 "vld1.32 {d0[1]}, [%[in]]!\n" in Pack()
8460 "vld1.8 {d4[4]}, [%[in]], %[stride]\n" in Pack()
8461 "vld1.32 {d1[1]}, [%[in]]!\n" in Pack()
8462 "vld1.8 {d4[5]}, [%[in]], %[stride]\n" in Pack()
8463 "vld1.32 {d2[1]}, [%[in]]!\n" in Pack()
8464 "vld1.8 {d4[6]}, [%[in]], %[stride]\n" in Pack()
8465 "vld1.32 {d3[1]}, [%[in]]!\n" in Pack()
8466 "vld1.8 {d4[7]}, [%[in]], %[stride]\n" in Pack()
8467 "pld [%[in]]\n" in Pack()
8468 "vtrn.16 d0, d2\n" in Pack()
8469 "vtrn.16 d1, d3\n" in Pack()
8470 "vtrn.8 d0, d1\n" in Pack()
8471 "vtrn.8 d2, d3\n" in Pack()
8472 "vaddw.u8 q8, q8, d0\n" in Pack()
8473 "vaddw.u8 q9, q9, d1\n" in Pack()
8474 "vaddw.u8 q10, q10, d2\n" in Pack()
8475 "vaddw.u8 q11, q11, d3\n" in Pack()
8476 "vaddw.u8 q12, q12, d4\n" in Pack()
8477 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
8478 "vst1.32 {d4}, [%[out]:64]!\n" in Pack()
8480 "bne 1b\n" in Pack()
8485 "vmov.i8 d0, #0\n" in Pack()
8486 "vmov.i8 d1, #0\n" in Pack()
8487 "vmov.i8 d2, #0\n" in Pack()
8488 "vmov.i8 d3, #0\n" in Pack()
8489 "vmov.i8 d4, #0\n" in Pack()
8490 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
8491 "vld1.8 {d4[0]}, [%[in]], %[stride]\n" in Pack()
8492 "vld1.32 {d1[0]}, [%[in]]!\n" in Pack()
8493 "vld1.8 {d4[1]}, [%[in]], %[stride]\n" in Pack()
8494 "pld [%[in]]\n" in Pack()
8495 "vtrn.16 d0, d2\n" in Pack()
8496 "vtrn.16 d1, d3\n" in Pack()
8497 "vtrn.8 d0, d1\n" in Pack()
8498 "vtrn.8 d2, d3\n" in Pack()
8499 "vaddw.u8 q8, q8, d0\n" in Pack()
8500 "vaddw.u8 q9, q9, d1\n" in Pack()
8501 "vaddw.u8 q10, q10, d2\n" in Pack()
8502 "vaddw.u8 q11, q11, d3\n" in Pack()
8503 "vaddw.u8 q12, q12, d4\n" in Pack()
8504 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
8505 "vst1.32 {d4}, [%[out]:64]!\n" in Pack()
8508 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
8509 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
8510 "vpaddl.u16 q8, q8\n" in Pack()
8511 "vpaddl.u16 q9, q9\n" in Pack()
8512 "vpaddl.u16 q10, q10\n" in Pack()
8513 "vpaddl.u16 q11, q11\n" in Pack()
8514 "vpaddl.u16 q12, q12\n" in Pack()
8515 "vpadd.u32 d16, d16, d17\n" in Pack()
8516 "vpadd.u32 d18, d18, d19\n" in Pack()
8517 "vpadd.u32 d20, d20, d21\n" in Pack()
8518 "vpadd.u32 d22, d22, d23\n" in Pack()
8519 "vpadd.u32 d24, d24, d25\n" in Pack()
8520 "vpadd.u32 d16, d16, d18\n" in Pack()
8521 "vpadd.u32 d17, d20, d22\n" in Pack()
8522 "vpadd.u32 d18, d24, d24\n" in Pack()
8523 "vmul.i32 q8, q8, d0[0]\n" in Pack()
8524 "vmul.i32 q9, q9, d0[0]\n" in Pack()
8525 "vadd.i32 q8, q8, q1\n" in Pack()
8526 "vadd.i32 q9, q9, q1\n" in Pack()
8527 "vst1.32 {d16, d17, d18, d19}, [%[out]:64]\n" in Pack()
8551 "sub %[stride], %[stride], #4\n" in Pack()
8552 "vmov.i16 q8, #0\n" in Pack()
8553 "vmov.i16 q9, #0\n" in Pack()
8554 "vmov.i16 q10, #0\n" in Pack()
8555 "vmov.i16 q11, #0\n" in Pack()
8556 "vmov.i16 q12, #0\n" in Pack()
8559 "subs %[count], %[count], #3\n" in Pack()
8560 "beq 2f\n" in Pack()
8563 "subs %[count], %[count], #8\n" in Pack()
8566 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
8567 "vld1.8 {d4[0]}, [%[in]], %[stride]\n" in Pack()
8568 "vld1.32 {d1[0]}, [%[in]]!\n" in Pack()
8569 "vld1.8 {d4[1]}, [%[in]], %[stride]\n" in Pack()
8570 "vld1.32 {d2[0]}, [%[in]]!\n" in Pack()
8571 "vld1.8 {d4[2]}, [%[in]], %[stride]\n" in Pack()
8572 "vld1.32 {d3[0]}, [%[in]]!\n" in Pack()
8573 "vld1.8 {d4[3]}, [%[in]], %[stride]\n" in Pack()
8574 "vld1.32 {d0[1]}, [%[in]]!\n" in Pack()
8575 "vld1.8 {d4[4]}, [%[in]], %[stride]\n" in Pack()
8576 "vld1.32 {d1[1]}, [%[in]]!\n" in Pack()
8577 "vld1.8 {d4[5]}, [%[in]], %[stride]\n" in Pack()
8578 "vld1.32 {d2[1]}, [%[in]]!\n" in Pack()
8579 "vld1.8 {d4[6]}, [%[in]], %[stride]\n" in Pack()
8580 "vld1.32 {d3[1]}, [%[in]]!\n" in Pack()
8581 "vld1.8 {d4[7]}, [%[in]], %[stride]\n" in Pack()
8582 "pld [%[in]]\n" in Pack()
8583 "vtrn.16 d0, d2\n" in Pack()
8584 "vtrn.16 d1, d3\n" in Pack()
8585 "vtrn.8 d0, d1\n" in Pack()
8586 "vtrn.8 d2, d3\n" in Pack()
8587 "vaddw.u8 q8, q8, d0\n" in Pack()
8588 "vaddw.u8 q9, q9, d1\n" in Pack()
8589 "vaddw.u8 q10, q10, d2\n" in Pack()
8590 "vaddw.u8 q11, q11, d3\n" in Pack()
8591 "vaddw.u8 q12, q12, d4\n" in Pack()
8592 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
8593 "vst1.32 {d4}, [%[out]:64]!\n" in Pack()
8595 "bne 1b\n" in Pack()
8600 "vmov.i8 d0, #0\n" in Pack()
8601 "vmov.i8 d1, #0\n" in Pack()
8602 "vmov.i8 d2, #0\n" in Pack()
8603 "vmov.i8 d3, #0\n" in Pack()
8604 "vmov.i8 d4, #0\n" in Pack()
8605 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
8606 "vld1.8 {d4[0]}, [%[in]], %[stride]\n" in Pack()
8607 "vld1.32 {d1[0]}, [%[in]]!\n" in Pack()
8608 "vld1.8 {d4[1]}, [%[in]], %[stride]\n" in Pack()
8609 "vld1.32 {d2[0]}, [%[in]]!\n" in Pack()
8610 "vld1.8 {d4[2]}, [%[in]], %[stride]\n" in Pack()
8611 "pld [%[in]]\n" in Pack()
8612 "vtrn.16 d0, d2\n" in Pack()
8613 "vtrn.16 d1, d3\n" in Pack()
8614 "vtrn.8 d0, d1\n" in Pack()
8615 "vtrn.8 d2, d3\n" in Pack()
8616 "vaddw.u8 q8, q8, d0\n" in Pack()
8617 "vaddw.u8 q9, q9, d1\n" in Pack()
8618 "vaddw.u8 q10, q10, d2\n" in Pack()
8619 "vaddw.u8 q11, q11, d3\n" in Pack()
8620 "vaddw.u8 q12, q12, d4\n" in Pack()
8621 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
8622 "vst1.32 {d4}, [%[out]:64]!\n" in Pack()
8625 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
8626 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
8627 "vpaddl.u16 q8, q8\n" in Pack()
8628 "vpaddl.u16 q9, q9\n" in Pack()
8629 "vpaddl.u16 q10, q10\n" in Pack()
8630 "vpaddl.u16 q11, q11\n" in Pack()
8631 "vpaddl.u16 q12, q12\n" in Pack()
8632 "vpadd.u32 d16, d16, d17\n" in Pack()
8633 "vpadd.u32 d18, d18, d19\n" in Pack()
8634 "vpadd.u32 d20, d20, d21\n" in Pack()
8635 "vpadd.u32 d22, d22, d23\n" in Pack()
8636 "vpadd.u32 d24, d24, d25\n" in Pack()
8637 "vpadd.u32 d16, d16, d18\n" in Pack()
8638 "vpadd.u32 d17, d20, d22\n" in Pack()
8639 "vpadd.u32 d18, d24, d24\n" in Pack()
8640 "vmul.i32 q8, q8, d0[0]\n" in Pack()
8641 "vmul.i32 q9, q9, d0[0]\n" in Pack()
8642 "vadd.i32 q8, q8, q1\n" in Pack()
8643 "vadd.i32 q9, q9, q1\n" in Pack()
8644 "vst1.32 {d16, d17, d18, d19}, [%[out]:64]\n" in Pack()
8668 "sub %[stride], %[stride], #4\n" in Pack()
8669 "vmov.i16 q8, #0\n" in Pack()
8670 "vmov.i16 q9, #0\n" in Pack()
8671 "vmov.i16 q10, #0\n" in Pack()
8672 "vmov.i16 q11, #0\n" in Pack()
8673 "vmov.i16 q12, #0\n" in Pack()
8676 "subs %[count], %[count], #4\n" in Pack()
8677 "beq 2f\n" in Pack()
8680 "subs %[count], %[count], #8\n" in Pack()
8683 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
8684 "vld1.8 {d4[0]}, [%[in]], %[stride]\n" in Pack()
8685 "vld1.32 {d1[0]}, [%[in]]!\n" in Pack()
8686 "vld1.8 {d4[1]}, [%[in]], %[stride]\n" in Pack()
8687 "vld1.32 {d2[0]}, [%[in]]!\n" in Pack()
8688 "vld1.8 {d4[2]}, [%[in]], %[stride]\n" in Pack()
8689 "vld1.32 {d3[0]}, [%[in]]!\n" in Pack()
8690 "vld1.8 {d4[3]}, [%[in]], %[stride]\n" in Pack()
8691 "vld1.32 {d0[1]}, [%[in]]!\n" in Pack()
8692 "vld1.8 {d4[4]}, [%[in]], %[stride]\n" in Pack()
8693 "vld1.32 {d1[1]}, [%[in]]!\n" in Pack()
8694 "vld1.8 {d4[5]}, [%[in]], %[stride]\n" in Pack()
8695 "vld1.32 {d2[1]}, [%[in]]!\n" in Pack()
8696 "vld1.8 {d4[6]}, [%[in]], %[stride]\n" in Pack()
8697 "vld1.32 {d3[1]}, [%[in]]!\n" in Pack()
8698 "vld1.8 {d4[7]}, [%[in]], %[stride]\n" in Pack()
8699 "pld [%[in]]\n" in Pack()
8700 "vtrn.16 d0, d2\n" in Pack()
8701 "vtrn.16 d1, d3\n" in Pack()
8702 "vtrn.8 d0, d1\n" in Pack()
8703 "vtrn.8 d2, d3\n" in Pack()
8704 "vaddw.u8 q8, q8, d0\n" in Pack()
8705 "vaddw.u8 q9, q9, d1\n" in Pack()
8706 "vaddw.u8 q10, q10, d2\n" in Pack()
8707 "vaddw.u8 q11, q11, d3\n" in Pack()
8708 "vaddw.u8 q12, q12, d4\n" in Pack()
8709 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
8710 "vst1.32 {d4}, [%[out]:64]!\n" in Pack()
8712 "bne 1b\n" in Pack()
8717 "vmov.i8 d0, #0\n" in Pack()
8718 "vmov.i8 d1, #0\n" in Pack()
8719 "vmov.i8 d2, #0\n" in Pack()
8720 "vmov.i8 d3, #0\n" in Pack()
8721 "vmov.i8 d4, #0\n" in Pack()
8722 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
8723 "vld1.8 {d4[0]}, [%[in]], %[stride]\n" in Pack()
8724 "vld1.32 {d1[0]}, [%[in]]!\n" in Pack()
8725 "vld1.8 {d4[1]}, [%[in]], %[stride]\n" in Pack()
8726 "vld1.32 {d2[0]}, [%[in]]!\n" in Pack()
8727 "vld1.8 {d4[2]}, [%[in]], %[stride]\n" in Pack()
8728 "vld1.32 {d3[0]}, [%[in]]!\n" in Pack()
8729 "vld1.8 {d4[3]}, [%[in]], %[stride]\n" in Pack()
8730 "pld [%[in]]\n" in Pack()
8731 "vtrn.16 d0, d2\n" in Pack()
8732 "vtrn.16 d1, d3\n" in Pack()
8733 "vtrn.8 d0, d1\n" in Pack()
8734 "vtrn.8 d2, d3\n" in Pack()
8735 "vaddw.u8 q8, q8, d0\n" in Pack()
8736 "vaddw.u8 q9, q9, d1\n" in Pack()
8737 "vaddw.u8 q10, q10, d2\n" in Pack()
8738 "vaddw.u8 q11, q11, d3\n" in Pack()
8739 "vaddw.u8 q12, q12, d4\n" in Pack()
8740 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
8741 "vst1.32 {d4}, [%[out]:64]!\n" in Pack()
8744 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
8745 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
8746 "vpaddl.u16 q8, q8\n" in Pack()
8747 "vpaddl.u16 q9, q9\n" in Pack()
8748 "vpaddl.u16 q10, q10\n" in Pack()
8749 "vpaddl.u16 q11, q11\n" in Pack()
8750 "vpaddl.u16 q12, q12\n" in Pack()
8751 "vpadd.u32 d16, d16, d17\n" in Pack()
8752 "vpadd.u32 d18, d18, d19\n" in Pack()
8753 "vpadd.u32 d20, d20, d21\n" in Pack()
8754 "vpadd.u32 d22, d22, d23\n" in Pack()
8755 "vpadd.u32 d24, d24, d25\n" in Pack()
8756 "vpadd.u32 d16, d16, d18\n" in Pack()
8757 "vpadd.u32 d17, d20, d22\n" in Pack()
8758 "vpadd.u32 d18, d24, d24\n" in Pack()
8759 "vmul.i32 q8, q8, d0[0]\n" in Pack()
8760 "vmul.i32 q9, q9, d0[0]\n" in Pack()
8761 "vadd.i32 q8, q8, q1\n" in Pack()
8762 "vadd.i32 q9, q9, q1\n" in Pack()
8763 "vst1.32 {d16, d17, d18, d19}, [%[out]:64]\n" in Pack()
8787 "sub %[stride], %[stride], #4\n" in Pack()
8788 "vmov.i16 q8, #0\n" in Pack()
8789 "vmov.i16 q9, #0\n" in Pack()
8790 "vmov.i16 q10, #0\n" in Pack()
8791 "vmov.i16 q11, #0\n" in Pack()
8792 "vmov.i16 q12, #0\n" in Pack()
8795 "subs %[count], %[count], #5\n" in Pack()
8796 "beq 2f\n" in Pack()
8799 "subs %[count], %[count], #8\n" in Pack()
8802 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
8803 "vld1.8 {d4[0]}, [%[in]], %[stride]\n" in Pack()
8804 "vld1.32 {d1[0]}, [%[in]]!\n" in Pack()
8805 "vld1.8 {d4[1]}, [%[in]], %[stride]\n" in Pack()
8806 "vld1.32 {d2[0]}, [%[in]]!\n" in Pack()
8807 "vld1.8 {d4[2]}, [%[in]], %[stride]\n" in Pack()
8808 "vld1.32 {d3[0]}, [%[in]]!\n" in Pack()
8809 "vld1.8 {d4[3]}, [%[in]], %[stride]\n" in Pack()
8810 "vld1.32 {d0[1]}, [%[in]]!\n" in Pack()
8811 "vld1.8 {d4[4]}, [%[in]], %[stride]\n" in Pack()
8812 "vld1.32 {d1[1]}, [%[in]]!\n" in Pack()
8813 "vld1.8 {d4[5]}, [%[in]], %[stride]\n" in Pack()
8814 "vld1.32 {d2[1]}, [%[in]]!\n" in Pack()
8815 "vld1.8 {d4[6]}, [%[in]], %[stride]\n" in Pack()
8816 "vld1.32 {d3[1]}, [%[in]]!\n" in Pack()
8817 "vld1.8 {d4[7]}, [%[in]], %[stride]\n" in Pack()
8818 "pld [%[in]]\n" in Pack()
8819 "vtrn.16 d0, d2\n" in Pack()
8820 "vtrn.16 d1, d3\n" in Pack()
8821 "vtrn.8 d0, d1\n" in Pack()
8822 "vtrn.8 d2, d3\n" in Pack()
8823 "vaddw.u8 q8, q8, d0\n" in Pack()
8824 "vaddw.u8 q9, q9, d1\n" in Pack()
8825 "vaddw.u8 q10, q10, d2\n" in Pack()
8826 "vaddw.u8 q11, q11, d3\n" in Pack()
8827 "vaddw.u8 q12, q12, d4\n" in Pack()
8828 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
8829 "vst1.32 {d4}, [%[out]:64]!\n" in Pack()
8831 "bne 1b\n" in Pack()
8836 "vmov.i8 d0, #0\n" in Pack()
8837 "vmov.i8 d1, #0\n" in Pack()
8838 "vmov.i8 d2, #0\n" in Pack()
8839 "vmov.i8 d3, #0\n" in Pack()
8840 "vmov.i8 d4, #0\n" in Pack()
8841 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
8842 "vld1.8 {d4[0]}, [%[in]], %[stride]\n" in Pack()
8843 "vld1.32 {d1[0]}, [%[in]]!\n" in Pack()
8844 "vld1.8 {d4[1]}, [%[in]], %[stride]\n" in Pack()
8845 "vld1.32 {d2[0]}, [%[in]]!\n" in Pack()
8846 "vld1.8 {d4[2]}, [%[in]], %[stride]\n" in Pack()
8847 "vld1.32 {d3[0]}, [%[in]]!\n" in Pack()
8848 "vld1.8 {d4[3]}, [%[in]], %[stride]\n" in Pack()
8849 "vld1.32 {d0[1]}, [%[in]]!\n" in Pack()
8850 "vld1.8 {d4[4]}, [%[in]], %[stride]\n" in Pack()
8851 "pld [%[in]]\n" in Pack()
8852 "vtrn.16 d0, d2\n" in Pack()
8853 "vtrn.16 d1, d3\n" in Pack()
8854 "vtrn.8 d0, d1\n" in Pack()
8855 "vtrn.8 d2, d3\n" in Pack()
8856 "vaddw.u8 q8, q8, d0\n" in Pack()
8857 "vaddw.u8 q9, q9, d1\n" in Pack()
8858 "vaddw.u8 q10, q10, d2\n" in Pack()
8859 "vaddw.u8 q11, q11, d3\n" in Pack()
8860 "vaddw.u8 q12, q12, d4\n" in Pack()
8861 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
8862 "vst1.32 {d4}, [%[out]:64]!\n" in Pack()
8865 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
8866 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
8867 "vpaddl.u16 q8, q8\n" in Pack()
8868 "vpaddl.u16 q9, q9\n" in Pack()
8869 "vpaddl.u16 q10, q10\n" in Pack()
8870 "vpaddl.u16 q11, q11\n" in Pack()
8871 "vpaddl.u16 q12, q12\n" in Pack()
8872 "vpadd.u32 d16, d16, d17\n" in Pack()
8873 "vpadd.u32 d18, d18, d19\n" in Pack()
8874 "vpadd.u32 d20, d20, d21\n" in Pack()
8875 "vpadd.u32 d22, d22, d23\n" in Pack()
8876 "vpadd.u32 d24, d24, d25\n" in Pack()
8877 "vpadd.u32 d16, d16, d18\n" in Pack()
8878 "vpadd.u32 d17, d20, d22\n" in Pack()
8879 "vpadd.u32 d18, d24, d24\n" in Pack()
8880 "vmul.i32 q8, q8, d0[0]\n" in Pack()
8881 "vmul.i32 q9, q9, d0[0]\n" in Pack()
8882 "vadd.i32 q8, q8, q1\n" in Pack()
8883 "vadd.i32 q9, q9, q1\n" in Pack()
8884 "vst1.32 {d16, d17, d18, d19}, [%[out]:64]\n" in Pack()
8908 "sub %[stride], %[stride], #4\n" in Pack()
8909 "vmov.i16 q8, #0\n" in Pack()
8910 "vmov.i16 q9, #0\n" in Pack()
8911 "vmov.i16 q10, #0\n" in Pack()
8912 "vmov.i16 q11, #0\n" in Pack()
8913 "vmov.i16 q12, #0\n" in Pack()
8916 "subs %[count], %[count], #6\n" in Pack()
8917 "beq 2f\n" in Pack()
8920 "subs %[count], %[count], #8\n" in Pack()
8923 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
8924 "vld1.8 {d4[0]}, [%[in]], %[stride]\n" in Pack()
8925 "vld1.32 {d1[0]}, [%[in]]!\n" in Pack()
8926 "vld1.8 {d4[1]}, [%[in]], %[stride]\n" in Pack()
8927 "vld1.32 {d2[0]}, [%[in]]!\n" in Pack()
8928 "vld1.8 {d4[2]}, [%[in]], %[stride]\n" in Pack()
8929 "vld1.32 {d3[0]}, [%[in]]!\n" in Pack()
8930 "vld1.8 {d4[3]}, [%[in]], %[stride]\n" in Pack()
8931 "vld1.32 {d0[1]}, [%[in]]!\n" in Pack()
8932 "vld1.8 {d4[4]}, [%[in]], %[stride]\n" in Pack()
8933 "vld1.32 {d1[1]}, [%[in]]!\n" in Pack()
8934 "vld1.8 {d4[5]}, [%[in]], %[stride]\n" in Pack()
8935 "vld1.32 {d2[1]}, [%[in]]!\n" in Pack()
8936 "vld1.8 {d4[6]}, [%[in]], %[stride]\n" in Pack()
8937 "vld1.32 {d3[1]}, [%[in]]!\n" in Pack()
8938 "vld1.8 {d4[7]}, [%[in]], %[stride]\n" in Pack()
8939 "pld [%[in]]\n" in Pack()
8940 "vtrn.16 d0, d2\n" in Pack()
8941 "vtrn.16 d1, d3\n" in Pack()
8942 "vtrn.8 d0, d1\n" in Pack()
8943 "vtrn.8 d2, d3\n" in Pack()
8944 "vaddw.u8 q8, q8, d0\n" in Pack()
8945 "vaddw.u8 q9, q9, d1\n" in Pack()
8946 "vaddw.u8 q10, q10, d2\n" in Pack()
8947 "vaddw.u8 q11, q11, d3\n" in Pack()
8948 "vaddw.u8 q12, q12, d4\n" in Pack()
8949 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
8950 "vst1.32 {d4}, [%[out]:64]!\n" in Pack()
8952 "bne 1b\n" in Pack()
8957 "vmov.i8 d0, #0\n" in Pack()
8958 "vmov.i8 d1, #0\n" in Pack()
8959 "vmov.i8 d2, #0\n" in Pack()
8960 "vmov.i8 d3, #0\n" in Pack()
8961 "vmov.i8 d4, #0\n" in Pack()
8962 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
8963 "vld1.8 {d4[0]}, [%[in]], %[stride]\n" in Pack()
8964 "vld1.32 {d1[0]}, [%[in]]!\n" in Pack()
8965 "vld1.8 {d4[1]}, [%[in]], %[stride]\n" in Pack()
8966 "vld1.32 {d2[0]}, [%[in]]!\n" in Pack()
8967 "vld1.8 {d4[2]}, [%[in]], %[stride]\n" in Pack()
8968 "vld1.32 {d3[0]}, [%[in]]!\n" in Pack()
8969 "vld1.8 {d4[3]}, [%[in]], %[stride]\n" in Pack()
8970 "vld1.32 {d0[1]}, [%[in]]!\n" in Pack()
8971 "vld1.8 {d4[4]}, [%[in]], %[stride]\n" in Pack()
8972 "vld1.32 {d1[1]}, [%[in]]!\n" in Pack()
8973 "vld1.8 {d4[5]}, [%[in]], %[stride]\n" in Pack()
8974 "pld [%[in]]\n" in Pack()
8975 "vtrn.16 d0, d2\n" in Pack()
8976 "vtrn.16 d1, d3\n" in Pack()
8977 "vtrn.8 d0, d1\n" in Pack()
8978 "vtrn.8 d2, d3\n" in Pack()
8979 "vaddw.u8 q8, q8, d0\n" in Pack()
8980 "vaddw.u8 q9, q9, d1\n" in Pack()
8981 "vaddw.u8 q10, q10, d2\n" in Pack()
8982 "vaddw.u8 q11, q11, d3\n" in Pack()
8983 "vaddw.u8 q12, q12, d4\n" in Pack()
8984 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
8985 "vst1.32 {d4}, [%[out]:64]!\n" in Pack()
8988 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
8989 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
8990 "vpaddl.u16 q8, q8\n" in Pack()
8991 "vpaddl.u16 q9, q9\n" in Pack()
8992 "vpaddl.u16 q10, q10\n" in Pack()
8993 "vpaddl.u16 q11, q11\n" in Pack()
8994 "vpaddl.u16 q12, q12\n" in Pack()
8995 "vpadd.u32 d16, d16, d17\n" in Pack()
8996 "vpadd.u32 d18, d18, d19\n" in Pack()
8997 "vpadd.u32 d20, d20, d21\n" in Pack()
8998 "vpadd.u32 d22, d22, d23\n" in Pack()
8999 "vpadd.u32 d24, d24, d25\n" in Pack()
9000 "vpadd.u32 d16, d16, d18\n" in Pack()
9001 "vpadd.u32 d17, d20, d22\n" in Pack()
9002 "vpadd.u32 d18, d24, d24\n" in Pack()
9003 "vmul.i32 q8, q8, d0[0]\n" in Pack()
9004 "vmul.i32 q9, q9, d0[0]\n" in Pack()
9005 "vadd.i32 q8, q8, q1\n" in Pack()
9006 "vadd.i32 q9, q9, q1\n" in Pack()
9007 "vst1.32 {d16, d17, d18, d19}, [%[out]:64]\n" in Pack()
9031 "sub %[stride], %[stride], #4\n" in Pack()
9032 "vmov.i16 q8, #0\n" in Pack()
9033 "vmov.i16 q9, #0\n" in Pack()
9034 "vmov.i16 q10, #0\n" in Pack()
9035 "vmov.i16 q11, #0\n" in Pack()
9036 "vmov.i16 q12, #0\n" in Pack()
9039 "subs %[count], %[count], #7\n" in Pack()
9040 "beq 2f\n" in Pack()
9043 "subs %[count], %[count], #8\n" in Pack()
9046 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
9047 "vld1.8 {d4[0]}, [%[in]], %[stride]\n" in Pack()
9048 "vld1.32 {d1[0]}, [%[in]]!\n" in Pack()
9049 "vld1.8 {d4[1]}, [%[in]], %[stride]\n" in Pack()
9050 "vld1.32 {d2[0]}, [%[in]]!\n" in Pack()
9051 "vld1.8 {d4[2]}, [%[in]], %[stride]\n" in Pack()
9052 "vld1.32 {d3[0]}, [%[in]]!\n" in Pack()
9053 "vld1.8 {d4[3]}, [%[in]], %[stride]\n" in Pack()
9054 "vld1.32 {d0[1]}, [%[in]]!\n" in Pack()
9055 "vld1.8 {d4[4]}, [%[in]], %[stride]\n" in Pack()
9056 "vld1.32 {d1[1]}, [%[in]]!\n" in Pack()
9057 "vld1.8 {d4[5]}, [%[in]], %[stride]\n" in Pack()
9058 "vld1.32 {d2[1]}, [%[in]]!\n" in Pack()
9059 "vld1.8 {d4[6]}, [%[in]], %[stride]\n" in Pack()
9060 "vld1.32 {d3[1]}, [%[in]]!\n" in Pack()
9061 "vld1.8 {d4[7]}, [%[in]], %[stride]\n" in Pack()
9062 "pld [%[in]]\n" in Pack()
9063 "vtrn.16 d0, d2\n" in Pack()
9064 "vtrn.16 d1, d3\n" in Pack()
9065 "vtrn.8 d0, d1\n" in Pack()
9066 "vtrn.8 d2, d3\n" in Pack()
9067 "vaddw.u8 q8, q8, d0\n" in Pack()
9068 "vaddw.u8 q9, q9, d1\n" in Pack()
9069 "vaddw.u8 q10, q10, d2\n" in Pack()
9070 "vaddw.u8 q11, q11, d3\n" in Pack()
9071 "vaddw.u8 q12, q12, d4\n" in Pack()
9072 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
9073 "vst1.32 {d4}, [%[out]:64]!\n" in Pack()
9075 "bne 1b\n" in Pack()
9080 "vmov.i8 d0, #0\n" in Pack()
9081 "vmov.i8 d1, #0\n" in Pack()
9082 "vmov.i8 d2, #0\n" in Pack()
9083 "vmov.i8 d3, #0\n" in Pack()
9084 "vmov.i8 d4, #0\n" in Pack()
9085 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
9086 "vld1.8 {d4[0]}, [%[in]], %[stride]\n" in Pack()
9087 "vld1.32 {d1[0]}, [%[in]]!\n" in Pack()
9088 "vld1.8 {d4[1]}, [%[in]], %[stride]\n" in Pack()
9089 "vld1.32 {d2[0]}, [%[in]]!\n" in Pack()
9090 "vld1.8 {d4[2]}, [%[in]], %[stride]\n" in Pack()
9091 "vld1.32 {d3[0]}, [%[in]]!\n" in Pack()
9092 "vld1.8 {d4[3]}, [%[in]], %[stride]\n" in Pack()
9093 "vld1.32 {d0[1]}, [%[in]]!\n" in Pack()
9094 "vld1.8 {d4[4]}, [%[in]], %[stride]\n" in Pack()
9095 "vld1.32 {d1[1]}, [%[in]]!\n" in Pack()
9096 "vld1.8 {d4[5]}, [%[in]], %[stride]\n" in Pack()
9097 "vld1.32 {d2[1]}, [%[in]]!\n" in Pack()
9098 "vld1.8 {d4[6]}, [%[in]], %[stride]\n" in Pack()
9099 "pld [%[in]]\n" in Pack()
9100 "vtrn.16 d0, d2\n" in Pack()
9101 "vtrn.16 d1, d3\n" in Pack()
9102 "vtrn.8 d0, d1\n" in Pack()
9103 "vtrn.8 d2, d3\n" in Pack()
9104 "vaddw.u8 q8, q8, d0\n" in Pack()
9105 "vaddw.u8 q9, q9, d1\n" in Pack()
9106 "vaddw.u8 q10, q10, d2\n" in Pack()
9107 "vaddw.u8 q11, q11, d3\n" in Pack()
9108 "vaddw.u8 q12, q12, d4\n" in Pack()
9109 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
9110 "vst1.32 {d4}, [%[out]:64]!\n" in Pack()
9113 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
9114 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
9115 "vpaddl.u16 q8, q8\n" in Pack()
9116 "vpaddl.u16 q9, q9\n" in Pack()
9117 "vpaddl.u16 q10, q10\n" in Pack()
9118 "vpaddl.u16 q11, q11\n" in Pack()
9119 "vpaddl.u16 q12, q12\n" in Pack()
9120 "vpadd.u32 d16, d16, d17\n" in Pack()
9121 "vpadd.u32 d18, d18, d19\n" in Pack()
9122 "vpadd.u32 d20, d20, d21\n" in Pack()
9123 "vpadd.u32 d22, d22, d23\n" in Pack()
9124 "vpadd.u32 d24, d24, d25\n" in Pack()
9125 "vpadd.u32 d16, d16, d18\n" in Pack()
9126 "vpadd.u32 d17, d20, d22\n" in Pack()
9127 "vpadd.u32 d18, d24, d24\n" in Pack()
9128 "vmul.i32 q8, q8, d0[0]\n" in Pack()
9129 "vmul.i32 q9, q9, d0[0]\n" in Pack()
9130 "vadd.i32 q8, q8, q1\n" in Pack()
9131 "vadd.i32 q9, q9, q1\n" in Pack()
9132 "vst1.32 {d16, d17, d18, d19}, [%[out]:64]\n" in Pack()
9156 "sub %[stride], %[stride], #4\n" in Pack()
9157 "vmov.i16 q8, #0\n" in Pack()
9158 "vmov.i16 q9, #0\n" in Pack()
9159 "vmov.i16 q10, #0\n" in Pack()
9160 "vmov.i16 q11, #0\n" in Pack()
9161 "vmov.i16 q12, #0\n" in Pack()
9162 "vmov.i16 q13, #0\n" in Pack()
9165 "subs %[count], %[count], #8\n" in Pack()
9168 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
9169 "vld1.16 {d4[0]}, [%[in]], %[stride]\n" in Pack()
9170 "vld1.32 {d1[0]}, [%[in]]!\n" in Pack()
9171 "vld1.16 {d4[1]}, [%[in]], %[stride]\n" in Pack()
9172 "vld1.32 {d2[0]}, [%[in]]!\n" in Pack()
9173 "vld1.16 {d4[2]}, [%[in]], %[stride]\n" in Pack()
9174 "vld1.32 {d3[0]}, [%[in]]!\n" in Pack()
9175 "vld1.16 {d4[3]}, [%[in]], %[stride]\n" in Pack()
9176 "vld1.32 {d0[1]}, [%[in]]!\n" in Pack()
9177 "vld1.16 {d5[0]}, [%[in]], %[stride]\n" in Pack()
9178 "vld1.32 {d1[1]}, [%[in]]!\n" in Pack()
9179 "vld1.16 {d5[1]}, [%[in]], %[stride]\n" in Pack()
9180 "vld1.32 {d2[1]}, [%[in]]!\n" in Pack()
9181 "vld1.16 {d5[2]}, [%[in]], %[stride]\n" in Pack()
9182 "vld1.32 {d3[1]}, [%[in]]!\n" in Pack()
9183 "vld1.16 {d5[3]}, [%[in]], %[stride]\n" in Pack()
9184 "pld [%[in]]\n" in Pack()
9185 "vtrn.16 d0, d2\n" in Pack()
9186 "vtrn.16 d1, d3\n" in Pack()
9187 "vuzp.8 d4, d5\n" in Pack()
9188 "vtrn.8 d0, d1\n" in Pack()
9189 "vtrn.8 d2, d3\n" in Pack()
9190 "vaddw.u8 q8, q8, d0\n" in Pack()
9191 "vaddw.u8 q9, q9, d1\n" in Pack()
9192 "vaddw.u8 q10, q10, d2\n" in Pack()
9193 "vaddw.u8 q11, q11, d3\n" in Pack()
9194 "vaddw.u8 q12, q12, d4\n" in Pack()
9195 "vaddw.u8 q13, q13, d5\n" in Pack()
9196 "vst1.32 {d0, d1, d2, d3}, [%[out]:128]!\n" in Pack()
9197 "vst1.32 {d4, d5}, [%[out]:128]!\n" in Pack()
9199 "bne 1b\n" in Pack()
9202 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
9203 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
9204 "vpaddl.u16 q8, q8\n" in Pack()
9205 "vpaddl.u16 q9, q9\n" in Pack()
9206 "vpaddl.u16 q10, q10\n" in Pack()
9207 "vpaddl.u16 q11, q11\n" in Pack()
9208 "vpaddl.u16 q12, q12\n" in Pack()
9209 "vpaddl.u16 q13, q13\n" in Pack()
9210 "vpadd.u32 d16, d16, d17\n" in Pack()
9211 "vpadd.u32 d18, d18, d19\n" in Pack()
9212 "vpadd.u32 d20, d20, d21\n" in Pack()
9213 "vpadd.u32 d22, d22, d23\n" in Pack()
9214 "vpadd.u32 d24, d24, d25\n" in Pack()
9215 "vpadd.u32 d26, d26, d27\n" in Pack()
9216 "vpadd.u32 d16, d16, d18\n" in Pack()
9217 "vpadd.u32 d17, d20, d22\n" in Pack()
9218 "vpadd.u32 d18, d24, d26\n" in Pack()
9219 "vmul.i32 q8, q8, d0[0]\n" in Pack()
9220 "vmul.i32 q9, q9, d0[0]\n" in Pack()
9221 "vadd.i32 q8, q8, q1\n" in Pack()
9222 "vadd.i32 q9, q9, q1\n" in Pack()
9223 "vst1.32 {d16, d17, d18, d19}, [%[out]:128]\n" in Pack()
9247 "sub %[stride], %[stride], #4\n" in Pack()
9248 "vmov.i16 q8, #0\n" in Pack()
9249 "vmov.i16 q9, #0\n" in Pack()
9250 "vmov.i16 q10, #0\n" in Pack()
9251 "vmov.i16 q11, #0\n" in Pack()
9252 "vmov.i16 q12, #0\n" in Pack()
9253 "vmov.i16 q13, #0\n" in Pack()
9256 "subs %[count], %[count], #1\n" in Pack()
9257 "beq 2f\n" in Pack()
9260 "subs %[count], %[count], #8\n" in Pack()
9263 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
9264 "vld1.16 {d4[0]}, [%[in]], %[stride]\n" in Pack()
9265 "vld1.32 {d1[0]}, [%[in]]!\n" in Pack()
9266 "vld1.16 {d4[1]}, [%[in]], %[stride]\n" in Pack()
9267 "vld1.32 {d2[0]}, [%[in]]!\n" in Pack()
9268 "vld1.16 {d4[2]}, [%[in]], %[stride]\n" in Pack()
9269 "vld1.32 {d3[0]}, [%[in]]!\n" in Pack()
9270 "vld1.16 {d4[3]}, [%[in]], %[stride]\n" in Pack()
9271 "vld1.32 {d0[1]}, [%[in]]!\n" in Pack()
9272 "vld1.16 {d5[0]}, [%[in]], %[stride]\n" in Pack()
9273 "vld1.32 {d1[1]}, [%[in]]!\n" in Pack()
9274 "vld1.16 {d5[1]}, [%[in]], %[stride]\n" in Pack()
9275 "vld1.32 {d2[1]}, [%[in]]!\n" in Pack()
9276 "vld1.16 {d5[2]}, [%[in]], %[stride]\n" in Pack()
9277 "vld1.32 {d3[1]}, [%[in]]!\n" in Pack()
9278 "vld1.16 {d5[3]}, [%[in]], %[stride]\n" in Pack()
9279 "pld [%[in]]\n" in Pack()
9280 "vtrn.16 d0, d2\n" in Pack()
9281 "vtrn.16 d1, d3\n" in Pack()
9282 "vuzp.8 d4, d5\n" in Pack()
9283 "vtrn.8 d0, d1\n" in Pack()
9284 "vtrn.8 d2, d3\n" in Pack()
9285 "vaddw.u8 q8, q8, d0\n" in Pack()
9286 "vaddw.u8 q9, q9, d1\n" in Pack()
9287 "vaddw.u8 q10, q10, d2\n" in Pack()
9288 "vaddw.u8 q11, q11, d3\n" in Pack()
9289 "vaddw.u8 q12, q12, d4\n" in Pack()
9290 "vaddw.u8 q13, q13, d5\n" in Pack()
9291 "vst1.32 {d0, d1, d2, d3}, [%[out]:128]!\n" in Pack()
9292 "vst1.32 {d4, d5}, [%[out]:128]!\n" in Pack()
9294 "bne 1b\n" in Pack()
9299 "vmov.i8 d0, #0\n" in Pack()
9300 "vmov.i8 d1, #0\n" in Pack()
9301 "vmov.i8 d2, #0\n" in Pack()
9302 "vmov.i8 d3, #0\n" in Pack()
9303 "vmov.i8 d4, #0\n" in Pack()
9304 "vmov.i8 d5, #0\n" in Pack()
9305 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
9306 "vld1.16 {d4[0]}, [%[in]], %[stride]\n" in Pack()
9307 "pld [%[in]]\n" in Pack()
9308 "vtrn.16 d0, d2\n" in Pack()
9309 "vtrn.16 d1, d3\n" in Pack()
9310 "vuzp.8 d4, d5\n" in Pack()
9311 "vtrn.8 d0, d1\n" in Pack()
9312 "vtrn.8 d2, d3\n" in Pack()
9313 "vaddw.u8 q8, q8, d0\n" in Pack()
9314 "vaddw.u8 q9, q9, d1\n" in Pack()
9315 "vaddw.u8 q10, q10, d2\n" in Pack()
9316 "vaddw.u8 q11, q11, d3\n" in Pack()
9317 "vaddw.u8 q12, q12, d4\n" in Pack()
9318 "vaddw.u8 q13, q13, d5\n" in Pack()
9319 "vst1.32 {d0, d1, d2, d3}, [%[out]:128]!\n" in Pack()
9320 "vst1.32 {d4, d5}, [%[out]:128]!\n" in Pack()
9323 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
9324 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
9325 "vpaddl.u16 q8, q8\n" in Pack()
9326 "vpaddl.u16 q9, q9\n" in Pack()
9327 "vpaddl.u16 q10, q10\n" in Pack()
9328 "vpaddl.u16 q11, q11\n" in Pack()
9329 "vpaddl.u16 q12, q12\n" in Pack()
9330 "vpaddl.u16 q13, q13\n" in Pack()
9331 "vpadd.u32 d16, d16, d17\n" in Pack()
9332 "vpadd.u32 d18, d18, d19\n" in Pack()
9333 "vpadd.u32 d20, d20, d21\n" in Pack()
9334 "vpadd.u32 d22, d22, d23\n" in Pack()
9335 "vpadd.u32 d24, d24, d25\n" in Pack()
9336 "vpadd.u32 d26, d26, d27\n" in Pack()
9337 "vpadd.u32 d16, d16, d18\n" in Pack()
9338 "vpadd.u32 d17, d20, d22\n" in Pack()
9339 "vpadd.u32 d18, d24, d26\n" in Pack()
9340 "vmul.i32 q8, q8, d0[0]\n" in Pack()
9341 "vmul.i32 q9, q9, d0[0]\n" in Pack()
9342 "vadd.i32 q8, q8, q1\n" in Pack()
9343 "vadd.i32 q9, q9, q1\n" in Pack()
9344 "vst1.32 {d16, d17, d18, d19}, [%[out]:128]\n" in Pack()
9368 "sub %[stride], %[stride], #4\n" in Pack()
9369 "vmov.i16 q8, #0\n" in Pack()
9370 "vmov.i16 q9, #0\n" in Pack()
9371 "vmov.i16 q10, #0\n" in Pack()
9372 "vmov.i16 q11, #0\n" in Pack()
9373 "vmov.i16 q12, #0\n" in Pack()
9374 "vmov.i16 q13, #0\n" in Pack()
9377 "subs %[count], %[count], #2\n" in Pack()
9378 "beq 2f\n" in Pack()
9381 "subs %[count], %[count], #8\n" in Pack()
9384 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
9385 "vld1.16 {d4[0]}, [%[in]], %[stride]\n" in Pack()
9386 "vld1.32 {d1[0]}, [%[in]]!\n" in Pack()
9387 "vld1.16 {d4[1]}, [%[in]], %[stride]\n" in Pack()
9388 "vld1.32 {d2[0]}, [%[in]]!\n" in Pack()
9389 "vld1.16 {d4[2]}, [%[in]], %[stride]\n" in Pack()
9390 "vld1.32 {d3[0]}, [%[in]]!\n" in Pack()
9391 "vld1.16 {d4[3]}, [%[in]], %[stride]\n" in Pack()
9392 "vld1.32 {d0[1]}, [%[in]]!\n" in Pack()
9393 "vld1.16 {d5[0]}, [%[in]], %[stride]\n" in Pack()
9394 "vld1.32 {d1[1]}, [%[in]]!\n" in Pack()
9395 "vld1.16 {d5[1]}, [%[in]], %[stride]\n" in Pack()
9396 "vld1.32 {d2[1]}, [%[in]]!\n" in Pack()
9397 "vld1.16 {d5[2]}, [%[in]], %[stride]\n" in Pack()
9398 "vld1.32 {d3[1]}, [%[in]]!\n" in Pack()
9399 "vld1.16 {d5[3]}, [%[in]], %[stride]\n" in Pack()
9400 "pld [%[in]]\n" in Pack()
9401 "vtrn.16 d0, d2\n" in Pack()
9402 "vtrn.16 d1, d3\n" in Pack()
9403 "vuzp.8 d4, d5\n" in Pack()
9404 "vtrn.8 d0, d1\n" in Pack()
9405 "vtrn.8 d2, d3\n" in Pack()
9406 "vaddw.u8 q8, q8, d0\n" in Pack()
9407 "vaddw.u8 q9, q9, d1\n" in Pack()
9408 "vaddw.u8 q10, q10, d2\n" in Pack()
9409 "vaddw.u8 q11, q11, d3\n" in Pack()
9410 "vaddw.u8 q12, q12, d4\n" in Pack()
9411 "vaddw.u8 q13, q13, d5\n" in Pack()
9412 "vst1.32 {d0, d1, d2, d3}, [%[out]:128]!\n" in Pack()
9413 "vst1.32 {d4, d5}, [%[out]:128]!\n" in Pack()
9415 "bne 1b\n" in Pack()
9420 "vmov.i8 d0, #0\n" in Pack()
9421 "vmov.i8 d1, #0\n" in Pack()
9422 "vmov.i8 d2, #0\n" in Pack()
9423 "vmov.i8 d3, #0\n" in Pack()
9424 "vmov.i8 d4, #0\n" in Pack()
9425 "vmov.i8 d5, #0\n" in Pack()
9426 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
9427 "vld1.16 {d4[0]}, [%[in]], %[stride]\n" in Pack()
9428 "vld1.32 {d1[0]}, [%[in]]!\n" in Pack()
9429 "vld1.16 {d4[1]}, [%[in]], %[stride]\n" in Pack()
9430 "pld [%[in]]\n" in Pack()
9431 "vtrn.16 d0, d2\n" in Pack()
9432 "vtrn.16 d1, d3\n" in Pack()
9433 "vuzp.8 d4, d5\n" in Pack()
9434 "vtrn.8 d0, d1\n" in Pack()
9435 "vtrn.8 d2, d3\n" in Pack()
9436 "vaddw.u8 q8, q8, d0\n" in Pack()
9437 "vaddw.u8 q9, q9, d1\n" in Pack()
9438 "vaddw.u8 q10, q10, d2\n" in Pack()
9439 "vaddw.u8 q11, q11, d3\n" in Pack()
9440 "vaddw.u8 q12, q12, d4\n" in Pack()
9441 "vaddw.u8 q13, q13, d5\n" in Pack()
9442 "vst1.32 {d0, d1, d2, d3}, [%[out]:128]!\n" in Pack()
9443 "vst1.32 {d4, d5}, [%[out]:128]!\n" in Pack()
9446 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
9447 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
9448 "vpaddl.u16 q8, q8\n" in Pack()
9449 "vpaddl.u16 q9, q9\n" in Pack()
9450 "vpaddl.u16 q10, q10\n" in Pack()
9451 "vpaddl.u16 q11, q11\n" in Pack()
9452 "vpaddl.u16 q12, q12\n" in Pack()
9453 "vpaddl.u16 q13, q13\n" in Pack()
9454 "vpadd.u32 d16, d16, d17\n" in Pack()
9455 "vpadd.u32 d18, d18, d19\n" in Pack()
9456 "vpadd.u32 d20, d20, d21\n" in Pack()
9457 "vpadd.u32 d22, d22, d23\n" in Pack()
9458 "vpadd.u32 d24, d24, d25\n" in Pack()
9459 "vpadd.u32 d26, d26, d27\n" in Pack()
9460 "vpadd.u32 d16, d16, d18\n" in Pack()
9461 "vpadd.u32 d17, d20, d22\n" in Pack()
9462 "vpadd.u32 d18, d24, d26\n" in Pack()
9463 "vmul.i32 q8, q8, d0[0]\n" in Pack()
9464 "vmul.i32 q9, q9, d0[0]\n" in Pack()
9465 "vadd.i32 q8, q8, q1\n" in Pack()
9466 "vadd.i32 q9, q9, q1\n" in Pack()
9467 "vst1.32 {d16, d17, d18, d19}, [%[out]:128]\n" in Pack()
9491 "sub %[stride], %[stride], #4\n" in Pack()
9492 "vmov.i16 q8, #0\n" in Pack()
9493 "vmov.i16 q9, #0\n" in Pack()
9494 "vmov.i16 q10, #0\n" in Pack()
9495 "vmov.i16 q11, #0\n" in Pack()
9496 "vmov.i16 q12, #0\n" in Pack()
9497 "vmov.i16 q13, #0\n" in Pack()
9500 "subs %[count], %[count], #3\n" in Pack()
9501 "beq 2f\n" in Pack()
9504 "subs %[count], %[count], #8\n" in Pack()
9507 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
9508 "vld1.16 {d4[0]}, [%[in]], %[stride]\n" in Pack()
9509 "vld1.32 {d1[0]}, [%[in]]!\n" in Pack()
9510 "vld1.16 {d4[1]}, [%[in]], %[stride]\n" in Pack()
9511 "vld1.32 {d2[0]}, [%[in]]!\n" in Pack()
9512 "vld1.16 {d4[2]}, [%[in]], %[stride]\n" in Pack()
9513 "vld1.32 {d3[0]}, [%[in]]!\n" in Pack()
9514 "vld1.16 {d4[3]}, [%[in]], %[stride]\n" in Pack()
9515 "vld1.32 {d0[1]}, [%[in]]!\n" in Pack()
9516 "vld1.16 {d5[0]}, [%[in]], %[stride]\n" in Pack()
9517 "vld1.32 {d1[1]}, [%[in]]!\n" in Pack()
9518 "vld1.16 {d5[1]}, [%[in]], %[stride]\n" in Pack()
9519 "vld1.32 {d2[1]}, [%[in]]!\n" in Pack()
9520 "vld1.16 {d5[2]}, [%[in]], %[stride]\n" in Pack()
9521 "vld1.32 {d3[1]}, [%[in]]!\n" in Pack()
9522 "vld1.16 {d5[3]}, [%[in]], %[stride]\n" in Pack()
9523 "pld [%[in]]\n" in Pack()
9524 "vtrn.16 d0, d2\n" in Pack()
9525 "vtrn.16 d1, d3\n" in Pack()
9526 "vuzp.8 d4, d5\n" in Pack()
9527 "vtrn.8 d0, d1\n" in Pack()
9528 "vtrn.8 d2, d3\n" in Pack()
9529 "vaddw.u8 q8, q8, d0\n" in Pack()
9530 "vaddw.u8 q9, q9, d1\n" in Pack()
9531 "vaddw.u8 q10, q10, d2\n" in Pack()
9532 "vaddw.u8 q11, q11, d3\n" in Pack()
9533 "vaddw.u8 q12, q12, d4\n" in Pack()
9534 "vaddw.u8 q13, q13, d5\n" in Pack()
9535 "vst1.32 {d0, d1, d2, d3}, [%[out]:128]!\n" in Pack()
9536 "vst1.32 {d4, d5}, [%[out]:128]!\n" in Pack()
9538 "bne 1b\n" in Pack()
9543 "vmov.i8 d0, #0\n" in Pack()
9544 "vmov.i8 d1, #0\n" in Pack()
9545 "vmov.i8 d2, #0\n" in Pack()
9546 "vmov.i8 d3, #0\n" in Pack()
9547 "vmov.i8 d4, #0\n" in Pack()
9548 "vmov.i8 d5, #0\n" in Pack()
9549 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
9550 "vld1.16 {d4[0]}, [%[in]], %[stride]\n" in Pack()
9551 "vld1.32 {d1[0]}, [%[in]]!\n" in Pack()
9552 "vld1.16 {d4[1]}, [%[in]], %[stride]\n" in Pack()
9553 "vld1.32 {d2[0]}, [%[in]]!\n" in Pack()
9554 "vld1.16 {d4[2]}, [%[in]], %[stride]\n" in Pack()
9555 "pld [%[in]]\n" in Pack()
9556 "vtrn.16 d0, d2\n" in Pack()
9557 "vtrn.16 d1, d3\n" in Pack()
9558 "vuzp.8 d4, d5\n" in Pack()
9559 "vtrn.8 d0, d1\n" in Pack()
9560 "vtrn.8 d2, d3\n" in Pack()
9561 "vaddw.u8 q8, q8, d0\n" in Pack()
9562 "vaddw.u8 q9, q9, d1\n" in Pack()
9563 "vaddw.u8 q10, q10, d2\n" in Pack()
9564 "vaddw.u8 q11, q11, d3\n" in Pack()
9565 "vaddw.u8 q12, q12, d4\n" in Pack()
9566 "vaddw.u8 q13, q13, d5\n" in Pack()
9567 "vst1.32 {d0, d1, d2, d3}, [%[out]:128]!\n" in Pack()
9568 "vst1.32 {d4, d5}, [%[out]:128]!\n" in Pack()
9571 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
9572 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
9573 "vpaddl.u16 q8, q8\n" in Pack()
9574 "vpaddl.u16 q9, q9\n" in Pack()
9575 "vpaddl.u16 q10, q10\n" in Pack()
9576 "vpaddl.u16 q11, q11\n" in Pack()
9577 "vpaddl.u16 q12, q12\n" in Pack()
9578 "vpaddl.u16 q13, q13\n" in Pack()
9579 "vpadd.u32 d16, d16, d17\n" in Pack()
9580 "vpadd.u32 d18, d18, d19\n" in Pack()
9581 "vpadd.u32 d20, d20, d21\n" in Pack()
9582 "vpadd.u32 d22, d22, d23\n" in Pack()
9583 "vpadd.u32 d24, d24, d25\n" in Pack()
9584 "vpadd.u32 d26, d26, d27\n" in Pack()
9585 "vpadd.u32 d16, d16, d18\n" in Pack()
9586 "vpadd.u32 d17, d20, d22\n" in Pack()
9587 "vpadd.u32 d18, d24, d26\n" in Pack()
9588 "vmul.i32 q8, q8, d0[0]\n" in Pack()
9589 "vmul.i32 q9, q9, d0[0]\n" in Pack()
9590 "vadd.i32 q8, q8, q1\n" in Pack()
9591 "vadd.i32 q9, q9, q1\n" in Pack()
9592 "vst1.32 {d16, d17, d18, d19}, [%[out]:128]\n" in Pack()
9616 "sub %[stride], %[stride], #4\n" in Pack()
9617 "vmov.i16 q8, #0\n" in Pack()
9618 "vmov.i16 q9, #0\n" in Pack()
9619 "vmov.i16 q10, #0\n" in Pack()
9620 "vmov.i16 q11, #0\n" in Pack()
9621 "vmov.i16 q12, #0\n" in Pack()
9622 "vmov.i16 q13, #0\n" in Pack()
9625 "subs %[count], %[count], #4\n" in Pack()
9626 "beq 2f\n" in Pack()
9629 "subs %[count], %[count], #8\n" in Pack()
9632 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
9633 "vld1.16 {d4[0]}, [%[in]], %[stride]\n" in Pack()
9634 "vld1.32 {d1[0]}, [%[in]]!\n" in Pack()
9635 "vld1.16 {d4[1]}, [%[in]], %[stride]\n" in Pack()
9636 "vld1.32 {d2[0]}, [%[in]]!\n" in Pack()
9637 "vld1.16 {d4[2]}, [%[in]], %[stride]\n" in Pack()
9638 "vld1.32 {d3[0]}, [%[in]]!\n" in Pack()
9639 "vld1.16 {d4[3]}, [%[in]], %[stride]\n" in Pack()
9640 "vld1.32 {d0[1]}, [%[in]]!\n" in Pack()
9641 "vld1.16 {d5[0]}, [%[in]], %[stride]\n" in Pack()
9642 "vld1.32 {d1[1]}, [%[in]]!\n" in Pack()
9643 "vld1.16 {d5[1]}, [%[in]], %[stride]\n" in Pack()
9644 "vld1.32 {d2[1]}, [%[in]]!\n" in Pack()
9645 "vld1.16 {d5[2]}, [%[in]], %[stride]\n" in Pack()
9646 "vld1.32 {d3[1]}, [%[in]]!\n" in Pack()
9647 "vld1.16 {d5[3]}, [%[in]], %[stride]\n" in Pack()
9648 "pld [%[in]]\n" in Pack()
9649 "vtrn.16 d0, d2\n" in Pack()
9650 "vtrn.16 d1, d3\n" in Pack()
9651 "vuzp.8 d4, d5\n" in Pack()
9652 "vtrn.8 d0, d1\n" in Pack()
9653 "vtrn.8 d2, d3\n" in Pack()
9654 "vaddw.u8 q8, q8, d0\n" in Pack()
9655 "vaddw.u8 q9, q9, d1\n" in Pack()
9656 "vaddw.u8 q10, q10, d2\n" in Pack()
9657 "vaddw.u8 q11, q11, d3\n" in Pack()
9658 "vaddw.u8 q12, q12, d4\n" in Pack()
9659 "vaddw.u8 q13, q13, d5\n" in Pack()
9660 "vst1.32 {d0, d1, d2, d3}, [%[out]:128]!\n" in Pack()
9661 "vst1.32 {d4, d5}, [%[out]:128]!\n" in Pack()
9663 "bne 1b\n" in Pack()
9668 "vmov.i8 d0, #0\n" in Pack()
9669 "vmov.i8 d1, #0\n" in Pack()
9670 "vmov.i8 d2, #0\n" in Pack()
9671 "vmov.i8 d3, #0\n" in Pack()
9672 "vmov.i8 d4, #0\n" in Pack()
9673 "vmov.i8 d5, #0\n" in Pack()
9674 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
9675 "vld1.16 {d4[0]}, [%[in]], %[stride]\n" in Pack()
9676 "vld1.32 {d1[0]}, [%[in]]!\n" in Pack()
9677 "vld1.16 {d4[1]}, [%[in]], %[stride]\n" in Pack()
9678 "vld1.32 {d2[0]}, [%[in]]!\n" in Pack()
9679 "vld1.16 {d4[2]}, [%[in]], %[stride]\n" in Pack()
9680 "vld1.32 {d3[0]}, [%[in]]!\n" in Pack()
9681 "vld1.16 {d4[3]}, [%[in]], %[stride]\n" in Pack()
9682 "pld [%[in]]\n" in Pack()
9683 "vtrn.16 d0, d2\n" in Pack()
9684 "vtrn.16 d1, d3\n" in Pack()
9685 "vuzp.8 d4, d5\n" in Pack()
9686 "vtrn.8 d0, d1\n" in Pack()
9687 "vtrn.8 d2, d3\n" in Pack()
9688 "vaddw.u8 q8, q8, d0\n" in Pack()
9689 "vaddw.u8 q9, q9, d1\n" in Pack()
9690 "vaddw.u8 q10, q10, d2\n" in Pack()
9691 "vaddw.u8 q11, q11, d3\n" in Pack()
9692 "vaddw.u8 q12, q12, d4\n" in Pack()
9693 "vaddw.u8 q13, q13, d5\n" in Pack()
9694 "vst1.32 {d0, d1, d2, d3}, [%[out]:128]!\n" in Pack()
9695 "vst1.32 {d4, d5}, [%[out]:128]!\n" in Pack()
9698 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
9699 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
9700 "vpaddl.u16 q8, q8\n" in Pack()
9701 "vpaddl.u16 q9, q9\n" in Pack()
9702 "vpaddl.u16 q10, q10\n" in Pack()
9703 "vpaddl.u16 q11, q11\n" in Pack()
9704 "vpaddl.u16 q12, q12\n" in Pack()
9705 "vpaddl.u16 q13, q13\n" in Pack()
9706 "vpadd.u32 d16, d16, d17\n" in Pack()
9707 "vpadd.u32 d18, d18, d19\n" in Pack()
9708 "vpadd.u32 d20, d20, d21\n" in Pack()
9709 "vpadd.u32 d22, d22, d23\n" in Pack()
9710 "vpadd.u32 d24, d24, d25\n" in Pack()
9711 "vpadd.u32 d26, d26, d27\n" in Pack()
9712 "vpadd.u32 d16, d16, d18\n" in Pack()
9713 "vpadd.u32 d17, d20, d22\n" in Pack()
9714 "vpadd.u32 d18, d24, d26\n" in Pack()
9715 "vmul.i32 q8, q8, d0[0]\n" in Pack()
9716 "vmul.i32 q9, q9, d0[0]\n" in Pack()
9717 "vadd.i32 q8, q8, q1\n" in Pack()
9718 "vadd.i32 q9, q9, q1\n" in Pack()
9719 "vst1.32 {d16, d17, d18, d19}, [%[out]:128]\n" in Pack()
9743 "sub %[stride], %[stride], #4\n" in Pack()
9744 "vmov.i16 q8, #0\n" in Pack()
9745 "vmov.i16 q9, #0\n" in Pack()
9746 "vmov.i16 q10, #0\n" in Pack()
9747 "vmov.i16 q11, #0\n" in Pack()
9748 "vmov.i16 q12, #0\n" in Pack()
9749 "vmov.i16 q13, #0\n" in Pack()
9752 "subs %[count], %[count], #5\n" in Pack()
9753 "beq 2f\n" in Pack()
9756 "subs %[count], %[count], #8\n" in Pack()
9759 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
9760 "vld1.16 {d4[0]}, [%[in]], %[stride]\n" in Pack()
9761 "vld1.32 {d1[0]}, [%[in]]!\n" in Pack()
9762 "vld1.16 {d4[1]}, [%[in]], %[stride]\n" in Pack()
9763 "vld1.32 {d2[0]}, [%[in]]!\n" in Pack()
9764 "vld1.16 {d4[2]}, [%[in]], %[stride]\n" in Pack()
9765 "vld1.32 {d3[0]}, [%[in]]!\n" in Pack()
9766 "vld1.16 {d4[3]}, [%[in]], %[stride]\n" in Pack()
9767 "vld1.32 {d0[1]}, [%[in]]!\n" in Pack()
9768 "vld1.16 {d5[0]}, [%[in]], %[stride]\n" in Pack()
9769 "vld1.32 {d1[1]}, [%[in]]!\n" in Pack()
9770 "vld1.16 {d5[1]}, [%[in]], %[stride]\n" in Pack()
9771 "vld1.32 {d2[1]}, [%[in]]!\n" in Pack()
9772 "vld1.16 {d5[2]}, [%[in]], %[stride]\n" in Pack()
9773 "vld1.32 {d3[1]}, [%[in]]!\n" in Pack()
9774 "vld1.16 {d5[3]}, [%[in]], %[stride]\n" in Pack()
9775 "pld [%[in]]\n" in Pack()
9776 "vtrn.16 d0, d2\n" in Pack()
9777 "vtrn.16 d1, d3\n" in Pack()
9778 "vuzp.8 d4, d5\n" in Pack()
9779 "vtrn.8 d0, d1\n" in Pack()
9780 "vtrn.8 d2, d3\n" in Pack()
9781 "vaddw.u8 q8, q8, d0\n" in Pack()
9782 "vaddw.u8 q9, q9, d1\n" in Pack()
9783 "vaddw.u8 q10, q10, d2\n" in Pack()
9784 "vaddw.u8 q11, q11, d3\n" in Pack()
9785 "vaddw.u8 q12, q12, d4\n" in Pack()
9786 "vaddw.u8 q13, q13, d5\n" in Pack()
9787 "vst1.32 {d0, d1, d2, d3}, [%[out]:128]!\n" in Pack()
9788 "vst1.32 {d4, d5}, [%[out]:128]!\n" in Pack()
9790 "bne 1b\n" in Pack()
9795 "vmov.i8 d0, #0\n" in Pack()
9796 "vmov.i8 d1, #0\n" in Pack()
9797 "vmov.i8 d2, #0\n" in Pack()
9798 "vmov.i8 d3, #0\n" in Pack()
9799 "vmov.i8 d4, #0\n" in Pack()
9800 "vmov.i8 d5, #0\n" in Pack()
9801 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
9802 "vld1.16 {d4[0]}, [%[in]], %[stride]\n" in Pack()
9803 "vld1.32 {d1[0]}, [%[in]]!\n" in Pack()
9804 "vld1.16 {d4[1]}, [%[in]], %[stride]\n" in Pack()
9805 "vld1.32 {d2[0]}, [%[in]]!\n" in Pack()
9806 "vld1.16 {d4[2]}, [%[in]], %[stride]\n" in Pack()
9807 "vld1.32 {d3[0]}, [%[in]]!\n" in Pack()
9808 "vld1.16 {d4[3]}, [%[in]], %[stride]\n" in Pack()
9809 "vld1.32 {d0[1]}, [%[in]]!\n" in Pack()
9810 "vld1.16 {d5[0]}, [%[in]], %[stride]\n" in Pack()
9811 "pld [%[in]]\n" in Pack()
9812 "vtrn.16 d0, d2\n" in Pack()
9813 "vtrn.16 d1, d3\n" in Pack()
9814 "vuzp.8 d4, d5\n" in Pack()
9815 "vtrn.8 d0, d1\n" in Pack()
9816 "vtrn.8 d2, d3\n" in Pack()
9817 "vaddw.u8 q8, q8, d0\n" in Pack()
9818 "vaddw.u8 q9, q9, d1\n" in Pack()
9819 "vaddw.u8 q10, q10, d2\n" in Pack()
9820 "vaddw.u8 q11, q11, d3\n" in Pack()
9821 "vaddw.u8 q12, q12, d4\n" in Pack()
9822 "vaddw.u8 q13, q13, d5\n" in Pack()
9823 "vst1.32 {d0, d1, d2, d3}, [%[out]:128]!\n" in Pack()
9824 "vst1.32 {d4, d5}, [%[out]:128]!\n" in Pack()
9827 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
9828 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
9829 "vpaddl.u16 q8, q8\n" in Pack()
9830 "vpaddl.u16 q9, q9\n" in Pack()
9831 "vpaddl.u16 q10, q10\n" in Pack()
9832 "vpaddl.u16 q11, q11\n" in Pack()
9833 "vpaddl.u16 q12, q12\n" in Pack()
9834 "vpaddl.u16 q13, q13\n" in Pack()
9835 "vpadd.u32 d16, d16, d17\n" in Pack()
9836 "vpadd.u32 d18, d18, d19\n" in Pack()
9837 "vpadd.u32 d20, d20, d21\n" in Pack()
9838 "vpadd.u32 d22, d22, d23\n" in Pack()
9839 "vpadd.u32 d24, d24, d25\n" in Pack()
9840 "vpadd.u32 d26, d26, d27\n" in Pack()
9841 "vpadd.u32 d16, d16, d18\n" in Pack()
9842 "vpadd.u32 d17, d20, d22\n" in Pack()
9843 "vpadd.u32 d18, d24, d26\n" in Pack()
9844 "vmul.i32 q8, q8, d0[0]\n" in Pack()
9845 "vmul.i32 q9, q9, d0[0]\n" in Pack()
9846 "vadd.i32 q8, q8, q1\n" in Pack()
9847 "vadd.i32 q9, q9, q1\n" in Pack()
9848 "vst1.32 {d16, d17, d18, d19}, [%[out]:128]\n" in Pack()
9872 "sub %[stride], %[stride], #4\n" in Pack()
9873 "vmov.i16 q8, #0\n" in Pack()
9874 "vmov.i16 q9, #0\n" in Pack()
9875 "vmov.i16 q10, #0\n" in Pack()
9876 "vmov.i16 q11, #0\n" in Pack()
9877 "vmov.i16 q12, #0\n" in Pack()
9878 "vmov.i16 q13, #0\n" in Pack()
9881 "subs %[count], %[count], #6\n" in Pack()
9882 "beq 2f\n" in Pack()
9885 "subs %[count], %[count], #8\n" in Pack()
9888 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
9889 "vld1.16 {d4[0]}, [%[in]], %[stride]\n" in Pack()
9890 "vld1.32 {d1[0]}, [%[in]]!\n" in Pack()
9891 "vld1.16 {d4[1]}, [%[in]], %[stride]\n" in Pack()
9892 "vld1.32 {d2[0]}, [%[in]]!\n" in Pack()
9893 "vld1.16 {d4[2]}, [%[in]], %[stride]\n" in Pack()
9894 "vld1.32 {d3[0]}, [%[in]]!\n" in Pack()
9895 "vld1.16 {d4[3]}, [%[in]], %[stride]\n" in Pack()
9896 "vld1.32 {d0[1]}, [%[in]]!\n" in Pack()
9897 "vld1.16 {d5[0]}, [%[in]], %[stride]\n" in Pack()
9898 "vld1.32 {d1[1]}, [%[in]]!\n" in Pack()
9899 "vld1.16 {d5[1]}, [%[in]], %[stride]\n" in Pack()
9900 "vld1.32 {d2[1]}, [%[in]]!\n" in Pack()
9901 "vld1.16 {d5[2]}, [%[in]], %[stride]\n" in Pack()
9902 "vld1.32 {d3[1]}, [%[in]]!\n" in Pack()
9903 "vld1.16 {d5[3]}, [%[in]], %[stride]\n" in Pack()
9904 "pld [%[in]]\n" in Pack()
9905 "vtrn.16 d0, d2\n" in Pack()
9906 "vtrn.16 d1, d3\n" in Pack()
9907 "vuzp.8 d4, d5\n" in Pack()
9908 "vtrn.8 d0, d1\n" in Pack()
9909 "vtrn.8 d2, d3\n" in Pack()
9910 "vaddw.u8 q8, q8, d0\n" in Pack()
9911 "vaddw.u8 q9, q9, d1\n" in Pack()
9912 "vaddw.u8 q10, q10, d2\n" in Pack()
9913 "vaddw.u8 q11, q11, d3\n" in Pack()
9914 "vaddw.u8 q12, q12, d4\n" in Pack()
9915 "vaddw.u8 q13, q13, d5\n" in Pack()
9916 "vst1.32 {d0, d1, d2, d3}, [%[out]:128]!\n" in Pack()
9917 "vst1.32 {d4, d5}, [%[out]:128]!\n" in Pack()
9919 "bne 1b\n" in Pack()
9924 "vmov.i8 d0, #0\n" in Pack()
9925 "vmov.i8 d1, #0\n" in Pack()
9926 "vmov.i8 d2, #0\n" in Pack()
9927 "vmov.i8 d3, #0\n" in Pack()
9928 "vmov.i8 d4, #0\n" in Pack()
9929 "vmov.i8 d5, #0\n" in Pack()
9930 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
9931 "vld1.16 {d4[0]}, [%[in]], %[stride]\n" in Pack()
9932 "vld1.32 {d1[0]}, [%[in]]!\n" in Pack()
9933 "vld1.16 {d4[1]}, [%[in]], %[stride]\n" in Pack()
9934 "vld1.32 {d2[0]}, [%[in]]!\n" in Pack()
9935 "vld1.16 {d4[2]}, [%[in]], %[stride]\n" in Pack()
9936 "vld1.32 {d3[0]}, [%[in]]!\n" in Pack()
9937 "vld1.16 {d4[3]}, [%[in]], %[stride]\n" in Pack()
9938 "vld1.32 {d0[1]}, [%[in]]!\n" in Pack()
9939 "vld1.16 {d5[0]}, [%[in]], %[stride]\n" in Pack()
9940 "vld1.32 {d1[1]}, [%[in]]!\n" in Pack()
9941 "vld1.16 {d5[1]}, [%[in]], %[stride]\n" in Pack()
9942 "pld [%[in]]\n" in Pack()
9943 "vtrn.16 d0, d2\n" in Pack()
9944 "vtrn.16 d1, d3\n" in Pack()
9945 "vuzp.8 d4, d5\n" in Pack()
9946 "vtrn.8 d0, d1\n" in Pack()
9947 "vtrn.8 d2, d3\n" in Pack()
9948 "vaddw.u8 q8, q8, d0\n" in Pack()
9949 "vaddw.u8 q9, q9, d1\n" in Pack()
9950 "vaddw.u8 q10, q10, d2\n" in Pack()
9951 "vaddw.u8 q11, q11, d3\n" in Pack()
9952 "vaddw.u8 q12, q12, d4\n" in Pack()
9953 "vaddw.u8 q13, q13, d5\n" in Pack()
9954 "vst1.32 {d0, d1, d2, d3}, [%[out]:128]!\n" in Pack()
9955 "vst1.32 {d4, d5}, [%[out]:128]!\n" in Pack()
9958 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
9959 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
9960 "vpaddl.u16 q8, q8\n" in Pack()
9961 "vpaddl.u16 q9, q9\n" in Pack()
9962 "vpaddl.u16 q10, q10\n" in Pack()
9963 "vpaddl.u16 q11, q11\n" in Pack()
9964 "vpaddl.u16 q12, q12\n" in Pack()
9965 "vpaddl.u16 q13, q13\n" in Pack()
9966 "vpadd.u32 d16, d16, d17\n" in Pack()
9967 "vpadd.u32 d18, d18, d19\n" in Pack()
9968 "vpadd.u32 d20, d20, d21\n" in Pack()
9969 "vpadd.u32 d22, d22, d23\n" in Pack()
9970 "vpadd.u32 d24, d24, d25\n" in Pack()
9971 "vpadd.u32 d26, d26, d27\n" in Pack()
9972 "vpadd.u32 d16, d16, d18\n" in Pack()
9973 "vpadd.u32 d17, d20, d22\n" in Pack()
9974 "vpadd.u32 d18, d24, d26\n" in Pack()
9975 "vmul.i32 q8, q8, d0[0]\n" in Pack()
9976 "vmul.i32 q9, q9, d0[0]\n" in Pack()
9977 "vadd.i32 q8, q8, q1\n" in Pack()
9978 "vadd.i32 q9, q9, q1\n" in Pack()
9979 "vst1.32 {d16, d17, d18, d19}, [%[out]:128]\n" in Pack()
10003 "sub %[stride], %[stride], #4\n" in Pack()
10004 "vmov.i16 q8, #0\n" in Pack()
10005 "vmov.i16 q9, #0\n" in Pack()
10006 "vmov.i16 q10, #0\n" in Pack()
10007 "vmov.i16 q11, #0\n" in Pack()
10008 "vmov.i16 q12, #0\n" in Pack()
10009 "vmov.i16 q13, #0\n" in Pack()
10012 "subs %[count], %[count], #7\n" in Pack()
10013 "beq 2f\n" in Pack()
10016 "subs %[count], %[count], #8\n" in Pack()
10019 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
10020 "vld1.16 {d4[0]}, [%[in]], %[stride]\n" in Pack()
10021 "vld1.32 {d1[0]}, [%[in]]!\n" in Pack()
10022 "vld1.16 {d4[1]}, [%[in]], %[stride]\n" in Pack()
10023 "vld1.32 {d2[0]}, [%[in]]!\n" in Pack()
10024 "vld1.16 {d4[2]}, [%[in]], %[stride]\n" in Pack()
10025 "vld1.32 {d3[0]}, [%[in]]!\n" in Pack()
10026 "vld1.16 {d4[3]}, [%[in]], %[stride]\n" in Pack()
10027 "vld1.32 {d0[1]}, [%[in]]!\n" in Pack()
10028 "vld1.16 {d5[0]}, [%[in]], %[stride]\n" in Pack()
10029 "vld1.32 {d1[1]}, [%[in]]!\n" in Pack()
10030 "vld1.16 {d5[1]}, [%[in]], %[stride]\n" in Pack()
10031 "vld1.32 {d2[1]}, [%[in]]!\n" in Pack()
10032 "vld1.16 {d5[2]}, [%[in]], %[stride]\n" in Pack()
10033 "vld1.32 {d3[1]}, [%[in]]!\n" in Pack()
10034 "vld1.16 {d5[3]}, [%[in]], %[stride]\n" in Pack()
10035 "pld [%[in]]\n" in Pack()
10036 "vtrn.16 d0, d2\n" in Pack()
10037 "vtrn.16 d1, d3\n" in Pack()
10038 "vuzp.8 d4, d5\n" in Pack()
10039 "vtrn.8 d0, d1\n" in Pack()
10040 "vtrn.8 d2, d3\n" in Pack()
10041 "vaddw.u8 q8, q8, d0\n" in Pack()
10042 "vaddw.u8 q9, q9, d1\n" in Pack()
10043 "vaddw.u8 q10, q10, d2\n" in Pack()
10044 "vaddw.u8 q11, q11, d3\n" in Pack()
10045 "vaddw.u8 q12, q12, d4\n" in Pack()
10046 "vaddw.u8 q13, q13, d5\n" in Pack()
10047 "vst1.32 {d0, d1, d2, d3}, [%[out]:128]!\n" in Pack()
10048 "vst1.32 {d4, d5}, [%[out]:128]!\n" in Pack()
10050 "bne 1b\n" in Pack()
10055 "vmov.i8 d0, #0\n" in Pack()
10056 "vmov.i8 d1, #0\n" in Pack()
10057 "vmov.i8 d2, #0\n" in Pack()
10058 "vmov.i8 d3, #0\n" in Pack()
10059 "vmov.i8 d4, #0\n" in Pack()
10060 "vmov.i8 d5, #0\n" in Pack()
10061 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
10062 "vld1.16 {d4[0]}, [%[in]], %[stride]\n" in Pack()
10063 "vld1.32 {d1[0]}, [%[in]]!\n" in Pack()
10064 "vld1.16 {d4[1]}, [%[in]], %[stride]\n" in Pack()
10065 "vld1.32 {d2[0]}, [%[in]]!\n" in Pack()
10066 "vld1.16 {d4[2]}, [%[in]], %[stride]\n" in Pack()
10067 "vld1.32 {d3[0]}, [%[in]]!\n" in Pack()
10068 "vld1.16 {d4[3]}, [%[in]], %[stride]\n" in Pack()
10069 "vld1.32 {d0[1]}, [%[in]]!\n" in Pack()
10070 "vld1.16 {d5[0]}, [%[in]], %[stride]\n" in Pack()
10071 "vld1.32 {d1[1]}, [%[in]]!\n" in Pack()
10072 "vld1.16 {d5[1]}, [%[in]], %[stride]\n" in Pack()
10073 "vld1.32 {d2[1]}, [%[in]]!\n" in Pack()
10074 "vld1.16 {d5[2]}, [%[in]], %[stride]\n" in Pack()
10075 "pld [%[in]]\n" in Pack()
10076 "vtrn.16 d0, d2\n" in Pack()
10077 "vtrn.16 d1, d3\n" in Pack()
10078 "vuzp.8 d4, d5\n" in Pack()
10079 "vtrn.8 d0, d1\n" in Pack()
10080 "vtrn.8 d2, d3\n" in Pack()
10081 "vaddw.u8 q8, q8, d0\n" in Pack()
10082 "vaddw.u8 q9, q9, d1\n" in Pack()
10083 "vaddw.u8 q10, q10, d2\n" in Pack()
10084 "vaddw.u8 q11, q11, d3\n" in Pack()
10085 "vaddw.u8 q12, q12, d4\n" in Pack()
10086 "vaddw.u8 q13, q13, d5\n" in Pack()
10087 "vst1.32 {d0, d1, d2, d3}, [%[out]:128]!\n" in Pack()
10088 "vst1.32 {d4, d5}, [%[out]:128]!\n" in Pack()
10091 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
10092 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
10093 "vpaddl.u16 q8, q8\n" in Pack()
10094 "vpaddl.u16 q9, q9\n" in Pack()
10095 "vpaddl.u16 q10, q10\n" in Pack()
10096 "vpaddl.u16 q11, q11\n" in Pack()
10097 "vpaddl.u16 q12, q12\n" in Pack()
10098 "vpaddl.u16 q13, q13\n" in Pack()
10099 "vpadd.u32 d16, d16, d17\n" in Pack()
10100 "vpadd.u32 d18, d18, d19\n" in Pack()
10101 "vpadd.u32 d20, d20, d21\n" in Pack()
10102 "vpadd.u32 d22, d22, d23\n" in Pack()
10103 "vpadd.u32 d24, d24, d25\n" in Pack()
10104 "vpadd.u32 d26, d26, d27\n" in Pack()
10105 "vpadd.u32 d16, d16, d18\n" in Pack()
10106 "vpadd.u32 d17, d20, d22\n" in Pack()
10107 "vpadd.u32 d18, d24, d26\n" in Pack()
10108 "vmul.i32 q8, q8, d0[0]\n" in Pack()
10109 "vmul.i32 q9, q9, d0[0]\n" in Pack()
10110 "vadd.i32 q8, q8, q1\n" in Pack()
10111 "vadd.i32 q9, q9, q1\n" in Pack()
10112 "vst1.32 {d16, d17, d18, d19}, [%[out]:128]\n" in Pack()
10136 "sub %[stride], %[stride], #4\n" in Pack()
10137 "vmov.i16 q8, #0\n" in Pack()
10138 "vmov.i16 q9, #0\n" in Pack()
10139 "vmov.i16 q10, #0\n" in Pack()
10140 "vmov.i16 q11, #0\n" in Pack()
10141 "vmov.i16 q12, #0\n" in Pack()
10142 "vmov.i16 q13, #0\n" in Pack()
10143 "vmov.i16 q14, #0\n" in Pack()
10146 "subs %[count], %[count], #8\n" in Pack()
10149 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
10150 "vld3.8 {d4[0], d5[0], d6[0]}, [%[in]], %[stride]\n" in Pack()
10151 "vld1.32 {d1[0]}, [%[in]]!\n" in Pack()
10152 "vld3.8 {d4[1], d5[1], d6[1]}, [%[in]], %[stride]\n" in Pack()
10153 "vld1.32 {d2[0]}, [%[in]]!\n" in Pack()
10154 "vld3.8 {d4[2], d5[2], d6[2]}, [%[in]], %[stride]\n" in Pack()
10155 "vld1.32 {d3[0]}, [%[in]]!\n" in Pack()
10156 "vld3.8 {d4[3], d5[3], d6[3]}, [%[in]], %[stride]\n" in Pack()
10157 "vld1.32 {d0[1]}, [%[in]]!\n" in Pack()
10158 "vld3.8 {d4[4], d5[4], d6[4]}, [%[in]], %[stride]\n" in Pack()
10159 "vld1.32 {d1[1]}, [%[in]]!\n" in Pack()
10160 "vld3.8 {d4[5], d5[5], d6[5]}, [%[in]], %[stride]\n" in Pack()
10161 "vld1.32 {d2[1]}, [%[in]]!\n" in Pack()
10162 "vld3.8 {d4[6], d5[6], d6[6]}, [%[in]], %[stride]\n" in Pack()
10163 "vld1.32 {d3[1]}, [%[in]]!\n" in Pack()
10164 "vld3.8 {d4[7], d5[7], d6[7]}, [%[in]], %[stride]\n" in Pack()
10165 "pld [%[in]]\n" in Pack()
10166 "vtrn.16 d0, d2\n" in Pack()
10167 "vtrn.16 d1, d3\n" in Pack()
10168 "vtrn.8 d0, d1\n" in Pack()
10169 "vtrn.8 d2, d3\n" in Pack()
10170 "vaddw.u8 q8, q8, d0\n" in Pack()
10171 "vaddw.u8 q9, q9, d1\n" in Pack()
10172 "vaddw.u8 q10, q10, d2\n" in Pack()
10173 "vaddw.u8 q11, q11, d3\n" in Pack()
10174 "vaddw.u8 q12, q12, d4\n" in Pack()
10175 "vaddw.u8 q13, q13, d5\n" in Pack()
10176 "vaddw.u8 q14, q14, d6\n" in Pack()
10177 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
10178 "vst1.32 {d4, d5, d6}, [%[out]:64]!\n" in Pack()
10180 "bne 1b\n" in Pack()
10183 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
10184 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
10185 "vpaddl.u16 q8, q8\n" in Pack()
10186 "vpaddl.u16 q9, q9\n" in Pack()
10187 "vpaddl.u16 q10, q10\n" in Pack()
10188 "vpaddl.u16 q11, q11\n" in Pack()
10189 "vpaddl.u16 q12, q12\n" in Pack()
10190 "vpaddl.u16 q13, q13\n" in Pack()
10191 "vpaddl.u16 q14, q14\n" in Pack()
10192 "vpadd.u32 d16, d16, d17\n" in Pack()
10193 "vpadd.u32 d18, d18, d19\n" in Pack()
10194 "vpadd.u32 d20, d20, d21\n" in Pack()
10195 "vpadd.u32 d22, d22, d23\n" in Pack()
10196 "vpadd.u32 d24, d24, d25\n" in Pack()
10197 "vpadd.u32 d26, d26, d27\n" in Pack()
10198 "vpadd.u32 d28, d28, d29\n" in Pack()
10199 "vpadd.u32 d16, d16, d18\n" in Pack()
10200 "vpadd.u32 d17, d20, d22\n" in Pack()
10201 "vpadd.u32 d18, d24, d26\n" in Pack()
10202 "vpadd.u32 d19, d28, d28\n" in Pack()
10203 "vmul.i32 q8, q8, d0[0]\n" in Pack()
10204 "vmul.i32 q9, q9, d0[0]\n" in Pack()
10205 "vadd.i32 q8, q8, q1\n" in Pack()
10206 "vadd.i32 q9, q9, q1\n" in Pack()
10207 "vst1.32 {d16, d17, d18, d19}, [%[out]:64]\n" in Pack()
10232 "sub %[stride], %[stride], #4\n" in Pack()
10233 "vmov.i16 q8, #0\n" in Pack()
10234 "vmov.i16 q9, #0\n" in Pack()
10235 "vmov.i16 q10, #0\n" in Pack()
10236 "vmov.i16 q11, #0\n" in Pack()
10237 "vmov.i16 q12, #0\n" in Pack()
10238 "vmov.i16 q13, #0\n" in Pack()
10239 "vmov.i16 q14, #0\n" in Pack()
10242 "subs %[count], %[count], #1\n" in Pack()
10243 "beq 2f\n" in Pack()
10246 "subs %[count], %[count], #8\n" in Pack()
10249 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
10250 "vld3.8 {d4[0], d5[0], d6[0]}, [%[in]], %[stride]\n" in Pack()
10251 "vld1.32 {d1[0]}, [%[in]]!\n" in Pack()
10252 "vld3.8 {d4[1], d5[1], d6[1]}, [%[in]], %[stride]\n" in Pack()
10253 "vld1.32 {d2[0]}, [%[in]]!\n" in Pack()
10254 "vld3.8 {d4[2], d5[2], d6[2]}, [%[in]], %[stride]\n" in Pack()
10255 "vld1.32 {d3[0]}, [%[in]]!\n" in Pack()
10256 "vld3.8 {d4[3], d5[3], d6[3]}, [%[in]], %[stride]\n" in Pack()
10257 "vld1.32 {d0[1]}, [%[in]]!\n" in Pack()
10258 "vld3.8 {d4[4], d5[4], d6[4]}, [%[in]], %[stride]\n" in Pack()
10259 "vld1.32 {d1[1]}, [%[in]]!\n" in Pack()
10260 "vld3.8 {d4[5], d5[5], d6[5]}, [%[in]], %[stride]\n" in Pack()
10261 "vld1.32 {d2[1]}, [%[in]]!\n" in Pack()
10262 "vld3.8 {d4[6], d5[6], d6[6]}, [%[in]], %[stride]\n" in Pack()
10263 "vld1.32 {d3[1]}, [%[in]]!\n" in Pack()
10264 "vld3.8 {d4[7], d5[7], d6[7]}, [%[in]], %[stride]\n" in Pack()
10265 "pld [%[in]]\n" in Pack()
10266 "vtrn.16 d0, d2\n" in Pack()
10267 "vtrn.16 d1, d3\n" in Pack()
10268 "vtrn.8 d0, d1\n" in Pack()
10269 "vtrn.8 d2, d3\n" in Pack()
10270 "vaddw.u8 q8, q8, d0\n" in Pack()
10271 "vaddw.u8 q9, q9, d1\n" in Pack()
10272 "vaddw.u8 q10, q10, d2\n" in Pack()
10273 "vaddw.u8 q11, q11, d3\n" in Pack()
10274 "vaddw.u8 q12, q12, d4\n" in Pack()
10275 "vaddw.u8 q13, q13, d5\n" in Pack()
10276 "vaddw.u8 q14, q14, d6\n" in Pack()
10277 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
10278 "vst1.32 {d4, d5, d6}, [%[out]:64]!\n" in Pack()
10280 "bne 1b\n" in Pack()
10285 "vmov.i8 d0, #0\n" in Pack()
10286 "vmov.i8 d1, #0\n" in Pack()
10287 "vmov.i8 d2, #0\n" in Pack()
10288 "vmov.i8 d3, #0\n" in Pack()
10289 "vmov.i8 d4, #0\n" in Pack()
10290 "vmov.i8 d5, #0\n" in Pack()
10291 "vmov.i8 d6, #0\n" in Pack()
10292 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
10293 "vld3.8 {d4[0], d5[0], d6[0]}, [%[in]], %[stride]\n" in Pack()
10294 "pld [%[in]]\n" in Pack()
10295 "vtrn.16 d0, d2\n" in Pack()
10296 "vtrn.16 d1, d3\n" in Pack()
10297 "vtrn.8 d0, d1\n" in Pack()
10298 "vtrn.8 d2, d3\n" in Pack()
10299 "vaddw.u8 q8, q8, d0\n" in Pack()
10300 "vaddw.u8 q9, q9, d1\n" in Pack()
10301 "vaddw.u8 q10, q10, d2\n" in Pack()
10302 "vaddw.u8 q11, q11, d3\n" in Pack()
10303 "vaddw.u8 q12, q12, d4\n" in Pack()
10304 "vaddw.u8 q13, q13, d5\n" in Pack()
10305 "vaddw.u8 q14, q14, d6\n" in Pack()
10306 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
10307 "vst1.32 {d4, d5, d6}, [%[out]:64]!\n" in Pack()
10310 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
10311 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
10312 "vpaddl.u16 q8, q8\n" in Pack()
10313 "vpaddl.u16 q9, q9\n" in Pack()
10314 "vpaddl.u16 q10, q10\n" in Pack()
10315 "vpaddl.u16 q11, q11\n" in Pack()
10316 "vpaddl.u16 q12, q12\n" in Pack()
10317 "vpaddl.u16 q13, q13\n" in Pack()
10318 "vpaddl.u16 q14, q14\n" in Pack()
10319 "vpadd.u32 d16, d16, d17\n" in Pack()
10320 "vpadd.u32 d18, d18, d19\n" in Pack()
10321 "vpadd.u32 d20, d20, d21\n" in Pack()
10322 "vpadd.u32 d22, d22, d23\n" in Pack()
10323 "vpadd.u32 d24, d24, d25\n" in Pack()
10324 "vpadd.u32 d26, d26, d27\n" in Pack()
10325 "vpadd.u32 d28, d28, d29\n" in Pack()
10326 "vpadd.u32 d16, d16, d18\n" in Pack()
10327 "vpadd.u32 d17, d20, d22\n" in Pack()
10328 "vpadd.u32 d18, d24, d26\n" in Pack()
10329 "vpadd.u32 d19, d28, d28\n" in Pack()
10330 "vmul.i32 q8, q8, d0[0]\n" in Pack()
10331 "vmul.i32 q9, q9, d0[0]\n" in Pack()
10332 "vadd.i32 q8, q8, q1\n" in Pack()
10333 "vadd.i32 q9, q9, q1\n" in Pack()
10334 "vst1.32 {d16, d17, d18, d19}, [%[out]:64]\n" in Pack()
10359 "sub %[stride], %[stride], #4\n" in Pack()
10360 "vmov.i16 q8, #0\n" in Pack()
10361 "vmov.i16 q9, #0\n" in Pack()
10362 "vmov.i16 q10, #0\n" in Pack()
10363 "vmov.i16 q11, #0\n" in Pack()
10364 "vmov.i16 q12, #0\n" in Pack()
10365 "vmov.i16 q13, #0\n" in Pack()
10366 "vmov.i16 q14, #0\n" in Pack()
10369 "subs %[count], %[count], #2\n" in Pack()
10370 "beq 2f\n" in Pack()
10373 "subs %[count], %[count], #8\n" in Pack()
10376 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
10377 "vld3.8 {d4[0], d5[0], d6[0]}, [%[in]], %[stride]\n" in Pack()
10378 "vld1.32 {d1[0]}, [%[in]]!\n" in Pack()
10379 "vld3.8 {d4[1], d5[1], d6[1]}, [%[in]], %[stride]\n" in Pack()
10380 "vld1.32 {d2[0]}, [%[in]]!\n" in Pack()
10381 "vld3.8 {d4[2], d5[2], d6[2]}, [%[in]], %[stride]\n" in Pack()
10382 "vld1.32 {d3[0]}, [%[in]]!\n" in Pack()
10383 "vld3.8 {d4[3], d5[3], d6[3]}, [%[in]], %[stride]\n" in Pack()
10384 "vld1.32 {d0[1]}, [%[in]]!\n" in Pack()
10385 "vld3.8 {d4[4], d5[4], d6[4]}, [%[in]], %[stride]\n" in Pack()
10386 "vld1.32 {d1[1]}, [%[in]]!\n" in Pack()
10387 "vld3.8 {d4[5], d5[5], d6[5]}, [%[in]], %[stride]\n" in Pack()
10388 "vld1.32 {d2[1]}, [%[in]]!\n" in Pack()
10389 "vld3.8 {d4[6], d5[6], d6[6]}, [%[in]], %[stride]\n" in Pack()
10390 "vld1.32 {d3[1]}, [%[in]]!\n" in Pack()
10391 "vld3.8 {d4[7], d5[7], d6[7]}, [%[in]], %[stride]\n" in Pack()
10392 "pld [%[in]]\n" in Pack()
10393 "vtrn.16 d0, d2\n" in Pack()
10394 "vtrn.16 d1, d3\n" in Pack()
10395 "vtrn.8 d0, d1\n" in Pack()
10396 "vtrn.8 d2, d3\n" in Pack()
10397 "vaddw.u8 q8, q8, d0\n" in Pack()
10398 "vaddw.u8 q9, q9, d1\n" in Pack()
10399 "vaddw.u8 q10, q10, d2\n" in Pack()
10400 "vaddw.u8 q11, q11, d3\n" in Pack()
10401 "vaddw.u8 q12, q12, d4\n" in Pack()
10402 "vaddw.u8 q13, q13, d5\n" in Pack()
10403 "vaddw.u8 q14, q14, d6\n" in Pack()
10404 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
10405 "vst1.32 {d4, d5, d6}, [%[out]:64]!\n" in Pack()
10407 "bne 1b\n" in Pack()
10412 "vmov.i8 d0, #0\n" in Pack()
10413 "vmov.i8 d1, #0\n" in Pack()
10414 "vmov.i8 d2, #0\n" in Pack()
10415 "vmov.i8 d3, #0\n" in Pack()
10416 "vmov.i8 d4, #0\n" in Pack()
10417 "vmov.i8 d5, #0\n" in Pack()
10418 "vmov.i8 d6, #0\n" in Pack()
10419 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
10420 "vld3.8 {d4[0], d5[0], d6[0]}, [%[in]], %[stride]\n" in Pack()
10421 "vld1.32 {d1[0]}, [%[in]]!\n" in Pack()
10422 "vld3.8 {d4[1], d5[1], d6[1]}, [%[in]], %[stride]\n" in Pack()
10423 "pld [%[in]]\n" in Pack()
10424 "vtrn.16 d0, d2\n" in Pack()
10425 "vtrn.16 d1, d3\n" in Pack()
10426 "vtrn.8 d0, d1\n" in Pack()
10427 "vtrn.8 d2, d3\n" in Pack()
10428 "vaddw.u8 q8, q8, d0\n" in Pack()
10429 "vaddw.u8 q9, q9, d1\n" in Pack()
10430 "vaddw.u8 q10, q10, d2\n" in Pack()
10431 "vaddw.u8 q11, q11, d3\n" in Pack()
10432 "vaddw.u8 q12, q12, d4\n" in Pack()
10433 "vaddw.u8 q13, q13, d5\n" in Pack()
10434 "vaddw.u8 q14, q14, d6\n" in Pack()
10435 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
10436 "vst1.32 {d4, d5, d6}, [%[out]:64]!\n" in Pack()
10439 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
10440 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
10441 "vpaddl.u16 q8, q8\n" in Pack()
10442 "vpaddl.u16 q9, q9\n" in Pack()
10443 "vpaddl.u16 q10, q10\n" in Pack()
10444 "vpaddl.u16 q11, q11\n" in Pack()
10445 "vpaddl.u16 q12, q12\n" in Pack()
10446 "vpaddl.u16 q13, q13\n" in Pack()
10447 "vpaddl.u16 q14, q14\n" in Pack()
10448 "vpadd.u32 d16, d16, d17\n" in Pack()
10449 "vpadd.u32 d18, d18, d19\n" in Pack()
10450 "vpadd.u32 d20, d20, d21\n" in Pack()
10451 "vpadd.u32 d22, d22, d23\n" in Pack()
10452 "vpadd.u32 d24, d24, d25\n" in Pack()
10453 "vpadd.u32 d26, d26, d27\n" in Pack()
10454 "vpadd.u32 d28, d28, d29\n" in Pack()
10455 "vpadd.u32 d16, d16, d18\n" in Pack()
10456 "vpadd.u32 d17, d20, d22\n" in Pack()
10457 "vpadd.u32 d18, d24, d26\n" in Pack()
10458 "vpadd.u32 d19, d28, d28\n" in Pack()
10459 "vmul.i32 q8, q8, d0[0]\n" in Pack()
10460 "vmul.i32 q9, q9, d0[0]\n" in Pack()
10461 "vadd.i32 q8, q8, q1\n" in Pack()
10462 "vadd.i32 q9, q9, q1\n" in Pack()
10463 "vst1.32 {d16, d17, d18, d19}, [%[out]:64]\n" in Pack()
10488 "sub %[stride], %[stride], #4\n" in Pack()
10489 "vmov.i16 q8, #0\n" in Pack()
10490 "vmov.i16 q9, #0\n" in Pack()
10491 "vmov.i16 q10, #0\n" in Pack()
10492 "vmov.i16 q11, #0\n" in Pack()
10493 "vmov.i16 q12, #0\n" in Pack()
10494 "vmov.i16 q13, #0\n" in Pack()
10495 "vmov.i16 q14, #0\n" in Pack()
10498 "subs %[count], %[count], #3\n" in Pack()
10499 "beq 2f\n" in Pack()
10502 "subs %[count], %[count], #8\n" in Pack()
10505 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
10506 "vld3.8 {d4[0], d5[0], d6[0]}, [%[in]], %[stride]\n" in Pack()
10507 "vld1.32 {d1[0]}, [%[in]]!\n" in Pack()
10508 "vld3.8 {d4[1], d5[1], d6[1]}, [%[in]], %[stride]\n" in Pack()
10509 "vld1.32 {d2[0]}, [%[in]]!\n" in Pack()
10510 "vld3.8 {d4[2], d5[2], d6[2]}, [%[in]], %[stride]\n" in Pack()
10511 "vld1.32 {d3[0]}, [%[in]]!\n" in Pack()
10512 "vld3.8 {d4[3], d5[3], d6[3]}, [%[in]], %[stride]\n" in Pack()
10513 "vld1.32 {d0[1]}, [%[in]]!\n" in Pack()
10514 "vld3.8 {d4[4], d5[4], d6[4]}, [%[in]], %[stride]\n" in Pack()
10515 "vld1.32 {d1[1]}, [%[in]]!\n" in Pack()
10516 "vld3.8 {d4[5], d5[5], d6[5]}, [%[in]], %[stride]\n" in Pack()
10517 "vld1.32 {d2[1]}, [%[in]]!\n" in Pack()
10518 "vld3.8 {d4[6], d5[6], d6[6]}, [%[in]], %[stride]\n" in Pack()
10519 "vld1.32 {d3[1]}, [%[in]]!\n" in Pack()
10520 "vld3.8 {d4[7], d5[7], d6[7]}, [%[in]], %[stride]\n" in Pack()
10521 "pld [%[in]]\n" in Pack()
10522 "vtrn.16 d0, d2\n" in Pack()
10523 "vtrn.16 d1, d3\n" in Pack()
10524 "vtrn.8 d0, d1\n" in Pack()
10525 "vtrn.8 d2, d3\n" in Pack()
10526 "vaddw.u8 q8, q8, d0\n" in Pack()
10527 "vaddw.u8 q9, q9, d1\n" in Pack()
10528 "vaddw.u8 q10, q10, d2\n" in Pack()
10529 "vaddw.u8 q11, q11, d3\n" in Pack()
10530 "vaddw.u8 q12, q12, d4\n" in Pack()
10531 "vaddw.u8 q13, q13, d5\n" in Pack()
10532 "vaddw.u8 q14, q14, d6\n" in Pack()
10533 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
10534 "vst1.32 {d4, d5, d6}, [%[out]:64]!\n" in Pack()
10536 "bne 1b\n" in Pack()
10541 "vmov.i8 d0, #0\n" in Pack()
10542 "vmov.i8 d1, #0\n" in Pack()
10543 "vmov.i8 d2, #0\n" in Pack()
10544 "vmov.i8 d3, #0\n" in Pack()
10545 "vmov.i8 d4, #0\n" in Pack()
10546 "vmov.i8 d5, #0\n" in Pack()
10547 "vmov.i8 d6, #0\n" in Pack()
10548 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
10549 "vld3.8 {d4[0], d5[0], d6[0]}, [%[in]], %[stride]\n" in Pack()
10550 "vld1.32 {d1[0]}, [%[in]]!\n" in Pack()
10551 "vld3.8 {d4[1], d5[1], d6[1]}, [%[in]], %[stride]\n" in Pack()
10552 "vld1.32 {d2[0]}, [%[in]]!\n" in Pack()
10553 "vld3.8 {d4[2], d5[2], d6[2]}, [%[in]], %[stride]\n" in Pack()
10554 "pld [%[in]]\n" in Pack()
10555 "vtrn.16 d0, d2\n" in Pack()
10556 "vtrn.16 d1, d3\n" in Pack()
10557 "vtrn.8 d0, d1\n" in Pack()
10558 "vtrn.8 d2, d3\n" in Pack()
10559 "vaddw.u8 q8, q8, d0\n" in Pack()
10560 "vaddw.u8 q9, q9, d1\n" in Pack()
10561 "vaddw.u8 q10, q10, d2\n" in Pack()
10562 "vaddw.u8 q11, q11, d3\n" in Pack()
10563 "vaddw.u8 q12, q12, d4\n" in Pack()
10564 "vaddw.u8 q13, q13, d5\n" in Pack()
10565 "vaddw.u8 q14, q14, d6\n" in Pack()
10566 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
10567 "vst1.32 {d4, d5, d6}, [%[out]:64]!\n" in Pack()
10570 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
10571 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
10572 "vpaddl.u16 q8, q8\n" in Pack()
10573 "vpaddl.u16 q9, q9\n" in Pack()
10574 "vpaddl.u16 q10, q10\n" in Pack()
10575 "vpaddl.u16 q11, q11\n" in Pack()
10576 "vpaddl.u16 q12, q12\n" in Pack()
10577 "vpaddl.u16 q13, q13\n" in Pack()
10578 "vpaddl.u16 q14, q14\n" in Pack()
10579 "vpadd.u32 d16, d16, d17\n" in Pack()
10580 "vpadd.u32 d18, d18, d19\n" in Pack()
10581 "vpadd.u32 d20, d20, d21\n" in Pack()
10582 "vpadd.u32 d22, d22, d23\n" in Pack()
10583 "vpadd.u32 d24, d24, d25\n" in Pack()
10584 "vpadd.u32 d26, d26, d27\n" in Pack()
10585 "vpadd.u32 d28, d28, d29\n" in Pack()
10586 "vpadd.u32 d16, d16, d18\n" in Pack()
10587 "vpadd.u32 d17, d20, d22\n" in Pack()
10588 "vpadd.u32 d18, d24, d26\n" in Pack()
10589 "vpadd.u32 d19, d28, d28\n" in Pack()
10590 "vmul.i32 q8, q8, d0[0]\n" in Pack()
10591 "vmul.i32 q9, q9, d0[0]\n" in Pack()
10592 "vadd.i32 q8, q8, q1\n" in Pack()
10593 "vadd.i32 q9, q9, q1\n" in Pack()
10594 "vst1.32 {d16, d17, d18, d19}, [%[out]:64]\n" in Pack()
10619 "sub %[stride], %[stride], #4\n" in Pack()
10620 "vmov.i16 q8, #0\n" in Pack()
10621 "vmov.i16 q9, #0\n" in Pack()
10622 "vmov.i16 q10, #0\n" in Pack()
10623 "vmov.i16 q11, #0\n" in Pack()
10624 "vmov.i16 q12, #0\n" in Pack()
10625 "vmov.i16 q13, #0\n" in Pack()
10626 "vmov.i16 q14, #0\n" in Pack()
10629 "subs %[count], %[count], #4\n" in Pack()
10630 "beq 2f\n" in Pack()
10633 "subs %[count], %[count], #8\n" in Pack()
10636 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
10637 "vld3.8 {d4[0], d5[0], d6[0]}, [%[in]], %[stride]\n" in Pack()
10638 "vld1.32 {d1[0]}, [%[in]]!\n" in Pack()
10639 "vld3.8 {d4[1], d5[1], d6[1]}, [%[in]], %[stride]\n" in Pack()
10640 "vld1.32 {d2[0]}, [%[in]]!\n" in Pack()
10641 "vld3.8 {d4[2], d5[2], d6[2]}, [%[in]], %[stride]\n" in Pack()
10642 "vld1.32 {d3[0]}, [%[in]]!\n" in Pack()
10643 "vld3.8 {d4[3], d5[3], d6[3]}, [%[in]], %[stride]\n" in Pack()
10644 "vld1.32 {d0[1]}, [%[in]]!\n" in Pack()
10645 "vld3.8 {d4[4], d5[4], d6[4]}, [%[in]], %[stride]\n" in Pack()
10646 "vld1.32 {d1[1]}, [%[in]]!\n" in Pack()
10647 "vld3.8 {d4[5], d5[5], d6[5]}, [%[in]], %[stride]\n" in Pack()
10648 "vld1.32 {d2[1]}, [%[in]]!\n" in Pack()
10649 "vld3.8 {d4[6], d5[6], d6[6]}, [%[in]], %[stride]\n" in Pack()
10650 "vld1.32 {d3[1]}, [%[in]]!\n" in Pack()
10651 "vld3.8 {d4[7], d5[7], d6[7]}, [%[in]], %[stride]\n" in Pack()
10652 "pld [%[in]]\n" in Pack()
10653 "vtrn.16 d0, d2\n" in Pack()
10654 "vtrn.16 d1, d3\n" in Pack()
10655 "vtrn.8 d0, d1\n" in Pack()
10656 "vtrn.8 d2, d3\n" in Pack()
10657 "vaddw.u8 q8, q8, d0\n" in Pack()
10658 "vaddw.u8 q9, q9, d1\n" in Pack()
10659 "vaddw.u8 q10, q10, d2\n" in Pack()
10660 "vaddw.u8 q11, q11, d3\n" in Pack()
10661 "vaddw.u8 q12, q12, d4\n" in Pack()
10662 "vaddw.u8 q13, q13, d5\n" in Pack()
10663 "vaddw.u8 q14, q14, d6\n" in Pack()
10664 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
10665 "vst1.32 {d4, d5, d6}, [%[out]:64]!\n" in Pack()
10667 "bne 1b\n" in Pack()
10672 "vmov.i8 d0, #0\n" in Pack()
10673 "vmov.i8 d1, #0\n" in Pack()
10674 "vmov.i8 d2, #0\n" in Pack()
10675 "vmov.i8 d3, #0\n" in Pack()
10676 "vmov.i8 d4, #0\n" in Pack()
10677 "vmov.i8 d5, #0\n" in Pack()
10678 "vmov.i8 d6, #0\n" in Pack()
10679 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
10680 "vld3.8 {d4[0], d5[0], d6[0]}, [%[in]], %[stride]\n" in Pack()
10681 "vld1.32 {d1[0]}, [%[in]]!\n" in Pack()
10682 "vld3.8 {d4[1], d5[1], d6[1]}, [%[in]], %[stride]\n" in Pack()
10683 "vld1.32 {d2[0]}, [%[in]]!\n" in Pack()
10684 "vld3.8 {d4[2], d5[2], d6[2]}, [%[in]], %[stride]\n" in Pack()
10685 "vld1.32 {d3[0]}, [%[in]]!\n" in Pack()
10686 "vld3.8 {d4[3], d5[3], d6[3]}, [%[in]], %[stride]\n" in Pack()
10687 "pld [%[in]]\n" in Pack()
10688 "vtrn.16 d0, d2\n" in Pack()
10689 "vtrn.16 d1, d3\n" in Pack()
10690 "vtrn.8 d0, d1\n" in Pack()
10691 "vtrn.8 d2, d3\n" in Pack()
10692 "vaddw.u8 q8, q8, d0\n" in Pack()
10693 "vaddw.u8 q9, q9, d1\n" in Pack()
10694 "vaddw.u8 q10, q10, d2\n" in Pack()
10695 "vaddw.u8 q11, q11, d3\n" in Pack()
10696 "vaddw.u8 q12, q12, d4\n" in Pack()
10697 "vaddw.u8 q13, q13, d5\n" in Pack()
10698 "vaddw.u8 q14, q14, d6\n" in Pack()
10699 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
10700 "vst1.32 {d4, d5, d6}, [%[out]:64]!\n" in Pack()
10703 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
10704 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
10705 "vpaddl.u16 q8, q8\n" in Pack()
10706 "vpaddl.u16 q9, q9\n" in Pack()
10707 "vpaddl.u16 q10, q10\n" in Pack()
10708 "vpaddl.u16 q11, q11\n" in Pack()
10709 "vpaddl.u16 q12, q12\n" in Pack()
10710 "vpaddl.u16 q13, q13\n" in Pack()
10711 "vpaddl.u16 q14, q14\n" in Pack()
10712 "vpadd.u32 d16, d16, d17\n" in Pack()
10713 "vpadd.u32 d18, d18, d19\n" in Pack()
10714 "vpadd.u32 d20, d20, d21\n" in Pack()
10715 "vpadd.u32 d22, d22, d23\n" in Pack()
10716 "vpadd.u32 d24, d24, d25\n" in Pack()
10717 "vpadd.u32 d26, d26, d27\n" in Pack()
10718 "vpadd.u32 d28, d28, d29\n" in Pack()
10719 "vpadd.u32 d16, d16, d18\n" in Pack()
10720 "vpadd.u32 d17, d20, d22\n" in Pack()
10721 "vpadd.u32 d18, d24, d26\n" in Pack()
10722 "vpadd.u32 d19, d28, d28\n" in Pack()
10723 "vmul.i32 q8, q8, d0[0]\n" in Pack()
10724 "vmul.i32 q9, q9, d0[0]\n" in Pack()
10725 "vadd.i32 q8, q8, q1\n" in Pack()
10726 "vadd.i32 q9, q9, q1\n" in Pack()
10727 "vst1.32 {d16, d17, d18, d19}, [%[out]:64]\n" in Pack()
10752 "sub %[stride], %[stride], #4\n" in Pack()
10753 "vmov.i16 q8, #0\n" in Pack()
10754 "vmov.i16 q9, #0\n" in Pack()
10755 "vmov.i16 q10, #0\n" in Pack()
10756 "vmov.i16 q11, #0\n" in Pack()
10757 "vmov.i16 q12, #0\n" in Pack()
10758 "vmov.i16 q13, #0\n" in Pack()
10759 "vmov.i16 q14, #0\n" in Pack()
10762 "subs %[count], %[count], #5\n" in Pack()
10763 "beq 2f\n" in Pack()
10766 "subs %[count], %[count], #8\n" in Pack()
10769 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
10770 "vld3.8 {d4[0], d5[0], d6[0]}, [%[in]], %[stride]\n" in Pack()
10771 "vld1.32 {d1[0]}, [%[in]]!\n" in Pack()
10772 "vld3.8 {d4[1], d5[1], d6[1]}, [%[in]], %[stride]\n" in Pack()
10773 "vld1.32 {d2[0]}, [%[in]]!\n" in Pack()
10774 "vld3.8 {d4[2], d5[2], d6[2]}, [%[in]], %[stride]\n" in Pack()
10775 "vld1.32 {d3[0]}, [%[in]]!\n" in Pack()
10776 "vld3.8 {d4[3], d5[3], d6[3]}, [%[in]], %[stride]\n" in Pack()
10777 "vld1.32 {d0[1]}, [%[in]]!\n" in Pack()
10778 "vld3.8 {d4[4], d5[4], d6[4]}, [%[in]], %[stride]\n" in Pack()
10779 "vld1.32 {d1[1]}, [%[in]]!\n" in Pack()
10780 "vld3.8 {d4[5], d5[5], d6[5]}, [%[in]], %[stride]\n" in Pack()
10781 "vld1.32 {d2[1]}, [%[in]]!\n" in Pack()
10782 "vld3.8 {d4[6], d5[6], d6[6]}, [%[in]], %[stride]\n" in Pack()
10783 "vld1.32 {d3[1]}, [%[in]]!\n" in Pack()
10784 "vld3.8 {d4[7], d5[7], d6[7]}, [%[in]], %[stride]\n" in Pack()
10785 "pld [%[in]]\n" in Pack()
10786 "vtrn.16 d0, d2\n" in Pack()
10787 "vtrn.16 d1, d3\n" in Pack()
10788 "vtrn.8 d0, d1\n" in Pack()
10789 "vtrn.8 d2, d3\n" in Pack()
10790 "vaddw.u8 q8, q8, d0\n" in Pack()
10791 "vaddw.u8 q9, q9, d1\n" in Pack()
10792 "vaddw.u8 q10, q10, d2\n" in Pack()
10793 "vaddw.u8 q11, q11, d3\n" in Pack()
10794 "vaddw.u8 q12, q12, d4\n" in Pack()
10795 "vaddw.u8 q13, q13, d5\n" in Pack()
10796 "vaddw.u8 q14, q14, d6\n" in Pack()
10797 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
10798 "vst1.32 {d4, d5, d6}, [%[out]:64]!\n" in Pack()
10800 "bne 1b\n" in Pack()
10805 "vmov.i8 d0, #0\n" in Pack()
10806 "vmov.i8 d1, #0\n" in Pack()
10807 "vmov.i8 d2, #0\n" in Pack()
10808 "vmov.i8 d3, #0\n" in Pack()
10809 "vmov.i8 d4, #0\n" in Pack()
10810 "vmov.i8 d5, #0\n" in Pack()
10811 "vmov.i8 d6, #0\n" in Pack()
10812 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
10813 "vld3.8 {d4[0], d5[0], d6[0]}, [%[in]], %[stride]\n" in Pack()
10814 "vld1.32 {d1[0]}, [%[in]]!\n" in Pack()
10815 "vld3.8 {d4[1], d5[1], d6[1]}, [%[in]], %[stride]\n" in Pack()
10816 "vld1.32 {d2[0]}, [%[in]]!\n" in Pack()
10817 "vld3.8 {d4[2], d5[2], d6[2]}, [%[in]], %[stride]\n" in Pack()
10818 "vld1.32 {d3[0]}, [%[in]]!\n" in Pack()
10819 "vld3.8 {d4[3], d5[3], d6[3]}, [%[in]], %[stride]\n" in Pack()
10820 "vld1.32 {d0[1]}, [%[in]]!\n" in Pack()
10821 "vld3.8 {d4[4], d5[4], d6[4]}, [%[in]], %[stride]\n" in Pack()
10822 "pld [%[in]]\n" in Pack()
10823 "vtrn.16 d0, d2\n" in Pack()
10824 "vtrn.16 d1, d3\n" in Pack()
10825 "vtrn.8 d0, d1\n" in Pack()
10826 "vtrn.8 d2, d3\n" in Pack()
10827 "vaddw.u8 q8, q8, d0\n" in Pack()
10828 "vaddw.u8 q9, q9, d1\n" in Pack()
10829 "vaddw.u8 q10, q10, d2\n" in Pack()
10830 "vaddw.u8 q11, q11, d3\n" in Pack()
10831 "vaddw.u8 q12, q12, d4\n" in Pack()
10832 "vaddw.u8 q13, q13, d5\n" in Pack()
10833 "vaddw.u8 q14, q14, d6\n" in Pack()
10834 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
10835 "vst1.32 {d4, d5, d6}, [%[out]:64]!\n" in Pack()
10838 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
10839 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
10840 "vpaddl.u16 q8, q8\n" in Pack()
10841 "vpaddl.u16 q9, q9\n" in Pack()
10842 "vpaddl.u16 q10, q10\n" in Pack()
10843 "vpaddl.u16 q11, q11\n" in Pack()
10844 "vpaddl.u16 q12, q12\n" in Pack()
10845 "vpaddl.u16 q13, q13\n" in Pack()
10846 "vpaddl.u16 q14, q14\n" in Pack()
10847 "vpadd.u32 d16, d16, d17\n" in Pack()
10848 "vpadd.u32 d18, d18, d19\n" in Pack()
10849 "vpadd.u32 d20, d20, d21\n" in Pack()
10850 "vpadd.u32 d22, d22, d23\n" in Pack()
10851 "vpadd.u32 d24, d24, d25\n" in Pack()
10852 "vpadd.u32 d26, d26, d27\n" in Pack()
10853 "vpadd.u32 d28, d28, d29\n" in Pack()
10854 "vpadd.u32 d16, d16, d18\n" in Pack()
10855 "vpadd.u32 d17, d20, d22\n" in Pack()
10856 "vpadd.u32 d18, d24, d26\n" in Pack()
10857 "vpadd.u32 d19, d28, d28\n" in Pack()
10858 "vmul.i32 q8, q8, d0[0]\n" in Pack()
10859 "vmul.i32 q9, q9, d0[0]\n" in Pack()
10860 "vadd.i32 q8, q8, q1\n" in Pack()
10861 "vadd.i32 q9, q9, q1\n" in Pack()
10862 "vst1.32 {d16, d17, d18, d19}, [%[out]:64]\n" in Pack()
10887 "sub %[stride], %[stride], #4\n" in Pack()
10888 "vmov.i16 q8, #0\n" in Pack()
10889 "vmov.i16 q9, #0\n" in Pack()
10890 "vmov.i16 q10, #0\n" in Pack()
10891 "vmov.i16 q11, #0\n" in Pack()
10892 "vmov.i16 q12, #0\n" in Pack()
10893 "vmov.i16 q13, #0\n" in Pack()
10894 "vmov.i16 q14, #0\n" in Pack()
10897 "subs %[count], %[count], #6\n" in Pack()
10898 "beq 2f\n" in Pack()
10901 "subs %[count], %[count], #8\n" in Pack()
10904 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
10905 "vld3.8 {d4[0], d5[0], d6[0]}, [%[in]], %[stride]\n" in Pack()
10906 "vld1.32 {d1[0]}, [%[in]]!\n" in Pack()
10907 "vld3.8 {d4[1], d5[1], d6[1]}, [%[in]], %[stride]\n" in Pack()
10908 "vld1.32 {d2[0]}, [%[in]]!\n" in Pack()
10909 "vld3.8 {d4[2], d5[2], d6[2]}, [%[in]], %[stride]\n" in Pack()
10910 "vld1.32 {d3[0]}, [%[in]]!\n" in Pack()
10911 "vld3.8 {d4[3], d5[3], d6[3]}, [%[in]], %[stride]\n" in Pack()
10912 "vld1.32 {d0[1]}, [%[in]]!\n" in Pack()
10913 "vld3.8 {d4[4], d5[4], d6[4]}, [%[in]], %[stride]\n" in Pack()
10914 "vld1.32 {d1[1]}, [%[in]]!\n" in Pack()
10915 "vld3.8 {d4[5], d5[5], d6[5]}, [%[in]], %[stride]\n" in Pack()
10916 "vld1.32 {d2[1]}, [%[in]]!\n" in Pack()
10917 "vld3.8 {d4[6], d5[6], d6[6]}, [%[in]], %[stride]\n" in Pack()
10918 "vld1.32 {d3[1]}, [%[in]]!\n" in Pack()
10919 "vld3.8 {d4[7], d5[7], d6[7]}, [%[in]], %[stride]\n" in Pack()
10920 "pld [%[in]]\n" in Pack()
10921 "vtrn.16 d0, d2\n" in Pack()
10922 "vtrn.16 d1, d3\n" in Pack()
10923 "vtrn.8 d0, d1\n" in Pack()
10924 "vtrn.8 d2, d3\n" in Pack()
10925 "vaddw.u8 q8, q8, d0\n" in Pack()
10926 "vaddw.u8 q9, q9, d1\n" in Pack()
10927 "vaddw.u8 q10, q10, d2\n" in Pack()
10928 "vaddw.u8 q11, q11, d3\n" in Pack()
10929 "vaddw.u8 q12, q12, d4\n" in Pack()
10930 "vaddw.u8 q13, q13, d5\n" in Pack()
10931 "vaddw.u8 q14, q14, d6\n" in Pack()
10932 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
10933 "vst1.32 {d4, d5, d6}, [%[out]:64]!\n" in Pack()
10935 "bne 1b\n" in Pack()
10940 "vmov.i8 d0, #0\n" in Pack()
10941 "vmov.i8 d1, #0\n" in Pack()
10942 "vmov.i8 d2, #0\n" in Pack()
10943 "vmov.i8 d3, #0\n" in Pack()
10944 "vmov.i8 d4, #0\n" in Pack()
10945 "vmov.i8 d5, #0\n" in Pack()
10946 "vmov.i8 d6, #0\n" in Pack()
10947 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
10948 "vld3.8 {d4[0], d5[0], d6[0]}, [%[in]], %[stride]\n" in Pack()
10949 "vld1.32 {d1[0]}, [%[in]]!\n" in Pack()
10950 "vld3.8 {d4[1], d5[1], d6[1]}, [%[in]], %[stride]\n" in Pack()
10951 "vld1.32 {d2[0]}, [%[in]]!\n" in Pack()
10952 "vld3.8 {d4[2], d5[2], d6[2]}, [%[in]], %[stride]\n" in Pack()
10953 "vld1.32 {d3[0]}, [%[in]]!\n" in Pack()
10954 "vld3.8 {d4[3], d5[3], d6[3]}, [%[in]], %[stride]\n" in Pack()
10955 "vld1.32 {d0[1]}, [%[in]]!\n" in Pack()
10956 "vld3.8 {d4[4], d5[4], d6[4]}, [%[in]], %[stride]\n" in Pack()
10957 "vld1.32 {d1[1]}, [%[in]]!\n" in Pack()
10958 "vld3.8 {d4[5], d5[5], d6[5]}, [%[in]], %[stride]\n" in Pack()
10959 "pld [%[in]]\n" in Pack()
10960 "vtrn.16 d0, d2\n" in Pack()
10961 "vtrn.16 d1, d3\n" in Pack()
10962 "vtrn.8 d0, d1\n" in Pack()
10963 "vtrn.8 d2, d3\n" in Pack()
10964 "vaddw.u8 q8, q8, d0\n" in Pack()
10965 "vaddw.u8 q9, q9, d1\n" in Pack()
10966 "vaddw.u8 q10, q10, d2\n" in Pack()
10967 "vaddw.u8 q11, q11, d3\n" in Pack()
10968 "vaddw.u8 q12, q12, d4\n" in Pack()
10969 "vaddw.u8 q13, q13, d5\n" in Pack()
10970 "vaddw.u8 q14, q14, d6\n" in Pack()
10971 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
10972 "vst1.32 {d4, d5, d6}, [%[out]:64]!\n" in Pack()
10975 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
10976 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
10977 "vpaddl.u16 q8, q8\n" in Pack()
10978 "vpaddl.u16 q9, q9\n" in Pack()
10979 "vpaddl.u16 q10, q10\n" in Pack()
10980 "vpaddl.u16 q11, q11\n" in Pack()
10981 "vpaddl.u16 q12, q12\n" in Pack()
10982 "vpaddl.u16 q13, q13\n" in Pack()
10983 "vpaddl.u16 q14, q14\n" in Pack()
10984 "vpadd.u32 d16, d16, d17\n" in Pack()
10985 "vpadd.u32 d18, d18, d19\n" in Pack()
10986 "vpadd.u32 d20, d20, d21\n" in Pack()
10987 "vpadd.u32 d22, d22, d23\n" in Pack()
10988 "vpadd.u32 d24, d24, d25\n" in Pack()
10989 "vpadd.u32 d26, d26, d27\n" in Pack()
10990 "vpadd.u32 d28, d28, d29\n" in Pack()
10991 "vpadd.u32 d16, d16, d18\n" in Pack()
10992 "vpadd.u32 d17, d20, d22\n" in Pack()
10993 "vpadd.u32 d18, d24, d26\n" in Pack()
10994 "vpadd.u32 d19, d28, d28\n" in Pack()
10995 "vmul.i32 q8, q8, d0[0]\n" in Pack()
10996 "vmul.i32 q9, q9, d0[0]\n" in Pack()
10997 "vadd.i32 q8, q8, q1\n" in Pack()
10998 "vadd.i32 q9, q9, q1\n" in Pack()
10999 "vst1.32 {d16, d17, d18, d19}, [%[out]:64]\n" in Pack()
11024 "sub %[stride], %[stride], #4\n" in Pack()
11025 "vmov.i16 q8, #0\n" in Pack()
11026 "vmov.i16 q9, #0\n" in Pack()
11027 "vmov.i16 q10, #0\n" in Pack()
11028 "vmov.i16 q11, #0\n" in Pack()
11029 "vmov.i16 q12, #0\n" in Pack()
11030 "vmov.i16 q13, #0\n" in Pack()
11031 "vmov.i16 q14, #0\n" in Pack()
11034 "subs %[count], %[count], #7\n" in Pack()
11035 "beq 2f\n" in Pack()
11038 "subs %[count], %[count], #8\n" in Pack()
11041 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
11042 "vld3.8 {d4[0], d5[0], d6[0]}, [%[in]], %[stride]\n" in Pack()
11043 "vld1.32 {d1[0]}, [%[in]]!\n" in Pack()
11044 "vld3.8 {d4[1], d5[1], d6[1]}, [%[in]], %[stride]\n" in Pack()
11045 "vld1.32 {d2[0]}, [%[in]]!\n" in Pack()
11046 "vld3.8 {d4[2], d5[2], d6[2]}, [%[in]], %[stride]\n" in Pack()
11047 "vld1.32 {d3[0]}, [%[in]]!\n" in Pack()
11048 "vld3.8 {d4[3], d5[3], d6[3]}, [%[in]], %[stride]\n" in Pack()
11049 "vld1.32 {d0[1]}, [%[in]]!\n" in Pack()
11050 "vld3.8 {d4[4], d5[4], d6[4]}, [%[in]], %[stride]\n" in Pack()
11051 "vld1.32 {d1[1]}, [%[in]]!\n" in Pack()
11052 "vld3.8 {d4[5], d5[5], d6[5]}, [%[in]], %[stride]\n" in Pack()
11053 "vld1.32 {d2[1]}, [%[in]]!\n" in Pack()
11054 "vld3.8 {d4[6], d5[6], d6[6]}, [%[in]], %[stride]\n" in Pack()
11055 "vld1.32 {d3[1]}, [%[in]]!\n" in Pack()
11056 "vld3.8 {d4[7], d5[7], d6[7]}, [%[in]], %[stride]\n" in Pack()
11057 "pld [%[in]]\n" in Pack()
11058 "vtrn.16 d0, d2\n" in Pack()
11059 "vtrn.16 d1, d3\n" in Pack()
11060 "vtrn.8 d0, d1\n" in Pack()
11061 "vtrn.8 d2, d3\n" in Pack()
11062 "vaddw.u8 q8, q8, d0\n" in Pack()
11063 "vaddw.u8 q9, q9, d1\n" in Pack()
11064 "vaddw.u8 q10, q10, d2\n" in Pack()
11065 "vaddw.u8 q11, q11, d3\n" in Pack()
11066 "vaddw.u8 q12, q12, d4\n" in Pack()
11067 "vaddw.u8 q13, q13, d5\n" in Pack()
11068 "vaddw.u8 q14, q14, d6\n" in Pack()
11069 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
11070 "vst1.32 {d4, d5, d6}, [%[out]:64]!\n" in Pack()
11072 "bne 1b\n" in Pack()
11077 "vmov.i8 d0, #0\n" in Pack()
11078 "vmov.i8 d1, #0\n" in Pack()
11079 "vmov.i8 d2, #0\n" in Pack()
11080 "vmov.i8 d3, #0\n" in Pack()
11081 "vmov.i8 d4, #0\n" in Pack()
11082 "vmov.i8 d5, #0\n" in Pack()
11083 "vmov.i8 d6, #0\n" in Pack()
11084 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
11085 "vld3.8 {d4[0], d5[0], d6[0]}, [%[in]], %[stride]\n" in Pack()
11086 "vld1.32 {d1[0]}, [%[in]]!\n" in Pack()
11087 "vld3.8 {d4[1], d5[1], d6[1]}, [%[in]], %[stride]\n" in Pack()
11088 "vld1.32 {d2[0]}, [%[in]]!\n" in Pack()
11089 "vld3.8 {d4[2], d5[2], d6[2]}, [%[in]], %[stride]\n" in Pack()
11090 "vld1.32 {d3[0]}, [%[in]]!\n" in Pack()
11091 "vld3.8 {d4[3], d5[3], d6[3]}, [%[in]], %[stride]\n" in Pack()
11092 "vld1.32 {d0[1]}, [%[in]]!\n" in Pack()
11093 "vld3.8 {d4[4], d5[4], d6[4]}, [%[in]], %[stride]\n" in Pack()
11094 "vld1.32 {d1[1]}, [%[in]]!\n" in Pack()
11095 "vld3.8 {d4[5], d5[5], d6[5]}, [%[in]], %[stride]\n" in Pack()
11096 "vld1.32 {d2[1]}, [%[in]]!\n" in Pack()
11097 "vld3.8 {d4[6], d5[6], d6[6]}, [%[in]], %[stride]\n" in Pack()
11098 "pld [%[in]]\n" in Pack()
11099 "vtrn.16 d0, d2\n" in Pack()
11100 "vtrn.16 d1, d3\n" in Pack()
11101 "vtrn.8 d0, d1\n" in Pack()
11102 "vtrn.8 d2, d3\n" in Pack()
11103 "vaddw.u8 q8, q8, d0\n" in Pack()
11104 "vaddw.u8 q9, q9, d1\n" in Pack()
11105 "vaddw.u8 q10, q10, d2\n" in Pack()
11106 "vaddw.u8 q11, q11, d3\n" in Pack()
11107 "vaddw.u8 q12, q12, d4\n" in Pack()
11108 "vaddw.u8 q13, q13, d5\n" in Pack()
11109 "vaddw.u8 q14, q14, d6\n" in Pack()
11110 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
11111 "vst1.32 {d4, d5, d6}, [%[out]:64]!\n" in Pack()
11114 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
11115 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
11116 "vpaddl.u16 q8, q8\n" in Pack()
11117 "vpaddl.u16 q9, q9\n" in Pack()
11118 "vpaddl.u16 q10, q10\n" in Pack()
11119 "vpaddl.u16 q11, q11\n" in Pack()
11120 "vpaddl.u16 q12, q12\n" in Pack()
11121 "vpaddl.u16 q13, q13\n" in Pack()
11122 "vpaddl.u16 q14, q14\n" in Pack()
11123 "vpadd.u32 d16, d16, d17\n" in Pack()
11124 "vpadd.u32 d18, d18, d19\n" in Pack()
11125 "vpadd.u32 d20, d20, d21\n" in Pack()
11126 "vpadd.u32 d22, d22, d23\n" in Pack()
11127 "vpadd.u32 d24, d24, d25\n" in Pack()
11128 "vpadd.u32 d26, d26, d27\n" in Pack()
11129 "vpadd.u32 d28, d28, d29\n" in Pack()
11130 "vpadd.u32 d16, d16, d18\n" in Pack()
11131 "vpadd.u32 d17, d20, d22\n" in Pack()
11132 "vpadd.u32 d18, d24, d26\n" in Pack()
11133 "vpadd.u32 d19, d28, d28\n" in Pack()
11134 "vmul.i32 q8, q8, d0[0]\n" in Pack()
11135 "vmul.i32 q9, q9, d0[0]\n" in Pack()
11136 "vadd.i32 q8, q8, q1\n" in Pack()
11137 "vadd.i32 q9, q9, q1\n" in Pack()
11138 "vst1.32 {d16, d17, d18, d19}, [%[out]:64]\n" in Pack()
11163 "vmov.i16 q8, #0\n" in Pack()
11164 "vmov.i16 q9, #0\n" in Pack()
11165 "vmov.i16 q10, #0\n" in Pack()
11166 "vmov.i16 q11, #0\n" in Pack()
11167 "vmov.i16 q12, #0\n" in Pack()
11168 "vmov.i16 q13, #0\n" in Pack()
11169 "vmov.i16 q14, #0\n" in Pack()
11170 "vmov.i16 q15, #0\n" in Pack()
11173 "subs %[count], %[count], #8\n" in Pack()
11176 "vld1.32 {d0}, [%[in]], %[stride]\n" in Pack()
11177 "vld1.32 {d1}, [%[in]], %[stride]\n" in Pack()
11178 "vld1.32 {d2}, [%[in]], %[stride]\n" in Pack()
11179 "vld1.32 {d3}, [%[in]], %[stride]\n" in Pack()
11180 "vld1.32 {d4}, [%[in]], %[stride]\n" in Pack()
11181 "vld1.32 {d5}, [%[in]], %[stride]\n" in Pack()
11182 "vld1.32 {d6}, [%[in]], %[stride]\n" in Pack()
11183 "vld1.32 {d7}, [%[in]], %[stride]\n" in Pack()
11184 "pld [%[in]]\n" in Pack()
11185 "vtrn.8 d0, d1\n" in Pack()
11186 "vtrn.8 d2, d3\n" in Pack()
11187 "vtrn.8 d4, d5\n" in Pack()
11188 "vtrn.8 d6, d7\n" in Pack()
11189 "vtrn.16 d0, d2\n" in Pack()
11190 "vtrn.16 d1, d3\n" in Pack()
11191 "vtrn.16 d4, d6\n" in Pack()
11192 "vtrn.16 d5, d7\n" in Pack()
11193 "vtrn.32 d0, d4\n" in Pack()
11194 "vtrn.32 d1, d5\n" in Pack()
11195 "vtrn.32 d2, d6\n" in Pack()
11196 "vtrn.32 d3, d7\n" in Pack()
11197 "vaddw.u8 q8, q8, d0\n" in Pack()
11198 "vaddw.u8 q9, q9, d1\n" in Pack()
11199 "vaddw.u8 q10, q10, d2\n" in Pack()
11200 "vaddw.u8 q11, q11, d3\n" in Pack()
11201 "vaddw.u8 q12, q12, d4\n" in Pack()
11202 "vaddw.u8 q13, q13, d5\n" in Pack()
11203 "vaddw.u8 q14, q14, d6\n" in Pack()
11204 "vaddw.u8 q15, q15, d7\n" in Pack()
11205 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
11206 "vst1.32 {d4, d5, d6, d7}, [%[out]:256]!\n" in Pack()
11208 "bne 1b\n" in Pack()
11211 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
11212 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
11213 "vpaddl.u16 q8, q8\n" in Pack()
11214 "vpaddl.u16 q9, q9\n" in Pack()
11215 "vpaddl.u16 q10, q10\n" in Pack()
11216 "vpaddl.u16 q11, q11\n" in Pack()
11217 "vpaddl.u16 q12, q12\n" in Pack()
11218 "vpaddl.u16 q13, q13\n" in Pack()
11219 "vpaddl.u16 q14, q14\n" in Pack()
11220 "vpaddl.u16 q15, q15\n" in Pack()
11221 "vpadd.u32 d16, d16, d17\n" in Pack()
11222 "vpadd.u32 d18, d18, d19\n" in Pack()
11223 "vpadd.u32 d20, d20, d21\n" in Pack()
11224 "vpadd.u32 d22, d22, d23\n" in Pack()
11225 "vpadd.u32 d24, d24, d25\n" in Pack()
11226 "vpadd.u32 d26, d26, d27\n" in Pack()
11227 "vpadd.u32 d28, d28, d29\n" in Pack()
11228 "vpadd.u32 d30, d30, d31\n" in Pack()
11229 "vpadd.u32 d16, d16, d18\n" in Pack()
11230 "vpadd.u32 d17, d20, d22\n" in Pack()
11231 "vpadd.u32 d18, d24, d26\n" in Pack()
11232 "vpadd.u32 d19, d28, d30\n" in Pack()
11233 "vmul.i32 q8, q8, d0[0]\n" in Pack()
11234 "vmul.i32 q9, q9, d0[0]\n" in Pack()
11235 "vadd.i32 q8, q8, q1\n" in Pack()
11236 "vadd.i32 q9, q9, q1\n" in Pack()
11237 "vst1.32 {d16, d17, d18, d19}, [%[out]:256]\n" in Pack()
11262 "vmov.i16 q8, #0\n" in Pack()
11263 "vmov.i16 q9, #0\n" in Pack()
11264 "vmov.i16 q10, #0\n" in Pack()
11265 "vmov.i16 q11, #0\n" in Pack()
11266 "vmov.i16 q12, #0\n" in Pack()
11267 "vmov.i16 q13, #0\n" in Pack()
11268 "vmov.i16 q14, #0\n" in Pack()
11269 "vmov.i16 q15, #0\n" in Pack()
11272 "subs %[count], %[count], #1\n" in Pack()
11273 "beq 2f\n" in Pack()
11276 "subs %[count], %[count], #8\n" in Pack()
11279 "vld1.32 {d0}, [%[in]], %[stride]\n" in Pack()
11280 "vld1.32 {d1}, [%[in]], %[stride]\n" in Pack()
11281 "vld1.32 {d2}, [%[in]], %[stride]\n" in Pack()
11282 "vld1.32 {d3}, [%[in]], %[stride]\n" in Pack()
11283 "vld1.32 {d4}, [%[in]], %[stride]\n" in Pack()
11284 "vld1.32 {d5}, [%[in]], %[stride]\n" in Pack()
11285 "vld1.32 {d6}, [%[in]], %[stride]\n" in Pack()
11286 "vld1.32 {d7}, [%[in]], %[stride]\n" in Pack()
11287 "pld [%[in]]\n" in Pack()
11288 "vtrn.8 d0, d1\n" in Pack()
11289 "vtrn.8 d2, d3\n" in Pack()
11290 "vtrn.8 d4, d5\n" in Pack()
11291 "vtrn.8 d6, d7\n" in Pack()
11292 "vtrn.16 d0, d2\n" in Pack()
11293 "vtrn.16 d1, d3\n" in Pack()
11294 "vtrn.16 d4, d6\n" in Pack()
11295 "vtrn.16 d5, d7\n" in Pack()
11296 "vtrn.32 d0, d4\n" in Pack()
11297 "vtrn.32 d1, d5\n" in Pack()
11298 "vtrn.32 d2, d6\n" in Pack()
11299 "vtrn.32 d3, d7\n" in Pack()
11300 "vaddw.u8 q8, q8, d0\n" in Pack()
11301 "vaddw.u8 q9, q9, d1\n" in Pack()
11302 "vaddw.u8 q10, q10, d2\n" in Pack()
11303 "vaddw.u8 q11, q11, d3\n" in Pack()
11304 "vaddw.u8 q12, q12, d4\n" in Pack()
11305 "vaddw.u8 q13, q13, d5\n" in Pack()
11306 "vaddw.u8 q14, q14, d6\n" in Pack()
11307 "vaddw.u8 q15, q15, d7\n" in Pack()
11308 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
11309 "vst1.32 {d4, d5, d6, d7}, [%[out]:256]!\n" in Pack()
11311 "bne 1b\n" in Pack()
11316 "vmov.i8 d0, #0\n" in Pack()
11317 "vmov.i8 d1, #0\n" in Pack()
11318 "vmov.i8 d2, #0\n" in Pack()
11319 "vmov.i8 d3, #0\n" in Pack()
11320 "vmov.i8 d4, #0\n" in Pack()
11321 "vmov.i8 d5, #0\n" in Pack()
11322 "vmov.i8 d6, #0\n" in Pack()
11323 "vmov.i8 d7, #0\n" in Pack()
11324 "vld1.32 {d0}, [%[in]], %[stride]\n" in Pack()
11325 "pld [%[in]]\n" in Pack()
11326 "vtrn.8 d0, d1\n" in Pack()
11327 "vtrn.8 d2, d3\n" in Pack()
11328 "vtrn.8 d4, d5\n" in Pack()
11329 "vtrn.8 d6, d7\n" in Pack()
11330 "vtrn.16 d0, d2\n" in Pack()
11331 "vtrn.16 d1, d3\n" in Pack()
11332 "vtrn.16 d4, d6\n" in Pack()
11333 "vtrn.16 d5, d7\n" in Pack()
11334 "vtrn.32 d0, d4\n" in Pack()
11335 "vtrn.32 d1, d5\n" in Pack()
11336 "vtrn.32 d2, d6\n" in Pack()
11337 "vtrn.32 d3, d7\n" in Pack()
11338 "vaddw.u8 q8, q8, d0\n" in Pack()
11339 "vaddw.u8 q9, q9, d1\n" in Pack()
11340 "vaddw.u8 q10, q10, d2\n" in Pack()
11341 "vaddw.u8 q11, q11, d3\n" in Pack()
11342 "vaddw.u8 q12, q12, d4\n" in Pack()
11343 "vaddw.u8 q13, q13, d5\n" in Pack()
11344 "vaddw.u8 q14, q14, d6\n" in Pack()
11345 "vaddw.u8 q15, q15, d7\n" in Pack()
11346 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
11347 "vst1.32 {d4, d5, d6, d7}, [%[out]:256]!\n" in Pack()
11350 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
11351 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
11352 "vpaddl.u16 q8, q8\n" in Pack()
11353 "vpaddl.u16 q9, q9\n" in Pack()
11354 "vpaddl.u16 q10, q10\n" in Pack()
11355 "vpaddl.u16 q11, q11\n" in Pack()
11356 "vpaddl.u16 q12, q12\n" in Pack()
11357 "vpaddl.u16 q13, q13\n" in Pack()
11358 "vpaddl.u16 q14, q14\n" in Pack()
11359 "vpaddl.u16 q15, q15\n" in Pack()
11360 "vpadd.u32 d16, d16, d17\n" in Pack()
11361 "vpadd.u32 d18, d18, d19\n" in Pack()
11362 "vpadd.u32 d20, d20, d21\n" in Pack()
11363 "vpadd.u32 d22, d22, d23\n" in Pack()
11364 "vpadd.u32 d24, d24, d25\n" in Pack()
11365 "vpadd.u32 d26, d26, d27\n" in Pack()
11366 "vpadd.u32 d28, d28, d29\n" in Pack()
11367 "vpadd.u32 d30, d30, d31\n" in Pack()
11368 "vpadd.u32 d16, d16, d18\n" in Pack()
11369 "vpadd.u32 d17, d20, d22\n" in Pack()
11370 "vpadd.u32 d18, d24, d26\n" in Pack()
11371 "vpadd.u32 d19, d28, d30\n" in Pack()
11372 "vmul.i32 q8, q8, d0[0]\n" in Pack()
11373 "vmul.i32 q9, q9, d0[0]\n" in Pack()
11374 "vadd.i32 q8, q8, q1\n" in Pack()
11375 "vadd.i32 q9, q9, q1\n" in Pack()
11376 "vst1.32 {d16, d17, d18, d19}, [%[out]:256]\n" in Pack()
11401 "vmov.i16 q8, #0\n" in Pack()
11402 "vmov.i16 q9, #0\n" in Pack()
11403 "vmov.i16 q10, #0\n" in Pack()
11404 "vmov.i16 q11, #0\n" in Pack()
11405 "vmov.i16 q12, #0\n" in Pack()
11406 "vmov.i16 q13, #0\n" in Pack()
11407 "vmov.i16 q14, #0\n" in Pack()
11408 "vmov.i16 q15, #0\n" in Pack()
11411 "subs %[count], %[count], #2\n" in Pack()
11412 "beq 2f\n" in Pack()
11415 "subs %[count], %[count], #8\n" in Pack()
11418 "vld1.32 {d0}, [%[in]], %[stride]\n" in Pack()
11419 "vld1.32 {d1}, [%[in]], %[stride]\n" in Pack()
11420 "vld1.32 {d2}, [%[in]], %[stride]\n" in Pack()
11421 "vld1.32 {d3}, [%[in]], %[stride]\n" in Pack()
11422 "vld1.32 {d4}, [%[in]], %[stride]\n" in Pack()
11423 "vld1.32 {d5}, [%[in]], %[stride]\n" in Pack()
11424 "vld1.32 {d6}, [%[in]], %[stride]\n" in Pack()
11425 "vld1.32 {d7}, [%[in]], %[stride]\n" in Pack()
11426 "pld [%[in]]\n" in Pack()
11427 "vtrn.8 d0, d1\n" in Pack()
11428 "vtrn.8 d2, d3\n" in Pack()
11429 "vtrn.8 d4, d5\n" in Pack()
11430 "vtrn.8 d6, d7\n" in Pack()
11431 "vtrn.16 d0, d2\n" in Pack()
11432 "vtrn.16 d1, d3\n" in Pack()
11433 "vtrn.16 d4, d6\n" in Pack()
11434 "vtrn.16 d5, d7\n" in Pack()
11435 "vtrn.32 d0, d4\n" in Pack()
11436 "vtrn.32 d1, d5\n" in Pack()
11437 "vtrn.32 d2, d6\n" in Pack()
11438 "vtrn.32 d3, d7\n" in Pack()
11439 "vaddw.u8 q8, q8, d0\n" in Pack()
11440 "vaddw.u8 q9, q9, d1\n" in Pack()
11441 "vaddw.u8 q10, q10, d2\n" in Pack()
11442 "vaddw.u8 q11, q11, d3\n" in Pack()
11443 "vaddw.u8 q12, q12, d4\n" in Pack()
11444 "vaddw.u8 q13, q13, d5\n" in Pack()
11445 "vaddw.u8 q14, q14, d6\n" in Pack()
11446 "vaddw.u8 q15, q15, d7\n" in Pack()
11447 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
11448 "vst1.32 {d4, d5, d6, d7}, [%[out]:256]!\n" in Pack()
11450 "bne 1b\n" in Pack()
11455 "vmov.i8 d0, #0\n" in Pack()
11456 "vmov.i8 d1, #0\n" in Pack()
11457 "vmov.i8 d2, #0\n" in Pack()
11458 "vmov.i8 d3, #0\n" in Pack()
11459 "vmov.i8 d4, #0\n" in Pack()
11460 "vmov.i8 d5, #0\n" in Pack()
11461 "vmov.i8 d6, #0\n" in Pack()
11462 "vmov.i8 d7, #0\n" in Pack()
11463 "vld1.32 {d0}, [%[in]], %[stride]\n" in Pack()
11464 "vld1.32 {d1}, [%[in]], %[stride]\n" in Pack()
11465 "pld [%[in]]\n" in Pack()
11466 "vtrn.8 d0, d1\n" in Pack()
11467 "vtrn.8 d2, d3\n" in Pack()
11468 "vtrn.8 d4, d5\n" in Pack()
11469 "vtrn.8 d6, d7\n" in Pack()
11470 "vtrn.16 d0, d2\n" in Pack()
11471 "vtrn.16 d1, d3\n" in Pack()
11472 "vtrn.16 d4, d6\n" in Pack()
11473 "vtrn.16 d5, d7\n" in Pack()
11474 "vtrn.32 d0, d4\n" in Pack()
11475 "vtrn.32 d1, d5\n" in Pack()
11476 "vtrn.32 d2, d6\n" in Pack()
11477 "vtrn.32 d3, d7\n" in Pack()
11478 "vaddw.u8 q8, q8, d0\n" in Pack()
11479 "vaddw.u8 q9, q9, d1\n" in Pack()
11480 "vaddw.u8 q10, q10, d2\n" in Pack()
11481 "vaddw.u8 q11, q11, d3\n" in Pack()
11482 "vaddw.u8 q12, q12, d4\n" in Pack()
11483 "vaddw.u8 q13, q13, d5\n" in Pack()
11484 "vaddw.u8 q14, q14, d6\n" in Pack()
11485 "vaddw.u8 q15, q15, d7\n" in Pack()
11486 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
11487 "vst1.32 {d4, d5, d6, d7}, [%[out]:256]!\n" in Pack()
11490 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
11491 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
11492 "vpaddl.u16 q8, q8\n" in Pack()
11493 "vpaddl.u16 q9, q9\n" in Pack()
11494 "vpaddl.u16 q10, q10\n" in Pack()
11495 "vpaddl.u16 q11, q11\n" in Pack()
11496 "vpaddl.u16 q12, q12\n" in Pack()
11497 "vpaddl.u16 q13, q13\n" in Pack()
11498 "vpaddl.u16 q14, q14\n" in Pack()
11499 "vpaddl.u16 q15, q15\n" in Pack()
11500 "vpadd.u32 d16, d16, d17\n" in Pack()
11501 "vpadd.u32 d18, d18, d19\n" in Pack()
11502 "vpadd.u32 d20, d20, d21\n" in Pack()
11503 "vpadd.u32 d22, d22, d23\n" in Pack()
11504 "vpadd.u32 d24, d24, d25\n" in Pack()
11505 "vpadd.u32 d26, d26, d27\n" in Pack()
11506 "vpadd.u32 d28, d28, d29\n" in Pack()
11507 "vpadd.u32 d30, d30, d31\n" in Pack()
11508 "vpadd.u32 d16, d16, d18\n" in Pack()
11509 "vpadd.u32 d17, d20, d22\n" in Pack()
11510 "vpadd.u32 d18, d24, d26\n" in Pack()
11511 "vpadd.u32 d19, d28, d30\n" in Pack()
11512 "vmul.i32 q8, q8, d0[0]\n" in Pack()
11513 "vmul.i32 q9, q9, d0[0]\n" in Pack()
11514 "vadd.i32 q8, q8, q1\n" in Pack()
11515 "vadd.i32 q9, q9, q1\n" in Pack()
11516 "vst1.32 {d16, d17, d18, d19}, [%[out]:256]\n" in Pack()
11541 "vmov.i16 q8, #0\n" in Pack()
11542 "vmov.i16 q9, #0\n" in Pack()
11543 "vmov.i16 q10, #0\n" in Pack()
11544 "vmov.i16 q11, #0\n" in Pack()
11545 "vmov.i16 q12, #0\n" in Pack()
11546 "vmov.i16 q13, #0\n" in Pack()
11547 "vmov.i16 q14, #0\n" in Pack()
11548 "vmov.i16 q15, #0\n" in Pack()
11551 "subs %[count], %[count], #3\n" in Pack()
11552 "beq 2f\n" in Pack()
11555 "subs %[count], %[count], #8\n" in Pack()
11558 "vld1.32 {d0}, [%[in]], %[stride]\n" in Pack()
11559 "vld1.32 {d1}, [%[in]], %[stride]\n" in Pack()
11560 "vld1.32 {d2}, [%[in]], %[stride]\n" in Pack()
11561 "vld1.32 {d3}, [%[in]], %[stride]\n" in Pack()
11562 "vld1.32 {d4}, [%[in]], %[stride]\n" in Pack()
11563 "vld1.32 {d5}, [%[in]], %[stride]\n" in Pack()
11564 "vld1.32 {d6}, [%[in]], %[stride]\n" in Pack()
11565 "vld1.32 {d7}, [%[in]], %[stride]\n" in Pack()
11566 "pld [%[in]]\n" in Pack()
11567 "vtrn.8 d0, d1\n" in Pack()
11568 "vtrn.8 d2, d3\n" in Pack()
11569 "vtrn.8 d4, d5\n" in Pack()
11570 "vtrn.8 d6, d7\n" in Pack()
11571 "vtrn.16 d0, d2\n" in Pack()
11572 "vtrn.16 d1, d3\n" in Pack()
11573 "vtrn.16 d4, d6\n" in Pack()
11574 "vtrn.16 d5, d7\n" in Pack()
11575 "vtrn.32 d0, d4\n" in Pack()
11576 "vtrn.32 d1, d5\n" in Pack()
11577 "vtrn.32 d2, d6\n" in Pack()
11578 "vtrn.32 d3, d7\n" in Pack()
11579 "vaddw.u8 q8, q8, d0\n" in Pack()
11580 "vaddw.u8 q9, q9, d1\n" in Pack()
11581 "vaddw.u8 q10, q10, d2\n" in Pack()
11582 "vaddw.u8 q11, q11, d3\n" in Pack()
11583 "vaddw.u8 q12, q12, d4\n" in Pack()
11584 "vaddw.u8 q13, q13, d5\n" in Pack()
11585 "vaddw.u8 q14, q14, d6\n" in Pack()
11586 "vaddw.u8 q15, q15, d7\n" in Pack()
11587 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
11588 "vst1.32 {d4, d5, d6, d7}, [%[out]:256]!\n" in Pack()
11590 "bne 1b\n" in Pack()
11595 "vmov.i8 d0, #0\n" in Pack()
11596 "vmov.i8 d1, #0\n" in Pack()
11597 "vmov.i8 d2, #0\n" in Pack()
11598 "vmov.i8 d3, #0\n" in Pack()
11599 "vmov.i8 d4, #0\n" in Pack()
11600 "vmov.i8 d5, #0\n" in Pack()
11601 "vmov.i8 d6, #0\n" in Pack()
11602 "vmov.i8 d7, #0\n" in Pack()
11603 "vld1.32 {d0}, [%[in]], %[stride]\n" in Pack()
11604 "vld1.32 {d1}, [%[in]], %[stride]\n" in Pack()
11605 "vld1.32 {d2}, [%[in]], %[stride]\n" in Pack()
11606 "pld [%[in]]\n" in Pack()
11607 "vtrn.8 d0, d1\n" in Pack()
11608 "vtrn.8 d2, d3\n" in Pack()
11609 "vtrn.8 d4, d5\n" in Pack()
11610 "vtrn.8 d6, d7\n" in Pack()
11611 "vtrn.16 d0, d2\n" in Pack()
11612 "vtrn.16 d1, d3\n" in Pack()
11613 "vtrn.16 d4, d6\n" in Pack()
11614 "vtrn.16 d5, d7\n" in Pack()
11615 "vtrn.32 d0, d4\n" in Pack()
11616 "vtrn.32 d1, d5\n" in Pack()
11617 "vtrn.32 d2, d6\n" in Pack()
11618 "vtrn.32 d3, d7\n" in Pack()
11619 "vaddw.u8 q8, q8, d0\n" in Pack()
11620 "vaddw.u8 q9, q9, d1\n" in Pack()
11621 "vaddw.u8 q10, q10, d2\n" in Pack()
11622 "vaddw.u8 q11, q11, d3\n" in Pack()
11623 "vaddw.u8 q12, q12, d4\n" in Pack()
11624 "vaddw.u8 q13, q13, d5\n" in Pack()
11625 "vaddw.u8 q14, q14, d6\n" in Pack()
11626 "vaddw.u8 q15, q15, d7\n" in Pack()
11627 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
11628 "vst1.32 {d4, d5, d6, d7}, [%[out]:256]!\n" in Pack()
11631 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
11632 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
11633 "vpaddl.u16 q8, q8\n" in Pack()
11634 "vpaddl.u16 q9, q9\n" in Pack()
11635 "vpaddl.u16 q10, q10\n" in Pack()
11636 "vpaddl.u16 q11, q11\n" in Pack()
11637 "vpaddl.u16 q12, q12\n" in Pack()
11638 "vpaddl.u16 q13, q13\n" in Pack()
11639 "vpaddl.u16 q14, q14\n" in Pack()
11640 "vpaddl.u16 q15, q15\n" in Pack()
11641 "vpadd.u32 d16, d16, d17\n" in Pack()
11642 "vpadd.u32 d18, d18, d19\n" in Pack()
11643 "vpadd.u32 d20, d20, d21\n" in Pack()
11644 "vpadd.u32 d22, d22, d23\n" in Pack()
11645 "vpadd.u32 d24, d24, d25\n" in Pack()
11646 "vpadd.u32 d26, d26, d27\n" in Pack()
11647 "vpadd.u32 d28, d28, d29\n" in Pack()
11648 "vpadd.u32 d30, d30, d31\n" in Pack()
11649 "vpadd.u32 d16, d16, d18\n" in Pack()
11650 "vpadd.u32 d17, d20, d22\n" in Pack()
11651 "vpadd.u32 d18, d24, d26\n" in Pack()
11652 "vpadd.u32 d19, d28, d30\n" in Pack()
11653 "vmul.i32 q8, q8, d0[0]\n" in Pack()
11654 "vmul.i32 q9, q9, d0[0]\n" in Pack()
11655 "vadd.i32 q8, q8, q1\n" in Pack()
11656 "vadd.i32 q9, q9, q1\n" in Pack()
11657 "vst1.32 {d16, d17, d18, d19}, [%[out]:256]\n" in Pack()
11682 "vmov.i16 q8, #0\n" in Pack()
11683 "vmov.i16 q9, #0\n" in Pack()
11684 "vmov.i16 q10, #0\n" in Pack()
11685 "vmov.i16 q11, #0\n" in Pack()
11686 "vmov.i16 q12, #0\n" in Pack()
11687 "vmov.i16 q13, #0\n" in Pack()
11688 "vmov.i16 q14, #0\n" in Pack()
11689 "vmov.i16 q15, #0\n" in Pack()
11692 "subs %[count], %[count], #4\n" in Pack()
11693 "beq 2f\n" in Pack()
11696 "subs %[count], %[count], #8\n" in Pack()
11699 "vld1.32 {d0}, [%[in]], %[stride]\n" in Pack()
11700 "vld1.32 {d1}, [%[in]], %[stride]\n" in Pack()
11701 "vld1.32 {d2}, [%[in]], %[stride]\n" in Pack()
11702 "vld1.32 {d3}, [%[in]], %[stride]\n" in Pack()
11703 "vld1.32 {d4}, [%[in]], %[stride]\n" in Pack()
11704 "vld1.32 {d5}, [%[in]], %[stride]\n" in Pack()
11705 "vld1.32 {d6}, [%[in]], %[stride]\n" in Pack()
11706 "vld1.32 {d7}, [%[in]], %[stride]\n" in Pack()
11707 "pld [%[in]]\n" in Pack()
11708 "vtrn.8 d0, d1\n" in Pack()
11709 "vtrn.8 d2, d3\n" in Pack()
11710 "vtrn.8 d4, d5\n" in Pack()
11711 "vtrn.8 d6, d7\n" in Pack()
11712 "vtrn.16 d0, d2\n" in Pack()
11713 "vtrn.16 d1, d3\n" in Pack()
11714 "vtrn.16 d4, d6\n" in Pack()
11715 "vtrn.16 d5, d7\n" in Pack()
11716 "vtrn.32 d0, d4\n" in Pack()
11717 "vtrn.32 d1, d5\n" in Pack()
11718 "vtrn.32 d2, d6\n" in Pack()
11719 "vtrn.32 d3, d7\n" in Pack()
11720 "vaddw.u8 q8, q8, d0\n" in Pack()
11721 "vaddw.u8 q9, q9, d1\n" in Pack()
11722 "vaddw.u8 q10, q10, d2\n" in Pack()
11723 "vaddw.u8 q11, q11, d3\n" in Pack()
11724 "vaddw.u8 q12, q12, d4\n" in Pack()
11725 "vaddw.u8 q13, q13, d5\n" in Pack()
11726 "vaddw.u8 q14, q14, d6\n" in Pack()
11727 "vaddw.u8 q15, q15, d7\n" in Pack()
11728 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
11729 "vst1.32 {d4, d5, d6, d7}, [%[out]:256]!\n" in Pack()
11731 "bne 1b\n" in Pack()
11736 "vmov.i8 d0, #0\n" in Pack()
11737 "vmov.i8 d1, #0\n" in Pack()
11738 "vmov.i8 d2, #0\n" in Pack()
11739 "vmov.i8 d3, #0\n" in Pack()
11740 "vmov.i8 d4, #0\n" in Pack()
11741 "vmov.i8 d5, #0\n" in Pack()
11742 "vmov.i8 d6, #0\n" in Pack()
11743 "vmov.i8 d7, #0\n" in Pack()
11744 "vld1.32 {d0}, [%[in]], %[stride]\n" in Pack()
11745 "vld1.32 {d1}, [%[in]], %[stride]\n" in Pack()
11746 "vld1.32 {d2}, [%[in]], %[stride]\n" in Pack()
11747 "vld1.32 {d3}, [%[in]], %[stride]\n" in Pack()
11748 "pld [%[in]]\n" in Pack()
11749 "vtrn.8 d0, d1\n" in Pack()
11750 "vtrn.8 d2, d3\n" in Pack()
11751 "vtrn.8 d4, d5\n" in Pack()
11752 "vtrn.8 d6, d7\n" in Pack()
11753 "vtrn.16 d0, d2\n" in Pack()
11754 "vtrn.16 d1, d3\n" in Pack()
11755 "vtrn.16 d4, d6\n" in Pack()
11756 "vtrn.16 d5, d7\n" in Pack()
11757 "vtrn.32 d0, d4\n" in Pack()
11758 "vtrn.32 d1, d5\n" in Pack()
11759 "vtrn.32 d2, d6\n" in Pack()
11760 "vtrn.32 d3, d7\n" in Pack()
11761 "vaddw.u8 q8, q8, d0\n" in Pack()
11762 "vaddw.u8 q9, q9, d1\n" in Pack()
11763 "vaddw.u8 q10, q10, d2\n" in Pack()
11764 "vaddw.u8 q11, q11, d3\n" in Pack()
11765 "vaddw.u8 q12, q12, d4\n" in Pack()
11766 "vaddw.u8 q13, q13, d5\n" in Pack()
11767 "vaddw.u8 q14, q14, d6\n" in Pack()
11768 "vaddw.u8 q15, q15, d7\n" in Pack()
11769 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
11770 "vst1.32 {d4, d5, d6, d7}, [%[out]:256]!\n" in Pack()
11773 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
11774 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
11775 "vpaddl.u16 q8, q8\n" in Pack()
11776 "vpaddl.u16 q9, q9\n" in Pack()
11777 "vpaddl.u16 q10, q10\n" in Pack()
11778 "vpaddl.u16 q11, q11\n" in Pack()
11779 "vpaddl.u16 q12, q12\n" in Pack()
11780 "vpaddl.u16 q13, q13\n" in Pack()
11781 "vpaddl.u16 q14, q14\n" in Pack()
11782 "vpaddl.u16 q15, q15\n" in Pack()
11783 "vpadd.u32 d16, d16, d17\n" in Pack()
11784 "vpadd.u32 d18, d18, d19\n" in Pack()
11785 "vpadd.u32 d20, d20, d21\n" in Pack()
11786 "vpadd.u32 d22, d22, d23\n" in Pack()
11787 "vpadd.u32 d24, d24, d25\n" in Pack()
11788 "vpadd.u32 d26, d26, d27\n" in Pack()
11789 "vpadd.u32 d28, d28, d29\n" in Pack()
11790 "vpadd.u32 d30, d30, d31\n" in Pack()
11791 "vpadd.u32 d16, d16, d18\n" in Pack()
11792 "vpadd.u32 d17, d20, d22\n" in Pack()
11793 "vpadd.u32 d18, d24, d26\n" in Pack()
11794 "vpadd.u32 d19, d28, d30\n" in Pack()
11795 "vmul.i32 q8, q8, d0[0]\n" in Pack()
11796 "vmul.i32 q9, q9, d0[0]\n" in Pack()
11797 "vadd.i32 q8, q8, q1\n" in Pack()
11798 "vadd.i32 q9, q9, q1\n" in Pack()
11799 "vst1.32 {d16, d17, d18, d19}, [%[out]:256]\n" in Pack()
11824 "vmov.i16 q8, #0\n" in Pack()
11825 "vmov.i16 q9, #0\n" in Pack()
11826 "vmov.i16 q10, #0\n" in Pack()
11827 "vmov.i16 q11, #0\n" in Pack()
11828 "vmov.i16 q12, #0\n" in Pack()
11829 "vmov.i16 q13, #0\n" in Pack()
11830 "vmov.i16 q14, #0\n" in Pack()
11831 "vmov.i16 q15, #0\n" in Pack()
11834 "subs %[count], %[count], #5\n" in Pack()
11835 "beq 2f\n" in Pack()
11838 "subs %[count], %[count], #8\n" in Pack()
11841 "vld1.32 {d0}, [%[in]], %[stride]\n" in Pack()
11842 "vld1.32 {d1}, [%[in]], %[stride]\n" in Pack()
11843 "vld1.32 {d2}, [%[in]], %[stride]\n" in Pack()
11844 "vld1.32 {d3}, [%[in]], %[stride]\n" in Pack()
11845 "vld1.32 {d4}, [%[in]], %[stride]\n" in Pack()
11846 "vld1.32 {d5}, [%[in]], %[stride]\n" in Pack()
11847 "vld1.32 {d6}, [%[in]], %[stride]\n" in Pack()
11848 "vld1.32 {d7}, [%[in]], %[stride]\n" in Pack()
11849 "pld [%[in]]\n" in Pack()
11850 "vtrn.8 d0, d1\n" in Pack()
11851 "vtrn.8 d2, d3\n" in Pack()
11852 "vtrn.8 d4, d5\n" in Pack()
11853 "vtrn.8 d6, d7\n" in Pack()
11854 "vtrn.16 d0, d2\n" in Pack()
11855 "vtrn.16 d1, d3\n" in Pack()
11856 "vtrn.16 d4, d6\n" in Pack()
11857 "vtrn.16 d5, d7\n" in Pack()
11858 "vtrn.32 d0, d4\n" in Pack()
11859 "vtrn.32 d1, d5\n" in Pack()
11860 "vtrn.32 d2, d6\n" in Pack()
11861 "vtrn.32 d3, d7\n" in Pack()
11862 "vaddw.u8 q8, q8, d0\n" in Pack()
11863 "vaddw.u8 q9, q9, d1\n" in Pack()
11864 "vaddw.u8 q10, q10, d2\n" in Pack()
11865 "vaddw.u8 q11, q11, d3\n" in Pack()
11866 "vaddw.u8 q12, q12, d4\n" in Pack()
11867 "vaddw.u8 q13, q13, d5\n" in Pack()
11868 "vaddw.u8 q14, q14, d6\n" in Pack()
11869 "vaddw.u8 q15, q15, d7\n" in Pack()
11870 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
11871 "vst1.32 {d4, d5, d6, d7}, [%[out]:256]!\n" in Pack()
11873 "bne 1b\n" in Pack()
11878 "vmov.i8 d0, #0\n" in Pack()
11879 "vmov.i8 d1, #0\n" in Pack()
11880 "vmov.i8 d2, #0\n" in Pack()
11881 "vmov.i8 d3, #0\n" in Pack()
11882 "vmov.i8 d4, #0\n" in Pack()
11883 "vmov.i8 d5, #0\n" in Pack()
11884 "vmov.i8 d6, #0\n" in Pack()
11885 "vmov.i8 d7, #0\n" in Pack()
11886 "vld1.32 {d0}, [%[in]], %[stride]\n" in Pack()
11887 "vld1.32 {d1}, [%[in]], %[stride]\n" in Pack()
11888 "vld1.32 {d2}, [%[in]], %[stride]\n" in Pack()
11889 "vld1.32 {d3}, [%[in]], %[stride]\n" in Pack()
11890 "vld1.32 {d4}, [%[in]], %[stride]\n" in Pack()
11891 "pld [%[in]]\n" in Pack()
11892 "vtrn.8 d0, d1\n" in Pack()
11893 "vtrn.8 d2, d3\n" in Pack()
11894 "vtrn.8 d4, d5\n" in Pack()
11895 "vtrn.8 d6, d7\n" in Pack()
11896 "vtrn.16 d0, d2\n" in Pack()
11897 "vtrn.16 d1, d3\n" in Pack()
11898 "vtrn.16 d4, d6\n" in Pack()
11899 "vtrn.16 d5, d7\n" in Pack()
11900 "vtrn.32 d0, d4\n" in Pack()
11901 "vtrn.32 d1, d5\n" in Pack()
11902 "vtrn.32 d2, d6\n" in Pack()
11903 "vtrn.32 d3, d7\n" in Pack()
11904 "vaddw.u8 q8, q8, d0\n" in Pack()
11905 "vaddw.u8 q9, q9, d1\n" in Pack()
11906 "vaddw.u8 q10, q10, d2\n" in Pack()
11907 "vaddw.u8 q11, q11, d3\n" in Pack()
11908 "vaddw.u8 q12, q12, d4\n" in Pack()
11909 "vaddw.u8 q13, q13, d5\n" in Pack()
11910 "vaddw.u8 q14, q14, d6\n" in Pack()
11911 "vaddw.u8 q15, q15, d7\n" in Pack()
11912 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
11913 "vst1.32 {d4, d5, d6, d7}, [%[out]:256]!\n" in Pack()
11916 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
11917 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
11918 "vpaddl.u16 q8, q8\n" in Pack()
11919 "vpaddl.u16 q9, q9\n" in Pack()
11920 "vpaddl.u16 q10, q10\n" in Pack()
11921 "vpaddl.u16 q11, q11\n" in Pack()
11922 "vpaddl.u16 q12, q12\n" in Pack()
11923 "vpaddl.u16 q13, q13\n" in Pack()
11924 "vpaddl.u16 q14, q14\n" in Pack()
11925 "vpaddl.u16 q15, q15\n" in Pack()
11926 "vpadd.u32 d16, d16, d17\n" in Pack()
11927 "vpadd.u32 d18, d18, d19\n" in Pack()
11928 "vpadd.u32 d20, d20, d21\n" in Pack()
11929 "vpadd.u32 d22, d22, d23\n" in Pack()
11930 "vpadd.u32 d24, d24, d25\n" in Pack()
11931 "vpadd.u32 d26, d26, d27\n" in Pack()
11932 "vpadd.u32 d28, d28, d29\n" in Pack()
11933 "vpadd.u32 d30, d30, d31\n" in Pack()
11934 "vpadd.u32 d16, d16, d18\n" in Pack()
11935 "vpadd.u32 d17, d20, d22\n" in Pack()
11936 "vpadd.u32 d18, d24, d26\n" in Pack()
11937 "vpadd.u32 d19, d28, d30\n" in Pack()
11938 "vmul.i32 q8, q8, d0[0]\n" in Pack()
11939 "vmul.i32 q9, q9, d0[0]\n" in Pack()
11940 "vadd.i32 q8, q8, q1\n" in Pack()
11941 "vadd.i32 q9, q9, q1\n" in Pack()
11942 "vst1.32 {d16, d17, d18, d19}, [%[out]:256]\n" in Pack()
11967 "vmov.i16 q8, #0\n" in Pack()
11968 "vmov.i16 q9, #0\n" in Pack()
11969 "vmov.i16 q10, #0\n" in Pack()
11970 "vmov.i16 q11, #0\n" in Pack()
11971 "vmov.i16 q12, #0\n" in Pack()
11972 "vmov.i16 q13, #0\n" in Pack()
11973 "vmov.i16 q14, #0\n" in Pack()
11974 "vmov.i16 q15, #0\n" in Pack()
11977 "subs %[count], %[count], #6\n" in Pack()
11978 "beq 2f\n" in Pack()
11981 "subs %[count], %[count], #8\n" in Pack()
11984 "vld1.32 {d0}, [%[in]], %[stride]\n" in Pack()
11985 "vld1.32 {d1}, [%[in]], %[stride]\n" in Pack()
11986 "vld1.32 {d2}, [%[in]], %[stride]\n" in Pack()
11987 "vld1.32 {d3}, [%[in]], %[stride]\n" in Pack()
11988 "vld1.32 {d4}, [%[in]], %[stride]\n" in Pack()
11989 "vld1.32 {d5}, [%[in]], %[stride]\n" in Pack()
11990 "vld1.32 {d6}, [%[in]], %[stride]\n" in Pack()
11991 "vld1.32 {d7}, [%[in]], %[stride]\n" in Pack()
11992 "pld [%[in]]\n" in Pack()
11993 "vtrn.8 d0, d1\n" in Pack()
11994 "vtrn.8 d2, d3\n" in Pack()
11995 "vtrn.8 d4, d5\n" in Pack()
11996 "vtrn.8 d6, d7\n" in Pack()
11997 "vtrn.16 d0, d2\n" in Pack()
11998 "vtrn.16 d1, d3\n" in Pack()
11999 "vtrn.16 d4, d6\n" in Pack()
12000 "vtrn.16 d5, d7\n" in Pack()
12001 "vtrn.32 d0, d4\n" in Pack()
12002 "vtrn.32 d1, d5\n" in Pack()
12003 "vtrn.32 d2, d6\n" in Pack()
12004 "vtrn.32 d3, d7\n" in Pack()
12005 "vaddw.u8 q8, q8, d0\n" in Pack()
12006 "vaddw.u8 q9, q9, d1\n" in Pack()
12007 "vaddw.u8 q10, q10, d2\n" in Pack()
12008 "vaddw.u8 q11, q11, d3\n" in Pack()
12009 "vaddw.u8 q12, q12, d4\n" in Pack()
12010 "vaddw.u8 q13, q13, d5\n" in Pack()
12011 "vaddw.u8 q14, q14, d6\n" in Pack()
12012 "vaddw.u8 q15, q15, d7\n" in Pack()
12013 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
12014 "vst1.32 {d4, d5, d6, d7}, [%[out]:256]!\n" in Pack()
12016 "bne 1b\n" in Pack()
12021 "vmov.i8 d0, #0\n" in Pack()
12022 "vmov.i8 d1, #0\n" in Pack()
12023 "vmov.i8 d2, #0\n" in Pack()
12024 "vmov.i8 d3, #0\n" in Pack()
12025 "vmov.i8 d4, #0\n" in Pack()
12026 "vmov.i8 d5, #0\n" in Pack()
12027 "vmov.i8 d6, #0\n" in Pack()
12028 "vmov.i8 d7, #0\n" in Pack()
12029 "vld1.32 {d0}, [%[in]], %[stride]\n" in Pack()
12030 "vld1.32 {d1}, [%[in]], %[stride]\n" in Pack()
12031 "vld1.32 {d2}, [%[in]], %[stride]\n" in Pack()
12032 "vld1.32 {d3}, [%[in]], %[stride]\n" in Pack()
12033 "vld1.32 {d4}, [%[in]], %[stride]\n" in Pack()
12034 "vld1.32 {d5}, [%[in]], %[stride]\n" in Pack()
12035 "pld [%[in]]\n" in Pack()
12036 "vtrn.8 d0, d1\n" in Pack()
12037 "vtrn.8 d2, d3\n" in Pack()
12038 "vtrn.8 d4, d5\n" in Pack()
12039 "vtrn.8 d6, d7\n" in Pack()
12040 "vtrn.16 d0, d2\n" in Pack()
12041 "vtrn.16 d1, d3\n" in Pack()
12042 "vtrn.16 d4, d6\n" in Pack()
12043 "vtrn.16 d5, d7\n" in Pack()
12044 "vtrn.32 d0, d4\n" in Pack()
12045 "vtrn.32 d1, d5\n" in Pack()
12046 "vtrn.32 d2, d6\n" in Pack()
12047 "vtrn.32 d3, d7\n" in Pack()
12048 "vaddw.u8 q8, q8, d0\n" in Pack()
12049 "vaddw.u8 q9, q9, d1\n" in Pack()
12050 "vaddw.u8 q10, q10, d2\n" in Pack()
12051 "vaddw.u8 q11, q11, d3\n" in Pack()
12052 "vaddw.u8 q12, q12, d4\n" in Pack()
12053 "vaddw.u8 q13, q13, d5\n" in Pack()
12054 "vaddw.u8 q14, q14, d6\n" in Pack()
12055 "vaddw.u8 q15, q15, d7\n" in Pack()
12056 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
12057 "vst1.32 {d4, d5, d6, d7}, [%[out]:256]!\n" in Pack()
12060 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
12061 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
12062 "vpaddl.u16 q8, q8\n" in Pack()
12063 "vpaddl.u16 q9, q9\n" in Pack()
12064 "vpaddl.u16 q10, q10\n" in Pack()
12065 "vpaddl.u16 q11, q11\n" in Pack()
12066 "vpaddl.u16 q12, q12\n" in Pack()
12067 "vpaddl.u16 q13, q13\n" in Pack()
12068 "vpaddl.u16 q14, q14\n" in Pack()
12069 "vpaddl.u16 q15, q15\n" in Pack()
12070 "vpadd.u32 d16, d16, d17\n" in Pack()
12071 "vpadd.u32 d18, d18, d19\n" in Pack()
12072 "vpadd.u32 d20, d20, d21\n" in Pack()
12073 "vpadd.u32 d22, d22, d23\n" in Pack()
12074 "vpadd.u32 d24, d24, d25\n" in Pack()
12075 "vpadd.u32 d26, d26, d27\n" in Pack()
12076 "vpadd.u32 d28, d28, d29\n" in Pack()
12077 "vpadd.u32 d30, d30, d31\n" in Pack()
12078 "vpadd.u32 d16, d16, d18\n" in Pack()
12079 "vpadd.u32 d17, d20, d22\n" in Pack()
12080 "vpadd.u32 d18, d24, d26\n" in Pack()
12081 "vpadd.u32 d19, d28, d30\n" in Pack()
12082 "vmul.i32 q8, q8, d0[0]\n" in Pack()
12083 "vmul.i32 q9, q9, d0[0]\n" in Pack()
12084 "vadd.i32 q8, q8, q1\n" in Pack()
12085 "vadd.i32 q9, q9, q1\n" in Pack()
12086 "vst1.32 {d16, d17, d18, d19}, [%[out]:256]\n" in Pack()
12111 "vmov.i16 q8, #0\n" in Pack()
12112 "vmov.i16 q9, #0\n" in Pack()
12113 "vmov.i16 q10, #0\n" in Pack()
12114 "vmov.i16 q11, #0\n" in Pack()
12115 "vmov.i16 q12, #0\n" in Pack()
12116 "vmov.i16 q13, #0\n" in Pack()
12117 "vmov.i16 q14, #0\n" in Pack()
12118 "vmov.i16 q15, #0\n" in Pack()
12121 "subs %[count], %[count], #7\n" in Pack()
12122 "beq 2f\n" in Pack()
12125 "subs %[count], %[count], #8\n" in Pack()
12128 "vld1.32 {d0}, [%[in]], %[stride]\n" in Pack()
12129 "vld1.32 {d1}, [%[in]], %[stride]\n" in Pack()
12130 "vld1.32 {d2}, [%[in]], %[stride]\n" in Pack()
12131 "vld1.32 {d3}, [%[in]], %[stride]\n" in Pack()
12132 "vld1.32 {d4}, [%[in]], %[stride]\n" in Pack()
12133 "vld1.32 {d5}, [%[in]], %[stride]\n" in Pack()
12134 "vld1.32 {d6}, [%[in]], %[stride]\n" in Pack()
12135 "vld1.32 {d7}, [%[in]], %[stride]\n" in Pack()
12136 "pld [%[in]]\n" in Pack()
12137 "vtrn.8 d0, d1\n" in Pack()
12138 "vtrn.8 d2, d3\n" in Pack()
12139 "vtrn.8 d4, d5\n" in Pack()
12140 "vtrn.8 d6, d7\n" in Pack()
12141 "vtrn.16 d0, d2\n" in Pack()
12142 "vtrn.16 d1, d3\n" in Pack()
12143 "vtrn.16 d4, d6\n" in Pack()
12144 "vtrn.16 d5, d7\n" in Pack()
12145 "vtrn.32 d0, d4\n" in Pack()
12146 "vtrn.32 d1, d5\n" in Pack()
12147 "vtrn.32 d2, d6\n" in Pack()
12148 "vtrn.32 d3, d7\n" in Pack()
12149 "vaddw.u8 q8, q8, d0\n" in Pack()
12150 "vaddw.u8 q9, q9, d1\n" in Pack()
12151 "vaddw.u8 q10, q10, d2\n" in Pack()
12152 "vaddw.u8 q11, q11, d3\n" in Pack()
12153 "vaddw.u8 q12, q12, d4\n" in Pack()
12154 "vaddw.u8 q13, q13, d5\n" in Pack()
12155 "vaddw.u8 q14, q14, d6\n" in Pack()
12156 "vaddw.u8 q15, q15, d7\n" in Pack()
12157 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
12158 "vst1.32 {d4, d5, d6, d7}, [%[out]:256]!\n" in Pack()
12160 "bne 1b\n" in Pack()
12165 "vmov.i8 d0, #0\n" in Pack()
12166 "vmov.i8 d1, #0\n" in Pack()
12167 "vmov.i8 d2, #0\n" in Pack()
12168 "vmov.i8 d3, #0\n" in Pack()
12169 "vmov.i8 d4, #0\n" in Pack()
12170 "vmov.i8 d5, #0\n" in Pack()
12171 "vmov.i8 d6, #0\n" in Pack()
12172 "vmov.i8 d7, #0\n" in Pack()
12173 "vld1.32 {d0}, [%[in]], %[stride]\n" in Pack()
12174 "vld1.32 {d1}, [%[in]], %[stride]\n" in Pack()
12175 "vld1.32 {d2}, [%[in]], %[stride]\n" in Pack()
12176 "vld1.32 {d3}, [%[in]], %[stride]\n" in Pack()
12177 "vld1.32 {d4}, [%[in]], %[stride]\n" in Pack()
12178 "vld1.32 {d5}, [%[in]], %[stride]\n" in Pack()
12179 "vld1.32 {d6}, [%[in]], %[stride]\n" in Pack()
12180 "pld [%[in]]\n" in Pack()
12181 "vtrn.8 d0, d1\n" in Pack()
12182 "vtrn.8 d2, d3\n" in Pack()
12183 "vtrn.8 d4, d5\n" in Pack()
12184 "vtrn.8 d6, d7\n" in Pack()
12185 "vtrn.16 d0, d2\n" in Pack()
12186 "vtrn.16 d1, d3\n" in Pack()
12187 "vtrn.16 d4, d6\n" in Pack()
12188 "vtrn.16 d5, d7\n" in Pack()
12189 "vtrn.32 d0, d4\n" in Pack()
12190 "vtrn.32 d1, d5\n" in Pack()
12191 "vtrn.32 d2, d6\n" in Pack()
12192 "vtrn.32 d3, d7\n" in Pack()
12193 "vaddw.u8 q8, q8, d0\n" in Pack()
12194 "vaddw.u8 q9, q9, d1\n" in Pack()
12195 "vaddw.u8 q10, q10, d2\n" in Pack()
12196 "vaddw.u8 q11, q11, d3\n" in Pack()
12197 "vaddw.u8 q12, q12, d4\n" in Pack()
12198 "vaddw.u8 q13, q13, d5\n" in Pack()
12199 "vaddw.u8 q14, q14, d6\n" in Pack()
12200 "vaddw.u8 q15, q15, d7\n" in Pack()
12201 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
12202 "vst1.32 {d4, d5, d6, d7}, [%[out]:256]!\n" in Pack()
12205 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
12206 "vdup.32 q1, %[additive_sum_offset]\n" in Pack()
12207 "vpaddl.u16 q8, q8\n" in Pack()
12208 "vpaddl.u16 q9, q9\n" in Pack()
12209 "vpaddl.u16 q10, q10\n" in Pack()
12210 "vpaddl.u16 q11, q11\n" in Pack()
12211 "vpaddl.u16 q12, q12\n" in Pack()
12212 "vpaddl.u16 q13, q13\n" in Pack()
12213 "vpaddl.u16 q14, q14\n" in Pack()
12214 "vpaddl.u16 q15, q15\n" in Pack()
12215 "vpadd.u32 d16, d16, d17\n" in Pack()
12216 "vpadd.u32 d18, d18, d19\n" in Pack()
12217 "vpadd.u32 d20, d20, d21\n" in Pack()
12218 "vpadd.u32 d22, d22, d23\n" in Pack()
12219 "vpadd.u32 d24, d24, d25\n" in Pack()
12220 "vpadd.u32 d26, d26, d27\n" in Pack()
12221 "vpadd.u32 d28, d28, d29\n" in Pack()
12222 "vpadd.u32 d30, d30, d31\n" in Pack()
12223 "vpadd.u32 d16, d16, d18\n" in Pack()
12224 "vpadd.u32 d17, d20, d22\n" in Pack()
12225 "vpadd.u32 d18, d24, d26\n" in Pack()
12226 "vpadd.u32 d19, d28, d30\n" in Pack()
12227 "vmul.i32 q8, q8, d0[0]\n" in Pack()
12228 "vmul.i32 q9, q9, d0[0]\n" in Pack()
12229 "vadd.i32 q8, q8, q1\n" in Pack()
12230 "vadd.i32 q9, q9, q1\n" in Pack()
12231 "vst1.32 {d16, d17, d18, d19}, [%[out]:256]\n" in Pack()