• Home
  • Raw
  • Download

Lines Matching full:n

43       "pld [%[lhs]]\n"  in Multiply()
44 "pld [%[rhs]]\n" in Multiply()
47 "vmov.i32 q0, #0\n" in Multiply()
53 "subs %[count], %[count], #8\n" in Multiply()
55 "vld1.32 {d2}, [%[lhs]:64]!\n" in Multiply()
56 "vld1.32 {d3}, [%[rhs]:64]!\n" in Multiply()
57 "pld [%[lhs], #64]\n" in Multiply()
58 "pld [%[rhs], #64]\n" in Multiply()
59 "vmull.u8 q2, d3, d2\n" in Multiply()
60 "vpadal.u16 q0, q2\n" in Multiply()
63 "bgt 1b\n" in Multiply()
66 "vld1.32 {d8, d9}, [%[lhs]:64]!\n" in Multiply()
67 "vld1.32 {d10, d11}, [%[rhs]:64]!\n" in Multiply()
68 "vdup.32 q6, %[multiplicative_offset]\n" in Multiply()
69 "vdup.32 q7, %[rounding_offset]\n" in Multiply()
70 "vdup.32 q8, %[shift]\n" in Multiply()
71 "vdup.32 q4, d8[0]\n" in Multiply()
76 "vpadd.u32 d0, d0, d1\n" in Multiply()
77 "vpadd.u32 d0, d0, d0\n" in Multiply()
80 "vadd.s32 q0, q0, q4\n" in Multiply()
81 "vadd.s32 q0, q0, q5\n" in Multiply()
82 "vmul.i32 q0, q0, q6\n" in Multiply()
83 "vadd.i32 q0, q0, q7\n" in Multiply()
84 "vshl.s32 q0, q0, q8\n" in Multiply()
85 "vqmovn.s32 d0, q0\n" in Multiply()
86 "vqmovun.s16 d0, q0\n" in Multiply()
89 "vst1.8 {d0[0]}, [%[result]]!\n" in Multiply()
117 "pld [%[lhs]]\n" in Multiply()
118 "pld [%[rhs]]\n" in Multiply()
121 "vmov.i32 q0, #0\n" in Multiply()
122 "vmov.i32 q1, #0\n" in Multiply()
128 "subs %[count], %[count], #8\n" in Multiply()
130 "vld1.32 {d4}, [%[lhs]:64]!\n" in Multiply()
131 "vld1.32 {d5, d6}, [%[rhs]:64]!\n" in Multiply()
132 "pld [%[lhs], #64]\n" in Multiply()
133 "pld [%[rhs], #64]\n" in Multiply()
134 "vmull.u8 q4, d5, d4\n" in Multiply()
135 "vmull.u8 q5, d6, d4\n" in Multiply()
136 "vpadal.u16 q0, q4\n" in Multiply()
137 "vpadal.u16 q1, q5\n" in Multiply()
140 "bgt 1b\n" in Multiply()
143 "vld1.32 {d8, d9}, [%[lhs]:64]!\n" in Multiply()
144 "vld1.32 {d10, d11}, [%[rhs]:64]!\n" in Multiply()
145 "vdup.32 q6, %[multiplicative_offset]\n" in Multiply()
146 "vdup.32 q7, %[rounding_offset]\n" in Multiply()
147 "vdup.32 q8, %[shift]\n" in Multiply()
148 "vdup.32 q4, d8[0]\n" in Multiply()
153 "vpadd.u32 d0, d0, d1\n" in Multiply()
154 "vpadd.u32 d2, d2, d3\n" in Multiply()
155 "vpadd.u32 d0, d0, d2\n" in Multiply()
158 "vadd.s32 q0, q0, q4\n" in Multiply()
159 "vadd.s32 q0, q0, q5\n" in Multiply()
160 "vmul.i32 q0, q0, q6\n" in Multiply()
161 "vadd.i32 q0, q0, q7\n" in Multiply()
162 "vshl.s32 q0, q0, q8\n" in Multiply()
163 "vqmovn.s32 d0, q0\n" in Multiply()
164 "vqmovun.s16 d0, q0\n" in Multiply()
167 "vst1.16 {d0[0]}, [%[result]]!\n" in Multiply()
195 "pld [%[lhs]]\n" in Multiply()
196 "pld [%[rhs]]\n" in Multiply()
199 "vmov.i32 q0, #0\n" in Multiply()
200 "vmov.i32 q1, #0\n" in Multiply()
201 "vmov.i32 q2, #0\n" in Multiply()
207 "subs %[count], %[count], #8\n" in Multiply()
209 "vld1.32 {d6}, [%[lhs]:64]!\n" in Multiply()
210 "vld1.32 {d7, d8, d9}, [%[rhs]:64]!\n" in Multiply()
211 "pld [%[lhs], #64]\n" in Multiply()
212 "pld [%[rhs], #64]\n" in Multiply()
213 "vmull.u8 q5, d7, d6\n" in Multiply()
214 "vmull.u8 q6, d8, d6\n" in Multiply()
215 "vmull.u8 q7, d9, d6\n" in Multiply()
216 "vpadal.u16 q0, q5\n" in Multiply()
217 "vpadal.u16 q1, q6\n" in Multiply()
218 "vpadal.u16 q2, q7\n" in Multiply()
221 "bgt 1b\n" in Multiply()
224 "vld1.32 {d8, d9}, [%[lhs]:64]!\n" in Multiply()
225 "vld1.32 {d10, d11}, [%[rhs]:64]!\n" in Multiply()
226 "vdup.32 q6, %[multiplicative_offset]\n" in Multiply()
227 "vdup.32 q7, %[rounding_offset]\n" in Multiply()
228 "vdup.32 q8, %[shift]\n" in Multiply()
229 "vdup.32 q4, d8[0]\n" in Multiply()
234 "vpadd.u32 d0, d0, d1\n" in Multiply()
235 "vpadd.u32 d2, d2, d3\n" in Multiply()
236 "vpadd.u32 d4, d4, d5\n" in Multiply()
237 "vpadd.u32 d0, d0, d2\n" in Multiply()
238 "vpadd.u32 d1, d4, d4\n" in Multiply()
241 "vadd.s32 q0, q0, q4\n" in Multiply()
242 "vadd.s32 q0, q0, q5\n" in Multiply()
243 "vmul.i32 q0, q0, q6\n" in Multiply()
244 "vadd.i32 q0, q0, q7\n" in Multiply()
245 "vshl.s32 q0, q0, q8\n" in Multiply()
246 "vqmovn.s32 d0, q0\n" in Multiply()
247 "vqmovun.s16 d0, q0\n" in Multiply()
250 "vst1.16 {d0[0]}, [%[result]]!\n" in Multiply()
251 "vst1.8 {d0[2]}, [%[result]]!\n" in Multiply()
279 "pld [%[lhs]]\n" in Multiply()
280 "pld [%[rhs]]\n" in Multiply()
283 "vmov.i32 q0, #0\n" in Multiply()
284 "vmov.i32 q1, #0\n" in Multiply()
285 "vmov.i32 q2, #0\n" in Multiply()
286 "vmov.i32 q3, q0\n" in Multiply()
292 "subs %[count], %[count], #8\n" in Multiply()
294 "vld1.32 {d8}, [%[lhs]:64]!\n" in Multiply()
295 "vld1.32 {d9, d10, d11, d12}, [%[rhs]:64]!\n" in Multiply()
296 "pld [%[lhs], #64]\n" in Multiply()
297 "pld [%[rhs], #64]\n" in Multiply()
298 "vmull.u8 q7, d9, d8\n" in Multiply()
299 "vmull.u8 q8, d10, d8\n" in Multiply()
300 "vmull.u8 q9, d11, d8\n" in Multiply()
301 "vmull.u8 q10, d12, d8\n" in Multiply()
302 "vpadal.u16 q0, q7\n" in Multiply()
303 "vpadal.u16 q1, q8\n" in Multiply()
304 "vpadal.u16 q2, q9\n" in Multiply()
305 "vpadal.u16 q3, q10\n" in Multiply()
308 "bgt 1b\n" in Multiply()
311 "vld1.32 {d8, d9}, [%[lhs]:64]!\n" in Multiply()
312 "vld1.32 {d10, d11}, [%[rhs]:64]!\n" in Multiply()
313 "vdup.32 q6, %[multiplicative_offset]\n" in Multiply()
314 "vdup.32 q7, %[rounding_offset]\n" in Multiply()
315 "vdup.32 q8, %[shift]\n" in Multiply()
316 "vdup.32 q4, d8[0]\n" in Multiply()
321 "vpadd.u32 d0, d0, d1\n" in Multiply()
322 "vpadd.u32 d2, d2, d3\n" in Multiply()
323 "vpadd.u32 d4, d4, d5\n" in Multiply()
324 "vpadd.u32 d6, d6, d7\n" in Multiply()
325 "vpadd.u32 d0, d0, d2\n" in Multiply()
326 "vpadd.u32 d1, d4, d6\n" in Multiply()
329 "vadd.s32 q0, q0, q4\n" in Multiply()
330 "vadd.s32 q0, q0, q5\n" in Multiply()
331 "vmul.i32 q0, q0, q6\n" in Multiply()
332 "vadd.i32 q0, q0, q7\n" in Multiply()
333 "vshl.s32 q0, q0, q8\n" in Multiply()
334 "vqmovn.s32 d0, q0\n" in Multiply()
335 "vqmovun.s16 d0, q0\n" in Multiply()
338 "vst1.32 {d0[0]}, [%[result]]!\n" in Multiply()
367 "pld [%[lhs]]\n" in Multiply()
368 "pld [%[rhs]]\n" in Multiply()
371 "vmov.i32 q0, #0\n" in Multiply()
372 "vmov.i32 q1, #0\n" in Multiply()
373 "vmov.i32 q2, #0\n" in Multiply()
374 "vmov.i32 q3, q0\n" in Multiply()
375 "vmov.i32 q4, q1\n" in Multiply()
381 "subs %[count], %[count], #8\n" in Multiply()
383 "vld1.32 {d10, d11, d12, d13}, [%[rhs]:64]!\n" in Multiply()
384 "vld1.32 {d14}, [%[lhs]:64]!\n" in Multiply()
385 "pld [%[lhs], #64]\n" in Multiply()
386 "vmull.u8 q8, d10, d14\n" in Multiply()
387 "vmull.u8 q9, d11, d14\n" in Multiply()
388 "vmull.u8 q10, d12, d14\n" in Multiply()
389 "vmull.u8 q11, d13, d14\n" in Multiply()
390 "vld1.32 {d10}, [%[rhs]:64]!\n" in Multiply()
391 "pld [%[rhs], #128]\n" in Multiply()
392 "vpadal.u16 q0, q8\n" in Multiply()
393 "vpadal.u16 q1, q9\n" in Multiply()
394 "vpadal.u16 q2, q10\n" in Multiply()
395 "vpadal.u16 q3, q11\n" in Multiply()
396 "vmull.u8 q8, d10, d14\n" in Multiply()
397 "vpadal.u16 q4, q8\n" in Multiply()
400 "bgt 1b\n" in Multiply()
403 "vld1.32 {d10, d11}, [%[lhs]:64]!\n" in Multiply()
404 "vld1.32 {d12, d13, d14, d15}, [%[rhs]:64]!\n" in Multiply()
405 "vdup.32 q8, %[multiplicative_offset]\n" in Multiply()
406 "vdup.32 q9, %[rounding_offset]\n" in Multiply()
407 "vdup.32 q10, %[shift]\n" in Multiply()
408 "vdup.32 q5, d10[0]\n" in Multiply()
413 "vpadd.u32 d0, d0, d1\n" in Multiply()
414 "vpadd.u32 d2, d2, d3\n" in Multiply()
415 "vpadd.u32 d4, d4, d5\n" in Multiply()
416 "vpadd.u32 d6, d6, d7\n" in Multiply()
417 "vpadd.u32 d8, d8, d9\n" in Multiply()
418 "vpadd.u32 d0, d0, d2\n" in Multiply()
419 "vpadd.u32 d1, d4, d6\n" in Multiply()
420 "vpadd.u32 d2, d8, d8\n" in Multiply()
423 "vadd.s32 q0, q0, q5\n" in Multiply()
424 "vadd.s32 q1, q1, q5\n" in Multiply()
425 "vadd.s32 q0, q0, q6\n" in Multiply()
426 "vadd.s32 q1, q1, q7\n" in Multiply()
427 "vmul.i32 q0, q0, q8\n" in Multiply()
428 "vmul.i32 q1, q1, q8\n" in Multiply()
429 "vadd.i32 q0, q0, q9\n" in Multiply()
430 "vadd.i32 q1, q1, q9\n" in Multiply()
431 "vshl.s32 q0, q0, q10\n" in Multiply()
432 "vshl.s32 q1, q1, q10\n" in Multiply()
433 "vqmovn.s32 d0, q0\n" in Multiply()
434 "vqmovn.s32 d1, q1\n" in Multiply()
435 "vqmovun.s16 d0, q0\n" in Multiply()
438 "vst1.32 {d0[0]}, [%[result]]!\n" in Multiply()
439 "vst1.8 {d0[4]}, [%[result]]!\n" in Multiply()
468 "pld [%[lhs]]\n" in Multiply()
469 "pld [%[rhs]]\n" in Multiply()
472 "vmov.i32 q0, #0\n" in Multiply()
473 "vmov.i32 q1, #0\n" in Multiply()
474 "vmov.i32 q2, #0\n" in Multiply()
475 "vmov.i32 q3, q0\n" in Multiply()
476 "vmov.i32 q4, q1\n" in Multiply()
477 "vmov.i32 q5, q2\n" in Multiply()
483 "subs %[count], %[count], #8\n" in Multiply()
485 "vld1.32 {d12, d13, d14, d15}, [%[rhs]:64]!\n" in Multiply()
486 "vld1.32 {d16}, [%[lhs]:64]!\n" in Multiply()
487 "pld [%[lhs], #64]\n" in Multiply()
488 "vmull.u8 q9, d12, d16\n" in Multiply()
489 "vmull.u8 q10, d13, d16\n" in Multiply()
490 "vmull.u8 q11, d14, d16\n" in Multiply()
491 "vmull.u8 q12, d15, d16\n" in Multiply()
492 "vld1.32 {d12, d13}, [%[rhs]:64]!\n" in Multiply()
493 "pld [%[rhs], #128]\n" in Multiply()
494 "vpadal.u16 q0, q9\n" in Multiply()
495 "vpadal.u16 q1, q10\n" in Multiply()
496 "vpadal.u16 q2, q11\n" in Multiply()
497 "vpadal.u16 q3, q12\n" in Multiply()
498 "vmull.u8 q9, d12, d16\n" in Multiply()
499 "vmull.u8 q10, d13, d16\n" in Multiply()
500 "vpadal.u16 q4, q9\n" in Multiply()
501 "vpadal.u16 q5, q10\n" in Multiply()
504 "bgt 1b\n" in Multiply()
507 "vld1.32 {d12, d13}, [%[lhs]:64]!\n" in Multiply()
508 "vld1.32 {d14, d15, d16, d17}, [%[rhs]:64]!\n" in Multiply()
509 "vdup.32 q9, %[multiplicative_offset]\n" in Multiply()
510 "vdup.32 q10, %[rounding_offset]\n" in Multiply()
511 "vdup.32 q11, %[shift]\n" in Multiply()
512 "vdup.32 q6, d12[0]\n" in Multiply()
517 "vpadd.u32 d0, d0, d1\n" in Multiply()
518 "vpadd.u32 d2, d2, d3\n" in Multiply()
519 "vpadd.u32 d4, d4, d5\n" in Multiply()
520 "vpadd.u32 d6, d6, d7\n" in Multiply()
521 "vpadd.u32 d8, d8, d9\n" in Multiply()
522 "vpadd.u32 d10, d10, d11\n" in Multiply()
523 "vpadd.u32 d0, d0, d2\n" in Multiply()
524 "vpadd.u32 d1, d4, d6\n" in Multiply()
525 "vpadd.u32 d2, d8, d10\n" in Multiply()
528 "vadd.s32 q0, q0, q6\n" in Multiply()
529 "vadd.s32 q1, q1, q6\n" in Multiply()
530 "vadd.s32 q0, q0, q7\n" in Multiply()
531 "vadd.s32 q1, q1, q8\n" in Multiply()
532 "vmul.i32 q0, q0, q9\n" in Multiply()
533 "vmul.i32 q1, q1, q9\n" in Multiply()
534 "vadd.i32 q0, q0, q10\n" in Multiply()
535 "vadd.i32 q1, q1, q10\n" in Multiply()
536 "vshl.s32 q0, q0, q11\n" in Multiply()
537 "vshl.s32 q1, q1, q11\n" in Multiply()
538 "vqmovn.s32 d0, q0\n" in Multiply()
539 "vqmovn.s32 d1, q1\n" in Multiply()
540 "vqmovun.s16 d0, q0\n" in Multiply()
543 "vst1.32 {d0[0]}, [%[result]]!\n" in Multiply()
544 "vst1.16 {d0[2]}, [%[result]]!\n" in Multiply()
573 "pld [%[lhs]]\n" in Multiply()
574 "pld [%[rhs]]\n" in Multiply()
577 "vmov.i32 q0, #0\n" in Multiply()
578 "vmov.i32 q1, #0\n" in Multiply()
579 "vmov.i32 q2, #0\n" in Multiply()
580 "vmov.i32 q3, q0\n" in Multiply()
581 "vmov.i32 q4, q1\n" in Multiply()
582 "vmov.i32 q5, q2\n" in Multiply()
583 "vmov.i32 q6, q3\n" in Multiply()
589 "subs %[count], %[count], #8\n" in Multiply()
591 "vld1.32 {d14, d15, d16, d17}, [%[rhs]:64]!\n" in Multiply()
592 "vld1.32 {d18}, [%[lhs]:64]!\n" in Multiply()
593 "pld [%[lhs], #64]\n" in Multiply()
594 "vmull.u8 q10, d14, d18\n" in Multiply()
595 "vmull.u8 q11, d15, d18\n" in Multiply()
596 "vmull.u8 q12, d16, d18\n" in Multiply()
597 "vmull.u8 q13, d17, d18\n" in Multiply()
598 "vld1.32 {d14, d15, d16}, [%[rhs]:64]!\n" in Multiply()
599 "pld [%[rhs], #128]\n" in Multiply()
600 "vpadal.u16 q0, q10\n" in Multiply()
601 "vpadal.u16 q1, q11\n" in Multiply()
602 "vpadal.u16 q2, q12\n" in Multiply()
603 "vpadal.u16 q3, q13\n" in Multiply()
604 "vmull.u8 q10, d14, d18\n" in Multiply()
605 "vmull.u8 q11, d15, d18\n" in Multiply()
606 "vmull.u8 q12, d16, d18\n" in Multiply()
607 "vpadal.u16 q4, q10\n" in Multiply()
608 "vpadal.u16 q5, q11\n" in Multiply()
609 "vpadal.u16 q6, q12\n" in Multiply()
612 "bgt 1b\n" in Multiply()
615 "vld1.32 {d14, d15}, [%[lhs]:64]!\n" in Multiply()
616 "vld1.32 {d16, d17, d18, d19}, [%[rhs]:64]!\n" in Multiply()
617 "vdup.32 q10, %[multiplicative_offset]\n" in Multiply()
618 "vdup.32 q11, %[rounding_offset]\n" in Multiply()
619 "vdup.32 q12, %[shift]\n" in Multiply()
620 "vdup.32 q7, d14[0]\n" in Multiply()
625 "vpadd.u32 d0, d0, d1\n" in Multiply()
626 "vpadd.u32 d2, d2, d3\n" in Multiply()
627 "vpadd.u32 d4, d4, d5\n" in Multiply()
628 "vpadd.u32 d6, d6, d7\n" in Multiply()
629 "vpadd.u32 d8, d8, d9\n" in Multiply()
630 "vpadd.u32 d10, d10, d11\n" in Multiply()
631 "vpadd.u32 d12, d12, d13\n" in Multiply()
632 "vpadd.u32 d0, d0, d2\n" in Multiply()
633 "vpadd.u32 d1, d4, d6\n" in Multiply()
634 "vpadd.u32 d2, d8, d10\n" in Multiply()
635 "vpadd.u32 d3, d12, d12\n" in Multiply()
638 "vadd.s32 q0, q0, q7\n" in Multiply()
639 "vadd.s32 q1, q1, q7\n" in Multiply()
640 "vadd.s32 q0, q0, q8\n" in Multiply()
641 "vadd.s32 q1, q1, q9\n" in Multiply()
642 "vmul.i32 q0, q0, q10\n" in Multiply()
643 "vmul.i32 q1, q1, q10\n" in Multiply()
644 "vadd.i32 q0, q0, q11\n" in Multiply()
645 "vadd.i32 q1, q1, q11\n" in Multiply()
646 "vshl.s32 q0, q0, q12\n" in Multiply()
647 "vshl.s32 q1, q1, q12\n" in Multiply()
648 "vqmovn.s32 d0, q0\n" in Multiply()
649 "vqmovn.s32 d1, q1\n" in Multiply()
650 "vqmovun.s16 d0, q0\n" in Multiply()
653 "vst1.32 {d0[0]}, [%[result]]!\n" in Multiply()
654 "vst1.16 {d0[2]}, [%[result]]!\n" in Multiply()
655 "vst1.8 {d0[6]}, [%[result]]!\n" in Multiply()
684 "pld [%[lhs]]\n" in Multiply()
685 "pld [%[rhs]]\n" in Multiply()
688 "vmov.i32 q0, #0\n" in Multiply()
689 "vmov.i32 q1, #0\n" in Multiply()
690 "vmov.i32 q2, #0\n" in Multiply()
691 "vmov.i32 q3, q0\n" in Multiply()
692 "vmov.i32 q4, q1\n" in Multiply()
693 "vmov.i32 q5, q2\n" in Multiply()
694 "vmov.i32 q6, q3\n" in Multiply()
695 "vmov.i32 q7, q4\n" in Multiply()
700 "vld1.32 {d17, d18, d19, d20}, [%[rhs]:256]!\n" in Multiply()
701 "vld1.32 {d16}, [%[lhs]:64]!\n" in Multiply()
702 "vmull.u8 q11, d16, d17\n" in Multiply()
703 "vmull.u8 q12, d16, d18\n" in Multiply()
704 "vmull.u8 q13, d16, d19\n" in Multiply()
705 "vmull.u8 q14, d16, d20\n" in Multiply()
706 "vld1.32 {d17, d18, d19, d20}, [%[rhs]:256]!\n" in Multiply()
707 "vpadal.u16 q0, q11\n" in Multiply()
708 "vpadal.u16 q1, q12\n" in Multiply()
709 "vpadal.u16 q2, q13\n" in Multiply()
710 "vpadal.u16 q3, q14\n" in Multiply()
711 "pld [%[rhs], #256]\n" in Multiply()
712 "vmull.u8 q15, d16, d17\n" in Multiply()
713 "vmull.u8 q11, d16, d18\n" in Multiply()
714 "vmull.u8 q12, d16, d19\n" in Multiply()
715 "vmull.u8 q13, d16, d20\n" in Multiply()
716 "pld [%[lhs], #32]\n" in Multiply()
719 "subs %[count], %[count], #8\n" in Multiply()
721 "vpadal.u16 q4, q15\n" in Multiply()
722 "vpadal.u16 q5, q11\n" in Multiply()
723 "vpadal.u16 q6, q12\n" in Multiply()
724 "vpadal.u16 q7, q13\n" in Multiply()
727 "bgt 1b\n" in Multiply()
730 "vld1.32 {d16, d17}, [%[lhs]:64]!\n" in Multiply()
731 "vld1.32 {d18, d19, d20, d21}, [%[rhs]:64]!\n" in Multiply()
732 "vdup.32 q11, %[multiplicative_offset]\n" in Multiply()
733 "vdup.32 q12, %[rounding_offset]\n" in Multiply()
734 "vdup.32 q13, %[shift]\n" in Multiply()
735 "vdup.32 q8, d16[0]\n" in Multiply()
740 "vpadd.u32 d0, d0, d1\n" in Multiply()
741 "vpadd.u32 d2, d2, d3\n" in Multiply()
742 "vpadd.u32 d4, d4, d5\n" in Multiply()
743 "vpadd.u32 d6, d6, d7\n" in Multiply()
744 "vpadd.u32 d8, d8, d9\n" in Multiply()
745 "vpadd.u32 d10, d10, d11\n" in Multiply()
746 "vpadd.u32 d12, d12, d13\n" in Multiply()
747 "vpadd.u32 d14, d14, d15\n" in Multiply()
748 "vpadd.u32 d0, d0, d2\n" in Multiply()
749 "vpadd.u32 d1, d4, d6\n" in Multiply()
750 "vpadd.u32 d2, d8, d10\n" in Multiply()
751 "vpadd.u32 d3, d12, d14\n" in Multiply()
754 "vadd.s32 q0, q0, q8\n" in Multiply()
755 "vadd.s32 q1, q1, q8\n" in Multiply()
756 "vadd.s32 q0, q0, q9\n" in Multiply()
757 "vadd.s32 q1, q1, q10\n" in Multiply()
758 "vmul.i32 q0, q0, q11\n" in Multiply()
759 "vmul.i32 q1, q1, q11\n" in Multiply()
760 "vadd.i32 q0, q0, q12\n" in Multiply()
761 "vadd.i32 q1, q1, q12\n" in Multiply()
762 "vshl.s32 q0, q0, q13\n" in Multiply()
763 "vshl.s32 q1, q1, q13\n" in Multiply()
764 "vqmovn.s32 d0, q0\n" in Multiply()
765 "vqmovn.s32 d1, q1\n" in Multiply()
766 "vqmovun.s16 d0, q0\n" in Multiply()
769 "vst1.32 {d0}, [%[result]]!\n" in Multiply()
799 "pld [%[lhs]]\n" in Multiply()
800 "pld [%[rhs]]\n" in Multiply()
803 "vmov.i32 q0, #0\n" in Multiply()
804 "vmov.i32 q1, #0\n" in Multiply()
810 "subs %[count], %[count], #8\n" in Multiply()
812 "vld1.32 {d4, d5}, [%[lhs]:64]!\n" in Multiply()
813 "vld1.32 {d6}, [%[rhs]:64]!\n" in Multiply()
814 "pld [%[lhs], #64]\n" in Multiply()
815 "pld [%[rhs], #64]\n" in Multiply()
816 "vmull.u8 q4, d6, d4\n" in Multiply()
817 "vmull.u8 q5, d6, d5\n" in Multiply()
818 "vpadal.u16 q0, q4\n" in Multiply()
819 "vpadal.u16 q1, q5\n" in Multiply()
822 "bgt 1b\n" in Multiply()
825 "vld1.32 {d8, d9}, [%[lhs]:64]!\n" in Multiply()
826 "vld1.32 {d10, d11}, [%[rhs]:64]!\n" in Multiply()
827 "vdup.32 q6, %[multiplicative_offset]\n" in Multiply()
828 "vdup.32 q7, %[rounding_offset]\n" in Multiply()
829 "vdup.32 q8, %[shift]\n" in Multiply()
830 "vdup.32 q2, d8[0]\n" in Multiply()
831 "vdup.32 q4, d8[1]\n" in Multiply()
834 "add r0, %[result], %[stride]\n" in Multiply()
837 "vpadd.u32 d0, d0, d1\n" in Multiply()
838 "vpadd.u32 d0, d0, d0\n" in Multiply()
839 "vpadd.u32 d2, d2, d3\n" in Multiply()
840 "vpadd.u32 d2, d2, d2\n" in Multiply()
843 "vadd.s32 q0, q0, q2\n" in Multiply()
844 "vadd.s32 q1, q1, q4\n" in Multiply()
845 "vadd.s32 q0, q0, q5\n" in Multiply()
846 "vadd.s32 q1, q1, q5\n" in Multiply()
847 "vmul.i32 q0, q0, q6\n" in Multiply()
848 "vmul.i32 q1, q1, q6\n" in Multiply()
849 "vadd.i32 q0, q0, q7\n" in Multiply()
850 "vadd.i32 q1, q1, q7\n" in Multiply()
851 "vshl.s32 q0, q0, q8\n" in Multiply()
852 "vshl.s32 q1, q1, q8\n" in Multiply()
853 "vqmovn.s32 d0, q0\n" in Multiply()
854 "vqmovn.s32 d2, q1\n" in Multiply()
855 "vqmovun.s16 d0, q0\n" in Multiply()
856 "vqmovun.s16 d2, q1\n" in Multiply()
859 "vst1.8 {d0[0]}, [%[result]]!\n" in Multiply()
860 "vst1.8 {d2[0]}, [r0]!\n" in Multiply()
888 "pld [%[lhs]]\n" in Multiply()
889 "pld [%[rhs]]\n" in Multiply()
892 "vmov.i32 q0, #0\n" in Multiply()
893 "vmov.i32 q1, #0\n" in Multiply()
894 "vmov.i32 q2, #0\n" in Multiply()
895 "vmov.i32 q3, q0\n" in Multiply()
901 "subs %[count], %[count], #8\n" in Multiply()
903 "vld1.32 {d8, d9}, [%[lhs]:64]!\n" in Multiply()
904 "vld1.32 {d10, d11}, [%[rhs]:64]!\n" in Multiply()
905 "pld [%[lhs], #64]\n" in Multiply()
906 "pld [%[rhs], #64]\n" in Multiply()
907 "vmull.u8 q6, d10, d8\n" in Multiply()
908 "vmull.u8 q7, d11, d8\n" in Multiply()
909 "vmull.u8 q8, d10, d9\n" in Multiply()
910 "vmull.u8 q9, d11, d9\n" in Multiply()
911 "vpadal.u16 q0, q6\n" in Multiply()
912 "vpadal.u16 q1, q7\n" in Multiply()
913 "vpadal.u16 q2, q8\n" in Multiply()
914 "vpadal.u16 q3, q9\n" in Multiply()
917 "bgt 1b\n" in Multiply()
920 "vld1.32 {d8, d9}, [%[lhs]:64]!\n" in Multiply()
921 "vld1.32 {d10, d11}, [%[rhs]:64]!\n" in Multiply()
922 "vdup.32 q6, %[multiplicative_offset]\n" in Multiply()
923 "vdup.32 q7, %[rounding_offset]\n" in Multiply()
924 "vdup.32 q8, %[shift]\n" in Multiply()
925 "vdup.32 q9, d8[0]\n" in Multiply()
926 "vdup.32 q4, d8[1]\n" in Multiply()
929 "add r0, %[result], %[stride]\n" in Multiply()
932 "vpadd.u32 d0, d0, d1\n" in Multiply()
933 "vpadd.u32 d2, d2, d3\n" in Multiply()
934 "vpadd.u32 d0, d0, d2\n" in Multiply()
935 "vpadd.u32 d4, d4, d5\n" in Multiply()
936 "vpadd.u32 d6, d6, d7\n" in Multiply()
937 "vpadd.u32 d4, d4, d6\n" in Multiply()
940 "vadd.s32 q0, q0, q9\n" in Multiply()
941 "vadd.s32 q2, q2, q4\n" in Multiply()
942 "vadd.s32 q0, q0, q5\n" in Multiply()
943 "vadd.s32 q2, q2, q5\n" in Multiply()
944 "vmul.i32 q0, q0, q6\n" in Multiply()
945 "vmul.i32 q2, q2, q6\n" in Multiply()
946 "vadd.i32 q0, q0, q7\n" in Multiply()
947 "vadd.i32 q2, q2, q7\n" in Multiply()
948 "vshl.s32 q0, q0, q8\n" in Multiply()
949 "vshl.s32 q2, q2, q8\n" in Multiply()
950 "vqmovn.s32 d0, q0\n" in Multiply()
951 "vqmovn.s32 d4, q2\n" in Multiply()
952 "vqmovun.s16 d0, q0\n" in Multiply()
953 "vqmovun.s16 d4, q2\n" in Multiply()
956 "vst1.16 {d0[0]}, [%[result]]!\n" in Multiply()
957 "vst1.16 {d4[0]}, [r0]!\n" in Multiply()
986 "pld [%[lhs]]\n" in Multiply()
987 "pld [%[rhs]]\n" in Multiply()
990 "vmov.i32 q0, #0\n" in Multiply()
991 "vmov.i32 q1, #0\n" in Multiply()
992 "vmov.i32 q2, #0\n" in Multiply()
993 "vmov.i32 q3, q0\n" in Multiply()
994 "vmov.i32 q4, q1\n" in Multiply()
995 "vmov.i32 q5, q2\n" in Multiply()
1001 "subs %[count], %[count], #8\n" in Multiply()
1003 "vld1.32 {d12, d13}, [%[lhs]:64]!\n" in Multiply()
1004 "vld1.32 {d14, d15, d16}, [%[rhs]:64]!\n" in Multiply()
1005 "pld [%[lhs], #64]\n" in Multiply()
1006 "pld [%[rhs], #64]\n" in Multiply()
1007 "vmull.u8 q9, d14, d12\n" in Multiply()
1008 "vmull.u8 q10, d15, d12\n" in Multiply()
1009 "vmull.u8 q11, d16, d12\n" in Multiply()
1010 "vmull.u8 q12, d14, d13\n" in Multiply()
1011 "vmull.u8 q13, d15, d13\n" in Multiply()
1012 "vmull.u8 q14, d16, d13\n" in Multiply()
1013 "vpadal.u16 q0, q9\n" in Multiply()
1014 "vpadal.u16 q1, q10\n" in Multiply()
1015 "vpadal.u16 q2, q11\n" in Multiply()
1016 "vpadal.u16 q3, q12\n" in Multiply()
1017 "vpadal.u16 q4, q13\n" in Multiply()
1018 "vpadal.u16 q5, q14\n" in Multiply()
1021 "bgt 1b\n" in Multiply()
1024 "vld1.32 {d12, d13}, [%[lhs]:64]!\n" in Multiply()
1025 "vld1.32 {d14, d15}, [%[rhs]:64]!\n" in Multiply()
1026 "vdup.32 q8, %[multiplicative_offset]\n" in Multiply()
1027 "vdup.32 q9, %[rounding_offset]\n" in Multiply()
1028 "vdup.32 q10, %[shift]\n" in Multiply()
1029 "vdup.32 q11, d12[0]\n" in Multiply()
1030 "vdup.32 q6, d12[1]\n" in Multiply()
1033 "add r0, %[result], %[stride]\n" in Multiply()
1036 "vpadd.u32 d0, d0, d1\n" in Multiply()
1037 "vpadd.u32 d2, d2, d3\n" in Multiply()
1038 "vpadd.u32 d4, d4, d5\n" in Multiply()
1039 "vpadd.u32 d0, d0, d2\n" in Multiply()
1040 "vpadd.u32 d1, d4, d4\n" in Multiply()
1041 "vpadd.u32 d6, d6, d7\n" in Multiply()
1042 "vpadd.u32 d8, d8, d9\n" in Multiply()
1043 "vpadd.u32 d10, d10, d11\n" in Multiply()
1044 "vpadd.u32 d6, d6, d8\n" in Multiply()
1045 "vpadd.u32 d7, d10, d10\n" in Multiply()
1048 "vadd.s32 q0, q0, q11\n" in Multiply()
1049 "vadd.s32 q3, q3, q6\n" in Multiply()
1050 "vadd.s32 q0, q0, q7\n" in Multiply()
1051 "vadd.s32 q3, q3, q7\n" in Multiply()
1052 "vmul.i32 q0, q0, q8\n" in Multiply()
1053 "vmul.i32 q3, q3, q8\n" in Multiply()
1054 "vadd.i32 q0, q0, q9\n" in Multiply()
1055 "vadd.i32 q3, q3, q9\n" in Multiply()
1056 "vshl.s32 q0, q0, q10\n" in Multiply()
1057 "vshl.s32 q3, q3, q10\n" in Multiply()
1058 "vqmovn.s32 d0, q0\n" in Multiply()
1059 "vqmovn.s32 d6, q3\n" in Multiply()
1060 "vqmovun.s16 d0, q0\n" in Multiply()
1061 "vqmovun.s16 d6, q3\n" in Multiply()
1064 "vst1.16 {d0[0]}, [%[result]]!\n" in Multiply()
1065 "vst1.8 {d0[2]}, [%[result]]!\n" in Multiply()
1066 "vst1.16 {d6[0]}, [r0]!\n" in Multiply()
1067 "vst1.8 {d6[2]}, [r0]!\n" in Multiply()
1097 "pld [%[lhs]]\n" in Multiply()
1098 "pld [%[rhs]]\n" in Multiply()
1101 "vmov.i32 q0, #0\n" in Multiply()
1102 "vmov.i32 q1, #0\n" in Multiply()
1103 "vmov.i32 q2, #0\n" in Multiply()
1104 "vmov.i32 q3, q0\n" in Multiply()
1105 "vmov.i32 q4, q1\n" in Multiply()
1106 "vmov.i32 q5, q2\n" in Multiply()
1107 "vmov.i32 q6, q3\n" in Multiply()
1108 "vmov.i32 q7, q4\n" in Multiply()
1113 "vld1.8 {d18, d19, d20, d21}, [%[rhs]:256]!\n" in Multiply()
1114 "vld1.8 {d16}, [%[lhs]:64]!\n" in Multiply()
1115 "vmull.u8 q11, d16, d18\n" in Multiply()
1116 "vld1.8 {d17}, [%[lhs]:64]!\n" in Multiply()
1117 "vmull.u8 q12, d16, d19\n" in Multiply()
1118 "pld [%[rhs], #64]\n" in Multiply()
1119 "vmull.u8 q13, d16, d20\n" in Multiply()
1120 "pld [%[lhs], #64]\n" in Multiply()
1121 "vmull.u8 q14, d16, d21\n" in Multiply()
1122 "vmull.u8 q15, d17, d18\n" in Multiply()
1123 "vpadal.u16 q0, q11\n" in Multiply()
1124 "vpadal.u16 q1, q12\n" in Multiply()
1125 "vpadal.u16 q2, q13\n" in Multiply()
1126 "vmull.u8 q11, d17, d19\n" in Multiply()
1127 "vmull.u8 q12, d17, d20\n" in Multiply()
1128 "vmull.u8 q13, d17, d21\n" in Multiply()
1131 "subs %[count], %[count], #8\n" in Multiply()
1133 "vpadal.u16 q3, q14\n" in Multiply()
1134 "vpadal.u16 q4, q15\n" in Multiply()
1135 "vpadal.u16 q5, q11\n" in Multiply()
1136 "vpadal.u16 q6, q12\n" in Multiply()
1137 "vpadal.u16 q7, q13\n" in Multiply()
1140 "bgt 1b\n" in Multiply()
1143 "vld1.32 {d16, d17}, [%[lhs]:64]!\n" in Multiply()
1144 "vld1.32 {d18, d19}, [%[rhs]:64]!\n" in Multiply()
1145 "vdup.32 q10, %[multiplicative_offset]\n" in Multiply()
1146 "vdup.32 q11, %[rounding_offset]\n" in Multiply()
1147 "vdup.32 q12, %[shift]\n" in Multiply()
1148 "vdup.32 q13, d16[0]\n" in Multiply()
1149 "vdup.32 q8, d16[1]\n" in Multiply()
1152 "add r0, %[result], %[stride]\n" in Multiply()
1155 "vpadd.u32 d0, d0, d1\n" in Multiply()
1156 "vpadd.u32 d2, d2, d3\n" in Multiply()
1157 "vpadd.u32 d4, d4, d5\n" in Multiply()
1158 "vpadd.u32 d6, d6, d7\n" in Multiply()
1159 "vpadd.u32 d0, d0, d2\n" in Multiply()
1160 "vpadd.u32 d1, d4, d6\n" in Multiply()
1161 "vpadd.u32 d8, d8, d9\n" in Multiply()
1162 "vpadd.u32 d10, d10, d11\n" in Multiply()
1163 "vpadd.u32 d12, d12, d13\n" in Multiply()
1164 "vpadd.u32 d14, d14, d15\n" in Multiply()
1165 "vpadd.u32 d8, d8, d10\n" in Multiply()
1166 "vpadd.u32 d9, d12, d14\n" in Multiply()
1169 "vadd.s32 q0, q0, q13\n" in Multiply()
1170 "vadd.s32 q4, q4, q8\n" in Multiply()
1171 "vadd.s32 q0, q0, q9\n" in Multiply()
1172 "vadd.s32 q4, q4, q9\n" in Multiply()
1173 "vmul.i32 q0, q0, q10\n" in Multiply()
1174 "vmul.i32 q4, q4, q10\n" in Multiply()
1175 "vadd.i32 q0, q0, q11\n" in Multiply()
1176 "vadd.i32 q4, q4, q11\n" in Multiply()
1177 "vshl.s32 q0, q0, q12\n" in Multiply()
1178 "vshl.s32 q4, q4, q12\n" in Multiply()
1179 "vqmovn.s32 d0, q0\n" in Multiply()
1180 "vqmovn.s32 d8, q4\n" in Multiply()
1181 "vqmovun.s16 d0, q0\n" in Multiply()
1182 "vqmovun.s16 d8, q4\n" in Multiply()
1185 "vst1.32 {d0[0]}, [%[result]]!\n" in Multiply()
1186 "vst1.32 {d8[0]}, [r0]!\n" in Multiply()
1216 "pld [%[lhs]]\n" in Multiply()
1217 "pld [%[rhs]]\n" in Multiply()
1220 "vmov.i32 q0, #0\n" in Multiply()
1221 "vmov.i32 q1, #0\n" in Multiply()
1222 "vmov.i32 q2, #0\n" in Multiply()
1228 "subs %[count], %[count], #8\n" in Multiply()
1230 "vld1.32 {d6, d7, d8}, [%[lhs]:64]!\n" in Multiply()
1231 "vld1.32 {d9}, [%[rhs]:64]!\n" in Multiply()
1232 "pld [%[lhs], #64]\n" in Multiply()
1233 "pld [%[rhs], #64]\n" in Multiply()
1234 "vmull.u8 q5, d9, d6\n" in Multiply()
1235 "vmull.u8 q6, d9, d7\n" in Multiply()
1236 "vmull.u8 q7, d9, d8\n" in Multiply()
1237 "vpadal.u16 q0, q5\n" in Multiply()
1238 "vpadal.u16 q1, q6\n" in Multiply()
1239 "vpadal.u16 q2, q7\n" in Multiply()
1242 "bgt 1b\n" in Multiply()
1245 "vld1.32 {d8, d9}, [%[lhs]:64]!\n" in Multiply()
1246 "vld1.32 {d10, d11}, [%[rhs]:64]!\n" in Multiply()
1247 "vdup.32 q6, %[multiplicative_offset]\n" in Multiply()
1248 "vdup.32 q7, %[rounding_offset]\n" in Multiply()
1249 "vdup.32 q8, %[shift]\n" in Multiply()
1250 "vdup.32 q3, d8[0]\n" in Multiply()
1251 "vdup.32 q9, d8[1]\n" in Multiply()
1252 "vdup.32 q4, d9[0]\n" in Multiply()
1255 "add r0, %[result], %[stride]\n" in Multiply()
1256 "add r1, r0, %[stride]\n" in Multiply()
1259 "vpadd.u32 d0, d0, d1\n" in Multiply()
1260 "vpadd.u32 d0, d0, d0\n" in Multiply()
1261 "vpadd.u32 d2, d2, d3\n" in Multiply()
1262 "vpadd.u32 d2, d2, d2\n" in Multiply()
1263 "vpadd.u32 d4, d4, d5\n" in Multiply()
1264 "vpadd.u32 d4, d4, d4\n" in Multiply()
1267 "vadd.s32 q0, q0, q3\n" in Multiply()
1268 "vadd.s32 q1, q1, q9\n" in Multiply()
1269 "vadd.s32 q2, q2, q4\n" in Multiply()
1270 "vadd.s32 q0, q0, q5\n" in Multiply()
1271 "vadd.s32 q1, q1, q5\n" in Multiply()
1272 "vadd.s32 q2, q2, q5\n" in Multiply()
1273 "vmul.i32 q0, q0, q6\n" in Multiply()
1274 "vmul.i32 q1, q1, q6\n" in Multiply()
1275 "vmul.i32 q2, q2, q6\n" in Multiply()
1276 "vadd.i32 q0, q0, q7\n" in Multiply()
1277 "vadd.i32 q1, q1, q7\n" in Multiply()
1278 "vadd.i32 q2, q2, q7\n" in Multiply()
1279 "vshl.s32 q0, q0, q8\n" in Multiply()
1280 "vshl.s32 q1, q1, q8\n" in Multiply()
1281 "vshl.s32 q2, q2, q8\n" in Multiply()
1282 "vqmovn.s32 d0, q0\n" in Multiply()
1283 "vqmovn.s32 d2, q1\n" in Multiply()
1284 "vqmovn.s32 d4, q2\n" in Multiply()
1285 "vqmovun.s16 d0, q0\n" in Multiply()
1286 "vqmovun.s16 d2, q1\n" in Multiply()
1287 "vqmovun.s16 d4, q2\n" in Multiply()
1290 "vst1.8 {d0[0]}, [%[result]]!\n" in Multiply()
1291 "vst1.8 {d2[0]}, [r0]!\n" in Multiply()
1292 "vst1.8 {d4[0]}, [r1]!\n" in Multiply()
1321 "pld [%[lhs]]\n" in Multiply()
1322 "pld [%[rhs]]\n" in Multiply()
1325 "vmov.i32 q0, #0\n" in Multiply()
1326 "vmov.i32 q1, #0\n" in Multiply()
1327 "vmov.i32 q2, #0\n" in Multiply()
1328 "vmov.i32 q3, q0\n" in Multiply()
1329 "vmov.i32 q4, q1\n" in Multiply()
1330 "vmov.i32 q5, q2\n" in Multiply()
1336 "subs %[count], %[count], #8\n" in Multiply()
1338 "vld1.32 {d12, d13, d14}, [%[lhs]:64]!\n" in Multiply()
1339 "vld1.32 {d15, d16}, [%[rhs]:64]!\n" in Multiply()
1340 "pld [%[lhs], #64]\n" in Multiply()
1341 "pld [%[rhs], #64]\n" in Multiply()
1342 "vmull.u8 q9, d15, d12\n" in Multiply()
1343 "vmull.u8 q10, d16, d12\n" in Multiply()
1344 "vmull.u8 q11, d15, d13\n" in Multiply()
1345 "vmull.u8 q12, d16, d13\n" in Multiply()
1346 "vmull.u8 q13, d15, d14\n" in Multiply()
1347 "vmull.u8 q14, d16, d14\n" in Multiply()
1348 "vpadal.u16 q0, q9\n" in Multiply()
1349 "vpadal.u16 q1, q10\n" in Multiply()
1350 "vpadal.u16 q2, q11\n" in Multiply()
1351 "vpadal.u16 q3, q12\n" in Multiply()
1352 "vpadal.u16 q4, q13\n" in Multiply()
1353 "vpadal.u16 q5, q14\n" in Multiply()
1356 "bgt 1b\n" in Multiply()
1359 "vld1.32 {d12, d13}, [%[lhs]:64]!\n" in Multiply()
1360 "vld1.32 {d14, d15}, [%[rhs]:64]!\n" in Multiply()
1361 "vdup.32 q8, %[multiplicative_offset]\n" in Multiply()
1362 "vdup.32 q9, %[rounding_offset]\n" in Multiply()
1363 "vdup.32 q10, %[shift]\n" in Multiply()
1364 "vdup.32 q11, d12[0]\n" in Multiply()
1365 "vdup.32 q12, d12[1]\n" in Multiply()
1366 "vdup.32 q6, d13[0]\n" in Multiply()
1369 "add r0, %[result], %[stride]\n" in Multiply()
1370 "add r1, r0, %[stride]\n" in Multiply()
1373 "vpadd.u32 d0, d0, d1\n" in Multiply()
1374 "vpadd.u32 d2, d2, d3\n" in Multiply()
1375 "vpadd.u32 d0, d0, d2\n" in Multiply()
1376 "vpadd.u32 d4, d4, d5\n" in Multiply()
1377 "vpadd.u32 d6, d6, d7\n" in Multiply()
1378 "vpadd.u32 d4, d4, d6\n" in Multiply()
1379 "vpadd.u32 d8, d8, d9\n" in Multiply()
1380 "vpadd.u32 d10, d10, d11\n" in Multiply()
1381 "vpadd.u32 d8, d8, d10\n" in Multiply()
1384 "vadd.s32 q0, q0, q11\n" in Multiply()
1385 "vadd.s32 q2, q2, q12\n" in Multiply()
1386 "vadd.s32 q4, q4, q6\n" in Multiply()
1387 "vadd.s32 q0, q0, q7\n" in Multiply()
1388 "vadd.s32 q2, q2, q7\n" in Multiply()
1389 "vadd.s32 q4, q4, q7\n" in Multiply()
1390 "vmul.i32 q0, q0, q8\n" in Multiply()
1391 "vmul.i32 q2, q2, q8\n" in Multiply()
1392 "vmul.i32 q4, q4, q8\n" in Multiply()
1393 "vadd.i32 q0, q0, q9\n" in Multiply()
1394 "vadd.i32 q2, q2, q9\n" in Multiply()
1395 "vadd.i32 q4, q4, q9\n" in Multiply()
1396 "vshl.s32 q0, q0, q10\n" in Multiply()
1397 "vshl.s32 q2, q2, q10\n" in Multiply()
1398 "vshl.s32 q4, q4, q10\n" in Multiply()
1399 "vqmovn.s32 d0, q0\n" in Multiply()
1400 "vqmovn.s32 d4, q2\n" in Multiply()
1401 "vqmovn.s32 d8, q4\n" in Multiply()
1402 "vqmovun.s16 d0, q0\n" in Multiply()
1403 "vqmovun.s16 d4, q2\n" in Multiply()
1404 "vqmovun.s16 d8, q4\n" in Multiply()
1407 "vst1.16 {d0[0]}, [%[result]]!\n" in Multiply()
1408 "vst1.16 {d4[0]}, [r0]!\n" in Multiply()
1409 "vst1.16 {d8[0]}, [r1]!\n" in Multiply()
1439 "pld [%[lhs]]\n" in Multiply()
1440 "pld [%[rhs]]\n" in Multiply()
1443 "vmov.i32 q0, #0\n" in Multiply()
1444 "vmov.i32 q1, #0\n" in Multiply()
1445 "vmov.i32 q2, #0\n" in Multiply()
1446 "vmov.i32 q3, q0\n" in Multiply()
1447 "vmov.i32 q4, q1\n" in Multiply()
1448 "vmov.i32 q5, q2\n" in Multiply()
1449 "vmov.i32 q6, q3\n" in Multiply()
1450 "vmov.i32 q7, q4\n" in Multiply()
1451 "vmov.i32 q8, q5\n" in Multiply()
1456 "vld1.8 {d21, d22, d23}, [%[rhs]:64]!\n" in Multiply()
1457 "vld1.8 {d18}, [%[lhs]:64]!\n" in Multiply()
1458 "vmull.u8 q12, d18, d21\n" in Multiply()
1459 "vld1.8 {d19}, [%[lhs]:64]!\n" in Multiply()
1460 "vmull.u8 q13, d18, d22\n" in Multiply()
1461 "vld1.8 {d20}, [%[lhs]:64]!\n" in Multiply()
1462 "vmull.u8 q14, d18, d23\n" in Multiply()
1463 "pld [%[lhs], #64]\n" in Multiply()
1464 "vmull.u8 q15, d19, d21\n" in Multiply()
1465 "pld [%[rhs], #64]\n" in Multiply()
1466 "vpadal.u16 q0, q12\n" in Multiply()
1467 "vpadal.u16 q1, q13\n" in Multiply()
1468 "vpadal.u16 q2, q14\n" in Multiply()
1469 "vpadal.u16 q3, q15\n" in Multiply()
1470 "vmull.u8 q12, d19, d22\n" in Multiply()
1471 "vmull.u8 q13, d19, d23\n" in Multiply()
1472 "vmull.u8 q14, d20, d21\n" in Multiply()
1473 "vmull.u8 q15, d20, d22\n" in Multiply()
1476 "subs %[count], %[count], #8\n" in Multiply()
1478 "vmull.u8 q9, d20, d23\n" in Multiply()
1479 "vpadal.u16 q4, q12\n" in Multiply()
1480 "vpadal.u16 q5, q13\n" in Multiply()
1481 "vpadal.u16 q6, q14\n" in Multiply()
1482 "vpadal.u16 q7, q15\n" in Multiply()
1483 "vpadal.u16 q8, q9\n" in Multiply()
1486 "bgt 1b\n" in Multiply()
1489 "vld1.32 {d18, d19}, [%[lhs]:64]!\n" in Multiply()
1490 "vld1.32 {d20, d21}, [%[rhs]:64]!\n" in Multiply()
1491 "vdup.32 q11, %[multiplicative_offset]\n" in Multiply()
1492 "vdup.32 q12, %[rounding_offset]\n" in Multiply()
1493 "vdup.32 q13, %[shift]\n" in Multiply()
1494 "vdup.32 q14, d18[0]\n" in Multiply()
1495 "vdup.32 q15, d18[1]\n" in Multiply()
1496 "vdup.32 q9, d19[0]\n" in Multiply()
1499 "add r0, %[result], %[stride]\n" in Multiply()
1500 "add r1, r0, %[stride]\n" in Multiply()
1503 "vpadd.u32 d0, d0, d1\n" in Multiply()
1504 "vpadd.u32 d2, d2, d3\n" in Multiply()
1505 "vpadd.u32 d4, d4, d5\n" in Multiply()
1506 "vpadd.u32 d0, d0, d2\n" in Multiply()
1507 "vpadd.u32 d1, d4, d4\n" in Multiply()
1508 "vpadd.u32 d6, d6, d7\n" in Multiply()
1509 "vpadd.u32 d8, d8, d9\n" in Multiply()
1510 "vpadd.u32 d10, d10, d11\n" in Multiply()
1511 "vpadd.u32 d6, d6, d8\n" in Multiply()
1512 "vpadd.u32 d7, d10, d10\n" in Multiply()
1513 "vpadd.u32 d12, d12, d13\n" in Multiply()
1514 "vpadd.u32 d14, d14, d15\n" in Multiply()
1515 "vpadd.u32 d16, d16, d17\n" in Multiply()
1516 "vpadd.u32 d12, d12, d14\n" in Multiply()
1517 "vpadd.u32 d13, d16, d16\n" in Multiply()
1520 "vadd.s32 q0, q0, q14\n" in Multiply()
1521 "vadd.s32 q3, q3, q15\n" in Multiply()
1522 "vadd.s32 q6, q6, q9\n" in Multiply()
1523 "vadd.s32 q0, q0, q10\n" in Multiply()
1524 "vadd.s32 q3, q3, q10\n" in Multiply()
1525 "vadd.s32 q6, q6, q10\n" in Multiply()
1526 "vmul.i32 q0, q0, q11\n" in Multiply()
1527 "vmul.i32 q3, q3, q11\n" in Multiply()
1528 "vmul.i32 q6, q6, q11\n" in Multiply()
1529 "vadd.i32 q0, q0, q12\n" in Multiply()
1530 "vadd.i32 q3, q3, q12\n" in Multiply()
1531 "vadd.i32 q6, q6, q12\n" in Multiply()
1532 "vshl.s32 q0, q0, q13\n" in Multiply()
1533 "vshl.s32 q3, q3, q13\n" in Multiply()
1534 "vshl.s32 q6, q6, q13\n" in Multiply()
1535 "vqmovn.s32 d0, q0\n" in Multiply()
1536 "vqmovn.s32 d6, q3\n" in Multiply()
1537 "vqmovn.s32 d12, q6\n" in Multiply()
1538 "vqmovun.s16 d0, q0\n" in Multiply()
1539 "vqmovun.s16 d6, q3\n" in Multiply()
1540 "vqmovun.s16 d12, q6\n" in Multiply()
1543 "vst1.16 {d0[0]}, [%[result]]!\n" in Multiply()
1544 "vst1.8 {d0[2]}, [%[result]]!\n" in Multiply()
1545 "vst1.16 {d6[0]}, [r0]!\n" in Multiply()
1546 "vst1.8 {d6[2]}, [r0]!\n" in Multiply()
1547 "vst1.16 {d12[0]}, [r1]!\n" in Multiply()
1548 "vst1.8 {d12[2]}, [r1]!\n" in Multiply()
1579 "pld [%[lhs]]\n" in Multiply()
1580 "pld [%[rhs]]\n" in Multiply()
1583 "vmov.i32 q0, #0\n" in Multiply()
1589 "subs %[count], %[count], #8\n" in Multiply()
1591 "vld1.32 {d2}, [%[lhs]:64]!\n" in Multiply()
1592 "vld1.32 {d3}, [%[rhs]:64]!\n" in Multiply()
1593 "pld [%[lhs], #64]\n" in Multiply()
1594 "pld [%[rhs], #64]\n" in Multiply()
1595 "vmull.u8 q2, d3, d2\n" in Multiply()
1596 "vpadal.u16 q0, q2\n" in Multiply()
1599 "bgt 1b\n" in Multiply()
1602 "vld1.32 {d8, d9}, [%[lhs]:64]!\n" in Multiply()
1603 "vld1.32 {d10, d11}, [%[rhs]:64]!\n" in Multiply()
1604 "vdup.32 q4, d8[0]\n" in Multiply()
1609 "vpadd.u32 d0, d0, d1\n" in Multiply()
1610 "vpadd.u32 d0, d0, d0\n" in Multiply()
1613 "vadd.s32 q0, q0, q4\n" in Multiply()
1614 "vadd.s32 q0, q0, q5\n" in Multiply()
1617 "vst1.32 {d0[0]}, [%[result]]!\n" in Multiply()
1643 "pld [%[lhs]]\n" in Multiply()
1644 "pld [%[rhs]]\n" in Multiply()
1647 "vmov.i32 q0, #0\n" in Multiply()
1648 "vmov.i32 q1, #0\n" in Multiply()
1654 "subs %[count], %[count], #8\n" in Multiply()
1656 "vld1.32 {d4}, [%[lhs]:64]!\n" in Multiply()
1657 "vld1.32 {d5, d6}, [%[rhs]:64]!\n" in Multiply()
1658 "pld [%[lhs], #64]\n" in Multiply()
1659 "pld [%[rhs], #64]\n" in Multiply()
1660 "vmull.u8 q4, d5, d4\n" in Multiply()
1661 "vmull.u8 q5, d6, d4\n" in Multiply()
1662 "vpadal.u16 q0, q4\n" in Multiply()
1663 "vpadal.u16 q1, q5\n" in Multiply()
1666 "bgt 1b\n" in Multiply()
1669 "vld1.32 {d8, d9}, [%[lhs]:64]!\n" in Multiply()
1670 "vld1.32 {d10, d11}, [%[rhs]:64]!\n" in Multiply()
1671 "vdup.32 q4, d8[0]\n" in Multiply()
1676 "vpadd.u32 d0, d0, d1\n" in Multiply()
1677 "vpadd.u32 d2, d2, d3\n" in Multiply()
1678 "vpadd.u32 d0, d0, d2\n" in Multiply()
1681 "vadd.s32 q0, q0, q4\n" in Multiply()
1682 "vadd.s32 q0, q0, q5\n" in Multiply()
1685 "vst1.32 {d0}, [%[result]]!\n" in Multiply()
1711 "pld [%[lhs]]\n" in Multiply()
1712 "pld [%[rhs]]\n" in Multiply()
1715 "vmov.i32 q0, #0\n" in Multiply()
1716 "vmov.i32 q1, #0\n" in Multiply()
1717 "vmov.i32 q2, #0\n" in Multiply()
1723 "subs %[count], %[count], #8\n" in Multiply()
1725 "vld1.32 {d6}, [%[lhs]:64]!\n" in Multiply()
1726 "vld1.32 {d7, d8, d9}, [%[rhs]:64]!\n" in Multiply()
1727 "pld [%[lhs], #64]\n" in Multiply()
1728 "pld [%[rhs], #64]\n" in Multiply()
1729 "vmull.u8 q5, d7, d6\n" in Multiply()
1730 "vmull.u8 q6, d8, d6\n" in Multiply()
1731 "vmull.u8 q7, d9, d6\n" in Multiply()
1732 "vpadal.u16 q0, q5\n" in Multiply()
1733 "vpadal.u16 q1, q6\n" in Multiply()
1734 "vpadal.u16 q2, q7\n" in Multiply()
1737 "bgt 1b\n" in Multiply()
1740 "vld1.32 {d8, d9}, [%[lhs]:64]!\n" in Multiply()
1741 "vld1.32 {d10, d11}, [%[rhs]:64]!\n" in Multiply()
1742 "vdup.32 q4, d8[0]\n" in Multiply()
1747 "vpadd.u32 d0, d0, d1\n" in Multiply()
1748 "vpadd.u32 d2, d2, d3\n" in Multiply()
1749 "vpadd.u32 d4, d4, d5\n" in Multiply()
1750 "vpadd.u32 d0, d0, d2\n" in Multiply()
1751 "vpadd.u32 d1, d4, d4\n" in Multiply()
1754 "vadd.s32 q0, q0, q4\n" in Multiply()
1755 "vadd.s32 q0, q0, q5\n" in Multiply()
1758 "vst1.32 {d0}, [%[result]]!\n" in Multiply()
1759 "vst1.32 {d1[0]}, [%[result]]!\n" in Multiply()
1785 "pld [%[lhs]]\n" in Multiply()
1786 "pld [%[rhs]]\n" in Multiply()
1789 "vmov.i32 q0, #0\n" in Multiply()
1790 "vmov.i32 q1, #0\n" in Multiply()
1791 "vmov.i32 q2, #0\n" in Multiply()
1792 "vmov.i32 q3, q0\n" in Multiply()
1798 "subs %[count], %[count], #8\n" in Multiply()
1800 "vld1.32 {d8}, [%[lhs]:64]!\n" in Multiply()
1801 "vld1.32 {d9, d10, d11, d12}, [%[rhs]:64]!\n" in Multiply()
1802 "pld [%[lhs], #64]\n" in Multiply()
1803 "pld [%[rhs], #64]\n" in Multiply()
1804 "vmull.u8 q7, d9, d8\n" in Multiply()
1805 "vmull.u8 q8, d10, d8\n" in Multiply()
1806 "vmull.u8 q9, d11, d8\n" in Multiply()
1807 "vmull.u8 q10, d12, d8\n" in Multiply()
1808 "vpadal.u16 q0, q7\n" in Multiply()
1809 "vpadal.u16 q1, q8\n" in Multiply()
1810 "vpadal.u16 q2, q9\n" in Multiply()
1811 "vpadal.u16 q3, q10\n" in Multiply()
1814 "bgt 1b\n" in Multiply()
1817 "vld1.32 {d8, d9}, [%[lhs]:64]!\n" in Multiply()
1818 "vld1.32 {d10, d11}, [%[rhs]:64]!\n" in Multiply()
1819 "vdup.32 q4, d8[0]\n" in Multiply()
1824 "vpadd.u32 d0, d0, d1\n" in Multiply()
1825 "vpadd.u32 d2, d2, d3\n" in Multiply()
1826 "vpadd.u32 d4, d4, d5\n" in Multiply()
1827 "vpadd.u32 d6, d6, d7\n" in Multiply()
1828 "vpadd.u32 d0, d0, d2\n" in Multiply()
1829 "vpadd.u32 d1, d4, d6\n" in Multiply()
1832 "vadd.s32 q0, q0, q4\n" in Multiply()
1833 "vadd.s32 q0, q0, q5\n" in Multiply()
1836 "vst1.32 {d0, d1}, [%[result]]!\n" in Multiply()
1863 "pld [%[lhs]]\n" in Multiply()
1864 "pld [%[rhs]]\n" in Multiply()
1867 "vmov.i32 q0, #0\n" in Multiply()
1868 "vmov.i32 q1, #0\n" in Multiply()
1869 "vmov.i32 q2, #0\n" in Multiply()
1870 "vmov.i32 q3, q0\n" in Multiply()
1871 "vmov.i32 q4, q1\n" in Multiply()
1877 "subs %[count], %[count], #8\n" in Multiply()
1879 "vld1.32 {d10, d11, d12, d13}, [%[rhs]:64]!\n" in Multiply()
1880 "vld1.32 {d14}, [%[lhs]:64]!\n" in Multiply()
1881 "pld [%[lhs], #64]\n" in Multiply()
1882 "vmull.u8 q8, d10, d14\n" in Multiply()
1883 "vmull.u8 q9, d11, d14\n" in Multiply()
1884 "vmull.u8 q10, d12, d14\n" in Multiply()
1885 "vmull.u8 q11, d13, d14\n" in Multiply()
1886 "vld1.32 {d10}, [%[rhs]:64]!\n" in Multiply()
1887 "pld [%[rhs], #128]\n" in Multiply()
1888 "vpadal.u16 q0, q8\n" in Multiply()
1889 "vpadal.u16 q1, q9\n" in Multiply()
1890 "vpadal.u16 q2, q10\n" in Multiply()
1891 "vpadal.u16 q3, q11\n" in Multiply()
1892 "vmull.u8 q8, d10, d14\n" in Multiply()
1893 "vpadal.u16 q4, q8\n" in Multiply()
1896 "bgt 1b\n" in Multiply()
1899 "vld1.32 {d10, d11}, [%[lhs]:64]!\n" in Multiply()
1900 "vld1.32 {d12, d13, d14, d15}, [%[rhs]:64]!\n" in Multiply()
1901 "vdup.32 q5, d10[0]\n" in Multiply()
1906 "vpadd.u32 d0, d0, d1\n" in Multiply()
1907 "vpadd.u32 d2, d2, d3\n" in Multiply()
1908 "vpadd.u32 d4, d4, d5\n" in Multiply()
1909 "vpadd.u32 d6, d6, d7\n" in Multiply()
1910 "vpadd.u32 d8, d8, d9\n" in Multiply()
1911 "vpadd.u32 d0, d0, d2\n" in Multiply()
1912 "vpadd.u32 d1, d4, d6\n" in Multiply()
1913 "vpadd.u32 d2, d8, d8\n" in Multiply()
1916 "vadd.s32 q0, q0, q5\n" in Multiply()
1917 "vadd.s32 q1, q1, q5\n" in Multiply()
1918 "vadd.s32 q0, q0, q6\n" in Multiply()
1919 "vadd.s32 q1, q1, q7\n" in Multiply()
1922 "vst1.32 {d0, d1}, [%[result]]!\n" in Multiply()
1923 "vst1.32 {d2[0]}, [%[result]]!\n" in Multiply()
1950 "pld [%[lhs]]\n" in Multiply()
1951 "pld [%[rhs]]\n" in Multiply()
1954 "vmov.i32 q0, #0\n" in Multiply()
1955 "vmov.i32 q1, #0\n" in Multiply()
1956 "vmov.i32 q2, #0\n" in Multiply()
1957 "vmov.i32 q3, q0\n" in Multiply()
1958 "vmov.i32 q4, q1\n" in Multiply()
1959 "vmov.i32 q5, q2\n" in Multiply()
1965 "subs %[count], %[count], #8\n" in Multiply()
1967 "vld1.32 {d12, d13, d14, d15}, [%[rhs]:64]!\n" in Multiply()
1968 "vld1.32 {d16}, [%[lhs]:64]!\n" in Multiply()
1969 "pld [%[lhs], #64]\n" in Multiply()
1970 "vmull.u8 q9, d12, d16\n" in Multiply()
1971 "vmull.u8 q10, d13, d16\n" in Multiply()
1972 "vmull.u8 q11, d14, d16\n" in Multiply()
1973 "vmull.u8 q12, d15, d16\n" in Multiply()
1974 "vld1.32 {d12, d13}, [%[rhs]:64]!\n" in Multiply()
1975 "pld [%[rhs], #128]\n" in Multiply()
1976 "vpadal.u16 q0, q9\n" in Multiply()
1977 "vpadal.u16 q1, q10\n" in Multiply()
1978 "vpadal.u16 q2, q11\n" in Multiply()
1979 "vpadal.u16 q3, q12\n" in Multiply()
1980 "vmull.u8 q9, d12, d16\n" in Multiply()
1981 "vmull.u8 q10, d13, d16\n" in Multiply()
1982 "vpadal.u16 q4, q9\n" in Multiply()
1983 "vpadal.u16 q5, q10\n" in Multiply()
1986 "bgt 1b\n" in Multiply()
1989 "vld1.32 {d12, d13}, [%[lhs]:64]!\n" in Multiply()
1990 "vld1.32 {d14, d15, d16, d17}, [%[rhs]:64]!\n" in Multiply()
1991 "vdup.32 q6, d12[0]\n" in Multiply()
1996 "vpadd.u32 d0, d0, d1\n" in Multiply()
1997 "vpadd.u32 d2, d2, d3\n" in Multiply()
1998 "vpadd.u32 d4, d4, d5\n" in Multiply()
1999 "vpadd.u32 d6, d6, d7\n" in Multiply()
2000 "vpadd.u32 d8, d8, d9\n" in Multiply()
2001 "vpadd.u32 d10, d10, d11\n" in Multiply()
2002 "vpadd.u32 d0, d0, d2\n" in Multiply()
2003 "vpadd.u32 d1, d4, d6\n" in Multiply()
2004 "vpadd.u32 d2, d8, d10\n" in Multiply()
2007 "vadd.s32 q0, q0, q6\n" in Multiply()
2008 "vadd.s32 q1, q1, q6\n" in Multiply()
2009 "vadd.s32 q0, q0, q7\n" in Multiply()
2010 "vadd.s32 q1, q1, q8\n" in Multiply()
2013 "vst1.32 {d0, d1, d2}, [%[result]]!\n" in Multiply()
2040 "pld [%[lhs]]\n" in Multiply()
2041 "pld [%[rhs]]\n" in Multiply()
2044 "vmov.i32 q0, #0\n" in Multiply()
2045 "vmov.i32 q1, #0\n" in Multiply()
2046 "vmov.i32 q2, #0\n" in Multiply()
2047 "vmov.i32 q3, q0\n" in Multiply()
2048 "vmov.i32 q4, q1\n" in Multiply()
2049 "vmov.i32 q5, q2\n" in Multiply()
2050 "vmov.i32 q6, q3\n" in Multiply()
2056 "subs %[count], %[count], #8\n" in Multiply()
2058 "vld1.32 {d14, d15, d16, d17}, [%[rhs]:64]!\n" in Multiply()
2059 "vld1.32 {d18}, [%[lhs]:64]!\n" in Multiply()
2060 "pld [%[lhs], #64]\n" in Multiply()
2061 "vmull.u8 q10, d14, d18\n" in Multiply()
2062 "vmull.u8 q11, d15, d18\n" in Multiply()
2063 "vmull.u8 q12, d16, d18\n" in Multiply()
2064 "vmull.u8 q13, d17, d18\n" in Multiply()
2065 "vld1.32 {d14, d15, d16}, [%[rhs]:64]!\n" in Multiply()
2066 "pld [%[rhs], #128]\n" in Multiply()
2067 "vpadal.u16 q0, q10\n" in Multiply()
2068 "vpadal.u16 q1, q11\n" in Multiply()
2069 "vpadal.u16 q2, q12\n" in Multiply()
2070 "vpadal.u16 q3, q13\n" in Multiply()
2071 "vmull.u8 q10, d14, d18\n" in Multiply()
2072 "vmull.u8 q11, d15, d18\n" in Multiply()
2073 "vmull.u8 q12, d16, d18\n" in Multiply()
2074 "vpadal.u16 q4, q10\n" in Multiply()
2075 "vpadal.u16 q5, q11\n" in Multiply()
2076 "vpadal.u16 q6, q12\n" in Multiply()
2079 "bgt 1b\n" in Multiply()
2082 "vld1.32 {d14, d15}, [%[lhs]:64]!\n" in Multiply()
2083 "vld1.32 {d16, d17, d18, d19}, [%[rhs]:64]!\n" in Multiply()
2084 "vdup.32 q7, d14[0]\n" in Multiply()
2089 "vpadd.u32 d0, d0, d1\n" in Multiply()
2090 "vpadd.u32 d2, d2, d3\n" in Multiply()
2091 "vpadd.u32 d4, d4, d5\n" in Multiply()
2092 "vpadd.u32 d6, d6, d7\n" in Multiply()
2093 "vpadd.u32 d8, d8, d9\n" in Multiply()
2094 "vpadd.u32 d10, d10, d11\n" in Multiply()
2095 "vpadd.u32 d12, d12, d13\n" in Multiply()
2096 "vpadd.u32 d0, d0, d2\n" in Multiply()
2097 "vpadd.u32 d1, d4, d6\n" in Multiply()
2098 "vpadd.u32 d2, d8, d10\n" in Multiply()
2099 "vpadd.u32 d3, d12, d12\n" in Multiply()
2102 "vadd.s32 q0, q0, q7\n" in Multiply()
2103 "vadd.s32 q1, q1, q7\n" in Multiply()
2104 "vadd.s32 q0, q0, q8\n" in Multiply()
2105 "vadd.s32 q1, q1, q9\n" in Multiply()
2108 "vst1.32 {d0, d1, d2}, [%[result]]!\n" in Multiply()
2109 "vst1.32 {d3[0]}, [%[result]]!\n" in Multiply()
2136 "pld [%[lhs]]\n" in Multiply()
2137 "pld [%[rhs]]\n" in Multiply()
2140 "vmov.i32 q0, #0\n" in Multiply()
2141 "vmov.i32 q1, #0\n" in Multiply()
2142 "vmov.i32 q2, #0\n" in Multiply()
2143 "vmov.i32 q3, q0\n" in Multiply()
2144 "vmov.i32 q4, q1\n" in Multiply()
2145 "vmov.i32 q5, q2\n" in Multiply()
2146 "vmov.i32 q6, q3\n" in Multiply()
2147 "vmov.i32 q7, q4\n" in Multiply()
2152 "vld1.32 {d17, d18, d19, d20}, [%[rhs]:256]!\n" in Multiply()
2153 "vld1.32 {d16}, [%[lhs]:64]!\n" in Multiply()
2154 "vmull.u8 q11, d16, d17\n" in Multiply()
2155 "vmull.u8 q12, d16, d18\n" in Multiply()
2156 "vmull.u8 q13, d16, d19\n" in Multiply()
2157 "vmull.u8 q14, d16, d20\n" in Multiply()
2158 "vld1.32 {d17, d18, d19, d20}, [%[rhs]:256]!\n" in Multiply()
2159 "vpadal.u16 q0, q11\n" in Multiply()
2160 "vpadal.u16 q1, q12\n" in Multiply()
2161 "vpadal.u16 q2, q13\n" in Multiply()
2162 "vpadal.u16 q3, q14\n" in Multiply()
2163 "pld [%[rhs], #256]\n" in Multiply()
2164 "vmull.u8 q15, d16, d17\n" in Multiply()
2165 "vmull.u8 q11, d16, d18\n" in Multiply()
2166 "vmull.u8 q12, d16, d19\n" in Multiply()
2167 "vmull.u8 q13, d16, d20\n" in Multiply()
2168 "pld [%[lhs], #32]\n" in Multiply()
2171 "subs %[count], %[count], #8\n" in Multiply()
2173 "vpadal.u16 q4, q15\n" in Multiply()
2174 "vpadal.u16 q5, q11\n" in Multiply()
2175 "vpadal.u16 q6, q12\n" in Multiply()
2176 "vpadal.u16 q7, q13\n" in Multiply()
2179 "bgt 1b\n" in Multiply()
2182 "vld1.32 {d16, d17}, [%[lhs]:64]!\n" in Multiply()
2183 "vld1.32 {d18, d19, d20, d21}, [%[rhs]:64]!\n" in Multiply()
2184 "vdup.32 q8, d16[0]\n" in Multiply()
2189 "vpadd.u32 d0, d0, d1\n" in Multiply()
2190 "vpadd.u32 d2, d2, d3\n" in Multiply()
2191 "vpadd.u32 d4, d4, d5\n" in Multiply()
2192 "vpadd.u32 d6, d6, d7\n" in Multiply()
2193 "vpadd.u32 d8, d8, d9\n" in Multiply()
2194 "vpadd.u32 d10, d10, d11\n" in Multiply()
2195 "vpadd.u32 d12, d12, d13\n" in Multiply()
2196 "vpadd.u32 d14, d14, d15\n" in Multiply()
2197 "vpadd.u32 d0, d0, d2\n" in Multiply()
2198 "vpadd.u32 d1, d4, d6\n" in Multiply()
2199 "vpadd.u32 d2, d8, d10\n" in Multiply()
2200 "vpadd.u32 d3, d12, d14\n" in Multiply()
2203 "vadd.s32 q0, q0, q8\n" in Multiply()
2204 "vadd.s32 q1, q1, q8\n" in Multiply()
2205 "vadd.s32 q0, q0, q9\n" in Multiply()
2206 "vadd.s32 q1, q1, q10\n" in Multiply()
2209 "vst1.32 {d0, d1, d2, d3}, [%[result]]!\n" in Multiply()
2237 "pld [%[lhs]]\n" in Multiply()
2238 "pld [%[rhs]]\n" in Multiply()
2241 "vmov.i32 q0, #0\n" in Multiply()
2242 "vmov.i32 q1, #0\n" in Multiply()
2248 "subs %[count], %[count], #8\n" in Multiply()
2250 "vld1.32 {d4, d5}, [%[lhs]:64]!\n" in Multiply()
2251 "vld1.32 {d6}, [%[rhs]:64]!\n" in Multiply()
2252 "pld [%[lhs], #64]\n" in Multiply()
2253 "pld [%[rhs], #64]\n" in Multiply()
2254 "vmull.u8 q4, d6, d4\n" in Multiply()
2255 "vmull.u8 q5, d6, d5\n" in Multiply()
2256 "vpadal.u16 q0, q4\n" in Multiply()
2257 "vpadal.u16 q1, q5\n" in Multiply()
2260 "bgt 1b\n" in Multiply()
2263 "vld1.32 {d8, d9}, [%[lhs]:64]!\n" in Multiply()
2264 "vld1.32 {d10, d11}, [%[rhs]:64]!\n" in Multiply()
2265 "vdup.32 q2, d8[0]\n" in Multiply()
2266 "vdup.32 q4, d8[1]\n" in Multiply()
2269 "add r0, %[result], %[stride]\n" in Multiply()
2272 "vpadd.u32 d0, d0, d1\n" in Multiply()
2273 "vpadd.u32 d0, d0, d0\n" in Multiply()
2274 "vpadd.u32 d2, d2, d3\n" in Multiply()
2275 "vpadd.u32 d2, d2, d2\n" in Multiply()
2278 "vadd.s32 q0, q0, q2\n" in Multiply()
2279 "vadd.s32 q1, q1, q4\n" in Multiply()
2280 "vadd.s32 q0, q0, q5\n" in Multiply()
2281 "vadd.s32 q1, q1, q5\n" in Multiply()
2284 "vst1.32 {d0[0]}, [%[result]]!\n" in Multiply()
2285 "vst1.32 {d2[0]}, [r0]!\n" in Multiply()
2311 "pld [%[lhs]]\n" in Multiply()
2312 "pld [%[rhs]]\n" in Multiply()
2315 "vmov.i32 q0, #0\n" in Multiply()
2316 "vmov.i32 q1, #0\n" in Multiply()
2317 "vmov.i32 q2, #0\n" in Multiply()
2318 "vmov.i32 q3, q0\n" in Multiply()
2324 "subs %[count], %[count], #8\n" in Multiply()
2326 "vld1.32 {d8, d9}, [%[lhs]:64]!\n" in Multiply()
2327 "vld1.32 {d10, d11}, [%[rhs]:64]!\n" in Multiply()
2328 "pld [%[lhs], #64]\n" in Multiply()
2329 "pld [%[rhs], #64]\n" in Multiply()
2330 "vmull.u8 q6, d10, d8\n" in Multiply()
2331 "vmull.u8 q7, d11, d8\n" in Multiply()
2332 "vmull.u8 q8, d10, d9\n" in Multiply()
2333 "vmull.u8 q9, d11, d9\n" in Multiply()
2334 "vpadal.u16 q0, q6\n" in Multiply()
2335 "vpadal.u16 q1, q7\n" in Multiply()
2336 "vpadal.u16 q2, q8\n" in Multiply()
2337 "vpadal.u16 q3, q9\n" in Multiply()
2340 "bgt 1b\n" in Multiply()
2343 "vld1.32 {d8, d9}, [%[lhs]:64]!\n" in Multiply()
2344 "vld1.32 {d10, d11}, [%[rhs]:64]!\n" in Multiply()
2345 "vdup.32 q6, d8[0]\n" in Multiply()
2346 "vdup.32 q4, d8[1]\n" in Multiply()
2349 "add r0, %[result], %[stride]\n" in Multiply()
2352 "vpadd.u32 d0, d0, d1\n" in Multiply()
2353 "vpadd.u32 d2, d2, d3\n" in Multiply()
2354 "vpadd.u32 d0, d0, d2\n" in Multiply()
2355 "vpadd.u32 d4, d4, d5\n" in Multiply()
2356 "vpadd.u32 d6, d6, d7\n" in Multiply()
2357 "vpadd.u32 d4, d4, d6\n" in Multiply()
2360 "vadd.s32 q0, q0, q6\n" in Multiply()
2361 "vadd.s32 q2, q2, q4\n" in Multiply()
2362 "vadd.s32 q0, q0, q5\n" in Multiply()
2363 "vadd.s32 q2, q2, q5\n" in Multiply()
2366 "vst1.32 {d0}, [%[result]]!\n" in Multiply()
2367 "vst1.32 {d4}, [r0]!\n" in Multiply()
2394 "pld [%[lhs]]\n" in Multiply()
2395 "pld [%[rhs]]\n" in Multiply()
2398 "vmov.i32 q0, #0\n" in Multiply()
2399 "vmov.i32 q1, #0\n" in Multiply()
2400 "vmov.i32 q2, #0\n" in Multiply()
2401 "vmov.i32 q3, q0\n" in Multiply()
2402 "vmov.i32 q4, q1\n" in Multiply()
2403 "vmov.i32 q5, q2\n" in Multiply()
2409 "subs %[count], %[count], #8\n" in Multiply()
2411 "vld1.32 {d12, d13}, [%[lhs]:64]!\n" in Multiply()
2412 "vld1.32 {d14, d15, d16}, [%[rhs]:64]!\n" in Multiply()
2413 "pld [%[lhs], #64]\n" in Multiply()
2414 "pld [%[rhs], #64]\n" in Multiply()
2415 "vmull.u8 q9, d14, d12\n" in Multiply()
2416 "vmull.u8 q10, d15, d12\n" in Multiply()
2417 "vmull.u8 q11, d16, d12\n" in Multiply()
2418 "vmull.u8 q12, d14, d13\n" in Multiply()
2419 "vmull.u8 q13, d15, d13\n" in Multiply()
2420 "vmull.u8 q14, d16, d13\n" in Multiply()
2421 "vpadal.u16 q0, q9\n" in Multiply()
2422 "vpadal.u16 q1, q10\n" in Multiply()
2423 "vpadal.u16 q2, q11\n" in Multiply()
2424 "vpadal.u16 q3, q12\n" in Multiply()
2425 "vpadal.u16 q4, q13\n" in Multiply()
2426 "vpadal.u16 q5, q14\n" in Multiply()
2429 "bgt 1b\n" in Multiply()
2432 "vld1.32 {d12, d13}, [%[lhs]:64]!\n" in Multiply()
2433 "vld1.32 {d14, d15}, [%[rhs]:64]!\n" in Multiply()
2434 "vdup.32 q8, d12[0]\n" in Multiply()
2435 "vdup.32 q6, d12[1]\n" in Multiply()
2438 "add r0, %[result], %[stride]\n" in Multiply()
2441 "vpadd.u32 d0, d0, d1\n" in Multiply()
2442 "vpadd.u32 d2, d2, d3\n" in Multiply()
2443 "vpadd.u32 d4, d4, d5\n" in Multiply()
2444 "vpadd.u32 d0, d0, d2\n" in Multiply()
2445 "vpadd.u32 d1, d4, d4\n" in Multiply()
2446 "vpadd.u32 d6, d6, d7\n" in Multiply()
2447 "vpadd.u32 d8, d8, d9\n" in Multiply()
2448 "vpadd.u32 d10, d10, d11\n" in Multiply()
2449 "vpadd.u32 d6, d6, d8\n" in Multiply()
2450 "vpadd.u32 d7, d10, d10\n" in Multiply()
2453 "vadd.s32 q0, q0, q8\n" in Multiply()
2454 "vadd.s32 q3, q3, q6\n" in Multiply()
2455 "vadd.s32 q0, q0, q7\n" in Multiply()
2456 "vadd.s32 q3, q3, q7\n" in Multiply()
2459 "vst1.32 {d0}, [%[result]]!\n" in Multiply()
2460 "vst1.32 {d1[0]}, [%[result]]!\n" in Multiply()
2461 "vst1.32 {d6}, [r0]!\n" in Multiply()
2462 "vst1.32 {d7[0]}, [r0]!\n" in Multiply()
2490 "pld [%[lhs]]\n" in Multiply()
2491 "pld [%[rhs]]\n" in Multiply()
2494 "vmov.i32 q0, #0\n" in Multiply()
2495 "vmov.i32 q1, #0\n" in Multiply()
2496 "vmov.i32 q2, #0\n" in Multiply()
2497 "vmov.i32 q3, q0\n" in Multiply()
2498 "vmov.i32 q4, q1\n" in Multiply()
2499 "vmov.i32 q5, q2\n" in Multiply()
2500 "vmov.i32 q6, q3\n" in Multiply()
2501 "vmov.i32 q7, q4\n" in Multiply()
2506 "vld1.8 {d18, d19, d20, d21}, [%[rhs]:256]!\n" in Multiply()
2507 "vld1.8 {d16}, [%[lhs]:64]!\n" in Multiply()
2508 "vmull.u8 q11, d16, d18\n" in Multiply()
2509 "vld1.8 {d17}, [%[lhs]:64]!\n" in Multiply()
2510 "vmull.u8 q12, d16, d19\n" in Multiply()
2511 "pld [%[rhs], #64]\n" in Multiply()
2512 "vmull.u8 q13, d16, d20\n" in Multiply()
2513 "pld [%[lhs], #64]\n" in Multiply()
2514 "vmull.u8 q14, d16, d21\n" in Multiply()
2515 "vmull.u8 q15, d17, d18\n" in Multiply()
2516 "vpadal.u16 q0, q11\n" in Multiply()
2517 "vpadal.u16 q1, q12\n" in Multiply()
2518 "vpadal.u16 q2, q13\n" in Multiply()
2519 "vmull.u8 q11, d17, d19\n" in Multiply()
2520 "vmull.u8 q12, d17, d20\n" in Multiply()
2521 "vmull.u8 q13, d17, d21\n" in Multiply()
2524 "subs %[count], %[count], #8\n" in Multiply()
2526 "vpadal.u16 q3, q14\n" in Multiply()
2527 "vpadal.u16 q4, q15\n" in Multiply()
2528 "vpadal.u16 q5, q11\n" in Multiply()
2529 "vpadal.u16 q6, q12\n" in Multiply()
2530 "vpadal.u16 q7, q13\n" in Multiply()
2533 "bgt 1b\n" in Multiply()
2536 "vld1.32 {d16, d17}, [%[lhs]:64]!\n" in Multiply()
2537 "vld1.32 {d18, d19}, [%[rhs]:64]!\n" in Multiply()
2538 "vdup.32 q10, d16[0]\n" in Multiply()
2539 "vdup.32 q8, d16[1]\n" in Multiply()
2542 "add r0, %[result], %[stride]\n" in Multiply()
2545 "vpadd.u32 d0, d0, d1\n" in Multiply()
2546 "vpadd.u32 d2, d2, d3\n" in Multiply()
2547 "vpadd.u32 d4, d4, d5\n" in Multiply()
2548 "vpadd.u32 d6, d6, d7\n" in Multiply()
2549 "vpadd.u32 d0, d0, d2\n" in Multiply()
2550 "vpadd.u32 d1, d4, d6\n" in Multiply()
2551 "vpadd.u32 d8, d8, d9\n" in Multiply()
2552 "vpadd.u32 d10, d10, d11\n" in Multiply()
2553 "vpadd.u32 d12, d12, d13\n" in Multiply()
2554 "vpadd.u32 d14, d14, d15\n" in Multiply()
2555 "vpadd.u32 d8, d8, d10\n" in Multiply()
2556 "vpadd.u32 d9, d12, d14\n" in Multiply()
2559 "vadd.s32 q0, q0, q10\n" in Multiply()
2560 "vadd.s32 q4, q4, q8\n" in Multiply()
2561 "vadd.s32 q0, q0, q9\n" in Multiply()
2562 "vadd.s32 q4, q4, q9\n" in Multiply()
2565 "vst1.32 {d0, d1}, [%[result]]!\n" in Multiply()
2566 "vst1.32 {d8, d9}, [r0]!\n" in Multiply()
2594 "pld [%[lhs]]\n" in Multiply()
2595 "pld [%[rhs]]\n" in Multiply()
2598 "vmov.i32 q0, #0\n" in Multiply()
2599 "vmov.i32 q1, #0\n" in Multiply()
2600 "vmov.i32 q2, #0\n" in Multiply()
2606 "subs %[count], %[count], #8\n" in Multiply()
2608 "vld1.32 {d6, d7, d8}, [%[lhs]:64]!\n" in Multiply()
2609 "vld1.32 {d9}, [%[rhs]:64]!\n" in Multiply()
2610 "pld [%[lhs], #64]\n" in Multiply()
2611 "pld [%[rhs], #64]\n" in Multiply()
2612 "vmull.u8 q5, d9, d6\n" in Multiply()
2613 "vmull.u8 q6, d9, d7\n" in Multiply()
2614 "vmull.u8 q7, d9, d8\n" in Multiply()
2615 "vpadal.u16 q0, q5\n" in Multiply()
2616 "vpadal.u16 q1, q6\n" in Multiply()
2617 "vpadal.u16 q2, q7\n" in Multiply()
2620 "bgt 1b\n" in Multiply()
2623 "vld1.32 {d8, d9}, [%[lhs]:64]!\n" in Multiply()
2624 "vld1.32 {d10, d11}, [%[rhs]:64]!\n" in Multiply()
2625 "vdup.32 q3, d8[0]\n" in Multiply()
2626 "vdup.32 q6, d8[1]\n" in Multiply()
2627 "vdup.32 q4, d9[0]\n" in Multiply()
2630 "add r0, %[result], %[stride]\n" in Multiply()
2631 "add r1, r0, %[stride]\n" in Multiply()
2634 "vpadd.u32 d0, d0, d1\n" in Multiply()
2635 "vpadd.u32 d0, d0, d0\n" in Multiply()
2636 "vpadd.u32 d2, d2, d3\n" in Multiply()
2637 "vpadd.u32 d2, d2, d2\n" in Multiply()
2638 "vpadd.u32 d4, d4, d5\n" in Multiply()
2639 "vpadd.u32 d4, d4, d4\n" in Multiply()
2642 "vadd.s32 q0, q0, q3\n" in Multiply()
2643 "vadd.s32 q1, q1, q6\n" in Multiply()
2644 "vadd.s32 q2, q2, q4\n" in Multiply()
2645 "vadd.s32 q0, q0, q5\n" in Multiply()
2646 "vadd.s32 q1, q1, q5\n" in Multiply()
2647 "vadd.s32 q2, q2, q5\n" in Multiply()
2650 "vst1.32 {d0[0]}, [%[result]]!\n" in Multiply()
2651 "vst1.32 {d2[0]}, [r0]!\n" in Multiply()
2652 "vst1.32 {d4[0]}, [r1]!\n" in Multiply()
2678 "pld [%[lhs]]\n" in Multiply()
2679 "pld [%[rhs]]\n" in Multiply()
2682 "vmov.i32 q0, #0\n" in Multiply()
2683 "vmov.i32 q1, #0\n" in Multiply()
2684 "vmov.i32 q2, #0\n" in Multiply()
2685 "vmov.i32 q3, q0\n" in Multiply()
2686 "vmov.i32 q4, q1\n" in Multiply()
2687 "vmov.i32 q5, q2\n" in Multiply()
2693 "subs %[count], %[count], #8\n" in Multiply()
2695 "vld1.32 {d12, d13, d14}, [%[lhs]:64]!\n" in Multiply()
2696 "vld1.32 {d15, d16}, [%[rhs]:64]!\n" in Multiply()
2697 "pld [%[lhs], #64]\n" in Multiply()
2698 "pld [%[rhs], #64]\n" in Multiply()
2699 "vmull.u8 q9, d15, d12\n" in Multiply()
2700 "vmull.u8 q10, d16, d12\n" in Multiply()
2701 "vmull.u8 q11, d15, d13\n" in Multiply()
2702 "vmull.u8 q12, d16, d13\n" in Multiply()
2703 "vmull.u8 q13, d15, d14\n" in Multiply()
2704 "vmull.u8 q14, d16, d14\n" in Multiply()
2705 "vpadal.u16 q0, q9\n" in Multiply()
2706 "vpadal.u16 q1, q10\n" in Multiply()
2707 "vpadal.u16 q2, q11\n" in Multiply()
2708 "vpadal.u16 q3, q12\n" in Multiply()
2709 "vpadal.u16 q4, q13\n" in Multiply()
2710 "vpadal.u16 q5, q14\n" in Multiply()
2713 "bgt 1b\n" in Multiply()
2716 "vld1.32 {d12, d13}, [%[lhs]:64]!\n" in Multiply()
2717 "vld1.32 {d14, d15}, [%[rhs]:64]!\n" in Multiply()
2718 "vdup.32 q8, d12[0]\n" in Multiply()
2719 "vdup.32 q9, d12[1]\n" in Multiply()
2720 "vdup.32 q6, d13[0]\n" in Multiply()
2723 "add r0, %[result], %[stride]\n" in Multiply()
2724 "add r1, r0, %[stride]\n" in Multiply()
2727 "vpadd.u32 d0, d0, d1\n" in Multiply()
2728 "vpadd.u32 d2, d2, d3\n" in Multiply()
2729 "vpadd.u32 d0, d0, d2\n" in Multiply()
2730 "vpadd.u32 d4, d4, d5\n" in Multiply()
2731 "vpadd.u32 d6, d6, d7\n" in Multiply()
2732 "vpadd.u32 d4, d4, d6\n" in Multiply()
2733 "vpadd.u32 d8, d8, d9\n" in Multiply()
2734 "vpadd.u32 d10, d10, d11\n" in Multiply()
2735 "vpadd.u32 d8, d8, d10\n" in Multiply()
2738 "vadd.s32 q0, q0, q8\n" in Multiply()
2739 "vadd.s32 q2, q2, q9\n" in Multiply()
2740 "vadd.s32 q4, q4, q6\n" in Multiply()
2741 "vadd.s32 q0, q0, q7\n" in Multiply()
2742 "vadd.s32 q2, q2, q7\n" in Multiply()
2743 "vadd.s32 q4, q4, q7\n" in Multiply()
2746 "vst1.32 {d0}, [%[result]]!\n" in Multiply()
2747 "vst1.32 {d4}, [r0]!\n" in Multiply()
2748 "vst1.32 {d8}, [r1]!\n" in Multiply()
2776 "pld [%[lhs]]\n" in Multiply()
2777 "pld [%[rhs]]\n" in Multiply()
2780 "vmov.i32 q0, #0\n" in Multiply()
2781 "vmov.i32 q1, #0\n" in Multiply()
2782 "vmov.i32 q2, #0\n" in Multiply()
2783 "vmov.i32 q3, q0\n" in Multiply()
2784 "vmov.i32 q4, q1\n" in Multiply()
2785 "vmov.i32 q5, q2\n" in Multiply()
2786 "vmov.i32 q6, q3\n" in Multiply()
2787 "vmov.i32 q7, q4\n" in Multiply()
2788 "vmov.i32 q8, q5\n" in Multiply()
2793 "vld1.8 {d21, d22, d23}, [%[rhs]:64]!\n" in Multiply()
2794 "vld1.8 {d18}, [%[lhs]:64]!\n" in Multiply()
2795 "vmull.u8 q12, d18, d21\n" in Multiply()
2796 "vld1.8 {d19}, [%[lhs]:64]!\n" in Multiply()
2797 "vmull.u8 q13, d18, d22\n" in Multiply()
2798 "vld1.8 {d20}, [%[lhs]:64]!\n" in Multiply()
2799 "vmull.u8 q14, d18, d23\n" in Multiply()
2800 "pld [%[lhs], #64]\n" in Multiply()
2801 "vmull.u8 q15, d19, d21\n" in Multiply()
2802 "pld [%[rhs], #64]\n" in Multiply()
2803 "vpadal.u16 q0, q12\n" in Multiply()
2804 "vpadal.u16 q1, q13\n" in Multiply()
2805 "vpadal.u16 q2, q14\n" in Multiply()
2806 "vpadal.u16 q3, q15\n" in Multiply()
2807 "vmull.u8 q12, d19, d22\n" in Multiply()
2808 "vmull.u8 q13, d19, d23\n" in Multiply()
2809 "vmull.u8 q14, d20, d21\n" in Multiply()
2810 "vmull.u8 q15, d20, d22\n" in Multiply()
2813 "subs %[count], %[count], #8\n" in Multiply()
2815 "vmull.u8 q9, d20, d23\n" in Multiply()
2816 "vpadal.u16 q4, q12\n" in Multiply()
2817 "vpadal.u16 q5, q13\n" in Multiply()
2818 "vpadal.u16 q6, q14\n" in Multiply()
2819 "vpadal.u16 q7, q15\n" in Multiply()
2820 "vpadal.u16 q8, q9\n" in Multiply()
2823 "bgt 1b\n" in Multiply()
2826 "vld1.32 {d18, d19}, [%[lhs]:64]!\n" in Multiply()
2827 "vld1.32 {d20, d21}, [%[rhs]:64]!\n" in Multiply()
2828 "vdup.32 q11, d18[0]\n" in Multiply()
2829 "vdup.32 q12, d18[1]\n" in Multiply()
2830 "vdup.32 q9, d19[0]\n" in Multiply()
2833 "add r0, %[result], %[stride]\n" in Multiply()
2834 "add r1, r0, %[stride]\n" in Multiply()
2837 "vpadd.u32 d0, d0, d1\n" in Multiply()
2838 "vpadd.u32 d2, d2, d3\n" in Multiply()
2839 "vpadd.u32 d4, d4, d5\n" in Multiply()
2840 "vpadd.u32 d0, d0, d2\n" in Multiply()
2841 "vpadd.u32 d1, d4, d4\n" in Multiply()
2842 "vpadd.u32 d6, d6, d7\n" in Multiply()
2843 "vpadd.u32 d8, d8, d9\n" in Multiply()
2844 "vpadd.u32 d10, d10, d11\n" in Multiply()
2845 "vpadd.u32 d6, d6, d8\n" in Multiply()
2846 "vpadd.u32 d7, d10, d10\n" in Multiply()
2847 "vpadd.u32 d12, d12, d13\n" in Multiply()
2848 "vpadd.u32 d14, d14, d15\n" in Multiply()
2849 "vpadd.u32 d16, d16, d17\n" in Multiply()
2850 "vpadd.u32 d12, d12, d14\n" in Multiply()
2851 "vpadd.u32 d13, d16, d16\n" in Multiply()
2854 "vadd.s32 q0, q0, q11\n" in Multiply()
2855 "vadd.s32 q3, q3, q12\n" in Multiply()
2856 "vadd.s32 q6, q6, q9\n" in Multiply()
2857 "vadd.s32 q0, q0, q10\n" in Multiply()
2858 "vadd.s32 q3, q3, q10\n" in Multiply()
2859 "vadd.s32 q6, q6, q10\n" in Multiply()
2862 "vst1.32 {d0}, [%[result]]!\n" in Multiply()
2863 "vst1.32 {d1[0]}, [%[result]]!\n" in Multiply()
2864 "vst1.32 {d6}, [r0]!\n" in Multiply()
2865 "vst1.32 {d7[0]}, [r0]!\n" in Multiply()
2866 "vst1.32 {d12}, [r1]!\n" in Multiply()
2867 "vst1.32 {d13[0]}, [r1]!\n" in Multiply()
2895 "pld [%[lhs]]\n" in Multiply()
2896 "pld [%[rhs]]\n" in Multiply()
2899 "vmov.i32 q0, #0\n" in Multiply()
2905 "subs %[count], %[count], #8\n" in Multiply()
2907 "vld1.32 {d2}, [%[lhs]:64]!\n" in Multiply()
2908 "vld1.32 {d3}, [%[rhs]:64]!\n" in Multiply()
2909 "pld [%[lhs], #64]\n" in Multiply()
2910 "pld [%[rhs], #64]\n" in Multiply()
2911 "vmull.u8 q2, d3, d2\n" in Multiply()
2912 "vpadal.u16 q0, q2\n" in Multiply()
2915 "bgt 1b\n" in Multiply()
2918 "vld1.32 {d8, d9}, [%[lhs]:64]!\n" in Multiply()
2919 "vld1.32 {d10, d11}, [%[rhs]:64]!\n" in Multiply()
2920 "vdup.32 q6, %[scale]\n" in Multiply()
2921 "vdup.32 q4, d8[0]\n" in Multiply()
2926 "vpadd.u32 d0, d0, d1\n" in Multiply()
2927 "vpadd.u32 d0, d0, d0\n" in Multiply()
2930 "vadd.s32 q0, q0, q4\n" in Multiply()
2931 "vadd.s32 q0, q0, q5\n" in Multiply()
2932 "vcvt.f32.s32 q0, q0\n" in Multiply()
2933 "vmul.f32 q0, q0, q6\n" in Multiply()
2936 "vst1.32 {d0[0]}, [%[result]]!\n" in Multiply()
2963 "pld [%[lhs]]\n" in Multiply()
2964 "pld [%[rhs]]\n" in Multiply()
2967 "vmov.i32 q0, #0\n" in Multiply()
2968 "vmov.i32 q1, #0\n" in Multiply()
2974 "subs %[count], %[count], #8\n" in Multiply()
2976 "vld1.32 {d4}, [%[lhs]:64]!\n" in Multiply()
2977 "vld1.32 {d5, d6}, [%[rhs]:64]!\n" in Multiply()
2978 "pld [%[lhs], #64]\n" in Multiply()
2979 "pld [%[rhs], #64]\n" in Multiply()
2980 "vmull.u8 q4, d5, d4\n" in Multiply()
2981 "vmull.u8 q5, d6, d4\n" in Multiply()
2982 "vpadal.u16 q0, q4\n" in Multiply()
2983 "vpadal.u16 q1, q5\n" in Multiply()
2986 "bgt 1b\n" in Multiply()
2989 "vld1.32 {d8, d9}, [%[lhs]:64]!\n" in Multiply()
2990 "vld1.32 {d10, d11}, [%[rhs]:64]!\n" in Multiply()
2991 "vdup.32 q6, %[scale]\n" in Multiply()
2992 "vdup.32 q4, d8[0]\n" in Multiply()
2997 "vpadd.u32 d0, d0, d1\n" in Multiply()
2998 "vpadd.u32 d2, d2, d3\n" in Multiply()
2999 "vpadd.u32 d0, d0, d2\n" in Multiply()
3002 "vadd.s32 q0, q0, q4\n" in Multiply()
3003 "vadd.s32 q0, q0, q5\n" in Multiply()
3004 "vcvt.f32.s32 q0, q0\n" in Multiply()
3005 "vmul.f32 q0, q0, q6\n" in Multiply()
3008 "vst1.32 {d0}, [%[result]]!\n" in Multiply()
3035 "pld [%[lhs]]\n" in Multiply()
3036 "pld [%[rhs]]\n" in Multiply()
3039 "vmov.i32 q0, #0\n" in Multiply()
3040 "vmov.i32 q1, #0\n" in Multiply()
3041 "vmov.i32 q2, #0\n" in Multiply()
3047 "subs %[count], %[count], #8\n" in Multiply()
3049 "vld1.32 {d6}, [%[lhs]:64]!\n" in Multiply()
3050 "vld1.32 {d7, d8, d9}, [%[rhs]:64]!\n" in Multiply()
3051 "pld [%[lhs], #64]\n" in Multiply()
3052 "pld [%[rhs], #64]\n" in Multiply()
3053 "vmull.u8 q5, d7, d6\n" in Multiply()
3054 "vmull.u8 q6, d8, d6\n" in Multiply()
3055 "vmull.u8 q7, d9, d6\n" in Multiply()
3056 "vpadal.u16 q0, q5\n" in Multiply()
3057 "vpadal.u16 q1, q6\n" in Multiply()
3058 "vpadal.u16 q2, q7\n" in Multiply()
3061 "bgt 1b\n" in Multiply()
3064 "vld1.32 {d8, d9}, [%[lhs]:64]!\n" in Multiply()
3065 "vld1.32 {d10, d11}, [%[rhs]:64]!\n" in Multiply()
3066 "vdup.32 q6, %[scale]\n" in Multiply()
3067 "vdup.32 q4, d8[0]\n" in Multiply()
3072 "vpadd.u32 d0, d0, d1\n" in Multiply()
3073 "vpadd.u32 d2, d2, d3\n" in Multiply()
3074 "vpadd.u32 d4, d4, d5\n" in Multiply()
3075 "vpadd.u32 d0, d0, d2\n" in Multiply()
3076 "vpadd.u32 d1, d4, d4\n" in Multiply()
3079 "vadd.s32 q0, q0, q4\n" in Multiply()
3080 "vadd.s32 q0, q0, q5\n" in Multiply()
3081 "vcvt.f32.s32 q0, q0\n" in Multiply()
3082 "vmul.f32 q0, q0, q6\n" in Multiply()
3085 "vst1.32 {d0}, [%[result]]!\n" in Multiply()
3086 "vst1.32 {d1[0]}, [%[result]]!\n" in Multiply()
3113 "pld [%[lhs]]\n" in Multiply()
3114 "pld [%[rhs]]\n" in Multiply()
3117 "vmov.i32 q0, #0\n" in Multiply()
3118 "vmov.i32 q1, #0\n" in Multiply()
3119 "vmov.i32 q2, #0\n" in Multiply()
3120 "vmov.i32 q3, q0\n" in Multiply()
3126 "subs %[count], %[count], #8\n" in Multiply()
3128 "vld1.32 {d8}, [%[lhs]:64]!\n" in Multiply()
3129 "vld1.32 {d9, d10, d11, d12}, [%[rhs]:64]!\n" in Multiply()
3130 "pld [%[lhs], #64]\n" in Multiply()
3131 "pld [%[rhs], #64]\n" in Multiply()
3132 "vmull.u8 q7, d9, d8\n" in Multiply()
3133 "vmull.u8 q8, d10, d8\n" in Multiply()
3134 "vmull.u8 q9, d11, d8\n" in Multiply()
3135 "vmull.u8 q10, d12, d8\n" in Multiply()
3136 "vpadal.u16 q0, q7\n" in Multiply()
3137 "vpadal.u16 q1, q8\n" in Multiply()
3138 "vpadal.u16 q2, q9\n" in Multiply()
3139 "vpadal.u16 q3, q10\n" in Multiply()
3142 "bgt 1b\n" in Multiply()
3145 "vld1.32 {d8, d9}, [%[lhs]:64]!\n" in Multiply()
3146 "vld1.32 {d10, d11}, [%[rhs]:64]!\n" in Multiply()
3147 "vdup.32 q6, %[scale]\n" in Multiply()
3148 "vdup.32 q4, d8[0]\n" in Multiply()
3153 "vpadd.u32 d0, d0, d1\n" in Multiply()
3154 "vpadd.u32 d2, d2, d3\n" in Multiply()
3155 "vpadd.u32 d4, d4, d5\n" in Multiply()
3156 "vpadd.u32 d6, d6, d7\n" in Multiply()
3157 "vpadd.u32 d0, d0, d2\n" in Multiply()
3158 "vpadd.u32 d1, d4, d6\n" in Multiply()
3161 "vadd.s32 q0, q0, q4\n" in Multiply()
3162 "vadd.s32 q0, q0, q5\n" in Multiply()
3163 "vcvt.f32.s32 q0, q0\n" in Multiply()
3164 "vmul.f32 q0, q0, q6\n" in Multiply()
3167 "vst1.32 {d0, d1}, [%[result]]!\n" in Multiply()
3195 "pld [%[lhs]]\n" in Multiply()
3196 "pld [%[rhs]]\n" in Multiply()
3199 "vmov.i32 q0, #0\n" in Multiply()
3200 "vmov.i32 q1, #0\n" in Multiply()
3201 "vmov.i32 q2, #0\n" in Multiply()
3202 "vmov.i32 q3, q0\n" in Multiply()
3203 "vmov.i32 q4, q1\n" in Multiply()
3209 "subs %[count], %[count], #8\n" in Multiply()
3211 "vld1.32 {d10, d11, d12, d13}, [%[rhs]:64]!\n" in Multiply()
3212 "vld1.32 {d14}, [%[lhs]:64]!\n" in Multiply()
3213 "pld [%[lhs], #64]\n" in Multiply()
3214 "vmull.u8 q8, d10, d14\n" in Multiply()
3215 "vmull.u8 q9, d11, d14\n" in Multiply()
3216 "vmull.u8 q10, d12, d14\n" in Multiply()
3217 "vmull.u8 q11, d13, d14\n" in Multiply()
3218 "vld1.32 {d10}, [%[rhs]:64]!\n" in Multiply()
3219 "pld [%[rhs], #128]\n" in Multiply()
3220 "vpadal.u16 q0, q8\n" in Multiply()
3221 "vpadal.u16 q1, q9\n" in Multiply()
3222 "vpadal.u16 q2, q10\n" in Multiply()
3223 "vpadal.u16 q3, q11\n" in Multiply()
3224 "vmull.u8 q8, d10, d14\n" in Multiply()
3225 "vpadal.u16 q4, q8\n" in Multiply()
3228 "bgt 1b\n" in Multiply()
3231 "vld1.32 {d10, d11}, [%[lhs]:64]!\n" in Multiply()
3232 "vld1.32 {d12, d13, d14, d15}, [%[rhs]:64]!\n" in Multiply()
3233 "vdup.32 q8, %[scale]\n" in Multiply()
3234 "vdup.32 q5, d10[0]\n" in Multiply()
3239 "vpadd.u32 d0, d0, d1\n" in Multiply()
3240 "vpadd.u32 d2, d2, d3\n" in Multiply()
3241 "vpadd.u32 d4, d4, d5\n" in Multiply()
3242 "vpadd.u32 d6, d6, d7\n" in Multiply()
3243 "vpadd.u32 d8, d8, d9\n" in Multiply()
3244 "vpadd.u32 d0, d0, d2\n" in Multiply()
3245 "vpadd.u32 d1, d4, d6\n" in Multiply()
3246 "vpadd.u32 d2, d8, d8\n" in Multiply()
3249 "vadd.s32 q0, q0, q5\n" in Multiply()
3250 "vadd.s32 q1, q1, q5\n" in Multiply()
3251 "vadd.s32 q0, q0, q6\n" in Multiply()
3252 "vadd.s32 q1, q1, q7\n" in Multiply()
3253 "vcvt.f32.s32 q0, q0\n" in Multiply()
3254 "vcvt.f32.s32 q1, q1\n" in Multiply()
3255 "vmul.f32 q0, q0, q8\n" in Multiply()
3256 "vmul.f32 q1, q1, q8\n" in Multiply()
3259 "vst1.32 {d0, d1}, [%[result]]!\n" in Multiply()
3260 "vst1.32 {d2[0]}, [%[result]]!\n" in Multiply()
3288 "pld [%[lhs]]\n" in Multiply()
3289 "pld [%[rhs]]\n" in Multiply()
3292 "vmov.i32 q0, #0\n" in Multiply()
3293 "vmov.i32 q1, #0\n" in Multiply()
3294 "vmov.i32 q2, #0\n" in Multiply()
3295 "vmov.i32 q3, q0\n" in Multiply()
3296 "vmov.i32 q4, q1\n" in Multiply()
3297 "vmov.i32 q5, q2\n" in Multiply()
3303 "subs %[count], %[count], #8\n" in Multiply()
3305 "vld1.32 {d12, d13, d14, d15}, [%[rhs]:64]!\n" in Multiply()
3306 "vld1.32 {d16}, [%[lhs]:64]!\n" in Multiply()
3307 "pld [%[lhs], #64]\n" in Multiply()
3308 "vmull.u8 q9, d12, d16\n" in Multiply()
3309 "vmull.u8 q10, d13, d16\n" in Multiply()
3310 "vmull.u8 q11, d14, d16\n" in Multiply()
3311 "vmull.u8 q12, d15, d16\n" in Multiply()
3312 "vld1.32 {d12, d13}, [%[rhs]:64]!\n" in Multiply()
3313 "pld [%[rhs], #128]\n" in Multiply()
3314 "vpadal.u16 q0, q9\n" in Multiply()
3315 "vpadal.u16 q1, q10\n" in Multiply()
3316 "vpadal.u16 q2, q11\n" in Multiply()
3317 "vpadal.u16 q3, q12\n" in Multiply()
3318 "vmull.u8 q9, d12, d16\n" in Multiply()
3319 "vmull.u8 q10, d13, d16\n" in Multiply()
3320 "vpadal.u16 q4, q9\n" in Multiply()
3321 "vpadal.u16 q5, q10\n" in Multiply()
3324 "bgt 1b\n" in Multiply()
3327 "vld1.32 {d12, d13}, [%[lhs]:64]!\n" in Multiply()
3328 "vld1.32 {d14, d15, d16, d17}, [%[rhs]:64]!\n" in Multiply()
3329 "vdup.32 q9, %[scale]\n" in Multiply()
3330 "vdup.32 q6, d12[0]\n" in Multiply()
3335 "vpadd.u32 d0, d0, d1\n" in Multiply()
3336 "vpadd.u32 d2, d2, d3\n" in Multiply()
3337 "vpadd.u32 d4, d4, d5\n" in Multiply()
3338 "vpadd.u32 d6, d6, d7\n" in Multiply()
3339 "vpadd.u32 d8, d8, d9\n" in Multiply()
3340 "vpadd.u32 d10, d10, d11\n" in Multiply()
3341 "vpadd.u32 d0, d0, d2\n" in Multiply()
3342 "vpadd.u32 d1, d4, d6\n" in Multiply()
3343 "vpadd.u32 d2, d8, d10\n" in Multiply()
3346 "vadd.s32 q0, q0, q6\n" in Multiply()
3347 "vadd.s32 q1, q1, q6\n" in Multiply()
3348 "vadd.s32 q0, q0, q7\n" in Multiply()
3349 "vadd.s32 q1, q1, q8\n" in Multiply()
3350 "vcvt.f32.s32 q0, q0\n" in Multiply()
3351 "vcvt.f32.s32 q1, q1\n" in Multiply()
3352 "vmul.f32 q0, q0, q9\n" in Multiply()
3353 "vmul.f32 q1, q1, q9\n" in Multiply()
3356 "vst1.32 {d0, d1, d2}, [%[result]]!\n" in Multiply()
3384 "pld [%[lhs]]\n" in Multiply()
3385 "pld [%[rhs]]\n" in Multiply()
3388 "vmov.i32 q0, #0\n" in Multiply()
3389 "vmov.i32 q1, #0\n" in Multiply()
3390 "vmov.i32 q2, #0\n" in Multiply()
3391 "vmov.i32 q3, q0\n" in Multiply()
3392 "vmov.i32 q4, q1\n" in Multiply()
3393 "vmov.i32 q5, q2\n" in Multiply()
3394 "vmov.i32 q6, q3\n" in Multiply()
3400 "subs %[count], %[count], #8\n" in Multiply()
3402 "vld1.32 {d14, d15, d16, d17}, [%[rhs]:64]!\n" in Multiply()
3403 "vld1.32 {d18}, [%[lhs]:64]!\n" in Multiply()
3404 "pld [%[lhs], #64]\n" in Multiply()
3405 "vmull.u8 q10, d14, d18\n" in Multiply()
3406 "vmull.u8 q11, d15, d18\n" in Multiply()
3407 "vmull.u8 q12, d16, d18\n" in Multiply()
3408 "vmull.u8 q13, d17, d18\n" in Multiply()
3409 "vld1.32 {d14, d15, d16}, [%[rhs]:64]!\n" in Multiply()
3410 "pld [%[rhs], #128]\n" in Multiply()
3411 "vpadal.u16 q0, q10\n" in Multiply()
3412 "vpadal.u16 q1, q11\n" in Multiply()
3413 "vpadal.u16 q2, q12\n" in Multiply()
3414 "vpadal.u16 q3, q13\n" in Multiply()
3415 "vmull.u8 q10, d14, d18\n" in Multiply()
3416 "vmull.u8 q11, d15, d18\n" in Multiply()
3417 "vmull.u8 q12, d16, d18\n" in Multiply()
3418 "vpadal.u16 q4, q10\n" in Multiply()
3419 "vpadal.u16 q5, q11\n" in Multiply()
3420 "vpadal.u16 q6, q12\n" in Multiply()
3423 "bgt 1b\n" in Multiply()
3426 "vld1.32 {d14, d15}, [%[lhs]:64]!\n" in Multiply()
3427 "vld1.32 {d16, d17, d18, d19}, [%[rhs]:64]!\n" in Multiply()
3428 "vdup.32 q10, %[scale]\n" in Multiply()
3429 "vdup.32 q7, d14[0]\n" in Multiply()
3434 "vpadd.u32 d0, d0, d1\n" in Multiply()
3435 "vpadd.u32 d2, d2, d3\n" in Multiply()
3436 "vpadd.u32 d4, d4, d5\n" in Multiply()
3437 "vpadd.u32 d6, d6, d7\n" in Multiply()
3438 "vpadd.u32 d8, d8, d9\n" in Multiply()
3439 "vpadd.u32 d10, d10, d11\n" in Multiply()
3440 "vpadd.u32 d12, d12, d13\n" in Multiply()
3441 "vpadd.u32 d0, d0, d2\n" in Multiply()
3442 "vpadd.u32 d1, d4, d6\n" in Multiply()
3443 "vpadd.u32 d2, d8, d10\n" in Multiply()
3444 "vpadd.u32 d3, d12, d12\n" in Multiply()
3447 "vadd.s32 q0, q0, q7\n" in Multiply()
3448 "vadd.s32 q1, q1, q7\n" in Multiply()
3449 "vadd.s32 q0, q0, q8\n" in Multiply()
3450 "vadd.s32 q1, q1, q9\n" in Multiply()
3451 "vcvt.f32.s32 q0, q0\n" in Multiply()
3452 "vcvt.f32.s32 q1, q1\n" in Multiply()
3453 "vmul.f32 q0, q0, q10\n" in Multiply()
3454 "vmul.f32 q1, q1, q10\n" in Multiply()
3457 "vst1.32 {d0, d1, d2}, [%[result]]!\n" in Multiply()
3458 "vst1.32 {d3[0]}, [%[result]]!\n" in Multiply()
3486 "pld [%[lhs]]\n" in Multiply()
3487 "pld [%[rhs]]\n" in Multiply()
3490 "vmov.i32 q0, #0\n" in Multiply()
3491 "vmov.i32 q1, #0\n" in Multiply()
3492 "vmov.i32 q2, #0\n" in Multiply()
3493 "vmov.i32 q3, q0\n" in Multiply()
3494 "vmov.i32 q4, q1\n" in Multiply()
3495 "vmov.i32 q5, q2\n" in Multiply()
3496 "vmov.i32 q6, q3\n" in Multiply()
3497 "vmov.i32 q7, q4\n" in Multiply()
3502 "vld1.32 {d17, d18, d19, d20}, [%[rhs]:256]!\n" in Multiply()
3503 "vld1.32 {d16}, [%[lhs]:64]!\n" in Multiply()
3504 "vmull.u8 q11, d16, d17\n" in Multiply()
3505 "vmull.u8 q12, d16, d18\n" in Multiply()
3506 "vmull.u8 q13, d16, d19\n" in Multiply()
3507 "vmull.u8 q14, d16, d20\n" in Multiply()
3508 "vld1.32 {d17, d18, d19, d20}, [%[rhs]:256]!\n" in Multiply()
3509 "vpadal.u16 q0, q11\n" in Multiply()
3510 "vpadal.u16 q1, q12\n" in Multiply()
3511 "vpadal.u16 q2, q13\n" in Multiply()
3512 "vpadal.u16 q3, q14\n" in Multiply()
3513 "pld [%[rhs], #256]\n" in Multiply()
3514 "vmull.u8 q15, d16, d17\n" in Multiply()
3515 "vmull.u8 q11, d16, d18\n" in Multiply()
3516 "vmull.u8 q12, d16, d19\n" in Multiply()
3517 "vmull.u8 q13, d16, d20\n" in Multiply()
3518 "pld [%[lhs], #32]\n" in Multiply()
3521 "subs %[count], %[count], #8\n" in Multiply()
3523 "vpadal.u16 q4, q15\n" in Multiply()
3524 "vpadal.u16 q5, q11\n" in Multiply()
3525 "vpadal.u16 q6, q12\n" in Multiply()
3526 "vpadal.u16 q7, q13\n" in Multiply()
3529 "bgt 1b\n" in Multiply()
3532 "vld1.32 {d16, d17}, [%[lhs]:64]!\n" in Multiply()
3533 "vld1.32 {d18, d19, d20, d21}, [%[rhs]:64]!\n" in Multiply()
3534 "vdup.32 q11, %[scale]\n" in Multiply()
3535 "vdup.32 q8, d16[0]\n" in Multiply()
3540 "vpadd.u32 d0, d0, d1\n" in Multiply()
3541 "vpadd.u32 d2, d2, d3\n" in Multiply()
3542 "vpadd.u32 d4, d4, d5\n" in Multiply()
3543 "vpadd.u32 d6, d6, d7\n" in Multiply()
3544 "vpadd.u32 d8, d8, d9\n" in Multiply()
3545 "vpadd.u32 d10, d10, d11\n" in Multiply()
3546 "vpadd.u32 d12, d12, d13\n" in Multiply()
3547 "vpadd.u32 d14, d14, d15\n" in Multiply()
3548 "vpadd.u32 d0, d0, d2\n" in Multiply()
3549 "vpadd.u32 d1, d4, d6\n" in Multiply()
3550 "vpadd.u32 d2, d8, d10\n" in Multiply()
3551 "vpadd.u32 d3, d12, d14\n" in Multiply()
3554 "vadd.s32 q0, q0, q8\n" in Multiply()
3555 "vadd.s32 q1, q1, q8\n" in Multiply()
3556 "vadd.s32 q0, q0, q9\n" in Multiply()
3557 "vadd.s32 q1, q1, q10\n" in Multiply()
3558 "vcvt.f32.s32 q0, q0\n" in Multiply()
3559 "vcvt.f32.s32 q1, q1\n" in Multiply()
3560 "vmul.f32 q0, q0, q11\n" in Multiply()
3561 "vmul.f32 q1, q1, q11\n" in Multiply()
3564 "vst1.32 {d0, d1, d2, d3}, [%[result]]!\n" in Multiply()
3593 "pld [%[lhs]]\n" in Multiply()
3594 "pld [%[rhs]]\n" in Multiply()
3597 "vmov.i32 q0, #0\n" in Multiply()
3598 "vmov.i32 q1, #0\n" in Multiply()
3604 "subs %[count], %[count], #8\n" in Multiply()
3606 "vld1.32 {d4, d5}, [%[lhs]:64]!\n" in Multiply()
3607 "vld1.32 {d6}, [%[rhs]:64]!\n" in Multiply()
3608 "pld [%[lhs], #64]\n" in Multiply()
3609 "pld [%[rhs], #64]\n" in Multiply()
3610 "vmull.u8 q4, d6, d4\n" in Multiply()
3611 "vmull.u8 q5, d6, d5\n" in Multiply()
3612 "vpadal.u16 q0, q4\n" in Multiply()
3613 "vpadal.u16 q1, q5\n" in Multiply()
3616 "bgt 1b\n" in Multiply()
3619 "vld1.32 {d8, d9}, [%[lhs]:64]!\n" in Multiply()
3620 "vld1.32 {d10, d11}, [%[rhs]:64]!\n" in Multiply()
3621 "vdup.32 q6, %[scale]\n" in Multiply()
3622 "vdup.32 q2, d8[0]\n" in Multiply()
3623 "vdup.32 q4, d8[1]\n" in Multiply()
3626 "add r0, %[result], %[stride]\n" in Multiply()
3629 "vpadd.u32 d0, d0, d1\n" in Multiply()
3630 "vpadd.u32 d0, d0, d0\n" in Multiply()
3631 "vpadd.u32 d2, d2, d3\n" in Multiply()
3632 "vpadd.u32 d2, d2, d2\n" in Multiply()
3635 "vadd.s32 q0, q0, q2\n" in Multiply()
3636 "vadd.s32 q1, q1, q4\n" in Multiply()
3637 "vadd.s32 q0, q0, q5\n" in Multiply()
3638 "vadd.s32 q1, q1, q5\n" in Multiply()
3639 "vcvt.f32.s32 q0, q0\n" in Multiply()
3640 "vcvt.f32.s32 q1, q1\n" in Multiply()
3641 "vmul.f32 q0, q0, q6\n" in Multiply()
3642 "vmul.f32 q1, q1, q6\n" in Multiply()
3645 "vst1.32 {d0[0]}, [%[result]]!\n" in Multiply()
3646 "vst1.32 {d2[0]}, [r0]!\n" in Multiply()
3673 "pld [%[lhs]]\n" in Multiply()
3674 "pld [%[rhs]]\n" in Multiply()
3677 "vmov.i32 q0, #0\n" in Multiply()
3678 "vmov.i32 q1, #0\n" in Multiply()
3679 "vmov.i32 q2, #0\n" in Multiply()
3680 "vmov.i32 q3, q0\n" in Multiply()
3686 "subs %[count], %[count], #8\n" in Multiply()
3688 "vld1.32 {d8, d9}, [%[lhs]:64]!\n" in Multiply()
3689 "vld1.32 {d10, d11}, [%[rhs]:64]!\n" in Multiply()
3690 "pld [%[lhs], #64]\n" in Multiply()
3691 "pld [%[rhs], #64]\n" in Multiply()
3692 "vmull.u8 q6, d10, d8\n" in Multiply()
3693 "vmull.u8 q7, d11, d8\n" in Multiply()
3694 "vmull.u8 q8, d10, d9\n" in Multiply()
3695 "vmull.u8 q9, d11, d9\n" in Multiply()
3696 "vpadal.u16 q0, q6\n" in Multiply()
3697 "vpadal.u16 q1, q7\n" in Multiply()
3698 "vpadal.u16 q2, q8\n" in Multiply()
3699 "vpadal.u16 q3, q9\n" in Multiply()
3702 "bgt 1b\n" in Multiply()
3705 "vld1.32 {d8, d9}, [%[lhs]:64]!\n" in Multiply()
3706 "vld1.32 {d10, d11}, [%[rhs]:64]!\n" in Multiply()
3707 "vdup.32 q6, %[scale]\n" in Multiply()
3708 "vdup.32 q7, d8[0]\n" in Multiply()
3709 "vdup.32 q4, d8[1]\n" in Multiply()
3712 "add r0, %[result], %[stride]\n" in Multiply()
3715 "vpadd.u32 d0, d0, d1\n" in Multiply()
3716 "vpadd.u32 d2, d2, d3\n" in Multiply()
3717 "vpadd.u32 d0, d0, d2\n" in Multiply()
3718 "vpadd.u32 d4, d4, d5\n" in Multiply()
3719 "vpadd.u32 d6, d6, d7\n" in Multiply()
3720 "vpadd.u32 d4, d4, d6\n" in Multiply()
3723 "vadd.s32 q0, q0, q7\n" in Multiply()
3724 "vadd.s32 q2, q2, q4\n" in Multiply()
3725 "vadd.s32 q0, q0, q5\n" in Multiply()
3726 "vadd.s32 q2, q2, q5\n" in Multiply()
3727 "vcvt.f32.s32 q0, q0\n" in Multiply()
3728 "vcvt.f32.s32 q2, q2\n" in Multiply()
3729 "vmul.f32 q0, q0, q6\n" in Multiply()
3730 "vmul.f32 q2, q2, q6\n" in Multiply()
3733 "vst1.32 {d0}, [%[result]]!\n" in Multiply()
3734 "vst1.32 {d4}, [r0]!\n" in Multiply()
3762 "pld [%[lhs]]\n" in Multiply()
3763 "pld [%[rhs]]\n" in Multiply()
3766 "vmov.i32 q0, #0\n" in Multiply()
3767 "vmov.i32 q1, #0\n" in Multiply()
3768 "vmov.i32 q2, #0\n" in Multiply()
3769 "vmov.i32 q3, q0\n" in Multiply()
3770 "vmov.i32 q4, q1\n" in Multiply()
3771 "vmov.i32 q5, q2\n" in Multiply()
3777 "subs %[count], %[count], #8\n" in Multiply()
3779 "vld1.32 {d12, d13}, [%[lhs]:64]!\n" in Multiply()
3780 "vld1.32 {d14, d15, d16}, [%[rhs]:64]!\n" in Multiply()
3781 "pld [%[lhs], #64]\n" in Multiply()
3782 "pld [%[rhs], #64]\n" in Multiply()
3783 "vmull.u8 q9, d14, d12\n" in Multiply()
3784 "vmull.u8 q10, d15, d12\n" in Multiply()
3785 "vmull.u8 q11, d16, d12\n" in Multiply()
3786 "vmull.u8 q12, d14, d13\n" in Multiply()
3787 "vmull.u8 q13, d15, d13\n" in Multiply()
3788 "vmull.u8 q14, d16, d13\n" in Multiply()
3789 "vpadal.u16 q0, q9\n" in Multiply()
3790 "vpadal.u16 q1, q10\n" in Multiply()
3791 "vpadal.u16 q2, q11\n" in Multiply()
3792 "vpadal.u16 q3, q12\n" in Multiply()
3793 "vpadal.u16 q4, q13\n" in Multiply()
3794 "vpadal.u16 q5, q14\n" in Multiply()
3797 "bgt 1b\n" in Multiply()
3800 "vld1.32 {d12, d13}, [%[lhs]:64]!\n" in Multiply()
3801 "vld1.32 {d14, d15}, [%[rhs]:64]!\n" in Multiply()
3802 "vdup.32 q8, %[scale]\n" in Multiply()
3803 "vdup.32 q9, d12[0]\n" in Multiply()
3804 "vdup.32 q6, d12[1]\n" in Multiply()
3807 "add r0, %[result], %[stride]\n" in Multiply()
3810 "vpadd.u32 d0, d0, d1\n" in Multiply()
3811 "vpadd.u32 d2, d2, d3\n" in Multiply()
3812 "vpadd.u32 d4, d4, d5\n" in Multiply()
3813 "vpadd.u32 d0, d0, d2\n" in Multiply()
3814 "vpadd.u32 d1, d4, d4\n" in Multiply()
3815 "vpadd.u32 d6, d6, d7\n" in Multiply()
3816 "vpadd.u32 d8, d8, d9\n" in Multiply()
3817 "vpadd.u32 d10, d10, d11\n" in Multiply()
3818 "vpadd.u32 d6, d6, d8\n" in Multiply()
3819 "vpadd.u32 d7, d10, d10\n" in Multiply()
3822 "vadd.s32 q0, q0, q9\n" in Multiply()
3823 "vadd.s32 q3, q3, q6\n" in Multiply()
3824 "vadd.s32 q0, q0, q7\n" in Multiply()
3825 "vadd.s32 q3, q3, q7\n" in Multiply()
3826 "vcvt.f32.s32 q0, q0\n" in Multiply()
3827 "vcvt.f32.s32 q3, q3\n" in Multiply()
3828 "vmul.f32 q0, q0, q8\n" in Multiply()
3829 "vmul.f32 q3, q3, q8\n" in Multiply()
3832 "vst1.32 {d0}, [%[result]]!\n" in Multiply()
3833 "vst1.32 {d1[0]}, [%[result]]!\n" in Multiply()
3834 "vst1.32 {d6}, [r0]!\n" in Multiply()
3835 "vst1.32 {d7[0]}, [r0]!\n" in Multiply()
3864 "pld [%[lhs]]\n" in Multiply()
3865 "pld [%[rhs]]\n" in Multiply()
3868 "vmov.i32 q0, #0\n" in Multiply()
3869 "vmov.i32 q1, #0\n" in Multiply()
3870 "vmov.i32 q2, #0\n" in Multiply()
3871 "vmov.i32 q3, q0\n" in Multiply()
3872 "vmov.i32 q4, q1\n" in Multiply()
3873 "vmov.i32 q5, q2\n" in Multiply()
3874 "vmov.i32 q6, q3\n" in Multiply()
3875 "vmov.i32 q7, q4\n" in Multiply()
3880 "vld1.8 {d18, d19, d20, d21}, [%[rhs]:256]!\n" in Multiply()
3881 "vld1.8 {d16}, [%[lhs]:64]!\n" in Multiply()
3882 "vmull.u8 q11, d16, d18\n" in Multiply()
3883 "vld1.8 {d17}, [%[lhs]:64]!\n" in Multiply()
3884 "vmull.u8 q12, d16, d19\n" in Multiply()
3885 "pld [%[rhs], #64]\n" in Multiply()
3886 "vmull.u8 q13, d16, d20\n" in Multiply()
3887 "pld [%[lhs], #64]\n" in Multiply()
3888 "vmull.u8 q14, d16, d21\n" in Multiply()
3889 "vmull.u8 q15, d17, d18\n" in Multiply()
3890 "vpadal.u16 q0, q11\n" in Multiply()
3891 "vpadal.u16 q1, q12\n" in Multiply()
3892 "vpadal.u16 q2, q13\n" in Multiply()
3893 "vmull.u8 q11, d17, d19\n" in Multiply()
3894 "vmull.u8 q12, d17, d20\n" in Multiply()
3895 "vmull.u8 q13, d17, d21\n" in Multiply()
3898 "subs %[count], %[count], #8\n" in Multiply()
3900 "vpadal.u16 q3, q14\n" in Multiply()
3901 "vpadal.u16 q4, q15\n" in Multiply()
3902 "vpadal.u16 q5, q11\n" in Multiply()
3903 "vpadal.u16 q6, q12\n" in Multiply()
3904 "vpadal.u16 q7, q13\n" in Multiply()
3907 "bgt 1b\n" in Multiply()
3910 "vld1.32 {d16, d17}, [%[lhs]:64]!\n" in Multiply()
3911 "vld1.32 {d18, d19}, [%[rhs]:64]!\n" in Multiply()
3912 "vdup.32 q10, %[scale]\n" in Multiply()
3913 "vdup.32 q11, d16[0]\n" in Multiply()
3914 "vdup.32 q8, d16[1]\n" in Multiply()
3917 "add r0, %[result], %[stride]\n" in Multiply()
3920 "vpadd.u32 d0, d0, d1\n" in Multiply()
3921 "vpadd.u32 d2, d2, d3\n" in Multiply()
3922 "vpadd.u32 d4, d4, d5\n" in Multiply()
3923 "vpadd.u32 d6, d6, d7\n" in Multiply()
3924 "vpadd.u32 d0, d0, d2\n" in Multiply()
3925 "vpadd.u32 d1, d4, d6\n" in Multiply()
3926 "vpadd.u32 d8, d8, d9\n" in Multiply()
3927 "vpadd.u32 d10, d10, d11\n" in Multiply()
3928 "vpadd.u32 d12, d12, d13\n" in Multiply()
3929 "vpadd.u32 d14, d14, d15\n" in Multiply()
3930 "vpadd.u32 d8, d8, d10\n" in Multiply()
3931 "vpadd.u32 d9, d12, d14\n" in Multiply()
3934 "vadd.s32 q0, q0, q11\n" in Multiply()
3935 "vadd.s32 q4, q4, q8\n" in Multiply()
3936 "vadd.s32 q0, q0, q9\n" in Multiply()
3937 "vadd.s32 q4, q4, q9\n" in Multiply()
3938 "vcvt.f32.s32 q0, q0\n" in Multiply()
3939 "vcvt.f32.s32 q4, q4\n" in Multiply()
3940 "vmul.f32 q0, q0, q10\n" in Multiply()
3941 "vmul.f32 q4, q4, q10\n" in Multiply()
3944 "vst1.32 {d0, d1}, [%[result]]!\n" in Multiply()
3945 "vst1.32 {d8, d9}, [r0]!\n" in Multiply()
3974 "pld [%[lhs]]\n" in Multiply()
3975 "pld [%[rhs]]\n" in Multiply()
3978 "vmov.i32 q0, #0\n" in Multiply()
3979 "vmov.i32 q1, #0\n" in Multiply()
3980 "vmov.i32 q2, #0\n" in Multiply()
3986 "subs %[count], %[count], #8\n" in Multiply()
3988 "vld1.32 {d6, d7, d8}, [%[lhs]:64]!\n" in Multiply()
3989 "vld1.32 {d9}, [%[rhs]:64]!\n" in Multiply()
3990 "pld [%[lhs], #64]\n" in Multiply()
3991 "pld [%[rhs], #64]\n" in Multiply()
3992 "vmull.u8 q5, d9, d6\n" in Multiply()
3993 "vmull.u8 q6, d9, d7\n" in Multiply()
3994 "vmull.u8 q7, d9, d8\n" in Multiply()
3995 "vpadal.u16 q0, q5\n" in Multiply()
3996 "vpadal.u16 q1, q6\n" in Multiply()
3997 "vpadal.u16 q2, q7\n" in Multiply()
4000 "bgt 1b\n" in Multiply()
4003 "vld1.32 {d8, d9}, [%[lhs]:64]!\n" in Multiply()
4004 "vld1.32 {d10, d11}, [%[rhs]:64]!\n" in Multiply()
4005 "vdup.32 q6, %[scale]\n" in Multiply()
4006 "vdup.32 q3, d8[0]\n" in Multiply()
4007 "vdup.32 q7, d8[1]\n" in Multiply()
4008 "vdup.32 q4, d9[0]\n" in Multiply()
4011 "add r0, %[result], %[stride]\n" in Multiply()
4012 "add r1, r0, %[stride]\n" in Multiply()
4015 "vpadd.u32 d0, d0, d1\n" in Multiply()
4016 "vpadd.u32 d0, d0, d0\n" in Multiply()
4017 "vpadd.u32 d2, d2, d3\n" in Multiply()
4018 "vpadd.u32 d2, d2, d2\n" in Multiply()
4019 "vpadd.u32 d4, d4, d5\n" in Multiply()
4020 "vpadd.u32 d4, d4, d4\n" in Multiply()
4023 "vadd.s32 q0, q0, q3\n" in Multiply()
4024 "vadd.s32 q1, q1, q7\n" in Multiply()
4025 "vadd.s32 q2, q2, q4\n" in Multiply()
4026 "vadd.s32 q0, q0, q5\n" in Multiply()
4027 "vadd.s32 q1, q1, q5\n" in Multiply()
4028 "vadd.s32 q2, q2, q5\n" in Multiply()
4029 "vcvt.f32.s32 q0, q0\n" in Multiply()
4030 "vcvt.f32.s32 q1, q1\n" in Multiply()
4031 "vcvt.f32.s32 q2, q2\n" in Multiply()
4032 "vmul.f32 q0, q0, q6\n" in Multiply()
4033 "vmul.f32 q1, q1, q6\n" in Multiply()
4034 "vmul.f32 q2, q2, q6\n" in Multiply()
4037 "vst1.32 {d0[0]}, [%[result]]!\n" in Multiply()
4038 "vst1.32 {d2[0]}, [r0]!\n" in Multiply()
4039 "vst1.32 {d4[0]}, [r1]!\n" in Multiply()
4066 "pld [%[lhs]]\n" in Multiply()
4067 "pld [%[rhs]]\n" in Multiply()
4070 "vmov.i32 q0, #0\n" in Multiply()
4071 "vmov.i32 q1, #0\n" in Multiply()
4072 "vmov.i32 q2, #0\n" in Multiply()
4073 "vmov.i32 q3, q0\n" in Multiply()
4074 "vmov.i32 q4, q1\n" in Multiply()
4075 "vmov.i32 q5, q2\n" in Multiply()
4081 "subs %[count], %[count], #8\n" in Multiply()
4083 "vld1.32 {d12, d13, d14}, [%[lhs]:64]!\n" in Multiply()
4084 "vld1.32 {d15, d16}, [%[rhs]:64]!\n" in Multiply()
4085 "pld [%[lhs], #64]\n" in Multiply()
4086 "pld [%[rhs], #64]\n" in Multiply()
4087 "vmull.u8 q9, d15, d12\n" in Multiply()
4088 "vmull.u8 q10, d16, d12\n" in Multiply()
4089 "vmull.u8 q11, d15, d13\n" in Multiply()
4090 "vmull.u8 q12, d16, d13\n" in Multiply()
4091 "vmull.u8 q13, d15, d14\n" in Multiply()
4092 "vmull.u8 q14, d16, d14\n" in Multiply()
4093 "vpadal.u16 q0, q9\n" in Multiply()
4094 "vpadal.u16 q1, q10\n" in Multiply()
4095 "vpadal.u16 q2, q11\n" in Multiply()
4096 "vpadal.u16 q3, q12\n" in Multiply()
4097 "vpadal.u16 q4, q13\n" in Multiply()
4098 "vpadal.u16 q5, q14\n" in Multiply()
4101 "bgt 1b\n" in Multiply()
4104 "vld1.32 {d12, d13}, [%[lhs]:64]!\n" in Multiply()
4105 "vld1.32 {d14, d15}, [%[rhs]:64]!\n" in Multiply()
4106 "vdup.32 q8, %[scale]\n" in Multiply()
4107 "vdup.32 q9, d12[0]\n" in Multiply()
4108 "vdup.32 q10, d12[1]\n" in Multiply()
4109 "vdup.32 q6, d13[0]\n" in Multiply()
4112 "add r0, %[result], %[stride]\n" in Multiply()
4113 "add r1, r0, %[stride]\n" in Multiply()
4116 "vpadd.u32 d0, d0, d1\n" in Multiply()
4117 "vpadd.u32 d2, d2, d3\n" in Multiply()
4118 "vpadd.u32 d0, d0, d2\n" in Multiply()
4119 "vpadd.u32 d4, d4, d5\n" in Multiply()
4120 "vpadd.u32 d6, d6, d7\n" in Multiply()
4121 "vpadd.u32 d4, d4, d6\n" in Multiply()
4122 "vpadd.u32 d8, d8, d9\n" in Multiply()
4123 "vpadd.u32 d10, d10, d11\n" in Multiply()
4124 "vpadd.u32 d8, d8, d10\n" in Multiply()
4127 "vadd.s32 q0, q0, q9\n" in Multiply()
4128 "vadd.s32 q2, q2, q10\n" in Multiply()
4129 "vadd.s32 q4, q4, q6\n" in Multiply()
4130 "vadd.s32 q0, q0, q7\n" in Multiply()
4131 "vadd.s32 q2, q2, q7\n" in Multiply()
4132 "vadd.s32 q4, q4, q7\n" in Multiply()
4133 "vcvt.f32.s32 q0, q0\n" in Multiply()
4134 "vcvt.f32.s32 q2, q2\n" in Multiply()
4135 "vcvt.f32.s32 q4, q4\n" in Multiply()
4136 "vmul.f32 q0, q0, q8\n" in Multiply()
4137 "vmul.f32 q2, q2, q8\n" in Multiply()
4138 "vmul.f32 q4, q4, q8\n" in Multiply()
4141 "vst1.32 {d0}, [%[result]]!\n" in Multiply()
4142 "vst1.32 {d4}, [r0]!\n" in Multiply()
4143 "vst1.32 {d8}, [r1]!\n" in Multiply()
4172 "pld [%[lhs]]\n" in Multiply()
4173 "pld [%[rhs]]\n" in Multiply()
4176 "vmov.i32 q0, #0\n" in Multiply()
4177 "vmov.i32 q1, #0\n" in Multiply()
4178 "vmov.i32 q2, #0\n" in Multiply()
4179 "vmov.i32 q3, q0\n" in Multiply()
4180 "vmov.i32 q4, q1\n" in Multiply()
4181 "vmov.i32 q5, q2\n" in Multiply()
4182 "vmov.i32 q6, q3\n" in Multiply()
4183 "vmov.i32 q7, q4\n" in Multiply()
4184 "vmov.i32 q8, q5\n" in Multiply()
4189 "vld1.8 {d21, d22, d23}, [%[rhs]:64]!\n" in Multiply()
4190 "vld1.8 {d18}, [%[lhs]:64]!\n" in Multiply()
4191 "vmull.u8 q12, d18, d21\n" in Multiply()
4192 "vld1.8 {d19}, [%[lhs]:64]!\n" in Multiply()
4193 "vmull.u8 q13, d18, d22\n" in Multiply()
4194 "vld1.8 {d20}, [%[lhs]:64]!\n" in Multiply()
4195 "vmull.u8 q14, d18, d23\n" in Multiply()
4196 "pld [%[lhs], #64]\n" in Multiply()
4197 "vmull.u8 q15, d19, d21\n" in Multiply()
4198 "pld [%[rhs], #64]\n" in Multiply()
4199 "vpadal.u16 q0, q12\n" in Multiply()
4200 "vpadal.u16 q1, q13\n" in Multiply()
4201 "vpadal.u16 q2, q14\n" in Multiply()
4202 "vpadal.u16 q3, q15\n" in Multiply()
4203 "vmull.u8 q12, d19, d22\n" in Multiply()
4204 "vmull.u8 q13, d19, d23\n" in Multiply()
4205 "vmull.u8 q14, d20, d21\n" in Multiply()
4206 "vmull.u8 q15, d20, d22\n" in Multiply()
4209 "subs %[count], %[count], #8\n" in Multiply()
4211 "vmull.u8 q9, d20, d23\n" in Multiply()
4212 "vpadal.u16 q4, q12\n" in Multiply()
4213 "vpadal.u16 q5, q13\n" in Multiply()
4214 "vpadal.u16 q6, q14\n" in Multiply()
4215 "vpadal.u16 q7, q15\n" in Multiply()
4216 "vpadal.u16 q8, q9\n" in Multiply()
4219 "bgt 1b\n" in Multiply()
4222 "vld1.32 {d18, d19}, [%[lhs]:64]!\n" in Multiply()
4223 "vld1.32 {d20, d21}, [%[rhs]:64]!\n" in Multiply()
4224 "vdup.32 q11, %[scale]\n" in Multiply()
4225 "vdup.32 q12, d18[0]\n" in Multiply()
4226 "vdup.32 q13, d18[1]\n" in Multiply()
4227 "vdup.32 q9, d19[0]\n" in Multiply()
4230 "add r0, %[result], %[stride]\n" in Multiply()
4231 "add r1, r0, %[stride]\n" in Multiply()
4234 "vpadd.u32 d0, d0, d1\n" in Multiply()
4235 "vpadd.u32 d2, d2, d3\n" in Multiply()
4236 "vpadd.u32 d4, d4, d5\n" in Multiply()
4237 "vpadd.u32 d0, d0, d2\n" in Multiply()
4238 "vpadd.u32 d1, d4, d4\n" in Multiply()
4239 "vpadd.u32 d6, d6, d7\n" in Multiply()
4240 "vpadd.u32 d8, d8, d9\n" in Multiply()
4241 "vpadd.u32 d10, d10, d11\n" in Multiply()
4242 "vpadd.u32 d6, d6, d8\n" in Multiply()
4243 "vpadd.u32 d7, d10, d10\n" in Multiply()
4244 "vpadd.u32 d12, d12, d13\n" in Multiply()
4245 "vpadd.u32 d14, d14, d15\n" in Multiply()
4246 "vpadd.u32 d16, d16, d17\n" in Multiply()
4247 "vpadd.u32 d12, d12, d14\n" in Multiply()
4248 "vpadd.u32 d13, d16, d16\n" in Multiply()
4251 "vadd.s32 q0, q0, q12\n" in Multiply()
4252 "vadd.s32 q3, q3, q13\n" in Multiply()
4253 "vadd.s32 q6, q6, q9\n" in Multiply()
4254 "vadd.s32 q0, q0, q10\n" in Multiply()
4255 "vadd.s32 q3, q3, q10\n" in Multiply()
4256 "vadd.s32 q6, q6, q10\n" in Multiply()
4257 "vcvt.f32.s32 q0, q0\n" in Multiply()
4258 "vcvt.f32.s32 q3, q3\n" in Multiply()
4259 "vcvt.f32.s32 q6, q6\n" in Multiply()
4260 "vmul.f32 q0, q0, q11\n" in Multiply()
4261 "vmul.f32 q3, q3, q11\n" in Multiply()
4262 "vmul.f32 q6, q6, q11\n" in Multiply()
4265 "vst1.32 {d0}, [%[result]]!\n" in Multiply()
4266 "vst1.32 {d1[0]}, [%[result]]!\n" in Multiply()
4267 "vst1.32 {d6}, [r0]!\n" in Multiply()
4268 "vst1.32 {d7[0]}, [r0]!\n" in Multiply()
4269 "vst1.32 {d12}, [r1]!\n" in Multiply()
4270 "vst1.32 {d13[0]}, [r1]!\n" in Multiply()