Lines Matching full:srcs
255 __m128i srcs[8];
257 srcs[0] = LoadAligned16(src_x);
260 srcs[1] = LoadAligned16(src_x);
262 srcs[2] = LoadAligned16(src_x);
265 srcs[3] = LoadAligned16(src_x);
267 srcs[4] = LoadAligned16(src_x);
270 srcs[5] = LoadAligned16(src_x);
272 srcs[6] = LoadAligned16(src_x);
282 srcs[next_row] = LoadAligned16(src_x);
286 SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps);
295 srcs[0] = srcs[1];
297 srcs[1] = srcs[2];
298 srcs[2] = srcs[3];
300 srcs[3] = srcs[4];
301 srcs[4] = srcs[5];
303 srcs[5] = srcs[6];
304 srcs[6] = srcs[7];
321 __m128i srcs[9];
322 srcs[0] = LoadAligned16(src);
325 srcs[2] = LoadAligned16(src);
327 srcs[1] = _mm_unpacklo_epi64(_mm_srli_si128(srcs[0], 8), srcs[2]);
329 srcs[4] = LoadAligned16(src);
331 srcs[3] = _mm_unpacklo_epi64(_mm_srli_si128(srcs[2], 8), srcs[4]);
333 srcs[6] = LoadAligned16(src);
335 srcs[5] = _mm_unpacklo_epi64(_mm_srli_si128(srcs[4], 8), srcs[6]);
342 srcs[num_taps] = LoadAligned16(src);
344 srcs[num_taps - 1] = _mm_unpacklo_epi64(
345 _mm_srli_si128(srcs[num_taps - 2], 8), srcs[num_taps]);
348 SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps);
360 srcs[0] = srcs[2];
362 srcs[1] = srcs[3];
363 srcs[2] = srcs[4];
365 srcs[3] = srcs[5];
366 srcs[4] = srcs[6];
368 srcs[5] = srcs[7];
369 srcs[6] = srcs[8];
386 __m128i srcs[9];
387 srcs[0] = LoadAligned16(src);
390 srcs[4] = LoadAligned16(src);
392 srcs[1] = _mm_alignr_epi8(srcs[4], srcs[0], 4);
394 srcs[2] = _mm_alignr_epi8(srcs[4], srcs[0], 8);
395 srcs[3] = _mm_alignr_epi8(srcs[4], srcs[0], 12);
401 srcs[next_row] = LoadAligned16(src);
404 srcs[1] = _mm_alignr_epi8(srcs[4], srcs[0], 4);
406 srcs[1] = _mm_alignr_epi8(srcs[4], srcs[0], 4);
407 srcs[2] = _mm_alignr_epi8(srcs[4], srcs[0], 8);
408 srcs[3] = _mm_alignr_epi8(srcs[4], srcs[0], 12);
410 srcs[2] = _mm_alignr_epi8(srcs[4], srcs[0], 8);
411 srcs[3] = _mm_alignr_epi8(srcs[4], srcs[0], 12);
412 srcs[5] = _mm_alignr_epi8(srcs[8], srcs[4], 4);
414 srcs[5] = _mm_alignr_epi8(srcs[8], srcs[4], 4);
415 srcs[6] = _mm_alignr_epi8(srcs[8], srcs[4], 8);
416 srcs[7] = _mm_alignr_epi8(srcs[8], srcs[4], 12);
420 SimpleSum2DVerticalTaps<num_taps, /*is_compound=*/false>(srcs, taps);
435 srcs[0] = srcs[4];
437 srcs[1] = srcs[5];
438 srcs[4] = srcs[8];
440 srcs[1] = srcs[5];
441 srcs[2] = srcs[6];
442 srcs[3] = srcs[7];
443 srcs[4] = srcs[8];
457 __m128i SumVerticalTaps(const __m128i* const srcs, const __m128i* const v_tap) {
462 v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
463 v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]);
464 v_src[2] = _mm_unpacklo_epi8(srcs[4], srcs[5]);
467 v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
468 v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]);
469 v_src[2] = _mm_unpacklo_epi8(srcs[4], srcs[5]);
470 v_src[3] = _mm_unpacklo_epi8(srcs[6], srcs[7]);
473 v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
476 v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
477 v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]);
490 __m128i srcs[9];
493 srcs[2] = _mm_setzero_si128();
495 srcs[0] = Load4(src);
503 srcs[0] = _mm_unpacklo_epi32(srcs[0], a);
506 srcs[2] = Load4(src);
509 srcs[1] = _mm_unpacklo_epi32(a, srcs[2]);
511 const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
526 srcs[0] = srcs[2];
530 srcs[4] = _mm_setzero_si128();
532 srcs[0] = Load4(src);
537 srcs[0] = _mm_unpacklo_epi32(srcs[0], a);
540 srcs[2] = Load4(src);
543 srcs[1] = _mm_unpacklo_epi32(a, srcs[2]);
550 srcs[2] = _mm_unpacklo_epi32(srcs[2], b);
553 srcs[4] = Load4(src);
556 srcs[3] = _mm_unpacklo_epi32(b, srcs[4]);
558 const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
573 srcs[0] = srcs[2];
574 srcs[1] = srcs[3];
575 srcs[2] = srcs[4];
579 srcs[6] = _mm_setzero_si128();
581 srcs[0] = Load4(src);
586 srcs[0] = _mm_unpacklo_epi32(srcs[0], a);
589 srcs[2] = Load4(src);
592 srcs[1] = _mm_unpacklo_epi32(a, srcs[2]);
596 srcs[2] = _mm_unpacklo_epi32(srcs[2], b);
599 srcs[4] = Load4(src);
602 srcs[3] = _mm_unpacklo_epi32(b, srcs[4]);
609 srcs[4] = _mm_unpacklo_epi32(srcs[4], c);
612 srcs[6] = Load4(src);
615 srcs[5] = _mm_unpacklo_epi32(c, srcs[6]);
617 const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
632 srcs[0] = srcs[2];
633 srcs[1] = srcs[3];
634 srcs[2] = srcs[4];
635 srcs[3] = srcs[5];
636 srcs[4] = srcs[6];
640 srcs[8] = _mm_setzero_si128();
642 srcs[0] = Load4(src);
647 srcs[0] = _mm_unpacklo_epi32(srcs[0], a);
650 srcs[2] = Load4(src);
653 srcs[1] = _mm_unpacklo_epi32(a, srcs[2]);
657 srcs[2] = _mm_unpacklo_epi32(srcs[2], b);
660 srcs[4] = Load4(src);
663 srcs[3] = _mm_unpacklo_epi32(b, srcs[4]);
667 srcs[4] = _mm_unpacklo_epi32(srcs[4], c);
670 srcs[6] = Load4(src);
673 srcs[5] = _mm_unpacklo_epi32(c, srcs[6]);
680 srcs[6] = _mm_unpacklo_epi32(srcs[6], d);
683 srcs[8] = Load4(src);
686 srcs[7] = _mm_unpacklo_epi32(d, srcs[8]);
688 const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
703 srcs[0] = srcs[2];
704 srcs[1] = srcs[3];
705 srcs[2] = srcs[4];
706 srcs[3] = srcs[5];
707 srcs[4] = srcs[6];
708 srcs[5] = srcs[7];
709 srcs[6] = srcs[8];
721 __m128i srcs[9];
724 srcs[2] = _mm_setzero_si128();
726 srcs[0] = Load2(src);
732 srcs[0] = Load2<1>(src, srcs[0]);
735 srcs[0] = Load2<2>(src, srcs[0]);
738 srcs[0] = Load2<3>(src, srcs[0]);
741 srcs[2] = Load2<0>(src, srcs[2]);
744 const __m128i srcs_0_2 = _mm_unpacklo_epi64(srcs[0], srcs[2]);
746 srcs[1] = _mm_srli_si128(srcs_0_2, 2);
747 // This uses srcs[0]..srcs[1].
748 const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
763 srcs[0] = srcs[2];
767 srcs[4] = _mm_setzero_si128();
770 srcs[0] = Load2(src);
773 srcs[0] = Load2<1>(src, srcs[0]);
776 srcs[0] = Load2<2>(src, srcs[0]);
782 srcs[0] = Load2<3>(src, srcs[0]);
785 srcs[4] = Load2<0>(src, srcs[4]);
788 srcs[4] = Load2<1>(src, srcs[4]);
791 srcs[4] = Load2<2>(src, srcs[4]);
794 const __m128i srcs_0_4 = _mm_unpacklo_epi64(srcs[0], srcs[4]);
796 srcs[1] = _mm_srli_si128(srcs_0_4, 2);
798 srcs[2] = _mm_srli_si128(srcs_0_4, 4);
800 srcs[3] = _mm_srli_si128(srcs_0_4, 6);
802 // This uses srcs[0]..srcs[3].
803 const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
818 srcs[0] = srcs[4];
825 srcs[8] = _mm_setzero_si128();
828 srcs[0] = Load2(src);
831 srcs[0] = Load2<1>(src, srcs[0]);
834 srcs[0] = Load2<2>(src, srcs[0]);
837 srcs[0] = Load2<3>(src, srcs[0]);
840 srcs[4] = Load2(src);
843 const __m128i srcs_0_4x = _mm_unpacklo_epi64(srcs[0], srcs[4]);
845 srcs[1] = _mm_srli_si128(srcs_0_4x, 2);
850 srcs[4] = Load2<1>(src, srcs[4]);
853 srcs[4] = Load2<2>(src, srcs[4]);
856 srcs[4] = Load2<3>(src, srcs[4]);
859 srcs[8] = Load2<0>(src, srcs[8]);
862 const __m128i srcs_0_4 = _mm_unpacklo_epi64(srcs[0], srcs[4]);
864 srcs[2] = _mm_srli_si128(srcs_0_4, 4);
866 srcs[3] = _mm_srli_si128(srcs_0_4, 6);
867 const __m128i srcs_4_8 = _mm_unpacklo_epi64(srcs[4], srcs[8]);
869 srcs[5] = _mm_srli_si128(srcs_4_8, 2);
871 // This uses srcs[0]..srcs[5].
872 const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
886 srcs[0] = srcs[4];
887 srcs[1] = srcs[5];
888 srcs[4] = srcs[8];
895 srcs[8] = _mm_setzero_si128();
897 srcs[0] = Load2(src);
900 srcs[0] = Load2<1>(src, srcs[0]);
903 srcs[0] = Load2<2>(src, srcs[0]);
906 srcs[0] = Load2<3>(src, srcs[0]);
909 srcs[4] = Load2(src);
912 srcs[4] = Load2<1>(src, srcs[4]);
915 srcs[4] = Load2<2>(src, srcs[4]);
919 const __m128i srcs_0_4 = _mm_unpacklo_epi64(srcs[0], srcs[4]);
921 srcs[1] = _mm_srli_si128(srcs_0_4, 2);
923 srcs[2] = _mm_srli_si128(srcs_0_4, 4);
925 srcs[3] = _mm_srli_si128(srcs_0_4, 6);
930 srcs[4] = Load2<3>(src, srcs[4]);
933 srcs[8] = Load2<0>(src, srcs[8]);
936 srcs[8] = Load2<1>(src, srcs[8]);
939 srcs[8] = Load2<2>(src, srcs[8]);
943 const __m128i srcs_4_8 = _mm_unpacklo_epi64(srcs[4], srcs[8]);
945 srcs[5] = _mm_srli_si128(srcs_4_8, 2);
947 srcs[6] = _mm_srli_si128(srcs_4_8, 4);
949 srcs[7] = _mm_srli_si128(srcs_4_8, 6);
951 // This uses srcs[0]..srcs[7].
952 const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
966 srcs[0] = srcs[4];
967 srcs[1] = srcs[5];
968 srcs[2] = srcs[6];
969 srcs[3] = srcs[7];
970 srcs[4] = srcs[8];