Lines Matching refs:__m128i
32 const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]); // argb in SubtractGreenFromBlueAndRed()
33 const __m128i A = _mm_srli_epi16(in, 8); // 0 a 0 g in SubtractGreenFromBlueAndRed()
34 const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0)); in SubtractGreenFromBlueAndRed()
35 const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0)); // 0g0g in SubtractGreenFromBlueAndRed()
36 const __m128i out = _mm_sub_epi8(in, C); in SubtractGreenFromBlueAndRed()
37 _mm_storeu_si128((__m128i*)&argb_data[i], out); in SubtractGreenFromBlueAndRed()
50 const __m128i mults_rb = _mm_set_epi16( in TransformColor()
55 const __m128i mults_b2 = _mm_set_epi16( in TransformColor()
58 const __m128i mask_ag = _mm_set1_epi32(0xff00ff00); // alpha-green masks in TransformColor()
59 const __m128i mask_rb = _mm_set1_epi32(0x00ff00ff); // red-blue masks in TransformColor()
62 const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]); // argb in TransformColor()
63 const __m128i A = _mm_and_si128(in, mask_ag); // a 0 g 0 in TransformColor()
64 const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0)); in TransformColor()
65 const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0)); // g0g0 in TransformColor()
66 const __m128i D = _mm_mulhi_epi16(C, mults_rb); // x dr x db1 in TransformColor()
67 const __m128i E = _mm_slli_epi16(in, 8); // r 0 b 0 in TransformColor()
68 const __m128i F = _mm_mulhi_epi16(E, mults_b2); // x db2 0 0 in TransformColor()
69 const __m128i G = _mm_srli_epi32(F, 16); // 0 0 x db2 in TransformColor()
70 const __m128i H = _mm_add_epi8(G, D); // x dr x db in TransformColor()
71 const __m128i I = _mm_and_si128(H, mask_rb); // 0 dr 0 db in TransformColor()
72 const __m128i out = _mm_sub_epi8(in, I); in TransformColor()
73 _mm_storeu_si128((__m128i*)&argb_data[i], out); in TransformColor()
87 const __m128i mults_r = _mm_set_epi16( in CollectColorBlueTransforms()
90 const __m128i mults_g = _mm_set_epi16( in CollectColorBlueTransforms()
93 const __m128i mask_g = _mm_set1_epi32(0x00ff00); // green mask in CollectColorBlueTransforms()
94 const __m128i mask_b = _mm_set1_epi32(0x0000ff); // blue mask in CollectColorBlueTransforms()
101 const __m128i in0 = _mm_loadu_si128((__m128i*)&src[x + 0]); in CollectColorBlueTransforms()
102 const __m128i in1 = _mm_loadu_si128((__m128i*)&src[x + SPAN / 2]); in CollectColorBlueTransforms()
103 const __m128i A0 = _mm_slli_epi16(in0, 8); // r 0 | b 0 in CollectColorBlueTransforms()
104 const __m128i A1 = _mm_slli_epi16(in1, 8); in CollectColorBlueTransforms()
105 const __m128i B0 = _mm_and_si128(in0, mask_g); // 0 0 | g 0 in CollectColorBlueTransforms()
106 const __m128i B1 = _mm_and_si128(in1, mask_g); in CollectColorBlueTransforms()
107 const __m128i C0 = _mm_mulhi_epi16(A0, mults_r); // x db | 0 0 in CollectColorBlueTransforms()
108 const __m128i C1 = _mm_mulhi_epi16(A1, mults_r); in CollectColorBlueTransforms()
109 const __m128i D0 = _mm_mulhi_epi16(B0, mults_g); // 0 0 | x db in CollectColorBlueTransforms()
110 const __m128i D1 = _mm_mulhi_epi16(B1, mults_g); in CollectColorBlueTransforms()
111 const __m128i E0 = _mm_sub_epi8(in0, D0); // x x | x b' in CollectColorBlueTransforms()
112 const __m128i E1 = _mm_sub_epi8(in1, D1); in CollectColorBlueTransforms()
113 const __m128i F0 = _mm_srli_epi32(C0, 16); // 0 0 | x db in CollectColorBlueTransforms()
114 const __m128i F1 = _mm_srli_epi32(C1, 16); in CollectColorBlueTransforms()
115 const __m128i G0 = _mm_sub_epi8(E0, F0); // 0 0 | x b' in CollectColorBlueTransforms()
116 const __m128i G1 = _mm_sub_epi8(E1, F1); in CollectColorBlueTransforms()
117 const __m128i H0 = _mm_and_si128(G0, mask_b); // 0 0 | 0 b in CollectColorBlueTransforms()
118 const __m128i H1 = _mm_and_si128(G1, mask_b); in CollectColorBlueTransforms()
119 const __m128i I = _mm_packs_epi32(H0, H1); // 0 b' | 0 b' in CollectColorBlueTransforms()
120 _mm_storeu_si128((__m128i*)values, I); in CollectColorBlueTransforms()
137 const __m128i mults_g = _mm_set_epi16( in CollectColorRedTransforms()
140 const __m128i mask_g = _mm_set1_epi32(0x00ff00); // green mask in CollectColorRedTransforms()
141 const __m128i mask = _mm_set1_epi32(0xff); in CollectColorRedTransforms()
149 const __m128i in0 = _mm_loadu_si128((__m128i*)&src[x + 0]); in CollectColorRedTransforms()
150 const __m128i in1 = _mm_loadu_si128((__m128i*)&src[x + SPAN / 2]); in CollectColorRedTransforms()
151 const __m128i A0 = _mm_and_si128(in0, mask_g); // 0 0 | g 0 in CollectColorRedTransforms()
152 const __m128i A1 = _mm_and_si128(in1, mask_g); in CollectColorRedTransforms()
153 const __m128i B0 = _mm_srli_epi32(in0, 16); // 0 0 | x r in CollectColorRedTransforms()
154 const __m128i B1 = _mm_srli_epi32(in1, 16); in CollectColorRedTransforms()
155 const __m128i C0 = _mm_mulhi_epi16(A0, mults_g); // 0 0 | x dr in CollectColorRedTransforms()
156 const __m128i C1 = _mm_mulhi_epi16(A1, mults_g); in CollectColorRedTransforms()
157 const __m128i E0 = _mm_sub_epi8(B0, C0); // x x | x r' in CollectColorRedTransforms()
158 const __m128i E1 = _mm_sub_epi8(B1, C1); in CollectColorRedTransforms()
159 const __m128i F0 = _mm_and_si128(E0, mask); // 0 0 | 0 r' in CollectColorRedTransforms()
160 const __m128i F1 = _mm_and_si128(E1, mask); in CollectColorRedTransforms()
161 const __m128i I = _mm_packs_epi32(F0, F1); in CollectColorRedTransforms()
162 _mm_storeu_si128((__m128i*)values, I); in CollectColorRedTransforms()
185 const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i + 0]); in AddVector()
186 const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i + 4]); in AddVector()
188 const __m128i a2 = _mm_loadu_si128((const __m128i*)&a[i + 8]); in AddVector()
189 const __m128i a3 = _mm_loadu_si128((const __m128i*)&a[i + 12]); in AddVector()
191 const __m128i b0 = _mm_loadu_si128((const __m128i*)&b[i + 0]); in AddVector()
192 const __m128i b1 = _mm_loadu_si128((const __m128i*)&b[i + 4]); in AddVector()
194 const __m128i b2 = _mm_loadu_si128((const __m128i*)&b[i + 8]); in AddVector()
195 const __m128i b3 = _mm_loadu_si128((const __m128i*)&b[i + 12]); in AddVector()
197 _mm_storeu_si128((__m128i*)&out[i + 0], _mm_add_epi32(a0, b0)); in AddVector()
198 _mm_storeu_si128((__m128i*)&out[i + 4], _mm_add_epi32(a1, b1)); in AddVector()
200 _mm_storeu_si128((__m128i*)&out[i + 8], _mm_add_epi32(a2, b2)); in AddVector()
201 _mm_storeu_si128((__m128i*)&out[i + 12], _mm_add_epi32(a3, b3)); in AddVector()
210 const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i + 0]); in AddVectorEq()
211 const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i + 4]); in AddVectorEq()
213 const __m128i a2 = _mm_loadu_si128((const __m128i*)&a[i + 8]); in AddVectorEq()
214 const __m128i a3 = _mm_loadu_si128((const __m128i*)&a[i + 12]); in AddVectorEq()
216 const __m128i b0 = _mm_loadu_si128((const __m128i*)&out[i + 0]); in AddVectorEq()
217 const __m128i b1 = _mm_loadu_si128((const __m128i*)&out[i + 4]); in AddVectorEq()
219 const __m128i b2 = _mm_loadu_si128((const __m128i*)&out[i + 8]); in AddVectorEq()
220 const __m128i b3 = _mm_loadu_si128((const __m128i*)&out[i + 12]); in AddVectorEq()
222 _mm_storeu_si128((__m128i*)&out[i + 0], _mm_add_epi32(a0, b0)); in AddVectorEq()
223 _mm_storeu_si128((__m128i*)&out[i + 4], _mm_add_epi32(a1, b1)); in AddVectorEq()
225 _mm_storeu_si128((__m128i*)&out[i + 8], _mm_add_epi32(a2, b2)); in AddVectorEq()
226 _mm_storeu_si128((__m128i*)&out[i + 12], _mm_add_epi32(a3, b3)); in AddVectorEq()
284 __m128i zero = _mm_setzero_si128(); in CombinedShannonEntropy()
286 __m128i sumXY_128 = zero; in CombinedShannonEntropy()
287 __m128i sumX_128 = zero; in CombinedShannonEntropy()
290 const __m128i x = _mm_loadu_si128((const __m128i*)(X + i)); in CombinedShannonEntropy()
291 const __m128i y = _mm_loadu_si128((const __m128i*)(Y + i)); in CombinedShannonEntropy()
296 const __m128i xy_128 = _mm_add_epi32(x, y); in CombinedShannonEntropy()
302 _mm_storeu_si128((__m128i*)tmp, xy_128); in CombinedShannonEntropy()
320 _mm_storeu_si128((__m128i*)tmp, sumX_128); in CombinedShannonEntropy()
324 _mm_storeu_si128((__m128i*)tmp, sumXY_128); in CombinedShannonEntropy()
340 __m128i A0 = _mm_loadu_si128((const __m128i*)&array1[0]); in VectorMismatch()
341 __m128i A1 = _mm_loadu_si128((const __m128i*)&array2[0]); in VectorMismatch()
346 const __m128i cmpA = _mm_cmpeq_epi32(A0, A1); in VectorMismatch()
347 const __m128i B0 = in VectorMismatch()
348 _mm_loadu_si128((const __m128i*)&array1[match_len + 4]); in VectorMismatch()
349 const __m128i B1 = in VectorMismatch()
350 _mm_loadu_si128((const __m128i*)&array2[match_len + 4]); in VectorMismatch()
355 const __m128i cmpB = _mm_cmpeq_epi32(B0, B1); in VectorMismatch()
356 A0 = _mm_loadu_si128((const __m128i*)&array1[match_len + 4]); in VectorMismatch()
357 A1 = _mm_loadu_si128((const __m128i*)&array2[match_len + 4]); in VectorMismatch()
367 _mm_loadu_si128((const __m128i*)&array1[0]), in VectorMismatch()
368 _mm_loadu_si128((const __m128i*)&array2[0]))) == 0xffff) { in VectorMismatch()
372 _mm_loadu_si128((const __m128i*)&array1[4]), in VectorMismatch()
373 _mm_loadu_si128((const __m128i*)&array2[4]))) == 0xffff) { in VectorMismatch()
393 const __m128i ff = _mm_set1_epi16(0xff00); in BundleColorMap_SSE2()
394 const __m128i zero = _mm_setzero_si128(); in BundleColorMap_SSE2()
397 const __m128i in = _mm_loadu_si128((const __m128i*)&row[x]); in BundleColorMap_SSE2()
398 const __m128i in_lo = _mm_unpacklo_epi8(zero, in); in BundleColorMap_SSE2()
399 const __m128i dst0 = _mm_unpacklo_epi16(in_lo, ff); in BundleColorMap_SSE2()
400 const __m128i dst1 = _mm_unpackhi_epi16(in_lo, ff); in BundleColorMap_SSE2()
401 const __m128i in_hi = _mm_unpackhi_epi8(zero, in); in BundleColorMap_SSE2()
402 const __m128i dst2 = _mm_unpacklo_epi16(in_hi, ff); in BundleColorMap_SSE2()
403 const __m128i dst3 = _mm_unpackhi_epi16(in_hi, ff); in BundleColorMap_SSE2()
404 _mm_storeu_si128((__m128i*)&dst[0], dst0); in BundleColorMap_SSE2()
405 _mm_storeu_si128((__m128i*)&dst[4], dst1); in BundleColorMap_SSE2()
406 _mm_storeu_si128((__m128i*)&dst[8], dst2); in BundleColorMap_SSE2()
407 _mm_storeu_si128((__m128i*)&dst[12], dst3); in BundleColorMap_SSE2()
412 const __m128i ff = _mm_set1_epi16(0xff00); in BundleColorMap_SSE2()
413 const __m128i mul = _mm_set1_epi16(0x110); in BundleColorMap_SSE2()
416 const __m128i in = _mm_loadu_si128((const __m128i*)&row[x]); in BundleColorMap_SSE2()
417 const __m128i tmp = _mm_mullo_epi16(in, mul); // aba0 in BundleColorMap_SSE2()
418 const __m128i pack = _mm_and_si128(tmp, ff); // ab00 in BundleColorMap_SSE2()
419 const __m128i dst0 = _mm_unpacklo_epi16(pack, ff); in BundleColorMap_SSE2()
420 const __m128i dst1 = _mm_unpackhi_epi16(pack, ff); in BundleColorMap_SSE2()
421 _mm_storeu_si128((__m128i*)&dst[0], dst0); in BundleColorMap_SSE2()
422 _mm_storeu_si128((__m128i*)&dst[4], dst1); in BundleColorMap_SSE2()
427 const __m128i mask_or = _mm_set1_epi32(0xff000000); in BundleColorMap_SSE2()
428 const __m128i mul_cst = _mm_set1_epi16(0x0104); in BundleColorMap_SSE2()
429 const __m128i mask_mul = _mm_set1_epi16(0x0f00); in BundleColorMap_SSE2()
432 const __m128i in = _mm_loadu_si128((const __m128i*)&row[x]); in BundleColorMap_SSE2()
433 const __m128i mul = _mm_mullo_epi16(in, mul_cst); // 00ab00b000cd00d0 in BundleColorMap_SSE2()
434 const __m128i tmp = _mm_and_si128(mul, mask_mul); // 00ab000000cd0000 in BundleColorMap_SSE2()
435 const __m128i shift = _mm_srli_epi32(tmp, 12); // 00000000ab000000 in BundleColorMap_SSE2()
436 const __m128i pack = _mm_or_si128(shift, tmp); // 00000000abcd0000 in BundleColorMap_SSE2()
438 const __m128i res = _mm_or_si128(pack, mask_or); in BundleColorMap_SSE2()
439 _mm_storeu_si128((__m128i*)dst, res); in BundleColorMap_SSE2()
447 const __m128i in = _mm_loadu_si128((const __m128i*)&row[x]); in BundleColorMap_SSE2()
448 const __m128i shift = _mm_slli_epi64(in, 7); in BundleColorMap_SSE2()
464 static WEBP_INLINE void Average2_m128i(const __m128i* const a0, in Average2_m128i()
465 const __m128i* const a1, in Average2_m128i()
466 __m128i* const avg) { in Average2_m128i()
468 const __m128i ones = _mm_set1_epi8(1); in Average2_m128i()
469 const __m128i avg1 = _mm_avg_epu8(*a0, *a1); in Average2_m128i()
470 const __m128i one = _mm_and_si128(_mm_xor_si128(*a0, *a1), ones); in Average2_m128i()
478 const __m128i black = _mm_set1_epi32(ARGB_BLACK); in PredictorSub0_SSE2()
480 const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); in PredictorSub0_SSE2()
481 const __m128i res = _mm_sub_epi8(src, black); in PredictorSub0_SSE2()
482 _mm_storeu_si128((__m128i*)&out[i], res); in PredictorSub0_SSE2()
494 const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); \
495 const __m128i pred = _mm_loadu_si128((const __m128i*)&(IN)); \
496 const __m128i res = _mm_sub_epi8(src, pred); \
497 _mm_storeu_si128((__m128i*)&out[i], res); \
515 const __m128i L = _mm_loadu_si128((const __m128i*)&in[i - 1]); in PredictorSub5_SSE2()
516 const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]); in PredictorSub5_SSE2()
517 const __m128i TR = _mm_loadu_si128((const __m128i*)&upper[i + 1]); in PredictorSub5_SSE2()
518 const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); in PredictorSub5_SSE2()
519 __m128i avg, pred, res; in PredictorSub5_SSE2()
523 _mm_storeu_si128((__m128i*)&out[i], res); in PredictorSub5_SSE2()
535 const __m128i tA = _mm_loadu_si128((const __m128i*)&(A)); \
536 const __m128i tB = _mm_loadu_si128((const __m128i*)&(B)); \
537 const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); \
538 __m128i pred, res; \
541 _mm_storeu_si128((__m128i*)&out[i], res); \
559 const __m128i L = _mm_loadu_si128((const __m128i*)&in[i - 1]); in PredictorSub10_SSE2()
560 const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); in PredictorSub10_SSE2()
561 const __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]); in PredictorSub10_SSE2()
562 const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]); in PredictorSub10_SSE2()
563 const __m128i TR = _mm_loadu_si128((const __m128i*)&upper[i + 1]); in PredictorSub10_SSE2()
564 __m128i avgTTR, avgLTL, avg, res; in PredictorSub10_SSE2()
569 _mm_storeu_si128((__m128i*)&out[i], res); in PredictorSub10_SSE2()
577 static void GetSumAbsDiff32(const __m128i* const A, const __m128i* const B, in GetSumAbsDiff32()
578 __m128i* const out) { in GetSumAbsDiff32()
581 const __m128i A_lo = _mm_unpacklo_epi32(*A, *A); in GetSumAbsDiff32()
582 const __m128i B_lo = _mm_unpacklo_epi32(*B, *A); in GetSumAbsDiff32()
583 const __m128i A_hi = _mm_unpackhi_epi32(*A, *A); in GetSumAbsDiff32()
584 const __m128i B_hi = _mm_unpackhi_epi32(*B, *A); in GetSumAbsDiff32()
585 const __m128i s_lo = _mm_sad_epu8(A_lo, B_lo); in GetSumAbsDiff32()
586 const __m128i s_hi = _mm_sad_epu8(A_hi, B_hi); in GetSumAbsDiff32()
594 const __m128i L = _mm_loadu_si128((const __m128i*)&in[i - 1]); in PredictorSub11_SSE2()
595 const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]); in PredictorSub11_SSE2()
596 const __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]); in PredictorSub11_SSE2()
597 const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); in PredictorSub11_SSE2()
598 __m128i pa, pb; in PredictorSub11_SSE2()
602 const __m128i mask = _mm_cmpgt_epi32(pb, pa); in PredictorSub11_SSE2()
603 const __m128i A = _mm_and_si128(mask, L); in PredictorSub11_SSE2()
604 const __m128i B = _mm_andnot_si128(mask, T); in PredictorSub11_SSE2()
605 const __m128i pred = _mm_or_si128(A, B); // pred = (L > T)? L : T in PredictorSub11_SSE2()
606 const __m128i res = _mm_sub_epi8(src, pred); in PredictorSub11_SSE2()
607 _mm_storeu_si128((__m128i*)&out[i], res); in PredictorSub11_SSE2()
619 const __m128i zero = _mm_setzero_si128(); in PredictorSub12_SSE2()
621 const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); in PredictorSub12_SSE2()
622 const __m128i L = _mm_loadu_si128((const __m128i*)&in[i - 1]); in PredictorSub12_SSE2()
623 const __m128i L_lo = _mm_unpacklo_epi8(L, zero); in PredictorSub12_SSE2()
624 const __m128i L_hi = _mm_unpackhi_epi8(L, zero); in PredictorSub12_SSE2()
625 const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]); in PredictorSub12_SSE2()
626 const __m128i T_lo = _mm_unpacklo_epi8(T, zero); in PredictorSub12_SSE2()
627 const __m128i T_hi = _mm_unpackhi_epi8(T, zero); in PredictorSub12_SSE2()
628 const __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]); in PredictorSub12_SSE2()
629 const __m128i TL_lo = _mm_unpacklo_epi8(TL, zero); in PredictorSub12_SSE2()
630 const __m128i TL_hi = _mm_unpackhi_epi8(TL, zero); in PredictorSub12_SSE2()
631 const __m128i diff_lo = _mm_sub_epi16(T_lo, TL_lo); in PredictorSub12_SSE2()
632 const __m128i diff_hi = _mm_sub_epi16(T_hi, TL_hi); in PredictorSub12_SSE2()
633 const __m128i pred_lo = _mm_add_epi16(L_lo, diff_lo); in PredictorSub12_SSE2()
634 const __m128i pred_hi = _mm_add_epi16(L_hi, diff_hi); in PredictorSub12_SSE2()
635 const __m128i pred = _mm_packus_epi16(pred_lo, pred_hi); in PredictorSub12_SSE2()
636 const __m128i res = _mm_sub_epi8(src, pred); in PredictorSub12_SSE2()
637 _mm_storeu_si128((__m128i*)&out[i], res); in PredictorSub12_SSE2()
648 const __m128i zero = _mm_setzero_si128(); in PredictorSub13_SSE2()
651 const __m128i L = _mm_loadl_epi64((const __m128i*)&in[i - 1]); in PredictorSub13_SSE2()
652 const __m128i src = _mm_loadl_epi64((const __m128i*)&in[i]); in PredictorSub13_SSE2()
653 const __m128i T = _mm_loadl_epi64((const __m128i*)&upper[i]); in PredictorSub13_SSE2()
654 const __m128i TL = _mm_loadl_epi64((const __m128i*)&upper[i - 1]); in PredictorSub13_SSE2()
655 const __m128i L_lo = _mm_unpacklo_epi8(L, zero); in PredictorSub13_SSE2()
656 const __m128i T_lo = _mm_unpacklo_epi8(T, zero); in PredictorSub13_SSE2()
657 const __m128i TL_lo = _mm_unpacklo_epi8(TL, zero); in PredictorSub13_SSE2()
658 const __m128i sum = _mm_add_epi16(T_lo, L_lo); in PredictorSub13_SSE2()
659 const __m128i avg = _mm_srli_epi16(sum, 1); in PredictorSub13_SSE2()
660 const __m128i A1 = _mm_sub_epi16(avg, TL_lo); in PredictorSub13_SSE2()
661 const __m128i bit_fix = _mm_cmpgt_epi16(TL_lo, avg); in PredictorSub13_SSE2()
662 const __m128i A2 = _mm_sub_epi16(A1, bit_fix); in PredictorSub13_SSE2()
663 const __m128i A3 = _mm_srai_epi16(A2, 1); in PredictorSub13_SSE2()
664 const __m128i A4 = _mm_add_epi16(avg, A3); in PredictorSub13_SSE2()
665 const __m128i pred = _mm_packus_epi16(A4, A4); in PredictorSub13_SSE2()
666 const __m128i res = _mm_sub_epi8(src, pred); in PredictorSub13_SSE2()
667 _mm_storel_epi64((__m128i*)&out[i], res); in PredictorSub13_SSE2()