Lines Matching refs: __m128i
48 const __m128i k1 = _mm_set1_epi16(20091); in Transform()
49 const __m128i k2 = _mm_set1_epi16(-30068); in Transform()
50 __m128i T0, T1, T2, T3; in Transform()
55 __m128i in0, in1, in2, in3; in Transform()
57 in0 = _mm_loadl_epi64((const __m128i*)&in[0]); in Transform()
58 in1 = _mm_loadl_epi64((const __m128i*)&in[4]); in Transform()
59 in2 = _mm_loadl_epi64((const __m128i*)&in[8]); in Transform()
60 in3 = _mm_loadl_epi64((const __m128i*)&in[12]); in Transform()
66 const __m128i inB0 = _mm_loadl_epi64((const __m128i*)&in[16]); in Transform()
67 const __m128i inB1 = _mm_loadl_epi64((const __m128i*)&in[20]); in Transform()
68 const __m128i inB2 = _mm_loadl_epi64((const __m128i*)&in[24]); in Transform()
69 const __m128i inB3 = _mm_loadl_epi64((const __m128i*)&in[28]); in Transform()
85 const __m128i a = _mm_add_epi16(in0, in2); in Transform()
86 const __m128i b = _mm_sub_epi16(in0, in2); in Transform()
88 const __m128i c1 = _mm_mulhi_epi16(in1, k2); in Transform()
89 const __m128i c2 = _mm_mulhi_epi16(in3, k1); in Transform()
90 const __m128i c3 = _mm_sub_epi16(in1, in3); in Transform()
91 const __m128i c4 = _mm_sub_epi16(c1, c2); in Transform()
92 const __m128i c = _mm_add_epi16(c3, c4); in Transform()
94 const __m128i d1 = _mm_mulhi_epi16(in1, k1); in Transform()
95 const __m128i d2 = _mm_mulhi_epi16(in3, k2); in Transform()
96 const __m128i d3 = _mm_add_epi16(in1, in3); in Transform()
97 const __m128i d4 = _mm_add_epi16(d1, d2); in Transform()
98 const __m128i d = _mm_add_epi16(d3, d4); in Transform()
101 const __m128i tmp0 = _mm_add_epi16(a, d); in Transform()
102 const __m128i tmp1 = _mm_add_epi16(b, c); in Transform()
103 const __m128i tmp2 = _mm_sub_epi16(b, c); in Transform()
104 const __m128i tmp3 = _mm_sub_epi16(a, d); in Transform()
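
Note on the multiplies above: _mm_mulhi_epi16 keeps only the high 16 bits of each 32-bit product, i.e. (x * k) >> 16. With k1 = 20091 this equals the VP8 spec's MUL1(x) = ((x * 20091) >> 16) + x minus its trailing "+ x", and with k2 = -30068 (that is, 35468 - 65536) it equals MUL2(x) = (x * 35468) >> 16 minus x; both missing terms are restored by the separate c3/d3 sums. A scalar sketch of the c = c3 + c4 identity under those spec constants (the MUL1/MUL2 names are ours, not the library's):

    #include <assert.h>
    #include <stdint.h>

    static int16_t mulhi(int16_t x, int16_t k) {  /* one _mm_mulhi_epi16 lane */
      return (int16_t)(((int32_t)x * k) >> 16);
    }

    static void check_c(int16_t in1, int16_t in3) {
      const int16_t k1 = 20091, k2 = -30068;               /* 35468 - 65536 */
      const int16_t ref = (int16_t)(((in1 * 35468) >> 16)         /* MUL2(in1) */
                                    - (((in3 * 20091) >> 16) + in3));  /* MUL1(in3) */
      const int16_t c3 = (int16_t)(in1 - in3);
      const int16_t c4 = (int16_t)(mulhi(in1, k2) - mulhi(in3, k1));
      assert((int16_t)(c3 + c4) == ref);  /* holds for all 16-bit inputs */
    }
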
114 const __m128i four = _mm_set1_epi16(4); in Transform()
115 const __m128i dc = _mm_add_epi16(T0, four); in Transform()
116 const __m128i a = _mm_add_epi16(dc, T2); in Transform()
117 const __m128i b = _mm_sub_epi16(dc, T2); in Transform()
119 const __m128i c1 = _mm_mulhi_epi16(T1, k2); in Transform()
120 const __m128i c2 = _mm_mulhi_epi16(T3, k1); in Transform()
121 const __m128i c3 = _mm_sub_epi16(T1, T3); in Transform()
122 const __m128i c4 = _mm_sub_epi16(c1, c2); in Transform()
123 const __m128i c = _mm_add_epi16(c3, c4); in Transform()
125 const __m128i d1 = _mm_mulhi_epi16(T1, k1); in Transform()
126 const __m128i d2 = _mm_mulhi_epi16(T3, k2); in Transform()
127 const __m128i d3 = _mm_add_epi16(T1, T3); in Transform()
128 const __m128i d4 = _mm_add_epi16(d1, d2); in Transform()
129 const __m128i d = _mm_add_epi16(d3, d4); in Transform()
132 const __m128i tmp0 = _mm_add_epi16(a, d); in Transform()
133 const __m128i tmp1 = _mm_add_epi16(b, c); in Transform()
134 const __m128i tmp2 = _mm_sub_epi16(b, c); in Transform()
135 const __m128i tmp3 = _mm_sub_epi16(a, d); in Transform()
136 const __m128i shifted0 = _mm_srai_epi16(tmp0, 3); in Transform()
137 const __m128i shifted1 = _mm_srai_epi16(tmp1, 3); in Transform()
138 const __m128i shifted2 = _mm_srai_epi16(tmp2, 3); in Transform()
139 const __m128i shifted3 = _mm_srai_epi16(tmp3, 3); in Transform()
148 const __m128i zero = _mm_setzero_si128(); in Transform()
150 __m128i dst0, dst1, dst2, dst3; in Transform()
153 dst0 = _mm_loadl_epi64((__m128i*)(dst + 0 * BPS)); in Transform()
154 dst1 = _mm_loadl_epi64((__m128i*)(dst + 1 * BPS)); in Transform()
155 dst2 = _mm_loadl_epi64((__m128i*)(dst + 2 * BPS)); in Transform()
156 dst3 = _mm_loadl_epi64((__m128i*)(dst + 3 * BPS)); in Transform()
182 _mm_storel_epi64((__m128i*)(dst + 0 * BPS), dst0); in Transform()
183 _mm_storel_epi64((__m128i*)(dst + 1 * BPS), dst1); in Transform()
184 _mm_storel_epi64((__m128i*)(dst + 2 * BPS), dst2); in Transform()
185 _mm_storel_epi64((__m128i*)(dst + 3 * BPS), dst3); in Transform()
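
Between the loads at lines 153-156 and the stores at 182-185, the elided lines presumably widen each packed row to 16 bits, add the shifted residual, and re-pack with unsigned saturation. A minimal sketch of that add-and-clamp idiom (names are ours):

    #include <emmintrin.h>

    /* dst8: 8 pixels in the low half; res16: eight 16-bit residuals. */
    static __m128i AddResidualClamp(__m128i dst8, __m128i res16) {
      const __m128i zero = _mm_setzero_si128();
      const __m128i dst16 = _mm_unpacklo_epi8(dst8, zero);  /* u8 -> 16-bit */
      const __m128i sum = _mm_add_epi16(dst16, res16);      /* may leave [0,255] */
      return _mm_packus_epi16(sum, sum);  /* unsigned saturation clamps back */
    }
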
201 const __m128i A = _mm_set1_epi16(in[0] + 4); in TransformAC3()
202 const __m128i c4 = _mm_set1_epi16(MUL(in[4], kC2)); in TransformAC3()
203 const __m128i d4 = _mm_set1_epi16(MUL(in[4], kC1)); in TransformAC3()
206 const __m128i CD = _mm_set_epi16(0, 0, 0, 0, -d1, -c1, c1, d1); in TransformAC3()
207 const __m128i B = _mm_adds_epi16(A, CD); in TransformAC3()
208 const __m128i m0 = _mm_adds_epi16(B, d4); in TransformAC3()
209 const __m128i m1 = _mm_adds_epi16(B, c4); in TransformAC3()
210 const __m128i m2 = _mm_subs_epi16(B, c4); in TransformAC3()
211 const __m128i m3 = _mm_subs_epi16(B, d4); in TransformAC3()
212 const __m128i zero = _mm_setzero_si128(); in TransformAC3()
214 __m128i dst0 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 0 * BPS)); in TransformAC3()
215 __m128i dst1 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 1 * BPS)); in TransformAC3()
216 __m128i dst2 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 2 * BPS)); in TransformAC3()
217 __m128i dst3 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 3 * BPS)); in TransformAC3()
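
TransformAC3 handles the case where only the DC plus the first horizontal and vertical AC coefficients are non-zero, so the 4x4 result separates into a per-row term (from in[4]) plus a per-column term (from in[1]); A pre-adds the +4 rounding bias for the final >> 3, and note that _mm_set_epi16 lists lanes high to low, so lane 0 of CD holds d1. A scalar model of one output pixel, assuming the scalar decoder's MUL(a, b) == ((a) * (b)) >> 16 with the spec constants kC1 = 20091 + (1 << 16) and kC2 = 35468 (an assumption, not visible in these lines):

    static int mul(int a, int b) { return (a * b) >> 16; }

    static unsigned char ac3_pixel(const short in[16], int x, int y,
                                   unsigned char pred) {
      const int kC1 = 20091 + (1 << 16), kC2 = 35468;
      const int row[4] = { mul(in[4], kC1),  mul(in[4], kC2),
                          -mul(in[4], kC2), -mul(in[4], kC1) };  /* d4 c4 -c4 -d4 */
      const int col[4] = { mul(in[1], kC1),  mul(in[1], kC2),
                          -mul(in[1], kC2), -mul(in[1], kC1) };  /* d1 c1 -c1 -d1 */
      const int v = pred + ((in[0] + 4 + row[y] + col[x]) >> 3); /* +4: rounding */
      return (unsigned char)(v < 0 ? 0 : v > 255 ? 255 : v);
    }
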
251 static WEBP_INLINE void SignedShift8b(__m128i* const x) { in SignedShift8b()
252 const __m128i zero = _mm_setzero_si128(); in SignedShift8b()
253 const __m128i lo_0 = _mm_unpacklo_epi8(zero, *x); in SignedShift8b()
254 const __m128i hi_0 = _mm_unpackhi_epi8(zero, *x); in SignedShift8b()
255 const __m128i lo_1 = _mm_srai_epi16(lo_0, 3 + 8); in SignedShift8b()
256 const __m128i hi_1 = _mm_srai_epi16(hi_0, 3 + 8); in SignedShift8b()
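
SSE2 has no arithmetic shift for 8-bit lanes. The helper therefore places each byte in the high half of a 16-bit lane (note zero is the first unpack operand), shifts by 3 + 8, and in the elided lines presumably re-packs with _mm_packs_epi16. A self-contained version of the idea:

    #include <emmintrin.h>

    /* Arithmetic >> 3 on sixteen signed bytes (sketch; pack step assumed). */
    static __m128i SignedShift8b_sketch(__m128i x) {
      const __m128i zero = _mm_setzero_si128();
      const __m128i lo = _mm_srai_epi16(_mm_unpacklo_epi8(zero, x), 3 + 8);
      const __m128i hi = _mm_srai_epi16(_mm_unpackhi_epi8(zero, x), 3 + 8);
      return _mm_packs_epi16(lo, hi);  /* results fit [-16,15]: no saturation */
    }
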
271 static WEBP_INLINE void GetNotHEV(const __m128i* const p1, in GetNotHEV()
272 const __m128i* const p0, in GetNotHEV()
273 const __m128i* const q0, in GetNotHEV()
274 const __m128i* const q1, in GetNotHEV()
275 int hev_thresh, __m128i* const not_hev) { in GetNotHEV()
276 const __m128i zero = _mm_setzero_si128(); in GetNotHEV()
277 const __m128i t_1 = MM_ABS(*p1, *p0); in GetNotHEV()
278 const __m128i t_2 = MM_ABS(*q1, *q0); in GetNotHEV()
280 const __m128i h = _mm_set1_epi8(hev_thresh); in GetNotHEV()
281 const __m128i t_max = _mm_max_epu8(t_1, t_2); in GetNotHEV()
283 const __m128i t_max_h = _mm_subs_epu8(t_max, h); in GetNotHEV()
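
GetNotHEV tests max(|p1 - p0|, |q1 - q0|) <= hev_thresh. SSE2 has no unsigned byte compare, so the elided line presumably finishes with _mm_cmpeq_epi8 against zero: _mm_subs_epu8(x, t) saturates at zero and is therefore zero exactly when x <= t. MM_ABS is presumably the usual pair of saturating subtractions. A sketch of the idiom:

    #include <emmintrin.h>

    #define MM_ABS(a, b) _mm_or_si128(_mm_subs_epu8((a), (b)), \
                                      _mm_subs_epu8((b), (a)))

    /* 0xFF where x <= t (per unsigned byte lane), 0x00 elsewhere. */
    static __m128i LessEqU8(__m128i x, __m128i t) {
      return _mm_cmpeq_epi8(_mm_subs_epu8(x, t), _mm_setzero_si128());
    }
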
288 static WEBP_INLINE void GetBaseDelta(const __m128i* const p1, in GetBaseDelta()
289 const __m128i* const p0, in GetBaseDelta()
290 const __m128i* const q0, in GetBaseDelta()
291 const __m128i* const q1, in GetBaseDelta()
292 __m128i* const delta) { in GetBaseDelta()
294 const __m128i p1_q1 = _mm_subs_epi8(*p1, *q1); // p1 - q1 in GetBaseDelta()
295 const __m128i q0_p0 = _mm_subs_epi8(*q0, *p0); // q0 - p0 in GetBaseDelta()
296 const __m128i s1 = _mm_adds_epi8(p1_q1, q0_p0); // p1 - q1 + 1 * (q0 - p0) in GetBaseDelta()
297 const __m128i s2 = _mm_adds_epi8(q0_p0, s1); // p1 - q1 + 2 * (q0 - p0) in GetBaseDelta()
298 const __m128i s3 = _mm_adds_epi8(q0_p0, s2); // p1 - q1 + 3 * (q0 - p0) in GetBaseDelta()
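
The base filter delta is clamp(p1 - q1 + 3 * (q0 - p0)) with a clamp to [-128, 127] after every step, which is exactly what the chain of saturating byte ops above realizes. A scalar model of one lane:

    static int clamp8(int v) { return v < -128 ? -128 : v > 127 ? 127 : v; }

    static int base_delta(int p1, int p0, int q0, int q1) {
      const int p1_q1 = clamp8(p1 - q1);       /* _mm_subs_epi8 */
      const int q0_p0 = clamp8(q0 - p0);
      const int s1 = clamp8(p1_q1 + q0_p0);    /* + 1 * (q0 - p0) */
      const int s2 = clamp8(q0_p0 + s1);       /* + 2 * (q0 - p0) */
      return clamp8(q0_p0 + s2);               /* + 3 * (q0 - p0) */
    }
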
303 static WEBP_INLINE void DoSimpleFilter(__m128i* const p0, __m128i* const q0, in DoSimpleFilter()
304 const __m128i* const fl) { in DoSimpleFilter()
305 const __m128i k3 = _mm_set1_epi8(3); in DoSimpleFilter()
306 const __m128i k4 = _mm_set1_epi8(4); in DoSimpleFilter()
307 __m128i v3 = _mm_adds_epi8(*fl, k3); in DoSimpleFilter()
308 __m128i v4 = _mm_adds_epi8(*fl, k4); in DoSimpleFilter()
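
The simple filter derives two taps from the delta fl: clamp(fl + 4) >> 3 is subtracted from q0 and clamp(fl + 3) >> 3 is added to p0 (the elided lines presumably shift v3/v4 with SignedShift8b and apply saturating adds). A scalar model, reusing clamp8 from the sketch above:

    static void simple_filter(int* p0, int* q0, int f) {
      const int a1 = clamp8(f + 4) >> 3;   /* v4 after SignedShift8b */
      const int a2 = clamp8(f + 3) >> 3;   /* v3 after SignedShift8b */
      *q0 = clamp8(*q0 - a1);
      *p0 = clamp8(*p0 + a2);
    }
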
320 static WEBP_INLINE void Update2Pixels(__m128i* const pi, __m128i* const qi, in Update2Pixels()
321 const __m128i* const a0_lo, in Update2Pixels()
322 const __m128i* const a0_hi) { in Update2Pixels()
323 const __m128i a1_lo = _mm_srai_epi16(*a0_lo, 7); in Update2Pixels()
324 const __m128i a1_hi = _mm_srai_epi16(*a0_hi, 7); in Update2Pixels()
325 const __m128i delta = _mm_packs_epi16(a1_lo, a1_hi); in Update2Pixels()
326 const __m128i sign_bit = _mm_set1_epi8(0x80); in Update2Pixels()
333 static WEBP_INLINE void NeedsFilter(const __m128i* const p1, in NeedsFilter()
334 const __m128i* const p0, in NeedsFilter()
335 const __m128i* const q0, in NeedsFilter()
336 const __m128i* const q1, in NeedsFilter()
337 int thresh, __m128i* const mask) { in NeedsFilter()
338 const __m128i m_thresh = _mm_set1_epi8(thresh); in NeedsFilter()
339 const __m128i t1 = MM_ABS(*p1, *q1); // abs(p1 - q1) in NeedsFilter()
340 const __m128i kFE = _mm_set1_epi8(0xFE); in NeedsFilter()
341 const __m128i t2 = _mm_and_si128(t1, kFE); // set lsb of each byte to zero in NeedsFilter()
342 const __m128i t3 = _mm_srli_epi16(t2, 1); // abs(p1 - q1) / 2 in NeedsFilter()
344 const __m128i t4 = MM_ABS(*p0, *q0); // abs(p0 - q0) in NeedsFilter()
345 const __m128i t5 = _mm_adds_epu8(t4, t4); // abs(p0 - q0) * 2 in NeedsFilter()
346 const __m128i t6 = _mm_adds_epu8(t5, t3); // abs(p0-q0)*2 + abs(p1-q1)/2 in NeedsFilter()
348 const __m128i t7 = _mm_subs_epu8(t6, m_thresh); // mask <= m_thresh in NeedsFilter()
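
NeedsFilter evaluates 2 * |p0 - q0| + |p1 - q1| / 2 <= thresh. Byte lanes have no shift, so the halving is done with a 16-bit shift after clearing each byte's lsb (kFE), which keeps a neighbor's bit from being shifted in. The final elided line presumably turns t7 into a mask with the same subs/cmpeq-zero idiom as GetNotHEV. A per-column scalar model:

    #include <stdlib.h>

    static int sat255(int v) { return v > 255 ? 255 : v; }

    /* Non-zero when the column should be filtered. Saturation at 255 can
     * only raise the left-hand side, so it never admits a column the exact
     * test would reject. */
    static int needs_filter(int p1, int p0, int q0, int q1, int thresh) {
      return sat255(sat255(2 * abs(p0 - q0)) + abs(p1 - q1) / 2) <= thresh;
    }
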
356 static WEBP_INLINE void DoFilter2(__m128i* const p1, __m128i* const p0, in DoFilter2()
357 __m128i* const q0, __m128i* const q1, in DoFilter2()
359 __m128i a, mask; in DoFilter2()
360 const __m128i sign_bit = _mm_set1_epi8(0x80); in DoFilter2()
362 const __m128i p1s = _mm_xor_si128(*p1, sign_bit); in DoFilter2()
363 const __m128i q1s = _mm_xor_si128(*q1, sign_bit); in DoFilter2()
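
DoFilter2 runs the filter in the signed domain: pixels are unsigned bytes, but the delta arithmetic needs signed saturation, and x ^ 0x80 reinterprets an unsigned pixel x as the signed value x - 128, since flipping bit 7 is the same as adding 128 mod 256. The same xor converts back afterwards. An exhaustive one-liner check:

    #include <assert.h>

    static void check_sign_flip(void) {
      int x;
      for (x = 0; x < 256; ++x) {
        assert(((x ^ 0x80) & 0xFF) == ((x + 128) & 0xFF));
      }
    }
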
375 static WEBP_INLINE void DoFilter4(__m128i* const p1, __m128i* const p0, in DoFilter4()
376 __m128i* const q0, __m128i* const q1, in DoFilter4()
377 const __m128i* const mask, int hev_thresh) { in DoFilter4()
378 const __m128i zero = _mm_setzero_si128(); in DoFilter4()
379 const __m128i sign_bit = _mm_set1_epi8(0x80); in DoFilter4()
380 const __m128i k64 = _mm_set1_epi8(64); in DoFilter4()
381 const __m128i k3 = _mm_set1_epi8(3); in DoFilter4()
382 const __m128i k4 = _mm_set1_epi8(4); in DoFilter4()
383 __m128i not_hev; in DoFilter4()
384 __m128i t1, t2, t3; in DoFilter4()
420 static WEBP_INLINE void DoFilter6(__m128i* const p2, __m128i* const p1, in DoFilter6()
421 __m128i* const p0, __m128i* const q0, in DoFilter6()
422 __m128i* const q1, __m128i* const q2, in DoFilter6()
423 const __m128i* const mask, int hev_thresh) { in DoFilter6()
424 const __m128i zero = _mm_setzero_si128(); in DoFilter6()
425 const __m128i sign_bit = _mm_set1_epi8(0x80); in DoFilter6()
426 __m128i a, not_hev; in DoFilter6()
436 const __m128i m = _mm_andnot_si128(not_hev, *mask); in DoFilter6()
437 const __m128i f = _mm_and_si128(a, m); in DoFilter6()
442 const __m128i k9 = _mm_set1_epi16(0x0900); in DoFilter6()
443 const __m128i k63 = _mm_set1_epi16(63); in DoFilter6()
445 const __m128i m = _mm_and_si128(not_hev, *mask); in DoFilter6()
446 const __m128i f = _mm_and_si128(a, m); in DoFilter6()
448 const __m128i f_lo = _mm_unpacklo_epi8(zero, f); in DoFilter6()
449 const __m128i f_hi = _mm_unpackhi_epi8(zero, f); in DoFilter6()
451 const __m128i f9_lo = _mm_mulhi_epi16(f_lo, k9); // Filter (lo) * 9 in DoFilter6()
452 const __m128i f9_hi = _mm_mulhi_epi16(f_hi, k9); // Filter (hi) * 9 in DoFilter6()
454 const __m128i a2_lo = _mm_add_epi16(f9_lo, k63); // Filter * 9 + 63 in DoFilter6()
455 const __m128i a2_hi = _mm_add_epi16(f9_hi, k63); // Filter * 9 + 63 in DoFilter6()
457 const __m128i a1_lo = _mm_add_epi16(a2_lo, f9_lo); // Filter * 18 + 63 in DoFilter6()
458 const __m128i a1_hi = _mm_add_epi16(a2_hi, f9_hi); // Filter * 18 + 63 in DoFilter6()
460 const __m128i a0_lo = _mm_add_epi16(a1_lo, f9_lo); // Filter * 27 + 63 in DoFilter6()
461 const __m128i a0_hi = _mm_add_epi16(a1_hi, f9_hi); // Filter * 27 + 63 in DoFilter6()
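
Here f was interleaved into the high byte of each 16-bit lane (f_lo = unpacklo(zero, f) holds f * 256 as a signed lane), so _mm_mulhi_epi16 with k9 = 0x0900 = 2304 computes (f * 256 * 2304) >> 16 = 9 * f exactly. The add ladder then forms 9f + 63, 18f + 63, and 27f + 63, which after the >> 7 in the elided Update2Pixels calls become the spec's three strong-filter taps. An exhaustive check of the multiply:

    #include <assert.h>
    #include <stdint.h>

    static void check_mul9(void) {
      int f;
      for (f = -128; f < 128; ++f) {
        const int16_t lane = (int16_t)(f * 256);       /* unpacklo(zero, f) */
        assert((int16_t)(((int32_t)lane * 0x0900) >> 16) == 9 * f);
      }
    }
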
471 __m128i* const p, __m128i* const q) { in Load8x4()
474 const __m128i A0 = _mm_set_epi32( in Load8x4()
477 const __m128i A1 = _mm_set_epi32( in Load8x4()
483 const __m128i B0 = _mm_unpacklo_epi8(A0, A1); in Load8x4()
484 const __m128i B1 = _mm_unpackhi_epi8(A0, A1); in Load8x4()
488 const __m128i C0 = _mm_unpacklo_epi16(B0, B1); in Load8x4()
489 const __m128i C1 = _mm_unpackhi_epi16(B0, B1); in Load8x4()
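
Load8x4 is a byte transpose: interleaving rows pairwise at 8-bit and then 16-bit granularity regroups the data column-major, so p and q end up holding columns of the original rows. The same two-stage unpack pattern on a 4x4 block, as a self-contained illustration (the real routine handles 8x4):

    #include <emmintrin.h>
    #include <stdint.h>
    #include <string.h>

    static __m128i LoadRow4(const uint8_t* p) {  /* 4 bytes into the low lane */
      int32_t v;
      memcpy(&v, p, 4);
      return _mm_cvtsi32_si128(v);
    }

    static void Transpose4x4(const uint8_t in[16], uint8_t out[16]) {
      const __m128i r0 = _mm_unpacklo_epi8(LoadRow4(in + 0), LoadRow4(in + 4));
      const __m128i r1 = _mm_unpacklo_epi8(LoadRow4(in + 8), LoadRow4(in + 12));
      /* r0 = a0 b0 a1 b1 ...; r1 = c0 d0 c1 d1 ...; interleave 16-bit pairs: */
      const __m128i t = _mm_unpacklo_epi16(r0, r1);  /* a0 b0 c0 d0 | a1 b1 c1 d1 ... */
      _mm_storeu_si128((__m128i*)out, t);            /* columns, one per 4 bytes */
    }
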
500 __m128i* const p1, __m128i* const p0, in Load16x4()
501 __m128i* const q0, __m128i* const q1) { in Load16x4()
525 const __m128i t1 = *p1; in Load16x4()
526 const __m128i t2 = *q0; in Load16x4()
534 static WEBP_INLINE void Store4x4(__m128i* const x, uint8_t* dst, int stride) { in Store4x4()
543 static WEBP_INLINE void Store16x4(const __m128i* const p1, in Store16x4()
544 const __m128i* const p0, in Store16x4()
545 const __m128i* const q0, in Store16x4()
546 const __m128i* const q1, in Store16x4()
549 __m128i t1, p1_s, p0_s, q0_s, q1_s; in Store16x4()
589 __m128i p1 = _mm_loadu_si128((__m128i*)&p[-2 * stride]); in SimpleVFilter16()
590 __m128i p0 = _mm_loadu_si128((__m128i*)&p[-stride]); in SimpleVFilter16()
591 __m128i q0 = _mm_loadu_si128((__m128i*)&p[0]); in SimpleVFilter16()
592 __m128i q1 = _mm_loadu_si128((__m128i*)&p[stride]); in SimpleVFilter16()
597 _mm_storeu_si128((__m128i*)&p[-stride], p0); in SimpleVFilter16()
598 _mm_storeu_si128((__m128i*)&p[0], q0); in SimpleVFilter16()
602 __m128i p1, p0, q0, q1; in SimpleHFilter16()
643 e1 = _mm_loadu_si128((__m128i*)&(p)[0 * stride]); \
644 e2 = _mm_loadu_si128((__m128i*)&(p)[1 * stride]); \
645 e3 = _mm_loadu_si128((__m128i*)&(p)[2 * stride]); \
646 e4 = _mm_loadu_si128((__m128i*)&(p)[3 * stride]); \
650 const __m128i U = _mm_loadl_epi64((__m128i*)&(u)[(stride)]); \
651 const __m128i V = _mm_loadl_epi64((__m128i*)&(v)[(stride)]); \
663 _mm_storel_epi64((__m128i*)&u[(stride)], p); \
665 _mm_storel_epi64((__m128i*)&v[(stride)], p); \
668 static WEBP_INLINE void ComplexMask(const __m128i* const p1, in ComplexMask()
669 const __m128i* const p0, in ComplexMask()
670 const __m128i* const q0, in ComplexMask()
671 const __m128i* const q1, in ComplexMask()
673 __m128i* const mask) { in ComplexMask()
674 const __m128i it = _mm_set1_epi8(ithresh); in ComplexMask()
675 const __m128i diff = _mm_subs_epu8(*mask, it); in ComplexMask()
676 const __m128i thresh_mask = _mm_cmpeq_epi8(diff, _mm_setzero_si128()); in ComplexMask()
677 __m128i filter_mask; in ComplexMask()
685 __m128i t1; in VFilter16()
686 __m128i mask; in VFilter16()
687 __m128i p2, p1, p0, q0, q1, q2; in VFilter16()
701 _mm_storeu_si128((__m128i*)&p[-3 * stride], p2); in VFilter16()
702 _mm_storeu_si128((__m128i*)&p[-2 * stride], p1); in VFilter16()
703 _mm_storeu_si128((__m128i*)&p[-1 * stride], p0); in VFilter16()
704 _mm_storeu_si128((__m128i*)&p[+0 * stride], q0); in VFilter16()
705 _mm_storeu_si128((__m128i*)&p[+1 * stride], q1); in VFilter16()
706 _mm_storeu_si128((__m128i*)&p[+2 * stride], q2); in VFilter16()
711 __m128i mask; in HFilter16()
712 __m128i p3, p2, p1, p0, q0, q1, q2, q3; in HFilter16()
732 __m128i p3, p2, p1, p0; // loop invariants in VFilter16i()
737 __m128i mask, tmp1, tmp2; in VFilter16i()
751 _mm_storeu_si128((__m128i*)&b[0 * stride], p1); in VFilter16i()
752 _mm_storeu_si128((__m128i*)&b[1 * stride], p0); in VFilter16i()
753 _mm_storeu_si128((__m128i*)&b[2 * stride], p3); in VFilter16i()
754 _mm_storeu_si128((__m128i*)&b[3 * stride], p2); in VFilter16i()
765 __m128i p3, p2, p1, p0; // loop invariants in HFilter16i()
770 __m128i mask, tmp1, tmp2; in HFilter16i()
793 __m128i mask; in VFilter8()
794 __m128i t1, p2, p1, p0, q0, q1, q2; in VFilter8()
818 __m128i mask; in HFilter8()
819 __m128i p3, p2, p1, p0, q0, q1, q2, q3; in HFilter8()
838 __m128i mask; in VFilter8i()
839 __m128i t1, t2, p1, p0, q0, q1; in VFilter8i()
864 __m128i mask; in HFilter8i()
865 __m128i t1, t2, p1, p0, q0, q1; in HFilter8i()
897 const __m128i one = _mm_set1_epi8(1); in VE4()
898 const __m128i ABCDEFGH = _mm_loadl_epi64((__m128i*)(dst - BPS - 1)); in VE4()
899 const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 1); in VE4()
900 const __m128i CDEFGH00 = _mm_srli_si128(ABCDEFGH, 2); in VE4()
901 const __m128i a = _mm_avg_epu8(ABCDEFGH, CDEFGH00); in VE4()
902 const __m128i lsb = _mm_and_si128(_mm_xor_si128(ABCDEFGH, CDEFGH00), one); in VE4()
903 const __m128i b = _mm_subs_epu8(a, lsb); in VE4()
904 const __m128i avg = _mm_avg_epu8(b, BCDEFGH0); in VE4()
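
_mm_avg_epu8 computes (a + b + 1) >> 1, which rounds up, while the 3-tap smoothing filter needs (A + 2B + C + 2) >> 2. Subtracting the low bit of A ^ C from avg(A, C) before the second average corrects the double rounding. The identity behind lines 901-904 (and behind LD4/VR4/RD4 below) can be checked exhaustively:

    #include <assert.h>

    static int avg(int x, int y) { return (x + y + 1) >> 1; }  /* _mm_avg_epu8 */

    static void check_identity(void) {
      int a, b, c;
      for (a = 0; a < 256; ++a)
        for (b = 0; b < 256; ++b)
          for (c = 0; c < 256; ++c)
            assert(((a + 2 * b + c + 2) >> 2) ==
                   avg(avg(a, c) - ((a ^ c) & 1), b));
    }
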
913 const __m128i one = _mm_set1_epi8(1); in LD4()
914 const __m128i ABCDEFGH = _mm_loadl_epi64((__m128i*)(dst - BPS)); in LD4()
915 const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 1); in LD4()
916 const __m128i CDEFGH00 = _mm_srli_si128(ABCDEFGH, 2); in LD4()
917 const __m128i CDEFGHH0 = _mm_insert_epi16(CDEFGH00, dst[-BPS + 7], 3); in LD4()
918 const __m128i avg1 = _mm_avg_epu8(ABCDEFGH, CDEFGHH0); in LD4()
919 const __m128i lsb = _mm_and_si128(_mm_xor_si128(ABCDEFGH, CDEFGHH0), one); in LD4()
920 const __m128i avg2 = _mm_subs_epu8(avg1, lsb); in LD4()
921 const __m128i abcdefg = _mm_avg_epu8(avg2, BCDEFGH0); in LD4()
929 const __m128i one = _mm_set1_epi8(1); in VR4()
934 const __m128i XABCD = _mm_loadl_epi64((__m128i*)(dst - BPS - 1)); in VR4()
935 const __m128i ABCD0 = _mm_srli_si128(XABCD, 1); in VR4()
936 const __m128i abcd = _mm_avg_epu8(XABCD, ABCD0); in VR4()
937 const __m128i _XABCD = _mm_slli_si128(XABCD, 1); in VR4()
938 const __m128i IXABCD = _mm_insert_epi16(_XABCD, I | (X << 8), 0); in VR4()
939 const __m128i avg1 = _mm_avg_epu8(IXABCD, ABCD0); in VR4()
940 const __m128i lsb = _mm_and_si128(_mm_xor_si128(IXABCD, ABCD0), one); in VR4()
941 const __m128i avg2 = _mm_subs_epu8(avg1, lsb); in VR4()
942 const __m128i efgh = _mm_avg_epu8(avg2, XABCD); in VR4()
954 const __m128i one = _mm_set1_epi8(1); in VL4()
955 const __m128i ABCDEFGH = _mm_loadl_epi64((__m128i*)(dst - BPS)); in VL4()
956 const __m128i BCDEFGH_ = _mm_srli_si128(ABCDEFGH, 1); in VL4()
957 const __m128i CDEFGH__ = _mm_srli_si128(ABCDEFGH, 2); in VL4()
958 const __m128i avg1 = _mm_avg_epu8(ABCDEFGH, BCDEFGH_); in VL4()
959 const __m128i avg2 = _mm_avg_epu8(CDEFGH__, BCDEFGH_); in VL4()
960 const __m128i avg3 = _mm_avg_epu8(avg1, avg2); in VL4()
961 const __m128i lsb1 = _mm_and_si128(_mm_xor_si128(avg1, avg2), one); in VL4()
962 const __m128i ab = _mm_xor_si128(ABCDEFGH, BCDEFGH_); in VL4()
963 const __m128i bc = _mm_xor_si128(CDEFGH__, BCDEFGH_); in VL4()
964 const __m128i abbc = _mm_or_si128(ab, bc); in VL4()
965 const __m128i lsb2 = _mm_and_si128(abbc, lsb1); in VL4()
966 const __m128i avg4 = _mm_subs_epu8(avg3, lsb2); in VL4()
979 const __m128i one = _mm_set1_epi8(1); in RD4()
980 const __m128i XABCD = _mm_loadl_epi64((__m128i*)(dst - BPS - 1)); in RD4()
981 const __m128i ____XABCD = _mm_slli_si128(XABCD, 4); in RD4()
986 const __m128i LKJI_____ = in RD4()
988 const __m128i LKJIXABCD = _mm_or_si128(LKJI_____, ____XABCD); in RD4()
989 const __m128i KJIXABCD_ = _mm_srli_si128(LKJIXABCD, 1); in RD4()
990 const __m128i JIXABCD__ = _mm_srli_si128(LKJIXABCD, 2); in RD4()
991 const __m128i avg1 = _mm_avg_epu8(JIXABCD__, LKJIXABCD); in RD4()
992 const __m128i lsb = _mm_and_si128(_mm_xor_si128(JIXABCD__, LKJIXABCD), one); in RD4()
993 const __m128i avg2 = _mm_subs_epu8(avg1, lsb); in RD4()
994 const __m128i abcdefg = _mm_avg_epu8(avg2, KJIXABCD_); in RD4()
1009 const __m128i zero = _mm_setzero_si128(); in TrueMotion()
1012 const __m128i top_values = _mm_cvtsi32_si128(WebPMemToUint32(top)); in TrueMotion()
1013 const __m128i top_base = _mm_unpacklo_epi8(top_values, zero); in TrueMotion()
1016 const __m128i base = _mm_set1_epi16(val); in TrueMotion()
1017 const __m128i out = _mm_packus_epi16(_mm_add_epi16(base, top_base), zero); in TrueMotion()
1021 const __m128i top_values = _mm_loadl_epi64((const __m128i*)top); in TrueMotion()
1022 const __m128i top_base = _mm_unpacklo_epi8(top_values, zero); in TrueMotion()
1025 const __m128i base = _mm_set1_epi16(val); in TrueMotion()
1026 const __m128i out = _mm_packus_epi16(_mm_add_epi16(base, top_base), zero); in TrueMotion()
1027 _mm_storel_epi64((__m128i*)dst, out); in TrueMotion()
1030 const __m128i top_values = _mm_loadu_si128((const __m128i*)top); in TrueMotion()
1031 const __m128i top_base_0 = _mm_unpacklo_epi8(top_values, zero); in TrueMotion()
1032 const __m128i top_base_1 = _mm_unpackhi_epi8(top_values, zero); in TrueMotion()
1035 const __m128i base = _mm_set1_epi16(val); in TrueMotion()
1036 const __m128i out_0 = _mm_add_epi16(base, top_base_0); in TrueMotion()
1037 const __m128i out_1 = _mm_add_epi16(base, top_base_1); in TrueMotion()
1038 const __m128i out = _mm_packus_epi16(out_0, out_1); in TrueMotion()
1039 _mm_storeu_si128((__m128i*)dst, out); in TrueMotion()
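
TrueMotion prediction is clip(left + top - top_left) per pixel. The code widens the top row to 16 bits once, then for each row broadcasts a single value val (presumably dst[-1] - top[-1], i.e. left minus top-left, from the elided lines) and lets _mm_packus_epi16 do the clip to [0, 255]. A scalar model of one pixel:

    static unsigned char tm_pixel(unsigned char left, unsigned char top,
                                  unsigned char top_left) {
      const int v = left + top - top_left;          /* val + top_base, per lane */
      return (unsigned char)(v < 0 ? 0 : v > 255 ? 255 : v);  /* packus clip */
    }
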
1049 const __m128i top = _mm_loadu_si128((const __m128i*)(dst - BPS)); in VE16()
1052 _mm_storeu_si128((__m128i*)(dst + j * BPS), top); in VE16()
1059 const __m128i values = _mm_set1_epi8(dst[-1]); in HE16()
1060 _mm_storeu_si128((__m128i*)dst, values); in HE16()
1067 const __m128i values = _mm_set1_epi8(v); in Put16()
1069 _mm_storeu_si128((__m128i*)(dst + j * BPS), values); in Put16()
1074 const __m128i zero = _mm_setzero_si128(); in DC16()
1075 const __m128i top = _mm_loadu_si128((const __m128i*)(dst - BPS)); in DC16()
1076 const __m128i sad8x2 = _mm_sad_epu8(top, zero); in DC16()
1078 const __m128i sum = _mm_add_epi16(sad8x2, _mm_shuffle_epi32(sad8x2, 2)); in DC16()
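
_mm_sad_epu8 against zero sums each 8-byte half of the register into a 16-bit result in the corresponding 64-bit lane; _mm_shuffle_epi32(sad8x2, 2) moves the upper sum into the low dword so a single 16-bit add yields the total of all 16 top pixels (the left column and the rounding shift happen in the elided lines). The same horizontal-sum idiom, stand-alone:

    #include <emmintrin.h>
    #include <stdint.h>

    static int SumBytes16(const uint8_t p[16]) {
      const __m128i zero = _mm_setzero_si128();
      const __m128i x = _mm_loadu_si128((const __m128i*)p);
      const __m128i sad8x2 = _mm_sad_epu8(x, zero);  /* two partial sums */
      const __m128i sum = _mm_add_epi16(sad8x2, _mm_shuffle_epi32(sad8x2, 2));
      return _mm_cvtsi128_si32(sum);                 /* <= 16 * 255, fits */
    }
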
1100 const __m128i zero = _mm_setzero_si128(); in DC16NoLeft()
1101 const __m128i top = _mm_loadu_si128((const __m128i*)(dst - BPS)); in DC16NoLeft()
1102 const __m128i sad8x2 = _mm_sad_epu8(top, zero); in DC16NoLeft()
1104 const __m128i sum = _mm_add_epi16(sad8x2, _mm_shuffle_epi32(sad8x2, 2)); in DC16NoLeft()
1118 const __m128i top = _mm_loadl_epi64((const __m128i*)(dst - BPS)); in VE8uv()
1120 _mm_storel_epi64((__m128i*)(dst + j * BPS), top); in VE8uv()
1127 const __m128i values = _mm_set1_epi8(dst[-1]); in HE8uv()
1128 _mm_storel_epi64((__m128i*)dst, values); in HE8uv()
1136 const __m128i values = _mm_set1_epi8(v); in Put8x8uv()
1138 _mm_storel_epi64((__m128i*)(dst + j * BPS), values); in Put8x8uv()
1143 const __m128i zero = _mm_setzero_si128(); in DC8uv()
1144 const __m128i top = _mm_loadl_epi64((const __m128i*)(dst - BPS)); in DC8uv()
1145 const __m128i sum = _mm_sad_epu8(top, zero); in DC8uv()
1158 const __m128i zero = _mm_setzero_si128(); in DC8uvNoLeft()
1159 const __m128i top = _mm_loadl_epi64((const __m128i*)(dst - BPS)); in DC8uvNoLeft()
1160 const __m128i sum = _mm_sad_epu8(top, zero); in DC8uvNoLeft()