1 /*
2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include <emmintrin.h> // SSE2
12
13 #include "./vpx_dsp_rtcd.h"
14 #include "vpx_ports/mem.h"
15 #include "vpx_ports/emmintrin_compat.h"
16 #include "vpx_dsp/x86/mem_sse2.h"
17
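// Per-byte |a - b|: SSE2 has no unsigned byte absolute-difference
// instruction, but subs_epu8 saturates a negative difference to zero, so
// OR-ing the two one-sided differences yields the absolute value.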
18 static INLINE __m128i abs_diff(__m128i a, __m128i b) {
19 return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
20 }
21
22 // filter_mask and hev_mask
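// Each __m128i holds two 8-pixel rows: one row in the low 64 bits and an
// adjacent row in the high 64 bits.  On exit, hev is 0xff for pixels where
// max(|p1 - p0|, |q1 - q0|) > thresh, and the low 64 bits of mask are 0xff
// for pixels that pass the filter_mask() test (|p0 - q0| * 2 +
// |p1 - q1| / 2 <= blimit and every neighbouring difference <= limit).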
23 #define FILTER_HEV_MASK \
24 do { \
25 /* abs(q1 - q0), abs(p1 - p0) */ \
26 __m128i flat = abs_diff(q1p1, q0p0); \
27 /* abs(p1 - q1), abs(p0 - q0) */ \
28 const __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0); \
29 __m128i abs_p0q0, abs_p1q1, work; \
30 \
31 /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */ \
32 hev = \
33 _mm_unpacklo_epi8(_mm_max_epu8(flat, _mm_srli_si128(flat, 8)), zero); \
34 hev = _mm_cmpgt_epi16(hev, thresh_v); \
35 hev = _mm_packs_epi16(hev, hev); \
36 \
37 /* const int8_t mask = filter_mask(*limit, *blimit, */ \
38 /* p3, p2, p1, p0, q0, q1, q2, q3); */ \
39 abs_p0q0 = \
40 _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */ \
41 abs_p1q1 = \
42 _mm_unpackhi_epi8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p1 - q1) */ \
43 abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9); \
44 abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */ \
45 /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ \
46 mask = _mm_adds_epu8(abs_p0q0, abs_p1q1); \
47 /* abs(p3 - p2), abs(p2 - p1) */ \
48 work = abs_diff(p3p2, p2p1); \
49 flat = _mm_max_epu8(work, flat); \
50 /* abs(q3 - q2), abs(q2 - q1) */ \
51 work = abs_diff(q3q2, q2q1); \
52 flat = _mm_max_epu8(work, flat); \
53 flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); \
54 mask = _mm_unpacklo_epi64(mask, flat); \
55 mask = _mm_subs_epu8(mask, limit_v); \
56 mask = _mm_cmpeq_epi8(mask, zero); \
57 mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8)); \
58 } while (0)
59
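// FILTER4 applies the standard 4-tap filter to the rows packed in
// p1p0/q1q0.  Pixels are made signed by XOR-ing with 0x80, then:
//   filter  = clamp(ps1 - qs1) & hev
//   filter  = clamp(filter + 3 * (qs0 - ps0)) & mask
//   filter1 = clamp(filter + 4) >> 3   (subtracted from qs0)
//   filter2 = clamp(filter + 3) >> 3   (added to ps0)
//   p1/q1 move by (filter1 + 1) >> 1 only where hev is not set.
// The ">> 3" is done by widening each byte into the high half of a 16-bit
// lane and using an arithmetic shift by 11 (= 8 + 3).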
60 #define FILTER4 \
61 do { \
62 const __m128i t3t4 = \
63 _mm_set_epi8(3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4); \
64 const __m128i t80 = _mm_set1_epi8((int8_t)0x80); \
65 __m128i filter, filter2filter1, work; \
66 \
67 ps1ps0 = _mm_xor_si128(p1p0, t80); /* ^ 0x80 */ \
68 qs1qs0 = _mm_xor_si128(q1q0, t80); \
69 \
70 /* int8_t filter = signed_char_clamp(ps1 - qs1) & hev; */ \
71 work = _mm_subs_epi8(ps1ps0, qs1qs0); \
72 filter = _mm_and_si128(_mm_srli_si128(work, 8), hev); \
73 /* filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; */ \
74 filter = _mm_subs_epi8(filter, work); \
75 filter = _mm_subs_epi8(filter, work); \
76 filter = _mm_subs_epi8(filter, work); /* + 3 * (qs0 - ps0) */ \
77 filter = _mm_and_si128(filter, mask); /* & mask */ \
78 filter = _mm_unpacklo_epi64(filter, filter); \
79 \
80 /* filter1 = signed_char_clamp(filter + 4) >> 3; */ \
81 /* filter2 = signed_char_clamp(filter + 3) >> 3; */ \
82 filter2filter1 = _mm_adds_epi8(filter, t3t4); /* signed_char_clamp */ \
83 filter = _mm_unpackhi_epi8(filter2filter1, filter2filter1); \
84 filter2filter1 = _mm_unpacklo_epi8(filter2filter1, filter2filter1); \
85 filter2filter1 = _mm_srai_epi16(filter2filter1, 11); /* >> 3 */ \
86 filter = _mm_srai_epi16(filter, 11); /* >> 3 */ \
87 filter2filter1 = _mm_packs_epi16(filter2filter1, filter); \
88 \
89 /* filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; */ \
90 filter = _mm_subs_epi8(filter2filter1, ff); /* + 1 */ \
91 filter = _mm_unpacklo_epi8(filter, filter); \
92 filter = _mm_srai_epi16(filter, 9); /* round */ \
93 filter = _mm_packs_epi16(filter, filter); \
94 filter = _mm_andnot_si128(hev, filter); \
95 \
96 hev = _mm_unpackhi_epi64(filter2filter1, filter); \
97 filter2filter1 = _mm_unpacklo_epi64(filter2filter1, filter); \
98 \
99 /* signed_char_clamp(qs1 - filter), signed_char_clamp(qs0 - filter1) */ \
100 qs1qs0 = _mm_subs_epi8(qs1qs0, filter2filter1); \
101 /* signed_char_clamp(ps1 + filter), signed_char_clamp(ps0 + filter2) */ \
102 ps1ps0 = _mm_adds_epi8(ps1ps0, hev); \
103 qs1qs0 = _mm_xor_si128(qs1qs0, t80); /* ^ 0x80 */ \
104 ps1ps0 = _mm_xor_si128(ps1ps0, t80); /* ^ 0x80 */ \
105 } while (0)
106
107 void vpx_lpf_horizontal_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit,
108 const uint8_t *limit, const uint8_t *thresh) {
109 const __m128i zero = _mm_set1_epi16(0);
110 const __m128i limit_v =
111 _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)blimit),
112 _mm_loadl_epi64((const __m128i *)limit));
113 const __m128i thresh_v =
114 _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)thresh), zero);
115 const __m128i ff = _mm_cmpeq_epi8(zero, zero);
116 __m128i q1p1, q0p0, p3p2, p2p1, p1p0, q3q2, q2q1, q1q0, ps1ps0, qs1qs0;
117 __m128i mask, hev;
118
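// Register layout: each name lists the high half then the low half, e.g.
// q1p1 holds the q1 row in bits 127:64 and the p1 row in bits 63:0, so one
// 128-bit operation processes two rows of the filter at once.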
119 p3p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * pitch)),
120 _mm_loadl_epi64((__m128i *)(s - 4 * pitch)));
121 q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * pitch)),
122 _mm_loadl_epi64((__m128i *)(s + 1 * pitch)));
123 q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * pitch)),
124 _mm_loadl_epi64((__m128i *)(s + 0 * pitch)));
125 q3q2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s + 2 * pitch)),
126 _mm_loadl_epi64((__m128i *)(s + 3 * pitch)));
127 p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
128 p2p1 = _mm_unpacklo_epi64(q1p1, p3p2);
129 q1q0 = _mm_unpackhi_epi64(q0p0, q1p1);
130 q2q1 = _mm_unpacklo_epi64(_mm_srli_si128(q1p1, 8), q3q2);
131
132 FILTER_HEV_MASK;
133 FILTER4;
134
135 _mm_storeh_pi((__m64 *)(s - 2 * pitch), _mm_castsi128_ps(ps1ps0)); // *op1
136 _mm_storel_epi64((__m128i *)(s - 1 * pitch), ps1ps0); // *op0
137 _mm_storel_epi64((__m128i *)(s + 0 * pitch), qs1qs0); // *oq0
138 _mm_storeh_pi((__m64 *)(s + 1 * pitch), _mm_castsi128_ps(qs1qs0)); // *oq1
139 }
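// Usage sketch (illustrative only, not part of the library): blimit, limit
// and thresh each point at a small aligned array holding the per-level
// threshold byte, as prepared by the codec's loop-filter setup code.  A
// hypothetical call filtering the horizontal edge above row `r` could look
// like this (the threshold values below are arbitrary examples):
//
//   DECLARE_ALIGNED(16, uint8_t, blimit_vec[16]);
//   DECLARE_ALIGNED(16, uint8_t, limit_vec[16]);
//   DECLARE_ALIGNED(16, uint8_t, thresh_vec[16]);
//   memset(blimit_vec, 60, sizeof(blimit_vec));
//   memset(limit_vec, 10, sizeof(limit_vec));
//   memset(thresh_vec, 2, sizeof(thresh_vec));
//   vpx_lpf_horizontal_4_sse2(buf + r * stride, stride, blimit_vec,
//                             limit_vec, thresh_vec);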
140
141 void vpx_lpf_vertical_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit,
142 const uint8_t *limit, const uint8_t *thresh) {
143 const __m128i zero = _mm_set1_epi16(0);
144 const __m128i limit_v =
145 _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)blimit),
146 _mm_loadl_epi64((const __m128i *)limit));
147 const __m128i thresh_v =
148 _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)thresh), zero);
149 const __m128i ff = _mm_cmpeq_epi8(zero, zero);
150 __m128i x0, x1, x2, x3;
151 __m128i q1p1, q0p0, p3p2, p2p1, p1p0, q3q2, q2q1, q1q0, ps1ps0, qs1qs0;
152 __m128i mask, hev;
153
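// The vertical filter loads 8 bytes from each of 8 rows straddling the
// column edge, transposes them so the edge becomes horizontal, runs the
// same FILTER_HEV_MASK/FILTER4 kernel, and transposes the 4 modified
// columns back before storing.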
154 // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
155 q1q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 0 * pitch - 4)),
156 _mm_loadl_epi64((__m128i *)(s + 1 * pitch - 4)));
157
158 // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
159 x1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 2 * pitch - 4)),
160 _mm_loadl_epi64((__m128i *)(s + 3 * pitch - 4)));
161
162 // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
163 x2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 4 * pitch - 4)),
164 _mm_loadl_epi64((__m128i *)(s + 5 * pitch - 4)));
165
166 // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
167 x3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 6 * pitch - 4)),
168 _mm_loadl_epi64((__m128i *)(s + 7 * pitch - 4)));
169
170 // Transpose 8x8
171 // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
172 p1p0 = _mm_unpacklo_epi16(q1q0, x1);
173 // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
174 x0 = _mm_unpacklo_epi16(x2, x3);
175 // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
176 p3p2 = _mm_unpacklo_epi32(p1p0, x0);
177 // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
178 p1p0 = _mm_unpackhi_epi32(p1p0, x0);
179 p3p2 = _mm_unpackhi_epi64(p3p2, _mm_slli_si128(p3p2, 8)); // swap lo and high
180 p1p0 = _mm_unpackhi_epi64(p1p0, _mm_slli_si128(p1p0, 8)); // swap lo and high
181
182 // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
183 q1q0 = _mm_unpackhi_epi16(q1q0, x1);
184 // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
185 x2 = _mm_unpackhi_epi16(x2, x3);
186 // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
187 q3q2 = _mm_unpackhi_epi32(q1q0, x2);
188 // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
189 q1q0 = _mm_unpacklo_epi32(q1q0, x2);
190
191 q0p0 = _mm_unpacklo_epi64(p1p0, q1q0);
192 q1p1 = _mm_unpackhi_epi64(p1p0, q1q0);
193 p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
194 p2p1 = _mm_unpacklo_epi64(q1p1, p3p2);
195 q2q1 = _mm_unpacklo_epi64(_mm_srli_si128(q1p1, 8), q3q2);
196
197 FILTER_HEV_MASK;
198 FILTER4;
199
200 // Transpose 8x4 to 4x8
201 // qs1qs0: 20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37
202 // ps1ps0: 10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07
203 // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17
204 ps1ps0 = _mm_unpackhi_epi64(ps1ps0, _mm_slli_si128(ps1ps0, 8));
205 // 10 30 11 31 12 32 13 33 14 34 15 35 16 36 17 37
206 x0 = _mm_unpackhi_epi8(ps1ps0, qs1qs0);
207 // 00 20 01 21 02 22 03 23 04 24 05 25 06 26 07 27
208 ps1ps0 = _mm_unpacklo_epi8(ps1ps0, qs1qs0);
209 // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
210 qs1qs0 = _mm_unpackhi_epi8(ps1ps0, x0);
211 // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
212 ps1ps0 = _mm_unpacklo_epi8(ps1ps0, x0);
213
214 storeu_uint32(s + 0 * pitch - 2, _mm_cvtsi128_si32(ps1ps0));
215 ps1ps0 = _mm_srli_si128(ps1ps0, 4);
216 storeu_uint32(s + 1 * pitch - 2, _mm_cvtsi128_si32(ps1ps0));
217 ps1ps0 = _mm_srli_si128(ps1ps0, 4);
218 storeu_uint32(s + 2 * pitch - 2, _mm_cvtsi128_si32(ps1ps0));
219 ps1ps0 = _mm_srli_si128(ps1ps0, 4);
220 storeu_uint32(s + 3 * pitch - 2, _mm_cvtsi128_si32(ps1ps0));
221
222 storeu_uint32(s + 4 * pitch - 2, _mm_cvtsi128_si32(qs1qs0));
223 qs1qs0 = _mm_srli_si128(qs1qs0, 4);
224 storeu_uint32(s + 5 * pitch - 2, _mm_cvtsi128_si32(qs1qs0));
225 qs1qs0 = _mm_srli_si128(qs1qs0, 4);
226 storeu_uint32(s + 6 * pitch - 2, _mm_cvtsi128_si32(qs1qs0));
227 qs1qs0 = _mm_srli_si128(qs1qs0, 4);
228 storeu_uint32(s + 7 * pitch - 2, _mm_cvtsi128_si32(qs1qs0));
229 }
230
231 void vpx_lpf_horizontal_16_sse2(unsigned char *s, int pitch,
232 const unsigned char *blimit,
233 const unsigned char *limit,
234 const unsigned char *thresh) {
235 const __m128i zero = _mm_set1_epi16(0);
236 const __m128i one = _mm_set1_epi8(1);
237 const __m128i blimit_v = _mm_load_si128((const __m128i *)blimit);
238 const __m128i limit_v = _mm_load_si128((const __m128i *)limit);
239 const __m128i thresh_v = _mm_load_si128((const __m128i *)thresh);
240 __m128i mask, hev, flat, flat2;
241 __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1;
242 __m128i abs_p1p0;
243
244 q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * pitch));
245 q4p4 = _mm_castps_si128(
246 _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * pitch)));
247 q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * pitch));
248 q3p3 = _mm_castps_si128(
249 _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * pitch)));
250 q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * pitch));
251 q2p2 = _mm_castps_si128(
252 _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * pitch)));
253 q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * pitch));
254 q1p1 = _mm_castps_si128(
255 _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * pitch)));
256 p1q1 = _mm_shuffle_epi32(q1p1, 78);
257 q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * pitch));
258 q0p0 = _mm_castps_si128(
259 _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * pitch)));
260 p0q0 = _mm_shuffle_epi32(q0p0, 78);
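// _mm_shuffle_epi32(x, 78): 78 = 0b01001110 selects dwords 2,3,0,1 and so
// swaps the two 64-bit halves; p1q1/p0q0 are q1p1/q0p0 with the p and q
// rows exchanged, giving the cross terms |p0 - q0| and |p1 - q1| below.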
261
262 {
263 __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work;
264 abs_p1p0 = abs_diff(q1p1, q0p0);
265 abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
266 fe = _mm_set1_epi8((int8_t)0xfe);
267 ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
268 abs_p0q0 = abs_diff(q0p0, p0q0);
269 abs_p1q1 = abs_diff(q1p1, p1q1);
270 flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
271 hev = _mm_subs_epu8(flat, thresh_v);
272 hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
273
274 abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
275 abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
276 mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v);
277 mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
278 // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
279 mask = _mm_max_epu8(abs_p1p0, mask);
280 // mask |= (abs(p1 - p0) > limit) * -1;
281 // mask |= (abs(q1 - q0) > limit) * -1;
282
283 work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2));
284 mask = _mm_max_epu8(work, mask);
285 mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
286 mask = _mm_subs_epu8(mask, limit_v);
287 mask = _mm_cmpeq_epi8(mask, zero);
288 }
289
290 // lp filter
291 {
292 const __m128i t4 = _mm_set1_epi8(4);
293 const __m128i t3 = _mm_set1_epi8(3);
294 const __m128i t80 = _mm_set1_epi8((int8_t)0x80);
295 const __m128i t1 = _mm_set1_epi16(0x1);
296 __m128i qs1ps1 = _mm_xor_si128(q1p1, t80);
297 __m128i qs0ps0 = _mm_xor_si128(q0p0, t80);
298 __m128i qs0 = _mm_xor_si128(p0q0, t80);
299 __m128i qs1 = _mm_xor_si128(p1q1, t80);
300 __m128i filt;
301 __m128i work_a;
302 __m128i filter1, filter2;
303 __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2;
304 __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0;
305
306 filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev);
307 work_a = _mm_subs_epi8(qs0, qs0ps0);
308 filt = _mm_adds_epi8(filt, work_a);
309 filt = _mm_adds_epi8(filt, work_a);
310 filt = _mm_adds_epi8(filt, work_a);
311 // (vpx_filter + 3 * (qs0 - ps0)) & mask
312 filt = _mm_and_si128(filt, mask);
313
314 filter1 = _mm_adds_epi8(filt, t4);
315 filter2 = _mm_adds_epi8(filt, t3);
316
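// _mm_unpacklo_epi8(zero, x) below places each filter byte in the high byte
// of a 16-bit lane, so the arithmetic shift by 0xB (= 8 + 3) performs the
// signed ">> 3" of the original byte while staying in 16-bit form for the
// negate/pack that follows.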
317 filter1 = _mm_unpacklo_epi8(zero, filter1);
318 filter1 = _mm_srai_epi16(filter1, 0xB);
319 filter2 = _mm_unpacklo_epi8(zero, filter2);
320 filter2 = _mm_srai_epi16(filter2, 0xB);
321
322 // Filter1 >> 3
323 filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1));
324 qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80);
325
326 // filt >> 1
327 filt = _mm_adds_epi16(filter1, t1);
328 filt = _mm_srai_epi16(filt, 1);
329 filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8),
330 filt);
331 filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt));
332 qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80);
333 // loopfilter done
334
335 {
336 __m128i work;
337 flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0));
338 flat = _mm_max_epu8(abs_p1p0, flat);
339 flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
340 flat = _mm_subs_epu8(flat, one);
341 flat = _mm_cmpeq_epi8(flat, zero);
342 flat = _mm_and_si128(flat, mask);
343
344 q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * pitch));
345 q5p5 = _mm_castps_si128(
346 _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * pitch)));
347
348 q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * pitch));
349 q6p6 = _mm_castps_si128(
350 _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * pitch)));
351 flat2 = _mm_max_epu8(abs_diff(q4p4, q0p0), abs_diff(q5p5, q0p0));
352
353 q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * pitch));
354 q7p7 = _mm_castps_si128(
355 _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * pitch)));
356 work = _mm_max_epu8(abs_diff(q6p6, q0p0), abs_diff(q7p7, q0p0));
357 flat2 = _mm_max_epu8(work, flat2);
358 flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
359 flat2 = _mm_subs_epu8(flat2, one);
360 flat2 = _mm_cmpeq_epi8(flat2, zero);
361 flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
362 }
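// At this point flat is 0xff where p1..p3/q1..q3 all lie within 1 of p0/q0
// (use the 8-tap filter for p2..q2), and flat2 additionally requires
// p4..p7/q4..q7 within 1 (use the 15-tap wide filter for p6..q6); both are
// already ANDed with the filter mask.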
363
364 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
365 // flat and wide flat calculations
366 {
367 const __m128i eight = _mm_set1_epi16(8);
368 const __m128i four = _mm_set1_epi16(4);
369 __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16;
370 __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
371 __m128i pixelFilter_p, pixelFilter_q;
372 __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
373 __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q;
374
375 p7_16 = _mm_unpacklo_epi8(q7p7, zero);
376 p6_16 = _mm_unpacklo_epi8(q6p6, zero);
377 p5_16 = _mm_unpacklo_epi8(q5p5, zero);
378 p4_16 = _mm_unpacklo_epi8(q4p4, zero);
379 p3_16 = _mm_unpacklo_epi8(q3p3, zero);
380 p2_16 = _mm_unpacklo_epi8(q2p2, zero);
381 p1_16 = _mm_unpacklo_epi8(q1p1, zero);
382 p0_16 = _mm_unpacklo_epi8(q0p0, zero);
383 q0_16 = _mm_unpackhi_epi8(q0p0, zero);
384 q1_16 = _mm_unpackhi_epi8(q1p1, zero);
385 q2_16 = _mm_unpackhi_epi8(q2p2, zero);
386 q3_16 = _mm_unpackhi_epi8(q3p3, zero);
387 q4_16 = _mm_unpackhi_epi8(q4p4, zero);
388 q5_16 = _mm_unpackhi_epi8(q5p5, zero);
389 q6_16 = _mm_unpackhi_epi8(q6p6, zero);
390 q7_16 = _mm_unpackhi_epi8(q7p7, zero);
391
392 pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16),
393 _mm_add_epi16(p4_16, p3_16));
394 pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16),
395 _mm_add_epi16(q4_16, q3_16));
396
397 pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16));
398 pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
399
400 pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16));
401 pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
402 pixelFilter_p =
403 _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q));
404 pixetFilter_p2p1p0 = _mm_add_epi16(
405 four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
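// pixelFilter_p now holds p6+..+p0 + q6+..+q0 + 8 (the 15-tap accumulator
// plus rounding) and pixetFilter_p2p1p0 holds p2+p1+p0 + q2+q1+q0 + 4 (the
// 7-tap accumulator for the flat filter).  Each output below adds the two
// outermost taps for that pixel and shifts; one tap is then subtracted per
// step so the window slides outward.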
406 res_p = _mm_srli_epi16(
407 _mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7_16, p0_16)), 4);
408 res_q = _mm_srli_epi16(
409 _mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7_16, q0_16)), 4);
410 flat2_q0p0 = _mm_packus_epi16(res_p, res_q);
411 res_p = _mm_srli_epi16(
412 _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3_16, p0_16)), 3);
413 res_q = _mm_srli_epi16(
414 _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3_16, q0_16)), 3);
415
416 flat_q0p0 = _mm_packus_epi16(res_p, res_q);
417
418 sum_p7 = _mm_add_epi16(p7_16, p7_16);
419 sum_q7 = _mm_add_epi16(q7_16, q7_16);
420 sum_p3 = _mm_add_epi16(p3_16, p3_16);
421 sum_q3 = _mm_add_epi16(q3_16, q3_16);
422
423 pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16);
424 pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16);
425 res_p = _mm_srli_epi16(
426 _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1_16)), 4);
427 res_q = _mm_srli_epi16(
428 _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1_16)), 4);
429 flat2_q1p1 = _mm_packus_epi16(res_p, res_q);
430
431 pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16);
432 pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16);
433 res_p = _mm_srli_epi16(
434 _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1_16)), 3);
435 res_q = _mm_srli_epi16(
436 _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1_16)), 3);
437 flat_q1p1 = _mm_packus_epi16(res_p, res_q);
438
439 sum_p7 = _mm_add_epi16(sum_p7, p7_16);
440 sum_q7 = _mm_add_epi16(sum_q7, q7_16);
441 sum_p3 = _mm_add_epi16(sum_p3, p3_16);
442 sum_q3 = _mm_add_epi16(sum_q3, q3_16);
443
444 pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16);
445 pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16);
446 res_p = _mm_srli_epi16(
447 _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2_16)), 4);
448 res_q = _mm_srli_epi16(
449 _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2_16)), 4);
450 flat2_q2p2 = _mm_packus_epi16(res_p, res_q);
451
452 pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16);
453 pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16);
454
455 res_p = _mm_srli_epi16(
456 _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2_16)), 3);
457 res_q = _mm_srli_epi16(
458 _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2_16)), 3);
459 flat_q2p2 = _mm_packus_epi16(res_p, res_q);
460
461 sum_p7 = _mm_add_epi16(sum_p7, p7_16);
462 sum_q7 = _mm_add_epi16(sum_q7, q7_16);
463 pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
464 pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
465 res_p = _mm_srli_epi16(
466 _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3_16)), 4);
467 res_q = _mm_srli_epi16(
468 _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3_16)), 4);
469 flat2_q3p3 = _mm_packus_epi16(res_p, res_q);
470
471 sum_p7 = _mm_add_epi16(sum_p7, p7_16);
472 sum_q7 = _mm_add_epi16(sum_q7, q7_16);
473 pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
474 pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
475 res_p = _mm_srli_epi16(
476 _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4_16)), 4);
477 res_q = _mm_srli_epi16(
478 _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4_16)), 4);
479 flat2_q4p4 = _mm_packus_epi16(res_p, res_q);
480
481 sum_p7 = _mm_add_epi16(sum_p7, p7_16);
482 sum_q7 = _mm_add_epi16(sum_q7, q7_16);
483 pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
484 pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
485 res_p = _mm_srli_epi16(
486 _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5_16)), 4);
487 res_q = _mm_srli_epi16(
488 _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5_16)), 4);
489 flat2_q5p5 = _mm_packus_epi16(res_p, res_q);
490
491 sum_p7 = _mm_add_epi16(sum_p7, p7_16);
492 sum_q7 = _mm_add_epi16(sum_q7, q7_16);
493 pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
494 pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
495 res_p = _mm_srli_epi16(
496 _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6_16)), 4);
497 res_q = _mm_srli_epi16(
498 _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6_16)), 4);
499 flat2_q6p6 = _mm_packus_epi16(res_p, res_q);
500 }
501 // wide flat
502 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
503
504 flat = _mm_shuffle_epi32(flat, 68);
505 flat2 = _mm_shuffle_epi32(flat2, 68);
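// 68 = 0b01000100 replicates the low 64 bits into both halves, so the
// 8-pixel flat/flat2 decisions apply to both the p and the q half of the
// qXpX registers below.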
506
507 q2p2 = _mm_andnot_si128(flat, q2p2);
508 flat_q2p2 = _mm_and_si128(flat, flat_q2p2);
509 q2p2 = _mm_or_si128(q2p2, flat_q2p2);
510
511 qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
512 flat_q1p1 = _mm_and_si128(flat, flat_q1p1);
513 q1p1 = _mm_or_si128(qs1ps1, flat_q1p1);
514
515 qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
516 flat_q0p0 = _mm_and_si128(flat, flat_q0p0);
517 q0p0 = _mm_or_si128(qs0ps0, flat_q0p0);
518
519 q6p6 = _mm_andnot_si128(flat2, q6p6);
520 flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6);
521 q6p6 = _mm_or_si128(q6p6, flat2_q6p6);
522 _mm_storel_epi64((__m128i *)(s - 7 * pitch), q6p6);
523 _mm_storeh_pi((__m64 *)(s + 6 * pitch), _mm_castsi128_ps(q6p6));
524
525 q5p5 = _mm_andnot_si128(flat2, q5p5);
526 flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
527 q5p5 = _mm_or_si128(q5p5, flat2_q5p5);
528 _mm_storel_epi64((__m128i *)(s - 6 * pitch), q5p5);
529 _mm_storeh_pi((__m64 *)(s + 5 * pitch), _mm_castsi128_ps(q5p5));
530
531 q4p4 = _mm_andnot_si128(flat2, q4p4);
532 flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
533 q4p4 = _mm_or_si128(q4p4, flat2_q4p4);
534 _mm_storel_epi64((__m128i *)(s - 5 * pitch), q4p4);
535 _mm_storeh_pi((__m64 *)(s + 4 * pitch), _mm_castsi128_ps(q4p4));
536
537 q3p3 = _mm_andnot_si128(flat2, q3p3);
538 flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
539 q3p3 = _mm_or_si128(q3p3, flat2_q3p3);
540 _mm_storel_epi64((__m128i *)(s - 4 * pitch), q3p3);
541 _mm_storeh_pi((__m64 *)(s + 3 * pitch), _mm_castsi128_ps(q3p3));
542
543 q2p2 = _mm_andnot_si128(flat2, q2p2);
544 flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
545 q2p2 = _mm_or_si128(q2p2, flat2_q2p2);
546 _mm_storel_epi64((__m128i *)(s - 3 * pitch), q2p2);
547 _mm_storeh_pi((__m64 *)(s + 2 * pitch), _mm_castsi128_ps(q2p2));
548
549 q1p1 = _mm_andnot_si128(flat2, q1p1);
550 flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
551 q1p1 = _mm_or_si128(q1p1, flat2_q1p1);
552 _mm_storel_epi64((__m128i *)(s - 2 * pitch), q1p1);
553 _mm_storeh_pi((__m64 *)(s + 1 * pitch), _mm_castsi128_ps(q1p1));
554
555 q0p0 = _mm_andnot_si128(flat2, q0p0);
556 flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
557 q0p0 = _mm_or_si128(q0p0, flat2_q0p0);
558 _mm_storel_epi64((__m128i *)(s - 1 * pitch), q0p0);
559 _mm_storeh_pi((__m64 *)(s - 0 * pitch), _mm_castsi128_ps(q0p0));
560 }
561 }
562
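// filter_add2_sub2() slides a running filter sum one step: the two new taps
// (a1, a2) are added and the two taps that drop out of the window (s1, s2)
// are removed.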
563 static INLINE __m128i filter_add2_sub2(const __m128i *const total,
564 const __m128i *const a1,
565 const __m128i *const a2,
566 const __m128i *const s1,
567 const __m128i *const s2) {
568 __m128i x = _mm_add_epi16(*a1, *total);
569 x = _mm_add_epi16(_mm_sub_epi16(x, _mm_add_epi16(*s1, *s2)), *a2);
570 return x;
571 }
572
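// filter8_mask()/filter16_mask() finish a filter tap sum (>> 3 for the
// 8-tap filter, >> 4 for the 15-tap filter), pack to bytes, and keep the
// result only where *flat is 0xff, falling back to *other_filt (the
// narrower filter's output or the original pixel) elsewhere.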
573 static INLINE __m128i filter8_mask(const __m128i *const flat,
574 const __m128i *const other_filt,
575 const __m128i *const f8_lo,
576 const __m128i *const f8_hi) {
577 const __m128i f8 =
578 _mm_packus_epi16(_mm_srli_epi16(*f8_lo, 3), _mm_srli_epi16(*f8_hi, 3));
579 const __m128i result = _mm_and_si128(*flat, f8);
580 return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result);
581 }
582
583 static INLINE __m128i filter16_mask(const __m128i *const flat,
584 const __m128i *const other_filt,
585 const __m128i *const f_lo,
586 const __m128i *const f_hi) {
587 const __m128i f =
588 _mm_packus_epi16(_mm_srli_epi16(*f_lo, 4), _mm_srli_epi16(*f_hi, 4));
589 const __m128i result = _mm_and_si128(*flat, f);
590 return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result);
591 }
592
593 void vpx_lpf_horizontal_16_dual_sse2(unsigned char *s, int pitch,
594 const unsigned char *blimit,
595 const unsigned char *limit,
596 const unsigned char *thresh) {
597 const __m128i zero = _mm_set1_epi16(0);
598 const __m128i one = _mm_set1_epi8(1);
599 const __m128i blimit_v = _mm_load_si128((const __m128i *)blimit);
600 const __m128i limit_v = _mm_load_si128((const __m128i *)limit);
601 const __m128i thresh_v = _mm_load_si128((const __m128i *)thresh);
602 __m128i mask, hev, flat, flat2;
603 __m128i p7, p6, p5;
604 __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
605 __m128i q5, q6, q7;
606
607 __m128i op2, op1, op0, oq0, oq1, oq2;
608
609 __m128i max_abs_p1p0q1q0;
610
611 p7 = _mm_loadu_si128((__m128i *)(s - 8 * pitch));
612 p6 = _mm_loadu_si128((__m128i *)(s - 7 * pitch));
613 p5 = _mm_loadu_si128((__m128i *)(s - 6 * pitch));
614 p4 = _mm_loadu_si128((__m128i *)(s - 5 * pitch));
615 p3 = _mm_loadu_si128((__m128i *)(s - 4 * pitch));
616 p2 = _mm_loadu_si128((__m128i *)(s - 3 * pitch));
617 p1 = _mm_loadu_si128((__m128i *)(s - 2 * pitch));
618 p0 = _mm_loadu_si128((__m128i *)(s - 1 * pitch));
619 q0 = _mm_loadu_si128((__m128i *)(s - 0 * pitch));
620 q1 = _mm_loadu_si128((__m128i *)(s + 1 * pitch));
621 q2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch));
622 q3 = _mm_loadu_si128((__m128i *)(s + 3 * pitch));
623 q4 = _mm_loadu_si128((__m128i *)(s + 4 * pitch));
624 q5 = _mm_loadu_si128((__m128i *)(s + 5 * pitch));
625 q6 = _mm_loadu_si128((__m128i *)(s + 6 * pitch));
626 q7 = _mm_loadu_si128((__m128i *)(s + 7 * pitch));
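// Unlike the single-edge version above, the dual filter covers 16 pixels
// per row, so each row stays in its own full register instead of the packed
// q|p layout.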
627
628 {
629 const __m128i abs_p1p0 = abs_diff(p1, p0);
630 const __m128i abs_q1q0 = abs_diff(q1, q0);
631 const __m128i fe = _mm_set1_epi8((int8_t)0xfe);
632 const __m128i ff = _mm_cmpeq_epi8(zero, zero);
633 __m128i abs_p0q0 = abs_diff(p0, q0);
634 __m128i abs_p1q1 = abs_diff(p1, q1);
635 __m128i work;
636 max_abs_p1p0q1q0 = _mm_max_epu8(abs_p1p0, abs_q1q0);
637
638 abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
639 abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
640 mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v);
641 mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
642 // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
643 mask = _mm_max_epu8(max_abs_p1p0q1q0, mask);
644 // mask |= (abs(p1 - p0) > limit) * -1;
645 // mask |= (abs(q1 - q0) > limit) * -1;
646 work = _mm_max_epu8(abs_diff(p2, p1), abs_diff(p3, p2));
647 mask = _mm_max_epu8(work, mask);
648 work = _mm_max_epu8(abs_diff(q2, q1), abs_diff(q3, q2));
649 mask = _mm_max_epu8(work, mask);
650 mask = _mm_subs_epu8(mask, limit_v);
651 mask = _mm_cmpeq_epi8(mask, zero);
652 }
653
654 {
655 __m128i work;
656 work = _mm_max_epu8(abs_diff(p2, p0), abs_diff(q2, q0));
657 flat = _mm_max_epu8(work, max_abs_p1p0q1q0);
658 work = _mm_max_epu8(abs_diff(p3, p0), abs_diff(q3, q0));
659 flat = _mm_max_epu8(work, flat);
660 work = _mm_max_epu8(abs_diff(p4, p0), abs_diff(q4, q0));
661 flat = _mm_subs_epu8(flat, one);
662 flat = _mm_cmpeq_epi8(flat, zero);
663 flat = _mm_and_si128(flat, mask);
664 flat2 = _mm_max_epu8(abs_diff(p5, p0), abs_diff(q5, q0));
665 flat2 = _mm_max_epu8(work, flat2);
666 work = _mm_max_epu8(abs_diff(p6, p0), abs_diff(q6, q0));
667 flat2 = _mm_max_epu8(work, flat2);
668 work = _mm_max_epu8(abs_diff(p7, p0), abs_diff(q7, q0));
669 flat2 = _mm_max_epu8(work, flat2);
670 flat2 = _mm_subs_epu8(flat2, one);
671 flat2 = _mm_cmpeq_epi8(flat2, zero);
672 flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
673 }
674
675 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
676 // filter4
677 {
678 const __m128i t4 = _mm_set1_epi8(4);
679 const __m128i t3 = _mm_set1_epi8(3);
680 const __m128i t80 = _mm_set1_epi8((int8_t)0x80);
681 const __m128i te0 = _mm_set1_epi8((int8_t)0xe0);
682 const __m128i t1f = _mm_set1_epi8(0x1f);
683 const __m128i t1 = _mm_set1_epi8(0x1);
684 const __m128i t7f = _mm_set1_epi8(0x7f);
685 const __m128i ff = _mm_cmpeq_epi8(t4, t4);
686
687 __m128i filt;
688 __m128i work_a;
689 __m128i filter1, filter2;
690
691 op1 = _mm_xor_si128(p1, t80);
692 op0 = _mm_xor_si128(p0, t80);
693 oq0 = _mm_xor_si128(q0, t80);
694 oq1 = _mm_xor_si128(q1, t80);
695
696 hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh_v);
697 hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
698 filt = _mm_and_si128(_mm_subs_epi8(op1, oq1), hev);
699
700 work_a = _mm_subs_epi8(oq0, op0);
701 filt = _mm_adds_epi8(filt, work_a);
702 filt = _mm_adds_epi8(filt, work_a);
703 filt = _mm_adds_epi8(filt, work_a);
704 // (vpx_filter + 3 * (qs0 - ps0)) & mask
705 filt = _mm_and_si128(filt, mask);
706 filter1 = _mm_adds_epi8(filt, t4);
707 filter2 = _mm_adds_epi8(filt, t3);
708
709 // Filter1 >> 3
710 work_a = _mm_cmpgt_epi8(zero, filter1);
711 filter1 = _mm_srli_epi16(filter1, 3);
712 work_a = _mm_and_si128(work_a, te0);
713 filter1 = _mm_and_si128(filter1, t1f);
714 filter1 = _mm_or_si128(filter1, work_a);
715 oq0 = _mm_xor_si128(_mm_subs_epi8(oq0, filter1), t80);
716
717 // Filter2 >> 3
718 work_a = _mm_cmpgt_epi8(zero, filter2);
719 filter2 = _mm_srli_epi16(filter2, 3);
720 work_a = _mm_and_si128(work_a, te0);
721 filter2 = _mm_and_si128(filter2, t1f);
722 filter2 = _mm_or_si128(filter2, work_a);
723 op0 = _mm_xor_si128(_mm_adds_epi8(op0, filter2), t80);
724
725 // filt >> 1
726 filt = _mm_adds_epi8(filter1, t1);
727 work_a = _mm_cmpgt_epi8(zero, filt);
728 filt = _mm_srli_epi16(filt, 1);
729 work_a = _mm_and_si128(work_a, t80);
730 filt = _mm_and_si128(filt, t7f);
731 filt = _mm_or_si128(filt, work_a);
732 filt = _mm_andnot_si128(hev, filt);
733 op1 = _mm_xor_si128(_mm_adds_epi8(op1, filt), t80);
734 oq1 = _mm_xor_si128(_mm_subs_epi8(oq1, filt), t80);
735 // loopfilter done
736
737 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
738 // filter8
739 {
740 const __m128i four = _mm_set1_epi16(4);
741 const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero);
742 const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero);
743 const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero);
744 const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero);
745 const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero);
746 const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero);
747 const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero);
748 const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero);
749
750 const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero);
751 const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero);
752 const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero);
753 const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero);
754 const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero);
755 const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero);
756 const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero);
757 const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero);
758 __m128i f8_lo, f8_hi;
759
760 f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, four),
761 _mm_add_epi16(p3_lo, p2_lo));
762 f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f8_lo),
763 _mm_add_epi16(p2_lo, p1_lo));
764 f8_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f8_lo);
765
766 f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, four),
767 _mm_add_epi16(p3_hi, p2_hi));
768 f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f8_hi),
769 _mm_add_epi16(p2_hi, p1_hi));
770 f8_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f8_hi);
771
772 op2 = filter8_mask(&flat, &p2, &f8_lo, &f8_hi);
773
774 f8_lo = filter_add2_sub2(&f8_lo, &q1_lo, &p1_lo, &p2_lo, &p3_lo);
775 f8_hi = filter_add2_sub2(&f8_hi, &q1_hi, &p1_hi, &p2_hi, &p3_hi);
776 op1 = filter8_mask(&flat, &op1, &f8_lo, &f8_hi);
777
778 f8_lo = filter_add2_sub2(&f8_lo, &q2_lo, &p0_lo, &p1_lo, &p3_lo);
779 f8_hi = filter_add2_sub2(&f8_hi, &q2_hi, &p0_hi, &p1_hi, &p3_hi);
780 op0 = filter8_mask(&flat, &op0, &f8_lo, &f8_hi);
781
782 f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q0_lo, &p0_lo, &p3_lo);
783 f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q0_hi, &p0_hi, &p3_hi);
784 oq0 = filter8_mask(&flat, &oq0, &f8_lo, &f8_hi);
785
786 f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q1_lo, &q0_lo, &p2_lo);
787 f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q1_hi, &q0_hi, &p2_hi);
788 oq1 = filter8_mask(&flat, &oq1, &f8_lo, &f8_hi);
789
790 f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q2_lo, &q1_lo, &p1_lo);
791 f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q2_hi, &q1_hi, &p1_hi);
792 oq2 = filter8_mask(&flat, &q2, &f8_lo, &f8_hi);
793 }
794
795 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
796 // wide flat calculations
797 {
798 const __m128i eight = _mm_set1_epi16(8);
799 const __m128i p7_lo = _mm_unpacklo_epi8(p7, zero);
800 const __m128i p6_lo = _mm_unpacklo_epi8(p6, zero);
801 const __m128i p5_lo = _mm_unpacklo_epi8(p5, zero);
802 const __m128i p4_lo = _mm_unpacklo_epi8(p4, zero);
803 const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero);
804 const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero);
805 const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero);
806 const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero);
807 const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero);
808 const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero);
809 const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero);
810 const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero);
811 const __m128i q4_lo = _mm_unpacklo_epi8(q4, zero);
812 const __m128i q5_lo = _mm_unpacklo_epi8(q5, zero);
813 const __m128i q6_lo = _mm_unpacklo_epi8(q6, zero);
814 const __m128i q7_lo = _mm_unpacklo_epi8(q7, zero);
815
816 const __m128i p7_hi = _mm_unpackhi_epi8(p7, zero);
817 const __m128i p6_hi = _mm_unpackhi_epi8(p6, zero);
818 const __m128i p5_hi = _mm_unpackhi_epi8(p5, zero);
819 const __m128i p4_hi = _mm_unpackhi_epi8(p4, zero);
820 const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero);
821 const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero);
822 const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero);
823 const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero);
824 const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero);
825 const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero);
826 const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero);
827 const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero);
828 const __m128i q4_hi = _mm_unpackhi_epi8(q4, zero);
829 const __m128i q5_hi = _mm_unpackhi_epi8(q5, zero);
830 const __m128i q6_hi = _mm_unpackhi_epi8(q6, zero);
831 const __m128i q7_hi = _mm_unpackhi_epi8(q7, zero);
832
833 __m128i f_lo;
834 __m128i f_hi;
835
836 f_lo = _mm_sub_epi16(_mm_slli_epi16(p7_lo, 3), p7_lo); // p7 * 7
837 f_lo =
838 _mm_add_epi16(_mm_slli_epi16(p6_lo, 1), _mm_add_epi16(p4_lo, f_lo));
839 f_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f_lo),
840 _mm_add_epi16(p2_lo, p1_lo));
841 f_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f_lo);
842 f_lo = _mm_add_epi16(_mm_add_epi16(p5_lo, eight), f_lo);
843
844 f_hi = _mm_sub_epi16(_mm_slli_epi16(p7_hi, 3), p7_hi); // p7 * 7
845 f_hi =
846 _mm_add_epi16(_mm_slli_epi16(p6_hi, 1), _mm_add_epi16(p4_hi, f_hi));
847 f_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f_hi),
848 _mm_add_epi16(p2_hi, p1_hi));
849 f_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f_hi);
850 f_hi = _mm_add_epi16(_mm_add_epi16(p5_hi, eight), f_hi);
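// f_lo/f_hi now hold p7 * 7 + p6 * 2 + p5 + p4 + p3 + p2 + p1 + p0 + q0 + 8,
// the rounded accumulator for the first wide output (p6); each later output
// below reuses it via filter_add2_sub2().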
851
852 p6 = filter16_mask(&flat2, &p6, &f_lo, &f_hi);
853 _mm_storeu_si128((__m128i *)(s - 7 * pitch), p6);
854
855 f_lo = filter_add2_sub2(&f_lo, &q1_lo, &p5_lo, &p6_lo, &p7_lo);
856 f_hi = filter_add2_sub2(&f_hi, &q1_hi, &p5_hi, &p6_hi, &p7_hi);
857 p5 = filter16_mask(&flat2, &p5, &f_lo, &f_hi);
858 _mm_storeu_si128((__m128i *)(s - 6 * pitch), p5);
859
860 f_lo = filter_add2_sub2(&f_lo, &q2_lo, &p4_lo, &p5_lo, &p7_lo);
861 f_hi = filter_add2_sub2(&f_hi, &q2_hi, &p4_hi, &p5_hi, &p7_hi);
862 p4 = filter16_mask(&flat2, &p4, &f_lo, &f_hi);
863 _mm_storeu_si128((__m128i *)(s - 5 * pitch), p4);
864
865 f_lo = filter_add2_sub2(&f_lo, &q3_lo, &p3_lo, &p4_lo, &p7_lo);
866 f_hi = filter_add2_sub2(&f_hi, &q3_hi, &p3_hi, &p4_hi, &p7_hi);
867 p3 = filter16_mask(&flat2, &p3, &f_lo, &f_hi);
868 _mm_storeu_si128((__m128i *)(s - 4 * pitch), p3);
869
870 f_lo = filter_add2_sub2(&f_lo, &q4_lo, &p2_lo, &p3_lo, &p7_lo);
871 f_hi = filter_add2_sub2(&f_hi, &q4_hi, &p2_hi, &p3_hi, &p7_hi);
872 op2 = filter16_mask(&flat2, &op2, &f_lo, &f_hi);
873 _mm_storeu_si128((__m128i *)(s - 3 * pitch), op2);
874
875 f_lo = filter_add2_sub2(&f_lo, &q5_lo, &p1_lo, &p2_lo, &p7_lo);
876 f_hi = filter_add2_sub2(&f_hi, &q5_hi, &p1_hi, &p2_hi, &p7_hi);
877 op1 = filter16_mask(&flat2, &op1, &f_lo, &f_hi);
878 _mm_storeu_si128((__m128i *)(s - 2 * pitch), op1);
879
880 f_lo = filter_add2_sub2(&f_lo, &q6_lo, &p0_lo, &p1_lo, &p7_lo);
881 f_hi = filter_add2_sub2(&f_hi, &q6_hi, &p0_hi, &p1_hi, &p7_hi);
882 op0 = filter16_mask(&flat2, &op0, &f_lo, &f_hi);
883 _mm_storeu_si128((__m128i *)(s - 1 * pitch), op0);
884
885 f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q0_lo, &p0_lo, &p7_lo);
886 f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q0_hi, &p0_hi, &p7_hi);
887 oq0 = filter16_mask(&flat2, &oq0, &f_lo, &f_hi);
888 _mm_storeu_si128((__m128i *)(s - 0 * pitch), oq0);
889
890 f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q1_lo, &p6_lo, &q0_lo);
891 f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q1_hi, &p6_hi, &q0_hi);
892 oq1 = filter16_mask(&flat2, &oq1, &f_lo, &f_hi);
893 _mm_storeu_si128((__m128i *)(s + 1 * pitch), oq1);
894
895 f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q2_lo, &p5_lo, &q1_lo);
896 f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q2_hi, &p5_hi, &q1_hi);
897 oq2 = filter16_mask(&flat2, &oq2, &f_lo, &f_hi);
898 _mm_storeu_si128((__m128i *)(s + 2 * pitch), oq2);
899
900 f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q3_lo, &p4_lo, &q2_lo);
901 f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q3_hi, &p4_hi, &q2_hi);
902 q3 = filter16_mask(&flat2, &q3, &f_lo, &f_hi);
903 _mm_storeu_si128((__m128i *)(s + 3 * pitch), q3);
904
905 f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q4_lo, &p3_lo, &q3_lo);
906 f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q4_hi, &p3_hi, &q3_hi);
907 q4 = filter16_mask(&flat2, &q4, &f_lo, &f_hi);
908 _mm_storeu_si128((__m128i *)(s + 4 * pitch), q4);
909
910 f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q5_lo, &p2_lo, &q4_lo);
911 f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q5_hi, &p2_hi, &q4_hi);
912 q5 = filter16_mask(&flat2, &q5, &f_lo, &f_hi);
913 _mm_storeu_si128((__m128i *)(s + 5 * pitch), q5);
914
915 f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q6_lo, &p1_lo, &q5_lo);
916 f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q6_hi, &p1_hi, &q5_hi);
917 q6 = filter16_mask(&flat2, &q6, &f_lo, &f_hi);
918 _mm_storeu_si128((__m128i *)(s + 6 * pitch), q6);
919 }
920 // wide flat
921 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
922 }
923 }
924
925 void vpx_lpf_horizontal_8_sse2(unsigned char *s, int pitch,
926 const unsigned char *blimit,
927 const unsigned char *limit,
928 const unsigned char *thresh) {
929 DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);
930 DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);
931 DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);
932 DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]);
933 DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);
934 DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
935 const __m128i zero = _mm_set1_epi16(0);
936 const __m128i blimit_v = _mm_load_si128((const __m128i *)blimit);
937 const __m128i limit_v = _mm_load_si128((const __m128i *)limit);
938 const __m128i thresh_v = _mm_load_si128((const __m128i *)thresh);
939 __m128i mask, hev, flat;
940 __m128i p3, p2, p1, p0, q0, q1, q2, q3;
941 __m128i q3p3, q2p2, q1p1, q0p0, p1q1, p0q0;
942
943 q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * pitch)),
944 _mm_loadl_epi64((__m128i *)(s + 3 * pitch)));
945 q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * pitch)),
946 _mm_loadl_epi64((__m128i *)(s + 2 * pitch)));
947 q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * pitch)),
948 _mm_loadl_epi64((__m128i *)(s + 1 * pitch)));
949 q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * pitch)),
950 _mm_loadl_epi64((__m128i *)(s - 0 * pitch)));
951 p1q1 = _mm_shuffle_epi32(q1p1, 78);
952 p0q0 = _mm_shuffle_epi32(q0p0, 78);
953
954 {
955 // filter_mask and hev_mask
956 const __m128i one = _mm_set1_epi8(1);
957 const __m128i fe = _mm_set1_epi8((int8_t)0xfe);
958 const __m128i ff = _mm_cmpeq_epi8(fe, fe);
959 __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
960 abs_p1p0 = abs_diff(q1p1, q0p0);
961 abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
962
963 abs_p0q0 = abs_diff(q0p0, p0q0);
964 abs_p1q1 = abs_diff(q1p1, p1q1);
965 flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
966 hev = _mm_subs_epu8(flat, thresh_v);
967 hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
968
969 abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
970 abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
971 mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v);
972 mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
973 // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
974 mask = _mm_max_epu8(abs_p1p0, mask);
975 // mask |= (abs(p1 - p0) > limit) * -1;
976 // mask |= (abs(q1 - q0) > limit) * -1;
977
978 work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2));
979 mask = _mm_max_epu8(work, mask);
980 mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
981 mask = _mm_subs_epu8(mask, limit_v);
982 mask = _mm_cmpeq_epi8(mask, zero);
983
984 // flat_mask4
985
986 flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0));
987 flat = _mm_max_epu8(abs_p1p0, flat);
988 flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
989 flat = _mm_subs_epu8(flat, one);
990 flat = _mm_cmpeq_epi8(flat, zero);
991 flat = _mm_and_si128(flat, mask);
992 }
993
994 {
995 const __m128i four = _mm_set1_epi16(4);
996 unsigned char *src = s;
997 {
998 __m128i workp_a, workp_b, workp_shft;
999 p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * pitch)),
1000 zero);
1001 p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * pitch)),
1002 zero);
1003 p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * pitch)),
1004 zero);
1005 p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * pitch)),
1006 zero);
1007 q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * pitch)),
1008 zero);
1009 q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * pitch)),
1010 zero);
1011 q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * pitch)),
1012 zero);
1013 q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * pitch)),
1014 zero);
1015
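// The running sums below evaluate the 8-tap flat filter in 16-bit
// precision: workp_a + workp_b starts as 3 * p3 + 2 * p2 + p1 + p0 + q0 + 4
// (the p2 output); each later output updates the sum with one subtract and
// one add, and the >> 3 results are packed back to bytes into the flat_o*
// scratch arrays.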
1016 workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
1017 workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
1018 workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
1019 workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
1020 _mm_storel_epi64((__m128i *)&flat_op2[0],
1021 _mm_packus_epi16(workp_shft, workp_shft));
1022
1023 workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
1024 workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
1025 _mm_storel_epi64((__m128i *)&flat_op1[0],
1026 _mm_packus_epi16(workp_shft, workp_shft));
1027
1028 workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
1029 workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
1030 workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
1031 _mm_storel_epi64((__m128i *)&flat_op0[0],
1032 _mm_packus_epi16(workp_shft, workp_shft));
1033
1034 workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
1035 workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
1036 workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
1037 _mm_storel_epi64((__m128i *)&flat_oq0[0],
1038 _mm_packus_epi16(workp_shft, workp_shft));
1039
1040 workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
1041 workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
1042 workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
1043 _mm_storel_epi64((__m128i *)&flat_oq1[0],
1044 _mm_packus_epi16(workp_shft, workp_shft));
1045
1046 workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
1047 workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
1048 workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
1049 _mm_storel_epi64((__m128i *)&flat_oq2[0],
1050 _mm_packus_epi16(workp_shft, workp_shft));
1051 }
1052 }
1053 // lp filter
1054 {
1055 const __m128i t4 = _mm_set1_epi8(4);
1056 const __m128i t3 = _mm_set1_epi8(3);
1057 const __m128i t80 = _mm_set1_epi8((int8_t)0x80);
1058 const __m128i t1 = _mm_set1_epi8(0x1);
1059 const __m128i ps1 =
1060 _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 2 * pitch)), t80);
1061 const __m128i ps0 =
1062 _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 1 * pitch)), t80);
1063 const __m128i qs0 =
1064 _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 0 * pitch)), t80);
1065 const __m128i qs1 =
1066 _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 1 * pitch)), t80);
1067 __m128i filt;
1068 __m128i work_a;
1069 __m128i filter1, filter2;
1070
1071 filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
1072 work_a = _mm_subs_epi8(qs0, ps0);
1073 filt = _mm_adds_epi8(filt, work_a);
1074 filt = _mm_adds_epi8(filt, work_a);
1075 filt = _mm_adds_epi8(filt, work_a);
1076 // (vpx_filter + 3 * (qs0 - ps0)) & mask
1077 filt = _mm_and_si128(filt, mask);
1078
1079 filter1 = _mm_adds_epi8(filt, t4);
1080 filter2 = _mm_adds_epi8(filt, t3);
1081
1082 // Filter1 >> 3
1083 filter1 = _mm_unpacklo_epi8(zero, filter1);
1084 filter1 = _mm_srai_epi16(filter1, 11);
1085 filter1 = _mm_packs_epi16(filter1, filter1);
1086
1087 // Filter2 >> 3
1088 filter2 = _mm_unpacklo_epi8(zero, filter2);
1089 filter2 = _mm_srai_epi16(filter2, 11);
1090 filter2 = _mm_packs_epi16(filter2, zero);
1091
1092 // filt >> 1
1093 filt = _mm_adds_epi8(filter1, t1);
1094 filt = _mm_unpacklo_epi8(zero, filt);
1095 filt = _mm_srai_epi16(filt, 9);
1096 filt = _mm_packs_epi16(filt, zero);
1097
1098 filt = _mm_andnot_si128(hev, filt);
1099
1100 work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
1101 q0 = _mm_loadl_epi64((__m128i *)flat_oq0);
1102 work_a = _mm_andnot_si128(flat, work_a);
1103 q0 = _mm_and_si128(flat, q0);
1104 q0 = _mm_or_si128(work_a, q0);
1105
1106 work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
1107 q1 = _mm_loadl_epi64((__m128i *)flat_oq1);
1108 work_a = _mm_andnot_si128(flat, work_a);
1109 q1 = _mm_and_si128(flat, q1);
1110 q1 = _mm_or_si128(work_a, q1);
1111
1112 work_a = _mm_loadu_si128((__m128i *)(s + 2 * pitch));
1113 q2 = _mm_loadl_epi64((__m128i *)flat_oq2);
1114 work_a = _mm_andnot_si128(flat, work_a);
1115 q2 = _mm_and_si128(flat, q2);
1116 q2 = _mm_or_si128(work_a, q2);
1117
1118 work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
1119 p0 = _mm_loadl_epi64((__m128i *)flat_op0);
1120 work_a = _mm_andnot_si128(flat, work_a);
1121 p0 = _mm_and_si128(flat, p0);
1122 p0 = _mm_or_si128(work_a, p0);
1123
1124 work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
1125 p1 = _mm_loadl_epi64((__m128i *)flat_op1);
1126 work_a = _mm_andnot_si128(flat, work_a);
1127 p1 = _mm_and_si128(flat, p1);
1128 p1 = _mm_or_si128(work_a, p1);
1129
1130 work_a = _mm_loadu_si128((__m128i *)(s - 3 * pitch));
1131 p2 = _mm_loadl_epi64((__m128i *)flat_op2);
1132 work_a = _mm_andnot_si128(flat, work_a);
1133 p2 = _mm_and_si128(flat, p2);
1134 p2 = _mm_or_si128(work_a, p2);
1135
1136 _mm_storel_epi64((__m128i *)(s - 3 * pitch), p2);
1137 _mm_storel_epi64((__m128i *)(s - 2 * pitch), p1);
1138 _mm_storel_epi64((__m128i *)(s - 1 * pitch), p0);
1139 _mm_storel_epi64((__m128i *)(s + 0 * pitch), q0);
1140 _mm_storel_epi64((__m128i *)(s + 1 * pitch), q1);
1141 _mm_storel_epi64((__m128i *)(s + 2 * pitch), q2);
1142 }
1143 }
1144
1145 void vpx_lpf_horizontal_8_dual_sse2(
1146 uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
1147 const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
1148 const uint8_t *thresh1) {
1149 DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);
1150 DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);
1151 DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);
1152 DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]);
1153 DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);
1154 DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
1155 const __m128i zero = _mm_set1_epi16(0);
1156 const __m128i blimit =
1157 _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)blimit0),
1158 _mm_load_si128((const __m128i *)blimit1));
1159 const __m128i limit =
1160 _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)limit0),
1161 _mm_load_si128((const __m128i *)limit1));
1162 const __m128i thresh =
1163 _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)thresh0),
1164 _mm_load_si128((const __m128i *)thresh1));
1165
1166 __m128i mask, hev, flat;
1167 __m128i p3, p2, p1, p0, q0, q1, q2, q3;
1168
1169 p3 = _mm_loadu_si128((__m128i *)(s - 4 * pitch));
1170 p2 = _mm_loadu_si128((__m128i *)(s - 3 * pitch));
1171 p1 = _mm_loadu_si128((__m128i *)(s - 2 * pitch));
1172 p0 = _mm_loadu_si128((__m128i *)(s - 1 * pitch));
1173 q0 = _mm_loadu_si128((__m128i *)(s - 0 * pitch));
1174 q1 = _mm_loadu_si128((__m128i *)(s + 1 * pitch));
1175 q2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch));
1176 q3 = _mm_loadu_si128((__m128i *)(s + 3 * pitch));
1177 {
1178 const __m128i abs_p1p0 =
1179 _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1));
1180 const __m128i abs_q1q0 =
1181 _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1));
1182 const __m128i one = _mm_set1_epi8(1);
1183 const __m128i fe = _mm_set1_epi8((int8_t)0xfe);
1184 const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
1185 __m128i abs_p0q0 =
1186 _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0));
1187 __m128i abs_p1q1 =
1188 _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1));
1189 __m128i work;
1190
1191 // filter_mask and hev_mask
1192 flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
1193 hev = _mm_subs_epu8(flat, thresh);
1194 hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
1195
1196 abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
1197 abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
1198 mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
1199 mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
1200 // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
1201 mask = _mm_max_epu8(flat, mask);
1202 // mask |= (abs(p1 - p0) > limit) * -1;
1203 // mask |= (abs(q1 - q0) > limit) * -1;
1204 work = _mm_max_epu8(
1205 _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)),
1206 _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3)));
1207 mask = _mm_max_epu8(work, mask);
1208 work = _mm_max_epu8(
1209 _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)),
1210 _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3)));
1211 mask = _mm_max_epu8(work, mask);
1212 mask = _mm_subs_epu8(mask, limit);
1213 mask = _mm_cmpeq_epi8(mask, zero);
1214
1215 // flat_mask4
1216 work = _mm_max_epu8(
1217 _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)),
1218 _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2)));
1219 flat = _mm_max_epu8(work, flat);
1220 work = _mm_max_epu8(
1221 _mm_or_si128(_mm_subs_epu8(p3, p0), _mm_subs_epu8(p0, p3)),
1222 _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3)));
1223 flat = _mm_max_epu8(work, flat);
1224 flat = _mm_subs_epu8(flat, one);
1225 flat = _mm_cmpeq_epi8(flat, zero);
1226 flat = _mm_and_si128(flat, mask);
1227 }
1228 {
1229 const __m128i four = _mm_set1_epi16(4);
1230 unsigned char *src = s;
1231 int i = 0;
1232
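// Two passes, one per 8-pixel half of the 16-wide edge; pass i writes its
// 8-tap flat filter outputs to bytes [i * 8, i * 8 + 8) of the flat_o*
// arrays.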
1233 do {
1234 __m128i workp_a, workp_b, workp_shft;
1235 p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * pitch)),
1236 zero);
1237 p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * pitch)),
1238 zero);
1239 p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * pitch)),
1240 zero);
1241 p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * pitch)),
1242 zero);
1243 q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * pitch)),
1244 zero);
1245 q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * pitch)),
1246 zero);
1247 q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * pitch)),
1248 zero);
1249 q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * pitch)),
1250 zero);
1251
1252 workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
1253 workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
1254 workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
1255 workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
1256 _mm_storel_epi64((__m128i *)&flat_op2[i * 8],
1257 _mm_packus_epi16(workp_shft, workp_shft));
1258
1259 workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
1260 workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
1261 _mm_storel_epi64((__m128i *)&flat_op1[i * 8],
1262 _mm_packus_epi16(workp_shft, workp_shft));
1263
1264 workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
1265 workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
1266 workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
1267 _mm_storel_epi64((__m128i *)&flat_op0[i * 8],
1268 _mm_packus_epi16(workp_shft, workp_shft));
1269
1270 workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
1271 workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
1272 workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
1273 _mm_storel_epi64((__m128i *)&flat_oq0[i * 8],
1274 _mm_packus_epi16(workp_shft, workp_shft));
1275
1276 workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
1277 workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
1278 workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
1279 _mm_storel_epi64((__m128i *)&flat_oq1[i * 8],
1280 _mm_packus_epi16(workp_shft, workp_shft));
1281
1282 workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
1283 workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
1284 workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
1285 _mm_storel_epi64((__m128i *)&flat_oq2[i * 8],
1286 _mm_packus_epi16(workp_shft, workp_shft));
1287
1288 src += 8;
1289 } while (++i < 2);
1290 }
1291 // lp filter
1292 {
1293 const __m128i t4 = _mm_set1_epi8(4);
1294 const __m128i t3 = _mm_set1_epi8(3);
1295 const __m128i t80 = _mm_set1_epi8((int8_t)0x80);
1296 const __m128i te0 = _mm_set1_epi8((int8_t)0xe0);
1297 const __m128i t1f = _mm_set1_epi8(0x1f);
1298 const __m128i t1 = _mm_set1_epi8(0x1);
1299 const __m128i t7f = _mm_set1_epi8(0x7f);
1300
1301 const __m128i ps1 =
1302 _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * pitch)), t80);
1303 const __m128i ps0 =
1304 _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * pitch)), t80);
1305 const __m128i qs0 =
1306 _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * pitch)), t80);
1307 const __m128i qs1 =
1308 _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * pitch)), t80);
1309 __m128i filt;
1310 __m128i work_a;
1311 __m128i filter1, filter2;
1312
1313 filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
1314 work_a = _mm_subs_epi8(qs0, ps0);
1315 filt = _mm_adds_epi8(filt, work_a);
1316 filt = _mm_adds_epi8(filt, work_a);
1317 filt = _mm_adds_epi8(filt, work_a);
1318 // (vpx_filter + 3 * (qs0 - ps0)) & mask
1319 filt = _mm_and_si128(filt, mask);
1320
1321 filter1 = _mm_adds_epi8(filt, t4);
1322 filter2 = _mm_adds_epi8(filt, t3);
1323
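    // SSE2 has no per-byte arithmetic shift, so each signed ">> 3" below is
    // emulated: shift the 16-bit lanes logically, keep the low 5 bits of each
    // byte (t1f) and re-insert the sign bits for negative lanes (te0). The
    // same trick with t80/t7f implements the signed ">> 1" used for the
    // outer-tap adjustment, which is applied only where hev is unset.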
1324 // Filter1 >> 3
1325 work_a = _mm_cmpgt_epi8(zero, filter1);
1326 filter1 = _mm_srli_epi16(filter1, 3);
1327 work_a = _mm_and_si128(work_a, te0);
1328 filter1 = _mm_and_si128(filter1, t1f);
1329 filter1 = _mm_or_si128(filter1, work_a);
1330
1331 // Filter2 >> 3
1332 work_a = _mm_cmpgt_epi8(zero, filter2);
1333 filter2 = _mm_srli_epi16(filter2, 3);
1334 work_a = _mm_and_si128(work_a, te0);
1335 filter2 = _mm_and_si128(filter2, t1f);
1336 filter2 = _mm_or_si128(filter2, work_a);
1337
1338 // filt >> 1
1339 filt = _mm_adds_epi8(filter1, t1);
1340 work_a = _mm_cmpgt_epi8(zero, filt);
1341 filt = _mm_srli_epi16(filt, 1);
1342 work_a = _mm_and_si128(work_a, t80);
1343 filt = _mm_and_si128(filt, t7f);
1344 filt = _mm_or_si128(filt, work_a);
1345
1346 filt = _mm_andnot_si128(hev, filt);
1347
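    // Blend: per byte, take the wide-filter result from flat_o* where flat is
    // set, otherwise keep the 4-tap result (or the unfiltered p2/q2 row),
    // using andnot/and/or as a select.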
1348 work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
1349 q0 = _mm_load_si128((__m128i *)flat_oq0);
1350 work_a = _mm_andnot_si128(flat, work_a);
1351 q0 = _mm_and_si128(flat, q0);
1352 q0 = _mm_or_si128(work_a, q0);
1353
1354 work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
1355 q1 = _mm_load_si128((__m128i *)flat_oq1);
1356 work_a = _mm_andnot_si128(flat, work_a);
1357 q1 = _mm_and_si128(flat, q1);
1358 q1 = _mm_or_si128(work_a, q1);
1359
1360 work_a = _mm_loadu_si128((__m128i *)(s + 2 * pitch));
1361 q2 = _mm_load_si128((__m128i *)flat_oq2);
1362 work_a = _mm_andnot_si128(flat, work_a);
1363 q2 = _mm_and_si128(flat, q2);
1364 q2 = _mm_or_si128(work_a, q2);
1365
1366 work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
1367 p0 = _mm_load_si128((__m128i *)flat_op0);
1368 work_a = _mm_andnot_si128(flat, work_a);
1369 p0 = _mm_and_si128(flat, p0);
1370 p0 = _mm_or_si128(work_a, p0);
1371
1372 work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
1373 p1 = _mm_load_si128((__m128i *)flat_op1);
1374 work_a = _mm_andnot_si128(flat, work_a);
1375 p1 = _mm_and_si128(flat, p1);
1376 p1 = _mm_or_si128(work_a, p1);
1377
1378 work_a = _mm_loadu_si128((__m128i *)(s - 3 * pitch));
1379 p2 = _mm_load_si128((__m128i *)flat_op2);
1380 work_a = _mm_andnot_si128(flat, work_a);
1381 p2 = _mm_and_si128(flat, p2);
1382 p2 = _mm_or_si128(work_a, p2);
1383
1384 _mm_storeu_si128((__m128i *)(s - 3 * pitch), p2);
1385 _mm_storeu_si128((__m128i *)(s - 2 * pitch), p1);
1386 _mm_storeu_si128((__m128i *)(s - 1 * pitch), p0);
1387 _mm_storeu_si128((__m128i *)(s + 0 * pitch), q0);
1388 _mm_storeu_si128((__m128i *)(s + 1 * pitch), q1);
1389 _mm_storeu_si128((__m128i *)(s + 2 * pitch), q2);
1390 }
1391 }
1392
1393 void vpx_lpf_horizontal_4_dual_sse2(unsigned char *s, int pitch,
1394 const unsigned char *blimit0,
1395 const unsigned char *limit0,
1396 const unsigned char *thresh0,
1397 const unsigned char *blimit1,
1398 const unsigned char *limit1,
1399 const unsigned char *thresh1) {
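  // Filters two adjacent 8-pixel horizontal edges in one pass: each 16-byte
  // register holds both segments, and the per-segment blimit/limit/thresh
  // values are packed side by side with _mm_unpacklo_epi64 below.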
1400 const __m128i blimit =
1401 _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)blimit0),
1402 _mm_load_si128((const __m128i *)blimit1));
1403 const __m128i limit =
1404 _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)limit0),
1405 _mm_load_si128((const __m128i *)limit1));
1406 const __m128i thresh =
1407 _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)thresh0),
1408 _mm_load_si128((const __m128i *)thresh1));
1409 const __m128i zero = _mm_set1_epi16(0);
1410 __m128i p3, p2, p1, p0, q0, q1, q2, q3;
1411 __m128i mask, hev, flat;
1412
1413 p3 = _mm_loadu_si128((__m128i *)(s - 4 * pitch));
1414 p2 = _mm_loadu_si128((__m128i *)(s - 3 * pitch));
1415 p1 = _mm_loadu_si128((__m128i *)(s - 2 * pitch));
1416 p0 = _mm_loadu_si128((__m128i *)(s - 1 * pitch));
1417 q0 = _mm_loadu_si128((__m128i *)(s - 0 * pitch));
1418 q1 = _mm_loadu_si128((__m128i *)(s + 1 * pitch));
1419 q2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch));
1420 q3 = _mm_loadu_si128((__m128i *)(s + 3 * pitch));
1421
1422 // filter_mask and hev_mask
1423 {
1424 const __m128i abs_p1p0 =
1425 _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1));
1426 const __m128i abs_q1q0 =
1427 _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1));
1428 const __m128i fe = _mm_set1_epi8((int8_t)0xfe);
1429 const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
1430 __m128i abs_p0q0 =
1431 _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0));
1432 __m128i abs_p1q1 =
1433 _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1));
1434 __m128i work;
1435
1436 flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
1437 hev = _mm_subs_epu8(flat, thresh);
1438 hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
1439
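    // The remaining steps build mask: 0xff for columns where
    // abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= blimit and every neighboring
    // difference (p3..p0, q0..q3) is <= limit; 0 disables filtering. hev
    // above is 0xff where max(abs(p1 - p0), abs(q1 - q0)) > thresh.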
1440 abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
1441 abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
1442 mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
1443 mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
1444 // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
1445 mask = _mm_max_epu8(flat, mask);
1446 // mask |= (abs(p1 - p0) > limit) * -1;
1447 // mask |= (abs(q1 - q0) > limit) * -1;
1448 work = _mm_max_epu8(
1449 _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)),
1450 _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3)));
1451 mask = _mm_max_epu8(work, mask);
1452 work = _mm_max_epu8(
1453 _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)),
1454 _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3)));
1455 mask = _mm_max_epu8(work, mask);
1456 mask = _mm_subs_epu8(mask, limit);
1457 mask = _mm_cmpeq_epi8(mask, zero);
1458 }
1459
1460 // filter4
1461 {
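    // Unlike the 8-wide filter above, this 4-tap path has no flat/wide-filter
    // branch: the clamped adjustments below are written straight back to
    // p1..q1.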
1462 const __m128i t4 = _mm_set1_epi8(4);
1463 const __m128i t3 = _mm_set1_epi8(3);
1464 const __m128i t80 = _mm_set1_epi8((int8_t)0x80);
1465 const __m128i te0 = _mm_set1_epi8((int8_t)0xe0);
1466 const __m128i t1f = _mm_set1_epi8(0x1f);
1467 const __m128i t1 = _mm_set1_epi8(0x1);
1468 const __m128i t7f = _mm_set1_epi8(0x7f);
1469
1470 const __m128i ps1 =
1471 _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * pitch)), t80);
1472 const __m128i ps0 =
1473 _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * pitch)), t80);
1474 const __m128i qs0 =
1475 _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * pitch)), t80);
1476 const __m128i qs1 =
1477 _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * pitch)), t80);
1478 __m128i filt;
1479 __m128i work_a;
1480 __m128i filter1, filter2;
1481
1482 filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
1483 work_a = _mm_subs_epi8(qs0, ps0);
1484 filt = _mm_adds_epi8(filt, work_a);
1485 filt = _mm_adds_epi8(filt, work_a);
1486 filt = _mm_adds_epi8(filt, work_a);
1487 // (vpx_filter + 3 * (qs0 - ps0)) & mask
1488 filt = _mm_and_si128(filt, mask);
1489
1490 filter1 = _mm_adds_epi8(filt, t4);
1491 filter2 = _mm_adds_epi8(filt, t3);
1492
1493 // Filter1 >> 3
1494 work_a = _mm_cmpgt_epi8(zero, filter1);
1495 filter1 = _mm_srli_epi16(filter1, 3);
1496 work_a = _mm_and_si128(work_a, te0);
1497 filter1 = _mm_and_si128(filter1, t1f);
1498 filter1 = _mm_or_si128(filter1, work_a);
1499
1500 // Filter2 >> 3
1501 work_a = _mm_cmpgt_epi8(zero, filter2);
1502 filter2 = _mm_srli_epi16(filter2, 3);
1503 work_a = _mm_and_si128(work_a, te0);
1504 filter2 = _mm_and_si128(filter2, t1f);
1505 filter2 = _mm_or_si128(filter2, work_a);
1506
1507 // filt >> 1
1508 filt = _mm_adds_epi8(filter1, t1);
1509 work_a = _mm_cmpgt_epi8(zero, filt);
1510 filt = _mm_srli_epi16(filt, 1);
1511 work_a = _mm_and_si128(work_a, t80);
1512 filt = _mm_and_si128(filt, t7f);
1513 filt = _mm_or_si128(filt, work_a);
1514
1515 filt = _mm_andnot_si128(hev, filt);
1516
1517 q0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
1518 q1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
1519 p0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
1520 p1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
1521
1522 _mm_storeu_si128((__m128i *)(s - 2 * pitch), p1);
1523 _mm_storeu_si128((__m128i *)(s - 1 * pitch), p0);
1524 _mm_storeu_si128((__m128i *)(s + 0 * pitch), q0);
1525 _mm_storeu_si128((__m128i *)(s + 1 * pitch), q1);
1526 }
1527 }
1528
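// Transposes a 16-row x 8-column strip (passed as two 8-row halves, in0 and
// in1) into an 8-row x 16-column block at out. Rows are built up with
// byte/word/dword interleaves and stored 16 bytes at a time.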
1529 static INLINE void transpose8x16(unsigned char *in0, unsigned char *in1,
1530 int in_p, unsigned char *out, int out_p) {
1531 __m128i x0, x1, x2, x3, x4, x5, x6, x7;
1532 __m128i x8, x9, x10, x11, x12, x13, x14, x15;
1533
1534 // 2-way interleave w/hoisting of unpacks
1535 x0 = _mm_loadl_epi64((__m128i *)in0); // 1
1536 x1 = _mm_loadl_epi64((__m128i *)(in0 + in_p)); // 3
1537 x0 = _mm_unpacklo_epi8(x0, x1); // 1
1538
1539 x2 = _mm_loadl_epi64((__m128i *)(in0 + 2 * in_p)); // 5
1540 x3 = _mm_loadl_epi64((__m128i *)(in0 + 3 * in_p)); // 7
1541 x1 = _mm_unpacklo_epi8(x2, x3); // 2
1542
1543 x4 = _mm_loadl_epi64((__m128i *)(in0 + 4 * in_p)); // 9
1544 x5 = _mm_loadl_epi64((__m128i *)(in0 + 5 * in_p)); // 11
1545 x2 = _mm_unpacklo_epi8(x4, x5); // 3
1546
1547 x6 = _mm_loadl_epi64((__m128i *)(in0 + 6 * in_p)); // 13
1548 x7 = _mm_loadl_epi64((__m128i *)(in0 + 7 * in_p)); // 15
1549 x3 = _mm_unpacklo_epi8(x6, x7); // 4
1550 x4 = _mm_unpacklo_epi16(x0, x1); // 9
1551
1552 x8 = _mm_loadl_epi64((__m128i *)in1); // 2
1553 x9 = _mm_loadl_epi64((__m128i *)(in1 + in_p)); // 4
1554 x8 = _mm_unpacklo_epi8(x8, x9); // 5
1555 x5 = _mm_unpacklo_epi16(x2, x3); // 10
1556
1557 x10 = _mm_loadl_epi64((__m128i *)(in1 + 2 * in_p)); // 6
1558 x11 = _mm_loadl_epi64((__m128i *)(in1 + 3 * in_p)); // 8
1559 x9 = _mm_unpacklo_epi8(x10, x11); // 6
1560
1561 x12 = _mm_loadl_epi64((__m128i *)(in1 + 4 * in_p)); // 10
1562 x13 = _mm_loadl_epi64((__m128i *)(in1 + 5 * in_p)); // 12
1563 x10 = _mm_unpacklo_epi8(x12, x13); // 7
1564 x12 = _mm_unpacklo_epi16(x8, x9); // 11
1565
1566 x14 = _mm_loadl_epi64((__m128i *)(in1 + 6 * in_p)); // 14
1567 x15 = _mm_loadl_epi64((__m128i *)(in1 + 7 * in_p)); // 16
1568 x11 = _mm_unpacklo_epi8(x14, x15); // 8
1569 x13 = _mm_unpacklo_epi16(x10, x11); // 12
1570
1571 x6 = _mm_unpacklo_epi32(x4, x5); // 13
1572 x7 = _mm_unpackhi_epi32(x4, x5); // 14
1573 x14 = _mm_unpacklo_epi32(x12, x13); // 15
1574 x15 = _mm_unpackhi_epi32(x12, x13); // 16
1575
1576 // Store first 4-line result
1577 _mm_storeu_si128((__m128i *)out, _mm_unpacklo_epi64(x6, x14));
1578 _mm_storeu_si128((__m128i *)(out + out_p), _mm_unpackhi_epi64(x6, x14));
1579 _mm_storeu_si128((__m128i *)(out + 2 * out_p), _mm_unpacklo_epi64(x7, x15));
1580 _mm_storeu_si128((__m128i *)(out + 3 * out_p), _mm_unpackhi_epi64(x7, x15));
1581
1582 x4 = _mm_unpackhi_epi16(x0, x1);
1583 x5 = _mm_unpackhi_epi16(x2, x3);
1584 x12 = _mm_unpackhi_epi16(x8, x9);
1585 x13 = _mm_unpackhi_epi16(x10, x11);
1586
1587 x6 = _mm_unpacklo_epi32(x4, x5);
1588 x7 = _mm_unpackhi_epi32(x4, x5);
1589 x14 = _mm_unpacklo_epi32(x12, x13);
1590 x15 = _mm_unpackhi_epi32(x12, x13);
1591
1592 // Store second 4-line result
1593 _mm_storeu_si128((__m128i *)(out + 4 * out_p), _mm_unpacklo_epi64(x6, x14));
1594 _mm_storeu_si128((__m128i *)(out + 5 * out_p), _mm_unpackhi_epi64(x6, x14));
1595 _mm_storeu_si128((__m128i *)(out + 6 * out_p), _mm_unpacklo_epi64(x7, x15));
1596 _mm_storeu_si128((__m128i *)(out + 7 * out_p), _mm_unpackhi_epi64(x7, x15));
1597 }
1598
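// Transposes num_8x8_to_transpose independent 8x8 byte blocks, reading from
// src[i] with stride in_p and writing to dst[i] with stride out_p. Each
// 16-byte interleave result holds two transposed rows, stored as 8-byte
// low/high halves.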
1599 static INLINE void transpose(unsigned char *src[], int in_p,
1600 unsigned char *dst[], int out_p,
1601 int num_8x8_to_transpose) {
1602 int idx8x8 = 0;
1603 __m128i x0, x1, x2, x3, x4, x5, x6, x7;
1604 do {
1605 unsigned char *in = src[idx8x8];
1606 unsigned char *out = dst[idx8x8];
1607
1608 x0 =
1609 _mm_loadl_epi64((__m128i *)(in + 0 * in_p)); // 00 01 02 03 04 05 06 07
1610 x1 =
1611 _mm_loadl_epi64((__m128i *)(in + 1 * in_p)); // 10 11 12 13 14 15 16 17
1612 // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
1613 x0 = _mm_unpacklo_epi8(x0, x1);
1614
1615 x2 =
1616 _mm_loadl_epi64((__m128i *)(in + 2 * in_p)); // 20 21 22 23 24 25 26 27
1617 x3 =
1618 _mm_loadl_epi64((__m128i *)(in + 3 * in_p)); // 30 31 32 33 34 35 36 37
1619 // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
1620 x1 = _mm_unpacklo_epi8(x2, x3);
1621
1622 x4 =
1623 _mm_loadl_epi64((__m128i *)(in + 4 * in_p)); // 40 41 42 43 44 45 46 47
1624 x5 =
1625 _mm_loadl_epi64((__m128i *)(in + 5 * in_p)); // 50 51 52 53 54 55 56 57
1626 // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
1627 x2 = _mm_unpacklo_epi8(x4, x5);
1628
1629 x6 =
1630 _mm_loadl_epi64((__m128i *)(in + 6 * in_p)); // 60 61 62 63 64 65 66 67
1631 x7 =
1632 _mm_loadl_epi64((__m128i *)(in + 7 * in_p)); // 70 71 72 73 74 75 76 77
1633 // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
1634 x3 = _mm_unpacklo_epi8(x6, x7);
1635
1636 // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
1637 x4 = _mm_unpacklo_epi16(x0, x1);
1638 // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
1639 x5 = _mm_unpacklo_epi16(x2, x3);
1640 // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
1641 x6 = _mm_unpacklo_epi32(x4, x5);
1642 mm_storelu(out + 0 * out_p, x6); // 00 10 20 30 40 50 60 70
1643 mm_storehu(out + 1 * out_p, x6); // 01 11 21 31 41 51 61 71
1644 // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
1645 x7 = _mm_unpackhi_epi32(x4, x5);
1646 mm_storelu(out + 2 * out_p, x7); // 02 12 22 32 42 52 62 72
1647 mm_storehu(out + 3 * out_p, x7); // 03 13 23 33 43 53 63 73
1648
1649 // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
1650 x4 = _mm_unpackhi_epi16(x0, x1);
1651 // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
1652 x5 = _mm_unpackhi_epi16(x2, x3);
1653 // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
1654 x6 = _mm_unpacklo_epi32(x4, x5);
1655 mm_storelu(out + 4 * out_p, x6); // 04 14 24 34 44 54 64 74
1656 mm_storehu(out + 5 * out_p, x6); // 05 15 25 35 45 55 65 75
1657 // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
1658 x7 = _mm_unpackhi_epi32(x4, x5);
1659
1660 mm_storelu(out + 6 * out_p, x7); // 06 16 26 36 46 56 66 76
1661 mm_storehu(out + 7 * out_p, x7); // 07 17 27 37 47 57 67 77
1662 } while (++idx8x8 < num_8x8_to_transpose);
1663 }
1664
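// The vertical filters below reuse the horizontal kernels: the pixel
// neighborhood around the vertical edge is transposed into the aligned t_dst
// buffer, filtered there as a horizontal edge, then transposed back into
// place.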
1665 void vpx_lpf_vertical_4_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0,
1666 const uint8_t *limit0, const uint8_t *thresh0,
1667 const uint8_t *blimit1, const uint8_t *limit1,
1668 const uint8_t *thresh1) {
1669 DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]);
1670 unsigned char *src[2];
1671 unsigned char *dst[2];
1672
1673 // Transpose 8x16
1674 transpose8x16(s - 4, s - 4 + pitch * 8, pitch, t_dst, 16);
1675
1676 // Loop filtering
1677 vpx_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
1678 blimit1, limit1, thresh1);
1679 src[0] = t_dst;
1680 src[1] = t_dst + 8;
1681 dst[0] = s - 4;
1682 dst[1] = s - 4 + pitch * 8;
1683
1684 // Transpose back
1685 transpose(src, 16, dst, pitch, 2);
1686 }
1687
1688 void vpx_lpf_vertical_8_sse2(unsigned char *s, int pitch,
1689 const unsigned char *blimit,
1690 const unsigned char *limit,
1691 const unsigned char *thresh) {
1692 DECLARE_ALIGNED(8, unsigned char, t_dst[8 * 8]);
1693 unsigned char *src[1];
1694 unsigned char *dst[1];
1695
1696 // Transpose 8x8
1697 src[0] = s - 4;
1698 dst[0] = t_dst;
1699
1700 transpose(src, pitch, dst, 8, 1);
1701
1702 // Loop filtering
1703 vpx_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh);
1704
1705 src[0] = t_dst;
1706 dst[0] = s - 4;
1707
1708 // Transpose back
1709 transpose(src, 8, dst, pitch, 1);
1710 }
1711
1712 void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0,
1713 const uint8_t *limit0, const uint8_t *thresh0,
1714 const uint8_t *blimit1, const uint8_t *limit1,
1715 const uint8_t *thresh1) {
1716 DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]);
1717 unsigned char *src[2];
1718 unsigned char *dst[2];
1719
1720 // Transpose 8x16
1721 transpose8x16(s - 4, s - 4 + pitch * 8, pitch, t_dst, 16);
1722
1723 // Loop filtering
1724 vpx_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
1725 blimit1, limit1, thresh1);
1726 src[0] = t_dst;
1727 src[1] = t_dst + 8;
1728
1729 dst[0] = s - 4;
1730 dst[1] = s - 4 + pitch * 8;
1731
1732 // Transpose back
1733 transpose(src, 16, dst, pitch, 2);
1734 }
1735
1736 void vpx_lpf_vertical_16_sse2(unsigned char *s, int pitch,
1737 const unsigned char *blimit,
1738 const unsigned char *limit,
1739 const unsigned char *thresh) {
1740 DECLARE_ALIGNED(8, unsigned char, t_dst[8 * 16]);
1741 unsigned char *src[2];
1742 unsigned char *dst[2];
1743
1744 src[0] = s - 8;
1745 src[1] = s;
1746 dst[0] = t_dst;
1747 dst[1] = t_dst + 8 * 8;
1748
1749 // Transpose 16x8
1750 transpose(src, pitch, dst, 8, 2);
1751
1752 // Loop filtering
1753 vpx_lpf_horizontal_16_sse2(t_dst + 8 * 8, 8, blimit, limit, thresh);
1754
1755 src[0] = t_dst;
1756 src[1] = t_dst + 8 * 8;
1757 dst[0] = s - 8;
1758 dst[1] = s;
1759
1760 // Transpose back
1761 transpose(src, 8, dst, pitch, 2);
1762 }
1763
1764 void vpx_lpf_vertical_16_dual_sse2(unsigned char *s, int pitch,
1765 const uint8_t *blimit, const uint8_t *limit,
1766 const uint8_t *thresh) {
1767 DECLARE_ALIGNED(16, unsigned char, t_dst[256]);
1768
1769 // Transpose 16x16
1770 transpose8x16(s - 8, s - 8 + 8 * pitch, pitch, t_dst, 16);
1771 transpose8x16(s, s + 8 * pitch, pitch, t_dst + 8 * 16, 16);
1772
1773 // Loop filtering
1774 vpx_lpf_horizontal_16_dual_sse2(t_dst + 8 * 16, 16, blimit, limit, thresh);
1775
1776 // Transpose back
1777 transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, pitch);
1778 transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * pitch, pitch);
1779 }
1780