1 /*
2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11
12 #include <emmintrin.h> // SSE2
13
14 #include "config/aom_dsp_rtcd.h"
15
16 #include "aom_dsp/x86/synonyms.h"
17 #include "aom_ports/mem.h"
18 #include "aom_ports/emmintrin_compat.h"
19 #include "aom_dsp/x86/lpf_common_sse2.h"
20
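// Per-byte |a - b| for unsigned bytes: saturating subtraction in both
// directions, ORed together.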
static INLINE __m128i abs_diff(__m128i a, __m128i b) {
22 return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
23 }
24
// This function treats its input as 2 parallel 8x4 matrices and transposes
// each of them to 4x8 independently while flipping the second matrix
// horizontally. Used to create the pq pairs for the 14-tap filter.
static INLINE void transpose_pq_14_sse2(__m128i *x0, __m128i *x1, __m128i *x2,
29 __m128i *x3, __m128i *q0p0,
30 __m128i *q1p1, __m128i *q2p2,
31 __m128i *q3p3, __m128i *q4p4,
32 __m128i *q5p5, __m128i *q6p6,
33 __m128i *q7p7) {
34 __m128i w0, w1, ww0, ww1, w2, w3, ww2, ww3;
35 w0 = _mm_unpacklo_epi8(
36 *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
37 w1 = _mm_unpacklo_epi8(
38 *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
39 w2 = _mm_unpackhi_epi8(
40 *x0, *x1); // 08 18 09 19 010 110 011 111 012 112 013 113 014 114 015 115
41 w3 = _mm_unpackhi_epi8(
42 *x2, *x3); // 28 38 29 39 210 310 211 311 212 312 213 313 214 314 215 315
43
44 ww0 = _mm_unpacklo_epi16(
45 w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
46 ww1 = _mm_unpackhi_epi16(
47 w0, w1); // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
48 ww2 = _mm_unpacklo_epi16(
49 w2, w3); // 08 18 28 38 09 19 29 39 010 110 210 310 011 111 211 311
50 ww3 = _mm_unpackhi_epi16(
51 w2,
52 w3); // 012 112 212 312 013 113 213 313 014 114 214 314 015 115 215 315
53
54 *q7p7 = _mm_unpacklo_epi32(
55 ww0,
56 _mm_srli_si128(
57 ww3, 12)); // 00 10 20 30 015 115 215 315 xx xx xx xx xx xx xx xx
58 *q6p6 = _mm_unpackhi_epi32(
59 _mm_slli_si128(ww0, 4),
      ww3);  // 01 11 21 31 014 114 214 314 xx xx xx xx xx xx xx xx
61 *q5p5 = _mm_unpackhi_epi32(
62 ww0,
63 _mm_slli_si128(
          ww3, 4));  // 02 12 22 32 013 113 213 313 xx xx xx xx xx xx xx xx
65 *q4p4 = _mm_unpacklo_epi32(
66 _mm_srli_si128(ww0, 12),
67 ww3); // 03 13 23 33 012 112 212 312 xx xx xx xx xx xx xx xx
68 *q3p3 = _mm_unpacklo_epi32(
69 ww1,
70 _mm_srli_si128(
71 ww2, 12)); // 04 14 24 34 011 111 211 311 xx xx xx xx xx xx xx xx
72 *q2p2 = _mm_unpackhi_epi32(
73 _mm_slli_si128(ww1, 4),
74 ww2); // 05 15 25 35 010 110 210 310 xx xx xx xx xx xx xx xx
75 *q1p1 = _mm_unpackhi_epi32(
76 ww1,
77 _mm_slli_si128(
78 ww2, 4)); // 06 16 26 36 09 19 29 39 xx xx xx xx xx xx xx xx
79 *q0p0 = _mm_unpacklo_epi32(
80 _mm_srli_si128(ww1, 12),
81 ww2); // 07 17 27 37 08 18 28 38 xx xx xx xx xx xx xx xx
82 }
83
// This function treats its input as 2 parallel 8x4 matrices and transposes
// each of them independently while flipping the second matrix horizontally.
// Used for the inverse of the 14-tap filter pq pairs.
static INLINE void transpose_pq_14_inv_sse2(__m128i *x0, __m128i *x1,
88 __m128i *x2, __m128i *x3,
89 __m128i *x4, __m128i *x5,
90 __m128i *x6, __m128i *x7,
91 __m128i *pq0, __m128i *pq1,
92 __m128i *pq2, __m128i *pq3) {
93 __m128i w10, w11, w12, w13;
94 __m128i w0, w1, w2, w3, w4, w5;
95 __m128i d0, d1, d2, d3;
96
97 w0 = _mm_unpacklo_epi8(
98 *x0, *x1); // p 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
99 w1 = _mm_unpacklo_epi8(
100 *x2, *x3); // p 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
101 w2 = _mm_unpacklo_epi8(
102 *x4, *x5); // p 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
103 w3 = _mm_unpacklo_epi8(
104 *x6, *x7); // p 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
105
106 w4 = _mm_unpacklo_epi16(
107 w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
108 w5 = _mm_unpacklo_epi16(
109 w2, w3); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
110
111 d0 = _mm_unpacklo_epi32(
112 w4, w5); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
113 d2 = _mm_unpackhi_epi32(
114 w4, w5); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
115
116 w10 = _mm_unpacklo_epi8(
117 *x7, *x6); // q xx xx xx xx xx xx xx xx 00 10 01 11 02 12 03 13
118 w11 = _mm_unpacklo_epi8(
119 *x5, *x4); // q xx xx xx xx xx xx xx xx 20 30 21 31 22 32 23 33
120 w12 = _mm_unpacklo_epi8(
121 *x3, *x2); // q xx xx xx xx xx xx xx xx 40 50 41 51 42 52 43 53
122 w13 = _mm_unpacklo_epi8(
123 *x1, *x0); // q xx xx xx xx xx xx xx xx 60 70 61 71 62 72 63 73
124
125 w4 = _mm_unpackhi_epi16(
126 w10, w11); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
127 w5 = _mm_unpackhi_epi16(
128 w12, w13); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
129
130 d1 = _mm_unpacklo_epi32(
131 w4, w5); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
132 d3 = _mm_unpackhi_epi32(
133 w4, w5); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
134
135 *pq0 = _mm_unpacklo_epi64(d0, d1); // pq
136 *pq1 = _mm_unpackhi_epi64(d0, d1); // pq
137 *pq2 = _mm_unpacklo_epi64(d2, d3); // pq
138 *pq3 = _mm_unpackhi_epi64(d2, d3); // pq
139 }
140
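// Core 4-tap filter working on merged registers: *p1p0 holds {p0, p1} and
// *q1q0 holds {q0, q1}, one 4-pixel row per 32-bit lane. hev and mask gate
// which outputs are actually modified.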
static AOM_FORCE_INLINE void filter4_sse2(__m128i *p1p0, __m128i *q1q0,
142 __m128i *hev, __m128i *mask,
143 __m128i *qs1qs0, __m128i *ps1ps0) {
144 __m128i filter, filter2filter1, work;
145 __m128i ps1ps0_work, qs1qs0_work;
146 __m128i hev1;
147 const __m128i t3t4 =
148 _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 4, 4, 4, 4);
149 const __m128i t80 = _mm_set1_epi8((char)0x80);
150 const __m128i ff = _mm_cmpeq_epi8(t80, t80);
151
152 ps1ps0_work = _mm_xor_si128(*p1p0, t80); /* ^ 0x80 */
153 qs1qs0_work = _mm_xor_si128(*q1q0, t80);
154
155 /* int8_t filter = signed_char_clamp(ps1 - qs1) & hev; */
156 work = _mm_subs_epi8(ps1ps0_work, qs1qs0_work);
157 filter = _mm_and_si128(_mm_srli_si128(work, 4), *hev);
158 /* filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; */
159 filter = _mm_subs_epi8(filter, work);
160 filter = _mm_subs_epi8(filter, work);
161 filter = _mm_subs_epi8(filter, work); /* + 3 * (qs0 - ps0) */
162 filter = _mm_and_si128(filter, *mask); /* & mask */
163 filter = _mm_unpacklo_epi32(filter, filter);
164
165 /* filter1 = signed_char_clamp(filter + 4) >> 3; */
166 /* filter2 = signed_char_clamp(filter + 3) >> 3; */
167 filter2filter1 = _mm_adds_epi8(filter, t3t4); /* signed_char_clamp */
168 filter2filter1 =
169 _mm_unpacklo_epi8(filter2filter1, filter2filter1); // goto 16 bit
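  // Each byte has been duplicated into both halves of a 16-bit lane, so an
  // arithmetic shift by 11 is equivalent to a signed >> 3 of the 8-bit value.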
170 filter2filter1 = _mm_srai_epi16(filter2filter1, 11); /* >> 3 */
171 filter2filter1 = _mm_packs_epi16(filter2filter1, filter2filter1);
172
173 /* filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; */
174 filter = _mm_subs_epi8(filter2filter1, ff); /* + 1 */
175 filter = _mm_unpacklo_epi8(filter, filter); // goto 16 bit
176 filter = _mm_srai_epi16(filter, 9); /* round */
177 filter = _mm_packs_epi16(filter, filter);
178 filter = _mm_andnot_si128(*hev, filter);
179 filter = _mm_unpacklo_epi32(filter, filter);
180
181 filter2filter1 = _mm_unpacklo_epi32(filter2filter1, filter);
182 hev1 = _mm_srli_si128(filter2filter1, 8);
183 /* signed_char_clamp(qs1 - filter), signed_char_clamp(qs0 - filter1) */
184 qs1qs0_work = _mm_subs_epi8(qs1qs0_work, filter2filter1);
185 /* signed_char_clamp(ps1 + filter), signed_char_clamp(ps0 + filter2) */
186 ps1ps0_work = _mm_adds_epi8(ps1ps0_work, hev1);
187
188 *qs1qs0 = _mm_xor_si128(qs1qs0_work, t80); /* ^ 0x80 */
189 *ps1ps0 = _mm_xor_si128(ps1ps0_work, t80); /* ^ 0x80 */
190 }
191
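// Same filter as filter4_sse2(), but with p0/q0 in the low 64-bit halves and
// p1/q1 in the high halves, so eight pixels are filtered at once.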
static AOM_FORCE_INLINE void filter4_dual_sse2(__m128i *p1p0, __m128i *q1q0,
193 __m128i *hev, __m128i *mask,
194 __m128i *qs1qs0,
195 __m128i *ps1ps0) {
196 const __m128i t3t4 =
197 _mm_set_epi8(3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4);
198 const __m128i t80 = _mm_set1_epi8((char)0x80);
199 __m128i filter, filter2filter1, work;
200 __m128i ps1ps0_work, qs1qs0_work;
201 __m128i hev1;
202 const __m128i ff = _mm_cmpeq_epi8(t80, t80);
203
204 ps1ps0_work = _mm_xor_si128(*p1p0, t80); /* ^ 0x80 */
205 qs1qs0_work = _mm_xor_si128(*q1q0, t80);
206
207 /* int8_t filter = signed_char_clamp(ps1 - qs1) & hev; */
208 work = _mm_subs_epi8(ps1ps0_work, qs1qs0_work);
209 filter = _mm_and_si128(_mm_srli_si128(work, 8), *hev);
210 /* filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; */
211 filter = _mm_subs_epi8(filter, work);
212 filter = _mm_subs_epi8(filter, work);
213 filter = _mm_subs_epi8(filter, work); /* + 3 * (qs0 - ps0) */
214 filter = _mm_and_si128(filter, *mask); /* & mask */
215 filter = _mm_unpacklo_epi64(filter, filter);
216
217 /* filter1 = signed_char_clamp(filter + 4) >> 3; */
218 /* filter2 = signed_char_clamp(filter + 3) >> 3; */
219 filter2filter1 = _mm_adds_epi8(filter, t3t4); /* signed_char_clamp */
220 filter = _mm_unpackhi_epi8(filter2filter1, filter2filter1);
221 filter2filter1 = _mm_unpacklo_epi8(filter2filter1, filter2filter1);
222 filter2filter1 = _mm_srai_epi16(filter2filter1, 11); /* >> 3 */
223 filter = _mm_srai_epi16(filter, 11); /* >> 3 */
224 filter2filter1 = _mm_packs_epi16(filter2filter1, filter);
225
226 /* filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; */
227 filter = _mm_subs_epi8(filter2filter1, ff); /* + 1 */
228 filter = _mm_unpacklo_epi8(filter, filter);
229 filter = _mm_srai_epi16(filter, 9); /* round */
230 filter = _mm_packs_epi16(filter, filter);
231 filter = _mm_andnot_si128(*hev, filter);
232
233 hev1 = _mm_unpackhi_epi64(filter2filter1, filter);
234 filter2filter1 = _mm_unpacklo_epi64(filter2filter1, filter);
235
236 /* signed_char_clamp(qs1 - filter), signed_char_clamp(qs0 - filter1) */
237 qs1qs0_work = _mm_subs_epi8(qs1qs0_work, filter2filter1);
238 /* signed_char_clamp(ps1 + filter), signed_char_clamp(ps0 + filter2) */
239 ps1ps0_work = _mm_adds_epi8(ps1ps0_work, hev1);
240 *qs1qs0 = _mm_xor_si128(qs1qs0_work, t80); /* ^ 0x80 */
241 *ps1ps0 = _mm_xor_si128(ps1ps0_work, t80); /* ^ 0x80 */
242 }
243
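// 4-tap loop filter for four pixels. Each input register holds one 4-pixel
// row in its low 32 bits; on return *q1q0_out packs {q0, q1} and *p1p0_out
// packs {p0, p1}, one row per 32-bit lane.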
static AOM_FORCE_INLINE void lpf_internal_4_sse2(
245 __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *limit,
246 __m128i *thresh, __m128i *q1q0_out, __m128i *p1p0_out) {
247 __m128i q1p1, q0p0, p1p0, q1q0;
248 __m128i abs_p0q0, abs_p1q1;
249 __m128i mask, flat, hev;
250 const __m128i zero = _mm_setzero_si128();
251
252 q1p1 = _mm_unpacklo_epi32(*p1, *q1);
253 q0p0 = _mm_unpacklo_epi32(*p0, *q0);
254
255 p1p0 = _mm_unpacklo_epi32(q0p0, q1p1);
256 q1q0 = _mm_srli_si128(p1p0, 8);
257
258 /* (abs(q1 - q0), abs(p1 - p0) */
259 flat = abs_diff(q1p1, q0p0);
260 /* abs(p1 - q1), abs(p0 - q0) */
261 __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0);
262
263 /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */
264 flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4));
265 hev = _mm_unpacklo_epi8(flat, zero);
266
267 hev = _mm_cmpgt_epi16(hev, *thresh);
268 hev = _mm_packs_epi16(hev, hev);
269 hev = _mm_unpacklo_epi32(hev, hev);
270
271 abs_p0q0 = _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */
272 abs_p1q1 = _mm_srli_si128(abs_p1q1p0q0, 4); /* abs(p1 - q1) */
273 abs_p1q1 = _mm_unpacklo_epi8(abs_p1q1, abs_p1q1);
274 abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9);
275 abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */
276 /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */
277
278 mask = _mm_adds_epu8(abs_p0q0, abs_p1q1);
279 mask = _mm_unpacklo_epi32(mask, flat);
280 mask = _mm_subs_epu8(mask, *limit);
281 mask = _mm_cmpeq_epi8(mask, zero);
282 mask = _mm_and_si128(mask, _mm_srli_si128(mask, 4));
283
284 filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out);
285 }
286
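// Dual version of lpf_internal_4_sse2(): eight pixels at a time, with the
// p/q rows packed into 64-bit halves.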
static AOM_FORCE_INLINE void lpf_internal_4_dual_sse2(
288 __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *limit,
289 __m128i *thresh, __m128i *q1q0_out, __m128i *p1p0_out) {
290 __m128i q1p1, q0p0, p1p0, q1q0;
291 __m128i abs_p0q0, abs_p1q1;
292 __m128i mask, hev;
293 const __m128i zero = _mm_setzero_si128();
294
295 q1p1 = _mm_unpacklo_epi64(*p1, *q1);
296 q0p0 = _mm_unpacklo_epi64(*p0, *q0);
297
298 p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
299 q1q0 = _mm_unpackhi_epi64(q0p0, q1p1);
300
301 /* (abs(q1 - q0), abs(p1 - p0) */
302 __m128i flat = abs_diff(q1p1, q0p0);
303 /* abs(p1 - q1), abs(p0 - q0) */
304 const __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0);
305
306 /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */
307 flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
308 hev = _mm_unpacklo_epi8(flat, zero);
309
310 hev = _mm_cmpgt_epi16(hev, *thresh);
311 hev = _mm_packs_epi16(hev, hev);
312
313 /* const int8_t mask = filter_mask2(*limit, *blimit, */
314 /* p1, p0, q0, q1); */
315 abs_p0q0 = _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */
316 abs_p1q1 = _mm_unpackhi_epi8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p1 - q1) */
317 abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9);
318 abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */
319 /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */
320 mask = _mm_adds_epu8(abs_p0q0, abs_p1q1);
321 mask = _mm_unpacklo_epi64(mask, flat);
322 mask = _mm_subs_epu8(mask, *limit);
323 mask = _mm_cmpeq_epi8(mask, zero);
324 mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8));
325
326 filter4_dual_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out);
327 }
328
void aom_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */,
330 const uint8_t *_blimit, const uint8_t *_limit,
331 const uint8_t *_thresh) {
332 const __m128i zero = _mm_setzero_si128();
333 __m128i limit = _mm_unpacklo_epi32(_mm_loadl_epi64((const __m128i *)_blimit),
334 _mm_loadl_epi64((const __m128i *)_limit));
335 __m128i thresh =
336 _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
337
338 __m128i qs1qs0, ps1ps0;
339 __m128i p1, p0, q0, q1;
340
341 p1 = xx_loadl_32(s - 2 * p);
342 p0 = xx_loadl_32(s - 1 * p);
343 q0 = xx_loadl_32(s - 0 * p);
344 q1 = xx_loadl_32(s + 1 * p);
345
346 lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &limit, &thresh, &qs1qs0, &ps1ps0);
347
348 xx_storel_32(s - 1 * p, ps1ps0);
349 xx_storel_32(s - 2 * p, _mm_srli_si128(ps1ps0, 4));
350 xx_storel_32(s + 0 * p, qs1qs0);
351 xx_storel_32(s + 1 * p, _mm_srli_si128(qs1qs0, 4));
352 }
353
void aom_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */,
355 const uint8_t *_blimit, const uint8_t *_limit,
356 const uint8_t *_thresh) {
357 __m128i p1p0, q1q0;
358 __m128i p1, p0, q0, q1;
359
360 const __m128i zero = _mm_setzero_si128();
361 __m128i limit = _mm_unpacklo_epi32(_mm_loadl_epi64((const __m128i *)_blimit),
362 _mm_loadl_epi64((const __m128i *)_limit));
363 __m128i thresh =
364 _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
365
366 __m128i x0, x1, x2, x3;
367 __m128i d0, d1, d2, d3;
368 x0 = _mm_loadl_epi64((__m128i *)(s - 2 + 0 * p));
369 x1 = _mm_loadl_epi64((__m128i *)(s - 2 + 1 * p));
370 x2 = _mm_loadl_epi64((__m128i *)(s - 2 + 2 * p));
371 x3 = _mm_loadl_epi64((__m128i *)(s - 2 + 3 * p));
372
373 transpose4x8_8x4_low_sse2(&x0, &x1, &x2, &x3, &p1, &p0, &q0, &q1);
374
375 lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &limit, &thresh, &q1q0, &p1p0);
376
377 // Transpose 8x4 to 4x8
378 p1 = _mm_srli_si128(p1p0, 4);
379 q1 = _mm_srli_si128(q1q0, 4);
380
381 transpose4x8_8x4_low_sse2(&p1, &p1p0, &q1q0, &q1, &d0, &d1, &d2, &d3);
382
383 xx_storel_32(s + 0 * p - 2, d0);
384 xx_storel_32(s + 1 * p - 2, d1);
385 xx_storel_32(s + 2 * p - 2, d2);
386 xx_storel_32(s + 3 * p - 2, d3);
387 }
388
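// Writes the low four bytes of x to the row at s - (num + 1) * p and the next
// four bytes to the row at s + num * p.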
static INLINE void store_buffer_horz_8(__m128i x, int p, int num, uint8_t *s) {
390 xx_storel_32(s - (num + 1) * p, x);
391 xx_storel_32(s + num * p, _mm_srli_si128(x, 4));
392 }
393
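// Wide (14-tap) loop filter processing eight pixels at once. Each qXpX
// register carries pX in its low 8 bytes and qX in its high 8 bytes.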
static AOM_FORCE_INLINE void lpf_internal_14_dual_sse2(
395 __m128i *q6p6, __m128i *q5p5, __m128i *q4p4, __m128i *q3p3, __m128i *q2p2,
396 __m128i *q1p1, __m128i *q0p0, __m128i *blimit, __m128i *limit,
397 __m128i *thresh) {
398 const __m128i zero = _mm_setzero_si128();
399 const __m128i one = _mm_set1_epi8(1);
400 __m128i mask, hev, flat, flat2;
401 __m128i qs0ps0, qs1ps1;
402 __m128i p1p0, q1q0, qs1qs0, ps1ps0;
403 __m128i abs_p1p0;
404
405 p1p0 = _mm_unpacklo_epi64(*q0p0, *q1p1);
406 q1q0 = _mm_unpackhi_epi64(*q0p0, *q1p1);
407
408 {
409 __m128i abs_p1q1, abs_p0q0, abs_q1q0;
410 __m128i fe, ff, work;
411 abs_p1p0 = abs_diff(*q1p1, *q0p0);
412 abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
413 fe = _mm_set1_epi8((char)0xfe);
414 ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
415 abs_p0q0 = abs_diff(p1p0, q1q0);
416 abs_p1q1 = _mm_srli_si128(abs_p0q0, 8);
417 abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, zero);
418
419 flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
420 hev = _mm_subs_epu8(flat, *thresh);
421 hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
422 // replicate for the further "merged variables" usage
423 hev = _mm_unpacklo_epi64(hev, hev);
424
425 abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
426 abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
427 mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit);
428 mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
429 // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
430 mask = _mm_max_epu8(abs_p1p0, mask);
431 // mask |= (abs(p1 - p0) > limit) * -1;
432 // mask |= (abs(q1 - q0) > limit) * -1;
433
434 work = _mm_max_epu8(abs_diff(*q2p2, *q1p1), abs_diff(*q3p3, *q2p2));
435 mask = _mm_max_epu8(work, mask);
436 mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
437 mask = _mm_subs_epu8(mask, *limit);
438 mask = _mm_cmpeq_epi8(mask, zero);
439 }
440
441 // lp filter - the same for 6, 8 and 14 versions
442 filter4_dual_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0);
443 qs0ps0 = _mm_unpacklo_epi64(ps1ps0, qs1qs0);
444 qs1ps1 = _mm_unpackhi_epi64(ps1ps0, qs1qs0);
445 // loopfilter done
446
447 __m128i flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2;
448 __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0;
449
450 __m128i work;
451 flat = _mm_max_epu8(abs_diff(*q2p2, *q0p0), abs_diff(*q3p3, *q0p0));
452 flat = _mm_max_epu8(abs_p1p0, flat);
453 flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
454 flat = _mm_subs_epu8(flat, one);
455 flat = _mm_cmpeq_epi8(flat, zero);
456 flat = _mm_and_si128(flat, mask);
457
  // If flat == 0 then flat2 is zero as well and none of the calculations below
  // are needed.
  // SSE4.1 equivalent: if (0 == _mm_test_all_zeros(flat, ff))
460 if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
461 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
462 // flat and wide flat calculations
463
464 const __m128i eight = _mm_set1_epi16(8);
465 const __m128i four = _mm_set1_epi16(4);
466 __m128i p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16;
467 __m128i q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
468 __m128i pixelFilter_p, pixelFilter_q;
469 __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
470 __m128i sum_p6, sum_q6;
471 __m128i sum_p3, sum_q3, res_p, res_q;
472
473 p6_16 = _mm_unpacklo_epi8(*q6p6, zero);
474 p5_16 = _mm_unpacklo_epi8(*q5p5, zero);
475 p4_16 = _mm_unpacklo_epi8(*q4p4, zero);
476 p3_16 = _mm_unpacklo_epi8(*q3p3, zero);
477 p2_16 = _mm_unpacklo_epi8(*q2p2, zero);
478 p1_16 = _mm_unpacklo_epi8(*q1p1, zero);
479 p0_16 = _mm_unpacklo_epi8(*q0p0, zero);
480 q0_16 = _mm_unpackhi_epi8(*q0p0, zero);
481 q1_16 = _mm_unpackhi_epi8(*q1p1, zero);
482 q2_16 = _mm_unpackhi_epi8(*q2p2, zero);
483 q3_16 = _mm_unpackhi_epi8(*q3p3, zero);
484 q4_16 = _mm_unpackhi_epi8(*q4p4, zero);
485 q5_16 = _mm_unpackhi_epi8(*q5p5, zero);
486 q6_16 = _mm_unpackhi_epi8(*q6p6, zero);
487 pixelFilter_p = _mm_add_epi16(p5_16, _mm_add_epi16(p4_16, p3_16));
488 pixelFilter_q = _mm_add_epi16(q5_16, _mm_add_epi16(q4_16, q3_16));
489
490 pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16));
491 pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
492
493 pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16));
494 pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
495 pixelFilter_p =
496 _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q));
497 pixetFilter_p2p1p0 = _mm_add_epi16(
498 four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
499 res_p = _mm_srli_epi16(
500 _mm_add_epi16(pixelFilter_p,
501 _mm_add_epi16(_mm_add_epi16(p6_16, p0_16),
502 _mm_add_epi16(p1_16, q0_16))),
503 4);
504 res_q = _mm_srli_epi16(
505 _mm_add_epi16(pixelFilter_p,
506 _mm_add_epi16(_mm_add_epi16(q6_16, q0_16),
507 _mm_add_epi16(p0_16, q1_16))),
508 4);
509 flat2_q0p0 = _mm_packus_epi16(res_p, res_q);
510
511 res_p = _mm_srli_epi16(
512 _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3_16, p0_16)), 3);
513 res_q = _mm_srli_epi16(
514 _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3_16, q0_16)), 3);
515
516 flat_q0p0 = _mm_packus_epi16(res_p, res_q);
517
518 sum_p6 = _mm_add_epi16(p6_16, p6_16);
519 sum_q6 = _mm_add_epi16(q6_16, q6_16);
520 sum_p3 = _mm_add_epi16(p3_16, p3_16);
521 sum_q3 = _mm_add_epi16(q3_16, q3_16);
522
523 pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p5_16);
524 pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16);
525
526 res_p = _mm_srli_epi16(
527 _mm_add_epi16(
528 pixelFilter_p,
529 _mm_add_epi16(sum_p6,
530 _mm_add_epi16(p1_16, _mm_add_epi16(p2_16, p0_16)))),
531 4);
532 res_q = _mm_srli_epi16(
533 _mm_add_epi16(
534 pixelFilter_q,
535 _mm_add_epi16(sum_q6,
536 _mm_add_epi16(q1_16, _mm_add_epi16(q0_16, q2_16)))),
537 4);
538 flat2_q1p1 = _mm_packus_epi16(res_p, res_q);
539
540 pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16);
541 pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16);
542 res_p = _mm_srli_epi16(
543 _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1_16)), 3);
544 res_q = _mm_srli_epi16(
545 _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1_16)), 3);
546 flat_q1p1 = _mm_packus_epi16(res_p, res_q);
547
548 pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16);
549 pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16);
550
551 sum_p3 = _mm_add_epi16(sum_p3, p3_16);
552 sum_q3 = _mm_add_epi16(sum_q3, q3_16);
553
554 res_p = _mm_srli_epi16(
555 _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2_16)), 3);
556 res_q = _mm_srli_epi16(
557 _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2_16)), 3);
558 flat_q2p2 = _mm_packus_epi16(res_p, res_q);
559
560 // work with flat2
561 flat2 = _mm_max_epu8(abs_diff(*q4p4, *q0p0), abs_diff(*q5p5, *q0p0));
562 work = abs_diff(*q6p6, *q0p0);
563 flat2 = _mm_max_epu8(work, flat2);
564 flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
565 flat2 = _mm_subs_epu8(flat2, one);
566 flat2 = _mm_cmpeq_epi8(flat2, zero);
567 flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
568
569 // ~~~~~~~~~~ apply flat ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
570 flat = _mm_unpacklo_epi64(flat, flat);
571 *q2p2 = _mm_andnot_si128(flat, *q2p2);
572 flat_q2p2 = _mm_and_si128(flat, flat_q2p2);
573 *q2p2 = _mm_or_si128(*q2p2, flat_q2p2);
574
575 qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
576 flat_q1p1 = _mm_and_si128(flat, flat_q1p1);
577 *q1p1 = _mm_or_si128(qs1ps1, flat_q1p1);
578
579 qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
580 flat_q0p0 = _mm_and_si128(flat, flat_q0p0);
581 *q0p0 = _mm_or_si128(qs0ps0, flat_q0p0);
582
583 if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat2, zero))) {
584 pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
585 pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
586
587 sum_p6 = _mm_add_epi16(sum_p6, p6_16);
588 sum_q6 = _mm_add_epi16(sum_q6, q6_16);
589
590 res_p = _mm_srli_epi16(
591 _mm_add_epi16(
592 pixelFilter_p,
593 _mm_add_epi16(sum_p6,
594 _mm_add_epi16(p2_16, _mm_add_epi16(p3_16, p1_16)))),
595 4);
596 res_q = _mm_srli_epi16(
597 _mm_add_epi16(
598 pixelFilter_q,
599 _mm_add_epi16(sum_q6,
600 _mm_add_epi16(q2_16, _mm_add_epi16(q1_16, q3_16)))),
601 4);
602 flat2_q2p2 = _mm_packus_epi16(res_p, res_q);
603
604 sum_p6 = _mm_add_epi16(sum_p6, p6_16);
605 sum_q6 = _mm_add_epi16(sum_q6, q6_16);
606
607 pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
608 pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
609
610 res_p = _mm_srli_epi16(
611 _mm_add_epi16(
612 pixelFilter_p,
613 _mm_add_epi16(sum_p6,
614 _mm_add_epi16(p3_16, _mm_add_epi16(p4_16, p2_16)))),
615 4);
616 res_q = _mm_srli_epi16(
617 _mm_add_epi16(
618 pixelFilter_q,
619 _mm_add_epi16(sum_q6,
620 _mm_add_epi16(q3_16, _mm_add_epi16(q2_16, q4_16)))),
621 4);
622 flat2_q3p3 = _mm_packus_epi16(res_p, res_q);
623
624 sum_p6 = _mm_add_epi16(sum_p6, p6_16);
625 sum_q6 = _mm_add_epi16(sum_q6, q6_16);
626
627 pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
628 pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
629
630 res_p = _mm_srli_epi16(
631 _mm_add_epi16(
632 pixelFilter_p,
633 _mm_add_epi16(sum_p6,
634 _mm_add_epi16(p4_16, _mm_add_epi16(p5_16, p3_16)))),
635 4);
636 res_q = _mm_srli_epi16(
637 _mm_add_epi16(
638 pixelFilter_q,
639 _mm_add_epi16(sum_q6,
640 _mm_add_epi16(q4_16, _mm_add_epi16(q3_16, q5_16)))),
641 4);
642 flat2_q4p4 = _mm_packus_epi16(res_p, res_q);
643
644 sum_p6 = _mm_add_epi16(sum_p6, p6_16);
645 sum_q6 = _mm_add_epi16(sum_q6, q6_16);
646 pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
647 pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
648
649 res_p = _mm_srli_epi16(
650 _mm_add_epi16(
651 pixelFilter_p,
652 _mm_add_epi16(sum_p6,
653 _mm_add_epi16(p5_16, _mm_add_epi16(p6_16, p4_16)))),
654 4);
655 res_q = _mm_srli_epi16(
656 _mm_add_epi16(
657 pixelFilter_q,
658 _mm_add_epi16(sum_q6,
659 _mm_add_epi16(q5_16, _mm_add_epi16(q6_16, q4_16)))),
660 4);
661 flat2_q5p5 = _mm_packus_epi16(res_p, res_q);
662
663 // wide flat
664 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
665 flat2 = _mm_unpacklo_epi64(flat2, flat2);
666
667 *q5p5 = _mm_andnot_si128(flat2, *q5p5);
668 flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
669 *q5p5 = _mm_or_si128(*q5p5, flat2_q5p5);
670
671 *q4p4 = _mm_andnot_si128(flat2, *q4p4);
672 flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
673 *q4p4 = _mm_or_si128(*q4p4, flat2_q4p4);
674
675 *q3p3 = _mm_andnot_si128(flat2, *q3p3);
676 flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
677 *q3p3 = _mm_or_si128(*q3p3, flat2_q3p3);
678
679 *q2p2 = _mm_andnot_si128(flat2, *q2p2);
680 flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
681 *q2p2 = _mm_or_si128(*q2p2, flat2_q2p2);
682
683 *q1p1 = _mm_andnot_si128(flat2, *q1p1);
684 flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
685 *q1p1 = _mm_or_si128(*q1p1, flat2_q1p1);
686
687 *q0p0 = _mm_andnot_si128(flat2, *q0p0);
688 flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
689 *q0p0 = _mm_or_si128(*q0p0, flat2_q0p0);
690 }
691 } else {
692 *q0p0 = qs0ps0;
693 *q1p1 = qs1ps1;
694 }
695 }
696
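// Four-pixel variant of the 14-tap filter. Each qXpX register carries pX in
// its low 4 bytes and qX in the following 4 bytes.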
static AOM_FORCE_INLINE void lpf_internal_14_sse2(
698 __m128i *q6p6, __m128i *q5p5, __m128i *q4p4, __m128i *q3p3, __m128i *q2p2,
699 __m128i *q1p1, __m128i *q0p0, __m128i *blimit, __m128i *limit,
700 __m128i *thresh) {
701 const __m128i zero = _mm_setzero_si128();
702 const __m128i one = _mm_set1_epi8(1);
703 __m128i mask, hev, flat, flat2;
704 __m128i flat2_pq[6], flat_pq[3];
705 __m128i qs0ps0, qs1ps1;
706 __m128i p1p0, q1q0, qs1qs0, ps1ps0;
707 __m128i abs_p1p0;
708
709 p1p0 = _mm_unpacklo_epi32(*q0p0, *q1p1);
710 q1q0 = _mm_srli_si128(p1p0, 8);
711
712 __m128i fe, ff, work;
713 {
714 __m128i abs_p1q1, abs_p0q0, abs_q1q0;
715 abs_p1p0 = abs_diff(*q1p1, *q0p0);
716 abs_q1q0 = _mm_srli_si128(abs_p1p0, 4);
717 fe = _mm_set1_epi8((char)0xfe);
718 ff = _mm_cmpeq_epi8(fe, fe);
719 abs_p0q0 = abs_diff(p1p0, q1q0);
720 abs_p1q1 = _mm_srli_si128(abs_p0q0, 4);
721
722 flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
723
724 hev = _mm_subs_epu8(flat, *thresh);
725 hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
726 // replicate for the further "merged variables" usage
727 hev = _mm_unpacklo_epi32(hev, hev);
728
729 abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
730 abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
731 mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit);
732 mask = _mm_unpacklo_epi32(mask, zero);
733 mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
734 // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
735 mask = _mm_max_epu8(abs_p1p0, mask);
736 // mask |= (abs(p1 - p0) > limit) * -1;
737 // mask |= (abs(q1 - q0) > limit) * -1;
738
739 work = _mm_max_epu8(abs_diff(*q2p2, *q1p1), abs_diff(*q3p3, *q2p2));
740 mask = _mm_max_epu8(work, mask);
741 mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 4));
742 mask = _mm_subs_epu8(mask, *limit);
743 mask = _mm_cmpeq_epi8(mask, zero);
744 }
745
746 // lp filter - the same for 6, 8 and 14 versions
747 filter4_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0);
748 qs0ps0 = _mm_unpacklo_epi32(ps1ps0, qs1qs0);
749 qs1ps1 = _mm_srli_si128(qs0ps0, 8);
750 // loopfilter done
751
752 flat = _mm_max_epu8(abs_diff(*q2p2, *q0p0), abs_diff(*q3p3, *q0p0));
753 flat = _mm_max_epu8(abs_p1p0, flat);
754 flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4));
755 flat = _mm_subs_epu8(flat, one);
756 flat = _mm_cmpeq_epi8(flat, zero);
757 flat = _mm_and_si128(flat, mask);
758 flat = _mm_unpacklo_epi32(flat, flat);
759 flat = _mm_unpacklo_epi64(flat, flat);
760
  // If flat == 0 then flat2 is zero as well and none of the calculations below
  // are needed.
  // SSE4.1 equivalent: if (0 == _mm_test_all_zeros(flat, ff))
763 if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
764 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
765 // flat and wide flat calculations
766 __m128i q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
767 __m128i pq_16[7];
768 const __m128i eight = _mm_set1_epi16(8);
769 const __m128i four = _mm_set1_epi16(4);
770 __m128i sum_p6;
771 __m128i sum_p3;
772
773 pq_16[0] = _mm_unpacklo_epi8(*q0p0, zero);
774 pq_16[1] = _mm_unpacklo_epi8(*q1p1, zero);
775 pq_16[2] = _mm_unpacklo_epi8(*q2p2, zero);
776 pq_16[3] = _mm_unpacklo_epi8(*q3p3, zero);
777 pq_16[4] = _mm_unpacklo_epi8(*q4p4, zero);
778 pq_16[5] = _mm_unpacklo_epi8(*q5p5, zero);
779 pq_16[6] = _mm_unpacklo_epi8(*q6p6, zero);
780 q0_16 = _mm_srli_si128(pq_16[0], 8);
781 q1_16 = _mm_srli_si128(pq_16[1], 8);
782 q2_16 = _mm_srli_si128(pq_16[2], 8);
783 q3_16 = _mm_srli_si128(pq_16[3], 8);
784 q4_16 = _mm_srli_si128(pq_16[4], 8);
785 q5_16 = _mm_srli_si128(pq_16[5], 8);
786
787 __m128i flat_p[3], flat_q[3];
788 __m128i flat2_p[6], flat2_q[6];
789
790 __m128i work0, work0_0, work0_1, sum_p_0;
791 __m128i sum_p = _mm_add_epi16(pq_16[5], _mm_add_epi16(pq_16[4], pq_16[3]));
792 __m128i sum_lp = _mm_add_epi16(pq_16[0], _mm_add_epi16(pq_16[2], pq_16[1]));
793 sum_p = _mm_add_epi16(sum_p, sum_lp);
794
795 __m128i sum_lq = _mm_srli_si128(sum_lp, 8);
796 __m128i sum_q = _mm_srli_si128(sum_p, 8);
797
798 sum_p_0 = _mm_add_epi16(eight, _mm_add_epi16(sum_p, sum_q));
799 sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq));
800
801 flat_p[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(pq_16[3], pq_16[0]));
802 flat_q[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(q3_16, q0_16));
803
804 sum_p6 = _mm_add_epi16(pq_16[6], pq_16[6]);
805 sum_p3 = _mm_add_epi16(pq_16[3], pq_16[3]);
806
807 sum_q = _mm_sub_epi16(sum_p_0, pq_16[5]);
808 sum_p = _mm_sub_epi16(sum_p_0, q5_16);
809
810 work0_0 = _mm_add_epi16(_mm_add_epi16(pq_16[6], pq_16[0]), pq_16[1]);
811 work0_1 = _mm_add_epi16(
812 sum_p6, _mm_add_epi16(pq_16[1], _mm_add_epi16(pq_16[2], pq_16[0])));
813
814 sum_lq = _mm_sub_epi16(sum_lp, pq_16[2]);
815 sum_lp = _mm_sub_epi16(sum_lp, q2_16);
816
817 work0 = _mm_add_epi16(sum_p3, pq_16[1]);
818 flat_p[1] = _mm_add_epi16(sum_lp, work0);
819 flat_q[1] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8));
820
821 flat_pq[0] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[0], flat_q[0]), 3);
822 flat_pq[1] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[1], flat_q[1]), 3);
823 flat_pq[0] = _mm_packus_epi16(flat_pq[0], flat_pq[0]);
824 flat_pq[1] = _mm_packus_epi16(flat_pq[1], flat_pq[1]);
825
826 sum_lp = _mm_sub_epi16(sum_lp, q1_16);
827 sum_lq = _mm_sub_epi16(sum_lq, pq_16[1]);
828
829 sum_p3 = _mm_add_epi16(sum_p3, pq_16[3]);
830 work0 = _mm_add_epi16(sum_p3, pq_16[2]);
831
832 flat_p[2] = _mm_add_epi16(sum_lp, work0);
833 flat_q[2] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8));
834 flat_pq[2] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[2], flat_q[2]), 3);
835 flat_pq[2] = _mm_packus_epi16(flat_pq[2], flat_pq[2]);
836
837 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~ flat 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
838 flat2 = _mm_max_epu8(abs_diff(*q4p4, *q0p0), abs_diff(*q5p5, *q0p0));
839
840 work = abs_diff(*q6p6, *q0p0);
841 flat2 = _mm_max_epu8(work, flat2);
842 flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 4));
843 flat2 = _mm_subs_epu8(flat2, one);
844 flat2 = _mm_cmpeq_epi8(flat2, zero);
845 flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
846 flat2 = _mm_unpacklo_epi32(flat2, flat2);
847
848 // ~~~~~~~~~~ apply flat ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
849 qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
850 flat_pq[0] = _mm_and_si128(flat, flat_pq[0]);
851 *q0p0 = _mm_or_si128(qs0ps0, flat_pq[0]);
852
853 qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
854 flat_pq[1] = _mm_and_si128(flat, flat_pq[1]);
855 *q1p1 = _mm_or_si128(qs1ps1, flat_pq[1]);
856
857 *q2p2 = _mm_andnot_si128(flat, *q2p2);
858 flat_pq[2] = _mm_and_si128(flat, flat_pq[2]);
859 *q2p2 = _mm_or_si128(*q2p2, flat_pq[2]);
860
861 if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat2, zero))) {
862 flat2_p[0] = _mm_add_epi16(sum_p_0, _mm_add_epi16(work0_0, q0_16));
863 flat2_q[0] = _mm_add_epi16(
864 sum_p_0, _mm_add_epi16(_mm_srli_si128(work0_0, 8), pq_16[0]));
865
866 flat2_p[1] = _mm_add_epi16(sum_p, work0_1);
867 flat2_q[1] = _mm_add_epi16(sum_q, _mm_srli_si128(work0_1, 8));
868
869 flat2_pq[0] =
870 _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[0], flat2_q[0]), 4);
871 flat2_pq[1] =
872 _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[1], flat2_q[1]), 4);
873 flat2_pq[0] = _mm_packus_epi16(flat2_pq[0], flat2_pq[0]);
874 flat2_pq[1] = _mm_packus_epi16(flat2_pq[1], flat2_pq[1]);
875
876 sum_p = _mm_sub_epi16(sum_p, q4_16);
877 sum_q = _mm_sub_epi16(sum_q, pq_16[4]);
878
879 sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]);
880 work0 = _mm_add_epi16(
881 sum_p6, _mm_add_epi16(pq_16[2], _mm_add_epi16(pq_16[3], pq_16[1])));
882 flat2_p[2] = _mm_add_epi16(sum_p, work0);
883 flat2_q[2] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
884 flat2_pq[2] =
885 _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[2], flat2_q[2]), 4);
886 flat2_pq[2] = _mm_packus_epi16(flat2_pq[2], flat2_pq[2]);
887
888 sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]);
889 sum_p = _mm_sub_epi16(sum_p, q3_16);
890 sum_q = _mm_sub_epi16(sum_q, pq_16[3]);
891
892 work0 = _mm_add_epi16(
893 sum_p6, _mm_add_epi16(pq_16[3], _mm_add_epi16(pq_16[4], pq_16[2])));
894 flat2_p[3] = _mm_add_epi16(sum_p, work0);
895 flat2_q[3] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
896 flat2_pq[3] =
897 _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[3], flat2_q[3]), 4);
898 flat2_pq[3] = _mm_packus_epi16(flat2_pq[3], flat2_pq[3]);
899
900 sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]);
901 sum_p = _mm_sub_epi16(sum_p, q2_16);
902 sum_q = _mm_sub_epi16(sum_q, pq_16[2]);
903
904 work0 = _mm_add_epi16(
905 sum_p6, _mm_add_epi16(pq_16[4], _mm_add_epi16(pq_16[5], pq_16[3])));
906 flat2_p[4] = _mm_add_epi16(sum_p, work0);
907 flat2_q[4] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
908 flat2_pq[4] =
909 _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[4], flat2_q[4]), 4);
910 flat2_pq[4] = _mm_packus_epi16(flat2_pq[4], flat2_pq[4]);
911
912 sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]);
913 sum_p = _mm_sub_epi16(sum_p, q1_16);
914 sum_q = _mm_sub_epi16(sum_q, pq_16[1]);
915
916 work0 = _mm_add_epi16(
917 sum_p6, _mm_add_epi16(pq_16[5], _mm_add_epi16(pq_16[6], pq_16[4])));
918 flat2_p[5] = _mm_add_epi16(sum_p, work0);
919 flat2_q[5] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
920 flat2_pq[5] =
921 _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[5], flat2_q[5]), 4);
922 flat2_pq[5] = _mm_packus_epi16(flat2_pq[5], flat2_pq[5]);
923
924 // wide flat
925 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
926
927 *q0p0 = _mm_andnot_si128(flat2, *q0p0);
928 flat2_pq[0] = _mm_and_si128(flat2, flat2_pq[0]);
929 *q0p0 = _mm_or_si128(*q0p0, flat2_pq[0]);
930
931 *q1p1 = _mm_andnot_si128(flat2, *q1p1);
932 flat2_pq[1] = _mm_and_si128(flat2, flat2_pq[1]);
933 *q1p1 = _mm_or_si128(*q1p1, flat2_pq[1]);
934
935 *q2p2 = _mm_andnot_si128(flat2, *q2p2);
936 flat2_pq[2] = _mm_and_si128(flat2, flat2_pq[2]);
937 *q2p2 = _mm_or_si128(*q2p2, flat2_pq[2]);
938
939 *q3p3 = _mm_andnot_si128(flat2, *q3p3);
940 flat2_pq[3] = _mm_and_si128(flat2, flat2_pq[3]);
941 *q3p3 = _mm_or_si128(*q3p3, flat2_pq[3]);
942
943 *q4p4 = _mm_andnot_si128(flat2, *q4p4);
944 flat2_pq[4] = _mm_and_si128(flat2, flat2_pq[4]);
945 *q4p4 = _mm_or_si128(*q4p4, flat2_pq[4]);
946
947 *q5p5 = _mm_andnot_si128(flat2, *q5p5);
948 flat2_pq[5] = _mm_and_si128(flat2, flat2_pq[5]);
949 *q5p5 = _mm_or_si128(*q5p5, flat2_pq[5]);
950 }
951 } else {
952 *q0p0 = qs0ps0;
953 *q1p1 = qs1ps1;
954 }
955 }
956
void aom_lpf_horizontal_14_sse2(unsigned char *s, int p,
958 const unsigned char *_blimit,
959 const unsigned char *_limit,
960 const unsigned char *_thresh) {
961 __m128i q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0;
962 __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
963 __m128i limit = _mm_load_si128((const __m128i *)_limit);
964 __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
965
966 q4p4 = _mm_unpacklo_epi32(xx_loadl_32(s - 5 * p), xx_loadl_32(s + 4 * p));
967 q3p3 = _mm_unpacklo_epi32(xx_loadl_32(s - 4 * p), xx_loadl_32(s + 3 * p));
968 q2p2 = _mm_unpacklo_epi32(xx_loadl_32(s - 3 * p), xx_loadl_32(s + 2 * p));
969 q1p1 = _mm_unpacklo_epi32(xx_loadl_32(s - 2 * p), xx_loadl_32(s + 1 * p));
970
971 q0p0 = _mm_unpacklo_epi32(xx_loadl_32(s - 1 * p), xx_loadl_32(s - 0 * p));
972
973 q5p5 = _mm_unpacklo_epi32(xx_loadl_32(s - 6 * p), xx_loadl_32(s + 5 * p));
974
975 q6p6 = _mm_unpacklo_epi32(xx_loadl_32(s - 7 * p), xx_loadl_32(s + 6 * p));
976
977 lpf_internal_14_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, &blimit,
978 &limit, &thresh);
979
980 store_buffer_horz_8(q0p0, p, 0, s);
981 store_buffer_horz_8(q1p1, p, 1, s);
982 store_buffer_horz_8(q2p2, p, 2, s);
983 store_buffer_horz_8(q3p3, p, 3, s);
984 store_buffer_horz_8(q4p4, p, 4, s);
985 store_buffer_horz_8(q5p5, p, 5, s);
986 }
987
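// Loop filter for the 6-tap (p2..q2) case, eight pixels at a time. On return
// *q1q0 packs the filtered q0 (low half) and q1 (high half); *p1p0 likewise
// packs p0 and p1.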
static AOM_FORCE_INLINE void lpf_internal_6_dual_sse2(
989 __m128i *p2, __m128i *q2, __m128i *p1, __m128i *q1, __m128i *p0,
990 __m128i *q0, __m128i *q1q0, __m128i *p1p0, __m128i *blimit, __m128i *limit,
991 __m128i *thresh) {
992 const __m128i zero = _mm_setzero_si128();
993 __m128i mask, hev, flat;
994 __m128i q2p2, q1p1, q0p0, flat_p1p0, flat_q0q1;
995 __m128i p2_16, q2_16, p1_16, q1_16, p0_16, q0_16;
996 __m128i ps1ps0, qs1qs0;
997
998 q2p2 = _mm_unpacklo_epi64(*p2, *q2);
999 q1p1 = _mm_unpacklo_epi64(*p1, *q1);
1000 q0p0 = _mm_unpacklo_epi64(*p0, *q0);
1001
1002 *p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
1003 *q1q0 = _mm_unpackhi_epi64(q0p0, q1p1);
1004
1005 const __m128i one = _mm_set1_epi8(1);
1006 const __m128i fe = _mm_set1_epi8((char)0xfe);
1007 const __m128i ff = _mm_cmpeq_epi8(fe, fe);
1008
1009 {
1010 // filter_mask and hev_mask
1011 __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
1012 abs_p1p0 = abs_diff(q1p1, q0p0);
1013 abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
1014
1015 abs_p0q0 = abs_diff(*p1p0, *q1q0);
1016 abs_p1q1 = _mm_srli_si128(abs_p0q0, 8);
1017 abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, zero);
1018
    // Since SSE2 has no unsigned byte comparison, the idea is to detect
    // whether any value exceeds its limit by taking the maximum over all the
    // abs(x - y) inputs and over (abs(p0 - q0) * 2 + abs(p1 - q1) / 2). If the
    // maximum is greater than the limit, the corresponding mask bits are set;
    // otherwise they are not.
1025
1026 flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
1027 hev = _mm_subs_epu8(flat, *thresh);
1028 hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
1029 // replicate for the further "merged variables" usage
1030 hev = _mm_unpacklo_epi64(hev, hev);
1031
1032 abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
1033 abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
1034 mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit);
1035 mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
1036 // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
1037 mask = _mm_max_epu8(abs_p1p0, mask);
1038 // mask |= (abs(p1 - p0) > limit) * -1;
1039 // mask |= (abs(q1 - q0) > limit) * -1;
1040
1041 work = abs_diff(q2p2, q1p1);
1042 mask = _mm_max_epu8(work, mask);
1043 mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
1044 mask = _mm_subs_epu8(mask, *limit);
1045 mask = _mm_cmpeq_epi8(mask, zero);
1046
1047 // lp filter - the same for 6, 8 and 14 versions
1048 filter4_dual_sse2(p1p0, q1q0, &hev, &mask, q1q0, p1p0);
1049
1050 // flat_mask
1051 flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_p1p0);
1052 flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
1053 flat = _mm_subs_epu8(flat, one);
1054 flat = _mm_cmpeq_epi8(flat, zero);
1055 flat = _mm_and_si128(flat, mask);
1056 // replicate for the further "merged variables" usage
1057 flat = _mm_unpacklo_epi64(flat, flat);
1058 }
1059
  // 5-tap filter, needed only if flat != 0.
1062 if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
1063 const __m128i four = _mm_set1_epi16(4);
1064 __m128i workp_a, workp_b, workp_shft0, workp_shft1;
1065 p2_16 = _mm_unpacklo_epi8(*p2, zero);
1066 p1_16 = _mm_unpacklo_epi8(*p1, zero);
1067 p0_16 = _mm_unpacklo_epi8(*p0, zero);
1068 q0_16 = _mm_unpacklo_epi8(*q0, zero);
1069 q1_16 = _mm_unpacklo_epi8(*q1, zero);
1070 q2_16 = _mm_unpacklo_epi8(*q2, zero);
1071
1072 // op1
1073 workp_a = _mm_add_epi16(_mm_add_epi16(p0_16, p0_16),
1074 _mm_add_epi16(p1_16, p1_16)); // p0 *2 + p1 * 2
1075 workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four),
1076 p2_16); // p2 + p0 * 2 + p1 * 2 + 4
1077
1078 workp_b = _mm_add_epi16(_mm_add_epi16(p2_16, p2_16), q0_16);
1079 workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b),
1080 3); // p2 * 3 + p1 * 2 + p0 * 2 + q0 + 4
1081
1082 // op0
1083 workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, q0_16), q1_16); // q0 * 2 + q1
1084 workp_a = _mm_add_epi16(workp_a,
1085 workp_b); // p2 + p0 * 2 + p1 * 2 + q0 * 2 + q1 + 4
1086 workp_shft1 = _mm_srli_epi16(workp_a, 3);
1087
1088 flat_p1p0 = _mm_packus_epi16(workp_shft1, workp_shft0);
1089
1090 // oq0
1091 workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, p2_16),
1092 p1_16); // p0 * 2 + p1 + q0 * 2 + q1 + 4
1093 workp_b = _mm_add_epi16(q1_16, q2_16);
1094 workp_a = _mm_add_epi16(
1095 workp_a, workp_b); // p0 * 2 + p1 + q0 * 2 + q1 * 2 + q2 + 4
1096 workp_shft0 = _mm_srli_epi16(workp_a, 3);
1097
1098 // oq1
1099 workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, p1_16),
1100 p0_16); // p0 + q0 * 2 + q1 * 2 + q2 + 4
1101 workp_b = _mm_add_epi16(q2_16, q2_16);
1102 workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b),
1103 3); // p0 + q0 * 2 + q1 * 2 + q2 * 3 + 4
1104
1105 flat_q0q1 = _mm_packus_epi16(workp_shft0, workp_shft1);
1106
1107 qs1qs0 = _mm_andnot_si128(flat, *q1q0);
1108 *q1q0 = _mm_and_si128(flat, flat_q0q1);
1109 *q1q0 = _mm_or_si128(qs1qs0, *q1q0);
1110
1111 ps1ps0 = _mm_andnot_si128(flat, *p1p0);
1112 *p1p0 = _mm_and_si128(flat, flat_p1p0);
1113 *p1p0 = _mm_or_si128(ps1ps0, *p1p0);
1114 }
1115 }
1116
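// Four-pixel variant of the 6-tap (p2..q2) filter, with the p/q rows packed
// into 32-bit lanes.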
static AOM_FORCE_INLINE void lpf_internal_6_sse2(
1118 __m128i *p2, __m128i *q2, __m128i *p1, __m128i *q1, __m128i *p0,
1119 __m128i *q0, __m128i *q1q0, __m128i *p1p0, __m128i *blimit, __m128i *limit,
1120 __m128i *thresh) {
1121 const __m128i zero = _mm_setzero_si128();
1122 __m128i mask, hev, flat;
1123 __m128i q2p2, q1p1, q0p0, flat_p1p0, flat_q0q1;
1124 __m128i pq2_16, q2_16, pq1_16, pq0_16, q0_16;
1125 __m128i ps1ps0, qs1qs0;
1126
1127 q2p2 = _mm_unpacklo_epi32(*p2, *q2);
1128 q1p1 = _mm_unpacklo_epi32(*p1, *q1);
1129 q0p0 = _mm_unpacklo_epi32(*p0, *q0);
1130
1131 *p1p0 = _mm_unpacklo_epi32(*p0, *p1);
1132 *q1q0 = _mm_unpacklo_epi32(*q0, *q1);
1133
1134 const __m128i one = _mm_set1_epi8(1);
1135 const __m128i fe = _mm_set1_epi8((char)0xfe);
1136 const __m128i ff = _mm_cmpeq_epi8(fe, fe);
1137 {
1138 // filter_mask and hev_mask
1139 __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
1140 abs_p1p0 = abs_diff(q1p1, q0p0);
1141 abs_q1q0 = _mm_srli_si128(abs_p1p0, 4);
1142
1143 abs_p0q0 = abs_diff(*p1p0, *q1q0);
1144 abs_p1q1 = _mm_srli_si128(abs_p0q0, 4);
1145
    // Since SSE2 has no unsigned byte comparison, the idea is to detect
    // whether any value exceeds its limit by taking the maximum over all the
    // abs(x - y) inputs and over (abs(p0 - q0) * 2 + abs(p1 - q1) / 2). If the
    // maximum is greater than the limit, the corresponding mask bits are set;
    // otherwise they are not.
1152
1153 flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
1154 hev = _mm_subs_epu8(flat, *thresh);
1155 hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
1156 // replicate for the further "merged variables" usage
1157 hev = _mm_unpacklo_epi32(hev, hev);
1158
1159 abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
1160 abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
1161 mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit);
1162 mask = _mm_unpacklo_epi32(mask, zero);
1163 mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
1164 // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
1165 mask = _mm_max_epu8(abs_p1p0, mask);
1166 // mask |= (abs(p1 - p0) > limit) * -1;
1167 // mask |= (abs(q1 - q0) > limit) * -1;
1168
1169 work = abs_diff(q2p2, q1p1);
1170 mask = _mm_max_epu8(work, mask);
1171 mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 4));
1172 mask = _mm_subs_epu8(mask, *limit);
1173 mask = _mm_cmpeq_epi8(mask, zero);
1174
1175 // lp filter - the same for 6, 8 and 14 versions
1176 filter4_sse2(p1p0, q1q0, &hev, &mask, q1q0, p1p0);
1177
1178 // flat_mask
1179 flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_p1p0);
1180 flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4));
1181 flat = _mm_subs_epu8(flat, one);
1182 flat = _mm_cmpeq_epi8(flat, zero);
1183 flat = _mm_and_si128(flat, mask);
1184 // replicate for the further "merged variables" usage
1185 flat = _mm_unpacklo_epi32(flat, flat);
1186 flat = _mm_unpacklo_epi64(flat, flat);
1187 }
1188
  // 5-tap filter, needed only if flat != 0.
1191 if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
1192 const __m128i four = _mm_set1_epi16(4);
1193 __m128i workp_a, workp_b, workp_c;
1194 __m128i pq0x2_pq1, pq1_pq2;
1195 pq2_16 = _mm_unpacklo_epi8(q2p2, zero);
1196 pq1_16 = _mm_unpacklo_epi8(q1p1, zero);
1197 pq0_16 = _mm_unpacklo_epi8(q0p0, zero);
1198 q0_16 = _mm_srli_si128(pq0_16, 8);
1199 q2_16 = _mm_srli_si128(pq2_16, 8);
1200
1201 // op1
1202 pq0x2_pq1 =
1203 _mm_add_epi16(_mm_add_epi16(pq0_16, pq0_16), pq1_16); // p0 *2 + p1
1204 pq1_pq2 = _mm_add_epi16(pq1_16, pq2_16); // p1 + p2
1205 workp_a = _mm_add_epi16(_mm_add_epi16(pq0x2_pq1, four),
1206 pq1_pq2); // p2 + p0 * 2 + p1 * 2 + 4
1207
1208 workp_b = _mm_add_epi16(_mm_add_epi16(pq2_16, pq2_16), q0_16);
1209 workp_b =
1210 _mm_add_epi16(workp_a, workp_b); // p2 * 3 + p1 * 2 + p0 * 2 + q0 + 4
1211
1212 // op0
1213 workp_c = _mm_srli_si128(pq0x2_pq1, 8); // q0 * 2 + q1
1214 workp_a = _mm_add_epi16(workp_a,
1215 workp_c); // p2 + p0 * 2 + p1 * 2 + q0 * 2 + q1 + 4
1216 workp_b = _mm_unpacklo_epi64(workp_a, workp_b);
1217 workp_b = _mm_srli_epi16(workp_b, 3);
1218
1219 flat_p1p0 = _mm_packus_epi16(workp_b, workp_b);
1220
1221 // oq0
1222 workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq2_16),
1223 pq1_16); // p0 * 2 + p1 + q0 * 2 + q1 + 4
1224 workp_b = _mm_srli_si128(pq1_pq2, 8);
1225 workp_a = _mm_add_epi16(
1226 workp_a, workp_b); // p0 * 2 + p1 + q0 * 2 + q1 * 2 + q2 + 4
1227 // workp_shft0 = _mm_srli_epi16(workp_a, 3);
1228
1229 // oq1
1230 workp_c = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq1_16),
1231 pq0_16); // p0 + q0 * 2 + q1 * 2 + q2 + 4
1232 workp_b = _mm_add_epi16(q2_16, q2_16);
1233 workp_b =
1234 _mm_add_epi16(workp_c, workp_b); // p0 + q0 * 2 + q1 * 2 + q2 * 3 + 4
1235
1236 workp_a = _mm_unpacklo_epi64(workp_a, workp_b);
1237 workp_a = _mm_srli_epi16(workp_a, 3);
1238
1239 flat_q0q1 = _mm_packus_epi16(workp_a, workp_a);
1240
1241 qs1qs0 = _mm_andnot_si128(flat, *q1q0);
1242 *q1q0 = _mm_and_si128(flat, flat_q0q1);
1243 *q1q0 = _mm_or_si128(qs1qs0, *q1q0);
1244
1245 ps1ps0 = _mm_andnot_si128(flat, *p1p0);
1246 *p1p0 = _mm_and_si128(flat, flat_p1p0);
1247 *p1p0 = _mm_or_si128(ps1ps0, *p1p0);
1248 }
1249 }
1250
void aom_lpf_horizontal_6_sse2(unsigned char *s, int p,
1252 const unsigned char *_blimit,
1253 const unsigned char *_limit,
1254 const unsigned char *_thresh) {
1255 __m128i p2, p1, p0, q0, q1, q2;
1256 __m128i p1p0, q1q0;
1257 __m128i blimit = _mm_load_si128((__m128i *)_blimit);
1258 __m128i limit = _mm_load_si128((__m128i *)_limit);
1259 __m128i thresh = _mm_load_si128((__m128i *)_thresh);
1260
1261 p2 = xx_loadl_32(s - 3 * p);
1262 p1 = xx_loadl_32(s - 2 * p);
1263 p0 = xx_loadl_32(s - 1 * p);
1264 q0 = xx_loadl_32(s - 0 * p);
1265 q1 = xx_loadl_32(s + 1 * p);
1266 q2 = xx_loadl_32(s + 2 * p);
1267
1268 lpf_internal_6_sse2(&p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, &blimit,
1269 &limit, &thresh);
1270
1271 xx_storel_32(s - 1 * p, p1p0);
1272 xx_storel_32(s - 2 * p, _mm_srli_si128(p1p0, 4));
1273 xx_storel_32(s + 0 * p, q1q0);
1274 xx_storel_32(s + 1 * p, _mm_srli_si128(q1q0, 4));
1275 }
1276
void aom_lpf_horizontal_6_dual_sse2(unsigned char *s, int p,
1278 const unsigned char *_blimit0,
1279 const unsigned char *_limit0,
1280 const unsigned char *_thresh0,
1281 const unsigned char *_blimit1,
1282 const unsigned char *_limit1,
1283 const unsigned char *_thresh1) {
1284 __m128i blimit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_blimit0),
1285 _mm_load_si128((__m128i *)_blimit1));
1286 __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_limit0),
1287 _mm_load_si128((__m128i *)_limit1));
1288 __m128i thresh = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_thresh0),
1289 _mm_load_si128((__m128i *)_thresh1));
1290
1291 __m128i p2, p1, p0, q0, q1, q2;
1292 __m128i p1p0, q1q0;
1293
1294 p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
1295 p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
1296 p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
1297 q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p));
1298 q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
1299 q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p));
1300
1301 lpf_internal_6_dual_sse2(&p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, &blimit,
1302 &limit, &thresh);
1303
1304 _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0);
1305 _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8));
1306 _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0);
1307 _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8));
1308 }
1309
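// 8-tap (p3..q3) loop filter for four pixels. q1q0_out/p1p0_out return the
// filtered inner pixels, and the flat path also rewrites *p2/*q2 in place.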
static AOM_FORCE_INLINE void lpf_internal_8_sse2(
1311 __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1,
1312 __m128i *q1, __m128i *p0, __m128i *q0, __m128i *q1q0_out, __m128i *p1p0_out,
1313 __m128i *blimit, __m128i *limit, __m128i *thresh) {
1314 const __m128i zero = _mm_setzero_si128();
1315 __m128i mask, hev, flat;
1316 __m128i p2_16, q2_16, p1_16, p0_16, q0_16, q1_16, p3_16, q3_16, q3p3,
1317 flat_p1p0, flat_q0q1;
1318 __m128i q2p2, q1p1, q0p0;
1319 __m128i q1q0, p1p0, ps1ps0, qs1qs0;
1320 __m128i work_pq, opq2, pq2;
1321
1322 q3p3 = _mm_unpacklo_epi32(*p3, *q3);
1323 q2p2 = _mm_unpacklo_epi32(*p2, *q2);
1324 q1p1 = _mm_unpacklo_epi32(*p1, *q1);
1325 q0p0 = _mm_unpacklo_epi32(*p0, *q0);
1326
1327 p1p0 = _mm_unpacklo_epi32(q0p0, q1p1); // p1p0 q1q0
1328 q1q0 = _mm_srli_si128(p1p0, 8);
1329
1330 // filter_mask and hev_mask
1331
  // Since SSE2 has no unsigned byte comparison, the idea is to detect whether
  // any value exceeds its limit by taking the maximum over all the abs(x - y)
  // inputs and over (abs(p0 - q0) * 2 + abs(p1 - q1) / 2). If the maximum is
  // greater than the limit, the corresponding mask bits are set; otherwise
  // they are not.
1338
1339 const __m128i one = _mm_set1_epi8(1);
1340 const __m128i fe = _mm_set1_epi8((char)0xfe);
1341 const __m128i ff = _mm_cmpeq_epi8(fe, fe);
1342 __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
1343
1344 abs_p1p0 = abs_diff(q1p1, q0p0);
1345 abs_q1q0 = _mm_srli_si128(abs_p1p0, 4);
1346
1347 abs_p0q0 = abs_diff(p1p0, q1q0);
1348 abs_p1q1 = _mm_srli_si128(abs_p0q0, 4);
1349
1350 flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
1351 hev = _mm_subs_epu8(flat, *thresh);
1352 hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
1353 // replicate for the further "merged variables" usage
1354 hev = _mm_unpacklo_epi32(hev, hev);
1355
1356 abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
1357 abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
1358 mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit);
1359 mask = _mm_unpacklo_epi32(mask, zero);
1360 mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
1361 // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
1362 mask = _mm_max_epu8(abs_p1p0, mask);
1363 // mask |= (abs(p1 - p0) > limit) * -1;
1364 // mask |= (abs(q1 - q0) > limit) * -1;
1365
1366 work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2));
1367
1368 mask = _mm_max_epu8(work, mask);
1369 mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 4));
1370 mask = _mm_subs_epu8(mask, *limit);
1371 mask = _mm_cmpeq_epi8(mask, zero);
1372
1373 // lp filter - the same for 6, 8 and 14 versions
1374 filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out);
1375
1376 // flat_mask4
1377 flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0));
1378 flat = _mm_max_epu8(abs_p1p0, flat);
1379
1380 flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4));
1381 flat = _mm_subs_epu8(flat, one);
1382 flat = _mm_cmpeq_epi8(flat, zero);
1383 flat = _mm_and_si128(flat, mask);
1384 // replicate for the further "merged variables" usage
1385 flat = _mm_unpacklo_epi32(flat, flat);
1386 flat = _mm_unpacklo_epi64(flat, flat);
1387
  // filter8 is needed only if flat != 0.
1389 if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
1390 const __m128i four = _mm_set1_epi16(4);
1391 __m128i workp_a, workp_b, workp_c, workp_d, workp_shft1, workp_shft2;
1392 p2_16 = _mm_unpacklo_epi8(*p2, zero);
1393 p1_16 = _mm_unpacklo_epi8(*p1, zero);
1394 p0_16 = _mm_unpacklo_epi8(*p0, zero);
1395 q0_16 = _mm_unpacklo_epi8(*q0, zero);
1396 q1_16 = _mm_unpacklo_epi8(*q1, zero);
1397 q2_16 = _mm_unpacklo_epi8(*q2, zero);
1398 p3_16 = _mm_unpacklo_epi8(*p3, zero);
1399 q3_16 = _mm_unpacklo_epi8(*q3, zero);
1400
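// The flat (filter8) path below computes each output as a rounded 8-tap
// average, e.g. op2 = (3 * p3 + 2 * p2 + p1 + p0 + q0 + 4) >> 3, keeping a
// running sum in workp_a/workp_b and updating it with one add and one
// subtract per output. The final >> 3 is deferred so that two outputs can
// share a single shift and pack.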
1401 // op2
1402 workp_a =
1403 _mm_add_epi16(_mm_add_epi16(p3_16, p3_16), _mm_add_epi16(p2_16, p1_16));
1404 workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0_16);
1405 workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, p2_16), p3_16);
1406 workp_shft2 = _mm_add_epi16(workp_a, workp_b);
1407
1408 // op1
1409 workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, q1_16), p1_16);
1410 workp_c = _mm_add_epi16(workp_a, workp_b);
1411 // workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
1412
1413 // op0
1414 workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q2_16);
1415 workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1_16), p0_16);
1416 workp_d = _mm_add_epi16(workp_a, workp_b);
1417 // workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
1418
1419 workp_c = _mm_unpacklo_epi64(workp_d, workp_c);
1420 workp_c = _mm_srli_epi16(workp_c, 3);
1421 flat_p1p0 = _mm_packus_epi16(workp_c, workp_c);
1422
1423 // oq0
1424 workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q3_16);
1425 workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0_16), q0_16);
1426 // workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
1427 workp_c = _mm_add_epi16(workp_a, workp_b);
1428
1429 // oq1
1430 workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2_16), q3_16);
1431 workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0_16), q1_16);
1432 workp_d = _mm_add_epi16(workp_a, workp_b);
1433 // workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
1434
1435 workp_c = _mm_unpacklo_epi64(workp_c, workp_d);
1436 workp_c = _mm_srli_epi16(workp_c, 3);
1437 flat_q0q1 = _mm_packus_epi16(workp_c, workp_c);
1438
1439 // oq2
1440 workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1_16), q3_16);
1441 workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1_16), q2_16);
1442 workp_shft1 = _mm_add_epi16(workp_a, workp_b);
1443
1444 workp_c = _mm_unpacklo_epi64(workp_shft2, workp_shft1);
1445 workp_c = _mm_srli_epi16(workp_c, 3);
1446
1447 opq2 = _mm_packus_epi16(workp_c, workp_c);
1448
1449 work_pq = _mm_andnot_si128(flat, q2p2);
1450 pq2 = _mm_and_si128(flat, opq2);
1451 *p2 = _mm_or_si128(work_pq, pq2);
1452 *q2 = _mm_srli_si128(*p2, 4);
1453
1454 qs1qs0 = _mm_andnot_si128(flat, *q1q0_out);
1455 q1q0 = _mm_and_si128(flat, flat_q0q1);
1456 *q1q0_out = _mm_or_si128(qs1qs0, q1q0);
1457
1458 ps1ps0 = _mm_andnot_si128(flat, *p1p0_out);
1459 p1p0 = _mm_and_si128(flat, flat_p1p0);
1460 *p1p0_out = _mm_or_si128(ps1ps0, p1p0);
1461 }
1462 }
1463
1464 static AOM_FORCE_INLINE void lpf_internal_8_dual_sse2(
1465 __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1,
1466 __m128i *q1, __m128i *p0, __m128i *q0, __m128i *q1q0_out, __m128i *p1p0_out,
1467 __m128i *blimit, __m128i *limit, __m128i *thresh) {
1468 const __m128i zero = _mm_setzero_si128();
1469 __m128i mask, hev, flat;
1470 __m128i p2_16, q2_16, p1_16, p0_16, q0_16, q1_16, p3_16, q3_16, q3p3,
1471 flat_p1p0, flat_q0q1;
1472 __m128i q2p2, q1p1, q0p0;
1473 __m128i q1q0, p1p0, ps1ps0, qs1qs0;
1474 __m128i work_pq, opq2, pq2;
1475
1476 q3p3 = _mm_unpacklo_epi64(*p3, *q3);
1477 q2p2 = _mm_unpacklo_epi64(*p2, *q2);
1478 q1p1 = _mm_unpacklo_epi64(*p1, *q1);
1479 q0p0 = _mm_unpacklo_epi64(*p0, *q0);
1480
1481 p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
1482 q1q0 = _mm_unpackhi_epi64(q0p0, q1p1);
1483
1484 {
1485 // filter_mask and hev_mask
1486
1487 // SSE2 has no unsigned byte comparison, so "X > limit" is detected with
1488 // saturating arithmetic instead: take the element-wise maximum of all the
1489 // abs(x - y) inputs (and of abs(p0 - q0) * 2 + abs(p1 - q1) / 2), subtract
1490 // the limit with unsigned saturation and compare against zero. Any input
1491 // above the limit leaves a non-zero byte, which clears the corresponding
1492 // mask byte and disables the filter for that pixel.
1493
1494 const __m128i one = _mm_set1_epi8(1);
1495 const __m128i fe = _mm_set1_epi8((char)0xfe);
1496 const __m128i ff = _mm_cmpeq_epi8(fe, fe);
1497 __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
1498
1499 abs_p1p0 = abs_diff(q1p1, q0p0);
1500 abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
1501
1502 abs_p0q0 = abs_diff(p1p0, q1q0);
1503 abs_p1q1 = _mm_srli_si128(abs_p0q0, 8);
1504 abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, abs_p0q0);
1505
1506 flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
1507 hev = _mm_subs_epu8(flat, *thresh);
1508 hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
1509 // duplicate so the value also covers the lanes of the merged pq variables
1510 hev = _mm_unpacklo_epi64(hev, hev);
1511
1512 abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
1513 abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
1514 mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit);
1515 mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
1516 // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
1517 mask = _mm_max_epu8(abs_p1p0, mask);
1518 // mask |= (abs(p1 - p0) > limit) * -1;
1519 // mask |= (abs(q1 - q0) > limit) * -1;
1520
1521 work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2));
1522
1523 mask = _mm_max_epu8(work, mask);
1524 mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
1525 mask = _mm_subs_epu8(mask, *limit);
1526 mask = _mm_cmpeq_epi8(mask, zero);
1527
1528 // loop filter - shared by the 6-, 8- and 14-tap versions
1529 filter4_dual_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out);
1530
1531 // flat_mask4
1532 flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0));
1533 flat = _mm_max_epu8(abs_p1p0, flat);
1534
1535 flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
1536 flat = _mm_subs_epu8(flat, one);
1537 flat = _mm_cmpeq_epi8(flat, zero);
1538 flat = _mm_and_si128(flat, mask);
1539 // duplicate so the value also covers the lanes of the merged pq variables
1540 flat = _mm_unpacklo_epi64(flat, flat);
1541 }
1542
1543 // filter8 is applied only where flat != 0
1544 if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
1545 const __m128i four = _mm_set1_epi16(4);
1546
1547 __m128i workp_a, workp_b, workp_shft0, workp_shft1, workp_shft2;
1548 p2_16 = _mm_unpacklo_epi8(*p2, zero);
1549 p1_16 = _mm_unpacklo_epi8(*p1, zero);
1550 p0_16 = _mm_unpacklo_epi8(*p0, zero);
1551 q0_16 = _mm_unpacklo_epi8(*q0, zero);
1552 q1_16 = _mm_unpacklo_epi8(*q1, zero);
1553 q2_16 = _mm_unpacklo_epi8(*q2, zero);
1554 p3_16 = _mm_unpacklo_epi8(*p3, zero);
1555 q3_16 = _mm_unpacklo_epi8(*q3, zero);
1556
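// Same running-sum scheme as lpf_internal_8_sse2(), but with all 8 lanes in
// use each sum is shifted right by 3 as soon as it is complete.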
1557 // op2
1558 workp_a =
1559 _mm_add_epi16(_mm_add_epi16(p3_16, p3_16), _mm_add_epi16(p2_16, p1_16));
1560 workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0_16);
1561 workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, p2_16), p3_16);
1562 workp_shft2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
1563
1564 // op1
1565 workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, q1_16), p1_16);
1566 workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
1567
1568 // op0
1569 workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q2_16);
1570 workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1_16), p0_16);
1571 workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
1572
1573 flat_p1p0 = _mm_packus_epi16(workp_shft1, workp_shft0);
1574
1575 // oq0
1576 workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q3_16);
1577 workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0_16), q0_16);
1578 workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
1579
1580 // oq1
1581 workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2_16), q3_16);
1582 workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0_16), q1_16);
1583 workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
1584
1585 flat_q0q1 = _mm_packus_epi16(workp_shft0, workp_shft1);
1586
1587 // oq2
1588 workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1_16), q3_16);
1589 workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1_16), q2_16);
1590 workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
1591
1592 opq2 = _mm_packus_epi16(workp_shft2, workp_shft1);
1593
1594 work_pq = _mm_andnot_si128(flat, q2p2);
1595 pq2 = _mm_and_si128(flat, opq2);
1596 *p2 = _mm_or_si128(work_pq, pq2);
1597 *q2 = _mm_srli_si128(*p2, 8);
1598
1599 qs1qs0 = _mm_andnot_si128(flat, *q1q0_out);
1600 q1q0 = _mm_and_si128(flat, flat_q0q1);
1601 *q1q0_out = _mm_or_si128(qs1qs0, q1q0);
1602
1603 ps1ps0 = _mm_andnot_si128(flat, *p1p0_out);
1604 p1p0 = _mm_and_si128(flat, flat_p1p0);
1605 *p1p0_out = _mm_or_si128(ps1ps0, p1p0);
1606 }
1607 }
1608
1609 void aom_lpf_horizontal_8_sse2(unsigned char *s, int p,
1610 const unsigned char *_blimit,
1611 const unsigned char *_limit,
1612 const unsigned char *_thresh) {
1613 __m128i p3, p2, p1, p0, q0, q1, q2, q3;
1614 __m128i q1q0, p1p0;
1615 __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
1616 __m128i limit = _mm_load_si128((const __m128i *)_limit);
1617 __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
1618
1619 p3 = xx_loadl_32(s - 4 * p);
1620 p2 = xx_loadl_32(s - 3 * p);
1621 p1 = xx_loadl_32(s - 2 * p);
1622 p0 = xx_loadl_32(s - 1 * p);
1623 q0 = xx_loadl_32(s - 0 * p);
1624 q1 = xx_loadl_32(s + 1 * p);
1625 q2 = xx_loadl_32(s + 2 * p);
1626 q3 = xx_loadl_32(s + 3 * p);
1627
1628 lpf_internal_8_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0,
1629 &blimit, &limit, &thresh);
1630
1631 xx_storel_32(s - 1 * p, p1p0);
1632 xx_storel_32(s - 2 * p, _mm_srli_si128(p1p0, 4));
1633 xx_storel_32(s + 0 * p, q1q0);
1634 xx_storel_32(s + 1 * p, _mm_srli_si128(q1q0, 4));
1635 xx_storel_32(s - 3 * p, p2);
1636 xx_storel_32(s + 2 * p, q2);
1637 }
1638
1639 void aom_lpf_horizontal_14_dual_sse2(unsigned char *s, int p,
1640 const unsigned char *_blimit0,
1641 const unsigned char *_limit0,
1642 const unsigned char *_thresh0,
1643 const unsigned char *_blimit1,
1644 const unsigned char *_limit1,
1645 const unsigned char *_thresh1) {
1646 __m128i q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0;
1647 __m128i blimit =
1648 _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0),
1649 _mm_load_si128((const __m128i *)_blimit1));
1650 __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0),
1651 _mm_load_si128((const __m128i *)_limit1));
1652 __m128i thresh =
1653 _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_thresh0),
1654 _mm_load_si128((const __m128i *)_thresh1));
1655
1656 q4p4 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 5 * p)),
1657 _mm_loadl_epi64((__m128i *)(s + 4 * p)));
1658 q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * p)),
1659 _mm_loadl_epi64((__m128i *)(s + 3 * p)));
1660 q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)),
1661 _mm_loadl_epi64((__m128i *)(s + 2 * p)));
1662 q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
1663 _mm_loadl_epi64((__m128i *)(s + 1 * p)));
1664
1665 q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
1666 _mm_loadl_epi64((__m128i *)(s - 0 * p)));
1667
1668 q5p5 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 6 * p)),
1669 _mm_loadl_epi64((__m128i *)(s + 5 * p)));
1670
1671 q6p6 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 7 * p)),
1672 _mm_loadl_epi64((__m128i *)(s + 6 * p)));
1673
1674 lpf_internal_14_dual_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0,
1675 &blimit, &limit, &thresh);
1676
1677 _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0);
1678 _mm_storel_epi64((__m128i *)(s + 0 * p), _mm_srli_si128(q0p0, 8));
1679 _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1);
1680 _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1p1, 8));
1681 _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2);
1682 _mm_storel_epi64((__m128i *)(s + 2 * p), _mm_srli_si128(q2p2, 8));
1683 _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3);
1684 _mm_storel_epi64((__m128i *)(s + 3 * p), _mm_srli_si128(q3p3, 8));
1685 _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4);
1686 _mm_storel_epi64((__m128i *)(s + 4 * p), _mm_srli_si128(q4p4, 8));
1687 _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5);
1688 _mm_storel_epi64((__m128i *)(s + 5 * p), _mm_srli_si128(q5p5, 8));
1689 }
1690
1691 void aom_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
1692 const uint8_t *_limit0,
1693 const uint8_t *_thresh0,
1694 const uint8_t *_blimit1,
1695 const uint8_t *_limit1,
1696 const uint8_t *_thresh1) {
1697 __m128i blimit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_blimit0),
1698 _mm_load_si128((__m128i *)_blimit1));
1699 __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_limit0),
1700 _mm_load_si128((__m128i *)_limit1));
1701 __m128i thresh = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_thresh0),
1702 _mm_load_si128((__m128i *)_thresh1));
1703
1704 __m128i p2, p1, p0, q0, q1, q2, p3, q3;
1705 __m128i q1q0, p1p0;
1706
1707 p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
1708 p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
1709 p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
1710 p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
1711 q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p));
1712 q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
1713 q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p));
1714 q3 = _mm_loadl_epi64((__m128i *)(s + 3 * p));
1715
1716 lpf_internal_8_dual_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0,
1717 &blimit, &limit, &thresh);
1718
1719 _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0);
1720 _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8));
1721 _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0);
1722 _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8));
1723 _mm_storel_epi64((__m128i *)(s - 3 * p), p2);
1724 _mm_storel_epi64((__m128i *)(s + 2 * p), q2);
1725 }
1726
1727 void aom_lpf_horizontal_4_dual_sse2(unsigned char *s, int p,
1728 const unsigned char *_blimit0,
1729 const unsigned char *_limit0,
1730 const unsigned char *_thresh0,
1731 const unsigned char *_blimit1,
1732 const unsigned char *_limit1,
1733 const unsigned char *_thresh1) {
1734 __m128i p1, p0, q0, q1;
1735 __m128i qs1qs0, ps1ps0;
1736
1737 p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
1738 p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
1739 q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p));
1740 q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
1741
1742 const __m128i zero = _mm_setzero_si128();
1743 const __m128i blimit =
1744 _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0),
1745 _mm_load_si128((const __m128i *)_blimit1));
1746 const __m128i limit =
1747 _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0),
1748 _mm_load_si128((const __m128i *)_limit1));
1749
1750 __m128i l = _mm_unpacklo_epi64(blimit, limit);
1751
1752 __m128i thresh0 =
1753 _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh0), zero);
1754
1755 __m128i thresh1 =
1756 _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh1), zero);
1757
1758 __m128i t = _mm_unpacklo_epi64(thresh0, thresh1);
1759
1760 lpf_internal_4_dual_sse2(&p1, &p0, &q0, &q1, &l, &t, &qs1qs0, &ps1ps0);
1761
1762 _mm_storel_epi64((__m128i *)(s - 1 * p), ps1ps0);
1763 _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(ps1ps0, 8));
1764 _mm_storel_epi64((__m128i *)(s + 0 * p), qs1qs0);
1765 _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(qs1qs0, 8));
1766 }
1767
1768 void aom_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
1769 const uint8_t *_limit0,
1770 const uint8_t *_thresh0,
1771 const uint8_t *_blimit1,
1772 const uint8_t *_limit1,
1773 const uint8_t *_thresh1) {
1774 __m128i p0, q0, q1, p1;
1775 __m128i x0, x1, x2, x3, x4, x5, x6, x7;
1776 __m128i d0, d1, d2, d3, d4, d5, d6, d7;
1777 __m128i qs1qs0, ps1ps0;
1778
1779 const __m128i zero = _mm_setzero_si128();
1780 const __m128i blimit =
1781 _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0),
1782 _mm_load_si128((const __m128i *)_blimit1));
1783 const __m128i limit =
1784 _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0),
1785 _mm_load_si128((const __m128i *)_limit1));
1786
1787 __m128i l = _mm_unpacklo_epi64(blimit, limit);
1788
1789 __m128i thresh0 =
1790 _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh0), zero);
1791
1792 __m128i thresh1 =
1793 _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh1), zero);
1794
1795 __m128i t = _mm_unpacklo_epi64(thresh0, thresh1);
1796
1797 x0 = _mm_loadl_epi64((__m128i *)((s - 2)));
1798 x1 = _mm_loadl_epi64((__m128i *)((s - 2) + p));
1799 x2 = _mm_loadl_epi64((__m128i *)((s - 2) + 2 * p));
1800 x3 = _mm_loadl_epi64((__m128i *)((s - 2) + 3 * p));
1801 x4 = _mm_loadl_epi64((__m128i *)((s - 2) + 4 * p));
1802 x5 = _mm_loadl_epi64((__m128i *)((s - 2) + 5 * p));
1803 x6 = _mm_loadl_epi64((__m128i *)((s - 2) + 6 * p));
1804 x7 = _mm_loadl_epi64((__m128i *)((s - 2) + 7 * p));
1805
1806 transpose8x8_low_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &p1, &p0, &q0,
1807 &q1);
1808
1809 lpf_internal_4_dual_sse2(&p1, &p0, &q0, &q1, &l, &t, &qs1qs0, &ps1ps0);
1810
1811 p1 = _mm_srli_si128(ps1ps0, 8);
1812 q1 = _mm_srli_si128(qs1qs0, 8);
1813
1814 transpose4x8_8x4_sse2(&p1, &ps1ps0, &qs1qs0, &q1, &d0, &d1, &d2, &d3, &d4,
1815 &d5, &d6, &d7);
1816
1817 xx_storel_32((s - 2 + 0 * p), d0);
1818 xx_storel_32((s - 2 + 1 * p), d1);
1819 xx_storel_32((s - 2 + 2 * p), d2);
1820 xx_storel_32((s - 2 + 3 * p), d3);
1821 xx_storel_32((s - 2 + 4 * p), d4);
1822 xx_storel_32((s - 2 + 5 * p), d5);
1823 xx_storel_32((s - 2 + 6 * p), d6);
1824 xx_storel_32((s - 2 + 7 * p), d7);
1825 }
1826
1827 void aom_lpf_vertical_6_sse2(unsigned char *s, int p,
1828 const unsigned char *_blimit,
1829 const unsigned char *_limit,
1830 const unsigned char *_thresh) {
1831 __m128i d0, d1, d2, d3, d4, d5, d6, d7;
1832 __m128i x2, x1, x0, x3;
1833 __m128i p0, q0;
1834 __m128i p1p0, q1q0;
1835 __m128i blimit = _mm_load_si128((__m128i *)_blimit);
1836 __m128i limit = _mm_load_si128((__m128i *)_limit);
1837 __m128i thresh = _mm_load_si128((__m128i *)_thresh);
1838
1839 x3 = _mm_loadl_epi64((__m128i *)((s - 3) + 0 * p));
1840 x2 = _mm_loadl_epi64((__m128i *)((s - 3) + 1 * p));
1841 x1 = _mm_loadl_epi64((__m128i *)((s - 3) + 2 * p));
1842 x0 = _mm_loadl_epi64((__m128i *)((s - 3) + 3 * p));
1843
1844 transpose4x8_8x4_sse2(&x3, &x2, &x1, &x0, &d0, &d1, &d2, &d3, &d4, &d5, &d6,
1845 &d7);
1846
1847 lpf_internal_6_sse2(&d0, &d5, &d1, &d4, &d2, &d3, &q1q0, &p1p0, &blimit,
1848 &limit, &thresh);
1849
1850 p0 = _mm_srli_si128(p1p0, 4);
1851 q0 = _mm_srli_si128(q1q0, 4);
1852
1853 transpose4x8_8x4_low_sse2(&p0, &p1p0, &q1q0, &q0, &d0, &d1, &d2, &d3);
1854
1855 xx_storel_32(s + 0 * p - 2, d0);
1856 xx_storel_32(s + 1 * p - 2, d1);
1857 xx_storel_32(s + 2 * p - 2, d2);
1858 xx_storel_32(s + 3 * p - 2, d3);
1859 }
1860
1861 void aom_lpf_vertical_6_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
1862 const uint8_t *_limit0,
1863 const uint8_t *_thresh0,
1864 const uint8_t *_blimit1,
1865 const uint8_t *_limit1,
1866 const uint8_t *_thresh1) {
1867 __m128i blimit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_blimit0),
1868 _mm_load_si128((__m128i *)_blimit1));
1869 __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_limit0),
1870 _mm_load_si128((__m128i *)_limit1));
1871 __m128i thresh = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_thresh0),
1872 _mm_load_si128((__m128i *)_thresh1));
1873
1874 __m128i d0, d1, d2, d3, d4, d5, d6, d7;
1875 __m128i x0, x1, x2, x3, x4, x5, x6, x7;
1876 __m128i p0, q0;
1877 __m128i p1p0, q1q0;
1878 __m128i d0d1, d2d3, d4d5, d6d7;
1879
1880 x0 = _mm_loadl_epi64((__m128i *)((s - 3) + 0 * p));
1881 x1 = _mm_loadl_epi64((__m128i *)((s - 3) + 1 * p));
1882 x2 = _mm_loadl_epi64((__m128i *)((s - 3) + 2 * p));
1883 x3 = _mm_loadl_epi64((__m128i *)((s - 3) + 3 * p));
1884 x4 = _mm_loadl_epi64((__m128i *)((s - 3) + 4 * p));
1885 x5 = _mm_loadl_epi64((__m128i *)((s - 3) + 5 * p));
1886 x6 = _mm_loadl_epi64((__m128i *)((s - 3) + 6 * p));
1887 x7 = _mm_loadl_epi64((__m128i *)((s - 3) + 7 * p));
1888
1889 transpose8x8_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &d0d1, &d2d3, &d4d5,
1890 &d6d7);
1891
1892 d1 = _mm_srli_si128(d0d1, 8);
1893 d3 = _mm_srli_si128(d2d3, 8);
1894 d5 = _mm_srli_si128(d4d5, 8);
1895 d7 = _mm_srli_si128(d6d7, 8);
1896
1897 lpf_internal_6_dual_sse2(&d0d1, &d5, &d1, &d4d5, &d2d3, &d3, &q1q0, &p1p0,
1898 &blimit, &limit, &thresh);
1899
1900 p0 = _mm_srli_si128(p1p0, 8);
1901 q0 = _mm_srli_si128(q1q0, 8);
1902
1903 transpose4x8_8x4_sse2(&p0, &p1p0, &q1q0, &q0, &d0, &d1, &d2, &d3, &d4, &d5,
1904 &d6, &d7);
1905
1906 xx_storel_32((s - 2 + 0 * p), d0);
1907 xx_storel_32((s - 2 + 1 * p), d1);
1908 xx_storel_32((s - 2 + 2 * p), d2);
1909 xx_storel_32((s - 2 + 3 * p), d3);
1910 xx_storel_32((s - 2 + 4 * p), d4);
1911 xx_storel_32((s - 2 + 5 * p), d5);
1912 xx_storel_32((s - 2 + 6 * p), d6);
1913 xx_storel_32((s - 2 + 7 * p), d7);
1914 }
1915
1916 void aom_lpf_vertical_8_sse2(unsigned char *s, int p,
1917 const unsigned char *_blimit,
1918 const unsigned char *_limit,
1919 const unsigned char *_thresh) {
1920 __m128i d0, d1, d2, d3, d4, d5, d6, d7;
1921
1922 __m128i p0, q0;
1923 __m128i x2, x1, x0, x3;
1924 __m128i q1q0, p1p0;
1925 __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
1926 __m128i limit = _mm_load_si128((const __m128i *)_limit);
1927 __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
1928
1929 x3 = _mm_loadl_epi64((__m128i *)((s - 4) + 0 * p));
1930 x2 = _mm_loadl_epi64((__m128i *)((s - 4) + 1 * p));
1931 x1 = _mm_loadl_epi64((__m128i *)((s - 4) + 2 * p));
1932 x0 = _mm_loadl_epi64((__m128i *)((s - 4) + 3 * p));
1933
1934 transpose4x8_8x4_sse2(&x3, &x2, &x1, &x0, &d0, &d1, &d2, &d3, &d4, &d5, &d6,
1935 &d7);
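// After the transpose d0..d7 hold the columns p3, p2, p1, p0, q0, q1, q2, q3,
// each carrying the four rows' pixels for that column.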
1936 // Loop filtering
1937 lpf_internal_8_sse2(&d0, &d7, &d1, &d6, &d2, &d5, &d3, &d4, &q1q0, &p1p0,
1938 &blimit, &limit, &thresh);
1939
1940 p0 = _mm_srli_si128(p1p0, 4);
1941 q0 = _mm_srli_si128(q1q0, 4);
1942
1943 transpose8x8_low_sse2(&d0, &d1, &p0, &p1p0, &q1q0, &q0, &d6, &d7, &d0, &d1,
1944 &d2, &d3);
1945
1946 _mm_storel_epi64((__m128i *)(s - 4 + 0 * p), d0);
1947 _mm_storel_epi64((__m128i *)(s - 4 + 1 * p), d1);
1948 _mm_storel_epi64((__m128i *)(s - 4 + 2 * p), d2);
1949 _mm_storel_epi64((__m128i *)(s - 4 + 3 * p), d3);
1950 }
1951
1952 void aom_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
1953 const uint8_t *_limit0,
1954 const uint8_t *_thresh0,
1955 const uint8_t *_blimit1,
1956 const uint8_t *_limit1,
1957 const uint8_t *_thresh1) {
1958 __m128i blimit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_blimit0),
1959 _mm_load_si128((__m128i *)_blimit1));
1960 __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_limit0),
1961 _mm_load_si128((__m128i *)_limit1));
1962 __m128i thresh = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_thresh0),
1963 _mm_load_si128((__m128i *)_thresh1));
1964
1965 __m128i x0, x1, x2, x3, x4, x5, x6, x7;
1966 __m128i d1, d3, d5, d7;
1967 __m128i q1q0, p1p0;
1968 __m128i p1, q1;
1969 __m128i d0d1, d2d3, d4d5, d6d7;
1970
1971 x0 = _mm_loadl_epi64((__m128i *)(s - 4 + 0 * p));
1972 x1 = _mm_loadl_epi64((__m128i *)(s - 4 + 1 * p));
1973 x2 = _mm_loadl_epi64((__m128i *)(s - 4 + 2 * p));
1974 x3 = _mm_loadl_epi64((__m128i *)(s - 4 + 3 * p));
1975 x4 = _mm_loadl_epi64((__m128i *)(s - 4 + 4 * p));
1976 x5 = _mm_loadl_epi64((__m128i *)(s - 4 + 5 * p));
1977 x6 = _mm_loadl_epi64((__m128i *)(s - 4 + 6 * p));
1978 x7 = _mm_loadl_epi64((__m128i *)(s - 4 + 7 * p));
1979
1980 transpose8x8_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &d0d1, &d2d3, &d4d5,
1981 &d6d7);
1982
1983 d1 = _mm_srli_si128(d0d1, 8);
1984 d3 = _mm_srli_si128(d2d3, 8);
1985 d5 = _mm_srli_si128(d4d5, 8);
1986 d7 = _mm_srli_si128(d6d7, 8);
1987
1988 lpf_internal_8_dual_sse2(&d0d1, &d7, &d1, &d6d7, &d2d3, &d5, &d3, &d4d5,
1989 &q1q0, &p1p0, &blimit, &limit, &thresh);
1990
1991 p1 = _mm_srli_si128(p1p0, 8);
1992 q1 = _mm_srli_si128(q1q0, 8);
1993
1994 transpose8x8_sse2(&d0d1, &d1, &p1, &p1p0, &q1q0, &q1, &d6d7, &d7, &d0d1,
1995 &d2d3, &d4d5, &d6d7);
1996
1997 _mm_storel_epi64((__m128i *)(s - 4 + 0 * p), d0d1);
1998 _mm_storel_epi64((__m128i *)(s - 4 + 1 * p), _mm_srli_si128(d0d1, 8));
1999 _mm_storel_epi64((__m128i *)(s - 4 + 2 * p), d2d3);
2000 _mm_storel_epi64((__m128i *)(s - 4 + 3 * p), _mm_srli_si128(d2d3, 8));
2001 _mm_storel_epi64((__m128i *)(s - 4 + 4 * p), d4d5);
2002 _mm_storel_epi64((__m128i *)(s - 4 + 5 * p), _mm_srli_si128(d4d5, 8));
2003 _mm_storel_epi64((__m128i *)(s - 4 + 6 * p), d6d7);
2004 _mm_storel_epi64((__m128i *)(s - 4 + 7 * p), _mm_srli_si128(d6d7, 8));
2005 }
2006
2007 void aom_lpf_vertical_14_sse2(unsigned char *s, int p,
2008 const unsigned char *_blimit,
2009 const unsigned char *_limit,
2010 const unsigned char *_thresh) {
2011 __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0;
2012 __m128i x6, x5, x4, x3;
2013 __m128i pq0, pq1, pq2, pq3;
2014 __m128i blimit = _mm_load_si128((__m128i *)_blimit);
2015 __m128i limit = _mm_load_si128((__m128i *)_limit);
2016 __m128i thresh = _mm_load_si128((__m128i *)_thresh);
2017
2018 x6 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * p));
2019 x5 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * p));
2020 x4 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * p));
2021 x3 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * p));
2022
2023 transpose_pq_14_sse2(&x6, &x5, &x4, &x3, &q0p0, &q1p1, &q2p2, &q3p3, &q4p4,
2024 &q5p5, &q6p6, &q7p7);
2025
2026 lpf_internal_14_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, &blimit,
2027 &limit, &thresh);
2028
2029 transpose_pq_14_inv_sse2(&q7p7, &q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1,
2030 &q0p0, &pq0, &pq1, &pq2, &pq3);
2031 _mm_storeu_si128((__m128i *)(s - 8 + 0 * p), pq0);
2032 _mm_storeu_si128((__m128i *)(s - 8 + 1 * p), pq1);
2033 _mm_storeu_si128((__m128i *)(s - 8 + 2 * p), pq2);
2034 _mm_storeu_si128((__m128i *)(s - 8 + 3 * p), pq3);
2035 }
2036
2037 void aom_lpf_vertical_14_dual_sse2(
2038 unsigned char *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
2039 const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
2040 const uint8_t *_thresh1) {
2041 __m128i q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0;
2042 __m128i x7, x6, x5, x4, x3, x2, x1, x0;
2043 __m128i d0d1, d2d3, d4d5, d6d7, d8d9, d10d11, d12d13, d14d15;
2044 __m128i q0, q1, q2, q3, q7;
2045 __m128i p0p1, p2p3, p4p5, p6p7;
2046
2047 __m128i blimit =
2048 _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0),
2049 _mm_load_si128((const __m128i *)_blimit1));
2050 __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0),
2051 _mm_load_si128((const __m128i *)_limit1));
2052 __m128i thresh =
2053 _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_thresh0),
2054 _mm_load_si128((const __m128i *)_thresh1));
2055
2056 x7 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * p));
2057 x6 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * p));
2058 x5 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * p));
2059 x4 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * p));
2060 x3 = _mm_loadu_si128((__m128i *)((s - 8) + 4 * p));
2061 x2 = _mm_loadu_si128((__m128i *)((s - 8) + 5 * p));
2062 x1 = _mm_loadu_si128((__m128i *)((s - 8) + 6 * p));
2063 x0 = _mm_loadu_si128((__m128i *)((s - 8) + 7 * p));
2064
2065 transpose8x16_16x8_sse2(&x7, &x6, &x5, &x4, &x3, &x2, &x1, &x0, &d0d1, &d2d3,
2066 &d4d5, &d6d7, &d8d9, &d10d11, &d12d13, &d14d15);
2067
2068 q6p6 = _mm_unpacklo_epi64(d2d3, _mm_srli_si128(d12d13, 8));
2069 q5p5 = _mm_unpacklo_epi64(d4d5, _mm_srli_si128(d10d11, 8));
2070 q4p4 = _mm_unpacklo_epi64(d6d7, _mm_srli_si128(d8d9, 8));
2071 q3p3 = _mm_unpacklo_epi64(d8d9, _mm_srli_si128(d6d7, 8));
2072 q2p2 = _mm_unpacklo_epi64(d10d11, _mm_srli_si128(d4d5, 8));
2073 q1p1 = _mm_unpacklo_epi64(d12d13, _mm_srli_si128(d2d3, 8));
2074 q0p0 = _mm_unpacklo_epi64(d14d15, _mm_srli_si128(d0d1, 8));
2075 q7 = _mm_srli_si128(d14d15, 8);
2076
2077 lpf_internal_14_dual_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0,
2078 &blimit, &limit, &thresh);
2079
2080 x0 = _mm_srli_si128(q0p0, 8);
2081 x1 = _mm_srli_si128(q1p1, 8);
2082 x2 = _mm_srli_si128(q2p2, 8);
2083 x3 = _mm_srli_si128(q3p3, 8);
2084 x4 = _mm_srli_si128(q4p4, 8);
2085 x5 = _mm_srli_si128(q5p5, 8);
2086 x6 = _mm_srli_si128(q6p6, 8);
2087
2088 transpose16x8_8x16_sse2(&d0d1, &q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1,
2089 &q0p0, &x0, &x1, &x2, &x3, &x4, &x5, &x6, &q7, &p0p1,
2090 &p2p3, &p4p5, &p6p7, &q0, &q1, &q2, &q3);
2091
2092 _mm_storeu_si128((__m128i *)(s - 8 + 0 * p), p0p1);
2093 _mm_storeu_si128((__m128i *)(s - 8 + 1 * p), p2p3);
2094 _mm_storeu_si128((__m128i *)(s - 8 + 2 * p), p4p5);
2095 _mm_storeu_si128((__m128i *)(s - 8 + 3 * p), p6p7);
2096 _mm_storeu_si128((__m128i *)(s - 8 + 4 * p), q0);
2097 _mm_storeu_si128((__m128i *)(s - 8 + 5 * p), q1);
2098 _mm_storeu_si128((__m128i *)(s - 8 + 6 * p), q2);
2099 _mm_storeu_si128((__m128i *)(s - 8 + 7 * p), q3);
2100 }
2101
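// Updates a running filter sum: adds the two incoming taps (*a1, *a2) and
// subtracts the two taps (*s1, *s2) that fall out of the window.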
2102 static INLINE __m128i filter_add2_sub2(const __m128i *const total,
2103 const __m128i *const a1,
2104 const __m128i *const a2,
2105 const __m128i *const s1,
2106 const __m128i *const s2) {
2107 __m128i x = _mm_add_epi16(*a1, *total);
2108 x = _mm_add_epi16(_mm_sub_epi16(x, _mm_add_epi16(*s1, *s2)), *a2);
2109 return x;
2110 }
2111
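// Narrows the two 8-tap sums to pixels (>> 3 and pack) and selects that
// result where flat is set, keeping other_filt elsewhere.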
2112 static INLINE __m128i filter8_mask(const __m128i *const flat,
2113 const __m128i *const other_filt,
2114 const __m128i *const f8_lo,
2115 const __m128i *const f8_hi) {
2116 const __m128i f8 =
2117 _mm_packus_epi16(_mm_srli_epi16(*f8_lo, 3), _mm_srli_epi16(*f8_hi, 3));
2118 const __m128i result = _mm_and_si128(*flat, f8);
2119 return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result);
2120 }
2121
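// Same selection as filter8_mask(), but for the 13-tap sums (>> 4), gated by
// flat2.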
2122 static INLINE __m128i filter16_mask(const __m128i *const flat,
2123 const __m128i *const other_filt,
2124 const __m128i *const f_lo,
2125 const __m128i *const f_hi) {
2126 const __m128i f =
2127 _mm_packus_epi16(_mm_srli_epi16(*f_lo, 4), _mm_srli_epi16(*f_hi, 4));
2128 const __m128i result = _mm_and_si128(*flat, f);
2129 return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result);
2130 }
2131
2132 void aom_lpf_horizontal_14_quad_sse2(unsigned char *s, int p,
2133 const unsigned char *_blimit0,
2134 const unsigned char *_limit0,
2135 const unsigned char *_thresh0) {
2136 const __m128i zero = _mm_setzero_si128();
2137 const __m128i one = _mm_set1_epi8(1);
2138 const __m128i blimit_v = _mm_load_si128((const __m128i *)_blimit0);
2139 const __m128i limit_v = _mm_load_si128((const __m128i *)_limit0);
2140 const __m128i thresh_v = _mm_load_si128((const __m128i *)_thresh0);
2141 __m128i mask, hev, flat, flat2;
2142 __m128i p6, p5;
2143 __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
2144 __m128i q6, q5;
2145
2146 __m128i op2, op1, op0, oq0, oq1, oq2;
2147
2148 __m128i max_abs_p1p0q1q0;
2149
2150 p6 = _mm_loadu_si128((__m128i *)(s - 7 * p));
2151 p5 = _mm_loadu_si128((__m128i *)(s - 6 * p));
2152 p4 = _mm_loadu_si128((__m128i *)(s - 5 * p));
2153 p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
2154 p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
2155 p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
2156 p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
2157 q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
2158 q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
2159 q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
2160 q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
2161 q4 = _mm_loadu_si128((__m128i *)(s + 4 * p));
2162 q5 = _mm_loadu_si128((__m128i *)(s + 5 * p));
2163 q6 = _mm_loadu_si128((__m128i *)(s + 6 * p));
2164
2165 {
2166 const __m128i abs_p1p0 = abs_diff(p1, p0);
2167 const __m128i abs_q1q0 = abs_diff(q1, q0);
2168 const __m128i fe = _mm_set1_epi8((int8_t)0xfe);
2169 const __m128i ff = _mm_cmpeq_epi8(zero, zero);
2170 __m128i abs_p0q0 = abs_diff(p0, q0);
2171 __m128i abs_p1q1 = abs_diff(p1, q1);
2172 __m128i work;
2173 max_abs_p1p0q1q0 = _mm_max_epu8(abs_p1p0, abs_q1q0);
2174
2175 abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
2176 abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
2177 mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v);
2178 mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
2179 // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
2180 mask = _mm_max_epu8(max_abs_p1p0q1q0, mask);
2181 // mask |= (abs(p1 - p0) > limit) * -1;
2182 // mask |= (abs(q1 - q0) > limit) * -1;
2183 work = _mm_max_epu8(abs_diff(p2, p1), abs_diff(p3, p2));
2184 mask = _mm_max_epu8(work, mask);
2185 work = _mm_max_epu8(abs_diff(q2, q1), abs_diff(q3, q2));
2186 mask = _mm_max_epu8(work, mask);
2187 mask = _mm_subs_epu8(mask, limit_v);
2188 mask = _mm_cmpeq_epi8(mask, zero);
2189 }
2190
2191 if (0xffff == _mm_movemask_epi8(_mm_cmpeq_epi8(mask, zero))) return;
2192
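// flat: p1..p3 and q1..q3 are all within 1 of p0/q0, so the 8-tap flat
// filter may be used; flat2: additionally p4..p6 and q4..q6 are within 1,
// enabling the 13-tap wide filter. Both are further gated by mask.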
2193 {
2194 __m128i work;
2195 work = _mm_max_epu8(abs_diff(p2, p0), abs_diff(q2, q0));
2196 flat = _mm_max_epu8(work, max_abs_p1p0q1q0);
2197 work = _mm_max_epu8(abs_diff(p3, p0), abs_diff(q3, q0));
2198 flat = _mm_max_epu8(work, flat);
2199 work = _mm_max_epu8(abs_diff(p4, p0), abs_diff(q4, q0));
2200 flat = _mm_subs_epu8(flat, one);
2201 flat = _mm_cmpeq_epi8(flat, zero);
2202 flat = _mm_and_si128(flat, mask);
2203 flat2 = _mm_max_epu8(abs_diff(p5, p0), abs_diff(q5, q0));
2204 flat2 = _mm_max_epu8(work, flat2);
2205 work = _mm_max_epu8(abs_diff(p6, p0), abs_diff(q6, q0));
2206 flat2 = _mm_max_epu8(work, flat2);
2207 flat2 = _mm_subs_epu8(flat2, one);
2208 flat2 = _mm_cmpeq_epi8(flat2, zero);
2209 flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
2210 }
2211
2212 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
2213 // filter4
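// In the sign-biased (pixel ^ 0x80) domain compute
// filt = 3 * (q0 - p0) + ((p1 - q1) & hev) with signed saturation, then
// filter1 = (filt + 4) >> 3 adjusts q0 and filter2 = (filt + 3) >> 3 adjusts
// p0; (filter1 + 1) >> 1 is applied to p1/q1 where hev is clear.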
2214 {
2215 const __m128i t4 = _mm_set1_epi8(4);
2216 const __m128i t3 = _mm_set1_epi8(3);
2217 const __m128i t80 = _mm_set1_epi8((int8_t)0x80);
2218 const __m128i te0 = _mm_set1_epi8((int8_t)0xe0);
2219 const __m128i t1f = _mm_set1_epi8(0x1f);
2220 const __m128i t1 = _mm_set1_epi8(0x1);
2221 const __m128i t7f = _mm_set1_epi8(0x7f);
2222 const __m128i ff = _mm_cmpeq_epi8(t4, t4);
2223
2224 __m128i filt;
2225 __m128i work_a;
2226 __m128i filter1, filter2;
2227
2228 op1 = _mm_xor_si128(p1, t80);
2229 op0 = _mm_xor_si128(p0, t80);
2230 oq0 = _mm_xor_si128(q0, t80);
2231 oq1 = _mm_xor_si128(q1, t80);
2232
2233 hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh_v);
2234 hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
2235 filt = _mm_and_si128(_mm_subs_epi8(op1, oq1), hev);
2236
2237 work_a = _mm_subs_epi8(oq0, op0);
2238 filt = _mm_adds_epi8(filt, work_a);
2239 filt = _mm_adds_epi8(filt, work_a);
2240 filt = _mm_adds_epi8(filt, work_a);
2241 filt = _mm_and_si128(filt, mask);
2242 filter1 = _mm_adds_epi8(filt, t4);
2243 filter2 = _mm_adds_epi8(filt, t3);
2244
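// SSE2 has no 8-bit arithmetic shift, so the signed >> 3 is emulated with a
// 16-bit logical shift: keep the low 5 bits of each byte (t1f) and re-insert
// the sign bits (te0) for lanes that were negative.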
2245 work_a = _mm_cmpgt_epi8(zero, filter1);
2246 filter1 = _mm_srli_epi16(filter1, 3);
2247 work_a = _mm_and_si128(work_a, te0);
2248 filter1 = _mm_and_si128(filter1, t1f);
2249 filter1 = _mm_or_si128(filter1, work_a);
2250 oq0 = _mm_xor_si128(_mm_subs_epi8(oq0, filter1), t80);
2251
2252 work_a = _mm_cmpgt_epi8(zero, filter2);
2253 filter2 = _mm_srli_epi16(filter2, 3);
2254 work_a = _mm_and_si128(work_a, te0);
2255 filter2 = _mm_and_si128(filter2, t1f);
2256 filter2 = _mm_or_si128(filter2, work_a);
2257 op0 = _mm_xor_si128(_mm_adds_epi8(op0, filter2), t80);
2258
2259 filt = _mm_adds_epi8(filter1, t1);
2260 work_a = _mm_cmpgt_epi8(zero, filt);
2261 filt = _mm_srli_epi16(filt, 1);
2262 work_a = _mm_and_si128(work_a, t80);
2263 filt = _mm_and_si128(filt, t7f);
2264 filt = _mm_or_si128(filt, work_a);
2265 filt = _mm_andnot_si128(hev, filt);
2266 op1 = _mm_xor_si128(_mm_adds_epi8(op1, filt), t80);
2267 oq1 = _mm_xor_si128(_mm_subs_epi8(oq1, filt), t80);
2268
2269 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
2270 // filter8
2271 if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
2272 const __m128i four = _mm_set1_epi16(4);
2273 const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero);
2274 const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero);
2275 const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero);
2276 const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero);
2277 const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero);
2278 const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero);
2279 const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero);
2280 const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero);
2281
2282 const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero);
2283 const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero);
2284 const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero);
2285 const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero);
2286 const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero);
2287 const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero);
2288 const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero);
2289 const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero);
2290 __m128i f8_lo, f8_hi;
2291
2292 f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, four),
2293 _mm_add_epi16(p3_lo, p2_lo));
2294 f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f8_lo),
2295 _mm_add_epi16(p2_lo, p1_lo));
2296 f8_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f8_lo);
2297
2298 f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, four),
2299 _mm_add_epi16(p3_hi, p2_hi));
2300 f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f8_hi),
2301 _mm_add_epi16(p2_hi, p1_hi));
2302 f8_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f8_hi);
2303
2304 op2 = filter8_mask(&flat, &p2, &f8_lo, &f8_hi);
2305
2306 f8_lo = filter_add2_sub2(&f8_lo, &q1_lo, &p1_lo, &p2_lo, &p3_lo);
2307 f8_hi = filter_add2_sub2(&f8_hi, &q1_hi, &p1_hi, &p2_hi, &p3_hi);
2308 op1 = filter8_mask(&flat, &op1, &f8_lo, &f8_hi);
2309
2310 f8_lo = filter_add2_sub2(&f8_lo, &q2_lo, &p0_lo, &p1_lo, &p3_lo);
2311 f8_hi = filter_add2_sub2(&f8_hi, &q2_hi, &p0_hi, &p1_hi, &p3_hi);
2312 op0 = filter8_mask(&flat, &op0, &f8_lo, &f8_hi);
2313
2314 f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q0_lo, &p0_lo, &p3_lo);
2315 f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q0_hi, &p0_hi, &p3_hi);
2316 oq0 = filter8_mask(&flat, &oq0, &f8_lo, &f8_hi);
2317
2318 f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q1_lo, &q0_lo, &p2_lo);
2319 f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q1_hi, &q0_hi, &p2_hi);
2320 oq1 = filter8_mask(&flat, &oq1, &f8_lo, &f8_hi);
2321
2322 f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q2_lo, &q1_lo, &p1_lo);
2323 f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q2_hi, &q1_hi, &p1_hi);
2324 oq2 = filter8_mask(&flat, &q2, &f8_lo, &f8_hi);
2325
2326 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
2327 // wide flat calculations
2328 if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat2, zero))) {
2329 const __m128i eight = _mm_set1_epi16(8);
2330 const __m128i p6_lo = _mm_unpacklo_epi8(p6, zero);
2331 const __m128i p5_lo = _mm_unpacklo_epi8(p5, zero);
2332 const __m128i p4_lo = _mm_unpacklo_epi8(p4, zero);
2333 const __m128i q4_lo = _mm_unpacklo_epi8(q4, zero);
2334 const __m128i q5_lo = _mm_unpacklo_epi8(q5, zero);
2335 const __m128i q6_lo = _mm_unpacklo_epi8(q6, zero);
2336
2337 const __m128i p6_hi = _mm_unpackhi_epi8(p6, zero);
2338 const __m128i p5_hi = _mm_unpackhi_epi8(p5, zero);
2339 const __m128i p4_hi = _mm_unpackhi_epi8(p4, zero);
2340 const __m128i q4_hi = _mm_unpackhi_epi8(q4, zero);
2341 const __m128i q5_hi = _mm_unpackhi_epi8(q5, zero);
2342 const __m128i q6_hi = _mm_unpackhi_epi8(q6, zero);
2343
2344 __m128i f_lo;
2345 __m128i f_hi;
2346
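// Wide (13-tap) flat filter: the first output is
// p5' = (7 * p6 + 2 * p5 + 2 * p4 + p3 + p2 + p1 + p0 + q0 + 8) >> 4;
// each subsequent output slides the window with filter_add2_sub2() before
// the >> 4 inside filter16_mask().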
2347 f_lo = _mm_sub_epi16(_mm_slli_epi16(p6_lo, 3), p6_lo);
2348 f_lo = _mm_add_epi16(_mm_slli_epi16(p5_lo, 1), f_lo);
2349 f_lo = _mm_add_epi16(_mm_slli_epi16(p4_lo, 1), f_lo);
2350 f_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f_lo),
2351 _mm_add_epi16(p2_lo, p1_lo));
2352 f_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f_lo);
2353 f_lo = _mm_add_epi16(f_lo, eight);
2354
2355 f_hi = _mm_sub_epi16(_mm_slli_epi16(p6_hi, 3), p6_hi);
2356 f_hi = _mm_add_epi16(_mm_slli_epi16(p5_hi, 1), f_hi);
2357 f_hi = _mm_add_epi16(_mm_slli_epi16(p4_hi, 1), f_hi);
2358 f_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f_hi),
2359 _mm_add_epi16(p2_hi, p1_hi));
2360 f_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f_hi);
2361 f_hi = _mm_add_epi16(f_hi, eight);
2362
2363 p5 = filter16_mask(&flat2, &p5, &f_lo, &f_hi);
2364 _mm_storeu_si128((__m128i *)(s - 6 * p), p5);
2365
2366 f_lo = filter_add2_sub2(&f_lo, &q1_lo, &p3_lo, &p6_lo, &p6_lo);
2367 f_hi = filter_add2_sub2(&f_hi, &q1_hi, &p3_hi, &p6_hi, &p6_hi);
2368 p4 = filter16_mask(&flat2, &p4, &f_lo, &f_hi);
2369 _mm_storeu_si128((__m128i *)(s - 5 * p), p4);
2370
2371 f_lo = filter_add2_sub2(&f_lo, &q2_lo, &p2_lo, &p6_lo, &p5_lo);
2372 f_hi = filter_add2_sub2(&f_hi, &q2_hi, &p2_hi, &p6_hi, &p5_hi);
2373 p3 = filter16_mask(&flat2, &p3, &f_lo, &f_hi);
2374 _mm_storeu_si128((__m128i *)(s - 4 * p), p3);
2375
2376 f_lo = filter_add2_sub2(&f_lo, &q3_lo, &p1_lo, &p6_lo, &p4_lo);
2377 f_hi = filter_add2_sub2(&f_hi, &q3_hi, &p1_hi, &p6_hi, &p4_hi);
2378 op2 = filter16_mask(&flat2, &op2, &f_lo, &f_hi);
2379 _mm_storeu_si128((__m128i *)(s - 3 * p), op2);
2380
2381 f_lo = filter_add2_sub2(&f_lo, &q4_lo, &p0_lo, &p6_lo, &p3_lo);
2382 f_hi = filter_add2_sub2(&f_hi, &q4_hi, &p0_hi, &p6_hi, &p3_hi);
2383 op1 = filter16_mask(&flat2, &op1, &f_lo, &f_hi);
2384 _mm_storeu_si128((__m128i *)(s - 2 * p), op1);
2385
2386 f_lo = filter_add2_sub2(&f_lo, &q5_lo, &q0_lo, &p6_lo, &p2_lo);
2387 f_hi = filter_add2_sub2(&f_hi, &q5_hi, &q0_hi, &p6_hi, &p2_hi);
2388 op0 = filter16_mask(&flat2, &op0, &f_lo, &f_hi);
2389 _mm_storeu_si128((__m128i *)(s - 1 * p), op0);
2390
2391 f_lo = filter_add2_sub2(&f_lo, &q6_lo, &q1_lo, &p6_lo, &p1_lo);
2392 f_hi = filter_add2_sub2(&f_hi, &q6_hi, &q1_hi, &p6_hi, &p1_hi);
2393 oq0 = filter16_mask(&flat2, &oq0, &f_lo, &f_hi);
2394 _mm_storeu_si128((__m128i *)(s - 0 * p), oq0);
2395
2396 f_lo = filter_add2_sub2(&f_lo, &q6_lo, &q2_lo, &p5_lo, &p0_lo);
2397 f_hi = filter_add2_sub2(&f_hi, &q6_hi, &q2_hi, &p5_hi, &p0_hi);
2398 oq1 = filter16_mask(&flat2, &oq1, &f_lo, &f_hi);
2399 _mm_storeu_si128((__m128i *)(s + 1 * p), oq1);
2400
2401 f_lo = filter_add2_sub2(&f_lo, &q6_lo, &q3_lo, &p4_lo, &q0_lo);
2402 f_hi = filter_add2_sub2(&f_hi, &q6_hi, &q3_hi, &p4_hi, &q0_hi);
2403 oq2 = filter16_mask(&flat2, &oq2, &f_lo, &f_hi);
2404 _mm_storeu_si128((__m128i *)(s + 2 * p), oq2);
2405
2406 f_lo = filter_add2_sub2(&f_lo, &q6_lo, &q4_lo, &p3_lo, &q1_lo);
2407 f_hi = filter_add2_sub2(&f_hi, &q6_hi, &q4_hi, &p3_hi, &q1_hi);
2408 q3 = filter16_mask(&flat2, &q3, &f_lo, &f_hi);
2409 _mm_storeu_si128((__m128i *)(s + 3 * p), q3);
2410
2411 f_lo = filter_add2_sub2(&f_lo, &q6_lo, &q5_lo, &p2_lo, &q2_lo);
2412 f_hi = filter_add2_sub2(&f_hi, &q6_hi, &q5_hi, &p2_hi, &q2_hi);
2413 q4 = filter16_mask(&flat2, &q4, &f_lo, &f_hi);
2414 _mm_storeu_si128((__m128i *)(s + 4 * p), q4);
2415
2416 f_lo = filter_add2_sub2(&f_lo, &q6_lo, &q6_lo, &p1_lo, &q3_lo);
2417 f_hi = filter_add2_sub2(&f_hi, &q6_hi, &q6_hi, &p1_hi, &q3_hi);
2418 q5 = filter16_mask(&flat2, &q5, &f_lo, &f_hi);
2419 _mm_storeu_si128((__m128i *)(s + 5 * p), q5);
2420 } else {
2421 _mm_storeu_si128((__m128i *)(s - 3 * p), op2);
2422 _mm_storeu_si128((__m128i *)(s - 2 * p), op1);
2423 _mm_storeu_si128((__m128i *)(s - 1 * p), op0);
2424 _mm_storeu_si128((__m128i *)(s - 0 * p), oq0);
2425 _mm_storeu_si128((__m128i *)(s + 1 * p), oq1);
2426 _mm_storeu_si128((__m128i *)(s + 2 * p), oq2);
2427 }
2428 } else {
2429 _mm_storeu_si128((__m128i *)(s - 2 * p), op1);
2430 _mm_storeu_si128((__m128i *)(s - 1 * p), op0);
2431 _mm_storeu_si128((__m128i *)(s - 0 * p), oq0);
2432 _mm_storeu_si128((__m128i *)(s + 1 * p), oq1);
2433 }
2434 }
2435 }
2436
2437 void aom_lpf_horizontal_8_quad_sse2(unsigned char *s, int p,
2438 const unsigned char *_blimit0,
2439 const unsigned char *_limit0,
2440 const unsigned char *_thresh0) {
2441 const __m128i zero = _mm_setzero_si128();
2442 const __m128i one = _mm_set1_epi8(1);
2443 const __m128i blimit_v = _mm_load_si128((const __m128i *)_blimit0);
2444 const __m128i limit_v = _mm_load_si128((const __m128i *)_limit0);
2445 const __m128i thresh_v = _mm_load_si128((const __m128i *)_thresh0);
2446 __m128i mask, hev, flat;
2447 __m128i p3, p2, p1, p0, q0, q1, q2, q3;
2448
2449 __m128i op2, op1, op0, oq0, oq1, oq2;
2450
2451 __m128i max_abs_p1p0q1q0;
2452
2453 p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
2454 p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
2455 p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
2456 p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
2457 q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
2458 q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
2459 q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
2460 q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
2461
2462 {
2463 const __m128i abs_p1p0 = abs_diff(p1, p0);
2464 const __m128i abs_q1q0 = abs_diff(q1, q0);
2465 const __m128i fe = _mm_set1_epi8((int8_t)0xfe);
2466 const __m128i ff = _mm_cmpeq_epi8(zero, zero);
2467 __m128i abs_p0q0 = abs_diff(p0, q0);
2468 __m128i abs_p1q1 = abs_diff(p1, q1);
2469 __m128i work;
2470 max_abs_p1p0q1q0 = _mm_max_epu8(abs_p1p0, abs_q1q0);
2471
2472 abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
2473 abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
2474 mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v);
2475 mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
2476 // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
2477 mask = _mm_max_epu8(max_abs_p1p0q1q0, mask);
2478 // mask |= (abs(p1 - p0) > limit) * -1;
2479 // mask |= (abs(q1 - q0) > limit) * -1;
2480 work = _mm_max_epu8(abs_diff(p2, p1), abs_diff(p3, p2));
2481 mask = _mm_max_epu8(work, mask);
2482 work = _mm_max_epu8(abs_diff(q2, q1), abs_diff(q3, q2));
2483 mask = _mm_max_epu8(work, mask);
2484 mask = _mm_subs_epu8(mask, limit_v);
2485 mask = _mm_cmpeq_epi8(mask, zero);
2486 }
2487
2488 if (0xffff == _mm_movemask_epi8(_mm_cmpeq_epi8(mask, zero))) return;
2489
2490 {
2491 __m128i work;
2492 work = _mm_max_epu8(abs_diff(p2, p0), abs_diff(q2, q0));
2493 flat = _mm_max_epu8(work, max_abs_p1p0q1q0);
2494 work = _mm_max_epu8(abs_diff(p3, p0), abs_diff(q3, q0));
2495 flat = _mm_max_epu8(work, flat);
2496 flat = _mm_subs_epu8(flat, one);
2497 flat = _mm_cmpeq_epi8(flat, zero);
2498 flat = _mm_and_si128(flat, mask);
2499 }
2500
2501 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
2502 // filter4
2503 {
2504 const __m128i t4 = _mm_set1_epi8(4);
2505 const __m128i t3 = _mm_set1_epi8(3);
2506 const __m128i t80 = _mm_set1_epi8((int8_t)0x80);
2507 const __m128i te0 = _mm_set1_epi8((int8_t)0xe0);
2508 const __m128i t1f = _mm_set1_epi8(0x1f);
2509 const __m128i t1 = _mm_set1_epi8(0x1);
2510 const __m128i t7f = _mm_set1_epi8(0x7f);
2511 const __m128i ff = _mm_cmpeq_epi8(t4, t4);
2512
2513 __m128i filt;
2514 __m128i work_a;
2515 __m128i filter1, filter2;
2516
2517 op1 = _mm_xor_si128(p1, t80);
2518 op0 = _mm_xor_si128(p0, t80);
2519 oq0 = _mm_xor_si128(q0, t80);
2520 oq1 = _mm_xor_si128(q1, t80);
2521
2522 hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh_v);
2523 hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
2524 filt = _mm_and_si128(_mm_subs_epi8(op1, oq1), hev);
2525
2526 work_a = _mm_subs_epi8(oq0, op0);
2527 filt = _mm_adds_epi8(filt, work_a);
2528 filt = _mm_adds_epi8(filt, work_a);
2529 filt = _mm_adds_epi8(filt, work_a);
2530 filt = _mm_and_si128(filt, mask);
2531 filter1 = _mm_adds_epi8(filt, t4);
2532 filter2 = _mm_adds_epi8(filt, t3);
2533
2534 work_a = _mm_cmpgt_epi8(zero, filter1);
2535 filter1 = _mm_srli_epi16(filter1, 3);
2536 work_a = _mm_and_si128(work_a, te0);
2537 filter1 = _mm_and_si128(filter1, t1f);
2538 filter1 = _mm_or_si128(filter1, work_a);
2539 oq0 = _mm_xor_si128(_mm_subs_epi8(oq0, filter1), t80);
2540
2541 work_a = _mm_cmpgt_epi8(zero, filter2);
2542 filter2 = _mm_srli_epi16(filter2, 3);
2543 work_a = _mm_and_si128(work_a, te0);
2544 filter2 = _mm_and_si128(filter2, t1f);
2545 filter2 = _mm_or_si128(filter2, work_a);
2546 op0 = _mm_xor_si128(_mm_adds_epi8(op0, filter2), t80);
2547
2548 filt = _mm_adds_epi8(filter1, t1);
2549 work_a = _mm_cmpgt_epi8(zero, filt);
2550 filt = _mm_srli_epi16(filt, 1);
2551 work_a = _mm_and_si128(work_a, t80);
2552 filt = _mm_and_si128(filt, t7f);
2553 filt = _mm_or_si128(filt, work_a);
2554 filt = _mm_andnot_si128(hev, filt);
2555 op1 = _mm_xor_si128(_mm_adds_epi8(op1, filt), t80);
2556 oq1 = _mm_xor_si128(_mm_subs_epi8(oq1, filt), t80);
2557
2558 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
2559 // filter8
2560 if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
2561 const __m128i four = _mm_set1_epi16(4);
2562 const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero);
2563 const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero);
2564 const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero);
2565 const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero);
2566 const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero);
2567 const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero);
2568 const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero);
2569 const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero);
2570
2571 const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero);
2572 const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero);
2573 const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero);
2574 const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero);
2575 const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero);
2576 const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero);
2577 const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero);
2578 const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero);
2579 __m128i f8_lo, f8_hi;
2580
2581 f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, four),
2582 _mm_add_epi16(p3_lo, p2_lo));
2583 f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f8_lo),
2584 _mm_add_epi16(p2_lo, p1_lo));
2585 f8_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f8_lo);
2586
2587 f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, four),
2588 _mm_add_epi16(p3_hi, p2_hi));
2589 f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f8_hi),
2590 _mm_add_epi16(p2_hi, p1_hi));
2591 f8_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f8_hi);
2592
2593 op2 = filter8_mask(&flat, &p2, &f8_lo, &f8_hi);
2594 _mm_storeu_si128((__m128i *)(s - 3 * p), op2);
2595
2596 f8_lo = filter_add2_sub2(&f8_lo, &q1_lo, &p1_lo, &p2_lo, &p3_lo);
2597 f8_hi = filter_add2_sub2(&f8_hi, &q1_hi, &p1_hi, &p2_hi, &p3_hi);
2598 op1 = filter8_mask(&flat, &op1, &f8_lo, &f8_hi);
2599 _mm_storeu_si128((__m128i *)(s - 2 * p), op1);
2600
2601 f8_lo = filter_add2_sub2(&f8_lo, &q2_lo, &p0_lo, &p1_lo, &p3_lo);
2602 f8_hi = filter_add2_sub2(&f8_hi, &q2_hi, &p0_hi, &p1_hi, &p3_hi);
2603 op0 = filter8_mask(&flat, &op0, &f8_lo, &f8_hi);
2604 _mm_storeu_si128((__m128i *)(s - 1 * p), op0);
2605
2606 f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q0_lo, &p0_lo, &p3_lo);
2607 f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q0_hi, &p0_hi, &p3_hi);
2608 oq0 = filter8_mask(&flat, &oq0, &f8_lo, &f8_hi);
2609 _mm_storeu_si128((__m128i *)(s - 0 * p), oq0);
2610
2611 f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q1_lo, &q0_lo, &p2_lo);
2612 f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q1_hi, &q0_hi, &p2_hi);
2613 oq1 = filter8_mask(&flat, &oq1, &f8_lo, &f8_hi);
2614 _mm_storeu_si128((__m128i *)(s + 1 * p), oq1);
2615
2616 f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q2_lo, &q1_lo, &p1_lo);
2617 f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q2_hi, &q1_hi, &p1_hi);
2618 oq2 = filter8_mask(&flat, &q2, &f8_lo, &f8_hi);
2619 _mm_storeu_si128((__m128i *)(s + 2 * p), oq2);
2620 } else {
2621 _mm_storeu_si128((__m128i *)(s - 2 * p), op1);
2622 _mm_storeu_si128((__m128i *)(s - 1 * p), op0);
2623 _mm_storeu_si128((__m128i *)(s - 0 * p), oq0);
2624 _mm_storeu_si128((__m128i *)(s + 1 * p), oq1);
2625 }
2626 }
2627 }
2628
2629 void aom_lpf_horizontal_6_quad_sse2(unsigned char *s, int p,
2630 const unsigned char *_blimit0,
2631 const unsigned char *_limit0,
2632 const unsigned char *_thresh0) {
2633 const __m128i zero = _mm_setzero_si128();
2634 const __m128i one = _mm_set1_epi8(1);
2635 const __m128i blimit_v = _mm_load_si128((const __m128i *)_blimit0);
2636 const __m128i limit_v = _mm_load_si128((const __m128i *)_limit0);
2637 const __m128i thresh_v = _mm_load_si128((const __m128i *)_thresh0);
2638 __m128i mask, hev, flat;
2639 __m128i p2, p1, p0, q0, q1, q2;
2640
2641 __m128i op1, op0, oq0, oq1;
2642
2643 __m128i max_abs_p1p0q1q0;
2644
2645 p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
2646 p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
2647 p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
2648 q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
2649 q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
2650 q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
2651
2652 {
2653 const __m128i abs_p1p0 = abs_diff(p1, p0);
2654 const __m128i abs_q1q0 = abs_diff(q1, q0);
2655 const __m128i fe = _mm_set1_epi8((int8_t)0xfe);
2656 const __m128i ff = _mm_cmpeq_epi8(zero, zero);
2657 __m128i abs_p0q0 = abs_diff(p0, q0);
2658 __m128i abs_p1q1 = abs_diff(p1, q1);
2659 __m128i work;
2660 max_abs_p1p0q1q0 = _mm_max_epu8(abs_p1p0, abs_q1q0);
2661
2662 abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
2663 abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
2664 mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v);
2665 mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
2666 // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
2667 mask = _mm_max_epu8(max_abs_p1p0q1q0, mask);
2668 // mask |= (abs(p1 - p0) > limit) * -1;
2669 // mask |= (abs(q1 - q0) > limit) * -1;
2670 work = _mm_max_epu8(abs_diff(p2, p1), abs_diff(q2, q1));
2671 mask = _mm_max_epu8(work, mask);
2672 mask = _mm_subs_epu8(mask, limit_v);
2673 mask = _mm_cmpeq_epi8(mask, zero);
2674 }
2675
2676 if (0xffff == _mm_movemask_epi8(_mm_cmpeq_epi8(mask, zero))) return;
2677
2678 {
2679 __m128i work;
2680 work = _mm_max_epu8(abs_diff(p2, p0), abs_diff(q2, q0));
2681 flat = _mm_max_epu8(work, max_abs_p1p0q1q0);
2682 flat = _mm_subs_epu8(flat, one);
2683 flat = _mm_cmpeq_epi8(flat, zero);
2684 flat = _mm_and_si128(flat, mask);
2685 }
2686
2687 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
2688 // filter4
  {
    const __m128i t4 = _mm_set1_epi8(4);
    const __m128i t3 = _mm_set1_epi8(3);
    const __m128i t80 = _mm_set1_epi8((int8_t)0x80);
    const __m128i te0 = _mm_set1_epi8((int8_t)0xe0);
    const __m128i t1f = _mm_set1_epi8(0x1f);
    const __m128i t1 = _mm_set1_epi8(0x1);
    const __m128i t7f = _mm_set1_epi8(0x7f);
    const __m128i ff = _mm_cmpeq_epi8(t4, t4);

    __m128i filt;
    __m128i work_a;
    __m128i filter1, filter2;

    op1 = _mm_xor_si128(p1, t80);
    op0 = _mm_xor_si128(p0, t80);
    oq0 = _mm_xor_si128(q0, t80);
    oq1 = _mm_xor_si128(q1, t80);

    hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh_v);
    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
    filt = _mm_and_si128(_mm_subs_epi8(op1, oq1), hev);

    work_a = _mm_subs_epi8(oq0, op0);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_and_si128(filt, mask);
    filter1 = _mm_adds_epi8(filt, t4);
    filter2 = _mm_adds_epi8(filt, t3);

    work_a = _mm_cmpgt_epi8(zero, filter1);
    filter1 = _mm_srli_epi16(filter1, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter1 = _mm_and_si128(filter1, t1f);
    filter1 = _mm_or_si128(filter1, work_a);
    oq0 = _mm_xor_si128(_mm_subs_epi8(oq0, filter1), t80);

    work_a = _mm_cmpgt_epi8(zero, filter2);
    filter2 = _mm_srli_epi16(filter2, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter2 = _mm_and_si128(filter2, t1f);
    filter2 = _mm_or_si128(filter2, work_a);
    op0 = _mm_xor_si128(_mm_adds_epi8(op0, filter2), t80);

    filt = _mm_adds_epi8(filter1, t1);
    work_a = _mm_cmpgt_epi8(zero, filt);
    filt = _mm_srli_epi16(filt, 1);
    work_a = _mm_and_si128(work_a, t80);
    filt = _mm_and_si128(filt, t7f);
    filt = _mm_or_si128(filt, work_a);
    filt = _mm_andnot_si128(hev, filt);
    op1 = _mm_xor_si128(_mm_adds_epi8(op1, filt), t80);
    oq1 = _mm_xor_si128(_mm_subs_epi8(oq1, filt), t80);

    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    // filter6
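    // For lanes where flat is set, the stores below replace p1..q1 with the
    // flat 6-tap averages.  Assuming filter_add2_sub2(f, a, b, c, d) yields
    // f + a + b - c - d and filter8_mask() performs the final (sum >> 3) and
    // per-lane blend, as their uses here suggest, the running sum f8 gives:
    //   op1 = (3*p2 + 2*p1 + 2*p0 +   q0          + 4) >> 3
    //   op0 = (  p2 + 2*p1 + 2*p0 + 2*q0 +   q1   + 4) >> 3
    //   oq0 = (  p1 + 2*p0 + 2*q0 + 2*q1 +   q2   + 4) >> 3
    //   oq1 = (  p0 + 2*q0 + 2*q1 + 3*q2          + 4) >> 3
    // Only two adds and two subtracts are needed per output instead of
    // recomputing the whole sum.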
    if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
      const __m128i four = _mm_set1_epi16(4);
      const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero);
      const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero);
      const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero);
      const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero);
      const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero);
      const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero);

      const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero);
      const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero);
      const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero);
      const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero);
      const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero);
      const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero);
      __m128i f8_lo, f8_hi;

      f8_lo = _mm_add_epi16(_mm_add_epi16(p2_lo, four),
                            _mm_add_epi16(p2_lo, p2_lo));
      f8_lo = _mm_add_epi16(_mm_add_epi16(p1_lo, f8_lo),
                            _mm_add_epi16(p1_lo, p0_lo));
      f8_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f8_lo);

      f8_hi = _mm_add_epi16(_mm_add_epi16(p2_hi, four),
                            _mm_add_epi16(p2_hi, p2_hi));
      f8_hi = _mm_add_epi16(_mm_add_epi16(p1_hi, f8_hi),
                            _mm_add_epi16(p1_hi, p0_hi));
      f8_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f8_hi);

      op1 = filter8_mask(&flat, &op1, &f8_lo, &f8_hi);
      _mm_storeu_si128((__m128i *)(s - 2 * p), op1);

      f8_lo = filter_add2_sub2(&f8_lo, &q0_lo, &q1_lo, &p2_lo, &p2_lo);
      f8_hi = filter_add2_sub2(&f8_hi, &q0_hi, &q1_hi, &p2_hi, &p2_hi);
      op0 = filter8_mask(&flat, &op0, &f8_lo, &f8_hi);
      _mm_storeu_si128((__m128i *)(s - 1 * p), op0);

      f8_lo = filter_add2_sub2(&f8_lo, &q1_lo, &q2_lo, &p1_lo, &p2_lo);
      f8_hi = filter_add2_sub2(&f8_hi, &q1_hi, &q2_hi, &p1_hi, &p2_hi);
      oq0 = filter8_mask(&flat, &oq0, &f8_lo, &f8_hi);
      _mm_storeu_si128((__m128i *)(s - 0 * p), oq0);

      f8_lo = filter_add2_sub2(&f8_lo, &q2_lo, &q2_lo, &p0_lo, &p1_lo);
      f8_hi = filter_add2_sub2(&f8_hi, &q2_hi, &q2_hi, &p0_hi, &p1_hi);
      oq1 = filter8_mask(&flat, &oq1, &f8_lo, &f8_hi);
      _mm_storeu_si128((__m128i *)(s + 1 * p), oq1);
    } else {
      _mm_storeu_si128((__m128i *)(s - 2 * p), op1);
      _mm_storeu_si128((__m128i *)(s - 1 * p), op0);
      _mm_storeu_si128((__m128i *)(s - 0 * p), oq0);
      _mm_storeu_si128((__m128i *)(s + 1 * p), oq1);
    }
  }
}

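// aom_lpf_horizontal_4_quad_sse2 is the 16-pixel-wide variant of the narrow
// loop filter: it only reads p1..q1, so there is no flat/6-tap path, just the
// mask/hev decision followed by the narrow filter that adjusts p0/q0 (and
// p1/q1 in non-hev lanes).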
void aom_lpf_horizontal_4_quad_sse2(unsigned char *s, int p,
                                    const unsigned char *_blimit0,
                                    const unsigned char *_limit0,
                                    const unsigned char *_thresh0) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i blimit_v = _mm_load_si128((const __m128i *)_blimit0);
  const __m128i limit_v = _mm_load_si128((const __m128i *)_limit0);
  const __m128i thresh_v = _mm_load_si128((const __m128i *)_thresh0);
  __m128i mask, hev;
  __m128i p1, p0, q0, q1;

  __m128i op1, op0, oq0, oq1;

  __m128i max_abs_p1p0q1q0;

  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
  q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));

  {
    const __m128i abs_p1p0 = abs_diff(p1, p0);
    const __m128i abs_q1q0 = abs_diff(q1, q0);
    const __m128i fe = _mm_set1_epi8((int8_t)0xfe);
    const __m128i ff = _mm_cmpeq_epi8(zero, zero);
    __m128i abs_p0q0 = abs_diff(p0, q0);
    __m128i abs_p1q1 = abs_diff(p1, q1);
    max_abs_p1p0q1q0 = _mm_max_epu8(abs_p1p0, abs_q1q0);

    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v);
    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
    mask = _mm_max_epu8(max_abs_p1p0q1q0, mask);
    // mask |= (abs(p1 - p0) > limit) * -1;
    // mask |= (abs(q1 - q0) > limit) * -1;
    mask = _mm_subs_epu8(mask, limit_v);
    mask = _mm_cmpeq_epi8(mask, zero);
  }

  if (0xffff == _mm_movemask_epi8(_mm_cmpeq_epi8(mask, zero))) return;

  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  // filter4
  {
    const __m128i t4 = _mm_set1_epi8(4);
    const __m128i t3 = _mm_set1_epi8(3);
    const __m128i t80 = _mm_set1_epi8((int8_t)0x80);
    const __m128i te0 = _mm_set1_epi8((int8_t)0xe0);
    const __m128i t1f = _mm_set1_epi8(0x1f);
    const __m128i t1 = _mm_set1_epi8(0x1);
    const __m128i t7f = _mm_set1_epi8(0x7f);
    const __m128i ff = _mm_cmpeq_epi8(t4, t4);

    __m128i filt;
    __m128i work_a;
    __m128i filter1, filter2;

    op1 = _mm_xor_si128(p1, t80);
    op0 = _mm_xor_si128(p0, t80);
    oq0 = _mm_xor_si128(q0, t80);
    oq1 = _mm_xor_si128(q1, t80);

    hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh_v);
    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
    filt = _mm_and_si128(_mm_subs_epi8(op1, oq1), hev);

    work_a = _mm_subs_epi8(oq0, op0);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_and_si128(filt, mask);
    filter1 = _mm_adds_epi8(filt, t4);
    filter2 = _mm_adds_epi8(filt, t3);

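    // SSE2 has no 8-bit arithmetic shift, so (filter >> 3) on signed bytes is
    // emulated below: a 16-bit logical shift, a mask to the low 5 bits (t1f),
    // and OR-ing the sign bits (te0) back into lanes that were negative.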
    work_a = _mm_cmpgt_epi8(zero, filter1);
    filter1 = _mm_srli_epi16(filter1, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter1 = _mm_and_si128(filter1, t1f);
    filter1 = _mm_or_si128(filter1, work_a);
    oq0 = _mm_xor_si128(_mm_subs_epi8(oq0, filter1), t80);

    work_a = _mm_cmpgt_epi8(zero, filter2);
    filter2 = _mm_srli_epi16(filter2, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter2 = _mm_and_si128(filter2, t1f);
    filter2 = _mm_or_si128(filter2, work_a);
    op0 = _mm_xor_si128(_mm_adds_epi8(op0, filter2), t80);

    filt = _mm_adds_epi8(filter1, t1);
    work_a = _mm_cmpgt_epi8(zero, filt);
    filt = _mm_srli_epi16(filt, 1);
    work_a = _mm_and_si128(work_a, t80);
    filt = _mm_and_si128(filt, t7f);
    filt = _mm_or_si128(filt, work_a);
    filt = _mm_andnot_si128(hev, filt);
    op1 = _mm_xor_si128(_mm_adds_epi8(op1, filt), t80);
    oq1 = _mm_xor_si128(_mm_subs_epi8(oq1, filt), t80);

    _mm_storeu_si128((__m128i *)(s - 2 * p), op1);
    _mm_storeu_si128((__m128i *)(s - 1 * p), op0);
    _mm_storeu_si128((__m128i *)(s - 0 * p), oq0);
    _mm_storeu_si128((__m128i *)(s + 1 * p), oq1);
  }
}

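// The vertical quad filters reuse the horizontal kernels: the column
// neighbourhood around the edge is transposed into a contiguous buffer t_dst,
// the matching horizontal quad filter is run on t_dst, and the result is
// transposed back into the frame.  The 14-tap case needs 8 columns on each
// side, hence two 16x8 transposes building a 16x16 tile.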
void aom_lpf_vertical_14_quad_sse2(unsigned char *s, int pitch,
                                   const uint8_t *_blimit0,
                                   const uint8_t *_limit0,
                                   const uint8_t *_thresh0) {
  DECLARE_ALIGNED(16, unsigned char, t_dst[256]);

  // Transpose 16x16
  transpose_16x8(s - 8, s - 8 + 8 * pitch, pitch, t_dst, 16);
  transpose_16x8(s, s + 8 * pitch, pitch, t_dst + 8 * 16, 16);

  // Loop filtering
  aom_lpf_horizontal_14_quad(t_dst + 8 * 16, 16, _blimit0, _limit0, _thresh0);

  // Transpose back
  transpose_16x8(t_dst, t_dst + 8 * 16, 16, s - 8, pitch);
  transpose_16x8(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * pitch, pitch);
}

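// The 8-, 6- and 4-tap vertical variants only need 4 columns on each side of
// the edge, so a single 16x8 transpose into t_dst (and one 8x16 transpose
// back) is enough; the edge itself lands on row 4 of t_dst.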
void aom_lpf_vertical_8_quad_sse2(uint8_t *s, int pitch,
                                  const uint8_t *_blimit0,
                                  const uint8_t *_limit0,
                                  const uint8_t *_thresh0) {
  DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]);

  // Transpose 16x8
  transpose_16x8(s - 4, s - 4 + pitch * 8, pitch, t_dst, 16);

  // Loop filtering
  aom_lpf_horizontal_8_quad(t_dst + 4 * 16, 16, _blimit0, _limit0, _thresh0);

  // Transpose back
  transpose_16x8_to_8x16(t_dst, 16, s - 4, pitch);
}

void aom_lpf_vertical_6_quad_sse2(uint8_t *s, int pitch,
                                  const uint8_t *_blimit0,
                                  const uint8_t *_limit0,
                                  const uint8_t *_thresh0) {
  DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]);

  // Transpose 16x8:: (wxh) 8x16 to 16x8
  transpose_16x8(s - 4, s - 4 + pitch * 8, pitch, t_dst, 16);

  // Loop filtering
  aom_lpf_horizontal_6_quad(t_dst + 4 * 16, 16, _blimit0, _limit0, _thresh0);

  // Transpose back:: (wxh) 16x8 to 8x16
  transpose_16x8_to_8x16(t_dst, 16, s - 4, pitch);
}

void aom_lpf_vertical_4_quad_sse2(uint8_t *s, int pitch,
                                  const uint8_t *_blimit0,
                                  const uint8_t *_limit0,
                                  const uint8_t *_thresh0) {
  DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]);

  // Transpose 16x8
  transpose_16x8(s - 4, s - 4 + pitch * 8, pitch, t_dst, 16);

  // Loop filtering
  aom_lpf_horizontal_4_quad_sse2(t_dst + 4 * 16, 16, _blimit0, _limit0,
                                 _thresh0);

  // Transpose back
  transpose_16x8_to_8x16(t_dst, 16, s - 4, pitch);
}