/*
 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <emmintrin.h>  // SSE2

#include "./vpx_dsp_rtcd.h"
#include "vpx_ports/mem.h"

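// Clamp each 16-bit lane of |value| to the signed range implied by the bit
// depth: [-128, 127] for bd == 8, [-512, 511] for bd == 10 and
// [-2048, 2047] for bd == 12 (the high bit depth analogue of a signed char
// clamp).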
static INLINE __m128i signed_char_clamp_bd_sse2(__m128i value, int bd) {
  __m128i ubounded;
  __m128i lbounded;
  __m128i retval;

  const __m128i zero = _mm_set1_epi16(0);
  const __m128i one = _mm_set1_epi16(1);
  __m128i t80, max, min;

  if (bd == 8) {
    t80 = _mm_set1_epi16(0x80);
    max = _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, 8), one), t80);
  } else if (bd == 10) {
    t80 = _mm_set1_epi16(0x200);
    max = _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, 10), one), t80);
  } else {  // bd == 12
    t80 = _mm_set1_epi16(0x800);
    max = _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, 12), one), t80);
  }

  min = _mm_subs_epi16(zero, t80);

  ubounded = _mm_cmpgt_epi16(value, max);
  lbounded = _mm_cmplt_epi16(value, min);
  retval = _mm_andnot_si128(_mm_or_si128(ubounded, lbounded), value);
  ubounded = _mm_and_si128(ubounded, max);
  lbounded = _mm_and_si128(lbounded, min);
  retval = _mm_or_si128(retval, ubounded);
  retval = _mm_or_si128(retval, lbounded);
  return retval;
}

// TODO(debargha, peter): Break up large functions into smaller ones
// in this file.
void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int pitch,
                                       const uint8_t *blimit,
                                       const uint8_t *limit,
                                       const uint8_t *thresh, int bd) {
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i one = _mm_set1_epi16(1);
  __m128i blimit_v, limit_v, thresh_v;
  __m128i q7, p7, q6, p6, q5, p5, q4, p4, q3, p3, q2, p2, q1, p1, q0, p0;
  __m128i mask, hev, flat, flat2, abs_p1p0, abs_q1q0;
  __m128i ps1, qs1, ps0, qs0;
  __m128i abs_p0q0, abs_p1q1, ffff, work;
  __m128i filt, work_a, filter1, filter2;
  __m128i flat2_q6, flat2_p6, flat2_q5, flat2_p5, flat2_q4, flat2_p4;
  __m128i flat2_q3, flat2_p3, flat2_q2, flat2_p2, flat2_q1, flat2_p1;
  __m128i flat2_q0, flat2_p0;
  __m128i flat_q2, flat_p2, flat_q1, flat_p1, flat_q0, flat_p0;
  __m128i pixelFilter_p, pixelFilter_q;
  __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
  __m128i sum_p7, sum_q7, sum_p3, sum_q3;
  __m128i t4, t3, t80, t1;
  __m128i eight, four;

  if (bd == 8) {
    blimit_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero);
    limit_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero);
    thresh_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero);
  } else if (bd == 10) {
    blimit_v = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero), 2);
    limit_v = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero), 2);
    thresh_v = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero), 2);
  } else {  // bd == 12
    blimit_v = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero), 4);
    limit_v = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero), 4);
    thresh_v = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero), 4);
  }

  q4 = _mm_load_si128((__m128i *)(s + 4 * pitch));
  p4 = _mm_load_si128((__m128i *)(s - 5 * pitch));
  q3 = _mm_load_si128((__m128i *)(s + 3 * pitch));
  p3 = _mm_load_si128((__m128i *)(s - 4 * pitch));
  q2 = _mm_load_si128((__m128i *)(s + 2 * pitch));
  p2 = _mm_load_si128((__m128i *)(s - 3 * pitch));
  q1 = _mm_load_si128((__m128i *)(s + 1 * pitch));
  p1 = _mm_load_si128((__m128i *)(s - 2 * pitch));
  q0 = _mm_load_si128((__m128i *)(s + 0 * pitch));
  p0 = _mm_load_si128((__m128i *)(s - 1 * pitch));

  // highbd_filter_mask
  abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1));
  abs_q1q0 = _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1));

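  // Comparing a register with itself yields all ones (0xffff in every lane).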
  ffff = _mm_cmpeq_epi16(abs_p1p0, abs_p1p0);

  abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0));
  abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1), _mm_subs_epu16(q1, p1));

  // highbd_hev_mask (in C code this is actually called from highbd_filter4)
  flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
  hev = _mm_subs_epu16(flat, thresh_v);
  hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);

  abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);  // abs(p0 - q0) * 2
  abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);         // abs(p1 - q1) / 2
  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit_v);
  mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
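  // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
  // So taking maximums continues to work: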
  mask = _mm_and_si128(mask, _mm_adds_epu16(limit_v, one));
  work = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1)),
      _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1)));
  mask = _mm_max_epi16(work, mask);
  work = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(p2, p1), _mm_subs_epu16(p1, p2)),
      _mm_or_si128(_mm_subs_epu16(q2, q1), _mm_subs_epu16(q1, q2)));
  mask = _mm_max_epi16(work, mask);
  work = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(p3, p2), _mm_subs_epu16(p2, p3)),
      _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3)));
  mask = _mm_max_epi16(work, mask);

  mask = _mm_subs_epu16(mask, limit_v);
  mask = _mm_cmpeq_epi16(mask, zero);  // return ~mask

  // lp filter
  // highbd_filter4
  t4 = _mm_set1_epi16(4);
  t3 = _mm_set1_epi16(3);
  if (bd == 8)
    t80 = _mm_set1_epi16(0x80);
  else if (bd == 10)
    t80 = _mm_set1_epi16(0x200);
  else  // bd == 12
    t80 = _mm_set1_epi16(0x800);

  t1 = _mm_set1_epi16(0x1);

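  // Shift pixels into the "signed" domain used by filter4 by subtracting the
  // bit-depth-scaled 0x80 (t80); t80 is added back after filtering.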
  ps1 = _mm_subs_epi16(p1, t80);
  qs1 = _mm_subs_epi16(q1, t80);
  ps0 = _mm_subs_epi16(p0, t80);
  qs0 = _mm_subs_epi16(q0, t80);

  filt = _mm_and_si128(signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd),
                       hev);
  work_a = _mm_subs_epi16(qs0, ps0);
  filt = _mm_adds_epi16(filt, work_a);
  filt = _mm_adds_epi16(filt, work_a);
  filt = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, work_a), bd);
  filt = _mm_and_si128(filt, mask);
  filter1 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t4), bd);
  filter2 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t3), bd);

  // Filter1 >> 3
  filter1 = _mm_srai_epi16(filter1, 0x3);
  filter2 = _mm_srai_epi16(filter2, 0x3);

  qs0 = _mm_adds_epi16(
      signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd), t80);
  ps0 = _mm_adds_epi16(
      signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd), t80);
  filt = _mm_adds_epi16(filter1, t1);
  filt = _mm_srai_epi16(filt, 1);
  filt = _mm_andnot_si128(hev, filt);
  qs1 = _mm_adds_epi16(signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd),
                       t80);
  ps1 = _mm_adds_epi16(signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd),
                       t80);

  // end highbd_filter4
  // loopfilter done

  // highbd_flat_mask4
  flat = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(p2, p0), _mm_subs_epu16(p0, p2)),
      _mm_or_si128(_mm_subs_epu16(p3, p0), _mm_subs_epu16(p0, p3)));
  work = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(q2, q0), _mm_subs_epu16(q0, q2)),
      _mm_or_si128(_mm_subs_epu16(q3, q0), _mm_subs_epu16(q0, q3)));
  flat = _mm_max_epi16(work, flat);
  work = _mm_max_epi16(abs_p1p0, abs_q1q0);
  flat = _mm_max_epi16(work, flat);

  if (bd == 8)
    flat = _mm_subs_epu16(flat, one);
  else if (bd == 10)
    flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 2));
  else  // bd == 12
    flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 4));

  flat = _mm_cmpeq_epi16(flat, zero);
  // end flat_mask4

  // flat & mask = flat && mask (as used in filter8)
  // (because, in both vars, each block of 16 either all 1s or all 0s)
  flat = _mm_and_si128(flat, mask);

  p5 = _mm_load_si128((__m128i *)(s - 6 * pitch));
  q5 = _mm_load_si128((__m128i *)(s + 5 * pitch));
  p6 = _mm_load_si128((__m128i *)(s - 7 * pitch));
  q6 = _mm_load_si128((__m128i *)(s + 6 * pitch));
  p7 = _mm_load_si128((__m128i *)(s - 8 * pitch));
  q7 = _mm_load_si128((__m128i *)(s + 7 * pitch));

  // highbd_flat_mask5 (arguments passed in are p0, q0, p4-p7, q4-q7
  // but referred to as p0-p4 & q0-q4 in fn)
  flat2 = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(p4, p0), _mm_subs_epu16(p0, p4)),
      _mm_or_si128(_mm_subs_epu16(q4, q0), _mm_subs_epu16(q0, q4)));

  work = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(p5, p0), _mm_subs_epu16(p0, p5)),
      _mm_or_si128(_mm_subs_epu16(q5, q0), _mm_subs_epu16(q0, q5)));
  flat2 = _mm_max_epi16(work, flat2);

  work = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(p6, p0), _mm_subs_epu16(p0, p6)),
      _mm_or_si128(_mm_subs_epu16(q6, q0), _mm_subs_epu16(q0, q6)));
  flat2 = _mm_max_epi16(work, flat2);

  work = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(p7, p0), _mm_subs_epu16(p0, p7)),
      _mm_or_si128(_mm_subs_epu16(q7, q0), _mm_subs_epu16(q0, q7)));
  flat2 = _mm_max_epi16(work, flat2);

  if (bd == 8)
    flat2 = _mm_subs_epu16(flat2, one);
  else if (bd == 10)
    flat2 = _mm_subs_epu16(flat2, _mm_slli_epi16(one, 2));
  else  // bd == 12
    flat2 = _mm_subs_epu16(flat2, _mm_slli_epi16(one, 4));

  flat2 = _mm_cmpeq_epi16(flat2, zero);
  flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
  // end highbd_flat_mask5

  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  // flat and wide flat calculations
  eight = _mm_set1_epi16(8);
  four = _mm_set1_epi16(4);

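  // Running sums for the 7-tap (filter8) and 15-tap (filter16) smoothing
  // filters; each output below is ROUND_POWER_OF_TWO(sum, 3) or
  // ROUND_POWER_OF_TWO(sum, 4), with the sums updated incrementally as the
  // window slides outwards from p0/q0.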
  pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6, p5), _mm_add_epi16(p4, p3));
  pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6, q5), _mm_add_epi16(q4, q3));

  pixetFilter_p2p1p0 = _mm_add_epi16(p0, _mm_add_epi16(p2, p1));
  pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);

  pixetFilter_q2q1q0 = _mm_add_epi16(q0, _mm_add_epi16(q2, q1));
  pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
  pixelFilter_p =
      _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q));
  pixetFilter_p2p1p0 = _mm_add_epi16(
      four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
  flat2_p0 =
      _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7, p0)), 4);
  flat2_q0 =
      _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7, q0)), 4);
  flat_p0 = _mm_srli_epi16(
      _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3, p0)), 3);
  flat_q0 = _mm_srli_epi16(
      _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3, q0)), 3);

  sum_p7 = _mm_add_epi16(p7, p7);
  sum_q7 = _mm_add_epi16(q7, q7);
  sum_p3 = _mm_add_epi16(p3, p3);
  sum_q3 = _mm_add_epi16(q3, q3);

  pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6);
  pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6);
  flat2_p1 = _mm_srli_epi16(
      _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1)), 4);
  flat2_q1 = _mm_srli_epi16(
      _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1)), 4);

  pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2);
  pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2);
  flat_p1 = _mm_srli_epi16(
      _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1)), 3);
  flat_q1 = _mm_srli_epi16(
      _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1)), 3);

  sum_p7 = _mm_add_epi16(sum_p7, p7);
  sum_q7 = _mm_add_epi16(sum_q7, q7);
  sum_p3 = _mm_add_epi16(sum_p3, p3);
  sum_q3 = _mm_add_epi16(sum_q3, q3);

  pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5);
  pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5);
  flat2_p2 = _mm_srli_epi16(
      _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2)), 4);
  flat2_q2 = _mm_srli_epi16(
      _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2)), 4);

  pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1);
  pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1);
  flat_p2 = _mm_srli_epi16(
      _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2)), 3);
  flat_q2 = _mm_srli_epi16(
      _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2)), 3);

  sum_p7 = _mm_add_epi16(sum_p7, p7);
  sum_q7 = _mm_add_epi16(sum_q7, q7);
  pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4);
  pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4);
  flat2_p3 = _mm_srli_epi16(
      _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3)), 4);
  flat2_q3 = _mm_srli_epi16(
      _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3)), 4);

  sum_p7 = _mm_add_epi16(sum_p7, p7);
  sum_q7 = _mm_add_epi16(sum_q7, q7);
  pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3);
  pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3);
  flat2_p4 = _mm_srli_epi16(
      _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4)), 4);
  flat2_q4 = _mm_srli_epi16(
      _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4)), 4);

  sum_p7 = _mm_add_epi16(sum_p7, p7);
  sum_q7 = _mm_add_epi16(sum_q7, q7);
  pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2);
  pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2);
  flat2_p5 = _mm_srli_epi16(
      _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5)), 4);
  flat2_q5 = _mm_srli_epi16(
      _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5)), 4);

  sum_p7 = _mm_add_epi16(sum_p7, p7);
  sum_q7 = _mm_add_epi16(sum_q7, q7);
  pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1);
  pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1);
  flat2_p6 = _mm_srli_epi16(
      _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6)), 4);
  flat2_q6 = _mm_srli_epi16(
      _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6)), 4);

  // wide flat
  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

  // highbd_filter8
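  // Branchless select: out = (flat & filtered) | (~flat & original); the same
  // and/andnot/or pattern is used for every output row below.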
  p2 = _mm_andnot_si128(flat, p2);
  // p2 remains unchanged if !(flat && mask)
  flat_p2 = _mm_and_si128(flat, flat_p2);
  // when (flat && mask)
  p2 = _mm_or_si128(p2, flat_p2);  // full list of p2 values
  q2 = _mm_andnot_si128(flat, q2);
  flat_q2 = _mm_and_si128(flat, flat_q2);
  q2 = _mm_or_si128(q2, flat_q2);  // full list of q2 values

  ps1 = _mm_andnot_si128(flat, ps1);
  // p1 takes the value assigned to it in filter4 if !(flat && mask)
  flat_p1 = _mm_and_si128(flat, flat_p1);
  // when (flat && mask)
  p1 = _mm_or_si128(ps1, flat_p1);  // full list of p1 values
  qs1 = _mm_andnot_si128(flat, qs1);
  flat_q1 = _mm_and_si128(flat, flat_q1);
  q1 = _mm_or_si128(qs1, flat_q1);  // full list of q1 values

  ps0 = _mm_andnot_si128(flat, ps0);
  // p0 takes the value assigned to it in filter4 if !(flat && mask)
  flat_p0 = _mm_and_si128(flat, flat_p0);
  // when (flat && mask)
  p0 = _mm_or_si128(ps0, flat_p0);  // full list of p0 values
  qs0 = _mm_andnot_si128(flat, qs0);
  flat_q0 = _mm_and_si128(flat, flat_q0);
  q0 = _mm_or_si128(qs0, flat_q0);  // full list of q0 values
  // end highbd_filter8

  // highbd_filter16
  p6 = _mm_andnot_si128(flat2, p6);
  // p6 remains unchanged if !(flat2 && flat && mask)
  flat2_p6 = _mm_and_si128(flat2, flat2_p6);
  // get values for when (flat2 && flat && mask)
  p6 = _mm_or_si128(p6, flat2_p6);  // full list of p6 values
  q6 = _mm_andnot_si128(flat2, q6);
  // q6 remains unchanged if !(flat2 && flat && mask)
  flat2_q6 = _mm_and_si128(flat2, flat2_q6);
  // get values for when (flat2 && flat && mask)
  q6 = _mm_or_si128(q6, flat2_q6);  // full list of q6 values
  _mm_store_si128((__m128i *)(s - 7 * pitch), p6);
  _mm_store_si128((__m128i *)(s + 6 * pitch), q6);

  p5 = _mm_andnot_si128(flat2, p5);
  // p5 remains unchanged if !(flat2 && flat && mask)
  flat2_p5 = _mm_and_si128(flat2, flat2_p5);
  // get values for when (flat2 && flat && mask)
  p5 = _mm_or_si128(p5, flat2_p5);
  // full list of p5 values
  q5 = _mm_andnot_si128(flat2, q5);
  // q5 remains unchanged if !(flat2 && flat && mask)
  flat2_q5 = _mm_and_si128(flat2, flat2_q5);
  // get values for when (flat2 && flat && mask)
  q5 = _mm_or_si128(q5, flat2_q5);
  // full list of q5 values
  _mm_store_si128((__m128i *)(s - 6 * pitch), p5);
  _mm_store_si128((__m128i *)(s + 5 * pitch), q5);

  p4 = _mm_andnot_si128(flat2, p4);
  // p4 remains unchanged if !(flat2 && flat && mask)
  flat2_p4 = _mm_and_si128(flat2, flat2_p4);
  // get values for when (flat2 && flat && mask)
  p4 = _mm_or_si128(p4, flat2_p4);  // full list of p4 values
  q4 = _mm_andnot_si128(flat2, q4);
  // q4 remains unchanged if !(flat2 && flat && mask)
  flat2_q4 = _mm_and_si128(flat2, flat2_q4);
  // get values for when (flat2 && flat && mask)
  q4 = _mm_or_si128(q4, flat2_q4);  // full list of q4 values
  _mm_store_si128((__m128i *)(s - 5 * pitch), p4);
  _mm_store_si128((__m128i *)(s + 4 * pitch), q4);

  p3 = _mm_andnot_si128(flat2, p3);
  // p3 takes value from highbd_filter8 if !(flat2 && flat && mask)
  flat2_p3 = _mm_and_si128(flat2, flat2_p3);
  // get values for when (flat2 && flat && mask)
  p3 = _mm_or_si128(p3, flat2_p3);  // full list of p3 values
  q3 = _mm_andnot_si128(flat2, q3);
  // q3 takes value from highbd_filter8 if !(flat2 && flat && mask)
  flat2_q3 = _mm_and_si128(flat2, flat2_q3);
  // get values for when (flat2 && flat && mask)
  q3 = _mm_or_si128(q3, flat2_q3);  // full list of q3 values
  _mm_store_si128((__m128i *)(s - 4 * pitch), p3);
  _mm_store_si128((__m128i *)(s + 3 * pitch), q3);

  p2 = _mm_andnot_si128(flat2, p2);
  // p2 takes value from highbd_filter8 if !(flat2 && flat && mask)
  flat2_p2 = _mm_and_si128(flat2, flat2_p2);
  // get values for when (flat2 && flat && mask)
  p2 = _mm_or_si128(p2, flat2_p2);
  // full list of p2 values
  q2 = _mm_andnot_si128(flat2, q2);
  // q2 takes value from highbd_filter8 if !(flat2 && flat && mask)
  flat2_q2 = _mm_and_si128(flat2, flat2_q2);
  // get values for when (flat2 && flat && mask)
  q2 = _mm_or_si128(q2, flat2_q2);  // full list of q2 values
  _mm_store_si128((__m128i *)(s - 3 * pitch), p2);
  _mm_store_si128((__m128i *)(s + 2 * pitch), q2);

  p1 = _mm_andnot_si128(flat2, p1);
  // p1 takes value from highbd_filter8 if !(flat2 && flat && mask)
  flat2_p1 = _mm_and_si128(flat2, flat2_p1);
  // get values for when (flat2 && flat && mask)
  p1 = _mm_or_si128(p1, flat2_p1);  // full list of p1 values
  q1 = _mm_andnot_si128(flat2, q1);
  // q1 takes value from highbd_filter8 if !(flat2 && flat && mask)
  flat2_q1 = _mm_and_si128(flat2, flat2_q1);
  // get values for when (flat2 && flat && mask)
  q1 = _mm_or_si128(q1, flat2_q1);  // full list of q1 values
  _mm_store_si128((__m128i *)(s - 2 * pitch), p1);
  _mm_store_si128((__m128i *)(s + 1 * pitch), q1);

  p0 = _mm_andnot_si128(flat2, p0);
  // p0 takes value from highbd_filter8 if !(flat2 && flat && mask)
  flat2_p0 = _mm_and_si128(flat2, flat2_p0);
  // get values for when (flat2 && flat && mask)
  p0 = _mm_or_si128(p0, flat2_p0);  // full list of p0 values
  q0 = _mm_andnot_si128(flat2, q0);
  // q0 takes value from highbd_filter8 if !(flat2 && flat && mask)
  flat2_q0 = _mm_and_si128(flat2, flat2_q0);
  // get values for when (flat2 && flat && mask)
  q0 = _mm_or_si128(q0, flat2_q0);  // full list of q0 values
  _mm_store_si128((__m128i *)(s - 1 * pitch), p0);
  _mm_store_si128((__m128i *)(s - 0 * pitch), q0);
}

void vpx_highbd_lpf_horizontal_16_dual_sse2(uint16_t *s, int pitch,
                                            const uint8_t *blimit,
                                            const uint8_t *limit,
                                            const uint8_t *thresh, int bd) {
  vpx_highbd_lpf_horizontal_16_sse2(s, pitch, blimit, limit, thresh, bd);
  vpx_highbd_lpf_horizontal_16_sse2(s + 8, pitch, blimit, limit, thresh, bd);
}

void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int pitch,
                                      const uint8_t *blimit,
                                      const uint8_t *limit,
                                      const uint8_t *thresh, int bd) {
  DECLARE_ALIGNED(16, uint16_t, flat_op2[16]);
  DECLARE_ALIGNED(16, uint16_t, flat_op1[16]);
  DECLARE_ALIGNED(16, uint16_t, flat_op0[16]);
  DECLARE_ALIGNED(16, uint16_t, flat_oq2[16]);
  DECLARE_ALIGNED(16, uint16_t, flat_oq1[16]);
  DECLARE_ALIGNED(16, uint16_t, flat_oq0[16]);
  const __m128i zero = _mm_set1_epi16(0);
  __m128i blimit_v, limit_v, thresh_v;
  __m128i mask, hev, flat;
  __m128i p3 = _mm_load_si128((__m128i *)(s - 4 * pitch));
  __m128i q3 = _mm_load_si128((__m128i *)(s + 3 * pitch));
  __m128i p2 = _mm_load_si128((__m128i *)(s - 3 * pitch));
  __m128i q2 = _mm_load_si128((__m128i *)(s + 2 * pitch));
  __m128i p1 = _mm_load_si128((__m128i *)(s - 2 * pitch));
  __m128i q1 = _mm_load_si128((__m128i *)(s + 1 * pitch));
  __m128i p0 = _mm_load_si128((__m128i *)(s - 1 * pitch));
  __m128i q0 = _mm_load_si128((__m128i *)(s + 0 * pitch));
  const __m128i one = _mm_set1_epi16(1);
  const __m128i ffff = _mm_cmpeq_epi16(one, one);
  __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
  const __m128i four = _mm_set1_epi16(4);
  __m128i workp_a, workp_b, workp_shft;

  const __m128i t4 = _mm_set1_epi16(4);
  const __m128i t3 = _mm_set1_epi16(3);
  __m128i t80;
  const __m128i t1 = _mm_set1_epi16(0x1);
  __m128i ps1, ps0, qs0, qs1;
  __m128i filt;
  __m128i work_a;
  __m128i filter1, filter2;

  if (bd == 8) {
    blimit_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero);
    limit_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero);
    thresh_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero);
    t80 = _mm_set1_epi16(0x80);
  } else if (bd == 10) {
    blimit_v = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero), 2);
    limit_v = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero), 2);
    thresh_v = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero), 2);
    t80 = _mm_set1_epi16(0x200);
  } else {  // bd == 12
    blimit_v = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero), 4);
    limit_v = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero), 4);
    thresh_v = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero), 4);
    t80 = _mm_set1_epi16(0x800);
  }

  ps1 = _mm_subs_epi16(p1, t80);
  ps0 = _mm_subs_epi16(p0, t80);
  qs0 = _mm_subs_epi16(q0, t80);
  qs1 = _mm_subs_epi16(q1, t80);

  // filter_mask and hev_mask
  abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1));
  abs_q1q0 = _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1));

  abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0));
  abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1), _mm_subs_epu16(q1, p1));
  flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
  hev = _mm_subs_epu16(flat, thresh_v);
  hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);

  abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
  abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit_v);
  mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
  // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
  // So taking maximums continues to work:
  mask = _mm_and_si128(mask, _mm_adds_epu16(limit_v, one));
  mask = _mm_max_epi16(abs_p1p0, mask);
  // mask |= (abs(p1 - p0) > limit) * -1;
  mask = _mm_max_epi16(abs_q1q0, mask);
  // mask |= (abs(q1 - q0) > limit) * -1;

  work = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(p2, p1), _mm_subs_epu16(p1, p2)),
      _mm_or_si128(_mm_subs_epu16(q2, q1), _mm_subs_epu16(q1, q2)));
  mask = _mm_max_epi16(work, mask);
  work = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(p3, p2), _mm_subs_epu16(p2, p3)),
      _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3)));
  mask = _mm_max_epi16(work, mask);
  mask = _mm_subs_epu16(mask, limit_v);
  mask = _mm_cmpeq_epi16(mask, zero);

  // flat_mask4
  flat = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(p2, p0), _mm_subs_epu16(p0, p2)),
      _mm_or_si128(_mm_subs_epu16(q2, q0), _mm_subs_epu16(q0, q2)));
  work = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(p3, p0), _mm_subs_epu16(p0, p3)),
      _mm_or_si128(_mm_subs_epu16(q3, q0), _mm_subs_epu16(q0, q3)));
  flat = _mm_max_epi16(work, flat);
  flat = _mm_max_epi16(abs_p1p0, flat);
  flat = _mm_max_epi16(abs_q1q0, flat);

  if (bd == 8)
    flat = _mm_subs_epu16(flat, one);
  else if (bd == 10)
    flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 2));
  else  // bd == 12
    flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 4));

  flat = _mm_cmpeq_epi16(flat, zero);
  flat = _mm_and_si128(flat, mask);  // flat & mask

  // Added before shift for rounding part of ROUND_POWER_OF_TWO

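  // filter8: each output is ROUND_POWER_OF_TWO() of an 8-value window around
  // the pixel; workp_a/workp_b carry the running sum as the window slides.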
  workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
  workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
  workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
  _mm_store_si128((__m128i *)&flat_op2[0], workp_shft);

  workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
  _mm_store_si128((__m128i *)&flat_op1[0], workp_shft);

  workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
  workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
  _mm_store_si128((__m128i *)&flat_op0[0], workp_shft);

  workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
  workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
  _mm_store_si128((__m128i *)&flat_oq0[0], workp_shft);

  workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
  workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
  _mm_store_si128((__m128i *)&flat_oq1[0], workp_shft);

  workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
  workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
  _mm_store_si128((__m128i *)&flat_oq2[0], workp_shft);

  // lp filter
  filt = signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd);
  filt = _mm_and_si128(filt, hev);
  work_a = _mm_subs_epi16(qs0, ps0);
  filt = _mm_adds_epi16(filt, work_a);
  filt = _mm_adds_epi16(filt, work_a);
  filt = _mm_adds_epi16(filt, work_a);
  // (vpx_filter + 3 * (qs0 - ps0)) & mask
  filt = signed_char_clamp_bd_sse2(filt, bd);
  filt = _mm_and_si128(filt, mask);

  filter1 = _mm_adds_epi16(filt, t4);
  filter2 = _mm_adds_epi16(filt, t3);

  // Filter1 >> 3
  filter1 = signed_char_clamp_bd_sse2(filter1, bd);
  filter1 = _mm_srai_epi16(filter1, 3);

  // Filter2 >> 3
  filter2 = signed_char_clamp_bd_sse2(filter2, bd);
  filter2 = _mm_srai_epi16(filter2, 3);

  // filt >> 1
  filt = _mm_adds_epi16(filter1, t1);
  filt = _mm_srai_epi16(filt, 1);
  // filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev;
  filt = _mm_andnot_si128(hev, filt);

  work_a = signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd);
  work_a = _mm_adds_epi16(work_a, t80);
  q0 = _mm_load_si128((__m128i *)flat_oq0);
  work_a = _mm_andnot_si128(flat, work_a);
  q0 = _mm_and_si128(flat, q0);
  q0 = _mm_or_si128(work_a, q0);

  work_a = signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd);
  work_a = _mm_adds_epi16(work_a, t80);
  q1 = _mm_load_si128((__m128i *)flat_oq1);
  work_a = _mm_andnot_si128(flat, work_a);
  q1 = _mm_and_si128(flat, q1);
  q1 = _mm_or_si128(work_a, q1);

  work_a = _mm_loadu_si128((__m128i *)(s + 2 * pitch));
  q2 = _mm_load_si128((__m128i *)flat_oq2);
  work_a = _mm_andnot_si128(flat, work_a);
  q2 = _mm_and_si128(flat, q2);
  q2 = _mm_or_si128(work_a, q2);

  work_a = signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd);
  work_a = _mm_adds_epi16(work_a, t80);
  p0 = _mm_load_si128((__m128i *)flat_op0);
  work_a = _mm_andnot_si128(flat, work_a);
  p0 = _mm_and_si128(flat, p0);
  p0 = _mm_or_si128(work_a, p0);

  work_a = signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd);
  work_a = _mm_adds_epi16(work_a, t80);
  p1 = _mm_load_si128((__m128i *)flat_op1);
  work_a = _mm_andnot_si128(flat, work_a);
  p1 = _mm_and_si128(flat, p1);
  p1 = _mm_or_si128(work_a, p1);

  work_a = _mm_loadu_si128((__m128i *)(s - 3 * pitch));
  p2 = _mm_load_si128((__m128i *)flat_op2);
  work_a = _mm_andnot_si128(flat, work_a);
  p2 = _mm_and_si128(flat, p2);
  p2 = _mm_or_si128(work_a, p2);

  _mm_store_si128((__m128i *)(s - 3 * pitch), p2);
  _mm_store_si128((__m128i *)(s - 2 * pitch), p1);
  _mm_store_si128((__m128i *)(s - 1 * pitch), p0);
  _mm_store_si128((__m128i *)(s + 0 * pitch), q0);
  _mm_store_si128((__m128i *)(s + 1 * pitch), q1);
  _mm_store_si128((__m128i *)(s + 2 * pitch), q2);
}

void vpx_highbd_lpf_horizontal_8_dual_sse2(
    uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
    const uint8_t *thresh1, int bd) {
  vpx_highbd_lpf_horizontal_8_sse2(s, pitch, blimit0, limit0, thresh0, bd);
  vpx_highbd_lpf_horizontal_8_sse2(s + 8, pitch, blimit1, limit1, thresh1, bd);
}

void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int pitch,
                                      const uint8_t *blimit,
                                      const uint8_t *limit,
                                      const uint8_t *thresh, int bd) {
  const __m128i zero = _mm_set1_epi16(0);
  __m128i blimit_v, limit_v, thresh_v;
  __m128i mask, hev, flat;
  __m128i p3 = _mm_loadu_si128((__m128i *)(s - 4 * pitch));
  __m128i p2 = _mm_loadu_si128((__m128i *)(s - 3 * pitch));
  __m128i p1 = _mm_loadu_si128((__m128i *)(s - 2 * pitch));
  __m128i p0 = _mm_loadu_si128((__m128i *)(s - 1 * pitch));
  __m128i q0 = _mm_loadu_si128((__m128i *)(s - 0 * pitch));
  __m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * pitch));
  __m128i q2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch));
  __m128i q3 = _mm_loadu_si128((__m128i *)(s + 3 * pitch));
  const __m128i abs_p1p0 =
      _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1));
  const __m128i abs_q1q0 =
      _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1));
  const __m128i ffff = _mm_cmpeq_epi16(abs_p1p0, abs_p1p0);
  const __m128i one = _mm_set1_epi16(1);
  __m128i abs_p0q0 =
      _mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0));
  __m128i abs_p1q1 =
      _mm_or_si128(_mm_subs_epu16(p1, q1), _mm_subs_epu16(q1, p1));
  __m128i work;
  const __m128i t4 = _mm_set1_epi16(4);
  const __m128i t3 = _mm_set1_epi16(3);
  __m128i t80;
  __m128i tff80;
  __m128i tffe0;
  __m128i t1f;
  // equivalent to shifting 0x1f left by bitdepth - 8
  // and setting new bits to 1
  const __m128i t1 = _mm_set1_epi16(0x1);
  __m128i t7f;
  // equivalent to shifting 0x7f left by bitdepth - 8
  // and setting new bits to 1
  __m128i ps1, ps0, qs0, qs1;
  __m128i filt;
  __m128i work_a;
  __m128i filter1, filter2;

  if (bd == 8) {
    blimit_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero);
    limit_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero);
    thresh_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero);
    t80 = _mm_set1_epi16(0x80);
    tff80 = _mm_set1_epi16((int16_t)0xff80);
    tffe0 = _mm_set1_epi16((int16_t)0xffe0);
    t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 8);
    t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 8);
  } else if (bd == 10) {
    blimit_v = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero), 2);
    limit_v = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero), 2);
    thresh_v = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero), 2);
    t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), 2);
    tff80 = _mm_slli_epi16(_mm_set1_epi16((int16_t)0xff80), 2);
    tffe0 = _mm_slli_epi16(_mm_set1_epi16((int16_t)0xffe0), 2);
    t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 6);
    t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 6);
  } else {  // bd == 12
    blimit_v = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero), 4);
    limit_v = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero), 4);
    thresh_v = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero), 4);
    t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), 4);
    tff80 = _mm_slli_epi16(_mm_set1_epi16((int16_t)0xff80), 4);
    tffe0 = _mm_slli_epi16(_mm_set1_epi16((int16_t)0xffe0), 4);
    t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 4);
    t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 4);
  }

  ps1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 2 * pitch)), t80);
  ps0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 1 * pitch)), t80);
  qs0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 0 * pitch)), t80);
  qs1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 1 * pitch)), t80);

  // filter_mask and hev_mask
  flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
  hev = _mm_subs_epu16(flat, thresh_v);
  hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);

  abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
  abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit_v);
  mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
  // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
  // So taking maximums continues to work:
  mask = _mm_and_si128(mask, _mm_adds_epu16(limit_v, one));
  mask = _mm_max_epi16(flat, mask);
  // mask |= (abs(p1 - p0) > limit) * -1;
  // mask |= (abs(q1 - q0) > limit) * -1;
  work = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(p2, p1), _mm_subs_epu16(p1, p2)),
      _mm_or_si128(_mm_subs_epu16(p3, p2), _mm_subs_epu16(p2, p3)));
  mask = _mm_max_epi16(work, mask);
  work = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(q2, q1), _mm_subs_epu16(q1, q2)),
      _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3)));
  mask = _mm_max_epi16(work, mask);
  mask = _mm_subs_epu16(mask, limit_v);
  mask = _mm_cmpeq_epi16(mask, zero);

  // filter4
  filt = signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd);
  filt = _mm_and_si128(filt, hev);
  work_a = _mm_subs_epi16(qs0, ps0);
  filt = _mm_adds_epi16(filt, work_a);
  filt = _mm_adds_epi16(filt, work_a);
  filt = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, work_a), bd);

  // (vpx_filter + 3 * (qs0 - ps0)) & mask
  filt = _mm_and_si128(filt, mask);

  filter1 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t4), bd);
  filter2 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t3), bd);

  // Filter1 >> 3
  work_a = _mm_cmpgt_epi16(zero, filter1);  // get the values that are <0
  filter1 = _mm_srli_epi16(filter1, 3);
  work_a = _mm_and_si128(work_a, tffe0);    // sign bits for the values < 0
  filter1 = _mm_and_si128(filter1, t1f);    // clamp the range
  filter1 = _mm_or_si128(filter1, work_a);  // reinsert the sign bits

  // Filter2 >> 3
  work_a = _mm_cmpgt_epi16(zero, filter2);
  filter2 = _mm_srli_epi16(filter2, 3);
  work_a = _mm_and_si128(work_a, tffe0);
  filter2 = _mm_and_si128(filter2, t1f);
  filter2 = _mm_or_si128(filter2, work_a);

  // filt >> 1
  filt = _mm_adds_epi16(filter1, t1);
  work_a = _mm_cmpgt_epi16(zero, filt);
  filt = _mm_srli_epi16(filt, 1);
  work_a = _mm_and_si128(work_a, tff80);
  filt = _mm_and_si128(filt, t7f);
  filt = _mm_or_si128(filt, work_a);

  filt = _mm_andnot_si128(hev, filt);

  q0 = _mm_adds_epi16(
      signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd), t80);
  q1 = _mm_adds_epi16(signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd),
                      t80);
  p0 = _mm_adds_epi16(
      signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd), t80);
  p1 = _mm_adds_epi16(signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd),
                      t80);

  _mm_storeu_si128((__m128i *)(s - 2 * pitch), p1);
  _mm_storeu_si128((__m128i *)(s - 1 * pitch), p0);
  _mm_storeu_si128((__m128i *)(s + 0 * pitch), q0);
  _mm_storeu_si128((__m128i *)(s + 1 * pitch), q1);
}

void vpx_highbd_lpf_horizontal_4_dual_sse2(
    uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
    const uint8_t *thresh1, int bd) {
  vpx_highbd_lpf_horizontal_4_sse2(s, pitch, blimit0, limit0, thresh0, bd);
  vpx_highbd_lpf_horizontal_4_sse2(s + 8, pitch, blimit1, limit1, thresh1, bd);
}

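// Transpose |num_8x8_to_transpose| 8x8 blocks of 16-bit pixels. src[i] and
// dst[i] point to the top-left of the i-th input and output block; in_p and
// out_p are the pitches in elements.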
static INLINE void highbd_transpose(uint16_t *src[], int in_p, uint16_t *dst[],
                                    int out_p, int num_8x8_to_transpose) {
  int idx8x8 = 0;
  __m128i p0, p1, p2, p3, p4, p5, p6, p7, x0, x1, x2, x3, x4, x5, x6, x7;
  do {
    uint16_t *in = src[idx8x8];
    uint16_t *out = dst[idx8x8];

    p0 =
        _mm_loadu_si128((__m128i *)(in + 0 * in_p));  // 00 01 02 03 04 05 06 07
    p1 =
        _mm_loadu_si128((__m128i *)(in + 1 * in_p));  // 10 11 12 13 14 15 16 17
    p2 =
        _mm_loadu_si128((__m128i *)(in + 2 * in_p));  // 20 21 22 23 24 25 26 27
    p3 =
        _mm_loadu_si128((__m128i *)(in + 3 * in_p));  // 30 31 32 33 34 35 36 37
    p4 =
        _mm_loadu_si128((__m128i *)(in + 4 * in_p));  // 40 41 42 43 44 45 46 47
    p5 =
        _mm_loadu_si128((__m128i *)(in + 5 * in_p));  // 50 51 52 53 54 55 56 57
    p6 =
        _mm_loadu_si128((__m128i *)(in + 6 * in_p));  // 60 61 62 63 64 65 66 67
    p7 =
        _mm_loadu_si128((__m128i *)(in + 7 * in_p));  // 70 71 72 73 74 75 76 77
    // 00 10 01 11 02 12 03 13
    x0 = _mm_unpacklo_epi16(p0, p1);
    // 20 30 21 31 22 32 23 33
    x1 = _mm_unpacklo_epi16(p2, p3);
    // 40 50 41 51 42 52 43 53
    x2 = _mm_unpacklo_epi16(p4, p5);
    // 60 70 61 71 62 72 63 73
    x3 = _mm_unpacklo_epi16(p6, p7);
    // 00 10 20 30 01 11 21 31
    x4 = _mm_unpacklo_epi32(x0, x1);
    // 40 50 60 70 41 51 61 71
    x5 = _mm_unpacklo_epi32(x2, x3);
    // 00 10 20 30 40 50 60 70
    x6 = _mm_unpacklo_epi64(x4, x5);
    // 01 11 21 31 41 51 61 71
    x7 = _mm_unpackhi_epi64(x4, x5);

    _mm_storeu_si128((__m128i *)(out + 0 * out_p), x6);
    // 00 10 20 30 40 50 60 70
    _mm_storeu_si128((__m128i *)(out + 1 * out_p), x7);
    // 01 11 21 31 41 51 61 71

    // 02 12 22 32 03 13 23 33
    x4 = _mm_unpackhi_epi32(x0, x1);
    // 42 52 62 72 43 53 63 73
    x5 = _mm_unpackhi_epi32(x2, x3);
    // 02 12 22 32 42 52 62 72
    x6 = _mm_unpacklo_epi64(x4, x5);
    // 03 13 23 33 43 53 63 73
    x7 = _mm_unpackhi_epi64(x4, x5);

    _mm_storeu_si128((__m128i *)(out + 2 * out_p), x6);
    // 02 12 22 32 42 52 62 72
    _mm_storeu_si128((__m128i *)(out + 3 * out_p), x7);
    // 03 13 23 33 43 53 63 73

    // 04 14 05 15 06 16 07 17
    x0 = _mm_unpackhi_epi16(p0, p1);
    // 24 34 25 35 26 36 27 37
    x1 = _mm_unpackhi_epi16(p2, p3);
    // 44 54 45 55 46 56 47 57
    x2 = _mm_unpackhi_epi16(p4, p5);
    // 64 74 65 75 66 76 67 77
    x3 = _mm_unpackhi_epi16(p6, p7);
    // 04 14 24 34 05 15 25 35
    x4 = _mm_unpacklo_epi32(x0, x1);
    // 44 54 64 74 45 55 65 75
    x5 = _mm_unpacklo_epi32(x2, x3);
    // 04 14 24 34 44 54 64 74
    x6 = _mm_unpacklo_epi64(x4, x5);
    // 05 15 25 35 45 55 65 75
    x7 = _mm_unpackhi_epi64(x4, x5);

    _mm_storeu_si128((__m128i *)(out + 4 * out_p), x6);
    // 04 14 24 34 44 54 64 74
    _mm_storeu_si128((__m128i *)(out + 5 * out_p), x7);
    // 05 15 25 35 45 55 65 75

    // 06 16 26 36 07 17 27 37
    x4 = _mm_unpackhi_epi32(x0, x1);
    // 46 56 66 76 47 57 67 77
    x5 = _mm_unpackhi_epi32(x2, x3);
    // 06 16 26 36 46 56 66 76
    x6 = _mm_unpacklo_epi64(x4, x5);
    // 07 17 27 37 47 57 67 77
    x7 = _mm_unpackhi_epi64(x4, x5);

    _mm_storeu_si128((__m128i *)(out + 6 * out_p), x6);
    // 06 16 26 36 46 56 66 76
    _mm_storeu_si128((__m128i *)(out + 7 * out_p), x7);
    // 07 17 27 37 47 57 67 77
  } while (++idx8x8 < num_8x8_to_transpose);
}

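// Transpose two 8x8 blocks (at in0 and in1) into one 8x16 block at out: the
// transpose of in0 fills columns 0-7 and the transpose of in1 fills
// columns 8-15.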
static INLINE void highbd_transpose8x16(uint16_t *in0, uint16_t *in1, int in_p,
                                        uint16_t *out, int out_p) {
  uint16_t *src0[1];
  uint16_t *src1[1];
  uint16_t *dest0[1];
  uint16_t *dest1[1];
  src0[0] = in0;
  src1[0] = in1;
  dest0[0] = out;
  dest1[0] = out + 8;
  highbd_transpose(src0, in_p, dest0, out_p, 1);
  highbd_transpose(src1, in_p, dest1, out_p, 1);
}

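// The vertical filters transpose the columns around the edge into a scratch
// buffer, run the corresponding horizontal filter on it, and transpose the
// result back into place.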
void vpx_highbd_lpf_vertical_4_sse2(uint16_t *s, int pitch,
                                    const uint8_t *blimit, const uint8_t *limit,
                                    const uint8_t *thresh, int bd) {
  DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 8]);
  uint16_t *src[1];
  uint16_t *dst[1];

  // Transpose 8x8
  src[0] = s - 4;
  dst[0] = t_dst;

  highbd_transpose(src, pitch, dst, 8, 1);

  // Loop filtering
  vpx_highbd_lpf_horizontal_4_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, bd);

  src[0] = t_dst;
  dst[0] = s - 4;

  // Transpose back
  highbd_transpose(src, 8, dst, pitch, 1);
}

void vpx_highbd_lpf_vertical_4_dual_sse2(
    uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
    const uint8_t *thresh1, int bd) {
  DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]);
  uint16_t *src[2];
  uint16_t *dst[2];

  // Transpose 8x16
  highbd_transpose8x16(s - 4, s - 4 + pitch * 8, pitch, t_dst, 16);

  // Loop filtering
  vpx_highbd_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0,
                                        thresh0, blimit1, limit1, thresh1, bd);
  src[0] = t_dst;
  src[1] = t_dst + 8;
  dst[0] = s - 4;
  dst[1] = s - 4 + pitch * 8;

  // Transpose back
  highbd_transpose(src, 16, dst, pitch, 2);
}

void vpx_highbd_lpf_vertical_8_sse2(uint16_t *s, int pitch,
                                    const uint8_t *blimit, const uint8_t *limit,
                                    const uint8_t *thresh, int bd) {
  DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 8]);
  uint16_t *src[1];
  uint16_t *dst[1];

  // Transpose 8x8
  src[0] = s - 4;
  dst[0] = t_dst;

  highbd_transpose(src, pitch, dst, 8, 1);

  // Loop filtering
  vpx_highbd_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, bd);

  src[0] = t_dst;
  dst[0] = s - 4;

  // Transpose back
  highbd_transpose(src, 8, dst, pitch, 1);
}

void vpx_highbd_lpf_vertical_8_dual_sse2(
    uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
    const uint8_t *thresh1, int bd) {
  DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]);
  uint16_t *src[2];
  uint16_t *dst[2];

  // Transpose 8x16
  highbd_transpose8x16(s - 4, s - 4 + pitch * 8, pitch, t_dst, 16);

  // Loop filtering
  vpx_highbd_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0,
                                        thresh0, blimit1, limit1, thresh1, bd);
  src[0] = t_dst;
  src[1] = t_dst + 8;

  dst[0] = s - 4;
  dst[1] = s - 4 + pitch * 8;

  // Transpose back
  highbd_transpose(src, 16, dst, pitch, 2);
}

void vpx_highbd_lpf_vertical_16_sse2(uint16_t *s, int pitch,
                                     const uint8_t *blimit,
                                     const uint8_t *limit,
                                     const uint8_t *thresh, int bd) {
  DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 16]);
  uint16_t *src[2];
  uint16_t *dst[2];

  src[0] = s - 8;
  src[1] = s;
  dst[0] = t_dst;
  dst[1] = t_dst + 8 * 8;

  // Transpose 16x8
  highbd_transpose(src, pitch, dst, 8, 2);

  // Loop filtering
  vpx_highbd_lpf_horizontal_16_sse2(t_dst + 8 * 8, 8, blimit, limit, thresh,
                                    bd);
  src[0] = t_dst;
  src[1] = t_dst + 8 * 8;
  dst[0] = s - 8;
  dst[1] = s;

  // Transpose back
  highbd_transpose(src, 8, dst, pitch, 2);
}

void vpx_highbd_lpf_vertical_16_dual_sse2(uint16_t *s, int pitch,
                                          const uint8_t *blimit,
                                          const uint8_t *limit,
                                          const uint8_t *thresh, int bd) {
  DECLARE_ALIGNED(16, uint16_t, t_dst[256]);

  // Transpose 16x16
  highbd_transpose8x16(s - 8, s - 8 + 8 * pitch, pitch, t_dst, 16);
  highbd_transpose8x16(s, s + 8 * pitch, pitch, t_dst + 8 * 16, 16);

  // Loop filtering
  vpx_highbd_lpf_horizontal_16_dual_sse2(t_dst + 8 * 16, 16, blimit, limit,
                                         thresh, bd);

  // Transpose back
  highbd_transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, pitch);
  highbd_transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * pitch,
                       pitch);
}