1 /*
2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include <emmintrin.h> // SSE2
12 #include "vp9/common/vp9_loopfilter.h"
13 #include "vpx_ports/emmintrin_compat.h"
14
15 static void mb_lpf_horizontal_edge_w_sse2_8(unsigned char *s,
16 int p,
17 const unsigned char *_blimit,
18 const unsigned char *_limit,
19 const unsigned char *_thresh) {
20 const __m128i zero = _mm_set1_epi16(0);
21 const __m128i one = _mm_set1_epi8(1);
22 const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
23 const __m128i limit = _mm_load_si128((const __m128i *)_limit);
24 const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
25 __m128i mask, hev, flat, flat2;
26 __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1;
27 __m128i abs_p1p0;
28
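  // Each qNpN register packs the pN row in its low 64 bits and the qN row in
  // its high 64 bits, so the p and q sides of the edge are filtered by the
  // same instructions.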
29 q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p));
30 q4p4 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q4p4),
31 (__m64 *)(s + 4 * p)));
32 q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
33 q3p3 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q3p3),
34 (__m64 *)(s + 3 * p)));
35 q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
36 q2p2 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q2p2),
37 (__m64 *)(s + 2 * p)));
38 q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
39 q1p1 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q1p1),
40 (__m64 *)(s + 1 * p)));
41 p1q1 = _mm_shuffle_epi32(q1p1, 78);
42 q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
43 q0p0 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q0p0),
44 (__m64 *)(s - 0 * p)));
45 p0q0 = _mm_shuffle_epi32(q0p0, 78);
46
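  // filter_mask and hev_mask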
47 {
48 __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work;
49 abs_p1p0 = _mm_or_si128(_mm_subs_epu8(q1p1, q0p0),
50 _mm_subs_epu8(q0p0, q1p1));
51 abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
52 fe = _mm_set1_epi8(0xfe);
53 ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
54 abs_p0q0 = _mm_or_si128(_mm_subs_epu8(q0p0, p0q0),
55 _mm_subs_epu8(p0q0, q0p0));
56 abs_p1q1 = _mm_or_si128(_mm_subs_epu8(q1p1, p1q1),
57 _mm_subs_epu8(p1q1, q1p1));
58 flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
59 hev = _mm_subs_epu8(flat, thresh);
60 hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
61
62     abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
63 abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
64 mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
65 mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
66 // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
67 mask = _mm_max_epu8(abs_p1p0, mask);
68 // mask |= (abs(p1 - p0) > limit) * -1;
69 // mask |= (abs(q1 - q0) > limit) * -1;
70
71 work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2p2, q1p1),
72 _mm_subs_epu8(q1p1, q2p2)),
73 _mm_or_si128(_mm_subs_epu8(q3p3, q2p2),
74 _mm_subs_epu8(q2p2, q3p3)));
75 mask = _mm_max_epu8(work, mask);
76 mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
77 mask = _mm_subs_epu8(mask, limit);
78 mask = _mm_cmpeq_epi8(mask, zero);
79 }
80
81 // lp filter
82 {
83 const __m128i t4 = _mm_set1_epi8(4);
84 const __m128i t3 = _mm_set1_epi8(3);
85 const __m128i t80 = _mm_set1_epi8(0x80);
86 const __m128i t1 = _mm_set1_epi16(0x1);
87 __m128i qs1ps1 = _mm_xor_si128(q1p1, t80);
88 __m128i qs0ps0 = _mm_xor_si128(q0p0, t80);
89 __m128i qs0 = _mm_xor_si128(p0q0, t80);
90 __m128i qs1 = _mm_xor_si128(p1q1, t80);
91 __m128i filt;
92 __m128i work_a;
93 __m128i filter1, filter2;
94 __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2;
95 __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0;
96
97 filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev);
98 work_a = _mm_subs_epi8(qs0, qs0ps0);
99 filt = _mm_adds_epi8(filt, work_a);
100 filt = _mm_adds_epi8(filt, work_a);
101 filt = _mm_adds_epi8(filt, work_a);
102 // (vp9_filter + 3 * (qs0 - ps0)) & mask
103 filt = _mm_and_si128(filt, mask);
104
105 filter1 = _mm_adds_epi8(filt, t4);
106 filter2 = _mm_adds_epi8(filt, t3);
107
108 filter1 = _mm_unpacklo_epi8(zero, filter1);
109 filter1 = _mm_srai_epi16(filter1, 0xB);
110 filter2 = _mm_unpacklo_epi8(zero, filter2);
111 filter2 = _mm_srai_epi16(filter2, 0xB);
112
113     // ps0 += Filter2 >> 3 (low half); qs0 -= Filter1 >> 3 (high half)
114 filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1));
115 qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80);
116
117     // Filter = (Filter1 + 1) >> 1, cleared where hev is set; ps1 += Filter, qs1 -= Filter
118 filt = _mm_adds_epi16(filter1, t1);
119 filt = _mm_srai_epi16(filt, 1);
120 filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8),
121 filt);
122 filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt));
123 qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80);
124 // loopfilter done
125
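    // flat (flat_mask4) and flat2 (wide flat) masks: set only where every tap
    // in the corresponding window differs from p0/q0 by at most 1.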
126 {
127 __m128i work;
128 flat = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2p2, q0p0),
129 _mm_subs_epu8(q0p0, q2p2)),
130 _mm_or_si128(_mm_subs_epu8(q3p3, q0p0),
131 _mm_subs_epu8(q0p0, q3p3)));
132 flat = _mm_max_epu8(abs_p1p0, flat);
133 flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
134 flat = _mm_subs_epu8(flat, one);
135 flat = _mm_cmpeq_epi8(flat, zero);
136 flat = _mm_and_si128(flat, mask);
137
138 q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p));
139 q5p5 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q5p5),
140 (__m64 *)(s + 5 * p)));
141
142 q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p));
143 q6p6 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q6p6),
144 (__m64 *)(s + 6 * p)));
145
146 flat2 = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q4p4, q0p0),
147 _mm_subs_epu8(q0p0, q4p4)),
148 _mm_or_si128(_mm_subs_epu8(q5p5, q0p0),
149 _mm_subs_epu8(q0p0, q5p5)));
150
151 q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p));
152 q7p7 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q7p7),
153 (__m64 *)(s + 7 * p)));
154
155 work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q6p6, q0p0),
156 _mm_subs_epu8(q0p0, q6p6)),
157 _mm_or_si128(_mm_subs_epu8(q7p7, q0p0),
158 _mm_subs_epu8(q0p0, q7p7)));
159
160 flat2 = _mm_max_epu8(work, flat2);
161 flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
162 flat2 = _mm_subs_epu8(flat2, one);
163 flat2 = _mm_cmpeq_epi8(flat2, zero);
164 flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
165 }
166
167 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
168 // flat and wide flat calculations
169 {
170 const __m128i eight = _mm_set1_epi16(8);
171 const __m128i four = _mm_set1_epi16(4);
172 __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16;
173 __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
174 __m128i pixelFilter_p, pixelFilter_q;
175 __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
176 __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q;
177
178       p7_16 = _mm_unpacklo_epi8(q7p7, zero);
179 p6_16 = _mm_unpacklo_epi8(q6p6, zero);
180 p5_16 = _mm_unpacklo_epi8(q5p5, zero);
181 p4_16 = _mm_unpacklo_epi8(q4p4, zero);
182 p3_16 = _mm_unpacklo_epi8(q3p3, zero);
183 p2_16 = _mm_unpacklo_epi8(q2p2, zero);
184 p1_16 = _mm_unpacklo_epi8(q1p1, zero);
185 p0_16 = _mm_unpacklo_epi8(q0p0, zero);
186 q0_16 = _mm_unpackhi_epi8(q0p0, zero);
187 q1_16 = _mm_unpackhi_epi8(q1p1, zero);
188 q2_16 = _mm_unpackhi_epi8(q2p2, zero);
189 q3_16 = _mm_unpackhi_epi8(q3p3, zero);
190 q4_16 = _mm_unpackhi_epi8(q4p4, zero);
191 q5_16 = _mm_unpackhi_epi8(q5p5, zero);
192 q6_16 = _mm_unpackhi_epi8(q6p6, zero);
193 q7_16 = _mm_unpackhi_epi8(q7p7, zero);
194
195 pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16),
196 _mm_add_epi16(p4_16, p3_16));
197 pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16),
198 _mm_add_epi16(q4_16, q3_16));
199
200 pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16));
201 pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
202
203 pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16));
204 pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
205 pixelFilter_p = _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p,
206 pixelFilter_q));
207 pixetFilter_p2p1p0 = _mm_add_epi16(four,
208 _mm_add_epi16(pixetFilter_p2p1p0,
209 pixetFilter_q2q1q0));
210 res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
211 _mm_add_epi16(p7_16, p0_16)), 4);
212 res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
213 _mm_add_epi16(q7_16, q0_16)), 4);
214 flat2_q0p0 = _mm_packus_epi16(res_p, res_q);
215 res_p = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
216 _mm_add_epi16(p3_16, p0_16)), 3);
217 res_q = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
218 _mm_add_epi16(q3_16, q0_16)), 3);
219
220 flat_q0p0 = _mm_packus_epi16(res_p, res_q);
221
222 sum_p7 = _mm_add_epi16(p7_16, p7_16);
223 sum_q7 = _mm_add_epi16(q7_16, q7_16);
224 sum_p3 = _mm_add_epi16(p3_16, p3_16);
225 sum_q3 = _mm_add_epi16(q3_16, q3_16);
226
227 pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16);
228 pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16);
229 res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
230 _mm_add_epi16(sum_p7, p1_16)), 4);
231 res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
232 _mm_add_epi16(sum_q7, q1_16)), 4);
233 flat2_q1p1 = _mm_packus_epi16(res_p, res_q);
234
235 pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16);
236 pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16);
237 res_p = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
238 _mm_add_epi16(sum_p3, p1_16)), 3);
239 res_q = _mm_srli_epi16(_mm_add_epi16(pixetFilter_q2q1q0,
240 _mm_add_epi16(sum_q3, q1_16)), 3);
241 flat_q1p1 = _mm_packus_epi16(res_p, res_q);
242
243 sum_p7 = _mm_add_epi16(sum_p7, p7_16);
244 sum_q7 = _mm_add_epi16(sum_q7, q7_16);
245 sum_p3 = _mm_add_epi16(sum_p3, p3_16);
246 sum_q3 = _mm_add_epi16(sum_q3, q3_16);
247
248 pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16);
249 pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16);
250 res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
251 _mm_add_epi16(sum_p7, p2_16)), 4);
252 res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
253 _mm_add_epi16(sum_q7, q2_16)), 4);
254 flat2_q2p2 = _mm_packus_epi16(res_p, res_q);
255
256 pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16);
257 pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16);
258
259 res_p = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
260 _mm_add_epi16(sum_p3, p2_16)), 3);
261 res_q = _mm_srli_epi16(_mm_add_epi16(pixetFilter_q2q1q0,
262 _mm_add_epi16(sum_q3, q2_16)), 3);
263 flat_q2p2 = _mm_packus_epi16(res_p, res_q);
264
265 sum_p7 = _mm_add_epi16(sum_p7, p7_16);
266 sum_q7 = _mm_add_epi16(sum_q7, q7_16);
267 pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
268 pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
269 res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
270 _mm_add_epi16(sum_p7, p3_16)), 4);
271 res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
272 _mm_add_epi16(sum_q7, q3_16)), 4);
273 flat2_q3p3 = _mm_packus_epi16(res_p, res_q);
274
275 sum_p7 = _mm_add_epi16(sum_p7, p7_16);
276 sum_q7 = _mm_add_epi16(sum_q7, q7_16);
277 pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
278 pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
279 res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
280 _mm_add_epi16(sum_p7, p4_16)), 4);
281 res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
282 _mm_add_epi16(sum_q7, q4_16)), 4);
283 flat2_q4p4 = _mm_packus_epi16(res_p, res_q);
284
285 sum_p7 = _mm_add_epi16(sum_p7, p7_16);
286 sum_q7 = _mm_add_epi16(sum_q7, q7_16);
287 pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
288 pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
289 res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
290 _mm_add_epi16(sum_p7, p5_16)), 4);
291 res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
292 _mm_add_epi16(sum_q7, q5_16)), 4);
293 flat2_q5p5 = _mm_packus_epi16(res_p, res_q);
294
295 sum_p7 = _mm_add_epi16(sum_p7, p7_16);
296 sum_q7 = _mm_add_epi16(sum_q7, q7_16);
297 pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
298 pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
299 res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
300 _mm_add_epi16(sum_p7, p6_16)), 4);
301 res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
302 _mm_add_epi16(sum_q7, q6_16)), 4);
303 flat2_q6p6 = _mm_packus_epi16(res_p, res_q);
304 }
305 // wide flat
306 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
307
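    // Select the final output per pixel: the filter4 result where only mask is
    // set, the 8-sample flat result where flat is set, and the 16-sample wide
    // result where flat2 is set (flat2 already includes flat & mask).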
308 flat = _mm_shuffle_epi32(flat, 68);
309 flat2 = _mm_shuffle_epi32(flat2, 68);
310
311 q2p2 = _mm_andnot_si128(flat, q2p2);
312 flat_q2p2 = _mm_and_si128(flat, flat_q2p2);
313 q2p2 = _mm_or_si128(q2p2, flat_q2p2);
314
315 qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
316 flat_q1p1 = _mm_and_si128(flat, flat_q1p1);
317 q1p1 = _mm_or_si128(qs1ps1, flat_q1p1);
318
319 qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
320 flat_q0p0 = _mm_and_si128(flat, flat_q0p0);
321 q0p0 = _mm_or_si128(qs0ps0, flat_q0p0);
322
323 q6p6 = _mm_andnot_si128(flat2, q6p6);
324 flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6);
325 q6p6 = _mm_or_si128(q6p6, flat2_q6p6);
326 _mm_storel_epi64((__m128i *)(s - 7 * p), q6p6);
327 _mm_storeh_pi((__m64 *)(s + 6 * p), _mm_castsi128_ps(q6p6));
328
329 q5p5 = _mm_andnot_si128(flat2, q5p5);
330 flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
331 q5p5 = _mm_or_si128(q5p5, flat2_q5p5);
332 _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5);
333 _mm_storeh_pi((__m64 *)(s + 5 * p), _mm_castsi128_ps(q5p5));
334
335 q4p4 = _mm_andnot_si128(flat2, q4p4);
336 flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
337 q4p4 = _mm_or_si128(q4p4, flat2_q4p4);
338 _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4);
339 _mm_storeh_pi((__m64 *)(s + 4 * p), _mm_castsi128_ps(q4p4));
340
341 q3p3 = _mm_andnot_si128(flat2, q3p3);
342 flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
343 q3p3 = _mm_or_si128(q3p3, flat2_q3p3);
344 _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3);
345 _mm_storeh_pi((__m64 *)(s + 3 * p), _mm_castsi128_ps(q3p3));
346
347 q2p2 = _mm_andnot_si128(flat2, q2p2);
348 flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
349 q2p2 = _mm_or_si128(q2p2, flat2_q2p2);
350 _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2);
351 _mm_storeh_pi((__m64 *)(s + 2 * p), _mm_castsi128_ps(q2p2));
352
353 q1p1 = _mm_andnot_si128(flat2, q1p1);
354 flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
355 q1p1 = _mm_or_si128(q1p1, flat2_q1p1);
356 _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1);
357 _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(q1p1));
358
359 q0p0 = _mm_andnot_si128(flat2, q0p0);
360 flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
361 q0p0 = _mm_or_si128(q0p0, flat2_q0p0);
362 _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0);
363 _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0));
364 }
365 }
366
367 static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s,
368 int p,
369 const unsigned char *_blimit,
370 const unsigned char *_limit,
371 const unsigned char *_thresh) {
372 DECLARE_ALIGNED_ARRAY(16, unsigned char, flat2_op, 7 * 16);
373 DECLARE_ALIGNED_ARRAY(16, unsigned char, flat2_oq, 7 * 16);
374
375 DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op, 3 * 16);
376 DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq, 3 * 16);
377
378 DECLARE_ALIGNED_ARRAY(16, unsigned char, ap, 8 * 16);
379 DECLARE_ALIGNED_ARRAY(16, unsigned char, aq, 8 * 16);
380
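  // ap[]/aq[] are scratch copies of the eight p rows and eight q rows
  // (16 pixels each); the wide flat pass below re-reads them 8 pixels at a
  // time as 16-bit lanes.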
381 const __m128i zero = _mm_set1_epi16(0);
382 const __m128i one = _mm_set1_epi8(1);
383 const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
384 const __m128i limit = _mm_load_si128((const __m128i *)_limit);
385 const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
386 __m128i mask, hev, flat, flat2;
387 __m128i p7, p6, p5;
388 __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
389 __m128i q5, q6, q7;
390 int i = 0;
391
392 p4 = _mm_loadu_si128((__m128i *)(s - 5 * p));
393 p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
394 p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
395 p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
396 p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
397 q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
398 q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
399 q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
400 q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
401 q4 = _mm_loadu_si128((__m128i *)(s + 4 * p));
402
403 _mm_store_si128((__m128i *)&ap[4 * 16], p4);
404 _mm_store_si128((__m128i *)&ap[3 * 16], p3);
405 _mm_store_si128((__m128i *)&ap[2 * 16], p2);
406 _mm_store_si128((__m128i *)&ap[1 * 16], p1);
407 _mm_store_si128((__m128i *)&ap[0 * 16], p0);
408 _mm_store_si128((__m128i *)&aq[4 * 16], q4);
409 _mm_store_si128((__m128i *)&aq[3 * 16], q3);
410 _mm_store_si128((__m128i *)&aq[2 * 16], q2);
411 _mm_store_si128((__m128i *)&aq[1 * 16], q1);
412 _mm_store_si128((__m128i *)&aq[0 * 16], q0);
413
414
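  // filter_mask and hev_mask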
415 {
416 const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
417 _mm_subs_epu8(p0, p1));
418 const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0),
419 _mm_subs_epu8(q0, q1));
420 const __m128i fe = _mm_set1_epi8(0xfe);
421 const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
422 __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0),
423 _mm_subs_epu8(q0, p0));
424 __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1),
425 _mm_subs_epu8(q1, p1));
426 __m128i work;
427 flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
428 hev = _mm_subs_epu8(flat, thresh);
429 hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
430
431     abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
432 abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
433 mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
434 mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
435 // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
436 mask = _mm_max_epu8(flat, mask);
437 // mask |= (abs(p1 - p0) > limit) * -1;
438 // mask |= (abs(q1 - q0) > limit) * -1;
439 work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1),
440 _mm_subs_epu8(p1, p2)),
441 _mm_or_si128(_mm_subs_epu8(p3, p2),
442 _mm_subs_epu8(p2, p3)));
443 mask = _mm_max_epu8(work, mask);
444 work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1),
445 _mm_subs_epu8(q1, q2)),
446 _mm_or_si128(_mm_subs_epu8(q3, q2),
447 _mm_subs_epu8(q2, q3)));
448 mask = _mm_max_epu8(work, mask);
449 mask = _mm_subs_epu8(mask, limit);
450 mask = _mm_cmpeq_epi8(mask, zero);
451 }
452
453 // lp filter
454 {
455 const __m128i t4 = _mm_set1_epi8(4);
456 const __m128i t3 = _mm_set1_epi8(3);
457 const __m128i t80 = _mm_set1_epi8(0x80);
458 const __m128i te0 = _mm_set1_epi8(0xe0);
459 const __m128i t1f = _mm_set1_epi8(0x1f);
460 const __m128i t1 = _mm_set1_epi8(0x1);
461 const __m128i t7f = _mm_set1_epi8(0x7f);
462
463 __m128i ps1 = _mm_xor_si128(p1, t80);
464 __m128i ps0 = _mm_xor_si128(p0, t80);
465 __m128i qs0 = _mm_xor_si128(q0, t80);
466 __m128i qs1 = _mm_xor_si128(q1, t80);
467 __m128i filt;
468 __m128i work_a;
469 __m128i filter1, filter2;
470
471 filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
472 work_a = _mm_subs_epi8(qs0, ps0);
473 filt = _mm_adds_epi8(filt, work_a);
474 filt = _mm_adds_epi8(filt, work_a);
475 filt = _mm_adds_epi8(filt, work_a);
476 // (vp9_filter + 3 * (qs0 - ps0)) & mask
477 filt = _mm_and_si128(filt, mask);
478
479 filter1 = _mm_adds_epi8(filt, t4);
480 filter2 = _mm_adds_epi8(filt, t3);
481
482 // Filter1 >> 3
483 work_a = _mm_cmpgt_epi8(zero, filter1);
484 filter1 = _mm_srli_epi16(filter1, 3);
485 work_a = _mm_and_si128(work_a, te0);
486 filter1 = _mm_and_si128(filter1, t1f);
487 filter1 = _mm_or_si128(filter1, work_a);
488 qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
489
490 // Filter2 >> 3
491 work_a = _mm_cmpgt_epi8(zero, filter2);
492 filter2 = _mm_srli_epi16(filter2, 3);
493 work_a = _mm_and_si128(work_a, te0);
494 filter2 = _mm_and_si128(filter2, t1f);
495 filter2 = _mm_or_si128(filter2, work_a);
496 ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
497
498 // filt >> 1
499 filt = _mm_adds_epi8(filter1, t1);
500 work_a = _mm_cmpgt_epi8(zero, filt);
501 filt = _mm_srli_epi16(filt, 1);
502 work_a = _mm_and_si128(work_a, t80);
503 filt = _mm_and_si128(filt, t7f);
504 filt = _mm_or_si128(filt, work_a);
505 filt = _mm_andnot_si128(hev, filt);
506 ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
507 qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
508 // loopfilter done
509
510 {
511 __m128i work;
512 work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0),
513 _mm_subs_epu8(p0, p2)),
514 _mm_or_si128(_mm_subs_epu8(q2, q0),
515 _mm_subs_epu8(q0, q2)));
516 flat = _mm_max_epu8(work, flat);
517 work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0),
518 _mm_subs_epu8(p0, p3)),
519 _mm_or_si128(_mm_subs_epu8(q3, q0),
520 _mm_subs_epu8(q0, q3)));
521 flat = _mm_max_epu8(work, flat);
522 work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p4, p0),
523 _mm_subs_epu8(p0, p4)),
524 _mm_or_si128(_mm_subs_epu8(q4, q0),
525 _mm_subs_epu8(q0, q4)));
526 flat = _mm_subs_epu8(flat, one);
527 flat = _mm_cmpeq_epi8(flat, zero);
528 flat = _mm_and_si128(flat, mask);
529
530 p5 = _mm_loadu_si128((__m128i *)(s - 6 * p));
531 q5 = _mm_loadu_si128((__m128i *)(s + 5 * p));
532 flat2 = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p5, p0),
533 _mm_subs_epu8(p0, p5)),
534 _mm_or_si128(_mm_subs_epu8(q5, q0),
535 _mm_subs_epu8(q0, q5)));
536 _mm_store_si128((__m128i *)&ap[5 * 16], p5);
537 _mm_store_si128((__m128i *)&aq[5 * 16], q5);
538 flat2 = _mm_max_epu8(work, flat2);
539 p6 = _mm_loadu_si128((__m128i *)(s - 7 * p));
540 q6 = _mm_loadu_si128((__m128i *)(s + 6 * p));
541 work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p6, p0),
542 _mm_subs_epu8(p0, p6)),
543 _mm_or_si128(_mm_subs_epu8(q6, q0),
544 _mm_subs_epu8(q0, q6)));
545 _mm_store_si128((__m128i *)&ap[6 * 16], p6);
546 _mm_store_si128((__m128i *)&aq[6 * 16], q6);
547 flat2 = _mm_max_epu8(work, flat2);
548
549 p7 = _mm_loadu_si128((__m128i *)(s - 8 * p));
550 q7 = _mm_loadu_si128((__m128i *)(s + 7 * p));
551 work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p7, p0),
552 _mm_subs_epu8(p0, p7)),
553 _mm_or_si128(_mm_subs_epu8(q7, q0),
554 _mm_subs_epu8(q0, q7)));
555 _mm_store_si128((__m128i *)&ap[7 * 16], p7);
556 _mm_store_si128((__m128i *)&aq[7 * 16], q7);
557 flat2 = _mm_max_epu8(work, flat2);
558 flat2 = _mm_subs_epu8(flat2, one);
559 flat2 = _mm_cmpeq_epi8(flat2, zero);
560 flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
561 }
562
563 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
564 // flat and wide flat calculations
565 {
566 const __m128i eight = _mm_set1_epi16(8);
567 const __m128i four = _mm_set1_epi16(4);
568 __m128i temp_flat2 = flat2;
569 unsigned char *src = s;
570 int i = 0;
571 do {
572 __m128i workp_shft;
573 __m128i a, b, c;
574
575 unsigned int off = i * 8;
576 p7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[7 * 16] + off)),
577 zero);
578 p6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[6 * 16] + off)),
579 zero);
580 p5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[5 * 16] + off)),
581 zero);
582 p4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[4 * 16] + off)),
583 zero);
584 p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[3 * 16] + off)),
585 zero);
586 p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[2 * 16] + off)),
587 zero);
588 p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[1 * 16] + off)),
589 zero);
590 p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[0 * 16] + off)),
591 zero);
592 q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[0 * 16] + off)),
593 zero);
594 q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[1 * 16] + off)),
595 zero);
596 q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[2 * 16] + off)),
597 zero);
598 q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[3 * 16] + off)),
599 zero);
600 q4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[4 * 16] + off)),
601 zero);
602 q5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[5 * 16] + off)),
603 zero);
604 q6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[6 * 16] + off)),
605 zero);
606 q7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[7 * 16] + off)),
607 zero);
608
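        // a, b and c are running sums: a holds the taps shared by the narrow
        // and wide filters, b the extra 8-sample-filter terms (plus rounding),
        // and c the extra 16-sample-filter terms (plus rounding). Each output
        // row is produced by sliding the window one tap at a time.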
609 c = _mm_sub_epi16(_mm_slli_epi16(p7, 3), p7); // p7 * 7
610 c = _mm_add_epi16(_mm_slli_epi16(p6, 1), _mm_add_epi16(p4, c));
611
612 b = _mm_add_epi16(_mm_add_epi16(p3, four), _mm_add_epi16(p3, p2));
613 a = _mm_add_epi16(p3, _mm_add_epi16(p2, p1));
614 a = _mm_add_epi16(_mm_add_epi16(p0, q0), a);
615
616 _mm_storel_epi64((__m128i *)&flat_op[2 * 16 + i * 8],
617 _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
618 , b));
619
620 c = _mm_add_epi16(_mm_add_epi16(p5, eight), c);
621 workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
622 _mm_storel_epi64((__m128i *)&flat2_op[6 * 16 + i * 8],
623 _mm_packus_epi16(workp_shft, workp_shft));
624
625 a = _mm_add_epi16(q1, a);
626 b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p2)), p1);
627 _mm_storel_epi64((__m128i *)&flat_op[1 * 16 + i * 8],
628 _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
629 , b));
630
631 c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p6)), p5);
632 workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
633 _mm_storel_epi64((__m128i *)&flat2_op[5 * 16 + i * 8],
634 _mm_packus_epi16(workp_shft, workp_shft));
635
636 a = _mm_add_epi16(q2, a);
637 b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p1)), p0);
638 _mm_storel_epi64((__m128i *)&flat_op[i * 8],
639 _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
640 , b));
641
642 c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p5)), p4);
643 workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
644 _mm_storel_epi64((__m128i *)&flat2_op[4 * 16 + i * 8],
645 _mm_packus_epi16(workp_shft, workp_shft));
646
647 a = _mm_add_epi16(q3, a);
648 b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p0)), q0);
649 _mm_storel_epi64((__m128i *)&flat_oq[i * 8],
650 _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
651 , b));
652
653 c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p4)), p3);
654 workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
655 _mm_storel_epi64((__m128i *)&flat2_op[3 * 16 + i * 8],
656 _mm_packus_epi16(workp_shft, workp_shft));
657
658 b = _mm_add_epi16(q3, b);
659 b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p2, q0)), q1);
660 _mm_storel_epi64((__m128i *)&flat_oq[16 + i * 8],
661 _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
662 , b));
663
664 c = _mm_add_epi16(q4, c);
665 c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p3)), p2);
666 workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
667 _mm_storel_epi64((__m128i *)&flat2_op[2 * 16 + i * 8],
668 _mm_packus_epi16(workp_shft, workp_shft));
669
670 b = _mm_add_epi16(q3, b);
671 b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p1, q1)), q2);
672 _mm_storel_epi64((__m128i *)&flat_oq[2 * 16 + i * 8],
673 _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
674 , b));
675 a = _mm_add_epi16(q5, a);
676 c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p2)), p1);
677 workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
678 _mm_storel_epi64((__m128i *)&flat2_op[16 + i * 8],
679 _mm_packus_epi16(workp_shft, workp_shft));
680
681 a = _mm_add_epi16(q6, a);
682 c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p1)), p0);
683 workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
684 _mm_storel_epi64((__m128i *)&flat2_op[i * 8],
685 _mm_packus_epi16(workp_shft, workp_shft));
686
687 a = _mm_add_epi16(q7, a);
688 c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p0)), q0);
689 workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
690 _mm_storel_epi64((__m128i *)&flat2_oq[i * 8],
691 _mm_packus_epi16(workp_shft, workp_shft));
692
693 a = _mm_add_epi16(q7, a);
694 c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p6, q0)), q1);
695 workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
696 _mm_storel_epi64((__m128i *)&flat2_oq[16 + i * 8],
697 _mm_packus_epi16(workp_shft, workp_shft));
698
699 a = _mm_add_epi16(q7, a);
700 c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p5, q1)), q2);
701 workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
702 _mm_storel_epi64((__m128i *)&flat2_oq[2 * 16 + i * 8],
703 _mm_packus_epi16(workp_shft, workp_shft));
704
705 a = _mm_add_epi16(q7, a);
706 c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p4, q2)), q3);
707 workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
708 _mm_storel_epi64((__m128i *)&flat2_oq[3 * 16 + i * 8],
709 _mm_packus_epi16(workp_shft, workp_shft));
710
711 a = _mm_add_epi16(q7, a);
712 c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p3, q3)), q4);
713 workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
714 _mm_storel_epi64((__m128i *)&flat2_oq[4 * 16 + i * 8],
715 _mm_packus_epi16(workp_shft, workp_shft));
716
717 a = _mm_add_epi16(q7, a);
718 c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p2, q4)), q5);
719 workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
720 _mm_storel_epi64((__m128i *)&flat2_oq[5 * 16 + i * 8],
721 _mm_packus_epi16(workp_shft, workp_shft));
722
723 a = _mm_add_epi16(q7, a);
724 c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p1, q5)), q6);
725 workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
726 _mm_storel_epi64((__m128i *)&flat2_oq[6 * 16 + i * 8],
727 _mm_packus_epi16(workp_shft, workp_shft));
728
729 temp_flat2 = _mm_srli_si128(temp_flat2, 8);
730 src += 8;
731 } while (++i < 2);
732 }
733 // wide flat
734 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
735
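  // First blend the filter4 results (and the unfiltered p2/q2 rows) with the
  // 8-sample flat outputs under the flat mask, keeping the results in
  // flat_op/flat_oq, then blend those against the wide flat2 outputs below
  // before storing back to the frame.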
736 work_a = _mm_load_si128((__m128i *)&ap[2 * 16]);
737 p2 = _mm_load_si128((__m128i *)&flat_op[2 * 16]);
738 work_a = _mm_andnot_si128(flat, work_a);
739 p2 = _mm_and_si128(flat, p2);
740 p2 = _mm_or_si128(work_a, p2);
741 _mm_store_si128((__m128i *)&flat_op[2 * 16], p2);
742
743 p1 = _mm_load_si128((__m128i *)&flat_op[1 * 16]);
744 work_a = _mm_andnot_si128(flat, ps1);
745 p1 = _mm_and_si128(flat, p1);
746 p1 = _mm_or_si128(work_a, p1);
747 _mm_store_si128((__m128i *)&flat_op[1 * 16], p1);
748
749 p0 = _mm_load_si128((__m128i *)&flat_op[0]);
750 work_a = _mm_andnot_si128(flat, ps0);
751 p0 = _mm_and_si128(flat, p0);
752 p0 = _mm_or_si128(work_a, p0);
753 _mm_store_si128((__m128i *)&flat_op[0], p0);
754
755 q0 = _mm_load_si128((__m128i *)&flat_oq[0]);
756 work_a = _mm_andnot_si128(flat, qs0);
757 q0 = _mm_and_si128(flat, q0);
758 q0 = _mm_or_si128(work_a, q0);
759 _mm_store_si128((__m128i *)&flat_oq[0], q0);
760
761 q1 = _mm_load_si128((__m128i *)&flat_oq[1 * 16]);
762 work_a = _mm_andnot_si128(flat, qs1);
763 q1 = _mm_and_si128(flat, q1);
764 q1 = _mm_or_si128(work_a, q1);
765 _mm_store_si128((__m128i *)&flat_oq[1 * 16], q1);
766
767 work_a = _mm_load_si128((__m128i *)&aq[2 * 16]);
768 q2 = _mm_load_si128((__m128i *)&flat_oq[2 * 16]);
769 work_a = _mm_andnot_si128(flat, work_a);
770 q2 = _mm_and_si128(flat, q2);
771 q2 = _mm_or_si128(work_a, q2);
772 _mm_store_si128((__m128i *)&flat_oq[2 * 16], q2);
773
774 // write out op6 - op3
775 {
776 unsigned char *dst = (s - 7 * p);
777 for (i = 6; i > 2; i--) {
778 __m128i flat2_output;
779 work_a = _mm_load_si128((__m128i *)&ap[i * 16]);
780 flat2_output = _mm_load_si128((__m128i *)&flat2_op[i * 16]);
781 work_a = _mm_andnot_si128(flat2, work_a);
782 flat2_output = _mm_and_si128(flat2, flat2_output);
783 work_a = _mm_or_si128(work_a, flat2_output);
784 _mm_storeu_si128((__m128i *)dst, work_a);
785 dst += p;
786 }
787 }
788
789 work_a = _mm_load_si128((__m128i *)&flat_op[2 * 16]);
790 p2 = _mm_load_si128((__m128i *)&flat2_op[2 * 16]);
791 work_a = _mm_andnot_si128(flat2, work_a);
792 p2 = _mm_and_si128(flat2, p2);
793 p2 = _mm_or_si128(work_a, p2);
794 _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
795
796 work_a = _mm_load_si128((__m128i *)&flat_op[1 * 16]);
797 p1 = _mm_load_si128((__m128i *)&flat2_op[1 * 16]);
798 work_a = _mm_andnot_si128(flat2, work_a);
799 p1 = _mm_and_si128(flat2, p1);
800 p1 = _mm_or_si128(work_a, p1);
801 _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
802
803 work_a = _mm_load_si128((__m128i *)&flat_op[0]);
804 p0 = _mm_load_si128((__m128i *)&flat2_op[0]);
805 work_a = _mm_andnot_si128(flat2, work_a);
806 p0 = _mm_and_si128(flat2, p0);
807 p0 = _mm_or_si128(work_a, p0);
808 _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
809
810 work_a = _mm_load_si128((__m128i *)&flat_oq[0]);
811 q0 = _mm_load_si128((__m128i *)&flat2_oq[0]);
812 work_a = _mm_andnot_si128(flat2, work_a);
813 q0 = _mm_and_si128(flat2, q0);
814 q0 = _mm_or_si128(work_a, q0);
815 _mm_storeu_si128((__m128i *)(s - 0 * p), q0);
816
817 work_a = _mm_load_si128((__m128i *)&flat_oq[1 * 16]);
818 q1 = _mm_load_si128((__m128i *)&flat2_oq[16]);
819 work_a = _mm_andnot_si128(flat2, work_a);
820 q1 = _mm_and_si128(flat2, q1);
821 q1 = _mm_or_si128(work_a, q1);
822 _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
823
824 work_a = _mm_load_si128((__m128i *)&flat_oq[2 * 16]);
825 q2 = _mm_load_si128((__m128i *)&flat2_oq[2 * 16]);
826 work_a = _mm_andnot_si128(flat2, work_a);
827 q2 = _mm_and_si128(flat2, q2);
828 q2 = _mm_or_si128(work_a, q2);
829 _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
830
831     // write out oq3 - oq6
832 {
833 unsigned char *dst = (s + 3 * p);
834 for (i = 3; i < 7; i++) {
835 __m128i flat2_output;
836 work_a = _mm_load_si128((__m128i *)&aq[i * 16]);
837 flat2_output = _mm_load_si128((__m128i *)&flat2_oq[i * 16]);
838 work_a = _mm_andnot_si128(flat2, work_a);
839 flat2_output = _mm_and_si128(flat2, flat2_output);
840 work_a = _mm_or_si128(work_a, flat2_output);
841 _mm_storeu_si128((__m128i *)dst, work_a);
842 dst += p;
843 }
844 }
845 }
846 }
847
848 // TODO(yunqingwang): remove count and call these 2 functions(8 or 16) directly.
849 void vp9_lpf_horizontal_16_sse2(unsigned char *s, int p,
850 const unsigned char *_blimit,
851 const unsigned char *_limit,
852 const unsigned char *_thresh, int count) {
853 if (count == 1)
854 mb_lpf_horizontal_edge_w_sse2_8(s, p, _blimit, _limit, _thresh);
855 else
856 mb_lpf_horizontal_edge_w_sse2_16(s, p, _blimit, _limit, _thresh);
857 }
858
859 void vp9_lpf_horizontal_8_sse2(unsigned char *s, int p,
860 const unsigned char *_blimit,
861 const unsigned char *_limit,
862 const unsigned char *_thresh, int count) {
863 DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op2, 16);
864 DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op1, 16);
865 DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op0, 16);
866 DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq2, 16);
867 DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq1, 16);
868 DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq0, 16);
869 const __m128i zero = _mm_set1_epi16(0);
870 const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
871 const __m128i limit = _mm_load_si128((const __m128i *)_limit);
872 const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
873 __m128i mask, hev, flat;
874 __m128i p3, p2, p1, p0, q0, q1, q2, q3;
875 __m128i q3p3, q2p2, q1p1, q0p0, p1q1, p0q0;
876
877 (void)count;
878
879 q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * p)),
880 _mm_loadl_epi64((__m128i *)(s + 3 * p)));
881 q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)),
882 _mm_loadl_epi64((__m128i *)(s + 2 * p)));
883 q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
884 _mm_loadl_epi64((__m128i *)(s + 1 * p)));
885 q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
886 _mm_loadl_epi64((__m128i *)(s - 0 * p)));
887 p1q1 = _mm_shuffle_epi32(q1p1, 78);
888 p0q0 = _mm_shuffle_epi32(q0p0, 78);
889
890 {
891 // filter_mask and hev_mask
892 const __m128i one = _mm_set1_epi8(1);
893 const __m128i fe = _mm_set1_epi8(0xfe);
894 const __m128i ff = _mm_cmpeq_epi8(fe, fe);
895 __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
896 abs_p1p0 = _mm_or_si128(_mm_subs_epu8(q1p1, q0p0),
897 _mm_subs_epu8(q0p0, q1p1));
898 abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
899
900 abs_p0q0 = _mm_or_si128(_mm_subs_epu8(q0p0, p0q0),
901 _mm_subs_epu8(p0q0, q0p0));
902 abs_p1q1 = _mm_or_si128(_mm_subs_epu8(q1p1, p1q1),
903 _mm_subs_epu8(p1q1, q1p1));
904 flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
905 hev = _mm_subs_epu8(flat, thresh);
906 hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
907
908     abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
909 abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
910 mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
911 mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
912 // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
913 mask = _mm_max_epu8(abs_p1p0, mask);
914 // mask |= (abs(p1 - p0) > limit) * -1;
915 // mask |= (abs(q1 - q0) > limit) * -1;
916
917 work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2p2, q1p1),
918 _mm_subs_epu8(q1p1, q2p2)),
919 _mm_or_si128(_mm_subs_epu8(q3p3, q2p2),
920 _mm_subs_epu8(q2p2, q3p3)));
921 mask = _mm_max_epu8(work, mask);
922 mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
923 mask = _mm_subs_epu8(mask, limit);
924 mask = _mm_cmpeq_epi8(mask, zero);
925
926 // flat_mask4
927
928 flat = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2p2, q0p0),
929 _mm_subs_epu8(q0p0, q2p2)),
930 _mm_or_si128(_mm_subs_epu8(q3p3, q0p0),
931 _mm_subs_epu8(q0p0, q3p3)));
932 flat = _mm_max_epu8(abs_p1p0, flat);
933 flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
934 flat = _mm_subs_epu8(flat, one);
935 flat = _mm_cmpeq_epi8(flat, zero);
936 flat = _mm_and_si128(flat, mask);
937 }
938
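  // Compute the 8-sample flat-filter outputs for p2..q2 into the
  // flat_op*/flat_oq* scratch rows; they are selected under the flat mask in
  // the lp filter section below.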
939 {
940 const __m128i four = _mm_set1_epi16(4);
941 unsigned char *src = s;
942 {
943 __m128i workp_a, workp_b, workp_shft;
944 p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);
945 p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);
946 p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);
947 p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero);
948 q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero);
949 q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);
950 q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);
951 q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);
952
953 workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
954 workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
955 workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
956 workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
957 _mm_storel_epi64((__m128i *)&flat_op2[0],
958 _mm_packus_epi16(workp_shft, workp_shft));
959
960 workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
961 workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
962 _mm_storel_epi64((__m128i *)&flat_op1[0],
963 _mm_packus_epi16(workp_shft, workp_shft));
964
965 workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
966 workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
967 workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
968 _mm_storel_epi64((__m128i *)&flat_op0[0],
969 _mm_packus_epi16(workp_shft, workp_shft));
970
971 workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
972 workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
973 workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
974 _mm_storel_epi64((__m128i *)&flat_oq0[0],
975 _mm_packus_epi16(workp_shft, workp_shft));
976
977 workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
978 workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
979 workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
980 _mm_storel_epi64((__m128i *)&flat_oq1[0],
981 _mm_packus_epi16(workp_shft, workp_shft));
982
983 workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
984 workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
985 workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
986 _mm_storel_epi64((__m128i *)&flat_oq2[0],
987 _mm_packus_epi16(workp_shft, workp_shft));
988 }
989 }
990 // lp filter
991 {
992 const __m128i t4 = _mm_set1_epi8(4);
993 const __m128i t3 = _mm_set1_epi8(3);
994 const __m128i t80 = _mm_set1_epi8(0x80);
995 const __m128i t1 = _mm_set1_epi8(0x1);
996 const __m128i ps1 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
997 t80);
998 const __m128i ps0 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
999 t80);
1000 const __m128i qs0 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 0 * p)),
1001 t80);
1002 const __m128i qs1 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 1 * p)),
1003 t80);
1004 __m128i filt;
1005 __m128i work_a;
1006 __m128i filter1, filter2;
1007
1008 filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
1009 work_a = _mm_subs_epi8(qs0, ps0);
1010 filt = _mm_adds_epi8(filt, work_a);
1011 filt = _mm_adds_epi8(filt, work_a);
1012 filt = _mm_adds_epi8(filt, work_a);
1013 // (vp9_filter + 3 * (qs0 - ps0)) & mask
1014 filt = _mm_and_si128(filt, mask);
1015
1016 filter1 = _mm_adds_epi8(filt, t4);
1017 filter2 = _mm_adds_epi8(filt, t3);
1018
1019 // Filter1 >> 3
1020 filter1 = _mm_unpacklo_epi8(zero, filter1);
1021 filter1 = _mm_srai_epi16(filter1, 11);
1022 filter1 = _mm_packs_epi16(filter1, filter1);
1023
1024 // Filter2 >> 3
1025 filter2 = _mm_unpacklo_epi8(zero, filter2);
1026 filter2 = _mm_srai_epi16(filter2, 11);
1027 filter2 = _mm_packs_epi16(filter2, zero);
1028
1029 // filt >> 1
1030 filt = _mm_adds_epi8(filter1, t1);
1031 filt = _mm_unpacklo_epi8(zero, filt);
1032 filt = _mm_srai_epi16(filt, 9);
1033 filt = _mm_packs_epi16(filt, zero);
1034
1035 filt = _mm_andnot_si128(hev, filt);
1036
1037 work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
1038 q0 = _mm_loadl_epi64((__m128i *)flat_oq0);
1039 work_a = _mm_andnot_si128(flat, work_a);
1040 q0 = _mm_and_si128(flat, q0);
1041 q0 = _mm_or_si128(work_a, q0);
1042
1043 work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
1044 q1 = _mm_loadl_epi64((__m128i *)flat_oq1);
1045 work_a = _mm_andnot_si128(flat, work_a);
1046 q1 = _mm_and_si128(flat, q1);
1047 q1 = _mm_or_si128(work_a, q1);
1048
1049 work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
1050 q2 = _mm_loadl_epi64((__m128i *)flat_oq2);
1051 work_a = _mm_andnot_si128(flat, work_a);
1052 q2 = _mm_and_si128(flat, q2);
1053 q2 = _mm_or_si128(work_a, q2);
1054
1055 work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
1056 p0 = _mm_loadl_epi64((__m128i *)flat_op0);
1057 work_a = _mm_andnot_si128(flat, work_a);
1058 p0 = _mm_and_si128(flat, p0);
1059 p0 = _mm_or_si128(work_a, p0);
1060
1061 work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
1062 p1 = _mm_loadl_epi64((__m128i *)flat_op1);
1063 work_a = _mm_andnot_si128(flat, work_a);
1064 p1 = _mm_and_si128(flat, p1);
1065 p1 = _mm_or_si128(work_a, p1);
1066
1067 work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
1068 p2 = _mm_loadl_epi64((__m128i *)flat_op2);
1069 work_a = _mm_andnot_si128(flat, work_a);
1070 p2 = _mm_and_si128(flat, p2);
1071 p2 = _mm_or_si128(work_a, p2);
1072
1073 _mm_storel_epi64((__m128i *)(s - 3 * p), p2);
1074 _mm_storel_epi64((__m128i *)(s - 2 * p), p1);
1075 _mm_storel_epi64((__m128i *)(s - 1 * p), p0);
1076 _mm_storel_epi64((__m128i *)(s + 0 * p), q0);
1077 _mm_storel_epi64((__m128i *)(s + 1 * p), q1);
1078 _mm_storel_epi64((__m128i *)(s + 2 * p), q2);
1079 }
1080 }
1081
1082 void vp9_lpf_horizontal_8_dual_sse2(uint8_t *s, int p,
1083 const uint8_t *_blimit0,
1084 const uint8_t *_limit0,
1085 const uint8_t *_thresh0,
1086 const uint8_t *_blimit1,
1087 const uint8_t *_limit1,
1088 const uint8_t *_thresh1) {
1089 DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op2, 16);
1090 DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op1, 16);
1091 DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op0, 16);
1092 DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq2, 16);
1093 DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq1, 16);
1094 DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq0, 16);
1095 const __m128i zero = _mm_set1_epi16(0);
1096 const __m128i blimit =
1097 _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0),
1098 _mm_load_si128((const __m128i *)_blimit1));
1099 const __m128i limit =
1100 _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0),
1101 _mm_load_si128((const __m128i *)_limit1));
1102 const __m128i thresh =
1103 _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0),
1104 _mm_load_si128((const __m128i *)_thresh1));
1105
1106 __m128i mask, hev, flat;
1107 __m128i p3, p2, p1, p0, q0, q1, q2, q3;
1108
1109 p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
1110 p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
1111 p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
1112 p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
1113 q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
1114 q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
1115 q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
1116 q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
1117 {
1118 const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
1119 _mm_subs_epu8(p0, p1));
1120 const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0),
1121 _mm_subs_epu8(q0, q1));
1122 const __m128i one = _mm_set1_epi8(1);
1123 const __m128i fe = _mm_set1_epi8(0xfe);
1124 const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
1125 __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0),
1126 _mm_subs_epu8(q0, p0));
1127 __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1),
1128 _mm_subs_epu8(q1, p1));
1129 __m128i work;
1130
1131 // filter_mask and hev_mask
1132 flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
1133 hev = _mm_subs_epu8(flat, thresh);
1134 hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
1135
1136     abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
1137 abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
1138 mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
1139 mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
1140 // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
1141 mask = _mm_max_epu8(flat, mask);
1142 // mask |= (abs(p1 - p0) > limit) * -1;
1143 // mask |= (abs(q1 - q0) > limit) * -1;
1144 work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1),
1145 _mm_subs_epu8(p1, p2)),
1146 _mm_or_si128(_mm_subs_epu8(p3, p2),
1147 _mm_subs_epu8(p2, p3)));
1148 mask = _mm_max_epu8(work, mask);
1149 work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1),
1150 _mm_subs_epu8(q1, q2)),
1151 _mm_or_si128(_mm_subs_epu8(q3, q2),
1152 _mm_subs_epu8(q2, q3)));
1153 mask = _mm_max_epu8(work, mask);
1154 mask = _mm_subs_epu8(mask, limit);
1155 mask = _mm_cmpeq_epi8(mask, zero);
1156
1157 // flat_mask4
1158 work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0),
1159 _mm_subs_epu8(p0, p2)),
1160 _mm_or_si128(_mm_subs_epu8(q2, q0),
1161 _mm_subs_epu8(q0, q2)));
1162 flat = _mm_max_epu8(work, flat);
1163 work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0),
1164 _mm_subs_epu8(p0, p3)),
1165 _mm_or_si128(_mm_subs_epu8(q3, q0),
1166 _mm_subs_epu8(q0, q3)));
1167 flat = _mm_max_epu8(work, flat);
1168 flat = _mm_subs_epu8(flat, one);
1169 flat = _mm_cmpeq_epi8(flat, zero);
1170 flat = _mm_and_si128(flat, mask);
1171 }
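  // Compute the 8-sample flat-filter outputs for both 8-pixel halves into the
  // flat_op*/flat_oq* scratch rows (two passes of 8 pixels each).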
1172 {
1173 const __m128i four = _mm_set1_epi16(4);
1174 unsigned char *src = s;
1175 int i = 0;
1176
1177 do {
1178 __m128i workp_a, workp_b, workp_shft;
1179 p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);
1180 p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);
1181 p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);
1182 p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero);
1183 q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero);
1184 q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);
1185 q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);
1186 q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);
1187
1188 workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
1189 workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
1190 workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
1191 workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
1192 _mm_storel_epi64((__m128i *)&flat_op2[i * 8],
1193 _mm_packus_epi16(workp_shft, workp_shft));
1194
1195 workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
1196 workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
1197 _mm_storel_epi64((__m128i *)&flat_op1[i * 8],
1198 _mm_packus_epi16(workp_shft, workp_shft));
1199
1200 workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
1201 workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
1202 workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
1203 _mm_storel_epi64((__m128i *)&flat_op0[i * 8],
1204 _mm_packus_epi16(workp_shft, workp_shft));
1205
1206 workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
1207 workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
1208 workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
1209 _mm_storel_epi64((__m128i *)&flat_oq0[i * 8],
1210 _mm_packus_epi16(workp_shft, workp_shft));
1211
1212 workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
1213 workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
1214 workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
1215 _mm_storel_epi64((__m128i *)&flat_oq1[i * 8],
1216 _mm_packus_epi16(workp_shft, workp_shft));
1217
1218 workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
1219 workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
1220 workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
1221 _mm_storel_epi64((__m128i *)&flat_oq2[i * 8],
1222 _mm_packus_epi16(workp_shft, workp_shft));
1223
1224 src += 8;
1225 } while (++i < 2);
1226 }
1227 // lp filter
1228 {
1229 const __m128i t4 = _mm_set1_epi8(4);
1230 const __m128i t3 = _mm_set1_epi8(3);
1231 const __m128i t80 = _mm_set1_epi8(0x80);
1232 const __m128i te0 = _mm_set1_epi8(0xe0);
1233 const __m128i t1f = _mm_set1_epi8(0x1f);
1234 const __m128i t1 = _mm_set1_epi8(0x1);
1235 const __m128i t7f = _mm_set1_epi8(0x7f);
1236
1237 const __m128i ps1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)),
1238 t80);
1239 const __m128i ps0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)),
1240 t80);
1241 const __m128i qs0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)),
1242 t80);
1243 const __m128i qs1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)),
1244 t80);
1245 __m128i filt;
1246 __m128i work_a;
1247 __m128i filter1, filter2;
1248
1249 filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
1250 work_a = _mm_subs_epi8(qs0, ps0);
1251 filt = _mm_adds_epi8(filt, work_a);
1252 filt = _mm_adds_epi8(filt, work_a);
1253 filt = _mm_adds_epi8(filt, work_a);
1254 // (vp9_filter + 3 * (qs0 - ps0)) & mask
1255 filt = _mm_and_si128(filt, mask);
1256
1257 filter1 = _mm_adds_epi8(filt, t4);
1258 filter2 = _mm_adds_epi8(filt, t3);
1259
1260 // Filter1 >> 3
1261 work_a = _mm_cmpgt_epi8(zero, filter1);
1262 filter1 = _mm_srli_epi16(filter1, 3);
1263 work_a = _mm_and_si128(work_a, te0);
1264 filter1 = _mm_and_si128(filter1, t1f);
1265 filter1 = _mm_or_si128(filter1, work_a);
1266
1267 // Filter2 >> 3
1268 work_a = _mm_cmpgt_epi8(zero, filter2);
1269 filter2 = _mm_srli_epi16(filter2, 3);
1270 work_a = _mm_and_si128(work_a, te0);
1271 filter2 = _mm_and_si128(filter2, t1f);
1272 filter2 = _mm_or_si128(filter2, work_a);
1273
1274 // filt >> 1
1275 filt = _mm_adds_epi8(filter1, t1);
1276 work_a = _mm_cmpgt_epi8(zero, filt);
1277 filt = _mm_srli_epi16(filt, 1);
1278 work_a = _mm_and_si128(work_a, t80);
1279 filt = _mm_and_si128(filt, t7f);
1280 filt = _mm_or_si128(filt, work_a);
1281
1282 filt = _mm_andnot_si128(hev, filt);
1283
1284 work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
1285 q0 = _mm_load_si128((__m128i *)flat_oq0);
1286 work_a = _mm_andnot_si128(flat, work_a);
1287 q0 = _mm_and_si128(flat, q0);
1288 q0 = _mm_or_si128(work_a, q0);
1289
1290 work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
1291 q1 = _mm_load_si128((__m128i *)flat_oq1);
1292 work_a = _mm_andnot_si128(flat, work_a);
1293 q1 = _mm_and_si128(flat, q1);
1294 q1 = _mm_or_si128(work_a, q1);
1295
1296 work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
1297 q2 = _mm_load_si128((__m128i *)flat_oq2);
1298 work_a = _mm_andnot_si128(flat, work_a);
1299 q2 = _mm_and_si128(flat, q2);
1300 q2 = _mm_or_si128(work_a, q2);
1301
1302 work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
1303 p0 = _mm_load_si128((__m128i *)flat_op0);
1304 work_a = _mm_andnot_si128(flat, work_a);
1305 p0 = _mm_and_si128(flat, p0);
1306 p0 = _mm_or_si128(work_a, p0);
1307
1308 work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
1309 p1 = _mm_load_si128((__m128i *)flat_op1);
1310 work_a = _mm_andnot_si128(flat, work_a);
1311 p1 = _mm_and_si128(flat, p1);
1312 p1 = _mm_or_si128(work_a, p1);
1313
1314 work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
1315 p2 = _mm_load_si128((__m128i *)flat_op2);
1316 work_a = _mm_andnot_si128(flat, work_a);
1317 p2 = _mm_and_si128(flat, p2);
1318 p2 = _mm_or_si128(work_a, p2);
1319
1320 _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
1321 _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
1322 _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
1323 _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
1324 _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
1325 _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
1326 }
1327 }
1328
1329 void vp9_lpf_horizontal_4_dual_sse2(unsigned char *s, int p,
1330 const unsigned char *_blimit0,
1331 const unsigned char *_limit0,
1332 const unsigned char *_thresh0,
1333 const unsigned char *_blimit1,
1334 const unsigned char *_limit1,
1335 const unsigned char *_thresh1) {
1336 const __m128i blimit =
1337 _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0),
1338 _mm_load_si128((const __m128i *)_blimit1));
1339 const __m128i limit =
1340 _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0),
1341 _mm_load_si128((const __m128i *)_limit1));
1342 const __m128i thresh =
1343 _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0),
1344 _mm_load_si128((const __m128i *)_thresh1));
1345 const __m128i zero = _mm_set1_epi16(0);
1346 __m128i p3, p2, p1, p0, q0, q1, q2, q3;
1347 __m128i mask, hev, flat;
1348
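  // The two sets of thresholds sit in the low and high halves of
  // blimit/limit/thresh, so two adjacent 8-pixel edges are filtered in a
  // single 16-byte pass.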
1349 p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
1350 p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
1351 p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
1352 p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
1353 q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
1354 q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
1355 q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
1356 q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
1357
1358 // filter_mask and hev_mask
1359 {
1360 const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
1361 _mm_subs_epu8(p0, p1));
1362 const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0),
1363 _mm_subs_epu8(q0, q1));
1364 const __m128i fe = _mm_set1_epi8(0xfe);
1365 const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
1366 __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0),
1367 _mm_subs_epu8(q0, p0));
1368 __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1),
1369 _mm_subs_epu8(q1, p1));
1370 __m128i work;
1371
1372 flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
1373 hev = _mm_subs_epu8(flat, thresh);
1374 hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
1375
1376     abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
1377 abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
1378 mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
1379 mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
1380 // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
1381 mask = _mm_max_epu8(flat, mask);
1382 // mask |= (abs(p1 - p0) > limit) * -1;
1383 // mask |= (abs(q1 - q0) > limit) * -1;
1384 work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1),
1385 _mm_subs_epu8(p1, p2)),
1386 _mm_or_si128(_mm_subs_epu8(p3, p2),
1387 _mm_subs_epu8(p2, p3)));
1388 mask = _mm_max_epu8(work, mask);
1389 work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1),
1390 _mm_subs_epu8(q1, q2)),
1391 _mm_or_si128(_mm_subs_epu8(q3, q2),
1392 _mm_subs_epu8(q2, q3)));
1393 mask = _mm_max_epu8(work, mask);
1394 mask = _mm_subs_epu8(mask, limit);
1395 mask = _mm_cmpeq_epi8(mask, zero);
1396 }
1397
1398 // filter4
1399 {
1400 const __m128i t4 = _mm_set1_epi8(4);
1401 const __m128i t3 = _mm_set1_epi8(3);
1402 const __m128i t80 = _mm_set1_epi8(0x80);
1403 const __m128i te0 = _mm_set1_epi8(0xe0);
1404 const __m128i t1f = _mm_set1_epi8(0x1f);
1405 const __m128i t1 = _mm_set1_epi8(0x1);
1406 const __m128i t7f = _mm_set1_epi8(0x7f);
1407
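    // XORing with 0x80 maps the unsigned pixel range [0, 255] onto the
    // signed range [-128, 127], so the filter taps can be computed with
    // saturating signed byte arithmetic.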
    const __m128i ps1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)),
                                      t80);
    const __m128i ps0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)),
                                      t80);
    const __m128i qs0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)),
                                      t80);
    const __m128i qs1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)),
                                      t80);
    __m128i filt;
    __m128i work_a;
    __m128i filter1, filter2;

    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
    work_a = _mm_subs_epi8(qs0, ps0);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    // (vp9_filter + 3 * (qs0 - ps0)) & mask
    filt = _mm_and_si128(filt, mask);

    filter1 = _mm_adds_epi8(filt, t4);
    filter2 = _mm_adds_epi8(filt, t3);

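    // SSE2 has no 8-bit arithmetic right shift, so each ">> 3" below shifts
    // 16-bit lanes logically and then restores the sign bits: te0 supplies
    // the top three bits for negative bytes, t1f keeps the low five bits of
    // the shifted value.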
    // Filter1 >> 3
    work_a = _mm_cmpgt_epi8(zero, filter1);
    filter1 = _mm_srli_epi16(filter1, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter1 = _mm_and_si128(filter1, t1f);
    filter1 = _mm_or_si128(filter1, work_a);

    // Filter2 >> 3
    work_a = _mm_cmpgt_epi8(zero, filter2);
    filter2 = _mm_srli_epi16(filter2, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter2 = _mm_and_si128(filter2, t1f);
    filter2 = _mm_or_si128(filter2, work_a);

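    // The outer-tap adjustment is (Filter1 + 1) >> 1, applied to p1/q1 only
    // where the high-edge-variance mask is not set.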
    // filt >> 1
    filt = _mm_adds_epi8(filter1, t1);
    work_a = _mm_cmpgt_epi8(zero, filt);
    filt = _mm_srli_epi16(filt, 1);
    work_a = _mm_and_si128(work_a, t80);
    filt = _mm_and_si128(filt, t7f);
    filt = _mm_or_si128(filt, work_a);

    filt = _mm_andnot_si128(hev, filt);

    q0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
    q1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
    p0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
    p1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);

    _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
    _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
    _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
    _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
  }
}

static INLINE void transpose8x16(unsigned char *in0, unsigned char *in1,
                                 int in_p, unsigned char *out, int out_p) {
  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
  __m128i x8, x9, x10, x11, x12, x13, x14, x15;
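  // Each input pointer provides 8 rows of 8 bytes; interleaving at byte,
  // 16-bit, 32-bit and finally 64-bit granularity turns the two 8x8 tiles
  // into 8 output rows of 16 bytes.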

  // Read in 16 lines
  x0 = _mm_loadl_epi64((__m128i *)in0);
  x8 = _mm_loadl_epi64((__m128i *)in1);
  x1 = _mm_loadl_epi64((__m128i *)(in0 + in_p));
  x9 = _mm_loadl_epi64((__m128i *)(in1 + in_p));
  x2 = _mm_loadl_epi64((__m128i *)(in0 + 2 * in_p));
  x10 = _mm_loadl_epi64((__m128i *)(in1 + 2 * in_p));
  x3 = _mm_loadl_epi64((__m128i *)(in0 + 3 * in_p));
  x11 = _mm_loadl_epi64((__m128i *)(in1 + 3 * in_p));
  x4 = _mm_loadl_epi64((__m128i *)(in0 + 4 * in_p));
  x12 = _mm_loadl_epi64((__m128i *)(in1 + 4 * in_p));
  x5 = _mm_loadl_epi64((__m128i *)(in0 + 5 * in_p));
  x13 = _mm_loadl_epi64((__m128i *)(in1 + 5 * in_p));
  x6 = _mm_loadl_epi64((__m128i *)(in0 + 6 * in_p));
  x14 = _mm_loadl_epi64((__m128i *)(in1 + 6 * in_p));
  x7 = _mm_loadl_epi64((__m128i *)(in0 + 7 * in_p));
  x15 = _mm_loadl_epi64((__m128i *)(in1 + 7 * in_p));

  x0 = _mm_unpacklo_epi8(x0, x1);
  x1 = _mm_unpacklo_epi8(x2, x3);
  x2 = _mm_unpacklo_epi8(x4, x5);
  x3 = _mm_unpacklo_epi8(x6, x7);

  x8 = _mm_unpacklo_epi8(x8, x9);
  x9 = _mm_unpacklo_epi8(x10, x11);
  x10 = _mm_unpacklo_epi8(x12, x13);
  x11 = _mm_unpacklo_epi8(x14, x15);

  x4 = _mm_unpacklo_epi16(x0, x1);
  x5 = _mm_unpacklo_epi16(x2, x3);
  x12 = _mm_unpacklo_epi16(x8, x9);
  x13 = _mm_unpacklo_epi16(x10, x11);

  x6 = _mm_unpacklo_epi32(x4, x5);
  x7 = _mm_unpackhi_epi32(x4, x5);
  x14 = _mm_unpacklo_epi32(x12, x13);
  x15 = _mm_unpackhi_epi32(x12, x13);

  // Store first 4-line result
  _mm_storeu_si128((__m128i *)out, _mm_unpacklo_epi64(x6, x14));
  _mm_storeu_si128((__m128i *)(out + out_p), _mm_unpackhi_epi64(x6, x14));
  _mm_storeu_si128((__m128i *)(out + 2 * out_p), _mm_unpacklo_epi64(x7, x15));
  _mm_storeu_si128((__m128i *)(out + 3 * out_p), _mm_unpackhi_epi64(x7, x15));

  x4 = _mm_unpackhi_epi16(x0, x1);
  x5 = _mm_unpackhi_epi16(x2, x3);
  x12 = _mm_unpackhi_epi16(x8, x9);
  x13 = _mm_unpackhi_epi16(x10, x11);

  x6 = _mm_unpacklo_epi32(x4, x5);
  x7 = _mm_unpackhi_epi32(x4, x5);
  x14 = _mm_unpacklo_epi32(x12, x13);
  x15 = _mm_unpackhi_epi32(x12, x13);

  // Store second 4-line result
  _mm_storeu_si128((__m128i *)(out + 4 * out_p), _mm_unpacklo_epi64(x6, x14));
  _mm_storeu_si128((__m128i *)(out + 5 * out_p), _mm_unpackhi_epi64(x6, x14));
  _mm_storeu_si128((__m128i *)(out + 6 * out_p), _mm_unpacklo_epi64(x7, x15));
  _mm_storeu_si128((__m128i *)(out + 7 * out_p), _mm_unpackhi_epi64(x7, x15));
}

static INLINE void transpose(unsigned char *src[], int in_p,
                             unsigned char *dst[], int out_p,
                             int num_8x8_to_transpose) {
  int idx8x8 = 0;
  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
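  // Transposes each listed 8x8 block independently: src[i] and dst[i] give
  // the top-left corner of block i in the input and output layouts.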
  do {
    unsigned char *in = src[idx8x8];
    unsigned char *out = dst[idx8x8];

    x0 = _mm_loadl_epi64((__m128i *)(in + 0*in_p));  // 00 01 02 03 04 05 06 07
    x1 = _mm_loadl_epi64((__m128i *)(in + 1*in_p));  // 10 11 12 13 14 15 16 17
    x2 = _mm_loadl_epi64((__m128i *)(in + 2*in_p));  // 20 21 22 23 24 25 26 27
    x3 = _mm_loadl_epi64((__m128i *)(in + 3*in_p));  // 30 31 32 33 34 35 36 37
    x4 = _mm_loadl_epi64((__m128i *)(in + 4*in_p));  // 40 41 42 43 44 45 46 47
    x5 = _mm_loadl_epi64((__m128i *)(in + 5*in_p));  // 50 51 52 53 54 55 56 57
    x6 = _mm_loadl_epi64((__m128i *)(in + 6*in_p));  // 60 61 62 63 64 65 66 67
    x7 = _mm_loadl_epi64((__m128i *)(in + 7*in_p));  // 70 71 72 73 74 75 76 77
    // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
    x0 = _mm_unpacklo_epi8(x0, x1);
    // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
    x1 = _mm_unpacklo_epi8(x2, x3);
    // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
    x2 = _mm_unpacklo_epi8(x4, x5);
    // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
    x3 = _mm_unpacklo_epi8(x6, x7);
    // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
    x4 = _mm_unpacklo_epi16(x0, x1);
    // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
    x5 = _mm_unpacklo_epi16(x2, x3);
    // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
    x6 = _mm_unpacklo_epi32(x4, x5);
    // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
    x7 = _mm_unpackhi_epi32(x4, x5);

    _mm_storel_pd((double *)(out + 0*out_p),
                  _mm_castsi128_pd(x6));  // 00 10 20 30 40 50 60 70
    _mm_storeh_pd((double *)(out + 1*out_p),
                  _mm_castsi128_pd(x6));  // 01 11 21 31 41 51 61 71
    _mm_storel_pd((double *)(out + 2*out_p),
                  _mm_castsi128_pd(x7));  // 02 12 22 32 42 52 62 72
    _mm_storeh_pd((double *)(out + 3*out_p),
                  _mm_castsi128_pd(x7));  // 03 13 23 33 43 53 63 73

    // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
    x4 = _mm_unpackhi_epi16(x0, x1);
    // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
    x5 = _mm_unpackhi_epi16(x2, x3);
    // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
    x6 = _mm_unpacklo_epi32(x4, x5);
    // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
    x7 = _mm_unpackhi_epi32(x4, x5);

    _mm_storel_pd((double *)(out + 4*out_p),
                  _mm_castsi128_pd(x6));  // 04 14 24 34 44 54 64 74
    _mm_storeh_pd((double *)(out + 5*out_p),
                  _mm_castsi128_pd(x6));  // 05 15 25 35 45 55 65 75
    _mm_storel_pd((double *)(out + 6*out_p),
                  _mm_castsi128_pd(x7));  // 06 16 26 36 46 56 66 76
    _mm_storeh_pd((double *)(out + 7*out_p),
                  _mm_castsi128_pd(x7));  // 07 17 27 37 47 57 67 77
  } while (++idx8x8 < num_8x8_to_transpose);
}

void vp9_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
                                  const uint8_t *limit0,
                                  const uint8_t *thresh0,
                                  const uint8_t *blimit1,
                                  const uint8_t *limit1,
                                  const uint8_t *thresh1) {
  DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 16 * 8);
  unsigned char *src[2];
  unsigned char *dst[2];
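  // Vertical edges are handled by transposing the pixel neighbourhood into
  // t_dst, running the horizontal dual filter on the buffer, and transposing
  // the result back into the frame.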

  // Transpose 8x16
  transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);

  // Loop filtering
  vp9_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
                                 blimit1, limit1, thresh1);
  src[0] = t_dst;
  src[1] = t_dst + 8;
  dst[0] = s - 4;
  dst[1] = s - 4 + p * 8;

  // Transpose back
  transpose(src, 16, dst, p, 2);
}

void vp9_lpf_vertical_8_sse2(unsigned char *s, int p,
                             const unsigned char *blimit,
                             const unsigned char *limit,
                             const unsigned char *thresh, int count) {
  DECLARE_ALIGNED_ARRAY(8, unsigned char, t_dst, 8 * 8);
  unsigned char *src[1];
  unsigned char *dst[1];
  (void)count;

  // Transpose 8x8
  src[0] = s - 4;
  dst[0] = t_dst;

  transpose(src, p, dst, 8, 1);

  // Loop filtering
  vp9_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, 1);

  src[0] = t_dst;
  dst[0] = s - 4;

  // Transpose back
  transpose(src, 8, dst, p, 1);
}

void vp9_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
                                  const uint8_t *limit0,
                                  const uint8_t *thresh0,
                                  const uint8_t *blimit1,
                                  const uint8_t *limit1,
                                  const uint8_t *thresh1) {
  DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 16 * 8);
  unsigned char *src[2];
  unsigned char *dst[2];

  // Transpose 8x16
  transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);

  // Loop filtering
  vp9_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
                                 blimit1, limit1, thresh1);
  src[0] = t_dst;
  src[1] = t_dst + 8;

  dst[0] = s - 4;
  dst[1] = s - 4 + p * 8;

  // Transpose back
  transpose(src, 16, dst, p, 2);
}

void vp9_lpf_vertical_16_sse2(unsigned char *s, int p,
                              const unsigned char *blimit,
                              const unsigned char *limit,
                              const unsigned char *thresh) {
  DECLARE_ALIGNED_ARRAY(8, unsigned char, t_dst, 8 * 16);
  unsigned char *src[2];
  unsigned char *dst[2];

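  // s - 8 and s give the two 8x8 halves on either side of the vertical edge;
  // after the transpose the edge sits at row 8 of the 8x16 buffer, which is
  // where the wide horizontal filter is applied below.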
  src[0] = s - 8;
  src[1] = s;
  dst[0] = t_dst;
  dst[1] = t_dst + 8 * 8;

  // Transpose 16x8
  transpose(src, p, dst, 8, 2);

  // Loop filtering
  mb_lpf_horizontal_edge_w_sse2_8(t_dst + 8 * 8, 8, blimit, limit, thresh);

  src[0] = t_dst;
  src[1] = t_dst + 8 * 8;
  dst[0] = s - 8;
  dst[1] = s;

  // Transpose back
  transpose(src, 8, dst, p, 2);
}

void vp9_lpf_vertical_16_dual_sse2(unsigned char *s, int p,
                                   const uint8_t *blimit, const uint8_t *limit,
                                   const uint8_t *thresh) {
  DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 256);
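  // Two 8x16 transposes build a full 16x16 block in t_dst with the vertical
  // edge mapped to row 8, so the 16-wide horizontal filter handles all 16
  // columns in a single call.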

  // Transpose 16x16
  transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16);
  transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16);

  // Loop filtering
  mb_lpf_horizontal_edge_w_sse2_16(t_dst + 8 * 16, 16, blimit, limit,
                                   thresh);

  // Transpose back
  transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p);
  transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p);
}