/*
 * Copyright (c) 2022, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <immintrin.h> /* AVX2 */

#include "config/aom_dsp_rtcd.h"

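// Shuffle mask for _mm256_shuffle_epi8: with a 16-pixel row duplicated into
// both 128-bit lanes, it zero-extends bytes 0-7 in the low lane and bytes
// 8-15 in the high lane to 16-bit values (index 128 makes pshufb write zero).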
DECLARE_ALIGNED(32, static const uint8_t, filt_loopfilter_avx2[32]) = {
  0, 128, 1, 128, 2, 128, 3, 128, 4, 128, 5, 128, 6, 128, 7, 128,
  8, 128, 9, 128, 10, 128, 11, 128, 12, 128, 13, 128, 14, 128, 15, 128
};

void aom_lpf_horizontal_6_quad_avx2(unsigned char *s, int p,
                                    const unsigned char *_blimit0,
                                    const unsigned char *_limit0,
                                    const unsigned char *_thresh0) {
  __m256i p256_2, q256_2, p256_1, q256_1, p256_0, q256_0;
  __m128i p2, p1, p0, q0, q1, q2;
  __m128i mask, flat;

  const __m128i thresh_v =
      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh0[0]));
  const __m128i limit_v =
      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit0[0]));
  const __m128i blimit_v =
      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit0[0]));
  const __m128i zero = _mm_setzero_si128();
  const __m128i ff = _mm_cmpeq_epi8(zero, zero);

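  // Load each row of 16 pixels and duplicate it into both 128-bit lanes so
  // the flat path can widen the full row to 16-bit lanes later on.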
  p256_2 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 3 * p)));
  p256_1 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 2 * p)));
  p256_0 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 1 * p)));
  q256_0 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 0 * p)));
  q256_1 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 1 * p)));
  q256_2 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 2 * p)));

  p2 = _mm256_castsi256_si128(p256_2);
  p1 = _mm256_castsi256_si128(p256_1);
  p0 = _mm256_castsi256_si128(p256_0);
  q0 = _mm256_castsi256_si128(q256_0);
  q1 = _mm256_castsi256_si128(q256_1);
  q2 = _mm256_castsi256_si128(q256_2);

  {
    __m128i work;
    const __m128i fe = _mm_set1_epi8((int8_t)0xfe);
    const __m128i abs_p1p0 =
        _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1));
    const __m128i abs_q1q0 =
        _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1));
    __m128i abs_p0q0 =
        _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0));
    __m128i abs_p1q1 =
        _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1));

    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);

    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v);
    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
    mask = _mm_max_epu8(flat, mask);
    // mask |= (abs(p1 - p0) > limit) * -1;
    // mask |= (abs(q1 - q0) > limit) * -1;
    work = _mm_max_epu8(
        _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)),
        _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)));
    mask = _mm_max_epu8(work, mask);
    mask = _mm_subs_epu8(mask, limit_v);
    mask = _mm_cmpeq_epi8(mask, zero);
  }

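  // If the filter mask is zero for all 16 columns, nothing needs filtering.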
  if (0xffff == _mm_movemask_epi8(_mm_cmpeq_epi8(mask, zero))) return;

  // loop filter
  {
    const __m128i t4 = _mm_set1_epi8(4);
    const __m128i t3 = _mm_set1_epi8(3);
    const __m128i t80 = _mm_set1_epi8((int8_t)0x80);
    const __m128i te0 = _mm_set1_epi8((int8_t)0xe0);
    const __m128i t1f = _mm_set1_epi8(0x1f);
    const __m128i t1 = _mm_set1_epi8(0x1);
    const __m128i t7f = _mm_set1_epi8(0x7f);
    const __m128i one = _mm_set1_epi8(1);
    __m128i hev;

    hev = _mm_subs_epu8(flat, thresh_v);
    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);

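    // Narrow 4-tap filter, applied in the signed domain (pixels offset by
    // 0x80): filter1/filter2 adjust q0/p0, and a rounded half of filter1
    // adjusts p1/q1 where hev is not set.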
    __m128i ps1 = _mm_xor_si128(p1, t80);
    __m128i ps0 = _mm_xor_si128(p0, t80);
    __m128i qs0 = _mm_xor_si128(q0, t80);
    __m128i qs1 = _mm_xor_si128(q1, t80);
    __m128i filt;
    __m128i work_a;
    __m128i filter1, filter2;
    __m128i flat_p1, flat_p0, flat_q0, flat_q1;

    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
    work_a = _mm_subs_epi8(qs0, ps0);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_and_si128(filt, mask);

    filter1 = _mm_adds_epi8(filt, t4);
    filter2 = _mm_adds_epi8(filt, t3);

    work_a = _mm_cmpgt_epi8(zero, filter1);
    filter1 = _mm_srli_epi16(filter1, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter1 = _mm_and_si128(filter1, t1f);
    filter1 = _mm_or_si128(filter1, work_a);
    qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);

    work_a = _mm_cmpgt_epi8(zero, filter2);
    filter2 = _mm_srli_epi16(filter2, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter2 = _mm_and_si128(filter2, t1f);
    filter2 = _mm_or_si128(filter2, work_a);
    ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);

    filt = _mm_adds_epi8(filter1, t1);
    work_a = _mm_cmpgt_epi8(zero, filt);
    filt = _mm_srli_epi16(filt, 1);
    work_a = _mm_and_si128(work_a, t80);
    filt = _mm_and_si128(filt, t7f);
    filt = _mm_or_si128(filt, work_a);
    filt = _mm_andnot_si128(hev, filt);
    ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
    qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);

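    // Flat mask: every one of |p2-p0|, |q2-q0|, |p1-p0| and |q1-q0| must be
    // <= 1, and the pixel must already be selected by the filter mask.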
    __m128i work;
    work = _mm_max_epu8(
        _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)),
        _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2)));
    flat = _mm_max_epu8(work, flat);
    flat = _mm_subs_epu8(flat, one);
    flat = _mm_cmpeq_epi8(flat, zero);
    flat = _mm_and_si128(flat, mask);

    if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
      const __m256i four = _mm256_set1_epi16(4);
      __m256i pixetFilter, add, res;

      const __m256i filter =
          _mm256_load_si256((__m256i const *)filt_loopfilter_avx2);

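      // Widen every row to 16-bit lanes so the 6-tap sums below cannot
      // overflow, then compute the flat outputs p1..q1 incrementally.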
      p256_2 = _mm256_shuffle_epi8(p256_2, filter);
      p256_1 = _mm256_shuffle_epi8(p256_1, filter);
      p256_0 = _mm256_shuffle_epi8(p256_0, filter);
      q256_0 = _mm256_shuffle_epi8(q256_0, filter);
      q256_1 = _mm256_shuffle_epi8(q256_1, filter);
      q256_2 = _mm256_shuffle_epi8(q256_2, filter);

      pixetFilter = _mm256_slli_epi16(
          _mm256_add_epi16(p256_2, _mm256_add_epi16(p256_1, p256_0)), 1);
      pixetFilter =
          _mm256_add_epi16(pixetFilter, _mm256_add_epi16(p256_2, q256_0));
      pixetFilter = _mm256_add_epi16(four, pixetFilter);
      res = _mm256_srli_epi16(pixetFilter, 3);
      flat_p1 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res, res), 168));
      p1 = _mm_andnot_si128(flat, ps1);
      flat_p1 = _mm_and_si128(flat, flat_p1);
      p1 = _mm_or_si128(flat_p1, p1);

      add = _mm256_add_epi16(_mm256_sub_epi16(q256_1, p256_2),
                             _mm256_sub_epi16(q256_0, p256_2));
      pixetFilter = _mm256_add_epi16(pixetFilter, add);
      res = _mm256_srli_epi16(pixetFilter, 3);
      flat_p0 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res, res), 168));
      p0 = _mm_andnot_si128(flat, ps0);
      flat_p0 = _mm_and_si128(flat, flat_p0);
      p0 = _mm_or_si128(flat_p0, p0);

      add = _mm256_add_epi16(_mm256_sub_epi16(q256_2, p256_2),
                             _mm256_sub_epi16(q256_1, p256_1));
      pixetFilter = _mm256_add_epi16(pixetFilter, add);
      res = _mm256_srli_epi16(pixetFilter, 3);
      flat_q0 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res, res), 168));
      q0 = _mm_andnot_si128(flat, qs0);
      flat_q0 = _mm_and_si128(flat, flat_q0);
      q0 = _mm_or_si128(flat_q0, q0);

      add = _mm256_add_epi16(_mm256_sub_epi16(q256_2, p256_1),
                             _mm256_sub_epi16(q256_2, p256_0));
      pixetFilter = _mm256_add_epi16(pixetFilter, add);
      res = _mm256_srli_epi16(pixetFilter, 3);
      flat_q1 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res, res), 168));
      q1 = _mm_andnot_si128(flat, qs1);
      flat_q1 = _mm_and_si128(flat, flat_q1);
      q1 = _mm_or_si128(flat_q1, q1);

      _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
      _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
      _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
      _mm_storeu_si128((__m128i *)(s - 0 * p), q0);
      _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
      _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
    } else {
      _mm_storeu_si128((__m128i *)(s - 2 * p), ps1);
      _mm_storeu_si128((__m128i *)(s - 1 * p), ps0);
      _mm_storeu_si128((__m128i *)(s - 0 * p), qs0);
      _mm_storeu_si128((__m128i *)(s + 1 * p), qs1);
    }
  }
}

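// 8-tap variant: identical structure to the 6-tap filter above, but the mask
// and flat checks also use p3/q3, and the flat path rewrites p2/q2 as well.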
void aom_lpf_horizontal_8_quad_avx2(unsigned char *s, int p,
                                    const unsigned char *_blimit0,
                                    const unsigned char *_limit0,
                                    const unsigned char *_thresh0) {
  __m256i p256_3, q256_3, p256_2, q256_2, p256_1, q256_1, p256_0, q256_0;
  __m128i p3, p2, p1, p0, q0, q1, q2, q3;
  __m128i mask, flat;

  const __m128i thresh_v =
      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh0[0]));
  const __m128i limit_v =
      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit0[0]));
  const __m128i blimit_v =
      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit0[0]));
  const __m128i zero = _mm_setzero_si128();
  const __m128i ff = _mm_cmpeq_epi8(zero, zero);

  p256_3 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 4 * p)));
  p256_2 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 3 * p)));
  p256_1 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 2 * p)));
  p256_0 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 1 * p)));
  q256_0 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 0 * p)));
  q256_1 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 1 * p)));
  q256_2 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 2 * p)));
  q256_3 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 3 * p)));

  p3 = _mm256_castsi256_si128(p256_3);
  p2 = _mm256_castsi256_si128(p256_2);
  p1 = _mm256_castsi256_si128(p256_1);
  p0 = _mm256_castsi256_si128(p256_0);
  q0 = _mm256_castsi256_si128(q256_0);
  q1 = _mm256_castsi256_si128(q256_1);
  q2 = _mm256_castsi256_si128(q256_2);
  q3 = _mm256_castsi256_si128(q256_3);

  {
    __m128i work;
    const __m128i fe = _mm_set1_epi8((int8_t)0xfe);
    const __m128i abs_p1p0 =
        _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1));
    const __m128i abs_q1q0 =
        _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1));
    __m128i abs_p0q0 =
        _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0));
    __m128i abs_p1q1 =
        _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1));

    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);

    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v);
    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
    mask = _mm_max_epu8(flat, mask);
    // mask |= (abs(p1 - p0) > limit) * -1;
    // mask |= (abs(q1 - q0) > limit) * -1;
    work = _mm_max_epu8(
        _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)),
        _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3)));
    mask = _mm_max_epu8(work, mask);
    work = _mm_max_epu8(
        _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)),
        _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3)));
    mask = _mm_max_epu8(work, mask);
    mask = _mm_subs_epu8(mask, limit_v);
    mask = _mm_cmpeq_epi8(mask, zero);
  }

  if (0xffff == _mm_movemask_epi8(_mm_cmpeq_epi8(mask, zero))) return;

  // loop filter
  {
    const __m128i t4 = _mm_set1_epi8(4);
    const __m128i t3 = _mm_set1_epi8(3);
    const __m128i t80 = _mm_set1_epi8((int8_t)0x80);
    const __m128i te0 = _mm_set1_epi8((int8_t)0xe0);
    const __m128i t1f = _mm_set1_epi8(0x1f);
    const __m128i t1 = _mm_set1_epi8(0x1);
    const __m128i t7f = _mm_set1_epi8(0x7f);
    const __m128i one = _mm_set1_epi8(1);
    __m128i hev;

    hev = _mm_subs_epu8(flat, thresh_v);
    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);

    __m128i ps1 = _mm_xor_si128(p1, t80);
    __m128i ps0 = _mm_xor_si128(p0, t80);
    __m128i qs0 = _mm_xor_si128(q0, t80);
    __m128i qs1 = _mm_xor_si128(q1, t80);
    __m128i filt;
    __m128i work_a;
    __m128i filter1, filter2;
    __m128i flat_p2, flat_p1, flat_p0, flat_q0, flat_q1, flat_q2;

    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
    work_a = _mm_subs_epi8(qs0, ps0);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_and_si128(filt, mask);

    filter1 = _mm_adds_epi8(filt, t4);
    filter2 = _mm_adds_epi8(filt, t3);

    work_a = _mm_cmpgt_epi8(zero, filter1);
    filter1 = _mm_srli_epi16(filter1, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter1 = _mm_and_si128(filter1, t1f);
    filter1 = _mm_or_si128(filter1, work_a);
    qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);

    work_a = _mm_cmpgt_epi8(zero, filter2);
    filter2 = _mm_srli_epi16(filter2, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter2 = _mm_and_si128(filter2, t1f);
    filter2 = _mm_or_si128(filter2, work_a);
    ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);

    filt = _mm_adds_epi8(filter1, t1);
    work_a = _mm_cmpgt_epi8(zero, filt);
    filt = _mm_srli_epi16(filt, 1);
    work_a = _mm_and_si128(work_a, t80);
    filt = _mm_and_si128(filt, t7f);
    filt = _mm_or_si128(filt, work_a);
    filt = _mm_andnot_si128(hev, filt);
    ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
    qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);

    __m128i work;
    work = _mm_max_epu8(
        _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)),
        _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2)));
    flat = _mm_max_epu8(work, flat);
    work = _mm_max_epu8(
        _mm_or_si128(_mm_subs_epu8(p3, p0), _mm_subs_epu8(p0, p3)),
        _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3)));
    flat = _mm_max_epu8(work, flat);
    flat = _mm_subs_epu8(flat, one);
    flat = _mm_cmpeq_epi8(flat, zero);
    flat = _mm_and_si128(flat, mask);

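    // Flat path: the 8-tap filter produces p2..q2; otherwise only the 4-tap
    // results p1..q1 are written back.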
    if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
      const __m256i four = _mm256_set1_epi16(4);
      __m256i pixetFilter_p2p1p0, p2p1p0, q2q1q0, pixetFilter_q2q1q0, sum_p,
          sum_q, res_p, res_q;

      const __m256i filter =
          _mm256_load_si256((__m256i const *)filt_loopfilter_avx2);

      p256_3 = _mm256_shuffle_epi8(p256_3, filter);
      p256_2 = _mm256_shuffle_epi8(p256_2, filter);
      p256_1 = _mm256_shuffle_epi8(p256_1, filter);
      p256_0 = _mm256_shuffle_epi8(p256_0, filter);
      q256_0 = _mm256_shuffle_epi8(q256_0, filter);
      q256_1 = _mm256_shuffle_epi8(q256_1, filter);
      q256_2 = _mm256_shuffle_epi8(q256_2, filter);
      q256_3 = _mm256_shuffle_epi8(q256_3, filter);

      p2p1p0 = _mm256_add_epi16(p256_0, _mm256_add_epi16(p256_2, p256_1));
      q2q1q0 = _mm256_add_epi16(q256_0, _mm256_add_epi16(q256_2, q256_1));

      pixetFilter_p2p1p0 =
          _mm256_add_epi16(four, _mm256_add_epi16(p2p1p0, q2q1q0));
      pixetFilter_q2q1q0 = pixetFilter_p2p1p0;

      pixetFilter_p2p1p0 = _mm256_add_epi16(pixetFilter_p2p1p0, p256_3);
      res_p =
          _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, p256_0), 3);
      flat_p0 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
      p0 = _mm_andnot_si128(flat, ps0);
      flat_p0 = _mm_and_si128(flat, flat_p0);
      p0 = _mm_or_si128(flat_p0, p0);

      pixetFilter_q2q1q0 = _mm256_add_epi16(pixetFilter_q2q1q0, q256_3);
      res_q =
          _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0, q256_0), 3);
      flat_q0 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
      q0 = _mm_andnot_si128(flat, qs0);
      flat_q0 = _mm_and_si128(flat, flat_q0);
      q0 = _mm_or_si128(flat_q0, q0);

      sum_p = _mm256_sub_epi16(p256_3, q256_2);
      pixetFilter_p2p1p0 = _mm256_add_epi16(pixetFilter_p2p1p0, sum_p);
      res_p =
          _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, p256_1), 3);
      flat_p1 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
      p1 = _mm_andnot_si128(flat, ps1);
      flat_p1 = _mm_and_si128(flat, flat_p1);
      p1 = _mm_or_si128(flat_p1, p1);

      sum_q = _mm256_sub_epi16(q256_3, p256_2);
      pixetFilter_q2q1q0 = _mm256_add_epi16(pixetFilter_q2q1q0, sum_q);
      res_q =
          _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0, q256_1), 3);
      flat_q1 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
      q1 = _mm_andnot_si128(flat, qs1);
      flat_q1 = _mm_and_si128(flat, flat_q1);
      q1 = _mm_or_si128(flat_q1, q1);

      sum_p = _mm256_sub_epi16(p256_3, q256_1);
      pixetFilter_p2p1p0 = _mm256_add_epi16(pixetFilter_p2p1p0, sum_p);
      res_p =
          _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, p256_2), 3);
      flat_p2 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
      p2 = _mm_andnot_si128(flat, p2);
      flat_p2 = _mm_and_si128(flat, flat_p2);
      p2 = _mm_or_si128(flat_p2, p2);

      sum_q = _mm256_sub_epi16(q256_3, p256_1);
      pixetFilter_q2q1q0 = _mm256_add_epi16(pixetFilter_q2q1q0, sum_q);
      res_q =
          _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0, q256_2), 3);
      flat_q2 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
      q2 = _mm_andnot_si128(flat, q2);
      flat_q2 = _mm_and_si128(flat, flat_q2);
      q2 = _mm_or_si128(flat_q2, q2);

      _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
      _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
      _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
      _mm_storeu_si128((__m128i *)(s - 0 * p), q0);
      _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
      _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
    } else {
      _mm_storeu_si128((__m128i *)(s - 2 * p), ps1);
      _mm_storeu_si128((__m128i *)(s - 1 * p), ps0);
      _mm_storeu_si128((__m128i *)(s - 0 * p), qs0);
      _mm_storeu_si128((__m128i *)(s + 1 * p), qs1);
    }
  }
}

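// Transpose a 16x16 block of bytes between two buffers. With is_store_avx2
// set, each 256-bit store writes two adjacent output rows, which is only
// valid when out_p is 16 (the temporary buffer used by the vertical filter).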
static INLINE void trans_store_16x16_lpf_vert14(unsigned char *in0, int in_p,
                                                unsigned char *out, int out_p,
                                                int is_store_avx2) {
  const __m128i x0 = _mm_loadu_si128((__m128i *)in0);
  const __m128i x1 = _mm_loadu_si128((__m128i *)(in0 + in_p * 1));
  const __m128i x2 = _mm_loadu_si128((__m128i *)(in0 + in_p * 2));
  const __m128i x3 = _mm_loadu_si128((__m128i *)(in0 + in_p * 3));
  const __m128i x4 = _mm_loadu_si128((__m128i *)(in0 + in_p * 4));
  const __m128i x5 = _mm_loadu_si128((__m128i *)(in0 + in_p * 5));
  const __m128i x6 = _mm_loadu_si128((__m128i *)(in0 + in_p * 6));
  const __m128i x7 = _mm_loadu_si128((__m128i *)(in0 + in_p * 7));

  const __m256i y0 = _mm256_insertf128_si256(
      _mm256_castsi128_si256(x0), _mm_loadu_si128((__m128i *)(in0 + in_p * 8)),
      0x1);
  const __m256i y1 = _mm256_insertf128_si256(
      _mm256_castsi128_si256(x1), _mm_loadu_si128((__m128i *)(in0 + in_p * 9)),
      0x1);
  const __m256i y2 = _mm256_insertf128_si256(
      _mm256_castsi128_si256(x2), _mm_loadu_si128((__m128i *)(in0 + in_p * 10)),
      0x1);
  const __m256i y3 = _mm256_insertf128_si256(
      _mm256_castsi128_si256(x3), _mm_loadu_si128((__m128i *)(in0 + in_p * 11)),
      0x1);
  const __m256i y4 = _mm256_insertf128_si256(
      _mm256_castsi128_si256(x4), _mm_loadu_si128((__m128i *)(in0 + in_p * 12)),
      0x1);
  const __m256i y5 = _mm256_insertf128_si256(
      _mm256_castsi128_si256(x5), _mm_loadu_si128((__m128i *)(in0 + in_p * 13)),
      0x1);
  const __m256i y6 = _mm256_insertf128_si256(
      _mm256_castsi128_si256(x6), _mm_loadu_si128((__m128i *)(in0 + in_p * 14)),
      0x1);
  const __m256i y7 = _mm256_insertf128_si256(
      _mm256_castsi128_si256(x7), _mm_loadu_si128((__m128i *)(in0 + in_p * 15)),
      0x1);

  const __m256i y_s00 = _mm256_unpacklo_epi8(y0, y1);
  const __m256i y_s01 = _mm256_unpackhi_epi8(y0, y1);
  const __m256i y_s02 = _mm256_unpacklo_epi8(y2, y3);
  const __m256i y_s03 = _mm256_unpackhi_epi8(y2, y3);
  const __m256i y_s04 = _mm256_unpacklo_epi8(y4, y5);
  const __m256i y_s05 = _mm256_unpackhi_epi8(y4, y5);
  const __m256i y_s06 = _mm256_unpacklo_epi8(y6, y7);
  const __m256i y_s07 = _mm256_unpackhi_epi8(y6, y7);

  const __m256i y_s10 = _mm256_unpacklo_epi16(y_s00, y_s02);
  const __m256i y_s11 = _mm256_unpackhi_epi16(y_s00, y_s02);
  const __m256i y_s12 = _mm256_unpacklo_epi16(y_s01, y_s03);
  const __m256i y_s13 = _mm256_unpackhi_epi16(y_s01, y_s03);
  const __m256i y_s14 = _mm256_unpacklo_epi16(y_s04, y_s06);
  const __m256i y_s15 = _mm256_unpackhi_epi16(y_s04, y_s06);
  const __m256i y_s16 = _mm256_unpacklo_epi16(y_s05, y_s07);
  const __m256i y_s17 = _mm256_unpackhi_epi16(y_s05, y_s07);

  const __m256i y_s20 = _mm256_unpacklo_epi32(y_s10, y_s14);
  const __m256i y_s21 = _mm256_unpackhi_epi32(y_s10, y_s14);
  const __m256i y_s22 = _mm256_unpacklo_epi32(y_s11, y_s15);
  const __m256i y_s23 = _mm256_unpackhi_epi32(y_s11, y_s15);
  const __m256i y_s24 = _mm256_unpacklo_epi32(y_s12, y_s16);
  const __m256i y_s25 = _mm256_unpackhi_epi32(y_s12, y_s16);
  const __m256i y_s26 = _mm256_unpacklo_epi32(y_s13, y_s17);
  const __m256i y_s27 = _mm256_unpackhi_epi32(y_s13, y_s17);

  const __m256i row_s01 = _mm256_permute4x64_epi64(y_s20, 0xd8);
  const __m256i row_s23 = _mm256_permute4x64_epi64(y_s21, 0xd8);
  const __m256i row_s45 = _mm256_permute4x64_epi64(y_s22, 0xd8);
  const __m256i row_s67 = _mm256_permute4x64_epi64(y_s23, 0xd8);
  const __m256i row_s89 = _mm256_permute4x64_epi64(y_s24, 0xd8);
  const __m256i row_s1011 = _mm256_permute4x64_epi64(y_s25, 0xd8);
  const __m256i row_s1213 = _mm256_permute4x64_epi64(y_s26, 0xd8);
  const __m256i row_s1415 = _mm256_permute4x64_epi64(y_s27, 0xd8);

  if (is_store_avx2) {
    _mm256_storeu_si256((__m256i *)(out), row_s01);
    _mm256_storeu_si256((__m256i *)(out + (2 * out_p)), row_s23);
    _mm256_storeu_si256((__m256i *)(out + (4 * out_p)), row_s45);
    _mm256_storeu_si256((__m256i *)(out + (6 * out_p)), row_s67);
    _mm256_storeu_si256((__m256i *)(out + (8 * out_p)), row_s89);
    _mm256_storeu_si256((__m256i *)(out + (10 * out_p)), row_s1011);
    _mm256_storeu_si256((__m256i *)(out + (12 * out_p)), row_s1213);
    _mm256_storeu_si256((__m256i *)(out + (14 * out_p)), row_s1415);
  } else {
    _mm_storeu_si128((__m128i *)(out), _mm256_castsi256_si128(row_s01));
    _mm_storeu_si128((__m128i *)(out + (2 * out_p)),
                     _mm256_castsi256_si128(row_s23));
    _mm_storeu_si128((__m128i *)(out + (4 * out_p)),
                     _mm256_castsi256_si128(row_s45));
    _mm_storeu_si128((__m128i *)(out + (6 * out_p)),
                     _mm256_castsi256_si128(row_s67));
    _mm_storeu_si128((__m128i *)(out + (8 * out_p)),
                     _mm256_castsi256_si128(row_s89));
    _mm_storeu_si128((__m128i *)(out + (10 * out_p)),
                     _mm256_castsi256_si128(row_s1011));
    _mm_storeu_si128((__m128i *)(out + (12 * out_p)),
                     _mm256_castsi256_si128(row_s1213));
    _mm_storeu_si128((__m128i *)(out + (14 * out_p)),
                     _mm256_castsi256_si128(row_s1415));
    _mm_storeu_si128((__m128i *)(out + (1 * out_p)),
                     _mm256_extracti128_si256(row_s01, 1));
    _mm_storeu_si128((__m128i *)(out + (3 * out_p)),
                     _mm256_extracti128_si256(row_s23, 1));
    _mm_storeu_si128((__m128i *)(out + (5 * out_p)),
                     _mm256_extracti128_si256(row_s45, 1));
    _mm_storeu_si128((__m128i *)(out + (7 * out_p)),
                     _mm256_extracti128_si256(row_s67, 1));
    _mm_storeu_si128((__m128i *)(out + (9 * out_p)),
                     _mm256_extracti128_si256(row_s89, 1));
    _mm_storeu_si128((__m128i *)(out + (11 * out_p)),
                     _mm256_extracti128_si256(row_s1011, 1));
    _mm_storeu_si128((__m128i *)(out + (13 * out_p)),
                     _mm256_extracti128_si256(row_s1213, 1));
    _mm_storeu_si128((__m128i *)(out + (15 * out_p)),
                     _mm256_extracti128_si256(row_s1415, 1));
  }
}

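// 14 ("13-tap") variant: in addition to the filter/flat masks used above, it
// derives a second mask, flat2, from p4..p6 and q4..q6; where flat2 is set,
// the wide filter rewrites p5..q5.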
void aom_lpf_horizontal_14_quad_avx2(unsigned char *s, int p,
                                     const unsigned char *_blimit0,
                                     const unsigned char *_limit0,
                                     const unsigned char *_thresh0) {
  __m128i mask, flat;
  const __m128i zero = _mm_setzero_si128();
  const __m128i ff = _mm_cmpeq_epi8(zero, zero);

  __m256i p256_3 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 4 * p)));
  __m256i p256_2 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 3 * p)));
  __m256i p256_1 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 2 * p)));
  __m256i p256_0 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 1 * p)));
  __m256i q256_0 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 0 * p)));
  __m256i q256_1 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 1 * p)));
  __m256i q256_2 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 2 * p)));
  __m256i q256_3 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 3 * p)));

  __m128i p3 = _mm256_castsi256_si128(p256_3);
  __m128i p2 = _mm256_castsi256_si128(p256_2);
  __m128i p1 = _mm256_castsi256_si128(p256_1);
  __m128i p0 = _mm256_castsi256_si128(p256_0);
  __m128i q0 = _mm256_castsi256_si128(q256_0);
  __m128i q1 = _mm256_castsi256_si128(q256_1);
  __m128i q2 = _mm256_castsi256_si128(q256_2);
  __m128i q3 = _mm256_castsi256_si128(q256_3);

  {
    const __m128i limit_v =
        _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit0[0]));
    const __m128i blimit_v =
        _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit0[0]));
    const __m128i fe = _mm_set1_epi8((int8_t)0xfe);
    const __m128i abs_p1p0 =
        _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1));
    const __m128i abs_q1q0 =
        _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1));
    __m128i abs_p0q0 =
        _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0));
    __m128i abs_p1q1 =
        _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1));

    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);

    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v);
    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
    mask = _mm_max_epu8(flat, mask);
    // mask |= (abs(p1 - p0) > limit) * -1;
    // mask |= (abs(q1 - q0) > limit) * -1;
    __m128i work = _mm_max_epu8(
        _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)),
        _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3)));
    mask = _mm_max_epu8(work, mask);
    work = _mm_max_epu8(
        _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)),
        _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3)));
    mask = _mm_max_epu8(work, mask);
    mask = _mm_subs_epu8(mask, limit_v);
    mask = _mm_cmpeq_epi8(mask, zero);
  }

  if (0xffff == _mm_movemask_epi8(_mm_cmpeq_epi8(mask, zero))) return;

  // loop filter
  {
    const __m128i thresh_v =
        _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh0[0]));
    const __m128i one = _mm_set1_epi8(1);
    const __m128i t3 = _mm_set1_epi8(3);
    const __m128i t4 = _mm_add_epi8(one, t3);
    const __m128i t80 = _mm_set1_epi8((int8_t)0x80);
    const __m128i te0 = _mm_set1_epi8((int8_t)0xe0);
    const __m128i t1f = _mm_set1_epi8(0x1f);
    const __m128i t7f = _mm_sub_epi8(t80, one);

    __m128i hev = _mm_subs_epu8(flat, thresh_v);
    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);

    __m128i ps1 = _mm_xor_si128(p1, t80);
    __m128i ps0 = _mm_xor_si128(p0, t80);
    __m128i qs0 = _mm_xor_si128(q0, t80);
    __m128i qs1 = _mm_xor_si128(q1, t80);

    __m128i filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
    __m128i work_a = _mm_subs_epi8(qs0, ps0);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_and_si128(filt, mask);

    __m128i filter1 = _mm_adds_epi8(filt, t4);
    __m128i filter2 = _mm_adds_epi8(filt, t3);

    work_a = _mm_cmpgt_epi8(zero, filter1);
    filter1 = _mm_srli_epi16(filter1, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter1 = _mm_and_si128(filter1, t1f);
    filter1 = _mm_or_si128(filter1, work_a);
    qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);

    work_a = _mm_cmpgt_epi8(zero, filter2);
    filter2 = _mm_srli_epi16(filter2, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter2 = _mm_and_si128(filter2, t1f);
    filter2 = _mm_or_si128(filter2, work_a);
    ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);

    filt = _mm_adds_epi8(filter1, one);
    work_a = _mm_cmpgt_epi8(zero, filt);
    filt = _mm_srli_epi16(filt, 1);
    work_a = _mm_and_si128(work_a, t80);
    filt = _mm_and_si128(filt, t7f);
    filt = _mm_or_si128(filt, work_a);
    filt = _mm_andnot_si128(hev, filt);
    ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
    qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);

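    // From here on, p/q rows are packed as 256-bit pairs (p in the low lane,
    // q in the high lane) so the flat/flat2 masks and filters handle both
    // sides at once.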
    // Derive flat
    __m256i p0q0256 = _mm256_blend_epi32(p256_0, q256_0, 0xf0);
    __m256i p2q2256 = _mm256_blend_epi32(p256_2, q256_2, 0xf0);
    __m256i p3q3256 = _mm256_blend_epi32(p256_3, q256_3, 0xf0);
    const __m256i ps0qs0256 =
        _mm256_insertf128_si256(_mm256_castsi128_si256(ps0), qs0, 0x1);
    const __m256i ps1qs1256 =
        _mm256_insertf128_si256(_mm256_castsi128_si256(ps1), qs1, 0x1);
    const __m256i work01 = _mm256_or_si256(_mm256_subs_epu8(p2q2256, p0q0256),
                                           _mm256_subs_epu8(p0q0256, p2q2256));
    const __m256i work02 = _mm256_or_si256(_mm256_subs_epu8(p3q3256, p0q0256),
                                           _mm256_subs_epu8(p0q0256, p3q3256));
    const __m256i max0_256 = _mm256_max_epu8(work01, work02);
    const __m128i max1_256 =
        _mm_max_epu8(_mm256_castsi256_si128(max0_256),
                     _mm256_extractf128_si256(max0_256, 1));
    flat = _mm_max_epu8(max1_256, flat);
    flat = _mm_subs_epu8(flat, one);
    flat = _mm_cmpeq_epi8(flat, zero);
    flat = _mm_and_si128(flat, mask);

    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    // flat and wide flat calculations
    if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
      const __m256i flat256 =
          _mm256_insertf128_si256(_mm256_castsi128_si256(flat), flat, 0x1);
      const __m256i eight = _mm256_set1_epi16(8);
      const __m256i four = _mm256_set1_epi16(4);

      __m256i p256_4 = _mm256_castpd_si256(
          _mm256_broadcast_pd((__m128d const *)(s - 5 * p)));
      __m256i q256_4 = _mm256_castpd_si256(
          _mm256_broadcast_pd((__m128d const *)(s + 4 * p)));
      __m256i p256_5 = _mm256_castpd_si256(
          _mm256_broadcast_pd((__m128d const *)(s - 6 * p)));
      __m256i q256_5 = _mm256_castpd_si256(
          _mm256_broadcast_pd((__m128d const *)(s + 5 * p)));
      __m256i p256_6 = _mm256_castpd_si256(
          _mm256_broadcast_pd((__m128d const *)(s - 7 * p)));
      __m256i q256_6 = _mm256_castpd_si256(
          _mm256_broadcast_pd((__m128d const *)(s + 6 * p)));

      // Derive flat2
      __m256i p4q4256 = _mm256_blend_epi32(p256_4, q256_4, 0xf0);
      __m256i p5q5256 = _mm256_blend_epi32(p256_5, q256_5, 0xf0);
      const __m256i p6q6256 = _mm256_blend_epi32(p256_6, q256_6, 0xf0);
      const __m256i work1 = _mm256_or_si256(_mm256_subs_epu8(p4q4256, p0q0256),
                                            _mm256_subs_epu8(p0q0256, p4q4256));
      const __m256i work2 = _mm256_or_si256(_mm256_subs_epu8(p5q5256, p0q0256),
                                            _mm256_subs_epu8(p0q0256, p5q5256));
      const __m256i work3 = _mm256_or_si256(_mm256_subs_epu8(p6q6256, p0q0256),
                                            _mm256_subs_epu8(p0q0256, p6q6256));
      __m256i flat2_256 = _mm256_max_epu8(work1, work2);
      flat2_256 = _mm256_max_epu8(flat2_256, work3);
      __m128i flat2 = _mm_max_epu8(_mm256_castsi256_si128(flat2_256),
                                   _mm256_extractf128_si256(flat2_256, 1));
      flat2 = _mm_subs_epu8(flat2, one);
      flat2 = _mm_cmpeq_epi8(flat2, zero);
      flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask

      const __m256i filter =
          _mm256_load_si256((__m256i const *)filt_loopfilter_avx2);

      p256_3 = _mm256_shuffle_epi8(p256_3, filter);
      p256_2 = _mm256_shuffle_epi8(p256_2, filter);
      p256_1 = _mm256_shuffle_epi8(p256_1, filter);
      p256_0 = _mm256_shuffle_epi8(p256_0, filter);
      q256_0 = _mm256_shuffle_epi8(q256_0, filter);
      q256_1 = _mm256_shuffle_epi8(q256_1, filter);
      q256_2 = _mm256_shuffle_epi8(q256_2, filter);
      q256_3 = _mm256_shuffle_epi8(q256_3, filter);

      const __m256i p2p1p0 =
          _mm256_add_epi16(p256_0, _mm256_add_epi16(p256_2, p256_1));
      const __m256i q2q1q0 =
          _mm256_add_epi16(q256_0, _mm256_add_epi16(q256_2, q256_1));

      __m256i pixetFilter_p2p1p0 =
          _mm256_add_epi16(four, _mm256_add_epi16(p2p1p0, q2q1q0));
      __m256i pixetFilter_q2q1q0 = pixetFilter_p2p1p0;

      // Derive p0 and q0
      pixetFilter_p2p1p0 = _mm256_add_epi16(pixetFilter_p2p1p0, p256_3);
      __m256i res_p =
          _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, p256_0), 3);
      pixetFilter_q2q1q0 = _mm256_add_epi16(pixetFilter_q2q1q0, q256_3);
      __m256i res_q =
          _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0, q256_0), 3);
      __m256i flat_p0q0 =
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_q), 0xd8);
      p0q0256 = _mm256_andnot_si256(flat256, ps0qs0256);
      flat_p0q0 = _mm256_and_si256(flat256, flat_p0q0);
      p0q0256 = _mm256_or_si256(flat_p0q0, p0q0256);
      p0 = _mm256_castsi256_si128(p0q0256);
      q0 = _mm256_extractf128_si256(p0q0256, 1);

      // Derive p1 and q1
      __m256i sum_p = _mm256_sub_epi16(p256_3, q256_2);
      pixetFilter_p2p1p0 = _mm256_add_epi16(pixetFilter_p2p1p0, sum_p);
      res_p =
          _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, p256_1), 3);
      __m256i sum_q = _mm256_sub_epi16(q256_3, p256_2);
      pixetFilter_q2q1q0 = _mm256_add_epi16(pixetFilter_q2q1q0, sum_q);
      res_q =
          _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0, q256_1), 3);
      __m256i flat_p1q1 =
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_q), 0xd8);
      __m256i p1q1256 = _mm256_andnot_si256(flat256, ps1qs1256);
      flat_p1q1 = _mm256_and_si256(flat256, flat_p1q1);
      p1q1256 = _mm256_or_si256(flat_p1q1, p1q1256);
      p1 = _mm256_castsi256_si128(p1q1256);
      q1 = _mm256_extractf128_si256(p1q1256, 1);

      // Derive p2 and q2
      sum_p = _mm256_sub_epi16(p256_3, q256_1);
      pixetFilter_p2p1p0 = _mm256_add_epi16(pixetFilter_p2p1p0, sum_p);
      res_p =
          _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, p256_2), 3);
      sum_q = _mm256_sub_epi16(q256_3, p256_1);
      pixetFilter_q2q1q0 = _mm256_add_epi16(pixetFilter_q2q1q0, sum_q);
      res_q =
          _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0, q256_2), 3);
      __m256i flat_p2q2 =
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_q), 0xd8);
      p2q2256 = _mm256_andnot_si256(flat256, p2q2256);
      flat_p2q2 = _mm256_and_si256(flat256, flat_p2q2);
      p2q2256 = _mm256_or_si256(flat_p2q2, p2q2256);
      p2 = _mm256_castsi256_si128(p2q2256);
      q2 = _mm256_extractf128_si256(p2q2256, 1);
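      // Wide path: taken only when flat2 is set for at least one column; the
      // outer rows p4..p6/q4..q6 are widened with the same shuffle and the
      // 13-tap filter output replaces p5..q5 where flat2 & flat & mask holds.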
      if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat2, zero))) {
        flat2_256 =
            _mm256_insertf128_si256(_mm256_castsi128_si256(flat2), flat2, 0x1);
        p256_6 = _mm256_shuffle_epi8(p256_6, filter);
        p256_5 = _mm256_shuffle_epi8(p256_5, filter);
        p256_4 = _mm256_shuffle_epi8(p256_4, filter);
        q256_4 = _mm256_shuffle_epi8(q256_4, filter);
        q256_5 = _mm256_shuffle_epi8(q256_5, filter);
        q256_6 = _mm256_shuffle_epi8(q256_6, filter);

        __m256i pixelFilter_p =
            _mm256_add_epi16(p256_5, _mm256_add_epi16(p256_4, p256_3));
        __m256i pixelFilter_q =
            _mm256_add_epi16(q256_5, _mm256_add_epi16(q256_4, q256_3));

        pixelFilter_p = _mm256_add_epi16(pixelFilter_p, p2p1p0);
        pixelFilter_q = _mm256_add_epi16(pixelFilter_q, q2q1q0);

        pixelFilter_p = _mm256_add_epi16(pixelFilter_p, p256_0);
        pixelFilter_q = _mm256_add_epi16(pixelFilter_q, q256_0);
        pixelFilter_p = _mm256_add_epi16(
            eight, _mm256_add_epi16(pixelFilter_p, pixelFilter_q));
        pixelFilter_q = pixelFilter_p;

        // Derive p0 and q0
        pixelFilter_p =
            _mm256_add_epi16(_mm256_add_epi16(p256_6, p256_1), pixelFilter_p);
        res_p = _mm256_srli_epi16(pixelFilter_p, 4);
        pixelFilter_q =
            _mm256_add_epi16(_mm256_add_epi16(q256_6, q256_1), pixelFilter_q);
        res_q = _mm256_srli_epi16(pixelFilter_q, 4);
        __m256i flat2_p0q0 =
            _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_q), 0xd8);
        p0q0256 = _mm256_andnot_si256(flat2_256, p0q0256);
        flat2_p0q0 = _mm256_and_si256(flat2_256, flat2_p0q0);
        p0q0256 = _mm256_or_si256(flat2_p0q0, p0q0256);

        p0 = _mm256_castsi256_si128(p0q0256);
        q0 = _mm256_extractf128_si256(p0q0256, 1);
        _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
        _mm_storeu_si128((__m128i *)(s - 0 * p), q0);

        // Derive p1 and q1
        sum_p = _mm256_add_epi16(_mm256_sub_epi16(p256_6, q256_5),
                                 _mm256_sub_epi16(p256_2, q256_0));
        pixelFilter_p = _mm256_add_epi16(pixelFilter_p, sum_p);
        res_p = _mm256_srli_epi16(pixelFilter_p, 4);
        sum_q = _mm256_add_epi16(_mm256_sub_epi16(q256_6, p256_5),
                                 _mm256_sub_epi16(q256_2, p256_0));
        pixelFilter_q = _mm256_add_epi16(pixelFilter_q, sum_q);
        res_q = _mm256_srli_epi16(pixelFilter_q, 4);
        __m256i flat2_p1q1 =
            _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_q), 0xd8);
        p1q1256 = _mm256_andnot_si256(flat2_256, p1q1256);
        flat2_p1q1 = _mm256_and_si256(flat2_256, flat2_p1q1);
        p1q1256 = _mm256_or_si256(flat2_p1q1, p1q1256);
        p1 = _mm256_castsi256_si128(p1q1256);
        q1 = _mm256_extractf128_si256(p1q1256, 1);
        _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
        _mm_storeu_si128((__m128i *)(s + 1 * p), q1);

        // Derive p2 and q2
        sum_p = _mm256_add_epi16(_mm256_sub_epi16(p256_6, q256_4),
                                 _mm256_sub_epi16(p256_3, p256_0));
        pixelFilter_p = _mm256_add_epi16(pixelFilter_p, sum_p);
        res_p = _mm256_srli_epi16(pixelFilter_p, 4);
        sum_q = _mm256_add_epi16(_mm256_sub_epi16(q256_6, p256_4),
                                 _mm256_sub_epi16(q256_3, q256_0));
        pixelFilter_q = _mm256_add_epi16(pixelFilter_q, sum_q);
        res_q = _mm256_srli_epi16(pixelFilter_q, 4);
        __m256i flat2_p2q2 =
            _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_q), 0xd8);
        p2q2256 = _mm256_andnot_si256(flat2_256, p2q2256);
        flat2_p2q2 = _mm256_and_si256(flat2_256, flat2_p2q2);
        p2q2256 = _mm256_or_si256(flat2_p2q2, p2q2256);
        p2 = _mm256_castsi256_si128(p2q2256);
        q2 = _mm256_extractf128_si256(p2q2256, 1);
        _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
        _mm_storeu_si128((__m128i *)(s + 2 * p), q2);

        // Derive p3 and q3
        sum_p = _mm256_add_epi16(_mm256_sub_epi16(p256_6, q256_3),
                                 _mm256_sub_epi16(p256_4, p256_1));
        pixelFilter_p = _mm256_add_epi16(pixelFilter_p, sum_p);
        res_p = _mm256_srli_epi16(pixelFilter_p, 4);
        sum_q = _mm256_add_epi16(_mm256_sub_epi16(q256_6, p256_3),
                                 _mm256_sub_epi16(q256_4, q256_1));
        pixelFilter_q = _mm256_add_epi16(pixelFilter_q, sum_q);
        res_q = _mm256_srli_epi16(pixelFilter_q, 4);
        __m256i flat2_p3q3 =
            _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_q), 0xd8);
        p3q3256 = _mm256_andnot_si256(flat2_256, p3q3256);
        flat2_p3q3 = _mm256_and_si256(flat2_256, flat2_p3q3);
        p3q3256 = _mm256_or_si256(flat2_p3q3, p3q3256);
        p3 = _mm256_castsi256_si128(p3q3256);
        q3 = _mm256_extractf128_si256(p3q3256, 1);
        _mm_storeu_si128((__m128i *)(s - 4 * p), p3);
        _mm_storeu_si128((__m128i *)(s + 3 * p), q3);

        // Derive p4 and q4
        sum_p = _mm256_add_epi16(_mm256_sub_epi16(p256_6, q256_2),
                                 _mm256_sub_epi16(p256_5, p256_2));
        pixelFilter_p = _mm256_add_epi16(pixelFilter_p, sum_p);
        res_p = _mm256_srli_epi16(pixelFilter_p, 4);
        sum_q = _mm256_add_epi16(_mm256_sub_epi16(q256_6, p256_2),
                                 _mm256_sub_epi16(q256_5, q256_2));
        pixelFilter_q = _mm256_add_epi16(pixelFilter_q, sum_q);
        res_q = _mm256_srli_epi16(pixelFilter_q, 4);
        __m256i flat2_p4q4 =
            _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_q), 0xd8);
        p4q4256 = _mm256_andnot_si256(flat2_256, p4q4256);
        flat2_p4q4 = _mm256_and_si256(flat2_256, flat2_p4q4);
        p4q4256 = _mm256_or_si256(flat2_p4q4, p4q4256);
        _mm_storeu_si128((__m128i *)(s - 5 * p),
                         _mm256_castsi256_si128(p4q4256));
        _mm_storeu_si128((__m128i *)(s + 4 * p),
                         _mm256_extractf128_si256(p4q4256, 1));

        // Derive p5 and q5
        sum_p = _mm256_add_epi16(_mm256_sub_epi16(p256_6, q256_1),
                                 _mm256_sub_epi16(p256_6, p256_3));
        pixelFilter_p = _mm256_add_epi16(pixelFilter_p, sum_p);
        res_p = _mm256_srli_epi16(pixelFilter_p, 4);
        sum_q = _mm256_add_epi16(_mm256_sub_epi16(q256_6, p256_1),
                                 _mm256_sub_epi16(q256_6, q256_3));
        pixelFilter_q = _mm256_add_epi16(pixelFilter_q, sum_q);
        res_q = _mm256_srli_epi16(pixelFilter_q, 4);
        __m256i flat2_p5q5 =
            _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_q), 0xd8);
        p5q5256 = _mm256_andnot_si256(flat2_256, p5q5256);
        flat2_p5q5 = _mm256_and_si256(flat2_256, flat2_p5q5);
        p5q5256 = _mm256_or_si256(flat2_p5q5, p5q5256);
        _mm_storeu_si128((__m128i *)(s - 6 * p),
                         _mm256_castsi256_si128(p5q5256));
        _mm_storeu_si128((__m128i *)(s + 5 * p),
                         _mm256_extractf128_si256(p5q5256, 1));
      } else {
        _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
        _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
        _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
        _mm_storeu_si128((__m128i *)(s - 0 * p), q0);
        _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
        _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
      }
    } else {
      _mm_storeu_si128((__m128i *)(s - 2 * p), ps1);
      _mm_storeu_si128((__m128i *)(s - 1 * p), ps0);
      _mm_storeu_si128((__m128i *)(s - 0 * p), qs0);
      _mm_storeu_si128((__m128i *)(s + 1 * p), qs1);
    }
  }
}

void aom_lpf_vertical_14_quad_avx2(unsigned char *s, int pitch,
                                   const uint8_t *_blimit0,
                                   const uint8_t *_limit0,
                                   const uint8_t *_thresh0) {
  DECLARE_ALIGNED(16, unsigned char, t_dst[256]);

  // Transpose 16x16
  trans_store_16x16_lpf_vert14(s - 8, pitch, t_dst, 16, 1);

  // Loop filtering
  aom_lpf_horizontal_14_quad_avx2(t_dst + 8 * 16, 16, _blimit0, _limit0,
                                  _thresh0);

  // Transpose back
  trans_store_16x16_lpf_vert14(t_dst, 16, s - 8, pitch, 0);
}