/*
 * Copyright (c) 2022, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <immintrin.h> /* AVX2 */

#include "config/aom_dsp_rtcd.h"

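// Shuffle control for _mm256_shuffle_epi8: control bytes with the MSB set
// (128) produce zero, so this mask zero-extends the 16 uint8 pixels that were
// broadcast into both 128-bit lanes into 16 uint16 lanes for the 16-bit
// filter arithmetic below.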
DECLARE_ALIGNED(32, static const uint8_t, filt_loopfilter_avx2[32]) = {
  0, 128, 1, 128, 2,  128, 3,  128, 4,  128, 5,  128, 6,  128, 7,  128,
  8, 128, 9, 128, 10, 128, 11, 128, 12, 128, 13, 128, 14, 128, 15, 128
};

void aom_lpf_horizontal_6_quad_avx2(unsigned char *s, int p,
                                    const unsigned char *_blimit0,
                                    const unsigned char *_limit0,
                                    const unsigned char *_thresh0) {
  __m256i p256_2, q256_2, p256_1, q256_1, p256_0, q256_0;
  __m128i p2, p1, p0, q0, q1, q2;
  __m128i mask, flat;

  const __m128i thresh_v =
      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh0[0]));
  const __m128i limit_v =
      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit0[0]));
  const __m128i blimit_v =
      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit0[0]));
  const __m128i zero = _mm_setzero_si128();
  const __m128i ff = _mm_cmpeq_epi8(zero, zero);

  p256_2 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 3 * p)));
  p256_1 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 2 * p)));
  p256_0 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 1 * p)));
  q256_0 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 0 * p)));
  q256_1 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 1 * p)));
  q256_2 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 2 * p)));

  p2 = _mm256_castsi256_si128(p256_2);
  p1 = _mm256_castsi256_si128(p256_1);
  p0 = _mm256_castsi256_si128(p256_0);
  q0 = _mm256_castsi256_si128(q256_0);
  q1 = _mm256_castsi256_si128(q256_1);
  q2 = _mm256_castsi256_si128(q256_2);

  {
    __m128i work;
    const __m128i fe = _mm_set1_epi8((int8_t)0xfe);
    const __m128i abs_p1p0 =
        _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1));
    const __m128i abs_q1q0 =
        _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1));
    __m128i abs_p0q0 =
        _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0));
    __m128i abs_p1q1 =
        _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1));

    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);

    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v);
    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
    mask = _mm_max_epu8(flat, mask);
    // mask |= (abs(p1 - p0) > limit) * -1;
    // mask |= (abs(q1 - q0) > limit) * -1;
    work = _mm_max_epu8(
        _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)),
        _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)));
    mask = _mm_max_epu8(work, mask);
    mask = _mm_subs_epu8(mask, limit_v);
    mask = _mm_cmpeq_epi8(mask, zero);
  }

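  // mask is 0xff in every column that passed the edge thresholds; if no
  // column did, there is nothing to filter.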
  if (0xffff == _mm_movemask_epi8(_mm_cmpeq_epi8(mask, zero))) return;

  // loop filter
  {
    const __m128i t4 = _mm_set1_epi8(4);
    const __m128i t3 = _mm_set1_epi8(3);
    const __m128i t80 = _mm_set1_epi8((int8_t)0x80);
    const __m128i te0 = _mm_set1_epi8((int8_t)0xe0);
    const __m128i t1f = _mm_set1_epi8(0x1f);
    const __m128i t1 = _mm_set1_epi8(0x1);
    const __m128i t7f = _mm_set1_epi8(0x7f);
    const __m128i one = _mm_set1_epi8(1);
    __m128i hev;

    hev = _mm_subs_epu8(flat, thresh_v);
    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);

    __m128i ps1 = _mm_xor_si128(p1, t80);
    __m128i ps0 = _mm_xor_si128(p0, t80);
    __m128i qs0 = _mm_xor_si128(q0, t80);
    __m128i qs1 = _mm_xor_si128(q1, t80);
    __m128i filt;
    __m128i work_a;
    __m128i filter1, filter2;
    __m128i flat_p1, flat_p0, flat_q0, flat_q1;

    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
    work_a = _mm_subs_epi8(qs0, ps0);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_and_si128(filt, mask);

    filter1 = _mm_adds_epi8(filt, t4);
    filter2 = _mm_adds_epi8(filt, t3);

    work_a = _mm_cmpgt_epi8(zero, filter1);
    filter1 = _mm_srli_epi16(filter1, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter1 = _mm_and_si128(filter1, t1f);
    filter1 = _mm_or_si128(filter1, work_a);
    qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);

    work_a = _mm_cmpgt_epi8(zero, filter2);
    filter2 = _mm_srli_epi16(filter2, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter2 = _mm_and_si128(filter2, t1f);
    filter2 = _mm_or_si128(filter2, work_a);
    ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);

    filt = _mm_adds_epi8(filter1, t1);
    work_a = _mm_cmpgt_epi8(zero, filt);
    filt = _mm_srli_epi16(filt, 1);
    work_a = _mm_and_si128(work_a, t80);
    filt = _mm_and_si128(filt, t7f);
    filt = _mm_or_si128(filt, work_a);
    filt = _mm_andnot_si128(hev, filt);
    ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
    qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);

    __m128i work;
    work = _mm_max_epu8(
        _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)),
        _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2)));
    flat = _mm_max_epu8(work, flat);
    flat = _mm_subs_epu8(flat, one);
    flat = _mm_cmpeq_epi8(flat, zero);
    flat = _mm_and_si128(flat, mask);

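    // If at least one column is flat, recompute p1..q1 with the 6-pixel
    // smoothing filter (p2..q2) in 16-bit precision; otherwise only the
    // narrow filter results (ps1, ps0, qs0, qs1) are stored.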
    if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
      const __m256i four = _mm256_set1_epi16(4);
      __m256i pixetFilter, add, res;

      const __m256i filter =
          _mm256_load_si256((__m256i const *)filt_loopfilter_avx2);

      p256_2 = _mm256_shuffle_epi8(p256_2, filter);
      p256_1 = _mm256_shuffle_epi8(p256_1, filter);
      p256_0 = _mm256_shuffle_epi8(p256_0, filter);
      q256_0 = _mm256_shuffle_epi8(q256_0, filter);
      q256_1 = _mm256_shuffle_epi8(q256_1, filter);
      q256_2 = _mm256_shuffle_epi8(q256_2, filter);

      pixetFilter = _mm256_slli_epi16(
          _mm256_add_epi16(p256_2, _mm256_add_epi16(p256_1, p256_0)), 1);
      pixetFilter =
          _mm256_add_epi16(pixetFilter, _mm256_add_epi16(p256_2, q256_0));
      pixetFilter = _mm256_add_epi16(four, pixetFilter);
      res = _mm256_srli_epi16(pixetFilter, 3);
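      // _mm256_packus_epi16(res, res) narrows each 128-bit lane back to
      // bytes; permute4x64 with 168 (0xa8) then gathers lane 0's and lane 1's
      // results into the low 128 bits, giving all 16 filtered pixels.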
      flat_p1 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res, res), 168));
      p1 = _mm_andnot_si128(flat, ps1);
      flat_p1 = _mm_and_si128(flat, flat_p1);
      p1 = _mm_or_si128(flat_p1, p1);

      add = _mm256_add_epi16(_mm256_sub_epi16(q256_1, p256_2),
                             _mm256_sub_epi16(q256_0, p256_2));
      pixetFilter = _mm256_add_epi16(pixetFilter, add);
      res = _mm256_srli_epi16(pixetFilter, 3);
      flat_p0 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res, res), 168));
      p0 = _mm_andnot_si128(flat, ps0);
      flat_p0 = _mm_and_si128(flat, flat_p0);
      p0 = _mm_or_si128(flat_p0, p0);

      add = _mm256_add_epi16(_mm256_sub_epi16(q256_2, p256_2),
                             _mm256_sub_epi16(q256_1, p256_1));
      pixetFilter = _mm256_add_epi16(pixetFilter, add);
      res = _mm256_srli_epi16(pixetFilter, 3);
      flat_q0 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res, res), 168));
      q0 = _mm_andnot_si128(flat, qs0);
      flat_q0 = _mm_and_si128(flat, flat_q0);
      q0 = _mm_or_si128(flat_q0, q0);

      add = _mm256_add_epi16(_mm256_sub_epi16(q256_2, p256_1),
                             _mm256_sub_epi16(q256_2, p256_0));
      pixetFilter = _mm256_add_epi16(pixetFilter, add);
      res = _mm256_srli_epi16(pixetFilter, 3);
      flat_q1 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res, res), 168));
      q1 = _mm_andnot_si128(flat, qs1);
      flat_q1 = _mm_and_si128(flat, flat_q1);
      q1 = _mm_or_si128(flat_q1, q1);

      _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
      _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
      _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
      _mm_storeu_si128((__m128i *)(s - 0 * p), q0);
      _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
      _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
    } else {
      _mm_storeu_si128((__m128i *)(s - 2 * p), ps1);
      _mm_storeu_si128((__m128i *)(s - 1 * p), ps0);
      _mm_storeu_si128((__m128i *)(s - 0 * p), qs0);
      _mm_storeu_si128((__m128i *)(s + 1 * p), qs1);
    }
  }
}

void aom_lpf_horizontal_8_quad_avx2(unsigned char *s, int p,
                                    const unsigned char *_blimit0,
                                    const unsigned char *_limit0,
                                    const unsigned char *_thresh0) {
  __m256i p256_3, q256_3, p256_2, q256_2, p256_1, q256_1, p256_0, q256_0;
  __m128i p3, p2, p1, p0, q0, q1, q2, q3;
  __m128i mask, flat;

  const __m128i thresh_v =
      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh0[0]));
  const __m128i limit_v =
      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit0[0]));
  const __m128i blimit_v =
      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit0[0]));
  const __m128i zero = _mm_setzero_si128();
  const __m128i ff = _mm_cmpeq_epi8(zero, zero);

  p256_3 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 4 * p)));
  p256_2 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 3 * p)));
  p256_1 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 2 * p)));
  p256_0 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 1 * p)));
  q256_0 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 0 * p)));
  q256_1 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 1 * p)));
  q256_2 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 2 * p)));
  q256_3 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 3 * p)));

  p3 = _mm256_castsi256_si128(p256_3);
  p2 = _mm256_castsi256_si128(p256_2);
  p1 = _mm256_castsi256_si128(p256_1);
  p0 = _mm256_castsi256_si128(p256_0);
  q0 = _mm256_castsi256_si128(q256_0);
  q1 = _mm256_castsi256_si128(q256_1);
  q2 = _mm256_castsi256_si128(q256_2);
  q3 = _mm256_castsi256_si128(q256_3);

  {
    __m128i work;
    const __m128i fe = _mm_set1_epi8((int8_t)0xfe);
    const __m128i abs_p1p0 =
        _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1));
    const __m128i abs_q1q0 =
        _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1));
    __m128i abs_p0q0 =
        _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0));
    __m128i abs_p1q1 =
        _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1));

    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);

    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v);
    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
    mask = _mm_max_epu8(flat, mask);
    // mask |= (abs(p1 - p0) > limit) * -1;
    // mask |= (abs(q1 - q0) > limit) * -1;
    work = _mm_max_epu8(
        _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)),
        _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3)));
    mask = _mm_max_epu8(work, mask);
    work = _mm_max_epu8(
        _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)),
        _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3)));
    mask = _mm_max_epu8(work, mask);
    mask = _mm_subs_epu8(mask, limit_v);
    mask = _mm_cmpeq_epi8(mask, zero);
  }

  if (0xffff == _mm_movemask_epi8(_mm_cmpeq_epi8(mask, zero))) return;

  // loop filter
  {
    const __m128i t4 = _mm_set1_epi8(4);
    const __m128i t3 = _mm_set1_epi8(3);
    const __m128i t80 = _mm_set1_epi8((int8_t)0x80);
    const __m128i te0 = _mm_set1_epi8((int8_t)0xe0);
    const __m128i t1f = _mm_set1_epi8(0x1f);
    const __m128i t1 = _mm_set1_epi8(0x1);
    const __m128i t7f = _mm_set1_epi8(0x7f);
    const __m128i one = _mm_set1_epi8(1);
    __m128i hev;

    hev = _mm_subs_epu8(flat, thresh_v);
    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);

    __m128i ps1 = _mm_xor_si128(p1, t80);
    __m128i ps0 = _mm_xor_si128(p0, t80);
    __m128i qs0 = _mm_xor_si128(q0, t80);
    __m128i qs1 = _mm_xor_si128(q1, t80);
    __m128i filt;
    __m128i work_a;
    __m128i filter1, filter2;
    __m128i flat_p2, flat_p1, flat_p0, flat_q0, flat_q1, flat_q2;

    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
    work_a = _mm_subs_epi8(qs0, ps0);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_and_si128(filt, mask);

    filter1 = _mm_adds_epi8(filt, t4);
    filter2 = _mm_adds_epi8(filt, t3);

    work_a = _mm_cmpgt_epi8(zero, filter1);
    filter1 = _mm_srli_epi16(filter1, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter1 = _mm_and_si128(filter1, t1f);
    filter1 = _mm_or_si128(filter1, work_a);
    qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);

    work_a = _mm_cmpgt_epi8(zero, filter2);
    filter2 = _mm_srli_epi16(filter2, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter2 = _mm_and_si128(filter2, t1f);
    filter2 = _mm_or_si128(filter2, work_a);
    ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);

    filt = _mm_adds_epi8(filter1, t1);
    work_a = _mm_cmpgt_epi8(zero, filt);
    filt = _mm_srli_epi16(filt, 1);
    work_a = _mm_and_si128(work_a, t80);
    filt = _mm_and_si128(filt, t7f);
    filt = _mm_or_si128(filt, work_a);
    filt = _mm_andnot_si128(hev, filt);
    ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
    qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);

    __m128i work;
    work = _mm_max_epu8(
        _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)),
        _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2)));
    flat = _mm_max_epu8(work, flat);
    work = _mm_max_epu8(
        _mm_or_si128(_mm_subs_epu8(p3, p0), _mm_subs_epu8(p0, p3)),
        _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3)));
    flat = _mm_max_epu8(work, flat);
    flat = _mm_subs_epu8(flat, one);
    flat = _mm_cmpeq_epi8(flat, zero);
    flat = _mm_and_si128(flat, mask);

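    // If at least one column is flat, recompute p2..q2 with the wider
    // smoothing filter over p3..q3 in 16-bit precision; otherwise only the
    // narrow filter results (ps1, ps0, qs0, qs1) are stored.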
    if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
      const __m256i four = _mm256_set1_epi16(4);
      __m256i pixetFilter_p2p1p0, p2p1p0, q2q1q0, pixetFilter_q2q1q0, sum_p,
          sum_q, res_p, res_q;

      const __m256i filter =
          _mm256_load_si256((__m256i const *)filt_loopfilter_avx2);

      p256_3 = _mm256_shuffle_epi8(p256_3, filter);
      p256_2 = _mm256_shuffle_epi8(p256_2, filter);
      p256_1 = _mm256_shuffle_epi8(p256_1, filter);
      p256_0 = _mm256_shuffle_epi8(p256_0, filter);
      q256_0 = _mm256_shuffle_epi8(q256_0, filter);
      q256_1 = _mm256_shuffle_epi8(q256_1, filter);
      q256_2 = _mm256_shuffle_epi8(q256_2, filter);
      q256_3 = _mm256_shuffle_epi8(q256_3, filter);

      p2p1p0 = _mm256_add_epi16(p256_0, _mm256_add_epi16(p256_2, p256_1));
      q2q1q0 = _mm256_add_epi16(q256_0, _mm256_add_epi16(q256_2, q256_1));

      pixetFilter_p2p1p0 =
          _mm256_add_epi16(four, _mm256_add_epi16(p2p1p0, q2q1q0));
      pixetFilter_q2q1q0 = pixetFilter_p2p1p0;

      pixetFilter_p2p1p0 = _mm256_add_epi16(pixetFilter_p2p1p0, p256_3);
      res_p =
          _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, p256_0), 3);
      flat_p0 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
      p0 = _mm_andnot_si128(flat, ps0);
      flat_p0 = _mm_and_si128(flat, flat_p0);
      p0 = _mm_or_si128(flat_p0, p0);

      pixetFilter_q2q1q0 = _mm256_add_epi16(pixetFilter_q2q1q0, q256_3);
      res_q =
          _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0, q256_0), 3);
      flat_q0 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
      q0 = _mm_andnot_si128(flat, qs0);
      flat_q0 = _mm_and_si128(flat, flat_q0);
      q0 = _mm_or_si128(flat_q0, q0);

      sum_p = _mm256_sub_epi16(p256_3, q256_2);
      pixetFilter_p2p1p0 = _mm256_add_epi16(pixetFilter_p2p1p0, sum_p);
      res_p =
          _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, p256_1), 3);
      flat_p1 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
      p1 = _mm_andnot_si128(flat, ps1);
      flat_p1 = _mm_and_si128(flat, flat_p1);
      p1 = _mm_or_si128(flat_p1, p1);

      sum_q = _mm256_sub_epi16(q256_3, p256_2);
      pixetFilter_q2q1q0 = _mm256_add_epi16(pixetFilter_q2q1q0, sum_q);
      res_q =
          _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0, q256_1), 3);
      flat_q1 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
      q1 = _mm_andnot_si128(flat, qs1);
      flat_q1 = _mm_and_si128(flat, flat_q1);
      q1 = _mm_or_si128(flat_q1, q1);

      sum_p = _mm256_sub_epi16(p256_3, q256_1);
      pixetFilter_p2p1p0 = _mm256_add_epi16(pixetFilter_p2p1p0, sum_p);
      res_p =
          _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, p256_2), 3);
      flat_p2 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
      p2 = _mm_andnot_si128(flat, p2);
      flat_p2 = _mm_and_si128(flat, flat_p2);
      p2 = _mm_or_si128(flat_p2, p2);

      sum_q = _mm256_sub_epi16(q256_3, p256_1);
      pixetFilter_q2q1q0 = _mm256_add_epi16(pixetFilter_q2q1q0, sum_q);
      res_q =
          _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0, q256_2), 3);
      flat_q2 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
      q2 = _mm_andnot_si128(flat, q2);
      flat_q2 = _mm_and_si128(flat, flat_q2);
      q2 = _mm_or_si128(flat_q2, q2);

      _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
      _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
      _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
      _mm_storeu_si128((__m128i *)(s - 0 * p), q0);
      _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
      _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
    } else {
      _mm_storeu_si128((__m128i *)(s - 2 * p), ps1);
      _mm_storeu_si128((__m128i *)(s - 1 * p), ps0);
      _mm_storeu_si128((__m128i *)(s - 0 * p), qs0);
      _mm_storeu_si128((__m128i *)(s + 1 * p), qs1);
    }
  }
}

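// Loads a 16x16 block of bytes from in0 (stride in_p), transposes it, and
// writes the result to out (stride out_p). With is_store_avx2 != 0 the output
// is written as eight 32-byte stores (each covering two consecutive 16-byte
// rows when out_p is 16); otherwise sixteen 16-byte stores are used.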
static INLINE void trans_store_16x16_lpf_vert14(unsigned char *in0, int in_p,
                                                unsigned char *out, int out_p,
                                                int is_store_avx2) {
  const __m128i x0 = _mm_loadu_si128((__m128i *)in0);
  const __m128i x1 = _mm_loadu_si128((__m128i *)(in0 + in_p * 1));
  const __m128i x2 = _mm_loadu_si128((__m128i *)(in0 + in_p * 2));
  const __m128i x3 = _mm_loadu_si128((__m128i *)(in0 + in_p * 3));
  const __m128i x4 = _mm_loadu_si128((__m128i *)(in0 + in_p * 4));
  const __m128i x5 = _mm_loadu_si128((__m128i *)(in0 + in_p * 5));
  const __m128i x6 = _mm_loadu_si128((__m128i *)(in0 + in_p * 6));
  const __m128i x7 = _mm_loadu_si128((__m128i *)(in0 + in_p * 7));

  const __m256i y0 = _mm256_insertf128_si256(
      _mm256_castsi128_si256(x0), _mm_loadu_si128((__m128i *)(in0 + in_p * 8)),
      0x1);
  const __m256i y1 = _mm256_insertf128_si256(
      _mm256_castsi128_si256(x1), _mm_loadu_si128((__m128i *)(in0 + in_p * 9)),
      0x1);
  const __m256i y2 = _mm256_insertf128_si256(
      _mm256_castsi128_si256(x2), _mm_loadu_si128((__m128i *)(in0 + in_p * 10)),
      0x1);
  const __m256i y3 = _mm256_insertf128_si256(
      _mm256_castsi128_si256(x3), _mm_loadu_si128((__m128i *)(in0 + in_p * 11)),
      0x1);
  const __m256i y4 = _mm256_insertf128_si256(
      _mm256_castsi128_si256(x4), _mm_loadu_si128((__m128i *)(in0 + in_p * 12)),
      0x1);
  const __m256i y5 = _mm256_insertf128_si256(
      _mm256_castsi128_si256(x5), _mm_loadu_si128((__m128i *)(in0 + in_p * 13)),
      0x1);
  const __m256i y6 = _mm256_insertf128_si256(
      _mm256_castsi128_si256(x6), _mm_loadu_si128((__m128i *)(in0 + in_p * 14)),
      0x1);
  const __m256i y7 = _mm256_insertf128_si256(
      _mm256_castsi128_si256(x7), _mm_loadu_si128((__m128i *)(in0 + in_p * 15)),
      0x1);

  const __m256i y_s00 = _mm256_unpacklo_epi8(y0, y1);
  const __m256i y_s01 = _mm256_unpackhi_epi8(y0, y1);
  const __m256i y_s02 = _mm256_unpacklo_epi8(y2, y3);
  const __m256i y_s03 = _mm256_unpackhi_epi8(y2, y3);
  const __m256i y_s04 = _mm256_unpacklo_epi8(y4, y5);
  const __m256i y_s05 = _mm256_unpackhi_epi8(y4, y5);
  const __m256i y_s06 = _mm256_unpacklo_epi8(y6, y7);
  const __m256i y_s07 = _mm256_unpackhi_epi8(y6, y7);

  const __m256i y_s10 = _mm256_unpacklo_epi16(y_s00, y_s02);
  const __m256i y_s11 = _mm256_unpackhi_epi16(y_s00, y_s02);
  const __m256i y_s12 = _mm256_unpacklo_epi16(y_s01, y_s03);
  const __m256i y_s13 = _mm256_unpackhi_epi16(y_s01, y_s03);
  const __m256i y_s14 = _mm256_unpacklo_epi16(y_s04, y_s06);
  const __m256i y_s15 = _mm256_unpackhi_epi16(y_s04, y_s06);
  const __m256i y_s16 = _mm256_unpacklo_epi16(y_s05, y_s07);
  const __m256i y_s17 = _mm256_unpackhi_epi16(y_s05, y_s07);

  const __m256i y_s20 = _mm256_unpacklo_epi32(y_s10, y_s14);
  const __m256i y_s21 = _mm256_unpackhi_epi32(y_s10, y_s14);
  const __m256i y_s22 = _mm256_unpacklo_epi32(y_s11, y_s15);
  const __m256i y_s23 = _mm256_unpackhi_epi32(y_s11, y_s15);
  const __m256i y_s24 = _mm256_unpacklo_epi32(y_s12, y_s16);
  const __m256i y_s25 = _mm256_unpackhi_epi32(y_s12, y_s16);
  const __m256i y_s26 = _mm256_unpacklo_epi32(y_s13, y_s17);
  const __m256i y_s27 = _mm256_unpackhi_epi32(y_s13, y_s17);

  const __m256i row_s01 = _mm256_permute4x64_epi64(y_s20, 0xd8);
  const __m256i row_s23 = _mm256_permute4x64_epi64(y_s21, 0xd8);
  const __m256i row_s45 = _mm256_permute4x64_epi64(y_s22, 0xd8);
  const __m256i row_s67 = _mm256_permute4x64_epi64(y_s23, 0xd8);
  const __m256i row_s89 = _mm256_permute4x64_epi64(y_s24, 0xd8);
  const __m256i row_s1011 = _mm256_permute4x64_epi64(y_s25, 0xd8);
  const __m256i row_s1213 = _mm256_permute4x64_epi64(y_s26, 0xd8);
  const __m256i row_s1415 = _mm256_permute4x64_epi64(y_s27, 0xd8);

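  // After the 8-, 16- and 32-bit unpack stages and the 64-bit permute, each
  // 256-bit register holds two consecutive 16-byte rows of the transposed
  // block.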
  if (is_store_avx2) {
    _mm256_storeu_si256((__m256i *)(out), row_s01);
    _mm256_storeu_si256((__m256i *)(out + (2 * out_p)), row_s23);
    _mm256_storeu_si256((__m256i *)(out + (4 * out_p)), row_s45);
    _mm256_storeu_si256((__m256i *)(out + (6 * out_p)), row_s67);
    _mm256_storeu_si256((__m256i *)(out + (8 * out_p)), row_s89);
    _mm256_storeu_si256((__m256i *)(out + (10 * out_p)), row_s1011);
    _mm256_storeu_si256((__m256i *)(out + (12 * out_p)), row_s1213);
    _mm256_storeu_si256((__m256i *)(out + (14 * out_p)), row_s1415);
  } else {
    _mm_storeu_si128((__m128i *)(out), _mm256_castsi256_si128(row_s01));
    _mm_storeu_si128((__m128i *)(out + (2 * out_p)),
                     _mm256_castsi256_si128(row_s23));
    _mm_storeu_si128((__m128i *)(out + (4 * out_p)),
                     _mm256_castsi256_si128(row_s45));
    _mm_storeu_si128((__m128i *)(out + (6 * out_p)),
                     _mm256_castsi256_si128(row_s67));
    _mm_storeu_si128((__m128i *)(out + (8 * out_p)),
                     _mm256_castsi256_si128(row_s89));
    _mm_storeu_si128((__m128i *)(out + (10 * out_p)),
                     _mm256_castsi256_si128(row_s1011));
    _mm_storeu_si128((__m128i *)(out + (12 * out_p)),
                     _mm256_castsi256_si128(row_s1213));
    _mm_storeu_si128((__m128i *)(out + (14 * out_p)),
                     _mm256_castsi256_si128(row_s1415));
    _mm_storeu_si128((__m128i *)(out + (1 * out_p)),
                     _mm256_extracti128_si256(row_s01, 1));
    _mm_storeu_si128((__m128i *)(out + (3 * out_p)),
                     _mm256_extracti128_si256(row_s23, 1));
    _mm_storeu_si128((__m128i *)(out + (5 * out_p)),
                     _mm256_extracti128_si256(row_s45, 1));
    _mm_storeu_si128((__m128i *)(out + (7 * out_p)),
                     _mm256_extracti128_si256(row_s67, 1));
    _mm_storeu_si128((__m128i *)(out + (9 * out_p)),
                     _mm256_extracti128_si256(row_s89, 1));
    _mm_storeu_si128((__m128i *)(out + (11 * out_p)),
                     _mm256_extracti128_si256(row_s1011, 1));
    _mm_storeu_si128((__m128i *)(out + (13 * out_p)),
                     _mm256_extracti128_si256(row_s1213, 1));
    _mm_storeu_si128((__m128i *)(out + (15 * out_p)),
                     _mm256_extracti128_si256(row_s1415, 1));
  }
}

void aom_lpf_horizontal_14_quad_avx2(unsigned char *s, int p,
                                     const unsigned char *_blimit0,
                                     const unsigned char *_limit0,
                                     const unsigned char *_thresh0) {
  __m128i mask, flat;
  const __m128i zero = _mm_setzero_si128();
  const __m128i ff = _mm_cmpeq_epi8(zero, zero);

  __m256i p256_3 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 4 * p)));
  __m256i p256_2 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 3 * p)));
  __m256i p256_1 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 2 * p)));
  __m256i p256_0 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 1 * p)));
  __m256i q256_0 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 0 * p)));
  __m256i q256_1 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 1 * p)));
  __m256i q256_2 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 2 * p)));
  __m256i q256_3 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 3 * p)));

  __m128i p3 = _mm256_castsi256_si128(p256_3);
  __m128i p2 = _mm256_castsi256_si128(p256_2);
  __m128i p1 = _mm256_castsi256_si128(p256_1);
  __m128i p0 = _mm256_castsi256_si128(p256_0);
  __m128i q0 = _mm256_castsi256_si128(q256_0);
  __m128i q1 = _mm256_castsi256_si128(q256_1);
  __m128i q2 = _mm256_castsi256_si128(q256_2);
  __m128i q3 = _mm256_castsi256_si128(q256_3);

  {
    const __m128i limit_v =
        _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit0[0]));
    const __m128i blimit_v =
        _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit0[0]));
    const __m128i fe = _mm_set1_epi8((int8_t)0xfe);
    const __m128i abs_p1p0 =
        _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1));
    const __m128i abs_q1q0 =
        _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1));
    __m128i abs_p0q0 =
        _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0));
    __m128i abs_p1q1 =
        _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1));

    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);

    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v);
    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
    mask = _mm_max_epu8(flat, mask);
    // mask |= (abs(p1 - p0) > limit) * -1;
    // mask |= (abs(q1 - q0) > limit) * -1;
    __m128i work = _mm_max_epu8(
        _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)),
        _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3)));
    mask = _mm_max_epu8(work, mask);
    work = _mm_max_epu8(
        _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)),
        _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3)));
    mask = _mm_max_epu8(work, mask);
    mask = _mm_subs_epu8(mask, limit_v);
    mask = _mm_cmpeq_epi8(mask, zero);
  }

  if (0xffff == _mm_movemask_epi8(_mm_cmpeq_epi8(mask, zero))) return;

  // loop filter
  {
    const __m128i thresh_v =
        _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh0[0]));
    const __m128i one = _mm_set1_epi8(1);
    const __m128i t3 = _mm_set1_epi8(3);
    const __m128i t4 = _mm_add_epi8(one, t3);
    const __m128i t80 = _mm_set1_epi8((int8_t)0x80);
    const __m128i te0 = _mm_set1_epi8((int8_t)0xe0);
    const __m128i t1f = _mm_set1_epi8(0x1f);
    const __m128i t7f = _mm_sub_epi8(t80, one);

    __m128i hev = _mm_subs_epu8(flat, thresh_v);
    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);

    __m128i ps1 = _mm_xor_si128(p1, t80);
    __m128i ps0 = _mm_xor_si128(p0, t80);
    __m128i qs0 = _mm_xor_si128(q0, t80);
    __m128i qs1 = _mm_xor_si128(q1, t80);

    __m128i filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
    __m128i work_a = _mm_subs_epi8(qs0, ps0);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_and_si128(filt, mask);

    __m128i filter1 = _mm_adds_epi8(filt, t4);
    __m128i filter2 = _mm_adds_epi8(filt, t3);

    work_a = _mm_cmpgt_epi8(zero, filter1);
    filter1 = _mm_srli_epi16(filter1, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter1 = _mm_and_si128(filter1, t1f);
    filter1 = _mm_or_si128(filter1, work_a);
    qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);

    work_a = _mm_cmpgt_epi8(zero, filter2);
    filter2 = _mm_srli_epi16(filter2, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter2 = _mm_and_si128(filter2, t1f);
    filter2 = _mm_or_si128(filter2, work_a);
    ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);

    filt = _mm_adds_epi8(filter1, one);
    work_a = _mm_cmpgt_epi8(zero, filt);
    filt = _mm_srli_epi16(filt, 1);
    work_a = _mm_and_si128(work_a, t80);
    filt = _mm_and_si128(filt, t7f);
    filt = _mm_or_si128(filt, work_a);
    filt = _mm_andnot_si128(hev, filt);
    ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
    qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);

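    // Pair each p row with its q counterpart in one 256-bit register
    // (p pixels in the low 128-bit lane, q pixels in the high lane) so both
    // sides of the edge can be processed with single 256-bit operations.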
    // Derive flat
    __m256i p0q0256 = _mm256_blend_epi32(p256_0, q256_0, 0xf0);
    __m256i p2q2256 = _mm256_blend_epi32(p256_2, q256_2, 0xf0);
    __m256i p3q3256 = _mm256_blend_epi32(p256_3, q256_3, 0xf0);
    const __m256i ps0qs0256 =
        _mm256_insertf128_si256(_mm256_castsi128_si256(ps0), qs0, 0x1);
    const __m256i ps1qs1256 =
        _mm256_insertf128_si256(_mm256_castsi128_si256(ps1), qs1, 0x1);
    const __m256i work01 = _mm256_or_si256(_mm256_subs_epu8(p2q2256, p0q0256),
                                           _mm256_subs_epu8(p0q0256, p2q2256));
    const __m256i work02 = _mm256_or_si256(_mm256_subs_epu8(p3q3256, p0q0256),
                                           _mm256_subs_epu8(p0q0256, p3q3256));
    const __m256i max0_256 = _mm256_max_epu8(work01, work02);
    const __m128i max1_256 =
        _mm_max_epu8(_mm256_castsi256_si128(max0_256),
                     _mm256_extractf128_si256(max0_256, 1));
    flat = _mm_max_epu8(max1_256, flat);
    flat = _mm_subs_epu8(flat, one);
    flat = _mm_cmpeq_epi8(flat, zero);
    flat = _mm_and_si128(flat, mask);

    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    // flat and wide flat calculations
    if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
      const __m256i flat256 =
          _mm256_insertf128_si256(_mm256_castsi128_si256(flat), flat, 0x1);
      const __m256i eight = _mm256_set1_epi16(8);
      const __m256i four = _mm256_set1_epi16(4);

      __m256i p256_4 = _mm256_castpd_si256(
          _mm256_broadcast_pd((__m128d const *)(s - 5 * p)));
      __m256i q256_4 = _mm256_castpd_si256(
          _mm256_broadcast_pd((__m128d const *)(s + 4 * p)));
      __m256i p256_5 = _mm256_castpd_si256(
          _mm256_broadcast_pd((__m128d const *)(s - 6 * p)));
      __m256i q256_5 = _mm256_castpd_si256(
          _mm256_broadcast_pd((__m128d const *)(s + 5 * p)));
      __m256i p256_6 = _mm256_castpd_si256(
          _mm256_broadcast_pd((__m128d const *)(s - 7 * p)));
      __m256i q256_6 = _mm256_castpd_si256(
          _mm256_broadcast_pd((__m128d const *)(s + 6 * p)));

      // Derive flat2
      __m256i p4q4256 = _mm256_blend_epi32(p256_4, q256_4, 0xf0);
      __m256i p5q5256 = _mm256_blend_epi32(p256_5, q256_5, 0xf0);
      const __m256i p6q6256 = _mm256_blend_epi32(p256_6, q256_6, 0xf0);
      const __m256i work1 = _mm256_or_si256(_mm256_subs_epu8(p4q4256, p0q0256),
                                            _mm256_subs_epu8(p0q0256, p4q4256));
      const __m256i work2 = _mm256_or_si256(_mm256_subs_epu8(p5q5256, p0q0256),
                                            _mm256_subs_epu8(p0q0256, p5q5256));
      const __m256i work3 = _mm256_or_si256(_mm256_subs_epu8(p6q6256, p0q0256),
                                            _mm256_subs_epu8(p0q0256, p6q6256));
      __m256i flat2_256 = _mm256_max_epu8(work1, work2);
      flat2_256 = _mm256_max_epu8(flat2_256, work3);
      __m128i flat2 = _mm_max_epu8(_mm256_castsi256_si128(flat2_256),
                                   _mm256_extractf128_si256(flat2_256, 1));
      flat2 = _mm_subs_epu8(flat2, one);
      flat2 = _mm_cmpeq_epi8(flat2, zero);
      flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask

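      // flat2 selects the columns where the wide filter over p6..q6 is
      // applied on top of flat (p3..q3) and mask.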
      const __m256i filter =
          _mm256_load_si256((__m256i const *)filt_loopfilter_avx2);

      p256_3 = _mm256_shuffle_epi8(p256_3, filter);
      p256_2 = _mm256_shuffle_epi8(p256_2, filter);
      p256_1 = _mm256_shuffle_epi8(p256_1, filter);
      p256_0 = _mm256_shuffle_epi8(p256_0, filter);
      q256_0 = _mm256_shuffle_epi8(q256_0, filter);
      q256_1 = _mm256_shuffle_epi8(q256_1, filter);
      q256_2 = _mm256_shuffle_epi8(q256_2, filter);
      q256_3 = _mm256_shuffle_epi8(q256_3, filter);

      const __m256i p2p1p0 =
          _mm256_add_epi16(p256_0, _mm256_add_epi16(p256_2, p256_1));
      const __m256i q2q1q0 =
          _mm256_add_epi16(q256_0, _mm256_add_epi16(q256_2, q256_1));

      __m256i pixetFilter_p2p1p0 =
          _mm256_add_epi16(four, _mm256_add_epi16(p2p1p0, q2q1q0));
      __m256i pixetFilter_q2q1q0 = pixetFilter_p2p1p0;

      // Derive p0 and q0
      pixetFilter_p2p1p0 = _mm256_add_epi16(pixetFilter_p2p1p0, p256_3);
      __m256i res_p =
          _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, p256_0), 3);
      pixetFilter_q2q1q0 = _mm256_add_epi16(pixetFilter_q2q1q0, q256_3);
      __m256i res_q =
          _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0, q256_0), 3);
      __m256i flat_p0q0 =
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_q), 0xd8);
      p0q0256 = _mm256_andnot_si256(flat256, ps0qs0256);
      flat_p0q0 = _mm256_and_si256(flat256, flat_p0q0);
      p0q0256 = _mm256_or_si256(flat_p0q0, p0q0256);
      p0 = _mm256_castsi256_si128(p0q0256);
      q0 = _mm256_extractf128_si256(p0q0256, 1);

      // Derive p1 and q1
      __m256i sum_p = _mm256_sub_epi16(p256_3, q256_2);
      pixetFilter_p2p1p0 = _mm256_add_epi16(pixetFilter_p2p1p0, sum_p);
      res_p =
          _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, p256_1), 3);
      __m256i sum_q = _mm256_sub_epi16(q256_3, p256_2);
      pixetFilter_q2q1q0 = _mm256_add_epi16(pixetFilter_q2q1q0, sum_q);
      res_q =
          _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0, q256_1), 3);
      __m256i flat_p1q1 =
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_q), 0xd8);
      __m256i p1q1256 = _mm256_andnot_si256(flat256, ps1qs1256);
      flat_p1q1 = _mm256_and_si256(flat256, flat_p1q1);
      p1q1256 = _mm256_or_si256(flat_p1q1, p1q1256);
      p1 = _mm256_castsi256_si128(p1q1256);
      q1 = _mm256_extractf128_si256(p1q1256, 1);

      // Derive p2 and q2
      sum_p = _mm256_sub_epi16(p256_3, q256_1);
      pixetFilter_p2p1p0 = _mm256_add_epi16(pixetFilter_p2p1p0, sum_p);
      res_p =
          _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, p256_2), 3);
      sum_q = _mm256_sub_epi16(q256_3, p256_1);
      pixetFilter_q2q1q0 = _mm256_add_epi16(pixetFilter_q2q1q0, sum_q);
      res_q =
          _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0, q256_2), 3);
      __m256i flat_p2q2 =
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_q), 0xd8);
      p2q2256 = _mm256_andnot_si256(flat256, p2q2256);
      flat_p2q2 = _mm256_and_si256(flat256, flat_p2q2);
      p2q2256 = _mm256_or_si256(flat_p2q2, p2q2256);
      p2 = _mm256_castsi256_si128(p2q2256);
      q2 = _mm256_extractf128_si256(p2q2256, 1);
      if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat2, zero))) {
        flat2_256 =
            _mm256_insertf128_si256(_mm256_castsi128_si256(flat2), flat2, 0x1);
        p256_6 = _mm256_shuffle_epi8(p256_6, filter);
        p256_5 = _mm256_shuffle_epi8(p256_5, filter);
        p256_4 = _mm256_shuffle_epi8(p256_4, filter);
        q256_4 = _mm256_shuffle_epi8(q256_4, filter);
        q256_5 = _mm256_shuffle_epi8(q256_5, filter);
        q256_6 = _mm256_shuffle_epi8(q256_6, filter);

        __m256i pixelFilter_p =
            _mm256_add_epi16(p256_5, _mm256_add_epi16(p256_4, p256_3));
        __m256i pixelFilter_q =
            _mm256_add_epi16(q256_5, _mm256_add_epi16(q256_4, q256_3));

        pixelFilter_p = _mm256_add_epi16(pixelFilter_p, p2p1p0);
        pixelFilter_q = _mm256_add_epi16(pixelFilter_q, q2q1q0);

        pixelFilter_p = _mm256_add_epi16(pixelFilter_p, p256_0);
        pixelFilter_q = _mm256_add_epi16(pixelFilter_q, q256_0);
        pixelFilter_p = _mm256_add_epi16(
            eight, _mm256_add_epi16(pixelFilter_p, pixelFilter_q));
        pixelFilter_q = pixelFilter_p;

        // Derive p0 and q0
        pixelFilter_p =
            _mm256_add_epi16(_mm256_add_epi16(p256_6, p256_1), pixelFilter_p);
        res_p = _mm256_srli_epi16(pixelFilter_p, 4);
        pixelFilter_q =
            _mm256_add_epi16(_mm256_add_epi16(q256_6, q256_1), pixelFilter_q);
        res_q = _mm256_srli_epi16(pixelFilter_q, 4);
        __m256i flat2_p0q0 =
            _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_q), 0xd8);
        p0q0256 = _mm256_andnot_si256(flat2_256, p0q0256);
        flat2_p0q0 = _mm256_and_si256(flat2_256, flat2_p0q0);
        p0q0256 = _mm256_or_si256(flat2_p0q0, p0q0256);

        p0 = _mm256_castsi256_si128(p0q0256);
        q0 = _mm256_extractf128_si256(p0q0256, 1);
        _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
        _mm_storeu_si128((__m128i *)(s - 0 * p), q0);

        // Derive p1 and q1
        sum_p = _mm256_add_epi16(_mm256_sub_epi16(p256_6, q256_5),
                                 _mm256_sub_epi16(p256_2, q256_0));
        pixelFilter_p = _mm256_add_epi16(pixelFilter_p, sum_p);
        res_p = _mm256_srli_epi16(pixelFilter_p, 4);
        sum_q = _mm256_add_epi16(_mm256_sub_epi16(q256_6, p256_5),
                                 _mm256_sub_epi16(q256_2, p256_0));
        pixelFilter_q = _mm256_add_epi16(pixelFilter_q, sum_q);
        res_q = _mm256_srli_epi16(pixelFilter_q, 4);
        __m256i flat2_p1q1 =
            _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_q), 0xd8);
        p1q1256 = _mm256_andnot_si256(flat2_256, p1q1256);
        flat2_p1q1 = _mm256_and_si256(flat2_256, flat2_p1q1);
        p1q1256 = _mm256_or_si256(flat2_p1q1, p1q1256);
        p1 = _mm256_castsi256_si128(p1q1256);
        q1 = _mm256_extractf128_si256(p1q1256, 1);
        _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
        _mm_storeu_si128((__m128i *)(s + 1 * p), q1);

        // Derive p2 and q2
        sum_p = _mm256_add_epi16(_mm256_sub_epi16(p256_6, q256_4),
                                 _mm256_sub_epi16(p256_3, p256_0));
        pixelFilter_p = _mm256_add_epi16(pixelFilter_p, sum_p);
        res_p = _mm256_srli_epi16(pixelFilter_p, 4);
        sum_q = _mm256_add_epi16(_mm256_sub_epi16(q256_6, p256_4),
                                 _mm256_sub_epi16(q256_3, q256_0));
        pixelFilter_q = _mm256_add_epi16(pixelFilter_q, sum_q);
        res_q = _mm256_srli_epi16(pixelFilter_q, 4);
        __m256i flat2_p2q2 =
            _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_q), 0xd8);
        p2q2256 = _mm256_andnot_si256(flat2_256, p2q2256);
        flat2_p2q2 = _mm256_and_si256(flat2_256, flat2_p2q2);
        p2q2256 = _mm256_or_si256(flat2_p2q2, p2q2256);
        p2 = _mm256_castsi256_si128(p2q2256);
        q2 = _mm256_extractf128_si256(p2q2256, 1);
        _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
        _mm_storeu_si128((__m128i *)(s + 2 * p), q2);

        // Derive p3 and q3
        sum_p = _mm256_add_epi16(_mm256_sub_epi16(p256_6, q256_3),
                                 _mm256_sub_epi16(p256_4, p256_1));
        pixelFilter_p = _mm256_add_epi16(pixelFilter_p, sum_p);
        res_p = _mm256_srli_epi16(pixelFilter_p, 4);
        sum_q = _mm256_add_epi16(_mm256_sub_epi16(q256_6, p256_3),
                                 _mm256_sub_epi16(q256_4, q256_1));
        pixelFilter_q = _mm256_add_epi16(pixelFilter_q, sum_q);
        res_q = _mm256_srli_epi16(pixelFilter_q, 4);
        __m256i flat2_p3q3 =
            _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_q), 0xd8);
        p3q3256 = _mm256_andnot_si256(flat2_256, p3q3256);
        flat2_p3q3 = _mm256_and_si256(flat2_256, flat2_p3q3);
        p3q3256 = _mm256_or_si256(flat2_p3q3, p3q3256);
        p3 = _mm256_castsi256_si128(p3q3256);
        q3 = _mm256_extractf128_si256(p3q3256, 1);
        _mm_storeu_si128((__m128i *)(s - 4 * p), p3);
        _mm_storeu_si128((__m128i *)(s + 3 * p), q3);

        // Derive p4 and q4
        sum_p = _mm256_add_epi16(_mm256_sub_epi16(p256_6, q256_2),
                                 _mm256_sub_epi16(p256_5, p256_2));
        pixelFilter_p = _mm256_add_epi16(pixelFilter_p, sum_p);
        res_p = _mm256_srli_epi16(pixelFilter_p, 4);
        sum_q = _mm256_add_epi16(_mm256_sub_epi16(q256_6, p256_2),
                                 _mm256_sub_epi16(q256_5, q256_2));
        pixelFilter_q = _mm256_add_epi16(pixelFilter_q, sum_q);
        res_q = _mm256_srli_epi16(pixelFilter_q, 4);
        __m256i flat2_p4q4 =
            _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_q), 0xd8);
        p4q4256 = _mm256_andnot_si256(flat2_256, p4q4256);
        flat2_p4q4 = _mm256_and_si256(flat2_256, flat2_p4q4);
        p4q4256 = _mm256_or_si256(flat2_p4q4, p4q4256);
        _mm_storeu_si128((__m128i *)(s - 5 * p),
                         _mm256_castsi256_si128(p4q4256));
        _mm_storeu_si128((__m128i *)(s + 4 * p),
                         _mm256_extractf128_si256(p4q4256, 1));

        // Derive p5 and q5
        sum_p = _mm256_add_epi16(_mm256_sub_epi16(p256_6, q256_1),
                                 _mm256_sub_epi16(p256_6, p256_3));
        pixelFilter_p = _mm256_add_epi16(pixelFilter_p, sum_p);
        res_p = _mm256_srli_epi16(pixelFilter_p, 4);
        sum_q = _mm256_add_epi16(_mm256_sub_epi16(q256_6, p256_1),
                                 _mm256_sub_epi16(q256_6, q256_3));
        pixelFilter_q = _mm256_add_epi16(pixelFilter_q, sum_q);
        res_q = _mm256_srli_epi16(pixelFilter_q, 4);
        __m256i flat2_p5q5 =
            _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_q), 0xd8);
        p5q5256 = _mm256_andnot_si256(flat2_256, p5q5256);
        flat2_p5q5 = _mm256_and_si256(flat2_256, flat2_p5q5);
        p5q5256 = _mm256_or_si256(flat2_p5q5, p5q5256);
        _mm_storeu_si128((__m128i *)(s - 6 * p),
                         _mm256_castsi256_si128(p5q5256));
        _mm_storeu_si128((__m128i *)(s + 5 * p),
                         _mm256_extractf128_si256(p5q5256, 1));
      } else {
        _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
        _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
        _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
        _mm_storeu_si128((__m128i *)(s - 0 * p), q0);
        _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
        _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
      }
    } else {
      _mm_storeu_si128((__m128i *)(s - 2 * p), ps1);
      _mm_storeu_si128((__m128i *)(s - 1 * p), ps0);
      _mm_storeu_si128((__m128i *)(s - 0 * p), qs0);
      _mm_storeu_si128((__m128i *)(s + 1 * p), qs1);
    }
  }
}

void aom_lpf_vertical_14_quad_avx2(unsigned char *s, int pitch,
                                   const uint8_t *_blimit0,
                                   const uint8_t *_limit0,
                                   const uint8_t *_thresh0) {
  DECLARE_ALIGNED(16, unsigned char, t_dst[256]);

  // Transpose 16x16
  trans_store_16x16_lpf_vert14(s - 8, pitch, t_dst, 16, 1);

  // Loop filtering
  aom_lpf_horizontal_14_quad_avx2(t_dst + 8 * 16, 16, _blimit0, _limit0,
                                  _thresh0);

  // Transpose back
  trans_store_16x16_lpf_vert14(t_dst, 16, s - 8, pitch, 0);
}