1 /*
2  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 
12 #include <emmintrin.h>  // SSE2
13 
14 #include "config/aom_dsp_rtcd.h"
15 
16 #include "aom_dsp/x86/synonyms.h"
17 #include "aom_ports/mem.h"
18 #include "aom_ports/emmintrin_compat.h"
19 #include "aom_dsp/x86/lpf_common_sse2.h"
20 
21 static INLINE __m128i abs_diff(__m128i a, __m128i b) {
22   return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
23 }
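// Note: for unsigned bytes _mm_subs_epu8 clamps negative results to 0, so at
// most one of the two saturating differences is non-zero and OR-ing them
// yields |a - b| (i.e. max(a, b) - min(a, b)) per byte.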
24 
25 // This function treats its input as 2 parallel 8x4 matrices, transposes each
26 // of them to 4x8 independently while flipping the second matrix horizontally.
27 // Used to create the pq pairs for the 14-tap filter.
28 static INLINE void transpose_pq_14_sse2(__m128i *x0, __m128i *x1, __m128i *x2,
29                                         __m128i *x3, __m128i *q0p0,
30                                         __m128i *q1p1, __m128i *q2p2,
31                                         __m128i *q3p3, __m128i *q4p4,
32                                         __m128i *q5p5, __m128i *q6p6,
33                                         __m128i *q7p7) {
34   __m128i w0, w1, ww0, ww1, w2, w3, ww2, ww3;
35   w0 = _mm_unpacklo_epi8(
36       *x0, *x1);  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
37   w1 = _mm_unpacklo_epi8(
38       *x2, *x3);  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
39   w2 = _mm_unpackhi_epi8(
40       *x0, *x1);  // 08 18 09 19 010 110 011 111 012 112 013 113 014 114 015 115
41   w3 = _mm_unpackhi_epi8(
42       *x2, *x3);  // 28 38 29 39 210 310 211 311 212 312 213 313 214 314 215 315
43 
44   ww0 = _mm_unpacklo_epi16(
45       w0, w1);  // 00 10 20 30 01 11 21 31        02 12 22 32 03 13 23 33
46   ww1 = _mm_unpackhi_epi16(
47       w0, w1);  // 04 14 24 34 05 15 25 35        06 16 26 36 07 17 27 37
48   ww2 = _mm_unpacklo_epi16(
49       w2, w3);  // 08 18 28 38 09 19 29 39       010 110 210 310 011 111 211 311
50   ww3 = _mm_unpackhi_epi16(
51       w2,
52       w3);  // 012 112 212 312 013 113 213 313  014 114 214 314 015 115 215 315
53 
54   *q7p7 = _mm_unpacklo_epi32(
55       ww0,
56       _mm_srli_si128(
57           ww3, 12));  // 00 10 20 30  015 115 215 315  xx xx xx xx xx xx xx xx
58   *q6p6 = _mm_unpackhi_epi32(
59       _mm_slli_si128(ww0, 4),
60       ww3);  // 01 11 21 31  014 114 214 314  xx xx xx xx xx xx xx xx
61   *q5p5 = _mm_unpackhi_epi32(
62       ww0,
63       _mm_slli_si128(
64           ww3, 4));  // 02 12 22 32  013 113 213 313  xx xx xx xx xx xx xx xx
65   *q4p4 = _mm_unpacklo_epi32(
66       _mm_srli_si128(ww0, 12),
67       ww3);  // 03 13 23 33  012 112 212 312 xx xx xx xx xx xx xx xx
68   *q3p3 = _mm_unpacklo_epi32(
69       ww1,
70       _mm_srli_si128(
71           ww2, 12));  // 04 14 24 34  011 111 211 311 xx xx xx xx xx xx xx xx
72   *q2p2 = _mm_unpackhi_epi32(
73       _mm_slli_si128(ww1, 4),
74       ww2);  // 05 15 25 35   010 110 210 310 xx xx xx xx xx xx xx xx
75   *q1p1 = _mm_unpackhi_epi32(
76       ww1,
77       _mm_slli_si128(
78           ww2, 4));  // 06 16 26 36   09 19 29 39     xx xx xx xx xx xx xx xx
79   *q0p0 = _mm_unpacklo_epi32(
80       _mm_srli_si128(ww1, 12),
81       ww2);  // 07 17 27 37  08 18 28 38     xx xx xx xx xx xx xx xx
82 }
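// As the layout comments above show, each output register packs one p column
// in its low 32 bits and the mirrored q column in the next 32 bits (e.g.
// q0p0 = {p0 of rows 0..3, q0 of rows 0..3}); the upper 64 bits are unused.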
83 
84 // This function treats its input as 2 parallel 8x4 matrices and transposes
85 // each of them independently while flipping the second matrix horizontally.
86 // Used to invert the pq pairs of the 14-tap filter.
87 static INLINE void transpose_pq_14_inv_sse2(__m128i *x0, __m128i *x1,
88                                             __m128i *x2, __m128i *x3,
89                                             __m128i *x4, __m128i *x5,
90                                             __m128i *x6, __m128i *x7,
91                                             __m128i *pq0, __m128i *pq1,
92                                             __m128i *pq2, __m128i *pq3) {
93   __m128i w10, w11, w12, w13;
94   __m128i w0, w1, w2, w3, w4, w5;
95   __m128i d0, d1, d2, d3;
96 
97   w0 = _mm_unpacklo_epi8(
98       *x0, *x1);  // p 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
99   w1 = _mm_unpacklo_epi8(
100       *x2, *x3);  // p 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
101   w2 = _mm_unpacklo_epi8(
102       *x4, *x5);  // p 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
103   w3 = _mm_unpacklo_epi8(
104       *x6, *x7);  // p 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
105 
106   w4 = _mm_unpacklo_epi16(
107       w0, w1);  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
108   w5 = _mm_unpacklo_epi16(
109       w2, w3);  // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
110 
111   d0 = _mm_unpacklo_epi32(
112       w4, w5);  // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
113   d2 = _mm_unpackhi_epi32(
114       w4, w5);  // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
115 
116   w10 = _mm_unpacklo_epi8(
117       *x7, *x6);  // q xx xx xx xx xx xx xx xx 00 10 01 11 02 12 03 13
118   w11 = _mm_unpacklo_epi8(
119       *x5, *x4);  // q  xx xx xx xx xx xx xx xx 20 30 21 31 22 32 23 33
120   w12 = _mm_unpacklo_epi8(
121       *x3, *x2);  // q  xx xx xx xx xx xx xx xx 40 50 41 51 42 52 43 53
122   w13 = _mm_unpacklo_epi8(
123       *x1, *x0);  // q  xx xx xx xx xx xx xx xx 60 70 61 71 62 72 63 73
124 
125   w4 = _mm_unpackhi_epi16(
126       w10, w11);  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
127   w5 = _mm_unpackhi_epi16(
128       w12, w13);  // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
129 
130   d1 = _mm_unpacklo_epi32(
131       w4, w5);  // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
132   d3 = _mm_unpackhi_epi32(
133       w4, w5);  // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
134 
135   *pq0 = _mm_unpacklo_epi64(d0, d1);  // pq
136   *pq1 = _mm_unpackhi_epi64(d0, d1);  // pq
137   *pq2 = _mm_unpacklo_epi64(d2, d3);  // pq
138   *pq3 = _mm_unpackhi_epi64(d2, d3);  // pq
139 }
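// Inverse of the packing above: as the layout comments indicate, each input
// carries four p values in its low 32 bits and the four mirrored q values in
// the next 32 bits (one per output row), and the transpose regroups them into
// the four full pixel rows pq0..pq3 ready to be written out.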
140 
141 static AOM_FORCE_INLINE void filter4_sse2(__m128i *p1p0, __m128i *q1q0,
142                                           __m128i *hev, __m128i *mask,
143                                           __m128i *qs1qs0, __m128i *ps1ps0) {
144   __m128i filter, filter2filter1, work;
145   __m128i ps1ps0_work, qs1qs0_work;
146   __m128i hev1;
147   const __m128i t3t4 =
148       _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 4, 4, 4, 4);
149   const __m128i t80 = _mm_set1_epi8((char)0x80);
150   const __m128i ff = _mm_cmpeq_epi8(t80, t80);
151 
152   ps1ps0_work = _mm_xor_si128(*p1p0, t80); /* ^ 0x80 */
153   qs1qs0_work = _mm_xor_si128(*q1q0, t80);
154 
155   /* int8_t filter = signed_char_clamp(ps1 - qs1) & hev; */
156   work = _mm_subs_epi8(ps1ps0_work, qs1qs0_work);
157   filter = _mm_and_si128(_mm_srli_si128(work, 4), *hev);
158   /* filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; */
159   filter = _mm_subs_epi8(filter, work);
160   filter = _mm_subs_epi8(filter, work);
161   filter = _mm_subs_epi8(filter, work);  /* + 3 * (qs0 - ps0) */
162   filter = _mm_and_si128(filter, *mask); /* & mask */
163   filter = _mm_unpacklo_epi32(filter, filter);
164 
165   /* filter1 = signed_char_clamp(filter + 4) >> 3; */
166   /* filter2 = signed_char_clamp(filter + 3) >> 3; */
167   filter2filter1 = _mm_adds_epi8(filter, t3t4); /* signed_char_clamp */
168   filter2filter1 =
169       _mm_unpacklo_epi8(filter2filter1, filter2filter1);  // widen to 16 bit
170   filter2filter1 = _mm_srai_epi16(filter2filter1, 11);    /* >> 3 */
171   filter2filter1 = _mm_packs_epi16(filter2filter1, filter2filter1);
172 
173   /* filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; */
174   filter = _mm_subs_epi8(filter2filter1, ff);  /* + 1 */
175   filter = _mm_unpacklo_epi8(filter, filter);  // widen to 16 bit
176   filter = _mm_srai_epi16(filter, 9);          /* round */
177   filter = _mm_packs_epi16(filter, filter);
178   filter = _mm_andnot_si128(*hev, filter);
179   filter = _mm_unpacklo_epi32(filter, filter);
180 
181   filter2filter1 = _mm_unpacklo_epi32(filter2filter1, filter);
182   hev1 = _mm_srli_si128(filter2filter1, 8);
183   /* signed_char_clamp(qs1 - filter), signed_char_clamp(qs0 - filter1) */
184   qs1qs0_work = _mm_subs_epi8(qs1qs0_work, filter2filter1);
185   /* signed_char_clamp(ps1 + filter), signed_char_clamp(ps0 + filter2) */
186   ps1ps0_work = _mm_adds_epi8(ps1ps0_work, hev1);
187 
188   *qs1qs0 = _mm_xor_si128(qs1qs0_work, t80); /* ^ 0x80 */
189   *ps1ps0 = _mm_xor_si128(ps1ps0_work, t80); /* ^ 0x80 */
190 }
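// For reference, a scalar sketch of the 4-tap filter implemented above,
// assembled from the step comments (signed_char_clamp() is the saturating
// int8_t clamp; the pixels were toggled by 0x80 into signed range first):
//   filter  = signed_char_clamp(ps1 - qs1) & hev;
//   filter  = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask;
//   filter1 = signed_char_clamp(filter + 4) >> 3;
//   filter2 = signed_char_clamp(filter + 3) >> 3;
//   oq0 = signed_char_clamp(qs0 - filter1) ^ 0x80;
//   op0 = signed_char_clamp(ps0 + filter2) ^ 0x80;
//   filter  = ROUND_POWER_OF_TWO(filter1, 1) & ~hev;
//   oq1 = signed_char_clamp(qs1 - filter) ^ 0x80;
//   op1 = signed_char_clamp(ps1 + filter) ^ 0x80;
// The _mm_unpacklo_epi8(x, x) / _mm_srai_epi16(x, 11) pairs emulate the
// arithmetic >> 3 on signed bytes, which SSE2 lacks as a byte operation.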
191 
192 static AOM_FORCE_INLINE void filter4_dual_sse2(__m128i *p1p0, __m128i *q1q0,
193                                                __m128i *hev, __m128i *mask,
194                                                __m128i *qs1qs0,
195                                                __m128i *ps1ps0) {
196   const __m128i t3t4 =
197       _mm_set_epi8(3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4);
198   const __m128i t80 = _mm_set1_epi8((char)0x80);
199   __m128i filter, filter2filter1, work;
200   __m128i ps1ps0_work, qs1qs0_work;
201   __m128i hev1;
202   const __m128i ff = _mm_cmpeq_epi8(t80, t80);
203 
204   ps1ps0_work = _mm_xor_si128(*p1p0, t80); /* ^ 0x80 */
205   qs1qs0_work = _mm_xor_si128(*q1q0, t80);
206 
207   /* int8_t filter = signed_char_clamp(ps1 - qs1) & hev; */
208   work = _mm_subs_epi8(ps1ps0_work, qs1qs0_work);
209   filter = _mm_and_si128(_mm_srli_si128(work, 8), *hev);
210   /* filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; */
211   filter = _mm_subs_epi8(filter, work);
212   filter = _mm_subs_epi8(filter, work);
213   filter = _mm_subs_epi8(filter, work);  /* + 3 * (qs0 - ps0) */
214   filter = _mm_and_si128(filter, *mask); /* & mask */
215   filter = _mm_unpacklo_epi64(filter, filter);
216 
217   /* filter1 = signed_char_clamp(filter + 4) >> 3; */
218   /* filter2 = signed_char_clamp(filter + 3) >> 3; */
219   filter2filter1 = _mm_adds_epi8(filter, t3t4); /* signed_char_clamp */
220   filter = _mm_unpackhi_epi8(filter2filter1, filter2filter1);
221   filter2filter1 = _mm_unpacklo_epi8(filter2filter1, filter2filter1);
222   filter2filter1 = _mm_srai_epi16(filter2filter1, 11); /* >> 3 */
223   filter = _mm_srai_epi16(filter, 11);                 /* >> 3 */
224   filter2filter1 = _mm_packs_epi16(filter2filter1, filter);
225 
226   /* filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; */
227   filter = _mm_subs_epi8(filter2filter1, ff); /* + 1 */
228   filter = _mm_unpacklo_epi8(filter, filter);
229   filter = _mm_srai_epi16(filter, 9); /* round */
230   filter = _mm_packs_epi16(filter, filter);
231   filter = _mm_andnot_si128(*hev, filter);
232 
233   hev1 = _mm_unpackhi_epi64(filter2filter1, filter);
234   filter2filter1 = _mm_unpacklo_epi64(filter2filter1, filter);
235 
236   /* signed_char_clamp(qs1 - filter), signed_char_clamp(qs0 - filter1) */
237   qs1qs0_work = _mm_subs_epi8(qs1qs0_work, filter2filter1);
238   /* signed_char_clamp(ps1 + filter), signed_char_clamp(ps0 + filter2) */
239   ps1ps0_work = _mm_adds_epi8(ps1ps0_work, hev1);
240   *qs1qs0 = _mm_xor_si128(qs1qs0_work, t80); /* ^ 0x80 */
241   *ps1ps0 = _mm_xor_si128(ps1ps0_work, t80); /* ^ 0x80 */
242 }
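// Same filter as filter4_sse2() above, but for eight columns at once: the
// p/q rows occupy full 64-bit lanes, so the shuffles use 8-byte shifts and
// epi64 unpacks where the single-width version uses 32-bit lanes.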
243 
244 static AOM_FORCE_INLINE void lpf_internal_4_sse2(
245     __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *limit,
246     __m128i *thresh, __m128i *q1q0_out, __m128i *p1p0_out) {
247   __m128i q1p1, q0p0, p1p0, q1q0;
248   __m128i abs_p0q0, abs_p1q1;
249   __m128i mask, flat, hev;
250   const __m128i zero = _mm_setzero_si128();
251 
252   q1p1 = _mm_unpacklo_epi32(*p1, *q1);
253   q0p0 = _mm_unpacklo_epi32(*p0, *q0);
254 
255   p1p0 = _mm_unpacklo_epi32(q0p0, q1p1);
256   q1q0 = _mm_srli_si128(p1p0, 8);
257 
258   /* abs(q1 - q0), abs(p1 - p0) */
259   flat = abs_diff(q1p1, q0p0);
260   /* abs(p1 - q1), abs(p0 - q0) */
261   __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0);
262 
263   /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */
264   flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4));
265   hev = _mm_unpacklo_epi8(flat, zero);
266 
267   hev = _mm_cmpgt_epi16(hev, *thresh);
268   hev = _mm_packs_epi16(hev, hev);
269   hev = _mm_unpacklo_epi32(hev, hev);
270 
271   abs_p0q0 = _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */
272   abs_p1q1 = _mm_srli_si128(abs_p1q1p0q0, 4);           /* abs(p1 - q1) */
273   abs_p1q1 = _mm_unpacklo_epi8(abs_p1q1, abs_p1q1);
274   abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9);
275   abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */
276   /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */
277 
278   mask = _mm_adds_epu8(abs_p0q0, abs_p1q1);
279   mask = _mm_unpacklo_epi32(mask, flat);
280   mask = _mm_subs_epu8(mask, *limit);
281   mask = _mm_cmpeq_epi8(mask, zero);
282   mask = _mm_and_si128(mask, _mm_srli_si128(mask, 4));
283 
284   filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out);
285 }
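// A scalar sketch (mirroring the mask comments used elsewhere in this file)
// of the two per-column decisions computed above:
//   hev  = (abs(p1 - p0) > thresh) || (abs(q1 - q0) > thresh);
//   mask = (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= blimit) &&
//          (abs(p1 - p0) <= limit) && (abs(q1 - q0) <= limit);
// The caller packs blimit into the low 32 bits and limit into the next 32
// bits of the `limit` argument, and passes thresh pre-widened to 16 bits.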
286 
287 static AOM_FORCE_INLINE void lpf_internal_4_dual_sse2(
288     __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *limit,
289     __m128i *thresh, __m128i *q1q0_out, __m128i *p1p0_out) {
290   __m128i q1p1, q0p0, p1p0, q1q0;
291   __m128i abs_p0q0, abs_p1q1;
292   __m128i mask, hev;
293   const __m128i zero = _mm_setzero_si128();
294 
295   q1p1 = _mm_unpacklo_epi64(*p1, *q1);
296   q0p0 = _mm_unpacklo_epi64(*p0, *q0);
297 
298   p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
299   q1q0 = _mm_unpackhi_epi64(q0p0, q1p1);
300 
301   /* abs(q1 - q0), abs(p1 - p0) */
302   __m128i flat = abs_diff(q1p1, q0p0);
303   /* abs(p1 - q1), abs(p0 - q0) */
304   const __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0);
305 
306   /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */
307   flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
308   hev = _mm_unpacklo_epi8(flat, zero);
309 
310   hev = _mm_cmpgt_epi16(hev, *thresh);
311   hev = _mm_packs_epi16(hev, hev);
312 
313   /* const int8_t mask = filter_mask2(*limit, *blimit, */
314   /*                                  p1, p0, q0, q1); */
315   abs_p0q0 = _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */
316   abs_p1q1 = _mm_unpackhi_epi8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p1 - q1) */
317   abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9);
318   abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */
319   /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */
320   mask = _mm_adds_epu8(abs_p0q0, abs_p1q1);
321   mask = _mm_unpacklo_epi64(mask, flat);
322   mask = _mm_subs_epu8(mask, *limit);
323   mask = _mm_cmpeq_epi8(mask, zero);
324   mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8));
325 
326   filter4_dual_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out);
327 }
328 
329 void aom_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */,
330                                const uint8_t *_blimit, const uint8_t *_limit,
331                                const uint8_t *_thresh) {
332   const __m128i zero = _mm_setzero_si128();
333   __m128i limit = _mm_unpacklo_epi32(_mm_loadl_epi64((const __m128i *)_blimit),
334                                      _mm_loadl_epi64((const __m128i *)_limit));
335   __m128i thresh =
336       _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
337 
338   __m128i qs1qs0, ps1ps0;
339   __m128i p1, p0, q0, q1;
340 
341   p1 = xx_loadl_32(s - 2 * p);
342   p0 = xx_loadl_32(s - 1 * p);
343   q0 = xx_loadl_32(s - 0 * p);
344   q1 = xx_loadl_32(s + 1 * p);
345 
346   lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &limit, &thresh, &qs1qs0, &ps1ps0);
347 
348   xx_storel_32(s - 1 * p, ps1ps0);
349   xx_storel_32(s - 2 * p, _mm_srli_si128(ps1ps0, 4));
350   xx_storel_32(s + 0 * p, qs1qs0);
351   xx_storel_32(s + 1 * p, _mm_srli_si128(qs1qs0, 4));
352 }
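// The results come back packed: ps1ps0 holds the new p0 row in its low 32
// bits and the new p1 row in the next 32 bits (likewise q0/q1 in qs1qs0),
// which is why the second store of each pair shifts the register by 4 bytes.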
353 
354 void aom_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */,
355                              const uint8_t *_blimit, const uint8_t *_limit,
356                              const uint8_t *_thresh) {
357   __m128i p1p0, q1q0;
358   __m128i p1, p0, q0, q1;
359 
360   const __m128i zero = _mm_setzero_si128();
361   __m128i limit = _mm_unpacklo_epi32(_mm_loadl_epi64((const __m128i *)_blimit),
362                                      _mm_loadl_epi64((const __m128i *)_limit));
363   __m128i thresh =
364       _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
365 
366   __m128i x0, x1, x2, x3;
367   __m128i d0, d1, d2, d3;
368   x0 = _mm_loadl_epi64((__m128i *)(s - 2 + 0 * p));
369   x1 = _mm_loadl_epi64((__m128i *)(s - 2 + 1 * p));
370   x2 = _mm_loadl_epi64((__m128i *)(s - 2 + 2 * p));
371   x3 = _mm_loadl_epi64((__m128i *)(s - 2 + 3 * p));
372 
373   transpose4x8_8x4_low_sse2(&x0, &x1, &x2, &x3, &p1, &p0, &q0, &q1);
374 
375   lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &limit, &thresh, &q1q0, &p1p0);
376 
377   // Transpose 8x4 to 4x8
378   p1 = _mm_srli_si128(p1p0, 4);
379   q1 = _mm_srli_si128(q1q0, 4);
380 
381   transpose4x8_8x4_low_sse2(&p1, &p1p0, &q1q0, &q1, &d0, &d1, &d2, &d3);
382 
383   xx_storel_32(s + 0 * p - 2, d0);
384   xx_storel_32(s + 1 * p - 2, d1);
385   xx_storel_32(s + 2 * p - 2, d2);
386   xx_storel_32(s + 3 * p - 2, d3);
387 }
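// For a vertical edge the filter runs across columns: four rows of 8 pixels
// straddling the edge are loaded, transposed into the p1/p0/q0/q1 layout,
// passed through the same lpf_internal_4_sse2() path, and transposed back
// before the four 4-byte stores.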
388 
389 static INLINE void store_buffer_horz_8(__m128i x, int p, int num, uint8_t *s) {
390   xx_storel_32(s - (num + 1) * p, x);
391   xx_storel_32(s + num * p, _mm_srli_si128(x, 4));
392 }
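// Stores one filtered pq pair for a horizontal edge: the low 4 bytes are the
// p(num) row, written num + 1 lines above the edge, and the next 4 bytes are
// the q(num) row, written num lines below it.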
393 
394 static AOM_FORCE_INLINE void lpf_internal_14_dual_sse2(
395     __m128i *q6p6, __m128i *q5p5, __m128i *q4p4, __m128i *q3p3, __m128i *q2p2,
396     __m128i *q1p1, __m128i *q0p0, __m128i *blimit, __m128i *limit,
397     __m128i *thresh) {
398   const __m128i zero = _mm_setzero_si128();
399   const __m128i one = _mm_set1_epi8(1);
400   __m128i mask, hev, flat, flat2;
401   __m128i qs0ps0, qs1ps1;
402   __m128i p1p0, q1q0, qs1qs0, ps1ps0;
403   __m128i abs_p1p0;
404 
405   p1p0 = _mm_unpacklo_epi64(*q0p0, *q1p1);
406   q1q0 = _mm_unpackhi_epi64(*q0p0, *q1p1);
407 
408   {
409     __m128i abs_p1q1, abs_p0q0, abs_q1q0;
410     __m128i fe, ff, work;
411     abs_p1p0 = abs_diff(*q1p1, *q0p0);
412     abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
413     fe = _mm_set1_epi8((char)0xfe);
414     ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
415     abs_p0q0 = abs_diff(p1p0, q1q0);
416     abs_p1q1 = _mm_srli_si128(abs_p0q0, 8);
417     abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, zero);
418 
419     flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
420     hev = _mm_subs_epu8(flat, *thresh);
421     hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
422     // replicate for the further "merged variables" usage
423     hev = _mm_unpacklo_epi64(hev, hev);
424 
425     abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
426     abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
427     mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit);
428     mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
429     // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
430     mask = _mm_max_epu8(abs_p1p0, mask);
431     // mask |= (abs(p1 - p0) > limit) * -1;
432     // mask |= (abs(q1 - q0) > limit) * -1;
433 
434     work = _mm_max_epu8(abs_diff(*q2p2, *q1p1), abs_diff(*q3p3, *q2p2));
435     mask = _mm_max_epu8(work, mask);
436     mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
437     mask = _mm_subs_epu8(mask, *limit);
438     mask = _mm_cmpeq_epi8(mask, zero);
439   }
440 
441   // lp filter - the same for 6, 8 and 14 versions
442   filter4_dual_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0);
443   qs0ps0 = _mm_unpacklo_epi64(ps1ps0, qs1qs0);
444   qs1ps1 = _mm_unpackhi_epi64(ps1ps0, qs1qs0);
445   // loopfilter done
446 
447   __m128i flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2;
448   __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0;
449 
450   __m128i work;
451   flat = _mm_max_epu8(abs_diff(*q2p2, *q0p0), abs_diff(*q3p3, *q0p0));
452   flat = _mm_max_epu8(abs_p1p0, flat);
453   flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
454   flat = _mm_subs_epu8(flat, one);
455   flat = _mm_cmpeq_epi8(flat, zero);
456   flat = _mm_and_si128(flat, mask);
457 
458   // If flat == 0 then flat2 is zero as well and none of the calculations
459   // below are needed. With SSE4.1: if (0 == _mm_test_all_zeros(flat, ff))
460   if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
461     // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
462     // flat and wide flat calculations
463 
464     const __m128i eight = _mm_set1_epi16(8);
465     const __m128i four = _mm_set1_epi16(4);
466     __m128i p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16;
467     __m128i q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
468     __m128i pixelFilter_p, pixelFilter_q;
469     __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
470     __m128i sum_p6, sum_q6;
471     __m128i sum_p3, sum_q3, res_p, res_q;
472 
473     p6_16 = _mm_unpacklo_epi8(*q6p6, zero);
474     p5_16 = _mm_unpacklo_epi8(*q5p5, zero);
475     p4_16 = _mm_unpacklo_epi8(*q4p4, zero);
476     p3_16 = _mm_unpacklo_epi8(*q3p3, zero);
477     p2_16 = _mm_unpacklo_epi8(*q2p2, zero);
478     p1_16 = _mm_unpacklo_epi8(*q1p1, zero);
479     p0_16 = _mm_unpacklo_epi8(*q0p0, zero);
480     q0_16 = _mm_unpackhi_epi8(*q0p0, zero);
481     q1_16 = _mm_unpackhi_epi8(*q1p1, zero);
482     q2_16 = _mm_unpackhi_epi8(*q2p2, zero);
483     q3_16 = _mm_unpackhi_epi8(*q3p3, zero);
484     q4_16 = _mm_unpackhi_epi8(*q4p4, zero);
485     q5_16 = _mm_unpackhi_epi8(*q5p5, zero);
486     q6_16 = _mm_unpackhi_epi8(*q6p6, zero);
487     pixelFilter_p = _mm_add_epi16(p5_16, _mm_add_epi16(p4_16, p3_16));
488     pixelFilter_q = _mm_add_epi16(q5_16, _mm_add_epi16(q4_16, q3_16));
489 
490     pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16));
491     pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
492 
493     pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16));
494     pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
495     pixelFilter_p =
496         _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q));
497     pixetFilter_p2p1p0 = _mm_add_epi16(
498         four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
499     res_p = _mm_srli_epi16(
500         _mm_add_epi16(pixelFilter_p,
501                       _mm_add_epi16(_mm_add_epi16(p6_16, p0_16),
502                                     _mm_add_epi16(p1_16, q0_16))),
503         4);
504     res_q = _mm_srli_epi16(
505         _mm_add_epi16(pixelFilter_p,
506                       _mm_add_epi16(_mm_add_epi16(q6_16, q0_16),
507                                     _mm_add_epi16(p0_16, q1_16))),
508         4);
509     flat2_q0p0 = _mm_packus_epi16(res_p, res_q);
510 
511     res_p = _mm_srli_epi16(
512         _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3_16, p0_16)), 3);
513     res_q = _mm_srli_epi16(
514         _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3_16, q0_16)), 3);
515 
516     flat_q0p0 = _mm_packus_epi16(res_p, res_q);
517 
518     sum_p6 = _mm_add_epi16(p6_16, p6_16);
519     sum_q6 = _mm_add_epi16(q6_16, q6_16);
520     sum_p3 = _mm_add_epi16(p3_16, p3_16);
521     sum_q3 = _mm_add_epi16(q3_16, q3_16);
522 
523     pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p5_16);
524     pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16);
525 
526     res_p = _mm_srli_epi16(
527         _mm_add_epi16(
528             pixelFilter_p,
529             _mm_add_epi16(sum_p6,
530                           _mm_add_epi16(p1_16, _mm_add_epi16(p2_16, p0_16)))),
531         4);
532     res_q = _mm_srli_epi16(
533         _mm_add_epi16(
534             pixelFilter_q,
535             _mm_add_epi16(sum_q6,
536                           _mm_add_epi16(q1_16, _mm_add_epi16(q0_16, q2_16)))),
537         4);
538     flat2_q1p1 = _mm_packus_epi16(res_p, res_q);
539 
540     pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16);
541     pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16);
542     res_p = _mm_srli_epi16(
543         _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1_16)), 3);
544     res_q = _mm_srli_epi16(
545         _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1_16)), 3);
546     flat_q1p1 = _mm_packus_epi16(res_p, res_q);
547 
548     pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16);
549     pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16);
550 
551     sum_p3 = _mm_add_epi16(sum_p3, p3_16);
552     sum_q3 = _mm_add_epi16(sum_q3, q3_16);
553 
554     res_p = _mm_srli_epi16(
555         _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2_16)), 3);
556     res_q = _mm_srli_epi16(
557         _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2_16)), 3);
558     flat_q2p2 = _mm_packus_epi16(res_p, res_q);
559 
560     // work with flat2
561     flat2 = _mm_max_epu8(abs_diff(*q4p4, *q0p0), abs_diff(*q5p5, *q0p0));
562     work = abs_diff(*q6p6, *q0p0);
563     flat2 = _mm_max_epu8(work, flat2);
564     flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
565     flat2 = _mm_subs_epu8(flat2, one);
566     flat2 = _mm_cmpeq_epi8(flat2, zero);
567     flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
568 
569     // ~~~~~~~~~~ apply flat ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
570     flat = _mm_unpacklo_epi64(flat, flat);
571     *q2p2 = _mm_andnot_si128(flat, *q2p2);
572     flat_q2p2 = _mm_and_si128(flat, flat_q2p2);
573     *q2p2 = _mm_or_si128(*q2p2, flat_q2p2);
574 
575     qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
576     flat_q1p1 = _mm_and_si128(flat, flat_q1p1);
577     *q1p1 = _mm_or_si128(qs1ps1, flat_q1p1);
578 
579     qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
580     flat_q0p0 = _mm_and_si128(flat, flat_q0p0);
581     *q0p0 = _mm_or_si128(qs0ps0, flat_q0p0);
582 
583     if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat2, zero))) {
584       pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
585       pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
586 
587       sum_p6 = _mm_add_epi16(sum_p6, p6_16);
588       sum_q6 = _mm_add_epi16(sum_q6, q6_16);
589 
590       res_p = _mm_srli_epi16(
591           _mm_add_epi16(
592               pixelFilter_p,
593               _mm_add_epi16(sum_p6,
594                             _mm_add_epi16(p2_16, _mm_add_epi16(p3_16, p1_16)))),
595           4);
596       res_q = _mm_srli_epi16(
597           _mm_add_epi16(
598               pixelFilter_q,
599               _mm_add_epi16(sum_q6,
600                             _mm_add_epi16(q2_16, _mm_add_epi16(q1_16, q3_16)))),
601           4);
602       flat2_q2p2 = _mm_packus_epi16(res_p, res_q);
603 
604       sum_p6 = _mm_add_epi16(sum_p6, p6_16);
605       sum_q6 = _mm_add_epi16(sum_q6, q6_16);
606 
607       pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
608       pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
609 
610       res_p = _mm_srli_epi16(
611           _mm_add_epi16(
612               pixelFilter_p,
613               _mm_add_epi16(sum_p6,
614                             _mm_add_epi16(p3_16, _mm_add_epi16(p4_16, p2_16)))),
615           4);
616       res_q = _mm_srli_epi16(
617           _mm_add_epi16(
618               pixelFilter_q,
619               _mm_add_epi16(sum_q6,
620                             _mm_add_epi16(q3_16, _mm_add_epi16(q2_16, q4_16)))),
621           4);
622       flat2_q3p3 = _mm_packus_epi16(res_p, res_q);
623 
624       sum_p6 = _mm_add_epi16(sum_p6, p6_16);
625       sum_q6 = _mm_add_epi16(sum_q6, q6_16);
626 
627       pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
628       pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
629 
630       res_p = _mm_srli_epi16(
631           _mm_add_epi16(
632               pixelFilter_p,
633               _mm_add_epi16(sum_p6,
634                             _mm_add_epi16(p4_16, _mm_add_epi16(p5_16, p3_16)))),
635           4);
636       res_q = _mm_srli_epi16(
637           _mm_add_epi16(
638               pixelFilter_q,
639               _mm_add_epi16(sum_q6,
640                             _mm_add_epi16(q4_16, _mm_add_epi16(q3_16, q5_16)))),
641           4);
642       flat2_q4p4 = _mm_packus_epi16(res_p, res_q);
643 
644       sum_p6 = _mm_add_epi16(sum_p6, p6_16);
645       sum_q6 = _mm_add_epi16(sum_q6, q6_16);
646       pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
647       pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
648 
649       res_p = _mm_srli_epi16(
650           _mm_add_epi16(
651               pixelFilter_p,
652               _mm_add_epi16(sum_p6,
653                             _mm_add_epi16(p5_16, _mm_add_epi16(p6_16, p4_16)))),
654           4);
655       res_q = _mm_srli_epi16(
656           _mm_add_epi16(
657               pixelFilter_q,
658               _mm_add_epi16(sum_q6,
659                             _mm_add_epi16(q5_16, _mm_add_epi16(q6_16, q4_16)))),
660           4);
661       flat2_q5p5 = _mm_packus_epi16(res_p, res_q);
662 
663       // wide flat
664       // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
665       flat2 = _mm_unpacklo_epi64(flat2, flat2);
666 
667       *q5p5 = _mm_andnot_si128(flat2, *q5p5);
668       flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
669       *q5p5 = _mm_or_si128(*q5p5, flat2_q5p5);
670 
671       *q4p4 = _mm_andnot_si128(flat2, *q4p4);
672       flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
673       *q4p4 = _mm_or_si128(*q4p4, flat2_q4p4);
674 
675       *q3p3 = _mm_andnot_si128(flat2, *q3p3);
676       flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
677       *q3p3 = _mm_or_si128(*q3p3, flat2_q3p3);
678 
679       *q2p2 = _mm_andnot_si128(flat2, *q2p2);
680       flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
681       *q2p2 = _mm_or_si128(*q2p2, flat2_q2p2);
682 
683       *q1p1 = _mm_andnot_si128(flat2, *q1p1);
684       flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
685       *q1p1 = _mm_or_si128(*q1p1, flat2_q1p1);
686 
687       *q0p0 = _mm_andnot_si128(flat2, *q0p0);
688       flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
689       *q0p0 = _mm_or_si128(*q0p0, flat2_q0p0);
690     }
691   } else {
692     *q0p0 = qs0ps0;
693     *q1p1 = qs1ps1;
694   }
695 }
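// Selection logic used above: every column first receives the narrow 4-tap
// result; columns with the "flat" mask set are replaced by the >> 3 averages
// (8-tap window over the p3..q3 region), and columns with "flat2" also set by
// the >> 4 wide averages (p6..q6 region). If no column is flat, only p1/p0
// and q0/q1 change.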
696 
697 static AOM_FORCE_INLINE void lpf_internal_14_sse2(
698     __m128i *q6p6, __m128i *q5p5, __m128i *q4p4, __m128i *q3p3, __m128i *q2p2,
699     __m128i *q1p1, __m128i *q0p0, __m128i *blimit, __m128i *limit,
700     __m128i *thresh) {
701   const __m128i zero = _mm_setzero_si128();
702   const __m128i one = _mm_set1_epi8(1);
703   __m128i mask, hev, flat, flat2;
704   __m128i flat2_pq[6], flat_pq[3];
705   __m128i qs0ps0, qs1ps1;
706   __m128i p1p0, q1q0, qs1qs0, ps1ps0;
707   __m128i abs_p1p0;
708 
709   p1p0 = _mm_unpacklo_epi32(*q0p0, *q1p1);
710   q1q0 = _mm_srli_si128(p1p0, 8);
711 
712   __m128i fe, ff, work;
713   {
714     __m128i abs_p1q1, abs_p0q0, abs_q1q0;
715     abs_p1p0 = abs_diff(*q1p1, *q0p0);
716     abs_q1q0 = _mm_srli_si128(abs_p1p0, 4);
717     fe = _mm_set1_epi8((char)0xfe);
718     ff = _mm_cmpeq_epi8(fe, fe);
719     abs_p0q0 = abs_diff(p1p0, q1q0);
720     abs_p1q1 = _mm_srli_si128(abs_p0q0, 4);
721 
722     flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
723 
724     hev = _mm_subs_epu8(flat, *thresh);
725     hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
726     // replicate for the further "merged variables" usage
727     hev = _mm_unpacklo_epi32(hev, hev);
728 
729     abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
730     abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
731     mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit);
732     mask = _mm_unpacklo_epi32(mask, zero);
733     mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
734     // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
735     mask = _mm_max_epu8(abs_p1p0, mask);
736     // mask |= (abs(p1 - p0) > limit) * -1;
737     // mask |= (abs(q1 - q0) > limit) * -1;
738 
739     work = _mm_max_epu8(abs_diff(*q2p2, *q1p1), abs_diff(*q3p3, *q2p2));
740     mask = _mm_max_epu8(work, mask);
741     mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 4));
742     mask = _mm_subs_epu8(mask, *limit);
743     mask = _mm_cmpeq_epi8(mask, zero);
744   }
745 
746   // lp filter - the same for 6, 8 and 14 versions
747   filter4_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0);
748   qs0ps0 = _mm_unpacklo_epi32(ps1ps0, qs1qs0);
749   qs1ps1 = _mm_srli_si128(qs0ps0, 8);
750   // loopfilter done
751 
752   flat = _mm_max_epu8(abs_diff(*q2p2, *q0p0), abs_diff(*q3p3, *q0p0));
753   flat = _mm_max_epu8(abs_p1p0, flat);
754   flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4));
755   flat = _mm_subs_epu8(flat, one);
756   flat = _mm_cmpeq_epi8(flat, zero);
757   flat = _mm_and_si128(flat, mask);
758   flat = _mm_unpacklo_epi32(flat, flat);
759   flat = _mm_unpacklo_epi64(flat, flat);
760 
761   // If flat == 0 then flat2 is zero as well and none of the calculations
762   // below are needed. With SSE4.1: if (0 == _mm_test_all_zeros(flat, ff))
763   if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
764     // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
765     // flat and wide flat calculations
766     __m128i q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
767     __m128i pq_16[7];
768     const __m128i eight = _mm_set1_epi16(8);
769     const __m128i four = _mm_set1_epi16(4);
770     __m128i sum_p6;
771     __m128i sum_p3;
772 
773     pq_16[0] = _mm_unpacklo_epi8(*q0p0, zero);
774     pq_16[1] = _mm_unpacklo_epi8(*q1p1, zero);
775     pq_16[2] = _mm_unpacklo_epi8(*q2p2, zero);
776     pq_16[3] = _mm_unpacklo_epi8(*q3p3, zero);
777     pq_16[4] = _mm_unpacklo_epi8(*q4p4, zero);
778     pq_16[5] = _mm_unpacklo_epi8(*q5p5, zero);
779     pq_16[6] = _mm_unpacklo_epi8(*q6p6, zero);
780     q0_16 = _mm_srli_si128(pq_16[0], 8);
781     q1_16 = _mm_srli_si128(pq_16[1], 8);
782     q2_16 = _mm_srli_si128(pq_16[2], 8);
783     q3_16 = _mm_srli_si128(pq_16[3], 8);
784     q4_16 = _mm_srli_si128(pq_16[4], 8);
785     q5_16 = _mm_srli_si128(pq_16[5], 8);
786 
787     __m128i flat_p[3], flat_q[3];
788     __m128i flat2_p[6], flat2_q[6];
789 
790     __m128i work0, work0_0, work0_1, sum_p_0;
791     __m128i sum_p = _mm_add_epi16(pq_16[5], _mm_add_epi16(pq_16[4], pq_16[3]));
792     __m128i sum_lp = _mm_add_epi16(pq_16[0], _mm_add_epi16(pq_16[2], pq_16[1]));
793     sum_p = _mm_add_epi16(sum_p, sum_lp);
794 
795     __m128i sum_lq = _mm_srli_si128(sum_lp, 8);
796     __m128i sum_q = _mm_srli_si128(sum_p, 8);
797 
798     sum_p_0 = _mm_add_epi16(eight, _mm_add_epi16(sum_p, sum_q));
799     sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq));
800 
801     flat_p[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(pq_16[3], pq_16[0]));
802     flat_q[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(q3_16, q0_16));
803 
804     sum_p6 = _mm_add_epi16(pq_16[6], pq_16[6]);
805     sum_p3 = _mm_add_epi16(pq_16[3], pq_16[3]);
806 
807     sum_q = _mm_sub_epi16(sum_p_0, pq_16[5]);
808     sum_p = _mm_sub_epi16(sum_p_0, q5_16);
809 
810     work0_0 = _mm_add_epi16(_mm_add_epi16(pq_16[6], pq_16[0]), pq_16[1]);
811     work0_1 = _mm_add_epi16(
812         sum_p6, _mm_add_epi16(pq_16[1], _mm_add_epi16(pq_16[2], pq_16[0])));
813 
814     sum_lq = _mm_sub_epi16(sum_lp, pq_16[2]);
815     sum_lp = _mm_sub_epi16(sum_lp, q2_16);
816 
817     work0 = _mm_add_epi16(sum_p3, pq_16[1]);
818     flat_p[1] = _mm_add_epi16(sum_lp, work0);
819     flat_q[1] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8));
820 
821     flat_pq[0] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[0], flat_q[0]), 3);
822     flat_pq[1] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[1], flat_q[1]), 3);
823     flat_pq[0] = _mm_packus_epi16(flat_pq[0], flat_pq[0]);
824     flat_pq[1] = _mm_packus_epi16(flat_pq[1], flat_pq[1]);
825 
826     sum_lp = _mm_sub_epi16(sum_lp, q1_16);
827     sum_lq = _mm_sub_epi16(sum_lq, pq_16[1]);
828 
829     sum_p3 = _mm_add_epi16(sum_p3, pq_16[3]);
830     work0 = _mm_add_epi16(sum_p3, pq_16[2]);
831 
832     flat_p[2] = _mm_add_epi16(sum_lp, work0);
833     flat_q[2] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8));
834     flat_pq[2] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[2], flat_q[2]), 3);
835     flat_pq[2] = _mm_packus_epi16(flat_pq[2], flat_pq[2]);
836 
837     // ~~~~~~~~~~~~~~~~~~~~~~~~~~~ flat 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
838     flat2 = _mm_max_epu8(abs_diff(*q4p4, *q0p0), abs_diff(*q5p5, *q0p0));
839 
840     work = abs_diff(*q6p6, *q0p0);
841     flat2 = _mm_max_epu8(work, flat2);
842     flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 4));
843     flat2 = _mm_subs_epu8(flat2, one);
844     flat2 = _mm_cmpeq_epi8(flat2, zero);
845     flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
846     flat2 = _mm_unpacklo_epi32(flat2, flat2);
847 
848     // ~~~~~~~~~~ apply flat ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
849     qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
850     flat_pq[0] = _mm_and_si128(flat, flat_pq[0]);
851     *q0p0 = _mm_or_si128(qs0ps0, flat_pq[0]);
852 
853     qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
854     flat_pq[1] = _mm_and_si128(flat, flat_pq[1]);
855     *q1p1 = _mm_or_si128(qs1ps1, flat_pq[1]);
856 
857     *q2p2 = _mm_andnot_si128(flat, *q2p2);
858     flat_pq[2] = _mm_and_si128(flat, flat_pq[2]);
859     *q2p2 = _mm_or_si128(*q2p2, flat_pq[2]);
860 
861     if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat2, zero))) {
862       flat2_p[0] = _mm_add_epi16(sum_p_0, _mm_add_epi16(work0_0, q0_16));
863       flat2_q[0] = _mm_add_epi16(
864           sum_p_0, _mm_add_epi16(_mm_srli_si128(work0_0, 8), pq_16[0]));
865 
866       flat2_p[1] = _mm_add_epi16(sum_p, work0_1);
867       flat2_q[1] = _mm_add_epi16(sum_q, _mm_srli_si128(work0_1, 8));
868 
869       flat2_pq[0] =
870           _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[0], flat2_q[0]), 4);
871       flat2_pq[1] =
872           _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[1], flat2_q[1]), 4);
873       flat2_pq[0] = _mm_packus_epi16(flat2_pq[0], flat2_pq[0]);
874       flat2_pq[1] = _mm_packus_epi16(flat2_pq[1], flat2_pq[1]);
875 
876       sum_p = _mm_sub_epi16(sum_p, q4_16);
877       sum_q = _mm_sub_epi16(sum_q, pq_16[4]);
878 
879       sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]);
880       work0 = _mm_add_epi16(
881           sum_p6, _mm_add_epi16(pq_16[2], _mm_add_epi16(pq_16[3], pq_16[1])));
882       flat2_p[2] = _mm_add_epi16(sum_p, work0);
883       flat2_q[2] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
884       flat2_pq[2] =
885           _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[2], flat2_q[2]), 4);
886       flat2_pq[2] = _mm_packus_epi16(flat2_pq[2], flat2_pq[2]);
887 
888       sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]);
889       sum_p = _mm_sub_epi16(sum_p, q3_16);
890       sum_q = _mm_sub_epi16(sum_q, pq_16[3]);
891 
892       work0 = _mm_add_epi16(
893           sum_p6, _mm_add_epi16(pq_16[3], _mm_add_epi16(pq_16[4], pq_16[2])));
894       flat2_p[3] = _mm_add_epi16(sum_p, work0);
895       flat2_q[3] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
896       flat2_pq[3] =
897           _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[3], flat2_q[3]), 4);
898       flat2_pq[3] = _mm_packus_epi16(flat2_pq[3], flat2_pq[3]);
899 
900       sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]);
901       sum_p = _mm_sub_epi16(sum_p, q2_16);
902       sum_q = _mm_sub_epi16(sum_q, pq_16[2]);
903 
904       work0 = _mm_add_epi16(
905           sum_p6, _mm_add_epi16(pq_16[4], _mm_add_epi16(pq_16[5], pq_16[3])));
906       flat2_p[4] = _mm_add_epi16(sum_p, work0);
907       flat2_q[4] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
908       flat2_pq[4] =
909           _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[4], flat2_q[4]), 4);
910       flat2_pq[4] = _mm_packus_epi16(flat2_pq[4], flat2_pq[4]);
911 
912       sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]);
913       sum_p = _mm_sub_epi16(sum_p, q1_16);
914       sum_q = _mm_sub_epi16(sum_q, pq_16[1]);
915 
916       work0 = _mm_add_epi16(
917           sum_p6, _mm_add_epi16(pq_16[5], _mm_add_epi16(pq_16[6], pq_16[4])));
918       flat2_p[5] = _mm_add_epi16(sum_p, work0);
919       flat2_q[5] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
920       flat2_pq[5] =
921           _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[5], flat2_q[5]), 4);
922       flat2_pq[5] = _mm_packus_epi16(flat2_pq[5], flat2_pq[5]);
923 
924       // wide flat
925       // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
926 
927       *q0p0 = _mm_andnot_si128(flat2, *q0p0);
928       flat2_pq[0] = _mm_and_si128(flat2, flat2_pq[0]);
929       *q0p0 = _mm_or_si128(*q0p0, flat2_pq[0]);
930 
931       *q1p1 = _mm_andnot_si128(flat2, *q1p1);
932       flat2_pq[1] = _mm_and_si128(flat2, flat2_pq[1]);
933       *q1p1 = _mm_or_si128(*q1p1, flat2_pq[1]);
934 
935       *q2p2 = _mm_andnot_si128(flat2, *q2p2);
936       flat2_pq[2] = _mm_and_si128(flat2, flat2_pq[2]);
937       *q2p2 = _mm_or_si128(*q2p2, flat2_pq[2]);
938 
939       *q3p3 = _mm_andnot_si128(flat2, *q3p3);
940       flat2_pq[3] = _mm_and_si128(flat2, flat2_pq[3]);
941       *q3p3 = _mm_or_si128(*q3p3, flat2_pq[3]);
942 
943       *q4p4 = _mm_andnot_si128(flat2, *q4p4);
944       flat2_pq[4] = _mm_and_si128(flat2, flat2_pq[4]);
945       *q4p4 = _mm_or_si128(*q4p4, flat2_pq[4]);
946 
947       *q5p5 = _mm_andnot_si128(flat2, *q5p5);
948       flat2_pq[5] = _mm_and_si128(flat2, flat2_pq[5]);
949       *q5p5 = _mm_or_si128(*q5p5, flat2_pq[5]);
950     }
951   } else {
952     *q0p0 = qs0ps0;
953     *q1p1 = qs1ps1;
954   }
955 }
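// Same filter as lpf_internal_14_dual_sse2() above, but for four columns:
// each qXpX register carries the p row in its low 32 bits and the q row in
// the next 32 bits, so the shuffles use 32-bit rather than 64-bit lanes and
// the flat/flat2 averages are kept in the small pq_16[] / flat_pq[] arrays.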
956 
957 void aom_lpf_horizontal_14_sse2(unsigned char *s, int p,
958                                 const unsigned char *_blimit,
959                                 const unsigned char *_limit,
960                                 const unsigned char *_thresh) {
961   __m128i q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0;
962   __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
963   __m128i limit = _mm_load_si128((const __m128i *)_limit);
964   __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
965 
966   q4p4 = _mm_unpacklo_epi32(xx_loadl_32(s - 5 * p), xx_loadl_32(s + 4 * p));
967   q3p3 = _mm_unpacklo_epi32(xx_loadl_32(s - 4 * p), xx_loadl_32(s + 3 * p));
968   q2p2 = _mm_unpacklo_epi32(xx_loadl_32(s - 3 * p), xx_loadl_32(s + 2 * p));
969   q1p1 = _mm_unpacklo_epi32(xx_loadl_32(s - 2 * p), xx_loadl_32(s + 1 * p));
970 
971   q0p0 = _mm_unpacklo_epi32(xx_loadl_32(s - 1 * p), xx_loadl_32(s - 0 * p));
972 
973   q5p5 = _mm_unpacklo_epi32(xx_loadl_32(s - 6 * p), xx_loadl_32(s + 5 * p));
974 
975   q6p6 = _mm_unpacklo_epi32(xx_loadl_32(s - 7 * p), xx_loadl_32(s + 6 * p));
976 
977   lpf_internal_14_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, &blimit,
978                        &limit, &thresh);
979 
980   store_buffer_horz_8(q0p0, p, 0, s);
981   store_buffer_horz_8(q1p1, p, 1, s);
982   store_buffer_horz_8(q2p2, p, 2, s);
983   store_buffer_horz_8(q3p3, p, 3, s);
984   store_buffer_horz_8(q4p4, p, 4, s);
985   store_buffer_horz_8(q5p5, p, 5, s);
986 }
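// Only the six rows on each side of the edge (s - 6 * p .. s + 5 * p) are
// rewritten; the outermost rows loaded into q6p6 (s - 7 * p and s + 6 * p)
// are read purely as inputs to the wide filter sums.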
987 
988 static AOM_FORCE_INLINE void lpf_internal_6_dual_sse2(
989     __m128i *p2, __m128i *q2, __m128i *p1, __m128i *q1, __m128i *p0,
990     __m128i *q0, __m128i *q1q0, __m128i *p1p0, __m128i *blimit, __m128i *limit,
991     __m128i *thresh) {
992   const __m128i zero = _mm_setzero_si128();
993   __m128i mask, hev, flat;
994   __m128i q2p2, q1p1, q0p0, flat_p1p0, flat_q0q1;
995   __m128i p2_16, q2_16, p1_16, q1_16, p0_16, q0_16;
996   __m128i ps1ps0, qs1qs0;
997 
998   q2p2 = _mm_unpacklo_epi64(*p2, *q2);
999   q1p1 = _mm_unpacklo_epi64(*p1, *q1);
1000   q0p0 = _mm_unpacklo_epi64(*p0, *q0);
1001 
1002   *p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
1003   *q1q0 = _mm_unpackhi_epi64(q0p0, q1p1);
1004 
1005   const __m128i one = _mm_set1_epi8(1);
1006   const __m128i fe = _mm_set1_epi8((char)0xfe);
1007   const __m128i ff = _mm_cmpeq_epi8(fe, fe);
1008 
1009   {
1010     // filter_mask and hev_mask
1011     __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
1012     abs_p1p0 = abs_diff(q1p1, q0p0);
1013     abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
1014 
1015     abs_p0q0 = abs_diff(*p1p0, *q1q0);
1016     abs_p1q1 = _mm_srli_si128(abs_p0q0, 8);
1017     abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, zero);
1018 
1019     // Since SSE2 has no unsigned byte comparison, the idea is to detect at
1020     // least one case where X > limit, which means the corresponding mask
1021     // bit must be set.
1022     // To achieve that we take the global maximum of all the abs(x - y)
1023     // inputs and of abs(p0 - q0) * 2 + abs(p1 - q1) / 2; if it is > limit
1024     // the mask is set, otherwise it is not.
1025 
1026     flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
1027     hev = _mm_subs_epu8(flat, *thresh);
1028     hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
1029     // replicate for the further "merged variables" usage
1030     hev = _mm_unpacklo_epi64(hev, hev);
1031 
1032     abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
1033     abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
1034     mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit);
1035     mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
1036     // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
1037     mask = _mm_max_epu8(abs_p1p0, mask);
1038     // mask |= (abs(p1 - p0) > limit) * -1;
1039     // mask |= (abs(q1 - q0) > limit) * -1;
1040 
1041     work = abs_diff(q2p2, q1p1);
1042     mask = _mm_max_epu8(work, mask);
1043     mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
1044     mask = _mm_subs_epu8(mask, *limit);
1045     mask = _mm_cmpeq_epi8(mask, zero);
1046 
1047     // lp filter - the same for 6, 8 and 14 versions
1048     filter4_dual_sse2(p1p0, q1q0, &hev, &mask, q1q0, p1p0);
1049 
1050     // flat_mask
1051     flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_p1p0);
1052     flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
1053     flat = _mm_subs_epu8(flat, one);
1054     flat = _mm_cmpeq_epi8(flat, zero);
1055     flat = _mm_and_si128(flat, mask);
1056     // replicate for the further "merged variables" usage
1057     flat = _mm_unpacklo_epi64(flat, flat);
1058   }
1059 
1060   // 5 tap filter
1061   // need it only if flat !=0
1062   if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
1063     const __m128i four = _mm_set1_epi16(4);
1064     __m128i workp_a, workp_b, workp_shft0, workp_shft1;
1065     p2_16 = _mm_unpacklo_epi8(*p2, zero);
1066     p1_16 = _mm_unpacklo_epi8(*p1, zero);
1067     p0_16 = _mm_unpacklo_epi8(*p0, zero);
1068     q0_16 = _mm_unpacklo_epi8(*q0, zero);
1069     q1_16 = _mm_unpacklo_epi8(*q1, zero);
1070     q2_16 = _mm_unpacklo_epi8(*q2, zero);
1071 
1072     // op1
1073     workp_a = _mm_add_epi16(_mm_add_epi16(p0_16, p0_16),
1074                             _mm_add_epi16(p1_16, p1_16));  // p0 *2 + p1 * 2
1075     workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four),
1076                             p2_16);  // p2 + p0 * 2 + p1 * 2 + 4
1077 
1078     workp_b = _mm_add_epi16(_mm_add_epi16(p2_16, p2_16), q0_16);
1079     workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b),
1080                                  3);  // p2 * 3 + p1 * 2 + p0 * 2 + q0 + 4
1081 
1082     // op0
1083     workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, q0_16), q1_16);  // q0 * 2 + q1
1084     workp_a = _mm_add_epi16(workp_a,
1085                             workp_b);  // p2 + p0 * 2 + p1 * 2 + q0 * 2 + q1 + 4
1086     workp_shft1 = _mm_srli_epi16(workp_a, 3);
1087 
1088     flat_p1p0 = _mm_packus_epi16(workp_shft1, workp_shft0);
1089 
1090     // oq0
1091     workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, p2_16),
1092                             p1_16);  // p0 * 2 + p1  + q0 * 2 + q1 + 4
1093     workp_b = _mm_add_epi16(q1_16, q2_16);
1094     workp_a = _mm_add_epi16(
1095         workp_a, workp_b);  // p0 * 2 + p1  + q0 * 2 + q1 * 2 + q2 + 4
1096     workp_shft0 = _mm_srli_epi16(workp_a, 3);
1097 
1098     // oq1
1099     workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, p1_16),
1100                             p0_16);  // p0   + q0 * 2 + q1 * 2 + q2 + 4
1101     workp_b = _mm_add_epi16(q2_16, q2_16);
1102     workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b),
1103                                  3);  // p0  + q0 * 2 + q1 * 2 + q2 * 3 + 4
1104 
1105     flat_q0q1 = _mm_packus_epi16(workp_shft0, workp_shft1);
1106 
1107     qs1qs0 = _mm_andnot_si128(flat, *q1q0);
1108     *q1q0 = _mm_and_si128(flat, flat_q0q1);
1109     *q1q0 = _mm_or_si128(qs1qs0, *q1q0);
1110 
1111     ps1ps0 = _mm_andnot_si128(flat, *p1p0);
1112     *p1p0 = _mm_and_si128(flat, flat_p1p0);
1113     *p1p0 = _mm_or_si128(ps1ps0, *p1p0);
1114   }
1115 }
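// Scalar form of the 5-tap averages applied above when "flat" is set,
// collected from the per-step comments:
//   op1 = (p2 * 3 + p1 * 2 + p0 * 2 + q0 + 4) >> 3
//   op0 = (p2 + p1 * 2 + p0 * 2 + q0 * 2 + q1 + 4) >> 3
//   oq0 = (p1 + p0 * 2 + q0 * 2 + q1 * 2 + q2 + 4) >> 3
//   oq1 = (p0 + q0 * 2 + q1 * 2 + q2 * 3 + 4) >> 3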
1116 
1117 static AOM_FORCE_INLINE void lpf_internal_6_sse2(
1118     __m128i *p2, __m128i *q2, __m128i *p1, __m128i *q1, __m128i *p0,
1119     __m128i *q0, __m128i *q1q0, __m128i *p1p0, __m128i *blimit, __m128i *limit,
1120     __m128i *thresh) {
1121   const __m128i zero = _mm_setzero_si128();
1122   __m128i mask, hev, flat;
1123   __m128i q2p2, q1p1, q0p0, flat_p1p0, flat_q0q1;
1124   __m128i pq2_16, q2_16, pq1_16, pq0_16, q0_16;
1125   __m128i ps1ps0, qs1qs0;
1126 
1127   q2p2 = _mm_unpacklo_epi32(*p2, *q2);
1128   q1p1 = _mm_unpacklo_epi32(*p1, *q1);
1129   q0p0 = _mm_unpacklo_epi32(*p0, *q0);
1130 
1131   *p1p0 = _mm_unpacklo_epi32(*p0, *p1);
1132   *q1q0 = _mm_unpacklo_epi32(*q0, *q1);
1133 
1134   const __m128i one = _mm_set1_epi8(1);
1135   const __m128i fe = _mm_set1_epi8((char)0xfe);
1136   const __m128i ff = _mm_cmpeq_epi8(fe, fe);
1137   {
1138     // filter_mask and hev_mask
1139     __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
1140     abs_p1p0 = abs_diff(q1p1, q0p0);
1141     abs_q1q0 = _mm_srli_si128(abs_p1p0, 4);
1142 
1143     abs_p0q0 = abs_diff(*p1p0, *q1q0);
1144     abs_p1q1 = _mm_srli_si128(abs_p0q0, 4);
1145 
1146     // Since SSE2 has no unsigned byte comparison, the idea is to detect at
1147     // least one case where X > limit, which means the corresponding mask
1148     // bit must be set.
1149     // To achieve that we take the global maximum of all the abs(x - y)
1150     // inputs and of abs(p0 - q0) * 2 + abs(p1 - q1) / 2; if it is > limit
1151     // the mask is set, otherwise it is not.
1152 
1153     flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
1154     hev = _mm_subs_epu8(flat, *thresh);
1155     hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
1156     // replicate for the further "merged variables" usage
1157     hev = _mm_unpacklo_epi32(hev, hev);
1158 
1159     abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
1160     abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
1161     mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit);
1162     mask = _mm_unpacklo_epi32(mask, zero);
1163     mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
1164     // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
1165     mask = _mm_max_epu8(abs_p1p0, mask);
1166     // mask |= (abs(p1 - p0) > limit) * -1;
1167     // mask |= (abs(q1 - q0) > limit) * -1;
1168 
1169     work = abs_diff(q2p2, q1p1);
1170     mask = _mm_max_epu8(work, mask);
1171     mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 4));
1172     mask = _mm_subs_epu8(mask, *limit);
1173     mask = _mm_cmpeq_epi8(mask, zero);
1174 
1175     // lp filter - the same for 6, 8 and 14 versions
1176     filter4_sse2(p1p0, q1q0, &hev, &mask, q1q0, p1p0);
1177 
1178     // flat_mask
1179     flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_p1p0);
1180     flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4));
1181     flat = _mm_subs_epu8(flat, one);
1182     flat = _mm_cmpeq_epi8(flat, zero);
1183     flat = _mm_and_si128(flat, mask);
1184     // replicate for the further "merged variables" usage
1185     flat = _mm_unpacklo_epi32(flat, flat);
1186     flat = _mm_unpacklo_epi64(flat, flat);
1187   }
1188 
1189   // 5 tap filter
1190   // need it only if flat !=0
1191   if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
1192     const __m128i four = _mm_set1_epi16(4);
1193     __m128i workp_a, workp_b, workp_c;
1194     __m128i pq0x2_pq1, pq1_pq2;
1195     pq2_16 = _mm_unpacklo_epi8(q2p2, zero);
1196     pq1_16 = _mm_unpacklo_epi8(q1p1, zero);
1197     pq0_16 = _mm_unpacklo_epi8(q0p0, zero);
1198     q0_16 = _mm_srli_si128(pq0_16, 8);
1199     q2_16 = _mm_srli_si128(pq2_16, 8);
1200 
1201     // op1
1202     pq0x2_pq1 =
1203         _mm_add_epi16(_mm_add_epi16(pq0_16, pq0_16), pq1_16);  // p0 *2 + p1
1204     pq1_pq2 = _mm_add_epi16(pq1_16, pq2_16);                   // p1 + p2
1205     workp_a = _mm_add_epi16(_mm_add_epi16(pq0x2_pq1, four),
1206                             pq1_pq2);  // p2 + p0 * 2 + p1 * 2 + 4
1207 
1208     workp_b = _mm_add_epi16(_mm_add_epi16(pq2_16, pq2_16), q0_16);
1209     workp_b =
1210         _mm_add_epi16(workp_a, workp_b);  // p2 * 3 + p1 * 2 + p0 * 2 + q0 + 4
1211 
1212     // op0
1213     workp_c = _mm_srli_si128(pq0x2_pq1, 8);  // q0 * 2 + q1
1214     workp_a = _mm_add_epi16(workp_a,
1215                             workp_c);  // p2 + p0 * 2 + p1 * 2 + q0 * 2 + q1 + 4
1216     workp_b = _mm_unpacklo_epi64(workp_a, workp_b);
1217     workp_b = _mm_srli_epi16(workp_b, 3);
1218 
1219     flat_p1p0 = _mm_packus_epi16(workp_b, workp_b);
1220 
1221     // oq0
1222     workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq2_16),
1223                             pq1_16);  // p0 * 2 + p1  + q0 * 2 + q1 + 4
1224     workp_b = _mm_srli_si128(pq1_pq2, 8);
1225     workp_a = _mm_add_epi16(
1226         workp_a, workp_b);  // p0 * 2 + p1  + q0 * 2 + q1 * 2 + q2 + 4
1227     // workp_shft0 = _mm_srli_epi16(workp_a, 3);
1228 
1229     // oq1
1230     workp_c = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq1_16),
1231                             pq0_16);  // p0   + q0 * 2 + q1 * 2 + q2 + 4
1232     workp_b = _mm_add_epi16(q2_16, q2_16);
1233     workp_b =
1234         _mm_add_epi16(workp_c, workp_b);  // p0  + q0 * 2 + q1 * 2 + q2 * 3 + 4
1235 
1236     workp_a = _mm_unpacklo_epi64(workp_a, workp_b);
1237     workp_a = _mm_srli_epi16(workp_a, 3);
1238 
1239     flat_q0q1 = _mm_packus_epi16(workp_a, workp_a);
1240 
1241     qs1qs0 = _mm_andnot_si128(flat, *q1q0);
1242     *q1q0 = _mm_and_si128(flat, flat_q0q1);
1243     *q1q0 = _mm_or_si128(qs1qs0, *q1q0);
1244 
1245     ps1ps0 = _mm_andnot_si128(flat, *p1p0);
1246     *p1p0 = _mm_and_si128(flat, flat_p1p0);
1247     *p1p0 = _mm_or_si128(ps1ps0, *p1p0);
1248   }
1249 }
1250 
1251 void aom_lpf_horizontal_6_sse2(unsigned char *s, int p,
1252                                const unsigned char *_blimit,
1253                                const unsigned char *_limit,
1254                                const unsigned char *_thresh) {
1255   __m128i p2, p1, p0, q0, q1, q2;
1256   __m128i p1p0, q1q0;
1257   __m128i blimit = _mm_load_si128((__m128i *)_blimit);
1258   __m128i limit = _mm_load_si128((__m128i *)_limit);
1259   __m128i thresh = _mm_load_si128((__m128i *)_thresh);
1260 
1261   p2 = xx_loadl_32(s - 3 * p);
1262   p1 = xx_loadl_32(s - 2 * p);
1263   p0 = xx_loadl_32(s - 1 * p);
1264   q0 = xx_loadl_32(s - 0 * p);
1265   q1 = xx_loadl_32(s + 1 * p);
1266   q2 = xx_loadl_32(s + 2 * p);
1267 
1268   lpf_internal_6_sse2(&p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, &blimit,
1269                       &limit, &thresh);
1270 
1271   xx_storel_32(s - 1 * p, p1p0);
1272   xx_storel_32(s - 2 * p, _mm_srli_si128(p1p0, 4));
1273   xx_storel_32(s + 0 * p, q1q0);
1274   xx_storel_32(s + 1 * p, _mm_srli_si128(q1q0, 4));
1275 }
1276 
1277 void aom_lpf_horizontal_6_dual_sse2(unsigned char *s, int p,
1278                                     const unsigned char *_blimit0,
1279                                     const unsigned char *_limit0,
1280                                     const unsigned char *_thresh0,
1281                                     const unsigned char *_blimit1,
1282                                     const unsigned char *_limit1,
1283                                     const unsigned char *_thresh1) {
1284   __m128i blimit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_blimit0),
1285                                       _mm_load_si128((__m128i *)_blimit1));
1286   __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_limit0),
1287                                      _mm_load_si128((__m128i *)_limit1));
1288   __m128i thresh = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_thresh0),
1289                                       _mm_load_si128((__m128i *)_thresh1));
1290 
1291   __m128i p2, p1, p0, q0, q1, q2;
1292   __m128i p1p0, q1q0;
1293 
1294   p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
1295   p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
1296   p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
1297   q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p));
1298   q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
1299   q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p));
1300 
1301   lpf_internal_6_dual_sse2(&p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, &blimit,
1302                            &limit, &thresh);
1303 
1304   _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0);
1305   _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8));
1306   _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0);
1307   _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8));
1308 }
1309 
1310 static AOM_FORCE_INLINE void lpf_internal_8_sse2(
1311     __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1,
1312     __m128i *q1, __m128i *p0, __m128i *q0, __m128i *q1q0_out, __m128i *p1p0_out,
1313     __m128i *blimit, __m128i *limit, __m128i *thresh) {
1314   const __m128i zero = _mm_setzero_si128();
1315   __m128i mask, hev, flat;
1316   __m128i p2_16, q2_16, p1_16, p0_16, q0_16, q1_16, p3_16, q3_16, q3p3,
1317       flat_p1p0, flat_q0q1;
1318   __m128i q2p2, q1p1, q0p0;
1319   __m128i q1q0, p1p0, ps1ps0, qs1qs0;
1320   __m128i work_pq, opq2, pq2;
1321 
1322   q3p3 = _mm_unpacklo_epi32(*p3, *q3);
1323   q2p2 = _mm_unpacklo_epi32(*p2, *q2);
1324   q1p1 = _mm_unpacklo_epi32(*p1, *q1);
1325   q0p0 = _mm_unpacklo_epi32(*p0, *q0);
1326 
1327   p1p0 = _mm_unpacklo_epi32(q0p0, q1p1);  // p1p0 q1q0
1328   q1q0 = _mm_srli_si128(p1p0, 8);
1329 
1330   // filter_mask and hev_mask
1331 
1332   // SSE2 has no unsigned byte comparison, so instead of testing each
1333   // difference against its limit directly we take the running maximum of
1334   // all the abs(x - y) terms (the blimit test on
1335   // abs(p0 - q0) * 2 + abs(p1 - q1) / 2 is folded in as 0xff/0x00).
1336   // One saturating subtract of the limit and a compare with zero then
1337   // give the final mask: 0xff where filtering is allowed.
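  // For example, a lane where the running max is 20 and *limit is 16 gives
  // _mm_subs_epu8() = 4, so _mm_cmpeq_epi8(..., zero) leaves 0x00 there and
  // that pixel is not filtered; lanes at or below the limit saturate to 0
  // and become 0xff.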
1338 
1339   const __m128i one = _mm_set1_epi8(1);
1340   const __m128i fe = _mm_set1_epi8((char)0xfe);
1341   const __m128i ff = _mm_cmpeq_epi8(fe, fe);
1342   __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
1343 
1344   abs_p1p0 = abs_diff(q1p1, q0p0);
1345   abs_q1q0 = _mm_srli_si128(abs_p1p0, 4);
1346 
1347   abs_p0q0 = abs_diff(p1p0, q1q0);
1348   abs_p1q1 = _mm_srli_si128(abs_p0q0, 4);
1349 
1350   flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
1351   hev = _mm_subs_epu8(flat, *thresh);
1352   hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
1353   // replicate for the further "merged variables" usage
1354   hev = _mm_unpacklo_epi32(hev, hev);
1355 
1356   abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
1357   abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
1358   mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit);
1359   mask = _mm_unpacklo_epi32(mask, zero);
1360   mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
1361   // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
1362   mask = _mm_max_epu8(abs_p1p0, mask);
1363   // mask |= (abs(p1 - p0) > limit) * -1;
1364   // mask |= (abs(q1 - q0) > limit) * -1;
1365 
1366   work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2));
1367 
1368   mask = _mm_max_epu8(work, mask);
1369   mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 4));
1370   mask = _mm_subs_epu8(mask, *limit);
1371   mask = _mm_cmpeq_epi8(mask, zero);
1372 
1373   // lp filter - the same for 6, 8 and 14 versions
1374   filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out);
1375 
1376   // flat_mask4
1377   flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0));
1378   flat = _mm_max_epu8(abs_p1p0, flat);
1379 
1380   flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4));
1381   flat = _mm_subs_epu8(flat, one);
1382   flat = _mm_cmpeq_epi8(flat, zero);
1383   flat = _mm_and_si128(flat, mask);
1384   // replicate for the further "merged variables" usage
1385   flat = _mm_unpacklo_epi32(flat, flat);
1386   flat = _mm_unpacklo_epi64(flat, flat);
1387 
1388   // filter8 is needed only if flat != 0
1389   if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
1390     const __m128i four = _mm_set1_epi16(4);
1391     __m128i workp_a, workp_b, workp_c, workp_d, workp_shft1, workp_shft2;
1392     p2_16 = _mm_unpacklo_epi8(*p2, zero);
1393     p1_16 = _mm_unpacklo_epi8(*p1, zero);
1394     p0_16 = _mm_unpacklo_epi8(*p0, zero);
1395     q0_16 = _mm_unpacklo_epi8(*q0, zero);
1396     q1_16 = _mm_unpacklo_epi8(*q1, zero);
1397     q2_16 = _mm_unpacklo_epi8(*q2, zero);
1398     p3_16 = _mm_unpacklo_epi8(*p3, zero);
1399     q3_16 = _mm_unpacklo_epi8(*q3, zero);
1400 
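    // Unlike the dual variant further below, the p- and q-side sums stay
    // merged here: pairs of 16-bit sums are joined with _mm_unpacklo_epi64()
    // and then shifted and packed together.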
1401     // op2
1402     workp_a =
1403         _mm_add_epi16(_mm_add_epi16(p3_16, p3_16), _mm_add_epi16(p2_16, p1_16));
1404     workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0_16);
1405     workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, p2_16), p3_16);
1406     workp_shft2 = _mm_add_epi16(workp_a, workp_b);
1407 
1408     // op1
1409     workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, q1_16), p1_16);
1410     workp_c = _mm_add_epi16(workp_a, workp_b);
1411     // workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
1412 
1413     // op0
1414     workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q2_16);
1415     workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1_16), p0_16);
1416     workp_d = _mm_add_epi16(workp_a, workp_b);
1417     // workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
1418 
1419     workp_c = _mm_unpacklo_epi64(workp_d, workp_c);
1420     workp_c = _mm_srli_epi16(workp_c, 3);
1421     flat_p1p0 = _mm_packus_epi16(workp_c, workp_c);
1422 
1423     // oq0
1424     workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q3_16);
1425     workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0_16), q0_16);
1426     // workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
1427     workp_c = _mm_add_epi16(workp_a, workp_b);
1428 
1429     // oq1
1430     workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2_16), q3_16);
1431     workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0_16), q1_16);
1432     workp_d = _mm_add_epi16(workp_a, workp_b);
1433     // workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
1434 
1435     workp_c = _mm_unpacklo_epi64(workp_c, workp_d);
1436     workp_c = _mm_srli_epi16(workp_c, 3);
1437     flat_q0q1 = _mm_packus_epi16(workp_c, workp_c);
1438 
1439     // oq2
1440     workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1_16), q3_16);
1441     workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1_16), q2_16);
1442     workp_shft1 = _mm_add_epi16(workp_a, workp_b);
1443 
1444     workp_c = _mm_unpacklo_epi64(workp_shft2, workp_shft1);
1445     workp_c = _mm_srli_epi16(workp_c, 3);
1446 
1447     opq2 = _mm_packus_epi16(workp_c, workp_c);
1448 
1449     work_pq = _mm_andnot_si128(flat, q2p2);
1450     pq2 = _mm_and_si128(flat, opq2);
1451     *p2 = _mm_or_si128(work_pq, pq2);
1452     *q2 = _mm_srli_si128(*p2, 4);
1453 
1454     qs1qs0 = _mm_andnot_si128(flat, *q1q0_out);
1455     q1q0 = _mm_and_si128(flat, flat_q0q1);
1456     *q1q0_out = _mm_or_si128(qs1qs0, q1q0);
1457 
1458     ps1ps0 = _mm_andnot_si128(flat, *p1p0_out);
1459     p1p0 = _mm_and_si128(flat, flat_p1p0);
1460     *p1p0_out = _mm_or_si128(ps1ps0, p1p0);
1461   }
1462 }
1463 
1464 static AOM_FORCE_INLINE void lpf_internal_8_dual_sse2(
1465     __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1,
1466     __m128i *q1, __m128i *p0, __m128i *q0, __m128i *q1q0_out, __m128i *p1p0_out,
1467     __m128i *blimit, __m128i *limit, __m128i *thresh) {
1468   const __m128i zero = _mm_setzero_si128();
1469   __m128i mask, hev, flat;
1470   __m128i p2_16, q2_16, p1_16, p0_16, q0_16, q1_16, p3_16, q3_16, q3p3,
1471       flat_p1p0, flat_q0q1;
1472   __m128i q2p2, q1p1, q0p0;
1473   __m128i q1q0, p1p0, ps1ps0, qs1qs0;
1474   __m128i work_pq, opq2, pq2;
1475 
1476   q3p3 = _mm_unpacklo_epi64(*p3, *q3);
1477   q2p2 = _mm_unpacklo_epi64(*p2, *q2);
1478   q1p1 = _mm_unpacklo_epi64(*p1, *q1);
1479   q0p0 = _mm_unpacklo_epi64(*p0, *q0);
1480 
1481   p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
1482   q1q0 = _mm_unpackhi_epi64(q0p0, q1p1);
1483 
1484   {
1485     // filter_mask and hev_mask
1486 
1487     // SSE2 has no unsigned byte comparison, so instead of testing each
1488     // difference against its limit directly we take the running maximum of
1489     // all the abs(x - y) terms (the blimit test on
1490     // abs(p0 - q0) * 2 + abs(p1 - q1) / 2 is folded in as 0xff/0x00).
1491     // One saturating subtract of the limit and a compare with zero then
1492     // give the final mask: 0xff where filtering is allowed.
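    // In this dual variant each register carries two 8-pixel rows, so the
    // p/q halves are exchanged with 8-byte shifts and the masks are
    // replicated with epi64 unpacks instead of the epi32 ops used above.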
1493 
1494     const __m128i one = _mm_set1_epi8(1);
1495     const __m128i fe = _mm_set1_epi8((char)0xfe);
1496     const __m128i ff = _mm_cmpeq_epi8(fe, fe);
1497     __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
1498 
1499     abs_p1p0 = abs_diff(q1p1, q0p0);
1500     abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
1501 
1502     abs_p0q0 = abs_diff(p1p0, q1q0);
1503     abs_p1q1 = _mm_srli_si128(abs_p0q0, 8);
1504     abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, abs_p0q0);
1505 
1506     flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
1507     hev = _mm_subs_epu8(flat, *thresh);
1508     hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
1509     // replicate for the further "merged variables" usage
1510     hev = _mm_unpacklo_epi64(hev, hev);
1511 
1512     abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
1513     abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
1514     mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit);
1515     mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
1516     // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
1517     mask = _mm_max_epu8(abs_p1p0, mask);
1518     // mask |= (abs(p1 - p0) > limit) * -1;
1519     // mask |= (abs(q1 - q0) > limit) * -1;
1520 
1521     work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2));
1522 
1523     mask = _mm_max_epu8(work, mask);
1524     mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
1525     mask = _mm_subs_epu8(mask, *limit);
1526     mask = _mm_cmpeq_epi8(mask, zero);
1527 
1528     // lp filter - the same for 6, 8 and 14 versions
1529     filter4_dual_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out);
1530 
1531     // flat_mask4
1532     flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0));
1533     flat = _mm_max_epu8(abs_p1p0, flat);
1534 
1535     flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
1536     flat = _mm_subs_epu8(flat, one);
1537     flat = _mm_cmpeq_epi8(flat, zero);
1538     flat = _mm_and_si128(flat, mask);
1539     // replicate for the further "merged variables" usage
1540     flat = _mm_unpacklo_epi64(flat, flat);
1541   }
1542 
1543   // filter8 is needed only if flat != 0
1544   if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
1545     const __m128i four = _mm_set1_epi16(4);
1546 
1547     __m128i workp_a, workp_b, workp_shft0, workp_shft1, workp_shft2;
1548     p2_16 = _mm_unpacklo_epi8(*p2, zero);
1549     p1_16 = _mm_unpacklo_epi8(*p1, zero);
1550     p0_16 = _mm_unpacklo_epi8(*p0, zero);
1551     q0_16 = _mm_unpacklo_epi8(*q0, zero);
1552     q1_16 = _mm_unpacklo_epi8(*q1, zero);
1553     q2_16 = _mm_unpacklo_epi8(*q2, zero);
1554     p3_16 = _mm_unpacklo_epi8(*p3, zero);
1555     q3_16 = _mm_unpacklo_epi8(*q3, zero);
1556 
1557     // op2
1558     workp_a =
1559         _mm_add_epi16(_mm_add_epi16(p3_16, p3_16), _mm_add_epi16(p2_16, p1_16));
1560     workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0_16);
1561     workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, p2_16), p3_16);
1562     workp_shft2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
1563 
1564     // op1
1565     workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, q1_16), p1_16);
1566     workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
1567 
1568     // op0
1569     workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q2_16);
1570     workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1_16), p0_16);
1571     workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
1572 
1573     flat_p1p0 = _mm_packus_epi16(workp_shft1, workp_shft0);
1574 
1575     // oq0
1576     workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q3_16);
1577     workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0_16), q0_16);
1578     workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
1579 
1580     // oq1
1581     workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2_16), q3_16);
1582     workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0_16), q1_16);
1583     workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
1584 
1585     flat_q0q1 = _mm_packus_epi16(workp_shft0, workp_shft1);
1586 
1587     // oq2
1588     workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1_16), q3_16);
1589     workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1_16), q2_16);
1590     workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
1591 
1592     opq2 = _mm_packus_epi16(workp_shft2, workp_shft1);
1593 
1594     work_pq = _mm_andnot_si128(flat, q2p2);
1595     pq2 = _mm_and_si128(flat, opq2);
1596     *p2 = _mm_or_si128(work_pq, pq2);
1597     *q2 = _mm_srli_si128(*p2, 8);
1598 
1599     qs1qs0 = _mm_andnot_si128(flat, *q1q0_out);
1600     q1q0 = _mm_and_si128(flat, flat_q0q1);
1601     *q1q0_out = _mm_or_si128(qs1qs0, q1q0);
1602 
1603     ps1ps0 = _mm_andnot_si128(flat, *p1p0_out);
1604     p1p0 = _mm_and_si128(flat, flat_p1p0);
1605     *p1p0_out = _mm_or_si128(ps1ps0, p1p0);
1606   }
1607 }
1608 
1609 void aom_lpf_horizontal_8_sse2(unsigned char *s, int p,
1610                                const unsigned char *_blimit,
1611                                const unsigned char *_limit,
1612                                const unsigned char *_thresh) {
1613   __m128i p3, p2, p1, p0, q0, q1, q2, q3;
1614   __m128i q1q0, p1p0;
1615   __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
1616   __m128i limit = _mm_load_si128((const __m128i *)_limit);
1617   __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
1618 
1619   p3 = xx_loadl_32(s - 4 * p);
1620   p2 = xx_loadl_32(s - 3 * p);
1621   p1 = xx_loadl_32(s - 2 * p);
1622   p0 = xx_loadl_32(s - 1 * p);
1623   q0 = xx_loadl_32(s - 0 * p);
1624   q1 = xx_loadl_32(s + 1 * p);
1625   q2 = xx_loadl_32(s + 2 * p);
1626   q3 = xx_loadl_32(s + 3 * p);
1627 
1628   lpf_internal_8_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0,
1629                       &blimit, &limit, &thresh);
1630 
1631   xx_storel_32(s - 1 * p, p1p0);
1632   xx_storel_32(s - 2 * p, _mm_srli_si128(p1p0, 4));
1633   xx_storel_32(s + 0 * p, q1q0);
1634   xx_storel_32(s + 1 * p, _mm_srli_si128(q1q0, 4));
1635   xx_storel_32(s - 3 * p, p2);
1636   xx_storel_32(s + 2 * p, q2);
1637 }
1638 
1639 void aom_lpf_horizontal_14_dual_sse2(unsigned char *s, int p,
1640                                      const unsigned char *_blimit0,
1641                                      const unsigned char *_limit0,
1642                                      const unsigned char *_thresh0,
1643                                      const unsigned char *_blimit1,
1644                                      const unsigned char *_limit1,
1645                                      const unsigned char *_thresh1) {
1646   __m128i q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0;
1647   __m128i blimit =
1648       _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0),
1649                          _mm_load_si128((const __m128i *)_blimit1));
1650   __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0),
1651                                      _mm_load_si128((const __m128i *)_limit1));
1652   __m128i thresh =
1653       _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_thresh0),
1654                          _mm_load_si128((const __m128i *)_thresh1));
1655 
1656   q4p4 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 5 * p)),
1657                             _mm_loadl_epi64((__m128i *)(s + 4 * p)));
1658   q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * p)),
1659                             _mm_loadl_epi64((__m128i *)(s + 3 * p)));
1660   q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)),
1661                             _mm_loadl_epi64((__m128i *)(s + 2 * p)));
1662   q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
1663                             _mm_loadl_epi64((__m128i *)(s + 1 * p)));
1664 
1665   q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
1666                             _mm_loadl_epi64((__m128i *)(s - 0 * p)));
1667 
1668   q5p5 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 6 * p)),
1669                             _mm_loadl_epi64((__m128i *)(s + 5 * p)));
1670 
1671   q6p6 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 7 * p)),
1672                             _mm_loadl_epi64((__m128i *)(s + 6 * p)));
1673 
1674   lpf_internal_14_dual_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0,
1675                             &blimit, &limit, &thresh);
1676 
1677   _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0);
1678   _mm_storel_epi64((__m128i *)(s + 0 * p), _mm_srli_si128(q0p0, 8));
1679   _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1);
1680   _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1p1, 8));
1681   _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2);
1682   _mm_storel_epi64((__m128i *)(s + 2 * p), _mm_srli_si128(q2p2, 8));
1683   _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3);
1684   _mm_storel_epi64((__m128i *)(s + 3 * p), _mm_srli_si128(q3p3, 8));
1685   _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4);
1686   _mm_storel_epi64((__m128i *)(s + 4 * p), _mm_srli_si128(q4p4, 8));
1687   _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5);
1688   _mm_storel_epi64((__m128i *)(s + 5 * p), _mm_srli_si128(q5p5, 8));
1689 }
1690 
1691 void aom_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
1692                                     const uint8_t *_limit0,
1693                                     const uint8_t *_thresh0,
1694                                     const uint8_t *_blimit1,
1695                                     const uint8_t *_limit1,
1696                                     const uint8_t *_thresh1) {
1697   __m128i blimit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_blimit0),
1698                                       _mm_load_si128((__m128i *)_blimit1));
1699   __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_limit0),
1700                                      _mm_load_si128((__m128i *)_limit1));
1701   __m128i thresh = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_thresh0),
1702                                       _mm_load_si128((__m128i *)_thresh1));
1703 
1704   __m128i p2, p1, p0, q0, q1, q2, p3, q3;
1705   __m128i q1q0, p1p0;
1706 
1707   p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
1708   p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
1709   p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
1710   p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
1711   q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p));
1712   q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
1713   q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p));
1714   q3 = _mm_loadl_epi64((__m128i *)(s + 3 * p));
1715 
1716   lpf_internal_8_dual_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0,
1717                            &blimit, &limit, &thresh);
1718 
1719   _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0);
1720   _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8));
1721   _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0);
1722   _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8));
1723   _mm_storel_epi64((__m128i *)(s - 3 * p), p2);
1724   _mm_storel_epi64((__m128i *)(s + 2 * p), q2);
1725 }
1726 
1727 void aom_lpf_horizontal_4_dual_sse2(unsigned char *s, int p,
1728                                     const unsigned char *_blimit0,
1729                                     const unsigned char *_limit0,
1730                                     const unsigned char *_thresh0,
1731                                     const unsigned char *_blimit1,
1732                                     const unsigned char *_limit1,
1733                                     const unsigned char *_thresh1) {
1734   __m128i p1, p0, q0, q1;
1735   __m128i qs1qs0, ps1ps0;
1736 
1737   p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
1738   p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
1739   q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p));
1740   q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
1741 
1742   const __m128i zero = _mm_setzero_si128();
1743   const __m128i blimit =
1744       _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0),
1745                          _mm_load_si128((const __m128i *)_blimit1));
1746   const __m128i limit =
1747       _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0),
1748                          _mm_load_si128((const __m128i *)_limit1));
1749 
1750   __m128i l = _mm_unpacklo_epi64(blimit, limit);
1751 
1752   __m128i thresh0 =
1753       _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh0), zero);
1754 
1755   __m128i thresh1 =
1756       _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh1), zero);
1757 
1758   __m128i t = _mm_unpacklo_epi64(thresh0, thresh1);
1759 
1760   lpf_internal_4_dual_sse2(&p1, &p0, &q0, &q1, &l, &t, &qs1qs0, &ps1ps0);
1761 
1762   _mm_storel_epi64((__m128i *)(s - 1 * p), ps1ps0);
1763   _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(ps1ps0, 8));
1764   _mm_storel_epi64((__m128i *)(s + 0 * p), qs1qs0);
1765   _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(qs1qs0, 8));
1766 }
1767 
1768 void aom_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
1769                                   const uint8_t *_limit0,
1770                                   const uint8_t *_thresh0,
1771                                   const uint8_t *_blimit1,
1772                                   const uint8_t *_limit1,
1773                                   const uint8_t *_thresh1) {
1774   __m128i p0, q0, q1, p1;
1775   __m128i x0, x1, x2, x3, x4, x5, x6, x7;
1776   __m128i d0, d1, d2, d3, d4, d5, d6, d7;
1777   __m128i qs1qs0, ps1ps0;
1778 
1779   const __m128i zero = _mm_setzero_si128();
1780   const __m128i blimit =
1781       _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0),
1782                          _mm_load_si128((const __m128i *)_blimit1));
1783   const __m128i limit =
1784       _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0),
1785                          _mm_load_si128((const __m128i *)_limit1));
1786 
1787   __m128i l = _mm_unpacklo_epi64(blimit, limit);
1788 
1789   __m128i thresh0 =
1790       _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh0), zero);
1791 
1792   __m128i thresh1 =
1793       _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh1), zero);
1794 
1795   __m128i t = _mm_unpacklo_epi64(thresh0, thresh1);
1796 
1797   x0 = _mm_loadl_epi64((__m128i *)((s - 2)));
1798   x1 = _mm_loadl_epi64((__m128i *)((s - 2) + p));
1799   x2 = _mm_loadl_epi64((__m128i *)((s - 2) + 2 * p));
1800   x3 = _mm_loadl_epi64((__m128i *)((s - 2) + 3 * p));
1801   x4 = _mm_loadl_epi64((__m128i *)((s - 2) + 4 * p));
1802   x5 = _mm_loadl_epi64((__m128i *)((s - 2) + 5 * p));
1803   x6 = _mm_loadl_epi64((__m128i *)((s - 2) + 6 * p));
1804   x7 = _mm_loadl_epi64((__m128i *)((s - 2) + 7 * p));
1805 
1806   transpose8x8_low_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &p1, &p0, &q0,
1807                         &q1);
1808 
1809   lpf_internal_4_dual_sse2(&p1, &p0, &q0, &q1, &l, &t, &qs1qs0, &ps1ps0);
1810 
1811   p1 = _mm_srli_si128(ps1ps0, 8);
1812   q1 = _mm_srli_si128(qs1qs0, 8);
1813 
1814   transpose4x8_8x4_sse2(&p1, &ps1ps0, &qs1qs0, &q1, &d0, &d1, &d2, &d3, &d4,
1815                         &d5, &d6, &d7);
1816 
1817   xx_storel_32((s - 2 + 0 * p), d0);
1818   xx_storel_32((s - 2 + 1 * p), d1);
1819   xx_storel_32((s - 2 + 2 * p), d2);
1820   xx_storel_32((s - 2 + 3 * p), d3);
1821   xx_storel_32((s - 2 + 4 * p), d4);
1822   xx_storel_32((s - 2 + 5 * p), d5);
1823   xx_storel_32((s - 2 + 6 * p), d6);
1824   xx_storel_32((s - 2 + 7 * p), d7);
1825 }
1826 
1827 void aom_lpf_vertical_6_sse2(unsigned char *s, int p,
1828                              const unsigned char *_blimit,
1829                              const unsigned char *_limit,
1830                              const unsigned char *_thresh) {
1831   __m128i d0, d1, d2, d3, d4, d5, d6, d7;
1832   __m128i x2, x1, x0, x3;
1833   __m128i p0, q0;
1834   __m128i p1p0, q1q0;
1835   __m128i blimit = _mm_load_si128((__m128i *)_blimit);
1836   __m128i limit = _mm_load_si128((__m128i *)_limit);
1837   __m128i thresh = _mm_load_si128((__m128i *)_thresh);
1838 
1839   x3 = _mm_loadl_epi64((__m128i *)((s - 3) + 0 * p));
1840   x2 = _mm_loadl_epi64((__m128i *)((s - 3) + 1 * p));
1841   x1 = _mm_loadl_epi64((__m128i *)((s - 3) + 2 * p));
1842   x0 = _mm_loadl_epi64((__m128i *)((s - 3) + 3 * p));
1843 
1844   transpose4x8_8x4_sse2(&x3, &x2, &x1, &x0, &d0, &d1, &d2, &d3, &d4, &d5, &d6,
1845                         &d7);
1846 
1847   lpf_internal_6_sse2(&d0, &d5, &d1, &d4, &d2, &d3, &q1q0, &p1p0, &blimit,
1848                       &limit, &thresh);
1849 
1850   p0 = _mm_srli_si128(p1p0, 4);
1851   q0 = _mm_srli_si128(q1q0, 4);
1852 
1853   transpose4x8_8x4_low_sse2(&p0, &p1p0, &q1q0, &q0, &d0, &d1, &d2, &d3);
1854 
1855   xx_storel_32(s + 0 * p - 2, d0);
1856   xx_storel_32(s + 1 * p - 2, d1);
1857   xx_storel_32(s + 2 * p - 2, d2);
1858   xx_storel_32(s + 3 * p - 2, d3);
1859 }
1860 
1861 void aom_lpf_vertical_6_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
1862                                   const uint8_t *_limit0,
1863                                   const uint8_t *_thresh0,
1864                                   const uint8_t *_blimit1,
1865                                   const uint8_t *_limit1,
1866                                   const uint8_t *_thresh1) {
1867   __m128i blimit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_blimit0),
1868                                       _mm_load_si128((__m128i *)_blimit1));
1869   __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_limit0),
1870                                      _mm_load_si128((__m128i *)_limit1));
1871   __m128i thresh = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_thresh0),
1872                                       _mm_load_si128((__m128i *)_thresh1));
1873 
1874   __m128i d0, d1, d2, d3, d4, d5, d6, d7;
1875   __m128i x0, x1, x2, x3, x4, x5, x6, x7;
1876   __m128i p0, q0;
1877   __m128i p1p0, q1q0;
1878   __m128i d0d1, d2d3, d4d5, d6d7;
1879 
1880   x0 = _mm_loadl_epi64((__m128i *)((s - 3) + 0 * p));
1881   x1 = _mm_loadl_epi64((__m128i *)((s - 3) + 1 * p));
1882   x2 = _mm_loadl_epi64((__m128i *)((s - 3) + 2 * p));
1883   x3 = _mm_loadl_epi64((__m128i *)((s - 3) + 3 * p));
1884   x4 = _mm_loadl_epi64((__m128i *)((s - 3) + 4 * p));
1885   x5 = _mm_loadl_epi64((__m128i *)((s - 3) + 5 * p));
1886   x6 = _mm_loadl_epi64((__m128i *)((s - 3) + 6 * p));
1887   x7 = _mm_loadl_epi64((__m128i *)((s - 3) + 7 * p));
1888 
1889   transpose8x8_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &d0d1, &d2d3, &d4d5,
1890                     &d6d7);
1891 
1892   d1 = _mm_srli_si128(d0d1, 8);
1893   d3 = _mm_srli_si128(d2d3, 8);
1894   d5 = _mm_srli_si128(d4d5, 8);
1895   d7 = _mm_srli_si128(d6d7, 8);
1896 
1897   lpf_internal_6_dual_sse2(&d0d1, &d5, &d1, &d4d5, &d2d3, &d3, &q1q0, &p1p0,
1898                            &blimit, &limit, &thresh);
1899 
1900   p0 = _mm_srli_si128(p1p0, 8);
1901   q0 = _mm_srli_si128(q1q0, 8);
1902 
1903   transpose4x8_8x4_sse2(&p0, &p1p0, &q1q0, &q0, &d0, &d1, &d2, &d3, &d4, &d5,
1904                         &d6, &d7);
1905 
1906   xx_storel_32((s - 2 + 0 * p), d0);
1907   xx_storel_32((s - 2 + 1 * p), d1);
1908   xx_storel_32((s - 2 + 2 * p), d2);
1909   xx_storel_32((s - 2 + 3 * p), d3);
1910   xx_storel_32((s - 2 + 4 * p), d4);
1911   xx_storel_32((s - 2 + 5 * p), d5);
1912   xx_storel_32((s - 2 + 6 * p), d6);
1913   xx_storel_32((s - 2 + 7 * p), d7);
1914 }
1915 
1916 void aom_lpf_vertical_8_sse2(unsigned char *s, int p,
1917                              const unsigned char *_blimit,
1918                              const unsigned char *_limit,
1919                              const unsigned char *_thresh) {
1920   __m128i d0, d1, d2, d3, d4, d5, d6, d7;
1921 
1922   __m128i p0, q0;
1923   __m128i x2, x1, x0, x3;
1924   __m128i q1q0, p1p0;
1925   __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
1926   __m128i limit = _mm_load_si128((const __m128i *)_limit);
1927   __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
1928 
1929   x3 = _mm_loadl_epi64((__m128i *)((s - 4) + 0 * p));
1930   x2 = _mm_loadl_epi64((__m128i *)((s - 4) + 1 * p));
1931   x1 = _mm_loadl_epi64((__m128i *)((s - 4) + 2 * p));
1932   x0 = _mm_loadl_epi64((__m128i *)((s - 4) + 3 * p));
1933 
1934   transpose4x8_8x4_sse2(&x3, &x2, &x1, &x0, &d0, &d1, &d2, &d3, &d4, &d5, &d6,
1935                         &d7);
1936   // Loop filtering
1937   lpf_internal_8_sse2(&d0, &d7, &d1, &d6, &d2, &d5, &d3, &d4, &q1q0, &p1p0,
1938                       &blimit, &limit, &thresh);
1939 
1940   p0 = _mm_srli_si128(p1p0, 4);
1941   q0 = _mm_srli_si128(q1q0, 4);
1942 
1943   transpose8x8_low_sse2(&d0, &d1, &p0, &p1p0, &q1q0, &q0, &d6, &d7, &d0, &d1,
1944                         &d2, &d3);
1945 
1946   _mm_storel_epi64((__m128i *)(s - 4 + 0 * p), d0);
1947   _mm_storel_epi64((__m128i *)(s - 4 + 1 * p), d1);
1948   _mm_storel_epi64((__m128i *)(s - 4 + 2 * p), d2);
1949   _mm_storel_epi64((__m128i *)(s - 4 + 3 * p), d3);
1950 }
1951 
1952 void aom_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
1953                                   const uint8_t *_limit0,
1954                                   const uint8_t *_thresh0,
1955                                   const uint8_t *_blimit1,
1956                                   const uint8_t *_limit1,
1957                                   const uint8_t *_thresh1) {
1958   __m128i blimit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_blimit0),
1959                                       _mm_load_si128((__m128i *)_blimit1));
1960   __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_limit0),
1961                                      _mm_load_si128((__m128i *)_limit1));
1962   __m128i thresh = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_thresh0),
1963                                       _mm_load_si128((__m128i *)_thresh1));
1964 
1965   __m128i x0, x1, x2, x3, x4, x5, x6, x7;
1966   __m128i d1, d3, d5, d7;
1967   __m128i q1q0, p1p0;
1968   __m128i p1, q1;
1969   __m128i d0d1, d2d3, d4d5, d6d7;
1970 
1971   x0 = _mm_loadl_epi64((__m128i *)(s - 4 + 0 * p));
1972   x1 = _mm_loadl_epi64((__m128i *)(s - 4 + 1 * p));
1973   x2 = _mm_loadl_epi64((__m128i *)(s - 4 + 2 * p));
1974   x3 = _mm_loadl_epi64((__m128i *)(s - 4 + 3 * p));
1975   x4 = _mm_loadl_epi64((__m128i *)(s - 4 + 4 * p));
1976   x5 = _mm_loadl_epi64((__m128i *)(s - 4 + 5 * p));
1977   x6 = _mm_loadl_epi64((__m128i *)(s - 4 + 6 * p));
1978   x7 = _mm_loadl_epi64((__m128i *)(s - 4 + 7 * p));
1979 
1980   transpose8x8_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &d0d1, &d2d3, &d4d5,
1981                     &d6d7);
1982 
1983   d1 = _mm_srli_si128(d0d1, 8);
1984   d3 = _mm_srli_si128(d2d3, 8);
1985   d5 = _mm_srli_si128(d4d5, 8);
1986   d7 = _mm_srli_si128(d6d7, 8);
1987 
1988   lpf_internal_8_dual_sse2(&d0d1, &d7, &d1, &d6d7, &d2d3, &d5, &d3, &d4d5,
1989                            &q1q0, &p1p0, &blimit, &limit, &thresh);
1990 
1991   p1 = _mm_srli_si128(p1p0, 8);
1992   q1 = _mm_srli_si128(q1q0, 8);
1993 
1994   transpose8x8_sse2(&d0d1, &d1, &p1, &p1p0, &q1q0, &q1, &d6d7, &d7, &d0d1,
1995                     &d2d3, &d4d5, &d6d7);
1996 
1997   _mm_storel_epi64((__m128i *)(s - 4 + 0 * p), d0d1);
1998   _mm_storel_epi64((__m128i *)(s - 4 + 1 * p), _mm_srli_si128(d0d1, 8));
1999   _mm_storel_epi64((__m128i *)(s - 4 + 2 * p), d2d3);
2000   _mm_storel_epi64((__m128i *)(s - 4 + 3 * p), _mm_srli_si128(d2d3, 8));
2001   _mm_storel_epi64((__m128i *)(s - 4 + 4 * p), d4d5);
2002   _mm_storel_epi64((__m128i *)(s - 4 + 5 * p), _mm_srli_si128(d4d5, 8));
2003   _mm_storel_epi64((__m128i *)(s - 4 + 6 * p), d6d7);
2004   _mm_storel_epi64((__m128i *)(s - 4 + 7 * p), _mm_srli_si128(d6d7, 8));
2005 }
2006 
2007 void aom_lpf_vertical_14_sse2(unsigned char *s, int p,
2008                               const unsigned char *_blimit,
2009                               const unsigned char *_limit,
2010                               const unsigned char *_thresh) {
2011   __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0;
2012   __m128i x6, x5, x4, x3;
2013   __m128i pq0, pq1, pq2, pq3;
2014   __m128i blimit = _mm_load_si128((__m128i *)_blimit);
2015   __m128i limit = _mm_load_si128((__m128i *)_limit);
2016   __m128i thresh = _mm_load_si128((__m128i *)_thresh);
2017 
2018   x6 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * p));
2019   x5 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * p));
2020   x4 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * p));
2021   x3 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * p));
2022 
2023   transpose_pq_14_sse2(&x6, &x5, &x4, &x3, &q0p0, &q1p1, &q2p2, &q3p3, &q4p4,
2024                        &q5p5, &q6p6, &q7p7);
2025 
2026   lpf_internal_14_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, &blimit,
2027                        &limit, &thresh);
2028 
2029   transpose_pq_14_inv_sse2(&q7p7, &q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1,
2030                            &q0p0, &pq0, &pq1, &pq2, &pq3);
2031   _mm_storeu_si128((__m128i *)(s - 8 + 0 * p), pq0);
2032   _mm_storeu_si128((__m128i *)(s - 8 + 1 * p), pq1);
2033   _mm_storeu_si128((__m128i *)(s - 8 + 2 * p), pq2);
2034   _mm_storeu_si128((__m128i *)(s - 8 + 3 * p), pq3);
2035 }
2036 
2037 void aom_lpf_vertical_14_dual_sse2(
2038     unsigned char *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
2039     const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
2040     const uint8_t *_thresh1) {
2041   __m128i q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0;
2042   __m128i x7, x6, x5, x4, x3, x2, x1, x0;
2043   __m128i d0d1, d2d3, d4d5, d6d7, d8d9, d10d11, d12d13, d14d15;
2044   __m128i q0, q1, q2, q3, q7;
2045   __m128i p0p1, p2p3, p4p5, p6p7;
2046 
2047   __m128i blimit =
2048       _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0),
2049                          _mm_load_si128((const __m128i *)_blimit1));
2050   __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0),
2051                                      _mm_load_si128((const __m128i *)_limit1));
2052   __m128i thresh =
2053       _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_thresh0),
2054                          _mm_load_si128((const __m128i *)_thresh1));
2055 
2056   x7 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * p));
2057   x6 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * p));
2058   x5 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * p));
2059   x4 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * p));
2060   x3 = _mm_loadu_si128((__m128i *)((s - 8) + 4 * p));
2061   x2 = _mm_loadu_si128((__m128i *)((s - 8) + 5 * p));
2062   x1 = _mm_loadu_si128((__m128i *)((s - 8) + 6 * p));
2063   x0 = _mm_loadu_si128((__m128i *)((s - 8) + 7 * p));
2064 
2065   transpose8x16_16x8_sse2(&x7, &x6, &x5, &x4, &x3, &x2, &x1, &x0, &d0d1, &d2d3,
2066                           &d4d5, &d6d7, &d8d9, &d10d11, &d12d13, &d14d15);
2067 
2068   q6p6 = _mm_unpacklo_epi64(d2d3, _mm_srli_si128(d12d13, 8));
2069   q5p5 = _mm_unpacklo_epi64(d4d5, _mm_srli_si128(d10d11, 8));
2070   q4p4 = _mm_unpacklo_epi64(d6d7, _mm_srli_si128(d8d9, 8));
2071   q3p3 = _mm_unpacklo_epi64(d8d9, _mm_srli_si128(d6d7, 8));
2072   q2p2 = _mm_unpacklo_epi64(d10d11, _mm_srli_si128(d4d5, 8));
2073   q1p1 = _mm_unpacklo_epi64(d12d13, _mm_srli_si128(d2d3, 8));
2074   q0p0 = _mm_unpacklo_epi64(d14d15, _mm_srli_si128(d0d1, 8));
2075   q7 = _mm_srli_si128(d14d15, 8);
2076 
2077   lpf_internal_14_dual_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0,
2078                             &blimit, &limit, &thresh);
2079 
2080   x0 = _mm_srli_si128(q0p0, 8);
2081   x1 = _mm_srli_si128(q1p1, 8);
2082   x2 = _mm_srli_si128(q2p2, 8);
2083   x3 = _mm_srli_si128(q3p3, 8);
2084   x4 = _mm_srli_si128(q4p4, 8);
2085   x5 = _mm_srli_si128(q5p5, 8);
2086   x6 = _mm_srli_si128(q6p6, 8);
2087 
2088   transpose16x8_8x16_sse2(&d0d1, &q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1,
2089                           &q0p0, &x0, &x1, &x2, &x3, &x4, &x5, &x6, &q7, &p0p1,
2090                           &p2p3, &p4p5, &p6p7, &q0, &q1, &q2, &q3);
2091 
2092   _mm_storeu_si128((__m128i *)(s - 8 + 0 * p), p0p1);
2093   _mm_storeu_si128((__m128i *)(s - 8 + 1 * p), p2p3);
2094   _mm_storeu_si128((__m128i *)(s - 8 + 2 * p), p4p5);
2095   _mm_storeu_si128((__m128i *)(s - 8 + 3 * p), p6p7);
2096   _mm_storeu_si128((__m128i *)(s - 8 + 4 * p), q0);
2097   _mm_storeu_si128((__m128i *)(s - 8 + 5 * p), q1);
2098   _mm_storeu_si128((__m128i *)(s - 8 + 6 * p), q2);
2099   _mm_storeu_si128((__m128i *)(s - 8 + 7 * p), q3);
2100 }
2101 
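// Slides a running filter sum along by one output: adds taps *a1 and *a2 and
// subtracts taps *s1 and *s2 from *total.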
2102 static INLINE __m128i filter_add2_sub2(const __m128i *const total,
2103                                        const __m128i *const a1,
2104                                        const __m128i *const a2,
2105                                        const __m128i *const s1,
2106                                        const __m128i *const s2) {
2107   __m128i x = _mm_add_epi16(*a1, *total);
2108   x = _mm_add_epi16(_mm_sub_epi16(x, _mm_add_epi16(*s1, *s2)), *a2);
2109   return x;
2110 }
2111 
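// Finishes one filter8 output: (f8 >> 3) packed to bytes and selected where
// *flat is set; other lanes keep the narrow-filter result *other_filt.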
2112 static INLINE __m128i filter8_mask(const __m128i *const flat,
2113                                    const __m128i *const other_filt,
2114                                    const __m128i *const f8_lo,
2115                                    const __m128i *const f8_hi) {
2116   const __m128i f8 =
2117       _mm_packus_epi16(_mm_srli_epi16(*f8_lo, 3), _mm_srli_epi16(*f8_hi, 3));
2118   const __m128i result = _mm_and_si128(*flat, f8);
2119   return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result);
2120 }
2121 
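// Same blend as filter8_mask(), but for the wide filter whose sums carry a
// rounding divisor of 16, hence the shift by 4.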
2122 static INLINE __m128i filter16_mask(const __m128i *const flat,
2123                                     const __m128i *const other_filt,
2124                                     const __m128i *const f_lo,
2125                                     const __m128i *const f_hi) {
2126   const __m128i f =
2127       _mm_packus_epi16(_mm_srli_epi16(*f_lo, 4), _mm_srli_epi16(*f_hi, 4));
2128   const __m128i result = _mm_and_si128(*flat, f);
2129   return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result);
2130 }
2131 
2132 void aom_lpf_horizontal_14_quad_sse2(unsigned char *s, int p,
2133                                      const unsigned char *_blimit0,
2134                                      const unsigned char *_limit0,
2135                                      const unsigned char *_thresh0) {
2136   const __m128i zero = _mm_setzero_si128();
2137   const __m128i one = _mm_set1_epi8(1);
2138   const __m128i blimit_v = _mm_load_si128((const __m128i *)_blimit0);
2139   const __m128i limit_v = _mm_load_si128((const __m128i *)_limit0);
2140   const __m128i thresh_v = _mm_load_si128((const __m128i *)_thresh0);
2141   __m128i mask, hev, flat, flat2;
2142   __m128i p6, p5;
2143   __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
2144   __m128i q6, q5;
2145 
2146   __m128i op2, op1, op0, oq0, oq1, oq2;
2147 
2148   __m128i max_abs_p1p0q1q0;
2149 
2150   p6 = _mm_loadu_si128((__m128i *)(s - 7 * p));
2151   p5 = _mm_loadu_si128((__m128i *)(s - 6 * p));
2152   p4 = _mm_loadu_si128((__m128i *)(s - 5 * p));
2153   p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
2154   p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
2155   p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
2156   p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
2157   q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
2158   q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
2159   q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
2160   q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
2161   q4 = _mm_loadu_si128((__m128i *)(s + 4 * p));
2162   q5 = _mm_loadu_si128((__m128i *)(s + 5 * p));
2163   q6 = _mm_loadu_si128((__m128i *)(s + 6 * p));
2164 
2165   {
2166     const __m128i abs_p1p0 = abs_diff(p1, p0);
2167     const __m128i abs_q1q0 = abs_diff(q1, q0);
2168     const __m128i fe = _mm_set1_epi8((int8_t)0xfe);
2169     const __m128i ff = _mm_cmpeq_epi8(zero, zero);
2170     __m128i abs_p0q0 = abs_diff(p0, q0);
2171     __m128i abs_p1q1 = abs_diff(p1, q1);
2172     __m128i work;
2173     max_abs_p1p0q1q0 = _mm_max_epu8(abs_p1p0, abs_q1q0);
2174 
2175     abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
2176     abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
2177     mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v);
2178     mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
2179     // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
2180     mask = _mm_max_epu8(max_abs_p1p0q1q0, mask);
2181     // mask |= (abs(p1 - p0) > limit) * -1;
2182     // mask |= (abs(q1 - q0) > limit) * -1;
2183     work = _mm_max_epu8(abs_diff(p2, p1), abs_diff(p3, p2));
2184     mask = _mm_max_epu8(work, mask);
2185     work = _mm_max_epu8(abs_diff(q2, q1), abs_diff(q3, q2));
2186     mask = _mm_max_epu8(work, mask);
2187     mask = _mm_subs_epu8(mask, limit_v);
2188     mask = _mm_cmpeq_epi8(mask, zero);
2189   }
2190 
2191   if (0xffff == _mm_movemask_epi8(_mm_cmpeq_epi8(mask, zero))) return;
2192 
2193   {
2194     __m128i work;
2195     work = _mm_max_epu8(abs_diff(p2, p0), abs_diff(q2, q0));
2196     flat = _mm_max_epu8(work, max_abs_p1p0q1q0);
2197     work = _mm_max_epu8(abs_diff(p3, p0), abs_diff(q3, q0));
2198     flat = _mm_max_epu8(work, flat);
2199     work = _mm_max_epu8(abs_diff(p4, p0), abs_diff(q4, q0));
2200     flat = _mm_subs_epu8(flat, one);
2201     flat = _mm_cmpeq_epi8(flat, zero);
2202     flat = _mm_and_si128(flat, mask);
2203     flat2 = _mm_max_epu8(abs_diff(p5, p0), abs_diff(q5, q0));
2204     flat2 = _mm_max_epu8(work, flat2);
2205     work = _mm_max_epu8(abs_diff(p6, p0), abs_diff(q6, q0));
2206     flat2 = _mm_max_epu8(work, flat2);
2207     flat2 = _mm_subs_epu8(flat2, one);
2208     flat2 = _mm_cmpeq_epi8(flat2, zero);
2209     flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
2210   }
2211 
2212   // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
2213   // filter4
2214   {
2215     const __m128i t4 = _mm_set1_epi8(4);
2216     const __m128i t3 = _mm_set1_epi8(3);
2217     const __m128i t80 = _mm_set1_epi8((int8_t)0x80);
2218     const __m128i te0 = _mm_set1_epi8((int8_t)0xe0);
2219     const __m128i t1f = _mm_set1_epi8(0x1f);
2220     const __m128i t1 = _mm_set1_epi8(0x1);
2221     const __m128i t7f = _mm_set1_epi8(0x7f);
2222     const __m128i ff = _mm_cmpeq_epi8(t4, t4);
2223 
2224     __m128i filt;
2225     __m128i work_a;
2226     __m128i filter1, filter2;
2227 
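    // Move the pixels into the signed-byte domain by flipping the sign bit
    // (xor 0x80) so the filter deltas below can use signed saturating adds.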
2228     op1 = _mm_xor_si128(p1, t80);
2229     op0 = _mm_xor_si128(p0, t80);
2230     oq0 = _mm_xor_si128(q0, t80);
2231     oq1 = _mm_xor_si128(q1, t80);
2232 
2233     hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh_v);
2234     hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
2235     filt = _mm_and_si128(_mm_subs_epi8(op1, oq1), hev);
2236 
2237     work_a = _mm_subs_epi8(oq0, op0);
2238     filt = _mm_adds_epi8(filt, work_a);
2239     filt = _mm_adds_epi8(filt, work_a);
2240     filt = _mm_adds_epi8(filt, work_a);
2241     filt = _mm_and_si128(filt, mask);
2242     filter1 = _mm_adds_epi8(filt, t4);
2243     filter2 = _mm_adds_epi8(filt, t3);
2244 
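    // SSE2 lacks a per-byte arithmetic shift, so filter >> 3 is emulated:
    // shift the 16-bit lanes, mask each byte to its low 5 bits (t1f), then
    // restore the sign bits (te0) for lanes that were negative.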
2245     work_a = _mm_cmpgt_epi8(zero, filter1);
2246     filter1 = _mm_srli_epi16(filter1, 3);
2247     work_a = _mm_and_si128(work_a, te0);
2248     filter1 = _mm_and_si128(filter1, t1f);
2249     filter1 = _mm_or_si128(filter1, work_a);
2250     oq0 = _mm_xor_si128(_mm_subs_epi8(oq0, filter1), t80);
2251 
2252     work_a = _mm_cmpgt_epi8(zero, filter2);
2253     filter2 = _mm_srli_epi16(filter2, 3);
2254     work_a = _mm_and_si128(work_a, te0);
2255     filter2 = _mm_and_si128(filter2, t1f);
2256     filter2 = _mm_or_si128(filter2, work_a);
2257     op0 = _mm_xor_si128(_mm_adds_epi8(op0, filter2), t80);
2258 
2259     filt = _mm_adds_epi8(filter1, t1);
2260     work_a = _mm_cmpgt_epi8(zero, filt);
2261     filt = _mm_srli_epi16(filt, 1);
2262     work_a = _mm_and_si128(work_a, t80);
2263     filt = _mm_and_si128(filt, t7f);
2264     filt = _mm_or_si128(filt, work_a);
2265     filt = _mm_andnot_si128(hev, filt);
2266     op1 = _mm_xor_si128(_mm_adds_epi8(op1, filt), t80);
2267     oq1 = _mm_xor_si128(_mm_subs_epi8(oq1, filt), t80);
2268 
2269     // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
2270     // filter8
2271     if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
2272       const __m128i four = _mm_set1_epi16(4);
2273       const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero);
2274       const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero);
2275       const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero);
2276       const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero);
2277       const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero);
2278       const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero);
2279       const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero);
2280       const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero);
2281 
2282       const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero);
2283       const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero);
2284       const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero);
2285       const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero);
2286       const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero);
2287       const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero);
2288       const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero);
2289       const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero);
2290       __m128i f8_lo, f8_hi;
2291 
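      // Running sum for op2: 3 * p3 + 2 * p2 + p1 + p0 + q0 + 4. The later
      // outputs slide this window one tap at a time via filter_add2_sub2().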
2292       f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, four),
2293                             _mm_add_epi16(p3_lo, p2_lo));
2294       f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f8_lo),
2295                             _mm_add_epi16(p2_lo, p1_lo));
2296       f8_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f8_lo);
2297 
2298       f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, four),
2299                             _mm_add_epi16(p3_hi, p2_hi));
2300       f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f8_hi),
2301                             _mm_add_epi16(p2_hi, p1_hi));
2302       f8_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f8_hi);
2303 
2304       op2 = filter8_mask(&flat, &p2, &f8_lo, &f8_hi);
2305 
2306       f8_lo = filter_add2_sub2(&f8_lo, &q1_lo, &p1_lo, &p2_lo, &p3_lo);
2307       f8_hi = filter_add2_sub2(&f8_hi, &q1_hi, &p1_hi, &p2_hi, &p3_hi);
2308       op1 = filter8_mask(&flat, &op1, &f8_lo, &f8_hi);
2309 
2310       f8_lo = filter_add2_sub2(&f8_lo, &q2_lo, &p0_lo, &p1_lo, &p3_lo);
2311       f8_hi = filter_add2_sub2(&f8_hi, &q2_hi, &p0_hi, &p1_hi, &p3_hi);
2312       op0 = filter8_mask(&flat, &op0, &f8_lo, &f8_hi);
2313 
2314       f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q0_lo, &p0_lo, &p3_lo);
2315       f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q0_hi, &p0_hi, &p3_hi);
2316       oq0 = filter8_mask(&flat, &oq0, &f8_lo, &f8_hi);
2317 
2318       f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q1_lo, &q0_lo, &p2_lo);
2319       f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q1_hi, &q0_hi, &p2_hi);
2320       oq1 = filter8_mask(&flat, &oq1, &f8_lo, &f8_hi);
2321 
2322       f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q2_lo, &q1_lo, &p1_lo);
2323       f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q2_hi, &q1_hi, &p1_hi);
2324       oq2 = filter8_mask(&flat, &q2, &f8_lo, &f8_hi);
2325 
2326       // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
2327       // wide flat calculations
2328       if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat2, zero))) {
2329         const __m128i eight = _mm_set1_epi16(8);
2330         const __m128i p6_lo = _mm_unpacklo_epi8(p6, zero);
2331         const __m128i p5_lo = _mm_unpacklo_epi8(p5, zero);
2332         const __m128i p4_lo = _mm_unpacklo_epi8(p4, zero);
2333         const __m128i q4_lo = _mm_unpacklo_epi8(q4, zero);
2334         const __m128i q5_lo = _mm_unpacklo_epi8(q5, zero);
2335         const __m128i q6_lo = _mm_unpacklo_epi8(q6, zero);
2336 
2337         const __m128i p6_hi = _mm_unpackhi_epi8(p6, zero);
2338         const __m128i p5_hi = _mm_unpackhi_epi8(p5, zero);
2339         const __m128i p4_hi = _mm_unpackhi_epi8(p4, zero);
2340         const __m128i q4_hi = _mm_unpackhi_epi8(q4, zero);
2341         const __m128i q5_hi = _mm_unpackhi_epi8(q5, zero);
2342         const __m128i q6_hi = _mm_unpackhi_epi8(q6, zero);
2343 
2344         __m128i f_lo;
2345         __m128i f_hi;
2346 
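        // Running sum for the first wide output (p5):
        // 7 * p6 + 2 * p5 + 2 * p4 + p3 + p2 + p1 + p0 + q0 + 8; subsequent
        // outputs slide the window with filter_add2_sub2().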
2347         f_lo = _mm_sub_epi16(_mm_slli_epi16(p6_lo, 3), p6_lo);
2348         f_lo = _mm_add_epi16(_mm_slli_epi16(p5_lo, 1), f_lo);
2349         f_lo = _mm_add_epi16(_mm_slli_epi16(p4_lo, 1), f_lo);
2350         f_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f_lo),
2351                              _mm_add_epi16(p2_lo, p1_lo));
2352         f_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f_lo);
2353         f_lo = _mm_add_epi16(f_lo, eight);
2354 
2355         f_hi = _mm_sub_epi16(_mm_slli_epi16(p6_hi, 3), p6_hi);
2356         f_hi = _mm_add_epi16(_mm_slli_epi16(p5_hi, 1), f_hi);
2357         f_hi = _mm_add_epi16(_mm_slli_epi16(p4_hi, 1), f_hi);
2358         f_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f_hi),
2359                              _mm_add_epi16(p2_hi, p1_hi));
2360         f_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f_hi);
2361         f_hi = _mm_add_epi16(f_hi, eight);
2362 
2363         p5 = filter16_mask(&flat2, &p5, &f_lo, &f_hi);
2364         _mm_storeu_si128((__m128i *)(s - 6 * p), p5);
2365 
2366         f_lo = filter_add2_sub2(&f_lo, &q1_lo, &p3_lo, &p6_lo, &p6_lo);
2367         f_hi = filter_add2_sub2(&f_hi, &q1_hi, &p3_hi, &p6_hi, &p6_hi);
2368         p4 = filter16_mask(&flat2, &p4, &f_lo, &f_hi);
2369         _mm_storeu_si128((__m128i *)(s - 5 * p), p4);
2370 
2371         f_lo = filter_add2_sub2(&f_lo, &q2_lo, &p2_lo, &p6_lo, &p5_lo);
2372         f_hi = filter_add2_sub2(&f_hi, &q2_hi, &p2_hi, &p6_hi, &p5_hi);
2373         p3 = filter16_mask(&flat2, &p3, &f_lo, &f_hi);
2374         _mm_storeu_si128((__m128i *)(s - 4 * p), p3);
2375 
2376         f_lo = filter_add2_sub2(&f_lo, &q3_lo, &p1_lo, &p6_lo, &p4_lo);
2377         f_hi = filter_add2_sub2(&f_hi, &q3_hi, &p1_hi, &p6_hi, &p4_hi);
2378         op2 = filter16_mask(&flat2, &op2, &f_lo, &f_hi);
2379         _mm_storeu_si128((__m128i *)(s - 3 * p), op2);
2380 
2381         f_lo = filter_add2_sub2(&f_lo, &q4_lo, &p0_lo, &p6_lo, &p3_lo);
2382         f_hi = filter_add2_sub2(&f_hi, &q4_hi, &p0_hi, &p6_hi, &p3_hi);
2383         op1 = filter16_mask(&flat2, &op1, &f_lo, &f_hi);
2384         _mm_storeu_si128((__m128i *)(s - 2 * p), op1);
2385 
2386         f_lo = filter_add2_sub2(&f_lo, &q5_lo, &q0_lo, &p6_lo, &p2_lo);
2387         f_hi = filter_add2_sub2(&f_hi, &q5_hi, &q0_hi, &p6_hi, &p2_hi);
2388         op0 = filter16_mask(&flat2, &op0, &f_lo, &f_hi);
2389         _mm_storeu_si128((__m128i *)(s - 1 * p), op0);
2390 
2391         f_lo = filter_add2_sub2(&f_lo, &q6_lo, &q1_lo, &p6_lo, &p1_lo);
2392         f_hi = filter_add2_sub2(&f_hi, &q6_hi, &q1_hi, &p6_hi, &p1_hi);
2393         oq0 = filter16_mask(&flat2, &oq0, &f_lo, &f_hi);
2394         _mm_storeu_si128((__m128i *)(s - 0 * p), oq0);
2395 
2396         f_lo = filter_add2_sub2(&f_lo, &q6_lo, &q2_lo, &p5_lo, &p0_lo);
2397         f_hi = filter_add2_sub2(&f_hi, &q6_hi, &q2_hi, &p5_hi, &p0_hi);
2398         oq1 = filter16_mask(&flat2, &oq1, &f_lo, &f_hi);
2399         _mm_storeu_si128((__m128i *)(s + 1 * p), oq1);
2400 
2401         f_lo = filter_add2_sub2(&f_lo, &q6_lo, &q3_lo, &p4_lo, &q0_lo);
2402         f_hi = filter_add2_sub2(&f_hi, &q6_hi, &q3_hi, &p4_hi, &q0_hi);
2403         oq2 = filter16_mask(&flat2, &oq2, &f_lo, &f_hi);
2404         _mm_storeu_si128((__m128i *)(s + 2 * p), oq2);
2405 
2406         f_lo = filter_add2_sub2(&f_lo, &q6_lo, &q4_lo, &p3_lo, &q1_lo);
2407         f_hi = filter_add2_sub2(&f_hi, &q6_hi, &q4_hi, &p3_hi, &q1_hi);
2408         q3 = filter16_mask(&flat2, &q3, &f_lo, &f_hi);
2409         _mm_storeu_si128((__m128i *)(s + 3 * p), q3);
2410 
2411         f_lo = filter_add2_sub2(&f_lo, &q6_lo, &q5_lo, &p2_lo, &q2_lo);
2412         f_hi = filter_add2_sub2(&f_hi, &q6_hi, &q5_hi, &p2_hi, &q2_hi);
2413         q4 = filter16_mask(&flat2, &q4, &f_lo, &f_hi);
2414         _mm_storeu_si128((__m128i *)(s + 4 * p), q4);
2415 
2416         f_lo = filter_add2_sub2(&f_lo, &q6_lo, &q6_lo, &p1_lo, &q3_lo);
2417         f_hi = filter_add2_sub2(&f_hi, &q6_hi, &q6_hi, &p1_hi, &q3_hi);
2418         q5 = filter16_mask(&flat2, &q5, &f_lo, &f_hi);
2419         _mm_storeu_si128((__m128i *)(s + 5 * p), q5);
2420       } else {
2421         _mm_storeu_si128((__m128i *)(s - 3 * p), op2);
2422         _mm_storeu_si128((__m128i *)(s - 2 * p), op1);
2423         _mm_storeu_si128((__m128i *)(s - 1 * p), op0);
2424         _mm_storeu_si128((__m128i *)(s - 0 * p), oq0);
2425         _mm_storeu_si128((__m128i *)(s + 1 * p), oq1);
2426         _mm_storeu_si128((__m128i *)(s + 2 * p), oq2);
2427       }
2428     } else {
2429       _mm_storeu_si128((__m128i *)(s - 2 * p), op1);
2430       _mm_storeu_si128((__m128i *)(s - 1 * p), op0);
2431       _mm_storeu_si128((__m128i *)(s - 0 * p), oq0);
2432       _mm_storeu_si128((__m128i *)(s + 1 * p), oq1);
2433     }
2434   }
2435 }
2436 
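// Filter a horizontal edge across 16 adjacent columns (four 4-pixel
// segments) in one pass; the thresholds are loaded as full 16-byte vectors.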
2437 void aom_lpf_horizontal_8_quad_sse2(unsigned char *s, int p,
2438                                     const unsigned char *_blimit0,
2439                                     const unsigned char *_limit0,
2440                                     const unsigned char *_thresh0) {
2441   const __m128i zero = _mm_setzero_si128();
2442   const __m128i one = _mm_set1_epi8(1);
2443   const __m128i blimit_v = _mm_load_si128((const __m128i *)_blimit0);
2444   const __m128i limit_v = _mm_load_si128((const __m128i *)_limit0);
2445   const __m128i thresh_v = _mm_load_si128((const __m128i *)_thresh0);
2446   __m128i mask, hev, flat;
2447   __m128i p3, p2, p1, p0, q0, q1, q2, q3;
2448 
2449   __m128i op2, op1, op0, oq0, oq1, oq2;
2450 
2451   __m128i max_abs_p1p0q1q0;
2452 
2453   p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
2454   p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
2455   p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
2456   p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
2457   q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
2458   q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
2459   q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
2460   q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
2461 
2462   {
2463     const __m128i abs_p1p0 = abs_diff(p1, p0);
2464     const __m128i abs_q1q0 = abs_diff(q1, q0);
2465     const __m128i fe = _mm_set1_epi8((int8_t)0xfe);
2466     const __m128i ff = _mm_cmpeq_epi8(zero, zero);
2467     __m128i abs_p0q0 = abs_diff(p0, q0);
2468     __m128i abs_p1q1 = abs_diff(p1, q1);
2469     __m128i work;
2470     max_abs_p1p0q1q0 = _mm_max_epu8(abs_p1p0, abs_q1q0);
2471 
2472     abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
2473     abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
2474     mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v);
2475     mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
2476     // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
2477     mask = _mm_max_epu8(max_abs_p1p0q1q0, mask);
2478     // mask |= (abs(p1 - p0) > limit) * -1;
2479     // mask |= (abs(q1 - q0) > limit) * -1;
2480     work = _mm_max_epu8(abs_diff(p2, p1), abs_diff(p3, p2));
2481     mask = _mm_max_epu8(work, mask);
2482     work = _mm_max_epu8(abs_diff(q2, q1), abs_diff(q3, q2));
2483     mask = _mm_max_epu8(work, mask);
2484     mask = _mm_subs_epu8(mask, limit_v);
2485     mask = _mm_cmpeq_epi8(mask, zero);
2486   }
2487 
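  // mask is 0xff in columns that need filtering; exit early if it is zero
  // everywhere.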
2488   if (0xffff == _mm_movemask_epi8(_mm_cmpeq_epi8(mask, zero))) return;
2489 
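  // flat is set where |p1-p0|, |q1-q0|, |p2-p0|, |q2-q0|, |p3-p0| and |q3-q0|
  // are all <= 1 (and mask is set); those columns take the 8-tap path below.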
2490   {
2491     __m128i work;
2492     work = _mm_max_epu8(abs_diff(p2, p0), abs_diff(q2, q0));
2493     flat = _mm_max_epu8(work, max_abs_p1p0q1q0);
2494     work = _mm_max_epu8(abs_diff(p3, p0), abs_diff(q3, q0));
2495     flat = _mm_max_epu8(work, flat);
2496     flat = _mm_subs_epu8(flat, one);
2497     flat = _mm_cmpeq_epi8(flat, zero);
2498     flat = _mm_and_si128(flat, mask);
2499   }
2500 
2501   // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
2502   // filter4
2503   {
2504     const __m128i t4 = _mm_set1_epi8(4);
2505     const __m128i t3 = _mm_set1_epi8(3);
2506     const __m128i t80 = _mm_set1_epi8((int8_t)0x80);
2507     const __m128i te0 = _mm_set1_epi8((int8_t)0xe0);
2508     const __m128i t1f = _mm_set1_epi8(0x1f);
2509     const __m128i t1 = _mm_set1_epi8(0x1);
2510     const __m128i t7f = _mm_set1_epi8(0x7f);
2511     const __m128i ff = _mm_cmpeq_epi8(t4, t4);
2512 
2513     __m128i filt;
2514     __m128i work_a;
2515     __m128i filter1, filter2;
2516 
2517     op1 = _mm_xor_si128(p1, t80);
2518     op0 = _mm_xor_si128(p0, t80);
2519     oq0 = _mm_xor_si128(q0, t80);
2520     oq1 = _mm_xor_si128(q1, t80);
2521 
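    // hev (high edge variance): 0xff where max(|p1-p0|, |q1-q0|) > thresh.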
2522     hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh_v);
2523     hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
2524     filt = _mm_and_si128(_mm_subs_epi8(op1, oq1), hev);
2525 
2526     work_a = _mm_subs_epi8(oq0, op0);
2527     filt = _mm_adds_epi8(filt, work_a);
2528     filt = _mm_adds_epi8(filt, work_a);
2529     filt = _mm_adds_epi8(filt, work_a);
2530     filt = _mm_and_si128(filt, mask);
2531     filter1 = _mm_adds_epi8(filt, t4);
2532     filter2 = _mm_adds_epi8(filt, t3);
2533 
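    // Emulate a signed >> 3 on filter1/filter2: logical shift, then restore
    // the sign bits (0xe0) in lanes that were negative.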
2534     work_a = _mm_cmpgt_epi8(zero, filter1);
2535     filter1 = _mm_srli_epi16(filter1, 3);
2536     work_a = _mm_and_si128(work_a, te0);
2537     filter1 = _mm_and_si128(filter1, t1f);
2538     filter1 = _mm_or_si128(filter1, work_a);
2539     oq0 = _mm_xor_si128(_mm_subs_epi8(oq0, filter1), t80);
2540 
2541     work_a = _mm_cmpgt_epi8(zero, filter2);
2542     filter2 = _mm_srli_epi16(filter2, 3);
2543     work_a = _mm_and_si128(work_a, te0);
2544     filter2 = _mm_and_si128(filter2, t1f);
2545     filter2 = _mm_or_si128(filter2, work_a);
2546     op0 = _mm_xor_si128(_mm_adds_epi8(op0, filter2), t80);
2547 
2548     filt = _mm_adds_epi8(filter1, t1);
2549     work_a = _mm_cmpgt_epi8(zero, filt);
2550     filt = _mm_srli_epi16(filt, 1);
2551     work_a = _mm_and_si128(work_a, t80);
2552     filt = _mm_and_si128(filt, t7f);
2553     filt = _mm_or_si128(filt, work_a);
2554     filt = _mm_andnot_si128(hev, filt);
2555     op1 = _mm_xor_si128(_mm_adds_epi8(op1, filt), t80);
2556     oq1 = _mm_xor_si128(_mm_subs_epi8(oq1, filt), t80);
2557 
2558     // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
2559     // filter8
2560     if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
2561       const __m128i four = _mm_set1_epi16(4);
2562       const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero);
2563       const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero);
2564       const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero);
2565       const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero);
2566       const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero);
2567       const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero);
2568       const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero);
2569       const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero);
2570 
2571       const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero);
2572       const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero);
2573       const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero);
2574       const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero);
2575       const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero);
2576       const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero);
2577       const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero);
2578       const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero);
2579       __m128i f8_lo, f8_hi;
2580 
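      // Running sum for the 8-tap filter:
      //   f8 = 3*p3 + 2*p2 + p1 + p0 + q0 + 4
      // Each output is taken as (f8 >> 3) by filter8_mask(), and the window
      // is advanced with filter_add2_sub2().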
2581       f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, four),
2582                             _mm_add_epi16(p3_lo, p2_lo));
2583       f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f8_lo),
2584                             _mm_add_epi16(p2_lo, p1_lo));
2585       f8_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f8_lo);
2586 
2587       f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, four),
2588                             _mm_add_epi16(p3_hi, p2_hi));
2589       f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f8_hi),
2590                             _mm_add_epi16(p2_hi, p1_hi));
2591       f8_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f8_hi);
2592 
2593       op2 = filter8_mask(&flat, &p2, &f8_lo, &f8_hi);
2594       _mm_storeu_si128((__m128i *)(s - 3 * p), op2);
2595 
2596       f8_lo = filter_add2_sub2(&f8_lo, &q1_lo, &p1_lo, &p2_lo, &p3_lo);
2597       f8_hi = filter_add2_sub2(&f8_hi, &q1_hi, &p1_hi, &p2_hi, &p3_hi);
2598       op1 = filter8_mask(&flat, &op1, &f8_lo, &f8_hi);
2599       _mm_storeu_si128((__m128i *)(s - 2 * p), op1);
2600 
2601       f8_lo = filter_add2_sub2(&f8_lo, &q2_lo, &p0_lo, &p1_lo, &p3_lo);
2602       f8_hi = filter_add2_sub2(&f8_hi, &q2_hi, &p0_hi, &p1_hi, &p3_hi);
2603       op0 = filter8_mask(&flat, &op0, &f8_lo, &f8_hi);
2604       _mm_storeu_si128((__m128i *)(s - 1 * p), op0);
2605 
2606       f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q0_lo, &p0_lo, &p3_lo);
2607       f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q0_hi, &p0_hi, &p3_hi);
2608       oq0 = filter8_mask(&flat, &oq0, &f8_lo, &f8_hi);
2609       _mm_storeu_si128((__m128i *)(s - 0 * p), oq0);
2610 
2611       f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q1_lo, &q0_lo, &p2_lo);
2612       f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q1_hi, &q0_hi, &p2_hi);
2613       oq1 = filter8_mask(&flat, &oq1, &f8_lo, &f8_hi);
2614       _mm_storeu_si128((__m128i *)(s + 1 * p), oq1);
2615 
2616       f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q2_lo, &q1_lo, &p1_lo);
2617       f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q2_hi, &q1_hi, &p1_hi);
2618       oq2 = filter8_mask(&flat, &q2, &f8_lo, &f8_hi);
2619       _mm_storeu_si128((__m128i *)(s + 2 * p), oq2);
2620     } else {
2621       _mm_storeu_si128((__m128i *)(s - 2 * p), op1);
2622       _mm_storeu_si128((__m128i *)(s - 1 * p), op0);
2623       _mm_storeu_si128((__m128i *)(s - 0 * p), oq0);
2624       _mm_storeu_si128((__m128i *)(s + 1 * p), oq1);
2625     }
2626   }
2627 }
2628 
2629 void aom_lpf_horizontal_6_quad_sse2(unsigned char *s, int p,
2630                                     const unsigned char *_blimit0,
2631                                     const unsigned char *_limit0,
2632                                     const unsigned char *_thresh0) {
2633   const __m128i zero = _mm_setzero_si128();
2634   const __m128i one = _mm_set1_epi8(1);
2635   const __m128i blimit_v = _mm_load_si128((const __m128i *)_blimit0);
2636   const __m128i limit_v = _mm_load_si128((const __m128i *)_limit0);
2637   const __m128i thresh_v = _mm_load_si128((const __m128i *)_thresh0);
2638   __m128i mask, hev, flat;
2639   __m128i p2, p1, p0, q0, q1, q2;
2640 
2641   __m128i op1, op0, oq0, oq1;
2642 
2643   __m128i max_abs_p1p0q1q0;
2644 
2645   p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
2646   p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
2647   p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
2648   q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
2649   q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
2650   q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
2651 
2652   {
2653     const __m128i abs_p1p0 = abs_diff(p1, p0);
2654     const __m128i abs_q1q0 = abs_diff(q1, q0);
2655     const __m128i fe = _mm_set1_epi8((int8_t)0xfe);
2656     const __m128i ff = _mm_cmpeq_epi8(zero, zero);
2657     __m128i abs_p0q0 = abs_diff(p0, q0);
2658     __m128i abs_p1q1 = abs_diff(p1, q1);
2659     __m128i work;
2660     max_abs_p1p0q1q0 = _mm_max_epu8(abs_p1p0, abs_q1q0);
2661 
2662     abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
2663     abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
2664     mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v);
2665     mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
2666     // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
2667     mask = _mm_max_epu8(max_abs_p1p0q1q0, mask);
2668     // mask |= (abs(p1 - p0) > limit) * -1;
2669     // mask |= (abs(q1 - q0) > limit) * -1;
2670     work = _mm_max_epu8(abs_diff(p2, p1), abs_diff(q2, q1));
2671     mask = _mm_max_epu8(work, mask);
2672     mask = _mm_subs_epu8(mask, limit_v);
2673     mask = _mm_cmpeq_epi8(mask, zero);
2674   }
2675 
2676   if (0xffff == _mm_movemask_epi8(_mm_cmpeq_epi8(mask, zero))) return;
2677 
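  // flat is set where |p1-p0|, |q1-q0|, |p2-p0| and |q2-q0| are all <= 1
  // (and mask is set); those columns take the 6-tap path below.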
2678   {
2679     __m128i work;
2680     work = _mm_max_epu8(abs_diff(p2, p0), abs_diff(q2, q0));
2681     flat = _mm_max_epu8(work, max_abs_p1p0q1q0);
2682     flat = _mm_subs_epu8(flat, one);
2683     flat = _mm_cmpeq_epi8(flat, zero);
2684     flat = _mm_and_si128(flat, mask);
2685   }
2686 
2687   // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
2688   // filter4
2689   {
2690     const __m128i t4 = _mm_set1_epi8(4);
2691     const __m128i t3 = _mm_set1_epi8(3);
2692     const __m128i t80 = _mm_set1_epi8((int8_t)0x80);
2693     const __m128i te0 = _mm_set1_epi8((int8_t)0xe0);
2694     const __m128i t1f = _mm_set1_epi8(0x1f);
2695     const __m128i t1 = _mm_set1_epi8(0x1);
2696     const __m128i t7f = _mm_set1_epi8(0x7f);
2697     const __m128i ff = _mm_cmpeq_epi8(t4, t4);
2698 
2699     __m128i filt;
2700     __m128i work_a;
2701     __m128i filter1, filter2;
2702 
2703     op1 = _mm_xor_si128(p1, t80);
2704     op0 = _mm_xor_si128(p0, t80);
2705     oq0 = _mm_xor_si128(q0, t80);
2706     oq1 = _mm_xor_si128(q1, t80);
2707 
2708     hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh_v);
2709     hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
2710     filt = _mm_and_si128(_mm_subs_epi8(op1, oq1), hev);
2711 
2712     work_a = _mm_subs_epi8(oq0, op0);
2713     filt = _mm_adds_epi8(filt, work_a);
2714     filt = _mm_adds_epi8(filt, work_a);
2715     filt = _mm_adds_epi8(filt, work_a);
2716     filt = _mm_and_si128(filt, mask);
2717     filter1 = _mm_adds_epi8(filt, t4);
2718     filter2 = _mm_adds_epi8(filt, t3);
2719 
2720     work_a = _mm_cmpgt_epi8(zero, filter1);
2721     filter1 = _mm_srli_epi16(filter1, 3);
2722     work_a = _mm_and_si128(work_a, te0);
2723     filter1 = _mm_and_si128(filter1, t1f);
2724     filter1 = _mm_or_si128(filter1, work_a);
2725     oq0 = _mm_xor_si128(_mm_subs_epi8(oq0, filter1), t80);
2726 
2727     work_a = _mm_cmpgt_epi8(zero, filter2);
2728     filter2 = _mm_srli_epi16(filter2, 3);
2729     work_a = _mm_and_si128(work_a, te0);
2730     filter2 = _mm_and_si128(filter2, t1f);
2731     filter2 = _mm_or_si128(filter2, work_a);
2732     op0 = _mm_xor_si128(_mm_adds_epi8(op0, filter2), t80);
2733 
2734     filt = _mm_adds_epi8(filter1, t1);
2735     work_a = _mm_cmpgt_epi8(zero, filt);
2736     filt = _mm_srli_epi16(filt, 1);
2737     work_a = _mm_and_si128(work_a, t80);
2738     filt = _mm_and_si128(filt, t7f);
2739     filt = _mm_or_si128(filt, work_a);
2740     filt = _mm_andnot_si128(hev, filt);
2741     op1 = _mm_xor_si128(_mm_adds_epi8(op1, filt), t80);
2742     oq1 = _mm_xor_si128(_mm_subs_epi8(oq1, filt), t80);
2743 
2744     // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
2745     // filter6
2746     if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
2747       const __m128i four = _mm_set1_epi16(4);
2748       const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero);
2749       const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero);
2750       const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero);
2751       const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero);
2752       const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero);
2753       const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero);
2754 
2755       const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero);
2756       const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero);
2757       const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero);
2758       const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero);
2759       const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero);
2760       const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero);
2761       __m128i f8_lo, f8_hi;
2762 
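      // Running sum for the 6-tap filter:
      //   f8 = 3*p2 + 2*p1 + 2*p0 + q0 + 4
      // Each output is taken as (f8 >> 3), with the window advanced by
      // filter_add2_sub2().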
2763       f8_lo = _mm_add_epi16(_mm_add_epi16(p2_lo, four),
2764                             _mm_add_epi16(p2_lo, p2_lo));
2765       f8_lo = _mm_add_epi16(_mm_add_epi16(p1_lo, f8_lo),
2766                             _mm_add_epi16(p1_lo, p0_lo));
2767       f8_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f8_lo);
2768 
2769       f8_hi = _mm_add_epi16(_mm_add_epi16(p2_hi, four),
2770                             _mm_add_epi16(p2_hi, p2_hi));
2771       f8_hi = _mm_add_epi16(_mm_add_epi16(p1_hi, f8_hi),
2772                             _mm_add_epi16(p1_hi, p0_hi));
2773       f8_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f8_hi);
2774 
2775       op1 = filter8_mask(&flat, &op1, &f8_lo, &f8_hi);
2776       _mm_storeu_si128((__m128i *)(s - 2 * p), op1);
2777 
2778       f8_lo = filter_add2_sub2(&f8_lo, &q0_lo, &q1_lo, &p2_lo, &p2_lo);
2779       f8_hi = filter_add2_sub2(&f8_hi, &q0_hi, &q1_hi, &p2_hi, &p2_hi);
2780       op0 = filter8_mask(&flat, &op0, &f8_lo, &f8_hi);
2781       _mm_storeu_si128((__m128i *)(s - 1 * p), op0);
2782 
2783       f8_lo = filter_add2_sub2(&f8_lo, &q1_lo, &q2_lo, &p1_lo, &p2_lo);
2784       f8_hi = filter_add2_sub2(&f8_hi, &q1_hi, &q2_hi, &p1_hi, &p2_hi);
2785       oq0 = filter8_mask(&flat, &oq0, &f8_lo, &f8_hi);
2786       _mm_storeu_si128((__m128i *)(s - 0 * p), oq0);
2787 
2788       f8_lo = filter_add2_sub2(&f8_lo, &q2_lo, &q2_lo, &p0_lo, &p1_lo);
2789       f8_hi = filter_add2_sub2(&f8_hi, &q2_hi, &q2_hi, &p0_hi, &p1_hi);
2790       oq1 = filter8_mask(&flat, &oq1, &f8_lo, &f8_hi);
2791       _mm_storeu_si128((__m128i *)(s + 1 * p), oq1);
2792     } else {
2793       _mm_storeu_si128((__m128i *)(s - 2 * p), op1);
2794       _mm_storeu_si128((__m128i *)(s - 1 * p), op0);
2795       _mm_storeu_si128((__m128i *)(s - 0 * p), oq0);
2796       _mm_storeu_si128((__m128i *)(s + 1 * p), oq1);
2797     }
2798   }
2799 }
2800 
2801 void aom_lpf_horizontal_4_quad_sse2(unsigned char *s, int p,
2802                                     const unsigned char *_blimit0,
2803                                     const unsigned char *_limit0,
2804                                     const unsigned char *_thresh0) {
2805   const __m128i zero = _mm_setzero_si128();
2806   const __m128i blimit_v = _mm_load_si128((const __m128i *)_blimit0);
2807   const __m128i limit_v = _mm_load_si128((const __m128i *)_limit0);
2808   const __m128i thresh_v = _mm_load_si128((const __m128i *)_thresh0);
2809   __m128i mask, hev;
2810   __m128i p1, p0, q0, q1;
2811 
2812   __m128i op1, op0, oq0, oq1;
2813 
2814   __m128i max_abs_p1p0q1q0;
2815 
2816   p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
2817   p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
2818   q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
2819   q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
2820 
2821   {
2822     const __m128i abs_p1p0 = abs_diff(p1, p0);
2823     const __m128i abs_q1q0 = abs_diff(q1, q0);
2824     const __m128i fe = _mm_set1_epi8((int8_t)0xfe);
2825     const __m128i ff = _mm_cmpeq_epi8(zero, zero);
2826     __m128i abs_p0q0 = abs_diff(p0, q0);
2827     __m128i abs_p1q1 = abs_diff(p1, q1);
2828     max_abs_p1p0q1q0 = _mm_max_epu8(abs_p1p0, abs_q1q0);
2829 
2830     abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
2831     abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
2832     mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v);
2833     mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
2834     // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
2835     mask = _mm_max_epu8(max_abs_p1p0q1q0, mask);
2836     // mask |= (abs(p1 - p0) > limit) * -1;
2837     // mask |= (abs(q1 - q0) > limit) * -1;
2838     mask = _mm_subs_epu8(mask, limit_v);
2839     mask = _mm_cmpeq_epi8(mask, zero);
2840   }
2841 
2842   if (0xffff == _mm_movemask_epi8(_mm_cmpeq_epi8(mask, zero))) return;
2843 
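  // There is no flat/wide-filter path here: only the 4-tap filter below is
  // applied.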
2844   // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
2845   // filter4
2846   {
2847     const __m128i t4 = _mm_set1_epi8(4);
2848     const __m128i t3 = _mm_set1_epi8(3);
2849     const __m128i t80 = _mm_set1_epi8((int8_t)0x80);
2850     const __m128i te0 = _mm_set1_epi8((int8_t)0xe0);
2851     const __m128i t1f = _mm_set1_epi8(0x1f);
2852     const __m128i t1 = _mm_set1_epi8(0x1);
2853     const __m128i t7f = _mm_set1_epi8(0x7f);
2854     const __m128i ff = _mm_cmpeq_epi8(t4, t4);
2855 
2856     __m128i filt;
2857     __m128i work_a;
2858     __m128i filter1, filter2;
2859 
2860     op1 = _mm_xor_si128(p1, t80);
2861     op0 = _mm_xor_si128(p0, t80);
2862     oq0 = _mm_xor_si128(q0, t80);
2863     oq1 = _mm_xor_si128(q1, t80);
2864 
2865     hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh_v);
2866     hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
2867     filt = _mm_and_si128(_mm_subs_epi8(op1, oq1), hev);
2868 
2869     work_a = _mm_subs_epi8(oq0, op0);
2870     filt = _mm_adds_epi8(filt, work_a);
2871     filt = _mm_adds_epi8(filt, work_a);
2872     filt = _mm_adds_epi8(filt, work_a);
2873     filt = _mm_and_si128(filt, mask);
2874     filter1 = _mm_adds_epi8(filt, t4);
2875     filter2 = _mm_adds_epi8(filt, t3);
2876 
2877     work_a = _mm_cmpgt_epi8(zero, filter1);
2878     filter1 = _mm_srli_epi16(filter1, 3);
2879     work_a = _mm_and_si128(work_a, te0);
2880     filter1 = _mm_and_si128(filter1, t1f);
2881     filter1 = _mm_or_si128(filter1, work_a);
2882     oq0 = _mm_xor_si128(_mm_subs_epi8(oq0, filter1), t80);
2883 
2884     work_a = _mm_cmpgt_epi8(zero, filter2);
2885     filter2 = _mm_srli_epi16(filter2, 3);
2886     work_a = _mm_and_si128(work_a, te0);
2887     filter2 = _mm_and_si128(filter2, t1f);
2888     filter2 = _mm_or_si128(filter2, work_a);
2889     op0 = _mm_xor_si128(_mm_adds_epi8(op0, filter2), t80);
2890 
2891     filt = _mm_adds_epi8(filter1, t1);
2892     work_a = _mm_cmpgt_epi8(zero, filt);
2893     filt = _mm_srli_epi16(filt, 1);
2894     work_a = _mm_and_si128(work_a, t80);
2895     filt = _mm_and_si128(filt, t7f);
2896     filt = _mm_or_si128(filt, work_a);
2897     filt = _mm_andnot_si128(hev, filt);
2898     op1 = _mm_xor_si128(_mm_adds_epi8(op1, filt), t80);
2899     oq1 = _mm_xor_si128(_mm_subs_epi8(oq1, filt), t80);
2900 
2901     _mm_storeu_si128((__m128i *)(s - 2 * p), op1);
2902     _mm_storeu_si128((__m128i *)(s - 1 * p), op0);
2903     _mm_storeu_si128((__m128i *)(s - 0 * p), oq0);
2904     _mm_storeu_si128((__m128i *)(s + 1 * p), oq1);
2905   }
2906 }
2907 
2908 void aom_lpf_vertical_14_quad_sse2(unsigned char *s, int pitch,
2909                                    const uint8_t *_blimit0,
2910                                    const uint8_t *_limit0,
2911                                    const uint8_t *_thresh0) {
2912   DECLARE_ALIGNED(16, unsigned char, t_dst[256]);
2913 
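  // Filter the vertical edge by transposing the 16x16 neighborhood around it
  // into t_dst, running the horizontal 14-tap quad filter, then transposing
  // back.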
2914   // Transpose 16x16
2915   transpose_16x8(s - 8, s - 8 + 8 * pitch, pitch, t_dst, 16);
2916   transpose_16x8(s, s + 8 * pitch, pitch, t_dst + 8 * 16, 16);
2917 
2918   // Loop filtering
2919   aom_lpf_horizontal_14_quad(t_dst + 8 * 16, 16, _blimit0, _limit0, _thresh0);
2920 
2921   // Transpose back
2922   transpose_16x8(t_dst, t_dst + 8 * 16, 16, s - 8, pitch);
2923   transpose_16x8(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * pitch, pitch);
2924 }
2925 
2926 void aom_lpf_vertical_8_quad_sse2(uint8_t *s, int pitch,
2927                                   const uint8_t *_blimit0,
2928                                   const uint8_t *_limit0,
2929                                   const uint8_t *_thresh0) {
2930   DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]);
2931 
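  // Only 8 columns (s - 4 .. s + 3) are needed, so a single 16x8 transpose
  // into t_dst is enough before calling the horizontal filter.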
2932   // Transpose 16x8
2933   transpose_16x8(s - 4, s - 4 + pitch * 8, pitch, t_dst, 16);
2934 
2935   // Loop filtering
2936   aom_lpf_horizontal_8_quad(t_dst + 4 * 16, 16, _blimit0, _limit0, _thresh0);
2937 
2938   // Transpose back
2939   transpose_16x8_to_8x16(t_dst, 16, s - 4, pitch);
2940 }
2941 
2942 void aom_lpf_vertical_6_quad_sse2(uint8_t *s, int pitch,
2943                                   const uint8_t *_blimit0,
2944                                   const uint8_t *_limit0,
2945                                   const uint8_t *_thresh0) {
2946   DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]);
2947 
2948   // Transpose: (w x h) 8x16 to 16x8
2949   transpose_16x8(s - 4, s - 4 + pitch * 8, pitch, t_dst, 16);
2950 
2951   // Loop filtering
2952   aom_lpf_horizontal_6_quad(t_dst + 4 * 16, 16, _blimit0, _limit0, _thresh0);
2953 
2954   // Transpose back: (w x h) 16x8 to 8x16
2955   transpose_16x8_to_8x16(t_dst, 16, s - 4, pitch);
2956 }
2957 
2958 void aom_lpf_vertical_4_quad_sse2(uint8_t *s, int pitch,
2959                                   const uint8_t *_blimit0,
2960                                   const uint8_t *_limit0,
2961                                   const uint8_t *_thresh0) {
2962   DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]);
2963 
2964   // Transpose 16x8
2965   transpose_16x8(s - 4, s - 4 + pitch * 8, pitch, t_dst, 16);
2966 
2967   // Loop filtering
2968   aom_lpf_horizontal_4_quad_sse2(t_dst + 4 * 16, 16, _blimit0, _limit0,
2969                                  _thresh0);
2970 
2971   // Transpose back
2972   transpose_16x8_to_8x16(t_dst, 16, s - 4, pitch);
2973 }
2974