• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 
12 #include <stdlib.h>
13 
14 #include "config/aom_config.h"
15 #include "config/aom_dsp_rtcd.h"
16 
17 #include "aom_dsp/aom_dsp_common.h"
18 #include "aom_ports/mem.h"
19 
signed_char_clamp(int t)20 static INLINE int8_t signed_char_clamp(int t) {
21   return (int8_t)clamp(t, -128, 127);
22 }
23 
24 #if CONFIG_AV1_HIGHBITDEPTH
signed_char_clamp_high(int t,int bd)25 static INLINE int16_t signed_char_clamp_high(int t, int bd) {
26   switch (bd) {
27     case 10: return (int16_t)clamp(t, -128 * 4, 128 * 4 - 1);
28     case 12: return (int16_t)clamp(t, -128 * 16, 128 * 16 - 1);
29     case 8:
30     default: return (int16_t)clamp(t, -128, 128 - 1);
31   }
32 }
33 #endif
34 
// Decides whether any filtering should be applied across the edge.
// Returns -1 (all bits set) when |p1 - p0| and |q1 - q0| are within limit and
// |p0 - q0| * 2 + |p1 - q1| / 2 is within blimit; returns 0 otherwise.
static INLINE int8_t filter_mask2(uint8_t limit, uint8_t blimit, uint8_t p1,
                                  uint8_t p0, uint8_t q0, uint8_t q1) {
  const int side_step_exceeded =
      abs(p1 - p0) > limit || abs(q1 - q0) > limit;
  const int edge_step_exceeded =
      abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit;
  return (side_step_exceeded || edge_step_exceeded) ? 0 : -1;
}
44 
// Filter-enable mask over four samples on each side of the edge.
// Returns -1 (all bits set) when every neighbouring-sample difference on
// both sides is within limit and |p0 - q0| * 2 + |p1 - q1| / 2 is within
// blimit; returns 0 otherwise.
static INLINE int8_t filter_mask(uint8_t limit, uint8_t blimit, uint8_t p3,
                                 uint8_t p2, uint8_t p1, uint8_t p0, uint8_t q0,
                                 uint8_t q1, uint8_t q2, uint8_t q3) {
  int over_limit = 0;
  over_limit |= abs(p3 - p2) > limit;
  over_limit |= abs(p2 - p1) > limit;
  over_limit |= abs(p1 - p0) > limit;
  over_limit |= abs(q1 - q0) > limit;
  over_limit |= abs(q2 - q1) > limit;
  over_limit |= abs(q3 - q2) > limit;
  over_limit |= abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit;
  return over_limit ? 0 : -1;
}
58 
// Filter-enable mask over three samples on each side of the edge.
// Returns -1 (all bits set) when all checked differences are within their
// limits and 0 as soon as one of them is exceeded.
static INLINE int8_t filter_mask3_chroma(uint8_t limit, uint8_t blimit,
                                         uint8_t p2, uint8_t p1, uint8_t p0,
                                         uint8_t q0, uint8_t q1, uint8_t q2) {
  if (abs(p2 - p1) > limit || abs(p1 - p0) > limit) {
    return 0;
  }
  if (abs(q1 - q0) > limit || abs(q2 - q1) > limit) {
    return 0;
  }
  if (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) {
    return 0;
  }
  return -1;
}
70 
// Flatness test over three samples on each side of the edge.
// Returns -1 (all bits set) when p1 and p2 are within thresh of p0 and q1 and
// q2 are within thresh of q0; returns 0 otherwise.
static INLINE int8_t flat_mask3_chroma(uint8_t thresh, uint8_t p2, uint8_t p1,
                                       uint8_t p0, uint8_t q0, uint8_t q1,
                                       uint8_t q2) {
  const int p_side_flat = abs(p1 - p0) <= thresh && abs(p2 - p0) <= thresh;
  const int q_side_flat = abs(q1 - q0) <= thresh && abs(q2 - q0) <= thresh;
  return (p_side_flat && q_side_flat) ? -1 : 0;
}
81 
// Flatness test over four samples on each side of the edge.
// Returns -1 (all bits set) when p1..p3 are within thresh of p0 and q1..q3
// are within thresh of q0; returns 0 otherwise.
static INLINE int8_t flat_mask4(uint8_t thresh, uint8_t p3, uint8_t p2,
                                uint8_t p1, uint8_t p0, uint8_t q0, uint8_t q1,
                                uint8_t q2, uint8_t q3) {
  const int p_side_flat = abs(p1 - p0) <= thresh && abs(p2 - p0) <= thresh &&
                          abs(p3 - p0) <= thresh;
  const int q_side_flat = abs(q1 - q0) <= thresh && abs(q2 - q0) <= thresh &&
                          abs(q3 - q0) <= thresh;
  return (p_side_flat && q_side_flat) ? -1 : 0;
}
94 
// High edge variance test for the samples next to the edge.
// Returns -1 (all bits set) when |p1 - p0| or |q1 - q0| exceeds thresh and 0
// when neither does.
static INLINE int8_t hev_mask(uint8_t thresh, uint8_t p1, uint8_t p0,
                              uint8_t q0, uint8_t q1) {
  const int p_varies = abs(p1 - p0) > thresh;
  const int q_varies = abs(q1 - q0) > thresh;
  return (p_varies || q_varies) ? -1 : 0;
}
103 
filter4(int8_t mask,uint8_t thresh,uint8_t * op1,uint8_t * op0,uint8_t * oq0,uint8_t * oq1)104 static INLINE void filter4(int8_t mask, uint8_t thresh, uint8_t *op1,
105                            uint8_t *op0, uint8_t *oq0, uint8_t *oq1) {
106   int8_t filter1, filter2;
107 
108   const int8_t ps1 = (int8_t)(*op1 ^ 0x80);
109   const int8_t ps0 = (int8_t)(*op0 ^ 0x80);
110   const int8_t qs0 = (int8_t)(*oq0 ^ 0x80);
111   const int8_t qs1 = (int8_t)(*oq1 ^ 0x80);
112   const int8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1);
113 
114   // add outer taps if we have high edge variance
115   int8_t filter = signed_char_clamp(ps1 - qs1) & hev;
116 
117   // inner taps
118   filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask;
119 
120   // save bottom 3 bits so that we round one side +4 and the other +3
121   // if it equals 4 we'll set to adjust by -1 to account for the fact
122   // we'd round 3 the other way
123   filter1 = signed_char_clamp(filter + 4) >> 3;
124   filter2 = signed_char_clamp(filter + 3) >> 3;
125 
126   *oq0 = (uint8_t)(signed_char_clamp(qs0 - filter1) ^ 0x80);
127   *op0 = (uint8_t)(signed_char_clamp(ps0 + filter2) ^ 0x80);
128 
129   // outer tap adjustments
130   filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev;
131 
132   *oq1 = (uint8_t)(signed_char_clamp(qs1 - filter) ^ 0x80);
133   *op1 = (uint8_t)(signed_char_clamp(ps1 + filter) ^ 0x80);
134 }
135 
// Runs the 4-sample filter on 4 consecutive columns. For each column, s
// addresses q0; p1/p0 sit at -2 * p and -p, and q1 at +p.
void aom_lpf_horizontal_4_c(uint8_t *s, int p /* pitch */,
                            const uint8_t *blimit, const uint8_t *limit,
                            const uint8_t *thresh) {
  int col;

  // The filter is designed to work on chars so that maximum use can be made
  // of 8 bit simd instructions.
  for (col = 0; col < 4; ++col) {
    uint8_t *const px = s + col;
    const uint8_t p1 = px[-2 * p], p0 = px[-p];
    const uint8_t q0 = px[0], q1 = px[p];
    const int8_t mask = filter_mask2(*limit, *blimit, p1, p0, q0, q1);
    filter4(mask, *thresh, px - 2 * p, px - p, px, px + p);
  }
}
152 
// Filters two adjacent 4-column groups: the one at s with the first set of
// thresholds and the one at s + 4 with the second set.
void aom_lpf_horizontal_4_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
                                 const uint8_t *limit0, const uint8_t *thresh0,
                                 const uint8_t *blimit1, const uint8_t *limit1,
                                 const uint8_t *thresh1) {
  uint8_t *const second_group = s + 4;
  aom_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0);
  aom_lpf_horizontal_4_c(second_group, p, blimit1, limit1, thresh1);
}
160 
// Filters four adjacent 4-column groups (s, s + 4, s + 8, s + 12), all with
// the same thresholds.
void aom_lpf_horizontal_4_quad_c(uint8_t *s, int p, const uint8_t *blimit0,
                                 const uint8_t *limit0,
                                 const uint8_t *thresh0) {
  int group;
  for (group = 0; group < 4; ++group) {
    aom_lpf_horizontal_4_c(s + 4 * group, p, blimit0, limit0, thresh0);
  }
}
169 
// Runs the 4-sample filter on 4 consecutive rows. In each row, s addresses
// q0; p1/p0 are the two samples before it and q1 the sample after it.
void aom_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit,
                          const uint8_t *limit, const uint8_t *thresh) {
  uint8_t *row = s;
  int rows_left;

  // The filter is designed to work on chars so that maximum use can be made
  // of 8 bit simd instructions.
  for (rows_left = 4; rows_left > 0; --rows_left) {
    const uint8_t p1 = row[-2], p0 = row[-1];
    const uint8_t q0 = row[0], q1 = row[1];
    const int8_t mask = filter_mask2(*limit, *blimit, p1, p0, q0, q1);
    filter4(mask, *thresh, row - 2, row - 1, row, row + 1);
    row += pitch;
  }
}
185 
// Filters two stacked 4-row groups: the one at s with the first set of
// thresholds and the one starting 4 rows later with the second set.
void aom_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
                               const uint8_t *limit0, const uint8_t *thresh0,
                               const uint8_t *blimit1, const uint8_t *limit1,
                               const uint8_t *thresh1) {
  uint8_t *const second_group = s + 4 * pitch;
  aom_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0);
  aom_lpf_vertical_4_c(second_group, pitch, blimit1, limit1, thresh1);
}
193 
// Filters four stacked 4-row groups (starting at rows 0, 4, 8 and 12 from
// s), all with the same thresholds.
void aom_lpf_vertical_4_quad_c(uint8_t *s, int pitch, const uint8_t *blimit0,
                               const uint8_t *limit0, const uint8_t *thresh0) {
  int group;
  for (group = 0; group < 4; ++group) {
    aom_lpf_vertical_4_c(s + 4 * group * pitch, pitch, blimit0, limit0,
                         thresh0);
  }
}
201 
filter6(int8_t mask,uint8_t thresh,int8_t flat,uint8_t * op2,uint8_t * op1,uint8_t * op0,uint8_t * oq0,uint8_t * oq1,uint8_t * oq2)202 static INLINE void filter6(int8_t mask, uint8_t thresh, int8_t flat,
203                            uint8_t *op2, uint8_t *op1, uint8_t *op0,
204                            uint8_t *oq0, uint8_t *oq1, uint8_t *oq2) {
205   if (flat && mask) {
206     const uint8_t p2 = *op2, p1 = *op1, p0 = *op0;
207     const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2;
208 
209     // 5-tap filter [1, 2, 2, 2, 1]
210     *op1 = ROUND_POWER_OF_TWO(p2 * 3 + p1 * 2 + p0 * 2 + q0, 3);
211     *op0 = ROUND_POWER_OF_TWO(p2 + p1 * 2 + p0 * 2 + q0 * 2 + q1, 3);
212     *oq0 = ROUND_POWER_OF_TWO(p1 + p0 * 2 + q0 * 2 + q1 * 2 + q2, 3);
213     *oq1 = ROUND_POWER_OF_TWO(p0 + q0 * 2 + q1 * 2 + q2 * 3, 3);
214   } else {
215     filter4(mask, thresh, op1, op0, oq0, oq1);
216   }
217 }
218 
filter8(int8_t mask,uint8_t thresh,int8_t flat,uint8_t * op3,uint8_t * op2,uint8_t * op1,uint8_t * op0,uint8_t * oq0,uint8_t * oq1,uint8_t * oq2,uint8_t * oq3)219 static INLINE void filter8(int8_t mask, uint8_t thresh, int8_t flat,
220                            uint8_t *op3, uint8_t *op2, uint8_t *op1,
221                            uint8_t *op0, uint8_t *oq0, uint8_t *oq1,
222                            uint8_t *oq2, uint8_t *oq3) {
223   if (flat && mask) {
224     const uint8_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
225     const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
226 
227     // 7-tap filter [1, 1, 1, 2, 1, 1, 1]
228     *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0, 3);
229     *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1, 3);
230     *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2, 3);
231     *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3, 3);
232     *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3, 3);
233     *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3, 3);
234   } else {
235     filter4(mask, thresh, op1, op0, oq0, oq1);
236   }
237 }
238 
// Runs the 6-sample filter on 4 consecutive columns. For each column, s
// addresses q0; p2..p0 sit at -3 * p .. -p and q1, q2 at +p, +2 * p. The
// flatness test uses a fixed threshold of 1.
void aom_lpf_horizontal_6_c(uint8_t *s, int p, const uint8_t *blimit,
                            const uint8_t *limit, const uint8_t *thresh) {
  uint8_t *const end = s + 4;

  // The filter is designed to work on chars so that maximum use can be made
  // of 8 bit simd instructions.
  for (; s < end; ++s) {
    const uint8_t p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
    const uint8_t q0 = s[0], q1 = s[p], q2 = s[2 * p];
    const int8_t flat = flat_mask3_chroma(1, p2, p1, p0, q0, q1, q2);
    const int8_t mask =
        filter_mask3_chroma(*limit, *blimit, p2, p1, p0, q0, q1, q2);

    filter6(mask, *thresh, flat, s - 3 * p, s - 2 * p, s - p, s, s + p,
            s + 2 * p);
  }
}
258 
// Filters two adjacent 4-column groups: the one at s with the first set of
// thresholds and the one at s + 4 with the second set.
void aom_lpf_horizontal_6_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
                                 const uint8_t *limit0, const uint8_t *thresh0,
                                 const uint8_t *blimit1, const uint8_t *limit1,
                                 const uint8_t *thresh1) {
  uint8_t *const second_group = s + 4;
  aom_lpf_horizontal_6_c(s, p, blimit0, limit0, thresh0);
  aom_lpf_horizontal_6_c(second_group, p, blimit1, limit1, thresh1);
}
266 
// Filters four adjacent 4-column groups (s, s + 4, s + 8, s + 12), all with
// the same thresholds.
void aom_lpf_horizontal_6_quad_c(uint8_t *s, int p, const uint8_t *blimit0,
                                 const uint8_t *limit0,
                                 const uint8_t *thresh0) {
  int group;
  for (group = 0; group < 4; ++group) {
    aom_lpf_horizontal_6_c(s + 4 * group, p, blimit0, limit0, thresh0);
  }
}
275 
// Runs the 8-sample filter on 4 consecutive columns. For each column, s
// addresses q0; p3..p0 sit at -4 * p .. -p and q1..q3 at +p .. +3 * p. The
// flatness test uses a fixed threshold of 1.
void aom_lpf_horizontal_8_c(uint8_t *s, int p, const uint8_t *blimit,
                            const uint8_t *limit, const uint8_t *thresh) {
  int col;

  // The filter is designed to work on chars so that maximum use can be made
  // of 8 bit simd instructions.
  for (col = 0; col < 4; ++col) {
    uint8_t *const px = s + col;
    const uint8_t p3 = px[-4 * p], p2 = px[-3 * p], p1 = px[-2 * p],
                  p0 = px[-p];
    const uint8_t q0 = px[0], q1 = px[p], q2 = px[2 * p], q3 = px[3 * p];
    const int8_t mask =
        filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
    const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);

    filter8(mask, *thresh, flat, px - 4 * p, px - 3 * p, px - 2 * p, px - p,
            px, px + p, px + 2 * p, px + 3 * p);
  }
}
295 
// Filters two adjacent 4-column groups: the one at s with the first set of
// thresholds and the one at s + 4 with the second set.
void aom_lpf_horizontal_8_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
                                 const uint8_t *limit0, const uint8_t *thresh0,
                                 const uint8_t *blimit1, const uint8_t *limit1,
                                 const uint8_t *thresh1) {
  uint8_t *const second_group = s + 4;
  aom_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0);
  aom_lpf_horizontal_8_c(second_group, p, blimit1, limit1, thresh1);
}
303 
// Filters four adjacent 4-column groups (s, s + 4, s + 8, s + 12), all with
// the same thresholds.
void aom_lpf_horizontal_8_quad_c(uint8_t *s, int p, const uint8_t *blimit0,
                                 const uint8_t *limit0,
                                 const uint8_t *thresh0) {
  int group;
  for (group = 0; group < 4; ++group) {
    aom_lpf_horizontal_8_c(s + 4 * group, p, blimit0, limit0, thresh0);
  }
}
312 
// Runs the 6-sample filter on 4 consecutive rows. In each row, s addresses
// q0; p2..p0 are the three samples before it and q1, q2 the two after it.
// The flatness test uses a fixed threshold of 1.
void aom_lpf_vertical_6_c(uint8_t *s, int pitch, const uint8_t *blimit,
                          const uint8_t *limit, const uint8_t *thresh) {
  uint8_t *row = s;
  int rows_left;

  for (rows_left = 4; rows_left > 0; --rows_left) {
    const uint8_t p2 = row[-3], p1 = row[-2], p0 = row[-1];
    const uint8_t q0 = row[0], q1 = row[1], q2 = row[2];
    const int8_t flat = flat_mask3_chroma(1, p2, p1, p0, q0, q1, q2);
    const int8_t mask =
        filter_mask3_chroma(*limit, *blimit, p2, p1, p0, q0, q1, q2);

    filter6(mask, *thresh, flat, row - 3, row - 2, row - 1, row, row + 1,
            row + 2);
    row += pitch;
  }
}
328 
// Filters two stacked 4-row groups: the one at s with the first set of
// thresholds and the one starting 4 rows later with the second set.
void aom_lpf_vertical_6_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
                               const uint8_t *limit0, const uint8_t *thresh0,
                               const uint8_t *blimit1, const uint8_t *limit1,
                               const uint8_t *thresh1) {
  uint8_t *const second_group = s + 4 * pitch;
  aom_lpf_vertical_6_c(s, pitch, blimit0, limit0, thresh0);
  aom_lpf_vertical_6_c(second_group, pitch, blimit1, limit1, thresh1);
}
336 
// Filters four stacked 4-row groups (starting at rows 0, 4, 8 and 12 from
// s), all with the same thresholds.
void aom_lpf_vertical_6_quad_c(uint8_t *s, int pitch, const uint8_t *blimit0,
                               const uint8_t *limit0, const uint8_t *thresh0) {
  int group;
  for (group = 0; group < 4; ++group) {
    aom_lpf_vertical_6_c(s + 4 * group * pitch, pitch, blimit0, limit0,
                         thresh0);
  }
}
344 
// Runs the 8-sample filter on 4 consecutive rows. In each row, s addresses
// q0; p3..p0 are the four samples before it and q1..q3 the three after it.
// The flatness test uses a fixed threshold of 1.
void aom_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit,
                          const uint8_t *limit, const uint8_t *thresh) {
  uint8_t *row = s;
  int rows_left;

  for (rows_left = 4; rows_left > 0; --rows_left) {
    const uint8_t p3 = row[-4], p2 = row[-3], p1 = row[-2], p0 = row[-1];
    const uint8_t q0 = row[0], q1 = row[1], q2 = row[2], q3 = row[3];
    const int8_t mask =
        filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
    const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);

    filter8(mask, *thresh, flat, row - 4, row - 3, row - 2, row - 1, row,
            row + 1, row + 2, row + 3);
    row += pitch;
  }
}
361 
// Filters two stacked 4-row groups: the one at s with the first set of
// thresholds and the one starting 4 rows later with the second set.
void aom_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
                               const uint8_t *limit0, const uint8_t *thresh0,
                               const uint8_t *blimit1, const uint8_t *limit1,
                               const uint8_t *thresh1) {
  uint8_t *const second_group = s + 4 * pitch;
  aom_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0);
  aom_lpf_vertical_8_c(second_group, pitch, blimit1, limit1, thresh1);
}
369 
// Filters four stacked 4-row groups (starting at rows 0, 4, 8 and 12 from
// s), all with the same thresholds.
void aom_lpf_vertical_8_quad_c(uint8_t *s, int pitch, const uint8_t *blimit0,
                               const uint8_t *limit0, const uint8_t *thresh0) {
  int group;
  for (group = 0; group < 4; ++group) {
    aom_lpf_vertical_8_c(s + 4 * group * pitch, pitch, blimit0, limit0,
                         thresh0);
  }
}
377 
filter14(int8_t mask,uint8_t thresh,int8_t flat,int8_t flat2,uint8_t * op6,uint8_t * op5,uint8_t * op4,uint8_t * op3,uint8_t * op2,uint8_t * op1,uint8_t * op0,uint8_t * oq0,uint8_t * oq1,uint8_t * oq2,uint8_t * oq3,uint8_t * oq4,uint8_t * oq5,uint8_t * oq6)378 static INLINE void filter14(int8_t mask, uint8_t thresh, int8_t flat,
379                             int8_t flat2, uint8_t *op6, uint8_t *op5,
380                             uint8_t *op4, uint8_t *op3, uint8_t *op2,
381                             uint8_t *op1, uint8_t *op0, uint8_t *oq0,
382                             uint8_t *oq1, uint8_t *oq2, uint8_t *oq3,
383                             uint8_t *oq4, uint8_t *oq5, uint8_t *oq6) {
384   if (flat2 && flat && mask) {
385     const uint8_t p6 = *op6, p5 = *op5, p4 = *op4, p3 = *op3, p2 = *op2,
386                   p1 = *op1, p0 = *op0;
387     const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3, q4 = *oq4,
388                   q5 = *oq5, q6 = *oq6;
389 
390     // 13-tap filter [1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1]
391     *op5 = ROUND_POWER_OF_TWO(p6 * 7 + p5 * 2 + p4 * 2 + p3 + p2 + p1 + p0 + q0,
392                               4);
393     *op4 = ROUND_POWER_OF_TWO(
394         p6 * 5 + p5 * 2 + p4 * 2 + p3 * 2 + p2 + p1 + p0 + q0 + q1, 4);
395     *op3 = ROUND_POWER_OF_TWO(
396         p6 * 4 + p5 + p4 * 2 + p3 * 2 + p2 * 2 + p1 + p0 + q0 + q1 + q2, 4);
397     *op2 = ROUND_POWER_OF_TWO(
398         p6 * 3 + p5 + p4 + p3 * 2 + p2 * 2 + p1 * 2 + p0 + q0 + q1 + q2 + q3,
399         4);
400     *op1 = ROUND_POWER_OF_TWO(p6 * 2 + p5 + p4 + p3 + p2 * 2 + p1 * 2 + p0 * 2 +
401                                   q0 + q1 + q2 + q3 + q4,
402                               4);
403     *op0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 * 2 +
404                                   q0 * 2 + q1 + q2 + q3 + q4 + q5,
405                               4);
406     *oq0 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 * 2 +
407                                   q1 * 2 + q2 + q3 + q4 + q5 + q6,
408                               4);
409     *oq1 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 * 2 +
410                                   q2 * 2 + q3 + q4 + q5 + q6 * 2,
411                               4);
412     *oq2 = ROUND_POWER_OF_TWO(
413         p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 * 2 + q3 * 2 + q4 + q5 + q6 * 3,
414         4);
415     *oq3 = ROUND_POWER_OF_TWO(
416         p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 * 2 + q4 * 2 + q5 + q6 * 4, 4);
417     *oq4 = ROUND_POWER_OF_TWO(
418         p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 * 2 + q5 * 2 + q6 * 5, 4);
419     *oq5 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 * 2 + q6 * 7,
420                               4);
421   } else {
422     filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3);
423   }
424 }
425 
// Runs the 14-sample filter on 4 * count consecutive columns. For each
// column, s addresses q0; p6..p0 sit at -7 * p .. -p and q1..q6 at
// +p .. +6 * p. Both flatness tests use a fixed threshold of 1.
static void mb_lpf_horizontal_edge_w(uint8_t *s, int p, const uint8_t *blimit,
                                     const uint8_t *limit,
                                     const uint8_t *thresh, int count) {
  const int columns = 4 * count;
  int col;

  // The filter is designed to work on chars so that maximum use can be made
  // of 8 bit simd instructions.
  for (col = 0; col < columns; ++col) {
    uint8_t *const px = s + col;
    const uint8_t p6 = px[-7 * p], p5 = px[-6 * p], p4 = px[-5 * p],
                  p3 = px[-4 * p], p2 = px[-3 * p], p1 = px[-2 * p],
                  p0 = px[-p];
    const uint8_t q0 = px[0], q1 = px[p], q2 = px[2 * p], q3 = px[3 * p],
                  q4 = px[4 * p], q5 = px[5 * p], q6 = px[6 * p];
    const int8_t mask =
        filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
    // flat looks at the inner four samples per side, flat2 at the outer
    // three compared against p0 / q0.
    const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
    const int8_t flat2 = flat_mask4(1, p6, p5, p4, p0, q0, q4, q5, q6);

    filter14(mask, *thresh, flat, flat2, px - 7 * p, px - 6 * p, px - 5 * p,
             px - 4 * p, px - 3 * p, px - 2 * p, px - p, px, px + p,
             px + 2 * p, px + 3 * p, px + 4 * p, px + 5 * p, px + 6 * p);
  }
}
450 
// 14-sample filter for a single 4-column group starting at s.
void aom_lpf_horizontal_14_c(uint8_t *s, int p, const uint8_t *blimit,
                             const uint8_t *limit, const uint8_t *thresh) {
  const int groups = 1;
  mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, groups);
}
455 
// Filters two adjacent 4-column groups: the one at s with the first set of
// thresholds and the one at s + 4 with the second set.
void aom_lpf_horizontal_14_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
                                  const uint8_t *limit0, const uint8_t *thresh0,
                                  const uint8_t *blimit1, const uint8_t *limit1,
                                  const uint8_t *thresh1) {
  uint8_t *const second_group = s + 4;
  mb_lpf_horizontal_edge_w(s, p, blimit0, limit0, thresh0, 1);
  mb_lpf_horizontal_edge_w(second_group, p, blimit1, limit1, thresh1, 1);
}
463 
// Filters four adjacent 4-column groups (s, s + 4, s + 8, s + 12), all with
// the same thresholds.
void aom_lpf_horizontal_14_quad_c(uint8_t *s, int p, const uint8_t *blimit0,
                                  const uint8_t *limit0,
                                  const uint8_t *thresh0) {
  int group;
  for (group = 0; group < 4; ++group) {
    mb_lpf_horizontal_edge_w(s + 4 * group, p, blimit0, limit0, thresh0, 1);
  }
}
472 
// Runs the 14-sample filter on count consecutive rows, stepping by p between
// rows. In each row, s addresses q0; p6..p0 are the seven samples before it
// and q1..q6 the six after it. Both flatness tests use a fixed threshold
// of 1.
static void mb_lpf_vertical_edge_w(uint8_t *s, int p, const uint8_t *blimit,
                                   const uint8_t *limit, const uint8_t *thresh,
                                   int count) {
  uint8_t *row = s;
  int r;

  for (r = 0; r < count; ++r) {
    const uint8_t p6 = row[-7], p5 = row[-6], p4 = row[-5], p3 = row[-4],
                  p2 = row[-3], p1 = row[-2], p0 = row[-1];
    const uint8_t q0 = row[0], q1 = row[1], q2 = row[2], q3 = row[3],
                  q4 = row[4], q5 = row[5], q6 = row[6];
    const int8_t mask =
        filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
    // flat looks at the inner four samples per side, flat2 at the outer
    // three compared against p0 / q0.
    const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
    const int8_t flat2 = flat_mask4(1, p6, p5, p4, p0, q0, q4, q5, q6);

    filter14(mask, *thresh, flat, flat2, row - 7, row - 6, row - 5, row - 4,
             row - 3, row - 2, row - 1, row, row + 1, row + 2, row + 3,
             row + 4, row + 5, row + 6);
    row += p;
  }
}
493 
// 14-sample filter for the 4 rows starting at s.
void aom_lpf_vertical_14_c(uint8_t *s, int p, const uint8_t *blimit,
                           const uint8_t *limit, const uint8_t *thresh) {
  const int rows = 4;
  mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, rows);
}
498 
// Filters two stacked 4-row groups: the one at s with the first set of
// thresholds and the one starting 4 rows later with the second set.
void aom_lpf_vertical_14_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
                                const uint8_t *limit0, const uint8_t *thresh0,
                                const uint8_t *blimit1, const uint8_t *limit1,
                                const uint8_t *thresh1) {
  uint8_t *const second_group = s + 4 * pitch;
  mb_lpf_vertical_edge_w(s, pitch, blimit0, limit0, thresh0, 4);
  mb_lpf_vertical_edge_w(second_group, pitch, blimit1, limit1, thresh1, 4);
}
506 
// Filters four stacked 4-row groups (starting at rows 0, 4, 8 and 12 from
// s), all with the same thresholds.
void aom_lpf_vertical_14_quad_c(uint8_t *s, int pitch, const uint8_t *blimit0,
                                const uint8_t *limit0, const uint8_t *thresh0) {
  int group;
  for (group = 0; group < 4; ++group) {
    mb_lpf_vertical_edge_w(s + 4 * group * pitch, pitch, blimit0, limit0,
                           thresh0, 4);
  }
}
514 
515 #if CONFIG_AV1_HIGHBITDEPTH
// High bit depth filter-enable mask over two samples per side.
// limit and blimit are shifted left by (bd - 8) before comparison. Returns -1
// (all bits set) when every checked difference is within its limit and 0
// otherwise.
static INLINE int8_t highbd_filter_mask2(uint8_t limit, uint8_t blimit,
                                         uint16_t p1, uint16_t p0, uint16_t q0,
                                         uint16_t q1, int bd) {
  const int16_t side_limit = (uint16_t)limit << (bd - 8);
  const int16_t edge_limit = (uint16_t)blimit << (bd - 8);
  const int side_step_exceeded =
      abs(p1 - p0) > side_limit || abs(q1 - q0) > side_limit;
  const int edge_step_exceeded =
      abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > edge_limit;
  return (side_step_exceeded || edge_step_exceeded) ? 0 : -1;
}
528 
// High bit depth filter-enable mask over four samples per side.
// limit and blimit are shifted left by (bd - 8) before comparison. Returns -1
// (all bits set) when every checked difference is within its limit and 0
// otherwise.
static INLINE int8_t highbd_filter_mask(uint8_t limit, uint8_t blimit,
                                        uint16_t p3, uint16_t p2, uint16_t p1,
                                        uint16_t p0, uint16_t q0, uint16_t q1,
                                        uint16_t q2, uint16_t q3, int bd) {
  const int16_t side_limit = (uint16_t)limit << (bd - 8);
  const int16_t edge_limit = (uint16_t)blimit << (bd - 8);
  int over_limit = 0;
  over_limit |= abs(p3 - p2) > side_limit;
  over_limit |= abs(p2 - p1) > side_limit;
  over_limit |= abs(p1 - p0) > side_limit;
  over_limit |= abs(q1 - q0) > side_limit;
  over_limit |= abs(q2 - q1) > side_limit;
  over_limit |= abs(q3 - q2) > side_limit;
  over_limit |= abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > edge_limit;
  return over_limit ? 0 : -1;
}
546 
// High bit depth filter-enable mask over three samples per side.
// limit and blimit are shifted left by (bd - 8) before comparison. Returns -1
// (all bits set) when every checked difference is within its limit and 0 as
// soon as one of them is exceeded.
static INLINE int8_t highbd_filter_mask3_chroma(uint8_t limit, uint8_t blimit,
                                                uint16_t p2, uint16_t p1,
                                                uint16_t p0, uint16_t q0,
                                                uint16_t q1, uint16_t q2,
                                                int bd) {
  const int16_t side_limit = (uint16_t)limit << (bd - 8);
  const int16_t edge_limit = (uint16_t)blimit << (bd - 8);
  if (abs(p2 - p1) > side_limit || abs(p1 - p0) > side_limit) {
    return 0;
  }
  if (abs(q1 - q0) > side_limit || abs(q2 - q1) > side_limit) {
    return 0;
  }
  if (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > edge_limit) {
    return 0;
  }
  return -1;
}
562 
// High bit depth flatness test over three samples per side.
// thresh is shifted left by (bd - 8) before comparison. Returns -1 (all bits
// set) when p1 and p2 are within the scaled threshold of p0 and q1 and q2
// are within it of q0; returns 0 otherwise.
static INLINE int8_t highbd_flat_mask3_chroma(uint8_t thresh, uint16_t p2,
                                              uint16_t p1, uint16_t p0,
                                              uint16_t q0, uint16_t q1,
                                              uint16_t q2, int bd) {
  const int16_t scaled = (uint16_t)thresh << (bd - 8);
  const int p_side_flat = abs(p1 - p0) <= scaled && abs(p2 - p0) <= scaled;
  const int q_side_flat = abs(q1 - q0) <= scaled && abs(q2 - q0) <= scaled;
  return (p_side_flat && q_side_flat) ? -1 : 0;
}
575 
// Flatness test over eight samples (p3..q3).
// Returns -1 (all bits set) when p1, p2 and p3 are each within the scaled
// threshold of p0, and q1, q2 and q3 are each within it of q0; returns 0
// otherwise. p0 and q0 are not compared with each other. `thresh` is shifted
// left by (bd - 8) before comparing.
static INLINE int8_t highbd_flat_mask4(uint8_t thresh, uint16_t p3, uint16_t p2,
                                       uint16_t p1, uint16_t p0, uint16_t q0,
                                       uint16_t q1, uint16_t q2, uint16_t q3,
                                       int bd) {
  const int16_t tolerance = (int16_t)((uint16_t)thresh << (bd - 8));

  const int not_flat = abs(p1 - p0) > tolerance || abs(q1 - q0) > tolerance ||
                       abs(p2 - p0) > tolerance || abs(q2 - q0) > tolerance ||
                       abs(p3 - p0) > tolerance || abs(q3 - q0) > tolerance;

  return not_flat ? 0 : -1;
}
590 
// High-edge-variance test on the two samples nearest each side of the edge.
// Returns -1 (all 16 bits set) when |p1 - p0| or |q1 - q0| exceeds `thresh`
// shifted left by (bd - 8); returns 0 otherwise.
static INLINE int16_t highbd_hev_mask(uint8_t thresh, uint16_t p1, uint16_t p0,
                                      uint16_t q0, uint16_t q1, int bd) {
  const int16_t tolerance = (int16_t)((uint16_t)thresh << (bd - 8));

  const int high_variance =
      abs(p1 - p0) > tolerance || abs(q1 - q0) > tolerance;

  return high_variance ? -1 : 0;
}
601 
highbd_filter4(int8_t mask,uint8_t thresh,uint16_t * op1,uint16_t * op0,uint16_t * oq0,uint16_t * oq1,int bd)602 static INLINE void highbd_filter4(int8_t mask, uint8_t thresh, uint16_t *op1,
603                                   uint16_t *op0, uint16_t *oq0, uint16_t *oq1,
604                                   int bd) {
605   int16_t filter1, filter2;
606   // ^0x80 equivalent to subtracting 0x80 from the values to turn them
607   // into -128 to +127 instead of 0 to 255.
608   int shift = bd - 8;
609   const int16_t ps1 = (int16_t)*op1 - (0x80 << shift);
610   const int16_t ps0 = (int16_t)*op0 - (0x80 << shift);
611   const int16_t qs0 = (int16_t)*oq0 - (0x80 << shift);
612   const int16_t qs1 = (int16_t)*oq1 - (0x80 << shift);
613   const int16_t hev = highbd_hev_mask(thresh, *op1, *op0, *oq0, *oq1, bd);
614 
615   // Add outer taps if we have high edge variance.
616   int16_t filter = signed_char_clamp_high(ps1 - qs1, bd) & hev;
617 
618   // Inner taps.
619   filter = signed_char_clamp_high(filter + 3 * (qs0 - ps0), bd) & mask;
620 
621   // Save bottom 3 bits so that we round one side +4 and the other +3
622   // if it equals 4 we'll set to adjust by -1 to account for the fact
623   // we'd round 3 the other way.
624   filter1 = signed_char_clamp_high(filter + 4, bd) >> 3;
625   filter2 = signed_char_clamp_high(filter + 3, bd) >> 3;
626 
627   *oq0 = signed_char_clamp_high(qs0 - filter1, bd) + (0x80 << shift);
628   *op0 = signed_char_clamp_high(ps0 + filter2, bd) + (0x80 << shift);
629 
630   // Outer tap adjustments.
631   filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev;
632 
633   *oq1 = signed_char_clamp_high(qs1 - filter, bd) + (0x80 << shift);
634   *op1 = signed_char_clamp_high(ps1 + filter, bd) + (0x80 << shift);
635 }
636 
// Runs the 4-sample high-bitdepth filter across a horizontal edge for four
// adjacent columns. `s` points at the first q0 sample (the first row past the
// edge) and `p` is the distance between rows, in samples. For each column the
// samples p1, p0 | q0, q1 are read, a filter mask is derived from
// *limit / *blimit, and highbd_filter4() updates the four samples in place.
void aom_highbd_lpf_horizontal_4_c(uint16_t *s, int p /* pitch */,
                                   const uint8_t *blimit, const uint8_t *limit,
                                   const uint8_t *thresh, int bd) {
  for (int col = 0; col < 4; ++col) {
    uint16_t *const px = s + col;
    const uint16_t p1 = px[-2 * p];
    const uint16_t p0 = px[-p];
    const uint16_t q0 = px[0];
    const uint16_t q1 = px[p];
    const int8_t mask =
        highbd_filter_mask2(*limit, *blimit, p1, p0, q0, q1, bd);

    highbd_filter4(mask, *thresh, px - 2 * p, px - p, px, px + p, bd);
  }
}
656 
// Filters two side-by-side 4-column segments of a horizontal edge: the first
// starts at `s` and uses the *0 thresholds, the second starts 4 samples to
// the right and uses the *1 thresholds.
void aom_highbd_lpf_horizontal_4_dual_c(
    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
    const uint8_t *thresh1, int bd) {
  uint16_t *const second = s + 4;

  aom_highbd_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0, bd);
  aom_highbd_lpf_horizontal_4_c(second, p, blimit1, limit1, thresh1, bd);
}
664 
// Runs the 4-sample high-bitdepth filter across a vertical edge for four
// consecutive rows. `s` points at the first q0 sample (the first column past
// the edge) and `pitch` is the distance between rows, in samples. For each
// row the samples s[-2], s[-1] | s[0], s[1] are read, a filter mask is
// derived from *limit / *blimit, and highbd_filter4() updates them in place.
void aom_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit,
                                 const uint8_t *limit, const uint8_t *thresh,
                                 int bd) {
  uint16_t *px = s;

  for (int row = 0; row < 4; ++row, px += pitch) {
    const uint16_t p1 = px[-2];
    const uint16_t p0 = px[-1];
    const uint16_t q0 = px[0];
    const uint16_t q1 = px[1];
    const int8_t mask =
        highbd_filter_mask2(*limit, *blimit, p1, p0, q0, q1, bd);

    highbd_filter4(mask, *thresh, px - 2, px - 1, px, px + 1, bd);
  }
}
682 
// Filters two stacked 4-row segments of a vertical edge: the first starts at
// `s` and uses the *0 thresholds, the second starts 4 rows further down
// (4 * pitch samples) and uses the *1 thresholds.
void aom_highbd_lpf_vertical_4_dual_c(
    uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
    const uint8_t *thresh1, int bd) {
  uint16_t *const second = s + 4 * pitch;

  aom_highbd_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0, bd);
  aom_highbd_lpf_vertical_4_c(second, pitch, blimit1, limit1, thresh1, bd);
}
691 
highbd_filter6(int8_t mask,uint8_t thresh,int8_t flat,uint16_t * op2,uint16_t * op1,uint16_t * op0,uint16_t * oq0,uint16_t * oq1,uint16_t * oq2,int bd)692 static INLINE void highbd_filter6(int8_t mask, uint8_t thresh, int8_t flat,
693                                   uint16_t *op2, uint16_t *op1, uint16_t *op0,
694                                   uint16_t *oq0, uint16_t *oq1, uint16_t *oq2,
695                                   int bd) {
696   if (flat && mask) {
697     const uint16_t p2 = *op2, p1 = *op1, p0 = *op0;
698     const uint16_t q0 = *oq0, q1 = *oq1, q2 = *oq2;
699 
700     // 5-tap filter [1, 2, 2, 2, 1]
701     *op1 = ROUND_POWER_OF_TWO(p2 * 3 + p1 * 2 + p0 * 2 + q0, 3);
702     *op0 = ROUND_POWER_OF_TWO(p2 + p1 * 2 + p0 * 2 + q0 * 2 + q1, 3);
703     *oq0 = ROUND_POWER_OF_TWO(p1 + p0 * 2 + q0 * 2 + q1 * 2 + q2, 3);
704     *oq1 = ROUND_POWER_OF_TWO(p0 + q0 * 2 + q1 * 2 + q2 * 3, 3);
705   } else {
706     highbd_filter4(mask, thresh, op1, op0, oq0, oq1, bd);
707   }
708 }
709 
highbd_filter8(int8_t mask,uint8_t thresh,int8_t flat,uint16_t * op3,uint16_t * op2,uint16_t * op1,uint16_t * op0,uint16_t * oq0,uint16_t * oq1,uint16_t * oq2,uint16_t * oq3,int bd)710 static INLINE void highbd_filter8(int8_t mask, uint8_t thresh, int8_t flat,
711                                   uint16_t *op3, uint16_t *op2, uint16_t *op1,
712                                   uint16_t *op0, uint16_t *oq0, uint16_t *oq1,
713                                   uint16_t *oq2, uint16_t *oq3, int bd) {
714   if (flat && mask) {
715     const uint16_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
716     const uint16_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
717 
718     // 7-tap filter [1, 1, 1, 2, 1, 1, 1]
719     *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0, 3);
720     *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1, 3);
721     *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2, 3);
722     *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3, 3);
723     *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3, 3);
724     *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3, 3);
725   } else {
726     highbd_filter4(mask, thresh, op1, op0, oq0, oq1, bd);
727   }
728 }
729 
// Runs the 8-sample high-bitdepth filter across a horizontal edge for four
// adjacent columns. `s` points at the first q0 sample and `p` is the distance
// between rows, in samples. Per column, four samples on each side of the edge
// are read, the filter mask comes from *limit / *blimit, the flatness mask
// uses a fixed threshold of 1, and highbd_filter8() updates the samples in
// place.
void aom_highbd_lpf_horizontal_8_c(uint16_t *s, int p, const uint8_t *blimit,
                                   const uint8_t *limit, const uint8_t *thresh,
                                   int bd) {
  for (int col = 0; col < 4; ++col) {
    uint16_t *const px = s + col;
    const uint16_t p3 = px[-4 * p];
    const uint16_t p2 = px[-3 * p];
    const uint16_t p1 = px[-2 * p];
    const uint16_t p0 = px[-p];
    const uint16_t q0 = px[0];
    const uint16_t q1 = px[p];
    const uint16_t q2 = px[2 * p];
    const uint16_t q3 = px[3 * p];

    const int8_t mask =
        highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
    const int8_t flat =
        highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);

    highbd_filter8(mask, *thresh, flat, px - 4 * p, px - 3 * p, px - 2 * p,
                   px - p, px, px + p, px + 2 * p, px + 3 * p, bd);
  }
}
751 
// Runs the 6-sample high-bitdepth filter across a horizontal edge for four
// adjacent columns. `s` points at the first q0 sample and `p` is the distance
// between rows, in samples. Per column, three samples on each side of the
// edge are read, the filter mask comes from *limit / *blimit, the flatness
// mask uses a fixed threshold of 1, and highbd_filter6() updates the samples
// in place.
void aom_highbd_lpf_horizontal_6_c(uint16_t *s, int p, const uint8_t *blimit,
                                   const uint8_t *limit, const uint8_t *thresh,
                                   int bd) {
  for (int col = 0; col < 4; ++col) {
    uint16_t *const px = s + col;
    const uint16_t p2 = px[-3 * p];
    const uint16_t p1 = px[-2 * p];
    const uint16_t p0 = px[-p];
    const uint16_t q0 = px[0];
    const uint16_t q1 = px[p];
    const uint16_t q2 = px[2 * p];

    const int8_t mask =
        highbd_filter_mask3_chroma(*limit, *blimit, p2, p1, p0, q0, q1, q2, bd);
    const int8_t flat = highbd_flat_mask3_chroma(1, p2, p1, p0, q0, q1, q2, bd);

    highbd_filter6(mask, *thresh, flat, px - 3 * p, px - 2 * p, px - p, px,
                   px + p, px + 2 * p, bd);
  }
}
772 
// Filters two side-by-side 4-column segments of a horizontal edge with the
// 6-sample filter: the first starts at `s` and uses the *0 thresholds, the
// second starts 4 samples to the right and uses the *1 thresholds.
void aom_highbd_lpf_horizontal_6_dual_c(
    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
    const uint8_t *thresh1, int bd) {
  uint16_t *const second = s + 4;

  aom_highbd_lpf_horizontal_6_c(s, p, blimit0, limit0, thresh0, bd);
  aom_highbd_lpf_horizontal_6_c(second, p, blimit1, limit1, thresh1, bd);
}
780 
// Filters two side-by-side 4-column segments of a horizontal edge with the
// 8-sample filter: the first starts at `s` and uses the *0 thresholds, the
// second starts 4 samples to the right and uses the *1 thresholds.
void aom_highbd_lpf_horizontal_8_dual_c(
    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
    const uint8_t *thresh1, int bd) {
  uint16_t *const second = s + 4;

  aom_highbd_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0, bd);
  aom_highbd_lpf_horizontal_8_c(second, p, blimit1, limit1, thresh1, bd);
}
788 
// Runs the 6-sample high-bitdepth filter across a vertical edge for four
// consecutive rows. `s` points at the first q0 sample and `pitch` is the
// distance between rows, in samples. Per row, s[-3..-1] and s[0..2] are read,
// the filter mask comes from *limit / *blimit, the flatness mask uses a fixed
// threshold of 1, and highbd_filter6() updates the samples in place.
void aom_highbd_lpf_vertical_6_c(uint16_t *s, int pitch, const uint8_t *blimit,
                                 const uint8_t *limit, const uint8_t *thresh,
                                 int bd) {
  uint16_t *px = s;

  for (int row = 0; row < 4; ++row, px += pitch) {
    const uint16_t p2 = px[-3];
    const uint16_t p1 = px[-2];
    const uint16_t p0 = px[-1];
    const uint16_t q0 = px[0];
    const uint16_t q1 = px[1];
    const uint16_t q2 = px[2];

    const int8_t mask =
        highbd_filter_mask3_chroma(*limit, *blimit, p2, p1, p0, q0, q1, q2, bd);
    const int8_t flat = highbd_flat_mask3_chroma(1, p2, p1, p0, q0, q1, q2, bd);

    highbd_filter6(mask, *thresh, flat, px - 3, px - 2, px - 1, px, px + 1,
                   px + 2, bd);
  }
}
806 
// Filters two stacked 4-row segments of a vertical edge with the 6-sample
// filter: the first starts at `s` and uses the *0 thresholds, the second
// starts 4 rows further down (4 * pitch samples) and uses the *1 thresholds.
void aom_highbd_lpf_vertical_6_dual_c(
    uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
    const uint8_t *thresh1, int bd) {
  uint16_t *const second = s + 4 * pitch;

  aom_highbd_lpf_vertical_6_c(s, pitch, blimit0, limit0, thresh0, bd);
  aom_highbd_lpf_vertical_6_c(second, pitch, blimit1, limit1, thresh1, bd);
}
815 
// Runs the 8-sample high-bitdepth filter across a vertical edge for four
// consecutive rows. `s` points at the first q0 sample and `pitch` is the
// distance between rows, in samples. Per row, s[-4..-1] and s[0..3] are read,
// the filter mask comes from *limit / *blimit, the flatness mask uses a fixed
// threshold of 1, and highbd_filter8() updates the samples in place.
void aom_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit,
                                 const uint8_t *limit, const uint8_t *thresh,
                                 int bd) {
  uint16_t *px = s;

  for (int row = 0; row < 4; ++row, px += pitch) {
    const uint16_t p3 = px[-4];
    const uint16_t p2 = px[-3];
    const uint16_t p1 = px[-2];
    const uint16_t p0 = px[-1];
    const uint16_t q0 = px[0];
    const uint16_t q1 = px[1];
    const uint16_t q2 = px[2];
    const uint16_t q3 = px[3];

    const int8_t mask =
        highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
    const int8_t flat =
        highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);

    highbd_filter8(mask, *thresh, flat, px - 4, px - 3, px - 2, px - 1, px,
                   px + 1, px + 2, px + 3, bd);
  }
}
834 
// Filters two stacked 4-row segments of a vertical edge with the 8-sample
// filter: the first starts at `s` and uses the *0 thresholds, the second
// starts 4 rows further down (4 * pitch samples) and uses the *1 thresholds.
void aom_highbd_lpf_vertical_8_dual_c(
    uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
    const uint8_t *thresh1, int bd) {
  uint16_t *const second = s + 4 * pitch;

  aom_highbd_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0, bd);
  aom_highbd_lpf_vertical_8_c(second, pitch, blimit1, limit1, thresh1, bd);
}
843 
highbd_filter14(int8_t mask,uint8_t thresh,int8_t flat,int8_t flat2,uint16_t * op6,uint16_t * op5,uint16_t * op4,uint16_t * op3,uint16_t * op2,uint16_t * op1,uint16_t * op0,uint16_t * oq0,uint16_t * oq1,uint16_t * oq2,uint16_t * oq3,uint16_t * oq4,uint16_t * oq5,uint16_t * oq6,int bd)844 static INLINE void highbd_filter14(int8_t mask, uint8_t thresh, int8_t flat,
845                                    int8_t flat2, uint16_t *op6, uint16_t *op5,
846                                    uint16_t *op4, uint16_t *op3, uint16_t *op2,
847                                    uint16_t *op1, uint16_t *op0, uint16_t *oq0,
848                                    uint16_t *oq1, uint16_t *oq2, uint16_t *oq3,
849                                    uint16_t *oq4, uint16_t *oq5, uint16_t *oq6,
850                                    int bd) {
851   if (flat2 && flat && mask) {
852     const uint16_t p6 = *op6;
853     const uint16_t p5 = *op5;
854     const uint16_t p4 = *op4;
855     const uint16_t p3 = *op3;
856     const uint16_t p2 = *op2;
857     const uint16_t p1 = *op1;
858     const uint16_t p0 = *op0;
859     const uint16_t q0 = *oq0;
860     const uint16_t q1 = *oq1;
861     const uint16_t q2 = *oq2;
862     const uint16_t q3 = *oq3;
863     const uint16_t q4 = *oq4;
864     const uint16_t q5 = *oq5;
865     const uint16_t q6 = *oq6;
866 
867     // 13-tap filter [1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1]
868     *op5 = ROUND_POWER_OF_TWO(p6 * 7 + p5 * 2 + p4 * 2 + p3 + p2 + p1 + p0 + q0,
869                               4);
870     *op4 = ROUND_POWER_OF_TWO(
871         p6 * 5 + p5 * 2 + p4 * 2 + p3 * 2 + p2 + p1 + p0 + q0 + q1, 4);
872     *op3 = ROUND_POWER_OF_TWO(
873         p6 * 4 + p5 + p4 * 2 + p3 * 2 + p2 * 2 + p1 + p0 + q0 + q1 + q2, 4);
874     *op2 = ROUND_POWER_OF_TWO(
875         p6 * 3 + p5 + p4 + p3 * 2 + p2 * 2 + p1 * 2 + p0 + q0 + q1 + q2 + q3,
876         4);
877     *op1 = ROUND_POWER_OF_TWO(p6 * 2 + p5 + p4 + p3 + p2 * 2 + p1 * 2 + p0 * 2 +
878                                   q0 + q1 + q2 + q3 + q4,
879                               4);
880     *op0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 * 2 +
881                                   q0 * 2 + q1 + q2 + q3 + q4 + q5,
882                               4);
883     *oq0 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 * 2 +
884                                   q1 * 2 + q2 + q3 + q4 + q5 + q6,
885                               4);
886     *oq1 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 * 2 +
887                                   q2 * 2 + q3 + q4 + q5 + q6 * 2,
888                               4);
889     *oq2 = ROUND_POWER_OF_TWO(
890         p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 * 2 + q3 * 2 + q4 + q5 + q6 * 3,
891         4);
892     *oq3 = ROUND_POWER_OF_TWO(
893         p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 * 2 + q4 * 2 + q5 + q6 * 4, 4);
894     *oq4 = ROUND_POWER_OF_TWO(
895         p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 * 2 + q5 * 2 + q6 * 5, 4);
896     *oq5 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 * 2 + q6 * 7,
897                               4);
898   } else {
899     highbd_filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3,
900                    bd);
901   }
902 }
903 
// Applies the 14-sample high-bitdepth filter across a horizontal edge for
// 4 * count adjacent columns. `s` points at the first q0 sample and `p` is
// the distance between rows, in samples. Per column:
//   - mask  : filter-enable test on the inner eight samples (p3..q3)
//   - flat  : flatness of the inner eight samples, threshold 1
//   - flat2 : flatness of the outer samples (p6..p4, q4..q6) relative to
//             p0 / q0, threshold 1
// and highbd_filter14() then updates the column in place.
static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int p,
                                            const uint8_t *blimit,
                                            const uint8_t *limit,
                                            const uint8_t *thresh, int count,
                                            int bd) {
  const int columns = 4 * count;

  for (int col = 0; col < columns; ++col) {
    uint16_t *const px = s + col;
    const uint16_t p3 = px[-4 * p];
    const uint16_t p2 = px[-3 * p];
    const uint16_t p1 = px[-2 * p];
    const uint16_t p0 = px[-p];
    const uint16_t q0 = px[0];
    const uint16_t q1 = px[p];
    const uint16_t q2 = px[2 * p];
    const uint16_t q3 = px[3 * p];

    const int8_t mask =
        highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
    const int8_t flat =
        highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
    const int8_t flat2 =
        highbd_flat_mask4(1, px[-7 * p], px[-6 * p], px[-5 * p], p0, q0,
                          px[4 * p], px[5 * p], px[6 * p], bd);

    highbd_filter14(mask, *thresh, flat, flat2, px - 7 * p, px - 6 * p,
                    px - 5 * p, px - 4 * p, px - 3 * p, px - 2 * p, px - p, px,
                    px + p, px + 2 * p, px + 3 * p, px + 4 * p, px + 5 * p,
                    px + 6 * p, bd);
  }
}
938 
// Applies the 14-sample high-bitdepth filter across a horizontal edge for one
// 4-column segment starting at `s`.
void aom_highbd_lpf_horizontal_14_c(uint16_t *s, int pitch,
                                    const uint8_t *blimit, const uint8_t *limit,
                                    const uint8_t *thresh, int bd) {
  const int segments = 1;  // one segment == 4 columns in the edge helper

  highbd_mb_lpf_horizontal_edge_w(s, pitch, blimit, limit, thresh, segments,
                                  bd);
}
944 
// Filters two side-by-side 4-column segments of a horizontal edge with the
// 14-sample filter: the first starts at `s` and uses the *0 thresholds, the
// second starts 4 samples to the right and uses the *1 thresholds.
void aom_highbd_lpf_horizontal_14_dual_c(
    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
    const uint8_t *thresh1, int bd) {
  const int segments = 1;  // one segment == 4 columns in the edge helper
  uint16_t *const second = s + 4;

  highbd_mb_lpf_horizontal_edge_w(s, p, blimit0, limit0, thresh0, segments, bd);
  highbd_mb_lpf_horizontal_edge_w(second, p, blimit1, limit1, thresh1, segments,
                                  bd);
}
952 
// Applies the 14-sample high-bitdepth filter across a vertical edge for
// `count` consecutive rows. `s` points at the first q0 sample and `p` is the
// distance between rows, in samples. Per row:
//   - mask  : filter-enable test on the inner eight samples (s[-4]..s[3])
//   - flat  : flatness of the inner eight samples, threshold 1
//   - flat2 : flatness of the outer samples (s[-7..-5], s[4..6]) relative to
//             p0 / q0, threshold 1
// and highbd_filter14() then updates the row in place.
static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int p,
                                          const uint8_t *blimit,
                                          const uint8_t *limit,
                                          const uint8_t *thresh, int count,
                                          int bd) {
  uint16_t *px = s;

  for (int row = 0; row < count; ++row, px += p) {
    const uint16_t p3 = px[-4];
    const uint16_t p2 = px[-3];
    const uint16_t p1 = px[-2];
    const uint16_t p0 = px[-1];
    const uint16_t q0 = px[0];
    const uint16_t q1 = px[1];
    const uint16_t q2 = px[2];
    const uint16_t q3 = px[3];

    const int8_t mask =
        highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
    const int8_t flat =
        highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
    const int8_t flat2 = highbd_flat_mask4(1, px[-7], px[-6], px[-5], p0, q0,
                                           px[4], px[5], px[6], bd);

    highbd_filter14(mask, *thresh, flat, flat2, px - 7, px - 6, px - 5, px - 4,
                    px - 3, px - 2, px - 1, px, px + 1, px + 2, px + 3, px + 4,
                    px + 5, px + 6, bd);
  }
}
982 
// Applies the 14-sample high-bitdepth filter across a vertical edge for four
// consecutive rows starting at `s`; `p` is the row stride in samples.
void aom_highbd_lpf_vertical_14_c(uint16_t *s, int p, const uint8_t *blimit,
                                  const uint8_t *limit, const uint8_t *thresh,
                                  int bd) {
  const int rows = 4;

  highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, rows, bd);
}
988 
// Filters two stacked 4-row segments of a vertical edge with the 14-sample
// filter: the first starts at `s` and uses the *0 thresholds, the second
// starts 4 rows further down (4 * pitch samples) and uses the *1 thresholds.
void aom_highbd_lpf_vertical_14_dual_c(
    uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
    const uint8_t *thresh1, int bd) {
  const int rows = 4;
  uint16_t *const second = s + 4 * pitch;

  highbd_mb_lpf_vertical_edge_w(s, pitch, blimit0, limit0, thresh0, rows, bd);
  highbd_mb_lpf_vertical_edge_w(second, pitch, blimit1, limit1, thresh1, rows,
                                bd);
}
997 #endif  // CONFIG_AV1_HIGHBITDEPTH
998