• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 
12 #include <assert.h>
13 #include <string.h>
14 
15 #include "config/aom_dsp_rtcd.h"
16 #include "config/av1_rtcd.h"
17 
18 #include "av1/common/av1_common_int.h"
19 #include "av1/common/blockd.h"
20 #include "av1/common/convolve.h"
21 #include "av1/common/filter.h"
22 #include "av1/common/resize.h"
23 #include "aom_dsp/aom_dsp_common.h"
24 #include "aom_ports/mem.h"
25 
av1_convolve_horiz_rs_c(const uint8_t * src,int src_stride,uint8_t * dst,int dst_stride,int w,int h,const int16_t * x_filters,int x0_qn,int x_step_qn)26 void av1_convolve_horiz_rs_c(const uint8_t *src, int src_stride, uint8_t *dst,
27                              int dst_stride, int w, int h,
28                              const int16_t *x_filters, int x0_qn,
29                              int x_step_qn) {
30   src -= UPSCALE_NORMATIVE_TAPS / 2 - 1;
31   for (int y = 0; y < h; ++y) {
32     int x_qn = x0_qn;
33     for (int x = 0; x < w; ++x) {
34       const uint8_t *const src_x = &src[x_qn >> RS_SCALE_SUBPEL_BITS];
35       const int x_filter_idx =
36           (x_qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
37       assert(x_filter_idx <= RS_SUBPEL_MASK);
38       const int16_t *const x_filter =
39           &x_filters[x_filter_idx * UPSCALE_NORMATIVE_TAPS];
40       int sum = 0;
41       for (int k = 0; k < UPSCALE_NORMATIVE_TAPS; ++k)
42         sum += src_x[k] * x_filter[k];
43       dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
44       x_qn += x_step_qn;
45     }
46     src += src_stride;
47     dst += dst_stride;
48   }
49 }
50 
av1_highbd_convolve_horiz_rs_c(const uint16_t * src,int src_stride,uint16_t * dst,int dst_stride,int w,int h,const int16_t * x_filters,int x0_qn,int x_step_qn,int bd)51 void av1_highbd_convolve_horiz_rs_c(const uint16_t *src, int src_stride,
52                                     uint16_t *dst, int dst_stride, int w, int h,
53                                     const int16_t *x_filters, int x0_qn,
54                                     int x_step_qn, int bd) {
55   src -= UPSCALE_NORMATIVE_TAPS / 2 - 1;
56   for (int y = 0; y < h; ++y) {
57     int x_qn = x0_qn;
58     for (int x = 0; x < w; ++x) {
59       const uint16_t *const src_x = &src[x_qn >> RS_SCALE_SUBPEL_BITS];
60       const int x_filter_idx =
61           (x_qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
62       assert(x_filter_idx <= RS_SUBPEL_MASK);
63       const int16_t *const x_filter =
64           &x_filters[x_filter_idx * UPSCALE_NORMATIVE_TAPS];
65       int sum = 0;
66       for (int k = 0; k < UPSCALE_NORMATIVE_TAPS; ++k)
67         sum += src_x[k] * x_filter[k];
68       dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
69       x_qn += x_step_qn;
70     }
71     src += src_stride;
72     dst += dst_stride;
73   }
74 }
75 
// Separable 2D sub-pixel filter for an 8-bit w x h block.
//
// Pass 1 filters h + taps_y - 1 source rows horizontally with the kernel
// selected by subpel_x_qn, rounds by conv_params->round_0 and stores the
// result in a 16-bit scratch block. Pass 2 filters that scratch block
// vertically with the kernel selected by subpel_y_qn, rounds by
// conv_params->round_1, removes the bias added in both passes, applies the
// remaining rounding and writes clipped pixels to dst.
void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
                          int dst_stride, int w, int h,
                          const InterpFilterParams *filter_params_x,
                          const InterpFilterParams *filter_params_y,
                          const int subpel_x_qn, const int subpel_y_qn,
                          ConvolveParams *conv_params) {
  int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
  const int bd = 8;
  const int taps_x = filter_params_x->taps;
  const int taps_y = filter_params_y->taps;
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  const int im_h = h + taps_y - 1;
  const int im_stride = w;
  assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);
  const int fo_vert = taps_y / 2 - 1;
  const int fo_horiz = taps_x / 2 - 1;
  const int bits = FILTER_BITS * 2 - round_0 - round_1;

  // Pass 1: horizontal filter. Row r of the scratch block corresponds to
  // source row r - fo_vert.
  const int16_t *const kernel_x = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);
  for (int r = 0; r < im_h; ++r) {
    const uint8_t *const in_row = src + (r - fo_vert) * src_stride - fo_horiz;
    int16_t *const im_row = im_block + r * im_stride;
    for (int c = 0; c < w; ++c) {
      int32_t acc = (1 << (bd + FILTER_BITS - 1));
      for (int t = 0; t < taps_x; ++t) {
        acc += kernel_x[t] * in_row[c + t];
      }

      // TODO(aomedia:3393): for 12-tap filter, in extreme cases, the result can
      // be beyond the following range. For better prediction, a clamping can be
      // added for 12 tap filter to ensure the horizontal filtering result is
      // within 16 bit. The same applies to the vertical filtering.
      assert(taps_x > 8 || (0 <= acc && acc < (1 << (bd + FILTER_BITS + 1))));
      im_row[c] = (int16_t)ROUND_POWER_OF_TWO(acc, round_0);
    }
  }

  // Pass 2: vertical filter. Output row r reads scratch rows r .. r+taps_y-1.
  const int16_t *const kernel_y = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  // Combined bias contributed by both passes after rounding by round_1.
  const int bias = (1 << (offset_bits - round_1)) +
                   (1 << (offset_bits - round_1 - 1));
  for (int r = 0; r < h; ++r) {
    const int16_t *const im_top = im_block + r * im_stride;
    uint8_t *const out_row = dst + r * dst_stride;
    for (int c = 0; c < w; ++c) {
      int32_t acc = 1 << offset_bits;
      for (int t = 0; t < taps_y; ++t) {
        acc += kernel_y[t] * im_top[t * im_stride + c];
      }
      assert(taps_y > 8 || (0 <= acc && acc < (1 << (offset_bits + 2))));
      const int16_t res = ROUND_POWER_OF_TWO(acc, round_1) - bias;
      out_row[c] = clip_pixel(ROUND_POWER_OF_TWO(res, bits));
    }
  }
}
134 
av1_convolve_y_sr_c(const uint8_t * src,int src_stride,uint8_t * dst,int dst_stride,int w,int h,const InterpFilterParams * filter_params_y,const int subpel_y_qn)135 void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
136                          int dst_stride, int w, int h,
137                          const InterpFilterParams *filter_params_y,
138                          const int subpel_y_qn) {
139   const int fo_vert = filter_params_y->taps / 2 - 1;
140 
141   // vertical filter
142   const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
143       filter_params_y, subpel_y_qn & SUBPEL_MASK);
144   for (int y = 0; y < h; ++y) {
145     for (int x = 0; x < w; ++x) {
146       int32_t res = 0;
147       for (int k = 0; k < filter_params_y->taps; ++k) {
148         res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x];
149       }
150       dst[y * dst_stride + x] =
151           clip_pixel(ROUND_POWER_OF_TWO(res, FILTER_BITS));
152     }
153   }
154 }
155 
// Horizontal-only sub-pixel filter for an 8-bit w x h block.
//
// Each output pixel is the sum of `taps` horizontally adjacent source pixels
// (starting taps / 2 - 1 columns left of the output column) weighted by the
// kernel selected by subpel_x_qn. The sum is rounded in two steps, first by
// conv_params->round_0 and then by FILTER_BITS - round_0, and clipped.
void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
                         int dst_stride, int w, int h,
                         const InterpFilterParams *filter_params_x,
                         const int subpel_x_qn, ConvolveParams *conv_params) {
  const int taps = filter_params_x->taps;
  const int fo_horiz = taps / 2 - 1;
  const int round_0 = conv_params->round_0;
  const int bits = FILTER_BITS - round_0;

  assert(bits >= 0);
  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
         ((round_0 + conv_params->round_1) == 2 * FILTER_BITS));

  const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);

  for (int r = 0; r < h; ++r) {
    // Leftmost source sample used by output column 0 of this row.
    const uint8_t *const in_row = src + r * src_stride - fo_horiz;
    uint8_t *const out_row = dst + r * dst_stride;
    for (int c = 0; c < w; ++c) {
      int32_t acc = 0;
      for (int t = 0; t < taps; ++t) {
        acc += kernel[t] * in_row[c + t];
      }
      acc = ROUND_POWER_OF_TWO(acc, round_0);
      out_row[c] = clip_pixel(ROUND_POWER_OF_TWO(acc, bits));
    }
  }
}
182 
183 // This function is exactly the same as av1_convolve_2d_sr_c, and is an
184 // optimized version for intrabc. Use the following 2-tap filter:
185 // DECLARE_ALIGNED(256, static const int16_t,
186 //                 av1_intrabc_bilinear_filter[2 * SUBPEL_SHIFTS]) = {
187 //   128, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
188 //   64,  64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
189 // };
av1_convolve_2d_sr_intrabc_c(const uint8_t * src,int src_stride,uint8_t * dst,int dst_stride,int w,int h,const InterpFilterParams * filter_params_x,const InterpFilterParams * filter_params_y,const int subpel_x_qn,const int subpel_y_qn,ConvolveParams * conv_params)190 void av1_convolve_2d_sr_intrabc_c(const uint8_t *src, int src_stride,
191                                   uint8_t *dst, int dst_stride, int w, int h,
192                                   const InterpFilterParams *filter_params_x,
193                                   const InterpFilterParams *filter_params_y,
194                                   const int subpel_x_qn, const int subpel_y_qn,
195                                   ConvolveParams *conv_params) {
196   assert(subpel_x_qn == 8);
197   assert(subpel_y_qn == 8);
198   assert(filter_params_x->taps == 2 && filter_params_y->taps == 2);
199   assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS);
200   (void)filter_params_x;
201   (void)subpel_x_qn;
202   (void)filter_params_y;
203   (void)subpel_y_qn;
204   (void)conv_params;
205 
206   int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
207   int im_h = h + 1;
208   int im_stride = w;
209   assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);
210   const int bd = 8;
211 
212   // horizontal filter
213   // explicitly operate for subpel_x_qn = 8.
214   int16_t *im = im_block;
215   for (int y = 0; y < im_h; ++y) {
216     for (int x = 0; x < w; ++x) {
217       const int32_t sum = (1 << bd) + src[x] + src[x + 1];
218       assert(0 <= sum && sum < (1 << (bd + 2)));
219       im[x] = sum;
220     }
221     src += src_stride;
222     im += im_stride;
223   }
224 
225   // vertical filter
226   // explicitly operate for subpel_y_qn = 8.
227   int16_t *src_vert = im_block;
228   for (int y = 0; y < h; ++y) {
229     for (int x = 0; x < w; ++x) {
230       const int32_t sum =
231           (1 << (bd + 2)) + src_vert[x] + src_vert[im_stride + x];
232       assert(0 <= sum && sum < (1 << (bd + 4)));
233       const int16_t res =
234           ROUND_POWER_OF_TWO(sum, 2) - ((1 << bd) + (1 << (bd - 1)));
235       dst[x] = clip_pixel(res);
236     }
237     src_vert += im_stride;
238     dst += dst_stride;
239   }
240 }
241 
242 // This function is exactly the same as av1_convolve_y_sr_c, and is an
243 // optimized version for intrabc.
av1_convolve_y_sr_intrabc_c(const uint8_t * src,int src_stride,uint8_t * dst,int dst_stride,int w,int h,const InterpFilterParams * filter_params_y,const int subpel_y_qn)244 void av1_convolve_y_sr_intrabc_c(const uint8_t *src, int src_stride,
245                                  uint8_t *dst, int dst_stride, int w, int h,
246                                  const InterpFilterParams *filter_params_y,
247                                  const int subpel_y_qn) {
248   assert(subpel_y_qn == 8);
249   assert(filter_params_y->taps == 2);
250   (void)filter_params_y;
251   (void)subpel_y_qn;
252 
253   // vertical filter
254   // explicitly operate for subpel_y_qn = 8.
255   for (int y = 0; y < h; ++y) {
256     for (int x = 0; x < w; ++x) {
257       const int32_t res = src[x] + src[src_stride + x];
258       dst[x] = clip_pixel(ROUND_POWER_OF_TWO(res, 1));
259     }
260     src += src_stride;
261     dst += dst_stride;
262   }
263 }
264 
265 // This function is exactly the same as av1_convolve_x_sr_c, and is an
266 // optimized version for intrabc.
av1_convolve_x_sr_intrabc_c(const uint8_t * src,int src_stride,uint8_t * dst,int dst_stride,int w,int h,const InterpFilterParams * filter_params_x,const int subpel_x_qn,ConvolveParams * conv_params)267 void av1_convolve_x_sr_intrabc_c(const uint8_t *src, int src_stride,
268                                  uint8_t *dst, int dst_stride, int w, int h,
269                                  const InterpFilterParams *filter_params_x,
270                                  const int subpel_x_qn,
271                                  ConvolveParams *conv_params) {
272   assert(subpel_x_qn == 8);
273   assert(filter_params_x->taps == 2);
274   assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS);
275   (void)filter_params_x;
276   (void)subpel_x_qn;
277   (void)conv_params;
278 
279   // horizontal filter
280   // explicitly operate for subpel_x_qn = 8.
281   for (int y = 0; y < h; ++y) {
282     for (int x = 0; x < w; ++x) {
283       const int32_t res = src[x] + src[x + 1];
284       dst[x] = clip_pixel(ROUND_POWER_OF_TWO(res, 1));
285     }
286     src += src_stride;
287     dst += dst_stride;
288   }
289 }
290 
// Separable 2D sub-pixel filter for compound prediction of an 8-bit w x h
// block.
//
// Pass 1 filters h + taps_y - 1 source rows horizontally into a 16-bit scratch
// block (rounded by conv_params->round_0). Pass 2 filters the scratch block
// vertically and rounds by conv_params->round_1, keeping the bias added in
// both passes.
//
// What happens to the biased result depends on conv_params->do_average:
//  - when zero, it is stored in the intermediate buffer conv_params->dst and
//    dst is not written;
//  - when non-zero, it is combined with the value already stored in
//    conv_params->dst, either with the fwd_offset / bck_offset weights
//    (use_dist_wtd_comp_avg) or as a plain average, the bias is removed, and
//    the rounded, clipped pixel is written to dst.
void av1_dist_wtd_convolve_2d_c(const uint8_t *src, int src_stride,
                                uint8_t *dst, int dst_stride, int w, int h,
                                const InterpFilterParams *filter_params_x,
                                const InterpFilterParams *filter_params_y,
                                const int subpel_x_qn, const int subpel_y_qn,
                                ConvolveParams *conv_params) {
  CONV_BUF_TYPE *dst16 = conv_params->dst;
  int dst16_stride = conv_params->dst_stride;
  int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
  int im_h = h + filter_params_y->taps - 1;
  int im_stride = w;
  // The scratch block lives on the stack and is sized for at most
  // MAX_SB_SIZE columns and MAX_SB_SIZE output rows; this is the same guard
  // av1_convolve_2d_sr_c applies to its identically sized scratch block.
  assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);
  const int fo_vert = filter_params_y->taps / 2 - 1;
  const int fo_horiz = filter_params_x->taps / 2 - 1;
  const int bd = 8;
  const int round_bits =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;

  // horizontal filter
  // Start fo_vert rows above the block so the vertical pass has the rows it
  // needs on both sides.
  const uint8_t *src_horiz = src - fo_vert * src_stride;
  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);
  for (int y = 0; y < im_h; ++y) {
    for (int x = 0; x < w; ++x) {
      // Bias added before filtering; the assert below expects the biased sum
      // to be non-negative for filters of at most 8 taps.
      int32_t sum = (1 << (bd + FILTER_BITS - 1));
      for (int k = 0; k < filter_params_x->taps; ++k) {
        sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
      }
      assert(filter_params_x->taps > 8 ||
             (0 <= sum && sum < (1 << (bd + FILTER_BITS + 1))));
      im_block[y * im_stride + x] =
          (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0);
    }
  }

  // vertical filter
  int16_t *src_vert = im_block + fo_vert * im_stride;
  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  for (int y = 0; y < h; ++y) {
    for (int x = 0; x < w; ++x) {
      int32_t sum = 1 << offset_bits;
      for (int k = 0; k < filter_params_y->taps; ++k) {
        sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
      }
      assert(filter_params_y->taps > 8 ||
             (0 <= sum && sum < (1 << (offset_bits + 2))));
      CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
      if (conv_params->do_average) {
        // Combine with the value previously stored in the intermediate
        // buffer.
        int32_t tmp = dst16[y * dst16_stride + x];
        if (conv_params->use_dist_wtd_comp_avg) {
          tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
          tmp = tmp >> DIST_PRECISION_BITS;
        } else {
          tmp += res;
          tmp = tmp >> 1;
        }
        // Remove the bias accumulated by the two filter passes.
        tmp -= (1 << (offset_bits - conv_params->round_1)) +
               (1 << (offset_bits - conv_params->round_1 - 1));
        dst[y * dst_stride + x] =
            clip_pixel(ROUND_POWER_OF_TWO(tmp, round_bits));
      } else {
        dst16[y * dst16_stride + x] = res;
      }
    }
  }
}
358 
// Vertical-only sub-pixel filter for compound prediction of an 8-bit w x h
// block.
//
// The filtered value is scaled by 1 << (FILTER_BITS - round_0), rounded by
// conv_params->round_1 and biased by round_offset. When
// conv_params->do_average is zero the biased value is stored in the
// intermediate buffer conv_params->dst. Otherwise it is combined with the
// value already stored there (weighted by fwd_offset / bck_offset when
// use_dist_wtd_comp_avg is set, plain average otherwise), the bias is
// removed, and the rounded, clipped pixel is written to dst.
void av1_dist_wtd_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst,
                               int dst_stride, int w, int h,
                               const InterpFilterParams *filter_params_y,
                               const int subpel_y_qn,
                               ConvolveParams *conv_params) {
  const int bd = 8;
  const int taps = filter_params_y->taps;
  const int fo_vert = taps / 2 - 1;
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  const int bits = FILTER_BITS - round_0;
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  const int round_offset = (1 << (offset_bits - round_1)) +
                           (1 << (offset_bits - round_1 - 1));
  const int round_bits = 2 * FILTER_BITS - round_0 - round_1;
  CONV_BUF_TYPE *const dst16 = conv_params->dst;
  const int dst16_stride = conv_params->dst_stride;

  const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);

  for (int r = 0; r < h; ++r) {
    // First source row covered by the filter for output row r.
    const uint8_t *const top = src + (r - fo_vert) * src_stride;
    CONV_BUF_TYPE *const buf_row = dst16 + r * dst16_stride;
    for (int c = 0; c < w; ++c) {
      int32_t res = 0;
      for (int t = 0; t < taps; ++t) {
        res += kernel[t] * top[t * src_stride + c];
      }
      res *= (1 << bits);
      res = ROUND_POWER_OF_TWO(res, round_1) + round_offset;

      if (!conv_params->do_average) {
        // First prediction: keep the biased value for the later averaging.
        buf_row[c] = res;
        continue;
      }

      int32_t blended = buf_row[c];
      if (conv_params->use_dist_wtd_comp_avg) {
        blended = blended * conv_params->fwd_offset +
                  res * conv_params->bck_offset;
        blended = blended >> DIST_PRECISION_BITS;
      } else {
        blended += res;
        blended = blended >> 1;
      }
      blended -= round_offset;
      dst[r * dst_stride + c] =
          clip_pixel(ROUND_POWER_OF_TWO(blended, round_bits));
    }
  }
}
405 
// Horizontal-only sub-pixel filter for compound prediction of an 8-bit w x h
// block.
//
// The filtered value is rounded by conv_params->round_0, scaled by
// 1 << (FILTER_BITS - round_1) and biased by round_offset. When
// conv_params->do_average is zero the biased value is stored in the
// intermediate buffer conv_params->dst. Otherwise it is combined with the
// value already stored there (weighted by fwd_offset / bck_offset when
// use_dist_wtd_comp_avg is set, plain average otherwise), the bias is
// removed, and the rounded, clipped pixel is written to dst.
void av1_dist_wtd_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst,
                               int dst_stride, int w, int h,
                               const InterpFilterParams *filter_params_x,
                               const int subpel_x_qn,
                               ConvolveParams *conv_params) {
  const int bd = 8;
  const int taps = filter_params_x->taps;
  const int fo_horiz = taps / 2 - 1;
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  const int bits = FILTER_BITS - round_1;
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  const int round_offset = (1 << (offset_bits - round_1)) +
                           (1 << (offset_bits - round_1 - 1));
  const int round_bits = 2 * FILTER_BITS - round_0 - round_1;
  CONV_BUF_TYPE *const dst16 = conv_params->dst;
  const int dst16_stride = conv_params->dst_stride;

  const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);

  for (int r = 0; r < h; ++r) {
    // Leftmost source sample used by output column 0 of this row.
    const uint8_t *const in_row = src + r * src_stride - fo_horiz;
    CONV_BUF_TYPE *const buf_row = dst16 + r * dst16_stride;
    for (int c = 0; c < w; ++c) {
      int32_t res = 0;
      for (int t = 0; t < taps; ++t) {
        res += kernel[t] * in_row[c + t];
      }
      res = (1 << bits) * ROUND_POWER_OF_TWO(res, round_0);
      res += round_offset;

      if (!conv_params->do_average) {
        // First prediction: keep the biased value for the later averaging.
        buf_row[c] = res;
        continue;
      }

      int32_t blended = buf_row[c];
      if (conv_params->use_dist_wtd_comp_avg) {
        blended = blended * conv_params->fwd_offset +
                  res * conv_params->bck_offset;
        blended = blended >> DIST_PRECISION_BITS;
      } else {
        blended += res;
        blended = blended >> 1;
      }
      blended -= round_offset;
      dst[r * dst_stride + c] =
          clip_pixel(ROUND_POWER_OF_TWO(blended, round_bits));
    }
  }
}
452 
// Compound prediction for an 8-bit w x h block when no filtering is applied.
//
// Each source pixel is shifted left by 2 * FILTER_BITS - round_0 - round_1 and
// biased by round_offset, which gives a value in the same scale as the other
// av1_dist_wtd_convolve_* functions produce. When conv_params->do_average is
// zero that value is stored in the intermediate buffer conv_params->dst.
// Otherwise it is combined with the value already stored there (weighted by
// fwd_offset / bck_offset when use_dist_wtd_comp_avg is set, plain average
// otherwise), the bias is removed, and the rounded, clipped pixel is written
// to dst.
void av1_dist_wtd_convolve_2d_copy_c(const uint8_t *src, int src_stride,
                                     uint8_t *dst, int dst_stride, int w, int h,
                                     ConvolveParams *conv_params) {
  const int bd = 8;
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  const int bits = FILTER_BITS * 2 - round_1 - round_0;
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  const int round_offset = (1 << (offset_bits - round_1)) +
                           (1 << (offset_bits - round_1 - 1));
  CONV_BUF_TYPE *const dst16 = conv_params->dst;
  const int dst16_stride = conv_params->dst_stride;

  for (int r = 0; r < h; ++r) {
    const uint8_t *const in_row = src + r * src_stride;
    CONV_BUF_TYPE *const buf_row = dst16 + r * dst16_stride;
    for (int c = 0; c < w; ++c) {
      CONV_BUF_TYPE res = in_row[c] << bits;
      res += round_offset;

      if (!conv_params->do_average) {
        // First prediction: keep the biased value for the later averaging.
        buf_row[c] = res;
        continue;
      }

      int32_t blended = buf_row[c];
      if (conv_params->use_dist_wtd_comp_avg) {
        blended = blended * conv_params->fwd_offset +
                  res * conv_params->bck_offset;
        blended = blended >> DIST_PRECISION_BITS;
      } else {
        blended += res;
        blended = blended >> 1;
      }
      blended -= round_offset;
      dst[r * dst_stride + c] = clip_pixel(ROUND_POWER_OF_TWO(blended, bits));
    }
  }
}
487 
// Separable 2D filter with scaling for an 8-bit block: the source position of
// each output sample advances by x_step_qn / y_step_qn (fixed point with
// SCALE_SUBPEL_BITS fractional bits) starting from subpel_x_qn / subpel_y_qn,
// and the kernel is re-selected per sample from the fractional position.
//
// Pass 1 produces im_h horizontally filtered rows of w samples in a 16-bit
// scratch block. Pass 2 walks the scratch block column by column, filters
// vertically and rounds by conv_params->round_1.
//
// Output handling:
//  - not compound: the bias is removed and the rounded, clipped pixel is
//    written to dst;
//  - compound without do_average: the biased value is stored in the
//    intermediate buffer conv_params->dst;
//  - compound with do_average: the biased value is combined with the value
//    already in conv_params->dst (weighted by fwd_offset / bck_offset when
//    use_dist_wtd_comp_avg is set, plain average otherwise), the bias is
//    removed and the rounded, clipped pixel is written to dst.
void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst,
                             int dst_stride, int w, int h,
                             const InterpFilterParams *filter_params_x,
                             const InterpFilterParams *filter_params_y,
                             const int subpel_x_qn, const int x_step_qn,
                             const int subpel_y_qn, const int y_step_qn,
                             ConvolveParams *conv_params) {
  int16_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE];
  // Number of source rows the vertical pass can reach: the integer position
  // of the last output row plus one full filter length.
  int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) +
             filter_params_y->taps;
  CONV_BUF_TYPE *dst16 = conv_params->dst;
  const int dst16_stride = conv_params->dst_stride;
  const int bits =
      FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
  assert(bits >= 0);
  int im_stride = w;
  // The horizontal pass writes im_h rows of w samples into the fixed-size
  // stack scratch block; make sure they fit.
  assert(im_h * im_stride <= (int)(sizeof(im_block) / sizeof(im_block[0])));
  const int fo_vert = filter_params_y->taps / 2 - 1;
  const int fo_horiz = filter_params_x->taps / 2 - 1;
  const int bd = 8;

  // horizontal filter
  const uint8_t *src_horiz = src - fo_vert * src_stride;
  for (int y = 0; y < im_h; ++y) {
    int x_qn = subpel_x_qn;
    for (int x = 0; x < w; ++x, x_qn += x_step_qn) {
      // Integer part of the position selects the source sample, fractional
      // part selects the kernel.
      const uint8_t *const src_x = &src_horiz[(x_qn >> SCALE_SUBPEL_BITS)];
      const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
      assert(x_filter_idx < SUBPEL_SHIFTS);
      const int16_t *x_filter =
          av1_get_interp_filter_subpel_kernel(filter_params_x, x_filter_idx);
      int32_t sum = (1 << (bd + FILTER_BITS - 1));
      for (int k = 0; k < filter_params_x->taps; ++k) {
        sum += x_filter[k] * src_x[k - fo_horiz];
      }
      assert(filter_params_x->taps > 8 ||
             (0 <= sum && sum < (1 << (bd + FILTER_BITS + 1))));
      im_block[y * im_stride + x] =
          (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0);
    }
    src_horiz += src_stride;
  }

  // vertical filter
  // src_vert points at the current column; it advances by one sample per
  // iteration of the outer loop.
  int16_t *src_vert = im_block + fo_vert * im_stride;
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  for (int x = 0; x < w; ++x) {
    int y_qn = subpel_y_qn;
    for (int y = 0; y < h; ++y, y_qn += y_step_qn) {
      const int16_t *src_y = &src_vert[(y_qn >> SCALE_SUBPEL_BITS) * im_stride];
      const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
      assert(y_filter_idx < SUBPEL_SHIFTS);
      const int16_t *y_filter =
          av1_get_interp_filter_subpel_kernel(filter_params_y, y_filter_idx);
      int32_t sum = 1 << offset_bits;
      for (int k = 0; k < filter_params_y->taps; ++k) {
        sum += y_filter[k] * src_y[(k - fo_vert) * im_stride];
      }
      assert(filter_params_y->taps > 8 ||
             (0 <= sum && sum < (1 << (offset_bits + 2))));
      CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
      if (conv_params->is_compound) {
        if (conv_params->do_average) {
          int32_t tmp = dst16[y * dst16_stride + x];
          if (conv_params->use_dist_wtd_comp_avg) {
            tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
            tmp = tmp >> DIST_PRECISION_BITS;
          } else {
            tmp += res;
            tmp = tmp >> 1;
          }
          /* Subtract round offset and convolve round */
          tmp = tmp - ((1 << (offset_bits - conv_params->round_1)) +
                       (1 << (offset_bits - conv_params->round_1 - 1)));
          dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits));
        } else {
          dst16[y * dst16_stride + x] = res;
        }
      } else {
        /* Subtract round offset and convolve round */
        int32_t tmp = res - ((1 << (offset_bits - conv_params->round_1)) +
                             (1 << (offset_bits - conv_params->round_1 - 1)));
        dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits));
      }
    }
    src_vert++;
  }
}
575 
// Forwards all arguments unchanged to av1_convolve_2d_scale(). In debug
// builds it first checks that a compound prediction has an intermediate
// buffer (conv_params->dst) to work with.
static void convolve_2d_scale_wrapper(
    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int x_step_qn, const int subpel_y_qn, const int y_step_qn,
    ConvolveParams *conv_params) {
  assert(!conv_params->is_compound || conv_params->dst != NULL);
  av1_convolve_2d_scale(src, src_stride, dst, dst_stride, w, h, filter_params_x,
                        filter_params_y, subpel_x_qn, x_step_qn, subpel_y_qn,
                        y_step_qn, conv_params);
}
589 
// Picks the compound-prediction kernel that matches the sub-pixel position:
// both offsets non-zero -> 2D filter, only x non-zero -> horizontal filter,
// only y non-zero -> vertical filter, both zero -> copy variant.
static void convolve_2d_facade_compound(
    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int subpel_y_qn, ConvolveParams *conv_params) {
  const bool filter_x = subpel_x_qn != 0;
  const bool filter_y = subpel_y_qn != 0;

  if (filter_x && filter_y) {
    av1_dist_wtd_convolve_2d(src, src_stride, dst, dst_stride, w, h,
                             filter_params_x, filter_params_y, subpel_x_qn,
                             subpel_y_qn, conv_params);
  } else if (filter_x) {
    av1_dist_wtd_convolve_x(src, src_stride, dst, dst_stride, w, h,
                            filter_params_x, subpel_x_qn, conv_params);
  } else if (filter_y) {
    av1_dist_wtd_convolve_y(src, src_stride, dst, dst_stride, w, h,
                            filter_params_y, subpel_y_qn, conv_params);
  } else {
    av1_dist_wtd_convolve_2d_copy(src, src_stride, dst, dst_stride, w, h,
                                  conv_params);
  }
}
613 
// Picks the non-compound kernel that matches the sub-pixel position: both
// offsets non-zero -> 2D filter, only x non-zero -> horizontal filter, only y
// non-zero -> vertical filter, both zero -> plain copy.
static void convolve_2d_facade_single(
    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int subpel_y_qn, ConvolveParams *conv_params) {
  const bool filter_x = subpel_x_qn != 0;
  const bool filter_y = subpel_y_qn != 0;

  if (filter_x && filter_y) {
    av1_convolve_2d_sr(src, src_stride, dst, dst_stride, w, h, filter_params_x,
                       filter_params_y, subpel_x_qn, subpel_y_qn, conv_params);
  } else if (filter_x) {
    av1_convolve_x_sr(src, src_stride, dst, dst_stride, w, h, filter_params_x,
                      subpel_x_qn, conv_params);
  } else if (filter_y) {
    av1_convolve_y_sr(src, src_stride, dst, dst_stride, w, h, filter_params_y,
                      subpel_y_qn);
  } else {
    aom_convolve_copy(src, src_stride, dst, dst_stride, w, h);
  }
}
635 
// Top-level 8-bit convolution entry point.
//
// interp_filters[0] is the horizontal filter and interp_filters[1] the
// vertical one. Dispatch order:
//   1. If either filter has 2 taps (IntraBC), both must have 2 taps and the
//      prediction must be unscaled; a non-zero subpel offset selects the
//      matching *_intrabc_c kernel. When both subpel offsets are zero this
//      step does nothing and control falls through to the generic dispatch
//      below.
//   2. scaled != 0             -> convolve_2d_scale_wrapper()
//   3. conv_params->is_compound -> convolve_2d_facade_compound()
//   4. otherwise               -> convolve_2d_facade_single()
//
// x_step_q4 / y_step_q4 are only consumed by the scaled path.
void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst,
                            int dst_stride, int w, int h,
                            const InterpFilterParams *interp_filters[2],
                            const int subpel_x_qn, int x_step_q4,
                            const int subpel_y_qn, int y_step_q4, int scaled,
                            ConvolveParams *conv_params) {
  const InterpFilterParams *filter_params_x = interp_filters[0];
  const InterpFilterParams *filter_params_y = interp_filters[1];

  // TODO(jingning, yunqing): Add SIMD support to 2-tap filter case.
  // 2-tap filter indicates that it is for IntraBC.
  if (filter_params_x->taps == 2 || filter_params_y->taps == 2) {
    assert(filter_params_x->taps == 2 && filter_params_y->taps == 2);
    assert(!scaled);
    if (subpel_x_qn && subpel_y_qn) {
      av1_convolve_2d_sr_intrabc_c(src, src_stride, dst, dst_stride, w, h,
                                   filter_params_x, filter_params_y,
                                   subpel_x_qn, subpel_y_qn, conv_params);
      return;
    } else if (subpel_x_qn) {
      av1_convolve_x_sr_intrabc_c(src, src_stride, dst, dst_stride, w, h,
                                  filter_params_x, subpel_x_qn, conv_params);
      return;
    } else if (subpel_y_qn) {
      av1_convolve_y_sr_intrabc_c(src, src_stride, dst, dst_stride, w, h,
                                  filter_params_y, subpel_y_qn);
      return;
    }
    // Both offsets are zero: handled by the generic paths below.
  }

  if (scaled) {
    convolve_2d_scale_wrapper(src, src_stride, dst, dst_stride, w, h,
                              filter_params_x, filter_params_y, subpel_x_qn,
                              x_step_q4, subpel_y_qn, y_step_q4, conv_params);
  } else if (conv_params->is_compound) {
    convolve_2d_facade_compound(src, src_stride, dst, dst_stride, w, h,
                                filter_params_x, filter_params_y, subpel_x_qn,
                                subpel_y_qn, conv_params);
  } else {
    convolve_2d_facade_single(src, src_stride, dst, dst_stride, w, h,
                              filter_params_x, filter_params_y, subpel_x_qn,
                              subpel_y_qn, conv_params);
  }
}
685 
686 #if CONFIG_AV1_HIGHBITDEPTH
// Horizontal-only, single-reference convolution on 16-bit samples.
//
// For every output sample the kernel selected by (subpel_x_qn & SUBPEL_MASK)
// is applied to `taps` consecutive source samples starting taps / 2 - 1
// columns to the left of the output column, so samples outside [0, w) are
// read on both sides. The sum is rounded by conv_params->round_0, then by
// FILTER_BITS - round_0, and clamped with clip_pixel_highbd() for `bd`.
void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride,
                                uint16_t *dst, int dst_stride, int w, int h,
                                const InterpFilterParams *filter_params_x,
                                const int subpel_x_qn,
                                ConvolveParams *conv_params, int bd) {
  const int num_taps = filter_params_x->taps;
  const int cols_left = num_taps / 2 - 1;
  const int round_0 = conv_params->round_0;
  const int final_shift = FILTER_BITS - round_0;

  assert(final_shift >= 0);
  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
         ((round_0 + conv_params->round_1) == 2 * FILTER_BITS));

  const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);

  for (int row = 0; row < h; ++row) {
    // First source sample touched by column 0 of this row.
    const uint16_t *const in = src + row * src_stride - cols_left;
    uint16_t *const out = dst + row * dst_stride;
    for (int col = 0; col < w; ++col) {
      int32_t acc = 0;
      for (int tap = 0; tap < num_taps; ++tap) {
        acc += kernel[tap] * in[col + tap];
      }
      acc = ROUND_POWER_OF_TWO(acc, round_0);
      out[col] = clip_pixel_highbd(ROUND_POWER_OF_TWO(acc, final_shift), bd);
    }
  }
}
714 
av1_highbd_convolve_y_sr_c(const uint16_t * src,int src_stride,uint16_t * dst,int dst_stride,int w,int h,const InterpFilterParams * filter_params_y,const int subpel_y_qn,int bd)715 void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride,
716                                 uint16_t *dst, int dst_stride, int w, int h,
717                                 const InterpFilterParams *filter_params_y,
718                                 const int subpel_y_qn, int bd) {
719   const int fo_vert = filter_params_y->taps / 2 - 1;
720   // vertical filter
721   const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
722       filter_params_y, subpel_y_qn & SUBPEL_MASK);
723   for (int y = 0; y < h; ++y) {
724     for (int x = 0; x < w; ++x) {
725       int32_t res = 0;
726       for (int k = 0; k < filter_params_y->taps; ++k) {
727         res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x];
728       }
729       dst[y * dst_stride + x] =
730           clip_pixel_highbd(ROUND_POWER_OF_TWO(res, FILTER_BITS), bd);
731     }
732   }
733 }
734 
// Separable 2D single-reference convolution on 16-bit samples.
//
// Pass 1 filters horizontally into a w-wide intermediate buffer holding
// h + taps_y - 1 rows (the extra rows feed the vertical kernel). A positive
// bias of 1 << (bd + FILTER_BITS - 1) is added before rounding by round_0.
// Pass 2 filters the intermediate rows vertically with a bias of
// 1 << offset_bits, rounds by round_1, removes the accumulated bias of both
// passes, applies the remaining `bits` of rounding and clamps for `bd`.
void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride,
                                 uint16_t *dst, int dst_stride, int w, int h,
                                 const InterpFilterParams *filter_params_x,
                                 const InterpFilterParams *filter_params_y,
                                 const int subpel_x_qn, const int subpel_y_qn,
                                 ConvolveParams *conv_params, int bd) {
  int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
  const int taps_x = filter_params_x->taps;
  const int taps_y = filter_params_y->taps;
  const int im_rows = h + taps_y - 1;
  const int im_stride = w;
  assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);
  const int rows_above = taps_y / 2 - 1;
  const int cols_left = taps_x / 2 - 1;
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  const int bits = FILTER_BITS * 2 - round_0 - round_1;
  assert(bits >= 0);

  // Pass 1: horizontal filter into im_block.
  const int16_t *const kernel_x = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);
  const uint16_t *in_row = src - rows_above * src_stride - cols_left;
  int16_t *im_row = im_block;
  for (int row = 0; row < im_rows; ++row) {
    for (int col = 0; col < w; ++col) {
      int32_t acc = (1 << (bd + FILTER_BITS - 1));
      for (int tap = 0; tap < taps_x; ++tap) {
        acc += kernel_x[tap] * in_row[col + tap];
      }
      assert(taps_x > 8 || (0 <= acc && acc < (1 << (bd + FILTER_BITS + 1))));
      im_row[col] = (int16_t)ROUND_POWER_OF_TWO(acc, round_0);
    }
    in_row += src_stride;
    im_row += im_stride;
  }

  // Pass 2: vertical filter from im_block into dst.
  const int16_t *const kernel_y = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  for (int row = 0; row < h; ++row) {
    // Output row `row` consumes intermediate rows row .. row + taps_y - 1.
    const int16_t *const im_top = im_block + row * im_stride;
    uint16_t *const out = dst + row * dst_stride;
    for (int col = 0; col < w; ++col) {
      int32_t acc = 1 << offset_bits;
      for (int tap = 0; tap < taps_y; ++tap) {
        acc += kernel_y[tap] * im_top[tap * im_stride + col];
      }
      assert(taps_y > 8 || (0 <= acc && acc < (1 << (offset_bits + 2))));
      // Remove the bias introduced by both passes.
      const int32_t unbiased = ROUND_POWER_OF_TWO(acc, round_1) -
                               ((1 << (offset_bits - round_1)) +
                                (1 << (offset_bits - round_1 - 1)));
      out[col] = clip_pixel_highbd(ROUND_POWER_OF_TWO(unbiased, bits), bd);
    }
  }
}
789 
790 // This function is exactly the same as av1_highbd_convolve_2d_sr_c, and is an
791 // optimized version for intrabc. Use the following 2-tap filter:
792 // DECLARE_ALIGNED(256, static const int16_t,
793 //                 av1_intrabc_bilinear_filter[2 * SUBPEL_SHIFTS]) = {
794 //   128, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
795 //   64,  64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
796 // };
av1_highbd_convolve_2d_sr_intrabc_c(const uint16_t * src,int src_stride,uint16_t * dst,int dst_stride,int w,int h,const InterpFilterParams * filter_params_x,const InterpFilterParams * filter_params_y,const int subpel_x_qn,const int subpel_y_qn,ConvolveParams * conv_params,int bd)797 void av1_highbd_convolve_2d_sr_intrabc_c(
798     const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
799     int h, const InterpFilterParams *filter_params_x,
800     const InterpFilterParams *filter_params_y, const int subpel_x_qn,
801     const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
802   const int bits =
803       FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
804   assert(bits >= 0);
805   assert(subpel_x_qn == 8);
806   assert(subpel_y_qn == 8);
807   assert(filter_params_x->taps == 2 && filter_params_y->taps == 2);
808   assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS);
809   (void)filter_params_x;
810   (void)subpel_x_qn;
811   (void)filter_params_y;
812   (void)subpel_y_qn;
813   (void)conv_params;
814 
815   int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
816   int im_h = h + 1;
817   int im_stride = w;
818   assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);
819 
820   // horizontal filter
821   // explicitly operate for subpel_x_qn = 8.
822   int16_t *im = im_block;
823   for (int y = 0; y < im_h; ++y) {
824     for (int x = 0; x < w; ++x) {
825       int32_t sum = (1 << (bd + FILTER_BITS - 1)) + 64 * (src[x] + src[x + 1]);
826       assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
827       sum = ROUND_POWER_OF_TWO(sum, conv_params->round_0);
828       im[x] = sum;
829     }
830     src += src_stride;
831     im += im_stride;
832   }
833 
834   // vertical filter
835   // explicitly operate for subpel_y_qn = 8.
836   int16_t *src_vert = im_block;
837   const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
838   for (int y = 0; y < h; ++y) {
839     for (int x = 0; x < w; ++x) {
840       const int32_t sum =
841           (1 << offset_bits) + 64 * (src_vert[x] + src_vert[im_stride + x]);
842       assert(0 <= sum && sum < (1 << (offset_bits + 2)));
843       const int32_t res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) -
844                           ((1 << (offset_bits - conv_params->round_1)) +
845                            (1 << (offset_bits - conv_params->round_1 - 1)));
846 
847       dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(res, bits), bd);
848     }
849     src_vert += im_stride;
850     dst += dst_stride;
851   }
852 }
853 
854 // This function is exactly the same as av1_highbd_convolve_y_sr_c, and is an
855 // optimized version for intrabc.
av1_highbd_convolve_y_sr_intrabc_c(const uint16_t * src,int src_stride,uint16_t * dst,int dst_stride,int w,int h,const InterpFilterParams * filter_params_y,const int subpel_y_qn,int bd)856 void av1_highbd_convolve_y_sr_intrabc_c(
857     const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
858     int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn,
859     int bd) {
860   assert(subpel_y_qn == 8);
861   assert(filter_params_y->taps == 2);
862   (void)filter_params_y;
863   (void)subpel_y_qn;
864 
865   // vertical filter
866   // explicitly operate for subpel_y_qn = 8.
867   for (int y = 0; y < h; ++y) {
868     for (int x = 0; x < w; ++x) {
869       const int32_t res = src[x] + src[src_stride + x];
870       dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(res, 1), bd);
871     }
872     src += src_stride;
873     dst += dst_stride;
874   }
875 }
876 
877 // This function is exactly the same as av1_highbd_convolve_x_sr_c, and is an
878 // optimized version for intrabc.
av1_highbd_convolve_x_sr_intrabc_c(const uint16_t * src,int src_stride,uint16_t * dst,int dst_stride,int w,int h,const InterpFilterParams * filter_params_x,const int subpel_x_qn,ConvolveParams * conv_params,int bd)879 void av1_highbd_convolve_x_sr_intrabc_c(
880     const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
881     int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
882     ConvolveParams *conv_params, int bd) {
883   const int bits = FILTER_BITS - conv_params->round_0;
884   assert(bits >= 0);
885   assert(subpel_x_qn == 8);
886   assert(filter_params_x->taps == 2);
887   assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS);
888   (void)filter_params_x;
889   (void)subpel_x_qn;
890 
891   // horizontal filter
892   // explicitly operate for subpel_x_qn = 8.
893   for (int y = 0; y < h; ++y) {
894     for (int x = 0; x < w; ++x) {
895       int32_t res = 64 * (src[x] + src[x + 1]);
896       res = ROUND_POWER_OF_TWO(res, conv_params->round_0);
897       dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(res, bits), bd);
898     }
899     src += src_stride;
900     dst += dst_stride;
901   }
902 }
903 
// Separable 2D convolution for compound prediction on 16-bit samples.
//
// The horizontal pass writes h + taps_y - 1 biased, round_0-rounded rows into
// a w-wide intermediate buffer; the vertical pass filters those rows, adds a
// bias of 1 << offset_bits and rounds by round_1.
//
// What happens to each result depends on conv_params:
//   - do_average == 0: the (still biased) value is stored in
//     conv_params->dst; `dst` is not written.
//   - do_average != 0: the value is combined with the one already in
//     conv_params->dst, either weighted by fwd_offset / bck_offset and
//     shifted by DIST_PRECISION_BITS (use_dist_wtd_comp_avg) or as a plain
//     (a + b) >> 1. The bias is then removed, the remaining round_bits of
//     rounding applied, and the clamped pixel written to `dst`.
void av1_highbd_dist_wtd_convolve_2d_c(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
  int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
  CONV_BUF_TYPE *dst16 = conv_params->dst;
  const int dst16_stride = conv_params->dst_stride;
  const int im_h = h + filter_params_y->taps - 1;
  const int im_stride = w;
  const int fo_vert = filter_params_y->taps / 2 - 1;
  const int fo_horiz = filter_params_x->taps / 2 - 1;
  const int round_bits =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  assert(round_bits >= 0);

  // horizontal filter
  const uint16_t *src_horiz = src - fo_vert * src_stride;
  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);
  for (int y = 0; y < im_h; ++y) {
    for (int x = 0; x < w; ++x) {
      int32_t sum = (1 << (bd + FILTER_BITS - 1));
      for (int k = 0; k < filter_params_x->taps; ++k) {
        sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
      }
      assert(filter_params_x->taps > 8 ||
             (0 <= sum && sum < (1 << (bd + FILTER_BITS + 1))));
      im_block[y * im_stride + x] =
          (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0);
    }
  }

  // vertical filter
  const int16_t *src_vert = im_block + fo_vert * im_stride;
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  // Bias carried by every value stored in / read from conv_params->dst.
  // Computed once, as in the x-only, y-only and copy compound kernels.
  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
                           (1 << (offset_bits - conv_params->round_1 - 1));
  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);
  for (int y = 0; y < h; ++y) {
    for (int x = 0; x < w; ++x) {
      int32_t sum = 1 << offset_bits;
      for (int k = 0; k < filter_params_y->taps; ++k) {
        sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
      }
      assert(filter_params_y->taps > 8 ||
             (0 <= sum && sum < (1 << (offset_bits + 2))));
      CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
      if (conv_params->do_average) {
        int32_t tmp = dst16[y * dst16_stride + x];
        if (conv_params->use_dist_wtd_comp_avg) {
          tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
          tmp = tmp >> DIST_PRECISION_BITS;
        } else {
          tmp += res;
          tmp = tmp >> 1;
        }
        tmp -= round_offset;
        dst[y * dst_stride + x] =
            clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, round_bits), bd);
      } else {
        dst16[y * dst16_stride + x] = res;
      }
    }
  }
}
972 
// Horizontal-only convolution for compound prediction on 16-bit samples.
//
// The filtered sum is rounded by round_0, scaled by 1 << (FILTER_BITS -
// round_1) and biased by round_offset. With do_average == 0 that value is
// stored in conv_params->dst. Otherwise it is combined with the value already
// stored there (weighted by fwd_offset / bck_offset, or a plain average), the
// bias is removed, round_bits of rounding are applied and the clamped pixel
// is written to `dst`.
void av1_highbd_dist_wtd_convolve_x_c(const uint16_t *src, int src_stride,
                                      uint16_t *dst, int dst_stride, int w,
                                      int h,
                                      const InterpFilterParams *filter_params_x,
                                      const int subpel_x_qn,
                                      ConvolveParams *conv_params, int bd) {
  CONV_BUF_TYPE *const comp_buf = conv_params->dst;
  const int comp_stride = conv_params->dst_stride;
  const int num_taps = filter_params_x->taps;
  const int cols_left = num_taps / 2 - 1;
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  const int bits = FILTER_BITS - round_1;
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  const int round_offset =
      (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));
  const int round_bits = 2 * FILTER_BITS - round_0 - round_1;
  assert(round_bits >= 0);
  assert(bits >= 0);

  const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);

  for (int row = 0; row < h; ++row) {
    // First source sample touched by column 0 of this row.
    const uint16_t *const in = src + row * src_stride - cols_left;
    for (int col = 0; col < w; ++col) {
      const int comp_pos = row * comp_stride + col;

      int32_t acc = 0;
      for (int tap = 0; tap < num_taps; ++tap) {
        acc += kernel[tap] * in[col + tap];
      }
      acc = (1 << bits) * ROUND_POWER_OF_TWO(acc, round_0);
      acc += round_offset;

      if (!conv_params->do_average) {
        // First prediction: keep the biased value for the later average.
        comp_buf[comp_pos] = acc;
        continue;
      }

      int32_t blended = comp_buf[comp_pos];
      if (conv_params->use_dist_wtd_comp_avg) {
        blended = blended * conv_params->fwd_offset +
                  acc * conv_params->bck_offset;
        blended = blended >> DIST_PRECISION_BITS;
      } else {
        blended += acc;
        blended = blended >> 1;
      }
      blended -= round_offset;
      dst[row * dst_stride + col] =
          clip_pixel_highbd(ROUND_POWER_OF_TWO(blended, round_bits), bd);
    }
  }
}
1020 
// Vertical-only convolution for compound prediction on 16-bit samples.
//
// The filtered sum is scaled by 1 << (FILTER_BITS - round_0), rounded by
// round_1 and biased by round_offset. With do_average == 0 that value is
// stored in conv_params->dst. Otherwise it is combined with the value already
// stored there (weighted by fwd_offset / bck_offset, or a plain average), the
// bias is removed, round_bits of rounding are applied and the clamped pixel
// is written to `dst`.
void av1_highbd_dist_wtd_convolve_y_c(const uint16_t *src, int src_stride,
                                      uint16_t *dst, int dst_stride, int w,
                                      int h,
                                      const InterpFilterParams *filter_params_y,
                                      const int subpel_y_qn,
                                      ConvolveParams *conv_params, int bd) {
  CONV_BUF_TYPE *const comp_buf = conv_params->dst;
  const int comp_stride = conv_params->dst_stride;
  const int num_taps = filter_params_y->taps;
  const int rows_above = num_taps / 2 - 1;
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  const int bits = FILTER_BITS - round_0;
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  const int round_offset =
      (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));
  const int round_bits = 2 * FILTER_BITS - round_0 - round_1;
  assert(round_bits >= 0);
  assert(bits >= 0);

  const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);

  for (int row = 0; row < h; ++row) {
    // Topmost source row contributing to this output row.
    const uint16_t *const top = src + (row - rows_above) * src_stride;
    for (int col = 0; col < w; ++col) {
      const int comp_pos = row * comp_stride + col;

      int32_t acc = 0;
      for (int tap = 0; tap < num_taps; ++tap) {
        acc += kernel[tap] * top[tap * src_stride + col];
      }
      acc *= (1 << bits);
      acc = ROUND_POWER_OF_TWO(acc, round_1) + round_offset;

      if (!conv_params->do_average) {
        // First prediction: keep the biased value for the later average.
        comp_buf[comp_pos] = acc;
        continue;
      }

      int32_t blended = comp_buf[comp_pos];
      if (conv_params->use_dist_wtd_comp_avg) {
        blended = blended * conv_params->fwd_offset +
                  acc * conv_params->bck_offset;
        blended = blended >> DIST_PRECISION_BITS;
      } else {
        blended += acc;
        blended = blended >> 1;
      }
      blended -= round_offset;
      dst[row * dst_stride + col] =
          clip_pixel_highbd(ROUND_POWER_OF_TWO(blended, round_bits), bd);
    }
  }
}
1068 
// Integer-pel (no filtering) case of compound prediction on 16-bit samples.
//
// Each source sample is shifted left by 2 * FILTER_BITS - round_0 - round_1
// and biased by round_offset so that it is on the same scale as the output of
// the filtering compound kernels. With do_average == 0 the value is stored in
// conv_params->dst; otherwise it is combined with the stored value, the bias
// is removed, the shift undone with rounding and the clamped pixel written to
// `dst`.
void av1_highbd_dist_wtd_convolve_2d_copy_c(const uint16_t *src, int src_stride,
                                            uint16_t *dst, int dst_stride,
                                            int w, int h,
                                            ConvolveParams *conv_params,
                                            int bd) {
  CONV_BUF_TYPE *const comp_buf = conv_params->dst;
  const int comp_stride = conv_params->dst_stride;
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  const int bits = FILTER_BITS * 2 - round_1 - round_0;
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  const int round_offset =
      (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));
  assert(bits >= 0);

  for (int row = 0; row < h; ++row) {
    const uint16_t *const in = src + row * src_stride;
    for (int col = 0; col < w; ++col) {
      const int comp_pos = row * comp_stride + col;

      // Kept in CONV_BUF_TYPE so the value wraps exactly as it does when it
      // is stored to the compound buffer.
      CONV_BUF_TYPE scaled = in[col] << bits;
      scaled += round_offset;

      if (!conv_params->do_average) {
        comp_buf[comp_pos] = scaled;
        continue;
      }

      int32_t blended = comp_buf[comp_pos];
      if (conv_params->use_dist_wtd_comp_avg) {
        blended = blended * conv_params->fwd_offset +
                  scaled * conv_params->bck_offset;
        blended = blended >> DIST_PRECISION_BITS;
      } else {
        blended += scaled;
        blended = blended >> 1;
      }
      blended -= round_offset;
      dst[row * dst_stride + col] =
          clip_pixel_highbd(ROUND_POWER_OF_TWO(blended, bits), bd);
    }
  }
}
1105 
// Scaled separable 2D convolution on 16-bit samples.
//
// The source position advances by x_step_qn / y_step_qn per output sample,
// starting from subpel_x_qn / subpel_y_qn. For each position the integer part
// (>> SCALE_SUBPEL_BITS) selects the source sample and the fractional part
// selects the kernel.
//
// Pass 1 filters horizontally into a w-wide intermediate buffer with as many
// rows as the vertical pass can reach. Pass 2 walks the buffer column by
// column and filters vertically. The result is then handled as follows:
//   - compound, do_average == 0: stored (biased) in conv_params->dst.
//   - compound, do_average != 0: combined with the stored value, then
//     unbiased, rounded and clamped into `dst`.
//   - non-compound: unbiased, rounded and clamped into `dst`.
void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride,
                                    uint16_t *dst, int dst_stride, int w, int h,
                                    const InterpFilterParams *filter_params_x,
                                    const InterpFilterParams *filter_params_y,
                                    const int subpel_x_qn, const int x_step_qn,
                                    const int subpel_y_qn, const int y_step_qn,
                                    ConvolveParams *conv_params, int bd) {
  int16_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE];
  const int taps_x = filter_params_x->taps;
  const int taps_y = filter_params_y->taps;
  const int im_rows =
      (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) + taps_y;
  const int im_stride = w;
  const int rows_above = taps_y / 2 - 1;
  const int cols_left = taps_x / 2 - 1;
  CONV_BUF_TYPE *const comp_buf = conv_params->dst;
  const int comp_stride = conv_params->dst_stride;
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  const int bits = FILTER_BITS * 2 - round_0 - round_1;
  assert(bits >= 0);

  // Pass 1: horizontal filter into im_block.
  const uint16_t *const src_top = src - rows_above * src_stride;
  for (int row = 0; row < im_rows; ++row) {
    const uint16_t *const in_row = src_top + row * src_stride;
    int16_t *const im_row = im_block + row * im_stride;
    int pos_qn = subpel_x_qn;
    for (int col = 0; col < w; ++col) {
      const uint16_t *const window =
          in_row + (pos_qn >> SCALE_SUBPEL_BITS) - cols_left;
      const int phase = (pos_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
      assert(phase < SUBPEL_SHIFTS);
      const int16_t *const kernel =
          av1_get_interp_filter_subpel_kernel(filter_params_x, phase);
      int32_t acc = (1 << (bd + FILTER_BITS - 1));
      for (int tap = 0; tap < taps_x; ++tap) {
        acc += kernel[tap] * window[tap];
      }
      assert(taps_x > 8 || (0 <= acc && acc < (1 << (bd + FILTER_BITS + 1))));
      im_row[col] = (int16_t)ROUND_POWER_OF_TWO(acc, round_0);
      pos_qn += x_step_qn;
    }
  }

  // Pass 2: vertical filter, one output column at a time.
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  for (int col = 0; col < w; ++col) {
    const int16_t *const im_col = im_block + rows_above * im_stride + col;
    int pos_qn = subpel_y_qn;
    for (int row = 0; row < h; ++row, pos_qn += y_step_qn) {
      const int16_t *const window =
          im_col + ((pos_qn >> SCALE_SUBPEL_BITS) - rows_above) * im_stride;
      const int phase = (pos_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
      assert(phase < SUBPEL_SHIFTS);
      const int16_t *const kernel =
          av1_get_interp_filter_subpel_kernel(filter_params_y, phase);
      int32_t acc = 1 << offset_bits;
      for (int tap = 0; tap < taps_y; ++tap) {
        acc += kernel[tap] * window[tap * im_stride];
      }
      assert(taps_y > 8 || (0 <= acc && acc < (1 << (offset_bits + 2))));
      const CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(acc, round_1);

      if (conv_params->is_compound && !conv_params->do_average) {
        // First prediction: keep the biased value for the later average.
        comp_buf[row * comp_stride + col] = res;
        continue;
      }

      int32_t tmp;
      if (conv_params->is_compound) {
        tmp = comp_buf[row * comp_stride + col];
        if (conv_params->use_dist_wtd_comp_avg) {
          tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
          tmp = tmp >> DIST_PRECISION_BITS;
        } else {
          tmp += res;
          tmp = tmp >> 1;
        }
      } else {
        tmp = res;
      }
      /* Subtract round offset and convolve round */
      tmp -= (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));
      dst[row * dst_stride + col] =
          clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd);
    }
  }
}
1193 
// Dispatcher for unscaled compound prediction on 16-bit samples. A zero
// subpel offset in a direction means that direction needs no filtering, so
// the copy, x-only, y-only or full 2D compound kernel is chosen accordingly.
static void highbd_convolve_2d_facade_compound(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride,
    const int w, const int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
  // Bit 0: horizontal filtering needed; bit 1: vertical filtering needed.
  const int directions =
      (subpel_x_qn != 0 ? 1 : 0) | (subpel_y_qn != 0 ? 2 : 0);

  switch (directions) {
    case 0:
      av1_highbd_dist_wtd_convolve_2d_copy(src, src_stride, dst, dst_stride, w,
                                           h, conv_params, bd);
      break;
    case 1:
      av1_highbd_dist_wtd_convolve_x(src, src_stride, dst, dst_stride, w, h,
                                     filter_params_x, subpel_x_qn, conv_params,
                                     bd);
      break;
    case 2:
      av1_highbd_dist_wtd_convolve_y(src, src_stride, dst, dst_stride, w, h,
                                     filter_params_y, subpel_y_qn, conv_params,
                                     bd);
      break;
    default:
      av1_highbd_dist_wtd_convolve_2d(src, src_stride, dst, dst_stride, w, h,
                                      filter_params_x, filter_params_y,
                                      subpel_x_qn, subpel_y_qn, conv_params,
                                      bd);
      break;
  }
}
1219 
// Dispatcher for unscaled single-reference prediction on 16-bit samples. A
// zero subpel offset in a direction means that direction needs no filtering,
// so the copy, x-only, y-only or full 2D kernel is chosen accordingly.
static void highbd_convolve_2d_facade_single(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride,
    const int w, const int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
  if (subpel_x_qn == 0) {
    if (subpel_y_qn == 0) {
      // Integer-pel position in both directions.
      aom_highbd_convolve_copy(src, src_stride, dst, dst_stride, w, h);
    } else {
      av1_highbd_convolve_y_sr(src, src_stride, dst, dst_stride, w, h,
                               filter_params_y, subpel_y_qn, bd);
    }
  } else if (subpel_y_qn == 0) {
    av1_highbd_convolve_x_sr(src, src_stride, dst, dst_stride, w, h,
                             filter_params_x, subpel_x_qn, conv_params, bd);
  } else {
    av1_highbd_convolve_2d_sr(src, src_stride, dst, dst_stride, w, h,
                              filter_params_x, filter_params_y, subpel_x_qn,
                              subpel_y_qn, conv_params, bd);
  }
}
1243 
// Entry point for high bit-depth inter/IntraBC prediction filtering.
//
// src8/dst8 are converted with CONVERT_TO_SHORTPTR before use.
// interp_filters[0] is the horizontal filter and interp_filters[1] the
// vertical one. Dispatch order:
//   1. 2-tap filters (IntraBC) with at least one non-zero sub-pixel offset
//      go to the dedicated *_intrabc_c kernels.
//   2. Scaled prediction goes to av1_highbd_convolve_2d_scale().
//   3. Compound prediction goes to highbd_convolve_2d_facade_compound().
//   4. Everything else goes to highbd_convolve_2d_facade_single().
void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride,
                                   uint8_t *dst8, int dst_stride, int w, int h,
                                   const InterpFilterParams *interp_filters[2],
                                   const int subpel_x_qn, int x_step_q4,
                                   const int subpel_y_qn, int y_step_q4,
                                   int scaled, ConvolveParams *conv_params,
                                   int bd) {
  const InterpFilterParams *const horiz_filter = interp_filters[0];
  const InterpFilterParams *const vert_filter = interp_filters[1];
  const uint16_t *const src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *const dst = CONVERT_TO_SHORTPTR(dst8);

  // 2-tap filter indicates that it is for IntraBC.
  if (horiz_filter->taps == 2 || vert_filter->taps == 2) {
    assert(filter_params_x->taps == 2 && filter_params_y->taps == 2);
    assert(!scaled);

    const int frac_x = subpel_x_qn != 0;
    const int frac_y = subpel_y_qn != 0;
    if (frac_x || frac_y) {
      if (frac_x && frac_y) {
        av1_highbd_convolve_2d_sr_intrabc_c(
            src, src_stride, dst, dst_stride, w, h, horiz_filter, vert_filter,
            subpel_x_qn, subpel_y_qn, conv_params, bd);
      } else if (frac_x) {
        av1_highbd_convolve_x_sr_intrabc_c(src, src_stride, dst, dst_stride, w,
                                           h, horiz_filter, subpel_x_qn,
                                           conv_params, bd);
      } else {
        av1_highbd_convolve_y_sr_intrabc_c(src, src_stride, dst, dst_stride, w,
                                           h, vert_filter, subpel_y_qn, bd);
      }
      return;
    }
    // Both offsets are zero: fall through to the generic dispatch below.
  }

  if (scaled) {
    if (conv_params->is_compound) {
      assert(conv_params->dst != NULL);
    }
    av1_highbd_convolve_2d_scale(src, src_stride, dst, dst_stride, w, h,
                                 horiz_filter, vert_filter, subpel_x_qn,
                                 x_step_q4, subpel_y_qn, y_step_q4, conv_params,
                                 bd);
    return;
  }

  if (conv_params->is_compound) {
    highbd_convolve_2d_facade_compound(src, src_stride, dst, dst_stride, w, h,
                                       horiz_filter, vert_filter, subpel_x_qn,
                                       subpel_y_qn, conv_params, bd);
    return;
  }

  highbd_convolve_2d_facade_single(src, src_stride, dst, dst_stride, w, h,
                                   horiz_filter, vert_filter, subpel_x_qn,
                                   subpel_y_qn, conv_params, bd);
}
1299 #endif  // CONFIG_AV1_HIGHBITDEPTH
1300 
1301 // Note: Fixed size intermediate buffers, place limits on parameters
1302 // of some functions. 2d filtering proceeds in 2 steps:
1303 //   (1) Interpolate horizontally into an intermediate buffer, temp.
1304 //   (2) Interpolate temp vertically to derive the sub-pixel result.
1305 // Deriving the maximum number of rows in the temp buffer (263):
1306 // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
1307 // --Largest block size is 128x128 pixels.
1308 // --128 rows in the downscaled frame span a distance of (128 - 1) * 32 in the
1309 //   original frame (in 1/16th pixel units).
1310 // --Must round-up because block may be located at sub-pixel position.
1311 // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
1312 // --((128 - 1) * 32 + 15) >> 4 + 8 = 263.
1313 #define WIENER_MAX_EXT_SIZE 263
1314 
horz_scalar_product(const uint8_t * a,const int16_t * b)1315 static INLINE int horz_scalar_product(const uint8_t *a, const int16_t *b) {
1316   int sum = 0;
1317   for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k];
1318   return sum;
1319 }
1320 
1321 #if CONFIG_AV1_HIGHBITDEPTH
highbd_horz_scalar_product(const uint16_t * a,const int16_t * b)1322 static INLINE int highbd_horz_scalar_product(const uint16_t *a,
1323                                              const int16_t *b) {
1324   int sum = 0;
1325   for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k];
1326   return sum;
1327 }
1328 #endif
1329 
highbd_vert_scalar_product(const uint16_t * a,ptrdiff_t a_stride,const int16_t * b)1330 static INLINE int highbd_vert_scalar_product(const uint16_t *a,
1331                                              ptrdiff_t a_stride,
1332                                              const int16_t *b) {
1333   int sum = 0;
1334   for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k * a_stride] * b[k];
1335   return sum;
1336 }
1337 
get_filter_base(const int16_t * filter)1338 static const InterpKernel *get_filter_base(const int16_t *filter) {
1339   // NOTE: This assumes that the filter table is 256-byte aligned.
1340   // TODO(agrange) Modify to make independent of table alignment.
1341   return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF));
1342 }
1343 
// Returns the distance from `base` to `f`, measured in whole InterpKernel
// elements, after viewing `f` as a pointer to InterpKernel.
static int get_filter_offset(const int16_t *f, const InterpKernel *base) {
  const InterpKernel *const kernel = (const InterpKernel *)(intptr_t)f;
  return (int)(kernel - base);
}
1347 
convolve_add_src_horiz_hip(const uint8_t * src,ptrdiff_t src_stride,uint16_t * dst,ptrdiff_t dst_stride,const InterpKernel * x_filters,int x0_q4,int x_step_q4,int w,int h,int round0_bits)1348 static void convolve_add_src_horiz_hip(const uint8_t *src, ptrdiff_t src_stride,
1349                                        uint16_t *dst, ptrdiff_t dst_stride,
1350                                        const InterpKernel *x_filters, int x0_q4,
1351                                        int x_step_q4, int w, int h,
1352                                        int round0_bits) {
1353   const int bd = 8;
1354   src -= SUBPEL_TAPS / 2 - 1;
1355   for (int y = 0; y < h; ++y) {
1356     int x_q4 = x0_q4;
1357     for (int x = 0; x < w; ++x) {
1358       const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
1359       const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
1360       const int rounding = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) +
1361                            (1 << (bd + FILTER_BITS - 1));
1362       const int sum = horz_scalar_product(src_x, x_filter) + rounding;
1363       dst[x] = (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, round0_bits), 0,
1364                                WIENER_CLAMP_LIMIT(round0_bits, bd) - 1);
1365       x_q4 += x_step_q4;
1366     }
1367     src += src_stride;
1368     dst += dst_stride;
1369   }
1370 }
1371 
convolve_add_src_vert_hip(const uint16_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const InterpKernel * y_filters,int y0_q4,int y_step_q4,int w,int h,int round1_bits)1372 static void convolve_add_src_vert_hip(const uint16_t *src, ptrdiff_t src_stride,
1373                                       uint8_t *dst, ptrdiff_t dst_stride,
1374                                       const InterpKernel *y_filters, int y0_q4,
1375                                       int y_step_q4, int w, int h,
1376                                       int round1_bits) {
1377   const int bd = 8;
1378   src -= src_stride * (SUBPEL_TAPS / 2 - 1);
1379 
1380   for (int x = 0; x < w; ++x) {
1381     int y_q4 = y0_q4;
1382     for (int y = 0; y < h; ++y) {
1383       const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
1384       const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
1385       const int rounding =
1386           ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) -
1387           (1 << (bd + round1_bits - 1));
1388       const int sum =
1389           highbd_vert_scalar_product(src_y, src_stride, y_filter) + rounding;
1390       dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, round1_bits));
1391       y_q4 += y_step_q4;
1392     }
1393     ++src;
1394     ++dst;
1395   }
1396 }
1397 
av1_wiener_convolve_add_src_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h,const WienerConvolveParams * conv_params)1398 void av1_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride,
1399                                    uint8_t *dst, ptrdiff_t dst_stride,
1400                                    const int16_t *filter_x, int x_step_q4,
1401                                    const int16_t *filter_y, int y_step_q4,
1402                                    int w, int h,
1403                                    const WienerConvolveParams *conv_params) {
1404   const InterpKernel *const filters_x = get_filter_base(filter_x);
1405   const int x0_q4 = get_filter_offset(filter_x, filters_x);
1406 
1407   const InterpKernel *const filters_y = get_filter_base(filter_y);
1408   const int y0_q4 = get_filter_offset(filter_y, filters_y);
1409 
1410   uint16_t temp[WIENER_MAX_EXT_SIZE * MAX_SB_SIZE];
1411   const int intermediate_height =
1412       (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS - 1;
1413   memset(temp + (intermediate_height * MAX_SB_SIZE), 0, MAX_SB_SIZE);
1414 
1415   assert(w <= MAX_SB_SIZE);
1416   assert(h <= MAX_SB_SIZE);
1417   assert(y_step_q4 <= 32);
1418   assert(x_step_q4 <= 32);
1419 
1420   convolve_add_src_horiz_hip(src - src_stride * (SUBPEL_TAPS / 2 - 1),
1421                              src_stride, temp, MAX_SB_SIZE, filters_x, x0_q4,
1422                              x_step_q4, w, intermediate_height,
1423                              conv_params->round_0);
1424   convolve_add_src_vert_hip(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
1425                             MAX_SB_SIZE, dst, dst_stride, filters_y, y0_q4,
1426                             y_step_q4, w, h, conv_params->round_1);
1427 }
1428 
1429 #if CONFIG_AV1_HIGHBITDEPTH
highbd_convolve_add_src_horiz_hip(const uint8_t * src8,ptrdiff_t src_stride,uint16_t * dst,ptrdiff_t dst_stride,const InterpKernel * x_filters,int x0_q4,int x_step_q4,int w,int h,int round0_bits,int bd)1430 static void highbd_convolve_add_src_horiz_hip(
1431     const uint8_t *src8, ptrdiff_t src_stride, uint16_t *dst,
1432     ptrdiff_t dst_stride, const InterpKernel *x_filters, int x0_q4,
1433     int x_step_q4, int w, int h, int round0_bits, int bd) {
1434   const int extraprec_clamp_limit = WIENER_CLAMP_LIMIT(round0_bits, bd);
1435   uint16_t *src = CONVERT_TO_SHORTPTR(src8);
1436   src -= SUBPEL_TAPS / 2 - 1;
1437   for (int y = 0; y < h; ++y) {
1438     int x_q4 = x0_q4;
1439     for (int x = 0; x < w; ++x) {
1440       const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
1441       const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
1442       const int rounding = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) +
1443                            (1 << (bd + FILTER_BITS - 1));
1444       const int sum = highbd_horz_scalar_product(src_x, x_filter) + rounding;
1445       dst[x] = (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, round0_bits), 0,
1446                                extraprec_clamp_limit - 1);
1447       x_q4 += x_step_q4;
1448     }
1449     src += src_stride;
1450     dst += dst_stride;
1451   }
1452 }
1453 
highbd_convolve_add_src_vert_hip(const uint16_t * src,ptrdiff_t src_stride,uint8_t * dst8,ptrdiff_t dst_stride,const InterpKernel * y_filters,int y0_q4,int y_step_q4,int w,int h,int round1_bits,int bd)1454 static void highbd_convolve_add_src_vert_hip(
1455     const uint16_t *src, ptrdiff_t src_stride, uint8_t *dst8,
1456     ptrdiff_t dst_stride, const InterpKernel *y_filters, int y0_q4,
1457     int y_step_q4, int w, int h, int round1_bits, int bd) {
1458   uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
1459   src -= src_stride * (SUBPEL_TAPS / 2 - 1);
1460   for (int x = 0; x < w; ++x) {
1461     int y_q4 = y0_q4;
1462     for (int y = 0; y < h; ++y) {
1463       const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
1464       const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
1465       const int rounding =
1466           ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) -
1467           (1 << (bd + round1_bits - 1));
1468       const int sum =
1469           highbd_vert_scalar_product(src_y, src_stride, y_filter) + rounding;
1470       dst[y * dst_stride] =
1471           clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, round1_bits), bd);
1472       y_q4 += y_step_q4;
1473     }
1474     ++src;
1475     ++dst;
1476   }
1477 }
1478 
av1_highbd_wiener_convolve_add_src_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h,const WienerConvolveParams * conv_params,int bd)1479 void av1_highbd_wiener_convolve_add_src_c(
1480     const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
1481     ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
1482     const int16_t *filter_y, int y_step_q4, int w, int h,
1483     const WienerConvolveParams *conv_params, int bd) {
1484   const InterpKernel *const filters_x = get_filter_base(filter_x);
1485   const int x0_q4 = get_filter_offset(filter_x, filters_x);
1486 
1487   const InterpKernel *const filters_y = get_filter_base(filter_y);
1488   const int y0_q4 = get_filter_offset(filter_y, filters_y);
1489 
1490   uint16_t temp[WIENER_MAX_EXT_SIZE * MAX_SB_SIZE];
1491   const int intermediate_height =
1492       (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
1493 
1494   assert(w <= MAX_SB_SIZE);
1495   assert(h <= MAX_SB_SIZE);
1496   assert(y_step_q4 <= 32);
1497   assert(x_step_q4 <= 32);
1498   assert(bd + FILTER_BITS - conv_params->round_0 + 2 <= 16);
1499 
1500   highbd_convolve_add_src_horiz_hip(src - src_stride * (SUBPEL_TAPS / 2 - 1),
1501                                     src_stride, temp, MAX_SB_SIZE, filters_x,
1502                                     x0_q4, x_step_q4, w, intermediate_height,
1503                                     conv_params->round_0, bd);
1504   highbd_convolve_add_src_vert_hip(
1505       temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE, dst, dst_stride,
1506       filters_y, y0_q4, y_step_q4, w, h, conv_params->round_1, bd);
1507 }
1508 #endif  // CONFIG_AV1_HIGHBITDEPTH
1509