• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 
12 #include <assert.h>
13 #include <string.h>
14 
15 #include "config/aom_dsp_rtcd.h"
16 #include "config/av1_rtcd.h"
17 
18 #include "av1/common/av1_common_int.h"
19 #include "av1/common/blockd.h"
20 #include "av1/common/convolve.h"
21 #include "av1/common/filter.h"
22 #include "av1/common/resize.h"
23 #include "aom_dsp/aom_dsp_common.h"
24 #include "aom_ports/mem.h"
25 
av1_convolve_horiz_rs_c(const uint8_t * src,int src_stride,uint8_t * dst,int dst_stride,int w,int h,const int16_t * x_filters,int x0_qn,int x_step_qn)26 void av1_convolve_horiz_rs_c(const uint8_t *src, int src_stride, uint8_t *dst,
27                              int dst_stride, int w, int h,
28                              const int16_t *x_filters, int x0_qn,
29                              int x_step_qn) {
30   src -= UPSCALE_NORMATIVE_TAPS / 2 - 1;
31   for (int y = 0; y < h; ++y) {
32     int x_qn = x0_qn;
33     for (int x = 0; x < w; ++x) {
34       const uint8_t *const src_x = &src[x_qn >> RS_SCALE_SUBPEL_BITS];
35       const int x_filter_idx =
36           (x_qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
37       assert(x_filter_idx <= RS_SUBPEL_MASK);
38       const int16_t *const x_filter =
39           &x_filters[x_filter_idx * UPSCALE_NORMATIVE_TAPS];
40       int sum = 0;
41       for (int k = 0; k < UPSCALE_NORMATIVE_TAPS; ++k)
42         sum += src_x[k] * x_filter[k];
43       dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
44       x_qn += x_step_qn;
45     }
46     src += src_stride;
47     dst += dst_stride;
48   }
49 }
50 
av1_highbd_convolve_horiz_rs_c(const uint16_t * src,int src_stride,uint16_t * dst,int dst_stride,int w,int h,const int16_t * x_filters,int x0_qn,int x_step_qn,int bd)51 void av1_highbd_convolve_horiz_rs_c(const uint16_t *src, int src_stride,
52                                     uint16_t *dst, int dst_stride, int w, int h,
53                                     const int16_t *x_filters, int x0_qn,
54                                     int x_step_qn, int bd) {
55   src -= UPSCALE_NORMATIVE_TAPS / 2 - 1;
56   for (int y = 0; y < h; ++y) {
57     int x_qn = x0_qn;
58     for (int x = 0; x < w; ++x) {
59       const uint16_t *const src_x = &src[x_qn >> RS_SCALE_SUBPEL_BITS];
60       const int x_filter_idx =
61           (x_qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
62       assert(x_filter_idx <= RS_SUBPEL_MASK);
63       const int16_t *const x_filter =
64           &x_filters[x_filter_idx * UPSCALE_NORMATIVE_TAPS];
65       int sum = 0;
66       for (int k = 0; k < UPSCALE_NORMATIVE_TAPS; ++k)
67         sum += src_x[k] * x_filter[k];
68       dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
69       x_qn += x_step_qn;
70     }
71     src += src_stride;
72     dst += dst_stride;
73   }
74 }
75 
/*
 * Separable 2-D convolution for 8-bit, single-reference prediction.
 *
 * Pass 1 filters (h + taps_y - 1) source rows horizontally into the 16-bit
 * scratch buffer im_block, rounding each sum by conv_params->round_0.
 * Pass 2 filters im_block vertically, rounds by conv_params->round_1,
 * subtracts round_offset, applies the remaining `bits` of rounding and
 * stores the clipped pixel in dst.
 *
 * Preconditions (asserted): w and h do not exceed MAX_SB_SIZE, which is what
 * im_block is sized for, and round_0 + round_1 <= 2 * FILTER_BITS.
 */
void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
                          int dst_stride, int w, int h,
                          const InterpFilterParams *filter_params_x,
                          const InterpFilterParams *filter_params_y,
                          const int subpel_x_qn, const int subpel_y_qn,
                          ConvolveParams *conv_params) {
  int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
  const int im_h = h + filter_params_y->taps - 1;
  const int im_stride = w;
  assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);
  const int fo_vert = filter_params_y->taps / 2 - 1;
  const int fo_horiz = filter_params_x->taps / 2 - 1;
  const int bd = 8;
  const int bits =
      FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
  // `bits` is the final rounding amount; the scaled and high bit-depth 2-D
  // paths in this file assert the same invariant.
  assert(bits >= 0);

  // horizontal filter
  // Start fo_vert rows above the block so the vertical pass has its context.
  const uint8_t *src_horiz = src - fo_vert * src_stride;
  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);
  for (int y = 0; y < im_h; ++y) {
    for (int x = 0; x < w; ++x) {
      // Seed with a positive bias; the assert below checks the sum stays in
      // [0, 1 << (bd + FILTER_BITS + 1)).
      int32_t sum = (1 << (bd + FILTER_BITS - 1));
      for (int k = 0; k < filter_params_x->taps; ++k) {
        sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
      }
      assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
      im_block[y * im_stride + x] =
          (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0);
    }
  }

  // vertical filter
  int16_t *src_vert = im_block + fo_vert * im_stride;
  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  // Loop-invariant amount subtracted from every vertically filtered sample.
  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
                           (1 << (offset_bits - conv_params->round_1 - 1));
  for (int y = 0; y < h; ++y) {
    for (int x = 0; x < w; ++x) {
      int32_t sum = 1 << offset_bits;
      for (int k = 0; k < filter_params_y->taps; ++k) {
        sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
      }
      assert(0 <= sum && sum < (1 << (offset_bits + 2)));
      int16_t res =
          ROUND_POWER_OF_TWO(sum, conv_params->round_1) - round_offset;
      dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(res, bits));
    }
  }
}
127 
av1_convolve_y_sr_c(const uint8_t * src,int src_stride,uint8_t * dst,int dst_stride,int w,int h,const InterpFilterParams * filter_params_y,const int subpel_y_qn)128 void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
129                          int dst_stride, int w, int h,
130                          const InterpFilterParams *filter_params_y,
131                          const int subpel_y_qn) {
132   const int fo_vert = filter_params_y->taps / 2 - 1;
133 
134   // vertical filter
135   const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
136       filter_params_y, subpel_y_qn & SUBPEL_MASK);
137   for (int y = 0; y < h; ++y) {
138     for (int x = 0; x < w; ++x) {
139       int32_t res = 0;
140       for (int k = 0; k < filter_params_y->taps; ++k) {
141         res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x];
142       }
143       dst[y * dst_stride + x] =
144           clip_pixel(ROUND_POWER_OF_TWO(res, FILTER_BITS));
145     }
146   }
147 }
148 
/*
 * Horizontal-only convolution for 8-bit, single-reference prediction.
 *
 * Each output pixel is the sum of filter_params_x->taps source pixels from
 * the same row, weighted by the kernel selected with
 * (subpel_x_qn & SUBPEL_MASK). The sum is rounded in two steps, first by
 * conv_params->round_0 and then by the remaining FILTER_BITS - round_0 bits,
 * before being clipped.
 */
void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
                         int dst_stride, int w, int h,
                         const InterpFilterParams *filter_params_x,
                         const int subpel_x_qn, ConvolveParams *conv_params) {
  const int taps = filter_params_x->taps;
  const int bits = FILTER_BITS - conv_params->round_0;

  assert(bits >= 0);
  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));

  const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);
  // Sample read by tap 0 when producing output column 0.
  const uint8_t *const src_left = src - (taps / 2 - 1);

  for (int row = 0; row < h; ++row) {
    const uint8_t *const in = src_left + row * src_stride;
    uint8_t *const out = dst + row * dst_stride;
    for (int col = 0; col < w; ++col) {
      int32_t acc = 0;
      for (int tap = 0; tap < taps; ++tap) {
        acc += kernel[tap] * in[col + tap];
      }
      const int32_t stage0 = ROUND_POWER_OF_TWO(acc, conv_params->round_0);
      out[col] = clip_pixel(ROUND_POWER_OF_TWO(stage0, bits));
    }
  }
}
175 
/*
 * Separable 2-D convolution for 8-bit compound prediction.
 *
 * Pass 1 filters (h + taps_y - 1) source rows horizontally into the 16-bit
 * scratch buffer im_block, rounding by conv_params->round_0. Pass 2 filters
 * im_block vertically and rounds by conv_params->round_1, giving an
 * intermediate value `res` that still carries round_offset.
 *
 * If conv_params->do_average is 0, `res` is stored in conv_params->dst.
 * Otherwise it is combined with the value already in conv_params->dst:
 * weighted by fwd_offset / bck_offset and shifted by DIST_PRECISION_BITS
 * when use_dist_wtd_comp_avg is set, or a plain (a + b) >> 1 when it is not.
 * round_offset is then removed and the rounded, clipped pixel goes to dst.
 *
 * Precondition (asserted): w and h do not exceed MAX_SB_SIZE, which is what
 * im_block is sized for.
 */
void av1_dist_wtd_convolve_2d_c(const uint8_t *src, int src_stride,
                                uint8_t *dst, int dst_stride, int w, int h,
                                const InterpFilterParams *filter_params_x,
                                const InterpFilterParams *filter_params_y,
                                const int subpel_x_qn, const int subpel_y_qn,
                                ConvolveParams *conv_params) {
  CONV_BUF_TYPE *dst16 = conv_params->dst;
  const int dst16_stride = conv_params->dst_stride;
  int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
  const int im_h = h + filter_params_y->taps - 1;
  const int im_stride = w;
  // Same scratch-buffer bound that av1_convolve_2d_sr_c checks.
  assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);
  const int fo_vert = filter_params_y->taps / 2 - 1;
  const int fo_horiz = filter_params_x->taps / 2 - 1;
  const int bd = 8;
  const int round_bits =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;

  // horizontal filter
  // Start fo_vert rows above the block so the vertical pass has its context.
  const uint8_t *src_horiz = src - fo_vert * src_stride;
  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);
  for (int y = 0; y < im_h; ++y) {
    for (int x = 0; x < w; ++x) {
      int32_t sum = (1 << (bd + FILTER_BITS - 1));
      for (int k = 0; k < filter_params_x->taps; ++k) {
        sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
      }
      assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
      im_block[y * im_stride + x] =
          (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0);
    }
  }

  // vertical filter
  int16_t *src_vert = im_block + fo_vert * im_stride;
  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  // Loop-invariant; computed once, as the x / y / copy variants do.
  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
                           (1 << (offset_bits - conv_params->round_1 - 1));
  for (int y = 0; y < h; ++y) {
    for (int x = 0; x < w; ++x) {
      int32_t sum = 1 << offset_bits;
      for (int k = 0; k < filter_params_y->taps; ++k) {
        sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
      }
      assert(0 <= sum && sum < (1 << (offset_bits + 2)));
      CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
      if (conv_params->do_average) {
        int32_t tmp = dst16[y * dst16_stride + x];
        if (conv_params->use_dist_wtd_comp_avg) {
          tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
          tmp = tmp >> DIST_PRECISION_BITS;
        } else {
          tmp += res;
          tmp = tmp >> 1;
        }
        tmp -= round_offset;
        dst[y * dst_stride + x] =
            clip_pixel(ROUND_POWER_OF_TWO(tmp, round_bits));
      } else {
        dst16[y * dst16_stride + x] = res;
      }
    }
  }
}
241 
/*
 * Vertical-only convolution for 8-bit compound prediction.
 *
 * The filtered sum is scaled up by (FILTER_BITS - round_0) bits, rounded by
 * conv_params->round_1 and biased by round_offset. With do_average clear the
 * value is stored in conv_params->dst. Otherwise it is combined with the
 * value already stored there (weighted by fwd_offset / bck_offset when
 * use_dist_wtd_comp_avg is set, a plain halved sum otherwise), the bias is
 * removed and the rounded, clipped pixel is written to dst.
 */
void av1_dist_wtd_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst,
                               int dst_stride, int w, int h,
                               const InterpFilterParams *filter_params_y,
                               const int subpel_y_qn,
                               ConvolveParams *conv_params) {
  const int bd = 8;
  const int taps = filter_params_y->taps;
  const int bits = FILTER_BITS - conv_params->round_0;
  const int round_bits =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
                           (1 << (offset_bits - conv_params->round_1 - 1));
  CONV_BUF_TYPE *const dst16 = conv_params->dst;
  const int dst16_stride = conv_params->dst_stride;

  const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);
  // Row read by tap 0 when producing output row 0.
  const uint8_t *const src_top = src - (taps / 2 - 1) * src_stride;

  for (int row = 0; row < h; ++row) {
    const uint8_t *const in = src_top + row * src_stride;
    for (int col = 0; col < w; ++col) {
      int32_t acc = 0;
      for (int tap = 0; tap < taps; ++tap) {
        acc += kernel[tap] * in[tap * src_stride + col];
      }
      acc *= (1 << bits);
      const int32_t res =
          ROUND_POWER_OF_TWO(acc, conv_params->round_1) + round_offset;

      if (!conv_params->do_average) {
        dst16[row * dst16_stride + col] = res;
        continue;
      }

      int32_t blend = dst16[row * dst16_stride + col];
      if (conv_params->use_dist_wtd_comp_avg) {
        blend = blend * conv_params->fwd_offset + res * conv_params->bck_offset;
        blend = blend >> DIST_PRECISION_BITS;
      } else {
        blend = (blend + res) >> 1;
      }
      blend -= round_offset;
      dst[row * dst_stride + col] =
          clip_pixel(ROUND_POWER_OF_TWO(blend, round_bits));
    }
  }
}
288 
/*
 * Horizontal-only convolution for 8-bit compound prediction.
 *
 * The filtered sum is rounded by conv_params->round_0, scaled up by
 * (FILTER_BITS - round_1) bits and biased by round_offset. With do_average
 * clear the value is stored in conv_params->dst. Otherwise it is combined
 * with the value already stored there (weighted by fwd_offset / bck_offset
 * when use_dist_wtd_comp_avg is set, a plain halved sum otherwise), the bias
 * is removed and the rounded, clipped pixel is written to dst.
 */
void av1_dist_wtd_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst,
                               int dst_stride, int w, int h,
                               const InterpFilterParams *filter_params_x,
                               const int subpel_x_qn,
                               ConvolveParams *conv_params) {
  const int bd = 8;
  const int taps = filter_params_x->taps;
  const int bits = FILTER_BITS - conv_params->round_1;
  const int round_bits =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
                           (1 << (offset_bits - conv_params->round_1 - 1));
  CONV_BUF_TYPE *const dst16 = conv_params->dst;
  const int dst16_stride = conv_params->dst_stride;

  const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);
  // Sample read by tap 0 when producing output column 0.
  const uint8_t *const src_left = src - (taps / 2 - 1);

  for (int row = 0; row < h; ++row) {
    const uint8_t *const in = src_left + row * src_stride;
    for (int col = 0; col < w; ++col) {
      int32_t acc = 0;
      for (int tap = 0; tap < taps; ++tap) {
        acc += kernel[tap] * in[col + tap];
      }
      int32_t res = (1 << bits) * ROUND_POWER_OF_TWO(acc, conv_params->round_0);
      res += round_offset;

      if (!conv_params->do_average) {
        dst16[row * dst16_stride + col] = res;
        continue;
      }

      int32_t blend = dst16[row * dst16_stride + col];
      if (conv_params->use_dist_wtd_comp_avg) {
        blend = blend * conv_params->fwd_offset + res * conv_params->bck_offset;
        blend = blend >> DIST_PRECISION_BITS;
      } else {
        blend = (blend + res) >> 1;
      }
      blend -= round_offset;
      dst[row * dst_stride + col] =
          clip_pixel(ROUND_POWER_OF_TWO(blend, round_bits));
    }
  }
}
335 
/*
 * Unfiltered (integer-position) path for 8-bit compound prediction.
 *
 * Each source pixel is shifted left by `bits` and biased by round_offset so
 * that it has the same form as the filtered compound paths produce. With
 * do_average clear the value is stored in conv_params->dst. Otherwise it is
 * combined with the value already stored there, the bias is removed and the
 * rounded, clipped pixel is written to dst.
 */
void av1_dist_wtd_convolve_2d_copy_c(const uint8_t *src, int src_stride,
                                     uint8_t *dst, int dst_stride, int w, int h,
                                     ConvolveParams *conv_params) {
  const int bd = 8;
  const int bits =
      FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
                           (1 << (offset_bits - conv_params->round_1 - 1));
  CONV_BUF_TYPE *const dst16 = conv_params->dst;
  const int dst16_stride = conv_params->dst_stride;

  for (int row = 0; row < h; ++row) {
    const uint8_t *const in = src + row * src_stride;
    for (int col = 0; col < w; ++col) {
      CONV_BUF_TYPE res = in[col] << bits;
      res += round_offset;

      if (!conv_params->do_average) {
        dst16[row * dst16_stride + col] = res;
        continue;
      }

      int32_t blend = dst16[row * dst16_stride + col];
      if (conv_params->use_dist_wtd_comp_avg) {
        blend = blend * conv_params->fwd_offset + res * conv_params->bck_offset;
        blend = blend >> DIST_PRECISION_BITS;
      } else {
        blend = (blend + res) >> 1;
      }
      blend -= round_offset;
      dst[row * dst_stride + col] = clip_pixel(ROUND_POWER_OF_TWO(blend, bits));
    }
  }
}
370 
/*
 * Separable 2-D convolution with per-pixel scaled stepping, 8-bit.
 *
 * Pass 1 (horizontal): for each of im_h intermediate rows, a fixed-point
 * position starting at subpel_x_qn and advancing by x_step_qn selects the
 * source window and kernel for every output column. Results, rounded by
 * conv_params->round_0, are stored in im_block.
 *
 * Pass 2 (vertical): for each column, a position starting at subpel_y_qn and
 * advancing by y_step_qn selects the im_block rows and the kernel. The
 * result is rounded by conv_params->round_1 and then either
 *   - stored in conv_params->dst (is_compound set, do_average clear),
 *   - combined with the value already in conv_params->dst, un-biased,
 *     rounded and clipped into dst (is_compound and do_average set), or
 *   - un-biased, rounded and clipped into dst (is_compound clear).
 */
void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst,
                             int dst_stride, int w, int h,
                             const InterpFilterParams *filter_params_x,
                             const InterpFilterParams *filter_params_y,
                             const int subpel_x_qn, const int x_step_qn,
                             const int subpel_y_qn, const int y_step_qn,
                             ConvolveParams *conv_params) {
  int16_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE];
  const int taps_x = filter_params_x->taps;
  const int taps_y = filter_params_y->taps;
  const int im_h =
      (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) + taps_y;
  const int im_stride = w;
  CONV_BUF_TYPE *const dst16 = conv_params->dst;
  const int dst16_stride = conv_params->dst_stride;
  const int bits =
      FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
  assert(bits >= 0);
  const int fo_vert = taps_y / 2 - 1;
  const int fo_horiz = taps_x / 2 - 1;
  const int bd = 8;

  // Horizontal pass, starting fo_vert rows above the block.
  const uint8_t *src_row = src - fo_vert * src_stride;
  for (int row = 0; row < im_h; ++row, src_row += src_stride) {
    int x_qn = subpel_x_qn;
    for (int col = 0; col < w; ++col, x_qn += x_step_qn) {
      const int phase = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
      assert(phase < SUBPEL_SHIFTS);
      const int16_t *const kernel =
          av1_get_interp_filter_subpel_kernel(filter_params_x, phase);
      // Leftmost sample covered by the kernel at this position.
      const uint8_t *const window =
          src_row + (x_qn >> SCALE_SUBPEL_BITS) - fo_horiz;
      int32_t sum = (1 << (bd + FILTER_BITS - 1));
      for (int tap = 0; tap < taps_x; ++tap) {
        sum += kernel[tap] * window[tap];
      }
      assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
      im_block[row * im_stride + col] =
          (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0);
    }
  }

  // Vertical pass, one output column at a time.
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
                           (1 << (offset_bits - conv_params->round_1 - 1));
  for (int col = 0; col < w; ++col) {
    const int16_t *const column = im_block + col;
    int y_qn = subpel_y_qn;
    for (int row = 0; row < h; ++row, y_qn += y_step_qn) {
      const int phase = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
      assert(phase < SUBPEL_SHIFTS);
      const int16_t *const kernel =
          av1_get_interp_filter_subpel_kernel(filter_params_y, phase);
      // im_block row read by tap 0 at this position.
      const int top = y_qn >> SCALE_SUBPEL_BITS;
      int32_t sum = 1 << offset_bits;
      for (int tap = 0; tap < taps_y; ++tap) {
        sum += kernel[tap] * column[(top + tap) * im_stride];
      }
      assert(0 <= sum && sum < (1 << (offset_bits + 2)));
      const CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);

      if (conv_params->is_compound && !conv_params->do_average) {
        dst16[row * dst16_stride + col] = res;
        continue;
      }

      int32_t px = res;
      if (conv_params->is_compound) {
        // Combine with the value already held in the compound buffer.
        px = dst16[row * dst16_stride + col];
        if (conv_params->use_dist_wtd_comp_avg) {
          px = px * conv_params->fwd_offset + res * conv_params->bck_offset;
          px = px >> DIST_PRECISION_BITS;
        } else {
          px = (px + res) >> 1;
        }
      }
      /* Subtract round offset and convolve round */
      px -= round_offset;
      dst[row * dst_stride + col] = clip_pixel(ROUND_POWER_OF_TWO(px, bits));
    }
  }
}
456 
/*
 * Forwards to av1_convolve_2d_scale() after checking, in debug builds, that
 * a compound request comes with a non-NULL conv_params->dst buffer.
 */
static void convolve_2d_scale_wrapper(
    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int x_step_qn, const int subpel_y_qn, const int y_step_qn,
    ConvolveParams *conv_params) {
  assert(!conv_params->is_compound || conv_params->dst != NULL);
  av1_convolve_2d_scale(src, src_stride, dst, dst_stride, w, h, filter_params_x,
                        filter_params_y, subpel_x_qn, x_step_qn, subpel_y_qn,
                        y_step_qn, conv_params);
}
470 
/*
 * Chooses the compound-prediction kernel from which sub-pixel offsets are
 * non-zero: both -> 2-D, x only -> horizontal, y only -> vertical,
 * neither -> copy.
 */
static void convolve_2d_facade_compound(
    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int subpel_y_qn, ConvolveParams *conv_params) {
  if (subpel_x_qn != 0 && subpel_y_qn != 0) {
    av1_dist_wtd_convolve_2d(src, src_stride, dst, dst_stride, w, h,
                             filter_params_x, filter_params_y, subpel_x_qn,
                             subpel_y_qn, conv_params);
  } else if (subpel_x_qn != 0) {
    av1_dist_wtd_convolve_x(src, src_stride, dst, dst_stride, w, h,
                            filter_params_x, subpel_x_qn, conv_params);
  } else if (subpel_y_qn != 0) {
    av1_dist_wtd_convolve_y(src, src_stride, dst, dst_stride, w, h,
                            filter_params_y, subpel_y_qn, conv_params);
  } else {
    av1_dist_wtd_convolve_2d_copy(src, src_stride, dst, dst_stride, w, h,
                                  conv_params);
  }
}
494 
/*
 * Chooses the single-reference kernel from which sub-pixel offsets are
 * non-zero: both -> 2-D, x only -> horizontal, y only -> vertical,
 * neither -> plain copy.
 */
static void convolve_2d_facade_single(
    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int subpel_y_qn, ConvolveParams *conv_params) {
  if (subpel_x_qn != 0 && subpel_y_qn != 0) {
    av1_convolve_2d_sr(src, src_stride, dst, dst_stride, w, h, filter_params_x,
                       filter_params_y, subpel_x_qn, subpel_y_qn, conv_params);
  } else if (subpel_x_qn != 0) {
    av1_convolve_x_sr(src, src_stride, dst, dst_stride, w, h, filter_params_x,
                      subpel_x_qn, conv_params);
  } else if (subpel_y_qn != 0) {
    av1_convolve_y_sr(src, src_stride, dst, dst_stride, w, h, filter_params_y,
                      subpel_y_qn);
  } else {
    aom_convolve_copy(src, src_stride, dst, dst_stride, w, h);
  }
}
516 
/*
 * Top-level dispatcher for 8-bit inter-prediction convolution.
 *
 * interp_filters[0] is used as the horizontal filter and interp_filters[1]
 * as the vertical one. Dispatch order:
 *   1. If either filter has 2 taps (both must, and `scaled` must be 0), the
 *      C reference kernels are called directly for whichever sub-pixel
 *      offsets are non-zero. If both offsets are zero, control falls through
 *      to step 2.
 *   2. `scaled` set      -> scaled 2-D path (x_step_q4 / y_step_q4 are passed
 *                           on as the per-pixel steps).
 *      is_compound set   -> compound kernels.
 *      otherwise         -> single-reference kernels.
 */
void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst,
                            int dst_stride, int w, int h,
                            const InterpFilterParams *interp_filters[2],
                            const int subpel_x_qn, int x_step_q4,
                            const int subpel_y_qn, int y_step_q4, int scaled,
                            ConvolveParams *conv_params) {
  const InterpFilterParams *filter_params_x = interp_filters[0];
  const InterpFilterParams *filter_params_y = interp_filters[1];

  // TODO(jingning, yunqing): Add SIMD support to 2-tap filter case.
  // Do we have SIMD support to 4-tap case?
  // 2-tap filter indicates that it is for IntraBC.
  if (filter_params_x->taps == 2 || filter_params_y->taps == 2) {
    assert(filter_params_x->taps == 2 && filter_params_y->taps == 2);
    assert(!scaled);
    if (subpel_x_qn && subpel_y_qn) {
      av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
                           filter_params_x, filter_params_y, subpel_x_qn,
                           subpel_y_qn, conv_params);
      return;
    } else if (subpel_x_qn) {
      av1_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h,
                          filter_params_x, subpel_x_qn, conv_params);
      return;
    } else if (subpel_y_qn) {
      av1_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h,
                          filter_params_y, subpel_y_qn);
      return;
    }
    // Both offsets are zero: handled by the generic dispatch below.
  }

  if (scaled) {
    convolve_2d_scale_wrapper(src, src_stride, dst, dst_stride, w, h,
                              filter_params_x, filter_params_y, subpel_x_qn,
                              x_step_q4, subpel_y_qn, y_step_q4, conv_params);
  } else if (conv_params->is_compound) {
    convolve_2d_facade_compound(src, src_stride, dst, dst_stride, w, h,
                                filter_params_x, filter_params_y, subpel_x_qn,
                                subpel_y_qn, conv_params);
  } else {
    convolve_2d_facade_single(src, src_stride, dst, dst_stride, w, h,
                              filter_params_x, filter_params_y, subpel_x_qn,
                              subpel_y_qn, conv_params);
  }
}
567 
568 #if CONFIG_AV1_HIGHBITDEPTH
/*
 * Horizontal-only convolution for high bit-depth, single-reference
 * prediction.
 *
 * Each output pixel is the sum of filter_params_x->taps source pixels from
 * the same row, weighted by the kernel selected with
 * (subpel_x_qn & SUBPEL_MASK). The sum is rounded first by
 * conv_params->round_0 and then by the remaining FILTER_BITS - round_0 bits,
 * and clipped with clip_pixel_highbd() using bit depth bd.
 */
void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride,
                                uint16_t *dst, int dst_stride, int w, int h,
                                const InterpFilterParams *filter_params_x,
                                const int subpel_x_qn,
                                ConvolveParams *conv_params, int bd) {
  const int taps = filter_params_x->taps;
  const int bits = FILTER_BITS - conv_params->round_0;

  assert(bits >= 0);
  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));

  const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);
  // Sample read by tap 0 when producing output column 0.
  const uint16_t *const src_left = src - (taps / 2 - 1);

  for (int row = 0; row < h; ++row) {
    const uint16_t *const in = src_left + row * src_stride;
    uint16_t *const out = dst + row * dst_stride;
    for (int col = 0; col < w; ++col) {
      int32_t acc = 0;
      for (int tap = 0; tap < taps; ++tap) {
        acc += kernel[tap] * in[col + tap];
      }
      const int32_t stage0 = ROUND_POWER_OF_TWO(acc, conv_params->round_0);
      out[col] = clip_pixel_highbd(ROUND_POWER_OF_TWO(stage0, bits), bd);
    }
  }
}
596 
av1_highbd_convolve_y_sr_c(const uint16_t * src,int src_stride,uint16_t * dst,int dst_stride,int w,int h,const InterpFilterParams * filter_params_y,const int subpel_y_qn,int bd)597 void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride,
598                                 uint16_t *dst, int dst_stride, int w, int h,
599                                 const InterpFilterParams *filter_params_y,
600                                 const int subpel_y_qn, int bd) {
601   const int fo_vert = filter_params_y->taps / 2 - 1;
602   // vertical filter
603   const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
604       filter_params_y, subpel_y_qn & SUBPEL_MASK);
605   for (int y = 0; y < h; ++y) {
606     for (int x = 0; x < w; ++x) {
607       int32_t res = 0;
608       for (int k = 0; k < filter_params_y->taps; ++k) {
609         res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x];
610       }
611       dst[y * dst_stride + x] =
612           clip_pixel_highbd(ROUND_POWER_OF_TWO(res, FILTER_BITS), bd);
613     }
614   }
615 }
616 
// Separable 2D, single-reference convolution for high bit-depth samples.
//
// Pass 1 filters horizontally into the on-stack buffer im_block, producing
// h + filter_params_y->taps - 1 rows of w values (stride w), each rounded by
// conv_params->round_0. Pass 2 filters im_block vertically, rounds by
// conv_params->round_1, removes the offsets that were added to keep the
// intermediate sums non-negative, applies the remaining
// 2 * FILTER_BITS - round_0 - round_1 bits of rounding and clips to bd bits.
void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride,
                                 uint16_t *dst, int dst_stride, int w, int h,
                                 const InterpFilterParams *filter_params_x,
                                 const InterpFilterParams *filter_params_y,
                                 const int subpel_x_qn, const int subpel_y_qn,
                                 ConvolveParams *conv_params, int bd) {
  int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
  const int taps_y = filter_params_y->taps;
  const int im_rows = h + taps_y - 1;
  const int im_stride = w;
  assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);
  const int taps_x = filter_params_x->taps;
  const int fo_vert = taps_y / 2 - 1;
  const int fo_horiz = taps_x / 2 - 1;
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  const int bits = FILTER_BITS * 2 - round_0 - round_1;
  assert(bits >= 0);

  // Pass 1: horizontal filter, starting fo_vert rows above the block so the
  // vertical pass has its leading taps available.
  const uint16_t *const src_top = src - fo_vert * src_stride;
  const int16_t *const kernel_x = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);
  for (int row = 0; row < im_rows; ++row) {
    for (int col = 0; col < w; ++col) {
      int32_t acc = (1 << (bd + FILTER_BITS - 1));
      for (int tap = 0; tap < taps_x; ++tap) {
        acc += kernel_x[tap] * src_top[row * src_stride + col - fo_horiz + tap];
      }
      assert(0 <= acc && acc < (1 << (bd + FILTER_BITS + 1)));
      im_block[row * im_stride + col] = ROUND_POWER_OF_TWO(acc, round_0);
    }
  }

  // Pass 2: vertical filter over the intermediate block.
  int16_t *const im_origin = im_block + fo_vert * im_stride;
  const int16_t *const kernel_y = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  for (int row = 0; row < h; ++row) {
    for (int col = 0; col < w; ++col) {
      int32_t acc = 1 << offset_bits;
      for (int tap = 0; tap < taps_y; ++tap) {
        acc += kernel_y[tap] * im_origin[(row - fo_vert + tap) * im_stride + col];
      }
      assert(0 <= acc && acc < (1 << (offset_bits + 2)));
      // Offset contributed by both passes after the round_1 shift.
      const int32_t round_offset = (1 << (offset_bits - round_1)) +
                                   (1 << (offset_bits - round_1 - 1));
      const int32_t res = ROUND_POWER_OF_TWO(acc, round_1) - round_offset;
      dst[row * dst_stride + col] =
          clip_pixel_highbd(ROUND_POWER_OF_TWO(res, bits), bd);
    }
  }
}
669 
// Separable 2D convolution for compound prediction, high bit-depth samples.
//
// The horizontal pass writes h + filter_params_y->taps - 1 rows into im_block
// (rounded by conv_params->round_0); the vertical pass rounds by
// conv_params->round_1. What happens to each result depends on conv_params:
//   - do_average == 0: the (still offset) value is stored in conv_params->dst.
//   - do_average != 0: the value is combined with the one already stored in
//     conv_params->dst, either weighted by fwd_offset / bck_offset and shifted
//     by DIST_PRECISION_BITS (use_dist_wtd_comp_avg) or as a plain halved sum;
//     the offset is then removed and the rounded, clipped pixel goes to dst.
void av1_highbd_dist_wtd_convolve_2d_c(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
  int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
  CONV_BUF_TYPE *const dst16 = conv_params->dst;
  const int dst16_stride = conv_params->dst_stride;
  const int taps_y = filter_params_y->taps;
  const int taps_x = filter_params_x->taps;
  const int im_rows = h + taps_y - 1;
  const int im_stride = w;
  const int fo_vert = taps_y / 2 - 1;
  const int fo_horiz = taps_x / 2 - 1;
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  const int round_bits = 2 * FILTER_BITS - round_0 - round_1;
  assert(round_bits >= 0);

  // Pass 1: horizontal filter into the intermediate block.
  const uint16_t *const src_top = src - fo_vert * src_stride;
  const int16_t *const kernel_x = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);
  for (int row = 0; row < im_rows; ++row) {
    for (int col = 0; col < w; ++col) {
      int32_t acc = (1 << (bd + FILTER_BITS - 1));
      for (int tap = 0; tap < taps_x; ++tap) {
        acc += kernel_x[tap] * src_top[row * src_stride + col - fo_horiz + tap];
      }
      assert(0 <= acc && acc < (1 << (bd + FILTER_BITS + 1)));
      im_block[row * im_stride + col] =
          (int16_t)ROUND_POWER_OF_TWO(acc, round_0);
    }
  }

  // Pass 2: vertical filter, then store or average.
  int16_t *const im_origin = im_block + fo_vert * im_stride;
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  const int16_t *const kernel_y = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);
  for (int row = 0; row < h; ++row) {
    for (int col = 0; col < w; ++col) {
      int32_t acc = 1 << offset_bits;
      for (int tap = 0; tap < taps_y; ++tap) {
        acc += kernel_y[tap] * im_origin[(row - fo_vert + tap) * im_stride + col];
      }
      assert(0 <= acc && acc < (1 << (offset_bits + 2)));
      const CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(acc, round_1);

      if (!conv_params->do_average) {
        // First prediction of the pair: keep the intermediate value.
        dst16[row * dst16_stride + col] = res;
        continue;
      }

      const int32_t prev = dst16[row * dst16_stride + col];
      int32_t avg;
      if (conv_params->use_dist_wtd_comp_avg) {
        avg = prev * conv_params->fwd_offset + res * conv_params->bck_offset;
        avg = avg >> DIST_PRECISION_BITS;
      } else {
        avg = (prev + res) >> 1;
      }
      avg -= (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));
      dst[row * dst_stride + col] =
          clip_pixel_highbd(ROUND_POWER_OF_TWO(avg, round_bits), bd);
    }
  }
}
736 
// Horizontal-only convolution for compound prediction, high bit-depth samples.
//
// The filtered sum is rounded by conv_params->round_0, scaled up by
// FILTER_BITS - round_1 bits, and biased by round_offset. When
// conv_params->do_average is zero the value is stored in conv_params->dst;
// otherwise it is averaged (plain or fwd_offset / bck_offset weighted) with the
// value already stored there, the bias is removed, and the rounded, clipped
// pixel is written to dst.
void av1_highbd_dist_wtd_convolve_x_c(const uint16_t *src, int src_stride,
                                      uint16_t *dst, int dst_stride, int w,
                                      int h,
                                      const InterpFilterParams *filter_params_x,
                                      const int subpel_x_qn,
                                      ConvolveParams *conv_params, int bd) {
  CONV_BUF_TYPE *const dst16 = conv_params->dst;
  const int dst16_stride = conv_params->dst_stride;
  const int num_taps = filter_params_x->taps;
  const int fo_horiz = num_taps / 2 - 1;
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  const int bits = FILTER_BITS - round_1;
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  const int round_offset =
      (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));
  const int round_bits = 2 * FILTER_BITS - round_0 - round_1;
  assert(round_bits >= 0);
  assert(bits >= 0);

  const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);
  for (int row = 0; row < h; ++row) {
    for (int col = 0; col < w; ++col) {
      int32_t res = 0;
      for (int tap = 0; tap < num_taps; ++tap) {
        res += kernel[tap] * src[row * src_stride + col - fo_horiz + tap];
      }
      res = (1 << bits) * ROUND_POWER_OF_TWO(res, round_0);
      res += round_offset;

      if (!conv_params->do_average) {
        dst16[row * dst16_stride + col] = res;
        continue;
      }

      const int32_t prev = dst16[row * dst16_stride + col];
      int32_t avg;
      if (conv_params->use_dist_wtd_comp_avg) {
        avg = prev * conv_params->fwd_offset + res * conv_params->bck_offset;
        avg = avg >> DIST_PRECISION_BITS;
      } else {
        avg = (prev + res) >> 1;
      }
      avg -= round_offset;
      dst[row * dst_stride + col] =
          clip_pixel_highbd(ROUND_POWER_OF_TWO(avg, round_bits), bd);
    }
  }
}
784 
// Vertical-only convolution for compound prediction, high bit-depth samples.
//
// The filtered sum is scaled up by FILTER_BITS - round_0 bits, rounded by
// conv_params->round_1 and biased by round_offset. When
// conv_params->do_average is zero the value is stored in conv_params->dst;
// otherwise it is averaged (plain or fwd_offset / bck_offset weighted) with the
// value already stored there, the bias is removed, and the rounded, clipped
// pixel is written to dst.
void av1_highbd_dist_wtd_convolve_y_c(const uint16_t *src, int src_stride,
                                      uint16_t *dst, int dst_stride, int w,
                                      int h,
                                      const InterpFilterParams *filter_params_y,
                                      const int subpel_y_qn,
                                      ConvolveParams *conv_params, int bd) {
  CONV_BUF_TYPE *const dst16 = conv_params->dst;
  const int dst16_stride = conv_params->dst_stride;
  const int num_taps = filter_params_y->taps;
  const int fo_vert = num_taps / 2 - 1;
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  const int bits = FILTER_BITS - round_0;
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  const int round_offset =
      (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));
  const int round_bits = 2 * FILTER_BITS - round_0 - round_1;
  assert(round_bits >= 0);
  assert(bits >= 0);

  const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);
  for (int row = 0; row < h; ++row) {
    for (int col = 0; col < w; ++col) {
      int32_t res = 0;
      for (int tap = 0; tap < num_taps; ++tap) {
        res += kernel[tap] * src[(row - fo_vert + tap) * src_stride + col];
      }
      res *= (1 << bits);
      res = ROUND_POWER_OF_TWO(res, round_1) + round_offset;

      if (!conv_params->do_average) {
        dst16[row * dst16_stride + col] = res;
        continue;
      }

      const int32_t prev = dst16[row * dst16_stride + col];
      int32_t avg;
      if (conv_params->use_dist_wtd_comp_avg) {
        avg = prev * conv_params->fwd_offset + res * conv_params->bck_offset;
        avg = avg >> DIST_PRECISION_BITS;
      } else {
        avg = (prev + res) >> 1;
      }
      avg -= round_offset;
      dst[row * dst_stride + col] =
          clip_pixel_highbd(ROUND_POWER_OF_TWO(avg, round_bits), bd);
    }
  }
}
832 
// Compound-prediction "copy" path (no filtering), high bit-depth samples.
//
// Each source sample is shifted left by 2 * FILTER_BITS - round_0 - round_1
// bits and biased by round_offset so it matches the scale of the filtered
// compound paths. When conv_params->do_average is zero the value is stored in
// conv_params->dst; otherwise it is averaged (plain or fwd_offset / bck_offset
// weighted) with the stored value, the bias is removed, and the rounded,
// clipped pixel is written to dst.
void av1_highbd_dist_wtd_convolve_2d_copy_c(const uint16_t *src, int src_stride,
                                            uint16_t *dst, int dst_stride,
                                            int w, int h,
                                            ConvolveParams *conv_params,
                                            int bd) {
  CONV_BUF_TYPE *const dst16 = conv_params->dst;
  const int dst16_stride = conv_params->dst_stride;
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  const int bits = FILTER_BITS * 2 - round_1 - round_0;
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  const int round_offset =
      (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));
  assert(bits >= 0);

  for (int row = 0; row < h; ++row) {
    for (int col = 0; col < w; ++col) {
      CONV_BUF_TYPE res = src[row * src_stride + col] << bits;
      res += round_offset;

      if (!conv_params->do_average) {
        dst16[row * dst16_stride + col] = res;
        continue;
      }

      const int32_t prev = dst16[row * dst16_stride + col];
      int32_t avg;
      if (conv_params->use_dist_wtd_comp_avg) {
        avg = prev * conv_params->fwd_offset + res * conv_params->bck_offset;
        avg = avg >> DIST_PRECISION_BITS;
      } else {
        avg = (prev + res) >> 1;
      }
      avg -= round_offset;
      dst[row * dst_stride + col] =
          clip_pixel_highbd(ROUND_POWER_OF_TWO(avg, bits), bd);
    }
  }
}
869 
// Separable 2D convolution with scaling, high bit-depth samples.
//
// The source position advances by x_step_qn / y_step_qn per output sample (in
// units of 1 / (1 << SCALE_SUBPEL_BITS) pixel), starting from subpel_x_qn /
// subpel_y_qn, and the filter kernel is re-selected for every sample from the
// fractional part of that position.
//
// Pass 1 filters horizontally into im_block (stride w). Pass 2 filters the
// intermediate block vertically, column by column. The result is then handled
// according to conv_params:
//   - is_compound && !do_average: stored (still offset) in conv_params->dst.
//   - is_compound && do_average: averaged with the stored value (plain or
//     fwd_offset / bck_offset weighted), then un-offset, rounded, clipped and
//     written to dst.
//   - !is_compound: un-offset, rounded, clipped and written to dst.
void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride,
                                    uint16_t *dst, int dst_stride, int w, int h,
                                    const InterpFilterParams *filter_params_x,
                                    const InterpFilterParams *filter_params_y,
                                    const int subpel_x_qn, const int x_step_qn,
                                    const int subpel_y_qn, const int y_step_qn,
                                    ConvolveParams *conv_params, int bd) {
  int16_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE];
  const int taps_y = filter_params_y->taps;
  const int im_rows =
      (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) + taps_y;
  const int im_stride = w;
  const int taps_x = filter_params_x->taps;
  const int fo_vert = taps_y / 2 - 1;
  const int fo_horiz = taps_x / 2 - 1;
  CONV_BUF_TYPE *const dst16 = conv_params->dst;
  const int dst16_stride = conv_params->dst_stride;
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  const int bits = FILTER_BITS * 2 - round_0 - round_1;
  assert(bits >= 0);

  // Pass 1: horizontal filter, one intermediate row per source row.
  const uint16_t *src_row = src - fo_vert * src_stride;
  for (int row = 0; row < im_rows; ++row) {
    int pos_qn = subpel_x_qn;
    for (int col = 0; col < w; ++col) {
      const uint16_t *const src_x = &src_row[(pos_qn >> SCALE_SUBPEL_BITS)];
      const int kernel_idx = (pos_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
      assert(kernel_idx < SUBPEL_SHIFTS);
      const int16_t *const kernel =
          av1_get_interp_filter_subpel_kernel(filter_params_x, kernel_idx);
      int32_t acc = (1 << (bd + FILTER_BITS - 1));
      for (int tap = 0; tap < taps_x; ++tap) {
        acc += kernel[tap] * src_x[tap - fo_horiz];
      }
      assert(0 <= acc && acc < (1 << (bd + FILTER_BITS + 1)));
      im_block[row * im_stride + col] =
          (int16_t)ROUND_POWER_OF_TWO(acc, round_0);
      pos_qn += x_step_qn;
    }
    src_row += src_stride;
  }

  // Pass 2: vertical filter, walking the intermediate block column by column.
  const int16_t *const im_origin = im_block + fo_vert * im_stride;
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  for (int col = 0; col < w; ++col) {
    const int16_t *const im_col = im_origin + col;
    int pos_qn = subpel_y_qn;
    for (int row = 0; row < h; ++row, pos_qn += y_step_qn) {
      const int16_t *const src_y =
          &im_col[(pos_qn >> SCALE_SUBPEL_BITS) * im_stride];
      const int kernel_idx = (pos_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
      assert(kernel_idx < SUBPEL_SHIFTS);
      const int16_t *const kernel =
          av1_get_interp_filter_subpel_kernel(filter_params_y, kernel_idx);
      int32_t acc = 1 << offset_bits;
      for (int tap = 0; tap < taps_y; ++tap) {
        acc += kernel[tap] * src_y[(tap - fo_vert) * im_stride];
      }
      assert(0 <= acc && acc < (1 << (offset_bits + 2)));
      const CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(acc, round_1);

      if (conv_params->is_compound && !conv_params->do_average) {
        // First prediction of a compound pair: keep the intermediate value.
        dst16[row * dst16_stride + col] = res;
        continue;
      }

      int32_t value;
      if (conv_params->is_compound) {
        const int32_t prev = dst16[row * dst16_stride + col];
        if (conv_params->use_dist_wtd_comp_avg) {
          value = prev * conv_params->fwd_offset + res * conv_params->bck_offset;
          value = value >> DIST_PRECISION_BITS;
        } else {
          value = (prev + res) >> 1;
        }
      } else {
        value = res;
      }

      // Subtract round offset and convolve round.
      value -= (1 << (offset_bits - round_1)) +
               (1 << (offset_bits - round_1 - 1));
      dst[row * dst_stride + col] =
          clip_pixel_highbd(ROUND_POWER_OF_TWO(value, bits), bd);
    }
  }
}
955 
// Picks the compound (dist_wtd) high bit-depth convolve routine according to
// which sub-pixel offsets are non-zero: copy when neither is, x-only or y-only
// when exactly one is, and the full 2D routine when both are.
static void highbd_convolve_2d_facade_compound(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride,
    const int w, const int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
  // Bit 0: horizontal filtering needed; bit 1: vertical filtering needed.
  const int filter_dirs =
      (subpel_x_qn != 0 ? 1 : 0) | (subpel_y_qn != 0 ? 2 : 0);

  switch (filter_dirs) {
    case 0:
      av1_highbd_dist_wtd_convolve_2d_copy(src, src_stride, dst, dst_stride, w,
                                           h, conv_params, bd);
      break;
    case 1:
      av1_highbd_dist_wtd_convolve_x(src, src_stride, dst, dst_stride, w, h,
                                     filter_params_x, subpel_x_qn, conv_params,
                                     bd);
      break;
    case 2:
      av1_highbd_dist_wtd_convolve_y(src, src_stride, dst, dst_stride, w, h,
                                     filter_params_y, subpel_y_qn, conv_params,
                                     bd);
      break;
    default:
      assert(filter_dirs == 3);
      av1_highbd_dist_wtd_convolve_2d(src, src_stride, dst, dst_stride, w, h,
                                      filter_params_x, filter_params_y,
                                      subpel_x_qn, subpel_y_qn, conv_params,
                                      bd);
      break;
  }
}
981 
// Picks the single-reference high bit-depth convolve routine according to
// which sub-pixel offsets are non-zero: plain copy when neither is, x-only or
// y-only when exactly one is, and the full 2D routine when both are.
static void highbd_convolve_2d_facade_single(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride,
    const int w, const int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
  // Bit 0: horizontal filtering needed; bit 1: vertical filtering needed.
  const int filter_dirs =
      (subpel_x_qn != 0 ? 1 : 0) | (subpel_y_qn != 0 ? 2 : 0);

  switch (filter_dirs) {
    case 0:
      aom_highbd_convolve_copy(src, src_stride, dst, dst_stride, w, h);
      break;
    case 1:
      av1_highbd_convolve_x_sr(src, src_stride, dst, dst_stride, w, h,
                               filter_params_x, subpel_x_qn, conv_params, bd);
      break;
    case 2:
      av1_highbd_convolve_y_sr(src, src_stride, dst, dst_stride, w, h,
                               filter_params_y, subpel_y_qn, bd);
      break;
    default:
      assert(filter_dirs == 3);
      av1_highbd_convolve_2d_sr(src, src_stride, dst, dst_stride, w, h,
                                filter_params_x, filter_params_y, subpel_x_qn,
                                subpel_y_qn, conv_params, bd);
      break;
  }
}
1005 
// Entry point for high bit-depth inter-prediction convolution.
//
// src8 / dst8 are converted with CONVERT_TO_SHORTPTR() before use. A filter
// parameter set is taken from interp_filters[] only for a direction that has a
// non-zero sub-pixel offset or when `scaled` is set; otherwise NULL is passed
// on. The call is routed to the scaling routine when `scaled` is non-zero,
// else to the compound or single-reference dispatcher depending on
// conv_params->is_compound.
void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride,
                                   uint8_t *dst8, int dst_stride, int w, int h,
                                   const InterpFilterParams *interp_filters[2],
                                   const int subpel_x_qn, int x_step_q4,
                                   const int subpel_y_qn, int y_step_q4,
                                   int scaled, ConvolveParams *conv_params,
                                   int bd) {
  (void)x_step_q4;
  (void)y_step_q4;
  (void)dst_stride;

  const uint16_t *const src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *const dst = CONVERT_TO_SHORTPTR(dst8);

  const InterpFilterParams *filter_params_x = NULL;
  const InterpFilterParams *filter_params_y = NULL;
  if (scaled || subpel_x_qn != 0) filter_params_x = interp_filters[0];
  if (scaled || subpel_y_qn != 0) filter_params_y = interp_filters[1];

  if (scaled) {
    // Compound prediction needs the intermediate destination buffer.
    assert(!conv_params->is_compound || conv_params->dst != NULL);
    av1_highbd_convolve_2d_scale(src, src_stride, dst, dst_stride, w, h,
                                 filter_params_x, filter_params_y, subpel_x_qn,
                                 x_step_q4, subpel_y_qn, y_step_q4, conv_params,
                                 bd);
    return;
  }

  if (conv_params->is_compound) {
    highbd_convolve_2d_facade_compound(
        src, src_stride, dst, dst_stride, w, h, filter_params_x,
        filter_params_y, subpel_x_qn, subpel_y_qn, conv_params, bd);
    return;
  }

  highbd_convolve_2d_facade_single(src, src_stride, dst, dst_stride, w, h,
                                   filter_params_x, filter_params_y,
                                   subpel_x_qn, subpel_y_qn, conv_params, bd);
}
1044 #endif  // CONFIG_AV1_HIGHBITDEPTH
1045 
1046 // Note: Fixed size intermediate buffers, place limits on parameters
1047 // of some functions. 2d filtering proceeds in 2 steps:
1048 //   (1) Interpolate horizontally into an intermediate buffer, temp.
1049 //   (2) Interpolate temp vertically to derive the sub-pixel result.
1050 // Deriving the maximum number of rows in the temp buffer (263):
1051 // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
1052 // --Largest block size is 128x128 pixels.
1053 // --128 rows in the downscaled frame span a distance of (128 - 1) * 32 in the
1054 //   original frame (in 1/16th pixel units).
1055 // --Must round-up because block may be located at sub-pixel position.
1056 // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
1057 // --((128 - 1) * 32 + 15) >> 4 + 8 = 263.
1058 #define WIENER_MAX_EXT_SIZE 263
1059 
horz_scalar_product(const uint8_t * a,const int16_t * b)1060 static INLINE int horz_scalar_product(const uint8_t *a, const int16_t *b) {
1061   int sum = 0;
1062   for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k];
1063   return sum;
1064 }
1065 
1066 #if CONFIG_AV1_HIGHBITDEPTH
highbd_horz_scalar_product(const uint16_t * a,const int16_t * b)1067 static INLINE int highbd_horz_scalar_product(const uint16_t *a,
1068                                              const int16_t *b) {
1069   int sum = 0;
1070   for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k];
1071   return sum;
1072 }
1073 #endif
1074 
highbd_vert_scalar_product(const uint16_t * a,ptrdiff_t a_stride,const int16_t * b)1075 static INLINE int highbd_vert_scalar_product(const uint16_t *a,
1076                                              ptrdiff_t a_stride,
1077                                              const int16_t *b) {
1078   int sum = 0;
1079   for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k * a_stride] * b[k];
1080   return sum;
1081 }
1082 
get_filter_base(const int16_t * filter)1083 static const InterpKernel *get_filter_base(const int16_t *filter) {
1084   // NOTE: This assumes that the filter table is 256-byte aligned.
1085   // TODO(agrange) Modify to make independent of table alignment.
1086   return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF));
1087 }
1088 
// Returns the index of kernel `f` within the kernel table starting at `base`,
// i.e. the pointer difference measured in whole InterpKernel entries.
static int get_filter_offset(const int16_t *f, const InterpKernel *base) {
  const InterpKernel *const kernel = (const InterpKernel *)(intptr_t)f;
  return (int)(kernel - base);
}
1092 
convolve_add_src_horiz_hip(const uint8_t * src,ptrdiff_t src_stride,uint16_t * dst,ptrdiff_t dst_stride,const InterpKernel * x_filters,int x0_q4,int x_step_q4,int w,int h,int round0_bits)1093 static void convolve_add_src_horiz_hip(const uint8_t *src, ptrdiff_t src_stride,
1094                                        uint16_t *dst, ptrdiff_t dst_stride,
1095                                        const InterpKernel *x_filters, int x0_q4,
1096                                        int x_step_q4, int w, int h,
1097                                        int round0_bits) {
1098   const int bd = 8;
1099   src -= SUBPEL_TAPS / 2 - 1;
1100   for (int y = 0; y < h; ++y) {
1101     int x_q4 = x0_q4;
1102     for (int x = 0; x < w; ++x) {
1103       const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
1104       const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
1105       const int rounding = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) +
1106                            (1 << (bd + FILTER_BITS - 1));
1107       const int sum = horz_scalar_product(src_x, x_filter) + rounding;
1108       dst[x] = (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, round0_bits), 0,
1109                                WIENER_CLAMP_LIMIT(round0_bits, bd) - 1);
1110       x_q4 += x_step_q4;
1111     }
1112     src += src_stride;
1113     dst += dst_stride;
1114   }
1115 }
1116 
convolve_add_src_vert_hip(const uint16_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const InterpKernel * y_filters,int y0_q4,int y_step_q4,int w,int h,int round1_bits)1117 static void convolve_add_src_vert_hip(const uint16_t *src, ptrdiff_t src_stride,
1118                                       uint8_t *dst, ptrdiff_t dst_stride,
1119                                       const InterpKernel *y_filters, int y0_q4,
1120                                       int y_step_q4, int w, int h,
1121                                       int round1_bits) {
1122   const int bd = 8;
1123   src -= src_stride * (SUBPEL_TAPS / 2 - 1);
1124 
1125   for (int x = 0; x < w; ++x) {
1126     int y_q4 = y0_q4;
1127     for (int y = 0; y < h; ++y) {
1128       const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
1129       const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
1130       const int rounding =
1131           ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) -
1132           (1 << (bd + round1_bits - 1));
1133       const int sum =
1134           highbd_vert_scalar_product(src_y, src_stride, y_filter) + rounding;
1135       dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, round1_bits));
1136       y_q4 += y_step_q4;
1137     }
1138     ++src;
1139     ++dst;
1140   }
1141 }
1142 
av1_wiener_convolve_add_src_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h,const ConvolveParams * conv_params)1143 void av1_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride,
1144                                    uint8_t *dst, ptrdiff_t dst_stride,
1145                                    const int16_t *filter_x, int x_step_q4,
1146                                    const int16_t *filter_y, int y_step_q4,
1147                                    int w, int h,
1148                                    const ConvolveParams *conv_params) {
1149   const InterpKernel *const filters_x = get_filter_base(filter_x);
1150   const int x0_q4 = get_filter_offset(filter_x, filters_x);
1151 
1152   const InterpKernel *const filters_y = get_filter_base(filter_y);
1153   const int y0_q4 = get_filter_offset(filter_y, filters_y);
1154 
1155   uint16_t temp[WIENER_MAX_EXT_SIZE * MAX_SB_SIZE];
1156   const int intermediate_height =
1157       (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS - 1;
1158   memset(temp + (intermediate_height * MAX_SB_SIZE), 0, MAX_SB_SIZE);
1159 
1160   assert(w <= MAX_SB_SIZE);
1161   assert(h <= MAX_SB_SIZE);
1162   assert(y_step_q4 <= 32);
1163   assert(x_step_q4 <= 32);
1164 
1165   convolve_add_src_horiz_hip(src - src_stride * (SUBPEL_TAPS / 2 - 1),
1166                              src_stride, temp, MAX_SB_SIZE, filters_x, x0_q4,
1167                              x_step_q4, w, intermediate_height,
1168                              conv_params->round_0);
1169   convolve_add_src_vert_hip(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
1170                             MAX_SB_SIZE, dst, dst_stride, filters_y, y0_q4,
1171                             y_step_q4, w, h, conv_params->round_1);
1172 }
1173 
1174 #if CONFIG_AV1_HIGHBITDEPTH
highbd_convolve_add_src_horiz_hip(const uint8_t * src8,ptrdiff_t src_stride,uint16_t * dst,ptrdiff_t dst_stride,const InterpKernel * x_filters,int x0_q4,int x_step_q4,int w,int h,int round0_bits,int bd)1175 static void highbd_convolve_add_src_horiz_hip(
1176     const uint8_t *src8, ptrdiff_t src_stride, uint16_t *dst,
1177     ptrdiff_t dst_stride, const InterpKernel *x_filters, int x0_q4,
1178     int x_step_q4, int w, int h, int round0_bits, int bd) {
1179   const int extraprec_clamp_limit = WIENER_CLAMP_LIMIT(round0_bits, bd);
1180   uint16_t *src = CONVERT_TO_SHORTPTR(src8);
1181   src -= SUBPEL_TAPS / 2 - 1;
1182   for (int y = 0; y < h; ++y) {
1183     int x_q4 = x0_q4;
1184     for (int x = 0; x < w; ++x) {
1185       const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
1186       const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
1187       const int rounding = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) +
1188                            (1 << (bd + FILTER_BITS - 1));
1189       const int sum = highbd_horz_scalar_product(src_x, x_filter) + rounding;
1190       dst[x] = (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, round0_bits), 0,
1191                                extraprec_clamp_limit - 1);
1192       x_q4 += x_step_q4;
1193     }
1194     src += src_stride;
1195     dst += dst_stride;
1196   }
1197 }
1198 
highbd_convolve_add_src_vert_hip(const uint16_t * src,ptrdiff_t src_stride,uint8_t * dst8,ptrdiff_t dst_stride,const InterpKernel * y_filters,int y0_q4,int y_step_q4,int w,int h,int round1_bits,int bd)1199 static void highbd_convolve_add_src_vert_hip(
1200     const uint16_t *src, ptrdiff_t src_stride, uint8_t *dst8,
1201     ptrdiff_t dst_stride, const InterpKernel *y_filters, int y0_q4,
1202     int y_step_q4, int w, int h, int round1_bits, int bd) {
1203   uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
1204   src -= src_stride * (SUBPEL_TAPS / 2 - 1);
1205   for (int x = 0; x < w; ++x) {
1206     int y_q4 = y0_q4;
1207     for (int y = 0; y < h; ++y) {
1208       const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
1209       const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
1210       const int rounding =
1211           ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) -
1212           (1 << (bd + round1_bits - 1));
1213       const int sum =
1214           highbd_vert_scalar_product(src_y, src_stride, y_filter) + rounding;
1215       dst[y * dst_stride] =
1216           clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, round1_bits), bd);
1217       y_q4 += y_step_q4;
1218     }
1219     ++src;
1220     ++dst;
1221   }
1222 }
1223 
av1_highbd_wiener_convolve_add_src_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h,const ConvolveParams * conv_params,int bd)1224 void av1_highbd_wiener_convolve_add_src_c(
1225     const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
1226     ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
1227     const int16_t *filter_y, int y_step_q4, int w, int h,
1228     const ConvolveParams *conv_params, int bd) {
1229   const InterpKernel *const filters_x = get_filter_base(filter_x);
1230   const int x0_q4 = get_filter_offset(filter_x, filters_x);
1231 
1232   const InterpKernel *const filters_y = get_filter_base(filter_y);
1233   const int y0_q4 = get_filter_offset(filter_y, filters_y);
1234 
1235   uint16_t temp[WIENER_MAX_EXT_SIZE * MAX_SB_SIZE];
1236   const int intermediate_height =
1237       (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
1238 
1239   assert(w <= MAX_SB_SIZE);
1240   assert(h <= MAX_SB_SIZE);
1241   assert(y_step_q4 <= 32);
1242   assert(x_step_q4 <= 32);
1243   assert(bd + FILTER_BITS - conv_params->round_0 + 2 <= 16);
1244 
1245   highbd_convolve_add_src_horiz_hip(src - src_stride * (SUBPEL_TAPS / 2 - 1),
1246                                     src_stride, temp, MAX_SB_SIZE, filters_x,
1247                                     x0_q4, x_step_q4, w, intermediate_height,
1248                                     conv_params->round_0, bd);
1249   highbd_convolve_add_src_vert_hip(
1250       temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE, dst, dst_stride,
1251       filters_y, y0_q4, y_step_q4, w, h, conv_params->round_1, bd);
1252 }
1253 #endif  // CONFIG_AV1_HIGHBITDEPTH
1254