1 /*
2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11
12 #include <assert.h>
13 #include <string.h>
14
15 #include "config/aom_dsp_rtcd.h"
16 #include "config/av1_rtcd.h"
17
18 #include "av1/common/av1_common_int.h"
19 #include "av1/common/blockd.h"
20 #include "av1/common/convolve.h"
21 #include "av1/common/filter.h"
22 #include "av1/common/resize.h"
23 #include "aom_dsp/aom_dsp_common.h"
24 #include "aom_ports/mem.h"
25
av1_convolve_horiz_rs_c(const uint8_t * src,int src_stride,uint8_t * dst,int dst_stride,int w,int h,const int16_t * x_filters,int x0_qn,int x_step_qn)26 void av1_convolve_horiz_rs_c(const uint8_t *src, int src_stride, uint8_t *dst,
27 int dst_stride, int w, int h,
28 const int16_t *x_filters, int x0_qn,
29 int x_step_qn) {
30 src -= UPSCALE_NORMATIVE_TAPS / 2 - 1;
31 for (int y = 0; y < h; ++y) {
32 int x_qn = x0_qn;
33 for (int x = 0; x < w; ++x) {
34 const uint8_t *const src_x = &src[x_qn >> RS_SCALE_SUBPEL_BITS];
35 const int x_filter_idx =
36 (x_qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
37 assert(x_filter_idx <= RS_SUBPEL_MASK);
38 const int16_t *const x_filter =
39 &x_filters[x_filter_idx * UPSCALE_NORMATIVE_TAPS];
40 int sum = 0;
41 for (int k = 0; k < UPSCALE_NORMATIVE_TAPS; ++k)
42 sum += src_x[k] * x_filter[k];
43 dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
44 x_qn += x_step_qn;
45 }
46 src += src_stride;
47 dst += dst_stride;
48 }
49 }
50
av1_highbd_convolve_horiz_rs_c(const uint16_t * src,int src_stride,uint16_t * dst,int dst_stride,int w,int h,const int16_t * x_filters,int x0_qn,int x_step_qn,int bd)51 void av1_highbd_convolve_horiz_rs_c(const uint16_t *src, int src_stride,
52 uint16_t *dst, int dst_stride, int w, int h,
53 const int16_t *x_filters, int x0_qn,
54 int x_step_qn, int bd) {
55 src -= UPSCALE_NORMATIVE_TAPS / 2 - 1;
56 for (int y = 0; y < h; ++y) {
57 int x_qn = x0_qn;
58 for (int x = 0; x < w; ++x) {
59 const uint16_t *const src_x = &src[x_qn >> RS_SCALE_SUBPEL_BITS];
60 const int x_filter_idx =
61 (x_qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
62 assert(x_filter_idx <= RS_SUBPEL_MASK);
63 const int16_t *const x_filter =
64 &x_filters[x_filter_idx * UPSCALE_NORMATIVE_TAPS];
65 int sum = 0;
66 for (int k = 0; k < UPSCALE_NORMATIVE_TAPS; ++k)
67 sum += src_x[k] * x_filter[k];
68 dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
69 x_qn += x_step_qn;
70 }
71 src += src_stride;
72 dst += dst_stride;
73 }
74 }
75
// Separable 2-D sub-pixel convolution for 8-bit pixels, writing clipped
// pixels straight to dst (C reference version).
//
// Pass 1 filters h + taps_y - 1 source rows horizontally, starting
// taps_y / 2 - 1 rows above src, and stores the results (rounded by
// conv_params->round_0) in a stack buffer with stride w. Pass 2 filters that
// buffer vertically, rounds by conv_params->round_1, removes the biases added
// in both passes, applies the remaining
// 2 * FILTER_BITS - round_0 - round_1 bits of rounding and clips.
//
// The kernels are selected by subpel_x_qn / subpel_y_qn masked with
// SUBPEL_MASK. w and h must not exceed MAX_SB_SIZE (asserted).
void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
                          int dst_stride, int w, int h,
                          const InterpFilterParams *filter_params_x,
                          const InterpFilterParams *filter_params_y,
                          const int subpel_x_qn, const int subpel_y_qn,
                          ConvolveParams *conv_params) {
  assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);

  const int bd = 8;
  const int taps_x = filter_params_x->taps;
  const int taps_y = filter_params_y->taps;
  const int fo_horiz = taps_x / 2 - 1;
  const int fo_vert = taps_y / 2 - 1;
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  const int bits = FILTER_BITS * 2 - round_0 - round_1;

  int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
  const int im_stride = w;
  const int im_rows = h + taps_y - 1;

  // Pass 1: horizontal filter into the intermediate buffer.
  const int16_t *const kernel_x = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);
  for (int r = 0; r < im_rows; ++r) {
    for (int c = 0; c < w; ++c) {
      // Index of the left-most sample covered by the kernel.
      const int first = (r - fo_vert) * src_stride + c - fo_horiz;
      // The bias keeps the sum inside the range asserted below.
      int32_t acc = 1 << (bd + FILTER_BITS - 1);
      for (int t = 0; t < taps_x; ++t) {
        acc += kernel_x[t] * src[first + t];
      }
      assert(0 <= acc && acc < (1 << (bd + FILTER_BITS + 1)));
      im_block[r * im_stride + c] = (int16_t)ROUND_POWER_OF_TWO(acc, round_0);
    }
  }

  // Pass 2: vertical filter from the intermediate buffer to dst.
  const int16_t *const kernel_y = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  // Combined bias contributed by both passes after rounding by round_1.
  const int round_offset =
      (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));
  for (int r = 0; r < h; ++r) {
    for (int c = 0; c < w; ++c) {
      // Output row r is centred on intermediate row r + fo_vert, so its
      // kernel window spans intermediate rows r .. r + taps_y - 1.
      int32_t acc = 1 << offset_bits;
      for (int t = 0; t < taps_y; ++t) {
        acc += kernel_y[t] * im_block[(r + t) * im_stride + c];
      }
      assert(0 <= acc && acc < (1 << (offset_bits + 2)));
      const int16_t res = ROUND_POWER_OF_TWO(acc, round_1) - round_offset;
      dst[r * dst_stride + c] = clip_pixel(ROUND_POWER_OF_TWO(res, bits));
    }
  }
}
127
av1_convolve_y_sr_c(const uint8_t * src,int src_stride,uint8_t * dst,int dst_stride,int w,int h,const InterpFilterParams * filter_params_y,const int subpel_y_qn)128 void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
129 int dst_stride, int w, int h,
130 const InterpFilterParams *filter_params_y,
131 const int subpel_y_qn) {
132 const int fo_vert = filter_params_y->taps / 2 - 1;
133
134 // vertical filter
135 const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
136 filter_params_y, subpel_y_qn & SUBPEL_MASK);
137 for (int y = 0; y < h; ++y) {
138 for (int x = 0; x < w; ++x) {
139 int32_t res = 0;
140 for (int k = 0; k < filter_params_y->taps; ++k) {
141 res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x];
142 }
143 dst[y * dst_stride + x] =
144 clip_pixel(ROUND_POWER_OF_TWO(res, FILTER_BITS));
145 }
146 }
147 }
148
// Horizontal-only sub-pixel convolution for 8-bit pixels (C reference
// version).
//
// Each output pixel is the weighted sum of filter_params_x->taps horizontally
// adjacent source pixels, the first of which lies taps / 2 - 1 columns to the
// left of the output position. The kernel is selected by
// subpel_x_qn & SUBPEL_MASK. Rounding is done in two steps, first by
// conv_params->round_0 and then by the remaining FILTER_BITS - round_0 bits,
// followed by clip_pixel().
void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
                         int dst_stride, int w, int h,
                         const InterpFilterParams *filter_params_x,
                         const int subpel_x_qn, ConvolveParams *conv_params) {
  const int taps = filter_params_x->taps;
  const int fo_horiz = taps / 2 - 1;
  const int round_0 = conv_params->round_0;
  const int bits = FILTER_BITS - round_0;

  assert(bits >= 0);
  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
         ((round_0 + conv_params->round_1) == 2 * FILTER_BITS));

  const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);

  for (int r = 0; r < h; ++r) {
    for (int c = 0; c < w; ++c) {
      // Index of the left-most sample covered by the kernel.
      const int first = r * src_stride + c - fo_horiz;
      int32_t acc = 0;
      for (int t = 0; t < taps; ++t) {
        acc += kernel[t] * src[first + t];
      }
      const int32_t stage0 = ROUND_POWER_OF_TWO(acc, round_0);
      dst[r * dst_stride + c] = clip_pixel(ROUND_POWER_OF_TWO(stage0, bits));
    }
  }
}
175
// Separable 2-D sub-pixel convolution for 8-bit pixels with compound output
// (C reference version).
//
// The two filter passes match the direct-to-dst 2-D routine: a horizontal
// pass into a stack buffer (rounded by conv_params->round_0) followed by a
// vertical pass (rounded by conv_params->round_1). What happens to each
// vertical result depends on conv_params->do_average:
//   - zero:     the still-biased value is stored in conv_params->dst.
//   - non-zero: the value is combined with the one already in
//               conv_params->dst, either with fwd_offset / bck_offset weights
//               and a DIST_PRECISION_BITS shift (use_dist_wtd_comp_avg) or as
//               a plain average; the bias is removed, the remaining
//               2 * FILTER_BITS - round_0 - round_1 bits are rounded away and
//               the clipped pixel is written to dst.
void av1_dist_wtd_convolve_2d_c(const uint8_t *src, int src_stride,
                                uint8_t *dst, int dst_stride, int w, int h,
                                const InterpFilterParams *filter_params_x,
                                const InterpFilterParams *filter_params_y,
                                const int subpel_x_qn, const int subpel_y_qn,
                                ConvolveParams *conv_params) {
  const int bd = 8;
  const int taps_x = filter_params_x->taps;
  const int taps_y = filter_params_y->taps;
  const int fo_horiz = taps_x / 2 - 1;
  const int fo_vert = taps_y / 2 - 1;
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  const int round_bits = 2 * FILTER_BITS - round_0 - round_1;
  CONV_BUF_TYPE *const dst16 = conv_params->dst;
  const int dst16_stride = conv_params->dst_stride;

  int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
  const int im_stride = w;
  const int im_rows = h + taps_y - 1;

  // Pass 1: horizontal filter into the intermediate buffer.
  const int16_t *const kernel_x = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);
  for (int r = 0; r < im_rows; ++r) {
    for (int c = 0; c < w; ++c) {
      // Index of the left-most sample covered by the kernel.
      const int first = (r - fo_vert) * src_stride + c - fo_horiz;
      int32_t acc = 1 << (bd + FILTER_BITS - 1);
      for (int t = 0; t < taps_x; ++t) {
        acc += kernel_x[t] * src[first + t];
      }
      assert(0 <= acc && acc < (1 << (bd + FILTER_BITS + 1)));
      im_block[r * im_stride + c] = (int16_t)ROUND_POWER_OF_TWO(acc, round_0);
    }
  }

  // Pass 2: vertical filter, then store or average.
  const int16_t *const kernel_y = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  // Bias still present in a value that was rounded by round_1.
  const int round_offset =
      (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));
  for (int r = 0; r < h; ++r) {
    for (int c = 0; c < w; ++c) {
      int32_t acc = 1 << offset_bits;
      for (int t = 0; t < taps_y; ++t) {
        acc += kernel_y[t] * im_block[(r + t) * im_stride + c];
      }
      assert(0 <= acc && acc < (1 << (offset_bits + 2)));
      const CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(acc, round_1);

      if (!conv_params->do_average) {
        dst16[r * dst16_stride + c] = res;
        continue;
      }

      int32_t avg = dst16[r * dst16_stride + c];
      if (conv_params->use_dist_wtd_comp_avg) {
        avg = (avg * conv_params->fwd_offset + res * conv_params->bck_offset) >>
              DIST_PRECISION_BITS;
      } else {
        avg = (avg + res) >> 1;
      }
      avg -= round_offset;
      dst[r * dst_stride + c] =
          clip_pixel(ROUND_POWER_OF_TWO(avg, round_bits));
    }
  }
}
241
// Vertical-only sub-pixel convolution for 8-bit pixels with compound output
// (C reference version).
//
// The filter sum is scaled up by FILTER_BITS - round_0 bits, rounded by
// conv_params->round_1 and biased by round_offset. If conv_params->do_average
// is zero the biased value is stored in conv_params->dst. Otherwise it is
// combined with the value already stored there (weighted by fwd_offset /
// bck_offset when use_dist_wtd_comp_avg is set, plain average otherwise), the
// bias is removed, and the result is rounded by
// 2 * FILTER_BITS - round_0 - round_1 bits, clipped and written to dst.
void av1_dist_wtd_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst,
                               int dst_stride, int w, int h,
                               const InterpFilterParams *filter_params_y,
                               const int subpel_y_qn,
                               ConvolveParams *conv_params) {
  const int bd = 8;
  const int taps = filter_params_y->taps;
  const int fo_vert = taps / 2 - 1;
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  const int bits = FILTER_BITS - round_0;
  const int round_bits = 2 * FILTER_BITS - round_0 - round_1;
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  const int round_offset =
      (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));
  CONV_BUF_TYPE *const dst16 = conv_params->dst;
  const int dst16_stride = conv_params->dst_stride;

  const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);

  for (int r = 0; r < h; ++r) {
    for (int c = 0; c < w; ++c) {
      // Index of the top-most sample covered by the kernel.
      const int first = (r - fo_vert) * src_stride + c;
      int32_t res = 0;
      for (int t = 0; t < taps; ++t) {
        res += kernel[t] * src[first + t * src_stride];
      }
      res *= (1 << bits);
      res = ROUND_POWER_OF_TWO(res, round_1) + round_offset;

      if (!conv_params->do_average) {
        dst16[r * dst16_stride + c] = res;
        continue;
      }

      int32_t avg = dst16[r * dst16_stride + c];
      if (conv_params->use_dist_wtd_comp_avg) {
        avg = (avg * conv_params->fwd_offset + res * conv_params->bck_offset) >>
              DIST_PRECISION_BITS;
      } else {
        avg = (avg + res) >> 1;
      }
      avg -= round_offset;
      dst[r * dst_stride + c] =
          clip_pixel(ROUND_POWER_OF_TWO(avg, round_bits));
    }
  }
}
288
// Horizontal-only sub-pixel convolution for 8-bit pixels with compound output
// (C reference version).
//
// The filter sum is rounded by conv_params->round_0, scaled up by
// FILTER_BITS - round_1 bits and biased by round_offset. If
// conv_params->do_average is zero the biased value is stored in
// conv_params->dst. Otherwise it is combined with the value already stored
// there (weighted by fwd_offset / bck_offset when use_dist_wtd_comp_avg is
// set, plain average otherwise), the bias is removed, and the result is
// rounded by 2 * FILTER_BITS - round_0 - round_1 bits, clipped and written to
// dst.
void av1_dist_wtd_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst,
                               int dst_stride, int w, int h,
                               const InterpFilterParams *filter_params_x,
                               const int subpel_x_qn,
                               ConvolveParams *conv_params) {
  const int bd = 8;
  const int taps = filter_params_x->taps;
  const int fo_horiz = taps / 2 - 1;
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  const int bits = FILTER_BITS - round_1;
  const int round_bits = 2 * FILTER_BITS - round_0 - round_1;
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  const int round_offset =
      (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));
  CONV_BUF_TYPE *const dst16 = conv_params->dst;
  const int dst16_stride = conv_params->dst_stride;

  const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);

  for (int r = 0; r < h; ++r) {
    for (int c = 0; c < w; ++c) {
      // Index of the left-most sample covered by the kernel.
      const int first = r * src_stride + c - fo_horiz;
      int32_t res = 0;
      for (int t = 0; t < taps; ++t) {
        res += kernel[t] * src[first + t];
      }
      res = (1 << bits) * ROUND_POWER_OF_TWO(res, round_0);
      res += round_offset;

      if (!conv_params->do_average) {
        dst16[r * dst16_stride + c] = res;
        continue;
      }

      int32_t avg = dst16[r * dst16_stride + c];
      if (conv_params->use_dist_wtd_comp_avg) {
        avg = (avg * conv_params->fwd_offset + res * conv_params->bck_offset) >>
              DIST_PRECISION_BITS;
      } else {
        avg = (avg + res) >> 1;
      }
      avg -= round_offset;
      dst[r * dst_stride + c] =
          clip_pixel(ROUND_POWER_OF_TWO(avg, round_bits));
    }
  }
}
335
// Compound "copy" path for 8-bit pixels: no filtering, only scaling to the
// compound buffer precision (C reference version).
//
// Every source pixel is shifted left by 2 * FILTER_BITS - round_0 - round_1
// bits and biased by round_offset. If conv_params->do_average is zero the
// value is stored in conv_params->dst. Otherwise it is combined with the
// value already stored there (weighted by fwd_offset / bck_offset when
// use_dist_wtd_comp_avg is set, plain average otherwise), the bias is
// removed, and the result is rounded by the same number of bits, clipped and
// written to dst.
void av1_dist_wtd_convolve_2d_copy_c(const uint8_t *src, int src_stride,
                                     uint8_t *dst, int dst_stride, int w, int h,
                                     ConvolveParams *conv_params) {
  const int bd = 8;
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  const int bits = FILTER_BITS * 2 - round_1 - round_0;
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  const int round_offset =
      (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));
  CONV_BUF_TYPE *const dst16 = conv_params->dst;
  const int dst16_stride = conv_params->dst_stride;

  for (int r = 0; r < h; ++r) {
    for (int c = 0; c < w; ++c) {
      // Both steps are narrowed to CONV_BUF_TYPE, as in a stepwise update of
      // a CONV_BUF_TYPE variable.
      CONV_BUF_TYPE res = (CONV_BUF_TYPE)(src[r * src_stride + c] << bits);
      res = (CONV_BUF_TYPE)(res + round_offset);

      if (!conv_params->do_average) {
        dst16[r * dst16_stride + c] = res;
        continue;
      }

      int32_t avg = dst16[r * dst16_stride + c];
      if (conv_params->use_dist_wtd_comp_avg) {
        avg = (avg * conv_params->fwd_offset + res * conv_params->bck_offset) >>
              DIST_PRECISION_BITS;
      } else {
        avg = (avg + res) >> 1;
      }
      avg -= round_offset;
      dst[r * dst_stride + c] = clip_pixel(ROUND_POWER_OF_TWO(avg, bits));
    }
  }
}
370
// Separable 2-D convolution with per-pixel fixed-point stepping, for 8-bit
// pixels (C reference version).
//
// Unlike the unscaled routines, the source position advances by x_step_qn /
// y_step_qn (SCALE_SUBPEL_BITS fractional bits) per output pixel, so a new
// kernel is looked up for every output sample.
//
// Pass 1 (horizontal) fills a stack buffer with im_h rows, where im_h is the
// integer source row reached by the last output row plus taps_y; the first of
// those rows lies taps_y / 2 - 1 rows above src. Pass 2 (vertical) walks each
// column of that buffer, visiting columns in the outer loop and rows in the
// inner loop.
//
// Output handling after rounding by conv_params->round_1:
//   - is_compound == 0: remove the bias, round by
//     2 * FILTER_BITS - round_0 - round_1 bits, clip, write to dst.
//   - is_compound != 0, do_average == 0: store the biased value in
//     conv_params->dst.
//   - is_compound != 0, do_average != 0: combine with the value in
//     conv_params->dst (weighted or plain average), then finish as in the
//     non-compound case.
void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst,
                             int dst_stride, int w, int h,
                             const InterpFilterParams *filter_params_x,
                             const InterpFilterParams *filter_params_y,
                             const int subpel_x_qn, const int x_step_qn,
                             const int subpel_y_qn, const int y_step_qn,
                             ConvolveParams *conv_params) {
  const int bd = 8;
  const int taps_x = filter_params_x->taps;
  const int taps_y = filter_params_y->taps;
  const int fo_horiz = taps_x / 2 - 1;
  const int fo_vert = taps_y / 2 - 1;
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  const int bits = FILTER_BITS * 2 - round_0 - round_1;
  assert(bits >= 0);
  CONV_BUF_TYPE *const dst16 = conv_params->dst;
  const int dst16_stride = conv_params->dst_stride;

  int16_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE];
  const int im_stride = w;
  const int im_h =
      (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) + taps_y;

  // Pass 1: horizontal filter into the intermediate buffer.
  for (int r = 0; r < im_h; ++r) {
    int pos_qn = subpel_x_qn;
    for (int c = 0; c < w; ++c, pos_qn += x_step_qn) {
      const int kernel_idx = (pos_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
      assert(kernel_idx < SUBPEL_SHIFTS);
      const int16_t *const kernel =
          av1_get_interp_filter_subpel_kernel(filter_params_x, kernel_idx);
      // Index of the left-most sample covered by the kernel.
      const int first = (r - fo_vert) * src_stride +
                        (pos_qn >> SCALE_SUBPEL_BITS) - fo_horiz;
      int32_t acc = 1 << (bd + FILTER_BITS - 1);
      for (int t = 0; t < taps_x; ++t) {
        acc += kernel[t] * src[first + t];
      }
      assert(0 <= acc && acc < (1 << (bd + FILTER_BITS + 1)));
      im_block[r * im_stride + c] = (int16_t)ROUND_POWER_OF_TWO(acc, round_0);
    }
  }

  // Pass 2: vertical filter, one output column at a time.
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  // Bias still present in a value that was rounded by round_1.
  const int round_offset =
      (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));
  for (int c = 0; c < w; ++c) {
    int pos_qn = subpel_y_qn;
    for (int r = 0; r < h; ++r, pos_qn += y_step_qn) {
      const int kernel_idx = (pos_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
      assert(kernel_idx < SUBPEL_SHIFTS);
      const int16_t *const kernel =
          av1_get_interp_filter_subpel_kernel(filter_params_y, kernel_idx);
      // Index of the top-most intermediate sample covered by the kernel.
      const int first = (pos_qn >> SCALE_SUBPEL_BITS) * im_stride + c;
      int32_t acc = 1 << offset_bits;
      for (int t = 0; t < taps_y; ++t) {
        acc += kernel[t] * im_block[first + t * im_stride];
      }
      assert(0 <= acc && acc < (1 << (offset_bits + 2)));
      const CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(acc, round_1);

      if (conv_params->is_compound && !conv_params->do_average) {
        dst16[r * dst16_stride + c] = res;
        continue;
      }

      int32_t px;
      if (conv_params->is_compound) {
        px = dst16[r * dst16_stride + c];
        if (conv_params->use_dist_wtd_comp_avg) {
          px = (px * conv_params->fwd_offset + res * conv_params->bck_offset) >>
               DIST_PRECISION_BITS;
        } else {
          px = (px + res) >> 1;
        }
      } else {
        px = res;
      }
      /* Subtract round offset and convolve round */
      px -= round_offset;
      dst[r * dst_stride + c] = clip_pixel(ROUND_POWER_OF_TWO(px, bits));
    }
  }
}
456
// Forwards all arguments unchanged to av1_convolve_2d_scale(), after checking
// in debug builds that a compound request comes with a non-NULL
// conv_params->dst buffer.
static void convolve_2d_scale_wrapper(
    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int x_step_qn, const int subpel_y_qn, const int y_step_qn,
    ConvolveParams *conv_params) {
  assert(!conv_params->is_compound || conv_params->dst != NULL);
  av1_convolve_2d_scale(src, src_stride, dst, dst_stride, w, h,
                        filter_params_x, filter_params_y, subpel_x_qn,
                        x_step_qn, subpel_y_qn, y_step_qn, conv_params);
}
470
// Picks the compound convolution routine that matches the sub-pixel offsets:
// a plain copy when both offsets are zero, a one-directional filter when only
// one of them is non-zero, and the full 2-D filter when both are non-zero.
static void convolve_2d_facade_compound(
    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int subpel_y_qn, ConvolveParams *conv_params) {
  // Bit 0: horizontal filtering needed. Bit 1: vertical filtering needed.
  const int directions =
      (subpel_x_qn != 0 ? 1 : 0) | (subpel_y_qn != 0 ? 2 : 0);

  switch (directions) {
    case 0:
      av1_dist_wtd_convolve_2d_copy(src, src_stride, dst, dst_stride, w, h,
                                    conv_params);
      break;
    case 1:
      av1_dist_wtd_convolve_x(src, src_stride, dst, dst_stride, w, h,
                              filter_params_x, subpel_x_qn, conv_params);
      break;
    case 2:
      av1_dist_wtd_convolve_y(src, src_stride, dst, dst_stride, w, h,
                              filter_params_y, subpel_y_qn, conv_params);
      break;
    default:  // Both directions.
      av1_dist_wtd_convolve_2d(src, src_stride, dst, dst_stride, w, h,
                               filter_params_x, filter_params_y, subpel_x_qn,
                               subpel_y_qn, conv_params);
      break;
  }
}
494
// Picks the direct-to-dst convolution routine that matches the sub-pixel
// offsets: aom_convolve_copy() when both offsets are zero, a one-directional
// filter when only one of them is non-zero, and the full 2-D filter when both
// are non-zero.
static void convolve_2d_facade_single(
    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int subpel_y_qn, ConvolveParams *conv_params) {
  // Bit 0: horizontal filtering needed. Bit 1: vertical filtering needed.
  const int directions =
      (subpel_x_qn != 0 ? 1 : 0) | (subpel_y_qn != 0 ? 2 : 0);

  switch (directions) {
    case 0:
      aom_convolve_copy(src, src_stride, dst, dst_stride, w, h);
      break;
    case 1:
      av1_convolve_x_sr(src, src_stride, dst, dst_stride, w, h,
                        filter_params_x, subpel_x_qn, conv_params);
      break;
    case 2:
      av1_convolve_y_sr(src, src_stride, dst, dst_stride, w, h,
                        filter_params_y, subpel_y_qn);
      break;
    default:  // Both directions.
      av1_convolve_2d_sr(src, src_stride, dst, dst_stride, w, h,
                         filter_params_x, filter_params_y, subpel_x_qn,
                         subpel_y_qn, conv_params);
      break;
  }
}
516
// Entry point that dispatches an 8-bit convolution request to the matching
// implementation.
//
//   src, src_stride:  source block
//   dst, dst_stride:  destination block (passed to every implementation)
//   w, h:             block dimensions
//   interp_filters:   [0] is the horizontal filter, [1] the vertical filter
//   subpel_x_qn/y_qn: sub-pixel offsets
//   x_step_q4/y_step_q4: position steps, forwarded only on the scaled path
//   scaled:           non-zero selects the scaling convolution
//   conv_params:      rounding / compound configuration
//
// Dispatch order:
//   1. If either filter has 2 taps (both must, and scaling must be off) and
//      at least one sub-pixel offset is non-zero, the C reference routines are
//      called directly and the function returns.
//   2. Otherwise: scaled -> convolve_2d_scale_wrapper(),
//      conv_params->is_compound -> convolve_2d_facade_compound(),
//      else -> convolve_2d_facade_single().
//   A 2-tap request with both offsets zero falls through to step 2.
void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst,
                            int dst_stride, int w, int h,
                            const InterpFilterParams *interp_filters[2],
                            const int subpel_x_qn, int x_step_q4,
                            const int subpel_y_qn, int y_step_q4, int scaled,
                            ConvolveParams *conv_params) {
  const InterpFilterParams *filter_params_x = interp_filters[0];
  const InterpFilterParams *filter_params_y = interp_filters[1];

  // TODO(jingning, yunqing): Add SIMD support to 2-tap filter case.
  // Do we have SIMD support to 4-tap case?
  // 2-tap filter indicates that it is for IntraBC.
  if (filter_params_x->taps == 2 || filter_params_y->taps == 2) {
    assert(filter_params_x->taps == 2 && filter_params_y->taps == 2);
    assert(!scaled);
    if (subpel_x_qn && subpel_y_qn) {
      av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
                           filter_params_x, filter_params_y, subpel_x_qn,
                           subpel_y_qn, conv_params);
      return;
    } else if (subpel_x_qn) {
      av1_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h,
                          filter_params_x, subpel_x_qn, conv_params);
      return;
    } else if (subpel_y_qn) {
      av1_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h,
                          filter_params_y, subpel_y_qn);
      return;
    }
    // Both offsets are zero: fall through to the general dispatch below.
  }

  if (scaled) {
    convolve_2d_scale_wrapper(src, src_stride, dst, dst_stride, w, h,
                              filter_params_x, filter_params_y, subpel_x_qn,
                              x_step_q4, subpel_y_qn, y_step_q4, conv_params);
  } else if (conv_params->is_compound) {
    convolve_2d_facade_compound(src, src_stride, dst, dst_stride, w, h,
                                filter_params_x, filter_params_y, subpel_x_qn,
                                subpel_y_qn, conv_params);
  } else {
    convolve_2d_facade_single(src, src_stride, dst, dst_stride, w, h,
                              filter_params_x, filter_params_y, subpel_x_qn,
                              subpel_y_qn, conv_params);
  }
}
567
568 #if CONFIG_AV1_HIGHBITDEPTH
// Horizontal-only sub-pixel convolution for 16-bit sample buffers (C
// reference version).
//
// Each output sample is the weighted sum of filter_params_x->taps
// horizontally adjacent source samples, the first of which lies taps / 2 - 1
// columns to the left of the output position. The kernel is selected by
// subpel_x_qn & SUBPEL_MASK. Rounding is done in two steps, first by
// conv_params->round_0 and then by the remaining FILTER_BITS - round_0 bits,
// followed by clip_pixel_highbd() for bit depth bd.
void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride,
                                uint16_t *dst, int dst_stride, int w, int h,
                                const InterpFilterParams *filter_params_x,
                                const int subpel_x_qn,
                                ConvolveParams *conv_params, int bd) {
  const int taps = filter_params_x->taps;
  const int fo_horiz = taps / 2 - 1;
  const int round_0 = conv_params->round_0;
  const int bits = FILTER_BITS - round_0;

  assert(bits >= 0);
  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
         ((round_0 + conv_params->round_1) == 2 * FILTER_BITS));

  const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);

  for (int r = 0; r < h; ++r) {
    for (int c = 0; c < w; ++c) {
      // Index of the left-most sample covered by the kernel.
      const int first = r * src_stride + c - fo_horiz;
      int32_t acc = 0;
      for (int t = 0; t < taps; ++t) {
        acc += kernel[t] * src[first + t];
      }
      const int32_t stage0 = ROUND_POWER_OF_TWO(acc, round_0);
      dst[r * dst_stride + c] =
          clip_pixel_highbd(ROUND_POWER_OF_TWO(stage0, bits), bd);
    }
  }
}
596
av1_highbd_convolve_y_sr_c(const uint16_t * src,int src_stride,uint16_t * dst,int dst_stride,int w,int h,const InterpFilterParams * filter_params_y,const int subpel_y_qn,int bd)597 void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride,
598 uint16_t *dst, int dst_stride, int w, int h,
599 const InterpFilterParams *filter_params_y,
600 const int subpel_y_qn, int bd) {
601 const int fo_vert = filter_params_y->taps / 2 - 1;
602 // vertical filter
603 const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
604 filter_params_y, subpel_y_qn & SUBPEL_MASK);
605 for (int y = 0; y < h; ++y) {
606 for (int x = 0; x < w; ++x) {
607 int32_t res = 0;
608 for (int k = 0; k < filter_params_y->taps; ++k) {
609 res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x];
610 }
611 dst[y * dst_stride + x] =
612 clip_pixel_highbd(ROUND_POWER_OF_TWO(res, FILTER_BITS), bd);
613 }
614 }
615 }
616
// Separable 2-D sub-pixel convolution for 16-bit sample buffers, writing
// clipped samples straight to dst (C reference version).
//
// Pass 1 filters h + taps_y - 1 source rows horizontally, starting
// taps_y / 2 - 1 rows above src, and stores the results (rounded by
// conv_params->round_0) in a stack buffer with stride w. Pass 2 filters that
// buffer vertically, rounds by conv_params->round_1, removes the biases added
// in both passes, applies the remaining
// 2 * FILTER_BITS - round_0 - round_1 bits of rounding and clips to bit depth
// bd.
//
// w and h must not exceed MAX_SB_SIZE (asserted).
void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride,
                                 uint16_t *dst, int dst_stride, int w, int h,
                                 const InterpFilterParams *filter_params_x,
                                 const InterpFilterParams *filter_params_y,
                                 const int subpel_x_qn, const int subpel_y_qn,
                                 ConvolveParams *conv_params, int bd) {
  assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);

  const int taps_x = filter_params_x->taps;
  const int taps_y = filter_params_y->taps;
  const int fo_horiz = taps_x / 2 - 1;
  const int fo_vert = taps_y / 2 - 1;
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  const int bits = FILTER_BITS * 2 - round_0 - round_1;
  assert(bits >= 0);

  int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
  const int im_stride = w;
  const int im_rows = h + taps_y - 1;

  // Pass 1: horizontal filter into the intermediate buffer.
  const int16_t *const kernel_x = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);
  for (int r = 0; r < im_rows; ++r) {
    for (int c = 0; c < w; ++c) {
      // Index of the left-most sample covered by the kernel.
      const int first = (r - fo_vert) * src_stride + c - fo_horiz;
      int32_t acc = 1 << (bd + FILTER_BITS - 1);
      for (int t = 0; t < taps_x; ++t) {
        acc += kernel_x[t] * src[first + t];
      }
      assert(0 <= acc && acc < (1 << (bd + FILTER_BITS + 1)));
      im_block[r * im_stride + c] = (int16_t)ROUND_POWER_OF_TWO(acc, round_0);
    }
  }

  // Pass 2: vertical filter from the intermediate buffer to dst.
  const int16_t *const kernel_y = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  // Combined bias contributed by both passes after rounding by round_1.
  const int round_offset =
      (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));
  for (int r = 0; r < h; ++r) {
    for (int c = 0; c < w; ++c) {
      int32_t acc = 1 << offset_bits;
      for (int t = 0; t < taps_y; ++t) {
        acc += kernel_y[t] * im_block[(r + t) * im_stride + c];
      }
      assert(0 <= acc && acc < (1 << (offset_bits + 2)));
      const int32_t res = ROUND_POWER_OF_TWO(acc, round_1) - round_offset;
      dst[r * dst_stride + c] =
          clip_pixel_highbd(ROUND_POWER_OF_TWO(res, bits), bd);
    }
  }
}
669
// Separable 2-D sub-pixel convolution for 16-bit sample buffers with compound
// output (C reference version).
//
// Pass 1 filters h + taps_y - 1 source rows horizontally, starting
// taps_y / 2 - 1 rows above src, into a stack buffer (rounded by
// conv_params->round_0). Pass 2 filters that buffer vertically and rounds by
// conv_params->round_1. What happens next depends on conv_params->do_average:
//   - zero:     the still-biased value is stored in conv_params->dst.
//   - non-zero: the value is combined with the one already in
//               conv_params->dst, either with fwd_offset / bck_offset weights
//               and a DIST_PRECISION_BITS shift (use_dist_wtd_comp_avg) or as
//               a plain average; the bias is removed, the remaining
//               2 * FILTER_BITS - round_0 - round_1 bits are rounded away and
//               the sample, clipped to bit depth bd, is written to dst.
void av1_highbd_dist_wtd_convolve_2d_c(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
  int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
  CONV_BUF_TYPE *dst16 = conv_params->dst;
  int dst16_stride = conv_params->dst_stride;
  int im_h = h + filter_params_y->taps - 1;
  int im_stride = w;
  const int fo_vert = filter_params_y->taps / 2 - 1;
  const int fo_horiz = filter_params_x->taps / 2 - 1;
  const int round_bits =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  assert(round_bits >= 0);

  // horizontal filter
  const uint16_t *src_horiz = src - fo_vert * src_stride;
  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);
  for (int y = 0; y < im_h; ++y) {
    for (int x = 0; x < w; ++x) {
      // The bias keeps the sum inside the range asserted below.
      int32_t sum = (1 << (bd + FILTER_BITS - 1));
      for (int k = 0; k < filter_params_x->taps; ++k) {
        sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
      }
      assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
      im_block[y * im_stride + x] =
          (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0);
    }
  }

  // vertical filter
  int16_t *src_vert = im_block + fo_vert * im_stride;
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  // Bias still present in a value that was rounded by round_1; computed once
  // here instead of per pixel.
  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
                           (1 << (offset_bits - conv_params->round_1 - 1));
  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);
  for (int y = 0; y < h; ++y) {
    for (int x = 0; x < w; ++x) {
      int32_t sum = 1 << offset_bits;
      for (int k = 0; k < filter_params_y->taps; ++k) {
        sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
      }
      assert(0 <= sum && sum < (1 << (offset_bits + 2)));
      CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
      if (conv_params->do_average) {
        int32_t tmp = dst16[y * dst16_stride + x];
        if (conv_params->use_dist_wtd_comp_avg) {
          tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
          tmp = tmp >> DIST_PRECISION_BITS;
        } else {
          tmp += res;
          tmp = tmp >> 1;
        }
        tmp -= round_offset;
        dst[y * dst_stride + x] =
            clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, round_bits), bd);
      } else {
        dst16[y * dst16_stride + x] = res;
      }
    }
  }
}
736
// Horizontal-only high bit-depth convolution whose result is either stored in
// the intermediate buffer conv_params->dst or averaged with it into dst.
//
// For every output position the filter_params_x->taps coefficients selected by
// (subpel_x_qn & SUBPEL_MASK) are applied along the row, starting
// (taps / 2 - 1) samples to the left of the output column. The sum is rounded
// by conv_params->round_0, scaled by 2^(FILTER_BITS - round_1) and biased by
// round_offset.
//   - do_average == 0: the biased value is written to conv_params->dst.
//   - do_average != 0: the biased value is combined with the entry already in
//     conv_params->dst (fwd_offset/bck_offset weights followed by a
//     DIST_PRECISION_BITS shift when use_dist_wtd_comp_avg is set, otherwise a
//     plain two-way average), the bias is removed, and the value is rounded by
//     (2 * FILTER_BITS - round_0 - round_1) bits and passed through
//     clip_pixel_highbd(..., bd) before being written to dst.
void av1_highbd_dist_wtd_convolve_x_c(const uint16_t *src, int src_stride,
                                      uint16_t *dst, int dst_stride, int w,
                                      int h,
                                      const InterpFilterParams *filter_params_x,
                                      const int subpel_x_qn,
                                      ConvolveParams *conv_params, int bd) {
  CONV_BUF_TYPE *const comp_buf = conv_params->dst;
  const int comp_stride = conv_params->dst_stride;
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  const int taps = filter_params_x->taps;
  const int left_taps = taps / 2 - 1;
  const int scale_bits = FILTER_BITS - round_1;
  const int final_bits = 2 * FILTER_BITS - round_0 - round_1;
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  const int round_offset =
      (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));
  assert(final_bits >= 0);
  assert(scale_bits >= 0);

  const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);

  for (int row = 0; row < h; ++row) {
    // Points at the left-most sample touched by the filter for column 0.
    const uint16_t *const src_row = src + row * src_stride - left_taps;
    CONV_BUF_TYPE *const comp_row = comp_buf + row * comp_stride;

    for (int col = 0; col < w; ++col) {
      int32_t acc = 0;
      for (int k = 0; k < taps; ++k) {
        acc += kernel[k] * src_row[col + k];
      }
      acc = ROUND_POWER_OF_TWO(acc, round_0) * (1 << scale_bits);
      acc += round_offset;

      if (!conv_params->do_average) {
        comp_row[col] = acc;
        continue;
      }

      int32_t blend = comp_row[col];
      if (conv_params->use_dist_wtd_comp_avg) {
        blend = blend * conv_params->fwd_offset + acc * conv_params->bck_offset;
        blend = blend >> DIST_PRECISION_BITS;
      } else {
        blend = (blend + acc) >> 1;
      }
      blend -= round_offset;
      dst[row * dst_stride + col] =
          clip_pixel_highbd(ROUND_POWER_OF_TWO(blend, final_bits), bd);
    }
  }
}
784
// Vertical-only high bit-depth convolution whose result is either stored in
// the intermediate buffer conv_params->dst or averaged with it into dst.
//
// For every output position the filter_params_y->taps coefficients selected by
// (subpel_y_qn & SUBPEL_MASK) are applied down the column, starting
// (taps / 2 - 1) rows above the output row. The sum is scaled by
// 2^(FILTER_BITS - round_0), rounded by conv_params->round_1 and biased by
// round_offset.
//   - do_average == 0: the biased value is written to conv_params->dst.
//   - do_average != 0: the biased value is combined with the entry already in
//     conv_params->dst (fwd_offset/bck_offset weights followed by a
//     DIST_PRECISION_BITS shift when use_dist_wtd_comp_avg is set, otherwise a
//     plain two-way average), the bias is removed, and the value is rounded by
//     (2 * FILTER_BITS - round_0 - round_1) bits and passed through
//     clip_pixel_highbd(..., bd) before being written to dst.
void av1_highbd_dist_wtd_convolve_y_c(const uint16_t *src, int src_stride,
                                      uint16_t *dst, int dst_stride, int w,
                                      int h,
                                      const InterpFilterParams *filter_params_y,
                                      const int subpel_y_qn,
                                      ConvolveParams *conv_params, int bd) {
  CONV_BUF_TYPE *const comp_buf = conv_params->dst;
  const int comp_stride = conv_params->dst_stride;
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  const int taps = filter_params_y->taps;
  const int rows_above = taps / 2 - 1;
  const int scale_bits = FILTER_BITS - round_0;
  const int final_bits = 2 * FILTER_BITS - round_0 - round_1;
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  const int round_offset =
      (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));
  assert(final_bits >= 0);
  assert(scale_bits >= 0);

  const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);

  for (int row = 0; row < h; ++row) {
    CONV_BUF_TYPE *const comp_row = comp_buf + row * comp_stride;

    for (int col = 0; col < w; ++col) {
      // Top-most sample touched by the filter for this output position.
      const uint16_t *const first_tap =
          src + (row - rows_above) * src_stride + col;
      int32_t acc = 0;
      for (int k = 0; k < taps; ++k) {
        acc += kernel[k] * first_tap[k * src_stride];
      }
      acc *= (1 << scale_bits);
      acc = ROUND_POWER_OF_TWO(acc, round_1) + round_offset;

      if (!conv_params->do_average) {
        comp_row[col] = acc;
        continue;
      }

      int32_t blend = comp_row[col];
      if (conv_params->use_dist_wtd_comp_avg) {
        blend = blend * conv_params->fwd_offset + acc * conv_params->bck_offset;
        blend = blend >> DIST_PRECISION_BITS;
      } else {
        blend = (blend + acc) >> 1;
      }
      blend -= round_offset;
      dst[row * dst_stride + col] =
          clip_pixel_highbd(ROUND_POWER_OF_TWO(blend, final_bits), bd);
    }
  }
}
832
// Unfiltered (copy) variant of the high bit-depth dist_wtd convolve functions.
//
// Each source sample is shifted left by
// (2 * FILTER_BITS - round_1 - round_0) bits and biased by round_offset; both
// steps are carried out in CONV_BUF_TYPE precision.
//   - do_average == 0: the biased value is written to conv_params->dst.
//   - do_average != 0: the biased value is combined with the entry already in
//     conv_params->dst (fwd_offset/bck_offset weights followed by a
//     DIST_PRECISION_BITS shift when use_dist_wtd_comp_avg is set, otherwise a
//     plain two-way average), the bias is removed, and the value is rounded by
//     the same shift amount and passed through clip_pixel_highbd(..., bd)
//     before being written to dst.
void av1_highbd_dist_wtd_convolve_2d_copy_c(const uint16_t *src, int src_stride,
                                            uint16_t *dst, int dst_stride,
                                            int w, int h,
                                            ConvolveParams *conv_params,
                                            int bd) {
  CONV_BUF_TYPE *const comp_buf = conv_params->dst;
  const int comp_stride = conv_params->dst_stride;
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  const int shift = FILTER_BITS * 2 - round_1 - round_0;
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  const int round_offset =
      (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));
  assert(shift >= 0);

  for (int row = 0; row < h; ++row) {
    const uint16_t *const src_row = src + row * src_stride;
    CONV_BUF_TYPE *const comp_row = comp_buf + row * comp_stride;

    for (int col = 0; col < w; ++col) {
      // Kept in CONV_BUF_TYPE so both steps narrow exactly as the buffer does.
      CONV_BUF_TYPE scaled = src_row[col] << shift;
      scaled += round_offset;

      if (!conv_params->do_average) {
        comp_row[col] = scaled;
        continue;
      }

      int32_t blend = comp_row[col];
      if (conv_params->use_dist_wtd_comp_avg) {
        blend =
            blend * conv_params->fwd_offset + scaled * conv_params->bck_offset;
        blend = blend >> DIST_PRECISION_BITS;
      } else {
        blend = (blend + scaled) >> 1;
      }
      blend -= round_offset;
      dst[row * dst_stride + col] =
          clip_pixel_highbd(ROUND_POWER_OF_TWO(blend, shift), bd);
    }
  }
}
869
// Two-pass high bit-depth convolution in which the sampling position advances
// by x_step_qn / y_step_qn per output sample.
//
// Pass 1 (horizontal): for each of im_h rows, starting (taps_y / 2 - 1) rows
// above src, every output column picks its integer source position and kernel
// from x_qn (initialised to subpel_x_qn, advanced by x_step_qn), filters with
// filter_params_x, rounds by conv_params->round_0 and stores the result in the
// on-stack int16_t buffer im_block (row stride w).
//
// Pass 2 (vertical): for each output sample the integer row and kernel come
// from y_qn (initialised to subpel_y_qn, advanced by y_step_qn); the column of
// im_block is filtered with filter_params_y and rounded by
// conv_params->round_1. Then:
//   - is_compound && !do_average: the value is stored in conv_params->dst.
//   - is_compound && do_average: the value is combined with the entry already
//     in conv_params->dst (fwd_offset/bck_offset weights followed by a
//     DIST_PRECISION_BITS shift when use_dist_wtd_comp_avg is set, otherwise a
//     plain two-way average), the rounding offset is removed and the rounded,
//     clipped result is written to dst.
//   - !is_compound: the rounding offset is removed and the rounded, clipped
//     result is written to dst.
void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride,
                                    uint16_t *dst, int dst_stride, int w, int h,
                                    const InterpFilterParams *filter_params_x,
                                    const InterpFilterParams *filter_params_y,
                                    const int subpel_x_qn, const int x_step_qn,
                                    const int subpel_y_qn, const int y_step_qn,
                                    ConvolveParams *conv_params, int bd) {
  int16_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE];
  const int taps_x = filter_params_x->taps;
  const int taps_y = filter_params_y->taps;
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  const int im_stride = w;
  const int im_h =
      (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) + taps_y;
  const int fo_vert = taps_y / 2 - 1;
  const int fo_horiz = taps_x / 2 - 1;
  CONV_BUF_TYPE *const comp_buf = conv_params->dst;
  const int comp_stride = conv_params->dst_stride;
  const int final_bits = FILTER_BITS * 2 - round_0 - round_1;
  assert(final_bits >= 0);

  // Pass 1: horizontal filter into im_block.
  const int horiz_bias = 1 << (bd + FILTER_BITS - 1);
  for (int row = 0; row < im_h; ++row) {
    const uint16_t *const src_row = src + (row - fo_vert) * src_stride;
    int16_t *const im_row = im_block + row * im_stride;
    int x_qn = subpel_x_qn;

    for (int col = 0; col < w; ++col) {
      const uint16_t *const first_tap =
          src_row + (x_qn >> SCALE_SUBPEL_BITS) - fo_horiz;
      const int kernel_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
      assert(kernel_idx < SUBPEL_SHIFTS);
      const int16_t *const kernel =
          av1_get_interp_filter_subpel_kernel(filter_params_x, kernel_idx);

      int32_t sum = horiz_bias;
      for (int k = 0; k < taps_x; ++k) {
        sum += kernel[k] * first_tap[k];
      }
      assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
      im_row[col] = (int16_t)ROUND_POWER_OF_TWO(sum, round_0);

      x_qn += x_step_qn;
    }
  }

  // Pass 2: vertical filter out of im_block, one output column at a time.
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  const int round_offset =
      (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));
  for (int col = 0; col < w; ++col) {
    int y_qn = subpel_y_qn;

    for (int row = 0; row < h; ++row, y_qn += y_step_qn) {
      // Top-most intermediate sample touched by the filter for this output.
      const int16_t *const first_tap =
          im_block + (y_qn >> SCALE_SUBPEL_BITS) * im_stride + col;
      const int kernel_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
      assert(kernel_idx < SUBPEL_SHIFTS);
      const int16_t *const kernel =
          av1_get_interp_filter_subpel_kernel(filter_params_y, kernel_idx);

      int32_t sum = 1 << offset_bits;
      for (int k = 0; k < taps_y; ++k) {
        sum += kernel[k] * first_tap[k * im_stride];
      }
      assert(0 <= sum && sum < (1 << (offset_bits + 2)));
      CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, round_1);

      int32_t px;
      if (conv_params->is_compound) {
        if (!conv_params->do_average) {
          comp_buf[row * comp_stride + col] = res;
          continue;
        }
        px = comp_buf[row * comp_stride + col];
        if (conv_params->use_dist_wtd_comp_avg) {
          px = px * conv_params->fwd_offset + res * conv_params->bck_offset;
          px = px >> DIST_PRECISION_BITS;
        } else {
          px = (px + res) >> 1;
        }
      } else {
        px = res;
      }

      // Subtract round offset and apply the final convolve rounding.
      px -= round_offset;
      dst[row * dst_stride + col] =
          clip_pixel_highbd(ROUND_POWER_OF_TWO(px, final_bits), bd);
    }
  }
}
955
// Dispatches an unscaled, compound high bit-depth convolve to one of the
// av1_highbd_dist_wtd_convolve_* entry points, chosen by which of the two
// sub-pixel offsets are non-zero:
//   neither  -> av1_highbd_dist_wtd_convolve_2d_copy
//   x only   -> av1_highbd_dist_wtd_convolve_x
//   y only   -> av1_highbd_dist_wtd_convolve_y
//   both     -> av1_highbd_dist_wtd_convolve_2d
static void highbd_convolve_2d_facade_compound(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride,
    const int w, const int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
  if (subpel_x_qn != 0) {
    if (subpel_y_qn != 0) {
      av1_highbd_dist_wtd_convolve_2d(src, src_stride, dst, dst_stride, w, h,
                                      filter_params_x, filter_params_y,
                                      subpel_x_qn, subpel_y_qn, conv_params,
                                      bd);
    } else {
      av1_highbd_dist_wtd_convolve_x(src, src_stride, dst, dst_stride, w, h,
                                     filter_params_x, subpel_x_qn, conv_params,
                                     bd);
    }
    return;
  }

  if (subpel_y_qn != 0) {
    av1_highbd_dist_wtd_convolve_y(src, src_stride, dst, dst_stride, w, h,
                                   filter_params_y, subpel_y_qn, conv_params,
                                   bd);
    return;
  }

  av1_highbd_dist_wtd_convolve_2d_copy(src, src_stride, dst, dst_stride, w, h,
                                       conv_params, bd);
}
981
// Dispatches an unscaled, non-compound high bit-depth convolve, chosen by
// which of the two sub-pixel offsets are non-zero:
//   neither  -> aom_highbd_convolve_copy
//   x only   -> av1_highbd_convolve_x_sr
//   y only   -> av1_highbd_convolve_y_sr
//   both     -> av1_highbd_convolve_2d_sr
static void highbd_convolve_2d_facade_single(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride,
    const int w, const int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
  if (subpel_x_qn != 0) {
    if (subpel_y_qn != 0) {
      av1_highbd_convolve_2d_sr(src, src_stride, dst, dst_stride, w, h,
                                filter_params_x, filter_params_y, subpel_x_qn,
                                subpel_y_qn, conv_params, bd);
    } else {
      av1_highbd_convolve_x_sr(src, src_stride, dst, dst_stride, w, h,
                               filter_params_x, subpel_x_qn, conv_params, bd);
    }
    return;
  }

  if (subpel_y_qn != 0) {
    av1_highbd_convolve_y_sr(src, src_stride, dst, dst_stride, w, h,
                             filter_params_y, subpel_y_qn, bd);
    return;
  }

  aom_highbd_convolve_copy(src, src_stride, dst, dst_stride, w, h);
}
1005
// Top-level entry point for high bit-depth convolution.
//
// src8 / dst8 are converted with CONVERT_TO_SHORTPTR before use. A filter
// parameter set is taken from interp_filters[] only when it is needed (the
// matching sub-pixel offset is non-zero, or `scaled` is set); otherwise NULL
// is passed down. The call is then routed as follows:
//   scaled                    -> av1_highbd_convolve_2d_scale
//   conv_params->is_compound  -> highbd_convolve_2d_facade_compound
//   otherwise                 -> highbd_convolve_2d_facade_single
void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride,
                                   uint8_t *dst8, int dst_stride, int w, int h,
                                   const InterpFilterParams *interp_filters[2],
                                   const int subpel_x_qn, int x_step_q4,
                                   const int subpel_y_qn, int y_step_q4,
                                   int scaled, ConvolveParams *conv_params,
                                   int bd) {
  const uint16_t *const src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *const dst = CONVERT_TO_SHORTPTR(dst8);

  const InterpFilterParams *filter_params_x = NULL;
  const InterpFilterParams *filter_params_y = NULL;
  if (scaled || subpel_x_qn != 0) filter_params_x = interp_filters[0];
  if (scaled || subpel_y_qn != 0) filter_params_y = interp_filters[1];

  if (scaled) {
    // A compound prediction needs somewhere to put its intermediate result.
    assert(!conv_params->is_compound || conv_params->dst != NULL);
    av1_highbd_convolve_2d_scale(src, src_stride, dst, dst_stride, w, h,
                                 filter_params_x, filter_params_y, subpel_x_qn,
                                 x_step_q4, subpel_y_qn, y_step_q4, conv_params,
                                 bd);
    return;
  }

  if (conv_params->is_compound) {
    highbd_convolve_2d_facade_compound(
        src, src_stride, dst, dst_stride, w, h, filter_params_x,
        filter_params_y, subpel_x_qn, subpel_y_qn, conv_params, bd);
  } else {
    highbd_convolve_2d_facade_single(src, src_stride, dst, dst_stride, w, h,
                                     filter_params_x, filter_params_y,
                                     subpel_x_qn, subpel_y_qn, conv_params, bd);
  }
}
1044 #endif // CONFIG_AV1_HIGHBITDEPTH
1045
1046 // Note: Fixed size intermediate buffers, place limits on parameters
1047 // of some functions. 2d filtering proceeds in 2 steps:
1048 // (1) Interpolate horizontally into an intermediate buffer, temp.
1049 // (2) Interpolate temp vertically to derive the sub-pixel result.
// Deriving the maximum number of rows in the temp buffer (263):
1051 // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
1052 // --Largest block size is 128x128 pixels.
1053 // --128 rows in the downscaled frame span a distance of (128 - 1) * 32 in the
1054 // original frame (in 1/16th pixel units).
1055 // --Must round-up because block may be located at sub-pixel position.
1056 // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
1057 // --((128 - 1) * 32 + 15) >> 4 + 8 = 263.
1058 #define WIENER_MAX_EXT_SIZE 263
1059
horz_scalar_product(const uint8_t * a,const int16_t * b)1060 static INLINE int horz_scalar_product(const uint8_t *a, const int16_t *b) {
1061 int sum = 0;
1062 for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k];
1063 return sum;
1064 }
1065
1066 #if CONFIG_AV1_HIGHBITDEPTH
highbd_horz_scalar_product(const uint16_t * a,const int16_t * b)1067 static INLINE int highbd_horz_scalar_product(const uint16_t *a,
1068 const int16_t *b) {
1069 int sum = 0;
1070 for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k];
1071 return sum;
1072 }
1073 #endif
1074
highbd_vert_scalar_product(const uint16_t * a,ptrdiff_t a_stride,const int16_t * b)1075 static INLINE int highbd_vert_scalar_product(const uint16_t *a,
1076 ptrdiff_t a_stride,
1077 const int16_t *b) {
1078 int sum = 0;
1079 for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k * a_stride] * b[k];
1080 return sum;
1081 }
1082
get_filter_base(const int16_t * filter)1083 static const InterpKernel *get_filter_base(const int16_t *filter) {
1084 // NOTE: This assumes that the filter table is 256-byte aligned.
1085 // TODO(agrange) Modify to make independent of table alignment.
1086 return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF));
1087 }
1088
// Returns the index of kernel `f` within the kernel table starting at `base`,
// measured in whole InterpKernel elements.
static int get_filter_offset(const int16_t *f, const InterpKernel *base) {
  const InterpKernel *const kernel = (const InterpKernel *)(intptr_t)f;
  return (int)(kernel - base);
}
1092
convolve_add_src_horiz_hip(const uint8_t * src,ptrdiff_t src_stride,uint16_t * dst,ptrdiff_t dst_stride,const InterpKernel * x_filters,int x0_q4,int x_step_q4,int w,int h,int round0_bits)1093 static void convolve_add_src_horiz_hip(const uint8_t *src, ptrdiff_t src_stride,
1094 uint16_t *dst, ptrdiff_t dst_stride,
1095 const InterpKernel *x_filters, int x0_q4,
1096 int x_step_q4, int w, int h,
1097 int round0_bits) {
1098 const int bd = 8;
1099 src -= SUBPEL_TAPS / 2 - 1;
1100 for (int y = 0; y < h; ++y) {
1101 int x_q4 = x0_q4;
1102 for (int x = 0; x < w; ++x) {
1103 const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
1104 const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
1105 const int rounding = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) +
1106 (1 << (bd + FILTER_BITS - 1));
1107 const int sum = horz_scalar_product(src_x, x_filter) + rounding;
1108 dst[x] = (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, round0_bits), 0,
1109 WIENER_CLAMP_LIMIT(round0_bits, bd) - 1);
1110 x_q4 += x_step_q4;
1111 }
1112 src += src_stride;
1113 dst += dst_stride;
1114 }
1115 }
1116
convolve_add_src_vert_hip(const uint16_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const InterpKernel * y_filters,int y0_q4,int y_step_q4,int w,int h,int round1_bits)1117 static void convolve_add_src_vert_hip(const uint16_t *src, ptrdiff_t src_stride,
1118 uint8_t *dst, ptrdiff_t dst_stride,
1119 const InterpKernel *y_filters, int y0_q4,
1120 int y_step_q4, int w, int h,
1121 int round1_bits) {
1122 const int bd = 8;
1123 src -= src_stride * (SUBPEL_TAPS / 2 - 1);
1124
1125 for (int x = 0; x < w; ++x) {
1126 int y_q4 = y0_q4;
1127 for (int y = 0; y < h; ++y) {
1128 const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
1129 const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
1130 const int rounding =
1131 ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) -
1132 (1 << (bd + round1_bits - 1));
1133 const int sum =
1134 highbd_vert_scalar_product(src_y, src_stride, y_filter) + rounding;
1135 dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, round1_bits));
1136 y_q4 += y_step_q4;
1137 }
1138 ++src;
1139 ++dst;
1140 }
1141 }
1142
av1_wiener_convolve_add_src_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h,const ConvolveParams * conv_params)1143 void av1_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride,
1144 uint8_t *dst, ptrdiff_t dst_stride,
1145 const int16_t *filter_x, int x_step_q4,
1146 const int16_t *filter_y, int y_step_q4,
1147 int w, int h,
1148 const ConvolveParams *conv_params) {
1149 const InterpKernel *const filters_x = get_filter_base(filter_x);
1150 const int x0_q4 = get_filter_offset(filter_x, filters_x);
1151
1152 const InterpKernel *const filters_y = get_filter_base(filter_y);
1153 const int y0_q4 = get_filter_offset(filter_y, filters_y);
1154
1155 uint16_t temp[WIENER_MAX_EXT_SIZE * MAX_SB_SIZE];
1156 const int intermediate_height =
1157 (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS - 1;
1158 memset(temp + (intermediate_height * MAX_SB_SIZE), 0, MAX_SB_SIZE);
1159
1160 assert(w <= MAX_SB_SIZE);
1161 assert(h <= MAX_SB_SIZE);
1162 assert(y_step_q4 <= 32);
1163 assert(x_step_q4 <= 32);
1164
1165 convolve_add_src_horiz_hip(src - src_stride * (SUBPEL_TAPS / 2 - 1),
1166 src_stride, temp, MAX_SB_SIZE, filters_x, x0_q4,
1167 x_step_q4, w, intermediate_height,
1168 conv_params->round_0);
1169 convolve_add_src_vert_hip(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
1170 MAX_SB_SIZE, dst, dst_stride, filters_y, y0_q4,
1171 y_step_q4, w, h, conv_params->round_1);
1172 }
1173
1174 #if CONFIG_AV1_HIGHBITDEPTH
highbd_convolve_add_src_horiz_hip(const uint8_t * src8,ptrdiff_t src_stride,uint16_t * dst,ptrdiff_t dst_stride,const InterpKernel * x_filters,int x0_q4,int x_step_q4,int w,int h,int round0_bits,int bd)1175 static void highbd_convolve_add_src_horiz_hip(
1176 const uint8_t *src8, ptrdiff_t src_stride, uint16_t *dst,
1177 ptrdiff_t dst_stride, const InterpKernel *x_filters, int x0_q4,
1178 int x_step_q4, int w, int h, int round0_bits, int bd) {
1179 const int extraprec_clamp_limit = WIENER_CLAMP_LIMIT(round0_bits, bd);
1180 uint16_t *src = CONVERT_TO_SHORTPTR(src8);
1181 src -= SUBPEL_TAPS / 2 - 1;
1182 for (int y = 0; y < h; ++y) {
1183 int x_q4 = x0_q4;
1184 for (int x = 0; x < w; ++x) {
1185 const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
1186 const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
1187 const int rounding = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) +
1188 (1 << (bd + FILTER_BITS - 1));
1189 const int sum = highbd_horz_scalar_product(src_x, x_filter) + rounding;
1190 dst[x] = (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, round0_bits), 0,
1191 extraprec_clamp_limit - 1);
1192 x_q4 += x_step_q4;
1193 }
1194 src += src_stride;
1195 dst += dst_stride;
1196 }
1197 }
1198
highbd_convolve_add_src_vert_hip(const uint16_t * src,ptrdiff_t src_stride,uint8_t * dst8,ptrdiff_t dst_stride,const InterpKernel * y_filters,int y0_q4,int y_step_q4,int w,int h,int round1_bits,int bd)1199 static void highbd_convolve_add_src_vert_hip(
1200 const uint16_t *src, ptrdiff_t src_stride, uint8_t *dst8,
1201 ptrdiff_t dst_stride, const InterpKernel *y_filters, int y0_q4,
1202 int y_step_q4, int w, int h, int round1_bits, int bd) {
1203 uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
1204 src -= src_stride * (SUBPEL_TAPS / 2 - 1);
1205 for (int x = 0; x < w; ++x) {
1206 int y_q4 = y0_q4;
1207 for (int y = 0; y < h; ++y) {
1208 const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
1209 const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
1210 const int rounding =
1211 ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) -
1212 (1 << (bd + round1_bits - 1));
1213 const int sum =
1214 highbd_vert_scalar_product(src_y, src_stride, y_filter) + rounding;
1215 dst[y * dst_stride] =
1216 clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, round1_bits), bd);
1217 y_q4 += y_step_q4;
1218 }
1219 ++src;
1220 ++dst;
1221 }
1222 }
1223
av1_highbd_wiener_convolve_add_src_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h,const ConvolveParams * conv_params,int bd)1224 void av1_highbd_wiener_convolve_add_src_c(
1225 const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
1226 ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
1227 const int16_t *filter_y, int y_step_q4, int w, int h,
1228 const ConvolveParams *conv_params, int bd) {
1229 const InterpKernel *const filters_x = get_filter_base(filter_x);
1230 const int x0_q4 = get_filter_offset(filter_x, filters_x);
1231
1232 const InterpKernel *const filters_y = get_filter_base(filter_y);
1233 const int y0_q4 = get_filter_offset(filter_y, filters_y);
1234
1235 uint16_t temp[WIENER_MAX_EXT_SIZE * MAX_SB_SIZE];
1236 const int intermediate_height =
1237 (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
1238
1239 assert(w <= MAX_SB_SIZE);
1240 assert(h <= MAX_SB_SIZE);
1241 assert(y_step_q4 <= 32);
1242 assert(x_step_q4 <= 32);
1243 assert(bd + FILTER_BITS - conv_params->round_0 + 2 <= 16);
1244
1245 highbd_convolve_add_src_horiz_hip(src - src_stride * (SUBPEL_TAPS / 2 - 1),
1246 src_stride, temp, MAX_SB_SIZE, filters_x,
1247 x0_q4, x_step_q4, w, intermediate_height,
1248 conv_params->round_0, bd);
1249 highbd_convolve_add_src_vert_hip(
1250 temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE, dst, dst_stride,
1251 filters_y, y0_q4, y_step_q4, w, h, conv_params->round_1, bd);
1252 }
1253 #endif // CONFIG_AV1_HIGHBITDEPTH
1254