1 /*
2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11
12 #include <assert.h>
13 #include <string.h>
14
15 #include "config/aom_dsp_rtcd.h"
16 #include "config/av1_rtcd.h"
17
18 #include "av1/common/av1_common_int.h"
19 #include "av1/common/blockd.h"
20 #include "av1/common/convolve.h"
21 #include "av1/common/filter.h"
22 #include "av1/common/resize.h"
23 #include "aom_dsp/aom_dsp_common.h"
24 #include "aom_ports/mem.h"
25
av1_convolve_horiz_rs_c(const uint8_t * src,int src_stride,uint8_t * dst,int dst_stride,int w,int h,const int16_t * x_filters,int x0_qn,int x_step_qn)26 void av1_convolve_horiz_rs_c(const uint8_t *src, int src_stride, uint8_t *dst,
27 int dst_stride, int w, int h,
28 const int16_t *x_filters, int x0_qn,
29 int x_step_qn) {
30 src -= UPSCALE_NORMATIVE_TAPS / 2 - 1;
31 for (int y = 0; y < h; ++y) {
32 int x_qn = x0_qn;
33 for (int x = 0; x < w; ++x) {
34 const uint8_t *const src_x = &src[x_qn >> RS_SCALE_SUBPEL_BITS];
35 const int x_filter_idx =
36 (x_qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
37 assert(x_filter_idx <= RS_SUBPEL_MASK);
38 const int16_t *const x_filter =
39 &x_filters[x_filter_idx * UPSCALE_NORMATIVE_TAPS];
40 int sum = 0;
41 for (int k = 0; k < UPSCALE_NORMATIVE_TAPS; ++k)
42 sum += src_x[k] * x_filter[k];
43 dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
44 x_qn += x_step_qn;
45 }
46 src += src_stride;
47 dst += dst_stride;
48 }
49 }
50
av1_highbd_convolve_horiz_rs_c(const uint16_t * src,int src_stride,uint16_t * dst,int dst_stride,int w,int h,const int16_t * x_filters,int x0_qn,int x_step_qn,int bd)51 void av1_highbd_convolve_horiz_rs_c(const uint16_t *src, int src_stride,
52 uint16_t *dst, int dst_stride, int w, int h,
53 const int16_t *x_filters, int x0_qn,
54 int x_step_qn, int bd) {
55 src -= UPSCALE_NORMATIVE_TAPS / 2 - 1;
56 for (int y = 0; y < h; ++y) {
57 int x_qn = x0_qn;
58 for (int x = 0; x < w; ++x) {
59 const uint16_t *const src_x = &src[x_qn >> RS_SCALE_SUBPEL_BITS];
60 const int x_filter_idx =
61 (x_qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
62 assert(x_filter_idx <= RS_SUBPEL_MASK);
63 const int16_t *const x_filter =
64 &x_filters[x_filter_idx * UPSCALE_NORMATIVE_TAPS];
65 int sum = 0;
66 for (int k = 0; k < UPSCALE_NORMATIVE_TAPS; ++k)
67 sum += src_x[k] * x_filter[k];
68 dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
69 x_qn += x_step_qn;
70 }
71 src += src_stride;
72 dst += dst_stride;
73 }
74 }
75
av1_convolve_2d_sobel_y_c(const uint8_t * src,int src_stride,double * dst,int dst_stride,int w,int h,int dir,double norm)76 void av1_convolve_2d_sobel_y_c(const uint8_t *src, int src_stride, double *dst,
77 int dst_stride, int w, int h, int dir,
78 double norm) {
79 int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
80 DECLARE_ALIGNED(256, static const int16_t, sobel_a[3]) = { 1, 0, -1 };
81 DECLARE_ALIGNED(256, static const int16_t, sobel_b[3]) = { 1, 2, 1 };
82 const int taps = 3;
83 int im_h = h + taps - 1;
84 int im_stride = w;
85 const int fo_vert = 1;
86 const int fo_horiz = 1;
87
88 // horizontal filter
89 const uint8_t *src_horiz = src - fo_vert * src_stride;
90 const int16_t *x_filter = dir ? sobel_a : sobel_b;
91 for (int y = 0; y < im_h; ++y) {
92 for (int x = 0; x < w; ++x) {
93 int16_t sum = 0;
94 for (int k = 0; k < taps; ++k) {
95 sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
96 }
97 im_block[y * im_stride + x] = sum;
98 }
99 }
100
101 // vertical filter
102 int16_t *src_vert = im_block + fo_vert * im_stride;
103 const int16_t *y_filter = dir ? sobel_b : sobel_a;
104 for (int y = 0; y < h; ++y) {
105 for (int x = 0; x < w; ++x) {
106 int16_t sum = 0;
107 for (int k = 0; k < taps; ++k) {
108 sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
109 }
110 dst[y * dst_stride + x] = sum * norm;
111 }
112 }
113 }
114
/* Separable 2-D sub-pixel interpolation for 8-bit, single-reference output.
 *
 * Stage 1 filters horizontally into a 16-bit intermediate block of
 * (h + taps_y - 1) rows, rounding by conv_params->round_0.
 * Stage 2 filters that block vertically, rounds by conv_params->round_1,
 * removes the offsets that were added to keep the sums non-negative, applies
 * the remaining 2 * FILTER_BITS - round_0 - round_1 bits of rounding and
 * clamps to a pixel.
 *
 * The kernels are selected from filter_params_x / filter_params_y using
 * subpel_x_qn / subpel_y_qn masked with SUBPEL_MASK. w and h must not exceed
 * MAX_SB_SIZE (asserted).
 */
void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
                          int dst_stride, int w, int h,
                          const InterpFilterParams *filter_params_x,
                          const InterpFilterParams *filter_params_y,
                          const int subpel_x_qn, const int subpel_y_qn,
                          ConvolveParams *conv_params) {
  int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
  assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);

  const int bd = 8;
  const int taps_x = filter_params_x->taps;
  const int taps_y = filter_params_y->taps;
  const int fo_horiz = taps_x / 2 - 1;
  const int fo_vert = taps_y / 2 - 1;
  const int im_stride = w;
  const int im_h = h + taps_y - 1;
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  const int bits = FILTER_BITS * 2 - round_0 - round_1;

  // Stage 1: horizontal filter. Start fo_vert rows above the block so the
  // vertical stage has all the rows it needs.
  const uint8_t *const src_horiz = src - fo_vert * src_stride;
  const int16_t *const kernel_x = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);
  for (int row = 0; row < im_h; ++row) {
    for (int col = 0; col < w; ++col) {
      // The constant keeps the sum non-negative (see the assertion below).
      int32_t acc = (1 << (bd + FILTER_BITS - 1));
      for (int tap = 0; tap < taps_x; ++tap) {
        acc += kernel_x[tap] *
               src_horiz[row * src_stride + col - fo_horiz + tap];
      }
      assert(0 <= acc && acc < (1 << (bd + FILTER_BITS + 1)));
      im_block[row * im_stride + col] =
          (int16_t)ROUND_POWER_OF_TWO(acc, round_0);
    }
  }

  // Stage 2: vertical filter over the intermediate block. Output row `row`
  // uses intermediate rows row .. row + taps_y - 1.
  const int16_t *const kernel_y = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  for (int row = 0; row < h; ++row) {
    for (int col = 0; col < w; ++col) {
      int32_t acc = 1 << offset_bits;
      for (int tap = 0; tap < taps_y; ++tap) {
        acc += kernel_y[tap] * im_block[(row + tap) * im_stride + col];
      }
      assert(0 <= acc && acc < (1 << (offset_bits + 2)));
      // Remove the offsets introduced by both stages.
      const int round_offset = (1 << (offset_bits - round_1)) +
                               (1 << (offset_bits - round_1 - 1));
      const int16_t res =
          (int16_t)(ROUND_POWER_OF_TWO(acc, round_1) - round_offset);
      dst[row * dst_stride + col] = clip_pixel(ROUND_POWER_OF_TWO(res, bits));
    }
  }
}
166
/* Vertical-only sub-pixel interpolation for 8-bit, single-reference output.
 *
 * Applies the kernel chosen from filter_params_y by
 * (subpel_y_qn & SUBPEL_MASK) down each column, rounds the sum by
 * FILTER_BITS and clamps to a pixel. The horizontal filter arguments are
 * ignored; conv_params is only inspected by the assertions.
 */
void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
                         int dst_stride, int w, int h,
                         const InterpFilterParams *filter_params_x,
                         const InterpFilterParams *filter_params_y,
                         const int subpel_x_qn, const int subpel_y_qn,
                         ConvolveParams *conv_params) {
  (void)filter_params_x;
  (void)subpel_x_qn;
  (void)conv_params;  // Referenced only when assertions are enabled.

  assert(conv_params->round_0 <= FILTER_BITS);
  assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) ||
         ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));

  const int taps = filter_params_y->taps;
  const int fo_vert = taps / 2 - 1;
  const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);

  for (int row = 0; row < h; ++row) {
    for (int col = 0; col < w; ++col) {
      int32_t acc = 0;
      for (int tap = 0; tap < taps; ++tap) {
        // The window covers rows row - fo_vert .. row - fo_vert + taps - 1.
        acc += kernel[tap] * src[(row - fo_vert + tap) * src_stride + col];
      }
      dst[row * dst_stride + col] =
          clip_pixel(ROUND_POWER_OF_TWO(acc, FILTER_BITS));
    }
  }
}
196
/* Horizontal-only sub-pixel interpolation for 8-bit, single-reference output.
 *
 * Applies the kernel chosen from filter_params_x by
 * (subpel_x_qn & SUBPEL_MASK) along each row. The sum is rounded in two
 * steps: first by conv_params->round_0, then by the remaining
 * FILTER_BITS - round_0 bits, and finally clamped to a pixel. The vertical
 * filter arguments are ignored.
 */
void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
                         int dst_stride, int w, int h,
                         const InterpFilterParams *filter_params_x,
                         const InterpFilterParams *filter_params_y,
                         const int subpel_x_qn, const int subpel_y_qn,
                         ConvolveParams *conv_params) {
  (void)filter_params_y;
  (void)subpel_y_qn;

  const int taps = filter_params_x->taps;
  const int fo_horiz = taps / 2 - 1;
  const int round_0 = conv_params->round_0;
  const int bits = FILTER_BITS - round_0;

  assert(bits >= 0);
  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));

  const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);

  for (int row = 0; row < h; ++row) {
    for (int col = 0; col < w; ++col) {
      int32_t acc = 0;
      for (int tap = 0; tap < taps; ++tap) {
        acc += kernel[tap] * src[row * src_stride + col - fo_horiz + tap];
      }
      // Two-step rounding: round_0 first, then the remaining bits.
      acc = ROUND_POWER_OF_TWO(acc, round_0);
      dst[row * dst_stride + col] = clip_pixel(ROUND_POWER_OF_TWO(acc, bits));
    }
  }
}
228
/* Copies a w x h block of 8-bit samples from src to dst, one row at a time.
 *
 * The filter, sub-pixel and conv_params arguments exist only so the function
 * matches the common convolve signature; none of them is used. memmove() is
 * used for the row copy, so overlapping source and destination rows are
 * handled.
 */
void av1_convolve_2d_copy_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
                               int dst_stride, int w, int h,
                               const InterpFilterParams *filter_params_x,
                               const InterpFilterParams *filter_params_y,
                               const int subpel_x_qn, const int subpel_y_qn,
                               ConvolveParams *conv_params) {
  (void)conv_params;
  (void)subpel_y_qn;
  (void)subpel_x_qn;
  (void)filter_params_y;
  (void)filter_params_x;

  for (int row = 0; row < h; ++row) {
    const uint8_t *const from = src + row * src_stride;
    uint8_t *const to = dst + row * dst_stride;
    memmove(to, from, w * sizeof(src[0]));
  }
}
245
/* Separable 2-D sub-pixel interpolation for 8-bit compound prediction.
 *
 * Stage 1 filters horizontally into a 16-bit intermediate block of
 * (h + taps_y - 1) rows, rounding by conv_params->round_0.
 * Stage 2 filters vertically and rounds by conv_params->round_1. The result
 * still contains the offset added to keep it non-negative. Then:
 *   - if conv_params->do_average is 0, the value is stored in
 *     conv_params->dst (dst is not written);
 *   - otherwise it is combined with the value already in conv_params->dst,
 *     either as a weighted sum using fwd_offset / bck_offset shifted by
 *     DIST_PRECISION_BITS (use_dist_wtd_comp_avg != 0) or as a plain
 *     average; the offset is removed, the remaining round_bits are applied
 *     and the clamped pixel is written to dst.
 *
 * The intermediate block is a fixed-size stack buffer; the block size and
 * filter length must fit in it, which is asserted.
 */
void av1_dist_wtd_convolve_2d_c(const uint8_t *src, int src_stride,
                                uint8_t *dst, int dst_stride, int w, int h,
                                const InterpFilterParams *filter_params_x,
                                const InterpFilterParams *filter_params_y,
                                const int subpel_x_qn, const int subpel_y_qn,
                                ConvolveParams *conv_params) {
  CONV_BUF_TYPE *dst16 = conv_params->dst;
  int dst16_stride = conv_params->dst_stride;
  int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
  int im_h = h + filter_params_y->taps - 1;
  int im_stride = w;
  const int fo_vert = filter_params_y->taps / 2 - 1;
  const int fo_horiz = filter_params_x->taps / 2 - 1;
  const int bd = 8;
  const int round_bits =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  // Same precondition the high bit-depth variant checks: the final shift
  // amount must not be negative.
  assert(round_bits >= 0);
  // Guard the stack buffer: the horizontal stage writes im_h * im_stride
  // entries into im_block.
  assert(im_h * im_stride <= (int)(sizeof(im_block) / sizeof(im_block[0])));

  // horizontal filter: starts fo_vert rows above the block so the vertical
  // stage has every row it needs.
  const uint8_t *src_horiz = src - fo_vert * src_stride;
  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);
  for (int y = 0; y < im_h; ++y) {
    for (int x = 0; x < w; ++x) {
      // The constant keeps the sum non-negative (see the assertion below).
      int32_t sum = (1 << (bd + FILTER_BITS - 1));
      for (int k = 0; k < filter_params_x->taps; ++k) {
        sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
      }
      assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
      im_block[y * im_stride + x] =
          (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0);
    }
  }

  // vertical filter: src_vert points at the intermediate row that lines up
  // with output row 0.
  int16_t *src_vert = im_block + fo_vert * im_stride;
  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  for (int y = 0; y < h; ++y) {
    for (int x = 0; x < w; ++x) {
      int32_t sum = 1 << offset_bits;
      for (int k = 0; k < filter_params_y->taps; ++k) {
        sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
      }
      assert(0 <= sum && sum < (1 << (offset_bits + 2)));
      CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
      if (conv_params->do_average) {
        // Combine with the prediction already stored in the compound buffer.
        int32_t tmp = dst16[y * dst16_stride + x];
        if (conv_params->use_dist_wtd_comp_avg) {
          tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
          tmp = tmp >> DIST_PRECISION_BITS;
        } else {
          tmp += res;
          tmp = tmp >> 1;
        }
        // Remove the offsets introduced by both filter stages.
        tmp -= (1 << (offset_bits - conv_params->round_1)) +
               (1 << (offset_bits - conv_params->round_1 - 1));
        dst[y * dst_stride + x] =
            clip_pixel(ROUND_POWER_OF_TWO(tmp, round_bits));
      } else {
        // No averaging: keep the offset intermediate value.
        dst16[y * dst16_stride + x] = res;
      }
    }
  }
}
311
/* Vertical-only sub-pixel interpolation for 8-bit compound prediction.
 *
 * The column filter sum is multiplied by 1 << (FILTER_BITS - round_0),
 * rounded by round_1 and biased by round_offset. The value is then either
 * stored in conv_params->dst (do_average == 0) or combined with the value
 * already there, un-biased, rounded by round_bits, clamped and written to
 * dst. The horizontal filter arguments are ignored.
 */
void av1_dist_wtd_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst,
                               int dst_stride, int w, int h,
                               const InterpFilterParams *filter_params_x,
                               const InterpFilterParams *filter_params_y,
                               const int subpel_x_qn, const int subpel_y_qn,
                               ConvolveParams *conv_params) {
  (void)filter_params_x;
  (void)subpel_x_qn;

  const int bd = 8;
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  const int taps = filter_params_y->taps;
  const int fo_vert = taps / 2 - 1;
  const int bits = FILTER_BITS - round_0;
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  const int round_offset =
      (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));
  const int round_bits = 2 * FILTER_BITS - round_0 - round_1;
  CONV_BUF_TYPE *const dst16 = conv_params->dst;
  const int dst16_stride = conv_params->dst_stride;

  const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);

  for (int row = 0; row < h; ++row) {
    for (int col = 0; col < w; ++col) {
      int32_t res = 0;
      for (int tap = 0; tap < taps; ++tap) {
        res += kernel[tap] * src[(row - fo_vert + tap) * src_stride + col];
      }
      res *= (1 << bits);
      res = ROUND_POWER_OF_TWO(res, round_1) + round_offset;

      if (!conv_params->do_average) {
        // No averaging: keep the biased intermediate value.
        dst16[row * dst16_stride + col] = res;
        continue;
      }

      // Combine with the value already in the compound buffer.
      int32_t blended = dst16[row * dst16_stride + col];
      if (conv_params->use_dist_wtd_comp_avg) {
        blended = blended * conv_params->fwd_offset +
                  res * conv_params->bck_offset;
        blended >>= DIST_PRECISION_BITS;
      } else {
        blended += res;
        blended >>= 1;
      }
      blended -= round_offset;
      dst[row * dst_stride + col] =
          clip_pixel(ROUND_POWER_OF_TWO(blended, round_bits));
    }
  }
}
361
/* Horizontal-only sub-pixel interpolation for 8-bit compound prediction.
 *
 * The row filter sum is rounded by round_0, multiplied by
 * 1 << (FILTER_BITS - round_1) and biased by round_offset. The value is then
 * either stored in conv_params->dst (do_average == 0) or combined with the
 * value already there, un-biased, rounded by round_bits, clamped and written
 * to dst. The vertical filter arguments are ignored.
 */
void av1_dist_wtd_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst,
                               int dst_stride, int w, int h,
                               const InterpFilterParams *filter_params_x,
                               const InterpFilterParams *filter_params_y,
                               const int subpel_x_qn, const int subpel_y_qn,
                               ConvolveParams *conv_params) {
  (void)filter_params_y;
  (void)subpel_y_qn;

  const int bd = 8;
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  const int taps = filter_params_x->taps;
  const int fo_horiz = taps / 2 - 1;
  const int bits = FILTER_BITS - round_1;
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  const int round_offset =
      (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));
  const int round_bits = 2 * FILTER_BITS - round_0 - round_1;
  CONV_BUF_TYPE *const dst16 = conv_params->dst;
  const int dst16_stride = conv_params->dst_stride;

  const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);

  for (int row = 0; row < h; ++row) {
    for (int col = 0; col < w; ++col) {
      int32_t res = 0;
      for (int tap = 0; tap < taps; ++tap) {
        res += kernel[tap] * src[row * src_stride + col - fo_horiz + tap];
      }
      res = (1 << bits) * ROUND_POWER_OF_TWO(res, round_0);
      res += round_offset;

      if (!conv_params->do_average) {
        // No averaging: keep the biased intermediate value.
        dst16[row * dst16_stride + col] = res;
        continue;
      }

      // Combine with the value already in the compound buffer.
      int32_t blended = dst16[row * dst16_stride + col];
      if (conv_params->use_dist_wtd_comp_avg) {
        blended = blended * conv_params->fwd_offset +
                  res * conv_params->bck_offset;
        blended >>= DIST_PRECISION_BITS;
      } else {
        blended += res;
        blended >>= 1;
      }
      blended -= round_offset;
      dst[row * dst_stride + col] =
          clip_pixel(ROUND_POWER_OF_TWO(blended, round_bits));
    }
  }
}
411
/* Full-pixel (no filtering) path for 8-bit compound prediction.
 *
 * Each source sample is shifted left by
 * 2 * FILTER_BITS - round_1 - round_0 bits and biased by round_offset. The
 * value is then either stored in conv_params->dst (do_average == 0) or
 * combined with the value already there, un-biased, rounded by the same
 * number of bits, clamped and written to dst. Filter and sub-pixel
 * arguments are ignored.
 */
void av1_dist_wtd_convolve_2d_copy_c(const uint8_t *src, int src_stride,
                                     uint8_t *dst, int dst_stride, int w, int h,
                                     const InterpFilterParams *filter_params_x,
                                     const InterpFilterParams *filter_params_y,
                                     const int subpel_x_qn,
                                     const int subpel_y_qn,
                                     ConvolveParams *conv_params) {
  (void)filter_params_x;
  (void)filter_params_y;
  (void)subpel_x_qn;
  (void)subpel_y_qn;

  const int bd = 8;
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  const int bits = FILTER_BITS * 2 - round_1 - round_0;
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  const int round_offset =
      (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));
  CONV_BUF_TYPE *const dst16 = conv_params->dst;
  const int dst16_stride = conv_params->dst_stride;

  for (int row = 0; row < h; ++row) {
    for (int col = 0; col < w; ++col) {
      CONV_BUF_TYPE res = src[row * src_stride + col] << bits;
      res += round_offset;

      if (!conv_params->do_average) {
        // No averaging: keep the biased intermediate value.
        dst16[row * dst16_stride + col] = res;
        continue;
      }

      // Combine with the value already in the compound buffer.
      int32_t blended = dst16[row * dst16_stride + col];
      if (conv_params->use_dist_wtd_comp_avg) {
        blended = blended * conv_params->fwd_offset +
                  res * conv_params->bck_offset;
        blended >>= DIST_PRECISION_BITS;
      } else {
        blended += res;
        blended >>= 1;
      }
      blended -= round_offset;
      dst[row * dst_stride + col] =
          clip_pixel(ROUND_POWER_OF_TWO(blended, bits));
    }
  }
}
454
/* Separable 2-D interpolation with per-sample position stepping (8-bit).
 *
 * Unlike the fixed-phase 2-D path, the source position advances by
 * x_step_qn per output column and y_step_qn per output row (fixed point,
 * SCALE_SUBPEL_BITS fractional bits), so a kernel is looked up for every
 * sample from the fractional part of its position.
 *
 * Stage 1 filters horizontally into a 16-bit intermediate block whose height
 * covers the last vertical position plus the vertical filter length.
 * Stage 2 walks the intermediate block column by column and filters
 * vertically. The result is then handled according to conv_params:
 *   - is_compound && !do_average: stored in conv_params->dst;
 *   - is_compound && do_average:  combined with conv_params->dst, offset
 *     removed, rounded, clamped, written to dst;
 *   - !is_compound:               offset removed, rounded, clamped, written
 *     to dst.
 */
void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst,
                             int dst_stride, int w, int h,
                             const InterpFilterParams *filter_params_x,
                             const InterpFilterParams *filter_params_y,
                             const int subpel_x_qn, const int x_step_qn,
                             const int subpel_y_qn, const int y_step_qn,
                             ConvolveParams *conv_params) {
  int16_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE];
  const int bd = 8;
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  const int taps_x = filter_params_x->taps;
  const int taps_y = filter_params_y->taps;
  const int fo_horiz = taps_x / 2 - 1;
  const int fo_vert = taps_y / 2 - 1;
  const int im_stride = w;
  const int im_h =
      (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) + taps_y;
  const int bits = FILTER_BITS * 2 - round_0 - round_1;
  assert(bits >= 0);
  CONV_BUF_TYPE *const dst16 = conv_params->dst;
  const int dst16_stride = conv_params->dst_stride;

  // Stage 1: horizontal filter. Intermediate row r is built from source row
  // r - fo_vert.
  for (int r = 0; r < im_h; ++r) {
    const uint8_t *const in_row = src + (r - fo_vert) * src_stride;
    int x_qn = subpel_x_qn;
    for (int c = 0; c < w; ++c, x_qn += x_step_qn) {
      const uint8_t *const src_x = &in_row[(x_qn >> SCALE_SUBPEL_BITS)];
      const int phase = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
      assert(phase < SUBPEL_SHIFTS);
      const int16_t *const kernel =
          av1_get_interp_filter_subpel_kernel(filter_params_x, phase);
      // The constant keeps the sum non-negative (see the assertion below).
      int32_t acc = (1 << (bd + FILTER_BITS - 1));
      for (int tap = 0; tap < taps_x; ++tap) {
        acc += kernel[tap] * src_x[tap - fo_horiz];
      }
      assert(0 <= acc && acc < (1 << (bd + FILTER_BITS + 1)));
      im_block[r * im_stride + c] = (int16_t)ROUND_POWER_OF_TWO(acc, round_0);
    }
  }

  // Stage 2: vertical filter, processed one output column at a time.
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  for (int c = 0; c < w; ++c) {
    // Intermediate sample aligned with output row 0 of this column.
    const int16_t *const im_col = im_block + fo_vert * im_stride + c;
    int y_qn = subpel_y_qn;
    for (int r = 0; r < h; ++r, y_qn += y_step_qn) {
      const int16_t *const src_y =
          &im_col[(y_qn >> SCALE_SUBPEL_BITS) * im_stride];
      const int phase = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
      assert(phase < SUBPEL_SHIFTS);
      const int16_t *const kernel =
          av1_get_interp_filter_subpel_kernel(filter_params_y, phase);
      int32_t acc = 1 << offset_bits;
      for (int tap = 0; tap < taps_y; ++tap) {
        acc += kernel[tap] * src_y[(tap - fo_vert) * im_stride];
      }
      assert(0 <= acc && acc < (1 << (offset_bits + 2)));
      const CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(acc, round_1);

      if (conv_params->is_compound && !conv_params->do_average) {
        // First compound prediction: keep the offset intermediate value.
        dst16[r * dst16_stride + c] = res;
        continue;
      }

      int32_t tmp = res;
      if (conv_params->is_compound) {
        // Second compound prediction: combine with the stored one.
        tmp = dst16[r * dst16_stride + c];
        if (conv_params->use_dist_wtd_comp_avg) {
          tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
          tmp >>= DIST_PRECISION_BITS;
        } else {
          tmp += res;
          tmp >>= 1;
        }
      }
      /* Subtract round offset and convolve round */
      tmp -= (1 << (offset_bits - round_1)) +
             (1 << (offset_bits - round_1 - 1));
      dst[r * dst_stride + c] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits));
    }
  }
}
540
/* Thin wrapper around av1_convolve_2d_scale().
 *
 * In builds with assertions enabled it checks that a compound prediction has
 * a compound buffer (conv_params->dst) before forwarding every argument
 * unchanged.
 */
static void convolve_2d_scale_wrapper(
    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int x_step_qn, const int subpel_y_qn, const int y_step_qn,
    ConvolveParams *conv_params) {
  assert(!conv_params->is_compound || conv_params->dst != NULL);
  av1_convolve_2d_scale(src, src_stride, dst, dst_stride, w, h, filter_params_x,
                        filter_params_y, subpel_x_qn, x_step_qn, subpel_y_qn,
                        y_step_qn, conv_params);
}
554
/* Entry point that picks the 8-bit convolve routine for a prediction block.
 *
 * interp_filters[0] is the horizontal filter and interp_filters[1] the
 * vertical one. Dispatch order:
 *   1. If either filter has 2 taps (both must, and scaling must be off),
 *      the C implementations are called directly for whichever sub-pixel
 *      offsets are non-zero. With both offsets zero, control falls through
 *      to the generic dispatch below.
 *   2. If `scaled` is set, the scaled 2-D path is used with the given steps.
 *   3. Otherwise the function table in `sf` is indexed by "has horizontal
 *      offset", "has vertical offset" and conv_params->is_compound.
 */
void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst,
                            int dst_stride, int w, int h,
                            const InterpFilterParams *interp_filters[2],
                            const int subpel_x_qn, int x_step_q4,
                            const int subpel_y_qn, int y_step_q4, int scaled,
                            ConvolveParams *conv_params,
                            const struct scale_factors *sf) {
  const InterpFilterParams *const filter_params_x = interp_filters[0];
  const InterpFilterParams *const filter_params_y = interp_filters[1];
  const int has_x_offset = subpel_x_qn != 0;
  const int has_y_offset = subpel_y_qn != 0;

  // TODO(jingning, yunqing): Add SIMD support to 2-tap filter case.
  // Do we have SIMD support to 4-tap case?
  // 2-tap filter indicates that it is for IntraBC.
  if (filter_params_x->taps == 2 || filter_params_y->taps == 2) {
    assert(filter_params_x->taps == 2 && filter_params_y->taps == 2);
    assert(!scaled);
    if (has_x_offset && has_y_offset) {
      av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
                           filter_params_x, filter_params_y, subpel_x_qn,
                           subpel_y_qn, conv_params);
      return;
    }
    if (has_x_offset) {
      av1_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h,
                          filter_params_x, filter_params_y, subpel_x_qn,
                          subpel_y_qn, conv_params);
      return;
    }
    if (has_y_offset) {
      av1_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h,
                          filter_params_x, filter_params_y, subpel_x_qn,
                          subpel_y_qn, conv_params);
      return;
    }
    // Full-pixel 2-tap case: handled by the generic dispatch below.
  }

  if (scaled) {
    convolve_2d_scale_wrapper(src, src_stride, dst, dst_stride, w, h,
                              filter_params_x, filter_params_y, subpel_x_qn,
                              x_step_q4, subpel_y_qn, y_step_q4, conv_params);
    return;
  }

  sf->convolve[subpel_x_qn != 0][subpel_y_qn != 0][conv_params->is_compound](
      src, src_stride, dst, dst_stride, w, h, filter_params_x, filter_params_y,
      subpel_x_qn, subpel_y_qn, conv_params);
}
604
605 #if CONFIG_AV1_HIGHBITDEPTH
/* Copies a w x h block of 16-bit samples from src to dst, one row at a time.
 *
 * The filter, sub-pixel, conv_params and bd arguments exist only so the
 * function matches the common high bit-depth convolve signature; none of
 * them is used. memmove() is used for the row copy, so overlapping source
 * and destination rows are handled.
 */
void av1_highbd_convolve_2d_copy_sr_c(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
  (void)bd;
  (void)conv_params;
  (void)subpel_y_qn;
  (void)subpel_x_qn;
  (void)filter_params_y;
  (void)filter_params_x;

  for (int row = 0; row < h; ++row) {
    const uint16_t *const from = src + row * src_stride;
    uint16_t *const to = dst + row * dst_stride;
    memmove(to, from, w * sizeof(src[0]));
  }
}
622
/* Horizontal-only sub-pixel interpolation, high bit-depth, single reference.
 *
 * Applies the kernel chosen from filter_params_x by
 * (subpel_x_qn & SUBPEL_MASK) along each row. The sum is rounded by
 * conv_params->round_0, then by the remaining FILTER_BITS - round_0 bits,
 * and clamped with clip_pixel_highbd() for bit depth bd. The vertical filter
 * arguments are ignored.
 */
void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride,
                                uint16_t *dst, int dst_stride, int w, int h,
                                const InterpFilterParams *filter_params_x,
                                const InterpFilterParams *filter_params_y,
                                const int subpel_x_qn, const int subpel_y_qn,
                                ConvolveParams *conv_params, int bd) {
  (void)filter_params_y;
  (void)subpel_y_qn;

  const int taps = filter_params_x->taps;
  const int fo_horiz = taps / 2 - 1;
  const int round_0 = conv_params->round_0;
  const int bits = FILTER_BITS - round_0;

  assert(bits >= 0);
  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));

  const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);

  for (int row = 0; row < h; ++row) {
    for (int col = 0; col < w; ++col) {
      int32_t acc = 0;
      for (int tap = 0; tap < taps; ++tap) {
        acc += kernel[tap] * src[row * src_stride + col - fo_horiz + tap];
      }
      // Two-step rounding: round_0 first, then the remaining bits.
      acc = ROUND_POWER_OF_TWO(acc, round_0);
      dst[row * dst_stride + col] =
          clip_pixel_highbd(ROUND_POWER_OF_TWO(acc, bits), bd);
    }
  }
}
653
/* Vertical-only sub-pixel interpolation, high bit-depth, single reference.
 *
 * Applies the kernel chosen from filter_params_y by
 * (subpel_y_qn & SUBPEL_MASK) down each column, rounds the sum by
 * FILTER_BITS and clamps with clip_pixel_highbd() for bit depth bd. The
 * horizontal filter arguments are ignored; conv_params is only inspected by
 * the assertions.
 */
void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride,
                                uint16_t *dst, int dst_stride, int w, int h,
                                const InterpFilterParams *filter_params_x,
                                const InterpFilterParams *filter_params_y,
                                const int subpel_x_qn, const int subpel_y_qn,
                                ConvolveParams *conv_params, int bd) {
  (void)filter_params_x;
  (void)subpel_x_qn;
  (void)conv_params;  // Referenced only when assertions are enabled.

  assert(conv_params->round_0 <= FILTER_BITS);
  assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) ||
         ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));

  const int taps = filter_params_y->taps;
  const int fo_vert = taps / 2 - 1;
  const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);

  for (int row = 0; row < h; ++row) {
    for (int col = 0; col < w; ++col) {
      int32_t acc = 0;
      for (int tap = 0; tap < taps; ++tap) {
        // The window covers rows row - fo_vert .. row - fo_vert + taps - 1.
        acc += kernel[tap] * src[(row - fo_vert + tap) * src_stride + col];
      }
      dst[row * dst_stride + col] =
          clip_pixel_highbd(ROUND_POWER_OF_TWO(acc, FILTER_BITS), bd);
    }
  }
}
682
/* Separable 2-D sub-pixel interpolation, high bit-depth, single reference.
 *
 * Stage 1 filters horizontally into a 16-bit intermediate block of
 * (h + taps_y - 1) rows, rounding by conv_params->round_0.
 * Stage 2 filters that block vertically, rounds by conv_params->round_1,
 * removes the offsets that were added to keep the sums non-negative, applies
 * the remaining 2 * FILTER_BITS - round_0 - round_1 bits of rounding and
 * clamps with clip_pixel_highbd() for bit depth bd.
 *
 * w and h must not exceed MAX_SB_SIZE and the remaining rounding must be
 * non-negative (both asserted).
 */
void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride,
                                 uint16_t *dst, int dst_stride, int w, int h,
                                 const InterpFilterParams *filter_params_x,
                                 const InterpFilterParams *filter_params_y,
                                 const int subpel_x_qn, const int subpel_y_qn,
                                 ConvolveParams *conv_params, int bd) {
  int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
  assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);

  const int taps_x = filter_params_x->taps;
  const int taps_y = filter_params_y->taps;
  const int fo_horiz = taps_x / 2 - 1;
  const int fo_vert = taps_y / 2 - 1;
  const int im_stride = w;
  const int im_h = h + taps_y - 1;
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  const int bits = FILTER_BITS * 2 - round_0 - round_1;
  assert(bits >= 0);

  // Stage 1: horizontal filter. Start fo_vert rows above the block so the
  // vertical stage has all the rows it needs.
  const uint16_t *const src_horiz = src - fo_vert * src_stride;
  const int16_t *const kernel_x = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);
  for (int row = 0; row < im_h; ++row) {
    for (int col = 0; col < w; ++col) {
      // The constant keeps the sum non-negative (see the assertion below).
      int32_t acc = (1 << (bd + FILTER_BITS - 1));
      for (int tap = 0; tap < taps_x; ++tap) {
        acc += kernel_x[tap] *
               src_horiz[row * src_stride + col - fo_horiz + tap];
      }
      assert(0 <= acc && acc < (1 << (bd + FILTER_BITS + 1)));
      im_block[row * im_stride + col] =
          (int16_t)ROUND_POWER_OF_TWO(acc, round_0);
    }
  }

  // Stage 2: vertical filter over the intermediate block. Output row `row`
  // uses intermediate rows row .. row + taps_y - 1.
  const int16_t *const kernel_y = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  for (int row = 0; row < h; ++row) {
    for (int col = 0; col < w; ++col) {
      int32_t acc = 1 << offset_bits;
      for (int tap = 0; tap < taps_y; ++tap) {
        acc += kernel_y[tap] * im_block[(row + tap) * im_stride + col];
      }
      assert(0 <= acc && acc < (1 << (offset_bits + 2)));
      // Remove the offsets introduced by both stages.
      const int round_offset = (1 << (offset_bits - round_1)) +
                               (1 << (offset_bits - round_1 - 1));
      const int32_t res = ROUND_POWER_OF_TWO(acc, round_1) - round_offset;
      dst[row * dst_stride + col] =
          clip_pixel_highbd(ROUND_POWER_OF_TWO(res, bits), bd);
    }
  }
}
735
// Compound ("dist_wtd") 2D convolution for high-bitdepth samples, C reference.
//
// Pass 1 filters h + taps_y - 1 source rows horizontally and stores each
// result, rounded by conv_params->round_0, in a 16-bit scratch block.
// Pass 2 filters that block vertically and rounds by conv_params->round_1.
//
// What happens to each vertical result depends on conv_params->do_average:
//  - zero: it is stored in conv_params->dst and dst is not written.
//  - nonzero: it is combined with the value already held in conv_params->dst
//    (weighted by fwd_offset/bck_offset and shifted by DIST_PRECISION_BITS
//    when use_dist_wtd_comp_avg is set, otherwise summed and halved), the
//    rounding offset is subtracted, and the value is rounded by round_bits,
//    clipped to bd bits and written to dst.
void av1_highbd_dist_wtd_convolve_2d_c(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
  int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
  CONV_BUF_TYPE *const dst16 = conv_params->dst;
  const int dst16_stride = conv_params->dst_stride;
  const int taps_x = filter_params_x->taps;
  const int taps_y = filter_params_y->taps;
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  const int im_stride = w;
  const int im_h = h + taps_y - 1;
  const int fo_vert = taps_y / 2 - 1;
  const int fo_horiz = taps_x / 2 - 1;
  const int round_bits = 2 * FILTER_BITS - round_0 - round_1;
  assert(round_bits >= 0);

  // Pass 1: horizontal filter. Start fo_vert rows above the block so the
  // vertical pass has the rows it needs on both sides.
  const uint16_t *const src_top = src - fo_vert * src_stride;
  const int16_t *const x_filter = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);
  for (int row = 0; row < im_h; ++row) {
    const int src_base = row * src_stride - fo_horiz;
    for (int col = 0; col < w; ++col) {
      // The initial offset keeps the intermediate sum non-negative.
      int32_t sum = (1 << (bd + FILTER_BITS - 1));
      for (int k = 0; k < taps_x; ++k) {
        sum += x_filter[k] * src_top[src_base + col + k];
      }
      assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
      im_block[row * im_stride + col] =
          (int16_t)ROUND_POWER_OF_TWO(sum, round_0);
    }
  }

  // Pass 2: vertical filter over the scratch block. Output row r uses
  // scratch rows r .. r + taps_y - 1.
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  const int round_offset = (1 << (offset_bits - round_1)) +
                           (1 << (offset_bits - round_1 - 1));
  const int16_t *const y_filter = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);
  for (int row = 0; row < h; ++row) {
    for (int col = 0; col < w; ++col) {
      int32_t sum = 1 << offset_bits;
      for (int k = 0; k < taps_y; ++k) {
        sum += y_filter[k] * im_block[(row + k) * im_stride + col];
      }
      assert(0 <= sum && sum < (1 << (offset_bits + 2)));
      const CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, round_1);

      if (!conv_params->do_average) {
        // First prediction: keep the unclipped value for a later call.
        dst16[row * dst16_stride + col] = res;
        continue;
      }

      // Second prediction: blend with the stored first prediction.
      int32_t tmp = dst16[row * dst16_stride + col];
      if (conv_params->use_dist_wtd_comp_avg) {
        tmp = (tmp * conv_params->fwd_offset + res * conv_params->bck_offset) >>
              DIST_PRECISION_BITS;
      } else {
        tmp = (tmp + res) >> 1;
      }
      tmp -= round_offset;
      dst[row * dst_stride + col] =
          clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, round_bits), bd);
    }
  }
}
802
// Compound ("dist_wtd") horizontal-only convolution for high-bitdepth
// samples, C reference.
//
// Each output is the horizontal filter sum rounded by conv_params->round_0,
// scaled by 1 << (FILTER_BITS - round_1), with round_offset added. When
// conv_params->do_average is zero the value is stored in conv_params->dst.
// Otherwise it is blended with the value already stored there (weighted by
// fwd_offset/bck_offset when use_dist_wtd_comp_avg is set, plain average
// otherwise), round_offset is removed, and the result is rounded by
// round_bits, clipped to bd bits and written to dst.
// filter_params_y and subpel_y_qn are unused.
void av1_highbd_dist_wtd_convolve_x_c(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
  (void)filter_params_y;
  (void)subpel_y_qn;

  CONV_BUF_TYPE *const dst16 = conv_params->dst;
  const int dst16_stride = conv_params->dst_stride;
  const int taps = filter_params_x->taps;
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  const int fo_horiz = taps / 2 - 1;
  const int bits = FILTER_BITS - round_1;
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  const int round_offset = (1 << (offset_bits - round_1)) +
                           (1 << (offset_bits - round_1 - 1));
  const int round_bits = 2 * FILTER_BITS - round_0 - round_1;
  assert(round_bits >= 0);
  assert(bits >= 0);

  const int16_t *const x_filter = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);

  for (int row = 0; row < h; ++row) {
    const int src_base = row * src_stride - fo_horiz;
    for (int col = 0; col < w; ++col) {
      int32_t sum = 0;
      for (int k = 0; k < taps; ++k) {
        sum += x_filter[k] * src[src_base + col + k];
      }
      const int32_t res =
          (1 << bits) * ROUND_POWER_OF_TWO(sum, round_0) + round_offset;

      if (!conv_params->do_average) {
        dst16[row * dst16_stride + col] = res;
        continue;
      }

      int32_t tmp = dst16[row * dst16_stride + col];
      if (conv_params->use_dist_wtd_comp_avg) {
        tmp = (tmp * conv_params->fwd_offset + res * conv_params->bck_offset) >>
              DIST_PRECISION_BITS;
      } else {
        tmp = (tmp + res) >> 1;
      }
      tmp -= round_offset;
      dst[row * dst_stride + col] =
          clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, round_bits), bd);
    }
  }
}
851
// Compound ("dist_wtd") vertical-only convolution for high-bitdepth samples,
// C reference.
//
// Each output is the vertical filter sum scaled by
// 1 << (FILTER_BITS - round_0), rounded by conv_params->round_1, with
// round_offset added. When conv_params->do_average is zero the value is
// stored in conv_params->dst. Otherwise it is blended with the value already
// stored there (weighted by fwd_offset/bck_offset when use_dist_wtd_comp_avg
// is set, plain average otherwise), round_offset is removed, and the result
// is rounded by round_bits, clipped to bd bits and written to dst.
// filter_params_x and subpel_x_qn are unused.
void av1_highbd_dist_wtd_convolve_y_c(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
  (void)filter_params_x;
  (void)subpel_x_qn;

  CONV_BUF_TYPE *const dst16 = conv_params->dst;
  const int dst16_stride = conv_params->dst_stride;
  const int taps = filter_params_y->taps;
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  const int fo_vert = taps / 2 - 1;
  const int bits = FILTER_BITS - round_0;
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  const int round_offset = (1 << (offset_bits - round_1)) +
                           (1 << (offset_bits - round_1 - 1));
  const int round_bits = 2 * FILTER_BITS - round_0 - round_1;
  assert(round_bits >= 0);
  assert(bits >= 0);

  const int16_t *const y_filter = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);

  for (int row = 0; row < h; ++row) {
    for (int col = 0; col < w; ++col) {
      // Index of the first (topmost) tap for this output sample.
      const int top = (row - fo_vert) * src_stride + col;
      int32_t sum = 0;
      for (int k = 0; k < taps; ++k) {
        sum += y_filter[k] * src[top + k * src_stride];
      }
      sum *= (1 << bits);
      const int32_t res = ROUND_POWER_OF_TWO(sum, round_1) + round_offset;

      if (!conv_params->do_average) {
        dst16[row * dst16_stride + col] = res;
        continue;
      }

      int32_t tmp = dst16[row * dst16_stride + col];
      if (conv_params->use_dist_wtd_comp_avg) {
        tmp = (tmp * conv_params->fwd_offset + res * conv_params->bck_offset) >>
              DIST_PRECISION_BITS;
      } else {
        tmp = (tmp + res) >> 1;
      }
      tmp -= round_offset;
      dst[row * dst_stride + col] =
          clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, round_bits), bd);
    }
  }
}
900
// Compound ("dist_wtd") copy for high-bitdepth samples, C reference: no
// filtering is applied, so all filter and sub-pixel arguments are unused.
//
// Each source sample is shifted left by `bits` and has round_offset added,
// which puts it on the same scale as the filtered compound paths. When
// conv_params->do_average is zero the value is stored in conv_params->dst.
// Otherwise it is blended with the value already stored there (weighted by
// fwd_offset/bck_offset when use_dist_wtd_comp_avg is set, plain average
// otherwise), round_offset is removed, and the result is rounded by `bits`,
// clipped to bd bits and written to dst.
void av1_highbd_dist_wtd_convolve_2d_copy_c(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
  (void)filter_params_x;
  (void)filter_params_y;
  (void)subpel_x_qn;
  (void)subpel_y_qn;

  CONV_BUF_TYPE *const dst16 = conv_params->dst;
  const int dst16_stride = conv_params->dst_stride;
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  const int bits = FILTER_BITS * 2 - round_1 - round_0;
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  const int round_offset = (1 << (offset_bits - round_1)) +
                           (1 << (offset_bits - round_1 - 1));
  assert(bits >= 0);

  for (int row = 0; row < h; ++row) {
    for (int col = 0; col < w; ++col) {
      const CONV_BUF_TYPE res =
          (CONV_BUF_TYPE)((src[row * src_stride + col] << bits) + round_offset);

      if (!conv_params->do_average) {
        dst16[row * dst16_stride + col] = res;
        continue;
      }

      int32_t tmp = dst16[row * dst16_stride + col];
      if (conv_params->use_dist_wtd_comp_avg) {
        tmp = (tmp * conv_params->fwd_offset + res * conv_params->bck_offset) >>
              DIST_PRECISION_BITS;
      } else {
        tmp = (tmp + res) >> 1;
      }
      tmp -= round_offset;
      dst[row * dst_stride + col] =
          clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd);
    }
  }
}
941
// Scaled 2D convolution for high-bitdepth samples, C reference.
//
// The source position advances by x_step_qn / y_step_qn per output sample,
// so the filter kernel is re-selected for every sample from the fractional
// part of the running position.
//
// Pass 1 filters im_h source rows horizontally into a 16-bit scratch block
// (rounded by conv_params->round_0). Pass 2 walks the scratch block column by
// column, filters vertically and rounds by conv_params->round_1. The result
// is then handled in one of three ways:
//  - not compound: the rounding offset is removed and the value is rounded
//    by `bits`, clipped to bd bits and written to dst;
//  - compound, do_average == 0: the value is stored in conv_params->dst;
//  - compound, do_average != 0: the value is blended with the one stored in
//    conv_params->dst (fwd_offset/bck_offset weights when
//    use_dist_wtd_comp_avg is set, plain average otherwise), then finished
//    like the non-compound case.
void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride,
                                    uint16_t *dst, int dst_stride, int w, int h,
                                    const InterpFilterParams *filter_params_x,
                                    const InterpFilterParams *filter_params_y,
                                    const int subpel_x_qn, const int x_step_qn,
                                    const int subpel_y_qn, const int y_step_qn,
                                    ConvolveParams *conv_params, int bd) {
  int16_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE];
  const int taps_x = filter_params_x->taps;
  const int taps_y = filter_params_y->taps;
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  const int im_stride = w;
  const int im_h =
      (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) + taps_y;
  const int fo_vert = taps_y / 2 - 1;
  const int fo_horiz = taps_x / 2 - 1;
  CONV_BUF_TYPE *const dst16 = conv_params->dst;
  const int dst16_stride = conv_params->dst_stride;
  const int bits = FILTER_BITS * 2 - round_0 - round_1;
  assert(bits >= 0);

  // Pass 1: horizontal filter, one scratch row per needed source row.
  const uint16_t *src_row = src - fo_vert * src_stride;
  for (int row = 0; row < im_h; ++row, src_row += src_stride) {
    int x_qn = subpel_x_qn;
    for (int col = 0; col < w; ++col) {
      const int filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
      assert(filter_idx < SUBPEL_SHIFTS);
      const int16_t *const x_filter =
          av1_get_interp_filter_subpel_kernel(filter_params_x, filter_idx);
      // Integer source column of the first tap for this output sample.
      const int first_tap = (x_qn >> SCALE_SUBPEL_BITS) - fo_horiz;
      int32_t sum = (1 << (bd + FILTER_BITS - 1));
      for (int k = 0; k < taps_x; ++k) {
        sum += x_filter[k] * src_row[first_tap + k];
      }
      assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
      im_block[row * im_stride + col] =
          (int16_t)ROUND_POWER_OF_TWO(sum, round_0);
      x_qn += x_step_qn;
    }
  }

  // Pass 2: vertical filter, processed one output column at a time.
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  const int round_offset = (1 << (offset_bits - round_1)) +
                           (1 << (offset_bits - round_1 - 1));
  for (int col = 0; col < w; ++col) {
    int y_qn = subpel_y_qn;
    for (int row = 0; row < h; ++row, y_qn += y_step_qn) {
      const int filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
      assert(filter_idx < SUBPEL_SHIFTS);
      const int16_t *const y_filter =
          av1_get_interp_filter_subpel_kernel(filter_params_y, filter_idx);
      // Scratch row holding the first tap for this output sample.
      const int first_row = y_qn >> SCALE_SUBPEL_BITS;
      int32_t sum = 1 << offset_bits;
      for (int k = 0; k < taps_y; ++k) {
        sum += y_filter[k] * im_block[(first_row + k) * im_stride + col];
      }
      assert(0 <= sum && sum < (1 << (offset_bits + 2)));
      const CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, round_1);

      int32_t tmp;
      if (!conv_params->is_compound) {
        tmp = res;
      } else if (!conv_params->do_average) {
        dst16[row * dst16_stride + col] = res;
        continue;
      } else {
        tmp = dst16[row * dst16_stride + col];
        if (conv_params->use_dist_wtd_comp_avg) {
          tmp =
              (tmp * conv_params->fwd_offset + res * conv_params->bck_offset) >>
              DIST_PRECISION_BITS;
        } else {
          tmp = (tmp + res) >> 1;
        }
      }

      // Subtract round offset and convolve round.
      tmp -= round_offset;
      dst[row * dst_stride + col] =
          clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd);
    }
  }
}
1027
// Entry point that selects the high-bitdepth convolution routine.
//
// src8 and dst8 are converted with CONVERT_TO_SHORTPTR before use.
// A filter parameter set is taken from interp_filters[] only for a direction
// that has a non-zero sub-pixel offset or when `scaled` is set; otherwise
// NULL is passed on for that direction.
//
// When `scaled` is set, av1_highbd_convolve_2d_scale() is called with the
// step sizes. Otherwise the function stored in
// sf->highbd_convolve[x has subpel][y has subpel][is_compound] is called.
void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride,
                                   uint8_t *dst8, int dst_stride, int w, int h,
                                   const InterpFilterParams *interp_filters[2],
                                   const int subpel_x_qn, int x_step_q4,
                                   const int subpel_y_qn, int y_step_q4,
                                   int scaled, ConvolveParams *conv_params,
                                   const struct scale_factors *sf, int bd) {
  const uint16_t *const src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *const dst = CONVERT_TO_SHORTPTR(dst8);

  const int has_subpel_x = subpel_x_qn != 0;
  const int has_subpel_y = subpel_y_qn != 0;

  const InterpFilterParams *filter_params_x = NULL;
  const InterpFilterParams *filter_params_y = NULL;
  if (has_subpel_x | scaled) filter_params_x = interp_filters[0];
  if (has_subpel_y | scaled) filter_params_y = interp_filters[1];

  if (!scaled) {
    sf->highbd_convolve[has_subpel_x][has_subpel_y][conv_params->is_compound](
        src, src_stride, dst, dst_stride, w, h, filter_params_x,
        filter_params_y, subpel_x_qn, subpel_y_qn, conv_params, bd);
    return;
  }

  // A compound prediction needs the intermediate buffer to be present.
  assert(!conv_params->is_compound || conv_params->dst != NULL);
  av1_highbd_convolve_2d_scale(src, src_stride, dst, dst_stride, w, h,
                               filter_params_x, filter_params_y, subpel_x_qn,
                               x_step_q4, subpel_y_qn, y_step_q4, conv_params,
                               bd);
}
1065 #endif // CONFIG_AV1_HIGHBITDEPTH
1066
1067 // Note: Fixed size intermediate buffers, place limits on parameters
1068 // of some functions. 2d filtering proceeds in 2 steps:
1069 // (1) Interpolate horizontally into an intermediate buffer, temp.
1070 // (2) Interpolate temp vertically to derive the sub-pixel result.
// Deriving the maximum number of rows in the temp buffer (263):
1072 // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
1073 // --Largest block size is 128x128 pixels.
1074 // --128 rows in the downscaled frame span a distance of (128 - 1) * 32 in the
1075 // original frame (in 1/16th pixel units).
1076 // --Must round-up because block may be located at sub-pixel position.
1077 // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
1078 // --((128 - 1) * 32 + 15) >> 4 + 8 = 263.
1079 #define WIENER_MAX_EXT_SIZE 263
1080
horz_scalar_product(const uint8_t * a,const int16_t * b)1081 static INLINE int horz_scalar_product(const uint8_t *a, const int16_t *b) {
1082 int sum = 0;
1083 for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k];
1084 return sum;
1085 }
1086
1087 #if CONFIG_AV1_HIGHBITDEPTH
highbd_horz_scalar_product(const uint16_t * a,const int16_t * b)1088 static INLINE int highbd_horz_scalar_product(const uint16_t *a,
1089 const int16_t *b) {
1090 int sum = 0;
1091 for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k];
1092 return sum;
1093 }
1094 #endif
1095
highbd_vert_scalar_product(const uint16_t * a,ptrdiff_t a_stride,const int16_t * b)1096 static INLINE int highbd_vert_scalar_product(const uint16_t *a,
1097 ptrdiff_t a_stride,
1098 const int16_t *b) {
1099 int sum = 0;
1100 for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k * a_stride] * b[k];
1101 return sum;
1102 }
1103
get_filter_base(const int16_t * filter)1104 static const InterpKernel *get_filter_base(const int16_t *filter) {
1105 // NOTE: This assumes that the filter table is 256-byte aligned.
1106 // TODO(agrange) Modify to make independent of table alignment.
1107 return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF));
1108 }
1109
// Returns the position of kernel `f` within the table starting at `base`,
// counted in whole InterpKernel entries.
static int get_filter_offset(const int16_t *f, const InterpKernel *base) {
  const InterpKernel *const kernel = (const InterpKernel *)(intptr_t)f;
  return (int)(kernel - base);
}
1113
convolve_add_src_horiz_hip(const uint8_t * src,ptrdiff_t src_stride,uint16_t * dst,ptrdiff_t dst_stride,const InterpKernel * x_filters,int x0_q4,int x_step_q4,int w,int h,int round0_bits)1114 static void convolve_add_src_horiz_hip(const uint8_t *src, ptrdiff_t src_stride,
1115 uint16_t *dst, ptrdiff_t dst_stride,
1116 const InterpKernel *x_filters, int x0_q4,
1117 int x_step_q4, int w, int h,
1118 int round0_bits) {
1119 const int bd = 8;
1120 src -= SUBPEL_TAPS / 2 - 1;
1121 for (int y = 0; y < h; ++y) {
1122 int x_q4 = x0_q4;
1123 for (int x = 0; x < w; ++x) {
1124 const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
1125 const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
1126 const int rounding = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) +
1127 (1 << (bd + FILTER_BITS - 1));
1128 const int sum = horz_scalar_product(src_x, x_filter) + rounding;
1129 dst[x] = (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, round0_bits), 0,
1130 WIENER_CLAMP_LIMIT(round0_bits, bd) - 1);
1131 x_q4 += x_step_q4;
1132 }
1133 src += src_stride;
1134 dst += dst_stride;
1135 }
1136 }
1137
convolve_add_src_vert_hip(const uint16_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const InterpKernel * y_filters,int y0_q4,int y_step_q4,int w,int h,int round1_bits)1138 static void convolve_add_src_vert_hip(const uint16_t *src, ptrdiff_t src_stride,
1139 uint8_t *dst, ptrdiff_t dst_stride,
1140 const InterpKernel *y_filters, int y0_q4,
1141 int y_step_q4, int w, int h,
1142 int round1_bits) {
1143 const int bd = 8;
1144 src -= src_stride * (SUBPEL_TAPS / 2 - 1);
1145
1146 for (int x = 0; x < w; ++x) {
1147 int y_q4 = y0_q4;
1148 for (int y = 0; y < h; ++y) {
1149 const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
1150 const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
1151 const int rounding =
1152 ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) -
1153 (1 << (bd + round1_bits - 1));
1154 const int sum =
1155 highbd_vert_scalar_product(src_y, src_stride, y_filter) + rounding;
1156 dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, round1_bits));
1157 y_q4 += y_step_q4;
1158 }
1159 ++src;
1160 ++dst;
1161 }
1162 }
1163
av1_wiener_convolve_add_src_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h,const ConvolveParams * conv_params)1164 void av1_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride,
1165 uint8_t *dst, ptrdiff_t dst_stride,
1166 const int16_t *filter_x, int x_step_q4,
1167 const int16_t *filter_y, int y_step_q4,
1168 int w, int h,
1169 const ConvolveParams *conv_params) {
1170 const InterpKernel *const filters_x = get_filter_base(filter_x);
1171 const int x0_q4 = get_filter_offset(filter_x, filters_x);
1172
1173 const InterpKernel *const filters_y = get_filter_base(filter_y);
1174 const int y0_q4 = get_filter_offset(filter_y, filters_y);
1175
1176 uint16_t temp[WIENER_MAX_EXT_SIZE * MAX_SB_SIZE];
1177 const int intermediate_height =
1178 (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS - 1;
1179 memset(temp + (intermediate_height * MAX_SB_SIZE), 0, MAX_SB_SIZE);
1180
1181 assert(w <= MAX_SB_SIZE);
1182 assert(h <= MAX_SB_SIZE);
1183 assert(y_step_q4 <= 32);
1184 assert(x_step_q4 <= 32);
1185
1186 convolve_add_src_horiz_hip(src - src_stride * (SUBPEL_TAPS / 2 - 1),
1187 src_stride, temp, MAX_SB_SIZE, filters_x, x0_q4,
1188 x_step_q4, w, intermediate_height,
1189 conv_params->round_0);
1190 convolve_add_src_vert_hip(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
1191 MAX_SB_SIZE, dst, dst_stride, filters_y, y0_q4,
1192 y_step_q4, w, h, conv_params->round_1);
1193 }
1194
1195 #if CONFIG_AV1_HIGHBITDEPTH
highbd_convolve_add_src_horiz_hip(const uint8_t * src8,ptrdiff_t src_stride,uint16_t * dst,ptrdiff_t dst_stride,const InterpKernel * x_filters,int x0_q4,int x_step_q4,int w,int h,int round0_bits,int bd)1196 static void highbd_convolve_add_src_horiz_hip(
1197 const uint8_t *src8, ptrdiff_t src_stride, uint16_t *dst,
1198 ptrdiff_t dst_stride, const InterpKernel *x_filters, int x0_q4,
1199 int x_step_q4, int w, int h, int round0_bits, int bd) {
1200 const int extraprec_clamp_limit = WIENER_CLAMP_LIMIT(round0_bits, bd);
1201 uint16_t *src = CONVERT_TO_SHORTPTR(src8);
1202 src -= SUBPEL_TAPS / 2 - 1;
1203 for (int y = 0; y < h; ++y) {
1204 int x_q4 = x0_q4;
1205 for (int x = 0; x < w; ++x) {
1206 const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
1207 const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
1208 const int rounding = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) +
1209 (1 << (bd + FILTER_BITS - 1));
1210 const int sum = highbd_horz_scalar_product(src_x, x_filter) + rounding;
1211 dst[x] = (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, round0_bits), 0,
1212 extraprec_clamp_limit - 1);
1213 x_q4 += x_step_q4;
1214 }
1215 src += src_stride;
1216 dst += dst_stride;
1217 }
1218 }
1219
highbd_convolve_add_src_vert_hip(const uint16_t * src,ptrdiff_t src_stride,uint8_t * dst8,ptrdiff_t dst_stride,const InterpKernel * y_filters,int y0_q4,int y_step_q4,int w,int h,int round1_bits,int bd)1220 static void highbd_convolve_add_src_vert_hip(
1221 const uint16_t *src, ptrdiff_t src_stride, uint8_t *dst8,
1222 ptrdiff_t dst_stride, const InterpKernel *y_filters, int y0_q4,
1223 int y_step_q4, int w, int h, int round1_bits, int bd) {
1224 uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
1225 src -= src_stride * (SUBPEL_TAPS / 2 - 1);
1226 for (int x = 0; x < w; ++x) {
1227 int y_q4 = y0_q4;
1228 for (int y = 0; y < h; ++y) {
1229 const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
1230 const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
1231 const int rounding =
1232 ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) -
1233 (1 << (bd + round1_bits - 1));
1234 const int sum =
1235 highbd_vert_scalar_product(src_y, src_stride, y_filter) + rounding;
1236 dst[y * dst_stride] =
1237 clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, round1_bits), bd);
1238 y_q4 += y_step_q4;
1239 }
1240 ++src;
1241 ++dst;
1242 }
1243 }
1244
av1_highbd_wiener_convolve_add_src_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h,const ConvolveParams * conv_params,int bd)1245 void av1_highbd_wiener_convolve_add_src_c(
1246 const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
1247 ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
1248 const int16_t *filter_y, int y_step_q4, int w, int h,
1249 const ConvolveParams *conv_params, int bd) {
1250 const InterpKernel *const filters_x = get_filter_base(filter_x);
1251 const int x0_q4 = get_filter_offset(filter_x, filters_x);
1252
1253 const InterpKernel *const filters_y = get_filter_base(filter_y);
1254 const int y0_q4 = get_filter_offset(filter_y, filters_y);
1255
1256 uint16_t temp[WIENER_MAX_EXT_SIZE * MAX_SB_SIZE];
1257 const int intermediate_height =
1258 (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
1259
1260 assert(w <= MAX_SB_SIZE);
1261 assert(h <= MAX_SB_SIZE);
1262 assert(y_step_q4 <= 32);
1263 assert(x_step_q4 <= 32);
1264 assert(bd + FILTER_BITS - conv_params->round_0 + 2 <= 16);
1265
1266 highbd_convolve_add_src_horiz_hip(src - src_stride * (SUBPEL_TAPS / 2 - 1),
1267 src_stride, temp, MAX_SB_SIZE, filters_x,
1268 x0_q4, x_step_q4, w, intermediate_height,
1269 conv_params->round_0, bd);
1270 highbd_convolve_add_src_vert_hip(
1271 temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE, dst, dst_stride,
1272 filters_y, y0_q4, y_step_q4, w, h, conv_params->round_1, bd);
1273 }
1274 #endif // CONFIG_AV1_HIGHBITDEPTH
1275