1 /*
2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11
12 #include <assert.h>
13 #include <string.h>
14
15 #include "config/aom_dsp_rtcd.h"
16 #include "config/av1_rtcd.h"
17
18 #include "av1/common/blockd.h"
19 #include "av1/common/convolve.h"
20 #include "av1/common/filter.h"
21 #include "av1/common/onyxc_int.h"
22 #include "av1/common/resize.h"
23 #include "aom_dsp/aom_dsp_common.h"
24 #include "aom_ports/mem.h"
25
av1_convolve_horiz_rs_c(const uint8_t * src,int src_stride,uint8_t * dst,int dst_stride,int w,int h,const int16_t * x_filters,int x0_qn,int x_step_qn)26 void av1_convolve_horiz_rs_c(const uint8_t *src, int src_stride, uint8_t *dst,
27 int dst_stride, int w, int h,
28 const int16_t *x_filters, int x0_qn,
29 int x_step_qn) {
30 src -= UPSCALE_NORMATIVE_TAPS / 2 - 1;
31 for (int y = 0; y < h; ++y) {
32 int x_qn = x0_qn;
33 for (int x = 0; x < w; ++x) {
34 const uint8_t *const src_x = &src[x_qn >> RS_SCALE_SUBPEL_BITS];
35 const int x_filter_idx =
36 (x_qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
37 assert(x_filter_idx <= RS_SUBPEL_MASK);
38 const int16_t *const x_filter =
39 &x_filters[x_filter_idx * UPSCALE_NORMATIVE_TAPS];
40 int sum = 0;
41 for (int k = 0; k < UPSCALE_NORMATIVE_TAPS; ++k)
42 sum += src_x[k] * x_filter[k];
43 dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
44 x_qn += x_step_qn;
45 }
46 src += src_stride;
47 dst += dst_stride;
48 }
49 }
50
av1_highbd_convolve_horiz_rs_c(const uint16_t * src,int src_stride,uint16_t * dst,int dst_stride,int w,int h,const int16_t * x_filters,int x0_qn,int x_step_qn,int bd)51 void av1_highbd_convolve_horiz_rs_c(const uint16_t *src, int src_stride,
52 uint16_t *dst, int dst_stride, int w, int h,
53 const int16_t *x_filters, int x0_qn,
54 int x_step_qn, int bd) {
55 src -= UPSCALE_NORMATIVE_TAPS / 2 - 1;
56 for (int y = 0; y < h; ++y) {
57 int x_qn = x0_qn;
58 for (int x = 0; x < w; ++x) {
59 const uint16_t *const src_x = &src[x_qn >> RS_SCALE_SUBPEL_BITS];
60 const int x_filter_idx =
61 (x_qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
62 assert(x_filter_idx <= RS_SUBPEL_MASK);
63 const int16_t *const x_filter =
64 &x_filters[x_filter_idx * UPSCALE_NORMATIVE_TAPS];
65 int sum = 0;
66 for (int k = 0; k < UPSCALE_NORMATIVE_TAPS; ++k)
67 sum += src_x[k] * x_filter[k];
68 dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
69 x_qn += x_step_qn;
70 }
71 src += src_stride;
72 dst += dst_stride;
73 }
74 }
75
av1_convolve_2d_sobel_y_c(const uint8_t * src,int src_stride,double * dst,int dst_stride,int w,int h,int dir,double norm)76 void av1_convolve_2d_sobel_y_c(const uint8_t *src, int src_stride, double *dst,
77 int dst_stride, int w, int h, int dir,
78 double norm) {
79 int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
80 DECLARE_ALIGNED(256, static const int16_t, sobel_a[3]) = { 1, 0, -1 };
81 DECLARE_ALIGNED(256, static const int16_t, sobel_b[3]) = { 1, 2, 1 };
82 const int taps = 3;
83 int im_h = h + taps - 1;
84 int im_stride = w;
85 const int fo_vert = 1;
86 const int fo_horiz = 1;
87
88 // horizontal filter
89 const uint8_t *src_horiz = src - fo_vert * src_stride;
90 const int16_t *x_filter = dir ? sobel_a : sobel_b;
91 for (int y = 0; y < im_h; ++y) {
92 for (int x = 0; x < w; ++x) {
93 int16_t sum = 0;
94 for (int k = 0; k < taps; ++k) {
95 sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
96 }
97 im_block[y * im_stride + x] = sum;
98 }
99 }
100
101 // vertical filter
102 int16_t *src_vert = im_block + fo_vert * im_stride;
103 const int16_t *y_filter = dir ? sobel_b : sobel_a;
104 for (int y = 0; y < h; ++y) {
105 for (int x = 0; x < w; ++x) {
106 int16_t sum = 0;
107 for (int k = 0; k < taps; ++k) {
108 sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
109 }
110 dst[y * dst_stride + x] = sum * norm;
111 }
112 }
113 }
114
// 2-D separable sub-pixel convolution for 8-bit samples, written directly to
// an 8-bit destination.
//
// The horizontal pass filters h + taps_y - 1 source rows (starting fo_vert
// rows above src) into the 16-bit intermediate buffer im_block, rounding by
// conv_params->round_0. The vertical pass filters im_block, rounds by
// conv_params->round_1, removes the offsets that were added to keep the
// intermediate sums non-negative, applies the remaining
// 2 * FILTER_BITS - round_0 - round_1 bits of rounding and clips the result.
//
// Preconditions (checked with assert): round_0 + round_1 must not exceed
// 2 * FILTER_BITS, and (h + taps_y - 1) * w must fit in im_block.
void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
                          int dst_stride, int w, int h,
                          const InterpFilterParams *filter_params_x,
                          const InterpFilterParams *filter_params_y,
                          const int subpel_x_q4, const int subpel_y_q4,
                          ConvolveParams *conv_params) {
  int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
  const int im_h = h + filter_params_y->taps - 1;
  const int im_stride = w;
  const int fo_vert = filter_params_y->taps / 2 - 1;
  const int fo_horiz = filter_params_x->taps / 2 - 1;
  const int bd = 8;
  const int bits =
      FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
  // A negative shift count in the final rounding step would be undefined.
  assert(bits >= 0);
  // The horizontal pass writes im_h rows of w samples into im_block.
  assert(im_h * im_stride <= (int)(sizeof(im_block) / sizeof(im_block[0])));

  // horizontal filter
  const uint8_t *src_horiz = src - fo_vert * src_stride;
  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_q4 & SUBPEL_MASK);
  for (int y = 0; y < im_h; ++y) {
    for (int x = 0; x < w; ++x) {
      // Start from an offset so the sum stays non-negative (asserted below).
      int32_t sum = (1 << (bd + FILTER_BITS - 1));
      for (int k = 0; k < filter_params_x->taps; ++k) {
        sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
      }
      assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
      im_block[y * im_stride + x] =
          (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0);
    }
  }

  // vertical filter
  int16_t *src_vert = im_block + fo_vert * im_stride;
  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_q4 & SUBPEL_MASK);
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  for (int y = 0; y < h; ++y) {
    for (int x = 0; x < w; ++x) {
      int32_t sum = 1 << offset_bits;
      for (int k = 0; k < filter_params_y->taps; ++k) {
        sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
      }
      assert(0 <= sum && sum < (1 << (offset_bits + 2)));
      // Subtract the vertical offset and the scaled horizontal offset.
      int16_t res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) -
                    ((1 << (offset_bits - conv_params->round_1)) +
                     (1 << (offset_bits - conv_params->round_1 - 1)));
      dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(res, bits));
    }
  }
}
165
// Vertical-only sub-pixel convolution for 8-bit samples.
//
// Applies the kernel chosen by subpel_y_q4 down each column, rounds by
// FILTER_BITS and clips to the 8-bit range. The horizontal filter arguments
// are ignored; conv_params is only consulted by the assertions.
void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
                         int dst_stride, int w, int h,
                         const InterpFilterParams *filter_params_x,
                         const InterpFilterParams *filter_params_y,
                         const int subpel_x_q4, const int subpel_y_q4,
                         ConvolveParams *conv_params) {
  (void)filter_params_x;
  (void)subpel_x_q4;
  (void)conv_params;  // Only referenced by the asserts below.

  assert(conv_params->round_0 <= FILTER_BITS);
  assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) ||
         ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));

  const int num_taps = filter_params_y->taps;
  const int fo_vert = num_taps / 2 - 1;
  const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_q4 & SUBPEL_MASK);

  for (int row = 0; row < h; ++row) {
    // First source row covered by the kernel for this output row.
    const int top = row - fo_vert;
    for (int col = 0; col < w; ++col) {
      int32_t acc = 0;
      for (int tap = 0; tap < num_taps; ++tap) {
        acc += kernel[tap] * src[(top + tap) * src_stride + col];
      }
      dst[row * dst_stride + col] =
          clip_pixel(ROUND_POWER_OF_TWO(acc, FILTER_BITS));
    }
  }
}
195
// Horizontal-only sub-pixel convolution for 8-bit samples.
//
// Applies the kernel chosen by subpel_x_q4 along each row. Rounding is done
// in two steps: first by conv_params->round_0, then by the remaining
// FILTER_BITS - round_0 bits, after which the value is clipped to 8 bits.
// The vertical filter arguments are ignored.
void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
                         int dst_stride, int w, int h,
                         const InterpFilterParams *filter_params_x,
                         const InterpFilterParams *filter_params_y,
                         const int subpel_x_q4, const int subpel_y_q4,
                         ConvolveParams *conv_params) {
  (void)filter_params_y;
  (void)subpel_y_q4;

  const int num_taps = filter_params_x->taps;
  const int fo_horiz = num_taps / 2 - 1;
  const int bits = FILTER_BITS - conv_params->round_0;

  assert(bits >= 0);
  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));

  const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_q4 & SUBPEL_MASK);

  for (int row = 0; row < h; ++row) {
    for (int col = 0; col < w; ++col) {
      // Index of the leftmost sample covered by the kernel.
      const int left = row * src_stride + col - fo_horiz;
      int32_t acc = 0;
      for (int tap = 0; tap < num_taps; ++tap) {
        acc += kernel[tap] * src[left + tap];
      }
      acc = ROUND_POWER_OF_TWO(acc, conv_params->round_0);
      dst[row * dst_stride + col] = clip_pixel(ROUND_POWER_OF_TWO(acc, bits));
    }
  }
}
227
// Copies a w x h block of 8-bit samples from src to dst, one row at a time.
// No filtering is applied; the filter and rounding arguments are ignored.
// memmove is used, so a source row and a destination row may overlap.
void av1_convolve_2d_copy_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
                               int dst_stride, int w, int h,
                               const InterpFilterParams *filter_params_x,
                               const InterpFilterParams *filter_params_y,
                               const int subpel_x_q4, const int subpel_y_q4,
                               ConvolveParams *conv_params) {
  (void)conv_params;
  (void)subpel_y_q4;
  (void)subpel_x_q4;
  (void)filter_params_y;
  (void)filter_params_x;

  int row = 0;
  while (row < h) {
    const uint8_t *const from = &src[row * src_stride];
    uint8_t *const to = &dst[row * dst_stride];
    memmove(to, from, w * sizeof(*src));
    ++row;
  }
}
244
// 2-D separable sub-pixel convolution for 8-bit compound prediction.
//
// The horizontal pass writes h + taps_y - 1 rows into a 16-bit intermediate
// buffer (rounded by round_0); the vertical pass rounds by round_1 and keeps
// the positive offset in the result.
//  - do_average == 0: the offset result is stored in conv_params->dst.
//  - do_average != 0: the result is combined with the value already held in
//    conv_params->dst (weighted by fwd_offset / bck_offset when
//    use_dist_wtd_comp_avg is set, otherwise a plain average), the offset is
//    removed, and the rounded, clipped pixel is written to dst8.
void av1_dist_wtd_convolve_2d_c(const uint8_t *src, int src_stride,
                                uint8_t *dst8, int dst8_stride, int w, int h,
                                const InterpFilterParams *filter_params_x,
                                const InterpFilterParams *filter_params_y,
                                const int subpel_x_q4, const int subpel_y_q4,
                                ConvolveParams *conv_params) {
  int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
  CONV_BUF_TYPE *const comp_buf = conv_params->dst;
  const int comp_stride = conv_params->dst_stride;
  const int taps_x = filter_params_x->taps;
  const int taps_y = filter_params_y->taps;
  const int im_rows = h + taps_y - 1;
  const int im_stride = w;
  const int fo_vert = taps_y / 2 - 1;
  const int fo_horiz = taps_x / 2 - 1;
  const int bd = 8;
  const int round_bits =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;

  // Horizontal pass, starting fo_vert rows above the block.
  const uint8_t *const top = src - fo_vert * src_stride;
  const int16_t *const kernel_x = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_q4 & SUBPEL_MASK);
  for (int r = 0; r < im_rows; ++r) {
    for (int c = 0; c < w; ++c) {
      int32_t acc = (1 << (bd + FILTER_BITS - 1));
      for (int t = 0; t < taps_x; ++t) {
        acc += kernel_x[t] * top[r * src_stride + c - fo_horiz + t];
      }
      assert(0 <= acc && acc < (1 << (bd + FILTER_BITS + 1)));
      im_block[r * im_stride + c] =
          (int16_t)ROUND_POWER_OF_TWO(acc, conv_params->round_0);
    }
  }

  // Vertical pass over the intermediate buffer.
  const int16_t *const im_center = im_block + fo_vert * im_stride;
  const int16_t *const kernel_y = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_q4 & SUBPEL_MASK);
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  for (int r = 0; r < h; ++r) {
    for (int c = 0; c < w; ++c) {
      int32_t acc = 1 << offset_bits;
      for (int t = 0; t < taps_y; ++t) {
        acc += kernel_y[t] * im_center[(r - fo_vert + t) * im_stride + c];
      }
      assert(0 <= acc && acc < (1 << (offset_bits + 2)));
      CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(acc, conv_params->round_1);

      if (!conv_params->do_average) {
        comp_buf[r * comp_stride + c] = res;
        continue;
      }

      int32_t blended = comp_buf[r * comp_stride + c];
      if (conv_params->use_dist_wtd_comp_avg) {
        blended =
            blended * conv_params->fwd_offset + res * conv_params->bck_offset;
        blended = blended >> DIST_PRECISION_BITS;
      } else {
        blended += res;
        blended = blended >> 1;
      }
      // Remove the offsets carried through both filter passes.
      blended -= (1 << (offset_bits - conv_params->round_1)) +
                 (1 << (offset_bits - conv_params->round_1 - 1));
      dst8[r * dst8_stride + c] =
          clip_pixel(ROUND_POWER_OF_TWO(blended, round_bits));
    }
  }
}
310
// Vertical-only sub-pixel convolution for 8-bit compound prediction.
//
// The filtered sum is scaled up by FILTER_BITS - round_0 bits, rounded by
// round_1 and biased by round_offset. With do_average == 0 that value is
// stored in conv_params->dst. Otherwise it is combined with the value already
// in conv_params->dst (weighted when use_dist_wtd_comp_avg is set, plain
// average otherwise), the bias is removed, and the rounded, clipped pixel is
// written to dst8. The horizontal filter arguments are ignored.
void av1_dist_wtd_convolve_y_c(const uint8_t *src, int src_stride,
                               uint8_t *dst8, int dst8_stride, int w, int h,
                               const InterpFilterParams *filter_params_x,
                               const InterpFilterParams *filter_params_y,
                               const int subpel_x_q4, const int subpel_y_q4,
                               ConvolveParams *conv_params) {
  (void)filter_params_x;
  (void)subpel_x_q4;

  CONV_BUF_TYPE *const comp_buf = conv_params->dst;
  const int comp_stride = conv_params->dst_stride;
  const int num_taps = filter_params_y->taps;
  const int fo_vert = num_taps / 2 - 1;
  const int bits = FILTER_BITS - conv_params->round_0;
  const int bd = 8;
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
                           (1 << (offset_bits - conv_params->round_1 - 1));
  const int round_bits =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;

  const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_q4 & SUBPEL_MASK);

  for (int r = 0; r < h; ++r) {
    // First source row covered by the kernel for this output row.
    const int top = r - fo_vert;
    for (int c = 0; c < w; ++c) {
      int32_t res = 0;
      for (int t = 0; t < num_taps; ++t) {
        res += kernel[t] * src[(top + t) * src_stride + c];
      }
      res *= (1 << bits);
      res = ROUND_POWER_OF_TWO(res, conv_params->round_1) + round_offset;

      if (!conv_params->do_average) {
        comp_buf[r * comp_stride + c] = res;
        continue;
      }

      int32_t blended = comp_buf[r * comp_stride + c];
      if (conv_params->use_dist_wtd_comp_avg) {
        blended =
            blended * conv_params->fwd_offset + res * conv_params->bck_offset;
        blended = blended >> DIST_PRECISION_BITS;
      } else {
        blended += res;
        blended = blended >> 1;
      }
      blended -= round_offset;
      dst8[r * dst8_stride + c] =
          clip_pixel(ROUND_POWER_OF_TWO(blended, round_bits));
    }
  }
}
360
// Horizontal-only sub-pixel convolution for 8-bit compound prediction.
//
// The filtered sum is rounded by round_0, scaled up by FILTER_BITS - round_1
// bits and biased by round_offset. With do_average == 0 that value is stored
// in conv_params->dst. Otherwise it is combined with the value already in
// conv_params->dst (weighted when use_dist_wtd_comp_avg is set, plain average
// otherwise), the bias is removed, and the rounded, clipped pixel is written
// to dst8. The vertical filter arguments are ignored.
void av1_dist_wtd_convolve_x_c(const uint8_t *src, int src_stride,
                               uint8_t *dst8, int dst8_stride, int w, int h,
                               const InterpFilterParams *filter_params_x,
                               const InterpFilterParams *filter_params_y,
                               const int subpel_x_q4, const int subpel_y_q4,
                               ConvolveParams *conv_params) {
  (void)filter_params_y;
  (void)subpel_y_q4;

  CONV_BUF_TYPE *const comp_buf = conv_params->dst;
  const int comp_stride = conv_params->dst_stride;
  const int num_taps = filter_params_x->taps;
  const int fo_horiz = num_taps / 2 - 1;
  const int bits = FILTER_BITS - conv_params->round_1;
  const int bd = 8;
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
                           (1 << (offset_bits - conv_params->round_1 - 1));
  const int round_bits =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;

  const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_q4 & SUBPEL_MASK);

  for (int r = 0; r < h; ++r) {
    for (int c = 0; c < w; ++c) {
      // Index of the leftmost sample covered by the kernel.
      const int left = r * src_stride + c - fo_horiz;
      int32_t res = 0;
      for (int t = 0; t < num_taps; ++t) {
        res += kernel[t] * src[left + t];
      }
      res = (1 << bits) * ROUND_POWER_OF_TWO(res, conv_params->round_0);
      res += round_offset;

      if (!conv_params->do_average) {
        comp_buf[r * comp_stride + c] = res;
        continue;
      }

      int32_t blended = comp_buf[r * comp_stride + c];
      if (conv_params->use_dist_wtd_comp_avg) {
        blended =
            blended * conv_params->fwd_offset + res * conv_params->bck_offset;
        blended = blended >> DIST_PRECISION_BITS;
      } else {
        blended += res;
        blended = blended >> 1;
      }
      blended -= round_offset;
      dst8[r * dst8_stride + c] =
          clip_pixel(ROUND_POWER_OF_TWO(blended, round_bits));
    }
  }
}
410
// Unfiltered (integer-position) path for 8-bit compound prediction.
//
// Each source pixel is shifted left by 2 * FILTER_BITS - round_0 - round_1
// bits and biased by round_offset. With do_average == 0 that value is stored
// in conv_params->dst. Otherwise it is combined with the value already in
// conv_params->dst (weighted when use_dist_wtd_comp_avg is set, plain average
// otherwise), the bias is removed, and the rounded, clipped pixel is written
// to dst8. The filter arguments are ignored.
void av1_dist_wtd_convolve_2d_copy_c(
    const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
    int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_q4,
    const int subpel_y_q4, ConvolveParams *conv_params) {
  (void)filter_params_x;
  (void)filter_params_y;
  (void)subpel_x_q4;
  (void)subpel_y_q4;

  CONV_BUF_TYPE *const comp_buf = conv_params->dst;
  const int comp_stride = conv_params->dst_stride;
  const int bits =
      FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
  const int bd = 8;
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
                           (1 << (offset_bits - conv_params->round_1 - 1));

  for (int r = 0; r < h; ++r) {
    for (int c = 0; c < w; ++c) {
      CONV_BUF_TYPE res = src[r * src_stride + c] << bits;
      res += round_offset;

      if (!conv_params->do_average) {
        comp_buf[r * comp_stride + c] = res;
        continue;
      }

      int32_t blended = comp_buf[r * comp_stride + c];
      if (conv_params->use_dist_wtd_comp_avg) {
        blended =
            blended * conv_params->fwd_offset + res * conv_params->bck_offset;
        blended = blended >> DIST_PRECISION_BITS;
      } else {
        blended += res;
        blended = blended >> 1;
      }
      blended -= round_offset;
      dst8[r * dst8_stride + c] = clip_pixel(ROUND_POWER_OF_TWO(blended, bits));
    }
  }
}
451
// 2-D separable convolution with per-pixel position stepping (8-bit samples).
//
// The source position advances by x_step_qn / y_step_qn per output pixel in
// SCALE_SUBPEL_BITS fixed point, starting at subpel_x_qn / subpel_y_qn, and
// the kernel is re-selected for every pixel from the fractional position.
// The horizontal pass fills a 16-bit intermediate buffer; the vertical pass
// walks it column by column.
//  - is_compound == 0: offsets are removed and the rounded, clipped pixel is
//    written to dst8.
//  - is_compound != 0, do_average == 0: the offset result goes to
//    conv_params->dst.
//  - is_compound != 0, do_average != 0: the result is combined with the value
//    in conv_params->dst (weighted or plain average), offsets are removed and
//    the rounded, clipped pixel is written to dst8.
void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst8,
                             int dst8_stride, int w, int h,
                             const InterpFilterParams *filter_params_x,
                             const InterpFilterParams *filter_params_y,
                             const int subpel_x_qn, const int x_step_qn,
                             const int subpel_y_qn, const int y_step_qn,
                             ConvolveParams *conv_params) {
  int16_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE];
  // Rows needed to reach the source position of the last output row, plus the
  // vertical kernel length.
  const int im_rows =
      (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) +
      filter_params_y->taps;
  CONV_BUF_TYPE *const comp_buf = conv_params->dst;
  const int comp_stride = conv_params->dst_stride;
  const int bits =
      FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
  assert(bits >= 0);
  const int im_stride = w;
  const int taps_x = filter_params_x->taps;
  const int taps_y = filter_params_y->taps;
  const int fo_vert = taps_y / 2 - 1;
  const int fo_horiz = taps_x / 2 - 1;
  const int bd = 8;

  // Horizontal pass: intermediate row r comes from source row r - fo_vert.
  for (int r = 0; r < im_rows; ++r) {
    const uint8_t *const src_row = src + (r - fo_vert) * src_stride;
    int16_t *const im_row = im_block + r * im_stride;
    int x_qn = subpel_x_qn;
    for (int c = 0; c < w; ++c, x_qn += x_step_qn) {
      const uint8_t *const src_x = &src_row[(x_qn >> SCALE_SUBPEL_BITS)];
      const int phase_x = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
      assert(phase_x < SUBPEL_SHIFTS);
      const int16_t *const kernel_x =
          av1_get_interp_filter_subpel_kernel(filter_params_x, phase_x);
      int32_t acc = (1 << (bd + FILTER_BITS - 1));
      for (int t = 0; t < taps_x; ++t) {
        acc += kernel_x[t] * src_x[t - fo_horiz];
      }
      assert(0 <= acc && acc < (1 << (bd + FILTER_BITS + 1)));
      im_row[c] = (int16_t)ROUND_POWER_OF_TWO(acc, conv_params->round_0);
    }
  }

  // Vertical pass: one output column at a time.
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  for (int c = 0; c < w; ++c) {
    const int16_t *const im_col = im_block + fo_vert * im_stride + c;
    int y_qn = subpel_y_qn;
    for (int r = 0; r < h; ++r, y_qn += y_step_qn) {
      const int16_t *const src_y =
          &im_col[(y_qn >> SCALE_SUBPEL_BITS) * im_stride];
      const int phase_y = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
      assert(phase_y < SUBPEL_SHIFTS);
      const int16_t *const kernel_y =
          av1_get_interp_filter_subpel_kernel(filter_params_y, phase_y);
      int32_t acc = 1 << offset_bits;
      for (int t = 0; t < taps_y; ++t) {
        acc += kernel_y[t] * src_y[(t - fo_vert) * im_stride];
      }
      assert(0 <= acc && acc < (1 << (offset_bits + 2)));
      CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(acc, conv_params->round_1);

      if (!conv_params->is_compound) {
        /* Subtract round offset and convolve round */
        int32_t px = res - ((1 << (offset_bits - conv_params->round_1)) +
                            (1 << (offset_bits - conv_params->round_1 - 1)));
        dst8[r * dst8_stride + c] = clip_pixel(ROUND_POWER_OF_TWO(px, bits));
        continue;
      }

      if (!conv_params->do_average) {
        comp_buf[r * comp_stride + c] = res;
        continue;
      }

      int32_t blended = comp_buf[r * comp_stride + c];
      if (conv_params->use_dist_wtd_comp_avg) {
        blended =
            blended * conv_params->fwd_offset + res * conv_params->bck_offset;
        blended = blended >> DIST_PRECISION_BITS;
      } else {
        blended += res;
        blended = blended >> 1;
      }
      /* Subtract round offset and convolve round */
      blended = blended - ((1 << (offset_bits - conv_params->round_1)) +
                           (1 << (offset_bits - conv_params->round_1 - 1)));
      dst8[r * dst8_stride + c] = clip_pixel(ROUND_POWER_OF_TWO(blended, bits));
    }
  }
}
537
// Forwards to av1_convolve_2d_scale() after checking that a compound
// destination buffer has been supplied whenever compound prediction is
// requested.
static void convolve_2d_scale_wrapper(
    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int x_step_qn, const int subpel_y_qn, const int y_step_qn,
    ConvolveParams *conv_params) {
  assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
  av1_convolve_2d_scale(src, src_stride, dst, dst_stride, w, h, filter_params_x,
                        filter_params_y, subpel_x_qn, x_step_qn, subpel_y_qn,
                        y_step_qn, conv_params);
}
551
552 // TODO(huisu@google.com): bilinear filtering only needs 2 taps in general. So
553 // we may create optimized code to do 2-tap filtering for all bilinear filtering
554 // usages, not just IntraBC.
// Dispatches an IntraBC block to the C reference convolution that matches
// which directions have a non-zero sub-pixel offset. A direction with an
// offset uses av1_intrabc_filter_params; the other direction gets NULL. The
// kernels are always called with sub-pixel index 0.
static void convolve_2d_for_intrabc(const uint8_t *src, int src_stride,
                                    uint8_t *dst, int dst_stride, int w, int h,
                                    int subpel_x_q4, int subpel_y_q4,
                                    ConvolveParams *conv_params) {
  const int has_x = subpel_x_q4 != 0;
  const int has_y = subpel_y_q4 != 0;
  const InterpFilterParams *const params_x =
      has_x ? &av1_intrabc_filter_params : NULL;
  const InterpFilterParams *const params_y =
      has_y ? &av1_intrabc_filter_params : NULL;

  if (has_x && has_y) {
    av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h, params_x,
                         params_y, 0, 0, conv_params);
    return;
  }
  if (has_x) {
    av1_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h, params_x,
                        params_y, 0, 0, conv_params);
    return;
  }
  av1_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h, params_x,
                      params_y, 0, 0, conv_params);
}
574
// Entry point that selects the 8-bit convolution routine for a block.
//
//  - IntraBC blocks with a non-zero sub-pixel offset are handled by
//    convolve_2d_for_intrabc().
//  - Scaled references go through convolve_2d_scale_wrapper().
//  - Everything else is dispatched through the sf->convolve table, indexed by
//    whether x has an offset, whether y has an offset, and whether the
//    prediction is compound.
// Filter parameters are looked up only for directions that need them (a
// non-zero offset, or any scaled prediction); otherwise NULL is passed.
void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst,
                            int dst_stride, int w, int h,
                            InterpFilters interp_filters, const int subpel_x_q4,
                            int x_step_q4, const int subpel_y_q4, int y_step_q4,
                            int scaled, ConvolveParams *conv_params,
                            const struct scale_factors *sf, int is_intrabc) {
  assert(IMPLIES(is_intrabc, !scaled));

  const int has_subpel_x = subpel_x_q4 != 0;
  const int has_subpel_y = subpel_y_q4 != 0;

  if (is_intrabc && (has_subpel_x || has_subpel_y)) {
    convolve_2d_for_intrabc(src, src_stride, dst, dst_stride, w, h, subpel_x_q4,
                            subpel_y_q4, conv_params);
    return;
  }

  const InterpFilterParams *filter_params_x = NULL;
  const InterpFilterParams *filter_params_y = NULL;
  if (has_subpel_x || scaled) {
    const InterpFilter filter_x = av1_extract_interp_filter(interp_filters, 1);
    filter_params_x = av1_get_interp_filter_params_with_block_size(filter_x, w);
  }
  if (has_subpel_y || scaled) {
    const InterpFilter filter_y = av1_extract_interp_filter(interp_filters, 0);
    filter_params_y = av1_get_interp_filter_params_with_block_size(filter_y, h);
  }

  if (scaled) {
    convolve_2d_scale_wrapper(src, src_stride, dst, dst_stride, w, h,
                              filter_params_x, filter_params_y, subpel_x_q4,
                              x_step_q4, subpel_y_q4, y_step_q4, conv_params);
    return;
  }

  sf->convolve[has_subpel_x][has_subpel_y][conv_params->is_compound](
      src, src_stride, dst, dst_stride, w, h, filter_params_x, filter_params_y,
      subpel_x_q4, subpel_y_q4, conv_params);
}
620
// Copies a w x h block of 16-bit samples from src to dst, one row at a time.
// No filtering is applied; the filter, rounding and bit-depth arguments are
// ignored. memmove is used, so a source row and a destination row may overlap.
void av1_highbd_convolve_2d_copy_sr_c(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_q4,
    const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
  (void)bd;
  (void)conv_params;
  (void)subpel_y_q4;
  (void)subpel_x_q4;
  (void)filter_params_y;
  (void)filter_params_x;

  int row = 0;
  while (row < h) {
    const uint16_t *const from = &src[row * src_stride];
    uint16_t *const to = &dst[row * dst_stride];
    memmove(to, from, w * sizeof(*src));
    ++row;
  }
}
637
// Horizontal-only sub-pixel convolution for 16-bit sample buffers.
//
// Applies the kernel chosen by subpel_x_q4 along each row. Rounding is done
// in two steps (round_0, then the remaining FILTER_BITS - round_0 bits) and
// the result is clipped with clip_pixel_highbd() for bit depth bd. The
// vertical filter arguments are ignored.
void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride,
                                uint16_t *dst, int dst_stride, int w, int h,
                                const InterpFilterParams *filter_params_x,
                                const InterpFilterParams *filter_params_y,
                                const int subpel_x_q4, const int subpel_y_q4,
                                ConvolveParams *conv_params, int bd) {
  (void)filter_params_y;
  (void)subpel_y_q4;

  const int num_taps = filter_params_x->taps;
  const int fo_horiz = num_taps / 2 - 1;
  const int bits = FILTER_BITS - conv_params->round_0;

  assert(bits >= 0);
  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));

  const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_q4 & SUBPEL_MASK);

  for (int row = 0; row < h; ++row) {
    for (int col = 0; col < w; ++col) {
      // Index of the leftmost sample covered by the kernel.
      const int left = row * src_stride + col - fo_horiz;
      int32_t acc = 0;
      for (int tap = 0; tap < num_taps; ++tap) {
        acc += kernel[tap] * src[left + tap];
      }
      acc = ROUND_POWER_OF_TWO(acc, conv_params->round_0);
      dst[row * dst_stride + col] =
          clip_pixel_highbd(ROUND_POWER_OF_TWO(acc, bits), bd);
    }
  }
}
668
// Vertical-only sub-pixel convolution for 16-bit sample buffers.
//
// Applies the kernel chosen by subpel_y_q4 down each column, rounds by
// FILTER_BITS and clips with clip_pixel_highbd() for bit depth bd. The
// horizontal filter arguments are ignored; conv_params is only consulted by
// the assertions.
void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride,
                                uint16_t *dst, int dst_stride, int w, int h,
                                const InterpFilterParams *filter_params_x,
                                const InterpFilterParams *filter_params_y,
                                const int subpel_x_q4, const int subpel_y_q4,
                                ConvolveParams *conv_params, int bd) {
  (void)filter_params_x;
  (void)subpel_x_q4;
  (void)conv_params;  // Only referenced by the asserts below.

  assert(conv_params->round_0 <= FILTER_BITS);
  assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) ||
         ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));

  const int num_taps = filter_params_y->taps;
  const int fo_vert = num_taps / 2 - 1;
  const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_q4 & SUBPEL_MASK);

  for (int row = 0; row < h; ++row) {
    // First source row covered by the kernel for this output row.
    const int top = row - fo_vert;
    for (int col = 0; col < w; ++col) {
      int32_t acc = 0;
      for (int tap = 0; tap < num_taps; ++tap) {
        acc += kernel[tap] * src[(top + tap) * src_stride + col];
      }
      dst[row * dst_stride + col] =
          clip_pixel_highbd(ROUND_POWER_OF_TWO(acc, FILTER_BITS), bd);
    }
  }
}
697
// 2-D separable sub-pixel convolution for 16-bit sample buffers, written
// directly to the destination.
//
// The horizontal pass filters h + taps_y - 1 source rows (starting fo_vert
// rows above src) into a 16-bit intermediate buffer, rounding by round_0.
// The vertical pass rounds by round_1, removes the offsets that kept the
// intermediate sums non-negative, applies the remaining
// 2 * FILTER_BITS - round_0 - round_1 bits of rounding and clips with
// clip_pixel_highbd() for bit depth bd.
void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride,
                                 uint16_t *dst, int dst_stride, int w, int h,
                                 const InterpFilterParams *filter_params_x,
                                 const InterpFilterParams *filter_params_y,
                                 const int subpel_x_q4, const int subpel_y_q4,
                                 ConvolveParams *conv_params, int bd) {
  int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
  const int taps_x = filter_params_x->taps;
  const int taps_y = filter_params_y->taps;
  const int im_rows = h + taps_y - 1;
  const int im_stride = w;
  const int fo_vert = taps_y / 2 - 1;
  const int fo_horiz = taps_x / 2 - 1;
  const int bits =
      FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
  assert(bits >= 0);

  // Horizontal pass, starting fo_vert rows above the block.
  const uint16_t *const top = src - fo_vert * src_stride;
  const int16_t *const kernel_x = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_q4 & SUBPEL_MASK);
  for (int r = 0; r < im_rows; ++r) {
    for (int c = 0; c < w; ++c) {
      int32_t acc = (1 << (bd + FILTER_BITS - 1));
      for (int t = 0; t < taps_x; ++t) {
        acc += kernel_x[t] * top[r * src_stride + c - fo_horiz + t];
      }
      assert(0 <= acc && acc < (1 << (bd + FILTER_BITS + 1)));
      im_block[r * im_stride + c] =
          ROUND_POWER_OF_TWO(acc, conv_params->round_0);
    }
  }

  // Vertical pass over the intermediate buffer.
  const int16_t *const im_center = im_block + fo_vert * im_stride;
  const int16_t *const kernel_y = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_q4 & SUBPEL_MASK);
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  for (int r = 0; r < h; ++r) {
    for (int c = 0; c < w; ++c) {
      int32_t acc = 1 << offset_bits;
      for (int t = 0; t < taps_y; ++t) {
        acc += kernel_y[t] * im_center[(r - fo_vert + t) * im_stride + c];
      }
      assert(0 <= acc && acc < (1 << (offset_bits + 2)));
      // Subtract the vertical offset and the scaled horizontal offset.
      int32_t res = ROUND_POWER_OF_TWO(acc, conv_params->round_1) -
                    ((1 << (offset_bits - conv_params->round_1)) +
                     (1 << (offset_bits - conv_params->round_1 - 1)));
      dst[r * dst_stride + c] =
          clip_pixel_highbd(ROUND_POWER_OF_TWO(res, bits), bd);
    }
  }
}
749
// High bit-depth separable 2D convolution for compound prediction (plain C).
//
// The horizontal pass fills a 16-bit intermediate block (rounded by
// conv_params->round_0); the vertical pass rounds by conv_params->round_1.
// When conv_params->do_average is 0 the offset result is stored in
// conv_params->dst. Otherwise it is blended with the value already stored
// there -- weighted by fwd_offset/bck_offset when use_dist_wtd_comp_avg is
// set, a plain average if not -- the offset is removed, and the pixel is
// clipped to 'bd' bits and written to dst16.
void av1_highbd_dist_wtd_convolve_2d_c(
    const uint16_t *src, int src_stride, uint16_t *dst16, int dst16_stride,
    int w, int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_q4,
    const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
  int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
  CONV_BUF_TYPE *const comp_buf = conv_params->dst;
  const int comp_stride = conv_params->dst_stride;
  const int taps_x = filter_params_x->taps;
  const int taps_y = filter_params_y->taps;
  const int fo_horiz = taps_x / 2 - 1;
  const int fo_vert = taps_y / 2 - 1;
  const int im_stride = w;
  const int im_h = h + taps_y - 1;
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  const int round_bits = 2 * FILTER_BITS - round_0 - round_1;
  assert(round_bits >= 0);

  // Horizontal pass, starting fo_vert rows above the block.
  const uint16_t *const src_top = src - fo_vert * src_stride;
  const int16_t *const x_filter = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_q4 & SUBPEL_MASK);
  const int32_t horiz_offset = 1 << (bd + FILTER_BITS - 1);
  for (int row = 0; row < im_h; ++row) {
    int16_t *const im_row = &im_block[row * im_stride];
    for (int col = 0; col < w; ++col) {
      const uint16_t *const tap_src =
          &src_top[row * src_stride + col - fo_horiz];
      int32_t acc = horiz_offset;
      for (int t = 0; t < taps_x; ++t) acc += x_filter[t] * tap_src[t];
      assert(0 <= acc && acc < (1 << (bd + FILTER_BITS + 1)));
      im_row[col] = (int16_t)ROUND_POWER_OF_TWO(acc, round_0);
    }
  }

  // Vertical pass. Output row 'row' consumes intermediate rows
  // row .. row + taps_y - 1.
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  const int32_t round_offset =
      (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));
  const int16_t *const y_filter = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_q4 & SUBPEL_MASK);
  for (int row = 0; row < h; ++row) {
    for (int col = 0; col < w; ++col) {
      const int16_t *const col_src = &im_block[row * im_stride + col];
      int32_t acc = 1 << offset_bits;
      for (int t = 0; t < taps_y; ++t) {
        acc += y_filter[t] * col_src[t * im_stride];
      }
      assert(0 <= acc && acc < (1 << (offset_bits + 2)));
      const CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(acc, round_1);
      CONV_BUF_TYPE *const comp_px = &comp_buf[row * comp_stride + col];

      if (!conv_params->do_average) {
        // First prediction: keep the offset value for the later blend.
        *comp_px = res;
        continue;
      }

      int32_t blend = *comp_px;
      if (conv_params->use_dist_wtd_comp_avg) {
        blend = (blend * conv_params->fwd_offset +
                 res * conv_params->bck_offset) >>
                DIST_PRECISION_BITS;
      } else {
        blend = (blend + res) >> 1;
      }
      blend -= round_offset;
      dst16[row * dst16_stride + col] =
          clip_pixel_highbd(ROUND_POWER_OF_TWO(blend, round_bits), bd);
    }
  }
}
816
// High bit-depth horizontal-only convolution for compound prediction
// (plain C).
//
// Each sample is filtered horizontally, rounded by conv_params->round_0,
// scaled by 1 << (FILTER_BITS - round_1) and biased by round_offset. With
// conv_params->do_average == 0 the value is stored in conv_params->dst;
// otherwise it is blended with the value already there (distance weighted
// or plain average), the bias is removed and the clipped pixel goes to dst16.
// filter_params_y and subpel_y_q4 are unused.
void av1_highbd_dist_wtd_convolve_x_c(
    const uint16_t *src, int src_stride, uint16_t *dst16, int dst16_stride,
    int w, int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_q4,
    const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
  CONV_BUF_TYPE *const comp_buf = conv_params->dst;
  const int comp_stride = conv_params->dst_stride;
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  const int taps_x = filter_params_x->taps;
  const int fo_horiz = taps_x / 2 - 1;
  const int bits = FILTER_BITS - round_1;
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  const int round_offset =
      (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));
  const int round_bits = 2 * FILTER_BITS - round_0 - round_1;
  assert(round_bits >= 0);
  (void)filter_params_y;
  (void)subpel_y_q4;
  assert(bits >= 0);

  const int16_t *const x_filter = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_q4 & SUBPEL_MASK);
  for (int row = 0; row < h; ++row) {
    for (int col = 0; col < w; ++col) {
      const uint16_t *const tap_src = &src[row * src_stride + col - fo_horiz];
      int32_t acc = 0;
      for (int t = 0; t < taps_x; ++t) acc += x_filter[t] * tap_src[t];
      const int32_t res =
          (1 << bits) * ROUND_POWER_OF_TWO(acc, round_0) + round_offset;
      CONV_BUF_TYPE *const comp_px = &comp_buf[row * comp_stride + col];

      if (!conv_params->do_average) {
        *comp_px = res;
        continue;
      }

      int32_t blend = *comp_px;
      if (conv_params->use_dist_wtd_comp_avg) {
        blend = (blend * conv_params->fwd_offset +
                 res * conv_params->bck_offset) >>
                DIST_PRECISION_BITS;
      } else {
        blend = (blend + res) >> 1;
      }
      blend -= round_offset;
      dst16[row * dst16_stride + col] =
          clip_pixel_highbd(ROUND_POWER_OF_TWO(blend, round_bits), bd);
    }
  }
}
865
// High bit-depth vertical-only convolution for compound prediction (plain C).
//
// Each sample is filtered vertically, scaled by 1 << (FILTER_BITS - round_0),
// rounded by conv_params->round_1 and biased by round_offset. With
// conv_params->do_average == 0 the value is stored in conv_params->dst;
// otherwise it is blended with the value already there (distance weighted
// or plain average), the bias is removed and the clipped pixel goes to dst16.
// filter_params_x and subpel_x_q4 are unused.
void av1_highbd_dist_wtd_convolve_y_c(
    const uint16_t *src, int src_stride, uint16_t *dst16, int dst16_stride,
    int w, int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_q4,
    const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
  CONV_BUF_TYPE *const comp_buf = conv_params->dst;
  const int comp_stride = conv_params->dst_stride;
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  const int taps_y = filter_params_y->taps;
  const int fo_vert = taps_y / 2 - 1;
  const int bits = FILTER_BITS - round_0;
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  const int round_offset =
      (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));
  const int round_bits = 2 * FILTER_BITS - round_0 - round_1;
  assert(round_bits >= 0);
  (void)filter_params_x;
  (void)subpel_x_q4;
  assert(bits >= 0);

  const int16_t *const y_filter = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_q4 & SUBPEL_MASK);
  for (int row = 0; row < h; ++row) {
    for (int col = 0; col < w; ++col) {
      // First tap sits fo_vert rows above the output position.
      const uint16_t *const col_src =
          &src[(row - fo_vert) * src_stride + col];
      int32_t acc = 0;
      for (int t = 0; t < taps_y; ++t) {
        acc += y_filter[t] * col_src[t * src_stride];
      }
      acc *= (1 << bits);
      const int32_t res = ROUND_POWER_OF_TWO(acc, round_1) + round_offset;
      CONV_BUF_TYPE *const comp_px = &comp_buf[row * comp_stride + col];

      if (!conv_params->do_average) {
        *comp_px = res;
        continue;
      }

      int32_t blend = *comp_px;
      if (conv_params->use_dist_wtd_comp_avg) {
        blend = (blend * conv_params->fwd_offset +
                 res * conv_params->bck_offset) >>
                DIST_PRECISION_BITS;
      } else {
        blend = (blend + res) >> 1;
      }
      blend -= round_offset;
      dst16[row * dst16_stride + col] =
          clip_pixel_highbd(ROUND_POWER_OF_TWO(blend, round_bits), bd);
    }
  }
}
914
// High bit-depth compound "copy" path (no filtering, plain C).
//
// Each source sample is shifted left by
// 2 * FILTER_BITS - round_0 - round_1 and biased by round_offset. With
// conv_params->do_average == 0 the value is stored in conv_params->dst;
// otherwise it is blended with the value already there (distance weighted
// or plain average), the bias is removed and the clipped pixel goes to dst16.
// The filter parameters and sub-pixel positions are unused.
void av1_highbd_dist_wtd_convolve_2d_copy_c(
    const uint16_t *src, int src_stride, uint16_t *dst16, int dst16_stride,
    int w, int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_q4,
    const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
  CONV_BUF_TYPE *const comp_buf = conv_params->dst;
  const int comp_stride = conv_params->dst_stride;
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  const int bits = FILTER_BITS * 2 - round_1 - round_0;
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  const int round_offset =
      (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));
  assert(bits >= 0);
  (void)filter_params_x;
  (void)filter_params_y;
  (void)subpel_x_q4;
  (void)subpel_y_q4;

  for (int row = 0; row < h; ++row) {
    const uint16_t *const src_row = &src[row * src_stride];
    for (int col = 0; col < w; ++col) {
      // Stored in CONV_BUF_TYPE, so the value is reduced to that type's
      // width before it is used below.
      const CONV_BUF_TYPE res =
          (CONV_BUF_TYPE)((src_row[col] << bits) + round_offset);
      CONV_BUF_TYPE *const comp_px = &comp_buf[row * comp_stride + col];

      if (!conv_params->do_average) {
        *comp_px = res;
        continue;
      }

      int32_t blend = *comp_px;
      if (conv_params->use_dist_wtd_comp_avg) {
        blend = (blend * conv_params->fwd_offset +
                 res * conv_params->bck_offset) >>
                DIST_PRECISION_BITS;
      } else {
        blend = (blend + res) >> 1;
      }
      blend -= round_offset;
      dst16[row * dst16_stride + col] =
          clip_pixel_highbd(ROUND_POWER_OF_TWO(blend, bits), bd);
    }
  }
}
955
// High bit-depth separable 2D convolution with scaling (plain C).
//
// The sampling position advances by x_step_qn / y_step_qn per output sample,
// starting from subpel_x_qn / subpel_y_qn; the integer part
// (>> SCALE_SUBPEL_BITS) selects the source sample and the fractional part
// selects the kernel. A horizontal pass fills a 16-bit intermediate block,
// then a vertical pass produces the output:
//   - non-compound: offset removed, rounded, clipped to 'bd', written to dst;
//   - compound, do_average == 0: offset value stored in conv_params->dst;
//   - compound, do_average != 0: blended with the value in conv_params->dst
//     (distance weighted or plain average), then finished like the
//     non-compound case and written to dst.
void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride,
                                    uint16_t *dst, int dst_stride, int w, int h,
                                    const InterpFilterParams *filter_params_x,
                                    const InterpFilterParams *filter_params_y,
                                    const int subpel_x_qn, const int x_step_qn,
                                    const int subpel_y_qn, const int y_step_qn,
                                    ConvolveParams *conv_params, int bd) {
  int16_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE];
  const int taps_x = filter_params_x->taps;
  const int taps_y = filter_params_y->taps;
  const int fo_horiz = taps_x / 2 - 1;
  const int fo_vert = taps_y / 2 - 1;
  const int im_stride = w;
  // Rows needed by the vertical pass: the integer position of the last output
  // row plus one full kernel.
  const int im_h =
      (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) + taps_y;
  CONV_BUF_TYPE *const comp_buf = conv_params->dst;
  const int comp_stride = conv_params->dst_stride;
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  const int bits = FILTER_BITS * 2 - round_0 - round_1;
  assert(bits >= 0);

  // Horizontal pass, starting fo_vert rows above the block.
  const uint16_t *const src_top = src - fo_vert * src_stride;
  const int32_t horiz_offset = 1 << (bd + FILTER_BITS - 1);
  for (int row = 0; row < im_h; ++row) {
    const uint16_t *const src_row = src_top + row * src_stride;
    int16_t *const im_row = &im_block[row * im_stride];
    int x_qn = subpel_x_qn;
    for (int col = 0; col < w; ++col, x_qn += x_step_qn) {
      const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
      assert(x_filter_idx < SUBPEL_SHIFTS);
      const int16_t *const x_filter =
          av1_get_interp_filter_subpel_kernel(filter_params_x, x_filter_idx);
      const uint16_t *const tap_src =
          &src_row[(x_qn >> SCALE_SUBPEL_BITS) - fo_horiz];
      int32_t acc = horiz_offset;
      for (int t = 0; t < taps_x; ++t) acc += x_filter[t] * tap_src[t];
      assert(0 <= acc && acc < (1 << (bd + FILTER_BITS + 1)));
      im_row[col] = (int16_t)ROUND_POWER_OF_TWO(acc, round_0);
    }
  }

  // Vertical pass, one output column at a time.
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  const int32_t round_offset =
      (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));
  for (int col = 0; col < w; ++col) {
    int y_qn = subpel_y_qn;
    for (int row = 0; row < h; ++row, y_qn += y_step_qn) {
      const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
      assert(y_filter_idx < SUBPEL_SHIFTS);
      const int16_t *const y_filter =
          av1_get_interp_filter_subpel_kernel(filter_params_y, y_filter_idx);
      // The first tap reads intermediate row (y_qn >> SCALE_SUBPEL_BITS).
      const int16_t *const col_src =
          &im_block[(y_qn >> SCALE_SUBPEL_BITS) * im_stride + col];
      int32_t acc = 1 << offset_bits;
      for (int t = 0; t < taps_y; ++t) {
        acc += y_filter[t] * col_src[t * im_stride];
      }
      assert(0 <= acc && acc < (1 << (offset_bits + 2)));
      const CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(acc, round_1);

      if (conv_params->is_compound && !conv_params->do_average) {
        comp_buf[row * comp_stride + col] = res;
        continue;
      }

      int32_t px = res;
      if (conv_params->is_compound) {
        px = comp_buf[row * comp_stride + col];
        if (conv_params->use_dist_wtd_comp_avg) {
          px = (px * conv_params->fwd_offset + res * conv_params->bck_offset) >>
               DIST_PRECISION_BITS;
        } else {
          px = (px + res) >> 1;
        }
      }
      // Subtract round offset and convolve round.
      px -= round_offset;
      dst[row * dst_stride + col] =
          clip_pixel_highbd(ROUND_POWER_OF_TWO(px, bits), bd);
    }
  }
}
1041
// Dispatches a high bit-depth IntraBC prediction to the 2D, horizontal-only
// or vertical-only single-reference convolve, depending on which of
// subpel_x_q4 / subpel_y_q4 is non-zero. av1_intrabc_filter_params is used
// for each filtered direction (NULL for an unfiltered one), and the callee
// is always given sub-pixel position 0.
static void highbd_convolve_2d_for_intrabc(const uint16_t *src, int src_stride,
                                           uint16_t *dst, int dst_stride, int w,
                                           int h, int subpel_x_q4,
                                           int subpel_y_q4,
                                           ConvolveParams *conv_params,
                                           int bd) {
  const int filter_x = subpel_x_q4 != 0;
  const int filter_y = subpel_y_q4 != 0;
  const InterpFilterParams *const filter_params_x =
      filter_x ? &av1_intrabc_filter_params : NULL;
  const InterpFilterParams *const filter_params_y =
      filter_y ? &av1_intrabc_filter_params : NULL;

  if (filter_x && filter_y) {
    av1_highbd_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
                                filter_params_x, filter_params_y, 0, 0,
                                conv_params, bd);
    return;
  }
  if (filter_x) {
    av1_highbd_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h,
                               filter_params_x, filter_params_y, 0, 0,
                               conv_params, bd);
    return;
  }
  av1_highbd_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h,
                             filter_params_x, filter_params_y, 0, 0,
                             conv_params, bd);
}
1066
// Entry point for high bit-depth inter/IntraBC prediction convolution.
//
// src8/dst8 are converted with CONVERT_TO_SHORTPTR before use. Dispatch:
//   - IntraBC with a non-zero sub-pixel offset: highbd_convolve_2d_for_intrabc.
//   - scaled: av1_highbd_convolve_2d_scale with the x/y steps.
//   - otherwise: sf->highbd_convolve[x != 0][y != 0][is_compound].
// Filter parameters are looked up only for a direction that is filtered
// (non-zero sub-pixel offset, or scaled); otherwise NULL is passed.
// IntraBC must not be combined with scaling (asserted).
void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride,
                                   uint8_t *dst8, int dst_stride, int w, int h,
                                   InterpFilters interp_filters,
                                   const int subpel_x_q4, int x_step_q4,
                                   const int subpel_y_q4, int y_step_q4,
                                   int scaled, ConvolveParams *conv_params,
                                   const struct scale_factors *sf,
                                   int is_intrabc, int bd) {
  assert(IMPLIES(is_intrabc, !scaled));
  (void)x_step_q4;
  (void)y_step_q4;
  (void)dst_stride;
  const uint16_t *const src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *const dst = CONVERT_TO_SHORTPTR(dst8);
  const int has_subpel_x = subpel_x_q4 != 0;
  const int has_subpel_y = subpel_y_q4 != 0;

  if (is_intrabc && (has_subpel_x || has_subpel_y)) {
    highbd_convolve_2d_for_intrabc(src, src_stride, dst, dst_stride, w, h,
                                   subpel_x_q4, subpel_y_q4, conv_params, bd);
    return;
  }

  // Component 1 of interp_filters is used horizontally, component 0
  // vertically.
  const InterpFilterParams *filter_params_x = NULL;
  const InterpFilterParams *filter_params_y = NULL;
  if (has_subpel_x || scaled) {
    const InterpFilter filter_x = av1_extract_interp_filter(interp_filters, 1);
    filter_params_x = av1_get_interp_filter_params_with_block_size(filter_x, w);
  }
  if (has_subpel_y || scaled) {
    const InterpFilter filter_y = av1_extract_interp_filter(interp_filters, 0);
    filter_params_y = av1_get_interp_filter_params_with_block_size(filter_y, h);
  }

  if (scaled) {
    assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
    av1_highbd_convolve_2d_scale(src, src_stride, dst, dst_stride, w, h,
                                 filter_params_x, filter_params_y, subpel_x_q4,
                                 x_step_q4, subpel_y_q4, y_step_q4, conv_params,
                                 bd);
    return;
  }

  sf->highbd_convolve[has_subpel_x][has_subpel_y][conv_params->is_compound](
      src, src_stride, dst, dst_stride, w, h, filter_params_x, filter_params_y,
      subpel_x_q4, subpel_y_q4, conv_params, bd);
}
1123
// Note: Fixed-size intermediate buffers place limits on the parameters
// of some functions. 2D filtering proceeds in 2 steps:
//   (1) Interpolate horizontally into an intermediate buffer, temp.
//   (2) Interpolate temp vertically to derive the sub-pixel result.
// Deriving the maximum number of rows in the temp buffer (263):
// --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
// --Largest block size is 128x128 pixels.
// --128 rows in the downscaled frame span a distance of (128 - 1) * 32 in the
//   original frame (in 1/16th pixel units).
// --Must round-up because block may be located at sub-pixel position.
// --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
// --ceil(((128 - 1) * 32 + 15) / 16) + 8 = 255 + 8 = 263.
1136 #define WIENER_MAX_EXT_SIZE 263
1137
horz_scalar_product(const uint8_t * a,const int16_t * b)1138 static INLINE int horz_scalar_product(const uint8_t *a, const int16_t *b) {
1139 int sum = 0;
1140 for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k];
1141 return sum;
1142 }
1143
highbd_horz_scalar_product(const uint16_t * a,const int16_t * b)1144 static INLINE int highbd_horz_scalar_product(const uint16_t *a,
1145 const int16_t *b) {
1146 int sum = 0;
1147 for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k];
1148 return sum;
1149 }
1150
highbd_vert_scalar_product(const uint16_t * a,ptrdiff_t a_stride,const int16_t * b)1151 static INLINE int highbd_vert_scalar_product(const uint16_t *a,
1152 ptrdiff_t a_stride,
1153 const int16_t *b) {
1154 int sum = 0;
1155 for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k * a_stride] * b[k];
1156 return sum;
1157 }
1158
get_filter_base(const int16_t * filter)1159 static const InterpKernel *get_filter_base(const int16_t *filter) {
1160 // NOTE: This assumes that the filter table is 256-byte aligned.
1161 // TODO(agrange) Modify to make independent of table alignment.
1162 return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF));
1163 }
1164
// Returns the index of kernel 'f' within the kernel table starting at 'base'
// (distance measured in whole InterpKernel entries).
static int get_filter_offset(const int16_t *f, const InterpKernel *base) {
  const InterpKernel *const kernel = (const InterpKernel *)(intptr_t)f;
  const ptrdiff_t index = kernel - base;
  return (int)index;
}
1168
convolve_add_src_horiz_hip(const uint8_t * src,ptrdiff_t src_stride,uint16_t * dst,ptrdiff_t dst_stride,const InterpKernel * x_filters,int x0_q4,int x_step_q4,int w,int h,int round0_bits)1169 static void convolve_add_src_horiz_hip(const uint8_t *src, ptrdiff_t src_stride,
1170 uint16_t *dst, ptrdiff_t dst_stride,
1171 const InterpKernel *x_filters, int x0_q4,
1172 int x_step_q4, int w, int h,
1173 int round0_bits) {
1174 const int bd = 8;
1175 src -= SUBPEL_TAPS / 2 - 1;
1176 for (int y = 0; y < h; ++y) {
1177 int x_q4 = x0_q4;
1178 for (int x = 0; x < w; ++x) {
1179 const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
1180 const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
1181 const int rounding = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) +
1182 (1 << (bd + FILTER_BITS - 1));
1183 const int sum = horz_scalar_product(src_x, x_filter) + rounding;
1184 dst[x] = (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, round0_bits), 0,
1185 WIENER_CLAMP_LIMIT(round0_bits, bd) - 1);
1186 x_q4 += x_step_q4;
1187 }
1188 src += src_stride;
1189 dst += dst_stride;
1190 }
1191 }
1192
convolve_add_src_vert_hip(const uint16_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const InterpKernel * y_filters,int y0_q4,int y_step_q4,int w,int h,int round1_bits)1193 static void convolve_add_src_vert_hip(const uint16_t *src, ptrdiff_t src_stride,
1194 uint8_t *dst, ptrdiff_t dst_stride,
1195 const InterpKernel *y_filters, int y0_q4,
1196 int y_step_q4, int w, int h,
1197 int round1_bits) {
1198 const int bd = 8;
1199 src -= src_stride * (SUBPEL_TAPS / 2 - 1);
1200
1201 for (int x = 0; x < w; ++x) {
1202 int y_q4 = y0_q4;
1203 for (int y = 0; y < h; ++y) {
1204 const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
1205 const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
1206 const int rounding =
1207 ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) -
1208 (1 << (bd + round1_bits - 1));
1209 const int sum =
1210 highbd_vert_scalar_product(src_y, src_stride, y_filter) + rounding;
1211 dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, round1_bits));
1212 y_q4 += y_step_q4;
1213 }
1214 ++src;
1215 ++dst;
1216 }
1217 }
1218
av1_wiener_convolve_add_src_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h,const ConvolveParams * conv_params)1219 void av1_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride,
1220 uint8_t *dst, ptrdiff_t dst_stride,
1221 const int16_t *filter_x, int x_step_q4,
1222 const int16_t *filter_y, int y_step_q4,
1223 int w, int h,
1224 const ConvolveParams *conv_params) {
1225 const InterpKernel *const filters_x = get_filter_base(filter_x);
1226 const int x0_q4 = get_filter_offset(filter_x, filters_x);
1227
1228 const InterpKernel *const filters_y = get_filter_base(filter_y);
1229 const int y0_q4 = get_filter_offset(filter_y, filters_y);
1230
1231 uint16_t temp[WIENER_MAX_EXT_SIZE * MAX_SB_SIZE];
1232 const int intermediate_height =
1233 (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS - 1;
1234 memset(temp + (intermediate_height * MAX_SB_SIZE), 0, MAX_SB_SIZE);
1235
1236 assert(w <= MAX_SB_SIZE);
1237 assert(h <= MAX_SB_SIZE);
1238 assert(y_step_q4 <= 32);
1239 assert(x_step_q4 <= 32);
1240
1241 convolve_add_src_horiz_hip(src - src_stride * (SUBPEL_TAPS / 2 - 1),
1242 src_stride, temp, MAX_SB_SIZE, filters_x, x0_q4,
1243 x_step_q4, w, intermediate_height,
1244 conv_params->round_0);
1245 convolve_add_src_vert_hip(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
1246 MAX_SB_SIZE, dst, dst_stride, filters_y, y0_q4,
1247 y_step_q4, w, h, conv_params->round_1);
1248 }
1249
highbd_convolve_add_src_horiz_hip(const uint8_t * src8,ptrdiff_t src_stride,uint16_t * dst,ptrdiff_t dst_stride,const InterpKernel * x_filters,int x0_q4,int x_step_q4,int w,int h,int round0_bits,int bd)1250 static void highbd_convolve_add_src_horiz_hip(
1251 const uint8_t *src8, ptrdiff_t src_stride, uint16_t *dst,
1252 ptrdiff_t dst_stride, const InterpKernel *x_filters, int x0_q4,
1253 int x_step_q4, int w, int h, int round0_bits, int bd) {
1254 const int extraprec_clamp_limit = WIENER_CLAMP_LIMIT(round0_bits, bd);
1255 uint16_t *src = CONVERT_TO_SHORTPTR(src8);
1256 src -= SUBPEL_TAPS / 2 - 1;
1257 for (int y = 0; y < h; ++y) {
1258 int x_q4 = x0_q4;
1259 for (int x = 0; x < w; ++x) {
1260 const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
1261 const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
1262 const int rounding = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) +
1263 (1 << (bd + FILTER_BITS - 1));
1264 const int sum = highbd_horz_scalar_product(src_x, x_filter) + rounding;
1265 dst[x] = (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, round0_bits), 0,
1266 extraprec_clamp_limit - 1);
1267 x_q4 += x_step_q4;
1268 }
1269 src += src_stride;
1270 dst += dst_stride;
1271 }
1272 }
1273
highbd_convolve_add_src_vert_hip(const uint16_t * src,ptrdiff_t src_stride,uint8_t * dst8,ptrdiff_t dst_stride,const InterpKernel * y_filters,int y0_q4,int y_step_q4,int w,int h,int round1_bits,int bd)1274 static void highbd_convolve_add_src_vert_hip(
1275 const uint16_t *src, ptrdiff_t src_stride, uint8_t *dst8,
1276 ptrdiff_t dst_stride, const InterpKernel *y_filters, int y0_q4,
1277 int y_step_q4, int w, int h, int round1_bits, int bd) {
1278 uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
1279 src -= src_stride * (SUBPEL_TAPS / 2 - 1);
1280 for (int x = 0; x < w; ++x) {
1281 int y_q4 = y0_q4;
1282 for (int y = 0; y < h; ++y) {
1283 const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
1284 const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
1285 const int rounding =
1286 ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) -
1287 (1 << (bd + round1_bits - 1));
1288 const int sum =
1289 highbd_vert_scalar_product(src_y, src_stride, y_filter) + rounding;
1290 dst[y * dst_stride] =
1291 clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, round1_bits), bd);
1292 y_q4 += y_step_q4;
1293 }
1294 ++src;
1295 ++dst;
1296 }
1297 }
1298
av1_highbd_wiener_convolve_add_src_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h,const ConvolveParams * conv_params,int bd)1299 void av1_highbd_wiener_convolve_add_src_c(
1300 const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
1301 ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
1302 const int16_t *filter_y, int y_step_q4, int w, int h,
1303 const ConvolveParams *conv_params, int bd) {
1304 const InterpKernel *const filters_x = get_filter_base(filter_x);
1305 const int x0_q4 = get_filter_offset(filter_x, filters_x);
1306
1307 const InterpKernel *const filters_y = get_filter_base(filter_y);
1308 const int y0_q4 = get_filter_offset(filter_y, filters_y);
1309
1310 uint16_t temp[WIENER_MAX_EXT_SIZE * MAX_SB_SIZE];
1311 const int intermediate_height =
1312 (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
1313
1314 assert(w <= MAX_SB_SIZE);
1315 assert(h <= MAX_SB_SIZE);
1316 assert(y_step_q4 <= 32);
1317 assert(x_step_q4 <= 32);
1318 assert(bd + FILTER_BITS - conv_params->round_0 + 2 <= 16);
1319
1320 highbd_convolve_add_src_horiz_hip(src - src_stride * (SUBPEL_TAPS / 2 - 1),
1321 src_stride, temp, MAX_SB_SIZE, filters_x,
1322 x0_q4, x_step_q4, w, intermediate_height,
1323 conv_params->round_0, bd);
1324 highbd_convolve_add_src_vert_hip(
1325 temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE, dst, dst_stride,
1326 filters_y, y0_q4, y_step_q4, w, h, conv_params->round_1, bd);
1327 }
1328