1 /*
2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11
12 #include <assert.h>
13 #include <string.h>
14
15 #include "config/aom_dsp_rtcd.h"
16 #include "config/av1_rtcd.h"
17
18 #include "av1/common/av1_common_int.h"
19 #include "av1/common/blockd.h"
20 #include "av1/common/convolve.h"
21 #include "av1/common/filter.h"
22 #include "av1/common/resize.h"
23 #include "aom_dsp/aom_dsp_common.h"
24 #include "aom_ports/mem.h"
25
av1_convolve_horiz_rs_c(const uint8_t * src,int src_stride,uint8_t * dst,int dst_stride,int w,int h,const int16_t * x_filters,int x0_qn,int x_step_qn)26 void av1_convolve_horiz_rs_c(const uint8_t *src, int src_stride, uint8_t *dst,
27 int dst_stride, int w, int h,
28 const int16_t *x_filters, int x0_qn,
29 int x_step_qn) {
30 src -= UPSCALE_NORMATIVE_TAPS / 2 - 1;
31 for (int y = 0; y < h; ++y) {
32 int x_qn = x0_qn;
33 for (int x = 0; x < w; ++x) {
34 const uint8_t *const src_x = &src[x_qn >> RS_SCALE_SUBPEL_BITS];
35 const int x_filter_idx =
36 (x_qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
37 assert(x_filter_idx <= RS_SUBPEL_MASK);
38 const int16_t *const x_filter =
39 &x_filters[x_filter_idx * UPSCALE_NORMATIVE_TAPS];
40 int sum = 0;
41 for (int k = 0; k < UPSCALE_NORMATIVE_TAPS; ++k)
42 sum += src_x[k] * x_filter[k];
43 dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
44 x_qn += x_step_qn;
45 }
46 src += src_stride;
47 dst += dst_stride;
48 }
49 }
50
av1_highbd_convolve_horiz_rs_c(const uint16_t * src,int src_stride,uint16_t * dst,int dst_stride,int w,int h,const int16_t * x_filters,int x0_qn,int x_step_qn,int bd)51 void av1_highbd_convolve_horiz_rs_c(const uint16_t *src, int src_stride,
52 uint16_t *dst, int dst_stride, int w, int h,
53 const int16_t *x_filters, int x0_qn,
54 int x_step_qn, int bd) {
55 src -= UPSCALE_NORMATIVE_TAPS / 2 - 1;
56 for (int y = 0; y < h; ++y) {
57 int x_qn = x0_qn;
58 for (int x = 0; x < w; ++x) {
59 const uint16_t *const src_x = &src[x_qn >> RS_SCALE_SUBPEL_BITS];
60 const int x_filter_idx =
61 (x_qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
62 assert(x_filter_idx <= RS_SUBPEL_MASK);
63 const int16_t *const x_filter =
64 &x_filters[x_filter_idx * UPSCALE_NORMATIVE_TAPS];
65 int sum = 0;
66 for (int k = 0; k < UPSCALE_NORMATIVE_TAPS; ++k)
67 sum += src_x[k] * x_filter[k];
68 dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
69 x_qn += x_step_qn;
70 }
71 src += src_stride;
72 dst += dst_stride;
73 }
74 }
75
// Separable 2D sub-pixel interpolation of one 8-bit block (w x h, both at most
// MAX_SB_SIZE).
//
// The block is first filtered horizontally into a 16-bit scratch buffer that
// holds h + taps_y - 1 rows, then filtered vertically into `dst`. The kernels
// come from `filter_params_x` / `filter_params_y`, indexed by the low
// SUBPEL_MASK bits of `subpel_x_qn` / `subpel_y_qn`. `conv_params->round_0`
// is the shift applied after the horizontal pass and `conv_params->round_1`
// the shift applied after the vertical pass.
void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
                          int dst_stride, int w, int h,
                          const InterpFilterParams *filter_params_x,
                          const InterpFilterParams *filter_params_y,
                          const int subpel_x_qn, const int subpel_y_qn,
                          ConvolveParams *conv_params) {
  assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);

  const int bd = 8;
  const int taps_x = filter_params_x->taps;
  const int taps_y = filter_params_y->taps;
  const int fo_horiz = taps_x / 2 - 1;
  const int fo_vert = taps_y / 2 - 1;
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  const int bits = FILTER_BITS * 2 - round_0 - round_1;

  int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
  const int im_stride = w;
  const int im_h = h + taps_y - 1;

  // Horizontal pass. It starts fo_vert rows above the block so that the
  // vertical pass below has every row its kernel needs.
  const uint8_t *const first_row = src - fo_vert * src_stride;
  const int16_t *const kernel_x = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);
  for (int r = 0; r < im_h; ++r) {
    const uint8_t *const in_row = first_row + r * src_stride;
    int16_t *const im_row = im_block + r * im_stride;
    for (int c = 0; c < w; ++c) {
      // The accumulator starts from a positive offset rather than zero.
      int32_t sum = (1 << (bd + FILTER_BITS - 1));
      for (int k = 0; k < taps_x; ++k) {
        sum += kernel_x[k] * in_row[c - fo_horiz + k];
      }

      // TODO(aomedia:3393): for 12-tap filter, in extreme cases, the result can
      // be beyond the following range. For better prediction, a clamping can be
      // added for 12 tap filter to ensure the horizontal filtering result is
      // within 16 bit. The same applies to the vertical filtering.
      assert(taps_x > 8 || (0 <= sum && sum < (1 << (bd + FILTER_BITS + 1))));
      im_row[c] = (int16_t)ROUND_POWER_OF_TWO(sum, round_0);
    }
  }

  // Vertical pass. Scratch row r lines up with source row r - fo_vert, so the
  // kernel for output row r starts at scratch row r.
  const int16_t *const kernel_y = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  // Subtracted after the round_1 shift to undo the offsets added to the
  // accumulators of both passes.
  const int round_offset = (1 << (offset_bits - round_1)) +
                           (1 << (offset_bits - round_1 - 1));
  for (int r = 0; r < h; ++r) {
    for (int c = 0; c < w; ++c) {
      const int16_t *const im_col = im_block + r * im_stride + c;
      int32_t sum = 1 << offset_bits;
      for (int k = 0; k < taps_y; ++k) {
        sum += kernel_y[k] * im_col[k * im_stride];
      }
      assert(taps_y > 8 || (0 <= sum && sum < (1 << (offset_bits + 2))));
      const int16_t res = ROUND_POWER_OF_TWO(sum, round_1) - round_offset;
      dst[r * dst_stride + c] = clip_pixel(ROUND_POWER_OF_TWO(res, bits));
    }
  }
}
134
av1_convolve_y_sr_c(const uint8_t * src,int src_stride,uint8_t * dst,int dst_stride,int w,int h,const InterpFilterParams * filter_params_y,const int subpel_y_qn)135 void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
136 int dst_stride, int w, int h,
137 const InterpFilterParams *filter_params_y,
138 const int subpel_y_qn) {
139 const int fo_vert = filter_params_y->taps / 2 - 1;
140
141 // vertical filter
142 const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
143 filter_params_y, subpel_y_qn & SUBPEL_MASK);
144 for (int y = 0; y < h; ++y) {
145 for (int x = 0; x < w; ++x) {
146 int32_t res = 0;
147 for (int k = 0; k < filter_params_y->taps; ++k) {
148 res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x];
149 }
150 dst[y * dst_stride + x] =
151 clip_pixel(ROUND_POWER_OF_TWO(res, FILTER_BITS));
152 }
153 }
154 }
155
// Horizontal-only sub-pixel interpolation of one 8-bit block.
//
// Every output pixel is the dot product of the kernel chosen by
// `subpel_x_qn & SUBPEL_MASK` with `filter_params_x->taps` source pixels
// starting taps / 2 - 1 columns left of the output column. The sum is rounded
// in two steps, first by `conv_params->round_0` and then by
// FILTER_BITS - round_0, before being clipped by clip_pixel().
void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
                         int dst_stride, int w, int h,
                         const InterpFilterParams *filter_params_x,
                         const int subpel_x_qn, ConvolveParams *conv_params) {
  const int num_taps = filter_params_x->taps;
  const int fo_horiz = num_taps / 2 - 1;
  const int bits = FILTER_BITS - conv_params->round_0;

  assert(bits >= 0);
  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));

  const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);

  for (int row = 0; row < h; ++row) {
    const uint8_t *const in_row = src + row * src_stride;
    uint8_t *const out_row = dst + row * dst_stride;
    for (int col = 0; col < w; ++col) {
      int32_t acc = 0;
      for (int tap = 0; tap < num_taps; ++tap) {
        acc += kernel[tap] * in_row[col - fo_horiz + tap];
      }
      acc = ROUND_POWER_OF_TWO(acc, conv_params->round_0);
      out_row[col] = clip_pixel(ROUND_POWER_OF_TWO(acc, bits));
    }
  }
}
182
183 // This function is exactly the same as av1_convolve_2d_sr_c, and is an
184 // optimized version for intrabc. Use the following 2-tap filter:
185 // DECLARE_ALIGNED(256, static const int16_t,
186 // av1_intrabc_bilinear_filter[2 * SUBPEL_SHIFTS]) = {
187 // 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
188 // 64, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
189 // };
av1_convolve_2d_sr_intrabc_c(const uint8_t * src,int src_stride,uint8_t * dst,int dst_stride,int w,int h,const InterpFilterParams * filter_params_x,const InterpFilterParams * filter_params_y,const int subpel_x_qn,const int subpel_y_qn,ConvolveParams * conv_params)190 void av1_convolve_2d_sr_intrabc_c(const uint8_t *src, int src_stride,
191 uint8_t *dst, int dst_stride, int w, int h,
192 const InterpFilterParams *filter_params_x,
193 const InterpFilterParams *filter_params_y,
194 const int subpel_x_qn, const int subpel_y_qn,
195 ConvolveParams *conv_params) {
196 assert(subpel_x_qn == 8);
197 assert(subpel_y_qn == 8);
198 assert(filter_params_x->taps == 2 && filter_params_y->taps == 2);
199 assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS);
200 (void)filter_params_x;
201 (void)subpel_x_qn;
202 (void)filter_params_y;
203 (void)subpel_y_qn;
204 (void)conv_params;
205
206 int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
207 int im_h = h + 1;
208 int im_stride = w;
209 assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);
210 const int bd = 8;
211
212 // horizontal filter
213 // explicitly operate for subpel_x_qn = 8.
214 int16_t *im = im_block;
215 for (int y = 0; y < im_h; ++y) {
216 for (int x = 0; x < w; ++x) {
217 const int32_t sum = (1 << bd) + src[x] + src[x + 1];
218 assert(0 <= sum && sum < (1 << (bd + 2)));
219 im[x] = sum;
220 }
221 src += src_stride;
222 im += im_stride;
223 }
224
225 // vertical filter
226 // explicitly operate for subpel_y_qn = 8.
227 int16_t *src_vert = im_block;
228 for (int y = 0; y < h; ++y) {
229 for (int x = 0; x < w; ++x) {
230 const int32_t sum =
231 (1 << (bd + 2)) + src_vert[x] + src_vert[im_stride + x];
232 assert(0 <= sum && sum < (1 << (bd + 4)));
233 const int16_t res =
234 ROUND_POWER_OF_TWO(sum, 2) - ((1 << bd) + (1 << (bd - 1)));
235 dst[x] = clip_pixel(res);
236 }
237 src_vert += im_stride;
238 dst += dst_stride;
239 }
240 }
241
242 // This function is exactly the same as av1_convolve_y_sr_c, and is an
243 // optimized version for intrabc.
av1_convolve_y_sr_intrabc_c(const uint8_t * src,int src_stride,uint8_t * dst,int dst_stride,int w,int h,const InterpFilterParams * filter_params_y,const int subpel_y_qn)244 void av1_convolve_y_sr_intrabc_c(const uint8_t *src, int src_stride,
245 uint8_t *dst, int dst_stride, int w, int h,
246 const InterpFilterParams *filter_params_y,
247 const int subpel_y_qn) {
248 assert(subpel_y_qn == 8);
249 assert(filter_params_y->taps == 2);
250 (void)filter_params_y;
251 (void)subpel_y_qn;
252
253 // vertical filter
254 // explicitly operate for subpel_y_qn = 8.
255 for (int y = 0; y < h; ++y) {
256 for (int x = 0; x < w; ++x) {
257 const int32_t res = src[x] + src[src_stride + x];
258 dst[x] = clip_pixel(ROUND_POWER_OF_TWO(res, 1));
259 }
260 src += src_stride;
261 dst += dst_stride;
262 }
263 }
264
265 // This function is exactly the same as av1_convolve_x_sr_c, and is an
266 // optimized version for intrabc.
av1_convolve_x_sr_intrabc_c(const uint8_t * src,int src_stride,uint8_t * dst,int dst_stride,int w,int h,const InterpFilterParams * filter_params_x,const int subpel_x_qn,ConvolveParams * conv_params)267 void av1_convolve_x_sr_intrabc_c(const uint8_t *src, int src_stride,
268 uint8_t *dst, int dst_stride, int w, int h,
269 const InterpFilterParams *filter_params_x,
270 const int subpel_x_qn,
271 ConvolveParams *conv_params) {
272 assert(subpel_x_qn == 8);
273 assert(filter_params_x->taps == 2);
274 assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS);
275 (void)filter_params_x;
276 (void)subpel_x_qn;
277 (void)conv_params;
278
279 // horizontal filter
280 // explicitly operate for subpel_x_qn = 8.
281 for (int y = 0; y < h; ++y) {
282 for (int x = 0; x < w; ++x) {
283 const int32_t res = src[x] + src[x + 1];
284 dst[x] = clip_pixel(ROUND_POWER_OF_TWO(res, 1));
285 }
286 src += src_stride;
287 dst += dst_stride;
288 }
289 }
290
// Separable 2D sub-pixel interpolation of one 8-bit block for compound
// prediction.
//
// The block is filtered horizontally into a 16-bit scratch buffer and then
// vertically. What happens to the vertically filtered value (still carrying
// the rounding offset) depends on `conv_params`:
//  - do_average == 0: it is stored in the intermediate buffer
//    `conv_params->dst` (stride `conv_params->dst_stride`); `dst` is untouched.
//  - do_average != 0: it is combined with the value already stored in
//    `conv_params->dst`, either weighted by fwd_offset / bck_offset and
//    shifted by DIST_PRECISION_BITS (use_dist_wtd_comp_avg != 0) or as a plain
//    average, then the offset is removed and the rounded, clipped pixel is
//    written to `dst`.
// `w` and `h` must not exceed MAX_SB_SIZE because the scratch buffer is sized
// for that.
void av1_dist_wtd_convolve_2d_c(const uint8_t *src, int src_stride,
                                uint8_t *dst, int dst_stride, int w, int h,
                                const InterpFilterParams *filter_params_x,
                                const InterpFilterParams *filter_params_y,
                                const int subpel_x_qn, const int subpel_y_qn,
                                ConvolveParams *conv_params) {
  CONV_BUF_TYPE *dst16 = conv_params->dst;
  int dst16_stride = conv_params->dst_stride;
  int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
  int im_h = h + filter_params_y->taps - 1;
  int im_stride = w;
  // Same guard as av1_convolve_2d_sr_c: im_block holds at most
  // MAX_SB_SIZE + MAX_FILTER_TAP - 1 rows of MAX_SB_SIZE samples.
  assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);
  const int fo_vert = filter_params_y->taps / 2 - 1;
  const int fo_horiz = filter_params_x->taps / 2 - 1;
  const int bd = 8;
  const int round_bits =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;

  // horizontal filter
  // Starts fo_vert rows above the block so the vertical pass has all the rows
  // its kernel needs.
  const uint8_t *src_horiz = src - fo_vert * src_stride;
  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);
  for (int y = 0; y < im_h; ++y) {
    for (int x = 0; x < w; ++x) {
      int32_t sum = (1 << (bd + FILTER_BITS - 1));
      for (int k = 0; k < filter_params_x->taps; ++k) {
        sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
      }
      assert(filter_params_x->taps > 8 ||
             (0 <= sum && sum < (1 << (bd + FILTER_BITS + 1))));
      im_block[y * im_stride + x] =
          (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0);
    }
  }

  // vertical filter
  int16_t *src_vert = im_block + fo_vert * im_stride;
  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  // Offset still present in `res` after the round_1 shift; removed only when
  // the final pixel is produced.
  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
                           (1 << (offset_bits - conv_params->round_1 - 1));
  for (int y = 0; y < h; ++y) {
    for (int x = 0; x < w; ++x) {
      int32_t sum = 1 << offset_bits;
      for (int k = 0; k < filter_params_y->taps; ++k) {
        sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
      }
      assert(filter_params_y->taps > 8 ||
             (0 <= sum && sum < (1 << (offset_bits + 2))));
      CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
      if (conv_params->do_average) {
        int32_t tmp = dst16[y * dst16_stride + x];
        if (conv_params->use_dist_wtd_comp_avg) {
          tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
          tmp = tmp >> DIST_PRECISION_BITS;
        } else {
          tmp += res;
          tmp = tmp >> 1;
        }
        tmp -= round_offset;
        dst[y * dst_stride + x] =
            clip_pixel(ROUND_POWER_OF_TWO(tmp, round_bits));
      } else {
        dst16[y * dst16_stride + x] = res;
      }
    }
  }
}
358
// Vertical-only sub-pixel interpolation of one 8-bit block for compound
// prediction.
//
// The filtered sum is scaled up by FILTER_BITS - round_0 bits, rounded by
// round_1 and biased by a rounding offset. With `conv_params->do_average`
// clear, that value is stored in `conv_params->dst`. Otherwise it is blended
// with the value already stored there (weighted by fwd_offset / bck_offset
// when use_dist_wtd_comp_avg is set, plain average otherwise), the offset is
// removed, and the rounded, clipped pixel is written to `dst`.
void av1_dist_wtd_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst,
                               int dst_stride, int w, int h,
                               const InterpFilterParams *filter_params_y,
                               const int subpel_y_qn,
                               ConvolveParams *conv_params) {
  const int bd = 8;
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  const int num_taps = filter_params_y->taps;
  const int fo_vert = num_taps / 2 - 1;
  const int bits = FILTER_BITS - round_0;
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  const int round_offset =
      (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));
  const int round_bits = 2 * FILTER_BITS - round_0 - round_1;

  CONV_BUF_TYPE *const dst16 = conv_params->dst;
  const int dst16_stride = conv_params->dst_stride;
  const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);

  for (int row = 0; row < h; ++row) {
    for (int col = 0; col < w; ++col) {
      int32_t res = 0;
      for (int tap = 0; tap < num_taps; ++tap) {
        res += kernel[tap] * src[(row - fo_vert + tap) * src_stride + col];
      }
      res *= (1 << bits);
      res = ROUND_POWER_OF_TWO(res, round_1) + round_offset;

      if (!conv_params->do_average) {
        // This call only fills the intermediate buffer; `dst` is not written.
        dst16[row * dst16_stride + col] = res;
        continue;
      }

      int32_t blended = dst16[row * dst16_stride + col];
      if (conv_params->use_dist_wtd_comp_avg) {
        blended = (blended * conv_params->fwd_offset +
                   res * conv_params->bck_offset) >>
                  DIST_PRECISION_BITS;
      } else {
        blended = (blended + res) >> 1;
      }
      blended -= round_offset;
      dst[row * dst_stride + col] =
          clip_pixel(ROUND_POWER_OF_TWO(blended, round_bits));
    }
  }
}
405
// Horizontal-only sub-pixel interpolation of one 8-bit block for compound
// prediction.
//
// The filtered sum is rounded by round_0, scaled up by FILTER_BITS - round_1
// bits and biased by a rounding offset. With `conv_params->do_average` clear,
// that value is stored in `conv_params->dst`. Otherwise it is blended with the
// value already stored there (weighted by fwd_offset / bck_offset when
// use_dist_wtd_comp_avg is set, plain average otherwise), the offset is
// removed, and the rounded, clipped pixel is written to `dst`.
void av1_dist_wtd_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst,
                               int dst_stride, int w, int h,
                               const InterpFilterParams *filter_params_x,
                               const int subpel_x_qn,
                               ConvolveParams *conv_params) {
  const int bd = 8;
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  const int num_taps = filter_params_x->taps;
  const int fo_horiz = num_taps / 2 - 1;
  const int bits = FILTER_BITS - round_1;
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  const int round_offset =
      (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));
  const int round_bits = 2 * FILTER_BITS - round_0 - round_1;

  CONV_BUF_TYPE *const dst16 = conv_params->dst;
  const int dst16_stride = conv_params->dst_stride;
  const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);

  for (int row = 0; row < h; ++row) {
    const uint8_t *const in_row = src + row * src_stride;
    for (int col = 0; col < w; ++col) {
      int32_t res = 0;
      for (int tap = 0; tap < num_taps; ++tap) {
        res += kernel[tap] * in_row[col - fo_horiz + tap];
      }
      res = (1 << bits) * ROUND_POWER_OF_TWO(res, round_0);
      res += round_offset;

      if (!conv_params->do_average) {
        // This call only fills the intermediate buffer; `dst` is not written.
        dst16[row * dst16_stride + col] = res;
        continue;
      }

      int32_t blended = dst16[row * dst16_stride + col];
      if (conv_params->use_dist_wtd_comp_avg) {
        blended = (blended * conv_params->fwd_offset +
                   res * conv_params->bck_offset) >>
                  DIST_PRECISION_BITS;
      } else {
        blended = (blended + res) >> 1;
      }
      blended -= round_offset;
      dst[row * dst_stride + col] =
          clip_pixel(ROUND_POWER_OF_TWO(blended, round_bits));
    }
  }
}
452
// Compound-prediction path without any filtering.
//
// Each source pixel is shifted left by 2 * FILTER_BITS - round_0 - round_1
// bits and biased by a rounding offset. With `conv_params->do_average` clear,
// that value is stored in `conv_params->dst`. Otherwise it is blended with the
// value already stored there (weighted by fwd_offset / bck_offset when
// use_dist_wtd_comp_avg is set, plain average otherwise), the offset is
// removed, and the rounded, clipped pixel is written to `dst`.
void av1_dist_wtd_convolve_2d_copy_c(const uint8_t *src, int src_stride,
                                     uint8_t *dst, int dst_stride, int w, int h,
                                     ConvolveParams *conv_params) {
  const int bd = 8;
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  const int bits = FILTER_BITS * 2 - round_1 - round_0;
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  const int round_offset =
      (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));

  CONV_BUF_TYPE *const dst16 = conv_params->dst;
  const int dst16_stride = conv_params->dst_stride;

  for (int row = 0; row < h; ++row) {
    const uint8_t *const in_row = src + row * src_stride;
    for (int col = 0; col < w; ++col) {
      CONV_BUF_TYPE res = in_row[col] << bits;
      res += round_offset;

      if (!conv_params->do_average) {
        // This call only fills the intermediate buffer; `dst` is not written.
        dst16[row * dst16_stride + col] = res;
        continue;
      }

      int32_t blended = dst16[row * dst16_stride + col];
      if (conv_params->use_dist_wtd_comp_avg) {
        blended = (blended * conv_params->fwd_offset +
                   res * conv_params->bck_offset) >>
                  DIST_PRECISION_BITS;
      } else {
        blended = (blended + res) >> 1;
      }
      blended -= round_offset;
      dst[row * dst_stride + col] =
          clip_pixel(ROUND_POWER_OF_TWO(blended, bits));
    }
  }
}
487
// Separable 2D interpolation of one 8-bit block with per-pixel position
// stepping (scaled prediction).
//
// The horizontal position starts at `subpel_x_qn` on every row and advances
// by `x_step_qn` per output column; the vertical position starts at
// `subpel_y_qn` on every column and advances by `y_step_qn` per output row.
// For each position, the bits above SCALE_SUBPEL_BITS select the source
// sample and the fractional bits (shifted down by SCALE_EXTRA_BITS) select
// the kernel.
//
// Output handling:
//  - conv_params->is_compound == 0: the offset is removed and the rounded,
//    clipped pixel is written to `dst`.
//  - is_compound != 0 and do_average == 0: the value (offset included) is
//    stored in `conv_params->dst`; `dst` is untouched.
//  - is_compound != 0 and do_average != 0: the value is blended with the one
//    already stored in `conv_params->dst` (weighted by fwd_offset /
//    bck_offset when use_dist_wtd_comp_avg is set, plain average otherwise)
//    and the final pixel is written to `dst`.
void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst,
                             int dst_stride, int w, int h,
                             const InterpFilterParams *filter_params_x,
                             const InterpFilterParams *filter_params_y,
                             const int subpel_x_qn, const int x_step_qn,
                             const int subpel_y_qn, const int y_step_qn,
                             ConvolveParams *conv_params) {
  int16_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE];
  // Number of source rows the vertical pass can reach: the integer position of
  // the last output row plus one full kernel.
  int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) +
             filter_params_y->taps;
  CONV_BUF_TYPE *dst16 = conv_params->dst;
  const int dst16_stride = conv_params->dst_stride;
  const int bits =
      FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
  assert(bits >= 0);
  int im_stride = w;
  // The horizontal pass writes im_h rows of im_stride samples into im_block,
  // and im_h depends on the caller-supplied step; catch an overrun in debug
  // builds.
  assert(im_h * im_stride <= (int)(sizeof(im_block) / sizeof(im_block[0])));
  const int fo_vert = filter_params_y->taps / 2 - 1;
  const int fo_horiz = filter_params_x->taps / 2 - 1;
  const int bd = 8;

  // horizontal filter
  const uint8_t *src_horiz = src - fo_vert * src_stride;
  for (int y = 0; y < im_h; ++y) {
    int x_qn = subpel_x_qn;
    for (int x = 0; x < w; ++x, x_qn += x_step_qn) {
      const uint8_t *const src_x = &src_horiz[(x_qn >> SCALE_SUBPEL_BITS)];
      const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
      assert(x_filter_idx < SUBPEL_SHIFTS);
      const int16_t *x_filter =
          av1_get_interp_filter_subpel_kernel(filter_params_x, x_filter_idx);
      int32_t sum = (1 << (bd + FILTER_BITS - 1));
      for (int k = 0; k < filter_params_x->taps; ++k) {
        sum += x_filter[k] * src_x[k - fo_horiz];
      }
      assert(filter_params_x->taps > 8 ||
             (0 <= sum && sum < (1 << (bd + FILTER_BITS + 1))));
      im_block[y * im_stride + x] =
          (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0);
    }
    src_horiz += src_stride;
  }

  // vertical filter
  // Processed column by column: src_vert advances one sample per column.
  int16_t *src_vert = im_block + fo_vert * im_stride;
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  // Offset still present in `res` after the round_1 shift.
  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
                           (1 << (offset_bits - conv_params->round_1 - 1));
  for (int x = 0; x < w; ++x) {
    int y_qn = subpel_y_qn;
    for (int y = 0; y < h; ++y, y_qn += y_step_qn) {
      const int16_t *src_y = &src_vert[(y_qn >> SCALE_SUBPEL_BITS) * im_stride];
      const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
      assert(y_filter_idx < SUBPEL_SHIFTS);
      const int16_t *y_filter =
          av1_get_interp_filter_subpel_kernel(filter_params_y, y_filter_idx);
      int32_t sum = 1 << offset_bits;
      for (int k = 0; k < filter_params_y->taps; ++k) {
        sum += y_filter[k] * src_y[(k - fo_vert) * im_stride];
      }
      assert(filter_params_y->taps > 8 ||
             (0 <= sum && sum < (1 << (offset_bits + 2))));
      CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
      if (conv_params->is_compound) {
        if (conv_params->do_average) {
          int32_t tmp = dst16[y * dst16_stride + x];
          if (conv_params->use_dist_wtd_comp_avg) {
            tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
            tmp = tmp >> DIST_PRECISION_BITS;
          } else {
            tmp += res;
            tmp = tmp >> 1;
          }
          /* Subtract round offset and convolve round */
          tmp = tmp - round_offset;
          dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits));
        } else {
          dst16[y * dst16_stride + x] = res;
        }
      } else {
        /* Subtract round offset and convolve round */
        int32_t tmp = res - round_offset;
        dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits));
      }
    }
    src_vert++;
  }
}
575
// Forwards every argument unchanged to av1_convolve_2d_scale(), after
// asserting that a compound call has an intermediate buffer
// (`conv_params->dst`) to work with.
static void convolve_2d_scale_wrapper(
    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int x_step_qn, const int subpel_y_qn, const int y_step_qn,
    ConvolveParams *conv_params) {
  assert(!conv_params->is_compound || conv_params->dst != NULL);

  av1_convolve_2d_scale(src, src_stride, dst, dst_stride, w, h, filter_params_x,
                        filter_params_y, subpel_x_qn, x_step_qn, subpel_y_qn,
                        y_step_qn, conv_params);
}
589
// Picks the compound (dist_wtd) convolve routine that matches which of the two
// sub-pixel offsets are non-zero: copy when neither is, the x-only or y-only
// routine when exactly one is, and the full 2D routine when both are.
static void convolve_2d_facade_compound(
    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int subpel_y_qn, ConvolveParams *conv_params) {
  const bool need_x = subpel_x_qn != 0;
  const bool need_y = subpel_y_qn != 0;

  if (need_x) {
    if (need_y) {
      assert(need_y && need_x);
      av1_dist_wtd_convolve_2d(src, src_stride, dst, dst_stride, w, h,
                               filter_params_x, filter_params_y, subpel_x_qn,
                               subpel_y_qn, conv_params);
    } else {
      av1_dist_wtd_convolve_x(src, src_stride, dst, dst_stride, w, h,
                              filter_params_x, subpel_x_qn, conv_params);
    }
    return;
  }

  if (need_y) {
    av1_dist_wtd_convolve_y(src, src_stride, dst, dst_stride, w, h,
                            filter_params_y, subpel_y_qn, conv_params);
  } else {
    av1_dist_wtd_convolve_2d_copy(src, src_stride, dst, dst_stride, w, h,
                                  conv_params);
  }
}
613
// Picks the single-reference convolve routine that matches which of the two
// sub-pixel offsets are non-zero: a plain copy when neither is, the x-only or
// y-only routine when exactly one is, and the full 2D routine when both are.
static void convolve_2d_facade_single(
    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int subpel_y_qn, ConvolveParams *conv_params) {
  const bool need_x = subpel_x_qn != 0;
  const bool need_y = subpel_y_qn != 0;

  if (need_x) {
    if (need_y) {
      assert(need_x && need_y);
      av1_convolve_2d_sr(src, src_stride, dst, dst_stride, w, h,
                         filter_params_x, filter_params_y, subpel_x_qn,
                         subpel_y_qn, conv_params);
    } else {
      av1_convolve_x_sr(src, src_stride, dst, dst_stride, w, h, filter_params_x,
                        subpel_x_qn, conv_params);
    }
    return;
  }

  if (need_y) {
    av1_convolve_y_sr(src, src_stride, dst, dst_stride, w, h, filter_params_y,
                      subpel_y_qn);
  } else {
    aom_convolve_copy(src, src_stride, dst, dst_stride, w, h);
  }
}
635
// Entry point that dispatches an 8-bit prediction block to the matching
// convolve routine.
//
// `interp_filters[0]` is the horizontal filter and `interp_filters[1]` the
// vertical one. Dispatch order:
//  1. 2-tap filters (IntraBC) with at least one non-zero sub-pixel offset go
//     to the dedicated *_intrabc_c routines. With both offsets zero, control
//     falls through to the generic paths below.
//  2. `scaled` != 0 goes to the scaling convolve, which also receives
//     `x_step_q4` / `y_step_q4`.
//  3. Otherwise the compound or single-reference facade is used, depending on
//     `conv_params->is_compound`.
void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst,
                            int dst_stride, int w, int h,
                            const InterpFilterParams *interp_filters[2],
                            const int subpel_x_qn, int x_step_q4,
                            const int subpel_y_qn, int y_step_q4, int scaled,
                            ConvolveParams *conv_params) {
  const InterpFilterParams *filter_params_x = interp_filters[0];
  const InterpFilterParams *filter_params_y = interp_filters[1];

  // TODO(jingning, yunqing): Add SIMD support to 2-tap filter case.
  // 2-tap filter indicates that it is for IntraBC.
  if (filter_params_x->taps == 2 || filter_params_y->taps == 2) {
    assert(filter_params_x->taps == 2 && filter_params_y->taps == 2);
    assert(!scaled);
    if (subpel_x_qn && subpel_y_qn) {
      av1_convolve_2d_sr_intrabc_c(src, src_stride, dst, dst_stride, w, h,
                                   filter_params_x, filter_params_y,
                                   subpel_x_qn, subpel_y_qn, conv_params);
      return;
    } else if (subpel_x_qn) {
      av1_convolve_x_sr_intrabc_c(src, src_stride, dst, dst_stride, w, h,
                                  filter_params_x, subpel_x_qn, conv_params);
      return;
    } else if (subpel_y_qn) {
      av1_convolve_y_sr_intrabc_c(src, src_stride, dst, dst_stride, w, h,
                                  filter_params_y, subpel_y_qn);
      return;
    }
  }

  if (scaled) {
    convolve_2d_scale_wrapper(src, src_stride, dst, dst_stride, w, h,
                              filter_params_x, filter_params_y, subpel_x_qn,
                              x_step_q4, subpel_y_qn, y_step_q4, conv_params);
  } else if (conv_params->is_compound) {
    convolve_2d_facade_compound(src, src_stride, dst, dst_stride, w, h,
                                filter_params_x, filter_params_y, subpel_x_qn,
                                subpel_y_qn, conv_params);
  } else {
    convolve_2d_facade_single(src, src_stride, dst, dst_stride, w, h,
                              filter_params_x, filter_params_y, subpel_x_qn,
                              subpel_y_qn, conv_params);
  }
}
685
686 #if CONFIG_AV1_HIGHBITDEPTH
// Horizontal-only sub-pixel interpolation of one block of 16-bit samples.
//
// Every output sample is the dot product of the kernel chosen by
// `subpel_x_qn & SUBPEL_MASK` with `filter_params_x->taps` source samples
// starting taps / 2 - 1 columns left of the output column. The sum is rounded
// in two steps, first by `conv_params->round_0` and then by
// FILTER_BITS - round_0, before being clipped by clip_pixel_highbd() for bit
// depth `bd`.
void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride,
                                uint16_t *dst, int dst_stride, int w, int h,
                                const InterpFilterParams *filter_params_x,
                                const int subpel_x_qn,
                                ConvolveParams *conv_params, int bd) {
  const int num_taps = filter_params_x->taps;
  const int fo_horiz = num_taps / 2 - 1;
  const int bits = FILTER_BITS - conv_params->round_0;

  assert(bits >= 0);
  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));

  const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);

  for (int row = 0; row < h; ++row) {
    const uint16_t *const in_row = src + row * src_stride;
    uint16_t *const out_row = dst + row * dst_stride;
    for (int col = 0; col < w; ++col) {
      int32_t acc = 0;
      for (int tap = 0; tap < num_taps; ++tap) {
        acc += kernel[tap] * in_row[col - fo_horiz + tap];
      }
      acc = ROUND_POWER_OF_TWO(acc, conv_params->round_0);
      out_row[col] = clip_pixel_highbd(ROUND_POWER_OF_TWO(acc, bits), bd);
    }
  }
}
714
av1_highbd_convolve_y_sr_c(const uint16_t * src,int src_stride,uint16_t * dst,int dst_stride,int w,int h,const InterpFilterParams * filter_params_y,const int subpel_y_qn,int bd)715 void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride,
716 uint16_t *dst, int dst_stride, int w, int h,
717 const InterpFilterParams *filter_params_y,
718 const int subpel_y_qn, int bd) {
719 const int fo_vert = filter_params_y->taps / 2 - 1;
720 // vertical filter
721 const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
722 filter_params_y, subpel_y_qn & SUBPEL_MASK);
723 for (int y = 0; y < h; ++y) {
724 for (int x = 0; x < w; ++x) {
725 int32_t res = 0;
726 for (int k = 0; k < filter_params_y->taps; ++k) {
727 res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x];
728 }
729 dst[y * dst_stride + x] =
730 clip_pixel_highbd(ROUND_POWER_OF_TWO(res, FILTER_BITS), bd);
731 }
732 }
733 }
734
// Single-reference high bit-depth separable 2D convolution (C reference).
// Pass 1 filters horizontally into the 16-bit scratch buffer im_block and
// rounds by conv_params->round_0. Pass 2 filters the scratch buffer
// vertically, rounds by conv_params->round_1, subtracts the constant
// round_offset, applies the remaining `bits` of rounding and stores the
// clip_pixel_highbd() result in dst.
// The scratch buffer has h + taps_y - 1 rows of w samples; w and h must not
// exceed MAX_SB_SIZE.
void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride,
                                 uint16_t *dst, int dst_stride, int w, int h,
                                 const InterpFilterParams *filter_params_x,
                                 const InterpFilterParams *filter_params_y,
                                 const int subpel_x_qn, const int subpel_y_qn,
                                 ConvolveParams *conv_params, int bd) {
  int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
  assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);

  const int taps_x = filter_params_x->taps;
  const int taps_y = filter_params_y->taps;
  const int cols_left = taps_x / 2 - 1;
  const int rows_above = taps_y / 2 - 1;
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  const int im_rows = h + taps_y - 1;
  const int im_stride = w;
  const int bits = FILTER_BITS * 2 - round_0 - round_1;
  assert(bits >= 0);

  // Pass 1: horizontal filter, starting rows_above rows above the block.
  const uint16_t *const src_top = src - rows_above * src_stride;
  const int16_t *const kernel_x = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);
  for (int row = 0; row < im_rows; ++row) {
    for (int col = 0; col < w; ++col) {
      const int first = row * src_stride + col - cols_left;
      int32_t sum = (1 << (bd + FILTER_BITS - 1));
      for (int tap = 0; tap < taps_x; ++tap) {
        sum += kernel_x[tap] * src_top[first + tap];
      }
      assert(taps_x > 8 || (0 <= sum && sum < (1 << (bd + FILTER_BITS + 1))));
      im_block[row * im_stride + col] =
          (int16_t)ROUND_POWER_OF_TWO(sum, round_0);
    }
  }

  // Pass 2: vertical filter. Output row r uses scratch rows r .. r+taps_y-1.
  const int16_t *const kernel_y = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  const int32_t round_offset =
      (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));
  for (int row = 0; row < h; ++row) {
    for (int col = 0; col < w; ++col) {
      int32_t sum = 1 << offset_bits;
      for (int tap = 0; tap < taps_y; ++tap) {
        sum += kernel_y[tap] * im_block[(row + tap) * im_stride + col];
      }
      assert(taps_y > 8 || (0 <= sum && sum < (1 << (offset_bits + 2))));
      const int32_t res = ROUND_POWER_OF_TWO(sum, round_1) - round_offset;
      dst[row * dst_stride + col] =
          clip_pixel_highbd(ROUND_POWER_OF_TWO(res, bits), bd);
    }
  }
}
789
790 // This function is exactly the same as av1_highbd_convolve_2d_sr_c, and is an
791 // optimized version for intrabc. Use the following 2-tap filter:
792 // DECLARE_ALIGNED(256, static const int16_t,
793 // av1_intrabc_bilinear_filter[2 * SUBPEL_SHIFTS]) = {
794 // 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
795 // 64, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
796 // };
av1_highbd_convolve_2d_sr_intrabc_c(const uint16_t * src,int src_stride,uint16_t * dst,int dst_stride,int w,int h,const InterpFilterParams * filter_params_x,const InterpFilterParams * filter_params_y,const int subpel_x_qn,const int subpel_y_qn,ConvolveParams * conv_params,int bd)797 void av1_highbd_convolve_2d_sr_intrabc_c(
798 const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
799 int h, const InterpFilterParams *filter_params_x,
800 const InterpFilterParams *filter_params_y, const int subpel_x_qn,
801 const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
802 const int bits =
803 FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
804 assert(bits >= 0);
805 assert(subpel_x_qn == 8);
806 assert(subpel_y_qn == 8);
807 assert(filter_params_x->taps == 2 && filter_params_y->taps == 2);
808 assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS);
809 (void)filter_params_x;
810 (void)subpel_x_qn;
811 (void)filter_params_y;
812 (void)subpel_y_qn;
813 (void)conv_params;
814
815 int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
816 int im_h = h + 1;
817 int im_stride = w;
818 assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);
819
820 // horizontal filter
821 // explicitly operate for subpel_x_qn = 8.
822 int16_t *im = im_block;
823 for (int y = 0; y < im_h; ++y) {
824 for (int x = 0; x < w; ++x) {
825 int32_t sum = (1 << (bd + FILTER_BITS - 1)) + 64 * (src[x] + src[x + 1]);
826 assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
827 sum = ROUND_POWER_OF_TWO(sum, conv_params->round_0);
828 im[x] = sum;
829 }
830 src += src_stride;
831 im += im_stride;
832 }
833
834 // vertical filter
835 // explicitly operate for subpel_y_qn = 8.
836 int16_t *src_vert = im_block;
837 const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
838 for (int y = 0; y < h; ++y) {
839 for (int x = 0; x < w; ++x) {
840 const int32_t sum =
841 (1 << offset_bits) + 64 * (src_vert[x] + src_vert[im_stride + x]);
842 assert(0 <= sum && sum < (1 << (offset_bits + 2)));
843 const int32_t res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) -
844 ((1 << (offset_bits - conv_params->round_1)) +
845 (1 << (offset_bits - conv_params->round_1 - 1)));
846
847 dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(res, bits), bd);
848 }
849 src_vert += im_stride;
850 dst += dst_stride;
851 }
852 }
853
854 // This function is exactly the same as av1_highbd_convolve_y_sr_c, and is an
855 // optimized version for intrabc.
av1_highbd_convolve_y_sr_intrabc_c(const uint16_t * src,int src_stride,uint16_t * dst,int dst_stride,int w,int h,const InterpFilterParams * filter_params_y,const int subpel_y_qn,int bd)856 void av1_highbd_convolve_y_sr_intrabc_c(
857 const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
858 int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn,
859 int bd) {
860 assert(subpel_y_qn == 8);
861 assert(filter_params_y->taps == 2);
862 (void)filter_params_y;
863 (void)subpel_y_qn;
864
865 // vertical filter
866 // explicitly operate for subpel_y_qn = 8.
867 for (int y = 0; y < h; ++y) {
868 for (int x = 0; x < w; ++x) {
869 const int32_t res = src[x] + src[src_stride + x];
870 dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(res, 1), bd);
871 }
872 src += src_stride;
873 dst += dst_stride;
874 }
875 }
876
877 // This function is exactly the same as av1_highbd_convolve_x_sr_c, and is an
878 // optimized version for intrabc.
av1_highbd_convolve_x_sr_intrabc_c(const uint16_t * src,int src_stride,uint16_t * dst,int dst_stride,int w,int h,const InterpFilterParams * filter_params_x,const int subpel_x_qn,ConvolveParams * conv_params,int bd)879 void av1_highbd_convolve_x_sr_intrabc_c(
880 const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
881 int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
882 ConvolveParams *conv_params, int bd) {
883 const int bits = FILTER_BITS - conv_params->round_0;
884 assert(bits >= 0);
885 assert(subpel_x_qn == 8);
886 assert(filter_params_x->taps == 2);
887 assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS);
888 (void)filter_params_x;
889 (void)subpel_x_qn;
890
891 // horizontal filter
892 // explicitly operate for subpel_x_qn = 8.
893 for (int y = 0; y < h; ++y) {
894 for (int x = 0; x < w; ++x) {
895 int32_t res = 64 * (src[x] + src[x + 1]);
896 res = ROUND_POWER_OF_TWO(res, conv_params->round_0);
897 dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(res, bits), bd);
898 }
899 src += src_stride;
900 dst += dst_stride;
901 }
902 }
903
// Compound high bit-depth separable 2D convolution (C reference).
// Pass 1 filters horizontally into im_block (rounded by conv_params->round_0).
// Pass 2 filters vertically and rounds by conv_params->round_1.
// If conv_params->do_average is zero, the rounded value is stored in the
// conv_params->dst buffer and dst is left untouched. Otherwise the value is
// combined with the one already in conv_params->dst -- weighted by
// fwd_offset / bck_offset when use_dist_wtd_comp_avg is set, or as
// (a + b) >> 1 otherwise -- the constant round_offset is subtracted, and the
// rounded, clip_pixel_highbd() result is written to dst.
void av1_highbd_dist_wtd_convolve_2d_c(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
  int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
  CONV_BUF_TYPE *const comp_buf = conv_params->dst;
  const int comp_stride = conv_params->dst_stride;
  const int taps_x = filter_params_x->taps;
  const int taps_y = filter_params_y->taps;
  const int cols_left = taps_x / 2 - 1;
  const int rows_above = taps_y / 2 - 1;
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  const int im_rows = h + taps_y - 1;
  const int im_stride = w;
  const int round_bits = 2 * FILTER_BITS - round_0 - round_1;
  assert(round_bits >= 0);

  // Pass 1: horizontal filter, starting rows_above rows above the block.
  const uint16_t *const src_top = src - rows_above * src_stride;
  const int16_t *const kernel_x = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);
  for (int row = 0; row < im_rows; ++row) {
    for (int col = 0; col < w; ++col) {
      const int first = row * src_stride + col - cols_left;
      int32_t sum = (1 << (bd + FILTER_BITS - 1));
      for (int tap = 0; tap < taps_x; ++tap) {
        sum += kernel_x[tap] * src_top[first + tap];
      }
      assert(taps_x > 8 || (0 <= sum && sum < (1 << (bd + FILTER_BITS + 1))));
      im_block[row * im_stride + col] =
          (int16_t)ROUND_POWER_OF_TWO(sum, round_0);
    }
  }

  // Pass 2: vertical filter. Output row r uses scratch rows r .. r+taps_y-1.
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  const int32_t round_offset =
      (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));
  const int16_t *const kernel_y = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);
  for (int row = 0; row < h; ++row) {
    for (int col = 0; col < w; ++col) {
      int32_t sum = 1 << offset_bits;
      for (int tap = 0; tap < taps_y; ++tap) {
        sum += kernel_y[tap] * im_block[(row + tap) * im_stride + col];
      }
      assert(taps_y > 8 || (0 <= sum && sum < (1 << (offset_bits + 2))));
      const CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, round_1);
      const int comp_idx = row * comp_stride + col;

      if (!conv_params->do_average) {
        // First prediction of the pair: keep the intermediate value only.
        comp_buf[comp_idx] = res;
        continue;
      }

      int32_t blended = comp_buf[comp_idx];
      if (conv_params->use_dist_wtd_comp_avg) {
        blended = (blended * conv_params->fwd_offset +
                   res * conv_params->bck_offset) >>
                  DIST_PRECISION_BITS;
      } else {
        blended = (blended + res) >> 1;
      }
      blended -= round_offset;
      dst[row * dst_stride + col] =
          clip_pixel_highbd(ROUND_POWER_OF_TWO(blended, round_bits), bd);
    }
  }
}
972
// Compound high bit-depth horizontal-only convolution (C reference).
// The filtered sum is rounded by conv_params->round_0, scaled by
// 1 << (FILTER_BITS - round_1) and biased by round_offset.
// If conv_params->do_average is zero the value is stored in conv_params->dst.
// Otherwise it is combined with the stored value (weighted by fwd_offset /
// bck_offset when use_dist_wtd_comp_avg is set, else (a + b) >> 1), the bias
// is removed, and the rounded, clip_pixel_highbd() result is written to dst.
void av1_highbd_dist_wtd_convolve_x_c(const uint16_t *src, int src_stride,
                                      uint16_t *dst, int dst_stride, int w,
                                      int h,
                                      const InterpFilterParams *filter_params_x,
                                      const int subpel_x_qn,
                                      ConvolveParams *conv_params, int bd) {
  CONV_BUF_TYPE *const comp_buf = conv_params->dst;
  const int comp_stride = conv_params->dst_stride;
  const int taps_x = filter_params_x->taps;
  const int cols_left = taps_x / 2 - 1;
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  const int bits = FILTER_BITS - round_1;
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  const int round_offset =
      (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));
  const int round_bits = 2 * FILTER_BITS - round_0 - round_1;
  assert(round_bits >= 0);
  assert(bits >= 0);

  const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);
  for (int row = 0; row < h; ++row) {
    for (int col = 0; col < w; ++col) {
      const int first = row * src_stride + col - cols_left;
      int32_t res = 0;
      for (int tap = 0; tap < taps_x; ++tap) {
        res += kernel[tap] * src[first + tap];
      }
      res = (1 << bits) * ROUND_POWER_OF_TWO(res, round_0);
      res += round_offset;

      const int comp_idx = row * comp_stride + col;
      if (!conv_params->do_average) {
        comp_buf[comp_idx] = res;
        continue;
      }

      int32_t blended = comp_buf[comp_idx];
      if (conv_params->use_dist_wtd_comp_avg) {
        blended = (blended * conv_params->fwd_offset +
                   res * conv_params->bck_offset) >>
                  DIST_PRECISION_BITS;
      } else {
        blended = (blended + res) >> 1;
      }
      blended -= round_offset;
      dst[row * dst_stride + col] =
          clip_pixel_highbd(ROUND_POWER_OF_TWO(blended, round_bits), bd);
    }
  }
}
1020
// Compound high bit-depth vertical-only convolution (C reference).
// The filtered sum is scaled by 1 << (FILTER_BITS - round_0), rounded by
// conv_params->round_1 and biased by round_offset.
// If conv_params->do_average is zero the value is stored in conv_params->dst.
// Otherwise it is combined with the stored value (weighted by fwd_offset /
// bck_offset when use_dist_wtd_comp_avg is set, else (a + b) >> 1), the bias
// is removed, and the rounded, clip_pixel_highbd() result is written to dst.
void av1_highbd_dist_wtd_convolve_y_c(const uint16_t *src, int src_stride,
                                      uint16_t *dst, int dst_stride, int w,
                                      int h,
                                      const InterpFilterParams *filter_params_y,
                                      const int subpel_y_qn,
                                      ConvolveParams *conv_params, int bd) {
  CONV_BUF_TYPE *const comp_buf = conv_params->dst;
  const int comp_stride = conv_params->dst_stride;
  const int taps_y = filter_params_y->taps;
  const int rows_above = taps_y / 2 - 1;
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  const int bits = FILTER_BITS - round_0;
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  const int round_offset =
      (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));
  const int round_bits = 2 * FILTER_BITS - round_0 - round_1;
  assert(round_bits >= 0);
  assert(bits >= 0);

  const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);
  for (int row = 0; row < h; ++row) {
    // Offset (in samples) of the top-most tap for this output row.
    const int top_offset = (row - rows_above) * src_stride;
    for (int col = 0; col < w; ++col) {
      int32_t res = 0;
      for (int tap = 0; tap < taps_y; ++tap) {
        res += kernel[tap] * src[top_offset + tap * src_stride + col];
      }
      res *= (1 << bits);
      res = ROUND_POWER_OF_TWO(res, round_1) + round_offset;

      const int comp_idx = row * comp_stride + col;
      if (!conv_params->do_average) {
        comp_buf[comp_idx] = res;
        continue;
      }

      int32_t blended = comp_buf[comp_idx];
      if (conv_params->use_dist_wtd_comp_avg) {
        blended = (blended * conv_params->fwd_offset +
                   res * conv_params->bck_offset) >>
                  DIST_PRECISION_BITS;
      } else {
        blended = (blended + res) >> 1;
      }
      blended -= round_offset;
      dst[row * dst_stride + col] =
          clip_pixel_highbd(ROUND_POWER_OF_TWO(blended, round_bits), bd);
    }
  }
}
1068
// Compound high bit-depth "copy" path (no filtering, C reference).
// Each source sample is shifted left by
// 2 * FILTER_BITS - round_0 - round_1 and biased by round_offset, with the
// result held in a CONV_BUF_TYPE.
// If conv_params->do_average is zero the value is stored in conv_params->dst.
// Otherwise it is combined with the stored value (weighted by fwd_offset /
// bck_offset when use_dist_wtd_comp_avg is set, else (a + b) >> 1), the bias
// is removed, and the rounded, clip_pixel_highbd() result is written to dst.
void av1_highbd_dist_wtd_convolve_2d_copy_c(const uint16_t *src, int src_stride,
                                            uint16_t *dst, int dst_stride,
                                            int w, int h,
                                            ConvolveParams *conv_params,
                                            int bd) {
  CONV_BUF_TYPE *const comp_buf = conv_params->dst;
  const int comp_stride = conv_params->dst_stride;
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  const int bits = FILTER_BITS * 2 - round_1 - round_0;
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  const int round_offset =
      (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));
  assert(bits >= 0);

  for (int row = 0; row < h; ++row) {
    for (int col = 0; col < w; ++col) {
      // Both steps narrow to CONV_BUF_TYPE, as the accumulator type requires.
      CONV_BUF_TYPE res = (CONV_BUF_TYPE)(src[row * src_stride + col] << bits);
      res = (CONV_BUF_TYPE)(res + round_offset);

      const int comp_idx = row * comp_stride + col;
      if (!conv_params->do_average) {
        comp_buf[comp_idx] = res;
        continue;
      }

      int32_t blended = comp_buf[comp_idx];
      if (conv_params->use_dist_wtd_comp_avg) {
        blended = (blended * conv_params->fwd_offset +
                   res * conv_params->bck_offset) >>
                  DIST_PRECISION_BITS;
      } else {
        blended = (blended + res) >> 1;
      }
      blended -= round_offset;
      dst[row * dst_stride + col] =
          clip_pixel_highbd(ROUND_POWER_OF_TWO(blended, bits), bd);
    }
  }
}
1105
// High bit-depth separable 2D convolution with scaling (C reference).
// The source position advances by x_step_qn / y_step_qn per output sample.
// The integer part (position >> SCALE_SUBPEL_BITS) selects the source sample
// and the fractional part selects the kernel, per sample.
// Pass 1 filters horizontally into im_block (rounded by conv_params->round_0).
// Pass 2 walks the scratch buffer column by column, filters vertically and
// rounds by conv_params->round_1. The result is then handled in one of
// three ways:
//   - compound, do_average == 0: stored in conv_params->dst;
//   - compound, do_average != 0: combined with the stored value, then written
//     to dst;
//   - not compound: written to dst directly.
// Values written to dst have round_offset removed, are rounded by `bits` and
// passed through clip_pixel_highbd().
void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride,
                                    uint16_t *dst, int dst_stride, int w, int h,
                                    const InterpFilterParams *filter_params_x,
                                    const InterpFilterParams *filter_params_y,
                                    const int subpel_x_qn, const int x_step_qn,
                                    const int subpel_y_qn, const int y_step_qn,
                                    ConvolveParams *conv_params, int bd) {
  int16_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE];
  const int taps_x = filter_params_x->taps;
  const int taps_y = filter_params_y->taps;
  const int cols_left = taps_x / 2 - 1;
  const int rows_above = taps_y / 2 - 1;
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  const int im_rows =
      (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) + taps_y;
  const int im_stride = w;
  CONV_BUF_TYPE *const comp_buf = conv_params->dst;
  const int comp_stride = conv_params->dst_stride;
  const int bits = FILTER_BITS * 2 - round_0 - round_1;
  assert(bits >= 0);

  // Pass 1: horizontal filter, starting rows_above rows above the block.
  const uint16_t *const src_top = src - rows_above * src_stride;
  for (int row = 0; row < im_rows; ++row) {
    const int row_base = row * src_stride;
    int x_qn = subpel_x_qn;
    for (int col = 0; col < w; ++col, x_qn += x_step_qn) {
      const int first = row_base + (x_qn >> SCALE_SUBPEL_BITS) - cols_left;
      const int kernel_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
      assert(kernel_idx < SUBPEL_SHIFTS);
      const int16_t *const kernel =
          av1_get_interp_filter_subpel_kernel(filter_params_x, kernel_idx);
      int32_t sum = (1 << (bd + FILTER_BITS - 1));
      for (int tap = 0; tap < taps_x; ++tap) {
        sum += kernel[tap] * src_top[first + tap];
      }
      assert(taps_x > 8 || (0 <= sum && sum < (1 << (bd + FILTER_BITS + 1))));
      im_block[row * im_stride + col] =
          (int16_t)ROUND_POWER_OF_TWO(sum, round_0);
    }
  }

  // Pass 2: vertical filter, one column at a time.
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  const int32_t round_offset =
      (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));
  for (int col = 0; col < w; ++col) {
    int y_qn = subpel_y_qn;
    for (int row = 0; row < h; ++row, y_qn += y_step_qn) {
      // Scratch row holding the first tap for this output sample.
      const int top_row = y_qn >> SCALE_SUBPEL_BITS;
      const int kernel_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
      assert(kernel_idx < SUBPEL_SHIFTS);
      const int16_t *const kernel =
          av1_get_interp_filter_subpel_kernel(filter_params_y, kernel_idx);
      int32_t sum = 1 << offset_bits;
      for (int tap = 0; tap < taps_y; ++tap) {
        sum += kernel[tap] * im_block[(top_row + tap) * im_stride + col];
      }
      assert(taps_y > 8 || (0 <= sum && sum < (1 << (offset_bits + 2))));
      const CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, round_1);

      if (conv_params->is_compound && !conv_params->do_average) {
        comp_buf[row * comp_stride + col] = res;
        continue;
      }

      int32_t value = res;
      if (conv_params->is_compound) {
        value = comp_buf[row * comp_stride + col];
        if (conv_params->use_dist_wtd_comp_avg) {
          value = (value * conv_params->fwd_offset +
                   res * conv_params->bck_offset) >>
                  DIST_PRECISION_BITS;
        } else {
          value = (value + res) >> 1;
        }
      }
      /* Subtract round offset and convolve round */
      value -= round_offset;
      dst[row * dst_stride + col] =
          clip_pixel_highbd(ROUND_POWER_OF_TWO(value, bits), bd);
    }
  }
}
1193
// Compound, unscaled dispatch: picks the copy, x-only, y-only or full 2D
// dist_wtd kernel according to which of subpel_x_qn / subpel_y_qn is non-zero.
static void highbd_convolve_2d_facade_compound(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride,
    const int w, const int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
  const bool filter_horiz = subpel_x_qn != 0;
  const bool filter_vert = subpel_y_qn != 0;

  if (filter_horiz) {
    if (filter_vert) {
      av1_highbd_dist_wtd_convolve_2d(src, src_stride, dst, dst_stride, w, h,
                                      filter_params_x, filter_params_y,
                                      subpel_x_qn, subpel_y_qn, conv_params,
                                      bd);
    } else {
      av1_highbd_dist_wtd_convolve_x(src, src_stride, dst, dst_stride, w, h,
                                     filter_params_x, subpel_x_qn, conv_params,
                                     bd);
    }
  } else if (filter_vert) {
    av1_highbd_dist_wtd_convolve_y(src, src_stride, dst, dst_stride, w, h,
                                   filter_params_y, subpel_y_qn, conv_params,
                                   bd);
  } else {
    av1_highbd_dist_wtd_convolve_2d_copy(src, src_stride, dst, dst_stride, w, h,
                                         conv_params, bd);
  }
}
1219
// Single-reference, unscaled dispatch: picks the copy, x-only, y-only or full
// 2D kernel according to which of subpel_x_qn / subpel_y_qn is non-zero.
static void highbd_convolve_2d_facade_single(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride,
    const int w, const int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
  const bool filter_horiz = subpel_x_qn != 0;
  const bool filter_vert = subpel_y_qn != 0;

  if (filter_horiz) {
    if (filter_vert) {
      av1_highbd_convolve_2d_sr(src, src_stride, dst, dst_stride, w, h,
                                filter_params_x, filter_params_y, subpel_x_qn,
                                subpel_y_qn, conv_params, bd);
    } else {
      av1_highbd_convolve_x_sr(src, src_stride, dst, dst_stride, w, h,
                               filter_params_x, subpel_x_qn, conv_params, bd);
    }
  } else if (filter_vert) {
    av1_highbd_convolve_y_sr(src, src_stride, dst, dst_stride, w, h,
                             filter_params_y, subpel_y_qn, bd);
  } else {
    aom_highbd_convolve_copy(src, src_stride, dst, dst_stride, w, h);
  }
}
1243
// Top-level high bit-depth convolution entry point.
// src8 / dst8 are converted with CONVERT_TO_SHORTPTR before use.
// interp_filters[0] is the horizontal filter and interp_filters[1] the
// vertical one.
// Dispatch order:
//   1. 2-tap filters (IntraBC) with a non-zero subpel offset go to the
//      dedicated *_intrabc_c kernels.
//   2. `scaled` goes to av1_highbd_convolve_2d_scale.
//   3. Otherwise the compound or single-reference facade is used, depending on
//      conv_params->is_compound.
// A 2-tap filter with both subpel offsets zero falls through to steps 2-3.
void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride,
                                   uint8_t *dst8, int dst_stride, int w, int h,
                                   const InterpFilterParams *interp_filters[2],
                                   const int subpel_x_qn, int x_step_q4,
                                   const int subpel_y_qn, int y_step_q4,
                                   int scaled, ConvolveParams *conv_params,
                                   int bd) {
  (void)x_step_q4;
  (void)y_step_q4;
  (void)dst_stride;

  const uint16_t *const src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *const dst = CONVERT_TO_SHORTPTR(dst8);
  const InterpFilterParams *const filter_params_x = interp_filters[0];
  const InterpFilterParams *const filter_params_y = interp_filters[1];

  // 2-tap filter indicates that it is for IntraBC.
  if (filter_params_x->taps == 2 || filter_params_y->taps == 2) {
    assert(filter_params_x->taps == 2 && filter_params_y->taps == 2);
    assert(!scaled);
    if (subpel_x_qn && subpel_y_qn) {
      av1_highbd_convolve_2d_sr_intrabc_c(
          src, src_stride, dst, dst_stride, w, h, filter_params_x,
          filter_params_y, subpel_x_qn, subpel_y_qn, conv_params, bd);
      return;
    }
    if (subpel_x_qn) {
      av1_highbd_convolve_x_sr_intrabc_c(src, src_stride, dst, dst_stride, w, h,
                                         filter_params_x, subpel_x_qn,
                                         conv_params, bd);
      return;
    }
    if (subpel_y_qn) {
      av1_highbd_convolve_y_sr_intrabc_c(src, src_stride, dst, dst_stride, w, h,
                                         filter_params_y, subpel_y_qn, bd);
      return;
    }
  }

  if (scaled) {
    // The compound path stores into conv_params->dst, so it must be set.
    assert(!conv_params->is_compound || conv_params->dst != NULL);
    av1_highbd_convolve_2d_scale(src, src_stride, dst, dst_stride, w, h,
                                 filter_params_x, filter_params_y, subpel_x_qn,
                                 x_step_q4, subpel_y_qn, y_step_q4, conv_params,
                                 bd);
    return;
  }

  if (conv_params->is_compound) {
    highbd_convolve_2d_facade_compound(
        src, src_stride, dst, dst_stride, w, h, filter_params_x,
        filter_params_y, subpel_x_qn, subpel_y_qn, conv_params, bd);
  } else {
    highbd_convolve_2d_facade_single(src, src_stride, dst, dst_stride, w, h,
                                     filter_params_x, filter_params_y,
                                     subpel_x_qn, subpel_y_qn, conv_params, bd);
  }
}
1299 #endif // CONFIG_AV1_HIGHBITDEPTH
1300
1301 // Note: Fixed size intermediate buffers, place limits on parameters
1302 // of some functions. 2d filtering proceeds in 2 steps:
1303 // (1) Interpolate horizontally into an intermediate buffer, temp.
1304 // (2) Interpolate temp vertically to derive the sub-pixel result.
// Deriving the maximum number of rows in the temp buffer (263):
// --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
// --Largest block size is 128x128 pixels.
// --128 rows in the downscaled frame span a distance of (128 - 1) * 32 in the
//   original frame (in 1/16th pixel units).
// --Must round-up because block may be located at sub-pixel position.
// --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
// --ceil(((128 - 1) * 32 + 15) / 16) + 8 = 255 + 8 = 263.
1313 #define WIENER_MAX_EXT_SIZE 263
1314
horz_scalar_product(const uint8_t * a,const int16_t * b)1315 static INLINE int horz_scalar_product(const uint8_t *a, const int16_t *b) {
1316 int sum = 0;
1317 for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k];
1318 return sum;
1319 }
1320
1321 #if CONFIG_AV1_HIGHBITDEPTH
highbd_horz_scalar_product(const uint16_t * a,const int16_t * b)1322 static INLINE int highbd_horz_scalar_product(const uint16_t *a,
1323 const int16_t *b) {
1324 int sum = 0;
1325 for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k];
1326 return sum;
1327 }
1328 #endif
1329
highbd_vert_scalar_product(const uint16_t * a,ptrdiff_t a_stride,const int16_t * b)1330 static INLINE int highbd_vert_scalar_product(const uint16_t *a,
1331 ptrdiff_t a_stride,
1332 const int16_t *b) {
1333 int sum = 0;
1334 for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k * a_stride] * b[k];
1335 return sum;
1336 }
1337
get_filter_base(const int16_t * filter)1338 static const InterpKernel *get_filter_base(const int16_t *filter) {
1339 // NOTE: This assumes that the filter table is 256-byte aligned.
1340 // TODO(agrange) Modify to make independent of table alignment.
1341 return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF));
1342 }
1343
// Returns the index of kernel f within the table that starts at base, i.e.
// the pointer difference measured in InterpKernel units.
static int get_filter_offset(const int16_t *f, const InterpKernel *base) {
  const InterpKernel *const kernel = (const InterpKernel *)(intptr_t)f;
  return (int)(kernel - base);
}
1347
convolve_add_src_horiz_hip(const uint8_t * src,ptrdiff_t src_stride,uint16_t * dst,ptrdiff_t dst_stride,const InterpKernel * x_filters,int x0_q4,int x_step_q4,int w,int h,int round0_bits)1348 static void convolve_add_src_horiz_hip(const uint8_t *src, ptrdiff_t src_stride,
1349 uint16_t *dst, ptrdiff_t dst_stride,
1350 const InterpKernel *x_filters, int x0_q4,
1351 int x_step_q4, int w, int h,
1352 int round0_bits) {
1353 const int bd = 8;
1354 src -= SUBPEL_TAPS / 2 - 1;
1355 for (int y = 0; y < h; ++y) {
1356 int x_q4 = x0_q4;
1357 for (int x = 0; x < w; ++x) {
1358 const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
1359 const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
1360 const int rounding = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) +
1361 (1 << (bd + FILTER_BITS - 1));
1362 const int sum = horz_scalar_product(src_x, x_filter) + rounding;
1363 dst[x] = (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, round0_bits), 0,
1364 WIENER_CLAMP_LIMIT(round0_bits, bd) - 1);
1365 x_q4 += x_step_q4;
1366 }
1367 src += src_stride;
1368 dst += dst_stride;
1369 }
1370 }
1371
convolve_add_src_vert_hip(const uint16_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const InterpKernel * y_filters,int y0_q4,int y_step_q4,int w,int h,int round1_bits)1372 static void convolve_add_src_vert_hip(const uint16_t *src, ptrdiff_t src_stride,
1373 uint8_t *dst, ptrdiff_t dst_stride,
1374 const InterpKernel *y_filters, int y0_q4,
1375 int y_step_q4, int w, int h,
1376 int round1_bits) {
1377 const int bd = 8;
1378 src -= src_stride * (SUBPEL_TAPS / 2 - 1);
1379
1380 for (int x = 0; x < w; ++x) {
1381 int y_q4 = y0_q4;
1382 for (int y = 0; y < h; ++y) {
1383 const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
1384 const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
1385 const int rounding =
1386 ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) -
1387 (1 << (bd + round1_bits - 1));
1388 const int sum =
1389 highbd_vert_scalar_product(src_y, src_stride, y_filter) + rounding;
1390 dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, round1_bits));
1391 y_q4 += y_step_q4;
1392 }
1393 ++src;
1394 ++dst;
1395 }
1396 }
1397
av1_wiener_convolve_add_src_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h,const WienerConvolveParams * conv_params)1398 void av1_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride,
1399 uint8_t *dst, ptrdiff_t dst_stride,
1400 const int16_t *filter_x, int x_step_q4,
1401 const int16_t *filter_y, int y_step_q4,
1402 int w, int h,
1403 const WienerConvolveParams *conv_params) {
1404 const InterpKernel *const filters_x = get_filter_base(filter_x);
1405 const int x0_q4 = get_filter_offset(filter_x, filters_x);
1406
1407 const InterpKernel *const filters_y = get_filter_base(filter_y);
1408 const int y0_q4 = get_filter_offset(filter_y, filters_y);
1409
1410 uint16_t temp[WIENER_MAX_EXT_SIZE * MAX_SB_SIZE];
1411 const int intermediate_height =
1412 (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS - 1;
1413 memset(temp + (intermediate_height * MAX_SB_SIZE), 0, MAX_SB_SIZE);
1414
1415 assert(w <= MAX_SB_SIZE);
1416 assert(h <= MAX_SB_SIZE);
1417 assert(y_step_q4 <= 32);
1418 assert(x_step_q4 <= 32);
1419
1420 convolve_add_src_horiz_hip(src - src_stride * (SUBPEL_TAPS / 2 - 1),
1421 src_stride, temp, MAX_SB_SIZE, filters_x, x0_q4,
1422 x_step_q4, w, intermediate_height,
1423 conv_params->round_0);
1424 convolve_add_src_vert_hip(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
1425 MAX_SB_SIZE, dst, dst_stride, filters_y, y0_q4,
1426 y_step_q4, w, h, conv_params->round_1);
1427 }
1428
1429 #if CONFIG_AV1_HIGHBITDEPTH
highbd_convolve_add_src_horiz_hip(const uint8_t * src8,ptrdiff_t src_stride,uint16_t * dst,ptrdiff_t dst_stride,const InterpKernel * x_filters,int x0_q4,int x_step_q4,int w,int h,int round0_bits,int bd)1430 static void highbd_convolve_add_src_horiz_hip(
1431 const uint8_t *src8, ptrdiff_t src_stride, uint16_t *dst,
1432 ptrdiff_t dst_stride, const InterpKernel *x_filters, int x0_q4,
1433 int x_step_q4, int w, int h, int round0_bits, int bd) {
1434 const int extraprec_clamp_limit = WIENER_CLAMP_LIMIT(round0_bits, bd);
1435 uint16_t *src = CONVERT_TO_SHORTPTR(src8);
1436 src -= SUBPEL_TAPS / 2 - 1;
1437 for (int y = 0; y < h; ++y) {
1438 int x_q4 = x0_q4;
1439 for (int x = 0; x < w; ++x) {
1440 const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
1441 const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
1442 const int rounding = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) +
1443 (1 << (bd + FILTER_BITS - 1));
1444 const int sum = highbd_horz_scalar_product(src_x, x_filter) + rounding;
1445 dst[x] = (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, round0_bits), 0,
1446 extraprec_clamp_limit - 1);
1447 x_q4 += x_step_q4;
1448 }
1449 src += src_stride;
1450 dst += dst_stride;
1451 }
1452 }
1453
highbd_convolve_add_src_vert_hip(const uint16_t * src,ptrdiff_t src_stride,uint8_t * dst8,ptrdiff_t dst_stride,const InterpKernel * y_filters,int y0_q4,int y_step_q4,int w,int h,int round1_bits,int bd)1454 static void highbd_convolve_add_src_vert_hip(
1455 const uint16_t *src, ptrdiff_t src_stride, uint8_t *dst8,
1456 ptrdiff_t dst_stride, const InterpKernel *y_filters, int y0_q4,
1457 int y_step_q4, int w, int h, int round1_bits, int bd) {
1458 uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
1459 src -= src_stride * (SUBPEL_TAPS / 2 - 1);
1460 for (int x = 0; x < w; ++x) {
1461 int y_q4 = y0_q4;
1462 for (int y = 0; y < h; ++y) {
1463 const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
1464 const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
1465 const int rounding =
1466 ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) -
1467 (1 << (bd + round1_bits - 1));
1468 const int sum =
1469 highbd_vert_scalar_product(src_y, src_stride, y_filter) + rounding;
1470 dst[y * dst_stride] =
1471 clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, round1_bits), bd);
1472 y_q4 += y_step_q4;
1473 }
1474 ++src;
1475 ++dst;
1476 }
1477 }
1478
av1_highbd_wiener_convolve_add_src_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h,const WienerConvolveParams * conv_params,int bd)1479 void av1_highbd_wiener_convolve_add_src_c(
1480 const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
1481 ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
1482 const int16_t *filter_y, int y_step_q4, int w, int h,
1483 const WienerConvolveParams *conv_params, int bd) {
1484 const InterpKernel *const filters_x = get_filter_base(filter_x);
1485 const int x0_q4 = get_filter_offset(filter_x, filters_x);
1486
1487 const InterpKernel *const filters_y = get_filter_base(filter_y);
1488 const int y0_q4 = get_filter_offset(filter_y, filters_y);
1489
1490 uint16_t temp[WIENER_MAX_EXT_SIZE * MAX_SB_SIZE];
1491 const int intermediate_height =
1492 (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
1493
1494 assert(w <= MAX_SB_SIZE);
1495 assert(h <= MAX_SB_SIZE);
1496 assert(y_step_q4 <= 32);
1497 assert(x_step_q4 <= 32);
1498 assert(bd + FILTER_BITS - conv_params->round_0 + 2 <= 16);
1499
1500 highbd_convolve_add_src_horiz_hip(src - src_stride * (SUBPEL_TAPS / 2 - 1),
1501 src_stride, temp, MAX_SB_SIZE, filters_x,
1502 x0_q4, x_step_q4, w, intermediate_height,
1503 conv_params->round_0, bd);
1504 highbd_convolve_add_src_vert_hip(
1505 temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE, dst, dst_stride,
1506 filters_y, y0_q4, y_step_q4, w, h, conv_params->round_1, bd);
1507 }
1508 #endif // CONFIG_AV1_HIGHBITDEPTH
1509