/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
#include <assert.h>
#include <stdlib.h>
#include <string.h>

#include "config/aom_config.h"
#include "config/aom_dsp_rtcd.h"
#include "config/av1_rtcd.h"

#include "aom/aom_integer.h"
#include "aom_ports/mem.h"

#include "aom_dsp/aom_filter.h"
#include "aom_dsp/blend.h"
#include "aom_dsp/variance.h"

#include "av1/common/av1_common_int.h"
#include "av1/common/filter.h"
#include "av1/common/reconinter.h"
#include "av1/encoder/reconinter_enc.h"

uint32_t aom_get4x4sse_cs_c(const uint8_t *a, int a_stride, const uint8_t *b,
                            int b_stride) {
  int distortion = 0;
  int r, c;

  for (r = 0; r < 4; ++r) {
    for (c = 0; c < 4; ++c) {
      int diff = a[c] - b[c];
      distortion += diff * diff;
    }

    a += a_stride;
    b += b_stride;
  }

  return distortion;
}

uint32_t aom_get_mb_ss_c(const int16_t *a) {
  unsigned int i, sum = 0;

  for (i = 0; i < 256; ++i) {
    sum += a[i] * a[i];
  }

  return sum;
}

static void variance(const uint8_t *a, int a_stride, const uint8_t *b,
                     int b_stride, int w, int h, uint32_t *sse, int *sum) {
  int i, j;

  *sum = 0;
  *sse = 0;

  for (i = 0; i < h; ++i) {
    for (j = 0; j < w; ++j) {
      const int diff = a[j] - b[j];
      *sum += diff;
      *sse += diff * diff;
    }

    a += a_stride;
    b += b_stride;
  }
}

uint32_t aom_sse_odd_size(const uint8_t *a, int a_stride, const uint8_t *b,
                          int b_stride, int w, int h) {
  uint32_t sse;
  int sum;
  variance(a, a_stride, b, b_stride, w, h, &sse, &sum);
  return sse;
}

// Applies a 1-D 2-tap bilinear filter to the source block in either horizontal
// or vertical direction to produce the filtered output block. Used to
// implement the first pass of the 2-D separable filter.
//
// Produces uint16_t output to retain precision for the next pass. Two filter
// taps should sum to FILTER_WEIGHT. pixel_step defines whether the filter is
// applied horizontally (pixel_step = 1) or vertically (pixel_step = stride).
// It defines the offset required to move from one input to the next.
void aom_var_filter_block2d_bil_first_pass_c(const uint8_t *a, uint16_t *b,
                                             unsigned int src_pixels_per_line,
                                             unsigned int pixel_step,
                                             unsigned int output_height,
                                             unsigned int output_width,
                                             const uint8_t *filter) {
  unsigned int i, j;

  for (i = 0; i < output_height; ++i) {
    for (j = 0; j < output_width; ++j) {
      b[j] = ROUND_POWER_OF_TWO(
          (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS);

      ++a;
    }

    a += src_pixels_per_line - output_width;
    b += output_width;
  }
}
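
// Worked example (a sketch, assuming the bilinear_filters_2t table from
// aom_dsp/aom_filter.h with FILTER_BITS == 7 and 2-tap kernels summing to
// 128): with xoffset = 4 the kernel is { 64, 64 }, so each output is the
// rounded average of two neighboring inputs, e.g. for inputs 100 and 104:
//   ROUND_POWER_OF_TWO(100 * 64 + 104 * 64, 7) = (13056 + 64) >> 7 = 102.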

// Applies a 1-D 2-tap bilinear filter to the source block in either horizontal
// or vertical direction to produce the filtered output block. Used to
// implement the second pass of the 2-D separable filter.
//
// Requires 16-bit input as produced by filter_block2d_bil_first_pass. Two
// filter taps should sum to FILTER_WEIGHT. pixel_step defines whether the
// filter is applied horizontally (pixel_step = 1) or vertically
// (pixel_step = stride). It defines the offset required to move from one input
// to the next. Output is 8-bit.
void aom_var_filter_block2d_bil_second_pass_c(const uint16_t *a, uint8_t *b,
                                              unsigned int src_pixels_per_line,
                                              unsigned int pixel_step,
                                              unsigned int output_height,
                                              unsigned int output_width,
                                              const uint8_t *filter) {
  unsigned int i, j;

  for (i = 0; i < output_height; ++i) {
    for (j = 0; j < output_width; ++j) {
      b[j] = ROUND_POWER_OF_TWO(
          (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS);
      ++a;
    }

    a += src_pixels_per_line - output_width;
    b += output_width;
  }
}

#define VAR(W, H) \
  uint32_t aom_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
                                     const uint8_t *b, int b_stride, \
                                     uint32_t *sse) { \
    int sum; \
    variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
    return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \
  }
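
// The return expression above is the one-pass variance identity
// Var = SSE - Sum^2 / N. The cross term is computed in 64 bits because |Sum|
// can reach 255 * 128 * 128 for a 128x128 block, whose square exceeds the
// 32-bit range. Worked example for diffs {1, -1, 2, 0} over a 2x2 block:
// sum = 2, sse = 6, variance = 6 - (2 * 2) / 4 = 5.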

#define SUBPIX_VAR(W, H) \
  uint32_t aom_sub_pixel_variance##W##x##H##_c( \
      const uint8_t *a, int a_stride, int xoffset, int yoffset, \
      const uint8_t *b, int b_stride, uint32_t *sse) { \
    uint16_t fdata3[(H + 1) * W]; \
    uint8_t temp2[H * W]; \
\
    aom_var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \
                                            bilinear_filters_2t[xoffset]); \
    aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \
                                             bilinear_filters_2t[yoffset]); \
\
    return aom_variance##W##x##H##_c(temp2, W, b, b_stride, sse); \
  }
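
// Note the (H + 1) rows in the intermediate fdata3 buffer: the horizontal
// first pass produces one extra row so that the vertical second pass
// (pixel_step = W) has both filter taps available for all H output rows.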

#define SUBPIX_AVG_VAR(W, H) \
  uint32_t aom_sub_pixel_avg_variance##W##x##H##_c( \
      const uint8_t *a, int a_stride, int xoffset, int yoffset, \
      const uint8_t *b, int b_stride, uint32_t *sse, \
      const uint8_t *second_pred) { \
    uint16_t fdata3[(H + 1) * W]; \
    uint8_t temp2[H * W]; \
    DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \
\
    aom_var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \
                                            bilinear_filters_2t[xoffset]); \
    aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \
                                             bilinear_filters_2t[yoffset]); \
\
    aom_comp_avg_pred(temp3, second_pred, W, H, temp2, W); \
\
    return aom_variance##W##x##H##_c(temp3, W, b, b_stride, sse); \
  } \
  uint32_t aom_dist_wtd_sub_pixel_avg_variance##W##x##H##_c( \
      const uint8_t *a, int a_stride, int xoffset, int yoffset, \
      const uint8_t *b, int b_stride, uint32_t *sse, \
      const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \
    uint16_t fdata3[(H + 1) * W]; \
    uint8_t temp2[H * W]; \
    DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \
\
    aom_var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \
                                            bilinear_filters_2t[xoffset]); \
    aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \
                                             bilinear_filters_2t[yoffset]); \
\
    aom_dist_wtd_comp_avg_pred(temp3, second_pred, W, H, temp2, W, jcp_param); \
\
    return aom_variance##W##x##H(temp3, W, b, b_stride, sse); \
  }

/* Identical to the variance call except it takes an additional parameter, sum,
 * and returns that value using pass-by-reference instead of returning
 * sse - sum^2 / (w * h).
 */
#define GET_VAR(W, H) \
  void aom_get##W##x##H##var_c(const uint8_t *a, int a_stride, \
                               const uint8_t *b, int b_stride, uint32_t *sse, \
                               int *sum) { \
    variance(a, a_stride, b, b_stride, W, H, sse, sum); \
  }

/* Identical to the variance call except it does not calculate
 * sse - sum^2 / (w * h) and returns sse in addition to modifying the passed-in
 * variable.
 */
#define MSE(W, H) \
  uint32_t aom_mse##W##x##H##_c(const uint8_t *a, int a_stride, \
                                const uint8_t *b, int b_stride, \
                                uint32_t *sse) { \
    int sum; \
    variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
    return *sse; \
  }

/* All three forms of the variance are available in the same sizes. */
#define VARIANCES(W, H) \
  VAR(W, H) \
  SUBPIX_VAR(W, H) \
  SUBPIX_AVG_VAR(W, H)

VARIANCES(128, 128)
VARIANCES(128, 64)
VARIANCES(64, 128)
VARIANCES(64, 64)
VARIANCES(64, 32)
VARIANCES(32, 64)
VARIANCES(32, 32)
VARIANCES(32, 16)
VARIANCES(16, 32)
VARIANCES(16, 16)
VARIANCES(16, 8)
VARIANCES(8, 16)
VARIANCES(8, 8)
VARIANCES(8, 4)
VARIANCES(4, 8)
VARIANCES(4, 4)
VARIANCES(4, 2)
VARIANCES(2, 4)
VARIANCES(2, 2)
VARIANCES(4, 16)
VARIANCES(16, 4)
VARIANCES(8, 32)
VARIANCES(32, 8)
VARIANCES(16, 64)
VARIANCES(64, 16)

GET_VAR(16, 16)
GET_VAR(8, 8)

MSE(16, 16)
MSE(16, 8)
MSE(8, 16)
MSE(8, 8)

void aom_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
                         int height, const uint8_t *ref, int ref_stride) {
  int i, j;

  for (i = 0; i < height; ++i) {
    for (j = 0; j < width; ++j) {
      const int tmp = pred[j] + ref[j];
      comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
    }
    comp_pred += width;
    pred += width;
    ref += ref_stride;
  }
}
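
// ROUND_POWER_OF_TWO(x, 1) is (x + 1) >> 1, so the compound average above
// rounds halves up: pred = 10, ref = 13 gives (23 + 1) >> 1 = 12.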

// Get pred block from up-sampled reference.
void aom_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm,
                          int mi_row, int mi_col, const MV *const mv,
                          uint8_t *comp_pred, int width, int height,
                          int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
                          int ref_stride, int subpel_search) {
  // expect xd == NULL only in tests
  if (xd != NULL) {
    const MB_MODE_INFO *mi = xd->mi[0];
    const int ref_num = 0;
    const int is_intrabc = is_intrabc_block(mi);
    const struct scale_factors *const sf =
        is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num];
    const int is_scaled = av1_is_scaled(sf);

    if (is_scaled) {
      int plane = 0;
      const int mi_x = mi_col * MI_SIZE;
      const int mi_y = mi_row * MI_SIZE;
      const struct macroblockd_plane *const pd = &xd->plane[plane];
      const struct buf_2d *const dst_buf = &pd->dst;
      const struct buf_2d *const pre_buf =
          is_intrabc ? dst_buf : &pd->pre[ref_num];

      InterPredParams inter_pred_params;
      inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd);
      const int_interpfilters filters =
          av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
      av1_init_inter_params(
          &inter_pred_params, width, height, mi_y >> pd->subsampling_y,
          mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y,
          xd->bd, is_cur_buf_hbd(xd), is_intrabc, sf, pre_buf, filters);
      av1_enc_build_one_inter_predictor(comp_pred, width, mv,
                                        &inter_pred_params);
      return;
    }
  }

  const InterpFilterParams *filter = av1_get_filter(subpel_search);

  if (!subpel_x_q3 && !subpel_y_q3) {
    for (int i = 0; i < height; i++) {
      memcpy(comp_pred, ref, width * sizeof(*comp_pred));
      comp_pred += width;
      ref += ref_stride;
    }
  } else if (!subpel_y_q3) {
    const int16_t *const kernel =
        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
    aom_convolve8_horiz_c(ref, ref_stride, comp_pred, width, kernel, 16, NULL,
                          -1, width, height);
  } else if (!subpel_x_q3) {
    const int16_t *const kernel =
        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
    aom_convolve8_vert_c(ref, ref_stride, comp_pred, width, NULL, -1, kernel,
                         16, width, height);
  } else {
    DECLARE_ALIGNED(16, uint8_t,
                    temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]);
    const int16_t *const kernel_x =
        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
    const int16_t *const kernel_y =
        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
    const int intermediate_height =
        (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps;
    assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
    aom_convolve8_horiz_c(ref - ref_stride * ((filter->taps >> 1) - 1),
                          ref_stride, temp, MAX_SB_SIZE, kernel_x, 16, NULL, -1,
                          width, intermediate_height);
    aom_convolve8_vert_c(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1),
                         MAX_SB_SIZE, comp_pred, width, NULL, -1, kernel_y, 16,
                         width, height);
  }
}

void aom_comp_avg_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm,
                                   int mi_row, int mi_col, const MV *const mv,
                                   uint8_t *comp_pred, const uint8_t *pred,
                                   int width, int height, int subpel_x_q3,
                                   int subpel_y_q3, const uint8_t *ref,
                                   int ref_stride, int subpel_search) {
  int i, j;

  aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
                     subpel_x_q3, subpel_y_q3, ref, ref_stride, subpel_search);
  for (i = 0; i < height; i++) {
    for (j = 0; j < width; j++) {
      comp_pred[j] = ROUND_POWER_OF_TWO(comp_pred[j] + pred[j], 1);
    }
    comp_pred += width;
    pred += width;
  }
}

void aom_dist_wtd_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred,
                                  int width, int height, const uint8_t *ref,
                                  int ref_stride,
                                  const DIST_WTD_COMP_PARAMS *jcp_param) {
  int i, j;
  const int fwd_offset = jcp_param->fwd_offset;
  const int bck_offset = jcp_param->bck_offset;

  for (i = 0; i < height; ++i) {
    for (j = 0; j < width; ++j) {
      int tmp = pred[j] * bck_offset + ref[j] * fwd_offset;
      tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS);
      comp_pred[j] = (uint8_t)tmp;
    }
    comp_pred += width;
    pred += width;
    ref += ref_stride;
  }
}
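
// The distance weights come from the encoder's DIST_WTD_COMP_PARAMS; the
// fwd/bck offsets are chosen to sum to 1 << DIST_PRECISION_BITS, which keeps
// the weighted average in pixel range after the shift. Sketch of the
// arithmetic, assuming offsets { 9, 7 } and DIST_PRECISION_BITS == 4:
// pred = 100, ref = 108 gives (100 * 7 + 108 * 9 + 8) >> 4 = 105.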

void aom_dist_wtd_comp_avg_upsampled_pred_c(
    MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col,
    const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
    int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search) {
  int i, j;
  const int fwd_offset = jcp_param->fwd_offset;
  const int bck_offset = jcp_param->bck_offset;

  aom_upsampled_pred_c(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
                       subpel_x_q3, subpel_y_q3, ref, ref_stride,
                       subpel_search);

  for (i = 0; i < height; i++) {
    for (j = 0; j < width; j++) {
      int tmp = pred[j] * bck_offset + comp_pred[j] * fwd_offset;
      tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS);
      comp_pred[j] = (uint8_t)tmp;
    }
    comp_pred += width;
    pred += width;
  }
}

#if CONFIG_AV1_HIGHBITDEPTH
static void highbd_variance64(const uint8_t *a8, int a_stride,
                              const uint8_t *b8, int b_stride, int w, int h,
                              uint64_t *sse, int64_t *sum) {
  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
  const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
  int64_t tsum = 0;
  uint64_t tsse = 0;
  for (int i = 0; i < h; ++i) {
    int32_t lsum = 0;
    for (int j = 0; j < w; ++j) {
      const int diff = a[j] - b[j];
      lsum += diff;
      tsse += (uint32_t)(diff * diff);
    }
    tsum += lsum;
    a += a_stride;
    b += b_stride;
  }
  *sum = tsum;
  *sse = tsse;
}
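
// 64-bit accumulators are needed here: with 12-bit input each squared
// difference can reach 4095^2 (about 2^24), so a 128x128 block can accumulate
// close to 2^38, well beyond uint32_t range. The per-row lsum stays within
// int32_t (|diff| <= 4095 over at most 128 columns).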

uint64_t aom_highbd_sse_odd_size(const uint8_t *a, int a_stride,
                                 const uint8_t *b, int b_stride, int w, int h) {
  uint64_t sse;
  int64_t sum;
  highbd_variance64(a, a_stride, b, b_stride, w, h, &sse, &sum);
  return sse;
}

static void highbd_8_variance(const uint8_t *a8, int a_stride,
                              const uint8_t *b8, int b_stride, int w, int h,
                              uint32_t *sse, int *sum) {
  uint64_t sse_long = 0;
  int64_t sum_long = 0;
  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
  *sse = (uint32_t)sse_long;
  *sum = (int)sum_long;
}

static void highbd_10_variance(const uint8_t *a8, int a_stride,
                               const uint8_t *b8, int b_stride, int w, int h,
                               uint32_t *sse, int *sum) {
  uint64_t sse_long = 0;
  int64_t sum_long = 0;
  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);
  *sum = (int)ROUND_POWER_OF_TWO(sum_long, 2);
}

static void highbd_12_variance(const uint8_t *a8, int a_stride,
                               const uint8_t *b8, int b_stride, int w, int h,
                               uint32_t *sse, int *sum) {
  uint64_t sse_long = 0;
  int64_t sum_long = 0;
  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8);
  *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4);
}
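
// The 10- and 12-bit variants normalize the raw statistics back to an 8-bit
// scale: each pixel difference carries 2 (resp. 4) extra bits, so sum is
// scaled down by 2^2 (resp. 2^4) and sse by the square of that, 2^4
// (resp. 2^8), keeping the returned values comparable across bit depths.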

#define HIGHBD_VAR(W, H) \
  uint32_t aom_highbd_8_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
                                              const uint8_t *b, int b_stride, \
                                              uint32_t *sse) { \
    int sum; \
    highbd_8_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
    return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \
  } \
\
  uint32_t aom_highbd_10_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
                                               const uint8_t *b, int b_stride, \
                                               uint32_t *sse) { \
    int sum; \
    int64_t var; \
    highbd_10_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \
    return (var >= 0) ? (uint32_t)var : 0; \
  } \
\
  uint32_t aom_highbd_12_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
                                               const uint8_t *b, int b_stride, \
                                               uint32_t *sse) { \
    int sum; \
    int64_t var; \
    highbd_12_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \
    return (var >= 0) ? (uint32_t)var : 0; \
  }

#define HIGHBD_GET_VAR(S) \
  void aom_highbd_8_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
                                        const uint8_t *ref, int ref_stride, \
                                        uint32_t *sse, int *sum) { \
    highbd_8_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
  } \
\
  void aom_highbd_10_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
                                         const uint8_t *ref, int ref_stride, \
                                         uint32_t *sse, int *sum) { \
    highbd_10_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
  } \
\
  void aom_highbd_12_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
                                         const uint8_t *ref, int ref_stride, \
                                         uint32_t *sse, int *sum) { \
    highbd_12_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
  }

#define HIGHBD_MSE(W, H) \
  uint32_t aom_highbd_8_mse##W##x##H##_c(const uint8_t *src, int src_stride, \
                                         const uint8_t *ref, int ref_stride, \
                                         uint32_t *sse) { \
    int sum; \
    highbd_8_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
    return *sse; \
  } \
\
  uint32_t aom_highbd_10_mse##W##x##H##_c(const uint8_t *src, int src_stride, \
                                          const uint8_t *ref, int ref_stride, \
                                          uint32_t *sse) { \
    int sum; \
    highbd_10_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
    return *sse; \
  } \
\
  uint32_t aom_highbd_12_mse##W##x##H##_c(const uint8_t *src, int src_stride, \
                                          const uint8_t *ref, int ref_stride, \
                                          uint32_t *sse) { \
    int sum; \
    highbd_12_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
    return *sse; \
  }

void aom_highbd_var_filter_block2d_bil_first_pass(
    const uint8_t *src_ptr8, uint16_t *output_ptr,
    unsigned int src_pixels_per_line, int pixel_step,
    unsigned int output_height, unsigned int output_width,
    const uint8_t *filter) {
  unsigned int i, j;
  uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src_ptr8);
  for (i = 0; i < output_height; ++i) {
    for (j = 0; j < output_width; ++j) {
      output_ptr[j] = ROUND_POWER_OF_TWO(
          (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1],
          FILTER_BITS);

      ++src_ptr;
    }

    // Next row...
    src_ptr += src_pixels_per_line - output_width;
    output_ptr += output_width;
  }
}

void aom_highbd_var_filter_block2d_bil_second_pass(
    const uint16_t *src_ptr, uint16_t *output_ptr,
    unsigned int src_pixels_per_line, unsigned int pixel_step,
    unsigned int output_height, unsigned int output_width,
    const uint8_t *filter) {
  unsigned int i, j;

  for (i = 0; i < output_height; ++i) {
    for (j = 0; j < output_width; ++j) {
      output_ptr[j] = ROUND_POWER_OF_TWO(
          (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1],
          FILTER_BITS);
      ++src_ptr;
    }

    src_ptr += src_pixels_per_line - output_width;
    output_ptr += output_width;
  }
}

#define HIGHBD_SUBPIX_VAR(W, H) \
  uint32_t aom_highbd_8_sub_pixel_variance##W##x##H##_c( \
      const uint8_t *src, int src_stride, int xoffset, int yoffset, \
      const uint8_t *dst, int dst_stride, uint32_t *sse) { \
    uint16_t fdata3[(H + 1) * W]; \
    uint16_t temp2[H * W]; \
\
    aom_highbd_var_filter_block2d_bil_first_pass( \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass( \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
\
    return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \
                                              dst, dst_stride, sse); \
  } \
\
  uint32_t aom_highbd_10_sub_pixel_variance##W##x##H##_c( \
      const uint8_t *src, int src_stride, int xoffset, int yoffset, \
      const uint8_t *dst, int dst_stride, uint32_t *sse) { \
    uint16_t fdata3[(H + 1) * W]; \
    uint16_t temp2[H * W]; \
\
    aom_highbd_var_filter_block2d_bil_first_pass( \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass( \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
\
    return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \
                                               dst, dst_stride, sse); \
  } \
\
  uint32_t aom_highbd_12_sub_pixel_variance##W##x##H##_c( \
      const uint8_t *src, int src_stride, int xoffset, int yoffset, \
      const uint8_t *dst, int dst_stride, uint32_t *sse) { \
    uint16_t fdata3[(H + 1) * W]; \
    uint16_t temp2[H * W]; \
\
    aom_highbd_var_filter_block2d_bil_first_pass( \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass( \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
\
    return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \
                                               dst, dst_stride, sse); \
  }

#define HIGHBD_SUBPIX_AVG_VAR(W, H) \
  uint32_t aom_highbd_8_sub_pixel_avg_variance##W##x##H##_c( \
      const uint8_t *src, int src_stride, int xoffset, int yoffset, \
      const uint8_t *dst, int dst_stride, uint32_t *sse, \
      const uint8_t *second_pred) { \
    uint16_t fdata3[(H + 1) * W]; \
    uint16_t temp2[H * W]; \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
\
    aom_highbd_var_filter_block2d_bil_first_pass( \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass( \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
\
    aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
                               CONVERT_TO_BYTEPTR(temp2), W); \
\
    return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
                                              dst, dst_stride, sse); \
  } \
\
  uint32_t aom_highbd_10_sub_pixel_avg_variance##W##x##H##_c( \
      const uint8_t *src, int src_stride, int xoffset, int yoffset, \
      const uint8_t *dst, int dst_stride, uint32_t *sse, \
      const uint8_t *second_pred) { \
    uint16_t fdata3[(H + 1) * W]; \
    uint16_t temp2[H * W]; \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
\
    aom_highbd_var_filter_block2d_bil_first_pass( \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass( \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
\
    aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
                               CONVERT_TO_BYTEPTR(temp2), W); \
\
    return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
                                               dst, dst_stride, sse); \
  } \
\
  uint32_t aom_highbd_12_sub_pixel_avg_variance##W##x##H##_c( \
      const uint8_t *src, int src_stride, int xoffset, int yoffset, \
      const uint8_t *dst, int dst_stride, uint32_t *sse, \
      const uint8_t *second_pred) { \
    uint16_t fdata3[(H + 1) * W]; \
    uint16_t temp2[H * W]; \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
\
    aom_highbd_var_filter_block2d_bil_first_pass( \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass( \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
\
    aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
                               CONVERT_TO_BYTEPTR(temp2), W); \
\
    return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
                                               dst, dst_stride, sse); \
  } \
\
  uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance##W##x##H##_c( \
      const uint8_t *src, int src_stride, int xoffset, int yoffset, \
      const uint8_t *dst, int dst_stride, uint32_t *sse, \
      const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \
    uint16_t fdata3[(H + 1) * W]; \
    uint16_t temp2[H * W]; \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
\
    aom_highbd_var_filter_block2d_bil_first_pass( \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass( \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
\
    aom_highbd_dist_wtd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, \
                                      W, H, CONVERT_TO_BYTEPTR(temp2), W, \
                                      jcp_param); \
\
    return aom_highbd_8_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst, \
                                          dst_stride, sse); \
  } \
\
  uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance##W##x##H##_c( \
      const uint8_t *src, int src_stride, int xoffset, int yoffset, \
      const uint8_t *dst, int dst_stride, uint32_t *sse, \
      const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \
    uint16_t fdata3[(H + 1) * W]; \
    uint16_t temp2[H * W]; \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
\
    aom_highbd_var_filter_block2d_bil_first_pass( \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass( \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
\
    aom_highbd_dist_wtd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, \
                                      W, H, CONVERT_TO_BYTEPTR(temp2), W, \
                                      jcp_param); \
\
    return aom_highbd_10_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst, \
                                           dst_stride, sse); \
  } \
\
  uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance##W##x##H##_c( \
      const uint8_t *src, int src_stride, int xoffset, int yoffset, \
      const uint8_t *dst, int dst_stride, uint32_t *sse, \
      const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \
    uint16_t fdata3[(H + 1) * W]; \
    uint16_t temp2[H * W]; \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
\
    aom_highbd_var_filter_block2d_bil_first_pass( \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass( \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
\
    aom_highbd_dist_wtd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, \
                                      W, H, CONVERT_TO_BYTEPTR(temp2), W, \
                                      jcp_param); \
\
    return aom_highbd_12_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst, \
                                           dst_stride, sse); \
  }

/* All three forms of the variance are available in the same sizes. */
#define HIGHBD_VARIANCES(W, H) \
  HIGHBD_VAR(W, H) \
  HIGHBD_SUBPIX_VAR(W, H) \
  HIGHBD_SUBPIX_AVG_VAR(W, H)

HIGHBD_VARIANCES(128, 128)
HIGHBD_VARIANCES(128, 64)
HIGHBD_VARIANCES(64, 128)
HIGHBD_VARIANCES(64, 64)
HIGHBD_VARIANCES(64, 32)
HIGHBD_VARIANCES(32, 64)
HIGHBD_VARIANCES(32, 32)
HIGHBD_VARIANCES(32, 16)
HIGHBD_VARIANCES(16, 32)
HIGHBD_VARIANCES(16, 16)
HIGHBD_VARIANCES(16, 8)
HIGHBD_VARIANCES(8, 16)
HIGHBD_VARIANCES(8, 8)
HIGHBD_VARIANCES(8, 4)
HIGHBD_VARIANCES(4, 8)
HIGHBD_VARIANCES(4, 4)
HIGHBD_VARIANCES(4, 2)
HIGHBD_VARIANCES(2, 4)
HIGHBD_VARIANCES(2, 2)
HIGHBD_VARIANCES(4, 16)
HIGHBD_VARIANCES(16, 4)
HIGHBD_VARIANCES(8, 32)
HIGHBD_VARIANCES(32, 8)
HIGHBD_VARIANCES(16, 64)
HIGHBD_VARIANCES(64, 16)

HIGHBD_GET_VAR(8)
HIGHBD_GET_VAR(16)

HIGHBD_MSE(16, 16)
HIGHBD_MSE(16, 8)
HIGHBD_MSE(8, 16)
HIGHBD_MSE(8, 8)

void aom_highbd_comp_avg_pred_c(uint8_t *comp_pred8, const uint8_t *pred8,
                                int width, int height, const uint8_t *ref8,
                                int ref_stride) {
  int i, j;
  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
  for (i = 0; i < height; ++i) {
    for (j = 0; j < width; ++j) {
      const int tmp = pred[j] + ref[j];
      comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
    }
    comp_pred += width;
    pred += width;
    ref += ref_stride;
  }
}

void aom_highbd_upsampled_pred_c(MACROBLOCKD *xd,
                                 const struct AV1Common *const cm, int mi_row,
                                 int mi_col, const MV *const mv,
                                 uint8_t *comp_pred8, int width, int height,
                                 int subpel_x_q3, int subpel_y_q3,
                                 const uint8_t *ref8, int ref_stride, int bd,
                                 int subpel_search) {
  // expect xd == NULL only in tests
  if (xd != NULL) {
    const MB_MODE_INFO *mi = xd->mi[0];
    const int ref_num = 0;
    const int is_intrabc = is_intrabc_block(mi);
    const struct scale_factors *const sf =
        is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num];
    const int is_scaled = av1_is_scaled(sf);

    if (is_scaled) {
      int plane = 0;
      const int mi_x = mi_col * MI_SIZE;
      const int mi_y = mi_row * MI_SIZE;
      const struct macroblockd_plane *const pd = &xd->plane[plane];
      const struct buf_2d *const dst_buf = &pd->dst;
      const struct buf_2d *const pre_buf =
          is_intrabc ? dst_buf : &pd->pre[ref_num];

      InterPredParams inter_pred_params;
      inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd);
      const int_interpfilters filters =
          av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
      av1_init_inter_params(
          &inter_pred_params, width, height, mi_y >> pd->subsampling_y,
          mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y,
          xd->bd, is_cur_buf_hbd(xd), is_intrabc, sf, pre_buf, filters);
      av1_enc_build_one_inter_predictor(comp_pred8, width, mv,
                                        &inter_pred_params);
      return;
    }
  }

  const InterpFilterParams *filter = av1_get_filter(subpel_search);

  if (!subpel_x_q3 && !subpel_y_q3) {
    const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
    uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
    for (int i = 0; i < height; i++) {
      memcpy(comp_pred, ref, width * sizeof(*comp_pred));
      comp_pred += width;
      ref += ref_stride;
    }
  } else if (!subpel_y_q3) {
    const int16_t *const kernel =
        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
    aom_highbd_convolve8_horiz_c(ref8, ref_stride, comp_pred8, width, kernel,
                                 16, NULL, -1, width, height, bd);
  } else if (!subpel_x_q3) {
    const int16_t *const kernel =
        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
    aom_highbd_convolve8_vert_c(ref8, ref_stride, comp_pred8, width, NULL, -1,
                                kernel, 16, width, height, bd);
  } else {
    DECLARE_ALIGNED(16, uint16_t,
                    temp[((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE]);
    const int16_t *const kernel_x =
        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
    const int16_t *const kernel_y =
        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
    const int intermediate_height =
        (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps;
    assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
    aom_highbd_convolve8_horiz_c(ref8 - ref_stride * ((filter->taps >> 1) - 1),
                                 ref_stride, CONVERT_TO_BYTEPTR(temp),
                                 MAX_SB_SIZE, kernel_x, 16, NULL, -1, width,
                                 intermediate_height, bd);
    aom_highbd_convolve8_vert_c(
        CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1)),
        MAX_SB_SIZE, comp_pred8, width, NULL, -1, kernel_y, 16, width, height,
        bd);
  }
}

void aom_highbd_comp_avg_upsampled_pred_c(
    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
    const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
    int ref_stride, int bd, int subpel_search) {
  int i, j;

  const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
  uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
  aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
                            height, subpel_x_q3, subpel_y_q3, ref8, ref_stride,
                            bd, subpel_search);
  for (i = 0; i < height; ++i) {
    for (j = 0; j < width; ++j) {
      comp_pred[j] = ROUND_POWER_OF_TWO(pred[j] + comp_pred[j], 1);
    }
    comp_pred += width;
    pred += width;
  }
}

void aom_highbd_dist_wtd_comp_avg_pred_c(
    uint8_t *comp_pred8, const uint8_t *pred8, int width, int height,
    const uint8_t *ref8, int ref_stride,
    const DIST_WTD_COMP_PARAMS *jcp_param) {
  int i, j;
  const int fwd_offset = jcp_param->fwd_offset;
  const int bck_offset = jcp_param->bck_offset;
  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);

  for (i = 0; i < height; ++i) {
    for (j = 0; j < width; ++j) {
      int tmp = pred[j] * bck_offset + ref[j] * fwd_offset;
      tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS);
      comp_pred[j] = (uint16_t)tmp;
    }
    comp_pred += width;
    pred += width;
    ref += ref_stride;
  }
}

void aom_highbd_dist_wtd_comp_avg_upsampled_pred_c(
    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
    const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
    int ref_stride, int bd, const DIST_WTD_COMP_PARAMS *jcp_param,
    int subpel_search) {
  int i, j;
  const int fwd_offset = jcp_param->fwd_offset;
  const int bck_offset = jcp_param->bck_offset;
  const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
  uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
  aom_highbd_upsampled_pred_c(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
                              height, subpel_x_q3, subpel_y_q3, ref8,
                              ref_stride, bd, subpel_search);

  for (i = 0; i < height; i++) {
    for (j = 0; j < width; j++) {
      int tmp = pred[j] * bck_offset + comp_pred[j] * fwd_offset;
      tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS);
      comp_pred[j] = (uint16_t)tmp;
    }
    comp_pred += width;
    pred += width;
  }
}
#endif  // CONFIG_AV1_HIGHBITDEPTH

void aom_comp_mask_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
                          int height, const uint8_t *ref, int ref_stride,
                          const uint8_t *mask, int mask_stride,
                          int invert_mask) {
  int i, j;
  const uint8_t *src0 = invert_mask ? pred : ref;
  const uint8_t *src1 = invert_mask ? ref : pred;
  const int stride0 = invert_mask ? width : ref_stride;
  const int stride1 = invert_mask ? ref_stride : width;
  for (i = 0; i < height; ++i) {
    for (j = 0; j < width; ++j) {
      comp_pred[j] = AOM_BLEND_A64(mask[j], src0[j], src1[j]);
    }
    comp_pred += width;
    src0 += stride0;
    src1 += stride1;
    mask += mask_stride;
  }
}
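
// AOM_BLEND_A64(m, a, b) (aom_dsp/blend.h) computes the 6-bit alpha blend
// ROUND_POWER_OF_TWO(m * a + (64 - m) * b, 6), so mask[j] == 64 selects src0
// outright, mask[j] == 0 selects src1, and mask[j] == 32 is an even average.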

void aom_comp_mask_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm,
                                    int mi_row, int mi_col, const MV *const mv,
                                    uint8_t *comp_pred, const uint8_t *pred,
                                    int width, int height, int subpel_x_q3,
                                    int subpel_y_q3, const uint8_t *ref,
                                    int ref_stride, const uint8_t *mask,
                                    int mask_stride, int invert_mask,
                                    int subpel_search) {
  if (subpel_x_q3 | subpel_y_q3) {
    aom_upsampled_pred_c(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
                         subpel_x_q3, subpel_y_q3, ref, ref_stride,
                         subpel_search);
    ref = comp_pred;
    ref_stride = width;
  }
  aom_comp_mask_pred_c(comp_pred, pred, width, height, ref, ref_stride, mask,
                       mask_stride, invert_mask);
}

#define MASK_SUBPIX_VAR(W, H) \
  unsigned int aom_masked_sub_pixel_variance##W##x##H##_c( \
      const uint8_t *src, int src_stride, int xoffset, int yoffset, \
      const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \
      const uint8_t *msk, int msk_stride, int invert_mask, \
      unsigned int *sse) { \
    uint16_t fdata3[(H + 1) * W]; \
    uint8_t temp2[H * W]; \
    DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \
\
    aom_var_filter_block2d_bil_first_pass_c(src, fdata3, src_stride, 1, H + 1, \
                                            W, bilinear_filters_2t[xoffset]); \
    aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \
                                             bilinear_filters_2t[yoffset]); \
\
    aom_comp_mask_pred_c(temp3, second_pred, W, H, temp2, W, msk, msk_stride, \
                         invert_mask); \
    return aom_variance##W##x##H##_c(temp3, W, ref, ref_stride, sse); \
  }

MASK_SUBPIX_VAR(4, 4)
MASK_SUBPIX_VAR(4, 8)
MASK_SUBPIX_VAR(8, 4)
MASK_SUBPIX_VAR(8, 8)
MASK_SUBPIX_VAR(8, 16)
MASK_SUBPIX_VAR(16, 8)
MASK_SUBPIX_VAR(16, 16)
MASK_SUBPIX_VAR(16, 32)
MASK_SUBPIX_VAR(32, 16)
MASK_SUBPIX_VAR(32, 32)
MASK_SUBPIX_VAR(32, 64)
MASK_SUBPIX_VAR(64, 32)
MASK_SUBPIX_VAR(64, 64)
MASK_SUBPIX_VAR(64, 128)
MASK_SUBPIX_VAR(128, 64)
MASK_SUBPIX_VAR(128, 128)
MASK_SUBPIX_VAR(4, 16)
MASK_SUBPIX_VAR(16, 4)
MASK_SUBPIX_VAR(8, 32)
MASK_SUBPIX_VAR(32, 8)
MASK_SUBPIX_VAR(16, 64)
MASK_SUBPIX_VAR(64, 16)

#if CONFIG_AV1_HIGHBITDEPTH
void aom_highbd_comp_mask_pred_c(uint8_t *comp_pred8, const uint8_t *pred8,
                                 int width, int height, const uint8_t *ref8,
                                 int ref_stride, const uint8_t *mask,
                                 int mask_stride, int invert_mask) {
  int i, j;
  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
  for (i = 0; i < height; ++i) {
    for (j = 0; j < width; ++j) {
      if (!invert_mask)
        comp_pred[j] = AOM_BLEND_A64(mask[j], ref[j], pred[j]);
      else
        comp_pred[j] = AOM_BLEND_A64(mask[j], pred[j], ref[j]);
    }
    comp_pred += width;
    pred += width;
    ref += ref_stride;
    mask += mask_stride;
  }
}

void aom_highbd_comp_mask_upsampled_pred(
    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
    const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
    int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask,
    int bd, int subpel_search) {
  aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
                            height, subpel_x_q3, subpel_y_q3, ref8, ref_stride,
                            bd, subpel_search);
  aom_highbd_comp_mask_pred(comp_pred8, pred8, width, height, comp_pred8, width,
                            mask, mask_stride, invert_mask);
}

#define HIGHBD_MASK_SUBPIX_VAR(W, H) \
  unsigned int aom_highbd_8_masked_sub_pixel_variance##W##x##H##_c( \
      const uint8_t *src, int src_stride, int xoffset, int yoffset, \
      const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \
      const uint8_t *msk, int msk_stride, int invert_mask, \
      unsigned int *sse) { \
    uint16_t fdata3[(H + 1) * W]; \
    uint16_t temp2[H * W]; \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
\
    aom_highbd_var_filter_block2d_bil_first_pass( \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass( \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
\
    aom_highbd_comp_mask_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
                                CONVERT_TO_BYTEPTR(temp2), W, msk, msk_stride, \
                                invert_mask); \
\
    return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
                                              ref, ref_stride, sse); \
  } \
\
  unsigned int aom_highbd_10_masked_sub_pixel_variance##W##x##H##_c( \
      const uint8_t *src, int src_stride, int xoffset, int yoffset, \
      const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \
      const uint8_t *msk, int msk_stride, int invert_mask, \
      unsigned int *sse) { \
    uint16_t fdata3[(H + 1) * W]; \
    uint16_t temp2[H * W]; \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
\
    aom_highbd_var_filter_block2d_bil_first_pass( \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass( \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
\
    aom_highbd_comp_mask_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
                                CONVERT_TO_BYTEPTR(temp2), W, msk, msk_stride, \
                                invert_mask); \
\
    return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
                                               ref, ref_stride, sse); \
  } \
\
  unsigned int aom_highbd_12_masked_sub_pixel_variance##W##x##H##_c( \
      const uint8_t *src, int src_stride, int xoffset, int yoffset, \
      const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \
      const uint8_t *msk, int msk_stride, int invert_mask, \
      unsigned int *sse) { \
    uint16_t fdata3[(H + 1) * W]; \
    uint16_t temp2[H * W]; \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
\
    aom_highbd_var_filter_block2d_bil_first_pass( \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass( \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
\
    aom_highbd_comp_mask_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
                                CONVERT_TO_BYTEPTR(temp2), W, msk, msk_stride, \
                                invert_mask); \
\
    return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
                                               ref, ref_stride, sse); \
  }

HIGHBD_MASK_SUBPIX_VAR(4, 4)
HIGHBD_MASK_SUBPIX_VAR(4, 8)
HIGHBD_MASK_SUBPIX_VAR(8, 4)
HIGHBD_MASK_SUBPIX_VAR(8, 8)
HIGHBD_MASK_SUBPIX_VAR(8, 16)
HIGHBD_MASK_SUBPIX_VAR(16, 8)
HIGHBD_MASK_SUBPIX_VAR(16, 16)
HIGHBD_MASK_SUBPIX_VAR(16, 32)
HIGHBD_MASK_SUBPIX_VAR(32, 16)
HIGHBD_MASK_SUBPIX_VAR(32, 32)
HIGHBD_MASK_SUBPIX_VAR(32, 64)
HIGHBD_MASK_SUBPIX_VAR(64, 32)
HIGHBD_MASK_SUBPIX_VAR(64, 64)
HIGHBD_MASK_SUBPIX_VAR(64, 128)
HIGHBD_MASK_SUBPIX_VAR(128, 64)
HIGHBD_MASK_SUBPIX_VAR(128, 128)
HIGHBD_MASK_SUBPIX_VAR(4, 16)
HIGHBD_MASK_SUBPIX_VAR(16, 4)
HIGHBD_MASK_SUBPIX_VAR(8, 32)
HIGHBD_MASK_SUBPIX_VAR(32, 8)
HIGHBD_MASK_SUBPIX_VAR(16, 64)
HIGHBD_MASK_SUBPIX_VAR(64, 16)
#endif  // CONFIG_AV1_HIGHBITDEPTH

static INLINE void obmc_variance(const uint8_t *pre, int pre_stride,
                                 const int32_t *wsrc, const int32_t *mask,
                                 int w, int h, unsigned int *sse, int *sum) {
  int i, j;

  *sse = 0;
  *sum = 0;

  for (i = 0; i < h; i++) {
    for (j = 0; j < w; j++) {
      int diff = ROUND_POWER_OF_TWO_SIGNED(wsrc[j] - pre[j] * mask[j], 12);
      *sum += diff;
      *sse += diff * diff;
    }

    pre += pre_stride;
    wsrc += w;
    mask += w;
  }
}
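
// In the OBMC path, wsrc is the pre-weighted source and mask holds the
// blending weights; both appear to be kept at 12-bit precision (hence the
// signed rounding shift by 12 above), so each diff is back on pixel scale
// before it feeds the usual sum/sse accumulation.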

#define OBMC_VAR(W, H) \
  unsigned int aom_obmc_variance##W##x##H##_c( \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
      const int32_t *mask, unsigned int *sse) { \
    int sum; \
    obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
    return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \
  }

#define OBMC_SUBPIX_VAR(W, H) \
  unsigned int aom_obmc_sub_pixel_variance##W##x##H##_c( \
      const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \
      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \
    uint16_t fdata3[(H + 1) * W]; \
    uint8_t temp2[H * W]; \
\
    aom_var_filter_block2d_bil_first_pass_c(pre, fdata3, pre_stride, 1, H + 1, \
                                            W, bilinear_filters_2t[xoffset]); \
    aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \
                                             bilinear_filters_2t[yoffset]); \
\
    return aom_obmc_variance##W##x##H##_c(temp2, W, wsrc, mask, sse); \
  }

OBMC_VAR(4, 4)
OBMC_SUBPIX_VAR(4, 4)

OBMC_VAR(4, 8)
OBMC_SUBPIX_VAR(4, 8)

OBMC_VAR(8, 4)
OBMC_SUBPIX_VAR(8, 4)

OBMC_VAR(8, 8)
OBMC_SUBPIX_VAR(8, 8)

OBMC_VAR(8, 16)
OBMC_SUBPIX_VAR(8, 16)

OBMC_VAR(16, 8)
OBMC_SUBPIX_VAR(16, 8)

OBMC_VAR(16, 16)
OBMC_SUBPIX_VAR(16, 16)

OBMC_VAR(16, 32)
OBMC_SUBPIX_VAR(16, 32)

OBMC_VAR(32, 16)
OBMC_SUBPIX_VAR(32, 16)

OBMC_VAR(32, 32)
OBMC_SUBPIX_VAR(32, 32)

OBMC_VAR(32, 64)
OBMC_SUBPIX_VAR(32, 64)

OBMC_VAR(64, 32)
OBMC_SUBPIX_VAR(64, 32)

OBMC_VAR(64, 64)
OBMC_SUBPIX_VAR(64, 64)

OBMC_VAR(64, 128)
OBMC_SUBPIX_VAR(64, 128)

OBMC_VAR(128, 64)
OBMC_SUBPIX_VAR(128, 64)

OBMC_VAR(128, 128)
OBMC_SUBPIX_VAR(128, 128)

OBMC_VAR(4, 16)
OBMC_SUBPIX_VAR(4, 16)
OBMC_VAR(16, 4)
OBMC_SUBPIX_VAR(16, 4)
OBMC_VAR(8, 32)
OBMC_SUBPIX_VAR(8, 32)
OBMC_VAR(32, 8)
OBMC_SUBPIX_VAR(32, 8)
OBMC_VAR(16, 64)
OBMC_SUBPIX_VAR(16, 64)
OBMC_VAR(64, 16)
OBMC_SUBPIX_VAR(64, 16)

#if CONFIG_AV1_HIGHBITDEPTH
static INLINE void highbd_obmc_variance64(const uint8_t *pre8, int pre_stride,
                                          const int32_t *wsrc,
                                          const int32_t *mask, int w, int h,
                                          uint64_t *sse, int64_t *sum) {
  int i, j;
  uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);

  *sse = 0;
  *sum = 0;

  for (i = 0; i < h; i++) {
    for (j = 0; j < w; j++) {
      int diff = ROUND_POWER_OF_TWO_SIGNED(wsrc[j] - pre[j] * mask[j], 12);
      *sum += diff;
      *sse += diff * diff;
    }

    pre += pre_stride;
    wsrc += w;
    mask += w;
  }
}

static INLINE void highbd_obmc_variance(const uint8_t *pre8, int pre_stride,
                                        const int32_t *wsrc,
                                        const int32_t *mask, int w, int h,
                                        unsigned int *sse, int *sum) {
  int64_t sum64;
  uint64_t sse64;
  highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &sse64, &sum64);
  *sum = (int)sum64;
  *sse = (unsigned int)sse64;
}

static INLINE void highbd_10_obmc_variance(const uint8_t *pre8, int pre_stride,
                                           const int32_t *wsrc,
                                           const int32_t *mask, int w, int h,
                                           unsigned int *sse, int *sum) {
  int64_t sum64;
  uint64_t sse64;
  highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &sse64, &sum64);
  *sum = (int)ROUND_POWER_OF_TWO(sum64, 2);
  *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 4);
}

static INLINE void highbd_12_obmc_variance(const uint8_t *pre8, int pre_stride,
                                           const int32_t *wsrc,
                                           const int32_t *mask, int w, int h,
                                           unsigned int *sse, int *sum) {
  int64_t sum64;
  uint64_t sse64;
  highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &sse64, &sum64);
  *sum = (int)ROUND_POWER_OF_TWO(sum64, 4);
  *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 8);
}

#define HIGHBD_OBMC_VAR(W, H) \
  unsigned int aom_highbd_obmc_variance##W##x##H##_c( \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
      const int32_t *mask, unsigned int *sse) { \
    int sum; \
    highbd_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
    return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \
  } \
\
  unsigned int aom_highbd_10_obmc_variance##W##x##H##_c( \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
      const int32_t *mask, unsigned int *sse) { \
    int sum; \
    int64_t var; \
    highbd_10_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \
    return (var >= 0) ? (uint32_t)var : 0; \
  } \
\
  unsigned int aom_highbd_12_obmc_variance##W##x##H##_c( \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
      const int32_t *mask, unsigned int *sse) { \
    int sum; \
    int64_t var; \
    highbd_12_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \
    return (var >= 0) ? (uint32_t)var : 0; \
  }

#define HIGHBD_OBMC_SUBPIX_VAR(W, H) \
  unsigned int aom_highbd_obmc_sub_pixel_variance##W##x##H##_c( \
      const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \
      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \
    uint16_t fdata3[(H + 1) * W]; \
    uint16_t temp2[H * W]; \
\
    aom_highbd_var_filter_block2d_bil_first_pass( \
        pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass( \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
\
    return aom_highbd_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \
                                                 wsrc, mask, sse); \
  } \
\
  unsigned int aom_highbd_10_obmc_sub_pixel_variance##W##x##H##_c( \
      const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \
      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \
    uint16_t fdata3[(H + 1) * W]; \
    uint16_t temp2[H * W]; \
\
    aom_highbd_var_filter_block2d_bil_first_pass( \
        pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass( \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
\
    return aom_highbd_10_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
                                                    W, wsrc, mask, sse); \
  } \
\
  unsigned int aom_highbd_12_obmc_sub_pixel_variance##W##x##H##_c( \
      const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \
      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \
    uint16_t fdata3[(H + 1) * W]; \
    uint16_t temp2[H * W]; \
\
    aom_highbd_var_filter_block2d_bil_first_pass( \
        pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass( \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
\
    return aom_highbd_12_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
                                                    W, wsrc, mask, sse); \
  }

HIGHBD_OBMC_VAR(4, 4)
HIGHBD_OBMC_SUBPIX_VAR(4, 4)

HIGHBD_OBMC_VAR(4, 8)
HIGHBD_OBMC_SUBPIX_VAR(4, 8)

HIGHBD_OBMC_VAR(8, 4)
HIGHBD_OBMC_SUBPIX_VAR(8, 4)

HIGHBD_OBMC_VAR(8, 8)
HIGHBD_OBMC_SUBPIX_VAR(8, 8)

HIGHBD_OBMC_VAR(8, 16)
HIGHBD_OBMC_SUBPIX_VAR(8, 16)

HIGHBD_OBMC_VAR(16, 8)
HIGHBD_OBMC_SUBPIX_VAR(16, 8)

HIGHBD_OBMC_VAR(16, 16)
HIGHBD_OBMC_SUBPIX_VAR(16, 16)

HIGHBD_OBMC_VAR(16, 32)
HIGHBD_OBMC_SUBPIX_VAR(16, 32)

HIGHBD_OBMC_VAR(32, 16)
HIGHBD_OBMC_SUBPIX_VAR(32, 16)

HIGHBD_OBMC_VAR(32, 32)
HIGHBD_OBMC_SUBPIX_VAR(32, 32)

HIGHBD_OBMC_VAR(32, 64)
HIGHBD_OBMC_SUBPIX_VAR(32, 64)

HIGHBD_OBMC_VAR(64, 32)
HIGHBD_OBMC_SUBPIX_VAR(64, 32)

HIGHBD_OBMC_VAR(64, 64)
HIGHBD_OBMC_SUBPIX_VAR(64, 64)

HIGHBD_OBMC_VAR(64, 128)
HIGHBD_OBMC_SUBPIX_VAR(64, 128)

HIGHBD_OBMC_VAR(128, 64)
HIGHBD_OBMC_SUBPIX_VAR(128, 64)

HIGHBD_OBMC_VAR(128, 128)
HIGHBD_OBMC_SUBPIX_VAR(128, 128)

HIGHBD_OBMC_VAR(4, 16)
HIGHBD_OBMC_SUBPIX_VAR(4, 16)
HIGHBD_OBMC_VAR(16, 4)
HIGHBD_OBMC_SUBPIX_VAR(16, 4)
HIGHBD_OBMC_VAR(8, 32)
HIGHBD_OBMC_SUBPIX_VAR(8, 32)
HIGHBD_OBMC_VAR(32, 8)
HIGHBD_OBMC_SUBPIX_VAR(32, 8)
HIGHBD_OBMC_VAR(16, 64)
HIGHBD_OBMC_SUBPIX_VAR(16, 64)
HIGHBD_OBMC_VAR(64, 16)
HIGHBD_OBMC_SUBPIX_VAR(64, 16)
#endif  // CONFIG_AV1_HIGHBITDEPTH