1 /*
2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11 #include <assert.h>
12 #include <stdlib.h>
13
14 #include "config/aom_config.h"
15 #include "config/aom_dsp_rtcd.h"
16
17 #include "aom/aom_integer.h"
18 #include "aom_ports/mem.h"
19
20 #include "aom_dsp/aom_filter.h"
21 #include "aom_dsp/blend.h"
22 #include "aom_dsp/variance.h"
23
24 #include "av1/common/filter.h"
25 #include "av1/common/reconinter.h"
26
// Returns the sum of squares of a 256-element (16x16) block of int16 values.
// Unsigned accumulation; overflow (not reachable for pixel-difference inputs
// of typical magnitude) would wrap, which is well-defined for unsigned.
uint32_t aom_get_mb_ss_c(const int16_t *a) {
  uint32_t ss = 0;

  for (int k = 0; k < 256; ++k) ss += (uint32_t)(a[k] * a[k]);

  return ss;
}
36
// Computes the sum of pixel differences (*sum) and the sum of squared pixel
// differences (*sse) between a w x h block in `a` and the co-located block
// in `b`. Strides are in elements. Both outputs are fully overwritten.
static void variance(const uint8_t *a, int a_stride, const uint8_t *b,
                     int b_stride, int w, int h, uint32_t *sse, int *sum) {
  int sum_acc = 0;
  uint32_t sse_acc = 0;

  for (int r = 0; r < h; ++r) {
    for (int c = 0; c < w; ++c) {
      const int d = a[c] - b[c];
      sum_acc += d;
      sse_acc += (uint32_t)(d * d);
    }
    a += a_stride;
    b += b_stride;
  }

  *sum = sum_acc;
  *sse = sse_acc;
}
55
// Returns only the SSE between two blocks of arbitrary (possibly odd)
// dimensions; the sum computed by variance() is discarded.
uint32_t aom_sse_odd_size(const uint8_t *a, int a_stride, const uint8_t *b,
                          int b_stride, int w, int h) {
  uint32_t sse = 0;
  int unused_sum = 0;
  variance(a, a_stride, b, b_stride, w, h, &sse, &unused_sum);
  return sse;
}
63
64 // Applies a 1-D 2-tap bilinear filter to the source block in either horizontal
65 // or vertical direction to produce the filtered output block. Used to implement
66 // the first-pass of 2-D separable filter.
67 //
68 // Produces int16_t output to retain precision for the next pass. Two filter
69 // taps should sum to FILTER_WEIGHT. pixel_step defines whether the filter is
70 // applied horizontally (pixel_step = 1) or vertically (pixel_step = stride).
71 // It defines the offset required to move from one input to the next.
var_filter_block2d_bil_first_pass_c(const uint8_t * a,uint16_t * b,unsigned int src_pixels_per_line,unsigned int pixel_step,unsigned int output_height,unsigned int output_width,const uint8_t * filter)72 static void var_filter_block2d_bil_first_pass_c(
73 const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line,
74 unsigned int pixel_step, unsigned int output_height,
75 unsigned int output_width, const uint8_t *filter) {
76 unsigned int i, j;
77
78 for (i = 0; i < output_height; ++i) {
79 for (j = 0; j < output_width; ++j) {
80 b[j] = ROUND_POWER_OF_TWO(
81 (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS);
82
83 ++a;
84 }
85
86 a += src_pixels_per_line - output_width;
87 b += output_width;
88 }
89 }
90
91 // Applies a 1-D 2-tap bilinear filter to the source block in either horizontal
92 // or vertical direction to produce the filtered output block. Used to implement
93 // the second-pass of 2-D separable filter.
94 //
95 // Requires 16-bit input as produced by filter_block2d_bil_first_pass. Two
96 // filter taps should sum to FILTER_WEIGHT. pixel_step defines whether the
97 // filter is applied horizontally (pixel_step = 1) or vertically
98 // (pixel_step = stride). It defines the offset required to move from one input
99 // to the next. Output is 8-bit.
var_filter_block2d_bil_second_pass_c(const uint16_t * a,uint8_t * b,unsigned int src_pixels_per_line,unsigned int pixel_step,unsigned int output_height,unsigned int output_width,const uint8_t * filter)100 static void var_filter_block2d_bil_second_pass_c(
101 const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line,
102 unsigned int pixel_step, unsigned int output_height,
103 unsigned int output_width, const uint8_t *filter) {
104 unsigned int i, j;
105
106 for (i = 0; i < output_height; ++i) {
107 for (j = 0; j < output_width; ++j) {
108 b[j] = ROUND_POWER_OF_TWO(
109 (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS);
110 ++a;
111 }
112
113 a += src_pixels_per_line - output_width;
114 b += output_width;
115 }
116 }
117
// Generates the C implementation of aom_variance<W>x<H>: variance of the
// W x H block `a` against `b`, i.e. *sse - sum^2 / (W * H). *sse is also
// written with the raw sum of squared differences.
#define VAR(W, H)                                                    \
  uint32_t aom_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
                                     const uint8_t *b, int b_stride, \
                                     uint32_t *sse) {                \
    int sum;                                                         \
    variance(a, a_stride, b, b_stride, W, H, sse, &sum);             \
    return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H));        \
  }
126
// Generates the C implementation of aom_sub_pixel_variance<W>x<H>:
// bilinearly interpolates the source at sub-pel position (xoffset, yoffset)
// with two separable 2-tap passes (horizontal into fdata3, then vertical
// into temp2), and returns the variance of the interpolated block against
// the reference `b`. The first pass produces H + 1 rows so the vertical
// pass has a neighbor row available.
#define SUBPIX_VAR(W, H)                                                  \
  uint32_t aom_sub_pixel_variance##W##x##H##_c(                           \
      const uint8_t *a, int a_stride, int xoffset, int yoffset,           \
      const uint8_t *b, int b_stride, uint32_t *sse) {                    \
    uint16_t fdata3[(H + 1) * W];                                         \
    uint8_t temp2[H * W];                                                 \
                                                                          \
    var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \
                                        bilinear_filters_2t[xoffset]);    \
    var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W,       \
                                         bilinear_filters_2t[yoffset]);   \
                                                                          \
    return aom_variance##W##x##H##_c(temp2, W, b, b_stride, sse);         \
  }
141
// Generates the C implementations of aom_sub_pixel_avg_variance<W>x<H> and
// aom_dist_wtd_sub_pixel_avg_variance<W>x<H>: as SUBPIX_VAR, but the
// interpolated block is first combined with `second_pred` — by simple
// rounding average in the first form, by the distance-weighted average
// described by `jcp_param` in the second — before the variance against `b`
// is computed.
// NOTE(review): the dist-wtd variant returns via aom_variance##W##x##H (the
// runtime-dispatched symbol) rather than the ##_c version used elsewhere in
// this macro — presumably intentional, but worth confirming.
#define SUBPIX_AVG_VAR(W, H)                                                  \
  uint32_t aom_sub_pixel_avg_variance##W##x##H##_c(                           \
      const uint8_t *a, int a_stride, int xoffset, int yoffset,               \
      const uint8_t *b, int b_stride, uint32_t *sse,                          \
      const uint8_t *second_pred) {                                           \
    uint16_t fdata3[(H + 1) * W];                                             \
    uint8_t temp2[H * W];                                                     \
    DECLARE_ALIGNED(16, uint8_t, temp3[H * W]);                               \
                                                                              \
    var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W,     \
                                        bilinear_filters_2t[xoffset]);        \
    var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W,           \
                                         bilinear_filters_2t[yoffset]);       \
                                                                              \
    aom_comp_avg_pred(temp3, second_pred, W, H, temp2, W);                    \
                                                                              \
    return aom_variance##W##x##H##_c(temp3, W, b, b_stride, sse);             \
  }                                                                           \
  uint32_t aom_dist_wtd_sub_pixel_avg_variance##W##x##H##_c(                  \
      const uint8_t *a, int a_stride, int xoffset, int yoffset,               \
      const uint8_t *b, int b_stride, uint32_t *sse,                          \
      const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) {    \
    uint16_t fdata3[(H + 1) * W];                                             \
    uint8_t temp2[H * W];                                                     \
    DECLARE_ALIGNED(16, uint8_t, temp3[H * W]);                               \
                                                                              \
    var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W,     \
                                        bilinear_filters_2t[xoffset]);        \
    var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W,           \
                                         bilinear_filters_2t[yoffset]);       \
                                                                              \
    aom_dist_wtd_comp_avg_pred(temp3, second_pred, W, H, temp2, W, jcp_param); \
                                                                              \
    return aom_variance##W##x##H(temp3, W, b, b_stride, sse);                 \
  }
177
// Processes four horizontally adjacent 8x8 blocks (one 32x8 region) in a
// single call. Writes per-block SSE/sum/variance into sse8x8[], sum8x8[] and
// var8x8[], and ACCUMULATES the combined totals into *tot_sse / *tot_sum
// (callers are expected to have initialized the totals).
void aom_get_var_sse_sum_8x8_quad_c(const uint8_t *a, int a_stride,
                                    const uint8_t *b, int b_stride,
                                    uint32_t *sse8x8, int *sum8x8,
                                    unsigned int *tot_sse, int *tot_sum,
                                    uint32_t *var8x8) {
  for (int blk = 0; blk < 4; ++blk) {
    variance(a + 8 * blk, a_stride, b + 8 * blk, b_stride, 8, 8, &sse8x8[blk],
             &sum8x8[blk]);
    *tot_sse += sse8x8[blk];
    *tot_sum += sum8x8[blk];
    // var = sse - sum^2 / 64 for an 8x8 block.
    var8x8[blk] =
        sse8x8[blk] - (uint32_t)(((int64_t)sum8x8[blk] * sum8x8[blk]) >> 6);
  }
}
195
// Processes two horizontally adjacent 16x16 blocks (one 32x16 region) in a
// single call. Writes per-block SSE/variance into sse16x16[] / var16x16[]
// and ACCUMULATES the combined totals into *tot_sse / *tot_sum (callers are
// expected to have initialized the totals).
void aom_get_var_sse_sum_16x16_dual_c(const uint8_t *src_ptr, int source_stride,
                                      const uint8_t *ref_ptr, int ref_stride,
                                      uint32_t *sse16x16, unsigned int *tot_sse,
                                      int *tot_sum, uint32_t *var16x16) {
  int sum16x16[2] = { 0 };

  for (int blk = 0; blk < 2; ++blk) {
    variance(src_ptr + 16 * blk, source_stride, ref_ptr + 16 * blk, ref_stride,
             16, 16, &sse16x16[blk], &sum16x16[blk]);
    *tot_sse += sse16x16[blk];
    *tot_sum += sum16x16[blk];
    // var = sse - sum^2 / 256 for a 16x16 block.
    var16x16[blk] =
        sse16x16[blk] -
        (uint32_t)(((int64_t)sum16x16[blk] * sum16x16[blk]) >> 8);
  }
}
214
/* Identical to the variance call except it does not subtract the
 * sum^2 / (w*h) term; it returns the sse in addition to storing it in the
 * passed-in *sse output variable.
 */
// Generates the C implementation of aom_mse<W>x<H>: the raw sum of squared
// differences. The sum computed by variance() is discarded.
#define MSE(W, H)                                               \
  uint32_t aom_mse##W##x##H##_c(const uint8_t *a, int a_stride, \
                                const uint8_t *b, int b_stride, \
                                uint32_t *sse) {                \
    int sum;                                                    \
    variance(a, a_stride, b, b_stride, W, H, sse, &sum);        \
    return *sse;                                                \
  }
227
/* All three forms of the variance are available in the same sizes. */
#define VARIANCES(W, H) \
  VAR(W, H)             \
  SUBPIX_VAR(W, H)      \
  SUBPIX_AVG_VAR(W, H)

// Instantiate the 8-bit variance kernels for every supported block size.
VARIANCES(128, 128)
VARIANCES(128, 64)
VARIANCES(64, 128)
VARIANCES(64, 64)
VARIANCES(64, 32)
VARIANCES(32, 64)
VARIANCES(32, 32)
VARIANCES(32, 16)
VARIANCES(16, 32)
VARIANCES(16, 16)
VARIANCES(16, 8)
VARIANCES(8, 16)
VARIANCES(8, 8)
VARIANCES(8, 4)
VARIANCES(4, 8)
VARIANCES(4, 4)

// Realtime mode doesn't use rectangular blocks.
#if !CONFIG_REALTIME_ONLY
VARIANCES(4, 16)
VARIANCES(16, 4)
VARIANCES(8, 32)
VARIANCES(32, 8)
VARIANCES(16, 64)
VARIANCES(64, 16)
#endif

// MSE-only variants exist just for these classic sizes.
MSE(16, 16)
MSE(16, 8)
MSE(8, 16)
MSE(8, 8)
265
// Writes the rounded per-pixel average of `pred` and `ref` into `comp_pred`.
// `pred` and `comp_pred` are packed at `width` stride; `ref` uses
// `ref_stride`.
void aom_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
                         int height, const uint8_t *ref, int ref_stride) {
  for (int r = 0; r < height; ++r) {
    for (int c = 0; c < width; ++c) {
      comp_pred[c] = (uint8_t)ROUND_POWER_OF_TWO(pred[c] + ref[c], 1);
    }
    comp_pred += width;
    pred += width;
    ref += ref_stride;
  }
}
280
aom_dist_wtd_comp_avg_pred_c(uint8_t * comp_pred,const uint8_t * pred,int width,int height,const uint8_t * ref,int ref_stride,const DIST_WTD_COMP_PARAMS * jcp_param)281 void aom_dist_wtd_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred,
282 int width, int height, const uint8_t *ref,
283 int ref_stride,
284 const DIST_WTD_COMP_PARAMS *jcp_param) {
285 int i, j;
286 const int fwd_offset = jcp_param->fwd_offset;
287 const int bck_offset = jcp_param->bck_offset;
288
289 for (i = 0; i < height; ++i) {
290 for (j = 0; j < width; ++j) {
291 int tmp = pred[j] * bck_offset + ref[j] * fwd_offset;
292 tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS);
293 comp_pred[j] = (uint8_t)tmp;
294 }
295 comp_pred += width;
296 pred += width;
297 ref += ref_stride;
298 }
299 }
300
301 #if CONFIG_AV1_HIGHBITDEPTH
// High-bitdepth analog of variance(): computes the sum of differences and
// sum of squared differences over a w x h block, with 64-bit accumulators.
// a8/b8 are CONVERT_TO_BYTEPTR-style handles to uint16_t pixel data.
static void highbd_variance64(const uint8_t *a8, int a_stride,
                              const uint8_t *b8, int b_stride, int w, int h,
                              uint64_t *sse, int64_t *sum) {
  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
  const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
  uint64_t sse_acc = 0;
  int64_t sum_acc = 0;

  for (int r = 0; r < h; ++r) {
    // Per-row sum fits comfortably in 32 bits (|diff| <= 4095, w <= 128).
    int32_t row_sum = 0;
    for (int c = 0; c < w; ++c) {
      const int d = a[c] - b[c];
      row_sum += d;
      sse_acc += (uint32_t)(d * d);
    }
    sum_acc += row_sum;
    a += a_stride;
    b += b_stride;
  }

  *sum = sum_acc;
  *sse = sse_acc;
}
323
// Returns only the SSE between two high-bitdepth blocks of arbitrary
// (possibly odd) dimensions; the sum is discarded.
uint64_t aom_highbd_sse_odd_size(const uint8_t *a, int a_stride,
                                 const uint8_t *b, int b_stride, int w, int h) {
  uint64_t sse = 0;
  int64_t unused_sum = 0;
  highbd_variance64(a, a_stride, b, b_stride, w, h, &sse, &unused_sum);
  return sse;
}
331
// 8-bit-depth wrapper around highbd_variance64: the totals of an 8-bit
// block fit the narrower output types, so they are copied through directly.
static void highbd_8_variance(const uint8_t *a8, int a_stride,
                              const uint8_t *b8, int b_stride, int w, int h,
                              uint32_t *sse, int *sum) {
  uint64_t sse64 = 0;
  int64_t sum64 = 0;
  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse64, &sum64);
  *sse = (uint32_t)sse64;
  *sum = (int)sum64;
}
341
// 10-bit-depth wrapper around highbd_variance64: rescales the totals back
// to the 8-bit domain (sse by 2*2 extra bits, sum by 2) with rounding.
static void highbd_10_variance(const uint8_t *a8, int a_stride,
                               const uint8_t *b8, int b_stride, int w, int h,
                               uint32_t *sse, int *sum) {
  uint64_t sse64 = 0;
  int64_t sum64 = 0;
  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse64, &sum64);
  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse64, 4);
  *sum = (int)ROUND_POWER_OF_TWO(sum64, 2);
}
351
// 12-bit-depth wrapper around highbd_variance64: rescales the totals back
// to the 8-bit domain (sse by 4*2 extra bits, sum by 4) with rounding.
static void highbd_12_variance(const uint8_t *a8, int a_stride,
                               const uint8_t *b8, int b_stride, int w, int h,
                               uint32_t *sse, int *sum) {
  uint64_t sse64 = 0;
  int64_t sum64 = 0;
  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse64, &sum64);
  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse64, 8);
  *sum = (int)ROUND_POWER_OF_TWO(sum64, 4);
}
361
// Generates the C implementations of aom_highbd_{8,10,12}_variance<W>x<H>.
// The bit-depth-specific helpers rescale sse/sum to the 8-bit domain first.
// For 10/12 bits the independently rounded sse and sum can make the
// sse - sum^2/(W*H) estimate slightly negative, so it is clamped to 0; the
// 8-bit variant performs no rescaling and needs no clamp.
#define HIGHBD_VAR(W, H)                                                    \
  uint32_t aom_highbd_8_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
                                              const uint8_t *b, int b_stride, \
                                              uint32_t *sse) {              \
    int sum;                                                                \
    highbd_8_variance(a, a_stride, b, b_stride, W, H, sse, &sum);           \
    return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H));               \
  }                                                                         \
                                                                            \
  uint32_t aom_highbd_10_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
                                               const uint8_t *b, int b_stride, \
                                               uint32_t *sse) {             \
    int sum;                                                                \
    int64_t var;                                                            \
    highbd_10_variance(a, a_stride, b, b_stride, W, H, sse, &sum);          \
    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));               \
    return (var >= 0) ? (uint32_t)var : 0;                                  \
  }                                                                         \
                                                                            \
  uint32_t aom_highbd_12_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
                                               const uint8_t *b, int b_stride, \
                                               uint32_t *sse) {             \
    int sum;                                                                \
    int64_t var;                                                            \
    highbd_12_variance(a, a_stride, b, b_stride, W, H, sse, &sum);          \
    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));               \
    return (var >= 0) ? (uint32_t)var : 0;                                  \
  }
390
// Generates the C implementations of aom_highbd_{8,10,12}_mse<W>x<H>: the
// bit-depth-rescaled sum of squared differences. The sum is discarded.
#define HIGHBD_MSE(W, H)                                                    \
  uint32_t aom_highbd_8_mse##W##x##H##_c(const uint8_t *src, int src_stride, \
                                         const uint8_t *ref, int ref_stride, \
                                         uint32_t *sse) {                   \
    int sum;                                                                \
    highbd_8_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum);   \
    return *sse;                                                            \
  }                                                                         \
                                                                            \
  uint32_t aom_highbd_10_mse##W##x##H##_c(const uint8_t *src, int src_stride, \
                                          const uint8_t *ref, int ref_stride, \
                                          uint32_t *sse) {                  \
    int sum;                                                                \
    highbd_10_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum);  \
    return *sse;                                                            \
  }                                                                         \
                                                                            \
  uint32_t aom_highbd_12_mse##W##x##H##_c(const uint8_t *src, int src_stride, \
                                          const uint8_t *ref, int ref_stride, \
                                          uint32_t *sse) {                  \
    int sum;                                                                \
    highbd_12_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum);  \
    return *sse;                                                            \
  }
415
aom_highbd_var_filter_block2d_bil_first_pass(const uint8_t * src_ptr8,uint16_t * output_ptr,unsigned int src_pixels_per_line,int pixel_step,unsigned int output_height,unsigned int output_width,const uint8_t * filter)416 void aom_highbd_var_filter_block2d_bil_first_pass(
417 const uint8_t *src_ptr8, uint16_t *output_ptr,
418 unsigned int src_pixels_per_line, int pixel_step,
419 unsigned int output_height, unsigned int output_width,
420 const uint8_t *filter) {
421 unsigned int i, j;
422 uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src_ptr8);
423 for (i = 0; i < output_height; ++i) {
424 for (j = 0; j < output_width; ++j) {
425 output_ptr[j] = ROUND_POWER_OF_TWO(
426 (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1],
427 FILTER_BITS);
428
429 ++src_ptr;
430 }
431
432 // Next row...
433 src_ptr += src_pixels_per_line - output_width;
434 output_ptr += output_width;
435 }
436 }
437
aom_highbd_var_filter_block2d_bil_second_pass(const uint16_t * src_ptr,uint16_t * output_ptr,unsigned int src_pixels_per_line,unsigned int pixel_step,unsigned int output_height,unsigned int output_width,const uint8_t * filter)438 void aom_highbd_var_filter_block2d_bil_second_pass(
439 const uint16_t *src_ptr, uint16_t *output_ptr,
440 unsigned int src_pixels_per_line, unsigned int pixel_step,
441 unsigned int output_height, unsigned int output_width,
442 const uint8_t *filter) {
443 unsigned int i, j;
444
445 for (i = 0; i < output_height; ++i) {
446 for (j = 0; j < output_width; ++j) {
447 output_ptr[j] = ROUND_POWER_OF_TWO(
448 (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1],
449 FILTER_BITS);
450 ++src_ptr;
451 }
452
453 src_ptr += src_pixels_per_line - output_width;
454 output_ptr += output_width;
455 }
456 }
457
// Generates the C implementations of
// aom_highbd_{8,10,12}_sub_pixel_variance<W>x<H>: bilinearly interpolates
// the high-bitdepth source at sub-pel position (xoffset, yoffset) with two
// separable 2-tap passes, then returns the bit-depth-appropriate variance of
// the interpolated block against `dst`. temp2 holds raw uint16_t pixels, so
// it is wrapped with CONVERT_TO_BYTEPTR for the variance call.
#define HIGHBD_SUBPIX_VAR(W, H)                                              \
  uint32_t aom_highbd_8_sub_pixel_variance##W##x##H##_c(                     \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *dst, int dst_stride, uint32_t *sse) {                   \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W,  \
                                              dst, dst_stride, sse);         \
  }                                                                          \
                                                                             \
  uint32_t aom_highbd_10_sub_pixel_variance##W##x##H##_c(                    \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *dst, int dst_stride, uint32_t *sse) {                   \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \
                                               dst, dst_stride, sse);        \
  }                                                                          \
                                                                             \
  uint32_t aom_highbd_12_sub_pixel_variance##W##x##H##_c(                    \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *dst, int dst_stride, uint32_t *sse) {                   \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \
                                               dst, dst_stride, sse);        \
  }
503
// Generates the C implementations of the high-bitdepth averaged sub-pixel
// variance functions for bit depths 8/10/12: like HIGHBD_SUBPIX_VAR, but the
// interpolated block is first combined with `second_pred` — by simple
// rounding average (aom_highbd_comp_avg_pred_c) in the first three forms, by
// the distance-weighted average described by `jcp_param`
// (aom_highbd_dist_wtd_comp_avg_pred) in the dist-wtd forms — before the
// variance against `dst` is computed.
// NOTE(review): the dist-wtd variants return via aom_highbd_N_variance##W##x##H
// (the runtime-dispatched symbols) rather than the ##_c versions used by the
// plain averaged variants — presumably intentional, but worth confirming.
#define HIGHBD_SUBPIX_AVG_VAR(W, H)                                          \
  uint32_t aom_highbd_8_sub_pixel_avg_variance##W##x##H##_c(                 \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *dst, int dst_stride, uint32_t *sse,                     \
      const uint8_t *second_pred) {                                          \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                             \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
                               CONVERT_TO_BYTEPTR(temp2), W);                \
                                                                             \
    return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,  \
                                              dst, dst_stride, sse);         \
  }                                                                          \
                                                                             \
  uint32_t aom_highbd_10_sub_pixel_avg_variance##W##x##H##_c(                \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *dst, int dst_stride, uint32_t *sse,                     \
      const uint8_t *second_pred) {                                          \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                             \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
                               CONVERT_TO_BYTEPTR(temp2), W);                \
                                                                             \
    return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
                                               dst, dst_stride, sse);        \
  }                                                                          \
                                                                             \
  uint32_t aom_highbd_12_sub_pixel_avg_variance##W##x##H##_c(                \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *dst, int dst_stride, uint32_t *sse,                     \
      const uint8_t *second_pred) {                                          \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                             \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
                               CONVERT_TO_BYTEPTR(temp2), W);                \
                                                                             \
    return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
                                               dst, dst_stride, sse);        \
  }                                                                          \
                                                                             \
  uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance##W##x##H##_c(        \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *dst, int dst_stride, uint32_t *sse,                     \
      const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) {   \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                             \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    aom_highbd_dist_wtd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, \
                                      W, H, CONVERT_TO_BYTEPTR(temp2), W,    \
                                      jcp_param);                            \
                                                                             \
    return aom_highbd_8_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst, \
                                          dst_stride, sse);                  \
  }                                                                          \
                                                                             \
  uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance##W##x##H##_c(       \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *dst, int dst_stride, uint32_t *sse,                     \
      const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) {   \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                             \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    aom_highbd_dist_wtd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, \
                                      W, H, CONVERT_TO_BYTEPTR(temp2), W,    \
                                      jcp_param);                            \
                                                                             \
    return aom_highbd_10_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst, \
                                           dst_stride, sse);                 \
  }                                                                          \
                                                                             \
  uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance##W##x##H##_c(       \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *dst, int dst_stride, uint32_t *sse,                     \
      const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) {   \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                             \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    aom_highbd_dist_wtd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, \
                                      W, H, CONVERT_TO_BYTEPTR(temp2), W,    \
                                      jcp_param);                            \
                                                                             \
    return aom_highbd_12_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst, \
                                           dst_stride, sse);                 \
  }
627
/* All three forms of the variance are available in the same sizes. */
#define HIGHBD_VARIANCES(W, H) \
  HIGHBD_VAR(W, H)             \
  HIGHBD_SUBPIX_VAR(W, H)      \
  HIGHBD_SUBPIX_AVG_VAR(W, H)

// Instantiate the high-bitdepth variance kernels for every supported size.
HIGHBD_VARIANCES(128, 128)
HIGHBD_VARIANCES(128, 64)
HIGHBD_VARIANCES(64, 128)
HIGHBD_VARIANCES(64, 64)
HIGHBD_VARIANCES(64, 32)
HIGHBD_VARIANCES(32, 64)
HIGHBD_VARIANCES(32, 32)
HIGHBD_VARIANCES(32, 16)
HIGHBD_VARIANCES(16, 32)
HIGHBD_VARIANCES(16, 16)
HIGHBD_VARIANCES(16, 8)
HIGHBD_VARIANCES(8, 16)
HIGHBD_VARIANCES(8, 8)
HIGHBD_VARIANCES(8, 4)
HIGHBD_VARIANCES(4, 8)
HIGHBD_VARIANCES(4, 4)

// Realtime mode doesn't use 4x rectangular blocks.
#if !CONFIG_REALTIME_ONLY
HIGHBD_VARIANCES(4, 16)
HIGHBD_VARIANCES(16, 4)
HIGHBD_VARIANCES(8, 32)
HIGHBD_VARIANCES(32, 8)
HIGHBD_VARIANCES(16, 64)
HIGHBD_VARIANCES(64, 16)
#endif

// MSE-only variants exist just for these classic sizes.
HIGHBD_MSE(16, 16)
HIGHBD_MSE(16, 8)
HIGHBD_MSE(8, 16)
HIGHBD_MSE(8, 8)
665
// High-bitdepth analog of aom_comp_avg_pred_c: writes the rounded per-pixel
// average of `pred8` and `ref8` into `comp_pred8`. All three are
// CONVERT_TO_BYTEPTR-style handles to uint16_t pixels; pred and comp_pred
// are packed at `width` stride, ref uses `ref_stride`.
void aom_highbd_comp_avg_pred_c(uint8_t *comp_pred8, const uint8_t *pred8,
                                int width, int height, const uint8_t *ref8,
                                int ref_stride) {
  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);

  for (int r = 0; r < height; ++r) {
    for (int c = 0; c < width; ++c) {
      comp_pred[c] = ROUND_POWER_OF_TWO(pred[c] + ref[c], 1);
    }
    comp_pred += width;
    pred += width;
    ref += ref_stride;
  }
}
683
aom_highbd_dist_wtd_comp_avg_pred_c(uint8_t * comp_pred8,const uint8_t * pred8,int width,int height,const uint8_t * ref8,int ref_stride,const DIST_WTD_COMP_PARAMS * jcp_param)684 void aom_highbd_dist_wtd_comp_avg_pred_c(
685 uint8_t *comp_pred8, const uint8_t *pred8, int width, int height,
686 const uint8_t *ref8, int ref_stride,
687 const DIST_WTD_COMP_PARAMS *jcp_param) {
688 int i, j;
689 const int fwd_offset = jcp_param->fwd_offset;
690 const int bck_offset = jcp_param->bck_offset;
691 uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
692 uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
693 uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
694
695 for (i = 0; i < height; ++i) {
696 for (j = 0; j < width; ++j) {
697 int tmp = pred[j] * bck_offset + ref[j] * fwd_offset;
698 tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS);
699 comp_pred[j] = (uint16_t)tmp;
700 }
701 comp_pred += width;
702 pred += width;
703 ref += ref_stride;
704 }
705 }
706 #endif // CONFIG_AV1_HIGHBITDEPTH
707
// Blends `pred` and `ref` per-pixel using a 64-weight mask:
// mask[j] weights the first source, (64 - mask[j]) the second. Inverting the
// mask swaps which of pred/ref is the first source (and the matching
// strides) rather than adjusting the weights.
void aom_comp_mask_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
                          int height, const uint8_t *ref, int ref_stride,
                          const uint8_t *mask, int mask_stride,
                          int invert_mask) {
  const uint8_t *p0 = invert_mask ? pred : ref;
  const uint8_t *p1 = invert_mask ? ref : pred;
  const int p0_stride = invert_mask ? width : ref_stride;
  const int p1_stride = invert_mask ? ref_stride : width;

  for (int r = 0; r < height; ++r) {
    for (int c = 0; c < width; ++c) {
      comp_pred[c] = AOM_BLEND_A64(mask[c], p0[c], p1[c]);
    }
    comp_pred += width;
    p0 += p0_stride;
    p1 += p1_stride;
    mask += mask_stride;
  }
}
727
// Generates the C implementation of aom_masked_sub_pixel_variance<W>x<H>:
// bilinearly interpolates the source at sub-pel position (xoffset, yoffset),
// blends the result with `second_pred` under `msk` (optionally inverted),
// and returns the variance of the blend against `ref`.
#define MASK_SUBPIX_VAR(W, H)                                                 \
  unsigned int aom_masked_sub_pixel_variance##W##x##H##_c(                    \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
      const uint8_t *ref, int ref_stride, const uint8_t *second_pred,         \
      const uint8_t *msk, int msk_stride, int invert_mask,                    \
      unsigned int *sse) {                                                    \
    uint16_t fdata3[(H + 1) * W];                                             \
    uint8_t temp2[H * W];                                                     \
    DECLARE_ALIGNED(16, uint8_t, temp3[H * W]);                               \
                                                                              \
    var_filter_block2d_bil_first_pass_c(src, fdata3, src_stride, 1, H + 1, W, \
                                        bilinear_filters_2t[xoffset]);        \
    var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W,           \
                                         bilinear_filters_2t[yoffset]);       \
                                                                              \
    aom_comp_mask_pred_c(temp3, second_pred, W, H, temp2, W, msk, msk_stride, \
                         invert_mask);                                        \
    return aom_variance##W##x##H##_c(temp3, W, ref, ref_stride, sse);         \
  }
747
// Instantiate the masked sub-pixel variance functions for every square and
// 2:1 rectangular block size.
MASK_SUBPIX_VAR(4, 4)
MASK_SUBPIX_VAR(4, 8)
MASK_SUBPIX_VAR(8, 4)
MASK_SUBPIX_VAR(8, 8)
MASK_SUBPIX_VAR(8, 16)
MASK_SUBPIX_VAR(16, 8)
MASK_SUBPIX_VAR(16, 16)
MASK_SUBPIX_VAR(16, 32)
MASK_SUBPIX_VAR(32, 16)
MASK_SUBPIX_VAR(32, 32)
MASK_SUBPIX_VAR(32, 64)
MASK_SUBPIX_VAR(64, 32)
MASK_SUBPIX_VAR(64, 64)
MASK_SUBPIX_VAR(64, 128)
MASK_SUBPIX_VAR(128, 64)
MASK_SUBPIX_VAR(128, 128)

// Realtime mode doesn't use 4x rectangular blocks.
#if !CONFIG_REALTIME_ONLY
MASK_SUBPIX_VAR(4, 16)
MASK_SUBPIX_VAR(16, 4)
MASK_SUBPIX_VAR(8, 32)
MASK_SUBPIX_VAR(32, 8)
MASK_SUBPIX_VAR(16, 64)
MASK_SUBPIX_VAR(64, 16)
#endif
774
775 #if CONFIG_AV1_HIGHBITDEPTH
// High-bitdepth analog of aom_comp_mask_pred_c: blends `pred8` and `ref8`
// per-pixel with a 64-weight mask. Unlike the 8-bit version, ref and pred
// keep their own strides; invert_mask only swaps which buffer receives
// mask[j] versus (64 - mask[j]). Buffers are CONVERT_TO_BYTEPTR-style
// handles to uint16_t pixels.
void aom_highbd_comp_mask_pred_c(uint8_t *comp_pred8, const uint8_t *pred8,
                                 int width, int height, const uint8_t *ref8,
                                 int ref_stride, const uint8_t *mask,
                                 int mask_stride, int invert_mask) {
  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);

  for (int r = 0; r < height; ++r) {
    for (int c = 0; c < width; ++c) {
      const uint16_t v0 = invert_mask ? pred[c] : ref[c];
      const uint16_t v1 = invert_mask ? ref[c] : pred[c];
      comp_pred[c] = AOM_BLEND_A64(mask[c], v0, v1);
    }
    comp_pred += width;
    pred += width;
    ref += ref_stride;
    mask += mask_stride;
  }
}
797
// Generates the C implementations of
// aom_highbd_{8,10,12}_masked_sub_pixel_variance<W>x<H>: bilinearly
// interpolates the high-bitdepth source at sub-pel position
// (xoffset, yoffset), blends the result with `second_pred` under `msk`
// (optionally inverted), and returns the bit-depth-appropriate variance of
// the blend against `ref`. temp2/temp3 hold raw uint16_t pixels, so they are
// wrapped with CONVERT_TO_BYTEPTR for the blend and variance calls.
#define HIGHBD_MASK_SUBPIX_VAR(W, H)                                         \
  unsigned int aom_highbd_8_masked_sub_pixel_variance##W##x##H##_c(          \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *ref, int ref_stride, const uint8_t *second_pred,        \
      const uint8_t *msk, int msk_stride, int invert_mask,                   \
      unsigned int *sse) {                                                   \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                             \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    aom_highbd_comp_mask_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
                                CONVERT_TO_BYTEPTR(temp2), W, msk, msk_stride, \
                                invert_mask);                                \
                                                                             \
    return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,  \
                                              ref, ref_stride, sse);         \
  }                                                                          \
                                                                             \
  unsigned int aom_highbd_10_masked_sub_pixel_variance##W##x##H##_c(         \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *ref, int ref_stride, const uint8_t *second_pred,        \
      const uint8_t *msk, int msk_stride, int invert_mask,                   \
      unsigned int *sse) {                                                   \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                             \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    aom_highbd_comp_mask_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
                                CONVERT_TO_BYTEPTR(temp2), W, msk, msk_stride, \
                                invert_mask);                                \
                                                                             \
    return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
                                               ref, ref_stride, sse);        \
  }                                                                          \
                                                                             \
  unsigned int aom_highbd_12_masked_sub_pixel_variance##W##x##H##_c(         \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *ref, int ref_stride, const uint8_t *second_pred,        \
      const uint8_t *msk, int msk_stride, int invert_mask,                   \
      unsigned int *sse) {                                                   \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                             \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    aom_highbd_comp_mask_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
                                CONVERT_TO_BYTEPTR(temp2), W, msk, msk_stride, \
                                invert_mask);                                \
                                                                             \
    return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
                                               ref, ref_stride, sse);        \
  }
864
// Instantiate the masked sub-pixel variance functions for every supported
// block size.
HIGHBD_MASK_SUBPIX_VAR(4, 4)
HIGHBD_MASK_SUBPIX_VAR(4, 8)
HIGHBD_MASK_SUBPIX_VAR(8, 4)
HIGHBD_MASK_SUBPIX_VAR(8, 8)
HIGHBD_MASK_SUBPIX_VAR(8, 16)
HIGHBD_MASK_SUBPIX_VAR(16, 8)
HIGHBD_MASK_SUBPIX_VAR(16, 16)
HIGHBD_MASK_SUBPIX_VAR(16, 32)
HIGHBD_MASK_SUBPIX_VAR(32, 16)
HIGHBD_MASK_SUBPIX_VAR(32, 32)
HIGHBD_MASK_SUBPIX_VAR(32, 64)
HIGHBD_MASK_SUBPIX_VAR(64, 32)
HIGHBD_MASK_SUBPIX_VAR(64, 64)
HIGHBD_MASK_SUBPIX_VAR(64, 128)
HIGHBD_MASK_SUBPIX_VAR(128, 64)
HIGHBD_MASK_SUBPIX_VAR(128, 128)
#if !CONFIG_REALTIME_ONLY
// The 4:1 / 1:4 aspect-ratio block sizes are only needed by non-realtime
// builds.
HIGHBD_MASK_SUBPIX_VAR(4, 16)
HIGHBD_MASK_SUBPIX_VAR(16, 4)
HIGHBD_MASK_SUBPIX_VAR(8, 32)
HIGHBD_MASK_SUBPIX_VAR(32, 8)
HIGHBD_MASK_SUBPIX_VAR(16, 64)
HIGHBD_MASK_SUBPIX_VAR(64, 16)
#endif
889 #endif // CONFIG_AV1_HIGHBITDEPTH
890
891 #if !CONFIG_REALTIME_ONLY
obmc_variance(const uint8_t * pre,int pre_stride,const int32_t * wsrc,const int32_t * mask,int w,int h,unsigned int * sse,int * sum)892 static INLINE void obmc_variance(const uint8_t *pre, int pre_stride,
893 const int32_t *wsrc, const int32_t *mask,
894 int w, int h, unsigned int *sse, int *sum) {
895 int i, j;
896
897 *sse = 0;
898 *sum = 0;
899
900 for (i = 0; i < h; i++) {
901 for (j = 0; j < w; j++) {
902 int diff = ROUND_POWER_OF_TWO_SIGNED(wsrc[j] - pre[j] * mask[j], 12);
903 *sum += diff;
904 *sse += diff * diff;
905 }
906
907 pre += pre_stride;
908 wsrc += w;
909 mask += w;
910 }
911 }
912
// Defines aom_obmc_variance<W>x<H>_c: OBMC variance of the predictor |pre|
// against the weighted source, i.e. SSE minus the squared mean. The SSE is
// also written through |sse|.
#define OBMC_VAR(W, H)                                            \
  unsigned int aom_obmc_variance##W##x##H##_c(                    \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,    \
      const int32_t *mask, unsigned int *sse) {                   \
    int sum;                                                      \
    obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum);  \
    return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \
  }
921
// Defines aom_obmc_sub_pixel_variance<W>x<H>_c: interpolates |pre| at the
// sub-pixel position (xoffset, yoffset) with separable 2-tap bilinear
// filters, then computes the OBMC variance of the interpolated block.
#define OBMC_SUBPIX_VAR(W, H)                                               \
  unsigned int aom_obmc_sub_pixel_variance##W##x##H##_c(                    \
      const uint8_t *pre, int pre_stride, int xoffset, int yoffset,         \
      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {        \
    uint16_t fdata3[(H + 1) * W];                                           \
    uint8_t temp2[H * W];                                                   \
                                                                            \
    var_filter_block2d_bil_first_pass_c(pre, fdata3, pre_stride, 1, H + 1,  \
                                        W, bilinear_filters_2t[xoffset]);   \
    var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W,         \
                                         bilinear_filters_2t[yoffset]);     \
                                                                            \
    return aom_obmc_variance##W##x##H##_c(temp2, W, wsrc, mask, sse);       \
  }
936
// Instantiate the OBMC variance and sub-pixel variance functions for every
// supported block size.
OBMC_VAR(4, 4)
OBMC_SUBPIX_VAR(4, 4)

OBMC_VAR(4, 8)
OBMC_SUBPIX_VAR(4, 8)

OBMC_VAR(8, 4)
OBMC_SUBPIX_VAR(8, 4)

OBMC_VAR(8, 8)
OBMC_SUBPIX_VAR(8, 8)

OBMC_VAR(8, 16)
OBMC_SUBPIX_VAR(8, 16)

OBMC_VAR(16, 8)
OBMC_SUBPIX_VAR(16, 8)

OBMC_VAR(16, 16)
OBMC_SUBPIX_VAR(16, 16)

OBMC_VAR(16, 32)
OBMC_SUBPIX_VAR(16, 32)

OBMC_VAR(32, 16)
OBMC_SUBPIX_VAR(32, 16)

OBMC_VAR(32, 32)
OBMC_SUBPIX_VAR(32, 32)

OBMC_VAR(32, 64)
OBMC_SUBPIX_VAR(32, 64)

OBMC_VAR(64, 32)
OBMC_SUBPIX_VAR(64, 32)

OBMC_VAR(64, 64)
OBMC_SUBPIX_VAR(64, 64)

OBMC_VAR(64, 128)
OBMC_SUBPIX_VAR(64, 128)

OBMC_VAR(128, 64)
OBMC_SUBPIX_VAR(128, 64)

OBMC_VAR(128, 128)
OBMC_SUBPIX_VAR(128, 128)

// 4:1 / 1:4 aspect-ratio block sizes.
OBMC_VAR(4, 16)
OBMC_SUBPIX_VAR(4, 16)
OBMC_VAR(16, 4)
OBMC_SUBPIX_VAR(16, 4)
OBMC_VAR(8, 32)
OBMC_SUBPIX_VAR(8, 32)
OBMC_VAR(32, 8)
OBMC_SUBPIX_VAR(32, 8)
OBMC_VAR(16, 64)
OBMC_SUBPIX_VAR(16, 64)
OBMC_VAR(64, 16)
OBMC_SUBPIX_VAR(64, 16)
997
998 #if CONFIG_AV1_HIGHBITDEPTH
highbd_obmc_variance64(const uint8_t * pre8,int pre_stride,const int32_t * wsrc,const int32_t * mask,int w,int h,uint64_t * sse,int64_t * sum)999 static INLINE void highbd_obmc_variance64(const uint8_t *pre8, int pre_stride,
1000 const int32_t *wsrc,
1001 const int32_t *mask, int w, int h,
1002 uint64_t *sse, int64_t *sum) {
1003 int i, j;
1004 uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
1005
1006 *sse = 0;
1007 *sum = 0;
1008
1009 for (i = 0; i < h; i++) {
1010 for (j = 0; j < w; j++) {
1011 int diff = ROUND_POWER_OF_TWO_SIGNED(wsrc[j] - pre[j] * mask[j], 12);
1012 *sum += diff;
1013 *sse += diff * diff;
1014 }
1015
1016 pre += pre_stride;
1017 wsrc += w;
1018 mask += w;
1019 }
1020 }
1021
highbd_obmc_variance(const uint8_t * pre8,int pre_stride,const int32_t * wsrc,const int32_t * mask,int w,int h,unsigned int * sse,int * sum)1022 static INLINE void highbd_obmc_variance(const uint8_t *pre8, int pre_stride,
1023 const int32_t *wsrc,
1024 const int32_t *mask, int w, int h,
1025 unsigned int *sse, int *sum) {
1026 int64_t sum64;
1027 uint64_t sse64;
1028 highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &sse64, &sum64);
1029 *sum = (int)sum64;
1030 *sse = (unsigned int)sse64;
1031 }
1032
highbd_10_obmc_variance(const uint8_t * pre8,int pre_stride,const int32_t * wsrc,const int32_t * mask,int w,int h,unsigned int * sse,int * sum)1033 static INLINE void highbd_10_obmc_variance(const uint8_t *pre8, int pre_stride,
1034 const int32_t *wsrc,
1035 const int32_t *mask, int w, int h,
1036 unsigned int *sse, int *sum) {
1037 int64_t sum64;
1038 uint64_t sse64;
1039 highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &sse64, &sum64);
1040 *sum = (int)ROUND_POWER_OF_TWO(sum64, 2);
1041 *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 4);
1042 }
1043
highbd_12_obmc_variance(const uint8_t * pre8,int pre_stride,const int32_t * wsrc,const int32_t * mask,int w,int h,unsigned int * sse,int * sum)1044 static INLINE void highbd_12_obmc_variance(const uint8_t *pre8, int pre_stride,
1045 const int32_t *wsrc,
1046 const int32_t *mask, int w, int h,
1047 unsigned int *sse, int *sum) {
1048 int64_t sum64;
1049 uint64_t sse64;
1050 highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &sse64, &sum64);
1051 *sum = (int)ROUND_POWER_OF_TWO(sum64, 4);
1052 *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 8);
1053 }
1054
// Generates the 8-, 10- and 12-bit high bit-depth OBMC variance functions
// (SSE minus squared mean). The 10- and 12-bit variants clamp a negative
// variance (possible after the rounding in the rescaling wrappers) to 0.
#define HIGHBD_OBMC_VAR(W, H)                                              \
  unsigned int aom_highbd_8_obmc_variance##W##x##H##_c(                    \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,             \
      const int32_t *mask, unsigned int *sse) {                            \
    int sum;                                                               \
    highbd_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum);    \
    return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H));          \
  }                                                                        \
                                                                           \
  unsigned int aom_highbd_10_obmc_variance##W##x##H##_c(                   \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,             \
      const int32_t *mask, unsigned int *sse) {                            \
    int sum;                                                               \
    int64_t var;                                                           \
    highbd_10_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));              \
    return (var >= 0) ? (uint32_t)var : 0;                                 \
  }                                                                        \
                                                                           \
  unsigned int aom_highbd_12_obmc_variance##W##x##H##_c(                   \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,             \
      const int32_t *mask, unsigned int *sse) {                            \
    int sum;                                                               \
    int64_t var;                                                           \
    highbd_12_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));              \
    return (var >= 0) ? (uint32_t)var : 0;                                 \
  }
1083
// Generates the 8-, 10- and 12-bit high bit-depth OBMC sub-pixel variance
// functions: separable 2-tap bilinear interpolation of |pre| at
// (xoffset, yoffset), followed by the matching OBMC variance on the
// interpolated block (stride W, repacked via CONVERT_TO_BYTEPTR).
#define HIGHBD_OBMC_SUBPIX_VAR(W, H)                                         \
  unsigned int aom_highbd_8_obmc_sub_pixel_variance##W##x##H##_c(            \
      const uint8_t *pre, int pre_stride, int xoffset, int yoffset,          \
      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {         \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    return aom_highbd_8_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
                                                   W, wsrc, mask, sse);      \
  }                                                                          \
                                                                             \
  unsigned int aom_highbd_10_obmc_sub_pixel_variance##W##x##H##_c(           \
      const uint8_t *pre, int pre_stride, int xoffset, int yoffset,          \
      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {         \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    return aom_highbd_10_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
                                                    W, wsrc, mask, sse);     \
  }                                                                          \
                                                                             \
  unsigned int aom_highbd_12_obmc_sub_pixel_variance##W##x##H##_c(           \
      const uint8_t *pre, int pre_stride, int xoffset, int yoffset,          \
      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {         \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    return aom_highbd_12_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
                                                    W, wsrc, mask, sse);     \
  }
1129
// Instantiate the high bit-depth OBMC variance and sub-pixel variance
// functions for every supported block size.
HIGHBD_OBMC_VAR(4, 4)
HIGHBD_OBMC_SUBPIX_VAR(4, 4)

HIGHBD_OBMC_VAR(4, 8)
HIGHBD_OBMC_SUBPIX_VAR(4, 8)

HIGHBD_OBMC_VAR(8, 4)
HIGHBD_OBMC_SUBPIX_VAR(8, 4)

HIGHBD_OBMC_VAR(8, 8)
HIGHBD_OBMC_SUBPIX_VAR(8, 8)

HIGHBD_OBMC_VAR(8, 16)
HIGHBD_OBMC_SUBPIX_VAR(8, 16)

HIGHBD_OBMC_VAR(16, 8)
HIGHBD_OBMC_SUBPIX_VAR(16, 8)

HIGHBD_OBMC_VAR(16, 16)
HIGHBD_OBMC_SUBPIX_VAR(16, 16)

HIGHBD_OBMC_VAR(16, 32)
HIGHBD_OBMC_SUBPIX_VAR(16, 32)

HIGHBD_OBMC_VAR(32, 16)
HIGHBD_OBMC_SUBPIX_VAR(32, 16)

HIGHBD_OBMC_VAR(32, 32)
HIGHBD_OBMC_SUBPIX_VAR(32, 32)

HIGHBD_OBMC_VAR(32, 64)
HIGHBD_OBMC_SUBPIX_VAR(32, 64)

HIGHBD_OBMC_VAR(64, 32)
HIGHBD_OBMC_SUBPIX_VAR(64, 32)

HIGHBD_OBMC_VAR(64, 64)
HIGHBD_OBMC_SUBPIX_VAR(64, 64)

HIGHBD_OBMC_VAR(64, 128)
HIGHBD_OBMC_SUBPIX_VAR(64, 128)

HIGHBD_OBMC_VAR(128, 64)
HIGHBD_OBMC_SUBPIX_VAR(128, 64)

HIGHBD_OBMC_VAR(128, 128)
HIGHBD_OBMC_SUBPIX_VAR(128, 128)

// 4:1 / 1:4 aspect-ratio block sizes.
HIGHBD_OBMC_VAR(4, 16)
HIGHBD_OBMC_SUBPIX_VAR(4, 16)
HIGHBD_OBMC_VAR(16, 4)
HIGHBD_OBMC_SUBPIX_VAR(16, 4)
HIGHBD_OBMC_VAR(8, 32)
HIGHBD_OBMC_SUBPIX_VAR(8, 32)
HIGHBD_OBMC_VAR(32, 8)
HIGHBD_OBMC_SUBPIX_VAR(32, 8)
HIGHBD_OBMC_VAR(16, 64)
HIGHBD_OBMC_SUBPIX_VAR(16, 64)
HIGHBD_OBMC_VAR(64, 16)
HIGHBD_OBMC_SUBPIX_VAR(64, 16)
1190 #endif // CONFIG_AV1_HIGHBITDEPTH
1191 #endif // !CONFIG_REALTIME_ONLY
1192
// Returns the sum of squared differences between the 8-bit block |dst|
// (stride |dstride|) and the 16-bit block |src| (stride |sstride|) over a
// |w| x |h| region. No normalization is applied.
uint64_t aom_mse_wxh_16bit_c(uint8_t *dst, int dstride, uint16_t *src,
                             int sstride, int w, int h) {
  uint64_t sum = 0;
  for (int i = 0; i < h; i++) {
    for (int j = 0; j < w; j++) {
      const int e = (uint16_t)dst[i * dstride + j] - src[i * sstride + j];
      // Widen before squaring: |e| can reach 65535 when src holds full
      // 16-bit values, and 65535^2 overflows a 32-bit signed int (UB).
      sum += (int64_t)e * e;
    }
  }
  return sum;
}
1204
// Returns the SSE of a 16-sample-wide region composed of (16 / w) adjacent
// w x h blocks. |dst| is one 8-bit surface with stride |dstride|; |src|
// stores each w x h block contiguously (w * h elements per block).
uint64_t aom_mse_16xh_16bit_c(uint8_t *dst, int dstride, uint16_t *src, int w,
                              int h) {
  uint16_t *src_temp = src;
  uint8_t *dst_temp = dst;
  const int num_blks = 16 / w;
  // Accumulate unsigned to match aom_mse_wxh_16bit_c's uint64_t return and
  // this function's return type (was int64_t, mixing signedness).
  uint64_t sum = 0;
  for (int i = 0; i < num_blks; i++) {
    sum += aom_mse_wxh_16bit_c(dst_temp, dstride, src_temp, w, w, h);
    dst_temp += w;
    src_temp += (w * h);
  }
  return sum;
}
1218
// High bit-depth variant: returns the sum of squared differences between the
// 16-bit blocks |dst| (stride |dstride|) and |src| (stride |sstride|) over a
// |w| x |h| region. No normalization is applied.
uint64_t aom_mse_wxh_16bit_highbd_c(uint16_t *dst, int dstride, uint16_t *src,
                                    int sstride, int w, int h) {
  uint64_t sum = 0;
  for (int i = 0; i < h; i++) {
    for (int j = 0; j < w; j++) {
      const int e = dst[i * dstride + j] - src[i * sstride + j];
      // Widen before squaring: |e| can reach 65535 for 16-bit samples, and
      // 65535^2 overflows a 32-bit signed int (UB).
      sum += (int64_t)e * e;
    }
  }
  return sum;
}
1230