/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
#include <assert.h>
#include <stdlib.h>

#include "config/aom_config.h"
#include "config/aom_dsp_rtcd.h"

#include "aom/aom_integer.h"
#include "aom_ports/mem.h"

#include "aom_dsp/aom_filter.h"
#include "aom_dsp/blend.h"
#include "aom_dsp/variance.h"

#include "av1/common/filter.h"
#include "av1/common/reconinter.h"

#if !CONFIG_REALTIME_ONLY
uint32_t aom_get_mb_ss_c(const int16_t *a) {
  unsigned int i, sum = 0;

  for (i = 0; i < 256; ++i) {
    sum += a[i] * a[i];
  }

  return sum;
}
#endif  // !CONFIG_REALTIME_ONLY

static void variance(const uint8_t *a, int a_stride, const uint8_t *b,
                     int b_stride, int w, int h, uint32_t *sse, int *sum) {
  int i, j;
  int tsum = 0;
  uint32_t tsse = 0;

  for (i = 0; i < h; ++i) {
    for (j = 0; j < w; ++j) {
      const int diff = a[j] - b[j];
      tsum += diff;
      tsse += diff * diff;
    }

    a += a_stride;
    b += b_stride;
  }
  *sum = tsum;
  *sse = tsse;
}
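
// Worked example: for a 2x2 block whose per-pixel diffs are {1, 3, -1, 1},
// variance() returns sum = 4 and sse = 12, and the VAR() wrappers below
// compute sse - sum^2 / (w * h) = 12 - 16 / 4 = 8, the familiar
// E[d^2] - E[d]^2 identity scaled by the pixel count.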

uint32_t aom_sse_odd_size(const uint8_t *a, int a_stride, const uint8_t *b,
                          int b_stride, int w, int h) {
  uint32_t sse;
  int sum;
  variance(a, a_stride, b, b_stride, w, h, &sse, &sum);
  return sse;
}

// Applies a 1-D 2-tap bilinear filter to the source block in either horizontal
// or vertical direction to produce the filtered output block. Used to
// implement the first pass of the 2-D separable filter.
//
// Produces uint16_t output to retain precision for the next pass. The two
// filter taps should sum to FILTER_WEIGHT (1 << FILTER_BITS). pixel_step
// defines whether the filter is applied horizontally (pixel_step = 1) or
// vertically (pixel_step = stride), i.e. the offset required to move from one
// input to the next.
static void var_filter_block2d_bil_first_pass_c(
    const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line,
    unsigned int pixel_step, unsigned int output_height,
    unsigned int output_width, const uint8_t *filter) {
  unsigned int i, j;

  for (i = 0; i < output_height; ++i) {
    for (j = 0; j < output_width; ++j) {
      b[j] = ROUND_POWER_OF_TWO(
          (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS);

      ++a;
    }

    a += src_pixels_per_line - output_width;
    b += output_width;
  }
}
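
// For reference: the two taps of bilinear_filters_2t[] sum to
// 1 << FILTER_BITS, so the expression above is a weighted average of two
// neighboring inputs. At the half-pel offset the taps are {64, 64} and the
// output reduces to the rounded mean (a[0] * 64 + a[1] * 64 + 64) >> 7.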

// Applies a 1-D 2-tap bilinear filter to the source block in either horizontal
// or vertical direction to produce the filtered output block. Used to
// implement the second pass of the 2-D separable filter.
//
// Requires 16-bit input as produced by var_filter_block2d_bil_first_pass_c.
// The two filter taps should sum to FILTER_WEIGHT (1 << FILTER_BITS).
// pixel_step defines whether the filter is applied horizontally
// (pixel_step = 1) or vertically (pixel_step = stride), i.e. the offset
// required to move from one input to the next. Output is 8-bit.
static void var_filter_block2d_bil_second_pass_c(
    const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line,
    unsigned int pixel_step, unsigned int output_height,
    unsigned int output_width, const uint8_t *filter) {
  unsigned int i, j;

  for (i = 0; i < output_height; ++i) {
    for (j = 0; j < output_width; ++j) {
      b[j] = ROUND_POWER_OF_TWO(
          (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS);
      ++a;
    }

    a += src_pixels_per_line - output_width;
    b += output_width;
  }
}

#define VAR(W, H)                                                    \
  uint32_t aom_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
                                     const uint8_t *b, int b_stride, \
                                     uint32_t *sse) {                \
    int sum;                                                         \
    variance(a, a_stride, b, b_stride, W, H, sse, &sum);             \
    return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H));        \
  }

#define SUBPIX_VAR(W, H)                                                  \
  uint32_t aom_sub_pixel_variance##W##x##H##_c(                           \
      const uint8_t *a, int a_stride, int xoffset, int yoffset,           \
      const uint8_t *b, int b_stride, uint32_t *sse) {                    \
    uint16_t fdata3[(H + 1) * W];                                         \
    uint8_t temp2[H * W];                                                 \
                                                                          \
    var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \
                                        bilinear_filters_2t[xoffset]);    \
    var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W,       \
                                         bilinear_filters_2t[yoffset]);   \
                                                                          \
    return aom_variance##W##x##H##_c(temp2, W, b, b_stride, sse);         \
  }
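
// Usage sketch: xoffset and yoffset index the eight entries of
// bilinear_filters_2t[], i.e. the sub-pixel phase in 1/8-pel units, e.g.
//   uint32_t sse;
//   uint32_t var = aom_sub_pixel_variance16x16_c(
//       src, src_stride, 4 /* half-pel x */, 0 /* full-pel y */, ref,
//       ref_stride, &sse);
// filters src at a half-pel horizontal offset before comparing it to ref.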

#define SUBPIX_AVG_VAR(W, H)                                              \
  uint32_t aom_sub_pixel_avg_variance##W##x##H##_c(                       \
      const uint8_t *a, int a_stride, int xoffset, int yoffset,           \
      const uint8_t *b, int b_stride, uint32_t *sse,                      \
      const uint8_t *second_pred) {                                       \
    uint16_t fdata3[(H + 1) * W];                                         \
    uint8_t temp2[H * W];                                                 \
    DECLARE_ALIGNED(16, uint8_t, temp3[H * W]);                           \
                                                                          \
    var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \
                                        bilinear_filters_2t[xoffset]);    \
    var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W,       \
                                         bilinear_filters_2t[yoffset]);   \
                                                                          \
    aom_comp_avg_pred(temp3, second_pred, W, H, temp2, W);                \
                                                                          \
    return aom_variance##W##x##H##_c(temp3, W, b, b_stride, sse);         \
  }

void aom_get_var_sse_sum_8x8_quad_c(const uint8_t *a, int a_stride,
                                    const uint8_t *b, int b_stride,
                                    uint32_t *sse8x8, int *sum8x8,
                                    unsigned int *tot_sse, int *tot_sum,
                                    uint32_t *var8x8) {
  // Loop over four horizontally adjacent 8x8 blocks.
  for (int k = 0; k < 4; k++) {
    variance(a + (k * 8), a_stride, b + (k * 8), b_stride, 8, 8, &sse8x8[k],
             &sum8x8[k]);
  }

  // Accumulate the total sse and sum over the four blocks, then derive the
  // per-block 8x8 variances.
  *tot_sse += sse8x8[0] + sse8x8[1] + sse8x8[2] + sse8x8[3];
  *tot_sum += sum8x8[0] + sum8x8[1] + sum8x8[2] + sum8x8[3];
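  // The shift by 6 below is division by the 8x8 block size (64), matching
  // the sse - sum^2 / (w * h) form used elsewhere in this file.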
  for (int i = 0; i < 4; i++)
    var8x8[i] = sse8x8[i] - (uint32_t)(((int64_t)sum8x8[i] * sum8x8[i]) >> 6);
}

void aom_get_var_sse_sum_16x16_dual_c(const uint8_t *src_ptr, int source_stride,
                                      const uint8_t *ref_ptr, int ref_stride,
                                      uint32_t *sse16x16, unsigned int *tot_sse,
                                      int *tot_sum, uint32_t *var16x16) {
  int sum16x16[2] = { 0 };
  // Loop over two horizontally adjacent 16x16 blocks.
  for (int k = 0; k < 2; k++) {
    variance(src_ptr + (k * 16), source_stride, ref_ptr + (k * 16), ref_stride,
             16, 16, &sse16x16[k], &sum16x16[k]);
  }

  // Accumulate the total sse and sum over the two blocks, then derive the
  // per-block 16x16 variances (the shift by 8 is division by 16 * 16).
  *tot_sse += sse16x16[0] + sse16x16[1];
  *tot_sum += sum16x16[0] + sum16x16[1];
  for (int i = 0; i < 2; i++)
    var16x16[i] =
        sse16x16[i] - (uint32_t)(((int64_t)sum16x16[i] * sum16x16[i]) >> 8);
}

/* Identical to the variance call except it does not calculate the
 * sse - sum^2 / (w * h) term and returns sse in addition to modifying the
 * passed-in variable.
 */
#define MSE(W, H)                                               \
  uint32_t aom_mse##W##x##H##_c(const uint8_t *a, int a_stride, \
                                const uint8_t *b, int b_stride, \
                                uint32_t *sse) {                \
    int sum;                                                    \
    variance(a, a_stride, b, b_stride, W, H, sse, &sum);        \
    return *sse;                                                \
  }

/* All three forms of the variance are available in the same sizes. */
#define VARIANCES(W, H) \
  VAR(W, H)             \
  SUBPIX_VAR(W, H)      \
  SUBPIX_AVG_VAR(W, H)

VARIANCES(128, 128)
VARIANCES(128, 64)
VARIANCES(64, 128)
VARIANCES(64, 64)
VARIANCES(64, 32)
VARIANCES(32, 64)
VARIANCES(32, 32)
VARIANCES(32, 16)
VARIANCES(16, 32)
VARIANCES(16, 16)
VARIANCES(16, 8)
VARIANCES(8, 16)
VARIANCES(8, 8)
VARIANCES(8, 4)
VARIANCES(4, 8)
VARIANCES(4, 4)

// Realtime mode doesn't use 4x rectangular blocks.
#if !CONFIG_REALTIME_ONLY
VARIANCES(4, 16)
VARIANCES(16, 4)
VARIANCES(8, 32)
VARIANCES(32, 8)
VARIANCES(16, 64)
VARIANCES(64, 16)
#endif

MSE(16, 16)
MSE(16, 8)
MSE(8, 16)
MSE(8, 8)

void aom_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
                         int height, const uint8_t *ref, int ref_stride) {
  int i, j;

  for (i = 0; i < height; ++i) {
    for (j = 0; j < width; ++j) {
      const int tmp = pred[j] + ref[j];
      comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
    }
    comp_pred += width;
    pred += width;
    ref += ref_stride;
  }
}
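
// ROUND_POWER_OF_TWO(tmp, 1) is the rounded average (pred + ref + 1) >> 1;
// e.g. pred = 3 and ref = 4 yield 4 rather than the truncated 3.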

#if CONFIG_AV1_HIGHBITDEPTH
static void highbd_variance64(const uint8_t *a8, int a_stride,
                              const uint8_t *b8, int b_stride, int w, int h,
                              uint64_t *sse, int64_t *sum) {
  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
  const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
  int64_t tsum = 0;
  uint64_t tsse = 0;
  for (int i = 0; i < h; ++i) {
    int32_t lsum = 0;
    for (int j = 0; j < w; ++j) {
      const int diff = a[j] - b[j];
      lsum += diff;
      tsse += (uint32_t)(diff * diff);
    }
    tsum += lsum;
    a += a_stride;
    b += b_stride;
  }
  *sum = tsum;
  *sse = tsse;
}
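
// The 64-bit accumulators above are needed for large high-bitdepth blocks:
// with 12-bit input a single squared diff can approach 4095^2 (about 2^24),
// so a 128x128 block (2^14 pixels) can overflow the 32-bit sse accumulator
// used by the 8-bit variance() helper.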

uint64_t aom_highbd_sse_odd_size(const uint8_t *a, int a_stride,
                                 const uint8_t *b, int b_stride, int w, int h) {
  uint64_t sse;
  int64_t sum;
  highbd_variance64(a, a_stride, b, b_stride, w, h, &sse, &sum);
  return sse;
}

static void highbd_8_variance(const uint8_t *a8, int a_stride,
                              const uint8_t *b8, int b_stride, int w, int h,
                              uint32_t *sse, int *sum) {
  uint64_t sse_long = 0;
  int64_t sum_long = 0;
  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
  *sse = (uint32_t)sse_long;
  *sum = (int)sum_long;
}

static void highbd_10_variance(const uint8_t *a8, int a_stride,
                               const uint8_t *b8, int b_stride, int w, int h,
                               uint32_t *sse, int *sum) {
  uint64_t sse_long = 0;
  int64_t sum_long = 0;
  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);
  *sum = (int)ROUND_POWER_OF_TWO(sum_long, 2);
}

static void highbd_12_variance(const uint8_t *a8, int a_stride,
                               const uint8_t *b8, int b_stride, int w, int h,
                               uint32_t *sse, int *sum) {
  uint64_t sse_long = 0;
  int64_t sum_long = 0;
  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8);
  *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4);
}
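
// The shifts above normalize the 10-bit and 12-bit statistics back to an
// 8-bit scale: sse grows with the square of the pixel values, so it is
// rounded down by 2 * (bd - 8) bits, while sum is rounded down by (bd - 8)
// bits, keeping the derived variances comparable across bit depths.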

#define HIGHBD_VAR(W, H)                                                       \
  uint32_t aom_highbd_8_variance##W##x##H##_c(const uint8_t *a, int a_stride,  \
                                              const uint8_t *b, int b_stride,  \
                                              uint32_t *sse) {                 \
    int sum;                                                                   \
    highbd_8_variance(a, a_stride, b, b_stride, W, H, sse, &sum);              \
    return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H));                  \
  }                                                                            \
                                                                               \
  uint32_t aom_highbd_10_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
                                               const uint8_t *b, int b_stride, \
                                               uint32_t *sse) {                \
    int sum;                                                                   \
    int64_t var;                                                               \
    highbd_10_variance(a, a_stride, b, b_stride, W, H, sse, &sum);             \
    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));                  \
    return (var >= 0) ? (uint32_t)var : 0;                                     \
  }                                                                            \
                                                                               \
  uint32_t aom_highbd_12_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
                                               const uint8_t *b, int b_stride, \
                                               uint32_t *sse) {                \
    int sum;                                                                   \
    int64_t var;                                                               \
    highbd_12_variance(a, a_stride, b, b_stride, W, H, sse, &sum);             \
    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));                  \
    return (var >= 0) ? (uint32_t)var : 0;                                     \
  }

#define HIGHBD_MSE(W, H)                                                      \
  uint32_t aom_highbd_8_mse##W##x##H##_c(const uint8_t *src, int src_stride,  \
                                         const uint8_t *ref, int ref_stride,  \
                                         uint32_t *sse) {                     \
    int sum;                                                                  \
    highbd_8_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum);     \
    return *sse;                                                              \
  }                                                                           \
                                                                              \
  uint32_t aom_highbd_10_mse##W##x##H##_c(const uint8_t *src, int src_stride, \
                                          const uint8_t *ref, int ref_stride, \
                                          uint32_t *sse) {                    \
    int sum;                                                                  \
    highbd_10_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum);    \
    return *sse;                                                              \
  }                                                                           \
                                                                              \
  uint32_t aom_highbd_12_mse##W##x##H##_c(const uint8_t *src, int src_stride, \
                                          const uint8_t *ref, int ref_stride, \
                                          uint32_t *sse) {                    \
    int sum;                                                                  \
    highbd_12_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum);    \
    return *sse;                                                              \
  }

void aom_highbd_var_filter_block2d_bil_first_pass(
    const uint8_t *src_ptr8, uint16_t *output_ptr,
    unsigned int src_pixels_per_line, int pixel_step,
    unsigned int output_height, unsigned int output_width,
    const uint8_t *filter) {
  unsigned int i, j;
  uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src_ptr8);
  for (i = 0; i < output_height; ++i) {
    for (j = 0; j < output_width; ++j) {
      output_ptr[j] = ROUND_POWER_OF_TWO(
          (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1],
          FILTER_BITS);

      ++src_ptr;
    }

    // Next row...
    src_ptr += src_pixels_per_line - output_width;
    output_ptr += output_width;
  }
}

void aom_highbd_var_filter_block2d_bil_second_pass(
    const uint16_t *src_ptr, uint16_t *output_ptr,
    unsigned int src_pixels_per_line, unsigned int pixel_step,
    unsigned int output_height, unsigned int output_width,
    const uint8_t *filter) {
  unsigned int i, j;

  for (i = 0; i < output_height; ++i) {
    for (j = 0; j < output_width; ++j) {
      output_ptr[j] = ROUND_POWER_OF_TWO(
          (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1],
          FILTER_BITS);
      ++src_ptr;
    }

    src_ptr += src_pixels_per_line - output_width;
    output_ptr += output_width;
  }
}

#define HIGHBD_SUBPIX_VAR(W, H)                                              \
  uint32_t aom_highbd_8_sub_pixel_variance##W##x##H##_c(                     \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *dst, int dst_stride, uint32_t *sse) {                   \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W,  \
                                              dst, dst_stride, sse);         \
  }                                                                          \
                                                                             \
  uint32_t aom_highbd_10_sub_pixel_variance##W##x##H##_c(                    \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *dst, int dst_stride, uint32_t *sse) {                   \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \
                                               dst, dst_stride, sse);        \
  }                                                                          \
                                                                             \
  uint32_t aom_highbd_12_sub_pixel_variance##W##x##H##_c(                    \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *dst, int dst_stride, uint32_t *sse) {                   \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \
                                               dst, dst_stride, sse);        \
  }

#define HIGHBD_SUBPIX_AVG_VAR(W, H)                                          \
  uint32_t aom_highbd_8_sub_pixel_avg_variance##W##x##H##_c(                 \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *dst, int dst_stride, uint32_t *sse,                     \
      const uint8_t *second_pred) {                                         \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                             \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
                               CONVERT_TO_BYTEPTR(temp2), W);                \
                                                                             \
    return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,  \
                                              dst, dst_stride, sse);         \
  }                                                                          \
                                                                             \
  uint32_t aom_highbd_10_sub_pixel_avg_variance##W##x##H##_c(                \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *dst, int dst_stride, uint32_t *sse,                     \
      const uint8_t *second_pred) {                                         \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                             \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
                               CONVERT_TO_BYTEPTR(temp2), W);                \
                                                                             \
    return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
                                               dst, dst_stride, sse);        \
  }                                                                          \
                                                                             \
  uint32_t aom_highbd_12_sub_pixel_avg_variance##W##x##H##_c(                \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *dst, int dst_stride, uint32_t *sse,                     \
      const uint8_t *second_pred) {                                         \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                             \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
                               CONVERT_TO_BYTEPTR(temp2), W);                \
                                                                             \
    return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
                                               dst, dst_stride, sse);        \
  }

/* All three forms of the variance are available in the same sizes. */
#define HIGHBD_VARIANCES(W, H) \
  HIGHBD_VAR(W, H)             \
  HIGHBD_SUBPIX_VAR(W, H)      \
  HIGHBD_SUBPIX_AVG_VAR(W, H)

HIGHBD_VARIANCES(128, 128)
HIGHBD_VARIANCES(128, 64)
HIGHBD_VARIANCES(64, 128)
HIGHBD_VARIANCES(64, 64)
HIGHBD_VARIANCES(64, 32)
HIGHBD_VARIANCES(32, 64)
HIGHBD_VARIANCES(32, 32)
HIGHBD_VARIANCES(32, 16)
HIGHBD_VARIANCES(16, 32)
HIGHBD_VARIANCES(16, 16)
HIGHBD_VARIANCES(16, 8)
HIGHBD_VARIANCES(8, 16)
HIGHBD_VARIANCES(8, 8)
HIGHBD_VARIANCES(8, 4)
HIGHBD_VARIANCES(4, 8)
HIGHBD_VARIANCES(4, 4)

// Realtime mode doesn't use 4x rectangular blocks.
#if !CONFIG_REALTIME_ONLY
HIGHBD_VARIANCES(4, 16)
HIGHBD_VARIANCES(16, 4)
HIGHBD_VARIANCES(8, 32)
HIGHBD_VARIANCES(32, 8)
HIGHBD_VARIANCES(16, 64)
HIGHBD_VARIANCES(64, 16)
#endif

HIGHBD_MSE(16, 16)
HIGHBD_MSE(16, 8)
HIGHBD_MSE(8, 16)
HIGHBD_MSE(8, 8)

void aom_highbd_comp_avg_pred_c(uint8_t *comp_pred8, const uint8_t *pred8,
                                int width, int height, const uint8_t *ref8,
                                int ref_stride) {
  int i, j;
  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
  for (i = 0; i < height; ++i) {
    for (j = 0; j < width; ++j) {
      const int tmp = pred[j] + ref[j];
      comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
    }
    comp_pred += width;
    pred += width;
    ref += ref_stride;
  }
}
#endif  // CONFIG_AV1_HIGHBITDEPTH

void aom_comp_mask_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
                          int height, const uint8_t *ref, int ref_stride,
                          const uint8_t *mask, int mask_stride,
                          int invert_mask) {
  int i, j;
  const uint8_t *src0 = invert_mask ? pred : ref;
  const uint8_t *src1 = invert_mask ? ref : pred;
  const int stride0 = invert_mask ? width : ref_stride;
  const int stride1 = invert_mask ? ref_stride : width;
  for (i = 0; i < height; ++i) {
    for (j = 0; j < width; ++j) {
      comp_pred[j] = AOM_BLEND_A64(mask[j], src0[j], src1[j]);
    }
    comp_pred += width;
    src0 += stride0;
    src1 += stride1;
    mask += mask_stride;
  }
}
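
// AOM_BLEND_A64(m, a, b), as defined in aom_dsp/blend.h, computes
// (m * a + (64 - m) * b + 32) >> 6, a 6-bit alpha blend; mask values of 64
// and 0 therefore select src0 and src1 respectively.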

#define MASK_SUBPIX_VAR(W, H)                                                 \
  unsigned int aom_masked_sub_pixel_variance##W##x##H##_c(                    \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
      const uint8_t *ref, int ref_stride, const uint8_t *second_pred,         \
      const uint8_t *msk, int msk_stride, int invert_mask,                    \
      unsigned int *sse) {                                                    \
    uint16_t fdata3[(H + 1) * W];                                             \
    uint8_t temp2[H * W];                                                     \
    DECLARE_ALIGNED(16, uint8_t, temp3[H * W]);                               \
                                                                              \
    var_filter_block2d_bil_first_pass_c(src, fdata3, src_stride, 1, H + 1, W, \
                                        bilinear_filters_2t[xoffset]);        \
    var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W,           \
                                         bilinear_filters_2t[yoffset]);       \
                                                                              \
    aom_comp_mask_pred_c(temp3, second_pred, W, H, temp2, W, msk, msk_stride, \
                         invert_mask);                                        \
    return aom_variance##W##x##H##_c(temp3, W, ref, ref_stride, sse);         \
  }

MASK_SUBPIX_VAR(4, 4)
MASK_SUBPIX_VAR(4, 8)
MASK_SUBPIX_VAR(8, 4)
MASK_SUBPIX_VAR(8, 8)
MASK_SUBPIX_VAR(8, 16)
MASK_SUBPIX_VAR(16, 8)
MASK_SUBPIX_VAR(16, 16)
MASK_SUBPIX_VAR(16, 32)
MASK_SUBPIX_VAR(32, 16)
MASK_SUBPIX_VAR(32, 32)
MASK_SUBPIX_VAR(32, 64)
MASK_SUBPIX_VAR(64, 32)
MASK_SUBPIX_VAR(64, 64)
MASK_SUBPIX_VAR(64, 128)
MASK_SUBPIX_VAR(128, 64)
MASK_SUBPIX_VAR(128, 128)

// Realtime mode doesn't use 4x rectangular blocks.
#if !CONFIG_REALTIME_ONLY
MASK_SUBPIX_VAR(4, 16)
MASK_SUBPIX_VAR(16, 4)
MASK_SUBPIX_VAR(8, 32)
MASK_SUBPIX_VAR(32, 8)
MASK_SUBPIX_VAR(16, 64)
MASK_SUBPIX_VAR(64, 16)
#endif

#if CONFIG_AV1_HIGHBITDEPTH
void aom_highbd_comp_mask_pred_c(uint8_t *comp_pred8, const uint8_t *pred8,
                                 int width, int height, const uint8_t *ref8,
                                 int ref_stride, const uint8_t *mask,
                                 int mask_stride, int invert_mask) {
  int i, j;
  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
  for (i = 0; i < height; ++i) {
    for (j = 0; j < width; ++j) {
      if (!invert_mask)
        comp_pred[j] = AOM_BLEND_A64(mask[j], ref[j], pred[j]);
      else
        comp_pred[j] = AOM_BLEND_A64(mask[j], pred[j], ref[j]);
    }
    comp_pred += width;
    pred += width;
    ref += ref_stride;
    mask += mask_stride;
  }
}

#define HIGHBD_MASK_SUBPIX_VAR(W, H)                                          \
  unsigned int aom_highbd_8_masked_sub_pixel_variance##W##x##H##_c(           \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
      const uint8_t *ref, int ref_stride, const uint8_t *second_pred,         \
      const uint8_t *msk, int msk_stride, int invert_mask,                    \
      unsigned int *sse) {                                                    \
    uint16_t fdata3[(H + 1) * W];                                             \
    uint16_t temp2[H * W];                                                    \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                              \
                                                                              \
    aom_highbd_var_filter_block2d_bil_first_pass(                             \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
    aom_highbd_var_filter_block2d_bil_second_pass(                            \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
                                                                              \
    aom_highbd_comp_mask_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
                                CONVERT_TO_BYTEPTR(temp2), W, msk,            \
                                msk_stride, invert_mask);                     \
                                                                              \
    return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,   \
                                              ref, ref_stride, sse);          \
  }                                                                           \
                                                                              \
  unsigned int aom_highbd_10_masked_sub_pixel_variance##W##x##H##_c(          \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
      const uint8_t *ref, int ref_stride, const uint8_t *second_pred,         \
      const uint8_t *msk, int msk_stride, int invert_mask,                    \
      unsigned int *sse) {                                                    \
    uint16_t fdata3[(H + 1) * W];                                             \
    uint16_t temp2[H * W];                                                    \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                              \
                                                                              \
    aom_highbd_var_filter_block2d_bil_first_pass(                             \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
    aom_highbd_var_filter_block2d_bil_second_pass(                            \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
                                                                              \
    aom_highbd_comp_mask_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
                                CONVERT_TO_BYTEPTR(temp2), W, msk,            \
                                msk_stride, invert_mask);                     \
                                                                              \
    return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,  \
                                               ref, ref_stride, sse);         \
  }                                                                           \
                                                                              \
  unsigned int aom_highbd_12_masked_sub_pixel_variance##W##x##H##_c(          \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
      const uint8_t *ref, int ref_stride, const uint8_t *second_pred,         \
      const uint8_t *msk, int msk_stride, int invert_mask,                    \
      unsigned int *sse) {                                                    \
    uint16_t fdata3[(H + 1) * W];                                             \
    uint16_t temp2[H * W];                                                    \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                              \
                                                                              \
    aom_highbd_var_filter_block2d_bil_first_pass(                             \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
    aom_highbd_var_filter_block2d_bil_second_pass(                            \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
                                                                              \
    aom_highbd_comp_mask_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
                                CONVERT_TO_BYTEPTR(temp2), W, msk,            \
                                msk_stride, invert_mask);                     \
                                                                              \
    return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,  \
                                               ref, ref_stride, sse);         \
  }

HIGHBD_MASK_SUBPIX_VAR(4, 4)
HIGHBD_MASK_SUBPIX_VAR(4, 8)
HIGHBD_MASK_SUBPIX_VAR(8, 4)
HIGHBD_MASK_SUBPIX_VAR(8, 8)
HIGHBD_MASK_SUBPIX_VAR(8, 16)
HIGHBD_MASK_SUBPIX_VAR(16, 8)
HIGHBD_MASK_SUBPIX_VAR(16, 16)
HIGHBD_MASK_SUBPIX_VAR(16, 32)
HIGHBD_MASK_SUBPIX_VAR(32, 16)
HIGHBD_MASK_SUBPIX_VAR(32, 32)
HIGHBD_MASK_SUBPIX_VAR(32, 64)
HIGHBD_MASK_SUBPIX_VAR(64, 32)
HIGHBD_MASK_SUBPIX_VAR(64, 64)
HIGHBD_MASK_SUBPIX_VAR(64, 128)
HIGHBD_MASK_SUBPIX_VAR(128, 64)
HIGHBD_MASK_SUBPIX_VAR(128, 128)
#if !CONFIG_REALTIME_ONLY
HIGHBD_MASK_SUBPIX_VAR(4, 16)
HIGHBD_MASK_SUBPIX_VAR(16, 4)
HIGHBD_MASK_SUBPIX_VAR(8, 32)
HIGHBD_MASK_SUBPIX_VAR(32, 8)
HIGHBD_MASK_SUBPIX_VAR(16, 64)
HIGHBD_MASK_SUBPIX_VAR(64, 16)
#endif
#endif  // CONFIG_AV1_HIGHBITDEPTH

#if !CONFIG_REALTIME_ONLY
static inline void obmc_variance(const uint8_t *pre, int pre_stride,
                                 const int32_t *wsrc, const int32_t *mask,
                                 int w, int h, unsigned int *sse, int *sum) {
  int i, j;
  unsigned int tsse = 0;
  int tsum = 0;

  for (i = 0; i < h; i++) {
    for (j = 0; j < w; j++) {
      int diff = ROUND_POWER_OF_TWO_SIGNED(wsrc[j] - pre[j] * mask[j], 12);
      tsum += diff;
      tsse += diff * diff;
    }

    pre += pre_stride;
    wsrc += w;
    mask += w;
  }
  *sse = tsse;
  *sum = tsum;
}
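
// Here wsrc holds the weighted source and mask the matching OBMC weights;
// both are scaled by 64 * 64 = 4096 (the product of two 6-bit blending
// weights), which is why each difference is rounded back down by 12 bits
// before accumulation.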

#define OBMC_VAR(W, H)                                            \
  unsigned int aom_obmc_variance##W##x##H##_c(                    \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,    \
      const int32_t *mask, unsigned int *sse) {                   \
    int sum;                                                      \
    obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum);  \
    return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \
  }

#define OBMC_SUBPIX_VAR(W, H)                                                 \
  unsigned int aom_obmc_sub_pixel_variance##W##x##H##_c(                      \
      const uint8_t *pre, int pre_stride, int xoffset, int yoffset,           \
      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {          \
    uint16_t fdata3[(H + 1) * W];                                             \
    uint8_t temp2[H * W];                                                     \
                                                                              \
    var_filter_block2d_bil_first_pass_c(pre, fdata3, pre_stride, 1, H + 1, W, \
                                        bilinear_filters_2t[xoffset]);        \
    var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W,           \
                                         bilinear_filters_2t[yoffset]);       \
                                                                              \
    return aom_obmc_variance##W##x##H##_c(temp2, W, wsrc, mask, sse);         \
  }

OBMC_VAR(4, 4)
OBMC_SUBPIX_VAR(4, 4)

OBMC_VAR(4, 8)
OBMC_SUBPIX_VAR(4, 8)

OBMC_VAR(8, 4)
OBMC_SUBPIX_VAR(8, 4)

OBMC_VAR(8, 8)
OBMC_SUBPIX_VAR(8, 8)

OBMC_VAR(8, 16)
OBMC_SUBPIX_VAR(8, 16)

OBMC_VAR(16, 8)
OBMC_SUBPIX_VAR(16, 8)

OBMC_VAR(16, 16)
OBMC_SUBPIX_VAR(16, 16)

OBMC_VAR(16, 32)
OBMC_SUBPIX_VAR(16, 32)

OBMC_VAR(32, 16)
OBMC_SUBPIX_VAR(32, 16)

OBMC_VAR(32, 32)
OBMC_SUBPIX_VAR(32, 32)

OBMC_VAR(32, 64)
OBMC_SUBPIX_VAR(32, 64)

OBMC_VAR(64, 32)
OBMC_SUBPIX_VAR(64, 32)

OBMC_VAR(64, 64)
OBMC_SUBPIX_VAR(64, 64)

OBMC_VAR(64, 128)
OBMC_SUBPIX_VAR(64, 128)

OBMC_VAR(128, 64)
OBMC_SUBPIX_VAR(128, 64)

OBMC_VAR(128, 128)
OBMC_SUBPIX_VAR(128, 128)

OBMC_VAR(4, 16)
OBMC_SUBPIX_VAR(4, 16)
OBMC_VAR(16, 4)
OBMC_SUBPIX_VAR(16, 4)
OBMC_VAR(8, 32)
OBMC_SUBPIX_VAR(8, 32)
OBMC_VAR(32, 8)
OBMC_SUBPIX_VAR(32, 8)
OBMC_VAR(16, 64)
OBMC_SUBPIX_VAR(16, 64)
OBMC_VAR(64, 16)
OBMC_SUBPIX_VAR(64, 16)

#if CONFIG_AV1_HIGHBITDEPTH
static inline void highbd_obmc_variance64(const uint8_t *pre8, int pre_stride,
                                          const int32_t *wsrc,
                                          const int32_t *mask, int w, int h,
                                          uint64_t *sse, int64_t *sum) {
  int i, j;
  uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
  uint64_t tsse = 0;
  int64_t tsum = 0;

  for (i = 0; i < h; i++) {
    for (j = 0; j < w; j++) {
      int diff = ROUND_POWER_OF_TWO_SIGNED(wsrc[j] - pre[j] * mask[j], 12);
      tsum += diff;
      tsse += diff * diff;
    }

    pre += pre_stride;
    wsrc += w;
    mask += w;
  }
  *sse = tsse;
  *sum = tsum;
}

static inline void highbd_obmc_variance(const uint8_t *pre8, int pre_stride,
                                        const int32_t *wsrc,
                                        const int32_t *mask, int w, int h,
                                        unsigned int *sse, int *sum) {
  int64_t sum64;
  uint64_t sse64;
  highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &sse64, &sum64);
  *sum = (int)sum64;
  *sse = (unsigned int)sse64;
}

static inline void highbd_10_obmc_variance(const uint8_t *pre8, int pre_stride,
                                           const int32_t *wsrc,
                                           const int32_t *mask, int w, int h,
                                           unsigned int *sse, int *sum) {
  int64_t sum64;
  uint64_t sse64;
  highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &sse64, &sum64);
  *sum = (int)ROUND_POWER_OF_TWO(sum64, 2);
  *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 4);
}

static inline void highbd_12_obmc_variance(const uint8_t *pre8, int pre_stride,
                                           const int32_t *wsrc,
                                           const int32_t *mask, int w, int h,
                                           unsigned int *sse, int *sum) {
  int64_t sum64;
  uint64_t sse64;
  highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &sse64, &sum64);
  *sum = (int)ROUND_POWER_OF_TWO(sum64, 4);
  *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 8);
}

#define HIGHBD_OBMC_VAR(W, H)                                              \
  unsigned int aom_highbd_8_obmc_variance##W##x##H##_c(                    \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,             \
      const int32_t *mask, unsigned int *sse) {                            \
    int sum;                                                               \
    highbd_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum);    \
    return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H));          \
  }                                                                        \
                                                                           \
  unsigned int aom_highbd_10_obmc_variance##W##x##H##_c(                   \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,             \
      const int32_t *mask, unsigned int *sse) {                            \
    int sum;                                                               \
    int64_t var;                                                           \
    highbd_10_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));              \
    return (var >= 0) ? (uint32_t)var : 0;                                 \
  }                                                                        \
                                                                           \
  unsigned int aom_highbd_12_obmc_variance##W##x##H##_c(                   \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,             \
      const int32_t *mask, unsigned int *sse) {                            \
    int sum;                                                               \
    int64_t var;                                                           \
    highbd_12_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));              \
    return (var >= 0) ? (uint32_t)var : 0;                                 \
  }

#define HIGHBD_OBMC_SUBPIX_VAR(W, H)                                           \
  unsigned int aom_highbd_8_obmc_sub_pixel_variance##W##x##H##_c(              \
      const uint8_t *pre, int pre_stride, int xoffset, int yoffset,            \
      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {           \
    uint16_t fdata3[(H + 1) * W];                                              \
    uint16_t temp2[H * W];                                                     \
                                                                               \
    aom_highbd_var_filter_block2d_bil_first_pass(                              \
        pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);   \
    aom_highbd_var_filter_block2d_bil_second_pass(                             \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);              \
                                                                               \
    return aom_highbd_8_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2),  \
                                                   W, wsrc, mask, sse);        \
  }                                                                            \
                                                                               \
  unsigned int aom_highbd_10_obmc_sub_pixel_variance##W##x##H##_c(             \
      const uint8_t *pre, int pre_stride, int xoffset, int yoffset,            \
      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {           \
    uint16_t fdata3[(H + 1) * W];                                              \
    uint16_t temp2[H * W];                                                     \
                                                                               \
    aom_highbd_var_filter_block2d_bil_first_pass(                              \
        pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);   \
    aom_highbd_var_filter_block2d_bil_second_pass(                             \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);              \
                                                                               \
    return aom_highbd_10_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
                                                    W, wsrc, mask, sse);       \
  }                                                                            \
                                                                               \
  unsigned int aom_highbd_12_obmc_sub_pixel_variance##W##x##H##_c(             \
      const uint8_t *pre, int pre_stride, int xoffset, int yoffset,            \
      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {           \
    uint16_t fdata3[(H + 1) * W];                                              \
    uint16_t temp2[H * W];                                                     \
                                                                               \
    aom_highbd_var_filter_block2d_bil_first_pass(                              \
        pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);   \
    aom_highbd_var_filter_block2d_bil_second_pass(                             \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);              \
                                                                               \
    return aom_highbd_12_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
                                                    W, wsrc, mask, sse);       \
  }

HIGHBD_OBMC_VAR(4, 4)
HIGHBD_OBMC_SUBPIX_VAR(4, 4)

HIGHBD_OBMC_VAR(4, 8)
HIGHBD_OBMC_SUBPIX_VAR(4, 8)

HIGHBD_OBMC_VAR(8, 4)
HIGHBD_OBMC_SUBPIX_VAR(8, 4)

HIGHBD_OBMC_VAR(8, 8)
HIGHBD_OBMC_SUBPIX_VAR(8, 8)

HIGHBD_OBMC_VAR(8, 16)
HIGHBD_OBMC_SUBPIX_VAR(8, 16)

HIGHBD_OBMC_VAR(16, 8)
HIGHBD_OBMC_SUBPIX_VAR(16, 8)

HIGHBD_OBMC_VAR(16, 16)
HIGHBD_OBMC_SUBPIX_VAR(16, 16)

HIGHBD_OBMC_VAR(16, 32)
HIGHBD_OBMC_SUBPIX_VAR(16, 32)

HIGHBD_OBMC_VAR(32, 16)
HIGHBD_OBMC_SUBPIX_VAR(32, 16)

HIGHBD_OBMC_VAR(32, 32)
HIGHBD_OBMC_SUBPIX_VAR(32, 32)

HIGHBD_OBMC_VAR(32, 64)
HIGHBD_OBMC_SUBPIX_VAR(32, 64)

HIGHBD_OBMC_VAR(64, 32)
HIGHBD_OBMC_SUBPIX_VAR(64, 32)

HIGHBD_OBMC_VAR(64, 64)
HIGHBD_OBMC_SUBPIX_VAR(64, 64)

HIGHBD_OBMC_VAR(64, 128)
HIGHBD_OBMC_SUBPIX_VAR(64, 128)

HIGHBD_OBMC_VAR(128, 64)
HIGHBD_OBMC_SUBPIX_VAR(128, 64)

HIGHBD_OBMC_VAR(128, 128)
HIGHBD_OBMC_SUBPIX_VAR(128, 128)

HIGHBD_OBMC_VAR(4, 16)
HIGHBD_OBMC_SUBPIX_VAR(4, 16)
HIGHBD_OBMC_VAR(16, 4)
HIGHBD_OBMC_SUBPIX_VAR(16, 4)
HIGHBD_OBMC_VAR(8, 32)
HIGHBD_OBMC_SUBPIX_VAR(8, 32)
HIGHBD_OBMC_VAR(32, 8)
HIGHBD_OBMC_SUBPIX_VAR(32, 8)
HIGHBD_OBMC_VAR(16, 64)
HIGHBD_OBMC_SUBPIX_VAR(16, 64)
HIGHBD_OBMC_VAR(64, 16)
HIGHBD_OBMC_SUBPIX_VAR(64, 16)
#endif  // CONFIG_AV1_HIGHBITDEPTH
#endif  // !CONFIG_REALTIME_ONLY

uint64_t aom_mse_wxh_16bit_c(uint8_t *dst, int dstride, uint16_t *src,
                             int sstride, int w, int h) {
  uint64_t sum = 0;
  for (int i = 0; i < h; i++) {
    for (int j = 0; j < w; j++) {
      int e = (uint16_t)dst[i * dstride + j] - src[i * sstride + j];
      sum += e * e;
    }
  }
  return sum;
}
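
// Note: despite the name, this returns the un-normalized sum of squared
// errors between the 8-bit dst samples and the src samples held in a 16-bit
// buffer; no division by w * h is performed here.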

uint64_t aom_mse_16xh_16bit_c(uint8_t *dst, int dstride, uint16_t *src, int w,
                              int h) {
  uint16_t *src_temp = src;
  uint8_t *dst_temp = dst;
  const int num_blks = 16 / w;
  int64_t sum = 0;
  for (int i = 0; i < num_blks; i++) {
    sum += aom_mse_wxh_16bit_c(dst_temp, dstride, src_temp, w, w, h);
    dst_temp += w;
    src_temp += (w * h);
  }
  return sum;
}
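
// The 16-sample-wide strip is packed in src as 16 / w consecutive w x h
// blocks, so each iteration advances dst by one block width and src by a
// whole block (w * h samples).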

#if CONFIG_AV1_HIGHBITDEPTH
uint64_t aom_mse_wxh_16bit_highbd_c(uint16_t *dst, int dstride, uint16_t *src,
                                    int sstride, int w, int h) {
  uint64_t sum = 0;
  for (int i = 0; i < h; i++) {
    for (int j = 0; j < w; j++) {
      int e = dst[i * dstride + j] - src[i * sstride + j];
      sum += e * e;
    }
  }
  return sum;
}
#endif  // CONFIG_AV1_HIGHBITDEPTH