1 /*
2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11 #include <assert.h>
12 #include <stdlib.h>
13 #include <string.h>
14
15 #include "config/aom_config.h"
16 #include "config/aom_dsp_rtcd.h"
17
18 #include "aom/aom_integer.h"
19 #include "aom_ports/mem.h"
20
21 #include "aom_dsp/aom_filter.h"
22 #include "aom_dsp/blend.h"
23 #include "aom_dsp/variance.h"
24
25 #include "av1/common/filter.h"
26 #include "av1/common/reconinter.h"
27
// Returns the sum of squared differences between two 4x4 blocks of 8-bit
// pixels. The strides give the byte distance between successive rows.
uint32_t aom_get4x4sse_cs_c(const uint8_t *a, int a_stride, const uint8_t *b,
                            int b_stride) {
  int sse = 0;

  for (int row = 0; row < 4; ++row) {
    for (int col = 0; col < 4; ++col) {
      const int d = a[col] - b[col];
      sse += d * d;
    }
    a += a_stride;
    b += b_stride;
  }

  return sse;
}
45
// Returns the sum of squares of a 256-entry (16x16 macroblock) residual
// array of 16-bit values.
uint32_t aom_get_mb_ss_c(const int16_t *a) {
  unsigned int total = 0;

  for (unsigned int idx = 0; idx < 256; ++idx) total += a[idx] * a[idx];

  return total;
}
55
// Accumulates, over a w x h block of 8-bit pixels, the sum of differences
// (*sum) and the sum of squared differences (*sse) between a and b.
static void variance(const uint8_t *a, int a_stride, const uint8_t *b,
                     int b_stride, int w, int h, uint32_t *sse, int *sum) {
  uint32_t sq_acc = 0;
  int sum_acc = 0;

  for (int row = 0; row < h; ++row) {
    for (int col = 0; col < w; ++col) {
      const int d = a[col] - b[col];
      sum_acc += d;
      sq_acc += d * d;
    }
    a += a_stride;
    b += b_stride;
  }

  *sum = sum_acc;
  *sse = sq_acc;
}
74
// Returns only the SSE between two w x h blocks; the sum that variance()
// also produces is computed and discarded. Used for block sizes with no
// dedicated kernel.
uint32_t aom_sse_odd_size(const uint8_t *a, int a_stride, const uint8_t *b,
                          int b_stride, int w, int h) {
  uint32_t sse = 0;
  int unused_sum = 0;
  variance(a, a_stride, b, b_stride, w, h, &sse, &unused_sum);
  return sse;
}
82
83 // Applies a 1-D 2-tap bilinear filter to the source block in either horizontal
84 // or vertical direction to produce the filtered output block. Used to implement
85 // the first-pass of 2-D separable filter.
86 //
87 // Produces int16_t output to retain precision for the next pass. Two filter
88 // taps should sum to FILTER_WEIGHT. pixel_step defines whether the filter is
89 // applied horizontally (pixel_step = 1) or vertically (pixel_step = stride).
90 // It defines the offset required to move from one input to the next.
aom_var_filter_block2d_bil_first_pass_c(const uint8_t * a,uint16_t * b,unsigned int src_pixels_per_line,unsigned int pixel_step,unsigned int output_height,unsigned int output_width,const uint8_t * filter)91 void aom_var_filter_block2d_bil_first_pass_c(const uint8_t *a, uint16_t *b,
92 unsigned int src_pixels_per_line,
93 unsigned int pixel_step,
94 unsigned int output_height,
95 unsigned int output_width,
96 const uint8_t *filter) {
97 unsigned int i, j;
98
99 for (i = 0; i < output_height; ++i) {
100 for (j = 0; j < output_width; ++j) {
101 b[j] = ROUND_POWER_OF_TWO(
102 (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS);
103
104 ++a;
105 }
106
107 a += src_pixels_per_line - output_width;
108 b += output_width;
109 }
110 }
111
112 // Applies a 1-D 2-tap bilinear filter to the source block in either horizontal
113 // or vertical direction to produce the filtered output block. Used to implement
114 // the second-pass of 2-D separable filter.
115 //
116 // Requires 16-bit input as produced by filter_block2d_bil_first_pass. Two
117 // filter taps should sum to FILTER_WEIGHT. pixel_step defines whether the
118 // filter is applied horizontally (pixel_step = 1) or vertically
119 // (pixel_step = stride). It defines the offset required to move from one input
120 // to the next. Output is 8-bit.
aom_var_filter_block2d_bil_second_pass_c(const uint16_t * a,uint8_t * b,unsigned int src_pixels_per_line,unsigned int pixel_step,unsigned int output_height,unsigned int output_width,const uint8_t * filter)121 void aom_var_filter_block2d_bil_second_pass_c(const uint16_t *a, uint8_t *b,
122 unsigned int src_pixels_per_line,
123 unsigned int pixel_step,
124 unsigned int output_height,
125 unsigned int output_width,
126 const uint8_t *filter) {
127 unsigned int i, j;
128
129 for (i = 0; i < output_height; ++i) {
130 for (j = 0; j < output_width; ++j) {
131 b[j] = ROUND_POWER_OF_TWO(
132 (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS);
133 ++a;
134 }
135
136 a += src_pixels_per_line - output_width;
137 b += output_width;
138 }
139 }
140
// Generates aom_variance{W}x{H}_c(): reports the block SSE through *sse
// and returns the variance, SSE - sum^2 / (W*H). The sum is widened to
// int64_t before squaring to avoid overflow for large blocks.
#define VAR(W, H)                                                    \
  uint32_t aom_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
                                     const uint8_t *b, int b_stride, \
                                     uint32_t *sse) {                \
    int sum;                                                         \
    variance(a, a_stride, b, b_stride, W, H, sse, &sum);             \
    return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H));        \
  }
149
// Generates aom_sub_pixel_variance{W}x{H}_c(): filters the source with a
// 2-tap bilinear kernel at the given (xoffset, yoffset) sub-pel position,
// then returns the variance of the filtered block against b.
#define SUBPIX_VAR(W, H)                                                      \
  uint32_t aom_sub_pixel_variance##W##x##H##_c(                               \
      const uint8_t *a, int a_stride, int xoffset, int yoffset,               \
      const uint8_t *b, int b_stride, uint32_t *sse) {                        \
    uint16_t fdata3[(H + 1) * W];                                             \
    uint8_t temp2[H * W];                                                     \
                                                                              \
    aom_var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \
                                            bilinear_filters_2t[xoffset]);    \
    aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W,       \
                                             bilinear_filters_2t[yoffset]);   \
                                                                              \
    return aom_variance##W##x##H##_c(temp2, W, b, b_stride, sse);             \
  }
164
// Generates the sub-pel averaging variance functions for a WxH block:
// - aom_sub_pixel_avg_variance{W}x{H}_c(): bilinear-filters the source,
//   averages it with second_pred, then computes the variance against b.
// - aom_dist_wtd_sub_pixel_avg_variance{W}x{H}_c(): same, but blends with
//   second_pred using the distance weights in jcp_param.
#define SUBPIX_AVG_VAR(W, H)                                                  \
  uint32_t aom_sub_pixel_avg_variance##W##x##H##_c(                           \
      const uint8_t *a, int a_stride, int xoffset, int yoffset,               \
      const uint8_t *b, int b_stride, uint32_t *sse,                          \
      const uint8_t *second_pred) {                                           \
    uint16_t fdata3[(H + 1) * W];                                             \
    uint8_t temp2[H * W];                                                     \
    DECLARE_ALIGNED(16, uint8_t, temp3[H * W]);                               \
                                                                              \
    aom_var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \
                                            bilinear_filters_2t[xoffset]);    \
    aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W,       \
                                             bilinear_filters_2t[yoffset]);   \
                                                                              \
    aom_comp_avg_pred(temp3, second_pred, W, H, temp2, W);                    \
                                                                              \
    return aom_variance##W##x##H##_c(temp3, W, b, b_stride, sse);             \
  }                                                                           \
  uint32_t aom_dist_wtd_sub_pixel_avg_variance##W##x##H##_c(                  \
      const uint8_t *a, int a_stride, int xoffset, int yoffset,               \
      const uint8_t *b, int b_stride, uint32_t *sse,                          \
      const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) {    \
    uint16_t fdata3[(H + 1) * W];                                             \
    uint8_t temp2[H * W];                                                     \
    DECLARE_ALIGNED(16, uint8_t, temp3[H * W]);                               \
                                                                              \
    aom_var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \
                                            bilinear_filters_2t[xoffset]);    \
    aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W,       \
                                             bilinear_filters_2t[yoffset]);   \
                                                                              \
    aom_dist_wtd_comp_avg_pred(temp3, second_pred, W, H, temp2, W, jcp_param); \
                                                                              \
    return aom_variance##W##x##H(temp3, W, b, b_stride, sse);                 \
  }
200
/* Identical to the variance call except it takes an additional parameter,
 * sum, and returns that value using pass-by-reference instead of returning
 * sse - sum^2 / (w*h).
 */
#define GET_VAR(W, H)                                                \
  void aom_get##W##x##H##var_c(const uint8_t *a, int a_stride,       \
                               const uint8_t *b, int b_stride,       \
                               uint32_t *sse, int *sum) {            \
    variance(a, a_stride, b, b_stride, W, H, sse, sum);              \
  }
211
// Computes SSE, sum, and variance for four adjacent 8x8 blocks (one 8x32
// region) and accumulates the block totals into *tot_sse and *tot_sum.
void aom_get_var_sse_sum_8x8_quad_c(const uint8_t *a, int a_stride,
                                    const uint8_t *b, int b_stride,
                                    uint32_t *sse8x8, int *sum8x8,
                                    unsigned int *tot_sse, int *tot_sum,
                                    uint32_t *var8x8) {
  // Per-block SSE and sum over the four 8x8 sub-blocks.
  for (int blk = 0; blk < 4; ++blk)
    variance(a + 8 * blk, a_stride, b + 8 * blk, b_stride, 8, 8, &sse8x8[blk],
             &sum8x8[blk]);

  // Totals over the whole 8x32 region.
  *tot_sse += sse8x8[0] + sse8x8[1] + sse8x8[2] + sse8x8[3];
  *tot_sum += sum8x8[0] + sum8x8[1] + sum8x8[2] + sum8x8[3];

  // Variance = SSE - sum^2 / 64 (an 8x8 block has 64 pixels).
  for (int blk = 0; blk < 4; ++blk)
    var8x8[blk] =
        sse8x8[blk] - (uint32_t)(((int64_t)sum8x8[blk] * sum8x8[blk]) >> 6);
}
229
// Computes SSE, sum, and variance for two adjacent 16x16 blocks (one 16x32
// region) and accumulates the block totals into *tot_sse and *tot_sum.
void aom_get_var_sse_sum_16x16_dual_c(const uint8_t *src_ptr, int source_stride,
                                      const uint8_t *ref_ptr, int ref_stride,
                                      uint32_t *sse16x16, unsigned int *tot_sse,
                                      int *tot_sum, uint32_t *var16x16) {
  // Only two 16x16 sums are produced; the previous [64] array wasted
  // 248 bytes of stack for no benefit.
  int sum16x16[2] = { 0 };
  // Loop over two consecutive 16x16 blocks and process as one 16x32 block.
  for (int k = 0; k < 2; k++) {
    variance(src_ptr + (k * 16), source_stride, ref_ptr + (k * 16), ref_stride,
             16, 16, &sse16x16[k], &sum16x16[k]);
  }

  // Calculate variance at 16x16 level and total sse, sum of 16x32 block.
  // Variance = SSE - sum^2 / 256 (a 16x16 block has 256 pixels).
  *tot_sse += sse16x16[0] + sse16x16[1];
  *tot_sum += sum16x16[0] + sum16x16[1];
  for (int i = 0; i < 2; i++)
    var16x16[i] =
        sse16x16[i] - (uint32_t)(((int64_t)sum16x16[i] * sum16x16[i]) >> 8);
}
248
/* Identical to the variance call except it does not calculate the
 * sse - sum^2 / (w*h) and returns sse in addition to modifying the passed-in
 * variable.
 */
#define MSE(W, H)                                               \
  uint32_t aom_mse##W##x##H##_c(const uint8_t *a, int a_stride, \
                                const uint8_t *b, int b_stride, \
                                uint32_t *sse) {                \
    int sum;                                                    \
    variance(a, a_stride, b, b_stride, W, H, sse, &sum);        \
    return *sse;                                                \
  }
261
/* All three forms of the variance are available in the same sizes. */
#define VARIANCES(W, H) \
  VAR(W, H)             \
  SUBPIX_VAR(W, H)      \
  SUBPIX_AVG_VAR(W, H)

// Instantiate the variance kernels for every supported block size.
VARIANCES(128, 128)
VARIANCES(128, 64)
VARIANCES(64, 128)
VARIANCES(64, 64)
VARIANCES(64, 32)
VARIANCES(32, 64)
VARIANCES(32, 32)
VARIANCES(32, 16)
VARIANCES(16, 32)
VARIANCES(16, 16)
VARIANCES(16, 8)
VARIANCES(8, 16)
VARIANCES(8, 8)
VARIANCES(8, 4)
VARIANCES(4, 8)
VARIANCES(4, 4)
VARIANCES(4, 2)
VARIANCES(2, 4)
VARIANCES(2, 2)

// Realtime mode doesn't use rectangular blocks.
#if !CONFIG_REALTIME_ONLY
VARIANCES(4, 16)
VARIANCES(16, 4)
VARIANCES(8, 32)
VARIANCES(32, 8)
VARIANCES(16, 64)
VARIANCES(64, 16)
#endif

GET_VAR(16, 16)
GET_VAR(8, 8)

MSE(16, 16)
MSE(16, 8)
MSE(8, 16)
MSE(8, 8)
305
// Averages pred (stride = width) with ref (stride = ref_stride) into
// comp_pred (stride = width), rounding each pixel to nearest.
void aom_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
                         int height, const uint8_t *ref, int ref_stride) {
  for (int row = 0; row < height; ++row) {
    for (int col = 0; col < width; ++col) {
      comp_pred[col] = ROUND_POWER_OF_TWO(pred[col] + ref[col], 1);
    }
    comp_pred += width;
    pred += width;
    ref += ref_stride;
  }
}
320
aom_dist_wtd_comp_avg_pred_c(uint8_t * comp_pred,const uint8_t * pred,int width,int height,const uint8_t * ref,int ref_stride,const DIST_WTD_COMP_PARAMS * jcp_param)321 void aom_dist_wtd_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred,
322 int width, int height, const uint8_t *ref,
323 int ref_stride,
324 const DIST_WTD_COMP_PARAMS *jcp_param) {
325 int i, j;
326 const int fwd_offset = jcp_param->fwd_offset;
327 const int bck_offset = jcp_param->bck_offset;
328
329 for (i = 0; i < height; ++i) {
330 for (j = 0; j < width; ++j) {
331 int tmp = pred[j] * bck_offset + ref[j] * fwd_offset;
332 tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS);
333 comp_pred[j] = (uint8_t)tmp;
334 }
335 comp_pred += width;
336 pred += width;
337 ref += ref_stride;
338 }
339 }
340
341 #if CONFIG_AV1_HIGHBITDEPTH
// Accumulates 64-bit sum and SSE for a w x h high-bitdepth block. a8/b8
// are CONVERT_TO_BYTEPTR-style handles to 16-bit samples. Each row's sum
// is gathered in 32 bits first, then folded into the 64-bit total.
static void highbd_variance64(const uint8_t *a8, int a_stride,
                              const uint8_t *b8, int b_stride, int w, int h,
                              uint64_t *sse, int64_t *sum) {
  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
  const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
  int64_t total_sum = 0;
  uint64_t total_sse = 0;
  for (int row = 0; row < h; ++row) {
    int32_t row_sum = 0;
    for (int col = 0; col < w; ++col) {
      const int diff = a[col] - b[col];
      row_sum += diff;
      total_sse += (uint32_t)(diff * diff);
    }
    total_sum += row_sum;
    a += a_stride;
    b += b_stride;
  }
  *sum = total_sum;
  *sse = total_sse;
}
363
// High-bitdepth counterpart of aom_sse_odd_size(): returns only the SSE of
// two w x h blocks, discarding the sum.
uint64_t aom_highbd_sse_odd_size(const uint8_t *a, int a_stride,
                                 const uint8_t *b, int b_stride, int w, int h) {
  uint64_t sse = 0;
  int64_t unused_sum = 0;
  highbd_variance64(a, a_stride, b, b_stride, w, h, &sse, &unused_sum);
  return sse;
}
371
// 8-bit-depth wrapper: narrows the 64-bit accumulators from
// highbd_variance64() with no scaling.
static void highbd_8_variance(const uint8_t *a8, int a_stride,
                              const uint8_t *b8, int b_stride, int w, int h,
                              uint32_t *sse, int *sum) {
  uint64_t sse64 = 0;
  int64_t sum64 = 0;
  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse64, &sum64);
  *sse = (uint32_t)sse64;
  *sum = (int)sum64;
}
381
// 10-bit-depth wrapper: rescales the 64-bit accumulators to the 8-bit
// scale (SSE by 2^4, sum by 2^2) with rounding before narrowing.
static void highbd_10_variance(const uint8_t *a8, int a_stride,
                               const uint8_t *b8, int b_stride, int w, int h,
                               uint32_t *sse, int *sum) {
  uint64_t sse64 = 0;
  int64_t sum64 = 0;
  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse64, &sum64);
  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse64, 4);
  *sum = (int)ROUND_POWER_OF_TWO(sum64, 2);
}
391
// 12-bit-depth wrapper: rescales the 64-bit accumulators to the 8-bit
// scale (SSE by 2^8, sum by 2^4) with rounding before narrowing.
static void highbd_12_variance(const uint8_t *a8, int a_stride,
                               const uint8_t *b8, int b_stride, int w, int h,
                               uint32_t *sse, int *sum) {
  uint64_t sse64 = 0;
  int64_t sum64 = 0;
  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse64, &sum64);
  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse64, 8);
  *sum = (int)ROUND_POWER_OF_TWO(sum64, 4);
}
401
// Generates the 8-, 10-, and 12-bit variance functions for a WxH block.
// The 10- and 12-bit versions clamp a (rounding-induced) negative result
// to 0 before returning.
#define HIGHBD_VAR(W, H)                                                    \
  uint32_t aom_highbd_8_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
                                              const uint8_t *b, int b_stride, \
                                              uint32_t *sse) {              \
    int sum;                                                                \
    highbd_8_variance(a, a_stride, b, b_stride, W, H, sse, &sum);           \
    return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H));               \
  }                                                                         \
                                                                            \
  uint32_t aom_highbd_10_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
                                               const uint8_t *b, int b_stride, \
                                               uint32_t *sse) {             \
    int sum;                                                                \
    int64_t var;                                                            \
    highbd_10_variance(a, a_stride, b, b_stride, W, H, sse, &sum);          \
    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));               \
    return (var >= 0) ? (uint32_t)var : 0;                                  \
  }                                                                         \
                                                                            \
  uint32_t aom_highbd_12_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
                                               const uint8_t *b, int b_stride, \
                                               uint32_t *sse) {             \
    int sum;                                                                \
    int64_t var;                                                            \
    highbd_12_variance(a, a_stride, b, b_stride, W, H, sse, &sum);          \
    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));               \
    return (var >= 0) ? (uint32_t)var : 0;                                  \
  }
430
// Generates get-var helpers for an SxS block at each supported bit depth:
// both sse and sum are returned by reference; no variance is computed.
#define HIGHBD_GET_VAR(S)                                                 \
  void aom_highbd_8_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
                                        const uint8_t *ref, int ref_stride, \
                                        uint32_t *sse, int *sum) {        \
    highbd_8_variance(src, src_stride, ref, ref_stride, S, S, sse, sum);  \
  }                                                                       \
                                                                          \
  void aom_highbd_10_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
                                         const uint8_t *ref, int ref_stride, \
                                         uint32_t *sse, int *sum) {       \
    highbd_10_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
  }                                                                       \
                                                                          \
  void aom_highbd_12_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
                                         const uint8_t *ref, int ref_stride, \
                                         uint32_t *sse, int *sum) {       \
    highbd_12_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
  }
449
// Generates MSE functions for a WxH block at each supported bit depth:
// the sum is computed and discarded; only the SSE is returned.
#define HIGHBD_MSE(W, H)                                                  \
  uint32_t aom_highbd_8_mse##W##x##H##_c(const uint8_t *src, int src_stride, \
                                         const uint8_t *ref, int ref_stride, \
                                         uint32_t *sse) {                 \
    int sum;                                                              \
    highbd_8_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
    return *sse;                                                          \
  }                                                                       \
                                                                          \
  uint32_t aom_highbd_10_mse##W##x##H##_c(const uint8_t *src, int src_stride, \
                                          const uint8_t *ref, int ref_stride, \
                                          uint32_t *sse) {                \
    int sum;                                                              \
    highbd_10_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
    return *sse;                                                          \
  }                                                                       \
                                                                          \
  uint32_t aom_highbd_12_mse##W##x##H##_c(const uint8_t *src, int src_stride, \
                                          const uint8_t *ref, int ref_stride, \
                                          uint32_t *sse) {                \
    int sum;                                                              \
    highbd_12_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
    return *sse;                                                          \
  }
474
aom_highbd_var_filter_block2d_bil_first_pass(const uint8_t * src_ptr8,uint16_t * output_ptr,unsigned int src_pixels_per_line,int pixel_step,unsigned int output_height,unsigned int output_width,const uint8_t * filter)475 void aom_highbd_var_filter_block2d_bil_first_pass(
476 const uint8_t *src_ptr8, uint16_t *output_ptr,
477 unsigned int src_pixels_per_line, int pixel_step,
478 unsigned int output_height, unsigned int output_width,
479 const uint8_t *filter) {
480 unsigned int i, j;
481 uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src_ptr8);
482 for (i = 0; i < output_height; ++i) {
483 for (j = 0; j < output_width; ++j) {
484 output_ptr[j] = ROUND_POWER_OF_TWO(
485 (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1],
486 FILTER_BITS);
487
488 ++src_ptr;
489 }
490
491 // Next row...
492 src_ptr += src_pixels_per_line - output_width;
493 output_ptr += output_width;
494 }
495 }
496
aom_highbd_var_filter_block2d_bil_second_pass(const uint16_t * src_ptr,uint16_t * output_ptr,unsigned int src_pixels_per_line,unsigned int pixel_step,unsigned int output_height,unsigned int output_width,const uint8_t * filter)497 void aom_highbd_var_filter_block2d_bil_second_pass(
498 const uint16_t *src_ptr, uint16_t *output_ptr,
499 unsigned int src_pixels_per_line, unsigned int pixel_step,
500 unsigned int output_height, unsigned int output_width,
501 const uint8_t *filter) {
502 unsigned int i, j;
503
504 for (i = 0; i < output_height; ++i) {
505 for (j = 0; j < output_width; ++j) {
506 output_ptr[j] = ROUND_POWER_OF_TWO(
507 (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1],
508 FILTER_BITS);
509 ++src_ptr;
510 }
511
512 src_ptr += src_pixels_per_line - output_width;
513 output_ptr += output_width;
514 }
515 }
516
// Generates the 8-, 10-, and 12-bit sub-pel variance functions for a WxH
// block: bilinear-filters the source at (xoffset, yoffset), then computes
// the variance of the filtered block against dst.
#define HIGHBD_SUBPIX_VAR(W, H)                                              \
  uint32_t aom_highbd_8_sub_pixel_variance##W##x##H##_c(                     \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *dst, int dst_stride, uint32_t *sse) {                   \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W,  \
                                              dst, dst_stride, sse);         \
  }                                                                          \
                                                                             \
  uint32_t aom_highbd_10_sub_pixel_variance##W##x##H##_c(                    \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *dst, int dst_stride, uint32_t *sse) {                   \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \
                                               dst, dst_stride, sse);        \
  }                                                                          \
                                                                             \
  uint32_t aom_highbd_12_sub_pixel_variance##W##x##H##_c(                    \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *dst, int dst_stride, uint32_t *sse) {                   \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \
                                               dst, dst_stride, sse);        \
  }
562
// Generates, per bit depth (8/10/12), the sub-pel averaging variance
// functions for a WxH block: bilinear-filter the source, combine with
// second_pred (plain average, or distance-weighted blend for the
// dist_wtd variants), then compute the variance against dst.
#define HIGHBD_SUBPIX_AVG_VAR(W, H)                                          \
  uint32_t aom_highbd_8_sub_pixel_avg_variance##W##x##H##_c(                 \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *dst, int dst_stride, uint32_t *sse,                     \
      const uint8_t *second_pred) {                                          \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                             \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
                               CONVERT_TO_BYTEPTR(temp2), W);                \
                                                                             \
    return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,  \
                                              dst, dst_stride, sse);         \
  }                                                                          \
                                                                             \
  uint32_t aom_highbd_10_sub_pixel_avg_variance##W##x##H##_c(                \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *dst, int dst_stride, uint32_t *sse,                     \
      const uint8_t *second_pred) {                                          \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                             \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
                               CONVERT_TO_BYTEPTR(temp2), W);                \
                                                                             \
    return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
                                               dst, dst_stride, sse);        \
  }                                                                          \
                                                                             \
  uint32_t aom_highbd_12_sub_pixel_avg_variance##W##x##H##_c(                \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *dst, int dst_stride, uint32_t *sse,                     \
      const uint8_t *second_pred) {                                          \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                             \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
                               CONVERT_TO_BYTEPTR(temp2), W);                \
                                                                             \
    return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
                                               dst, dst_stride, sse);        \
  }                                                                          \
                                                                             \
  uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance##W##x##H##_c(        \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *dst, int dst_stride, uint32_t *sse,                     \
      const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) {   \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                             \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    aom_highbd_dist_wtd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, \
                                      W, H, CONVERT_TO_BYTEPTR(temp2), W,    \
                                      jcp_param);                            \
                                                                             \
    return aom_highbd_8_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst, \
                                          dst_stride, sse);                  \
  }                                                                          \
                                                                             \
  uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance##W##x##H##_c(       \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *dst, int dst_stride, uint32_t *sse,                     \
      const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) {   \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                             \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    aom_highbd_dist_wtd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, \
                                      W, H, CONVERT_TO_BYTEPTR(temp2), W,    \
                                      jcp_param);                            \
                                                                             \
    return aom_highbd_10_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst, \
                                           dst_stride, sse);                 \
  }                                                                          \
                                                                             \
  uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance##W##x##H##_c(       \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *dst, int dst_stride, uint32_t *sse,                     \
      const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) {   \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                             \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    aom_highbd_dist_wtd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, \
                                      W, H, CONVERT_TO_BYTEPTR(temp2), W,    \
                                      jcp_param);                            \
                                                                             \
    return aom_highbd_12_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst, \
                                           dst_stride, sse);                 \
  }
686
/* All three forms of the variance are available in the same sizes. */
#define HIGHBD_VARIANCES(W, H) \
  HIGHBD_VAR(W, H)             \
  HIGHBD_SUBPIX_VAR(W, H)      \
  HIGHBD_SUBPIX_AVG_VAR(W, H)

// Instantiate the high-bitdepth kernels for every supported block size.
HIGHBD_VARIANCES(128, 128)
HIGHBD_VARIANCES(128, 64)
HIGHBD_VARIANCES(64, 128)
HIGHBD_VARIANCES(64, 64)
HIGHBD_VARIANCES(64, 32)
HIGHBD_VARIANCES(32, 64)
HIGHBD_VARIANCES(32, 32)
HIGHBD_VARIANCES(32, 16)
HIGHBD_VARIANCES(16, 32)
HIGHBD_VARIANCES(16, 16)
HIGHBD_VARIANCES(16, 8)
HIGHBD_VARIANCES(8, 16)
HIGHBD_VARIANCES(8, 8)
HIGHBD_VARIANCES(8, 4)
HIGHBD_VARIANCES(4, 8)
HIGHBD_VARIANCES(4, 4)
HIGHBD_VARIANCES(4, 2)
HIGHBD_VARIANCES(2, 4)
HIGHBD_VARIANCES(2, 2)

// Realtime mode doesn't use 4x rectangular blocks.
#if !CONFIG_REALTIME_ONLY
HIGHBD_VARIANCES(4, 16)
HIGHBD_VARIANCES(16, 4)
HIGHBD_VARIANCES(8, 32)
HIGHBD_VARIANCES(32, 8)
HIGHBD_VARIANCES(16, 64)
HIGHBD_VARIANCES(64, 16)
#endif

HIGHBD_GET_VAR(8)
HIGHBD_GET_VAR(16)

HIGHBD_MSE(16, 16)
HIGHBD_MSE(16, 8)
HIGHBD_MSE(8, 16)
HIGHBD_MSE(8, 8)
730
// High-bitdepth average of pred (stride = width) and ref
// (stride = ref_stride) into comp_pred (stride = width), rounded to
// nearest. All pixel pointers are CONVERT_TO_BYTEPTR-style handles.
void aom_highbd_comp_avg_pred_c(uint8_t *comp_pred8, const uint8_t *pred8,
                                int width, int height, const uint8_t *ref8,
                                int ref_stride) {
  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
  for (int row = 0; row < height; ++row) {
    for (int col = 0; col < width; ++col) {
      comp_pred[col] = ROUND_POWER_OF_TWO(pred[col] + ref[col], 1);
    }
    comp_pred += width;
    pred += width;
    ref += ref_stride;
  }
}
748
aom_highbd_dist_wtd_comp_avg_pred_c(uint8_t * comp_pred8,const uint8_t * pred8,int width,int height,const uint8_t * ref8,int ref_stride,const DIST_WTD_COMP_PARAMS * jcp_param)749 void aom_highbd_dist_wtd_comp_avg_pred_c(
750 uint8_t *comp_pred8, const uint8_t *pred8, int width, int height,
751 const uint8_t *ref8, int ref_stride,
752 const DIST_WTD_COMP_PARAMS *jcp_param) {
753 int i, j;
754 const int fwd_offset = jcp_param->fwd_offset;
755 const int bck_offset = jcp_param->bck_offset;
756 uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
757 uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
758 uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
759
760 for (i = 0; i < height; ++i) {
761 for (j = 0; j < width; ++j) {
762 int tmp = pred[j] * bck_offset + ref[j] * fwd_offset;
763 tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS);
764 comp_pred[j] = (uint16_t)tmp;
765 }
766 comp_pred += width;
767 pred += width;
768 ref += ref_stride;
769 }
770 }
771 #endif // CONFIG_AV1_HIGHBITDEPTH
772
// Blends pred and ref into comp_pred under a per-pixel 64-weight mask.
// Normally the mask weights ref; with invert_mask set it weights pred.
void aom_comp_mask_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
                          int height, const uint8_t *ref, int ref_stride,
                          const uint8_t *mask, int mask_stride,
                          int invert_mask) {
  const uint8_t *fg = invert_mask ? pred : ref;
  const uint8_t *bg = invert_mask ? ref : pred;
  const int fg_stride = invert_mask ? width : ref_stride;
  const int bg_stride = invert_mask ? ref_stride : width;
  for (int row = 0; row < height; ++row) {
    for (int col = 0; col < width; ++col) {
      comp_pred[col] = AOM_BLEND_A64(mask[col], fg[col], bg[col]);
    }
    comp_pred += width;
    fg += fg_stride;
    bg += bg_stride;
    mask += mask_stride;
  }
}
792
// Generates aom_masked_sub_pixel_variance{W}x{H}_c(): bilinear-filters the
// source at (xoffset, yoffset), blends it with second_pred under msk, then
// returns the variance of the blended block against ref.
#define MASK_SUBPIX_VAR(W, H)                                                 \
  unsigned int aom_masked_sub_pixel_variance##W##x##H##_c(                    \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
      const uint8_t *ref, int ref_stride, const uint8_t *second_pred,         \
      const uint8_t *msk, int msk_stride, int invert_mask,                    \
      unsigned int *sse) {                                                    \
    uint16_t fdata3[(H + 1) * W];                                             \
    uint8_t temp2[H * W];                                                     \
    DECLARE_ALIGNED(16, uint8_t, temp3[H * W]);                               \
                                                                              \
    aom_var_filter_block2d_bil_first_pass_c(src, fdata3, src_stride, 1, H + 1, \
                                            W, bilinear_filters_2t[xoffset]); \
    aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W,       \
                                             bilinear_filters_2t[yoffset]);   \
                                                                              \
    aom_comp_mask_pred_c(temp3, second_pred, W, H, temp2, W, msk, msk_stride, \
                         invert_mask);                                        \
    return aom_variance##W##x##H##_c(temp3, W, ref, ref_stride, sse);         \
  }

// Instantiate the masked sub-pel variance kernels for all block sizes.
MASK_SUBPIX_VAR(4, 4)
MASK_SUBPIX_VAR(4, 8)
MASK_SUBPIX_VAR(8, 4)
MASK_SUBPIX_VAR(8, 8)
MASK_SUBPIX_VAR(8, 16)
MASK_SUBPIX_VAR(16, 8)
MASK_SUBPIX_VAR(16, 16)
MASK_SUBPIX_VAR(16, 32)
MASK_SUBPIX_VAR(32, 16)
MASK_SUBPIX_VAR(32, 32)
MASK_SUBPIX_VAR(32, 64)
MASK_SUBPIX_VAR(64, 32)
MASK_SUBPIX_VAR(64, 64)
MASK_SUBPIX_VAR(64, 128)
MASK_SUBPIX_VAR(128, 64)
MASK_SUBPIX_VAR(128, 128)

// Realtime mode doesn't use 4x rectangular blocks.
#if !CONFIG_REALTIME_ONLY
MASK_SUBPIX_VAR(4, 16)
MASK_SUBPIX_VAR(16, 4)
MASK_SUBPIX_VAR(8, 32)
MASK_SUBPIX_VAR(32, 8)
MASK_SUBPIX_VAR(16, 64)
MASK_SUBPIX_VAR(64, 16)
#endif
839
840 #if CONFIG_AV1_HIGHBITDEPTH
// High-bitdepth masked blend of pred and ref into comp_pred. The mask
// weights ref by default; with invert_mask set it weights pred instead.
// All pixel pointers are CONVERT_TO_BYTEPTR-style handles.
void aom_highbd_comp_mask_pred_c(uint8_t *comp_pred8, const uint8_t *pred8,
                                 int width, int height, const uint8_t *ref8,
                                 int ref_stride, const uint8_t *mask,
                                 int mask_stride, int invert_mask) {
  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
  for (int row = 0; row < height; ++row) {
    for (int col = 0; col < width; ++col) {
      comp_pred[col] = invert_mask
                           ? AOM_BLEND_A64(mask[col], pred[col], ref[col])
                           : AOM_BLEND_A64(mask[col], ref[col], pred[col]);
    }
    comp_pred += width;
    pred += width;
    ref += ref_stride;
    mask += mask_stride;
  }
}
862
// Generates the 8-, 10- and 12-bit masked sub-pixel variance kernels for a
// W x H block. Each variant:
//  1. bilinearly filters the 16-bit source with the 2-tap filters selected
//     by xoffset/yoffset,
//  2. blends the filtered block with second_pred under msk via
//     aom_highbd_comp_mask_pred_c, and
//  3. returns the variance of the blend against ref at the matching bit
//     depth (aom_highbd_{8,10,12}_variance).
// The uint16_t stack buffers are wrapped with CONVERT_TO_BYTEPTR to satisfy
// the high-bit-depth uint8_t-pointer calling convention.
#define HIGHBD_MASK_SUBPIX_VAR(W, H)                                          \
  unsigned int aom_highbd_8_masked_sub_pixel_variance##W##x##H##_c(           \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
      const uint8_t *ref, int ref_stride, const uint8_t *second_pred,         \
      const uint8_t *msk, int msk_stride, int invert_mask,                    \
      unsigned int *sse) {                                                    \
    uint16_t fdata3[(H + 1) * W];                                             \
    uint16_t temp2[H * W];                                                    \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                              \
                                                                              \
    aom_highbd_var_filter_block2d_bil_first_pass(                             \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
    aom_highbd_var_filter_block2d_bil_second_pass(                            \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
                                                                              \
    aom_highbd_comp_mask_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
                                CONVERT_TO_BYTEPTR(temp2), W, msk, msk_stride, \
                                invert_mask);                                 \
                                                                              \
    return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,   \
                                              ref, ref_stride, sse);          \
  }                                                                           \
                                                                              \
  unsigned int aom_highbd_10_masked_sub_pixel_variance##W##x##H##_c(          \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
      const uint8_t *ref, int ref_stride, const uint8_t *second_pred,         \
      const uint8_t *msk, int msk_stride, int invert_mask,                    \
      unsigned int *sse) {                                                    \
    uint16_t fdata3[(H + 1) * W];                                             \
    uint16_t temp2[H * W];                                                    \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                              \
                                                                              \
    aom_highbd_var_filter_block2d_bil_first_pass(                             \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
    aom_highbd_var_filter_block2d_bil_second_pass(                            \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
                                                                              \
    aom_highbd_comp_mask_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
                                CONVERT_TO_BYTEPTR(temp2), W, msk, msk_stride, \
                                invert_mask);                                 \
                                                                              \
    return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,  \
                                               ref, ref_stride, sse);         \
  }                                                                           \
                                                                              \
  unsigned int aom_highbd_12_masked_sub_pixel_variance##W##x##H##_c(          \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
      const uint8_t *ref, int ref_stride, const uint8_t *second_pred,         \
      const uint8_t *msk, int msk_stride, int invert_mask,                    \
      unsigned int *sse) {                                                    \
    uint16_t fdata3[(H + 1) * W];                                             \
    uint16_t temp2[H * W];                                                    \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                              \
                                                                              \
    aom_highbd_var_filter_block2d_bil_first_pass(                             \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
    aom_highbd_var_filter_block2d_bil_second_pass(                            \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
                                                                              \
    aom_highbd_comp_mask_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
                                CONVERT_TO_BYTEPTR(temp2), W, msk, msk_stride, \
                                invert_mask);                                 \
                                                                              \
    return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,  \
                                               ref, ref_stride, sse);         \
  }

// Instantiate the high-bit-depth masked sub-pixel variance kernels.
HIGHBD_MASK_SUBPIX_VAR(4, 4)
HIGHBD_MASK_SUBPIX_VAR(4, 8)
HIGHBD_MASK_SUBPIX_VAR(8, 4)
HIGHBD_MASK_SUBPIX_VAR(8, 8)
HIGHBD_MASK_SUBPIX_VAR(8, 16)
HIGHBD_MASK_SUBPIX_VAR(16, 8)
HIGHBD_MASK_SUBPIX_VAR(16, 16)
HIGHBD_MASK_SUBPIX_VAR(16, 32)
HIGHBD_MASK_SUBPIX_VAR(32, 16)
HIGHBD_MASK_SUBPIX_VAR(32, 32)
HIGHBD_MASK_SUBPIX_VAR(32, 64)
HIGHBD_MASK_SUBPIX_VAR(64, 32)
HIGHBD_MASK_SUBPIX_VAR(64, 64)
HIGHBD_MASK_SUBPIX_VAR(64, 128)
HIGHBD_MASK_SUBPIX_VAR(128, 64)
HIGHBD_MASK_SUBPIX_VAR(128, 128)
// Realtime mode doesn't use 4x rectangular blocks.
#if !CONFIG_REALTIME_ONLY
HIGHBD_MASK_SUBPIX_VAR(4, 16)
HIGHBD_MASK_SUBPIX_VAR(16, 4)
HIGHBD_MASK_SUBPIX_VAR(8, 32)
HIGHBD_MASK_SUBPIX_VAR(32, 8)
HIGHBD_MASK_SUBPIX_VAR(16, 64)
HIGHBD_MASK_SUBPIX_VAR(64, 16)
#endif
954 #endif // CONFIG_AV1_HIGHBITDEPTH
955
956 #if !CONFIG_REALTIME_ONLY
obmc_variance(const uint8_t * pre,int pre_stride,const int32_t * wsrc,const int32_t * mask,int w,int h,unsigned int * sse,int * sum)957 static INLINE void obmc_variance(const uint8_t *pre, int pre_stride,
958 const int32_t *wsrc, const int32_t *mask,
959 int w, int h, unsigned int *sse, int *sum) {
960 int i, j;
961
962 *sse = 0;
963 *sum = 0;
964
965 for (i = 0; i < h; i++) {
966 for (j = 0; j < w; j++) {
967 int diff = ROUND_POWER_OF_TWO_SIGNED(wsrc[j] - pre[j] * mask[j], 12);
968 *sum += diff;
969 *sse += diff * diff;
970 }
971
972 pre += pre_stride;
973 wsrc += w;
974 mask += w;
975 }
976 }
977
// Generates the OBMC (overlapped block motion compensation) variance kernel
// for a W x H block: sse minus the squared mean, computed from the weighted
// prediction error accumulated by obmc_variance().
#define OBMC_VAR(W, H)                                                \
  unsigned int aom_obmc_variance##W##x##H##_c(                        \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,        \
      const int32_t *mask, unsigned int *sse) {                       \
    int sum;                                                          \
    obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum);      \
    return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H));     \
  }

// Generates the OBMC sub-pixel variance kernel: bilinearly filters the
// predictor by (xoffset, yoffset) first, then measures OBMC variance of the
// filtered block.
#define OBMC_SUBPIX_VAR(W, H)                                                 \
  unsigned int aom_obmc_sub_pixel_variance##W##x##H##_c(                      \
      const uint8_t *pre, int pre_stride, int xoffset, int yoffset,           \
      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {          \
    uint16_t fdata3[(H + 1) * W];                                             \
    uint8_t temp2[H * W];                                                     \
                                                                              \
    aom_var_filter_block2d_bil_first_pass_c(pre, fdata3, pre_stride, 1, H + 1, \
                                            W, bilinear_filters_2t[xoffset]); \
    aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W,       \
                                             bilinear_filters_2t[yoffset]);   \
                                                                              \
    return aom_obmc_variance##W##x##H##_c(temp2, W, wsrc, mask, sse);         \
  }

// Instantiate the OBMC variance and sub-pixel variance kernels for every
// supported block size.
OBMC_VAR(4, 4)
OBMC_SUBPIX_VAR(4, 4)

OBMC_VAR(4, 8)
OBMC_SUBPIX_VAR(4, 8)

OBMC_VAR(8, 4)
OBMC_SUBPIX_VAR(8, 4)

OBMC_VAR(8, 8)
OBMC_SUBPIX_VAR(8, 8)

OBMC_VAR(8, 16)
OBMC_SUBPIX_VAR(8, 16)

OBMC_VAR(16, 8)
OBMC_SUBPIX_VAR(16, 8)

OBMC_VAR(16, 16)
OBMC_SUBPIX_VAR(16, 16)

OBMC_VAR(16, 32)
OBMC_SUBPIX_VAR(16, 32)

OBMC_VAR(32, 16)
OBMC_SUBPIX_VAR(32, 16)

OBMC_VAR(32, 32)
OBMC_SUBPIX_VAR(32, 32)

OBMC_VAR(32, 64)
OBMC_SUBPIX_VAR(32, 64)

OBMC_VAR(64, 32)
OBMC_SUBPIX_VAR(64, 32)

OBMC_VAR(64, 64)
OBMC_SUBPIX_VAR(64, 64)

OBMC_VAR(64, 128)
OBMC_SUBPIX_VAR(64, 128)

OBMC_VAR(128, 64)
OBMC_SUBPIX_VAR(128, 64)

OBMC_VAR(128, 128)
OBMC_SUBPIX_VAR(128, 128)

OBMC_VAR(4, 16)
OBMC_SUBPIX_VAR(4, 16)
OBMC_VAR(16, 4)
OBMC_SUBPIX_VAR(16, 4)
OBMC_VAR(8, 32)
OBMC_SUBPIX_VAR(8, 32)
OBMC_VAR(32, 8)
OBMC_SUBPIX_VAR(32, 8)
OBMC_VAR(16, 64)
OBMC_SUBPIX_VAR(16, 64)
OBMC_VAR(64, 16)
OBMC_SUBPIX_VAR(64, 16)
1062
1063 #if CONFIG_AV1_HIGHBITDEPTH
highbd_obmc_variance64(const uint8_t * pre8,int pre_stride,const int32_t * wsrc,const int32_t * mask,int w,int h,uint64_t * sse,int64_t * sum)1064 static INLINE void highbd_obmc_variance64(const uint8_t *pre8, int pre_stride,
1065 const int32_t *wsrc,
1066 const int32_t *mask, int w, int h,
1067 uint64_t *sse, int64_t *sum) {
1068 int i, j;
1069 uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
1070
1071 *sse = 0;
1072 *sum = 0;
1073
1074 for (i = 0; i < h; i++) {
1075 for (j = 0; j < w; j++) {
1076 int diff = ROUND_POWER_OF_TWO_SIGNED(wsrc[j] - pre[j] * mask[j], 12);
1077 *sum += diff;
1078 *sse += diff * diff;
1079 }
1080
1081 pre += pre_stride;
1082 wsrc += w;
1083 mask += w;
1084 }
1085 }
1086
highbd_obmc_variance(const uint8_t * pre8,int pre_stride,const int32_t * wsrc,const int32_t * mask,int w,int h,unsigned int * sse,int * sum)1087 static INLINE void highbd_obmc_variance(const uint8_t *pre8, int pre_stride,
1088 const int32_t *wsrc,
1089 const int32_t *mask, int w, int h,
1090 unsigned int *sse, int *sum) {
1091 int64_t sum64;
1092 uint64_t sse64;
1093 highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &sse64, &sum64);
1094 *sum = (int)sum64;
1095 *sse = (unsigned int)sse64;
1096 }
1097
highbd_10_obmc_variance(const uint8_t * pre8,int pre_stride,const int32_t * wsrc,const int32_t * mask,int w,int h,unsigned int * sse,int * sum)1098 static INLINE void highbd_10_obmc_variance(const uint8_t *pre8, int pre_stride,
1099 const int32_t *wsrc,
1100 const int32_t *mask, int w, int h,
1101 unsigned int *sse, int *sum) {
1102 int64_t sum64;
1103 uint64_t sse64;
1104 highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &sse64, &sum64);
1105 *sum = (int)ROUND_POWER_OF_TWO(sum64, 2);
1106 *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 4);
1107 }
1108
highbd_12_obmc_variance(const uint8_t * pre8,int pre_stride,const int32_t * wsrc,const int32_t * mask,int w,int h,unsigned int * sse,int * sum)1109 static INLINE void highbd_12_obmc_variance(const uint8_t *pre8, int pre_stride,
1110 const int32_t *wsrc,
1111 const int32_t *mask, int w, int h,
1112 unsigned int *sse, int *sum) {
1113 int64_t sum64;
1114 uint64_t sse64;
1115 highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &sse64, &sum64);
1116 *sum = (int)ROUND_POWER_OF_TWO(sum64, 4);
1117 *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 8);
1118 }
1119
// Generates the 8-, 10- and 12-bit OBMC variance kernels for a W x H block.
// The 10- and 12-bit variants clamp a negative result to 0: sum and sse are
// rounded separately in highbd_{10,12}_obmc_variance, so sse - sum^2/N can
// go slightly negative after rounding. The 8-bit variant narrows without
// rounding and performs no clamp.
#define HIGHBD_OBMC_VAR(W, H)                                              \
  unsigned int aom_highbd_obmc_variance##W##x##H##_c(                      \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,             \
      const int32_t *mask, unsigned int *sse) {                            \
    int sum;                                                               \
    highbd_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum);    \
    return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H));          \
  }                                                                        \
                                                                           \
  unsigned int aom_highbd_10_obmc_variance##W##x##H##_c(                   \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,             \
      const int32_t *mask, unsigned int *sse) {                            \
    int sum;                                                               \
    int64_t var;                                                           \
    highbd_10_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));              \
    return (var >= 0) ? (uint32_t)var : 0;                                 \
  }                                                                        \
                                                                           \
  unsigned int aom_highbd_12_obmc_variance##W##x##H##_c(                   \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,             \
      const int32_t *mask, unsigned int *sse) {                            \
    int sum;                                                               \
    int64_t var;                                                           \
    highbd_12_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));              \
    return (var >= 0) ? (uint32_t)var : 0;                                 \
  }

// Generates the 8-, 10- and 12-bit OBMC sub-pixel variance kernels:
// bilinearly filter the predictor by (xoffset, yoffset), then measure OBMC
// variance of the filtered block at the matching bit depth.
#define HIGHBD_OBMC_SUBPIX_VAR(W, H)                                         \
  unsigned int aom_highbd_obmc_sub_pixel_variance##W##x##H##_c(              \
      const uint8_t *pre, int pre_stride, int xoffset, int yoffset,          \
      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {         \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    return aom_highbd_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \
                                                 wsrc, mask, sse);           \
  }                                                                          \
                                                                             \
  unsigned int aom_highbd_10_obmc_sub_pixel_variance##W##x##H##_c(           \
      const uint8_t *pre, int pre_stride, int xoffset, int yoffset,          \
      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {         \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    return aom_highbd_10_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
                                                    W, wsrc, mask, sse);     \
  }                                                                          \
                                                                             \
  unsigned int aom_highbd_12_obmc_sub_pixel_variance##W##x##H##_c(           \
      const uint8_t *pre, int pre_stride, int xoffset, int yoffset,          \
      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {         \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    return aom_highbd_12_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
                                                    W, wsrc, mask, sse);     \
  }

// Instantiate the high-bit-depth OBMC kernels for every supported block size.
HIGHBD_OBMC_VAR(4, 4)
HIGHBD_OBMC_SUBPIX_VAR(4, 4)

HIGHBD_OBMC_VAR(4, 8)
HIGHBD_OBMC_SUBPIX_VAR(4, 8)

HIGHBD_OBMC_VAR(8, 4)
HIGHBD_OBMC_SUBPIX_VAR(8, 4)

HIGHBD_OBMC_VAR(8, 8)
HIGHBD_OBMC_SUBPIX_VAR(8, 8)

HIGHBD_OBMC_VAR(8, 16)
HIGHBD_OBMC_SUBPIX_VAR(8, 16)

HIGHBD_OBMC_VAR(16, 8)
HIGHBD_OBMC_SUBPIX_VAR(16, 8)

HIGHBD_OBMC_VAR(16, 16)
HIGHBD_OBMC_SUBPIX_VAR(16, 16)

HIGHBD_OBMC_VAR(16, 32)
HIGHBD_OBMC_SUBPIX_VAR(16, 32)

HIGHBD_OBMC_VAR(32, 16)
HIGHBD_OBMC_SUBPIX_VAR(32, 16)

HIGHBD_OBMC_VAR(32, 32)
HIGHBD_OBMC_SUBPIX_VAR(32, 32)

HIGHBD_OBMC_VAR(32, 64)
HIGHBD_OBMC_SUBPIX_VAR(32, 64)

HIGHBD_OBMC_VAR(64, 32)
HIGHBD_OBMC_SUBPIX_VAR(64, 32)

HIGHBD_OBMC_VAR(64, 64)
HIGHBD_OBMC_SUBPIX_VAR(64, 64)

HIGHBD_OBMC_VAR(64, 128)
HIGHBD_OBMC_SUBPIX_VAR(64, 128)

HIGHBD_OBMC_VAR(128, 64)
HIGHBD_OBMC_SUBPIX_VAR(128, 64)

HIGHBD_OBMC_VAR(128, 128)
HIGHBD_OBMC_SUBPIX_VAR(128, 128)

HIGHBD_OBMC_VAR(4, 16)
HIGHBD_OBMC_SUBPIX_VAR(4, 16)
HIGHBD_OBMC_VAR(16, 4)
HIGHBD_OBMC_SUBPIX_VAR(16, 4)
HIGHBD_OBMC_VAR(8, 32)
HIGHBD_OBMC_SUBPIX_VAR(8, 32)
HIGHBD_OBMC_VAR(32, 8)
HIGHBD_OBMC_SUBPIX_VAR(32, 8)
HIGHBD_OBMC_VAR(16, 64)
HIGHBD_OBMC_SUBPIX_VAR(16, 64)
HIGHBD_OBMC_VAR(64, 16)
HIGHBD_OBMC_SUBPIX_VAR(64, 16)
1255 #endif // CONFIG_AV1_HIGHBITDEPTH
1256 #endif // !CONFIG_REALTIME_ONLY
1257
// Returns the sum of squared errors between a w x h uint8_t block `dst`
// (row stride `dstride`) and a uint16_t block `src` (row stride `sstride`).
// Despite the name this is the raw SSE; callers normalize to MSE themselves.
uint64_t aom_mse_wxh_16bit_c(uint8_t *dst, int dstride, uint16_t *src,
                             int sstride, int w, int h) {
  uint64_t sum = 0;
  for (int i = 0; i < h; i++) {
    for (int j = 0; j < w; j++) {
      const int e = (uint16_t)dst[i * dstride + j] - src[i * sstride + j];
      // Widen before squaring: e ranges down to -65535, so e * e in plain
      // int can exceed INT_MAX (undefined behavior from signed overflow).
      sum += (uint64_t)((int64_t)e * e);
    }
  }
  return sum;
}
1269
// Returns the total SSE between a 16-pixel-wide uint8_t region of `dst` and
// 16/w consecutive w x h uint16_t blocks stored back-to-back in `src`.
// Each src block covers the next w columns of dst; w is expected to divide
// 16 (w in {4, 8, 16}) — TODO(review): confirm callers never pass other w.
uint64_t aom_mse_16xh_16bit_c(uint8_t *dst, int dstride, uint16_t *src, int w,
                              int h) {
  uint16_t *src_temp = src;
  uint8_t *dst_temp = dst;
  const int num_blks = 16 / w;
  // Unsigned accumulator to match the return type (was int64_t; SSE can
  // never be negative, so this is a consistency fix, not a behavior change).
  uint64_t sum = 0;
  for (int i = 0; i < num_blks; i++) {
    sum += aom_mse_wxh_16bit_c(dst_temp, dstride, src_temp, w, w, h);
    dst_temp += w;          // next w columns of dst
    src_temp += (w * h);    // src blocks are stored contiguously
  }
  return sum;
}
1283
// High-bit-depth counterpart of aom_mse_wxh_16bit_c: sum of squared errors
// between two w x h uint16_t blocks `dst` (stride `dstride`) and `src`
// (stride `sstride`). Returns the raw SSE.
uint64_t aom_mse_wxh_16bit_highbd_c(uint16_t *dst, int dstride, uint16_t *src,
                                    int sstride, int w, int h) {
  uint64_t sum = 0;
  for (int i = 0; i < h; i++) {
    for (int j = 0; j < w; j++) {
      const int e = dst[i * dstride + j] - src[i * sstride + j];
      // Widen before squaring: |e| can reach 65535, so e * e in plain int
      // can exceed INT_MAX (undefined behavior from signed overflow).
      sum += (uint64_t)((int64_t)e * e);
    }
  }
  return sum;
}
1295