• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 #include <assert.h>
12 #include <stdlib.h>
13 
14 #include "config/aom_config.h"
15 #include "config/aom_dsp_rtcd.h"
16 
17 #include "aom/aom_integer.h"
18 #include "aom_ports/mem.h"
19 
20 #include "aom_dsp/aom_filter.h"
21 #include "aom_dsp/blend.h"
22 #include "aom_dsp/variance.h"
23 
24 #include "av1/common/filter.h"
25 #include "av1/common/reconinter.h"
26 
aom_get_mb_ss_c(const int16_t * a)27 uint32_t aom_get_mb_ss_c(const int16_t *a) {
28   unsigned int i, sum = 0;
29 
30   for (i = 0; i < 256; ++i) {
31     sum += a[i] * a[i];
32   }
33 
34   return sum;
35 }
36 
variance(const uint8_t * a,int a_stride,const uint8_t * b,int b_stride,int w,int h,uint32_t * sse,int * sum)37 static void variance(const uint8_t *a, int a_stride, const uint8_t *b,
38                      int b_stride, int w, int h, uint32_t *sse, int *sum) {
39   int i, j;
40 
41   *sum = 0;
42   *sse = 0;
43 
44   for (i = 0; i < h; ++i) {
45     for (j = 0; j < w; ++j) {
46       const int diff = a[j] - b[j];
47       *sum += diff;
48       *sse += diff * diff;
49     }
50 
51     a += a_stride;
52     b += b_stride;
53   }
54 }
55 
aom_sse_odd_size(const uint8_t * a,int a_stride,const uint8_t * b,int b_stride,int w,int h)56 uint32_t aom_sse_odd_size(const uint8_t *a, int a_stride, const uint8_t *b,
57                           int b_stride, int w, int h) {
58   uint32_t sse;
59   int sum;
60   variance(a, a_stride, b, b_stride, w, h, &sse, &sum);
61   return sse;
62 }
63 
64 // Applies a 1-D 2-tap bilinear filter to the source block in either horizontal
65 // or vertical direction to produce the filtered output block. Used to implement
66 // the first-pass of 2-D separable filter.
67 //
68 // Produces int16_t output to retain precision for the next pass. Two filter
69 // taps should sum to FILTER_WEIGHT. pixel_step defines whether the filter is
70 // applied horizontally (pixel_step = 1) or vertically (pixel_step = stride).
71 // It defines the offset required to move from one input to the next.
var_filter_block2d_bil_first_pass_c(const uint8_t * a,uint16_t * b,unsigned int src_pixels_per_line,unsigned int pixel_step,unsigned int output_height,unsigned int output_width,const uint8_t * filter)72 static void var_filter_block2d_bil_first_pass_c(
73     const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line,
74     unsigned int pixel_step, unsigned int output_height,
75     unsigned int output_width, const uint8_t *filter) {
76   unsigned int i, j;
77 
78   for (i = 0; i < output_height; ++i) {
79     for (j = 0; j < output_width; ++j) {
80       b[j] = ROUND_POWER_OF_TWO(
81           (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS);
82 
83       ++a;
84     }
85 
86     a += src_pixels_per_line - output_width;
87     b += output_width;
88   }
89 }
90 
91 // Applies a 1-D 2-tap bilinear filter to the source block in either horizontal
92 // or vertical direction to produce the filtered output block. Used to implement
93 // the second-pass of 2-D separable filter.
94 //
95 // Requires 16-bit input as produced by filter_block2d_bil_first_pass. Two
96 // filter taps should sum to FILTER_WEIGHT. pixel_step defines whether the
97 // filter is applied horizontally (pixel_step = 1) or vertically
98 // (pixel_step = stride). It defines the offset required to move from one input
99 // to the next. Output is 8-bit.
var_filter_block2d_bil_second_pass_c(const uint16_t * a,uint8_t * b,unsigned int src_pixels_per_line,unsigned int pixel_step,unsigned int output_height,unsigned int output_width,const uint8_t * filter)100 static void var_filter_block2d_bil_second_pass_c(
101     const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line,
102     unsigned int pixel_step, unsigned int output_height,
103     unsigned int output_width, const uint8_t *filter) {
104   unsigned int i, j;
105 
106   for (i = 0; i < output_height; ++i) {
107     for (j = 0; j < output_width; ++j) {
108       b[j] = ROUND_POWER_OF_TWO(
109           (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS);
110       ++a;
111     }
112 
113     a += src_pixels_per_line - output_width;
114     b += output_width;
115   }
116 }
117 
118 #define VAR(W, H)                                                    \
119   uint32_t aom_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
120                                      const uint8_t *b, int b_stride, \
121                                      uint32_t *sse) {                \
122     int sum;                                                         \
123     variance(a, a_stride, b, b_stride, W, H, sse, &sum);             \
124     return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H));        \
125   }
126 
127 #define SUBPIX_VAR(W, H)                                                  \
128   uint32_t aom_sub_pixel_variance##W##x##H##_c(                           \
129       const uint8_t *a, int a_stride, int xoffset, int yoffset,           \
130       const uint8_t *b, int b_stride, uint32_t *sse) {                    \
131     uint16_t fdata3[(H + 1) * W];                                         \
132     uint8_t temp2[H * W];                                                 \
133                                                                           \
134     var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \
135                                         bilinear_filters_2t[xoffset]);    \
136     var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W,       \
137                                          bilinear_filters_2t[yoffset]);   \
138                                                                           \
139     return aom_variance##W##x##H##_c(temp2, W, b, b_stride, sse);         \
140   }
141 
142 #define SUBPIX_AVG_VAR(W, H)                                                   \
143   uint32_t aom_sub_pixel_avg_variance##W##x##H##_c(                            \
144       const uint8_t *a, int a_stride, int xoffset, int yoffset,                \
145       const uint8_t *b, int b_stride, uint32_t *sse,                           \
146       const uint8_t *second_pred) {                                            \
147     uint16_t fdata3[(H + 1) * W];                                              \
148     uint8_t temp2[H * W];                                                      \
149     DECLARE_ALIGNED(16, uint8_t, temp3[H * W]);                                \
150                                                                                \
151     var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W,      \
152                                         bilinear_filters_2t[xoffset]);         \
153     var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W,            \
154                                          bilinear_filters_2t[yoffset]);        \
155                                                                                \
156     aom_comp_avg_pred(temp3, second_pred, W, H, temp2, W);                     \
157                                                                                \
158     return aom_variance##W##x##H##_c(temp3, W, b, b_stride, sse);              \
159   }                                                                            \
160   uint32_t aom_dist_wtd_sub_pixel_avg_variance##W##x##H##_c(                   \
161       const uint8_t *a, int a_stride, int xoffset, int yoffset,                \
162       const uint8_t *b, int b_stride, uint32_t *sse,                           \
163       const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) {     \
164     uint16_t fdata3[(H + 1) * W];                                              \
165     uint8_t temp2[H * W];                                                      \
166     DECLARE_ALIGNED(16, uint8_t, temp3[H * W]);                                \
167                                                                                \
168     var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W,      \
169                                         bilinear_filters_2t[xoffset]);         \
170     var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W,            \
171                                          bilinear_filters_2t[yoffset]);        \
172                                                                                \
173     aom_dist_wtd_comp_avg_pred(temp3, second_pred, W, H, temp2, W, jcp_param); \
174                                                                                \
175     return aom_variance##W##x##H(temp3, W, b, b_stride, sse);                  \
176   }
177 
aom_get_var_sse_sum_8x8_quad_c(const uint8_t * a,int a_stride,const uint8_t * b,int b_stride,uint32_t * sse8x8,int * sum8x8,unsigned int * tot_sse,int * tot_sum,uint32_t * var8x8)178 void aom_get_var_sse_sum_8x8_quad_c(const uint8_t *a, int a_stride,
179                                     const uint8_t *b, int b_stride,
180                                     uint32_t *sse8x8, int *sum8x8,
181                                     unsigned int *tot_sse, int *tot_sum,
182                                     uint32_t *var8x8) {
183   // Loop over 4 8x8 blocks. Process one 8x32 block.
184   for (int k = 0; k < 4; k++) {
185     variance(a + (k * 8), a_stride, b + (k * 8), b_stride, 8, 8, &sse8x8[k],
186              &sum8x8[k]);
187   }
188 
189   // Calculate variance at 8x8 level and total sse, sum of 8x32 block.
190   *tot_sse += sse8x8[0] + sse8x8[1] + sse8x8[2] + sse8x8[3];
191   *tot_sum += sum8x8[0] + sum8x8[1] + sum8x8[2] + sum8x8[3];
192   for (int i = 0; i < 4; i++)
193     var8x8[i] = sse8x8[i] - (uint32_t)(((int64_t)sum8x8[i] * sum8x8[i]) >> 6);
194 }
195 
aom_get_var_sse_sum_16x16_dual_c(const uint8_t * src_ptr,int source_stride,const uint8_t * ref_ptr,int ref_stride,uint32_t * sse16x16,unsigned int * tot_sse,int * tot_sum,uint32_t * var16x16)196 void aom_get_var_sse_sum_16x16_dual_c(const uint8_t *src_ptr, int source_stride,
197                                       const uint8_t *ref_ptr, int ref_stride,
198                                       uint32_t *sse16x16, unsigned int *tot_sse,
199                                       int *tot_sum, uint32_t *var16x16) {
200   int sum16x16[2] = { 0 };
201   // Loop over two consecutive 16x16 blocks and process as one 16x32 block.
202   for (int k = 0; k < 2; k++) {
203     variance(src_ptr + (k * 16), source_stride, ref_ptr + (k * 16), ref_stride,
204              16, 16, &sse16x16[k], &sum16x16[k]);
205   }
206 
207   // Calculate variance at 16x16 level and total sse, sum of 16x32 block.
208   *tot_sse += sse16x16[0] + sse16x16[1];
209   *tot_sum += sum16x16[0] + sum16x16[1];
210   for (int i = 0; i < 2; i++)
211     var16x16[i] =
212         sse16x16[i] - (uint32_t)(((int64_t)sum16x16[i] * sum16x16[i]) >> 8);
213 }
214 
215 /* Identical to the variance call except it does not calculate the
216  * sse - sum^2 / w*h and returns sse in addtion to modifying the passed in
217  * variable.
218  */
219 #define MSE(W, H)                                               \
220   uint32_t aom_mse##W##x##H##_c(const uint8_t *a, int a_stride, \
221                                 const uint8_t *b, int b_stride, \
222                                 uint32_t *sse) {                \
223     int sum;                                                    \
224     variance(a, a_stride, b, b_stride, W, H, sse, &sum);        \
225     return *sse;                                                \
226   }
227 
228 /* All three forms of the variance are available in the same sizes. */
229 #define VARIANCES(W, H) \
230   VAR(W, H)             \
231   SUBPIX_VAR(W, H)      \
232   SUBPIX_AVG_VAR(W, H)
233 
234 VARIANCES(128, 128)
235 VARIANCES(128, 64)
236 VARIANCES(64, 128)
237 VARIANCES(64, 64)
238 VARIANCES(64, 32)
239 VARIANCES(32, 64)
240 VARIANCES(32, 32)
241 VARIANCES(32, 16)
242 VARIANCES(16, 32)
243 VARIANCES(16, 16)
244 VARIANCES(16, 8)
245 VARIANCES(8, 16)
246 VARIANCES(8, 8)
247 VARIANCES(8, 4)
248 VARIANCES(4, 8)
249 VARIANCES(4, 4)
250 
251 // Realtime mode doesn't use rectangular blocks.
252 #if !CONFIG_REALTIME_ONLY
253 VARIANCES(4, 16)
254 VARIANCES(16, 4)
255 VARIANCES(8, 32)
256 VARIANCES(32, 8)
257 VARIANCES(16, 64)
258 VARIANCES(64, 16)
259 #endif
260 
261 MSE(16, 16)
262 MSE(16, 8)
263 MSE(8, 16)
264 MSE(8, 8)
265 
aom_comp_avg_pred_c(uint8_t * comp_pred,const uint8_t * pred,int width,int height,const uint8_t * ref,int ref_stride)266 void aom_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
267                          int height, const uint8_t *ref, int ref_stride) {
268   int i, j;
269 
270   for (i = 0; i < height; ++i) {
271     for (j = 0; j < width; ++j) {
272       const int tmp = pred[j] + ref[j];
273       comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
274     }
275     comp_pred += width;
276     pred += width;
277     ref += ref_stride;
278   }
279 }
280 
aom_dist_wtd_comp_avg_pred_c(uint8_t * comp_pred,const uint8_t * pred,int width,int height,const uint8_t * ref,int ref_stride,const DIST_WTD_COMP_PARAMS * jcp_param)281 void aom_dist_wtd_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred,
282                                   int width, int height, const uint8_t *ref,
283                                   int ref_stride,
284                                   const DIST_WTD_COMP_PARAMS *jcp_param) {
285   int i, j;
286   const int fwd_offset = jcp_param->fwd_offset;
287   const int bck_offset = jcp_param->bck_offset;
288 
289   for (i = 0; i < height; ++i) {
290     for (j = 0; j < width; ++j) {
291       int tmp = pred[j] * bck_offset + ref[j] * fwd_offset;
292       tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS);
293       comp_pred[j] = (uint8_t)tmp;
294     }
295     comp_pred += width;
296     pred += width;
297     ref += ref_stride;
298   }
299 }
300 
301 #if CONFIG_AV1_HIGHBITDEPTH
highbd_variance64(const uint8_t * a8,int a_stride,const uint8_t * b8,int b_stride,int w,int h,uint64_t * sse,int64_t * sum)302 static void highbd_variance64(const uint8_t *a8, int a_stride,
303                               const uint8_t *b8, int b_stride, int w, int h,
304                               uint64_t *sse, int64_t *sum) {
305   const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
306   const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
307   int64_t tsum = 0;
308   uint64_t tsse = 0;
309   for (int i = 0; i < h; ++i) {
310     int32_t lsum = 0;
311     for (int j = 0; j < w; ++j) {
312       const int diff = a[j] - b[j];
313       lsum += diff;
314       tsse += (uint32_t)(diff * diff);
315     }
316     tsum += lsum;
317     a += a_stride;
318     b += b_stride;
319   }
320   *sum = tsum;
321   *sse = tsse;
322 }
323 
aom_highbd_sse_odd_size(const uint8_t * a,int a_stride,const uint8_t * b,int b_stride,int w,int h)324 uint64_t aom_highbd_sse_odd_size(const uint8_t *a, int a_stride,
325                                  const uint8_t *b, int b_stride, int w, int h) {
326   uint64_t sse;
327   int64_t sum;
328   highbd_variance64(a, a_stride, b, b_stride, w, h, &sse, &sum);
329   return sse;
330 }
331 
highbd_8_variance(const uint8_t * a8,int a_stride,const uint8_t * b8,int b_stride,int w,int h,uint32_t * sse,int * sum)332 static void highbd_8_variance(const uint8_t *a8, int a_stride,
333                               const uint8_t *b8, int b_stride, int w, int h,
334                               uint32_t *sse, int *sum) {
335   uint64_t sse_long = 0;
336   int64_t sum_long = 0;
337   highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
338   *sse = (uint32_t)sse_long;
339   *sum = (int)sum_long;
340 }
341 
highbd_10_variance(const uint8_t * a8,int a_stride,const uint8_t * b8,int b_stride,int w,int h,uint32_t * sse,int * sum)342 static void highbd_10_variance(const uint8_t *a8, int a_stride,
343                                const uint8_t *b8, int b_stride, int w, int h,
344                                uint32_t *sse, int *sum) {
345   uint64_t sse_long = 0;
346   int64_t sum_long = 0;
347   highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
348   *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);
349   *sum = (int)ROUND_POWER_OF_TWO(sum_long, 2);
350 }
351 
highbd_12_variance(const uint8_t * a8,int a_stride,const uint8_t * b8,int b_stride,int w,int h,uint32_t * sse,int * sum)352 static void highbd_12_variance(const uint8_t *a8, int a_stride,
353                                const uint8_t *b8, int b_stride, int w, int h,
354                                uint32_t *sse, int *sum) {
355   uint64_t sse_long = 0;
356   int64_t sum_long = 0;
357   highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
358   *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8);
359   *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4);
360 }
361 
362 #define HIGHBD_VAR(W, H)                                                       \
363   uint32_t aom_highbd_8_variance##W##x##H##_c(const uint8_t *a, int a_stride,  \
364                                               const uint8_t *b, int b_stride,  \
365                                               uint32_t *sse) {                 \
366     int sum;                                                                   \
367     highbd_8_variance(a, a_stride, b, b_stride, W, H, sse, &sum);              \
368     return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H));                  \
369   }                                                                            \
370                                                                                \
371   uint32_t aom_highbd_10_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
372                                                const uint8_t *b, int b_stride, \
373                                                uint32_t *sse) {                \
374     int sum;                                                                   \
375     int64_t var;                                                               \
376     highbd_10_variance(a, a_stride, b, b_stride, W, H, sse, &sum);             \
377     var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));                  \
378     return (var >= 0) ? (uint32_t)var : 0;                                     \
379   }                                                                            \
380                                                                                \
381   uint32_t aom_highbd_12_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
382                                                const uint8_t *b, int b_stride, \
383                                                uint32_t *sse) {                \
384     int sum;                                                                   \
385     int64_t var;                                                               \
386     highbd_12_variance(a, a_stride, b, b_stride, W, H, sse, &sum);             \
387     var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));                  \
388     return (var >= 0) ? (uint32_t)var : 0;                                     \
389   }
390 
391 #define HIGHBD_MSE(W, H)                                                      \
392   uint32_t aom_highbd_8_mse##W##x##H##_c(const uint8_t *src, int src_stride,  \
393                                          const uint8_t *ref, int ref_stride,  \
394                                          uint32_t *sse) {                     \
395     int sum;                                                                  \
396     highbd_8_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum);     \
397     return *sse;                                                              \
398   }                                                                           \
399                                                                               \
400   uint32_t aom_highbd_10_mse##W##x##H##_c(const uint8_t *src, int src_stride, \
401                                           const uint8_t *ref, int ref_stride, \
402                                           uint32_t *sse) {                    \
403     int sum;                                                                  \
404     highbd_10_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum);    \
405     return *sse;                                                              \
406   }                                                                           \
407                                                                               \
408   uint32_t aom_highbd_12_mse##W##x##H##_c(const uint8_t *src, int src_stride, \
409                                           const uint8_t *ref, int ref_stride, \
410                                           uint32_t *sse) {                    \
411     int sum;                                                                  \
412     highbd_12_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum);    \
413     return *sse;                                                              \
414   }
415 
aom_highbd_var_filter_block2d_bil_first_pass(const uint8_t * src_ptr8,uint16_t * output_ptr,unsigned int src_pixels_per_line,int pixel_step,unsigned int output_height,unsigned int output_width,const uint8_t * filter)416 void aom_highbd_var_filter_block2d_bil_first_pass(
417     const uint8_t *src_ptr8, uint16_t *output_ptr,
418     unsigned int src_pixels_per_line, int pixel_step,
419     unsigned int output_height, unsigned int output_width,
420     const uint8_t *filter) {
421   unsigned int i, j;
422   uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src_ptr8);
423   for (i = 0; i < output_height; ++i) {
424     for (j = 0; j < output_width; ++j) {
425       output_ptr[j] = ROUND_POWER_OF_TWO(
426           (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1],
427           FILTER_BITS);
428 
429       ++src_ptr;
430     }
431 
432     // Next row...
433     src_ptr += src_pixels_per_line - output_width;
434     output_ptr += output_width;
435   }
436 }
437 
aom_highbd_var_filter_block2d_bil_second_pass(const uint16_t * src_ptr,uint16_t * output_ptr,unsigned int src_pixels_per_line,unsigned int pixel_step,unsigned int output_height,unsigned int output_width,const uint8_t * filter)438 void aom_highbd_var_filter_block2d_bil_second_pass(
439     const uint16_t *src_ptr, uint16_t *output_ptr,
440     unsigned int src_pixels_per_line, unsigned int pixel_step,
441     unsigned int output_height, unsigned int output_width,
442     const uint8_t *filter) {
443   unsigned int i, j;
444 
445   for (i = 0; i < output_height; ++i) {
446     for (j = 0; j < output_width; ++j) {
447       output_ptr[j] = ROUND_POWER_OF_TWO(
448           (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1],
449           FILTER_BITS);
450       ++src_ptr;
451     }
452 
453     src_ptr += src_pixels_per_line - output_width;
454     output_ptr += output_width;
455   }
456 }
457 
458 #define HIGHBD_SUBPIX_VAR(W, H)                                              \
459   uint32_t aom_highbd_8_sub_pixel_variance##W##x##H##_c(                     \
460       const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
461       const uint8_t *dst, int dst_stride, uint32_t *sse) {                   \
462     uint16_t fdata3[(H + 1) * W];                                            \
463     uint16_t temp2[H * W];                                                   \
464                                                                              \
465     aom_highbd_var_filter_block2d_bil_first_pass(                            \
466         src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
467     aom_highbd_var_filter_block2d_bil_second_pass(                           \
468         fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
469                                                                              \
470     return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W,  \
471                                               dst, dst_stride, sse);         \
472   }                                                                          \
473                                                                              \
474   uint32_t aom_highbd_10_sub_pixel_variance##W##x##H##_c(                    \
475       const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
476       const uint8_t *dst, int dst_stride, uint32_t *sse) {                   \
477     uint16_t fdata3[(H + 1) * W];                                            \
478     uint16_t temp2[H * W];                                                   \
479                                                                              \
480     aom_highbd_var_filter_block2d_bil_first_pass(                            \
481         src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
482     aom_highbd_var_filter_block2d_bil_second_pass(                           \
483         fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
484                                                                              \
485     return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \
486                                                dst, dst_stride, sse);        \
487   }                                                                          \
488                                                                              \
489   uint32_t aom_highbd_12_sub_pixel_variance##W##x##H##_c(                    \
490       const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
491       const uint8_t *dst, int dst_stride, uint32_t *sse) {                   \
492     uint16_t fdata3[(H + 1) * W];                                            \
493     uint16_t temp2[H * W];                                                   \
494                                                                              \
495     aom_highbd_var_filter_block2d_bil_first_pass(                            \
496         src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
497     aom_highbd_var_filter_block2d_bil_second_pass(                           \
498         fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
499                                                                              \
500     return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \
501                                                dst, dst_stride, sse);        \
502   }
503 
504 #define HIGHBD_SUBPIX_AVG_VAR(W, H)                                           \
505   uint32_t aom_highbd_8_sub_pixel_avg_variance##W##x##H##_c(                  \
506       const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
507       const uint8_t *dst, int dst_stride, uint32_t *sse,                      \
508       const uint8_t *second_pred) {                                           \
509     uint16_t fdata3[(H + 1) * W];                                             \
510     uint16_t temp2[H * W];                                                    \
511     DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                              \
512                                                                               \
513     aom_highbd_var_filter_block2d_bil_first_pass(                             \
514         src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
515     aom_highbd_var_filter_block2d_bil_second_pass(                            \
516         fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
517                                                                               \
518     aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H,  \
519                                CONVERT_TO_BYTEPTR(temp2), W);                 \
520                                                                               \
521     return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,   \
522                                               dst, dst_stride, sse);          \
523   }                                                                           \
524                                                                               \
525   uint32_t aom_highbd_10_sub_pixel_avg_variance##W##x##H##_c(                 \
526       const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
527       const uint8_t *dst, int dst_stride, uint32_t *sse,                      \
528       const uint8_t *second_pred) {                                           \
529     uint16_t fdata3[(H + 1) * W];                                             \
530     uint16_t temp2[H * W];                                                    \
531     DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                              \
532                                                                               \
533     aom_highbd_var_filter_block2d_bil_first_pass(                             \
534         src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
535     aom_highbd_var_filter_block2d_bil_second_pass(                            \
536         fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
537                                                                               \
538     aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H,  \
539                                CONVERT_TO_BYTEPTR(temp2), W);                 \
540                                                                               \
541     return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,  \
542                                                dst, dst_stride, sse);         \
543   }                                                                           \
544                                                                               \
545   uint32_t aom_highbd_12_sub_pixel_avg_variance##W##x##H##_c(                 \
546       const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
547       const uint8_t *dst, int dst_stride, uint32_t *sse,                      \
548       const uint8_t *second_pred) {                                           \
549     uint16_t fdata3[(H + 1) * W];                                             \
550     uint16_t temp2[H * W];                                                    \
551     DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                              \
552                                                                               \
553     aom_highbd_var_filter_block2d_bil_first_pass(                             \
554         src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
555     aom_highbd_var_filter_block2d_bil_second_pass(                            \
556         fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
557                                                                               \
558     aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H,  \
559                                CONVERT_TO_BYTEPTR(temp2), W);                 \
560                                                                               \
561     return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,  \
562                                                dst, dst_stride, sse);         \
563   }                                                                           \
564                                                                               \
565   uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance##W##x##H##_c(         \
566       const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
567       const uint8_t *dst, int dst_stride, uint32_t *sse,                      \
568       const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) {    \
569     uint16_t fdata3[(H + 1) * W];                                             \
570     uint16_t temp2[H * W];                                                    \
571     DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                              \
572                                                                               \
573     aom_highbd_var_filter_block2d_bil_first_pass(                             \
574         src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
575     aom_highbd_var_filter_block2d_bil_second_pass(                            \
576         fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
577                                                                               \
578     aom_highbd_dist_wtd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, \
579                                       W, H, CONVERT_TO_BYTEPTR(temp2), W,     \
580                                       jcp_param);                             \
581                                                                               \
582     return aom_highbd_8_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst,  \
583                                           dst_stride, sse);                   \
584   }                                                                           \
585                                                                               \
586   uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance##W##x##H##_c(        \
587       const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
588       const uint8_t *dst, int dst_stride, uint32_t *sse,                      \
589       const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) {    \
590     uint16_t fdata3[(H + 1) * W];                                             \
591     uint16_t temp2[H * W];                                                    \
592     DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                              \
593                                                                               \
594     aom_highbd_var_filter_block2d_bil_first_pass(                             \
595         src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
596     aom_highbd_var_filter_block2d_bil_second_pass(                            \
597         fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
598                                                                               \
599     aom_highbd_dist_wtd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, \
600                                       W, H, CONVERT_TO_BYTEPTR(temp2), W,     \
601                                       jcp_param);                             \
602                                                                               \
603     return aom_highbd_10_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst, \
604                                            dst_stride, sse);                  \
605   }                                                                           \
606                                                                               \
607   uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance##W##x##H##_c(        \
608       const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
609       const uint8_t *dst, int dst_stride, uint32_t *sse,                      \
610       const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) {    \
611     uint16_t fdata3[(H + 1) * W];                                             \
612     uint16_t temp2[H * W];                                                    \
613     DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                              \
614                                                                               \
615     aom_highbd_var_filter_block2d_bil_first_pass(                             \
616         src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
617     aom_highbd_var_filter_block2d_bil_second_pass(                            \
618         fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
619                                                                               \
620     aom_highbd_dist_wtd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, \
621                                       W, H, CONVERT_TO_BYTEPTR(temp2), W,     \
622                                       jcp_param);                             \
623                                                                               \
624     return aom_highbd_12_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst, \
625                                            dst_stride, sse);                  \
626   }
627 
628 /* All three forms of the variance are available in the same sizes. */
629 #define HIGHBD_VARIANCES(W, H) \
630   HIGHBD_VAR(W, H)             \
631   HIGHBD_SUBPIX_VAR(W, H)      \
632   HIGHBD_SUBPIX_AVG_VAR(W, H)
633 
634 HIGHBD_VARIANCES(128, 128)
635 HIGHBD_VARIANCES(128, 64)
636 HIGHBD_VARIANCES(64, 128)
637 HIGHBD_VARIANCES(64, 64)
638 HIGHBD_VARIANCES(64, 32)
639 HIGHBD_VARIANCES(32, 64)
640 HIGHBD_VARIANCES(32, 32)
641 HIGHBD_VARIANCES(32, 16)
642 HIGHBD_VARIANCES(16, 32)
643 HIGHBD_VARIANCES(16, 16)
644 HIGHBD_VARIANCES(16, 8)
645 HIGHBD_VARIANCES(8, 16)
646 HIGHBD_VARIANCES(8, 8)
647 HIGHBD_VARIANCES(8, 4)
648 HIGHBD_VARIANCES(4, 8)
649 HIGHBD_VARIANCES(4, 4)
650 
651 // Realtime mode doesn't use 4x rectangular blocks.
652 #if !CONFIG_REALTIME_ONLY
653 HIGHBD_VARIANCES(4, 16)
654 HIGHBD_VARIANCES(16, 4)
655 HIGHBD_VARIANCES(8, 32)
656 HIGHBD_VARIANCES(32, 8)
657 HIGHBD_VARIANCES(16, 64)
658 HIGHBD_VARIANCES(64, 16)
659 #endif
660 
661 HIGHBD_MSE(16, 16)
662 HIGHBD_MSE(16, 8)
663 HIGHBD_MSE(8, 16)
664 HIGHBD_MSE(8, 8)
665 
aom_highbd_comp_avg_pred_c(uint8_t * comp_pred8,const uint8_t * pred8,int width,int height,const uint8_t * ref8,int ref_stride)666 void aom_highbd_comp_avg_pred_c(uint8_t *comp_pred8, const uint8_t *pred8,
667                                 int width, int height, const uint8_t *ref8,
668                                 int ref_stride) {
669   int i, j;
670   uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
671   uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
672   uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
673   for (i = 0; i < height; ++i) {
674     for (j = 0; j < width; ++j) {
675       const int tmp = pred[j] + ref[j];
676       comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
677     }
678     comp_pred += width;
679     pred += width;
680     ref += ref_stride;
681   }
682 }
683 
aom_highbd_dist_wtd_comp_avg_pred_c(uint8_t * comp_pred8,const uint8_t * pred8,int width,int height,const uint8_t * ref8,int ref_stride,const DIST_WTD_COMP_PARAMS * jcp_param)684 void aom_highbd_dist_wtd_comp_avg_pred_c(
685     uint8_t *comp_pred8, const uint8_t *pred8, int width, int height,
686     const uint8_t *ref8, int ref_stride,
687     const DIST_WTD_COMP_PARAMS *jcp_param) {
688   int i, j;
689   const int fwd_offset = jcp_param->fwd_offset;
690   const int bck_offset = jcp_param->bck_offset;
691   uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
692   uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
693   uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
694 
695   for (i = 0; i < height; ++i) {
696     for (j = 0; j < width; ++j) {
697       int tmp = pred[j] * bck_offset + ref[j] * fwd_offset;
698       tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS);
699       comp_pred[j] = (uint16_t)tmp;
700     }
701     comp_pred += width;
702     pred += width;
703     ref += ref_stride;
704   }
705 }
706 #endif  // CONFIG_AV1_HIGHBITDEPTH
707 
aom_comp_mask_pred_c(uint8_t * comp_pred,const uint8_t * pred,int width,int height,const uint8_t * ref,int ref_stride,const uint8_t * mask,int mask_stride,int invert_mask)708 void aom_comp_mask_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
709                           int height, const uint8_t *ref, int ref_stride,
710                           const uint8_t *mask, int mask_stride,
711                           int invert_mask) {
712   int i, j;
713   const uint8_t *src0 = invert_mask ? pred : ref;
714   const uint8_t *src1 = invert_mask ? ref : pred;
715   const int stride0 = invert_mask ? width : ref_stride;
716   const int stride1 = invert_mask ? ref_stride : width;
717   for (i = 0; i < height; ++i) {
718     for (j = 0; j < width; ++j) {
719       comp_pred[j] = AOM_BLEND_A64(mask[j], src0[j], src1[j]);
720     }
721     comp_pred += width;
722     src0 += stride0;
723     src1 += stride1;
724     mask += mask_stride;
725   }
726 }
727 
728 #define MASK_SUBPIX_VAR(W, H)                                                 \
729   unsigned int aom_masked_sub_pixel_variance##W##x##H##_c(                    \
730       const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
731       const uint8_t *ref, int ref_stride, const uint8_t *second_pred,         \
732       const uint8_t *msk, int msk_stride, int invert_mask,                    \
733       unsigned int *sse) {                                                    \
734     uint16_t fdata3[(H + 1) * W];                                             \
735     uint8_t temp2[H * W];                                                     \
736     DECLARE_ALIGNED(16, uint8_t, temp3[H * W]);                               \
737                                                                               \
738     var_filter_block2d_bil_first_pass_c(src, fdata3, src_stride, 1, H + 1, W, \
739                                         bilinear_filters_2t[xoffset]);        \
740     var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W,           \
741                                          bilinear_filters_2t[yoffset]);       \
742                                                                               \
743     aom_comp_mask_pred_c(temp3, second_pred, W, H, temp2, W, msk, msk_stride, \
744                          invert_mask);                                        \
745     return aom_variance##W##x##H##_c(temp3, W, ref, ref_stride, sse);         \
746   }
747 
748 MASK_SUBPIX_VAR(4, 4)
749 MASK_SUBPIX_VAR(4, 8)
750 MASK_SUBPIX_VAR(8, 4)
751 MASK_SUBPIX_VAR(8, 8)
752 MASK_SUBPIX_VAR(8, 16)
753 MASK_SUBPIX_VAR(16, 8)
754 MASK_SUBPIX_VAR(16, 16)
755 MASK_SUBPIX_VAR(16, 32)
756 MASK_SUBPIX_VAR(32, 16)
757 MASK_SUBPIX_VAR(32, 32)
758 MASK_SUBPIX_VAR(32, 64)
759 MASK_SUBPIX_VAR(64, 32)
760 MASK_SUBPIX_VAR(64, 64)
761 MASK_SUBPIX_VAR(64, 128)
762 MASK_SUBPIX_VAR(128, 64)
763 MASK_SUBPIX_VAR(128, 128)
764 
765 // Realtime mode doesn't use 4x rectangular blocks.
766 #if !CONFIG_REALTIME_ONLY
767 MASK_SUBPIX_VAR(4, 16)
768 MASK_SUBPIX_VAR(16, 4)
769 MASK_SUBPIX_VAR(8, 32)
770 MASK_SUBPIX_VAR(32, 8)
771 MASK_SUBPIX_VAR(16, 64)
772 MASK_SUBPIX_VAR(64, 16)
773 #endif
774 
775 #if CONFIG_AV1_HIGHBITDEPTH
aom_highbd_comp_mask_pred_c(uint8_t * comp_pred8,const uint8_t * pred8,int width,int height,const uint8_t * ref8,int ref_stride,const uint8_t * mask,int mask_stride,int invert_mask)776 void aom_highbd_comp_mask_pred_c(uint8_t *comp_pred8, const uint8_t *pred8,
777                                  int width, int height, const uint8_t *ref8,
778                                  int ref_stride, const uint8_t *mask,
779                                  int mask_stride, int invert_mask) {
780   int i, j;
781   uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
782   uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
783   uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
784   for (i = 0; i < height; ++i) {
785     for (j = 0; j < width; ++j) {
786       if (!invert_mask)
787         comp_pred[j] = AOM_BLEND_A64(mask[j], ref[j], pred[j]);
788       else
789         comp_pred[j] = AOM_BLEND_A64(mask[j], pred[j], ref[j]);
790     }
791     comp_pred += width;
792     pred += width;
793     ref += ref_stride;
794     mask += mask_stride;
795   }
796 }
797 
798 #define HIGHBD_MASK_SUBPIX_VAR(W, H)                                           \
799   unsigned int aom_highbd_8_masked_sub_pixel_variance##W##x##H##_c(            \
800       const uint8_t *src, int src_stride, int xoffset, int yoffset,            \
801       const uint8_t *ref, int ref_stride, const uint8_t *second_pred,          \
802       const uint8_t *msk, int msk_stride, int invert_mask,                     \
803       unsigned int *sse) {                                                     \
804     uint16_t fdata3[(H + 1) * W];                                              \
805     uint16_t temp2[H * W];                                                     \
806     DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                               \
807                                                                                \
808     aom_highbd_var_filter_block2d_bil_first_pass(                              \
809         src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);   \
810     aom_highbd_var_filter_block2d_bil_second_pass(                             \
811         fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);              \
812                                                                                \
813     aom_highbd_comp_mask_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H,  \
814                                 CONVERT_TO_BYTEPTR(temp2), W, msk, msk_stride, \
815                                 invert_mask);                                  \
816                                                                                \
817     return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,    \
818                                               ref, ref_stride, sse);           \
819   }                                                                            \
820                                                                                \
821   unsigned int aom_highbd_10_masked_sub_pixel_variance##W##x##H##_c(           \
822       const uint8_t *src, int src_stride, int xoffset, int yoffset,            \
823       const uint8_t *ref, int ref_stride, const uint8_t *second_pred,          \
824       const uint8_t *msk, int msk_stride, int invert_mask,                     \
825       unsigned int *sse) {                                                     \
826     uint16_t fdata3[(H + 1) * W];                                              \
827     uint16_t temp2[H * W];                                                     \
828     DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                               \
829                                                                                \
830     aom_highbd_var_filter_block2d_bil_first_pass(                              \
831         src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);   \
832     aom_highbd_var_filter_block2d_bil_second_pass(                             \
833         fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);              \
834                                                                                \
835     aom_highbd_comp_mask_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H,  \
836                                 CONVERT_TO_BYTEPTR(temp2), W, msk, msk_stride, \
837                                 invert_mask);                                  \
838                                                                                \
839     return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,   \
840                                                ref, ref_stride, sse);          \
841   }                                                                            \
842                                                                                \
843   unsigned int aom_highbd_12_masked_sub_pixel_variance##W##x##H##_c(           \
844       const uint8_t *src, int src_stride, int xoffset, int yoffset,            \
845       const uint8_t *ref, int ref_stride, const uint8_t *second_pred,          \
846       const uint8_t *msk, int msk_stride, int invert_mask,                     \
847       unsigned int *sse) {                                                     \
848     uint16_t fdata3[(H + 1) * W];                                              \
849     uint16_t temp2[H * W];                                                     \
850     DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                               \
851                                                                                \
852     aom_highbd_var_filter_block2d_bil_first_pass(                              \
853         src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);   \
854     aom_highbd_var_filter_block2d_bil_second_pass(                             \
855         fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);              \
856                                                                                \
857     aom_highbd_comp_mask_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H,  \
858                                 CONVERT_TO_BYTEPTR(temp2), W, msk, msk_stride, \
859                                 invert_mask);                                  \
860                                                                                \
861     return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,   \
862                                                ref, ref_stride, sse);          \
863   }
864 
865 HIGHBD_MASK_SUBPIX_VAR(4, 4)
866 HIGHBD_MASK_SUBPIX_VAR(4, 8)
867 HIGHBD_MASK_SUBPIX_VAR(8, 4)
868 HIGHBD_MASK_SUBPIX_VAR(8, 8)
869 HIGHBD_MASK_SUBPIX_VAR(8, 16)
870 HIGHBD_MASK_SUBPIX_VAR(16, 8)
871 HIGHBD_MASK_SUBPIX_VAR(16, 16)
872 HIGHBD_MASK_SUBPIX_VAR(16, 32)
873 HIGHBD_MASK_SUBPIX_VAR(32, 16)
874 HIGHBD_MASK_SUBPIX_VAR(32, 32)
875 HIGHBD_MASK_SUBPIX_VAR(32, 64)
876 HIGHBD_MASK_SUBPIX_VAR(64, 32)
877 HIGHBD_MASK_SUBPIX_VAR(64, 64)
878 HIGHBD_MASK_SUBPIX_VAR(64, 128)
879 HIGHBD_MASK_SUBPIX_VAR(128, 64)
880 HIGHBD_MASK_SUBPIX_VAR(128, 128)
881 #if !CONFIG_REALTIME_ONLY
882 HIGHBD_MASK_SUBPIX_VAR(4, 16)
883 HIGHBD_MASK_SUBPIX_VAR(16, 4)
884 HIGHBD_MASK_SUBPIX_VAR(8, 32)
885 HIGHBD_MASK_SUBPIX_VAR(32, 8)
886 HIGHBD_MASK_SUBPIX_VAR(16, 64)
887 HIGHBD_MASK_SUBPIX_VAR(64, 16)
888 #endif
889 #endif  // CONFIG_AV1_HIGHBITDEPTH
890 
891 #if !CONFIG_REALTIME_ONLY
obmc_variance(const uint8_t * pre,int pre_stride,const int32_t * wsrc,const int32_t * mask,int w,int h,unsigned int * sse,int * sum)892 static INLINE void obmc_variance(const uint8_t *pre, int pre_stride,
893                                  const int32_t *wsrc, const int32_t *mask,
894                                  int w, int h, unsigned int *sse, int *sum) {
895   int i, j;
896 
897   *sse = 0;
898   *sum = 0;
899 
900   for (i = 0; i < h; i++) {
901     for (j = 0; j < w; j++) {
902       int diff = ROUND_POWER_OF_TWO_SIGNED(wsrc[j] - pre[j] * mask[j], 12);
903       *sum += diff;
904       *sse += diff * diff;
905     }
906 
907     pre += pre_stride;
908     wsrc += w;
909     mask += w;
910   }
911 }
912 
913 #define OBMC_VAR(W, H)                                            \
914   unsigned int aom_obmc_variance##W##x##H##_c(                    \
915       const uint8_t *pre, int pre_stride, const int32_t *wsrc,    \
916       const int32_t *mask, unsigned int *sse) {                   \
917     int sum;                                                      \
918     obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum);  \
919     return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \
920   }
921 
922 #define OBMC_SUBPIX_VAR(W, H)                                                 \
923   unsigned int aom_obmc_sub_pixel_variance##W##x##H##_c(                      \
924       const uint8_t *pre, int pre_stride, int xoffset, int yoffset,           \
925       const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {          \
926     uint16_t fdata3[(H + 1) * W];                                             \
927     uint8_t temp2[H * W];                                                     \
928                                                                               \
929     var_filter_block2d_bil_first_pass_c(pre, fdata3, pre_stride, 1, H + 1, W, \
930                                         bilinear_filters_2t[xoffset]);        \
931     var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W,           \
932                                          bilinear_filters_2t[yoffset]);       \
933                                                                               \
934     return aom_obmc_variance##W##x##H##_c(temp2, W, wsrc, mask, sse);         \
935   }
936 
937 OBMC_VAR(4, 4)
938 OBMC_SUBPIX_VAR(4, 4)
939 
940 OBMC_VAR(4, 8)
941 OBMC_SUBPIX_VAR(4, 8)
942 
943 OBMC_VAR(8, 4)
944 OBMC_SUBPIX_VAR(8, 4)
945 
946 OBMC_VAR(8, 8)
947 OBMC_SUBPIX_VAR(8, 8)
948 
949 OBMC_VAR(8, 16)
950 OBMC_SUBPIX_VAR(8, 16)
951 
952 OBMC_VAR(16, 8)
953 OBMC_SUBPIX_VAR(16, 8)
954 
955 OBMC_VAR(16, 16)
956 OBMC_SUBPIX_VAR(16, 16)
957 
958 OBMC_VAR(16, 32)
959 OBMC_SUBPIX_VAR(16, 32)
960 
961 OBMC_VAR(32, 16)
962 OBMC_SUBPIX_VAR(32, 16)
963 
964 OBMC_VAR(32, 32)
965 OBMC_SUBPIX_VAR(32, 32)
966 
967 OBMC_VAR(32, 64)
968 OBMC_SUBPIX_VAR(32, 64)
969 
970 OBMC_VAR(64, 32)
971 OBMC_SUBPIX_VAR(64, 32)
972 
973 OBMC_VAR(64, 64)
974 OBMC_SUBPIX_VAR(64, 64)
975 
976 OBMC_VAR(64, 128)
977 OBMC_SUBPIX_VAR(64, 128)
978 
979 OBMC_VAR(128, 64)
980 OBMC_SUBPIX_VAR(128, 64)
981 
982 OBMC_VAR(128, 128)
983 OBMC_SUBPIX_VAR(128, 128)
984 
985 OBMC_VAR(4, 16)
986 OBMC_SUBPIX_VAR(4, 16)
987 OBMC_VAR(16, 4)
988 OBMC_SUBPIX_VAR(16, 4)
989 OBMC_VAR(8, 32)
990 OBMC_SUBPIX_VAR(8, 32)
991 OBMC_VAR(32, 8)
992 OBMC_SUBPIX_VAR(32, 8)
993 OBMC_VAR(16, 64)
994 OBMC_SUBPIX_VAR(16, 64)
995 OBMC_VAR(64, 16)
996 OBMC_SUBPIX_VAR(64, 16)
997 
998 #if CONFIG_AV1_HIGHBITDEPTH
highbd_obmc_variance64(const uint8_t * pre8,int pre_stride,const int32_t * wsrc,const int32_t * mask,int w,int h,uint64_t * sse,int64_t * sum)999 static INLINE void highbd_obmc_variance64(const uint8_t *pre8, int pre_stride,
1000                                           const int32_t *wsrc,
1001                                           const int32_t *mask, int w, int h,
1002                                           uint64_t *sse, int64_t *sum) {
1003   int i, j;
1004   uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
1005 
1006   *sse = 0;
1007   *sum = 0;
1008 
1009   for (i = 0; i < h; i++) {
1010     for (j = 0; j < w; j++) {
1011       int diff = ROUND_POWER_OF_TWO_SIGNED(wsrc[j] - pre[j] * mask[j], 12);
1012       *sum += diff;
1013       *sse += diff * diff;
1014     }
1015 
1016     pre += pre_stride;
1017     wsrc += w;
1018     mask += w;
1019   }
1020 }
1021 
highbd_obmc_variance(const uint8_t * pre8,int pre_stride,const int32_t * wsrc,const int32_t * mask,int w,int h,unsigned int * sse,int * sum)1022 static INLINE void highbd_obmc_variance(const uint8_t *pre8, int pre_stride,
1023                                         const int32_t *wsrc,
1024                                         const int32_t *mask, int w, int h,
1025                                         unsigned int *sse, int *sum) {
1026   int64_t sum64;
1027   uint64_t sse64;
1028   highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &sse64, &sum64);
1029   *sum = (int)sum64;
1030   *sse = (unsigned int)sse64;
1031 }
1032 
highbd_10_obmc_variance(const uint8_t * pre8,int pre_stride,const int32_t * wsrc,const int32_t * mask,int w,int h,unsigned int * sse,int * sum)1033 static INLINE void highbd_10_obmc_variance(const uint8_t *pre8, int pre_stride,
1034                                            const int32_t *wsrc,
1035                                            const int32_t *mask, int w, int h,
1036                                            unsigned int *sse, int *sum) {
1037   int64_t sum64;
1038   uint64_t sse64;
1039   highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &sse64, &sum64);
1040   *sum = (int)ROUND_POWER_OF_TWO(sum64, 2);
1041   *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 4);
1042 }
1043 
highbd_12_obmc_variance(const uint8_t * pre8,int pre_stride,const int32_t * wsrc,const int32_t * mask,int w,int h,unsigned int * sse,int * sum)1044 static INLINE void highbd_12_obmc_variance(const uint8_t *pre8, int pre_stride,
1045                                            const int32_t *wsrc,
1046                                            const int32_t *mask, int w, int h,
1047                                            unsigned int *sse, int *sum) {
1048   int64_t sum64;
1049   uint64_t sse64;
1050   highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &sse64, &sum64);
1051   *sum = (int)ROUND_POWER_OF_TWO(sum64, 4);
1052   *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 8);
1053 }
1054 
1055 #define HIGHBD_OBMC_VAR(W, H)                                              \
1056   unsigned int aom_highbd_8_obmc_variance##W##x##H##_c(                    \
1057       const uint8_t *pre, int pre_stride, const int32_t *wsrc,             \
1058       const int32_t *mask, unsigned int *sse) {                            \
1059     int sum;                                                               \
1060     highbd_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum);    \
1061     return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H));          \
1062   }                                                                        \
1063                                                                            \
1064   unsigned int aom_highbd_10_obmc_variance##W##x##H##_c(                   \
1065       const uint8_t *pre, int pre_stride, const int32_t *wsrc,             \
1066       const int32_t *mask, unsigned int *sse) {                            \
1067     int sum;                                                               \
1068     int64_t var;                                                           \
1069     highbd_10_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
1070     var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));              \
1071     return (var >= 0) ? (uint32_t)var : 0;                                 \
1072   }                                                                        \
1073                                                                            \
1074   unsigned int aom_highbd_12_obmc_variance##W##x##H##_c(                   \
1075       const uint8_t *pre, int pre_stride, const int32_t *wsrc,             \
1076       const int32_t *mask, unsigned int *sse) {                            \
1077     int sum;                                                               \
1078     int64_t var;                                                           \
1079     highbd_12_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
1080     var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));              \
1081     return (var >= 0) ? (uint32_t)var : 0;                                 \
1082   }
1083 
1084 #define HIGHBD_OBMC_SUBPIX_VAR(W, H)                                           \
1085   unsigned int aom_highbd_8_obmc_sub_pixel_variance##W##x##H##_c(              \
1086       const uint8_t *pre, int pre_stride, int xoffset, int yoffset,            \
1087       const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {           \
1088     uint16_t fdata3[(H + 1) * W];                                              \
1089     uint16_t temp2[H * W];                                                     \
1090                                                                                \
1091     aom_highbd_var_filter_block2d_bil_first_pass(                              \
1092         pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);   \
1093     aom_highbd_var_filter_block2d_bil_second_pass(                             \
1094         fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);              \
1095                                                                                \
1096     return aom_highbd_8_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2),  \
1097                                                    W, wsrc, mask, sse);        \
1098   }                                                                            \
1099                                                                                \
1100   unsigned int aom_highbd_10_obmc_sub_pixel_variance##W##x##H##_c(             \
1101       const uint8_t *pre, int pre_stride, int xoffset, int yoffset,            \
1102       const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {           \
1103     uint16_t fdata3[(H + 1) * W];                                              \
1104     uint16_t temp2[H * W];                                                     \
1105                                                                                \
1106     aom_highbd_var_filter_block2d_bil_first_pass(                              \
1107         pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);   \
1108     aom_highbd_var_filter_block2d_bil_second_pass(                             \
1109         fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);              \
1110                                                                                \
1111     return aom_highbd_10_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
1112                                                     W, wsrc, mask, sse);       \
1113   }                                                                            \
1114                                                                                \
1115   unsigned int aom_highbd_12_obmc_sub_pixel_variance##W##x##H##_c(             \
1116       const uint8_t *pre, int pre_stride, int xoffset, int yoffset,            \
1117       const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {           \
1118     uint16_t fdata3[(H + 1) * W];                                              \
1119     uint16_t temp2[H * W];                                                     \
1120                                                                                \
1121     aom_highbd_var_filter_block2d_bil_first_pass(                              \
1122         pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);   \
1123     aom_highbd_var_filter_block2d_bil_second_pass(                             \
1124         fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);              \
1125                                                                                \
1126     return aom_highbd_12_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
1127                                                     W, wsrc, mask, sse);       \
1128   }
1129 
1130 HIGHBD_OBMC_VAR(4, 4)
1131 HIGHBD_OBMC_SUBPIX_VAR(4, 4)
1132 
1133 HIGHBD_OBMC_VAR(4, 8)
1134 HIGHBD_OBMC_SUBPIX_VAR(4, 8)
1135 
1136 HIGHBD_OBMC_VAR(8, 4)
1137 HIGHBD_OBMC_SUBPIX_VAR(8, 4)
1138 
1139 HIGHBD_OBMC_VAR(8, 8)
1140 HIGHBD_OBMC_SUBPIX_VAR(8, 8)
1141 
1142 HIGHBD_OBMC_VAR(8, 16)
1143 HIGHBD_OBMC_SUBPIX_VAR(8, 16)
1144 
1145 HIGHBD_OBMC_VAR(16, 8)
1146 HIGHBD_OBMC_SUBPIX_VAR(16, 8)
1147 
1148 HIGHBD_OBMC_VAR(16, 16)
1149 HIGHBD_OBMC_SUBPIX_VAR(16, 16)
1150 
1151 HIGHBD_OBMC_VAR(16, 32)
1152 HIGHBD_OBMC_SUBPIX_VAR(16, 32)
1153 
1154 HIGHBD_OBMC_VAR(32, 16)
1155 HIGHBD_OBMC_SUBPIX_VAR(32, 16)
1156 
1157 HIGHBD_OBMC_VAR(32, 32)
1158 HIGHBD_OBMC_SUBPIX_VAR(32, 32)
1159 
1160 HIGHBD_OBMC_VAR(32, 64)
1161 HIGHBD_OBMC_SUBPIX_VAR(32, 64)
1162 
1163 HIGHBD_OBMC_VAR(64, 32)
1164 HIGHBD_OBMC_SUBPIX_VAR(64, 32)
1165 
1166 HIGHBD_OBMC_VAR(64, 64)
1167 HIGHBD_OBMC_SUBPIX_VAR(64, 64)
1168 
1169 HIGHBD_OBMC_VAR(64, 128)
1170 HIGHBD_OBMC_SUBPIX_VAR(64, 128)
1171 
1172 HIGHBD_OBMC_VAR(128, 64)
1173 HIGHBD_OBMC_SUBPIX_VAR(128, 64)
1174 
1175 HIGHBD_OBMC_VAR(128, 128)
1176 HIGHBD_OBMC_SUBPIX_VAR(128, 128)
1177 
1178 HIGHBD_OBMC_VAR(4, 16)
1179 HIGHBD_OBMC_SUBPIX_VAR(4, 16)
1180 HIGHBD_OBMC_VAR(16, 4)
1181 HIGHBD_OBMC_SUBPIX_VAR(16, 4)
1182 HIGHBD_OBMC_VAR(8, 32)
1183 HIGHBD_OBMC_SUBPIX_VAR(8, 32)
1184 HIGHBD_OBMC_VAR(32, 8)
1185 HIGHBD_OBMC_SUBPIX_VAR(32, 8)
1186 HIGHBD_OBMC_VAR(16, 64)
1187 HIGHBD_OBMC_SUBPIX_VAR(16, 64)
1188 HIGHBD_OBMC_VAR(64, 16)
1189 HIGHBD_OBMC_SUBPIX_VAR(64, 16)
1190 #endif  // CONFIG_AV1_HIGHBITDEPTH
1191 #endif  // !CONFIG_REALTIME_ONLY
1192 
aom_mse_wxh_16bit_c(uint8_t * dst,int dstride,uint16_t * src,int sstride,int w,int h)1193 uint64_t aom_mse_wxh_16bit_c(uint8_t *dst, int dstride, uint16_t *src,
1194                              int sstride, int w, int h) {
1195   uint64_t sum = 0;
1196   for (int i = 0; i < h; i++) {
1197     for (int j = 0; j < w; j++) {
1198       int e = (uint16_t)dst[i * dstride + j] - src[i * sstride + j];
1199       sum += e * e;
1200     }
1201   }
1202   return sum;
1203 }
1204 
aom_mse_16xh_16bit_c(uint8_t * dst,int dstride,uint16_t * src,int w,int h)1205 uint64_t aom_mse_16xh_16bit_c(uint8_t *dst, int dstride, uint16_t *src, int w,
1206                               int h) {
1207   uint16_t *src_temp = src;
1208   uint8_t *dst_temp = dst;
1209   const int num_blks = 16 / w;
1210   int64_t sum = 0;
1211   for (int i = 0; i < num_blks; i++) {
1212     sum += aom_mse_wxh_16bit_c(dst_temp, dstride, src_temp, w, w, h);
1213     dst_temp += w;
1214     src_temp += (w * h);
1215   }
1216   return sum;
1217 }
1218 
aom_mse_wxh_16bit_highbd_c(uint16_t * dst,int dstride,uint16_t * src,int sstride,int w,int h)1219 uint64_t aom_mse_wxh_16bit_highbd_c(uint16_t *dst, int dstride, uint16_t *src,
1220                                     int sstride, int w, int h) {
1221   uint64_t sum = 0;
1222   for (int i = 0; i < h; i++) {
1223     for (int j = 0; j < w; j++) {
1224       int e = dst[i * dstride + j] - src[i * sstride + j];
1225       sum += e * e;
1226     }
1227   }
1228   return sum;
1229 }
1230