• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 #include <assert.h>
12 #include <stdlib.h>
13 
14 #include "config/aom_config.h"
15 #include "config/aom_dsp_rtcd.h"
16 
17 #include "aom/aom_integer.h"
18 #include "aom_ports/mem.h"
19 
20 #include "aom_dsp/aom_filter.h"
21 #include "aom_dsp/blend.h"
22 #include "aom_dsp/variance.h"
23 
24 #include "av1/common/filter.h"
25 #include "av1/common/reconinter.h"
26 
#if !CONFIG_REALTIME_ONLY
// Returns the sum of squares of the 256 int16_t values starting at `a`.
// The caller must provide at least 256 readable elements. The total is
// accumulated in an unsigned int, so it wraps instead of overflowing.
uint32_t aom_get_mb_ss_c(const int16_t *a) {
  unsigned int sum = 0;

  for (int i = 0; i < 256; ++i) {
    sum += a[i] * a[i];
  }

  return sum;
}
#endif  // !CONFIG_REALTIME_ONLY
38 
// Computes the sum of differences and the sum of squared differences between
// the w x h blocks starting at `a` and `b`.
//
// a_stride / b_stride are the distances, in elements, between the starts of
// consecutive rows of `a` and `b`. The results are stored in *sum and *sse;
// the squared differences are accumulated in 32 bits.
static void variance(const uint8_t *a, int a_stride, const uint8_t *b,
                     int b_stride, int w, int h, uint32_t *sse, int *sum) {
  int tsum = 0;
  uint32_t tsse = 0;

  for (int i = 0; i < h; ++i) {
    for (int j = 0; j < w; ++j) {
      const int diff = a[j] - b[j];
      tsum += diff;
      tsse += diff * diff;
    }

    a += a_stride;
    b += b_stride;
  }
  *sum = tsum;
  *sse = tsse;
}
58 
// Returns the sum of squared differences between the w x h blocks `a` and
// `b`. Thin wrapper around variance() that discards the sum of differences.
uint32_t aom_sse_odd_size(const uint8_t *a, int a_stride, const uint8_t *b,
                          int b_stride, int w, int h) {
  uint32_t sse;
  int sum;  // Required by variance(); the value is not used here.
  variance(a, a_stride, b, b_stride, w, h, &sse, &sum);
  return sse;
}
66 
67 // Applies a 1-D 2-tap bilinear filter to the source block in either horizontal
68 // or vertical direction to produce the filtered output block. Used to implement
69 // the first-pass of 2-D separable filter.
70 //
71 // Produces int16_t output to retain precision for the next pass. Two filter
72 // taps should sum to FILTER_WEIGHT. pixel_step defines whether the filter is
73 // applied horizontally (pixel_step = 1) or vertically (pixel_step = stride).
74 // It defines the offset required to move from one input to the next.
var_filter_block2d_bil_first_pass_c(const uint8_t * a,uint16_t * b,unsigned int src_pixels_per_line,unsigned int pixel_step,unsigned int output_height,unsigned int output_width,const uint8_t * filter)75 static void var_filter_block2d_bil_first_pass_c(
76     const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line,
77     unsigned int pixel_step, unsigned int output_height,
78     unsigned int output_width, const uint8_t *filter) {
79   unsigned int i, j;
80 
81   for (i = 0; i < output_height; ++i) {
82     for (j = 0; j < output_width; ++j) {
83       b[j] = ROUND_POWER_OF_TWO(
84           (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS);
85 
86       ++a;
87     }
88 
89     a += src_pixels_per_line - output_width;
90     b += output_width;
91   }
92 }
93 
94 // Applies a 1-D 2-tap bilinear filter to the source block in either horizontal
95 // or vertical direction to produce the filtered output block. Used to implement
96 // the second-pass of 2-D separable filter.
97 //
98 // Requires 16-bit input as produced by filter_block2d_bil_first_pass. Two
99 // filter taps should sum to FILTER_WEIGHT. pixel_step defines whether the
100 // filter is applied horizontally (pixel_step = 1) or vertically
101 // (pixel_step = stride). It defines the offset required to move from one input
102 // to the next. Output is 8-bit.
var_filter_block2d_bil_second_pass_c(const uint16_t * a,uint8_t * b,unsigned int src_pixels_per_line,unsigned int pixel_step,unsigned int output_height,unsigned int output_width,const uint8_t * filter)103 static void var_filter_block2d_bil_second_pass_c(
104     const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line,
105     unsigned int pixel_step, unsigned int output_height,
106     unsigned int output_width, const uint8_t *filter) {
107   unsigned int i, j;
108 
109   for (i = 0; i < output_height; ++i) {
110     for (j = 0; j < output_width; ++j) {
111       b[j] = ROUND_POWER_OF_TWO(
112           (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS);
113       ++a;
114     }
115 
116     a += src_pixels_per_line - output_width;
117     b += output_width;
118   }
119 }
120 
// Defines aom_variance<W>x<H>_c(): stores the sum of squared differences of
// the W x H blocks `a` and `b` in *sse and returns the variance,
// sse - sum^2 / (W * H). W and H must be plain integer literals because they
// are pasted into the function name.
#define VAR(W, H)                                                    \
  uint32_t aom_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
                                     const uint8_t *b, int b_stride, \
                                     uint32_t *sse) {                \
    int sum;                                                         \
    variance(a, a_stride, b, b_stride, W, H, sse, &sum);             \
    return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H));        \
  }
129 
// Defines aom_sub_pixel_variance<W>x<H>_c(): variance between `b` and a
// bilinearly interpolated version of `a`.
//
// xoffset / yoffset index bilinear_filters_2t to select the horizontal and
// vertical 2-tap filters. The first pass filters H + 1 rows horizontally
// (pixel_step = 1) because the vertical second pass (pixel_step = W) reads one
// row below each output row. The filtered W x H block is then passed to
// aom_variance<W>x<H>_c().
#define SUBPIX_VAR(W, H)                                                  \
  uint32_t aom_sub_pixel_variance##W##x##H##_c(                           \
      const uint8_t *a, int a_stride, int xoffset, int yoffset,           \
      const uint8_t *b, int b_stride, uint32_t *sse) {                    \
    uint16_t fdata3[(H + 1) * W];                                         \
    uint8_t temp2[H * W];                                                 \
                                                                          \
    var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \
                                        bilinear_filters_2t[xoffset]);    \
    var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W,       \
                                         bilinear_filters_2t[yoffset]);   \
                                                                          \
    return aom_variance##W##x##H##_c(temp2, W, b, b_stride, sse);         \
  }
144 
// Defines aom_sub_pixel_avg_variance<W>x<H>_c(): same two-pass bilinear
// interpolation of `a` as SUBPIX_VAR, but the interpolated block is first
// combined with `second_pred` through aom_comp_avg_pred() (into temp3, using
// a row pitch of W for both second_pred and the result) before the variance
// against `b` is computed.
#define SUBPIX_AVG_VAR(W, H)                                              \
  uint32_t aom_sub_pixel_avg_variance##W##x##H##_c(                       \
      const uint8_t *a, int a_stride, int xoffset, int yoffset,           \
      const uint8_t *b, int b_stride, uint32_t *sse,                      \
      const uint8_t *second_pred) {                                       \
    uint16_t fdata3[(H + 1) * W];                                         \
    uint8_t temp2[H * W];                                                 \
    DECLARE_ALIGNED(16, uint8_t, temp3[H * W]);                           \
                                                                          \
    var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \
                                        bilinear_filters_2t[xoffset]);    \
    var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W,       \
                                         bilinear_filters_2t[yoffset]);   \
                                                                          \
    aom_comp_avg_pred(temp3, second_pred, W, H, temp2, W);                \
                                                                          \
    return aom_variance##W##x##H##_c(temp3, W, b, b_stride, sse);         \
  }
163 
// Computes SSE, sum and variance for four horizontally adjacent 8x8 blocks
// (block k starts at column offset k * 8 in both `a` and `b`).
//
// sse8x8, sum8x8 and var8x8 each receive four per-block results. *tot_sse and
// *tot_sum are accumulated into (not overwritten), so the caller must
// initialize them before the first call.
void aom_get_var_sse_sum_8x8_quad_c(const uint8_t *a, int a_stride,
                                    const uint8_t *b, int b_stride,
                                    uint32_t *sse8x8, int *sum8x8,
                                    unsigned int *tot_sse, int *tot_sum,
                                    uint32_t *var8x8) {
  // Loop over 4 8x8 blocks. Process one 8x32 block.
  for (int k = 0; k < 4; k++) {
    variance(a + (k * 8), a_stride, b + (k * 8), b_stride, 8, 8, &sse8x8[k],
             &sum8x8[k]);
  }

  // Calculate variance at 8x8 level and total sse, sum of 8x32 block.
  *tot_sse += sse8x8[0] + sse8x8[1] + sse8x8[2] + sse8x8[3];
  *tot_sum += sum8x8[0] + sum8x8[1] + sum8x8[2] + sum8x8[3];
  for (int i = 0; i < 4; i++) {
    // >> 6 divides sum^2 by the 64 pixels of an 8x8 block.
    var8x8[i] = sse8x8[i] - (uint32_t)(((int64_t)sum8x8[i] * sum8x8[i]) >> 6);
  }
}
181 
// Computes SSE and variance for two horizontally adjacent 16x16 blocks
// (block k starts at column offset k * 16 in both src_ptr and ref_ptr).
//
// sse16x16 and var16x16 each receive two per-block results. *tot_sse and
// *tot_sum are accumulated into (not overwritten), so the caller must
// initialize them before the first call.
void aom_get_var_sse_sum_16x16_dual_c(const uint8_t *src_ptr, int source_stride,
                                      const uint8_t *ref_ptr, int ref_stride,
                                      uint32_t *sse16x16, unsigned int *tot_sse,
                                      int *tot_sum, uint32_t *var16x16) {
  int sum16x16[2] = { 0 };
  // Loop over two consecutive 16x16 blocks and process as one 16x32 block.
  for (int k = 0; k < 2; k++) {
    variance(src_ptr + (k * 16), source_stride, ref_ptr + (k * 16), ref_stride,
             16, 16, &sse16x16[k], &sum16x16[k]);
  }

  // Calculate variance at 16x16 level and total sse, sum of 16x32 block.
  *tot_sse += sse16x16[0] + sse16x16[1];
  *tot_sum += sum16x16[0] + sum16x16[1];
  for (int i = 0; i < 2; i++) {
    // >> 8 divides sum^2 by the 256 pixels of a 16x16 block.
    var16x16[i] =
        sse16x16[i] - (uint32_t)(((int64_t)sum16x16[i] * sum16x16[i]) >> 8);
  }
}
200 
/* Identical to the variance call except it does not calculate the
 * sse - sum^2 / w*h and returns sse in addition to modifying the passed in
 * variable.
 */
#define MSE(W, H)                                               \
  uint32_t aom_mse##W##x##H##_c(const uint8_t *a, int a_stride, \
                                const uint8_t *b, int b_stride, \
                                uint32_t *sse) {                \
    int sum;                                                    \
    variance(a, a_stride, b, b_stride, W, H, sse, &sum);        \
    return *sse;                                                \
  }
213 
/* All three forms of the variance are available in the same sizes:
 * VARIANCES(W, H) defines aom_variance, aom_sub_pixel_variance and
 * aom_sub_pixel_avg_variance for a W x H block. */
#define VARIANCES(W, H) \
  VAR(W, H)             \
  SUBPIX_VAR(W, H)      \
  SUBPIX_AVG_VAR(W, H)
219 
220 VARIANCES(128, 128)
221 VARIANCES(128, 64)
222 VARIANCES(64, 128)
223 VARIANCES(64, 64)
224 VARIANCES(64, 32)
225 VARIANCES(32, 64)
226 VARIANCES(32, 32)
227 VARIANCES(32, 16)
228 VARIANCES(16, 32)
229 VARIANCES(16, 16)
230 VARIANCES(16, 8)
231 VARIANCES(8, 16)
232 VARIANCES(8, 8)
233 VARIANCES(8, 4)
234 VARIANCES(4, 8)
235 VARIANCES(4, 4)
236 
237 // Realtime mode doesn't use rectangular blocks.
238 #if !CONFIG_REALTIME_ONLY
239 VARIANCES(4, 16)
240 VARIANCES(16, 4)
241 VARIANCES(8, 32)
242 VARIANCES(32, 8)
243 VARIANCES(16, 64)
244 VARIANCES(64, 16)
245 #endif
246 
247 MSE(16, 16)
248 MSE(16, 8)
249 MSE(8, 16)
250 MSE(8, 8)
251 
// Combines two predictions sample by sample: each output sample is
// ROUND_POWER_OF_TWO(pred + ref, 1).
//
// comp_pred and pred are dense width x height buffers (row pitch = width);
// ref uses a row pitch of ref_stride.
void aom_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
                         int height, const uint8_t *ref, int ref_stride) {
  for (int i = 0; i < height; ++i) {
    for (int j = 0; j < width; ++j) {
      const int tmp = pred[j] + ref[j];
      comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
    }
    comp_pred += width;
    pred += width;
    ref += ref_stride;
  }
}
266 
267 #if CONFIG_AV1_HIGHBITDEPTH
// Computes the sum of differences and the sum of squared differences between
// two w x h blocks of 16-bit samples, using 64-bit totals.
//
// a8 / b8 are converted to uint16_t pointers with CONVERT_TO_SHORTPTR;
// a_stride / b_stride are row pitches in 16-bit samples. Results are stored in
// *sum and *sse.
static void highbd_variance64(const uint8_t *a8, int a_stride,
                              const uint8_t *b8, int b_stride, int w, int h,
                              uint64_t *sse, int64_t *sum) {
  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
  const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
  int64_t tsum = 0;
  uint64_t tsse = 0;
  for (int i = 0; i < h; ++i) {
    int32_t lsum = 0;  // Per-row sum, folded into the 64-bit total below.
    for (int j = 0; j < w; ++j) {
      const int diff = a[j] - b[j];
      lsum += diff;
      // Square in 64 bits: diff can be as large as +/-65535, and diff * diff
      // evaluated in int would overflow (undefined behavior) above 46340.
      tsse += (uint64_t)((int64_t)diff * diff);
    }
    tsum += lsum;
    a += a_stride;
    b += b_stride;
  }
  *sum = tsum;
  *sse = tsse;
}
289 
// Returns the 64-bit sum of squared differences between two w x h blocks of
// 16-bit samples. Thin wrapper around highbd_variance64() that discards the
// sum of differences.
uint64_t aom_highbd_sse_odd_size(const uint8_t *a, int a_stride,
                                 const uint8_t *b, int b_stride, int w, int h) {
  uint64_t sse;
  int64_t sum;  // Required by highbd_variance64(); the value is not used here.
  highbd_variance64(a, a_stride, b, b_stride, w, h, &sse, &sum);
  return sse;
}
297 
// Wrapper around highbd_variance64() used by the aom_highbd_8_* functions:
// the 64-bit SSE and sum are narrowed to 32 bits without any scaling.
static void highbd_8_variance(const uint8_t *a8, int a_stride,
                              const uint8_t *b8, int b_stride, int w, int h,
                              uint32_t *sse, int *sum) {
  uint64_t sse_long = 0;
  int64_t sum_long = 0;
  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
  *sse = (uint32_t)sse_long;
  *sum = (int)sum_long;
}
307 
// Wrapper around highbd_variance64() used by the aom_highbd_10_* functions:
// the SSE is scaled down by 4 bits and the sum by 2 bits (both via
// ROUND_POWER_OF_TWO) before being narrowed to 32 bits.
static void highbd_10_variance(const uint8_t *a8, int a_stride,
                               const uint8_t *b8, int b_stride, int w, int h,
                               uint32_t *sse, int *sum) {
  uint64_t sse_long = 0;
  int64_t sum_long = 0;
  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);
  *sum = (int)ROUND_POWER_OF_TWO(sum_long, 2);
}
317 
// Wrapper around highbd_variance64() used by the aom_highbd_12_* functions:
// the SSE is scaled down by 8 bits and the sum by 4 bits (both via
// ROUND_POWER_OF_TWO) before being narrowed to 32 bits.
static void highbd_12_variance(const uint8_t *a8, int a_stride,
                               const uint8_t *b8, int b_stride, int w, int h,
                               uint32_t *sse, int *sum) {
  uint64_t sse_long = 0;
  int64_t sum_long = 0;
  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8);
  *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4);
}
327 
// Defines aom_highbd_{8,10,12}_variance<W>x<H>_c(): each stores the (scaled)
// sum of squared differences in *sse and returns sse - sum^2 / (W * H).
// The 10- and 12-bit variants compute the result in 64 bits and clamp a
// negative value to 0, since sse and sum are rounded independently by
// highbd_10_variance / highbd_12_variance.
#define HIGHBD_VAR(W, H)                                                       \
  uint32_t aom_highbd_8_variance##W##x##H##_c(const uint8_t *a, int a_stride,  \
                                              const uint8_t *b, int b_stride,  \
                                              uint32_t *sse) {                 \
    int sum;                                                                   \
    highbd_8_variance(a, a_stride, b, b_stride, W, H, sse, &sum);              \
    return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H));                  \
  }                                                                            \
                                                                               \
  uint32_t aom_highbd_10_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
                                               const uint8_t *b, int b_stride, \
                                               uint32_t *sse) {                \
    int sum;                                                                   \
    int64_t var;                                                               \
    highbd_10_variance(a, a_stride, b, b_stride, W, H, sse, &sum);             \
    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));                  \
    return (var >= 0) ? (uint32_t)var : 0;                                     \
  }                                                                            \
                                                                               \
  uint32_t aom_highbd_12_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
                                               const uint8_t *b, int b_stride, \
                                               uint32_t *sse) {                \
    int sum;                                                                   \
    int64_t var;                                                               \
    highbd_12_variance(a, a_stride, b, b_stride, W, H, sse, &sum);             \
    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));                  \
    return (var >= 0) ? (uint32_t)var : 0;                                     \
  }
356 
// Defines aom_highbd_{8,10,12}_mse<W>x<H>_c(): each stores the (scaled) sum of
// squared differences of the W x H blocks in *sse and also returns it. The
// sum of differences produced by the helper is discarded.
#define HIGHBD_MSE(W, H)                                                      \
  uint32_t aom_highbd_8_mse##W##x##H##_c(const uint8_t *src, int src_stride,  \
                                         const uint8_t *ref, int ref_stride,  \
                                         uint32_t *sse) {                     \
    int sum;                                                                  \
    highbd_8_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum);     \
    return *sse;                                                              \
  }                                                                           \
                                                                              \
  uint32_t aom_highbd_10_mse##W##x##H##_c(const uint8_t *src, int src_stride, \
                                          const uint8_t *ref, int ref_stride, \
                                          uint32_t *sse) {                    \
    int sum;                                                                  \
    highbd_10_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum);    \
    return *sse;                                                              \
  }                                                                           \
                                                                              \
  uint32_t aom_highbd_12_mse##W##x##H##_c(const uint8_t *src, int src_stride, \
                                          const uint8_t *ref, int ref_stride, \
                                          uint32_t *sse) {                    \
    int sum;                                                                  \
    highbd_12_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum);    \
    return *sse;                                                              \
  }
381 
aom_highbd_var_filter_block2d_bil_first_pass(const uint8_t * src_ptr8,uint16_t * output_ptr,unsigned int src_pixels_per_line,int pixel_step,unsigned int output_height,unsigned int output_width,const uint8_t * filter)382 void aom_highbd_var_filter_block2d_bil_first_pass(
383     const uint8_t *src_ptr8, uint16_t *output_ptr,
384     unsigned int src_pixels_per_line, int pixel_step,
385     unsigned int output_height, unsigned int output_width,
386     const uint8_t *filter) {
387   unsigned int i, j;
388   uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src_ptr8);
389   for (i = 0; i < output_height; ++i) {
390     for (j = 0; j < output_width; ++j) {
391       output_ptr[j] = ROUND_POWER_OF_TWO(
392           (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1],
393           FILTER_BITS);
394 
395       ++src_ptr;
396     }
397 
398     // Next row...
399     src_ptr += src_pixels_per_line - output_width;
400     output_ptr += output_width;
401   }
402 }
403 
aom_highbd_var_filter_block2d_bil_second_pass(const uint16_t * src_ptr,uint16_t * output_ptr,unsigned int src_pixels_per_line,unsigned int pixel_step,unsigned int output_height,unsigned int output_width,const uint8_t * filter)404 void aom_highbd_var_filter_block2d_bil_second_pass(
405     const uint16_t *src_ptr, uint16_t *output_ptr,
406     unsigned int src_pixels_per_line, unsigned int pixel_step,
407     unsigned int output_height, unsigned int output_width,
408     const uint8_t *filter) {
409   unsigned int i, j;
410 
411   for (i = 0; i < output_height; ++i) {
412     for (j = 0; j < output_width; ++j) {
413       output_ptr[j] = ROUND_POWER_OF_TWO(
414           (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1],
415           FILTER_BITS);
416       ++src_ptr;
417     }
418 
419     src_ptr += src_pixels_per_line - output_width;
420     output_ptr += output_width;
421   }
422 }
423 
// Defines aom_highbd_{8,10,12}_sub_pixel_variance<W>x<H>_c(): variance between
// `dst` and a bilinearly interpolated version of `src`.
//
// xoffset / yoffset index bilinear_filters_2t. The first pass filters H + 1
// rows horizontally (pixel_step = 1) because the vertical second pass
// (pixel_step = W) reads one row below each output row. The three variants
// differ only in which aom_highbd_*_variance function receives the result.
#define HIGHBD_SUBPIX_VAR(W, H)                                              \
  uint32_t aom_highbd_8_sub_pixel_variance##W##x##H##_c(                     \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *dst, int dst_stride, uint32_t *sse) {                   \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W,  \
                                              dst, dst_stride, sse);         \
  }                                                                          \
                                                                             \
  uint32_t aom_highbd_10_sub_pixel_variance##W##x##H##_c(                    \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *dst, int dst_stride, uint32_t *sse) {                   \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \
                                               dst, dst_stride, sse);        \
  }                                                                          \
                                                                             \
  uint32_t aom_highbd_12_sub_pixel_variance##W##x##H##_c(                    \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *dst, int dst_stride, uint32_t *sse) {                   \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \
                                               dst, dst_stride, sse);        \
  }
469 
// Defines aom_highbd_{8,10,12}_sub_pixel_avg_variance<W>x<H>_c(): same
// two-pass bilinear interpolation of `src` as HIGHBD_SUBPIX_VAR, but the
// interpolated block is first combined with `second_pred` through
// aom_highbd_comp_avg_pred_c() (into temp3, row pitch W) before the variance
// against `dst` is computed.
//
// The macro ends at the final closing brace; no line continuation follows it,
// so whatever comes next in the file is not absorbed into the macro body.
#define HIGHBD_SUBPIX_AVG_VAR(W, H)                                          \
  uint32_t aom_highbd_8_sub_pixel_avg_variance##W##x##H##_c(                 \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *dst, int dst_stride, uint32_t *sse,                     \
      const uint8_t *second_pred) {                                          \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                             \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
                               CONVERT_TO_BYTEPTR(temp2), W);                \
                                                                             \
    return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,  \
                                              dst, dst_stride, sse);         \
  }                                                                          \
                                                                             \
  uint32_t aom_highbd_10_sub_pixel_avg_variance##W##x##H##_c(                \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *dst, int dst_stride, uint32_t *sse,                     \
      const uint8_t *second_pred) {                                          \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                             \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
                               CONVERT_TO_BYTEPTR(temp2), W);                \
                                                                             \
    return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
                                               dst, dst_stride, sse);        \
  }                                                                          \
                                                                             \
  uint32_t aom_highbd_12_sub_pixel_avg_variance##W##x##H##_c(                \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *dst, int dst_stride, uint32_t *sse,                     \
      const uint8_t *second_pred) {                                          \
    uint16_t fdata3[(H + 1) * W];                                            \
    uint16_t temp2[H * W];                                                   \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                             \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
                                                                             \
    aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
                               CONVERT_TO_BYTEPTR(temp2), W);                \
                                                                             \
    return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
                                               dst, dst_stride, sse);        \
  }

/* All three forms of the variance are available in the same sizes:
 * HIGHBD_VARIANCES(W, H) defines the plain, sub-pixel and sub-pixel-average
 * variance functions (8-, 10- and 12-bit variants each) for a W x H block. */
#define HIGHBD_VARIANCES(W, H) \
  HIGHBD_VAR(W, H)             \
  HIGHBD_SUBPIX_VAR(W, H)      \
  HIGHBD_SUBPIX_AVG_VAR(W, H)
536 
537 HIGHBD_VARIANCES(128, 128)
538 HIGHBD_VARIANCES(128, 64)
539 HIGHBD_VARIANCES(64, 128)
540 HIGHBD_VARIANCES(64, 64)
541 HIGHBD_VARIANCES(64, 32)
542 HIGHBD_VARIANCES(32, 64)
543 HIGHBD_VARIANCES(32, 32)
544 HIGHBD_VARIANCES(32, 16)
545 HIGHBD_VARIANCES(16, 32)
546 HIGHBD_VARIANCES(16, 16)
547 HIGHBD_VARIANCES(16, 8)
548 HIGHBD_VARIANCES(8, 16)
549 HIGHBD_VARIANCES(8, 8)
550 HIGHBD_VARIANCES(8, 4)
551 HIGHBD_VARIANCES(4, 8)
552 HIGHBD_VARIANCES(4, 4)
553 
554 // Realtime mode doesn't use 4x rectangular blocks.
555 #if !CONFIG_REALTIME_ONLY
556 HIGHBD_VARIANCES(4, 16)
557 HIGHBD_VARIANCES(16, 4)
558 HIGHBD_VARIANCES(8, 32)
559 HIGHBD_VARIANCES(32, 8)
560 HIGHBD_VARIANCES(16, 64)
561 HIGHBD_VARIANCES(64, 16)
562 #endif
563 
564 HIGHBD_MSE(16, 16)
565 HIGHBD_MSE(16, 8)
566 HIGHBD_MSE(8, 16)
567 HIGHBD_MSE(8, 8)
568 
// 16-bit-sample counterpart of aom_comp_avg_pred_c: each output sample is
// ROUND_POWER_OF_TWO(pred + ref, 1).
//
// All three buffer arguments are converted to uint16_t pointers with
// CONVERT_TO_SHORTPTR. comp_pred and pred use a row pitch of `width` samples;
// ref uses ref_stride.
void aom_highbd_comp_avg_pred_c(uint8_t *comp_pred8, const uint8_t *pred8,
                                int width, int height, const uint8_t *ref8,
                                int ref_stride) {
  // Inputs are only read, so keep them const-qualified.
  const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
  const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
  for (int i = 0; i < height; ++i) {
    for (int j = 0; j < width; ++j) {
      const int tmp = pred[j] + ref[j];
      comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
    }
    comp_pred += width;
    pred += width;
    ref += ref_stride;
  }
}
586 #endif  // CONFIG_AV1_HIGHBITDEPTH
587 
// Blends `pred` and `ref` sample by sample under control of `mask`, writing a
// dense width x height result (row pitch = width) to comp_pred.
//
// Each output is AOM_BLEND_A64(mask, src0, src1). With invert_mask == 0,
// src0 is `ref` and src1 is `pred`; a non-zero invert_mask swaps the two
// roles. `pred` uses a row pitch of `width`, `ref` uses ref_stride and `mask`
// uses mask_stride.
void aom_comp_mask_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
                          int height, const uint8_t *ref, int ref_stride,
                          const uint8_t *mask, int mask_stride,
                          int invert_mask) {
  // Select sources and their row pitches together so they stay paired.
  const uint8_t *src0 = invert_mask ? pred : ref;
  const uint8_t *src1 = invert_mask ? ref : pred;
  const int stride0 = invert_mask ? width : ref_stride;
  const int stride1 = invert_mask ? ref_stride : width;
  for (int i = 0; i < height; ++i) {
    for (int j = 0; j < width; ++j) {
      comp_pred[j] = AOM_BLEND_A64(mask[j], src0[j], src1[j]);
    }
    comp_pred += width;
    src0 += stride0;
    src1 += stride1;
    mask += mask_stride;
  }
}
607 
// Defines aom_masked_sub_pixel_variance<W>x<H>_c.
//
// The generated function runs `src` through the two bilinear filter passes
// (first pass selected by `xoffset` producing H + 1 rows, second pass selected
// by `yoffset` producing H rows), blends that result with `second_pred` via
// aom_comp_mask_pred_c() using `msk` / `invert_mask`, and returns
// aom_variance<W>x<H>_c() of the blended block against `ref`. `*sse` is filled
// in by that variance call.
#define MASK_SUBPIX_VAR(W, H)                                                 \
  unsigned int aom_masked_sub_pixel_variance##W##x##H##_c(                    \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
      const uint8_t *ref, int ref_stride, const uint8_t *second_pred,         \
      const uint8_t *msk, int msk_stride, int invert_mask,                    \
      unsigned int *sse) {                                                    \
    uint16_t pass1_out[(H + 1) * W];                                          \
    uint8_t pass2_out[H * W];                                                 \
    DECLARE_ALIGNED(16, uint8_t, blended[H * W]);                             \
                                                                              \
    var_filter_block2d_bil_first_pass_c(src, pass1_out, src_stride, 1, H + 1, \
                                        W, bilinear_filters_2t[xoffset]);     \
    var_filter_block2d_bil_second_pass_c(pass1_out, pass2_out, W, W, H, W,    \
                                         bilinear_filters_2t[yoffset]);       \
                                                                              \
    aom_comp_mask_pred_c(blended, second_pred, W, H, pass2_out, W, msk,       \
                         msk_stride, invert_mask);                            \
    return aom_variance##W##x##H##_c(blended, W, ref, ref_stride, sse);       \
  }
627 
// Instantiate aom_masked_sub_pixel_variance<W>x<H>_c for every block size.
628 MASK_SUBPIX_VAR(4, 4)
629 MASK_SUBPIX_VAR(4, 8)
630 MASK_SUBPIX_VAR(8, 4)
631 MASK_SUBPIX_VAR(8, 8)
632 MASK_SUBPIX_VAR(8, 16)
633 MASK_SUBPIX_VAR(16, 8)
634 MASK_SUBPIX_VAR(16, 16)
635 MASK_SUBPIX_VAR(16, 32)
636 MASK_SUBPIX_VAR(32, 16)
637 MASK_SUBPIX_VAR(32, 32)
638 MASK_SUBPIX_VAR(32, 64)
639 MASK_SUBPIX_VAR(64, 32)
640 MASK_SUBPIX_VAR(64, 64)
641 MASK_SUBPIX_VAR(64, 128)
642 MASK_SUBPIX_VAR(128, 64)
643 MASK_SUBPIX_VAR(128, 128)
644
645 // Realtime-only builds do not use the 1:4 and 4:1 block sizes listed below.
646 #if !CONFIG_REALTIME_ONLY
647 MASK_SUBPIX_VAR(4, 16)
648 MASK_SUBPIX_VAR(16, 4)
649 MASK_SUBPIX_VAR(8, 32)
650 MASK_SUBPIX_VAR(32, 8)
651 MASK_SUBPIX_VAR(16, 64)
652 MASK_SUBPIX_VAR(64, 16)
653 #endif
654 
655 #if CONFIG_AV1_HIGHBITDEPTH
// High-bitdepth counterpart of aom_comp_mask_pred_c(): blends `pred8` and
// `ref8` into `comp_pred8` through AOM_BLEND_A64, one mask entry per sample.
//
// The three 8-bit pointers are converted with CONVERT_TO_SHORTPTR and accessed
// as 16-bit samples. `invert_mask` chooses the first blend operand: `ref` when
// it is zero, `pred` otherwise. `comp_pred8` and `pred8` use a row pitch of
// `width`, `ref8` uses `ref_stride` and `mask` uses `mask_stride`.
void aom_highbd_comp_mask_pred_c(uint8_t *comp_pred8, const uint8_t *pred8,
                                 int width, int height, const uint8_t *ref8,
                                 int ref_stride, const uint8_t *mask,
                                 int mask_stride, int invert_mask) {
  uint16_t *out_row = CONVERT_TO_SHORTPTR(comp_pred8);
  const uint16_t *pred_row = CONVERT_TO_SHORTPTR(pred8);
  const uint16_t *ref_row = CONVERT_TO_SHORTPTR(ref8);

  // The operand order depends only on invert_mask, so pick it once up front.
  const uint16_t *first_row = invert_mask ? pred_row : ref_row;
  const uint16_t *second_row = invert_mask ? ref_row : pred_row;
  const int first_pitch = invert_mask ? width : ref_stride;
  const int second_pitch = invert_mask ? ref_stride : width;

  for (int row = 0; row < height; ++row) {
    for (int col = 0; col < width; ++col) {
      out_row[col] = AOM_BLEND_A64(mask[col], first_row[col], second_row[col]);
    }
    out_row += width;
    first_row += first_pitch;
    second_row += second_pitch;
    mask += mask_stride;
  }
}
677 
// Defines aom_highbd_<BD>_masked_sub_pixel_variance<W>x<H>_c for one bit
// depth. BD only selects the generated function name and the
// aom_highbd_<BD>_variance<W>x<H>_c routine that produces the result.
//
// The generated function runs `src` through the two high-bitdepth bilinear
// filter passes (first pass selected by `xoffset` producing H + 1 rows, second
// pass selected by `yoffset` producing H rows), blends that result with
// `second_pred` via aom_highbd_comp_mask_pred_c(), and returns the variance of
// the blended block against `ref`. `*sse` is filled in by the variance call.
#define HIGHBD_MASK_SUBPIX_VAR_BD(BD, W, H)                                   \
  unsigned int aom_highbd_##BD##_masked_sub_pixel_variance##W##x##H##_c(      \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
      const uint8_t *ref, int ref_stride, const uint8_t *second_pred,         \
      const uint8_t *msk, int msk_stride, int invert_mask,                    \
      unsigned int *sse) {                                                    \
    uint16_t pass1_out[(H + 1) * W];                                          \
    uint16_t pass2_out[H * W];                                                \
    DECLARE_ALIGNED(16, uint16_t, blended[H * W]);                            \
                                                                              \
    aom_highbd_var_filter_block2d_bil_first_pass(                             \
        src, pass1_out, src_stride, 1, H + 1, W,                              \
        bilinear_filters_2t[xoffset]);                                        \
    aom_highbd_var_filter_block2d_bil_second_pass(                            \
        pass1_out, pass2_out, W, W, H, W, bilinear_filters_2t[yoffset]);      \
                                                                              \
    aom_highbd_comp_mask_pred_c(CONVERT_TO_BYTEPTR(blended), second_pred, W,  \
                                H, CONVERT_TO_BYTEPTR(pass2_out), W, msk,     \
                                msk_stride, invert_mask);                     \
                                                                              \
    return aom_highbd_##BD##_variance##W##x##H##_c(                           \
        CONVERT_TO_BYTEPTR(blended), W, ref, ref_stride, sse);                \
  }

// Defines the 8-, 10- and 12-bit masked sub-pixel variance functions for one
// W x H block size.
#define HIGHBD_MASK_SUBPIX_VAR(W, H)  \
  HIGHBD_MASK_SUBPIX_VAR_BD(8, W, H)  \
  HIGHBD_MASK_SUBPIX_VAR_BD(10, W, H) \
  HIGHBD_MASK_SUBPIX_VAR_BD(12, W, H)
744 
// Instantiate the 8/10/12-bit masked sub-pixel variance functions for every
// block size.
745 HIGHBD_MASK_SUBPIX_VAR(4, 4)
746 HIGHBD_MASK_SUBPIX_VAR(4, 8)
747 HIGHBD_MASK_SUBPIX_VAR(8, 4)
748 HIGHBD_MASK_SUBPIX_VAR(8, 8)
749 HIGHBD_MASK_SUBPIX_VAR(8, 16)
750 HIGHBD_MASK_SUBPIX_VAR(16, 8)
751 HIGHBD_MASK_SUBPIX_VAR(16, 16)
752 HIGHBD_MASK_SUBPIX_VAR(16, 32)
753 HIGHBD_MASK_SUBPIX_VAR(32, 16)
754 HIGHBD_MASK_SUBPIX_VAR(32, 32)
755 HIGHBD_MASK_SUBPIX_VAR(32, 64)
756 HIGHBD_MASK_SUBPIX_VAR(64, 32)
757 HIGHBD_MASK_SUBPIX_VAR(64, 64)
758 HIGHBD_MASK_SUBPIX_VAR(64, 128)
759 HIGHBD_MASK_SUBPIX_VAR(128, 64)
760 HIGHBD_MASK_SUBPIX_VAR(128, 128)
// Realtime-only builds do not use the 1:4 and 4:1 block sizes listed below.
761 #if !CONFIG_REALTIME_ONLY
762 HIGHBD_MASK_SUBPIX_VAR(4, 16)
763 HIGHBD_MASK_SUBPIX_VAR(16, 4)
764 HIGHBD_MASK_SUBPIX_VAR(8, 32)
765 HIGHBD_MASK_SUBPIX_VAR(32, 8)
766 HIGHBD_MASK_SUBPIX_VAR(16, 64)
767 HIGHBD_MASK_SUBPIX_VAR(64, 16)
768 #endif
769 #endif  // CONFIG_AV1_HIGHBITDEPTH
770 
771 #if !CONFIG_REALTIME_ONLY
// Accumulates the sum and the sum of squares of the per-sample OBMC error
//   ROUND_POWER_OF_TWO_SIGNED(wsrc - pre * mask, 12)
// over a w x h block.
//
// `pre` is read with a row pitch of `pre_stride`; `wsrc` and `mask` are read
// as contiguous rows of `w` entries. The totals are written to `*sse` and
// `*sum`.
static inline void obmc_variance(const uint8_t *pre, int pre_stride,
                                 const int32_t *wsrc, const int32_t *mask,
                                 int w, int h, unsigned int *sse, int *sum) {
  unsigned int sq_total = 0;
  int err_total = 0;

  for (int row = 0; row < h; row++) {
    const uint8_t *pre_row = pre + row * pre_stride;
    const int32_t *wsrc_row = wsrc + row * w;
    const int32_t *mask_row = mask + row * w;

    for (int col = 0; col < w; col++) {
      const int32_t weighted_err = wsrc_row[col] - pre_row[col] * mask_row[col];
      const int diff = ROUND_POWER_OF_TWO_SIGNED(weighted_err, 12);
      err_total += diff;
      sq_total += diff * diff;
    }
  }

  *sum = err_total;
  *sse = sq_total;
}
793 
// Defines aom_obmc_variance<W>x<H>_c: calls obmc_variance() for a W x H block,
// which fills `*sse`, and returns *sse - sum^2 / (W * H). The square of the sum
// is formed in 64 bits before the division.
#define OBMC_VAR(W, H)                                                 \
  unsigned int aom_obmc_variance##W##x##H##_c(                         \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,         \
      const int32_t *mask, unsigned int *sse) {                        \
    int err_sum;                                                       \
    obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &err_sum);   \
    const int64_t sum_sq = (int64_t)err_sum * err_sum;                 \
    return *sse - (unsigned int)(sum_sq / (W * H));                    \
  }
802 
// Defines aom_obmc_sub_pixel_variance<W>x<H>_c: runs `pre` through the two
// bilinear filter passes (first pass selected by `xoffset` producing H + 1
// rows, second pass selected by `yoffset` producing H rows) and returns
// aom_obmc_variance<W>x<H>_c() of the filtered block, which has a row pitch
// of W.
#define OBMC_SUBPIX_VAR(W, H)                                                 \
  unsigned int aom_obmc_sub_pixel_variance##W##x##H##_c(                      \
      const uint8_t *pre, int pre_stride, int xoffset, int yoffset,           \
      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {          \
    uint16_t pass1_out[(H + 1) * W];                                          \
    uint8_t pass2_out[H * W];                                                 \
                                                                              \
    var_filter_block2d_bil_first_pass_c(pre, pass1_out, pre_stride, 1, H + 1, \
                                        W, bilinear_filters_2t[xoffset]);     \
    var_filter_block2d_bil_second_pass_c(pass1_out, pass2_out, W, W, H, W,    \
                                         bilinear_filters_2t[yoffset]);       \
                                                                              \
    return aom_obmc_variance##W##x##H##_c(pass2_out, W, wsrc, mask, sse);     \
  }
817 
// Instantiate the OBMC variance and sub-pixel variance functions for every
// block size. OBMC_VAR comes first for each size because the function that
// OBMC_SUBPIX_VAR generates calls the one OBMC_VAR generates.
818 OBMC_VAR(4, 4)
819 OBMC_SUBPIX_VAR(4, 4)
820
821 OBMC_VAR(4, 8)
822 OBMC_SUBPIX_VAR(4, 8)
823
824 OBMC_VAR(8, 4)
825 OBMC_SUBPIX_VAR(8, 4)
826
827 OBMC_VAR(8, 8)
828 OBMC_SUBPIX_VAR(8, 8)
829
830 OBMC_VAR(8, 16)
831 OBMC_SUBPIX_VAR(8, 16)
832
833 OBMC_VAR(16, 8)
834 OBMC_SUBPIX_VAR(16, 8)
835
836 OBMC_VAR(16, 16)
837 OBMC_SUBPIX_VAR(16, 16)
838
839 OBMC_VAR(16, 32)
840 OBMC_SUBPIX_VAR(16, 32)
841
842 OBMC_VAR(32, 16)
843 OBMC_SUBPIX_VAR(32, 16)
844
845 OBMC_VAR(32, 32)
846 OBMC_SUBPIX_VAR(32, 32)
847
848 OBMC_VAR(32, 64)
849 OBMC_SUBPIX_VAR(32, 64)
850
851 OBMC_VAR(64, 32)
852 OBMC_SUBPIX_VAR(64, 32)
853
854 OBMC_VAR(64, 64)
855 OBMC_SUBPIX_VAR(64, 64)
856
857 OBMC_VAR(64, 128)
858 OBMC_SUBPIX_VAR(64, 128)
859
860 OBMC_VAR(128, 64)
861 OBMC_SUBPIX_VAR(128, 64)
862
863 OBMC_VAR(128, 128)
864 OBMC_SUBPIX_VAR(128, 128)
865
// 1:4 and 4:1 block sizes. No separate guard is needed here: the whole OBMC
// section already sits inside the enclosing #if !CONFIG_REALTIME_ONLY.
866 OBMC_VAR(4, 16)
867 OBMC_SUBPIX_VAR(4, 16)
868 OBMC_VAR(16, 4)
869 OBMC_SUBPIX_VAR(16, 4)
870 OBMC_VAR(8, 32)
871 OBMC_SUBPIX_VAR(8, 32)
872 OBMC_VAR(32, 8)
873 OBMC_SUBPIX_VAR(32, 8)
874 OBMC_VAR(16, 64)
875 OBMC_SUBPIX_VAR(16, 64)
876 OBMC_VAR(64, 16)
877 OBMC_SUBPIX_VAR(64, 16)
878 
879 #if CONFIG_AV1_HIGHBITDEPTH
// High-bitdepth OBMC error accumulation with 64-bit totals.
//
// `pre8` is converted with CONVERT_TO_SHORTPTR and read as 16-bit samples with
// a row pitch of `pre_stride`; `wsrc` and `mask` are read as contiguous rows
// of `w` entries. For every sample the error
//   ROUND_POWER_OF_TWO_SIGNED(wsrc - pre * mask, 12)
// is added to `*sum` and its square to `*sse`.
static inline void highbd_obmc_variance64(const uint8_t *pre8, int pre_stride,
                                          const int32_t *wsrc,
                                          const int32_t *mask, int w, int h,
                                          uint64_t *sse, int64_t *sum) {
  const uint16_t *pre_row = CONVERT_TO_SHORTPTR(pre8);
  uint64_t sq_total = 0;
  int64_t err_total = 0;

  for (int row = 0; row < h; row++) {
    for (int col = 0; col < w; col++) {
      const int32_t weighted_err = wsrc[col] - pre_row[col] * mask[col];
      const int diff = ROUND_POWER_OF_TWO_SIGNED(weighted_err, 12);
      err_total += diff;
      sq_total += diff * diff;
    }
    pre_row += pre_stride;
    wsrc += w;
    mask += w;
  }

  *sum = err_total;
  *sse = sq_total;
}
903 
// Wrapper around highbd_obmc_variance64() that narrows the 64-bit totals to
// `int` / `unsigned int` without any rescaling. Used by the 8-bit-depth
// high-bitdepth OBMC variance functions.
static inline void highbd_obmc_variance(const uint8_t *pre8, int pre_stride,
                                        const int32_t *wsrc,
                                        const int32_t *mask, int w, int h,
                                        unsigned int *sse, int *sum) {
  uint64_t wide_sse;
  int64_t wide_sum;

  highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &wide_sse,
                         &wide_sum);
  *sum = (int)wide_sum;
  *sse = (unsigned int)wide_sse;
}
914 
// 10-bit wrapper around highbd_obmc_variance64(): the sum is scaled down with
// ROUND_POWER_OF_TWO(., 2) and the SSE with ROUND_POWER_OF_TWO(., 4) before
// being narrowed to `int` / `unsigned int`.
static inline void highbd_10_obmc_variance(const uint8_t *pre8, int pre_stride,
                                           const int32_t *wsrc,
                                           const int32_t *mask, int w, int h,
                                           unsigned int *sse, int *sum) {
  uint64_t wide_sse;
  int64_t wide_sum;

  highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &wide_sse,
                         &wide_sum);
  *sum = (int)ROUND_POWER_OF_TWO(wide_sum, 2);
  *sse = (unsigned int)ROUND_POWER_OF_TWO(wide_sse, 4);
}
925 
// 12-bit wrapper around highbd_obmc_variance64(): the sum is scaled down with
// ROUND_POWER_OF_TWO(., 4) and the SSE with ROUND_POWER_OF_TWO(., 8) before
// being narrowed to `int` / `unsigned int`.
static inline void highbd_12_obmc_variance(const uint8_t *pre8, int pre_stride,
                                           const int32_t *wsrc,
                                           const int32_t *mask, int w, int h,
                                           unsigned int *sse, int *sum) {
  uint64_t wide_sse;
  int64_t wide_sum;

  highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &wide_sse,
                         &wide_sum);
  *sum = (int)ROUND_POWER_OF_TWO(wide_sum, 4);
  *sse = (unsigned int)ROUND_POWER_OF_TWO(wide_sse, 8);
}
936 
// Defines aom_highbd_<BD>_obmc_variance<W>x<H>_c for the bit depths whose
// accumulator is highbd_<BD>_obmc_variance() (10 and 12). The variance
// *sse - sum^2 / (W * H) is evaluated in 64 bits and a negative result is
// returned as 0.
#define HIGHBD_OBMC_VAR_CLAMPED(BD, W, H)                                    \
  unsigned int aom_highbd_##BD##_obmc_variance##W##x##H##_c(                 \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,               \
      const int32_t *mask, unsigned int *sse) {                              \
    int err_sum;                                                             \
    highbd_##BD##_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse,      \
                                &err_sum);                                   \
    const int64_t mean_term = ((int64_t)err_sum * err_sum) / (W * H);        \
    const int64_t var = (int64_t)(*sse) - mean_term;                         \
    if (var < 0) return 0;                                                   \
    return (uint32_t)var;                                                    \
  }

// Defines the 8-, 10- and 12-bit high-bitdepth OBMC variance functions for one
// W x H block size. The 8-bit variant uses highbd_obmc_variance() and returns
// *sse - sum^2 / (W * H) in unsigned arithmetic with no clamp; the 10- and
// 12-bit variants clamp a negative result to 0.
#define HIGHBD_OBMC_VAR(W, H)                                                \
  unsigned int aom_highbd_8_obmc_variance##W##x##H##_c(                      \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,               \
      const int32_t *mask, unsigned int *sse) {                              \
    int err_sum;                                                             \
    highbd_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &err_sum);  \
    const int64_t mean_term = ((int64_t)err_sum * err_sum) / (W * H);        \
    return *sse - (unsigned int)mean_term;                                   \
  }                                                                          \
                                                                             \
  HIGHBD_OBMC_VAR_CLAMPED(10, W, H)                                          \
                                                                             \
  HIGHBD_OBMC_VAR_CLAMPED(12, W, H)
965 
// Defines aom_highbd_<BD>_obmc_sub_pixel_variance<W>x<H>_c for one bit depth.
// BD only selects the generated function name and the
// aom_highbd_<BD>_obmc_variance<W>x<H>_c routine that produces the result.
//
// The generated function runs `pre` through the two high-bitdepth bilinear
// filter passes (first pass selected by `xoffset` producing H + 1 rows, second
// pass selected by `yoffset` producing H rows) and returns the OBMC variance
// of the filtered block, which has a row pitch of W.
#define HIGHBD_OBMC_SUBPIX_VAR_BD(BD, W, H)                                   \
  unsigned int aom_highbd_##BD##_obmc_sub_pixel_variance##W##x##H##_c(        \
      const uint8_t *pre, int pre_stride, int xoffset, int yoffset,           \
      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {          \
    uint16_t pass1_out[(H + 1) * W];                                          \
    uint16_t pass2_out[H * W];                                                \
                                                                              \
    aom_highbd_var_filter_block2d_bil_first_pass(                             \
        pre, pass1_out, pre_stride, 1, H + 1, W,                              \
        bilinear_filters_2t[xoffset]);                                        \
    aom_highbd_var_filter_block2d_bil_second_pass(                            \
        pass1_out, pass2_out, W, W, H, W, bilinear_filters_2t[yoffset]);      \
                                                                              \
    return aom_highbd_##BD##_obmc_variance##W##x##H##_c(                      \
        CONVERT_TO_BYTEPTR(pass2_out), W, wsrc, mask, sse);                   \
  }

// Defines the 8-, 10- and 12-bit high-bitdepth OBMC sub-pixel variance
// functions for one W x H block size.
#define HIGHBD_OBMC_SUBPIX_VAR(W, H)  \
  HIGHBD_OBMC_SUBPIX_VAR_BD(8, W, H)  \
  HIGHBD_OBMC_SUBPIX_VAR_BD(10, W, H) \
  HIGHBD_OBMC_SUBPIX_VAR_BD(12, W, H)
1011 
// Instantiate the 8/10/12-bit OBMC variance and sub-pixel variance functions
// for every block size. HIGHBD_OBMC_VAR comes first for each size because the
// functions that HIGHBD_OBMC_SUBPIX_VAR generates call the ones it generates.
1012 HIGHBD_OBMC_VAR(4, 4)
1013 HIGHBD_OBMC_SUBPIX_VAR(4, 4)
1014
1015 HIGHBD_OBMC_VAR(4, 8)
1016 HIGHBD_OBMC_SUBPIX_VAR(4, 8)
1017
1018 HIGHBD_OBMC_VAR(8, 4)
1019 HIGHBD_OBMC_SUBPIX_VAR(8, 4)
1020
1021 HIGHBD_OBMC_VAR(8, 8)
1022 HIGHBD_OBMC_SUBPIX_VAR(8, 8)
1023
1024 HIGHBD_OBMC_VAR(8, 16)
1025 HIGHBD_OBMC_SUBPIX_VAR(8, 16)
1026
1027 HIGHBD_OBMC_VAR(16, 8)
1028 HIGHBD_OBMC_SUBPIX_VAR(16, 8)
1029
1030 HIGHBD_OBMC_VAR(16, 16)
1031 HIGHBD_OBMC_SUBPIX_VAR(16, 16)
1032
1033 HIGHBD_OBMC_VAR(16, 32)
1034 HIGHBD_OBMC_SUBPIX_VAR(16, 32)
1035
1036 HIGHBD_OBMC_VAR(32, 16)
1037 HIGHBD_OBMC_SUBPIX_VAR(32, 16)
1038
1039 HIGHBD_OBMC_VAR(32, 32)
1040 HIGHBD_OBMC_SUBPIX_VAR(32, 32)
1041
1042 HIGHBD_OBMC_VAR(32, 64)
1043 HIGHBD_OBMC_SUBPIX_VAR(32, 64)
1044
1045 HIGHBD_OBMC_VAR(64, 32)
1046 HIGHBD_OBMC_SUBPIX_VAR(64, 32)
1047
1048 HIGHBD_OBMC_VAR(64, 64)
1049 HIGHBD_OBMC_SUBPIX_VAR(64, 64)
1050
1051 HIGHBD_OBMC_VAR(64, 128)
1052 HIGHBD_OBMC_SUBPIX_VAR(64, 128)
1053
1054 HIGHBD_OBMC_VAR(128, 64)
1055 HIGHBD_OBMC_SUBPIX_VAR(128, 64)
1056
1057 HIGHBD_OBMC_VAR(128, 128)
1058 HIGHBD_OBMC_SUBPIX_VAR(128, 128)
1059
// 1:4 and 4:1 block sizes. No separate guard is needed here: the whole OBMC
// section already sits inside the enclosing #if !CONFIG_REALTIME_ONLY.
1060 HIGHBD_OBMC_VAR(4, 16)
1061 HIGHBD_OBMC_SUBPIX_VAR(4, 16)
1062 HIGHBD_OBMC_VAR(16, 4)
1063 HIGHBD_OBMC_SUBPIX_VAR(16, 4)
1064 HIGHBD_OBMC_VAR(8, 32)
1065 HIGHBD_OBMC_SUBPIX_VAR(8, 32)
1066 HIGHBD_OBMC_VAR(32, 8)
1067 HIGHBD_OBMC_SUBPIX_VAR(32, 8)
1068 HIGHBD_OBMC_VAR(16, 64)
1069 HIGHBD_OBMC_SUBPIX_VAR(16, 64)
1070 HIGHBD_OBMC_VAR(64, 16)
1071 HIGHBD_OBMC_SUBPIX_VAR(64, 16)
1072 #endif  // CONFIG_AV1_HIGHBITDEPTH
1073 #endif  // !CONFIG_REALTIME_ONLY
1074 
// Returns the sum of squared differences between an 8-bit block `dst` and a
// 16-bit block `src` over a w x h region. Despite the "mse" in the name, the
// total is not divided by the number of samples.
//
//   dst, dstride  8-bit samples and their row pitch (in elements)
//   src, sstride  16-bit samples and their row pitch (in elements)
//   w, h          region size; a non-positive w or h yields 0
uint64_t aom_mse_wxh_16bit_c(uint8_t *dst, int dstride, uint16_t *src,
                             int sstride, int w, int h) {
  uint64_t sum = 0;
  for (int i = 0; i < h; i++) {
    const uint8_t *dst_row = dst + i * dstride;
    const uint16_t *src_row = src + i * sstride;
    for (int j = 0; j < w; j++) {
      // The difference can be as low as -65535, so its square does not fit in
      // a 32-bit int; square in 64 bits to avoid signed overflow.
      const int64_t e = (int64_t)dst_row[j] - src_row[j];
      sum += (uint64_t)(e * e);
    }
  }
  return sum;
}
1086 
// Returns the sum of squared differences over 16 / w sub-blocks of size w x h
// by calling aom_mse_wxh_16bit_c() once per sub-block.
//
// Sub-block k starts at column k * w of `dst` (row pitch `dstride`) and at
// element k * w * h of `src`, where each sub-block is stored with a row pitch
// of w. `w` must be non-zero because it is used as a divisor.
uint64_t aom_mse_16xh_16bit_c(uint8_t *dst, int dstride, uint16_t *src, int w,
                              int h) {
  const int num_blks = 16 / w;
  // Accumulate in the same unsigned type that the helper returns and this
  // function returns, instead of round-tripping through int64_t.
  uint64_t sum = 0;
  for (int blk = 0; blk < num_blks; blk++) {
    sum += aom_mse_wxh_16bit_c(dst + blk * w, dstride, src + blk * w * h, w, w,
                               h);
  }
  return sum;
}
1100 
1101 #if CONFIG_AV1_HIGHBITDEPTH
// Returns the sum of squared differences between two 16-bit blocks over a
// w x h region. Despite the "mse" in the name, the total is not divided by the
// number of samples.
//
//   dst, dstride  first block and its row pitch (in elements)
//   src, sstride  second block and its row pitch (in elements)
//   w, h          region size; a non-positive w or h yields 0
uint64_t aom_mse_wxh_16bit_highbd_c(uint16_t *dst, int dstride, uint16_t *src,
                                    int sstride, int w, int h) {
  uint64_t sum = 0;
  for (int i = 0; i < h; i++) {
    const uint16_t *dst_row = dst + i * dstride;
    const uint16_t *src_row = src + i * sstride;
    for (int j = 0; j < w; j++) {
      // The difference spans -65535..65535, so its square does not fit in a
      // 32-bit int; square in 64 bits to avoid signed overflow.
      const int64_t e = (int64_t)dst_row[j] - src_row[j];
      sum += (uint64_t)(e * e);
    }
  }
  return sum;
}
1113 #endif  // CONFIG_AV1_HIGHBITDEPTH
1114