1 /*
2  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 #include <assert.h>
12 #include <stdlib.h>
13 #include <string.h>
14 
15 #include "config/aom_config.h"
16 #include "config/aom_dsp_rtcd.h"
17 
18 #include "aom/aom_integer.h"
19 #include "aom_ports/mem.h"
20 
21 #include "aom_dsp/aom_filter.h"
22 #include "aom_dsp/blend.h"
23 #include "aom_dsp/variance.h"
24 
25 #include "av1/common/filter.h"
26 #include "av1/common/reconinter.h"
27 
// Returns the sum of squared differences between two 4x4 blocks of 8-bit
// samples. a_stride and b_stride are the distances, in samples, between the
// starts of consecutive rows of a and b respectively.
uint32_t aom_get4x4sse_cs_c(const uint8_t *a, int a_stride, const uint8_t *b,
                            int b_stride) {
  int sse = 0;

  for (int row = 0; row < 4; ++row, a += a_stride, b += b_stride) {
    for (int col = 0; col < 4; ++col) {
      const int delta = a[col] - b[col];
      sse += delta * delta;
    }
  }

  return sse;
}
45 
// Returns the sum of the squares of the 256 int16_t values starting at a.
// The total is accumulated in an unsigned int.
uint32_t aom_get_mb_ss_c(const int16_t *a) {
  const int16_t *const end = a + 256;
  unsigned int total = 0;

  while (a != end) {
    const int v = *a++;
    total += v * v;
  }

  return total;
}
55 
// Accumulates, over a w x h block of 8-bit samples, the sum of the per-sample
// differences (a - b) into *sum and the sum of their squares into *sse.
// Both outputs are overwritten; a block with no samples (w <= 0 or h <= 0)
// yields zero for both. Strides are in samples.
static void variance(const uint8_t *a, int a_stride, const uint8_t *b,
                     int b_stride, int w, int h, uint32_t *sse, int *sum) {
  int diff_total = 0;
  uint32_t sq_total = 0;

  for (int row = 0; row < h; ++row, a += a_stride, b += b_stride) {
    for (int col = 0; col < w; ++col) {
      const int d = a[col] - b[col];
      diff_total += d;
      sq_total += d * d;
    }
  }

  *sum = diff_total;
  *sse = sq_total;
}
74 
// Returns the sum of squared differences over a w x h block of 8-bit samples.
// The difference sum that variance() also produces is discarded.
uint32_t aom_sse_odd_size(const uint8_t *a, int a_stride, const uint8_t *b,
                          int b_stride, int w, int h) {
  int unused_sum;
  uint32_t block_sse;

  variance(a, a_stride, b, b_stride, w, h, &block_sse, &unused_sum);

  return block_sse;
}
82 
83 // Applies a 1-D 2-tap bilinear filter to the source block in either horizontal
84 // or vertical direction to produce the filtered output block. Used to implement
85 // the first-pass of 2-D separable filter.
86 //
87 // Produces int16_t output to retain precision for the next pass. Two filter
88 // taps should sum to FILTER_WEIGHT. pixel_step defines whether the filter is
89 // applied horizontally (pixel_step = 1) or vertically (pixel_step = stride).
90 // It defines the offset required to move from one input to the next.
aom_var_filter_block2d_bil_first_pass_c(const uint8_t * a,uint16_t * b,unsigned int src_pixels_per_line,unsigned int pixel_step,unsigned int output_height,unsigned int output_width,const uint8_t * filter)91 void aom_var_filter_block2d_bil_first_pass_c(const uint8_t *a, uint16_t *b,
92                                              unsigned int src_pixels_per_line,
93                                              unsigned int pixel_step,
94                                              unsigned int output_height,
95                                              unsigned int output_width,
96                                              const uint8_t *filter) {
97   unsigned int i, j;
98 
99   for (i = 0; i < output_height; ++i) {
100     for (j = 0; j < output_width; ++j) {
101       b[j] = ROUND_POWER_OF_TWO(
102           (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS);
103 
104       ++a;
105     }
106 
107     a += src_pixels_per_line - output_width;
108     b += output_width;
109   }
110 }
111 
112 // Applies a 1-D 2-tap bilinear filter to the source block in either horizontal
113 // or vertical direction to produce the filtered output block. Used to implement
114 // the second-pass of 2-D separable filter.
115 //
116 // Requires 16-bit input as produced by filter_block2d_bil_first_pass. Two
117 // filter taps should sum to FILTER_WEIGHT. pixel_step defines whether the
118 // filter is applied horizontally (pixel_step = 1) or vertically
119 // (pixel_step = stride). It defines the offset required to move from one input
120 // to the next. Output is 8-bit.
aom_var_filter_block2d_bil_second_pass_c(const uint16_t * a,uint8_t * b,unsigned int src_pixels_per_line,unsigned int pixel_step,unsigned int output_height,unsigned int output_width,const uint8_t * filter)121 void aom_var_filter_block2d_bil_second_pass_c(const uint16_t *a, uint8_t *b,
122                                               unsigned int src_pixels_per_line,
123                                               unsigned int pixel_step,
124                                               unsigned int output_height,
125                                               unsigned int output_width,
126                                               const uint8_t *filter) {
127   unsigned int i, j;
128 
129   for (i = 0; i < output_height; ++i) {
130     for (j = 0; j < output_width; ++j) {
131       b[j] = ROUND_POWER_OF_TWO(
132           (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS);
133       ++a;
134     }
135 
136     a += src_pixels_per_line - output_width;
137     b += output_width;
138   }
139 }
140 
/* Defines aom_variance<W>x<H>_c(): stores the sum of squared differences of
 * the W x H block in *sse and returns sse - sum^2 / (W * H), where sum is the
 * sum of the per-sample differences. The subtraction is done in uint32_t.
 */
#define VAR(W, H)                                                    \
  uint32_t aom_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
                                     const uint8_t *b, int b_stride, \
                                     uint32_t *sse) {                \
    int diff_sum;                                                    \
    variance(a, a_stride, b, b_stride, W, H, sse, &diff_sum);        \
    const int64_t sum_sq = (int64_t)diff_sum * diff_sum;             \
    const uint32_t mean_term = (uint32_t)(sum_sq / (W * H));         \
    return *sse - mean_term;                                         \
  }
149 
/* Defines aom_sub_pixel_variance<W>x<H>_c(): bilinearly filters block a
 * horizontally (taps bilinear_filters_2t[xoffset]) and then vertically (taps
 * bilinear_filters_2t[yoffset]), and returns the variance of the filtered
 * W x H block against b, storing the SSE in *sse. xoffset and yoffset index
 * the filter table directly and are not range-checked here.
 */
#define SUBPIX_VAR(W, H)                                                 \
  uint32_t aom_sub_pixel_variance##W##x##H##_c(                          \
      const uint8_t *a, int a_stride, int xoffset, int yoffset,          \
      const uint8_t *b, int b_stride, uint32_t *sse) {                   \
    /* H + 1 rows: the vertical pass reads one row below each output. */ \
    uint16_t horiz[(H + 1) * W];                                         \
    uint8_t filtered[H * W];                                             \
                                                                         \
    aom_var_filter_block2d_bil_first_pass_c(                             \
        a, horiz, a_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
    aom_var_filter_block2d_bil_second_pass_c(                            \
        horiz, filtered, W, W, H, W, bilinear_filters_2t[yoffset]);      \
                                                                         \
    return aom_variance##W##x##H##_c(filtered, W, b, b_stride, sse);     \
  }
164 
/* Defines two compound-prediction variants of the sub-pixel variance for a
 * W x H block. Both bilinearly filter block a (xoffset horizontally, yoffset
 * vertically), combine the result with second_pred, and return the variance
 * of that combination against b, storing the SSE in *sse:
 *   - aom_sub_pixel_avg_variance<W>x<H>_c() combines via aom_comp_avg_pred().
 *   - aom_dist_wtd_sub_pixel_avg_variance<W>x<H>_c() combines via
 *     aom_dist_wtd_comp_avg_pred() using jcp_param.
 * NOTE(review): the second function finishes with aom_variance<W>x<H>()
 * (no _c suffix) whereas the first calls aom_variance<W>x<H>_c(); confirm
 * whether that difference is intentional.
 */
#define SUBPIX_AVG_VAR(W, H)                                               \
  uint32_t aom_sub_pixel_avg_variance##W##x##H##_c(                        \
      const uint8_t *a, int a_stride, int xoffset, int yoffset,            \
      const uint8_t *b, int b_stride, uint32_t *sse,                       \
      const uint8_t *second_pred) {                                        \
    /* H + 1 rows: the vertical pass reads one row below each output. */   \
    uint16_t horiz[(H + 1) * W];                                           \
    uint8_t filtered[H * W];                                               \
    DECLARE_ALIGNED(16, uint8_t, averaged[H * W]);                         \
                                                                           \
    aom_var_filter_block2d_bil_first_pass_c(                               \
        a, horiz, a_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);    \
    aom_var_filter_block2d_bil_second_pass_c(                              \
        horiz, filtered, W, W, H, W, bilinear_filters_2t[yoffset]);        \
                                                                           \
    aom_comp_avg_pred(averaged, second_pred, W, H, filtered, W);           \
                                                                           \
    return aom_variance##W##x##H##_c(averaged, W, b, b_stride, sse);       \
  }                                                                        \
  uint32_t aom_dist_wtd_sub_pixel_avg_variance##W##x##H##_c(               \
      const uint8_t *a, int a_stride, int xoffset, int yoffset,            \
      const uint8_t *b, int b_stride, uint32_t *sse,                       \
      const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \
    uint16_t horiz[(H + 1) * W];                                           \
    uint8_t filtered[H * W];                                               \
    DECLARE_ALIGNED(16, uint8_t, averaged[H * W]);                         \
                                                                           \
    aom_var_filter_block2d_bil_first_pass_c(                               \
        a, horiz, a_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);    \
    aom_var_filter_block2d_bil_second_pass_c(                              \
        horiz, filtered, W, W, H, W, bilinear_filters_2t[yoffset]);        \
                                                                           \
    aom_dist_wtd_comp_avg_pred(averaged, second_pred, W, H, filtered, W,   \
                               jcp_param);                                 \
                                                                           \
    return aom_variance##W##x##H(averaged, W, b, b_stride, sse);           \
  }
200 
/* Defines aom_get<W>x<H>var_c(): like the variance functions, but instead of
 * returning sse - sum^2 / (W * H) it hands back both raw accumulators, the
 * sum of squared differences in *sse and the sum of differences in *sum.
 */
#define GET_VAR(W, H)                                          \
  void aom_get##W##x##H##var_c(const uint8_t *a, int a_stride, \
                               const uint8_t *b, int b_stride, \
                               uint32_t *sse, int *sum) {      \
    variance(a, a_stride, b, b_stride, (W), (H), sse, sum);    \
  }
211 
// Processes four horizontally adjacent 8x8 blocks (column offsets 0, 8, 16 and
// 24 on the same 8 rows). For each block k it writes the SSE to sse8x8[k], the
// difference sum to sum8x8[k] and the variance (sse - sum^2 / 64) to
// var8x8[k]. The four SSEs and sums are also added to *tot_sse and *tot_sum,
// which are accumulated into and therefore must be initialized by the caller.
void aom_get_var_sse_sum_8x8_quad_c(const uint8_t *a, int a_stride,
                                    const uint8_t *b, int b_stride,
                                    uint32_t *sse8x8, int *sum8x8,
                                    unsigned int *tot_sse, int *tot_sum,
                                    uint32_t *var8x8) {
  for (int blk = 0; blk < 4; ++blk) {
    const int col_offset = blk * 8;
    variance(a + col_offset, a_stride, b + col_offset, b_stride, 8, 8,
             &sse8x8[blk], &sum8x8[blk]);
  }

  // Fold the four per-block results into the running totals.
  unsigned int sse_acc = 0;
  int sum_acc = 0;
  for (int blk = 0; blk < 4; ++blk) {
    sse_acc += sse8x8[blk];
    sum_acc += sum8x8[blk];
  }
  *tot_sse += sse_acc;
  *tot_sum += sum_acc;

  // Per-block variance; >> 6 divides the squared sum by the 64 samples.
  for (int blk = 0; blk < 4; ++blk) {
    const int64_t sum_sq = (int64_t)sum8x8[blk] * sum8x8[blk];
    var8x8[blk] = sse8x8[blk] - (uint32_t)(sum_sq >> 6);
  }
}
229 
// Processes two horizontally adjacent 16x16 blocks (column offsets 0 and 16 on
// the same 16 rows). For each block k it writes the SSE to sse16x16[k] and the
// variance (sse - sum^2 / 256) to var16x16[k]. Both SSEs and both difference
// sums are also added to *tot_sse and *tot_sum, which are accumulated into and
// therefore must be initialized by the caller.
void aom_get_var_sse_sum_16x16_dual_c(const uint8_t *src_ptr, int source_stride,
                                      const uint8_t *ref_ptr, int ref_stride,
                                      uint32_t *sse16x16, unsigned int *tot_sse,
                                      int *tot_sum, uint32_t *var16x16) {
  // One difference sum per 16x16 block; only two entries are ever used.
  int sum16x16[2] = { 0 };
  // Loop over the two side-by-side 16x16 blocks.
  for (int k = 0; k < 2; k++) {
    variance(src_ptr + (k * 16), source_stride, ref_ptr + (k * 16), ref_stride,
             16, 16, &sse16x16[k], &sum16x16[k]);
  }

  // Accumulate the totals for the combined two-block region.
  *tot_sse += sse16x16[0] + sse16x16[1];
  *tot_sum += sum16x16[0] + sum16x16[1];
  // Per-block variance; >> 8 divides the squared sum by the 256 samples.
  for (int i = 0; i < 2; i++)
    var16x16[i] =
        sse16x16[i] - (uint32_t)(((int64_t)sum16x16[i] * sum16x16[i]) >> 8);
}
248 
/* Defines aom_mse<W>x<H>_c(): same accumulation as the variance functions,
 * but the mean term (sum^2 / (W * H)) is not subtracted. The sum of squared
 * differences is stored in *sse and, in addition, returned.
 */
#define MSE(W, H)                                                   \
  uint32_t aom_mse##W##x##H##_c(const uint8_t *a, int a_stride,     \
                                const uint8_t *b, int b_stride,     \
                                uint32_t *sse) {                    \
    int unused_sum;                                                 \
    variance(a, a_stride, b, b_stride, W, H, sse, &unused_sum);     \
    return *sse;                                                    \
  }
261 
/* Defines every variance form for one block size: VAR() (plain variance),
 * SUBPIX_VAR() (sub-pixel variance) and SUBPIX_AVG_VAR() (sub-pixel variance
 * against the plain and the distance-weighted compound averages). All forms
 * are therefore available in the same sizes.
 */
#define VARIANCES(W, H) \
  VAR(W, H)             \
  SUBPIX_VAR(W, H)      \
  SUBPIX_AVG_VAR(W, H)
267 
// Instantiate the C variance functions for each block size (width, height).
VARIANCES(128, 128)
VARIANCES(128, 64)
VARIANCES(64, 128)
VARIANCES(64, 64)
VARIANCES(64, 32)
VARIANCES(32, 64)
VARIANCES(32, 32)
VARIANCES(32, 16)
VARIANCES(16, 32)
VARIANCES(16, 16)
VARIANCES(16, 8)
VARIANCES(8, 16)
VARIANCES(8, 8)
VARIANCES(8, 4)
VARIANCES(4, 8)
VARIANCES(4, 4)
VARIANCES(4, 2)
VARIANCES(2, 4)
VARIANCES(2, 2)

// Realtime-only builds (CONFIG_REALTIME_ONLY) omit the 1:4 and 4:1
// aspect-ratio sizes below; per the original note, realtime mode doesn't use
// these rectangular blocks.
#if !CONFIG_REALTIME_ONLY
VARIANCES(4, 16)
VARIANCES(16, 4)
VARIANCES(8, 32)
VARIANCES(32, 8)
VARIANCES(16, 64)
VARIANCES(64, 16)
#endif

// Raw sse/sum accessors, instantiated for the two square sizes below.
GET_VAR(16, 16)
GET_VAR(8, 8)

// MSE (sse only) functions.
MSE(16, 16)
MSE(16, 8)
MSE(8, 16)
MSE(8, 8)
305 
// Writes to comp_pred the rounded average of pred and ref for a
// width x height block: comp_pred = ROUND_POWER_OF_TWO(pred + ref, 1).
// comp_pred and pred rows are packed (width samples apart); ref rows are
// ref_stride samples apart.
void aom_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
                         int height, const uint8_t *ref, int ref_stride) {
  for (int rows_left = height; rows_left > 0; --rows_left) {
    for (int col = 0; col < width; ++col) {
      const int pair_sum = pred[col] + ref[col];
      comp_pred[col] = ROUND_POWER_OF_TWO(pair_sum, 1);
    }

    ref += ref_stride;
    pred += width;
    comp_pred += width;
  }
}
320 
aom_dist_wtd_comp_avg_pred_c(uint8_t * comp_pred,const uint8_t * pred,int width,int height,const uint8_t * ref,int ref_stride,const DIST_WTD_COMP_PARAMS * jcp_param)321 void aom_dist_wtd_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred,
322                                   int width, int height, const uint8_t *ref,
323                                   int ref_stride,
324                                   const DIST_WTD_COMP_PARAMS *jcp_param) {
325   int i, j;
326   const int fwd_offset = jcp_param->fwd_offset;
327   const int bck_offset = jcp_param->bck_offset;
328 
329   for (i = 0; i < height; ++i) {
330     for (j = 0; j < width; ++j) {
331       int tmp = pred[j] * bck_offset + ref[j] * fwd_offset;
332       tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS);
333       comp_pred[j] = (uint8_t)tmp;
334     }
335     comp_pred += width;
336     pred += width;
337     ref += ref_stride;
338   }
339 }
340 
341 #if CONFIG_AV1_HIGHBITDEPTH
// Accumulates the sum of differences (*sum) and the sum of squared
// differences (*sse) between two w x h blocks of 16-bit samples, using 64-bit
// totals. a8 and b8 are passed through CONVERT_TO_SHORTPTR to obtain the
// uint16_t sample pointers; a_stride and b_stride are in 16-bit samples.
static void highbd_variance64(const uint8_t *a8, int a_stride,
                              const uint8_t *b8, int b_stride, int w, int h,
                              uint64_t *sse, int64_t *sum) {
  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
  const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
  int64_t tsum = 0;
  uint64_t tsse = 0;
  for (int i = 0; i < h; ++i) {
    // Per-row difference sum, folded into the 64-bit total after each row.
    int32_t lsum = 0;
    for (int j = 0; j < w; ++j) {
      const int diff = a[j] - b[j];
      lsum += diff;
      // Square in 64 bits: |diff| can reach 65535 for arbitrary uint16_t
      // input, and 65535^2 does not fit in a signed 32-bit int.
      tsse += (uint64_t)((int64_t)diff * diff);
    }
    tsum += lsum;
    a += a_stride;
    b += b_stride;
  }
  *sum = tsum;
  *sse = tsse;
}
363 
// Returns the 64-bit sum of squared differences over a w x h block of
// high-bitdepth samples. The difference sum that highbd_variance64() also
// produces is discarded.
uint64_t aom_highbd_sse_odd_size(const uint8_t *a, int a_stride,
                                 const uint8_t *b, int b_stride, int w, int h) {
  int64_t unused_sum;
  uint64_t block_sse;

  highbd_variance64(a, a_stride, b, b_stride, w, h, &block_sse, &unused_sum);

  return block_sse;
}
371 
// 8-bit flavour of the high-bitdepth accumulator: runs highbd_variance64()
// and narrows its 64-bit results to uint32_t (*sse) and int (*sum) without
// any scaling.
static void highbd_8_variance(const uint8_t *a8, int a_stride,
                              const uint8_t *b8, int b_stride, int w, int h,
                              uint32_t *sse, int *sum) {
  int64_t wide_sum = 0;
  uint64_t wide_sse = 0;

  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &wide_sse, &wide_sum);

  *sse = (uint32_t)wide_sse;
  *sum = (int)wide_sum;
}
381 
// 10-bit flavour of the high-bitdepth accumulator: runs highbd_variance64(),
// then rounds and shifts the SSE right by 4 bits and the sum right by 2 bits
// before narrowing to uint32_t / int (presumably to bring 10-bit results
// down to the 8-bit scale; confirm against callers).
static void highbd_10_variance(const uint8_t *a8, int a_stride,
                               const uint8_t *b8, int b_stride, int w, int h,
                               uint32_t *sse, int *sum) {
  int64_t wide_sum = 0;
  uint64_t wide_sse = 0;

  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &wide_sse, &wide_sum);

  *sse = (uint32_t)ROUND_POWER_OF_TWO(wide_sse, 4);
  *sum = (int)ROUND_POWER_OF_TWO(wide_sum, 2);
}
391 
// 12-bit flavour of the high-bitdepth accumulator: runs highbd_variance64(),
// then rounds and shifts the SSE right by 8 bits and the sum right by 4 bits
// before narrowing to uint32_t / int (presumably to bring 12-bit results
// down to the 8-bit scale; confirm against callers).
static void highbd_12_variance(const uint8_t *a8, int a_stride,
                               const uint8_t *b8, int b_stride, int w, int h,
                               uint32_t *sse, int *sum) {
  int64_t wide_sum = 0;
  uint64_t wide_sse = 0;

  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &wide_sse, &wide_sum);

  *sse = (uint32_t)ROUND_POWER_OF_TWO(wide_sse, 8);
  *sum = (int)ROUND_POWER_OF_TWO(wide_sum, 4);
}
401 
/* Defines aom_highbd_{8,10,12}_variance<W>x<H>_c(). Each stores the SSE from
 * the matching highbd_*_variance() helper in *sse and returns
 * sse - sum^2 / (W * H). The 8-bit version subtracts in uint32_t; the 10- and
 * 12-bit versions compute the difference in int64_t and return 0 when it is
 * negative.
 */
#define HIGHBD_VAR(W, H)                                                       \
  uint32_t aom_highbd_8_variance##W##x##H##_c(const uint8_t *a, int a_stride,  \
                                              const uint8_t *b, int b_stride,  \
                                              uint32_t *sse) {                 \
    int diff_sum;                                                              \
    highbd_8_variance(a, a_stride, b, b_stride, W, H, sse, &diff_sum);         \
    const int64_t sum_sq = (int64_t)diff_sum * diff_sum;                       \
    return *sse - (uint32_t)(sum_sq / (W * H));                                \
  }                                                                            \
                                                                               \
  uint32_t aom_highbd_10_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
                                               const uint8_t *b, int b_stride, \
                                               uint32_t *sse) {                \
    int diff_sum;                                                              \
    highbd_10_variance(a, a_stride, b, b_stride, W, H, sse, &diff_sum);        \
    const int64_t sum_sq = (int64_t)diff_sum * diff_sum;                       \
    const int64_t var = (int64_t)(*sse) - (sum_sq / (W * H));                  \
    if (var < 0) return 0;                                                     \
    return (uint32_t)var;                                                      \
  }                                                                            \
                                                                               \
  uint32_t aom_highbd_12_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
                                               const uint8_t *b, int b_stride, \
                                               uint32_t *sse) {                \
    int diff_sum;                                                              \
    highbd_12_variance(a, a_stride, b, b_stride, W, H, sse, &diff_sum);        \
    const int64_t sum_sq = (int64_t)diff_sum * diff_sum;                       \
    const int64_t var = (int64_t)(*sse) - (sum_sq / (W * H));                  \
    if (var < 0) return 0;                                                     \
    return (uint32_t)var;                                                      \
  }
430 
/* Defines aom_highbd_{8,10,12}_get<S>x<S>var_c() for an S x S block. Each is
 * a thin wrapper that forwards to the matching highbd_*_variance() helper,
 * which writes the sum of squared differences to *sse and the sum of
 * differences to *sum.
 */
#define HIGHBD_GET_VAR(S)                                                \
  void aom_highbd_8_get##S##x##S##var_c(                                 \
      const uint8_t *src, int src_stride, const uint8_t *ref,            \
      int ref_stride, uint32_t *sse, int *sum) {                         \
    highbd_8_variance(src, src_stride, ref, ref_stride, (S), (S), sse,   \
                      sum);                                              \
  }                                                                      \
                                                                         \
  void aom_highbd_10_get##S##x##S##var_c(                                \
      const uint8_t *src, int src_stride, const uint8_t *ref,            \
      int ref_stride, uint32_t *sse, int *sum) {                         \
    highbd_10_variance(src, src_stride, ref, ref_stride, (S), (S), sse,  \
                       sum);                                             \
  }                                                                      \
                                                                         \
  void aom_highbd_12_get##S##x##S##var_c(                                \
      const uint8_t *src, int src_stride, const uint8_t *ref,            \
      int ref_stride, uint32_t *sse, int *sum) {                         \
    highbd_12_variance(src, src_stride, ref, ref_stride, (S), (S), sse,  \
                       sum);                                             \
  }
449 
/* Defines aom_highbd_{8,10,12}_mse<W>x<H>_c(). Each runs the matching
 * highbd_*_variance() helper, stores the resulting SSE in *sse and returns
 * that same value; the difference sum is computed but not used.
 */
#define HIGHBD_MSE(W, H)                                                      \
  uint32_t aom_highbd_8_mse##W##x##H##_c(const uint8_t *src, int src_stride,  \
                                         const uint8_t *ref, int ref_stride,  \
                                         uint32_t *sse) {                     \
    int unused_sum;                                                           \
    highbd_8_variance(src, src_stride, ref, ref_stride, W, H, sse,            \
                      &unused_sum);                                           \
    return *sse;                                                              \
  }                                                                           \
                                                                              \
  uint32_t aom_highbd_10_mse##W##x##H##_c(const uint8_t *src, int src_stride, \
                                          const uint8_t *ref, int ref_stride, \
                                          uint32_t *sse) {                    \
    int unused_sum;                                                           \
    highbd_10_variance(src, src_stride, ref, ref_stride, W, H, sse,           \
                       &unused_sum);                                          \
    return *sse;                                                              \
  }                                                                           \
                                                                              \
  uint32_t aom_highbd_12_mse##W##x##H##_c(const uint8_t *src, int src_stride, \
                                          const uint8_t *ref, int ref_stride, \
                                          uint32_t *sse) {                    \
    int unused_sum;                                                           \
    highbd_12_variance(src, src_stride, ref, ref_stride, W, H, sse,           \
                       &unused_sum);                                          \
    return *sse;                                                              \
  }
474 
aom_highbd_var_filter_block2d_bil_first_pass(const uint8_t * src_ptr8,uint16_t * output_ptr,unsigned int src_pixels_per_line,int pixel_step,unsigned int output_height,unsigned int output_width,const uint8_t * filter)475 void aom_highbd_var_filter_block2d_bil_first_pass(
476     const uint8_t *src_ptr8, uint16_t *output_ptr,
477     unsigned int src_pixels_per_line, int pixel_step,
478     unsigned int output_height, unsigned int output_width,
479     const uint8_t *filter) {
480   unsigned int i, j;
481   uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src_ptr8);
482   for (i = 0; i < output_height; ++i) {
483     for (j = 0; j < output_width; ++j) {
484       output_ptr[j] = ROUND_POWER_OF_TWO(
485           (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1],
486           FILTER_BITS);
487 
488       ++src_ptr;
489     }
490 
491     // Next row...
492     src_ptr += src_pixels_per_line - output_width;
493     output_ptr += output_width;
494   }
495 }
496 
aom_highbd_var_filter_block2d_bil_second_pass(const uint16_t * src_ptr,uint16_t * output_ptr,unsigned int src_pixels_per_line,unsigned int pixel_step,unsigned int output_height,unsigned int output_width,const uint8_t * filter)497 void aom_highbd_var_filter_block2d_bil_second_pass(
498     const uint16_t *src_ptr, uint16_t *output_ptr,
499     unsigned int src_pixels_per_line, unsigned int pixel_step,
500     unsigned int output_height, unsigned int output_width,
501     const uint8_t *filter) {
502   unsigned int i, j;
503 
504   for (i = 0; i < output_height; ++i) {
505     for (j = 0; j < output_width; ++j) {
506       output_ptr[j] = ROUND_POWER_OF_TWO(
507           (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1],
508           FILTER_BITS);
509       ++src_ptr;
510     }
511 
512     src_ptr += src_pixels_per_line - output_width;
513     output_ptr += output_width;
514   }
515 }
516 
/* Defines aom_highbd_{8,10,12}_sub_pixel_variance<W>x<H>_c(). Each bilinearly
 * filters the high-bitdepth block src horizontally (taps
 * bilinear_filters_2t[xoffset]) and then vertically (taps
 * bilinear_filters_2t[yoffset]) into a local 16-bit buffer, then returns the
 * matching aom_highbd_*_variance<W>x<H>_c() of that buffer against dst,
 * storing the SSE in *sse. The local buffer is wrapped with
 * CONVERT_TO_BYTEPTR before being handed to the variance function.
 */
#define HIGHBD_SUBPIX_VAR(W, H)                                              \
  uint32_t aom_highbd_8_sub_pixel_variance##W##x##H##_c(                     \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *dst, int dst_stride, uint32_t *sse) {                   \
    /* H + 1 rows: the vertical pass reads one row below each output. */     \
    uint16_t horiz[(H + 1) * W];                                             \
    uint16_t filtered[H * W];                                                \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        src, horiz, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        horiz, filtered, W, W, H, W, bilinear_filters_2t[yoffset]);          \
                                                                             \
    return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(filtered),  \
                                              W, dst, dst_stride, sse);      \
  }                                                                          \
                                                                             \
  uint32_t aom_highbd_10_sub_pixel_variance##W##x##H##_c(                    \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *dst, int dst_stride, uint32_t *sse) {                   \
    uint16_t horiz[(H + 1) * W];                                             \
    uint16_t filtered[H * W];                                                \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        src, horiz, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        horiz, filtered, W, W, H, W, bilinear_filters_2t[yoffset]);          \
                                                                             \
    return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(filtered), \
                                               W, dst, dst_stride, sse);     \
  }                                                                          \
                                                                             \
  uint32_t aom_highbd_12_sub_pixel_variance##W##x##H##_c(                    \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *dst, int dst_stride, uint32_t *sse) {                   \
    uint16_t horiz[(H + 1) * W];                                             \
    uint16_t filtered[H * W];                                                \
                                                                             \
    aom_highbd_var_filter_block2d_bil_first_pass(                            \
        src, horiz, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
    aom_highbd_var_filter_block2d_bil_second_pass(                           \
        horiz, filtered, W, W, H, W, bilinear_filters_2t[yoffset]);          \
                                                                             \
    return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(filtered), \
                                               W, dst, dst_stride, sse);     \
  }
562 
/*
 * HIGHBD_SUBPIX_AVG_VAR(W, H) defines the C reference versions of the
 * high-bitdepth compound sub-pixel variance functions for one WxH block size:
 *
 *   aom_highbd_{8,10,12}_sub_pixel_avg_variance<W>x<H>_c
 *   aom_highbd_{8,10,12}_dist_wtd_sub_pixel_avg_variance<W>x<H>_c
 *
 * Every generated function performs the same four steps:
 *   1. run the first bilinear pass over src with the filter selected by
 *      xoffset, producing H + 1 rows of W samples in fdata3;
 *   2. run the second bilinear pass over fdata3 with the filter selected by
 *      yoffset, producing H rows of W samples in temp2;
 *   3. combine temp2 with second_pred into temp3, either with
 *      aom_highbd_comp_avg_pred_c (rounded average) or with
 *      aom_highbd_dist_wtd_comp_avg_pred_c (weights taken from jcp_param);
 *   4. return the result of the matching aom_highbd_<bd>_variance<W>x<H>_c
 *      call on temp3 versus dst, which also receives the sse pointer.
 *
 * All helpers are invoked through their _c entry points, including in the
 * dist_wtd variants, so the reference implementation never goes through the
 * run-time dispatched (possibly SIMD) symbols.
 */
#define HIGHBD_SUBPIX_AVG_VAR(W, H)                                           \
  uint32_t aom_highbd_8_sub_pixel_avg_variance##W##x##H##_c(                  \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
      const uint8_t *dst, int dst_stride, uint32_t *sse,                      \
      const uint8_t *second_pred) {                                           \
    uint16_t fdata3[(H + 1) * W];                                             \
    uint16_t temp2[H * W];                                                    \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                              \
                                                                              \
    aom_highbd_var_filter_block2d_bil_first_pass(                             \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
    aom_highbd_var_filter_block2d_bil_second_pass(                            \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
                                                                              \
    aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H,  \
                               CONVERT_TO_BYTEPTR(temp2), W);                 \
                                                                              \
    return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,   \
                                              dst, dst_stride, sse);          \
  }                                                                           \
                                                                              \
  uint32_t aom_highbd_10_sub_pixel_avg_variance##W##x##H##_c(                 \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
      const uint8_t *dst, int dst_stride, uint32_t *sse,                      \
      const uint8_t *second_pred) {                                           \
    uint16_t fdata3[(H + 1) * W];                                             \
    uint16_t temp2[H * W];                                                    \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                              \
                                                                              \
    aom_highbd_var_filter_block2d_bil_first_pass(                             \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
    aom_highbd_var_filter_block2d_bil_second_pass(                            \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
                                                                              \
    aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H,  \
                               CONVERT_TO_BYTEPTR(temp2), W);                 \
                                                                              \
    return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,  \
                                               dst, dst_stride, sse);         \
  }                                                                           \
                                                                              \
  uint32_t aom_highbd_12_sub_pixel_avg_variance##W##x##H##_c(                 \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
      const uint8_t *dst, int dst_stride, uint32_t *sse,                      \
      const uint8_t *second_pred) {                                           \
    uint16_t fdata3[(H + 1) * W];                                             \
    uint16_t temp2[H * W];                                                    \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                              \
                                                                              \
    aom_highbd_var_filter_block2d_bil_first_pass(                             \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
    aom_highbd_var_filter_block2d_bil_second_pass(                            \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
                                                                              \
    aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H,  \
                               CONVERT_TO_BYTEPTR(temp2), W);                 \
                                                                              \
    return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,  \
                                               dst, dst_stride, sse);         \
  }                                                                           \
                                                                              \
  uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance##W##x##H##_c(         \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
      const uint8_t *dst, int dst_stride, uint32_t *sse,                      \
      const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) {    \
    uint16_t fdata3[(H + 1) * W];                                             \
    uint16_t temp2[H * W];                                                    \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                              \
                                                                              \
    aom_highbd_var_filter_block2d_bil_first_pass(                             \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
    aom_highbd_var_filter_block2d_bil_second_pass(                            \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
                                                                              \
    aom_highbd_dist_wtd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3),            \
                                        second_pred, W, H,                    \
                                        CONVERT_TO_BYTEPTR(temp2), W,         \
                                        jcp_param);                           \
                                                                              \
    return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,   \
                                              dst, dst_stride, sse);          \
  }                                                                           \
                                                                              \
  uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance##W##x##H##_c(        \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
      const uint8_t *dst, int dst_stride, uint32_t *sse,                      \
      const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) {    \
    uint16_t fdata3[(H + 1) * W];                                             \
    uint16_t temp2[H * W];                                                    \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                              \
                                                                              \
    aom_highbd_var_filter_block2d_bil_first_pass(                             \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
    aom_highbd_var_filter_block2d_bil_second_pass(                            \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
                                                                              \
    aom_highbd_dist_wtd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3),            \
                                        second_pred, W, H,                    \
                                        CONVERT_TO_BYTEPTR(temp2), W,         \
                                        jcp_param);                           \
                                                                              \
    return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,  \
                                               dst, dst_stride, sse);         \
  }                                                                           \
                                                                              \
  uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance##W##x##H##_c(        \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
      const uint8_t *dst, int dst_stride, uint32_t *sse,                      \
      const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) {    \
    uint16_t fdata3[(H + 1) * W];                                             \
    uint16_t temp2[H * W];                                                    \
    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                              \
                                                                              \
    aom_highbd_var_filter_block2d_bil_first_pass(                             \
        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
    aom_highbd_var_filter_block2d_bil_second_pass(                            \
        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
                                                                              \
    aom_highbd_dist_wtd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3),            \
                                        second_pred, W, H,                    \
                                        CONVERT_TO_BYTEPTR(temp2), W,         \
                                        jcp_param);                           \
                                                                              \
    return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,  \
                                               dst, dst_stride, sse);         \
  }
686 
/*
 * All three forms of the variance are available in the same sizes:
 * HIGHBD_VARIANCES(W, H) expands, for one WxH block size, to the definitions
 * produced by HIGHBD_VAR, HIGHBD_SUBPIX_VAR and HIGHBD_SUBPIX_AVG_VAR.
 */
#define HIGHBD_VARIANCES(W, H) \
  HIGHBD_VAR(W, H)             \
  HIGHBD_SUBPIX_VAR(W, H)      \
  HIGHBD_SUBPIX_AVG_VAR(W, H)
692 
// Instantiate the high-bitdepth variance function families, one
// HIGHBD_VARIANCES(W, H) expansion per supported WxH block size.
HIGHBD_VARIANCES(128, 128)
HIGHBD_VARIANCES(128, 64)
HIGHBD_VARIANCES(64, 128)
HIGHBD_VARIANCES(64, 64)
HIGHBD_VARIANCES(64, 32)
HIGHBD_VARIANCES(32, 64)
HIGHBD_VARIANCES(32, 32)
HIGHBD_VARIANCES(32, 16)
HIGHBD_VARIANCES(16, 32)
HIGHBD_VARIANCES(16, 16)
HIGHBD_VARIANCES(16, 8)
HIGHBD_VARIANCES(8, 16)
HIGHBD_VARIANCES(8, 8)
HIGHBD_VARIANCES(8, 4)
HIGHBD_VARIANCES(4, 8)
HIGHBD_VARIANCES(4, 4)
HIGHBD_VARIANCES(4, 2)
HIGHBD_VARIANCES(2, 4)
HIGHBD_VARIANCES(2, 2)

// Realtime mode doesn't use rectangular blocks with a 4:1 or 1:4 aspect
// ratio, so these sizes are only built when CONFIG_REALTIME_ONLY is 0.
#if !CONFIG_REALTIME_ONLY
HIGHBD_VARIANCES(4, 16)
HIGHBD_VARIANCES(16, 4)
HIGHBD_VARIANCES(8, 32)
HIGHBD_VARIANCES(32, 8)
HIGHBD_VARIANCES(16, 64)
HIGHBD_VARIANCES(64, 16)
#endif

// HIGHBD_GET_VAR is instantiated for sizes 8 and 16 only.
HIGHBD_GET_VAR(8)
HIGHBD_GET_VAR(16)

// HIGHBD_MSE is instantiated for 16x16, 16x8, 8x16 and 8x8 only.
HIGHBD_MSE(16, 16)
HIGHBD_MSE(16, 8)
HIGHBD_MSE(8, 16)
HIGHBD_MSE(8, 8)
730 
/*
 * Builds a compound prediction as the rounded average of two high-bitdepth
 * predictions.
 *
 * comp_pred8, pred8 and ref8 are converted with CONVERT_TO_SHORTPTR and then
 * accessed as uint16_t samples. The output and pred advance by `width`
 * samples per row; ref advances by `ref_stride` samples per row.
 */
void aom_highbd_comp_avg_pred_c(uint8_t *comp_pred8, const uint8_t *pred8,
                                int width, int height, const uint8_t *ref8,
                                int ref_stride) {
  const uint16_t *pred_row = CONVERT_TO_SHORTPTR(pred8);
  const uint16_t *ref_row = CONVERT_TO_SHORTPTR(ref8);
  uint16_t *out_row = CONVERT_TO_SHORTPTR(comp_pred8);

  for (int row = 0; row < height; ++row) {
    for (int col = 0; col < width; ++col) {
      // (pred + ref + 1) >> 1
      const int pair_sum = pred_row[col] + ref_row[col];
      out_row[col] = ROUND_POWER_OF_TWO(pair_sum, 1);
    }
    out_row += width;
    pred_row += width;
    ref_row += ref_stride;
  }
}
748 
aom_highbd_dist_wtd_comp_avg_pred_c(uint8_t * comp_pred8,const uint8_t * pred8,int width,int height,const uint8_t * ref8,int ref_stride,const DIST_WTD_COMP_PARAMS * jcp_param)749 void aom_highbd_dist_wtd_comp_avg_pred_c(
750     uint8_t *comp_pred8, const uint8_t *pred8, int width, int height,
751     const uint8_t *ref8, int ref_stride,
752     const DIST_WTD_COMP_PARAMS *jcp_param) {
753   int i, j;
754   const int fwd_offset = jcp_param->fwd_offset;
755   const int bck_offset = jcp_param->bck_offset;
756   uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
757   uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
758   uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
759 
760   for (i = 0; i < height; ++i) {
761     for (j = 0; j < width; ++j) {
762       int tmp = pred[j] * bck_offset + ref[j] * fwd_offset;
763       tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS);
764       comp_pred[j] = (uint16_t)tmp;
765     }
766     comp_pred += width;
767     pred += width;
768     ref += ref_stride;
769   }
770 }
771 #endif  // CONFIG_AV1_HIGHBITDEPTH
772 
/*
 * Builds a masked compound prediction from two 8-bit predictions.
 *
 * Each output sample is AOM_BLEND_A64(mask, a, b). With invert_mask == 0 the
 * operands are (ref, pred); with invert_mask != 0 they are swapped to
 * (pred, ref). comp_pred and pred advance by `width` per row, ref by
 * `ref_stride` and mask by `mask_stride`.
 */
void aom_comp_mask_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
                          int height, const uint8_t *ref, int ref_stride,
                          const uint8_t *mask, int mask_stride,
                          int invert_mask) {
  // Default operand order: first = ref, second = pred.
  const uint8_t *first_row = ref;
  const uint8_t *second_row = pred;
  int first_step = ref_stride;
  int second_step = width;

  if (invert_mask) {
    first_row = pred;
    second_row = ref;
    first_step = width;
    second_step = ref_stride;
  }

  for (int row = 0; row < height; ++row) {
    for (int col = 0; col < width; ++col) {
      comp_pred[col] = AOM_BLEND_A64(mask[col], first_row[col], second_row[col]);
    }
    comp_pred += width;
    first_row += first_step;
    second_row += second_step;
    mask += mask_stride;
  }
}
792 
/*
 * MASK_SUBPIX_VAR(W, H) defines aom_masked_sub_pixel_variance<W>x<H>_c.
 *
 * The generated function runs the two bilinear filter passes over src
 * (filters selected by xoffset, then yoffset), blends the filtered block with
 * second_pred through aom_comp_mask_pred_c using msk / invert_mask, and
 * returns aom_variance<W>x<H>_c of the blended block against ref, passing
 * sse through to that call.
 */
#define MASK_SUBPIX_VAR(W, H)                                                 \
  unsigned int aom_masked_sub_pixel_variance##W##x##H##_c(                    \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
      const uint8_t *ref, int ref_stride, const uint8_t *second_pred,         \
      const uint8_t *msk, int msk_stride, int invert_mask,                    \
      unsigned int *sse) {                                                    \
    uint16_t first_pass_out[(H + 1) * W];                                     \
    uint8_t second_pass_out[H * W];                                           \
    DECLARE_ALIGNED(16, uint8_t, blend_out[H * W]);                           \
                                                                              \
    aom_var_filter_block2d_bil_first_pass_c(src, first_pass_out, src_stride,  \
                                            1, H + 1, W,                      \
                                            bilinear_filters_2t[xoffset]);    \
    aom_var_filter_block2d_bil_second_pass_c(first_pass_out, second_pass_out, \
                                             W, W, H, W,                      \
                                             bilinear_filters_2t[yoffset]);   \
                                                                              \
    aom_comp_mask_pred_c(blend_out, second_pred, W, H, second_pass_out, W,    \
                         msk, msk_stride, invert_mask);                       \
    return aom_variance##W##x##H##_c(blend_out, W, ref, ref_stride, sse);     \
  }
812 
// Instantiate aom_masked_sub_pixel_variance<W>x<H>_c for each block size.
MASK_SUBPIX_VAR(4, 4)
MASK_SUBPIX_VAR(4, 8)
MASK_SUBPIX_VAR(8, 4)
MASK_SUBPIX_VAR(8, 8)
MASK_SUBPIX_VAR(8, 16)
MASK_SUBPIX_VAR(16, 8)
MASK_SUBPIX_VAR(16, 16)
MASK_SUBPIX_VAR(16, 32)
MASK_SUBPIX_VAR(32, 16)
MASK_SUBPIX_VAR(32, 32)
MASK_SUBPIX_VAR(32, 64)
MASK_SUBPIX_VAR(64, 32)
MASK_SUBPIX_VAR(64, 64)
MASK_SUBPIX_VAR(64, 128)
MASK_SUBPIX_VAR(128, 64)
MASK_SUBPIX_VAR(128, 128)

// Realtime mode doesn't use rectangular blocks with a 4:1 or 1:4 aspect
// ratio, so these sizes are only built when CONFIG_REALTIME_ONLY is 0.
#if !CONFIG_REALTIME_ONLY
MASK_SUBPIX_VAR(4, 16)
MASK_SUBPIX_VAR(16, 4)
MASK_SUBPIX_VAR(8, 32)
MASK_SUBPIX_VAR(32, 8)
MASK_SUBPIX_VAR(16, 64)
MASK_SUBPIX_VAR(64, 16)
#endif
839 
840 #if CONFIG_AV1_HIGHBITDEPTH
/*
 * Builds a masked compound prediction from two high-bitdepth predictions.
 *
 * comp_pred8, pred8 and ref8 are converted with CONVERT_TO_SHORTPTR and
 * accessed as uint16_t samples; mask is read as plain uint8_t. Each output
 * sample is AOM_BLEND_A64(mask, ref, pred) when invert_mask is 0 and
 * AOM_BLEND_A64(mask, pred, ref) otherwise. The output and pred advance by
 * `width` per row, ref by `ref_stride` and mask by `mask_stride`.
 */
void aom_highbd_comp_mask_pred_c(uint8_t *comp_pred8, const uint8_t *pred8,
                                 int width, int height, const uint8_t *ref8,
                                 int ref_stride, const uint8_t *mask,
                                 int mask_stride, int invert_mask) {
  const uint16_t *pred_row = CONVERT_TO_SHORTPTR(pred8);
  const uint16_t *ref_row = CONVERT_TO_SHORTPTR(ref8);
  uint16_t *out_row = CONVERT_TO_SHORTPTR(comp_pred8);

  for (int row = 0; row < height; ++row) {
    if (invert_mask) {
      for (int col = 0; col < width; ++col) {
        out_row[col] = AOM_BLEND_A64(mask[col], pred_row[col], ref_row[col]);
      }
    } else {
      for (int col = 0; col < width; ++col) {
        out_row[col] = AOM_BLEND_A64(mask[col], ref_row[col], pred_row[col]);
      }
    }
    out_row += width;
    pred_row += width;
    ref_row += ref_stride;
    mask += mask_stride;
  }
}
862 
/*
 * HIGHBD_MASK_SUBPIX_VAR(W, H) defines
 *   aom_highbd_{8,10,12}_masked_sub_pixel_variance<W>x<H>_c.
 *
 * Each generated function runs the two high-bitdepth bilinear filter passes
 * over src (filters selected by xoffset, then yoffset), blends the filtered
 * block with second_pred through aom_highbd_comp_mask_pred_c using
 * msk / invert_mask, and returns the matching
 * aom_highbd_<bd>_variance<W>x<H>_c of the blended block against ref,
 * passing sse through to that call. The three variants differ only in which
 * variance function they call.
 */
#define HIGHBD_MASK_SUBPIX_VAR(W, H)                                          \
  unsigned int aom_highbd_8_masked_sub_pixel_variance##W##x##H##_c(           \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
      const uint8_t *ref, int ref_stride, const uint8_t *second_pred,         \
      const uint8_t *msk, int msk_stride, int invert_mask,                    \
      unsigned int *sse) {                                                    \
    uint16_t first_pass_out[(H + 1) * W];                                     \
    uint16_t second_pass_out[H * W];                                          \
    DECLARE_ALIGNED(16, uint16_t, blend_out[H * W]);                          \
                                                                              \
    aom_highbd_var_filter_block2d_bil_first_pass(                             \
        src, first_pass_out, src_stride, 1, H + 1, W,                         \
        bilinear_filters_2t[xoffset]);                                        \
    aom_highbd_var_filter_block2d_bil_second_pass(                            \
        first_pass_out, second_pass_out, W, W, H, W,                          \
        bilinear_filters_2t[yoffset]);                                        \
                                                                              \
    aom_highbd_comp_mask_pred_c(CONVERT_TO_BYTEPTR(blend_out), second_pred,   \
                                W, H, CONVERT_TO_BYTEPTR(second_pass_out), W, \
                                msk, msk_stride, invert_mask);                \
                                                                              \
    return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(blend_out),  \
                                              W, ref, ref_stride, sse);       \
  }                                                                           \
                                                                              \
  unsigned int aom_highbd_10_masked_sub_pixel_variance##W##x##H##_c(          \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
      const uint8_t *ref, int ref_stride, const uint8_t *second_pred,         \
      const uint8_t *msk, int msk_stride, int invert_mask,                    \
      unsigned int *sse) {                                                    \
    uint16_t first_pass_out[(H + 1) * W];                                     \
    uint16_t second_pass_out[H * W];                                          \
    DECLARE_ALIGNED(16, uint16_t, blend_out[H * W]);                          \
                                                                              \
    aom_highbd_var_filter_block2d_bil_first_pass(                             \
        src, first_pass_out, src_stride, 1, H + 1, W,                         \
        bilinear_filters_2t[xoffset]);                                        \
    aom_highbd_var_filter_block2d_bil_second_pass(                            \
        first_pass_out, second_pass_out, W, W, H, W,                          \
        bilinear_filters_2t[yoffset]);                                        \
                                                                              \
    aom_highbd_comp_mask_pred_c(CONVERT_TO_BYTEPTR(blend_out), second_pred,   \
                                W, H, CONVERT_TO_BYTEPTR(second_pass_out), W, \
                                msk, msk_stride, invert_mask);                \
                                                                              \
    return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(blend_out), \
                                               W, ref, ref_stride, sse);      \
  }                                                                           \
                                                                              \
  unsigned int aom_highbd_12_masked_sub_pixel_variance##W##x##H##_c(          \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
      const uint8_t *ref, int ref_stride, const uint8_t *second_pred,         \
      const uint8_t *msk, int msk_stride, int invert_mask,                    \
      unsigned int *sse) {                                                    \
    uint16_t first_pass_out[(H + 1) * W];                                     \
    uint16_t second_pass_out[H * W];                                          \
    DECLARE_ALIGNED(16, uint16_t, blend_out[H * W]);                          \
                                                                              \
    aom_highbd_var_filter_block2d_bil_first_pass(                             \
        src, first_pass_out, src_stride, 1, H + 1, W,                         \
        bilinear_filters_2t[xoffset]);                                        \
    aom_highbd_var_filter_block2d_bil_second_pass(                            \
        first_pass_out, second_pass_out, W, W, H, W,                          \
        bilinear_filters_2t[yoffset]);                                        \
                                                                              \
    aom_highbd_comp_mask_pred_c(CONVERT_TO_BYTEPTR(blend_out), second_pred,   \
                                W, H, CONVERT_TO_BYTEPTR(second_pass_out), W, \
                                msk, msk_stride, invert_mask);                \
                                                                              \
    return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(blend_out), \
                                               W, ref, ref_stride, sse);      \
  }
929 
// Instantiate the 8/10/12-bit masked sub-pixel variance functions for each
// block size.
HIGHBD_MASK_SUBPIX_VAR(4, 4)
HIGHBD_MASK_SUBPIX_VAR(4, 8)
HIGHBD_MASK_SUBPIX_VAR(8, 4)
HIGHBD_MASK_SUBPIX_VAR(8, 8)
HIGHBD_MASK_SUBPIX_VAR(8, 16)
HIGHBD_MASK_SUBPIX_VAR(16, 8)
HIGHBD_MASK_SUBPIX_VAR(16, 16)
HIGHBD_MASK_SUBPIX_VAR(16, 32)
HIGHBD_MASK_SUBPIX_VAR(32, 16)
HIGHBD_MASK_SUBPIX_VAR(32, 32)
HIGHBD_MASK_SUBPIX_VAR(32, 64)
HIGHBD_MASK_SUBPIX_VAR(64, 32)
HIGHBD_MASK_SUBPIX_VAR(64, 64)
HIGHBD_MASK_SUBPIX_VAR(64, 128)
HIGHBD_MASK_SUBPIX_VAR(128, 64)
HIGHBD_MASK_SUBPIX_VAR(128, 128)
// The 4:1 and 1:4 block sizes are only built when CONFIG_REALTIME_ONLY is 0.
#if !CONFIG_REALTIME_ONLY
HIGHBD_MASK_SUBPIX_VAR(4, 16)
HIGHBD_MASK_SUBPIX_VAR(16, 4)
HIGHBD_MASK_SUBPIX_VAR(8, 32)
HIGHBD_MASK_SUBPIX_VAR(32, 8)
HIGHBD_MASK_SUBPIX_VAR(16, 64)
HIGHBD_MASK_SUBPIX_VAR(64, 16)
#endif
954 #endif  // CONFIG_AV1_HIGHBITDEPTH
955 
956 #if !CONFIG_REALTIME_ONLY
obmc_variance(const uint8_t * pre,int pre_stride,const int32_t * wsrc,const int32_t * mask,int w,int h,unsigned int * sse,int * sum)957 static INLINE void obmc_variance(const uint8_t *pre, int pre_stride,
958                                  const int32_t *wsrc, const int32_t *mask,
959                                  int w, int h, unsigned int *sse, int *sum) {
960   int i, j;
961 
962   *sse = 0;
963   *sum = 0;
964 
965   for (i = 0; i < h; i++) {
966     for (j = 0; j < w; j++) {
967       int diff = ROUND_POWER_OF_TWO_SIGNED(wsrc[j] - pre[j] * mask[j], 12);
968       *sum += diff;
969       *sse += diff * diff;
970     }
971 
972     pre += pre_stride;
973     wsrc += w;
974     mask += w;
975   }
976 }
977 
/*
 * OBMC_VAR(W, H) defines aom_obmc_variance<W>x<H>_c, which gathers the error
 * sum and sum of squares with obmc_variance() (the latter stored in *sse)
 * and returns *sse - sum * sum / (W * H).
 */
#define OBMC_VAR(W, H)                                                  \
  unsigned int aom_obmc_variance##W##x##H##_c(                          \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,          \
      const int32_t *mask, unsigned int *sse) {                         \
    int err_sum;                                                        \
    obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &err_sum);    \
    const int64_t sum_squared = (int64_t)err_sum * err_sum;             \
    return *sse - (unsigned int)(sum_squared / (W * H));                \
  }
986 
/*
 * OBMC_SUBPIX_VAR(W, H) defines aom_obmc_sub_pixel_variance<W>x<H>_c.
 *
 * The generated function runs the two bilinear filter passes over pre
 * (filters selected by xoffset, then yoffset) and returns
 * aom_obmc_variance<W>x<H>_c of the filtered block, which is stored with a
 * row stride of W.
 */
#define OBMC_SUBPIX_VAR(W, H)                                                 \
  unsigned int aom_obmc_sub_pixel_variance##W##x##H##_c(                      \
      const uint8_t *pre, int pre_stride, int xoffset, int yoffset,           \
      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {          \
    uint16_t first_pass_out[(H + 1) * W];                                     \
    uint8_t second_pass_out[H * W];                                           \
                                                                              \
    aom_var_filter_block2d_bil_first_pass_c(pre, first_pass_out, pre_stride,  \
                                            1, H + 1, W,                      \
                                            bilinear_filters_2t[xoffset]);    \
    aom_var_filter_block2d_bil_second_pass_c(first_pass_out, second_pass_out, \
                                             W, W, H, W,                      \
                                             bilinear_filters_2t[yoffset]);   \
                                                                              \
    return aom_obmc_variance##W##x##H##_c(second_pass_out, W, wsrc, mask,     \
                                          sse);                               \
  }
1001 
// Instantiate the 8-bit OBMC variance and sub-pixel variance functions for
// each block size. OBMC_VAR must precede OBMC_SUBPIX_VAR for a given size
// because the sub-pixel function calls aom_obmc_variance<W>x<H>_c.
OBMC_VAR(4, 4)
OBMC_SUBPIX_VAR(4, 4)

OBMC_VAR(4, 8)
OBMC_SUBPIX_VAR(4, 8)

OBMC_VAR(8, 4)
OBMC_SUBPIX_VAR(8, 4)

OBMC_VAR(8, 8)
OBMC_SUBPIX_VAR(8, 8)

OBMC_VAR(8, 16)
OBMC_SUBPIX_VAR(8, 16)

OBMC_VAR(16, 8)
OBMC_SUBPIX_VAR(16, 8)

OBMC_VAR(16, 16)
OBMC_SUBPIX_VAR(16, 16)

OBMC_VAR(16, 32)
OBMC_SUBPIX_VAR(16, 32)

OBMC_VAR(32, 16)
OBMC_SUBPIX_VAR(32, 16)

OBMC_VAR(32, 32)
OBMC_SUBPIX_VAR(32, 32)

OBMC_VAR(32, 64)
OBMC_SUBPIX_VAR(32, 64)

OBMC_VAR(64, 32)
OBMC_SUBPIX_VAR(64, 32)

OBMC_VAR(64, 64)
OBMC_SUBPIX_VAR(64, 64)

OBMC_VAR(64, 128)
OBMC_SUBPIX_VAR(64, 128)

OBMC_VAR(128, 64)
OBMC_SUBPIX_VAR(128, 64)

OBMC_VAR(128, 128)
OBMC_SUBPIX_VAR(128, 128)

// 4:1 and 1:4 block sizes. No separate guard is needed here: this whole
// OBMC section is already inside #if !CONFIG_REALTIME_ONLY.
OBMC_VAR(4, 16)
OBMC_SUBPIX_VAR(4, 16)
OBMC_VAR(16, 4)
OBMC_SUBPIX_VAR(16, 4)
OBMC_VAR(8, 32)
OBMC_SUBPIX_VAR(8, 32)
OBMC_VAR(32, 8)
OBMC_SUBPIX_VAR(32, 8)
OBMC_VAR(16, 64)
OBMC_SUBPIX_VAR(16, 64)
OBMC_VAR(64, 16)
OBMC_SUBPIX_VAR(64, 16)
1062 
1063 #if CONFIG_AV1_HIGHBITDEPTH
highbd_obmc_variance64(const uint8_t * pre8,int pre_stride,const int32_t * wsrc,const int32_t * mask,int w,int h,uint64_t * sse,int64_t * sum)1064 static INLINE void highbd_obmc_variance64(const uint8_t *pre8, int pre_stride,
1065                                           const int32_t *wsrc,
1066                                           const int32_t *mask, int w, int h,
1067                                           uint64_t *sse, int64_t *sum) {
1068   int i, j;
1069   uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
1070 
1071   *sse = 0;
1072   *sum = 0;
1073 
1074   for (i = 0; i < h; i++) {
1075     for (j = 0; j < w; j++) {
1076       int diff = ROUND_POWER_OF_TWO_SIGNED(wsrc[j] - pre[j] * mask[j], 12);
1077       *sum += diff;
1078       *sse += diff * diff;
1079     }
1080 
1081     pre += pre_stride;
1082     wsrc += w;
1083     mask += w;
1084   }
1085 }
1086 
highbd_obmc_variance(const uint8_t * pre8,int pre_stride,const int32_t * wsrc,const int32_t * mask,int w,int h,unsigned int * sse,int * sum)1087 static INLINE void highbd_obmc_variance(const uint8_t *pre8, int pre_stride,
1088                                         const int32_t *wsrc,
1089                                         const int32_t *mask, int w, int h,
1090                                         unsigned int *sse, int *sum) {
1091   int64_t sum64;
1092   uint64_t sse64;
1093   highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &sse64, &sum64);
1094   *sum = (int)sum64;
1095   *sse = (unsigned int)sse64;
1096 }
1097 
highbd_10_obmc_variance(const uint8_t * pre8,int pre_stride,const int32_t * wsrc,const int32_t * mask,int w,int h,unsigned int * sse,int * sum)1098 static INLINE void highbd_10_obmc_variance(const uint8_t *pre8, int pre_stride,
1099                                            const int32_t *wsrc,
1100                                            const int32_t *mask, int w, int h,
1101                                            unsigned int *sse, int *sum) {
1102   int64_t sum64;
1103   uint64_t sse64;
1104   highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &sse64, &sum64);
1105   *sum = (int)ROUND_POWER_OF_TWO(sum64, 2);
1106   *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 4);
1107 }
1108 
highbd_12_obmc_variance(const uint8_t * pre8,int pre_stride,const int32_t * wsrc,const int32_t * mask,int w,int h,unsigned int * sse,int * sum)1109 static INLINE void highbd_12_obmc_variance(const uint8_t *pre8, int pre_stride,
1110                                            const int32_t *wsrc,
1111                                            const int32_t *mask, int w, int h,
1112                                            unsigned int *sse, int *sum) {
1113   int64_t sum64;
1114   uint64_t sse64;
1115   highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &sse64, &sum64);
1116   *sum = (int)ROUND_POWER_OF_TWO(sum64, 4);
1117   *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 8);
1118 }
1119 
/*
 * HIGHBD_OBMC_VAR(W, H) defines the high-bitdepth OBMC variance functions
 *   aom_highbd_obmc_variance<W>x<H>_c
 *   aom_highbd_10_obmc_variance<W>x<H>_c
 *   aom_highbd_12_obmc_variance<W>x<H>_c
 *
 * Each one gathers the error sum and sum of squares with the matching
 * highbd_*_obmc_variance() helper (the latter stored in *sse) and returns
 * *sse - sum * sum / (W * H). The 10- and 12-bit variants compute that
 * difference in 64 bits and return 0 when it is negative.
 */
#define HIGHBD_OBMC_VAR(W, H)                                                 \
  unsigned int aom_highbd_obmc_variance##W##x##H##_c(                         \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,                \
      const int32_t *mask, unsigned int *sse) {                               \
    int err_sum;                                                              \
    highbd_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &err_sum);   \
    const int64_t sum_squared = (int64_t)err_sum * err_sum;                   \
    return *sse - (unsigned int)(sum_squared / (W * H));                      \
  }                                                                           \
                                                                              \
  unsigned int aom_highbd_10_obmc_variance##W##x##H##_c(                      \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,                \
      const int32_t *mask, unsigned int *sse) {                               \
    int err_sum;                                                              \
    highbd_10_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse,           \
                            &err_sum);                                        \
    const int64_t mean_term = ((int64_t)err_sum * err_sum) / (W * H);         \
    const int64_t var64 = (int64_t)(*sse) - mean_term;                        \
    if (var64 < 0) return 0;                                                  \
    return (uint32_t)var64;                                                   \
  }                                                                           \
                                                                              \
  unsigned int aom_highbd_12_obmc_variance##W##x##H##_c(                      \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,                \
      const int32_t *mask, unsigned int *sse) {                               \
    int err_sum;                                                              \
    highbd_12_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse,           \
                            &err_sum);                                        \
    const int64_t mean_term = ((int64_t)err_sum * err_sum) / (W * H);         \
    const int64_t var64 = (int64_t)(*sse) - mean_term;                        \
    if (var64 < 0) return 0;                                                  \
    return (uint32_t)var64;                                                   \
  }
1148 
// Defines the three C OBMC sub-pixel variance functions for a WxH block:
// aom_highbd_obmc_sub_pixel_varianceWxH_c and its 10_/12_ prefixed variants.
// Each function runs the first filter pass on `pre` with the kernel selected
// by xoffset, the second pass with the kernel selected by yoffset, and then
// forwards the filtered block (stride W) to the matching
// aom_highbd*_obmc_varianceWxH_c function, returning its result.
#define HIGHBD_OBMC_SUBPIX_VAR(W, H)                                          \
  unsigned int aom_highbd_obmc_sub_pixel_variance##W##x##H##_c(               \
      const uint8_t *pre, int pre_stride, int xoffset, int yoffset,           \
      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {          \
    uint16_t pass1[(H + 1) * W]; /* output of the xoffset filter pass */      \
    uint16_t pass2[H * W];       /* output of the yoffset filter pass */      \
    const uint8_t *const filtered = CONVERT_TO_BYTEPTR(pass2);                \
                                                                              \
    aom_highbd_var_filter_block2d_bil_first_pass(                             \
        pre, pass1, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);   \
    aom_highbd_var_filter_block2d_bil_second_pass(                            \
        pass1, pass2, W, W, H, W, bilinear_filters_2t[yoffset]);              \
                                                                              \
    return aom_highbd_obmc_variance##W##x##H##_c(filtered, W, wsrc, mask,     \
                                                 sse);                        \
  }                                                                           \
                                                                              \
  unsigned int aom_highbd_10_obmc_sub_pixel_variance##W##x##H##_c(            \
      const uint8_t *pre, int pre_stride, int xoffset, int yoffset,           \
      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {          \
    uint16_t pass1[(H + 1) * W]; /* output of the xoffset filter pass */      \
    uint16_t pass2[H * W];       /* output of the yoffset filter pass */      \
    const uint8_t *const filtered = CONVERT_TO_BYTEPTR(pass2);                \
                                                                              \
    aom_highbd_var_filter_block2d_bil_first_pass(                             \
        pre, pass1, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);   \
    aom_highbd_var_filter_block2d_bil_second_pass(                            \
        pass1, pass2, W, W, H, W, bilinear_filters_2t[yoffset]);              \
                                                                              \
    return aom_highbd_10_obmc_variance##W##x##H##_c(filtered, W, wsrc, mask,  \
                                                    sse);                     \
  }                                                                           \
                                                                              \
  unsigned int aom_highbd_12_obmc_sub_pixel_variance##W##x##H##_c(            \
      const uint8_t *pre, int pre_stride, int xoffset, int yoffset,           \
      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {          \
    uint16_t pass1[(H + 1) * W]; /* output of the xoffset filter pass */      \
    uint16_t pass2[H * W];       /* output of the yoffset filter pass */      \
    const uint8_t *const filtered = CONVERT_TO_BYTEPTR(pass2);                \
                                                                              \
    aom_highbd_var_filter_block2d_bil_first_pass(                             \
        pre, pass1, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);   \
    aom_highbd_var_filter_block2d_bil_second_pass(                            \
        pass1, pass2, W, W, H, W, bilinear_filters_2t[yoffset]);              \
                                                                              \
    return aom_highbd_12_obmc_variance##W##x##H##_c(filtered, W, wsrc, mask,  \
                                                    sse);                     \
  }
1194 
// Expand HIGHBD_OBMC_VAR and HIGHBD_OBMC_SUBPIX_VAR once per WxH block size
// listed below, defining the corresponding *_c functions for that size.
HIGHBD_OBMC_VAR(4, 4)
HIGHBD_OBMC_SUBPIX_VAR(4, 4)

HIGHBD_OBMC_VAR(4, 8)
HIGHBD_OBMC_SUBPIX_VAR(4, 8)

HIGHBD_OBMC_VAR(8, 4)
HIGHBD_OBMC_SUBPIX_VAR(8, 4)

HIGHBD_OBMC_VAR(8, 8)
HIGHBD_OBMC_SUBPIX_VAR(8, 8)

HIGHBD_OBMC_VAR(8, 16)
HIGHBD_OBMC_SUBPIX_VAR(8, 16)

HIGHBD_OBMC_VAR(16, 8)
HIGHBD_OBMC_SUBPIX_VAR(16, 8)

HIGHBD_OBMC_VAR(16, 16)
HIGHBD_OBMC_SUBPIX_VAR(16, 16)

HIGHBD_OBMC_VAR(16, 32)
HIGHBD_OBMC_SUBPIX_VAR(16, 32)

HIGHBD_OBMC_VAR(32, 16)
HIGHBD_OBMC_SUBPIX_VAR(32, 16)

HIGHBD_OBMC_VAR(32, 32)
HIGHBD_OBMC_SUBPIX_VAR(32, 32)

HIGHBD_OBMC_VAR(32, 64)
HIGHBD_OBMC_SUBPIX_VAR(32, 64)

HIGHBD_OBMC_VAR(64, 32)
HIGHBD_OBMC_SUBPIX_VAR(64, 32)

HIGHBD_OBMC_VAR(64, 64)
HIGHBD_OBMC_SUBPIX_VAR(64, 64)

HIGHBD_OBMC_VAR(64, 128)
HIGHBD_OBMC_SUBPIX_VAR(64, 128)

HIGHBD_OBMC_VAR(128, 64)
HIGHBD_OBMC_SUBPIX_VAR(128, 64)

HIGHBD_OBMC_VAR(128, 128)
HIGHBD_OBMC_SUBPIX_VAR(128, 128)

// Block sizes with a 1:4 or 4:1 width-to-height ratio.
HIGHBD_OBMC_VAR(4, 16)
HIGHBD_OBMC_SUBPIX_VAR(4, 16)
HIGHBD_OBMC_VAR(16, 4)
HIGHBD_OBMC_SUBPIX_VAR(16, 4)
HIGHBD_OBMC_VAR(8, 32)
HIGHBD_OBMC_SUBPIX_VAR(8, 32)
HIGHBD_OBMC_VAR(32, 8)
HIGHBD_OBMC_SUBPIX_VAR(32, 8)
HIGHBD_OBMC_VAR(16, 64)
HIGHBD_OBMC_SUBPIX_VAR(16, 64)
HIGHBD_OBMC_VAR(64, 16)
HIGHBD_OBMC_SUBPIX_VAR(64, 16)
1255 #endif  // CONFIG_AV1_HIGHBITDEPTH
1256 #endif  // !CONFIG_REALTIME_ONLY
1257 
// Returns the sum of squared differences between a w x h block of 8-bit
// samples in `dst` (row stride `dstride`) and 16-bit samples in `src` (row
// stride `sstride`). The sum is not divided by the number of samples.
// Returns 0 when w or h is not positive.
uint64_t aom_mse_wxh_16bit_c(uint8_t *dst, int dstride, uint16_t *src,
                             int sstride, int w, int h) {
  uint64_t sum = 0;
  for (int i = 0; i < h; i++) {
    for (int j = 0; j < w; j++) {
      // Widen before squaring: the difference can be as low as -65535, and
      // its square (4294836225) does not fit in a 32-bit int.
      const int64_t e = (int64_t)dst[i * dstride + j] - src[i * sstride + j];
      sum += (uint64_t)(e * e);
    }
  }
  return sum;
}
1269 
// Accumulates aom_mse_wxh_16bit_c() over 16 / w sub-blocks of size w x h.
// For each successive sub-block, `dst` advances by w samples (keeping row
// stride `dstride`), while `src` advances by w * h samples and is read with
// a row stride of w, i.e. as consecutive packed w x h blocks.
// `w` must be non-zero, since it is used as a divisor.
uint64_t aom_mse_16xh_16bit_c(uint8_t *dst, int dstride, uint16_t *src, int w,
                              int h) {
  uint16_t *src_temp = src;
  uint8_t *dst_temp = dst;
  const int num_blks = 16 / w;
  // Unsigned accumulator, matching both the callee's result type and this
  // function's return type (avoids a signed/unsigned round trip).
  uint64_t sum = 0;
  for (int i = 0; i < num_blks; i++) {
    sum += aom_mse_wxh_16bit_c(dst_temp, dstride, src_temp, w, w, h);
    dst_temp += w;
    src_temp += (w * h);
  }
  return sum;
}
1283 
// Returns the sum of squared differences between two w x h blocks of 16-bit
// samples: `dst` (row stride `dstride`) and `src` (row stride `sstride`).
// The sum is not divided by the number of samples. Returns 0 when w or h is
// not positive.
uint64_t aom_mse_wxh_16bit_highbd_c(uint16_t *dst, int dstride, uint16_t *src,
                                    int sstride, int w, int h) {
  uint64_t sum = 0;
  for (int i = 0; i < h; i++) {
    for (int j = 0; j < w; j++) {
      // Widen before squaring: the difference spans [-65535, 65535], and the
      // square of either extreme does not fit in a 32-bit int.
      const int64_t e = (int64_t)dst[i * dstride + j] - src[i * sstride + j];
      sum += (uint64_t)(e * e);
    }
  }
  return sum;
}
1295