• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 
12 #include <math.h>
13 #include <limits.h>
14 
15 #include "config/aom_config.h"
16 
17 #include "av1/common/alloccommon.h"
18 #include "av1/common/av1_common_int.h"
19 #include "av1/common/odintrin.h"
20 #include "av1/common/quant_common.h"
21 #include "av1/common/reconinter.h"
22 #include "av1/encoder/av1_quantize.h"
23 #include "av1/encoder/encoder.h"
24 #include "av1/encoder/extend.h"
25 #include "av1/encoder/firstpass.h"
26 #include "av1/encoder/mcomp.h"
27 #include "av1/encoder/ratectrl.h"
28 #include "av1/encoder/reconinter_enc.h"
29 #include "av1/encoder/segmentation.h"
30 #include "av1/encoder/temporal_filter.h"
31 #include "aom_dsp/aom_dsp_common.h"
32 #include "aom_mem/aom_mem.h"
33 #include "aom_ports/aom_timer.h"
34 #include "aom_ports/mem.h"
35 #include "aom_ports/system_state.h"
36 #include "aom_scale/aom_scale.h"
37 
38 // NOTE: All `tf` in this file means `temporal filtering`.
39 
40 // Does motion search for blocks in temporal filtering. This is the first step
41 // for temporal filtering. More specifically, given a frame to be filtered and
42 // another frame as reference, this function searches the reference frame to
43 // find out the most alike block as that from the frame to be filtered. This
44 // found block will be further used for weighted averaging.
45 // NOTE: Besides doing motion search for the entire block, this function will
46 // also do motion search for each 1/4 sub-block to get more precise prediction.
47 // Inputs:
48 //   cpi: Pointer to the composed information of input video.
49 //   frame_to_filter: Pointer to the frame to be filtered.
50 //   ref_frame: Pointer to the reference frame.
51 //   block_size: Block size used for motion search.
52 //   mb_row: Row index of the block in the entire frame.
53 //   mb_col: Column index of the block in the entire frame.
54 //   ref_mv: Reference motion vector, which is commonly inherited from the
55 //           motion search result of previous frame.
56 //   subblock_mvs: Pointer to the result motion vectors for 4 sub-blocks.
57 //   subblock_mses: Pointer to the search errors (MSE) for 4 sub-blocks.
58 // Returns:
59 //   Search error (MSE) of the entire block.
static int tf_motion_search(AV1_COMP *cpi,
                            const YV12_BUFFER_CONFIG *frame_to_filter,
                            const YV12_BUFFER_CONFIG *ref_frame,
                            const BLOCK_SIZE block_size, const int mb_row,
                            const int mb_col, MV *ref_mv, MV *subblock_mvs,
                            int *subblock_mses) {
  // Frame information
  const int min_frame_size = AOMMIN(cpi->common.width, cpi->common.height);

  // Block information (ONLY Y-plane is used for motion search).
  const int mb_height = block_size_high[block_size];
  const int mb_width = block_size_wide[block_size];
  const int mb_pels = mb_height * mb_width;
  const int y_stride = frame_to_filter->y_stride;
  assert(y_stride == ref_frame->y_stride);
  const int y_offset = mb_row * mb_height * y_stride + mb_col * mb_width;

  // Save input state. The search below temporarily redirects the encoder's
  // source and prediction buffers, so the originals are restored on exit.
  MACROBLOCK *const mb = &cpi->td.mb;
  MACROBLOCKD *const mbd = &mb->e_mbd;
  const struct buf_2d ori_src_buf = mb->plane[0].src;
  const struct buf_2d ori_pre_buf = mbd->plane[0].pre[0];
  const MV_COST_TYPE ori_mv_cost_type = mb->mv_cost_type;

  // Parameters used for motion search.
  FULLPEL_MOTION_SEARCH_PARAMS full_ms_params;
  SUBPEL_MOTION_SEARCH_PARAMS ms_params;

  const search_site_config ss_cfg =
      cpi->mv_search_params.ss_cfg[SS_CFG_LOOKAHEAD];
  const SEARCH_METHODS full_search_method = NSTEP;
  const int step_param = av1_init_search_range(
      AOMMAX(frame_to_filter->y_crop_width, frame_to_filter->y_crop_height));
  const SUBPEL_SEARCH_TYPE subpel_search_type = USE_8_TAPS;
  const int force_integer_mv = cpi->common.features.cur_frame_force_integer_mv;
  // Select the L1 MV cost model whose scale matches the frame resolution.
  const MV_COST_TYPE mv_cost_type =
      min_frame_size >= 720
          ? MV_COST_L1_HDRES
          : (min_frame_size >= 480 ? MV_COST_L1_MIDRES : MV_COST_L1_LOWRES);

  // Starting position for motion search.
  FULLPEL_MV start_mv = get_fullmv_from_mv(ref_mv);
  // Baseline position for motion search (used for rate distortion comparison).
  const MV baseline_mv = kZeroMv;

  // Setup: point the source at the to-filter block and the prediction at the
  // co-located block in the reference frame.
  mb->plane[0].src.buf = frame_to_filter->y_buffer + y_offset;
  mb->plane[0].src.stride = y_stride;
  mbd->plane[0].pre[0].buf = ref_frame->y_buffer + y_offset;
  mbd->plane[0].pre[0].stride = y_stride;
  // Unused intermediate results for motion search.
  unsigned int sse, error;
  int distortion;
  int cost_list[5];

  // Do motion search.
  // NOTE: In `av1_full_pixel_search()` and `find_fractional_mv_step()`, the
  // searched result will be stored in `mb->best_mv`.
  int_mv best_mv;
  int block_mse = INT_MAX;
  mb->mv_cost_type = mv_cost_type;

  av1_make_default_fullpel_ms_params(&full_ms_params, cpi, mb, block_size,
                                     &baseline_mv, &ss_cfg);
  full_ms_params.run_mesh_search = 1;
  full_ms_params.search_method = full_search_method;
  av1_full_pixel_search(start_mv, &full_ms_params, step_param,
                        cond_cost_list(cpi, cost_list), &best_mv.as_fullmv,
                        NULL);

  // Since we are merely refining the result from full pixel search, we don't
  // need regularization for subpel search
  mb->mv_cost_type = MV_COST_NONE;
  if (force_integer_mv == 1) {  // Only do full search on the entire block.
    const int mv_row = best_mv.as_mv.row;
    const int mv_col = best_mv.as_mv.col;
    // Store the full-pel result in sub-pel precision.
    best_mv.as_mv.row = GET_MV_SUBPEL(mv_row);
    best_mv.as_mv.col = GET_MV_SUBPEL(mv_col);
    const int mv_offset = mv_row * y_stride + mv_col;
    // Measure the prediction error at the integer MV, then normalize by the
    // pixel count to obtain a per-pixel (MSE-style) error.
    error = cpi->fn_ptr[block_size].vf(
        ref_frame->y_buffer + y_offset + mv_offset, y_stride,
        frame_to_filter->y_buffer + y_offset, y_stride, &sse);
    block_mse = DIVIDE_AND_ROUND(error, mb_pels);
    mb->e_mbd.mi[0]->mv[0] = best_mv;
  } else {  // Do fractional search on the entire block and all sub-blocks.
    av1_make_default_subpel_ms_params(&ms_params, cpi, mb, block_size,
                                      &baseline_mv, cost_list);
    ms_params.forced_stop = EIGHTH_PEL;
    ms_params.var_params.subpel_search_type = subpel_search_type;
    MV subpel_start_mv = get_mv_from_fullmv(&best_mv.as_fullmv);
    error = cpi->mv_search_params.find_fractional_mv_step(
        &mb->e_mbd, &cpi->common, &ms_params, subpel_start_mv, &best_mv.as_mv,
        &distortion, &sse, NULL);
    block_mse = DIVIDE_AND_ROUND(error, mb_pels);
    mb->e_mbd.mi[0]->mv[0] = best_mv;
    // Feed the whole-block result back to the caller; it also becomes the
    // starting point for the sub-block searches below.
    *ref_mv = best_mv.as_mv;
    // On 4 sub-blocks.
    const BLOCK_SIZE subblock_size = ss_size_lookup[block_size][1][1];
    const int subblock_height = block_size_high[subblock_size];
    const int subblock_width = block_size_wide[subblock_size];
    const int subblock_pels = subblock_height * subblock_width;
    start_mv = get_fullmv_from_mv(ref_mv);

    // Sub-block results are written in row-major order.
    int subblock_idx = 0;
    for (int i = 0; i < mb_height; i += subblock_height) {
      for (int j = 0; j < mb_width; j += subblock_width) {
        const int offset = i * y_stride + j;
        mb->plane[0].src.buf = frame_to_filter->y_buffer + y_offset + offset;
        mbd->plane[0].pre[0].buf = ref_frame->y_buffer + y_offset + offset;
        mb->mv_cost_type = mv_cost_type;

        av1_make_default_fullpel_ms_params(
            &full_ms_params, cpi, mb, subblock_size, &baseline_mv, &ss_cfg);
        full_ms_params.run_mesh_search = 1;
        full_ms_params.search_method = full_search_method;
        av1_full_pixel_search(start_mv, &full_ms_params, step_param,
                              cond_cost_list(cpi, cost_list),
                              &best_mv.as_fullmv, NULL);

        // Since we are merely refining the result from full pixel search, we
        // don't need regularization for subpel search
        mb->mv_cost_type = MV_COST_NONE;
        av1_make_default_subpel_ms_params(&ms_params, cpi, mb, subblock_size,
                                          &baseline_mv, cost_list);
        ms_params.forced_stop = EIGHTH_PEL;
        ms_params.var_params.subpel_search_type = subpel_search_type;
        subpel_start_mv = get_mv_from_fullmv(&best_mv.as_fullmv);
        error = cpi->mv_search_params.find_fractional_mv_step(
            &mb->e_mbd, &cpi->common, &ms_params, subpel_start_mv,
            &best_mv.as_mv, &distortion, &sse, NULL);
        subblock_mses[subblock_idx] = DIVIDE_AND_ROUND(error, subblock_pels);
        subblock_mvs[subblock_idx] = best_mv.as_mv;
        ++subblock_idx;
      }
    }
  }

  // Restore input state.
  mb->plane[0].src = ori_src_buf;
  mbd->plane[0].pre[0] = ori_pre_buf;
  mb->mv_cost_type = ori_mv_cost_type;

  return block_mse;
}
204 
205 // Helper function to get weight according to thresholds.
// Maps a value onto a 3-level weight using two thresholds:
// below `low` -> 2, within [low, high) -> 1, at or above `high` -> 0.
static INLINE int get_weight_by_thresh(const int value, const int low,
                                       const int high) {
  if (value < low) return 2;
  if (value < high) return 1;
  return 0;
}
210 
211 // Gets filter weight for blocks in temporal filtering. The weights will be
212 // assigned based on the motion search errors.
213 // NOTE: Besides assigning filter weight for the block, this function will also
214 // determine whether to split the entire block into 4 sub-blocks for further
215 // filtering.
216 // TODO(any): Many magic numbers are used in this function. They may be tuned
217 // to improve the performance.
218 // Inputs:
219 //   block_mse: Motion search error (MSE) for the entire block.
220 //   subblock_mses: Pointer to the search errors (MSE) for 4 sub-blocks.
221 //   is_second_arf: Whether the to-filter frame is the second ARF. This field
222 //                  will affect the filter weight for the to-filter frame.
223 //   subblock_filter_weights: Pointer to the assigned filter weight for each
224 //                            sub-block. If not using sub-blocks, the first
225 //                            element will be used for the entire block.
226 // Returns: Whether to use 4 sub-blocks to replace the original block.
tf_get_filter_weight(const int block_mse,const int * subblock_mses,const int is_second_arf,int * subblock_filter_weights)227 static int tf_get_filter_weight(const int block_mse, const int *subblock_mses,
228                                 const int is_second_arf,
229                                 int *subblock_filter_weights) {
230   // `block_mse` is initialized as INT_MAX and will be overwritten after the
231   // motion search with reference frame, therefore INT_MAX can ONLY be accessed
232   // by to-filter frame.
233   if (block_mse == INT_MAX) {
234     const int weight = TF_ENABLE_PLANEWISE_STRATEGY
235                            ? TF_PLANEWISE_FILTER_WEIGHT_SCALE
236                            : is_second_arf ? 64 : 32;
237     subblock_filter_weights[0] = subblock_filter_weights[1] =
238         subblock_filter_weights[2] = subblock_filter_weights[3] = weight;
239     return 0;
240   }
241 
242   const int thresh_low = is_second_arf ? 20 : 40;
243   const int thresh_high = is_second_arf ? 40 : 80;
244 
245   int min_subblock_mse = INT_MAX;
246   int max_subblock_mse = INT_MIN;
247   int sum_subblock_mse = 0;
248   for (int i = 0; i < 4; ++i) {
249     sum_subblock_mse += subblock_mses[i];
250     min_subblock_mse = AOMMIN(min_subblock_mse, subblock_mses[i]);
251     max_subblock_mse = AOMMAX(max_subblock_mse, subblock_mses[i]);
252     subblock_filter_weights[i] =
253         get_weight_by_thresh(subblock_mses[i], thresh_low, thresh_high);
254   }
255 
256   if (((block_mse * 15 < sum_subblock_mse * 4) &&
257        max_subblock_mse - min_subblock_mse < 48) ||
258       ((block_mse * 14 < sum_subblock_mse * 4) &&
259        max_subblock_mse - min_subblock_mse < 24)) {  // No split.
260     const int weight = get_weight_by_thresh(block_mse, thresh_low, thresh_high);
261     subblock_filter_weights[0] = subblock_filter_weights[1] =
262         subblock_filter_weights[2] = subblock_filter_weights[3] = weight;
263     return 0;
264   } else {  // Do split.
265     return 1;
266   }
267 }
268 
269 // Helper function to determine whether a frame is encoded with high bit-depth.
is_frame_high_bitdepth(const YV12_BUFFER_CONFIG * frame)270 static INLINE int is_frame_high_bitdepth(const YV12_BUFFER_CONFIG *frame) {
271   return (frame->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
272 }
273 
274 // Builds predictor for blocks in temporal filtering. This is the second step
275 // for temporal filtering, which is to construct predictions from all reference
276 // frames INCLUDING the frame to be filtered itself. These predictors are built
277 // based on the motion search results (motion vector is set as 0 for the frame
278 // to be filtered), and will be further used for weighted averaging.
279 // Inputs:
280 //   ref_frame: Pointer to the reference frame (or the frame to be filtered).
281 //   mbd: Pointer to the block for filtering. Besides containing the subsampling
282 //        information of all planes, this field also gives the searched motion
283 //        vector for the entire block, i.e., `mbd->mi[0]->mv[0]`. This vector
284 //        should be 0 if the `ref_frame` itself is the frame to be filtered.
285 //   block_size: Size of the block.
286 //   mb_row: Row index of the block in the entire frame.
287 //   mb_col: Column index of the block in the entire frame.
288 //   num_planes: Number of planes in the frame.
289 //   scale: Scaling factor.
290 //   use_subblock: Whether to use 4 sub-blocks to replace the original block.
291 //   subblock_mvs: The motion vectors for each sub-block (row-major order).
292 //   pred: Pointer to the predictor to build.
293 // Returns:
294 //   Nothing will be returned. But the content to which `pred` points will be
295 //   modified.
static void tf_build_predictor(const YV12_BUFFER_CONFIG *ref_frame,
                               const MACROBLOCKD *mbd,
                               const BLOCK_SIZE block_size, const int mb_row,
                               const int mb_col, const int num_planes,
                               const struct scale_factors *scale,
                               const int use_subblock, const MV *subblock_mvs,
                               uint8_t *pred) {
  assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);

  // Information of the entire block.
  const int mb_height = block_size_high[block_size];  // Height.
  const int mb_width = block_size_wide[block_size];   // Width.
  const int mb_pels = mb_height * mb_width;           // Number of pixels.
  const int mb_y = mb_height * mb_row;                // Y-coord (Top-left).
  const int mb_x = mb_width * mb_col;                 // X-coord (Top-left).
  const int bit_depth = mbd->bd;                      // Bit depth.
  const int is_intrabc = 0;                           // Is intra-copied?
  const int mb_mv_row = mbd->mi[0]->mv[0].as_mv.row;  // Motion vector (y).
  const int mb_mv_col = mbd->mi[0]->mv[0].as_mv.col;  // Motion vector (x).
  const MV mb_mv = { (int16_t)mb_mv_row, (int16_t)mb_mv_col };
  const int is_high_bitdepth = is_frame_high_bitdepth(ref_frame);

  // Information of each sub-block (actually in use).
  // With `use_subblock`, the block is processed as a 2x2 grid of sub-blocks.
  const int num_blocks = use_subblock ? 2 : 1;  // Num of blocks on each side.
  const int block_height = mb_height >> (num_blocks - 1);  // Height.
  const int block_width = mb_width >> (num_blocks - 1);    // Width.

  // Default interpolation filters.
  const int_interpfilters interp_filters =
      av1_broadcast_interp_filter(MULTITAP_SHARP);

  // Handle Y-plane, U-plane and V-plane (if needed) in sequence.
  // `pred` is laid out plane after plane, `mb_pels` entries per plane,
  // regardless of chroma subsampling.
  int plane_offset = 0;
  for (int plane = 0; plane < num_planes; ++plane) {
    const int subsampling_y = mbd->plane[plane].subsampling_y;
    const int subsampling_x = mbd->plane[plane].subsampling_x;
    // Information of each sub-block in current plane.
    const int plane_h = mb_height >> subsampling_y;  // Plane height.
    const int plane_w = mb_width >> subsampling_x;   // Plane width.
    const int plane_y = mb_y >> subsampling_y;       // Y-coord (Top-left).
    const int plane_x = mb_x >> subsampling_x;       // X-coord (Top-left).
    const int h = block_height >> subsampling_y;     // Sub-block height.
    const int w = block_width >> subsampling_x;      // Sub-block width.
    const int is_y_plane = (plane == 0);             // Is Y-plane?

    // Both chroma planes share the width/height/stride entries at index 1.
    const struct buf_2d ref_buf = { NULL, ref_frame->buffers[plane],
                                    ref_frame->widths[is_y_plane ? 0 : 1],
                                    ref_frame->heights[is_y_plane ? 0 : 1],
                                    ref_frame->strides[is_y_plane ? 0 : 1] };

    // Handle entire block or sub-blocks if needed.
    int subblock_idx = 0;
    for (int i = 0; i < plane_h; i += h) {
      for (int j = 0; j < plane_w; j += w) {
        // Choose proper motion vector: the per-sub-block search result when
        // splitting, otherwise the whole-block vector from `mbd->mi[0]`.
        const MV mv = use_subblock ? subblock_mvs[subblock_idx] : mb_mv;
        assert(mv.row >= INT16_MIN && mv.row <= INT16_MAX &&
               mv.col >= INT16_MIN && mv.col <= INT16_MAX);

        const int y = plane_y + i;
        const int x = plane_x + j;

        // Build predictor for each sub-block on current plane.
        InterPredParams inter_pred_params;
        av1_init_inter_params(&inter_pred_params, w, h, y, x, subsampling_x,
                              subsampling_y, bit_depth, is_high_bitdepth,
                              is_intrabc, scale, &ref_buf, interp_filters);
        inter_pred_params.conv_params = get_conv_params(0, plane, bit_depth);
        av1_enc_build_one_inter_predictor(&pred[plane_offset + i * plane_w + j],
                                          plane_w, &mv, &inter_pred_params);

        ++subblock_idx;
      }
    }
    plane_offset += mb_pels;
  }
}
373 
374 // Computes temporal filter weights and accumulators for the frame to be
375 // filtered. More concretely, the filter weights for all pixels are the same.
376 // Inputs:
377 //   mbd: Pointer to the block for filtering, which is ONLY used to get
378 //        subsampling information of all planes as well as the bit-depth.
379 //   block_size: Size of the block.
380 //   num_planes: Number of planes in the frame.
381 //   filter_weight: Weight used for filtering.
382 //   pred: Pointer to the well-built predictors.
383 //   accum: Pointer to the pixel-wise accumulator for filtering.
384 //   count: Pointer to the pixel-wise counter for filtering.
385 // Returns:
386 //   Nothing will be returned. But the content to which `accum` and `pred`
387 //   point will be modified.
av1_apply_temporal_filter_self(const MACROBLOCKD * mbd,const BLOCK_SIZE block_size,const int num_planes,const int filter_weight,const uint8_t * pred,uint32_t * accum,uint16_t * count)388 void av1_apply_temporal_filter_self(const MACROBLOCKD *mbd,
389                                     const BLOCK_SIZE block_size,
390                                     const int num_planes,
391                                     const int filter_weight,
392                                     const uint8_t *pred, uint32_t *accum,
393                                     uint16_t *count) {
394   assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
395 
396   // Block information.
397   const int mb_height = block_size_high[block_size];
398   const int mb_width = block_size_wide[block_size];
399   const int mb_pels = mb_height * mb_width;
400   const int is_high_bitdepth = is_cur_buf_hbd(mbd);
401   const uint16_t *pred16 = CONVERT_TO_SHORTPTR(pred);
402 
403   int plane_offset = 0;
404   for (int plane = 0; plane < num_planes; ++plane) {
405     const int subsampling_y = mbd->plane[plane].subsampling_y;
406     const int subsampling_x = mbd->plane[plane].subsampling_x;
407     const int h = mb_height >> subsampling_y;  // Plane height.
408     const int w = mb_width >> subsampling_x;   // Plane width.
409 
410     int pred_idx = 0;
411     for (int i = 0; i < h; ++i) {
412       for (int j = 0; j < w; ++j) {
413         const int idx = plane_offset + pred_idx;  // Index with plane shift.
414         const int pred_value = is_high_bitdepth ? pred16[idx] : pred[idx];
415         accum[idx] += filter_weight * pred_value;
416         count[idx] += filter_weight;
417         ++pred_idx;
418       }
419     }
420     plane_offset += mb_pels;
421   }
422 }
423 
424 // Function to compute pixel-wise squared difference between two buffers.
425 // Inputs:
426 //   ref: Pointer to reference buffer.
427 //   ref_offset: Start position of reference buffer for computation.
428 //   ref_stride: Stride for reference buffer.
429 //   tgt: Pointer to target buffer.
430 //   tgt_offset: Start position of target buffer for computation.
431 //   tgt_stride: Stride for target buffer.
432 //   height: Height of block for computation.
433 //   width: Width of block for computation.
434 //   is_high_bitdepth: Whether the two buffers point to high bit-depth frames.
435 //   square_diff: Pointer to save the squared differences.
436 // Returns:
437 //   Nothing will be returned. But the content to which `square_diff` points
438 //   will be modified.
// Writes, for each pixel of a `height` x `width` block, the squared absolute
// difference between `ref` (read with `ref_stride` from `ref_offset`) and
// `tgt` (read with `tgt_stride` from `tgt_offset`) into `square_diff`, which
// is filled densely in row-major order.
static INLINE void compute_square_diff(const uint8_t *ref, const int ref_offset,
                                       const int ref_stride, const uint8_t *tgt,
                                       const int tgt_offset,
                                       const int tgt_stride, const int height,
                                       const int width,
                                       const int is_high_bitdepth,
                                       uint32_t *square_diff) {
  const uint16_t *ref16 = CONVERT_TO_SHORTPTR(ref);
  const uint16_t *tgt16 = CONVERT_TO_SHORTPTR(tgt);

  for (int row = 0; row < height; ++row) {
    const int ref_base = ref_offset + row * ref_stride;
    const int tgt_base = tgt_offset + row * tgt_stride;
    uint32_t *const out_row = square_diff + row * width;
    for (int col = 0; col < width; ++col) {
      const uint16_t ref_value =
          is_high_bitdepth ? ref16[ref_base + col] : ref[ref_base + col];
      const uint16_t tgt_value =
          is_high_bitdepth ? tgt16[tgt_base + col] : tgt[tgt_base + col];
      // Absolute difference first, so the square fits in uint32_t even for
      // 16-bit samples.
      const uint32_t abs_diff = (ref_value > tgt_value)
                                    ? (uint32_t)(ref_value - tgt_value)
                                    : (uint32_t)(tgt_value - ref_value);
      out_row[col] = abs_diff * abs_diff;
    }
  }
}
470 
471 // Function to adjust the filter weight when use YUV strategy.
472 // Inputs:
473 //   filter_weight: Original filter weight.
474 //   sum_square_diff: Sum of squared difference between input frame and
475 //                    prediction. This field is computed pixel by pixel, and
476 //                    is used as a reference for the filter weight adjustment.
477 //   num_ref_pixels: Number of pixels used to compute the `sum_square_diff`.
478 //                   This field should align with the above lookup tables
479 //                   `filter_weight_adjustment_lookup_table_yuv` and
480 //                   `highbd_filter_weight_adjustment_lookup_table_yuv`.
481 //   strength: Strength for filter weight adjustment.
482 // Returns:
483 //   Adjusted filter weight which will finally be used for filtering.
// Attenuates `filter_weight` according to the per-pixel prediction error:
// the larger the scaled, strength-shifted error estimate, the smaller the
// returned weight; estimates of 16 or more disable the weight entirely.
static INLINE int adjust_filter_weight_yuv(const int filter_weight,
                                           const uint64_t sum_square_diff,
                                           const int num_ref_pixels,
                                           const int strength) {
  // Scale the accumulated squared error, saturating before the cast to int.
  uint64_t scaled_diff = sum_square_diff * TF_YUV_FILTER_WEIGHT_SCALE;
  if (scaled_diff > INT32_MAX) scaled_diff = INT32_MAX;
  // Per-pixel estimate, rounded while right-shifting by `strength`.
  const int rounding = (1 << strength) >> 1;
  int modifier = (int)scaled_diff / num_ref_pixels;
  modifier = (modifier + rounding) >> strength;
  return (modifier >= 16) ? 0 : (16 - modifier) * filter_weight;
}
495 
496 // Applies temporal filter with YUV strategy.
497 // Inputs:
498 //   frame_to_filter: Pointer to the frame to be filtered, which is used as
499 //                    reference to compute squared difference from the predictor.
500 //   mbd: Pointer to the block for filtering, which is ONLY used to get
501 //        subsampling information of all YUV planes.
502 //   block_size: Size of the block.
503 //   mb_row: Row index of the block in the entire frame.
504 //   mb_col: Column index of the block in the entire frame.
505 //   num_planes: Number of planes in the frame.
506 //   strength: Strength for filter weight adjustment.
507 //   use_subblock: Whether to use 4 sub-blocks to replace the original block.
508 //   subblock_filter_weights: The filter weights for each sub-block (row-major
509 //                            order). If `use_subblock` is set as 0, the first
510 //                            weight will be applied to the entire block.
511 //   pred: Pointer to the well-built predictors.
512 //   accum: Pointer to the pixel-wise accumulator for filtering.
513 //   count: Pointer to the pixel-wise counter for filtering.
514 // Returns:
515 //   Nothing will be returned. But the content to which `accum` and `pred`
516 //   point will be modified.
void av1_apply_temporal_filter_yuv_c(
    const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
    const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
    const int num_planes, const int strength, const int use_subblock,
    const int *subblock_filter_weights, const uint8_t *pred, uint32_t *accum,
    uint16_t *count) {
  assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);

  // Block information.
  const int mb_height = block_size_high[block_size];
  const int mb_width = block_size_wide[block_size];
  const int mb_pels = mb_height * mb_width;
  const int is_high_bitdepth = is_frame_high_bitdepth(frame_to_filter);
  const uint16_t *pred16 = CONVERT_TO_SHORTPTR(pred);

  // Allocate memory for pixel-wise squared differences for all planes. They,
  // regardless of the subsampling, are assigned with memory of size `mb_pels`.
  // NOTE(review): the allocation result is not checked; if `aom_memalign`
  // returns NULL, the memset below dereferences it (CERT MEM32-C) — consider
  // adding an error path.
  uint32_t *square_diff =
      aom_memalign(16, num_planes * mb_pels * sizeof(uint32_t));
  memset(square_diff, 0, num_planes * mb_pels * sizeof(square_diff[0]));

  // First pass: compute per-pixel squared differences between the original
  // frame and the predictor for every plane.
  int plane_offset = 0;
  for (int plane = 0; plane < num_planes; ++plane) {
    // Locate pixel on reference frame.
    const int plane_h = mb_height >> mbd->plane[plane].subsampling_y;
    const int plane_w = mb_width >> mbd->plane[plane].subsampling_x;
    const int frame_stride = frame_to_filter->strides[plane == 0 ? 0 : 1];
    const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w;
    const uint8_t *ref = frame_to_filter->buffers[plane];
    compute_square_diff(ref, frame_offset, frame_stride, pred, plane_offset,
                        plane_w, plane_h, plane_w, is_high_bitdepth,
                        square_diff + plane_offset);
    plane_offset += mb_pels;
  }

  // Get window size for pixel-wise filtering.
  assert(TF_YUV_FILTER_WINDOW_LENGTH % 2 == 1);
  const int half_window = TF_YUV_FILTER_WINDOW_LENGTH >> 1;

  // Second pass: derive a per-pixel filter weight from the squared
  // differences and accumulate the weighted predictor. Handle planes in
  // sequence.
  plane_offset = 0;
  for (int plane = 0; plane < num_planes; ++plane) {
    const int subsampling_y = mbd->plane[plane].subsampling_y;
    const int subsampling_x = mbd->plane[plane].subsampling_x;
    const int h = mb_height >> subsampling_y;  // Plane height.
    const int w = mb_width >> subsampling_x;   // Plane width.

    // Perform filtering.
    int pred_idx = 0;
    for (int i = 0; i < h; ++i) {
      for (int j = 0; j < w; ++j) {
        // non-local mean approach: accumulate squared differences over the
        // surrounding window, clamped to the plane boundary.
        uint64_t sum_square_diff = 0;
        int num_ref_pixels = 0;

        for (int wi = -half_window; wi <= half_window; ++wi) {
          for (int wj = -half_window; wj <= half_window; ++wj) {
            const int y = i + wi;  // Y-coord on the current plane.
            const int x = j + wj;  // X-coord on the current plane.
            if (y >= 0 && y < h && x >= 0 && x < w) {
              sum_square_diff += square_diff[plane_offset + y * w + x];
              ++num_ref_pixels;
            }
          }
        }

        if (plane == 0) {  // Filter Y-plane using both U-plane and V-plane.
          for (int p = 1; p < num_planes; ++p) {
            const int ss_y_shift = mbd->plane[p].subsampling_y - subsampling_y;
            const int ss_x_shift = mbd->plane[p].subsampling_x - subsampling_x;
            const int yy = i >> ss_y_shift;  // Y-coord on UV-plane.
            const int xx = j >> ss_x_shift;  // X-coord on UV-plane.
            const int ww = w >> ss_x_shift;  // Width of UV-plane.
            sum_square_diff += square_diff[p * mb_pels + yy * ww + xx];
            ++num_ref_pixels;
          }
        } else {  // Filter U-plane and V-plane using Y-plane.
          const int ss_y_shift = subsampling_y - mbd->plane[0].subsampling_y;
          const int ss_x_shift = subsampling_x - mbd->plane[0].subsampling_x;
          // Every Y-plane pixel co-located with this chroma pixel contributes.
          for (int ii = 0; ii < (1 << ss_y_shift); ++ii) {
            for (int jj = 0; jj < (1 << ss_x_shift); ++jj) {
              const int yy = (i << ss_y_shift) + ii;  // Y-coord on Y-plane.
              const int xx = (j << ss_x_shift) + jj;  // X-coord on Y-plane.
              const int ww = w << ss_x_shift;         // Width of Y-plane.
              sum_square_diff += square_diff[yy * ww + xx];
              ++num_ref_pixels;
            }
          }
        }

        // Base filter weight estimated by motion search error. Sub-block
        // index is row-major: (top/bottom half) * 2 + (left/right half).
        const int subblock_idx =
            use_subblock ? (i >= h / 2) * 2 + (j >= w / 2) : 0;
        const int filter_weight = subblock_filter_weights[subblock_idx];

        const int idx = plane_offset + pred_idx;  // Index with plane shift.
        const int pred_value = is_high_bitdepth ? pred16[idx] : pred[idx];
        const int adjusted_weight = adjust_filter_weight_yuv(
            filter_weight, sum_square_diff, num_ref_pixels, strength);
        accum[idx] += adjusted_weight * pred_value;
        count[idx] += adjusted_weight;

        ++pred_idx;
      }
    }
    plane_offset += mb_pels;
  }

  aom_free(square_diff);
}
627 
628 // Applies temporal filter with plane-wise strategy.
629 // The strategy of filter weight adjustment is different from the function
630 // `av1_apply_temporal_filter_yuv_c()`.
631 // Inputs:
632 //   frame_to_filter: Pointer to the frame to be filtered, which is used as
//                    reference to compute squared difference from the predictor.
634 //   mbd: Pointer to the block for filtering, which is ONLY used to get
635 //        subsampling information of all planes.
636 //   block_size: Size of the block.
637 //   mb_row: Row index of the block in the entire frame.
638 //   mb_col: Column index of the block in the entire frame.
639 //   num_planes: Number of planes in the frame.
640 //   noise_levels: Pointer to the noise levels of the to-filter frame, estimated
641 //                 with each plane (in Y, U, V order).
642 //   use_subblock: Whether to use 4 sub-blocks to replace the original block.
643 //   block_mse: Motion search error (MSE) for the entire block.
644 //   subblock_mses: Pointer to the search errors (MSE) for 4 sub-blocks.
645 //   q_factor: Quantization factor. This is actually the `q` defined in libaom,
646 //             which is converted from `qindex`.
647 //   pred: Pointer to the well-built predictors.
648 //   accum: Pointer to the pixel-wise accumulator for filtering.
//   count: Pointer to the pixel-wise counter for filtering.
650 // Returns:
651 //   Nothing will be returned. But the content to which `accum` and `pred`
652 //   point will be modified.
void av1_apply_temporal_filter_planewise_c(
    const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
    const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
    const int num_planes, const double *noise_levels, const int use_subblock,
    const int block_mse, const int *subblock_mses, const int q_factor,
    const uint8_t *pred, uint32_t *accum, uint16_t *count) {
  assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);

  // Block information.
  const int mb_height = block_size_high[block_size];
  const int mb_width = block_size_wide[block_size];
  const int mb_pels = mb_height * mb_width;
  const int is_high_bitdepth = is_frame_high_bitdepth(frame_to_filter);
  const uint16_t *pred16 = CONVERT_TO_SHORTPTR(pred);

  // Allocate memory for pixel-wise squared differences for all planes. They,
  // regardless of the subsampling, are assigned with memory of size `mb_pels`.
  // NOTE(review): the allocation result is not checked; a failing
  // `aom_memalign` would make the following `memset` dereference NULL —
  // TODO: confirm whether OOM handling is required here.
  uint32_t *square_diff =
      aom_memalign(16, num_planes * mb_pels * sizeof(uint32_t));
  memset(square_diff, 0, num_planes * mb_pels * sizeof(square_diff[0]));

  // Pre-compute the squared difference between the to-filter frame and the
  // predictor for every pixel of every plane.
  int plane_offset = 0;
  for (int plane = 0; plane < num_planes; ++plane) {
    // Locate pixel on reference frame.
    const int plane_h = mb_height >> mbd->plane[plane].subsampling_y;
    const int plane_w = mb_width >> mbd->plane[plane].subsampling_x;
    const int frame_stride = frame_to_filter->strides[plane == 0 ? 0 : 1];
    const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w;
    const uint8_t *ref = frame_to_filter->buffers[plane];
    compute_square_diff(ref, frame_offset, frame_stride, pred, plane_offset,
                        plane_w, plane_h, plane_w, is_high_bitdepth,
                        square_diff + plane_offset);
    plane_offset += mb_pels;
  }

  // Get window size for pixel-wise filtering.
  assert(TF_PLANEWISE_FILTER_WINDOW_LENGTH % 2 == 1);
  const int half_window = TF_PLANEWISE_FILTER_WINDOW_LENGTH >> 1;

  // Hyper-parameter for filter weight adjustment.
  // NOTE(review): `heights[0]` is the plane-0 height; shifting by the plane-0
  // subsampling (normally 0 for luma) recovers the luma frame height.
  const int frame_height = frame_to_filter->heights[0]
                           << mbd->plane[0].subsampling_y;
  const int decay_control = frame_height >= 720 ? 4 : 3;

  // Handle planes in sequence.
  plane_offset = 0;
  for (int plane = 0; plane < num_planes; ++plane) {
    const int subsampling_y = mbd->plane[plane].subsampling_y;
    const int subsampling_x = mbd->plane[plane].subsampling_x;
    const int h = mb_height >> subsampling_y;  // Plane height.
    const int w = mb_width >> subsampling_x;   // Plane width.

    // Perform filtering.
    int pred_idx = 0;
    for (int i = 0; i < h; ++i) {
      for (int j = 0; j < w; ++j) {
        // Non-local mean approach: accumulate squared differences over a
        // small window around the current pixel, clamped at plane borders.
        uint64_t sum_square_diff = 0;
        int num_ref_pixels = 0;

        for (int wi = -half_window; wi <= half_window; ++wi) {
          for (int wj = -half_window; wj <= half_window; ++wj) {
            const int y = CLIP(i + wi, 0, h - 1);  // Y-coord on current plane.
            const int x = CLIP(j + wj, 0, w - 1);  // X-coord on current plane.
            sum_square_diff += square_diff[plane_offset + y * w + x];
            ++num_ref_pixels;
          }
        }

        // Filter U-plane and V-plane using Y-plane. This is because motion
        // search is only done on Y-plane, so the information from Y-plane will
        // be more accurate.
        if (plane != 0) {
          const int ss_y_shift = subsampling_y - mbd->plane[0].subsampling_y;
          const int ss_x_shift = subsampling_x - mbd->plane[0].subsampling_x;
          for (int ii = 0; ii < (1 << ss_y_shift); ++ii) {
            for (int jj = 0; jj < (1 << ss_x_shift); ++jj) {
              const int yy = (i << ss_y_shift) + ii;  // Y-coord on Y-plane.
              const int xx = (j << ss_x_shift) + jj;  // X-coord on Y-plane.
              const int ww = w << ss_x_shift;         // Width of Y-plane.
              sum_square_diff += square_diff[yy * ww + xx];
              ++num_ref_pixels;
            }
          }
        }

        // Scale down the difference for high bit depth input.
        // NOTE(review): the shift amount is (bd - 8)^2, which matches the
        // expected 2 * (bd - 8) only for 10-bit input — confirm whether this
        // is intended for 12-bit input.
        if (mbd->bd > 8) sum_square_diff >>= (mbd->bd - 8) * (mbd->bd - 8);
        const double window_error = (double)(sum_square_diff) / num_ref_pixels;
        const int subblock_idx = (i >= h / 2) * 2 + (j >= w / 2);
        const double block_error =
            (double)(use_subblock ? subblock_mses[subblock_idx] : block_mse);

        // Control factor for non-local mean approach.
        const double r =
            (double)decay_control * (0.7 + log(noise_levels[plane] + 1.0));
        const double q = AOMMIN((double)(q_factor * q_factor) / 256.0, 1);

        // Compute filter weight: exp() of the clamped, negatively scaled
        // error, so larger errors produce smaller weights.
        const double scaled_diff =
            AOMMAX(-(window_error + block_error / 10) / (2 * r * r * q), -15.0);
        const int adjusted_weight =
            (int)(exp(scaled_diff) * TF_PLANEWISE_FILTER_WEIGHT_SCALE);

        const int idx = plane_offset + pred_idx;  // Index with plane shift.
        const int pred_value = is_high_bitdepth ? pred16[idx] : pred[idx];
        accum[idx] += adjusted_weight * pred_value;
        count[idx] += adjusted_weight;

        ++pred_idx;
      }
    }
    plane_offset += mb_pels;
  }

  aom_free(square_diff);
}
770 
771 // Computes temporal filter weights and accumulators from all reference frames
772 // excluding the current frame to be filtered.
773 // Inputs:
774 //   frame_to_filter: Pointer to the frame to be filtered, which is used as
//                    reference to compute squared difference from the predictor.
776 //   mbd: Pointer to the block for filtering, which is ONLY used to get
777 //        subsampling information of all planes and the bit-depth.
778 //   block_size: Size of the block.
779 //   mb_row: Row index of the block in the entire frame.
780 //   mb_col: Column index of the block in the entire frame.
781 //   num_planes: Number of planes in the frame.
782 //   strength: Strength for filter weight adjustment. (Used in YUV strategy)
783 //   use_subblock: Whether to use 4 sub-blocks to replace the original block.
784 //                 (Used in YUV strategy)
785 //   subblock_filter_weights: The filter weights for each sub-block (row-major
786 //                            order). If `use_subblock` is set as 0, the first
787 //                            weight will be applied to the entire block. (Used
788 //                            in YUV strategy)
789 //   noise_levels: Pointer to the noise levels of the to-filter frame, estimated
790 //                 with each plane (in Y, U, V order). (Used in plane-wise
791 //                 strategy)
792 //   block_mse: Motion search error (MSE) for the entire block.
793 //   subblock_mses: Pointer to the search errors (MSE) for 4 sub-blocks.
794 //   q_factor: Quantization factor.
795 //   pred: Pointer to the well-built predictors.
796 //   accum: Pointer to the pixel-wise accumulator for filtering.
//   count: Pointer to the pixel-wise counter for filtering.
798 // Returns:
799 //   Nothing will be returned. But the content to which `accum` and `pred`
800 //   point will be modified.
av1_apply_temporal_filter_others(const YV12_BUFFER_CONFIG * frame_to_filter,const MACROBLOCKD * mbd,const BLOCK_SIZE block_size,const int mb_row,const int mb_col,const int num_planes,const int strength,const int use_subblock,const int * subblock_filter_weights,const double * noise_levels,const int block_mse,const int * subblock_mses,const int q_factor,const uint8_t * pred,uint32_t * accum,uint16_t * count)801 void av1_apply_temporal_filter_others(
802     const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
803     const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
804     const int num_planes, const int strength, const int use_subblock,
805     const int *subblock_filter_weights, const double *noise_levels,
806     const int block_mse, const int *subblock_mses, const int q_factor,
807     const uint8_t *pred, uint32_t *accum, uint16_t *count) {
808   assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
809 
810   if (TF_ENABLE_PLANEWISE_STRATEGY) {
811     // TODO(any): avx2 and sse2 version should be changed to align with C
812     // function before using.
813     if (is_frame_high_bitdepth(frame_to_filter) || block_size != BLOCK_32X32) {
814       av1_apply_temporal_filter_planewise_c(
815           frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes,
816           noise_levels, use_subblock, block_mse, subblock_mses, q_factor, pred,
817           accum, count);
818     } else {
819       av1_apply_temporal_filter_planewise(
820           frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes,
821           noise_levels, use_subblock, block_mse, subblock_mses, q_factor, pred,
822           accum, count);
823     }
824   } else {  // Commonly used for low-resolution video.
825     if (subblock_filter_weights[0] == 0 && subblock_filter_weights[1] == 0 &&
826         subblock_filter_weights[2] == 0 && subblock_filter_weights[3] == 0) {
827       return;
828     }
829     const int adj_strength = strength + 2 * (mbd->bd - 8);
830     if (num_planes == 3 && TF_YUV_FILTER_WEIGHT_SCALE == 3 &&
831         block_size != BLOCK_32X32) {
832       av1_apply_temporal_filter_yuv(frame_to_filter, mbd, block_size, mb_row,
833                                     mb_col, num_planes, adj_strength,
834                                     use_subblock, subblock_filter_weights, pred,
835                                     accum, count);
836     } else {
837       // TODO(any): sse4 version should be changed to align with C function
838       // before using.
839       av1_apply_temporal_filter_yuv_c(frame_to_filter, mbd, block_size, mb_row,
840                                       mb_col, num_planes, adj_strength,
841                                       use_subblock, subblock_filter_weights,
842                                       pred, accum, count);
843     }
844   }
845 }
846 
847 // Normalizes the accumulated filtering result to produce the filtered frame.
848 // Inputs:
849 //   mbd: Pointer to the block for filtering, which is ONLY used to get
850 //        subsampling information of all planes.
851 //   block_size: Size of the block.
852 //   mb_row: Row index of the block in the entire frame.
853 //   mb_col: Column index of the block in the entire frame.
854 //   num_planes: Number of planes in the frame.
855 //   accum: Pointer to the pre-computed accumulator.
856 //   count: Pointer to the pre-computed count.
857 //   result_buffer: Pointer to result buffer.
858 // Returns:
859 //   Nothing will be returned. But the content to which `result_buffer` point
860 //   will be modified.
// Normalizes the accumulated filtering result (accum / count, rounded) and
// writes the filtered pixels of one block back into `result_buffer`.
static void tf_normalize_filtered_frame(
    const MACROBLOCKD *mbd, const BLOCK_SIZE block_size, const int mb_row,
    const int mb_col, const int num_planes, const uint32_t *accum,
    const uint16_t *count, YV12_BUFFER_CONFIG *result_buffer) {
  assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);

  // Block information.
  const int mb_height = block_size_high[block_size];
  const int mb_width = block_size_wide[block_size];
  const int mb_pels = mb_height * mb_width;
  const int high_bd = is_frame_high_bitdepth(result_buffer);

  for (int plane = 0; plane < num_planes; ++plane) {
    const int h = mb_height >> mbd->plane[plane].subsampling_y;
    const int w = mb_width >> mbd->plane[plane].subsampling_x;
    const int stride = result_buffer->strides[plane == 0 ? 0 : 1];
    uint8_t *const dst = result_buffer->buffers[plane];
    uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst);

    for (int row = 0; row < h; ++row) {
      // Destination index of the first pixel of this row in the full frame.
      const int dst_row_start = (mb_row * h + row) * stride + mb_col * w;
      for (int col = 0; col < w; ++col) {
        // Index into the block-local accumulators (plane-major layout).
        const int idx = plane * mb_pels + row * w + col;
        const uint16_t rounding = count[idx] >> 1;  // Round to nearest.
        const int dst_idx = dst_row_start + col;
        if (high_bd) {
          dst16[dst_idx] =
              (uint16_t)OD_DIVU(accum[idx] + rounding, count[idx]);
        } else {
          dst[dst_idx] = (uint8_t)OD_DIVU(accum[idx] + rounding, count[idx]);
        }
      }
    }
  }
}
902 
903 // Helper function to compute number of blocks on either side of the frame.
get_num_blocks(const int frame_length,const int mb_length)904 static INLINE int get_num_blocks(const int frame_length, const int mb_length) {
905   return (frame_length + mb_length - 1) / mb_length;
906 }
907 
// Accumulated difference statistics between the filtered frame and the
// original to-filter frame, summed over all filtered blocks.
typedef struct {
  int64_t sum;  // Sum of per-block SSE values.
  int64_t sse;  // Sum of squared per-block SSE values.
} FRAME_DIFF;
912 
913 // Does temporal filter for a particular frame.
914 // Inputs:
915 //   cpi: Pointer to the composed information of input video.
916 //   frames: Frame buffers used for temporal filtering.
917 //   num_frames: Number of frames in the frame buffer.
918 //   filter_frame_idx: Index of the frame to be filtered.
//   is_key_frame: Whether the to-filter frame is a key frame.
920 //   is_second_arf: Whether the to-filter frame is the second ARF. This field
921 //                  is ONLY used for assigning filter weight.
922 //   block_size: Block size used for temporal filtering.
923 //   scale: Scaling factor.
924 //   strength: Pre-estimated strength for filter weight adjustment.
925 //   noise_levels: Pointer to the noise levels of the to-filter frame, estimated
926 //                 with each plane (in Y, U, V order).
927 // Returns:
928 //   Difference between filtered frame and the original frame.
tf_do_filtering(AV1_COMP * cpi,YV12_BUFFER_CONFIG ** frames,const int num_frames,const int filter_frame_idx,const int is_key_frame,const int is_second_arf,const BLOCK_SIZE block_size,const struct scale_factors * scale,const int strength,const double * noise_levels)929 static FRAME_DIFF tf_do_filtering(
930     AV1_COMP *cpi, YV12_BUFFER_CONFIG **frames, const int num_frames,
931     const int filter_frame_idx, const int is_key_frame, const int is_second_arf,
932     const BLOCK_SIZE block_size, const struct scale_factors *scale,
933     const int strength, const double *noise_levels) {
934   // Basic information.
935   const YV12_BUFFER_CONFIG *const frame_to_filter = frames[filter_frame_idx];
936   const int frame_height = frame_to_filter->y_crop_height;
937   const int frame_width = frame_to_filter->y_crop_width;
938   const int mb_height = block_size_high[block_size];
939   const int mb_width = block_size_wide[block_size];
940   const int mb_pels = mb_height * mb_width;
941   const int mb_rows = get_num_blocks(frame_height, mb_height);
942   const int mb_cols = get_num_blocks(frame_width, mb_width);
943   const int num_planes = av1_num_planes(&cpi->common);
944   const int mi_h = mi_size_high_log2[block_size];
945   const int mi_w = mi_size_wide_log2[block_size];
946   assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
947   const int is_high_bitdepth = is_frame_high_bitdepth(frame_to_filter);
948 
949   // Save input state.
950   MACROBLOCK *const mb = &cpi->td.mb;
951   MACROBLOCKD *const mbd = &mb->e_mbd;
952   uint8_t *input_buffer[MAX_MB_PLANE];
953   for (int i = 0; i < num_planes; i++) {
954     input_buffer[i] = mbd->plane[i].pre[0].buf;
955   }
956   MB_MODE_INFO **input_mb_mode_info = mbd->mi;
957 
958   // Setup.
959   mbd->block_ref_scale_factors[0] = scale;
960   mbd->block_ref_scale_factors[1] = scale;
961   // A temporary block info used to store state in temporal filtering process.
962   MB_MODE_INFO *tmp_mb_mode_info = (MB_MODE_INFO *)malloc(sizeof(MB_MODE_INFO));
963   memset(tmp_mb_mode_info, 0, sizeof(MB_MODE_INFO));
964   mbd->mi = &tmp_mb_mode_info;
965   mbd->mi[0]->motion_mode = SIMPLE_TRANSLATION;
966   // Allocate memory for predictor, accumulator and count.
967   uint8_t *pred8 = aom_memalign(32, num_planes * mb_pels * sizeof(uint8_t));
968   uint16_t *pred16 = aom_memalign(32, num_planes * mb_pels * sizeof(uint16_t));
969   uint32_t *accum = aom_memalign(16, num_planes * mb_pels * sizeof(uint32_t));
970   uint16_t *count = aom_memalign(16, num_planes * mb_pels * sizeof(uint16_t));
971   memset(pred8, 0, num_planes * mb_pels * sizeof(pred8[0]));
972   memset(pred16, 0, num_planes * mb_pels * sizeof(pred16[0]));
973   uint8_t *const pred = is_high_bitdepth ? CONVERT_TO_BYTEPTR(pred16) : pred8;
974 
975   // Do filtering.
976   FRAME_DIFF diff = { 0, 0 };
977   // Perform temporal filtering block by block.
978   for (int mb_row = 0; mb_row < mb_rows; mb_row++) {
979     av1_set_mv_row_limits(&cpi->common.mi_params, &mb->mv_limits,
980                           (mb_row << mi_h), (mb_height >> MI_SIZE_LOG2),
981                           cpi->oxcf.border_in_pixels);
982     for (int mb_col = 0; mb_col < mb_cols; mb_col++) {
983       av1_set_mv_col_limits(&cpi->common.mi_params, &mb->mv_limits,
984                             (mb_col << mi_w), (mb_width >> MI_SIZE_LOG2),
985                             cpi->oxcf.border_in_pixels);
986       memset(accum, 0, num_planes * mb_pels * sizeof(accum[0]));
987       memset(count, 0, num_planes * mb_pels * sizeof(count[0]));
988       MV ref_mv = kZeroMv;  // Reference motion vector passed down along frames.
989       // Perform temporal filtering frame by frame.
990       for (int frame = 0; frame < num_frames; frame++) {
991         if (frames[frame] == NULL) continue;
992 
993         // Motion search.
994         MV subblock_mvs[4] = { kZeroMv, kZeroMv, kZeroMv, kZeroMv };
995         int subblock_filter_weights[4] = { 0, 0, 0, 0 };
996         int block_mse = INT_MAX;
997         int subblock_mses[4] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX };
998 
999         if (frame == filter_frame_idx) {  // Frame to be filtered.
1000           // Set motion vector as 0 for the frame to be filtered.
1001           mbd->mi[0]->mv[0].as_mv = kZeroMv;
1002           // Change ref_mv sign for following frames.
1003           ref_mv.row *= -1;
1004           ref_mv.col *= -1;
1005         } else {  // Other reference frames.
1006           block_mse = tf_motion_search(cpi, frame_to_filter, frames[frame],
1007                                        block_size, mb_row, mb_col, &ref_mv,
1008                                        subblock_mvs, subblock_mses);
1009           // Do not pass down the reference motion vector if error is too large.
1010           const int thresh = AOMMIN(frame_height, frame_width) >= 720 ? 12 : 3;
1011           if (block_mse > (thresh << (mbd->bd - 8))) {
1012             ref_mv = kZeroMv;
1013           }
1014         }
1015 
1016         // Build predictor.
1017         int use_subblock = tf_get_filter_weight(
1018             block_mse, subblock_mses, is_second_arf, subblock_filter_weights);
1019         tf_build_predictor(frames[frame], mbd, block_size, mb_row, mb_col,
1020                            num_planes, scale, use_subblock, subblock_mvs, pred);
1021 
1022         // Perform weighted averaging.
1023         if (frame == filter_frame_idx) {  // Frame to be filtered.
1024           av1_apply_temporal_filter_self(mbd, block_size, num_planes,
1025                                          subblock_filter_weights[0], pred,
1026                                          accum, count);
1027         } else {  // Other reference frames.
1028           const FRAME_TYPE frame_type =
1029               (cpi->common.current_frame.frame_number > 1) ? INTER_FRAME
1030                                                            : KEY_FRAME;
1031           const int q_factor =
1032               (int)av1_convert_qindex_to_q(cpi->rc.avg_frame_qindex[frame_type],
1033                                            cpi->common.seq_params.bit_depth);
1034           av1_apply_temporal_filter_others(
1035               frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes,
1036               strength, use_subblock, subblock_filter_weights, noise_levels,
1037               block_mse, subblock_mses, q_factor, pred, accum, count);
1038         }
1039       }
1040 
1041       tf_normalize_filtered_frame(mbd, block_size, mb_row, mb_col, num_planes,
1042                                   accum, count, &cpi->alt_ref_buffer);
1043 
1044       if (!is_key_frame && cpi->sf.hl_sf.adaptive_overlay_encoding) {
1045         const int y_height = mb_height >> mbd->plane[0].subsampling_y;
1046         const int y_width = mb_width >> mbd->plane[0].subsampling_x;
1047         const int source_y_stride = frame_to_filter->y_stride;
1048         const int filter_y_stride = cpi->alt_ref_buffer.y_stride;
1049         const int source_offset =
1050             mb_row * y_height * source_y_stride + mb_col * y_width;
1051         const int filter_offset =
1052             mb_row * y_height * filter_y_stride + mb_col * y_width;
1053         unsigned int sse = 0;
1054         cpi->fn_ptr[block_size].vf(frame_to_filter->y_buffer + source_offset,
1055                                    source_y_stride,
1056                                    cpi->alt_ref_buffer.y_buffer + filter_offset,
1057                                    filter_y_stride, &sse);
1058         diff.sum += sse;
1059         diff.sse += sse * sse;
1060       }
1061     }
1062   }
1063 
1064   // Restore input state
1065   for (int i = 0; i < num_planes; i++) {
1066     mbd->plane[i].pre[0].buf = input_buffer[i];
1067   }
1068   mbd->mi = input_mb_mode_info;
1069 
1070   free(tmp_mb_mode_info);
1071   aom_free(pred8);
1072   aom_free(pred16);
1073   aom_free(accum);
1074   aom_free(count);
1075 
1076   return diff;
1077 }
1078 
// A constant number, sqrt(pi / 2), used as the scale factor in the noise
// estimation below (see av1_estimate_noise_from_single_plane()).
static const double SQRT_PI_BY_2 = 1.25331413732;
1081 
av1_estimate_noise_from_single_plane(const YV12_BUFFER_CONFIG * frame,const int plane,const int bit_depth)1082 double av1_estimate_noise_from_single_plane(const YV12_BUFFER_CONFIG *frame,
1083                                             const int plane,
1084                                             const int bit_depth) {
1085   const int is_y_plane = (plane == 0);
1086   const int height = frame->crop_heights[is_y_plane ? 0 : 1];
1087   const int width = frame->crop_widths[is_y_plane ? 0 : 1];
1088   const int stride = frame->strides[is_y_plane ? 0 : 1];
1089   const uint8_t *src = frame->buffers[plane];
1090   const uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
1091   const int is_high_bitdepth = is_frame_high_bitdepth(frame);
1092 
1093   int64_t accum = 0;
1094   int count = 0;
1095   for (int i = 1; i < height - 1; ++i) {
1096     for (int j = 1; j < width - 1; ++j) {
1097       // Setup a small 3x3 matrix.
1098       const int center_idx = i * stride + j;
1099       int mat[3][3];
1100       for (int ii = -1; ii <= 1; ++ii) {
1101         for (int jj = -1; jj <= 1; ++jj) {
1102           const int idx = center_idx + ii * stride + jj;
1103           mat[ii + 1][jj + 1] = is_high_bitdepth ? src16[idx] : src[idx];
1104         }
1105       }
1106       // Compute sobel gradients.
1107       const int Gx = (mat[0][0] - mat[0][2]) + (mat[2][0] - mat[2][2]) +
1108                      2 * (mat[1][0] - mat[1][2]);
1109       const int Gy = (mat[0][0] - mat[2][0]) + (mat[0][2] - mat[2][2]) +
1110                      2 * (mat[0][1] - mat[2][1]);
1111       const int Ga = ROUND_POWER_OF_TWO(abs(Gx) + abs(Gy), bit_depth - 8);
1112       // Accumulate Laplacian.
1113       if (Ga < NOISE_ESTIMATION_EDGE_THRESHOLD) {  // Only count smooth pixels.
1114         const int v = 4 * mat[1][1] -
1115                       2 * (mat[0][1] + mat[2][1] + mat[1][0] + mat[1][2]) +
1116                       (mat[0][0] + mat[0][2] + mat[2][0] + mat[2][2]);
1117         accum += ROUND_POWER_OF_TWO(abs(v), bit_depth - 8);
1118         ++count;
1119       }
1120     }
1121   }
1122 
1123   // Return -1.0 (unreliable estimation) if there are too few smooth pixels.
1124   return (count < 16) ? -1.0 : (double)accum / (6 * count) * SQRT_PI_BY_2;
1125 }
1126 
1127 // Estimates the strength for filter weight adjustment, which is used in YUV
1128 // strategy. This estimation is based on the pre-estimated noise level of the
1129 // to-filter frame.
1130 // Inputs:
1131 //   cpi: Pointer to the composed information of input video.
1132 //   noise_level: Noise level of the to-filter frame, estimated with Y-plane.
1133 //   group_boost: Boost level for the current group of frames.
1134 // Returns:
1135 //   Estimated strength which will be used for filter weight adjustment.
tf_estimate_strength(const AV1_COMP * cpi,const double noise_level,const int group_boost)1136 static int tf_estimate_strength(const AV1_COMP *cpi, const double noise_level,
1137                                 const int group_boost) {
1138   int strength = cpi->oxcf.arnr_strength;
1139 
1140   // Adjust the strength based on the estimated noise level.
1141   if (noise_level > 0) {       // Adjust when the noise level is reliable.
1142     if (noise_level < 0.75) {  // Noise level lies in range (0, 0.75).
1143       strength = strength - 2;
1144     } else if (noise_level < 1.75) {  // Noise level lies in range [0.75, 1.75).
1145       strength = strength - 1;
1146     } else if (noise_level < 4.0) {  // Noise level lies in range [1.75, 4.0).
1147       strength = strength + 0;
1148     } else {  // Noise level lies in range [4.0, +inf).
1149       strength = strength + 1;
1150     }
1151   }
1152 
1153   // Adjust the strength based on active max q.
1154   const FRAME_TYPE frame_type =
1155       (cpi->common.current_frame.frame_number > 1) ? INTER_FRAME : KEY_FRAME;
1156   const int q = (int)av1_convert_qindex_to_q(
1157       cpi->rc.avg_frame_qindex[frame_type], cpi->common.seq_params.bit_depth);
1158   strength = strength - AOMMAX(0, (16 - q) / 2);
1159 
1160   return CLIP(strength, 0, group_boost / 300);
1161 }
1162 
// Sets up the frame buffer for temporal filtering. Basically, this function
1164 // determines how many frames will be used for temporal filtering and then
1165 // groups them into a buffer.
1166 // Inputs:
1167 //   cpi: Pointer to the composed information of input video.
1168 //   filter_frame_lookahead_idx: The index of the to-filter frame in the
1169 //                               lookahead buffer `cpi->lookahead`.
1170 //   is_second_arf: Whether the to-filter frame is the second ARF. This field
1171 //                  will affect the number of frames used for filtering.
1172 //   frames: Pointer to the frame buffer to setup.
1173 //   num_frames_for_filtering: Number of frames used for filtering.
1174 //   filter_frame_idx: Index of the to-filter frame in the setup frame buffer.
1175 // Returns:
1176 //   Nothing will be returned. But the frame buffer `frames`, number of frames
1177 //   in the buffer `num_frames_for_filtering`, and the index of the to-filter
1178 //   frame in the buffer `filter_frame_idx` will be updated in this function.
static void tf_setup_filtering_buffer(const AV1_COMP *cpi,
                                      const int filter_frame_lookahead_idx,
                                      const int is_second_arf,
                                      YV12_BUFFER_CONFIG **frames,
                                      int *num_frames_for_filtering,
                                      int *filter_frame_idx) {
  int num_frames = 0;          // Number of frames used for filtering.
  int num_frames_before = -1;  // Number of frames before the to-filter frame.
  int filter_frame_offset;

  if (filter_frame_lookahead_idx == -1) {  // Key frame.
    // Only use frames at and after the key frame (no frames before it).
    num_frames = TF_NUM_FILTERING_FRAMES_FOR_KEY_FRAME;
    num_frames_before = 0;
    filter_frame_offset = filter_frame_lookahead_idx;
  } else if (filter_frame_lookahead_idx < -1) {  // Key frame in one-pass mode.
    // Only use frames before the to-filter frame; the positive lookahead
    // offset is encoded here as a negative index, hence the negation below.
    num_frames = TF_NUM_FILTERING_FRAMES_FOR_KEY_FRAME;
    num_frames_before = num_frames - 1;
    filter_frame_offset = -filter_frame_lookahead_idx;
  } else {
    num_frames = cpi->oxcf.arnr_max_frames;
    if (is_second_arf) {  // Only use 2 neighbours for the second ARF.
      num_frames = AOMMIN(num_frames, 3);
    }
    // Cap the frame count by the group boost; the `+= !(num_frames & 1)`
    // keeps the count odd so the to-filter frame can sit in the middle.
    if (num_frames > cpi->rc.gfu_boost / 150) {
      num_frames = cpi->rc.gfu_boost / 150;
      num_frames += !(num_frames & 1);
    }
    // Clamp the frames taken before/after the to-filter frame by what the
    // lookahead buffer actually holds.
    num_frames_before = AOMMIN(num_frames >> 1, filter_frame_lookahead_idx + 1);
    const int lookahead_depth =
        av1_lookahead_depth(cpi->lookahead, cpi->compressor_stage);
    const int num_frames_after =
        AOMMIN((num_frames - 1) >> 1,
               lookahead_depth - filter_frame_lookahead_idx - 1);
    num_frames = num_frames_before + 1 + num_frames_after;
    filter_frame_offset = filter_frame_lookahead_idx;
  }
  *num_frames_for_filtering = num_frames;
  *filter_frame_idx = num_frames_before;

  // Setup the frame buffer. Entries whose lookahead peek fails are left NULL
  // (callers skip NULL entries).
  for (int frame = 0; frame < num_frames; ++frame) {
    const int lookahead_idx = frame - num_frames_before + filter_frame_offset;
    struct lookahead_entry *buf = av1_lookahead_peek(
        cpi->lookahead, lookahead_idx, cpi->compressor_stage);
    frames[frame] = (buf == NULL) ? NULL : &buf->img;
  }
}
1226 
// Performs temporal filtering, if applicable, on the frame identified by
// `filter_frame_lookahead_idx`, and decides whether the resulting ARF can be
// shown directly as an existing frame.
// Inputs:
//   cpi: Pointer to the composed information of input video.
//   filter_frame_lookahead_idx: Index of the to-filter frame in the lookahead
//                               buffer `cpi->lookahead` (negative values
//                               indicate a key frame).
//   show_existing_arf: Output flag (only written when non-NULL, adaptive
//                      overlay encoding is enabled, and this is not the
//                      second ARF); set to 1 when the filtered frame differs
//                      little from the source so the ARF itself can be shown.
// Returns:
//   1 if the filtered result should be used, 0 otherwise (internal ARFs
//   other than the second ARF, or a second ARF whose filtered result drifted
//   too far from the source).
int av1_temporal_filter(AV1_COMP *cpi, const int filter_frame_lookahead_idx,
                        int *show_existing_arf) {
  // Basic information of the current frame.
  const GF_GROUP *const gf_group = &cpi->gf_group;
  const uint8_t group_idx = gf_group->index;
  const FRAME_UPDATE_TYPE update_type = gf_group->update_type[group_idx];
  // Filter one more ARF if the lookahead index is >= 7 (w.r.t. 9-th frame)
  // and the speed feature enables it. This frame is ALWAYS a show existing
  // frame.
  const int is_second_arf = (update_type == INTNL_ARF_UPDATE) &&
                            (filter_frame_lookahead_idx >= 7) &&
                            cpi->sf.hl_sf.second_alt_ref_filtering;
  // TODO(anyone): Currently, we enforce the filtering strength on internal
  // ARFs except the second ARF to be zero. We should investigate in which case
  // it is more beneficial to use non-zero strength filtering.
  if (update_type == INTNL_ARF_UPDATE && !is_second_arf) {
    return 0;
  }

  // TODO(yunqing): For INTNL_ARF_UPDATE type, the following me initialization
  // is used somewhere unexpectedly. Should be resolved later.
  // Initialize errorperbit, sadperbit16 and sadperbit4.
  const int rdmult = av1_compute_rd_mult_based_on_qindex(cpi, TF_QINDEX);
  set_error_per_bit(&cpi->td.mb, rdmult);
  av1_initialize_me_consts(cpi, &cpi->td.mb, TF_QINDEX);
  av1_fill_mv_costs(cpi->common.fc,
                    cpi->common.features.cur_frame_force_integer_mv,
                    cpi->common.features.allow_high_precision_mv, &cpi->td.mb);

  // Setup frame buffer for filtering.
  YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS] = { NULL };
  int num_frames_for_filtering = 0;
  int filter_frame_idx = -1;
  tf_setup_filtering_buffer(cpi, filter_frame_lookahead_idx, is_second_arf,
                            frames, &num_frames_for_filtering,
                            &filter_frame_idx);

  // Estimate the noise level of each plane; the filtering strength is then
  // derived from the luma (plane 0) noise and the group boost.
  const int bit_depth = cpi->common.seq_params.bit_depth;
  const int num_planes = av1_num_planes(&cpi->common);
  double noise_levels[MAX_MB_PLANE] = { 0 };
  for (int plane = 0; plane < num_planes; ++plane) {
    noise_levels[plane] = av1_estimate_noise_from_single_plane(
        frames[filter_frame_idx], plane, bit_depth);
  }
  const int strength =
      tf_estimate_strength(cpi, noise_levels[0], cpi->rc.gfu_boost);
  if (filter_frame_lookahead_idx >= 0) {
    // Non-key frames: mark the frame showable when filtering is effectively
    // a no-op, this is the second ARF, or no overlay frame will be coded.
    cpi->common.showable_frame =
        (strength == 0 && num_frames_for_filtering == 1) || is_second_arf ||
        (cpi->oxcf.enable_overlay == 0 || cpi->sf.hl_sf.disable_overlay_frames);
  }

  // Do filtering.
  const int is_key_frame = (filter_frame_lookahead_idx < 0);
  FRAME_DIFF diff = { 0, 0 };  // Sum / sse of per-block filter differences.
  if (num_frames_for_filtering > 0 && frames[0] != NULL) {
    // Setup scaling factors. Scaling on each of the arnr frames is not
    // supported.
    // ARF is produced at the native frame size and resized when coded.
    struct scale_factors sf;
    av1_setup_scale_factors_for_frame(
        &sf, frames[0]->y_crop_width, frames[0]->y_crop_height,
        frames[0]->y_crop_width, frames[0]->y_crop_height);
    diff = tf_do_filtering(cpi, frames, num_frames_for_filtering,
                           filter_frame_idx, is_key_frame, is_second_arf,
                           TF_BLOCK_SIZE, &sf, strength, noise_levels);
  }

  if (is_key_frame) {  // Key frame should always be filtered.
    return 1;
  }

  if ((show_existing_arf != NULL && cpi->sf.hl_sf.adaptive_overlay_encoding) ||
      is_second_arf) {
    // Compute the mean and standard deviation of the per-block filter
    // difference to judge how far the filtered frame drifted from the source.
    const int frame_height = frames[filter_frame_idx]->y_crop_height;
    const int frame_width = frames[filter_frame_idx]->y_crop_width;
    const int block_height = block_size_high[TF_BLOCK_SIZE];
    const int block_width = block_size_wide[TF_BLOCK_SIZE];
    const int mb_rows = get_num_blocks(frame_height, block_height);
    const int mb_cols = get_num_blocks(frame_width, block_width);
    const int num_mbs = AOMMAX(1, mb_rows * mb_cols);
    const float mean = (float)diff.sum / num_mbs;
    const float std = (float)sqrt((float)diff.sse / num_mbs - mean * mean);

    aom_clear_system_state();
    // TODO(yunqing): This can be combined with TPL q calculation later.
    cpi->rc.base_frame_target = gf_group->bit_allocation[group_idx];
    av1_set_target_rate(cpi, cpi->common.width, cpi->common.height);
    int top_index = 0;
    int bottom_index = 0;
    const int q = av1_rc_pick_q_and_bounds(cpi, &cpi->rc, cpi->oxcf.width,
                                           cpi->oxcf.height, group_idx,
                                           &bottom_index, &top_index);
    const int ac_q = av1_ac_quant_QTX(q, 0, bit_depth);
    // Threshold scales with the square of the AC quantizer step at the
    // predicted q, so the decision tracks the expected coding quality.
    const float threshold = 0.7f * ac_q * ac_q;

    if (!is_second_arf) {
      // Show the existing ARF directly when the filter difference is small
      // and spatially consistent (low std relative to the mean).
      *show_existing_arf = 0;
      if (mean < threshold && std < mean * 1.2) {
        *show_existing_arf = 1;
      }
      cpi->common.showable_frame |= *show_existing_arf;
    } else {
      // Use source frame if the filtered frame becomes very different.
      if (!(mean < threshold && std < mean * 1.2)) {
        return 0;
      }
    }
  }

  return 1;
}
1339