1 /*
2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11
12 #include <math.h>
13 #include <limits.h>
14
15 #include "config/aom_config.h"
16 #include "config/aom_scale_rtcd.h"
17
18 #include "aom_dsp/aom_dsp_common.h"
19 #include "aom_dsp/odintrin.h"
20 #include "aom_mem/aom_mem.h"
21 #include "aom_ports/aom_timer.h"
22 #include "aom_ports/mem.h"
23 #include "aom_scale/aom_scale.h"
24 #include "av1/common/alloccommon.h"
25 #include "av1/common/av1_common_int.h"
26 #include "av1/common/quant_common.h"
27 #include "av1/common/reconinter.h"
28 #include "av1/encoder/av1_quantize.h"
29 #include "av1/encoder/encodeframe.h"
30 #include "av1/encoder/encoder.h"
31 #include "av1/encoder/ethread.h"
32 #include "av1/encoder/extend.h"
33 #include "av1/encoder/firstpass.h"
34 #include "av1/encoder/gop_structure.h"
35 #include "av1/encoder/mcomp.h"
36 #include "av1/encoder/motion_search_facade.h"
37 #include "av1/encoder/pass2_strategy.h"
38 #include "av1/encoder/ratectrl.h"
39 #include "av1/encoder/reconinter_enc.h"
40 #include "av1/encoder/segmentation.h"
41 #include "av1/encoder/temporal_filter.h"
42
43 /*!\cond */
44
45 // NOTE: All `tf` in this file means `temporal filtering`.
46
47 // Forward Declaration.
48 static void tf_determine_block_partition(const MV block_mv, const int block_mse,
49 MV *subblock_mvs, int *subblock_mses);
50
51 /*!\endcond */
52 /*!\brief Does motion search for blocks in temporal filtering. This is
53 * the first step for temporal filtering. More specifically, given a frame to
54 * be filtered and another frame as reference, this function searches the
55 * reference frame to find out the most similar block as that from the frame
56 * to be filtered. This found block will be further used for weighted
57 * averaging.
58 *
59 * NOTE: Besides doing motion search for the entire block, this function will
60 * also do motion search for each 1/4 sub-block to get more precise
 *       predictions. Then, this function will determine whether to use 4
62 * sub-blocks to replace the entire block. If we do need to split the
63 * entire block, 4 elements in `subblock_mvs` and `subblock_mses` refer to
64 * the searched motion vector and search error (MSE) w.r.t. each sub-block
65 * respectively. Otherwise, the 4 elements will be the same, all of which
66 * are assigned as the searched motion vector and search error (MSE) for
67 * the entire block.
68 *
69 * \ingroup src_frame_proc
70 * \param[in] cpi Top level encoder instance structure
71 * \param[in] mb Pointer to macroblock
72 * \param[in] frame_to_filter Pointer to the frame to be filtered
73 * \param[in] ref_frame Pointer to the reference frame
74 * \param[in] block_size Block size used for motion search
75 * \param[in] mb_row Row index of the block in the frame
76 * \param[in] mb_col Column index of the block in the frame
77 * \param[in] ref_mv Reference motion vector, which is commonly
78 * inherited from the motion search result of
79 * previous frame.
80 * \param[out] subblock_mvs Pointer to the motion vectors for 4 sub-blocks
81 * \param[out] subblock_mses Pointer to the search errors (MSE) for 4
82 * sub-blocks
83 *
84 * \remark Nothing will be returned. Results are saved in subblock_mvs and
85 * subblock_mses
86 */
static void tf_motion_search(AV1_COMP *cpi, MACROBLOCK *mb,
                             const YV12_BUFFER_CONFIG *frame_to_filter,
                             const YV12_BUFFER_CONFIG *ref_frame,
                             const BLOCK_SIZE block_size, const int mb_row,
                             const int mb_col, MV *ref_mv, MV *subblock_mvs,
                             int *subblock_mses) {
  // Frame information
  const int min_frame_size = AOMMIN(cpi->common.width, cpi->common.height);

  // Block information (ONLY Y-plane is used for motion search).
  const int mb_height = block_size_high[block_size];
  const int mb_width = block_size_wide[block_size];
  const int mb_pels = mb_height * mb_width;
  const int y_stride = frame_to_filter->y_stride;
  // Both frames are indexed with the same stride/offset below, so their luma
  // strides must match.
  assert(y_stride == ref_frame->y_stride);
  // Offset of the block's top-left luma sample inside either frame.
  const int y_offset = mb_row * mb_height * y_stride + mb_col * mb_width;

  // Save input state.
  // The source and prediction buffers of plane 0 are repointed during the
  // search and restored before the partition decision.
  MACROBLOCKD *const mbd = &mb->e_mbd;
  const struct buf_2d ori_src_buf = mb->plane[0].src;
  const struct buf_2d ori_pre_buf = mbd->plane[0].pre[0];

  // Parameters used for motion search.
  FULLPEL_MOTION_SEARCH_PARAMS full_ms_params;
  SUBPEL_MOTION_SEARCH_PARAMS ms_params;
  const int step_param = av1_init_search_range(
      AOMMAX(frame_to_filter->y_crop_width, frame_to_filter->y_crop_height));
  const SUBPEL_SEARCH_TYPE subpel_search_type = USE_8_TAPS;
  const int force_integer_mv = cpi->common.features.cur_frame_force_integer_mv;
  // The MV cost type is chosen from the smaller frame dimension:
  // >= 720 -> HDRES, >= 480 -> MIDRES, otherwise LOWRES.
  const MV_COST_TYPE mv_cost_type =
      min_frame_size >= 720
          ? MV_COST_L1_HDRES
          : (min_frame_size >= 480 ? MV_COST_L1_MIDRES : MV_COST_L1_LOWRES);

  // Starting position for motion search.
  FULLPEL_MV start_mv = get_fullmv_from_mv(ref_mv);
  // Baseline position for motion search (used for rate distortion comparison).
  const MV baseline_mv = kZeroMv;

  // Setup.
  mb->plane[0].src.buf = frame_to_filter->y_buffer + y_offset;
  mb->plane[0].src.stride = y_stride;
  mbd->plane[0].pre[0].buf = ref_frame->y_buffer + y_offset;
  mbd->plane[0].pre[0].stride = y_stride;

  const SEARCH_METHODS search_method = NSTEP;
  const search_site_config *search_site_cfg =
      av1_get_search_site_config(cpi, mb, search_method);

  // Intermediate results for motion search. `error` is converted into an MSE
  // below; `sse` and `distortion` are only written by the search routines and
  // never read in this function.
  unsigned int sse, error;
  int distortion;
  int cost_list[5];

  // Do motion search.
  int_mv best_mv;  // Searched motion vector.
  int block_mse = INT_MAX;
  MV block_mv = kZeroMv;
  const int q = av1_get_q(cpi);

  // Full-pixel search on the entire block, starting from `ref_mv`.
  av1_make_default_fullpel_ms_params(&full_ms_params, cpi, mb, block_size,
                                     &baseline_mv, search_site_cfg,
                                     /*fine_search_interval=*/0);
  av1_set_mv_search_method(&full_ms_params, search_site_cfg, search_method);
  full_ms_params.run_mesh_search = 1;
  full_ms_params.mv_cost_params.mv_cost_type = mv_cost_type;

  if (cpi->sf.mv_sf.prune_mesh_search == PRUNE_MESH_SEARCH_LVL_1) {
    // Enable prune_mesh_search based on q for PRUNE_MESH_SEARCH_LVL_1.
    full_ms_params.prune_mesh_search = (q <= 20) ? 0 : 1;
    full_ms_params.mesh_search_mv_diff_threshold = 2;
  }

  av1_full_pixel_search(start_mv, &full_ms_params, step_param,
                        cond_cost_list(cpi, cost_list), &best_mv.as_fullmv,
                        NULL);

  if (force_integer_mv == 1) {  // Only do full search on the entire block.
    // NOTE: `subblock_mvs` and `subblock_mses` are NOT written on this path,
    // and `*ref_mv` is not updated with the search result. The partition
    // decision below therefore reads whatever the caller stored in
    // `subblock_mses` beforehand.
    const int mv_row = best_mv.as_mv.row;
    const int mv_col = best_mv.as_mv.col;
    // NOTE(review): GET_MV_SUBPEL presumably converts full-pel units to
    // sub-pel units -- confirm against its definition.
    best_mv.as_mv.row = GET_MV_SUBPEL(mv_row);
    best_mv.as_mv.col = GET_MV_SUBPEL(mv_col);
    // Displacement (in luma samples) of the matched block in the reference.
    const int mv_offset = mv_row * y_stride + mv_col;
    error = cpi->ppi->fn_ptr[block_size].vf(
        ref_frame->y_buffer + y_offset + mv_offset, y_stride,
        frame_to_filter->y_buffer + y_offset, y_stride, &sse);
    block_mse = DIVIDE_AND_ROUND(error, mb_pels);
    block_mv = best_mv.as_mv;
  } else {  // Do fractional search on the entire block and all sub-blocks.
    av1_make_default_subpel_ms_params(&ms_params, cpi, mb, block_size,
                                      &baseline_mv, cost_list);
    ms_params.forced_stop = EIGHTH_PEL;
    ms_params.var_params.subpel_search_type = subpel_search_type;
    // Since we are merely refining the result from full pixel search, we don't
    // need regularization for subpel search
    ms_params.mv_cost_params.mv_cost_type = MV_COST_NONE;

    MV subpel_start_mv = get_mv_from_fullmv(&best_mv.as_fullmv);
    error = cpi->mv_search_params.find_fractional_mv_step(
        &mb->e_mbd, &cpi->common, &ms_params, subpel_start_mv, &best_mv.as_mv,
        &distortion, &sse, NULL);
    block_mse = DIVIDE_AND_ROUND(error, mb_pels);
    block_mv = best_mv.as_mv;
    // Hand the whole-block result back to the caller; it may be reset to zero
    // at the end of this function if the error is too large.
    *ref_mv = best_mv.as_mv;
    // On 4 sub-blocks.
    const BLOCK_SIZE subblock_size = av1_ss_size_lookup[block_size][1][1];
    const int subblock_height = block_size_high[subblock_size];
    const int subblock_width = block_size_wide[subblock_size];
    const int subblock_pels = subblock_height * subblock_width;
    // Every sub-block search starts from the refined whole-block vector.
    start_mv = get_fullmv_from_mv(ref_mv);

    // Sub-blocks are visited row by row, so results are stored in row-major
    // order.
    int subblock_idx = 0;
    for (int i = 0; i < mb_height; i += subblock_height) {
      for (int j = 0; j < mb_width; j += subblock_width) {
        // Repoint source/prediction buffers at the current sub-block.
        const int offset = i * y_stride + j;
        mb->plane[0].src.buf = frame_to_filter->y_buffer + y_offset + offset;
        mbd->plane[0].pre[0].buf = ref_frame->y_buffer + y_offset + offset;
        av1_make_default_fullpel_ms_params(&full_ms_params, cpi, mb,
                                           subblock_size, &baseline_mv,
                                           search_site_cfg,
                                           /*fine_search_interval=*/0);
        av1_set_mv_search_method(&full_ms_params, search_site_cfg,
                                 search_method);
        full_ms_params.run_mesh_search = 1;
        full_ms_params.mv_cost_params.mv_cost_type = mv_cost_type;

        if (cpi->sf.mv_sf.prune_mesh_search == PRUNE_MESH_SEARCH_LVL_1) {
          // Enable prune_mesh_search based on q for PRUNE_MESH_SEARCH_LVL_1.
          full_ms_params.prune_mesh_search = (q <= 20) ? 0 : 1;
          full_ms_params.mesh_search_mv_diff_threshold = 2;
        }

        av1_full_pixel_search(start_mv, &full_ms_params, step_param,
                              cond_cost_list(cpi, cost_list),
                              &best_mv.as_fullmv, NULL);

        av1_make_default_subpel_ms_params(&ms_params, cpi, mb, subblock_size,
                                          &baseline_mv, cost_list);
        ms_params.forced_stop = EIGHTH_PEL;
        ms_params.var_params.subpel_search_type = subpel_search_type;
        // Since we are merely refining the result from full pixel search, we
        // don't need regularization for subpel search
        ms_params.mv_cost_params.mv_cost_type = MV_COST_NONE;

        subpel_start_mv = get_mv_from_fullmv(&best_mv.as_fullmv);
        error = cpi->mv_search_params.find_fractional_mv_step(
            &mb->e_mbd, &cpi->common, &ms_params, subpel_start_mv,
            &best_mv.as_mv, &distortion, &sse, NULL);
        subblock_mses[subblock_idx] = DIVIDE_AND_ROUND(error, subblock_pels);
        subblock_mvs[subblock_idx] = best_mv.as_mv;
        ++subblock_idx;
      }
    }
  }

  // Restore input state.
  mb->plane[0].src = ori_src_buf;
  mbd->plane[0].pre[0] = ori_pre_buf;

  // Make partition decision.
  // This may overwrite all four sub-block entries with the whole-block result.
  tf_determine_block_partition(block_mv, block_mse, subblock_mvs,
                               subblock_mses);

  // Do not pass down the reference motion vector if error is too large.
  // The threshold (12 for frames whose smaller dimension is >= 720, else 3)
  // is scaled up by the bit depth above 8.
  const int thresh = (min_frame_size >= 720) ? 12 : 3;
  if (block_mse > (thresh << (mbd->bd - 8))) {
    *ref_mv = kZeroMv;
  }
}
256 /*!\cond */
257
258 // Determines whether to split the entire block to 4 sub-blocks for filtering.
259 // In particular, this decision is made based on the comparison between the
260 // motion search error of the entire block and the errors of all sub-blocks.
261 // Inputs:
262 // block_mv: Motion vector for the entire block (ONLY as reference).
263 // block_mse: Motion search error (MSE) for the entire block (ONLY as
264 // reference).
265 // subblock_mvs: Pointer to the motion vectors for 4 sub-blocks (will be
266 // modified based on the partition decision).
267 // subblock_mses: Pointer to the search errors (MSE) for 4 sub-blocks (will
268 // be modified based on the partition decision).
269 // Returns:
270 // Nothing will be returned. Results are saved in `subblock_mvs` and
271 // `subblock_mses`.
static void tf_determine_block_partition(const MV block_mv, const int block_mse,
                                         MV *subblock_mvs, int *subblock_mses) {
  // Gather the sum and the extremes of the 4 sub-block errors.
  int min_subblock_mse = INT_MAX;
  int max_subblock_mse = INT_MIN;
  int64_t sum_subblock_mse = 0;
  for (int i = 0; i < 4; ++i) {
    sum_subblock_mse += subblock_mses[i];
    min_subblock_mse = AOMMIN(min_subblock_mse, subblock_mses[i]);
    max_subblock_mse = AOMMAX(max_subblock_mse, subblock_mses[i]);
  }

  // All comparisons are carried out in 64 bits: `block_mse * 15` and
  // `max - min` evaluated as `int` could overflow (undefined behavior) for
  // large inputs, e.g. when entries still hold an INT_MAX sentinel.
  const int64_t block_mse_wide = block_mse;
  const int64_t scaled_sum = sum_subblock_mse * 4;
  const int64_t mse_spread = (int64_t)max_subblock_mse - min_subblock_mse;

  // TODO(any): The following magic numbers may be tuned to improve the
  // performance OR find a way to get rid of these magic numbers.
  // The whole block is kept when its error is sufficiently below the average
  // sub-block error AND the sub-block errors are close to each other.
  const int keep_whole_block =
      ((block_mse_wide * 15 < scaled_sum) && mse_spread < 48) ||
      ((block_mse_wide * 14 < scaled_sum) && mse_spread < 24);

  if (keep_whole_block) {  // No split.
    for (int i = 0; i < 4; ++i) {
      subblock_mvs[i] = block_mv;
      subblock_mses[i] = block_mse;
    }
  }
  // Otherwise the per-sub-block results are left untouched (split).
}
295
296 // Helper function to determine whether a frame is encoded with high bit-depth.
is_frame_high_bitdepth(const YV12_BUFFER_CONFIG * frame)297 static INLINE int is_frame_high_bitdepth(const YV12_BUFFER_CONFIG *frame) {
298 return (frame->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
299 }
300
301 /*!\endcond */
302 /*!\brief Builds predictor for blocks in temporal filtering. This is the
303 * second step for temporal filtering, which is to construct predictions from
304 * all reference frames INCLUDING the frame to be filtered itself. These
305 * predictors are built based on the motion search results (motion vector is
 * set as 0 for the frame to be filtered), and will be further used for
307 * weighted averaging.
308 *
309 * \ingroup src_frame_proc
310 * \param[in] ref_frame Pointer to the reference frame (or the frame
311 * to be filtered)
312 * \param[in] mbd Pointer to the block for filtering. Besides
313 * containing the subsampling information of all
314 * planes, this field also gives the searched
315 * motion vector for the entire block, i.e.,
316 * `mbd->mi[0]->mv[0]`. This vector should be 0
317 * if the `ref_frame` itself is the frame to be
318 * filtered.
319 * \param[in] block_size Size of the block
320 * \param[in] mb_row Row index of the block in the frame
321 * \param[in] mb_col Column index of the block in the frame
322 * \param[in] num_planes Number of planes in the frame
323 * \param[in] scale Scaling factor
324 * \param[in] subblock_mvs The motion vectors for each sub-block (row-major
325 * order)
326 * \param[out] pred Pointer to the predictor to be built
327 *
328 * \remark Nothing returned, But the contents of `pred` will be modified
329 */
tf_build_predictor(const YV12_BUFFER_CONFIG * ref_frame,const MACROBLOCKD * mbd,const BLOCK_SIZE block_size,const int mb_row,const int mb_col,const int num_planes,const struct scale_factors * scale,const MV * subblock_mvs,uint8_t * pred)330 static void tf_build_predictor(const YV12_BUFFER_CONFIG *ref_frame,
331 const MACROBLOCKD *mbd,
332 const BLOCK_SIZE block_size, const int mb_row,
333 const int mb_col, const int num_planes,
334 const struct scale_factors *scale,
335 const MV *subblock_mvs, uint8_t *pred) {
336 // Information of the entire block.
337 const int mb_height = block_size_high[block_size]; // Height.
338 const int mb_width = block_size_wide[block_size]; // Width.
339 const int mb_y = mb_height * mb_row; // Y-coord (Top-left).
340 const int mb_x = mb_width * mb_col; // X-coord (Top-left).
341 const int bit_depth = mbd->bd; // Bit depth.
342 const int is_intrabc = 0; // Is intra-copied?
343 const int is_high_bitdepth = is_frame_high_bitdepth(ref_frame);
344
345 // Default interpolation filters.
346 const int_interpfilters interp_filters =
347 av1_broadcast_interp_filter(MULTITAP_SHARP2);
348
349 // Handle Y-plane, U-plane and V-plane (if needed) in sequence.
350 int plane_offset = 0;
351 for (int plane = 0; plane < num_planes; ++plane) {
352 const int subsampling_y = mbd->plane[plane].subsampling_y;
353 const int subsampling_x = mbd->plane[plane].subsampling_x;
354 // Information of each sub-block in current plane.
355 const int plane_h = mb_height >> subsampling_y; // Plane height.
356 const int plane_w = mb_width >> subsampling_x; // Plane width.
357 const int plane_y = mb_y >> subsampling_y; // Y-coord (Top-left).
358 const int plane_x = mb_x >> subsampling_x; // X-coord (Top-left).
359 const int h = plane_h >> 1; // Sub-block height.
360 const int w = plane_w >> 1; // Sub-block width.
361 const int is_y_plane = (plane == 0); // Is Y-plane?
362
363 const struct buf_2d ref_buf = { NULL, ref_frame->buffers[plane],
364 ref_frame->widths[is_y_plane ? 0 : 1],
365 ref_frame->heights[is_y_plane ? 0 : 1],
366 ref_frame->strides[is_y_plane ? 0 : 1] };
367
368 // Handle each subblock.
369 int subblock_idx = 0;
370 for (int i = 0; i < plane_h; i += h) {
371 for (int j = 0; j < plane_w; j += w) {
372 // Choose proper motion vector.
373 const MV mv = subblock_mvs[subblock_idx++];
374 assert(mv.row >= INT16_MIN && mv.row <= INT16_MAX &&
375 mv.col >= INT16_MIN && mv.col <= INT16_MAX);
376
377 const int y = plane_y + i;
378 const int x = plane_x + j;
379
380 // Build predictior for each sub-block on current plane.
381 InterPredParams inter_pred_params;
382 av1_init_inter_params(&inter_pred_params, w, h, y, x, subsampling_x,
383 subsampling_y, bit_depth, is_high_bitdepth,
384 is_intrabc, scale, &ref_buf, interp_filters);
385 inter_pred_params.conv_params = get_conv_params(0, plane, bit_depth);
386 av1_enc_build_one_inter_predictor(&pred[plane_offset + i * plane_w + j],
387 plane_w, &mv, &inter_pred_params);
388 }
389 }
390 plane_offset += plane_h * plane_w;
391 }
392 }
393 /*!\cond */
394
// Computes temporal filter weights and accumulators for the frame to be
// filtered. More concretely, the filter weights for all pixels are the same.
// Inputs:
//   ref_frame: Pointer to the frame whose pixels are accumulated (i.e., the
//              frame to be filtered).
//   mbd: Pointer to the block for filtering, which is ONLY used to get
//        subsampling information of all planes as well as the bit-depth.
//   block_size: Size of the block.
//   mb_row: Row index of the block in the frame.
//   mb_col: Column index of the block in the frame.
//   num_planes: Number of planes in the frame.
//   accum: Pointer to the pixel-wise accumulator for filtering.
//   count: Pointer to the pixel-wise counter for filtering.
// Returns:
//   Nothing will be returned. But the content to which `accum` and `count`
//   point will be modified.
tf_apply_temporal_filter_self(const YV12_BUFFER_CONFIG * ref_frame,const MACROBLOCKD * mbd,const BLOCK_SIZE block_size,const int mb_row,const int mb_col,const int num_planes,uint32_t * accum,uint16_t * count)408 void tf_apply_temporal_filter_self(const YV12_BUFFER_CONFIG *ref_frame,
409 const MACROBLOCKD *mbd,
410 const BLOCK_SIZE block_size,
411 const int mb_row, const int mb_col,
412 const int num_planes, uint32_t *accum,
413 uint16_t *count) {
414 // Block information.
415 const int mb_height = block_size_high[block_size];
416 const int mb_width = block_size_wide[block_size];
417 const int is_high_bitdepth = is_cur_buf_hbd(mbd);
418
419 int plane_offset = 0;
420 for (int plane = 0; plane < num_planes; ++plane) {
421 const int subsampling_y = mbd->plane[plane].subsampling_y;
422 const int subsampling_x = mbd->plane[plane].subsampling_x;
423 const int h = mb_height >> subsampling_y; // Plane height.
424 const int w = mb_width >> subsampling_x; // Plane width.
425
426 const int frame_stride = ref_frame->strides[plane == AOM_PLANE_Y ? 0 : 1];
427 const uint8_t *buf8 = ref_frame->buffers[plane];
428 const uint16_t *buf16 = CONVERT_TO_SHORTPTR(buf8);
429 const int frame_offset = mb_row * h * frame_stride + mb_col * w;
430
431 int pred_idx = 0;
432 int pixel_idx = 0;
433 for (int i = 0; i < h; ++i) {
434 for (int j = 0; j < w; ++j) {
435 const int idx = plane_offset + pred_idx; // Index with plane shift.
436 const int pred_value = is_high_bitdepth
437 ? buf16[frame_offset + pixel_idx]
438 : buf8[frame_offset + pixel_idx];
439 accum[idx] += TF_WEIGHT_SCALE * pred_value;
440 count[idx] += TF_WEIGHT_SCALE;
441 ++pred_idx;
442 ++pixel_idx;
443 }
444 pixel_idx += (frame_stride - w);
445 }
446 plane_offset += h * w;
447 }
448 }
449
450 // Function to compute pixel-wise squared difference between two buffers.
451 // Inputs:
452 // ref: Pointer to reference buffer.
453 // ref_offset: Start position of reference buffer for computation.
454 // ref_stride: Stride for reference buffer.
455 // tgt: Pointer to target buffer.
456 // tgt_offset: Start position of target buffer for computation.
457 // tgt_stride: Stride for target buffer.
458 // height: Height of block for computation.
459 // width: Width of block for computation.
460 // is_high_bitdepth: Whether the two buffers point to high bit-depth frames.
//   square_diff: Pointer to save the squared differences.
462 // Returns:
463 // Nothing will be returned. But the content to which `square_diff` points
464 // will be modified.
compute_square_diff(const uint8_t * ref,const int ref_offset,const int ref_stride,const uint8_t * tgt,const int tgt_offset,const int tgt_stride,const int height,const int width,const int is_high_bitdepth,uint32_t * square_diff)465 static INLINE void compute_square_diff(const uint8_t *ref, const int ref_offset,
466 const int ref_stride, const uint8_t *tgt,
467 const int tgt_offset,
468 const int tgt_stride, const int height,
469 const int width,
470 const int is_high_bitdepth,
471 uint32_t *square_diff) {
472 const uint16_t *ref16 = CONVERT_TO_SHORTPTR(ref);
473 const uint16_t *tgt16 = CONVERT_TO_SHORTPTR(tgt);
474
475 int ref_idx = 0;
476 int tgt_idx = 0;
477 int idx = 0;
478 for (int i = 0; i < height; ++i) {
479 for (int j = 0; j < width; ++j) {
480 const uint16_t ref_value = is_high_bitdepth ? ref16[ref_offset + ref_idx]
481 : ref[ref_offset + ref_idx];
482 const uint16_t tgt_value = is_high_bitdepth ? tgt16[tgt_offset + tgt_idx]
483 : tgt[tgt_offset + tgt_idx];
484 const uint32_t diff = (ref_value > tgt_value) ? (ref_value - tgt_value)
485 : (tgt_value - ref_value);
486 square_diff[idx] = diff * diff;
487
488 ++ref_idx;
489 ++tgt_idx;
490 ++idx;
491 }
492 ref_idx += (ref_stride - width);
493 tgt_idx += (tgt_stride - width);
494 }
495 }
496
497 // Function to accumulate pixel-wise squared difference between two luma buffers
498 // to be consumed while filtering the chroma planes.
499 // Inputs:
500 // square_diff: Pointer to squared differences from luma plane.
501 // luma_sse_sum: Pointer to save the sum of luma squared differences.
502 // block_height: Height of block for computation.
503 // block_width: Width of block for computation.
504 // ss_x_shift: Chroma subsampling shift in 'X' direction
505 // ss_y_shift: Chroma subsampling shift in 'Y' direction
506 // Returns:
507 // Nothing will be returned. But the content to which `luma_sse_sum` points
508 // will be modified.
void compute_luma_sq_error_sum(uint32_t *square_diff, uint32_t *luma_sse_sum,
                               int block_height, int block_width,
                               int ss_x_shift, int ss_y_shift) {
  // Nothing to accumulate for an empty chroma block.
  if (block_height <= 0 || block_width <= 0) return;

  // Each chroma position covers a `cell_rows` x `cell_cols` patch of the luma
  // plane, whose row length is `luma_width`.
  const int cell_rows = 1 << ss_y_shift;
  const int cell_cols = 1 << ss_x_shift;
  const int luma_width = block_width << ss_x_shift;

  for (int row = 0; row < block_height; ++row) {
    const int luma_row = row << ss_y_shift;  // First luma row of the patch.
    for (int col = 0; col < block_width; ++col) {
      const int luma_col = col << ss_x_shift;  // First luma column.
      const int out_pos = row * block_width + col;

      // Add every luma squared difference in the patch onto the existing
      // value for this chroma position.
      for (int dy = 0; dy < cell_rows; ++dy) {
        const int line_start = (luma_row + dy) * luma_width + luma_col;
        for (int dx = 0; dx < cell_cols; ++dx) {
          luma_sse_sum[out_pos] += square_diff[line_start + dx];
        }
      }
    }
  }
}
525
526 /*!\endcond */
527 /*!\brief Applies temporal filtering. NOTE that there are various optimised
528 * versions of this function called where the appropriate instruction set is
529 * supported.
530 *
531 * \ingroup src_frame_proc
532 * \param[in] frame_to_filter Pointer to the frame to be filtered, which is
533 * used as reference to compute squared
534 * difference from the predictor.
535 * \param[in] mbd Pointer to the block for filtering, ONLY used
536 * to get subsampling information for the planes
537 * \param[in] block_size Size of the block
538 * \param[in] mb_row Row index of the block in the frame
539 * \param[in] mb_col Column index of the block in the frame
540 * \param[in] num_planes Number of planes in the frame
541 * \param[in] noise_levels Estimated noise levels for each plane
542 * in the frame (Y,U,V)
543 * \param[in] subblock_mvs Pointer to the motion vectors for 4 sub-blocks
544 * \param[in] subblock_mses Pointer to the search errors (MSE) for 4
545 * sub-blocks
546 * \param[in] q_factor Quantization factor. This is actually the `q`
547 * defined in libaom, converted from `qindex`
548 * \param[in] filter_strength Filtering strength. This value lies in range
549 * [0, 6] where 6 is the maximum strength.
 * \param[in]   pred            Pointer to the well-built predictors
551 * \param[out] accum Pointer to the pixel-wise accumulator for
552 * filtering
553 * \param[out] count Pointer to the pixel-wise counter for
554 * filtering
555 *
 * \remark Nothing returned, But the contents of `accum` and `count`
 * will be modified
558 */
av1_apply_temporal_filter_c(const YV12_BUFFER_CONFIG * frame_to_filter,const MACROBLOCKD * mbd,const BLOCK_SIZE block_size,const int mb_row,const int mb_col,const int num_planes,const double * noise_levels,const MV * subblock_mvs,const int * subblock_mses,const int q_factor,const int filter_strength,const uint8_t * pred,uint32_t * accum,uint16_t * count)559 void av1_apply_temporal_filter_c(
560 const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
561 const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
562 const int num_planes, const double *noise_levels, const MV *subblock_mvs,
563 const int *subblock_mses, const int q_factor, const int filter_strength,
564 const uint8_t *pred, uint32_t *accum, uint16_t *count) {
565 // Block information.
566 const int mb_height = block_size_high[block_size];
567 const int mb_width = block_size_wide[block_size];
568 const int mb_pels = mb_height * mb_width;
569 const int is_high_bitdepth = is_frame_high_bitdepth(frame_to_filter);
570 const uint16_t *pred16 = CONVERT_TO_SHORTPTR(pred);
571 // Frame information.
572 const int frame_height = frame_to_filter->y_crop_height;
573 const int frame_width = frame_to_filter->y_crop_width;
574 const int min_frame_size = AOMMIN(frame_height, frame_width);
575 // Variables to simplify combined error calculation.
576 const double inv_factor = 1.0 / ((TF_WINDOW_BLOCK_BALANCE_WEIGHT + 1) *
577 TF_SEARCH_ERROR_NORM_WEIGHT);
578 const double weight_factor =
579 (double)TF_WINDOW_BLOCK_BALANCE_WEIGHT * inv_factor;
580 // Decay factors for non-local mean approach.
581 double decay_factor[MAX_MB_PLANE] = { 0 };
582 // Adjust filtering based on q.
583 // Larger q -> stronger filtering -> larger weight.
584 // Smaller q -> weaker filtering -> smaller weight.
585 double q_decay = pow((double)q_factor / TF_Q_DECAY_THRESHOLD, 2);
586 q_decay = CLIP(q_decay, 1e-5, 1);
587 if (q_factor >= TF_QINDEX_CUTOFF) {
588 // Max q_factor is 255, therefore the upper bound of q_decay is 8.
589 // We do not need a clip here.
590 q_decay = 0.5 * pow((double)q_factor / 64, 2);
591 }
592 // Smaller strength -> smaller filtering weight.
593 double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2);
594 s_decay = CLIP(s_decay, 1e-5, 1);
595 for (int plane = 0; plane < num_planes; plane++) {
596 // Larger noise -> larger filtering weight.
597 const double n_decay = 0.5 + log(2 * noise_levels[plane] + 5.0);
598 decay_factor[plane] = 1 / (n_decay * q_decay * s_decay);
599 }
600 double d_factor[4] = { 0 };
601 for (int subblock_idx = 0; subblock_idx < 4; subblock_idx++) {
602 // Larger motion vector -> smaller filtering weight.
603 const MV mv = subblock_mvs[subblock_idx];
604 const double distance = sqrt(pow(mv.row, 2) + pow(mv.col, 2));
605 double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD;
606 distance_threshold = AOMMAX(distance_threshold, 1);
607 d_factor[subblock_idx] = distance / distance_threshold;
608 d_factor[subblock_idx] = AOMMAX(d_factor[subblock_idx], 1);
609 }
610
611 // Allocate memory for pixel-wise squared differences. They,
612 // regardless of the subsampling, are assigned with memory of size `mb_pels`.
613 uint32_t *square_diff = aom_memalign(16, mb_pels * sizeof(uint32_t));
614 if (!square_diff) {
615 aom_internal_error(mbd->error_info, AOM_CODEC_MEM_ERROR,
616 "Error allocating temporal filter data");
617 }
618 memset(square_diff, 0, mb_pels * sizeof(square_diff[0]));
619
620 // Allocate memory for accumulated luma squared error. This value will be
621 // consumed while filtering the chroma planes.
622 uint32_t *luma_sse_sum = aom_memalign(32, mb_pels * sizeof(uint32_t));
623 if (!luma_sse_sum) {
624 aom_free(square_diff);
625 aom_internal_error(mbd->error_info, AOM_CODEC_MEM_ERROR,
626 "Error allocating temporal filter data");
627 }
628 memset(luma_sse_sum, 0, mb_pels * sizeof(luma_sse_sum[0]));
629
630 // Get window size for pixel-wise filtering.
631 assert(TF_WINDOW_LENGTH % 2 == 1);
632 const int half_window = TF_WINDOW_LENGTH >> 1;
633
634 // Handle planes in sequence.
635 int plane_offset = 0;
636 for (int plane = 0; plane < num_planes; ++plane) {
637 // Locate pixel on reference frame.
638 const int subsampling_y = mbd->plane[plane].subsampling_y;
639 const int subsampling_x = mbd->plane[plane].subsampling_x;
640 const int h = mb_height >> subsampling_y; // Plane height.
641 const int w = mb_width >> subsampling_x; // Plane width.
642 const int frame_stride =
643 frame_to_filter->strides[plane == AOM_PLANE_Y ? 0 : 1];
644 const int frame_offset = mb_row * h * frame_stride + mb_col * w;
645 const uint8_t *ref = frame_to_filter->buffers[plane];
646 const int ss_y_shift =
647 subsampling_y - mbd->plane[AOM_PLANE_Y].subsampling_y;
648 const int ss_x_shift =
649 subsampling_x - mbd->plane[AOM_PLANE_Y].subsampling_x;
650 const int num_ref_pixels = TF_WINDOW_LENGTH * TF_WINDOW_LENGTH +
651 ((plane) ? (1 << (ss_x_shift + ss_y_shift)) : 0);
652 const double inv_num_ref_pixels = 1.0 / num_ref_pixels;
653
654 // Filter U-plane and V-plane using Y-plane. This is because motion
655 // search is only done on Y-plane, so the information from Y-plane will
656 // be more accurate. The luma sse sum is reused in both chroma planes.
657 if (plane == AOM_PLANE_U)
658 compute_luma_sq_error_sum(square_diff, luma_sse_sum, h, w, ss_x_shift,
659 ss_y_shift);
660 compute_square_diff(ref, frame_offset, frame_stride, pred, plane_offset, w,
661 h, w, is_high_bitdepth, square_diff);
662
663 // Perform filtering.
664 int pred_idx = 0;
665 for (int i = 0; i < h; ++i) {
666 for (int j = 0; j < w; ++j) {
667 // non-local mean approach
668 uint64_t sum_square_diff = 0;
669
670 for (int wi = -half_window; wi <= half_window; ++wi) {
671 for (int wj = -half_window; wj <= half_window; ++wj) {
672 const int y = CLIP(i + wi, 0, h - 1); // Y-coord on current plane.
673 const int x = CLIP(j + wj, 0, w - 1); // X-coord on current plane.
674 sum_square_diff += square_diff[y * w + x];
675 }
676 }
677
678 sum_square_diff += luma_sse_sum[i * w + j];
679
680 // Scale down the difference for high bit depth input.
681 if (mbd->bd > 8) sum_square_diff >>= ((mbd->bd - 8) * 2);
682
683 // Combine window error and block error, and normalize it.
684 const double window_error = sum_square_diff * inv_num_ref_pixels;
685 const int subblock_idx = (i >= h / 2) * 2 + (j >= w / 2);
686 const double block_error = (double)subblock_mses[subblock_idx];
687 const double combined_error =
688 weight_factor * window_error + block_error * inv_factor;
689
690 // Compute filter weight.
691 double scaled_error =
692 combined_error * d_factor[subblock_idx] * decay_factor[plane];
693 scaled_error = AOMMIN(scaled_error, 7);
694 const int weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE);
695
696 const int idx = plane_offset + pred_idx; // Index with plane shift.
697 const int pred_value = is_high_bitdepth ? pred16[idx] : pred[idx];
698 accum[idx] += weight * pred_value;
699 count[idx] += weight;
700
701 ++pred_idx;
702 }
703 }
704 plane_offset += h * w;
705 }
706
707 aom_free(square_diff);
708 aom_free(luma_sse_sum);
709 }
710 #if CONFIG_AV1_HIGHBITDEPTH
// C entry point for high bit-depth temporal filtering. This wrapper does no
// high bit-depth specific work of its own: it forwards every argument,
// unchanged and in the same order, to av1_apply_temporal_filter_c().
// NOTE(review): `pred` is typed `const uint8_t *` even on this path; the
// callee presumably reinterprets it for 16-bit samples -- confirm against
// av1_apply_temporal_filter_c().
void av1_highbd_apply_temporal_filter_c(
    const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
    const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
    const int num_planes, const double *noise_levels, const MV *subblock_mvs,
    const int *subblock_mses, const int q_factor, const int filter_strength,
    const uint8_t *pred, uint32_t *accum, uint16_t *count) {
  av1_apply_temporal_filter_c(frame_to_filter, mbd, block_size, mb_row, mb_col,
                              num_planes, noise_levels, subblock_mvs,
                              subblock_mses, q_factor, filter_strength, pred,
                              accum, count);
}
723 #endif // CONFIG_AV1_HIGHBITDEPTH
724 /*!\brief Normalizes the accumulated filtering result to produce the filtered
725 * frame
726 *
727 * \ingroup src_frame_proc
728 * \param[in] mbd Pointer to the block for filtering, which is
729 * ONLY used to get subsampling information for
730 * all the planes
731 * \param[in] block_size Size of the block
732 * \param[in] mb_row Row index of the block in the frame
733 * \param[in] mb_col Column index of the block in the frame
734 * \param[in] num_planes Number of planes in the frame
735 * \param[in] accum Pointer to the pre-computed accumulator
736 * \param[in] count Pointer to the pre-computed count
737 * \param[out] result_buffer Pointer to result buffer
738 *
 * \remark Nothing is returned, but the frame data that `result_buffer` points
 * to will be modified
741 */
static void tf_normalize_filtered_frame(
    const MACROBLOCKD *mbd, const BLOCK_SIZE block_size, const int mb_row,
    const int mb_col, const int num_planes, const uint32_t *accum,
    const uint16_t *count, YV12_BUFFER_CONFIG *result_buffer) {
  const int is_high_bitdepth = is_frame_high_bitdepth(result_buffer);
  const int block_h = block_size_high[block_size];
  const int block_w = block_size_wide[block_size];

  // `accum` and `count` hold the planes of one block back to back, so a
  // single running index that is never reset walks all of them in order.
  int acc_idx = 0;
  for (int plane = 0; plane < num_planes; ++plane) {
    const int rows = block_h >> mbd->plane[plane].subsampling_y;
    const int cols = block_w >> mbd->plane[plane].subsampling_x;
    // Stride slot 0 is used for plane 0, slot 1 for every other plane.
    const int stride = result_buffer->strides[plane == 0 ? 0 : 1];
    uint8_t *const dst8 = result_buffer->buffers[plane];
    uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst8);

    // Frame index of the first pixel of the current block row; starts at the
    // block's top-left pixel and advances by one stride per row.
    int row_base = mb_row * rows * stride + mb_col * cols;
    for (int r = 0; r < rows; ++r, row_base += stride) {
      for (int c = 0; c < cols; ++c, ++acc_idx) {
        const int dst_idx = row_base + c;
        // Adding half of the divisor makes the division round to nearest.
        const uint16_t half = count[acc_idx] >> 1;
        if (is_high_bitdepth) {
          dst16[dst_idx] =
              (uint16_t)OD_DIVU(accum[acc_idx] + half, count[acc_idx]);
        } else {
          dst8[dst_idx] =
              (uint8_t)OD_DIVU(accum[acc_idx] + half, count[acc_idx]);
        }
      }
    }
  }
}
780
av1_get_q(const AV1_COMP * cpi)781 int av1_get_q(const AV1_COMP *cpi) {
782 const GF_GROUP *gf_group = &cpi->ppi->gf_group;
783 const FRAME_TYPE frame_type = gf_group->frame_type[cpi->gf_frame_index];
784 const int q =
785 (int)av1_convert_qindex_to_q(cpi->ppi->p_rc.avg_frame_qindex[frame_type],
786 cpi->common.seq_params->bit_depth);
787 return q;
788 }
789
// Runs temporal filtering for one row (`mb_row`) of TF_BLOCK_SIZE blocks,
// using the configuration stored in cpi->tf_ctx and the scratch buffers in
// td->tf_data. For every block in the row:
//   1. accum/count are cleared;
//   2. each non-NULL frame in tf_ctx->frames contributes to accum/count
//      (the to-filter frame via tf_apply_temporal_filter_self(), every other
//      frame via motion search + prediction + weighted filtering);
//   3. the result is normalized into tf_ctx->output_frame;
//   4. if tf_ctx->compute_frame_diff is set, luma difference statistics are
//      accumulated into td->tf_data.diff.
void av1_tf_do_filtering_row(AV1_COMP *cpi, ThreadData *td, int mb_row) {
  TemporalFilterCtx *tf_ctx = &cpi->tf_ctx;
  YV12_BUFFER_CONFIG **frames = tf_ctx->frames;
  const int num_frames = tf_ctx->num_frames;
  const int filter_frame_idx = tf_ctx->filter_frame_idx;
  const int compute_frame_diff = tf_ctx->compute_frame_diff;
  const struct scale_factors *scale = &tf_ctx->sf;
  const double *noise_levels = tf_ctx->noise_levels;
  const int num_pels = tf_ctx->num_pels;
  const int q_factor = tf_ctx->q_factor;
  const BLOCK_SIZE block_size = TF_BLOCK_SIZE;
  const YV12_BUFFER_CONFIG *const frame_to_filter = frames[filter_frame_idx];
  MACROBLOCK *const mb = &td->mb;
  MACROBLOCKD *const mbd = &mb->e_mbd;
  TemporalFilterData *const tf_data = &td->tf_data;
  const int mb_height = block_size_high[block_size];
  const int mb_width = block_size_wide[block_size];
  const int mi_h = mi_size_high_log2[block_size];
  const int mi_w = mi_size_wide_log2[block_size];
  const int num_planes = av1_num_planes(&cpi->common);
  uint32_t *accum = tf_data->accum;
  uint16_t *count = tf_data->count;
  uint8_t *pred = tf_data->pred;

  // Factor to control the filtering strength.
  const int filter_strength = cpi->oxcf.algo_cfg.arnr_strength;

  // Do filtering.
  FRAME_DIFF *diff = &td->tf_data.diff;
  // The row limits in mb->mv_limits depend only on `mb_row`, so they are set
  // once here; the column limits are refreshed for every block below.
  av1_set_mv_row_limits(&cpi->common.mi_params, &mb->mv_limits,
                        (mb_row << mi_h), (mb_height >> MI_SIZE_LOG2),
                        cpi->oxcf.border_in_pixels);
  for (int mb_col = 0; mb_col < tf_ctx->mb_cols; mb_col++) {
    av1_set_mv_col_limits(&cpi->common.mi_params, &mb->mv_limits,
                          (mb_col << mi_w), (mb_width >> MI_SIZE_LOG2),
                          cpi->oxcf.border_in_pixels);
    // Start every block with empty accumulators (all planes, num_pels
    // entries each).
    memset(accum, 0, num_pels * sizeof(accum[0]));
    memset(count, 0, num_pels * sizeof(count[0]));
    MV ref_mv = kZeroMv;  // Reference motion vector passed down along frames.
    // Perform temporal filtering frame by frame.
    for (int frame = 0; frame < num_frames; frame++) {
      if (frames[frame] == NULL) continue;

      // Motion search.
      // Defaults: zero motion and "worst possible" error for all four
      // sub-blocks; tf_motion_search() receives both arrays as outputs.
      MV subblock_mvs[4] = { kZeroMv, kZeroMv, kZeroMv, kZeroMv };
      int subblock_mses[4] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX };
      if (frame == filter_frame_idx) {  // Frame to be filtered.
        // Change ref_mv sign for following frames.
        ref_mv.row *= -1;
        ref_mv.col *= -1;
      } else {  // Other reference frames.
        tf_motion_search(cpi, mb, frame_to_filter, frames[frame], block_size,
                         mb_row, mb_col, &ref_mv, subblock_mvs, subblock_mses);
      }

      // Perform weighted averaging.
      if (frame == filter_frame_idx) {  // Frame to be filtered.
        tf_apply_temporal_filter_self(frames[frame], mbd, block_size, mb_row,
                                      mb_col, num_planes, accum, count);
      } else {  // Other reference frames.
        tf_build_predictor(frames[frame], mbd, block_size, mb_row, mb_col,
                           num_planes, scale, subblock_mvs, pred);

        // All variants of av1_apply_temporal_filter() contain floating point
        // operations. Hence, clear the system state.
        // NOTE(review): no state-clearing call follows this remark in the
        // code below; the remark looks stale -- confirm and remove if so.

        // TODO(any): avx2/sse2 version should be changed to align with C
        // function before using. In particular, current avx2/sse2 function
        // only supports 32x32 block size and 5x5 filtering window.
        //
        // Dispatch: av1_highbd_apply_temporal_filter() /
        // av1_apply_temporal_filter() are only called for the 32x32 block,
        // 5x5 window configuration; every other configuration, and the high
        // bit-depth path when CONFIG_AV1_HIGHBITDEPTH is off, goes to
        // av1_apply_temporal_filter_c().
        if (is_frame_high_bitdepth(frame_to_filter)) {  // for high bit-depth
#if CONFIG_AV1_HIGHBITDEPTH
          if (TF_BLOCK_SIZE == BLOCK_32X32 && TF_WINDOW_LENGTH == 5) {
            av1_highbd_apply_temporal_filter(
                frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes,
                noise_levels, subblock_mvs, subblock_mses, q_factor,
                filter_strength, pred, accum, count);
          } else {
#endif  // CONFIG_AV1_HIGHBITDEPTH
            av1_apply_temporal_filter_c(
                frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes,
                noise_levels, subblock_mvs, subblock_mses, q_factor,
                filter_strength, pred, accum, count);
#if CONFIG_AV1_HIGHBITDEPTH
          }
#endif            // CONFIG_AV1_HIGHBITDEPTH
        } else {  // for 8-bit
          if (TF_BLOCK_SIZE == BLOCK_32X32 && TF_WINDOW_LENGTH == 5) {
            av1_apply_temporal_filter(frame_to_filter, mbd, block_size, mb_row,
                                      mb_col, num_planes, noise_levels,
                                      subblock_mvs, subblock_mses, q_factor,
                                      filter_strength, pred, accum, count);
          } else {
            av1_apply_temporal_filter_c(
                frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes,
                noise_levels, subblock_mvs, subblock_mses, q_factor,
                filter_strength, pred, accum, count);
          }
        }
      }
    }
    // Turn the accumulated weighted sums into pixel values of the output
    // frame.
    tf_normalize_filtered_frame(mbd, block_size, mb_row, mb_col, num_planes,
                                accum, count, tf_ctx->output_frame);

    if (compute_frame_diff) {
      // Compare the luma block of the source frame against the same block of
      // the filtered output.
      const int y_height = mb_height >> mbd->plane[0].subsampling_y;
      const int y_width = mb_width >> mbd->plane[0].subsampling_x;
      const int source_y_stride = frame_to_filter->y_stride;
      const int filter_y_stride = tf_ctx->output_frame->y_stride;
      const int source_offset =
          mb_row * y_height * source_y_stride + mb_col * y_width;
      const int filter_offset =
          mb_row * y_height * filter_y_stride + mb_col * y_width;
      unsigned int sse = 0;
      // The return value of vf() is deliberately unused; only the `sse`
      // output parameter feeds the statistics.
      cpi->ppi->fn_ptr[block_size].vf(
          frame_to_filter->y_buffer + source_offset, source_y_stride,
          tf_ctx->output_frame->y_buffer + filter_offset, filter_y_stride,
          &sse);
      // diff->sum accumulates the per-block SSE and diff->sse its square
      // (widened to 64 bits before multiplying to avoid 32-bit wrap-around).
      diff->sum += sse;
      diff->sse += sse * (int64_t)sse;
    }
  }
}
913
914 /*!\brief Does temporal filter for a given frame.
915 *
916 * \ingroup src_frame_proc
917 * \param[in] cpi Top level encoder instance structure
918 *
 * \remark Nothing will be returned, but the contents of cpi->td.tf_data.diff
 * will be modified.
921 */
static void tf_do_filtering(AV1_COMP *cpi) {
  ThreadData *const thread_data = &cpi->td;
  MACROBLOCKD *const xd = &thread_data->mb.e_mbd;
  const int num_planes = av1_num_planes(&cpi->common);
  assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);

  // tf_save_state() / tf_restore_state() bracket the whole filtering pass.
  // NOTE(review): presumably they preserve the mode-info pointer and the
  // plane buffers of `xd` that tf_setup_macroblockd() repoints -- confirm
  // against their definitions.
  MB_MODE_INFO **saved_mode_info;
  uint8_t *saved_planes[MAX_MB_PLANE];
  tf_save_state(xd, &saved_mode_info, saved_planes, num_planes);
  tf_setup_macroblockd(xd, &thread_data->tf_data, &cpi->tf_ctx.sf);

  // Single-threaded path: filter the block rows one after another.
  int row = 0;
  while (row < cpi->tf_ctx.mb_rows) {
    av1_tf_do_filtering_row(cpi, thread_data, row);
    ++row;
  }

  tf_restore_state(xd, saved_mode_info, saved_planes, num_planes);
}
942
/*!\brief Sets up the frame buffer for temporal filtering. This function
 * determines how many frames will be used for temporal filtering and then
 * groups them into a buffer. This function will also estimate the noise level
 * of the to-filter frame.
947 *
948 * \ingroup src_frame_proc
949 * \param[in] cpi Top level encoder instance structure
950 * \param[in] filter_frame_lookahead_idx The index of the to-filter frame
951 * in the lookahead buffer cpi->lookahead
952 * \param[in] gf_frame_index GOP index
953 *
954 * \remark Nothing will be returned. But the fields `frames`, `num_frames`,
955 * `filter_frame_idx` and `noise_levels` will be updated in cpi->tf_ctx.
956 */
static void tf_setup_filtering_buffer(AV1_COMP *cpi,
                                      int filter_frame_lookahead_idx,
                                      int gf_frame_index) {
  const GF_GROUP *gf_group = &cpi->ppi->gf_group;
  const FRAME_UPDATE_TYPE update_type = gf_group->update_type[gf_frame_index];
  const FRAME_TYPE frame_type = gf_group->frame_type[gf_frame_index];
  const int is_forward_keyframe =
      av1_gop_check_forward_keyframe(gf_group, gf_frame_index);

  TemporalFilterCtx *tf_ctx = &cpi->tf_ctx;
  YV12_BUFFER_CONFIG **frames = tf_ctx->frames;
  // Number of frames used for filtering. Set `arnr_max_frames` as 1 to disable
  // temporal filtering.
  int num_frames = AOMMAX(cpi->oxcf.algo_cfg.arnr_max_frames, 1);
  int num_before = 0;  // Number of filtering frames before the to-filter frame.
  int num_after = 0;   // Number of filtering frames after the to-filter frame.
  const int lookahead_depth =
      av1_lookahead_depth(cpi->ppi->lookahead, cpi->compressor_stage);

  // Temporal filtering should not go beyond key frames
  const int key_to_curframe =
      AOMMAX(cpi->rc.frames_since_key + filter_frame_lookahead_idx, 0);
  const int curframe_to_key =
      AOMMAX(cpi->rc.frames_to_key - filter_frame_lookahead_idx - 1, 0);

  // Number of buffered frames before the to-filter frame.
  int max_before = AOMMIN(filter_frame_lookahead_idx, key_to_curframe);

  // Number of buffered frames after the to-filter frame.
  int max_after =
      AOMMIN(lookahead_depth - filter_frame_lookahead_idx - 1, curframe_to_key);

  // Estimate noises for each plane.
  const struct lookahead_entry *to_filter_buf = av1_lookahead_peek(
      cpi->ppi->lookahead, filter_frame_lookahead_idx, cpi->compressor_stage);
  assert(to_filter_buf != NULL);
  const YV12_BUFFER_CONFIG *to_filter_frame = &to_filter_buf->img;
  const int num_planes = av1_num_planes(&cpi->common);
  double *noise_levels = tf_ctx->noise_levels;
  for (int plane = 0; plane < num_planes; ++plane) {
    noise_levels[plane] = av1_estimate_noise_from_single_plane(
        to_filter_frame, plane, cpi->common.seq_params->bit_depth,
        NOISE_ESTIMATION_EDGE_THRESHOLD);
  }
  // Get quantization factor.
  const int q = av1_get_q(cpi);
  // Get correlation estimates from first-pass;
  // The stats pointer is moved back by one entry when frames_since_key is 0.
  // NOTE(review): presumably because stats_in has already advanced past the
  // key frame's own entry at that point -- confirm against the caller.
  const FIRSTPASS_STATS *stats =
      cpi->twopass_frame.stats_in - (cpi->rc.frames_since_key == 0);
  double accu_coeff0 = 1.0, accu_coeff1 = 1.0;
  // Forward direction: multiply the correlation coefficients (floored at
  // 0.001) of the frames after the to-filter frame. If the stats buffer ends
  // early, `max_after` is shortened to the number of frames actually covered.
  for (int i = 1; i <= max_after; i++) {
    if (stats + filter_frame_lookahead_idx + i >=
        cpi->ppi->twopass.stats_buf_ctx->stats_in_end) {
      max_after = i - 1;
      break;
    }
    accu_coeff1 *=
        AOMMAX(stats[filter_frame_lookahead_idx + i].cor_coeff, 0.001);
  }
  // Convert the product into a geometric mean over `max_after` frames.
  if (max_after >= 1) {
    accu_coeff1 = pow(accu_coeff1, 1.0 / (double)max_after);
  }
  // Backward direction: same computation over the frames before the to-filter
  // frame, shortening `max_before` if the start of the stats buffer is hit.
  for (int i = 1; i <= max_before; i++) {
    if (stats + filter_frame_lookahead_idx - i + 1 <=
        cpi->ppi->twopass.stats_buf_ctx->stats_in_start) {
      max_before = i - 1;
      break;
    }
    accu_coeff0 *=
        AOMMAX(stats[filter_frame_lookahead_idx - i + 1].cor_coeff, 0.001);
  }
  if (max_before >= 1) {
    accu_coeff0 = pow(accu_coeff0, 1.0 / (double)max_before);
  }

  // Adjust number of filtering frames based on quantization factor. When the
  // quantization factor is small enough (lossless compression), we will not
  // change the number of frames for key frame filtering, which is to avoid
  // visual quality drop.
  int adjust_num = 6;
  if (num_frames == 1) {  // `arnr_max_frames = 1` is used to disable filtering.
    adjust_num = 0;
  } else if ((update_type == KF_UPDATE) && q <= 10) {
    adjust_num = 0;
  }
  num_frames = AOMMIN(num_frames + adjust_num, lookahead_depth);

  if (frame_type == KEY_FRAME) {
    // Key frames only look backwards when they are forward key frames.
    num_before = AOMMIN(is_forward_keyframe ? num_frames / 2 : 0, max_before);
    num_after = AOMMIN(num_frames - 1, max_after);
  } else {
    int gfu_boost = av1_calc_arf_boost(&cpi->ppi->twopass, &cpi->twopass_frame,
                                       &cpi->ppi->p_rc, &cpi->frame_info,
                                       filter_frame_lookahead_idx, max_before,
                                       max_after, NULL, NULL, 0);

    // Cap the frame count at one frame per 150 units of boost.
    num_frames = AOMMIN(num_frames, gfu_boost / 150);
    num_frames += !(num_frames & 1);  // Make the number odd.

    // Limit the number of frames if noise levels are low and high quantizers.
    if (noise_levels[AOM_PLANE_Y] < 1.9 && cpi->ppi->p_rc.arf_q > 40)
      num_frames = AOMMIN(num_frames, cpi->sf.hl_sf.num_frames_used_in_tf);

    // Only use 2 neighbours for the second ARF.
    if (update_type == INTNL_ARF_UPDATE) num_frames = AOMMIN(num_frames, 3);
    if (AOMMIN(max_after, max_before) >= num_frames / 2) {
      // just use half half
      num_before = num_frames / 2;
      num_after = num_frames / 2;
    } else {
      // One side is short of frames: take everything available on the short
      // side and give the remaining budget to the other side.
      if (max_after < num_frames / 2) {
        num_after = max_after;
        num_before = AOMMIN(num_frames - 1 - num_after, max_before);
      } else {
        num_before = max_before;
        num_after = AOMMIN(num_frames - 1 - num_before, max_after);
      }
      // Adjust asymmetry based on frame-level correlation
      // `insym` bounds how many more frames the longer side may use than the
      // shorter side. It grows as the correlation on the shorter side
      // approaches 1 and is at most 0.4 / 0.01 = 40.
      if (max_after > 0 && max_before > 0) {
        if (num_after < num_before) {
          const int insym = (int)(0.4 / AOMMAX(1 - accu_coeff1, 0.01));
          num_before = AOMMIN(num_before, num_after + insym);
        } else {
          const int insym = (int)(0.4 / AOMMAX(1 - accu_coeff0, 0.01));
          num_after = AOMMIN(num_after, num_before + insym);
        }
      }
    }
  }
  num_frames = num_before + 1 + num_after;

  // Setup the frame buffer.
  // frames[0] is the earliest frame; the to-filter frame ends up at index
  // `num_before`.
  for (int frame = 0; frame < num_frames; ++frame) {
    const int lookahead_idx = frame - num_before + filter_frame_lookahead_idx;
    struct lookahead_entry *buf = av1_lookahead_peek(
        cpi->ppi->lookahead, lookahead_idx, cpi->compressor_stage);
    assert(buf != NULL);
    frames[frame] = &buf->img;
  }
  tf_ctx->num_frames = num_frames;
  tf_ctx->filter_frame_idx = num_before;
  assert(frames[tf_ctx->filter_frame_idx] == to_filter_frame);

  // Point the encoder's macroblock at the to-filter frame and set up its
  // per-plane subsampling from the sequence parameters.
  av1_setup_src_planes(&cpi->td.mb, &to_filter_buf->img, 0, 0, num_planes,
                       cpi->common.seq_params->sb_size);
  av1_setup_block_planes(&cpi->td.mb.e_mbd,
                         cpi->common.seq_params->subsampling_x,
                         cpi->common.seq_params->subsampling_y, num_planes);
}
1106
1107 /*!\cond */
1108
// A constant number, sqrt(pi / 2), used for noise estimation. It is the final
// scale factor applied to the averaged Laplacian response returned by
// av1_estimate_noise_from_single_plane().
static const double SQRT_PI_BY_2 = 1.25331413732;
1111
av1_estimate_noise_from_single_plane(const YV12_BUFFER_CONFIG * frame,const int plane,const int bit_depth,const int edge_thresh)1112 double av1_estimate_noise_from_single_plane(const YV12_BUFFER_CONFIG *frame,
1113 const int plane,
1114 const int bit_depth,
1115 const int edge_thresh) {
1116 const int is_y_plane = (plane == 0);
1117 const int height = frame->crop_heights[is_y_plane ? 0 : 1];
1118 const int width = frame->crop_widths[is_y_plane ? 0 : 1];
1119 const int stride = frame->strides[is_y_plane ? 0 : 1];
1120 const uint8_t *src = frame->buffers[plane];
1121 const uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
1122 const int is_high_bitdepth = is_frame_high_bitdepth(frame);
1123
1124 int64_t accum = 0;
1125 int count = 0;
1126 for (int i = 1; i < height - 1; ++i) {
1127 for (int j = 1; j < width - 1; ++j) {
1128 // Setup a small 3x3 matrix.
1129 const int center_idx = i * stride + j;
1130 int mat[3][3];
1131 for (int ii = -1; ii <= 1; ++ii) {
1132 for (int jj = -1; jj <= 1; ++jj) {
1133 const int idx = center_idx + ii * stride + jj;
1134 mat[ii + 1][jj + 1] = is_high_bitdepth ? src16[idx] : src[idx];
1135 }
1136 }
1137 // Compute sobel gradients.
1138 const int Gx = (mat[0][0] - mat[0][2]) + (mat[2][0] - mat[2][2]) +
1139 2 * (mat[1][0] - mat[1][2]);
1140 const int Gy = (mat[0][0] - mat[2][0]) + (mat[0][2] - mat[2][2]) +
1141 2 * (mat[0][1] - mat[2][1]);
1142 const int Ga = ROUND_POWER_OF_TWO(abs(Gx) + abs(Gy), bit_depth - 8);
1143 // Accumulate Laplacian.
1144 if (Ga < edge_thresh) { // Only count smooth pixels.
1145 const int v = 4 * mat[1][1] -
1146 2 * (mat[0][1] + mat[2][1] + mat[1][0] + mat[1][2]) +
1147 (mat[0][0] + mat[0][2] + mat[2][0] + mat[2][2]);
1148 accum += ROUND_POWER_OF_TWO(abs(v), bit_depth - 8);
1149 ++count;
1150 }
1151 }
1152 }
1153
1154 // Return -1.0 (unreliable estimation) if there are too few smooth pixels.
1155 return (count < 16) ? -1.0 : (double)accum / (6 * count) * SQRT_PI_BY_2;
1156 }
1157
// Initializes the members of TemporalFilterCtx
// Inputs:
//   cpi: Top level encoder instance structure
//   filter_frame_lookahead_idx: The index of the frame to be filtered in the
//     lookahead buffer cpi->ppi->lookahead.
//   gf_frame_index: GOP index of the frame to be filtered.
//   compute_frame_diff: If 1, the difference between the filtered frame and
//     the original frame is computed during filtering.
//   output_frame: Buffer that receives the filtered frame.
// Returns:
//   Nothing will be returned. But the contents of cpi->tf_ctx will be modified.
static void init_tf_ctx(AV1_COMP *cpi, int filter_frame_lookahead_idx,
                        int gf_frame_index, int compute_frame_diff,
                        YV12_BUFFER_CONFIG *output_frame) {
  TemporalFilterCtx *const ctx = &cpi->tf_ctx;

  // Reset the frame selection, record the caller's output settings, then let
  // tf_setup_filtering_buffer() choose the frames to filter with.
  ctx->num_frames = 0;
  ctx->filter_frame_idx = -1;
  ctx->output_frame = output_frame;
  ctx->compute_frame_diff = compute_frame_diff;
  tf_setup_filtering_buffer(cpi, filter_frame_lookahead_idx, gf_frame_index);
  assert(ctx->num_frames > 0);
  assert(ctx->filter_frame_idx < ctx->num_frames);

  // Scaling between the filtering frames is not supported, so the scale
  // factors map the first frame's cropped luma size onto itself. The ARF is
  // produced at the native frame size and resized when coded.
  const YV12_BUFFER_CONFIG *const first_frame = ctx->frames[0];
  av1_setup_scale_factors_for_frame(
      &ctx->sf, first_frame->y_crop_width, first_frame->y_crop_height,
      first_frame->y_crop_width, first_frame->y_crop_height);

  // Block grid covering the cropped luma area of the frame to be filtered.
  const YV12_BUFFER_CONFIG *const target = ctx->frames[ctx->filter_frame_idx];
  const int block_w = block_size_wide[TF_BLOCK_SIZE];
  const int block_h = block_size_high[TF_BLOCK_SIZE];
  const int rows = get_num_blocks(target->y_crop_height, block_h);
  const int cols = get_num_blocks(target->y_crop_width, block_w);
  const int highbd = is_frame_high_bitdepth(target);

  // Number of samples one block occupies across all planes, taking each
  // plane's subsampling into account.
  const MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
  const int plane_count = av1_num_planes(&cpi->common);
  const int luma_pels = block_w * block_h;
  int total_pels = 0;
  for (int p = 0; p < plane_count; ++p) {
    const int ss_shift = xd->plane[p].subsampling_x + xd->plane[p].subsampling_y;
    total_pels += luma_pels >> ss_shift;
  }

  ctx->num_pels = total_pels;
  ctx->mb_rows = rows;
  ctx->mb_cols = cols;
  ctx->is_highbitdepth = highbd;
  ctx->q_factor = av1_get_q(cpi);
}
1215
// Decides from the accumulated block statistics in `frame_diff` whether the
// filtered frame passes the similarity test against its source. Returns 1
// when the per-block mean of `frame_diff->sum` is below 0.7 * ac_q_step^2 and
// the corresponding standard deviation is below 1.2 times that mean;
// returns 0 otherwise.
int av1_check_show_filtered_frame(const YV12_BUFFER_CONFIG *frame,
                                  const FRAME_DIFF *frame_diff, int q_index,
                                  aom_bit_depth_t bit_depth) {
  // Number of TF blocks covering the cropped luma plane (at least 1 so the
  // divisions below are always defined).
  const int rows = get_num_blocks(frame->y_crop_height,
                                  block_size_high[TF_BLOCK_SIZE]);
  const int cols =
      get_num_blocks(frame->y_crop_width, block_size_wide[TF_BLOCK_SIZE]);
  const int num_mbs = AOMMAX(1, rows * cols);

  // Per-block mean and standard deviation of the recorded differences.
  const float mean = (float)frame_diff->sum / num_mbs;
  const float std = (float)sqrt((float)frame_diff->sse / num_mbs - mean * mean);

  // The acceptance threshold scales with the square of the AC quantizer step.
  const int ac_q_step = av1_ac_quant_QTX(q_index, 0, bit_depth);
  const float threshold = 0.7f * ac_q_step * ac_q_step;

  return (mean < threshold && std < mean * 1.2) ? 1 : 0;
}
1237
// Temporally filters the lookahead frame at `filter_frame_lookahead_idx` and
// writes the result to `output_frame`. When `frame_diff` is non-NULL it also
// receives the difference statistics gathered while filtering.
void av1_temporal_filter(AV1_COMP *cpi, const int filter_frame_lookahead_idx,
                         int gf_frame_index, FRAME_DIFF *frame_diff,
                         YV12_BUFFER_CONFIG *output_frame) {
  // TODO(anyone): Currently, we enforce the filtering strength on internal
  // ARFs except the second ARF to be zero. We should investigate in which case
  // it is more beneficial to use non-zero strength filtering.
  // Only parallel level 0 frames go through temporal filtering.
  assert(cpi->ppi->gf_group.frame_parallel_level[gf_frame_index] == 0);

  TemporalFilterCtx *const ctx = &cpi->tf_ctx;
  TemporalFilterData *const data = &cpi->td.tf_data;
  const int want_diff = (frame_diff != NULL);

  // Select the frames to filter with and fill in the filter context.
  init_tf_ctx(cpi, filter_frame_lookahead_idx, gf_frame_index, want_diff,
              output_frame);

  // Scratch buffers sized for one block across all planes.
  const int highbd = ctx->is_highbitdepth;
  if (!tf_alloc_and_reset_data(data, ctx->num_pels, highbd)) {
    aom_internal_error(cpi->common.error, AOM_CODEC_MEM_ERROR,
                       "Error allocating temporal filter data");
  }

  // Use the single-threaded path unless more than one worker is available.
  if (cpi->mt_info.num_workers <= 1) {
    tf_do_filtering(cpi);
  } else {
    av1_tf_do_filtering_mt(cpi);
  }

  // Hand the statistics to the caller before the scratch data is released.
  if (want_diff) *frame_diff = data->diff;
  tf_dealloc_data(data, highbd);
}
1275
av1_is_temporal_filter_on(const AV1EncoderConfig * oxcf)1276 int av1_is_temporal_filter_on(const AV1EncoderConfig *oxcf) {
1277 return oxcf->algo_cfg.arnr_max_frames > 0 && oxcf->gf_cfg.lag_in_frames > 1;
1278 }
1279
// Records in `tf_info` whether temporal filtering is enabled for this encoder
// configuration and, if it is, (re)allocates the TF_INFO_BUF_COUNT output
// frame buffers at the configured frame dimensions. Allocation failure is
// reported through aom_internal_error() with AOM_CODEC_MEM_ERROR.
void av1_tf_info_alloc(TEMPORAL_FILTER_INFO *tf_info, const AV1_COMP *cpi) {
  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
  tf_info->is_temporal_filter_on = av1_is_temporal_filter_on(oxcf);
  if (!tf_info->is_temporal_filter_on) return;

  const AV1_COMMON *const cm = &cpi->common;
  const SequenceHeader *const seq = cm->seq_params;
  for (int slot = 0; slot < TF_INFO_BUF_COUNT; ++slot) {
    const int err = aom_realloc_frame_buffer(
        &tf_info->tf_buf[slot], oxcf->frm_dim_cfg.width,
        oxcf->frm_dim_cfg.height, seq->subsampling_x, seq->subsampling_y,
        seq->use_highbitdepth, cpi->oxcf.border_in_pixels,
        cm->features.byte_alignment, NULL, NULL, NULL,
        cpi->oxcf.tool_cfg.enable_global_motion, 0);
    if (err) {
      aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
                         "Failed to allocate tf_info");
    }
  }
}
1301
// Releases the frame buffers owned by `tf_info`: every entry of `tf_buf` and
// `tf_buf_second_arf`. Nothing is freed when temporal filtering was recorded
// as off.
void av1_tf_info_free(TEMPORAL_FILTER_INFO *tf_info) {
  if (tf_info->is_temporal_filter_on) {
    for (int slot = 0; slot < TF_INFO_BUF_COUNT; ++slot) {
      YV12_BUFFER_CONFIG *const buf = &tf_info->tf_buf[slot];
      aom_free_frame_buffer(buf);
    }
    aom_free_frame_buffer(&tf_info->tf_buf_second_arf);
  }
}
1309
// Clears the bookkeeping that describes which filtered buffers are usable:
// the validity flags, the GOP index each buffer was produced for, and the
// lookahead index each buffer corresponds to. The frame buffers themselves
// are not touched. av1_tf_info_filtering() regenerates a buffer whose
// validity flag is zero.
void av1_tf_info_reset(TEMPORAL_FILTER_INFO *tf_info) {
  av1_zero(tf_info->tf_buf_valid);
  av1_zero(tf_info->tf_buf_gf_index);
  av1_zero(tf_info->tf_buf_display_index_offset);
}
1315
// Produces the temporally filtered frames needed by `gf_group`. Only entries
// whose update type is KF_UPDATE or ARF_UPDATE are filtered; the result goes
// to buffer slot 1 when the entry is an INTER_FRAME and to slot 0 otherwise.
//
// This function is designed to be called multiple times after
// av1_tf_info_reset(). It will only generate the filtered frame that does
// not exist yet.
void av1_tf_info_filtering(TEMPORAL_FILTER_INFO *tf_info, AV1_COMP *cpi,
                           const GF_GROUP *gf_group) {
  if (!tf_info->is_temporal_filter_on) return;

  const AV1_COMMON *const cm = &cpi->common;
  for (int gf_index = 0; gf_index < gf_group->size; ++gf_index) {
    const int update_type = gf_group->update_type[gf_index];
    if (update_type != KF_UPDATE && update_type != ARF_UPDATE) continue;

    const int slot = (gf_group->frame_type[gf_index] == INTER_FRAME);
    const int lookahead_idx =
        gf_group->arf_src_offset[gf_index] + gf_group->cur_frame_idx[gf_index];

    // Skip the work when the slot already holds the filtered version of this
    // very lookahead frame.
    const int already_filtered =
        tf_info->tf_buf_valid[slot] != 0 &&
        tf_info->tf_buf_display_index_offset[slot] == lookahead_idx;
    if (already_filtered) continue;

    YV12_BUFFER_CONFIG *const filtered = &tf_info->tf_buf[slot];
    av1_temporal_filter(cpi, lookahead_idx, gf_index,
                        &tf_info->frame_diff[slot], filtered);
    aom_extend_frame_borders(filtered, av1_num_planes(cm));

    // Remember what the slot now contains.
    tf_info->tf_buf_gf_index[slot] = gf_index;
    tf_info->tf_buf_display_index_offset[slot] = lookahead_idx;
    tf_info->tf_buf_valid[slot] = 1;
  }
}
1342
av1_tf_info_get_filtered_buf(TEMPORAL_FILTER_INFO * tf_info,int gf_index,FRAME_DIFF * frame_diff)1343 YV12_BUFFER_CONFIG *av1_tf_info_get_filtered_buf(TEMPORAL_FILTER_INFO *tf_info,
1344 int gf_index,
1345 FRAME_DIFF *frame_diff) {
1346 if (tf_info->is_temporal_filter_on == 0) return NULL;
1347 YV12_BUFFER_CONFIG *out_buf = NULL;
1348 for (int i = 0; i < TF_INFO_BUF_COUNT; ++i) {
1349 if (tf_info->tf_buf_valid[i] && tf_info->tf_buf_gf_index[i] == gf_index) {
1350 out_buf = &tf_info->tf_buf[i];
1351 *frame_diff = tf_info->frame_diff[i];
1352 }
1353 }
1354 return out_buf;
1355 }
1356 /*!\endcond */
1357