• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 
11  */
12 
13 #include <assert.h>
14 #include <limits.h>
15 #include <math.h>
16 #include <stdio.h>
17 
18 #include "config/aom_dsp_rtcd.h"
19 #include "config/av1_rtcd.h"
20 
21 #include "aom_dsp/aom_dsp_common.h"
22 #include "aom_dsp/txfm_common.h"
23 #include "aom_ports/mem.h"
24 
25 #include "av1/common/blockd.h"
26 #include "av1/common/mvref_common.h"
27 #include "av1/common/pred_common.h"
28 #include "av1/common/reconinter.h"
29 #include "av1/common/reconintra.h"
30 
31 #include "av1/encoder/encodemv.h"
32 #include "av1/encoder/encoder.h"
33 #include "av1/encoder/intra_mode_search.h"
34 #include "av1/encoder/model_rd.h"
35 #include "av1/encoder/motion_search_facade.h"
36 #include "av1/encoder/nonrd_opt.h"
37 #include "av1/encoder/rdopt.h"
38 #include "av1/encoder/reconinter_enc.h"
39 #include "av1/encoder/var_based_part.h"
40 
41 #define CALC_BIASED_RDCOST(rdcost) (7 * (rdcost) >> 3)
42 extern int g_pick_inter_mode_cnt;
43 /*!\cond */
// Scratch buffer used to hold a candidate prediction while mode searching.
typedef struct {
  uint8_t *data;  // Pixel data of the candidate prediction.
  int stride;     // Row stride of `data`.
  int in_use;     // Non-zero while the buffer holds a live candidate.
} PRED_BUFFER;
49 
// Winner of the non-rd pick-mode loop: the best mode found so far together
// with the side information needed to reproduce its prediction.
typedef struct {
  PRED_BUFFER *best_pred;             // Prediction buffer of the best mode (may be NULL).
  PREDICTION_MODE best_mode;          // Best prediction mode found so far.
  TX_SIZE best_tx_size;               // Transform size of the best mode.
  TX_TYPE tx_type;                    // Transform type of the best mode.
  MV_REFERENCE_FRAME best_ref_frame;  // First reference frame of the best mode.
  MV_REFERENCE_FRAME best_second_ref_frame;  // Second ref for compound, else NONE_FRAME.
  uint8_t best_mode_skip_txfm;        // Non-zero if the best mode skips txfm.
  uint8_t best_mode_initial_skip_flag;  // Skip flag seen when the mode was first tried.
  int_interpfilters best_pred_filter;   // Interpolation filters of the best mode.
  MOTION_MODE best_motion_mode;       // Motion mode (SIMPLE_TRANSLATION by default).
  WarpedMotionParams wm_params;       // Warped-motion parameters, when applicable.
  int num_proj_ref;                   // Number of projection references for warp.
  uint8_t blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE / 4];  // Per-sub-block skip flags.
  PALETTE_MODE_INFO pmi;              // Palette info of the best mode.
  int64_t best_sse;                   // SSE of the best mode (INT64_MAX when unset).
} BEST_PICKMODE;
67 
// A single-reference candidate: (reference frame, prediction mode) pair.
typedef struct {
  MV_REFERENCE_FRAME ref_frame;
  PREDICTION_MODE pred_mode;
} REF_MODE;
72 
// A compound-reference candidate: two reference frames plus the mode.
typedef struct {
  MV_REFERENCE_FRAME ref_frame[2];
  PREDICTION_MODE pred_mode;
} COMP_REF_MODE;
77 
// A horizontal/vertical interpolation filter pair to evaluate.
typedef struct {
  InterpFilter filter_x;
  InterpFilter filter_y;
} INTER_FILTER;
82 
83 /*!\brief Structure to store parameters and statistics used in non-rd inter mode
84  * evaluation.
85  */
typedef struct {
  BEST_PICKMODE best_pickmode;  // Best mode found so far and its side info.
  RD_STATS this_rdc;            // RD stats of the mode currently being tested.
  RD_STATS best_rdc;            // RD stats of the best mode so far.
  // Per-(mode, ref) chroma distortion — presumably UV SSE; confirm at use site.
  int64_t uv_dist[RTC_INTER_MODES][REF_FRAMES];
  struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE];  // Per-ref plane buffers.
  unsigned int vars[RTC_INTER_MODES][REF_FRAMES];   // Per-(mode, ref) variance.
  unsigned int ref_costs_single[REF_FRAMES];  // Bit cost of each single ref.
  int_mv frame_mv[MB_MODE_COUNT][REF_FRAMES];       // Candidate MVs per mode/ref.
  int_mv frame_mv_best[MB_MODE_COUNT][REF_FRAMES];  // Best MVs per mode/ref.
  int single_inter_mode_costs[RTC_INTER_MODES][REF_FRAMES];  // Mode bit costs.
  int use_ref_frame_mask[REF_FRAMES];  // Which reference frames may be searched.
  uint8_t mode_checked[MB_MODE_COUNT][REF_FRAMES];  // Set once mode/ref is tried.
} InterModeSearchStateNonrd;
100 /*!\endcond */
101 
102 #define NUM_COMP_INTER_MODES_RT (6)
103 #define NUM_INTER_MODES 12
104 
// Single-reference (ref, mode) pairs evaluated by the non-rd mode search.
// GLOBALMV in the set below is in fact ZEROMV as we don't do global ME in RT
// mode
static const REF_MODE ref_mode_set[NUM_INTER_MODES] = {
  { LAST_FRAME, NEARESTMV },   { LAST_FRAME, NEARMV },
  { LAST_FRAME, GLOBALMV },    { LAST_FRAME, NEWMV },
  { GOLDEN_FRAME, NEARESTMV }, { GOLDEN_FRAME, NEARMV },
  { GOLDEN_FRAME, GLOBALMV },  { GOLDEN_FRAME, NEWMV },
  { ALTREF_FRAME, NEARESTMV }, { ALTREF_FRAME, NEARMV },
  { ALTREF_FRAME, GLOBALMV },  { ALTREF_FRAME, NEWMV },
};
115 
// Compound-reference candidates evaluated in real-time mode.
static const COMP_REF_MODE comp_ref_mode_set[NUM_COMP_INTER_MODES_RT] = {
  { { LAST_FRAME, GOLDEN_FRAME }, GLOBAL_GLOBALMV },
  { { LAST_FRAME, GOLDEN_FRAME }, NEAREST_NEARESTMV },
  { { LAST_FRAME, LAST2_FRAME }, GLOBAL_GLOBALMV },
  { { LAST_FRAME, LAST2_FRAME }, NEAREST_NEARESTMV },
  { { LAST_FRAME, ALTREF_FRAME }, GLOBAL_GLOBALMV },
  { { LAST_FRAME, ALTREF_FRAME }, NEAREST_NEARESTMV },
};
124 
// (x, y) interpolation filter combinations tried during filter search.
static const INTER_FILTER filters_ref_set[9] = {
  { EIGHTTAP_REGULAR, EIGHTTAP_REGULAR }, { EIGHTTAP_SMOOTH, EIGHTTAP_SMOOTH },
  { EIGHTTAP_REGULAR, EIGHTTAP_SMOOTH },  { EIGHTTAP_SMOOTH, EIGHTTAP_REGULAR },
  { MULTITAP_SHARP, MULTITAP_SHARP },     { EIGHTTAP_REGULAR, MULTITAP_SHARP },
  { MULTITAP_SHARP, EIGHTTAP_REGULAR },   { EIGHTTAP_SMOOTH, MULTITAP_SHARP },
  { MULTITAP_SHARP, EIGHTTAP_SMOOTH }
};
132 
// Bit masks selecting which inter modes may be evaluated for a reference.
enum {
  //  INTER_ALL = (1 << NEARESTMV) | (1 << NEARMV) | (1 << NEWMV),
  INTER_NEAREST = (1 << NEARESTMV),
  INTER_NEAREST_NEW = (1 << NEARESTMV) | (1 << NEWMV),
  INTER_NEAREST_NEAR = (1 << NEARESTMV) | (1 << NEARMV),
  INTER_NEAR_NEW = (1 << NEARMV) | (1 << NEWMV),
};
140 
// The original scan order (default_scan_8x8) is modified according to the
// extra transpose in the hadamard C implementation, i.e., aom_hadamard_lp_8x8_c
// and aom_hadamard_8x8_c.
DECLARE_ALIGNED(16, static const int16_t, default_scan_8x8_transpose[64]) = {
  0,  8,  1,  2,  9,  16, 24, 17, 10, 3,  4,  11, 18, 25, 32, 40,
  33, 26, 19, 12, 5,  6,  13, 20, 27, 34, 41, 48, 56, 49, 42, 35,
  28, 21, 14, 7,  15, 22, 29, 36, 43, 50, 57, 58, 51, 44, 37, 30,
  23, 31, 38, 45, 52, 59, 60, 53, 46, 39, 47, 54, 61, 62, 55, 63
};
150 
// The original scan order (av1_default_iscan_8x8) is modified to match the
// hadamard AVX2 implementation, i.e., aom_hadamard_lp_8x8_avx2 and
// aom_hadamard_8x8_avx2. Since the hadamard AVX2 implementation will modify
// the order of coefficients, such that the normal scan order is no longer
// guaranteed to scan low coefficients first, we modify the scan order
// accordingly.
// Note that this one has to be used together with default_scan_8x8_transpose.
DECLARE_ALIGNED(16, static const int16_t,
                av1_default_iscan_8x8_transpose[64]) = {
  0,  2,  3,  9,  10, 20, 21, 35, 1,  4,  8,  11, 19, 22, 34, 36,
  5,  7,  12, 18, 23, 33, 37, 48, 6,  13, 17, 24, 32, 38, 47, 49,
  14, 16, 25, 31, 39, 46, 50, 57, 15, 26, 30, 40, 45, 51, 56, 58,
  27, 29, 41, 44, 52, 55, 59, 62, 28, 42, 43, 53, 54, 60, 61, 63
};
165 
// The original scan order (default_scan_16x16) is modified according to the
// extra transpose in the hadamard C implementation in the lp case, i.e.,
// aom_hadamard_lp_16x16_c.
DECLARE_ALIGNED(16, static const int16_t,
                default_scan_lp_16x16_transpose[256]) = {
  0,   8,   2,   4,   10,  16,  24,  18,  12,  6,   64,  14,  20,  26,  32,
  40,  34,  28,  22,  72,  66,  68,  74,  80,  30,  36,  42,  48,  56,  50,
  44,  38,  88,  82,  76,  70,  128, 78,  84,  90,  96,  46,  52,  58,  1,
  9,   3,   60,  54,  104, 98,  92,  86,  136, 130, 132, 138, 144, 94,  100,
  106, 112, 62,  5,   11,  17,  25,  19,  13,  7,   120, 114, 108, 102, 152,
  146, 140, 134, 192, 142, 148, 154, 160, 110, 116, 122, 65,  15,  21,  27,
  33,  41,  35,  29,  23,  73,  67,  124, 118, 168, 162, 156, 150, 200, 194,
  196, 202, 208, 158, 164, 170, 176, 126, 69,  75,  81,  31,  37,  43,  49,
  57,  51,  45,  39,  89,  83,  77,  71,  184, 178, 172, 166, 216, 210, 204,
  198, 206, 212, 218, 224, 174, 180, 186, 129, 79,  85,  91,  97,  47,  53,
  59,  61,  55,  105, 99,  93,  87,  137, 131, 188, 182, 232, 226, 220, 214,
  222, 228, 234, 240, 190, 133, 139, 145, 95,  101, 107, 113, 63,  121, 115,
  109, 103, 153, 147, 141, 135, 248, 242, 236, 230, 238, 244, 250, 193, 143,
  149, 155, 161, 111, 117, 123, 125, 119, 169, 163, 157, 151, 201, 195, 252,
  246, 254, 197, 203, 209, 159, 165, 171, 177, 127, 185, 179, 173, 167, 217,
  211, 205, 199, 207, 213, 219, 225, 175, 181, 187, 189, 183, 233, 227, 221,
  215, 223, 229, 235, 241, 191, 249, 243, 237, 231, 239, 245, 251, 253, 247,
  255
};
190 
191 #if CONFIG_AV1_HIGHBITDEPTH
// The original scan order (default_scan_16x16) is modified according to the
// extra shift in the hadamard C implementation in the fp case, i.e.,
// aom_hadamard_16x16_c. Note that 16x16 lp and fp hadamard generate different
// outputs, so we handle them separately.
DECLARE_ALIGNED(16, static const int16_t,
                default_scan_fp_16x16_transpose[256]) = {
  0,   4,   2,   8,   6,   16,  20,  18,  12,  10,  64,  14,  24,  22,  32,
  36,  34,  28,  26,  68,  66,  72,  70,  80,  30,  40,  38,  48,  52,  50,
  44,  42,  84,  82,  76,  74,  128, 78,  88,  86,  96,  46,  56,  54,  1,
  5,   3,   60,  58,  100, 98,  92,  90,  132, 130, 136, 134, 144, 94,  104,
  102, 112, 62,  9,   7,   17,  21,  19,  13,  11,  116, 114, 108, 106, 148,
  146, 140, 138, 192, 142, 152, 150, 160, 110, 120, 118, 65,  15,  25,  23,
  33,  37,  35,  29,  27,  69,  67,  124, 122, 164, 162, 156, 154, 196, 194,
  200, 198, 208, 158, 168, 166, 176, 126, 73,  71,  81,  31,  41,  39,  49,
  53,  51,  45,  43,  85,  83,  77,  75,  180, 178, 172, 170, 212, 210, 204,
  202, 206, 216, 214, 224, 174, 184, 182, 129, 79,  89,  87,  97,  47,  57,
  55,  61,  59,  101, 99,  93,  91,  133, 131, 188, 186, 228, 226, 220, 218,
  222, 232, 230, 240, 190, 137, 135, 145, 95,  105, 103, 113, 63,  117, 115,
  109, 107, 149, 147, 141, 139, 244, 242, 236, 234, 238, 248, 246, 193, 143,
  153, 151, 161, 111, 121, 119, 125, 123, 165, 163, 157, 155, 197, 195, 252,
  250, 254, 201, 199, 209, 159, 169, 167, 177, 127, 181, 179, 173, 171, 213,
  211, 205, 203, 207, 217, 215, 225, 175, 185, 183, 189, 187, 229, 227, 221,
  219, 223, 233, 231, 241, 191, 245, 243, 237, 235, 239, 249, 247, 253, 251,
  255
};
217 #endif
218 
// The original scan order (av1_default_iscan_16x16) is modified to match the
// hadamard AVX2 implementation, i.e., aom_hadamard_lp_16x16_avx2.
// Since the hadamard AVX2 implementation will modify the order of
// coefficients, such that the normal scan order is no longer guaranteed to
// scan low coefficients first, we modify the scan order accordingly. Note that
// this one has to be used together with default_scan_lp_16x16_transpose.
DECLARE_ALIGNED(16, static const int16_t,
                av1_default_iscan_lp_16x16_transpose[256]) = {
  0,   44,  2,   46,  3,   63,  9,   69,  1,   45,  4,   64,  8,   68,  11,
  87,  5,   65,  7,   67,  12,  88,  18,  94,  6,   66,  13,  89,  17,  93,
  24,  116, 14,  90,  16,  92,  25,  117, 31,  123, 15,  91,  26,  118, 30,
  122, 41,  148, 27,  119, 29,  121, 42,  149, 48,  152, 28,  120, 43,  150,
  47,  151, 62,  177, 10,  86,  20,  96,  21,  113, 35,  127, 19,  95,  22,
  114, 34,  126, 37,  144, 23,  115, 33,  125, 38,  145, 52,  156, 32,  124,
  39,  146, 51,  155, 58,  173, 40,  147, 50,  154, 59,  174, 73,  181, 49,
  153, 60,  175, 72,  180, 83,  198, 61,  176, 71,  179, 84,  199, 98,  202,
  70,  178, 85,  200, 97,  201, 112, 219, 36,  143, 54,  158, 55,  170, 77,
  185, 53,  157, 56,  171, 76,  184, 79,  194, 57,  172, 75,  183, 80,  195,
  102, 206, 74,  182, 81,  196, 101, 205, 108, 215, 82,  197, 100, 204, 109,
  216, 131, 223, 99,  203, 110, 217, 130, 222, 140, 232, 111, 218, 129, 221,
  141, 233, 160, 236, 128, 220, 142, 234, 159, 235, 169, 245, 78,  193, 104,
  208, 105, 212, 135, 227, 103, 207, 106, 213, 134, 226, 136, 228, 107, 214,
  133, 225, 137, 229, 164, 240, 132, 224, 138, 230, 163, 239, 165, 241, 139,
  231, 162, 238, 166, 242, 189, 249, 161, 237, 167, 243, 188, 248, 190, 250,
  168, 244, 187, 247, 191, 251, 210, 254, 186, 246, 192, 252, 209, 253, 211,
  255
};
246 
247 #if CONFIG_AV1_HIGHBITDEPTH
// The original scan order (av1_default_iscan_16x16) is modified to match the
// hadamard AVX2 implementation, i.e., aom_hadamard_16x16_avx2.
// Since the hadamard AVX2 implementation will modify the order of
// coefficients, such that the normal scan order is no longer guaranteed to
// scan low coefficients first, we modify the scan order accordingly. Note that
// this one has to be used together with default_scan_fp_16x16_transpose.
DECLARE_ALIGNED(16, static const int16_t,
                av1_default_iscan_fp_16x16_transpose[256]) = {
  0,   44,  2,   46,  1,   45,  4,   64,  3,   63,  9,   69,  8,   68,  11,
  87,  5,   65,  7,   67,  6,   66,  13,  89,  12,  88,  18,  94,  17,  93,
  24,  116, 14,  90,  16,  92,  15,  91,  26,  118, 25,  117, 31,  123, 30,
  122, 41,  148, 27,  119, 29,  121, 28,  120, 43,  150, 42,  149, 48,  152,
  47,  151, 62,  177, 10,  86,  20,  96,  19,  95,  22,  114, 21,  113, 35,
  127, 34,  126, 37,  144, 23,  115, 33,  125, 32,  124, 39,  146, 38,  145,
  52,  156, 51,  155, 58,  173, 40,  147, 50,  154, 49,  153, 60,  175, 59,
  174, 73,  181, 72,  180, 83,  198, 61,  176, 71,  179, 70,  178, 85,  200,
  84,  199, 98,  202, 97,  201, 112, 219, 36,  143, 54,  158, 53,  157, 56,
  171, 55,  170, 77,  185, 76,  184, 79,  194, 57,  172, 75,  183, 74,  182,
  81,  196, 80,  195, 102, 206, 101, 205, 108, 215, 82,  197, 100, 204, 99,
  203, 110, 217, 109, 216, 131, 223, 130, 222, 140, 232, 111, 218, 129, 221,
  128, 220, 142, 234, 141, 233, 160, 236, 159, 235, 169, 245, 78,  193, 104,
  208, 103, 207, 106, 213, 105, 212, 135, 227, 134, 226, 136, 228, 107, 214,
  133, 225, 132, 224, 138, 230, 137, 229, 164, 240, 163, 239, 165, 241, 139,
  231, 162, 238, 161, 237, 167, 243, 166, 242, 189, 249, 188, 248, 190, 250,
  168, 244, 187, 247, 186, 246, 192, 252, 191, 251, 210, 254, 209, 253, 211,
  255
};
275 #endif
276 
early_term_inter_search_with_sse(int early_term_idx,BLOCK_SIZE bsize,int64_t this_sse,int64_t best_sse,PREDICTION_MODE this_mode)277 static INLINE int early_term_inter_search_with_sse(int early_term_idx,
278                                                    BLOCK_SIZE bsize,
279                                                    int64_t this_sse,
280                                                    int64_t best_sse,
281                                                    PREDICTION_MODE this_mode) {
282   // Aggressiveness to terminate inter mode search early is adjusted based on
283   // speed and block size.
284   static const double early_term_thresh[4][4] = { { 0.65, 0.65, 0.65, 0.7 },
285                                                   { 0.6, 0.65, 0.85, 0.9 },
286                                                   { 0.5, 0.5, 0.55, 0.6 },
287                                                   { 0.6, 0.75, 0.85, 0.85 } };
288   static const double early_term_thresh_newmv_nearestmv[4] = { 0.3, 0.3, 0.3,
289                                                                0.3 };
290 
291   const int size_group = size_group_lookup[bsize];
292   assert(size_group < 4);
293   assert((early_term_idx > 0) && (early_term_idx < EARLY_TERM_INDICES));
294   const double threshold =
295       ((early_term_idx == EARLY_TERM_IDX_4) &&
296        (this_mode == NEWMV || this_mode == NEARESTMV))
297           ? early_term_thresh_newmv_nearestmv[size_group]
298           : early_term_thresh[early_term_idx - 1][size_group];
299 
300   // Terminate inter mode search early based on best sse so far.
301   if ((early_term_idx > 0) && (threshold * this_sse > best_sse)) {
302     return 1;
303   }
304   return 0;
305 }
306 
init_best_pickmode(BEST_PICKMODE * bp)307 static INLINE void init_best_pickmode(BEST_PICKMODE *bp) {
308   bp->best_sse = INT64_MAX;
309   bp->best_mode = NEARESTMV;
310   bp->best_ref_frame = LAST_FRAME;
311   bp->best_second_ref_frame = NONE_FRAME;
312   bp->best_tx_size = TX_8X8;
313   bp->tx_type = DCT_DCT;
314   bp->best_pred_filter = av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
315   bp->best_mode_skip_txfm = 0;
316   bp->best_mode_initial_skip_flag = 0;
317   bp->best_pred = NULL;
318   bp->best_motion_mode = SIMPLE_TRANSLATION;
319   bp->num_proj_ref = 0;
320   memset(&bp->wm_params, 0, sizeof(bp->wm_params));
321   memset(&bp->blk_skip, 0, sizeof(bp->blk_skip));
322   memset(&bp->pmi, 0, sizeof(bp->pmi));
323 }
324 
subpel_select(AV1_COMP * cpi,MACROBLOCK * x,BLOCK_SIZE bsize,int_mv * mv,MV ref_mv,FULLPEL_MV start_mv,bool fullpel_performed_well)325 static INLINE int subpel_select(AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
326                                 int_mv *mv, MV ref_mv, FULLPEL_MV start_mv,
327                                 bool fullpel_performed_well) {
328   const int frame_lowmotion = cpi->rc.avg_frame_low_motion;
329   // Reduce MV precision for higher int MV value & frame-level motion
330   if (cpi->sf.rt_sf.reduce_mv_pel_precision_highmotion >= 3) {
331     int mv_thresh = 4;
332     const int is_low_resoln =
333         (cpi->common.width * cpi->common.height <= 320 * 240);
334     mv_thresh = (bsize > BLOCK_32X32) ? 2 : (bsize > BLOCK_16X16) ? 4 : 6;
335     if (frame_lowmotion > 0 && frame_lowmotion < 40) mv_thresh = 12;
336     mv_thresh = (is_low_resoln) ? mv_thresh >> 1 : mv_thresh;
337     if (abs(mv->as_fullmv.row) >= mv_thresh ||
338         abs(mv->as_fullmv.col) >= mv_thresh)
339       return HALF_PEL;
340   } else if (cpi->sf.rt_sf.reduce_mv_pel_precision_highmotion >= 1) {
341     int mv_thresh;
342     const int th_vals[2][3] = { { 4, 8, 10 }, { 4, 6, 8 } };
343     const int th_idx = cpi->sf.rt_sf.reduce_mv_pel_precision_highmotion - 1;
344     assert(th_idx >= 0 && th_idx < 2);
345     if (frame_lowmotion > 0 && frame_lowmotion < 40)
346       mv_thresh = 12;
347     else
348       mv_thresh = (bsize >= BLOCK_32X32)   ? th_vals[th_idx][0]
349                   : (bsize >= BLOCK_16X16) ? th_vals[th_idx][1]
350                                            : th_vals[th_idx][2];
351     if (abs(mv->as_fullmv.row) >= (mv_thresh << 1) ||
352         abs(mv->as_fullmv.col) >= (mv_thresh << 1))
353       return FULL_PEL;
354     else if (abs(mv->as_fullmv.row) >= mv_thresh ||
355              abs(mv->as_fullmv.col) >= mv_thresh)
356       return HALF_PEL;
357   }
358   // Reduce MV precision for relatively static (e.g. background), low-complex
359   // large areas
360   if (cpi->sf.rt_sf.reduce_mv_pel_precision_lowcomplex >= 2) {
361     const int qband = x->qindex >> (QINDEX_BITS - 2);
362     assert(qband < 4);
363     if (x->content_state_sb.source_sad_nonrd <= kVeryLowSad &&
364         bsize > BLOCK_16X16 && qband != 0) {
365       if (x->source_variance < 500)
366         return FULL_PEL;
367       else if (x->source_variance < 5000)
368         return HALF_PEL;
369     }
370   } else if (cpi->sf.rt_sf.reduce_mv_pel_precision_lowcomplex >= 1) {
371     if (fullpel_performed_well && ref_mv.row == 0 && ref_mv.col == 0 &&
372         start_mv.row == 0 && start_mv.col == 0)
373       return HALF_PEL;
374   }
375   return cpi->sf.mv_sf.subpel_force_stop;
376 }
377 
// Returns true when a cheaper, more-pruned sub-pel search can be used for
// this block (adaptive sub-pel search enabled, non-lowest q band, and the
// block looks easy: good full-pel result, low sad, or low variance).
static bool use_aggressive_subpel_search_method(
    MACROBLOCK *x, bool use_adaptive_subpel_search,
    const bool fullpel_performed_well) {
  if (!use_adaptive_subpel_search) return false;
  const int qband = x->qindex >> (QINDEX_BITS - 2);
  assert(qband < 4);
  const bool block_is_easy =
      fullpel_performed_well ||
      x->content_state_sb.source_sad_nonrd <= kLowSad ||
      x->source_variance < 100;
  return (qband > 0) && block_is_easy;
}
390 
391 /*!\brief Runs Motion Estimation for a specific block and specific ref frame.
392  *
393  * \ingroup nonrd_mode_search
394  * \callgraph
395  * \callergraph
396  * Finds the best Motion Vector by running Motion Estimation for a specific
397  * block and a specific reference frame. Exits early if RDCost of Full Pel part
398  * exceeds best RD Cost found so far
399  * \param[in]    cpi                      Top-level encoder structure
400  * \param[in]    x                        Pointer to structure holding all the
401  *                                        data for the current macroblock
402  * \param[in]    bsize                    Current block size
403  * \param[in]    mi_row                   Row index in 4x4 units
404  * \param[in]    mi_col                   Column index in 4x4 units
405  * \param[in]    tmp_mv                   Pointer to best found New MV
406  * \param[in]    rate_mv                  Pointer to Rate of the best new MV
407  * \param[in]    best_rd_sofar            RD Cost of the best mode found so far
408  * \param[in]    use_base_mv              Flag, indicating that tmp_mv holds
409  *                                        specific MV to start the search with
410  *
411  * \return Returns 0 if ME was terminated after Full Pel Search because too
412  * high RD Cost. Otherwise returns 1. Best New MV is placed into \c tmp_mv.
413  * Rate estimation for this vector is placed to \c rate_mv
414  */
static int combined_motion_search(AV1_COMP *cpi, MACROBLOCK *x,
                                  BLOCK_SIZE bsize, int mi_row, int mi_col,
                                  int_mv *tmp_mv, int *rate_mv,
                                  int64_t best_rd_sofar, int use_base_mv) {
  MACROBLOCKD *xd = &x->e_mbd;
  const AV1_COMMON *cm = &cpi->common;
  const int num_planes = av1_num_planes(cm);
  const SPEED_FEATURES *sf = &cpi->sf;
  MB_MODE_INFO *mi = xd->mi[0];
  struct buf_2d backup_yv12[MAX_MB_PLANE] = { { 0, 0, 0, 0, 0 } };
  // Speed-feature override for the full-pel search step size, if non-zero.
  int step_param = (sf->rt_sf.fullpel_search_step_param)
                       ? sf->rt_sf.fullpel_search_step_param
                       : cpi->mv_search_params.mv_step_param;
  FULLPEL_MV start_mv;
  const int ref = mi->ref_frame[0];
  const MV ref_mv = av1_get_ref_mv(x, mi->ref_mv_idx).as_mv;
  MV center_mv;
  int dis;
  int rv = 0;
  int cost_list[5];
  int search_subpel = 1;
  const YV12_BUFFER_CONFIG *scaled_ref_frame =
      av1_get_scaled_ref_frame(cpi, ref);

  if (scaled_ref_frame) {
    int i;
    // Swap out the reference frame for a version that's been scaled to
    // match the resolution of the current frame, allowing the existing
    // motion search code to be used without additional modifications.
    for (i = 0; i < MAX_MB_PLANE; i++) backup_yv12[i] = xd->plane[i].pre[0];
    av1_setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL,
                         num_planes);
  }

  start_mv = get_fullmv_from_mv(&ref_mv);

  // Center the full-pel search on the reference MV, or on the MV supplied in
  // tmp_mv when use_base_mv is set.
  if (!use_base_mv)
    center_mv = ref_mv;
  else
    center_mv = tmp_mv->as_mv;

  const SEARCH_METHODS search_method = sf->mv_sf.search_method;
  const search_site_config *src_search_sites =
      av1_get_search_site_config(cpi, x, search_method);
  FULLPEL_MOTION_SEARCH_PARAMS full_ms_params;
  av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize, &center_mv,
                                     src_search_sites,
                                     /*fine_search_interval=*/0);

  const unsigned int full_var_rd = av1_full_pixel_search(
      start_mv, &full_ms_params, step_param, cond_cost_list(cpi, cost_list),
      &tmp_mv->as_fullmv, NULL);

  // calculate the bit cost on motion vector
  MV mvp_full = get_mv_from_fullmv(&tmp_mv->as_fullmv);

  *rate_mv = av1_mv_bit_cost(&mvp_full, &ref_mv, x->mv_costs->nmv_joint_cost,
                             x->mv_costs->mv_cost_stack, MV_COST_WEIGHT);

  // TODO(kyslov) Account for Rate Mode!
  // Early exit from sub-pel refinement when the MV rate alone already
  // exceeds the best rd cost found so far.
  rv = !(RDCOST(x->rdmult, (*rate_mv), 0) > best_rd_sofar);

  if (rv && search_subpel) {
    SUBPEL_MOTION_SEARCH_PARAMS ms_params;
    av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize, &ref_mv,
                                      cost_list);
    // Empirical per-block-size thresholds on the full-pel search score used
    // to decide whether full-pel alone was already good enough.
    const bool fullpel_performed_well =
        (bsize == BLOCK_64X64 && full_var_rd * 40 < 62267 * 7) ||
        (bsize == BLOCK_32X32 && full_var_rd * 8 < 42380) ||
        (bsize == BLOCK_16X16 && full_var_rd * 8 < 10127);
    if (sf->rt_sf.reduce_mv_pel_precision_highmotion ||
        sf->rt_sf.reduce_mv_pel_precision_lowcomplex)
      ms_params.forced_stop = subpel_select(cpi, x, bsize, tmp_mv, ref_mv,
                                            start_mv, fullpel_performed_well);

    MV subpel_start_mv = get_mv_from_fullmv(&tmp_mv->as_fullmv);
    // adaptively downgrade subpel search method based on block properties
    if (use_aggressive_subpel_search_method(
            x, sf->rt_sf.use_adaptive_subpel_search, fullpel_performed_well))
      av1_find_best_sub_pixel_tree_pruned_more(xd, cm, &ms_params,
                                               subpel_start_mv, &tmp_mv->as_mv,
                                               &dis, &x->pred_sse[ref], NULL);
    else
      cpi->mv_search_params.find_fractional_mv_step(
          xd, cm, &ms_params, subpel_start_mv, &tmp_mv->as_mv, &dis,
          &x->pred_sse[ref], NULL);
    // Recompute the MV rate for the refined sub-pel MV.
    *rate_mv =
        av1_mv_bit_cost(&tmp_mv->as_mv, &ref_mv, x->mv_costs->nmv_joint_cost,
                        x->mv_costs->mv_cost_stack, MV_COST_WEIGHT);
  }

  if (scaled_ref_frame) {
    int i;
    // Restore the original (unscaled) prediction planes.
    for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_yv12[i];
  }
  // The final MV can not be equal to the reference MV as this will trigger an
  // assert later. This can happen if both NEAREST and NEAR modes were skipped.
  rv = (tmp_mv->as_mv.col != ref_mv.col || tmp_mv->as_mv.row != ref_mv.row);
  return rv;
}
515 
516 /*!\brief Searches for the best New Motion Vector.
517  *
518  * \ingroup nonrd_mode_search
519  * \callgraph
520  * \callergraph
521  * Finds the best Motion Vector by doing Motion Estimation. Uses reduced
522  * complexity ME for non-LAST frames or calls \c combined_motion_search
523  * for LAST reference frame
524  * \param[in]    cpi                      Top-level encoder structure
525  * \param[in]    x                        Pointer to structure holding all the
526  *                                        data for the current macroblock
527  * \param[in]    frame_mv                 Array that holds MVs for all modes
528  *                                        and ref frames
529  * \param[in]    ref_frame                Reference frame for which to find
530  *                                        the best New MVs
531  * \param[in]    gf_temporal_ref          Flag, indicating temporal reference
532  *                                        for GOLDEN frame
533  * \param[in]    bsize                    Current block size
534  * \param[in]    mi_row                   Row index in 4x4 units
535  * \param[in]    mi_col                   Column index in 4x4 units
536  * \param[in]    rate_mv                  Pointer to Rate of the best new MV
537  * \param[in]    best_rdc                 Pointer to the RD Cost for the best
538  *                                        mode found so far
539  *
540  * \return Returns -1 if the search was not done, otherwise returns 0.
541  * Best New MV is placed into \c frame_mv array, Rate estimation for this
542  * vector is placed to \c rate_mv
543  */
static int search_new_mv(AV1_COMP *cpi, MACROBLOCK *x,
                         int_mv frame_mv[][REF_FRAMES],
                         MV_REFERENCE_FRAME ref_frame, int gf_temporal_ref,
                         BLOCK_SIZE bsize, int mi_row, int mi_col, int *rate_mv,
                         RD_STATS *best_rdc) {
  MACROBLOCKD *const xd = &x->e_mbd;
  MB_MODE_INFO *const mi = xd->mi[0];
  AV1_COMMON *cm = &cpi->common;
  // For non-LAST references in CBR mode with a temporal GOLDEN reference, use
  // a cheap integer-projection full-pel estimate plus sub-pel refinement
  // instead of the full combined search.
  if (ref_frame > LAST_FRAME && cpi->oxcf.rc_cfg.mode == AOM_CBR &&
      gf_temporal_ref) {
    int tmp_sad;
    int dis;

    if (bsize < BLOCK_16X16) return -1;

    tmp_sad = av1_int_pro_motion_estimation(
        cpi, x, bsize, mi_row, mi_col,
        &x->mbmi_ext.ref_mv_stack[ref_frame][0].this_mv.as_mv);

    // Skip this reference if its estimate is no better than LAST's pred-MV
    // sad.
    if (tmp_sad > x->pred_mv_sad[LAST_FRAME]) return -1;

    frame_mv[NEWMV][ref_frame].as_int = mi->mv[0].as_int;
    int_mv best_mv = mi->mv[0];
    // Convert the 1/8-pel MV to full-pel units for the sub-pel search start.
    best_mv.as_mv.row >>= 3;
    best_mv.as_mv.col >>= 3;
    MV ref_mv = av1_get_ref_mv(x, 0).as_mv;
    frame_mv[NEWMV][ref_frame].as_mv.row >>= 3;
    frame_mv[NEWMV][ref_frame].as_mv.col >>= 3;

    SUBPEL_MOTION_SEARCH_PARAMS ms_params;
    av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize, &ref_mv, NULL);
    if (cpi->sf.rt_sf.reduce_mv_pel_precision_highmotion ||
        cpi->sf.rt_sf.reduce_mv_pel_precision_lowcomplex) {
      FULLPEL_MV start_mv = { .row = 0, .col = 0 };
      ms_params.forced_stop =
          subpel_select(cpi, x, bsize, &best_mv, ref_mv, start_mv, false);
    }
    MV start_mv = get_mv_from_fullmv(&best_mv.as_fullmv);
    cpi->mv_search_params.find_fractional_mv_step(
        xd, cm, &ms_params, start_mv, &best_mv.as_mv, &dis,
        &x->pred_sse[ref_frame], NULL);
    frame_mv[NEWMV][ref_frame].as_int = best_mv.as_int;

    // When NEWMV is same as ref_mv from the drl, it is preferred to code the
    // MV as NEARESTMV or NEARMV. In this case, NEWMV needs to be skipped to
    // avoid an assert failure at a later stage. The scenario can occur if
    // NEARESTMV was not evaluated for ALTREF.
    if (frame_mv[NEWMV][ref_frame].as_mv.col == ref_mv.col &&
        frame_mv[NEWMV][ref_frame].as_mv.row == ref_mv.row)
      return -1;

    *rate_mv = av1_mv_bit_cost(&frame_mv[NEWMV][ref_frame].as_mv, &ref_mv,
                               x->mv_costs->nmv_joint_cost,
                               x->mv_costs->mv_cost_stack, MV_COST_WEIGHT);
  } else if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col,
                                     &frame_mv[NEWMV][ref_frame], rate_mv,
                                     best_rdc->rdcost, 0)) {
    return -1;
  }

  return 0;
}
606 
// Fills ref_costs_single[] with the bit cost of signaling each single
// reference frame (and intra) for the current block context.
static void estimate_single_ref_frame_costs(const AV1_COMMON *cm,
                                            const MACROBLOCKD *xd,
                                            const ModeCosts *mode_costs,
                                            int segment_id, BLOCK_SIZE bsize,
                                            unsigned int *ref_costs_single) {
  // When the segment pins the reference frame, signaling it costs nothing.
  if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) {
    memset(ref_costs_single, 0, REF_FRAMES * sizeof(*ref_costs_single));
    return;
  }

  const int intra_inter_ctx = av1_get_intra_inter_context(xd);
  ref_costs_single[INTRA_FRAME] =
      mode_costs->intra_inter_cost[intra_inter_ctx][0];
  unsigned int base_cost = mode_costs->intra_inter_cost[intra_inter_ctx][1];
  if (cm->current_frame.reference_mode == REFERENCE_MODE_SELECT &&
      is_comp_ref_allowed(bsize)) {
    // Account for signaling "single reference" when compound is selectable.
    const int comp_ref_type_ctx = av1_get_comp_reference_type_context(xd);
    base_cost += mode_costs->comp_ref_type_cost[comp_ref_type_ctx][1];
  }
  // Per-frame cost = base + the single_ref decision bits for last/golden/alt.
  ref_costs_single[LAST_FRAME] =
      base_cost + mode_costs->single_ref_cost[0][0][0];
  ref_costs_single[GOLDEN_FRAME] = base_cost +
                                   mode_costs->single_ref_cost[0][0][1] +
                                   mode_costs->single_ref_cost[0][1][0];
  ref_costs_single[ALTREF_FRAME] = base_cost +
                                   mode_costs->single_ref_cost[0][0][1] +
                                   mode_costs->single_ref_cost[0][2][0];
}
637 
set_force_skip_flag(const AV1_COMP * const cpi,MACROBLOCK * const x,unsigned int sse,int * force_skip)638 static INLINE void set_force_skip_flag(const AV1_COMP *const cpi,
639                                        MACROBLOCK *const x, unsigned int sse,
640                                        int *force_skip) {
641   if (x->txfm_search_params.tx_mode_search_type == TX_MODE_SELECT &&
642       cpi->sf.rt_sf.tx_size_level_based_on_qstep &&
643       cpi->sf.rt_sf.tx_size_level_based_on_qstep >= 2) {
644     const int qstep = x->plane[0].dequant_QTX[1] >> (x->e_mbd.bd - 5);
645     const unsigned int qstep_sq = qstep * qstep;
646     // If the sse is low for low source variance blocks, mark those as
647     // transform skip.
648     // Note: Though qstep_sq is based on ac qstep, the threshold is kept
649     // low so that reliable early estimate of tx skip can be obtained
650     // through its comparison with sse.
651     if (sse < qstep_sq && x->source_variance < qstep_sq &&
652         x->color_sensitivity[0] == 0 && x->color_sensitivity[1] == 0)
653       *force_skip = 1;
654   }
655 }
656 
// For partitions larger than 32x32 (when the transform search is not
// restricted to 4x4 only), the transform size is capped.
#define CAP_TX_SIZE_FOR_BSIZE_GT32(tx_mode_search_type, bsize) \
  (((tx_mode_search_type) != ONLY_4X4 && (bsize) > BLOCK_32X32) ? true : false)
// Transform size used when the above cap applies.
#define TX_SIZE_FOR_BSIZE_GT32 (TX_16X16)
660 
// Selects the transform size for the block from the residual statistics.
// Under TX_MODE_SELECT, quantizer-based thresholds (speed feature
// tx_size_level_based_on_qstep) pick between the largest allowed size and
// TX_8X8, and may also set *force_skip when both sse and source variance
// fall below the squared ac quantizer step. The result is always capped at
// TX_16X16.
static TX_SIZE calculate_tx_size(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
                                 MACROBLOCK *const x, unsigned int var,
                                 unsigned int sse, int *force_skip) {
  MACROBLOCKD *const xd = &x->e_mbd;
  TX_SIZE tx_size;
  const TxfmSearchParams *txfm_params = &x->txfm_search_params;
  if (txfm_params->tx_mode_search_type == TX_MODE_SELECT) {
    int multiplier = 8;
    unsigned int var_thresh = 0;
    unsigned int is_high_var = 1;
    // Use quantizer based thresholds to determine transform size.
    if (cpi->sf.rt_sf.tx_size_level_based_on_qstep) {
      // Higher qindex bands use a smaller multiplier, favoring larger
      // transforms.
      const int qband = x->qindex >> (QINDEX_BITS - 2);
      const int mult[4] = { 8, 7, 6, 5 };
      assert(qband < 4);
      multiplier = mult[qband];
      // Effective ac quantizer step for the current bit depth.
      const int qstep = x->plane[0].dequant_QTX[1] >> (xd->bd - 5);
      const unsigned int qstep_sq = qstep * qstep;
      var_thresh = qstep_sq * 2;
      if (cpi->sf.rt_sf.tx_size_level_based_on_qstep >= 2) {
        // If the sse is low for low source variance blocks, mark those as
        // transform skip.
        // Note: Though qstep_sq is based on ac qstep, the threshold is kept
        // low so that reliable early estimate of tx skip can be obtained
        // through its comparison with sse.
        if (sse < qstep_sq && x->source_variance < qstep_sq &&
            x->color_sensitivity[0] == 0 && x->color_sensitivity[1] == 0)
          *force_skip = 1;
        // Further lower transform size based on aq mode only if residual
        // variance is high.
        is_high_var = (var >= var_thresh);
      }
    }
    // Choose larger transform size for blocks where dc component is dominant or
    // the ac component is low.
    if (sse > ((var * multiplier) >> 2) || (var < var_thresh))
      tx_size =
          AOMMIN(max_txsize_lookup[bsize],
                 tx_mode_to_biggest_tx_size[txfm_params->tx_mode_search_type]);
    else
      tx_size = TX_8X8;

    // Boosted cyclic-refresh segments keep a smaller transform when the
    // residual variance is high; otherwise cap at 16x16.
    if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ &&
        cyclic_refresh_segment_id_boosted(xd->mi[0]->segment_id) && is_high_var)
      tx_size = TX_8X8;
    else if (tx_size > TX_16X16)
      tx_size = TX_16X16;
  } else {
    tx_size =
        AOMMIN(max_txsize_lookup[bsize],
               tx_mode_to_biggest_tx_size[txfm_params->tx_mode_search_type]);
  }

  if (CAP_TX_SIZE_FOR_BSIZE_GT32(txfm_params->tx_mode_search_type, bsize))
    tx_size = TX_SIZE_FOR_BSIZE_GT32;

  return AOMMIN(tx_size, TX_16X16);
}
719 
// Log2 of block width/height measured in 4-sample units, indexed by
// BLOCK_SIZE. NOTE(review): assumed to follow the BLOCK_SIZE enum ordering
// for the first BLOCK_SIZES entries -- confirm against av1/common/enums.h.
static const uint8_t b_width_log2_lookup[BLOCK_SIZES] = { 0, 0, 1, 1, 1, 2,
                                                          2, 2, 3, 3, 3, 4,
                                                          4, 4, 5, 5 };
static const uint8_t b_height_log2_lookup[BLOCK_SIZES] = { 0, 1, 0, 1, 2, 1,
                                                           2, 3, 2, 3, 4, 3,
                                                           4, 5, 4, 5 };
726 
// Computes the whole-partition sse/sum while also recording per-8x8 sse,
// sum and variance, using the dsp helper that processes four 8x8 blocks
// (an 8x32 strip) per call.
static void block_variance(const uint8_t *src, int src_stride,
                           const uint8_t *ref, int ref_stride, int w, int h,
                           unsigned int *sse, int *sum, int block_size,
                           uint32_t *sse8x8, int *sum8x8, uint32_t *var8x8) {
  *sse = 0;
  *sum = 0;

  // aom_get_var_sse_sum_8x8_quad() consumes an 8x32 strip per call, so the
  // partition must be at least 32 samples wide and 8 samples tall.
  assert(w >= 32);
  assert(h >= 8);
  int idx = 0;
  for (int row = 0; row < h; row += block_size) {
    const uint8_t *const src_row = src + src_stride * row;
    const uint8_t *const ref_row = ref + ref_stride * row;
    for (int col = 0; col < w; col += 32, idx += 4) {
      aom_get_var_sse_sum_8x8_quad(src_row + col, src_stride, ref_row + col,
                                   ref_stride, &sse8x8[idx], &sum8x8[idx], sse,
                                   sum, &var8x8[idx]);
    }
  }
}
750 
// Computes the whole-partition sse/sum while also recording per-16x16 sse
// and variance, using the dsp helper that processes two 16x16 blocks
// (a 16x32 strip) per call.
static void block_variance_16x16_dual(const uint8_t *src, int src_stride,
                                      const uint8_t *ref, int ref_stride, int w,
                                      int h, unsigned int *sse, int *sum,
                                      int block_size, uint32_t *sse16x16,
                                      uint32_t *var16x16) {
  *sse = 0;
  *sum = 0;
  // aom_get_var_sse_sum_16x16_dual() consumes a 16x32 strip per call, so the
  // partition must be at least 32 samples wide and 16 samples tall.
  assert(w >= 32);
  assert(h >= 16);
  int idx = 0;
  for (int row = 0; row < h; row += block_size) {
    const uint8_t *const src_row = src + src_stride * row;
    const uint8_t *const ref_row = ref + ref_stride * row;
    for (int col = 0; col < w; col += 32, idx += 2) {
      aom_get_var_sse_sum_16x16_dual(src_row + col, src_stride, ref_row + col,
                                     ref_stride, &sse16x16[idx], sse, sum,
                                     &var16x16[idx]);
    }
  }
}
774 
// Aggregates sse/sum statistics from tx_size-sized units up one level by
// merging each 2x2 group of input units into a single output unit, and
// derives the variance of every merged unit.
static void calculate_variance(int bw, int bh, TX_SIZE tx_size,
                               unsigned int *sse_i, int *sum_i,
                               unsigned int *var_o, unsigned int *sse_o,
                               int *sum_o) {
  const BLOCK_SIZE unit_size = txsize_to_bsize[tx_size];
  const int nw = 1 << (bw - b_width_log2_lookup[unit_size]);
  const int nh = 1 << (bh - b_height_log2_lookup[unit_size]);
  // Pixel-count log2 of one merged (2x input) unit.
  const int unit_bits =
      b_width_log2_lookup[unit_size] + b_height_log2_lookup[unit_size] + 6;
  int out = 0;

  for (int row = 0; row < nh; row += 2) {
    for (int col = 0; col < nw; col += 2) {
      const int top = row * nw + col;
      const int bot = (row + 1) * nw + col;
      sse_o[out] = sse_i[top] + sse_i[top + 1] + sse_i[bot] + sse_i[bot + 1];
      sum_o[out] = sum_i[top] + sum_i[top + 1] + sum_i[bot] + sum_i[bot + 1];
      var_o[out] =
          sse_o[out] -
          (uint32_t)(((int64_t)sum_o[out] * sum_o[out]) >> unit_bits);
      out++;
    }
  }
}
797 
// Adjust the ac_thr according to speed, width, height and normalized sum
static int ac_thr_factor(const int speed, const int width, const int height,
                         const int norm_sum) {
  // Thresholds are only boosted for high speeds on low-motion content.
  if (speed < 8 || norm_sum >= 5) return 1;
  // Larger boost for resolutions up to 640x480.
  return (width <= 640 && height <= 480) ? 4 : 2;
}
809 
// Sets early_term flag based on chroma planes prediction
//
// First checks whether, for every luma transform block, all ac coefficients
// (var_tx[k] vs ac_thr) and the dc coefficient (sse_tx[k] - var_tx[k] vs
// dc_thr) would quantize to zero. If so, builds the inter predictor for each
// chroma plane flagged as color-sensitive and applies an analogous skip test
// there; *early_term is set only when both chroma planes pass.
static INLINE void set_early_term_based_on_uv_plane(
    AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, MACROBLOCKD *xd, int mi_row,
    int mi_col, int *early_term, int num_blk, const unsigned int *sse_tx,
    const unsigned int *var_tx, int sum, unsigned int var, unsigned int sse) {
  AV1_COMMON *const cm = &cpi->common;
  struct macroblock_plane *const p = &x->plane[0];
  const uint32_t dc_quant = p->dequant_QTX[0];
  const uint32_t ac_quant = p->dequant_QTX[1];
  // Skip thresholds derived from the squared quantizer steps.
  const int64_t dc_thr = dc_quant * dc_quant >> 6;
  int64_t ac_thr = ac_quant * ac_quant >> 6;
  const int bw = b_width_log2_lookup[bsize];
  const int bh = b_height_log2_lookup[bsize];
  int ac_test = 1;
  int dc_test = 1;
  // Residual sum normalized by the number of 4x4 units in the block.
  const int norm_sum = abs(sum) >> (bw + bh);

#if CONFIG_AV1_TEMPORAL_DENOISING
  // With active denoising, scale the ac skip threshold by the denoising
  // level; otherwise use the speed/resolution based factor.
  if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi) &&
      cpi->oxcf.speed > 5)
    ac_thr = av1_scale_acskip_thresh(ac_thr, cpi->denoiser.denoising_level,
                                     norm_sum, cpi->svc.temporal_layer_id);
  else
    ac_thr *= ac_thr_factor(cpi->oxcf.speed, cm->width, cm->height, norm_sum);
#else
  ac_thr *= ac_thr_factor(cpi->oxcf.speed, cm->width, cm->height, norm_sum);

#endif

  for (int k = 0; k < num_blk; k++) {
    // Check if all ac coefficients can be quantized to zero.
    if (!(var_tx[k] < ac_thr || var == 0)) {
      ac_test = 0;
      break;
    }
    // Check if dc coefficient can be quantized to zero.
    if (!(sse_tx[k] - var_tx[k] < dc_thr || sse == var)) {
      dc_test = 0;
      break;
    }
  }

  // Check if chroma can be skipped based on ac and dc test flags.
  if (ac_test && dc_test) {
    int skip_uv[2] = { 0 };
    unsigned int var_uv[2];
    unsigned int sse_uv[2];
    // Transform skipping test in UV planes.
    for (int i = 1; i <= 2; i++) {
      int j = i - 1;
      skip_uv[j] = 1;
      // Only color-sensitive planes need an explicit prediction and test;
      // other planes are treated as skippable.
      if (x->color_sensitivity[j]) {
        skip_uv[j] = 0;
        struct macroblock_plane *const puv = &x->plane[i];
        struct macroblockd_plane *const puvd = &xd->plane[i];
        const BLOCK_SIZE uv_bsize = get_plane_block_size(
            bsize, puvd->subsampling_x, puvd->subsampling_y);
        // Adjust these thresholds for UV.
        const int64_t uv_dc_thr =
            (puv->dequant_QTX[0] * puv->dequant_QTX[0]) >> 3;
        const int64_t uv_ac_thr =
            (puv->dequant_QTX[1] * puv->dequant_QTX[1]) >> 3;
        av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, i,
                                      i);
        var_uv[j] = cpi->ppi->fn_ptr[uv_bsize].vf(puv->src.buf, puv->src.stride,
                                                  puvd->dst.buf,
                                                  puvd->dst.stride, &sse_uv[j]);
        // Same ac/dc quantize-to-zero test as for luma above.
        if ((var_uv[j] < uv_ac_thr || var_uv[j] == 0) &&
            (sse_uv[j] - var_uv[j] < uv_dc_thr || sse_uv[j] == var_uv[j]))
          skip_uv[j] = 1;
        else
          break;
      }
    }
    // Skip only if both chroma planes passed (or were not color-sensitive).
    if (skip_uv[0] & skip_uv[1]) {
      *early_term = 1;
    }
  }
}
889 
calc_rate_dist_block_param(AV1_COMP * cpi,MACROBLOCK * x,RD_STATS * rd_stats,int calculate_rd,int * early_term,BLOCK_SIZE bsize,unsigned int sse)890 static INLINE void calc_rate_dist_block_param(AV1_COMP *cpi, MACROBLOCK *x,
891                                               RD_STATS *rd_stats,
892                                               int calculate_rd, int *early_term,
893                                               BLOCK_SIZE bsize,
894                                               unsigned int sse) {
895   if (calculate_rd) {
896     if (!*early_term) {
897       const int bw = block_size_wide[bsize];
898       const int bh = block_size_high[bsize];
899 
900       model_rd_with_curvfit(cpi, x, bsize, AOM_PLANE_Y, rd_stats->sse, bw * bh,
901                             &rd_stats->rate, &rd_stats->dist);
902     }
903 
904     if (*early_term) {
905       rd_stats->rate = 0;
906       rd_stats->dist = sse << 4;
907     }
908   }
909 }
910 
// Variant of model_skip_for_sb_y_large() for partitions larger than 32x32,
// where the transform size is fixed at 16x16: per-transform-block variances
// are gathered directly at 16x16 granularity, so the calculate_variance()
// aggregation step is not needed.
static void model_skip_for_sb_y_large_64(AV1_COMP *cpi, BLOCK_SIZE bsize,
                                         int mi_row, int mi_col, MACROBLOCK *x,
                                         MACROBLOCKD *xd, RD_STATS *rd_stats,
                                         int *early_term, int calculate_rd,
                                         int64_t best_sse,
                                         unsigned int *var_output,
                                         unsigned int var_prune_threshold) {
  // Note our transform coeffs are 8 times an orthogonal transform.
  // Hence quantizer step is also 8 times. To get effective quantizer
  // we need to divide by 8 before sending to modeling function.
  unsigned int sse;
  struct macroblock_plane *const p = &x->plane[0];
  struct macroblockd_plane *const pd = &xd->plane[0];
  int test_skip = 1;
  unsigned int var;
  int sum;
  const int bw = b_width_log2_lookup[bsize];
  const int bh = b_height_log2_lookup[bsize];
  unsigned int sse16x16[64] = { 0 };
  unsigned int var16x16[64] = { 0 };
  assert(xd->mi[0]->tx_size == TX_16X16);
  assert(bsize > BLOCK_32X32);

  // Calculate variance for whole partition, and also save 16x16 blocks'
  // variance to be used in following transform skipping test.
  block_variance_16x16_dual(p->src.buf, p->src.stride, pd->dst.buf,
                            pd->dst.stride, 4 << bw, 4 << bh, &sse, &sum, 16,
                            sse16x16, var16x16);

  var = sse - (unsigned int)(((int64_t)sum * sum) >> (bw + bh + 4));
  // Optionally report the variance and bail out when it exceeds the
  // caller-provided pruning threshold.
  if (var_output) {
    *var_output = var;
    if (*var_output > var_prune_threshold) {
      return;
    }
  }

  rd_stats->sse = sse;
  // Skipping test
  *early_term = 0;
  set_force_skip_flag(cpi, x, sse, early_term);
  // The code below for setting skip flag assumes transform size of at least
  // 8x8, so force this lower limit on transform.
  MB_MODE_INFO *const mi = xd->mi[0];
  // SSE-based early termination of the inter mode search (speed feature).
  if (!calculate_rd && cpi->sf.rt_sf.sse_early_term_inter_search &&
      early_term_inter_search_with_sse(
          cpi->sf.rt_sf.sse_early_term_inter_search, bsize, sse, best_sse,
          mi->mode))
    test_skip = 0;

  if (*early_term) test_skip = 0;

  // Evaluate if the partition block is a skippable block in Y plane.
  if (test_skip) {
    const unsigned int *sse_tx = sse16x16;
    const unsigned int *var_tx = var16x16;
    const unsigned int num_block = (1 << (bw + bh - 2)) >> 2;
    set_early_term_based_on_uv_plane(cpi, x, bsize, xd, mi_row, mi_col,
                                     early_term, num_block, sse_tx, var_tx, sum,
                                     var, sse);
  }
  calc_rate_dist_block_param(cpi, x, rd_stats, calculate_rd, early_term, bsize,
                             sse);
}
975 
// Estimates rate/distortion and decides whether the whole partition can be
// treated as skippable (all coefficients quantized to zero) without a full
// transform search. Gathers 8x8 variance statistics, picks a transform
// size, applies the quantizer-based skip tests in Y, and when those pass
// runs the chroma tests in set_early_term_based_on_uv_plane().
static void model_skip_for_sb_y_large(AV1_COMP *cpi, BLOCK_SIZE bsize,
                                      int mi_row, int mi_col, MACROBLOCK *x,
                                      MACROBLOCKD *xd, RD_STATS *rd_stats,
                                      int *early_term, int calculate_rd,
                                      int64_t best_sse,
                                      unsigned int *var_output,
                                      unsigned int var_prune_threshold) {
  // Forced zero-MV skip: nothing to model.
  if (x->force_zeromv_skip_for_blk) {
    *early_term = 1;
    rd_stats->rate = 0;
    rd_stats->dist = 0;
    rd_stats->sse = 0;
    return;
  }

  // For block sizes greater than 32x32, the transform size is always 16x16.
  // This function avoids calling calculate_variance() for tx_size 16x16 cases
  // by directly populating variance at tx_size level from
  // block_variance_16x16_dual() function.
  const TxfmSearchParams *txfm_params = &x->txfm_search_params;
  if (CAP_TX_SIZE_FOR_BSIZE_GT32(txfm_params->tx_mode_search_type, bsize)) {
    xd->mi[0]->tx_size = TX_SIZE_FOR_BSIZE_GT32;
    model_skip_for_sb_y_large_64(cpi, bsize, mi_row, mi_col, x, xd, rd_stats,
                                 early_term, calculate_rd, best_sse, var_output,
                                 var_prune_threshold);
    return;
  }

  // Note our transform coeffs are 8 times an orthogonal transform.
  // Hence quantizer step is also 8 times. To get effective quantizer
  // we need to divide by 8 before sending to modeling function.
  unsigned int sse;
  struct macroblock_plane *const p = &x->plane[0];
  struct macroblockd_plane *const pd = &xd->plane[0];
  int test_skip = 1;
  unsigned int var;
  int sum;

  const int bw = b_width_log2_lookup[bsize];
  const int bh = b_height_log2_lookup[bsize];
  unsigned int sse8x8[256] = { 0 };
  int sum8x8[256] = { 0 };
  unsigned int var8x8[256] = { 0 };
  TX_SIZE tx_size;

  // Calculate variance for whole partition, and also save 8x8 blocks' variance
  // to be used in following transform skipping test.
  block_variance(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride,
                 4 << bw, 4 << bh, &sse, &sum, 8, sse8x8, sum8x8, var8x8);
  var = sse - (unsigned int)(((int64_t)sum * sum) >> (bw + bh + 4));
  // Optionally report the variance and bail out when it exceeds the
  // caller-provided pruning threshold.
  if (var_output) {
    *var_output = var;
    if (*var_output > var_prune_threshold) {
      return;
    }
  }

  rd_stats->sse = sse;
  // Skipping test
  *early_term = 0;
  tx_size = calculate_tx_size(cpi, bsize, x, var, sse, early_term);
  assert(tx_size <= TX_16X16);
  // The code below for setting skip flag assumes transform size of at least
  // 8x8, so force this lower limit on transform.
  if (tx_size < TX_8X8) tx_size = TX_8X8;
  xd->mi[0]->tx_size = tx_size;

  MB_MODE_INFO *const mi = xd->mi[0];
  // SSE-based early termination of the inter mode search (speed feature).
  if (!calculate_rd && cpi->sf.rt_sf.sse_early_term_inter_search &&
      early_term_inter_search_with_sse(
          cpi->sf.rt_sf.sse_early_term_inter_search, bsize, sse, best_sse,
          mi->mode))
    test_skip = 0;

  if (*early_term) test_skip = 0;

  // Evaluate if the partition block is a skippable block in Y plane.
  if (test_skip) {
    unsigned int sse16x16[64] = { 0 };
    int sum16x16[64] = { 0 };
    unsigned int var16x16[64] = { 0 };
    const unsigned int *sse_tx = sse8x8;
    const unsigned int *var_tx = var8x8;
    unsigned int num_blks = 1 << (bw + bh - 2);

    // For 16x16 transforms, aggregate the 8x8 statistics up to 16x16.
    if (tx_size >= TX_16X16) {
      calculate_variance(bw, bh, TX_8X8, sse8x8, sum8x8, var16x16, sse16x16,
                         sum16x16);
      sse_tx = sse16x16;
      var_tx = var16x16;
      num_blks = num_blks >> 2;
    }
    set_early_term_based_on_uv_plane(cpi, x, bsize, xd, mi_row, mi_col,
                                     early_term, num_blks, sse_tx, var_tx, sum,
                                     var, sse);
  }
  calc_rate_dist_block_param(cpi, x, rd_stats, calculate_rd, early_term, bsize,
                             sse);
}
1075 
// Model-based rate/distortion estimation for the luma plane of the whole
// block: computes the prediction variance/sse, selects the transform size,
// and fills rd_stats via the curve-fit model unless a transform skip was
// forced. Also records pred_sse for the reference frame.
static void model_rd_for_sb_y(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
                              MACROBLOCK *x, MACROBLOCKD *xd,
                              RD_STATS *rd_stats, unsigned int *var_out,
                              int calculate_rd, int *early_term) {
  if (x->force_zeromv_skip_for_blk && early_term != NULL) {
    *early_term = 1;
    rd_stats->rate = 0;
    rd_stats->dist = 0;
    rd_stats->sse = 0;
    // NOTE(review): unlike model_skip_for_sb_y_large(), there is no early
    // return here, so the values set above are recomputed below -- confirm
    // this is intentional.
  }

  // Note our transform coeffs are 8 times an orthogonal transform.
  // Hence quantizer step is also 8 times. To get effective quantizer
  // we need to divide by 8 before sending to modeling function.
  const int ref = xd->mi[0]->ref_frame[0];

  assert(bsize < BLOCK_SIZES_ALL);

  struct macroblock_plane *const p = &x->plane[0];
  struct macroblockd_plane *const pd = &xd->plane[0];
  unsigned int sse;
  int rate;
  int64_t dist;

  unsigned int var = cpi->ppi->fn_ptr[bsize].vf(
      p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, &sse);
  int force_skip = 0;
  xd->mi[0]->tx_size = calculate_tx_size(cpi, bsize, x, var, sse, &force_skip);
  if (var_out) {
    *var_out = var;
  }

  // Intra blocks are always modeled; inter blocks only when no transform
  // skip has been forced.
  if (calculate_rd && (!force_skip || ref == INTRA_FRAME)) {
    const int bwide = block_size_wide[bsize];
    const int bhigh = block_size_high[bsize];
    model_rd_with_curvfit(cpi, x, bsize, AOM_PLANE_Y, sse, bwide * bhigh, &rate,
                          &dist);
  } else {
    rate = INT_MAX;  // this will be overwritten later with block_yrd
    dist = INT_MAX;
  }
  rd_stats->sse = sse;
  x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX);

  // Forced-skip inter blocks get zero rate and distortion equal to the
  // scaled sse.
  if (force_skip && ref > INTRA_FRAME) {
    rate = 0;
    dist = (int64_t)sse << 4;
  }

  assert(rate >= 0);

  // Zero rate marks the block as a transform skip for later stages.
  rd_stats->skip_txfm = (rate == 0);
  rate = AOMMIN(rate, INT_MAX);
  rd_stats->rate = rate;
  rd_stats->dist = dist;
}
1132 
aom_process_hadamard_lp_8x16(MACROBLOCK * x,int max_blocks_high,int max_blocks_wide,int num_4x4_w,int step,int block_step)1133 static INLINE void aom_process_hadamard_lp_8x16(MACROBLOCK *x,
1134                                                 int max_blocks_high,
1135                                                 int max_blocks_wide,
1136                                                 int num_4x4_w, int step,
1137                                                 int block_step) {
1138   struct macroblock_plane *const p = &x->plane[0];
1139   const int bw = 4 * num_4x4_w;
1140   const int num_4x4 = AOMMIN(num_4x4_w, max_blocks_wide);
1141   int block = 0;
1142 
1143   for (int r = 0; r < max_blocks_high; r += block_step) {
1144     for (int c = 0; c < num_4x4; c += 2 * block_step) {
1145       const int16_t *src_diff = &p->src_diff[(r * bw + c) << 2];
1146       int16_t *low_coeff = (int16_t *)p->coeff + BLOCK_OFFSET(block);
1147       aom_hadamard_lp_8x8_dual(src_diff, (ptrdiff_t)bw, low_coeff);
1148       block += 2 * step;
1149     }
1150   }
1151 }
1152 
// Aligned scratch coefficient buffers; 16x16 is the largest transform
// handled by block_yrd().
#define DECLARE_BLOCK_YRD_BUFFERS()                      \
  DECLARE_ALIGNED(64, tran_low_t, dqcoeff_buf[16 * 16]); \
  DECLARE_ALIGNED(64, tran_low_t, qcoeff_buf[16 * 16]);  \
  DECLARE_ALIGNED(64, tran_low_t, coeff_buf[16 * 16]);   \
  uint16_t eob[1];

// Low-bitdepth (16-bit) views of the scratch buffers, plus the DCT scan
// order and source-diff stride used by the transform loop.
#define DECLARE_BLOCK_YRD_VARS()                                           \
  /* When is_tx_8x8_dual_applicable is true, we compute the txfm for the   \
   * entire bsize and write macroblock_plane::coeff. So low_coeff is kept  \
   * as a non-const so we can reassign it to macroblock_plane::coeff. */   \
  int16_t *low_coeff = (int16_t *)coeff_buf;                               \
  int16_t *const low_qcoeff = (int16_t *)qcoeff_buf;                       \
  int16_t *const low_dqcoeff = (int16_t *)dqcoeff_buf;                     \
  const SCAN_ORDER *const scan_order = &av1_scan_orders[tx_size][DCT_DCT]; \
  const int diff_stride = bw;

// Source-diff pointer for the transform block at (r, c), in 4x4 units.
#define DECLARE_LOOP_VARS_BLOCK_YRD() \
  const int16_t *src_diff = &p->src_diff[(r * diff_stride + c) << 2];
1171 
#if CONFIG_AV1_HIGHBITDEPTH
// High-bitdepth (32-bit tran_low_t) views of the scratch buffers.
#define DECLARE_BLOCK_YRD_HBD_VARS()     \
  tran_low_t *const coeff = coeff_buf;   \
  tran_low_t *const qcoeff = qcoeff_buf; \
  tran_low_t *const dqcoeff = dqcoeff_buf;

// Accumulates rate/distortion and skip information for one high-bitdepth
// transform block: rate is approximated by the SATD of the quantized
// coefficients, distortion by the block error between original and
// dequantized coefficients (scaled by >> 2).
static AOM_FORCE_INLINE void update_yrd_loop_vars_hbd(
    MACROBLOCK *x, int *skippable, const int step, const int ncoeffs,
    tran_low_t *const coeff, tran_low_t *const qcoeff,
    tran_low_t *const dqcoeff, RD_STATS *this_rdc, int *eob_cost,
    const int tx_blk_id) {
  // The block stays skippable only while every transform block has no
  // non-zero quantized coefficients.
  const int is_txfm_skip = (ncoeffs == 0);
  *skippable &= is_txfm_skip;
  x->txfm_search_info.blk_skip[tx_blk_id] = is_txfm_skip;
  *eob_cost += get_msb(ncoeffs + 1);

  int64_t dummy;
  if (ncoeffs == 1)
    this_rdc->rate += (int)abs(qcoeff[0]);
  else if (ncoeffs > 1)
    this_rdc->rate += aom_satd(qcoeff, step << 4);

  this_rdc->dist += av1_block_error(coeff, dqcoeff, step << 4, &dummy) >> 2;
}
#endif
update_yrd_loop_vars(MACROBLOCK * x,int * skippable,const int step,const int ncoeffs,int16_t * const low_coeff,int16_t * const low_qcoeff,int16_t * const low_dqcoeff,RD_STATS * this_rdc,int * eob_cost,const int tx_blk_id)1197 static AOM_FORCE_INLINE void update_yrd_loop_vars(
1198     MACROBLOCK *x, int *skippable, const int step, const int ncoeffs,
1199     int16_t *const low_coeff, int16_t *const low_qcoeff,
1200     int16_t *const low_dqcoeff, RD_STATS *this_rdc, int *eob_cost,
1201     const int tx_blk_id) {
1202   const int is_txfm_skip = (ncoeffs == 0);
1203   *skippable &= is_txfm_skip;
1204   x->txfm_search_info.blk_skip[tx_blk_id] = is_txfm_skip;
1205   *eob_cost += get_msb(ncoeffs + 1);
1206   if (ncoeffs == 1)
1207     this_rdc->rate += (int)abs(low_qcoeff[0]);
1208   else if (ncoeffs > 1)
1209     this_rdc->rate += aom_satd_lp(low_qcoeff, step << 4);
1210 
1211   this_rdc->dist += av1_block_error_lp(low_coeff, low_dqcoeff, step << 4) >> 2;
1212 }
1213 
1214 /*!\brief Calculates RD Cost using Hadamard transform.
1215  *
1216  * \ingroup nonrd_mode_search
1217  * \callgraph
1218  * \callergraph
1219  * Calculates RD Cost using Hadamard transform. For low bit depth this function
1220  * uses low-precision set of functions (16-bit) and 32 bit for high bit depth
1221  * \param[in]    x              Pointer to structure holding all the data for
1222                                 the current macroblock
 * \param[out]   this_rdc       Pointer to calculated RD Cost
 * \param[out]   skippable      Pointer to a flag indicating possible tx skip
1225  * \param[in]    bsize          Current block size
1226  * \param[in]    tx_size        Transform size
1227  * \param[in]    is_inter_mode  Flag to indicate inter mode
1228  *
1229  * \remark Nothing is returned. Instead, calculated RD cost is placed to
1230  * \c this_rdc. \c skippable flag is set if there is no non-zero quantized
1231  * coefficients for Hadamard transform
1232  */
/*!\brief Calculates RD Cost for the luma plane using Hadamard transform.
 *
 * Computes the residual for the whole block, then transforms and quantizes
 * each TX block, accumulating the rate/distortion estimate and end-of-block
 * cost into \c this_rdc. Both low and high bitdepth paths are supported.
 *
 * \param[in]    x              Pointer to structure holding all the data for
 *                              the current macroblock
 * \param[out]   this_rdc       Pointer to calculated RD Cost
 * \param[out]   skippable      Pointer to a flag indicating possible tx skip
 * \param[in]    bsize          Current block size
 * \param[in]    tx_size        Transform size
 * \param[in]    is_inter_mode  Flag indicating inter mode evaluation
 *
 * \remark Nothing is returned. Instead, calculated RD cost is placed to
 * \c this_rdc. \c skippable flag is set if all coefficients are zero.
 */
static void block_yrd(MACROBLOCK *x, RD_STATS *this_rdc, int *skippable,
                      const BLOCK_SIZE bsize, const TX_SIZE tx_size,
                      const int is_inter_mode) {
  MACROBLOCKD *xd = &x->e_mbd;
  const struct macroblockd_plane *pd = &xd->plane[0];
  struct macroblock_plane *const p = &x->plane[0];
  assert(bsize < BLOCK_SIZES_ALL);
  const int num_4x4_w = mi_size_wide[bsize];
  const int num_4x4_h = mi_size_high[bsize];
  const int step = 1 << (tx_size << 1);
  const int block_step = (1 << tx_size);
  const int row_step = step * num_4x4_w >> tx_size;
  int block = 0;
  // Clamp the loop bounds so TX blocks that fall outside the visible frame
  // (negative mb_to_*_edge) are skipped.
  const int max_blocks_wide =
      num_4x4_w + (xd->mb_to_right_edge >= 0 ? 0 : xd->mb_to_right_edge >> 5);
  const int max_blocks_high =
      num_4x4_h + (xd->mb_to_bottom_edge >= 0 ? 0 : xd->mb_to_bottom_edge >> 5);
  int eob_cost = 0;
  const int bw = 4 * num_4x4_w;
  const int bh = 4 * num_4x4_h;
  const int use_hbd = is_cur_buf_hbd(xd);
  int num_blk_skip_w = num_4x4_w;
  int sh_blk_skip = 0;
  if (is_inter_mode) {
    num_blk_skip_w = num_4x4_w >> 1;
    sh_blk_skip = 1;
  }

#if CONFIG_AV1_HIGHBITDEPTH
  if (use_hbd) {
    aom_highbd_subtract_block(bh, bw, p->src_diff, bw, p->src.buf,
                              p->src.stride, pd->dst.buf, pd->dst.stride);
  } else {
    aom_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, p->src.stride,
                       pd->dst.buf, pd->dst.stride);
  }
#else
  aom_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, p->src.stride,
                     pd->dst.buf, pd->dst.stride);
#endif

  // Keep the intermediate value on the stack here. Writing directly to
  // skippable causes speed regression due to load-and-store issues in
  // update_yrd_loop_vars.
  int temp_skippable = 1;
  this_rdc->dist = 0;
  this_rdc->rate = 0;
  // For block sizes 8x16 or above, Hadamard txfm of two adjacent 8x8 blocks
  // can be done per function call. Hence the call of Hadamard txfm is
  // abstracted here for the specified cases.
  int is_tx_8x8_dual_applicable =
      (tx_size == TX_8X8 && block_size_wide[bsize] >= 16 &&
       block_size_high[bsize] >= 8);

#if CONFIG_AV1_HIGHBITDEPTH
  // As of now, dual implementation of hadamard txfm is available for low
  // bitdepth.
  if (use_hbd) is_tx_8x8_dual_applicable = 0;
#endif

  if (is_tx_8x8_dual_applicable) {
    aom_process_hadamard_lp_8x16(x, max_blocks_high, max_blocks_wide, num_4x4_w,
                                 step, block_step);
  }

  DECLARE_BLOCK_YRD_BUFFERS()
  DECLARE_BLOCK_YRD_VARS()
#if CONFIG_AV1_HIGHBITDEPTH
  DECLARE_BLOCK_YRD_HBD_VARS()
#else
  (void)use_hbd;
#endif

  // Keep track of the row and column of the blocks we use so that we know
  // if we are in the unrestricted motion border.
  for (int r = 0; r < max_blocks_high; r += block_step) {
    for (int c = 0, s = 0; c < max_blocks_wide; c += block_step, s += step) {
      DECLARE_LOOP_VARS_BLOCK_YRD()

      switch (tx_size) {
#if CONFIG_AV1_HIGHBITDEPTH
        case TX_16X16:
          if (use_hbd) {
            aom_hadamard_16x16(src_diff, diff_stride, coeff);
            av1_quantize_fp(coeff, 16 * 16, p->zbin_QTX, p->round_fp_QTX,
                            p->quant_fp_QTX, p->quant_shift_QTX, qcoeff,
                            dqcoeff, p->dequant_QTX, eob,
                            // default_scan_fp_16x16_transpose and
                            // av1_default_iscan_fp_16x16_transpose have to be
                            // used together.
                            default_scan_fp_16x16_transpose,
                            av1_default_iscan_fp_16x16_transpose);
          } else {
            aom_hadamard_lp_16x16(src_diff, diff_stride, low_coeff);
            av1_quantize_lp(low_coeff, 16 * 16, p->round_fp_QTX,
                            p->quant_fp_QTX, low_qcoeff, low_dqcoeff,
                            p->dequant_QTX, eob,
                            // default_scan_lp_16x16_transpose and
                            // av1_default_iscan_lp_16x16_transpose have to be
                            // used together.
                            default_scan_lp_16x16_transpose,
                            av1_default_iscan_lp_16x16_transpose);
          }
          break;
        case TX_8X8:
          if (use_hbd) {
            aom_hadamard_8x8(src_diff, diff_stride, coeff);
            av1_quantize_fp(
                coeff, 8 * 8, p->zbin_QTX, p->round_fp_QTX, p->quant_fp_QTX,
                p->quant_shift_QTX, qcoeff, dqcoeff, p->dequant_QTX, eob,
                default_scan_8x8_transpose, av1_default_iscan_8x8_transpose);
          } else {
            if (is_tx_8x8_dual_applicable) {
              // The coeffs are pre-computed for the whole block, so re-assign
              // low_coeff to the appropriate location.
              const int block_offset = BLOCK_OFFSET(block + s);
              low_coeff = (int16_t *)p->coeff + block_offset;
            } else {
              aom_hadamard_lp_8x8(src_diff, diff_stride, low_coeff);
            }
            av1_quantize_lp(
                low_coeff, 8 * 8, p->round_fp_QTX, p->quant_fp_QTX, low_qcoeff,
                low_dqcoeff, p->dequant_QTX, eob,
                // default_scan_8x8_transpose and
                // av1_default_iscan_8x8_transpose have to be used together.
                default_scan_8x8_transpose, av1_default_iscan_8x8_transpose);
          }
          break;
        default:
          assert(tx_size == TX_4X4);
          // In tx_size=4x4 case, aom_fdct4x4 and aom_fdct4x4_lp generate
          // normal coefficients order, so we don't need to change the scan
          // order here.
          if (use_hbd) {
            aom_fdct4x4(src_diff, coeff, diff_stride);
            av1_quantize_fp(coeff, 4 * 4, p->zbin_QTX, p->round_fp_QTX,
                            p->quant_fp_QTX, p->quant_shift_QTX, qcoeff,
                            dqcoeff, p->dequant_QTX, eob, scan_order->scan,
                            scan_order->iscan);
          } else {
            aom_fdct4x4_lp(src_diff, low_coeff, diff_stride);
            av1_quantize_lp(low_coeff, 4 * 4, p->round_fp_QTX, p->quant_fp_QTX,
                            low_qcoeff, low_dqcoeff, p->dequant_QTX, eob,
                            scan_order->scan, scan_order->iscan);
          }
          break;
#else
        case TX_16X16:
          aom_hadamard_lp_16x16(src_diff, diff_stride, low_coeff);
          av1_quantize_lp(low_coeff, 16 * 16, p->round_fp_QTX, p->quant_fp_QTX,
                          low_qcoeff, low_dqcoeff, p->dequant_QTX, eob,
                          default_scan_lp_16x16_transpose,
                          av1_default_iscan_lp_16x16_transpose);
          break;
        case TX_8X8:
          if (is_tx_8x8_dual_applicable) {
            // The coeffs are pre-computed for the whole block, so re-assign
            // low_coeff to the appropriate location.
            const int block_offset = BLOCK_OFFSET(block + s);
            low_coeff = (int16_t *)p->coeff + block_offset;
          } else {
            aom_hadamard_lp_8x8(src_diff, diff_stride, low_coeff);
          }
          av1_quantize_lp(low_coeff, 8 * 8, p->round_fp_QTX, p->quant_fp_QTX,
                          low_qcoeff, low_dqcoeff, p->dequant_QTX, eob,
                          default_scan_8x8_transpose,
                          av1_default_iscan_8x8_transpose);
          break;
        default:
          aom_fdct4x4_lp(src_diff, low_coeff, diff_stride);
          av1_quantize_lp(low_coeff, 4 * 4, p->round_fp_QTX, p->quant_fp_QTX,
                          low_qcoeff, low_dqcoeff, p->dequant_QTX, eob,
                          scan_order->scan, scan_order->iscan);
          break;
#endif
      }
      assert(*eob <= 1024);
#if CONFIG_AV1_HIGHBITDEPTH
      if (use_hbd)
        update_yrd_loop_vars_hbd(x, &temp_skippable, step, *eob, coeff, qcoeff,
                                 dqcoeff, this_rdc, &eob_cost,
                                 (r * num_blk_skip_w + c) >> sh_blk_skip);
      else
#endif
        update_yrd_loop_vars(x, &temp_skippable, step, *eob, low_coeff,
                             low_qcoeff, low_dqcoeff, this_rdc, &eob_cost,
                             (r * num_blk_skip_w + c) >> sh_blk_skip);
    }
    block += row_step;
  }

  this_rdc->skip_txfm = *skippable = temp_skippable;
  if (this_rdc->sse < INT64_MAX) {
    this_rdc->sse = (this_rdc->sse << 6) >> 2;
    if (temp_skippable) {
      // All coefficients quantized to zero: distortion is just the sse.
      // (The previous redundant `this_rdc->dist = 0;` dead store is removed.)
      this_rdc->dist = this_rdc->sse;
      return;
    }
  }

  // If skippable is set, rate gets clobbered later.
  this_rdc->rate <<= (2 + AV1_PROB_COST_SHIFT);
  this_rdc->rate += (eob_cost << AV1_PROB_COST_SHIFT);
}
1438 
1439 // Explicitly enumerate the cases so the compiler can generate SIMD for the
1440 // function. According to the disassembler, gcc generates SSE codes for each of
1441 // the possible block sizes. The hottest case is tx_width 16, which takes up
1442 // about 8% of the self cycle of av1_nonrd_pick_inter_mode_sb. Since
1443 // av1_nonrd_pick_inter_mode_sb takes up about 3% of total encoding time, the
1444 // potential room of improvement for writing AVX2 optimization is only 3% * 8% =
1445 // 0.24% of total encoding time.
scale_square_buf_vals(int16_t * dst,const int tx_width,const int16_t * src,const int src_stride)1446 static AOM_INLINE void scale_square_buf_vals(int16_t *dst, const int tx_width,
1447                                              const int16_t *src,
1448                                              const int src_stride) {
1449 #define DO_SCALING                                                   \
1450   do {                                                               \
1451     for (int idy = 0; idy < tx_width; ++idy) {                       \
1452       for (int idx = 0; idx < tx_width; ++idx) {                     \
1453         dst[idy * tx_width + idx] = src[idy * src_stride + idx] * 8; \
1454       }                                                              \
1455     }                                                                \
1456   } while (0)
1457 
1458   if (tx_width == 4) {
1459     DO_SCALING;
1460   } else if (tx_width == 8) {
1461     DO_SCALING;
1462   } else if (tx_width == 16) {
1463     DO_SCALING;
1464   } else {
1465     assert(0);
1466   }
1467 
1468 #undef DO_SCALING
1469 }
1470 
1471 /*!\brief Calculates RD Cost when the block uses Identity transform.
1472  * Note that this function is only for low bit depth encoding, since it
1473  * is called in real-time mode for now, which sets high bit depth to 0:
1474  * -DCONFIG_AV1_HIGHBITDEPTH=0
1475  *
1476  * \ingroup nonrd_mode_search
1477  * \callgraph
1478  * \callergraph
1479  * Calculates RD Cost. For low bit depth this function
1480  * uses low-precision set of functions (16-bit) and 32 bit for high bit depth
1481  * \param[in]    x              Pointer to structure holding all the data for
1482                                 the current macroblock
1483  * \param[in]    this_rdc       Pointer to calculated RD Cost
1484  * \param[in]    skippable      Pointer to a flag indicating possible tx skip
1485  * \param[in]    bsize          Current block size
1486  * \param[in]    tx_size        Transform size
1487  *
1488  * \remark Nothing is returned. Instead, calculated RD cost is placed to
1489  * \c this_rdc. \c skippable flag is set if all coefficients are zero.
1490  */
static void block_yrd_idtx(MACROBLOCK *x, RD_STATS *this_rdc, int *skippable,
                           const BLOCK_SIZE bsize, const TX_SIZE tx_size) {
  MACROBLOCKD *xd = &x->e_mbd;
  const struct macroblockd_plane *pd = &xd->plane[0];
  struct macroblock_plane *const p = &x->plane[0];
  assert(bsize < BLOCK_SIZES_ALL);
  const int num_4x4_w = mi_size_wide[bsize];
  const int num_4x4_h = mi_size_high[bsize];
  const int step = 1 << (tx_size << 1);
  const int block_step = (1 << tx_size);
  // Clamp the loop bounds so TX blocks outside the visible frame are skipped.
  const int max_blocks_wide =
      num_4x4_w + (xd->mb_to_right_edge >= 0 ? 0 : xd->mb_to_right_edge >> 5);
  const int max_blocks_high =
      num_4x4_h + (xd->mb_to_bottom_edge >= 0 ? 0 : xd->mb_to_bottom_edge >> 5);
  int eob_cost = 0;
  const int bw = 4 * num_4x4_w;
  const int bh = 4 * num_4x4_h;
  const int num_blk_skip_w = num_4x4_w >> 1;
  const int sh_blk_skip = 1;
  // Keep the intermediate value on the stack here. Writing directly to
  // skippable causes speed regression due to load-and-store issues in
  // update_yrd_loop_vars.
  int temp_skippable = 1;
  // Map the transform size to its pixel width; only 4x4/8x8/16x16 are
  // supported by the identity-transform path.
  int tx_wd = 0;
  switch (tx_size) {
    case TX_64X64:
      assert(0);  // Not implemented
      break;
    case TX_32X32:
      assert(0);  // Not used
      break;
    case TX_16X16: tx_wd = 16; break;
    case TX_8X8: tx_wd = 8; break;
    default:
      assert(tx_size == TX_4X4);
      tx_wd = 4;
      break;
  }
  this_rdc->dist = 0;
  this_rdc->rate = 0;
  aom_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, p->src.stride,
                     pd->dst.buf, pd->dst.stride);
  // Keep track of the row and column of the blocks we use so that we know
  // if we are in the unrestricted motion border.
  DECLARE_BLOCK_YRD_BUFFERS()
  DECLARE_BLOCK_YRD_VARS()
  for (int r = 0; r < max_blocks_high; r += block_step) {
    for (int c = 0, s = 0; c < max_blocks_wide; c += block_step, s += step) {
      DECLARE_LOOP_VARS_BLOCK_YRD()
      scale_square_buf_vals(low_coeff, tx_wd, src_diff, diff_stride);
      av1_quantize_lp(low_coeff, tx_wd * tx_wd, p->round_fp_QTX,
                      p->quant_fp_QTX, low_qcoeff, low_dqcoeff, p->dequant_QTX,
                      eob, scan_order->scan, scan_order->iscan);
      assert(*eob <= 1024);
      update_yrd_loop_vars(x, &temp_skippable, step, *eob, low_coeff,
                           low_qcoeff, low_dqcoeff, this_rdc, &eob_cost,
                           (r * num_blk_skip_w + c) >> sh_blk_skip);
    }
  }
  this_rdc->skip_txfm = *skippable = temp_skippable;
  if (this_rdc->sse < INT64_MAX) {
    this_rdc->sse = (this_rdc->sse << 6) >> 2;
    if (temp_skippable) {
      // All coefficients quantized to zero: distortion is just the sse.
      // (The previous redundant `this_rdc->dist = 0;` dead store is removed.)
      this_rdc->dist = this_rdc->sse;
      return;
    }
  }
  // If skippable is set, rate gets clobbered later.
  this_rdc->rate <<= (2 + AV1_PROB_COST_SHIFT);
  this_rdc->rate += (eob_cost << AV1_PROB_COST_SHIFT);
}
1563 
init_mbmi(MB_MODE_INFO * mbmi,PREDICTION_MODE pred_mode,MV_REFERENCE_FRAME ref_frame0,MV_REFERENCE_FRAME ref_frame1,const AV1_COMMON * cm)1564 static INLINE void init_mbmi(MB_MODE_INFO *mbmi, PREDICTION_MODE pred_mode,
1565                              MV_REFERENCE_FRAME ref_frame0,
1566                              MV_REFERENCE_FRAME ref_frame1,
1567                              const AV1_COMMON *cm) {
1568   PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
1569   mbmi->ref_mv_idx = 0;
1570   mbmi->mode = pred_mode;
1571   mbmi->uv_mode = UV_DC_PRED;
1572   mbmi->ref_frame[0] = ref_frame0;
1573   mbmi->ref_frame[1] = ref_frame1;
1574   pmi->palette_size[0] = 0;
1575   pmi->palette_size[1] = 0;
1576   mbmi->filter_intra_mode_info.use_filter_intra = 0;
1577   mbmi->mv[0].as_int = mbmi->mv[1].as_int = 0;
1578   mbmi->motion_mode = SIMPLE_TRANSLATION;
1579   mbmi->num_proj_ref = 1;
1580   mbmi->interintra_mode = 0;
1581   set_default_interp_filters(mbmi, cm->features.interp_filter);
1582 }
1583 
1584 #if CONFIG_INTERNAL_STATS
store_coding_context(MACROBLOCK * x,PICK_MODE_CONTEXT * ctx,int mode_index)1585 static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
1586                                  int mode_index) {
1587 #else
1588 static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) {
1589 #endif  // CONFIG_INTERNAL_STATS
1590   MACROBLOCKD *const xd = &x->e_mbd;
1591   TxfmSearchInfo *txfm_info = &x->txfm_search_info;
1592 
1593   // Take a snapshot of the coding context so it can be
1594   // restored if we decide to encode this way
1595   ctx->rd_stats.skip_txfm = txfm_info->skip_txfm;
1596 
1597   ctx->skippable = txfm_info->skip_txfm;
1598 #if CONFIG_INTERNAL_STATS
1599   ctx->best_mode_index = mode_index;
1600 #endif  // CONFIG_INTERNAL_STATS
1601   ctx->mic = *xd->mi[0];
1602   ctx->skippable = txfm_info->skip_txfm;
1603   av1_copy_mbmi_ext_to_mbmi_ext_frame(&ctx->mbmi_ext_best, &x->mbmi_ext,
1604                                       av1_ref_frame_type(xd->mi[0]->ref_frame));
1605 }
1606 
1607 static int get_pred_buffer(PRED_BUFFER *p, int len) {
1608   for (int i = 0; i < len; i++) {
1609     if (!p[i].in_use) {
1610       p[i].in_use = 1;
1611       return i;
1612     }
1613   }
1614   return -1;
1615 }
1616 
1617 static void free_pred_buffer(PRED_BUFFER *p) {
1618   if (p != NULL) p->in_use = 0;
1619 }
1620 
1621 static INLINE int get_drl_cost(const PREDICTION_MODE this_mode,
1622                                const int ref_mv_idx,
1623                                const MB_MODE_INFO_EXT *mbmi_ext,
1624                                const int (*const drl_mode_cost0)[2],
1625                                int8_t ref_frame_type) {
1626   int cost = 0;
1627   if (this_mode == NEWMV || this_mode == NEW_NEWMV) {
1628     for (int idx = 0; idx < 2; ++idx) {
1629       if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) {
1630         uint8_t drl_ctx = av1_drl_ctx(mbmi_ext->weight[ref_frame_type], idx);
1631         cost += drl_mode_cost0[drl_ctx][ref_mv_idx != idx];
1632         if (ref_mv_idx == idx) return cost;
1633       }
1634     }
1635     return cost;
1636   }
1637 
1638   if (have_nearmv_in_inter_mode(this_mode)) {
1639     for (int idx = 1; idx < 3; ++idx) {
1640       if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) {
1641         uint8_t drl_ctx = av1_drl_ctx(mbmi_ext->weight[ref_frame_type], idx);
1642         cost += drl_mode_cost0[drl_ctx][ref_mv_idx != (idx - 1)];
1643         if (ref_mv_idx == (idx - 1)) return cost;
1644       }
1645     }
1646     return cost;
1647   }
1648   return cost;
1649 }
1650 
1651 static int cost_mv_ref(const ModeCosts *const mode_costs, PREDICTION_MODE mode,
1652                        int16_t mode_context) {
1653   if (is_inter_compound_mode(mode)) {
1654     return mode_costs
1655         ->inter_compound_mode_cost[mode_context][INTER_COMPOUND_OFFSET(mode)];
1656   }
1657 
1658   int mode_cost = 0;
1659   int16_t mode_ctx = mode_context & NEWMV_CTX_MASK;
1660 
1661   assert(is_inter_mode(mode));
1662 
1663   if (mode == NEWMV) {
1664     mode_cost = mode_costs->newmv_mode_cost[mode_ctx][0];
1665     return mode_cost;
1666   } else {
1667     mode_cost = mode_costs->newmv_mode_cost[mode_ctx][1];
1668     mode_ctx = (mode_context >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK;
1669 
1670     if (mode == GLOBALMV) {
1671       mode_cost += mode_costs->zeromv_mode_cost[mode_ctx][0];
1672       return mode_cost;
1673     } else {
1674       mode_cost += mode_costs->zeromv_mode_cost[mode_ctx][1];
1675       mode_ctx = (mode_context >> REFMV_OFFSET) & REFMV_CTX_MASK;
1676       mode_cost += mode_costs->refmv_mode_cost[mode_ctx][mode != NEARESTMV];
1677       return mode_cost;
1678     }
1679   }
1680 }
1681 
// Inflates this_rdc->rdcost for motion vectors that disagree strongly with
// the top/left neighbor MVs (for NEWMV), or for large MVs in flat areas at
// high speed (for other modes). The thresholds (16, 64, 80, 150, 300) are
// empirically tuned biases, not derived constants.
static void newmv_diff_bias(MACROBLOCKD *xd, PREDICTION_MODE this_mode,
                            RD_STATS *this_rdc, BLOCK_SIZE bsize, int mv_row,
                            int mv_col, int speed, uint32_t spatial_variance,
                            CONTENT_STATE_SB content_state_sb) {
  // Bias against MVs associated with NEWMV mode that are very different from
  // top/left neighbors.
  if (this_mode == NEWMV) {
    int al_mv_average_row;
    int al_mv_average_col;
    int row_diff, col_diff;
    int above_mv_valid = 0;
    int left_mv_valid = 0;
    int above_row = INVALID_MV_ROW_COL, above_col = INVALID_MV_ROW_COL;
    int left_row = INVALID_MV_ROW_COL, left_col = INVALID_MV_ROW_COL;
    // Strongest bias (4x cost) for large, low-variance, not-high-sad blocks
    // with a non-trivial MV; skip the neighbor comparison entirely.
    if (bsize >= BLOCK_64X64 && content_state_sb.source_sad_nonrd != kHighSad &&
        spatial_variance < 300 &&
        (mv_row > 16 || mv_row < -16 || mv_col > 16 || mv_col < -16)) {
      this_rdc->rdcost = this_rdc->rdcost << 2;
      return;
    }
    // Gather the above/left neighbor MVs when available.
    if (xd->above_mbmi) {
      above_mv_valid = xd->above_mbmi->mv[0].as_int != INVALID_MV;
      above_row = xd->above_mbmi->mv[0].as_mv.row;
      above_col = xd->above_mbmi->mv[0].as_mv.col;
    }
    if (xd->left_mbmi) {
      left_mv_valid = xd->left_mbmi->mv[0].as_int != INVALID_MV;
      left_row = xd->left_mbmi->mv[0].as_mv.row;
      left_col = xd->left_mbmi->mv[0].as_mv.col;
    }
    // Average whichever neighbor MVs are valid (rounded); zero if neither.
    if (above_mv_valid && left_mv_valid) {
      al_mv_average_row = (above_row + left_row + 1) >> 1;
      al_mv_average_col = (above_col + left_col + 1) >> 1;
    } else if (above_mv_valid) {
      al_mv_average_row = above_row;
      al_mv_average_col = above_col;
    } else if (left_mv_valid) {
      al_mv_average_row = left_row;
      al_mv_average_col = left_col;
    } else {
      al_mv_average_row = al_mv_average_col = 0;
    }
    row_diff = al_mv_average_row - mv_row;
    col_diff = al_mv_average_col - mv_col;
    // Deviation beyond +/-80: double the cost for >=32x32, else scale by 5/4.
    if (row_diff > 80 || row_diff < -80 || col_diff > 80 || col_diff < -80) {
      if (bsize >= BLOCK_32X32)
        this_rdc->rdcost = this_rdc->rdcost << 1;
      else
        this_rdc->rdcost = 5 * this_rdc->rdcost >> 2;
    }
  } else {
    // Bias for speed >= 8 for low spatial variance.
    if (speed >= 8 && spatial_variance < 150 &&
        (mv_row > 64 || mv_row < -64 || mv_col > 64 || mv_col < -64))
      this_rdc->rdcost = 5 * this_rdc->rdcost >> 2;
  }
}
1739 
// Models the rate/distortion of the chroma planes [start_plane, stop_plane]
// from the prediction residual variance, without running an actual transform.
// Planes whose color_sensitivity flag is unset are skipped. Returns the total
// SSE accumulated over the modeled planes; rate/dist/skip_txfm are written to
// this_rdc.
static int64_t model_rd_for_sb_uv(AV1_COMP *cpi, BLOCK_SIZE plane_bsize,
                                  MACROBLOCK *x, MACROBLOCKD *xd,
                                  RD_STATS *this_rdc, int start_plane,
                                  int stop_plane) {
  // Note our transform coeffs are 8 times an orthogonal transform.
  // Hence quantizer step is also 8 times. To get effective quantizer
  // we need to divide by 8 before sending to modeling function.
  unsigned int sse;
  int rate;
  int64_t dist;
  int i;
  int64_t tot_sse = 0;

  this_rdc->rate = 0;
  this_rdc->dist = 0;
  this_rdc->skip_txfm = 0;

  for (i = start_plane; i <= stop_plane; ++i) {
    struct macroblock_plane *const p = &x->plane[i];
    struct macroblockd_plane *const pd = &xd->plane[i];
    const uint32_t dc_quant = p->dequant_QTX[0];
    const uint32_t ac_quant = p->dequant_QTX[1];
    const BLOCK_SIZE bs = plane_bsize;
    unsigned int var;
    // color_sensitivity is indexed by chroma plane (plane - 1).
    if (!x->color_sensitivity[i - 1]) continue;

    var = cpi->ppi->fn_ptr[bs].vf(p->src.buf, p->src.stride, pd->dst.buf,
                                  pd->dst.stride, &sse);
    assert(sse >= var);
    tot_sse += sse;

    // DC component: modeled from (sse - var); quantizer divided by 8 to undo
    // the 8x transform scaling (see note above).
    av1_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bs],
                                 dc_quant >> 3, &rate, &dist);

    this_rdc->rate += rate >> 1;
    this_rdc->dist += dist << 3;

    // AC component: modeled from the residual variance.
    av1_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bs], ac_quant >> 3,
                                 &rate, &dist);

    this_rdc->rate += rate;
    this_rdc->dist += dist << 4;
  }

  if (this_rdc->rate == 0) {
    this_rdc->skip_txfm = 1;
  }

  // If coding the modeled rate/dist is no better than just coding the
  // residual as-is, mark the chroma transform as skippable.
  if (RDCOST(x->rdmult, this_rdc->rate, this_rdc->dist) >=
      RDCOST(x->rdmult, 0, tot_sse << 4)) {
    this_rdc->rate = 0;
    this_rdc->dist = tot_sse << 4;
    this_rdc->skip_txfm = 1;
  }

  return tot_sse;
}
1797 
1798 /*!\cond */
// Bundle of arguments threaded through estimate_block_intra() when it is used
// as a per-TX-block callback.
struct estimate_block_intra_args {
  AV1_COMP *cpi;         // Top-level encoder structure
  MACROBLOCK *x;         // Current macroblock data
  PREDICTION_MODE mode;  // Intra mode under evaluation
  int skippable;         // Out: set by the luma path if all coeffs are zero
  RD_STATS *rdc;         // Out: accumulated rate/distortion over TX blocks
};
1806 /*!\endcond */
1807 
1808 /*!\brief Estimation of RD cost of an intra mode for Non-RD optimized case.
1809  *
1810  * \ingroup nonrd_mode_search
1811  * \callgraph
1812  * \callergraph
1813  * Calculates RD Cost for an intra mode for a single TX block using Hadamard
1814  * transform.
1815  * \param[in]    plane          Color plane
1816  * \param[in]    block          Index of a TX block in a prediction block
1817  * \param[in]    row            Row of a current TX block
1818  * \param[in]    col            Column of a current TX block
1819  * \param[in]    plane_bsize    Block size of a current prediction block
1820  * \param[in]    tx_size        Transform size
1821  * \param[in]    arg            Pointer to a structure that holds parameters
1822  *                              for intra mode search
1823  *
1824  * \remark Nothing is returned. Instead, best mode and RD Cost of the best mode
1825  * are set in \c args->rdc and \c args->mode
1826  */
static void estimate_block_intra(int plane, int block, int row, int col,
                                 BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
                                 void *arg) {
  struct estimate_block_intra_args *const args = arg;
  AV1_COMP *const cpi = args->cpi;
  AV1_COMMON *const cm = &cpi->common;
  MACROBLOCK *const x = args->x;
  MACROBLOCKD *const xd = &x->e_mbd;
  struct macroblock_plane *const p = &x->plane[plane];
  struct macroblockd_plane *const pd = &xd->plane[plane];
  const BLOCK_SIZE bsize_tx = txsize_to_bsize[tx_size];
  // Save the plane buffer pointers; they are temporarily repositioned to the
  // current TX block below and must be restored before returning.
  uint8_t *const src_buf_base = p->src.buf;
  uint8_t *const dst_buf_base = pd->dst.buf;
  // Strides kept in 64-bit so the offset arithmetic below is done in 64 bits.
  const int64_t src_stride = p->src.stride;
  const int64_t dst_stride = pd->dst.stride;
  RD_STATS this_rdc;

  (void)block;
  (void)plane_bsize;

  av1_predict_intra_block_facade(cm, xd, plane, col, row, tx_size);
  av1_invalid_rd_stats(&this_rdc);

  // Reposition src/dst to the TX block at (row, col), in units of 4 pixels.
  p->src.buf = &src_buf_base[4 * (row * src_stride + col)];
  pd->dst.buf = &dst_buf_base[4 * (row * dst_stride + col)];

  if (plane == 0) {
    // Luma: full transform-based RD estimate (TX capped at 16x16).
    block_yrd(x, &this_rdc, &args->skippable, bsize_tx,
              AOMMIN(tx_size, TX_16X16), 0);
  } else {
    // Chroma: cheaper variance-based model.
    model_rd_for_sb_uv(cpi, bsize_tx, x, xd, &this_rdc, plane, plane);
  }

  p->src.buf = src_buf_base;
  pd->dst.buf = dst_buf_base;
  args->rdc->rate += this_rdc.rate;
  args->rdc->dist += this_rdc.dist;
}
1865 
1866 static INLINE void update_thresh_freq_fact(AV1_COMP *cpi, MACROBLOCK *x,
1867                                            BLOCK_SIZE bsize,
1868                                            MV_REFERENCE_FRAME ref_frame,
1869                                            THR_MODES best_mode_idx,
1870                                            PREDICTION_MODE mode) {
1871   const THR_MODES thr_mode_idx = mode_idx[ref_frame][mode_offset(mode)];
1872   const BLOCK_SIZE min_size = AOMMAX(bsize - 3, BLOCK_4X4);
1873   const BLOCK_SIZE max_size = AOMMIN(bsize + 6, BLOCK_128X128);
1874   for (BLOCK_SIZE bs = min_size; bs <= max_size; bs += 3) {
1875     int *freq_fact = &x->thresh_freq_fact[bs][thr_mode_idx];
1876     if (thr_mode_idx == best_mode_idx) {
1877       *freq_fact -= (*freq_fact >> 4);
1878     } else {
1879       *freq_fact =
1880           AOMMIN(*freq_fact + RD_THRESH_INC,
1881                  cpi->sf.inter_sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT);
1882     }
1883   }
1884 }
1885 
1886 #if CONFIG_AV1_TEMPORAL_DENOISING
1887 static void av1_pickmode_ctx_den_update(
1888     AV1_PICKMODE_CTX_DEN *ctx_den, int64_t zero_last_cost_orig,
1889     unsigned int ref_frame_cost[REF_FRAMES],
1890     int_mv frame_mv[MB_MODE_COUNT][REF_FRAMES], int reuse_inter_pred,
1891     BEST_PICKMODE *bp) {
1892   ctx_den->zero_last_cost_orig = zero_last_cost_orig;
1893   ctx_den->ref_frame_cost = ref_frame_cost;
1894   ctx_den->frame_mv = frame_mv;
1895   ctx_den->reuse_inter_pred = reuse_inter_pred;
1896   ctx_den->best_tx_size = bp->best_tx_size;
1897   ctx_den->best_mode = bp->best_mode;
1898   ctx_den->best_ref_frame = bp->best_ref_frame;
1899   ctx_den->best_pred_filter = bp->best_pred_filter;
1900   ctx_den->best_mode_skip_txfm = bp->best_mode_skip_txfm;
1901 }
1902 
// After the denoiser runs, re-evaluate GLOBALMV/zero-motion on the denoised
// LAST_FRAME reference and switch to it if its rdcost beats the previously
// selected mode; otherwise restore the original best mode's state in `mi`.
static void recheck_zeromv_after_denoising(
    AV1_COMP *cpi, MB_MODE_INFO *const mi, MACROBLOCK *x, MACROBLOCKD *const xd,
    AV1_DENOISER_DECISION decision, AV1_PICKMODE_CTX_DEN *ctx_den,
    struct buf_2d yv12_mb[4][MAX_MB_PLANE], RD_STATS *best_rdc,
    BEST_PICKMODE *best_pickmode, BLOCK_SIZE bsize, int mi_row, int mi_col) {
  // If INTRA or GOLDEN reference was selected, re-evaluate ZEROMV on
  // denoised result. Only do this under noise conditions, and if rdcost of
  // ZEROMV on original source is not significantly higher than rdcost of best
  // mode.
  if (cpi->noise_estimate.enabled && cpi->noise_estimate.level > kLow &&
      ctx_den->zero_last_cost_orig < (best_rdc->rdcost << 3) &&
      ((ctx_den->best_ref_frame == INTRA_FRAME && decision >= FILTER_BLOCK) ||
       (ctx_den->best_ref_frame == GOLDEN_FRAME &&
        cpi->svc.number_spatial_layers == 1 &&
        decision == FILTER_ZEROMV_BLOCK))) {
    // Check if we should pick ZEROMV on denoised signal.
    AV1_COMMON *const cm = &cpi->common;
    RD_STATS this_rdc;
    const ModeCosts *mode_costs = &x->mode_costs;
    TxfmSearchInfo *txfm_info = &x->txfm_search_info;
    MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;

    // Temporarily set up GLOBALMV / LAST_FRAME with a zero MV and build the
    // luma predictor on the (denoised) reference.
    mi->mode = GLOBALMV;
    mi->ref_frame[0] = LAST_FRAME;
    mi->ref_frame[1] = NONE_FRAME;
    set_ref_ptrs(cm, xd, mi->ref_frame[0], NONE_FRAME);
    mi->mv[0].as_int = 0;
    mi->interp_filters = av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
    xd->plane[0].pre[0] = yv12_mb[LAST_FRAME][0];
    av1_enc_build_inter_predictor_y(xd, mi_row, mi_col);
    unsigned int var;
    model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc, &var, 1, NULL);

    // Add mode and reference-frame signaling costs before comparing rdcost.
    const int16_t mode_ctx =
        av1_mode_context_analyzer(mbmi_ext->mode_context, mi->ref_frame);
    this_rdc.rate += cost_mv_ref(mode_costs, GLOBALMV, mode_ctx);

    this_rdc.rate += ctx_den->ref_frame_cost[LAST_FRAME];
    this_rdc.rdcost = RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist);
    txfm_info->skip_txfm = this_rdc.skip_txfm;
    // Don't switch to ZEROMV if the rdcost for ZEROMV on denoised source
    // is higher than best_ref mode (on original source).
    if (this_rdc.rdcost > best_rdc->rdcost) {
      // Restore the previously selected best mode's state in mi.
      this_rdc = *best_rdc;
      mi->mode = best_pickmode->best_mode;
      mi->ref_frame[0] = best_pickmode->best_ref_frame;
      set_ref_ptrs(cm, xd, mi->ref_frame[0], NONE_FRAME);
      mi->interp_filters = best_pickmode->best_pred_filter;
      if (best_pickmode->best_ref_frame == INTRA_FRAME) {
        mi->mv[0].as_int = INVALID_MV;
      } else {
        mi->mv[0].as_int = ctx_den
                               ->frame_mv[best_pickmode->best_mode]
                                         [best_pickmode->best_ref_frame]
                               .as_int;
        if (ctx_den->reuse_inter_pred) {
          // Rebuild the predictor for the restored (GOLDEN) best mode.
          xd->plane[0].pre[0] = yv12_mb[GOLDEN_FRAME][0];
          av1_enc_build_inter_predictor_y(xd, mi_row, mi_col);
        }
      }
      mi->tx_size = best_pickmode->best_tx_size;
      txfm_info->skip_txfm = best_pickmode->best_mode_skip_txfm;
    } else {
      // ZEROMV on the denoised reference wins: adopt it as the best mode.
      ctx_den->best_ref_frame = LAST_FRAME;
      *best_rdc = this_rdc;
    }
  }
}
1971 #endif  // CONFIG_AV1_TEMPORAL_DENOISING
1972 
1973 #define FILTER_SEARCH_SIZE 2
1974 
1975 /*!\brief Searches for the best interpolation filter
1976  *
1977  * \ingroup nonrd_mode_search
1978  * \callgraph
1979  * \callergraph
1980  * Iterates through subset of possible interpolation filters (EIGHTTAP_REGULAR,
1981  * EIGHTTAP_SMOOTH, MULTITAP_SHARP, depending on FILTER_SEARCH_SIZE) and selects
1982  * the one that gives lowest RD cost. RD cost is calculated using curvfit model.
1983  * Support for dual filters (different filters in the x & y directions) is
1984  * allowed if sf.interp_sf.disable_dual_filter = 0.
1985  *
1986  * \param[in]    cpi                  Top-level encoder structure
1987  * \param[in]    x                    Pointer to structure holding all the
1988  *                                    data for the current macroblock
1989  * \param[in]    this_rdc             Pointer to calculated RD Cost
1990  * \param[in]    inter_pred_params_sr Pointer to structure holding parameters of
1991                                       inter prediction for single reference
1992  * \param[in]    mi_row               Row index in 4x4 units
1993  * \param[in]    mi_col               Column index in 4x4 units
1994  * \param[in]    tmp_buffer           Pointer to a temporary buffer for
1995  *                                    prediction re-use
1996  * \param[in]    bsize                Current block size
1997  * \param[in]    reuse_inter_pred     Flag, indicating prediction re-use
1998  * \param[out]   this_mode_pred       Pointer to store prediction buffer
1999  *                                    for prediction re-use
2000  * \param[out]   this_early_term      Flag, indicating that transform can be
2001  *                                    skipped
2002  * \param[out]   var                  The residue variance of the current
2003  *                                    predictor.
2004  * \param[in]    use_model_yrd_large  Flag, indicating special logic to handle
2005  *                                    large blocks
2006  * \param[in]    best_sse             Best sse so far.
2007  * \param[in]    comp_pred            Flag, indicating compound mode.
2008  *
2009  * \remark Nothing is returned. Instead, calculated RD cost is placed to
2010  * \c this_rdc and best filter is placed to \c mi->interp_filters. In case
 * \c reuse_inter_pred flag is set, this function also outputs
 * \c this_mode_pred. Also \c this_early_term is set if transform can be
 * skipped
2014  */
static void search_filter_ref(AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *this_rdc,
                              InterPredParams *inter_pred_params_sr, int mi_row,
                              int mi_col, PRED_BUFFER *tmp_buffer,
                              BLOCK_SIZE bsize, int reuse_inter_pred,
                              PRED_BUFFER **this_mode_pred,
                              int *this_early_term, unsigned int *var,
                              int use_model_yrd_large, int64_t best_sse,
                              int comp_pred) {
  AV1_COMMON *const cm = &cpi->common;
  MACROBLOCKD *const xd = &x->e_mbd;
  struct macroblockd_plane *const pd = &xd->plane[0];
  MB_MODE_INFO *const mi = xd->mi[0];
  const int bw = block_size_wide[bsize];
  // With dual filters allowed the candidate set is the full
  // FILTER_SEARCH_SIZE x FILTER_SEARCH_SIZE grid of (x_filter, y_filter)
  // pairs; otherwise only entries with x_filter == y_filter are tried, so
  // the effective search size is FILTER_SEARCH_SIZE. dim_factor encodes that
  // scaling for the assert and "was the last candidate the winner?" checks.
  int dim_factor =
      (cpi->sf.interp_sf.disable_dual_filter == 0) ? FILTER_SEARCH_SIZE : 1;
  RD_STATS pf_rd_stats[FILTER_SEARCH_SIZE * FILTER_SEARCH_SIZE] = { 0 };
  TX_SIZE pf_tx_size[FILTER_SEARCH_SIZE * FILTER_SEARCH_SIZE] = { 0 };
  PRED_BUFFER *current_pred = *this_mode_pred;
  int best_skip = 0;
  int best_early_term = 0;
  int64_t best_cost = INT64_MAX;
  int best_filter_index = -1;

  SubpelParams subpel_params;
  // Initialize inter prediction params at mode level for single reference
  // mode.
  if (!comp_pred)
    init_inter_mode_params(&mi->mv[0].as_mv, inter_pred_params_sr,
                           &subpel_params, xd->block_ref_scale_factors[0],
                           pd->pre->width, pd->pre->height);
  for (int i = 0; i < FILTER_SEARCH_SIZE * FILTER_SEARCH_SIZE; ++i) {
    int64_t cost;
    // Skip mixed (x != y) filter pairs when dual filters are disabled.
    if (cpi->sf.interp_sf.disable_dual_filter &&
        filters_ref_set[i].filter_x != filters_ref_set[i].filter_y)
      continue;
    mi->interp_filters.as_filters.x_filter = filters_ref_set[i].filter_x;
    mi->interp_filters.as_filters.y_filter = filters_ref_set[i].filter_y;
    // Build the luma prediction with the candidate filter pair; compound
    // modes go through the generic predictor builder.
    if (!comp_pred)
      av1_enc_build_inter_predictor_y_nonrd(xd, inter_pred_params_sr,
                                            &subpel_params);
    else
      av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0, 0);
    unsigned int curr_var = UINT_MAX;
    // Estimate RD cost of the prediction with the curvfit model (large-block
    // variant can additionally signal early termination of the transform).
    if (use_model_yrd_large)
      model_skip_for_sb_y_large(cpi, bsize, mi_row, mi_col, x, xd,
                                &pf_rd_stats[i], this_early_term, 1, best_sse,
                                &curr_var, UINT_MAX);
    else
      model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rd_stats[i], &curr_var, 1, NULL);
    // Account for the cost of signalling the selected filter(s).
    pf_rd_stats[i].rate += av1_get_switchable_rate(
        x, xd, cm->features.interp_filter, cm->seq_params->enable_dual_filter);
    cost = RDCOST(x->rdmult, pf_rd_stats[i].rate, pf_rd_stats[i].dist);
    pf_tx_size[i] = mi->tx_size;
    if (cost < best_cost) {
      *var = curr_var;
      best_filter_index = i;
      best_cost = cost;
      best_skip = pf_rd_stats[i].skip_txfm;
      best_early_term = *this_early_term;
      if (reuse_inter_pred) {
        // The buffer holding the new best prediction becomes *this_mode_pred;
        // the previous best (if distinct from the incoming buffer) is
        // released, and a fresh scratch buffer is claimed for the next
        // candidate so the best prediction is not overwritten.
        if (*this_mode_pred != current_pred) {
          free_pred_buffer(*this_mode_pred);
          *this_mode_pred = current_pred;
        }
        current_pred = &tmp_buffer[get_pred_buffer(tmp_buffer, 3)];
        pd->dst.buf = current_pred->data;
        pd->dst.stride = bw;
      }
    }
  }
  assert(best_filter_index >= 0 &&
         best_filter_index < dim_factor * FILTER_SEARCH_SIZE);
  // Release the scratch buffer of the last (non-winning) candidate.
  if (reuse_inter_pred && *this_mode_pred != current_pred)
    free_pred_buffer(current_pred);

  // Commit the winning filter pair and its modelled RD stats.
  mi->interp_filters.as_filters.x_filter =
      filters_ref_set[best_filter_index].filter_x;
  mi->interp_filters.as_filters.y_filter =
      filters_ref_set[best_filter_index].filter_y;
  mi->tx_size = pf_tx_size[best_filter_index];
  this_rdc->rate = pf_rd_stats[best_filter_index].rate;
  this_rdc->dist = pf_rd_stats[best_filter_index].dist;
  this_rdc->sse = pf_rd_stats[best_filter_index].sse;
  this_rdc->skip_txfm = (best_skip || best_early_term);
  *this_early_term = best_early_term;
  if (reuse_inter_pred) {
    // Point the destination at the stored best prediction.
    pd->dst.buf = (*this_mode_pred)->data;
    pd->dst.stride = (*this_mode_pred)->stride;
  } else if (best_filter_index < dim_factor * FILTER_SEARCH_SIZE - 1) {
    // pd->dst currently holds the last candidate's prediction; rebuild with
    // the winning filters unless the last candidate was the winner.
    if (!comp_pred)
      av1_enc_build_inter_predictor_y_nonrd(xd, inter_pred_params_sr,
                                            &subpel_params);
    else
      av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0, 0);
  }
}
2111 #if !CONFIG_REALTIME_ONLY
2112 #define MOTION_MODE_SEARCH_SIZE 2
2113 
2114 static AOM_INLINE int is_warped_mode_allowed(const AV1_COMP *cpi,
2115                                              MACROBLOCK *const x,
2116                                              const MB_MODE_INFO *mbmi) {
2117   const FeatureFlags *const features = &cpi->common.features;
2118   const MACROBLOCKD *xd = &x->e_mbd;
2119 
2120   if (cpi->sf.inter_sf.extra_prune_warped) return 0;
2121   if (has_second_ref(mbmi)) return 0;
2122   MOTION_MODE last_motion_mode_allowed = SIMPLE_TRANSLATION;
2123 
2124   if (features->switchable_motion_mode) {
2125     // Determine which motion modes to search if more than SIMPLE_TRANSLATION
2126     // is allowed.
2127     last_motion_mode_allowed = motion_mode_allowed(
2128         xd->global_motion, xd, mbmi, features->allow_warped_motion);
2129   }
2130 
2131   if (last_motion_mode_allowed == WARPED_CAUSAL) {
2132     return 1;
2133   }
2134 
2135   return 0;
2136 }
2137 
2138 static void calc_num_proj_ref(AV1_COMP *cpi, MACROBLOCK *x, MB_MODE_INFO *mi) {
2139   AV1_COMMON *const cm = &cpi->common;
2140   MACROBLOCKD *const xd = &x->e_mbd;
2141   const FeatureFlags *const features = &cm->features;
2142 
2143   mi->num_proj_ref = 1;
2144   WARP_SAMPLE_INFO *const warp_sample_info =
2145       &x->warp_sample_info[mi->ref_frame[0]];
2146   int *pts0 = warp_sample_info->pts;
2147   int *pts_inref0 = warp_sample_info->pts_inref;
2148   MOTION_MODE last_motion_mode_allowed = SIMPLE_TRANSLATION;
2149 
2150   if (features->switchable_motion_mode) {
2151     // Determine which motion modes to search if more than SIMPLE_TRANSLATION
2152     // is allowed.
2153     last_motion_mode_allowed = motion_mode_allowed(
2154         xd->global_motion, xd, mi, features->allow_warped_motion);
2155   }
2156 
2157   if (last_motion_mode_allowed == WARPED_CAUSAL) {
2158     if (warp_sample_info->num < 0) {
2159       warp_sample_info->num = av1_findSamples(cm, xd, pts0, pts_inref0);
2160     }
2161     mi->num_proj_ref = warp_sample_info->num;
2162   }
2163 }
2164 
2165 static void search_motion_mode(AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *this_rdc,
2166                                int mi_row, int mi_col, BLOCK_SIZE bsize,
2167                                int *this_early_term, int use_model_yrd_large,
2168                                int *rate_mv, int64_t best_sse) {
2169   AV1_COMMON *const cm = &cpi->common;
2170   MACROBLOCKD *const xd = &x->e_mbd;
2171   const FeatureFlags *const features = &cm->features;
2172   MB_MODE_INFO *const mi = xd->mi[0];
2173   RD_STATS pf_rd_stats[MOTION_MODE_SEARCH_SIZE] = { 0 };
2174   int best_skip = 0;
2175   int best_early_term = 0;
2176   int64_t best_cost = INT64_MAX;
2177   int best_mode_index = -1;
2178   const int interp_filter = features->interp_filter;
2179 
2180   const MOTION_MODE motion_modes[MOTION_MODE_SEARCH_SIZE] = {
2181     SIMPLE_TRANSLATION, WARPED_CAUSAL
2182   };
2183   int mode_search_size = is_warped_mode_allowed(cpi, x, mi) ? 2 : 1;
2184 
2185   WARP_SAMPLE_INFO *const warp_sample_info =
2186       &x->warp_sample_info[mi->ref_frame[0]];
2187   int *pts0 = warp_sample_info->pts;
2188   int *pts_inref0 = warp_sample_info->pts_inref;
2189 
2190   const int total_samples = mi->num_proj_ref;
2191   if (total_samples == 0) {
2192     // Do not search WARPED_CAUSAL if there are no samples to use to determine
2193     // warped parameters.
2194     mode_search_size = 1;
2195   }
2196 
2197   const MB_MODE_INFO base_mbmi = *mi;
2198   MB_MODE_INFO best_mbmi;
2199 
2200   for (int i = 0; i < mode_search_size; ++i) {
2201     int64_t cost = INT64_MAX;
2202     MOTION_MODE motion_mode = motion_modes[i];
2203     *mi = base_mbmi;
2204     mi->motion_mode = motion_mode;
2205     if (motion_mode == SIMPLE_TRANSLATION) {
2206       mi->interp_filters = av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
2207 
2208       av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0, 0);
2209       if (use_model_yrd_large)
2210         model_skip_for_sb_y_large(cpi, bsize, mi_row, mi_col, x, xd,
2211                                   &pf_rd_stats[i], this_early_term, 1, best_sse,
2212                                   NULL, UINT_MAX);
2213       else
2214         model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rd_stats[i], NULL, 1, NULL);
2215       pf_rd_stats[i].rate +=
2216           av1_get_switchable_rate(x, xd, cm->features.interp_filter,
2217                                   cm->seq_params->enable_dual_filter);
2218       cost = RDCOST(x->rdmult, pf_rd_stats[i].rate, pf_rd_stats[i].dist);
2219     } else if (motion_mode == WARPED_CAUSAL) {
2220       int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE];
2221       const ModeCosts *mode_costs = &x->mode_costs;
2222       mi->wm_params.wmtype = DEFAULT_WMTYPE;
2223       mi->interp_filters =
2224           av1_broadcast_interp_filter(av1_unswitchable_filter(interp_filter));
2225 
2226       memcpy(pts, pts0, total_samples * 2 * sizeof(*pts0));
2227       memcpy(pts_inref, pts_inref0, total_samples * 2 * sizeof(*pts_inref0));
2228       // Select the samples according to motion vector difference
2229       if (mi->num_proj_ref > 1) {
2230         mi->num_proj_ref = av1_selectSamples(&mi->mv[0].as_mv, pts, pts_inref,
2231                                              mi->num_proj_ref, bsize);
2232       }
2233 
2234       // Compute the warped motion parameters with a least squares fit
2235       //  using the collected samples
2236       if (!av1_find_projection(mi->num_proj_ref, pts, pts_inref, bsize,
2237                                mi->mv[0].as_mv.row, mi->mv[0].as_mv.col,
2238                                &mi->wm_params, mi_row, mi_col)) {
2239         if (mi->mode == NEWMV) {
2240           const int_mv mv0 = mi->mv[0];
2241           const WarpedMotionParams wm_params0 = mi->wm_params;
2242           const int num_proj_ref0 = mi->num_proj_ref;
2243 
2244           const int_mv ref_mv = av1_get_ref_mv(x, 0);
2245           SUBPEL_MOTION_SEARCH_PARAMS ms_params;
2246           av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize,
2247                                             &ref_mv.as_mv, NULL);
2248 
2249           // Refine MV in a small range.
2250           av1_refine_warped_mv(xd, cm, &ms_params, bsize, pts0, pts_inref0,
2251                                total_samples);
2252           if (mi->mv[0].as_int == ref_mv.as_int) {
2253             continue;
2254           }
2255 
2256           if (mv0.as_int != mi->mv[0].as_int) {
2257             // Keep the refined MV and WM parameters.
2258             int tmp_rate_mv = av1_mv_bit_cost(
2259                 &mi->mv[0].as_mv, &ref_mv.as_mv, x->mv_costs->nmv_joint_cost,
2260                 x->mv_costs->mv_cost_stack, MV_COST_WEIGHT);
2261             *rate_mv = tmp_rate_mv;
2262           } else {
2263             // Restore the old MV and WM parameters.
2264             mi->mv[0] = mv0;
2265             mi->wm_params = wm_params0;
2266             mi->num_proj_ref = num_proj_ref0;
2267           }
2268         }
2269         // Build the warped predictor
2270         av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0,
2271                                       av1_num_planes(cm) - 1);
2272         if (use_model_yrd_large)
2273           model_skip_for_sb_y_large(cpi, bsize, mi_row, mi_col, x, xd,
2274                                     &pf_rd_stats[i], this_early_term, 1,
2275                                     best_sse, NULL, UINT_MAX);
2276         else
2277           model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rd_stats[i], NULL, 1, NULL);
2278 
2279         pf_rd_stats[i].rate +=
2280             mode_costs->motion_mode_cost[bsize][mi->motion_mode];
2281         cost = RDCOST(x->rdmult, pf_rd_stats[i].rate, pf_rd_stats[i].dist);
2282       } else {
2283         cost = INT64_MAX;
2284       }
2285     }
2286     if (cost < best_cost) {
2287       best_mode_index = i;
2288       best_cost = cost;
2289       best_skip = pf_rd_stats[i].skip_txfm;
2290       best_early_term = *this_early_term;
2291       best_mbmi = *mi;
2292     }
2293   }
2294   assert(best_mode_index >= 0 && best_mode_index < FILTER_SEARCH_SIZE);
2295 
2296   *mi = best_mbmi;
2297   this_rdc->rate = pf_rd_stats[best_mode_index].rate;
2298   this_rdc->dist = pf_rd_stats[best_mode_index].dist;
2299   this_rdc->sse = pf_rd_stats[best_mode_index].sse;
2300   this_rdc->skip_txfm = (best_skip || best_early_term);
2301   *this_early_term = best_early_term;
2302   if (best_mode_index < FILTER_SEARCH_SIZE - 1) {
2303     av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0, 0);
2304   }
2305 }
2306 #endif  // !CONFIG_REALTIME_ONLY
2307 
2308 #define COLLECT_PICK_MODE_STAT 0
2309 #define COLLECT_NON_SQR_STAT 0
2310 
2311 #if COLLECT_PICK_MODE_STAT
2312 #include "aom_ports/aom_timer.h"
2313 typedef struct _mode_search_stat {
2314   int32_t num_blocks[BLOCK_SIZES];
2315   int64_t total_block_times[BLOCK_SIZES];
2316   int32_t num_searches[BLOCK_SIZES][MB_MODE_COUNT];
2317   int32_t num_nonskipped_searches[BLOCK_SIZES][MB_MODE_COUNT];
2318   int64_t search_times[BLOCK_SIZES][MB_MODE_COUNT];
2319   int64_t nonskipped_search_times[BLOCK_SIZES][MB_MODE_COUNT];
2320   int64_t ms_time[BLOCK_SIZES][MB_MODE_COUNT];
2321   int64_t ifs_time[BLOCK_SIZES][MB_MODE_COUNT];
2322   int64_t model_rd_time[BLOCK_SIZES][MB_MODE_COUNT];
2323   int64_t txfm_time[BLOCK_SIZES][MB_MODE_COUNT];
2324   struct aom_usec_timer timer1;
2325   struct aom_usec_timer timer2;
2326   struct aom_usec_timer bsize_timer;
2327 } mode_search_stat;
2328 
2329 static mode_search_stat ms_stat;
2330 
2331 static AOM_INLINE void print_stage_time(const char *stage_name,
2332                                         int64_t stage_time,
2333                                         int64_t total_time) {
2334   printf("    %s: %ld (%f%%)\n", stage_name, stage_time,
2335          100 * stage_time / (float)total_time);
2336 }
2337 
2338 static void print_time(const mode_search_stat *const ms_stat,
2339                        const BLOCK_SIZE bsize, const int mi_rows,
2340                        const int mi_cols, const int mi_row, const int mi_col) {
2341   if ((mi_row + mi_size_high[bsize] >= mi_rows) &&
2342       (mi_col + mi_size_wide[bsize] >= mi_cols)) {
2343     int64_t total_time = 0l;
2344     int32_t total_blocks = 0;
2345     for (BLOCK_SIZE bs = 0; bs < BLOCK_SIZES; bs++) {
2346       total_time += ms_stat->total_block_times[bs];
2347       total_blocks += ms_stat->num_blocks[bs];
2348     }
2349 
2350     printf("\n");
2351     for (BLOCK_SIZE bs = 0; bs < BLOCK_SIZES; bs++) {
2352       if (ms_stat->num_blocks[bs] == 0) {
2353         continue;
2354       }
2355       if (!COLLECT_NON_SQR_STAT && block_size_wide[bs] != block_size_high[bs]) {
2356         continue;
2357       }
2358 
2359       printf("BLOCK_%dX%d Num %d, Time: %ld (%f%%), Avg_time %f:\n",
2360              block_size_wide[bs], block_size_high[bs], ms_stat->num_blocks[bs],
2361              ms_stat->total_block_times[bs],
2362              100 * ms_stat->total_block_times[bs] / (float)total_time,
2363              (float)ms_stat->total_block_times[bs] / ms_stat->num_blocks[bs]);
2364       for (int j = 0; j < MB_MODE_COUNT; j++) {
2365         if (ms_stat->nonskipped_search_times[bs][j] == 0) {
2366           continue;
2367         }
2368 
2369         int64_t total_mode_time = ms_stat->nonskipped_search_times[bs][j];
2370         printf("  Mode %d, %d/%d tps %f\n", j,
2371                ms_stat->num_nonskipped_searches[bs][j],
2372                ms_stat->num_searches[bs][j],
2373                ms_stat->num_nonskipped_searches[bs][j] > 0
2374                    ? (float)ms_stat->nonskipped_search_times[bs][j] /
2375                          ms_stat->num_nonskipped_searches[bs][j]
2376                    : 0l);
2377         if (j >= INTER_MODE_START) {
2378           total_mode_time = ms_stat->ms_time[bs][j] + ms_stat->ifs_time[bs][j] +
2379                             ms_stat->model_rd_time[bs][j] +
2380                             ms_stat->txfm_time[bs][j];
2381           print_stage_time("Motion Search Time", ms_stat->ms_time[bs][j],
2382                            total_time);
2383           print_stage_time("Filter Search Time", ms_stat->ifs_time[bs][j],
2384                            total_time);
2385           print_stage_time("Model    RD   Time", ms_stat->model_rd_time[bs][j],
2386                            total_time);
2387           print_stage_time("Tranfm Search Time", ms_stat->txfm_time[bs][j],
2388                            total_time);
2389         }
2390         print_stage_time("Total  Mode   Time", total_mode_time, total_time);
2391       }
2392       printf("\n");
2393     }
2394     printf("Total time = %ld. Total blocks = %d\n", total_time, total_blocks);
2395   }
2396 }
2397 #endif  // COLLECT_PICK_MODE_STAT
2398 
// Builds the luma intra prediction for `mode` over the whole block, one
// max-size transform block at a time, writing into pd->dst. The src/dst
// buffer pointers are temporarily advanced per sub-block and restored on
// exit.
static void compute_intra_yprediction(const AV1_COMMON *cm,
                                      PREDICTION_MODE mode, BLOCK_SIZE bsize,
                                      MACROBLOCK *x, MACROBLOCKD *xd) {
  const SequenceHeader *seq_params = cm->seq_params;
  struct macroblockd_plane *const pd = &xd->plane[0];
  struct macroblock_plane *const p = &x->plane[0];
  // Save the base pointers so they can be restored after the per-sub-block
  // pointer adjustments below.
  uint8_t *const src_buf_base = p->src.buf;
  uint8_t *const dst_buf_base = pd->dst.buf;
  const int src_stride = p->src.stride;
  const int dst_stride = pd->dst.stride;
  int plane = 0;
  int row, col;
  // block and transform sizes, in number of 4x4 blocks log 2 ("*_b")
  // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8
  // transform size varies per plane, look it up in a common way.
  const TX_SIZE tx_size = max_txsize_lookup[bsize];
  const BLOCK_SIZE plane_bsize =
      get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
  // If mb_to_right_edge is < 0 we are in a situation in which
  // the current block size extends into the UMV and we won't
  // visit the sub blocks that are wholly within the UMV.
  const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
  const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
  // Keep track of the row and column of the blocks we use so that we know
  // if we are in the unrestricted motion border.
  for (row = 0; row < max_blocks_high; row += (1 << tx_size)) {
    // Skip visiting the sub blocks that are wholly within the UMV.
    for (col = 0; col < max_blocks_wide; col += (1 << tx_size)) {
      // row/col are in 4x4 units; multiply by 4 for the pixel offset.
      p->src.buf = &src_buf_base[4 * (row * (int64_t)src_stride + col)];
      pd->dst.buf = &dst_buf_base[4 * (row * (int64_t)dst_stride + col)];
      av1_predict_intra_block(
          xd, seq_params->sb_size, seq_params->enable_intra_edge_filter,
          block_size_wide[bsize], block_size_high[bsize], tx_size, mode, 0, 0,
          FILTER_INTRA_MODES, pd->dst.buf, dst_stride, pd->dst.buf, dst_stride,
          0, 0, plane);
    }
  }
  // Restore the original buffer pointers.
  p->src.buf = src_buf_base;
  pd->dst.buf = dst_buf_base;
}
2439 
// Non-RD intra mode selection: evaluates a small set of intra modes
// (DC/V/H/SMOOTH per intra_mode_list's first four entries) using the
// per-transform-block estimator and picks the mode with the lowest RD cost.
// Results are written to *mi, *rd_cost and the coding context.
void av1_nonrd_pick_intra_mode(AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *rd_cost,
                               BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) {
  AV1_COMMON *const cm = &cpi->common;
  MACROBLOCKD *const xd = &x->e_mbd;
  MB_MODE_INFO *const mi = xd->mi[0];
  RD_STATS this_rdc, best_rdc;
  struct estimate_block_intra_args args = { cpi, x, DC_PRED, 1, 0 };
  const TxfmSearchParams *txfm_params = &x->txfm_search_params;
  const TX_SIZE intra_tx_size =
      AOMMIN(max_txsize_lookup[bsize],
             tx_mode_to_biggest_tx_size[txfm_params->tx_mode_search_type]);
  int *bmode_costs;
  PREDICTION_MODE best_mode = DC_PRED;
  // Mode signalling cost depends on the above/left neighbors' intra modes.
  const MB_MODE_INFO *above_mi = xd->above_mbmi;
  const MB_MODE_INFO *left_mi = xd->left_mbmi;
  const PREDICTION_MODE A = av1_above_block_mode(above_mi);
  const PREDICTION_MODE L = av1_left_block_mode(left_mi);
  const int above_ctx = intra_mode_context[A];
  const int left_ctx = intra_mode_context[L];
  bmode_costs = x->mode_costs.y_mode_costs[above_ctx][left_ctx];

  av1_invalid_rd_stats(&best_rdc);
  av1_invalid_rd_stats(&this_rdc);

  init_mbmi(mi, DC_PRED, INTRA_FRAME, NONE_FRAME, cm);
  mi->mv[0].as_int = mi->mv[1].as_int = INVALID_MV;

  // Change the limit of this loop to add other intra prediction
  // mode tests.
  for (int i = 0; i < 4; ++i) {
    PREDICTION_MODE this_mode = intra_mode_list[i];

    // As per the statistics generated for intra mode evaluation in the nonrd
    // path, it is found that the probability of H_PRED mode being the winner is
    // very less when the best mode so far is V_PRED (out of DC_PRED and
    // V_PRED). If V_PRED is the winner mode out of DC_PRED and V_PRED, it could
    // imply the presence of a vertically dominant pattern. Hence, H_PRED mode
    // is not evaluated.
    if (cpi->sf.rt_sf.prune_h_pred_using_best_mode_so_far &&
        this_mode == H_PRED && best_mode == V_PRED)
      continue;

    this_rdc.dist = this_rdc.rate = 0;
    args.mode = this_mode;
    args.skippable = 1;
    args.rdc = &this_rdc;
    mi->tx_size = intra_tx_size;
    mi->mode = this_mode;
    // Accumulate rate/dist over all transform blocks of the plane;
    // args.skippable is cleared by the callback if any block has coeffs.
    av1_foreach_transformed_block_in_plane(xd, bsize, 0, estimate_block_intra,
                                           &args);
    const int skip_ctx = av1_get_skip_txfm_context(xd);
    if (args.skippable) {
      // Whole block skippable: rate is just the skip flag (note '=', not
      // '+=': the accumulated coefficient rate is intentionally dropped).
      this_rdc.rate = x->mode_costs.skip_txfm_cost[skip_ctx][1];
    } else {
      this_rdc.rate += x->mode_costs.skip_txfm_cost[skip_ctx][0];
    }
    this_rdc.rate += bmode_costs[this_mode];
    this_rdc.rdcost = RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist);

    if (this_rdc.rdcost < best_rdc.rdcost) {
      best_rdc = this_rdc;
      best_mode = this_mode;
      if (!this_rdc.skip_txfm) {
        // NOTE(review): clears blk_skip for the winning non-skip mode;
        // sizeof uses x->txfm_search_info.blk_skip[0] as the element type —
        // presumably the same element type as ctx->blk_skip; verify.
        memset(ctx->blk_skip, 0,
               sizeof(x->txfm_search_info.blk_skip[0]) * ctx->num_4x4_blk);
      }
    }
  }

  mi->mode = best_mode;
  // Keep DC for UV since mode test is based on Y channel only.
  mi->uv_mode = UV_DC_PRED;
  *rd_cost = best_rdc;

#if CONFIG_INTERNAL_STATS
  store_coding_context(x, ctx, mi->mode);
#else
  store_coding_context(x, ctx);
#endif  // CONFIG_INTERNAL_STATS
}
2520 
2521 static AOM_INLINE int is_same_gf_and_last_scale(AV1_COMMON *cm) {
2522   struct scale_factors *const sf_last = get_ref_scale_factors(cm, LAST_FRAME);
2523   struct scale_factors *const sf_golden =
2524       get_ref_scale_factors(cm, GOLDEN_FRAME);
2525   return ((sf_last->x_scale_fp == sf_golden->x_scale_fp) &&
2526           (sf_last->y_scale_fp == sf_golden->y_scale_fp));
2527 }
2528 
// Decides which reference frames (LAST / GOLDEN / ALTREF) the nonrd mode
// search should consider for this block, writing the result into
// use_ref_frame[]. Also sets *force_skip_low_temp_var when the block has low
// temporal variance and reference pruning is enabled.
static AOM_INLINE void get_ref_frame_use_mask(AV1_COMP *cpi, MACROBLOCK *x,
                                              MB_MODE_INFO *mi, int mi_row,
                                              int mi_col, int bsize,
                                              int gf_temporal_ref,
                                              int use_ref_frame[],
                                              int *force_skip_low_temp_var) {
  AV1_COMMON *const cm = &cpi->common;
  const struct segmentation *const seg = &cm->seg;
  const int is_small_sb = (cm->seq_params->sb_size == BLOCK_64X64);

  // When the ref_frame_config is used to set the reference frame structure
  // then the usage of alt_ref is determined by the ref_frame_flags
  // (and not the speed feature use_nonrd_altref_frame).
  int use_alt_ref_frame = cpi->ppi->rtc_ref.set_ref_frame_config ||
                          cpi->sf.rt_sf.use_nonrd_altref_frame;

  int use_golden_ref_frame = 1;
  int use_last_ref_frame = 1;

  // When the ref_frame_config is used to set the reference frame structure:
  // check if LAST is used as a reference. And only remove golden and altref
  // references below if last is used as a reference.
  if (cpi->ppi->rtc_ref.set_ref_frame_config)
    use_last_ref_frame =
        cpi->ref_frame_flags & AOM_LAST_FLAG ? use_last_ref_frame : 0;

  // frames_since_golden is not used when user sets the reference structure.
  if (!cpi->ppi->rtc_ref.set_ref_frame_config && use_last_ref_frame &&
      cpi->rc.frames_since_golden == 0 && gf_temporal_ref) {
    use_golden_ref_frame = 0;
  }

  if (use_last_ref_frame && cpi->sf.rt_sf.short_circuit_low_temp_var &&
      x->nonrd_prune_ref_frame_search) {
    if (is_small_sb)
      *force_skip_low_temp_var = av1_get_force_skip_low_temp_var_small_sb(
          &x->part_search_info.variance_low[0], mi_row, mi_col, bsize);
    else
      *force_skip_low_temp_var = av1_get_force_skip_low_temp_var(
          &x->part_search_info.variance_low[0], mi_row, mi_col, bsize);
    // If force_skip_low_temp_var is set, skip golden reference.
    if (*force_skip_low_temp_var) {
      use_golden_ref_frame = 0;
      use_alt_ref_frame = 0;
    }
  }

  // Aggressive reference pruning: keep only LAST.
  if (use_last_ref_frame &&
      (x->nonrd_prune_ref_frame_search > 2 || x->force_zeromv_skip_for_blk ||
       (x->nonrd_prune_ref_frame_search > 1 && bsize > BLOCK_64X64))) {
    use_golden_ref_frame = 0;
    use_alt_ref_frame = 0;
  }

  // Segment-level override: segment forces GOLDEN as the reference.
  if (segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME) &&
      get_segdata(seg, mi->segment_id, SEG_LVL_REF_FRAME) == GOLDEN_FRAME) {
    use_golden_ref_frame = 1;
    use_alt_ref_frame = 0;
  }

  // Skip golden reference if color is set, on flat blocks with motion.
  // For screen: always skip golden (if color_sensitivity_sb_g is set)
  // except when x->nonrd_prune_ref_frame_search = 0. This latter flag
  // may be set in the variance partition when golden is a much better
  // reference than last, in which case it may not be worth skipping
  // golden completely.
  if (((cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN &&
        x->nonrd_prune_ref_frame_search != 0) ||
       (x->source_variance < 500 &&
        x->content_state_sb.source_sad_nonrd > kLowSad)) &&
      (x->color_sensitivity_sb_g[0] == 1 || x->color_sensitivity_sb_g[1] == 1))
    use_golden_ref_frame = 0;

  // For non-screen: if golden and altref are not being selected as references
  // (use_golden_ref_frame/use_alt_ref_frame = 0) check to allow golden back
  // based on the sad of nearest/nearmv of LAST ref. If this block sad is large,
  // keep golden as reference. Only do this for the aggressive pruning mode and
  // avoid it when color is set for golden reference.
  if (cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN &&
      (cpi->ref_frame_flags & AOM_LAST_FLAG) && !use_golden_ref_frame &&
      !use_alt_ref_frame && x->pred_mv_sad[LAST_FRAME] != INT_MAX &&
      x->nonrd_prune_ref_frame_search > 2 &&
      x->color_sensitivity_sb_g[0] == 0 && x->color_sensitivity_sb_g[1] == 0) {
    int thr = (cm->width * cm->height >= 640 * 360) ? 100 : 150;
    // Per-pixel SAD: normalize by the block's pixel count (log2 w + log2 h).
    int pred = x->pred_mv_sad[LAST_FRAME] >>
               (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]);
    if (pred > thr) use_golden_ref_frame = 1;
  }

  // Finally, mask by the encoder's externally-enabled reference flags.
  use_alt_ref_frame =
      cpi->ref_frame_flags & AOM_ALT_FLAG ? use_alt_ref_frame : 0;
  use_golden_ref_frame =
      cpi->ref_frame_flags & AOM_GOLD_FLAG ? use_golden_ref_frame : 0;

  // For spatial layers: enable golden ref if it is set by user and
  // corresponds to the lower spatial layer.
  if (cpi->svc.spatial_layer_id > 0 && (cpi->ref_frame_flags & AOM_GOLD_FLAG) &&
      x->content_state_sb.source_sad_nonrd < kHighSad) {
    const int buffslot_golden =
        cpi->ppi->rtc_ref.ref_idx[GOLDEN_FRAME - LAST_FRAME];
    if (cpi->svc.buffer_time_index[buffslot_golden] ==
        cpi->svc.current_superframe)
      use_golden_ref_frame = 1;
  }

  use_ref_frame[ALTREF_FRAME] = use_alt_ref_frame;
  use_ref_frame[GOLDEN_FRAME] = use_golden_ref_frame;
  use_ref_frame[LAST_FRAME] = use_last_ref_frame;
  // For now keep this assert on, but we should remove it for svc mode,
  // as the user may want to generate an intra-only frame (no inter-modes).
  // Remove this assert in subsequent CL when nonrd_pickmode is tested for the
  // case of intra-only frame (no references enabled).
  assert(use_last_ref_frame || use_golden_ref_frame || use_alt_ref_frame);
}
2643 
// Checks whether Intra mode needs to be pruned based on
// 'intra_y_mode_bsize_mask_nrd' and 'prune_hv_pred_modes_using_src_sad'
// speed features.
2647 static INLINE bool is_prune_intra_mode(AV1_COMP *cpi, int mode_index,
2648                                        int force_intra_check, BLOCK_SIZE bsize,
2649                                        uint8_t segment_id,
2650                                        SOURCE_SAD source_sad_nonrd,
2651                                        uint8_t color_sensitivity[2]) {
2652   const PREDICTION_MODE this_mode = intra_mode_list[mode_index];
2653   if (mode_index > 2 || force_intra_check == 0) {
2654     if (!((1 << this_mode) & cpi->sf.rt_sf.intra_y_mode_bsize_mask_nrd[bsize]))
2655       return true;
2656 
2657     if (this_mode == DC_PRED) return false;
2658 
2659     if (!cpi->sf.rt_sf.prune_hv_pred_modes_using_src_sad) return false;
2660 
2661     const bool has_color_sensitivity =
2662         color_sensitivity[0] && color_sensitivity[1];
2663     if (has_color_sensitivity &&
2664         (cpi->rc.frame_source_sad > 1.1 * cpi->rc.avg_source_sad ||
2665          cyclic_refresh_segment_id_boosted(segment_id) ||
2666          source_sad_nonrd > kMedSad))
2667       return false;
2668 
2669     return true;
2670   }
2671   return false;
2672 }
2673 
2674 /*!\brief Estimates best intra mode for inter mode search
2675  *
2676  * \ingroup nonrd_mode_search
2677  * \callgraph
2678  * \callergraph
2679  *
 * Using heuristics based on the best inter mode, block size, and other
 * block properties, decides whether to check intra modes. If so, estimates
 * and selects the best intra mode from a reduced set of intra modes (at
 * most 4 intra modes are checked).
2683  *
2684  * \param[in]    cpi                      Top-level encoder structure
2685  * \param[in]    x                        Pointer to structure holding all the
2686  *                                        data for the current macroblock
2687  * \param[in]    bsize                    Current block size
2688  * \param[in]    best_early_term          Flag, indicating that TX for the
2689  *                                        best inter mode was skipped
2690  * \param[in]    ref_cost_intra           Cost of signalling intra mode
2691  * \param[in]    reuse_prediction         Flag, indicating prediction re-use
2692  * \param[in]    orig_dst                 Original destination buffer
2693  * \param[in]    tmp_buffers              Pointer to a temporary buffers for
2694  *                                        prediction re-use
2695  * \param[out]   this_mode_pred           Pointer to store prediction buffer
2696  *                                        for prediction re-use
 * \param[in,out] best_rdc                Pointer to RD cost for the best
 *                                        selected intra mode
 * \param[in,out] best_pickmode           Pointer to a structure containing
 *                                        best mode picked so far
2701  * \param[in]    ctx                      Pointer to structure holding coding
2702  *                                        contexts and modes for the block
2703  *
2704  * \remark Nothing is returned. Instead, calculated RD cost is placed to
2705  * \c best_rdc and best selected mode is placed to \c best_pickmode
2706  */
static void estimate_intra_mode(
    AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int best_early_term,
    unsigned int ref_cost_intra, int reuse_prediction, struct buf_2d *orig_dst,
    PRED_BUFFER *tmp_buffers, PRED_BUFFER **this_mode_pred, RD_STATS *best_rdc,
    BEST_PICKMODE *best_pickmode, PICK_MODE_CONTEXT *ctx) {
  AV1_COMMON *const cm = &cpi->common;
  MACROBLOCKD *const xd = &x->e_mbd;
  MB_MODE_INFO *const mi = xd->mi[0];
  const TxfmSearchParams *txfm_params = &x->txfm_search_params;
  const unsigned char segment_id = mi->segment_id;
  const int *const rd_threshes = cpi->rd.threshes[segment_id][bsize];
  const int *const rd_thresh_freq_fact = x->thresh_freq_fact[bsize];
  const bool is_screen_content =
      cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN;
  struct macroblockd_plane *const pd = &xd->plane[0];

  const CommonQuantParams *quant_params = &cm->quant_params;

  RD_STATS this_rdc;

  // Rate penalty charged to intra modes, derived from the quantizer.
  int intra_cost_penalty = av1_get_intra_cost_penalty(
      quant_params->base_qindex, quant_params->y_dc_delta_q,
      cm->seq_params->bit_depth);
  // RD cost of just signalling intra; used as an early-exit bound below.
  int64_t inter_mode_thresh =
      RDCOST(x->rdmult, ref_cost_intra + intra_cost_penalty, 0);
  int perform_intra_pred = cpi->sf.rt_sf.check_intra_pred_nonrd;
  int force_intra_check = 0;
  // For spatial enhancement layer: turn off intra prediction if the
  // previous spatial layer as golden ref is not chosen as best reference.
  // only do this for temporal enhancement layer and on non-key frames.
  if (cpi->svc.spatial_layer_id > 0 &&
      best_pickmode->best_ref_frame != GOLDEN_FRAME &&
      cpi->svc.temporal_layer_id > 0 &&
      !cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame)
    perform_intra_pred = 0;

  int do_early_exit_rdthresh = 1;

  uint32_t spatial_var_thresh = 50;
  int motion_thresh = 32;
  // Adjust thresholds to make intra mode likely tested if the other
  // references (golden, alt) are skipped/not checked. For now always
  // adjust for svc mode.
  if (cpi->ppi->use_svc || (cpi->sf.rt_sf.use_nonrd_altref_frame == 0 &&
                            cpi->sf.rt_sf.nonrd_prune_ref_frame_search > 0)) {
    spatial_var_thresh = 150;
    motion_thresh = 0;
  }

  // Some adjustments to checking intra mode based on source variance.
  if (x->source_variance < spatial_var_thresh) {
    // If the best inter mode is large motion or non-LAST ref reduce intra cost
    // penalty, so intra mode is more likely tested.
    if (best_rdc->rdcost != INT64_MAX &&
        (best_pickmode->best_ref_frame != LAST_FRAME ||
         abs(mi->mv[0].as_mv.row) >= motion_thresh ||
         abs(mi->mv[0].as_mv.col) >= motion_thresh)) {
      intra_cost_penalty = intra_cost_penalty >> 2;
      inter_mode_thresh =
          RDCOST(x->rdmult, ref_cost_intra + intra_cost_penalty, 0);
      do_early_exit_rdthresh = 0;
    }
    // Force the intra check for flat blocks with high source SAD, and for
    // low-variance screen content with chroma activity or non-zero motion
    // on large blocks.
    if ((x->source_variance < AOMMAX(50, (spatial_var_thresh >> 1)) &&
         x->content_state_sb.source_sad_nonrd >= kHighSad) ||
        (is_screen_content && x->source_variance < 50 &&
         ((bsize >= BLOCK_32X32 &&
           x->content_state_sb.source_sad_nonrd != kZeroSad) ||
          x->color_sensitivity[0] == 1 || x->color_sensitivity[1] == 1)))
      force_intra_check = 1;
    // For big blocks worth checking intra (since only DC will be checked),
    // even if best_early_term is set.
    if (bsize >= BLOCK_32X32) best_early_term = 0;
  } else if (cpi->sf.rt_sf.source_metrics_sb_nonrd &&
             x->content_state_sb.source_sad_nonrd <= kLowSad) {
    perform_intra_pred = 0;
  }

  // Possibly skip intra when the best inter mode already chose to skip the
  // transform, depending on the skip_intra_pred speed feature level.
  if (best_rdc->skip_txfm && best_pickmode->best_mode_initial_skip_flag) {
    if (cpi->sf.rt_sf.skip_intra_pred == 1 && best_pickmode->best_mode != NEWMV)
      perform_intra_pred = 0;
    else if (cpi->sf.rt_sf.skip_intra_pred == 2)
      perform_intra_pred = 0;
  }

  // Bail out unless intra evaluation is warranted: no inter mode found yet,
  // the check is forced, or intra prediction is enabled for this block size.
  if (!(best_rdc->rdcost == INT64_MAX || force_intra_check ||
        (perform_intra_pred && !best_early_term &&
         bsize <= cpi->sf.part_sf.max_intra_bsize))) {
    return;
  }

  // Early exit based on RD cost calculated using known rate. When
  // is_screen_content is true, more bias is given to intra modes. Hence,
  // considered conservative threshold in early exit for the same.
  const int64_t known_rd = is_screen_content
                               ? CALC_BIASED_RDCOST(inter_mode_thresh)
                               : inter_mode_thresh;
  if (known_rd > best_rdc->rdcost) return;

  // Arguments shared by the chroma estimate_block_intra() calls below.
  struct estimate_block_intra_args args = { cpi, x, DC_PRED, 1, 0 };
  // Transform size for intra: bounded by block size, tx mode, and 16x16.
  TX_SIZE intra_tx_size = AOMMIN(
      AOMMIN(max_txsize_lookup[bsize],
             tx_mode_to_biggest_tx_size[txfm_params->tx_mode_search_type]),
      TX_16X16);
  if (is_screen_content && cpi->rc.high_source_sad &&
      x->source_variance > spatial_var_thresh && bsize <= BLOCK_16X16)
    intra_tx_size = TX_4X4;

  // If the best inter prediction currently lives in the destination buffer,
  // copy it into a temporary buffer so intra prediction can use the dst.
  PRED_BUFFER *const best_pred = best_pickmode->best_pred;
  if (reuse_prediction && best_pred != NULL) {
    const int bh = block_size_high[bsize];
    const int bw = block_size_wide[bsize];
    if (best_pred->data == orig_dst->buf) {
      *this_mode_pred = &tmp_buffers[get_pred_buffer(tmp_buffers, 3)];
      aom_convolve_copy(best_pred->data, best_pred->stride,
                        (*this_mode_pred)->data, (*this_mode_pred)->stride, bw,
                        bh);
      best_pickmode->best_pred = *this_mode_pred;
    }
  }
  pd->dst = *orig_dst;

  // Evaluate the (at most 4) candidate modes from intra_mode_list.
  for (int i = 0; i < 4; ++i) {
    const PREDICTION_MODE this_mode = intra_mode_list[i];
    const THR_MODES mode_index = mode_idx[INTRA_FRAME][mode_offset(this_mode)];
    const int64_t mode_rd_thresh = rd_threshes[mode_index];

    if (is_prune_intra_mode(cpi, i, force_intra_check, bsize, segment_id,
                            x->content_state_sb.source_sad_nonrd,
                            x->color_sensitivity))
      continue;

    if (is_screen_content && cpi->sf.rt_sf.source_metrics_sb_nonrd) {
      // For spatially flat blocks with zero motion only check
      // DC mode.
      if (x->content_state_sb.source_sad_nonrd == kZeroSad &&
          x->source_variance == 0 && this_mode != DC_PRED)
        continue;
      // Only test Intra for big blocks if spatial_variance is small.
      else if (bsize > BLOCK_32X32 && x->source_variance > 50)
        continue;
    }

    // Skip the mode when its RD threshold test fails against the best cost.
    if (rd_less_than_thresh(best_rdc->rdcost, mode_rd_thresh,
                            rd_thresh_freq_fact[mode_index]) &&
        (do_early_exit_rdthresh || this_mode == SMOOTH_PRED)) {
      continue;
    }
    const BLOCK_SIZE uv_bsize = get_plane_block_size(
        bsize, xd->plane[1].subsampling_x, xd->plane[1].subsampling_y);

    mi->mode = this_mode;
    mi->ref_frame[0] = INTRA_FRAME;
    mi->ref_frame[1] = NONE_FRAME;

    av1_invalid_rd_stats(&this_rdc);
    args.mode = this_mode;
    args.skippable = 1;
    args.rdc = &this_rdc;
    mi->tx_size = intra_tx_size;
    compute_intra_yprediction(cm, this_mode, bsize, x, xd);
    // Look into selecting tx_size here, based on prediction residual.
    block_yrd(x, &this_rdc, &args.skippable, bsize, mi->tx_size, 0);
    // TODO(kyslov@) Need to account for skippable
    // Add chroma cost for each color-sensitive plane.
    if (x->color_sensitivity[0]) {
      av1_foreach_transformed_block_in_plane(xd, uv_bsize, 1,
                                             estimate_block_intra, &args);
    }
    if (x->color_sensitivity[1]) {
      av1_foreach_transformed_block_in_plane(xd, uv_bsize, 2,
                                             estimate_block_intra, &args);
    }

    // Mode-signalling rate: angle delta for directional modes, plus the
    // filter-intra flag for DC on eligible block sizes.
    int mode_cost = 0;
    if (av1_is_directional_mode(this_mode) && av1_use_angle_delta(bsize)) {
      mode_cost +=
          x->mode_costs.angle_delta_cost[this_mode - V_PRED]
                                        [MAX_ANGLE_DELTA +
                                         mi->angle_delta[PLANE_TYPE_Y]];
    }
    if (this_mode == DC_PRED && av1_filter_intra_allowed_bsize(cm, bsize)) {
      mode_cost += x->mode_costs.filter_intra_cost[bsize][0];
    }
    this_rdc.rate += ref_cost_intra;
    this_rdc.rate += intra_cost_penalty;
    this_rdc.rate += mode_cost;
    this_rdc.rdcost = RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist);

    if (is_screen_content && cpi->sf.rt_sf.source_metrics_sb_nonrd) {
      // For blocks with low spatial variance and color sad,
      // favor the intra-modes, only on scene/slide change.
      if (cpi->rc.high_source_sad && x->source_variance < 800 &&
          (x->color_sensitivity[0] || x->color_sensitivity[1]))
        this_rdc.rdcost = CALC_BIASED_RDCOST(this_rdc.rdcost);
      // Otherwise bias against intra for blocks with zero
      // motion and no color, on non-scene/slide changes.
      else if (!cpi->rc.high_source_sad && x->source_variance > 0 &&
               x->content_state_sb.source_sad_nonrd == kZeroSad &&
               x->color_sensitivity[0] == 0 && x->color_sensitivity[1] == 0)
        this_rdc.rdcost = (3 * this_rdc.rdcost) >> 1;
    }

    // Record this intra mode if it beats the best RD cost found so far.
    if (this_rdc.rdcost < best_rdc->rdcost) {
      *best_rdc = this_rdc;
      best_pickmode->best_mode = this_mode;
      best_pickmode->best_tx_size = mi->tx_size;
      best_pickmode->best_ref_frame = INTRA_FRAME;
      best_pickmode->best_second_ref_frame = NONE;
      best_pickmode->best_mode_skip_txfm = this_rdc.skip_txfm;
      if (!this_rdc.skip_txfm) {
        memcpy(ctx->blk_skip, x->txfm_search_info.blk_skip,
               sizeof(x->txfm_search_info.blk_skip[0]) * ctx->num_4x4_blk);
      }
      mi->uv_mode = this_mode;
      mi->mv[0].as_int = INVALID_MV;
      mi->mv[1].as_int = INVALID_MV;
    }
  }
  // Restore the tx size of the overall best mode before returning.
  mi->tx_size = best_pickmode->best_tx_size;
}
2926 
2927 static AOM_INLINE int is_filter_search_enabled_blk(
2928     AV1_COMP *cpi, MACROBLOCK *x, int mi_row, int mi_col, BLOCK_SIZE bsize,
2929     int segment_id, int cb_pred_filter_search, InterpFilter *filt_select) {
2930   const AV1_COMMON *const cm = &cpi->common;
2931   // filt search disabled
2932   if (!cpi->sf.rt_sf.use_nonrd_filter_search) return 0;
2933   // filt search purely based on mode properties
2934   if (!cb_pred_filter_search) return 1;
2935   MACROBLOCKD *const xd = &x->e_mbd;
2936   int enable_interp_search = 0;
2937   if (!(xd->left_mbmi && xd->above_mbmi)) {
2938     // neighbors info unavailable
2939     enable_interp_search = 2;
2940   } else if (!(is_inter_block(xd->left_mbmi) &&
2941                is_inter_block(xd->above_mbmi))) {
2942     // neighbor is INTRA
2943     enable_interp_search = 2;
2944   } else if (xd->left_mbmi->interp_filters.as_int !=
2945              xd->above_mbmi->interp_filters.as_int) {
2946     // filters are different
2947     enable_interp_search = 2;
2948   } else if ((cb_pred_filter_search == 1) &&
2949              (xd->left_mbmi->interp_filters.as_filters.x_filter !=
2950               EIGHTTAP_REGULAR)) {
2951     // not regular
2952     enable_interp_search = 2;
2953   } else {
2954     // enable prediction based on chessboard pattern
2955     if (xd->left_mbmi->interp_filters.as_filters.x_filter == EIGHTTAP_SMOOTH)
2956       *filt_select = EIGHTTAP_SMOOTH;
2957     const int bsl = mi_size_wide_log2[bsize];
2958     enable_interp_search =
2959         (bool)((((mi_row + mi_col) >> bsl) +
2960                 get_chessboard_index(cm->current_frame.frame_number)) &
2961                0x1);
2962     if (cyclic_refresh_segment_id_boosted(segment_id)) enable_interp_search = 1;
2963   }
2964   return enable_interp_search;
2965 }
2966 
2967 static AOM_INLINE int skip_mode_by_threshold(
2968     PREDICTION_MODE mode, MV_REFERENCE_FRAME ref_frame, int_mv mv,
2969     int frames_since_golden, const int *const rd_threshes,
2970     const int *const rd_thresh_freq_fact, int64_t best_cost, int best_skip,
2971     int extra_shift) {
2972   int skip_this_mode = 0;
2973   const THR_MODES mode_index = mode_idx[ref_frame][INTER_OFFSET(mode)];
2974   int64_t mode_rd_thresh =
2975       best_skip ? ((int64_t)rd_threshes[mode_index]) << (extra_shift + 1)
2976                 : ((int64_t)rd_threshes[mode_index]) << extra_shift;
2977 
2978   // Increase mode_rd_thresh value for non-LAST for improved encoding
2979   // speed
2980   if (ref_frame != LAST_FRAME) {
2981     mode_rd_thresh = mode_rd_thresh << 1;
2982     if (ref_frame == GOLDEN_FRAME && frames_since_golden > 4)
2983       mode_rd_thresh = mode_rd_thresh << (extra_shift + 1);
2984   }
2985 
2986   if (rd_less_than_thresh(best_cost, mode_rd_thresh,
2987                           rd_thresh_freq_fact[mode_index]))
2988     if (mv.as_int != 0) skip_this_mode = 1;
2989 
2990   return skip_this_mode;
2991 }
2992 
2993 static AOM_INLINE int skip_mode_by_low_temp(
2994     PREDICTION_MODE mode, MV_REFERENCE_FRAME ref_frame, BLOCK_SIZE bsize,
2995     CONTENT_STATE_SB content_state_sb, int_mv mv, int force_skip_low_temp_var) {
2996   // Skip non-zeromv mode search for non-LAST frame if force_skip_low_temp_var
2997   // is set. If nearestmv for golden frame is 0, zeromv mode will be skipped
2998   // later.
2999   if (force_skip_low_temp_var && ref_frame != LAST_FRAME && mv.as_int != 0) {
3000     return 1;
3001   }
3002 
3003   if (content_state_sb.source_sad_nonrd != kHighSad && bsize >= BLOCK_64X64 &&
3004       force_skip_low_temp_var && mode == NEWMV) {
3005     return 1;
3006   }
3007   return 0;
3008 }
3009 
3010 static AOM_INLINE int skip_mode_by_bsize_and_ref_frame(
3011     PREDICTION_MODE mode, MV_REFERENCE_FRAME ref_frame, BLOCK_SIZE bsize,
3012     int extra_prune, unsigned int sse_zeromv_norm, int more_prune) {
3013   const unsigned int thresh_skip_golden = 500;
3014 
3015   if (ref_frame != LAST_FRAME && sse_zeromv_norm < thresh_skip_golden &&
3016       mode == NEWMV)
3017     return 1;
3018 
3019   if (bsize == BLOCK_128X128 && mode == NEWMV) return 1;
3020 
3021   // Skip testing non-LAST if this flag is set.
3022   if (extra_prune) {
3023     if (extra_prune > 1 && ref_frame != LAST_FRAME &&
3024         (bsize > BLOCK_16X16 && mode == NEWMV))
3025       return 1;
3026 
3027     if (ref_frame != LAST_FRAME && mode == NEARMV) return 1;
3028 
3029     if (more_prune && bsize >= BLOCK_32X32 && mode == NEARMV) return 1;
3030   }
3031   return 0;
3032 }
3033 
3034 static void set_color_sensitivity(AV1_COMP *cpi, MACROBLOCK *x,
3035                                   BLOCK_SIZE bsize, int y_sad,
3036                                   unsigned int source_variance,
3037                                   struct buf_2d yv12_mb[MAX_MB_PLANE]) {
3038   const int subsampling_x = cpi->common.seq_params->subsampling_x;
3039   const int subsampling_y = cpi->common.seq_params->subsampling_y;
3040   int factor = (bsize >= BLOCK_32X32) ? 2 : 3;
3041   int shift = 3;
3042   if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN &&
3043       cpi->rc.high_source_sad) {
3044     factor = 1;
3045     shift = 6;
3046   }
3047   NOISE_LEVEL noise_level = kLow;
3048   int norm_sad =
3049       y_sad >> (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]);
3050   unsigned int thresh_spatial = (cpi->common.width > 1920) ? 5000 : 1000;
3051   // If the spatial source variance is high and the normalized y_sad
3052   // is low, then y-channel is likely good for mode estimation, so keep
3053   // color_sensitivity off. For low noise content for now, since there is
3054   // some bdrate regression for noisy color clip.
3055   if (cpi->noise_estimate.enabled)
3056     noise_level = av1_noise_estimate_extract_level(&cpi->noise_estimate);
3057   if (noise_level == kLow && source_variance > thresh_spatial &&
3058       cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN && norm_sad < 50) {
3059     x->color_sensitivity[0] = 0;
3060     x->color_sensitivity[1] = 0;
3061     return;
3062   }
3063   const int num_planes = av1_num_planes(&cpi->common);
3064   for (int i = 1; i < num_planes; ++i) {
3065     if (x->color_sensitivity[i - 1] == 2 || source_variance < 50) {
3066       struct macroblock_plane *const p = &x->plane[i];
3067       const BLOCK_SIZE bs =
3068           get_plane_block_size(bsize, subsampling_x, subsampling_y);
3069 
3070       const int uv_sad = cpi->ppi->fn_ptr[bs].sdf(
3071           p->src.buf, p->src.stride, yv12_mb[i].buf, yv12_mb[i].stride);
3072 
3073       const int norm_uv_sad =
3074           uv_sad >> (b_width_log2_lookup[bs] + b_height_log2_lookup[bs]);
3075       x->color_sensitivity[i - 1] =
3076           uv_sad > (factor * (y_sad >> shift)) && norm_uv_sad > 40;
3077       if (source_variance < 50 && norm_uv_sad > 100)
3078         x->color_sensitivity[i - 1] = 1;
3079     }
3080   }
3081 }
3082 
// Builds the MV reference state (mode context, ref-mv stack, weights) for a
// compound reference pair rf[], preparing the second reference's prediction
// block when find_predictors() has not already done so. On return,
// *ref_mv_idx is the DRL index used by set_compound_mode() for NEAR_NEARMV.
static void setup_compound_prediction(const AV1_COMMON *cm, MACROBLOCK *x,
                                      struct buf_2d yv12_mb[8][MAX_MB_PLANE],
                                      const int *use_ref_frame_mask,
                                      const MV_REFERENCE_FRAME *rf,
                                      int *ref_mv_idx) {
  MACROBLOCKD *const xd = &x->e_mbd;
  MB_MODE_INFO *const mbmi = xd->mi[0];
  MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
  MV_REFERENCE_FRAME ref_frame_comp;
  if (!use_ref_frame_mask[rf[1]]) {
    // Need to setup pred_block, if it hasn't been done in find_predictors.
    const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_yv12_buf(cm, rf[1]);
    const int num_planes = av1_num_planes(cm);
    if (yv12 != NULL) {
      const struct scale_factors *const sf =
          get_ref_scale_factors_const(cm, rf[1]);
      av1_setup_pred_block(xd, yv12_mb[rf[1]], yv12, sf, sf, num_planes);
    }
  }
  ref_frame_comp = av1_ref_frame_type(rf);
  // Reset mode context and mark the ref-mv count as uncomputed before
  // (re)building the reference MV stack for this compound type.
  mbmi_ext->mode_context[ref_frame_comp] = 0;
  mbmi_ext->ref_mv_count[ref_frame_comp] = UINT8_MAX;
  av1_find_mv_refs(cm, xd, mbmi, ref_frame_comp, mbmi_ext->ref_mv_count,
                   xd->ref_mv_stack, xd->weight, NULL, mbmi_ext->global_mvs,
                   mbmi_ext->mode_context);
  av1_copy_usable_ref_mv_stack_and_weight(xd, mbmi_ext, ref_frame_comp);
  *ref_mv_idx = mbmi->ref_mv_idx + 1;
}
3111 
3112 static void set_compound_mode(MACROBLOCK *x, int ref_frame, int ref_frame2,
3113                               int ref_mv_idx,
3114                               int_mv frame_mv[MB_MODE_COUNT][REF_FRAMES],
3115                               PREDICTION_MODE this_mode) {
3116   MACROBLOCKD *const xd = &x->e_mbd;
3117   MB_MODE_INFO *const mi = xd->mi[0];
3118   mi->ref_frame[0] = ref_frame;
3119   mi->ref_frame[1] = ref_frame2;
3120   mi->compound_idx = 1;
3121   mi->comp_group_idx = 0;
3122   mi->interinter_comp.type = COMPOUND_AVERAGE;
3123   MV_REFERENCE_FRAME ref_frame_comp = av1_ref_frame_type(mi->ref_frame);
3124   if (this_mode == GLOBAL_GLOBALMV) {
3125     frame_mv[this_mode][ref_frame].as_int = 0;
3126     frame_mv[this_mode][ref_frame2].as_int = 0;
3127   } else if (this_mode == NEAREST_NEARESTMV) {
3128     frame_mv[this_mode][ref_frame].as_int =
3129         xd->ref_mv_stack[ref_frame_comp][0].this_mv.as_int;
3130     frame_mv[this_mode][ref_frame2].as_int =
3131         xd->ref_mv_stack[ref_frame_comp][0].comp_mv.as_int;
3132   } else if (this_mode == NEAR_NEARMV) {
3133     frame_mv[this_mode][ref_frame].as_int =
3134         xd->ref_mv_stack[ref_frame_comp][ref_mv_idx].this_mv.as_int;
3135     frame_mv[this_mode][ref_frame2].as_int =
3136         xd->ref_mv_stack[ref_frame_comp][ref_mv_idx].comp_mv.as_int;
3137   }
3138 }
3139 
3140 // Prune compound mode if the single mode variance is lower than a fixed
3141 // percentage of the median value.
3142 static bool skip_comp_based_on_var(
3143     const unsigned int (*single_vars)[REF_FRAMES], BLOCK_SIZE bsize) {
3144   unsigned int best_var = UINT_MAX;
3145   for (int cur_mode_idx = 0; cur_mode_idx < RTC_INTER_MODES; cur_mode_idx++) {
3146     for (int ref_idx = 0; ref_idx < REF_FRAMES; ref_idx++) {
3147       best_var = AOMMIN(best_var, single_vars[cur_mode_idx][ref_idx]);
3148     }
3149   }
3150   const unsigned int thresh_64 = (unsigned int)(0.57356805f * 8659);
3151   const unsigned int thresh_32 = (unsigned int)(0.23964763f * 4281);
3152 
3153   // Currently, the thresh for 128 and 16 are not well-tuned. We are using the
3154   // results from 64 and 32 as an heuristic.
3155   switch (bsize) {
3156     case BLOCK_128X128: return best_var < 4 * thresh_64;
3157     case BLOCK_64X64: return best_var < thresh_64;
3158     case BLOCK_32X32: return best_var < thresh_32;
3159     case BLOCK_16X16: return best_var < thresh_32 / 4;
3160     default: return false;
3161   }
3162 }
3163 
3164 static AOM_FORCE_INLINE void fill_single_inter_mode_costs(
3165     int (*single_inter_mode_costs)[REF_FRAMES], const int num_inter_modes,
3166     const REF_MODE *reference_mode_set, const ModeCosts *mode_costs,
3167     const int16_t *mode_context) {
3168   bool ref_frame_used[REF_FRAMES] = { false };
3169   for (int idx = 0; idx < num_inter_modes; idx++) {
3170     ref_frame_used[reference_mode_set[idx].ref_frame] = true;
3171   }
3172 
3173   for (int this_ref_frame = LAST_FRAME; this_ref_frame < REF_FRAMES;
3174        this_ref_frame++) {
3175     if (!ref_frame_used[this_ref_frame]) {
3176       continue;
3177     }
3178 
3179     const MV_REFERENCE_FRAME rf[2] = { this_ref_frame, NONE_FRAME };
3180     const int16_t mode_ctx = av1_mode_context_analyzer(mode_context, rf);
3181     for (PREDICTION_MODE this_mode = NEARESTMV; this_mode <= NEWMV;
3182          this_mode++) {
3183       single_inter_mode_costs[INTER_OFFSET(this_mode)][this_ref_frame] =
3184           cost_mv_ref(mode_costs, this_mode, mode_ctx);
3185     }
3186   }
3187 }
3188 
3189 static AOM_INLINE bool is_globalmv_better(
3190     PREDICTION_MODE this_mode, MV_REFERENCE_FRAME ref_frame, int rate_mv,
3191     const ModeCosts *mode_costs,
3192     const int (*single_inter_mode_costs)[REF_FRAMES],
3193     const MB_MODE_INFO_EXT *mbmi_ext) {
3194   const int globalmv_mode_cost =
3195       single_inter_mode_costs[INTER_OFFSET(GLOBALMV)][ref_frame];
3196   int this_mode_cost =
3197       rate_mv + single_inter_mode_costs[INTER_OFFSET(this_mode)][ref_frame];
3198   if (this_mode == NEWMV || this_mode == NEARMV) {
3199     const MV_REFERENCE_FRAME rf[2] = { ref_frame, NONE_FRAME };
3200     this_mode_cost += get_drl_cost(
3201         NEWMV, 0, mbmi_ext, mode_costs->drl_mode_cost0, av1_ref_frame_type(rf));
3202   }
3203   return this_mode_cost > globalmv_mode_cost;
3204 }
3205 
3206 // Set up the mv/ref_frames etc based on the comp_index. Returns 1 if it
3207 // succeeds, 0 if it fails.
3208 static AOM_INLINE int setup_compound_params_from_comp_idx(
3209     const AV1_COMP *cpi, MACROBLOCK *x, struct buf_2d yv12_mb[8][MAX_MB_PLANE],
3210     PREDICTION_MODE *this_mode, MV_REFERENCE_FRAME *ref_frame,
3211     MV_REFERENCE_FRAME *ref_frame2, int_mv frame_mv[MB_MODE_COUNT][REF_FRAMES],
3212     const int *use_ref_frame_mask, int comp_index,
3213     bool comp_use_zero_zeromv_only, MV_REFERENCE_FRAME *last_comp_ref_frame) {
3214   const MV_REFERENCE_FRAME *rf = comp_ref_mode_set[comp_index].ref_frame;
3215   *this_mode = comp_ref_mode_set[comp_index].pred_mode;
3216   *ref_frame = rf[0];
3217   *ref_frame2 = rf[1];
3218   assert(*ref_frame == LAST_FRAME);
3219   assert(*this_mode == GLOBAL_GLOBALMV || *this_mode == NEAREST_NEARESTMV);
3220   if (comp_use_zero_zeromv_only && *this_mode != GLOBAL_GLOBALMV) {
3221     return 0;
3222   }
3223   if (*ref_frame2 == GOLDEN_FRAME &&
3224       (cpi->sf.rt_sf.ref_frame_comp_nonrd[0] == 0 ||
3225        !(cpi->ref_frame_flags & AOM_GOLD_FLAG))) {
3226     return 0;
3227   } else if (*ref_frame2 == LAST2_FRAME &&
3228              (cpi->sf.rt_sf.ref_frame_comp_nonrd[1] == 0 ||
3229               !(cpi->ref_frame_flags & AOM_LAST2_FLAG))) {
3230     return 0;
3231   } else if (*ref_frame2 == ALTREF_FRAME &&
3232              (cpi->sf.rt_sf.ref_frame_comp_nonrd[2] == 0 ||
3233               !(cpi->ref_frame_flags & AOM_ALT_FLAG))) {
3234     return 0;
3235   }
3236   int ref_mv_idx = 0;
3237   if (*last_comp_ref_frame != rf[1]) {
3238     // Only needs to be done once per reference pair.
3239     setup_compound_prediction(&cpi->common, x, yv12_mb, use_ref_frame_mask, rf,
3240                               &ref_mv_idx);
3241     *last_comp_ref_frame = rf[1];
3242   }
3243   set_compound_mode(x, *ref_frame, *ref_frame2, ref_mv_idx, frame_mv,
3244                     *this_mode);
3245   if (*this_mode != GLOBAL_GLOBALMV &&
3246       frame_mv[*this_mode][*ref_frame].as_int == 0 &&
3247       frame_mv[*this_mode][*ref_frame2].as_int == 0) {
3248     return 0;
3249   }
3250 
3251   return 1;
3252 }
3253 
3254 static AOM_INLINE bool previous_mode_performed_poorly(
3255     PREDICTION_MODE mode, MV_REFERENCE_FRAME ref_frame,
3256     const unsigned int (*vars)[REF_FRAMES],
3257     const int64_t (*uv_dist)[REF_FRAMES]) {
3258   unsigned int best_var = UINT_MAX;
3259   int64_t best_uv_dist = INT64_MAX;
3260   for (int midx = 0; midx < RTC_INTER_MODES; midx++) {
3261     best_var = AOMMIN(best_var, vars[midx][ref_frame]);
3262     best_uv_dist = AOMMIN(best_uv_dist, uv_dist[midx][ref_frame]);
3263   }
3264   assert(best_var != UINT_MAX && "Invalid variance data.");
3265   const float mult = 1.125f;
3266   bool var_bad = mult * best_var < vars[INTER_OFFSET(mode)][ref_frame];
3267   if (uv_dist[INTER_OFFSET(mode)][ref_frame] < INT64_MAX &&
3268       best_uv_dist != uv_dist[INTER_OFFSET(mode)][ref_frame]) {
3269     // If we have chroma info, then take it into account
3270     var_bad &= mult * best_uv_dist < uv_dist[INTER_OFFSET(mode)][ref_frame];
3271   }
3272   return var_bad;
3273 }
3274 
3275 static AOM_INLINE bool prune_compoundmode_with_singlemode_var(
3276     PREDICTION_MODE compound_mode, MV_REFERENCE_FRAME ref_frame,
3277     MV_REFERENCE_FRAME ref_frame2, const int_mv (*frame_mv)[REF_FRAMES],
3278     const uint8_t (*mode_checked)[REF_FRAMES],
3279     const unsigned int (*vars)[REF_FRAMES],
3280     const int64_t (*uv_dist)[REF_FRAMES]) {
3281   const PREDICTION_MODE single_mode0 = compound_ref0_mode(compound_mode);
3282   const PREDICTION_MODE single_mode1 = compound_ref1_mode(compound_mode);
3283 
3284   bool first_ref_valid = false, second_ref_valid = false;
3285   bool first_ref_bad = false, second_ref_bad = false;
3286   if (mode_checked[single_mode0][ref_frame] &&
3287       frame_mv[single_mode0][ref_frame].as_int ==
3288           frame_mv[compound_mode][ref_frame].as_int &&
3289       vars[INTER_OFFSET(single_mode0)][ref_frame] < UINT_MAX) {
3290     first_ref_valid = true;
3291     first_ref_bad =
3292         previous_mode_performed_poorly(single_mode0, ref_frame, vars, uv_dist);
3293   }
3294   if (mode_checked[single_mode1][ref_frame2] &&
3295       frame_mv[single_mode1][ref_frame2].as_int ==
3296           frame_mv[compound_mode][ref_frame2].as_int &&
3297       vars[INTER_OFFSET(single_mode1)][ref_frame2] < UINT_MAX) {
3298     second_ref_valid = true;
3299     second_ref_bad =
3300         previous_mode_performed_poorly(single_mode1, ref_frame2, vars, uv_dist);
3301   }
3302   if (first_ref_valid && second_ref_valid) {
3303     return first_ref_bad && second_ref_bad;
3304   } else if (first_ref_valid || second_ref_valid) {
3305     return first_ref_bad || second_ref_bad;
3306   }
3307   return false;
3308 }
3309 
// Function to setup parameters used for inter mode evaluation.
// Resets the per-block search state (per-(mode, ref) variance and UV
// distortion tables, best-pickmode record, RD stats), estimates
// single-reference-frame costs, and fills search_state->frame_mv with MV
// predictors for every reference frame permitted by use_ref_frame_mask.
// Outputs written: *force_skip_low_temp_var (via get_ref_frame_use_mask)
// and *skip_pred_mv.
static AOM_FORCE_INLINE void set_params_nonrd_pick_inter_mode(
    AV1_COMP *cpi, MACROBLOCK *x, InterModeSearchStateNonrd *search_state,
    TileDataEnc *tile_data, PICK_MODE_CONTEXT *ctx, RD_STATS *rd_cost,
    int *force_skip_low_temp_var, int *skip_pred_mv, const int mi_row,
    const int mi_col, const int gf_temporal_ref, const unsigned char segment_id,
    BLOCK_SIZE bsize
#if CONFIG_AV1_TEMPORAL_DENOISING
    ,
    int denoise_svc_pickmode
#endif
) {
  AV1_COMMON *const cm = &cpi->common;
  MACROBLOCKD *const xd = &x->e_mbd;
  TxfmSearchInfo *txfm_info = &x->txfm_search_info;
  MB_MODE_INFO *const mi = xd->mi[0];
  const ModeCosts *mode_costs = &x->mode_costs;
  // ctx is only referenced when CONFIG_AV1_TEMPORAL_DENOISING is enabled.
  (void)ctx;

  // Mark every (inter mode, reference) pair as "not yet evaluated":
  // UINT_MAX / INT64_MAX act as sentinels for the pruning logic later on.
  for (int idx = 0; idx < RTC_INTER_MODES; idx++) {
    for (int ref = 0; ref < REF_FRAMES; ref++) {
      search_state->vars[idx][ref] = UINT_MAX;
      search_state->uv_dist[idx][ref] = INT64_MAX;
    }
  }

  // Seed block-level color sensitivity from the superblock-level decision.
  x->color_sensitivity[0] = x->color_sensitivity_sb[0];
  x->color_sensitivity[1] = x->color_sensitivity_sb[1];
  init_best_pickmode(&search_state->best_pickmode);

  estimate_single_ref_frame_costs(cm, xd, mode_costs, segment_id, bsize,
                                  search_state->ref_costs_single);

  // No (mode, ref) combination has been checked yet.
  memset(&search_state->mode_checked[0][0], 0, MB_MODE_COUNT * REF_FRAMES);

  txfm_info->skip_txfm = 0;

  // initialize mode decisions
  av1_invalid_rd_stats(&search_state->best_rdc);
  av1_invalid_rd_stats(&search_state->this_rdc);
  av1_invalid_rd_stats(rd_cost);
  // Invalidate the cached warped-motion sample counts for all references.
  for (int i = 0; i < REF_FRAMES; ++i) {
    x->warp_sample_info[i].num = -1;
  }

  mi->bsize = bsize;
  mi->ref_frame[0] = NONE_FRAME;
  mi->ref_frame[1] = NONE_FRAME;

#if CONFIG_AV1_TEMPORAL_DENOISING
  if (cpi->oxcf.noise_sensitivity > 0) {
    // if (cpi->ppi->use_svc) denoise_svc_pickmode =
    // av1_denoise_svc_non_key(cpi);
    if (cpi->denoiser.denoising_level > kDenLowLow && denoise_svc_pickmode)
      av1_denoiser_reset_frame_stats(ctx);
  }
#endif

  // Find MV predictors for LAST_FRAME first: its result (pred_mv sad, color
  // sensitivity) feeds the skip_pred_mv decision applied to the other
  // references below.
  if (cpi->ref_frame_flags & AOM_LAST_FLAG)
    find_predictors(cpi, x, LAST_FRAME, search_state->frame_mv, tile_data,
                    search_state->yv12_mb, bsize, *force_skip_low_temp_var,
                    x->force_zeromv_skip_for_blk);

  get_ref_frame_use_mask(cpi, x, mi, mi_row, mi_col, bsize, gf_temporal_ref,
                         search_state->use_ref_frame_mask,
                         force_skip_low_temp_var);

  // Skip the MV-predictor search for non-LAST references when zero-MV skip is
  // forced for the block, or when aggressive reference pruning is enabled and
  // color sensitivity is not at its maximum level (2) on either chroma plane.
  *skip_pred_mv =
      x->force_zeromv_skip_for_blk ||
      (x->nonrd_prune_ref_frame_search > 2 && x->color_sensitivity[0] != 2 &&
       x->color_sensitivity[1] != 2);

  // Start at LAST_FRAME + 1.
  for (MV_REFERENCE_FRAME ref_frame_iter = LAST_FRAME + 1;
       ref_frame_iter <= ALTREF_FRAME; ++ref_frame_iter) {
    if (search_state->use_ref_frame_mask[ref_frame_iter]) {
      find_predictors(cpi, x, ref_frame_iter, search_state->frame_mv, tile_data,
                      search_state->yv12_mb, bsize, *force_skip_low_temp_var,
                      *skip_pred_mv);
    }
  }
}
3392 
// Function to check the inter mode can be skipped based on mode statistics and
// speed features settings.
// Decodes loop index idx into the candidate to test: indices >=
// num_inter_modes select a compound mode (sets *comp_pred = 1); smaller
// indices read a single-reference mode from ref_mode_set. Returns true if the
// candidate should be skipped by any of the pruning rules below, false if it
// should be fully evaluated. Side effects: writes mi->mode / mi->ref_frame,
// *force_mv_inter_layer, and for SVC enhancement layers may overwrite the
// NEWMV entry of search_state->frame_mv with (svc_mv_col, svc_mv_row).
// NOTE(review): the order of the checks matters — later rules assume earlier
// ones have already passed.
static AOM_FORCE_INLINE bool skip_inter_mode_nonrd(
    AV1_COMP *cpi, MACROBLOCK *x, InterModeSearchStateNonrd *search_state,
    int64_t *thresh_sad_pred, int *force_mv_inter_layer, int *comp_pred,
    PREDICTION_MODE *this_mode, MV_REFERENCE_FRAME *last_comp_ref_frame,
    MV_REFERENCE_FRAME *ref_frame, MV_REFERENCE_FRAME *ref_frame2, int idx,
    int svc_mv_col, int svc_mv_row, int force_skip_low_temp_var,
    unsigned int sse_zeromv_norm, const int num_inter_modes,
    const unsigned char segment_id, BLOCK_SIZE bsize,
    bool comp_use_zero_zeromv_only, bool check_globalmv) {
  AV1_COMMON *const cm = &cpi->common;
  const struct segmentation *const seg = &cm->seg;
  const SVC *const svc = &cpi->svc;
  MACROBLOCKD *const xd = &x->e_mbd;
  MB_MODE_INFO *const mi = xd->mi[0];

  // Indices past the single-reference modes select compound candidates.
  if (idx >= num_inter_modes) {
    const int comp_index = idx - num_inter_modes;
    if (!setup_compound_params_from_comp_idx(
            cpi, x, search_state->yv12_mb, this_mode, ref_frame, ref_frame2,
            search_state->frame_mv, search_state->use_ref_frame_mask,
            comp_index, comp_use_zero_zeromv_only, last_comp_ref_frame)) {
      return true;
    }
    *comp_pred = 1;
  } else {
    *this_mode = ref_mode_set[idx].pred_mode;
    *ref_frame = ref_mode_set[idx].ref_frame;
    *ref_frame2 = NONE_FRAME;
  }

  // Skip single-reference modes already evaluated earlier in the mode loop.
  if (!*comp_pred && search_state->mode_checked[*this_mode][*ref_frame]) {
    return true;
  }

  // GLOBALMV is only tested when the caller asked for it.
  if (!check_globalmv && *this_mode == GLOBALMV) {
    return true;
  }

#if COLLECT_PICK_MODE_STAT
  aom_usec_timer_start(&ms_stat.timer1);
  ms_stat.num_searches[bsize][*this_mode]++;
#endif
  mi->mode = *this_mode;
  mi->ref_frame[0] = *ref_frame;
  mi->ref_frame[1] = *ref_frame2;

  if (!search_state->use_ref_frame_mask[*ref_frame]) return true;

  // When zero-MV skip is forced for this block, only allow zero-MV NEARESTMV
  // or GLOBALMV on LAST_FRAME.
  if (x->force_zeromv_skip_for_blk &&
      ((!(*this_mode == NEARESTMV &&
          search_state->frame_mv[*this_mode][*ref_frame].as_int == 0) &&
        *this_mode != GLOBALMV) ||
       *ref_frame != LAST_FRAME))
    return true;

  // Prune a compound mode when its constituent single-reference modes have
  // already been evaluated and performed poorly.
  if (cpi->sf.rt_sf.prune_compoundmode_with_singlemode_var && *comp_pred &&
      prune_compoundmode_with_singlemode_var(
          *this_mode, *ref_frame, *ref_frame2, search_state->frame_mv,
          search_state->mode_checked, search_state->vars,
          search_state->uv_dist)) {
    return true;
  }

  *force_mv_inter_layer = 0;
  if (cpi->ppi->use_svc && svc->spatial_layer_id > 0 &&
      ((*ref_frame == LAST_FRAME && svc->skip_mvsearch_last) ||
       (*ref_frame == GOLDEN_FRAME && svc->skip_mvsearch_gf) ||
       (*ref_frame == ALTREF_FRAME && svc->skip_mvsearch_altref))) {
    // Only test mode if NEARESTMV/NEARMV is (svc_mv_col, svc_mv_row),
    // otherwise set NEWMV to (svc_mv_col, svc_mv_row).
    // Skip newmv and filter search.
    *force_mv_inter_layer = 1;
    if (*this_mode == NEWMV) {
      search_state->frame_mv[*this_mode][*ref_frame].as_mv.col = svc_mv_col;
      search_state->frame_mv[*this_mode][*ref_frame].as_mv.row = svc_mv_row;
    } else if (search_state->frame_mv[*this_mode][*ref_frame].as_mv.col !=
                   svc_mv_col ||
               search_state->frame_mv[*this_mode][*ref_frame].as_mv.row !=
                   svc_mv_row) {
      return true;
    }
  }

  // If the segment reference frame feature is enabled then do nothing if the
  // current ref frame is not allowed.
  if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
      get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)(*ref_frame))
    return true;

  // For screen content: for base spatial layer only for now.
  if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN &&
      cpi->svc.spatial_layer_id == 0) {
    // If source_sad is computed: skip non-zero motion
    // check for stationary (super)blocks. Otherwise if superblock
    // has motion skip the modes with zero motion for flat blocks,
    // and color is not set.
    // For the latter condition: the same condition should apply
    // to newmv if (0, 0), so this latter condition is repeated
    // below after search_new_mv.
    if (cpi->sf.rt_sf.source_metrics_sb_nonrd) {
      if ((search_state->frame_mv[*this_mode][*ref_frame].as_int != 0 &&
           x->content_state_sb.source_sad_nonrd == kZeroSad) ||
          (search_state->frame_mv[*this_mode][*ref_frame].as_int == 0 &&
           x->content_state_sb.source_sad_nonrd != kZeroSad &&
           ((x->color_sensitivity[0] == 0 && x->color_sensitivity[1] == 0) ||
            cpi->rc.high_source_sad) &&
           x->source_variance == 0))
        return true;
    }
    // Skip NEWMV search for flat blocks.
    if (*this_mode == NEWMV && x->source_variance < 100) return true;
    // Skip non-LAST for color on flat blocks.
    if (*ref_frame > LAST_FRAME && x->source_variance == 0 &&
        (x->color_sensitivity[0] == 1 || x->color_sensitivity[1] == 1))
      return true;
  }

  // Prune by block size / reference combination and normalized zero-MV SSE.
  if (skip_mode_by_bsize_and_ref_frame(
          *this_mode, *ref_frame, bsize, x->nonrd_prune_ref_frame_search,
          sse_zeromv_norm, cpi->sf.rt_sf.nonrd_aggressive_skip))
    return true;

  // Prune modes for low-temporal-variance blocks.
  if (skip_mode_by_low_temp(*this_mode, *ref_frame, bsize, x->content_state_sb,
                            search_state->frame_mv[*this_mode][*ref_frame],
                            force_skip_low_temp_var))
    return true;

  // Disable this drop out case if the ref frame segment level feature is
  // enabled for this segment. This is to prevent the possibility that we
  // end up unable to pick any mode.
  if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) {
    // Check for skipping GOLDEN and ALTREF based pred_mv_sad.
    if (cpi->sf.rt_sf.nonrd_prune_ref_frame_search > 0 &&
        x->pred_mv_sad[*ref_frame] != INT_MAX && *ref_frame != LAST_FRAME) {
      if ((int64_t)(x->pred_mv_sad[*ref_frame]) > *thresh_sad_pred) return true;
    }
  }

  // Check for skipping NEARMV based on pred_mv_sad.
  // Skip NEARMV when its predictor SAD is more than double NEARESTMV's.
  if (*this_mode == NEARMV && x->pred_mv1_sad[*ref_frame] != INT_MAX &&
      x->pred_mv1_sad[*ref_frame] > (x->pred_mv0_sad[*ref_frame] << 1))
    return true;

  // Threshold-based pruning against the best RD cost found so far (single
  // reference modes only).
  if (!*comp_pred) {
    if (skip_mode_by_threshold(
            *this_mode, *ref_frame,
            search_state->frame_mv[*this_mode][*ref_frame],
            cpi->rc.frames_since_golden, cpi->rd.threshes[segment_id][bsize],
            x->thresh_freq_fact[bsize], search_state->best_rdc.rdcost,
            search_state->best_pickmode.best_mode_skip_txfm,
            (cpi->sf.rt_sf.nonrd_aggressive_skip ? 1 : 0)))
      return true;
  }
  return false;
}
3550 
3551 void av1_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
3552                                   MACROBLOCK *x, RD_STATS *rd_cost,
3553                                   BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) {
3554   AV1_COMMON *const cm = &cpi->common;
3555   SVC *const svc = &cpi->svc;
3556   MACROBLOCKD *const xd = &x->e_mbd;
3557   MB_MODE_INFO *const mi = xd->mi[0];
3558   struct macroblockd_plane *const pd = &xd->plane[0];
3559   const MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
3560   const InterpFilter filter_ref = cm->features.interp_filter;
3561   const InterpFilter default_interp_filter = EIGHTTAP_REGULAR;
3562   MV_REFERENCE_FRAME ref_frame, ref_frame2;
3563   const unsigned char segment_id = mi->segment_id;
3564   int best_early_term = 0;
3565   int force_skip_low_temp_var = 0;
3566   unsigned int sse_zeromv_norm = UINT_MAX;
3567   int skip_pred_mv = 0;
3568   const int num_inter_modes = NUM_INTER_MODES;
3569   bool check_globalmv = cpi->sf.rt_sf.check_globalmv_on_single_ref;
3570   PRED_BUFFER tmp_buffer[4];
3571   DECLARE_ALIGNED(16, uint8_t, pred_buf[3 * 128 * 128]);
3572   PRED_BUFFER *this_mode_pred = NULL;
3573   const int reuse_inter_pred = cpi->sf.rt_sf.reuse_inter_pred_nonrd &&
3574                                cm->seq_params->bit_depth == AOM_BITS_8;
3575   InterModeSearchStateNonrd search_state;
3576   av1_zero(search_state.use_ref_frame_mask);
3577 
3578   const int bh = block_size_high[bsize];
3579   const int bw = block_size_wide[bsize];
3580   const int pixels_in_block = bh * bw;
3581   const int num_8x8_blocks = ctx->num_4x4_blk / 4;
3582   struct buf_2d orig_dst = pd->dst;
3583   const TxfmSearchParams *txfm_params = &x->txfm_search_params;
3584   TxfmSearchInfo *txfm_info = &x->txfm_search_info;
3585 #if COLLECT_PICK_MODE_STAT
3586   aom_usec_timer_start(&ms_stat.bsize_timer);
3587 #endif
3588   int64_t thresh_sad_pred = INT64_MAX;
3589   const int mi_row = xd->mi_row;
3590   const int mi_col = xd->mi_col;
3591   int svc_mv_col = 0;
3592   int svc_mv_row = 0;
3593   int force_mv_inter_layer = 0;
3594   bool comp_use_zero_zeromv_only = 0;
3595   int tot_num_comp_modes = NUM_COMP_INTER_MODES_RT;
3596 #if CONFIG_AV1_TEMPORAL_DENOISING
3597   const int denoise_recheck_zeromv = 1;
3598   AV1_PICKMODE_CTX_DEN ctx_den;
3599   int64_t zero_last_cost_orig = INT64_MAX;
3600   int denoise_svc_pickmode = 1;
3601   const int resize_pending = is_frame_resize_pending(cpi);
3602 #endif
3603   const ModeCosts *mode_costs = &x->mode_costs;
3604 
3605   if (reuse_inter_pred) {
3606     for (int i = 0; i < 3; i++) {
3607       tmp_buffer[i].data = &pred_buf[pixels_in_block * i];
3608       tmp_buffer[i].stride = bw;
3609       tmp_buffer[i].in_use = 0;
3610     }
3611     tmp_buffer[3].data = pd->dst.buf;
3612     tmp_buffer[3].stride = pd->dst.stride;
3613     tmp_buffer[3].in_use = 0;
3614   }
3615 
3616   const int gf_temporal_ref = is_same_gf_and_last_scale(cm);
3617 
3618   // If the lower spatial layer uses an averaging filter for downsampling
3619   // (phase = 8), the target decimated pixel is shifted by (1/2, 1/2) relative
3620   // to source, so use subpel motion vector to compensate. The nonzero motion
3621   // is half pixel shifted to left and top, so (-4, -4). This has more effect
3622   // on higher resolutions, so condition it on that for now.
3623   if (cpi->ppi->use_svc && svc->spatial_layer_id > 0 &&
3624       svc->downsample_filter_phase[svc->spatial_layer_id - 1] == 8 &&
3625       cm->width * cm->height > 640 * 480) {
3626     svc_mv_col = -4;
3627     svc_mv_row = -4;
3628   }
3629 
3630   // Setup parameters used for inter mode evaluation.
3631   set_params_nonrd_pick_inter_mode(
3632       cpi, x, &search_state, tile_data, ctx, rd_cost, &force_skip_low_temp_var,
3633       &skip_pred_mv, mi_row, mi_col, gf_temporal_ref, segment_id, bsize
3634 #if CONFIG_AV1_TEMPORAL_DENOISING
3635       ,
3636       denoise_svc_pickmode
3637 #endif
3638   );
3639 
3640   if (cpi->sf.rt_sf.use_comp_ref_nonrd && is_comp_ref_allowed(bsize)) {
    // Only search compound if bsize > BLOCK_16X16.
3642     if (bsize > BLOCK_16X16) {
3643       comp_use_zero_zeromv_only =
3644           cpi->sf.rt_sf.check_only_zero_zeromv_on_large_blocks;
3645     } else {
3646       tot_num_comp_modes = 0;
3647     }
3648   } else {
3649     tot_num_comp_modes = 0;
3650   }
3651 
3652   if (x->pred_mv_sad[LAST_FRAME] != INT_MAX) {
3653     thresh_sad_pred = ((int64_t)x->pred_mv_sad[LAST_FRAME]) << 1;
3654     // Increase threshold for less aggressive pruning.
3655     if (cpi->sf.rt_sf.nonrd_prune_ref_frame_search == 1)
3656       thresh_sad_pred += (x->pred_mv_sad[LAST_FRAME] >> 2);
3657   }
3658 
3659   const int use_model_yrd_large = get_model_rd_flag(cpi, xd, bsize);
3660 
3661   // decide block-level interp filter search flags:
3662   // filter_search_enabled_blk:
3663   // 0: disabled
3664   // 1: filter search depends on mode properties
3665   // 2: filter search forced since prediction is unreliable
3666   // cb_pred_filter_search 0: disabled cb prediction
3667   InterpFilter filt_select = EIGHTTAP_REGULAR;
3668   const int cb_pred_filter_search =
3669       x->content_state_sb.source_sad_nonrd > kVeryLowSad
3670           ? cpi->sf.interp_sf.cb_pred_filter_search
3671           : 0;
3672   const int filter_search_enabled_blk =
3673       is_filter_search_enabled_blk(cpi, x, mi_row, mi_col, bsize, segment_id,
3674                                    cb_pred_filter_search, &filt_select);
3675 
3676 #if COLLECT_PICK_MODE_STAT
3677   ms_stat.num_blocks[bsize]++;
3678 #endif
3679   init_mbmi(mi, DC_PRED, NONE_FRAME, NONE_FRAME, cm);
3680   mi->tx_size = AOMMIN(
3681       AOMMIN(max_txsize_lookup[bsize],
3682              tx_mode_to_biggest_tx_size[txfm_params->tx_mode_search_type]),
3683       TX_16X16);
3684 
3685   fill_single_inter_mode_costs(search_state.single_inter_mode_costs,
3686                                num_inter_modes, ref_mode_set, mode_costs,
3687                                mbmi_ext->mode_context);
3688 
3689   MV_REFERENCE_FRAME last_comp_ref_frame = NONE_FRAME;
3690 
3691   // Initialize inter prediction params at block level for single reference
3692   // mode.
3693   InterPredParams inter_pred_params_sr;
3694   init_inter_block_params(&inter_pred_params_sr, pd->width, pd->height,
3695                           mi_row * MI_SIZE, mi_col * MI_SIZE, pd->subsampling_x,
3696                           pd->subsampling_y, xd->bd, is_cur_buf_hbd(xd),
3697                           /*is_intrabc=*/0);
3698   inter_pred_params_sr.conv_params =
3699       get_conv_params(/*do_average=*/0, AOM_PLANE_Y, xd->bd);
3700 
3701   for (int idx = 0; idx < num_inter_modes + tot_num_comp_modes; ++idx) {
3702     // If we are at the first compound mode, and the single modes already
3703     // perform well, then end the search.
3704     if (cpi->sf.rt_sf.skip_compound_based_on_var && idx == num_inter_modes &&
3705         skip_comp_based_on_var(search_state.vars, bsize)) {
3706       break;
3707     }
3708 
3709     int rate_mv = 0;
3710     int is_skippable;
3711     int this_early_term = 0;
3712     int skip_this_mv = 0;
3713     int comp_pred = 0;
3714     unsigned int var = UINT_MAX;
3715     PREDICTION_MODE this_mode;
3716     RD_STATS nonskip_rdc;
3717     av1_invalid_rd_stats(&nonskip_rdc);
3718     memset(txfm_info->blk_skip, 0,
3719            sizeof(txfm_info->blk_skip[0]) * num_8x8_blocks);
3720 
3721     // Check the inter mode can be skipped based on mode statistics and speed
3722     // features settings.
3723     if (skip_inter_mode_nonrd(
3724             cpi, x, &search_state, &thresh_sad_pred, &force_mv_inter_layer,
3725             &comp_pred, &this_mode, &last_comp_ref_frame, &ref_frame,
3726             &ref_frame2, idx, svc_mv_col, svc_mv_row, force_skip_low_temp_var,
3727             sse_zeromv_norm, num_inter_modes, segment_id, bsize,
3728             comp_use_zero_zeromv_only, check_globalmv))
3729       continue;
3730 
3731     // Select prediction reference frames.
3732     for (int i = 0; i < MAX_MB_PLANE; i++) {
3733       xd->plane[i].pre[0] = search_state.yv12_mb[ref_frame][i];
3734       if (comp_pred) xd->plane[i].pre[1] = search_state.yv12_mb[ref_frame2][i];
3735     }
3736 
3737     mi->ref_frame[0] = ref_frame;
3738     mi->ref_frame[1] = ref_frame2;
3739     set_ref_ptrs(cm, xd, ref_frame, ref_frame2);
3740 
3741     if (this_mode == NEWMV && !force_mv_inter_layer) {
3742 #if COLLECT_PICK_MODE_STAT
3743       aom_usec_timer_start(&ms_stat.timer2);
3744 #endif
3745       const bool skip_newmv = search_new_mv(
3746           cpi, x, search_state.frame_mv, ref_frame, gf_temporal_ref, bsize,
3747           mi_row, mi_col, &rate_mv, &search_state.best_rdc);
3748 #if COLLECT_PICK_MODE_STAT
3749       aom_usec_timer_mark(&ms_stat.timer2);
3750       ms_stat.ms_time[bsize][this_mode] +=
3751           aom_usec_timer_elapsed(&ms_stat.timer2);
3752 #endif
3753       if (skip_newmv) {
3754         continue;
3755       }
3756     }
3757 
3758     for (PREDICTION_MODE inter_mv_mode = NEARESTMV; inter_mv_mode <= NEWMV;
3759          inter_mv_mode++) {
3760       if (inter_mv_mode == this_mode) continue;
3761       if (!comp_pred && search_state.mode_checked[inter_mv_mode][ref_frame] &&
3762           search_state.frame_mv[this_mode][ref_frame].as_int ==
3763               search_state.frame_mv[inter_mv_mode][ref_frame].as_int) {
3764         skip_this_mv = 1;
3765         break;
3766       }
3767     }
3768 
3769     if (skip_this_mv && !comp_pred) continue;
3770 
3771     // For screen: for spatially flat blocks with non-zero motion,
3772     // skip newmv if the motion vector is (0, 0), and color is not set.
3773     if (this_mode == NEWMV &&
3774         cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN &&
3775         cpi->svc.spatial_layer_id == 0 &&
3776         cpi->sf.rt_sf.source_metrics_sb_nonrd) {
3777       if (search_state.frame_mv[this_mode][ref_frame].as_int == 0 &&
3778           x->content_state_sb.source_sad_nonrd != kZeroSad &&
3779           ((x->color_sensitivity[0] == 0 && x->color_sensitivity[1] == 0) ||
3780            cpi->rc.high_source_sad) &&
3781           x->source_variance == 0)
3782         continue;
3783     }
3784 
3785     mi->mode = this_mode;
3786     mi->mv[0].as_int = search_state.frame_mv[this_mode][ref_frame].as_int;
3787     mi->mv[1].as_int = 0;
3788     if (comp_pred)
3789       mi->mv[1].as_int = search_state.frame_mv[this_mode][ref_frame2].as_int;
3790 
3791     if (reuse_inter_pred) {
3792       if (!this_mode_pred) {
3793         this_mode_pred = &tmp_buffer[3];
3794       } else {
3795         this_mode_pred = &tmp_buffer[get_pred_buffer(tmp_buffer, 3)];
3796         pd->dst.buf = this_mode_pred->data;
3797         pd->dst.stride = bw;
3798       }
3799     }
3800 
3801     if (idx == 0 && !skip_pred_mv) {
3802       // Set color sensitivity on first tested mode only.
3803       // Use y-sad already computed in find_predictors: take the sad with motion
3804       // vector closest to 0; the uv-sad computed below in set_color_sensitivity
3805       // is for zeromv.
3806       // For screen: first check if golden reference is being used, if so,
3807       // force color_sensitivity on if the color sensitivity for sb_g is on.
3808       if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN &&
3809           search_state.use_ref_frame_mask[GOLDEN_FRAME]) {
3810         if (x->color_sensitivity_sb_g[0] == 1) x->color_sensitivity[0] = 1;
3811         if (x->color_sensitivity_sb_g[1] == 1) x->color_sensitivity[1] = 1;
3812       } else {
3813         int y_sad = x->pred_mv0_sad[LAST_FRAME];
3814         if (x->pred_mv1_sad[LAST_FRAME] != INT_MAX &&
3815             (abs(search_state.frame_mv[NEARMV][LAST_FRAME].as_mv.col) +
3816              abs(search_state.frame_mv[NEARMV][LAST_FRAME].as_mv.row)) <
3817                 (abs(search_state.frame_mv[NEARESTMV][LAST_FRAME].as_mv.col) +
3818                  abs(search_state.frame_mv[NEARESTMV][LAST_FRAME].as_mv.row)))
3819           y_sad = x->pred_mv1_sad[LAST_FRAME];
3820         set_color_sensitivity(cpi, x, bsize, y_sad, x->source_variance,
3821                               search_state.yv12_mb[LAST_FRAME]);
3822       }
3823     }
3824     mi->motion_mode = SIMPLE_TRANSLATION;
3825 #if !CONFIG_REALTIME_ONLY
3826     if (cpi->oxcf.motion_mode_cfg.allow_warped_motion) {
3827       calc_num_proj_ref(cpi, x, mi);
3828     }
3829 #endif
    // set variance threshold for compound mode pruning
3831     unsigned int var_threshold = UINT_MAX;
3832     if (cpi->sf.rt_sf.prune_compoundmode_with_singlecompound_var && comp_pred &&
3833         use_model_yrd_large) {
3834       const PREDICTION_MODE single_mode0 = compound_ref0_mode(this_mode);
3835       const PREDICTION_MODE single_mode1 = compound_ref1_mode(this_mode);
3836       var_threshold =
3837           AOMMIN(var_threshold,
3838                  search_state.vars[INTER_OFFSET(single_mode0)][ref_frame]);
3839       var_threshold =
3840           AOMMIN(var_threshold,
3841                  search_state.vars[INTER_OFFSET(single_mode1)][ref_frame2]);
3842     }
3843     // decide interpolation filter, build prediction signal, get sse
3844     const bool is_mv_subpel =
3845         (mi->mv[0].as_mv.row & 0x07) || (mi->mv[0].as_mv.col & 0x07);
3846     const bool enable_filt_search_this_mode =
3847         (filter_search_enabled_blk == 2)
3848             ? true
3849             : (filter_search_enabled_blk && !force_mv_inter_layer &&
3850                !comp_pred &&
3851                (ref_frame == LAST_FRAME || !x->nonrd_prune_ref_frame_search));
3852     if (is_mv_subpel && enable_filt_search_this_mode) {
3853 #if COLLECT_PICK_MODE_STAT
3854       aom_usec_timer_start(&ms_stat.timer2);
3855 #endif
3856       search_filter_ref(cpi, x, &search_state.this_rdc, &inter_pred_params_sr,
3857                         mi_row, mi_col, tmp_buffer, bsize, reuse_inter_pred,
3858                         &this_mode_pred, &this_early_term, &var,
3859                         use_model_yrd_large,
3860                         search_state.best_pickmode.best_sse, comp_pred);
3861 #if COLLECT_PICK_MODE_STAT
3862       aom_usec_timer_mark(&ms_stat.timer2);
3863       ms_stat.ifs_time[bsize][this_mode] +=
3864           aom_usec_timer_elapsed(&ms_stat.timer2);
3865 #endif
3866 #if !CONFIG_REALTIME_ONLY
3867     } else if (cpi->oxcf.motion_mode_cfg.allow_warped_motion &&
3868                this_mode == NEWMV) {
3869       search_motion_mode(cpi, x, &search_state.this_rdc, mi_row, mi_col, bsize,
3870                          &this_early_term, use_model_yrd_large, &rate_mv,
3871                          search_state.best_pickmode.best_sse);
3872       if (this_mode == NEWMV) {
3873         search_state.frame_mv[this_mode][ref_frame] = mi->mv[0];
3874       }
3875 #endif
3876     } else {
3877       mi->interp_filters =
3878           (filter_ref == SWITCHABLE)
3879               ? av1_broadcast_interp_filter(default_interp_filter)
3880               : av1_broadcast_interp_filter(filter_ref);
3881       if (force_mv_inter_layer)
3882         mi->interp_filters = av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
3883 
3884       // If it is sub-pel motion and cb_pred_filter_search is enabled, select
3885       // the pre-decided filter
3886       if (is_mv_subpel && cb_pred_filter_search)
3887         mi->interp_filters = av1_broadcast_interp_filter(filt_select);
3888 
3889 #if COLLECT_PICK_MODE_STAT
3890       aom_usec_timer_start(&ms_stat.timer2);
3891 #endif
3892       if (!comp_pred) {
3893         SubpelParams subpel_params;
3894         // Initialize inter mode level params for single reference mode.
3895         init_inter_mode_params(&mi->mv[0].as_mv, &inter_pred_params_sr,
3896                                &subpel_params, xd->block_ref_scale_factors[0],
3897                                pd->pre->width, pd->pre->height);
3898         av1_enc_build_inter_predictor_y_nonrd(xd, &inter_pred_params_sr,
3899                                               &subpel_params);
3900       } else {
3901         av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0,
3902                                       0);
3903       }
3904 
3905       if (use_model_yrd_large) {
3906         model_skip_for_sb_y_large(cpi, bsize, mi_row, mi_col, x, xd,
3907                                   &search_state.this_rdc, &this_early_term, 0,
3908                                   search_state.best_pickmode.best_sse, &var,
3909                                   var_threshold);
3910       } else {
3911         model_rd_for_sb_y(cpi, bsize, x, xd, &search_state.this_rdc, &var, 0,
3912                           &this_early_term);
3913       }
3914 #if COLLECT_PICK_MODE_STAT
3915       aom_usec_timer_mark(&ms_stat.timer2);
3916       ms_stat.model_rd_time[bsize][this_mode] +=
3917           aom_usec_timer_elapsed(&ms_stat.timer2);
3918 #endif
3919     }
3920     // update variance for single mode
3921     if (!comp_pred) {
3922       search_state.vars[INTER_OFFSET(this_mode)][ref_frame] = var;
3923       if (search_state.frame_mv[this_mode][ref_frame].as_int == 0) {
3924         search_state.vars[INTER_OFFSET(GLOBALMV)][ref_frame] = var;
3925       }
3926     }
3927     // prune compound mode based on single mode var threshold
3928     if (comp_pred && var > var_threshold) {
3929       if (reuse_inter_pred) free_pred_buffer(this_mode_pred);
3930       continue;
3931     }
3932 
3933     if (ref_frame == LAST_FRAME &&
3934         search_state.frame_mv[this_mode][ref_frame].as_int == 0) {
3935       sse_zeromv_norm = (unsigned int)(search_state.this_rdc.sse >>
3936                                        (b_width_log2_lookup[bsize] +
3937                                         b_height_log2_lookup[bsize]));
3938     }
3939 
3940     if (cpi->sf.rt_sf.sse_early_term_inter_search &&
3941         early_term_inter_search_with_sse(
3942             cpi->sf.rt_sf.sse_early_term_inter_search, bsize,
3943             search_state.this_rdc.sse, search_state.best_pickmode.best_sse,
3944             this_mode)) {
3945       if (reuse_inter_pred) free_pred_buffer(this_mode_pred);
3946       continue;
3947     }
3948 
3949 #if COLLECT_PICK_MODE_STAT
3950     ms_stat.num_nonskipped_searches[bsize][this_mode]++;
3951 #endif
3952 
3953     const int skip_ctx = av1_get_skip_txfm_context(xd);
3954     const int skip_txfm_cost = mode_costs->skip_txfm_cost[skip_ctx][1];
3955     const int no_skip_txfm_cost = mode_costs->skip_txfm_cost[skip_ctx][0];
3956     const int64_t sse_y = search_state.this_rdc.sse;
3957     if (this_early_term) {
3958       search_state.this_rdc.skip_txfm = 1;
3959       search_state.this_rdc.rate = skip_txfm_cost;
3960       search_state.this_rdc.dist = search_state.this_rdc.sse << 4;
3961     } else {
3962 #if COLLECT_PICK_MODE_STAT
3963       aom_usec_timer_start(&ms_stat.timer2);
3964 #endif
3965       block_yrd(x, &search_state.this_rdc, &is_skippable, bsize, mi->tx_size,
3966                 1);
3967       if (search_state.this_rdc.skip_txfm ||
3968           RDCOST(x->rdmult, search_state.this_rdc.rate,
3969                  search_state.this_rdc.dist) >=
3970               RDCOST(x->rdmult, 0, search_state.this_rdc.sse)) {
3971         if (!search_state.this_rdc.skip_txfm) {
3972           // Need to store "real" rdc for possible future use if UV rdc
3973           // disallows tx skip
3974           nonskip_rdc = search_state.this_rdc;
3975           nonskip_rdc.rate += no_skip_txfm_cost;
3976         }
3977         search_state.this_rdc.rate = skip_txfm_cost;
3978         search_state.this_rdc.skip_txfm = 1;
3979         search_state.this_rdc.dist = search_state.this_rdc.sse;
3980       } else {
3981         search_state.this_rdc.rate += no_skip_txfm_cost;
3982       }
3983       if ((x->color_sensitivity[0] || x->color_sensitivity[1])) {
3984         RD_STATS rdc_uv;
3985         const BLOCK_SIZE uv_bsize = get_plane_block_size(
3986             bsize, xd->plane[1].subsampling_x, xd->plane[1].subsampling_y);
3987         if (x->color_sensitivity[0]) {
3988           av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
3989                                         AOM_PLANE_U, AOM_PLANE_U);
3990         }
3991         if (x->color_sensitivity[1]) {
3992           av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
3993                                         AOM_PLANE_V, AOM_PLANE_V);
3994         }
3995         const int64_t sse_uv =
3996             model_rd_for_sb_uv(cpi, uv_bsize, x, xd, &rdc_uv, 1, 2);
3997         search_state.this_rdc.sse += sse_uv;
3998         // Restore Y rdc if UV rdc disallows txfm skip
3999         if (search_state.this_rdc.skip_txfm && !rdc_uv.skip_txfm &&
4000             nonskip_rdc.rate != INT_MAX)
4001           search_state.this_rdc = nonskip_rdc;
4002         if (!comp_pred) {
4003           search_state.uv_dist[INTER_OFFSET(this_mode)][ref_frame] =
4004               rdc_uv.dist;
4005         }
4006         search_state.this_rdc.rate += rdc_uv.rate;
4007         search_state.this_rdc.dist += rdc_uv.dist;
4008         search_state.this_rdc.skip_txfm =
4009             search_state.this_rdc.skip_txfm && rdc_uv.skip_txfm;
4010       }
4011 #if COLLECT_PICK_MODE_STAT
4012       aom_usec_timer_mark(&ms_stat.timer2);
4013       ms_stat.txfm_time[bsize][this_mode] +=
4014           aom_usec_timer_elapsed(&ms_stat.timer2);
4015 #endif
4016     }
4017     PREDICTION_MODE this_best_mode = this_mode;
4018 
4019     // TODO(kyslov) account for UV prediction cost
4020     search_state.this_rdc.rate += rate_mv;
4021     if (comp_pred) {
4022       const int16_t mode_ctx =
4023           av1_mode_context_analyzer(mbmi_ext->mode_context, mi->ref_frame);
4024       search_state.this_rdc.rate +=
4025           cost_mv_ref(mode_costs, this_mode, mode_ctx);
4026     } else {
4027       // If the current mode has zeromv but is not GLOBALMV, compare the rate
4028       // cost. If GLOBALMV is cheaper, use GLOBALMV instead.
4029       if (this_mode != GLOBALMV &&
4030           search_state.frame_mv[this_mode][ref_frame].as_int ==
4031               search_state.frame_mv[GLOBALMV][ref_frame].as_int) {
4032         if (is_globalmv_better(this_mode, ref_frame, rate_mv, mode_costs,
4033                                search_state.single_inter_mode_costs,
4034                                mbmi_ext)) {
4035           this_best_mode = GLOBALMV;
4036         }
4037       }
4038 
4039       search_state.this_rdc.rate +=
4040           search_state
4041               .single_inter_mode_costs[INTER_OFFSET(this_best_mode)][ref_frame];
4042     }
4043 
4044     if (!comp_pred && search_state.frame_mv[this_mode][ref_frame].as_int == 0 &&
4045         var < UINT_MAX) {
4046       search_state.vars[INTER_OFFSET(GLOBALMV)][ref_frame] = var;
4047     }
4048 
4049     search_state.this_rdc.rate += search_state.ref_costs_single[ref_frame];
4050 
4051     search_state.this_rdc.rdcost = RDCOST(x->rdmult, search_state.this_rdc.rate,
4052                                           search_state.this_rdc.dist);
4053     if (cpi->oxcf.rc_cfg.mode == AOM_CBR && !comp_pred) {
4054       newmv_diff_bias(
4055           xd, this_best_mode, &search_state.this_rdc, bsize,
4056           search_state.frame_mv[this_best_mode][ref_frame].as_mv.row,
4057           search_state.frame_mv[this_best_mode][ref_frame].as_mv.col,
4058           cpi->speed, x->source_variance, x->content_state_sb);
4059     }
4060 #if CONFIG_AV1_TEMPORAL_DENOISING
4061     if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc_pickmode &&
4062         cpi->denoiser.denoising_level > kDenLowLow) {
4063       av1_denoiser_update_frame_stats(mi, sse_y, this_mode, ctx);
4064       // Keep track of zero_last cost.
4065       if (ref_frame == LAST_FRAME &&
4066           search_state.frame_mv[this_mode][ref_frame].as_int == 0)
4067         zero_last_cost_orig = search_state.this_rdc.rdcost;
4068     }
4069 #else
4070     (void)sse_y;
4071 #endif
4072 
4073     search_state.mode_checked[this_mode][ref_frame] = 1;
4074     search_state.mode_checked[this_best_mode][ref_frame] = 1;
4075 
4076     if (check_globalmv) {
4077       int32_t abs_mv =
4078           abs(search_state.frame_mv[this_best_mode][ref_frame].as_mv.row) +
4079           abs(search_state.frame_mv[this_best_mode][ref_frame].as_mv.col);
4080       // Early exit check: if the magnitude of this_best_mode's mv is small
4081       // enough, we skip GLOBALMV check in the next loop iteration.
4082       if (abs_mv < 2) {
4083         check_globalmv = false;
4084       }
4085     }
4086 #if COLLECT_PICK_MODE_STAT
4087     aom_usec_timer_mark(&ms_stat.timer1);
4088     ms_stat.nonskipped_search_times[bsize][this_mode] +=
4089         aom_usec_timer_elapsed(&ms_stat.timer1);
4090 #endif
4091     if (search_state.this_rdc.rdcost < search_state.best_rdc.rdcost) {
4092       search_state.best_rdc = search_state.this_rdc;
4093       best_early_term = this_early_term;
4094       search_state.best_pickmode.best_sse = sse_y;
4095       search_state.best_pickmode.best_mode = this_best_mode;
4096       search_state.best_pickmode.best_motion_mode = mi->motion_mode;
4097       search_state.best_pickmode.wm_params = mi->wm_params;
4098       search_state.best_pickmode.num_proj_ref = mi->num_proj_ref;
4099       search_state.best_pickmode.best_pred_filter = mi->interp_filters;
4100       search_state.best_pickmode.best_tx_size = mi->tx_size;
4101       search_state.best_pickmode.best_ref_frame = ref_frame;
4102       search_state.best_pickmode.best_second_ref_frame = ref_frame2;
4103       search_state.best_pickmode.best_mode_skip_txfm =
4104           search_state.this_rdc.skip_txfm;
4105       search_state.best_pickmode.best_mode_initial_skip_flag =
4106           (nonskip_rdc.rate == INT_MAX && search_state.this_rdc.skip_txfm);
4107       if (!search_state.best_pickmode.best_mode_skip_txfm) {
4108         memcpy(search_state.best_pickmode.blk_skip, txfm_info->blk_skip,
4109                sizeof(txfm_info->blk_skip[0]) * num_8x8_blocks);
4110       }
4111 
4112       // This is needed for the compound modes.
4113       search_state.frame_mv_best[this_best_mode][ref_frame].as_int =
4114           search_state.frame_mv[this_best_mode][ref_frame].as_int;
4115       if (ref_frame2 > NONE_FRAME) {
4116         search_state.frame_mv_best[this_best_mode][ref_frame2].as_int =
4117             search_state.frame_mv[this_best_mode][ref_frame2].as_int;
4118       }
4119 
4120       if (reuse_inter_pred) {
4121         free_pred_buffer(search_state.best_pickmode.best_pred);
4122         search_state.best_pickmode.best_pred = this_mode_pred;
4123       }
4124     } else {
4125       if (reuse_inter_pred) free_pred_buffer(this_mode_pred);
4126     }
4127     if (best_early_term && (idx > 0 || cpi->sf.rt_sf.nonrd_aggressive_skip)) {
4128       txfm_info->skip_txfm = 1;
4129       break;
4130     }
4131   }
4132 
4133   mi->mode = search_state.best_pickmode.best_mode;
4134   mi->motion_mode = search_state.best_pickmode.best_motion_mode;
4135   mi->wm_params = search_state.best_pickmode.wm_params;
4136   mi->num_proj_ref = search_state.best_pickmode.num_proj_ref;
4137   mi->interp_filters = search_state.best_pickmode.best_pred_filter;
4138   mi->tx_size = search_state.best_pickmode.best_tx_size;
4139   memset(mi->inter_tx_size, mi->tx_size, sizeof(mi->inter_tx_size));
4140   mi->ref_frame[0] = search_state.best_pickmode.best_ref_frame;
4141   mi->mv[0].as_int =
4142       search_state
4143           .frame_mv_best[search_state.best_pickmode.best_mode]
4144                         [search_state.best_pickmode.best_ref_frame]
4145           .as_int;
4146   mi->mv[1].as_int = 0;
4147   if (search_state.best_pickmode.best_second_ref_frame > INTRA_FRAME) {
4148     mi->ref_frame[1] = search_state.best_pickmode.best_second_ref_frame;
4149     mi->mv[1].as_int =
4150         search_state
4151             .frame_mv_best[search_state.best_pickmode.best_mode]
4152                           [search_state.best_pickmode.best_second_ref_frame]
4153             .as_int;
4154   }
4155   // Perform intra prediction search, if the best SAD is above a certain
4156   // threshold.
4157   mi->angle_delta[PLANE_TYPE_Y] = 0;
4158   mi->angle_delta[PLANE_TYPE_UV] = 0;
4159   mi->filter_intra_mode_info.use_filter_intra = 0;
4160 
4161 #if COLLECT_PICK_MODE_STAT
4162   aom_usec_timer_start(&ms_stat.timer1);
4163   ms_stat.num_searches[bsize][DC_PRED]++;
4164   ms_stat.num_nonskipped_searches[bsize][DC_PRED]++;
4165 #endif
4166 
4167   if (!x->force_zeromv_skip_for_blk)
4168     estimate_intra_mode(cpi, x, bsize, best_early_term,
4169                         search_state.ref_costs_single[INTRA_FRAME],
4170                         reuse_inter_pred, &orig_dst, tmp_buffer,
4171                         &this_mode_pred, &search_state.best_rdc,
4172                         &search_state.best_pickmode, ctx);
4173 
4174   int skip_idtx_palette =
4175       (x->color_sensitivity[0] || x->color_sensitivity[1]) &&
4176       x->content_state_sb.source_sad_nonrd != kZeroSad &&
4177       !cpi->rc.high_source_sad;
4178 
4179   // Check for IDTX: based only on Y channel, so avoid when color_sensitivity
4180   // is set.
4181   if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN && !skip_idtx_palette &&
4182       !cpi->oxcf.txfm_cfg.use_inter_dct_only && !x->force_zeromv_skip_for_blk &&
4183       is_inter_mode(search_state.best_pickmode.best_mode) &&
4184       (!cpi->sf.rt_sf.prune_idtx_nonrd ||
4185        (cpi->sf.rt_sf.prune_idtx_nonrd && bsize <= BLOCK_32X32 &&
4186         search_state.best_pickmode.best_mode_skip_txfm != 1 &&
4187         x->source_variance > 200))) {
4188     RD_STATS idtx_rdc;
4189     av1_init_rd_stats(&idtx_rdc);
4190     int is_skippable;
4191     this_mode_pred = &tmp_buffer[get_pred_buffer(tmp_buffer, 3)];
4192     pd->dst.buf = this_mode_pred->data;
4193     pd->dst.stride = bw;
4194     av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0, 0);
4195     block_yrd_idtx(x, &idtx_rdc, &is_skippable, bsize, mi->tx_size);
4196     int64_t idx_rdcost = RDCOST(x->rdmult, idtx_rdc.rate, idtx_rdc.dist);
4197     if (idx_rdcost < search_state.best_rdc.rdcost) {
4198       // Keep the skip_txfm off if the color_sensitivity is set.
4199       if (x->color_sensitivity[0] || x->color_sensitivity[1])
4200         idtx_rdc.skip_txfm = 0;
4201       search_state.best_pickmode.tx_type = IDTX;
4202       search_state.best_rdc.rdcost = idx_rdcost;
4203       search_state.best_pickmode.best_mode_skip_txfm = idtx_rdc.skip_txfm;
4204       if (!idtx_rdc.skip_txfm) {
4205         memcpy(search_state.best_pickmode.blk_skip, txfm_info->blk_skip,
4206                sizeof(txfm_info->blk_skip[0]) * num_8x8_blocks);
4207       }
4208       xd->tx_type_map[0] = search_state.best_pickmode.tx_type;
4209       memset(ctx->tx_type_map, search_state.best_pickmode.tx_type,
4210              ctx->num_4x4_blk);
4211       memset(xd->tx_type_map, search_state.best_pickmode.tx_type,
4212              ctx->num_4x4_blk);
4213     }
4214     pd->dst = orig_dst;
4215   }
4216 
4217   int try_palette =
4218       !skip_idtx_palette && cpi->oxcf.tool_cfg.enable_palette &&
4219       av1_allow_palette(cpi->common.features.allow_screen_content_tools,
4220                         mi->bsize);
4221   try_palette = try_palette &&
4222                 is_mode_intra(search_state.best_pickmode.best_mode) &&
4223                 x->source_variance > 0 && !x->force_zeromv_skip_for_blk &&
4224                 (cpi->rc.high_source_sad || x->source_variance > 500);
4225 
4226   if (try_palette) {
4227     const unsigned int intra_ref_frame_cost =
4228         search_state.ref_costs_single[INTRA_FRAME];
4229 
4230     av1_search_palette_mode_luma(cpi, x, bsize, intra_ref_frame_cost, ctx,
4231                                  &search_state.this_rdc,
4232                                  search_state.best_rdc.rdcost);
4233     if (search_state.this_rdc.rdcost < search_state.best_rdc.rdcost) {
4234       search_state.best_pickmode.pmi = mi->palette_mode_info;
4235       search_state.best_pickmode.best_mode = DC_PRED;
4236       mi->mv[0].as_int = 0;
4237       search_state.best_rdc.rate = search_state.this_rdc.rate;
4238       search_state.best_rdc.dist = search_state.this_rdc.dist;
4239       search_state.best_rdc.rdcost = search_state.this_rdc.rdcost;
4240       search_state.best_pickmode.best_mode_skip_txfm =
4241           search_state.this_rdc.skip_txfm;
4242       // Keep the skip_txfm off if the color_sensitivity is set.
4243       if (x->color_sensitivity[0] || x->color_sensitivity[1])
4244         search_state.this_rdc.skip_txfm = 0;
4245       if (!search_state.this_rdc.skip_txfm) {
4246         memcpy(ctx->blk_skip, txfm_info->blk_skip,
4247                sizeof(txfm_info->blk_skip[0]) * ctx->num_4x4_blk);
4248       }
4249       if (xd->tx_type_map[0] != DCT_DCT)
4250         av1_copy_array(ctx->tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
4251     }
4252   }
4253 
4254 #if COLLECT_PICK_MODE_STAT
4255   aom_usec_timer_mark(&ms_stat.timer1);
4256   ms_stat.nonskipped_search_times[bsize][DC_PRED] +=
4257       aom_usec_timer_elapsed(&ms_stat.timer1);
4258 #endif
4259 
4260   pd->dst = orig_dst;
4261   if (try_palette) mi->palette_mode_info = search_state.best_pickmode.pmi;
4262   mi->mode = search_state.best_pickmode.best_mode;
4263   mi->ref_frame[0] = search_state.best_pickmode.best_ref_frame;
4264   mi->ref_frame[1] = search_state.best_pickmode.best_second_ref_frame;
4265   txfm_info->skip_txfm = search_state.best_pickmode.best_mode_skip_txfm;
4266   if (!txfm_info->skip_txfm) {
4267     // For inter modes: copy blk_skip from best_pickmode, which is
4268     // defined for 8x8 blocks. If palette or intra mode was selected
4269     // as best then blk_skip is already copied into the ctx.
4270     if (search_state.best_pickmode.best_mode >= INTRA_MODE_END)
4271       memcpy(ctx->blk_skip, search_state.best_pickmode.blk_skip,
4272              sizeof(search_state.best_pickmode.blk_skip[0]) * num_8x8_blocks);
4273   }
4274   if (has_second_ref(mi)) {
4275     mi->comp_group_idx = 0;
4276     mi->compound_idx = 1;
4277     mi->interinter_comp.type = COMPOUND_AVERAGE;
4278   }
4279 
4280   if (!is_inter_block(mi)) {
4281     mi->interp_filters = av1_broadcast_interp_filter(SWITCHABLE_FILTERS);
4282   }
4283 
4284   if (reuse_inter_pred && search_state.best_pickmode.best_pred != NULL) {
4285     PRED_BUFFER *const best_pred = search_state.best_pickmode.best_pred;
4286     if (best_pred->data != orig_dst.buf && is_inter_mode(mi->mode)) {
4287       aom_convolve_copy(best_pred->data, best_pred->stride, pd->dst.buf,
4288                         pd->dst.stride, bw, bh);
4289     }
4290   }
4291 
4292 #if CONFIG_AV1_TEMPORAL_DENOISING
4293   if (cpi->oxcf.noise_sensitivity > 0 && resize_pending == 0 &&
4294       denoise_svc_pickmode && cpi->denoiser.denoising_level > kDenLowLow &&
4295       cpi->denoiser.reset == 0) {
4296     AV1_DENOISER_DECISION decision = COPY_BLOCK;
4297     ctx->sb_skip_denoising = 0;
4298     av1_pickmode_ctx_den_update(
4299         &ctx_den, zero_last_cost_orig, search_state.ref_costs_single,
4300         search_state.frame_mv, reuse_inter_pred, &search_state.best_pickmode);
4301     av1_denoiser_denoise(cpi, x, mi_row, mi_col, bsize, ctx, &decision,
4302                          gf_temporal_ref);
4303     if (denoise_recheck_zeromv)
4304       recheck_zeromv_after_denoising(
4305           cpi, mi, x, xd, decision, &ctx_den, search_state.yv12_mb,
4306           &search_state.best_rdc, &search_state.best_pickmode, bsize, mi_row,
4307           mi_col);
4308     search_state.best_pickmode.best_ref_frame = ctx_den.best_ref_frame;
4309   }
4310 #endif
4311 
4312   if (cpi->sf.inter_sf.adaptive_rd_thresh && !has_second_ref(mi)) {
4313     THR_MODES best_mode_idx =
4314         mode_idx[search_state.best_pickmode.best_ref_frame]
4315                 [mode_offset(mi->mode)];
4316     if (search_state.best_pickmode.best_ref_frame == INTRA_FRAME) {
4317       // Only consider the modes that are included in the intra_mode_list.
4318       int intra_modes = sizeof(intra_mode_list) / sizeof(PREDICTION_MODE);
4319       for (int i = 0; i < intra_modes; i++) {
4320         update_thresh_freq_fact(cpi, x, bsize, INTRA_FRAME, best_mode_idx,
4321                                 intra_mode_list[i]);
4322       }
4323     } else {
4324       PREDICTION_MODE this_mode;
4325       for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {
4326         update_thresh_freq_fact(cpi, x, bsize,
4327                                 search_state.best_pickmode.best_ref_frame,
4328                                 best_mode_idx, this_mode);
4329       }
4330     }
4331   }
4332 
4333 #if CONFIG_INTERNAL_STATS
4334   store_coding_context(x, ctx, mi->mode);
4335 #else
4336   store_coding_context(x, ctx);
4337 #endif  // CONFIG_INTERNAL_STATS
4338 
4339 #if COLLECT_PICK_MODE_STAT
4340   aom_usec_timer_mark(&ms_stat.bsize_timer);
4341   ms_stat.total_block_times[bsize] +=
4342       aom_usec_timer_elapsed(&ms_stat.bsize_timer);
4343   print_time(&ms_stat, bsize, cm->mi_params.mi_rows, cm->mi_params.mi_cols,
4344              mi_row, mi_col);
4345 #endif  // COLLECT_PICK_MODE_STAT
4346 
4347   *rd_cost = search_state.best_rdc;
4348 }
4349