1 /******************************************************************************
2  *
3  * Copyright (C) 2018 The Android Open Source Project
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at:
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  *
17  *****************************************************************************
18  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19 */
20 
21 /*!
22 ***************************************************************************
23 * \file hme_err_compute.c
24 *
25 * \brief
26 *    SAD / SATD routines for error computation
27 *
28 * Detailed_description : Contains various types of SAD/SATD routines for
29 *   error computation between a given input and reference ptr. The SAD
30 *   routines can evaluate for either a single point or a grid, and can
31 *   evaluate with either partial updates or no partial updates. Partial
32 *   updates means evaluating sub block SADs, e.g. 4 4x4 subblock SAD in
33 *   addition to the main 8x8 block SAD.
34 *
35 * \date
36 *    22/9/2012
37 *
38 * \author  Ittiam
39 ***************************************************************************
40 */
41 
42 /*****************************************************************************/
43 /* File Includes                                                             */
44 /*****************************************************************************/
45 /* System include files */
46 #include <stdio.h>
47 #include <string.h>
48 #include <stdlib.h>
49 #include <assert.h>
50 #include <stdarg.h>
51 #include <math.h>
52 #include <limits.h>
53 
54 /* User include files */
55 #include "ihevc_typedefs.h"
56 #include "itt_video_api.h"
57 #include "ihevce_api.h"
58 
59 #include "rc_cntrl_param.h"
60 #include "rc_frame_info_collector.h"
61 #include "rc_look_ahead_params.h"
62 
63 #include "ihevc_defs.h"
64 #include "ihevc_structs.h"
65 #include "ihevc_platform_macros.h"
66 #include "ihevc_deblk.h"
67 #include "ihevc_itrans_recon.h"
68 #include "ihevc_chroma_itrans_recon.h"
69 #include "ihevc_chroma_intra_pred.h"
70 #include "ihevc_intra_pred.h"
71 #include "ihevc_inter_pred.h"
72 #include "ihevc_mem_fns.h"
73 #include "ihevc_padding.h"
74 #include "ihevc_weighted_pred.h"
75 #include "ihevc_sao.h"
76 #include "ihevc_resi_trans.h"
77 #include "ihevc_quant_iquant_ssd.h"
78 #include "ihevc_cabac_tables.h"
79 
80 #include "ihevce_defs.h"
81 #include "ihevce_lap_enc_structs.h"
82 #include "ihevce_multi_thrd_structs.h"
83 #include "ihevce_multi_thrd_funcs.h"
84 #include "ihevce_me_common_defs.h"
85 #include "ihevce_had_satd.h"
86 #include "ihevce_error_codes.h"
87 #include "ihevce_bitstream.h"
88 #include "ihevce_cabac.h"
89 #include "ihevce_rdoq_macros.h"
90 #include "ihevce_function_selector.h"
91 #include "ihevce_enc_structs.h"
92 #include "ihevce_entropy_structs.h"
93 #include "ihevce_cmn_utils_instr_set_router.h"
94 #include "ihevce_enc_loop_structs.h"
95 #include "ihevce_bs_compute_ctb.h"
96 #include "ihevce_global_tables.h"
97 #include "ihevce_dep_mngr_interface.h"
98 #include "hme_datatype.h"
99 #include "hme_interface.h"
100 #include "hme_common_defs.h"
101 #include "hme_defs.h"
102 #include "ihevce_me_instr_set_router.h"
103 #include "hme_globals.h"
104 #include "hme_utils.h"
105 #include "hme_coarse.h"
106 #include "hme_refine.h"
107 #include "hme_err_compute.h"
108 #include "hme_common_utils.h"
109 #include "hme_search_algo.h"
110 #include "ihevce_stasino_helpers.h"
111 
112 /******************************************************************************
113 *                         MACRO DEFINITIONS
114 ******************************************************************************/
115 
/*****************************************************************************/
/* Theoretically, various types of SAD functions are needed for              */
/* reasons of optimality. SADs that are to be evaluated at a single pt can be*/
/* more optimal than SADs that are to be evaluated for a grid of 3x3. The    */
/* SADs to be evaluated at a grid are classified as separate functions, since*/
/* evaluating them on a single function call helps reuse inputs for a small  */
/* grid of 3x3. Also, if no partial updates are required, there are 3 basic  */
/* functions, width 4K (K = odd number), width 8K (K = odd number) and width */
/* 16K, K any number. For partial updates, it is assumed that the block size */
/* is square (8x8, 16x16, 32x32, 64x64) and further differentiation is done  */
/* based on the basic evaluation unit. E.g. if 16x16 blk size requires part  */
/* update on AMP partitions, then basic SAD unit is 4x4, if it doesn't, then */
/* basic SAD unit is 8x8.                                                    */
/*****************************************************************************/
130 
131 #define UPD_RES_PT_NPU_BEST1 hme_update_results_grid_pu_bestn
132 #define UPD_RES_PT_NPU_BESTN hme_update_results_grid_pu_bestn
133 #define UPD_RES_PT_PU_BEST1 hme_update_results_grid_pu_bestn
134 #define UPD_RES_PT_PU_BESTN hme_update_results_grid_pu_bestn
135 #define UPD_RES_GRID_NPU_BEST1 hme_update_results_grid_pu_bestn
136 #define UPD_RES_GRID_NPU_BESTN hme_update_results_grid_pu_bestn
137 #define UPD_RES_GRID_PU_BEST1 hme_update_results_grid_pu_bestn
138 #define UPD_RES_GRID_PU_BESTN hme_update_results_grid_pu_bestn
139 
140 /*******************************************************************************
141 *                         FUNCTION DEFINITIONS
142 *******************************************************************************/
hme_cmp_nodes(search_node_t * ps_best_node1,search_node_t * ps_best_node2)143 S32 hme_cmp_nodes(search_node_t *ps_best_node1, search_node_t *ps_best_node2)
144 {
145     if((ps_best_node1->s_mv.i2_mvx == ps_best_node2->s_mv.i2_mvx) &&
146        (ps_best_node1->s_mv.i2_mvy == ps_best_node2->s_mv.i2_mvy) &&
147        (ps_best_node1->i1_ref_idx == ps_best_node2->i1_ref_idx))
148     {
149         return 0;
150     }
151     return -1;
152 }
153 
/**
********************************************************************************
*  @fn     compute_4x4_sads_for_16x16_blk(grid_ctxt_t *ps_grid,
*                                        UWORD8      *pu1_cur_ptr,
*                                        WORD32      cur_buf_stride,
*                                        UWORD16     **u2_part_sads,
*                                        cand_t      *ps_cand,
*                                        WORD32      *num_cands)
*
*  @brief  Expands every grid in the grid ctxt into a flat candidate list,
*          computes the 16 sub-block SADs (4 per row, 4 rows) of each
*          candidate against the current block and combines them into the
*          SADs of all 17 partitions
*
*  @param[in]  ps_grid : Grid ctxt. For each grid it supplies the centre ref
*                        pointer, the centre mv, the ref idx and a bit mask
*                        of the enabled points; grd_sz_y_x packs the vertical
*                        step in the upper 16 bits and the horizontal step in
*                        the lower 16 bits
*
*  @param[in]  pu1_cur_ptr : Pointer to top-left of current block
*
*  @param[in]  cur_buf_stride : Buffer stride of current buffer
*
*  @param[out] u2_part_sads : Array of pointers indexed by partition id; entry
*                        [part][k] receives the SAD of that partition for
*                        the k-th candidate
*
*  @param[out] ps_cand : Receives the list of candidates evaluated
*
*  @param[out] num_cands : Receives the number of candidates processed
*
*  @return   None
********************************************************************************
*/
void compute_4x4_sads_for_16x16_blk(
    grid_ctxt_t *ps_grid,
    UWORD8 *pu1_cur_ptr,
    WORD32 cur_buf_stride,
    UWORD16 **u2_part_sads,
    cand_t *ps_cand,
    WORD32 *num_cands)
{
    WORD32 grid_id, pt_id, cand_id, blk_id, row, col;
    WORD16 step_y = (ps_grid->grd_sz_y_x & 0xFFFF0000) >> 16;
    WORD16 step_x = (ps_grid->grd_sz_y_x & 0xFFFF);
    /* Displacement of each grid point, in the order:   */
    /* C, L, T, R, B, TL, TR, BL, BR                    */
    WORD32 ai4_dx[9] = { 0, -step_x, 0, step_x, 0, -step_x, step_x, -step_x, step_x };
    WORD32 ai4_dy[9] = { 0, 0, -step_y, 0, step_y, -step_y, -step_y, step_y, step_y };
    WORD32 ref_stride = ps_grid->ref_buf_stride;
    /* Distance, in bytes, between two vertically adjacent sub-blocks (4 rows) */
    WORD32 cur_blk_pitch = (cur_buf_stride << 2);
    WORD32 ref_blk_pitch = (ref_stride << 2);
    cand_t *ps_next_cand = ps_cand;
    UWORD16 au2_sad[NUM_4X4];

    *num_cands = 0;

    /* Pass 1: flatten all enabled grid points into the candidate list */
    for(grid_id = 0; grid_id < ps_grid->num_grids; grid_id++)
    {
        WORD32 pending_pts = ps_grid->pi4_grd_mask[grid_id];
        UWORD8 *pu1_center = ps_grid->ppu1_ref_ptr[grid_id];
        WORD32 center_mv_x = ps_grid->p_mv[grid_id].i2_mv_x;
        WORD32 center_mv_y = (ps_grid->p_mv[grid_id].i2_mv_y);

        for(pt_id = 0; pt_id < NUM_CANDIDATES_IN_GRID; pt_id++)
        {
            /* Bit 0 of pending_pts always corresponds to the current point */
            if(pending_pts & 1)
            {
                *num_cands = *num_cands + 1;
                ps_next_cand->grid_ix = grid_id;
                ps_next_cand->ref_idx = ps_grid->p_ref_idx[grid_id];
                ps_next_cand->pu1_ref_ptr =
                    pu1_center + ai4_dx[pt_id] + ref_stride * ai4_dy[pt_id];
                ps_next_cand->mv.i2_mv_x = (S16)(center_mv_x) + ai4_dx[pt_id];
                ps_next_cand->mv.i2_mv_y = (S16)(center_mv_y) + ai4_dy[pt_id];
                ps_next_cand++;
            }
            pending_pts >>= 1;
        }
    }

    /* Pass 2: per candidate, sub-block SADs followed by partition SADs */
    for(cand_id = 0; cand_id < *num_cands; cand_id++)
    {
        cand_t *ps_cur_cand = ps_cand + cand_id;
        UWORD16 u2_tl, u2_tr, u2_bl, u2_br;
        UWORD16 u2_top_half, u2_bot_half, u2_whole;
        UWORD16 u2_left_col, u2_right_col, u2_top_row, u2_bot_row;

        /* Sub-blocks are numbered in raster order: column = id % 4, row = id / 4 */
        for(blk_id = 0; blk_id < NUM_4X4; blk_id++)
        {
            WORD32 cur_blk_off =
                (blk_id % 4) * NUM_PIXELS_IN_ROW + (blk_id >> 2) * cur_blk_pitch;
            WORD32 ref_blk_off =
                (blk_id % 4) * NUM_PIXELS_IN_ROW + (blk_id >> 2) * ref_blk_pitch;
            UWORD16 u2_acc = 0;

            for(row = 0; row < NUM_ROWS_IN_4X4; row++)
            {
                WORD32 cur_row_off = (cur_buf_stride)*row + cur_blk_off;
                WORD32 ref_row_off = (ref_stride)*row + ref_blk_off;

                for(col = 0; col < NUM_PIXELS_IN_ROW; col++)
                {
                    u2_acc += (UWORD16)ABS(
                        (((S32)ps_cur_cand->pu1_ref_ptr[(ref_row_off + col)]) -
                         ((S32)pu1_cur_ptr[(cur_row_off + col)])));
                }
            }
            au2_sad[blk_id] = u2_acc;
        }

        /* Quadrants */
        u2_tl = (au2_sad[0] + au2_sad[1] + au2_sad[4] + au2_sad[5]);
        u2_tr = (au2_sad[2] + au2_sad[3] + au2_sad[6] + au2_sad[7]);
        u2_bl = (au2_sad[8] + au2_sad[9] + au2_sad[12] + au2_sad[13]);
        u2_br = (au2_sad[10] + au2_sad[11] + au2_sad[14] + au2_sad[15]);
        u2_part_sads[PART_ID_NxN_TL][cand_id] = u2_tl;
        u2_part_sads[PART_ID_NxN_TR][cand_id] = u2_tr;
        u2_part_sads[PART_ID_NxN_BL][cand_id] = u2_bl;
        u2_part_sads[PART_ID_NxN_BR][cand_id] = u2_br;

        /* Symmetric halves */
        u2_top_half = u2_tr + u2_tl;
        u2_bot_half = u2_br + u2_bl;
        u2_part_sads[PART_ID_Nx2N_L][cand_id] = u2_tl + u2_bl;
        u2_part_sads[PART_ID_Nx2N_R][cand_id] = u2_tr + u2_br;
        u2_part_sads[PART_ID_2NxN_T][cand_id] = u2_top_half;
        u2_part_sads[PART_ID_2NxN_B][cand_id] = u2_bot_half;

        /* Narrow AMP strips: one column / one row of sub-blocks */
        u2_left_col = (au2_sad[0] + au2_sad[4] + au2_sad[8] + au2_sad[12]);
        u2_right_col = (au2_sad[3] + au2_sad[7] + au2_sad[11] + au2_sad[15]);
        u2_top_row = (au2_sad[0] + au2_sad[1] + au2_sad[2] + au2_sad[3]);
        u2_bot_row = (au2_sad[12] + au2_sad[13] + au2_sad[14] + au2_sad[15]);
        u2_part_sads[PART_ID_nLx2N_L][cand_id] = u2_left_col;
        u2_part_sads[PART_ID_nRx2N_R][cand_id] = u2_right_col;
        u2_part_sads[PART_ID_2NxnU_T][cand_id] = u2_top_row;
        u2_part_sads[PART_ID_2NxnD_B][cand_id] = u2_bot_row;

        /* Whole block, then the wide AMP parts as "whole minus narrow strip" */
        u2_whole = u2_top_half + u2_bot_half;
        u2_part_sads[PART_ID_2Nx2N][cand_id] = u2_whole;
        u2_part_sads[PART_ID_2NxnU_B][cand_id] = u2_whole - u2_top_row;
        u2_part_sads[PART_ID_2NxnD_T][cand_id] = u2_whole - u2_bot_row;
        u2_part_sads[PART_ID_nRx2N_L][cand_id] = u2_whole - u2_right_col;
        u2_part_sads[PART_ID_nLx2N_R][cand_id] = u2_whole - u2_left_col;
    }
}
263 
264 /**
265 ********************************************************************************
266 *  @fn     compute_part_sads_for_MxM_blk(grid_ctxt_t *ps_grid,
267 *                                       UWORD8      *pu1_cur_ptr,
268 *                                       WORD32      cur_buf_stride,
269 *                                       WORD32     **pi4_part_sads,
270 *                                       cand_t      *ps_cand,
271 *                                       WORD32      *num_cands
272 *
273 *  @brief  Computes partial SADs and updates partition results for an MxM blk
274 *          and does so for several grids of points. This can be used for
275 *          32x32/64x64 blks with 17 partition updates
276 *
277 *
278 *  @param[in]  ps_grid : Pointer to grid ctxt that has multiple grid of max
279 *                        9 pts per grid
280 *
281 *  @param[in]  pu1_cur_ptr : Top left of input buffer
282 *
283 *  @param[in]  pi4_part_sads : array of pointers, each entry pointing to
284 *                             results to be updated for a given partition
285 *
286 *  @return   The ps_search_results structure has the best result updated for
287 *            the 2Nx2N partition alone.
288 
289 ********************************************************************************
290 */
compute_part_sads_for_MxM_blk(grid_ctxt_t * ps_grid,UWORD8 * pu1_cur_ptr,WORD32 cur_buf_stride,WORD32 ** pp_part_sads,cand_t * ps_cand,WORD32 * num_cands,CU_SIZE_T e_cu_size)291 void compute_part_sads_for_MxM_blk(
292     grid_ctxt_t *ps_grid,
293     UWORD8 *pu1_cur_ptr,
294     WORD32 cur_buf_stride,
295     WORD32 **pp_part_sads,
296     cand_t *ps_cand,
297     WORD32 *num_cands,
298     CU_SIZE_T e_cu_size)
299 {
300     WORD32 a, b, c, d, i;
301     WORD16 grd_sz_y = (ps_grid->grd_sz_y_x & 0xFFFF0000) >> 16;
302     WORD16 grd_sz_x = (ps_grid->grd_sz_y_x & 0xFFFF);
303 
304     /* Assumes the following order: C, L, T, R, B, TL, TR, BL, BR */
305     WORD32 offset_x[9] = { 0, -grd_sz_x, 0, grd_sz_x, 0, -grd_sz_x, grd_sz_x, -grd_sz_x, grd_sz_x };
306     WORD32 offset_y[9] = { 0, 0, -grd_sz_y, 0, grd_sz_y, -grd_sz_y, -grd_sz_y, grd_sz_y, grd_sz_y };
307     WORD32 shift = (WORD32)e_cu_size;
308 
309     WORD32 ref_buf_stride = ps_grid->ref_buf_stride;
310     WORD32 cur_buf_stride_lsN = (cur_buf_stride << (1 + shift));
311     WORD32 ref_buf_stride_lsN = (ref_buf_stride << (1 + shift));
312     /* Num rows and pixels per row: 8 for CU_32x32 and 16 for CU_64x64 */
313     WORD32 num_rows_in_nxn = 2 << shift;
314     WORD32 num_pixels_in_row = 2 << shift;
315     cand_t *cand0 = ps_cand;
316     /* for a 2Nx2N partition we evaluate nxn SADs, where n = N/2. This is */
317     /* needed for AMP cases.                                              */
318     WORD32 a_nxn_sad[NUM_4X4];
319     *num_cands = 0;
320 
321     /* Loop to fill up the cand_t array and to calculate num_cands */
322     for(i = 0; i < ps_grid->num_grids; i++)
323     {
324         WORD32 j;
325         WORD32 mask = ps_grid->pi4_grd_mask[i];
326         UWORD8 *pu1_ref_ptr_center = ps_grid->ppu1_ref_ptr[i];
327         WORD32 mv_x = ps_grid->p_mv[i].i2_mv_x;
328         WORD32 mv_y = (ps_grid->p_mv[i].i2_mv_y);
329 
330         for(j = 0; j < NUM_CANDIDATES_IN_GRID; j++, mask >>= 1)
331         {
332             if(mask & 1)
333             {
334                 *num_cands = *num_cands + 1;
335                 cand0->grid_ix = i;
336                 cand0->ref_idx = ps_grid->p_ref_idx[i];
337                 cand0->pu1_ref_ptr =
338                     pu1_ref_ptr_center + offset_x[j] + ref_buf_stride * offset_y[j];
339                 cand0->mv.i2_mv_x = (S16)(mv_x) + offset_x[j];
340                 cand0->mv.i2_mv_y = (S16)(mv_y) + offset_y[j];
341                 cand0++;
342             }
343         }
344     }
345 
346     /* Loop to compute the SAD's */
347     for(a = 0; a < *num_cands; a++)
348     {
349         cand_t *cand = ps_cand + a;
350         memset(&a_nxn_sad[0], 0, NUM_4X4 * sizeof(WORD32));
351         for(b = 0; b < NUM_4X4; b++)
352         {
353             WORD32 t1 = (b % 4) * num_pixels_in_row + (b >> 2) * cur_buf_stride_lsN;
354             WORD32 t2 = (b % 4) * num_pixels_in_row + (b >> 2) * ref_buf_stride_lsN;
355 
356             for(c = 0; c < num_rows_in_nxn; c++)
357             {
358                 WORD32 z_cur = (cur_buf_stride)*c + t1;
359                 WORD32 z_ref = (ref_buf_stride)*c + t2;
360                 for(d = 0; d < num_pixels_in_row; d++)
361                 {
362                     a_nxn_sad[b] += (WORD32)ABS(
363                         (((WORD32)cand->pu1_ref_ptr[(z_ref + d)]) -
364                          ((WORD32)pu1_cur_ptr[(z_cur + d)])));
365                 }
366             }
367         }
368 
369         pp_part_sads[PART_ID_NxN_TL][a] =
370             (a_nxn_sad[0] + a_nxn_sad[1] + a_nxn_sad[4] + a_nxn_sad[5]);
371         pp_part_sads[PART_ID_NxN_TR][a] =
372             (a_nxn_sad[2] + a_nxn_sad[3] + a_nxn_sad[6] + a_nxn_sad[7]);
373         pp_part_sads[PART_ID_NxN_BL][a] =
374             (a_nxn_sad[8] + a_nxn_sad[9] + a_nxn_sad[12] + a_nxn_sad[13]);
375         pp_part_sads[PART_ID_NxN_BR][a] =
376             (a_nxn_sad[10] + a_nxn_sad[11] + a_nxn_sad[14] + a_nxn_sad[15]);
377         pp_part_sads[PART_ID_Nx2N_L][a] =
378             pp_part_sads[PART_ID_NxN_TL][a] + pp_part_sads[PART_ID_NxN_BL][a];
379         pp_part_sads[PART_ID_Nx2N_R][a] =
380             pp_part_sads[PART_ID_NxN_TR][a] + pp_part_sads[PART_ID_NxN_BR][a];
381         pp_part_sads[PART_ID_2NxN_T][a] =
382             pp_part_sads[PART_ID_NxN_TR][a] + pp_part_sads[PART_ID_NxN_TL][a];
383         pp_part_sads[PART_ID_2NxN_B][a] =
384             pp_part_sads[PART_ID_NxN_BR][a] + pp_part_sads[PART_ID_NxN_BL][a];
385         pp_part_sads[PART_ID_nLx2N_L][a] =
386             (a_nxn_sad[8] + a_nxn_sad[0] + a_nxn_sad[12] + a_nxn_sad[4]);
387         pp_part_sads[PART_ID_nRx2N_R][a] =
388             (a_nxn_sad[3] + a_nxn_sad[7] + a_nxn_sad[15] + a_nxn_sad[11]);
389         pp_part_sads[PART_ID_2NxnU_T][a] =
390             (a_nxn_sad[1] + a_nxn_sad[0] + a_nxn_sad[2] + a_nxn_sad[3]);
391         pp_part_sads[PART_ID_2NxnD_B][a] =
392             (a_nxn_sad[15] + a_nxn_sad[14] + a_nxn_sad[12] + a_nxn_sad[13]);
393         pp_part_sads[PART_ID_2Nx2N][a] =
394             pp_part_sads[PART_ID_2NxN_T][a] + pp_part_sads[PART_ID_2NxN_B][a];
395         pp_part_sads[PART_ID_2NxnU_B][a] =
396             pp_part_sads[PART_ID_2Nx2N][a] - pp_part_sads[PART_ID_2NxnU_T][a];
397         pp_part_sads[PART_ID_2NxnD_T][a] =
398             pp_part_sads[PART_ID_2Nx2N][a] - pp_part_sads[PART_ID_2NxnD_B][a];
399         pp_part_sads[PART_ID_nRx2N_L][a] =
400             pp_part_sads[PART_ID_2Nx2N][a] - pp_part_sads[PART_ID_nRx2N_R][a];
401         pp_part_sads[PART_ID_nLx2N_R][a] =
402             pp_part_sads[PART_ID_2Nx2N][a] - pp_part_sads[PART_ID_nLx2N_L][a];
403     }
404 }
405 
/**
********************************************************************************
*  @fn     hme_evalsad_grid_pu_16x16(err_prms_t *ps_prms)
*
*  @brief  Wraps the error params into a single-grid ctxt (zero mv, ref idx
*          0, same step in x and y), calls compute_4x4_sads_for_16x16_blk and
*          widens the resulting 16 bit partition SADs into pi4_sad_grid
*
*  @param[in,out]  ps_prms : Error params. pu1_inp, i4_inp_stride, pu1_ref,
*                        i4_ref_stride, i4_step and i4_grid_mask are read;
*                        pi4_sad_grid receives TOT_NUM_PARTS * (number of
*                        candidates) entries, the entries of partition p
*                        starting at p * (number of candidates)
*
*  @return   None
********************************************************************************
*/
void hme_evalsad_grid_pu_16x16(err_prms_t *ps_prms)
{
    grid_ctxt_t s_grid_ctxt;
    cand_t as_cands[9];
    U16 au2_part_sads[TOT_NUM_PARTS * 9];
    U16 *apu2_part_rows[TOT_NUM_PARTS];
    hme_mv_t s_center_mv = { 0, 0 };
    S32 i4_center_ref_idx = 0;
    S32 i4_num_cands = 0;
    S32 idx;

    /* Describe the request as one grid centred on pu1_ref */
    s_grid_ctxt.num_grids = 1;
    s_grid_ctxt.ref_buf_stride = ps_prms->i4_ref_stride;
    s_grid_ctxt.grd_sz_y_x = ((ps_prms->i4_step << 16) | ps_prms->i4_step);
    s_grid_ctxt.ppu1_ref_ptr = &ps_prms->pu1_ref;
    s_grid_ctxt.pi4_grd_mask = &ps_prms->i4_grid_mask;
    s_grid_ctxt.p_mv = &s_center_mv;
    s_grid_ctxt.p_ref_idx = &i4_center_ref_idx;

    /* The number of enabled points fixes the length of each partition row */
    for(idx = 0; idx < 9; idx++)
    {
        i4_num_cands += ((s_grid_ctxt.pi4_grd_mask[0] & (1 << idx)) != 0);
    }

    /* Carve the flat scratch buffer into one row per partition */
    for(idx = 0; idx < TOT_NUM_PARTS; idx++)
    {
        apu2_part_rows[idx] = &au2_part_sads[idx * i4_num_cands];
    }

    /* i4_num_cands is rewritten by the callee with the count it processed */
    compute_4x4_sads_for_16x16_blk(
        &s_grid_ctxt,
        ps_prms->pu1_inp,
        ps_prms->i4_inp_stride,
        apu2_part_rows,
        as_cands,
        &i4_num_cands);

    /* Widen the 16 bit results into the caller's SAD grid */
    for(idx = 0; idx < TOT_NUM_PARTS * i4_num_cands; idx++)
    {
        ps_prms->pi4_sad_grid[idx] = au2_part_sads[idx];
    }
}
438 
/**
********************************************************************************
*  @fn     hme_evalsad_grid_npu_MxN(err_prms_t *ps_prms)
*
*  @brief  Evaluates the SAD of an i4_blk_wd x i4_blk_ht block at every
*          enabled point of a grid of up to 9 points around pu1_ref, without
*          partial updates
*
*  @param[in,out]  ps_prms : Error params. i4_grid_mask selects the points,
*                        i4_step is the grid spacing and i4_part_mask must
*                        have at most one bit set. The SADs are written
*                        consecutively to pi4_sad_grid starting at offset
*                        pi4_valid_part_ids[0] * (number of enabled points)
*
*  @return   None
********************************************************************************
*/
void hme_evalsad_grid_npu_MxN(err_prms_t *ps_prms)
{
    U08 *pu1_src_base, *pu1_ref_center;
    S32 *pi4_out = ps_prms->pi4_sad_grid;
    S32 pt_id, num_active_pts = 0;
    S32 i4_step = ps_prms->i4_step;
    /* Displacement of one grid step: horizontally, and vertically in bytes */
    S32 i4_dx = i4_step;
    S32 i4_dy = i4_step * ps_prms->i4_ref_stride;

    ASSERT((ps_prms->i4_part_mask & (ps_prms->i4_part_mask - 1)) == 0);

    for(pt_id = 0; pt_id < 9; pt_id++)
    {
        num_active_pts += ((ps_prms->i4_grid_mask & (1 << pt_id)) != 0);
    }

    /* Skip to the row that belongs to the (single) valid partition */
    pi4_out += (ps_prms->pi4_valid_part_ids[0] * num_active_pts);

    pu1_src_base = ps_prms->pu1_inp;
    pu1_ref_center = ps_prms->pu1_ref;

    for(pt_id = 0; pt_id < 9; pt_id++)
    {
        if(ps_prms->i4_grid_mask & (1 << pt_id))
        {
            S32 i4_sad = 0;
            S32 row, col;
            U08 *pu1_src_row = pu1_src_base;
            U08 *pu1_ref_row = pu1_ref_center + i4_dx * gai1_grid_id_to_x[pt_id] +
                               i4_dy * gai1_grid_id_to_y[pt_id];

            for(row = 0; row < ps_prms->i4_blk_ht; row++)
            {
                for(col = 0; col < ps_prms->i4_blk_wd; col++)
                {
                    i4_sad += (ABS((pu1_src_row[col] - pu1_ref_row[col])));
                }
                pu1_src_row += ps_prms->i4_inp_stride;
                pu1_ref_row += ps_prms->i4_ref_stride;
            }

            *pi4_out++ = i4_sad;
        }
    }
}
483 
/**
********************************************************************************
*  @fn     hme_evalsad_pt_npu_MxN_8bit_compute(WORD32 ht,
*                                             WORD32 wd,
*                                             UWORD8 *pu1_inp,
*                                             UWORD8 *pu1_ref,
*                                             WORD32 i4_inp_stride,
*                                             WORD32 i4_ref_stride)
*
*  @brief  Computes the sum of absolute differences between a wd x ht input
*          block and a reference block of the same size
*
*  @param[in]  ht : number of rows
*
*  @param[in]  wd : number of pixels per row
*
*  @param[in]  pu1_inp : top left of input block
*
*  @param[in]  pu1_ref : top left of reference block
*
*  @param[in]  i4_inp_stride : stride of input buffer
*
*  @param[in]  i4_ref_stride : stride of reference buffer
*
*  @return   The SAD; 0 when ht or wd is not positive
********************************************************************************
*/
WORD32 hme_evalsad_pt_npu_MxN_8bit_compute(
    WORD32 ht,
    WORD32 wd,
    UWORD8 *pu1_inp,
    UWORD8 *pu1_ref,
    WORD32 i4_inp_stride,
    WORD32 i4_ref_stride)
{
    WORD32 i4_acc = 0;
    WORD32 rows_left, col;

    for(rows_left = ht; rows_left > 0; rows_left--)
    {
        for(col = 0; col < wd; col++)
        {
            i4_acc += (ABS(((S32)pu1_inp[col] - (S32)pu1_ref[col])));
        }

        /* Step both buffers down by one row */
        pu1_inp += i4_inp_stride;
        pu1_ref += i4_ref_stride;
    }

    return i4_acc;
}
505 
/**
********************************************************************************
*  @fn     hme_evalsad_pt_npu_MxN_8bit(err_prms_t *ps_prms)
*
*  @brief  Single point SAD without partial updates: forwards the block
*          size, buffers and strides held in ps_prms to
*          hme_evalsad_pt_npu_MxN_8bit_compute
*
*  @param[in,out]  ps_prms : Error params; the SAD is stored in
*                        pi4_sad_grid[0]
*
*  @return   None
********************************************************************************
*/
void hme_evalsad_pt_npu_MxN_8bit(err_prms_t *ps_prms)
{
    ps_prms->pi4_sad_grid[0] = hme_evalsad_pt_npu_MxN_8bit_compute(
        ps_prms->i4_blk_ht,
        ps_prms->i4_blk_wd,
        ps_prms->pu1_inp,
        ps_prms->pu1_ref,
        ps_prms->i4_inp_stride,
        ps_prms->i4_ref_stride);
}
520 
/**
********************************************************************************
*  @fn     compute_satd_8bit(err_prms_t *ps_prms)
*
*  @brief  Accumulates the values returned by the HAD function pointers of
*          ps_cmn_utils_optimised_function_list over all tiles of the
*          i4_blk_wd x i4_blk_ht block. Blocks that fit in 16x16 are tiled
*          with pf_HAD_4x4_8bit, larger blocks with pf_HAD_8x8_8bit
*
*  @param[in,out]  ps_prms : Error params; the accumulated value is stored in
*                        pi4_sad_grid[0]
*
*  @return   None
********************************************************************************
*/
void compute_satd_8bit(err_prms_t *ps_prms)
{
    U08 *pu1_src_row = ps_prms->pu1_inp;
    U08 *pu1_pred_row = ps_prms->pu1_ref;
    S32 src_strd = ps_prms->i4_inp_stride;
    S32 pred_strd = ps_prms->i4_ref_stride;
    S32 blk_wd = ps_prms->i4_blk_wd;
    S32 blk_ht = ps_prms->i4_blk_ht;
    U32 u4_satd = 0;
    WORD32 col, row;

    if((blk_wd <= 0x10) && (blk_ht <= 0x10))
    {
        /* Block fits in 16x16: basic transform size is 4x4 */
        for(row = 0; row < blk_ht; row += 4)
        {
            for(col = 0; col < blk_wd; col += 4)
            {
                u4_satd += ps_prms->ps_cmn_utils_optimised_function_list->pf_HAD_4x4_8bit(
                    pu1_src_row + col, src_strd, pu1_pred_row + col, pred_strd, NULL, 1);
            }
            pu1_src_row += src_strd * 4;
            pu1_pred_row += pred_strd * 4;
        }
    }
    else
    {
        /* Either dimension exceeds 16: basic transform size is 8x8 */
        for(row = 0; row < blk_ht; row += 8)
        {
            for(col = 0; col < blk_wd; col += 8)
            {
                u4_satd += ps_prms->ps_cmn_utils_optimised_function_list->pf_HAD_8x8_8bit(
                    pu1_src_row + col, src_strd, pu1_pred_row + col, pred_strd, NULL, 1);
            }
            pu1_src_row += src_strd * 8;
            pu1_pred_row += pred_strd * 8;
        }
    }

    ps_prms->pi4_sad_grid[0] = (S32)u4_satd;
}
574 
/**
********************************************************************************
*  @fn     hme_init_pred_part(pred_ctxt_t *ps_pred_ctxt,
*                             search_node_t *ps_tl,
*                             search_node_t *ps_t,
*                             search_node_t *ps_tr,
*                             search_node_t *ps_l,
*                             search_node_t *ps_bl,
*                             search_node_t *ps_coloc,
*                             search_node_t *ps_zeromv,
*                             search_node_t **pps_proj_coloc,
*                             PART_ID_T e_part_id)
*
*  @brief  Records the candidate node pointers of one partition in the
*          as_pred_nodes entry of the pred ctxt selected by e_part_id
*
*  @param[in,out]  ps_pred_ctxt : pred ctxt whose entry is filled
*
*  @param[in]  ps_tl, ps_t, ps_tr, ps_l, ps_bl : top left, top, top right,
*                        left and bottom left nodes
*
*  @param[in]  ps_coloc : colocated node
*
*  @param[in]  ps_zeromv : zero mv node
*
*  @param[in]  pps_proj_coloc : projected colocated nodes
*
*  @param[in]  e_part_id : partition id, used as index into as_pred_nodes
*
*  @return   None
********************************************************************************
*/
void hme_init_pred_part(
    pred_ctxt_t *ps_pred_ctxt,
    search_node_t *ps_tl,
    search_node_t *ps_t,
    search_node_t *ps_tr,
    search_node_t *ps_l,
    search_node_t *ps_bl,
    search_node_t *ps_coloc,
    search_node_t *ps_zeromv,
    search_node_t **pps_proj_coloc,
    PART_ID_T e_part_id)
{
    pred_candt_nodes_t *ps_nodes = &ps_pred_ctxt->as_pred_nodes[e_part_id];

    /* Row above the partition */
    ps_nodes->ps_tl = ps_tl;
    ps_nodes->ps_t = ps_t;
    ps_nodes->ps_tr = ps_tr;

    /* Column left of the partition */
    ps_nodes->ps_l = ps_l;
    ps_nodes->ps_bl = ps_bl;

    /* Remaining candidates */
    ps_nodes->ps_coloc = ps_coloc;
    ps_nodes->ps_zeromv = ps_zeromv;
    ps_nodes->pps_proj_coloc = pps_proj_coloc;
}
600 
/**
********************************************************************************
*  @fn     hme_init_pred_ctxt_no_encode(pred_ctxt_t *ps_pred_ctxt,
*                                       search_results_t *ps_search_results,
*                                       search_node_t *ps_top_candts,
*                                       search_node_t *ps_left_candts,
*                                       search_node_t **pps_proj_coloc_candts,
*                                       search_node_t *ps_coloc_candts,
*                                       search_node_t *ps_zeromv_candt,
*                                       S32 pred_lx,
*                                       S32 lambda,
*                                       S32 lambda_q_shift,
*                                       U08 **ppu1_ref_bits_tlu,
*                                       S16 *pi2_ref_scf)
*
*  @brief  Initialises the pred ctxt for the no encode case: stores the cost
*          parameters and sets up the neighbour candidates of the 8x8 blk
*          (PART_ID_2Nx2N) and of its four 4x4 sub blks (PART_ID_NxN_*)
*
*  @param[out] ps_pred_ctxt : pred ctxt to initialise
*
*  @param[in]  ps_search_results : its aps_part_results[pred_lx][] entries of
*                        the NxN partitions serve as neighbours of the
*                        later sub blks
*
*  @param[in]  ps_top_candts : top row candidates, indexed here as
*                        [0] TL, [1] T, [2] T1, [3] TR
*
*  @param[in]  ps_left_candts : left column candidates, indexed here as
*                        [0] L, [1] L1, [2] BL. Entry [2] must be
*                        unavailable; it doubles as the invalid node
*
*  @param[in]  pps_proj_coloc_candts : projected colocated candidates
*
*  @param[in]  ps_coloc_candts : colocated candidate
*
*  @param[in]  ps_zeromv_candt : zero mv candidate
*
*  @param[in]  pred_lx : prediction direction (PRED_L0/PRED_L1)
*
*  @param[in]  lambda, lambda_q_shift : stored as is in the pred ctxt
*
*  @param[in]  ppu1_ref_bits_tlu, pi2_ref_scf : stored as is in the pred ctxt
*
*  @return   None
********************************************************************************
*/
void hme_init_pred_ctxt_no_encode(
    pred_ctxt_t *ps_pred_ctxt,
    search_results_t *ps_search_results,
    search_node_t *ps_top_candts,
    search_node_t *ps_left_candts,
    search_node_t **pps_proj_coloc_candts,
    search_node_t *ps_coloc_candts,
    search_node_t *ps_zeromv_candt,
    S32 pred_lx,
    S32 lambda,
    S32 lambda_q_shift,
    U08 **ppu1_ref_bits_tlu,
    S16 *pi2_ref_scf)
{
    search_node_t *ps_invalid;
    search_node_t *ps_b0, *ps_b1, *ps_b2;

    /* mv_pel starts out as 0. NOTE(review): the notes that used to sit on   */
    /* this line disagreed on the meaning (one said subpel, the other FPEL); */
    /* confirm against the users of mv_pel.                                  */
    ps_pred_ctxt->mv_pel = 0;

    /* lambda and pred_lx (PRED_L0/PRED_L1) */
    ps_pred_ctxt->lambda = lambda;
    ps_pred_ctxt->lambda_q_shift = lambda_q_shift;
    ps_pred_ctxt->pred_lx = pred_lx;
    ps_pred_ctxt->ppu1_ref_bits_tlu = ppu1_ref_bits_tlu;
    ps_pred_ctxt->pi2_ref_scf = pi2_ref_scf;
    ps_pred_ctxt->proj_used = 0;

    /* Bottom left should not be valid */
    ASSERT(ps_left_candts[2].u1_is_avail == 0);
    ps_invalid = &ps_left_candts[2];

    /*************************************************************************/
    /* for the case of no encode, the idea is to set up candts as follows    */
    /*                                                                       */
    /*    ____ ______________                                                */
    /*   | TL | T  | T1 | TR |                                               */
    /*   |____|____|____|____|                                               */
    /*   | L  | b0 | b1 |                                                    */
    /*   |____|____|____|                                                    */
    /*   | L1 | b2 | b3 |                                                    */
    /*   |____|____|____|                                                    */
    /*   | BL |                                                              */
    /*   |____|                                                              */
    /*                                                                       */
    /*  If use_4x4 is 0, then b0,b1,b2,b3 are single 8x8 blk. then T=T1      */
    /* and L=L1. topleft, top and topright are TL,T,TR respectively          */
    /* Left and bottom left is L and BL respectively.                        */
    /* If use_4x4 is 1: then the above holds true only for PARTID = 0 (8x8)  */
    /*  For the 4 subblocks (partids 4-7)                                    */
    /*                                                                       */
    /*  Block   Left   Top   Top Left   Top Right   Bottom Left             */
    /*    b0    L      T      TL          T1          L1                     */
    /*    b1    b0     T1     T           TR          BL(invalid)            */
    /*    b2    L1     b0     L           b1          BL (invalid)           */
    /*    b3    b2     b1     b0          BL(inv)     BL (inv)               */
    /*                                                                       */
    /* Note : For block b1, bottom left pts to b2, which is not yet ready    */
    /*  hence it is kept invalid and made to pt to BL. For block b3 top rt   */
    /* is invalid and hence made to pt to BL which is invalid.               */
    /* BL is invalid since it lies in a bottom left 8x8 blk and not yet ready*/
    /*************************************************************************/

    /* The colocated candidate passed in is shared by all five partitions    */
    /* TODO : replace incoming ps_coloc from global to genuine coloc         */

    /* Result nodes of the already initialised sub blks, used as neighbours  */
    ps_b0 = ps_search_results->aps_part_results[pred_lx][PART_ID_NxN_TL];
    ps_b1 = ps_search_results->aps_part_results[pred_lx][PART_ID_NxN_TR];
    ps_b2 = ps_search_results->aps_part_results[pred_lx][PART_ID_NxN_BL];

    /* 8x8 blk: TL, T1, TR on top; L1 on the left; bottom left invalid */
    hme_init_pred_part(
        ps_pred_ctxt,
        ps_top_candts, /* top left  */
        ps_top_candts + 2, /* top       */
        ps_top_candts + 3, /* top right */
        ps_left_candts + 1, /* left      */
        ps_invalid, /* bottom left */
        ps_coloc_candts,
        ps_zeromv_candt,
        pps_proj_coloc_candts,
        PART_ID_2Nx2N);

    /* 4x4 TL blk (b0): TL, T, T1 on top; L on the left; L1 bottom left */
    hme_init_pred_part(
        ps_pred_ctxt,
        ps_top_candts, /* top left  */
        ps_top_candts + 1, /* top       */
        ps_top_candts + 2, /* top right */
        ps_left_candts, /* left      */
        ps_left_candts + 1, /* bottom left */
        ps_coloc_candts,
        ps_zeromv_candt,
        pps_proj_coloc_candts,
        PART_ID_NxN_TL);

    /* 4x4 TR blk (b1): T, T1, TR on top; b0 on the left; bottom left invalid */
    hme_init_pred_part(
        ps_pred_ctxt,
        ps_top_candts + 1, /* top left  */
        ps_top_candts + 2, /* top       */
        ps_top_candts + 3, /* top right */
        ps_b0, /* left      */
        ps_invalid, /* bottom left */
        ps_coloc_candts,
        ps_zeromv_candt,
        pps_proj_coloc_candts,
        PART_ID_NxN_TR);

    /* 4x4 BL blk (b2): L, b0, b1 on top; L1 on the left; bottom left invalid */
    hme_init_pred_part(
        ps_pred_ctxt,
        ps_left_candts, /* top left  */
        ps_b0, /* top       */
        ps_b1, /* top right */
        ps_left_candts + 1, /* left      */
        ps_invalid, /* bottom left */
        ps_coloc_candts,
        ps_zeromv_candt,
        pps_proj_coloc_candts,
        PART_ID_NxN_BL);

    /* 4x4 BR blk (b3): b0, b1 on top; b2 on the left; top right and bottom */
    /* left invalid                                                         */
    hme_init_pred_part(
        ps_pred_ctxt,
        ps_b0, /* top left  */
        ps_b1, /* top       */
        ps_invalid, /* top right */
        ps_b2, /* left      */
        ps_invalid, /* bottom left */
        ps_coloc_candts,
        ps_zeromv_candt,
        pps_proj_coloc_candts,
        PART_ID_NxN_BR);
}
764 
/**
********************************************************************************
*  @fn     hme_init_pred_ctxt_encode(pred_ctxt_t *ps_pred_ctxt,
*                   search_results_t *ps_search_results,
*                   search_node_t *ps_coloc_candts,
*                   search_node_t *ps_zeromv_candt,
*                   mv_grid_t *ps_mv_grid,
*                   S32 pred_lx,
*                   S32 lambda,
*                   S32 lambda_q_shift,
*                   U08 **ppu1_ref_bits_tlu,
*                   S16 *pi2_ref_scf)
*
*  @brief  Fills the mv pred context of a CU: stores the cost parameters and
*          registers, for every partition id, the TL/T/TR/L/BL neighbour nodes
*          along with the coloc and zero mv candidates
*
*  @param[out] ps_pred_ctxt : mv pred context that gets initialised
*
*  @param[in]  ps_search_results : gives CU size, CU offsets and the partition
*              result nodes used as neighbours of later partitions of a type
*
*  @param[in]  ps_coloc_candts : coloc candidate shared by all partitions
*
*  @param[in]  ps_zeromv_candt : zero mv candidate shared by all partitions
*
*  @param[in]  ps_mv_grid : mv grid from which spatial neighbours are picked
*
*  @param[in]  pred_lx : prediction list, stored in ctxt and used to index the
*              partition results
*
*  @param[in]  lambda : stored in ctxt
*
*  @param[in]  lambda_q_shift : stored in ctxt
*
*  @param[in]  ppu1_ref_bits_tlu : ref bits lookup, stored in ctxt
*
*  @param[in]  pi2_ref_scf : ref scale factors, stored in ctxt
*
*  @return   None
********************************************************************************
*/
void hme_init_pred_ctxt_encode(
    pred_ctxt_t *ps_pred_ctxt,
    search_results_t *ps_search_results,
    search_node_t *ps_coloc_candts,
    search_node_t *ps_zeromv_candt,
    mv_grid_t *ps_mv_grid,
    S32 pred_lx,
    S32 lambda,
    S32 lambda_q_shift,
    U08 **ppu1_ref_bits_tlu,
    S16 *pi2_ref_scf)
{
    CU_SIZE_T e_cu_size = ps_search_results->e_cu_size;

    /* Stride of the mv grid */
    S32 i4_grid_stride = ps_mv_grid->i4_stride;

    /* Scaling applied to the partition attributes for this CU size */
    S32 i4_part_shift = e_cu_size;

    /* Shift taking a 4x4 unit coordinate to an index of the CU validity tables */
    S32 i4_cu_idx_shift = 1 + e_cu_size;

    /* Coordinates of the CU in 4x4 units */
    S32 i4_cu_x = ps_search_results->u1_x_off >> 2;
    S32 i4_cu_y = ps_search_results->u1_y_off >> 2;

    /* Top right and bottom left validity at CU level */
    S32 i4_cu_tr_valid = gau1_cu_tr_valid[i4_cu_y >> i4_cu_idx_shift][i4_cu_x >> i4_cu_idx_shift];
    S32 i4_cu_bl_valid = gau1_cu_bl_valid[i4_cu_y >> i4_cu_idx_shift][i4_cu_x >> i4_cu_idx_shift];

    /* Coloc always points to the fixed global candt */
    search_node_t *ps_coloc = ps_coloc_candts;

    search_node_t *ps_grid_cu_base, *ps_invalid;
    search_node_t *ps_tl, *ps_t, *ps_tr, *ps_l, *ps_bl;
    S32 part_id;

    /* Cost related parameters are stored as received */
    ps_pred_ctxt->pred_lx = pred_lx;
    ps_pred_ctxt->lambda = lambda;
    ps_pred_ctxt->lambda_q_shift = lambda_q_shift;
    ps_pred_ctxt->ppu1_ref_bits_tlu = ppu1_ref_bits_tlu;
    ps_pred_ctxt->pi2_ref_scf = pi2_ref_scf;
    ps_pred_ctxt->mv_pel = 0;
    ps_pred_ctxt->proj_used = 1;

    /* Base of the CU in the mv grid */
    ps_grid_cu_base = &ps_mv_grid->as_node[0];
    ps_grid_cu_base += (ps_mv_grid->i4_start_offset + i4_cu_x);
    ps_grid_cu_base += (i4_grid_stride * i4_cu_y);

    /* Node at the real bottom left of the grid; it is used wherever a      */
    /* neighbour has to be flagged as not usable                            */
    ps_invalid = &ps_mv_grid->as_node[0];
    ps_invalid += (i4_grid_stride * 17);

    /*************************************************************************/
    /* Neighbours of a partition, with (x, y) offsets in 4x4 units relative  */
    /* to the top left 4x4 of the partition (wd, ht = partition size):       */
    /*                                                                       */
    /*    ____ ______________ ____                                           */
    /*   | TL |         | T  | TR |        TL = (-1, -1)                     */
    /*   |____|_________|____|____|        T  = (wd - 1, -1)                 */
    /*   |    |              |             TR = (wd, -1)                     */
    /*   |____|   partition  |             L  = (-1, ht - 1)                 */
    /*   | L  |              |             BL = (-1, ht)                     */
    /*   |____|______________|                                               */
    /*   | BL |                                                              */
    /*   |____|                                                              */
    /*                                                                       */
    /*  TR and BL are replaced by the invalid node unless both the partition */
    /*  level and the CU level validity flags are set. Partitions other than */
    /*  the first of a type take some neighbours from the search results of  */
    /*  the earlier partitions of that type instead of from the grid.        */
    /*************************************************************************/
    for(part_id = 0; part_id < TOT_NUM_PARTS; part_id++)
    {
        part_attr_t *ps_part_attr = &gas_part_attr_in_cu[part_id];
        PART_TYPE_T e_part_type = ge_part_id_to_part_type[part_id];
        PART_ID_T first_part = ge_part_type_to_part_id[e_part_type][0];
        S32 is_vert = gau1_is_vert_part[e_part_type];
        S32 part_num = gau1_part_id_to_part_num[part_id];
        S32 tr_valid = gau1_partid_tr_valid[part_id] & i4_cu_tr_valid;
        S32 bl_valid = gau1_partid_bl_valid[part_id] & i4_cu_bl_valid;

        /* Partition start and size in 4x4 units */
        S32 part_start_x = (ps_part_attr->u1_x_start << i4_part_shift) >> 2;
        S32 part_start_y = (ps_part_attr->u1_y_start << i4_part_shift) >> 2;
        S32 part_wd = (ps_part_attr->u1_x_count << i4_part_shift) >> 2;
        S32 part_ht = (ps_part_attr->u1_y_count << i4_part_shift) >> 2;

        /* Top left of the partition in the grid */
        search_node_t *ps_grid_pu_base = ps_grid_cu_base + part_start_x;

        ps_grid_pu_base += (part_start_y * i4_grid_stride);

        ps_tl = ps_grid_pu_base - 1 - i4_grid_stride;
        ps_t = ps_grid_pu_base - i4_grid_stride + part_wd - 1;
        ps_l = ps_grid_pu_base - 1 + ((part_ht - 1) * i4_grid_stride);
        ps_tr = (tr_valid) ? (ps_t + 1) : ps_invalid;
        ps_bl = (bl_valid) ? (ps_l + i4_grid_stride) : ps_invalid;

        switch(part_num)
        {
        case 1:
            /* 2nd partition uses the result of the 1st one as a candt:  */
            /* as left candt for vertical types, as top candt otherwise  */
            if(is_vert)
            {
                ps_l = ps_search_results->aps_part_results[pred_lx][first_part];
            }
            else
            {
                ps_t = ps_search_results->aps_part_results[pred_lx][first_part];
            }
            break;

        case 2:
            /* only possible for NxN_BL */
            ps_t = ps_search_results->aps_part_results[pred_lx][PART_ID_NxN_TL];
            ps_tr = ps_search_results->aps_part_results[pred_lx][PART_ID_NxN_TR];
            break;

        case 3:
            /* only possible for NxN_BR */
            ps_t = ps_search_results->aps_part_results[pred_lx][PART_ID_NxN_TR];
            ps_tl = ps_search_results->aps_part_results[pred_lx][PART_ID_NxN_TL];
            ps_l = ps_search_results->aps_part_results[pred_lx][PART_ID_NxN_BL];
            break;

        default:
            /* first partition of a type: all neighbours come from the grid */
            break;
        }

        /* No projected coloc candidates are passed on this path */
        hme_init_pred_part(
            ps_pred_ctxt,
            ps_tl,
            ps_t,
            ps_tr,
            ps_l,
            ps_bl,
            ps_coloc,
            ps_zeromv_candt,
            NULL,
            (PART_ID_T)part_id);
    }
}
936 
937 /**
938 ********************************************************************************
939 *  @fn     compute_mv_cost_explicit(search_node_t *ps_node,
940 *                   pred_ctxt_t *ps_pred_ctxt,
941 *                   PART_ID_T e_part_id)
942 *
943 *  @brief  MV cost for explicit search in layers not encoded
944 *
945 *  @param[in]  ps_node: search node having mv and ref id for which to eval cost
946 *
947 *  @param[in]  ps_pred_ctxt : mv pred context
948 *
949 *  @param[in]  e_part_id : Partition id.
950 *
951 *  @return   Cost value
952 
953 ********************************************************************************
954 */
compute_mv_cost_explicit(search_node_t * ps_node,pred_ctxt_t * ps_pred_ctxt,PART_ID_T e_part_id,S32 inp_mv_pel)955 S32 compute_mv_cost_explicit(
956     search_node_t *ps_node, pred_ctxt_t *ps_pred_ctxt, PART_ID_T e_part_id, S32 inp_mv_pel)
957 {
958 #define RETURN_FIXED_COST 0
959     search_node_t *ps_pred_node_a = NULL, *ps_pred_node_b = NULL;
960     pred_candt_nodes_t *ps_pred_nodes;
961     S32 inp_shift = 2 - inp_mv_pel;
962     S32 pred_shift = 2 - ps_pred_ctxt->mv_pel;
963     S32 mv_p_x, mv_p_y;
964     S16 mvdx1, mvdx2, mvdy1, mvdy2;
965     S32 cost, ref_bits;
966 
967     /*************************************************************************/
968     /* Logic for cost computation for explicit search. For such a search,    */
969     /* it is guaranteed that all predictor candts have same ref id. The only */
970     /* probable issue is with the availability which needs checking. This fxn*/
971     /* does not suffer the need to scale predictor candts due to diff ref id */
972     /*************************************************************************/
973 
974     /* Hack: currently we always assume 2Nx2N. */
975     /* TODO: get rid of this hack and return cost tuned to each partition */
976     ps_pred_nodes = &ps_pred_ctxt->as_pred_nodes[e_part_id];
977     ref_bits = ps_pred_ctxt->ppu1_ref_bits_tlu[ps_pred_ctxt->pred_lx][ps_node->i1_ref_idx];
978 
979     /*************************************************************************/
980     /* Priority to bottom left availability. Else we go to left. If both are */
981     /* not available, then a remains null                                    */
982     /*************************************************************************/
983     if(ps_pred_nodes->ps_tl->u1_is_avail)
984         ps_pred_node_a = ps_pred_nodes->ps_tl;
985     else if(ps_pred_nodes->ps_l->u1_is_avail)
986         ps_pred_node_a = ps_pred_nodes->ps_l;
987 
988     /*************************************************************************/
989     /* For encoder, top left may not be really needed unless we use slices,  */
990     /* and even then in ME it may not be relevant. So we only consider T or  */
991     /* TR, as, if both T and TR are not available, TL also will not be       */
992     /*************************************************************************/
993     if(ps_pred_nodes->ps_tr->u1_is_avail)
994         ps_pred_node_b = ps_pred_nodes->ps_tr;
995     else if(ps_pred_nodes->ps_t->u1_is_avail)
996         ps_pred_node_b = ps_pred_nodes->ps_t;
997 
998     if(ps_pred_node_a == NULL)
999     {
1000         ps_pred_node_a = ps_pred_nodes->ps_coloc;
1001         if(ps_pred_node_b == NULL)
1002             ps_pred_node_b = ps_pred_nodes->ps_zeromv;
1003     }
1004     else if(ps_pred_node_b == NULL)
1005         ps_pred_node_b = ps_pred_nodes->ps_coloc;
1006     else if(0 == hme_cmp_nodes(ps_pred_node_a, ps_pred_node_b))
1007     {
1008         ps_pred_node_b = ps_pred_nodes->ps_coloc;
1009     }
1010 
1011     mv_p_x = ps_pred_node_a->s_mv.i2_mvx;
1012     mv_p_y = ps_pred_node_a->s_mv.i2_mvy;
1013     COMPUTE_DIFF_MV(mvdx1, mvdy1, ps_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
1014     mvdx1 = ABS(mvdx1);
1015     mvdy1 = ABS(mvdy1);
1016 
1017     mv_p_x = ps_pred_node_b->s_mv.i2_mvx;
1018     mv_p_y = ps_pred_node_b->s_mv.i2_mvy;
1019     COMPUTE_DIFF_MV(mvdx2, mvdy2, ps_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
1020     mvdx2 = ABS(mvdx2);
1021     mvdy2 = ABS(mvdy2);
1022 
1023     if((mvdx1 + mvdy1) < (mvdx2 + mvdy2))
1024     {
1025         cost =
1026             hme_get_range(mvdx1) + hme_get_range(mvdy1) + (mvdx1 > 0) + (mvdy1 > 0) + ref_bits + 2;
1027     }
1028     else
1029     {
1030         cost =
1031             hme_get_range(mvdx2) + hme_get_range(mvdy2) + (mvdx2 > 0) + (mvdy2 > 0) + ref_bits + 2;
1032     }
1033     {
1034         S32 rnd = 1 << (ps_pred_ctxt->lambda_q_shift - 1);
1035         return ((cost * ps_pred_ctxt->lambda + rnd) >> ps_pred_ctxt->lambda_q_shift);
1036     }
1037 }
1038 /**
1039 ********************************************************************************
1040 *  @fn     compute_mv_cost_coarse(search_node_t *ps_node,
1041 *                   pred_ctxt_t *ps_pred_ctxt,
1042 *                   PART_ID_T e_part_id)
1043 *
1044 *  @brief  MV cost for coarse explicit search in coarsest layer
1045 *
1046 *  @param[in]  ps_node: search node having mv and ref id for which to eval cost
1047 *
1048 *  @param[in]  ps_pred_ctxt : mv pred context
1049 *
1050 *  @param[in]  e_part_id : Partition id.
1051 *
1052 *  @return   Cost value
1053 
1054 ********************************************************************************
1055 */
compute_mv_cost_coarse(search_node_t * ps_node,pred_ctxt_t * ps_pred_ctxt,PART_ID_T e_part_id,S32 inp_mv_pel)1056 S32 compute_mv_cost_coarse(
1057     search_node_t *ps_node, pred_ctxt_t *ps_pred_ctxt, PART_ID_T e_part_id, S32 inp_mv_pel)
1058 {
1059     ARG_NOT_USED(e_part_id);
1060 
1061     return (compute_mv_cost_explicit(ps_node, ps_pred_ctxt, PART_ID_2Nx2N, inp_mv_pel));
1062 }
1063 
1064 /**
1065 ********************************************************************************
1066 *  @fn     compute_mv_cost_coarse_high_speed(search_node_t *ps_node,
1067 *                                            pred_ctxt_t *ps_pred_ctxt,
1068 *                                            PART_ID_T e_part_id)
1069 *
1070 *  @brief  MV cost for coarse explicit search in coarsest layer
1071 *
1072 *  @param[in]  ps_node: search node having mv and ref id for which to eval cost
1073 *
1074 *  @param[in]  ps_pred_ctxt : mv pred context
1075 *
1076 *  @param[in]  e_part_id : Partition id.
1077 *
1078 *  @return   Cost value
1079 
1080 ********************************************************************************
1081 */
compute_mv_cost_coarse_high_speed(search_node_t * ps_node,pred_ctxt_t * ps_pred_ctxt,PART_ID_T e_part_id,S32 inp_mv_pel)1082 S32 compute_mv_cost_coarse_high_speed(
1083     search_node_t *ps_node, pred_ctxt_t *ps_pred_ctxt, PART_ID_T e_part_id, S32 inp_mv_pel)
1084 {
1085     S32 rnd, mvx, mvy, i4_search_idx;
1086     S32 cost;
1087 
1088     mvx = ps_node->s_mv.i2_mvx;
1089     mvy = ps_node->s_mv.i2_mvy;
1090     i4_search_idx = ps_node->i1_ref_idx;
1091 
1092     cost = (2 * hme_get_range(ABS(mvx)) - 1) + (2 * hme_get_range(ABS(mvy)) - 1) + i4_search_idx;
1093     cost += (mvx != 0) ? 1 : 0;
1094     cost += (mvy != 0) ? 1 : 0;
1095     rnd = 1 << (ps_pred_ctxt->lambda_q_shift - 1);
1096     cost = (cost * ps_pred_ctxt->lambda + rnd) >> ps_pred_ctxt->lambda_q_shift;
1097     return cost;
1098 }
1099 
1100 /**
1101 ********************************************************************************
1102 *  @fn     compute_mv_cost_explicit_refine(search_node_t *ps_node,
1103 *                                          pred_ctxt_t *ps_pred_ctxt,
1104 *                                          PART_ID_T e_part_id)
1105 *
1106 *  @brief  MV cost for explicit search in layers not encoded. Always returns
1107 *          cost of the projected colocated candidate
1108 *
1109 *  @param[in]  ps_node: search node having mv and ref id for which to eval cost
1110 *
1111 *  @param[in]  ps_pred_ctxt : mv pred context
1112 *
1113 *  @param[in]  e_part_id : Partition id.
1114 *
1115 *  @return   Cost value
1116 
1117 ********************************************************************************
1118 */
compute_mv_cost_explicit_refine(search_node_t * ps_node,pred_ctxt_t * ps_pred_ctxt,PART_ID_T e_part_id,S32 inp_mv_pel)1119 S32 compute_mv_cost_explicit_refine(
1120     search_node_t *ps_node, pred_ctxt_t *ps_pred_ctxt, PART_ID_T e_part_id, S32 inp_mv_pel)
1121 {
1122     search_node_t *ps_pred_node_a = NULL;
1123     pred_candt_nodes_t *ps_pred_nodes;
1124     S32 inp_shift = 2 - inp_mv_pel;
1125     S32 pred_shift = 2 - ps_pred_ctxt->mv_pel;
1126     S32 mv_p_x, mv_p_y;
1127     S16 mvdx1, mvdy1;
1128     S32 cost, ref_bits;
1129 
1130     ps_pred_nodes = &ps_pred_ctxt->as_pred_nodes[e_part_id];
1131     ref_bits = ps_pred_ctxt->ppu1_ref_bits_tlu[ps_pred_ctxt->pred_lx][ps_node->i1_ref_idx];
1132 
1133     ps_pred_node_a = ps_pred_nodes->pps_proj_coloc[0];
1134 
1135     mv_p_x = ps_pred_node_a->s_mv.i2_mvx;
1136     mv_p_y = ps_pred_node_a->s_mv.i2_mvy;
1137     COMPUTE_DIFF_MV(mvdx1, mvdy1, ps_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
1138     mvdx1 = ABS(mvdx1);
1139     mvdy1 = ABS(mvdy1);
1140 
1141     cost = hme_get_range(mvdx1) + hme_get_range(mvdy1) + (mvdx1 > 0) + (mvdy1 > 0) + ref_bits + 2;
1142 
1143     {
1144         S32 rnd = 1 << (ps_pred_ctxt->lambda_q_shift - 1);
1145         return ((cost * ps_pred_ctxt->lambda + rnd) >> ps_pred_ctxt->lambda_q_shift);
1146     }
1147 }
1148 
1149 /**
1150 ********************************************************************************
1151 *  @fn     compute_mv_cost_refine(search_node_t *ps_node,
1152 *                   pred_ctxt_t *ps_pred_ctxt,
1153 *                   PART_ID_T e_part_id)
1154 *
1155 *  @brief  MV cost for coarse explicit search in coarsest layer
1156 *
1157 *  @param[in]  ps_node: search node having mv and ref id for which to eval cost
1158 *
1159 *  @param[in]  ps_pred_ctxt : mv pred context
1160 *
1161 *  @param[in]  e_part_id : Partition id.
1162 *
1163 *  @return   Cost value
1164 
1165 ********************************************************************************
1166 */
compute_mv_cost_refine(search_node_t * ps_node,pred_ctxt_t * ps_pred_ctxt,PART_ID_T e_part_id,S32 inp_mv_pel)1167 S32 compute_mv_cost_refine(
1168     search_node_t *ps_node, pred_ctxt_t *ps_pred_ctxt, PART_ID_T e_part_id, S32 inp_mv_pel)
1169 {
1170     return (compute_mv_cost_explicit_refine(ps_node, ps_pred_ctxt, e_part_id, inp_mv_pel));
1171 }
1172 
/**
********************************************************************************
*  @fn     compute_mv_cost_implicit(search_node_t *ps_node,
*                   pred_ctxt_t *ps_pred_ctxt,
*                   PART_ID_T e_part_id,
*                   S32 inp_mv_pel)
*
*  @brief  MV cost w.r.t. two predictors picked from the spatial neighbours of
*          the partition. Predictors whose ref idx differs from that of the
*          node are scaled through SCALE_FOR_POC_DELTA. Partition bits are
*          added to the cost
*
*  @param[in]  ps_node: search node having mv and ref id for which to eval cost
*
*  @param[in]  ps_pred_ctxt : mv pred context
*
*  @param[in]  e_part_id : Partition id.
*
*  @param[in]  inp_mv_pel : the mv of ps_node is shifted by (2 - inp_mv_pel)
*              before it is compared against the predictors
*
*  @return   Cost value
*
********************************************************************************
*/
S32 compute_mv_cost_implicit(
    search_node_t *ps_node, pred_ctxt_t *ps_pred_ctxt, PART_ID_T e_part_id, S32 inp_mv_pel)
{
    pred_candt_nodes_t *ps_pred_nodes = &ps_pred_ctxt->as_pred_nodes[e_part_id];
    search_node_t *ps_pred_node_a = NULL, *ps_pred_node_b = NULL;
    S08 i1_ref_idx = ps_node->i1_ref_idx;

    /* Ref idx of each neighbour, left at -1 when the neighbour is not available */
    S08 i1_ref_bl = -1, i1_ref_l = -1;
    S08 i1_ref_tr = -1, i1_ref_t = -1, i1_ref_tl = -1;

    S32 inp_shift = 2 - inp_mv_pel;
    S32 pred_shift;
    S32 ref_bits = ps_pred_ctxt->ppu1_ref_bits_tlu[ps_pred_ctxt->pred_lx][i1_ref_idx];
    S32 mv_p_x, mv_p_y;
    S16 mvdx1, mvdx2, mvdy1, mvdy2;
    S16 mvdx_min, mvdy_min;
    S32 cost, rnd, tot_cost;

    /*************************************************************************/
    /* Candidate A comes from {BL, L}. First preference is a neighbour with  */
    /* the same ref idx as the node (BL before L). Failing that, the first   */
    /* neighbour whose recorded ref idx is not -1 (BL before L) is taken.    */
    /*************************************************************************/
    if(ps_pred_nodes->ps_bl->u1_is_avail)
    {
        i1_ref_bl = ps_pred_nodes->ps_bl->i1_ref_idx;
    }
    if(ps_pred_nodes->ps_l->u1_is_avail)
    {
        i1_ref_l = ps_pred_nodes->ps_l->i1_ref_idx;
    }

    if(i1_ref_bl == i1_ref_idx)
    {
        ps_pred_node_a = ps_pred_nodes->ps_bl;
    }
    else if(i1_ref_l == i1_ref_idx)
    {
        ps_pred_node_a = ps_pred_nodes->ps_l;
    }
    else if(i1_ref_bl != -1)
    {
        ps_pred_node_a = ps_pred_nodes->ps_bl;
    }
    else if(i1_ref_l != -1)
    {
        ps_pred_node_a = ps_pred_nodes->ps_l;
    }

    /*************************************************************************/
    /* Candidate B comes from {TR, T, TL} with the same two step rule: same  */
    /* ref idx first, else first neighbour whose recorded ref idx is not -1, */
    /* both in the order TR, T, TL.                                          */
    /*************************************************************************/
    if(ps_pred_nodes->ps_tr->u1_is_avail)
    {
        i1_ref_tr = ps_pred_nodes->ps_tr->i1_ref_idx;
    }
    if(ps_pred_nodes->ps_t->u1_is_avail)
    {
        i1_ref_t = ps_pred_nodes->ps_t->i1_ref_idx;
    }
    if(ps_pred_nodes->ps_tl->u1_is_avail)
    {
        i1_ref_tl = ps_pred_nodes->ps_tl->i1_ref_idx;
    }

    if(i1_ref_tr == i1_ref_idx)
    {
        ps_pred_node_b = ps_pred_nodes->ps_tr;
    }
    else if(i1_ref_t == i1_ref_idx)
    {
        ps_pred_node_b = ps_pred_nodes->ps_t;
    }
    else if(i1_ref_tl == i1_ref_idx)
    {
        ps_pred_node_b = ps_pred_nodes->ps_tl;
    }
    else if(i1_ref_tr != -1)
    {
        ps_pred_node_b = ps_pred_nodes->ps_tr;
    }
    else if(i1_ref_t != -1)
    {
        ps_pred_node_b = ps_pred_nodes->ps_t;
    }
    else if(i1_ref_tl != -1)
    {
        ps_pred_node_b = ps_pred_nodes->ps_tl;
    }

    /*************************************************************************/
    /* Fill the gaps: a missing A becomes coloc (and a missing B then becomes*/
    /* zero mv). With A present, B becomes coloc when it is missing or when  */
    /* hme_cmp_nodes() returns 0 for the pair.                               */
    /*************************************************************************/
    if(NULL == ps_pred_node_a)
    {
        ps_pred_node_a = ps_pred_nodes->ps_coloc;
        if(NULL == ps_pred_node_b)
        {
            ps_pred_node_b = ps_pred_nodes->ps_zeromv;
        }
    }
    else if((NULL == ps_pred_node_b) || (0 == hme_cmp_nodes(ps_pred_node_a, ps_pred_node_b)))
    {
        ps_pred_node_b = ps_pred_nodes->ps_coloc;
    }

    /* Absolute mv difference w.r.t. candidate A, scaled if ref idx differs */
    if(ps_pred_node_a->i1_ref_idx == i1_ref_idx)
    {
        mv_p_x = ps_pred_node_a->s_mv.i2_mvx;
        mv_p_y = ps_pred_node_a->s_mv.i2_mvy;
    }
    else
    {
        SCALE_FOR_POC_DELTA(mv_p_x, mv_p_y, ps_pred_node_a, i1_ref_idx, ps_pred_ctxt->pi2_ref_scf);
    }
    /* Predictor shift depends on whether subpel was done on the predictor */
    pred_shift = ps_pred_node_a->u1_subpel_done ? 0 : 2;
    COMPUTE_DIFF_MV(mvdx1, mvdy1, ps_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
    mvdx1 = ABS(mvdx1);
    mvdy1 = ABS(mvdy1);

    /* Absolute mv difference w.r.t. candidate B, scaled if ref idx differs */
    if(ps_pred_node_b->i1_ref_idx == i1_ref_idx)
    {
        mv_p_x = ps_pred_node_b->s_mv.i2_mvx;
        mv_p_y = ps_pred_node_b->s_mv.i2_mvy;
    }
    else
    {
        SCALE_FOR_POC_DELTA(mv_p_x, mv_p_y, ps_pred_node_b, i1_ref_idx, ps_pred_ctxt->pi2_ref_scf);
    }
    pred_shift = ps_pred_node_b->u1_subpel_done ? 0 : 2;
    COMPUTE_DIFF_MV(mvdx2, mvdy2, ps_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
    mvdx2 = ABS(mvdx2);
    mvdy2 = ABS(mvdy2);

    /* Pick the predictor with the strictly smaller sum; ties go to B */
    if((mvdx1 + mvdy1) < (mvdx2 + mvdy2))
    {
        mvdx_min = mvdx1;
        mvdy_min = mvdy1;
    }
    else
    {
        mvdx_min = mvdx2;
        mvdy_min = mvdy2;
    }

    cost = 2 * hme_get_range(mvdx_min) + 2 * hme_get_range(mvdy_min) + 2 * (mvdx_min > 0) +
           2 * (mvdy_min > 0) + ref_bits + 2;

    /* Part bits in Q1, so evaluate cost as ((mv_cost<<1) + partbitsQ1 + rnd)>>(q+1) */
    rnd = 1 << (ps_pred_ctxt->lambda_q_shift);
    tot_cost = (cost * ps_pred_ctxt->lambda) << 1;
    tot_cost += (gau1_bits_for_part_id_q1[e_part_id] * ps_pred_ctxt->lambda);

    return ((tot_cost + rnd) >> (ps_pred_ctxt->lambda_q_shift + 1));
}
1307 
/**
********************************************************************************
*  @fn     compute_mv_cost_implicit_high_speed(search_node_t *ps_node,
*                   pred_ctxt_t *ps_pred_ctxt,
*                   PART_ID_T e_part_id,
*                   S32 inp_mv_pel)
*
*  @brief  Reduced complexity MV cost w.r.t. two predictors: the left
*          neighbour and either the top right neighbour or the coloc candt.
*          Predictors whose ref idx differs from that of the node are scaled
*          through SCALE_FOR_POC_DELTA. No partition bits are added
*
*  @param[in]  ps_node: search node having mv and ref id for which to eval cost
*
*  @param[in]  ps_pred_ctxt : mv pred context
*
*  @param[in]  e_part_id : Partition id.
*
*  @param[in]  inp_mv_pel : the mv of ps_node is shifted by (2 - inp_mv_pel)
*              before it is compared against the predictors
*
*  @return   Cost value
*
********************************************************************************
*/
S32 compute_mv_cost_implicit_high_speed(
    search_node_t *ps_node, pred_ctxt_t *ps_pred_ctxt, PART_ID_T e_part_id, S32 inp_mv_pel)
{
    search_node_t *ps_pred_node_a = NULL, *ps_pred_node_b = NULL;
    pred_candt_nodes_t *ps_pred_nodes;
    S08 i1_ref_idx;
    S32 inp_shift = 2 - inp_mv_pel;
    S32 pred_shift; /* derived per predictor from u1_subpel_done */
    S32 ref_bits, cost;
    S32 mv_p_x, mv_p_y;
    S16 mvdx1, mvdx2, mvdy1, mvdy2;

    i1_ref_idx = ps_node->i1_ref_idx;

    ps_pred_nodes = &ps_pred_ctxt->as_pred_nodes[e_part_id];
    ref_bits = ps_pred_ctxt->ppu1_ref_bits_tlu[ps_pred_ctxt->pred_lx][i1_ref_idx];

    /*************************************************************************/
    /* Candidate A is the left neighbour when it is available. Bottom left   */
    /* is not looked at in this variant. Else A remains null for now.        */
    /*************************************************************************/
    if(ps_pred_nodes->ps_l->u1_is_avail)
    {
        ps_pred_node_a = ps_pred_nodes->ps_l;
    }

    /*************************************************************************/
    /* Candidate B is the top right neighbour only when projected candts are */
    /* not in use and top right is available. In every other case B is the   */
    /* coloc candt. Top and top left are not looked at in this variant.      */
    /*************************************************************************/
    if((!(ps_pred_ctxt->proj_used) && (ps_pred_nodes->ps_tr->u1_is_avail)))
    {
        ps_pred_node_b = ps_pred_nodes->ps_tr;
    }
    else
    {
        ps_pred_node_b = ps_pred_nodes->ps_coloc;
    }

    /* A missing A becomes coloc. If B was coloc as well, B moves to zero mv */
    /* so that the two predictors are not the same node                      */
    if(ps_pred_node_a == NULL)
    {
        ps_pred_node_a = ps_pred_nodes->ps_coloc;

        if(ps_pred_node_b == ps_pred_nodes->ps_coloc)
            ps_pred_node_b = ps_pred_nodes->ps_zeromv;
    }

    /* Absolute mv difference w.r.t. candidate A, scaled if ref idx differs */
    if(ps_pred_node_a->i1_ref_idx != i1_ref_idx)
    {
        SCALE_FOR_POC_DELTA(mv_p_x, mv_p_y, ps_pred_node_a, i1_ref_idx, ps_pred_ctxt->pi2_ref_scf);
    }
    else
    {
        mv_p_x = ps_pred_node_a->s_mv.i2_mvx;
        mv_p_y = ps_pred_node_a->s_mv.i2_mvy;
    }

    pred_shift = ps_pred_node_a->u1_subpel_done ? 0 : 2;
    COMPUTE_DIFF_MV(mvdx1, mvdy1, ps_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
    mvdx1 = ABS(mvdx1);
    mvdy1 = ABS(mvdy1);

    /* Absolute mv difference w.r.t. candidate B, scaled if ref idx differs */
    if(ps_pred_node_b->i1_ref_idx != i1_ref_idx)
    {
        SCALE_FOR_POC_DELTA(mv_p_x, mv_p_y, ps_pred_node_b, i1_ref_idx, ps_pred_ctxt->pi2_ref_scf);
    }
    else
    {
        mv_p_x = ps_pred_node_b->s_mv.i2_mvx;
        mv_p_y = ps_pred_node_b->s_mv.i2_mvy;
    }

    pred_shift = ps_pred_node_b->u1_subpel_done ? 0 : 2;
    COMPUTE_DIFF_MV(mvdx2, mvdy2, ps_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
    mvdx2 = ABS(mvdx2);
    mvdy2 = ABS(mvdy2);

    /* Cost is taken w.r.t. the predictor with the strictly smaller sum of   */
    /* absolute differences; ties go to candidate B                          */
    if((mvdx1 + mvdy1) < (mvdx2 + mvdy2))
    {
        cost =
            hme_get_range(mvdx1) + hme_get_range(mvdy1) + (mvdx1 > 0) + (mvdy1 > 0) + ref_bits + 2;
    }
    else
    {
        cost =
            hme_get_range(mvdx2) + hme_get_range(mvdy2) + (mvdx2 > 0) + (mvdy2 > 0) + ref_bits + 2;
    }
    {
        /* Weigh by lambda and round off the Q format of lambda. Unlike      */
        /* compute_mv_cost_implicit(), no partition bits enter the cost here */
        S32 rnd = 1 << (ps_pred_ctxt->lambda_q_shift - 1);
        S32 tot_cost = (cost * ps_pred_ctxt->lambda);

        return ((tot_cost + rnd) >> (ps_pred_ctxt->lambda_q_shift));
    }
}
1409 
/**
********************************************************************************
*  @fn     compute_mv_cost_implicit_high_speed_modified(search_node_t *ps_node,
*                   pred_ctxt_t *ps_pred_ctxt,
*                   PART_ID_T e_part_id,
*                   S32 inp_mv_pel)
*
*  @brief  MV cost w.r.t. a single predictor, the mvp node stored in the
*          pred context for the given partition
*
*  @param[in]  ps_node: search node having mv and ref id for which to eval cost
*
*  @param[in]  ps_pred_ctxt : mv pred context
*
*  @param[in]  e_part_id : Partition id.
*
*  @param[in]  inp_mv_pel : the mv of ps_node is shifted by (2 - inp_mv_pel)
*              before it is compared against the predictor
*
*  @return   Cost value
*
********************************************************************************
*/
S32 compute_mv_cost_implicit_high_speed_modified(
    search_node_t *ps_node, pred_ctxt_t *ps_pred_ctxt, PART_ID_T e_part_id, S32 inp_mv_pel)
{
    pred_candt_nodes_t *ps_pred_nodes = &ps_pred_ctxt->as_pred_nodes[e_part_id];
    S32 ref_bits = ps_pred_ctxt->ppu1_ref_bits_tlu[ps_pred_ctxt->pred_lx][ps_node->i1_ref_idx];

    /* The only predictor looked at is the mvp node of the partition */
    search_node_t *ps_pred_node_a = ps_pred_nodes->ps_mvp_node;

    S32 inp_shift = 2 - inp_mv_pel;
    S32 mv_p_x = ps_pred_node_a->s_mv.i2_mvx;
    S32 mv_p_y = ps_pred_node_a->s_mv.i2_mvy;

    /* Predictor shift depends on whether subpel was done on the predictor */
    S32 pred_shift = ps_pred_node_a->u1_subpel_done ? 0 : 2;

    S16 mvdx1, mvdy1;
    S32 cost, rnd;

    /* Absolute mv difference w.r.t. the predictor */
    COMPUTE_DIFF_MV(mvdx1, mvdy1, ps_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
    mvdx1 = ABS(mvdx1);
    mvdy1 = ABS(mvdy1);

    cost = hme_get_range(mvdx1) + hme_get_range(mvdy1) + (mvdx1 > 0) + (mvdy1 > 0) + ref_bits + 2;

    /* Weigh by lambda and round off the Q format of lambda */
    rnd = 1 << (ps_pred_ctxt->lambda_q_shift - 1);
    return ((cost * ps_pred_ctxt->lambda + rnd) >> ps_pred_ctxt->lambda_q_shift);
}
1440 
/**
********************************************************************************
*  @fn     hme_update_results_grid_pu_bestn_xtreme_speed(ps_result_prms)
*
*  @brief  Grid result update restricted to the first entry of the valid part
*          id list. Every active grid pt is costed with
*          compute_mv_cost_coarse_high_speed() and, when cheaper than the last
*          stored result, replaces entry 0 of the result list of that part.
*
*  @param[in,out]  ps_result_prms : input parameters and outputs of this fxn
*              ::pi4_sad_grid : SADs laid out as [part id][active grid pt]
*              ::i4_grid_mask : bit n set means grid pt n is active
*              ::i4_step : grid step, applied to the centre mv
*              ::i1_ref_idx : reference index, selects the result list
*              ::pi4_valid_part_ids : only entry 0 is used
*              ::ps_search_node_base : centre pt candt
*              ::i4_min_cost, i4_min_id : written as described under return
*
*  @return  None. i4_min_id holds the last grid pt that was stored (PT_C if
*           none was) and i4_min_cost is written each time a pt is stored
********************************************************************************
*/
void hme_update_results_grid_pu_bestn_xtreme_speed(result_upd_prms_t *ps_result_prms)
{
    /* Note carried over from the earlier implementation: the function was  */
    /* modified with the assumption that only 2NxN_B and Nx2N_R is modified */

    search_results_t *ps_results = ps_result_prms->ps_search_results;
    const search_node_t *ps_centre = ps_result_prms->ps_search_node_base;
    const S32 *pi4_sad_base = ps_result_prms->pi4_sad_grid;
    const S32 i4_pt_mask = ps_result_prms->i4_grid_mask;
    const S32 i4_pt_step = ps_result_prms->i4_step;
    const S32 i4_num_best = (S32)ps_results->u1_num_results_per_part;
    const S32 i4_ref_id = (S32)ps_result_prms->i1_ref_idx;
    const S32 i4_pred_dir = 1 - ps_results->pu1_is_past[i4_ref_id];
    const S32 i4_part = ps_result_prms->pi4_valid_part_ids[0];
    /* Result list of the only part that is handled here */
    search_node_t *ps_best = ps_results->aps_part_results[i4_ref_id][i4_part];
    search_node_t s_candt = *ps_centre;
    S32 i4_unique_part;
    S32 i4_num_active = 0;
    S32 i4_sad_col = 0;
    S32 i4_winner = (S32)PT_C;
    S32 i4_pt;

    /* Number of active pts, this is the row pitch of the SAD grid */
    for(i4_pt = 0; i4_pt < 9; i4_pt++)
    {
        i4_num_active += ((i4_pt_mask & (1 << i4_pt)) != 0);
    }

    i4_unique_part = i4_part;

    /*************************************************************************/
    /* Walk through all the active pts of the grid                           */
    /*************************************************************************/
    for(i4_pt = 0; i4_pt < (S32)NUM_GRID_PTS; i4_pt++)
    {
        S32 i4_pt_sad, i4_pt_mv_cost, i4_pt_cost;

        if(0 == (i4_pt_mask & (1 << i4_pt)))
        {
            continue;
        }

        /* Candidate mv = centre mv displaced by the offset of this grid pt */
        s_candt.s_mv.i2_mvx = ps_centre->s_mv.i2_mvx;
        s_candt.s_mv.i2_mvx += (S16)(i4_pt_step * gai1_grid_id_to_x[i4_pt]);
        s_candt.s_mv.i2_mvy = ps_centre->s_mv.i2_mvy;
        s_candt.s_mv.i2_mvy += (S16)(i4_pt_step * gai1_grid_id_to_y[i4_pt]);

        i4_pt_mv_cost = compute_mv_cost_coarse_high_speed(
            &s_candt, &ps_results->as_pred_ctxt[i4_pred_dir], (PART_ID_T)i4_part, MV_RES_FPEL);

        /* Column advances once per active pt, row is chosen by the part id */
        i4_pt_sad = pi4_sad_base[i4_sad_col + (i4_num_active * i4_part)];
        i4_sad_col++;
        i4_pt_cost = i4_pt_sad + i4_pt_mv_cost;

        ASSERT(i4_unique_part == i4_part);
        ASSERT(i4_num_best == 1);

        /* Nothing to do unless the pt beats the last of the stored results */
        if(i4_pt_cost < ps_best[i4_num_best - 1].i4_tot_cost)
        {
            i4_winner = i4_pt;
            ps_result_prms->i4_min_cost = i4_pt_cost;

            ps_best[0] = s_candt;
            ps_best[0].i4_sad = i4_pt_sad;
            ps_best[0].i4_mv_cost = i4_pt_mv_cost;
            ps_best[0].i4_tot_cost = i4_pt_cost;
        }
    }

    ps_result_prms->i4_min_id = i4_winner;
}
1555 
/**
********************************************************************************
*  @fn     hme_update_results_grid_pu_bestn(result_upd_prms_t *ps_result_prms)
*
*  @brief  Grid result update for several parts with N best results per part.
*          Every active grid pt is costed for every valid part id and is
*          inserted into the cost ordered result list of that part.
*
*  @param[in,out]  ps_result_prms : input parameters and outputs of this fxn
*              ::pf_mv_cost_compute : mv cost function, called per pt per part
*              ::pi4_sad_grid : SADs laid out as [part id][active grid pt]
*              ::i4_grid_mask : bit n set means grid pt n is active
*              ::i4_step : grid step, applied to the centre mv
*              ::i1_ref_idx : reference index, selects the result lists
*              ::pi4_valid_part_ids : list terminated by a negative entry
*              ::ps_search_node_base : centre pt candt
*              ::i4_min_cost : lowered whenever the first valid part finds a
*                              cheaper pt
*              ::i4_min_id : grid pt which last lowered i4_min_cost (PT_C if
*                            none did)
*
*  @return  None. Results are returned through ps_search_results and the two
*           output fields mentioned above
********************************************************************************
*/
void hme_update_results_grid_pu_bestn(result_upd_prms_t *ps_result_prms)
{
    search_results_t *ps_results = ps_result_prms->ps_search_results;
    const search_node_t *ps_centre = ps_result_prms->ps_search_node_base;
    const S32 *pi4_sad_base = ps_result_prms->pi4_sad_grid;
    S32 *pi4_part_list = ps_result_prms->pi4_valid_part_ids;
    const S32 i4_pt_mask = ps_result_prms->i4_grid_mask;
    const S32 i4_pt_step = ps_result_prms->i4_step;
    const S32 i4_num_best = (S32)ps_results->u1_num_results_per_part;
    const S32 i4_ref_id = (S32)ps_result_prms->i1_ref_idx;
    const S32 i4_pred_dir = 1 - ps_results->pu1_is_past[i4_ref_id];
    /* Only the first valid part drives i4_min_cost / i4_min_id */
    const S32 i4_lead_part = pi4_part_list[0];
    search_node_t s_candt = *ps_centre;
    S32 i4_num_active = 0;
    S32 i4_sad_col = 0;
    S32 i4_winner = (S32)PT_C;
    S32 i4_pt;

    /* Number of active pts, this is the row pitch of the SAD grid */
    for(i4_pt = 0; i4_pt < 9; i4_pt++)
    {
        i4_num_active += ((i4_pt_mask & (1 << i4_pt)) != 0);
    }

    /*************************************************************************/
    /* Walk through all the active pts of the grid                           */
    /*************************************************************************/
    for(i4_pt = 0; i4_pt < (S32)NUM_GRID_PTS; i4_pt++)
    {
        S32 i4_list_pos, i4_part;

        if(0 == (i4_pt_mask & (1 << i4_pt)))
        {
            continue;
        }

        /* Candidate mv = centre mv displaced by the offset of this grid pt */
        s_candt.s_mv.i2_mvx = ps_centre->s_mv.i2_mvx;
        s_candt.s_mv.i2_mvx += (S16)(i4_pt_step * gai1_grid_id_to_x[i4_pt]);
        s_candt.s_mv.i2_mvy = ps_centre->s_mv.i2_mvy;
        s_candt.s_mv.i2_mvy += (S16)(i4_pt_step * gai1_grid_id_to_y[i4_pt]);

        /* The part id list ends at the first negative entry */
        for(i4_list_pos = 0; (i4_part = pi4_part_list[i4_list_pos]) >= 0; i4_list_pos++)
        {
            search_node_t *ps_best = ps_results->aps_part_results[i4_ref_id][i4_part];
            S32 i4_pt_sad, i4_pt_mv_cost, i4_pt_cost, i4_slot;

            i4_pt_mv_cost = ps_result_prms->pf_mv_cost_compute(
                &s_candt, &ps_results->as_pred_ctxt[i4_pred_dir], (PART_ID_T)i4_part, MV_RES_FPEL);

            /* Column is the running active pt, row is chosen by the part id */
            i4_pt_sad = pi4_sad_base[i4_sad_col + (i4_num_active * i4_part)];
            i4_pt_cost = i4_pt_sad + i4_pt_mv_cost;

            if((i4_lead_part == i4_part) && (i4_pt_cost < ps_result_prms->i4_min_cost))
            {
                i4_winner = i4_pt;
                ps_result_prms->i4_min_cost = i4_pt_cost;
            }

            /* Not better than the worst stored result, list stays as it is */
            if(i4_pt_cost >= ps_best[i4_num_best - 1].i4_tot_cost)
            {
                continue;
            }

            /* Look for the slot the candt goes into. A strictly costlier    */
            /* entry is pushed down along with everything below it. On equal */
            /* cost, a 0 from hme_cmp_nodes() stops the scan at that slot    */
            /* (presumably the entry is a duplicate of the candt) and the    */
            /* slot is rewritten in place                                    */
            i4_slot = 0;
            while(i4_slot < (i4_num_best - 1))
            {
                if(i4_pt_cost < ps_best[i4_slot].i4_tot_cost)
                {
                    memmove(
                        &ps_best[i4_slot + 1],
                        &ps_best[i4_slot],
                        sizeof(search_node_t) * (i4_num_best - 1 - i4_slot));
                    break;
                }
                if((i4_pt_cost == ps_best[i4_slot].i4_tot_cost) &&
                   (0 == hme_cmp_nodes(&s_candt, &ps_best[i4_slot])))
                {
                    break;
                }
                i4_slot++;
            }

            ps_best[i4_slot] = s_candt;
            ps_best[i4_slot].i4_sad = i4_pt_sad;
            ps_best[i4_slot].i4_mv_cost = i4_pt_mv_cost;
            ps_best[i4_slot].i4_tot_cost = i4_pt_cost;
        }

        i4_sad_col++;
    }

    ps_result_prms->i4_min_id = i4_winner;
}
1670 
1671 /**
1672 ********************************************************************************
1673 *  @fn     hme_update_results_grid_pu_bestn_no_encode(result_upd_prms_t *ps_result_prms)
1674 *
1675 *  @brief  Updates results for the case where 1 best result is to be updated
1676 *          for a given pt, for several parts
1677 *          Note : The function is replicated for CLIPing the cost to 16bit to make
1678 *                  bit match with SIMD version
1679 *
1680 *  @param[in]  result_upd_prms_t : Contains the input parameters to this fxn
1681 *
1682 *  @return   The result_upd_prms_t structure is updated for all the active
1683 *            parts in case the current candt has results for any given part
1684 *             that is the best result for that part
1685 ********************************************************************************
1686 */
hme_update_results_grid_pu_bestn_no_encode(result_upd_prms_t * ps_result_prms)1687 void hme_update_results_grid_pu_bestn_no_encode(result_upd_prms_t *ps_result_prms)
1688 {
1689     search_node_t s_search_node_grid;
1690     const search_node_t *ps_search_node_base;
1691     search_node_t *ps_search_node_grid, *ps_best_node;
1692     S32 i4_min_cost = (MAX_32BIT_VAL), i4_search_idx;
1693     S32 num_results, i4_unique_id = -1, i4_grid_pt;
1694     search_results_t *ps_search_results;
1695     S32 *pi4_valid_part_ids;
1696     S32 i4_step = ps_result_prms->i4_step;
1697     S32 i4_grid_mask, i4_count, i, i4_min_id;
1698     S32 i4_tot_cost, i4_mv_cost, i4_sad, id;
1699     S32 *pi4_sad_grid = ps_result_prms->pi4_sad_grid;
1700     S32 grid_count = 0;
1701     S32 pred_lx;
1702 
1703     i4_min_id = (S32)PT_C;
1704     i4_min_cost = MAX_32BIT_VAL;
1705     ps_search_node_grid = &s_search_node_grid;
1706     ps_search_node_base = ps_result_prms->ps_search_node_base;
1707     *ps_search_node_grid = *ps_search_node_base;
1708     pi4_valid_part_ids = ps_result_prms->pi4_valid_part_ids;
1709     ps_search_results = ps_result_prms->ps_search_results;
1710     num_results = (S32)ps_search_results->u1_num_results_per_part;
1711     i4_grid_mask = ps_result_prms->i4_grid_mask;
1712 
1713     for(i = 0; i < 9; i++)
1714     {
1715         if(i4_grid_mask & (1 << i))
1716             grid_count++;
1717     }
1718 
1719     /* Some basic assumptions: only single pt, only part updates */
1720     /* and more than 1 best result to be computed.               */
1721 
1722     i4_search_idx = (S32)ps_result_prms->i1_ref_idx;
1723     pred_lx = 1 - ps_search_results->pu1_is_past[i4_search_idx];
1724 
1725     /*************************************************************************/
1726     /* Supposing we do hte result update for a unique partid, we can */
1727     /* store the best pt id in the grid and also min cost is return */
1728     /* param. This will be useful for early exit cases.             */
1729     /* TODO : once we have separate fxn for unique part+grid, we can */
1730     /* do away with this code here                                   */
1731     /*************************************************************************/
1732     //if (pi4_valid_part_ids[1] == -1)
1733     i4_unique_id = pi4_valid_part_ids[0];
1734 
1735     /*************************************************************************/
1736     /* Outer loop runs through all active pts in the grid                    */
1737     /*************************************************************************/
1738     for(i4_grid_pt = 0; i4_grid_pt < (S32)NUM_GRID_PTS; i4_grid_pt++)
1739     {
1740         if(!(i4_grid_mask & (1 << i4_grid_pt)))
1741             continue;
1742 
1743         /* For the pt in the grid, update mvx and y depending on */
1744         /* location of pt. Updates are in FPEL units.            */
1745         ps_search_node_grid->s_mv.i2_mvx = ps_search_node_base->s_mv.i2_mvx;
1746         ps_search_node_grid->s_mv.i2_mvy = ps_search_node_base->s_mv.i2_mvy;
1747         ps_search_node_grid->s_mv.i2_mvx += (S16)(i4_step * gai1_grid_id_to_x[i4_grid_pt]);
1748         ps_search_node_grid->s_mv.i2_mvy += (S16)(i4_step * gai1_grid_id_to_y[i4_grid_pt]);
1749 
1750         i4_count = 0;
1751 
1752         /* pi4_valid_part_ids contains all the valid ids. We loop through */
1753         /* this till we encounter -1. This is easier than having to       */
1754         /* figure out part by part, besides, active part decision is      */
1755         /* usually fixed for a given duration of search, e.g. entire fpel */
1756         /* refinement for a blk/cu will use fixed valid part mask         */
1757 
1758         while((id = pi4_valid_part_ids[i4_count]) >= 0)
1759         {
1760             //ps_search_node_grid->e_part_type = (PART_TYPE_T)id;
1761 
1762             /*****************************************************************/
1763             /* points to the best search results corresponding to this       */
1764             /* specific part type.                                           */
1765             /*****************************************************************/
1766             ps_best_node = ps_search_results->aps_part_results[i4_search_idx][id];
1767 
1768             /* evaluate mv cost and totalcost for this part for this given mv*/
1769             i4_mv_cost = ps_result_prms->pf_mv_cost_compute(
1770                 ps_search_node_grid,
1771                 &ps_search_results->as_pred_ctxt[pred_lx],
1772                 (PART_ID_T)id,
1773                 MV_RES_FPEL);
1774 
1775             i4_sad = pi4_sad_grid[grid_count * id];
1776 
1777             /* Clipping to 16 bit to bit match with SIMD version */
1778             i4_mv_cost = CLIP_S16(i4_mv_cost);
1779             i4_sad = CLIP_S16(i4_sad);
1780 
1781             i4_tot_cost = i4_sad + i4_mv_cost;
1782             /* Clipping to 16 bit to bit match with SIMD version */
1783             i4_tot_cost = CLIP_S16(i4_tot_cost);
1784 
1785             if(i4_unique_id == id)
1786             {
1787                 if(i4_tot_cost < ps_result_prms->i4_min_cost)
1788                 {
1789                     i4_min_id = i4_grid_pt;
1790                     ps_result_prms->i4_min_cost = i4_tot_cost;
1791                 }
1792             }
1793 
1794             /*****************************************************************/
1795             /* We do not labor through the results if the total cost worse   */
1796             /* than the last of the results.                                 */
1797             /*****************************************************************/
1798             if(i4_tot_cost < ps_best_node[num_results - 1].i4_tot_cost)
1799             {
1800                 /*************************************************************/
1801                 /* Identify where the current result isto be placed.Basically*/
1802                 /* find the node which has cost just higher thannodeundertest*/
1803                 /*************************************************************/
1804                 for(i = 0; i < num_results - 1; i++)
1805                 {
1806                     if(i4_tot_cost <= ps_best_node[i].i4_tot_cost)
1807                     {
1808                         memmove(
1809                             ps_best_node + i + 1,
1810                             ps_best_node + i,
1811                             sizeof(search_node_t) * (num_results - 1 - i));
1812                         break;
1813                     }
1814                 }
1815                 ps_best_node[i] = *ps_search_node_grid;
1816                 ps_best_node[i].i4_sad = i4_sad;
1817                 ps_best_node[i].i4_mv_cost = i4_mv_cost;
1818                 ps_best_node[i].i4_tot_cost = i4_tot_cost;
1819             }
1820             i4_count++;
1821         }
1822         pi4_sad_grid++;
1823     }
1824     ps_result_prms->i4_min_id = i4_min_id;
1825 }
1826 
1827 /**
1828 ********************************************************************************
1829 *  @fn     hme_update_results_pt_npu_best1(result_upd_prms_t *ps_result_prms)
1830 *
1831 *  @brief  Updates results for the case where 1 best result is to be updated
1832 *          for a given pt, for several parts
1833 *
1834 *  @param[in]  ps_result_prms. Contains the input parameters to this fxn
1835 *              ::ps_pred_info : contains cost fxn ptr and predictor info
1836 *              ::pi4_sad : 17x9 SAD Grid, this case, only 1st 17 entries valid
1837 *              ::ps_search_results: Search results structure
1838 *              ::i1_ref_id : Reference index
1839 *              ::i4_grid_mask: Dont Care for this fxn
1840 *              ::pi4_valid_part_ids : valid part ids
1841 *              ::ps_search_node_base: Contains the centre pt candt info.
1842 *
1843 *  @return   The ps_search_results structure is updated for all the active
1844 *            parts in case the current candt has results for any given part
1845 *             that is the best result for that part
1846 ********************************************************************************
1847 */
1848 
hme_update_results_pt_pu_best1_subpel_hs(err_prms_t * ps_err_prms,result_upd_prms_t * ps_result_prms)1849 void hme_update_results_pt_pu_best1_subpel_hs(
1850     err_prms_t *ps_err_prms, result_upd_prms_t *ps_result_prms)
1851 {
1852     search_node_t *ps_search_node_base, *ps_best_node;
1853     search_results_t *ps_search_results;
1854     S32 id, i4_search_idx = ps_result_prms->u1_pred_lx;
1855     S32 i4_count = 0, i4_sad, i4_mv_cost, i4_tot_cost;
1856     S32 num_results, i;
1857     S32 *pi4_valid_part_ids;
1858 
1859     pi4_valid_part_ids = ps_result_prms->pi4_valid_part_ids;
1860     /* Some basic assumptions: only single pt, only part updates */
1861     /* and more than 1 best result to be computed.               */
1862     ASSERT(ps_result_prms->i4_grid_mask == 1);
1863 
1864     ps_search_results = ps_result_prms->ps_search_results;
1865     num_results = (S32)ps_search_results->u1_num_results_per_part;
1866 
1867     /* Compute mv cost, total cost */
1868     ps_search_node_base = (search_node_t *)ps_result_prms->ps_search_node_base;
1869 
1870     while((id = pi4_valid_part_ids[i4_count]) >= 0)
1871     {
1872         S32 update_required = 1;
1873 
1874         ps_best_node = ps_search_results->aps_part_results[i4_search_idx][id];
1875         /* Use a pre-computed cost instead of freshly evaluating subpel cost */
1876         i4_mv_cost = ps_best_node->i4_mv_cost;
1877         i4_sad = ps_result_prms->pi4_sad_grid[id];
1878         i4_tot_cost = i4_sad + i4_mv_cost;
1879 
1880         /* We do not labor through the results if the total cost is worse than   */
1881         /* the last of the results.                                              */
1882         if(i4_tot_cost < ps_best_node[num_results - 1].i4_tot_cost)
1883         {
1884             /* Identify where the current result is to be placed. Basically find  */
1885             /* the node which has cost just higher than node under test           */
1886             for(i = 0; i < num_results - 1; i++)
1887             {
1888                 if(ps_best_node[i].i1_ref_idx != -1)
1889                 {
1890                     if(i4_tot_cost < ps_best_node[i].i4_tot_cost)
1891                     {
1892                         memmove(
1893                             ps_best_node + i + 1,
1894                             ps_best_node + i,
1895                             sizeof(search_node_t) * (num_results - 1 - i));
1896                         break;
1897                     }
1898                     else if(i4_tot_cost == ps_best_node[i].i4_tot_cost)
1899                     {
1900                         update_required = 0;
1901                         break;
1902                     }
1903                 }
1904                 else
1905                 {
1906                     break;
1907                 }
1908             }
1909 
1910             if(update_required)
1911             {
1912                 /* Update when either ref_idx or mv's are different */
1913                 ps_best_node[i] = *ps_search_node_base;
1914                 ps_best_node[i].i4_sad = i4_sad;
1915                 ps_best_node[i].i4_mv_cost = i4_mv_cost;
1916                 ps_best_node[i].i4_tot_cost = i4_tot_cost;
1917             }
1918         }
1919         i4_count++;
1920     }
1921 }
1922 
/**
********************************************************************************
*  @fn     hme_update_results_pt_pu_best1_subpel_hs_1(err_prms_t *ps_err_prms,
*                                                     result_upd_prms_t *ps_result_prms)
*
*  @brief  Two best result update for a single pt, for several parts. Costs
*          are compared against the first two entries of the result list of
*          each part, whereas the update itself is written to the subpel
*          refine ctxt arrays, at the position of the part in the valid part
*          id list.
*
*  @param[in]  ps_err_prms : not referenced by this fxn
*
*  @param[in,out]  ps_result_prms. Contains the input parameters to this fxn
*              ::pi4_sad_grid : SADs, indexed directly by part id
*              ::ps_search_results: Search results structure (read only here)
*              ::u1_pred_lx : first index into aps_part_results
*              ::i4_grid_mask: asserted to be 1
*              ::pi4_valid_part_ids : valid part ids, list terminated by a
*                                     negative entry
*              ::i2_mv_x, i2_mv_y, i1_ref_idx : candt mv and ref idx
*              ::ps_subpel_refine_ctxt : receives the updated results
*
*  @return   None
********************************************************************************
*/
void hme_update_results_pt_pu_best1_subpel_hs_1(
    err_prms_t *ps_err_prms, result_upd_prms_t *ps_result_prms)
{
    search_results_t *ps_results;
    S32 *pi4_part_list = ps_result_prms->pi4_valid_part_ids;
    S32 i4_list_sel = ps_result_prms->u1_pred_lx;
    S32 i4_list_pos, i4_part;

    /* This variant handles a single pt only */
    ASSERT(ps_result_prms->i4_grid_mask == 1);

    ps_results = ps_result_prms->ps_search_results;

    /* The part id list ends at the first negative entry */
    for(i4_list_pos = 0; (i4_part = pi4_part_list[i4_list_pos]) >= 0; i4_list_pos++)
    {
        search_node_t *ps_best = ps_results->aps_part_results[i4_list_sel][i4_part];
        /* mv cost is taken over from the first stored result of the part */
        S32 i4_pt_mv_cost = ps_best->i4_mv_cost;
        S32 i4_pt_sad = ps_result_prms->pi4_sad_grid[i4_part];
        S32 i4_pt_cost = i4_pt_sad + i4_pt_mv_cost;

        /* Candt has to beat the second result to be of any interest */
        if(i4_pt_cost >= ps_best[1].i4_tot_cost)
        {
            continue;
        }

        if(i4_pt_cost < ps_best[0].i4_tot_cost)
        {
            /* Candt is the new best: the present best moves to the second */
            /* position and the candt takes the first                      */
            subpel_refine_ctxt_t *ps_refine = ps_result_prms->ps_subpel_refine_ctxt;

            ps_refine->i2_tot_cost[1][i4_list_pos] = ps_refine->i2_tot_cost[0][i4_list_pos];
            ps_refine->i2_mv_cost[1][i4_list_pos] = ps_refine->i2_mv_cost[0][i4_list_pos];
            ps_refine->i2_mv_x[1][i4_list_pos] = ps_refine->i2_mv_x[0][i4_list_pos];
            ps_refine->i2_mv_y[1][i4_list_pos] = ps_refine->i2_mv_y[0][i4_list_pos];
            ps_refine->i2_ref_idx[1][i4_list_pos] = ps_refine->i2_ref_idx[0][i4_list_pos];

            ps_refine->i2_tot_cost[0][i4_list_pos] = i4_pt_cost;
            ps_refine->i2_mv_cost[0][i4_list_pos] = i4_pt_mv_cost;
            ps_refine->i2_mv_x[0][i4_list_pos] = ps_result_prms->i2_mv_x;
            ps_refine->i2_mv_y[0][i4_list_pos] = ps_result_prms->i2_mv_y;
            ps_refine->i2_ref_idx[0][i4_list_pos] = ps_result_prms->i1_ref_idx;
        }
        else if(
            (ps_result_prms->i2_mv_x != ps_best[0].s_mv.i2_mvx) ||
            (ps_result_prms->i2_mv_y != ps_best[0].s_mv.i2_mvy) ||
            (ps_best[0].i1_ref_idx != ps_result_prms->i1_ref_idx))
        {
            /* Candt is only second best and is not the same mv / ref idx */
            /* as the best one: it takes the second position              */
            subpel_refine_ctxt_t *ps_refine = ps_result_prms->ps_subpel_refine_ctxt;

            ps_refine->i2_tot_cost[1][i4_list_pos] = i4_pt_cost;
            ps_refine->i2_mv_cost[1][i4_list_pos] = i4_pt_mv_cost;
            ps_refine->i2_mv_x[1][i4_list_pos] = ps_result_prms->i2_mv_x;
            ps_refine->i2_mv_y[1][i4_list_pos] = ps_result_prms->i2_mv_y;
            ps_refine->i2_ref_idx[1][i4_list_pos] = ps_result_prms->i1_ref_idx;
        }
    }
}
2012 
/**
******************************************************************************
*  @brief Gives a result fxn ptr for a index [x] where x is as:
*         0 : single pt, no partial updates, 1 best result
*         1 : single pt, no partial updates, N best results
*         2 : single pt,    partial updates, 1 best result
*         3 : single pt,    partial updates, N best results
*         4 : grid     , no partial updates, 1 best result
*         5 : grid     , no partial updates, N best results
*         6 : grid     ,    partial updates, 1 best result
*         7 : grid     ,    partial updates, N best results
*
*         The index is formed in hme_get_result_fxn() as
*         (is_grid << 2) + (is_partial_update << 1) + (num_results > 1)
******************************************************************************
*/

static PF_RESULT_FXN_T g_pf_result_fxn[8] = { UPD_RES_PT_NPU_BEST1,   UPD_RES_PT_NPU_BESTN,
                                              UPD_RES_PT_PU_BEST1,    UPD_RES_PT_PU_BESTN,
                                              UPD_RES_GRID_NPU_BEST1, UPD_RES_GRID_NPU_BESTN,
                                              UPD_RES_GRID_PU_BEST1,  UPD_RES_GRID_PU_BESTN };
2031 
2032 /**
2033 ********************************************************************************
2034 *  @fn     hme_get_result_fxn(i4_grid_mask, i4_part_mask, i4_num_results)
2035 *
2036 *  @brief  Obtains the suitable result function that evaluates COST and also
2037 *           computes one or more best results for point/grid, single part or
2038 *           more than one part.
2039 *
2040 *  @param[in]  i4_grid_mask : Mask containing which of 9 grid pts active
2041 *
2042 *  @param[in]  i4_part_mask : Mask containing which of the 17 parts active
2043 *
2044 *  @param[in]  i4_num_results: Number of active results
2045 *
2046 *  @return   Pointer to the appropriate result update function
2047 ********************************************************************************
2048 */
hme_get_result_fxn(S32 i4_grid_mask,S32 i4_part_mask,S32 i4_num_results)2049 PF_RESULT_FXN_T hme_get_result_fxn(S32 i4_grid_mask, S32 i4_part_mask, S32 i4_num_results)
2050 {
2051     S32 i4_is_grid = (i4_grid_mask != 1);
2052     S32 i4_is_pu = ((i4_part_mask & (i4_part_mask - 1)) != 0);
2053     S32 i4_res_gt1 = (i4_num_results > 1);
2054     S32 id;
2055 
2056     id = (i4_is_grid << 2) + (i4_is_pu << 1) + i4_res_gt1;
2057 
2058     return (g_pf_result_fxn[id]);
2059 }
2060 
/**
********************************************************************************
*  @fn     hme_calc_sad_and_2_best_results
*
*  @brief  Walks the fullpel search candidates. For each candidate it computes
*          the SAD of every 4x4 sub-block (NUM_4X4 blocks, four per row),
*          combines them into the per-partition SAD grid, adds the MV cost and
*          keeps the best and second best result of every valid partition in
*          the fullpel refine context.
*
*  @param[in]     ps_search_prms : candidate list (ps_search_nodes,
*                                  i4_num_search_nodes), CU / search offsets
*                                  and the fullpel refine context to update
*  @param[in]     ps_wt_inp_prms : per-reference input planes (apu1_wt_inp)
*  @param[in,out] ps_err_prms    : strides and SAD grid buffer; pu1_inp and
*                                  pu1_ref are overwritten for every candidate
*  @param[in]     ps_result_prms : i1_ref_idx selects the prediction context
*                                  inside ps_search_results
*  @param[in]     ppu1_ref       : reference planes, indexed by the
*                                  candidate's i1_ref_idx
*  @param[in]     i4_ref_stride  : stride used to locate the candidate inside
*                                  the reference plane
*
*  @return None
********************************************************************************
*/
void hme_calc_sad_and_2_best_results(
    hme_search_prms_t *ps_search_prms,
    wgt_pred_ctxt_t *ps_wt_inp_prms,
    err_prms_t *ps_err_prms,
    result_upd_prms_t *ps_result_prms,
    U08 **ppu1_ref,
    S32 i4_ref_stride)
{
    S32 i4_candt;
    S32 i4_inp_off;
    S32 i4_ref_offset;
    S32 i4_num_nodes;

    S32 *pi4_sad_grid = ps_err_prms->pi4_sad_grid;
    S32 cur_buf_stride = ps_err_prms->i4_inp_stride;
    WORD32 ref_buf_stride = ps_err_prms->i4_ref_stride;
    /* Stride of one row of 4x4 blocks (4 pixel rows) */
    WORD32 cur_buf_stride_ls2 = (cur_buf_stride << 2);
    WORD32 ref_buf_stride_ls2 = (ref_buf_stride << 2);

    mv_refine_ctxt_t *ps_mv_refine_ctxt;
    search_node_t *ps_search_node;

    ps_mv_refine_ctxt = ps_search_prms->ps_fullpel_refine_ctxt;
    i4_num_nodes = ps_search_prms->i4_num_search_nodes;
    i4_inp_off = ps_search_prms->i4_cu_x_off;
    i4_inp_off += ps_search_prms->i4_cu_y_off * cur_buf_stride;
    i4_ref_offset = (i4_ref_stride * ps_search_prms->i4_y_off) + ps_search_prms->i4_x_off;
    ps_search_node = ps_search_prms->ps_search_nodes;

    /* The node pointer is advanced in the loop header so that the `continue` */
    /* taken for intra candidates moves on to the next candidate instead of   */
    /* re-testing the same node for all remaining iterations.                 */
    for(i4_candt = 0; i4_candt < i4_num_nodes; i4_candt++, ps_search_node++)
    {
        /**********************************************************************/
        /* Compute the 4x4 SADs and derive the partition level SAD grid       */
        /**********************************************************************/
        {
            WORD32 b, c, d;
            UWORD8 *pu1_cur_ptr;
            UWORD8 *pu1_ref_ptr;
            UWORD16 au2_4x4_sad[NUM_4X4];

            /* Candidates marked as intra carry no usable MV: skip them */
            if(ps_search_node->s_mv.i2_mvx == INTRA_MV)
            {
                continue;
            }

            /* Input and reference pointers for this candidate; the reference */
            /* is displaced by the candidate MV                               */
            ps_err_prms->pu1_inp =
                ps_wt_inp_prms->apu1_wt_inp[ps_search_node->i1_ref_idx] + i4_inp_off;
            ps_err_prms->pu1_ref = ppu1_ref[ps_search_node->i1_ref_idx] + i4_ref_offset;
            ps_err_prms->pu1_ref += ps_search_node->s_mv.i2_mvx;
            ps_err_prms->pu1_ref += (ps_search_node->s_mv.i2_mvy * i4_ref_stride);

            pu1_cur_ptr = ps_err_prms->pu1_inp;
            pu1_ref_ptr = &ps_err_prms->pu1_ref[0];

            /* Loop to compute the SAD's */
            {
                memset(&au2_4x4_sad[0], 0, NUM_4X4 * sizeof(UWORD16));
                for(b = 0; b < NUM_4X4; b++)
                {
                    /* Offset of 4x4 block b: column (b % 4), block row (b >> 2) */
                    WORD32 t1 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * cur_buf_stride_ls2;
                    WORD32 t2 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * ref_buf_stride_ls2;

                    for(c = 0; c < NUM_ROWS_IN_4X4; c++)
                    {
                        WORD32 z_cur = (cur_buf_stride)*c + t1;
                        WORD32 z_ref = (ref_buf_stride)*c + t2;
                        for(d = 0; d < NUM_PIXELS_IN_ROW; d++)
                        {
                            au2_4x4_sad[b] += (UWORD16)ABS((
                                ((S32)pu1_ref_ptr[(z_ref + d)]) - ((S32)pu1_cur_ptr[(z_cur + d)])));
                        }
                    }
                }

                /* Quadrants */
                pi4_sad_grid[PART_ID_NxN_TL] =
                    (au2_4x4_sad[0] + au2_4x4_sad[1] + au2_4x4_sad[4] + au2_4x4_sad[5]);
                pi4_sad_grid[PART_ID_NxN_TR] =
                    (au2_4x4_sad[2] + au2_4x4_sad[3] + au2_4x4_sad[6] + au2_4x4_sad[7]);
                pi4_sad_grid[PART_ID_NxN_BL] =
                    (au2_4x4_sad[8] + au2_4x4_sad[9] + au2_4x4_sad[12] + au2_4x4_sad[13]);
                pi4_sad_grid[PART_ID_NxN_BR] =
                    (au2_4x4_sad[10] + au2_4x4_sad[11] + au2_4x4_sad[14] + au2_4x4_sad[15]);
                /* Halves, built from the quadrants */
                pi4_sad_grid[PART_ID_Nx2N_L] =
                    pi4_sad_grid[PART_ID_NxN_TL] + pi4_sad_grid[PART_ID_NxN_BL];
                pi4_sad_grid[PART_ID_Nx2N_R] =
                    pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_BR];
                pi4_sad_grid[PART_ID_2NxN_T] =
                    pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_TL];
                pi4_sad_grid[PART_ID_2NxN_B] =
                    pi4_sad_grid[PART_ID_NxN_BR] + pi4_sad_grid[PART_ID_NxN_BL];
                /* One-block-wide strips: left / right column, top / bottom row */
                pi4_sad_grid[PART_ID_nLx2N_L] =
                    (au2_4x4_sad[8] + au2_4x4_sad[0] + au2_4x4_sad[12] + au2_4x4_sad[4]);
                pi4_sad_grid[PART_ID_nRx2N_R] =
                    (au2_4x4_sad[3] + au2_4x4_sad[7] + au2_4x4_sad[15] + au2_4x4_sad[11]);
                pi4_sad_grid[PART_ID_2NxnU_T] =
                    (au2_4x4_sad[1] + au2_4x4_sad[0] + au2_4x4_sad[2] + au2_4x4_sad[3]);
                pi4_sad_grid[PART_ID_2NxnD_B] =
                    (au2_4x4_sad[15] + au2_4x4_sad[14] + au2_4x4_sad[12] + au2_4x4_sad[13]);
                /* Whole block, then the complements of the strips */
                pi4_sad_grid[PART_ID_2Nx2N] =
                    pi4_sad_grid[PART_ID_2NxN_T] + pi4_sad_grid[PART_ID_2NxN_B];
                pi4_sad_grid[PART_ID_2NxnU_B] =
                    pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnU_T];
                pi4_sad_grid[PART_ID_2NxnD_T] =
                    pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnD_B];
                pi4_sad_grid[PART_ID_nRx2N_L] =
                    pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nRx2N_R];
                pi4_sad_grid[PART_ID_nLx2N_R] =
                    pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nLx2N_L];
            }
        }

        /**********************************************************************/
        /* Compute the MV cost and update the best / second best results      */
        /**********************************************************************/
        {
            S32 i4_count = 0, i4_sad, i4_mv_cost, i4_tot_cost;
            S32 *pi4_valid_part_ids = &ps_mv_refine_ctxt->ai4_part_id[0];
            S32 best_node_cost;
            S32 second_best_node_cost;

            /* MV cost of the candidate w.r.t. the 2Nx2N MV predictor. It is  */
            /* computed once per candidate and shared by all partitions.      */
            {
                S16 mvdx1, mvdy1;
                S32 i4_search_idx = (S32)ps_result_prms->i1_ref_idx;
                search_results_t *ps_search_results = ps_result_prms->ps_search_results;
                S32 pred_lx = i4_search_idx;

                pred_ctxt_t *ps_pred_ctxt = &ps_search_results->as_pred_ctxt[pred_lx];
                pred_candt_nodes_t *ps_pred_nodes = &ps_pred_ctxt->as_pred_nodes[PART_2Nx2N];
                search_node_t *ps_pred_node_a = ps_pred_nodes->ps_mvp_node;

                S32 inp_shift = 2;
                S32 pred_shift = ps_pred_node_a->u1_subpel_done ? 0 : 2;
                S32 lambda_q_shift = ps_pred_ctxt->lambda_q_shift;
                S32 lambda = ps_pred_ctxt->lambda;
                S32 rnd = 1 << (lambda_q_shift - 1);
                S32 mv_p_x = ps_pred_node_a->s_mv.i2_mvx;
                S32 mv_p_y = ps_pred_node_a->s_mv.i2_mvy;
                S32 ref_bits =
                    ps_pred_ctxt
                        ->ppu1_ref_bits_tlu[ps_pred_ctxt->pred_lx][ps_search_node->i1_ref_idx];

                /* Fills mvdx1 / mvdy1 from the candidate and predictor MVs   */
                /* (macro defined outside this function)                      */
                COMPUTE_DIFF_MV(
                    mvdx1, mvdy1, ps_search_node, mv_p_x, mv_p_y, inp_shift, pred_shift);

                mvdx1 = ABS(mvdx1);
                mvdy1 = ABS(mvdy1);

                i4_mv_cost = hme_get_range(mvdx1) + hme_get_range(mvdy1) + (mvdx1 > 0) +
                             (mvdy1 > 0) + ref_bits + 2;

                /* Scale by lambda with rounding */
                i4_mv_cost *= lambda;
                i4_mv_cost += rnd;
                i4_mv_cost >>= lambda_q_shift;

                i4_mv_cost = CLIP_U16(i4_mv_cost);
            }

            /* For each valid partition, update the refine context to reflect */
            /* the best and second best candidates for that partition         */
            for(i4_count = 0; i4_count < ps_mv_refine_ctxt->i4_num_valid_parts; i4_count++)
            {
                S32 update_required = 0;
                S32 part_id = pi4_valid_part_ids[i4_count];
                /* Results are stored compacted (by position in the valid     */
                /* list) when at most 8 partitions are valid, else by part id */
                S32 index = (ps_mv_refine_ctxt->i4_num_valid_parts > 8) ? part_id : i4_count;

                /* Calculate total cost */
                i4_sad = CLIP3(pi4_sad_grid[part_id], 0, 0x7fff);
                i4_tot_cost = CLIP_S16(i4_sad + i4_mv_cost);

                /*****************************************************************/
                /* We do not labor through the results if the total cost is      */
                /* worse than the last of the results.                           */
                /*****************************************************************/
                best_node_cost = CLIP_S16(ps_mv_refine_ctxt->i2_tot_cost[0][index]);
                second_best_node_cost = CLIP_S16(ps_mv_refine_ctxt->i2_tot_cost[1][index]);

                if(i4_tot_cost < second_best_node_cost)
                {
                    update_required = 2;

                    /*************************************************************/
                    /* Identify where the current result is to be placed:        */
                    /* 1 - replaces the best, 2 - replaces the second best,      */
                    /* 0 - ties with the best and is dropped.                    */
                    /*************************************************************/
                    if(i4_tot_cost < best_node_cost)
                    {
                        update_required = 1;
                    }
                    else if(i4_tot_cost == best_node_cost)
                    {
                        update_required = 0;
                    }

                    if(update_required == 2)
                    {
                        ps_mv_refine_ctxt->i2_tot_cost[1][index] = i4_tot_cost;
                        ps_mv_refine_ctxt->i2_mv_cost[1][index] = i4_mv_cost;
                        ps_mv_refine_ctxt->i2_mv_x[1][index] = ps_search_node->s_mv.i2_mvx;
                        ps_mv_refine_ctxt->i2_mv_y[1][index] = ps_search_node->s_mv.i2_mvy;
                        ps_mv_refine_ctxt->i2_ref_idx[1][index] = ps_search_node->i1_ref_idx;
                    }
                    else if(update_required == 1)
                    {
                        /* Previous best becomes the second best */
                        ps_mv_refine_ctxt->i2_tot_cost[1][index] =
                            ps_mv_refine_ctxt->i2_tot_cost[0][index];
                        ps_mv_refine_ctxt->i2_mv_cost[1][index] =
                            ps_mv_refine_ctxt->i2_mv_cost[0][index];
                        ps_mv_refine_ctxt->i2_mv_x[1][index] = ps_mv_refine_ctxt->i2_mv_x[0][index];
                        ps_mv_refine_ctxt->i2_mv_y[1][index] = ps_mv_refine_ctxt->i2_mv_y[0][index];
                        ps_mv_refine_ctxt->i2_ref_idx[1][index] =
                            ps_mv_refine_ctxt->i2_ref_idx[0][index];

                        ps_mv_refine_ctxt->i2_tot_cost[0][index] = i4_tot_cost;
                        ps_mv_refine_ctxt->i2_mv_cost[0][index] = i4_mv_cost;
                        ps_mv_refine_ctxt->i2_mv_x[0][index] = ps_search_node->s_mv.i2_mvx;
                        ps_mv_refine_ctxt->i2_mv_y[0][index] = ps_search_node->s_mv.i2_mvy;
                        ps_mv_refine_ctxt->i2_ref_idx[0][index] = ps_search_node->i1_ref_idx;
                    }
                }
            }
        }
    }

    /* Entries still at the maximum cost get the ref idx of the first search  */
    /* node.                                                                  */
    /* NOTE(review): this block indexes the result arrays by part_id, whereas */
    /* the update loop above uses the compacted index when at most 8          */
    /* partitions are valid - confirm which indexing is intended here.        */
    {
        WORD32 i4_i;
        WORD32 part_id;
        search_node_t *ps_first_node = ps_search_prms->ps_search_nodes;
        for(i4_i = 0; i4_i < ps_mv_refine_ctxt->i4_num_valid_parts; i4_i++)
        {
            part_id = ps_mv_refine_ctxt->ai4_part_id[i4_i];
            if(ps_mv_refine_ctxt->i2_tot_cost[0][part_id] >= MAX_SIGNED_16BIT_VAL)
            {
                ASSERT(ps_mv_refine_ctxt->i2_mv_cost[0][part_id] == MAX_SIGNED_16BIT_VAL);
                ASSERT(ps_mv_refine_ctxt->i2_mv_x[0][part_id] == 0);
                ASSERT(ps_mv_refine_ctxt->i2_mv_y[0][part_id] == 0);

                ps_mv_refine_ctxt->i2_ref_idx[0][part_id] = ps_first_node->i1_ref_idx;
            }
            if(ps_mv_refine_ctxt->i2_tot_cost[1][part_id] >= MAX_SIGNED_16BIT_VAL)
            {
                ASSERT(ps_mv_refine_ctxt->i2_mv_cost[1][part_id] == MAX_SIGNED_16BIT_VAL);
                ASSERT(ps_mv_refine_ctxt->i2_mv_x[1][part_id] == 0);
                ASSERT(ps_mv_refine_ctxt->i2_mv_y[1][part_id] == 0);

                ps_mv_refine_ctxt->i2_ref_idx[1][part_id] = ps_first_node->i1_ref_idx;
            }
        }
    }
}
2309 
/**
********************************************************************************
*  @fn     hme_calc_sad_and_2_best_results_subpel
*
*  @brief  Evaluates the single candidate described by ps_err_prms (pu1_inp vs
*          pu1_ref): computes the 4x4 SADs, derives the per-partition SAD grid
*          and updates the best / second best result of every valid partition
*          in the subpel refine context.
*
*  @param[in,out] ps_err_prms    : input / reference pointers and strides; the
*                                  buffer at pi4_sad_grid is filled
*  @param[in]     ps_result_prms : subpel refine context to update, plus the
*                                  MV (i2_mv_x, i2_mv_y) and i1_ref_idx that
*                                  are recorded on an update
*
*  @return None
********************************************************************************
*/
void hme_calc_sad_and_2_best_results_subpel(
    err_prms_t *ps_err_prms, result_upd_prms_t *ps_result_prms)
{
    mv_refine_ctxt_t *ps_ctxt = ps_result_prms->ps_subpel_refine_ctxt;
    S32 *pi4_grid = ps_err_prms->pi4_sad_grid;
    S32 *pi4_part_ids = &ps_ctxt->ai4_part_id[0];

    S32 i4_inp_strd = ps_err_prms->i4_inp_stride;
    WORD32 i4_ref_strd = ps_err_prms->i4_ref_stride;
    /* Distance between vertically adjacent 4x4 blocks */
    WORD32 i4_inp_strd_x4 = (i4_inp_strd << 2);
    WORD32 i4_ref_strd_x4 = (i4_ref_strd << 2);

    UWORD8 *pu1_inp = ps_err_prms->pu1_inp;
    UWORD8 *pu1_ref = &ps_err_prms->pu1_ref[0];

    UWORD16 au2_blk_sad[NUM_4X4];
    WORD32 i4_blk;
    S32 i4_part;
    WORD32 i4_rank;

    /**************************************************************************/
    /* SAD of every 4x4 block; blocks are laid out four per row               */
    /**************************************************************************/
    for(i4_blk = 0; i4_blk < NUM_4X4; i4_blk++)
    {
        WORD32 i4_col_off = (i4_blk % 4) * NUM_PIXELS_IN_ROW;
        WORD32 i4_inp_base = i4_col_off + (i4_blk >> 2) * i4_inp_strd_x4;
        WORD32 i4_ref_base = i4_col_off + (i4_blk >> 2) * i4_ref_strd_x4;
        UWORD32 u4_acc = 0;
        WORD32 i4_row, i4_col;

        for(i4_row = 0; i4_row < NUM_ROWS_IN_4X4; i4_row++)
        {
            WORD32 i4_inp_row = i4_inp_strd * i4_row + i4_inp_base;
            WORD32 i4_ref_row = i4_ref_strd * i4_row + i4_ref_base;

            for(i4_col = 0; i4_col < NUM_PIXELS_IN_ROW; i4_col++)
            {
                u4_acc += (UWORD16)ABS(
                    (((S32)pu1_ref[i4_ref_row + i4_col]) - ((S32)pu1_inp[i4_inp_row + i4_col])));
            }
        }

        au2_blk_sad[i4_blk] = (UWORD16)u4_acc;
    }

    /**************************************************************************/
    /* Partition level SADs derived from the 4x4 SADs                         */
    /**************************************************************************/
    {
        /* Quadrants */
        S32 i4_tl = au2_blk_sad[0] + au2_blk_sad[1] + au2_blk_sad[4] + au2_blk_sad[5];
        S32 i4_tr = au2_blk_sad[2] + au2_blk_sad[3] + au2_blk_sad[6] + au2_blk_sad[7];
        S32 i4_bl = au2_blk_sad[8] + au2_blk_sad[9] + au2_blk_sad[12] + au2_blk_sad[13];
        S32 i4_br = au2_blk_sad[10] + au2_blk_sad[11] + au2_blk_sad[14] + au2_blk_sad[15];

        /* One-block-wide strips along each edge */
        S32 i4_left_col = au2_blk_sad[8] + au2_blk_sad[0] + au2_blk_sad[12] + au2_blk_sad[4];
        S32 i4_right_col = au2_blk_sad[3] + au2_blk_sad[7] + au2_blk_sad[15] + au2_blk_sad[11];
        S32 i4_top_row = au2_blk_sad[1] + au2_blk_sad[0] + au2_blk_sad[2] + au2_blk_sad[3];
        S32 i4_bot_row = au2_blk_sad[15] + au2_blk_sad[14] + au2_blk_sad[12] + au2_blk_sad[13];

        /* Halves and the full block */
        S32 i4_top = i4_tr + i4_tl;
        S32 i4_bot = i4_br + i4_bl;
        S32 i4_full = i4_top + i4_bot;

        pi4_grid[PART_ID_NxN_TL] = i4_tl;
        pi4_grid[PART_ID_NxN_TR] = i4_tr;
        pi4_grid[PART_ID_NxN_BL] = i4_bl;
        pi4_grid[PART_ID_NxN_BR] = i4_br;

        pi4_grid[PART_ID_Nx2N_L] = i4_tl + i4_bl;
        pi4_grid[PART_ID_Nx2N_R] = i4_tr + i4_br;
        pi4_grid[PART_ID_2NxN_T] = i4_top;
        pi4_grid[PART_ID_2NxN_B] = i4_bot;

        pi4_grid[PART_ID_nLx2N_L] = i4_left_col;
        pi4_grid[PART_ID_nRx2N_R] = i4_right_col;
        pi4_grid[PART_ID_2NxnU_T] = i4_top_row;
        pi4_grid[PART_ID_2NxnD_B] = i4_bot_row;

        pi4_grid[PART_ID_2Nx2N] = i4_full;

        /* Larger side of each asymmetric split = full block minus the strip */
        pi4_grid[PART_ID_2NxnU_B] = i4_full - i4_top_row;
        pi4_grid[PART_ID_2NxnD_T] = i4_full - i4_bot_row;
        pi4_grid[PART_ID_nRx2N_L] = i4_full - i4_right_col;
        pi4_grid[PART_ID_nLx2N_R] = i4_full - i4_left_col;
    }

    /**************************************************************************/
    /* Best / second best update for every valid partition                    */
    /**************************************************************************/
    for(i4_part = 0; i4_part < ps_ctxt->i4_num_valid_parts; i4_part++)
    {
        S32 i4_part_id = pi4_part_ids[i4_part];
        /* Results are stored by position in the valid list when at most 8   */
        /* partitions are valid, otherwise by partition id                    */
        S32 i4_slot = (ps_ctxt->i4_num_valid_parts > 8) ? i4_part_id : i4_part;

        /* The MV cost is not re-evaluated: the cost stored with the current */
        /* best entry is reused                                               */
        S32 i4_mv_cost = ps_ctxt->i2_mv_cost[0][i4_slot];

        S32 i4_sad = CLIP3(pi4_grid[i4_part_id], 0, 0x7fff);
        S32 i4_tot_cost = CLIP_S16(i4_sad + i4_mv_cost);

        S32 i4_best_cost = CLIP_S16(ps_ctxt->i2_tot_cost[0][i4_slot]);
        S32 i4_second_cost = CLIP_S16(ps_ctxt->i2_tot_cost[1][i4_slot]);

        /* Not better than the second best: nothing to update */
        if(i4_tot_cost >= i4_second_cost)
        {
            continue;
        }

        if(i4_tot_cost < i4_best_cost)
        {
            /* New best: the previous best is demoted to second best */
            ps_ctxt->i2_tot_cost[1][i4_slot] = ps_ctxt->i2_tot_cost[0][i4_slot];
            ps_ctxt->i2_mv_cost[1][i4_slot] = ps_ctxt->i2_mv_cost[0][i4_slot];
            ps_ctxt->i2_mv_x[1][i4_slot] = ps_ctxt->i2_mv_x[0][i4_slot];
            ps_ctxt->i2_mv_y[1][i4_slot] = ps_ctxt->i2_mv_y[0][i4_slot];
            ps_ctxt->i2_ref_idx[1][i4_slot] = ps_ctxt->i2_ref_idx[0][i4_slot];

            ps_ctxt->i2_tot_cost[0][i4_slot] = i4_tot_cost;
            ps_ctxt->i2_mv_cost[0][i4_slot] = i4_mv_cost;
            ps_ctxt->i2_mv_x[0][i4_slot] = ps_result_prms->i2_mv_x;
            ps_ctxt->i2_mv_y[0][i4_slot] = ps_result_prms->i2_mv_y;
            ps_ctxt->i2_ref_idx[0][i4_slot] = ps_result_prms->i1_ref_idx;
        }
        else if(i4_tot_cost != ps_ctxt->i2_tot_cost[0][i4_slot])
        {
            /* Between best and second best: replaces the second best. A    */
            /* candidate whose cost equals the stored best cost is dropped. */
            ps_ctxt->i2_tot_cost[1][i4_slot] = i4_tot_cost;
            ps_ctxt->i2_mv_cost[1][i4_slot] = i4_mv_cost;
            ps_ctxt->i2_mv_x[1][i4_slot] = ps_result_prms->i2_mv_x;
            ps_ctxt->i2_mv_y[1][i4_slot] = ps_result_prms->i2_mv_y;
            ps_ctxt->i2_ref_idx[1][i4_slot] = ps_result_prms->i1_ref_idx;
        }
    }

    /**************************************************************************/
    /* Entries whose total cost is still at the maximum also get their        */
    /* fullpel SATD set to the maximum                                        */
    /**************************************************************************/
    for(i4_rank = 0; i4_rank < 2; i4_rank++)
    {
        for(i4_part = 0; i4_part < TOT_NUM_PARTS; i4_part++)
        {
            if(ps_ctxt->i2_tot_cost[i4_rank][i4_part] >= MAX_SIGNED_16BIT_VAL)
            {
                ps_ctxt->ai2_fullpel_satd[i4_rank][i4_part] = MAX_SIGNED_16BIT_VAL;
            }
        }
    }
}
2492 
hme_calc_stim_injected_sad_and_2_best_results(hme_search_prms_t * ps_search_prms,wgt_pred_ctxt_t * ps_wt_inp_prms,err_prms_t * ps_err_prms,result_upd_prms_t * ps_result_prms,U08 ** ppu1_ref,S32 i4_ref_stride)2493 void hme_calc_stim_injected_sad_and_2_best_results(
2494     hme_search_prms_t *ps_search_prms,
2495     wgt_pred_ctxt_t *ps_wt_inp_prms,
2496     err_prms_t *ps_err_prms,
2497     result_upd_prms_t *ps_result_prms,
2498     U08 **ppu1_ref,
2499     S32 i4_ref_stride)
2500 {
2501     mv_refine_ctxt_t *ps_mv_refine_ctxt;
2502     search_node_t *ps_search_node;
2503 
2504     S32 i4_candt;
2505     S32 i4_count;
2506     S32 i4_inp_off;
2507     S32 i4_ref_offset;
2508     S32 i4_num_nodes;
2509     ULWORD64 *au8_final_src_sigmaX, *au8_final_src_sigmaXSquared, au8_final_ref_sigmaX[17],
2510         au8_final_ref_sigmaXSquared[17];
2511     UWORD32 au4_4x4_ref_sigmaX[NUM_4X4], au4_4x4_ref_sigmaXSquared[NUM_4X4];
2512     S32 *pi4_valid_part_ids;
2513 
2514     S32 *pi4_sad_grid = ps_err_prms->pi4_sad_grid;
2515     S32 cur_buf_stride = ps_err_prms->i4_inp_stride;
2516     WORD32 ref_buf_stride = ps_err_prms->i4_ref_stride;
2517     WORD32 cur_buf_stride_ls2 = (cur_buf_stride << 2);
2518     WORD32 ref_buf_stride_ls2 = (ref_buf_stride << 2);
2519 
2520     ps_mv_refine_ctxt = ps_search_prms->ps_fullpel_refine_ctxt;
2521     i4_num_nodes = ps_search_prms->i4_num_search_nodes;
2522     i4_inp_off = ps_search_prms->i4_cu_x_off;
2523     i4_inp_off += ps_search_prms->i4_cu_y_off * cur_buf_stride;
2524     i4_ref_offset = (i4_ref_stride * ps_search_prms->i4_y_off) + ps_search_prms->i4_x_off;
2525     ps_search_node = ps_search_prms->ps_search_nodes;
2526     pi4_valid_part_ids = &ps_mv_refine_ctxt->ai4_part_id[0];
2527 
2528     /* Set local pointer to point to partition level sigma values calculated in hme_refine */
2529     au8_final_src_sigmaX = ps_search_prms->pu8_part_src_sigmaX;
2530     au8_final_src_sigmaXSquared = ps_search_prms->pu8_part_src_sigmaXSquared;
2531 
2532     for(i4_candt = 0; i4_candt < i4_num_nodes; i4_candt++)
2533     {
2534         {
2535             WORD32 b, c, d;
2536             UWORD8 *pu1_cur_ptr;
2537             UWORD8 *pu1_ref_ptr;
2538             UWORD16 au2_4x4_sad[NUM_4X4];
2539 
2540             if(ps_search_node->s_mv.i2_mvx == INTRA_MV)
2541             {
2542                 continue;
2543             }
2544 
2545             ps_err_prms->pu1_inp =
2546                 ps_wt_inp_prms->apu1_wt_inp[ps_search_node->i1_ref_idx] + i4_inp_off;
2547             ps_err_prms->pu1_ref = ppu1_ref[ps_search_node->i1_ref_idx] + i4_ref_offset;
2548             ps_err_prms->pu1_ref += ps_search_node->s_mv.i2_mvx;
2549             ps_err_prms->pu1_ref += (ps_search_node->s_mv.i2_mvy * i4_ref_stride);
2550 
2551             pu1_cur_ptr = ps_err_prms->pu1_inp;
2552             pu1_ref_ptr = &ps_err_prms->pu1_ref[0];
2553 
2554             /* Loop to compute the SAD's */
2555             {
2556                 memset(&au2_4x4_sad[0], 0, NUM_4X4 * sizeof(UWORD16));
2557                 for(b = 0; b < NUM_4X4; b++)
2558                 {
2559                     WORD32 t1 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * cur_buf_stride_ls2;
2560                     WORD32 t2 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * ref_buf_stride_ls2;
2561 
2562                     for(c = 0; c < NUM_ROWS_IN_4X4; c++)
2563                     {
2564                         WORD32 z_cur = (cur_buf_stride)*c + t1;
2565                         WORD32 z_ref = (ref_buf_stride)*c + t2;
2566                         for(d = 0; d < NUM_PIXELS_IN_ROW; d++)
2567                         {
2568                             au2_4x4_sad[b] += (UWORD16)ABS((
2569                                 ((S32)pu1_ref_ptr[(z_ref + d)]) - ((S32)pu1_cur_ptr[(z_cur + d)])));
2570                         }
2571                     }
2572                 }
2573 
2574                 /* Compute sigmaX and sigmaX_Squared at 4x4 level for ref from ref_ptr */
2575                 hme_compute_sigmaX_and_sigmaXSquared(
2576                     pu1_ref_ptr,
2577                     ref_buf_stride,
2578                     au4_4x4_ref_sigmaX,
2579                     au4_4x4_ref_sigmaXSquared,
2580                     4,
2581                     4,
2582                     16,
2583                     16,
2584                     1,
2585                     4);
2586 
2587                 pi4_sad_grid[PART_ID_NxN_TL] =
2588                     (au2_4x4_sad[0] + au2_4x4_sad[1] + au2_4x4_sad[4] + au2_4x4_sad[5]);
2589                 pi4_sad_grid[PART_ID_NxN_TR] =
2590                     (au2_4x4_sad[2] + au2_4x4_sad[3] + au2_4x4_sad[6] + au2_4x4_sad[7]);
2591                 pi4_sad_grid[PART_ID_NxN_BL] =
2592                     (au2_4x4_sad[8] + au2_4x4_sad[9] + au2_4x4_sad[12] + au2_4x4_sad[13]);
2593                 pi4_sad_grid[PART_ID_NxN_BR] =
2594                     (au2_4x4_sad[10] + au2_4x4_sad[11] + au2_4x4_sad[14] + au2_4x4_sad[15]);
2595                 pi4_sad_grid[PART_ID_Nx2N_L] =
2596                     pi4_sad_grid[PART_ID_NxN_TL] + pi4_sad_grid[PART_ID_NxN_BL];
2597                 pi4_sad_grid[PART_ID_Nx2N_R] =
2598                     pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_BR];
2599                 pi4_sad_grid[PART_ID_2NxN_T] =
2600                     pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_TL];
2601                 pi4_sad_grid[PART_ID_2NxN_B] =
2602                     pi4_sad_grid[PART_ID_NxN_BR] + pi4_sad_grid[PART_ID_NxN_BL];
2603                 pi4_sad_grid[PART_ID_nLx2N_L] =
2604                     (au2_4x4_sad[8] + au2_4x4_sad[0] + au2_4x4_sad[12] + au2_4x4_sad[4]);
2605                 pi4_sad_grid[PART_ID_nRx2N_R] =
2606                     (au2_4x4_sad[3] + au2_4x4_sad[7] + au2_4x4_sad[15] + au2_4x4_sad[11]);
2607                 pi4_sad_grid[PART_ID_2NxnU_T] =
2608                     (au2_4x4_sad[1] + au2_4x4_sad[0] + au2_4x4_sad[2] + au2_4x4_sad[3]);
2609                 pi4_sad_grid[PART_ID_2NxnD_B] =
2610                     (au2_4x4_sad[15] + au2_4x4_sad[14] + au2_4x4_sad[12] + au2_4x4_sad[13]);
2611                 pi4_sad_grid[PART_ID_2Nx2N] =
2612                     pi4_sad_grid[PART_ID_2NxN_T] + pi4_sad_grid[PART_ID_2NxN_B];
2613                 pi4_sad_grid[PART_ID_2NxnU_B] =
2614                     pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnU_T];
2615                 pi4_sad_grid[PART_ID_2NxnD_T] =
2616                     pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnD_B];
2617                 pi4_sad_grid[PART_ID_nRx2N_L] =
2618                     pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nRx2N_R];
2619                 pi4_sad_grid[PART_ID_nLx2N_R] =
2620                     pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nLx2N_L];
2621             }
2622         }
2623 
2624         {
2625             S32 i4_sad, i4_mv_cost, i4_tot_cost;
2626             S32 best_node_cost;
2627             S32 second_best_node_cost;
2628             ULWORD64 u8_temp_var, u8_temp_var1;
2629             ULWORD64 u8_ref_X_Square, u8_pure_dist, u8_src_var, u8_ref_var;
2630 
2631             {
2632                 S16 mvdx1, mvdy1;
2633                 S32 i4_search_idx = (S32)ps_result_prms->i1_ref_idx;
2634                 search_results_t *ps_search_results = ps_result_prms->ps_search_results;
2635                 S32 pred_lx = i4_search_idx;
2636 
2637                 pred_ctxt_t *ps_pred_ctxt = &ps_search_results->as_pred_ctxt[pred_lx];
2638                 pred_candt_nodes_t *ps_pred_nodes = &ps_pred_ctxt->as_pred_nodes[PART_2Nx2N];
2639                 search_node_t *ps_pred_node_a = ps_pred_nodes->ps_mvp_node;
2640 
2641                 S32 inp_shift = 2;
2642                 S32 pred_shift = ps_pred_node_a->u1_subpel_done ? 0 : 2;
2643                 S32 lambda_q_shift = ps_pred_ctxt->lambda_q_shift;
2644                 S32 lambda = ps_pred_ctxt->lambda;
2645                 S32 rnd = 1 << (lambda_q_shift - 1);
2646                 S32 mv_p_x = ps_pred_node_a->s_mv.i2_mvx;
2647                 S32 mv_p_y = ps_pred_node_a->s_mv.i2_mvy;
2648                 S32 ref_bits =
2649                     ps_pred_ctxt
2650                         ->ppu1_ref_bits_tlu[ps_pred_ctxt->pred_lx][ps_search_node->i1_ref_idx];
2651 
2652                 COMPUTE_DIFF_MV(
2653                     mvdx1, mvdy1, ps_search_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
2654 
2655                 mvdx1 = ABS(mvdx1);
2656                 mvdy1 = ABS(mvdy1);
2657 
2658                 i4_mv_cost = hme_get_range(mvdx1) + hme_get_range(mvdy1) + (mvdx1 > 0) +
2659                              (mvdy1 > 0) + ref_bits + 2;
2660 
2661                 i4_mv_cost *= lambda;
2662                 i4_mv_cost += rnd;
2663                 i4_mv_cost >>= lambda_q_shift;
2664 
2665                 i4_mv_cost = CLIP_U16(i4_mv_cost);
2666             }
2667 
2668             for(i4_count = 0; i4_count < ps_mv_refine_ctxt->i4_num_valid_parts; i4_count++)
2669             {
2670                 S32 i4_stim_injected_sad;
2671                 S32 i4_stim_injected_cost;
2672                 S32 i4_noise_term;
2673                 unsigned long u4_shift_val;
2674                 S32 i4_bits_req;
2675 
2676                 S32 update_required = 0;
2677                 S32 part_id = pi4_valid_part_ids[i4_count];
2678                 S32 index = (ps_mv_refine_ctxt->i4_num_valid_parts > 8) ? part_id : i4_count;
2679 
2680                 WORD32 i4_q_level = STIM_Q_FORMAT + ALPHA_Q_FORMAT;
2681 
2682                 S32 i4_inv_wt = ps_wt_inp_prms->a_inv_wpred_wt[ps_search_node->i1_ref_idx];
2683 
2684                 if(ps_search_prms->i4_alpha_stim_multiplier)
2685                 {
2686                     /* Compute ref sigmaX and sigmaX_Squared values for valid partitions from previously computed ref 4x4 level values */
2687                     hme_compute_final_sigma_of_pu_from_base_blocks(
2688                         au4_4x4_ref_sigmaX,
2689                         au4_4x4_ref_sigmaXSquared,
2690                         au8_final_ref_sigmaX,
2691                         au8_final_ref_sigmaXSquared,
2692                         16,
2693                         4,
2694                         part_id,
2695                         4);
2696 
2697                     u8_ref_X_Square =
2698                         (au8_final_ref_sigmaX[part_id] * au8_final_ref_sigmaX[part_id]);
2699                     u8_ref_var = (au8_final_ref_sigmaXSquared[part_id] - u8_ref_X_Square);
2700 
2701                     /* Multiply un-normalized src_var with inv_wt if its not same as default wt */
2702                     /* and shift the resulting src_var if its more than 27 bits to avoid overflow */
2703                     /* The amount by which it is shifted is passed on to u4_shift_val and applied equally on ref_var */
2704                     u4_shift_val = ihevce_calc_stim_injected_variance(
2705                         au8_final_src_sigmaX,
2706                         au8_final_src_sigmaXSquared,
2707                         &u8_src_var,
2708                         i4_inv_wt,
2709                         ps_wt_inp_prms->ai4_shift_val[ps_search_node->i1_ref_idx],
2710                         ps_wt_inp_prms->wpred_log_wdc,
2711                         part_id);
2712 
2713                     u8_ref_var = u8_ref_var >> u4_shift_val;
2714 
2715                     /* Do the same check on ref_var to avoid overflow and apply similar shift on src_var */
2716                     GETRANGE64(i4_bits_req, u8_ref_var);
2717 
2718                     if(i4_bits_req > 27)
2719                     {
2720                         u8_ref_var = u8_ref_var >> (i4_bits_req - 27);
2721                         u8_src_var = u8_src_var >> (i4_bits_req - 27);
2722                     }
2723 
2724                     if(u8_src_var == u8_ref_var)
2725                     {
2726                         u8_temp_var = (1 << STIM_Q_FORMAT);
2727                     }
2728                     else
2729                     {
2730                         u8_temp_var = (2 * u8_src_var * u8_ref_var);
2731                         u8_temp_var = (u8_temp_var * (1 << STIM_Q_FORMAT));
2732                         u8_temp_var1 = (u8_src_var * u8_src_var) + (u8_ref_var * u8_ref_var);
2733                         u8_temp_var = (u8_temp_var + (u8_temp_var1 / 2));
2734                         u8_temp_var = (u8_temp_var / u8_temp_var1);
2735                     }
2736 
2737                     i4_noise_term = (UWORD32)u8_temp_var;
2738 
2739                     ASSERT(i4_noise_term >= 0);
2740 
2741                     i4_noise_term *= ps_search_prms->i4_alpha_stim_multiplier;
2742                 }
2743                 else
2744                 {
2745                     i4_noise_term = 0;
2746                 }
2747                 u8_pure_dist = pi4_sad_grid[part_id];
2748                 u8_pure_dist *= ((1 << (i4_q_level)) - (i4_noise_term));
2749                 u8_pure_dist += (1 << ((i4_q_level)-1));
2750                 i4_stim_injected_sad = (UWORD32)(u8_pure_dist >> (i4_q_level));
2751 
2752                 i4_sad = CLIP3(pi4_sad_grid[part_id], 0, 0x7fff);
2753                 i4_tot_cost = CLIP_S16(i4_sad + i4_mv_cost);
2754                 i4_stim_injected_sad = CLIP3(i4_stim_injected_sad, 0, 0x7fff);
2755                 i4_stim_injected_cost = CLIP_S16(i4_stim_injected_sad + i4_mv_cost);
2756 
2757                 best_node_cost = CLIP_S16(ps_mv_refine_ctxt->i2_stim_injected_cost[0][index]);
2758                 second_best_node_cost =
2759                     CLIP_S16(ps_mv_refine_ctxt->i2_stim_injected_cost[1][index]);
2760 
2761                 if(i4_stim_injected_cost < second_best_node_cost)
2762                 {
2763                     update_required = 2;
2764 
2765                     if(i4_stim_injected_cost < best_node_cost)
2766                     {
2767                         update_required = 1;
2768                     }
2769                     else if(i4_stim_injected_cost == best_node_cost)
2770                     {
2771                         update_required = 0;
2772                     }
2773 
2774                     if(update_required == 2)
2775                     {
2776                         ps_mv_refine_ctxt->i2_tot_cost[1][index] = i4_tot_cost;
2777                         ps_mv_refine_ctxt->i2_stim_injected_cost[1][index] = i4_stim_injected_cost;
2778                         ps_mv_refine_ctxt->i2_mv_cost[1][index] = i4_mv_cost;
2779                         ps_mv_refine_ctxt->i2_mv_x[1][index] = ps_search_node->s_mv.i2_mvx;
2780                         ps_mv_refine_ctxt->i2_mv_y[1][index] = ps_search_node->s_mv.i2_mvy;
2781                         ps_mv_refine_ctxt->i2_ref_idx[1][index] = ps_search_node->i1_ref_idx;
2782                     }
2783                     else if(update_required == 1)
2784                     {
2785                         ps_mv_refine_ctxt->i2_tot_cost[1][index] =
2786                             ps_mv_refine_ctxt->i2_tot_cost[0][index];
2787                         ps_mv_refine_ctxt->i2_stim_injected_cost[1][index] =
2788                             ps_mv_refine_ctxt->i2_stim_injected_cost[0][index];
2789                         ps_mv_refine_ctxt->i2_mv_cost[1][index] =
2790                             ps_mv_refine_ctxt->i2_mv_cost[0][index];
2791                         ps_mv_refine_ctxt->i2_mv_x[1][index] = ps_mv_refine_ctxt->i2_mv_x[0][index];
2792                         ps_mv_refine_ctxt->i2_mv_y[1][index] = ps_mv_refine_ctxt->i2_mv_y[0][index];
2793                         ps_mv_refine_ctxt->i2_ref_idx[1][index] =
2794                             ps_mv_refine_ctxt->i2_ref_idx[0][index];
2795 
2796                         ps_mv_refine_ctxt->i2_tot_cost[0][index] = i4_tot_cost;
2797                         ps_mv_refine_ctxt->i2_stim_injected_cost[0][index] = i4_stim_injected_cost;
2798                         ps_mv_refine_ctxt->i2_mv_cost[0][index] = i4_mv_cost;
2799                         ps_mv_refine_ctxt->i2_mv_x[0][index] = ps_search_node->s_mv.i2_mvx;
2800                         ps_mv_refine_ctxt->i2_mv_y[0][index] = ps_search_node->s_mv.i2_mvy;
2801                         ps_mv_refine_ctxt->i2_ref_idx[0][index] = ps_search_node->i1_ref_idx;
2802                     }
2803                 }
2804             }
2805         }
2806 
2807         ps_search_node++;
2808     }
2809 
2810     {
2811         WORD32 i4_i;
2812         WORD32 part_id;
2813         search_node_t *ps_search_node = ps_search_prms->ps_search_nodes;
2814         for(i4_i = 0; i4_i < ps_mv_refine_ctxt->i4_num_valid_parts; i4_i++)
2815         {
2816             part_id = ps_mv_refine_ctxt->ai4_part_id[i4_i];
2817             if(ps_mv_refine_ctxt->i2_stim_injected_cost[0][part_id] >= MAX_SIGNED_16BIT_VAL)
2818             {
2819                 ASSERT(ps_mv_refine_ctxt->i2_mv_cost[0][part_id] == MAX_SIGNED_16BIT_VAL);
2820                 ASSERT(ps_mv_refine_ctxt->i2_mv_x[0][part_id] == 0);
2821                 ASSERT(ps_mv_refine_ctxt->i2_mv_y[0][part_id] == 0);
2822 
2823                 ps_mv_refine_ctxt->i2_ref_idx[0][part_id] = ps_search_node->i1_ref_idx;
2824             }
2825             if(ps_mv_refine_ctxt->i2_stim_injected_cost[1][part_id] >= MAX_SIGNED_16BIT_VAL)
2826             {
2827                 ASSERT(ps_mv_refine_ctxt->i2_mv_cost[1][part_id] == MAX_SIGNED_16BIT_VAL);
2828                 ASSERT(ps_mv_refine_ctxt->i2_mv_x[1][part_id] == 0);
2829                 ASSERT(ps_mv_refine_ctxt->i2_mv_y[1][part_id] == 0);
2830 
2831                 ps_mv_refine_ctxt->i2_ref_idx[1][part_id] = ps_search_node->i1_ref_idx;
2832             }
2833         }
2834     }
2835 }
2836 
/**
********************************************************************************
*  @fn     hme_calc_sad_and_1_best_result
*
*  @brief  Evaluates every non-intra search candidate: computes the SAD of
*          each 4x4 sub-block, combines them into the SAD of all 17
*          partitions, and keeps the single lowest-cost candidate per valid
*          partition in the fullpel refine context.
*
*  @param[in]     ps_search_prms : search parameters; supplies the candidate
*                                  list, its length, the CU / reference
*                                  offsets and the fullpel refine context
*                                  that receives the results
*  @param[in]     ps_wt_inp_prms : supplies the (weighted) input plane to use
*                                  for each reference index
*  @param[in,out] ps_err_prms    : supplies the strides and the SAD grid;
*                                  pu1_inp / pu1_ref are overwritten for every
*                                  evaluated candidate
*  @param[in]     ps_result_prms : supplies the prediction context (MV
*                                  predictor, lambda) used for the MV cost
*  @param[in]     ppu1_ref       : reference plane pointers, one per
*                                  reference index
*  @param[in]     i4_ref_stride  : stride used to locate the candidate block
*                                  inside the reference plane
*
*  @return None
*
*  @remarks Only row 0 (best result) of the refine context arrays is updated.
*           On equal cost the earlier candidate is retained.
********************************************************************************
*/
void hme_calc_sad_and_1_best_result(
    hme_search_prms_t *ps_search_prms,
    wgt_pred_ctxt_t *ps_wt_inp_prms,
    err_prms_t *ps_err_prms,
    result_upd_prms_t *ps_result_prms,
    U08 **ppu1_ref,
    S32 i4_ref_stride)
{
    S32 i4_candt;
    S32 i4_inp_off;
    S32 i4_ref_offset;
    S32 i4_num_nodes;

    S32 *pi4_sad_grid = ps_err_prms->pi4_sad_grid;
    S32 cur_buf_stride = ps_err_prms->i4_inp_stride;
    WORD32 ref_buf_stride = ps_err_prms->i4_ref_stride;
    /* Stride of one row of 4x4 blocks (4 pixel rows) in each buffer */
    WORD32 cur_buf_stride_ls2 = (cur_buf_stride << 2);
    WORD32 ref_buf_stride_ls2 = (ref_buf_stride << 2);

    mv_refine_ctxt_t *ps_mv_refine_ctxt;
    search_node_t *ps_search_node;

    ps_mv_refine_ctxt = ps_search_prms->ps_fullpel_refine_ctxt;
    i4_num_nodes = ps_search_prms->i4_num_search_nodes;
    i4_inp_off = ps_search_prms->i4_cu_x_off;
    i4_inp_off += ps_search_prms->i4_cu_y_off * cur_buf_stride;
    i4_ref_offset = (i4_ref_stride * ps_search_prms->i4_y_off) + ps_search_prms->i4_x_off;
    ps_search_node = ps_search_prms->ps_search_nodes;

    /* The node pointer is advanced in the loop header so that the 'continue' */
    /* taken for intra candidates still moves on to the next candidate.       */
    /* Advancing it at the end of the body left the pointer stuck on the      */
    /* first intra node, which dropped every candidate that followed it.      */
    for(i4_candt = 0; i4_candt < i4_num_nodes; i4_candt++, ps_search_node++)
    {
        /**********************************************************************/
        /* COMPUTE THE 4x4 SADS AND UPDATE THE PARTITION SAD GRID             */
        /**********************************************************************/
        {
            WORD32 b, c, d;
            UWORD8 *pu1_cur_ptr;
            UWORD8 *pu1_ref_ptr;
            UWORD16 au2_4x4_sad[NUM_4X4];

            /* Intra candidates carry no motion vector to evaluate */
            if(ps_search_node->s_mv.i2_mvx == INTRA_MV)
            {
                continue;
            }

            /* Input plane is selected per reference index; the reference     */
            /* pointer is displaced by the candidate's motion vector          */
            ps_err_prms->pu1_inp =
                ps_wt_inp_prms->apu1_wt_inp[ps_search_node->i1_ref_idx] + i4_inp_off;
            ps_err_prms->pu1_ref = ppu1_ref[ps_search_node->i1_ref_idx] + i4_ref_offset;
            ps_err_prms->pu1_ref += ps_search_node->s_mv.i2_mvx;
            ps_err_prms->pu1_ref += (ps_search_node->s_mv.i2_mvy * i4_ref_stride);

            pu1_cur_ptr = ps_err_prms->pu1_inp;
            pu1_ref_ptr = &ps_err_prms->pu1_ref[0];

            /* SAD of every 4x4 block; blocks are numbered in raster order    */
            /* with four blocks per row                                       */
            memset(&au2_4x4_sad[0], 0, NUM_4X4 * sizeof(UWORD16));
            for(b = 0; b < NUM_4X4; b++)
            {
                /* Offsets of the top-left pixel of block b in each buffer */
                WORD32 t1 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * cur_buf_stride_ls2;
                WORD32 t2 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * ref_buf_stride_ls2;

                for(c = 0; c < NUM_ROWS_IN_4X4; c++)
                {
                    WORD32 z_cur = (cur_buf_stride)*c + t1;
                    WORD32 z_ref = (ref_buf_stride)*c + t2;
                    for(d = 0; d < NUM_PIXELS_IN_ROW; d++)
                    {
                        au2_4x4_sad[b] += (UWORD16)ABS((
                            ((S32)pu1_ref_ptr[(z_ref + d)]) - ((S32)pu1_cur_ptr[(z_cur + d)])));
                    }
                }
            }

            /* Quadrant (NxN) SADs from the four 4x4 blocks of each quadrant */
            pi4_sad_grid[PART_ID_NxN_TL] =
                (au2_4x4_sad[0] + au2_4x4_sad[1] + au2_4x4_sad[4] + au2_4x4_sad[5]);
            pi4_sad_grid[PART_ID_NxN_TR] =
                (au2_4x4_sad[2] + au2_4x4_sad[3] + au2_4x4_sad[6] + au2_4x4_sad[7]);
            pi4_sad_grid[PART_ID_NxN_BL] =
                (au2_4x4_sad[8] + au2_4x4_sad[9] + au2_4x4_sad[12] + au2_4x4_sad[13]);
            pi4_sad_grid[PART_ID_NxN_BR] =
                (au2_4x4_sad[10] + au2_4x4_sad[11] + au2_4x4_sad[14] + au2_4x4_sad[15]);

            /* Symmetric halves are sums of two quadrants */
            pi4_sad_grid[PART_ID_Nx2N_L] =
                pi4_sad_grid[PART_ID_NxN_TL] + pi4_sad_grid[PART_ID_NxN_BL];
            pi4_sad_grid[PART_ID_Nx2N_R] =
                pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_BR];
            pi4_sad_grid[PART_ID_2NxN_T] =
                pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_TL];
            pi4_sad_grid[PART_ID_2NxN_B] =
                pi4_sad_grid[PART_ID_NxN_BR] + pi4_sad_grid[PART_ID_NxN_BL];

            /* Narrow asymmetric parts: one column / row of 4x4 blocks */
            pi4_sad_grid[PART_ID_nLx2N_L] =
                (au2_4x4_sad[8] + au2_4x4_sad[0] + au2_4x4_sad[12] + au2_4x4_sad[4]);
            pi4_sad_grid[PART_ID_nRx2N_R] =
                (au2_4x4_sad[3] + au2_4x4_sad[7] + au2_4x4_sad[15] + au2_4x4_sad[11]);
            pi4_sad_grid[PART_ID_2NxnU_T] =
                (au2_4x4_sad[1] + au2_4x4_sad[0] + au2_4x4_sad[2] + au2_4x4_sad[3]);
            pi4_sad_grid[PART_ID_2NxnD_B] =
                (au2_4x4_sad[15] + au2_4x4_sad[14] + au2_4x4_sad[12] + au2_4x4_sad[13]);

            /* Whole block, then the wide asymmetric parts as its complement */
            pi4_sad_grid[PART_ID_2Nx2N] =
                pi4_sad_grid[PART_ID_2NxN_T] + pi4_sad_grid[PART_ID_2NxN_B];
            pi4_sad_grid[PART_ID_2NxnU_B] =
                pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnU_T];
            pi4_sad_grid[PART_ID_2NxnD_T] =
                pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnD_B];
            pi4_sad_grid[PART_ID_nRx2N_L] =
                pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nRx2N_R];
            pi4_sad_grid[PART_ID_nLx2N_R] =
                pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nLx2N_L];
        }

        /**********************************************************************/
        /* COMPUTE THE MV COST AND UPDATE THE BEST RESULT OF EACH PARTITION   */
        /**********************************************************************/
        {
            S32 i4_count, i4_sad, i4_mv_cost, i4_tot_cost;
            S32 *pi4_valid_part_ids = &ps_mv_refine_ctxt->ai4_part_id[0];
            S32 best_node_cost;

            /* MV cost of this candidate; it is the same for all partitions   */
            /* because the 2Nx2N predictor is always used                     */
            {
                S16 mvdx1, mvdy1;
                S32 i4_search_idx = (S32)ps_result_prms->i1_ref_idx;
                search_results_t *ps_search_results = ps_result_prms->ps_search_results;
                S32 pred_lx = i4_search_idx;

                pred_ctxt_t *ps_pred_ctxt = &ps_search_results->as_pred_ctxt[pred_lx];
                pred_candt_nodes_t *ps_pred_nodes = &ps_pred_ctxt->as_pred_nodes[PART_2Nx2N];
                search_node_t *ps_pred_node_a = ps_pred_nodes->ps_mvp_node;

                S32 inp_shift = 2;
                /* NOTE(review): presumably brings a fullpel predictor to the */
                /* same units as the shifted candidate MV - confirm           */
                S32 pred_shift = ps_pred_node_a->u1_subpel_done ? 0 : 2;
                S32 lambda_q_shift = ps_pred_ctxt->lambda_q_shift;
                S32 lambda = ps_pred_ctxt->lambda;
                S32 rnd = 1 << (lambda_q_shift - 1);
                S32 mv_p_x = ps_pred_node_a->s_mv.i2_mvx;
                S32 mv_p_y = ps_pred_node_a->s_mv.i2_mvy;
                S32 ref_bits =
                    ps_pred_ctxt
                        ->ppu1_ref_bits_tlu[ps_pred_ctxt->pred_lx][ps_search_node->i1_ref_idx];

                COMPUTE_DIFF_MV(
                    mvdx1, mvdy1, ps_search_node, mv_p_x, mv_p_y, inp_shift, pred_shift);

                mvdx1 = ABS(mvdx1);
                mvdy1 = ABS(mvdy1);

                /* Bit estimate: range of each MV difference, one extra for   */
                /* each non-zero component, the reference index bits and 2    */
                i4_mv_cost = hme_get_range(mvdx1) + hme_get_range(mvdy1) + (mvdx1 > 0) +
                             (mvdy1 > 0) + ref_bits + 2;

                /* Scale by lambda with rounding */
                i4_mv_cost *= lambda;
                i4_mv_cost += rnd;
                i4_mv_cost >>= lambda_q_shift;

                i4_mv_cost = CLIP_U16(i4_mv_cost);
            }

            /* For each valid partition, replace the stored best candidate    */
            /* when the current candidate has a strictly lower total cost     */
            for(i4_count = 0; i4_count < ps_mv_refine_ctxt->i4_num_valid_parts; i4_count++)
            {
                S32 part_id = pi4_valid_part_ids[i4_count];
                /* Results are stored per partition id when more than 8       */
                /* partitions are valid, otherwise in compacted order         */
                S32 index = (ps_mv_refine_ctxt->i4_num_valid_parts > 8) ? part_id : i4_count;

                /* Calculate total cost */
                i4_sad = CLIP3(pi4_sad_grid[part_id], 0, 0x7fff);
                i4_tot_cost = CLIP_S16(i4_sad + i4_mv_cost);

                best_node_cost = CLIP_S16(ps_mv_refine_ctxt->i2_tot_cost[0][index]);

                /* Only one result is maintained, so there is no second-best  */
                /* slot to fill. Assumes CLIP_S16 never yields more than      */
                /* SHRT_MAX, which makes a separate ceiling check redundant.  */
                if(i4_tot_cost < best_node_cost)
                {
                    ps_mv_refine_ctxt->i2_tot_cost[0][index] = i4_tot_cost;
                    ps_mv_refine_ctxt->i2_mv_cost[0][index] = i4_mv_cost;
                    ps_mv_refine_ctxt->i2_mv_x[0][index] = ps_search_node->s_mv.i2_mvx;
                    ps_mv_refine_ctxt->i2_mv_y[0][index] = ps_search_node->s_mv.i2_mvy;
                    ps_mv_refine_ctxt->i2_ref_idx[0][index] = ps_search_node->i1_ref_idx;
                }
            }
        }
    }

    /* Partitions whose best cost is still at the maximum were not improved   */
    /* by any candidate; give them the reference index of the first node.     */
    /* NOTE(review): the arrays are indexed by part_id here, whereas the      */
    /* update loop above uses the compacted index when at most 8 partitions   */
    /* are valid - confirm that this is intended.                             */
    {
        WORD32 i4_i;
        WORD32 part_id;
        search_node_t *ps_first_node = ps_search_prms->ps_search_nodes;

        for(i4_i = 0; i4_i < ps_mv_refine_ctxt->i4_num_valid_parts; i4_i++)
        {
            part_id = ps_mv_refine_ctxt->ai4_part_id[i4_i];
            if(ps_mv_refine_ctxt->i2_tot_cost[0][part_id] >= MAX_SIGNED_16BIT_VAL)
            {
                ASSERT(ps_mv_refine_ctxt->i2_mv_cost[0][part_id] == MAX_SIGNED_16BIT_VAL);
                ASSERT(ps_mv_refine_ctxt->i2_mv_x[0][part_id] == 0);
                ASSERT(ps_mv_refine_ctxt->i2_mv_y[0][part_id] == 0);

                ps_mv_refine_ctxt->i2_ref_idx[0][part_id] = ps_first_node->i1_ref_idx;
            }
        }
    }
}
3068 
hme_calc_stim_injected_sad_and_1_best_result(hme_search_prms_t * ps_search_prms,wgt_pred_ctxt_t * ps_wt_inp_prms,err_prms_t * ps_err_prms,result_upd_prms_t * ps_result_prms,U08 ** ppu1_ref,S32 i4_ref_stride)3069 void hme_calc_stim_injected_sad_and_1_best_result(
3070     hme_search_prms_t *ps_search_prms,
3071     wgt_pred_ctxt_t *ps_wt_inp_prms,
3072     err_prms_t *ps_err_prms,
3073     result_upd_prms_t *ps_result_prms,
3074     U08 **ppu1_ref,
3075     S32 i4_ref_stride)
3076 {
3077     mv_refine_ctxt_t *ps_mv_refine_ctxt;
3078     search_node_t *ps_search_node;
3079 
3080     S32 i4_candt;
3081     S32 i4_count;
3082     S32 i4_inp_off;
3083     S32 i4_ref_offset;
3084     S32 i4_num_nodes;
3085     ULWORD64 *au8_final_src_sigmaX, *au8_final_src_sigmaXSquared, au8_final_ref_sigmaX[17],
3086         au8_final_ref_sigmaXSquared[17];
3087     UWORD32 au4_4x4_ref_sigmaX[NUM_4X4], au4_4x4_ref_sigmaXSquared[NUM_4X4];
3088     S32 *pi4_valid_part_ids;
3089 
3090     S32 *pi4_sad_grid = ps_err_prms->pi4_sad_grid;
3091     S32 cur_buf_stride = ps_err_prms->i4_inp_stride;
3092     WORD32 ref_buf_stride = ps_err_prms->i4_ref_stride;
3093     WORD32 cur_buf_stride_ls2 = (cur_buf_stride << 2);
3094     WORD32 ref_buf_stride_ls2 = (ref_buf_stride << 2);
3095 
3096     ps_mv_refine_ctxt = ps_search_prms->ps_fullpel_refine_ctxt;
3097     i4_num_nodes = ps_search_prms->i4_num_search_nodes;
3098     i4_inp_off = ps_search_prms->i4_cu_x_off;
3099     i4_inp_off += ps_search_prms->i4_cu_y_off * cur_buf_stride;
3100     i4_ref_offset = (i4_ref_stride * ps_search_prms->i4_y_off) + ps_search_prms->i4_x_off;
3101     ps_search_node = ps_search_prms->ps_search_nodes;
3102     pi4_valid_part_ids = &ps_mv_refine_ctxt->ai4_part_id[0];
3103 
3104     /* Set local pointer to point to partition level sigma values calculated in hme_refine */
3105     au8_final_src_sigmaX = ps_search_prms->pu8_part_src_sigmaX;
3106     au8_final_src_sigmaXSquared = ps_search_prms->pu8_part_src_sigmaXSquared;
3107 
3108     for(i4_candt = 0; i4_candt < i4_num_nodes; i4_candt++)
3109     {
3110         {
3111             WORD32 b, c, d;
3112             UWORD8 *pu1_cur_ptr;
3113             UWORD8 *pu1_ref_ptr;
3114             UWORD16 au2_4x4_sad[NUM_4X4];
3115 
3116             if(ps_search_node->s_mv.i2_mvx == INTRA_MV)
3117             {
3118                 continue;
3119             }
3120 
3121             ps_err_prms->pu1_inp =
3122                 ps_wt_inp_prms->apu1_wt_inp[ps_search_node->i1_ref_idx] + i4_inp_off;
3123             ps_err_prms->pu1_ref = ppu1_ref[ps_search_node->i1_ref_idx] + i4_ref_offset;
3124             ps_err_prms->pu1_ref += ps_search_node->s_mv.i2_mvx;
3125             ps_err_prms->pu1_ref += (ps_search_node->s_mv.i2_mvy * i4_ref_stride);
3126 
3127             pu1_cur_ptr = ps_err_prms->pu1_inp;
3128             pu1_ref_ptr = &ps_err_prms->pu1_ref[0];
3129 
3130             /* Loop to compute the SAD's */
3131             {
3132                 memset(&au2_4x4_sad[0], 0, NUM_4X4 * sizeof(UWORD16));
3133                 for(b = 0; b < NUM_4X4; b++)
3134                 {
3135                     WORD32 t1 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * cur_buf_stride_ls2;
3136                     WORD32 t2 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * ref_buf_stride_ls2;
3137 
3138                     for(c = 0; c < NUM_ROWS_IN_4X4; c++)
3139                     {
3140                         WORD32 z_cur = (cur_buf_stride)*c + t1;
3141                         WORD32 z_ref = (ref_buf_stride)*c + t2;
3142                         for(d = 0; d < NUM_PIXELS_IN_ROW; d++)
3143                         {
3144                             au2_4x4_sad[b] += (UWORD16)ABS((
3145                                 ((S32)pu1_ref_ptr[(z_ref + d)]) - ((S32)pu1_cur_ptr[(z_cur + d)])));
3146                         }
3147                     }
3148                 }
3149 
3150                 /* Compute sigmaX and sigmaX_Squared at 4x4 level for ref from ref_ptr */
3151                 hme_compute_sigmaX_and_sigmaXSquared(
3152                     pu1_ref_ptr,
3153                     ref_buf_stride,
3154                     au4_4x4_ref_sigmaX,
3155                     au4_4x4_ref_sigmaXSquared,
3156                     4,
3157                     4,
3158                     16,
3159                     16,
3160                     1,
3161                     4);
3162 
3163                 pi4_sad_grid[PART_ID_NxN_TL] =
3164                     (au2_4x4_sad[0] + au2_4x4_sad[1] + au2_4x4_sad[4] + au2_4x4_sad[5]);
3165                 pi4_sad_grid[PART_ID_NxN_TR] =
3166                     (au2_4x4_sad[2] + au2_4x4_sad[3] + au2_4x4_sad[6] + au2_4x4_sad[7]);
3167                 pi4_sad_grid[PART_ID_NxN_BL] =
3168                     (au2_4x4_sad[8] + au2_4x4_sad[9] + au2_4x4_sad[12] + au2_4x4_sad[13]);
3169                 pi4_sad_grid[PART_ID_NxN_BR] =
3170                     (au2_4x4_sad[10] + au2_4x4_sad[11] + au2_4x4_sad[14] + au2_4x4_sad[15]);
3171                 pi4_sad_grid[PART_ID_Nx2N_L] =
3172                     pi4_sad_grid[PART_ID_NxN_TL] + pi4_sad_grid[PART_ID_NxN_BL];
3173                 pi4_sad_grid[PART_ID_Nx2N_R] =
3174                     pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_BR];
3175                 pi4_sad_grid[PART_ID_2NxN_T] =
3176                     pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_TL];
3177                 pi4_sad_grid[PART_ID_2NxN_B] =
3178                     pi4_sad_grid[PART_ID_NxN_BR] + pi4_sad_grid[PART_ID_NxN_BL];
3179                 pi4_sad_grid[PART_ID_nLx2N_L] =
3180                     (au2_4x4_sad[8] + au2_4x4_sad[0] + au2_4x4_sad[12] + au2_4x4_sad[4]);
3181                 pi4_sad_grid[PART_ID_nRx2N_R] =
3182                     (au2_4x4_sad[3] + au2_4x4_sad[7] + au2_4x4_sad[15] + au2_4x4_sad[11]);
3183                 pi4_sad_grid[PART_ID_2NxnU_T] =
3184                     (au2_4x4_sad[1] + au2_4x4_sad[0] + au2_4x4_sad[2] + au2_4x4_sad[3]);
3185                 pi4_sad_grid[PART_ID_2NxnD_B] =
3186                     (au2_4x4_sad[15] + au2_4x4_sad[14] + au2_4x4_sad[12] + au2_4x4_sad[13]);
3187                 pi4_sad_grid[PART_ID_2Nx2N] =
3188                     pi4_sad_grid[PART_ID_2NxN_T] + pi4_sad_grid[PART_ID_2NxN_B];
3189                 pi4_sad_grid[PART_ID_2NxnU_B] =
3190                     pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnU_T];
3191                 pi4_sad_grid[PART_ID_2NxnD_T] =
3192                     pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnD_B];
3193                 pi4_sad_grid[PART_ID_nRx2N_L] =
3194                     pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nRx2N_R];
3195                 pi4_sad_grid[PART_ID_nLx2N_R] =
3196                     pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nLx2N_L];
3197             }
3198         }
3199 
3200         {
3201             S32 i4_sad, i4_mv_cost, i4_tot_cost;
3202             S32 best_node_cost;
3203             S32 second_best_node_cost;
3204             ULWORD64 u8_temp_var, u8_temp_var1;
3205             ULWORD64 u8_ref_X_Square, u8_pure_dist, u8_src_var, u8_ref_var;
3206 
3207             {
3208                 S16 mvdx1, mvdy1;
3209                 S32 i4_search_idx = (S32)ps_result_prms->i1_ref_idx;
3210                 search_results_t *ps_search_results = ps_result_prms->ps_search_results;
3211                 S32 pred_lx = i4_search_idx;
3212 
3213                 pred_ctxt_t *ps_pred_ctxt = &ps_search_results->as_pred_ctxt[pred_lx];
3214                 pred_candt_nodes_t *ps_pred_nodes = &ps_pred_ctxt->as_pred_nodes[PART_2Nx2N];
3215                 search_node_t *ps_pred_node_a = ps_pred_nodes->ps_mvp_node;
3216 
3217                 S32 inp_shift = 2;
3218                 S32 pred_shift = ps_pred_node_a->u1_subpel_done ? 0 : 2;
3219                 S32 lambda_q_shift = ps_pred_ctxt->lambda_q_shift;
3220                 S32 lambda = ps_pred_ctxt->lambda;
3221                 S32 rnd = 1 << (lambda_q_shift - 1);
3222                 S32 mv_p_x = ps_pred_node_a->s_mv.i2_mvx;
3223                 S32 mv_p_y = ps_pred_node_a->s_mv.i2_mvy;
3224                 S32 ref_bits =
3225                     ps_pred_ctxt
3226                         ->ppu1_ref_bits_tlu[ps_pred_ctxt->pred_lx][ps_search_node->i1_ref_idx];
3227 
3228                 COMPUTE_DIFF_MV(
3229                     mvdx1, mvdy1, ps_search_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
3230 
3231                 mvdx1 = ABS(mvdx1);
3232                 mvdy1 = ABS(mvdy1);
3233 
3234                 i4_mv_cost = hme_get_range(mvdx1) + hme_get_range(mvdy1) + (mvdx1 > 0) +
3235                              (mvdy1 > 0) + ref_bits + 2;
3236 
3237                 i4_mv_cost *= lambda;
3238                 i4_mv_cost += rnd;
3239                 i4_mv_cost >>= lambda_q_shift;
3240 
3241                 i4_mv_cost = CLIP_U16(i4_mv_cost);
3242             }
3243 
3244             for(i4_count = 0; i4_count < ps_mv_refine_ctxt->i4_num_valid_parts; i4_count++)
3245             {
3246                 S32 i4_stim_injected_sad;
3247                 S32 i4_stim_injected_cost;
3248                 S32 i4_noise_term;
3249                 unsigned long u4_shift_val;
3250                 S32 i4_bits_req;
3251 
3252                 S32 update_required = 0;
3253                 S32 part_id = pi4_valid_part_ids[i4_count];
3254                 S32 index = (ps_mv_refine_ctxt->i4_num_valid_parts > 8) ? part_id : i4_count;
3255 
3256                 WORD32 i4_q_level = STIM_Q_FORMAT + ALPHA_Q_FORMAT;
3257 
3258                 S32 i4_inv_wt = ps_wt_inp_prms->a_inv_wpred_wt[ps_search_node->i1_ref_idx];
3259 
3260                 if(ps_search_prms->i4_alpha_stim_multiplier)
3261                 {
3262                     /* Compute ref sigmaX and sigmaX_Squared values for valid partitions from previously computed ref 4x4 level values */
3263                     hme_compute_final_sigma_of_pu_from_base_blocks(
3264                         au4_4x4_ref_sigmaX,
3265                         au4_4x4_ref_sigmaXSquared,
3266                         au8_final_ref_sigmaX,
3267                         au8_final_ref_sigmaXSquared,
3268                         16,
3269                         4,
3270                         part_id,
3271                         4);
3272 
3273                     u8_ref_X_Square =
3274                         (au8_final_ref_sigmaX[part_id] * au8_final_ref_sigmaX[part_id]);
3275                     u8_ref_var = (au8_final_ref_sigmaXSquared[part_id] - u8_ref_X_Square);
3276 
3277                     /* Multiply un-normalized src_var with inv_wt if its not same as default wt */
3278                     /* and shift the resulting src_var if its more than 27 bits to avoid overflow */
3279                     /* The amount by which it is shifted is passed on to u4_shift_val and applied equally on ref_var */
3280                     u4_shift_val = ihevce_calc_stim_injected_variance(
3281                         au8_final_src_sigmaX,
3282                         au8_final_src_sigmaXSquared,
3283                         &u8_src_var,
3284                         i4_inv_wt,
3285                         ps_wt_inp_prms->ai4_shift_val[ps_search_node->i1_ref_idx],
3286                         ps_wt_inp_prms->wpred_log_wdc,
3287                         part_id);
3288 
3289                     u8_ref_var = u8_ref_var >> u4_shift_val;
3290 
3291                     /* Do the same check on ref_var to avoid overflow and apply similar shift on src_var */
3292                     GETRANGE64(i4_bits_req, u8_ref_var);
3293 
3294                     if(i4_bits_req > 27)
3295                     {
3296                         u8_ref_var = u8_ref_var >> (i4_bits_req - 27);
3297                         u8_src_var = u8_src_var >> (i4_bits_req - 27);
3298                     }
3299 
3300                     if(u8_src_var == u8_ref_var)
3301                     {
3302                         u8_temp_var = (1 << STIM_Q_FORMAT);
3303                     }
3304                     else
3305                     {
3306                         u8_temp_var = (2 * u8_src_var * u8_ref_var);
3307                         u8_temp_var = (u8_temp_var * (1 << STIM_Q_FORMAT));
3308                         u8_temp_var1 = (u8_src_var * u8_src_var) + (u8_ref_var * u8_ref_var);
3309                         u8_temp_var = (u8_temp_var + (u8_temp_var1 / 2));
3310                         u8_temp_var = (u8_temp_var / u8_temp_var1);
3311                     }
3312 
3313                     i4_noise_term = (UWORD32)u8_temp_var;
3314 
3315                     ASSERT(i4_noise_term >= 0);
3316 
3317                     i4_noise_term *= ps_search_prms->i4_alpha_stim_multiplier;
3318                 }
3319                 else
3320                 {
3321                     i4_noise_term = 0;
3322                 }
3323                 u8_pure_dist = pi4_sad_grid[part_id];
3324                 u8_pure_dist *= ((1 << (i4_q_level)) - (i4_noise_term));
3325                 u8_pure_dist += (1 << ((i4_q_level)-1));
3326                 i4_stim_injected_sad = (UWORD32)(u8_pure_dist >> (i4_q_level));
3327 
3328                 i4_sad = CLIP3(pi4_sad_grid[part_id], 0, 0x7fff);
3329                 i4_tot_cost = CLIP_S16(i4_sad + i4_mv_cost);
3330                 i4_stim_injected_sad = CLIP3(i4_stim_injected_sad, 0, 0x7fff);
3331                 i4_stim_injected_cost = CLIP_S16(i4_stim_injected_sad + i4_mv_cost);
3332 
3333                 best_node_cost = CLIP_S16(ps_mv_refine_ctxt->i2_stim_injected_cost[0][index]);
3334                 second_best_node_cost = SHRT_MAX;
3335 
3336                 if(i4_stim_injected_cost < second_best_node_cost)
3337                 {
3338                     update_required = 0;
3339 
3340                     if(i4_stim_injected_cost < best_node_cost)
3341                     {
3342                         update_required = 1;
3343                     }
3344                     else if(i4_stim_injected_cost == best_node_cost)
3345                     {
3346                         update_required = 0;
3347                     }
3348 
3349                     if(update_required == 2)
3350                     {
3351                         ps_mv_refine_ctxt->i2_tot_cost[1][index] = i4_tot_cost;
3352                         ps_mv_refine_ctxt->i2_stim_injected_cost[1][index] = i4_stim_injected_cost;
3353                         ps_mv_refine_ctxt->i2_mv_cost[1][index] = i4_mv_cost;
3354                         ps_mv_refine_ctxt->i2_mv_x[1][index] = ps_search_node->s_mv.i2_mvx;
3355                         ps_mv_refine_ctxt->i2_mv_y[1][index] = ps_search_node->s_mv.i2_mvy;
3356                         ps_mv_refine_ctxt->i2_ref_idx[1][index] = ps_search_node->i1_ref_idx;
3357                     }
3358                     else if(update_required == 1)
3359                     {
3360                         ps_mv_refine_ctxt->i2_tot_cost[0][index] = i4_tot_cost;
3361                         ps_mv_refine_ctxt->i2_stim_injected_cost[0][index] = i4_stim_injected_cost;
3362                         ps_mv_refine_ctxt->i2_mv_cost[0][index] = i4_mv_cost;
3363                         ps_mv_refine_ctxt->i2_mv_x[0][index] = ps_search_node->s_mv.i2_mvx;
3364                         ps_mv_refine_ctxt->i2_mv_y[0][index] = ps_search_node->s_mv.i2_mvy;
3365                         ps_mv_refine_ctxt->i2_ref_idx[0][index] = ps_search_node->i1_ref_idx;
3366                     }
3367                 }
3368             }
3369         }
3370 
3371         ps_search_node++;
3372     }
3373 
3374     {
3375         WORD32 i4_i;
3376         WORD32 part_id;
3377         search_node_t *ps_search_node = ps_search_prms->ps_search_nodes;
3378         for(i4_i = 0; i4_i < ps_mv_refine_ctxt->i4_num_valid_parts; i4_i++)
3379         {
3380             part_id = ps_mv_refine_ctxt->ai4_part_id[i4_i];
3381             if(ps_mv_refine_ctxt->i2_stim_injected_cost[0][part_id] >= MAX_SIGNED_16BIT_VAL)
3382             {
3383                 ASSERT(ps_mv_refine_ctxt->i2_mv_cost[0][part_id] == MAX_SIGNED_16BIT_VAL);
3384                 ASSERT(ps_mv_refine_ctxt->i2_mv_x[0][part_id] == 0);
3385                 ASSERT(ps_mv_refine_ctxt->i2_mv_y[0][part_id] == 0);
3386 
3387                 ps_mv_refine_ctxt->i2_ref_idx[0][part_id] = ps_search_node->i1_ref_idx;
3388             }
3389         }
3390     }
3391 }
3392 
/**
********************************************************************************
*  @fn     hme_calc_sad_and_1_best_result_subpel(err_prms_t *ps_err_prms,
*                                                result_upd_prms_t *ps_result_prms)
*
*  @brief   Computes the SAD of every partition for one candidate from the
*           SADs of its NUM_4X4 4x4 sub blocks and, for each valid partition,
*           replaces the best result when the candidate has a lower total cost
*
*  @param[in]  ps_err_prms : input / reference pointers and strides; the SAD
*                            grid it points to is overwritten
*
*  @param[in,out]  ps_result_prms : candidate mv and ref idx, and the subpel
*                            refine context holding the best results
*
*  @return   None
********************************************************************************
*/
void hme_calc_sad_and_1_best_result_subpel(
    err_prms_t *ps_err_prms, result_upd_prms_t *ps_result_prms)
{
    mv_refine_ctxt_t *ps_refine_ctxt = ps_result_prms->ps_subpel_refine_ctxt;
    S32 *pi4_part_sad = ps_err_prms->pi4_sad_grid;
    UWORD8 *pu1_inp = ps_err_prms->pu1_inp;
    UWORD8 *pu1_ref = &ps_err_prms->pu1_ref[0];

    S32 i4_inp_strd = ps_err_prms->i4_inp_stride;
    WORD32 i4_ref_strd = ps_err_prms->i4_ref_stride;
    WORD32 i4_inp_strd_x4 = (i4_inp_strd << 2);
    WORD32 i4_ref_strd_x4 = (i4_ref_strd << 2);

    UWORD16 au2_sad[NUM_4X4];
    WORD32 blk, row, col;
    S32 i4_i;

    /**************************************************************************/
    /* Stage 1 : SAD of each 4x4 sub block. Sub blocks are numbered in raster */
    /* order, four per row, each row of sub blocks being four lines tall      */
    /**************************************************************************/
    for(blk = 0; blk < NUM_4X4; blk++)
    {
        WORD32 blk_x = (blk % 4) * NUM_PIXELS_IN_ROW;
        WORD32 inp_blk_off = blk_x + (blk >> 2) * i4_inp_strd_x4;
        WORD32 ref_blk_off = blk_x + (blk >> 2) * i4_ref_strd_x4;
        UWORD16 u2_acc = 0;

        for(row = 0; row < NUM_ROWS_IN_4X4; row++)
        {
            WORD32 inp_row_off = (i4_inp_strd)*row + inp_blk_off;
            WORD32 ref_row_off = (i4_ref_strd)*row + ref_blk_off;

            for(col = 0; col < NUM_PIXELS_IN_ROW; col++)
            {
                S32 i4_diff =
                    ((S32)pu1_ref[(ref_row_off + col)]) - ((S32)pu1_inp[(inp_row_off + col)]);

                u2_acc += (UWORD16)ABS(i4_diff);
            }
        }

        au2_sad[blk] = u2_acc;
    }

    /**************************************************************************/
    /* Stage 2 : combine the 4x4 SADs into the SAD of every partition         */
    /**************************************************************************/
    {
        /* Quadrants */
        S32 i4_tl = (au2_sad[0] + au2_sad[1] + au2_sad[4] + au2_sad[5]);
        S32 i4_tr = (au2_sad[2] + au2_sad[3] + au2_sad[6] + au2_sad[7]);
        S32 i4_bl = (au2_sad[8] + au2_sad[9] + au2_sad[12] + au2_sad[13]);
        S32 i4_br = (au2_sad[10] + au2_sad[11] + au2_sad[14] + au2_sad[15]);

        /* Outermost row / column of sub blocks on each side */
        S32 i4_top_row = (au2_sad[1] + au2_sad[0] + au2_sad[2] + au2_sad[3]);
        S32 i4_bot_row = (au2_sad[15] + au2_sad[14] + au2_sad[12] + au2_sad[13]);
        S32 i4_left_col = (au2_sad[8] + au2_sad[0] + au2_sad[12] + au2_sad[4]);
        S32 i4_right_col = (au2_sad[3] + au2_sad[7] + au2_sad[15] + au2_sad[11]);

        /* Halves and the full block */
        S32 i4_top = i4_tr + i4_tl;
        S32 i4_bot = i4_br + i4_bl;
        S32 i4_full = i4_top + i4_bot;

        pi4_part_sad[PART_ID_NxN_TL] = i4_tl;
        pi4_part_sad[PART_ID_NxN_TR] = i4_tr;
        pi4_part_sad[PART_ID_NxN_BL] = i4_bl;
        pi4_part_sad[PART_ID_NxN_BR] = i4_br;

        pi4_part_sad[PART_ID_Nx2N_L] = i4_tl + i4_bl;
        pi4_part_sad[PART_ID_Nx2N_R] = i4_tr + i4_br;
        pi4_part_sad[PART_ID_2NxN_T] = i4_top;
        pi4_part_sad[PART_ID_2NxN_B] = i4_bot;

        pi4_part_sad[PART_ID_nLx2N_L] = i4_left_col;
        pi4_part_sad[PART_ID_nRx2N_R] = i4_right_col;
        pi4_part_sad[PART_ID_2NxnU_T] = i4_top_row;
        pi4_part_sad[PART_ID_2NxnD_B] = i4_bot_row;

        pi4_part_sad[PART_ID_2Nx2N] = i4_full;

        /* The larger side of each asymmetric split is the remainder */
        pi4_part_sad[PART_ID_2NxnU_B] = i4_full - i4_top_row;
        pi4_part_sad[PART_ID_2NxnD_T] = i4_full - i4_bot_row;
        pi4_part_sad[PART_ID_nRx2N_L] = i4_full - i4_right_col;
        pi4_part_sad[PART_ID_nLx2N_R] = i4_full - i4_left_col;
    }

    /**************************************************************************/
    /* Stage 3 : for every valid partition, keep this candidate as the best   */
    /* result if its clipped total cost beats the stored one                  */
    /**************************************************************************/
    for(i4_i = 0; i4_i < ps_refine_ctxt->i4_num_valid_parts; i4_i++)
    {
        S32 part_id = ps_refine_ctxt->ai4_part_id[i4_i];

        /* Results are stored per part id when more than 8 parts are valid,   */
        /* otherwise in the order of the valid part list                      */
        S32 slot = (ps_refine_ctxt->i4_num_valid_parts > 8) ? part_id : i4_i;

        /* Use a pre-computed cost instead of freshly evaluating subpel cost */
        S32 i4_mv_cost = ps_refine_ctxt->i2_mv_cost[0][slot];
        S32 i4_sad = CLIP3(pi4_part_sad[part_id], 0, 0x7fff);
        S32 i4_tot_cost = CLIP_S16(i4_sad + i4_mv_cost);
        S32 i4_best_cost = CLIP_S16(ps_refine_ctxt->i2_tot_cost[0][slot]);

        /* Nothing to do unless the candidate is strictly better than the     */
        /* stored best and its cost is below the saturation value             */
        if((i4_tot_cost >= SHRT_MAX) || (i4_tot_cost >= i4_best_cost))
        {
            continue;
        }

        ps_refine_ctxt->i2_tot_cost[0][slot] = i4_tot_cost;
        ps_refine_ctxt->i2_mv_cost[0][slot] = i4_mv_cost;
        ps_refine_ctxt->i2_mv_x[0][slot] = ps_result_prms->i2_mv_x;
        ps_refine_ctxt->i2_mv_y[0][slot] = ps_result_prms->i2_mv_y;
        ps_refine_ctxt->i2_ref_idx[0][slot] = ps_result_prms->i1_ref_idx;
    }

    /**************************************************************************/
    /* Stage 4 : entries whose best total cost is saturated also get a        */
    /* saturated fullpel satd                                                 */
    /**************************************************************************/
    for(i4_i = 0; i4_i < TOT_NUM_PARTS; i4_i++)
    {
        if(ps_refine_ctxt->i2_tot_cost[0][i4_i] >= MAX_SIGNED_16BIT_VAL)
        {
            ps_refine_ctxt->ai2_fullpel_satd[0][i4_i] = MAX_SIGNED_16BIT_VAL;
        }
    }
}
3561 
3562 /**
3563 ********************************************************************************
3564 *  @fn     hme_calc_pt_sad_and_result_explicit(hme_search_prms_t *ps_search_prms,
3565 *                                              wgt_pred_ctxt_t *ps_wt_inp_prms,
3566 *                                              err_prms_t *ps_err_prms,
3567 *                                              result_upd_prms_t *ps_result_prms,
3568 *                                              U08 **ppu1_ref,
3569 *                                              S32 i4_ref_stride)
3570 *
3571 *  @brief   Run thorugh the provided candidates and compute the point SAD and
3572 *           cost and update the results in the order
3573 *
3574 *  @param[in]  ps_search_prms
3575 *  @param[in]  ps_wt_inp_prms
3576 *  @param[in]  ps_err_prms
3577 *  @param[out] ps_result_prms
3578 *  @param[in]  ppu1_ref
3579 *  @param[in]  i4_ref_stride
3580 *
3581 *  @return   None
3582 ********************************************************************************
3583 */
3584 
hme_calc_pt_sad_and_result_explicit(hme_search_prms_t * ps_search_prms,wgt_pred_ctxt_t * ps_wt_inp_prms,err_prms_t * ps_err_prms,result_upd_prms_t * ps_result_prms,U08 ** ppu1_ref,S32 i4_ref_stride)3585 void hme_calc_pt_sad_and_result_explicit(
3586     hme_search_prms_t *ps_search_prms,
3587     wgt_pred_ctxt_t *ps_wt_inp_prms,
3588     err_prms_t *ps_err_prms,
3589     result_upd_prms_t *ps_result_prms,
3590     U08 **ppu1_ref,
3591     S32 i4_ref_stride)
3592 {
3593     WORD32 i4_grid_mask, i4_part_mask, i4_num_results, i4_candt, i4_num_nodes;
3594     WORD32 i4_inp_stride, i4_inp_off, i4_ref_offset;
3595 
3596     search_node_t *ps_search_node;
3597     BLK_SIZE_T e_blk_size;
3598     PF_SAD_FXN_T pf_sad_fxn;
3599     PF_RESULT_FXN_T pf_hme_result_fxn;
3600 
3601     i4_grid_mask = 0x1; /* Point SAD */
3602 
3603     /* Get the parameters required */
3604     i4_part_mask = ps_search_prms->i4_part_mask;
3605     e_blk_size = ps_search_prms->e_blk_size;
3606     i4_num_results = (S32)ps_search_prms->ps_search_results->u1_num_results_per_part;
3607     i4_num_nodes = ps_search_prms->i4_num_search_nodes;
3608     ps_search_node = ps_search_prms->ps_search_nodes;
3609 
3610     i4_inp_stride = ps_search_prms->i4_inp_stride;
3611     /* Move to the location of the search blk in inp buffer */
3612     i4_inp_off = ps_search_prms->i4_cu_x_off;
3613     i4_inp_off += ps_search_prms->i4_cu_y_off * i4_inp_stride;
3614     i4_ref_offset = (i4_ref_stride * ps_search_prms->i4_y_off) + ps_search_prms->i4_x_off;
3615 
3616     pf_sad_fxn = hme_get_sad_fxn(e_blk_size, i4_grid_mask, i4_part_mask);
3617     /**********************************************************************/
3618     /* we have a sparsely populated SAD grid of size 9x17.                */
3619     /* the id of the results in the grid is shown                         */
3620     /*     5   2   6                                                      */
3621     /*     1   0   3                                                      */
3622     /*     7   4   8                                                      */
3623     /* The motivation for choosing a grid like this is that               */
3624     /* in case of no refinement, the central location is                  */
3625     /* the first entry in the grid                                        */
3626     /* Also for diamond, the 4 entries get considered first               */
3627     /* This is consistent with the diamond notation used in               */
3628     /* subpel refinement. To Check                                        */
3629     /* Update the results for the given search candt                      */
3630     /* returns the cost of the 2Nx2N partition                            */
3631     /**********************************************************************/
3632 
3633     /* Get the modified update result fun. with CLIP16 of cost to match   */
3634     /* with SIMD */
3635     pf_hme_result_fxn = hme_update_results_grid_pu_bestn_no_encode;
3636 
3637     for(i4_candt = 0; i4_candt < i4_num_nodes; i4_candt++)
3638     {
3639         if(ps_search_node->s_mv.i2_mvx == INTRA_MV)
3640             continue;
3641 
3642         /* initialize minimum cost for this candidate. As we search around */
3643         /* this candidate, this is used to check early exit, when in any   */
3644         /* given iteration, the center pt of the grid is lowest value      */
3645         ps_result_prms->i4_min_cost = MAX_32BIT_VAL;
3646 
3647         ps_err_prms->pu1_inp = ps_wt_inp_prms->apu1_wt_inp[ps_search_node->i1_ref_idx] + i4_inp_off;
3648         ps_err_prms->i4_grid_mask = i4_grid_mask;
3649 
3650         ps_err_prms->pu1_ref = ppu1_ref[ps_search_node->i1_ref_idx] + i4_ref_offset;
3651         ps_err_prms->pu1_ref += ps_search_node->s_mv.i2_mvx;
3652         ps_err_prms->pu1_ref += (ps_search_node->s_mv.i2_mvy * i4_ref_stride);
3653 
3654         /**********************************************************************/
3655         /* CALL THE FUNCTION THAT COMPUTES THE SAD AND UPDATES THE SAD GRID   */
3656         /**********************************************************************/
3657         pf_sad_fxn(ps_err_prms);
3658 
3659         /**********************************************************************/
3660         /* CALL THE FUNCTION THAT COMPUTES UPDATES THE BEST RESULTS           */
3661         /**********************************************************************/
3662         ps_result_prms->i4_grid_mask = i4_grid_mask;
3663         ps_result_prms->ps_search_node_base = ps_search_node;
3664         pf_hme_result_fxn(ps_result_prms);
3665 
3666         ps_search_node++;
3667     }
3668 }
3669 
3670 /**
3671 ********************************************************************************
3672 *  @fn     hme_set_mvp_node(search_results_t *ps_search_results,
3673 *                           search_node_t *ps_candt_prj_coloc,
3674 *                           S08 i1_ref_idx)
3675 *
3676 *  @brief   Set node used for motion vector predictor computation
3677 *           Either TR or L is compared to projected colocated and
3678 *           closest is decided as MVP
3679 *
3680 *  @param[in]  ps_search_results
3681 *
3682 *  @param[in]  ps_candt_prj_coloc
3683 *
3684 *  @param[in]  i1_ref_idx
3685 *
3686 *  @return   None
3687 ********************************************************************************
3688 */
hme_set_mvp_node(search_results_t * ps_search_results,search_node_t * ps_candt_prj_coloc,U08 u1_pred_lx,U08 u1_default_ref_id)3689 void hme_set_mvp_node(
3690     search_results_t *ps_search_results,
3691     search_node_t *ps_candt_prj_coloc,
3692     U08 u1_pred_lx,
3693     U08 u1_default_ref_id)
3694 {
3695     S32 i;
3696     pred_ctxt_t *ps_pred_ctxt = &ps_search_results->as_pred_ctxt[u1_pred_lx];
3697     pred_candt_nodes_t *ps_pred_nodes = ps_pred_ctxt->as_pred_nodes;
3698     search_node_t *ps_pred_node_a = NULL, *ps_pred_node_b = NULL;
3699 
3700     S32 inp_shift = 2;
3701     S32 pred_shift;
3702     S32 ref_bits;
3703     S32 mv_p_x, mv_p_y;
3704     S16 mvdx1, mvdx2, mvdy1, mvdy2;
3705 
3706     ref_bits = ps_pred_ctxt->ppu1_ref_bits_tlu[u1_pred_lx][u1_default_ref_id];
3707 
3708     /*************************************************************************/
3709     /* Priority to bottom left availability. Else we go to left. If both are */
3710     /* not available, then a remains null                                    */
3711     /*************************************************************************/
3712     if(ps_pred_nodes->ps_l->u1_is_avail)
3713     {
3714         ps_pred_node_a = ps_pred_nodes->ps_l;
3715     }
3716 
3717     if((!(ps_pred_ctxt->proj_used) && (ps_pred_nodes->ps_tr->u1_is_avail)))
3718     {
3719         ps_pred_node_b = ps_pred_nodes->ps_tr;
3720     }
3721     else
3722     {
3723         ps_pred_node_b = ps_pred_nodes->ps_coloc;
3724         ps_pred_node_b->s_mv = ps_pred_node_b->ps_mv[0];
3725     }
3726 
3727     if(ps_pred_node_a == NULL)
3728     {
3729         ps_pred_node_a = ps_pred_nodes->ps_coloc;
3730         ps_pred_node_a->s_mv = ps_pred_node_a->ps_mv[0];
3731 
3732         if(ps_pred_node_b == ps_pred_nodes->ps_coloc)
3733         {
3734             ps_pred_node_b = ps_pred_nodes->ps_zeromv;
3735             ps_pred_node_b->s_mv = ps_pred_node_b->ps_mv[0];
3736         }
3737     }
3738 
3739     if(ps_pred_node_a->i1_ref_idx != u1_default_ref_id)
3740     {
3741         SCALE_FOR_POC_DELTA(
3742             mv_p_x, mv_p_y, ps_pred_node_a, u1_default_ref_id, ps_pred_ctxt->pi2_ref_scf);
3743     }
3744     else
3745     {
3746         mv_p_x = ps_pred_node_a->s_mv.i2_mvx;
3747         mv_p_y = ps_pred_node_a->s_mv.i2_mvy;
3748     }
3749     pred_shift = ps_pred_node_a->u1_subpel_done ? 0 : 2;
3750     COMPUTE_MV_DIFFERENCE(mvdx1, mvdy1, ps_candt_prj_coloc, mv_p_x, mv_p_y, inp_shift, pred_shift);
3751     mvdx1 = ABS(mvdx1);
3752     mvdy1 = ABS(mvdy1);
3753 
3754     if(ps_pred_node_b->i1_ref_idx != u1_default_ref_id)
3755     {
3756         SCALE_FOR_POC_DELTA(
3757             mv_p_x, mv_p_y, ps_pred_node_b, u1_default_ref_id, ps_pred_ctxt->pi2_ref_scf);
3758     }
3759     else
3760     {
3761         mv_p_x = ps_pred_node_b->s_mv.i2_mvx;
3762         mv_p_y = ps_pred_node_b->s_mv.i2_mvy;
3763     }
3764     pred_shift = ps_pred_node_b->u1_subpel_done ? 0 : 2;
3765     COMPUTE_MV_DIFFERENCE(mvdx2, mvdy2, ps_candt_prj_coloc, mv_p_x, mv_p_y, inp_shift, pred_shift);
3766     mvdx2 = ABS(mvdx2);
3767     mvdy2 = ABS(mvdy2);
3768 
3769     if((mvdx1 + mvdy1) < (mvdx2 + mvdy2))
3770     {
3771         for(i = 0; i < TOT_NUM_PARTS; i++)
3772         {
3773             ps_pred_nodes[i].ps_mvp_node = ps_pred_node_a;
3774         }
3775     }
3776     else
3777     {
3778         for(i = 0; i < TOT_NUM_PARTS; i++)
3779         {
3780             ps_pred_nodes[i].ps_mvp_node = ps_pred_node_b;
3781         }
3782     }
3783 }
3784