1 /******************************************************************************
2 *
3 * Copyright (C) 2018 The Android Open Source Project
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at:
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 *
17 *****************************************************************************
18 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19 */
20
21 /*!
22 ***************************************************************************
23 * \file hme_err_compute.c
24 *
25 * \brief
26 * SAD / SATD routines for error computation
27 *
28 * Detailed_description : Contains various types of SAD/SATD routines for
29 * error computation between a given input and reference ptr. The SAD
30 * routines can evaluate for either a single point or a grid, and can
31 * evaluate with either partial updates or no partial updates. Partial
32 * updates means evaluating sub block SADs, e.g. 4 4x4 subblock SAD in
33 * addition to the main 8x8 block SAD.
34 *
35 * \date
36 * 22/9/2012
37 *
38 * \author Ittiam
39 ***************************************************************************
40 */
41
42 /*****************************************************************************/
43 /* File Includes */
44 /*****************************************************************************/
45 /* System include files */
46 #include <stdio.h>
47 #include <string.h>
48 #include <stdlib.h>
49 #include <assert.h>
50 #include <stdarg.h>
51 #include <math.h>
52 #include <limits.h>
53
54 /* User include files */
55 #include "ihevc_typedefs.h"
56 #include "itt_video_api.h"
57 #include "ihevce_api.h"
58
59 #include "rc_cntrl_param.h"
60 #include "rc_frame_info_collector.h"
61 #include "rc_look_ahead_params.h"
62
63 #include "ihevc_defs.h"
64 #include "ihevc_structs.h"
65 #include "ihevc_platform_macros.h"
66 #include "ihevc_deblk.h"
67 #include "ihevc_itrans_recon.h"
68 #include "ihevc_chroma_itrans_recon.h"
69 #include "ihevc_chroma_intra_pred.h"
70 #include "ihevc_intra_pred.h"
71 #include "ihevc_inter_pred.h"
72 #include "ihevc_mem_fns.h"
73 #include "ihevc_padding.h"
74 #include "ihevc_weighted_pred.h"
75 #include "ihevc_sao.h"
76 #include "ihevc_resi_trans.h"
77 #include "ihevc_quant_iquant_ssd.h"
78 #include "ihevc_cabac_tables.h"
79
80 #include "ihevce_defs.h"
81 #include "ihevce_lap_enc_structs.h"
82 #include "ihevce_multi_thrd_structs.h"
83 #include "ihevce_multi_thrd_funcs.h"
84 #include "ihevce_me_common_defs.h"
85 #include "ihevce_had_satd.h"
86 #include "ihevce_error_codes.h"
87 #include "ihevce_bitstream.h"
88 #include "ihevce_cabac.h"
89 #include "ihevce_rdoq_macros.h"
90 #include "ihevce_function_selector.h"
91 #include "ihevce_enc_structs.h"
92 #include "ihevce_entropy_structs.h"
93 #include "ihevce_cmn_utils_instr_set_router.h"
94 #include "ihevce_enc_loop_structs.h"
95 #include "ihevce_bs_compute_ctb.h"
96 #include "ihevce_global_tables.h"
97 #include "ihevce_dep_mngr_interface.h"
98 #include "hme_datatype.h"
99 #include "hme_interface.h"
100 #include "hme_common_defs.h"
101 #include "hme_defs.h"
102 #include "ihevce_me_instr_set_router.h"
103 #include "hme_globals.h"
104 #include "hme_utils.h"
105 #include "hme_coarse.h"
106 #include "hme_refine.h"
107 #include "hme_err_compute.h"
108 #include "hme_common_utils.h"
109 #include "hme_search_algo.h"
110 #include "ihevce_stasino_helpers.h"
111
112 /******************************************************************************
113 * MACRO DEFINITIONS
114 ******************************************************************************/
115
/*****************************************************************************/
/* Theoretically, these are the various types of SAD functions needed for   */
/* reasons of optimality. SADs that are to be evaluated at a single pt can  */
/* be more optimal than SADs that are to be evaluated for a grid of 3x3.    */
/* The SADs to be evaluated at a grid are classified as separate functions, */
/* since evaluating them in a single function call helps reuse inputs for a */
/* small grid of 3x3. Also, if no partial updates are required, there are 3 */
/* basic functions: width 4K (K = odd number), width 8K (K = odd number)    */
/* and width 16K (K = any number). For partial updates, it is assumed that  */
/* the block size is square (8x8, 16x16, 32x32, 64x64) and further          */
/* differentiation is done based on the basic evaluation unit. E.g. if a    */
/* 16x16 blk size requires partial updates on AMP partitions, then the      */
/* basic SAD unit is 4x4; if it does not, then the basic SAD unit is 8x8.   */
/*****************************************************************************/
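/* E.g. with partial updates enabled for a 16x16 blk, sixteen 4x4 sub-SADs  */
/* are computed and combined into the SADs of all 17 partitions (see        */
/* compute_4x4_sads_for_16x16_blk() below). The UPD_RES_* macros that       */
/* follow currently all map to the generic hme_update_results_grid_pu_bestn */
/* result update routine.                                                   */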
130
131 #define UPD_RES_PT_NPU_BEST1 hme_update_results_grid_pu_bestn
132 #define UPD_RES_PT_NPU_BESTN hme_update_results_grid_pu_bestn
133 #define UPD_RES_PT_PU_BEST1 hme_update_results_grid_pu_bestn
134 #define UPD_RES_PT_PU_BESTN hme_update_results_grid_pu_bestn
135 #define UPD_RES_GRID_NPU_BEST1 hme_update_results_grid_pu_bestn
136 #define UPD_RES_GRID_NPU_BESTN hme_update_results_grid_pu_bestn
137 #define UPD_RES_GRID_PU_BEST1 hme_update_results_grid_pu_bestn
138 #define UPD_RES_GRID_PU_BESTN hme_update_results_grid_pu_bestn
139
140 /*******************************************************************************
141 * FUNCTION DEFINITIONS
142 *******************************************************************************/
S32 hme_cmp_nodes(search_node_t *ps_best_node1, search_node_t *ps_best_node2)
144 {
145 if((ps_best_node1->s_mv.i2_mvx == ps_best_node2->s_mv.i2_mvx) &&
146 (ps_best_node1->s_mv.i2_mvy == ps_best_node2->s_mv.i2_mvy) &&
147 (ps_best_node1->i1_ref_idx == ps_best_node2->i1_ref_idx))
148 {
149 return 0;
150 }
151 return -1;
152 }
153
void compute_4x4_sads_for_16x16_blk(
155 grid_ctxt_t *ps_grid, /* Grid ctxt */
156 UWORD8 *pu1_cur_ptr, /* Pointer to top-left of current block */
157 WORD32 cur_buf_stride, /* Buffer stride of current buffer */
158 UWORD16 **
159 u2_part_sads, /* 2D Array containing SADs for all 17 partitions. As many rows as partitions. SADs in a row correspond to each of the candidates */
160 cand_t *ps_cand, /* Return the list of candidates evaluated */
161 WORD32 *num_cands /* Number of candidates that were processed */
162 )
163 {
164 WORD32 a, b, c, d, i;
165 WORD16 grd_sz_y = (ps_grid->grd_sz_y_x & 0xFFFF0000) >> 16;
166 WORD16 grd_sz_x = (ps_grid->grd_sz_y_x & 0xFFFF);
167 //WORD32 offset_x[9] = {-grd_sz_x, 0, grd_sz_x, -grd_sz_x, 0, grd_sz_x, grd_sz_x, 0, -grd_sz_x};
168 //WORD32 offset_y[9] = {-grd_sz_y, -grd_sz_y, -grd_sz_y, 0, 0, 0, grd_sz_y, grd_sz_y, grd_sz_y};
169 /* Assumes the following order: C, L, T, R, B, TL, TR, BL, BR */
170 WORD32 offset_x[9] = { 0, -grd_sz_x, 0, grd_sz_x, 0, -grd_sz_x, grd_sz_x, -grd_sz_x, grd_sz_x };
171 WORD32 offset_y[9] = { 0, 0, -grd_sz_y, 0, grd_sz_y, -grd_sz_y, -grd_sz_y, grd_sz_y, grd_sz_y };
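    /* The nine candidates are the centre plus a ring at (grd_sz_x, grd_sz_y): */
    /*        TL(5)  T(2)  TR(6)                                               */
    /*        L(1)   C(0)  R(3)                                                */
    /*        BL(7)  B(4)  BR(8)                                               */
    /* Bit j of pi4_grd_mask enables candidate j in the order listed above.    */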
172 WORD32 ref_buf_stride = ps_grid->ref_buf_stride;
173 WORD32 cur_buf_stride_ls2 = (cur_buf_stride << 2);
174 WORD32 ref_buf_stride_ls2 = (ref_buf_stride << 2);
175 cand_t *cand0 = ps_cand;
176 UWORD16 au2_4x4_sad[NUM_4X4];
177
178 *num_cands = 0;
179
180 /* Loop to fill up the cand_t array and to calculate num_cands */
181 for(i = 0; i < ps_grid->num_grids; i++)
182 {
183 WORD32 j;
184 WORD32 mask = ps_grid->pi4_grd_mask[i];
185 UWORD8 *pu1_ref_ptr_center = ps_grid->ppu1_ref_ptr[i];
186 WORD32 mv_x = ps_grid->p_mv[i].i2_mv_x;
187 WORD32 mv_y = (ps_grid->p_mv[i].i2_mv_y);
188
189 for(j = 0; j < NUM_CANDIDATES_IN_GRID; j++, mask >>= 1)
190 {
191 if(mask & 1)
192 {
193 *num_cands = *num_cands + 1;
194 cand0->grid_ix = i;
195 cand0->ref_idx = ps_grid->p_ref_idx[i];
196 cand0->pu1_ref_ptr =
197 pu1_ref_ptr_center + offset_x[j] + ref_buf_stride * offset_y[j];
198 cand0->mv.i2_mv_x = (S16)(mv_x) + offset_x[j];
199 cand0->mv.i2_mv_y = (S16)(mv_y) + offset_y[j];
200 cand0++;
201 }
202 }
203 }
204
205 /* Loop to compute the SAD's */
206 for(a = 0; a < *num_cands; a++)
207 {
208 cand_t *cand = ps_cand + a;
209 memset(&au2_4x4_sad[0], 0, NUM_4X4 * sizeof(UWORD16));
210 for(b = 0; b < NUM_4X4; b++)
211 {
212 WORD32 t1 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * cur_buf_stride_ls2;
213 WORD32 t2 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * ref_buf_stride_ls2;
214
215 for(c = 0; c < NUM_ROWS_IN_4X4; c++)
216 {
217 WORD32 z_cur = (cur_buf_stride)*c + t1;
218 WORD32 z_ref = (ref_buf_stride)*c + t2;
219 for(d = 0; d < NUM_PIXELS_IN_ROW; d++)
220 {
221 au2_4x4_sad[b] += (UWORD16)ABS(
222 (((S32)cand->pu1_ref_ptr[(z_ref + d)]) - ((S32)pu1_cur_ptr[(z_cur + d)])));
223 }
224 }
225 }
226
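        /* The 16 sub-block SADs are stored in raster order within the blk:  */
        /*      0  1  2  3                                                    */
        /*      4  5  6  7                                                    */
        /*      8  9 10 11                                                    */
        /*     12 13 14 15                                                    */
        /* Every partition SAD below is a sum of these (or a difference from  */
        /* the 2Nx2N SAD for the complementary AMP parts), e.g. NxN_TL (the   */
        /* top-left 8x8) = sad[0] + sad[1] + sad[4] + sad[5] and 2NxnU_T      */
        /* (the top 16x4) = sad[0] + sad[1] + sad[2] + sad[3].                 */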
227 u2_part_sads[PART_ID_NxN_TL][a] =
228 (au2_4x4_sad[0] + au2_4x4_sad[1] + au2_4x4_sad[4] + au2_4x4_sad[5]);
229 u2_part_sads[PART_ID_NxN_TR][a] =
230 (au2_4x4_sad[2] + au2_4x4_sad[3] + au2_4x4_sad[6] + au2_4x4_sad[7]);
231 u2_part_sads[PART_ID_NxN_BL][a] =
232 (au2_4x4_sad[8] + au2_4x4_sad[9] + au2_4x4_sad[12] + au2_4x4_sad[13]);
233 u2_part_sads[PART_ID_NxN_BR][a] =
234 (au2_4x4_sad[10] + au2_4x4_sad[11] + au2_4x4_sad[14] + au2_4x4_sad[15]);
235 u2_part_sads[PART_ID_Nx2N_L][a] =
236 u2_part_sads[PART_ID_NxN_TL][a] + u2_part_sads[PART_ID_NxN_BL][a];
237 u2_part_sads[PART_ID_Nx2N_R][a] =
238 u2_part_sads[PART_ID_NxN_TR][a] + u2_part_sads[PART_ID_NxN_BR][a];
239 u2_part_sads[PART_ID_2NxN_T][a] =
240 u2_part_sads[PART_ID_NxN_TR][a] + u2_part_sads[PART_ID_NxN_TL][a];
241 u2_part_sads[PART_ID_2NxN_B][a] =
242 u2_part_sads[PART_ID_NxN_BR][a] + u2_part_sads[PART_ID_NxN_BL][a];
243 u2_part_sads[PART_ID_nLx2N_L][a] =
244 (au2_4x4_sad[8] + au2_4x4_sad[0] + au2_4x4_sad[12] + au2_4x4_sad[4]);
245 u2_part_sads[PART_ID_nRx2N_R][a] =
246 (au2_4x4_sad[3] + au2_4x4_sad[7] + au2_4x4_sad[15] + au2_4x4_sad[11]);
247 u2_part_sads[PART_ID_2NxnU_T][a] =
248 (au2_4x4_sad[1] + au2_4x4_sad[0] + au2_4x4_sad[2] + au2_4x4_sad[3]);
249 u2_part_sads[PART_ID_2NxnD_B][a] =
250 (au2_4x4_sad[15] + au2_4x4_sad[14] + au2_4x4_sad[12] + au2_4x4_sad[13]);
251 u2_part_sads[PART_ID_2Nx2N][a] =
252 u2_part_sads[PART_ID_2NxN_T][a] + u2_part_sads[PART_ID_2NxN_B][a];
253 u2_part_sads[PART_ID_2NxnU_B][a] =
254 u2_part_sads[PART_ID_2Nx2N][a] - u2_part_sads[PART_ID_2NxnU_T][a];
255 u2_part_sads[PART_ID_2NxnD_T][a] =
256 u2_part_sads[PART_ID_2Nx2N][a] - u2_part_sads[PART_ID_2NxnD_B][a];
257 u2_part_sads[PART_ID_nRx2N_L][a] =
258 u2_part_sads[PART_ID_2Nx2N][a] - u2_part_sads[PART_ID_nRx2N_R][a];
259 u2_part_sads[PART_ID_nLx2N_R][a] =
260 u2_part_sads[PART_ID_2Nx2N][a] - u2_part_sads[PART_ID_nLx2N_L][a];
261 }
262 }
263
264 /**
265 ********************************************************************************
266 * @fn compute_part_sads_for_MxM_blk(grid_ctxt_t *ps_grid,
267 * UWORD8 *pu1_cur_ptr,
268 * WORD32 cur_buf_stride,
*                              WORD32 **pp_part_sads,
*                              cand_t *ps_cand,
*                              WORD32 *num_cands,
*                              CU_SIZE_T e_cu_size)
272 *
273 * @brief Computes partial SADs and updates partition results for an MxM blk
274 * and does so for several grids of points. This can be used for
275 * 32x32/64x64 blks with 17 partition updates
276 *
277 *
278 * @param[in] ps_grid : Pointer to grid ctxt that has multiple grid of max
279 * 9 pts per grid
280 *
281 * @param[in] pu1_cur_ptr : Top left of input buffer
282 *
* @param[in] pp_part_sads : array of pointers, each entry pointing to
*                  results to be updated for a given partition
*
* @return None. The partition SADs for all evaluated candidates are returned
*          through pp_part_sads.
288
289 ********************************************************************************
290 */
void compute_part_sads_for_MxM_blk(
292 grid_ctxt_t *ps_grid,
293 UWORD8 *pu1_cur_ptr,
294 WORD32 cur_buf_stride,
295 WORD32 **pp_part_sads,
296 cand_t *ps_cand,
297 WORD32 *num_cands,
298 CU_SIZE_T e_cu_size)
299 {
300 WORD32 a, b, c, d, i;
301 WORD16 grd_sz_y = (ps_grid->grd_sz_y_x & 0xFFFF0000) >> 16;
302 WORD16 grd_sz_x = (ps_grid->grd_sz_y_x & 0xFFFF);
303
304 /* Assumes the following order: C, L, T, R, B, TL, TR, BL, BR */
305 WORD32 offset_x[9] = { 0, -grd_sz_x, 0, grd_sz_x, 0, -grd_sz_x, grd_sz_x, -grd_sz_x, grd_sz_x };
306 WORD32 offset_y[9] = { 0, 0, -grd_sz_y, 0, grd_sz_y, -grd_sz_y, -grd_sz_y, grd_sz_y, grd_sz_y };
307 WORD32 shift = (WORD32)e_cu_size;
308
309 WORD32 ref_buf_stride = ps_grid->ref_buf_stride;
310 WORD32 cur_buf_stride_lsN = (cur_buf_stride << (1 + shift));
311 WORD32 ref_buf_stride_lsN = (ref_buf_stride << (1 + shift));
312 /* Num rows and pixels per row: 8 for CU_32x32 and 16 for CU_64x64 */
313 WORD32 num_rows_in_nxn = 2 << shift;
314 WORD32 num_pixels_in_row = 2 << shift;
315 cand_t *cand0 = ps_cand;
316 /* for a 2Nx2N partition we evaluate nxn SADs, where n = N/2. This is */
317 /* needed for AMP cases. */
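    /* E.g. for a 32x32 CU each sub-block is 8x8 (n = 8), so a_nxn_sad[]     */
    /* holds sixteen 8x8 SADs in raster order, combined below exactly like   */
    /* the 4x4 SADs in compute_4x4_sads_for_16x16_blk().                     */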
318 WORD32 a_nxn_sad[NUM_4X4];
319 *num_cands = 0;
320
321 /* Loop to fill up the cand_t array and to calculate num_cands */
322 for(i = 0; i < ps_grid->num_grids; i++)
323 {
324 WORD32 j;
325 WORD32 mask = ps_grid->pi4_grd_mask[i];
326 UWORD8 *pu1_ref_ptr_center = ps_grid->ppu1_ref_ptr[i];
327 WORD32 mv_x = ps_grid->p_mv[i].i2_mv_x;
328 WORD32 mv_y = (ps_grid->p_mv[i].i2_mv_y);
329
330 for(j = 0; j < NUM_CANDIDATES_IN_GRID; j++, mask >>= 1)
331 {
332 if(mask & 1)
333 {
334 *num_cands = *num_cands + 1;
335 cand0->grid_ix = i;
336 cand0->ref_idx = ps_grid->p_ref_idx[i];
337 cand0->pu1_ref_ptr =
338 pu1_ref_ptr_center + offset_x[j] + ref_buf_stride * offset_y[j];
339 cand0->mv.i2_mv_x = (S16)(mv_x) + offset_x[j];
340 cand0->mv.i2_mv_y = (S16)(mv_y) + offset_y[j];
341 cand0++;
342 }
343 }
344 }
345
346 /* Loop to compute the SAD's */
347 for(a = 0; a < *num_cands; a++)
348 {
349 cand_t *cand = ps_cand + a;
350 memset(&a_nxn_sad[0], 0, NUM_4X4 * sizeof(WORD32));
351 for(b = 0; b < NUM_4X4; b++)
352 {
353 WORD32 t1 = (b % 4) * num_pixels_in_row + (b >> 2) * cur_buf_stride_lsN;
354 WORD32 t2 = (b % 4) * num_pixels_in_row + (b >> 2) * ref_buf_stride_lsN;
355
356 for(c = 0; c < num_rows_in_nxn; c++)
357 {
358 WORD32 z_cur = (cur_buf_stride)*c + t1;
359 WORD32 z_ref = (ref_buf_stride)*c + t2;
360 for(d = 0; d < num_pixels_in_row; d++)
361 {
362 a_nxn_sad[b] += (WORD32)ABS(
363 (((WORD32)cand->pu1_ref_ptr[(z_ref + d)]) -
364 ((WORD32)pu1_cur_ptr[(z_cur + d)])));
365 }
366 }
367 }
368
369 pp_part_sads[PART_ID_NxN_TL][a] =
370 (a_nxn_sad[0] + a_nxn_sad[1] + a_nxn_sad[4] + a_nxn_sad[5]);
371 pp_part_sads[PART_ID_NxN_TR][a] =
372 (a_nxn_sad[2] + a_nxn_sad[3] + a_nxn_sad[6] + a_nxn_sad[7]);
373 pp_part_sads[PART_ID_NxN_BL][a] =
374 (a_nxn_sad[8] + a_nxn_sad[9] + a_nxn_sad[12] + a_nxn_sad[13]);
375 pp_part_sads[PART_ID_NxN_BR][a] =
376 (a_nxn_sad[10] + a_nxn_sad[11] + a_nxn_sad[14] + a_nxn_sad[15]);
377 pp_part_sads[PART_ID_Nx2N_L][a] =
378 pp_part_sads[PART_ID_NxN_TL][a] + pp_part_sads[PART_ID_NxN_BL][a];
379 pp_part_sads[PART_ID_Nx2N_R][a] =
380 pp_part_sads[PART_ID_NxN_TR][a] + pp_part_sads[PART_ID_NxN_BR][a];
381 pp_part_sads[PART_ID_2NxN_T][a] =
382 pp_part_sads[PART_ID_NxN_TR][a] + pp_part_sads[PART_ID_NxN_TL][a];
383 pp_part_sads[PART_ID_2NxN_B][a] =
384 pp_part_sads[PART_ID_NxN_BR][a] + pp_part_sads[PART_ID_NxN_BL][a];
385 pp_part_sads[PART_ID_nLx2N_L][a] =
386 (a_nxn_sad[8] + a_nxn_sad[0] + a_nxn_sad[12] + a_nxn_sad[4]);
387 pp_part_sads[PART_ID_nRx2N_R][a] =
388 (a_nxn_sad[3] + a_nxn_sad[7] + a_nxn_sad[15] + a_nxn_sad[11]);
389 pp_part_sads[PART_ID_2NxnU_T][a] =
390 (a_nxn_sad[1] + a_nxn_sad[0] + a_nxn_sad[2] + a_nxn_sad[3]);
391 pp_part_sads[PART_ID_2NxnD_B][a] =
392 (a_nxn_sad[15] + a_nxn_sad[14] + a_nxn_sad[12] + a_nxn_sad[13]);
393 pp_part_sads[PART_ID_2Nx2N][a] =
394 pp_part_sads[PART_ID_2NxN_T][a] + pp_part_sads[PART_ID_2NxN_B][a];
395 pp_part_sads[PART_ID_2NxnU_B][a] =
396 pp_part_sads[PART_ID_2Nx2N][a] - pp_part_sads[PART_ID_2NxnU_T][a];
397 pp_part_sads[PART_ID_2NxnD_T][a] =
398 pp_part_sads[PART_ID_2Nx2N][a] - pp_part_sads[PART_ID_2NxnD_B][a];
399 pp_part_sads[PART_ID_nRx2N_L][a] =
400 pp_part_sads[PART_ID_2Nx2N][a] - pp_part_sads[PART_ID_nRx2N_R][a];
401 pp_part_sads[PART_ID_nLx2N_R][a] =
402 pp_part_sads[PART_ID_2Nx2N][a] - pp_part_sads[PART_ID_nLx2N_L][a];
403 }
404 }
405
void hme_evalsad_grid_pu_16x16(err_prms_t *ps_prms)
407 {
408 grid_ctxt_t s_grid;
409 cand_t as_candt[9];
410 U16 au2_sad_grid[TOT_NUM_PARTS * 9];
411 U16 *apu2_sad_grid[TOT_NUM_PARTS];
412 hme_mv_t s_mv = { 0, 0 };
413 S32 i4_ref_idx = 0, i;
414 S32 num_candts = 0;
415 s_grid.num_grids = 1;
416 s_grid.ref_buf_stride = ps_prms->i4_ref_stride;
417 s_grid.grd_sz_y_x = ((ps_prms->i4_step << 16) | ps_prms->i4_step);
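    /* grd_sz_y_x packs the vertical step in the upper 16 bits and the       */
    /* horizontal step in the lower 16 bits; compute_4x4_sads_for_16x16_blk  */
    /* unpacks it in the same way.                                           */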
418 s_grid.ppu1_ref_ptr = &ps_prms->pu1_ref;
419 s_grid.pi4_grd_mask = &ps_prms->i4_grid_mask;
420 s_grid.p_mv = &s_mv;
421 s_grid.p_ref_idx = &i4_ref_idx;
422 for(i = 0; i < 9; i++)
423 {
424 if(s_grid.pi4_grd_mask[0] & (1 << i))
425 num_candts++;
426 }
427
428 for(i = 0; i < TOT_NUM_PARTS; i++)
429 apu2_sad_grid[i] = &au2_sad_grid[i * num_candts];
430
431 compute_4x4_sads_for_16x16_blk(
432 &s_grid, ps_prms->pu1_inp, ps_prms->i4_inp_stride, apu2_sad_grid, as_candt, &num_candts);
433 for(i = 0; i < TOT_NUM_PARTS * num_candts; i++)
434 {
435 ps_prms->pi4_sad_grid[i] = au2_sad_grid[i];
436 }
437 }
438
void hme_evalsad_grid_npu_MxN(err_prms_t *ps_prms)
440 {
441 U08 *pu1_inp_base, *pu1_ref_c;
442 S32 *pi4_sad = ps_prms->pi4_sad_grid;
443 S32 i, grid_count = 0;
444 S32 step = ps_prms->i4_step;
445 S32 x_off = step, y_off = step * ps_prms->i4_ref_stride;
446
447 ASSERT((ps_prms->i4_part_mask & (ps_prms->i4_part_mask - 1)) == 0);
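    /* Only one partition is expected to be active here (the part mask must  */
    /* be a power of two); results are written at the offset of that single  */
    /* valid partition id.                                                   */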
448
449 //assert(ps_prms->i4_blk_ht <= 8);
450 //assert(ps_prms->i4_blk_wd <= 8);
451 for(i = 0; i < 9; i++)
452 {
453 if(ps_prms->i4_grid_mask & (1 << i))
454 grid_count++;
455 }
456 pi4_sad += (ps_prms->pi4_valid_part_ids[0] * grid_count);
457
458 pu1_inp_base = ps_prms->pu1_inp;
459 pu1_ref_c = ps_prms->pu1_ref;
460 for(i = 0; i < 9; i++)
461 {
462 S32 sad = 0, j, k;
463 U08 *pu1_inp, *pu1_ref;
464
465 if(!(ps_prms->i4_grid_mask & (1 << i)))
466 continue;
467 pu1_ref = pu1_ref_c + x_off * gai1_grid_id_to_x[i];
468 pu1_ref += y_off * gai1_grid_id_to_y[i];
469 pu1_inp = pu1_inp_base;
470
471 for(j = 0; j < ps_prms->i4_blk_ht; j++)
472 {
473 for(k = 0; k < ps_prms->i4_blk_wd; k++)
474 {
475 sad += (ABS((pu1_inp[k] - pu1_ref[k])));
476 }
477 pu1_inp += ps_prms->i4_inp_stride;
478 pu1_ref += ps_prms->i4_ref_stride;
479 }
480 *pi4_sad++ = sad;
481 }
482 }
483
WORD32 hme_evalsad_pt_npu_MxN_8bit_compute(
485 WORD32 ht,
486 WORD32 wd,
487 UWORD8 *pu1_inp,
488 UWORD8 *pu1_ref,
489 WORD32 i4_inp_stride,
490 WORD32 i4_ref_stride)
491 {
492 WORD32 i, j;
493 WORD32 sad = 0;
494 for(i = 0; i < ht; i++)
495 {
496 for(j = 0; j < wd; j++)
497 {
498 sad += (ABS(((S32)pu1_inp[j] - (S32)pu1_ref[j])));
499 }
500 pu1_inp += i4_inp_stride;
501 pu1_ref += i4_ref_stride;
502 }
503 return sad;
504 }
505
void hme_evalsad_pt_npu_MxN_8bit(err_prms_t *ps_prms)
507 {
508 S32 wd, ht;
509 U08 *pu1_inp, *pu1_ref;
510
511 wd = ps_prms->i4_blk_wd;
512 ht = ps_prms->i4_blk_ht;
513
514 pu1_inp = ps_prms->pu1_inp;
515 pu1_ref = ps_prms->pu1_ref;
516
517 ps_prms->pi4_sad_grid[0] = hme_evalsad_pt_npu_MxN_8bit_compute(
518 ht, wd, pu1_inp, pu1_ref, ps_prms->i4_inp_stride, ps_prms->i4_ref_stride);
519 }
520
void compute_satd_8bit(err_prms_t *ps_prms)
522 {
523 U08 *pu1_origin;
524 S32 src_strd;
525 U08 *pu1_pred_buf;
526 S32 dst_strd;
527 S32 wd, ht;
528 U32 u4_sad = 0;
529 WORD32 x, y;
530 U08 *u1_pi0, *u1_pi1;
531
532 pu1_origin = ps_prms->pu1_inp;
533 pu1_pred_buf = ps_prms->pu1_ref;
534 src_strd = ps_prms->i4_inp_stride;
535 dst_strd = ps_prms->i4_ref_stride;
536 wd = ps_prms->i4_blk_wd;
537 ht = ps_prms->i4_blk_ht;
538
539 u1_pi0 = pu1_origin;
540 u1_pi1 = pu1_pred_buf;
541
    /* Basic transform size selection:
    for block sizes less than or equal to 16x16, the basic transform size is 4x4;
    for block sizes greater than or equal to 32x32, the basic transform size is 8x8 */
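    /* E.g. a 32x32 block is covered by sixteen 8x8 Hadamard SATDs, while a  */
    /* 16x16 block is covered by sixteen 4x4 Hadamard SATDs, all accumulated */
    /* into u4_sad.                                                          */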
545 if((wd > 0x10) || (ht > 0x10))
546 {
547 for(y = 0; y < ht; y += 8)
548 {
549 for(x = 0; x < wd; x += 8)
550 {
551 u4_sad += ps_prms->ps_cmn_utils_optimised_function_list->pf_HAD_8x8_8bit(
552 &u1_pi0[x], src_strd, &u1_pi1[x], dst_strd, NULL, 1);
553 }
554 u1_pi0 += src_strd * 8;
555 u1_pi1 += dst_strd * 8;
556 }
557 }
558 else
559 {
560 for(y = 0; y < ht; y += 4)
561 {
562 for(x = 0; x < wd; x += 4)
563 {
564 u4_sad += ps_prms->ps_cmn_utils_optimised_function_list->pf_HAD_4x4_8bit(
565 &u1_pi0[x], src_strd, &u1_pi1[x], dst_strd, NULL, 1);
566 }
567 u1_pi0 += src_strd * 4;
568 u1_pi1 += dst_strd * 4;
569 }
570 }
571
572 ps_prms->pi4_sad_grid[0] = (S32)u4_sad;
573 }
574
void hme_init_pred_part(
576 pred_ctxt_t *ps_pred_ctxt,
577 search_node_t *ps_tl,
578 search_node_t *ps_t,
579 search_node_t *ps_tr,
580 search_node_t *ps_l,
581 search_node_t *ps_bl,
582 search_node_t *ps_coloc,
583 search_node_t *ps_zeromv,
584 search_node_t **pps_proj_coloc,
585 PART_ID_T e_part_id)
586 {
587 pred_candt_nodes_t *ps_candt_nodes;
588
589 ps_candt_nodes = &ps_pred_ctxt->as_pred_nodes[e_part_id];
590
591 ps_candt_nodes->ps_tl = ps_tl;
592 ps_candt_nodes->ps_tr = ps_tr;
593 ps_candt_nodes->ps_t = ps_t;
594 ps_candt_nodes->ps_l = ps_l;
595 ps_candt_nodes->ps_bl = ps_bl;
596 ps_candt_nodes->ps_coloc = ps_coloc;
597 ps_candt_nodes->ps_zeromv = ps_zeromv;
598 ps_candt_nodes->pps_proj_coloc = pps_proj_coloc;
599 }
600
void hme_init_pred_ctxt_no_encode(
602 pred_ctxt_t *ps_pred_ctxt,
603 search_results_t *ps_search_results,
604 search_node_t *ps_top_candts,
605 search_node_t *ps_left_candts,
606 search_node_t **pps_proj_coloc_candts,
607 search_node_t *ps_coloc_candts,
608 search_node_t *ps_zeromv_candt,
609 S32 pred_lx,
610 S32 lambda,
611 S32 lambda_q_shift,
612 U08 **ppu1_ref_bits_tlu,
613 S16 *pi2_ref_scf)
614 {
615 search_node_t *ps_invalid, *ps_l, *ps_t, *ps_tl, *ps_tr, *ps_bl;
616 search_node_t *ps_coloc;
617 PART_ID_T e_part_id;
618
    /* Assume that resolution is fpel to begin with */
620 ps_pred_ctxt->mv_pel = 0; // FPEL
621
622 /* lambda and pred_lx (PRED_L0/PRED_L1) */
623 ps_pred_ctxt->lambda = lambda;
624 ps_pred_ctxt->lambda_q_shift = lambda_q_shift;
625 ps_pred_ctxt->pred_lx = pred_lx;
626 ps_pred_ctxt->ppu1_ref_bits_tlu = ppu1_ref_bits_tlu;
627 ps_pred_ctxt->pi2_ref_scf = pi2_ref_scf;
628 ps_pred_ctxt->proj_used = 0;
629
630 /* Bottom left should not be valid */
631 ASSERT(ps_left_candts[2].u1_is_avail == 0);
632 ps_invalid = &ps_left_candts[2];
633
634 /*************************************************************************/
    /* for the case of no encode, the idea is to set up candts as follows   */
636 /* */
637 /* ____ ______________ */
638 /* | TL | T | T1 | TR | */
639 /* |____|____|____|____| */
640 /* | L | b0 | b1 | */
641 /* |____|____|____| */
642 /* | L1 | b2 | b3 | */
643 /* |____|____|____| */
644 /* | BL | */
645 /* |____| */
646 /* */
    /*  If use_4x4 is 0, then b0,b1,b2,b3 form a single 8x8 blk; then T=T1  */
    /*  and L=L1. Top left, top and top right are TL, T, TR respectively.   */
    /*  Left and bottom left are L and BL respectively.                     */
650 /* If use_4x4 is 1: then the above holds true only for PARTID = 0 (8x8) */
651 /* For the 4 subblocks (partids 4-7) */
652 /* */
653 /* Block Left Top Top Left Top Right Bottom Left */
654 /* b0 L T TL T1 L1 */
655 /* b1 b0 T1 T TR BL(invalid) */
656 /* b2 L1 b0 L0 b1 BL (invalid) */
657 /* b3 b2 b1 b0 BL(inv) BL (inv) */
658 /* */
659 /* Note : For block b1, bottom left pts to b2, which is not yet ready */
660 /* hence it is kept invalid and made to pt to BL. For block b3 top rt */
661 /* is invalid and hence made to pt to BL which is invalid. */
662 /* BL is invalid since it lies in a bottom left 8x8 blk and not yet ready*/
663 /*************************************************************************/
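    /* Layout assumed by the initializations below (cf. the diagram above): */
    /* ps_top_candts[0..3]  = TL, T, T1, TR                                 */
    /* ps_left_candts[0..2] = L, L1, BL (BL must be unavailable, see ASSERT)*/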
664
    /* ps_coloc always points to a fixed candt (global) */
    /* TODO : replace incoming ps_coloc from global to genuine coloc */
667 ps_coloc = ps_coloc_candts;
668
669 /* INITIALIZATION OF 8x8 BLK */
670 ps_tl = ps_top_candts;
671 ps_t = ps_tl + 2;
672 ps_tr = ps_t + 1;
673 ps_l = ps_left_candts + 1;
674 ps_bl = ps_invalid;
675 e_part_id = PART_ID_2Nx2N;
676 hme_init_pred_part(
677 ps_pred_ctxt,
678 ps_tl,
679 ps_t,
680 ps_tr,
681 ps_l,
682 ps_bl,
683 ps_coloc,
684 ps_zeromv_candt,
685 pps_proj_coloc_candts,
686 e_part_id);
687
688 /* INITIALIZATION OF 4x4 TL BLK */
689 e_part_id = PART_ID_NxN_TL;
690 ps_tl = ps_top_candts;
691 ps_t = ps_tl + 1;
692 ps_tr = ps_t + 1;
693 ps_l = ps_left_candts;
694 ps_bl = ps_l + 1;
695 hme_init_pred_part(
696 ps_pred_ctxt,
697 ps_tl,
698 ps_t,
699 ps_tr,
700 ps_l,
701 ps_bl,
702 ps_coloc,
703 ps_zeromv_candt,
704 pps_proj_coloc_candts,
705 e_part_id);
706
707 /* INITIALIZATION OF 4x4 TR BLK */
708 e_part_id = PART_ID_NxN_TR;
709 ps_tl = ps_top_candts + 1;
710 ps_t = ps_tl + 1;
711 ps_tr = ps_t + 1;
712 ps_l = ps_search_results->aps_part_results[pred_lx][PART_ID_NxN_TL];
713 ps_bl = ps_invalid;
714 hme_init_pred_part(
715 ps_pred_ctxt,
716 ps_tl,
717 ps_t,
718 ps_tr,
719 ps_l,
720 ps_bl,
721 ps_coloc,
722 ps_zeromv_candt,
723 pps_proj_coloc_candts,
724 e_part_id);
725
726 /* INITIALIZATION OF 4x4 BL BLK */
727 e_part_id = PART_ID_NxN_BL;
728 ps_tl = ps_left_candts;
729 ps_t = ps_search_results->aps_part_results[pred_lx][PART_ID_NxN_TL];
730 ps_tr = ps_search_results->aps_part_results[pred_lx][PART_ID_NxN_TR];
731 ps_l = ps_left_candts + 1;
732 ps_bl = ps_invalid; //invalid
733 hme_init_pred_part(
734 ps_pred_ctxt,
735 ps_tl,
736 ps_t,
737 ps_tr,
738 ps_l,
739 ps_bl,
740 ps_coloc,
741 ps_zeromv_candt,
742 pps_proj_coloc_candts,
743 e_part_id);
744
745 /* INITIALIZATION OF 4x4 BR BLK */
746 e_part_id = PART_ID_NxN_BR;
747 ps_tl = ps_search_results->aps_part_results[pred_lx][PART_ID_NxN_TL];
748 ps_t = ps_search_results->aps_part_results[pred_lx][PART_ID_NxN_TR];
749 ps_tr = ps_invalid; // invalid
750 ps_l = ps_search_results->aps_part_results[pred_lx][PART_ID_NxN_BL];
751 ps_bl = ps_invalid; // invalid
752 hme_init_pred_part(
753 ps_pred_ctxt,
754 ps_tl,
755 ps_t,
756 ps_tr,
757 ps_l,
758 ps_bl,
759 ps_coloc,
760 ps_zeromv_candt,
761 pps_proj_coloc_candts,
762 e_part_id);
763 }
764
void hme_init_pred_ctxt_encode(
766 pred_ctxt_t *ps_pred_ctxt,
767 search_results_t *ps_search_results,
768 search_node_t *ps_coloc_candts,
769 search_node_t *ps_zeromv_candt,
770 mv_grid_t *ps_mv_grid,
771 S32 pred_lx,
772 S32 lambda,
773 S32 lambda_q_shift,
774 U08 **ppu1_ref_bits_tlu,
775 S16 *pi2_ref_scf)
776 {
777 search_node_t *ps_invalid, *ps_l, *ps_t, *ps_tl, *ps_tr, *ps_bl;
778 search_node_t *ps_coloc;
779 search_node_t *ps_grid_cu_base;
780 CU_SIZE_T e_cu_size = ps_search_results->e_cu_size;
781
782 /* Part Start, Part sizes in 4x4 units */
783 S32 part_wd, part_ht, part_start_x, part_start_y;
784
785 /* Partition type, number of partitions in type */
786 S32 part_id;
787
788 /* Coordinates of the CU in 4x4 units */
789 S32 cu_start_x, cu_start_y;
790 S32 shift = e_cu_size;
791
792 /* top right and bot left validity at CU level */
793 S32 cu_tr_valid, cu_bl_valid;
    /* stride of the grid */
795 S32 grid_stride = ps_mv_grid->i4_stride;
796
797 ps_pred_ctxt->lambda = lambda;
798 ps_pred_ctxt->lambda_q_shift = lambda_q_shift;
799 ps_pred_ctxt->pred_lx = pred_lx;
800 ps_pred_ctxt->mv_pel = 0;
801 ps_pred_ctxt->ppu1_ref_bits_tlu = ppu1_ref_bits_tlu;
802 ps_pred_ctxt->pi2_ref_scf = pi2_ref_scf;
803 ps_pred_ctxt->proj_used = 1;
804
805 cu_start_x = ps_search_results->u1_x_off >> 2;
806 cu_start_y = ps_search_results->u1_y_off >> 2;
807
808 /* Coloc always points to fixed global candt */
809 ps_coloc = ps_coloc_candts;
810
811 /* Go to base of the CU in the MV Grid */
812 ps_grid_cu_base = &ps_mv_grid->as_node[0];
813 ps_grid_cu_base += (ps_mv_grid->i4_start_offset + cu_start_x);
814 ps_grid_cu_base += (grid_stride * cu_start_y);
815
816 /* points to the real bottom left of the grid, will never be valid */
817 ps_invalid = &ps_mv_grid->as_node[0];
818 ps_invalid += (grid_stride * 17);
819
820 {
821 S32 shift = 1 + e_cu_size;
822 cu_tr_valid = gau1_cu_tr_valid[cu_start_y >> shift][cu_start_x >> shift];
823 cu_bl_valid = gau1_cu_bl_valid[cu_start_y >> shift][cu_start_x >> shift];
824 }
825
826 /*************************************************************************/
827 /* for the case of encode, the idea is to set up cants as follows */
828 /* */
829 /* ____ ______________ ____ ____ */
830 /* | T0 | T1 | T2 | T3 | T4 | T5 | */
831 /* |____|____|____|____|____|____| */
832 /* | L1 | | | */
833 /* |____| | | */
834 /* | L2 | p0 | p1 | */
835 /* |____| | | */
836 /* | L3 | | | */
837 /* |____| | | */
838 /* | L4 | L' | | */
839 /* |____|____|______________| */
840 /* | BL | */
841 /* |____| */
842 /* The example is shown with 16x16 CU, though it can be generalized */
843 /* This CU has 2 partitions, cu_wd = 4. also p_wd, p_ht are partition */
844 /* width and ht in 4x4 units. */
845 /* For a given CU, derive the top left, top and bottom left and top rt */
846 /* pts. Left and top are assumed to be valid. */
    /* If there are two partitions in the CU (like p0 and p1) and vertical, */
    /* then for the first partition, left, top, top left and top right are  */
    /* valid; bottom left is valid. Store these validity flags. Also store  */
    /* the grid offsets of the partitions w.r.t. CU start in units of 4x4.  */
    /* For p0:                                                              */
851 /* Left grid offset = -1, 3. Top Grd offset = -1, 0. */
852 /* Top left grid offset = -1, -1. Top right = 1, -1. BL = -1, 4. */
    /* For p1, the validity flags are: left, top, top left, top right valid,*/
    /* BL invalid. Grid offsets are: Left = don't care. T = 1, -1 (T2).     */
    /* TR = 4, -1 (T5). TL = 0, -1 (T1). BL = don't care.                   */
856 /* For p1, set the left pred candt to the best search result of p0. */
857 /*************************************************************************/
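    /* In the loop below, ps_t is picked from the 4x4 row above the         */
    /* partition at its right-most column (offset part_wd - 1) and ps_l     */
    /* from the column to its left at the bottom row (offset part_ht - 1);  */
    /* ps_tr / ps_bl are one step further right / down and are redirected   */
    /* to the invalid entry when the CU level validity flags disallow them. */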
858
859 /* Loop over all partitions, and identify the 5 neighbours */
860 for(part_id = 0; part_id < TOT_NUM_PARTS; part_id++)
861 {
862 part_attr_t *ps_part_attr = &gas_part_attr_in_cu[part_id];
863 S32 tr_valid, bl_valid, is_vert;
864 search_node_t *ps_grid_pu_base;
865 PART_TYPE_T e_part_type;
866 PART_ID_T first_part;
867 S32 part_num;
868
869 e_part_type = ge_part_id_to_part_type[part_id];
870 first_part = ge_part_type_to_part_id[e_part_type][0];
871 is_vert = gau1_is_vert_part[e_part_type];
872 part_num = gau1_part_id_to_part_num[part_id];
873 tr_valid = gau1_partid_tr_valid[part_id] & cu_tr_valid;
874 bl_valid = gau1_partid_bl_valid[part_id] & cu_bl_valid;
875
876 part_start_x = (ps_part_attr->u1_x_start << shift) >> 2;
877 part_start_y = (ps_part_attr->u1_y_start << shift) >> 2;
878 part_wd = (ps_part_attr->u1_x_count << shift) >> 2;
879 part_ht = (ps_part_attr->u1_y_count << shift) >> 2;
880
881 /* go to top left of part */
882 ps_grid_pu_base = ps_grid_cu_base + part_start_x;
883 ps_grid_pu_base += (part_start_y * grid_stride);
884
885 ps_tl = ps_grid_pu_base - 1 - grid_stride;
886 ps_t = ps_grid_pu_base - grid_stride + part_wd - 1;
887 ps_l = ps_grid_pu_base - 1 + ((part_ht - 1) * grid_stride);
888 ps_tr = ps_t + 1;
889 ps_bl = ps_l + grid_stride;
890
891 if(!tr_valid)
892 ps_tr = ps_invalid;
893 if(!bl_valid)
894 ps_bl = ps_invalid;
895
896 if(part_num == 1)
897 {
898 /* for cases of two partitions 2nd part has 1st part as candt */
899 /* if vertical type, left candt of 2nd part is 1st part. */
900 /* if horz type, top candt of 2nd part is 1st part. */
901 if(is_vert)
902 {
903 ps_l = ps_search_results->aps_part_results[pred_lx][first_part];
904 }
905 else
906 {
907 ps_t = ps_search_results->aps_part_results[pred_lx][first_part];
908 }
909 }
910 if(part_num == 2)
911 {
912 /* only possible for NxN_BL */
913 ps_t = ps_search_results->aps_part_results[pred_lx][PART_ID_NxN_TL];
914 ps_tr = ps_search_results->aps_part_results[pred_lx][PART_ID_NxN_TR];
915 }
916 if(part_num == 3)
917 {
918 /* only possible for NxN_BR */
919 ps_t = ps_search_results->aps_part_results[pred_lx][PART_ID_NxN_TR];
920 ps_tl = ps_search_results->aps_part_results[pred_lx][PART_ID_NxN_TL];
921 ps_l = ps_search_results->aps_part_results[pred_lx][PART_ID_NxN_BL];
922 }
923 hme_init_pred_part(
924 ps_pred_ctxt,
925 ps_tl,
926 ps_t,
927 ps_tr,
928 ps_l,
929 ps_bl,
930 ps_coloc,
931 ps_zeromv_candt,
932 NULL,
933 (PART_ID_T)part_id);
934 }
935 }
936
937 /**
938 ********************************************************************************
939 * @fn compute_mv_cost_explicit(search_node_t *ps_node,
940 * pred_ctxt_t *ps_pred_ctxt,
941 * PART_ID_T e_part_id)
942 *
943 * @brief MV cost for explicit search in layers not encoded
944 *
945 * @param[in] ps_node: search node having mv and ref id for which to eval cost
946 *
947 * @param[in] ps_pred_ctxt : mv pred context
948 *
949 * @param[in] e_part_id : Partition id.
950 *
951 * @return Cost value
952
953 ********************************************************************************
954 */
S32 compute_mv_cost_explicit(
956 search_node_t *ps_node, pred_ctxt_t *ps_pred_ctxt, PART_ID_T e_part_id, S32 inp_mv_pel)
957 {
958 #define RETURN_FIXED_COST 0
959 search_node_t *ps_pred_node_a = NULL, *ps_pred_node_b = NULL;
960 pred_candt_nodes_t *ps_pred_nodes;
961 S32 inp_shift = 2 - inp_mv_pel;
962 S32 pred_shift = 2 - ps_pred_ctxt->mv_pel;
963 S32 mv_p_x, mv_p_y;
964 S16 mvdx1, mvdx2, mvdy1, mvdy2;
965 S32 cost, ref_bits;
966
967 /*************************************************************************/
968 /* Logic for cost computation for explicit search. For such a search, */
969 /* it is guaranteed that all predictor candts have same ref id. The only */
970 /* probable issue is with the availability which needs checking. This fxn*/
971 /* does not suffer the need to scale predictor candts due to diff ref id */
972 /*************************************************************************/
973
974 /* Hack: currently we always assume 2Nx2N. */
975 /* TODO: get rid of this hack and return cost tuned to each partition */
976 ps_pred_nodes = &ps_pred_ctxt->as_pred_nodes[e_part_id];
977 ref_bits = ps_pred_ctxt->ppu1_ref_bits_tlu[ps_pred_ctxt->pred_lx][ps_node->i1_ref_idx];
978
979 /*************************************************************************/
    /* Priority to top left availability. Else we go to left. If both are   */
981 /* not available, then a remains null */
982 /*************************************************************************/
983 if(ps_pred_nodes->ps_tl->u1_is_avail)
984 ps_pred_node_a = ps_pred_nodes->ps_tl;
985 else if(ps_pred_nodes->ps_l->u1_is_avail)
986 ps_pred_node_a = ps_pred_nodes->ps_l;
987
988 /*************************************************************************/
989 /* For encoder, top left may not be really needed unless we use slices, */
990 /* and even then in ME it may not be relevant. So we only consider T or */
991 /* TR, as, if both T and TR are not available, TL also will not be */
992 /*************************************************************************/
993 if(ps_pred_nodes->ps_tr->u1_is_avail)
994 ps_pred_node_b = ps_pred_nodes->ps_tr;
995 else if(ps_pred_nodes->ps_t->u1_is_avail)
996 ps_pred_node_b = ps_pred_nodes->ps_t;
997
998 if(ps_pred_node_a == NULL)
999 {
1000 ps_pred_node_a = ps_pred_nodes->ps_coloc;
1001 if(ps_pred_node_b == NULL)
1002 ps_pred_node_b = ps_pred_nodes->ps_zeromv;
1003 }
1004 else if(ps_pred_node_b == NULL)
1005 ps_pred_node_b = ps_pred_nodes->ps_coloc;
1006 else if(0 == hme_cmp_nodes(ps_pred_node_a, ps_pred_node_b))
1007 {
1008 ps_pred_node_b = ps_pred_nodes->ps_coloc;
1009 }
1010
1011 mv_p_x = ps_pred_node_a->s_mv.i2_mvx;
1012 mv_p_y = ps_pred_node_a->s_mv.i2_mvy;
1013 COMPUTE_DIFF_MV(mvdx1, mvdy1, ps_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
1014 mvdx1 = ABS(mvdx1);
1015 mvdy1 = ABS(mvdy1);
1016
1017 mv_p_x = ps_pred_node_b->s_mv.i2_mvx;
1018 mv_p_y = ps_pred_node_b->s_mv.i2_mvy;
1019 COMPUTE_DIFF_MV(mvdx2, mvdy2, ps_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
1020 mvdx2 = ABS(mvdx2);
1021 mvdy2 = ABS(mvdy2);
1022
1023 if((mvdx1 + mvdy1) < (mvdx2 + mvdy2))
1024 {
1025 cost =
1026 hme_get_range(mvdx1) + hme_get_range(mvdy1) + (mvdx1 > 0) + (mvdy1 > 0) + ref_bits + 2;
1027 }
1028 else
1029 {
1030 cost =
1031 hme_get_range(mvdx2) + hme_get_range(mvdy2) + (mvdx2 > 0) + (mvdy2 > 0) + ref_bits + 2;
1032 }
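    /* The MVD against the cheaper of the two predictors is converted to an */
    /* approximate bit count (hme_get_range() per component plus one bit    */
    /* for every non-zero component), ref idx bits and 2 bits of overhead   */
    /* are added, and the total is scaled by lambda in Q(lambda_q_shift)    */
    /* fixed point with rounding.                                           */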
1033 {
1034 S32 rnd = 1 << (ps_pred_ctxt->lambda_q_shift - 1);
1035 return ((cost * ps_pred_ctxt->lambda + rnd) >> ps_pred_ctxt->lambda_q_shift);
1036 }
1037 }
1038 /**
1039 ********************************************************************************
1040 * @fn compute_mv_cost_coarse(search_node_t *ps_node,
1041 * pred_ctxt_t *ps_pred_ctxt,
1042 * PART_ID_T e_part_id)
1043 *
1044 * @brief MV cost for coarse explicit search in coarsest layer
1045 *
1046 * @param[in] ps_node: search node having mv and ref id for which to eval cost
1047 *
1048 * @param[in] ps_pred_ctxt : mv pred context
1049 *
1050 * @param[in] e_part_id : Partition id.
1051 *
1052 * @return Cost value
1053
1054 ********************************************************************************
1055 */
S32 compute_mv_cost_coarse(
1057 search_node_t *ps_node, pred_ctxt_t *ps_pred_ctxt, PART_ID_T e_part_id, S32 inp_mv_pel)
1058 {
1059 ARG_NOT_USED(e_part_id);
1060
1061 return (compute_mv_cost_explicit(ps_node, ps_pred_ctxt, PART_ID_2Nx2N, inp_mv_pel));
1062 }
1063
1064 /**
1065 ********************************************************************************
1066 * @fn compute_mv_cost_coarse_high_speed(search_node_t *ps_node,
1067 * pred_ctxt_t *ps_pred_ctxt,
1068 * PART_ID_T e_part_id)
1069 *
1070 * @brief MV cost for coarse explicit search in coarsest layer
1071 *
1072 * @param[in] ps_node: search node having mv and ref id for which to eval cost
1073 *
1074 * @param[in] ps_pred_ctxt : mv pred context
1075 *
1076 * @param[in] e_part_id : Partition id.
1077 *
1078 * @return Cost value
1079
1080 ********************************************************************************
1081 */
S32 compute_mv_cost_coarse_high_speed(
1083 search_node_t *ps_node, pred_ctxt_t *ps_pred_ctxt, PART_ID_T e_part_id, S32 inp_mv_pel)
1084 {
1085 S32 rnd, mvx, mvy, i4_search_idx;
1086 S32 cost;
1087
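    /* High speed variant: the MV itself is costed directly (no MVD against */
    /* a spatial predictor is formed), which is cheaper than the full       */
    /* predictor selection done in compute_mv_cost_explicit().              */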
1088 mvx = ps_node->s_mv.i2_mvx;
1089 mvy = ps_node->s_mv.i2_mvy;
1090 i4_search_idx = ps_node->i1_ref_idx;
1091
1092 cost = (2 * hme_get_range(ABS(mvx)) - 1) + (2 * hme_get_range(ABS(mvy)) - 1) + i4_search_idx;
1093 cost += (mvx != 0) ? 1 : 0;
1094 cost += (mvy != 0) ? 1 : 0;
1095 rnd = 1 << (ps_pred_ctxt->lambda_q_shift - 1);
1096 cost = (cost * ps_pred_ctxt->lambda + rnd) >> ps_pred_ctxt->lambda_q_shift;
1097 return cost;
1098 }
1099
1100 /**
1101 ********************************************************************************
1102 * @fn compute_mv_cost_explicit_refine(search_node_t *ps_node,
1103 * pred_ctxt_t *ps_pred_ctxt,
1104 * PART_ID_T e_part_id)
1105 *
1106 * @brief MV cost for explicit search in layers not encoded. Always returns
1107 * cost of the projected colocated candidate
1108 *
1109 * @param[in] ps_node: search node having mv and ref id for which to eval cost
1110 *
1111 * @param[in] ps_pred_ctxt : mv pred context
1112 *
1113 * @param[in] e_part_id : Partition id.
1114 *
1115 * @return Cost value
1116
1117 ********************************************************************************
1118 */
S32 compute_mv_cost_explicit_refine(
1120 search_node_t *ps_node, pred_ctxt_t *ps_pred_ctxt, PART_ID_T e_part_id, S32 inp_mv_pel)
1121 {
1122 search_node_t *ps_pred_node_a = NULL;
1123 pred_candt_nodes_t *ps_pred_nodes;
1124 S32 inp_shift = 2 - inp_mv_pel;
1125 S32 pred_shift = 2 - ps_pred_ctxt->mv_pel;
1126 S32 mv_p_x, mv_p_y;
1127 S16 mvdx1, mvdy1;
1128 S32 cost, ref_bits;
1129
1130 ps_pred_nodes = &ps_pred_ctxt->as_pred_nodes[e_part_id];
1131 ref_bits = ps_pred_ctxt->ppu1_ref_bits_tlu[ps_pred_ctxt->pred_lx][ps_node->i1_ref_idx];
1132
1133 ps_pred_node_a = ps_pred_nodes->pps_proj_coloc[0];
1134
1135 mv_p_x = ps_pred_node_a->s_mv.i2_mvx;
1136 mv_p_y = ps_pred_node_a->s_mv.i2_mvy;
1137 COMPUTE_DIFF_MV(mvdx1, mvdy1, ps_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
1138 mvdx1 = ABS(mvdx1);
1139 mvdy1 = ABS(mvdy1);
1140
1141 cost = hme_get_range(mvdx1) + hme_get_range(mvdy1) + (mvdx1 > 0) + (mvdy1 > 0) + ref_bits + 2;
1142
1143 {
1144 S32 rnd = 1 << (ps_pred_ctxt->lambda_q_shift - 1);
1145 return ((cost * ps_pred_ctxt->lambda + rnd) >> ps_pred_ctxt->lambda_q_shift);
1146 }
1147 }
1148
1149 /**
1150 ********************************************************************************
1151 * @fn compute_mv_cost_refine(search_node_t *ps_node,
1152 * pred_ctxt_t *ps_pred_ctxt,
1153 * PART_ID_T e_part_id)
1154 *
*  @brief    MV cost for refinement search; returns the cost w.r.t. the
*            projected colocated candidate
1156 *
1157 * @param[in] ps_node: search node having mv and ref id for which to eval cost
1158 *
1159 * @param[in] ps_pred_ctxt : mv pred context
1160 *
1161 * @param[in] e_part_id : Partition id.
1162 *
1163 * @return Cost value
1164
1165 ********************************************************************************
1166 */
S32 compute_mv_cost_refine(
1168 search_node_t *ps_node, pred_ctxt_t *ps_pred_ctxt, PART_ID_T e_part_id, S32 inp_mv_pel)
1169 {
1170 return (compute_mv_cost_explicit_refine(ps_node, ps_pred_ctxt, e_part_id, inp_mv_pel));
1171 }
1172
S32 compute_mv_cost_implicit(
1174 search_node_t *ps_node, pred_ctxt_t *ps_pred_ctxt, PART_ID_T e_part_id, S32 inp_mv_pel)
1175 {
1176 search_node_t *ps_pred_node_a = NULL, *ps_pred_node_b = NULL;
1177 pred_candt_nodes_t *ps_pred_nodes;
1178 S08 i1_ref_idx;
1179 S08 i1_ref_tl = -1, i1_ref_tr = -1, i1_ref_t = -1;
1180 S08 i1_ref_bl = -1, i1_ref_l = -1;
1181 S32 inp_shift = 2 - inp_mv_pel;
1182 S32 pred_shift; /* = 2 - ps_pred_ctxt->mv_pel;*/
1183 S32 ref_bits, cost;
1184 S32 mv_p_x, mv_p_y;
1185 S16 mvdx1, mvdx2, mvdy1, mvdy2;
1186
1187 //return 0;
1188 i1_ref_idx = ps_node->i1_ref_idx;
1189
1190 /*************************************************************************/
1191 /* Logic for cost computation for explicit search. For such a search, */
1192 /* it is guaranteed that all predictor candts have same ref id. The only */
1193 /* probable issue is with the availability which needs checking. This fxn*/
1194 /* does not suffer the need to scale predictor candts due to diff ref id */
1195 /*************************************************************************/
1196
1197 ps_pred_nodes = &ps_pred_ctxt->as_pred_nodes[e_part_id];
1198 ref_bits = ps_pred_ctxt->ppu1_ref_bits_tlu[ps_pred_ctxt->pred_lx][i1_ref_idx];
1199
1200 /*************************************************************************/
1201 /* Priority to bottom left availability. Else we go to left. If both are */
1202 /* not available, then a remains null */
1203 /*************************************************************************/
1204 if(ps_pred_nodes->ps_bl->u1_is_avail)
1205 i1_ref_bl = ps_pred_nodes->ps_bl->i1_ref_idx;
1206 if(ps_pred_nodes->ps_l->u1_is_avail)
1207 i1_ref_l = ps_pred_nodes->ps_l->i1_ref_idx;
1208 if(i1_ref_bl == i1_ref_idx)
1209 ps_pred_node_a = ps_pred_nodes->ps_bl;
1210 else if(i1_ref_l == i1_ref_idx)
1211 ps_pred_node_a = ps_pred_nodes->ps_l;
1212 if(ps_pred_node_a == NULL)
1213 {
1214 if(i1_ref_bl != -1)
1215 ps_pred_node_a = ps_pred_nodes->ps_bl;
1216 else if(i1_ref_l != -1)
1217 ps_pred_node_a = ps_pred_nodes->ps_l;
1218 }
1219
1220 /*************************************************************************/
1221 /* For encoder, top left may not be really needed unless we use slices, */
1222 /* and even then in ME it may not be relevant. So we only consider T or */
1223 /* TR, as, if both T and TR are not available, TL also will not be */
1224 /*************************************************************************/
1225 if(ps_pred_nodes->ps_tr->u1_is_avail)
1226 i1_ref_tr = ps_pred_nodes->ps_tr->i1_ref_idx;
1227 if(ps_pred_nodes->ps_t->u1_is_avail)
1228 i1_ref_t = ps_pred_nodes->ps_t->i1_ref_idx;
1229 if(ps_pred_nodes->ps_tl->u1_is_avail)
1230 i1_ref_tl = ps_pred_nodes->ps_tl->i1_ref_idx;
1231 if(i1_ref_tr == i1_ref_idx)
1232 ps_pred_node_b = ps_pred_nodes->ps_tr;
1233 else if(i1_ref_t == i1_ref_idx)
1234 ps_pred_node_b = ps_pred_nodes->ps_t;
1235 else if(i1_ref_tl == i1_ref_idx)
1236 ps_pred_node_b = ps_pred_nodes->ps_tl;
1237
1238 if(ps_pred_node_b == NULL)
1239 {
1240 if(i1_ref_tr != -1)
1241 ps_pred_node_b = ps_pred_nodes->ps_tr;
1242 else if(i1_ref_t != -1)
1243 ps_pred_node_b = ps_pred_nodes->ps_t;
1244 else if(i1_ref_tl != -1)
1245 ps_pred_node_b = ps_pred_nodes->ps_tl;
1246 }
1247 if(ps_pred_node_a == NULL)
1248 {
1249 ps_pred_node_a = ps_pred_nodes->ps_coloc;
1250 if(ps_pred_node_b == NULL)
1251 ps_pred_node_b = ps_pred_nodes->ps_zeromv;
1252 }
1253 else if(ps_pred_node_b == NULL)
1254 ps_pred_node_b = ps_pred_nodes->ps_coloc;
1255 else if(0 == hme_cmp_nodes(ps_pred_node_a, ps_pred_node_b))
1256 {
1257 ps_pred_node_b = ps_pred_nodes->ps_coloc;
1258 }
1259
1260 if(ps_pred_node_a->i1_ref_idx != i1_ref_idx)
1261 {
1262 SCALE_FOR_POC_DELTA(mv_p_x, mv_p_y, ps_pred_node_a, i1_ref_idx, ps_pred_ctxt->pi2_ref_scf);
1263 }
1264 else
1265 {
1266 mv_p_x = ps_pred_node_a->s_mv.i2_mvx;
1267 mv_p_y = ps_pred_node_a->s_mv.i2_mvy;
1268 }
1269 pred_shift = ps_pred_node_a->u1_subpel_done ? 0 : 2;
1270 COMPUTE_DIFF_MV(mvdx1, mvdy1, ps_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
1271 mvdx1 = ABS(mvdx1);
1272 mvdy1 = ABS(mvdy1);
1273
1274 if(ps_pred_node_b->i1_ref_idx != i1_ref_idx)
1275 {
1276 SCALE_FOR_POC_DELTA(mv_p_x, mv_p_y, ps_pred_node_b, i1_ref_idx, ps_pred_ctxt->pi2_ref_scf);
1277 }
1278 else
1279 {
1280 mv_p_x = ps_pred_node_b->s_mv.i2_mvx;
1281 mv_p_y = ps_pred_node_b->s_mv.i2_mvy;
1282 }
1283 pred_shift = ps_pred_node_b->u1_subpel_done ? 0 : 2;
1284 COMPUTE_DIFF_MV(mvdx2, mvdy2, ps_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
1285 mvdx2 = ABS(mvdx2);
1286 mvdy2 = ABS(mvdy2);
1287
1288 if((mvdx1 + mvdy1) < (mvdx2 + mvdy2))
1289 {
1290 cost = 2 * hme_get_range(mvdx1) + 2 * hme_get_range(mvdy1) + 2 * (mvdx1 > 0) +
1291 2 * (mvdy1 > 0) + ref_bits + 2;
1292 }
1293 else
1294 {
1295 cost = 2 * hme_get_range(mvdx2) + 2 * hme_get_range(mvdy2) + 2 * (mvdx2 > 0) +
1296 2 * (mvdy2 > 0) + ref_bits + 2;
1297 }
1298 {
1299 /* Part bits in Q1, so evaluate cost as ((mv_cost<<1) + partbitsQ1 + rnd)>>(q+1)*/
1300 S32 rnd = 1 << (ps_pred_ctxt->lambda_q_shift);
1301 S32 tot_cost = (cost * ps_pred_ctxt->lambda) << 1;
1302
1303 tot_cost += (gau1_bits_for_part_id_q1[e_part_id] * ps_pred_ctxt->lambda);
1304 return ((tot_cost + rnd) >> (ps_pred_ctxt->lambda_q_shift + 1));
1305 }
1306 }
1307
S32 compute_mv_cost_implicit_high_speed(
1309 search_node_t *ps_node, pred_ctxt_t *ps_pred_ctxt, PART_ID_T e_part_id, S32 inp_mv_pel)
1310 {
1311 search_node_t *ps_pred_node_a = NULL, *ps_pred_node_b = NULL;
1312 pred_candt_nodes_t *ps_pred_nodes;
1313 S08 i1_ref_idx;
1314 S08 i1_ref_tr = -1;
1315 S08 i1_ref_l = -1;
1316 S32 inp_shift = 2 - inp_mv_pel;
1317 S32 pred_shift; /* = 2 - ps_pred_ctxt->mv_pel; */
1318 S32 ref_bits, cost;
1319 S32 mv_p_x, mv_p_y;
1320 S16 mvdx1, mvdx2, mvdy1, mvdy2;
1321
1322 i1_ref_idx = ps_node->i1_ref_idx;
1323
1324 ps_pred_nodes = &ps_pred_ctxt->as_pred_nodes[e_part_id];
1325 ref_bits = ps_pred_ctxt->ppu1_ref_bits_tlu[ps_pred_ctxt->pred_lx][i1_ref_idx];
1326
1327 /*************************************************************************/
1328 /* Priority to bottom left availability. Else we go to left. If both are */
1329 /* not available, then a remains null */
1330 /*************************************************************************/
1331 if(ps_pred_nodes->ps_l->u1_is_avail)
1332 {
1333 i1_ref_l = ps_pred_nodes->ps_l->i1_ref_idx;
1334 ps_pred_node_a = ps_pred_nodes->ps_l;
1335 }
1336
1337 /*************************************************************************/
1338 /* For encoder, top left may not be really needed unless we use slices, */
1339 /* and even then in ME it may not be relevant. So we only consider T or */
1340 /* TR, as, if both T and TR are not available, TL also will not be */
1341 /*************************************************************************/
1342
1343 if((!(ps_pred_ctxt->proj_used) && (ps_pred_nodes->ps_tr->u1_is_avail)))
1344 {
1345 i1_ref_tr = ps_pred_nodes->ps_tr->i1_ref_idx;
1346 ps_pred_node_b = ps_pred_nodes->ps_tr;
1347 }
1348 else
1349 {
1350 ps_pred_node_b = ps_pred_nodes->ps_coloc;
1351 }
1352
1353 if(ps_pred_node_a == NULL)
1354 {
1355 ps_pred_node_a = ps_pred_nodes->ps_coloc;
1356
1357 if(ps_pred_node_b == ps_pred_nodes->ps_coloc)
1358 ps_pred_node_b = ps_pred_nodes->ps_zeromv;
1359 }
1360
1361 if(ps_pred_node_a->i1_ref_idx != i1_ref_idx)
1362 {
1363 SCALE_FOR_POC_DELTA(mv_p_x, mv_p_y, ps_pred_node_a, i1_ref_idx, ps_pred_ctxt->pi2_ref_scf);
1364 }
1365 else
1366 {
1367 mv_p_x = ps_pred_node_a->s_mv.i2_mvx;
1368 mv_p_y = ps_pred_node_a->s_mv.i2_mvy;
1369 }
1370
1371 pred_shift = ps_pred_node_a->u1_subpel_done ? 0 : 2;
1372 COMPUTE_DIFF_MV(mvdx1, mvdy1, ps_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
1373 mvdx1 = ABS(mvdx1);
1374 mvdy1 = ABS(mvdy1);
1375
1376 if(ps_pred_node_b->i1_ref_idx != i1_ref_idx)
1377 {
1378 SCALE_FOR_POC_DELTA(mv_p_x, mv_p_y, ps_pred_node_b, i1_ref_idx, ps_pred_ctxt->pi2_ref_scf);
1379 }
1380 else
1381 {
1382 mv_p_x = ps_pred_node_b->s_mv.i2_mvx;
1383 mv_p_y = ps_pred_node_b->s_mv.i2_mvy;
1384 }
1385
1386 pred_shift = ps_pred_node_b->u1_subpel_done ? 0 : 2;
1387 COMPUTE_DIFF_MV(mvdx2, mvdy2, ps_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
1388 mvdx2 = ABS(mvdx2);
1389 mvdy2 = ABS(mvdy2);
1390
1391 if((mvdx1 + mvdy1) < (mvdx2 + mvdy2))
1392 {
1393 cost =
1394 hme_get_range(mvdx1) + hme_get_range(mvdy1) + (mvdx1 > 0) + (mvdy1 > 0) + ref_bits + 2;
1395 }
1396 else
1397 {
1398 cost =
1399 hme_get_range(mvdx2) + hme_get_range(mvdy2) + (mvdx2 > 0) + (mvdy2 > 0) + ref_bits + 2;
1400 }
1401 {
        /* No partition bits here: cost = ((mv_cost * lambda) + rnd) >> lambda_q_shift */
1403 S32 rnd = 1 << (ps_pred_ctxt->lambda_q_shift - 1);
1404 S32 tot_cost = (cost * ps_pred_ctxt->lambda);
1405
1406 return ((tot_cost + rnd) >> (ps_pred_ctxt->lambda_q_shift));
1407 }
1408 }
1409
S32 compute_mv_cost_implicit_high_speed_modified(
1411 search_node_t *ps_node, pred_ctxt_t *ps_pred_ctxt, PART_ID_T e_part_id, S32 inp_mv_pel)
1412 {
1413 search_node_t *ps_pred_node_a = NULL;
1414 pred_candt_nodes_t *ps_pred_nodes;
1415 S32 inp_shift = 2 - inp_mv_pel;
1416 S32 pred_shift; /* = 2 - ps_pred_ctxt->mv_pel; */
1417 S32 mv_p_x, mv_p_y;
1418 S16 mvdx1, mvdy1;
1419 S32 cost, ref_bits;
1420
1421 ps_pred_nodes = &ps_pred_ctxt->as_pred_nodes[e_part_id];
1422 ref_bits = ps_pred_ctxt->ppu1_ref_bits_tlu[ps_pred_ctxt->pred_lx][ps_node->i1_ref_idx];
1423
1424 ps_pred_node_a = ps_pred_nodes->ps_mvp_node;
1425
1426 mv_p_x = ps_pred_node_a->s_mv.i2_mvx;
1427 mv_p_y = ps_pred_node_a->s_mv.i2_mvy;
1428 pred_shift = ps_pred_node_a->u1_subpel_done ? 0 : 2;
1429 COMPUTE_DIFF_MV(mvdx1, mvdy1, ps_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
1430 mvdx1 = ABS(mvdx1);
1431 mvdy1 = ABS(mvdy1);
1432
1433 cost = hme_get_range(mvdx1) + hme_get_range(mvdy1) + (mvdx1 > 0) + (mvdy1 > 0) + ref_bits + 2;
1434
1435 {
1436 S32 rnd = 1 << (ps_pred_ctxt->lambda_q_shift - 1);
1437 return ((cost * ps_pred_ctxt->lambda + rnd) >> ps_pred_ctxt->lambda_q_shift);
1438 }
1439 }
1440
void hme_update_results_grid_pu_bestn_xtreme_speed(result_upd_prms_t *ps_result_prms)
1442 {
    /* The function is modified under the assumption that only 2NxN_B and Nx2N_R are updated */
1444
1445 search_node_t s_search_node_grid;
1446 const search_node_t *ps_search_node_base;
1447 search_node_t *ps_search_node_grid, *ps_best_node;
1448 S32 i4_min_cost = (MAX_32BIT_VAL), i4_search_idx;
1449 S32 num_results, i4_unique_id = -1, i4_grid_pt;
1450 search_results_t *ps_search_results;
1451 S32 *pi4_valid_part_ids;
1452 S32 i4_step = ps_result_prms->i4_step;
1453 S32 i4_grid_mask, i, i4_min_id;
1454 S32 i4_tot_cost, i4_mv_cost, i4_sad, id;
1455 S32 *pi4_sad_grid = ps_result_prms->pi4_sad_grid;
1456 S32 grid_count = 0;
1457 S32 pred_lx;
1458
1459 i4_min_id = (S32)PT_C;
1460 i4_min_cost = MAX_32BIT_VAL;
1461 ps_search_node_grid = &s_search_node_grid;
1462 ps_search_node_base = ps_result_prms->ps_search_node_base;
1463 *ps_search_node_grid = *ps_search_node_base;
1464 pi4_valid_part_ids = ps_result_prms->pi4_valid_part_ids;
1465 ps_search_results = ps_result_prms->ps_search_results;
1466 num_results = (S32)ps_search_results->u1_num_results_per_part;
1467 i4_grid_mask = ps_result_prms->i4_grid_mask;
1468
1469 for(i = 0; i < 9; i++)
1470 {
1471 if(i4_grid_mask & (1 << i))
1472 grid_count++;
1473 }
1474
1475 /* Some basic assumptions: only single pt, only part updates */
1476 /* and more than 1 best result to be computed. */
1477 //ASSERT(ps_result_prms->i4_grid_mask != 1);
1478 //ASSERT(ps_result_prms->i4_part_mask != ENABLE_2Nx2N);
1479 //ASSERT(ps_search_results->num_results > 1);
1480
1481 i4_search_idx = (S32)ps_result_prms->i1_ref_idx;
1482 pred_lx = 1 - ps_search_results->pu1_is_past[i4_search_idx];
1483
1484 /*************************************************************************/
    /* Supposing we do the result update for a unique partid, we can        */
    /* store the best pt id in the grid; the min cost is also a return      */
    /* param. This will be useful for early exit cases.                     */
1488 /* TODO : once we have separate fxn for unique part+grid, we can */
1489 /* do away with this code here */
1490 /*************************************************************************/
1491 //if (pi4_valid_part_ids[1] == -1)
1492 i4_unique_id = pi4_valid_part_ids[0];
1493
1494 /* pi4_valid_part_ids contains all the valid ids. We loop through */
1495 /* this till we encounter -1. This is easier than having to */
1496 /* figure out part by part, besides, active part decision is */
1497 /* usually fixed for a given duration of search, e.g. entire fpel */
1498 /* refinement for a blk/cu will use fixed valid part mask */
1499 id = pi4_valid_part_ids[0];
1500
1501 /*****************************************************************/
1502 /* points to the best search results corresponding to this */
1503 /* specific part type. */
1504 /*****************************************************************/
1505 ps_best_node = ps_search_results->aps_part_results[i4_search_idx][id];
1506
1507 /*************************************************************************/
1508 /* Outer loop runs through all active pts in the grid */
1509 /*************************************************************************/
1510 for(i4_grid_pt = 0; i4_grid_pt < (S32)NUM_GRID_PTS; i4_grid_pt++)
1511 {
1512 if(!(i4_grid_mask & (1 << i4_grid_pt)))
1513 continue;
1514
1515 /* For the pt in the grid, update mvx and y depending on */
1516 /* location of pt. Updates are in FPEL units. */
1517 ps_search_node_grid->s_mv.i2_mvx = ps_search_node_base->s_mv.i2_mvx;
1518 ps_search_node_grid->s_mv.i2_mvy = ps_search_node_base->s_mv.i2_mvy;
1519 ps_search_node_grid->s_mv.i2_mvx += (S16)(i4_step * gai1_grid_id_to_x[i4_grid_pt]);
1520 ps_search_node_grid->s_mv.i2_mvy += (S16)(i4_step * gai1_grid_id_to_y[i4_grid_pt]);
1521
1522 {
1523 /* evaluate mv cost and totalcost for this part for this given mv*/
1524 i4_mv_cost = compute_mv_cost_coarse_high_speed(
1525 ps_search_node_grid,
1526 &ps_search_results->as_pred_ctxt[pred_lx],
1527 (PART_ID_T)id,
1528 MV_RES_FPEL);
1529
1530 i4_sad = pi4_sad_grid[grid_count * id];
1531 i4_tot_cost = i4_sad + i4_mv_cost;
1532
1533 ASSERT(i4_unique_id == id);
1534 ASSERT(num_results == 1);
1535
1536 /*****************************************************************/
1537             /* We do not labor through the results if the total cost is     */
1538             /* worse than the last of the results.                          */
1539 /*****************************************************************/
1540 if(i4_tot_cost < ps_best_node[num_results - 1].i4_tot_cost)
1541 {
1542 i4_min_id = i4_grid_pt;
1543 ps_result_prms->i4_min_cost = i4_tot_cost;
1544
1545 ps_best_node[0] = *ps_search_node_grid;
1546 ps_best_node[0].i4_sad = i4_sad;
1547 ps_best_node[0].i4_mv_cost = i4_mv_cost;
1548 ps_best_node[0].i4_tot_cost = i4_tot_cost;
1549 }
1550 }
1551 pi4_sad_grid++;
1552 }
1553 ps_result_prms->i4_min_id = i4_min_id;
1554 }
1555
1556 void hme_update_results_grid_pu_bestn(result_upd_prms_t *ps_result_prms)
1557 {
1558 search_node_t s_search_node_grid;
1559 const search_node_t *ps_search_node_base;
1560 search_node_t *ps_search_node_grid, *ps_best_node;
1561 S32 i4_min_cost = (MAX_32BIT_VAL), i4_search_idx;
1562 S32 num_results, i4_unique_id = -1, i4_grid_pt;
1563 search_results_t *ps_search_results;
1564 S32 *pi4_valid_part_ids;
1565 S32 i4_step = ps_result_prms->i4_step;
1566 S32 i4_grid_mask, i4_count, i, i4_min_id;
1567 S32 i4_tot_cost, i4_mv_cost, i4_sad, id;
1568 S32 *pi4_sad_grid = ps_result_prms->pi4_sad_grid;
1569 S32 grid_count = 0;
1570 S32 pred_lx;
1571
1572 i4_min_id = (S32)PT_C;
1573 i4_min_cost = MAX_32BIT_VAL;
1574 ps_search_node_grid = &s_search_node_grid;
1575 ps_search_node_base = ps_result_prms->ps_search_node_base;
1576 *ps_search_node_grid = *ps_search_node_base;
1577 pi4_valid_part_ids = ps_result_prms->pi4_valid_part_ids;
1578 ps_search_results = ps_result_prms->ps_search_results;
1579 num_results = (S32)ps_search_results->u1_num_results_per_part;
1580 i4_grid_mask = ps_result_prms->i4_grid_mask;
1581
1582 for(i = 0; i < 9; i++)
1583 {
1584 if(i4_grid_mask & (1 << i))
1585 {
1586 grid_count++;
1587 }
1588 }
1589
1590 i4_search_idx = (S32)ps_result_prms->i1_ref_idx;
1591 pred_lx = 1 - ps_search_results->pu1_is_past[i4_search_idx];
1592
1593 i4_unique_id = pi4_valid_part_ids[0];
1594
1595 /*************************************************************************/
1596 /* Outer loop runs through all active pts in the grid */
1597 /*************************************************************************/
1598 for(i4_grid_pt = 0; i4_grid_pt < (S32)NUM_GRID_PTS; i4_grid_pt++)
1599 {
1600 if(!(i4_grid_mask & (1 << i4_grid_pt)))
1601 {
1602 continue;
1603 }
1604
1605 /* For the pt in the grid, update mvx and y depending on */
1606 /* location of pt. Updates are in FPEL units. */
1607 ps_search_node_grid->s_mv.i2_mvx = ps_search_node_base->s_mv.i2_mvx;
1608 ps_search_node_grid->s_mv.i2_mvy = ps_search_node_base->s_mv.i2_mvy;
1609 ps_search_node_grid->s_mv.i2_mvx += (S16)(i4_step * gai1_grid_id_to_x[i4_grid_pt]);
1610 ps_search_node_grid->s_mv.i2_mvy += (S16)(i4_step * gai1_grid_id_to_y[i4_grid_pt]);
1611
1612 i4_count = 0;
1613
1614 while((id = pi4_valid_part_ids[i4_count]) >= 0)
1615 {
1616 /*****************************************************************/
1617 /* points to the best search results corresponding to this */
1618 /* specific part type. */
1619 /*****************************************************************/
1620 ps_best_node = ps_search_results->aps_part_results[i4_search_idx][id];
1621
1622 /* evaluate mv cost and totalcost for this part for this given mv*/
1623 i4_mv_cost = ps_result_prms->pf_mv_cost_compute(
1624 ps_search_node_grid,
1625 &ps_search_results->as_pred_ctxt[pred_lx],
1626 (PART_ID_T)id,
1627 MV_RES_FPEL);
1628
1629 i4_sad = pi4_sad_grid[grid_count * id];
1630 i4_tot_cost = i4_sad + i4_mv_cost;
1631
1632 if(i4_unique_id == id)
1633 {
1634 if(i4_tot_cost < ps_result_prms->i4_min_cost)
1635 {
1636 i4_min_id = i4_grid_pt;
1637 ps_result_prms->i4_min_cost = i4_tot_cost;
1638 }
1639 }
1640
1641 if(i4_tot_cost < ps_best_node[num_results - 1].i4_tot_cost)
1642 {
1643 for(i = 0; i < num_results - 1; i++)
1644 {
1645 if(i4_tot_cost < ps_best_node[i].i4_tot_cost)
1646 {
1647 memmove(
1648 ps_best_node + i + 1,
1649 ps_best_node + i,
1650 sizeof(search_node_t) * (num_results - 1 - i));
1651 break;
1652 }
1653 else if(i4_tot_cost == ps_best_node[i].i4_tot_cost)
1654 {
1655 if(0 == hme_cmp_nodes(ps_search_node_grid, ps_best_node + i))
1656 break;
1657 }
1658 }
1659 ps_best_node[i] = *ps_search_node_grid;
1660 ps_best_node[i].i4_sad = i4_sad;
1661 ps_best_node[i].i4_mv_cost = i4_mv_cost;
1662 ps_best_node[i].i4_tot_cost = i4_tot_cost;
1663 }
1664 i4_count++;
1665 }
1666 pi4_sad_grid++;
1667 }
1668 ps_result_prms->i4_min_id = i4_min_id;
1669 }
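/*
 * Editorial note (illustrative, not part of the original source): the function
 * above keeps each partition's result list sorted by total cost. A new candidate
 * is placed by scanning for the first entry whose cost exceeds it, shifting the
 * tail down with memmove and overwriting the freed slot. A minimal standalone
 * sketch of that insertion on a bare cost array (eg_ names are hypothetical):
 */
#if 0
static void eg_sorted_insert(S32 *pai4_cost, S32 i4_num_results, S32 i4_new_cost)
{
    S32 i;

    /* candidate is no better than the current worst: nothing to do */
    if(i4_new_cost >= pai4_cost[i4_num_results - 1])
        return;

    /* find the first entry with a higher cost and make room for the candidate */
    for(i = 0; i < i4_num_results - 1; i++)
    {
        if(i4_new_cost < pai4_cost[i])
        {
            memmove(pai4_cost + i + 1, pai4_cost + i, sizeof(S32) * (i4_num_results - 1 - i));
            break;
        }
    }
    pai4_cost[i] = i4_new_cost;
}
#endif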
1670
1671 /**
1672 ********************************************************************************
1673 * @fn hme_update_results_grid_pu_bestn_no_encode(result_upd_prms_t *ps_result_prms)
1674 *
1675 * @brief Updates results for the case where 1 best result is to be updated
1676 * for a given pt, for several parts
1677  *         Note : The function is replicated so that the cost is clipped to
1678  *         16 bits, in order to bit-match with the SIMD version
1679 *
1680 * @param[in] result_upd_prms_t : Contains the input parameters to this fxn
1681 *
1682 * @return The result_upd_prms_t structure is updated for all the active
1683 * parts in case the current candt has results for any given part
1684 * that is the best result for that part
1685 ********************************************************************************
1686 */
1687 void hme_update_results_grid_pu_bestn_no_encode(result_upd_prms_t *ps_result_prms)
1688 {
1689 search_node_t s_search_node_grid;
1690 const search_node_t *ps_search_node_base;
1691 search_node_t *ps_search_node_grid, *ps_best_node;
1692 S32 i4_min_cost = (MAX_32BIT_VAL), i4_search_idx;
1693 S32 num_results, i4_unique_id = -1, i4_grid_pt;
1694 search_results_t *ps_search_results;
1695 S32 *pi4_valid_part_ids;
1696 S32 i4_step = ps_result_prms->i4_step;
1697 S32 i4_grid_mask, i4_count, i, i4_min_id;
1698 S32 i4_tot_cost, i4_mv_cost, i4_sad, id;
1699 S32 *pi4_sad_grid = ps_result_prms->pi4_sad_grid;
1700 S32 grid_count = 0;
1701 S32 pred_lx;
1702
1703 i4_min_id = (S32)PT_C;
1704 i4_min_cost = MAX_32BIT_VAL;
1705 ps_search_node_grid = &s_search_node_grid;
1706 ps_search_node_base = ps_result_prms->ps_search_node_base;
1707 *ps_search_node_grid = *ps_search_node_base;
1708 pi4_valid_part_ids = ps_result_prms->pi4_valid_part_ids;
1709 ps_search_results = ps_result_prms->ps_search_results;
1710 num_results = (S32)ps_search_results->u1_num_results_per_part;
1711 i4_grid_mask = ps_result_prms->i4_grid_mask;
1712
1713 for(i = 0; i < 9; i++)
1714 {
1715 if(i4_grid_mask & (1 << i))
1716 grid_count++;
1717 }
1718
1719 /* Some basic assumptions: only single pt, only part updates */
1720 /* and more than 1 best result to be computed. */
1721
1722 i4_search_idx = (S32)ps_result_prms->i1_ref_idx;
1723 pred_lx = 1 - ps_search_results->pu1_is_past[i4_search_idx];
1724
1725 /*************************************************************************/
1726     /* Assuming we do the result update for a unique part id, we can        */
1727     /* store the best pt id in the grid; the min cost is also a return      */
1728     /* param. This will be useful for early exit cases.                     */
1729 /* TODO : once we have separate fxn for unique part+grid, we can */
1730 /* do away with this code here */
1731 /*************************************************************************/
1732 //if (pi4_valid_part_ids[1] == -1)
1733 i4_unique_id = pi4_valid_part_ids[0];
1734
1735 /*************************************************************************/
1736 /* Outer loop runs through all active pts in the grid */
1737 /*************************************************************************/
1738 for(i4_grid_pt = 0; i4_grid_pt < (S32)NUM_GRID_PTS; i4_grid_pt++)
1739 {
1740 if(!(i4_grid_mask & (1 << i4_grid_pt)))
1741 continue;
1742
1743 /* For the pt in the grid, update mvx and y depending on */
1744 /* location of pt. Updates are in FPEL units. */
1745 ps_search_node_grid->s_mv.i2_mvx = ps_search_node_base->s_mv.i2_mvx;
1746 ps_search_node_grid->s_mv.i2_mvy = ps_search_node_base->s_mv.i2_mvy;
1747 ps_search_node_grid->s_mv.i2_mvx += (S16)(i4_step * gai1_grid_id_to_x[i4_grid_pt]);
1748 ps_search_node_grid->s_mv.i2_mvy += (S16)(i4_step * gai1_grid_id_to_y[i4_grid_pt]);
1749
1750 i4_count = 0;
1751
1752 /* pi4_valid_part_ids contains all the valid ids. We loop through */
1753 /* this till we encounter -1. This is easier than having to */
1754 /* figure out part by part, besides, active part decision is */
1755 /* usually fixed for a given duration of search, e.g. entire fpel */
1756 /* refinement for a blk/cu will use fixed valid part mask */
1757
1758 while((id = pi4_valid_part_ids[i4_count]) >= 0)
1759 {
1760 //ps_search_node_grid->e_part_type = (PART_TYPE_T)id;
1761
1762 /*****************************************************************/
1763 /* points to the best search results corresponding to this */
1764 /* specific part type. */
1765 /*****************************************************************/
1766 ps_best_node = ps_search_results->aps_part_results[i4_search_idx][id];
1767
1768 /* evaluate mv cost and totalcost for this part for this given mv*/
1769 i4_mv_cost = ps_result_prms->pf_mv_cost_compute(
1770 ps_search_node_grid,
1771 &ps_search_results->as_pred_ctxt[pred_lx],
1772 (PART_ID_T)id,
1773 MV_RES_FPEL);
1774
1775 i4_sad = pi4_sad_grid[grid_count * id];
1776
1777 /* Clipping to 16 bit to bit match with SIMD version */
1778 i4_mv_cost = CLIP_S16(i4_mv_cost);
1779 i4_sad = CLIP_S16(i4_sad);
1780
1781 i4_tot_cost = i4_sad + i4_mv_cost;
1782 /* Clipping to 16 bit to bit match with SIMD version */
1783 i4_tot_cost = CLIP_S16(i4_tot_cost);
1784
1785 if(i4_unique_id == id)
1786 {
1787 if(i4_tot_cost < ps_result_prms->i4_min_cost)
1788 {
1789 i4_min_id = i4_grid_pt;
1790 ps_result_prms->i4_min_cost = i4_tot_cost;
1791 }
1792 }
1793
1794 /*****************************************************************/
1795             /* We do not labor through the results if the total cost is     */
1796             /* worse than the last of the results.                          */
1797 /*****************************************************************/
1798 if(i4_tot_cost < ps_best_node[num_results - 1].i4_tot_cost)
1799 {
1800 S32 eq_cost = 0;
1801 /*************************************************************/
1802                 /* Identify where the current result is to be placed:       */
1803                 /* find the node whose cost is just higher than the one under test */
1804 /*************************************************************/
1805 for(i = 0; i < num_results - 1; i++)
1806 {
1807 if(i4_tot_cost < ps_best_node[i].i4_tot_cost)
1808 {
1809 memmove(
1810 ps_best_node + i + 1,
1811 ps_best_node + i,
1812 sizeof(search_node_t) * (num_results - 1 - i));
1813 break;
1814 }
1815 else if(i4_tot_cost == ps_best_node[i].i4_tot_cost)
1816 {
1817 //if (0 == hme_cmp_nodes(ps_search_node_grid, ps_best_node+i))
1818 // break;
1819                     /* When costs are equal we compare the nodes and skip if  */
1820                     /* they match. We do not want to add this check to the     */
1821                     /* intrinsic (SIMD) version, so it is commented out here;  */
1822                     /* the quality impact was minor in regression runs.        */
1823 eq_cost = 1;
1824 }
1825 }
1826 if(!eq_cost)
1827 {
1828 ps_best_node[i] = *ps_search_node_grid;
1829 ps_best_node[i].i4_sad = i4_sad;
1830 ps_best_node[i].i4_mv_cost = i4_mv_cost;
1831 ps_best_node[i].i4_tot_cost = i4_tot_cost;
1832 }
1833 }
1834 i4_count++;
1835 }
1836 pi4_sad_grid++;
1837 }
1838 ps_result_prms->i4_min_id = i4_min_id;
1839 }
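/*
 * Editorial note (descriptive, not part of the original source): the _no_encode
 * variant above clips the MV cost, the SAD and their sum to the signed 16-bit
 * range before any comparison, presumably because the SIMD implementation keeps
 * these quantities in 16-bit lanes; clipping every intermediate term in the C
 * path is what keeps the two paths bit-identical when values saturate.
 */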
1840
1841 /**
1842 ********************************************************************************
1843  * @fn hme_update_results_pt_pu_best1_subpel_hs(err_prms_t *ps_err_prms, result_upd_prms_t *ps_result_prms)
1844 *
1845 * @brief Updates results for the case where 1 best result is to be updated
1846 * for a given pt, for several parts
1847 *
1848 * @param[in] ps_result_prms. Contains the input parameters to this fxn
1849 * ::ps_pred_info : contains cost fxn ptr and predictor info
1850  *        ::pi4_sad : 17x9 SAD grid; in this case, only the first 17 entries are valid
1851 * ::ps_search_results: Search results structure
1852 * ::i1_ref_id : Reference index
1853  *        ::i4_grid_mask: Don't care for this fxn
1854 * ::pi4_valid_part_ids : valid part ids
1855 * ::ps_search_node_base: Contains the centre pt candt info.
1856 *
1857 * @return The ps_search_results structure is updated for all the active
1858 * parts in case the current candt has results for any given part
1859 * that is the best result for that part
1860 ********************************************************************************
1861 */
1862
1863 void hme_update_results_pt_pu_best1_subpel_hs(
1864 err_prms_t *ps_err_prms, result_upd_prms_t *ps_result_prms)
1865 {
1866 search_node_t *ps_search_node_base, *ps_best_node;
1867 search_results_t *ps_search_results;
1868 S32 id, i4_search_idx = ps_result_prms->u1_pred_lx;
1869 S32 i4_count = 0, i4_sad, i4_mv_cost, i4_tot_cost;
1870 S32 num_results, i;
1871 S32 *pi4_valid_part_ids;
1872
1873 pi4_valid_part_ids = ps_result_prms->pi4_valid_part_ids;
1874 /* Some basic assumptions: only single pt, only part updates */
1875 /* and more than 1 best result to be computed. */
1876 ASSERT(ps_result_prms->i4_grid_mask == 1);
1877
1878 ps_search_results = ps_result_prms->ps_search_results;
1879 num_results = (S32)ps_search_results->u1_num_results_per_part;
1880
1881 /* Compute mv cost, total cost */
1882 ps_search_node_base = (search_node_t *)ps_result_prms->ps_search_node_base;
1883
1884 while((id = pi4_valid_part_ids[i4_count]) >= 0)
1885 {
1886 S32 update_required = 1;
1887
1888 ps_best_node = ps_search_results->aps_part_results[i4_search_idx][id];
1889 /* Use a pre-computed cost instead of freshly evaluating subpel cost */
1890 i4_mv_cost = ps_best_node->i4_mv_cost;
1891 i4_sad = ps_result_prms->pi4_sad_grid[id];
1892 i4_tot_cost = i4_sad + i4_mv_cost;
1893
1894 /* We do not labor through the results if the total cost is worse than */
1895 /* the last of the results. */
1896 if(i4_tot_cost < ps_best_node[num_results - 1].i4_tot_cost)
1897 {
1898 /* Identify where the current result is to be placed. Basically find */
1899 /* the node which has cost just higher than node under test */
1900 for(i = 0; i < num_results - 1; i++)
1901 {
1902 if(ps_best_node[i].i1_ref_idx != -1)
1903 {
1904 if(i4_tot_cost < ps_best_node[i].i4_tot_cost)
1905 {
1906 memmove(
1907 ps_best_node + i + 1,
1908 ps_best_node + i,
1909 sizeof(search_node_t) * (num_results - 1 - i));
1910 break;
1911 }
1912 else if(i4_tot_cost == ps_best_node[i].i4_tot_cost)
1913 {
1914 update_required = 0;
1915 break;
1916 }
1917 }
1918 else
1919 {
1920 break;
1921 }
1922 }
1923
1924 if(update_required)
1925 {
1926 /* Update when either ref_idx or mv's are different */
1927 ps_best_node[i] = *ps_search_node_base;
1928 ps_best_node[i].i4_sad = i4_sad;
1929 ps_best_node[i].i4_mv_cost = i4_mv_cost;
1930 ps_best_node[i].i4_tot_cost = i4_tot_cost;
1931 }
1932 }
1933 i4_count++;
1934 }
1935 }
1936
1937 void hme_update_results_pt_pu_best1_subpel_hs_1(
1938 err_prms_t *ps_err_prms, result_upd_prms_t *ps_result_prms)
1939 {
1940 search_node_t *ps_search_node_base, *ps_best_node;
1941 search_results_t *ps_search_results;
1942 S32 id, i4_search_idx = ps_result_prms->u1_pred_lx;
1943 S32 i4_count = 0, i4_sad, i4_mv_cost, i4_tot_cost;
1944 S32 num_results;
1945 S32 *pi4_valid_part_ids;
1946
1947 pi4_valid_part_ids = ps_result_prms->pi4_valid_part_ids;
1948 /* Some basic assumptions: only single pt, only part updates */
1949 /* and more than 1 best result to be computed. */
1950 ASSERT(ps_result_prms->i4_grid_mask == 1);
1951
1952 ps_search_results = ps_result_prms->ps_search_results;
1953 num_results = (S32)ps_search_results->u1_num_results_per_part;
1954
1955 /* Compute mv cost, total cost */
1956 ps_search_node_base = (search_node_t *)ps_result_prms->ps_search_node_base;
1957
1958 while((id = pi4_valid_part_ids[i4_count]) >= 0)
1959 {
1960 S32 update_required = 0;
1961
1962 ps_best_node = ps_search_results->aps_part_results[i4_search_idx][id];
1963 /* Use a pre-computed cost instead of freshly evaluating subpel cost */
1964 i4_mv_cost = ps_best_node->i4_mv_cost;
1965 i4_sad = ps_result_prms->pi4_sad_grid[id];
1966 i4_tot_cost = i4_sad + i4_mv_cost;
1967
1968 /* We do not labor through the results if the total cost is worse than */
1969 /* the last of the results. */
1970 if(i4_tot_cost < ps_best_node[1].i4_tot_cost)
1971 {
1972 S32 sdi_value = 0;
1973
1974 update_required = 2;
1975 /* Identify where the current result is to be placed. Basically find */
1976 /* the node which has cost just higher than node under test */
1977 {
1978 if(i4_tot_cost < ps_best_node[0].i4_tot_cost)
1979 {
1980 update_required = 1;
1981 sdi_value = ps_best_node[0].i4_sad - i4_sad;
1982 }
1983 else if(
1984 (ps_result_prms->i2_mv_x == ps_best_node[0].s_mv.i2_mvx) &&
1985 (ps_result_prms->i2_mv_y == ps_best_node[0].s_mv.i2_mvy) &&
1986 (ps_best_node[0].i1_ref_idx == ps_result_prms->i1_ref_idx))
1987 {
1988 update_required = 0;
1989 }
1990 }
1991 if(update_required == 2)
1992 {
1993 subpel_refine_ctxt_t *ps_subpel_refine_ctxt = ps_result_prms->ps_subpel_refine_ctxt;
1994
1995 ps_subpel_refine_ctxt->i2_tot_cost[1][i4_count] = i4_tot_cost;
1996 ps_subpel_refine_ctxt->i2_mv_cost[1][i4_count] = i4_mv_cost;
1997 ps_subpel_refine_ctxt->i2_mv_x[1][i4_count] = ps_result_prms->i2_mv_x;
1998 ps_subpel_refine_ctxt->i2_mv_y[1][i4_count] = ps_result_prms->i2_mv_y;
1999 ps_subpel_refine_ctxt->i2_ref_idx[1][i4_count] = ps_result_prms->i1_ref_idx;
2000 }
2001 else if(update_required == 1)
2002 {
2003 subpel_refine_ctxt_t *ps_subpel_refine_ctxt = ps_result_prms->ps_subpel_refine_ctxt;
2004
2005 ps_subpel_refine_ctxt->i2_tot_cost[1][i4_count] =
2006 ps_subpel_refine_ctxt->i2_tot_cost[0][i4_count];
2007 ps_subpel_refine_ctxt->i2_mv_cost[1][i4_count] =
2008 ps_subpel_refine_ctxt->i2_mv_cost[0][i4_count];
2009 ps_subpel_refine_ctxt->i2_mv_x[1][i4_count] =
2010 ps_subpel_refine_ctxt->i2_mv_x[0][i4_count];
2011 ps_subpel_refine_ctxt->i2_mv_y[1][i4_count] =
2012 ps_subpel_refine_ctxt->i2_mv_y[0][i4_count];
2013 ps_subpel_refine_ctxt->i2_ref_idx[1][i4_count] =
2014 ps_subpel_refine_ctxt->i2_ref_idx[0][i4_count];
2015
2016 ps_subpel_refine_ctxt->i2_tot_cost[0][i4_count] = i4_tot_cost;
2017 ps_subpel_refine_ctxt->i2_mv_cost[0][i4_count] = i4_mv_cost;
2018 ps_subpel_refine_ctxt->i2_mv_x[0][i4_count] = ps_result_prms->i2_mv_x;
2019 ps_subpel_refine_ctxt->i2_mv_y[0][i4_count] = ps_result_prms->i2_mv_y;
2020 ps_subpel_refine_ctxt->i2_ref_idx[0][i4_count] = ps_result_prms->i1_ref_idx;
2021 }
2022 }
2023 i4_count++;
2024 }
2025 }
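/*
 * Editorial note (illustrative, not part of the original source): this update
 * path, like the fullpel variants further below, tracks only the best and
 * second-best candidate per partition. update_required encodes the outcome:
 * 2 means the candidate beats only the second-best (overwrite slot 1), 1 means
 * it beats the best (slot 0 is demoted into slot 1 and the candidate takes
 * slot 0), and 0 means no update (duplicate or not good enough). A minimal
 * sketch of the same protocol on bare costs (eg_ names are hypothetical):
 */
#if 0
static void eg_update_best_two(S32 *pi4_best, S32 *pi4_second_best, S32 i4_new_cost)
{
    if(i4_new_cost >= *pi4_second_best)
        return; /* not good enough for either slot */

    if(i4_new_cost < *pi4_best)
    {
        *pi4_second_best = *pi4_best; /* demote the previous best */
        *pi4_best = i4_new_cost;
    }
    else if(i4_new_cost != *pi4_best) /* equal cost is treated as a duplicate */
    {
        *pi4_second_best = i4_new_cost;
    }
}
#endif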
2026
2027 /**
2028 ******************************************************************************
2029  * @brief Gives a result fxn ptr for an index [x] where x is as:
2030  *        0 : single pt, no partial updates, 1 best result
2031  *        1 : single pt, no partial updates, N best results
2032  *        2 : single pt, partial updates, 1 best result
2033  *        3 : single pt, partial updates, N best results
2034  *        4 : grid , no partial updates, 1 best result
2035  *        5 : grid , no partial updates, N best results
2036  *        6 : grid , partial updates, 1 best result
2037  *        7 : grid , partial updates, N best results
2038 ******************************************************************************
2039 */
2040
2041 static PF_RESULT_FXN_T g_pf_result_fxn[8] = { UPD_RES_PT_NPU_BEST1, UPD_RES_PT_NPU_BESTN,
2042 UPD_RES_PT_PU_BEST1, UPD_RES_PT_PU_BESTN,
2043 UPD_RES_GRID_NPU_BEST1, UPD_RES_GRID_NPU_BESTN,
2044 UPD_RES_GRID_PU_BEST1, UPD_RES_GRID_PU_BESTN };
2045
2046 /**
2047 ********************************************************************************
2048 * @fn hme_get_result_fxn(i4_grid_mask, i4_part_mask, i4_num_results)
2049 *
2050 * @brief Obtains the suitable result function that evaluates COST and also
2051 * computes one or more best results for point/grid, single part or
2052 * more than one part.
2053 *
2054 * @param[in] i4_grid_mask : Mask containing which of 9 grid pts active
2055 *
2056 * @param[in] i4_part_mask : Mask containing which of the 17 parts active
2057 *
2058 * @param[in] i4_num_results: Number of active results
2059 *
2060 * @return Pointer to the appropriate result update function
2061 ********************************************************************************
2062 */
2063 PF_RESULT_FXN_T hme_get_result_fxn(S32 i4_grid_mask, S32 i4_part_mask, S32 i4_num_results)
2064 {
2065 S32 i4_is_grid = (i4_grid_mask != 1);
2066 S32 i4_is_pu = ((i4_part_mask & (i4_part_mask - 1)) != 0);
2067 S32 i4_res_gt1 = (i4_num_results > 1);
2068 S32 id;
2069
2070 id = (i4_is_grid << 2) + (i4_is_pu << 1) + i4_res_gt1;
2071
2072 return (g_pf_result_fxn[id]);
2073 }
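/*
 * Editorial note (illustrative, not part of the original source): the index into
 * g_pf_result_fxn packs the three flags as (is_grid << 2) | (is_pu << 1) |
 * res_gt1. For example, a 9-point grid search with several active partitions and
 * a single result per partition resolves to index 6, i.e. UPD_RES_GRID_PU_BEST1:
 *
 *     pf_result_fxn = hme_get_result_fxn(0x1FF, i4_part_mask, 1);
 *
 * (0x1FF is just an example mask with all nine grid points enabled, and
 * i4_part_mask is assumed to have more than one bit set.)
 */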
2074
2075 void hme_calc_sad_and_2_best_results(
2076 hme_search_prms_t *ps_search_prms,
2077 wgt_pred_ctxt_t *ps_wt_inp_prms,
2078 err_prms_t *ps_err_prms,
2079 result_upd_prms_t *ps_result_prms,
2080 U08 **ppu1_ref,
2081 S32 i4_ref_stride)
2082 {
2083 S32 i4_candt;
2084 S32 i4_inp_off;
2085 S32 i4_ref_offset;
2086 S32 i4_num_nodes;
2087
2088 S32 *pi4_sad_grid = ps_err_prms->pi4_sad_grid;
2089 S32 cur_buf_stride = ps_err_prms->i4_inp_stride;
2090 WORD32 ref_buf_stride = ps_err_prms->i4_ref_stride;
2091 WORD32 cur_buf_stride_ls2 = (cur_buf_stride << 2);
2092 WORD32 ref_buf_stride_ls2 = (ref_buf_stride << 2);
2093
2094 mv_refine_ctxt_t *ps_mv_refine_ctxt;
2095 search_node_t *ps_search_node;
2096
2097 ps_mv_refine_ctxt = ps_search_prms->ps_fullpel_refine_ctxt;
2098 i4_num_nodes = ps_search_prms->i4_num_search_nodes;
2099 i4_inp_off = ps_search_prms->i4_cu_x_off;
2100 i4_inp_off += ps_search_prms->i4_cu_y_off * cur_buf_stride;
2101 i4_ref_offset = (i4_ref_stride * ps_search_prms->i4_y_off) + ps_search_prms->i4_x_off;
2102 ps_search_node = ps_search_prms->ps_search_nodes;
2103
2104 for(i4_candt = 0; i4_candt < i4_num_nodes; i4_candt++)
2105 {
2106 /**********************************************************************/
2107 /* CALL THE FUNCTION THAT COMPUTES THE SAD AND UPDATES THE SAD GRID */
2108 /**********************************************************************/
2109 {
2110 WORD32 b, c, d;
2111 UWORD8 *pu1_cur_ptr;
2112 UWORD8 *pu1_ref_ptr;
2113 UWORD16 au2_4x4_sad[NUM_4X4];
2114
2115 if(ps_search_node->s_mv.i2_mvx == INTRA_MV)
2116 {
2117 continue;
2118 }
2119
2120 ps_err_prms->pu1_inp =
2121 ps_wt_inp_prms->apu1_wt_inp[ps_search_node->i1_ref_idx] + i4_inp_off;
2122 ps_err_prms->pu1_ref = ppu1_ref[ps_search_node->i1_ref_idx] + i4_ref_offset;
2123 ps_err_prms->pu1_ref += ps_search_node->s_mv.i2_mvx;
2124 ps_err_prms->pu1_ref += (ps_search_node->s_mv.i2_mvy * i4_ref_stride);
2125
2126 pu1_cur_ptr = ps_err_prms->pu1_inp;
2127 pu1_ref_ptr = &ps_err_prms->pu1_ref[0];
2128
2129 /* Loop to compute the SAD's */
2130 {
2131 memset(&au2_4x4_sad[0], 0, NUM_4X4 * sizeof(UWORD16));
2132 for(b = 0; b < NUM_4X4; b++)
2133 {
2134 WORD32 t1 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * cur_buf_stride_ls2;
2135 WORD32 t2 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * ref_buf_stride_ls2;
2136
2137 for(c = 0; c < NUM_ROWS_IN_4X4; c++)
2138 {
2139 WORD32 z_cur = (cur_buf_stride)*c + t1;
2140 WORD32 z_ref = (ref_buf_stride)*c + t2;
2141 for(d = 0; d < NUM_PIXELS_IN_ROW; d++)
2142 {
2143 au2_4x4_sad[b] += (UWORD16)ABS((
2144 ((S32)pu1_ref_ptr[(z_ref + d)]) - ((S32)pu1_cur_ptr[(z_cur + d)])));
2145 }
2146 }
2147 }
2148
2149 pi4_sad_grid[PART_ID_NxN_TL] =
2150 (au2_4x4_sad[0] + au2_4x4_sad[1] + au2_4x4_sad[4] + au2_4x4_sad[5]);
2151 pi4_sad_grid[PART_ID_NxN_TR] =
2152 (au2_4x4_sad[2] + au2_4x4_sad[3] + au2_4x4_sad[6] + au2_4x4_sad[7]);
2153 pi4_sad_grid[PART_ID_NxN_BL] =
2154 (au2_4x4_sad[8] + au2_4x4_sad[9] + au2_4x4_sad[12] + au2_4x4_sad[13]);
2155 pi4_sad_grid[PART_ID_NxN_BR] =
2156 (au2_4x4_sad[10] + au2_4x4_sad[11] + au2_4x4_sad[14] + au2_4x4_sad[15]);
2157 pi4_sad_grid[PART_ID_Nx2N_L] =
2158 pi4_sad_grid[PART_ID_NxN_TL] + pi4_sad_grid[PART_ID_NxN_BL];
2159 pi4_sad_grid[PART_ID_Nx2N_R] =
2160 pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_BR];
2161 pi4_sad_grid[PART_ID_2NxN_T] =
2162 pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_TL];
2163 pi4_sad_grid[PART_ID_2NxN_B] =
2164 pi4_sad_grid[PART_ID_NxN_BR] + pi4_sad_grid[PART_ID_NxN_BL];
2165 pi4_sad_grid[PART_ID_nLx2N_L] =
2166 (au2_4x4_sad[8] + au2_4x4_sad[0] + au2_4x4_sad[12] + au2_4x4_sad[4]);
2167 pi4_sad_grid[PART_ID_nRx2N_R] =
2168 (au2_4x4_sad[3] + au2_4x4_sad[7] + au2_4x4_sad[15] + au2_4x4_sad[11]);
2169 pi4_sad_grid[PART_ID_2NxnU_T] =
2170 (au2_4x4_sad[1] + au2_4x4_sad[0] + au2_4x4_sad[2] + au2_4x4_sad[3]);
2171 pi4_sad_grid[PART_ID_2NxnD_B] =
2172 (au2_4x4_sad[15] + au2_4x4_sad[14] + au2_4x4_sad[12] + au2_4x4_sad[13]);
2173 pi4_sad_grid[PART_ID_2Nx2N] =
2174 pi4_sad_grid[PART_ID_2NxN_T] + pi4_sad_grid[PART_ID_2NxN_B];
2175 pi4_sad_grid[PART_ID_2NxnU_B] =
2176 pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnU_T];
2177 pi4_sad_grid[PART_ID_2NxnD_T] =
2178 pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnD_B];
2179 pi4_sad_grid[PART_ID_nRx2N_L] =
2180 pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nRx2N_R];
2181 pi4_sad_grid[PART_ID_nLx2N_R] =
2182 pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nLx2N_L];
2183 }
2184 }
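        /* Note (descriptive comment added for clarity): the sixteen 4x4 SADs     */
        /* above cover the 16x16 block in raster order:                           */
        /*                                                                        */
        /*          0  1  2  3                                                    */
        /*          4  5  6  7                                                    */
        /*          8  9 10 11                                                    */
        /*         12 13 14 15                                                    */
        /*                                                                        */
        /* Each partition SAD is the sum of the 4x4 SADs it covers (for example   */
        /* NxN_TL = 0 + 1 + 4 + 5), and the AMP complements are derived from the  */
        /* 2Nx2N total by subtraction instead of being summed directly.           */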
2185
2186 {
2187 S32 i4_count = 0, i4_sad, i4_mv_cost, i4_tot_cost;
2188 S32 *pi4_valid_part_ids = &ps_mv_refine_ctxt->ai4_part_id[0];
2189 S32 best_node_cost;
2190 S32 second_best_node_cost;
2191
2192 {
2193 S16 mvdx1, mvdy1;
2194 S32 i4_search_idx = (S32)ps_result_prms->i1_ref_idx;
2195 search_results_t *ps_search_results = ps_result_prms->ps_search_results;
2196 S32 pred_lx = i4_search_idx;
2197
2198 pred_ctxt_t *ps_pred_ctxt = &ps_search_results->as_pred_ctxt[pred_lx];
2199 pred_candt_nodes_t *ps_pred_nodes = &ps_pred_ctxt->as_pred_nodes[PART_2Nx2N];
2200 search_node_t *ps_pred_node_a = ps_pred_nodes->ps_mvp_node;
2201
2202 S32 inp_shift = 2;
2203 S32 pred_shift = ps_pred_node_a->u1_subpel_done ? 0 : 2;
2204 S32 lambda_q_shift = ps_pred_ctxt->lambda_q_shift;
2205 S32 lambda = ps_pred_ctxt->lambda;
2206 S32 rnd = 1 << (lambda_q_shift - 1);
2207 S32 mv_p_x = ps_pred_node_a->s_mv.i2_mvx;
2208 S32 mv_p_y = ps_pred_node_a->s_mv.i2_mvy;
2209 S32 ref_bits =
2210 ps_pred_ctxt
2211 ->ppu1_ref_bits_tlu[ps_pred_ctxt->pred_lx][ps_search_node->i1_ref_idx];
2212
2213 COMPUTE_DIFF_MV(
2214 mvdx1, mvdy1, ps_search_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
2215
2216 mvdx1 = ABS(mvdx1);
2217 mvdy1 = ABS(mvdy1);
2218
2219 i4_mv_cost = hme_get_range(mvdx1) + hme_get_range(mvdy1) + (mvdx1 > 0) +
2220 (mvdy1 > 0) + ref_bits + 2;
2221
2222 i4_mv_cost *= lambda;
2223 i4_mv_cost += rnd;
2224 i4_mv_cost >>= lambda_q_shift;
2225
2226 i4_mv_cost = CLIP_U16(i4_mv_cost);
2227 }
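            /* Note (descriptive comment added for clarity): in this fast path the */
            /* MV cost is computed once per candidate against the 2Nx2N MVP and    */
            /* reused for every valid partition below, rather than being           */
            /* re-evaluated per partition as in the generic update functions.      */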
2228
2229 /*For each valid partition, update the refine_prm structure to reflect the best and second
2230 best candidates for that partition*/
2231
2232 for(i4_count = 0; i4_count < ps_mv_refine_ctxt->i4_num_valid_parts; i4_count++)
2233 {
2234 S32 update_required = 0;
2235 S32 part_id = pi4_valid_part_ids[i4_count];
2236 S32 index = (ps_mv_refine_ctxt->i4_num_valid_parts > 8) ? part_id : i4_count;
2237
2238 /*Calculate total cost*/
2239 i4_sad = CLIP3(pi4_sad_grid[part_id], 0, 0x7fff);
2240 i4_tot_cost = CLIP_S16(i4_sad + i4_mv_cost);
2241
2242 /*****************************************************************/
2243                 /* We do not labor through the results if the total cost is     */
2244                 /* worse than the last of the results.                           */
2245 /*****************************************************************/
2246 best_node_cost = CLIP_S16(ps_mv_refine_ctxt->i2_tot_cost[0][index]);
2247 second_best_node_cost = CLIP_S16(ps_mv_refine_ctxt->i2_tot_cost[1][index]);
2248
2249 if(i4_tot_cost < second_best_node_cost)
2250 {
2251 update_required = 2;
2252
2253 /*************************************************************/
2254                     /* Identify where the current result is to be placed:        */
2255                     /* find the node whose cost is just higher than the one under test */
2256 /*************************************************************/
2257 if(i4_tot_cost < best_node_cost)
2258 {
2259 update_required = 1;
2260 }
2261 else if(i4_tot_cost == best_node_cost)
2262 {
2263 update_required = 0;
2264 }
2265
2266 if(update_required == 2)
2267 {
2268 ps_mv_refine_ctxt->i2_tot_cost[1][index] = i4_tot_cost;
2269 ps_mv_refine_ctxt->i2_mv_cost[1][index] = i4_mv_cost;
2270 ps_mv_refine_ctxt->i2_mv_x[1][index] = ps_search_node->s_mv.i2_mvx;
2271 ps_mv_refine_ctxt->i2_mv_y[1][index] = ps_search_node->s_mv.i2_mvy;
2272 ps_mv_refine_ctxt->i2_ref_idx[1][index] = ps_search_node->i1_ref_idx;
2273 }
2274 else if(update_required == 1)
2275 {
2276 ps_mv_refine_ctxt->i2_tot_cost[1][index] =
2277 ps_mv_refine_ctxt->i2_tot_cost[0][index];
2278 ps_mv_refine_ctxt->i2_mv_cost[1][index] =
2279 ps_mv_refine_ctxt->i2_mv_cost[0][index];
2280 ps_mv_refine_ctxt->i2_mv_x[1][index] = ps_mv_refine_ctxt->i2_mv_x[0][index];
2281 ps_mv_refine_ctxt->i2_mv_y[1][index] = ps_mv_refine_ctxt->i2_mv_y[0][index];
2282 ps_mv_refine_ctxt->i2_ref_idx[1][index] =
2283 ps_mv_refine_ctxt->i2_ref_idx[0][index];
2284
2285 ps_mv_refine_ctxt->i2_tot_cost[0][index] = i4_tot_cost;
2286 ps_mv_refine_ctxt->i2_mv_cost[0][index] = i4_mv_cost;
2287 ps_mv_refine_ctxt->i2_mv_x[0][index] = ps_search_node->s_mv.i2_mvx;
2288 ps_mv_refine_ctxt->i2_mv_y[0][index] = ps_search_node->s_mv.i2_mvy;
2289 ps_mv_refine_ctxt->i2_ref_idx[0][index] = ps_search_node->i1_ref_idx;
2290 }
2291 }
2292 }
2293 }
2294 ps_search_node++;
2295 }
2296
2297 {
2298 WORD32 i4_i;
2299 WORD32 part_id;
2300 search_node_t *ps_search_node = ps_search_prms->ps_search_nodes;
2301 for(i4_i = 0; i4_i < ps_mv_refine_ctxt->i4_num_valid_parts; i4_i++)
2302 {
2303 part_id = ps_mv_refine_ctxt->ai4_part_id[i4_i];
2304 if(ps_mv_refine_ctxt->i2_tot_cost[0][part_id] >= MAX_SIGNED_16BIT_VAL)
2305 {
2306 ASSERT(ps_mv_refine_ctxt->i2_mv_cost[0][part_id] == MAX_SIGNED_16BIT_VAL);
2307 ASSERT(ps_mv_refine_ctxt->i2_mv_x[0][part_id] == 0);
2308 ASSERT(ps_mv_refine_ctxt->i2_mv_y[0][part_id] == 0);
2309
2310 ps_mv_refine_ctxt->i2_ref_idx[0][part_id] = ps_search_node->i1_ref_idx;
2311 }
2312 if(ps_mv_refine_ctxt->i2_tot_cost[1][part_id] >= MAX_SIGNED_16BIT_VAL)
2313 {
2314 ASSERT(ps_mv_refine_ctxt->i2_mv_cost[1][part_id] == MAX_SIGNED_16BIT_VAL);
2315 ASSERT(ps_mv_refine_ctxt->i2_mv_x[1][part_id] == 0);
2316 ASSERT(ps_mv_refine_ctxt->i2_mv_y[1][part_id] == 0);
2317
2318 ps_mv_refine_ctxt->i2_ref_idx[1][part_id] = ps_search_node->i1_ref_idx;
2319 }
2320 }
2321 }
2322 }
2323
2324 void hme_calc_sad_and_2_best_results_subpel(
2325 err_prms_t *ps_err_prms, result_upd_prms_t *ps_result_prms)
2326 {
2327 S32 i4_candt;
2328 S32 i4_num_nodes;
2329
2330 S32 *pi4_sad_grid = ps_err_prms->pi4_sad_grid;
2331 S32 cur_buf_stride = ps_err_prms->i4_inp_stride;
2332 WORD32 ref_buf_stride = ps_err_prms->i4_ref_stride;
2333 WORD32 cur_buf_stride_ls2 = (cur_buf_stride << 2);
2334 WORD32 ref_buf_stride_ls2 = (ref_buf_stride << 2);
2335
2336 mv_refine_ctxt_t *ps_subpel_refine_ctxt;
2337 ps_subpel_refine_ctxt = ps_result_prms->ps_subpel_refine_ctxt;
2338 i4_num_nodes = 1;
2339
2340 /* Run through each of the candts in a loop */
2341 for(i4_candt = 0; i4_candt < i4_num_nodes; i4_candt++)
2342 {
2343 /**********************************************************************/
2344 /* CALL THE FUNCTION THAT COMPUTES THE SAD AND UPDATES THE SAD GRID */
2345 /**********************************************************************/
2346 {
2347 WORD32 b, c, d;
2348 UWORD8 *pu1_cur_ptr;
2349 UWORD8 *pu1_ref_ptr;
2350 UWORD16 au2_4x4_sad[NUM_4X4];
2351
2352 pu1_cur_ptr = ps_err_prms->pu1_inp;
2353 pu1_ref_ptr = &ps_err_prms->pu1_ref[0];
2354
2355 /* Loop to compute the SAD's */
2356 {
2357 memset(&au2_4x4_sad[0], 0, NUM_4X4 * sizeof(UWORD16));
2358 for(b = 0; b < NUM_4X4; b++)
2359 {
2360 WORD32 t1 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * cur_buf_stride_ls2;
2361 WORD32 t2 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * ref_buf_stride_ls2;
2362
2363 for(c = 0; c < NUM_ROWS_IN_4X4; c++)
2364 {
2365 WORD32 z_cur = (cur_buf_stride)*c + t1;
2366 WORD32 z_ref = (ref_buf_stride)*c + t2;
2367 for(d = 0; d < NUM_PIXELS_IN_ROW; d++)
2368 {
2369 au2_4x4_sad[b] += (UWORD16)ABS((
2370 ((S32)pu1_ref_ptr[(z_ref + d)]) - ((S32)pu1_cur_ptr[(z_cur + d)])));
2371 }
2372 }
2373 }
2374
2375 pi4_sad_grid[PART_ID_NxN_TL] =
2376 (au2_4x4_sad[0] + au2_4x4_sad[1] + au2_4x4_sad[4] + au2_4x4_sad[5]);
2377 pi4_sad_grid[PART_ID_NxN_TR] =
2378 (au2_4x4_sad[2] + au2_4x4_sad[3] + au2_4x4_sad[6] + au2_4x4_sad[7]);
2379 pi4_sad_grid[PART_ID_NxN_BL] =
2380 (au2_4x4_sad[8] + au2_4x4_sad[9] + au2_4x4_sad[12] + au2_4x4_sad[13]);
2381 pi4_sad_grid[PART_ID_NxN_BR] =
2382 (au2_4x4_sad[10] + au2_4x4_sad[11] + au2_4x4_sad[14] + au2_4x4_sad[15]);
2383 pi4_sad_grid[PART_ID_Nx2N_L] =
2384 pi4_sad_grid[PART_ID_NxN_TL] + pi4_sad_grid[PART_ID_NxN_BL];
2385 pi4_sad_grid[PART_ID_Nx2N_R] =
2386 pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_BR];
2387 pi4_sad_grid[PART_ID_2NxN_T] =
2388 pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_TL];
2389 pi4_sad_grid[PART_ID_2NxN_B] =
2390 pi4_sad_grid[PART_ID_NxN_BR] + pi4_sad_grid[PART_ID_NxN_BL];
2391 pi4_sad_grid[PART_ID_nLx2N_L] =
2392 (au2_4x4_sad[8] + au2_4x4_sad[0] + au2_4x4_sad[12] + au2_4x4_sad[4]);
2393 pi4_sad_grid[PART_ID_nRx2N_R] =
2394 (au2_4x4_sad[3] + au2_4x4_sad[7] + au2_4x4_sad[15] + au2_4x4_sad[11]);
2395 pi4_sad_grid[PART_ID_2NxnU_T] =
2396 (au2_4x4_sad[1] + au2_4x4_sad[0] + au2_4x4_sad[2] + au2_4x4_sad[3]);
2397 pi4_sad_grid[PART_ID_2NxnD_B] =
2398 (au2_4x4_sad[15] + au2_4x4_sad[14] + au2_4x4_sad[12] + au2_4x4_sad[13]);
2399 pi4_sad_grid[PART_ID_2Nx2N] =
2400 pi4_sad_grid[PART_ID_2NxN_T] + pi4_sad_grid[PART_ID_2NxN_B];
2401 pi4_sad_grid[PART_ID_2NxnU_B] =
2402 pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnU_T];
2403 pi4_sad_grid[PART_ID_2NxnD_T] =
2404 pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnD_B];
2405 pi4_sad_grid[PART_ID_nRx2N_L] =
2406 pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nRx2N_R];
2407 pi4_sad_grid[PART_ID_nLx2N_R] =
2408 pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nLx2N_L];
2409 }
2410 }
2411 /**********************************************************************/
2412 /* CALL THE FUNCTION THAT COMPUTES UPDATES THE BEST RESULTS */
2413 /**********************************************************************/
2414 {
2415 S32 i4_count = 0, i4_sad, i4_mv_cost, i4_tot_cost;
2416 S32 *pi4_valid_part_ids = &ps_subpel_refine_ctxt->ai4_part_id[0];
2417 S32 best_node_cost;
2418 S32 second_best_node_cost;
2419
2420 /*For each valid partition, update the refine_prm structure to reflect the best and second
2421 best candidates for that partition*/
2422
2423 for(i4_count = 0; i4_count < ps_subpel_refine_ctxt->i4_num_valid_parts; i4_count++)
2424 {
2425 S32 update_required = 0;
2426 S32 part_id = pi4_valid_part_ids[i4_count];
2427 S32 index = (ps_subpel_refine_ctxt->i4_num_valid_parts > 8) ? part_id : i4_count;
2428
2429 /* Use a pre-computed cost instead of freshly evaluating subpel cost */
2430 i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[0][index];
2431
2432 /*Calculate total cost*/
2433 i4_sad = CLIP3(pi4_sad_grid[part_id], 0, 0x7fff);
2434 i4_tot_cost = CLIP_S16(i4_sad + i4_mv_cost);
2435
2436 /*****************************************************************/
2437                 /* We do not labor through the results if the total cost is     */
2438                 /* worse than the last of the results.                           */
2439 /*****************************************************************/
2440 best_node_cost = CLIP_S16(ps_subpel_refine_ctxt->i2_tot_cost[0][index]);
2441 second_best_node_cost = CLIP_S16(ps_subpel_refine_ctxt->i2_tot_cost[1][index]);
2442
2443 if(i4_tot_cost < second_best_node_cost)
2444 {
2445 update_required = 2;
2446
2447 /*************************************************************/
2448                         /* Identify where the current result is to be placed:        */
2449                         /* find the node whose cost is just higher than the one under test */
2450 /*************************************************************/
2451 if(i4_tot_cost < best_node_cost)
2452 {
2453 update_required = 1;
2454 }
2455 else if(i4_tot_cost == ps_subpel_refine_ctxt->i2_tot_cost[0][index])
2456 {
2457 update_required = 0;
2458 }
2459 if(update_required == 2)
2460 {
2461 ps_subpel_refine_ctxt->i2_tot_cost[1][index] = i4_tot_cost;
2462 ps_subpel_refine_ctxt->i2_mv_cost[1][index] = i4_mv_cost;
2463 ps_subpel_refine_ctxt->i2_mv_x[1][index] = ps_result_prms->i2_mv_x;
2464 ps_subpel_refine_ctxt->i2_mv_y[1][index] = ps_result_prms->i2_mv_y;
2465 ps_subpel_refine_ctxt->i2_ref_idx[1][index] = ps_result_prms->i1_ref_idx;
2466 }
2467 else if(update_required == 1)
2468 {
2469 ps_subpel_refine_ctxt->i2_tot_cost[1][index] =
2470 ps_subpel_refine_ctxt->i2_tot_cost[0][index];
2471 ps_subpel_refine_ctxt->i2_mv_cost[1][index] =
2472 ps_subpel_refine_ctxt->i2_mv_cost[0][index];
2473 ps_subpel_refine_ctxt->i2_mv_x[1][index] =
2474 ps_subpel_refine_ctxt->i2_mv_x[0][index];
2475 ps_subpel_refine_ctxt->i2_mv_y[1][index] =
2476 ps_subpel_refine_ctxt->i2_mv_y[0][index];
2477 ps_subpel_refine_ctxt->i2_ref_idx[1][index] =
2478 ps_subpel_refine_ctxt->i2_ref_idx[0][index];
2479
2480 ps_subpel_refine_ctxt->i2_tot_cost[0][index] = i4_tot_cost;
2481 ps_subpel_refine_ctxt->i2_mv_cost[0][index] = i4_mv_cost;
2482 ps_subpel_refine_ctxt->i2_mv_x[0][index] = ps_result_prms->i2_mv_x;
2483 ps_subpel_refine_ctxt->i2_mv_y[0][index] = ps_result_prms->i2_mv_y;
2484 ps_subpel_refine_ctxt->i2_ref_idx[0][index] = ps_result_prms->i1_ref_idx;
2485 }
2486 }
2487 }
2488 }
2489 }
2490
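    /* Note (descriptive comment added for clarity): slots whose total cost is   */
    /* still at the saturated 16-bit maximum never received a usable candidate;  */
    /* the block below forces the corresponding fullpel SATD to the same         */
    /* saturated value so that later stages treat the slot consistently.         */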
2491 {
2492 WORD32 i4_count = 0;
2493 for(i4_count = 0; i4_count < TOT_NUM_PARTS; i4_count++)
2494 {
2495 WORD32 j;
2496 for(j = 0; j < 2; j++)
2497 {
2498 if(ps_subpel_refine_ctxt->i2_tot_cost[j][i4_count] >= MAX_SIGNED_16BIT_VAL)
2499 {
2500 ps_subpel_refine_ctxt->ai2_fullpel_satd[j][i4_count] = MAX_SIGNED_16BIT_VAL;
2501 }
2502 }
2503 }
2504 }
2505 }
2506
2507 void hme_calc_stim_injected_sad_and_2_best_results(
2508 hme_search_prms_t *ps_search_prms,
2509 wgt_pred_ctxt_t *ps_wt_inp_prms,
2510 err_prms_t *ps_err_prms,
2511 result_upd_prms_t *ps_result_prms,
2512 U08 **ppu1_ref,
2513 S32 i4_ref_stride)
2514 {
2515 mv_refine_ctxt_t *ps_mv_refine_ctxt;
2516 search_node_t *ps_search_node;
2517
2518 S32 i4_candt;
2519 S32 i4_count;
2520 S32 i4_inp_off;
2521 S32 i4_ref_offset;
2522 S32 i4_num_nodes;
2523 ULWORD64 *au8_final_src_sigmaX, *au8_final_src_sigmaXSquared, au8_final_ref_sigmaX[17],
2524 au8_final_ref_sigmaXSquared[17];
2525 UWORD32 au4_4x4_ref_sigmaX[NUM_4X4], au4_4x4_ref_sigmaXSquared[NUM_4X4];
2526 S32 *pi4_valid_part_ids;
2527
2528 S32 *pi4_sad_grid = ps_err_prms->pi4_sad_grid;
2529 S32 cur_buf_stride = ps_err_prms->i4_inp_stride;
2530 WORD32 ref_buf_stride = ps_err_prms->i4_ref_stride;
2531 WORD32 cur_buf_stride_ls2 = (cur_buf_stride << 2);
2532 WORD32 ref_buf_stride_ls2 = (ref_buf_stride << 2);
2533
2534 ps_mv_refine_ctxt = ps_search_prms->ps_fullpel_refine_ctxt;
2535 i4_num_nodes = ps_search_prms->i4_num_search_nodes;
2536 i4_inp_off = ps_search_prms->i4_cu_x_off;
2537 i4_inp_off += ps_search_prms->i4_cu_y_off * cur_buf_stride;
2538 i4_ref_offset = (i4_ref_stride * ps_search_prms->i4_y_off) + ps_search_prms->i4_x_off;
2539 ps_search_node = ps_search_prms->ps_search_nodes;
2540 pi4_valid_part_ids = &ps_mv_refine_ctxt->ai4_part_id[0];
2541
2542 /* Set local pointer to point to partition level sigma values calculated in hme_refine */
2543 au8_final_src_sigmaX = ps_search_prms->pu8_part_src_sigmaX;
2544 au8_final_src_sigmaXSquared = ps_search_prms->pu8_part_src_sigmaXSquared;
2545
2546 for(i4_candt = 0; i4_candt < i4_num_nodes; i4_candt++)
2547 {
2548 {
2549 WORD32 b, c, d;
2550 UWORD8 *pu1_cur_ptr;
2551 UWORD8 *pu1_ref_ptr;
2552 UWORD16 au2_4x4_sad[NUM_4X4];
2553
2554 if(ps_search_node->s_mv.i2_mvx == INTRA_MV)
2555 {
2556 continue;
2557 }
2558
2559 ps_err_prms->pu1_inp =
2560 ps_wt_inp_prms->apu1_wt_inp[ps_search_node->i1_ref_idx] + i4_inp_off;
2561 ps_err_prms->pu1_ref = ppu1_ref[ps_search_node->i1_ref_idx] + i4_ref_offset;
2562 ps_err_prms->pu1_ref += ps_search_node->s_mv.i2_mvx;
2563 ps_err_prms->pu1_ref += (ps_search_node->s_mv.i2_mvy * i4_ref_stride);
2564
2565 pu1_cur_ptr = ps_err_prms->pu1_inp;
2566 pu1_ref_ptr = &ps_err_prms->pu1_ref[0];
2567
2568 /* Loop to compute the SAD's */
2569 {
2570 memset(&au2_4x4_sad[0], 0, NUM_4X4 * sizeof(UWORD16));
2571 for(b = 0; b < NUM_4X4; b++)
2572 {
2573 WORD32 t1 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * cur_buf_stride_ls2;
2574 WORD32 t2 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * ref_buf_stride_ls2;
2575
2576 for(c = 0; c < NUM_ROWS_IN_4X4; c++)
2577 {
2578 WORD32 z_cur = (cur_buf_stride)*c + t1;
2579 WORD32 z_ref = (ref_buf_stride)*c + t2;
2580 for(d = 0; d < NUM_PIXELS_IN_ROW; d++)
2581 {
2582 au2_4x4_sad[b] += (UWORD16)ABS((
2583 ((S32)pu1_ref_ptr[(z_ref + d)]) - ((S32)pu1_cur_ptr[(z_cur + d)])));
2584 }
2585 }
2586 }
2587
2588 /* Compute sigmaX and sigmaX_Squared at 4x4 level for ref from ref_ptr */
2589 hme_compute_sigmaX_and_sigmaXSquared(
2590 pu1_ref_ptr,
2591 ref_buf_stride,
2592 au4_4x4_ref_sigmaX,
2593 au4_4x4_ref_sigmaXSquared,
2594 4,
2595 4,
2596 16,
2597 16,
2598 1,
2599 4);
2600
2601 pi4_sad_grid[PART_ID_NxN_TL] =
2602 (au2_4x4_sad[0] + au2_4x4_sad[1] + au2_4x4_sad[4] + au2_4x4_sad[5]);
2603 pi4_sad_grid[PART_ID_NxN_TR] =
2604 (au2_4x4_sad[2] + au2_4x4_sad[3] + au2_4x4_sad[6] + au2_4x4_sad[7]);
2605 pi4_sad_grid[PART_ID_NxN_BL] =
2606 (au2_4x4_sad[8] + au2_4x4_sad[9] + au2_4x4_sad[12] + au2_4x4_sad[13]);
2607 pi4_sad_grid[PART_ID_NxN_BR] =
2608 (au2_4x4_sad[10] + au2_4x4_sad[11] + au2_4x4_sad[14] + au2_4x4_sad[15]);
2609 pi4_sad_grid[PART_ID_Nx2N_L] =
2610 pi4_sad_grid[PART_ID_NxN_TL] + pi4_sad_grid[PART_ID_NxN_BL];
2611 pi4_sad_grid[PART_ID_Nx2N_R] =
2612 pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_BR];
2613 pi4_sad_grid[PART_ID_2NxN_T] =
2614 pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_TL];
2615 pi4_sad_grid[PART_ID_2NxN_B] =
2616 pi4_sad_grid[PART_ID_NxN_BR] + pi4_sad_grid[PART_ID_NxN_BL];
2617 pi4_sad_grid[PART_ID_nLx2N_L] =
2618 (au2_4x4_sad[8] + au2_4x4_sad[0] + au2_4x4_sad[12] + au2_4x4_sad[4]);
2619 pi4_sad_grid[PART_ID_nRx2N_R] =
2620 (au2_4x4_sad[3] + au2_4x4_sad[7] + au2_4x4_sad[15] + au2_4x4_sad[11]);
2621 pi4_sad_grid[PART_ID_2NxnU_T] =
2622 (au2_4x4_sad[1] + au2_4x4_sad[0] + au2_4x4_sad[2] + au2_4x4_sad[3]);
2623 pi4_sad_grid[PART_ID_2NxnD_B] =
2624 (au2_4x4_sad[15] + au2_4x4_sad[14] + au2_4x4_sad[12] + au2_4x4_sad[13]);
2625 pi4_sad_grid[PART_ID_2Nx2N] =
2626 pi4_sad_grid[PART_ID_2NxN_T] + pi4_sad_grid[PART_ID_2NxN_B];
2627 pi4_sad_grid[PART_ID_2NxnU_B] =
2628 pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnU_T];
2629 pi4_sad_grid[PART_ID_2NxnD_T] =
2630 pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnD_B];
2631 pi4_sad_grid[PART_ID_nRx2N_L] =
2632 pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nRx2N_R];
2633 pi4_sad_grid[PART_ID_nLx2N_R] =
2634 pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nLx2N_L];
2635 }
2636 }
2637
2638 {
2639 S32 i4_sad, i4_mv_cost, i4_tot_cost;
2640 S32 best_node_cost;
2641 S32 second_best_node_cost;
2642 ULWORD64 u8_temp_var, u8_temp_var1;
2643 ULWORD64 u8_ref_X_Square, u8_pure_dist, u8_src_var, u8_ref_var;
2644
2645 {
2646 S16 mvdx1, mvdy1;
2647 S32 i4_search_idx = (S32)ps_result_prms->i1_ref_idx;
2648 search_results_t *ps_search_results = ps_result_prms->ps_search_results;
2649 S32 pred_lx = i4_search_idx;
2650
2651 pred_ctxt_t *ps_pred_ctxt = &ps_search_results->as_pred_ctxt[pred_lx];
2652 pred_candt_nodes_t *ps_pred_nodes = &ps_pred_ctxt->as_pred_nodes[PART_2Nx2N];
2653 search_node_t *ps_pred_node_a = ps_pred_nodes->ps_mvp_node;
2654
2655 S32 inp_shift = 2;
2656 S32 pred_shift = ps_pred_node_a->u1_subpel_done ? 0 : 2;
2657 S32 lambda_q_shift = ps_pred_ctxt->lambda_q_shift;
2658 S32 lambda = ps_pred_ctxt->lambda;
2659 S32 rnd = 1 << (lambda_q_shift - 1);
2660 S32 mv_p_x = ps_pred_node_a->s_mv.i2_mvx;
2661 S32 mv_p_y = ps_pred_node_a->s_mv.i2_mvy;
2662 S32 ref_bits =
2663 ps_pred_ctxt
2664 ->ppu1_ref_bits_tlu[ps_pred_ctxt->pred_lx][ps_search_node->i1_ref_idx];
2665
2666 COMPUTE_DIFF_MV(
2667 mvdx1, mvdy1, ps_search_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
2668
2669 mvdx1 = ABS(mvdx1);
2670 mvdy1 = ABS(mvdy1);
2671
2672 i4_mv_cost = hme_get_range(mvdx1) + hme_get_range(mvdy1) + (mvdx1 > 0) +
2673 (mvdy1 > 0) + ref_bits + 2;
2674
2675 i4_mv_cost *= lambda;
2676 i4_mv_cost += rnd;
2677 i4_mv_cost >>= lambda_q_shift;
2678
2679 i4_mv_cost = CLIP_U16(i4_mv_cost);
2680 }
2681
2682 for(i4_count = 0; i4_count < ps_mv_refine_ctxt->i4_num_valid_parts; i4_count++)
2683 {
2684 S32 i4_stim_injected_sad;
2685 S32 i4_stim_injected_cost;
2686 S32 i4_noise_term;
2687 unsigned long u4_shift_val;
2688 S32 i4_bits_req;
2689
2690 S32 update_required = 0;
2691 S32 part_id = pi4_valid_part_ids[i4_count];
2692 S32 index = (ps_mv_refine_ctxt->i4_num_valid_parts > 8) ? part_id : i4_count;
2693
2694 WORD32 i4_q_level = STIM_Q_FORMAT + ALPHA_Q_FORMAT;
2695
2696 S32 i4_inv_wt = ps_wt_inp_prms->a_inv_wpred_wt[ps_search_node->i1_ref_idx];
2697
2698 if(ps_search_prms->i4_alpha_stim_multiplier)
2699 {
2700 /* Compute ref sigmaX and sigmaX_Squared values for valid partitions from previously computed ref 4x4 level values */
2701 hme_compute_final_sigma_of_pu_from_base_blocks(
2702 au4_4x4_ref_sigmaX,
2703 au4_4x4_ref_sigmaXSquared,
2704 au8_final_ref_sigmaX,
2705 au8_final_ref_sigmaXSquared,
2706 16,
2707 4,
2708 part_id,
2709 4);
2710
2711 u8_ref_X_Square =
2712 (au8_final_ref_sigmaX[part_id] * au8_final_ref_sigmaX[part_id]);
2713 u8_ref_var = (au8_final_ref_sigmaXSquared[part_id] - u8_ref_X_Square);
2714
2715                     /* Multiply the un-normalized src_var by inv_wt if it differs from the default wt, */
2716                     /* and shift the resulting src_var if it occupies more than 27 bits, to avoid overflow. */
2717                     /* The amount by which it is shifted is returned in u4_shift_val and applied equally to ref_var. */
2718 u4_shift_val = ihevce_calc_stim_injected_variance(
2719 au8_final_src_sigmaX,
2720 au8_final_src_sigmaXSquared,
2721 &u8_src_var,
2722 i4_inv_wt,
2723 ps_wt_inp_prms->ai4_shift_val[ps_search_node->i1_ref_idx],
2724 ps_wt_inp_prms->wpred_log_wdc,
2725 part_id);
2726
2727 u8_ref_var = u8_ref_var >> u4_shift_val;
2728
2729 /* Do the same check on ref_var to avoid overflow and apply similar shift on src_var */
2730 GETRANGE64(i4_bits_req, u8_ref_var);
2731
2732 if(i4_bits_req > 27)
2733 {
2734 u8_ref_var = u8_ref_var >> (i4_bits_req - 27);
2735 u8_src_var = u8_src_var >> (i4_bits_req - 27);
2736 }
2737
2738 if(u8_src_var == u8_ref_var)
2739 {
2740 u8_temp_var = (1 << STIM_Q_FORMAT);
2741 }
2742 else
2743 {
2744 u8_temp_var = (2 * u8_src_var * u8_ref_var);
2745 u8_temp_var = (u8_temp_var * (1 << STIM_Q_FORMAT));
2746 u8_temp_var1 = (u8_src_var * u8_src_var) + (u8_ref_var * u8_ref_var);
2747 u8_temp_var = (u8_temp_var + (u8_temp_var1 / 2));
2748 u8_temp_var = (u8_temp_var / u8_temp_var1);
2749 }
2750
2751 i4_noise_term = (UWORD32)u8_temp_var;
2752
2753 ASSERT(i4_noise_term >= 0);
2754
2755 i4_noise_term *= ps_search_prms->i4_alpha_stim_multiplier;
2756 }
2757 else
2758 {
2759 i4_noise_term = 0;
2760 }
2761 u8_pure_dist = pi4_sad_grid[part_id];
2762 u8_pure_dist *= ((1 << (i4_q_level)) - (i4_noise_term));
2763 u8_pure_dist += (1 << ((i4_q_level)-1));
2764 i4_stim_injected_sad = (UWORD32)(u8_pure_dist >> (i4_q_level));
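                /* Note (descriptive comment added for clarity): in fixed point the  */
                /* block above evaluates                                             */
                /*   noise_term ~ alpha * (2 * src_var * ref_var) /                  */
                /*                        (src_var^2 + ref_var^2)                    */
                /* in STIM_Q_FORMAT, and then de-weights the pure distortion as      */
                /*   stim_injected_sad ~ sad * (1 - noise_term)                      */
                /* so candidates whose source and reference variances match closely  */
                /* are charged less distortion in the MV decision.                   */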
2765
2766 i4_sad = CLIP3(pi4_sad_grid[part_id], 0, 0x7fff);
2767 i4_tot_cost = CLIP_S16(i4_sad + i4_mv_cost);
2768 i4_stim_injected_sad = CLIP3(i4_stim_injected_sad, 0, 0x7fff);
2769 i4_stim_injected_cost = CLIP_S16(i4_stim_injected_sad + i4_mv_cost);
2770
2771 best_node_cost = CLIP_S16(ps_mv_refine_ctxt->i2_stim_injected_cost[0][index]);
2772 second_best_node_cost =
2773 CLIP_S16(ps_mv_refine_ctxt->i2_stim_injected_cost[1][index]);
2774
2775 if(i4_stim_injected_cost < second_best_node_cost)
2776 {
2777 update_required = 2;
2778
2779 if(i4_stim_injected_cost < best_node_cost)
2780 {
2781 update_required = 1;
2782 }
2783 else if(i4_stim_injected_cost == best_node_cost)
2784 {
2785 update_required = 0;
2786 }
2787
2788 if(update_required == 2)
2789 {
2790 ps_mv_refine_ctxt->i2_tot_cost[1][index] = i4_tot_cost;
2791 ps_mv_refine_ctxt->i2_stim_injected_cost[1][index] = i4_stim_injected_cost;
2792 ps_mv_refine_ctxt->i2_mv_cost[1][index] = i4_mv_cost;
2793 ps_mv_refine_ctxt->i2_mv_x[1][index] = ps_search_node->s_mv.i2_mvx;
2794 ps_mv_refine_ctxt->i2_mv_y[1][index] = ps_search_node->s_mv.i2_mvy;
2795 ps_mv_refine_ctxt->i2_ref_idx[1][index] = ps_search_node->i1_ref_idx;
2796 }
2797 else if(update_required == 1)
2798 {
2799 ps_mv_refine_ctxt->i2_tot_cost[1][index] =
2800 ps_mv_refine_ctxt->i2_tot_cost[0][index];
2801 ps_mv_refine_ctxt->i2_stim_injected_cost[1][index] =
2802 ps_mv_refine_ctxt->i2_stim_injected_cost[0][index];
2803 ps_mv_refine_ctxt->i2_mv_cost[1][index] =
2804 ps_mv_refine_ctxt->i2_mv_cost[0][index];
2805 ps_mv_refine_ctxt->i2_mv_x[1][index] = ps_mv_refine_ctxt->i2_mv_x[0][index];
2806 ps_mv_refine_ctxt->i2_mv_y[1][index] = ps_mv_refine_ctxt->i2_mv_y[0][index];
2807 ps_mv_refine_ctxt->i2_ref_idx[1][index] =
2808 ps_mv_refine_ctxt->i2_ref_idx[0][index];
2809
2810 ps_mv_refine_ctxt->i2_tot_cost[0][index] = i4_tot_cost;
2811 ps_mv_refine_ctxt->i2_stim_injected_cost[0][index] = i4_stim_injected_cost;
2812 ps_mv_refine_ctxt->i2_mv_cost[0][index] = i4_mv_cost;
2813 ps_mv_refine_ctxt->i2_mv_x[0][index] = ps_search_node->s_mv.i2_mvx;
2814 ps_mv_refine_ctxt->i2_mv_y[0][index] = ps_search_node->s_mv.i2_mvy;
2815 ps_mv_refine_ctxt->i2_ref_idx[0][index] = ps_search_node->i1_ref_idx;
2816 }
2817 }
2818 }
2819 }
2820
2821 ps_search_node++;
2822 }
2823
2824 {
2825 WORD32 i4_i;
2826 WORD32 part_id;
2827 search_node_t *ps_search_node = ps_search_prms->ps_search_nodes;
2828 for(i4_i = 0; i4_i < ps_mv_refine_ctxt->i4_num_valid_parts; i4_i++)
2829 {
2830 part_id = ps_mv_refine_ctxt->ai4_part_id[i4_i];
2831 if(ps_mv_refine_ctxt->i2_stim_injected_cost[0][part_id] >= MAX_SIGNED_16BIT_VAL)
2832 {
2833 ASSERT(ps_mv_refine_ctxt->i2_mv_cost[0][part_id] == MAX_SIGNED_16BIT_VAL);
2834 ASSERT(ps_mv_refine_ctxt->i2_mv_x[0][part_id] == 0);
2835 ASSERT(ps_mv_refine_ctxt->i2_mv_y[0][part_id] == 0);
2836
2837 ps_mv_refine_ctxt->i2_ref_idx[0][part_id] = ps_search_node->i1_ref_idx;
2838 }
2839 if(ps_mv_refine_ctxt->i2_stim_injected_cost[1][part_id] >= MAX_SIGNED_16BIT_VAL)
2840 {
2841 ASSERT(ps_mv_refine_ctxt->i2_mv_cost[1][part_id] == MAX_SIGNED_16BIT_VAL);
2842 ASSERT(ps_mv_refine_ctxt->i2_mv_x[1][part_id] == 0);
2843 ASSERT(ps_mv_refine_ctxt->i2_mv_y[1][part_id] == 0);
2844
2845 ps_mv_refine_ctxt->i2_ref_idx[1][part_id] = ps_search_node->i1_ref_idx;
2846 }
2847 }
2848 }
2849 }
2850
2851 void hme_calc_sad_and_1_best_result(
2852 hme_search_prms_t *ps_search_prms,
2853 wgt_pred_ctxt_t *ps_wt_inp_prms,
2854 err_prms_t *ps_err_prms,
2855 result_upd_prms_t *ps_result_prms,
2856 U08 **ppu1_ref,
2857 S32 i4_ref_stride)
2858 {
2859 S32 i4_candt;
2860 S32 i4_inp_off;
2861 S32 i4_ref_offset;
2862 S32 i4_num_nodes;
2863
2864 S32 *pi4_sad_grid = ps_err_prms->pi4_sad_grid;
2865 S32 cur_buf_stride = ps_err_prms->i4_inp_stride;
2866 WORD32 ref_buf_stride = ps_err_prms->i4_ref_stride;
2867 WORD32 cur_buf_stride_ls2 = (cur_buf_stride << 2);
2868 WORD32 ref_buf_stride_ls2 = (ref_buf_stride << 2);
2869
2870 mv_refine_ctxt_t *ps_mv_refine_ctxt;
2871 search_node_t *ps_search_node;
2872
2873 ps_mv_refine_ctxt = ps_search_prms->ps_fullpel_refine_ctxt;
2874 i4_num_nodes = ps_search_prms->i4_num_search_nodes;
2875 i4_inp_off = ps_search_prms->i4_cu_x_off;
2876 i4_inp_off += ps_search_prms->i4_cu_y_off * cur_buf_stride;
2877 i4_ref_offset = (i4_ref_stride * ps_search_prms->i4_y_off) + ps_search_prms->i4_x_off;
2878 ps_search_node = ps_search_prms->ps_search_nodes;
2879
2880 for(i4_candt = 0; i4_candt < i4_num_nodes; i4_candt++)
2881 {
2882 /**********************************************************************/
2883 /* CALL THE FUNCTION THAT COMPUTES THE SAD AND UPDATES THE SAD GRID */
2884 /**********************************************************************/
2885 {
2886 WORD32 b, c, d;
2887 UWORD8 *pu1_cur_ptr;
2888 UWORD8 *pu1_ref_ptr;
2889 UWORD16 au2_4x4_sad[NUM_4X4];
2890
2891 if(ps_search_node->s_mv.i2_mvx == INTRA_MV)
2892 {
2893 continue;
2894 }
2895
2896 ps_err_prms->pu1_inp =
2897 ps_wt_inp_prms->apu1_wt_inp[ps_search_node->i1_ref_idx] + i4_inp_off;
2898 ps_err_prms->pu1_ref = ppu1_ref[ps_search_node->i1_ref_idx] + i4_ref_offset;
2899 ps_err_prms->pu1_ref += ps_search_node->s_mv.i2_mvx;
2900 ps_err_prms->pu1_ref += (ps_search_node->s_mv.i2_mvy * i4_ref_stride);
2901
2902 pu1_cur_ptr = ps_err_prms->pu1_inp;
2903 pu1_ref_ptr = &ps_err_prms->pu1_ref[0];
2904
2905 /* Loop to compute the SAD's */
2906 {
2907 memset(&au2_4x4_sad[0], 0, NUM_4X4 * sizeof(UWORD16));
2908 for(b = 0; b < NUM_4X4; b++)
2909 {
2910 WORD32 t1 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * cur_buf_stride_ls2;
2911 WORD32 t2 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * ref_buf_stride_ls2;
2912
2913 for(c = 0; c < NUM_ROWS_IN_4X4; c++)
2914 {
2915 WORD32 z_cur = (cur_buf_stride)*c + t1;
2916 WORD32 z_ref = (ref_buf_stride)*c + t2;
2917 for(d = 0; d < NUM_PIXELS_IN_ROW; d++)
2918 {
2919 au2_4x4_sad[b] += (UWORD16)ABS((
2920 ((S32)pu1_ref_ptr[(z_ref + d)]) - ((S32)pu1_cur_ptr[(z_cur + d)])));
2921 }
2922 }
2923 }
2924
2925 pi4_sad_grid[PART_ID_NxN_TL] =
2926 (au2_4x4_sad[0] + au2_4x4_sad[1] + au2_4x4_sad[4] + au2_4x4_sad[5]);
2927 pi4_sad_grid[PART_ID_NxN_TR] =
2928 (au2_4x4_sad[2] + au2_4x4_sad[3] + au2_4x4_sad[6] + au2_4x4_sad[7]);
2929 pi4_sad_grid[PART_ID_NxN_BL] =
2930 (au2_4x4_sad[8] + au2_4x4_sad[9] + au2_4x4_sad[12] + au2_4x4_sad[13]);
2931 pi4_sad_grid[PART_ID_NxN_BR] =
2932 (au2_4x4_sad[10] + au2_4x4_sad[11] + au2_4x4_sad[14] + au2_4x4_sad[15]);
2933 pi4_sad_grid[PART_ID_Nx2N_L] =
2934 pi4_sad_grid[PART_ID_NxN_TL] + pi4_sad_grid[PART_ID_NxN_BL];
2935 pi4_sad_grid[PART_ID_Nx2N_R] =
2936 pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_BR];
2937 pi4_sad_grid[PART_ID_2NxN_T] =
2938 pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_TL];
2939 pi4_sad_grid[PART_ID_2NxN_B] =
2940 pi4_sad_grid[PART_ID_NxN_BR] + pi4_sad_grid[PART_ID_NxN_BL];
2941 pi4_sad_grid[PART_ID_nLx2N_L] =
2942 (au2_4x4_sad[8] + au2_4x4_sad[0] + au2_4x4_sad[12] + au2_4x4_sad[4]);
2943 pi4_sad_grid[PART_ID_nRx2N_R] =
2944 (au2_4x4_sad[3] + au2_4x4_sad[7] + au2_4x4_sad[15] + au2_4x4_sad[11]);
2945 pi4_sad_grid[PART_ID_2NxnU_T] =
2946 (au2_4x4_sad[1] + au2_4x4_sad[0] + au2_4x4_sad[2] + au2_4x4_sad[3]);
2947 pi4_sad_grid[PART_ID_2NxnD_B] =
2948 (au2_4x4_sad[15] + au2_4x4_sad[14] + au2_4x4_sad[12] + au2_4x4_sad[13]);
2949 pi4_sad_grid[PART_ID_2Nx2N] =
2950 pi4_sad_grid[PART_ID_2NxN_T] + pi4_sad_grid[PART_ID_2NxN_B];
2951 pi4_sad_grid[PART_ID_2NxnU_B] =
2952 pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnU_T];
2953 pi4_sad_grid[PART_ID_2NxnD_T] =
2954 pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnD_B];
2955 pi4_sad_grid[PART_ID_nRx2N_L] =
2956 pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nRx2N_R];
2957 pi4_sad_grid[PART_ID_nLx2N_R] =
2958 pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nLx2N_L];
2959 }
2960 }
2961
2962 {
2963 S32 i4_count = 0, i4_sad, i4_mv_cost, i4_tot_cost;
2964 S32 *pi4_valid_part_ids = &ps_mv_refine_ctxt->ai4_part_id[0];
2965 S32 best_node_cost;
2966 S32 second_best_node_cost;
2967
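/*
 * Derive the MV cost of this candidate against the 2Nx2N MVP node:
 * approximately bits = range(|mvdx|) + range(|mvdy|) + (|mvdx| != 0) +
 * (|mvdy| != 0) + ref_bits + 2, scaled by lambda in Q(lambda_q_shift) with
 * rounding and clipped to an unsigned 16-bit value. hme_get_range() is
 * assumed to return the number of bits needed to represent its argument.
 */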
2968 {
2969 S16 mvdx1, mvdy1;
2970 S32 i4_search_idx = (S32)ps_result_prms->i1_ref_idx;
2971 search_results_t *ps_search_results = ps_result_prms->ps_search_results;
2972 S32 pred_lx = i4_search_idx;
2973
2974 pred_ctxt_t *ps_pred_ctxt = &ps_search_results->as_pred_ctxt[pred_lx];
2975 pred_candt_nodes_t *ps_pred_nodes = &ps_pred_ctxt->as_pred_nodes[PART_2Nx2N];
2976 search_node_t *ps_pred_node_a = ps_pred_nodes->ps_mvp_node;
2977
2978 S32 inp_shift = 2;
2979 S32 pred_shift = ps_pred_node_a->u1_subpel_done ? 0 : 2;
2980 S32 lambda_q_shift = ps_pred_ctxt->lambda_q_shift;
2981 S32 lambda = ps_pred_ctxt->lambda;
2982 S32 rnd = 1 << (lambda_q_shift - 1);
2983 S32 mv_p_x = ps_pred_node_a->s_mv.i2_mvx;
2984 S32 mv_p_y = ps_pred_node_a->s_mv.i2_mvy;
2985 S32 ref_bits =
2986 ps_pred_ctxt
2987 ->ppu1_ref_bits_tlu[ps_pred_ctxt->pred_lx][ps_search_node->i1_ref_idx];
2988
2989 COMPUTE_DIFF_MV(
2990 mvdx1, mvdy1, ps_search_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
2991
2992 mvdx1 = ABS(mvdx1);
2993 mvdy1 = ABS(mvdy1);
2994
2995 i4_mv_cost = hme_get_range(mvdx1) + hme_get_range(mvdy1) + (mvdx1 > 0) +
2996 (mvdy1 > 0) + ref_bits + 2;
2997
2998 i4_mv_cost *= lambda;
2999 i4_mv_cost += rnd;
3000 i4_mv_cost >>= lambda_q_shift;
3001
3002 i4_mv_cost = CLIP_U16(i4_mv_cost);
3003 }
3004
3005 /* For each valid partition, update the mv refine ctxt structure to reflect
3006 the best candidate for that partition */
3007
3008 for(i4_count = 0; i4_count < ps_mv_refine_ctxt->i4_num_valid_parts; i4_count++)
3009 {
3010 S32 update_required = 0;
3011 S32 part_id = pi4_valid_part_ids[i4_count];
3012 S32 index = (ps_mv_refine_ctxt->i4_num_valid_parts > 8) ? part_id : i4_count;
3013
3014 /*Calculate total cost*/
3015 i4_sad = CLIP3(pi4_sad_grid[part_id], 0, 0x7fff);
3016 i4_tot_cost = CLIP_S16(i4_sad + i4_mv_cost);
3017
3018 /*****************************************************************/
3019 /* We do not labor through the results if the total cost is worse  */
3020 /* than the last of the results.                                    */
3021 /*****************************************************************/
3022 best_node_cost = CLIP_S16(ps_mv_refine_ctxt->i2_tot_cost[0][index]);
3023 second_best_node_cost = SHRT_MAX;
3024
3025 if(i4_tot_cost < second_best_node_cost)
3026 {
3027 update_required = 0;
3028
3029 /*************************************************************/
3030 /* Identify where the current result is to be placed. Basically,     */
3031 /* find the node which has cost just higher than the node under test.*/
3032 /*************************************************************/
3033 if(i4_tot_cost < best_node_cost)
3034 {
3035 update_required = 1;
3036 }
3037 else if(i4_tot_cost == best_node_cost)
3038 {
3039 update_required = 0;
3040 }
3041
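/*
 * Note: in this single-best variant, update_required is only ever 0 or 1,
 * so the '== 2' branch below (second-best update) is never taken; it appears
 * to be retained for symmetry with the 2-best variants. The same holds for
 * the corresponding branches in the other *_1_best_* routines in this file.
 */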
3042 if(update_required == 2)
3043 {
3044 ps_mv_refine_ctxt->i2_tot_cost[1][index] = i4_tot_cost;
3045 ps_mv_refine_ctxt->i2_mv_cost[1][index] = i4_mv_cost;
3046 ps_mv_refine_ctxt->i2_mv_x[1][index] = ps_search_node->s_mv.i2_mvx;
3047 ps_mv_refine_ctxt->i2_mv_y[1][index] = ps_search_node->s_mv.i2_mvy;
3048 ps_mv_refine_ctxt->i2_ref_idx[1][index] = ps_search_node->i1_ref_idx;
3049 }
3050 else if(update_required == 1)
3051 {
3052 ps_mv_refine_ctxt->i2_tot_cost[0][index] = i4_tot_cost;
3053 ps_mv_refine_ctxt->i2_mv_cost[0][index] = i4_mv_cost;
3054 ps_mv_refine_ctxt->i2_mv_x[0][index] = ps_search_node->s_mv.i2_mvx;
3055 ps_mv_refine_ctxt->i2_mv_y[0][index] = ps_search_node->s_mv.i2_mvy;
3056 ps_mv_refine_ctxt->i2_ref_idx[0][index] = ps_search_node->i1_ref_idx;
3057 }
3058 }
3059 }
3060 }
3061 ps_search_node++;
3062 }
3063
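/*
 * For partitions whose best cost is still saturated at MAX_SIGNED_16BIT_VAL
 * (i.e. no candidate updated them), patch in the first search node's ref idx,
 * presumably so that downstream consumers see a consistent entry.
 */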
3064 {
3065 WORD32 i4_i;
3066 WORD32 part_id;
3067 search_node_t *ps_search_node = ps_search_prms->ps_search_nodes;
3068 for(i4_i = 0; i4_i < ps_mv_refine_ctxt->i4_num_valid_parts; i4_i++)
3069 {
3070 part_id = ps_mv_refine_ctxt->ai4_part_id[i4_i];
3071 if(ps_mv_refine_ctxt->i2_tot_cost[0][part_id] >= MAX_SIGNED_16BIT_VAL)
3072 {
3073 ASSERT(ps_mv_refine_ctxt->i2_mv_cost[0][part_id] == MAX_SIGNED_16BIT_VAL);
3074 ASSERT(ps_mv_refine_ctxt->i2_mv_x[0][part_id] == 0);
3075 ASSERT(ps_mv_refine_ctxt->i2_mv_y[0][part_id] == 0);
3076
3077 ps_mv_refine_ctxt->i2_ref_idx[0][part_id] = ps_search_node->i1_ref_idx;
3078 }
3079 }
3080 }
3081 }
3082
3083 void hme_calc_stim_injected_sad_and_1_best_result(
3084 hme_search_prms_t *ps_search_prms,
3085 wgt_pred_ctxt_t *ps_wt_inp_prms,
3086 err_prms_t *ps_err_prms,
3087 result_upd_prms_t *ps_result_prms,
3088 U08 **ppu1_ref,
3089 S32 i4_ref_stride)
3090 {
3091 mv_refine_ctxt_t *ps_mv_refine_ctxt;
3092 search_node_t *ps_search_node;
3093
3094 S32 i4_candt;
3095 S32 i4_count;
3096 S32 i4_inp_off;
3097 S32 i4_ref_offset;
3098 S32 i4_num_nodes;
3099 ULWORD64 *au8_final_src_sigmaX, *au8_final_src_sigmaXSquared, au8_final_ref_sigmaX[17],
3100 au8_final_ref_sigmaXSquared[17];
3101 UWORD32 au4_4x4_ref_sigmaX[NUM_4X4], au4_4x4_ref_sigmaXSquared[NUM_4X4];
3102 S32 *pi4_valid_part_ids;
3103
3104 S32 *pi4_sad_grid = ps_err_prms->pi4_sad_grid;
3105 S32 cur_buf_stride = ps_err_prms->i4_inp_stride;
3106 WORD32 ref_buf_stride = ps_err_prms->i4_ref_stride;
3107 WORD32 cur_buf_stride_ls2 = (cur_buf_stride << 2);
3108 WORD32 ref_buf_stride_ls2 = (ref_buf_stride << 2);
3109
3110 ps_mv_refine_ctxt = ps_search_prms->ps_fullpel_refine_ctxt;
3111 i4_num_nodes = ps_search_prms->i4_num_search_nodes;
3112 i4_inp_off = ps_search_prms->i4_cu_x_off;
3113 i4_inp_off += ps_search_prms->i4_cu_y_off * cur_buf_stride;
3114 i4_ref_offset = (i4_ref_stride * ps_search_prms->i4_y_off) + ps_search_prms->i4_x_off;
3115 ps_search_node = ps_search_prms->ps_search_nodes;
3116 pi4_valid_part_ids = &ps_mv_refine_ctxt->ai4_part_id[0];
3117
3118 /* Set local pointer to point to partition level sigma values calculated in hme_refine */
3119 au8_final_src_sigmaX = ps_search_prms->pu8_part_src_sigmaX;
3120 au8_final_src_sigmaXSquared = ps_search_prms->pu8_part_src_sigmaXSquared;
3121
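/*
 * Same candidate loop as hme_calc_sad_and_1_best_result, but in addition to
 * the plain SAD this variant derives a per-partition noise term from the
 * source and reference variances and uses the resulting stim injected cost
 * to pick the best candidate per partition.
 */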
3122 for(i4_candt = 0; i4_candt < i4_num_nodes; i4_candt++)
3123 {
3124 {
3125 WORD32 b, c, d;
3126 UWORD8 *pu1_cur_ptr;
3127 UWORD8 *pu1_ref_ptr;
3128 UWORD16 au2_4x4_sad[NUM_4X4];
3129
3130 if(ps_search_node->s_mv.i2_mvx == INTRA_MV)
3131 {
3132 continue;
3133 }
3134
3135 ps_err_prms->pu1_inp =
3136 ps_wt_inp_prms->apu1_wt_inp[ps_search_node->i1_ref_idx] + i4_inp_off;
3137 ps_err_prms->pu1_ref = ppu1_ref[ps_search_node->i1_ref_idx] + i4_ref_offset;
3138 ps_err_prms->pu1_ref += ps_search_node->s_mv.i2_mvx;
3139 ps_err_prms->pu1_ref += (ps_search_node->s_mv.i2_mvy * i4_ref_stride);
3140
3141 pu1_cur_ptr = ps_err_prms->pu1_inp;
3142 pu1_ref_ptr = &ps_err_prms->pu1_ref[0];
3143
3144 /* Loop to compute the SADs */
3145 {
3146 memset(&au2_4x4_sad[0], 0, NUM_4X4 * sizeof(UWORD16));
3147 for(b = 0; b < NUM_4X4; b++)
3148 {
3149 WORD32 t1 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * cur_buf_stride_ls2;
3150 WORD32 t2 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * ref_buf_stride_ls2;
3151
3152 for(c = 0; c < NUM_ROWS_IN_4X4; c++)
3153 {
3154 WORD32 z_cur = (cur_buf_stride)*c + t1;
3155 WORD32 z_ref = (ref_buf_stride)*c + t2;
3156 for(d = 0; d < NUM_PIXELS_IN_ROW; d++)
3157 {
3158 au2_4x4_sad[b] += (UWORD16)ABS((
3159 ((S32)pu1_ref_ptr[(z_ref + d)]) - ((S32)pu1_cur_ptr[(z_cur + d)])));
3160 }
3161 }
3162 }
3163
3164 /* Compute sigmaX and sigmaX_Squared at 4x4 level for ref from ref_ptr */
3165 hme_compute_sigmaX_and_sigmaXSquared(
3166 pu1_ref_ptr,
3167 ref_buf_stride,
3168 au4_4x4_ref_sigmaX,
3169 au4_4x4_ref_sigmaXSquared,
3170 4,
3171 4,
3172 16,
3173 16,
3174 1,
3175 4);
3176
3177 pi4_sad_grid[PART_ID_NxN_TL] =
3178 (au2_4x4_sad[0] + au2_4x4_sad[1] + au2_4x4_sad[4] + au2_4x4_sad[5]);
3179 pi4_sad_grid[PART_ID_NxN_TR] =
3180 (au2_4x4_sad[2] + au2_4x4_sad[3] + au2_4x4_sad[6] + au2_4x4_sad[7]);
3181 pi4_sad_grid[PART_ID_NxN_BL] =
3182 (au2_4x4_sad[8] + au2_4x4_sad[9] + au2_4x4_sad[12] + au2_4x4_sad[13]);
3183 pi4_sad_grid[PART_ID_NxN_BR] =
3184 (au2_4x4_sad[10] + au2_4x4_sad[11] + au2_4x4_sad[14] + au2_4x4_sad[15]);
3185 pi4_sad_grid[PART_ID_Nx2N_L] =
3186 pi4_sad_grid[PART_ID_NxN_TL] + pi4_sad_grid[PART_ID_NxN_BL];
3187 pi4_sad_grid[PART_ID_Nx2N_R] =
3188 pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_BR];
3189 pi4_sad_grid[PART_ID_2NxN_T] =
3190 pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_TL];
3191 pi4_sad_grid[PART_ID_2NxN_B] =
3192 pi4_sad_grid[PART_ID_NxN_BR] + pi4_sad_grid[PART_ID_NxN_BL];
3193 pi4_sad_grid[PART_ID_nLx2N_L] =
3194 (au2_4x4_sad[8] + au2_4x4_sad[0] + au2_4x4_sad[12] + au2_4x4_sad[4]);
3195 pi4_sad_grid[PART_ID_nRx2N_R] =
3196 (au2_4x4_sad[3] + au2_4x4_sad[7] + au2_4x4_sad[15] + au2_4x4_sad[11]);
3197 pi4_sad_grid[PART_ID_2NxnU_T] =
3198 (au2_4x4_sad[1] + au2_4x4_sad[0] + au2_4x4_sad[2] + au2_4x4_sad[3]);
3199 pi4_sad_grid[PART_ID_2NxnD_B] =
3200 (au2_4x4_sad[15] + au2_4x4_sad[14] + au2_4x4_sad[12] + au2_4x4_sad[13]);
3201 pi4_sad_grid[PART_ID_2Nx2N] =
3202 pi4_sad_grid[PART_ID_2NxN_T] + pi4_sad_grid[PART_ID_2NxN_B];
3203 pi4_sad_grid[PART_ID_2NxnU_B] =
3204 pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnU_T];
3205 pi4_sad_grid[PART_ID_2NxnD_T] =
3206 pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnD_B];
3207 pi4_sad_grid[PART_ID_nRx2N_L] =
3208 pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nRx2N_R];
3209 pi4_sad_grid[PART_ID_nLx2N_R] =
3210 pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nLx2N_L];
3211 }
3212 }
3213
3214 {
3215 S32 i4_sad, i4_mv_cost, i4_tot_cost;
3216 S32 best_node_cost;
3217 S32 second_best_node_cost;
3218 ULWORD64 u8_temp_var, u8_temp_var1;
3219 ULWORD64 u8_ref_X_Square, u8_pure_dist, u8_src_var, u8_ref_var;
3220
3221 {
3222 S16 mvdx1, mvdy1;
3223 S32 i4_search_idx = (S32)ps_result_prms->i1_ref_idx;
3224 search_results_t *ps_search_results = ps_result_prms->ps_search_results;
3225 S32 pred_lx = i4_search_idx;
3226
3227 pred_ctxt_t *ps_pred_ctxt = &ps_search_results->as_pred_ctxt[pred_lx];
3228 pred_candt_nodes_t *ps_pred_nodes = &ps_pred_ctxt->as_pred_nodes[PART_2Nx2N];
3229 search_node_t *ps_pred_node_a = ps_pred_nodes->ps_mvp_node;
3230
3231 S32 inp_shift = 2;
3232 S32 pred_shift = ps_pred_node_a->u1_subpel_done ? 0 : 2;
3233 S32 lambda_q_shift = ps_pred_ctxt->lambda_q_shift;
3234 S32 lambda = ps_pred_ctxt->lambda;
3235 S32 rnd = 1 << (lambda_q_shift - 1);
3236 S32 mv_p_x = ps_pred_node_a->s_mv.i2_mvx;
3237 S32 mv_p_y = ps_pred_node_a->s_mv.i2_mvy;
3238 S32 ref_bits =
3239 ps_pred_ctxt
3240 ->ppu1_ref_bits_tlu[ps_pred_ctxt->pred_lx][ps_search_node->i1_ref_idx];
3241
3242 COMPUTE_DIFF_MV(
3243 mvdx1, mvdy1, ps_search_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
3244
3245 mvdx1 = ABS(mvdx1);
3246 mvdy1 = ABS(mvdy1);
3247
3248 i4_mv_cost = hme_get_range(mvdx1) + hme_get_range(mvdy1) + (mvdx1 > 0) +
3249 (mvdy1 > 0) + ref_bits + 2;
3250
3251 i4_mv_cost *= lambda;
3252 i4_mv_cost += rnd;
3253 i4_mv_cost >>= lambda_q_shift;
3254
3255 i4_mv_cost = CLIP_U16(i4_mv_cost);
3256 }
3257
3258 for(i4_count = 0; i4_count < ps_mv_refine_ctxt->i4_num_valid_parts; i4_count++)
3259 {
3260 S32 i4_stim_injected_sad;
3261 S32 i4_stim_injected_cost;
3262 S32 i4_noise_term;
3263 unsigned long u4_shift_val;
3264 S32 i4_bits_req;
3265
3266 S32 update_required = 0;
3267 S32 part_id = pi4_valid_part_ids[i4_count];
3268 S32 index = (ps_mv_refine_ctxt->i4_num_valid_parts > 8) ? part_id : i4_count;
3269
3270 WORD32 i4_q_level = STIM_Q_FORMAT + ALPHA_Q_FORMAT;
3271
3272 S32 i4_inv_wt = ps_wt_inp_prms->a_inv_wpred_wt[ps_search_node->i1_ref_idx];
3273
3274 if(ps_search_prms->i4_alpha_stim_multiplier)
3275 {
3276 /* Compute ref sigmaX and sigmaX_Squared values for valid partitions from previously computed ref 4x4 level values */
3277 hme_compute_final_sigma_of_pu_from_base_blocks(
3278 au4_4x4_ref_sigmaX,
3279 au4_4x4_ref_sigmaXSquared,
3280 au8_final_ref_sigmaX,
3281 au8_final_ref_sigmaXSquared,
3282 16,
3283 4,
3284 part_id,
3285 4);
3286
3287 u8_ref_X_Square =
3288 (au8_final_ref_sigmaX[part_id] * au8_final_ref_sigmaX[part_id]);
3289 u8_ref_var = (au8_final_ref_sigmaXSquared[part_id] - u8_ref_X_Square);
3290
3291 /* Multiply the un-normalized src_var with inv_wt if it is not the same as the default wt, */
3292 /* and shift the resulting src_var if it needs more than 27 bits, to avoid overflow.        */
3293 /* The amount by which it is shifted is returned in u4_shift_val and applied equally to ref_var. */
3294 u4_shift_val = ihevce_calc_stim_injected_variance(
3295 au8_final_src_sigmaX,
3296 au8_final_src_sigmaXSquared,
3297 &u8_src_var,
3298 i4_inv_wt,
3299 ps_wt_inp_prms->ai4_shift_val[ps_search_node->i1_ref_idx],
3300 ps_wt_inp_prms->wpred_log_wdc,
3301 part_id);
3302
3303 u8_ref_var = u8_ref_var >> u4_shift_val;
3304
3305 /* Do the same check on ref_var to avoid overflow and apply similar shift on src_var */
3306 GETRANGE64(i4_bits_req, u8_ref_var);
3307
3308 if(i4_bits_req > 27)
3309 {
3310 u8_ref_var = u8_ref_var >> (i4_bits_req - 27);
3311 u8_src_var = u8_src_var >> (i4_bits_req - 27);
3312 }
3313
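/*
 * Noise term computed below (a restatement of the code, not a spec):
 *   noise = 2 * var_src * var_ref / (var_src^2 + var_ref^2), in Q(STIM_Q_FORMAT),
 * i.e. 1.0 in Q-format when the two variances match, tending towards 0 as
 * they diverge; it is then scaled by the alpha stim multiplier.
 */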
3314 if(u8_src_var == u8_ref_var)
3315 {
3316 u8_temp_var = (1 << STIM_Q_FORMAT);
3317 }
3318 else
3319 {
3320 u8_temp_var = (2 * u8_src_var * u8_ref_var);
3321 u8_temp_var = (u8_temp_var * (1 << STIM_Q_FORMAT));
3322 u8_temp_var1 = (u8_src_var * u8_src_var) + (u8_ref_var * u8_ref_var);
3323 u8_temp_var = (u8_temp_var + (u8_temp_var1 / 2));
3324 u8_temp_var = (u8_temp_var / u8_temp_var1);
3325 }
3326
3327 i4_noise_term = (UWORD32)u8_temp_var;
3328
3329 ASSERT(i4_noise_term >= 0);
3330
3331 i4_noise_term *= ps_search_prms->i4_alpha_stim_multiplier;
3332 }
3333 else
3334 {
3335 i4_noise_term = 0;
3336 }
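/*
 * The noise term modulates the plain SAD:
 *   stim_injected_sad = (sad * ((1 << q) - noise) + (1 << (q - 1))) >> q,
 * with q = STIM_Q_FORMAT + ALPHA_Q_FORMAT, so a zero noise term leaves the
 * SAD unchanged and larger noise terms reduce it.
 */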
3337 u8_pure_dist = pi4_sad_grid[part_id];
3338 u8_pure_dist *= ((1 << (i4_q_level)) - (i4_noise_term));
3339 u8_pure_dist += (1 << ((i4_q_level)-1));
3340 i4_stim_injected_sad = (UWORD32)(u8_pure_dist >> (i4_q_level));
3341
3342 i4_sad = CLIP3(pi4_sad_grid[part_id], 0, 0x7fff);
3343 i4_tot_cost = CLIP_S16(i4_sad + i4_mv_cost);
3344 i4_stim_injected_sad = CLIP3(i4_stim_injected_sad, 0, 0x7fff);
3345 i4_stim_injected_cost = CLIP_S16(i4_stim_injected_sad + i4_mv_cost);
3346
3347 best_node_cost = CLIP_S16(ps_mv_refine_ctxt->i2_stim_injected_cost[0][index]);
3348 second_best_node_cost = SHRT_MAX;
3349
3350 if(i4_stim_injected_cost < second_best_node_cost)
3351 {
3352 update_required = 0;
3353
3354 if(i4_stim_injected_cost < best_node_cost)
3355 {
3356 update_required = 1;
3357 }
3358 else if(i4_stim_injected_cost == best_node_cost)
3359 {
3360 update_required = 0;
3361 }
3362
3363 if(update_required == 2)
3364 {
3365 ps_mv_refine_ctxt->i2_tot_cost[1][index] = i4_tot_cost;
3366 ps_mv_refine_ctxt->i2_stim_injected_cost[1][index] = i4_stim_injected_cost;
3367 ps_mv_refine_ctxt->i2_mv_cost[1][index] = i4_mv_cost;
3368 ps_mv_refine_ctxt->i2_mv_x[1][index] = ps_search_node->s_mv.i2_mvx;
3369 ps_mv_refine_ctxt->i2_mv_y[1][index] = ps_search_node->s_mv.i2_mvy;
3370 ps_mv_refine_ctxt->i2_ref_idx[1][index] = ps_search_node->i1_ref_idx;
3371 }
3372 else if(update_required == 1)
3373 {
3374 ps_mv_refine_ctxt->i2_tot_cost[0][index] = i4_tot_cost;
3375 ps_mv_refine_ctxt->i2_stim_injected_cost[0][index] = i4_stim_injected_cost;
3376 ps_mv_refine_ctxt->i2_mv_cost[0][index] = i4_mv_cost;
3377 ps_mv_refine_ctxt->i2_mv_x[0][index] = ps_search_node->s_mv.i2_mvx;
3378 ps_mv_refine_ctxt->i2_mv_y[0][index] = ps_search_node->s_mv.i2_mvy;
3379 ps_mv_refine_ctxt->i2_ref_idx[0][index] = ps_search_node->i1_ref_idx;
3380 }
3381 }
3382 }
3383 }
3384
3385 ps_search_node++;
3386 }
3387
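/*
 * As in the non-stim variant: partitions whose best stim injected cost is
 * still saturated get their ref idx patched from the first search node.
 */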
3388 {
3389 WORD32 i4_i;
3390 WORD32 part_id;
3391 search_node_t *ps_search_node = ps_search_prms->ps_search_nodes;
3392 for(i4_i = 0; i4_i < ps_mv_refine_ctxt->i4_num_valid_parts; i4_i++)
3393 {
3394 part_id = ps_mv_refine_ctxt->ai4_part_id[i4_i];
3395 if(ps_mv_refine_ctxt->i2_stim_injected_cost[0][part_id] >= MAX_SIGNED_16BIT_VAL)
3396 {
3397 ASSERT(ps_mv_refine_ctxt->i2_mv_cost[0][part_id] == MAX_SIGNED_16BIT_VAL);
3398 ASSERT(ps_mv_refine_ctxt->i2_mv_x[0][part_id] == 0);
3399 ASSERT(ps_mv_refine_ctxt->i2_mv_y[0][part_id] == 0);
3400
3401 ps_mv_refine_ctxt->i2_ref_idx[0][part_id] = ps_search_node->i1_ref_idx;
3402 }
3403 }
3404 }
3405 }
3406
3407 void hme_calc_sad_and_1_best_result_subpel(
3408 err_prms_t *ps_err_prms, result_upd_prms_t *ps_result_prms)
3409 {
3410 S32 i4_candt;
3411 S32 i4_num_nodes;
3412
3413 S32 *pi4_sad_grid = ps_err_prms->pi4_sad_grid;
3414
3415 S32 cur_buf_stride = ps_err_prms->i4_inp_stride;
3416 WORD32 ref_buf_stride = ps_err_prms->i4_ref_stride;
3417 WORD32 cur_buf_stride_ls2 = (cur_buf_stride << 2);
3418 WORD32 ref_buf_stride_ls2 = (ref_buf_stride << 2);
3419
3420 mv_refine_ctxt_t *ps_subpel_refine_ctxt;
3421 ps_subpel_refine_ctxt = ps_result_prms->ps_subpel_refine_ctxt;
3422 i4_num_nodes = 1;
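/* Only a single sub-pel candidate is evaluated here; its input and reference
   pointers are taken directly from ps_err_prms. */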
3423
3424 /* Run through each of the candts in a loop */
3425 for(i4_candt = 0; i4_candt < i4_num_nodes; i4_candt++)
3426 {
3427 /**********************************************************************/
3428 /* COMPUTE THE SAD AND UPDATE THE SAD GRID (inlined below)          */
3429 /**********************************************************************/
3430 {
3431 WORD32 b, c, d;
3432 UWORD8 *pu1_cur_ptr;
3433 UWORD8 *pu1_ref_ptr;
3434 UWORD16 au2_4x4_sad[NUM_4X4];
3435
3436 pu1_cur_ptr = ps_err_prms->pu1_inp;
3437 pu1_ref_ptr = &ps_err_prms->pu1_ref[0];
3438
3439 /* Loop to compute the SADs */
3440 {
3441 memset(&au2_4x4_sad[0], 0, NUM_4X4 * sizeof(UWORD16));
3442 for(b = 0; b < NUM_4X4; b++)
3443 {
3444 WORD32 t1 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * cur_buf_stride_ls2;
3445 WORD32 t2 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * ref_buf_stride_ls2;
3446
3447 for(c = 0; c < NUM_ROWS_IN_4X4; c++)
3448 {
3449 WORD32 z_cur = (cur_buf_stride)*c + t1;
3450 WORD32 z_ref = (ref_buf_stride)*c + t2;
3451 for(d = 0; d < NUM_PIXELS_IN_ROW; d++)
3452 {
3453 au2_4x4_sad[b] += (UWORD16)ABS((
3454 ((S32)pu1_ref_ptr[(z_ref + d)]) - ((S32)pu1_cur_ptr[(z_cur + d)])));
3455 }
3456 }
3457 }
3458
3459 pi4_sad_grid[PART_ID_NxN_TL] =
3460 (au2_4x4_sad[0] + au2_4x4_sad[1] + au2_4x4_sad[4] + au2_4x4_sad[5]);
3461 pi4_sad_grid[PART_ID_NxN_TR] =
3462 (au2_4x4_sad[2] + au2_4x4_sad[3] + au2_4x4_sad[6] + au2_4x4_sad[7]);
3463 pi4_sad_grid[PART_ID_NxN_BL] =
3464 (au2_4x4_sad[8] + au2_4x4_sad[9] + au2_4x4_sad[12] + au2_4x4_sad[13]);
3465 pi4_sad_grid[PART_ID_NxN_BR] =
3466 (au2_4x4_sad[10] + au2_4x4_sad[11] + au2_4x4_sad[14] + au2_4x4_sad[15]);
3467 pi4_sad_grid[PART_ID_Nx2N_L] =
3468 pi4_sad_grid[PART_ID_NxN_TL] + pi4_sad_grid[PART_ID_NxN_BL];
3469 pi4_sad_grid[PART_ID_Nx2N_R] =
3470 pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_BR];
3471 pi4_sad_grid[PART_ID_2NxN_T] =
3472 pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_TL];
3473 pi4_sad_grid[PART_ID_2NxN_B] =
3474 pi4_sad_grid[PART_ID_NxN_BR] + pi4_sad_grid[PART_ID_NxN_BL];
3475 pi4_sad_grid[PART_ID_nLx2N_L] =
3476 (au2_4x4_sad[8] + au2_4x4_sad[0] + au2_4x4_sad[12] + au2_4x4_sad[4]);
3477 pi4_sad_grid[PART_ID_nRx2N_R] =
3478 (au2_4x4_sad[3] + au2_4x4_sad[7] + au2_4x4_sad[15] + au2_4x4_sad[11]);
3479 pi4_sad_grid[PART_ID_2NxnU_T] =
3480 (au2_4x4_sad[1] + au2_4x4_sad[0] + au2_4x4_sad[2] + au2_4x4_sad[3]);
3481 pi4_sad_grid[PART_ID_2NxnD_B] =
3482 (au2_4x4_sad[15] + au2_4x4_sad[14] + au2_4x4_sad[12] + au2_4x4_sad[13]);
3483 pi4_sad_grid[PART_ID_2Nx2N] =
3484 pi4_sad_grid[PART_ID_2NxN_T] + pi4_sad_grid[PART_ID_2NxN_B];
3485 pi4_sad_grid[PART_ID_2NxnU_B] =
3486 pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnU_T];
3487 pi4_sad_grid[PART_ID_2NxnD_T] =
3488 pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnD_B];
3489 pi4_sad_grid[PART_ID_nRx2N_L] =
3490 pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nRx2N_R];
3491 pi4_sad_grid[PART_ID_nLx2N_R] =
3492 pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nLx2N_L];
3493 }
3494 }
3495 /**********************************************************************/
3496 /* COMPUTE THE COSTS AND UPDATE THE BEST RESULTS (inlined below)    */
3497 /**********************************************************************/
3498 {
3499 S32 i4_count = 0, i4_sad, i4_mv_cost, i4_tot_cost;
3500 S32 *pi4_valid_part_ids = &ps_subpel_refine_ctxt->ai4_part_id[0];
3501 S32 best_node_cost;
3502 S32 second_best_node_cost;
3503
3504 /* For each valid partition, update the subpel refine ctxt structure to reflect
3505 the best candidate for that partition */
3506
3507 for(i4_count = 0; i4_count < ps_subpel_refine_ctxt->i4_num_valid_parts; i4_count++)
3508 {
3509 S32 update_required = 0;
3510 S32 part_id = pi4_valid_part_ids[i4_count];
3511 S32 index = (ps_subpel_refine_ctxt->i4_num_valid_parts > 8) ? part_id : i4_count;
3512
3513 /* Use a pre-computed cost instead of freshly evaluating subpel cost */
3514 i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[0][index];
3515
3516 /*Calculate total cost*/
3517 i4_sad = CLIP3(pi4_sad_grid[part_id], 0, 0x7fff);
3518 i4_tot_cost = CLIP_S16(i4_sad + i4_mv_cost);
3519
3520 /*****************************************************************/
3521 /* We do not labor through the results if the total cost is worse  */
3522 /* than the last of the results.                                    */
3523 /*****************************************************************/
3524 best_node_cost = CLIP_S16(ps_subpel_refine_ctxt->i2_tot_cost[0][index]);
3525 second_best_node_cost = SHRT_MAX;
3526
3527 if(i4_tot_cost < second_best_node_cost)
3528 {
3529 update_required = 0;
3530
3531 /*************************************************************/
3532 /* Identify where the current result is to be placed. Basically,     */
3533 /* find the node which has cost just higher than the node under test.*/
3534 /*************************************************************/
3535 if(i4_tot_cost < best_node_cost)
3536 {
3537 update_required = 1;
3538 }
3539 else if(i4_tot_cost == ps_subpel_refine_ctxt->i2_tot_cost[0][index])
3540 {
3541 update_required = 0;
3542 }
3543 if(update_required == 2)
3544 {
3545 ps_subpel_refine_ctxt->i2_tot_cost[1][index] = i4_tot_cost;
3546 ps_subpel_refine_ctxt->i2_mv_cost[1][index] = i4_mv_cost;
3547 ps_subpel_refine_ctxt->i2_mv_x[1][index] = ps_result_prms->i2_mv_x;
3548 ps_subpel_refine_ctxt->i2_mv_y[1][index] = ps_result_prms->i2_mv_y;
3549 ps_subpel_refine_ctxt->i2_ref_idx[1][index] = ps_result_prms->i1_ref_idx;
3550 }
3551 else if(update_required == 1)
3552 {
3553 ps_subpel_refine_ctxt->i2_tot_cost[0][index] = i4_tot_cost;
3554 ps_subpel_refine_ctxt->i2_mv_cost[0][index] = i4_mv_cost;
3555 ps_subpel_refine_ctxt->i2_mv_x[0][index] = ps_result_prms->i2_mv_x;
3556 ps_subpel_refine_ctxt->i2_mv_y[0][index] = ps_result_prms->i2_mv_y;
3557 ps_subpel_refine_ctxt->i2_ref_idx[0][index] = ps_result_prms->i1_ref_idx;
3558 }
3559 }
3560 }
3561 }
3562 }
3563
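/* If the best total cost of a partition saturated at the 16-bit maximum,
   also mark its full-pel SATD entry as invalid. */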
3564 {
3565 WORD32 i4_count = 0;
3566 for(i4_count = 0; i4_count < TOT_NUM_PARTS; i4_count++)
3567 {
3568 if(ps_subpel_refine_ctxt->i2_tot_cost[0][i4_count] >= MAX_SIGNED_16BIT_VAL)
3569 {
3570 ps_subpel_refine_ctxt->ai2_fullpel_satd[0][i4_count] = MAX_SIGNED_16BIT_VAL;
3571 }
3572 }
3573 }
3574 }
3575
3576 /**
3577 ********************************************************************************
3578 * @fn hme_calc_pt_sad_and_result_explicit(hme_search_prms_t *ps_search_prms,
3579 * wgt_pred_ctxt_t *ps_wt_inp_prms,
3580 * err_prms_t *ps_err_prms,
3581 * result_upd_prms_t *ps_result_prms,
3582 * U08 **ppu1_ref,
3583 * S32 i4_ref_stride)
3584 *
3585 * @brief Run through the provided candidates, compute the point SAD and
3586 * cost for each, and update the results in order
3587 *
3588 * @param[in] ps_search_prms
3589 * @param[in] ps_wt_inp_prms
3590 * @param[in] ps_err_prms
3591 * @param[out] ps_result_prms
3592 * @param[in] ppu1_ref
3593 * @param[in] i4_ref_stride
3594 *
3595 * @return None
3596 ********************************************************************************
3597 */
3598
3599 void hme_calc_pt_sad_and_result_explicit(
3600 hme_search_prms_t *ps_search_prms,
3601 wgt_pred_ctxt_t *ps_wt_inp_prms,
3602 err_prms_t *ps_err_prms,
3603 result_upd_prms_t *ps_result_prms,
3604 U08 **ppu1_ref,
3605 S32 i4_ref_stride)
3606 {
3607 WORD32 i4_grid_mask, i4_part_mask, i4_num_results, i4_candt, i4_num_nodes;
3608 WORD32 i4_inp_stride, i4_inp_off, i4_ref_offset;
3609
3610 search_node_t *ps_search_node;
3611 BLK_SIZE_T e_blk_size;
3612 PF_SAD_FXN_T pf_sad_fxn;
3613 PF_RESULT_FXN_T pf_hme_result_fxn;
3614
3615 i4_grid_mask = 0x1; /* Point SAD */
3616
3617 /* Get the parameters required */
3618 i4_part_mask = ps_search_prms->i4_part_mask;
3619 e_blk_size = ps_search_prms->e_blk_size;
3620 i4_num_results = (S32)ps_search_prms->ps_search_results->u1_num_results_per_part;
3621 i4_num_nodes = ps_search_prms->i4_num_search_nodes;
3622 ps_search_node = ps_search_prms->ps_search_nodes;
3623
3624 i4_inp_stride = ps_search_prms->i4_inp_stride;
3625 /* Move to the location of the search blk in inp buffer */
3626 i4_inp_off = ps_search_prms->i4_cu_x_off;
3627 i4_inp_off += ps_search_prms->i4_cu_y_off * i4_inp_stride;
3628 i4_ref_offset = (i4_ref_stride * ps_search_prms->i4_y_off) + ps_search_prms->i4_x_off;
3629
3630 pf_sad_fxn = hme_get_sad_fxn(e_blk_size, i4_grid_mask, i4_part_mask);
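/* The SAD function returned above is selected on block size, grid mask and
   partition mask; with a grid mask of 0x1 it evaluates only the central
   (point) position for every partition enabled in i4_part_mask. */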
3631 /**********************************************************************/
3632 /* we have a sparsely populated SAD grid of size 9x17. */
3633 /* the id of the results in the grid is shown */
3634 /* 5 2 6 */
3635 /* 1 0 3 */
3636 /* 7 4 8 */
3637 /* The motivation for choosing a grid like this is that */
3638 /* in case of no refinement, the central location is */
3639 /* the first entry in the grid */
3640 /* Also for diamond, the 4 entries get considered first */
3641 /* This is consistent with the diamond notation used in */
3642 /* subpel refinement (to be verified). */
3643 /* Update the results for the given search candt */
3644 /* returns the cost of the 2Nx2N partition */
3645 /**********************************************************************/
3646
3647 /* Use the modified result-update function, which applies CLIP16 to the */
3648 /* cost so that the C output matches the SIMD path.                     */
3649 pf_hme_result_fxn = hme_update_results_grid_pu_bestn_no_encode;
3650
3651 for(i4_candt = 0; i4_candt < i4_num_nodes; i4_candt++)
3652 {
3653 if(ps_search_node->s_mv.i2_mvx == INTRA_MV)
3654 continue;
3655
3656 /* Initialize the minimum cost for this candidate. As we search around   */
3657 /* this candidate, this is used to check for early exit, i.e. when, in    */
3658 /* any given iteration, the center point of the grid has the lowest value */
3659 ps_result_prms->i4_min_cost = MAX_32BIT_VAL;
3660
3661 ps_err_prms->pu1_inp = ps_wt_inp_prms->apu1_wt_inp[ps_search_node->i1_ref_idx] + i4_inp_off;
3662 ps_err_prms->i4_grid_mask = i4_grid_mask;
3663
3664 ps_err_prms->pu1_ref = ppu1_ref[ps_search_node->i1_ref_idx] + i4_ref_offset;
3665 ps_err_prms->pu1_ref += ps_search_node->s_mv.i2_mvx;
3666 ps_err_prms->pu1_ref += (ps_search_node->s_mv.i2_mvy * i4_ref_stride);
3667
3668 /**********************************************************************/
3669 /* CALL THE FUNCTION THAT COMPUTES THE SAD AND UPDATES THE SAD GRID */
3670 /**********************************************************************/
3671 pf_sad_fxn(ps_err_prms);
3672
3673 /**********************************************************************/
3674 /* CALL THE FUNCTION THAT COMPUTES AND UPDATES THE BEST RESULTS */
3675 /**********************************************************************/
3676 ps_result_prms->i4_grid_mask = i4_grid_mask;
3677 ps_result_prms->ps_search_node_base = ps_search_node;
3678 pf_hme_result_fxn(ps_result_prms);
3679
3680 ps_search_node++;
3681 }
3682 }
3683
3684 /**
3685 ********************************************************************************
3686 * @fn hme_set_mvp_node(search_results_t *ps_search_results,
3687 * search_node_t *ps_candt_prj_coloc,
3688 * S08 i1_ref_idx)
3689 *
3690 * @brief Set the node used for motion vector predictor (MVP) computation.
3691 * Either the TR or the L candidate is compared against the projected
3692 * colocated candidate, and the closest one is chosen as the MVP.
3693 *
3694 * @param[in] ps_search_results
3695 *
3696 * @param[in] ps_candt_prj_coloc
3697 *
3698 * @param[in] i1_ref_idx
3699 *
3700 * @return None
3701 ********************************************************************************
3702 */
3703 void hme_set_mvp_node(
3704 search_results_t *ps_search_results,
3705 search_node_t *ps_candt_prj_coloc,
3706 U08 u1_pred_lx,
3707 U08 u1_default_ref_id)
3708 {
3709 S32 i;
3710 pred_ctxt_t *ps_pred_ctxt = &ps_search_results->as_pred_ctxt[u1_pred_lx];
3711 pred_candt_nodes_t *ps_pred_nodes = ps_pred_ctxt->as_pred_nodes;
3712 search_node_t *ps_pred_node_a = NULL, *ps_pred_node_b = NULL;
3713
3714 S32 inp_shift = 2;
3715 S32 pred_shift;
3716 S32 ref_bits;
3717 S32 mv_p_x, mv_p_y;
3718 S16 mvdx1, mvdx2, mvdy1, mvdy2;
3719
3720 ref_bits = ps_pred_ctxt->ppu1_ref_bits_tlu[u1_pred_lx][u1_default_ref_id];
3721
3722 /*************************************************************************/
3723 /* Priority is given to bottom-left availability; else we go to the left.*/
3724 /* If both are unavailable, then candidate 'a' remains NULL.              */
3725 /*************************************************************************/
3726 if(ps_pred_nodes->ps_l->u1_is_avail)
3727 {
3728 ps_pred_node_a = ps_pred_nodes->ps_l;
3729 }
3730
3731 if((!(ps_pred_ctxt->proj_used) && (ps_pred_nodes->ps_tr->u1_is_avail)))
3732 {
3733 ps_pred_node_b = ps_pred_nodes->ps_tr;
3734 }
3735 else
3736 {
3737 ps_pred_node_b = ps_pred_nodes->ps_coloc;
3738 ps_pred_node_b->s_mv = ps_pred_node_b->ps_mv[0];
3739 }
3740
3741 if(ps_pred_node_a == NULL)
3742 {
3743 ps_pred_node_a = ps_pred_nodes->ps_coloc;
3744 ps_pred_node_a->s_mv = ps_pred_node_a->ps_mv[0];
3745
3746 if(ps_pred_node_b == ps_pred_nodes->ps_coloc)
3747 {
3748 ps_pred_node_b = ps_pred_nodes->ps_zeromv;
3749 ps_pred_node_b->s_mv = ps_pred_node_b->ps_mv[0];
3750 }
3751 }
3752
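/* Candidate A falls back to the colocated MV when the left neighbour is
   unavailable (handled above). Below, a neighbour that refers to a different
   reference picture is first scaled to the default reference (presumably by
   POC-delta ratio via SCALE_FOR_POC_DELTA) before its MV difference against
   the projected colocated candidate is computed. */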
3753 if(ps_pred_node_a->i1_ref_idx != u1_default_ref_id)
3754 {
3755 SCALE_FOR_POC_DELTA(
3756 mv_p_x, mv_p_y, ps_pred_node_a, u1_default_ref_id, ps_pred_ctxt->pi2_ref_scf);
3757 }
3758 else
3759 {
3760 mv_p_x = ps_pred_node_a->s_mv.i2_mvx;
3761 mv_p_y = ps_pred_node_a->s_mv.i2_mvy;
3762 }
3763 pred_shift = ps_pred_node_a->u1_subpel_done ? 0 : 2;
3764 COMPUTE_MV_DIFFERENCE(mvdx1, mvdy1, ps_candt_prj_coloc, mv_p_x, mv_p_y, inp_shift, pred_shift);
3765 mvdx1 = ABS(mvdx1);
3766 mvdy1 = ABS(mvdy1);
3767
3768 if(ps_pred_node_b->i1_ref_idx != u1_default_ref_id)
3769 {
3770 SCALE_FOR_POC_DELTA(
3771 mv_p_x, mv_p_y, ps_pred_node_b, u1_default_ref_id, ps_pred_ctxt->pi2_ref_scf);
3772 }
3773 else
3774 {
3775 mv_p_x = ps_pred_node_b->s_mv.i2_mvx;
3776 mv_p_y = ps_pred_node_b->s_mv.i2_mvy;
3777 }
3778 pred_shift = ps_pred_node_b->u1_subpel_done ? 0 : 2;
3779 COMPUTE_MV_DIFFERENCE(mvdx2, mvdy2, ps_candt_prj_coloc, mv_p_x, mv_p_y, inp_shift, pred_shift);
3780 mvdx2 = ABS(mvdx2);
3781 mvdy2 = ABS(mvdy2);
3782
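/* Pick as MVP whichever predictor is closer to the projected colocated
   candidate in terms of the L1 norm of the MV difference, and use it for
   all partitions. */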
3783 if((mvdx1 + mvdy1) < (mvdx2 + mvdy2))
3784 {
3785 for(i = 0; i < TOT_NUM_PARTS; i++)
3786 {
3787 ps_pred_nodes[i].ps_mvp_node = ps_pred_node_a;
3788 }
3789 }
3790 else
3791 {
3792 for(i = 0; i < TOT_NUM_PARTS; i++)
3793 {
3794 ps_pred_nodes[i].ps_mvp_node = ps_pred_node_b;
3795 }
3796 }
3797 }
3798