1 /******************************************************************************
2 *
3 * Copyright (C) 2018 The Android Open Source Project
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at:
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 *
17 *****************************************************************************
18 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19 */
20
21 /**
22 ******************************************************************************
23 * @file hme_subpel.c
24 *
25 * @brief
26 * Subpel refinement modules for ME algo
27 *
28 * @author
29 * Ittiam
30 *
31 *
32 * List of Functions
33 * hme_qpel_interp_avg()
34 * hme_subpel_refine_ctblist_bck()
35 * hme_subpel_refine_ctblist_fwd()
36 * hme_refine_bidirect()
37 * hme_subpel_refinement()
38 * hme_subpel_refine_ctb_fwd()
39 * hme_subpel_refine_ctb_bck()
40 * hme_create_bck_inp()
41 * hme_subpel_refine_search_node()
42 ******************************************************************************
43 */
44
45 /*****************************************************************************/
46 /* File Includes */
47 /*****************************************************************************/
48 /* System include files */
49 #include <stdio.h>
50 #include <string.h>
51 #include <stdlib.h>
52 #include <assert.h>
53 #include <stdarg.h>
54 #include <math.h>
55 #include <limits.h>
56
57 /* User include files */
58 #include "ihevc_typedefs.h"
59 #include "itt_video_api.h"
60 #include "ihevce_api.h"
61
62 #include "rc_cntrl_param.h"
63 #include "rc_frame_info_collector.h"
64 #include "rc_look_ahead_params.h"
65
66 #include "ihevc_defs.h"
67 #include "ihevc_structs.h"
68 #include "ihevc_platform_macros.h"
69 #include "ihevc_deblk.h"
70 #include "ihevc_itrans_recon.h"
71 #include "ihevc_chroma_itrans_recon.h"
72 #include "ihevc_chroma_intra_pred.h"
73 #include "ihevc_intra_pred.h"
74 #include "ihevc_inter_pred.h"
75 #include "ihevc_mem_fns.h"
76 #include "ihevc_padding.h"
77 #include "ihevc_weighted_pred.h"
78 #include "ihevc_sao.h"
79 #include "ihevc_resi_trans.h"
80 #include "ihevc_quant_iquant_ssd.h"
81 #include "ihevc_cabac_tables.h"
82
83 #include "ihevce_defs.h"
84 #include "ihevce_lap_enc_structs.h"
85 #include "ihevce_multi_thrd_structs.h"
86 #include "ihevce_multi_thrd_funcs.h"
87 #include "ihevce_me_common_defs.h"
88 #include "ihevce_had_satd.h"
89 #include "ihevce_error_codes.h"
90 #include "ihevce_bitstream.h"
91 #include "ihevce_cabac.h"
92 #include "ihevce_rdoq_macros.h"
93 #include "ihevce_function_selector.h"
94 #include "ihevce_enc_structs.h"
95 #include "ihevce_entropy_structs.h"
96 #include "ihevce_cmn_utils_instr_set_router.h"
97 #include "ihevce_enc_loop_structs.h"
98 #include "ihevce_bs_compute_ctb.h"
99 #include "ihevce_global_tables.h"
100 #include "ihevce_dep_mngr_interface.h"
101 #include "hme_datatype.h"
102 #include "hme_interface.h"
103 #include "hme_common_defs.h"
104 #include "hme_defs.h"
105 #include "ihevce_me_instr_set_router.h"
106 #include "hme_globals.h"
107 #include "hme_utils.h"
108 #include "hme_coarse.h"
109 #include "hme_fullpel.h"
110 #include "hme_subpel.h"
111 #include "hme_refine.h"
112 #include "hme_err_compute.h"
113 #include "hme_common_utils.h"
114 #include "hme_search_algo.h"
115 #include "ihevce_stasino_helpers.h"
116 #include "ihevce_common_utils.h"
117
118 /*****************************************************************************/
119 /* Function Definitions */
120 /*****************************************************************************/
hme_qpel_interp_avg(interp_prms_t * ps_prms,S32 i4_mv_x,S32 i4_mv_y,S32 i4_buf_id)121 void hme_qpel_interp_avg(interp_prms_t *ps_prms, S32 i4_mv_x, S32 i4_mv_y, S32 i4_buf_id)
122 {
123 U08 *pu1_src1, *pu1_src2, *pu1_dst;
124 qpel_input_buf_cfg_t *ps_inp_cfg;
125 S32 i4_mv_x_frac, i4_mv_y_frac, i4_offset;
126
127 /*************************************************************************/
128 /* For a given QPEL pt, we need to determine the 2 source pts that are */
129 /* needed to do the QPEL averaging. The logic to do this is as follows */
130 /* i4_mv_x and i4_mv_y are the motion vectors in QPEL units that are */
131 /* pointing to the pt of interest. Obviously, they are w.r.t. the 0,0 */
132 /* pt of th reference blk that is colocated to the inp blk. */
133 /* A j E k B */
134 /* l m n o p */
135 /* F q G r H */
136 /* s t u v w */
137 /* C x I y D */
138 /* In above diagram, A. B, C, D are full pts at offsets (0,0),(1,0),(0,1)*/
139 /* and (1,1) respectively in the fpel buffer (id = 0) */
140 /* E and I are hxfy pts in offsets (0,0),(0,1) respectively in hxfy buf */
141 /* F and H are fxhy pts in offsets (0,0),(1,0) respectively in fxhy buf */
142 /* G is hxhy pt in offset 0,0 in hxhy buf */
143 /* All above offsets are computed w.r.t. motion displaced pt in */
144 /* respective bufs. This means that A corresponds to (i4_mv_x >> 2) and */
145 /* (i4_mv_y >> 2) in fxfy buf. Ditto with E, F and G */
146 /* fxfy buf is buf id 0, hxfy is buf id 1, fxhy is buf id 2, hxhy is 3 */
147 /* If we consider pt v to be derived. v has a fractional comp of 3, 3 */
148 /* v is avg of H and I. So the table look up of v should give following */
149 /* buf 1 (H) : offset = (1, 0) buf id = 2. */
150 /* buf 2 (I) : offset = 0 , 1) buf id = 1. */
151 /* NOTE: For pts that are fxfy/hxfy/fxhy/hxhy, bufid 1 will be -1. */
152 /*************************************************************************/
153 i4_mv_x_frac = i4_mv_x & 3;
154 i4_mv_y_frac = i4_mv_y & 3;
155
156 i4_offset = (i4_mv_x >> 2) + (i4_mv_y >> 2) * ps_prms->i4_ref_stride;
157
158 /* Derive the descriptor that has all offset and size info */
159 ps_inp_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y_frac][i4_mv_x_frac];
160
161 if(ps_inp_cfg->i1_buf_id1 == ps_inp_cfg->i1_buf_id2)
162 {
163 /* This is case for fxfy/hxfy/fxhy/hxhy */
164 ps_prms->pu1_final_out = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1];
165 ps_prms->pu1_final_out += ps_inp_cfg->i1_buf_xoff1 + i4_offset;
166 ps_prms->pu1_final_out += (ps_inp_cfg->i1_buf_yoff1 * ps_prms->i4_ref_stride);
167 ps_prms->i4_final_out_stride = ps_prms->i4_ref_stride;
168
169 return;
170 }
171
172 pu1_src1 = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1];
173 pu1_src1 += ps_inp_cfg->i1_buf_xoff1 + i4_offset;
174 pu1_src1 += (ps_inp_cfg->i1_buf_yoff1 * ps_prms->i4_ref_stride);
175
176 pu1_src2 = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id2];
177 pu1_src2 += ps_inp_cfg->i1_buf_xoff2 + i4_offset;
178 pu1_src2 += (ps_inp_cfg->i1_buf_yoff2 * ps_prms->i4_ref_stride);
179
180 pu1_dst = ps_prms->apu1_interp_out[i4_buf_id];
181 hevc_avg_2d(
182 pu1_src1,
183 pu1_src2,
184 ps_prms->i4_ref_stride,
185 ps_prms->i4_ref_stride,
186 ps_prms->i4_blk_wd,
187 ps_prms->i4_blk_ht,
188 pu1_dst,
189 ps_prms->i4_out_stride);
190 ps_prms->pu1_final_out = pu1_dst;
191 ps_prms->i4_final_out_stride = ps_prms->i4_out_stride;
192 }
193
hme_qpel_interp_avg_2pt_vert_no_reuse(interp_prms_t * ps_prms,S32 i4_mv_x,S32 i4_mv_y,U08 ** ppu1_final,S32 * pi4_final_stride,FT_QPEL_INTERP_AVG_1PT * pf_qpel_interp_avg_1pt)194 static __inline void hme_qpel_interp_avg_2pt_vert_no_reuse(
195 interp_prms_t *ps_prms,
196 S32 i4_mv_x,
197 S32 i4_mv_y,
198 U08 **ppu1_final,
199 S32 *pi4_final_stride,
200 FT_QPEL_INTERP_AVG_1PT *pf_qpel_interp_avg_1pt)
201 {
202 pf_qpel_interp_avg_1pt(ps_prms, i4_mv_x, i4_mv_y + 1, 3, ppu1_final, pi4_final_stride);
203
204 pf_qpel_interp_avg_1pt(ps_prms, i4_mv_x, i4_mv_y - 1, 1, ppu1_final, pi4_final_stride);
205 }
206
hme_qpel_interp_avg_2pt_horz_no_reuse(interp_prms_t * ps_prms,S32 i4_mv_x,S32 i4_mv_y,U08 ** ppu1_final,S32 * pi4_final_stride,FT_QPEL_INTERP_AVG_1PT * pf_qpel_interp_avg_1pt)207 static __inline void hme_qpel_interp_avg_2pt_horz_no_reuse(
208 interp_prms_t *ps_prms,
209 S32 i4_mv_x,
210 S32 i4_mv_y,
211 U08 **ppu1_final,
212 S32 *pi4_final_stride,
213 FT_QPEL_INTERP_AVG_1PT *pf_qpel_interp_avg_1pt)
214 {
215 pf_qpel_interp_avg_1pt(ps_prms, i4_mv_x + 1, i4_mv_y, 2, ppu1_final, pi4_final_stride);
216
217 pf_qpel_interp_avg_1pt(ps_prms, i4_mv_x - 1, i4_mv_y, 0, ppu1_final, pi4_final_stride);
218 }
219
220 /********************************************************************************
221 * @fn hme_qpel_interp_comprehensive
222 *
223 * @brief Interpolates 2 qpel points by hpel averaging
224 *
225 * @param[in,out] ps_prms: Both input buffer ptrs and location of output
226 *
227 * @param[in] i4_mv_x : x component of motion vector in QPEL units
228 *
229 * @param[in] i4_mv_y : y component of motion vector in QPEL units
230 *
231 * @param[in] i4_grid_mask : mask which determines qpels to be computed
232 *
233 * @param[out] ppu1_final : storage for final buffer pointers
234 *
235 * @param[out] pi4_final_stride : storage for final buffer strides
236 *
237 * @return None
238 ********************************************************************************
239 */
hme_qpel_interp_comprehensive(interp_prms_t * ps_prms,U08 ** ppu1_final,S32 * pi4_final_stride,S32 i4_mv_x,S32 i4_mv_y,S32 i4_grid_mask,ihevce_me_optimised_function_list_t * ps_me_optimised_function_list)240 static __inline void hme_qpel_interp_comprehensive(
241 interp_prms_t *ps_prms,
242 U08 **ppu1_final,
243 S32 *pi4_final_stride,
244 S32 i4_mv_x,
245 S32 i4_mv_y,
246 S32 i4_grid_mask,
247 ihevce_me_optimised_function_list_t *ps_me_optimised_function_list)
248 {
249 S32 pt_select_for_TB, pt_select_for_LR;
250 S32 dx, dy, dydx;
251 S32 vert_func_selector, horz_func_selector;
252
253 S32 i4_ref_stride = ps_prms->i4_ref_stride;
254
255 pt_select_for_TB =
256 ((i4_grid_mask & (1 << PT_B)) >> PT_B) + ((i4_grid_mask & (1 << PT_T)) >> (PT_T - 1));
257
258 pt_select_for_LR =
259 ((i4_grid_mask & (1 << PT_R)) >> PT_R) + ((i4_grid_mask & (1 << PT_L)) >> (PT_L - 1));
260
261 dx = (i4_mv_x & 3);
262 dy = (i4_mv_y & 3);
263 dydx = (dx + (dy << 2));
264
265 vert_func_selector = gai4_select_qpel_function_vert[pt_select_for_TB][dydx];
266 horz_func_selector = gai4_select_qpel_function_horz[pt_select_for_LR][dydx];
267
268 /* case descriptions */
269 /* Let T = (gridmask & T) & B = (gridmask & B) */
270 /* & hp = pt is an hpel or an fpel */
271 /* & r = reuse possible */
272 /* 0 => T || B = 0 */
273 /* 1 => (!T) && (B) && hp */
274 /* 2 => (T) && (!B) && hp */
275 /* 3 => (!T) && (B) && !hp */
276 /* 4 => (T) && (!B) && !hp */
277 /* 5 => (T) && (B) && !hp && r */
278 /* 6 => (T) && (B) && !hp && !r */
279 /* 7 => (T) && (B) && hp */
280
281 switch(vert_func_selector)
282 {
283 case 0:
284 {
285 break;
286 }
287 case 1:
288 {
289 S32 i4_mv_x_frac, i4_mv_y_frac, i4_offset;
290 qpel_input_buf_cfg_t *ps_inp_cfg;
291 S32 i4_mvyp1 = (i4_mv_y + 1);
292
293 i4_mv_x_frac = dx;
294 i4_mv_y_frac = i4_mvyp1 & 3;
295
296 i4_offset = (i4_mv_x >> 2) + (i4_mvyp1 >> 2) * i4_ref_stride;
297
298 /* Derive the descriptor that has all offset and size info */
299 ps_inp_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y_frac][i4_mv_x_frac];
300
301 ppu1_final[3] = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1];
302 ppu1_final[3] += ps_inp_cfg->i1_buf_xoff1 + i4_offset;
303 ppu1_final[3] += (ps_inp_cfg->i1_buf_yoff1 * i4_ref_stride);
304 pi4_final_stride[3] = i4_ref_stride;
305
306 break;
307 }
308 case 2:
309 {
310 S32 i4_mv_x_frac, i4_mv_y_frac, i4_offset;
311 qpel_input_buf_cfg_t *ps_inp_cfg;
312 S32 i4_mvym1 = (i4_mv_y - 1);
313
314 i4_mv_x_frac = dx;
315 i4_mv_y_frac = i4_mvym1 & 3;
316
317 i4_offset = (i4_mv_x >> 2) + (i4_mvym1 >> 2) * i4_ref_stride;
318
319 /* Derive the descriptor that has all offset and size info */
320 ps_inp_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y_frac][i4_mv_x_frac];
321
322 ppu1_final[1] = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1];
323 ppu1_final[1] += ps_inp_cfg->i1_buf_xoff1 + i4_offset;
324 ppu1_final[1] += (ps_inp_cfg->i1_buf_yoff1 * i4_ref_stride);
325 pi4_final_stride[1] = i4_ref_stride;
326
327 break;
328 }
329 case 3:
330 {
331 ps_me_optimised_function_list->pf_qpel_interp_avg_1pt(
332 ps_prms, i4_mv_x, i4_mv_y + 1, 3, ppu1_final, pi4_final_stride);
333
334 break;
335 }
336 case 4:
337 {
338 ps_me_optimised_function_list->pf_qpel_interp_avg_1pt(
339 ps_prms, i4_mv_x, i4_mv_y - 1, 1, ppu1_final, pi4_final_stride);
340
341 break;
342 }
343 case 5:
344 {
345 ps_me_optimised_function_list->pf_qpel_interp_avg_2pt_vert_with_reuse(
346 ps_prms, i4_mv_x, i4_mv_y, ppu1_final, pi4_final_stride);
347 break;
348 }
349 case 6:
350 {
351 hme_qpel_interp_avg_2pt_vert_no_reuse(
352 ps_prms,
353 i4_mv_x,
354 i4_mv_y,
355 ppu1_final,
356 pi4_final_stride,
357 ps_me_optimised_function_list->pf_qpel_interp_avg_1pt);
358 break;
359 }
360 case 7:
361 {
362 S32 i4_mv_x_frac, i4_mv_y_frac, i4_offset;
363 qpel_input_buf_cfg_t *ps_inp_cfg;
364
365 S32 i4_mvyp1 = (i4_mv_y + 1);
366 S32 i4_mvym1 = (i4_mv_y - 1);
367
368 i4_mv_x_frac = dx;
369 i4_mv_y_frac = i4_mvyp1 & 3;
370
371 i4_offset = (i4_mv_x >> 2) + (i4_mvyp1 >> 2) * i4_ref_stride;
372
373 /* Derive the descriptor that has all offset and size info */
374 ps_inp_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y_frac][i4_mv_x_frac];
375
376 ppu1_final[3] = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1];
377 ppu1_final[3] += ps_inp_cfg->i1_buf_xoff1 + i4_offset;
378 ppu1_final[3] += (ps_inp_cfg->i1_buf_yoff1 * i4_ref_stride);
379 pi4_final_stride[3] = i4_ref_stride;
380
381 i4_mv_y_frac = i4_mvym1 & 3;
382
383 i4_offset = (i4_mv_x >> 2) + (i4_mvym1 >> 2) * i4_ref_stride;
384
385 /* Derive the descriptor that has all offset and size info */
386 ps_inp_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y_frac][i4_mv_x_frac];
387
388 ppu1_final[1] = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1];
389 ppu1_final[1] += ps_inp_cfg->i1_buf_xoff1 + i4_offset;
390 ppu1_final[1] += (ps_inp_cfg->i1_buf_yoff1 * i4_ref_stride);
391 pi4_final_stride[1] = i4_ref_stride;
392
393 break;
394 }
395 }
396
397 /* case descriptions */
398 /* Let L = (gridmask & L) & R = (gridmask & R) */
399 /* & hp = pt is an hpel or an fpel */
400 /* & r = reuse possible */
401 /* 0 => L || R = 0 */
402 /* 1 => (!L) && (R) && hp */
403 /* 2 => (L) && (!R) && hp */
404 /* 3 => (!L) && (R) && !hp */
405 /* 4 => (L) && (!R) && !hp */
406 /* 5 => (L) && (R) && !hp && r */
407 /* 6 => (L) && (R) && !hp && !r */
408 /* 7 => (L) && (R) && hp */
409
410 switch(horz_func_selector)
411 {
412 case 0:
413 {
414 break;
415 }
416 case 1:
417 {
418 S32 i4_mv_x_frac, i4_mv_y_frac, i4_offset;
419 qpel_input_buf_cfg_t *ps_inp_cfg;
420 S32 i4_mvxp1 = (i4_mv_x + 1);
421
422 i4_mv_x_frac = i4_mvxp1 & 3;
423 i4_mv_y_frac = dy;
424
425 i4_offset = (i4_mvxp1 >> 2) + (i4_mv_y >> 2) * i4_ref_stride;
426
427 /* Derive the descriptor that has all offset and size info */
428 ps_inp_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y_frac][i4_mv_x_frac];
429
430 ppu1_final[2] = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1];
431 ppu1_final[2] += ps_inp_cfg->i1_buf_xoff1 + i4_offset;
432 ppu1_final[2] += (ps_inp_cfg->i1_buf_yoff1 * i4_ref_stride);
433 pi4_final_stride[2] = i4_ref_stride;
434
435 break;
436 }
437 case 2:
438 {
439 S32 i4_mv_x_frac, i4_mv_y_frac, i4_offset;
440 qpel_input_buf_cfg_t *ps_inp_cfg;
441 S32 i4_mvxm1 = (i4_mv_x - 1);
442
443 i4_mv_x_frac = i4_mvxm1 & 3;
444 i4_mv_y_frac = dy;
445
446 i4_offset = (i4_mvxm1 >> 2) + (i4_mv_y >> 2) * i4_ref_stride;
447
448 /* Derive the descriptor that has all offset and size info */
449 ps_inp_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y_frac][i4_mv_x_frac];
450
451 ppu1_final[0] = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1];
452 ppu1_final[0] += ps_inp_cfg->i1_buf_xoff1 + i4_offset;
453 ppu1_final[0] += (ps_inp_cfg->i1_buf_yoff1 * i4_ref_stride);
454 pi4_final_stride[0] = i4_ref_stride;
455
456 break;
457 }
458 case 3:
459 {
460 ps_me_optimised_function_list->pf_qpel_interp_avg_1pt(
461 ps_prms, i4_mv_x + 1, i4_mv_y, 2, ppu1_final, pi4_final_stride);
462
463 break;
464 }
465 case 4:
466 {
467 ps_me_optimised_function_list->pf_qpel_interp_avg_1pt(
468 ps_prms, i4_mv_x - 1, i4_mv_y, 0, ppu1_final, pi4_final_stride);
469
470 break;
471 }
472 case 5:
473 {
474 ps_me_optimised_function_list->pf_qpel_interp_avg_2pt_horz_with_reuse(
475 ps_prms, i4_mv_x, i4_mv_y, ppu1_final, pi4_final_stride);
476 break;
477 }
478 case 6:
479 {
480 hme_qpel_interp_avg_2pt_horz_no_reuse(
481 ps_prms,
482 i4_mv_x,
483 i4_mv_y,
484 ppu1_final,
485 pi4_final_stride,
486 ps_me_optimised_function_list->pf_qpel_interp_avg_1pt);
487 break;
488 }
489 case 7:
490 {
491 S32 i4_mv_x_frac, i4_mv_y_frac, i4_offset;
492 qpel_input_buf_cfg_t *ps_inp_cfg;
493
494 S32 i4_mvxp1 = (i4_mv_x + 1);
495 S32 i4_mvxm1 = (i4_mv_x - 1);
496
497 i4_mv_x_frac = i4_mvxp1 & 3;
498 i4_mv_y_frac = dy;
499
500 i4_offset = (i4_mvxp1 >> 2) + (i4_mv_y >> 2) * i4_ref_stride;
501
502 /* Derive the descriptor that has all offset and size info */
503 ps_inp_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y_frac][i4_mv_x_frac];
504
505 ppu1_final[2] = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1];
506 ppu1_final[2] += ps_inp_cfg->i1_buf_xoff1 + i4_offset;
507 ppu1_final[2] += (ps_inp_cfg->i1_buf_yoff1 * i4_ref_stride);
508 pi4_final_stride[2] = i4_ref_stride;
509
510 i4_mv_x_frac = i4_mvxm1 & 3;
511
512 i4_offset = (i4_mvxm1 >> 2) + (i4_mv_y >> 2) * i4_ref_stride;
513
514 /* Derive the descriptor that has all offset and size info */
515 ps_inp_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y_frac][i4_mv_x_frac];
516
517 ppu1_final[0] = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1];
518 ppu1_final[0] += ps_inp_cfg->i1_buf_xoff1 + i4_offset;
519 ppu1_final[0] += (ps_inp_cfg->i1_buf_yoff1 * i4_ref_stride);
520 pi4_final_stride[0] = i4_ref_stride;
521
522 break;
523 }
524 }
525 }
526
527 /**
528 ********************************************************************************
529 * @fn S32 hme_compute_pred_and_evaluate_bi(hme_subpel_prms_t *ps_prms,
530 * search_results_t *ps_search_results,
531 * layer_ctxt_t *ps_curr_layer,
532 * U08 **ppu1_pred)
533 *
534 *
535 * @brief Evaluates the best bipred cost as avg(P0, P1) where P0 and P1 are
536 * best L0 and L1 bufs respectively for the entire CU
537 *
538 * @param[in] ps_prms: subpel prms input to this function
539 *
540 * @param[in] ps_curr_layer: points to the current layer ctxt
541 *
542 * @return The best BI cost of best uni cost, whichever better
543 ********************************************************************************
544 */
hme_compute_pred_and_evaluate_bi(inter_cu_results_t * ps_cu_results,inter_pu_results_t * ps_pu_results,inter_ctb_prms_t * ps_inter_ctb_prms,part_type_results_t * ps_part_type_result,ULWORD64 * pu8_winning_pred_sigmaXSquare,ULWORD64 * pu8_winning_pred_sigmaX,ihevce_cmn_opt_func_t * ps_cmn_utils_optimised_function_list,ihevce_me_optimised_function_list_t * ps_me_optimised_function_list)545 void hme_compute_pred_and_evaluate_bi(
546 inter_cu_results_t *ps_cu_results,
547 inter_pu_results_t *ps_pu_results,
548 inter_ctb_prms_t *ps_inter_ctb_prms,
549 part_type_results_t *ps_part_type_result,
550 ULWORD64 *pu8_winning_pred_sigmaXSquare,
551 ULWORD64 *pu8_winning_pred_sigmaX,
552 ihevce_cmn_opt_func_t *ps_cmn_utils_optimised_function_list,
553 ihevce_me_optimised_function_list_t *ps_me_optimised_function_list)
554 {
555 /* Idx0 - Uni winner */
556 /* Idx1 - Uni runner-up */
557 /* Idx2 - Bi winner */
558 hme_pred_buf_info_t as_pred_buf_data[3][NUM_INTER_PU_PARTS];
559 err_prms_t s_err_prms;
560 interp_prms_t s_interp_prms;
561
562 PF_SAD_FXN_T pf_err_compute;
563
564 S32 i, j;
565 S32 x_off, y_off, x_pic, y_pic;
566 S32 i4_sad_grid;
567 U08 e_cu_size;
568 S32 i4_part_type;
569 U08 u1_cu_size;
570 S32 shift;
571 S32 x_part, y_part, num_parts;
572 S32 inp_stride, ref_stride;
573 U08 au1_pred_buf_array_indixes[3];
574 S32 cur_iter_best_cost;
575 S32 uni_cost, bi_cost, best_cost, tot_cost;
576 /* Idx0 - Uni winner */
577 /* Idx1 - Bi winner */
578 ULWORD64 au8_sigmaX[2][NUM_INTER_PU_PARTS];
579 ULWORD64 au8_sigmaXSquared[2][NUM_INTER_PU_PARTS];
580 #if USE_NOISE_TERM_DURING_BICAND_SEARCH
581 S32 i4_noise_term;
582 #endif
583
584 interp_prms_t *ps_interp_prms = &s_interp_prms;
585
586 S32 best_cand_in_opp_dir_idx = 0;
587 S32 is_best_cand_an_intra = 0;
588 U08 u1_is_cu_noisy = ps_inter_ctb_prms->u1_is_cu_noisy;
589 #if USE_NOISE_TERM_DURING_BICAND_SEARCH
590 const S32 i4_default_src_wt = ((1 << 15) + (WGHT_DEFAULT >> 1)) / WGHT_DEFAULT;
591 #endif
592 tot_cost = 0;
593
594 /* Start of the CU w.r.t. CTB */
595 x_off = ps_cu_results->u1_x_off;
596 y_off = ps_cu_results->u1_y_off;
597
598 inp_stride = ps_inter_ctb_prms->i4_inp_stride;
599 ref_stride = ps_inter_ctb_prms->i4_rec_stride;
600
601 ps_interp_prms->i4_ref_stride = ref_stride;
602
603 /* Start of the CU w.r.t. Pic 0,0 */
604 x_pic = x_off + ps_inter_ctb_prms->i4_ctb_x_off;
605 y_pic = y_off + ps_inter_ctb_prms->i4_ctb_y_off;
606
607 u1_cu_size = ps_cu_results->u1_cu_size;
608 e_cu_size = u1_cu_size;
609 shift = (S32)e_cu_size;
610 i4_part_type = ps_part_type_result->u1_part_type;
611 num_parts = gau1_num_parts_in_part_type[i4_part_type];
612
613 for(i = 0; i < 3; i++)
614 {
615 hme_init_pred_buf_info(
616 &as_pred_buf_data[i],
617 &ps_inter_ctb_prms->s_pred_buf_mngr,
618 (ps_part_type_result->as_pu_results->pu.b4_wd + 1) << 2,
619 (ps_part_type_result->as_pu_results->pu.b4_ht + 1) << 2,
620 (PART_TYPE_T)i4_part_type);
621
622 au1_pred_buf_array_indixes[i] = as_pred_buf_data[i][0].u1_pred_buf_array_id;
623 }
624
625 for(j = 0; j < num_parts; j++)
626 {
627 UWORD8 *apu1_hpel_ref[2][4];
628 PART_ID_T e_part_id;
629 BLK_SIZE_T e_blk_size;
630 WORD8 i1_ref_idx;
631 UWORD8 pred_dir;
632 WORD32 ref_offset, inp_offset, wd, ht;
633 pu_result_t *ps_pu_node1, *ps_pu_node2, *ps_pu_result;
634 mv_t *aps_mv[2];
635 UWORD8 num_active_ref_opp;
636 UWORD8 num_results_per_part;
637 WORD32 luma_weight_ref1, luma_offset_ref1;
638 WORD32 luma_weight_ref2, luma_offset_ref2;
639 WORD32 pu_node2_found = 0;
640
641 e_part_id = ge_part_type_to_part_id[i4_part_type][j];
642 e_blk_size = ge_part_id_to_blk_size[e_cu_size][e_part_id];
643
644 x_part = gas_part_attr_in_cu[e_part_id].u1_x_start << shift;
645 y_part = gas_part_attr_in_cu[e_part_id].u1_y_start << shift;
646
647 ref_offset = (x_part + x_pic) + (y_pic + y_part) * ref_stride;
648 inp_offset = (x_part + y_part * inp_stride) + ps_cu_results->i4_inp_offset;
649
650 pred_dir = ps_part_type_result->as_pu_results[j].pu.b2_pred_mode;
651
652 ps_pu_node1 = &(ps_part_type_result->as_pu_results[j]);
653
654 if(PRED_L0 == pred_dir)
655 {
656 i1_ref_idx = ps_pu_node1->pu.mv.i1_l0_ref_idx;
657 aps_mv[0] = &(ps_pu_node1->pu.mv.s_l0_mv);
658
659 num_active_ref_opp =
660 ps_inter_ctb_prms->u1_num_active_ref_l1 * (ps_inter_ctb_prms->i4_bidir_enabled);
661 num_results_per_part = ps_pu_results->u1_num_results_per_part_l0[e_part_id];
662
663 ps_pu_result = ps_pu_results->aps_pu_results[PRED_L0][e_part_id];
664
665 ASSERT(i1_ref_idx >= 0);
666
667 apu1_hpel_ref[0][0] =
668 (UWORD8 *)(ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]->s_yuv_buf_desc.pv_y_buf) +
669 ref_offset;
670 apu1_hpel_ref[0][1] =
671 ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]->apu1_y_sub_pel_planes[0] +
672 ref_offset;
673 apu1_hpel_ref[0][2] =
674 ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]->apu1_y_sub_pel_planes[1] +
675 ref_offset;
676 apu1_hpel_ref[0][3] =
677 ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]->apu1_y_sub_pel_planes[2] +
678 ref_offset;
679
680 luma_weight_ref1 = (WORD32)ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]
681 ->s_weight_offset.i2_luma_weight;
682 luma_offset_ref1 = (WORD32)ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]
683 ->s_weight_offset.i2_luma_offset;
684 }
685 else
686 {
687 i1_ref_idx = ps_pu_node1->pu.mv.i1_l1_ref_idx;
688 aps_mv[0] = &(ps_pu_node1->pu.mv.s_l1_mv);
689
690 ASSERT(i1_ref_idx >= 0);
691
692 num_active_ref_opp =
693 ps_inter_ctb_prms->u1_num_active_ref_l0 * (ps_inter_ctb_prms->i4_bidir_enabled);
694 num_results_per_part = ps_pu_results->u1_num_results_per_part_l1[e_part_id];
695
696 ps_pu_result = ps_pu_results->aps_pu_results[PRED_L1][e_part_id];
697
698 apu1_hpel_ref[0][0] =
699 (UWORD8 *)(ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]->s_yuv_buf_desc.pv_y_buf) +
700 ref_offset;
701 apu1_hpel_ref[0][1] =
702 ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]->apu1_y_sub_pel_planes[0] +
703 ref_offset;
704 apu1_hpel_ref[0][2] =
705 ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]->apu1_y_sub_pel_planes[1] +
706 ref_offset;
707 apu1_hpel_ref[0][3] =
708 ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]->apu1_y_sub_pel_planes[2] +
709 ref_offset;
710
711 luma_weight_ref1 = (WORD32)ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]
712 ->s_weight_offset.i2_luma_weight;
713 luma_offset_ref1 = (WORD32)ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]
714 ->s_weight_offset.i2_luma_offset;
715 }
716
717 if(aps_mv[0]->i2_mvx == INTRA_MV)
718 {
719 uni_cost = ps_pu_node1->i4_tot_cost;
720 cur_iter_best_cost = ps_pu_node1->i4_tot_cost;
721 best_cost = MIN(uni_cost, cur_iter_best_cost);
722 tot_cost += best_cost;
723 continue;
724 }
725
726 ps_interp_prms->i4_blk_wd = wd = gau1_blk_size_to_wd[e_blk_size];
727 ps_interp_prms->i4_blk_ht = ht = gau1_blk_size_to_ht[e_blk_size];
728 ps_interp_prms->i4_out_stride = MAX_CU_SIZE;
729
730 if(num_active_ref_opp)
731 {
732 if(PRED_L0 == pred_dir)
733 {
734 if(ps_pu_results->u1_num_results_per_part_l1[e_part_id])
735 {
736 ps_pu_node2 = ps_pu_results->aps_pu_results[1][e_part_id];
737 pu_node2_found = 1;
738 }
739 }
740 else
741 {
742 if(ps_pu_results->u1_num_results_per_part_l0[e_part_id])
743 {
744 ps_pu_node2 = ps_pu_results->aps_pu_results[0][e_part_id];
745 pu_node2_found = 1;
746 }
747 }
748 }
749
750 if(!pu_node2_found)
751 {
752 bi_cost = INT_MAX >> 1;
753
754 s_interp_prms.apu1_interp_out[0] = as_pred_buf_data[0][j].pu1_pred;
755 ps_interp_prms->ppu1_ref = &apu1_hpel_ref[0][0];
756
757 ps_me_optimised_function_list->pf_qpel_interp_avg_generic(
758 ps_interp_prms, aps_mv[0]->i2_mvx, aps_mv[0]->i2_mvy, 0);
759
760 if(ps_interp_prms->pu1_final_out != s_interp_prms.apu1_interp_out[0])
761 {
762 as_pred_buf_data[0][j].u1_pred_buf_array_id = UCHAR_MAX;
763 as_pred_buf_data[0][j].pu1_pred = ps_interp_prms->pu1_final_out;
764 as_pred_buf_data[0][j].i4_pred_stride = ps_interp_prms->i4_final_out_stride;
765 }
766
767 if(u1_is_cu_noisy && ps_inter_ctb_prms->i4_alpha_stim_multiplier)
768 {
769 hme_compute_sigmaX_and_sigmaXSquared(
770 as_pred_buf_data[0][j].pu1_pred,
771 as_pred_buf_data[0][j].i4_pred_stride,
772 &au8_sigmaX[0][j],
773 &au8_sigmaXSquared[0][j],
774 ps_interp_prms->i4_blk_wd,
775 ps_interp_prms->i4_blk_ht,
776 ps_interp_prms->i4_blk_wd,
777 ps_interp_prms->i4_blk_ht,
778 0,
779 1);
780 }
781 }
782 else
783 {
784 i = 0;
785 bi_cost = MAX_32BIT_VAL;
786 is_best_cand_an_intra = 0;
787 best_cand_in_opp_dir_idx = 0;
788
789 pred_dir = ps_pu_node2[i].pu.b2_pred_mode;
790
791 if(PRED_L0 == pred_dir)
792 {
793 i1_ref_idx = ps_pu_node2[i].pu.mv.i1_l0_ref_idx;
794 aps_mv[1] = &(ps_pu_node2[i].pu.mv.s_l0_mv);
795
796 ASSERT(i1_ref_idx >= 0);
797
798 apu1_hpel_ref[1][0] =
799 (UWORD8 *)(ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]
800 ->s_yuv_buf_desc.pv_y_buf) +
801 ref_offset; //>ppu1_list_rec_fxfy[0][i1_ref_idx] + ref_offset;
802 apu1_hpel_ref[1][1] =
803 ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]->apu1_y_sub_pel_planes[0] +
804 ref_offset;
805 apu1_hpel_ref[1][2] =
806 ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]->apu1_y_sub_pel_planes[1] +
807 ref_offset;
808 apu1_hpel_ref[1][3] =
809 ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]->apu1_y_sub_pel_planes[2] +
810 ref_offset;
811
812 luma_weight_ref2 = (WORD32)ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]
813 ->s_weight_offset.i2_luma_weight;
814 luma_offset_ref2 = (WORD32)ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]
815 ->s_weight_offset.i2_luma_offset;
816 }
817 else
818 {
819 i1_ref_idx = ps_pu_node2[i].pu.mv.i1_l1_ref_idx;
820 aps_mv[1] = &(ps_pu_node2[i].pu.mv.s_l1_mv);
821
822 ASSERT(i1_ref_idx >= 0);
823
824 apu1_hpel_ref[1][0] =
825 (UWORD8 *)(ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]
826 ->s_yuv_buf_desc.pv_y_buf) +
827 ref_offset; //>ppu1_list_rec_fxfy[0][i1_ref_idx] + ref_offset;
828 apu1_hpel_ref[1][1] =
829 ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]->apu1_y_sub_pel_planes[0] +
830 ref_offset;
831 apu1_hpel_ref[1][2] =
832 ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]->apu1_y_sub_pel_planes[1] +
833 ref_offset;
834 apu1_hpel_ref[1][3] =
835 ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]->apu1_y_sub_pel_planes[2] +
836 ref_offset;
837
838 luma_weight_ref2 = (WORD32)ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]
839 ->s_weight_offset.i2_luma_weight;
840 luma_offset_ref2 = (WORD32)ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]
841 ->s_weight_offset.i2_luma_offset;
842 }
843
844 if(aps_mv[1]->i2_mvx == INTRA_MV)
845 {
846 uni_cost = ps_pu_node1->i4_tot_cost;
847 cur_iter_best_cost = ps_pu_node2[i].i4_tot_cost;
848
849 if(cur_iter_best_cost < bi_cost)
850 {
851 bi_cost = cur_iter_best_cost;
852 best_cand_in_opp_dir_idx = i;
853 is_best_cand_an_intra = 1;
854 }
855
856 best_cost = MIN(uni_cost, bi_cost);
857 tot_cost += best_cost;
858 continue;
859 }
860
861 s_interp_prms.apu1_interp_out[0] = as_pred_buf_data[0][j].pu1_pred;
862 ps_interp_prms->ppu1_ref = &apu1_hpel_ref[0][0];
863
864 ps_me_optimised_function_list->pf_qpel_interp_avg_generic(
865 ps_interp_prms, aps_mv[0]->i2_mvx, aps_mv[0]->i2_mvy, 0);
866
867 if(ps_interp_prms->pu1_final_out != s_interp_prms.apu1_interp_out[0])
868 {
869 as_pred_buf_data[0][j].u1_pred_buf_array_id = UCHAR_MAX;
870 as_pred_buf_data[0][j].pu1_pred = ps_interp_prms->pu1_final_out;
871 as_pred_buf_data[0][j].i4_pred_stride = ps_interp_prms->i4_final_out_stride;
872 }
873
874 if(u1_is_cu_noisy && ps_inter_ctb_prms->i4_alpha_stim_multiplier)
875 {
876 hme_compute_sigmaX_and_sigmaXSquared(
877 as_pred_buf_data[0][j].pu1_pred,
878 as_pred_buf_data[0][j].i4_pred_stride,
879 &au8_sigmaX[0][j],
880 &au8_sigmaXSquared[0][j],
881 ps_interp_prms->i4_blk_wd,
882 ps_interp_prms->i4_blk_ht,
883 ps_interp_prms->i4_blk_wd,
884 ps_interp_prms->i4_blk_ht,
885 0,
886 1);
887 }
888
889 s_interp_prms.apu1_interp_out[0] = as_pred_buf_data[1][j].pu1_pred;
890 ps_interp_prms->ppu1_ref = &apu1_hpel_ref[1][0];
891
892 ps_me_optimised_function_list->pf_qpel_interp_avg_generic(
893 ps_interp_prms, aps_mv[1]->i2_mvx, aps_mv[1]->i2_mvy, 0);
894
895 if(ps_interp_prms->pu1_final_out != s_interp_prms.apu1_interp_out[0])
896 {
897 as_pred_buf_data[1][j].u1_pred_buf_array_id = UCHAR_MAX;
898 as_pred_buf_data[1][j].pu1_pred = ps_interp_prms->pu1_final_out;
899 as_pred_buf_data[1][j].i4_pred_stride = ps_interp_prms->i4_final_out_stride;
900 }
901
902 ps_cmn_utils_optimised_function_list->pf_wt_avg_2d(
903 as_pred_buf_data[0][j].pu1_pred,
904 as_pred_buf_data[1][j].pu1_pred,
905 as_pred_buf_data[0][j].i4_pred_stride,
906 as_pred_buf_data[1][j].i4_pred_stride,
907 wd,
908 ht,
909 as_pred_buf_data[2][j].pu1_pred,
910 as_pred_buf_data[2][j].i4_pred_stride,
911 luma_weight_ref1,
912 luma_weight_ref2,
913 luma_offset_ref1,
914 luma_offset_ref2,
915 ps_inter_ctb_prms->wpred_log_wdc);
916
917 if(u1_is_cu_noisy && ps_inter_ctb_prms->i4_alpha_stim_multiplier)
918 {
919 hme_compute_sigmaX_and_sigmaXSquared(
920 as_pred_buf_data[2][j].pu1_pred,
921 as_pred_buf_data[2][j].i4_pred_stride,
922 &au8_sigmaX[1][j],
923 &au8_sigmaXSquared[1][j],
924 ps_interp_prms->i4_blk_wd,
925 ps_interp_prms->i4_blk_ht,
926 ps_interp_prms->i4_blk_wd,
927 ps_interp_prms->i4_blk_ht,
928 0,
929 1);
930 }
931
932 s_err_prms.pu1_inp = (U08 *)ps_inter_ctb_prms->pu1_non_wt_inp + inp_offset;
933 s_err_prms.i4_inp_stride = inp_stride;
934 s_err_prms.i4_ref_stride = as_pred_buf_data[2][j].i4_pred_stride;
935 s_err_prms.i4_part_mask = (ENABLE_2Nx2N);
936 s_err_prms.i4_grid_mask = 1;
937 s_err_prms.pi4_sad_grid = &i4_sad_grid;
938 s_err_prms.i4_blk_wd = wd;
939 s_err_prms.i4_blk_ht = ht;
940 s_err_prms.pu1_ref = as_pred_buf_data[2][j].pu1_pred;
941 s_err_prms.ps_cmn_utils_optimised_function_list = ps_cmn_utils_optimised_function_list;
942
943 if(ps_inter_ctb_prms->u1_use_satd)
944 {
945 pf_err_compute = compute_satd_8bit;
946 }
947 else
948 {
949 pf_err_compute = ps_me_optimised_function_list->pf_evalsad_pt_npu_mxn_8bit;
950 }
951
952 pf_err_compute(&s_err_prms);
953
954 #if USE_NOISE_TERM_DURING_BICAND_SEARCH
955 if(u1_is_cu_noisy && ps_inter_ctb_prms->i4_alpha_stim_multiplier)
956 {
957 unsigned long u4_shift_val;
958 ULWORD64 u8_src_variance, u8_pred_variance, u8_pred_sigmaSquareX;
959 ULWORD64 u8_temp_var, u8_temp_var1;
960 S32 i4_bits_req;
961
962 S32 i4_q_level = STIM_Q_FORMAT + ALPHA_Q_FORMAT;
963
964 u8_pred_sigmaSquareX = (au8_sigmaX[1][j] * au8_sigmaX[1][j]);
965 u8_pred_variance = au8_sigmaXSquared[1][j] - u8_pred_sigmaSquareX;
966
967 if(e_cu_size == CU_8x8)
968 {
969 PART_ID_T e_part_id =
970 (PART_ID_T)((PART_ID_NxN_TL) + (x_off & 1) + ((y_off & 1) << 1));
971
972 u4_shift_val = ihevce_calc_stim_injected_variance(
973 ps_inter_ctb_prms->pu8_part_src_sigmaX,
974 ps_inter_ctb_prms->pu8_part_src_sigmaXSquared,
975 &u8_src_variance,
976 i4_default_src_wt,
977 0,
978 ps_inter_ctb_prms->wpred_log_wdc,
979 e_part_id);
980 }
981 else
982 {
983 u4_shift_val = ihevce_calc_stim_injected_variance(
984 ps_inter_ctb_prms->pu8_part_src_sigmaX,
985 ps_inter_ctb_prms->pu8_part_src_sigmaXSquared,
986 &u8_src_variance,
987 i4_default_src_wt,
988 0,
989 ps_inter_ctb_prms->wpred_log_wdc,
990 e_part_id);
991 }
992
993 u8_pred_variance = u8_pred_variance >> u4_shift_val;
994
995 GETRANGE64(i4_bits_req, u8_pred_variance);
996
997 if(i4_bits_req > 27)
998 {
999 u8_pred_variance = u8_pred_variance >> (i4_bits_req - 27);
1000 u8_src_variance = u8_src_variance >> (i4_bits_req - 27);
1001 }
1002
1003 if(u8_src_variance == u8_pred_variance)
1004 {
1005 u8_temp_var = (1 << STIM_Q_FORMAT);
1006 }
1007 else
1008 {
1009 u8_temp_var = (2 * u8_src_variance * u8_pred_variance);
1010 u8_temp_var = (u8_temp_var * (1 << STIM_Q_FORMAT));
1011 u8_temp_var1 =
1012 (u8_src_variance * u8_src_variance) + (u8_pred_variance * u8_pred_variance);
1013 u8_temp_var = (u8_temp_var + (u8_temp_var1 / 2));
1014 u8_temp_var = (u8_temp_var / u8_temp_var1);
1015 }
1016
1017 i4_noise_term = (UWORD32)u8_temp_var;
1018
1019 i4_noise_term *= ps_inter_ctb_prms->i4_alpha_stim_multiplier;
1020
1021 ASSERT(i4_noise_term >= 0);
1022
1023 u8_temp_var = i4_sad_grid;
1024 u8_temp_var *= ((1 << (i4_q_level)) - (i4_noise_term));
1025 u8_temp_var += (1 << ((i4_q_level)-1));
1026 i4_sad_grid = (UWORD32)(u8_temp_var >> (i4_q_level));
1027 }
1028 #endif
1029
1030 cur_iter_best_cost = i4_sad_grid;
1031 cur_iter_best_cost += ps_pu_node1->i4_mv_cost;
1032 cur_iter_best_cost += ps_pu_node2[i].i4_mv_cost;
1033
1034 if(cur_iter_best_cost < bi_cost)
1035 {
1036 bi_cost = cur_iter_best_cost;
1037 best_cand_in_opp_dir_idx = i;
1038 is_best_cand_an_intra = 0;
1039 }
1040 }
1041
1042 uni_cost = ps_pu_node1->i4_tot_cost;
1043
1044 #if USE_NOISE_TERM_DURING_BICAND_SEARCH
1045 if(u1_is_cu_noisy && ps_inter_ctb_prms->i4_alpha_stim_multiplier)
1046 {
1047 unsigned long u4_shift_val;
1048 ULWORD64 u8_src_variance, u8_pred_variance, u8_pred_sigmaSquareX;
1049 ULWORD64 u8_temp_var, u8_temp_var1;
1050 S32 i4_bits_req;
1051
1052 S32 i4_q_level = STIM_Q_FORMAT + ALPHA_Q_FORMAT;
1053
1054 S08 i1_ref_idx =
1055 (PRED_L0 == ps_pu_node1->pu.b2_pred_mode)
1056 ? ps_inter_ctb_prms->pi1_past_list[ps_pu_node1->pu.mv.i1_l0_ref_idx]
1057 : ps_inter_ctb_prms->pi1_future_list[ps_pu_node1->pu.mv.i1_l1_ref_idx];
1058 S32 i4_sad = ps_pu_node1->i4_tot_cost - ps_pu_node1->i4_mv_cost;
1059
1060 u8_pred_sigmaSquareX = (au8_sigmaX[0][j] * au8_sigmaX[0][j]);
1061 u8_pred_variance = au8_sigmaXSquared[0][j] - u8_pred_sigmaSquareX;
1062
1063 if(e_cu_size == CU_8x8)
1064 {
1065 PART_ID_T e_part_id =
1066 (PART_ID_T)((PART_ID_NxN_TL) + (x_off & 1) + ((y_off & 1) << 1));
1067
1068 u4_shift_val = ihevce_calc_stim_injected_variance(
1069 ps_inter_ctb_prms->pu8_part_src_sigmaX,
1070 ps_inter_ctb_prms->pu8_part_src_sigmaXSquared,
1071 &u8_src_variance,
1072 ps_inter_ctb_prms->pi4_inv_wt[i1_ref_idx],
1073 ps_inter_ctb_prms->pi4_inv_wt_shift_val[i1_ref_idx],
1074 ps_inter_ctb_prms->wpred_log_wdc,
1075 e_part_id);
1076 }
1077 else
1078 {
1079 u4_shift_val = ihevce_calc_stim_injected_variance(
1080 ps_inter_ctb_prms->pu8_part_src_sigmaX,
1081 ps_inter_ctb_prms->pu8_part_src_sigmaXSquared,
1082 &u8_src_variance,
1083 ps_inter_ctb_prms->pi4_inv_wt[i1_ref_idx],
1084 ps_inter_ctb_prms->pi4_inv_wt_shift_val[i1_ref_idx],
1085 ps_inter_ctb_prms->wpred_log_wdc,
1086 e_part_id);
1087 }
1088
1089 u8_pred_variance = u8_pred_variance >> (u4_shift_val);
1090
1091 GETRANGE64(i4_bits_req, u8_pred_variance);
1092
1093 if(i4_bits_req > 27)
1094 {
1095 u8_pred_variance = u8_pred_variance >> (i4_bits_req - 27);
1096 u8_src_variance = u8_src_variance >> (i4_bits_req - 27);
1097 }
1098
1099 if(u8_src_variance == u8_pred_variance)
1100 {
1101 u8_temp_var = (1 << STIM_Q_FORMAT);
1102 }
1103 else
1104 {
1105 u8_temp_var = (2 * u8_src_variance * u8_pred_variance);
1106 u8_temp_var = (u8_temp_var * (1 << STIM_Q_FORMAT));
1107 u8_temp_var1 =
1108 (u8_src_variance * u8_src_variance) + (u8_pred_variance * u8_pred_variance);
1109 u8_temp_var = (u8_temp_var + (u8_temp_var1 / 2));
1110 u8_temp_var = (u8_temp_var / u8_temp_var1);
1111 }
1112
1113 i4_noise_term = (UWORD32)u8_temp_var;
1114
1115 i4_noise_term *= ps_inter_ctb_prms->i4_alpha_stim_multiplier;
1116
1117 ASSERT(i4_noise_term >= 0);
1118
1119 u8_temp_var = i4_sad;
1120 u8_temp_var *= ((1 << (i4_q_level)) - (i4_noise_term));
1121 u8_temp_var += (1 << ((i4_q_level)-1));
1122 i4_sad = (UWORD32)(u8_temp_var >> (i4_q_level));
1123
1124 uni_cost = i4_sad + ps_pu_node1->i4_mv_cost;
1125
1126 pu8_winning_pred_sigmaX[j] = au8_sigmaX[0][j];
1127 pu8_winning_pred_sigmaXSquare[j] = au8_sigmaXSquared[0][j];
1128 }
1129 #endif
1130
1131 if((bi_cost < uni_cost) && (!is_best_cand_an_intra))
1132 {
1133 if(u1_is_cu_noisy && ps_inter_ctb_prms->i4_alpha_stim_multiplier)
1134 {
1135 pu8_winning_pred_sigmaX[j] = au8_sigmaX[1][j];
1136 pu8_winning_pred_sigmaXSquare[j] = au8_sigmaXSquared[1][j];
1137 }
1138
1139 if(PRED_L0 == ps_pu_node1->pu.b2_pred_mode)
1140 {
1141 ps_pu_node1->pu.b2_pred_mode = PRED_BI;
1142
1143 if(PRED_L0 == ps_pu_node2[best_cand_in_opp_dir_idx].pu.b2_pred_mode)
1144 {
1145 ps_pu_node1->pu.mv.i1_l1_ref_idx =
1146 ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.i1_l0_ref_idx;
1147 ps_pu_node1->pu.mv.s_l1_mv.i2_mvx =
1148 ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.s_l0_mv.i2_mvx;
1149 ps_pu_node1->pu.mv.s_l1_mv.i2_mvy =
1150 ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.s_l0_mv.i2_mvy;
1151 }
1152 else
1153 {
1154 ps_pu_node1->pu.mv.i1_l1_ref_idx =
1155 ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.i1_l1_ref_idx;
1156 ps_pu_node1->pu.mv.s_l1_mv.i2_mvx =
1157 ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.s_l1_mv.i2_mvx;
1158 ps_pu_node1->pu.mv.s_l1_mv.i2_mvy =
1159 ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.s_l1_mv.i2_mvy;
1160 }
1161 }
1162 else
1163 {
1164 ps_pu_node1->pu.b2_pred_mode = PRED_BI;
1165
1166 if(PRED_L0 == ps_pu_node2[best_cand_in_opp_dir_idx].pu.b2_pred_mode)
1167 {
1168 ps_pu_node1->pu.mv.i1_l0_ref_idx =
1169 ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.i1_l0_ref_idx;
1170 ps_pu_node1->pu.mv.s_l0_mv.i2_mvx =
1171 ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.s_l0_mv.i2_mvx;
1172 ps_pu_node1->pu.mv.s_l0_mv.i2_mvy =
1173 ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.s_l0_mv.i2_mvy;
1174 }
1175 else
1176 {
1177 ps_pu_node1->pu.mv.i1_l0_ref_idx =
1178 ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.i1_l1_ref_idx;
1179 ps_pu_node1->pu.mv.s_l0_mv.i2_mvx =
1180 ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.s_l1_mv.i2_mvx;
1181 ps_pu_node1->pu.mv.s_l0_mv.i2_mvy =
1182 ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.s_l1_mv.i2_mvy;
1183 }
1184 }
1185
1186 ps_part_type_result->as_pu_results[j].i4_tot_cost = bi_cost;
1187 }
1188
1189 best_cost = MIN(uni_cost, bi_cost);
1190 tot_cost += best_cost;
1191 }
1192
1193 hme_debrief_bipred_eval(
1194 ps_part_type_result,
1195 as_pred_buf_data,
1196 &ps_inter_ctb_prms->s_pred_buf_mngr,
1197 au1_pred_buf_array_indixes,
1198 ps_cmn_utils_optimised_function_list);
1199
1200 ps_part_type_result->i4_tot_cost = tot_cost;
1201 }
1202
hme_evalsatd_pt_pu_8x8_tu_rec(err_prms_t * ps_prms,WORD32 lambda,WORD32 lambda_q_shift,WORD32 i4_frm_qstep,me_func_selector_t * ps_func_selector)1203 WORD32 hme_evalsatd_pt_pu_8x8_tu_rec(
1204 err_prms_t *ps_prms,
1205 WORD32 lambda,
1206 WORD32 lambda_q_shift,
1207 WORD32 i4_frm_qstep,
1208 me_func_selector_t *ps_func_selector)
1209 {
1210 S32 ai4_satd_4x4[4]; /* num 4x4s in a 8x8 */
1211 S32 i4_satd_8x8;
1212 S16 *pi2_had_out;
1213 S32 i4_tu_split_flag = 0;
1214 S32 i4_tu_early_cbf = 0;
1215
1216 S32 i4_early_cbf = 1;
1217 // S32 i4_i, i4_k;
1218 S32 i4_total_satd_cost = 0;
1219 S32 best_cost_tu_split;
1220
1221 /* Initialize array of ptrs to hold partial SATDs at all levels of 16x16 */
1222 S32 *api4_satd_pu[HAD_32x32 + 1];
1223 S32 *api4_tu_split[HAD_32x32 + 1];
1224 S32 *api4_tu_early_cbf[HAD_32x32 + 1];
1225
1226 S32 *pi4_sad_grid = ps_prms->pi4_sad_grid;
1227 S32 *pi4_tu_split = ps_prms->pi4_tu_split_flags;
1228 S32 *pi4_early_cbf = ps_prms->pi4_tu_early_cbf;
1229
1230 U08 *pu1_inp = ps_prms->pu1_inp;
1231 U08 *pu1_ref = ps_prms->pu1_ref;
1232
1233 S32 inp_stride = ps_prms->i4_inp_stride;
1234 S32 ref_stride = ps_prms->i4_ref_stride;
1235
1236 /* Initialize tu_split_cost to "0" */
1237 ps_prms->i4_tu_split_cost = 0;
1238 pi2_had_out = (S16 *)ps_prms->pu1_wkg_mem;
1239
1240 api4_satd_pu[HAD_4x4] = &ai4_satd_4x4[0];
1241 api4_satd_pu[HAD_8x8] = &i4_satd_8x8;
1242 api4_satd_pu[HAD_16x16] = NULL;
1243 api4_satd_pu[HAD_32x32] = NULL; /* 32x32 not used for 16x16 subpel refine */
1244
1245 api4_tu_split[HAD_4x4] = NULL;
1246 api4_tu_split[HAD_8x8] = &i4_tu_split_flag;
1247 api4_tu_split[HAD_16x16] = NULL;
1248 api4_tu_split[HAD_32x32] = NULL; /* 32x32 not used for 16x16 subpel refine */
1249
1250 api4_tu_early_cbf[HAD_4x4] = NULL;
1251 api4_tu_early_cbf[HAD_8x8] = &i4_tu_early_cbf;
1252 api4_tu_early_cbf[HAD_16x16] = NULL;
1253 api4_tu_early_cbf[HAD_32x32] = NULL; /* 32x32 not used for 16x16 subpel refine */
1254
1255 /* Call recursive 16x16 HAD module; updates satds for 4x4, 8x8 and 16x16 */
1256
1257 /* Return value is merge of both best_stad_cost and tu_split_flags */
1258 best_cost_tu_split = ps_func_selector->pf_had_8x8_using_4_4x4_r(
1259 pu1_inp,
1260 inp_stride,
1261 pu1_ref,
1262 ref_stride,
1263 pi2_had_out,
1264 8,
1265 api4_satd_pu,
1266 api4_tu_split,
1267 api4_tu_early_cbf,
1268 0,
1269 2,
1270 0,
1271 0,
1272 i4_frm_qstep,
1273 0,
1274 ps_prms->u1_max_tr_depth,
1275 ps_prms->u1_max_tr_size,
1276 &(ps_prms->i4_tu_split_cost),
1277 NULL);
1278
1279 /* For SATD computation following TU size are assumed for a 8x8 CU */
1280 /* 8 for 2Nx2N, 4 for Nx2N,2NxN */
1281
1282 i4_total_satd_cost = best_cost_tu_split >> 2;
1283
1284 /* Second last bit has the tu pslit flag */
1285 i4_tu_split_flag = (best_cost_tu_split & 0x3) >> 1;
1286
1287 /* Last bit corrsponds to the Early CBF flag */
1288 i4_early_cbf = (best_cost_tu_split & 0x1);
1289
1290 /* Update 8x8 SATDs */
1291 pi4_sad_grid[PART_ID_2Nx2N] = i4_satd_8x8;
1292 pi4_tu_split[PART_ID_2Nx2N] = i4_tu_split_flag;
1293 pi4_early_cbf[PART_ID_2Nx2N] = i4_early_cbf;
1294
1295 return i4_total_satd_cost;
1296 }
1297 //#endif
1298 /**
1299 ********************************************************************************
1300 * @fn S32 hme_evalsatd_update_1_best_result_pt_pu_16x16
1301 *
1302 * @brief Evaluates the SATD with partial updates for all the best partitions
1303 * of a 16x16 CU based on recursive Hadamard 16x16, 8x8 and 4x4 satds
1304 *
1305 * @param[inout] ps_prms: error prms containg current and ref ptr, strides,
1306 * pointer to sad grid of each partitions
1307 *
1308 * @return None
1309 ********************************************************************************
1310 */
1311
hme_evalsatd_update_2_best_results_pt_pu_16x16(err_prms_t * ps_prms,result_upd_prms_t * ps_result_prms)1312 void hme_evalsatd_update_2_best_results_pt_pu_16x16(
1313 err_prms_t *ps_prms, result_upd_prms_t *ps_result_prms)
1314 {
1315 S32 ai4_satd_4x4[16]; /* num 4x4s in a 16x16 */
1316 S32 ai4_satd_8x8[4]; /* num 8x8s in a 16x16 */
1317 S32 i4_satd_16x16; /* 16x16 satd cost */
1318 S32 i;
1319 S16 ai2_8x8_had[256];
1320 S16 *pi2_y0;
1321 U08 *pu1_src, *pu1_pred;
1322 S32 pos_x_y_4x4_0, pos_x_y_4x4 = 0;
1323 S32 *ppi4_hsad;
1324
1325 /* Initialize array of ptrs to hold partial SATDs at all levels of 16x16 */
1326 S32 *api4_satd_pu[HAD_32x32 + 1];
1327 S32 *pi4_sad_grid = ps_prms->pi4_sad_grid;
1328
1329 U08 *pu1_inp = ps_prms->pu1_inp;
1330 U08 *pu1_ref = ps_prms->pu1_ref;
1331
1332 S32 inp_stride = ps_prms->i4_inp_stride;
1333 S32 ref_stride = ps_prms->i4_ref_stride;
1334
1335 api4_satd_pu[HAD_4x4] = &ai4_satd_4x4[0];
1336 api4_satd_pu[HAD_8x8] = &ai4_satd_8x8[0];
1337 api4_satd_pu[HAD_16x16] = &i4_satd_16x16;
1338 api4_satd_pu[HAD_32x32] = NULL; /* 32x32 not used for 16x16 subpel refine */
1339
1340 ppi4_hsad = api4_satd_pu[HAD_16x16];
1341
1342 /* Call recursive 16x16 HAD module; updates satds for 4x4, 8x8 and 16x16 */
1343 for(i = 0; i < 4; i++)
1344 {
1345 pu1_src = pu1_inp + (i & 0x01) * 8 + (i >> 1) * inp_stride * 8;
1346 pu1_pred = pu1_ref + (i & 0x01) * 8 + (i >> 1) * ref_stride * 8;
1347 pi2_y0 = ai2_8x8_had + (i & 0x01) * 8 + (i >> 1) * 16 * 8;
1348 pos_x_y_4x4_0 = pos_x_y_4x4 + (i & 0x01) * 2 + (i >> 1) * (2 << 16);
1349
1350 ihevce_had_8x8_using_4_4x4(
1351 pu1_src, inp_stride, pu1_pred, ref_stride, pi2_y0, 16, api4_satd_pu, pos_x_y_4x4_0, 4);
1352 }
1353
1354 /* For SATD computation following TU size are assumed for a 16x16 CU */
1355 /* 16 for 2Nx2N, 8 for NxN/Nx2N,2NxN and mix of 4 and 8 for AMPs */
1356
1357 /* Update 8x8 SATDs */
1358 /* Modified to cost calculation using only 4x4 SATD */
1359
1360 // ai4_satd_8x8[0] = ai4_satd_4x4[0] + ai4_satd_4x4[1] + ai4_satd_4x4[4] + ai4_satd_4x4[5];
1361 // ai4_satd_8x8[1] = ai4_satd_4x4[2] + ai4_satd_4x4[3] + ai4_satd_4x4[6] + ai4_satd_4x4[7];
1362 // ai4_satd_8x8[2] = ai4_satd_4x4[8] + ai4_satd_4x4[9] + ai4_satd_4x4[12] + ai4_satd_4x4[13];
1363 // ai4_satd_8x8[3] = ai4_satd_4x4[10] + ai4_satd_4x4[11] + ai4_satd_4x4[14] + ai4_satd_4x4[15];
1364
1365 /* Update 16x16 SATDs */
1366 pi4_sad_grid[PART_ID_2Nx2N] =
1367 ai4_satd_8x8[0] + ai4_satd_8x8[1] + ai4_satd_8x8[2] + ai4_satd_8x8[3];
1368
1369 pi4_sad_grid[PART_ID_NxN_TL] = ai4_satd_8x8[0];
1370 pi4_sad_grid[PART_ID_NxN_TR] = ai4_satd_8x8[1];
1371 pi4_sad_grid[PART_ID_NxN_BL] = ai4_satd_8x8[2];
1372 pi4_sad_grid[PART_ID_NxN_BR] = ai4_satd_8x8[3];
1373
1374 /* Update 8x16 / 16x8 SATDs */
1375 pi4_sad_grid[PART_ID_Nx2N_L] = ai4_satd_8x8[0] + ai4_satd_8x8[2];
1376 pi4_sad_grid[PART_ID_Nx2N_R] = ai4_satd_8x8[1] + ai4_satd_8x8[3];
1377 pi4_sad_grid[PART_ID_2NxN_T] = ai4_satd_8x8[0] + ai4_satd_8x8[1];
1378 pi4_sad_grid[PART_ID_2NxN_B] = ai4_satd_8x8[2] + ai4_satd_8x8[3];
1379
1380 /* Update AMP SATDs 16x12,16x4, 12x16,4x16 */
1381 pi4_sad_grid[PART_ID_nLx2N_L] =
1382 ai4_satd_4x4[0] + ai4_satd_4x4[4] + ai4_satd_4x4[8] + ai4_satd_4x4[12];
1383
1384 pi4_sad_grid[PART_ID_nLx2N_R] = ai4_satd_4x4[1] + ai4_satd_4x4[5] + ai4_satd_4x4[9] +
1385 ai4_satd_4x4[13] + pi4_sad_grid[PART_ID_Nx2N_R];
1386
1387 pi4_sad_grid[PART_ID_nRx2N_L] = ai4_satd_4x4[2] + ai4_satd_4x4[6] + ai4_satd_4x4[10] +
1388 ai4_satd_4x4[14] + pi4_sad_grid[PART_ID_Nx2N_L];
1389
1390 pi4_sad_grid[PART_ID_nRx2N_R] =
1391 ai4_satd_4x4[3] + ai4_satd_4x4[7] + ai4_satd_4x4[11] + ai4_satd_4x4[15];
1392
1393 pi4_sad_grid[PART_ID_2NxnU_T] =
1394 ai4_satd_4x4[0] + ai4_satd_4x4[1] + ai4_satd_4x4[2] + ai4_satd_4x4[3];
1395
1396 pi4_sad_grid[PART_ID_2NxnU_B] = ai4_satd_4x4[4] + ai4_satd_4x4[5] + ai4_satd_4x4[6] +
1397 ai4_satd_4x4[7] + pi4_sad_grid[PART_ID_2NxN_B];
1398
1399 pi4_sad_grid[PART_ID_2NxnD_T] = ai4_satd_4x4[8] + ai4_satd_4x4[9] + ai4_satd_4x4[10] +
1400 ai4_satd_4x4[11] + pi4_sad_grid[PART_ID_2NxN_T];
1401
1402 pi4_sad_grid[PART_ID_2NxnD_B] =
1403 ai4_satd_4x4[12] + ai4_satd_4x4[13] + ai4_satd_4x4[14] + ai4_satd_4x4[15];
1404
1405 /* Call the update results function */
1406 {
1407 S32 i4_count = 0, i4_sad, i4_mv_cost, i4_tot_cost;
1408 mv_refine_ctxt_t *ps_subpel_refine_ctxt = ps_result_prms->ps_subpel_refine_ctxt;
1409 S32 *pi4_valid_part_ids = &ps_subpel_refine_ctxt->ai4_part_id[0];
1410 S32 best_node_cost;
1411 S32 second_best_node_cost;
1412
1413 /*For each valid partition, update the refine_prm structure to reflect the best and second
1414 best candidates for that partition*/
1415
1416 for(i4_count = 0; i4_count < ps_subpel_refine_ctxt->i4_num_valid_parts; i4_count++)
1417 {
1418 S32 update_required = 0;
1419 S32 part_id = pi4_valid_part_ids[i4_count];
1420 S32 index = (ps_subpel_refine_ctxt->i4_num_valid_parts > 8) ? part_id : i4_count;
1421
1422 /* Use a pre-computed cost instead of freshly evaluating subpel cost */
1423 i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[0][index];
1424
1425 /*Calculate total cost*/
1426 i4_sad = CLIP3(pi4_sad_grid[part_id], 0, 0x7fff);
1427 i4_tot_cost = CLIP_S16(i4_sad + i4_mv_cost);
1428
1429 /*****************************************************************/
1430 /* We do not labor through the results if the total cost worse */
1431 /* than the last of the results. */
1432 /*****************************************************************/
1433 best_node_cost = CLIP_S16(ps_subpel_refine_ctxt->i2_tot_cost[0][index]);
1434 second_best_node_cost = CLIP_S16(ps_subpel_refine_ctxt->i2_tot_cost[1][index]);
1435
1436 if(i4_tot_cost < second_best_node_cost)
1437 {
1438 update_required = 2;
1439
1440 /*************************************************************/
1441 /* Identify where the current result isto be placed.Basically*/
1442 /* find the node which has cost just higher thannodeundertest*/
1443 /*************************************************************/
1444 if(i4_tot_cost < best_node_cost)
1445 {
1446 update_required = 1;
1447 }
1448 else if(i4_tot_cost == ps_subpel_refine_ctxt->i2_tot_cost[0][index])
1449 {
1450 update_required = 0;
1451 }
1452 if(update_required == 2)
1453 {
1454 ps_subpel_refine_ctxt->i2_tot_cost[1][index] = i4_tot_cost;
1455 ps_subpel_refine_ctxt->i2_mv_cost[1][index] = i4_mv_cost;
1456 ps_subpel_refine_ctxt->i2_mv_x[1][index] = ps_result_prms->i2_mv_x;
1457 ps_subpel_refine_ctxt->i2_mv_y[1][index] = ps_result_prms->i2_mv_y;
1458 ps_subpel_refine_ctxt->i2_ref_idx[1][index] = ps_result_prms->i1_ref_idx;
1459 }
1460 else if(update_required == 1)
1461 {
1462 ps_subpel_refine_ctxt->i2_tot_cost[1][index] =
1463 ps_subpel_refine_ctxt->i2_tot_cost[0][index];
1464 ps_subpel_refine_ctxt->i2_mv_cost[1][index] =
1465 ps_subpel_refine_ctxt->i2_mv_cost[0][index];
1466 ps_subpel_refine_ctxt->i2_mv_x[1][index] =
1467 ps_subpel_refine_ctxt->i2_mv_x[0][index];
1468 ps_subpel_refine_ctxt->i2_mv_y[1][index] =
1469 ps_subpel_refine_ctxt->i2_mv_y[0][index];
1470 ps_subpel_refine_ctxt->i2_ref_idx[1][index] =
1471 ps_subpel_refine_ctxt->i2_ref_idx[0][index];
1472
1473 ps_subpel_refine_ctxt->i2_tot_cost[0][index] = i4_tot_cost;
1474 ps_subpel_refine_ctxt->i2_mv_cost[0][index] = i4_mv_cost;
1475 ps_subpel_refine_ctxt->i2_mv_x[0][index] = ps_result_prms->i2_mv_x;
1476 ps_subpel_refine_ctxt->i2_mv_y[0][index] = ps_result_prms->i2_mv_y;
1477 ps_subpel_refine_ctxt->i2_ref_idx[0][index] = ps_result_prms->i1_ref_idx;
1478 }
1479 }
1480 }
1481 }
1482 }
1483
1484 //#if COMPUTE_16x16_R == C
hme_evalsatd_update_1_best_result_pt_pu_16x16(err_prms_t * ps_prms,result_upd_prms_t * ps_result_prms)1485 void hme_evalsatd_update_1_best_result_pt_pu_16x16(
1486 err_prms_t *ps_prms, result_upd_prms_t *ps_result_prms)
1487 {
1488 S32 ai4_satd_4x4[16]; /* num 4x4s in a 16x16 */
1489 S32 ai4_satd_8x8[4]; /* num 8x8s in a 16x16 */
1490 S32 i4_satd_16x16; /* 16x16 satd cost */
1491 S32 i;
1492 S16 ai2_8x8_had[256];
1493 S16 *pi2_y0;
1494 U08 *pu1_src, *pu1_pred;
1495 S32 pos_x_y_4x4_0, pos_x_y_4x4 = 0;
1496 S32 *ppi4_hsad;
1497
1498 /* Initialize array of ptrs to hold partial SATDs at all levels of 16x16 */
1499 S32 *api4_satd_pu[HAD_32x32 + 1];
1500 S32 *pi4_sad_grid = ps_prms->pi4_sad_grid;
1501
1502 U08 *pu1_inp = ps_prms->pu1_inp;
1503 U08 *pu1_ref = ps_prms->pu1_ref;
1504
1505 S32 inp_stride = ps_prms->i4_inp_stride;
1506 S32 ref_stride = ps_prms->i4_ref_stride;
1507
1508 api4_satd_pu[HAD_4x4] = &ai4_satd_4x4[0];
1509 api4_satd_pu[HAD_8x8] = &ai4_satd_8x8[0];
1510 api4_satd_pu[HAD_16x16] = &i4_satd_16x16;
1511 api4_satd_pu[HAD_32x32] = NULL; /* 32x32 not used for 16x16 subpel refine */
1512
1513 ppi4_hsad = api4_satd_pu[HAD_16x16];
1514
1515 /* Call recursive 16x16 HAD module; updates satds for 4x4, 8x8 and 16x16 */
1516 for(i = 0; i < 4; i++)
1517 {
1518 pu1_src = pu1_inp + (i & 0x01) * 8 + (i >> 1) * inp_stride * 8;
1519 pu1_pred = pu1_ref + (i & 0x01) * 8 + (i >> 1) * ref_stride * 8;
1520 pi2_y0 = ai2_8x8_had + (i & 0x01) * 8 + (i >> 1) * 16 * 8;
1521 pos_x_y_4x4_0 = pos_x_y_4x4 + (i & 0x01) * 2 + (i >> 1) * (2 << 16);
1522
1523 ihevce_had_8x8_using_4_4x4(
1524 pu1_src, inp_stride, pu1_pred, ref_stride, pi2_y0, 16, api4_satd_pu, pos_x_y_4x4_0, 4);
1525 }
1526
1527 /* For SATD computation following TU size are assumed for a 16x16 CU */
1528 /* 16 for 2Nx2N, 8 for NxN/Nx2N,2NxN and mix of 4 and 8 for AMPs */
1529
1530 /* Update 8x8 SATDs */
1531 /* Modified to cost calculation using only 4x4 SATD */
1532
1533 // ai4_satd_8x8[0] = ai4_satd_4x4[0] + ai4_satd_4x4[1] + ai4_satd_4x4[4] + ai4_satd_4x4[5];
1534 // ai4_satd_8x8[1] = ai4_satd_4x4[2] + ai4_satd_4x4[3] + ai4_satd_4x4[6] + ai4_satd_4x4[7];
1535 // ai4_satd_8x8[2] = ai4_satd_4x4[8] + ai4_satd_4x4[9] + ai4_satd_4x4[12] + ai4_satd_4x4[13];
1536 // ai4_satd_8x8[3] = ai4_satd_4x4[10] + ai4_satd_4x4[11] + ai4_satd_4x4[14] + ai4_satd_4x4[15];
1537
1538 /* Update 16x16 SATDs */
1539 pi4_sad_grid[PART_ID_2Nx2N] =
1540 ai4_satd_8x8[0] + ai4_satd_8x8[1] + ai4_satd_8x8[2] + ai4_satd_8x8[3];
1541
1542 pi4_sad_grid[PART_ID_NxN_TL] = ai4_satd_8x8[0];
1543 pi4_sad_grid[PART_ID_NxN_TR] = ai4_satd_8x8[1];
1544 pi4_sad_grid[PART_ID_NxN_BL] = ai4_satd_8x8[2];
1545 pi4_sad_grid[PART_ID_NxN_BR] = ai4_satd_8x8[3];
1546
1547 /* Update 8x16 / 16x8 SATDs */
1548 pi4_sad_grid[PART_ID_Nx2N_L] = ai4_satd_8x8[0] + ai4_satd_8x8[2];
1549 pi4_sad_grid[PART_ID_Nx2N_R] = ai4_satd_8x8[1] + ai4_satd_8x8[3];
1550 pi4_sad_grid[PART_ID_2NxN_T] = ai4_satd_8x8[0] + ai4_satd_8x8[1];
1551 pi4_sad_grid[PART_ID_2NxN_B] = ai4_satd_8x8[2] + ai4_satd_8x8[3];
1552
1553 /* Update AMP SATDs 16x12,16x4, 12x16,4x16 */
1554 pi4_sad_grid[PART_ID_nLx2N_L] =
1555 ai4_satd_4x4[0] + ai4_satd_4x4[2] + ai4_satd_4x4[8] + ai4_satd_4x4[10];
1556 pi4_sad_grid[PART_ID_nRx2N_R] =
1557 ai4_satd_4x4[5] + ai4_satd_4x4[7] + ai4_satd_4x4[13] + ai4_satd_4x4[15];
1558 pi4_sad_grid[PART_ID_2NxnU_T] =
1559 ai4_satd_4x4[0] + ai4_satd_4x4[1] + ai4_satd_4x4[4] + ai4_satd_4x4[5];
1560 pi4_sad_grid[PART_ID_2NxnD_B] =
1561 ai4_satd_4x4[10] + ai4_satd_4x4[11] + ai4_satd_4x4[14] + ai4_satd_4x4[15];
1562
1563 pi4_sad_grid[PART_ID_nLx2N_R] = pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nLx2N_L];
1564 pi4_sad_grid[PART_ID_nRx2N_L] = pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nRx2N_R];
1565 pi4_sad_grid[PART_ID_2NxnU_B] = pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnU_T];
1566 pi4_sad_grid[PART_ID_2NxnD_T] = pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnD_B];
1567
1568 /* Call the update results function */
1569 {
1570 S32 i4_count = 0, i4_sad, i4_mv_cost, i4_tot_cost;
1571 mv_refine_ctxt_t *ps_subpel_refine_ctxt = ps_result_prms->ps_subpel_refine_ctxt;
1572 S32 *pi4_valid_part_ids = &ps_subpel_refine_ctxt->ai4_part_id[0];
1573 S32 best_node_cost;
1574 S32 second_best_node_cost;
1575
1576 /*For each valid partition, update the refine_prm structure to reflect the best and second
1577 best candidates for that partition*/
1578
1579 for(i4_count = 0; i4_count < ps_subpel_refine_ctxt->i4_num_valid_parts; i4_count++)
1580 {
1581 S32 update_required = 0;
1582 S32 part_id = pi4_valid_part_ids[i4_count];
1583 S32 index = (ps_subpel_refine_ctxt->i4_num_valid_parts > 8) ? part_id : i4_count;
1584
1585 /* Use a pre-computed cost instead of freshly evaluating subpel cost */
1586 i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[0][index];
1587
1588 /*Calculate total cost*/
1589 i4_sad = CLIP3(pi4_sad_grid[part_id], 0, 0x7fff);
1590 i4_tot_cost = CLIP_S16(i4_sad + i4_mv_cost);
1591
1592 /*****************************************************************/
1593 /* We do not labor through the results if the total cost worse */
1594 /* than the last of the results. */
1595 /*****************************************************************/
1596 best_node_cost = CLIP_S16(ps_subpel_refine_ctxt->i2_tot_cost[0][index]);
1597 second_best_node_cost = SHRT_MAX;
1598
1599 if(i4_tot_cost < second_best_node_cost)
1600 {
1601 update_required = 0;
1602
1603 /*************************************************************/
1604 /* Identify where the current result isto be placed.Basically*/
1605 /* find the node which has cost just higher thannodeundertest*/
1606 /*************************************************************/
1607 if(i4_tot_cost < best_node_cost)
1608 {
1609 update_required = 1;
1610 }
1611 else if(i4_tot_cost == ps_subpel_refine_ctxt->i2_tot_cost[0][index])
1612 {
1613 update_required = 0;
1614 }
1615 if(update_required == 2)
1616 {
1617 ps_subpel_refine_ctxt->i2_tot_cost[1][index] = i4_tot_cost;
1618 ps_subpel_refine_ctxt->i2_mv_cost[1][index] = i4_mv_cost;
1619 ps_subpel_refine_ctxt->i2_mv_x[1][index] = ps_result_prms->i2_mv_x;
1620 ps_subpel_refine_ctxt->i2_mv_y[1][index] = ps_result_prms->i2_mv_y;
1621 ps_subpel_refine_ctxt->i2_ref_idx[1][index] = ps_result_prms->i1_ref_idx;
1622 }
1623 else if(update_required == 1)
1624 {
1625 ps_subpel_refine_ctxt->i2_tot_cost[0][index] = i4_tot_cost;
1626 ps_subpel_refine_ctxt->i2_mv_cost[0][index] = i4_mv_cost;
1627 ps_subpel_refine_ctxt->i2_mv_x[0][index] = ps_result_prms->i2_mv_x;
1628 ps_subpel_refine_ctxt->i2_mv_y[0][index] = ps_result_prms->i2_mv_y;
1629 ps_subpel_refine_ctxt->i2_ref_idx[0][index] = ps_result_prms->i1_ref_idx;
1630 }
1631 }
1632 }
1633 }
1634 }
1635
hme_evalsatd_pt_pu_16x16_tu_rec(err_prms_t * ps_prms,WORD32 lambda,WORD32 lambda_q_shift,WORD32 i4_frm_qstep,me_func_selector_t * ps_func_selector)1636 WORD32 hme_evalsatd_pt_pu_16x16_tu_rec(
1637 err_prms_t *ps_prms,
1638 WORD32 lambda,
1639 WORD32 lambda_q_shift,
1640 WORD32 i4_frm_qstep,
1641 me_func_selector_t *ps_func_selector)
1642 {
1643 S32 ai4_satd_4x4[16]; /* num 4x4s in a 16x16 */
1644 S32 ai4_satd_8x8[4]; /* num 8x8s in a 16x16 */
1645 S32 ai4_tu_split_8x8[16];
1646 S32 i4_satd_16x16; /* 16x16 satd cost */
1647
1648 S32 ai4_tu_early_cbf_8x8[16];
1649
1650 //S16 ai2_had_out[256];
1651 S16 *pi2_had_out;
1652 S32 tu_split_flag = 0;
1653 S32 early_cbf_flag = 0;
1654 S32 total_satd_cost = 0;
1655
1656 /* Initialize array of ptrs to hold partial SATDs at all levels of 16x16 */
1657 S32 *api4_satd_pu[HAD_32x32 + 1];
1658 S32 *api4_tu_split[HAD_32x32 + 1];
1659 S32 *api4_tu_early_cbf[HAD_32x32 + 1];
1660
1661 U08 *pu1_inp = ps_prms->pu1_inp;
1662 U08 *pu1_ref = ps_prms->pu1_ref;
1663
1664 S32 inp_stride = ps_prms->i4_inp_stride;
1665 S32 ref_stride = ps_prms->i4_ref_stride;
1666
1667 /* Initialize tu_split_cost to "0" */
1668 ps_prms->i4_tu_split_cost = 0;
1669
1670 pi2_had_out = (S16 *)ps_prms->pu1_wkg_mem;
1671
1672 api4_satd_pu[HAD_4x4] = &ai4_satd_4x4[0];
1673 api4_satd_pu[HAD_8x8] = &ai4_satd_8x8[0];
1674 api4_satd_pu[HAD_16x16] = &i4_satd_16x16;
1675 api4_satd_pu[HAD_32x32] = NULL; /* 32x32 not used for 16x16 subpel refine */
1676
1677 api4_tu_split[HAD_4x4] = NULL;
1678 api4_tu_split[HAD_8x8] = &ai4_tu_split_8x8[0];
1679 api4_tu_split[HAD_16x16] = &tu_split_flag;
1680 api4_tu_split[HAD_32x32] = NULL; /* 32x32 not used for 16x16 subpel refine */
1681
1682 api4_tu_early_cbf[HAD_4x4] = NULL;
1683 api4_tu_early_cbf[HAD_8x8] = &ai4_tu_early_cbf_8x8[0];
1684 api4_tu_early_cbf[HAD_16x16] = &early_cbf_flag;
1685 api4_tu_early_cbf[HAD_32x32] = NULL; /* 32x32 not used for 16x16 subpel refine */
1686
1687 /* Call recursive 16x16 HAD module; updates satds for 4x4, 8x8 and 16x16 */
1688 ps_func_selector->pf_had_16x16_r(
1689 pu1_inp,
1690 inp_stride,
1691 pu1_ref,
1692 ref_stride,
1693 pi2_had_out,
1694 16,
1695 api4_satd_pu,
1696 api4_tu_split,
1697 api4_tu_early_cbf,
1698 0,
1699 4,
1700 lambda,
1701 lambda_q_shift,
1702 i4_frm_qstep,
1703 0,
1704 ps_prms->u1_max_tr_depth,
1705 ps_prms->u1_max_tr_size,
1706 &(ps_prms->i4_tu_split_cost),
1707 NULL);
1708
1709 total_satd_cost = i4_satd_16x16;
1710
1711 ps_prms->pi4_tu_split_flags[0] = tu_split_flag;
1712
1713 ps_prms->pi4_tu_early_cbf[0] = early_cbf_flag;
1714
1715 return total_satd_cost;
1716 }
1717
1718 /**
1719 ********************************************************************************
1720 * @fn S32 hme_evalsatd_pt_pu_32x32
1721 *
1722 * @brief Evaluates the SATD with partial updates for all the best partitions
1723 * of a 32x32 CU based on recursive Hadamard 16x16, 8x8 and 4x4 satds
1724 *
1725 * @param[inout] ps_prms: error prms containg current and ref ptr, strides,
1726 * pointer to sad grid of each partitions
1727 *
1728 * @return None
1729 ********************************************************************************
1730 */
hme_evalsatd_pt_pu_32x32(err_prms_t * ps_prms)1731 void hme_evalsatd_pt_pu_32x32(err_prms_t *ps_prms)
1732 {
1733 //S32 ai4_satd_4x4[64]; /* num 4x4s in a 32x32 */
1734 S32 ai4_satd_8x8[16]; /* num 8x8s in a 32x32 */
1735 S32 ai4_satd_16x16[4]; /* num 16x16 in a 32x32 */
1736 S32 i4_satd_32x32;
1737 // S16 ai2_had_out[32*32];
1738 U08 *pu1_src;
1739 U08 *pu1_pred;
1740 S32 i;
1741
1742 /* Initialize array of ptrs to hold partial SATDs at all levels of 16x16 */
1743 S32 *api4_satd_pu[HAD_32x32 + 1];
1744 S32 *pi4_sad_grid = ps_prms->pi4_sad_grid;
1745
1746 U08 *pu1_inp = ps_prms->pu1_inp;
1747 U08 *pu1_ref = ps_prms->pu1_ref;
1748
1749 S32 inp_stride = ps_prms->i4_inp_stride;
1750 S32 ref_stride = ps_prms->i4_ref_stride;
1751
1752 //api4_satd_pu[HAD_4x4] = &ai4_satd_4x4[0];
1753 api4_satd_pu[HAD_8x8] = &ai4_satd_8x8[0];
1754 api4_satd_pu[HAD_16x16] = &ai4_satd_16x16[0];
1755 api4_satd_pu[HAD_32x32] = &i4_satd_32x32;
1756
1757 /* 32x32 SATD is calculates as the sum of the 4 8x8's in the block */
1758 for(i = 0; i < 16; i++)
1759 {
1760 pu1_src = pu1_inp + ((i & 0x3) << 3) + ((i >> 2) * inp_stride * 8);
1761
1762 pu1_pred = pu1_ref + ((i & 0x3) << 3) + ((i >> 2) * ref_stride * 8);
1763
1764 ai4_satd_8x8[i] = ps_prms->ps_cmn_utils_optimised_function_list->pf_HAD_8x8_8bit(
1765 pu1_src, inp_stride, pu1_pred, ref_stride, NULL, 1);
1766 }
1767
1768 /* Modified to cost calculation using only 8x8 SATD for 32x32*/
1769 ai4_satd_16x16[0] = ai4_satd_8x8[0] + ai4_satd_8x8[1] + ai4_satd_8x8[4] + ai4_satd_8x8[5];
1770 ai4_satd_16x16[1] = ai4_satd_8x8[2] + ai4_satd_8x8[3] + ai4_satd_8x8[6] + ai4_satd_8x8[7];
1771 ai4_satd_16x16[2] = ai4_satd_8x8[8] + ai4_satd_8x8[9] + ai4_satd_8x8[12] + ai4_satd_8x8[13];
1772 ai4_satd_16x16[3] = ai4_satd_8x8[10] + ai4_satd_8x8[11] + ai4_satd_8x8[14] + ai4_satd_8x8[15];
1773
1774 /* Update 32x32 SATD */
1775 pi4_sad_grid[PART_ID_2Nx2N] =
1776 ai4_satd_16x16[0] + ai4_satd_16x16[1] + ai4_satd_16x16[2] + ai4_satd_16x16[3];
1777
1778 /* Update 16x16 SATDs */
1779 pi4_sad_grid[PART_ID_NxN_TL] = ai4_satd_16x16[0];
1780 pi4_sad_grid[PART_ID_NxN_TR] = ai4_satd_16x16[1];
1781 pi4_sad_grid[PART_ID_NxN_BL] = ai4_satd_16x16[2];
1782 pi4_sad_grid[PART_ID_NxN_BR] = ai4_satd_16x16[3];
1783
1784 /* Update 16x32 / 32x16 SATDs */
1785 pi4_sad_grid[PART_ID_Nx2N_L] = ai4_satd_16x16[0] + ai4_satd_16x16[2];
1786 pi4_sad_grid[PART_ID_Nx2N_R] = ai4_satd_16x16[1] + ai4_satd_16x16[3];
1787 pi4_sad_grid[PART_ID_2NxN_T] = ai4_satd_16x16[0] + ai4_satd_16x16[1];
1788 pi4_sad_grid[PART_ID_2NxN_B] = ai4_satd_16x16[2] + ai4_satd_16x16[3];
1789
1790 /* Update AMP SATDs 32x24,32x8, 24x32,8x32 */
1791 pi4_sad_grid[PART_ID_nLx2N_L] =
1792 ai4_satd_8x8[0] + ai4_satd_8x8[4] + ai4_satd_8x8[8] + ai4_satd_8x8[12];
1793
1794 pi4_sad_grid[PART_ID_nLx2N_R] = ai4_satd_8x8[1] + ai4_satd_8x8[5] + ai4_satd_8x8[9] +
1795 ai4_satd_8x8[13] + pi4_sad_grid[PART_ID_Nx2N_R];
1796
1797 pi4_sad_grid[PART_ID_nRx2N_L] = ai4_satd_8x8[2] + ai4_satd_8x8[6] + ai4_satd_8x8[10] +
1798 ai4_satd_8x8[14] + pi4_sad_grid[PART_ID_Nx2N_L];
1799
1800 pi4_sad_grid[PART_ID_nRx2N_R] =
1801 ai4_satd_8x8[3] + ai4_satd_8x8[7] + ai4_satd_8x8[11] + ai4_satd_8x8[15];
1802
1803 pi4_sad_grid[PART_ID_2NxnU_T] =
1804 ai4_satd_8x8[0] + ai4_satd_8x8[1] + ai4_satd_8x8[2] + ai4_satd_8x8[3];
1805
1806 pi4_sad_grid[PART_ID_2NxnU_B] = ai4_satd_8x8[4] + ai4_satd_8x8[5] + ai4_satd_8x8[6] +
1807 ai4_satd_8x8[7] + pi4_sad_grid[PART_ID_2NxN_B];
1808
1809 pi4_sad_grid[PART_ID_2NxnD_T] = ai4_satd_8x8[8] + ai4_satd_8x8[9] + ai4_satd_8x8[10] +
1810 ai4_satd_8x8[11] + pi4_sad_grid[PART_ID_2NxN_T];
1811
1812 pi4_sad_grid[PART_ID_2NxnD_B] =
1813 ai4_satd_8x8[12] + ai4_satd_8x8[13] + ai4_satd_8x8[14] + ai4_satd_8x8[15];
1814 }
1815
hme_evalsatd_pt_pu_32x32_tu_rec(err_prms_t * ps_prms,WORD32 lambda,WORD32 lambda_q_shift,WORD32 i4_frm_qstep,me_func_selector_t * ps_func_selector)1816 WORD32 hme_evalsatd_pt_pu_32x32_tu_rec(
1817 err_prms_t *ps_prms,
1818 WORD32 lambda,
1819 WORD32 lambda_q_shift,
1820 WORD32 i4_frm_qstep,
1821 me_func_selector_t *ps_func_selector)
1822 {
1823 S32 ai4_satd_4x4[64]; /* num 4x4s in a 32x32 */
1824 S32 ai4_satd_8x8[16]; /* num 8x8s in a 32x32 */
1825 S32 ai4_tu_split_8x8[16];
1826 S32 ai4_satd_16x16[4]; /* num 16x16 in a 32x32 */
1827 S32 ai4_tu_split_16x16[4];
1828 S32 i4_satd_32x32;
1829
1830 S32 ai4_tu_early_cbf_8x8[16];
1831 S32 ai4_tu_early_cbf_16x16[4];
1832 S32 early_cbf_flag;
1833
1834 S16 *pi2_had_out;
1835
1836 /* Initialize array of ptrs to hold partial SATDs at all levels of 16x16 */
1837 S32 *api4_satd_pu[HAD_32x32 + 1];
1838 S32 *api4_tu_split[HAD_32x32 + 1];
1839 S32 *api4_tu_early_cbf[HAD_32x32 + 1];
1840
1841 S32 *pi4_sad_grid = ps_prms->pi4_sad_grid;
1842 S32 *pi4_tu_split_flag = ps_prms->pi4_tu_split_flags;
1843 S32 *pi4_tu_early_cbf = ps_prms->pi4_tu_early_cbf;
1844
1845 S32 tu_split_flag = 0;
1846 S32 total_satd_cost = 0;
1847
1848 U08 *pu1_inp = ps_prms->pu1_inp;
1849 U08 *pu1_ref = ps_prms->pu1_ref;
1850
1851 S32 inp_stride = ps_prms->i4_inp_stride;
1852 S32 ref_stride = ps_prms->i4_ref_stride;
1853
1854 /* Initialize tu_split_cost to "0" */
1855 ps_prms->i4_tu_split_cost = 0;
1856
1857 pi2_had_out = (S16 *)ps_prms->pu1_wkg_mem;
1858
1859 api4_satd_pu[HAD_4x4] = &ai4_satd_4x4[0];
1860 api4_satd_pu[HAD_8x8] = &ai4_satd_8x8[0];
1861 api4_satd_pu[HAD_16x16] = &ai4_satd_16x16[0];
1862 api4_satd_pu[HAD_32x32] = &i4_satd_32x32;
1863
1864 api4_tu_split[HAD_4x4] = NULL;
1865 api4_tu_split[HAD_8x8] = &ai4_tu_split_8x8[0];
1866 api4_tu_split[HAD_16x16] = &ai4_tu_split_16x16[0];
1867 api4_tu_split[HAD_32x32] = &tu_split_flag;
1868
1869 api4_tu_early_cbf[HAD_4x4] = NULL;
1870 api4_tu_early_cbf[HAD_8x8] = &ai4_tu_early_cbf_8x8[0];
1871 api4_tu_early_cbf[HAD_16x16] = &ai4_tu_early_cbf_16x16[0];
1872 api4_tu_early_cbf[HAD_32x32] = &early_cbf_flag;
1873
1874 /* Call recursive 32x32 HAD module; updates satds for 4x4, 8x8, 16x16 and 32x32 */
1875 ihevce_had_32x32_r(
1876 pu1_inp,
1877 inp_stride,
1878 pu1_ref,
1879 ref_stride,
1880 pi2_had_out,
1881 32,
1882 api4_satd_pu,
1883 api4_tu_split,
1884 api4_tu_early_cbf,
1885 0,
1886 8,
1887 lambda,
1888 lambda_q_shift,
1889 i4_frm_qstep,
1890 0,
1891 ps_prms->u1_max_tr_depth,
1892 ps_prms->u1_max_tr_size,
1893 &(ps_prms->i4_tu_split_cost),
1894 ps_func_selector);
1895
1896 total_satd_cost = i4_satd_32x32;
1897
1898 /*The structure of the TU_SPLIT flag for the current 32x32 is as follows
1899 TL_16x16 - 5bits (4 for child and LSBit for 16x16 split)
1900 TR_16x16 - 5bits (4 for child and LSBit for 16x16 split)
1901 BL_16x16 - 5bits (4 for child and LSBit for 16x16 split)
1902 BR_16x16 - 5bits (4 for child and LSBit for 16x16 split)
1903 32x32_split - 1bit (LSBit)
1904
1905 TU_SPLIT : (TL_16x16)_(TR_16x16)_(BL_16x16)_(BR_16x16)_32x32_split (21bits)*/
1906
1907 pi4_sad_grid[PART_ID_2Nx2N] = total_satd_cost;
1908 pi4_tu_split_flag[PART_ID_2Nx2N] = tu_split_flag;
1909 pi4_tu_early_cbf[PART_ID_2Nx2N] = early_cbf_flag;
1910
1911 return total_satd_cost;
1912 }
1913
1914 /**
1915 ********************************************************************************
1916 * @fn S32 hme_evalsatd_pt_pu_64x64
1917 *
1918 * @brief Evaluates the SATD with partial updates for all the best partitions
1919 * of a 64x64 CU based on accumulated Hadamard 32x32 and 16x16 satds
1920 *
1921 * Note : 64x64 SATD does not do hadamard Transform using 32x32 hadamard
1922 * outputs but directly uses four 32x32 SATD and 16 16x16 SATDS as
1923 * TU size of 64 is not supported in HEVC
1924 *
1925 * @param[inout] ps_prms: error prms containg current and ref ptr, strides,
1926 * pointer to sad grid of each partitions
1927 *
1928 * @return None
1929 ********************************************************************************
1930 */
1931
hme_evalsatd_pt_pu_64x64(err_prms_t * ps_prms)1932 void hme_evalsatd_pt_pu_64x64(err_prms_t *ps_prms)
1933 {
1934 //S32 ai4_satd_4x4[4][64]; /* num 4x4s in a 32x32 * num 32x32 in 64x64 */
1935 S32 ai4_satd_8x8[4][16]; /* num 8x8s in a 32x32 * num 32x32 in 64x64 */
1936 S32 ai4_satd_16x16[4][4]; /* num 16x16 in a 32x32* num 32x32 in 64x64 */
1937 S32 ai4_satd_32x32[4]; /* num 32x32 in 64x64 */
1938 // S16 ai2_had_out[32*32];
1939 S32 i, j;
1940
1941 // S32 ai4_tu_split_8x8[4][16];
1942 // S32 ai4_tu_split_16x16[4][4];
1943 // S32 ai4_tu_split_32x32[4];
1944
1945 /* Initialize array of ptrs to hold partial SATDs at all levels of 16x16 */
1946 S32 *api4_satd_pu[HAD_32x32 + 1];
1947 // S32 *api4_tu_split[HAD_32x32 + 1];
1948
1949 S32 *pi4_sad_grid = ps_prms->pi4_sad_grid;
1950
1951 U08 *pu1_inp = ps_prms->pu1_inp;
1952 U08 *pu1_ref = ps_prms->pu1_ref;
1953 U08 *pu1_src;
1954 U08 *pu1_pred;
1955
1956 S32 inp_stride = ps_prms->i4_inp_stride;
1957 S32 ref_stride = ps_prms->i4_ref_stride;
1958
1959 for(i = 0; i < 4; i++)
1960 {
1961 S32 blkx = (i & 0x1);
1962 S32 blky = (i >> 1);
1963 U08 *pu1_pi0, *pu1_pi1;
1964
1965 //api4_satd_pu[HAD_4x4] = &ai4_satd_4x4[i][0];
1966 api4_satd_pu[HAD_8x8] = &ai4_satd_8x8[i][0];
1967 api4_satd_pu[HAD_16x16] = &ai4_satd_16x16[i][0];
1968 api4_satd_pu[HAD_32x32] = &ai4_satd_32x32[i];
1969
1970 pu1_pi0 = pu1_inp + (blkx * 32) + (blky * 32 * inp_stride);
1971 pu1_pi1 = pu1_ref + (blkx * 32) + (blky * 32 * ref_stride);
1972
1973 /* 64x64 SATD is calculates as the sum of the 4 16x16's in the block */
1974 for(j = 0; j < 16; j++)
1975 {
1976 pu1_src = pu1_pi0 + ((j & 0x3) << 3) + ((j >> 2) * inp_stride * 8);
1977
1978 pu1_pred = pu1_pi1 + ((j & 0x3) << 3) + ((j >> 2) * ref_stride * 8);
1979
1980 ai4_satd_8x8[i][j] = ps_prms->ps_cmn_utils_optimised_function_list->pf_HAD_8x8_8bit(
1981 pu1_src, inp_stride, pu1_pred, ref_stride, NULL, 1);
1982 }
1983
1984 /* Modified to cost calculation using only 8x8 SATD for 32x32*/
1985 ai4_satd_16x16[i][0] =
1986 ai4_satd_8x8[i][0] + ai4_satd_8x8[i][1] + ai4_satd_8x8[i][4] + ai4_satd_8x8[i][5];
1987 ai4_satd_16x16[i][1] =
1988 ai4_satd_8x8[i][2] + ai4_satd_8x8[i][3] + ai4_satd_8x8[i][6] + ai4_satd_8x8[i][7];
1989 ai4_satd_16x16[i][2] =
1990 ai4_satd_8x8[i][8] + ai4_satd_8x8[i][9] + ai4_satd_8x8[i][12] + ai4_satd_8x8[i][13];
1991 ai4_satd_16x16[i][3] =
1992 ai4_satd_8x8[i][10] + ai4_satd_8x8[i][11] + ai4_satd_8x8[i][14] + ai4_satd_8x8[i][15];
1993 }
1994
1995 /* Modified to cost calculation using only 8x8 SATD for 32x32*/
1996
1997 ai4_satd_32x32[0] =
1998 ai4_satd_16x16[0][0] + ai4_satd_16x16[0][1] + ai4_satd_16x16[0][2] + ai4_satd_16x16[0][3];
1999 ai4_satd_32x32[1] =
2000 ai4_satd_16x16[1][0] + ai4_satd_16x16[1][1] + ai4_satd_16x16[1][2] + ai4_satd_16x16[1][3];
2001 ai4_satd_32x32[2] =
2002 ai4_satd_16x16[2][0] + ai4_satd_16x16[2][1] + ai4_satd_16x16[2][2] + ai4_satd_16x16[2][3];
2003 ai4_satd_32x32[3] =
2004 ai4_satd_16x16[3][0] + ai4_satd_16x16[3][1] + ai4_satd_16x16[3][2] + ai4_satd_16x16[3][3];
2005
2006 /* Update 64x64 SATDs */
2007 pi4_sad_grid[PART_ID_2Nx2N] =
2008 ai4_satd_32x32[0] + ai4_satd_32x32[1] + ai4_satd_32x32[2] + ai4_satd_32x32[3];
2009
2010 /* Update 32x32 SATDs */
2011 pi4_sad_grid[PART_ID_NxN_TL] = ai4_satd_32x32[0];
2012 pi4_sad_grid[PART_ID_NxN_TR] = ai4_satd_32x32[1];
2013 pi4_sad_grid[PART_ID_NxN_BL] = ai4_satd_32x32[2];
2014 pi4_sad_grid[PART_ID_NxN_BR] = ai4_satd_32x32[3];
2015
2016 /* Update 32x64 / 64x32 SATDs */
2017 pi4_sad_grid[PART_ID_Nx2N_L] = ai4_satd_32x32[0] + ai4_satd_32x32[2];
2018 pi4_sad_grid[PART_ID_Nx2N_R] = ai4_satd_32x32[1] + ai4_satd_32x32[3];
2019 pi4_sad_grid[PART_ID_2NxN_T] = ai4_satd_32x32[0] + ai4_satd_32x32[1];
2020 pi4_sad_grid[PART_ID_2NxN_B] = ai4_satd_32x32[2] + ai4_satd_32x32[3];
2021
2022 /* Update AMP SATDs 64x48,64x16, 48x64,16x64 */
2023 pi4_sad_grid[PART_ID_nLx2N_L] =
2024 ai4_satd_16x16[0][0] + ai4_satd_16x16[0][2] + ai4_satd_16x16[2][0] + ai4_satd_16x16[2][2];
2025
2026 pi4_sad_grid[PART_ID_nLx2N_R] = ai4_satd_16x16[0][1] + ai4_satd_16x16[0][3] +
2027 ai4_satd_16x16[2][1] + ai4_satd_16x16[2][3] +
2028 pi4_sad_grid[PART_ID_Nx2N_R];
2029
2030 pi4_sad_grid[PART_ID_nRx2N_L] = ai4_satd_16x16[1][0] + ai4_satd_16x16[1][2] +
2031 ai4_satd_16x16[3][0] + ai4_satd_16x16[3][2] +
2032 pi4_sad_grid[PART_ID_Nx2N_L];
2033
2034 pi4_sad_grid[PART_ID_nRx2N_R] =
2035 ai4_satd_16x16[1][1] + ai4_satd_16x16[1][3] + ai4_satd_16x16[3][1] + ai4_satd_16x16[3][3];
2036
2037 pi4_sad_grid[PART_ID_2NxnU_T] =
2038 ai4_satd_16x16[0][0] + ai4_satd_16x16[0][1] + ai4_satd_16x16[1][0] + ai4_satd_16x16[1][1];
2039
2040 pi4_sad_grid[PART_ID_2NxnU_B] = ai4_satd_16x16[0][2] + ai4_satd_16x16[0][3] +
2041 ai4_satd_16x16[1][2] + ai4_satd_16x16[1][3] +
2042 pi4_sad_grid[PART_ID_2NxN_B];
2043
2044 pi4_sad_grid[PART_ID_2NxnD_T] = ai4_satd_16x16[2][0] + ai4_satd_16x16[2][1] +
2045 ai4_satd_16x16[3][0] + ai4_satd_16x16[3][1] +
2046 pi4_sad_grid[PART_ID_2NxN_T];
2047
2048 pi4_sad_grid[PART_ID_2NxnD_B] =
2049 ai4_satd_16x16[2][2] + ai4_satd_16x16[2][3] + ai4_satd_16x16[3][2] + ai4_satd_16x16[3][3];
2050 }
2051
hme_evalsatd_pt_pu_64x64_tu_rec(err_prms_t * ps_prms,WORD32 lambda,WORD32 lambda_q_shift,WORD32 i4_frm_qstep,me_func_selector_t * ps_func_selector)2052 WORD32 hme_evalsatd_pt_pu_64x64_tu_rec(
2053 err_prms_t *ps_prms,
2054 WORD32 lambda,
2055 WORD32 lambda_q_shift,
2056 WORD32 i4_frm_qstep,
2057 me_func_selector_t *ps_func_selector)
2058 {
2059 S32 ai4_satd_4x4[64]; /* num 4x4s in a 32x32 * num 32x32 in 64x64 */
2060 S32 ai4_satd_8x8[16]; /* num 8x8s in a 32x32 * num 32x32 in 64x64 */
2061 S32 ai4_satd_16x16[4]; /* num 16x16 in a 32x32* num 32x32 in 64x64 */
2062 S32 ai4_satd_32x32[4]; /* num 32x32 in 64x64 */
2063
2064 S32 ai4_tu_split_8x8[16];
2065 S32 ai4_tu_split_16x16[4];
2066
2067 S32 ai4_tu_early_cbf_8x8[16];
2068 S32 ai4_tu_early_cbf_16x16[4];
2069
2070 S16 *pi2_had_out;
2071 S32 i;
2072
2073 /* Initialize array of ptrs to hold partial SATDs at all levels of 16x16 */
2074 S32 *api4_satd_pu[HAD_32x32 + 1];
2075 S32 *api4_tu_split[HAD_32x32 + 1];
2076 S32 *api4_tu_early_cbf[HAD_32x32 + 1];
2077
2078 S32 *pi4_sad_grid = ps_prms->pi4_sad_grid;
2079
2080 S32 tu_split_flag = 0;
2081 S32 total_satd_cost = 0;
2082
2083 U08 *pu1_inp = ps_prms->pu1_inp;
2084 U08 *pu1_ref = ps_prms->pu1_ref;
2085
2086 S32 inp_stride = ps_prms->i4_inp_stride;
2087 S32 ref_stride = ps_prms->i4_ref_stride;
2088
2089 /* Initialize tu_split_cost to "0" */
2090 ps_prms->i4_tu_split_cost = 0;
2091
2092 pi2_had_out = (S16 *)ps_prms->pu1_wkg_mem;
2093
2094 for(i = 0; i < 4; i++)
2095 {
2096 S32 blkx = (i & 0x1);
2097 S32 blky = (i >> 1);
2098 U08 *pu1_pi0, *pu1_pi1;
2099 tu_split_flag = 0;
2100
2101 api4_satd_pu[HAD_4x4] = &ai4_satd_4x4[0];
2102 api4_satd_pu[HAD_8x8] = &ai4_satd_8x8[0];
2103 api4_satd_pu[HAD_16x16] = &ai4_satd_16x16[0];
2104 api4_satd_pu[HAD_32x32] = &ai4_satd_32x32[i];
2105
2106 api4_tu_split[HAD_4x4] = NULL;
2107 api4_tu_split[HAD_8x8] = &ai4_tu_split_8x8[0];
2108 api4_tu_split[HAD_16x16] = &ai4_tu_split_16x16[0];
2109 api4_tu_split[HAD_32x32] = &ps_prms->pi4_tu_split_flags[i];
2110
2111 api4_tu_early_cbf[HAD_4x4] = NULL;
2112 api4_tu_early_cbf[HAD_8x8] = &ai4_tu_early_cbf_8x8[0];
2113 api4_tu_early_cbf[HAD_16x16] = &ai4_tu_early_cbf_16x16[0];
2114 api4_tu_early_cbf[HAD_32x32] = &ps_prms->pi4_tu_early_cbf[i];
2115
2116 pu1_pi0 = pu1_inp + (blkx * 32) + (blky * 32 * inp_stride);
2117 pu1_pi1 = pu1_ref + (blkx * 32) + (blky * 32 * ref_stride);
2118
2119 /* Call recursive 32x32 HAD module; updates satds for 4x4, 8x8, 16x16 and 32x32 */
2120 ihevce_had_32x32_r(
2121 pu1_pi0,
2122 inp_stride,
2123 pu1_pi1,
2124 ref_stride,
2125 pi2_had_out,
2126 32,
2127 api4_satd_pu,
2128 api4_tu_split,
2129 api4_tu_early_cbf,
2130 0,
2131 8,
2132 lambda,
2133 lambda_q_shift,
2134 i4_frm_qstep,
2135 1,
2136 ps_prms->u1_max_tr_depth,
2137 ps_prms->u1_max_tr_size,
2138 &(ps_prms->i4_tu_split_cost),
2139 ps_func_selector);
2140 }
2141
2142 total_satd_cost = ai4_satd_32x32[0] + ai4_satd_32x32[1] + ai4_satd_32x32[2] + ai4_satd_32x32[3];
2143
2144 /* Update 64x64 SATDs */
2145 pi4_sad_grid[PART_ID_2Nx2N] =
2146 ai4_satd_32x32[0] + ai4_satd_32x32[1] + ai4_satd_32x32[2] + ai4_satd_32x32[3];
2147
2148 return total_satd_cost;
2149 }
2150
2151 /**
2152 ********************************************************************************
2153 * @fn void hme_subpel_refine_search_node(search_node_t *ps_search_node,
2154 * hme_subpel_prms_t *ps_prms,
2155 * layer_ctxt_t *ps_curr_layer,
2156 * BLK_SIZE_T e_blk_size,
2157 * S32 x_off,
2158 * S32 y_off)
2159 *
2160 * @brief Refines a given partition within a CU
2161 *
2162 * @param[in,out] ps_search_node: supplies starting mv and also ref id.
2163 * updated with the accurate subpel mv
2164 *
2165 * @param[in] ps_prms: subpel prms input to this function
2166 *
2167 * @param[in] ps_curr_layer : layer context
2168 *
2169 * @param[in] e_blk_size : Block size enumeration
2170 *
2171 * @param[in] x_off : x offset of the partition w.r.t. pic start
2172 *
2173 * @param[in] y_off : y offset of the partition w.r.t. pic start
2174 *
2175 * @return None
2176 ********************************************************************************
2177 */
2178
hme_get_calc_sad_and_result_subpel_fxn(me_func_selector_t * ps_func_selector,ihevce_me_optimised_function_list_t * ps_me_optimised_function_list,S32 i4_part_mask,U08 u1_use_satd,U08 u1_num_parts,U08 u1_num_results)2179 static __inline PF_SAD_RESULT_FXN_T hme_get_calc_sad_and_result_subpel_fxn(
2180 me_func_selector_t *ps_func_selector,
2181 ihevce_me_optimised_function_list_t *ps_me_optimised_function_list,
2182 S32 i4_part_mask,
2183 U08 u1_use_satd,
2184 U08 u1_num_parts,
2185 U08 u1_num_results)
2186 {
2187 PF_SAD_RESULT_FXN_T pf_err_compute;
2188
2189 ASSERT((1 == u1_num_results) || (2 == u1_num_results));
2190
2191 if(1 == u1_num_results)
2192 {
2193 if(u1_use_satd)
2194 {
2195 if(u1_num_parts == 1)
2196 {
2197 pf_err_compute =
2198 ps_func_selector->pf_evalsatd_update_1_best_result_pt_pu_16x16_num_part_eq_1;
2199 }
2200 else if((u1_num_parts > 1) && (u1_num_parts <= 8))
2201 {
2202 pf_err_compute =
2203 ps_func_selector->pf_evalsatd_update_1_best_result_pt_pu_16x16_num_part_lt_9;
2204 }
2205 else
2206 {
2207 pf_err_compute =
2208 ps_func_selector->pf_evalsatd_update_1_best_result_pt_pu_16x16_num_part_lt_17;
2209 }
2210 }
2211 else
2212 {
2213 if(u1_num_parts == 1)
2214 {
2215 pf_err_compute = ps_me_optimised_function_list
2216 ->pf_calc_sad_and_1_best_result_subpel_num_part_eq_1;
2217 }
2218 else if(((i4_part_mask & ENABLE_SQUARE_PARTS) != 0) && (u1_num_parts == 5))
2219 {
2220 pf_err_compute =
2221 ps_me_optimised_function_list->pf_calc_sad_and_1_best_result_subpel_square_parts;
2222 }
2223 else if((u1_num_parts > 1) && (u1_num_parts <= 8))
2224 {
2225 pf_err_compute = ps_me_optimised_function_list
2226 ->pf_calc_sad_and_1_best_result_subpel_num_part_lt_9;
2227 }
2228 else
2229 {
2230 pf_err_compute = ps_me_optimised_function_list
2231 ->pf_calc_sad_and_1_best_result_subpel_num_part_lt_17;
2232 }
2233 }
2234 }
2235 else
2236 {
2237 if(u1_use_satd)
2238 {
2239 if(u1_num_parts == 1)
2240 {
2241 pf_err_compute =
2242 ps_func_selector->pf_evalsatd_update_2_best_results_pt_pu_16x16_num_part_eq_1;
2243 }
2244 else if((u1_num_parts > 1) && (u1_num_parts <= 8))
2245 {
2246 pf_err_compute =
2247 ps_func_selector->pf_evalsatd_update_2_best_results_pt_pu_16x16_num_part_lt_9;
2248 }
2249 else
2250 {
2251 pf_err_compute =
2252 ps_func_selector->pf_evalsatd_update_2_best_results_pt_pu_16x16_num_part_lt_17;
2253 }
2254 }
2255 else
2256 {
2257 if(u1_num_parts == 1)
2258 {
2259 pf_err_compute = ps_me_optimised_function_list
2260 ->pf_calc_sad_and_2_best_results_subpel_num_part_eq_1;
2261 }
2262 else if(((i4_part_mask & ENABLE_SQUARE_PARTS) != 0) && (u1_num_parts == 5))
2263 {
2264 pf_err_compute = ps_me_optimised_function_list
2265 ->pf_calc_sad_and_2_best_results_subpel_square_parts;
2266 }
2267 else if((u1_num_parts > 1) && (u1_num_parts <= 8))
2268 {
2269 pf_err_compute = ps_me_optimised_function_list
2270 ->pf_calc_sad_and_2_best_results_subpel_num_part_lt_9;
2271 }
2272 else
2273 {
2274 pf_err_compute = ps_me_optimised_function_list
2275 ->pf_calc_sad_and_2_best_results_subpel_num_part_lt_17;
2276 }
2277 }
2278 }
2279
2280 return pf_err_compute;
2281 }
2282
2283 #if DIAMOND_GRID == 1
hme_subpel_refine_search_node_high_speed(search_node_t * ps_search_node,hme_subpel_prms_t * ps_prms,layer_ctxt_t * ps_curr_layer,BLK_SIZE_T e_blk_size,S32 x_off,S32 y_off,search_results_t * ps_search_results,S32 pred_lx,S32 i4_part_mask,S32 * pi4_valid_part_ids,S32 search_idx,subpel_dedup_enabler_t * ps_dedup_enabler,me_func_selector_t * ps_func_selector,ihevce_me_optimised_function_list_t * ps_me_optimised_function_list)2284 S32 hme_subpel_refine_search_node_high_speed(
2285 search_node_t *ps_search_node,
2286 hme_subpel_prms_t *ps_prms,
2287 layer_ctxt_t *ps_curr_layer,
2288 BLK_SIZE_T e_blk_size,
2289 S32 x_off,
2290 S32 y_off,
2291 search_results_t *ps_search_results,
2292 S32 pred_lx,
2293 S32 i4_part_mask,
2294 S32 *pi4_valid_part_ids,
2295 S32 search_idx,
2296 subpel_dedup_enabler_t *ps_dedup_enabler,
2297 me_func_selector_t *ps_func_selector,
2298 ihevce_me_optimised_function_list_t *ps_me_optimised_function_list)
2299 {
2300 S32 i4_num_hpel_refine, i4_num_qpel_refine;
2301 S32 i4_offset, i4_grid_mask;
2302 S08 i1_ref_idx;
2303 S32 i4_blk_wd, i4_blk_ht;
2304 S32 i4_ref_stride, i4_i;
2305 pred_ctxt_t *ps_pred_ctxt = &ps_search_results->as_pred_ctxt[pred_lx];
2306 result_upd_prms_t s_result_prms;
2307 search_node_t s_temp_search_node;
2308
2309 /*************************************************************************/
2310 /* Tracks current MV with the fractional component. */
2311 /*************************************************************************/
2312 S32 i4_mv_x, i4_mv_y;
2313 S32 i4_frac_x, i4_frac_y;
2314
2315 /*************************************************************************/
2316 /* Function pointer for SAD/SATD, array and prms structure to pass to */
2317 /* This function */
2318 /*************************************************************************/
2319 PF_SAD_RESULT_FXN_T pf_err_compute;
2320
2321 S32 ai4_sad_grid[17], i4_tot_cost;
2322 err_prms_t s_err_prms;
2323
2324 /*************************************************************************/
2325 /* Allowed MV RANGE */
2326 /*************************************************************************/
2327 range_prms_t *ps_range_prms;
2328
2329 /*************************************************************************/
2330 /* stores min id in grid with associated min cost. */
2331 /*************************************************************************/
2332 S32 i4_min_cost, i4_min_sad;
2333 GRID_PT_T e_min_id;
2334
2335 PF_INTERP_FXN_T pf_qpel_interp;
2336 /*************************************************************************/
2337 /* For hpel and qpel we move in diamonds and hence each point in the */
2338 /* diamond will belong to a completely different plane. To simplify the */
2339 /* look up of the ref ptr, we declare a 2x2 array of ref ptrs for the */
2340 /* hpel planes which are interpolated during recon. */
2341 /*************************************************************************/
2342 U08 *apu1_hpel_ref[4], *pu1_ref;
2343
2344 interp_prms_t s_interp_prms;
2345
2346 /*************************************************************************/
2347 /* Maintains the minimum id of interpolated buffers, and the pointer that*/
2348 /* points to the corresponding predicted buf with its stride. */
2349 /* Note that the pointer cannot be derived just from the id, since the */
2350 /* pointer may also point to the hpel buffer (in case we request interp */
2351 /* of a hpel pt, which already exists in the recon hpel planes) */
2352 /*************************************************************************/
2353 U08 *pu1_final_out;
2354 S32 i4_final_out_stride;
2355 S32 part_id;
2356 S32 check_for_duplicate = 0;
2357
2358 subpel_refine_ctxt_t *ps_subpel_refine_ctxt = ps_prms->ps_subpel_refine_ctxt;
2359
2360 S32 mvx_qpel;
2361 S32 mvy_qpel;
2362
2363 pf_err_compute = hme_get_calc_sad_and_result_subpel_fxn(
2364 ps_func_selector,
2365 ps_me_optimised_function_list,
2366 i4_part_mask,
2367 ps_prms->i4_use_satd,
2368 ps_subpel_refine_ctxt->i4_num_valid_parts,
2369 ps_search_results->u1_num_results_per_part);
2370
2371 i4_num_hpel_refine = ps_prms->i4_num_steps_hpel_refine;
2372 i4_num_qpel_refine = ps_prms->i4_num_steps_qpel_refine;
2373
2374 /* Prediction contet should now deal with qpel units */
2375 HME_SET_MVPRED_RES(ps_pred_ctxt, MV_RES_QPEL);
2376
2377 /* Buffer allocation for subpel */
2378 /* Current design is that there may be many partitions and different mvs */
2379 /* that attempt subpel refinemnt. While there is possibility of overlap, the */
2380 /* hashing to detect and avoid overlap may be very complex. So, currently, */
2381 /* the only thing done is to store the eventual predicted buffer with every */
2382 /* ctb node that holds the result of hte best subpel search */
2383
2384 /* Compute the base pointer for input, interpolated buffers */
2385 /* The base pointers point as follows: */
2386 /* fx fy : 0, 0 :: fx, hy : 0, 0.5, hx, fy: 0.5, 0, hx, fy: 0.5, 0.5 */
2387 /* To these, we need to add the offset of the current node */
2388 i4_ref_stride = ps_curr_layer->i4_rec_stride;
2389 i4_offset = x_off + (y_off * i4_ref_stride);
2390 i1_ref_idx = ps_search_node->i1_ref_idx;
2391
2392 apu1_hpel_ref[0] = ps_curr_layer->ppu1_list_rec_fxfy[i1_ref_idx] + i4_offset;
2393 apu1_hpel_ref[1] = ps_curr_layer->ppu1_list_rec_hxfy[i1_ref_idx] + i4_offset;
2394 apu1_hpel_ref[2] = ps_curr_layer->ppu1_list_rec_fxhy[i1_ref_idx] + i4_offset;
2395 apu1_hpel_ref[3] = ps_curr_layer->ppu1_list_rec_hxhy[i1_ref_idx] + i4_offset;
2396
2397 /* Initialize result params used for partition update */
2398 s_result_prms.pf_mv_cost_compute = NULL;
2399 s_result_prms.ps_search_results = ps_search_results;
2400 s_result_prms.pi4_valid_part_ids = pi4_valid_part_ids;
2401 s_result_prms.i1_ref_idx = ps_search_node->i1_ref_idx;
2402 s_result_prms.u1_pred_lx = search_idx;
2403 s_result_prms.i4_part_mask = i4_part_mask;
2404 s_result_prms.ps_search_node_base = ps_search_node;
2405 s_result_prms.pi4_sad_grid = &ai4_sad_grid[0];
2406 s_result_prms.i4_grid_mask = 1;
2407 s_result_prms.ps_search_node = &s_temp_search_node;
2408 s_temp_search_node.i1_ref_idx = ps_search_node->i1_ref_idx;
2409
2410 /* convert to hpel units */
2411 i4_mv_x = ps_search_node->s_mv.i2_mvx >> 1;
2412 i4_mv_y = ps_search_node->s_mv.i2_mvy >> 1;
2413
2414 /* for first pt, we compute at all locations in the grid, 4 + 1 centre */
2415 ps_range_prms = ps_prms->aps_mv_range_qpel[i1_ref_idx];
2416 i4_grid_mask = (GRID_DIAMOND_ENABLE_ALL);
2417 i4_grid_mask &= hme_clamp_grid_by_mvrange(ps_search_node, 2, ps_range_prms);
2418
2419 i4_min_cost = MAX_32BIT_VAL;
2420 i4_min_sad = MAX_32BIT_VAL;
2421
2422 /*************************************************************************/
2423 /* Prepare the input params to SAD/SATD function. Note that input is */
2424 /* passed from the calling funcion since it may be I (normal subpel */
2425 /* refinement) or 2I - P0 in case of bidirect subpel refinement. */
2426 /* Both cases are handled here. */
2427 /*************************************************************************/
2428 s_err_prms.pu1_inp = (U08 *)ps_prms->pv_inp;
2429 s_err_prms.i4_inp_stride = ps_prms->i4_inp_stride;
2430 s_err_prms.i4_ref_stride = i4_ref_stride;
2431 s_err_prms.i4_part_mask = (ENABLE_2Nx2N);
2432 s_err_prms.i4_grid_mask = 1;
2433 s_err_prms.pi4_sad_grid = &ai4_sad_grid[0];
2434 s_err_prms.i4_blk_wd = i4_blk_wd = gau1_blk_size_to_wd[e_blk_size];
2435 s_err_prms.i4_blk_ht = i4_blk_ht = gau1_blk_size_to_ht[e_blk_size];
2436
2437 s_result_prms.ps_subpel_refine_ctxt = ps_subpel_refine_ctxt;
2438
2439 part_id = ps_search_node->u1_part_id;
2440 for(i4_i = 0; i4_i < i4_num_hpel_refine; i4_i++)
2441 {
2442 e_min_id = PT_C;
2443
2444 mvx_qpel = i4_mv_x << 1;
2445 mvy_qpel = i4_mv_y << 1;
2446
2447 /* Central pt */
2448 if(i4_grid_mask & BIT_EN(PT_C))
2449 {
2450 //ps_search_node->i2_mv_x = (S16)i4_mv_x;
2451 //ps_search_node->i2_mv_x = (S16)i4_mv_y;
2452 /* central pt is i4_mv_x, i4_mv_y */
2453 CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
2454 ps_dedup_enabler, 1, mvx_qpel, mvy_qpel, check_for_duplicate);
2455
2456 i4_frac_x = i4_mv_x & 1;
2457 i4_frac_y = i4_mv_y & 1;
2458 pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
2459 s_err_prms.pu1_ref = pu1_ref + (i4_mv_x >> 1) + ((i4_mv_y >> 1) * i4_ref_stride);
2460
2461 /* Update the mv's with the current candt motion vectors */
2462 s_result_prms.i2_mv_x = mvx_qpel;
2463 s_result_prms.i2_mv_y = mvy_qpel;
2464 s_temp_search_node.s_mv.i2_mvx = mvx_qpel;
2465 s_temp_search_node.s_mv.i2_mvy = mvy_qpel;
2466
2467 pf_err_compute(&s_err_prms, &s_result_prms);
2468
2469 i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
2470 if(i4_tot_cost < i4_min_cost)
2471 {
2472 i4_min_cost = i4_tot_cost;
2473 i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
2474 e_min_id = PT_C;
2475 pu1_final_out = s_err_prms.pu1_ref;
2476 }
2477 }
2478
2479 /* left pt */
2480 if(i4_grid_mask & BIT_EN(PT_L))
2481 {
2482 CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
2483 ps_dedup_enabler, 1, mvx_qpel - 2, mvy_qpel, check_for_duplicate);
2484
2485 if(!check_for_duplicate)
2486 {
2487 /* search node mv is stored in qpel units */
2488 ps_search_node->s_mv.i2_mvx = (S16)((i4_mv_x - 1) << 1);
2489 ps_search_node->s_mv.i2_mvy = (S16)(i4_mv_y << 1);
2490 /* central pt is i4_mv_x - 1, i4_mv_y */
2491 i4_frac_x = (i4_mv_x - 1) & 1; // same as (x-1)&1
2492 i4_frac_y = i4_mv_y & 1;
2493 pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
2494 s_err_prms.pu1_ref =
2495 pu1_ref + ((i4_mv_x - 1) >> 1) + ((i4_mv_y >> 1) * i4_ref_stride);
2496
2497 /* Update the mv's with the current candt motion vectors */
2498 s_result_prms.i2_mv_x = mvx_qpel - 2;
2499 s_result_prms.i2_mv_y = mvy_qpel;
2500 s_temp_search_node.s_mv.i2_mvx = mvx_qpel - 2;
2501 s_temp_search_node.s_mv.i2_mvy = mvy_qpel;
2502
2503 pf_err_compute(&s_err_prms, &s_result_prms);
2504 //hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
2505 i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
2506 if(i4_tot_cost < i4_min_cost)
2507 {
2508 i4_min_cost = i4_tot_cost;
2509 i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
2510 e_min_id = PT_L;
2511 pu1_final_out = s_err_prms.pu1_ref;
2512 }
2513 }
2514 }
2515 /* top pt */
2516 if(i4_grid_mask & BIT_EN(PT_T))
2517 {
2518 CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
2519 ps_dedup_enabler, 1, mvx_qpel, mvy_qpel - 2, check_for_duplicate);
2520
2521 if(!check_for_duplicate)
2522 {
2523 /* search node mv is stored in qpel units */
2524 ps_search_node->s_mv.i2_mvx = (S16)(i4_mv_x << 1);
2525 ps_search_node->s_mv.i2_mvy = (S16)((i4_mv_y - 1) << 1);
2526 /* top pt is i4_mv_x, i4_mv_y - 1 */
2527 i4_frac_x = i4_mv_x & 1;
2528 i4_frac_y = (i4_mv_y - 1) & 1;
2529 pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
2530 s_err_prms.pu1_ref =
2531 pu1_ref + (i4_mv_x >> 1) + (((i4_mv_y - 1) >> 1) * i4_ref_stride);
2532
2533 /* Update the mv's with the current candt motion vectors */
2534 s_result_prms.i2_mv_x = mvx_qpel;
2535 s_result_prms.i2_mv_y = mvy_qpel - 2;
2536 s_temp_search_node.s_mv.i2_mvx = mvx_qpel;
2537 s_temp_search_node.s_mv.i2_mvy = mvy_qpel - 2;
2538
2539 pf_err_compute(&s_err_prms, &s_result_prms);
2540 //hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
2541 i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
2542 if(i4_tot_cost < i4_min_cost)
2543 {
2544 i4_min_cost = i4_tot_cost;
2545 i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
2546 e_min_id = PT_T;
2547 pu1_final_out = s_err_prms.pu1_ref;
2548 }
2549 }
2550 }
2551 /* right pt */
2552 if(i4_grid_mask & BIT_EN(PT_R))
2553 {
2554 CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
2555 ps_dedup_enabler, num_unique_nodes, mvx_qpel + 2, mvy_qpel, check_for_duplicate);
2556 if(!check_for_duplicate)
2557 {
2558 /* search node mv is stored in qpel units */
2559 ps_search_node->s_mv.i2_mvx = (S16)((i4_mv_x + 1) << 1);
2560 ps_search_node->s_mv.i2_mvy = (S16)(i4_mv_y << 1);
2561 /* right pt is i4_mv_x + 1, i4_mv_y */
2562 i4_frac_x = (i4_mv_x + 1) & 1;
2563 i4_frac_y = i4_mv_y & 1;
2564
2565 pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
2566 s_err_prms.pu1_ref =
2567 pu1_ref + ((i4_mv_x + 1) >> 1) + ((i4_mv_y >> 1) * i4_ref_stride);
2568
2569 /* Update the mv's with the current candt motion vectors */
2570 s_result_prms.i2_mv_x = mvx_qpel + 2;
2571 s_result_prms.i2_mv_y = mvy_qpel;
2572 s_temp_search_node.s_mv.i2_mvx = mvx_qpel + 2;
2573 s_temp_search_node.s_mv.i2_mvy = mvy_qpel;
2574
2575 pf_err_compute(&s_err_prms, &s_result_prms);
2576 //hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
2577 i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
2578 if(i4_tot_cost < i4_min_cost)
2579 {
2580 i4_min_cost = i4_tot_cost;
2581 i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
2582 e_min_id = PT_R;
2583 pu1_final_out = s_err_prms.pu1_ref;
2584 }
2585 }
2586 }
2587 /* bottom pt */
2588 if(i4_grid_mask & BIT_EN(PT_B))
2589 {
2590 CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
2591 ps_dedup_enabler, num_unique_nodes, mvx_qpel, mvy_qpel + 2, check_for_duplicate);
2592 if(!check_for_duplicate)
2593 {
2594 /* search node mv is stored in qpel units */
2595 ps_search_node->s_mv.i2_mvx = ((S16)i4_mv_x << 1);
2596 ps_search_node->s_mv.i2_mvy = ((S16)(i4_mv_y + 1) << 1);
2597 i4_frac_x = i4_mv_x & 1;
2598 i4_frac_y = (i4_mv_y + 1) & 1;
2599 pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
2600 s_err_prms.pu1_ref =
2601 pu1_ref + (i4_mv_x >> 1) + (((i4_mv_y + 1) >> 1) * i4_ref_stride);
2602
2603 /* Update the mv's with the current candt motion vectors */
2604 s_result_prms.i2_mv_x = mvx_qpel;
2605 s_result_prms.i2_mv_y = mvy_qpel + 2;
2606 s_temp_search_node.s_mv.i2_mvx = mvx_qpel;
2607 s_temp_search_node.s_mv.i2_mvy = mvy_qpel + 2;
2608
2609 pf_err_compute(&s_err_prms, &s_result_prms);
2610 //hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
2611 i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
2612 if(i4_tot_cost < i4_min_cost)
2613 {
2614 i4_min_cost = i4_tot_cost;
2615 i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
2616 e_min_id = PT_B;
2617 pu1_final_out = s_err_prms.pu1_ref;
2618 }
2619 }
2620 }
2621 /* Early exit in case of central point */
2622 if(e_min_id == PT_C)
2623 break;
2624
2625 /*********************************************************************/
2626 /* Depending on the best result location, we may be able to skip */
2627 /* atleast two pts, centre pt and one more pt. E.g. if right pt is */
2628 /* the best result, the next iteration need not do centre, left pts */
2629 /*********************************************************************/
2630 i4_grid_mask = gai4_opt_grid_mask_diamond[e_min_id];
2631 i4_mv_x += gai1_grid_id_to_x[e_min_id];
2632 i4_mv_y += gai1_grid_id_to_y[e_min_id];
2633 ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x;
2634 ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y;
2635 i4_grid_mask &= hme_clamp_grid_by_mvrange(ps_search_node, 2, ps_range_prms);
2636 }
2637
2638 /* Convert to QPEL units */
2639 i4_mv_x <<= 1;
2640 i4_mv_y <<= 1;
2641
2642 ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x;
2643 ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y;
2644
2645 /* Exact interpolation or averaging chosen here */
2646 pf_qpel_interp = ps_prms->pf_qpel_interp;
2647
2648 /* Next QPEL ME */
2649 /* In this case, we have option of doing exact QPEL interpolation or avg */
2650 /*************************************************************************/
2651 /* x */
2652 /* A b C d */
2653 /* e f g h */
2654 /* I j K l */
2655 /* m n o p */
2656 /* Q r S t */
2657 /* */
2658 /* Approximate QPEL logic */
2659 /* b = avg(A,C) f = avg(I,C), g= avg(C,K) j=avg(I,K) */
2660 /* for any given pt, we can get all the information required about */
2661 /* the surrounding 4 pts. For example, given point C (0.5, 0) */
2662 /* surrounding pts info: */
2663 /* b : qpel offset: 1, 0, generated by averaging. buffer1: fpel buf */
2664 /* buffer 2: hxfy, offsets for both are 0, 0 */
2665 /* similarly for other pts the info can be gotten */
2666 /*************************************************************************/
2667 i4_grid_mask = GRID_DIAMOND_ENABLE_ALL ^ (BIT_EN(PT_C));
2668 i4_grid_mask &= hme_clamp_grid_by_mvrange(ps_search_node, 1, ps_range_prms);
2669
2670 /*************************************************************************/
2671 /* One time preparation of non changing interpolation params. These */
2672 /* include a set of ping pong result buf ptrs, input buf ptrs and some */
2673 /* working memory (not used though in case of averaging). */
2674 /*************************************************************************/
2675 s_interp_prms.ppu1_ref = &apu1_hpel_ref[0];
2676 s_interp_prms.i4_ref_stride = i4_ref_stride;
2677 s_interp_prms.i4_blk_wd = i4_blk_wd;
2678 s_interp_prms.i4_blk_ht = i4_blk_ht;
2679
2680 i4_final_out_stride = i4_ref_stride;
2681
2682 {
2683 U08 *pu1_mem;
2684 /*********************************************************************/
2685 /* Allocation of working memory for interpolated buffers. We maintain*/
2686 /* an intermediate working buffer, and 2 ping pong interpolated out */
2687 /* buffers, purpose of ping pong explained later below */
2688 /*********************************************************************/
2689 pu1_mem = ps_prms->pu1_wkg_mem;
2690 s_interp_prms.pu1_wkg_mem = pu1_mem;
2691
2692 //pu1_mem += (INTERP_INTERMED_BUF_SIZE);
2693 s_interp_prms.apu1_interp_out[0] = pu1_mem;
2694
2695 pu1_mem += (INTERP_OUT_BUF_SIZE);
2696 s_interp_prms.apu1_interp_out[1] = pu1_mem;
2697
2698 pu1_mem += (INTERP_OUT_BUF_SIZE);
2699 s_interp_prms.apu1_interp_out[2] = pu1_mem;
2700
2701 pu1_mem += (INTERP_OUT_BUF_SIZE);
2702 s_interp_prms.apu1_interp_out[3] = pu1_mem;
2703
2704 pu1_mem += (INTERP_OUT_BUF_SIZE);
2705 s_interp_prms.apu1_interp_out[4] = pu1_mem;
2706
2707 /*********************************************************************/
2708 /* Stride of interpolated output is just a function of blk width of */
2709 /* this partition and hence remains constant for this partition */
2710 /*********************************************************************/
2711 s_interp_prms.i4_out_stride = (i4_blk_wd);
2712 }
2713
2714 {
2715 UWORD8 *apu1_final[4];
2716 WORD32 ai4_ref_stride[4];
2717 /*************************************************************************/
2718 /* Ping pong design for interpolated buffers. We use a min id, which */
2719 /* tracks the id of the ppu1_interp_out that stores the best result. */
2720 /* When new interp to be done, it uses 1 - bes result id to do the interp*/
2721 /* min id is toggled when any new result becomes the best result. */
2722 /*************************************************************************/
2723
2724 for(i4_i = 0; i4_i < i4_num_qpel_refine; i4_i++)
2725 {
2726 e_min_id = PT_C;
2727
2728 mvx_qpel = i4_mv_x;
2729 mvy_qpel = i4_mv_y;
2730 hme_qpel_interp_comprehensive(
2731 &s_interp_prms,
2732 apu1_final,
2733 ai4_ref_stride,
2734 i4_mv_x,
2735 i4_mv_y,
2736 i4_grid_mask,
2737 ps_me_optimised_function_list);
2738 if(i4_grid_mask & BIT_EN(PT_L))
2739 {
2740 CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
2741 ps_dedup_enabler,
2742 num_unique_nodes,
2743 mvx_qpel - 1,
2744 mvy_qpel - 0,
2745 check_for_duplicate);
2746
2747 if(!check_for_duplicate)
2748 {
2749 ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x - 1;
2750 ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y;
2751
2752 s_err_prms.pu1_ref = apu1_final[0];
2753 s_err_prms.i4_ref_stride = ai4_ref_stride[0];
2754
2755 /* Update the mv's with the current candt motion vectors */
2756 s_result_prms.i2_mv_x = mvx_qpel - 1;
2757 s_result_prms.i2_mv_y = mvy_qpel;
2758 s_temp_search_node.s_mv.i2_mvx = mvx_qpel - 1;
2759 s_temp_search_node.s_mv.i2_mvy = mvy_qpel;
2760
2761 pf_err_compute(&s_err_prms, &s_result_prms);
2762 //hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
2763
2764 i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
2765 if(i4_tot_cost < i4_min_cost)
2766 {
2767 e_min_id = PT_L;
2768 i4_min_cost = i4_tot_cost;
2769 i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
2770 }
2771 }
2772 }
2773 if(i4_grid_mask & BIT_EN(PT_T))
2774 {
2775 CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
2776 ps_dedup_enabler,
2777 num_unique_nodes,
2778 mvx_qpel - 0,
2779 mvy_qpel - 1,
2780 check_for_duplicate);
2781
2782 if(!check_for_duplicate)
2783 {
2784 ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x;
2785 ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y - 1;
2786
2787 s_err_prms.pu1_ref = apu1_final[1];
2788 s_err_prms.i4_ref_stride = ai4_ref_stride[1];
2789
2790 /* Update the mv's with the current candt motion vectors */
2791 s_result_prms.i2_mv_x = mvx_qpel;
2792 s_result_prms.i2_mv_y = mvy_qpel - 1;
2793
2794 s_temp_search_node.s_mv.i2_mvx = mvx_qpel;
2795 s_temp_search_node.s_mv.i2_mvy = mvy_qpel - 1;
2796
2797 pf_err_compute(&s_err_prms, &s_result_prms);
2798
2799 //hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
2800 i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
2801 if(i4_tot_cost < i4_min_cost)
2802 {
2803 e_min_id = PT_T;
2804 i4_min_cost = i4_tot_cost;
2805 i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
2806 }
2807 }
2808 }
2809 if(i4_grid_mask & BIT_EN(PT_R))
2810 {
2811 CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
2812 ps_dedup_enabler, num_unique_nodes, mvx_qpel + 1, mvy_qpel, check_for_duplicate);
2813
2814 if(!check_for_duplicate)
2815 {
2816 ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x + 1;
2817 ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y;
2818
2819 s_err_prms.pu1_ref = apu1_final[2];
2820 s_err_prms.i4_ref_stride = ai4_ref_stride[2];
2821
2822 /* Update the mv's with the current candt motion vectors */
2823 s_result_prms.i2_mv_x = mvx_qpel + 1;
2824 s_result_prms.i2_mv_y = mvy_qpel;
2825
2826 s_temp_search_node.s_mv.i2_mvx = mvx_qpel + 1;
2827 s_temp_search_node.s_mv.i2_mvy = mvy_qpel;
2828
2829 pf_err_compute(&s_err_prms, &s_result_prms);
2830
2831 //hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
2832
2833 i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
2834 if(i4_tot_cost < i4_min_cost)
2835 {
2836 e_min_id = PT_R;
2837 i4_min_cost = i4_tot_cost;
2838 i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
2839 }
2840 }
2841 }
2842 /* i4_mv_x and i4_mv_y will always be the centre pt */
2843 /* for qpel we start with least hpel, and hence compute of center pt never reqd */
2844 if(i4_grid_mask & BIT_EN(PT_B))
2845 {
2846 CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
2847 ps_dedup_enabler, num_unique_nodes, mvx_qpel, mvy_qpel + 1, check_for_duplicate);
2848
2849 if(!check_for_duplicate)
2850 {
2851 ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x;
2852 ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y + 1;
2853
2854 s_err_prms.pu1_ref = apu1_final[3];
2855 s_err_prms.i4_ref_stride = ai4_ref_stride[3];
2856
2857 /* Update the mv's with the current candt motion vectors */
2858 s_result_prms.i2_mv_x = mvx_qpel;
2859 s_result_prms.i2_mv_y = mvy_qpel + 1;
2860
2861 s_temp_search_node.s_mv.i2_mvx = mvx_qpel;
2862 s_temp_search_node.s_mv.i2_mvy = mvy_qpel + 1;
2863
2864 pf_err_compute(&s_err_prms, &s_result_prms);
2865
2866 //hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
2867 i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
2868 if(i4_tot_cost < i4_min_cost)
2869 {
2870 e_min_id = PT_B;
2871 i4_min_cost = i4_tot_cost;
2872 i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
2873 }
2874 }
2875 }
2876
2877 /* New QPEL mv x and y */
2878 if(e_min_id == PT_C)
2879 break;
2880 i4_grid_mask = gai4_opt_grid_mask_diamond[e_min_id];
2881 i4_mv_x += gai1_grid_id_to_x[e_min_id];
2882 i4_mv_y += gai1_grid_id_to_y[e_min_id];
2883 ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x;
2884 ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y;
2885 i4_grid_mask &= hme_clamp_grid_by_mvrange(ps_search_node, 1, ps_range_prms);
2886 }
2887 }
2888
2889 /* update modified motion vectors and cost at end of subpel */
2890 ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x;
2891 ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y;
2892 ps_search_node->i4_tot_cost = i4_min_cost;
2893 ps_search_node->i4_sad = i4_min_sad;
2894
2895 /********************************************************************************/
2896 /* TODO: Restoring back Sad lambda from Hadamard lambda */
2897 /* Need to pass the had/satd lambda in more cleaner way for subpel cost compute */
2898 /********************************************************************************/
2899 //ps_pred_ctxt->lambda >>= 1;
2900
2901 return (i4_min_cost);
2902 }
2903 #elif DIAMOND_GRID == 0
hme_subpel_refine_search_node_high_speed(search_node_t * ps_search_node,hme_subpel_prms_t * ps_prms,layer_ctxt_t * ps_curr_layer,BLK_SIZE_T e_blk_size,S32 x_off,S32 y_off,search_results_t * ps_search_results,S32 pred_lx,S32 i4_part_mask,S32 * pi4_valid_part_ids,S32 search_idx,subpel_dedup_enabler_t * ps_dedup_enabler,me_func_selector_t * ps_func_selector)2904 S32 hme_subpel_refine_search_node_high_speed(
2905 search_node_t *ps_search_node,
2906 hme_subpel_prms_t *ps_prms,
2907 layer_ctxt_t *ps_curr_layer,
2908 BLK_SIZE_T e_blk_size,
2909 S32 x_off,
2910 S32 y_off,
2911 search_results_t *ps_search_results,
2912 S32 pred_lx,
2913 S32 i4_part_mask,
2914 S32 *pi4_valid_part_ids,
2915 S32 search_idx,
2916 subpel_dedup_enabler_t *ps_dedup_enabler,
2917 me_func_selector_t *ps_func_selector)
2918 {
2919 S32 i4_num_hpel_refine, i4_num_qpel_refine;
2920 S32 i4_offset, i4_grid_mask;
2921 S08 i1_ref_idx;
2922 S32 i4_blk_wd, i4_blk_ht;
2923 S32 i4_ref_stride, i4_i;
2924 pred_ctxt_t *ps_pred_ctxt = &ps_search_results->as_pred_ctxt[pred_lx];
2925 result_upd_prms_t s_result_prms;
2926
2927 /*************************************************************************/
2928 /* Tracks current MV with the fractional component. */
2929 /*************************************************************************/
2930 S32 i4_mv_x, i4_mv_y;
2931 S32 i4_frac_x, i4_frac_y;
2932
2933 /*************************************************************************/
2934 /* Function pointer for SAD/SATD, array and prms structure to pass to */
2935 /* This function */
2936 /*************************************************************************/
2937 PF_SAD_FXN_T pf_err_compute;
2938 S32 ai4_sad_grid[9][17], i4_tot_cost;
2939 err_prms_t s_err_prms;
2940
2941 /*************************************************************************/
2942 /* Allowed MV RANGE */
2943 /*************************************************************************/
2944 range_prms_t *ps_range_prms;
2945
2946 /*************************************************************************/
2947 /* stores min id in grid with associated min cost. */
2948 /*************************************************************************/
2949 S32 i4_min_cost, i4_min_sad;
2950 GRID_PT_T e_min_id;
2951
2952 PF_INTERP_FXN_T pf_qpel_interp;
2953 /*************************************************************************/
2954 /* For hpel and qpel we move in diamonds and hence each point in the */
2955 /* diamond will belong to a completely different plane. To simplify the */
2956 /* look up of the ref ptr, we declare a 2x2 array of ref ptrs for the */
2957 /* hpel planes which are interpolated during recon. */
2958 /*************************************************************************/
2959 U08 *apu1_hpel_ref[4], *pu1_ref;
2960
2961 interp_prms_t s_interp_prms;
2962
2963 /*************************************************************************/
2964 /* Maintains the minimum id of interpolated buffers, and the pointer that*/
2965 /* points to the corresponding predicted buf with its stride. */
2966 /* Note that the pointer cannot be derived just from the id, since the */
2967 /* pointer may also point to the hpel buffer (in case we request interp */
2968 /* of a hpel pt, which already exists in the recon hpel planes) */
2969 /*************************************************************************/
2970 U08 *pu1_final_out;
2971 S32 i4_final_out_stride;
2972 S32 part_id;
2973 S32 check_for_duplicate = 0;
2974
2975 S32 mvx_qpel;
2976 S32 mvy_qpel;
2977
2978 /*************************************************************************/
2979 /* Appropriate Err compute fxn, depends on SAD/SATD, blk size and remains*/
2980 /* fixed through this subpel refinement for this partition. */
2981 /* Note, we do not enable grid sads since each pt is different buffers. */
2982 /* Hence, part mask is also nearly dont care and we use 2Nx2N enabled. */
2983 /*************************************************************************/
2984 if(ps_prms->i4_use_satd)
2985 {
2986 pf_err_compute = hme_evalsatd_update_1_best_result_pt_pu_16x16;
2987 }
2988 else
2989 {
2990 pf_err_compute = hme_evalsad_grid_pu_16x16; /* hme_evalsad_pt_pu_16x16; */
2991 }
2992
2993 i4_num_hpel_refine = ps_prms->i4_num_steps_hpel_refine;
2994 i4_num_qpel_refine = ps_prms->i4_num_steps_qpel_refine;
2995
2996 /* Prediction contet should now deal with qpel units */
2997 HME_SET_MVPRED_RES(ps_pred_ctxt, MV_RES_QPEL);
2998
2999 /* Buffer allocation for subpel */
3000 /* Current design is that there may be many partitions and different mvs */
3001 /* that attempt subpel refinemnt. While there is possibility of overlap, the */
3002 /* hashing to detect and avoid overlap may be very complex. So, currently, */
3003 /* the only thing done is to store the eventual predicted buffer with every */
3004 /* ctb node that holds the result of hte best subpel search */
3005
3006 /* Compute the base pointer for input, interpolated buffers */
3007 /* The base pointers point as follows:
3008 /* fx fy : 0, 0 :: fx, hy : 0, 0.5, hx, fy: 0.5, 0, hx, fy: 0.5, 0.5 */
3009 /* To these, we need to add the offset of the current node */
3010 i4_ref_stride = ps_curr_layer->i4_rec_stride;
3011 i4_offset = x_off + (y_off * i4_ref_stride);
3012 i1_ref_idx = ps_search_node->i1_ref_idx;
3013
3014 apu1_hpel_ref[0] = ps_curr_layer->ppu1_list_rec_fxfy[i1_ref_idx] + i4_offset;
3015 apu1_hpel_ref[1] = ps_curr_layer->ppu1_list_rec_hxfy[i1_ref_idx] + i4_offset;
3016 apu1_hpel_ref[2] = ps_curr_layer->ppu1_list_rec_fxhy[i1_ref_idx] + i4_offset;
3017 apu1_hpel_ref[3] = ps_curr_layer->ppu1_list_rec_hxhy[i1_ref_idx] + i4_offset;
3018
3019 /* Initialize result params used for partition update */
3020 s_result_prms.pf_mv_cost_compute = NULL;
3021 s_result_prms.ps_search_results = ps_search_results;
3022 s_result_prms.pi4_valid_part_ids = pi4_valid_part_ids;
3023 s_result_prms.i1_ref_idx = search_idx;
3024 s_result_prms.i4_part_mask = i4_part_mask;
3025 s_result_prms.ps_search_node_base = ps_search_node;
3026 s_result_prms.pi4_sad_grid = &ai4_sad_grid[0][0];
3027 s_result_prms.i4_grid_mask = 1;
3028
3029 /* convert to hpel units */
3030 i4_mv_x = ps_search_node->s_mv.i2_mvx >> 1;
3031 i4_mv_y = ps_search_node->s_mv.i2_mvy >> 1;
3032
3033 /* for first pt, we compute at all locations in the grid, 4 + 1 centre */
3034 ps_range_prms = ps_prms->ps_mv_range_qpel;
3035 i4_grid_mask = (GRID_ALL_PTS_VALID);
3036 i4_grid_mask &= hme_clamp_grid_by_mvrange(ps_search_node, 2, ps_range_prms);
3037
3038 i4_min_cost = MAX_32BIT_VAL;
3039 i4_min_sad = MAX_32BIT_VAL;
3040
3041 /*************************************************************************/
3042 /* Prepare the input params to SAD/SATD function. Note that input is */
3043 /* passed from the calling funcion since it may be I (normal subpel */
3044 /* refinement) or 2I - P0 in case of bidirect subpel refinement. */
3045 /* Both cases are handled here. */
3046 /*************************************************************************/
3047 s_err_prms.pu1_inp = (U08 *)ps_prms->pv_inp;
3048 s_err_prms.i4_inp_stride = ps_prms->i4_inp_stride;
3049 s_err_prms.i4_ref_stride = i4_ref_stride;
3050 s_err_prms.i4_part_mask = (ENABLE_2Nx2N);
3051 s_err_prms.i4_grid_mask = 1;
3052 s_err_prms.pi4_sad_grid = &ai4_sad_grid[0][0];
3053 s_err_prms.i4_blk_wd = i4_blk_wd = gau1_blk_size_to_wd[e_blk_size];
3054 s_err_prms.i4_blk_ht = i4_blk_ht = gau1_blk_size_to_ht[e_blk_size];
3055
3056 /* TODO: Currently doubling lambda for Hadamard Sad instead of 1.9*sadlambda */
3057 //ps_pred_ctxt->lambda <<= 1;
3058 part_id = ps_search_node->u1_part_id;
3059 for(i4_i = 0; i4_i < i4_num_hpel_refine; i4_i++)
3060 {
3061 e_min_id = PT_C;
3062
3063 mvx_qpel = i4_mv_x << 1;
3064 mvy_qpel = i4_mv_y << 1;
3065
3066 /* Central pt */
3067 if(i4_grid_mask & BIT_EN(PT_C))
3068 {
3069 //ps_search_node->i2_mv_x = (S16)i4_mv_x;
3070 //ps_search_node->i2_mv_x = (S16)i4_mv_y;
3071 /* central pt is i4_mv_x, i4_mv_y */
3072 CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3073 ps_dedup_enabler, 1, mvx_qpel, mvy_qpel, check_for_duplicate);
3074
3075 i4_frac_x = i4_mv_x & 1;
3076 i4_frac_y = i4_mv_y & 1;
3077 pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
3078 s_err_prms.pu1_ref = pu1_ref + (i4_mv_x >> 1) + ((i4_mv_y >> 1) * i4_ref_stride);
3079 pf_err_compute(&s_err_prms);
3080 /* Update the mv's with the current candt motion vectors */
3081 s_result_prms.i2_mv_x = mvx_qpel;
3082 s_result_prms.i2_mv_y = mvy_qpel;
3083 hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3084 i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3085 if(i4_tot_cost < i4_min_cost)
3086 {
3087 i4_min_cost = i4_tot_cost;
3088 i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3089 e_min_id = PT_C;
3090 pu1_final_out = s_err_prms.pu1_ref;
3091 }
3092 }
3093
3094 /* left pt */
3095 if(i4_grid_mask & BIT_EN(PT_L))
3096 {
3097 CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3098 ps_dedup_enabler, 1, mvx_qpel - 2, mvy_qpel, check_for_duplicate);
3099
3100 if(!check_for_duplicate)
3101 {
3102 /* search node mv is stored in qpel units */
3103 ps_search_node->s_mv.i2_mvx = (S16)((i4_mv_x - 1) << 1);
3104 ps_search_node->s_mv.i2_mvy = (S16)(i4_mv_y << 1);
3105 /* central pt is i4_mv_x - 1, i4_mv_y */
3106 i4_frac_x = (i4_mv_x - 1) & 1; // same as (x-1)&1
3107 i4_frac_y = i4_mv_y & 1;
3108 pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
3109 s_err_prms.pu1_ref =
3110 pu1_ref + ((i4_mv_x - 1) >> 1) + ((i4_mv_y >> 1) * i4_ref_stride);
3111
3112 pf_err_compute(&s_err_prms);
3113 /* Update the mv's with the current candt motion vectors */
3114 s_result_prms.i2_mv_x = mvx_qpel;
3115 s_result_prms.i2_mv_y = mvy_qpel;
3116 hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3117
3118 i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3119
3120 if(i4_tot_cost < i4_min_cost)
3121 {
3122 i4_min_cost = i4_tot_cost;
3123 i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3124 e_min_id = PT_L;
3125 pu1_final_out = s_err_prms.pu1_ref;
3126 }
3127 }
3128 }
3129 /* top pt */
3130 if(i4_grid_mask & BIT_EN(PT_T))
3131 {
3132 CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3133 ps_dedup_enabler, 1, mvx_qpel, mvy_qpel - 2, check_for_duplicate);
3134
3135 if(!check_for_duplicate)
3136 {
3137 /* search node mv is stored in qpel units */
3138 ps_search_node->s_mv.i2_mvx = (S16)(i4_mv_x << 1);
3139 ps_search_node->s_mv.i2_mvy = (S16)((i4_mv_y - 1) << 1);
3140 /* top pt is i4_mv_x, i4_mv_y - 1 */
3141 i4_frac_x = i4_mv_x & 1;
3142 i4_frac_y = (i4_mv_y - 1) & 1;
3143 pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
3144 s_err_prms.pu1_ref =
3145 pu1_ref + (i4_mv_x >> 1) + (((i4_mv_y - 1) >> 1) * i4_ref_stride);
3146 pf_err_compute(&s_err_prms);
3147 /* Update the mv's with the current candt motion vectors */
3148 s_result_prms.i2_mv_x = mvx_qpel;
3149 s_result_prms.i2_mv_y = mvy_qpel - 2;
3150 hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3151
3152 i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3153
3154 if(i4_tot_cost < i4_min_cost)
3155 {
3156 i4_min_cost = i4_tot_cost;
3157 i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3158 e_min_id = PT_T;
3159 pu1_final_out = s_err_prms.pu1_ref;
3160 }
3161 }
3162 }
3163 /* right pt */
3164 if(i4_grid_mask & BIT_EN(PT_R))
3165 {
3166 CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3167 ps_dedup_enabler, 1, mvx_qpel + 2, mvy_qpel, check_for_duplicate);
3168
3169 if(!check_for_duplicate)
3170 {
3171 /* search node mv is stored in qpel units */
3172 ps_search_node->s_mv.i2_mvx = (S16)((i4_mv_x + 1) << 1);
3173 ps_search_node->s_mv.i2_mvy = (S16)(i4_mv_y << 1);
3174 /* right pt is i4_mv_x + 1, i4_mv_y */
3175 i4_frac_x = (i4_mv_x + 1) & 1;
3176 i4_frac_y = i4_mv_y & 1;
3177
3178 pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
3179 s_err_prms.pu1_ref =
3180 pu1_ref + ((i4_mv_x + 1) >> 1) + ((i4_mv_y >> 1) * i4_ref_stride);
3181 pf_err_compute(&s_err_prms);
3182 /* Update the mv's with the current candt motion vectors */
3183 s_result_prms.i2_mv_x = mvx_qpel + 2;
3184 s_result_prms.i2_mv_y = mvy_qpel;
3185 hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3186
3187 i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3188
3189 if(i4_tot_cost < i4_min_cost)
3190 {
3191 i4_min_cost = i4_tot_cost;
3192 i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3193 e_min_id = PT_R;
3194 pu1_final_out = s_err_prms.pu1_ref;
3195 }
3196 }
3197 }
3198 /* bottom pt */
3199 if(i4_grid_mask & BIT_EN(PT_B))
3200 {
3201 CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3202 ps_dedup_enabler, 1, mvx_qpel, mvy_qpel + 2, check_for_duplicate);
3203
3204 if(!check_for_duplicate)
3205 {
3206 /* search node mv is stored in qpel units */
3207 ps_search_node->s_mv.i2_mvx = ((S16)i4_mv_x << 1);
3208 ps_search_node->s_mv.i2_mvy = ((S16)(i4_mv_y + 1) << 1);
3209 i4_frac_x = i4_mv_x & 1;
3210 i4_frac_y = (i4_mv_y + 1) & 1;
3211 pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
3212 s_err_prms.pu1_ref =
3213 pu1_ref + (i4_mv_x >> 1) + (((i4_mv_y + 1) >> 1) * i4_ref_stride);
3214
3215 pf_err_compute(&s_err_prms);
3216 /* Update the mv's with the current candt motion vectors */
3217 s_result_prms.i2_mv_x = mvx_qpel;
3218 s_result_prms.i2_mv_y = mvy_qpel + 2;
3219 hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3220
3221 i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3222
3223 if(i4_tot_cost < i4_min_cost)
3224 {
3225 i4_min_cost = i4_tot_cost;
3226 i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3227 e_min_id = PT_B;
3228 pu1_final_out = s_err_prms.pu1_ref;
3229 }
3230 }
3231 }
3232 if(e_min_id == PT_C)
3233 {
3234 if(!i4_i)
3235 {
3236 /* TL pt */
3237 if(i4_grid_mask & BIT_EN(PT_TL))
3238 {
3239 S32 mvx_minus_1 = (i4_mv_x - 1);
3240 S32 mvy_minus_1 = (i4_mv_y - 1);
3241
3242 CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3243 ps_dedup_enabler, 1, mvx_qpel - 2, mvy_qpel - 2, check_for_duplicate);
3244
3245 if(!check_for_duplicate)
3246 {
3247 /* search node mv is stored in qpel units */
3248 ps_search_node->s_mv.i2_mvx = ((S16)mvx_minus_1 << 1);
3249 ps_search_node->s_mv.i2_mvy = ((S16)mvy_minus_1 << 1);
3250 i4_frac_x = mvx_minus_1 & 1;
3251 i4_frac_y = mvy_minus_1 & 1;
3252 pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
3253 s_err_prms.pu1_ref =
3254 pu1_ref + (mvx_minus_1 >> 1) + ((mvy_minus_1 >> 1) * i4_ref_stride);
3255
3256 pf_err_compute(&s_err_prms);
3257 /* Update the mv's with the current candt motion vectors */
3258 s_result_prms.i2_mv_x = mvx_qpel - 2;
3259 s_result_prms.i2_mv_y = mvy_qpel - 2;
3260 hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3261
3262 i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3263
3264 if(i4_tot_cost < i4_min_cost)
3265 {
3266 i4_min_cost = i4_tot_cost;
3267 i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3268 e_min_id = PT_TL;
3269 pu1_final_out = s_err_prms.pu1_ref;
3270 }
3271 }
3272 }
3273 /* TR pt */
3274 if(i4_grid_mask & BIT_EN(PT_TR))
3275 {
3276 S32 mvx_plus_1 = (i4_mv_x + 1);
3277 S32 mvy_minus_1 = (i4_mv_y - 1);
3278
3279 CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3280 ps_dedup_enabler, 1, mvx_qpel + 2, mvy_qpel - 2, check_for_duplicate);
3281
3282 if(!check_for_duplicate)
3283 {
3284 /* search node mv is stored in qpel units */
3285 ps_search_node->s_mv.i2_mvx = ((S16)mvx_plus_1 << 1);
3286 ps_search_node->s_mv.i2_mvy = ((S16)mvy_minus_1 << 1);
3287 i4_frac_x = mvx_plus_1 & 1;
3288 i4_frac_y = mvy_minus_1 & 1;
3289 pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
3290 s_err_prms.pu1_ref =
3291 pu1_ref + (mvx_plus_1 >> 1) + ((mvy_minus_1 >> 1) * i4_ref_stride);
3292
3293 pf_err_compute(&s_err_prms);
3294 /* Update the mv's with the current candt motion vectors */
3295 s_result_prms.i2_mv_x = mvx_qpel + 2;
3296 s_result_prms.i2_mv_y = mvy_qpel - 2;
3297 hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3298
3299 i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3300
3301 if(i4_tot_cost < i4_min_cost)
3302 {
3303 i4_min_cost = i4_tot_cost;
3304 i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3305 e_min_id = PT_TR;
3306 pu1_final_out = s_err_prms.pu1_ref;
3307 }
3308 }
3309 }
3310 /* BL pt */
3311 if(i4_grid_mask & BIT_EN(PT_BL))
3312 {
3313 S32 mvx_minus_1 = (i4_mv_x - 1);
3314 S32 mvy_plus_1 = (i4_mv_y + 1);
3315
3316 CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3317 ps_dedup_enabler, 1, mvx_qpel - 2, mvy_qpel + 2, check_for_duplicate);
3318
3319 if(!check_for_duplicate)
3320 {
3321 /* search node mv is stored in qpel units */
3322 ps_search_node->s_mv.i2_mvx = ((S16)mvx_minus_1 << 1);
3323 ps_search_node->s_mv.i2_mvy = ((S16)mvy_plus_1 << 1);
3324 i4_frac_x = mvx_minus_1 & 1;
3325 i4_frac_y = mvy_plus_1 & 1;
3326 pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
3327 s_err_prms.pu1_ref =
3328 pu1_ref + (mvx_minus_1 >> 1) + ((mvy_plus_1 >> 1) * i4_ref_stride);
3329
3330 pf_err_compute(&s_err_prms);
3331 /* Update the mv's with the current candt motion vectors */
3332 s_result_prms.i2_mv_x = mvx_qpel - 2;
3333 s_result_prms.i2_mv_y = mvy_qpel + 2;
3334 hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3335
3336 i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3337
3338 if(i4_tot_cost < i4_min_cost)
3339 {
3340 i4_min_cost = i4_tot_cost;
3341 i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3342 e_min_id = PT_BL;
3343 pu1_final_out = s_err_prms.pu1_ref;
3344 }
3345 }
3346 }
3347 /* BR pt */
3348 if(i4_grid_mask & BIT_EN(PT_BR))
3349 {
3350 S32 mvx_plus_1 = (i4_mv_x + 1);
3351 S32 mvy_plus_1 = (i4_mv_y + 1);
3352 CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3353 ps_dedup_enabler, 1, mvx_qpel + 2, mvy_qpel + 2, check_for_duplicate);
3354
3355 if(!check_for_duplicate)
3356 {
3357 /* search node mv is stored in qpel units */
3358 ps_search_node->s_mv.i2_mvx = ((S16)mvx_plus_1 << 1);
3359 ps_search_node->s_mv.i2_mvy = ((S16)mvy_plus_1 << 1);
3360 i4_frac_x = mvx_plus_1 & 1;
3361 i4_frac_y = mvy_plus_1 & 1;
3362 pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
3363 s_err_prms.pu1_ref =
3364 pu1_ref + (mvx_plus_1 >> 1) + ((mvy_plus_1 >> 1) * i4_ref_stride);
3365
3366 pf_err_compute(&s_err_prms);
3367 /* Update the mv's with the current candt motion vectors */
3368 s_result_prms.i2_mv_x = mvx_qpel + 2;
3369 s_result_prms.i2_mv_y = mvy_qpel + 2;
3370 hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3371
3372 i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3373
3374 if(i4_tot_cost < i4_min_cost)
3375 {
3376 i4_min_cost = i4_tot_cost;
3377 i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3378 e_min_id = PT_BR;
3379 pu1_final_out = s_err_prms.pu1_ref;
3380 }
3381 }
3382 }
3383 if(e_min_id == PT_C)
3384 {
3385 break;
3386 }
3387 }
3388 else
3389 {
3390 break;
3391 }
3392 }
3393
3394 /*********************************************************************/
3395 /* Depending on the best result location, we may be able to skip */
3396 /* atleast two pts, centre pt and one more pt. E.g. if right pt is */
3397 /* the best result, the next iteration need not do centre, left pts */
3398 /*********************************************************************/
3399 if(i4_i)
3400 {
3401 i4_grid_mask = gai4_opt_grid_mask_diamond[e_min_id];
3402 }
3403 else
3404 {
3405 i4_grid_mask = gai4_opt_grid_mask_conventional[e_min_id];
3406 }
3407 i4_mv_x += gai1_grid_id_to_x[e_min_id];
3408 i4_mv_y += gai1_grid_id_to_y[e_min_id];
3409 ps_search_node->s_mv.i2_mvx = (S16)(i4_mv_x << 1);
3410 ps_search_node->s_mv.i2_mvy = (S16)(i4_mv_y << 1);
3411 i4_grid_mask &= hme_clamp_grid_by_mvrange(ps_search_node, 2, ps_range_prms);
3412 }
3413
3414 /* Convert to QPEL units */
3415 i4_mv_x <<= 1;
3416 i4_mv_y <<= 1;
3417
3418 ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x;
3419 ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y;
3420
3421 /* Early exit if this partition is visiting same hpel mv again */
3422 /* Assumption : Checkin for early exit in best result of partition */
3423 if((ps_search_results->aps_part_results[search_idx][part_id][0].i2_best_hpel_mv_x ==
3424 ps_search_node->s_mv.i2_mvx) &&
3425 (ps_search_results->aps_part_results[search_idx][part_id][0].i2_best_hpel_mv_y ==
3426 ps_search_node->s_mv.i2_mvy))
3427 {
3428 return (ps_search_results->aps_part_results[search_idx][part_id][0].i4_tot_cost);
3429 }
3430 else
3431 {
3432 /* Store the best hpel mv for future early exit checks */
3433 ps_search_results->aps_part_results[search_idx][part_id][0].i2_best_hpel_mv_x =
3434 (S16)i4_mv_x;
3435 ps_search_results->aps_part_results[search_idx][part_id][0].i2_best_hpel_mv_y =
3436 (S16)i4_mv_y;
3437 }
3438
3439 /* Early exit if this partition is visiting same hpel mv again */
3440 /* Assumption : Checkin for early exit in second best result of partition */
3441 if((ps_search_results->aps_part_results[search_idx][part_id][1].i2_best_hpel_mv_x ==
3442 ps_search_node->s_mv.i2_mvx) &&
3443 (ps_search_results->aps_part_results[search_idx][part_id][1].i2_best_hpel_mv_y ==
3444 ps_search_node->s_mv.i2_mvy))
3445 {
3446 return (ps_search_results->aps_part_results[search_idx][part_id][1].i4_tot_cost);
3447 }
3448 else
3449 {
3450 /* Store the best hpel mv for future early exit checks */
3451 ps_search_results->aps_part_results[search_idx][part_id][1].i2_best_hpel_mv_x =
3452 (S16)i4_mv_x;
3453 ps_search_results->aps_part_results[search_idx][part_id][1].i2_best_hpel_mv_y =
3454 (S16)i4_mv_y;
3455 }
3456
3457 /* Exact interpolation or averaging chosen here */
3458 pf_qpel_interp = ps_prms->pf_qpel_interp;
3459
3460 /* Next QPEL ME */
3461 /* In this case, we have option of doing exact QPEL interpolation or avg */
3462 /*************************************************************************/
3463 /* x */
3464 /* A b C d */
3465 /* e f g h */
3466 /* I j K l */
3467 /* m n o p */
3468 /* Q r S t */
3469 /* */
3470 /* Approximate QPEL logic */
3471 /* b = avg(A,C) f = avg(I,C), g= avg(C,K) j=avg(I,K) */
3472 /* for any given pt, we can get all the information required about */
3473 /* the surrounding 4 pts. For example, given point C (0.5, 0) */
3474 /* surrounding pts info: */
3475 /* b : qpel offset: 1, 0, generated by averaging. buffer1: fpel buf */
3476 /* buffer 2: hxfy, offsets for both are 0, 0 */
3477 /* similarly for other pts the info can be gotten */
3478 /*************************************************************************/
3479 i4_grid_mask = GRID_ALL_PTS_VALID ^ (BIT_EN(PT_C));
3480 i4_grid_mask &= hme_clamp_grid_by_mvrange(ps_search_node, 1, ps_range_prms);
3481
3482 /*************************************************************************/
3483 /* One time preparation of non changing interpolation params. These */
3484 /* include a set of ping pong result buf ptrs, input buf ptrs and some */
3485 /* working memory (not used though in case of averaging). */
3486 /*************************************************************************/
3487 s_interp_prms.ppu1_ref = &apu1_hpel_ref[0];
3488 s_interp_prms.i4_ref_stride = i4_ref_stride;
3489 s_interp_prms.i4_blk_wd = i4_blk_wd;
3490 s_interp_prms.i4_blk_ht = i4_blk_ht;
3491
3492 i4_final_out_stride = i4_ref_stride;
3493
3494 {
3495 U08 *pu1_mem;
3496 /*********************************************************************/
3497 /* Allocation of working memory for interpolated buffers. We maintain*/
3498 /* an intermediate working buffer, and 2 ping pong interpolated out */
3499 /* buffers, purpose of ping pong explained later below */
3500 /*********************************************************************/
3501 pu1_mem = ps_prms->pu1_wkg_mem;
3502 s_interp_prms.pu1_wkg_mem = pu1_mem;
3503
3504 //pu1_mem += (INTERP_INTERMED_BUF_SIZE);
3505 s_interp_prms.apu1_interp_out[0] = pu1_mem;
3506
3507 pu1_mem += (INTERP_OUT_BUF_SIZE);
3508 s_interp_prms.apu1_interp_out[1] = pu1_mem;
3509
3510 pu1_mem += (INTERP_OUT_BUF_SIZE);
3511 s_interp_prms.apu1_interp_out[2] = pu1_mem;
3512
3513 pu1_mem += (INTERP_OUT_BUF_SIZE);
3514 s_interp_prms.apu1_interp_out[3] = pu1_mem;
3515
3516 pu1_mem += (INTERP_OUT_BUF_SIZE);
3517 s_interp_prms.apu1_interp_out[4] = pu1_mem;
3518
3519 /*********************************************************************/
3520 /* Stride of interpolated output is just a function of blk width of */
3521 /* this partition and hence remains constant for this partition */
3522 /*********************************************************************/
3523 s_interp_prms.i4_out_stride = (i4_blk_wd);
3524 }
3525
3526 {
3527 UWORD8 *apu1_final[4];
3528 WORD32 ai4_ref_stride[4];
3529 /*************************************************************************/
3530 /* Ping pong design for interpolated buffers. We use a min id, which */
3531 /* tracks the id of the ppu1_interp_out that stores the best result. */
3532 /* When new interp to be done, it uses 1 - bes result id to do the interp*/
3533 /* min id is toggled when any new result becomes the best result. */
3534 /*************************************************************************/
3535
3536 for(i4_i = 0; i4_i < i4_num_qpel_refine; i4_i++)
3537 {
3538 e_min_id = PT_C;
3539
3540 hme_qpel_interp_comprehensive(
3541 &s_interp_prms, apu1_final, ai4_ref_stride, i4_mv_x, i4_mv_y, i4_grid_mask);
3542
3543 mvx_qpel = i4_mv_x;
3544 mvy_qpel = i4_mv_y;
3545
3546 if(i4_grid_mask & BIT_EN(PT_L))
3547 {
3548 CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3549 ps_dedup_enabler, 1, mvx_qpel - 1, mvy_qpel - 0, check_for_duplicate);
3550
3551 if(!check_for_duplicate)
3552 {
3553 ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x - 1;
3554 ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y;
3555
3556 s_err_prms.pu1_ref = apu1_final[0];
3557 s_err_prms.i4_ref_stride = ai4_ref_stride[0];
3558
3559 pf_err_compute(&s_err_prms);
3560 /* Update the mv's with the current candt motion vectors */
3561 s_result_prms.i2_mv_x = mvx_qpel - 1;
3562 s_result_prms.i2_mv_y = mvy_qpel;
3563 hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3564
3565 i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3566 if(i4_tot_cost < i4_min_cost)
3567 {
3568 e_min_id = PT_L;
3569 i4_min_cost = i4_tot_cost;
3570 i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3571 }
3572 }
3573 }
3574 if(i4_grid_mask & BIT_EN(PT_T))
3575 {
3576 CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3577 ps_dedup_enabler, 1, mvx_qpel - 0, mvy_qpel - 1, check_for_duplicate);
3578
3579 if(!check_for_duplicate)
3580 {
3581 ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x;
3582 ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y - 1;
3583
3584 s_err_prms.pu1_ref = apu1_final[1];
3585 s_err_prms.i4_ref_stride = ai4_ref_stride[1];
3586
3587 pf_err_compute(&s_err_prms);
3588 /* Update the mv's with the current candt motion vectors */
3589 s_result_prms.i2_mv_x = mvx_qpel;
3590 s_result_prms.i2_mv_y = mvy_qpel - 1;
3591 hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3592 i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3593 if(i4_tot_cost < i4_min_cost)
3594 {
3595 e_min_id = PT_T;
3596 i4_min_cost = i4_tot_cost;
3597 i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3598 }
3599 }
3600 }
3601 if(i4_grid_mask & BIT_EN(PT_R))
3602 {
3603 CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3604 ps_dedup_enabler, 1, mvx_qpel + 1, mvy_qpel, check_for_duplicate);
3605
3606 if(!check_for_duplicate)
3607 {
3608 ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x + 1;
3609 ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y;
3610
3611 s_err_prms.pu1_ref = apu1_final[2];
3612 s_err_prms.i4_ref_stride = ai4_ref_stride[2];
3613
3614 pf_err_compute(&s_err_prms);
3615 /* Update the mv's with the current candt motion vectors */
3616 s_result_prms.i2_mv_x = mvx_qpel + 1;
3617 s_result_prms.i2_mv_y = mvy_qpel;
3618 hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3619
3620 i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3621 if(i4_tot_cost < i4_min_cost)
3622 {
3623 e_min_id = PT_R;
3624 i4_min_cost = i4_tot_cost;
3625 i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3626 }
3627 }
3628 }
3629 /* i4_mv_x and i4_mv_y will always be the centre pt */
3630 /* for qpel we start with least hpel, and hence compute of center pt never reqd */
3631 if(i4_grid_mask & BIT_EN(PT_B))
3632 {
3633 CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3634 ps_dedup_enabler, 1, mvx_qpel, mvy_qpel + 1, check_for_duplicate);
3635
3636 if(!check_for_duplicate)
3637 {
3638 ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x;
3639 ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y + 1;
3640
3641 s_err_prms.pu1_ref = apu1_final[3];
3642 s_err_prms.i4_ref_stride = ai4_ref_stride[3];
3643
3644 pf_err_compute(&s_err_prms);
3645 /* Update the mv's with the current candt motion vectors */
3646 s_result_prms.i2_mv_x = mvx_qpel;
3647 s_result_prms.i2_mv_y = mvy_qpel + 1;
3648 hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3649 i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3650 if(i4_tot_cost < i4_min_cost)
3651 {
3652 e_min_id = PT_B;
3653 i4_min_cost = i4_tot_cost;
3654 i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3655 }
3656 }
3657 }
3658
3659 if(e_min_id == PT_C)
3660 {
3661 if(!i4_i)
3662 {
3663 S32 i4_interp_buf_id = 0;
3664
3665 if(i4_grid_mask & BIT_EN(PT_TL))
3666 {
3667 CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3668 ps_dedup_enabler, 1, mvx_qpel - 1, mvy_qpel - 1, check_for_duplicate);
3669
3670 if(!check_for_duplicate)
3671 {
3672 ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x - 1;
3673 ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y - 1;
3674
3675 /* Carry out the interpolation */
3676 pf_qpel_interp(
3677 &s_interp_prms, i4_mv_x - 1, i4_mv_y - 1, i4_interp_buf_id);
3678
3679 s_err_prms.pu1_ref = s_interp_prms.pu1_final_out;
3680 s_err_prms.i4_ref_stride = s_interp_prms.i4_final_out_stride;
3681
3682 pf_err_compute(&s_err_prms);
3683 /* Update the mv's with the current candt motion vectors */
3684 s_result_prms.i2_mv_x = mvx_qpel - 1;
3685 s_result_prms.i2_mv_y = mvy_qpel - 1;
3686 hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3687
3688 i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3689
3690 if(i4_tot_cost < i4_min_cost)
3691 {
3692 e_min_id = PT_TL;
3693 i4_min_cost = i4_tot_cost;
3694 i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3695 }
3696 }
3697 }
3698 if(i4_grid_mask & BIT_EN(PT_TR))
3699 {
3700 CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3701 ps_dedup_enabler, 1, mvx_qpel + 1, mvy_qpel - 1, check_for_duplicate);
3702
3703 if(!check_for_duplicate)
3704 {
3705 ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x + 1;
3706 ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y - 1;
3707
3708 /* Carry out the interpolation */
3709 pf_qpel_interp(
3710 &s_interp_prms, i4_mv_x + 1, i4_mv_y - 1, i4_interp_buf_id);
3711
3712 s_err_prms.pu1_ref = s_interp_prms.pu1_final_out;
3713 s_err_prms.i4_ref_stride = s_interp_prms.i4_final_out_stride;
3714
3715 pf_err_compute(&s_err_prms);
3716 /* Update the mv's with the current candt motion vectors */
3717 s_result_prms.i2_mv_x = mvx_qpel + 1;
3718 s_result_prms.i2_mv_y = mvy_qpel - 1;
3719 hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3720
3721 i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3722
3723 if(i4_tot_cost < i4_min_cost)
3724 {
3725 e_min_id = PT_TR;
3726 i4_min_cost = i4_tot_cost;
3727 i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3728 }
3729 }
3730 }
3731 if(i4_grid_mask & BIT_EN(PT_BL))
3732 {
3733 CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3734 ps_dedup_enabler, 1, mvx_qpel - 1, mvy_qpel + 1, check_for_duplicate);
3735
3736 if(!check_for_duplicate)
3737 {
3738 ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x - 1;
3739 ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y + 1;
3740
3741 /* Carry out the interpolation */
3742 pf_qpel_interp(
3743 &s_interp_prms, i4_mv_x - 1, i4_mv_y + 1, i4_interp_buf_id);
3744
3745 s_err_prms.pu1_ref = s_interp_prms.pu1_final_out;
3746 s_err_prms.i4_ref_stride = s_interp_prms.i4_final_out_stride;
3747
3748 pf_err_compute(&s_err_prms);
3749 /* Update the mv's with the current candt motion vectors */
3750 s_result_prms.i2_mv_x = mvx_qpel - 1;
3751 s_result_prms.i2_mv_y = mvy_qpel + 1;
3752 hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3753
3754 i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3755
3756 if(i4_tot_cost < i4_min_cost)
3757 {
3758 e_min_id = PT_BL;
3759 i4_min_cost = i4_tot_cost;
3760 i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3761 }
3762 }
3763 }
3764 /* i4_mv_x and i4_mv_y will always be the centre pt */
3765 /* for qpel we start with least hpel, and hence compute of center pt never reqd */
3766 if(i4_grid_mask & BIT_EN(PT_BR))
3767 {
3768 CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3769 ps_dedup_enabler, 1, mvx_qpel + 1, mvy_qpel + 1, check_for_duplicate);
3770
3771 if(!check_for_duplicate)
3772 {
3773 ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x + 1;
3774 ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y + 1;
3775
3776 /* Carry out the interpolation */
3777 pf_qpel_interp(
3778 &s_interp_prms, i4_mv_x + 1, i4_mv_y + 1, i4_interp_buf_id);
3779
3780 s_err_prms.pu1_ref = s_interp_prms.pu1_final_out;
3781 s_err_prms.i4_ref_stride = s_interp_prms.i4_final_out_stride;
3782
3783 pf_err_compute(&s_err_prms);
3784 /* Update the mv's with the current candt motion vectors */
3785 s_result_prms.i2_mv_x = mvx_qpel + 1;
3786 s_result_prms.i2_mv_y = mvy_qpel + 1;
3787 hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3788
3789 i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3790
3791 if(i4_tot_cost < i4_min_cost)
3792 {
3793 e_min_id = PT_BR;
3794 i4_min_cost = i4_tot_cost;
3795 i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3796 }
3797 }
3798 }
3799 if(e_min_id == PT_C)
3800 {
3801 break;
3802 }
3803 }
3804 else
3805 {
3806 break;
3807 }
3808 }
3809
3810 if(i4_i)
3811 {
3812 i4_grid_mask = gai4_opt_grid_mask_diamond[e_min_id];
3813 }
3814 else
3815 {
3816 i4_grid_mask = gai4_opt_grid_mask_conventional[e_min_id];
3817 }
3818 i4_mv_x += gai1_grid_id_to_x[e_min_id];
3819 i4_mv_y += gai1_grid_id_to_y[e_min_id];
3820 ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x;
3821 ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y;
3822 i4_grid_mask &= hme_clamp_grid_by_mvrange(ps_search_node, 1, ps_range_prms);
3823 }
3824 }
3825
3826 /* update modified motion vectors and cost at end of subpel */
3827 ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x;
3828 ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y;
3829 ps_search_node->i4_tot_cost = i4_min_cost;
3830 ps_search_node->i4_sad = i4_min_sad;
3831
3832 /********************************************************************************/
3833 /* TODO: Restoring back Sad lambda from Hadamard lambda */
3834 /* Need to pass the had/satd lambda in more cleaner way for subpel cost compute */
3835 /********************************************************************************/
3836 //ps_pred_ctxt->lambda >>= 1;
3837
3838 return (i4_min_cost);
3839 }
3840 #endif
3841
hme_subpel_refine_struct_to_search_results_struct_converter(subpel_refine_ctxt_t * ps_subpel_refine_ctxt,search_results_t * ps_search_results,U08 u1_pred_dir,ME_QUALITY_PRESETS_T e_quality_preset)3842 static void hme_subpel_refine_struct_to_search_results_struct_converter(
3843 subpel_refine_ctxt_t *ps_subpel_refine_ctxt,
3844 search_results_t *ps_search_results,
3845 U08 u1_pred_dir,
3846 ME_QUALITY_PRESETS_T e_quality_preset)
3847 {
3848 U08 i;
3849
3850 U08 u1_num_results_per_part = ps_search_results->u1_num_results_per_part;
3851
3852 for(i = 0; i < ps_subpel_refine_ctxt->i4_num_valid_parts; i++)
3853 {
3854 S32 index;
3855 S32 i4_sad;
3856
3857 S32 part_id = ps_subpel_refine_ctxt->ai4_part_id[i];
3858
3859 search_node_t *ps_best_node = ps_search_results->aps_part_results[u1_pred_dir][part_id];
3860
3861 if(ps_subpel_refine_ctxt->i4_num_valid_parts > 8)
3862 {
3863 index = part_id;
3864 }
3865 else
3866 {
3867 index = i;
3868 }
3869
3870 if(!ps_best_node->u1_subpel_done)
3871 {
3872 i4_sad = ps_subpel_refine_ctxt->i2_tot_cost[0][index] -
3873 ps_subpel_refine_ctxt->i2_mv_cost[0][index];
3874 ps_best_node[0].i4_sdi = 0;
3875 ASSERT((e_quality_preset == ME_PRISTINE_QUALITY) ? (ps_best_node[0].i4_sdi >= 0) : 1);
3876 ps_best_node[0].i4_tot_cost = ps_subpel_refine_ctxt->i2_tot_cost[0][index];
3877
3878 if(ps_subpel_refine_ctxt->i2_tot_cost[0][index] == MAX_SIGNED_16BIT_VAL)
3879 {
3880 i4_sad = MAX_SIGNED_16BIT_VAL;
3881 }
3882
3883 ps_best_node[0].i4_sad = i4_sad;
3884 ps_best_node[0].i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[0][index];
3885 ps_best_node[0].s_mv.i2_mvx = ps_subpel_refine_ctxt->i2_mv_x[0][index];
3886 ps_best_node[0].s_mv.i2_mvy = ps_subpel_refine_ctxt->i2_mv_y[0][index];
3887 ps_best_node[0].i1_ref_idx = (WORD8)ps_subpel_refine_ctxt->i2_ref_idx[0][index];
3888 ps_best_node->u1_subpel_done = 1;
3889
3890 if(2 == u1_num_results_per_part)
3891 {
3892 i4_sad = ps_subpel_refine_ctxt->i2_tot_cost[1][index] -
3893 ps_subpel_refine_ctxt->i2_mv_cost[1][index];
3894 ps_best_node[1].i4_sdi = 0;
3895 ps_best_node[1].i4_tot_cost = ps_subpel_refine_ctxt->i2_tot_cost[1][index];
3896
3897 if(ps_subpel_refine_ctxt->i2_tot_cost[1][index] == MAX_SIGNED_16BIT_VAL)
3898 {
3899 i4_sad = MAX_SIGNED_16BIT_VAL;
3900 }
3901
3902 ps_best_node[1].i4_sad = i4_sad;
3903 ps_best_node[1].i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[1][index];
3904 ps_best_node[1].s_mv.i2_mvx = ps_subpel_refine_ctxt->i2_mv_x[1][index];
3905 ps_best_node[1].s_mv.i2_mvy = ps_subpel_refine_ctxt->i2_mv_y[1][index];
3906 ps_best_node[1].i1_ref_idx = (WORD8)ps_subpel_refine_ctxt->i2_ref_idx[1][index];
3907 ps_best_node[1].u1_subpel_done = 1;
3908 }
3909 }
3910 else if(
3911 (2 == u1_num_results_per_part) &&
3912 (ps_subpel_refine_ctxt->i2_tot_cost[0][index] < ps_best_node[1].i4_tot_cost))
3913 {
3914 if(ps_subpel_refine_ctxt->i2_tot_cost[1][index] < ps_best_node[0].i4_tot_cost)
3915 {
3916 i4_sad = ps_subpel_refine_ctxt->i2_tot_cost[0][index] -
3917 ps_subpel_refine_ctxt->i2_mv_cost[0][index];
3918 ps_best_node[0].i4_sdi = 0;
3919 ps_best_node[0].i4_tot_cost = ps_subpel_refine_ctxt->i2_tot_cost[0][index];
3920
3921 if(ps_subpel_refine_ctxt->i2_tot_cost[0][index] == MAX_SIGNED_16BIT_VAL)
3922 {
3923 i4_sad = MAX_SIGNED_16BIT_VAL;
3924 }
3925
3926 ps_best_node[0].i4_sad = i4_sad;
3927 ps_best_node[0].i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[0][index];
3928 ps_best_node[0].s_mv.i2_mvx = ps_subpel_refine_ctxt->i2_mv_x[0][index];
3929 ps_best_node[0].s_mv.i2_mvy = ps_subpel_refine_ctxt->i2_mv_y[0][index];
3930 ps_best_node[0].i1_ref_idx = (S08)ps_subpel_refine_ctxt->i2_ref_idx[0][index];
3931
3932 i4_sad = ps_subpel_refine_ctxt->i2_tot_cost[1][index] -
3933 ps_subpel_refine_ctxt->i2_mv_cost[1][index];
3934 ps_best_node[1].i4_sdi = 0;
3935 ps_best_node[1].i4_tot_cost = ps_subpel_refine_ctxt->i2_tot_cost[1][index];
3936
3937 if(ps_subpel_refine_ctxt->i2_tot_cost[1][index] == MAX_SIGNED_16BIT_VAL)
3938 {
3939 i4_sad = MAX_SIGNED_16BIT_VAL;
3940 }
3941
3942 ps_best_node[1].i4_sad = i4_sad;
3943 ps_best_node[1].i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[1][index];
3944 ps_best_node[1].s_mv.i2_mvx = ps_subpel_refine_ctxt->i2_mv_x[1][index];
3945 ps_best_node[1].s_mv.i2_mvy = ps_subpel_refine_ctxt->i2_mv_y[1][index];
3946 ps_best_node[1].i1_ref_idx = (S08)ps_subpel_refine_ctxt->i2_ref_idx[1][index];
3947 }
3948 else if(ps_subpel_refine_ctxt->i2_tot_cost[1][index] > ps_best_node[0].i4_tot_cost)
3949 {
3950 if(ps_subpel_refine_ctxt->i2_tot_cost[0][index] >= ps_best_node[0].i4_tot_cost)
3951 {
3952 i4_sad = ps_subpel_refine_ctxt->i2_tot_cost[0][index] -
3953 ps_subpel_refine_ctxt->i2_mv_cost[0][index];
3954 ps_best_node[1].i4_sdi = 0;
3955 ps_best_node[1].i4_tot_cost = ps_subpel_refine_ctxt->i2_tot_cost[0][index];
3956
3957 if(ps_subpel_refine_ctxt->i2_tot_cost[0][index] == MAX_SIGNED_16BIT_VAL)
3958 {
3959 i4_sad = MAX_SIGNED_16BIT_VAL;
3960 }
3961
3962 ps_best_node[1].i4_sad = i4_sad;
3963 ps_best_node[1].i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[0][index];
3964 ps_best_node[1].s_mv.i2_mvx = ps_subpel_refine_ctxt->i2_mv_x[0][index];
3965 ps_best_node[1].s_mv.i2_mvy = ps_subpel_refine_ctxt->i2_mv_y[0][index];
3966 ps_best_node[1].i1_ref_idx = (S08)ps_subpel_refine_ctxt->i2_ref_idx[0][index];
3967 }
3968 else if(ps_subpel_refine_ctxt->i2_tot_cost[0][index] < ps_best_node[0].i4_tot_cost)
3969 {
3970 memmove(&ps_best_node[1], &ps_best_node[0], sizeof(search_node_t));
3971
3972 i4_sad = ps_subpel_refine_ctxt->i2_tot_cost[0][index] -
3973 ps_subpel_refine_ctxt->i2_mv_cost[0][index];
3974 ps_best_node[0].i4_sdi = 0;
3975 ps_best_node[0].i4_tot_cost = ps_subpel_refine_ctxt->i2_tot_cost[0][index];
3976
3977 if(ps_subpel_refine_ctxt->i2_tot_cost[0][index] == MAX_SIGNED_16BIT_VAL)
3978 {
3979 i4_sad = MAX_SIGNED_16BIT_VAL;
3980 }
3981
3982 ps_best_node[0].i4_sad = i4_sad;
3983 ps_best_node[0].i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[0][index];
3984 ps_best_node[0].s_mv.i2_mvx = ps_subpel_refine_ctxt->i2_mv_x[0][index];
3985 ps_best_node[0].s_mv.i2_mvy = ps_subpel_refine_ctxt->i2_mv_y[0][index];
3986 ps_best_node[0].i1_ref_idx = (S08)ps_subpel_refine_ctxt->i2_ref_idx[0][index];
3987 }
3988 }
3989 }
3990 else if(
3991 (1 == u1_num_results_per_part) &&
3992 (ps_subpel_refine_ctxt->i2_tot_cost[0][index] < ps_best_node[0].i4_tot_cost))
3993 {
3994 i4_sad = ps_subpel_refine_ctxt->i2_tot_cost[0][index] -
3995 ps_subpel_refine_ctxt->i2_mv_cost[0][index];
3996 ps_best_node[0].i4_sdi = 0;
3997 ps_best_node[0].i4_tot_cost = ps_subpel_refine_ctxt->i2_tot_cost[0][index];
3998
3999 if(ps_subpel_refine_ctxt->i2_tot_cost[0][index] == MAX_SIGNED_16BIT_VAL)
4000 {
4001 i4_sad = MAX_SIGNED_16BIT_VAL;
4002 }
4003
4004 ps_best_node[0].i4_sad = i4_sad;
4005 ps_best_node[0].i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[0][index];
4006 ps_best_node[0].s_mv.i2_mvx = ps_subpel_refine_ctxt->i2_mv_x[0][index];
4007 ps_best_node[0].s_mv.i2_mvy = ps_subpel_refine_ctxt->i2_mv_y[0][index];
4008 ps_best_node[0].i1_ref_idx = (S08)ps_subpel_refine_ctxt->i2_ref_idx[0][index];
4009 }
4010 }
4011 }
4012
4013 /**
4014 ********************************************************************************
4015 * @fn S32 hme_subpel_refine_cu_hs
4016 *
4017 * @brief Evaluates the best subpel mvs for active partitions of an MB in L0
4018 * layer for the high speed preset. Recursive hadamard SATD / SAD
4019 * and mv cost is used for 2NxN and NxN partitions with active partition
4020 * update
4021 *
4022 * @param[in] ps_prms: subpel prms input to this function
4023 *
4024 * @param[in] ps_curr_layer: points to the current layer ctxt
4025 *
4026 * @param[out] ps_search_results: points to the search resutls that get updated
4027 * with best results
4028 *
4029 * @param[in] search_idx: ref id of the frame for which results get updated
4030 *
4031 * @param[in] ps_wt_inp_prms: current frame input params
4032 *
4033 * @return None
4034 ********************************************************************************
4035 */
hme_subpel_refine_cu_hs(hme_subpel_prms_t * ps_prms,layer_ctxt_t * ps_curr_layer,search_results_t * ps_search_results,S32 search_idx,wgt_pred_ctxt_t * ps_wt_inp_prms,WORD32 blk_8x8_mask,me_func_selector_t * ps_func_selector,ihevce_cmn_opt_func_t * ps_cmn_utils_optimised_function_list,ihevce_me_optimised_function_list_t * ps_me_optimised_function_list)4036 void hme_subpel_refine_cu_hs(
4037 hme_subpel_prms_t *ps_prms,
4038 layer_ctxt_t *ps_curr_layer,
4039 search_results_t *ps_search_results,
4040 S32 search_idx,
4041 wgt_pred_ctxt_t *ps_wt_inp_prms,
4042 WORD32 blk_8x8_mask,
4043 me_func_selector_t *ps_func_selector,
4044 ihevce_cmn_opt_func_t *ps_cmn_utils_optimised_function_list,
4045 ihevce_me_optimised_function_list_t *ps_me_optimised_function_list)
4046 {
4047 /* Unique search node list for 2nx2n and nxn partitions */
4048 search_node_t as_nodes_2nx2n[MAX_RESULTS_PER_PART * 5];
4049 subpel_dedup_enabler_t as_subpel_dedup_enabler[MAX_NUM_REF];
4050 search_node_t *ps_search_node;
4051
4052 S32 i, i4_part_mask, j;
4053 S32 i4_sad_grid;
4054 S32 max_subpel_cand;
4055 WORD32 index;
4056 S32 num_unique_nodes_2nx2n;
4057 S32 part_id;
4058 S32 x_off, y_off;
4059 S32 i4_inp_off;
4060
4061 CU_SIZE_T e_cu_size;
4062 BLK_SIZE_T e_blk_size;
4063
4064 subpel_refine_ctxt_t *ps_subpel_refine_ctxt = ps_prms->ps_subpel_refine_ctxt;
4065
4066 S32 i4_use_satd = ps_prms->i4_use_satd;
4067 S32 i4_num_act_refs = ps_prms->i4_num_act_ref_l0 + ps_prms->i4_num_act_ref_l1;
4068
4069 ASSERT(ps_search_results->u1_num_results_per_part <= MAX_RESULTS_PER_PART);
4070
4071 if(!DISABLE_SUBPEL_REFINEMENT_WHEN_SRC_IS_NOISY || !ps_prms->u1_is_cu_noisy)
4072 {
4073 e_cu_size = ps_search_results->e_cu_size;
4074 i4_part_mask = ps_search_results->i4_part_mask;
4075
4076 ps_prms->i4_inp_type = sizeof(U08);
4077
4078 num_unique_nodes_2nx2n = 0;
4079
4080 for(i = 0; i < i4_num_act_refs; i++)
4081 {
4082 as_subpel_dedup_enabler[i].u1_ref_idx = MAX_NUM_REF;
4083 }
4084
4085 /************************************************************************/
4086 /* */
4087 /* Initialize SATD cost for each valid partition id.one time before */
4088 /* doing full pel time. This is because of the following reasons: */
4089 /* 1. Full pel cost was done in SAD while subpel is in SATD mode */
4090 /* 2. Partitions like AMP, Nx2N and 2NxN are refined on the fly while */
4091 /* doing Diamond search for 2Nx2N and NxN. This partitions are */
4092 /* not explicitly refine in high speed mode */
4093 /* */
4094 /************************************************************************/
4095 for(i = 0; i < ps_subpel_refine_ctxt->i4_num_valid_parts; i++)
4096 {
4097 S32 enable_subpel = 0;
4098 S32 part_type;
4099
4100 /* Derive the x and y offsets of this part id */
4101 part_id = ps_subpel_refine_ctxt->ai4_part_id[i];
4102 if(ps_subpel_refine_ctxt->i4_num_valid_parts > 8)
4103 {
4104 index = part_id;
4105 }
4106 else
4107 {
4108 index = i;
4109 }
4110
4111 part_type = ge_part_id_to_part_type[part_id];
4112 x_off = gas_part_attr_in_cu[part_id].u1_x_start << e_cu_size;
4113 y_off = gas_part_attr_in_cu[part_id].u1_y_start << e_cu_size;
4114 x_off += ps_search_results->u1_x_off;
4115 y_off += ps_search_results->u1_y_off;
4116 i4_inp_off = x_off + y_off * ps_prms->i4_inp_stride;
4117 e_blk_size = ge_part_id_to_blk_size[e_cu_size][part_id];
4118
4119 x_off += ps_prms->i4_ctb_x_off;
4120 y_off += ps_prms->i4_ctb_y_off;
4121
4122 max_subpel_cand = 0;
4123
4124 /* Choose the minimum number of candidates to be used for Sub pel refinement */
4125 if(PART_ID_2Nx2N == part_type)
4126 {
4127 max_subpel_cand =
4128 MIN(ps_prms->u1_max_subpel_candts_2Nx2N,
4129 ps_search_results->u1_num_results_per_part);
4130 }
4131 else if(PRT_NxN == part_type)
4132 {
4133 max_subpel_cand = MIN(
4134 ps_prms->u1_max_subpel_candts_NxN, ps_search_results->u1_num_results_per_part);
4135 }
4136
4137 /* If incomplete CTB, NxN num candidates should be forced to min 1 */
4138 if((0 == max_subpel_cand) && (blk_8x8_mask != 15))
4139 {
4140 max_subpel_cand = 1;
4141 }
4142
4143 if((PART_ID_2Nx2N == part_type) || (PRT_NxN == part_type))
4144 {
4145 enable_subpel = 1;
4146 }
4147
4148 /* Compute full pel SATD for each result per partition before subpel */
4149 /* refinement starts. */
4150 /* Also prepare unique candidate list for 2Nx2N and NxN partitions */
4151 for(j = 0; j < ps_search_results->u1_num_results_per_part; j++)
4152 {
4153 err_prms_t s_err_prms;
4154 S32 i4_satd = 0;
4155 S32 i1_ref_idx;
4156 U08 *pu1_ref_base;
4157 S32 i4_ref_stride = ps_curr_layer->i4_rec_stride;
4158 S32 i4_mv_x, i4_mv_y;
4159
4160 ps_search_node = ps_search_results->aps_part_results[search_idx][part_id] + j;
4161
4162 if(ps_subpel_refine_ctxt->i2_mv_x[j][index] == INTRA_MV)
4163 {
4164 ps_search_node->u1_subpel_done = 1;
4165 continue;
4166 }
4167
4168 i1_ref_idx = ps_subpel_refine_ctxt->i2_ref_idx[j][index];
4169 ps_prms->pv_inp = (void *)(ps_wt_inp_prms->apu1_wt_inp[i1_ref_idx] + i4_inp_off);
4170 pu1_ref_base = ps_curr_layer->ppu1_list_rec_fxfy[i1_ref_idx];
4171
4172 i4_mv_x = ps_subpel_refine_ctxt->i2_mv_x[j][index];
4173 i4_mv_y = ps_subpel_refine_ctxt->i2_mv_y[j][index];
4174
4175 if(i4_use_satd)
4176 {
4177 s_err_prms.pu1_inp = (U08 *)ps_prms->pv_inp;
4178 s_err_prms.i4_inp_stride = ps_prms->i4_inp_stride;
4179 s_err_prms.pu1_ref = pu1_ref_base + x_off + (y_off * i4_ref_stride) + i4_mv_x +
4180 (i4_mv_y * i4_ref_stride);
4181
4182 s_err_prms.i4_ref_stride = i4_ref_stride;
4183 s_err_prms.i4_part_mask = (ENABLE_2Nx2N);
4184 s_err_prms.i4_grid_mask = 1;
4185 s_err_prms.pi4_sad_grid = &i4_sad_grid;
4186 s_err_prms.i4_blk_wd = gau1_blk_size_to_wd[e_blk_size];
4187 s_err_prms.i4_blk_ht = gau1_blk_size_to_ht[e_blk_size];
4188
4189 s_err_prms.ps_cmn_utils_optimised_function_list =
4190 ps_cmn_utils_optimised_function_list;
4191
4192 compute_satd_8bit(&s_err_prms);
4193
4194 i4_satd = s_err_prms.pi4_sad_grid[0];
4195
4196 ps_subpel_refine_ctxt->i2_tot_cost[j][index] =
4197 CLIP_S16(ps_subpel_refine_ctxt->i2_mv_cost[j][index] + i4_satd);
4198 ps_subpel_refine_ctxt->ai2_fullpel_satd[j][index] = i4_satd;
4199 }
4200
4201 /* Sub-pel candidate filtration */
4202 if(j)
4203 {
4204 S16 i2_best_sad;
4205 S32 i4_best_mvx;
4206 S32 i4_best_mvy;
4207
4208 search_node_t *ps_node =
4209 ps_search_results->aps_part_results[search_idx][part_id];
4210
4211 U08 u1_is_subpel_done = ps_node->u1_subpel_done;
4212 S16 i2_curr_sad = ps_subpel_refine_ctxt->ai2_fullpel_satd[j][index];
4213 S32 i4_curr_mvx = i4_mv_x << 2;
4214 S32 i4_curr_mvy = i4_mv_y << 2;
4215
4216 if(u1_is_subpel_done)
4217 {
4218 i2_best_sad = ps_node->i4_sad;
4219
4220 if(ps_node->i1_ref_idx == i1_ref_idx)
4221 {
4222 i4_best_mvx = ps_node->s_mv.i2_mvx;
4223 i4_best_mvy = ps_node->s_mv.i2_mvy;
4224 }
4225 else if(i1_ref_idx == ps_subpel_refine_ctxt->i2_ref_idx[0][index])
4226 {
4227 i4_best_mvx = ps_subpel_refine_ctxt->i2_mv_x[0][index];
4228 i4_best_mvy = ps_subpel_refine_ctxt->i2_mv_y[0][index];
4229 }
4230 else
4231 {
4232 i4_best_mvx = INTRA_MV;
4233 i4_best_mvy = INTRA_MV;
4234 }
4235 }
4236 else
4237 {
4238 i2_best_sad = ps_subpel_refine_ctxt->i2_tot_cost[0][index] -
4239 ps_subpel_refine_ctxt->i2_mv_cost[0][index];
4240
4241 if(i1_ref_idx == ps_subpel_refine_ctxt->i2_ref_idx[0][index])
4242 {
4243 i4_best_mvx = ps_subpel_refine_ctxt->i2_mv_x[0][index];
4244 i4_best_mvy = ps_subpel_refine_ctxt->i2_mv_y[0][index];
4245 }
4246 else
4247 {
4248 i4_best_mvx = INTRA_MV;
4249 i4_best_mvy = INTRA_MV;
4250 }
4251 }
4252
4253 i2_best_sad += (i2_best_sad >> ps_prms->u1_subpel_candt_threshold);
4254
4255 if(((ABS(i4_curr_mvx - i4_best_mvx) < 2) &&
4256 (ABS(i4_curr_mvy - i4_best_mvy) < 2)) ||
4257 (i2_curr_sad > i2_best_sad))
4258 {
4259 enable_subpel = 0;
4260 }
4261 }
4262
4263 ps_search_node->u1_part_id = part_id;
4264
4265 /* Convert mvs in part results from FPEL to QPEL units */
4266 ps_subpel_refine_ctxt->i2_mv_x[j][index] <<= 2;
4267 ps_subpel_refine_ctxt->i2_mv_y[j][index] <<= 2;
4268
4269 /* If the candidate number is more than the number of candts
4270 set initally, do not add those candts for refinement */
4271 if(j >= max_subpel_cand)
4272 {
4273 enable_subpel = 0;
4274 }
4275
4276 if(enable_subpel)
4277 {
4278 if(num_unique_nodes_2nx2n == 0)
4279 {
4280 S32 i4_index = ps_subpel_refine_ctxt->i2_ref_idx[j][index];
4281
4282 as_subpel_dedup_enabler[i4_index].i2_mv_x =
4283 ps_subpel_refine_ctxt->i2_mv_x[j][index];
4284 as_subpel_dedup_enabler[i4_index].i2_mv_y =
4285 ps_subpel_refine_ctxt->i2_mv_y[j][index];
4286 as_subpel_dedup_enabler[i4_index].u1_ref_idx =
4287 (U08)ps_subpel_refine_ctxt->i2_ref_idx[j][index];
4288 memset(
4289 as_subpel_dedup_enabler[i4_index].au4_node_map,
4290 0,
4291 sizeof(U32) * 2 * MAP_X_MAX);
4292 }
4293 INSERT_NEW_NODE_NOMAP_ALTERNATE(
4294 as_nodes_2nx2n, num_unique_nodes_2nx2n, ps_subpel_refine_ctxt, j, i);
4295 }
4296 }
4297
4298 /*********************************************************************************************/
4299 /* If sad_1 < sad_2, then satd_1 need not be lesser than satd_2. Therefore, after conversion */
4300 /* to satd, tot_cost_1 may not be lesser than tot_cost_2. So we need to sort the search nodes*/
4301 /* for each partition again, based on the new costs */
4302 /*********************************************************************************************/
4303 /*********************************************************************************************/
4304 /* Because right now, we store only the two best candidates for each partition, the sort will*/
4305 /* converge to a simple swap. */
4306 /* ASSUMPTION : We store only two best results per partition */
4307 /*********************************************************************************************/
4308 if(ps_search_results->u1_num_results_per_part == 2)
4309 {
4310 if(ps_subpel_refine_ctxt->i2_tot_cost[0][index] >
4311 ps_subpel_refine_ctxt->i2_tot_cost[1][index])
4312 {
4313 SWAP(
4314 ps_subpel_refine_ctxt->i2_tot_cost[0][index],
4315 ps_subpel_refine_ctxt->i2_tot_cost[1][index]);
4316
4317 SWAP(
4318 ps_subpel_refine_ctxt->i2_mv_cost[0][index],
4319 ps_subpel_refine_ctxt->i2_mv_cost[1][index]);
4320
4321 SWAP(
4322 ps_subpel_refine_ctxt->i2_mv_x[0][index],
4323 ps_subpel_refine_ctxt->i2_mv_x[1][index]);
4324
4325 SWAP(
4326 ps_subpel_refine_ctxt->i2_mv_y[0][index],
4327 ps_subpel_refine_ctxt->i2_mv_y[1][index]);
4328
4329 SWAP(
4330 ps_subpel_refine_ctxt->i2_ref_idx[0][index],
4331 ps_subpel_refine_ctxt->i2_ref_idx[1][index]);
4332
4333 SWAP(
4334 ps_subpel_refine_ctxt->ai2_fullpel_satd[0][index],
4335 ps_subpel_refine_ctxt->ai2_fullpel_satd[1][index]);
4336 }
4337 }
4338 }
4339
4340 if(blk_8x8_mask == 0xf)
4341 {
4342 num_unique_nodes_2nx2n =
4343 MIN(num_unique_nodes_2nx2n, ps_prms->u1_max_num_subpel_refine_centers);
4344 }
4345 {
4346 x_off = gas_part_attr_in_cu[0].u1_x_start << e_cu_size;
4347 y_off = gas_part_attr_in_cu[0].u1_y_start << e_cu_size;
4348 x_off += ps_search_results->u1_x_off;
4349 y_off += ps_search_results->u1_y_off;
4350 i4_inp_off = x_off + y_off * ps_prms->i4_inp_stride;
4351 e_blk_size = ge_part_id_to_blk_size[e_cu_size][0];
4352
4353 for(j = 0; j < num_unique_nodes_2nx2n; j++)
4354 {
4355 S32 pred_lx;
4356 ps_search_node = &as_nodes_2nx2n[j];
4357
4358 if(ps_search_node->s_mv.i2_mvx == INTRA_MV)
4359 {
4360 continue;
4361 }
4362
4363 {
4364 S08 i1_ref_idx = ps_search_node->i1_ref_idx;
4365 subpel_dedup_enabler_t *ps_dedup_enabler =
4366 &(as_subpel_dedup_enabler[i1_ref_idx]);
4367
4368 if(ps_dedup_enabler->u1_ref_idx == MAX_NUM_REF)
4369 {
4370 as_subpel_dedup_enabler[i1_ref_idx].i2_mv_x = ps_search_node->s_mv.i2_mvx;
4371 as_subpel_dedup_enabler[i1_ref_idx].i2_mv_y = ps_search_node->s_mv.i2_mvy;
4372 as_subpel_dedup_enabler[i1_ref_idx].u1_ref_idx = i1_ref_idx;
4373 memset(
4374 as_subpel_dedup_enabler[i1_ref_idx].au4_node_map,
4375 0,
4376 sizeof(U32) * 2 * MAP_X_MAX);
4377 }
4378 }
4379
4380 pred_lx = search_idx;
4381 ps_prms->pv_inp =
4382 (void *)(ps_wt_inp_prms->apu1_wt_inp[ps_search_node->i1_ref_idx] + i4_inp_off);
4383
4384 hme_subpel_refine_search_node_high_speed(
4385 ps_search_node,
4386 ps_prms,
4387 ps_curr_layer,
4388 e_blk_size,
4389 x_off + ps_prms->i4_ctb_x_off,
4390 y_off + ps_prms->i4_ctb_y_off,
4391 ps_search_results,
4392 pred_lx,
4393 i4_part_mask,
4394 &ps_subpel_refine_ctxt->ai4_part_id[0],
4395 search_idx,
4396 &(as_subpel_dedup_enabler[ps_search_node->i1_ref_idx]),
4397 ps_func_selector,
4398 ps_me_optimised_function_list);
4399 }
4400 }
4401 }
4402 else
4403 {
4404 for(i = 0; i < ps_subpel_refine_ctxt->i4_num_valid_parts; i++)
4405 {
4406 S32 i4_index;
4407
4408 S32 i4_part_id = ps_subpel_refine_ctxt->ai4_part_id[i];
4409
4410 if(ps_subpel_refine_ctxt->i4_num_valid_parts > 8)
4411 {
4412 i4_index = i4_part_id;
4413 }
4414 else
4415 {
4416 i4_index = i;
4417 }
4418
4419 for(j = 0; j < ps_search_results->u1_num_results_per_part; j++)
4420 {
4421 ps_subpel_refine_ctxt->i2_mv_x[j][i4_index] <<= 2;
4422 ps_subpel_refine_ctxt->i2_mv_y[j][i4_index] <<= 2;
4423 }
4424 }
4425 }
4426
4427 hme_subpel_refine_struct_to_search_results_struct_converter(
4428 ps_subpel_refine_ctxt, ps_search_results, search_idx, ps_prms->e_me_quality_presets);
4429 }
4430