• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /******************************************************************************
2  *
3  * Copyright (C) 2018 The Android Open Source Project
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at:
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  *
17  *****************************************************************************
18  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19 */
20 
21 /*!
22 ******************************************************************************
23 * \file ihevce_recur_bracketing.c
24 *
25 * \brief
26 *    This file contains interface functions of recursive bracketing
27 *    module
28 * \date
29 *    12/02/2012
30 *
31 * \author
32 *    Ittiam
33 *
34 * List of Functions
35 *
36 *
37 ******************************************************************************
38 */
39 
40 /*****************************************************************************/
41 /* File Includes                                                             */
42 /*****************************************************************************/
43 /* System include files */
44 #include <stdio.h>
45 #include <string.h>
46 #include <stdlib.h>
47 #include <assert.h>
48 #include <stdarg.h>
49 #include <math.h>
50 
51 /* User include files */
52 #include "ihevc_typedefs.h"
53 #include "itt_video_api.h"
54 #include "ihevce_api.h"
55 
56 #include "rc_cntrl_param.h"
57 #include "rc_frame_info_collector.h"
58 #include "rc_look_ahead_params.h"
59 
60 #include "ihevc_defs.h"
61 #include "ihevc_structs.h"
62 #include "ihevc_platform_macros.h"
63 #include "ihevc_deblk.h"
64 #include "ihevc_itrans_recon.h"
65 #include "ihevc_chroma_itrans_recon.h"
66 #include "ihevc_chroma_intra_pred.h"
67 #include "ihevc_intra_pred.h"
68 #include "ihevc_inter_pred.h"
69 #include "ihevc_mem_fns.h"
70 #include "ihevc_padding.h"
71 #include "ihevc_weighted_pred.h"
72 #include "ihevc_sao.h"
73 #include "ihevc_resi_trans.h"
74 #include "ihevc_quant_iquant_ssd.h"
75 #include "ihevc_cabac_tables.h"
76 
77 #include "ihevce_defs.h"
78 #include "ihevce_lap_enc_structs.h"
79 #include "ihevce_multi_thrd_structs.h"
80 #include "ihevce_me_common_defs.h"
81 #include "ihevce_had_satd.h"
82 #include "ihevce_error_codes.h"
83 #include "ihevce_bitstream.h"
84 #include "ihevce_cabac.h"
85 #include "ihevce_rdoq_macros.h"
86 #include "ihevce_function_selector.h"
87 #include "ihevce_enc_structs.h"
88 #include "ihevce_entropy_structs.h"
89 #include "ihevce_cmn_utils_instr_set_router.h"
90 #include "ihevce_enc_loop_structs.h"
91 #include "ihevce_ipe_instr_set_router.h"
92 #include "ihevce_ipe_structs.h"
93 #include "ihevce_ipe_pass.h"
94 #include "ihevce_recur_bracketing.h"
95 #include "ihevce_nbr_avail.h"
96 #include "ihevc_common_tables.h"
97 #include "ihevce_decomp_pre_intra_structs.h"
98 #include "ihevce_decomp_pre_intra_pass.h"
99 
100 #include "cast_types.h"
101 #include "osal.h"
102 #include "osal_defaults.h"
103 
104 /*****************************************************************************/
105 /* Constant Macros                                                           */
106 /*****************************************************************************/
107 #define IP_DBG_L1_l2 0
108 #define CHILD_BIAS 12
109 
110 /*****************************************************************************/
111 /* Globals                                                                   */
112 /*****************************************************************************/
113 extern pf_intra_pred g_apf_lum_ip[10];
114 
115 extern WORD32 g_i4_ip_funcs[MAX_NUM_IP_MODES];
116 
117 UWORD8 gau1_cu_pos_x[64] = { 0, 1, 0, 1, 2, 3, 2, 3, 0, 1, 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7,
118                              6, 7, 4, 5, 4, 5, 6, 7, 6, 7, 0, 1, 0, 1, 2, 3, 2, 3, 0, 1, 0, 1,
119                              2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 4, 5, 4, 5, 6, 7, 6, 7 };
120 
121 UWORD8 gau1_cu_pos_y[64] = { 0, 0, 1, 1, 0, 0, 1, 1, 2, 2, 3, 3, 2, 2, 3, 3, 0, 0, 1, 1, 0, 0,
122                              1, 1, 2, 2, 3, 3, 2, 2, 3, 3, 4, 4, 5, 5, 4, 4, 5, 5, 6, 6, 7, 7,
123                              6, 6, 7, 7, 4, 4, 5, 5, 4, 4, 5, 5, 6, 6, 7, 7, 6, 6, 7, 7 };
124 
/* Clears bit number `bit` of the lvalue `x` and stores the result back in `x`. */
/* Both arguments are parenthesised so that compound expressions (e.g. a        */
/* conditional expression passed as `bit`) expand with the intended precedence. */
/* Note: `x` is evaluated twice, so it must not have side effects.              */
#define RESET_BIT(x, bit) ((x) = (x) & ~((WORD32)1 << (bit)))
126 
127 /*****************************************************************************/
128 /* Function Definitions                                                      */
129 /*****************************************************************************/
130 
131 /*!
132 ******************************************************************************
133 * \if Function name : ihevce_update_cand_list \endif
134 *
135 * \brief
136 *    Final Candidate list population, nbr flag andd nbr mode update function
137 *
138 * \param[in] ps_row_cu : pointer to cu analyse struct
139 * \param[in] ps_cu_node : pointer to cu node info buffer
140 * \param[in] ps_ed_blk_l1 : pointer to level 1 and 2 decision buffer
141 * \param[in] pu1_cand_mode_list  : pointer to candidate list buffer
142 *
143 * \return
144 *    None
145 *
146 * \author
147 *  Ittiam
148 *
149 *****************************************************************************
150 */
ihevce_update_cand_list(ihevce_ipe_cu_tree_t * ps_cu_node,ihevce_ed_blk_t * ps_ed_blk_l1,ihevce_ipe_ctxt_t * ps_ctxt)151 void ihevce_update_cand_list(
152     ihevce_ipe_cu_tree_t *ps_cu_node, ihevce_ed_blk_t *ps_ed_blk_l1, ihevce_ipe_ctxt_t *ps_ctxt)
153 {
154     WORD32 row, col, x, y, size;
155 
156     /* Candidate mode Update */
157     (void)ps_ed_blk_l1;
158     /* Update CTB mode map for the finalised CU */
159     x = ((ps_cu_node->u2_x0 << 3) >> 2) + 1;
160     y = ((ps_cu_node->u2_y0 << 3) >> 2) + 1;
161     size = ps_cu_node->u1_cu_size >> 2;
162     for(row = y; row < (y + size); row++)
163     {
164         for(col = x; col < (x + size); col++)
165         {
166             ps_ctxt->au1_ctb_mode_map[row][col] = ps_cu_node->best_mode;
167         }
168     }
169     return;
170 }
171 
172 /*!
173 ******************************************************************************
174 * \if Function name : ihevce_intra_populate_mode_bits_cost_bracketing \endif
175 *
176 * \brief
177 *    Mpm indx calc function based on left and top available modes
178 *
179 * \param[in] top_intra_mode : Top available intra mode
180 * \param[in] left_intra_mode : Left available intra mode
181 * \param[in] available_top : Top availability flag
182 * \param[in] available_left : Left availability flag
183 * \param[in] cu_pos_y : cu position wrt to CTB
184 * \param[in] mode_bits_cost : pointer to mode bits buffer
185 * \param[in] lambda : Lambda value (SAD/SATD)
186 * \param[in] cand_mode_list  : pointer to candidate list buffer
187 *
188 * \return
189 *    None
190 *
191 * \author
192 *  Ittiam
193 *
194 *****************************************************************************
195 */
ihevce_intra_populate_mode_bits_cost_bracketing(WORD32 top_intra_mode,WORD32 left_intra_mode,WORD32 available_top,WORD32 available_left,WORD32 cu_pos_y,UWORD16 * mode_bits_cost,UWORD16 * mode_bits,WORD32 lambda,WORD32 * cand_mode_list)196 void ihevce_intra_populate_mode_bits_cost_bracketing(
197     WORD32 top_intra_mode,
198     WORD32 left_intra_mode,
199     WORD32 available_top,
200     WORD32 available_left,
201     WORD32 cu_pos_y,
202     UWORD16 *mode_bits_cost,
203     UWORD16 *mode_bits,
204     WORD32 lambda,
205     WORD32 *cand_mode_list)
206 {
207     /* local variables */
208     WORD32 i;
209     WORD32 cand_intra_pred_mode_left, cand_intra_pred_mode_top;
210 
211     UWORD16 one_bits_cost =
212         COMPUTE_RATE_COST_CLIP30(4, lambda, (LAMBDA_Q_SHIFT + 1));  //1.5 * lambda
213     UWORD16 two_bits_cost =
214         COMPUTE_RATE_COST_CLIP30(6, lambda, (LAMBDA_Q_SHIFT + 1));  //2.5 * lambda
215     UWORD16 five_bits_cost =
216         COMPUTE_RATE_COST_CLIP30(12, lambda, (LAMBDA_Q_SHIFT + 1));  //5.5 * lambda
217 
218     for(i = 0; i < 35; i++)
219     {
220         mode_bits_cost[i] = five_bits_cost;
221         mode_bits[i] = 5;
222     }
223 
224     /* EIID: set availability flag to zero if modes are invalid.
225        Required since some CU's might be skipped (though available)
226        and their modes will be set to 255 (-1)*/
227     if(35 < top_intra_mode || 0 > top_intra_mode)
228         available_top = 0;
229     if(35 < left_intra_mode || 0 > left_intra_mode)
230         available_left = 0;
231 
232     /* Calculate cand_intra_pred_mode_N as per sec. 8.4.2 in JCTVC-J1003_d7 */
233     /* N = top */
234     if(0 == available_top)
235     {
236         cand_intra_pred_mode_top = INTRA_DC;
237     }
238     /* for neighbour != INTRA, setting DC is done outside */
239     else if(0 == cu_pos_y) /* It's on the CTB boundary */
240     {
241         cand_intra_pred_mode_top = INTRA_DC;
242     }
243     else
244     {
245         cand_intra_pred_mode_top = top_intra_mode;
246     }
247 
248     /* N = left */
249     if(0 == available_left)
250     {
251         cand_intra_pred_mode_left = INTRA_DC;
252         //cand_intra_pred_mode_left = cand_intra_pred_mode_top;
253     }
254     /* for neighbour != INTRA, setting DC is done outside */
255     else
256     {
257         cand_intra_pred_mode_left = left_intra_mode;
258     }
259 
260     /* Calculate cand_mode_list as per sec. 8.4.2 in JCTVC-J1003_d7 */
261     if(cand_intra_pred_mode_left == cand_intra_pred_mode_top)
262     {
263         if(cand_intra_pred_mode_left < 2)
264         {
265             cand_mode_list[0] = INTRA_PLANAR;
266             cand_mode_list[1] = INTRA_DC;
267             cand_mode_list[2] = INTRA_ANGULAR(26); /* angular 26 = Vertical */
268         }
269         else
270         {
271             cand_mode_list[0] = cand_intra_pred_mode_left;
272             cand_mode_list[1] = 2 + ((cand_intra_pred_mode_left + 29) % 32);
273             cand_mode_list[2] = 2 + ((cand_intra_pred_mode_left - 2 + 1) % 32);
274         }
275     }
276     else
277     {
278         if(0 == available_left)
279         {
280             cand_mode_list[0] = cand_intra_pred_mode_top;
281             cand_mode_list[1] = cand_intra_pred_mode_left;
282         }
283         else
284         {
285             cand_mode_list[0] = cand_intra_pred_mode_left;
286             cand_mode_list[1] = cand_intra_pred_mode_top;
287         }
288         if((cand_intra_pred_mode_left != INTRA_PLANAR) &&
289            (cand_intra_pred_mode_top != INTRA_PLANAR))
290         {
291             cand_mode_list[2] = INTRA_PLANAR;
292         }
293         else if((cand_intra_pred_mode_left != INTRA_DC) && (cand_intra_pred_mode_top != INTRA_DC))
294         {
295             cand_mode_list[2] = INTRA_DC;
296         }
297         else
298         {
299             cand_mode_list[2] = INTRA_ANGULAR(26);
300         }
301     }
302     mode_bits_cost[cand_mode_list[0]] = one_bits_cost;
303     mode_bits_cost[cand_mode_list[1]] = two_bits_cost;
304     mode_bits_cost[cand_mode_list[2]] = two_bits_cost;
305 
306     mode_bits[cand_mode_list[0]] = 2;
307     mode_bits[cand_mode_list[1]] = 3;
308     mode_bits[cand_mode_list[2]] = 3;
309 }
310 
311 /*!
312 ******************************************************************************
313 * \if Function name : ihevce_pu_calc_4x4_blk \endif
314 *
315 * \brief
316 *    4x4 pu (8x8 CU) mode decision using step 8421 method
317 *
318 * \param[in] ps_cu_node : pointer to cu node info buffer
319 * \param[in] pu1_src : pointer to src pixels
320 * \param[in] src_stride : frm source stride
321 * \param[in] ref : pointer to reference pixels for prediction
322 * \param[in] cand_mode_list  : pointer to candidate list buffer
323 * \param[in] best_costs_4x4  : pointer to 3 best cost buffer
324 * \param[in] best_modes_4x4  : pointer to 3 best mode buffer
325 *
326 * \return
327 *    None
328 *
329 * \author
330 *  Ittiam
331 *
332 *****************************************************************************
333 */
ihevce_pu_calc_4x4_blk(ihevce_ipe_ctxt_t * ps_ctxt,ihevce_ipe_cu_tree_t * ps_cu_node,UWORD8 * pu1_src,WORD32 src_stride,UWORD8 * ref,UWORD16 * mode_bits_cost,WORD32 * best_costs_4x4,UWORD8 * best_modes_4x4,func_selector_t * ps_func_selector)334 void ihevce_pu_calc_4x4_blk(
335     ihevce_ipe_ctxt_t *ps_ctxt,
336     ihevce_ipe_cu_tree_t *ps_cu_node,
337     UWORD8 *pu1_src,
338     WORD32 src_stride,
339     UWORD8 *ref,
340     UWORD16 *mode_bits_cost,
341     WORD32 *best_costs_4x4,
342     UWORD8 *best_modes_4x4,
343     func_selector_t *ps_func_selector)
344 {
345     WORD16 *pi2_trans_tmp = ps_ctxt->pi2_trans_tmp;
346     WORD16 *pi2_trans_out = ps_ctxt->pi2_trans_out;
347     UWORD8 u1_use_satd = ps_ctxt->u1_use_satd;
348     UWORD8 u1_level_1_refine_on = ps_ctxt->u1_level_1_refine_on;
349 
350     WORD32 i, j = 0, i_end;
351     UWORD8 mode, best_amode = 255;
352     UWORD8 pred[16];
353 
354     UWORD16 sad;
355     WORD32 sad_cost = 0;
356     WORD32 best_asad_cost = 0xFFFFF;
357     WORD32 temp;
358     UWORD8 modes_to_eval[5];
359     WORD32 costs_4x4[5];
360     UWORD8 modes_4x4[5] = { 0, 1, 2, 3, 4 };
361 
362     /* LO resolution hence low resolution disable */
363     WORD32 u1_low_resol = 0;
364     UWORD8 au1_best_modes[1] = { 0 };
365     WORD32 ai4_best_sad_costs[1] = { 0 };
366 
367     WORD16 *pi2_tmp = &pi2_trans_tmp[0];
368 
369     ihevce_ipe_optimised_function_list_t *ps_ipe_optimised_function_list =
370         &ps_ctxt->s_ipe_optimised_function_list;
371 
372     //apf_resd_trns[0] = &ihevc_resi_trans_4x4_ttype1;
373     //apf_resd_trns[0] = &ihevc_HAD_4x4_8bit;
374 
375     for(i = 0; i < 5; i++)
376     {
377         costs_4x4[i] = MAX_INTRA_COST_IPE;
378     }
379 
380     ps_ipe_optimised_function_list->pf_ed_4x4_find_best_modes(
381         pu1_src,
382         src_stride,
383         ref,
384         mode_bits_cost,
385         au1_best_modes,
386         ai4_best_sad_costs,
387         u1_low_resol,
388         ps_ipe_optimised_function_list->pf_4x4_sad_computer);
389 
390     best_amode = au1_best_modes[0];
391     best_asad_cost = ai4_best_sad_costs[0];
392 
393     ASSERT(best_amode != 255);
394     /* Around best level 4 angular mode, search for best level 2 mode */
395     modes_to_eval[0] = best_amode - 2;
396     modes_to_eval[1] = best_amode + 2;
397     i = 0;
398     i_end = 2;
399     if(best_amode == 2)
400         i = 1;
401     else if(best_amode == 34)
402         i_end = 1;
403     for(; i < i_end; i++)
404     {
405         mode = modes_to_eval[i];
406 
407         g_apf_lum_ip[g_i4_ip_funcs[mode]](&ref[0], 0, &pred[0], 4, 4, mode);
408 
409         sad = ps_ipe_optimised_function_list->pf_4x4_sad_computer(pu1_src, &pred[0], src_stride, 4);
410 
411         sad_cost = sad;
412         sad_cost += mode_bits_cost[mode];
413 
414         if(sad_cost < best_asad_cost)
415         {
416             best_amode = mode;
417             best_asad_cost = sad_cost;
418         }
419     }
420 
421     /* Around best level 2 angular mode, search for best level 1 mode */
422     /* Also evaluate for non-angular mode */
423 
424     i = 0;
425     /*Level 1 refinement is disabled for ES preset */
426     if(1 == u1_level_1_refine_on)
427     {
428         if(best_amode != 2)
429             modes_to_eval[i++] = best_amode - 1;
430         modes_to_eval[i++] = best_amode;
431     }
432 
433     modes_to_eval[i++] = 0;
434     modes_to_eval[i++] = 1;
435 
436     if(1 == u1_level_1_refine_on)
437     {
438         if(best_amode != 34)
439             modes_to_eval[i++] = best_amode + 1;
440     }
441     i_end = i;
442     i = 0;
443 
444     for(; i < i_end; i++)
445     {
446         mode = modes_to_eval[i];
447 
448         g_apf_lum_ip[g_i4_ip_funcs[mode]](&ref[0], 0, &pred[0], 4, 4, mode);
449 
450         /* Hard coding to use SATD */
451         if(u1_use_satd)
452         {
453             ps_func_selector->ihevc_resi_trans_4x4_ttype1_fptr(
454                 pu1_src, &pred[0], (WORD32 *)pi2_tmp, pi2_trans_out, src_stride, 4, (4 << 16) | 0);
455 
456             sad = ihevce_ipe_pass_satd(pi2_trans_out, 4, 4);
457         }
458         else
459         {
460             sad = ps_ipe_optimised_function_list->pf_4x4_sad_computer(
461                 pu1_src, &pred[0], src_stride, 4);
462         }
463         sad_cost = sad;
464         sad_cost += mode_bits_cost[mode];
465 
466         costs_4x4[i] = sad_cost;
467     }
468 
469     /* Arrange the reference array in ascending order */
470     for(i = 0; i < (i_end - 1); i++)
471     {
472         for(j = i + 1; j < i_end; j++)
473         {
474             if(costs_4x4[i] > costs_4x4[j])
475             {
476                 temp = costs_4x4[i];
477                 costs_4x4[i] = costs_4x4[j];
478                 costs_4x4[j] = temp;
479 
480                 temp = modes_4x4[i];
481                 modes_4x4[i] = modes_4x4[j];
482                 modes_4x4[j] = temp;
483             }
484         }
485     }
486     for(i = 0; i < 3; i++)
487     {
488         best_costs_4x4[i] = costs_4x4[i];
489         best_modes_4x4[i] = modes_to_eval[modes_4x4[i]];
490     }
491 
492     {
493         ps_cu_node->best_mode = best_modes_4x4[0];
494         ps_cu_node->best_cost = best_costs_4x4[0];
495         ps_cu_node->best_satd = best_costs_4x4[0] - mode_bits_cost[ps_cu_node->best_mode];
496     }
497 }
498 
499 /*!
500 ******************************************************************************
501 * \if Function name : ihevce_pu_calc_8x8_blk \endif
502 *
503 * \brief
504 *    4x4 pu (8x8 CU) mode decision loop using step 8421 method
505 *
506 * \param[in] ps_curr_src : pointer to src pixels struct
507 * \param[in] ps_ctxt : pointer to IPE context struct
508 * \param[in] ps_cu_node : pointer to cu node info buffer
509 *
510 * \return
511 *    None
512 *
513 * \author
514 *  Ittiam
515 *
516 *****************************************************************************
517 */
ihevce_pu_calc_8x8_blk(iv_enc_yuv_buf_t * ps_curr_src,ihevce_ipe_ctxt_t * ps_ctxt,ihevce_ipe_cu_tree_t * ps_cu_node,func_selector_t * ps_func_selector)518 void ihevce_pu_calc_8x8_blk(
519     iv_enc_yuv_buf_t *ps_curr_src,
520     ihevce_ipe_ctxt_t *ps_ctxt,
521     ihevce_ipe_cu_tree_t *ps_cu_node,
522     func_selector_t *ps_func_selector)
523 {
524     WORD32 i, j;
525     WORD32 nbr_flags;
526     nbr_avail_flags_t s_nbr;
527     WORD32 trans_size = ps_cu_node->ps_parent->u1_cu_size >> 1;
528 
529     UWORD8 *pu1_src_4x4;
530     WORD32 xA, xB, yA, yB;
531     //WORD32 x, y, size;
532     WORD32 top_intra_mode;
533     WORD32 left_intra_mode;
534     //    WORD8 *top_intra_mode_ptr;
535     //  WORD8 *left_intra_mode_ptr;
536     UWORD8 *pu1_orig;
537     WORD32 src_strd = ps_curr_src->i4_y_strd;
538 
539     WORD32 cu_pos_x = ps_cu_node->ps_parent->u2_x0 << 1;
540     WORD32 cu_pos_y = ps_cu_node->ps_parent->u2_y0 << 1;
541     ihevc_intra_pred_luma_ref_substitution_ft *ihevc_intra_pred_luma_ref_substitution_fptr;
542 
543     ihevc_intra_pred_luma_ref_substitution_fptr =
544         ps_ctxt->ps_func_selector->ihevc_intra_pred_luma_ref_substitution_fptr;
545 
546     pu1_orig = (UWORD8 *)(ps_curr_src->pv_y_buf) +
547                ((ps_cu_node->ps_parent->u2_y0 << 3) * src_strd) +
548                (ps_cu_node->ps_parent->u2_x0 << 3);
549     for(i = 0; i < 2; i++)
550     {
551         for(j = 0; j < 2; j++)
552         {
553             WORD32 cand_mode_list[3];
554             pu1_src_4x4 = pu1_orig + (i * trans_size * src_strd) + (j * trans_size);
555             /* get the neighbour availability flags */
556             nbr_flags = ihevce_get_nbr_intra(
557                 &s_nbr,
558                 ps_ctxt->pu1_ctb_nbr_map,
559                 ps_ctxt->i4_nbr_map_strd,
560                 cu_pos_x + ((j) * (trans_size >> 2)),
561                 cu_pos_y + ((i) * (trans_size >> 2)),
562                 trans_size >> 2);
563 
564             /* call the function which populates sad cost for all the modes */
565             xA = ((ps_cu_node->ps_parent->u2_x0 << 3) >> 2) + j;
566             yA = ((ps_cu_node->ps_parent->u2_y0 << 3) >> 2) + 1 + i;
567             xB = xA + 1;
568             yB = yA - 1;
569             left_intra_mode = ps_ctxt->au1_ctb_mode_map[yA][xA];
570             top_intra_mode = ps_ctxt->au1_ctb_mode_map[yB][xB];
571 
572             ihevce_intra_populate_mode_bits_cost_bracketing(
573                 top_intra_mode,
574                 left_intra_mode,
575                 s_nbr.u1_top_avail,
576                 s_nbr.u1_left_avail,
577                 ps_cu_node->ps_parent->u2_y0,
578                 &ps_ctxt->au2_mode_bits_cost_8x8pu[i * 2 + j][0],
579                 &ps_ctxt->au2_mode_bits_8x8_pu[0],
580                 ps_ctxt->i4_ol_sad_lambda,
581                 cand_mode_list);
582 
583             /* call the function which populates ref data for intra predicion */
584             ihevc_intra_pred_luma_ref_substitution_fptr(
585                 pu1_src_4x4 - src_strd - 1,
586                 pu1_src_4x4 - src_strd,
587                 pu1_src_4x4 - 1,
588                 src_strd,
589                 4,
590                 nbr_flags,
591                 &ps_ctxt->au1_ref_8x8pu[i * 2 + j][0],
592                 0);
593 
594             ihevce_pu_calc_4x4_blk(
595                 ps_ctxt,
596                 ps_cu_node->ps_sub_cu[(i * 2) + j],
597                 pu1_src_4x4,
598                 src_strd,
599                 &ps_ctxt->au1_ref_8x8pu[i * 2 + j][0],
600                 &ps_ctxt->au2_mode_bits_cost_8x8pu[i * 2 + j][0],
601                 &ps_cu_node->ps_sub_cu[(i * 2) + j]->au4_best_cost_1tu[0],
602                 &ps_cu_node->ps_sub_cu[(i * 2) + j]->au1_best_mode_1tu[0],
603                 ps_func_selector);
604 
605             /*&au4_cost_4x4[i*2 + j][0],
606                 &au1_modes_4x4[i*2 + j][0]);*/ //TTODO : mode will change for the four partition
607 
608             ihevce_set_nbr_map(
609                 ps_ctxt->pu1_ctb_nbr_map,
610                 ps_ctxt->i4_nbr_map_strd,
611                 cu_pos_x + ((j) * (trans_size >> 2)),
612                 cu_pos_y + ((i) * (trans_size >> 2)),
613                 (trans_size >> 2),
614                 1);
615 
616             xA = ((ps_cu_node->ps_parent->u2_x0 << 3) >> 2) + 1 + j;
617             yA = ((ps_cu_node->ps_parent->u2_y0 << 3) >> 2) + 1 + i;
618             ps_ctxt->au1_ctb_mode_map[yA][xA] = ps_cu_node->ps_sub_cu[i * 2 + j]->best_mode;
619             ps_cu_node->ps_sub_cu[i * 2 + j]->u2_mode_bits_cost =
620                 ps_ctxt->au2_mode_bits_8x8_pu[ps_cu_node->ps_sub_cu[i * 2 + j]->best_mode];
621         }
622     }
623 }
624 
625 /*!
626 ******************************************************************************
627 * \if Function name : ihevce_bracketing_analysis \endif
628 *
629 * \brief
630 *    Interface function that evaluates MAX cu and MAX - 1 cu, with MAX cu size
631 *    info decided by coarse resolution mode decision. Compares the SATD/SAD cost between
632 *    the 2 CUs and determines the actual CU size and best 3 modes to be given to rdopt
633 *
634 * \param[in] ps_ctxt : pointer to IPE context struct
635 * \param[in] ps_cu_node : pointer to cu node info buffer
636 * \param[in] ps_curr_src : pointer to src pixels struct
637 * \param[in] ps_ctb_out : pointer to ip ctb out struct
638 * \param[in] ps_row_cu : pointer to cu analyse struct
639 * \param[in] ps_ed_l1_ctb : pointer to level 1 early deci struct
640 * \param[in] ps_ed_l2_ctb : pointer to level 2 early deci struct
641 * \param[in] ps_l0_ipe_out_ctb : pointer to ipe_l0_ctb_analyse_for_me_t struct
642 *
643 * \return
644 *    None
645 *
646 * \author
647 *  Ittiam
648 *
649 *****************************************************************************
650 */
ihevce_bracketing_analysis(ihevce_ipe_ctxt_t * ps_ctxt,ihevce_ipe_cu_tree_t * ps_cu_node,iv_enc_yuv_buf_t * ps_curr_src,ctb_analyse_t * ps_ctb_out,ihevce_ed_blk_t * ps_ed_l1_ctb,ihevce_ed_blk_t * ps_ed_l2_ctb,ihevce_ed_ctb_l1_t * ps_ed_ctb_l1,ipe_l0_ctb_analyse_for_me_t * ps_l0_ipe_out_ctb)651 void ihevce_bracketing_analysis(
652     ihevce_ipe_ctxt_t *ps_ctxt,
653     ihevce_ipe_cu_tree_t *ps_cu_node,
654     iv_enc_yuv_buf_t *ps_curr_src,
655     ctb_analyse_t *ps_ctb_out,
656     //cu_analyse_t         *ps_row_cu,
657     ihevce_ed_blk_t *ps_ed_l1_ctb,
658     ihevce_ed_blk_t *ps_ed_l2_ctb,
659     ihevce_ed_ctb_l1_t *ps_ed_ctb_l1,
660     ipe_l0_ctb_analyse_for_me_t *ps_l0_ipe_out_ctb)
661 {
662     WORD32 cu_pos_x = 0;
663     WORD32 cu_pos_y = 0;
664 
665     UWORD8 u1_curr_ctb_wdt = ps_cu_node->u1_width;
666     UWORD8 u1_curr_ctb_hgt = ps_cu_node->u1_height;
667     WORD32 num_8x8_blks_x = (u1_curr_ctb_wdt >> 3);
668     WORD32 num_8x8_blks_y = (u1_curr_ctb_hgt >> 3);
669 
670     ihevce_ed_blk_t *ps_ed_blk_l1 = ps_ed_l1_ctb;
671     ihevce_ed_blk_t *ps_ed_blk_l2 = ps_ed_l2_ctb;
672 
673     WORD32 i;
674     WORD32 cand_mode_list[3];
675     //cu_analyse_t *ps_curr_cu = ps_row_cu;
676     WORD32 blk_cnt = 0;
677     WORD32 j = 0;
678     WORD32 merge_32x32_l1, merge_32x32_l2;
679 
680     WORD32 i4_skip_intra_eval_32x32_l1;
681     //EIID: flag indicating number of 16x16 blocks to be skipped for intra evaluation within 32x32 block
682 
683     WORD32 parent_cost = 0;
684     WORD32 child_cost[4] = { 0 };
685     WORD32 child_cost_least = 0;
686     WORD32 child_satd[4] = { 0 };
687     WORD32 x, y, size;
688     WORD32 merge_64x64 = 1;
689     UWORD8 au1_best_32x32_modes[4];
690     WORD32 au4_best_32x32_cost[4];
691     WORD32 parent_best_mode;
692     UWORD8 best_mode;
693 
694     WORD32 i4_quality_preset = ps_ctxt->i4_quality_preset;
695     /* flag to control 1CU-4TU modes based on quality preset                */
696     /* if set 1CU-4TU are explicity evaluated else 1CU-1TU modes are copied */
697     WORD32 i4_enable_1cu_4tu = (i4_quality_preset == IHEVCE_QUALITY_P2) ||
698                                (i4_quality_preset == IHEVCE_QUALITY_P0);
699 
700     /* flag to control 4CU-16TU mode based on quality preset                */
701     /* if set 4CU-16TU are explicity evaluated else 4CU-4TU modes are copied*/
702     WORD32 i4_enable_4cu_16tu = (i4_quality_preset == IHEVCE_QUALITY_P2) ||
703                                 (i4_quality_preset == IHEVCE_QUALITY_P0);
704 
705     WORD32 i4_mod_factor_num, i4_mod_factor_den = QP_MOD_FACTOR_DEN;  //2;
706     float f_strength;
707     /* Accumalte satd */
708     LWORD64 i8_frame_acc_satd_cost = 0, i8_frame_acc_satd_by_modqp_q10 = 0;
709     WORD32 i4_ctb_acc_satd = 0;
710 
711     /* Accumalate Mode bits cost */
712     LWORD64 i8_frame_acc_mode_bits_cost = 0;
713 
714     /* Step2 is bypassed for parent, uses children modes*/
715     WORD32 step2_bypass = 1;
716 
717     if(1 == ps_ctxt->u1_disable_child_cu_decide)
718         step2_bypass = 0;
719 
720     ps_cu_node->ps_parent = ps_ctxt->ps_ipe_cu_tree;
721     for(i = 0; i < 4; i++)
722     {
723         ps_cu_node->ps_sub_cu[i] = ps_ctxt->ps_ipe_cu_tree + 1 + i;
724     }
725 
726     /* Loop for all 8x8 block in a CTB */
727     ps_ctb_out->u4_cu_split_flags = 0x1;
728 
729     /* Initialize intra 64x64, 32x32 and 16x16 costs to max value */
730     for(i = 0; i < (MAX_CU_IN_CTB >> 4); i++)
731     {
732         ps_l0_ipe_out_ctb->ai4_best32x32_intra_cost[i] = MAX_INTRA_COST_IPE;
733     }
734 
735     for(i = 0; i < (MAX_CU_IN_CTB >> 2); i++)
736     {
737         ps_l0_ipe_out_ctb->ai4_best16x16_intra_cost[i] = MAX_INTRA_COST_IPE;
738     }
739 
740     for(i = 0; i < (MAX_CU_IN_CTB); i++)
741     {
742         ps_l0_ipe_out_ctb->ai4_best8x8_intra_cost[i] = MAX_INTRA_COST_IPE;
743     }
744 
745     ps_l0_ipe_out_ctb->i4_best64x64_intra_cost = MAX_INTRA_COST_IPE;
746 
747     /* by default 64x64 modes are set to default values DC and Planar */
748     ps_l0_ipe_out_ctb->au1_best_modes_32x32_tu[0] = 0;
749     ps_l0_ipe_out_ctb->au1_best_modes_32x32_tu[1] = 1;
750     ps_l0_ipe_out_ctb->au1_best_modes_32x32_tu[2] = 255;
751 
752     /* by default 64x4 split is set to 1 */
753     ps_l0_ipe_out_ctb->u1_split_flag = 1;
754 
755     /* Modulation factor calculated based on spatial variance instead of hardcoded val*/
756     i4_mod_factor_num = ps_ctxt->ai4_mod_factor_derived_by_variance[1];  //16;
757 
758     f_strength = ps_ctxt->f_strength;
759 
760     /* ------------------------------------------------ */
761     /* populate the early decisions done by L1 analysis */
762     /* ------------------------------------------------ */
763     {
764         ihevce_ed_blk_t *ps_ed_blk_l1_curr = ps_ed_l1_ctb;
765         WORD32 ctr_8x8;
766         WORD8 *pi1_ed_buf;
767 
768         /* set all the decisions to invalid */
769         memset(
770             &ps_l0_ipe_out_ctb->ai1_early_intra_inter_decision[0],
771             0,
772             sizeof(UWORD8) * MAX_CU_IN_CTB);
773 
774         pi1_ed_buf = &ps_l0_ipe_out_ctb->ai1_early_intra_inter_decision[0];
775 
776         for(ctr_8x8 = 0; ctr_8x8 < MAX_CTB_SIZE; ctr_8x8++)
777         {
778             WORD32 pos_x_8x8, pos_y_8x8;
779 
780             pos_x_8x8 = gau1_cu_pos_x[ctr_8x8];
781             pos_y_8x8 = gau1_cu_pos_y[ctr_8x8];
782 
783             pi1_ed_buf[pos_x_8x8 + (pos_y_8x8 * MAX_CU_IN_CTB_ROW)] =
784                 ps_ed_blk_l1_curr->intra_or_inter;
785             ps_ed_blk_l1_curr++;
786         }
787 
788         for(ctr_8x8 = 0; ctr_8x8 < (MAX_CU_IN_CTB >> 2); ctr_8x8++)
789         {
790             ps_l0_ipe_out_ctb->ai4_best_sad_8x8_l1_ipe[ctr_8x8] =
791                 ps_ed_ctb_l1->i4_best_sad_8x8_l1_ipe[ctr_8x8];
792 
793             ps_l0_ipe_out_ctb->ai4_best_sad_cost_8x8_l1_ipe[ctr_8x8] =
794                 ps_ed_ctb_l1->i4_best_sad_cost_8x8_l1_ipe[ctr_8x8];
795 
796             /*Earlier only me sad was getting populated, now best of ipe and me is populated*/
797             ps_l0_ipe_out_ctb->ai4_best_sad_8x8_l1_me[ctr_8x8] =
798                 ps_ed_ctb_l1->i4_best_sad_8x8_l1_me[ctr_8x8];
799             //ps_ed_ctb_l1->i4_sad_me_for_ref[ctr_8x8];
800 
801             ps_l0_ipe_out_ctb->ai4_best_sad_cost_8x8_l1_me[ctr_8x8] =
802                 ps_ed_ctb_l1->i4_best_sad_cost_8x8_l1_me[ctr_8x8];
803             //ps_ed_ctb_l1->i4_sad_cost_me_for_ref[ctr_8x8];
804         }
805 
806         /*Init CTB level accumalated SATD and MPM bits */
807         ps_l0_ipe_out_ctb->i4_ctb_acc_satd = 0;
808         ps_l0_ipe_out_ctb->i4_ctb_acc_mpm_bits = 0;
809     }
810 
811     /* ------------------------------------------------ */
812     /* Loop over all the blocks in current CTB          */
813     /* ------------------------------------------------ */
814 
815     {
816         /* 64 8x8 blocks should be encountered for the do,while loop to exit */
817         do
818         {
819             intra32_analyse_t *ps_intra32_analyse;
820             intra16_analyse_t *ps_intra16_analyse;
821             WORD32 *pi4_intra_32_cost;
822             WORD32 *pi4_intra_16_cost;
823             WORD32 *pi4_intra_8_cost;
824             WORD32 merge_16x16_l1;
825 
826             /* Given the blk_cnt, get the CU's top-left 8x8 block's x and y positions within the CTB */
827             cu_pos_x = gau1_cu_pos_x[blk_cnt];
828             cu_pos_y = gau1_cu_pos_y[blk_cnt];
829 
830             /* default value for 32x32 best mode - blk_cnt increases by 16 for each 32x32 */
831             au1_best_32x32_modes[blk_cnt >> 4] = 255;
832 
833             /* get the corresponding intra 32 analyse pointer  use (blk_cnt / 16) */
834             /* blk cnt is in terms of 8x8 units so a 32x32 will have 16 8x8 units */
835             ps_intra32_analyse = &ps_l0_ipe_out_ctb->as_intra32_analyse[blk_cnt >> 4];
836 
837             /* get the corresponding intra 16 analyse pointer: use ((blk_cnt & 0xF) / 4) */
838             /* blk cnt is in terms of 8x8 units so a 16x16 will have 4 8x8 units */
839             ps_intra16_analyse = &ps_intra32_analyse->as_intra16_analyse[(blk_cnt & 0xF) >> 2];
840 
841             /* Line below assumes min_cu_size of 8 - checks whether CU starts are within picture */
842             if((cu_pos_x < num_8x8_blks_x) && (cu_pos_y < num_8x8_blks_y))
843             {
844                 /* Reset to zero for every cu decision */
845                 merge_32x32_l1 = 0;
846 
847                 child_cost_least = 0;
848 
849                 /* At L2, each 4x4 corresponds to 16x16 at L0. Every 4 16x16 stores a merge_success flag */
850                 ps_ed_blk_l2 = ps_ed_l2_ctb + (blk_cnt >> 2);
851 
852                 pi4_intra_32_cost = &ps_l0_ipe_out_ctb->ai4_best32x32_intra_cost[blk_cnt >> 4];
853 
854                 /* by default 32x32 modes are set to default values DC and Planar */
855                 ps_intra32_analyse->au1_best_modes_32x32_tu[0] = 0;
856                 ps_intra32_analyse->au1_best_modes_32x32_tu[1] = 1;
857                 ps_intra32_analyse->au1_best_modes_32x32_tu[2] = 255;
858 
859                 /* By default 32x32 split is set to 1 */
860                 ps_intra32_analyse->b1_split_flag = 1;
861 
862                 ps_intra32_analyse->au1_best_modes_16x16_tu[0] = 0;
863                 ps_intra32_analyse->au1_best_modes_16x16_tu[1] = 1;
864                 ps_intra32_analyse->au1_best_modes_16x16_tu[2] = 255;
865 
866                 /* 16x16 cost & 8x8 cost are stored in Raster scan order */
867                 /* stride of 16x16 buffer is MAX_CU_IN_CTB_ROW >> 1      */
868                 /* stride of 8x8 buffer is MAX_CU_IN_CTB_ROW             */
869                 {
870                     WORD32 pos_x_8x8, pos_y_8x8;
871 
872                     pos_x_8x8 = gau1_cu_pos_x[blk_cnt];
873                     pos_y_8x8 = gau1_cu_pos_y[blk_cnt];
874 
875                     pi4_intra_16_cost = &ps_l0_ipe_out_ctb->ai4_best16x16_intra_cost[0];
876 
877                     pi4_intra_16_cost +=
878                         ((pos_x_8x8 >> 1) + ((pos_y_8x8 >> 1) * (MAX_CU_IN_CTB_ROW >> 1)));
879 
880                     pi4_intra_8_cost = &ps_l0_ipe_out_ctb->ai4_best8x8_intra_cost[0];
881 
882                     pi4_intra_8_cost += (pos_x_8x8 + (pos_y_8x8 * MAX_CU_IN_CTB_ROW));
883                 }
884 
885                 merge_32x32_l1 = 0;
886                 merge_32x32_l2 = 0;
887                 i4_skip_intra_eval_32x32_l1 = 0;
888 
889                 /* Enable 16x16 merge iff sufficient 8x8 blocks remain in the current CTB */
890                 merge_16x16_l1 = 0;
891                 if(((num_8x8_blks_x - cu_pos_x) >= 2) && ((num_8x8_blks_y - cu_pos_y) >= 2))
892                 {
893 #if !ENABLE_UNIFORM_CU_SIZE_8x8
894                     merge_16x16_l1 = ps_ed_blk_l1->merge_success;
895 #else
896                     merge_16x16_l1 = 0;
897 #endif
898                 }
899 
900                 /* Enable 32x32 merge iff sufficient 8x8 blocks remain in the current CTB */
901                 if(((num_8x8_blks_x - cu_pos_x) >= 4) && ((num_8x8_blks_y - cu_pos_y) >= 4))
902                 {
903                     /* Check 4 flags of L1(8x8) say merge */
904                     for(i = 0; i < 4; i++)
905                     {
906                         merge_32x32_l1 += (ps_ed_blk_l1 + (i * 4))->merge_success;
907 
908                         //EIDD: num 16x16 blocks for which inter_intra flag says eval only inter, i.e. skip intra eval
909                         i4_skip_intra_eval_32x32_l1 +=
910                             ((ps_ed_blk_l1 + (i * 4))->intra_or_inter == 2) ? 1 : 0;
911                     }
912 
913 #if !ENABLE_UNIFORM_CU_SIZE_8x8
914                     /* Check 1 flag from L2(16x16) say merge */
915                     merge_32x32_l2 = ps_ed_blk_l2->merge_success;
916 #else
917                     merge_32x32_l1 = 0;
918                     merge_32x32_l2 = 0;
919 #endif
920                 }
921 
922 #if DISABLE_L2_IPE_IN_PB_L1_IN_B
923                 if((i4_quality_preset == IHEVCE_QUALITY_P6) && (ps_ctxt->i4_slice_type != ISLICE))
924                 {
925                     merge_32x32_l2 = 0;
926                     ps_ed_blk_l2->merge_success = 0;
927                 }
928 #endif
929 
930                 ps_intra32_analyse->b1_valid_cu = 1;
931 
932                 /* If Merge success from all 4 L1 and L2, max CU size 32x32 is chosen */
933                 /* EIID: if all blocks to be skipped then skip entire 32x32 for intra eval,
934                 if no blocks to be skipped then eval entire 32x32,
935                 else break the merge and go to 16x16 level eval */
936                 if((merge_32x32_l1 == 4) && merge_32x32_l2 &&
937                    ((i4_skip_intra_eval_32x32_l1 == 0) ||
938                     (i4_skip_intra_eval_32x32_l1 == 4))  //comment this line to disable break-merge
939                 )
940                 {
941 #if IP_DBG_L1_l2
942                     /* Populate params for 32x32 block analysis */
943                     ps_cu_node->ps_parent->best_cost = MAX_INTRA_COST_IPE;
944 
945                     ps_cu_node->ps_parent->u1_cu_size = 32;
946                     ps_cu_node->ps_parent->u2_x0 = gau1_cu_pos_x[blk_cnt]; /* Populate properly */
947                     ps_cu_node->ps_parent->u2_y0 = gau1_cu_pos_y[blk_cnt]; /* Populate properly */
948                     ps_cu_node->ps_parent->best_mode = ps_ed_blk_l2->best_merge_mode;
949                     /* CU size 32x32 and fill the final cu params */
950 
951                     ihevce_update_cand_list(ps_cu_node->ps_parent, ps_ed_blk_l1, ps_ctxt);
952 
953                     /* Increment pointers */
954                     ps_ed_blk_l1 += 16;
955                     blk_cnt += 16;
956                     ps_row_cu++;
957                     merge_64x64 &= 1;
958 #else
959 
960                     /* EIID: dont evaluate if all 4 blocks at L1 said inter is winning*/
961                     if(4 == i4_skip_intra_eval_32x32_l1 && (ps_ctxt->i4_slice_type != ISLICE))
962                     {
963                         WORD32 i4_local_ctr1, i4_local_ctr2;
964 
965                         ps_cu_node->ps_parent->best_cost = MAX_INTRA_COST_IPE;
966 
967                         ps_cu_node->ps_parent->u1_cu_size = 32;
968                         ps_cu_node->ps_parent->u2_x0 =
969                             gau1_cu_pos_x[blk_cnt]; /* Populate properly */
970                         ps_cu_node->ps_parent->u2_y0 =
971                             gau1_cu_pos_y[blk_cnt]; /* Populate properly */
972                         ps_cu_node->ps_parent->best_mode =
973                             INTRA_DC;  //ps_ed_blk_l2->best_merge_mode;
974                         /* CU size 32x32 and fill the final cu params */
975 
976                         /* fill in the first modes as invalid */
977                         ps_cu_node->ps_parent->au1_best_mode_1tu[0] = INTRA_DC;
978                         ps_cu_node->ps_parent->au1_best_mode_1tu[1] =
979                             INTRA_DC;  //for safery. Since update_cand_list will set num_modes as 3
980                         ps_cu_node->ps_parent->au1_best_mode_1tu[2] = INTRA_DC;
981 
982                         ps_cu_node->ps_parent->au1_best_mode_4tu[0] = INTRA_DC;
983                         ps_cu_node->ps_parent->au1_best_mode_4tu[1] = INTRA_DC;
984                         ps_cu_node->ps_parent->au1_best_mode_4tu[2] = INTRA_DC;
985 
986                         ihevce_update_cand_list(ps_cu_node->ps_parent, ps_ed_blk_l1, ps_ctxt);
987 
988                         //ps_row_cu->s_cu_intra_cand.b6_num_intra_cands = 0;
989                         //ps_row_cu->u1_num_intra_rdopt_cands = 0;
990 
991                         ps_intra32_analyse->b1_valid_cu = 0;
992                         ps_intra32_analyse->b1_split_flag = 0;
993                         ps_intra32_analyse->b1_merge_flag = 0;
994                         /*memset (&ps_intra32_analyse->au1_best_modes_32x32_tu,
995                         255,
996                         NUM_BEST_MODES);
997                         memset (&ps_intra32_analyse->au1_best_modes_16x16_tu,
998                         255,
999                         NUM_BEST_MODES);*/
1000                         // set only the first mode, since if it's 255 it won't go ahead
1001                         ps_intra32_analyse->au1_best_modes_32x32_tu[0] = 255;
1002                         ps_intra32_analyse->au1_best_modes_16x16_tu[0] = 255;
1003                         ps_intra32_analyse->i4_best_intra_cost = MAX_INTRA_COST_IPE;
1004 
1005                         *pi4_intra_32_cost = MAX_INTRA_COST_IPE;
1006 
1007                         /*since ME will start evaluating from bottom up, set the lower
1008                         cu size data invalid */
1009                         for(i4_local_ctr1 = 0; i4_local_ctr1 < 4; i4_local_ctr1++)
1010                         {
1011                             WORD32 *pi4_intra_8_cost_curr16;
1012 
1013                             ps_intra32_analyse->as_intra16_analyse[i4_local_ctr1]
1014                                 .au1_best_modes_16x16_tu[0] = 255;
1015                             ps_intra32_analyse->as_intra16_analyse[i4_local_ctr1]
1016                                 .au1_best_modes_8x8_tu[0] = 255;
1017                             ps_intra32_analyse->as_intra16_analyse[i4_local_ctr1]
1018                                 .i4_best_intra_cost = MAX_INTRA_COST_IPE;
1019                             ps_intra32_analyse->as_intra16_analyse[i4_local_ctr1].b1_merge_flag = 0;
1020                             ps_intra32_analyse->as_intra16_analyse[i4_local_ctr1].b1_valid_cu = 0;
1021                             ps_intra32_analyse->as_intra16_analyse[i4_local_ctr1].b1_split_flag = 0;
1022 
1023                             pi4_intra_16_cost
1024                                 [(i4_local_ctr1 & 1) + ((MAX_CU_IN_CTB_ROW >> 1) *
1025                                                         (i4_local_ctr1 >> 1))] = MAX_INTRA_COST_IPE;
1026 
1027                             pi4_intra_8_cost_curr16 = pi4_intra_8_cost + ((i4_local_ctr1 & 1) << 1);
1028                             pi4_intra_8_cost_curr16 +=
1029                                 ((i4_local_ctr1 >> 1) << 1) * MAX_CU_IN_CTB_ROW;
1030 
1031                             for(i4_local_ctr2 = 0; i4_local_ctr2 < 4; i4_local_ctr2++)
1032                             {
1033                                 ps_intra32_analyse->as_intra16_analyse[i4_local_ctr1]
1034                                     .as_intra8_analyse[i4_local_ctr2]
1035                                     .au1_4x4_best_modes[0][0] = 255;
1036                                 ps_intra32_analyse->as_intra16_analyse[i4_local_ctr1]
1037                                     .as_intra8_analyse[i4_local_ctr2]
1038                                     .au1_4x4_best_modes[1][0] = 255;
1039                                 ps_intra32_analyse->as_intra16_analyse[i4_local_ctr1]
1040                                     .as_intra8_analyse[i4_local_ctr2]
1041                                     .au1_4x4_best_modes[2][0] = 255;
1042                                 ps_intra32_analyse->as_intra16_analyse[i4_local_ctr1]
1043                                     .as_intra8_analyse[i4_local_ctr2]
1044                                     .au1_4x4_best_modes[3][0] = 255;
1045                                 ps_intra32_analyse->as_intra16_analyse[i4_local_ctr1]
1046                                     .as_intra8_analyse[i4_local_ctr2]
1047                                     .au1_best_modes_8x8_tu[0] = 255;
1048                                 ps_intra32_analyse->as_intra16_analyse[i4_local_ctr1]
1049                                     .as_intra8_analyse[i4_local_ctr2]
1050                                     .au1_best_modes_4x4_tu[0] = 255;
1051                                 ps_intra32_analyse->as_intra16_analyse[i4_local_ctr1]
1052                                     .as_intra8_analyse[i4_local_ctr2]
1053                                     .i4_best_intra_cost = MAX_INTRA_COST_IPE;
1054                                 ps_intra32_analyse->as_intra16_analyse[i4_local_ctr1]
1055                                     .as_intra8_analyse[i4_local_ctr2]
1056                                     .b1_valid_cu = 0;
1057 
1058                                 pi4_intra_8_cost_curr16
1059                                     [(i4_local_ctr2 & 1) +
1060                                      (MAX_CU_IN_CTB_ROW * (i4_local_ctr2 >> 1))] =
1061                                         MAX_INTRA_COST_IPE;
1062                             }
1063                         }
1064 
1065                         /* set neighbours even if intra is not evaluated, since source is always available. */
1066                         ihevce_set_nbr_map(
1067                             ps_ctxt->pu1_ctb_nbr_map,
1068                             ps_ctxt->i4_nbr_map_strd,
1069                             ps_cu_node->ps_parent->u2_x0 << 1,
1070                             ps_cu_node->ps_parent->u2_y0 << 1,
1071                             (ps_cu_node->ps_parent->u1_cu_size >> 2),
1072                             1);
1073 
1074                         /* cost accumulation of best cu size candidate */
1075                         /*i8_frame_acc_satd_cost += parent_cost;*/
1076 
1077                         /* Mode bits cost accumulation for best cu size and cu mode */
1078                         /*i8_frame_acc_mode_bits_cost += ps_cu_node->ps_parent->u2_mode_bits_cost;*/
1079 
1080                         /*satd/mod_qp accumulation of best cu */
1081                         /*i8_frame_acc_satd_by_modqp_q10 += ((LWORD64)ps_cu_node->ps_parent->best_satd << (SATD_BY_ACT_Q_FAC + QSCALE_Q_FAC_3))/i4_q_scale_q3_mod;*/
1082 
1083                         /* Increment pointers */
1084                         ps_ed_blk_l1 += 16;
1085                         blk_cnt += 16;
1086                         //ps_row_cu++;
1087                         merge_64x64 = 0;
1088 
1089                         /* increment for stat purpose only. Increment is valid only on single thread */
1090                         ps_ctxt->u4_num_16x16_skips_at_L0_IPE += 4;
1091                     }
1092                     else
1093                     {
1094                         /* Re-evaluation of 4 16x16 blocks at 8x8 prediction level */
1095                         //memcpy(ps_ctxt->ai1_ctb_mode_map_temp, ps_ctxt->ai1_ctb_mode_map, sizeof(ps_ctxt->ai1_ctb_mode_map));
1096 
1097                         if((ps_ctxt->i4_quality_preset == IHEVCE_QUALITY_P6) &&
1098                            (ps_ctxt->i4_slice_type == PSLICE))
1099                         {
1100                             ps_ctxt->u1_disable_child_cu_decide = 1;
1101                             step2_bypass = 0;
1102                         }
1103 
1104                         /* Based on the flag, Child modes decision can be disabled*/
1105                         if(0 == ps_ctxt->u1_disable_child_cu_decide)
1106                         {
1107                             for(j = 0; j < 4; j++)
1108                             {
1109                                 ps_cu_node->ps_sub_cu[j]->u2_x0 =
1110                                     gau1_cu_pos_x[blk_cnt + (j * 4)]; /* Populate properly */
1111                                 ps_cu_node->ps_sub_cu[j]->u2_y0 =
1112                                     gau1_cu_pos_y[blk_cnt + (j * 4)]; /* Populate properly */
1113                                 ps_cu_node->ps_sub_cu[j]->u1_cu_size = 16;
1114 
1115                                 {
1116                                     WORD32 best_ang_mode =
1117                                         (ps_ed_blk_l1 + (j * 4))->best_merge_mode;
1118 
1119                                     if(best_ang_mode < 2)
1120                                         best_ang_mode = 26;
1121 
1122                                     ihevce_mode_eval_filtering(
1123                                         ps_cu_node->ps_sub_cu[j],
1124                                         ps_cu_node,
1125                                         ps_ctxt,
1126                                         ps_curr_src,
1127                                         best_ang_mode,
1128                                         &ps_cu_node->ps_sub_cu[j]->au4_best_cost_1tu[0],
1129                                         &ps_cu_node->ps_sub_cu[j]->au1_best_mode_1tu[0],
1130                                         !step2_bypass,
1131                                         1);
1132 
1133                                     if(i4_enable_4cu_16tu)
1134                                     {
1135                                         ihevce_mode_eval_filtering(
1136                                             ps_cu_node->ps_sub_cu[j],
1137                                             ps_cu_node,
1138                                             ps_ctxt,
1139                                             ps_curr_src,
1140                                             best_ang_mode,
1141                                             &ps_cu_node->ps_sub_cu[j]->au4_best_cost_4tu[0],
1142                                             &ps_cu_node->ps_sub_cu[j]->au1_best_mode_4tu[0],
1143                                             !step2_bypass,
1144                                             0);
1145                                     }
1146                                     else
1147                                     {
1148                                         /* 4TU not evaluated :  4tu modes set same as 1tu modes */
1149                                         memcpy(
1150                                             &ps_cu_node->ps_sub_cu[j]->au1_best_mode_4tu[0],
1151                                             &ps_cu_node->ps_sub_cu[j]->au1_best_mode_1tu[0],
1152                                             NUM_BEST_MODES);
1153 
1154                                         /* 4TU not evaluated : currently 4tu cost set same as 1tu cost */
1155                                         memcpy(
1156                                             &ps_cu_node->ps_sub_cu[j]->au4_best_cost_4tu[0],
1157                                             &ps_cu_node->ps_sub_cu[j]->au4_best_cost_1tu[0],
1158                                             NUM_BEST_MODES * sizeof(WORD32));
1159                                     }
1160 
1161                                     child_cost[j] =
1162                                         MIN(ps_cu_node->ps_sub_cu[j]->au4_best_cost_4tu[0],
1163                                             ps_cu_node->ps_sub_cu[j]->au4_best_cost_1tu[0]);
1164 
1165                                     /* Child cost is sum of costs at 16x16 level  */
1166                                     child_cost_least += child_cost[j];
1167 
1168                                     /* Select the best mode to be populated as top and left nbr depending on the
1169                                     4tu and 1tu cost */
1170                                     if(ps_cu_node->ps_sub_cu[j]->au4_best_cost_4tu[0] >
1171                                        ps_cu_node->ps_sub_cu[j]->au4_best_cost_1tu[0])
1172                                     {
1173                                         ps_cu_node->ps_sub_cu[j]->best_mode =
1174                                             ps_cu_node->ps_sub_cu[j]->au1_best_mode_1tu[0];
1175                                     }
1176                                     else
1177                                     {
1178                                         ps_cu_node->ps_sub_cu[j]->best_mode =
1179                                             ps_cu_node->ps_sub_cu[j]->au1_best_mode_4tu[0];
1180                                     }
1181 
1182                                     { /* Update the CTB nodes only for MAX - 1 CU nodes */
1183                                         WORD32 xA, yA, row, col;
1184                                         xA = ((ps_cu_node->ps_sub_cu[j]->u2_x0 << 3) >> 2) + 1;
1185                                         yA = ((ps_cu_node->ps_sub_cu[j]->u2_y0 << 3) >> 2) + 1;
1186                                         size = ps_cu_node->ps_sub_cu[j]->u1_cu_size >> 2;
1187                                         for(row = yA; row < (yA + size); row++)
1188                                         {
1189                                             for(col = xA; col < (xA + size); col++)
1190                                             {
1191                                                 ps_ctxt->au1_ctb_mode_map[row][col] =
1192                                                     ps_cu_node->ps_sub_cu[j]->best_mode;
1193                                             }
1194                                         }
1195                                     }
1196                                 }
1197 
1198                                 /*Child SATD cost*/
1199                                 child_satd[j] = ps_cu_node->ps_sub_cu[j]->best_satd;
1200 
1201                                 /* store the child 16x16 costs */
1202                                 pi4_intra_16_cost[(j & 1) + ((MAX_CU_IN_CTB_ROW >> 1) * (j >> 1))] =
1203                                     child_cost[j];
1204 
1205                                 /* set the CU valid flag */
1206                                 ps_intra16_analyse[j].b1_valid_cu = 1;
1207 
1208                                 /* All 16x16 merge is valid, if Cu 32x32 is chosen */
1209                                 /* To be reset, if CU 64x64 is chosen */
1210                                 ps_intra16_analyse[j].b1_merge_flag = 1;
1211 
1212                                 /* storing the modes to intra 16 analyse */
1213                                 /* store the best 16x16 modes 8x8 tu */
1214                                 memcpy(
1215                                     &ps_intra16_analyse[j].au1_best_modes_8x8_tu[0],
1216                                     &ps_cu_node->ps_sub_cu[j]->au1_best_mode_4tu[0],
1217                                     sizeof(UWORD8) * (NUM_BEST_MODES));
1218                                 ps_intra16_analyse[j].au1_best_modes_8x8_tu[NUM_BEST_MODES] = 255;
1219 
1220                                 /* store the best 16x16 modes 16x16 tu */
1221                                 memcpy(
1222                                     &ps_intra16_analyse[j].au1_best_modes_16x16_tu[0],
1223                                     &ps_cu_node->ps_sub_cu[j]->au1_best_mode_1tu[0],
1224                                     sizeof(UWORD8) * (NUM_BEST_MODES));
1225                                 ps_intra16_analyse[j].au1_best_modes_16x16_tu[NUM_BEST_MODES] = 255;
1226 
1227                                 /* divide the 16x16 costs (pro rating) to 4 8x8 costs */
1228                                 /* store the same 16x16 modes as 4 8x8 child modes    */
1229                                 {
1230                                     WORD32 idx_8x8;
1231                                     WORD32 *pi4_intra_8_cost_curr16;
1232                                     intra8_analyse_t *ps_intra8_analyse;
1233 
1234                                     pi4_intra_8_cost_curr16 = pi4_intra_8_cost + ((j & 1) << 1);
1235                                     pi4_intra_8_cost_curr16 += ((j >> 1) << 1) * MAX_CU_IN_CTB_ROW;
1236 
1237                                     for(idx_8x8 = 0; idx_8x8 < 4; idx_8x8++)
1238                                     {
1239                                         pi4_intra_8_cost_curr16
1240                                             [(idx_8x8 & 1) + (MAX_CU_IN_CTB_ROW * (idx_8x8 >> 1))] =
1241                                                 (child_cost[j] + 3) >> 2;
1242 
1243                                         ps_intra8_analyse =
1244                                             &ps_intra16_analyse[j].as_intra8_analyse[idx_8x8];
1245 
1246                                         ps_intra8_analyse->b1_enable_nxn = 0;
1247                                         ps_intra8_analyse->b1_valid_cu = 1;
1248 
1249                                         /* store the best 8x8 modes 8x8 tu */
1250                                         memcpy(
1251                                             &ps_intra8_analyse->au1_best_modes_8x8_tu[0],
1252                                             &ps_intra16_analyse[j].au1_best_modes_8x8_tu[0],
1253                                             sizeof(UWORD8) * (NUM_BEST_MODES + 1));
1254 
1255                                         /* store the best 8x8 modes 4x4 tu */
1256                                         memcpy(
1257                                             &ps_intra8_analyse->au1_best_modes_4x4_tu[0],
1258                                             &ps_intra16_analyse[j].au1_best_modes_8x8_tu[0],
1259                                             sizeof(UWORD8) * (NUM_BEST_MODES + 1));
1260 
1261                                         /* NXN modes not evaluated hence set to 255 */
1262                                         memset(
1263                                             &ps_intra8_analyse->au1_4x4_best_modes[0][0],
1264                                             255,
1265                                             sizeof(UWORD8) * 4 * (NUM_BEST_MODES + 1));
1266                                     }
1267                                 }
1268                             }
1269 
1270                             ihevce_set_nbr_map(
1271                                 ps_ctxt->pu1_ctb_nbr_map,
1272                                 ps_ctxt->i4_nbr_map_strd,
1273                                 ps_cu_node->ps_sub_cu[0]->u2_x0 << 1,
1274                                 ps_cu_node->ps_sub_cu[0]->u2_y0 << 1,
1275                                 (ps_cu_node->ps_sub_cu[0]->u1_cu_size >> 1),
1276                                 0);
1277                         }
1278 #if 1  //DISBLE_CHILD_CU_EVAL_L0_IPE //1
1279                         else
1280                         {
1281                             for(j = 0; j < 4; j++)
1282                             {
1283                                 WORD32 idx_8x8;
1284                                 intra8_analyse_t *ps_intra8_analyse;
1285                                 ps_intra16_analyse[j].au1_best_modes_8x8_tu[0] = 255;
1286                                 ps_intra16_analyse[j].au1_best_modes_16x16_tu[0] = 255;
1287 
1288                                 ps_intra16_analyse[j].b1_valid_cu = 0;
1289 
1290                                 for(idx_8x8 = 0; idx_8x8 < 4; idx_8x8++)
1291                                 {
1292                                     ps_intra8_analyse =
1293                                         &ps_intra16_analyse[j].as_intra8_analyse[idx_8x8];
1294 
1295                                     ps_intra8_analyse->au1_best_modes_8x8_tu[0] = 255;
1296                                     ps_intra8_analyse->au1_best_modes_4x4_tu[0] = 255;
1297 
1298                                     ps_intra8_analyse->b1_enable_nxn = 0;
1299                                     ps_intra8_analyse->b1_valid_cu = 0;
1300 
1301                                     /* NXN modes not evaluated hence set to 255 */
1302                                     memset(
1303                                         &ps_intra8_analyse->au1_4x4_best_modes[0][0],
1304                                         255,
1305                                         sizeof(UWORD8) * 4 * (NUM_BEST_MODES + 1));
1306                                 }
1307                             }
1308 
1309                             child_cost_least = MAX_INTRA_COST_IPE;
1310                         }
1311 #endif
1312 
1313                         /* Populate params for 32x32 block analysis */
1314 
1315                         ps_cu_node->ps_parent->u1_cu_size = 32;
1316                         ps_cu_node->ps_parent->u2_x0 =
1317                             gau1_cu_pos_x[blk_cnt]; /* Populate properly */
1318                         ps_cu_node->ps_parent->u2_y0 =
1319                             gau1_cu_pos_y[blk_cnt]; /* Populate properly */
1320 
1321                         /* Re-evaluation for 32x32 parent block at 16x16 prediction level */
1322                         //memcpy(ps_ctxt->ai1_ctb_mode_map_temp, ps_ctxt->ai1_ctb_mode_map, sizeof(ps_ctxt->ai1_ctb_mode_map));
1323 
1324                         {
1325                             /* Eval for TUSize = CuSize */
1326                             ihevce_mode_eval_filtering(
1327                                 ps_cu_node->ps_parent,
1328                                 ps_cu_node,
1329                                 ps_ctxt,
1330                                 ps_curr_src,
1331                                 26,
1332                                 &ps_cu_node->ps_parent->au4_best_cost_1tu[0],
1333                                 &ps_cu_node->ps_parent->au1_best_mode_1tu[0],
1334                                 step2_bypass,
1335                                 1);
1336 
1337                             if(i4_enable_1cu_4tu)
1338                             {
1339                                 /* Eval for TUSize = CuSize/2 */
1340                                 ihevce_mode_eval_filtering(
1341                                     ps_cu_node->ps_parent,
1342                                     ps_cu_node,
1343                                     ps_ctxt,
1344                                     ps_curr_src,
1345                                     26,
1346                                     &ps_cu_node->ps_parent->au4_best_cost_4tu[0],
1347                                     &ps_cu_node->ps_parent->au1_best_mode_4tu[0],
1348                                     step2_bypass,
1349                                     0);
1350                             }
1351                             else
1352                             {
1353                                 /* 4TU not evaluated :  4tu modes set same as 1tu modes */
1354                                 memcpy(
1355                                     &ps_cu_node->ps_parent->au1_best_mode_4tu[0],
1356                                     &ps_cu_node->ps_parent->au1_best_mode_1tu[0],
1357                                     NUM_BEST_MODES);
1358 
1359                                 /* 4TU not evaluated : currently 4tu cost set same as 1tu cost */
1360                                 memcpy(
1361                                     &ps_cu_node->ps_parent->au4_best_cost_4tu[0],
1362                                     &ps_cu_node->ps_parent->au4_best_cost_1tu[0],
1363                                     NUM_BEST_MODES * sizeof(WORD32));
1364                             }
1365                         }
1366 
1367                         ps_ctxt->u1_disable_child_cu_decide = 0;
1368                         step2_bypass = 1;
1369 
1370                         /* Update parent cost */
1371                         parent_cost =
1372                             MIN(ps_cu_node->ps_parent->au4_best_cost_4tu[0],
1373                                 ps_cu_node->ps_parent->au4_best_cost_1tu[0]);
1374 
1375                         /* Select the best mode to be populated as top and left nbr depending on the
1376                         4tu and 1tu cost */
1377                         if(ps_cu_node->ps_parent->au4_best_cost_4tu[0] >
1378                            ps_cu_node->ps_parent->au4_best_cost_1tu[0])
1379                         {
1380                             ps_cu_node->ps_parent->best_mode =
1381                                 ps_cu_node->ps_parent->au1_best_mode_1tu[0];
1382                         }
1383                         else
1384                         {
1385                             ps_cu_node->ps_parent->best_mode =
1386                                 ps_cu_node->ps_parent->au1_best_mode_4tu[0];
1387                         }
1388 
1389                         /* store the 32x32 cost */
1390                         *pi4_intra_32_cost = parent_cost;
1391 
1392                         /* set the CU valid flag */
1393                         ps_intra32_analyse->b1_valid_cu = 1;
1394 
1395                         ps_intra32_analyse->b1_merge_flag = 1;
1396 
1397                         /* storing the modes to intra 32 analyse */
1398                         {
1399                             /* store the best 32x32 modes 16x16 tu */
1400                             memcpy(
1401                                 &ps_intra32_analyse->au1_best_modes_16x16_tu[0],
1402                                 &ps_cu_node->ps_parent->au1_best_mode_4tu[0],
1403                                 sizeof(UWORD8) * (NUM_BEST_MODES));
1404                             ps_intra32_analyse->au1_best_modes_16x16_tu[NUM_BEST_MODES] = 255;
1405 
1406                             /* store the best 32x32 modes 32x32 tu */
1407                             memcpy(
1408                                 &ps_intra32_analyse->au1_best_modes_32x32_tu[0],
1409                                 &ps_cu_node->ps_parent->au1_best_mode_1tu[0],
1410                                 sizeof(UWORD8) * (NUM_BEST_MODES));
1411                             ps_intra32_analyse->au1_best_modes_32x32_tu[NUM_BEST_MODES] = 255;
1412                         }
1413                         parent_best_mode = ps_cu_node->ps_parent->best_mode;
1414                         if((parent_cost <=
1415                             child_cost_least + (ps_ctxt->i4_ol_satd_lambda * CHILD_BIAS >>
1416                                                 LAMBDA_Q_SHIFT)))  //|| identical_modes)
1417                         {
1418                             WORD32 i4_q_scale_q3_mod;
1419                             UWORD8 u1_cu_possible_qp;
1420                             WORD32 i4_act_factor;
1421 
1422                             /* CU size 32x32 and fill the final cu params */
1423 
1424                             ihevce_update_cand_list(ps_cu_node->ps_parent, ps_ed_blk_l1, ps_ctxt);
1425 
1426                             if((IHEVCE_QUALITY_P3 > i4_quality_preset))
1427                             {
1428                                 for(i = 0; i < 4; i++)
1429                                 {
1430                                     intra8_analyse_t *ps_intra8_analyse;
1431                                     ps_intra8_analyse = &ps_intra16_analyse->as_intra8_analyse[i];
1432                                     for(j = 0; j < 4; j++)
1433                                     {
1434                                         /* Populate best 3 nxn modes */
1435                                         ps_intra8_analyse->au1_4x4_best_modes[j][0] =
1436                                             ps_cu_node->ps_sub_cu[i]->au1_best_mode_4tu[0];
1437                                         ps_intra8_analyse->au1_4x4_best_modes[j][1] =
1438                                             ps_cu_node->ps_sub_cu[i]
1439                                                 ->au1_best_mode_4tu[1];  //(ps_ed + 1)->best_mode;
1440                                         ps_intra8_analyse->au1_4x4_best_modes[j][2] =
1441                                             ps_cu_node->ps_sub_cu[i]
1442                                                 ->au1_best_mode_4tu[2];  //(ps_ed + 2)->best_mode;
1443                                         ps_intra8_analyse->au1_4x4_best_modes[j][3] = 255;
1444                                     }
1445                                 }
1446                             }
1447                             /* store the 32x32 non split flag */
1448                             ps_intra32_analyse->b1_split_flag = 0;
1449                             ps_intra32_analyse->as_intra16_analyse[0].b1_split_flag = 0;
1450                             ps_intra32_analyse->as_intra16_analyse[1].b1_split_flag = 0;
1451                             ps_intra32_analyse->as_intra16_analyse[2].b1_split_flag = 0;
1452                             ps_intra32_analyse->as_intra16_analyse[3].b1_split_flag = 0;
1453 
1454                             au1_best_32x32_modes[blk_cnt >> 4] =
1455                                 ps_cu_node->ps_parent->au1_best_mode_1tu[0];
1456 
1457                             au4_best_32x32_cost[blk_cnt >> 4] =
1458                                 ps_cu_node->ps_parent->au4_best_cost_1tu[0];
1459                             /*As 32*32 has won, pick L2 8x8 qp which maps
1460                             to L0 32x32 Qp*/
1461                             ASSERT(((blk_cnt >> 4) & 3) == (blk_cnt >> 4));
1462                             ASSERT(ps_ed_ctb_l1->i4_16x16_satd[blk_cnt >> 4][0] != -2);
1463                             u1_cu_possible_qp = ihevce_cu_level_qp_mod(
1464                                 ps_ctxt->i4_qscale,
1465                                 ps_ed_ctb_l1->i4_16x16_satd[blk_cnt >> 4][0],
1466                                 ps_ctxt->ld_curr_frame_16x16_log_avg[0],
1467                                 f_strength,
1468                                 &i4_act_factor,
1469                                 &i4_q_scale_q3_mod,
1470                                 ps_ctxt->ps_rc_quant_ctxt);
1471                             /* cost accumulation of best cu size candidate */
1472                             i8_frame_acc_satd_cost += parent_cost;
1473 
1474                             /* satd and mpm bits accumulation of best cu size candidate */
1475                             i4_ctb_acc_satd += ps_cu_node->ps_parent->best_satd;
1476 
1477                             /* Mode bits cost accumulation for best cu size and cu mode */
1478                             i8_frame_acc_mode_bits_cost += ps_cu_node->ps_parent->u2_mode_bits_cost;
1479 
1480                             /*satd/mod_qp accumulation of best cu */
1481                             i8_frame_acc_satd_by_modqp_q10 +=
1482                                 ((LWORD64)ps_cu_node->ps_parent->best_satd
1483                                  << (SATD_BY_ACT_Q_FAC + QSCALE_Q_FAC_3)) /
1484                                 i4_q_scale_q3_mod;
1485 
1486                             /* Increment pointers */
1487                             ps_ed_blk_l1 += 16;
1488                             blk_cnt += 16;
1489                             //ps_row_cu++;
1490                             merge_64x64 &= 1;
1491                         }
1492                         else
1493                         {
1494                             /* store the 32x32 split flag */
1495                             ps_intra32_analyse->b1_split_flag = 1;
1496 
1497                             /* CU size 16x16 and fill the final cu params for all 4 blocks */
1498                             for(j = 0; j < 4; j++)
1499                             {
1500                                 WORD32 i4_q_scale_q3_mod;
1501                                 UWORD8 u1_cu_possible_qp;
1502                                 WORD32 i4_act_factor;
1503 
1504                                 /* Set CU split flag */
1505                                 ASSERT(blk_cnt % 4 == 0);
1506 
1507                                 ihevce_update_cand_list(
1508                                     ps_cu_node->ps_sub_cu[j], ps_ed_blk_l1, ps_ctxt);
1509 
1510                                 /* store the 16x16 non split flag  */
1511                                 ps_intra16_analyse[j].b1_split_flag = 0;
1512 
1513                                 ASSERT(((blk_cnt >> 2) & 0xF) == (blk_cnt >> 2));
1514                                 ASSERT(ps_ed_ctb_l1->i4_8x8_satd[blk_cnt >> 2][0] != -2);
1515                                 /*As 16*16 has won, pick L1 8x8 qp which maps
1516                                 to L0 16x16 Qp*/
1517                                 u1_cu_possible_qp = ihevce_cu_level_qp_mod(
1518                                     ps_ctxt->i4_qscale,
1519                                     ps_ed_ctb_l1->i4_8x8_satd[blk_cnt >> 2][0],
1520                                     ps_ctxt->ld_curr_frame_8x8_log_avg[0],
1521                                     f_strength,
1522                                     &i4_act_factor,
1523                                     &i4_q_scale_q3_mod,
1524                                     ps_ctxt->ps_rc_quant_ctxt);
1525 
1526                                 /*accum satd/qp for all child block*/
1527                                 i8_frame_acc_satd_by_modqp_q10 +=
1528                                     ((LWORD64)child_satd[j]
1529                                      << (SATD_BY_ACT_Q_FAC + QSCALE_Q_FAC_3)) /
1530                                     i4_q_scale_q3_mod;
1531 
1532                                 /* Accumulate mode bits for all child blocks */
1533                                 i8_frame_acc_mode_bits_cost +=
1534                                     ps_cu_node->ps_sub_cu[j]->u2_mode_bits_cost;
1535 
1536                                 /* satd and mpm bits accumulation of best cu size candidate */
1537                                 i4_ctb_acc_satd += child_satd[j];
1538 
1539                                 /* Increment pointers */
1540                                 //ps_row_cu++;
1541                                 ps_ed_blk_l1 += 4;
1542                                 blk_cnt += 4;
1543                             }
1544 
1545                             /* cost accumulation of best cu size candidate */
1546                             i8_frame_acc_satd_cost += child_cost_least;
1547 
1548                             /* 64x64 merge is not possible */
1549                             merge_64x64 = 0;
1550                         }
1551 
1552                         //ps_ed_blk_l2 += 4;
1553 
1554                     }  //end of EIID's else
1555 #endif
1556                 }
1557                 /* If Merge success for L1 max CU size 16x16 is chosen */
1558                 else if(merge_16x16_l1)
1559                 {
1560 #if IP_DBG_L1_l2
1561                     ps_cu_node->ps_parent->u1_cu_size = 16;
1562                     ps_cu_node->ps_parent->u2_x0 = gau1_cu_pos_x[blk_cnt]; /* Populate properly */
1563                     ps_cu_node->ps_parent->u2_y0 = gau1_cu_pos_y[blk_cnt]; /* Populate properly */
1564                     ps_cu_node->ps_parent->best_mode = ps_ed_blk_l1->best_merge_mode;
1565                     ihevce_update_cand_list(ps_cu_node->ps_parent, ps_ed_blk_l1, ps_ctxt);
1566 
1567                     blk_cnt += 4;
1568                     ps_ed_blk_l1 += 4;
1569                     ps_row_cu++;
1570                     merge_64x64 = 0;
1571 #else
1572 
1573                     /*EIID: evaluate only if L1 early-inter-intra decision is not favouring inter*/
1574                     /* enable this only in B pictures */
1575                     if(ps_ed_blk_l1->intra_or_inter == 2 && (ps_ctxt->i4_slice_type != ISLICE))
1576                     {
1577                         WORD32 i4_q_scale_q3_mod, i4_local_ctr;
1578                         WORD8 i1_cu_possible_qp;
1579                         WORD32 i4_act_factor;
1580                         /* make cost infinity. */
1581                         /* make modes invalid */
1582                         /* update loop variables */
1583                         /* set other output variables */
1584                         /* dont set neighbour flag so that next blocks wont access this cu */
1585                         /* what happens to ctb_mode_map?? */
1586 
1587                         ps_cu_node->ps_parent->u1_cu_size = 16;
1588                         ps_cu_node->ps_parent->u2_x0 =
1589                             gau1_cu_pos_x[blk_cnt]; /* Populate properly */
1590                         ps_cu_node->ps_parent->u2_y0 =
1591                             gau1_cu_pos_y[blk_cnt]; /* Populate properly */
1592                         ps_cu_node->ps_parent->best_mode =
1593                             INTRA_DC;  //ps_ed_blk_l1->best_merge_mode;
1594 
1595                         /* fill in the first modes as invalid */
1596 
1597                         ps_cu_node->ps_parent->au1_best_mode_1tu[0] = INTRA_DC;
1598                         ps_cu_node->ps_parent->au1_best_mode_1tu[1] =
1599                             INTRA_DC;  //for safery. Since update_cand_list will set num_modes as 3
1600                         ps_cu_node->ps_parent->au1_best_mode_1tu[2] = INTRA_DC;
1601 
1602                         ps_cu_node->ps_parent->au1_best_mode_4tu[0] = INTRA_DC;
1603                         ps_cu_node->ps_parent->au1_best_mode_4tu[1] = INTRA_DC;
1604                         ps_cu_node->ps_parent->au1_best_mode_4tu[2] = INTRA_DC;
1605 
1606                         ihevce_update_cand_list(ps_cu_node->ps_parent, ps_ed_blk_l1, ps_ctxt);
1607 
1608                         //ps_row_cu->s_cu_intra_cand.b6_num_intra_cands = 0;
1609                         //ps_row_cu->u1_num_intra_rdopt_cands = 0;
1610 
1611                         ps_intra32_analyse->b1_split_flag = 1;
1612                         ps_intra32_analyse->b1_merge_flag = 0;
1613 
1614                         ps_intra16_analyse->b1_valid_cu = 0;
1615                         ps_intra16_analyse->b1_split_flag = 0;
1616                         ps_intra16_analyse->b1_merge_flag = 1;
1617                         //memset (&ps_intra16_analyse->au1_best_modes_16x16_tu,
1618                         //  255,
1619                         //  NUM_BEST_MODES);
1620                         //memset (&ps_intra16_analyse->au1_best_modes_8x8_tu,
1621                         //  255,
1622                         //  NUM_BEST_MODES);
1623                         //set only first mode since if it's 255. it wont go ahead
1624                         ps_intra16_analyse->au1_best_modes_16x16_tu[0] = 255;
1625                         ps_intra16_analyse->au1_best_modes_8x8_tu[0] = 255;
1626                         ps_intra16_analyse->i4_best_intra_cost = MAX_INTRA_COST_IPE;
1627                         *pi4_intra_16_cost = MAX_INTRA_COST_IPE;
1628 
1629                         /*since ME will start evaluating from bottom up, set the lower
1630                         cu size data invalid */
1631                         for(i4_local_ctr = 0; i4_local_ctr < 4; i4_local_ctr++)
1632                         {
1633                             ps_intra16_analyse->as_intra8_analyse[i4_local_ctr]
1634                                 .au1_4x4_best_modes[0][0] = 255;
1635                             ps_intra16_analyse->as_intra8_analyse[i4_local_ctr]
1636                                 .au1_4x4_best_modes[1][0] = 255;
1637                             ps_intra16_analyse->as_intra8_analyse[i4_local_ctr]
1638                                 .au1_4x4_best_modes[2][0] = 255;
1639                             ps_intra16_analyse->as_intra8_analyse[i4_local_ctr]
1640                                 .au1_4x4_best_modes[3][0] = 255;
1641                             ps_intra16_analyse->as_intra8_analyse[i4_local_ctr]
1642                                 .au1_best_modes_8x8_tu[0] = 255;
1643                             ps_intra16_analyse->as_intra8_analyse[i4_local_ctr]
1644                                 .au1_best_modes_4x4_tu[0] = 255;
1645                             ps_intra16_analyse->as_intra8_analyse[i4_local_ctr].i4_best_intra_cost =
1646                                 MAX_INTRA_COST_IPE;
1647 
1648                             pi4_intra_8_cost
1649                                 [(i4_local_ctr & 1) + (MAX_CU_IN_CTB_ROW * (i4_local_ctr >> 1))] =
1650                                     MAX_INTRA_COST_IPE;
1651                         }
1652 
1653                         /* set neighbours even if intra is not evaluated, since source is always available. */
1654                         ihevce_set_nbr_map(
1655                             ps_ctxt->pu1_ctb_nbr_map,
1656                             ps_ctxt->i4_nbr_map_strd,
1657                             ps_cu_node->ps_parent->u2_x0 << 1,
1658                             ps_cu_node->ps_parent->u2_y0 << 1,
1659                             (ps_cu_node->ps_parent->u1_cu_size >> 2),
1660                             1);
1661 
1662                         //what happens to RC variables??
1663                         /* run only constant Qp */
1664                         ASSERT(((blk_cnt >> 2) & 0xF) == (blk_cnt >> 2));
1665                         ASSERT(ps_ed_ctb_l1->i4_8x8_satd[blk_cnt >> 2][0] != -2);
1666                         i1_cu_possible_qp = ihevce_cu_level_qp_mod(
1667                             ps_ctxt->i4_qscale,
1668                             ps_ed_ctb_l1->i4_8x8_satd[blk_cnt >> 2][0],
1669                             ps_ctxt->ld_curr_frame_8x8_log_avg[0],
1670                             f_strength,
1671                             &i4_act_factor,
1672                             &i4_q_scale_q3_mod,
1673                             ps_ctxt->ps_rc_quant_ctxt);
1674 
1675                         /* cost accumulation of best cu size candidate */
1676                         i8_frame_acc_satd_cost += 0;  //parent_cost;  //incorrect accumulation
1677 
1678                         /*satd/mod_qp accumulation of best cu */
1679                         i8_frame_acc_satd_by_modqp_q10 += 0;  //incorrect accumulation
1680                         //((LWORD64)ps_cu_node->ps_parent->best_satd << SATD_BY_ACT_Q_FAC)/i4_q_scale_q3_mod;
1681 
1682                         /* Accumalate mode bits for all child blocks */
1683                         i8_frame_acc_mode_bits_cost +=
1684                             0;  //ps_cu_node->ps_parent->u2_mode_bits_cost;
1685                         //incorrect accumulation
1686 
1687                         blk_cnt += 4;
1688                         ps_ed_blk_l1 += 4;
1689                         //ps_row_cu++;
1690                         merge_64x64 = 0;
1691 
1692                         /* increment for stat purpose only. Increment is valid only on single thread */
1693                         ps_ctxt->u4_num_16x16_skips_at_L0_IPE += 1;
1694                     }
1695                     else
1696                     {
1697                         /* 64x64 merge is not possible */
1698                         merge_64x64 = 0;
1699 
1700                         /* set the 32x32 split flag to 1 */
1701                         ps_intra32_analyse->b1_split_flag = 1;
1702 
1703                         ps_intra32_analyse->b1_merge_flag = 0;
1704 
1705                         ps_intra16_analyse->b1_merge_flag = 1;
1706 
1707                         if((ps_ctxt->i4_quality_preset == IHEVCE_QUALITY_P6) &&
1708                            (ps_ctxt->i4_slice_type == PSLICE))
1709                         {
1710                             ps_ctxt->u1_disable_child_cu_decide = 1;
1711                             step2_bypass = 0;
1712                         }
1713                         //memcpy(ps_ctxt->ai1_ctb_mode_map_temp, ps_ctxt->ai1_ctb_mode_map, sizeof(ps_ctxt->ai1_ctb_mode_map));
1714                         /* Based on the flag, Child modes decision can be disabled*/
1715                         if(0 == ps_ctxt->u1_disable_child_cu_decide)
1716                         {
1717                             for(j = 0; j < 4; j++)
1718                             {
1719                                 intra8_analyse_t *ps_intra8_analyse;
1720                                 WORD32 best_ang_mode = (ps_ed_blk_l1 + j)->best_mode;
1721 
1722                                 if(best_ang_mode < 2)
1723                                     best_ang_mode = 26;
1724 
1725                                 //ps_cu_node->ps_sub_cu[j]->best_cost = MAX_INTRA_COST_IPE;
1726                                 //ps_cu_node->ps_sub_cu[j]->best_mode = (ps_ed_blk_l1 + j)->best_mode;
1727 
1728                                 ps_cu_node->ps_sub_cu[j]->u2_x0 =
1729                                     gau1_cu_pos_x[blk_cnt + j]; /* Populate properly */
1730                                 ps_cu_node->ps_sub_cu[j]->u2_y0 =
1731                                     gau1_cu_pos_y[blk_cnt + j]; /* Populate properly */
1732                                 ps_cu_node->ps_sub_cu[j]->u1_cu_size = 8;
1733 
1734                                 ihevce_mode_eval_filtering(
1735                                     ps_cu_node->ps_sub_cu[j],
1736                                     ps_cu_node,
1737                                     ps_ctxt,
1738                                     ps_curr_src,
1739                                     best_ang_mode,
1740                                     &ps_cu_node->ps_sub_cu[j]->au4_best_cost_1tu[0],
1741                                     &ps_cu_node->ps_sub_cu[j]->au1_best_mode_1tu[0],
1742                                     !step2_bypass,
1743                                     1);
1744 
1745                                 if(i4_enable_4cu_16tu)
1746                                 {
1747                                     ihevce_mode_eval_filtering(
1748                                         ps_cu_node->ps_sub_cu[j],
1749                                         ps_cu_node,
1750                                         ps_ctxt,
1751                                         ps_curr_src,
1752                                         best_ang_mode,
1753                                         &ps_cu_node->ps_sub_cu[j]->au4_best_cost_4tu[0],
1754                                         &ps_cu_node->ps_sub_cu[j]->au1_best_mode_4tu[0],
1755                                         !step2_bypass,
1756                                         0);
1757                                 }
1758                                 else
1759                                 {
1760                                     /* 4TU not evaluated :  4tu modes set same as 1tu modes */
1761                                     memcpy(
1762                                         &ps_cu_node->ps_sub_cu[j]->au1_best_mode_4tu[0],
1763                                         &ps_cu_node->ps_sub_cu[j]->au1_best_mode_1tu[0],
1764                                         NUM_BEST_MODES);
1765 
1766                                     /* 4TU not evaluated : currently 4tu cost set same as 1tu cost */
1767                                     memcpy(
1768                                         &ps_cu_node->ps_sub_cu[j]->au4_best_cost_4tu[0],
1769                                         &ps_cu_node->ps_sub_cu[j]->au4_best_cost_1tu[0],
1770                                         NUM_BEST_MODES * sizeof(WORD32));
1771                                 }
1772 
1773                                 child_cost[j] =
1774                                     MIN(ps_cu_node->ps_sub_cu[j]->au4_best_cost_4tu[0],
1775                                         ps_cu_node->ps_sub_cu[j]->au4_best_cost_1tu[0]);
1776 
1777                                 child_cost_least += child_cost[j];
1778 
1779                                 /* Select the best mode to be populated as top and left nbr depending on the
1780                                 4tu and 1tu cost */
1781                                 if(ps_cu_node->ps_sub_cu[j]->au4_best_cost_4tu[0] >
1782                                    ps_cu_node->ps_sub_cu[j]->au4_best_cost_1tu[0])
1783                                 {
1784                                     ps_cu_node->ps_sub_cu[j]->best_mode =
1785                                         ps_cu_node->ps_sub_cu[j]->au1_best_mode_1tu[0];
1786                                 }
1787                                 else
1788                                 {
1789                                     ps_cu_node->ps_sub_cu[j]->best_mode =
1790                                         ps_cu_node->ps_sub_cu[j]->au1_best_mode_4tu[0];
1791                                 }
1792                                 { /* Update the CTB nodes only for MAX - 1 CU nodes */
1793                                     WORD32 xA, yA, row, col;
1794                                     xA = ((ps_cu_node->ps_sub_cu[j]->u2_x0 << 3) >> 2) + 1;
1795                                     yA = ((ps_cu_node->ps_sub_cu[j]->u2_y0 << 3) >> 2) + 1;
1796                                     size = ps_cu_node->ps_sub_cu[j]->u1_cu_size >> 2;
1797                                     for(row = yA; row < (yA + size); row++)
1798                                     {
1799                                         for(col = xA; col < (xA + size); col++)
1800                                         {
1801                                             ps_ctxt->au1_ctb_mode_map[row][col] =
1802                                                 ps_cu_node->ps_sub_cu[j]->best_mode;
1803                                         }
1804                                     }
1805                                 }
1806 
1807                                 /*collect individual child satd for final SATD/qp accum*/
1808                                 child_satd[j] = ps_cu_node->ps_sub_cu[j]->best_satd;
1809 
1810                                 ps_intra8_analyse = &ps_intra16_analyse->as_intra8_analyse[j];
1811 
1812                                 /* store the child 8x8 costs */
1813                                 pi4_intra_8_cost[(j & 1) + (MAX_CU_IN_CTB_ROW * (j >> 1))] =
1814                                     child_cost[j];
1815 
1816                                 /* set the CU valid flag */
1817                                 ps_intra8_analyse->b1_valid_cu = 1;
1818                                 ps_intra8_analyse->b1_enable_nxn = 0;
1819 
1820                                 /* storing the modes to intra8  analyse */
1821 
1822                                 /* store the best 8x8 modes 8x8 tu */
1823                                 memcpy(
1824                                     &ps_intra8_analyse->au1_best_modes_8x8_tu[0],
1825                                     &ps_cu_node->ps_sub_cu[j]->au1_best_mode_1tu[0],
1826                                     sizeof(UWORD8) * (NUM_BEST_MODES));
1827                                 ps_intra8_analyse->au1_best_modes_8x8_tu[NUM_BEST_MODES] = 255;
1828 
1829                                 /* store the best 8x8 modes 4x4 tu */
1830                                 memcpy(
1831                                     &ps_intra8_analyse->au1_best_modes_4x4_tu[0],
1832                                     &ps_cu_node->ps_sub_cu[j]->au1_best_mode_4tu[0],
1833                                     sizeof(UWORD8) * (NUM_BEST_MODES));
1834                                 ps_intra8_analyse->au1_best_modes_4x4_tu[NUM_BEST_MODES] = 255;
1835 
1836                                 /* NXN modes not evaluated hence set to 255 */
1837                                 memset(
1838                                     &ps_intra8_analyse->au1_4x4_best_modes[0][0],
1839                                     255,
1840                                     sizeof(UWORD8) * 4 * (NUM_BEST_MODES + 1));
1841                             }
1842 
1843                             ihevce_set_nbr_map(
1844                                 ps_ctxt->pu1_ctb_nbr_map,
1845                                 ps_ctxt->i4_nbr_map_strd,
1846                                 ps_cu_node->ps_sub_cu[0]->u2_x0 << 1,
1847                                 ps_cu_node->ps_sub_cu[0]->u2_y0 << 1,
1848                                 (ps_cu_node->ps_sub_cu[0]->u1_cu_size >> 1),
1849                                 0);
1850                         }
1851 #if 1  //DISBLE_CHILD_CU_EVAL_L0_IPE //1
1852                         else
1853                         {
1854                             for(j = 0; j < 4; j++)
1855                             {
1856                                 intra8_analyse_t *ps_intra8_analyse;
1857                                 ps_intra8_analyse = &ps_intra16_analyse->as_intra8_analyse[j];
1858                                 ps_intra8_analyse->au1_best_modes_8x8_tu[0] = 255;
1859                                 ps_intra8_analyse->au1_best_modes_4x4_tu[0] = 255;
1860                                 /* NXN modes not evaluated hence set to 255 */
1861                                 memset(
1862                                     &ps_intra8_analyse->au1_4x4_best_modes[0][0],
1863                                     255,
1864                                     sizeof(UWORD8) * 4 * (NUM_BEST_MODES + 1));
1865 
1866                                 ps_intra8_analyse->b1_valid_cu = 0;
1867                                 ps_intra8_analyse->b1_enable_nxn = 0;
1868                             }
1869                             child_cost_least = MAX_INTRA_COST_IPE;
1870                         }
1871 #endif
1872                         //ps_cu_node->ps_parent->best_mode = ps_ed_blk_l1->best_mode;
1873                         //ps_cu_node->ps_parent->best_cost = MAX_INTRA_COST_IPE;
1874 
1875                         ps_cu_node->ps_parent->u1_cu_size = 16;
1876                         ps_cu_node->ps_parent->u2_x0 =
1877                             gau1_cu_pos_x[blk_cnt]; /* Populate properly */
1878                         ps_cu_node->ps_parent->u2_y0 =
1879                             gau1_cu_pos_y[blk_cnt]; /* Populate properly */
1880 
1881                         //memcpy(ps_ctxt->ai1_ctb_mode_map_temp, ps_ctxt->ai1_ctb_mode_map, sizeof(ps_ctxt->ai1_ctb_mode_map));
1882 
1883                         /* Eval for TUSize = CuSize */
1884                         ihevce_mode_eval_filtering(
1885                             ps_cu_node->ps_parent,
1886                             ps_cu_node,
1887                             ps_ctxt,
1888                             ps_curr_src,
1889                             26,
1890                             &ps_cu_node->ps_parent->au4_best_cost_1tu[0],
1891                             &ps_cu_node->ps_parent->au1_best_mode_1tu[0],
1892                             step2_bypass,
1893                             1);
1894 
1895                         if(i4_enable_1cu_4tu)
1896                         {
1897                             /* Eval for TUSize = CuSize/2 */
1898                             ihevce_mode_eval_filtering(
1899                                 ps_cu_node->ps_parent,
1900                                 ps_cu_node,
1901                                 ps_ctxt,
1902                                 ps_curr_src,
1903                                 26,
1904                                 &ps_cu_node->ps_parent->au4_best_cost_4tu[0],
1905                                 &ps_cu_node->ps_parent->au1_best_mode_4tu[0],
1906                                 step2_bypass,
1907                                 0);
1908                         }
1909                         else
1910                         {
1911                             /* 4TU not evaluated :  4tu modes set same as 1tu modes */
1912                             memcpy(
1913                                 &ps_cu_node->ps_parent->au1_best_mode_4tu[0],
1914                                 &ps_cu_node->ps_parent->au1_best_mode_1tu[0],
1915                                 NUM_BEST_MODES);
1916 
1917                             /* 4TU not evaluated : currently 4tu cost set same as 1tu cost */
1918                             memcpy(
1919                                 &ps_cu_node->ps_parent->au4_best_cost_4tu[0],
1920                                 &ps_cu_node->ps_parent->au4_best_cost_1tu[0],
1921                                 NUM_BEST_MODES * sizeof(WORD32));
1922                         }
1923 
1924                         ps_ctxt->u1_disable_child_cu_decide = 0;
1925                         step2_bypass = 1;
1926 
1927                         /* Update parent cost */
1928                         parent_cost =
1929                             MIN(ps_cu_node->ps_parent->au4_best_cost_4tu[0],
1930                                 ps_cu_node->ps_parent->au4_best_cost_1tu[0]);
1931 
1932                         /* Select the best mode to be populated as top and left nbr depending on the
1933                         4tu and 1tu cost */
1934                         if(ps_cu_node->ps_parent->au4_best_cost_4tu[0] >
1935                            ps_cu_node->ps_parent->au4_best_cost_1tu[0])
1936                         {
1937                             ps_cu_node->ps_parent->best_mode =
1938                                 ps_cu_node->ps_parent->au1_best_mode_1tu[0];
1939                         }
1940                         else
1941                         {
1942                             ps_cu_node->ps_parent->best_mode =
1943                                 ps_cu_node->ps_parent->au1_best_mode_4tu[0];
1944                         }
1945 
1946                         /* store the 16x16 cost */
1947                         *pi4_intra_16_cost = parent_cost;
1948 
1949                         /* accumulate the 32x32 cost */
1950                         if(MAX_INTRA_COST_IPE == *pi4_intra_32_cost)
1951                         {
1952                             *pi4_intra_32_cost = parent_cost;
1953                         }
1954                         else
1955                         {
1956                             *pi4_intra_32_cost += parent_cost;
1957                         }
1958 
1959                         /* set the CU valid flag */
1960                         ps_intra16_analyse->b1_valid_cu = 1;
1961 
1962                         /* storing the modes to intra 16 analyse */
1963                         {
1964                             /* store the best 16x16 modes 16x16 tu */
1965                             memcpy(
1966                                 &ps_intra16_analyse->au1_best_modes_16x16_tu[0],
1967                                 &ps_cu_node->ps_parent->au1_best_mode_1tu[0],
1968                                 sizeof(UWORD8) * NUM_BEST_MODES);
1969                             ps_intra16_analyse->au1_best_modes_16x16_tu[NUM_BEST_MODES] = 255;
1970 
1971                             /* store the best 16x16 modes 8x8 tu */
1972                             memcpy(
1973                                 &ps_intra16_analyse->au1_best_modes_8x8_tu[0],
1974                                 &ps_cu_node->ps_parent->au1_best_mode_4tu[0],
1975                                 sizeof(UWORD8) * NUM_BEST_MODES);
1976                             ps_intra16_analyse->au1_best_modes_8x8_tu[NUM_BEST_MODES] = 255;
1977                         }
1978 
1979                         parent_best_mode = ps_cu_node->ps_parent->best_mode;
1980                         if(parent_cost <=
1981                            child_cost_least + (ps_ctxt->i4_ol_satd_lambda * CHILD_BIAS >>
1982                                                LAMBDA_Q_SHIFT))  //|| identical_modes)
1983                         {
1984                             WORD32 i4_q_scale_q3_mod;
1985                             WORD8 i1_cu_possible_qp;
1986                             WORD32 i4_act_factor;
1987                             //choose parent CU
1988 
1989                             ihevce_update_cand_list(ps_cu_node->ps_parent, ps_ed_blk_l1, ps_ctxt);
1990 
1991                             /* set the 16x16 non split flag */
1992                             ps_intra16_analyse->b1_split_flag = 0;
1993 
1994                             /*As 16*16 has won, pick L1 8x8 qp which maps
1995                             to L0 16x16 Qp*/
1996                             ASSERT(((blk_cnt >> 4) & 3) == (blk_cnt >> 4));
1997                             ASSERT(ps_ed_ctb_l1->i4_16x16_satd[blk_cnt >> 4][0] != -2);
1998                             i1_cu_possible_qp = ihevce_cu_level_qp_mod(
1999                                 ps_ctxt->i4_qscale,
2000                                 ps_ed_ctb_l1->i4_16x16_satd[blk_cnt >> 4][0],
2001                                 ps_ctxt->ld_curr_frame_8x8_log_avg[0],
2002                                 f_strength,
2003                                 &i4_act_factor,
2004                                 &i4_q_scale_q3_mod,
2005                                 ps_ctxt->ps_rc_quant_ctxt);
2006 
2007                             /* cost accumulation of best cu size candidate */
2008                             i8_frame_acc_satd_cost += parent_cost;
2009 
2010                             /* satd and mpm bits accumulation of best cu size candidate */
2011                             i4_ctb_acc_satd += ps_cu_node->ps_parent->best_satd;
2012 
2013                             /*satd/mod_qp accumulation of best cu */
2014                             i8_frame_acc_satd_by_modqp_q10 +=
2015                                 ((LWORD64)ps_cu_node->ps_parent->best_satd
2016                                  << (SATD_BY_ACT_Q_FAC + QSCALE_Q_FAC_3)) /
2017                                 i4_q_scale_q3_mod;
2018 
2019                             /* Accumulate mode bits of the parent CU */
2020                             i8_frame_acc_mode_bits_cost += ps_cu_node->ps_parent->u2_mode_bits_cost;
2021 
2022                             blk_cnt += 4;
2023                             ps_ed_blk_l1 += 4;
2024                             //ps_row_cu++;
2025                         }
2026                         else
2027                         {
2028                             //choose child CU
2029                             WORD8 i1_cu_possible_qp;
2030                             WORD32 i4_act_factor;
2031                             WORD32 i4_q_scale_q3_mod;
2032 
2033                             ASSERT(((blk_cnt >> 2) & 0xF) == (blk_cnt >> 2));
2034                             ASSERT(ps_ed_ctb_l1->i4_8x8_satd[blk_cnt >> 2][1] != -2);
2035                             i1_cu_possible_qp = ihevce_cu_level_qp_mod(
2036                                 ps_ctxt->i4_qscale,
2037                                 ps_ed_ctb_l1->i4_8x8_satd[blk_cnt >> 2][1],
2038                                 ps_ctxt->ld_curr_frame_8x8_log_avg[1],
2039                                 f_strength,
2040                                 &i4_act_factor,
2041                                 &i4_q_scale_q3_mod,
2042                                 ps_ctxt->ps_rc_quant_ctxt);
2043 
2044                             /* set the 16x16 split flag */
2045                             ps_intra16_analyse->b1_split_flag = 1;
2046 
2047                             for(j = 0; j < 4; j++)
2048                             {
2049                                 ihevce_update_cand_list(
2050                                     ps_cu_node->ps_sub_cu[j], ps_ed_blk_l1, ps_ctxt);
2051 
2052                                 if((IHEVCE_QUALITY_P3 > i4_quality_preset))
2053                                 {
2054                                     WORD32 k;
2055                                     intra8_analyse_t *ps_intra8_analyse;
2056                                     ps_intra8_analyse = &ps_intra16_analyse->as_intra8_analyse[j];
2057 
2058                                     for(k = 0; k < 4; k++)
2059                                     {
2060                                         /* Populate best 3 nxn modes */
2061                                         ps_intra8_analyse->au1_4x4_best_modes[k][0] =
2062                                             ps_cu_node->ps_sub_cu[j]->au1_best_mode_4tu[0];
2063                                         ps_intra8_analyse->au1_4x4_best_modes[k][1] =
2064                                             ps_cu_node->ps_sub_cu[j]
2065                                                 ->au1_best_mode_4tu[1];  //(ps_ed + 1)->best_mode;
2066                                         ps_intra8_analyse->au1_4x4_best_modes[k][2] =
2067                                             ps_cu_node->ps_sub_cu[j]
2068                                                 ->au1_best_mode_4tu[2];  //(ps_ed + 2)->best_mode;
2069                                         ps_intra8_analyse->au1_4x4_best_modes[k][3] = 255;
2070                                     }
2071                                 }
2072                                 /*accum satd/qp for all child block*/
2073                                 i8_frame_acc_satd_by_modqp_q10 +=
2074                                     ((LWORD64)child_satd[j]
2075                                      << (SATD_BY_ACT_Q_FAC + QSCALE_Q_FAC_3)) /
2076                                     i4_q_scale_q3_mod;
2077 
2078                                 /* Accumulate mode bits for all child blocks */
2079                                 i8_frame_acc_mode_bits_cost +=
2080                                     ps_cu_node->ps_sub_cu[j]->u2_mode_bits_cost;
2081 
2082                                 /* satd and mpm bits accumulation of best cu size candidate */
2083                                 i4_ctb_acc_satd += child_satd[j];
2084 
2085                                 blk_cnt += 1;
2086                                 ps_ed_blk_l1 += 1;
2087                                 //ps_row_cu++;
2088                             }
2089 
2090                             /* cost accumulation of best cu size candidate */
2091                             i8_frame_acc_satd_cost += child_cost_least;
2092                         }
2093 
2094                     }  //else of EIID
2095 #endif
2096                 }  // if(merge_16x16_l1)
2097                 /* MAX CU SIZE 8x8 */
2098                 else
2099                 {
2100 #if IP_DBG_L1_l2
2101                     for(i = 0; i < 4; i++)
2102                     {
2103                         ps_cu_node->ps_parent->u1_cu_size = 8;
2104                         ps_cu_node->ps_parent->u2_x0 =
2105                             gau1_cu_pos_x[blk_cnt]; /* Populate properly */
2106                         ps_cu_node->ps_parent->u2_y0 =
2107                             gau1_cu_pos_y[blk_cnt]; /* Populate properly */
2108                         ps_cu_node->ps_parent->best_mode = ps_ed_blk_l1->best_mode;
2109 
2110                         ihevce_update_cand_list(ps_cu_node->ps_parent, ps_ed_blk_l1, ps_ctxt);
2111                         blk_cnt++;
2112                         ps_ed_blk_l1++;
2113                         ps_row_cu++;
2114                         merge_64x64 = 0;
2115                     }
2116 #else
2117 
2118                     /* EIID: Skip all 4 8x8 block if L1 decisions says skip intra */
2119                     if(ps_ed_blk_l1->intra_or_inter == 2 && (ps_ctxt->i4_slice_type != ISLICE))
2120                     {
2121                         WORD32 i4_q_scale_q3_mod;
2122                         WORD8 i1_cu_possible_qp;
2123                         WORD32 i4_act_factor;
2124 
2125                         merge_64x64 = 0;
2126 
2127                         ps_intra32_analyse->b1_merge_flag = 0;
2128 
2129                         ps_intra16_analyse->au1_best_modes_8x8_tu[0] = 255;
2130                         ps_intra16_analyse->au1_best_modes_8x8_tu[1] = 255;
2131                         ps_intra16_analyse->au1_best_modes_8x8_tu[2] = 255;
2132 
2133                         ps_intra16_analyse->au1_best_modes_16x16_tu[0] = 255;
2134                         ps_intra16_analyse->au1_best_modes_16x16_tu[1] = 255;
2135                         ps_intra16_analyse->au1_best_modes_16x16_tu[2] = 255;
2136                         ps_intra16_analyse->b1_split_flag = 1;
2137                         ps_intra16_analyse->b1_valid_cu = 0;
2138                         ps_intra16_analyse->b1_merge_flag = 0;
2139 
2140                         ps_intra16_analyse->i4_best_intra_cost = MAX_INTRA_COST_IPE;
2141 
2142                         for(i = 0; i < 4; i++)
2143                         {
2144                             intra8_analyse_t *ps_intra8_analyse;
2145                             WORD32 ctr_sub_cu;
2146 
2147                             cu_pos_x = gau1_cu_pos_x[blk_cnt];
2148                             cu_pos_y = gau1_cu_pos_y[blk_cnt];
2149 
2150                             if((cu_pos_x < num_8x8_blks_x) && (cu_pos_y < num_8x8_blks_y))
2151                             {
2152                                 ps_intra8_analyse = &ps_intra16_analyse->as_intra8_analyse[i];
2153 
2154                                 ps_intra8_analyse->b1_valid_cu = 0;
2155                                 ps_intra8_analyse->b1_enable_nxn = 0;
2156                                 ps_intra8_analyse->au1_4x4_best_modes[0][0] = 255;
2157                                 ps_intra8_analyse->au1_4x4_best_modes[1][0] = 255;
2158                                 ps_intra8_analyse->au1_4x4_best_modes[2][0] = 255;
2159                                 ps_intra8_analyse->au1_4x4_best_modes[3][0] = 255;
2160                                 ps_intra8_analyse->au1_best_modes_4x4_tu[0] = 255;
2161                                 ps_intra8_analyse->au1_best_modes_8x8_tu[0] = 255;
2162                                 ps_intra8_analyse->i4_best_intra_cost = MAX_INTRA_COST_IPE;
2163 
2164                                 ps_cu_node->ps_parent->u1_cu_size = 8;
2165                                 ps_cu_node->ps_parent->u2_x0 =
2166                                     gau1_cu_pos_x[blk_cnt]; /* Populate properly */
2167                                 ps_cu_node->ps_parent->u2_y0 =
2168                                     gau1_cu_pos_y[blk_cnt]; /* Populate properly */
2169                                 ps_cu_node->ps_parent->best_mode =
2170                                     INTRA_DC;  //ps_ed_blk_l1->best_mode;
2171 
2172                                 /* fill in the first modes as invalid */
2173 
2174                                 ps_cu_node->ps_parent->au1_best_mode_1tu[0] = INTRA_DC;
2175                                 ps_cu_node->ps_parent->au1_best_mode_1tu[1] =
2176                                     INTRA_DC;  //for safery. Since update_cand_list will set num_modes as 3
2177                                 ps_cu_node->ps_parent->au1_best_mode_1tu[2] = INTRA_DC;
2178 
2179                                 ps_cu_node->ps_parent->au1_best_mode_4tu[0] = INTRA_DC;
2180                                 ps_cu_node->ps_parent->au1_best_mode_4tu[1] = INTRA_DC;
2181                                 ps_cu_node->ps_parent->au1_best_mode_4tu[2] = INTRA_DC;
2182 
2183                                 ihevce_update_cand_list(
2184                                     ps_cu_node->ps_parent, ps_ed_blk_l1, ps_ctxt);
2185 
2186                                 //ps_row_cu->s_cu_intra_cand.b6_num_intra_cands = 0;
2187                                 //ps_row_cu->u1_num_intra_rdopt_cands = 0;
2188 
2189                                 for(ctr_sub_cu = 0; ctr_sub_cu < 4; ctr_sub_cu++)
2190                                 {
2191                                     ps_cu_node->ps_sub_cu[ctr_sub_cu]->au1_best_mode_1tu[0] =
2192                                         INTRA_DC;
2193                                     ps_cu_node->ps_sub_cu[ctr_sub_cu]->au1_best_mode_4tu[0] =
2194                                         INTRA_DC;
2195                                     ps_cu_node->ps_sub_cu[ctr_sub_cu]->au4_best_cost_1tu[0] =
2196                                         MAX_INTRA_COST_IPE;
2197 
2198                                     ps_cu_node->ps_sub_cu[ctr_sub_cu]->au4_best_cost_4tu[0] =
2199                                         MAX_INTRA_COST_IPE;
2200                                     ps_cu_node->ps_sub_cu[ctr_sub_cu]->best_cost =
2201                                         MAX_INTRA_COST_IPE;
2202                                 }
2203 
2204                                 pi4_intra_8_cost[(i & 1) + (MAX_CU_IN_CTB_ROW * (i >> 1))] =
2205                                     MAX_INTRA_COST_IPE;
2206 
2207                                 ASSERT(((blk_cnt >> 2) & 0xF) == (blk_cnt >> 2));
2208                                 ASSERT(ps_ed_ctb_l1->i4_8x8_satd[(blk_cnt >> 2)][1] != -2);
2209                                 i1_cu_possible_qp = ihevce_cu_level_qp_mod(
2210                                     ps_ctxt->i4_qscale,
2211                                     ps_ed_ctb_l1->i4_8x8_satd[(blk_cnt >> 2)][1],
2212                                     ps_ctxt->ld_curr_frame_8x8_log_avg[1],
2213                                     f_strength,
2214                                     &i4_act_factor,
2215                                     &i4_q_scale_q3_mod,
2216                                     ps_ctxt->ps_rc_quant_ctxt);
2217 
2218                                 /* set neighbours even if intra is not evaluated, since source is always available. */
2219                                 ihevce_set_nbr_map(
2220                                     ps_ctxt->pu1_ctb_nbr_map,
2221                                     ps_ctxt->i4_nbr_map_strd,
2222                                     ps_cu_node->ps_parent->u2_x0 << 1,
2223                                     ps_cu_node->ps_parent->u2_y0 << 1,
2224                                     (ps_cu_node->ps_parent->u1_cu_size >> 2),
2225                                     1);
2226 
2227                                 //ps_row_cu++;
2228                             }
2229                             blk_cnt++;
2230                             ps_ed_blk_l1++;
2231                         }
2232                     }
2233                     else
2234                     {
2235                         //cu_intra_cand_t *ps_cu_intra_cand;
2236                         WORD8 i1_cu_possible_qp;
2237                         WORD32 i4_act_factor;
2238                         WORD32 i4_q_scale_q3_mod;
2239 
2240                         ASSERT(((blk_cnt >> 2) & 0xF) == (blk_cnt >> 2));
2241                         ASSERT(ps_ed_ctb_l1->i4_8x8_satd[(blk_cnt >> 2)][1] != -2);
2242                         i1_cu_possible_qp = ihevce_cu_level_qp_mod(
2243                             ps_ctxt->i4_qscale,
2244                             ps_ed_ctb_l1->i4_8x8_satd[(blk_cnt >> 2)][1],
2245                             ps_ctxt->ld_curr_frame_8x8_log_avg[1],
2246                             f_strength,
2247                             &i4_act_factor,
2248                             &i4_q_scale_q3_mod,
2249                             ps_ctxt->ps_rc_quant_ctxt);
2250 
2251                         /* 64x64 merge is not possible */
2252                         merge_64x64 = 0;
2253 
2254                         ps_intra32_analyse->b1_merge_flag = 0;
2255 
2256                         ps_intra16_analyse->b1_merge_flag = 0;
2257 
2258                         /* by default 16x16 modes are set to default values DC and Planar */
2259                         ps_intra16_analyse->au1_best_modes_8x8_tu[0] = 0;
2260                         ps_intra16_analyse->au1_best_modes_8x8_tu[1] = 1;
2261                         ps_intra16_analyse->au1_best_modes_8x8_tu[2] = 255;
2262 
2263                         ps_intra16_analyse->au1_best_modes_16x16_tu[0] = 0;
2264                         ps_intra16_analyse->au1_best_modes_16x16_tu[1] = 1;
2265                         ps_intra16_analyse->au1_best_modes_16x16_tu[2] = 255;
2266                         ps_intra16_analyse->b1_split_flag = 1;
2267                         ps_intra16_analyse->b1_valid_cu = 1;
2268 
2269                         for(i = 0; i < 4; i++)
2270                         {
2271                             intra8_analyse_t *ps_intra8_analyse;
2272                             cu_pos_x = gau1_cu_pos_x[blk_cnt];
2273                             cu_pos_y = gau1_cu_pos_y[blk_cnt];
2274                             if((cu_pos_x < num_8x8_blks_x) && (cu_pos_y < num_8x8_blks_y))
2275                             {
2276                                 //ps_cu_intra_cand = &ps_row_cu->s_cu_intra_cand;
2277                                 //ps_cu_node->ps_parent->best_cost = MAX_INTRA_COST_IPE;
2278 
2279                                 //ps_cu_node->ps_parent->best_mode = ps_ed_blk_l1->best_mode;
2280 
2281                                 child_cost_least = 0;
2282 
2283                                 ps_intra8_analyse = &ps_intra16_analyse->as_intra8_analyse[i];
2284                                 ps_cu_node->ps_parent->u1_cu_size = 8;
2285                                 ps_cu_node->ps_parent->u2_x0 =
2286                                     gau1_cu_pos_x[blk_cnt]; /* Populate properly */
2287                                 ps_cu_node->ps_parent->u2_y0 =
2288                                     gau1_cu_pos_y[blk_cnt]; /* Populate properly */
2289 
2290                                 //memcpy(ps_ctxt->ai1_ctb_mode_map_temp, ps_ctxt->ai1_ctb_mode_map, sizeof(ps_ctxt->ai1_ctb_mode_map));
2291 
2292                                 /*EARLY DECISION 8x8 block */
2293                                 ihevce_pu_calc_8x8_blk(
2294                                     ps_curr_src, ps_ctxt, ps_cu_node, ps_ctxt->ps_func_selector);
2295                                 for(j = 0; j < 4; j++)
2296                                 {
2297                                     child_cost_least += ps_cu_node->ps_sub_cu[j]->best_cost;
2298                                     child_satd[j] = ps_cu_node->ps_sub_cu[j]->best_satd;
2299                                 }
2300 
2301                                 /* Based on the flag, CU = 4TU modes decision can be disabled, CU = 4PU is retained */
2302                                 if(0 == ps_ctxt->u1_disable_child_cu_decide)
2303                                 {
2304                                     ihevce_set_nbr_map(
2305                                         ps_ctxt->pu1_ctb_nbr_map,
2306                                         ps_ctxt->i4_nbr_map_strd,
2307                                         ps_cu_node->ps_parent->u2_x0 << 1,
2308                                         ps_cu_node->ps_parent->u2_y0 << 1,
2309                                         (ps_cu_node->ps_parent->u1_cu_size >> 2),
2310                                         0);
2311 
2312                                     //memcpy(ps_ctxt->ai1_ctb_mode_map_temp, ps_ctxt->ai1_ctb_mode_map, sizeof(ps_ctxt->ai1_ctb_mode_map));
2313 
2314                                     /* Eval for TUSize = CuSize */
2315                                     ihevce_mode_eval_filtering(
2316                                         ps_cu_node->ps_parent,
2317                                         ps_cu_node,
2318                                         ps_ctxt,
2319                                         ps_curr_src,
2320                                         26,
2321                                         &ps_cu_node->ps_parent->au4_best_cost_1tu[0],
2322                                         &ps_cu_node->ps_parent->au1_best_mode_1tu[0],
2323                                         step2_bypass,
2324                                         1);
2325 
2326                                     if(i4_enable_1cu_4tu)
2327                                     {
2328                                         /* Eval for TUSize = CuSize/2 */
2329                                         ihevce_mode_eval_filtering(
2330                                             ps_cu_node->ps_parent,
2331                                             ps_cu_node,
2332                                             ps_ctxt,
2333                                             ps_curr_src,
2334                                             26,
2335                                             &ps_cu_node->ps_parent->au4_best_cost_4tu[0],
2336                                             &ps_cu_node->ps_parent->au1_best_mode_4tu[0],
2337                                             step2_bypass,
2338                                             0);
2339                                     }
2340                                     else
2341                                     {
2342                                         /* 4TU not evaluated :  4tu modes set same as 1tu modes */
2343                                         memcpy(
2344                                             &ps_cu_node->ps_parent->au1_best_mode_4tu[0],
2345                                             &ps_cu_node->ps_parent->au1_best_mode_1tu[0],
2346                                             NUM_BEST_MODES);
2347 
2348                                         /* 4TU not evaluated : currently 4tu cost set same as 1tu cost */
2349                                         memcpy(
2350                                             &ps_cu_node->ps_parent->au4_best_cost_4tu[0],
2351                                             &ps_cu_node->ps_parent->au4_best_cost_1tu[0],
2352                                             NUM_BEST_MODES * sizeof(WORD32));
2353                                     }
2354 
2355                                     /* Update parent cost */
2356                                     parent_cost =
2357                                         MIN(ps_cu_node->ps_parent->au4_best_cost_4tu[0],
2358                                             ps_cu_node->ps_parent->au4_best_cost_1tu[0]);
2359 
2360                                     /* Select the best mode to be populated as top and left nbr depending on the
2361                             4tu and 1tu cost */
2362                                     if(ps_cu_node->ps_parent->au4_best_cost_4tu[0] >
2363                                        ps_cu_node->ps_parent->au4_best_cost_1tu[0])
2364                                     {
2365                                         ps_cu_node->ps_parent->best_mode =
2366                                             ps_cu_node->ps_parent->au1_best_mode_1tu[0];
2367                                     }
2368                                     else
2369                                     {
2370                                         ps_cu_node->ps_parent->best_mode =
2371                                             ps_cu_node->ps_parent->au1_best_mode_4tu[0];
2372                                     }
2373                                 }
2374 
2375                                 /* set the CU valid flag */
2376                                 ps_intra8_analyse->b1_valid_cu = 1;
2377                                 ps_intra8_analyse->b1_enable_nxn = 0;
2378 
2379                                 /* storing the modes to intra 8 analyse */
2380 
2381                                 /* store the best 8x8 modes 8x8 tu */
2382                                 memcpy(
2383                                     &ps_intra8_analyse->au1_best_modes_8x8_tu[0],
2384                                     &ps_cu_node->ps_parent->au1_best_mode_1tu[0],
2385                                     sizeof(UWORD8) * (NUM_BEST_MODES));
2386                                 ps_intra8_analyse->au1_best_modes_8x8_tu[NUM_BEST_MODES] = 255;
2387 
2388                                 /* store the best 8x8 modes 4x4 tu */
2389                                 memcpy(
2390                                     &ps_intra8_analyse->au1_best_modes_4x4_tu[0],
2391                                     &ps_cu_node->ps_parent->au1_best_mode_4tu[0],
2392                                     sizeof(UWORD8) * (NUM_BEST_MODES));
2393                                 ps_intra8_analyse->au1_best_modes_4x4_tu[NUM_BEST_MODES] = 255;
2394 
2395                                 /*As 8*8 has won, pick L1 4x4 qp which is equal to
2396                                 L1 8x8 Qp*/
2397                                 //ps_row_cu->u1_cu_possible_qp[0] = u1_cu_possible_qp;
2398                                 //ps_row_cu->i4_act_factor[0][1] = i4_act_factor;
2399 
2400                                 parent_best_mode = ps_cu_node->ps_parent->best_mode;
2401                                 if(parent_cost <=
2402                                    child_cost_least +
2403                                        (ps_ctxt->i4_ol_satd_lambda * CHILD_BIAS >> LAMBDA_Q_SHIFT))
2404                                 {
2405                                     /*CU = 4TU */
2406                                     ihevce_update_cand_list(
2407                                         ps_cu_node->ps_parent, ps_ed_blk_l1, ps_ctxt);
2408 
2409                                     /* store the child 8x8 costs */
2410                                     pi4_intra_8_cost[(i & 1) + (MAX_CU_IN_CTB_ROW * (i >> 1))] =
2411                                         parent_cost;
2412 
2413                                     /* cost accumulation of best cu size candidate */
2414                                     i8_frame_acc_satd_cost += parent_cost;
2415 
2416                                     /*satd/mod_qp accumulation of best cu */
2417                                     i8_frame_acc_satd_by_modqp_q10 +=
2418                                         ((LWORD64)ps_cu_node->ps_parent->best_satd
2419                                          << (SATD_BY_ACT_Q_FAC + QSCALE_Q_FAC_3)) /
2420                                         i4_q_scale_q3_mod;
2421 
2422                                     /* Accumulate mode bits of the parent CU */
2423                                     i8_frame_acc_mode_bits_cost +=
2424                                         ps_cu_node->ps_parent->u2_mode_bits_cost;
2425 
2426                                     /* satd and mpm bits accumulation of best cu size candidate */
2427                                     i4_ctb_acc_satd += ps_cu_node->ps_parent->best_satd;
2428 
2429                                     /* accumulate the 16x16 cost*/
2430                                     if(MAX_INTRA_COST_IPE == *pi4_intra_16_cost)
2431                                     {
2432                                         *pi4_intra_16_cost = parent_cost;
2433                                     }
2434                                     else
2435                                     {
2436                                         *pi4_intra_16_cost += parent_cost;
2437                                     }
2438 
2439                                     /* accumulate the 32x32 cost*/
2440                                     if(MAX_INTRA_COST_IPE == *pi4_intra_32_cost)
2441                                     {
2442                                         *pi4_intra_32_cost = parent_cost;
2443                                     }
2444                                     else
2445                                     {
2446                                         *pi4_intra_32_cost += parent_cost;
2447                                     }
2448                                 }
2449                                 else
2450                                 {
2451                                     /*CU = 4PU*/
2452                                     //ps_row_cu->b3_cu_pos_x = (UWORD8) ps_cu_node->ps_parent->u2_x0;
2453                                     //ps_row_cu->b3_cu_pos_y = (UWORD8) ps_cu_node->ps_parent->u2_y0;
2454                                     //ps_row_cu->u1_cu_size  = ps_cu_node->ps_parent->u1_cu_size;
2455 
2456                                     /* store the child 8x8 cost as the summed cost of its 4x4 PUs */
2457                                     pi4_intra_8_cost[(i & 1) + (MAX_CU_IN_CTB_ROW * (i >> 1))] =
2458                                         (child_cost_least);
2459 
2460                                     /* accumulate the 16x16 cost*/
2461                                     if(MAX_INTRA_COST_IPE == *pi4_intra_16_cost)
2462                                     {
2463                                         *pi4_intra_16_cost = child_cost_least;
2464                                     }
2465                                     else
2466                                     {
2467                                         *pi4_intra_16_cost += child_cost_least;
2468                                     }
2469 
2470                                     /* cost accumalation of best cu size candiate */
2471                                     i8_frame_acc_satd_cost += child_cost_least;
2472 
2473                                     for(j = 0; j < 4; j++)
2474                                     {
2475                                         /*satd/qp accumualtion*/
2476                                         i8_frame_acc_satd_by_modqp_q10 +=
2477                                             ((LWORD64)child_satd[j]
2478                                              << (SATD_BY_ACT_Q_FAC + QSCALE_Q_FAC_3)) /
2479                                             i4_q_scale_q3_mod;
2480 
2481                                         /* Accumalate mode bits for all child blocks */
2482                                         i8_frame_acc_mode_bits_cost +=
2483                                             ps_cu_node->ps_sub_cu[j]->u2_mode_bits_cost;
2484 
2485                                         /* satd and mpm bits accumalation of best cu size candiate */
2486                                         i4_ctb_acc_satd += child_satd[j];
2487                                     }
2488 
2489                                     /* accumulate the 32x32 cost*/
2490                                     if(MAX_INTRA_COST_IPE == *pi4_intra_32_cost)
2491                                     {
2492                                         *pi4_intra_32_cost = child_cost_least;
2493                                     }
2494                                     else
2495                                     {
2496                                         *pi4_intra_32_cost += child_cost_least;
2497                                     }
2498 
2499                                     ps_intra8_analyse->b1_enable_nxn = 1;
2500 
2501                                     /* Insert the best 8x8 modes unconditionally */
2502 
2503                                     x = ((ps_cu_node->u2_x0 << 3) >> 2) + 1;
2504                                     y = ((ps_cu_node->u2_y0 << 3) >> 2) + 1;
2505                                     size = ps_cu_node->u1_cu_size >> 2;
2506 
2507                                     ps_ctxt->au1_ctb_mode_map[y][x] =
2508                                         ps_cu_node->ps_sub_cu[0]->best_mode;
2509                                     ps_ctxt->au1_ctb_mode_map[y][x + 1] =
2510                                         ps_cu_node->ps_sub_cu[1]->best_mode;
2511                                     ps_ctxt->au1_ctb_mode_map[y + 1][x] =
2512                                         ps_cu_node->ps_sub_cu[2]->best_mode;
2513                                     ps_ctxt->au1_ctb_mode_map[y + 1][x + 1] =
2514                                         ps_cu_node->ps_sub_cu[3]->best_mode;
2515                                 }
2516                                 /* NXN mode population */
2517                                 for(j = 0; j < 4; j++)
2518                                 {
2519                                     cand_mode_list[0] =
2520                                         ps_cu_node->ps_sub_cu[j]->au1_best_mode_1tu[0];
2521                                     cand_mode_list[1] =
2522                                         ps_cu_node->ps_sub_cu[j]->au1_best_mode_1tu[1];
2523                                     cand_mode_list[2] =
2524                                         ps_cu_node->ps_sub_cu[j]->au1_best_mode_1tu[2];
2525 
2526                                     if(1)
2527                                     {
2528                                         /* Populate best 3 nxn modes */
2529                                         ps_intra8_analyse->au1_4x4_best_modes[j][0] =
2530                                             cand_mode_list[0];
2531                                         ps_intra8_analyse->au1_4x4_best_modes[j][1] =
2532                                             cand_mode_list[1];  //(ps_ed + 1)->best_mode;
2533                                         ps_intra8_analyse->au1_4x4_best_modes[j][2] =
2534                                             cand_mode_list[2];  //(ps_ed + 2)->best_mode;
2535                                         ps_intra8_analyse->au1_4x4_best_modes[j][3] = 255;
2536 
2537                                         //memcpy(ps_intra8_analyse->au1_4x4_best_modes[j], ps_row_cu->s_cu_intra_cand.au1_intra_luma_modes_nxn[j], 4);
2538                                     }
2539                                     /* For HQ, all 35 modes to be used for RDOPT, removed from here for memory clean-up */
2540 
2541                                     else /* IHEVCE_QUALITY_P0 == i4_quality_preset */
2542                                     {
2543                                         /* To indicate to enc loop that NXN is enabled in HIGH QUALITY fior CU 8x8*/
2544                                         ps_intra8_analyse->au1_4x4_best_modes[j][0] = 0;
2545                                     }
2546 
2547                                     ps_intra8_analyse
2548                                         ->au1_4x4_best_modes[j][MAX_INTRA_CU_CANDIDATES] = 255;
2549                                 }
2550 
2551                                 //ps_row_cu++;
2552                             }
2553                             else
2554                             {
2555                                 /* For Incomplete CTB, 16x16 is not valid */
2556                                 ps_intra16_analyse->b1_valid_cu = 0;
2557                             }
2558                             blk_cnt++;
2559                             ps_ed_blk_l1++;
2560                         }
2561                         //ps_ed_blk_l2 ++;
2562                     }  //else of EIID
2563 #endif
2564                 }
2565             }
2566             else
2567             {
2568                 /* For incomplete CTB, init valid CU to 0 */
2569                 ps_ed_blk_l1++;
2570                 ps_intra32_analyse->b1_valid_cu = 0;
2571                 ps_intra16_analyse[0].b1_valid_cu = 0;
2572                 blk_cnt++;
2573                 merge_64x64 = 0;
2574             }
2575         } while(blk_cnt != MAX_CTB_SIZE);
2576         /* if 64x64 merge is possible then check for 32x32 having same best modes */
2577         if(1 == merge_64x64)
2578         {
2579             WORD32 act_mode = au1_best_32x32_modes[0];
2580 
2581             ps_ed_blk_l2 = ps_ed_l2_ctb;
2582             best_mode = ps_ed_blk_l2->best_mode;
2583             merge_64x64 =
2584                 ((act_mode == au1_best_32x32_modes[0]) + (act_mode == au1_best_32x32_modes[1]) +
2585                      (act_mode == au1_best_32x32_modes[2]) +
2586                      (act_mode == au1_best_32x32_modes[3]) ==
2587                  4);
2588             if(merge_64x64 == 1)
2589                 best_mode = au1_best_32x32_modes[0];
2590             else
2591                 best_mode = ps_ed_blk_l2->best_mode;
2592             /* All 32x32 costs are accumalated to 64x64 cost */
2593             ps_l0_ipe_out_ctb->i4_best64x64_intra_cost = 0;
2594             for(i = 0; i < 4; i++)
2595             {
2596                 ps_l0_ipe_out_ctb->i4_best64x64_intra_cost +=
2597                     ps_l0_ipe_out_ctb->ai4_best32x32_intra_cost[i];
2598             }
2599 
2600             /* If all modes of 32x32 block is not same */
2601             if(0 == merge_64x64)
2602             {
2603                 /*Compute CHILD cost for 32x32 */
2604                 WORD32 child_cost_64x64 = au4_best_32x32_cost[0] + au4_best_32x32_cost[1] +
2605                                           au4_best_32x32_cost[2] + au4_best_32x32_cost[3];
2606                 WORD32 cost = MAX_INTRA_COST_IPE;
2607 
2608                 WORD32 best_mode_temp = 0;
2609                 /*Compute 64x64 cost for each mode of 32x32*/
2610                 for(i = 0; i < 4; i++)
2611                 {
2612                     WORD32 mode = au1_best_32x32_modes[i];
2613                     if(mode < 2)
2614                         mode = 26;
2615                     ps_cu_node->ps_parent->u1_cu_size = 64;
2616                     ps_cu_node->ps_parent->u2_x0 = gau1_cu_pos_x[0]; /* Populate properly */
2617                     ps_cu_node->ps_parent->u2_y0 = gau1_cu_pos_y[0]; /* Populate properly */
2618 
2619                     ihevce_set_nbr_map(
2620                         ps_ctxt->pu1_ctb_nbr_map,
2621                         ps_ctxt->i4_nbr_map_strd,
2622                         (ps_cu_node->ps_parent->u2_x0 << 1),
2623                         (ps_cu_node->ps_parent->u2_y0 << 1),
2624                         (ps_cu_node->ps_parent->u1_cu_size >> 2),
2625                         0);
2626 
2627                     ihevce_mode_eval_filtering(
2628                         ps_cu_node->ps_parent,
2629                         ps_cu_node,
2630                         ps_ctxt,
2631                         ps_curr_src,
2632                         mode,
2633                         &ps_cu_node->ps_parent->au4_best_cost_1tu[0],
2634                         &ps_cu_node->ps_parent->au1_best_mode_1tu[0],
2635                         !step2_bypass,
2636                         0);
2637 
2638                     parent_cost = ps_cu_node->ps_parent->best_cost;
2639                     if(cost > parent_cost)
2640                     {
2641                         cost = parent_cost;
2642                         best_mode_temp = ps_cu_node->ps_parent->best_mode;
2643                     }
2644                 }
2645                 if(cost < child_cost_64x64)
2646                 {
2647                     merge_64x64 = 1;
2648                     best_mode = best_mode_temp;
2649 
2650                     /* Update 64x64 cost if CU 64x64 is chosen  */
2651                     ps_l0_ipe_out_ctb->i4_best64x64_intra_cost = cost;
2652 
2653                     /* Accumalate the least cost for CU 64x64 */
2654                     i8_frame_acc_satd_cost = cost;
2655                     i8_frame_acc_mode_bits_cost = ps_cu_node->ps_parent->u2_mode_bits_cost;
2656 
2657                     /* satd and mpm bits accumalation of best cu size candiate */
2658                     i4_ctb_acc_satd = ps_cu_node->ps_parent->best_satd;
2659                 }
2660             }
2661         }
2662 
2663         if(merge_64x64)
2664         {
2665             WORD32 i, j;
2666             intra32_analyse_t *ps_intra32_analyse;
2667             intra16_analyse_t *ps_intra16_analyse;
2668             WORD32 row, col;
2669             WORD32 i4_q_scale_q3_mod;
2670             WORD8 i1_cu_possible_qp;
2671             WORD32 i4_act_factor;
2672             //ps_row_cu = ps_curr_cu;
2673             ps_ctb_out->u4_cu_split_flags = 0x0;
2674             ps_ed_blk_l1 = ps_ed_l1_ctb;
2675             ps_ed_blk_l2 = ps_ed_l2_ctb;
2676 
2677             ps_l0_ipe_out_ctb->u1_split_flag = 0;
2678 
2679             /* If CU size of 64x64 is chosen, disbale all the 16x16 flag*/
2680             for(i = 0; i < 4; i++)
2681             {
2682                 /* get the corresponding intra 32 analyse pointer  use (blk_cnt / 16) */
2683                 /* blk cnt is in terms of 8x8 units so a 32x32 will have 16 8x8 units */
2684                 ps_intra32_analyse = &ps_l0_ipe_out_ctb->as_intra32_analyse[i];
2685 
2686                 for(j = 0; j < 4; j++)
2687                 {
2688                     /* get the corresponding intra 16 analyse pointer use (blk_cnt & 0xF / 4)*/
2689                     /* blk cnt is in terms of 8x8 units so a 16x16 will have 4 8x8 units */
2690                     ps_intra16_analyse = &ps_intra32_analyse->as_intra16_analyse[j];
2691                     ps_intra16_analyse->b1_merge_flag = 0;
2692                 }
2693             }
2694 
2695             /* CU size 64x64 and fill the final cu params */
2696             //ps_row_cu->b3_cu_pos_x = gau1_cu_pos_x[0];
2697             //ps_row_cu->b3_cu_pos_y = gau1_cu_pos_y[0];
2698             //ps_row_cu->u1_cu_size  = 64;
2699 
2700             /* Candidate mode Update */
2701             cand_mode_list[0] = best_mode;
2702             if(cand_mode_list[0] > 1)
2703             {
2704                 if(cand_mode_list[0] == 2)
2705                 {
2706                     cand_mode_list[1] = 34;
2707                     cand_mode_list[2] = 3;
2708                 }
2709                 else if(cand_mode_list[0] == 34)
2710                 {
2711                     cand_mode_list[1] = 2;
2712                     cand_mode_list[2] = 33;
2713                 }
2714                 else
2715                 {
2716                     cand_mode_list[1] = cand_mode_list[0] - 1;
2717                     cand_mode_list[2] = cand_mode_list[0] + 1;
2718                 }
2719                 //cand_mode_list[1] = ps_ed_blk_l1->nang_attr.best_mode;
2720                 //cand_mode_list[2] = ps_ed_blk_l1->ang_attr.best_mode;
2721             }
2722             else
2723             {
2724                 cand_mode_list[0] = 0;
2725                 cand_mode_list[1] = 1;
2726                 cand_mode_list[2] = 26;
2727                 //cand_mode_list[2] = ps_ed_blk_l1->nang_attr.best_mode;
2728             }
2729 
2730             /* All 32x32 costs are accumalated to 64x64 cost */
2731             ps_l0_ipe_out_ctb->i4_best64x64_intra_cost = 0;
2732             for(i = 0; i < 4; i++)
2733             {
2734                 ps_l0_ipe_out_ctb->i4_best64x64_intra_cost +=
2735                     ps_l0_ipe_out_ctb->ai4_best32x32_intra_cost[i];
2736             }
2737             /* by default 64x64 modes are set to default values DC and Planar */
2738             ps_l0_ipe_out_ctb->au1_best_modes_32x32_tu[0] = cand_mode_list[0];
2739             ps_l0_ipe_out_ctb->au1_best_modes_32x32_tu[1] = cand_mode_list[1];
2740             ps_l0_ipe_out_ctb->au1_best_modes_32x32_tu[2] = cand_mode_list[2];
2741             ps_l0_ipe_out_ctb->au1_best_modes_32x32_tu[3] = 255;
2742 
2743             /* Update CTB mode map for the finalised CU */
2744             x = ((ps_cu_node->u2_x0 << 3) >> 2) + 1;
2745             y = ((ps_cu_node->u2_y0 << 3) >> 2) + 1;
2746             size = ps_cu_node->u1_cu_size >> 2;
2747 
2748             for(row = y; row < (y + size); row++)
2749             {
2750                 for(col = x; col < (x + size); col++)
2751                 {
2752                     ps_ctxt->au1_ctb_mode_map[row][col] = best_mode;
2753                 }
2754             }
2755 
2756             ihevce_set_nbr_map(
2757                 ps_ctxt->pu1_ctb_nbr_map,
2758                 ps_ctxt->i4_nbr_map_strd,
2759                 (ps_cu_node->u2_x0 << 1),
2760                 (ps_cu_node->u2_y0 << 1),
2761                 (ps_cu_node->u1_cu_size >> 2),
2762                 1);
2763 
2764             /*As 64*64 has won, pick L1 32x32 qp*/
2765             //ASSERT(((blk_cnt>>6) & 0xF) == (blk_cnt>>6));
2766             //ASSERT((blk_cnt>>6) == 0);
2767             ASSERT(ps_ed_ctb_l1->i4_32x32_satd[0][0] != -2);
2768             i1_cu_possible_qp = ihevce_cu_level_qp_mod(
2769                 ps_ctxt->i4_qscale,
2770                 ps_ed_ctb_l1->i4_32x32_satd[0][0],
2771                 ps_ctxt->ld_curr_frame_32x32_log_avg[0],
2772                 f_strength,
2773                 &i4_act_factor,
2774                 &i4_q_scale_q3_mod,
2775                 ps_ctxt->ps_rc_quant_ctxt);
2776 
2777             i8_frame_acc_satd_by_modqp_q10 =
2778                 (i8_frame_acc_satd_cost << (SATD_BY_ACT_Q_FAC + QSCALE_Q_FAC_3)) /
2779                 i4_q_scale_q3_mod;
2780             /* Increment pointers */
2781             ps_ed_blk_l1 += 64;
2782             ps_ed_blk_l2 += 16;
2783             //ps_row_cu++;
2784         }
2785     }
2786 
2787     //ps_ctb_out->u1_num_cus_in_ctb = (UWORD8)(ps_row_cu - ps_curr_cu);
2788 
2789     {
2790         WORD32 i4_i, i4_j;
2791         WORD32 dummy;
2792         WORD8 i1_cu_qp;
2793         (void)i1_cu_qp;
2794         /*MAM_VAR_L1*/
2795         for(i4_j = 0; i4_j < 2; i4_j++)
2796         {
2797             i4_mod_factor_num = ps_ctxt->ai4_mod_factor_derived_by_variance[i4_j];
2798             f_strength = ps_ctxt->f_strength;
2799 
2800             //i4_mod_factor_num = 4;
2801 
2802             ps_ed_blk_l1 = ps_ed_l1_ctb;
2803             ps_ed_blk_l2 = ps_ed_l2_ctb;
2804             //ps_row_cu = ps_curr_cu;
2805 
2806             /*Valid only for complete CTB */
2807             if((64 == u1_curr_ctb_wdt) && (64 == u1_curr_ctb_hgt))
2808             {
2809                 ASSERT(ps_ed_ctb_l1->i4_32x32_satd[0][0] != -2);
2810                 ASSERT(ps_ed_ctb_l1->i4_32x32_satd[0][1] != -2);
2811                 ASSERT(ps_ed_ctb_l1->i4_32x32_satd[0][2] != -2);
2812                 ASSERT(ps_ed_ctb_l1->i4_32x32_satd[0][3] != -2);
2813 
2814                 i1_cu_qp = ihevce_cu_level_qp_mod(
2815                     ps_ctxt->i4_qscale,
2816                     ps_ed_ctb_l1->i4_32x32_satd[0][0],
2817                     ps_ctxt->ld_curr_frame_32x32_log_avg[0],
2818                     f_strength,
2819                     &ps_l0_ipe_out_ctb->i4_64x64_act_factor[0][i4_j],
2820                     &dummy,
2821                     ps_ctxt->ps_rc_quant_ctxt);
2822 
2823                 i1_cu_qp = ihevce_cu_level_qp_mod(
2824                     ps_ctxt->i4_qscale,
2825                     ps_ed_ctb_l1->i4_32x32_satd[0][1],
2826                     ps_ctxt->ld_curr_frame_32x32_log_avg[1],
2827                     f_strength,
2828                     &ps_l0_ipe_out_ctb->i4_64x64_act_factor[1][i4_j],
2829                     &dummy,
2830                     ps_ctxt->ps_rc_quant_ctxt);
2831                 i1_cu_qp = ihevce_cu_level_qp_mod(
2832                     ps_ctxt->i4_qscale,
2833                     ps_ed_ctb_l1->i4_32x32_satd[0][2],
2834                     ps_ctxt->ld_curr_frame_32x32_log_avg[2],
2835                     f_strength,
2836                     &ps_l0_ipe_out_ctb->i4_64x64_act_factor[2][i4_j],
2837                     &dummy,
2838                     ps_ctxt->ps_rc_quant_ctxt);
2839 
2840                 i1_cu_qp = ihevce_cu_level_qp_mod(
2841                     ps_ctxt->i4_qscale,
2842                     ps_ed_ctb_l1->i4_32x32_satd[0][3],
2843                     2.0 + ps_ctxt->ld_curr_frame_16x16_log_avg[0],
2844                     f_strength,
2845                     &ps_l0_ipe_out_ctb->i4_64x64_act_factor[3][i4_j],
2846                     &dummy,
2847                     ps_ctxt->ps_rc_quant_ctxt);
2848 
2849                 ASSERT(ps_l0_ipe_out_ctb->i4_64x64_act_factor[3][i4_j] > 0);
2850             }
2851             else
2852             {
2853                 ps_l0_ipe_out_ctb->i4_64x64_act_factor[0][i4_j] = 1024;
2854                 ps_l0_ipe_out_ctb->i4_64x64_act_factor[1][i4_j] = 1024;
2855                 ps_l0_ipe_out_ctb->i4_64x64_act_factor[2][i4_j] = 1024;
2856                 ps_l0_ipe_out_ctb->i4_64x64_act_factor[3][i4_j] = 1024;
2857             }
2858 
2859             /*Store the 8x8 Qps from L2 (in raster order) as output of intra prediction
2860             for the usage by ME*/
2861 
2862             {
2863                 WORD32 pos_x_32, pos_y_32, pos;
2864                 //WORD32 i4_incomplete_ctb_val_8;
2865                 pos_x_32 = u1_curr_ctb_wdt / 16;
2866                 pos_y_32 = u1_curr_ctb_hgt / 16;
2867 
2868                 pos = (pos_x_32 < pos_y_32) ? pos_x_32 : pos_y_32;
2869 
2870                 for(i4_i = 0; i4_i < 4; i4_i++)
2871                 {
2872                     if(i4_i < pos)
2873                     {
2874                         ASSERT(ps_ed_ctb_l1->i4_16x16_satd[i4_i][0] != -2);
2875                         ASSERT(ps_ed_ctb_l1->i4_16x16_satd[i4_i][1] != -2);
2876                         ASSERT(ps_ed_ctb_l1->i4_16x16_satd[i4_i][2] != -2);
2877                         i1_cu_qp = ihevce_cu_level_qp_mod(
2878                             ps_ctxt->i4_qscale,
2879                             ps_ed_ctb_l1->i4_16x16_satd[i4_i][0],
2880                             ps_ctxt->ld_curr_frame_16x16_log_avg[0],
2881                             f_strength,
2882                             &ps_l0_ipe_out_ctb->i4_32x32_act_factor[i4_i][0][i4_j],
2883                             &dummy,
2884                             ps_ctxt->ps_rc_quant_ctxt);
2885                         i1_cu_qp = ihevce_cu_level_qp_mod(
2886                             ps_ctxt->i4_qscale,
2887                             ps_ed_ctb_l1->i4_16x16_satd[i4_i][1],
2888                             ps_ctxt->ld_curr_frame_16x16_log_avg[1],
2889                             f_strength,
2890                             &ps_l0_ipe_out_ctb->i4_32x32_act_factor[i4_i][1][i4_j],
2891                             &dummy,
2892                             ps_ctxt->ps_rc_quant_ctxt);
2893                         i1_cu_qp = ihevce_cu_level_qp_mod(
2894                             ps_ctxt->i4_qscale,
2895                             ps_ed_ctb_l1->i4_16x16_satd[i4_i][2],
2896                             ps_ctxt->ld_curr_frame_16x16_log_avg[2],
2897                             f_strength,
2898                             &ps_l0_ipe_out_ctb->i4_32x32_act_factor[i4_i][2][i4_j],
2899                             &dummy,
2900                             ps_ctxt->ps_rc_quant_ctxt);
2901                     }
2902                     else
2903                     {
2904                         /*For incomplete CTB */
2905                         ps_l0_ipe_out_ctb->i4_32x32_act_factor[i4_i][0][i4_j] = 1024;
2906                         ps_l0_ipe_out_ctb->i4_32x32_act_factor[i4_i][1][i4_j] = 1024;
2907                         ps_l0_ipe_out_ctb->i4_32x32_act_factor[i4_i][2][i4_j] = 1024;
2908                     }
2909                 }
2910             }
2911 
2912             /*Store the 8x8 Qps from L1 (in raster order) as output of intra prediction
2913             for the usage by ME*/
2914             {
2915                 WORD32 pos_x_16, pos_y_16, pos;
2916                 //WORD32 i4_incomplete_ctb_val_8;
2917                 pos_x_16 = u1_curr_ctb_wdt / 4;
2918                 pos_y_16 = u1_curr_ctb_hgt / 4;
2919 
2920                 pos = (pos_x_16 < pos_y_16) ? pos_x_16 : pos_y_16;
2921                 for(i4_i = 0; i4_i < 16; i4_i++)
2922                 {
2923                     if(i4_i < pos)
2924                     {
2925                         ASSERT(ps_ed_ctb_l1->i4_8x8_satd[i4_i][0] != -2);
2926                         ASSERT(ps_ed_ctb_l1->i4_8x8_satd[i4_i][1] != -2);
2927                         i1_cu_qp = ihevce_cu_level_qp_mod(
2928                             ps_ctxt->i4_qscale,
2929                             ps_ed_ctb_l1->i4_8x8_satd[i4_i][0],
2930                             ps_ctxt->ld_curr_frame_8x8_log_avg[0],
2931                             f_strength,
2932                             &ps_l0_ipe_out_ctb->i4_16x16_act_factor[i4_i][0][i4_j],
2933                             &dummy,
2934                             ps_ctxt->ps_rc_quant_ctxt);
2935                         i1_cu_qp = ihevce_cu_level_qp_mod(
2936                             ps_ctxt->i4_qscale,
2937                             ps_ed_ctb_l1->i4_8x8_satd[i4_i][1],
2938                             ps_ctxt->ld_curr_frame_8x8_log_avg[1],
2939                             f_strength,
2940                             &ps_l0_ipe_out_ctb->i4_16x16_act_factor[i4_i][1][i4_j],
2941                             &dummy,
2942                             ps_ctxt->ps_rc_quant_ctxt);
2943                     }
2944                     else
2945                     {
2946                         /*For incomplete CTB */
2947                         ps_l0_ipe_out_ctb->i4_16x16_act_factor[i4_i][0][i4_j] = 1024;
2948                         ps_l0_ipe_out_ctb->i4_16x16_act_factor[i4_i][1][i4_j] = 1024;
2949                     }
2950                 }
2951             }
2952         }  //for loop
2953 
2954         /* Accumalate the cost of ctb to the total cost */
2955         ps_ctxt->i8_frame_acc_satd_cost += i8_frame_acc_satd_cost;
2956         ps_ctxt->i8_frame_acc_satd_by_modqp_q10 += i8_frame_acc_satd_by_modqp_q10;
2957 
2958         ps_ctxt->i8_frame_acc_mode_bits_cost += i8_frame_acc_mode_bits_cost;
2959 
2960         /* satd and mpm bits accumalation of best cu size candiate for the ctb */
2961         ps_l0_ipe_out_ctb->i4_ctb_acc_satd = i4_ctb_acc_satd;
2962         ps_l0_ipe_out_ctb->i4_ctb_acc_mpm_bits = i8_frame_acc_mode_bits_cost;
2963 
2964         ps_ctxt->i8_frame_acc_satd += i4_ctb_acc_satd;
2965     }
2966 
2967     {
2968         WORD32 ctr_8x8;
2969         for(ctr_8x8 = 0; ctr_8x8 < (MAX_CU_IN_CTB >> 2); ctr_8x8++)
2970         {
2971             /*Accumalate activity factor for Intra and Inter*/
2972             if(ps_l0_ipe_out_ctb->ai4_best_sad_cost_8x8_l1_ipe[ctr_8x8] <
2973                ps_ed_ctb_l1->i4_sad_me_for_ref[ctr_8x8])
2974             {
2975                 ps_l0_ipe_out_ctb->ai4_8x8_act_factor[ctr_8x8] =
2976                     ps_l0_ipe_out_ctb->i4_16x16_act_factor[ctr_8x8][1][0];
2977             }
2978             else
2979             {
2980                 ps_l0_ipe_out_ctb->ai4_8x8_act_factor[ctr_8x8] =
2981                     ps_l0_ipe_out_ctb->i4_16x16_act_factor[ctr_8x8][1][0];
2982             }
2983 
2984             /*Accumalate activity factor at frame level*/
2985             ps_ctxt->i8_frame_acc_act_factor += ps_l0_ipe_out_ctb->ai4_8x8_act_factor[ctr_8x8];
2986         }
2987     }
2988     return;
2989 }
2990 
/*!
******************************************************************************
* \if Function name : ihevce_nxn_sad_computer \endif
*
* \brief
*    Computes the sum of absolute differences (SAD) between a square
*    trans_size x trans_size block of input samples and the block of
*    reference samples at the same offsets
*
* \param[in] pu1_inp : pointer to the first sample of the input block
* \param[in] i4_inp_stride : offset added to the input pointer per row
* \param[in] pu1_ref : pointer to the first sample of the reference block
* \param[in] i4_ref_stride : offset added to the reference pointer per row
* \param[in] trans_size : number of rows and number of columns compared
*
* \return
*    Accumulated SAD (0 when trans_size is not positive)
*
*****************************************************************************
*/
WORD32 ihevce_nxn_sad_computer(
    UWORD8 *pu1_inp, WORD32 i4_inp_stride, UWORD8 *pu1_ref, WORD32 i4_ref_stride, WORD32 trans_size)
{
    /* Row cursors; each is advanced by its own stride after a row is done */
    UWORD8 *pu1_inp_row = pu1_inp;
    UWORD8 *pu1_ref_row = pu1_ref;
    WORD32 i4_sad_acc = 0;
    WORD32 row, col;

    for(row = 0; row < trans_size; row++)
    {
        for(col = 0; col < trans_size; col++)
        {
            /* Widen to signed before subtracting so the difference can be negative */
            WORD32 i4_diff = (WORD32)pu1_inp_row[col] - (WORD32)pu1_ref_row[col];

            i4_sad_acc += ABS(i4_diff);
        }

        pu1_inp_row += i4_inp_stride;
        pu1_ref_row += i4_ref_stride;
    }

    return i4_sad_acc;
}
3012 
3013 /*!
3014 ******************************************************************************
3015 * \if Function name : ihevce_mode_eval_filtering \endif
3016 *
3017 * \brief
3018 *    Evaluates best 3 modes for the given CU size with probable modes from,
3019 *    early decision structure, mpm candidates and dc, planar mode
3020 *
3021 * \param[in] ps_cu_node : pointer to MAX cu node info buffer
3022 * \param[in] ps_child_cu_node : pointer to (MAX - 1) cu node info buffer
3023 * \param[in] ps_ctxt : pointer to IPE context struct
3024 * \param[in] ps_curr_src : pointer to src pixels struct
3025 * \param[in] best_amode : best angular mode from l1 layer or
3026                             from (MAX - 1) CU mode
3027 * \param[in] best_costs_4x4  : pointer to 3 best cost buffer
3028 * \param[in] best_modes_4x4  : pointer to 3 best mode buffer
3029 * \param[in] step2_bypass : if 0, (MAX - 1) CU is evaluated
*                           if 1, (MAX CU) suggested is evaluated
3031 * \param[in] tu_eq_cu     : indicates if tu size is same as cu or cu/2
3032 *
3033 * \return
3034 *    None
3035 *
3036 * \author
3037 *  Ittiam
3038 *
3039 *****************************************************************************
3040 */
ihevce_mode_eval_filtering(ihevce_ipe_cu_tree_t * ps_cu_node,ihevce_ipe_cu_tree_t * ps_child_cu_node,ihevce_ipe_ctxt_t * ps_ctxt,iv_enc_yuv_buf_t * ps_curr_src,WORD32 best_amode,WORD32 * best_costs_4x4,UWORD8 * best_modes_4x4,WORD32 step2_bypass,WORD32 tu_eq_cu)3041 void ihevce_mode_eval_filtering(
3042     ihevce_ipe_cu_tree_t *ps_cu_node,
3043     ihevce_ipe_cu_tree_t *ps_child_cu_node,
3044     ihevce_ipe_ctxt_t *ps_ctxt,
3045     iv_enc_yuv_buf_t *ps_curr_src,
3046     WORD32 best_amode,
3047     WORD32 *best_costs_4x4,
3048     UWORD8 *best_modes_4x4,
3049     WORD32 step2_bypass,
3050     WORD32 tu_eq_cu)
3051 {
3052     UWORD8 *pu1_origin, *pu1_orig;
3053     WORD32 src_strd = ps_curr_src->i4_y_strd;
3054     WORD32 nbr_flags;
3055     nbr_avail_flags_t s_nbr;
3056     WORD32 trans_size = tu_eq_cu ? ps_cu_node->u1_cu_size : ps_cu_node->u1_cu_size >> 1;
3057     WORD32 num_tu_in_x = tu_eq_cu ? 1 : 2;
3058     WORD32 num_tu_in_y = tu_eq_cu ? 1 : 2;
3059     UWORD8 mode;
3060 
3061     WORD32 cost_ang_mode = MAX_INTRA_COST_IPE;
3062     WORD32 filter_flag;
3063     WORD32 cost_amode_step2[7] = { 0 };
3064     /*WORD32 best_sad[5];  // NOTE_A01: Not getting consumed at present */
3065     WORD32 sad = 0;
3066     WORD32 cu_pos_x, cu_pos_y;
3067     WORD32 temp;
3068     WORD32 i = 0, j, k, i_end, z;
3069     //WORD32 row, col, size;
3070     UWORD8 *pu1_ref;
3071     WORD32 xA, yA, xB, yB;
3072     WORD32 top_intra_mode;
3073     WORD32 left_intra_mode;
3074     UWORD8 *pu1_ref_orig = &ps_ctxt->au1_ref_samples[0];
3075     UWORD8 *pu1_ref_filt = &ps_ctxt->au1_filt_ref_samples[0];
3076 
3077     UWORD8 modes_4x4[5] = { 0, 1, 2, 3, 4 };
3078     WORD32 count;
3079 
3080     pf_ipe_res_trans_had apf_resd_trns_had[4];
3081 
3082     WORD32 cand_mode_satd_list[3];
3083     ihevc_intra_pred_luma_ref_substitution_ft *ihevc_intra_pred_luma_ref_substitution_fptr;
3084 
3085     ihevc_intra_pred_luma_ref_substitution_fptr =
3086         ps_ctxt->ps_func_selector->ihevc_intra_pred_luma_ref_substitution_fptr;
3087 
3088     apf_resd_trns_had[0] = ps_ctxt->s_cmn_opt_func.pf_HAD_4x4_8bit;
3089     apf_resd_trns_had[1] = ps_ctxt->s_cmn_opt_func.pf_HAD_8x8_8bit;
3090     apf_resd_trns_had[2] = ps_ctxt->s_cmn_opt_func.pf_HAD_16x16_8bit;
3091     apf_resd_trns_had[3] = ps_ctxt->s_cmn_opt_func.pf_HAD_32x32_8bit;
3092 
3093     /* initialize modes_to_eval as zero */
3094     memset(&ps_ctxt->au1_modes_to_eval, 0, MAX_NUM_IP_MODES);
3095 
3096     /* Compute the Parent Cost */
3097 
3098     /* Pointer to top-left of the CU - y0,x0 in 8x8 granularity */
3099     pu1_orig = (UWORD8 *)(ps_curr_src->pv_y_buf) + ((ps_cu_node->u2_y0 << 3) * src_strd) +
3100                (ps_cu_node->u2_x0 << 3);
3101 
3102     /* Get position of CU within CTB at 4x4 granularity */
3103     cu_pos_x = ps_cu_node->u2_x0 << 1;
3104     cu_pos_y = ps_cu_node->u2_y0 << 1;
3105 
3106     /* get the neighbour availability flags */
3107     ihevce_get_only_nbr_flag(
3108         &s_nbr,
3109         ps_ctxt->pu1_ctb_nbr_map,
3110         ps_ctxt->i4_nbr_map_strd,
3111         cu_pos_x,
3112         cu_pos_y,
3113         trans_size >> 2,
3114         trans_size >> 2);
3115 
3116     /* Traverse for all 4 child blocks in the parent block */
3117     xA = (ps_cu_node->u2_x0 << 3) >> 2;
3118     yA = ((ps_cu_node->u2_y0 << 3) >> 2) + 1;
3119     xB = xA + 1;
3120     yB = yA - 1;
3121     left_intra_mode = ps_ctxt->au1_ctb_mode_map[yA][xA];
3122     top_intra_mode = ps_ctxt->au1_ctb_mode_map[yB][xB];
3123     /* call the function which populates sad cost for all the modes */
3124 
3125     ihevce_intra_populate_mode_bits_cost_bracketing(
3126         top_intra_mode,
3127         left_intra_mode,
3128         s_nbr.u1_top_avail,
3129         s_nbr.u1_left_avail,
3130         ps_cu_node->u2_y0,
3131         &ps_ctxt->au2_mode_bits_satd_cost[0],
3132         &ps_ctxt->au2_mode_bits_satd[0],
3133         ps_ctxt->i4_ol_satd_lambda,
3134         cand_mode_satd_list);
3135 
3136     for(k = 0; k < num_tu_in_y; k++)
3137     {
3138         for(j = 0; j < num_tu_in_x; j++)
3139         {
3140             /* get the neighbour availability flags */
3141             nbr_flags = ihevce_get_nbr_intra(
3142                 &s_nbr,
3143                 ps_ctxt->pu1_ctb_nbr_map,
3144                 ps_ctxt->i4_nbr_map_strd,
3145                 cu_pos_x + ((j) * (trans_size >> 2)),
3146                 cu_pos_y + ((k) * (trans_size >> 2)),
3147                 trans_size >> 2);
3148 
3149             pu1_origin = pu1_orig + (k * trans_size * src_strd) + (j * trans_size);
3150 
3151             /* Create reference samples array */
3152             ihevc_intra_pred_luma_ref_substitution_fptr(
3153                 pu1_origin - src_strd - 1,
3154                 pu1_origin - src_strd,
3155                 pu1_origin - 1,
3156                 src_strd,
3157                 trans_size,
3158                 nbr_flags,
3159                 pu1_ref_orig,
3160                 0);
3161 
3162             /* Perform reference samples filtering */
3163             ihevce_intra_pred_ref_filtering(pu1_ref_orig, trans_size, pu1_ref_filt);
3164 
3165             ihevce_set_nbr_map(
3166                 ps_ctxt->pu1_ctb_nbr_map,
3167                 ps_ctxt->i4_nbr_map_strd,
3168                 cu_pos_x + ((j) * (trans_size >> 2)),
3169                 cu_pos_y + ((k) * (trans_size >> 2)),
3170                 (trans_size >> 2),
3171                 1);
3172 
3173             pu1_ref_orig += (4 * MAX_CTB_SIZE + 1);
3174             pu1_ref_filt += (4 * MAX_CTB_SIZE + 1);
3175         }
3176     }
3177 
3178     /* Revaluation for angular mode */
3179     //if(ps_ed_blk->ang_attr.mode_present == 1)
3180     //if(((best_amode & 0x1) != 1))
3181 
3182     {
3183         WORD32 u1_trans_idx = trans_size >> 3;
3184         if(trans_size == 32)
3185             u1_trans_idx = 3;
3186         //best_amode = ps_ed_blk->ang_attr.best_mode;
3187 
3188         i = 0;
3189         if(!step2_bypass)
3190         {
3191             /* Around best level 4 angular mode, search for best level 2 mode */
3192             ASSERT((best_amode >= 2) && (best_amode <= 34));
3193 
3194             if(ps_ctxt->i4_quality_preset <= IHEVCE_QUALITY_P3)
3195             {
3196                 if(best_amode >= 4)
3197                     ps_ctxt->au1_modes_to_eval_temp[i++] = best_amode - 2;
3198             }
3199 
3200             ps_ctxt->au1_modes_to_eval_temp[i++] = best_amode;
3201 
3202             if(ps_ctxt->i4_quality_preset <= IHEVCE_QUALITY_P3)
3203             {
3204                 if(best_amode <= 32)
3205                     ps_ctxt->au1_modes_to_eval_temp[i++] = best_amode + 2;
3206             }
3207         }
3208         else
3209         {
3210             ps_ctxt->au1_modes_to_eval_temp[i++] = ps_child_cu_node->ps_sub_cu[0]->best_mode;
3211             ps_ctxt->au1_modes_to_eval_temp[i++] = ps_child_cu_node->ps_sub_cu[1]->best_mode;
3212             ps_ctxt->au1_modes_to_eval_temp[i++] = ps_child_cu_node->ps_sub_cu[2]->best_mode;
3213             ps_ctxt->au1_modes_to_eval_temp[i++] = ps_child_cu_node->ps_sub_cu[3]->best_mode;
3214         }
3215 
3216         /* Add the left and top MPM modes for computation*/
3217 
3218         ps_ctxt->au1_modes_to_eval_temp[i++] = cand_mode_satd_list[0];
3219         ps_ctxt->au1_modes_to_eval_temp[i++] = cand_mode_satd_list[1];
3220 
3221         i_end = i;
3222         count = 0;
3223 
3224         /*Remove duplicate modes from modes_to_eval_temp[] */
3225         for(j = 0; j < i_end; j++)
3226         {
3227             for(k = 0; k < count; k++)
3228             {
3229                 if(ps_ctxt->au1_modes_to_eval_temp[j] == ps_ctxt->au1_modes_to_eval[k])
3230                     break;
3231             }
3232             if((k == count) && (ps_ctxt->au1_modes_to_eval_temp[j] > 1))
3233             {
3234                 ps_ctxt->au1_modes_to_eval[count] = ps_ctxt->au1_modes_to_eval_temp[j];
3235                 count++;
3236             }
3237         }
3238         i_end = count;
3239         if(count == 0)
3240         {
3241             ps_ctxt->au1_modes_to_eval[0] = 26;
3242             i_end = 1;
3243         }
3244 
3245         for(i = 0; i < i_end; i++)
3246         {
3247             pu1_ref_orig = &ps_ctxt->au1_ref_samples[0];
3248             pu1_ref_filt = &ps_ctxt->au1_filt_ref_samples[0];
3249 
3250             mode = ps_ctxt->au1_modes_to_eval[i];
3251             ASSERT((mode >= 2) && (mode <= 34));
3252             cost_amode_step2[i] = ps_ctxt->au2_mode_bits_satd_cost[mode];
3253             filter_flag = gau1_intra_pred_ref_filter[mode] & (1 << (CTZ(trans_size) - 2));
3254 
3255             for(k = 0; k < num_tu_in_y; k++)
3256             {
3257                 for(j = 0; j < num_tu_in_x; j++)
3258                 {
3259                     pu1_origin = pu1_orig + (k * trans_size * src_strd) + (j * trans_size);
3260 
3261                     if(0 == filter_flag)
3262                         pu1_ref = pu1_ref_orig;
3263                     else
3264                         pu1_ref = pu1_ref_filt;
3265 
3266                     g_apf_lum_ip[g_i4_ip_funcs[mode]](
3267                         pu1_ref, 0, &ps_ctxt->au1_pred_samples[0], trans_size, trans_size, mode);
3268 
3269                     if(ps_ctxt->u1_use_satd)
3270                     {
3271                         sad = apf_resd_trns_had[u1_trans_idx](
3272                             pu1_origin,
3273                             ps_curr_src->i4_y_strd,
3274                             &ps_ctxt->au1_pred_samples[0],
3275                             trans_size,
3276                             NULL,
3277                             0
3278 
3279                         );
3280                     }
3281                     else
3282                     {
3283                         sad = ps_ctxt->s_ipe_optimised_function_list.pf_nxn_sad_computer(
3284                             pu1_origin,
3285                             ps_curr_src->i4_y_strd,
3286                             &ps_ctxt->au1_pred_samples[0],
3287                             trans_size,
3288                             trans_size);
3289                     }
3290 
3291                     cost_amode_step2[i] += sad;
3292 
3293                     pu1_ref_orig += (4 * MAX_CTB_SIZE + 1);
3294                     pu1_ref_filt += (4 * MAX_CTB_SIZE + 1);
3295                 }
3296             }
3297         }
3298         best_amode = ps_ctxt->au1_modes_to_eval[0];
3299         /*Init cost indx */
3300         cost_ang_mode = MAX_INTRA_COST_IPE;  //cost_amode_step2[0];
3301         for(z = 0; z < i_end; z++)
3302         {
3303             /* Least cost of all 3 angles are stored in cost_amode_step2[0] and corr. mode*/
3304             if(cost_ang_mode >= cost_amode_step2[z])
3305             {
3306                 if(cost_ang_mode == cost_amode_step2[z])
3307                 {
3308                     if(best_amode > ps_ctxt->au1_modes_to_eval[z])
3309                         best_amode = ps_ctxt->au1_modes_to_eval[z];
3310                 }
3311                 else
3312                 {
3313                     best_amode = ps_ctxt->au1_modes_to_eval[z];
3314                 }
3315                 cost_ang_mode = cost_amode_step2[z];
3316             }
3317         }
3318 
3319         /*Modify mode bits for the angular modes */
3320     }
3321 
3322     {
3323         /* Step - I modification */
3324         ASSERT((best_amode >= 2) && (best_amode <= 34));
3325         i_end = 0;
3326         z = 0;
3327 
3328         /* Around best level 3 angular mode, search for best level 1 mode */
3329         ps_ctxt->au1_modes_to_eval[i_end++] = 0;
3330         ps_ctxt->au1_modes_to_eval[i_end++] = 1;
3331 
3332         if(best_amode != 2)
3333             ps_ctxt->au1_modes_to_eval[i_end++] = best_amode - 1;
3334 
3335         ps_ctxt->au1_modes_to_eval[i_end++] = best_amode;
3336 
3337         if(best_amode != 34)
3338             ps_ctxt->au1_modes_to_eval[i_end++] = best_amode + 1;
3339 
3340         /* Inserting step_2's best mode last to avoid
3341         recalculation of its SATD cost */
3342 
3343         //ps_ctxt->au1_modes_to_eval[i_end] = best_amode; //Bugfix: HSAD compared with SAD
3344         //cost_amode_step2[i_end] = cost_ang_mode;
3345 
3346         /*best_sad[i_end] = cost_ang_mode
3347                 - mode_bits_satd_cost[best_amode]; //See NOTE_A01 above */
3348 
3349         cost_ang_mode = MAX_INTRA_COST_IPE; /* Init cost */
3350 
3351         for(i = 0; i < i_end; i++)
3352         {
3353             WORD32 u1_trans_idx = trans_size >> 3;
3354             if(trans_size == 32)
3355                 u1_trans_idx = 3;
3356             pu1_ref_orig = &ps_ctxt->au1_ref_samples[0];
3357             pu1_ref_filt = &ps_ctxt->au1_filt_ref_samples[0];
3358 
3359             /*best_sad[i] = 0; //See NOTE_A01 above */
3360             mode = ps_ctxt->au1_modes_to_eval[i];
3361             cost_amode_step2[i] = ps_ctxt->au2_mode_bits_satd_cost[mode];
3362             filter_flag = gau1_intra_pred_ref_filter[mode] & (1 << (CTZ(trans_size) - 2));
3363 
3364             for(k = 0; k < num_tu_in_y; k++)
3365             {
3366                 for(j = 0; j < num_tu_in_x; j++)
3367                 {
3368                     pu1_origin = pu1_orig + (k * trans_size * src_strd) + (j * trans_size);
3369 
3370                     if(0 == filter_flag)
3371                         pu1_ref = pu1_ref_orig;
3372                     else
3373                         pu1_ref = pu1_ref_filt;
3374 
3375                     g_apf_lum_ip[g_i4_ip_funcs[mode]](
3376                         pu1_ref, 0, &ps_ctxt->au1_pred_samples[0], trans_size, trans_size, mode);
3377 
3378                     //if(trans_size != 4)
3379                     {
3380                         sad = apf_resd_trns_had[u1_trans_idx](
3381                             pu1_origin,
3382                             ps_curr_src->i4_y_strd,
3383                             &ps_ctxt->au1_pred_samples[0],
3384                             trans_size,
3385                             NULL,
3386                             0);
3387                     }
3388 
3389                     /* accumulating SATD, even though the variable is named sad */
3390                     cost_amode_step2[i] += sad;
3391                     /*best_sad[i] +=sad; //See NOTE_A01 above */
3392                     pu1_ref_orig += (4 * MAX_CTB_SIZE + 1);
3393                     pu1_ref_filt += (4 * MAX_CTB_SIZE + 1);
3394                 }
3395             }
3396         }
3397         /* Updating i_end for the step_2's inserted mode*/
3398         //        i_end++;
3399 
3400         /* Arrange the reference array in ascending order */
3401 
3402         for(i = 0; i < (i_end - 1); i++)
3403         {
3404             for(j = i + 1; j < i_end; j++)
3405             {
3406                 if(cost_amode_step2[i] > cost_amode_step2[j])
3407                 {
3408                     temp = cost_amode_step2[i];
3409                     cost_amode_step2[i] = cost_amode_step2[j];
3410                     cost_amode_step2[j] = temp;
3411 
3412                     temp = modes_4x4[i];
3413                     modes_4x4[i] = modes_4x4[j];
3414                     modes_4x4[j] = temp;
3415                 }
3416             }
3417         }
3418 
3419         /* Least cost of all 3 angles are stored in cost_amode_step2[0] and corr. mode*/
3420         best_amode = ps_ctxt->au1_modes_to_eval[modes_4x4[0]];
3421         cost_ang_mode = cost_amode_step2[0];
3422         ps_cu_node->best_satd = cost_ang_mode - ps_ctxt->au2_mode_bits_satd_cost[best_amode];
3423         ps_cu_node->best_cost = cost_amode_step2[0];
3424         ps_cu_node->best_mode = ps_ctxt->au1_modes_to_eval[modes_4x4[0]];
3425         ps_cu_node->best_satd =
3426             ps_cu_node->best_cost - ps_ctxt->au2_mode_bits_satd_cost[ps_cu_node->best_mode];
3427 
3428         /* Accumulate best mode bits cost for RC */
3429         ps_cu_node->u2_mode_bits_cost = ps_ctxt->au2_mode_bits_satd[ps_cu_node->best_mode];
3430 
3431         /* Store the best three candidates */
3432         for(i = 0; i < 3; i++)
3433         {
3434             best_costs_4x4[i] = cost_amode_step2[i];
3435             best_modes_4x4[i] = ps_ctxt->au1_modes_to_eval[modes_4x4[i]];
3436         }
3437     }
3438 
3439     return;
3440 }
3441