1 /******************************************************************************
2 *
3 * Copyright (C) 2018 The Android Open Source Project
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at:
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 *
17 *****************************************************************************
18 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19 */
20
21 /*!
22 ******************************************************************************
23 * \file ihevce_decomp_pre_intra_pass.c
24 *
25 * \brief
26 * This file contains definitions related to frame decomposition done during
27 * pre intra processing
28 *
29 * \date
30 * 19/02/2013
31 *
32 * \author
33 * Ittiam
34 *
35 * List of Functions
36 * ihevce_intra_populate_mode_bits_cost()
37 * ihevce_8x8_sad_computer()
38 * ihevce_4x4_sad_computer()
39 * ihevce_ed_4x4_find_best_modes()
40 * ihevce_ed_calc_4x4_blk()
41 * ihevce_ed_calc_8x8_blk()
42 * ihevce_ed_calc_incomplete_ctb()
43 * ihevce_cu_level_qp_mod()
44 * ihevce_ed_calc_ctb()
45 * ihevce_ed_frame_init()
46 * ihevce_scale_by_2()
47 * ihevce_decomp_pre_intra_process_row()
48 * ihevce_decomp_pre_intra_process()
49 * ihevce_decomp_pre_intra_get_num_mem_recs()
50 * ihevce_decomp_pre_intra_get_mem_recs()
51 * ihevce_decomp_pre_intra_init()
52 * ihevce_decomp_pre_intra_frame_init()
53 * ihevce_merge_sort()
54 * ihevce_decomp_pre_intra_curr_frame_pre_intra_deinit()
55 *
56 ******************************************************************************
57 */
58
59 /*****************************************************************************/
60 /* File Includes */
61 /*****************************************************************************/
62 /* System include files */
63 #include <stdio.h>
64 #include <string.h>
65 #include <stdlib.h>
66 #include <assert.h>
67 #include <stdarg.h>
68 #include <math.h>
69 #include <limits.h>
70
71 /* User include files */
72 #include "ihevc_typedefs.h"
73 #include "itt_video_api.h"
74 #include "ihevce_api.h"
75
76 #include "rc_cntrl_param.h"
77 #include "rc_frame_info_collector.h"
78 #include "rc_look_ahead_params.h"
79
80 #include "ihevc_defs.h"
81 #include "ihevc_debug.h"
82 #include "ihevc_structs.h"
83 #include "ihevc_platform_macros.h"
84 #include "ihevc_deblk.h"
85 #include "ihevc_itrans_recon.h"
86 #include "ihevc_chroma_itrans_recon.h"
87 #include "ihevc_chroma_intra_pred.h"
88 #include "ihevc_intra_pred.h"
89 #include "ihevc_inter_pred.h"
90 #include "ihevc_mem_fns.h"
91 #include "ihevc_padding.h"
92 #include "ihevc_weighted_pred.h"
93 #include "ihevc_sao.h"
94 #include "ihevc_resi_trans.h"
95 #include "ihevc_quant_iquant_ssd.h"
96 #include "ihevc_cabac_tables.h"
97
98 #include "ihevce_defs.h"
99 #include "ihevce_hle_interface.h"
100 #include "ihevce_lap_enc_structs.h"
101 #include "ihevce_multi_thrd_structs.h"
102 #include "ihevce_multi_thrd_funcs.h"
103 #include "ihevce_me_common_defs.h"
104 #include "ihevce_had_satd.h"
105 #include "ihevce_error_codes.h"
106 #include "ihevce_bitstream.h"
107 #include "ihevce_cabac.h"
108 #include "ihevce_rdoq_macros.h"
109 #include "ihevce_function_selector.h"
110 #include "ihevce_enc_structs.h"
111 #include "ihevce_entropy_structs.h"
112 #include "ihevce_cmn_utils_instr_set_router.h"
113 #include "ihevce_ipe_instr_set_router.h"
114 #include "ihevce_decomp_pre_intra_structs.h"
115 #include "ihevce_decomp_pre_intra_pass.h"
116 #include "ihevce_enc_loop_structs.h"
117 #include "hme_datatype.h"
118 #include "hme_interface.h"
119 #include "hme_common_defs.h"
120 #include "ihevce_global_tables.h"
121
122 /*****************************************************************************/
123 /* Typedefs */
124 /*****************************************************************************/
/* Function-pointer type for the per-CTB early-decision entry points
   (complete-CTB and incomplete-CTB variants share this signature). */
typedef void (*pf_ed_calc_ctb)(
    ihevce_ed_ctxt_t *ps_ed_ctxt,
    ihevce_ed_blk_t *ps_ed_ctb,
    ihevce_ed_ctb_l1_t *ps_ed_ctb_l1,
    UWORD8 *pu1_src,
    WORD32 src_stride,
    WORD32 num_4x4_blks_x,
    WORD32 num_4x4_blks_y,
    WORD32 *nbr_flags,
    WORD32 i4_layer_id,
    WORD32 row_block_no,
    WORD32 col_block_no,
    ihevce_ipe_optimised_function_list_t *ps_ipe_optimised_function_list,
    ihevce_cmn_opt_func_t *ps_cmn_utils_optimised_function_list);

/*****************************************************************************/
/* Constant Macros                                                           */
/*****************************************************************************/
/* Thresholds/scale factors used by the SATD-based activity and noise
   handling in this pass (consumed by functions later in this file). */
#define SATD_NOISE_FLOOR_THRESHOLD 16
#define MINIMUM_VARIANCE 15
#define SCALE_FACTOR_VARIANCE 20
#define SCALE_FACTOR_VARIANCE_8x8 60
#define MIN_SATD_THRSHLD 0
#define MAX_SATD_THRSHLD 64
#define SUB_NOISE_THRSHLD 0
#define MIN_BLKS 2

/*****************************************************************************/
/* Global variables                                                          */
/*****************************************************************************/

/**
*****************************************************************************
* @brief list of pointers to luma intra pred functions
*****************************************************************************
*/
pf_intra_pred g_apf_lum_ip[NUM_IP_FUNCS];
162
163 /*****************************************************************************/
164 /* Function Definitions */
165 /*****************************************************************************/
166
167 /*!
168 ******************************************************************************
169 * \if Function name : ihevce_intra_populate_mode_bits_cost \endif
170 *
171 * \brief: look-up table of cost of signalling an intra mode in the
172 * bitstream
173 *
174 *****************************************************************************
175 */
void ihevce_intra_populate_mode_bits_cost(
    WORD32 top_intra_mode,
    WORD32 left_intra_mode,
    WORD32 available_top,
    WORD32 available_left,
    WORD32 cu_pos_y,
    UWORD16 *mode_bits_cost,
    WORD32 lambda)
{
    /* Flat signalling model: every intra mode is charged the same fixed
       rate cost of 5.5 bits (11 in Q1), scaled by lambda. */
    UWORD16 u2_flat_mode_cost = COMPUTE_RATE_COST_CLIP30(11, lambda, (LAMBDA_Q_SHIFT + 1));
    WORD32 mode_idx;

    /* Neighbour/MPM-derived inputs are intentionally unused in this model. */
    (void)top_intra_mode;
    (void)left_intra_mode;
    (void)available_top;
    (void)available_left;
    (void)cu_pos_y;

    for(mode_idx = 0; mode_idx < NUM_MODES; mode_idx++)
        mode_bits_cost[mode_idx] = u2_flat_mode_cost;
}
199
200 /*!
201 ******************************************************************************
202 * \if Function name : ihevce_8x8_sad_computer \endif
203 *
204 * \brief: compute sad between 2 8x8 blocks
205 *
206 *****************************************************************************
207 */
208 UWORD16
ihevce_8x8_sad_computer(UWORD8 * pu1_src,UWORD8 * pu1_pred,WORD32 src_strd,WORD32 pred_strd)209 ihevce_8x8_sad_computer(UWORD8 *pu1_src, UWORD8 *pu1_pred, WORD32 src_strd, WORD32 pred_strd)
210 {
211 UWORD16 sad = 0;
212 WORD32 i, j;
213
214 for(i = 0; i < 8; i++)
215 {
216 for(j = 0; j < 8; j++)
217 {
218 sad += ABS(*pu1_src - *pu1_pred);
219 pu1_src++;
220 pu1_pred++;
221 }
222 pu1_src = pu1_src + (src_strd - 8);
223 pu1_pred = pu1_pred + (pred_strd - 8);
224 }
225
226 return sad;
227 }
228
229 /*!
230 ******************************************************************************
231 * \if Function name : ihevce_4x4_sad_computer \endif
232 *
233 * \brief: compute sad between 2 4x4 blocks
234 *
235 *****************************************************************************
236 */
237 UWORD16
ihevce_4x4_sad_computer(UWORD8 * pu1_src,UWORD8 * pu1_pred,WORD32 src_strd,WORD32 pred_strd)238 ihevce_4x4_sad_computer(UWORD8 *pu1_src, UWORD8 *pu1_pred, WORD32 src_strd, WORD32 pred_strd)
239 {
240 UWORD16 sad = 0;
241 WORD32 i, j;
242
243 for(i = 0; i < 4; i++)
244 {
245 for(j = 0; j < 4; j++)
246 {
247 sad += ABS(*pu1_src - *pu1_pred);
248 pu1_src++;
249 pu1_pred++;
250 }
251 pu1_src = pu1_src + (src_strd - 4);
252 pu1_pred = pu1_pred + (pred_strd - 4);
253 }
254
255 return sad;
256 }
257
258 /*!
259 ******************************************************************************
260 * \if Function name : ihevce_ed_4x4_find_best_modes \endif
261 *
262 * \brief: evaluate input 4x4 block for pre-selected list of angular and normal
263 * intra modes and return best sad, cost
264 *
265 *****************************************************************************
266 */
void ihevce_ed_4x4_find_best_modes(
    UWORD8 *pu1_src,
    WORD32 src_stride,
    UWORD8 *ref,
    UWORD16 *mode_bits_cost,
    UWORD8 *pu1_best_modes,
    WORD32 *pu1_best_sad_costs,
    WORD32 u1_low_resol,
    FT_SAD_COMPUTER *pf_4x4_sad_computer)
{
    UWORD8 au1_pred[16];
    UWORD8 u1_best_ang_mode = 0;
    UWORD8 u1_best_nonang_mode = 0;
    WORD32 i4_best_ang_cost = 0xFFFFF;
    WORD32 i4_best_nonang_cost = 0xFFFFF;

    /* For the lower layers (L1/L2) all 11 candidates are tried; for L0
       the two non-angular candidates (DC, planar) at the head of
       gau1_modes_to_eval are skipped. */
    WORD32 idx = (1 == u1_low_resol) ? 0 : 2;

    /* Scan the coarse (level-4) candidate list, tracking the cheapest
       angular and non-angular modes separately. */
    for(; idx < 11; idx++)
    {
        UWORD8 u1_mode = gau1_modes_to_eval[idx];
        WORD32 i4_cost;

        g_apf_lum_ip[g_i4_ip_funcs[u1_mode]](&ref[0], 0, &au1_pred[0], 4, 4, u1_mode);
        i4_cost = pf_4x4_sad_computer(pu1_src, &au1_pred[0], src_stride, 4);
        i4_cost += mode_bits_cost[u1_mode];

        if(u1_mode < 2)
        {
            /* non-angular candidate (DC / planar) */
            if(i4_cost < i4_best_nonang_cost)
            {
                u1_best_nonang_mode = u1_mode;
                i4_best_nonang_cost = i4_cost;
            }
        }
        else if(i4_cost < i4_best_ang_cost)
        {
            u1_best_ang_mode = u1_mode;
            i4_best_ang_cost = i4_cost;
        }
    }

    /* Slot 0 always carries the best angular result. */
    pu1_best_modes[0] = u1_best_ang_mode;
    pu1_best_sad_costs[0] = i4_best_ang_cost;

    /* Slot 1 (best non-angular result) is filled only for L1/L2. */
    if(1 == u1_low_resol)
    {
        pu1_best_modes[1] = u1_best_nonang_mode;
        pu1_best_sad_costs[1] = i4_best_nonang_cost;
    }
}
328
329 /*!
330 ******************************************************************************
331 * \if Function name : ihevce_ed_calc_4x4_blk \endif
332 *
333 * \brief: evaluate input 4x4 block for all intra modes and return best sad &
334 * cost
335 *
336 *****************************************************************************
337 */
static void ihevce_ed_calc_4x4_blk(
    ihevce_ed_blk_t *ps_ed,
    UWORD8 *pu1_src,
    WORD32 src_stride,
    UWORD8 *ref,
    UWORD16 *mode_bits_cost,
    WORD32 *sad_ptr,
    WORD32 *pi4_best_satd,
    WORD32 i4_quality_preset,
    WORD32 *pi4_best_sad_cost,
    ihevce_ipe_optimised_function_list_t *ps_ipe_optimised_function_list)
{
    WORD32 i, i_end;
    UWORD8 mode, best_amode, best_nmode;
    UWORD8 pred[16];

    UWORD16 sad;
    WORD32 sad_cost = 0;
    WORD32 best_asad_cost = 0xFFFFF;
    WORD32 best_nsad_cost = 0xFFFFF;

    UWORD8 au1_best_modes[2];
    WORD32 ai4_best_sad_costs[2];

    /* L1/L2 resolution hence low resolution enable */
    WORD32 u1_low_resol = 1;

    UWORD8 modes_to_eval[2];

    /* The *pi4_best_satd will be consumed only if current
    layer has odd number of 4x4 blocks in either x or y
    direction. But the function hme_derive_num_layers() makes
    sure that every layer has width and height such that each one
    is a multiple of 16. Which makes pi4_best_satd useless. Hence
    feel free to remove pi4_best_satd. Concluded on 29th Aug13 */
    *pi4_best_satd = -1;

    /* Stage 1: coarse search over the level-4 candidate list; returns the
       best angular ([0]) and best non-angular ([1]) modes and costs. */
    ps_ipe_optimised_function_list->pf_ed_4x4_find_best_modes(
        pu1_src,
        src_stride,
        ref,
        mode_bits_cost,
        au1_best_modes,
        ai4_best_sad_costs,
        u1_low_resol,
        ps_ipe_optimised_function_list->pf_4x4_sad_computer);

    best_nmode = au1_best_modes[1];
    best_amode = au1_best_modes[0];
    best_nsad_cost = ai4_best_sad_costs[1];
    best_asad_cost = ai4_best_sad_costs[0];

    /* Updation of pi4_best_satd here needed iff the mode given by
    ihevce_ed_4x4_find_best_modes() comes out to be
    the best mode at the end of the function */
    *pi4_best_satd = best_asad_cost - mode_bits_cost[best_amode];

    /* Stage 2: refine at +/-2 around the best level-4 angular mode.
       NOTE(review): only best_amode == 2 / == 34 are clamped, so for
       best_amode == 3 the candidate (best_amode - 2) == 1 is the DC index
       evaluated through the angular path — presumably intentional;
       confirm against the mode table. */
    /* Around best level 4 angular mode, search for best level 2 mode */
    modes_to_eval[0] = best_amode - 2;
    modes_to_eval[1] = best_amode + 2;
    i = 0;
    i_end = 2;
    if(best_amode == 2)
        i = 1;
    else if(best_amode == 34)
        i_end = 1;
    for(; i < i_end; i++)
    {
        mode = modes_to_eval[i];
        g_apf_lum_ip[g_i4_ip_funcs[mode]](&ref[0], 0, &pred[0], 4, 4, mode);
        sad = ps_ipe_optimised_function_list->pf_4x4_sad_computer(pu1_src, &pred[0], src_stride, 4);
        sad_cost = sad;
        sad_cost += mode_bits_cost[mode];
        if(sad_cost < best_asad_cost)
        {
            best_amode = mode;
            best_asad_cost = sad_cost;
            *pi4_best_satd = sad;
        }
        /* per-mode SAD is recorded for the caller regardless of winner */
        sad_ptr[mode] = sad;
    }

    /*To be done : Add a flag here instead of preset condn*/
    if((i4_quality_preset < IHEVCE_QUALITY_P4))
    {
        /* Stage 3 (better-quality presets only): refine at +/-1 around the
           stage-2 winner. */
        /* Around best level 2 angular mode, search for best level 1 mode */
        modes_to_eval[0] = best_amode - 1;
        modes_to_eval[1] = best_amode + 1;
        i = 0;
        i_end = 2;
        if(best_amode == 2)
            i = 1;
        else if(best_amode == 34)
            i_end = 1;
        for(; i < i_end; i++)
        {
            mode = modes_to_eval[i];
            g_apf_lum_ip[g_i4_ip_funcs[mode]](&ref[0], 0, &pred[0], 4, 4, mode);
            sad = ps_ipe_optimised_function_list->pf_4x4_sad_computer(
                pu1_src, &pred[0], src_stride, 4);
            sad_cost = sad;
            sad_cost += mode_bits_cost[mode];
            if(sad_cost < best_asad_cost)
            {
                best_amode = mode;
                best_asad_cost = sad_cost;
                *pi4_best_satd = sad;
            }
            sad_ptr[mode] = sad;
        }
    }

    /* Final decision: angular vs non-angular winner by total cost. */
    if(best_asad_cost < best_nsad_cost)
    {
        ps_ed->best_mode = best_amode;
        *pi4_best_sad_cost = best_asad_cost;
    }
    else
    {
        ps_ed->best_mode = best_nmode;
        *pi4_best_sad_cost = best_nsad_cost;
    }
    /* Early-decision defaults: block marked intra, no 8x8 merge yet. */
    ps_ed->intra_or_inter = 0;
    ps_ed->merge_success = 0;
}
462
463 /*!
464 ******************************************************************************
465 * \if Function name : ihevce_ed_calc_8x8_blk \endif
466 *
467 * \brief: evaluate input 8x8 block for intra modes basing on the intra mode
468 * decisions made at 4x4 level. This function also makes a decision whether
469 * to split blk in to 4x4 partitions or not.
470 *
471 *****************************************************************************
472 */
static void ihevce_ed_calc_8x8_blk(
    ihevce_ed_ctxt_t *ps_ed_ctxt,
    ihevce_ed_blk_t *ps_ed_8x8,
    UWORD8 *pu1_src,
    WORD32 src_stride,
    WORD32 *nbr_flags_ptr,
    WORD32 *top_intra_mode_ptr,
    WORD32 *left_intra_mode_ptr,
    WORD32 cu_pos_y,
    WORD32 lambda,
    WORD32 *sad_ptr_8x8,
    WORD32 *pi4_best_satd,
    WORD32 i4_layer_id,
    WORD32 i4_quality_preset,
    WORD32 i4_slice_type,
    WORD32 *pi4_best_sad_cost_8x8_l1_ipe,
    WORD32 *pi4_best_sad_8x8_l1_ipe,
    WORD32 *pi4_sum_4x4_satd,
    WORD32 *pi4_min_4x4_satd,
    ihevce_ipe_optimised_function_list_t *ps_ipe_optimised_function_list,
    ihevce_cmn_opt_func_t *ps_cmn_utils_optimised_function_list)
{
    /* Phases: (1) build reference samples for the whole 8x8 block,
       (2) run the 4x4 mode decision on each quadrant while threading the
       chosen modes through top/left neighbour arrays, (3) try to "merge"
       the four 4x4 decisions into a single 8x8 mode and, for layer 1,
       record SAD/cost data consumed by the early inter-intra decision. */
    WORD32 i, j;
    WORD32 nbr_flags, nbr_flags_TR;
    UWORD8 *pu1_src_4x4;
    WORD32 top_available;
    WORD32 left_available;
    ihevce_ed_blk_t *ps_ed_4x4 = ps_ed_8x8;
    WORD32 top_intra_mode;
    WORD32 left_intra_mode;
    WORD32 next_left_intra_mode;
    WORD32 *sad_ptr = sad_ptr_8x8;
    UWORD8 *pu1_src_arr[4];
    WORD32 i4_4x4_best_sad_cost[4];
    func_selector_t *ps_func_selector = ps_ed_ctxt->ps_func_selector;
    ihevc_intra_pred_luma_ref_substitution_ft *pf_intra_pred_luma_ref_substitution =
        ps_func_selector->ihevc_intra_pred_luma_ref_substitution_fptr;

    (void)i4_slice_type;

    /* Compute ref samples for 8x8 merge block */
    nbr_flags = nbr_flags_ptr[0];
    nbr_flags_TR = nbr_flags_ptr[1];

    /* top-right availability comes from the neighbouring 4x4's flags */
    if(CHECK_TR_AVAILABLE(nbr_flags_TR))
    {
        SET_TR_AVAILABLE(nbr_flags);
    }
    else
    {
        SET_TR_UNAVAILABLE(nbr_flags);
    }

    if(CHECK_BL_AVAILABLE(nbr_flags))
    {
        SET_BL_AVAILABLE(nbr_flags);
    }
    else
    {
        SET_BL_UNAVAILABLE(nbr_flags);
    }

    /* call the function which populates ref data for intra predicion */
    pf_intra_pred_luma_ref_substitution(
        pu1_src - src_stride - 1,
        pu1_src - src_stride,
        pu1_src - 1,
        src_stride,
        8,
        nbr_flags,
        &ps_ed_ctxt->au1_ref_8x8[0][0],
        0);

    /* Phase 2: 4x4 mode decision on the four quadrants in raster order;
       i indexes the 4x4 row, j the 4x4 column. */
    for(i = 0; i < 2; i++)
    {
        pu1_src_4x4 = pu1_src + i * 4 * src_stride;
        cu_pos_y += i * 4;
        next_left_intra_mode = left_intra_mode_ptr[i];
        for(j = 0; j < 2; j++)
        {
            WORD32 i4_best_satd;
            pu1_src_arr[i * 2 + j] = pu1_src_4x4;
            /* nbr flags are laid out with a stride of 8 per 4x4 row */
            nbr_flags = nbr_flags_ptr[i * 8 + j];
            top_intra_mode = top_intra_mode_ptr[j];
            left_intra_mode = next_left_intra_mode;
            /* call the function which populates ref data for intra predicion */
            pf_intra_pred_luma_ref_substitution(
                pu1_src_4x4 - src_stride - 1,
                pu1_src_4x4 - src_stride,
                pu1_src_4x4 - 1,
                src_stride,
                4,
                nbr_flags,
                &ps_ed_ctxt->au1_ref_full_ctb[i * 2 + j][0],
                0);

            top_available = CHECK_T_AVAILABLE(nbr_flags);
            left_available = CHECK_L_AVAILABLE(nbr_flags);
            /* call the function which populates sad cost for all the modes */
            ihevce_intra_populate_mode_bits_cost(
                top_intra_mode,
                left_intra_mode,
                top_available,
                left_available,
                cu_pos_y,
                &ps_ed_ctxt->au2_mode_bits_cost_full_ctb[i * 2 + j][0],
                lambda);
            ihevce_ed_calc_4x4_blk(
                ps_ed_4x4,
                pu1_src_4x4,
                src_stride,
                &ps_ed_ctxt->au1_ref_full_ctb[i * 2 + j][0],
                &ps_ed_ctxt->au2_mode_bits_cost_full_ctb[i * 2 + j][0],
                sad_ptr,
                &i4_best_satd,
                i4_quality_preset,
                &i4_4x4_best_sad_cost[i * 2 + j],
                ps_ipe_optimised_function_list);

            /* propagate the winning mode to this block's right and bottom
               neighbours via the top/left mode arrays */
            top_intra_mode_ptr[j] = ps_ed_4x4->best_mode;
            next_left_intra_mode = ps_ed_4x4->best_mode;
            pu1_src_4x4 += 4;
            ps_ed_4x4 += 1;
            sad_ptr += NUM_MODES;
        }
        left_intra_mode_ptr[i] = next_left_intra_mode;
    }

    /* Phase 3: 8x8 merge trial */
    /* 8x8 merge */
    {
        UWORD8 modes_to_eval[6];
        WORD32 sad;
        UWORD8 pred[16];
        UWORD8 pred_8x8[64] = { 0 };
        WORD32 merge_success;
        UWORD8 mode;

        ps_ed_4x4 = ps_ed_8x8;
        mode = (ps_ed_4x4)->best_mode;

        *pi4_best_satd = -1;

        /* merge immediately succeeds iff all four 4x4 winners agree */
        merge_success =
            ((((ps_ed_4x4)->best_mode == (ps_ed_4x4 + 1)->best_mode) +
              ((ps_ed_4x4)->best_mode == (ps_ed_4x4 + 2)->best_mode) +
              ((ps_ed_4x4)->best_mode == (ps_ed_4x4 + 3)->best_mode)) == 3);

        {
            WORD32 i4_satd;
            //UWORD16 au2_4x4_sad_cost_array[4];/*SAD of 4x4 blocks*/
            UWORD16 u2_sum_best_4x4_sad_cost; /*Sum of 4x4 sad costs*/
            UWORD16 u2_sum_best_4x4_satd_cost; /*Sum of 4x4 satd costs*/
            UWORD8 u1_best_8x8_mode; /*8x8 mode.*/
            UWORD16 u2_best_8x8_cost; /*8x8 Cost. Can store SATD/SAD cost*/
            WORD32 i4_best_8x8_sad_satd; /* SATD/SAD value of 8x8 block*/
            UWORD16 au2_8x8_costs[6] = { 0 }; /*Cost of 8x8 block for 6 modes*/
            UWORD8 u1_cond_4x4_satd; /*condition if 4x4 SATD needs to be done*/
            UWORD8 u1_cond_8x8_satd; /*condition if 8x8 SATD needs to be done*/
            UWORD8 u1_good_quality;
            WORD32 i4_merge_success_stage2;

            /*Initiallization*/
            *pi4_best_satd = 0;
            u2_best_8x8_cost = (UWORD16)(-1) /*max value*/;
            u2_sum_best_4x4_sad_cost = 0;
            *pi4_sum_4x4_satd = -1;
            *pi4_min_4x4_satd = 0x7FFFFFFF;
            i4_best_8x8_sad_satd = 0;
            u2_sum_best_4x4_satd_cost = 0;
            u1_best_8x8_mode = ps_ed_4x4->best_mode;

            /*We thought of "replacing" SATDs by SADs for 4x4 vs 8x8 decision
            for speed improvement, but it gave opposite results. Setting
            good_quality to 1 in order to throw away the idea of "replacing".*/
            u1_good_quality = 1;
            //u1_good_quality = ((i4_quality_preset != IHEVCE_QUALITY_P5)
            //    && (i4_quality_preset != IHEVCE_QUALITY_P4));

            /*Needed to disable some processing based on speed preset*/
            i4_merge_success_stage2 = 0;

            /*Store SAD cost of 4x4 blocks */
            for(i = 0; i < 4; i++)
            {
                //au2_4x4_sad_cost_array[i] = (ps_ed_4x4 + i)->best_sad_cost;
                u2_sum_best_4x4_sad_cost +=
                    i4_4x4_best_sad_cost[i]; //(ps_ed_4x4 + i)->best_sad_cost;
                modes_to_eval[i] = (ps_ed_4x4 + i)->best_mode;
                /*NOTE_01: i4_4x4_satd is not used anywhere at present.
                Setting it to zero to avoid ASSERT failure */
                /*Now taken care of incomplete CTB*/
                //(ps_ed_4x4 + i)->i4_4x4_satd = 0;
            }

            /*Calculate SATD/SAd for 4x4 blocks*/
            /*For (layer_2 && high_speed): No need to get 4x4 SATDs bcoz
            it won't have any impact on quality but speed will improve.*/
            u1_cond_4x4_satd = ((1 == i4_layer_id) || (u1_good_quality && (!merge_success)));

            if(u1_cond_4x4_satd)
            {
                *pi4_sum_4x4_satd = 0;
                /*FYI: 1. Level 2 doesn't need the SATD.
                       2. The 4x4 vs. 8x8 decision for high_speed will
                          happen based on SAD. */
                /*Get SATD for 4x4 blocks */
                for(i = 0; i < 4; i++)
                {
                    mode = modes_to_eval[i];
                    g_apf_lum_ip[g_i4_ip_funcs[mode]](
                        &ps_ed_ctxt->au1_ref_full_ctb[i][0], 0, &pred[0], 4, 4, mode);

                    i4_satd = ps_cmn_utils_optimised_function_list->pf_HAD_4x4_8bit(
                        pu1_src_arr[i], src_stride, &pred[0], 4, NULL, 0);

                    {
                        /*Save 4x4x satd in ed blk struct */
                        (ps_ed_4x4 + i)->i4_4x4_satd = i4_satd;
                    }

                    /*(ps_ed_4x4 + i)->i4_4x4_satd = i4_satd; // See NOTE_01*/
                    u2_sum_best_4x4_satd_cost +=
                        ((UWORD16)i4_satd + ps_ed_ctxt->au2_mode_bits_cost_full_ctb[i][mode]);
                    *pi4_best_satd += i4_satd;
                }
            }
            /* Not being used in current code */
            else /* (Level_2 && extreme_speed) */
            {
                /******DONT ENTER HERE AT aNY COST***************************/
                /* Transistor killers lie ahead!!!!!!! */
                /*This else part is not getting executed as of now*/
                if(2 != i4_layer_id)
                    ASSERT(0);
                /*Update values by SAD_cost_array */
                for(i = 0; i < 4; i++)
                {
                    mode = modes_to_eval[i];
                    //u2_sum_best_4x4_satd_cost += au2_4x4_sad_cost_array[i];
                    //sad = (WORD32)((ps_ed_4x4 + i)->best_sad_cost - ps_ed_ctxt->au2_mode_bits_cost_full_ctb[i][mode]);
                    sad = (WORD32)(
                        i4_4x4_best_sad_cost[i] - ps_ed_ctxt->au2_mode_bits_cost_full_ctb[i][mode]);
                    *pi4_sum_4x4_satd += sad;
                    /*(ps_ed_4x4 + i)->i4_4x4_satd = sad;// See NOTE_01*/
                    *pi4_best_satd += sad;

                    if(*pi4_min_4x4_satd > sad)
                        *pi4_min_4x4_satd = sad;
                }
            }
            if(!merge_success) /*If the modes are not identical*/
            {
                UWORD8 i1_start; /* no of modes to evaluate */
                UWORD8 ai1_modes[6];

                /* Prepare 6 candidates for 8x8 block. Two are DC and planar */
                ai1_modes[4] = 0;
                ai1_modes[5] = 1;
                i1_start = 4;

                /*Assign along with removing duplicates rest 4 candidates. */
                for(i = 3; i >= 0; i--)
                {
                    WORD8 i1_fresh_mode_flag = 1;
                    mode = modes_to_eval[i];
                    /*Check if duplicate already exists in ai1_modes*/
                    for(j = i1_start; j < 6; j++)
                    {
                        if(mode == ai1_modes[j])
                            i1_fresh_mode_flag = 0;
                    }
                    if(i1_fresh_mode_flag)
                    {
                        i1_start--;
                        ai1_modes[i1_start] = mode;
                    }
                }

                /*Calculate SATD/SAD of 8x8 block for all modes*/
                /*If (u1_good_quality == 0) then SATD gets replaced by SAD*/
                if(u1_good_quality && (i4_quality_preset <= IHEVCE_QUALITY_P4))
                {
                    //7.5 * lambda to incorporate transfrom flags
                    u2_sum_best_4x4_satd_cost +=
                        (COMPUTE_RATE_COST_CLIP30(12, lambda, (LAMBDA_Q_SHIFT + 1)));

                    /*Loop over all modes for calculating SATD*/
                    for(i = i1_start; i < 6; i++)
                    {
                        mode = ai1_modes[i];
                        g_apf_lum_ip[g_i4_ip_funcs[mode]](
                            &ps_ed_ctxt->au1_ref_8x8[0][0], 0, &pred_8x8[0], 8, 8, mode);

                        i4_satd = ps_cmn_utils_optimised_function_list->pf_HAD_8x8_8bit(
                            pu1_src_arr[0], src_stride, &pred_8x8[0], 8, NULL, 0);

                        au2_8x8_costs[i] =
                            ((UWORD16)i4_satd + ps_ed_ctxt->au2_mode_bits_cost_full_ctb[0][mode]);

                        /*Update data correspoinding to least 8x8 cost */
                        if(au2_8x8_costs[i] <= u2_best_8x8_cost)
                        {
                            u2_best_8x8_cost = au2_8x8_costs[i];
                            i4_best_8x8_sad_satd = i4_satd;
                            u1_best_8x8_mode = mode;
                        }
                    }
                    /*8x8 vs 4x4 decision based on SATD values*/
                    if((u2_best_8x8_cost <= u2_sum_best_4x4_satd_cost) || (u2_best_8x8_cost <= 300))
                    {
                        i4_merge_success_stage2 = 1;
                    }

                    /* EIID: Early inter-intra decision */
                    /* Find the SAD based cost for 8x8 block for best mode */
                    if(/*(ISLICE != i4_slice_type) && */ (1 == i4_layer_id))
                    {
                        UWORD8 i4_best_8x8_mode = u1_best_8x8_mode;
                        WORD32 i4_best_8x8_sad_curr;

                        g_apf_lum_ip[g_i4_ip_funcs[i4_best_8x8_mode]](
                            &ps_ed_ctxt->au1_ref_8x8[0][0], 0, &pred_8x8[0], 8, 8, i4_best_8x8_mode);

                        i4_best_8x8_sad_curr = ps_ipe_optimised_function_list->pf_8x8_sad_computer(
                            pu1_src_arr[0], &pred_8x8[0], src_stride, 8);

                        //register best sad in the context
                        //ps_ed_8x8->i4_best_sad_8x8_l1_ipe = i4_best_8x8_sad_curr;

                        //register the best cost in the context
                        //[0]th index is used since all 4 blocks are having same cost right now
                        //also it doesnt depends on mode. It only depends on the lambda

                        *pi4_best_sad_cost_8x8_l1_ipe =
                            i4_best_8x8_sad_curr +
                            ps_ed_ctxt->au2_mode_bits_cost_full_ctb[0][i4_best_8x8_mode];
                        *pi4_best_sad_8x8_l1_ipe = i4_best_8x8_sad_curr;
                    }
                }
                else /*If high_speed or extreme speed*/
                {
                    //7.5 * lambda to incorporate transfrom flags
                    u2_sum_best_4x4_sad_cost +=
                        (COMPUTE_RATE_COST_CLIP30(12, lambda, (LAMBDA_Q_SHIFT + 1)));

                    /*Loop over all modes for calculating SAD*/
                    for(i = i1_start; i < 6; i++)
                    {
                        mode = ai1_modes[i];
                        g_apf_lum_ip[g_i4_ip_funcs[mode]](
                            &ps_ed_ctxt->au1_ref_8x8[0][0], 0, &pred_8x8[0], 8, 8, mode);

                        sad = ps_ipe_optimised_function_list->pf_8x8_sad_computer(
                            pu1_src_arr[0], &pred_8x8[0], src_stride, 8);

                        /* NOTE(review): `+=` vs the `=` used in the SATD
                           branch above — au2_8x8_costs[] is zero-initialised
                           and written once per index, so this acts as plain
                           assignment; confirm before relying on it. */
                        au2_8x8_costs[i] +=
                            ((UWORD16)sad + ps_ed_ctxt->au2_mode_bits_cost_full_ctb[0][mode]);

                        /*Find the data correspoinding to least cost */
                        if(au2_8x8_costs[i] <= u2_best_8x8_cost)
                        {
                            u2_best_8x8_cost = au2_8x8_costs[i];
                            i4_best_8x8_sad_satd = sad;
                            u1_best_8x8_mode = mode;
                        }
                    }
                    /*8x8 vs 4x4 decision based on SAD values*/
                    if((u2_best_8x8_cost <= u2_sum_best_4x4_sad_cost) || (u2_best_8x8_cost <= 300))
                    {
                        i4_merge_success_stage2 = 1;
                    }

                    /* EIID: Early inter-intra decision */
                    /* Find the SAD based cost for 8x8 block for best mode */
                    if(/*(ISLICE != i4_slice_type) && */ (1 == i4_layer_id))
                    {
                        //UWORD8 i4_best_8x8_mode = u1_best_8x8_mode;
                        WORD32 i4_best_8x8_sad_cost_curr = u2_best_8x8_cost;

                        //register best sad in the context
                        //ps_ed_8x8->i4_best_sad_8x8_l1_ipe = i4_best_8x8_sad_curr;

                        //register the best cost in the context
                        *pi4_best_sad_cost_8x8_l1_ipe = i4_best_8x8_sad_cost_curr;
                        *pi4_best_sad_8x8_l1_ipe =
                            i4_best_8x8_sad_satd; //i4_best_8x8_sad_cost_curr;
                    }
                }
            }

            /***** Modes for 4x4 and 8x8 are decided before this point ****/
            if(merge_success || i4_merge_success_stage2)
            {
                /*FYI: 1. 8x8 SATD is not needed if merge is failed.
                       2. For layer_2: SATD won't be calculated for 8x8. So
                          the best_8x8_cost is SAD-cost. */

                /* Store the 8x8 level data in the first 4x4 block*/
                ps_ed_4x4->merge_success = 1;
                ps_ed_4x4->best_merge_mode = u1_best_8x8_mode;
                /* ps_ed_4x4->best_merge_sad_cost = u2_best_8x8_cost;
                This data is not getting consumed anywhere at present */

                /* merged mode becomes the neighbour prediction for all four
                   positions of this 8x8 */
                top_intra_mode_ptr[0] = u1_best_8x8_mode;
                top_intra_mode_ptr[1] = u1_best_8x8_mode;
                left_intra_mode_ptr[0] = u1_best_8x8_mode;
                left_intra_mode_ptr[1] = u1_best_8x8_mode;

                /*If it is layer_1 and high_speed*/
                u1_cond_8x8_satd =
                    ((1 == i4_layer_id) &&
                     (merge_success || ((!u1_good_quality) && i4_merge_success_stage2)));
                if(u1_cond_8x8_satd)
                {
                    mode = u1_best_8x8_mode;
                    g_apf_lum_ip[g_i4_ip_funcs[mode]](
                        &ps_ed_ctxt->au1_ref_8x8[0][0], 0, &pred_8x8[0], 8, 8, mode);

                    /* faster presets settle for SAD here instead of SATD */
                    if(i4_quality_preset > IHEVCE_QUALITY_P3)
                    {
                        i4_satd = ps_ipe_optimised_function_list->pf_8x8_sad_computer(
                            pu1_src_arr[0], &pred_8x8[0], src_stride, 8);
                    }
                    else
                    {
                        i4_satd = ps_cmn_utils_optimised_function_list->pf_HAD_8x8_8bit(
                            pu1_src_arr[0], src_stride, &pred_8x8[0], 8, NULL, 0);
                    }
                    /* u2_best_8x8_cost = ((UWORD16)i4_satd + mode_bits_cost[0][mode]);
                    This data is not getting consumed at present */
                    i4_best_8x8_sad_satd = i4_satd;
                }
                *pi4_best_satd = i4_best_8x8_sad_satd;

                /* EIID: Early inter-intra decision */
                /* Find the SAD based cost for 8x8 block for best mode */
                if(/*(ISLICE != i4_slice_type) && */ (1 == i4_layer_id))
                {
                    UWORD8 i4_best_8x8_mode = u1_best_8x8_mode;
                    WORD32 i4_best_8x8_sad_curr;

                    g_apf_lum_ip[g_i4_ip_funcs[i4_best_8x8_mode]](
                        &ps_ed_ctxt->au1_ref_8x8[0][0], 0, &pred_8x8[0], 8, 8, i4_best_8x8_mode);

                    i4_best_8x8_sad_curr = ps_ipe_optimised_function_list->pf_8x8_sad_computer(
                        pu1_src_arr[0], &pred_8x8[0], src_stride, 8);
                    //register best sad in the context
                    //ps_ed_8x8->i4_best_sad_8x8_l1_ipe = i4_best_8x8_sad_curr;

                    //register the best cost in the context
                    //[0]th index is used since all 4 blocks are having same cost right now
                    //also it doesnt depends on mode. It only depends on the lambda

                    *pi4_best_sad_cost_8x8_l1_ipe =
                        i4_best_8x8_sad_curr +
                        ps_ed_ctxt->au2_mode_bits_cost_full_ctb[0][i4_best_8x8_mode];
                    *pi4_best_sad_8x8_l1_ipe = i4_best_8x8_sad_curr;

                } // EIID ends

            } //if(merge_success || i4_merge_success_stage2)
        }
    }
}
937
938 /*!
939 ******************************************************************************
940 * \if Function name : ihevce_ed_calc_incomplete_ctb \endif
941 *
942 * \brief: performs L1 8x8 and 4x4 intra mode analysis
943 *
944 *****************************************************************************
945 */
ihevce_ed_calc_incomplete_ctb(ihevce_ed_ctxt_t * ps_ed_ctxt,ihevce_ed_blk_t * ps_ed_ctb,ihevce_ed_ctb_l1_t * ps_ed_ctb_l1,UWORD8 * pu1_src,WORD32 src_stride,WORD32 num_4x4_blks_x,WORD32 num_4x4_blks_y,WORD32 * nbr_flags,WORD32 i4_layer_id,WORD32 i4_row_block_no,WORD32 i4_col_block_no,ihevce_ipe_optimised_function_list_t * ps_ipe_optimised_function_list,ihevce_cmn_opt_func_t * ps_cmn_utils_optimised_function_list)946 void ihevce_ed_calc_incomplete_ctb(
947 ihevce_ed_ctxt_t *ps_ed_ctxt,
948 ihevce_ed_blk_t *ps_ed_ctb,
949 ihevce_ed_ctb_l1_t *ps_ed_ctb_l1,
950 UWORD8 *pu1_src,
951 WORD32 src_stride,
952 WORD32 num_4x4_blks_x,
953 WORD32 num_4x4_blks_y,
954 WORD32 *nbr_flags,
955 WORD32 i4_layer_id,
956 WORD32 i4_row_block_no,
957 WORD32 i4_col_block_no,
958 ihevce_ipe_optimised_function_list_t *ps_ipe_optimised_function_list,
959 ihevce_cmn_opt_func_t *ps_cmn_utils_optimised_function_list)
960 {
961 WORD32 i, j, k;
962 WORD32 z_scan_idx = 0;
963 WORD32 z_scan_act_idx = 0;
964 ihevc_intra_pred_luma_ref_substitution_ft *pf_intra_pred_luma_ref_substitution =
965 ps_ed_ctxt->ps_func_selector->ihevc_intra_pred_luma_ref_substitution_fptr;
966
967 //UWORD8 ref[18];
968 //WORD32 top_intra_modes[20];
969 WORD32 *sad_ptr = &ps_ed_ctxt->sad[0];
970 WORD32 lambda = ps_ed_ctxt->lambda;
971 //UWORD16 mode_bits_cost[NUM_MODES];
972
973 UWORD8 *pu1_src_8x8;
974 ihevce_ed_blk_t *ps_ed_8x8, *ps_ed_4x4;
975 WORD32 *top_intra_mode_ptr;
976 WORD32 *left_intra_mode_ptr = ps_ed_ctxt->left_ctb_intra_modes;
977 WORD32 *nbr_flags_ptr;
978 WORD32 top_intra_mode;
979 WORD32 left_intra_mode;
980 WORD32 next_left_intra_mode;
981 WORD32 nbr_flag = 0;
982 WORD32 top_available;
983 WORD32 left_available;
984 UWORD8 *pu1_src_4x4;
985 WORD32 left_over_4x4_blks;
986 WORD32 i4_incomplete_sum_4x4_satd = 0;
987 WORD32 i4_incomplete_min_4x4_satd = 0x7FFFFFFF;
988 WORD32 i4_best_sad_cost_8x8_l1_ipe, i4_best_sad_8x8_l1_ipe, i4_sum_4x4_satd, i4_min_4x4_satd;
989
990 (void)i4_row_block_no;
991 (void)i4_col_block_no;
992 /*Find the modulated qp of 16*16 at L2 from 8*8 SATDs in L2
993 THis is used as 64*64 Qp in L0*/
994 /*For Incomplete CTB, init all SATD to -1 and then popualate for the complete 8x8 blocks (CU 16 in L0)*/
995 /* Not populated for 4x4 blocks (CU 8 in L0), can be done */
996 /*Also, not 32x32 satd is not populated, as it would correspong to CU 64 and it is not an incomplete CTB */
997 if(i4_layer_id == 1)
998 {
999 WORD32 i4_i;
1000
1001 for(i4_i = 0; i4_i < 64; i4_i++)
1002 {
1003 (ps_ed_ctb + i4_i)->i4_4x4_satd = -1;
1004 (ps_ed_ctb + i4_i)->i4_4x4_cur_satd = -1;
1005 }
1006
1007 for(i4_i = 0; i4_i < 16; i4_i++)
1008 {
1009 ps_ed_ctb_l1->i4_sum_4x4_satd[i4_i] = -2;
1010 ps_ed_ctb_l1->i4_min_4x4_satd[i4_i] = 0x7FFFFFFF;
1011 ps_ed_ctb_l1->i4_8x8_satd[i4_i][0] = -2;
1012 ps_ed_ctb_l1->i4_8x8_satd[i4_i][1] = -2;
1013 }
1014
1015 for(i4_i = 0; i4_i < 4; i4_i++)
1016 {
1017 ps_ed_ctb_l1->i4_16x16_satd[i4_i][0] = -2;
1018 ps_ed_ctb_l1->i4_16x16_satd[i4_i][1] = -2;
1019 ps_ed_ctb_l1->i4_16x16_satd[i4_i][2] = -2;
1020 }
1021 ps_ed_ctb_l1->i4_32x32_satd[0][0] = -2;
1022 ps_ed_ctb_l1->i4_32x32_satd[0][1] = -2;
1023 ps_ed_ctb_l1->i4_32x32_satd[0][2] = -2;
1024
1025 ps_ed_ctb_l1->i4_32x32_satd[0][3] = -2;
1026
1027 for(i4_i = 0; i4_i < 16; i4_i++)
1028 {
1029 ps_ed_ctb_l1->i4_best_satd_8x8[i4_i] = -1;
1030 ps_ed_ctb_l1->i4_best_sad_cost_8x8_l1_ipe[i4_i] = -1;
1031 ps_ed_ctb_l1->i4_best_sad_8x8_l1_ipe[i4_i] = -1;
1032 ps_ed_ctb_l1->i4_best_sad_cost_8x8_l1_me[i4_i] = -1;
1033 ps_ed_ctb_l1->i4_sad_cost_me_for_ref[i4_i] = -1;
1034 ps_ed_ctb_l1->i4_sad_me_for_ref[i4_i] = -1;
1035 ps_ed_ctb_l1->i4_best_sad_8x8_l1_me[i4_i] = -1;
1036
1037 ps_ed_ctb_l1->i4_best_sad_8x8_l1_me_for_decide[i4_i] = -1;
1038 }
1039 }
1040 /*
1041 * src scan happens in raster scan order. ps_ed update happens in z-scan order.
1042 */
1043 for(i = 0; i < num_4x4_blks_x; i++)
1044 {
1045 ps_ed_ctxt->ai4_top_intra_modes_ic_ctb[i] = INTRA_DC;
1046 }
1047 next_left_intra_mode = left_intra_mode_ptr[0];
1048 for(i = 0; i < num_4x4_blks_y / 2; i++)
1049 {
1050 pu1_src_8x8 = pu1_src + i * 2 * 4 * src_stride;
1051 top_intra_mode_ptr = &ps_ed_ctxt->ai4_top_intra_modes_ic_ctb[0];
1052 nbr_flags_ptr = &nbr_flags[0] + 2 * 8 * i;
1053
1054 for(j = 0; j < num_4x4_blks_x / 2; j++)
1055 {
1056 WORD32 i4_best_satd;
1057 // Multiply i by 16 since the
1058 // matrix is prepared for ctb_size = 64
1059 z_scan_idx = gau1_ctb_raster_to_zscan[i * 2 * 16 + j * 2];
1060 z_scan_act_idx = gau1_ctb_raster_to_zscan[i * 16 + j];
1061 ASSERT(z_scan_act_idx <= 15);
1062 ps_ed_8x8 = ps_ed_ctb + z_scan_idx;
1063
1064 ihevce_ed_calc_8x8_blk(
1065 ps_ed_ctxt,
1066 ps_ed_8x8,
1067 pu1_src_8x8,
1068 src_stride,
1069 nbr_flags_ptr,
1070 top_intra_mode_ptr,
1071 left_intra_mode_ptr,
1072 i * 8,
1073 lambda,
1074 sad_ptr + z_scan_idx * NUM_MODES,
1075 &i4_best_satd,
1076 i4_layer_id,
1077 ps_ed_ctxt->i4_quality_preset,
1078 ps_ed_ctxt->i4_slice_type,
1079 &i4_best_sad_cost_8x8_l1_ipe,
1080 &i4_best_sad_8x8_l1_ipe,
1081 &i4_sum_4x4_satd,
1082 &i4_min_4x4_satd,
1083 ps_ipe_optimised_function_list,
1084 ps_cmn_utils_optimised_function_list);
1085
1086 ASSERT(i4_best_satd >= 0);
1087 if(i4_layer_id == 1)
1088 {
1089 ps_ed_ctb_l1->i4_best_sad_cost_8x8_l1_ipe[z_scan_act_idx] =
1090 i4_best_sad_cost_8x8_l1_ipe;
1091 ps_ed_ctb_l1->i4_best_sad_8x8_l1_ipe[z_scan_act_idx] = i4_best_sad_8x8_l1_ipe;
1092 ps_ed_ctb_l1->i4_best_satd_8x8[z_scan_act_idx] = i4_best_satd;
1093 ps_ed_ctxt->i8_sum_best_satd += i4_best_satd;
1094 ps_ed_ctxt->i8_sum_sq_best_satd += (i4_best_satd * i4_best_satd);
1095 //ps_ed_ctb_l1->i4_sum_4x4_satd[z_scan_act_idx] = i4_sum_4x4_satd;
1096 //ps_ed_ctb_l1->i4_min_4x4_satd[z_scan_act_idx] = i4_min_4x4_satd;
1097 }
1098
1099 pu1_src_8x8 += 8;
1100 //ps_ed_8x8 += 4;
1101 top_intra_mode_ptr += 2;
1102 nbr_flags_ptr += 2;
1103 }
1104
1105 next_left_intra_mode = left_intra_mode_ptr[0];
1106 left_over_4x4_blks = (num_4x4_blks_x - (2 * (num_4x4_blks_x / 2)));
1107 left_over_4x4_blks = left_over_4x4_blks * 2;
1108
1109 pu1_src_4x4 = pu1_src_8x8;
1110
1111 i4_incomplete_sum_4x4_satd = 0;
1112 i4_incomplete_min_4x4_satd = 0x7FFFFFFF;
1113
1114 /* For leftover right 4x4 blks (num_4x4_blks_x - 2 *(num_4x4_blks_x/2))*/
1115 for(k = 0; k < left_over_4x4_blks; k++)
1116 {
1117 WORD32 i4_best_satd;
1118 WORD32 i4_dummy_sad_cost;
1119 // Multiply i by 16 since the
1120 // matrix is prepared for ctb_size = 64
1121 ASSERT(left_over_4x4_blks == 2);
1122 z_scan_idx = gau1_ctb_raster_to_zscan[i * 2 * 16 + k * 16 + j * 2];
1123 ps_ed_4x4 = ps_ed_ctb + z_scan_idx;
1124
1125 top_intra_mode = ps_ed_ctxt->ai4_top_intra_modes_ic_ctb[j];
1126 left_intra_mode = next_left_intra_mode;
1127
1128 nbr_flag = nbr_flags[i * 2 * 8 + k * 8 + j * 2];
1129
1130 /* call the function which populates ref data for intra predicion */
1131 pf_intra_pred_luma_ref_substitution(
1132 pu1_src_4x4 - src_stride - 1,
1133 pu1_src_4x4 - src_stride,
1134 pu1_src_4x4 - 1,
1135 src_stride,
1136 4,
1137 nbr_flag,
1138 &ps_ed_ctxt->au1_ref_ic_ctb[0],
1139 0);
1140
1141 top_available = CHECK_T_AVAILABLE(nbr_flag);
1142 left_available = CHECK_L_AVAILABLE(nbr_flag);
1143 /* call the function which populates sad cost for all the modes */
1144 ihevce_intra_populate_mode_bits_cost(
1145 top_intra_mode,
1146 left_intra_mode,
1147 top_available,
1148 left_available,
1149 i * 4,
1150 &ps_ed_ctxt->au2_mode_bits_cost_ic_ctb[0],
1151 lambda);
1152
1153 ihevce_ed_calc_4x4_blk(
1154 ps_ed_4x4,
1155 pu1_src_4x4,
1156 src_stride,
1157 &ps_ed_ctxt->au1_ref_ic_ctb[0],
1158 &ps_ed_ctxt->au2_mode_bits_cost_ic_ctb[0],
1159 sad_ptr + z_scan_idx * NUM_MODES,
1160 &i4_best_satd,
1161 ps_ed_ctxt->i4_quality_preset,
1162 &i4_dummy_sad_cost,
1163 ps_ipe_optimised_function_list);
1164
1165 ASSERT(i4_best_satd >= 0);
1166 if(i4_layer_id == 1) //Can we ignore this check?
1167 {
1168 z_scan_act_idx = gau1_ctb_raster_to_zscan[i * 16 + j];
1169 /*Note : The satd population is not populated for last 4*4 block in incomplete CTB */
1170 /* Which corresponds to CU 8 in L0 */
1171
1172 /*MAM_VAR_L1 */
1173 i4_incomplete_sum_4x4_satd = i4_incomplete_sum_4x4_satd + i4_best_satd;
1174 if(i4_incomplete_min_4x4_satd >= i4_best_satd)
1175 i4_incomplete_min_4x4_satd = i4_best_satd;
1176 ps_ed_ctxt->i8_sum_best_satd += i4_best_satd;
1177 ps_ed_ctxt->i8_sum_sq_best_satd += (i4_best_satd * i4_best_satd);
1178 if((k & 1) == 0)
1179 {
1180 ps_ed_ctb_l1->i4_best_satd_8x8[z_scan_act_idx] = 0;
1181 }
1182 ps_ed_ctb_l1->i4_best_satd_8x8[z_scan_act_idx] += i4_best_satd;
1183 }
1184
1185 ps_ed_ctxt->ai4_top_intra_modes_ic_ctb[j * 2] = ps_ed_4x4->best_mode;
1186 next_left_intra_mode = ps_ed_4x4->best_mode;
1187 pu1_src_4x4 += src_stride;
1188 left_intra_mode_ptr[k] = next_left_intra_mode;
1189 }
1190 left_intra_mode_ptr += 2;
1191 }
1192
1193 if(num_4x4_blks_y & 1)
1194 {
1195 /* For leftover bottom 4x4 blks. (num_4x4_blks_x) */
1196 pu1_src_4x4 = pu1_src + i * 2 * 4 * src_stride;
1197 //memset(&ps_ed_ctb_l1->i4_best_satd_8x8[i][0],0,4*sizeof(WORD32));
1198 for(j = 0; j < num_4x4_blks_x; j++)
1199 {
1200 WORD32 i4_best_satd;
1201 WORD32 i4_dummy_sad_cost;
1202 // Multiply i by 16 since the
1203 // matrix is prepared for ctb_size = 64
1204 z_scan_idx = gau1_ctb_raster_to_zscan[i * 2 * 16 + j];
1205 ps_ed_4x4 = ps_ed_ctb + z_scan_idx;
1206
1207 if((j & 1) == 0)
1208 {
1209 i4_incomplete_sum_4x4_satd = 0;
1210 i4_incomplete_min_4x4_satd = 0x7FFFFFFF;
1211 }
1212
1213 top_intra_mode = ps_ed_ctxt->ai4_top_intra_modes_ic_ctb[j];
1214 left_intra_mode = next_left_intra_mode;
1215
1216 nbr_flag = nbr_flags[i * 2 * 8 + j];
1217
1218 /* call the function which populates ref data for intra predicion */
1219 pf_intra_pred_luma_ref_substitution(
1220 pu1_src_4x4 - src_stride - 1,
1221 pu1_src_4x4 - src_stride,
1222 pu1_src_4x4 - 1,
1223 src_stride,
1224 4,
1225 nbr_flag,
1226 &ps_ed_ctxt->au1_ref_ic_ctb[0],
1227 0);
1228
1229 top_available = CHECK_T_AVAILABLE(nbr_flag);
1230 left_available = CHECK_L_AVAILABLE(nbr_flag);
1231 /* call the function which populates sad cost for all the modes */
1232 ihevce_intra_populate_mode_bits_cost(
1233 top_intra_mode,
1234 left_intra_mode,
1235 top_available,
1236 left_available,
1237 i * 4,
1238 &ps_ed_ctxt->au2_mode_bits_cost_ic_ctb[0],
1239 lambda);
1240
1241 ihevce_ed_calc_4x4_blk(
1242 ps_ed_4x4,
1243 pu1_src_4x4,
1244 src_stride,
1245 &ps_ed_ctxt->au1_ref_ic_ctb[0],
1246 &ps_ed_ctxt->au2_mode_bits_cost_ic_ctb[0],
1247 sad_ptr + z_scan_idx * NUM_MODES,
1248 &i4_best_satd,
1249 ps_ed_ctxt->i4_quality_preset,
1250 &i4_dummy_sad_cost,
1251 ps_ipe_optimised_function_list);
1252
1253 /*Note : The satd population is not populated for last 4*4 block in incomplete CTB */
1254 /* Which corresponds to CU 8 in L0 */
1255
1256 /*MAM_VAR_L1 */
1257 ASSERT(i4_best_satd >= 0);
1258 if(i4_layer_id == 1) //Can we ignore this check?
1259 {
1260 z_scan_act_idx = gau1_ctb_raster_to_zscan[i * 16 + (j >> 1)];
1261 if((j & 1) == 0)
1262 {
1263 ps_ed_ctb_l1->i4_best_satd_8x8[z_scan_act_idx] = 0;
1264 }
1265 ps_ed_ctb_l1->i4_best_satd_8x8[z_scan_act_idx] += i4_best_satd;
1266 ps_ed_ctxt->i8_sum_best_satd += i4_best_satd;
1267 ps_ed_ctxt->i8_sum_sq_best_satd += (i4_best_satd * i4_best_satd);
1268 i4_incomplete_sum_4x4_satd = i4_incomplete_sum_4x4_satd + i4_best_satd;
1269 if(i4_incomplete_min_4x4_satd >= i4_best_satd)
1270 i4_incomplete_min_4x4_satd = i4_best_satd;
1271 }
1272
1273 ps_ed_ctxt->ai4_top_intra_modes_ic_ctb[j] = ps_ed_4x4->best_mode;
1274 next_left_intra_mode = ps_ed_4x4->best_mode;
1275 pu1_src_4x4 += 4;
1276 }
1277 }
1278 left_intra_mode_ptr[0] = next_left_intra_mode;
1279 }
1280
1281 /*!
1282 ******************************************************************************
1283 * \if Function name : ihevce_cu_level_qp_mod \endif
1284 *
1285 * \brief: Performs CU level QP modulation
1286 *
1287 *****************************************************************************
1288 */
/* Performs CU level QP modulation: scales the incoming qscale by an activity
 * factor derived from the block SATD relative to the frame-average log
 * activity, clamps the result to the rate-control min/max qscale, and returns
 * the corresponding QP (also clamped to min/max QP).
 *
 * i4_qscale                  : base quantiser scale from rate control
 * i4_satd                    : block SATD; -1 means "not available" (no modulation)
 * ld_curr_frame_log_avg_act  : frame-level average log2 activity
 * f_mod_strength             : modulation strength multiplier
 * pi4_act_factor             : [out] activity factor in QP_LEVEL_MOD_ACT_FACTOR fixed point
 * pi4_q_scale_mod            : [out] modulated (and clamped) qscale
 * ps_rc_quant_ctxt           : rate-control quant context (clamp limits, qscale->qp LUT)
 * returns                    : modulated QP
 */
WORD32 ihevce_cu_level_qp_mod(
    WORD32 i4_qscale,
    WORD32 i4_satd,
    long double ld_curr_frame_log_avg_act,
    float f_mod_strength,
    WORD32 *pi4_act_factor,
    WORD32 *pi4_q_scale_mod,
    rc_quant_t *ps_rc_quant_ctxt)
{
    WORD32 i4_temp_qscale;
    WORD32 i4_temp_qp;

    if(i4_satd != -1)
    {
        WORD32 i4_loc_satd = i4_satd;
        /* floor at 1 so the log2 of the square is well defined */
        if(i4_loc_satd < 1)
        {
            i4_loc_satd = 1;
        }
        if((WORD32)ld_curr_frame_log_avg_act == 0)
        {
            *pi4_act_factor = (1 << (QP_LEVEL_MOD_ACT_FACTOR));
        }
        else
        {
            UWORD32 u4_log2_sq_cur_satd;
            ULWORD64 u8_sq_cur_satd;
            WORD32 qp_offset;

            ASSERT(USE_SQRT_AVG_OF_SATD_SQR);
            /* BUGFIX: promote to 64 bits BEFORE multiplying; the original
             * 32-bit signed multiply overflows (UB) for SATD > 46340 */
            u8_sq_cur_satd = ((ULWORD64)i4_loc_satd * (ULWORD64)i4_loc_satd);
            GET_POS_MSB_64(u4_log2_sq_cur_satd, u8_sq_cur_satd);
            /* round log2 to the nearest integer: bump it up if the value is
             * closer to 2^(n+1) than to 2^n (compared at the 2^(n±1/4) midpoints).
             * BUGFIX: use 1ULL — the MSB position of a 64-bit square can
             * exceed 31, so a plain int shift would be UB */
            if(ABS((
                   long double)(((1ULL << u4_log2_sq_cur_satd) * POW_2_TO_1_BY_4) - ((long double)u8_sq_cur_satd))) >
               ABS((
                   long double)(((1ULL << u4_log2_sq_cur_satd) * POW_2_TO_3_BY_4) - ((long double)u8_sq_cur_satd))))
            {
                u4_log2_sq_cur_satd += 1;
            }
            /* offset grows with activity above the frame average */
            qp_offset = (WORD32)(
                f_mod_strength *
                (float)((long double)u4_log2_sq_cur_satd - ld_curr_frame_log_avg_act));
            qp_offset = CLIP3(qp_offset, MIN_QP_MOD_OFFSET, MAX_QP_MOD_OFFSET);
            *pi4_act_factor = (WORD32)(
                gad_look_up_activity[qp_offset + ABS(MIN_QP_MOD_OFFSET)] *
                (1 << QP_LEVEL_MOD_ACT_FACTOR));
        }

        ASSERT(*pi4_act_factor > 0);
        /* fixed-point multiply with rounding */
        i4_temp_qscale = ((i4_qscale * (*pi4_act_factor)) + (1 << (QP_LEVEL_MOD_ACT_FACTOR - 1))) >>
                         QP_LEVEL_MOD_ACT_FACTOR;
    }
    else
    {
        /* no SATD available: pass qscale through with unity activity factor */
        i4_temp_qscale = i4_qscale;
        *pi4_act_factor = (1 << QP_LEVEL_MOD_ACT_FACTOR);
    }
    ASSERT(*pi4_act_factor > 0);

    if(i4_temp_qscale > ps_rc_quant_ctxt->i2_max_qscale)
    {
        i4_temp_qscale = ps_rc_quant_ctxt->i2_max_qscale;
    }
    else if(i4_temp_qscale < ps_rc_quant_ctxt->i2_min_qscale)
    {
        i4_temp_qscale = ps_rc_quant_ctxt->i2_min_qscale;
    }
    /*store q scale for stat gen for I frame model*/
    /*Here activity factor is not modified as the cu qp would be clipped in rd-opt stage*/
    *pi4_q_scale_mod = i4_temp_qscale;
    i4_temp_qp = ps_rc_quant_ctxt->pi4_qscale_to_qp[i4_temp_qscale];
    if(i4_temp_qp > ps_rc_quant_ctxt->i2_max_qp)
    {
        i4_temp_qp = ps_rc_quant_ctxt->i2_max_qp;
    }
    else if(i4_temp_qp < ps_rc_quant_ctxt->i2_min_qp)
    {
        i4_temp_qp = ps_rc_quant_ctxt->i2_min_qp;
    }
    return (i4_temp_qp);
}
1370
1371 /*!
1372 ******************************************************************************
1373 * \if Function name : ihevce_ed_calc_ctb \endif
1374 *
1375 * \brief: performs L1 8x8 and 4x4 intra mode analysis
1376 *
1377 *****************************************************************************
1378 */
ihevce_ed_calc_ctb(ihevce_ed_ctxt_t * ps_ed_ctxt,ihevce_ed_blk_t * ps_ed_ctb,ihevce_ed_ctb_l1_t * ps_ed_ctb_l1,UWORD8 * pu1_src,WORD32 src_stride,WORD32 num_4x4_blks_x,WORD32 num_4x4_blks_y,WORD32 * nbr_flags,WORD32 i4_layer_id,WORD32 i4_row_block_no,WORD32 i4_col_block_no,ihevce_ipe_optimised_function_list_t * ps_ipe_optimised_function_list,ihevce_cmn_opt_func_t * ps_cmn_utils_optimised_function_list)1379 void ihevce_ed_calc_ctb(
1380 ihevce_ed_ctxt_t *ps_ed_ctxt,
1381 ihevce_ed_blk_t *ps_ed_ctb,
1382 ihevce_ed_ctb_l1_t *ps_ed_ctb_l1,
1383 UWORD8 *pu1_src,
1384 WORD32 src_stride,
1385 WORD32 num_4x4_blks_x,
1386 WORD32 num_4x4_blks_y,
1387 WORD32 *nbr_flags,
1388 WORD32 i4_layer_id,
1389 WORD32 i4_row_block_no,
1390 WORD32 i4_col_block_no,
1391 ihevce_ipe_optimised_function_list_t *ps_ipe_optimised_function_list,
1392 ihevce_cmn_opt_func_t *ps_cmn_utils_optimised_function_list)
1393 {
1394 WORD32 i, j;
1395 WORD32 z_scan_idx = 0;
1396 WORD32 z_scan_act_idx = 0;
1397 ihevce_ed_blk_t *ps_ed_8x8;
1398 UWORD8 *pu1_src_8x8;
1399
1400 WORD32 top_intra_modes[20];
1401 WORD32 *top_intra_mode_ptr;
1402 WORD32 *left_intra_mode_ptr = ps_ed_ctxt->left_ctb_intra_modes;
1403
1404 WORD32 *sad_ptr = &ps_ed_ctxt->sad[0];
1405 WORD32 lambda = ps_ed_ctxt->lambda;
1406 WORD32 *nbr_flags_ptr;
1407 WORD32 i4_best_sad_cost_8x8_l1_ipe, i4_best_sad_8x8_l1_ipe, i4_sum_4x4_satd, i4_min_4x4_satd;
1408
1409 (void)num_4x4_blks_y;
1410 (void)i4_row_block_no;
1411 (void)i4_col_block_no;
1412 ASSERT(num_4x4_blks_x % 2 == 0);
1413 ASSERT(num_4x4_blks_y % 2 == 0);
1414 ASSERT((num_4x4_blks_x == 4) || (num_4x4_blks_x == 8));
1415 ASSERT((num_4x4_blks_y == 4) || (num_4x4_blks_y == 8));
1416
1417 if(i4_layer_id == 1)
1418 {
1419 WORD32 i4_i;
1420
1421 for(i4_i = 0; i4_i < 64; i4_i++)
1422 {
1423 (ps_ed_ctb + i4_i)->i4_4x4_satd = -1;
1424 (ps_ed_ctb + i4_i)->i4_4x4_cur_satd = -1;
1425 }
1426
1427 for(i4_i = 0; i4_i < 16; i4_i++)
1428 {
1429 ps_ed_ctb_l1->i4_sum_4x4_satd[i4_i] = -2;
1430 ps_ed_ctb_l1->i4_min_4x4_satd[i4_i] = 0x7FFFFFFF;
1431 ps_ed_ctb_l1->i4_8x8_satd[i4_i][0] = -2;
1432 ps_ed_ctb_l1->i4_8x8_satd[i4_i][1] = -2;
1433 }
1434
1435 for(i4_i = 0; i4_i < 4; i4_i++)
1436 {
1437 ps_ed_ctb_l1->i4_16x16_satd[i4_i][0] = -2;
1438 ps_ed_ctb_l1->i4_16x16_satd[i4_i][1] = -2;
1439 ps_ed_ctb_l1->i4_16x16_satd[i4_i][2] = -2;
1440 }
1441 ps_ed_ctb_l1->i4_32x32_satd[0][0] = -2;
1442 ps_ed_ctb_l1->i4_32x32_satd[0][1] = -2;
1443 ps_ed_ctb_l1->i4_32x32_satd[0][2] = -2;
1444 ps_ed_ctb_l1->i4_32x32_satd[0][3] = -2;
1445 for(i4_i = 0; i4_i < 16; i4_i++)
1446 {
1447 ps_ed_ctb_l1->i4_best_sad_cost_8x8_l1_me[i4_i] = -2;
1448 ps_ed_ctb_l1->i4_sad_cost_me_for_ref[i4_i] = -2;
1449 ps_ed_ctb_l1->i4_sad_me_for_ref[i4_i] = -2;
1450 ps_ed_ctb_l1->i4_best_sad_8x8_l1_me[i4_i] = -2;
1451
1452 ps_ed_ctb_l1->i4_best_sad_8x8_l1_me_for_decide[i4_i] = -2;
1453
1454 ps_ed_ctb_l1->i4_best_satd_8x8[i4_i] = -2;
1455 ps_ed_ctb_l1->i4_best_sad_cost_8x8_l1_ipe[i4_i] = -2;
1456 ps_ed_ctb_l1->i4_best_sad_8x8_l1_ipe[i4_i] = -2;
1457 }
1458 }
1459 /*
1460 * src scan happens in raster scan order. ps_ed update happens in z-scan order.
1461 */
1462 for(i = 0; i < num_4x4_blks_x; i++)
1463 {
1464 top_intra_modes[i] = INTRA_DC;
1465 }
1466 for(i = 0; i < num_4x4_blks_x / 2; i++)
1467 {
1468 pu1_src_8x8 = pu1_src + i * 2 * 4 * src_stride;
1469 top_intra_mode_ptr = &top_intra_modes[0];
1470 nbr_flags_ptr = &nbr_flags[0] + 2 * 8 * i;
1471
1472 for(j = 0; j < num_4x4_blks_x / 2; j++)
1473 {
1474 WORD32 i4_best_satd;
1475 ASSERT(i <= 3);
1476 ASSERT(j <= 3);
1477
1478 // Multiply i by 16 since the
1479 // matrix is prepared for ctb_size = 64
1480 z_scan_idx = gau1_ctb_raster_to_zscan[i * 2 * 16 + j * 2];
1481 z_scan_act_idx = gau1_ctb_raster_to_zscan[i * 16 + j];
1482 ASSERT(z_scan_act_idx <= 15);
1483
1484 ps_ed_8x8 = ps_ed_ctb + z_scan_idx;
1485
1486 ihevce_ed_calc_8x8_blk(
1487 ps_ed_ctxt,
1488 ps_ed_8x8,
1489 pu1_src_8x8,
1490 src_stride,
1491 nbr_flags_ptr,
1492 top_intra_mode_ptr,
1493 left_intra_mode_ptr,
1494 i * 8,
1495 lambda,
1496 sad_ptr + z_scan_idx * NUM_MODES,
1497 &i4_best_satd,
1498 i4_layer_id,
1499 ps_ed_ctxt->i4_quality_preset,
1500 ps_ed_ctxt->i4_slice_type,
1501 &i4_best_sad_cost_8x8_l1_ipe,
1502 &i4_best_sad_8x8_l1_ipe,
1503 &i4_sum_4x4_satd,
1504 &i4_min_4x4_satd,
1505 ps_ipe_optimised_function_list,
1506 ps_cmn_utils_optimised_function_list);
1507
1508 if(i4_layer_id == 1)
1509 {
1510 ps_ed_ctb_l1->i4_best_sad_cost_8x8_l1_ipe[z_scan_act_idx] =
1511 i4_best_sad_cost_8x8_l1_ipe;
1512 ps_ed_ctb_l1->i4_best_sad_8x8_l1_ipe[z_scan_act_idx] = i4_best_sad_8x8_l1_ipe;
1513 ps_ed_ctb_l1->i4_best_satd_8x8[z_scan_act_idx] = i4_best_satd;
1514 ps_ed_ctxt->i8_sum_best_satd += i4_best_satd;
1515 ps_ed_ctxt->i8_sum_sq_best_satd += (i4_best_satd * i4_best_satd);
1516 //ps_ed_ctb_l1->i4_sum_4x4_satd[z_scan_act_idx] = i4_sum_4x4_satd;
1517 //ps_ed_ctb_l1->i4_min_4x4_satd[z_scan_act_idx] = i4_min_4x4_satd;
1518 }
1519
1520 pu1_src_8x8 += 8;
1521 //ps_ed_8x8 += 4;
1522 top_intra_mode_ptr += 2;
1523 nbr_flags_ptr += 2;
1524 }
1525 left_intra_mode_ptr += 2;
1526 }
1527 }
1528
1529 /*!
1530 ******************************************************************************
1531 * \if Function name : ihevce_ed_frame_init \endif
1532 *
1533 * \brief: Initialize frame context for early decision
1534 *
1535 *****************************************************************************
1536 */
/* Frame-level init for the early-decision pass: wires the global luma
 * intra-prediction dispatch table to the run-time selected (C/SIMD) kernels,
 * and resets the layer-1 SATD accumulators used for QP modulation. */
void ihevce_ed_frame_init(void *pv_ed_ctxt, WORD32 i4_layer_no)
{
    ihevce_ed_ctxt_t *ps_ed_ctxt = (ihevce_ed_ctxt_t *)pv_ed_ctxt;

/* shorthand for the run-time selected function table */
#define ED_FPTR(member) (ps_ed_ctxt->ps_func_selector->member)

    g_apf_lum_ip[IP_FUNC_MODE_0] = ED_FPTR(ihevc_intra_pred_luma_planar_fptr);
    g_apf_lum_ip[IP_FUNC_MODE_1] = ED_FPTR(ihevc_intra_pred_luma_dc_fptr);
    g_apf_lum_ip[IP_FUNC_MODE_2] = ED_FPTR(ihevc_intra_pred_luma_mode2_fptr);
    g_apf_lum_ip[IP_FUNC_MODE_3TO9] = ED_FPTR(ihevc_intra_pred_luma_mode_3_to_9_fptr);
    g_apf_lum_ip[IP_FUNC_MODE_10] = ED_FPTR(ihevc_intra_pred_luma_horz_fptr);
    g_apf_lum_ip[IP_FUNC_MODE_11TO17] = ED_FPTR(ihevc_intra_pred_luma_mode_11_to_17_fptr);
    g_apf_lum_ip[IP_FUNC_MODE_18_34] = ED_FPTR(ihevc_intra_pred_luma_mode_18_34_fptr);
    g_apf_lum_ip[IP_FUNC_MODE_19TO25] = ED_FPTR(ihevc_intra_pred_luma_mode_19_to_25_fptr);
    g_apf_lum_ip[IP_FUNC_MODE_26] = ED_FPTR(ihevc_intra_pred_luma_ver_fptr);
    g_apf_lum_ip[IP_FUNC_MODE_27TO33] = ED_FPTR(ihevc_intra_pred_luma_mode_27_to_33_fptr);

#undef ED_FPTR

    /* SATD statistics are only accumulated while analysing layer 1 */
    if(1 == i4_layer_no)
    {
        ps_ed_ctxt->i8_sum_best_satd = 0;
        ps_ed_ctxt->i8_sum_sq_best_satd = 0;
    }
}
1563
1564 /**
1565 ********************************************************************************
1566 *
1567 * @brief downscales by 2 in horz and vertical direction, creates output of
1568 * size wd/2 * ht/2
1569 *
1570 * @param[in] pu1_src : source pointer
1571 * @param[in] src_stride : source stride
1572 * @param[out] pu1_dst : destination pointer. Starting of a row.
1573 * @param[in] dst_stride : destination stride
1574 * @param[in] wd : width
1575 * @param[in] ht : height
1576 * @param[in] pu1_wkg_mem : working memory (atleast of size CEIL16(wd) * ht))
1577 * @param[in] ht_offset : height offset of the block to be scaled
1578 * @param[in] block_ht : height of the block to be scaled
1579 * @param[in] wd_offset : width offset of the block to be scaled
1580 * @param[in] block_wd : width of the block to be scaled
1581 *
1582 * @return void
1583 *
 * @remarks Assumption made: block_ht should be a multiple of 2. LANCZOS_SCALER
1585 *
1586 ********************************************************************************
1587 */
/* 2x decimation via a separable symmetric 7-tap low-pass filter.
 * Stage 1 filters horizontally (taking every second column) into the scratch
 * buffer, processing ht+5 rows so stage 2 has 3 rows of context above and
 * below; stage 2 filters vertically (taking every second row) into pu1_dst.
 * The caller must guarantee 3 pixels of valid data around the block. */
void ihevce_scaling_filter_mxn(
    UWORD8 *pu1_src,
    WORD32 src_strd,
    UWORD8 *pu1_scrtch,
    WORD32 scrtch_strd,
    UWORD8 *pu1_dst,
    WORD32 dst_strd,
    WORD32 ht,
    WORD32 wd)
{
#define FILT_TAP_Q 8
#define N_TAPS 7
    /* symmetric taps: coeff for offset 0 is [3], offsets ±1 use [2], ±2 use [1], ±3 use [0] */
    const WORD16 ai2_taps[N_TAPS] = { -18, 0, 80, 132, 80, 0, -18 };
    WORD32 row, col;
    WORD32 accum;
    UWORD8 *pu1_in_row = pu1_src - 3 * src_strd;
    UWORD8 *pu1_mid_row = pu1_scrtch;
    UWORD8 *pu1_out_row = pu1_dst;

    /* Stage 1: horizontal filter + 2:1 horizontal decimation, rows -3 .. ht+1 */
    for(row = -3; row < ht + 2; row++)
    {
        for(col = 0; col < wd; col += 2)
        {
            accum = ai2_taps[3] * pu1_in_row[col];
            accum += ai2_taps[2] * (pu1_in_row[col - 1] + pu1_in_row[col + 1]);
            accum += ai2_taps[1] * (pu1_in_row[col + 2] + pu1_in_row[col - 2]);
            accum += ai2_taps[0] * (pu1_in_row[col + 3] + pu1_in_row[col - 3]);
            accum = (accum + (1 << (FILT_TAP_Q - 1))) >> FILT_TAP_Q;
            pu1_mid_row[col >> 1] = CLIP_U8(accum);
        }
        pu1_mid_row += scrtch_strd;
        pu1_in_row += src_strd;
    }

    /* Stage 2: vertical filter + 2:1 vertical decimation; scratch row 0 of the
     * block sits 3 rows into the scratch buffer */
    pu1_mid_row = pu1_scrtch + 3 * scrtch_strd;
    for(row = 0; row < ht; row += 2)
    {
        for(col = 0; col < (wd >> 1); col++)
        {
            accum = ai2_taps[3] * pu1_mid_row[col];
            accum += ai2_taps[2] *
                     (pu1_mid_row[col + scrtch_strd] + pu1_mid_row[col - scrtch_strd]);
            accum += ai2_taps[1] *
                     (pu1_mid_row[col + 2 * scrtch_strd] + pu1_mid_row[col - 2 * scrtch_strd]);
            accum += ai2_taps[0] *
                     (pu1_mid_row[col + 3 * scrtch_strd] + pu1_mid_row[col - 3 * scrtch_strd]);
            accum = (accum + (1 << (FILT_TAP_Q - 1))) >> FILT_TAP_Q;
            pu1_out_row[col] = CLIP_U8(accum);
        }
        pu1_out_row += dst_strd;
        pu1_mid_row += (scrtch_strd << 1);
    }
}
1643
/* Downscales one block of the picture by 2 in both directions into pu1_dst,
 * handling frame-boundary padding: boundary blocks are copied into a local
 * buffer with 3-pixel edge replication before filtering, and the scaled
 * output is padded (left/right/top/bottom) once the relevant frame edges are
 * reached so later stages can read beyond the scaled picture. */
void ihevce_scale_by_2(
    UWORD8 *pu1_src,
    WORD32 src_strd,
    UWORD8 *pu1_dst,
    WORD32 dst_strd,
    WORD32 wd,
    WORD32 ht,
    UWORD8 *pu1_wkg_mem,
    WORD32 ht_offset,
    WORD32 block_ht,
    WORD32 wd_offset,
    WORD32 block_wd,
    FT_COPY_2D *pf_copy_2d,
    FT_SCALING_FILTER_BY_2 *pf_scaling_filter_mxn)
{
#define N_TAPS 7
#define MAX_BLK_SZ (MAX_CTB_SIZE + ((N_TAPS >> 1) << 1))
    /* local copy buffer with a 3-pixel apron on every side (N_TAPS/2 == 3) */
    UWORD8 au1_cpy[MAX_BLK_SZ * MAX_BLK_SZ];
    UWORD32 cpy_strd = MAX_BLK_SZ;
    UWORD8 *pu1_cpy = au1_cpy + cpy_strd * (N_TAPS >> 1) + (N_TAPS >> 1);

    UWORD8 *pu1_in, *pu1_out;
    WORD32 in_strd, wkg_mem_strd;

    WORD32 row_start, row_end;
    WORD32 col_start, col_end;
    WORD32 i, fun_select;
    WORD32 ht_tmp, wd_tmp;
    FT_SCALING_FILTER_BY_2 *ihevce_scaling_filters[2];

    assert((wd & 1) == 0);
    assert((ht & 1) == 0);
    assert(block_wd <= MAX_CTB_SIZE);
    assert(block_ht <= MAX_CTB_SIZE);

    /* function pointers for filtering different dimensions:
       [0] = C fallback, [1] = optimised variant (needs block_wd % 16 == 0) */
    ihevce_scaling_filters[0] = ihevce_scaling_filter_mxn;
    ihevce_scaling_filters[1] = pf_scaling_filter_mxn;

    /* handle boundary blks: flag which of the 4 frame edges this block touches
       (i.e. where fewer than 3 pixels of real context exist) */
    col_start = (wd_offset < (N_TAPS >> 1)) ? 1 : 0;
    row_start = (ht_offset < (N_TAPS >> 1)) ? 1 : 0;
    col_end = ((wd_offset + block_wd) > (wd - (N_TAPS >> 1))) ? 1 : 0;
    row_end = ((ht_offset + block_ht) > (ht - (N_TAPS >> 1))) ? 1 : 0;
    /* last block of a row/column may be narrower/shorter than nominal */
    if(col_end && (wd % block_wd != 0))
    {
        block_wd = (wd % block_wd);
    }
    if(row_end && (ht % block_ht != 0))
    {
        block_ht = (ht % block_ht);
    }

    /* boundary blks needs to be padded, copy src to tmp buffer */
    if(col_start || col_end || row_end || row_start)
    {
        UWORD8 *pu1_src_tmp = pu1_src + wd_offset + ht_offset * src_strd;

        /* include the 3-pixel apron only on the sides where real data exists */
        pu1_cpy -= (3 * (1 - col_start) + cpy_strd * 3 * (1 - row_start));
        pu1_src_tmp -= (3 * (1 - col_start) + src_strd * 3 * (1 - row_start));
        ht_tmp = block_ht + 3 * (1 - row_start) + 3 * (1 - row_end);
        wd_tmp = block_wd + 3 * (1 - col_start) + 3 * (1 - col_end);
        pf_copy_2d(pu1_cpy, cpy_strd, pu1_src_tmp, src_strd, wd_tmp, ht_tmp);
        pu1_in = au1_cpy + cpy_strd * 3 + 3;
        in_strd = cpy_strd;
    }
    else
    {
        /* interior block: filter straight from the source picture */
        pu1_in = pu1_src + wd_offset + ht_offset * src_strd;
        in_strd = src_strd;
    }

    /*top padding: replicate the first valid row into the 3 apron rows*/
    if(row_start)
    {
        UWORD8 *pu1_cpy_tmp = au1_cpy + cpy_strd * 3;

        pu1_cpy = au1_cpy + cpy_strd * (3 - 1);
        memcpy(pu1_cpy, pu1_cpy_tmp, block_wd + 6);
        pu1_cpy -= cpy_strd;
        memcpy(pu1_cpy, pu1_cpy_tmp, block_wd + 6);
        pu1_cpy -= cpy_strd;
        memcpy(pu1_cpy, pu1_cpy_tmp, block_wd + 6);
    }

    /*bottom padding: replicate the last valid row into the 3 apron rows*/
    if(row_end)
    {
        UWORD8 *pu1_cpy_tmp = au1_cpy + cpy_strd * 3 + (block_ht - 1) * cpy_strd;

        pu1_cpy = pu1_cpy_tmp + cpy_strd;
        memcpy(pu1_cpy, pu1_cpy_tmp, block_wd + 6);
        pu1_cpy += cpy_strd;
        memcpy(pu1_cpy, pu1_cpy_tmp, block_wd + 6);
        pu1_cpy += cpy_strd;
        memcpy(pu1_cpy, pu1_cpy_tmp, block_wd + 6);
    }

    /*left padding: replicate the first valid column into the 3 apron columns*/
    if(col_start)
    {
        UWORD8 *pu1_cpy_tmp = au1_cpy + 3;

        pu1_cpy = au1_cpy;
        for(i = 0; i < block_ht + 6; i++)
        {
            pu1_cpy[0] = pu1_cpy[1] = pu1_cpy[2] = pu1_cpy_tmp[0];
            pu1_cpy += cpy_strd;
            pu1_cpy_tmp += cpy_strd;
        }
    }

    /*right padding: replicate the last valid column into the 3 apron columns*/
    if(col_end)
    {
        UWORD8 *pu1_cpy_tmp = au1_cpy + 3 + block_wd - 1;

        pu1_cpy = au1_cpy + 3 + block_wd;
        for(i = 0; i < block_ht + 6; i++)
        {
            pu1_cpy[0] = pu1_cpy[1] = pu1_cpy[2] = pu1_cpy_tmp[0];
            pu1_cpy += cpy_strd;
            pu1_cpy_tmp += cpy_strd;
        }
    }

    /* run the 2:1 decimation filter; scaled output lands at (wd_offset/2) */
    wkg_mem_strd = block_wd >> 1;
    pu1_out = pu1_dst + (wd_offset >> 1);
    fun_select = (block_wd % 16 == 0);
    ihevce_scaling_filters[fun_select](
        pu1_in, in_strd, pu1_wkg_mem, wkg_mem_strd, pu1_out, dst_strd, block_ht, block_wd);

    /* Left padding of 16 for 1st block of every row */
    if(wd_offset == 0)
    {
        UWORD8 u1_val;
        WORD32 pad_wd = 16;
        WORD32 pad_ht = block_ht >> 1;
        UWORD8 *dst = pu1_dst;

        for(i = 0; i < pad_ht; i++)
        {
            u1_val = dst[0];
            memset(&dst[-pad_wd], u1_val, pad_wd);
            dst += dst_strd;
        }
    }

    if(wd == wd_offset + block_wd)
    {
        /* Right padding of (16 + (CEIL16(wd/2))-wd/2) for last block of every row */
        /* Right padding is done only after processing of last block of that row is done*/
        UWORD8 u1_val;
        WORD32 pad_wd = 16 + CEIL16((wd >> 1)) - (wd >> 1) + 4;
        WORD32 pad_ht = block_ht >> 1;
        UWORD8 *dst = pu1_dst + (wd >> 1) - 1;

        for(i = 0; i < pad_ht; i++)
        {
            u1_val = dst[0];
            memset(&dst[1], u1_val, pad_wd);
            dst += dst_strd;
        }

        if(ht_offset == 0)
        {
            /* Top padding of 16 is done for 1st row only after we reach end of that row */
            WORD32 pad_wd = dst_strd;
            WORD32 pad_ht = 16;
            UWORD8 *dst = pu1_dst - 16;

            for(i = 1; i <= pad_ht; i++)
            {
                memcpy(dst - (i * dst_strd), dst, pad_wd);
            }
        }

        /* Bottom padding of (16 + (CEIL16(ht/2)) - ht/2) is done only if we have
        reached end of frame */
        if(ht - ht_offset - block_ht == 0)
        {
            WORD32 pad_wd = dst_strd;
            WORD32 pad_ht = 16 + CEIL16((ht >> 1)) - (ht >> 1) + 4;
            UWORD8 *dst = pu1_dst + (((block_ht >> 1) - 1) * dst_strd) - 16;

            for(i = 1; i <= pad_ht; i++)
                memcpy(dst + (i * dst_strd), dst, pad_wd);
        }
    }
}
1834
1835 /*!
1836 ******************************************************************************
1837 * \if Function name : ihevce_decomp_pre_intra_process_row \endif
1838 *
1839 * \brief
1840 * Row level function which down scales a given row by 2 in horz and
1841 * vertical direction creates output of size wd/2 * ht/2.
1842 *
 * @param[in] pu1_src : source pointer
1844 * @param[in] src_stride : source stride
 * @param[out] pu1_dst : destination pointer
1846 * @param[in] dst_stride : destination stride
1847 * @param[in] layer_wd : layer width
1848 * @param[in] layer_ht : layer height
1849 * @param[in] ht_offset : height offset of the block to be scaled
1850 * @param[in] block_ht : height of the block to be scaled
1851 * @param[in] wd_offset : width offset of the block to be scaled
1852 * @param[in] block_wd : width of the block to be scaled
1853 * @param[in] num_col_blks : number of col blks in that row
1854 *
1855 * \return None
1856 *
 * @NOTE : When decomposition is done from L1 to L2 pre intra analysis is
1858 * done on L1
1859 *
1860 *****************************************************************************
1861 */
void ihevce_decomp_pre_intra_process_row(
    UWORD8 *pu1_src,
    WORD32 src_stride,
    UWORD8 *pu1_dst_decomp,
    WORD32 dst_stride,
    WORD32 layer_wd,
    WORD32 layer_ht,
    UWORD8 *pu1_wkg_mem,
    WORD32 ht_offset,
    WORD32 block_ht,
    WORD32 block_wd,
    WORD32 i4_cu_aligned_pic_wd,
    WORD32 i4_cu_aligned_pic_ht,
    WORD32 num_col_blks,
    WORD32 layer_no,
    ihevce_ed_ctxt_t *ps_ed_ctxt,
    ihevce_ed_blk_t *ps_ed_row,
    ihevce_ed_ctb_l1_t *ps_ed_ctb_l1_row,
    ihevce_8x8_L0_satd_t *ps_layer0_cur_satd,
    ihevce_8x8_L0_mean_t *ps_layer0_cur_mean,
    WORD32 num_4x4_blks_ctb_y,
    WORD32 num_4x4_blks_last_ctb_x,
    WORD32 skip_decomp,
    WORD32 skip_pre_intra,
    WORD32 row_block_no,
    WORD32 i4_enable_noise_detection,
    ctb_analyse_t *ps_ctb_analyse,
    ihevce_ipe_optimised_function_list_t *ps_ipe_optimised_function_list,
    ihevce_cmn_opt_func_t *ps_cmn_utils_optimised_function_list)
{
    WORD32 col_block_no;

    //ihevce_ed_ctxt_t *ps_ed_ctxt = (ihevce_ed_ctxt_t *)pv_ed_ctxt;
    /* Source pointer advanced to the first pixel of this block row */
    UWORD8 *pu1_src_pre_intra = pu1_src + (ht_offset * src_stride);
    /* Number of 4x4 blocks across one CTB at this layer */
    WORD32 num_4x4_blks_in_ctb = block_wd >> 2;
    //WORD32 nbr_flags[64];
    WORD32 *nbr_flags_ptr = &ps_ed_ctxt->ai4_nbr_flags[0];
    /* Horizontal source step per CTB (4 pels per 4x4 block) */
    WORD32 src_inc_pre_intra = num_4x4_blks_in_ctb * 4;
    WORD32 inc_ctb = 0;
    ihevce_ed_blk_t *ps_ed_ctb = ps_ed_row;
    ihevce_ed_ctb_l1_t *ps_ed_ctb_l1 = ps_ed_ctb_l1_row;
    WORD32 i, j;
    WORD32 do_pre_intra_analysis;
    pf_ed_calc_ctb ed_calc_ctb;
    ctb_analyse_t *ps_ctb_analyse_curr;

    /* These inputs are currently unused in this path; silence warnings */
    (void)i4_cu_aligned_pic_wd;
    (void)i4_cu_aligned_pic_ht;
    (void)ps_layer0_cur_satd;
    (void)ps_layer0_cur_mean;
    (void)i4_enable_noise_detection;
    /*increment the struct pointer to point to the first CTB of the current row. */
    ps_ctb_analyse_curr = ps_ctb_analyse + row_block_no * num_col_blks;

    //if((num_4x4_blks_ctb_x == num_4x4_blks_ctb_y) && (num_4x4_blks_in_ctb == num_4x4_blks_ctb_x) )
    /* Full-height CTB rows use the complete-CTB routine; short last rows
     * (layer height not a CTB multiple) use the incomplete-CTB variant. */
    if(num_4x4_blks_in_ctb == num_4x4_blks_ctb_y)
    {
        ed_calc_ctb = ihevce_ed_calc_ctb;
    }
    else
    {
        ed_calc_ctb = ihevce_ed_calc_incomplete_ctb;
    }

    /* Early-decision records per CTB = 4x4 blocks per CTB */
    inc_ctb = num_4x4_blks_in_ctb * num_4x4_blks_in_ctb;

    /* Pre intra analysis runs only on layers L1 and L2 */
    do_pre_intra_analysis = ((layer_no == 1) || (layer_no == 2)) && (!skip_pre_intra);

    /*
     * For optimal pre intra analysis first block is processed outside
     * the loop.
     */
    if(!skip_decomp)
    {
        ihevce_scale_by_2(
            pu1_src,
            src_stride,
            pu1_dst_decomp,
            dst_stride,
            layer_wd,
            layer_ht,
            pu1_wkg_mem,
            ht_offset,
            block_ht,
            block_wd * 0,
            block_wd,
            ps_cmn_utils_optimised_function_list->pf_copy_2d,
            ps_ipe_optimised_function_list->pf_scaling_filter_mxn);
        /* Disable noise detection */
        ps_ctb_analyse_curr->s_ctb_noise_params.i4_noise_present = 0;

        memset(
            ps_ctb_analyse_curr->s_ctb_noise_params.au1_is_8x8Blk_noisy,
            0,
            sizeof(ps_ctb_analyse_curr->s_ctb_noise_params.au1_is_8x8Blk_noisy));
    }

    /*
     * Pre intra analysis for the first ctb.
     * To analyse any given CTB we need to set the availability flags of the
     * following neighbouring CTB: BL,L,TL,T,TR.
     */
    if(do_pre_intra_analysis)
    {
        /*
         * At the beginning of ctb row set left intra modes to default value.
         */
        for(j = 0; j < num_4x4_blks_ctb_y; j++)
        {
            ps_ed_ctxt->left_ctb_intra_modes[j] = INTRA_DC;
        }

        /*
         * Copy the neighbor flags for a general ctb (ctb inside the frame; not any corners).
         * The table gau4_nbr_flags_8x8_4x4blks generated for 16x16 4x4 blocks(ctb_size = 64).
         * But the same table holds good for other 4x4 blocks 2d arrays(eg 8x8 4x4 blks,4x4 4x4blks).
         * But the flags must be accessed with stride of 16 since the table has been generated for
         * ctb_size = 64. For odd 4x4 2d arrays(eg 3x3 4x4 blks) the flags needs modification.
         * The flags also need modification for corner ctbs.
         */
        memcpy(
            ps_ed_ctxt->ai4_nbr_flags,
            gau4_nbr_flags_8x8_4x4blks,
            sizeof(gau4_nbr_flags_8x8_4x4blks));

        /*
         * Since this is the fist ctb in the ctb row, set left flags unavailable for 1st CTB col
         */
        for(j = 0; j < num_4x4_blks_ctb_y; j++)
        {
            SET_L_UNAVAILABLE(ps_ed_ctxt->ai4_nbr_flags[j * 8]);
            SET_BL_UNAVAILABLE(ps_ed_ctxt->ai4_nbr_flags[j * 8]);
            SET_TL_UNAVAILABLE(ps_ed_ctxt->ai4_nbr_flags[j * 8]);
        }
        /*
         * If this is the fist ctb row, set top flags unavailable.
         */
        if(ht_offset == 0)
        {
            for(j = 0; j < num_4x4_blks_in_ctb; j++)
            {
                SET_T_UNAVAILABLE(ps_ed_ctxt->ai4_nbr_flags[j]);
                SET_TR_UNAVAILABLE(ps_ed_ctxt->ai4_nbr_flags[j]);
                SET_TL_UNAVAILABLE(ps_ed_ctxt->ai4_nbr_flags[j]);
            }
        }

        /* If this is last ctb row,set BL as not available. */
        if(ht_offset + block_ht >= layer_ht)
        {
            for(j = 0; j < num_4x4_blks_in_ctb; j++)
            {
                SET_BL_UNAVAILABLE(ps_ed_ctxt->ai4_nbr_flags[(num_4x4_blks_ctb_y - 1) * 8 + j]);
            }
        }
        col_block_no = 0;
        /* Call intra analysis for the ctb */
        ed_calc_ctb(
            ps_ed_ctxt,
            ps_ed_ctb,
            ps_ed_ctb_l1,
            pu1_src_pre_intra,
            src_stride,
            num_4x4_blks_in_ctb,
            num_4x4_blks_ctb_y,
            nbr_flags_ptr,
            layer_no,
            row_block_no,
            col_block_no,
            ps_ipe_optimised_function_list,
            ps_cmn_utils_optimised_function_list

        );

        /* Step the source and the per-CTB result pointers to the next CTB */
        pu1_src_pre_intra += src_inc_pre_intra;
        ps_ed_ctb += inc_ctb;
        ps_ed_ctb_l1 += 1;
        /*
         * For the rest of the ctbs, set left flags available.
         */
        for(j = 0; j < num_4x4_blks_ctb_y; j++)
        {
            SET_L_AVAILABLE(ps_ed_ctxt->ai4_nbr_flags[j * 8]);
        }
        for(j = 0; j < num_4x4_blks_ctb_y - 1; j++)
        {
            SET_BL_AVAILABLE(ps_ed_ctxt->ai4_nbr_flags[j * 8]);
            SET_TL_AVAILABLE(ps_ed_ctxt->ai4_nbr_flags[(j + 1) * 8]);
        }
        if(ht_offset != 0)
        {
            SET_TL_AVAILABLE(ps_ed_ctxt->ai4_nbr_flags[0]);
        }
    }

    /* The first ctb is processed before the loop.
     * The last one is processed after the loop.
     */
    for(col_block_no = 1; col_block_no < num_col_blks - 1; col_block_no++)
    {
        if(!skip_decomp)
        {
            ihevce_scale_by_2(
                pu1_src,
                src_stride,
                pu1_dst_decomp,
                dst_stride,
                layer_wd,
                layer_ht,
                pu1_wkg_mem,
                ht_offset,
                block_ht,
                block_wd * col_block_no,
                block_wd,
                ps_cmn_utils_optimised_function_list->pf_copy_2d,
                ps_ipe_optimised_function_list->pf_scaling_filter_mxn);
            /* Disable noise detection */
            memset(
                ps_ctb_analyse_curr->s_ctb_noise_params.au1_is_8x8Blk_noisy,
                0,
                sizeof(ps_ctb_analyse_curr->s_ctb_noise_params.au1_is_8x8Blk_noisy));

            ps_ctb_analyse_curr->s_ctb_noise_params.i4_noise_present = 0;
        }

        if(do_pre_intra_analysis)
        {
            ed_calc_ctb(
                ps_ed_ctxt,
                ps_ed_ctb,
                ps_ed_ctb_l1,
                pu1_src_pre_intra,
                src_stride,
                num_4x4_blks_in_ctb,
                num_4x4_blks_ctb_y,
                nbr_flags_ptr,
                layer_no,
                row_block_no,
                col_block_no,
                ps_ipe_optimised_function_list,
                ps_cmn_utils_optimised_function_list);
            pu1_src_pre_intra += src_inc_pre_intra;
            ps_ed_ctb += inc_ctb;
            ps_ed_ctb_l1 += 1;
        }
    }

    /* Last ctb in row */
    if((!skip_decomp) && (col_block_no == (num_col_blks - 1)))
    {
        ihevce_scale_by_2(
            pu1_src,
            src_stride,
            pu1_dst_decomp,
            dst_stride,
            layer_wd,
            layer_ht,
            pu1_wkg_mem,
            ht_offset,
            block_ht,
            block_wd * col_block_no,
            block_wd,
            ps_cmn_utils_optimised_function_list->pf_copy_2d,
            ps_ipe_optimised_function_list->pf_scaling_filter_mxn);
        {
            /* Disable noise detection */
            memset(
                ps_ctb_analyse_curr->s_ctb_noise_params.au1_is_8x8Blk_noisy,
                0,
                sizeof(ps_ctb_analyse_curr->s_ctb_noise_params.au1_is_8x8Blk_noisy));

            ps_ctb_analyse_curr->s_ctb_noise_params.i4_noise_present = 0;
        }
    }

    if(do_pre_intra_analysis && (col_block_no == (num_col_blks - 1)))
    {
        /*
         * The last ctb can be complete or incomplete. The complete
         * ctb is handled in the if and incomplete is handled in the
         * else case
         */
        //if(num_4x4_blks_last_ctb == num_4x4_blks_in_ctb)
        if((num_4x4_blks_last_ctb_x == num_4x4_blks_ctb_y) &&
           (num_4x4_blks_in_ctb == num_4x4_blks_last_ctb_x))
        {
            /* Last ctb so set top right not available */
            SET_TR_UNAVAILABLE(ps_ed_ctxt->ai4_nbr_flags[num_4x4_blks_in_ctb - 1]);

            ed_calc_ctb(
                ps_ed_ctxt,
                ps_ed_ctb,
                ps_ed_ctb_l1,
                pu1_src_pre_intra,
                src_stride,
                num_4x4_blks_in_ctb,
                num_4x4_blks_in_ctb,
                nbr_flags_ptr,
                layer_no,
                row_block_no,
                col_block_no,
                ps_ipe_optimised_function_list,
                ps_cmn_utils_optimised_function_list);
            pu1_src_pre_intra += src_inc_pre_intra;
            ps_ed_ctb += inc_ctb;
            ps_ed_ctb_l1 += 1;
        }
        else
        {
            /* Last ctb so set top right not available */
            for(i = 0; i < num_4x4_blks_ctb_y; i++)
            {
                SET_TR_UNAVAILABLE(ps_ed_ctxt->ai4_nbr_flags[i * 8 + num_4x4_blks_in_ctb - 1]);
            }

            ihevce_ed_calc_incomplete_ctb(
                ps_ed_ctxt,
                ps_ed_ctb,
                ps_ed_ctb_l1,
                pu1_src_pre_intra,
                src_stride,
                num_4x4_blks_last_ctb_x,
                num_4x4_blks_ctb_y,
                nbr_flags_ptr,
                layer_no,
                row_block_no,
                col_block_no,
                ps_ipe_optimised_function_list,
                ps_cmn_utils_optimised_function_list);
        }
    }
}
2194
2195 /*!
2196 ******************************************************************************
2197 * \if Function name : ihevce_decomp_pre_intra_process \endif
2198 *
2199 * \brief
2200 * Frame level function to decompose given layer L0 into coarser layers
2201 *
2202 * \param[in] pv_ctxt : pointer to master context of decomp_pre_intra module
2203 * \param[in] ps_inp : pointer to input yuv buffer (frame buffer)
2204 * \param[in] pv_multi_thrd_ctxt : pointer to multithread context
 * \param[in] thrd_id : thread id
2206 *
2207 * \return
2208 * None
2209 *
2210 * \author
2211 * Ittiam
2212 *
2213 *****************************************************************************
2214 */
void ihevce_decomp_pre_intra_process(
    void *pv_ctxt,
    ihevce_lap_output_params_t *ps_lap_out_prms,
    frm_ctb_ctxt_t *ps_frm_ctb_prms,
    void *pv_multi_thrd_ctxt,
    WORD32 thrd_id,
    WORD32 i4_ping_pong,
    ihevce_8x8_L0_satd_t *ps_layer0_cur_satd,
    ihevce_8x8_L0_mean_t *ps_layer0_cur_mean)
{
    WORD32 i4_layer_no;
    WORD32 i4_num_layers;
    WORD32 end_of_layer;
    UWORD8 *pu1_src, *pu1_dst;
    WORD32 src_stride, dst_stride;
    WORD32 i4_layer_wd, i4_layer_ht;
    WORD32 ht_offset, block_ht;
    WORD32 row_block_no, num_row_blocks;
    UWORD8 *pu1_wkg_mem;
    WORD32 block_wd;
    WORD32 num_col_blks;
    WORD32 skip_decomp, skip_pre_intra;
    WORD32 i4_cu_aligned_pic_wd, i4_cu_aligned_pic_ht;
    ihevce_decomp_pre_intra_master_ctxt_t *ps_master_ctxt =
        (ihevce_decomp_pre_intra_master_ctxt_t *)pv_ctxt;

    /* Per-thread context for this worker */
    ihevce_decomp_pre_intra_ctxt_t *ps_ctxt =
        ps_master_ctxt->aps_decomp_pre_intra_thrd_ctxt[thrd_id];
    multi_thrd_ctxt_t *ps_multi_thrd = (multi_thrd_ctxt_t *)pv_multi_thrd_ctxt;

    ihevce_ed_ctxt_t *ps_ed_ctxt;
    ihevce_ed_blk_t *ps_ed;
    ihevce_ed_ctb_l1_t *ps_ed_ctb_l1;
    WORD32 inc_ctb = 0;
    WORD32 num_4x4_blks_lyr;

    i4_num_layers = ps_ctxt->i4_num_layers;

    ASSERT(i4_num_layers >= 3);

    /*
     * Always force minimum layers as 4 so that we would have both l1 and l2
     * pre intra analysis
     */
    if(i4_num_layers == 3)
    {
        i4_num_layers = 4;
    }

    /* Layer 0 is the input frame itself */
    ps_ctxt->as_layers[0].pu1_inp = (UWORD8 *)ps_lap_out_prms->s_input_buf.pv_y_buf;
    ps_ctxt->as_layers[0].i4_inp_stride = ps_lap_out_prms->s_input_buf.i4_y_strd;
    ps_ctxt->as_layers[0].i4_actual_wd = ps_lap_out_prms->s_input_buf.i4_y_wd;
    ps_ctxt->as_layers[0].i4_actual_ht = ps_lap_out_prms->s_input_buf.i4_y_ht;

    /* ------------ Loop over all the layers --------------- */
    /* This loop does only decomp for all layers by picking jobs from job queue */
    /* Decomp for all layers will completed with this for loop */
    for(i4_layer_no = 0; i4_layer_no < (i4_num_layers - 1); i4_layer_no++)
    {
        WORD32 idx = 0;
        src_stride = ps_ctxt->as_layers[i4_layer_no].i4_inp_stride;
        pu1_src = ps_ctxt->as_layers[i4_layer_no].pu1_inp;
        i4_layer_wd = ps_ctxt->as_layers[i4_layer_no].i4_actual_wd;
        i4_layer_ht = ps_ctxt->as_layers[i4_layer_no].i4_actual_ht;
        pu1_dst = ps_ctxt->as_layers[i4_layer_no + 1].pu1_inp;
        dst_stride = ps_ctxt->as_layers[i4_layer_no + 1].i4_inp_stride;
        block_wd = ps_ctxt->as_layers[i4_layer_no].i4_decomp_blk_wd;
        block_ht = ps_ctxt->as_layers[i4_layer_no].i4_decomp_blk_ht;
        num_col_blks = ps_ctxt->as_layers[i4_layer_no].i4_num_col_blks;
        num_row_blocks = ps_ctxt->as_layers[i4_layer_no].i4_num_row_blks;
        i4_cu_aligned_pic_wd = ps_frm_ctb_prms->i4_cu_aligned_pic_wd;
        i4_cu_aligned_pic_ht = ps_frm_ctb_prms->i4_cu_aligned_pic_ht;

        /* register ed_ctxt buffer pointer */
        //pv_ed_ctxt = &ps_ctxt->as_layers[i4_layer_no].s_early_decision;
        //ps_ed_ctxt = (ihevce_ed_ctxt_t *)pv_ed_ctxt;
        //ps_ed = ps_ed_ctxt->ps_ed;

        //pv_ed_ctxt = &ps_ctxt->ps_ed_ctxt;
        ps_ed_ctxt = ps_ctxt->ps_ed_ctxt;

        /* initialize ed_ctxt here */
        /* init is moved here since now allocation is happening for only one instance
        is allocated. for each layer it is re-used */
        ps_ed_ctxt->lambda = ps_ctxt->ai4_lambda[i4_layer_no];
        ps_ed_ctxt->i4_slice_type = ps_ctxt->i4_slice_type;
        ps_ed_ctxt->level = ps_ctxt->i4_codec_level;
        /* Per-layer early-decision output buffers: L1 and L2 each have their
         * own buffers; L0 produces no early-decision output */
        if(0 == i4_layer_no)
        {
            ps_ed_ctxt->ps_ed_pic = NULL;
            ps_ed_ctxt->ps_ed = NULL;
            ps_ed_ctxt->ps_ed_ctb_l1_pic = NULL;
            ps_ed_ctxt->ps_ed_ctb_l1 = NULL;
        }
        else if(1 == i4_layer_no)
        {
            ps_ed_ctxt->ps_ed_pic = ps_ctxt->ps_layer1_buf;
            ps_ed_ctxt->ps_ed = ps_ctxt->ps_layer1_buf;
            ps_ed_ctxt->ps_ed_ctb_l1_pic = ps_ctxt->ps_ed_ctb_l1;
            ps_ed_ctxt->ps_ed_ctb_l1 = ps_ctxt->ps_ed_ctb_l1;
            ps_ctxt->ps_layer0_cur_satd = NULL;
            ps_ctxt->ps_layer0_cur_mean = NULL;
        }
        else if(2 == i4_layer_no)
        {
            ps_ed_ctxt->ps_ed_pic = ps_ctxt->ps_layer2_buf;
            ps_ed_ctxt->ps_ed = ps_ctxt->ps_layer2_buf;
            ps_ed_ctxt->ps_ed_ctb_l1_pic = NULL;
            ps_ed_ctxt->ps_ed_ctb_l1 = NULL;
            ps_ctxt->ps_layer0_cur_satd = NULL;
            ps_ctxt->ps_layer0_cur_mean = NULL;
        }

        /*Calculate the number of 4x4 blocks in a CTB in that layer*/
        /*Divide block_wd by 4. 4 to get no of 4x4 blks*/
        num_4x4_blks_lyr = block_wd >> 2;
        inc_ctb = num_4x4_blks_lyr * num_4x4_blks_lyr;

        ps_ed = ps_ed_ctxt->ps_ed;
        ps_ed_ctb_l1 = ps_ed_ctxt->ps_ed_ctb_l1;

        end_of_layer = 0;
        skip_decomp = 0;
        /* Pre intra analysis is deferred to the second pass below */
        skip_pre_intra = 1;
        //if( i4_layer_no >= ps_ctxt->i4_num_layers)
        if(i4_layer_no >= (ps_ctxt->i4_num_layers - 1))
        {
            skip_decomp = 1;
        }
        /* ------------ Loop over all the CTB rows --------------- */
        while(0 == end_of_layer)
        {
            job_queue_t *ps_pre_enc_job;
            WORD32 num_4x4_blks_ctb_y = 0;
            WORD32 num_4x4_blks_last_ctb_x = 0;

            /* Get the current row from the job queue */
            ps_pre_enc_job = (job_queue_t *)ihevce_pre_enc_grp_get_next_job(
                pv_multi_thrd_ctxt, (DECOMP_JOB_LYR0 + i4_layer_no), 1, i4_ping_pong);

            pu1_wkg_mem = ps_ctxt->pu1_wkg_mem;

            /* If all rows are done, set the end of layer flag to 1, */
            if(NULL == ps_pre_enc_job)
            {
                end_of_layer = 1;
            }
            else
            {
                /* Obtain the current row's details from the job */
                row_block_no = ps_pre_enc_job->s_job_info.s_decomp_job_info.i4_vert_unit_row_no;
                /* Remember which rows this thread processed for the pre-intra pass */
                ps_ctxt->as_layers[i4_layer_no].ai4_curr_row_no[idx] = row_block_no;
                ht_offset = row_block_no * block_ht;

                if(row_block_no < (num_row_blocks))
                {
                    pu1_dst = ps_ctxt->as_layers[i4_layer_no + 1].pu1_inp +
                              ((block_ht >> 1) * dst_stride * row_block_no);

                    /*L0 8x8 curr satd for qp mod*/
                    if(i4_layer_no == 0)
                    {
                        ps_ctxt->ps_layer0_cur_satd =
                            ps_layer0_cur_satd + (row_block_no * num_col_blks /*num ctbs*/ *
                                                  (block_wd >> 3) * (block_ht >> 3));
                        ps_ctxt->ps_layer0_cur_mean =
                            ps_layer0_cur_mean + (row_block_no * num_col_blks /*num ctbs*/ *
                                                  (block_wd >> 3) * (block_ht >> 3));
                    }

                    /* call the row level processing function */
                    ihevce_decomp_pre_intra_process_row(
                        pu1_src,
                        src_stride,
                        pu1_dst,
                        dst_stride,
                        i4_layer_wd,
                        i4_layer_ht,
                        pu1_wkg_mem,
                        ht_offset,
                        block_ht,
                        block_wd,
                        i4_cu_aligned_pic_wd,
                        i4_cu_aligned_pic_ht,
                        num_col_blks,
                        i4_layer_no,
                        ps_ed_ctxt,
                        ps_ed,
                        ps_ed_ctb_l1,
                        ps_ctxt->ps_layer0_cur_satd,
                        ps_ctxt->ps_layer0_cur_mean,
                        num_4x4_blks_ctb_y,
                        num_4x4_blks_last_ctb_x,
                        skip_decomp,
                        skip_pre_intra,
                        row_block_no,
                        ps_ctxt->i4_enable_noise_detection,
                        ps_ctxt->ps_ctb_analyse,
                        &ps_ctxt->s_ipe_optimised_function_list,
                        &ps_ctxt->s_cmn_opt_func);

                    /*When decomposition is done from L1 to L2
                    pre intra analysis is done on L1*/
                    if(i4_layer_no == 1 || i4_layer_no == 2)
                    {
                        // ps_ed = ps_ed_ctxt->ps_ed +
                        //    (row_block_no * inc_ctb * (num_col_blks));
                    }
                }
                idx++;
                /* set the output dependency */
                ihevce_pre_enc_grp_job_set_out_dep(
                    pv_multi_thrd_ctxt, ps_pre_enc_job, i4_ping_pong);
            }
        }
        ps_ctxt->as_layers[i4_layer_no].i4_num_rows_processed = idx;

        ihevce_ed_frame_init(ps_ed_ctxt, i4_layer_no);

        /* For the fastest preset on non-reference pictures, reset L1
         * per-8x8 SAD results so IPE treats them as unanalysed */
        if((1 == i4_layer_no) && (IHEVCE_QUALITY_P6 == ps_ctxt->i4_quality_preset))
        {
            WORD32 vert_ctr, ctb_ctr, i;
            WORD32 ctb_ctr_blks = ps_ctxt->as_layers[1].i4_num_col_blks;
            WORD32 vert_ctr_blks = ps_ctxt->as_layers[1].i4_num_row_blks;

            if((ps_ctxt->i4_quality_preset == IHEVCE_QUALITY_P6) &&
               (ps_lap_out_prms->i4_temporal_lyr_id > TEMPORAL_LAYER_DISABLE))
            {
                for(vert_ctr = 0; vert_ctr < vert_ctr_blks; vert_ctr++)
                {
                    ihevce_ed_ctb_l1_t *ps_ed_ctb_row_l1 =
                        ps_ctxt->ps_ed_ctb_l1 + vert_ctr * ps_frm_ctb_prms->i4_num_ctbs_horz;

                    for(ctb_ctr = 0; ctb_ctr < ctb_ctr_blks; ctb_ctr++)
                    {
                        ihevce_ed_ctb_l1_t *ps_ed_ctb_curr_l1 = ps_ed_ctb_row_l1 + ctb_ctr;
                        for(i = 0; i < 16; i++)
                        {
                            ps_ed_ctb_curr_l1->i4_best_sad_cost_8x8_l1_ipe[i] = 0x7fffffff;
                            ps_ed_ctb_curr_l1->i4_best_sad_8x8_l1_ipe[i] = 0x7fffffff;
                        }
                    }
                }
            }
        }

        /* Second pass: run the deferred pre intra analysis on the rows this
         * thread decomposed, unless the preset/picture type disables it */
#if DISABLE_L2_IPE_IN_PB_L1_IN_B
        if(((2 == i4_layer_no) && (ps_lap_out_prms->i4_pic_type == IV_I_FRAME ||
                                   ps_lap_out_prms->i4_pic_type == IV_IDR_FRAME)) ||
           ((1 == i4_layer_no) &&
            (ps_lap_out_prms->i4_temporal_lyr_id <= TEMPORAL_LAYER_DISABLE)) ||
           ((IHEVCE_QUALITY_P6 != ps_ctxt->i4_quality_preset) && (0 != i4_layer_no)))
#else
        if((0 != i4_layer_no) &&
           (1 != ((IHEVCE_QUALITY_P6 == ps_ctxt->i4_quality_preset) &&
                  (ps_lap_out_prms->i4_temporal_lyr_id > TEMPORAL_LAYER_DISABLE))))
#endif
        {
            WORD32 i4_num_rows = ps_ctxt->as_layers[i4_layer_no].i4_num_rows_processed;

            src_stride = ps_ctxt->as_layers[i4_layer_no].i4_inp_stride;
            pu1_src = ps_ctxt->as_layers[i4_layer_no].pu1_inp;
            i4_layer_wd = ps_ctxt->as_layers[i4_layer_no].i4_actual_wd;
            i4_layer_ht = ps_ctxt->as_layers[i4_layer_no].i4_actual_ht;
            pu1_dst = ps_ctxt->as_layers[i4_layer_no + 1].pu1_inp;
            dst_stride = ps_ctxt->as_layers[i4_layer_no + 1].i4_inp_stride;
            block_wd = ps_ctxt->as_layers[i4_layer_no].i4_decomp_blk_wd;
            block_ht = ps_ctxt->as_layers[i4_layer_no].i4_decomp_blk_ht;
            num_col_blks = ps_ctxt->as_layers[i4_layer_no].i4_num_col_blks;
            num_row_blocks = ps_ctxt->as_layers[i4_layer_no].i4_num_row_blks;
            i4_cu_aligned_pic_wd = ps_frm_ctb_prms->i4_cu_aligned_pic_wd;
            i4_cu_aligned_pic_ht = ps_frm_ctb_prms->i4_cu_aligned_pic_ht;

            /* register ed_ctxt buffer pointer */
            ps_ed_ctxt = ps_ctxt->ps_ed_ctxt;

            /* initialize ed_ctxt here */
            /* init is moved here since now allocation is happening for only one instance
            is allocated. for each layer it is re-used */
            ps_ed_ctxt->lambda = ps_ctxt->ai4_lambda[i4_layer_no];
            ps_ed_ctxt->i4_slice_type = ps_ctxt->i4_slice_type;
            ps_ed_ctxt->level = ps_ctxt->i4_codec_level;
            if(1 == i4_layer_no)
            {
                ps_ed_ctxt->ps_ed_pic = ps_ctxt->ps_layer1_buf;
                ps_ed_ctxt->ps_ed = ps_ctxt->ps_layer1_buf;
                ps_ed_ctxt->ps_ed_ctb_l1_pic = ps_ctxt->ps_ed_ctb_l1;
                ps_ed_ctxt->ps_ed_ctb_l1 = ps_ctxt->ps_ed_ctb_l1;
                ps_ctxt->ps_layer0_cur_satd = NULL;
                ps_ctxt->ps_layer0_cur_mean = NULL;
            }
            else if(2 == i4_layer_no)
            {
                ps_ed_ctxt->ps_ed_pic = ps_ctxt->ps_layer2_buf;
                ps_ed_ctxt->ps_ed = ps_ctxt->ps_layer2_buf;
                ps_ed_ctxt->ps_ed_ctb_l1_pic = NULL;
                ps_ed_ctxt->ps_ed_ctb_l1 = NULL;
                ps_ctxt->ps_layer0_cur_satd = NULL;
                ps_ctxt->ps_layer0_cur_mean = NULL;
            }

            /*Calculate the number of 4x4 blocks in a CTB in that layer*/
            /*Divide block_wd by 4. 4 to get no of 4x4 blks*/
            num_4x4_blks_lyr = block_wd >> 2;
            inc_ctb = num_4x4_blks_lyr * num_4x4_blks_lyr;

            ps_ed = ps_ed_ctxt->ps_ed;
            ps_ed_ctb_l1 = ps_ed_ctxt->ps_ed_ctb_l1;
            /* Decomp already done in the first pass; run only pre intra now */
            skip_decomp = 1;
            skip_pre_intra = 0;
            for(idx = 0; idx < i4_num_rows; idx++)
            {
                WORD32 num_4x4_blks_ctb_y = 0;
                WORD32 num_4x4_blks_last_ctb_x = 0;

                pu1_wkg_mem = ps_ctxt->pu1_wkg_mem;

                {
                    /* Obtain the current row's details from the job */
                    row_block_no = ps_ctxt->as_layers[i4_layer_no].ai4_curr_row_no[idx];
                    ht_offset = row_block_no * block_ht;

                    if(row_block_no < (num_row_blocks))
                    {
                        pu1_dst = ps_ctxt->as_layers[i4_layer_no + 1].pu1_inp +
                                  ((block_ht >> 1) * dst_stride * row_block_no);

                        if(i4_layer_no == 1 || i4_layer_no == 2)
                        {
                            ps_ed = ps_ed_ctxt->ps_ed + (row_block_no * inc_ctb * (num_col_blks));
                            ps_ed_ctb_l1 = ps_ed_ctxt->ps_ed_ctb_l1 + (row_block_no * num_col_blks);

                            ps_ed_ctxt->i4_quality_preset = ps_ctxt->i4_quality_preset;
                            num_4x4_blks_ctb_y = block_ht >> 2;
                            num_4x4_blks_last_ctb_x = block_wd >> 2;

                            /* Shrink the 4x4 grid for a short last row /
                             * narrow last column (layer not CTB aligned) */
                            if(row_block_no == num_row_blocks - 1)
                            {
                                if(i4_layer_ht % block_ht)
                                {
                                    num_4x4_blks_ctb_y = ((i4_layer_ht % block_ht) + 3) >> 2;
                                }
                            }

                            if(i4_layer_wd % block_wd)
                            {
                                num_4x4_blks_last_ctb_x = ((i4_layer_wd % block_wd) + 3) >> 2;
                            }
                        }

                        /* call the row level processing function */
                        ihevce_decomp_pre_intra_process_row(
                            pu1_src,
                            src_stride,
                            pu1_dst,
                            dst_stride,
                            i4_layer_wd,
                            i4_layer_ht,
                            pu1_wkg_mem,
                            ht_offset,
                            block_ht,
                            block_wd,
                            i4_cu_aligned_pic_wd,
                            i4_cu_aligned_pic_ht,
                            num_col_blks,
                            i4_layer_no,
                            ps_ed_ctxt,
                            ps_ed,
                            ps_ed_ctb_l1,
                            ps_ctxt->ps_layer0_cur_satd,
                            ps_ctxt->ps_layer0_cur_mean,
                            num_4x4_blks_ctb_y,
                            num_4x4_blks_last_ctb_x,
                            skip_decomp,
                            skip_pre_intra,
                            row_block_no,
                            0,
                            NULL,
                            &ps_ctxt->s_ipe_optimised_function_list,
                            &ps_ctxt->s_cmn_opt_func);
                    }
                }
                if(1 == i4_layer_no)
                {
                    /* Signal row completion so dependent IPE threads can proceed */
                    ps_multi_thrd->aai4_l1_pre_intra_done[i4_ping_pong][row_block_no] = 1;
                }
            }
            /* Reset the per-thread row bookkeeping for the next frame */
            for(idx = 0; idx < MAX_NUM_CTB_ROWS_FRM; idx++)
            {
                ps_ctxt->as_layers[i4_layer_no].ai4_curr_row_no[idx] = -1;
            }
            ps_ctxt->as_layers[i4_layer_no].i4_num_rows_processed = 0;
        }

        /* When pre intra analysis was skipped for this layer, still mark the
         * L1 rows done and reset bookkeeping so waiters do not stall */
#if DISABLE_L2_IPE_IN_PB_L1_IN_B
        if((IHEVCE_QUALITY_P6 == ps_ctxt->i4_quality_preset) &&
           (((i4_layer_no == 2) && (ps_lap_out_prms->i4_pic_type == ISLICE)) ||
            ((i4_layer_no == 1) && (ps_lap_out_prms->i4_temporal_lyr_id > TEMPORAL_LAYER_DISABLE))))
        {
            WORD32 i4_num_rows = ps_ctxt->as_layers[i4_layer_no].i4_num_rows_processed;
            if(1 == i4_layer_no)
            {
                for(idx = 0; idx < i4_num_rows; idx++)
                {
                    row_block_no = ps_ctxt->as_layers[i4_layer_no].ai4_curr_row_no[idx];

                    {
                        ps_multi_thrd->aai4_l1_pre_intra_done[i4_ping_pong][row_block_no] = 1;
                    }
                }
            }
            for(idx = 0; idx < MAX_NUM_CTB_ROWS_FRM; idx++)
            {
                ps_ctxt->as_layers[i4_layer_no].ai4_curr_row_no[idx] = -1;
            }
            ps_ctxt->as_layers[i4_layer_no].i4_num_rows_processed = 0;
        }
#else
        if((i4_layer_no != 0) && ((IHEVCE_QUALITY_P6 == ps_ctxt->i4_quality_preset) &&
                                  (ps_lap_out_prms->i4_temporal_lyr_id > TEMPORAL_LAYER_DISABLE)))
        {
            WORD32 i4_num_rows = ps_ctxt->as_layers[i4_layer_no].i4_num_rows_processed;
            for(idx = 0; idx < i4_num_rows; idx++)
            {
                row_block_no = ps_ctxt->as_layers[i4_layer_no].ai4_curr_row_no[idx];
                if(1 == i4_layer_no)
                {
                    ps_multi_thrd->aai4_l1_pre_intra_done[i4_ping_pong][row_block_no] = 1;
                }
            }
            for(idx = 0; idx < MAX_NUM_CTB_ROWS_FRM; idx++)
            {
                ps_ctxt->as_layers[i4_layer_no].ai4_curr_row_no[idx] = -1;
            }
            ps_ctxt->as_layers[i4_layer_no].i4_num_rows_processed = 0;
        }
#endif
    }
}
2654
2655 /*!
2656 ************************************************************************
2657 * \brief
2658 * return number of records used by decomp pre intra
2659 *
2660 ************************************************************************
2661 */
ihevce_decomp_pre_intra_get_num_mem_recs(void)2662 WORD32 ihevce_decomp_pre_intra_get_num_mem_recs(void)
2663 {
2664 return (NUM_DECOMP_PRE_INTRA_MEM_RECS);
2665 }
2666
2667 /*!
2668 ************************************************************************
2669 * @brief
2670 * return each record attributes of decomp pre intra
2671 ************************************************************************
2672 */
WORD32 ihevce_decomp_pre_intra_get_mem_recs(
    iv_mem_rec_t *ps_mem_tab, WORD32 i4_num_proc_thrds, WORD32 i4_mem_space)
{
    /* Fill one record per module allocation; sizes assume worst case */
    iv_mem_rec_t *ps_rec;

    /* Master (module) context structure */
    ps_rec = &ps_mem_tab[DECOMP_PRE_INTRA_CTXT];
    ps_rec->i4_mem_size = sizeof(ihevce_decomp_pre_intra_master_ctxt_t);
    ps_rec->e_mem_type = (IV_MEM_TYPE_T)i4_mem_space;
    ps_rec->i4_mem_alignment = 8;

    /* One thread context per processing thread */
    ps_rec = &ps_mem_tab[DECOMP_PRE_INTRA_THRDS_CTXT];
    ps_rec->i4_mem_size = i4_num_proc_thrds * sizeof(ihevce_decomp_pre_intra_ctxt_t);
    ps_rec->e_mem_type = (IV_MEM_TYPE_T)i4_mem_space;
    ps_rec->i4_mem_alignment = 8;

    /* One early-decision context per processing thread */
    ps_rec = &ps_mem_tab[DECOMP_PRE_INTRA_ED_CTXT];
    ps_rec->i4_mem_size = i4_num_proc_thrds * sizeof(ihevce_ed_ctxt_t);
    ps_rec->e_mem_type = (IV_MEM_TYPE_T)i4_mem_space;
    ps_rec->i4_mem_alignment = 8;

    return NUM_DECOMP_PRE_INTRA_MEM_RECS;
}
2696
2697 /*!
2698 ************************************************************************
2699 * @brief
2700 * Init decomp pre intra context
2701 ************************************************************************
2702 */
void *ihevce_decomp_pre_intra_init(
    iv_mem_rec_t *ps_mem_tab,
    ihevce_static_cfg_params_t *ps_init_prms,
    WORD32 i4_num_proc_thrds,
    func_selector_t *ps_func_selector,
    WORD32 i4_resolution_id,
    UWORD8 u1_is_popcnt_available)
{
    ihevce_decomp_pre_intra_master_ctxt_t *ps_master_ctxt;
    ihevce_decomp_pre_intra_ctxt_t *ps_ctxt;
    WORD32 thread_no;
    WORD32 n_tot_layers;
    WORD32 count;
    WORD32 a_wd[MAX_NUM_HME_LAYERS], a_ht[MAX_NUM_HME_LAYERS], layer_no;
    WORD32 a_disp_wd[MAX_NUM_LAYERS], a_disp_ht[MAX_NUM_LAYERS];
    ihevce_ed_ctxt_t *ps_ed_ctxt;
    WORD32 min_cu_size;

    /* get the min cu size from config params */
    min_cu_size = ps_init_prms->s_config_prms.i4_min_log2_cu_size;

    min_cu_size = 1 << min_cu_size;

    /* Get the height and width of each layer */
    /* Layer 0 dimensions = target dimensions aligned up to min CU size */
    *a_wd = ps_init_prms->s_tgt_lyr_prms.as_tgt_params[i4_resolution_id].i4_width +
            SET_CTB_ALIGN(
                ps_init_prms->s_tgt_lyr_prms.as_tgt_params[i4_resolution_id].i4_width, min_cu_size);
    *a_ht =
        ps_init_prms->s_tgt_lyr_prms.as_tgt_params[i4_resolution_id].i4_height +
        SET_CTB_ALIGN(
            ps_init_prms->s_tgt_lyr_prms.as_tgt_params[i4_resolution_id].i4_height, min_cu_size);

    /* Derive the coarser layer dimensions from layer 0 */
    n_tot_layers = hme_derive_num_layers(1, a_wd, a_ht, a_disp_wd, a_disp_ht);

    /* Decomp state structure */
    ps_master_ctxt =
        (ihevce_decomp_pre_intra_master_ctxt_t *)ps_mem_tab[DECOMP_PRE_INTRA_CTXT].pv_base;
    ps_master_ctxt->i4_num_proc_thrds = i4_num_proc_thrds;

    ps_ctxt = (ihevce_decomp_pre_intra_ctxt_t *)ps_mem_tab[DECOMP_PRE_INTRA_THRDS_CTXT].pv_base;
    ps_ed_ctxt = (ihevce_ed_ctxt_t *)ps_mem_tab[DECOMP_PRE_INTRA_ED_CTXT].pv_base;

    /* Initialise one context (and one early-decision context) per thread */
    for(thread_no = 0; thread_no < ps_master_ctxt->i4_num_proc_thrds; thread_no++)
    {
        ps_master_ctxt->aps_decomp_pre_intra_thrd_ctxt[thread_no] = ps_ctxt;

        ps_master_ctxt->aps_decomp_pre_intra_thrd_ctxt[thread_no]->i4_num_layers = n_tot_layers;

        ps_master_ctxt->aps_decomp_pre_intra_thrd_ctxt[thread_no]->pu1_wkg_mem =
            &ps_master_ctxt->aps_decomp_pre_intra_thrd_ctxt[thread_no]->au1_wkg_mem[0];

        ps_master_ctxt->aps_decomp_pre_intra_thrd_ctxt[thread_no]->ps_ed_ctxt = ps_ed_ctxt;

        for(layer_no = 0; layer_no < n_tot_layers; layer_no++)
        {
            WORD32 max_ctb_size;
            WORD32 decomp_blk_ht, decomp_blk_wd;

            ps_ctxt->as_layers[layer_no].i4_actual_wd = a_wd[layer_no];
            ps_ctxt->as_layers[layer_no].i4_actual_ht = a_ht[layer_no];
            ps_ctxt->as_layers[layer_no].i4_inp_stride = 0;
            ps_ctxt->as_layers[layer_no].pu1_inp = NULL;
            ps_ctxt->as_layers[layer_no].i4_num_rows_processed = 0;

            /* -1 marks "no row recorded" in the per-row bookkeeping */
            for(count = 0; count < MAX_NUM_CTB_ROWS_FRM; count++)
            {
                ps_ctxt->as_layers[layer_no].ai4_curr_row_no[count] = -1;
            }
            if(0 == layer_no)
            {
                ps_ctxt->as_layers[layer_no].i4_padded_ht = a_ht[layer_no];
                ps_ctxt->as_layers[layer_no].i4_padded_wd = a_wd[layer_no];
            }
            else
            {
                /* Coarser layers carry extra padding for filtering/search */
                ps_ctxt->as_layers[layer_no].i4_padded_ht = a_ht[layer_no] + 32 + 4;
                ps_ctxt->as_layers[layer_no].i4_padded_wd = a_wd[layer_no] + 32 + 4;
            }

            /** If CTB size= 64.decomp_blk_wd = 64 for L0, 32 for L1 , 16 for L2, 8 for L3 */
            max_ctb_size = 1 << ps_init_prms->s_config_prms.i4_max_log2_cu_size;

            ps_ctxt->as_layers[layer_no].i4_decomp_blk_ht = max_ctb_size >> layer_no;
            ps_ctxt->as_layers[layer_no].i4_decomp_blk_wd = max_ctb_size >> layer_no;

            decomp_blk_ht = ps_ctxt->as_layers[layer_no].i4_decomp_blk_ht;
            decomp_blk_wd = ps_ctxt->as_layers[layer_no].i4_decomp_blk_wd;

            /* Ceiling division: count of block rows / cols covering the layer */
            ps_ctxt->as_layers[layer_no].i4_num_row_blks =
                ((a_ht[layer_no] + (decomp_blk_ht - 1)) / decomp_blk_ht);

            ps_ctxt->as_layers[layer_no].i4_num_col_blks =
                ((a_wd[layer_no] + (decomp_blk_wd - 1)) / decomp_blk_wd);
        }
        ps_ed_ctxt->ps_func_selector = ps_func_selector;

        ps_ctxt->i4_quality_preset =
            ps_init_prms->s_tgt_lyr_prms.as_tgt_params[i4_resolution_id].i4_quality_preset;

        /* P7 is treated the same as P6 inside this module */
        if(ps_ctxt->i4_quality_preset == IHEVCE_QUALITY_P7)
        {
            ps_ctxt->i4_quality_preset = IHEVCE_QUALITY_P6;
        }

        /* Noise detection is enabled only when both the VQ control toggle
         * and the noise-preservation bit are set */
        if(ps_init_prms->s_coding_tools_prms.i4_vqet &
           (1 << BITPOS_IN_VQ_TOGGLE_FOR_CONTROL_TOGGLER))
        {
            if(ps_init_prms->s_coding_tools_prms.i4_vqet &
               (1 << BITPOS_IN_VQ_TOGGLE_FOR_ENABLING_NOISE_PRESERVATION))
            {
                ps_ctxt->i4_enable_noise_detection = 1;
            }
            else
            {
                ps_ctxt->i4_enable_noise_detection = 0;
            }
        }
        else
        {
            ps_ctxt->i4_enable_noise_detection = 0;
        }

        /* Bind arch-specific optimised function pointers */
        ihevce_cmn_utils_instr_set_router(
            &ps_ctxt->s_cmn_opt_func, u1_is_popcnt_available, ps_init_prms->e_arch_type);

        ihevce_ipe_instr_set_router(
            &ps_ctxt->s_ipe_optimised_function_list, ps_init_prms->e_arch_type);

        ps_ctxt++;
        ps_ed_ctxt++;
    }
    /* return the handle to caller */
    return ((void *)ps_master_ctxt);
}
2837
2838 /*!
2839 ******************************************************************************
2840 * \if Function name : ihevce_decomp_pre_intra_frame_init \endif
2841 *
2842 * \brief
*   Frame Initialization for decomposition and intra pre-analysis.
2844 *
2845 * \param[in] pv_ctxt : pointer to module ctxt
2846 * \param[in] ppu1_decomp_lyr_bufs : pointer to array of layer buffer pointers
2847 * \param[in] pi4_lyr_buf_stride : pointer to array of layer buffer strides
2848 *
2849 * \return
2850 * None
2851 *
2852 * \author
2853 * Ittiam
2854 *
2855 *****************************************************************************
2856 */
void ihevce_decomp_pre_intra_frame_init(
    void *pv_ctxt,
    UWORD8 **ppu1_decomp_lyr_bufs,
    WORD32 *pi4_lyr_buf_stride,
    ihevce_ed_blk_t *ps_layer1_buf,
    ihevce_ed_blk_t *ps_layer2_buf,
    ihevce_ed_ctb_l1_t *ps_ed_ctb_l1,
    WORD32 i4_ol_sad_lambda_qf,
    WORD32 i4_slice_type,
    ctb_analyse_t *ps_ctb_analyse)
{
    ihevce_decomp_pre_intra_master_ctxt_t *ps_master =
        (ihevce_decomp_pre_intra_master_ctxt_t *)pv_ctxt;
    WORD32 i4_thrd;

    /* Hook up the frame-level buffers in every thread's private context */
    for(i4_thrd = 0; i4_thrd < ps_master->i4_num_proc_thrds; i4_thrd++)
    {
        ihevce_decomp_pre_intra_ctxt_t *ps_thrd_ctxt =
            ps_master->aps_decomp_pre_intra_thrd_ctxt[i4_thrd];
        WORD32 i4_lyr;

        /* L0 layer (actual input) is registered in the process call;
         * only the decomposed layers are registered here */
        for(i4_lyr = 1; i4_lyr < ps_thrd_ctxt->i4_num_layers; i4_lyr++)
        {
            ps_thrd_ctxt->as_layers[i4_lyr].i4_inp_stride = pi4_lyr_buf_stride[i4_lyr - 1];
            ps_thrd_ctxt->as_layers[i4_lyr].pu1_inp = ppu1_decomp_lyr_bufs[i4_lyr - 1];

            /* Register the per-4x4-block early-decision result buffers and derive
             * the SAD lambda used by pre-intra analysis for layers 1 and 2 */
            switch(i4_lyr)
            {
            case 1:
            {
                /* lambda_L1 = max(1 << LAMBDA_Q_SHIFT, 3/4 of open-loop SAD lambda) */
                WORD32 i4_lambda_l1 = (3 * i4_ol_sad_lambda_qf >> 2);

                if(i4_lambda_l1 < (1 << LAMBDA_Q_SHIFT))
                {
                    i4_lambda_l1 = (1 << LAMBDA_Q_SHIFT);
                }
                ps_thrd_ctxt->ps_layer1_buf = ps_layer1_buf;
                ps_thrd_ctxt->ps_ed_ctb_l1 = ps_ed_ctb_l1;
                ps_thrd_ctxt->ai4_lambda[i4_lyr] = i4_lambda_l1;
                break;
            }
            case 2:
            {
                /* lambda_L2 = max(1 << LAMBDA_Q_SHIFT, 1/2 of open-loop SAD lambda) */
                WORD32 i4_lambda_l2 = i4_ol_sad_lambda_qf >> 1;

                if(i4_lambda_l2 < (1 << LAMBDA_Q_SHIFT))
                {
                    i4_lambda_l2 = (1 << LAMBDA_Q_SHIFT);
                }
                ps_thrd_ctxt->ps_layer2_buf = ps_layer2_buf;
                ps_thrd_ctxt->ai4_lambda[i4_lyr] = i4_lambda_l2;
                break;
            }
            default:
            {
                /* no early-decision analysis beyond layer 2 */
                ps_thrd_ctxt->ai4_lambda[i4_lyr] = -1;
                break;
            }
            }

            /* common per-layer state (same values in every branch of the original) */
            ps_thrd_ctxt->i4_codec_level = 0;
            ps_thrd_ctxt->i4_slice_type = i4_slice_type;
        }

        /* make the ps_ctb_analyse reference a part of the private context */
        ps_thrd_ctxt->ps_ctb_analyse = ps_ctb_analyse;
    }
}
2932
2933 /**
2934 *******************************************************************************
2935 *
2936 * @brief
2937 * Merge Sort function.
2938 *
2939 * @par Description:
2940 * This function sorts the data in the input array in ascending
2941 * order using merge sort algorithm. Intermediate data obtained in
2942 * merge sort are stored in output 2-D array.
2943 *
2944 * @param[in]
2945 * pi4_input_val : Input 1-D array
2946 * aai4_output_val: Output 2-D array containing elements sorted in sets of
2947 * 4,16,64 etc.
2948 * i4_length : length of the array
*  i4_ip_sort_level: Input sort level. Specifies the level up to which the array
*                    is already sorted. It should be 1 if the array is unsorted,
*                    and 4 if the array is already sorted in sets of 4.
*  i4_op_sort_level: Output sort level. Specifies the level up to which sorting is
*                    required. If it equals the array length, the whole array is sorted.
2954 *
2955 * @returns
2956 *
2957 * @remarks
2958 * None
2959 *
2960 *******************************************************************************
2961 */
void ihevce_merge_sort(
    WORD32 *pi4_input_val,
    WORD32 aai4_output_val[][64],
    WORD32 i4_length,
    WORD32 i4_ip_sort_level,
    WORD32 i4_op_sort_level)
{
    WORD32 i, j, k;
    WORD32 count, level;
    /* Scratch row: destination for the merge passes whose intermediate
     * result is NOT required in the output 2-D array (the odd levels) */
    WORD32 temp[64];
    WORD32 *pi4_temp_buf_cpy;
    WORD32 *pi4_temp = &temp[0];
    /* Number of merge passes = log2(i4_op_sort_level / i4_ip_sort_level) */
    WORD32 calc_level;

    pi4_temp_buf_cpy = pi4_temp;

    /* GETRANGE yields (MSB position + 1) of the ratio, so after the
     * decrement below calc_level is its log2 — TODO confirm against macro */
    GETRANGE(calc_level, i4_op_sort_level / i4_ip_sort_level);

    calc_level = calc_level - 1;

    /*** This function is written under the assumption that we need only intermediate values of
    sort in the range of 4,16,64 etc. (i.e. an even number of merge passes) ***/
    ASSERT((calc_level % 2) == 0);

    /** One iteration of this for loop does 1 sets of sort and produces one intermediate value in 2 iterations **/
    for(level = 0; level < calc_level; level++)
    {
        /** Merges adjacent sets of elements based on current sort level **/
        for(count = 0; count < i4_length; (count = count + (i4_ip_sort_level * 2)))
        {
            i = 0; /* read index into the first (left) sorted set */
            j = 0; /* read index into the second (right) sorted set */
            if(pi4_input_val[i4_ip_sort_level - 1] < pi4_input_val[i4_ip_sort_level])
            {
                /*** Condition for early exit: last element of the left set is already
                 *** smaller than the first of the right set, so the pair is merged ***/
                memcpy(&pi4_temp[0], pi4_input_val, sizeof(WORD32) * i4_ip_sort_level * 2);
            }
            else
            {
                /* Standard two-way merge of the adjacent sorted sets */
                for(k = 0; k < (i4_ip_sort_level * 2); k++)
                {
                    if((i < i4_ip_sort_level) && (j < i4_ip_sort_level))
                    {
                        if(pi4_input_val[i] > pi4_input_val[j + i4_ip_sort_level])
                        {
                            /** copy to output array **/
                            pi4_temp[k] = pi4_input_val[j + i4_ip_sort_level];
                            j++;
                        }
                        else
                        {
                            /** copy to output array **/
                            pi4_temp[k] = pi4_input_val[i];
                            i++;
                        }
                    }
                    else if(i == i4_ip_sort_level)
                    {
                        /** left set exhausted: copy the remaining data to output array **/
                        pi4_temp[k] = pi4_input_val[j + i4_ip_sort_level];
                        j++;
                    }
                    else
                    {
                        /** right set exhausted: copy the remaining data to output array **/
                        pi4_temp[k] = pi4_input_val[i];
                        i++;
                    }
                }
            }
            /* advance both cursors to the next pair of sets */
            pi4_input_val += (i4_ip_sort_level * 2);
            pi4_temp += (i4_ip_sort_level * 2);
        }
        /* Ping-pong: the buffer just written (scratch row or an output row)
         * becomes the input of the next merge pass */
        pi4_input_val = pi4_temp - i4_length;

        if(level % 2)
        {
            /** Assign a temp address for storing next sort level output as we will not need this data as output **/
            pi4_temp = pi4_temp_buf_cpy;
        }
        else
        {
            /** Assign address for storing the intermediate data into output 2-D array
             ** (row 0 holds sets of 4, row 1 sets of 16, row 2 sets of 64, ...) **/
            pi4_temp = aai4_output_val[level / 2];
        }
        /* each pass doubles the size of the sorted sets */
        i4_ip_sort_level *= 2;
    }
}
3050
/*!
******************************************************************************
* \if Function name : ihevce_decomp_pre_intra_curr_frame_pre_intra_deinit \endif
*
* \brief
*    Frame-level wind-up of the pre-intra pass. The last thread to arrive
*    derives a noise floor from a histogram of the layer-1 4x4 SATDs, clamps
*    (or subtracts, per SUB_NOISE_THRSHLD) every 4x4 SATD with it, and then
*    aggregates min/sum SATD statistics at 8x8, 16x16 and 32x32 granularity
*    (L1 domain, i.e. 16x16/32x32/64x64 in L0) into ps_curr_out.
*
* \param[in] pv_pre_intra_ctxt : pointer to module master ctxt
* \param[out] ps_curr_out : pre-enc output struct receiving per-CTB and frame stats
* \param[in] i4_is_last_thread : 1 only for the last thread; others return at once
* \param[in] ps_frm_ctb_prms : frame CTB parameters (horizontal CTB count)
* \param[in] i4_temporal_lyr_id : unused
* \param[in] i4_enable_noise_detection : unused
*
* \return
*    None
*
* \author
*    Ittiam
*
*****************************************************************************
*/
void ihevce_decomp_pre_intra_curr_frame_pre_intra_deinit(
    void *pv_pre_intra_ctxt,
    pre_enc_me_ctxt_t *ps_curr_out,
    WORD32 i4_is_last_thread,
    frm_ctb_ctxt_t *ps_frm_ctb_prms,
    WORD32 i4_temporal_lyr_id,
    WORD32 i4_enable_noise_detection)
{
    ihevce_decomp_pre_intra_master_ctxt_t *ps_pre_intra_master_ctxt =
        (ihevce_decomp_pre_intra_master_ctxt_t *)pv_pre_intra_ctxt;
    ihevce_decomp_pre_intra_ctxt_t *ps_pre_intra_ctxt =
        ps_pre_intra_master_ctxt->aps_decomp_pre_intra_thrd_ctxt[0];

    WORD32 i4_k;
    WORD32 ctb_ctr, vert_ctr;

    /* index [0]: stats over 8x8 sums; [1]: stats over 8x8 minima */
    WORD32 ai4_curr_frame_8x8_sum_act[2] = { 0, 0 };
    LWORD64 ai8_curr_frame_8x8_sum_act_sqr[2] = { 0, 0 };
    WORD32 ai4_curr_frame_8x8_sum_blks[2] = { 0, 0 };
    ULWORD64 u8_curr_frame_8x8_sum_act_sqr = 0;

    LWORD64 ai8_curr_frame_16x16_sum_act_sqr[3] = { 0, 0, 0 };
    WORD32 ai4_curr_frame_16x16_sum_act[3] = { 0, 0, 0 };
    WORD32 ai4_curr_frame_16x16_sum_blks[3] = { 0, 0, 0 };

    LWORD64 ai8_curr_frame_32x32_sum_act_sqr[3] = { 0, 0, 0 };
    WORD32 ai4_curr_frame_32x32_sum_act[3] = { 0, 0, 0 };
    WORD32 ai4_curr_frame_32x32_sum_blks[3] = { 0, 0, 0 };

    (void)i4_temporal_lyr_id;
    (void)i4_enable_noise_detection;

    if(i4_is_last_thread == 1)
    {
        WORD32 i4_slice_type = ps_curr_out->s_slice_hdr.i1_slice_type;
        //ps_pre_intra_ctxt->i4_slice_type;
        WORD32 ctb_ctr_blks = ps_pre_intra_ctxt->as_layers[1].i4_num_col_blks;
        WORD32 vert_ctr_blks = ps_pre_intra_ctxt->as_layers[1].i4_num_row_blks;
        ihevce_ed_ctb_l1_t *ps_ed_ctb_pic_l1 = ps_curr_out->ps_ed_ctb_l1;
        WORD32 block_wd = ps_pre_intra_ctxt->as_layers[1].i4_decomp_blk_wd;
        /* number of layer-1 4x4 blocks per decomposition block (CTB in L1) */
        WORD32 inc_ctb = ((block_wd >> 2) * (block_wd >> 2));
        ihevce_ed_blk_t *ps_ed_blk_l1 = ps_curr_out->ps_layer1_buf;
        ihevce_ed_blk_t *ps_ed;
        WORD32 i, j;
        WORD32 i4_avg_noise_satd;
        WORD32 k;
        WORD32 i4_layer_wd = ps_pre_intra_ctxt->as_layers[1].i4_actual_wd;
        WORD32 i4_layer_ht = ps_pre_intra_ctxt->as_layers[1].i4_actual_ht;

        /*Calculate min noise threshold */
        /*Min noise threshold is calculted by taking average of lowest 1% satd val in the complete 4x4 frame satds*/
        //ihevce_ed_ctxt_t *ps_ed_ctxt = ps_pre_intra_ctxt->ps_ed_ctxt;
        WORD32 i4_min_blk = ((MIN_BLKS * (i4_layer_wd >> 1) * (i4_layer_ht >> 1)) / 100);
        WORD32 ai4_noise_thr_hstrgm[MAX_SATD_THRSHLD];
        memset(&ai4_noise_thr_hstrgm[0], 0, (sizeof(WORD32) * MAX_SATD_THRSHLD));
        ASSERT(!(USE_CUR_L0_SATD && USE_CUR_SATD));

        /* Pass 1: build a histogram of all valid 4x4 SATDs below MAX_SATD_THRSHLD */
        for(vert_ctr = 0; vert_ctr < vert_ctr_blks; vert_ctr++)
        {
            ps_ed = ps_ed_blk_l1 + (vert_ctr * inc_ctb * (ctb_ctr_blks));
            for(ctb_ctr = 0; ctb_ctr < ctb_ctr_blks; ctb_ctr++)
            {
                /* Populate avg satd to calculate MI and activity factors */
                for(i = 0; i < 4; i++)
                {
                    for(j = 0; j < 4; j++)
                    {
                        for(k = 0; k < 4; k++)
                        {
                            /* -1 marks a block outside the picture (incomplete CTB) */
                            if(-1 != (ps_ed + j * 4 + i * 16 + k)->i4_4x4_satd)
                            {
                                WORD32 i4_satd_lim;
                                i4_satd_lim = (ps_ed + j * 4 + i * 16 + k)->i4_4x4_satd;
                                /* Histogram creation for Noise threshold */
                                if(i4_satd_lim < MAX_SATD_THRSHLD)
                                {
                                    ai4_noise_thr_hstrgm[i4_satd_lim]++;
                                }
                            }
                        }
                    }
                }
                ps_ed += inc_ctb;
            }
        }

        /* Average of the lowest MIN_BLKS% SATD values = noise floor estimate */
        {
            WORD32 i4_total_blks = 0;
            LWORD64 i8_acc_satd = 0;
            for(i = MIN_SATD_THRSHLD; i < MAX_SATD_THRSHLD; i++)
            {
                i4_total_blks += ai4_noise_thr_hstrgm[i];
                i8_acc_satd += (i * ai4_noise_thr_hstrgm[i]);

                if(i4_total_blks > i4_min_blk)
                    break;
            }
            if(i4_total_blks < i4_min_blk)
            {
                /* not enough low-SATD blocks: fall back to the fixed floor */
                i4_avg_noise_satd = SATD_NOISE_FLOOR_THRESHOLD;
            }
            else
            {
                /* rounded average; divide in 64-bit BEFORE narrowing to WORD32.
                 * (The old code cast the 64-bit numerator to WORD32 first —
                 * the cast binds tighter than '/' — truncating it to 32 bits.) */
                i4_avg_noise_satd =
                    (WORD32)((i8_acc_satd + (i4_total_blks >> 1)) / i4_total_blks);
            }
        }

        ps_curr_out->i4_avg_noise_thrshld_4x4 = i4_avg_noise_satd;

        /* Pass 2: clamp SATDs by the noise floor and aggregate per-CTB stats */
        for(vert_ctr = 0; vert_ctr < vert_ctr_blks; vert_ctr++)
        {
            ihevce_ed_ctb_l1_t *ps_ed_ctb_row_l1 =
                ps_ed_ctb_pic_l1 + vert_ctr * ps_frm_ctb_prms->i4_num_ctbs_horz;
            ps_ed = ps_ed_blk_l1 + (vert_ctr * inc_ctb * (ctb_ctr_blks));

            for(ctb_ctr = 0; ctb_ctr < ctb_ctr_blks; ctb_ctr++)
            {
                /*sum of (sum of L1_4x4 @ L1_8x8) @ L1_16x16 level */
                WORD32 ai4_sum_sum_4x4_satd_16x16[4] = { 0, 0, 0, 0 };
                /*min of (sum of L1_4x4 @ L1_8x8) @ L1_16x16 level */
                WORD32 ai4_min_sum_4x4_satd_16x16[4] = {
                    MAX_32BIT_VAL, MAX_32BIT_VAL, MAX_32BIT_VAL, MAX_32BIT_VAL
                };
                /*min of (min of L1_4x4 @ L1_8x8) @ L1_16x16 level */
                WORD32 ai4_min_min_4x4_satd_16x16[4] = {
                    MAX_32BIT_VAL, MAX_32BIT_VAL, MAX_32BIT_VAL, MAX_32BIT_VAL
                };
                WORD32 i4_sum_4x4_satd, i4_min_4x4_satd;
                ihevce_ed_ctb_l1_t *ps_ed_ctb_curr_l1 = ps_ed_ctb_row_l1 + ctb_ctr;

                WORD32 is_min_block_uncompensated_in_l32x32 = 0;

                /*min of L1_4x4 @ L1_8x8*/
                WORD32 ai4_min_satd_ctb[MAX_CTB_SIZE];
                /*** This 2-D array will contain 4x4 satds sorted in ascending order in sets of 4,16,64 ***/
                /*** For example : '5 10 2 7 6 12 3 1' array input will return '2 5 7 10 1 3 6 12' if sorted in sets of 4 ***/
                WORD32 aai4_min_4_16_64_satd[3][MAX_CTB_SIZE];

                /*sum of L1_4x4 @ L1_8x8*/
                WORD32 ai4_sum_satd_ctb[MAX_CTB_SIZE >> 2];
                /*** This 2-D array will contain 4x4 satds sorted in ascending order in sets of 4,16***/
                WORD32 aai4_sum_4_16_satd_ctb[2][MAX_CTB_SIZE];

                /* sum of (sum of L1_4x4 @ L1_8x8) @ L1_16x16 */
                WORD32 ai4_sum_sum_satd_ctb[(MAX_CTB_SIZE >> 2) >> 2];
                /*L1_32x32 = L0_64x64
                so in L1_32x32 there are 64 L1_4x4blocks*/
                for(i = 0; i < MAX_CTB_SIZE; i++)
                {
                    ai4_min_satd_ctb[i] = -1;
                }
                for(j = 0; j < 3; j++)
                {
                    for(i = 0; i < MAX_CTB_SIZE; i++)
                    {
                        aai4_min_4_16_64_satd[j][i] = -1;
                    }
                }
                /*L1_32x32 = L0_64x64
                so in L1_32x32 there are 16 L1_8x8blocks*/
                for(i = 0; i < (MAX_CTB_SIZE >> 2); i++)
                {
                    ai4_sum_satd_ctb[i] = -1;
                }
                for(j = 0; j < 2; j++)
                {
                    for(i = 0; i < (MAX_CTB_SIZE >> 2); i++)
                    {
                        aai4_sum_4_16_satd_ctb[j][i] = -1;
                    }
                }
                /*L1_32x32 = L0_64x64
                so in L1_32x32 there are 16 L1_16x16blocks*/
                for(i = 0; i < ((MAX_CTB_SIZE >> 2) >> 2); i++)
                {
                    ai4_sum_sum_satd_ctb[i] = 0;
                }
                /*Populate sum min 4x4 activty */
                /*loop for L1_32x32 block*/
                for(i = 0; i < 4; i++)
                {
                    /*loop for L1_16x16 block*/
                    for(j = 0; j < 4; j++)
                    {
                        WORD32 i4_sum_satd_dumyy = 0;
                        WORD32 i4_num_satd_blks = 0;
                        /* loop for L1_8x8 block*/
                        for(k = 0; k < 4; k++)
                        {
                            WORD32 i4_satd_lim;
                            i4_satd_lim = (ps_ed + j * 4 + i * 16 + k)->i4_4x4_satd;

                            /*complete ctb will not have i4_4x4_satd = -1*/
                            if(-1 != i4_satd_lim)
                            {
#if SUB_NOISE_THRSHLD
                                /* subtract the noise floor, clamping at 0 */
                                i4_satd_lim = i4_satd_lim - i4_avg_noise_satd;
                                if(i4_satd_lim < 0)
                                {
                                    i4_satd_lim = 0;
                                }
#else
                                /* clamp the SATD from below by the noise floor */
                                if(i4_satd_lim < i4_avg_noise_satd)
                                {
                                    i4_satd_lim = i4_avg_noise_satd;
                                }
#endif
                                i4_num_satd_blks++;
                                /*populate 4x4 data to calculate modulation index */
                                (ps_ed + j * 4 + i * 16 + k)->i4_4x4_satd = i4_satd_lim;

                                i4_sum_satd_dumyy += i4_satd_lim;
                                ai4_min_satd_ctb[j * 4 + i * 16 + k] = i4_satd_lim;
                            }
                        }
                        if(i4_num_satd_blks != 0)
                        {
                            /*make the sum of satd always for 4 blocks even it is incomplete ctb */
                            i4_sum_satd_dumyy = i4_sum_satd_dumyy * 4 / i4_num_satd_blks;
                        }
                        else
                        {
                            i4_sum_satd_dumyy = -1;
                        }
                        /*sum of L1_4x4 @ L1_8x8block level*/
                        ai4_sum_satd_ctb[j + i * 4] = i4_sum_satd_dumyy;
                        /*sum of L1_8x8 @ L1_16x16block level*/
                        ai4_sum_sum_satd_ctb[i] += i4_sum_satd_dumyy;
                        /*store sum of 4x4 @ L1_8x8block level*/
                        ps_ed_ctb_curr_l1->i4_sum_4x4_satd[i * 4 + j] = i4_sum_satd_dumyy;
                        /*store min of 4x4 @ L1_8x8block level */
                        //ps_ed_ctb_curr_l1->i4_min_4x4_satd[i * 4 + j] = i4_min_satd_dumyy;
                    }
                }
                {
                    WORD32 i4_array_length = sizeof(ai4_min_satd_ctb) / sizeof(WORD32);

                    /*** This function will sort 64 elements in array ai4_min_satd_ctb in ascending order to ***/
                    /*** 3 arrays in sets of 4,16,64 into the 2-D array aai4_min_4_16_64_satd ***/
                    ihevce_merge_sort(
                        &ai4_min_satd_ctb[0], aai4_min_4_16_64_satd, i4_array_length, 1, 64);

                    i4_array_length = sizeof(ai4_sum_satd_ctb) / sizeof(WORD32);

                    /*** This function will sort 16 elements in array ai4_sum_satd_ctb in ascending order to ***/
                    /*** 2 arrays in sets of 4,16 into the 2-D array aai4_sum_4_16_satd_ctb ***/
                    ihevce_merge_sort(
                        &ai4_sum_satd_ctb[0], aai4_sum_4_16_satd_ctb, i4_array_length, 1, 16);
                }

                /*Populate avg satd to calculate MI and activity factors*/
                for(i = 0; i < 4; i++)
                {
                    WORD32 is_min_block_uncompensated_in_l116x16 = 0;
                    ps_ed_ctb_curr_l1->i4_16x16_satd[i][0] = -1;
                    ps_ed_ctb_curr_l1->i4_16x16_satd[i][1] = -1;
                    ps_ed_ctb_curr_l1->i4_16x16_satd[i][2] = -1;

                    for(j = 0; j < 4; j++)
                    {
                        /* median-of-sorted-set as the representative "min" */
                        ps_ed_ctb_curr_l1->i4_min_4x4_satd[i * 4 + j] =
                            aai4_min_4_16_64_satd[0][i * 16 + j * 4 + MEDIAN_CU_TU];
                        /*Accumulate the sum of 8*8 activities in the current layer (16*16 CU in L0)*/
                        i4_sum_4x4_satd = ps_ed_ctb_curr_l1->i4_sum_4x4_satd[i * 4 + j];
                        i4_min_4x4_satd = ps_ed_ctb_curr_l1->i4_min_4x4_satd[i * 4 + j];
                        ps_ed_ctb_curr_l1->i4_8x8_satd[i * 4 + j][0] = -1;
                        ps_ed_ctb_curr_l1->i4_8x8_satd[i * 4 + j][1] = -1;
                        ASSERT(-2 != i4_sum_4x4_satd);

                        if((-1 != i4_sum_4x4_satd))
                        {
                            WORD32 not_skipped = 1;

                            if((i4_slice_type == ISLICE) || (1 == not_skipped))
                            {
                                is_min_block_uncompensated_in_l116x16 = 1;
                                is_min_block_uncompensated_in_l32x32 = 1;

                                /* widen before multiplying: satd * satd can
                                 * overflow 32 bits (signed overflow is UB) */
                                u8_curr_frame_8x8_sum_act_sqr +=
                                    ((LWORD64)i4_sum_4x4_satd * i4_sum_4x4_satd);

                                ai4_curr_frame_8x8_sum_act[0] += i4_sum_4x4_satd;
                                ai8_curr_frame_8x8_sum_act_sqr[0] +=
                                    ((LWORD64)i4_sum_4x4_satd * i4_sum_4x4_satd);
                                ai4_curr_frame_8x8_sum_blks[0] += 1;
                                ai4_curr_frame_8x8_sum_act[1] += i4_min_4x4_satd;
                                ai8_curr_frame_8x8_sum_act_sqr[1] +=
                                    ((LWORD64)i4_min_4x4_satd * i4_min_4x4_satd);
                                ai4_curr_frame_8x8_sum_blks[1] += 1;
                            }

                            ps_ed_ctb_curr_l1->i4_8x8_satd[i * 4 + j][0] = i4_sum_4x4_satd;
                            ps_ed_ctb_curr_l1->i4_8x8_satd[i * 4 + j][1] = i4_min_4x4_satd;
                        }
                        else
                        {
                            /* incomplete 16x16: mark every stat invalid */
                            ai4_sum_sum_4x4_satd_16x16[i] = MAX_32BIT_VAL;
                            ai4_min_sum_4x4_satd_16x16[i] = MAX_32BIT_VAL;
                            ai4_min_min_4x4_satd_16x16[i] = MAX_32BIT_VAL;
                        }
                    }

                    //if(1 == is_min_block_comensated_in_l116x16)
                    {
                        ai4_min_sum_4x4_satd_16x16[i] =
                            aai4_sum_4_16_satd_ctb[0][i * 4 + MEDIAN_CU_TU];
                        ai4_min_min_4x4_satd_16x16[i] =
                            aai4_min_4_16_64_satd[1][i * 16 + MEDIAN_CU_TU_BY_2];

                        if(ai4_sum_sum_4x4_satd_16x16[i] != MAX_32BIT_VAL)
                        {
                            ai4_sum_sum_4x4_satd_16x16[i] = 0;
                            for(j = 0; j < 4; j++)
                            {
                                ai4_sum_sum_4x4_satd_16x16[i] +=
                                    ps_ed_ctb_curr_l1->i4_sum_4x4_satd[i * 4 + j];
                            }
                            ps_ed_ctb_curr_l1->i4_16x16_satd[i][0] = ai4_sum_sum_4x4_satd_16x16[i];
                            ps_ed_ctb_curr_l1->i4_16x16_satd[i][1] = ai4_min_sum_4x4_satd_16x16[i];
                            ps_ed_ctb_curr_l1->i4_16x16_satd[i][2] = ai4_min_min_4x4_satd_16x16[i];
                        }
                    }
                    if(1 == is_min_block_uncompensated_in_l116x16)
                    {
                        if(MAX_32BIT_VAL != ai4_sum_sum_4x4_satd_16x16[i])
                        {
                            ai4_curr_frame_16x16_sum_act[0] += ai4_sum_sum_4x4_satd_16x16[i];
                            ai8_curr_frame_16x16_sum_act_sqr[0] +=
                                ((LWORD64)ai4_sum_sum_4x4_satd_16x16[i] *
                                 ai4_sum_sum_4x4_satd_16x16[i]);
                            ai4_curr_frame_16x16_sum_blks[0] += 1;
                        }
                        if(MAX_32BIT_VAL != ai4_min_sum_4x4_satd_16x16[i])
                        {
                            ai4_curr_frame_16x16_sum_act[1] += ai4_min_sum_4x4_satd_16x16[i];
                            ai8_curr_frame_16x16_sum_act_sqr[1] +=
                                ((LWORD64)ai4_min_sum_4x4_satd_16x16[i] *
                                 ai4_min_sum_4x4_satd_16x16[i]);
                            ai4_curr_frame_16x16_sum_blks[1] += 1;
                            ai4_curr_frame_16x16_sum_act[2] += ai4_min_min_4x4_satd_16x16[i];
                            ai8_curr_frame_16x16_sum_act_sqr[2] +=
                                ((LWORD64)ai4_min_min_4x4_satd_16x16[i] *
                                 ai4_min_min_4x4_satd_16x16[i]);
                            ai4_curr_frame_16x16_sum_blks[2] += 1;
                        }
                    }
                }
                /*32x32*/
                {
                    ps_ed_ctb_curr_l1->i4_32x32_satd[0][0] = -1;
                    ps_ed_ctb_curr_l1->i4_32x32_satd[0][1] = -1;
                    ps_ed_ctb_curr_l1->i4_32x32_satd[0][2] = -1;
                    ps_ed_ctb_curr_l1->i4_32x32_satd[0][3] = -1;

                    if((MAX_32BIT_VAL != ai4_sum_sum_4x4_satd_16x16[0]) ||
                       (MAX_32BIT_VAL != ai4_sum_sum_4x4_satd_16x16[2]) ||
                       (MAX_32BIT_VAL != ai4_sum_sum_4x4_satd_16x16[1]) ||
                       (MAX_32BIT_VAL != ai4_sum_sum_4x4_satd_16x16[3]))
                    {
                        //if(1 == is_min_block_comensated_in_l32x32)
                        {
                            {
                                WORD32 aai4_min_sum_sum_4x4_satd_16x16[1][64];
                                WORD32 i4_array_length =
                                    sizeof(ai4_sum_sum_4x4_satd_16x16) / sizeof(WORD32);
                                /*** Sort 4 elements in ascending order ***/
                                ihevce_merge_sort(
                                    &ai4_sum_sum_4x4_satd_16x16[0],
                                    aai4_min_sum_sum_4x4_satd_16x16,
                                    i4_array_length,
                                    1,
                                    4);

                                ps_ed_ctb_curr_l1->i4_32x32_satd[0][0] =
                                    aai4_min_sum_sum_4x4_satd_16x16[0][MEDIAN_CU_TU];
                            }
                            {
                                ps_ed_ctb_curr_l1->i4_32x32_satd[0][1] =
                                    aai4_sum_4_16_satd_ctb[1][MEDIAN_CU_TU_BY_2];
                            }
                            {
                                ps_ed_ctb_curr_l1->i4_32x32_satd[0][2] =
                                    aai4_min_4_16_64_satd[2][MEDIAN_CU_TU_BY_4];
                            }

                            /*Sum of all 32x32 activity */
                            ps_ed_ctb_curr_l1->i4_32x32_satd[0][3] = 0;
                            for(j = 0; j < 4; j++)
                            {
                                if(MAX_32BIT_VAL != ai4_sum_sum_4x4_satd_16x16[j])
                                    ps_ed_ctb_curr_l1->i4_32x32_satd[0][3] +=
                                        ai4_sum_sum_4x4_satd_16x16[j];
                            }

                            if(1 == is_min_block_uncompensated_in_l32x32)
                            {
                                /*Accumulate the sum of 32*32 activities in the current layer (64*64 CU in L0)*/
                                if(MAX_32BIT_VAL != ps_ed_ctb_curr_l1->i4_32x32_satd[0][0])
                                {
                                    ai4_curr_frame_32x32_sum_act[0] +=
                                        ps_ed_ctb_curr_l1->i4_32x32_satd[0][0];
                                    ai8_curr_frame_32x32_sum_act_sqr[0] +=
                                        ((LWORD64)ps_ed_ctb_curr_l1->i4_32x32_satd[0][0] *
                                         ps_ed_ctb_curr_l1->i4_32x32_satd[0][0]);
                                    ai4_curr_frame_32x32_sum_blks[0] += 1;
                                }

                                if(MAX_32BIT_VAL != ps_ed_ctb_curr_l1->i4_32x32_satd[0][1])
                                {
                                    ai4_curr_frame_32x32_sum_act[1] +=
                                        ps_ed_ctb_curr_l1->i4_32x32_satd[0][1];
                                    ai8_curr_frame_32x32_sum_act_sqr[1] +=
                                        ((LWORD64)ps_ed_ctb_curr_l1->i4_32x32_satd[0][1] *
                                         ps_ed_ctb_curr_l1->i4_32x32_satd[0][1]);
                                    ai4_curr_frame_32x32_sum_blks[1] += 1;
                                }

                                if(MAX_32BIT_VAL != ps_ed_ctb_curr_l1->i4_32x32_satd[0][2])
                                {
                                    ai4_curr_frame_32x32_sum_act[2] +=
                                        ps_ed_ctb_curr_l1->i4_32x32_satd[0][2];
                                    ai8_curr_frame_32x32_sum_act_sqr[2] +=
                                        ((LWORD64)ps_ed_ctb_curr_l1->i4_32x32_satd[0][2] *
                                         ps_ed_ctb_curr_l1->i4_32x32_satd[0][2]);
                                    ai4_curr_frame_32x32_sum_blks[2] += 1;
                                }
                            }
                        }
                    }
                }
                /*Increment ctb count*/
                ps_ed += inc_ctb;
            }
        }

        /* Spatial Variation and modulation index calculated for the frame */
        {
            for(i4_k = 0; i4_k < 2; i4_k++)
            {
                /*8x8*/
#if USE_SQRT_AVG_OF_SATD_SQR
                ps_curr_out->i8_curr_frame_8x8_sum_act[i4_k] = ai8_curr_frame_8x8_sum_act_sqr[i4_k];
#else
                ps_curr_out->i8_curr_frame_8x8_sum_act[i4_k] = ai4_curr_frame_8x8_sum_act[i4_k];
#endif
                ps_curr_out->i4_curr_frame_8x8_sum_act_for_strength[i4_k] =
                    ai4_curr_frame_8x8_sum_act[i4_k];
                ps_curr_out->i4_curr_frame_8x8_num_blks[i4_k] = ai4_curr_frame_8x8_sum_blks[i4_k];
                ps_curr_out->u8_curr_frame_8x8_sum_act_sqr = u8_curr_frame_8x8_sum_act_sqr;

                /*16x16*/
#if USE_SQRT_AVG_OF_SATD_SQR
                ps_curr_out->i8_curr_frame_16x16_sum_act[i4_k] =
                    ai8_curr_frame_16x16_sum_act_sqr[i4_k];
#else
                ps_curr_out->i8_curr_frame_16x16_sum_act[i4_k] = ai4_curr_frame_16x16_sum_act[i4_k];
#endif
                ps_curr_out->i4_curr_frame_16x16_num_blks[i4_k] =
                    ai4_curr_frame_16x16_sum_blks[i4_k];

                /*32x32*/
#if USE_SQRT_AVG_OF_SATD_SQR
                ps_curr_out->i8_curr_frame_32x32_sum_act[i4_k] =
                    ai8_curr_frame_32x32_sum_act_sqr[i4_k];
#else
                ps_curr_out->i8_curr_frame_32x32_sum_act[i4_k] = ai4_curr_frame_32x32_sum_act[i4_k];
#endif
                ps_curr_out->i4_curr_frame_32x32_num_blks[i4_k] =
                    ai4_curr_frame_32x32_sum_blks[i4_k];
            }

            /*16x16: third stat (min-of-min) has no 8x8 counterpart */
#if USE_SQRT_AVG_OF_SATD_SQR
            ps_curr_out->i8_curr_frame_16x16_sum_act[2] = ai8_curr_frame_16x16_sum_act_sqr[2];
#else
            ps_curr_out->i8_curr_frame_16x16_sum_act[2] = ai4_curr_frame_16x16_sum_act[2];
#endif

            ps_curr_out->i4_curr_frame_16x16_num_blks[2] = ai4_curr_frame_16x16_sum_blks[2];

            /*32x32*/
#if USE_SQRT_AVG_OF_SATD_SQR
            ps_curr_out->i8_curr_frame_32x32_sum_act[2] = ai8_curr_frame_32x32_sum_act_sqr[2];
#else
            ps_curr_out->i8_curr_frame_32x32_sum_act[2] = ai4_curr_frame_32x32_sum_act[2];
#endif
            ps_curr_out->i4_curr_frame_32x32_num_blks[2] = ai4_curr_frame_32x32_sum_blks[2];
        }
    }
}
3536
3537 /*!
3538 ******************************************************************************
3539 * \if Function name : ihevce_decomp_pre_intra_get_frame_satd \endif
3540 *
3541 * \brief
*   Returns the frame-level sum of best SATDs accumulated across all threads,
*   and reports the layer-1 (actual) width and height through the out-pointers.
3543 *
3544 *
3545 * \return
3546 * None
3547 *
3548 * \author
3549 * Ittiam
3550 *
3551 *****************************************************************************
3552 */
ihevce_decomp_pre_intra_get_frame_satd(void * pv_ctxt,WORD32 * i4_width,WORD32 * i4_hieght)3553 LWORD64 ihevce_decomp_pre_intra_get_frame_satd(void *pv_ctxt, WORD32 *i4_width, WORD32 *i4_hieght)
3554 {
3555 ihevce_decomp_pre_intra_master_ctxt_t *ps_master_ctxt =
3556 (ihevce_decomp_pre_intra_master_ctxt_t *)pv_ctxt;
3557 WORD32 i4_i;
3558 LWORD64 i8_tot_satd = 0;
3559
3560 /*accumulate SATD acorss all thread. note that every thread will enter this function,
3561 hence it must be guranteed that all thread must have completed preintra pass by now*/
3562 for(i4_i = 0; i4_i < ps_master_ctxt->i4_num_proc_thrds; i4_i++)
3563 {
3564 ihevce_decomp_pre_intra_ctxt_t *ps_ctxt =
3565 ps_master_ctxt->aps_decomp_pre_intra_thrd_ctxt[i4_i];
3566
3567 //i8_tot_satd += ps_ctxt->as_layers[1].s_early_decision.i8_sum_best_satd;
3568 i8_tot_satd += ps_ctxt->ps_ed_ctxt->i8_sum_best_satd;
3569
3570 *i4_width = ps_ctxt->as_layers[1].i4_actual_wd;
3571 *i4_hieght = ps_ctxt->as_layers[1].i4_actual_ht;
3572 }
3573
3574 return i8_tot_satd;
3575 }
3576
ihevce_decomp_pre_intra_get_frame_satd_squared(void * pv_ctxt,WORD32 * i4_width,WORD32 * i4_hieght)3577 LWORD64 ihevce_decomp_pre_intra_get_frame_satd_squared(
3578 void *pv_ctxt, WORD32 *i4_width, WORD32 *i4_hieght)
3579 {
3580 ihevce_decomp_pre_intra_master_ctxt_t *ps_master_ctxt =
3581 (ihevce_decomp_pre_intra_master_ctxt_t *)pv_ctxt;
3582 WORD32 i4_i;
3583 LWORD64 i8_tot_satd = 0;
3584
3585 /*accumulate SATD acorss all thread. note that every thread will enter this function,
3586 hence it must be guranteed that all thread must have completed preintra pass by now*/
3587 for(i4_i = 0; i4_i < ps_master_ctxt->i4_num_proc_thrds; i4_i++)
3588 {
3589 ihevce_decomp_pre_intra_ctxt_t *ps_ctxt =
3590 ps_master_ctxt->aps_decomp_pre_intra_thrd_ctxt[i4_i];
3591
3592 //i8_tot_satd += ps_ctxt->as_layers[1].s_early_decision.i8_sum_best_satd;
3593 i8_tot_satd += (ps_ctxt->ps_ed_ctxt->i8_sum_sq_best_satd);
3594
3595 *i4_width = ps_ctxt->as_layers[1].i4_actual_wd;
3596 *i4_hieght = ps_ctxt->as_layers[1].i4_actual_ht;
3597 }
3598
3599 return i8_tot_satd;
3600 }
3601