1 /******************************************************************************
2 *
3 * Copyright (C) 2018 The Android Open Source Project
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at:
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 *
17 *****************************************************************************
18 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19 */
20
21 /**
22 ******************************************************************************
23 * @file ihevce_had_satd.c
24 *
25 * @brief
26 * This file contains functions of Hadamard SAD and SATD
27 *
28 * @author
29 * Ittiam
30 *
31 * List of Functions
32 * <TODO: TO BE ADDED>
33 *
34 ******************************************************************************
35 */
36
37 /*****************************************************************************/
38 /* File Includes */
39 /*****************************************************************************/
40 /* System include files */
41 #include <stdio.h>
42 #include <string.h>
43 #include <stdlib.h>
44 #include <assert.h>
45 #include <stdarg.h>
46 #include <math.h>
47
48 /* User include files */
49 #include "ihevc_typedefs.h"
50 #include "itt_video_api.h"
51 #include "ihevce_api.h"
52
53 #include "rc_cntrl_param.h"
54 #include "rc_frame_info_collector.h"
55 #include "rc_look_ahead_params.h"
56
57 #include "ihevc_defs.h"
58 #include "ihevc_structs.h"
59 #include "ihevc_platform_macros.h"
60 #include "ihevc_deblk.h"
61 #include "ihevc_itrans_recon.h"
62 #include "ihevc_chroma_itrans_recon.h"
63 #include "ihevc_chroma_intra_pred.h"
64 #include "ihevc_intra_pred.h"
65 #include "ihevc_inter_pred.h"
66 #include "ihevc_mem_fns.h"
67 #include "ihevc_padding.h"
68 #include "ihevc_weighted_pred.h"
69 #include "ihevc_sao.h"
70 #include "ihevc_resi_trans.h"
71 #include "ihevc_quant_iquant_ssd.h"
72 #include "ihevc_cabac_tables.h"
73
74 #include "ihevce_defs.h"
75 #include "ihevce_lap_enc_structs.h"
76 #include "ihevce_multi_thrd_structs.h"
77 #include "ihevce_multi_thrd_funcs.h"
78 #include "ihevce_me_common_defs.h"
79 #include "ihevce_had_satd.h"
80 #include "ihevce_error_codes.h"
81 #include "ihevce_bitstream.h"
82 #include "ihevce_cabac.h"
83 #include "ihevce_rdoq_macros.h"
84 #include "ihevce_function_selector.h"
85 #include "ihevce_enc_structs.h"
86 #include "ihevce_cmn_utils_instr_set_router.h"
87 #include "hme_datatype.h"
88 #include "hme_interface.h"
89 #include "hme_common_defs.h"
90 #include "hme_defs.h"
91
92 /*****************************************************************************/
93 /* Function Definitions */
94 /*****************************************************************************/
95
/**
*******************************************************************************
*
* @brief
*  4x4 Hadamard transform of the residue (source - prediction), 8-bit input
*
* @par Description:
*  Forms the 4x4 difference block, applies a 4-point butterfly along every
*  row and then along every column. The 16 coefficients are stored without
*  any normalization.
*
* @param[in] pu1_src
*  UWORD8 pointer to the current block
*
* @param[in] src_strd
*  WORD32 Source stride
*
* @param[in] pu1_pred
*  UWORD8 pointer to the prediction block
*
* @param[in] pred_strd
*  WORD32 Pred stride
*
* @param[out] pi2_dst
*  WORD16 pointer to the 4x4 transform output
*
* @param[in] dst_strd
*  WORD32 Destination stride (counted in WORD16 elements)
*
* @returns
*  None
*
*******************************************************************************
*/
static void ihevce_hadamard_4x4_8bit(
    UWORD8 *pu1_src,
    WORD32 src_strd,
    UWORD8 *pu1_pred,
    WORD32 pred_strd,
    WORD16 *pi2_dst,
    WORD32 dst_strd)
{
    WORD16 ai2_horz[4][4];
    WORD32 row, col;

    /* Pass 1: residue and horizontal butterflies, one input row at a time */
    for(row = 0; row < 4; row++)
    {
        const UWORD8 *pu1_s = pu1_src + row * src_strd;
        const UWORD8 *pu1_p = pu1_pred + row * pred_strd;

        WORD32 d0 = pu1_s[0] - pu1_p[0];
        WORD32 d1 = pu1_s[1] - pu1_p[1];
        WORD32 d2 = pu1_s[2] - pu1_p[2];
        WORD32 d3 = pu1_s[3] - pu1_p[3];

        WORD32 sum01 = d0 + d1;
        WORD32 dif01 = d0 - d1;
        WORD32 sum23 = d2 + d3;
        WORD32 dif23 = d2 - d3;

        ai2_horz[row][0] = sum01 + sum23;
        ai2_horz[row][1] = dif01 + dif23;
        ai2_horz[row][2] = sum01 - sum23;
        ai2_horz[row][3] = dif01 - dif23;
    }

    /* Pass 2: vertical butterflies, one output column at a time */
    for(col = 0; col < 4; col++)
    {
        WORD16 *pi2_out = pi2_dst + col;

        WORD32 top_sum = ai2_horz[0][col] + ai2_horz[1][col];
        WORD32 top_dif = ai2_horz[0][col] - ai2_horz[1][col];
        WORD32 bot_sum = ai2_horz[2][col] + ai2_horz[3][col];
        WORD32 bot_dif = ai2_horz[2][col] - ai2_horz[3][col];

        pi2_out[0] = top_sum + bot_sum;
        pi2_out[dst_strd] = top_dif + bot_dif;
        pi2_out[2 * dst_strd] = top_sum - bot_sum;
        pi2_out[3 * dst_strd] = top_dif - bot_dif;
    }
}
149
/**
*******************************************************************************
*
* @brief
*  8x8 Hadamard transform of the residue (source - prediction), 8-bit input
*
* @par Description:
*  Runs the 4x4 transform on each 4x4 quadrant (y0 top-left, y1 top-right,
*  y2 bottom-left, y3 bottom-right), writing each result into the matching
*  quadrant of the destination. The quadrants are then combined in place,
*  coefficient by coefficient:
*      top-left     = (y0 + y1) + (y2 + y3)
*      top-right    = (y0 - y1) + (y2 - y3)
*      bottom-left  = (y0 + y1) - (y2 + y3)
*      bottom-right = (y0 - y1) - (y2 - y3)
*  No normalization is applied.
*
* @param[in] pu1_src
*  UWORD8 pointer to the current block
*
* @param[in] src_strd
*  WORD32 Source stride
*
* @param[in] pu1_pred
*  UWORD8 pointer to the prediction block
*
* @param[in] pred_strd
*  WORD32 Pred stride
*
* @param[out] pi2_dst
*  WORD16 pointer to the 8x8 transform output
*
* @param[in] dst_strd
*  WORD32 Destination stride (counted in WORD16 elements)
*
* @returns
*  None
*
*******************************************************************************
*/
static void ihevce_hadamard_8x8_8bit(
    UWORD8 *pu1_src,
    WORD32 src_strd,
    UWORD8 *pu1_pred,
    WORD32 pred_strd,
    WORD16 *pi2_dst,
    WORD32 dst_strd)
{
    WORD32 quad, row, col;

    /* Child 4x4 transforms, visited as y0, y1, y2, y3 */
    for(quad = 0; quad < 4; quad++)
    {
        WORD32 x_off = (quad & 1) * 4;
        WORD32 y_off = (quad >> 1) * 4;

        ihevce_hadamard_4x4_8bit(
            pu1_src + x_off + y_off * src_strd,
            src_strd,
            pu1_pred + x_off + y_off * pred_strd,
            pred_strd,
            pi2_dst + x_off + y_off * dst_strd,
            dst_strd);
    }

    /* Combine the four children in place to get the parent transform */
    for(row = 0; row < 4; row++)
    {
        WORD16 *pi2_top = pi2_dst + row * dst_strd;
        WORD16 *pi2_bot = pi2_top + 4 * dst_strd;

        for(col = 0; col < 4; col++)
        {
            WORD16 tl = pi2_top[col];
            WORD16 tr = pi2_top[col + 4];
            WORD16 bl = pi2_bot[col];
            WORD16 br = pi2_bot[col + 4];

            WORD16 top_sum = (tl + tr);
            WORD16 top_dif = (tl - tr);
            WORD16 bot_sum = (bl + br);
            WORD16 bot_dif = (bl - br);

            pi2_top[col] = top_sum + bot_sum;
            pi2_top[col + 4] = top_dif + bot_dif;
            pi2_bot[col] = top_sum - bot_sum;
            pi2_bot[col + 4] = top_dif - bot_dif;
        }
    }
}
205
/**
*******************************************************************************
*
* @brief
*  16x16 Hadamard transform of the residue (source - prediction), 8-bit input
*
* @par Description:
*  Runs the 8x8 transform on each 8x8 quadrant (y0 top-left, y1 top-right,
*  y2 bottom-left, y3 bottom-right) and combines the quadrants in place:
*      top-left     = ((y0 + y1) >> 1) + ((y2 + y3) >> 1)
*      top-right    = ((y0 - y1) >> 1) + ((y2 - y3) >> 1)
*      bottom-left  = ((y0 + y1) >> 1) - ((y2 + y3) >> 1)
*      bottom-right = ((y0 - y1) >> 1) - ((y2 - y3) >> 1)
*  The intermediate sums are halved before the final stage, so the output is
*  scaled down by 2 relative to an unnormalized transform.
*
* @param[in] pu1_src
*  UWORD8 pointer to the current block
*
* @param[in] src_strd
*  WORD32 Source stride
*
* @param[in] pu1_pred
*  UWORD8 pointer to the prediction block
*
* @param[in] pred_strd
*  WORD32 Pred stride
*
* @param[out] pi2_dst
*  WORD16 pointer to the 16x16 transform output
*
* @param[in] dst_strd
*  WORD32 Destination stride (counted in WORD16 elements)
*
* @returns
*  None
*
* @remarks
*  The halving uses >> on possibly negative values; the result of that is
*  implementation-defined in C.
*
*******************************************************************************
*/
static void ihevce_hadamard_16x16_8bit(
    UWORD8 *pu1_src,
    WORD32 src_strd,
    UWORD8 *pu1_pred,
    WORD32 pred_strd,
    WORD16 *pi2_dst,
    WORD32 dst_strd)
{
    WORD32 quad, row, col;

    /* Child 8x8 transforms, visited as y0, y1, y2, y3 */
    for(quad = 0; quad < 4; quad++)
    {
        WORD32 x_off = (quad & 1) * 8;
        WORD32 y_off = (quad >> 1) * 8;

        ihevce_hadamard_8x8_8bit(
            pu1_src + x_off + y_off * src_strd,
            src_strd,
            pu1_pred + x_off + y_off * pred_strd,
            pred_strd,
            pi2_dst + x_off + y_off * dst_strd,
            dst_strd);
    }

    /* Combine the four children in place to get the parent transform */
    for(row = 0; row < 8; row++)
    {
        WORD16 *pi2_top = pi2_dst + row * dst_strd;
        WORD16 *pi2_bot = pi2_top + 8 * dst_strd;

        for(col = 0; col < 8; col++)
        {
            WORD16 tl = pi2_top[col];
            WORD16 tr = pi2_top[col + 8];
            WORD16 bl = pi2_bot[col];
            WORD16 br = pi2_bot[col + 8];

            /* First stage is halved before the second stage is applied */
            WORD16 top_sum = (tl + tr) >> 1;
            WORD16 top_dif = (tl - tr) >> 1;
            WORD16 bot_sum = (bl + br) >> 1;
            WORD16 bot_dif = (bl - br) >> 1;

            pi2_top[col] = top_sum + bot_sum;
            pi2_top[col + 8] = top_dif + bot_dif;
            pi2_bot[col] = top_sum - bot_sum;
            pi2_bot[col + 8] = top_dif - bot_dif;
        }
    }
}
261
/**
*******************************************************************************
*
* @brief
*  32x32 Hadamard transform of the residue (source - prediction), 8-bit input
*
* @par Description:
*  Runs the 16x16 transform on each 16x16 quadrant (y0 top-left, y1
*  top-right, y2 bottom-left, y3 bottom-right). Every child coefficient is
*  shifted right by 2 as it is read, and the quadrants are then combined in
*  place:
*      top-left     = (y0 + y1) + (y2 + y3)
*      top-right    = (y0 - y1) + (y2 - y3)
*      bottom-left  = (y0 + y1) - (y2 + y3)
*      bottom-right = (y0 - y1) - (y2 - y3)
*
* @param[in] pu1_src
*  UWORD8 pointer to the current block
*
* @param[in] src_strd
*  WORD32 Source stride
*
* @param[in] pu1_pred
*  UWORD8 pointer to the prediction block
*
* @param[in] pred_strd
*  WORD32 Pred stride
*
* @param[out] pi2_dst
*  WORD16 pointer to the 32x32 transform output
*
* @param[in] dst_strd
*  WORD32 Destination stride (counted in WORD16 elements)
*
* @returns
*  None
*
* @remarks
*  The scaling uses >> on possibly negative values; the result of that is
*  implementation-defined in C.
*
*******************************************************************************
*/
static void ihevce_hadamard_32x32_8bit(
    UWORD8 *pu1_src,
    WORD32 src_strd,
    UWORD8 *pu1_pred,
    WORD32 pred_strd,
    WORD16 *pi2_dst,
    WORD32 dst_strd)
{
    WORD32 quad, row, col;

    /* Child 16x16 transforms, visited as y0, y1, y2, y3 */
    for(quad = 0; quad < 4; quad++)
    {
        WORD32 x_off = (quad & 1) * 16;
        WORD32 y_off = (quad >> 1) * 16;

        ihevce_hadamard_16x16_8bit(
            pu1_src + x_off + y_off * src_strd,
            src_strd,
            pu1_pred + x_off + y_off * pred_strd,
            pred_strd,
            pi2_dst + x_off + y_off * dst_strd,
            dst_strd);
    }

    /* Combine the four children in place to get the parent transform */
    for(row = 0; row < 16; row++)
    {
        WORD16 *pi2_top = pi2_dst + row * dst_strd;
        WORD16 *pi2_bot = pi2_top + 16 * dst_strd;

        for(col = 0; col < 16; col++)
        {
            /* Children are scaled down by 4 before they are combined */
            WORD16 tl = pi2_top[col] >> 2;
            WORD16 tr = pi2_top[col + 16] >> 2;
            WORD16 bl = pi2_bot[col] >> 2;
            WORD16 br = pi2_bot[col + 16] >> 2;

            WORD16 top_sum = (tl + tr);
            WORD16 top_dif = (tl - tr);
            WORD16 bot_sum = (bl + br);
            WORD16 bot_dif = (bl - br);

            pi2_top[col] = top_sum + bot_sum;
            pi2_top[col + 16] = top_dif + bot_dif;
            pi2_bot[col] = top_sum - bot_sum;
            pi2_bot[col + 16] = top_dif - bot_dif;
        }
    }
}
318
319 /**
320 *******************************************************************************
321 *
322 * @brief
323 * Compute Hadamard sad for 4x4 block with 8-bit input
324 *
325 * @par Description:
326 *
327 * @param[in] pu1_origin
328 * UWORD8 pointer to the current block
329 *
330 * @param[in] src_strd
331 * WORD32 Source stride
332 *
333 * @param[in] pu1_pred_buf
334 * UWORD8 pointer to the prediction block
335 *
336 * @param[in] pred_strd
337 * WORD32 Pred stride
338 *
339 * @param[in] pi2_dst
340 * WORD16 pointer to the transform block
341 *
342 * @param[in] dst_strd
343 * WORD32 Destination stride
344 *
345 * @param[in] size
346 * WORD32 transform Block size
347 *
348 * @returns hadamard SAD
349 *
350 * @remarks
351 * Not updating the transform destination now. Only returning the SATD
352 *
353 *******************************************************************************
354 */
ihevce_HAD_4x4_8bit(UWORD8 * pu1_origin,WORD32 src_strd,UWORD8 * pu1_pred_buf,WORD32 pred_strd,WORD16 * pi2_dst,WORD32 dst_strd)355 UWORD32 ihevce_HAD_4x4_8bit(
356 UWORD8 *pu1_origin,
357 WORD32 src_strd,
358 UWORD8 *pu1_pred_buf,
359 WORD32 pred_strd,
360 WORD16 *pi2_dst,
361 WORD32 dst_strd)
362 {
363 WORD32 k;
364 WORD16 v[16];
365 UWORD32 u4_sad = 0;
366
367 (void)pi2_dst;
368 (void)dst_strd;
369 ihevce_hadamard_4x4_8bit(pu1_origin, src_strd, pu1_pred_buf, pred_strd, v, 4);
370
371 for(k = 0; k < 16; ++k)
372 u4_sad += abs(v[k]);
373 u4_sad = ((u4_sad + 2) >> 2);
374
375 return u4_sad;
376 }
377
378 /**
379 *******************************************************************************
380 *
381 * @brief
382 * Computes Hadamard Sad for 8x8 block with 8-bit input
383 *
384 * @par Description:
385 *
386 * @param[in] pu1_origin
387 * UWORD8 pointer to the current block
388 *
389 * @param[in] src_strd
390 * WORD32 Source stride
391 *
392 * @param[in] pu1_pred_buf
393 * UWORD8 pointer to the prediction block
394 *
395 * @param[in] pred_strd
396 * WORD32 Pred stride
397 *
398 * @param[in] pi2_dst
399 * WORD16 pointer to the transform block
400 *
401 * @param[in] dst_strd
402 * WORD32 Destination stride
403 *
404 * @param[in] size
405 * WORD32 transform Block size
406 *
407 * @returns Hadamard SAD
408 *
409 * @remarks
410 * Not updating the transform destination now. Only returning the SATD
411 *
412 *******************************************************************************
413 */
ihevce_HAD_8x8_8bit(UWORD8 * pu1_origin,WORD32 src_strd,UWORD8 * pu1_pred_buf,WORD32 pred_strd,WORD16 * pi2_dst,WORD32 dst_strd)414 UWORD32 ihevce_HAD_8x8_8bit(
415 UWORD8 *pu1_origin,
416 WORD32 src_strd,
417 UWORD8 *pu1_pred_buf,
418 WORD32 pred_strd,
419 WORD16 *pi2_dst,
420 WORD32 dst_strd)
421 {
422 WORD32 k;
423 UWORD32 u4_sad = 0;
424 WORD16 v[64];
425
426 (void)pi2_dst;
427 (void)dst_strd;
428 ihevce_hadamard_8x8_8bit(pu1_origin, src_strd, pu1_pred_buf, pred_strd, v, 8);
429
430 for(k = 0; k < 64; ++k)
431 u4_sad += abs(v[k]);
432 u4_sad = ((u4_sad + 4) >> 3);
433
434 return u4_sad;
435 }
436
437 /**
438 *******************************************************************************
439 *
440 * @brief
441 * Compute dc suppressed hadamard sad for 8x8 block with 8-bit input
442 *
443 * @par Description:
444 *
445 * @param[in] pu1_origin
446 * UWORD8 pointer to the current block
447 *
448 * @param[in] src_strd
449 * WORD32 Source stride
450 *
451 * @param[in] pu1_pred_buf
452 * UWORD8 pointer to the prediction block
453 *
454 * @param[in] pred_strd
455 * WORD32 Pred stride
456 *
457 * @param[in] pi2_dst
458 * WORD16 pointer to the transform block
459 *
460 * @param[in] dst_strd
461 * WORD32 Destination stride
462 *
463 * @param[in] size
464 * WORD32 transform Block size
465 *
466 * @returns Hadamard SAD with DC Suppressed
467 *
468 * @remarks
469 * Not updating the transform destination now. Only returning the SATD
470 *
471 *******************************************************************************
472 */
ihevce_compute_ac_had_8x8_8bit(UWORD8 * pu1_origin,WORD32 src_strd,UWORD8 * pu1_pred_buf,WORD32 pred_strd,WORD16 * pi2_dst,WORD32 dst_strd)473 UWORD32 ihevce_compute_ac_had_8x8_8bit(
474 UWORD8 *pu1_origin,
475 WORD32 src_strd,
476 UWORD8 *pu1_pred_buf,
477 WORD32 pred_strd,
478 WORD16 *pi2_dst,
479 WORD32 dst_strd)
480 {
481 WORD32 k;
482 UWORD32 u4_sad = 0;
483 WORD16 v[64];
484
485 (void)pi2_dst;
486 (void)dst_strd;
487 ihevce_hadamard_8x8_8bit(pu1_origin, src_strd, pu1_pred_buf, pred_strd, v, 8);
488
489 v[0] = 0;
490 for(k = 0; k < 64; ++k)
491 u4_sad += abs(v[k]);
492 u4_sad = ((u4_sad + 4) >> 3);
493
494 return u4_sad;
495 }
496
497 /**
498 *******************************************************************************
499 *
500 * @brief
501 * Computes Hadamard Sad for 16x16 block with 8-bit input
502 *
503 * @par Description:
504 *
505 * @param[in] pu1_origin
506 * UWORD8 pointer to the current block
507 *
508 * @param[in] src_strd
509 * WORD32 Source stride
510 *
511 * @param[in] pu1_pred_buf
512 * UWORD8 pointer to the prediction block
513 *
514 * @param[in] pred_strd
515 * WORD32 Pred stride
516 *
517 * @param[in] pi2_dst
518 * WORD16 pointer to the transform block
519 *
520 * @param[in] dst_strd
521 * WORD32 Destination stride
522 *
523 * @param[in] size
524 * WORD32 transform Block size
525 *
526 * @returns Hadamard SAD
527 *
528 * @remarks
529 * Not updating the transform destination now. Only returning the SATD
530 *
531 *******************************************************************************
532 */
ihevce_HAD_16x16_8bit(UWORD8 * pu1_origin,WORD32 src_strd,UWORD8 * pu1_pred_buf,WORD32 pred_strd,WORD16 * pi2_dst,WORD32 dst_strd)533 UWORD32 ihevce_HAD_16x16_8bit(
534 UWORD8 *pu1_origin,
535 WORD32 src_strd,
536 UWORD8 *pu1_pred_buf,
537 WORD32 pred_strd,
538 WORD16 *pi2_dst,
539 WORD32 dst_strd)
540 {
541 WORD32 k;
542 UWORD32 u4_sad = 0;
543 WORD16 v[256];
544
545 (void)pi2_dst;
546 (void)dst_strd;
547 ihevce_hadamard_16x16_8bit(pu1_origin, src_strd, pu1_pred_buf, pred_strd, v, 16);
548
549 for(k = 0; k < 256; ++k)
550 u4_sad += abs(v[k]);
551 u4_sad = ((u4_sad + 4) >> 3);
552
553 return u4_sad;
554 }
555
556 /**
557 *******************************************************************************
558 *
559 * @brief
560 * Computes Hadamard Sad for 32x32 block with 8-bit input
561 *
562 * @par Description:
563 *
564 * @param[in] pu1_origin
565 * UWORD8 pointer to the current block
566 *
567 * @param[in] src_strd
568 * WORD32 Source stride
569 *
570 * @param[in] pu1_pred_buf
571 * UWORD8 pointer to the prediction block
572 *
573 * @param[in] pred_strd
574 * WORD32 Pred stride
575 *
576 * @param[in] pi2_dst
577 * WORD16 pointer to the transform block
578 *
579 * @param[in] dst_strd
580 * WORD32 Destination stride
581 *
582 * @param[in] size
583 * WORD32 transform Block size
584 *
585 * @returns Hadamard SAD
586 *
587 * @remarks
588 * Not updating the transform destination now. Only returning the SATD
589 *
590 *******************************************************************************
591 */
ihevce_HAD_32x32_8bit(UWORD8 * pu1_origin,WORD32 src_strd,UWORD8 * pu1_pred_buf,WORD32 pred_strd,WORD16 * pi2_dst,WORD32 dst_strd)592 UWORD32 ihevce_HAD_32x32_8bit(
593 UWORD8 *pu1_origin,
594 WORD32 src_strd,
595 UWORD8 *pu1_pred_buf,
596 WORD32 pred_strd,
597 WORD16 *pi2_dst,
598 WORD32 dst_strd)
599 {
600 WORD32 k;
601 UWORD32 u4_sad = 0;
602 WORD16 v[32 * 32];
603
604 (void)pi2_dst;
605 (void)dst_strd;
606 ihevce_hadamard_32x32_8bit(pu1_origin, src_strd, pu1_pred_buf, pred_strd, v, 32);
607
608 for(k = 0; k < 32 * 32; ++k)
609 u4_sad += abs(v[k]);
610 u4_sad = ((u4_sad + 2) >> 2);
611
612 return u4_sad;
613 }
614
615 //#if COMPUTE_16x16_R == C
616 /**
617 *******************************************************************************
618 *
619 * @brief
620 * Computes 8x8 transform using children 4x4 hadamard results
621 *
622 * @par Description:
623 *
624 * @param[in] pi2_4x4_had
625 * WORD16 pointer to 4x4 hadamard buffer(y0, y1, y2, y3 hadmard in Zscan order)
626 *
627 * @param[in] had4_strd
628 * stride of 4x4 hadmard buffer pi2_y0, pi2_y1, pi2_y2, pi2_y3
629 *
630 * @param[out] pi2_dst
631 * destination buffer where 8x8 hadamard result is stored
632 *
633 * @param[in] dst_stride
634 * stride of destination block
635 *
636 * @param[in] i4_frm_qstep
637 * frm_qstep value based on the which the threshold value is calculated
638 *
639 * @returns
640 * 8x8 Hadamard SATD
641 * @remarks
642 *
643 *******************************************************************************
644 */
ihevce_compute_8x8HAD_using_4x4(WORD16 * pi2_4x4_had,WORD32 had4_strd,WORD16 * pi2_dst,WORD32 dst_strd,WORD32 i4_frm_qstep,WORD32 * pi4_cbf)645 static UWORD32 ihevce_compute_8x8HAD_using_4x4(
646 WORD16 *pi2_4x4_had,
647 WORD32 had4_strd,
648 WORD16 *pi2_dst,
649 WORD32 dst_strd,
650 WORD32 i4_frm_qstep,
651 WORD32 *pi4_cbf)
652 {
653 /* Qstep value is right shifted by 8 */
654 WORD32 threshold = (i4_frm_qstep >> 8);
655
656 /* Initialize pointers to 4 subblocks of 4x4 HAD buffer */
657 WORD16 *pi2_y0 = pi2_4x4_had;
658 WORD16 *pi2_y1 = pi2_4x4_had + 4;
659 WORD16 *pi2_y2 = pi2_4x4_had + had4_strd * 4;
660 WORD16 *pi2_y3 = pi2_4x4_had + had4_strd * 4 + 4;
661
662 /* Initialize pointers to store 8x8 HAD output */
663 WORD16 *pi2_dst0 = pi2_dst;
664 WORD16 *pi2_dst1 = pi2_dst + 4;
665 WORD16 *pi2_dst2 = pi2_dst + dst_strd * 4;
666 WORD16 *pi2_dst3 = pi2_dst + dst_strd * 4 + 4;
667
668 UWORD32 u4_satd = 0;
669 WORD32 i;
670
671 /* Child HAD results combined as follows to get Parent result */
672 /* _ _ */
673 /* | (y0 + y1) + (y2 + y3) (y0 - y1) + (y2 - y3) | */
674 /* | (y0 + y1) - (y2 + y3) (y0 - y1) - (y2 - y3) | */
675 /* \- -/ */
676 for(i = 0; i < 16; i++)
677 {
678 WORD32 src_idx = (i >> 2) * had4_strd + (i % 4);
679 WORD32 dst_idx = (i >> 2) * dst_strd + (i % 4);
680
681 WORD16 a0 = pi2_y0[src_idx];
682 WORD16 a1 = pi2_y1[src_idx];
683 WORD16 a2 = pi2_y2[src_idx];
684 WORD16 a3 = pi2_y3[src_idx];
685
686 WORD16 b0 = (a0 + a1);
687 WORD16 b1 = (a0 - a1);
688 WORD16 b2 = (a2 + a3);
689 WORD16 b3 = (a2 - a3);
690
691 pi2_dst0[dst_idx] = b0 + b2;
692 pi2_dst1[dst_idx] = b1 + b3;
693 pi2_dst2[dst_idx] = b0 - b2;
694 pi2_dst3[dst_idx] = b1 - b3;
695
696 if(ABS(pi2_dst0[dst_idx]) > threshold)
697 *pi4_cbf = 1;
698 if(ABS(pi2_dst1[dst_idx]) > threshold)
699 *pi4_cbf = 1;
700 if(ABS(pi2_dst2[dst_idx]) > threshold)
701 *pi4_cbf = 1;
702 if(ABS(pi2_dst3[dst_idx]) > threshold)
703 *pi4_cbf = 1;
704
705 u4_satd += ABS(pi2_dst0[dst_idx]);
706 u4_satd += ABS(pi2_dst1[dst_idx]);
707 u4_satd += ABS(pi2_dst2[dst_idx]);
708 u4_satd += ABS(pi2_dst3[dst_idx]);
709 }
710
711 /* return the 8x8 satd */
712 return (u4_satd);
713 }
714
715 /**
716 *******************************************************************************
717 *
718 * @brief
719 * Computes Residue and Hadamard Transform for four 4x4 blocks (Z scan) of
720 * a 8x8 block (Residue is computed for 8-bit src and prediction buffers)
721 * Modified to incorporate the dead-zone implementation - Lokesh
722 *
723 * @par Description:
724 *
725 * @param[in] pu1_origin
726 * UWORD8 pointer to the current block
727 *
728 * @param[in] src_strd
729 * WORD32 Source stride
730 *
731 * @param[in] pu1_pred
732 * UWORD8 pointer to the prediction block
733 *
734 * @param[in] pred_strd
735 * WORD32 Pred stride
736 *
737 * @param[out] pi2_dst
738 * WORD16 pointer to the transform block
739 *
740 * @param[in] dst_strd
741 * WORD32 Destination stride
742 *
743 * @param[out] pi4_hsad
744 * array for storing hadmard sad of each 4x4 block
745 *
746 * @param[in] hsad_stride
747 * stride of hadmard sad destination buffer (for Zscan order of storing sads)
748 *
749 * @param[in] i4_frm_qstep
750 * frm_qstep value based on the which the threshold value is calculated
751 *
752 * @returns
753 *
754 * @remarks
755 *
756 *******************************************************************************
757 */
ihevce_had4_4x4(UWORD8 * pu1_src,WORD32 src_strd,UWORD8 * pu1_pred,WORD32 pred_strd,WORD16 * pi2_dst4x4,WORD32 dst_strd,WORD32 * pi4_hsad,WORD32 hsad_stride,WORD32 i4_frm_qstep)758 static WORD32 ihevce_had4_4x4(
759 UWORD8 *pu1_src,
760 WORD32 src_strd,
761 UWORD8 *pu1_pred,
762 WORD32 pred_strd,
763 WORD16 *pi2_dst4x4,
764 WORD32 dst_strd,
765 WORD32 *pi4_hsad,
766 WORD32 hsad_stride,
767 WORD32 i4_frm_qstep)
768 {
769 WORD32 i, k;
770 WORD32 i4_child_total_sad = 0;
771
772 (void)i4_frm_qstep;
773 /* -------- Compute four 4x4 HAD Transforms ---------*/
774 for(i = 0; i < 4; i++)
775 {
776 UWORD8 *pu1_pi0, *pu1_pi1;
777 WORD16 *pi2_dst;
778 WORD32 blkx, blky;
779 UWORD32 u4_hsad = 0;
780 // TODO: choose deadzone as f(qstep)
781 WORD32 threshold = 0;
782
783 /*****************************************************/
784 /* Assuming the looping structure of the four */
785 /* blocks is in Z scan order of 4x4s in a 8x8 */
786 /* block instead of raster scan */
787 /*****************************************************/
788 blkx = (i & 0x1);
789 blky = (i >> 1);
790
791 pu1_pi0 = pu1_src + (blkx * 4) + (blky * 4 * src_strd);
792 pu1_pi1 = pu1_pred + (blkx * 4) + (blky * 4 * pred_strd);
793 pi2_dst = pi2_dst4x4 + (blkx * 4) + (blky * 4 * dst_strd);
794
795 ihevce_hadamard_4x4_8bit(pu1_pi0, src_strd, pu1_pi1, pred_strd, pi2_dst, dst_strd);
796
797 for(k = 0; k < 4; k++)
798 {
799 if(ABS(pi2_dst[0 * dst_strd + k]) < threshold)
800 pi2_dst[0 * dst_strd + k] = 0;
801
802 if(ABS(pi2_dst[1 * dst_strd + k]) < threshold)
803 pi2_dst[1 * dst_strd + k] = 0;
804
805 if(ABS(pi2_dst[2 * dst_strd + k]) < threshold)
806 pi2_dst[2 * dst_strd + k] = 0;
807
808 if(ABS(pi2_dst[3 * dst_strd + k]) < threshold)
809 pi2_dst[3 * dst_strd + k] = 0;
810
811 /* Accumulate the SATD */
812 u4_hsad += ABS(pi2_dst[0 * dst_strd + k]);
813 u4_hsad += ABS(pi2_dst[1 * dst_strd + k]);
814 u4_hsad += ABS(pi2_dst[2 * dst_strd + k]);
815 u4_hsad += ABS(pi2_dst[3 * dst_strd + k]);
816 }
817
818 /*===== Normalize the HSAD =====*/
819 pi4_hsad[blkx + (blky * hsad_stride)] = ((u4_hsad + 2) >> 2);
820 i4_child_total_sad += ((u4_hsad + 2) >> 2);
821 }
822 return i4_child_total_sad;
823 }
824
825 /**
826 *******************************************************************************
827 *
828 * @brief
829 * HSAD is returned for the 4, 4x4 in 8x8
830 *
831 * @par Description:
832 *
833 * @param[in] pu1_origin
834 * UWORD8 pointer to the current block
835 *
836 * @param[in] src_strd
837 * WORD32 Source stride
838 *
839 * @param[in] pu1_pred
840 * UWORD8 pointer to the prediction block
841 *
842 * @param[in] pred_strd
843 * WORD32 Pred stride
844 *
845 * @param[out] pi2_dst
846 * WORD16 pointer to the transform output block
847 *
848 * @param[out] dst_strd
849 * WORD32 Destination stride
850 *
851 * @param[out] ppi4_hsad
852 * pointer to base pointers for storing hadmard sads of various
853 * block sizes (4x4 to 32x32)
854 *
855 * @param[in] pos_x_y_4x4
856 * Denotes packed x,y postion of current 4x4 block w.r.t to start of ctb/CU/MB
857 * Lower 16bits denote xpos and upper 16ypos of the 4x4block
858 *
859 * @param[in] num_4x4_in_row
860 * Denotes the number of current 4x4 blocks in a ctb/CU/MB
861 *
862 * @returns
863 *
864 * @remarks
865 *
866 *******************************************************************************
867 */
ihevce_had_8x8_using_4_4x4(UWORD8 * pu1_src,WORD32 src_strd,UWORD8 * pu1_pred,WORD32 pred_strd,WORD16 * pi2_dst,WORD32 dst_strd,WORD32 ** ppi4_hsad,WORD32 pos_x_y_4x4,WORD32 num_4x4_in_row)868 void ihevce_had_8x8_using_4_4x4(
869 UWORD8 *pu1_src,
870 WORD32 src_strd,
871 UWORD8 *pu1_pred,
872 WORD32 pred_strd,
873 WORD16 *pi2_dst,
874 WORD32 dst_strd,
875 WORD32 **ppi4_hsad,
876 WORD32 pos_x_y_4x4,
877 WORD32 num_4x4_in_row)
878 {
879 WORD16 ai2_4x4_had[64];
880 WORD32 pos_x = pos_x_y_4x4 & 0xFFFF;
881 WORD32 pos_y = (pos_x_y_4x4 >> 16) & 0xFFFF;
882 WORD32 *pi4_4x4_hsad;
883 WORD32 *pi4_8x8_hsad;
884
885 (void)pi2_dst;
886 (void)dst_strd;
887 ASSERT(pos_x >= 0);
888 ASSERT(pos_y >= 0);
889
890 /* Initialize pointers to store 4x4 and 8x8 HAD SATDs */
891 pi4_4x4_hsad = ppi4_hsad[HAD_4x4] + pos_x + pos_y * num_4x4_in_row;
892 pi4_8x8_hsad = ppi4_hsad[HAD_8x8] + (pos_x >> 1) + (pos_y >> 1) * (num_4x4_in_row >> 1);
893
894 /* -------- Compute four 4x4 HAD Transforms of 8x8 in one call--------- */
895 pi4_8x8_hsad[0] = ihevce_had4_4x4(
896 pu1_src, src_strd, pu1_pred, pred_strd, ai2_4x4_had, 8, pi4_4x4_hsad, num_4x4_in_row, 0);
897 }
898
899 /**
900 *******************************************************************************
901 *
902 * @brief
903 * Reursive Hadamard Transform for 8x8 block. HSAD is returned for the 8x8
904 * block and its four subblocks(4x4).
905 *
906 * @par Description:
907 *
908 * @param[in] pu1_origin
909 * UWORD8 pointer to the current block
910 *
911 * @param[in] src_strd
912 * WORD32 Source stride
913 *
914 * @param[in] pu1_pred
915 * UWORD8 pointer to the prediction block
916 *
917 * @param[in] pred_strd
918 * WORD32 Pred stride
919 *
920 * @param[out] pi2_dst
921 * WORD16 pointer to the transform output block
922 *
923 * @param[out] dst_strd
924 * WORD32 Destination stride
925 *
926 * @param[out] ppi4_hsad
927 * pointer to base pointers for storing hadmard sads of various
928 * block sizes (4x4 to 32x32)
929 *
930 * @param[in] pos_x_y_4x4
931 * Denotes packed x,y postion of current 4x4 block w.r.t to start of ctb/CU/MB
932 * Lower 16bits denote xpos and upper 16ypos of the 4x4block
933 *
934 * @param[in] num_4x4_in_row
935 * Denotes the number of current 4x4 blocks in a ctb/CU/MB
936 *
937 * @param[in] i4_frm_qstep
938 * frm_qstep value based on the which the threshold value is calculated
939 *
940 * @returns
941 *
942 * @remarks
943 *
944 *******************************************************************************
945 */
ihevce_had_8x8_using_4_4x4_r(UWORD8 * pu1_src,WORD32 src_strd,UWORD8 * pu1_pred,WORD32 pred_strd,WORD16 * pi2_dst,WORD32 dst_strd,WORD32 ** ppi4_hsad,WORD32 ** ppi4_tu_split,WORD32 ** ppi4_tu_early_cbf,WORD32 pos_x_y_4x4,WORD32 num_4x4_in_row,WORD32 lambda,WORD32 lambda_q_shift,WORD32 i4_frm_qstep,WORD32 i4_cur_depth,WORD32 i4_max_depth,WORD32 i4_max_tr_size,WORD32 * pi4_tu_split_cost,void * pv_func_sel)946 WORD32 ihevce_had_8x8_using_4_4x4_r(
947 UWORD8 *pu1_src,
948 WORD32 src_strd,
949 UWORD8 *pu1_pred,
950 WORD32 pred_strd,
951 WORD16 *pi2_dst,
952 WORD32 dst_strd,
953 WORD32 **ppi4_hsad,
954 WORD32 **ppi4_tu_split,
955 WORD32 **ppi4_tu_early_cbf,
956 WORD32 pos_x_y_4x4,
957 WORD32 num_4x4_in_row,
958 WORD32 lambda,
959 WORD32 lambda_q_shift,
960 WORD32 i4_frm_qstep,
961 WORD32 i4_cur_depth,
962 WORD32 i4_max_depth,
963 WORD32 i4_max_tr_size,
964 WORD32 *pi4_tu_split_cost,
965 void *pv_func_sel)
966 {
967 WORD16 ai2_4x4_had[64];
968 WORD32 pos_x = pos_x_y_4x4 & 0xFFFF;
969 WORD32 pos_y = (pos_x_y_4x4 >> 16) & 0xFFFF;
970 WORD32 *pi4_4x4_hsad;
971 WORD32 *pi4_8x8_hsad;
972 WORD32 *pi4_8x8_tu_split;
973
974 WORD32 *pi4_8x8_tu_early_cbf;
975
976 UWORD32 u4_satd;
977 WORD32 cost_child = 0, cost_parent = 0;
978 WORD32 early_cbf = 0;
979
980 const UWORD8 u1_cur_tr_size = 8;
981 /* Stores the best cost for the Current 8x8: Lokesh */
982 WORD32 best_cost = 0;
983
984 (void)pv_func_sel;
985 ASSERT(pos_x >= 0);
986 ASSERT(pos_y >= 0);
987
988 /* Initialize pointers to store 4x4 and 8x8 HAD SATDs */
989 pi4_4x4_hsad = ppi4_hsad[HAD_4x4] + pos_x + pos_y * num_4x4_in_row;
990 pi4_8x8_hsad = ppi4_hsad[HAD_8x8] + (pos_x >> 1) + (pos_y >> 1) * (num_4x4_in_row >> 1);
991 pi4_8x8_tu_split = ppi4_tu_split[HAD_8x8] + (pos_x >> 1) + (pos_y >> 1) * (num_4x4_in_row >> 1);
992 pi4_8x8_tu_early_cbf =
993 ppi4_tu_early_cbf[HAD_8x8] + (pos_x >> 1) + (pos_y >> 1) * (num_4x4_in_row >> 1);
994
995 /* -------- Compute four 4x4 HAD Transforms of 8x8 in one call--------- */
996 cost_child = ihevce_had4_4x4(
997 pu1_src, src_strd, pu1_pred, pred_strd, ai2_4x4_had, 8, pi4_4x4_hsad, num_4x4_in_row, 0);
998
999 /* -------- Compute 8x8 HAD Transform using 4x4 results ------------- */
1000 u4_satd = ihevce_compute_8x8HAD_using_4x4(
1001 ai2_4x4_had, 8, pi2_dst, dst_strd, i4_frm_qstep, &early_cbf);
1002
1003 /* store the normalized 8x8 satd */
1004 cost_parent = ((u4_satd + 4) >> 3);
1005
1006 /* 4 CBF Flags, extra 1 becoz of the 0.5 bits per bin is assumed */
1007 cost_child += ((4) * lambda) >> (lambda_q_shift + 1);
1008
1009 if(i4_cur_depth < i4_max_depth)
1010 {
1011 if((cost_child < cost_parent) || (i4_max_tr_size < u1_cur_tr_size))
1012 {
1013 //cost_child -= ((4) * lambda) >> (lambda_q_shift + 1);
1014 *pi4_tu_split_cost += (4 * lambda) >> (lambda_q_shift + 1);
1015 best_cost = cost_child;
1016 best_cost <<= 1;
1017 best_cost++;
1018 pi4_8x8_tu_split[0] = 1;
1019 pi4_8x8_hsad[0] = cost_child;
1020 }
1021 else
1022 {
1023 //cost_parent -= ((1) * lambda) >> (lambda_q_shift + 1);
1024 best_cost = cost_parent;
1025 best_cost <<= 1;
1026 pi4_8x8_tu_split[0] = 0;
1027 pi4_8x8_hsad[0] = cost_parent;
1028 }
1029 }
1030 else
1031 {
1032 //cost_parent -= ((1) * lambda) >> (lambda_q_shift + 1);
1033 best_cost = cost_parent;
1034 best_cost <<= 1;
1035 pi4_8x8_tu_split[0] = 0;
1036 pi4_8x8_hsad[0] = cost_parent;
1037 }
1038
1039 pi4_8x8_tu_early_cbf[0] = early_cbf;
1040
1041 /* best cost has tu_split_flag at LSB(Least significant bit) */
1042 return ((best_cost << 1) + early_cbf);
1043 }
1044
1045 /**
1046 *******************************************************************************
1047 *
1048 * @brief
1049 * Computes 16x16 transform using children 8x8 hadamard results
1050 * Modified to incorporate the dead-zone implementation - Lokesh
1051 *
1052 * @par Description:
1053 *
1054 * @param[in] pi2_8x8_had
1055 * WORD16 pointer to 8x8 hadamard buffer(y0, y1, y2, y3 hadmard in Zscan order)
1056 *
1057 * @param[in] had8_strd
1058 * stride of 8x8 hadmard buffer pi2_y0, pi2_y1, pi2_y2, pi2_y3
1059 *
1060 * @param[out] pi2_dst
1061 * destination buffer where 8x8 hadamard result is stored
1062 *
1063 * @param[in] dst_stride
1064 * stride of destination block
1065 *
1066 * @param[in] i4_frm_qstep
1067 * frm_qstep value based on the which the threshold value is calculated
1068 *
1069 * @returns
1070 * 16x16 Hadamard SATD
1071 * @remarks
1072 *
1073 *******************************************************************************
1074 */
ihevce_compute_16x16HAD_using_8x8(WORD16 * pi2_8x8_had,WORD32 had8_strd,WORD16 * pi2_dst,WORD32 dst_strd,WORD32 i4_frm_qstep,WORD32 * pi4_cbf)1075 static UWORD32 ihevce_compute_16x16HAD_using_8x8(
1076 WORD16 *pi2_8x8_had,
1077 WORD32 had8_strd,
1078 WORD16 *pi2_dst,
1079 WORD32 dst_strd,
1080 WORD32 i4_frm_qstep,
1081 WORD32 *pi4_cbf)
1082 {
1083 /* Qstep value is right shifted by 8 */
1084 WORD32 threshold = (i4_frm_qstep >> 8);
1085
1086 /* Initialize pointers to 4 subblocks of 8x8 HAD buffer */
1087 WORD16 *pi2_y0 = pi2_8x8_had;
1088 WORD16 *pi2_y1 = pi2_8x8_had + 8;
1089 WORD16 *pi2_y2 = pi2_8x8_had + had8_strd * 8;
1090 WORD16 *pi2_y3 = pi2_8x8_had + had8_strd * 8 + 8;
1091
1092 /* Initialize pointers to store 8x8 HAD output */
1093 WORD16 *pi2_dst0 = pi2_dst;
1094 WORD16 *pi2_dst1 = pi2_dst + 8;
1095 WORD16 *pi2_dst2 = pi2_dst + dst_strd * 8;
1096 WORD16 *pi2_dst3 = pi2_dst + dst_strd * 8 + 8;
1097
1098 UWORD32 u4_satd = 0;
1099 WORD32 i;
1100
1101 /* Child HAD results combined as follows to get Parent result */
1102 /* _ _ */
1103 /* | (y0 + y1) + (y2 + y3) (y0 - y1) + (y2 - y3) | */
1104 /* | (y0 + y1) - (y2 + y3) (y0 - y1) - (y2 - y3) | */
1105 /* \- -/ */
1106 for(i = 0; i < 64; i++)
1107 {
1108 WORD32 src_idx = (i >> 3) * had8_strd + (i % 8);
1109 WORD32 dst_idx = (i >> 3) * dst_strd + (i % 8);
1110
1111 WORD16 a0 = pi2_y0[src_idx];
1112 WORD16 a1 = pi2_y1[src_idx];
1113 WORD16 a2 = pi2_y2[src_idx];
1114 WORD16 a3 = pi2_y3[src_idx];
1115
1116 WORD16 b0 = (a0 + a1) >> 1;
1117 WORD16 b1 = (a0 - a1) >> 1;
1118 WORD16 b2 = (a2 + a3) >> 1;
1119 WORD16 b3 = (a2 - a3) >> 1;
1120
1121 pi2_dst0[dst_idx] = b0 + b2;
1122 pi2_dst1[dst_idx] = b1 + b3;
1123 pi2_dst2[dst_idx] = b0 - b2;
1124 pi2_dst3[dst_idx] = b1 - b3;
1125
1126 /* Make the value of dst to zerp, if it falls below the dead-zone */
1127 if(ABS(pi2_dst0[dst_idx]) > threshold)
1128 *pi4_cbf = 1;
1129 if(ABS(pi2_dst1[dst_idx]) > threshold)
1130 *pi4_cbf = 1;
1131 if(ABS(pi2_dst2[dst_idx]) > threshold)
1132 *pi4_cbf = 1;
1133 if(ABS(pi2_dst3[dst_idx]) > threshold)
1134 *pi4_cbf = 1;
1135
1136 u4_satd += ABS(pi2_dst0[dst_idx]);
1137 u4_satd += ABS(pi2_dst1[dst_idx]);
1138 u4_satd += ABS(pi2_dst2[dst_idx]);
1139 u4_satd += ABS(pi2_dst3[dst_idx]);
1140 }
1141
1142 /* return 16x16 satd */
1143 return (u4_satd);
1144 }
1145
1146 /**
1147 *******************************************************************************
1148 *
1149 * @brief
1150 * Hadamard Transform for 16x16 block with 8x8 and 4x4 SATD updates.
1151 * Uses recursive 8x8 had output to compute satd for 16x16 and its children
1152 *
1153 * @par Description:
1154 *
1155 * @param[in] pu1_origin
1156 * UWORD8 pointer to the current block
1157 *
1158 * @param[in] src_strd
1159 * WORD32 Source stride
1160 *
1161 * @param[in] pu1_pred
1162 * UWORD8 pointer to the prediction block
1163 *
1164 * @param[in] pred_strd
1165 * WORD32 Pred stride
1166 *
1167 * @param[out] pi2_dst
1168 * WORD16 pointer to the transform output block
1169 *
1170 * @param[out] dst_strd
1171 * WORD32 Destination stride
1172 *
1173 * @param[out] ppi4_hsad
1174 * pointer to base pointers for storing hadmard sads of various
1175 * block sizes (4x4 to 32x32)
1176 *
1177 * @param[in] pos_x_y_4x4
1178 * Denotes packed x,y postion of current 4x4 block w.r.t to start of ctb/CU/MB
1179 * Lower 16bits denote xpos and upper 16ypos of the 4x4block
1180 *
1181 * @param[in] num_4x4_in_row
1182 * Denotes the number of current 4x4 blocks in a ctb/CU/MB
1183 *
1184 * @param[in] lambda
1185 * lambda values is the cost factor calculated based on QP
1186 *
1187 * @param[in] lambda_q_shift
1188 * lambda_q_shift used to reverse the lambda value back from q8 format
1189 *
1190 * @param[in] depth
1191 * depth gives the current TU depth with respect to the CU
1192 *
1193 * @param[in] i4_frm_qstep
1194 * frm_qstep value based on the which the threshold value is calculated
1195 *
1196 * @returns
1197 *
1198 * @remarks
1199 *
1200 *******************************************************************************
1201 */
1202
ihevce_had_16x16_r(UWORD8 * pu1_src,WORD32 src_strd,UWORD8 * pu1_pred,WORD32 pred_strd,WORD16 * pi2_dst,WORD32 dst_strd,WORD32 ** ppi4_hsad,WORD32 ** ppi4_tu_split,WORD32 ** ppi4_tu_early_cbf,WORD32 pos_x_y_4x4,WORD32 num_4x4_in_row,WORD32 lambda,WORD32 lambda_q_shift,WORD32 i4_frm_qstep,WORD32 i4_cur_depth,WORD32 i4_max_depth,WORD32 i4_max_tr_size,WORD32 * pi4_tu_split_cost,void * pv_func_sel)1203 WORD32 ihevce_had_16x16_r(
1204 UWORD8 *pu1_src,
1205 WORD32 src_strd,
1206 UWORD8 *pu1_pred,
1207 WORD32 pred_strd,
1208 WORD16 *pi2_dst,
1209 WORD32 dst_strd,
1210 WORD32 **ppi4_hsad,
1211 WORD32 **ppi4_tu_split,
1212 WORD32 **ppi4_tu_early_cbf,
1213 WORD32 pos_x_y_4x4,
1214 WORD32 num_4x4_in_row,
1215 WORD32 lambda,
1216 WORD32 lambda_q_shift,
1217 WORD32 i4_frm_qstep,
1218 WORD32 i4_cur_depth,
1219 WORD32 i4_max_depth,
1220 WORD32 i4_max_tr_size,
1221 WORD32 *pi4_tu_split_cost,
1222 void *pv_func_sel)
1223 {
1224 WORD16 ai2_8x8_had[256];
1225 WORD32 *pi4_16x16_hsad;
1226 WORD32 *pi4_16x16_tu_split;
1227
1228 WORD32 *pi4_16x16_tu_early_cbf;
1229
1230 UWORD32 u4_satd = 0;
1231 WORD32 tu_split_flag = 0;
1232 WORD32 i4_early_cbf_flag = 0, early_cbf = 0;
1233 const UWORD8 u1_cur_tr_size = 16;
1234
1235 /* cost_parent : Stores the cost of the parent HAD transform (16x16) */
1236 /* cost_child : Stores the cost of the child HAD transform (16x16) */
1237 WORD32 cost_parent = 0, cost_child = 0;
1238
1239 /*best_cost returns the best cost at the end of the function */
1240 /*tu_split denoes whether the TU (16x16)is split or not */
1241 WORD32 best_cost = 0, best_cost_tu_split;
1242 WORD32 i;
1243
1244 WORD16 *pi2_y0;
1245 UWORD8 *pu1_src0;
1246 UWORD8 *pu1_pred0;
1247 WORD32 pos_x_y_4x4_0;
1248
1249 WORD32 pos_x = pos_x_y_4x4 & 0xFFFF;
1250 WORD32 pos_y = (pos_x_y_4x4 >> 16) & 0xFFFF;
1251
1252 ASSERT(pos_x >= 0);
1253 ASSERT(pos_y >= 0);
1254
1255 /* Initialize pointers to store 16x16 SATDs */
1256 pi4_16x16_hsad = ppi4_hsad[HAD_16x16] + (pos_x >> 2) + (pos_y >> 2) * (num_4x4_in_row >> 2);
1257
1258 pi4_16x16_tu_split =
1259 ppi4_tu_split[HAD_16x16] + (pos_x >> 2) + (pos_y >> 2) * (num_4x4_in_row >> 2);
1260
1261 pi4_16x16_tu_early_cbf =
1262 ppi4_tu_early_cbf[HAD_16x16] + (pos_x >> 2) + (pos_y >> 2) * (num_4x4_in_row >> 2);
1263
1264 /* -------- Compute four 8x8 HAD Transforms of 16x16 call--------- */
1265 for(i = 0; i < 4; i++)
1266 {
1267 pu1_src0 = pu1_src + (i & 0x01) * 8 + (i >> 1) * src_strd * 8;
1268 pu1_pred0 = pu1_pred + (i & 0x01) * 8 + (i >> 1) * pred_strd * 8;
1269 pi2_y0 = ai2_8x8_had + (i & 0x01) * 8 + (i >> 1) * 16 * 8;
1270 pos_x_y_4x4_0 = pos_x_y_4x4 + (i & 0x01) * 2 + (i >> 1) * (2 << 16);
1271
1272 best_cost_tu_split = ihevce_had_8x8_using_4_4x4_r(
1273 pu1_src0,
1274 src_strd,
1275 pu1_pred0,
1276 pred_strd,
1277 pi2_y0,
1278 16,
1279 ppi4_hsad,
1280 ppi4_tu_split,
1281 ppi4_tu_early_cbf,
1282 pos_x_y_4x4_0,
1283 num_4x4_in_row,
1284 lambda,
1285 lambda_q_shift,
1286 i4_frm_qstep,
1287 i4_cur_depth + 1,
1288 i4_max_depth,
1289 i4_max_tr_size,
1290 pi4_tu_split_cost,
1291 pv_func_sel);
1292
1293 /* Cost is shifted by two bits for Tu_split_flag and early cbf flag */
1294 best_cost = (best_cost_tu_split >> 2);
1295
1296 /* Last but one bit stores the information regarding the TU_Split */
1297 tu_split_flag += (best_cost_tu_split & 0x3) >> 1;
1298
1299 /* Last bit stores the information regarding the early_cbf */
1300 i4_early_cbf_flag += (best_cost_tu_split & 0x1);
1301
1302 cost_child += best_cost;
1303
1304 tu_split_flag <<= 1;
1305 i4_early_cbf_flag <<= 1;
1306 }
1307
1308 /* -------- Compute 16x16 HAD Transform using 8x8 results ------------- */
1309 pi2_y0 = ai2_8x8_had;
1310
1311 /* Threshold currently passed as "0" */
1312 u4_satd =
1313 ihevce_compute_16x16HAD_using_8x8(pi2_y0, 16, pi2_dst, dst_strd, i4_frm_qstep, &early_cbf);
1314
1315 /* store the normalized satd */
1316 cost_parent = ((u4_satd + 4) >> 3);
1317
1318 /* 4 TU_Split flags , 4 CBF Flags, extra 1 becoz of the 0.5 bits per bin is assumed */
1319 cost_child += ((4 + 4) * lambda) >> (lambda_q_shift + 1);
1320
1321 i4_early_cbf_flag += early_cbf;
1322
1323 /* Right now the depth is hard-coded to 4: The depth can be modified from the config file
1324 which decides the extent to which TU_REC needs to be done */
1325 if(i4_cur_depth < i4_max_depth)
1326 {
1327 if((cost_child < cost_parent) || (i4_max_tr_size < u1_cur_tr_size))
1328 {
1329 //cost_child -= ((4 + 4) * lambda) >> (lambda_q_shift + 1);
1330 *pi4_tu_split_cost += ((4 + 4) * lambda) >> (lambda_q_shift + 1);
1331 tu_split_flag += 1;
1332 best_cost = cost_child;
1333 }
1334 else
1335 {
1336 //cost_parent -= ((1 + 1) * lambda) >> (lambda_q_shift + 1);
1337 tu_split_flag += 0;
1338 best_cost = cost_parent;
1339 }
1340 }
1341 else
1342 {
1343 //cost_parent -= ((1 + 1) * lambda) >> (lambda_q_shift + 1);
1344 tu_split_flag += 0;
1345 best_cost = cost_parent;
1346 }
1347
1348 pi4_16x16_hsad[0] = best_cost;
1349 pi4_16x16_tu_split[0] = tu_split_flag;
1350 pi4_16x16_tu_early_cbf[0] = i4_early_cbf_flag;
1351
1352 /*returning two values(best cost & tu_split_flag) as a single value*/
1353 return ((best_cost << 10) + (tu_split_flag << 5) + i4_early_cbf_flag);
1354 }
1355
1356 //#endif
1357 /**
1358 *******************************************************************************
1359 *
1360 * @brief
1361 * Computes 32x32 transform using children 16x16 hadamard results
1362 *
1363 * @par Description:
1364 *
1365 * @param[in] pi2_16x16_had
1366 * WORD16 pointer to 16x16 hadamard buffer(y0, y1, y2, y3 hadmard in Zscan order)
1367 *
1368 * @param[in] had16_strd
1369 * stride of 16x16 hadmard buffer pi2_y0, pi2_y1, pi2_y2, pi2_y3
1370 *
1371 * @param[out] pi2_dst
1372 * destination buffer where 16x16 hadamard result is stored
1373 *
1374 * @param[in] dst_stride
1375 * stride of destination block
1376 *
1377 * @param[in] i4_frm_qstep
1378 * frm_qstep value based on the which the threshold value is calculated
1379 *
1380 * @returns
1381 * 32x32 Hadamard SATD
1382 * @remarks
1383 *
1384 *******************************************************************************
1385 */
1386 //#if COMPUTE_32x32_USING_16X16 == C
ihevce_compute_32x32HAD_using_16x16(WORD16 * pi2_16x16_had,WORD32 had16_strd,WORD16 * pi2_dst,WORD32 dst_strd,WORD32 i4_frm_qstep,WORD32 * pi4_cbf)1387 UWORD32 ihevce_compute_32x32HAD_using_16x16(
1388 WORD16 *pi2_16x16_had,
1389 WORD32 had16_strd,
1390 WORD16 *pi2_dst,
1391 WORD32 dst_strd,
1392 WORD32 i4_frm_qstep,
1393 WORD32 *pi4_cbf)
1394 {
1395 /* Qstep value is right shifted by 8 */
1396 WORD32 threshold = (i4_frm_qstep >> 8);
1397
1398 /* Initialize pointers to 4 subblocks of 8x8 HAD buffer */
1399 WORD16 *pi2_y0 = pi2_16x16_had;
1400 WORD16 *pi2_y1 = pi2_16x16_had + 16;
1401 WORD16 *pi2_y2 = pi2_16x16_had + had16_strd * 16;
1402 WORD16 *pi2_y3 = pi2_16x16_had + had16_strd * 16 + 16;
1403
1404 /* Initialize pointers to store 8x8 HAD output */
1405 WORD16 *pi2_dst0 = pi2_dst;
1406 WORD16 *pi2_dst1 = pi2_dst + 16;
1407 WORD16 *pi2_dst2 = pi2_dst + dst_strd * 16;
1408 WORD16 *pi2_dst3 = pi2_dst + dst_strd * 16 + 16;
1409
1410 UWORD32 u4_satd = 0;
1411 WORD32 i;
1412
1413 /* Child HAD results combined as follows to get Parent result */
1414 /* _ _ */
1415 /* | (y0 + y1) + (y2 + y3) (y0 - y1) + (y2 - y3) | */
1416 /* | (y0 + y1) - (y2 + y3) (y0 - y1) - (y2 - y3) | */
1417 /* \- -/ */
1418 for(i = 0; i < 256; i++)
1419 {
1420 WORD32 src_idx = (i >> 4) * had16_strd + (i % 16);
1421 WORD32 dst_idx = (i >> 4) * dst_strd + (i % 16);
1422
1423 WORD16 a0 = pi2_y0[src_idx] >> 2;
1424 WORD16 a1 = pi2_y1[src_idx] >> 2;
1425 WORD16 a2 = pi2_y2[src_idx] >> 2;
1426 WORD16 a3 = pi2_y3[src_idx] >> 2;
1427
1428 WORD16 b0 = (a0 + a1);
1429 WORD16 b1 = (a0 - a1);
1430 WORD16 b2 = (a2 + a3);
1431 WORD16 b3 = (a2 - a3);
1432
1433 pi2_dst0[dst_idx] = b0 + b2;
1434 pi2_dst1[dst_idx] = b1 + b3;
1435 pi2_dst2[dst_idx] = b0 - b2;
1436 pi2_dst3[dst_idx] = b1 - b3;
1437
1438 /* Make the value of dst to zerp, if it falls below the dead-zone */
1439 if(ABS(pi2_dst0[dst_idx]) > threshold)
1440 *pi4_cbf = 1;
1441 if(ABS(pi2_dst1[dst_idx]) > threshold)
1442 *pi4_cbf = 1;
1443 if(ABS(pi2_dst2[dst_idx]) > threshold)
1444 *pi4_cbf = 1;
1445 if(ABS(pi2_dst3[dst_idx]) > threshold)
1446 *pi4_cbf = 1;
1447
1448 u4_satd += ABS(pi2_dst0[dst_idx]);
1449 u4_satd += ABS(pi2_dst1[dst_idx]);
1450 u4_satd += ABS(pi2_dst2[dst_idx]);
1451 u4_satd += ABS(pi2_dst3[dst_idx]);
1452 }
1453
1454 /* return 32x32 satd */
1455 return (u4_satd);
1456 }
1457 //#endif
1458
1459 /**
1460 *******************************************************************************
1461 *
1462 * @brief
1463 * Hadamard Transform for 32x32 block with 16x6, 8x8 and 4x4 SATD updates.
1464 * Uses recursive 16x16 had output to compute satd for 32x32 and its children
1465 *
1466 * @par Description:
1467 *
1468 * @param[in] pu1_origin
1469 * UWORD8 pointer to the current block
1470 *
1471 * @param[in] src_strd
1472 * WORD32 Source stride
1473 *
1474 * @param[in] pu1_pred
1475 * UWORD8 pointer to the prediction block
1476 *
1477 * @param[in] pred_strd
1478 * WORD32 Pred stride
1479 *
1480 * @param[out] pi2_dst
1481 * WORD16 pointer to the transform output block
1482 *
1483 * @param[out] dst_strd
1484 * WORD32 Destination stride
1485 *
1486 * @param[out] ppi4_hsad
1487 * pointer to base pointers for storing hadmard sads of various
1488 * block sizes (4x4 to 32x32)
1489 *
1490 * @param[in] pos_x_y_4x4
1491 * Denotes packed x,y postion of current 4x4 block w.r.t to start of ctb/CU/MB
1492 * Lower 16bits denote xpos and upper 16ypos of the 4x4block
1493 *
1494 * @param[in] num_4x4_in_row
1495 * Denotes the number of current 4x4 blocks in a ctb/CU/MB
1496 *
1497 * @param[in] lambda
1498 * lambda values is the cost factor calculated based on QP
1499 *
1500 * @param[in] lambda_q_shift
1501 * lambda_q_shift used to reverse the lambda value back from q8 format
1502 *
1503 * @param[in] depth
1504 * depth gives the current TU depth with respect to the CU
1505 *
1506 * @param[in] i4_frm_qstep
1507 * frm_qstep value based on the which the threshold value is calculated
1508 *
1509 *
1510 * @returns
1511 *
1512 * @remarks
1513 *
1514 *******************************************************************************
1515 */
ihevce_had_32x32_r(UWORD8 * pu1_src,WORD32 src_strd,UWORD8 * pu1_pred,WORD32 pred_strd,WORD16 * pi2_dst,WORD32 dst_strd,WORD32 ** ppi4_hsad,WORD32 ** ppi4_tu_split,WORD32 ** ppi4_tu_early_cbf,WORD32 pos_x_y_4x4,WORD32 num_4x4_in_row,WORD32 lambda,WORD32 lambda_q_shift,WORD32 i4_frm_qstep,WORD32 i4_cur_depth,WORD32 i4_max_depth,WORD32 i4_max_tr_size,WORD32 * pi4_tu_split_cost,me_func_selector_t * ps_func_selector)1516 void ihevce_had_32x32_r(
1517 UWORD8 *pu1_src,
1518 WORD32 src_strd,
1519 UWORD8 *pu1_pred,
1520 WORD32 pred_strd,
1521 WORD16 *pi2_dst,
1522 WORD32 dst_strd,
1523 WORD32 **ppi4_hsad,
1524 WORD32 **ppi4_tu_split,
1525 WORD32 **ppi4_tu_early_cbf,
1526 WORD32 pos_x_y_4x4,
1527 WORD32 num_4x4_in_row,
1528 WORD32 lambda,
1529 WORD32 lambda_q_shift,
1530 WORD32 i4_frm_qstep,
1531 WORD32 i4_cur_depth,
1532 WORD32 i4_max_depth,
1533 WORD32 i4_max_tr_size,
1534 WORD32 *pi4_tu_split_cost,
1535 me_func_selector_t *ps_func_selector)
1536
1537 {
1538 WORD16 ai2_16x16_had[1024];
1539 WORD32 *pi4_32x32_hsad;
1540 WORD32 *pi4_32x32_tu_split;
1541 WORD32 *pi4_32x32_tu_early_cbf;
1542
1543 WORD32 pos_x = pos_x_y_4x4 & 0xFFFF;
1544 WORD32 pos_y = (pos_x_y_4x4 >> 16) & 0xFFFF;
1545 WORD32 tu_split_flag = 0;
1546 const UWORD8 u1_cur_tr_size = 32;
1547 WORD32 i4_early_cbf_flag = 0, early_cbf = 0;
1548
1549 /* cost_parent : Stores the cost of the parent HAD transform (16x16) */
1550 /* cost_child : Stores the cost of the child HAD transform (16x16) */
1551 WORD32 cost_child = 0, cost_parent = 0;
1552
1553 /*retuned as the best cost for the entire TU (32x32) */
1554 WORD32 best_cost = 0;
1555 /*captures the best cost and tu_split at child level */
1556 WORD32 best_cost_tu_split;
1557
1558 /* Initialize pointers to 4 8x8 blocks in 16x16 */
1559 WORD16 *pi2_y0 = ai2_16x16_had;
1560 WORD16 *pi2_y1 = ai2_16x16_had + 16;
1561 WORD16 *pi2_y2 = ai2_16x16_had + 32 * 16;
1562 WORD16 *pi2_y3 = ai2_16x16_had + 32 * 16 + 16;
1563
1564 UWORD8 *pu1_src0 = pu1_src;
1565 UWORD8 *pu1_src1 = pu1_src + 16;
1566 UWORD8 *pu1_src2 = pu1_src + src_strd * 16;
1567 UWORD8 *pu1_src3 = pu1_src + src_strd * 16 + 16;
1568
1569 UWORD8 *pu1_pred0 = pu1_pred;
1570 UWORD8 *pu1_pred1 = pu1_pred + 16;
1571 UWORD8 *pu1_pred2 = pu1_pred + pred_strd * 16;
1572 UWORD8 *pu1_pred3 = pu1_pred + pred_strd * 16 + 16;
1573
1574 ASSERT(pos_x >= 0);
1575 ASSERT(pos_y >= 0);
1576
1577 /* Initialize pointers to store 32x32 SATDs */
1578 pi4_32x32_hsad = ppi4_hsad[HAD_32x32] + (pos_x >> 3) + (pos_y >> 3) * (num_4x4_in_row >> 3);
1579
1580 pi4_32x32_tu_split =
1581 ppi4_tu_split[HAD_32x32] + (pos_x >> 3) + (pos_y >> 3) * (num_4x4_in_row >> 3);
1582
1583 pi4_32x32_tu_early_cbf =
1584 ppi4_tu_early_cbf[HAD_32x32] + (pos_x >> 3) + (pos_y >> 3) * (num_4x4_in_row >> 3);
1585
1586 /* -------- Compute four 8x8 HAD Transforms of 16x16 call--------- */
1587 best_cost_tu_split = ps_func_selector->pf_had_16x16_r(
1588 pu1_src0,
1589 src_strd,
1590 pu1_pred0,
1591 pred_strd,
1592 pi2_y0,
1593 32,
1594 ppi4_hsad,
1595 ppi4_tu_split,
1596 ppi4_tu_early_cbf,
1597 pos_x_y_4x4,
1598 num_4x4_in_row,
1599 lambda,
1600 lambda_q_shift,
1601 i4_frm_qstep,
1602 i4_cur_depth + 1,
1603 i4_max_depth,
1604 i4_max_tr_size,
1605 pi4_tu_split_cost,
1606 NULL);
1607
1608 /* cost is shifted by 10bits */
1609 best_cost = best_cost_tu_split >> 10;
1610
1611 /* Tu split is present in the 6-10 bits */
1612 tu_split_flag += (best_cost_tu_split & 0x3E0) >> 5;
1613
1614 /*Early CBF info is present in the last 5 bits */
1615 i4_early_cbf_flag += best_cost_tu_split & 0x1F;
1616
1617 tu_split_flag <<= 5;
1618 i4_early_cbf_flag <<= 5;
1619
1620 cost_child += best_cost;
1621
1622 best_cost_tu_split = ps_func_selector->pf_had_16x16_r(
1623 pu1_src1,
1624 src_strd,
1625 pu1_pred1,
1626 pred_strd,
1627 pi2_y1,
1628 32,
1629 ppi4_hsad,
1630 ppi4_tu_split,
1631 ppi4_tu_early_cbf,
1632 pos_x_y_4x4 + 4,
1633 num_4x4_in_row,
1634 lambda,
1635 lambda_q_shift,
1636 i4_frm_qstep,
1637 i4_cur_depth + 1,
1638 i4_max_depth,
1639 i4_max_tr_size,
1640 pi4_tu_split_cost,
1641 NULL);
1642
1643 /* cost is shifted by 10bits */
1644 best_cost = best_cost_tu_split >> 10;
1645
1646 /* Tu split is present in the 6-10 bits */
1647 tu_split_flag += (best_cost_tu_split & 0x3E0) >> 5;
1648
1649 /*Early CBF info is present in the last 5 bits */
1650 i4_early_cbf_flag += best_cost_tu_split & 0x1F;
1651
1652 tu_split_flag <<= 5;
1653 i4_early_cbf_flag <<= 5;
1654
1655 cost_child += best_cost;
1656
1657 best_cost_tu_split = ps_func_selector->pf_had_16x16_r(
1658 pu1_src2,
1659 src_strd,
1660 pu1_pred2,
1661 pred_strd,
1662 pi2_y2,
1663 32,
1664 ppi4_hsad,
1665 ppi4_tu_split,
1666 ppi4_tu_early_cbf,
1667 pos_x_y_4x4 + (4 << 16),
1668 num_4x4_in_row,
1669 lambda,
1670 lambda_q_shift,
1671 i4_frm_qstep,
1672 i4_cur_depth + 1,
1673 i4_max_depth,
1674 i4_max_tr_size,
1675 pi4_tu_split_cost,
1676 NULL);
1677
1678 /* cost is shifted by 10bits */
1679 best_cost = best_cost_tu_split >> 10;
1680
1681 /* Tu split is present in the 6-10 bits */
1682 tu_split_flag += (best_cost_tu_split & 0x3E0) >> 5;
1683
1684 /*Early CBF info is present in the last 5 bits */
1685 i4_early_cbf_flag += best_cost_tu_split & 0x1F;
1686
1687 tu_split_flag <<= 5;
1688 i4_early_cbf_flag <<= 5;
1689
1690 cost_child += best_cost;
1691
1692 best_cost_tu_split = ps_func_selector->pf_had_16x16_r(
1693 pu1_src3,
1694 src_strd,
1695 pu1_pred3,
1696 pred_strd,
1697 pi2_y3,
1698 32,
1699 ppi4_hsad,
1700 ppi4_tu_split,
1701 ppi4_tu_early_cbf,
1702 pos_x_y_4x4 + (4 << 16) + 4,
1703 num_4x4_in_row,
1704 lambda,
1705 lambda_q_shift,
1706 i4_frm_qstep,
1707 i4_cur_depth + 1,
1708 i4_max_depth,
1709 i4_max_tr_size,
1710 pi4_tu_split_cost,
1711 NULL);
1712
1713 /* cost is shifted by 10bits */
1714 best_cost = best_cost_tu_split >> 10;
1715
1716 /* Tu split is present in the 6-10 bits */
1717 tu_split_flag += (best_cost_tu_split & 0x3E0) >> 5;
1718
1719 /*Early CBF info is present in the last 5 bits */
1720 i4_early_cbf_flag += best_cost_tu_split & 0x1F;
1721
1722 tu_split_flag <<= 1;
1723 i4_early_cbf_flag <<= 1;
1724
1725 cost_child += best_cost;
1726
1727 {
1728 UWORD32 u4_satd = 0;
1729
1730 u4_satd = ps_func_selector->pf_compute_32x32HAD_using_16x16(
1731 pi2_y0, 32, pi2_dst, dst_strd, i4_frm_qstep, &early_cbf);
1732
1733 cost_parent = ((u4_satd + 2) >> 2);
1734 }
1735
1736 /* 4 TU_Split flags , 4 CBF Flags*/
1737 cost_child += ((4 + 4) * lambda) >> (lambda_q_shift + 1);
1738
1739 i4_early_cbf_flag += early_cbf;
1740
1741 /* 1 TU_SPlit flag, 1 CBF flag */
1742 //cost_parent += ((1 + 1)* lambda) >> (lambda_q_shift + 1);
1743
1744 if(i4_cur_depth < i4_max_depth)
1745 {
1746 if((cost_child < cost_parent) || (u1_cur_tr_size > i4_max_tr_size))
1747 {
1748 *pi4_tu_split_cost += ((4 + 4) * lambda) >> (lambda_q_shift + 1);
1749 best_cost = cost_child;
1750 tu_split_flag++;
1751 }
1752 else
1753 {
1754 tu_split_flag = 0;
1755 best_cost = cost_parent;
1756 }
1757 }
1758 else
1759 {
1760 tu_split_flag = 0;
1761 best_cost = cost_parent;
1762 }
1763
1764 pi4_32x32_tu_split[0] = tu_split_flag;
1765
1766 pi4_32x32_hsad[0] = best_cost;
1767
1768 pi4_32x32_tu_early_cbf[0] = i4_early_cbf_flag;
1769 }
1770