• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /******************************************************************************
2  *
3  * Copyright (C) 2018 The Android Open Source Project
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at:
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  *
17  *****************************************************************************
18  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19 */
20 
21 /**
22 ******************************************************************************
23 * @file ihevce_had_satd.c
24 *
25 * @brief
26 *    This file contains functions of Hadamard SAD and SATD
27 *
28 * @author
29 *    Ittiam
30 *
31 * List of Functions
32 *   <TODO: TO BE ADDED>
33 *
34 ******************************************************************************
35 */
36 
37 /*****************************************************************************/
38 /* File Includes                                                             */
39 /*****************************************************************************/
40 /* System include files */
41 #include <stdio.h>
42 #include <string.h>
43 #include <stdlib.h>
44 #include <assert.h>
45 #include <stdarg.h>
46 #include <math.h>
47 
48 /* User include files */
49 #include "ihevc_typedefs.h"
50 #include "itt_video_api.h"
51 #include "ihevce_api.h"
52 
53 #include "rc_cntrl_param.h"
54 #include "rc_frame_info_collector.h"
55 #include "rc_look_ahead_params.h"
56 
57 #include "ihevc_defs.h"
58 #include "ihevc_structs.h"
59 #include "ihevc_platform_macros.h"
60 #include "ihevc_deblk.h"
61 #include "ihevc_itrans_recon.h"
62 #include "ihevc_chroma_itrans_recon.h"
63 #include "ihevc_chroma_intra_pred.h"
64 #include "ihevc_intra_pred.h"
65 #include "ihevc_inter_pred.h"
66 #include "ihevc_mem_fns.h"
67 #include "ihevc_padding.h"
68 #include "ihevc_weighted_pred.h"
69 #include "ihevc_sao.h"
70 #include "ihevc_resi_trans.h"
71 #include "ihevc_quant_iquant_ssd.h"
72 #include "ihevc_cabac_tables.h"
73 
74 #include "ihevce_defs.h"
75 #include "ihevce_lap_enc_structs.h"
76 #include "ihevce_multi_thrd_structs.h"
77 #include "ihevce_multi_thrd_funcs.h"
78 #include "ihevce_me_common_defs.h"
79 #include "ihevce_had_satd.h"
80 #include "ihevce_error_codes.h"
81 #include "ihevce_bitstream.h"
82 #include "ihevce_cabac.h"
83 #include "ihevce_rdoq_macros.h"
84 #include "ihevce_function_selector.h"
85 #include "ihevce_enc_structs.h"
86 #include "ihevce_cmn_utils_instr_set_router.h"
87 #include "hme_datatype.h"
88 #include "hme_interface.h"
89 #include "hme_common_defs.h"
90 #include "hme_defs.h"
91 
92 /*****************************************************************************/
93 /* Function Definitions                                                      */
94 /*****************************************************************************/
95 
/**
*******************************************************************************
*
* @brief
*  4x4 Hadamard transform of the residue (source - prediction), 8-bit input
*
* @par Description:
*  The residue of every row is transformed horizontally into a temporary
*  4x4 buffer; the columns of that buffer are then transformed vertically
*  and written to the destination. No normalization is applied.
*
* @param[in] pu1_src
*  UWORD8 pointer to the source block
*
* @param[in] src_strd
*  WORD32 Source stride
*
* @param[in] pu1_pred
*  UWORD8 pointer to the prediction block
*
* @param[in] pred_strd
*  WORD32 Pred stride
*
* @param[out] pi2_dst
*  WORD16 pointer receiving the 4x4 transform coefficients
*
* @param[in] dst_strd
*  WORD32 Destination stride (in WORD16 elements)
*
* @returns none
*
*******************************************************************************
*/
static void ihevce_hadamard_4x4_8bit(
    UWORD8 *pu1_src,
    WORD32 src_strd,
    UWORD8 *pu1_pred,
    WORD32 pred_strd,
    WORD16 *pi2_dst,
    WORD32 dst_strd)
{
    WORD16 ai2_horz[4 * 4];
    WORD32 row, col;

    /*===== pass 1 : residue + horizontal butterflies, one row at a time =====*/
    for(row = 0; row < 4; row++)
    {
        UWORD8 *pu1_src_row = pu1_src + row * src_strd;
        UWORD8 *pu1_pred_row = pu1_pred + row * pred_strd;
        WORD16 *pi2_horz_row = &ai2_horz[row * 4];

        WORD32 d0 = pu1_src_row[0] - pu1_pred_row[0];
        WORD32 d1 = pu1_src_row[1] - pu1_pred_row[1];
        WORD32 d2 = pu1_src_row[2] - pu1_pred_row[2];
        WORD32 d3 = pu1_src_row[3] - pu1_pred_row[3];

        WORD32 sum01 = d0 + d1;
        WORD32 dif01 = d0 - d1;
        WORD32 sum23 = d2 + d3;
        WORD32 dif23 = d2 - d3;

        pi2_horz_row[0] = sum01 + sum23;
        pi2_horz_row[1] = dif01 + dif23;
        pi2_horz_row[2] = sum01 - sum23;
        pi2_horz_row[3] = dif01 - dif23;
    }

    /*===== pass 2 : vertical butterflies, one column at a time =====*/
    for(col = 0; col < 4; col++)
    {
        WORD32 sum_upper = ai2_horz[col] + ai2_horz[4 + col];
        WORD32 dif_upper = ai2_horz[col] - ai2_horz[4 + col];
        WORD32 sum_lower = ai2_horz[8 + col] + ai2_horz[12 + col];
        WORD32 dif_lower = ai2_horz[8 + col] - ai2_horz[12 + col];

        pi2_dst[col] = sum_upper + sum_lower;
        pi2_dst[dst_strd + col] = dif_upper + dif_lower;
        pi2_dst[2 * dst_strd + col] = sum_upper - sum_lower;
        pi2_dst[3 * dst_strd + col] = dif_upper - dif_lower;
    }
}

149 
/**
*******************************************************************************
*
* @brief
*  8x8 Hadamard transform of the residue (source - prediction), 8-bit input
*
* @par Description:
*  The four 4x4 quadrants (y0 top-left, y1 top-right, y2 bottom-left,
*  y3 bottom-right) are transformed in place in the destination and then
*  merged coefficient by coefficient:
*
*      | (y0 + y1) + (y2 + y3)    (y0 - y1) + (y2 - y3) |
*      | (y0 + y1) - (y2 + y3)    (y0 - y1) - (y2 - y3) |
*
*  No normalization is applied.
*
* @param[in] pu1_src
*  UWORD8 pointer to the source block
*
* @param[in] src_strd
*  WORD32 Source stride
*
* @param[in] pu1_pred
*  UWORD8 pointer to the prediction block
*
* @param[in] pred_strd
*  WORD32 Pred stride
*
* @param[out] pi2_dst
*  WORD16 pointer receiving the 8x8 transform coefficients
*
* @param[in] dst_strd
*  WORD32 Destination stride (in WORD16 elements)
*
* @returns none
*
*******************************************************************************
*/
static void ihevce_hadamard_8x8_8bit(
    UWORD8 *pu1_src,
    WORD32 src_strd,
    UWORD8 *pu1_pred,
    WORD32 pred_strd,
    WORD16 *pi2_dst,
    WORD32 dst_strd)
{
    WORD32 quad, row, col;

    /* Child transforms, visited in the order y0, y1, y2, y3 */
    for(quad = 0; quad < 4; quad++)
    {
        WORD32 off_x = (quad & 1) * 4;
        WORD32 off_y = (quad >> 1) * 4;

        ihevce_hadamard_4x4_8bit(
            pu1_src + off_x + off_y * src_strd,
            src_strd,
            pu1_pred + off_x + off_y * pred_strd,
            pred_strd,
            pi2_dst + off_x + off_y * dst_strd,
            dst_strd);
    }

    /* Merge co-located coefficients of the four children */
    for(row = 0; row < 4; row++)
    {
        WORD16 *pi2_top = pi2_dst + row * dst_strd;
        WORD16 *pi2_bot = pi2_top + 4 * dst_strd;

        for(col = 0; col < 4; col++)
        {
            WORD16 y0 = pi2_top[col];
            WORD16 y1 = pi2_top[col + 4];
            WORD16 y2 = pi2_bot[col];
            WORD16 y3 = pi2_bot[col + 4];

            WORD16 sum_top = (y0 + y1);
            WORD16 dif_top = (y0 - y1);
            WORD16 sum_bot = (y2 + y3);
            WORD16 dif_bot = (y2 - y3);

            pi2_top[col] = sum_top + sum_bot;
            pi2_top[col + 4] = dif_top + dif_bot;
            pi2_bot[col] = sum_top - sum_bot;
            pi2_bot[col + 4] = dif_top - dif_bot;
        }
    }
}

205 
/**
*******************************************************************************
*
* @brief
*  16x16 Hadamard transform of the residue (source - prediction), 8-bit input
*
* @par Description:
*  The four 8x8 quadrants (y0 top-left, y1 top-right, y2 bottom-left,
*  y3 bottom-right) are transformed in place in the destination and then
*  merged coefficient by coefficient:
*
*      | (y0 + y1) + (y2 + y3)    (y0 - y1) + (y2 - y3) |
*      | (y0 + y1) - (y2 + y3)    (y0 - y1) - (y2 - y3) |
*
*  Each first-stage term (y0 +/- y1, y2 +/- y3) is right shifted by 1
*  before the second stage, so the output is scaled down accordingly.
*
* @param[in] pu1_src
*  UWORD8 pointer to the source block
*
* @param[in] src_strd
*  WORD32 Source stride
*
* @param[in] pu1_pred
*  UWORD8 pointer to the prediction block
*
* @param[in] pred_strd
*  WORD32 Pred stride
*
* @param[out] pi2_dst
*  WORD16 pointer receiving the 16x16 transform coefficients
*
* @param[in] dst_strd
*  WORD32 Destination stride (in WORD16 elements)
*
* @returns none
*
*******************************************************************************
*/
static void ihevce_hadamard_16x16_8bit(
    UWORD8 *pu1_src,
    WORD32 src_strd,
    UWORD8 *pu1_pred,
    WORD32 pred_strd,
    WORD16 *pi2_dst,
    WORD32 dst_strd)
{
    WORD32 quad, row, col;

    /* Child transforms, visited in the order y0, y1, y2, y3 */
    for(quad = 0; quad < 4; quad++)
    {
        WORD32 off_x = (quad & 1) * 8;
        WORD32 off_y = (quad >> 1) * 8;

        ihevce_hadamard_8x8_8bit(
            pu1_src + off_x + off_y * src_strd,
            src_strd,
            pu1_pred + off_x + off_y * pred_strd,
            pred_strd,
            pi2_dst + off_x + off_y * dst_strd,
            dst_strd);
    }

    /* Merge co-located coefficients of the four children */
    for(row = 0; row < 8; row++)
    {
        WORD16 *pi2_top = pi2_dst + row * dst_strd;
        WORD16 *pi2_bot = pi2_top + 8 * dst_strd;

        for(col = 0; col < 8; col++)
        {
            WORD16 y0 = pi2_top[col];
            WORD16 y1 = pi2_top[col + 8];
            WORD16 y2 = pi2_bot[col];
            WORD16 y3 = pi2_bot[col + 8];

            /* first stage results are halved before the second stage */
            WORD16 sum_top = (y0 + y1) >> 1;
            WORD16 dif_top = (y0 - y1) >> 1;
            WORD16 sum_bot = (y2 + y3) >> 1;
            WORD16 dif_bot = (y2 - y3) >> 1;

            pi2_top[col] = sum_top + sum_bot;
            pi2_top[col + 8] = dif_top + dif_bot;
            pi2_bot[col] = sum_top - sum_bot;
            pi2_bot[col + 8] = dif_top - dif_bot;
        }
    }
}

261 
/**
*******************************************************************************
*
* @brief
*  32x32 Hadamard transform of the residue (source - prediction), 8-bit input
*
* @par Description:
*  The four 16x16 quadrants (y0 top-left, y1 top-right, y2 bottom-left,
*  y3 bottom-right) are transformed in place in the destination and then
*  merged coefficient by coefficient:
*
*      | (y0 + y1) + (y2 + y3)    (y0 - y1) + (y2 - y3) |
*      | (y0 + y1) - (y2 + y3)    (y0 - y1) - (y2 - y3) |
*
*  Every child coefficient is right shifted by 2 when it is read, i.e.
*  before any butterfly is computed, so the output is scaled down accordingly.
*
* @param[in] pu1_src
*  UWORD8 pointer to the source block
*
* @param[in] src_strd
*  WORD32 Source stride
*
* @param[in] pu1_pred
*  UWORD8 pointer to the prediction block
*
* @param[in] pred_strd
*  WORD32 Pred stride
*
* @param[out] pi2_dst
*  WORD16 pointer receiving the 32x32 transform coefficients
*
* @param[in] dst_strd
*  WORD32 Destination stride (in WORD16 elements)
*
* @returns none
*
*******************************************************************************
*/
static void ihevce_hadamard_32x32_8bit(
    UWORD8 *pu1_src,
    WORD32 src_strd,
    UWORD8 *pu1_pred,
    WORD32 pred_strd,
    WORD16 *pi2_dst,
    WORD32 dst_strd)
{
    WORD32 quad, row, col;

    /* Child transforms, visited in the order y0, y1, y2, y3 */
    for(quad = 0; quad < 4; quad++)
    {
        WORD32 off_x = (quad & 1) * 16;
        WORD32 off_y = (quad >> 1) * 16;

        ihevce_hadamard_16x16_8bit(
            pu1_src + off_x + off_y * src_strd,
            src_strd,
            pu1_pred + off_x + off_y * pred_strd,
            pred_strd,
            pi2_dst + off_x + off_y * dst_strd,
            dst_strd);
    }

    /* Merge co-located coefficients of the four children */
    for(row = 0; row < 16; row++)
    {
        WORD16 *pi2_top = pi2_dst + row * dst_strd;
        WORD16 *pi2_bot = pi2_top + 16 * dst_strd;

        for(col = 0; col < 16; col++)
        {
            /* child coefficients are scaled down by 4 on load */
            WORD16 y0 = pi2_top[col] >> 2;
            WORD16 y1 = pi2_top[col + 16] >> 2;
            WORD16 y2 = pi2_bot[col] >> 2;
            WORD16 y3 = pi2_bot[col + 16] >> 2;

            WORD16 sum_top = (y0 + y1);
            WORD16 dif_top = (y0 - y1);
            WORD16 sum_bot = (y2 + y3);
            WORD16 dif_bot = (y2 - y3);

            pi2_top[col] = sum_top + sum_bot;
            pi2_top[col + 16] = dif_top + dif_bot;
            pi2_bot[col] = sum_top - sum_bot;
            pi2_bot[col + 16] = dif_top - dif_bot;
        }
    }
}

318 
319 /**
320 *******************************************************************************
321 *
322 * @brief
323 *  Compute Hadamard sad for 4x4 block with 8-bit input
324 *
325 * @par Description:
326 *
327 * @param[in] pu1_origin
328 *  UWORD8 pointer to the current block
329 *
330 * @param[in] src_strd
331 *  WORD32 Source stride
332 *
333 * @param[in] pu1_pred_buf
334 *  UWORD8 pointer to the prediction block
335 *
336 * @param[in] pred_strd
337 *  WORD32 Pred stride
338 *
339 * @param[in] pi2_dst
340 *  WORD16 pointer to the transform block
341 *
342 * @param[in] dst_strd
343 *  WORD32 Destination stride
344 *
345 * @param[in] size
346 *  WORD32 transform Block size
347 *
348 * @returns hadamard SAD
349 *
350 * @remarks
351 *  Not updating the transform destination now. Only returning the SATD
352 *
353 *******************************************************************************
354 */
ihevce_HAD_4x4_8bit(UWORD8 * pu1_origin,WORD32 src_strd,UWORD8 * pu1_pred_buf,WORD32 pred_strd,WORD16 * pi2_dst,WORD32 dst_strd)355 UWORD32 ihevce_HAD_4x4_8bit(
356     UWORD8 *pu1_origin,
357     WORD32 src_strd,
358     UWORD8 *pu1_pred_buf,
359     WORD32 pred_strd,
360     WORD16 *pi2_dst,
361     WORD32 dst_strd)
362 {
363     WORD32 k;
364     WORD16 v[16];
365     UWORD32 u4_sad = 0;
366 
367     (void)pi2_dst;
368     (void)dst_strd;
369     ihevce_hadamard_4x4_8bit(pu1_origin, src_strd, pu1_pred_buf, pred_strd, v, 4);
370 
371     for(k = 0; k < 16; ++k)
372         u4_sad += abs(v[k]);
373     u4_sad = ((u4_sad + 2) >> 2);
374 
375     return u4_sad;
376 }
377 
378 /**
379 *******************************************************************************
380 *
381 * @brief
382 *  Computes Hadamard Sad for 8x8 block with 8-bit input
383 *
384 * @par Description:
385 *
386 * @param[in] pu1_origin
387 *  UWORD8 pointer to the current block
388 *
389 * @param[in] src_strd
390 *  WORD32 Source stride
391 *
392 * @param[in] pu1_pred_buf
393 *  UWORD8 pointer to the prediction block
394 *
395 * @param[in] pred_strd
396 *  WORD32 Pred stride
397 *
398 * @param[in] pi2_dst
399 *  WORD16 pointer to the transform block
400 *
401 * @param[in] dst_strd
402 *  WORD32 Destination stride
403 *
404 * @param[in] size
405 *  WORD32 transform Block size
406 *
407 * @returns Hadamard SAD
408 *
409 * @remarks
410 *  Not updating the transform destination now. Only returning the SATD
411 *
412 *******************************************************************************
413 */
ihevce_HAD_8x8_8bit(UWORD8 * pu1_origin,WORD32 src_strd,UWORD8 * pu1_pred_buf,WORD32 pred_strd,WORD16 * pi2_dst,WORD32 dst_strd)414 UWORD32 ihevce_HAD_8x8_8bit(
415     UWORD8 *pu1_origin,
416     WORD32 src_strd,
417     UWORD8 *pu1_pred_buf,
418     WORD32 pred_strd,
419     WORD16 *pi2_dst,
420     WORD32 dst_strd)
421 {
422     WORD32 k;
423     UWORD32 u4_sad = 0;
424     WORD16 v[64];
425 
426     (void)pi2_dst;
427     (void)dst_strd;
428     ihevce_hadamard_8x8_8bit(pu1_origin, src_strd, pu1_pred_buf, pred_strd, v, 8);
429 
430     for(k = 0; k < 64; ++k)
431         u4_sad += abs(v[k]);
432     u4_sad = ((u4_sad + 4) >> 3);
433 
434     return u4_sad;
435 }
436 
437 /**
438 *******************************************************************************
439 *
440 * @brief
441 *  Compute dc suppressed hadamard sad for 8x8 block with 8-bit input
442 *
443 * @par Description:
444 *
445 * @param[in] pu1_origin
446 *  UWORD8 pointer to the current block
447 *
448 * @param[in] src_strd
449 *  WORD32 Source stride
450 *
451 * @param[in] pu1_pred_buf
452 *  UWORD8 pointer to the prediction block
453 *
454 * @param[in] pred_strd
455 *  WORD32 Pred stride
456 *
457 * @param[in] pi2_dst
458 *  WORD16 pointer to the transform block
459 *
460 * @param[in] dst_strd
461 *  WORD32 Destination stride
462 *
463 * @param[in] size
464 *  WORD32 transform Block size
465 *
466 * @returns Hadamard SAD with DC Suppressed
467 *
468 * @remarks
469 *  Not updating the transform destination now. Only returning the SATD
470 *
471 *******************************************************************************
472 */
ihevce_compute_ac_had_8x8_8bit(UWORD8 * pu1_origin,WORD32 src_strd,UWORD8 * pu1_pred_buf,WORD32 pred_strd,WORD16 * pi2_dst,WORD32 dst_strd)473 UWORD32 ihevce_compute_ac_had_8x8_8bit(
474     UWORD8 *pu1_origin,
475     WORD32 src_strd,
476     UWORD8 *pu1_pred_buf,
477     WORD32 pred_strd,
478     WORD16 *pi2_dst,
479     WORD32 dst_strd)
480 {
481     WORD32 k;
482     UWORD32 u4_sad = 0;
483     WORD16 v[64];
484 
485     (void)pi2_dst;
486     (void)dst_strd;
487     ihevce_hadamard_8x8_8bit(pu1_origin, src_strd, pu1_pred_buf, pred_strd, v, 8);
488 
489     v[0] = 0;
490     for(k = 0; k < 64; ++k)
491         u4_sad += abs(v[k]);
492     u4_sad = ((u4_sad + 4) >> 3);
493 
494     return u4_sad;
495 }
496 
497 /**
498 *******************************************************************************
499 *
500 * @brief
501 *  Computes Hadamard Sad for 16x16 block with 8-bit input
502 *
503 * @par Description:
504 *
505 * @param[in] pu1_origin
506 *  UWORD8 pointer to the current block
507 *
508 * @param[in] src_strd
509 *  WORD32 Source stride
510 *
511 * @param[in] pu1_pred_buf
512 *  UWORD8 pointer to the prediction block
513 *
514 * @param[in] pred_strd
515 *  WORD32 Pred stride
516 *
517 * @param[in] pi2_dst
518 *  WORD16 pointer to the transform block
519 *
520 * @param[in] dst_strd
521 *  WORD32 Destination stride
522 *
523 * @param[in] size
524 *  WORD32 transform Block size
525 *
526 * @returns Hadamard SAD
527 *
528 * @remarks
529 *  Not updating the transform destination now. Only returning the SATD
530 *
531 *******************************************************************************
532 */
ihevce_HAD_16x16_8bit(UWORD8 * pu1_origin,WORD32 src_strd,UWORD8 * pu1_pred_buf,WORD32 pred_strd,WORD16 * pi2_dst,WORD32 dst_strd)533 UWORD32 ihevce_HAD_16x16_8bit(
534     UWORD8 *pu1_origin,
535     WORD32 src_strd,
536     UWORD8 *pu1_pred_buf,
537     WORD32 pred_strd,
538     WORD16 *pi2_dst,
539     WORD32 dst_strd)
540 {
541     WORD32 k;
542     UWORD32 u4_sad = 0;
543     WORD16 v[256];
544 
545     (void)pi2_dst;
546     (void)dst_strd;
547     ihevce_hadamard_16x16_8bit(pu1_origin, src_strd, pu1_pred_buf, pred_strd, v, 16);
548 
549     for(k = 0; k < 256; ++k)
550         u4_sad += abs(v[k]);
551     u4_sad = ((u4_sad + 4) >> 3);
552 
553     return u4_sad;
554 }
555 
556 /**
557 *******************************************************************************
558 *
559 * @brief
560 *  Computes Hadamard Sad for 32x32 block with 8-bit input
561 *
562 * @par Description:
563 *
564 * @param[in] pu1_origin
565 *  UWORD8 pointer to the current block
566 *
567 * @param[in] src_strd
568 *  WORD32 Source stride
569 *
570 * @param[in] pu1_pred_buf
571 *  UWORD8 pointer to the prediction block
572 *
573 * @param[in] pred_strd
574 *  WORD32 Pred stride
575 *
576 * @param[in] pi2_dst
577 *  WORD16 pointer to the transform block
578 *
579 * @param[in] dst_strd
580 *  WORD32 Destination stride
581 *
582 * @param[in] size
583 *  WORD32 transform Block size
584 *
585 * @returns Hadamard SAD
586 *
587 * @remarks
588 *  Not updating the transform destination now. Only returning the SATD
589 *
590 *******************************************************************************
591 */
ihevce_HAD_32x32_8bit(UWORD8 * pu1_origin,WORD32 src_strd,UWORD8 * pu1_pred_buf,WORD32 pred_strd,WORD16 * pi2_dst,WORD32 dst_strd)592 UWORD32 ihevce_HAD_32x32_8bit(
593     UWORD8 *pu1_origin,
594     WORD32 src_strd,
595     UWORD8 *pu1_pred_buf,
596     WORD32 pred_strd,
597     WORD16 *pi2_dst,
598     WORD32 dst_strd)
599 {
600     WORD32 k;
601     UWORD32 u4_sad = 0;
602     WORD16 v[32 * 32];
603 
604     (void)pi2_dst;
605     (void)dst_strd;
606     ihevce_hadamard_32x32_8bit(pu1_origin, src_strd, pu1_pred_buf, pred_strd, v, 32);
607 
608     for(k = 0; k < 32 * 32; ++k)
609         u4_sad += abs(v[k]);
610     u4_sad = ((u4_sad + 2) >> 2);
611 
612     return u4_sad;
613 }
614 
615 //#if COMPUTE_16x16_R == C
616 /**
617 *******************************************************************************
618 *
619 * @brief
620 *   Computes 8x8 transform using children 4x4 hadamard results
621 *
622 * @par Description:
623 *
624 * @param[in] pi2_4x4_had
625 *  WORD16 pointer to 4x4 hadamard buffer(y0, y1, y2, y3 hadmard in Zscan order)
626 *
627 * @param[in] had4_strd
628 *  stride of 4x4 hadmard buffer pi2_y0, pi2_y1, pi2_y2, pi2_y3
629 *
630 * @param[out] pi2_dst
631 *  destination buffer where 8x8 hadamard result is stored
632 *
633 * @param[in] dst_stride
634 *  stride of destination block
635 *
636 * @param[in] i4_frm_qstep
637 *  frm_qstep value based on the which the threshold value is calculated
638 *
639 * @returns
640 *  8x8 Hadamard SATD
641 * @remarks
642 *
643 *******************************************************************************
644 */
ihevce_compute_8x8HAD_using_4x4(WORD16 * pi2_4x4_had,WORD32 had4_strd,WORD16 * pi2_dst,WORD32 dst_strd,WORD32 i4_frm_qstep,WORD32 * pi4_cbf)645 static UWORD32 ihevce_compute_8x8HAD_using_4x4(
646     WORD16 *pi2_4x4_had,
647     WORD32 had4_strd,
648     WORD16 *pi2_dst,
649     WORD32 dst_strd,
650     WORD32 i4_frm_qstep,
651     WORD32 *pi4_cbf)
652 {
653     /* Qstep value is right shifted by 8 */
654     WORD32 threshold = (i4_frm_qstep >> 8);
655 
656     /* Initialize pointers to 4 subblocks of 4x4 HAD buffer */
657     WORD16 *pi2_y0 = pi2_4x4_had;
658     WORD16 *pi2_y1 = pi2_4x4_had + 4;
659     WORD16 *pi2_y2 = pi2_4x4_had + had4_strd * 4;
660     WORD16 *pi2_y3 = pi2_4x4_had + had4_strd * 4 + 4;
661 
662     /* Initialize pointers to store 8x8 HAD output */
663     WORD16 *pi2_dst0 = pi2_dst;
664     WORD16 *pi2_dst1 = pi2_dst + 4;
665     WORD16 *pi2_dst2 = pi2_dst + dst_strd * 4;
666     WORD16 *pi2_dst3 = pi2_dst + dst_strd * 4 + 4;
667 
668     UWORD32 u4_satd = 0;
669     WORD32 i;
670 
671     /*   Child HAD results combined as follows to get Parent result */
672     /*  _                                                 _         */
673     /* |  (y0 + y1) + (y2 + y3)    (y0 - y1) + (y2 - y3)   |        */
674     /* |  (y0 + y1) - (y2 + y3)    (y0 - y1) - (y2 - y3)   |        */
675     /* \-                                                 -/        */
676     for(i = 0; i < 16; i++)
677     {
678         WORD32 src_idx = (i >> 2) * had4_strd + (i % 4);
679         WORD32 dst_idx = (i >> 2) * dst_strd + (i % 4);
680 
681         WORD16 a0 = pi2_y0[src_idx];
682         WORD16 a1 = pi2_y1[src_idx];
683         WORD16 a2 = pi2_y2[src_idx];
684         WORD16 a3 = pi2_y3[src_idx];
685 
686         WORD16 b0 = (a0 + a1);
687         WORD16 b1 = (a0 - a1);
688         WORD16 b2 = (a2 + a3);
689         WORD16 b3 = (a2 - a3);
690 
691         pi2_dst0[dst_idx] = b0 + b2;
692         pi2_dst1[dst_idx] = b1 + b3;
693         pi2_dst2[dst_idx] = b0 - b2;
694         pi2_dst3[dst_idx] = b1 - b3;
695 
696         if(ABS(pi2_dst0[dst_idx]) > threshold)
697             *pi4_cbf = 1;
698         if(ABS(pi2_dst1[dst_idx]) > threshold)
699             *pi4_cbf = 1;
700         if(ABS(pi2_dst2[dst_idx]) > threshold)
701             *pi4_cbf = 1;
702         if(ABS(pi2_dst3[dst_idx]) > threshold)
703             *pi4_cbf = 1;
704 
705         u4_satd += ABS(pi2_dst0[dst_idx]);
706         u4_satd += ABS(pi2_dst1[dst_idx]);
707         u4_satd += ABS(pi2_dst2[dst_idx]);
708         u4_satd += ABS(pi2_dst3[dst_idx]);
709     }
710 
711     /* return the 8x8 satd */
712     return (u4_satd);
713 }
714 
715 /**
716 *******************************************************************************
717 *
718 * @brief
719 *    Computes Residue and Hadamard Transform for four 4x4 blocks (Z scan) of
720 *    a 8x8 block (Residue is computed for 8-bit src and prediction buffers)
721 *    Modified to incorporate the dead-zone implementation - Lokesh
722 *
723 * @par Description:
724 *
725 * @param[in] pu1_origin
726 *  UWORD8 pointer to the current block
727 *
728 * @param[in] src_strd
729 *  WORD32 Source stride
730 *
731 * @param[in] pu1_pred
732 *  UWORD8 pointer to the prediction block
733 *
734 * @param[in] pred_strd
735 *  WORD32 Pred stride
736 *
737 * @param[out] pi2_dst
738 *  WORD16 pointer to the transform block
739 *
740 * @param[in] dst_strd
741 *  WORD32 Destination stride
742 *
743 * @param[out] pi4_hsad
744 *  array for storing hadmard sad of each 4x4 block
745 *
746 * @param[in] hsad_stride
747 *  stride of hadmard sad destination buffer (for Zscan order of storing sads)
748 *
749 * @param[in] i4_frm_qstep
750 *  frm_qstep value based on the which the threshold value is calculated
751 *
752 * @returns
753 *
754 * @remarks
755 *
756 *******************************************************************************
757 */
ihevce_had4_4x4(UWORD8 * pu1_src,WORD32 src_strd,UWORD8 * pu1_pred,WORD32 pred_strd,WORD16 * pi2_dst4x4,WORD32 dst_strd,WORD32 * pi4_hsad,WORD32 hsad_stride,WORD32 i4_frm_qstep)758 static WORD32 ihevce_had4_4x4(
759     UWORD8 *pu1_src,
760     WORD32 src_strd,
761     UWORD8 *pu1_pred,
762     WORD32 pred_strd,
763     WORD16 *pi2_dst4x4,
764     WORD32 dst_strd,
765     WORD32 *pi4_hsad,
766     WORD32 hsad_stride,
767     WORD32 i4_frm_qstep)
768 {
769     WORD32 i, k;
770     WORD32 i4_child_total_sad = 0;
771 
772     (void)i4_frm_qstep;
773     /* -------- Compute four 4x4 HAD Transforms ---------*/
774     for(i = 0; i < 4; i++)
775     {
776         UWORD8 *pu1_pi0, *pu1_pi1;
777         WORD16 *pi2_dst;
778         WORD32 blkx, blky;
779         UWORD32 u4_hsad = 0;
780         // TODO: choose deadzone as f(qstep)
781         WORD32 threshold = 0;
782 
783         /*****************************************************/
784         /*    Assuming the looping structure of the four     */
785         /*    blocks is in Z scan order of 4x4s in a 8x8     */
786         /*    block instead of raster scan                   */
787         /*****************************************************/
788         blkx = (i & 0x1);
789         blky = (i >> 1);
790 
791         pu1_pi0 = pu1_src + (blkx * 4) + (blky * 4 * src_strd);
792         pu1_pi1 = pu1_pred + (blkx * 4) + (blky * 4 * pred_strd);
793         pi2_dst = pi2_dst4x4 + (blkx * 4) + (blky * 4 * dst_strd);
794 
795         ihevce_hadamard_4x4_8bit(pu1_pi0, src_strd, pu1_pi1, pred_strd, pi2_dst, dst_strd);
796 
797         for(k = 0; k < 4; k++)
798         {
799             if(ABS(pi2_dst[0 * dst_strd + k]) < threshold)
800                 pi2_dst[0 * dst_strd + k] = 0;
801 
802             if(ABS(pi2_dst[1 * dst_strd + k]) < threshold)
803                 pi2_dst[1 * dst_strd + k] = 0;
804 
805             if(ABS(pi2_dst[2 * dst_strd + k]) < threshold)
806                 pi2_dst[2 * dst_strd + k] = 0;
807 
808             if(ABS(pi2_dst[3 * dst_strd + k]) < threshold)
809                 pi2_dst[3 * dst_strd + k] = 0;
810 
811             /* Accumulate the SATD */
812             u4_hsad += ABS(pi2_dst[0 * dst_strd + k]);
813             u4_hsad += ABS(pi2_dst[1 * dst_strd + k]);
814             u4_hsad += ABS(pi2_dst[2 * dst_strd + k]);
815             u4_hsad += ABS(pi2_dst[3 * dst_strd + k]);
816         }
817 
818         /*===== Normalize the HSAD =====*/
819         pi4_hsad[blkx + (blky * hsad_stride)] = ((u4_hsad + 2) >> 2);
820         i4_child_total_sad += ((u4_hsad + 2) >> 2);
821     }
822     return i4_child_total_sad;
823 }
824 
825 /**
826 *******************************************************************************
827 *
828 * @brief
829 *    HSAD is returned for the 4, 4x4 in 8x8
830 *
831 * @par Description:
832 *
833 * @param[in] pu1_origin
834 *  UWORD8 pointer to the current block
835 *
836 * @param[in] src_strd
837 *  WORD32 Source stride
838 *
839 * @param[in] pu1_pred
840 *  UWORD8 pointer to the prediction block
841 *
842 * @param[in] pred_strd
843 *  WORD32 Pred stride
844 *
845 * @param[out] pi2_dst
846 *  WORD16 pointer to the transform output block
847 *
848 * @param[out] dst_strd
849 *  WORD32 Destination stride
850 *
851 * @param[out] ppi4_hsad
852 *   pointer to base pointers for storing hadmard sads of various
853 *   block sizes (4x4 to 32x32)
854 *
855 * @param[in] pos_x_y_4x4
856 *   Denotes packed x,y postion of current 4x4 block w.r.t to start of ctb/CU/MB
857 *   Lower 16bits denote xpos and upper 16ypos of the 4x4block
858 *
859 * @param[in] num_4x4_in_row
860 *   Denotes the number of current 4x4 blocks in a ctb/CU/MB
861 *
862 * @returns
863 *
864 * @remarks
865 *
866 *******************************************************************************
867 */
ihevce_had_8x8_using_4_4x4(UWORD8 * pu1_src,WORD32 src_strd,UWORD8 * pu1_pred,WORD32 pred_strd,WORD16 * pi2_dst,WORD32 dst_strd,WORD32 ** ppi4_hsad,WORD32 pos_x_y_4x4,WORD32 num_4x4_in_row)868 void ihevce_had_8x8_using_4_4x4(
869     UWORD8 *pu1_src,
870     WORD32 src_strd,
871     UWORD8 *pu1_pred,
872     WORD32 pred_strd,
873     WORD16 *pi2_dst,
874     WORD32 dst_strd,
875     WORD32 **ppi4_hsad,
876     WORD32 pos_x_y_4x4,
877     WORD32 num_4x4_in_row)
878 {
879     WORD16 ai2_4x4_had[64];
880     WORD32 pos_x = pos_x_y_4x4 & 0xFFFF;
881     WORD32 pos_y = (pos_x_y_4x4 >> 16) & 0xFFFF;
882     WORD32 *pi4_4x4_hsad;
883     WORD32 *pi4_8x8_hsad;
884 
885     (void)pi2_dst;
886     (void)dst_strd;
887     ASSERT(pos_x >= 0);
888     ASSERT(pos_y >= 0);
889 
890     /* Initialize pointers to  store 4x4 and 8x8 HAD SATDs */
891     pi4_4x4_hsad = ppi4_hsad[HAD_4x4] + pos_x + pos_y * num_4x4_in_row;
892     pi4_8x8_hsad = ppi4_hsad[HAD_8x8] + (pos_x >> 1) + (pos_y >> 1) * (num_4x4_in_row >> 1);
893 
894     /* -------- Compute four 4x4 HAD Transforms of 8x8 in one call--------- */
895     pi4_8x8_hsad[0] = ihevce_had4_4x4(
896         pu1_src, src_strd, pu1_pred, pred_strd, ai2_4x4_had, 8, pi4_4x4_hsad, num_4x4_in_row, 0);
897 }
898 
899 /**
900 *******************************************************************************
901 *
902 * @brief
903 *    Recursive Hadamard Transform for 8x8 block. HSAD is returned for the 8x8
904 *    block and its four subblocks(4x4).
905 *
906 * @par Description:
907 *
908 * @param[in] pu1_origin
909 *  UWORD8 pointer to the current block
910 *
911 * @param[in] src_strd
912 *  WORD32 Source stride
913 *
914 * @param[in] pu1_pred
915 *  UWORD8 pointer to the prediction block
916 *
917 * @param[in] pred_strd
918 *  WORD32 Pred stride
919 *
920 * @param[out] pi2_dst
921 *  WORD16 pointer to the transform output block
922 *
923 * @param[out] dst_strd
924 *  WORD32 Destination stride
925 *
926 * @param[out] ppi4_hsad
927 *   pointer to base pointers for storing hadmard sads of various
928 *   block sizes (4x4 to 32x32)
929 *
930 * @param[in] pos_x_y_4x4
931 *   Denotes packed x,y position of current 4x4 block w.r.t. the start of ctb/CU/MB
932 *   Lower 16 bits denote xpos and upper 16 bits denote ypos of the 4x4 block
933 *
934 * @param[in] num_4x4_in_row
935 *   Denotes the number of current 4x4 blocks in a ctb/CU/MB
936 *
937 * @param[in] i4_frm_qstep
938 *  frm_qstep value based on the which the threshold value is calculated
939 *
940 * @returns
941 *
942 * @remarks
943 *
944 *******************************************************************************
945 */
WORD32 ihevce_had_8x8_using_4_4x4_r(
    UWORD8 *pu1_src,
    WORD32 src_strd,
    UWORD8 *pu1_pred,
    WORD32 pred_strd,
    WORD16 *pi2_dst,
    WORD32 dst_strd,
    WORD32 **ppi4_hsad,
    WORD32 **ppi4_tu_split,
    WORD32 **ppi4_tu_early_cbf,
    WORD32 pos_x_y_4x4,
    WORD32 num_4x4_in_row,
    WORD32 lambda,
    WORD32 lambda_q_shift,
    WORD32 i4_frm_qstep,
    WORD32 i4_cur_depth,
    WORD32 i4_max_depth,
    WORD32 i4_max_tr_size,
    WORD32 *pi4_tu_split_cost,
    void *pv_func_sel)
{
    /* Transform size evaluated at this level */
    const WORD32 i4_tr_size = 8;

    /* Unpack the 4x4 position: x in the low 16 bits, y in the high 16 bits */
    const WORD32 x_4x4 = pos_x_y_4x4 & 0xFFFF;
    const WORD32 y_4x4 = (pos_x_y_4x4 >> 16) & 0xFFFF;

    /* Element offsets into the 4x4-granular and 8x8-granular result arrays */
    const WORD32 off_4x4 = x_4x4 + y_4x4 * num_4x4_in_row;
    const WORD32 off_8x8 = (x_4x4 >> 1) + (y_4x4 >> 1) * (num_4x4_in_row >> 1);

    /* Cost of the 4 CBF flags; the extra shift by 1 assumes 0.5 bits per bin */
    const WORD32 i4_flag_cost = (4 * lambda) >> (lambda_q_shift + 1);

    /* Scratch for the four 4x4 hadamard outputs (stride 8) */
    WORD16 ai2_had_4x4[64];

    UWORD32 u4_satd_8x8;
    WORD32 i4_cost_split;
    WORD32 i4_cost_whole;
    WORD32 i4_cbf = 0;
    WORD32 i4_split;
    WORD32 i4_cost;

    (void)pv_func_sel;
    ASSERT(x_4x4 >= 0);
    ASSERT(y_4x4 >= 0);

    /* Four 4x4 hadamards in one call; the value returned is used as the child cost */
    i4_cost_split = ihevce_had4_4x4(
        pu1_src,
        src_strd,
        pu1_pred,
        pred_strd,
        ai2_had_4x4,
        8,
        ppi4_hsad[HAD_4x4] + off_4x4,
        num_4x4_in_row,
        0);

    /* Build the 8x8 hadamard from the 4x4 results; may raise i4_cbf */
    u4_satd_8x8 = ihevce_compute_8x8HAD_using_4x4(
        ai2_had_4x4, 8, pi2_dst, dst_strd, i4_frm_qstep, &i4_cbf);

    /* Normalized 8x8 satd is the cost of keeping the block whole */
    i4_cost_whole = ((u4_satd_8x8 + 4) >> 3);

    /* Splitting additionally pays for the flags */
    i4_cost_split += i4_flag_cost;

    /* Split only while depth allows it, and then either when it is cheaper */
    /* or when 8x8 exceeds the maximum transform size                       */
    i4_split = (i4_cur_depth < i4_max_depth) &&
               ((i4_cost_split < i4_cost_whole) || (i4_max_tr_size < i4_tr_size));

    if(i4_split)
    {
        *pi4_tu_split_cost += i4_flag_cost;
        i4_cost = i4_cost_split;
    }
    else
    {
        i4_cost = i4_cost_whole;
    }

    /* Record decision, cost and early cbf for this 8x8 */
    (ppi4_tu_split[HAD_8x8] + off_8x8)[0] = i4_split;
    (ppi4_hsad[HAD_8x8] + off_8x8)[0] = i4_cost;
    (ppi4_tu_early_cbf[HAD_8x8] + off_8x8)[0] = i4_cbf;

    /* Packed return: bit 0 = early cbf, bit 1 = tu split, remaining bits = cost */
    return ((((i4_cost << 1) + i4_split) << 1) + i4_cbf);
}
1044 
1045 /**
1046 *******************************************************************************
1047 *
1048 * @brief
1049 *   Computes 16x16 transform using children 8x8 hadamard results
1050 *    Modified to incorporate the dead-zone implementation - Lokesh
1051 *
1052 * @par Description:
1053 *
1054 * @param[in] pi2_8x8_had
1055 *  WORD16 pointer to 8x8 hadamard buffer(y0, y1, y2, y3 hadmard in Zscan order)
1056 *
1057 * @param[in] had8_strd
1058 *  stride of 8x8 hadmard buffer pi2_y0, pi2_y1, pi2_y2, pi2_y3
1059 *
1060 * @param[out] pi2_dst
1061 *  destination buffer where 8x8 hadamard result is stored
1062 *
1063 * @param[in] dst_stride
1064 *  stride of destination block
1065 *
1066 * @param[in] i4_frm_qstep
1067 *  frm_qstep value based on the which the threshold value is calculated
1068 *
1069 * @returns
1070 *  16x16 Hadamard SATD
1071 * @remarks
1072 *
1073 *******************************************************************************
1074 */
ihevce_compute_16x16HAD_using_8x8(WORD16 * pi2_8x8_had,WORD32 had8_strd,WORD16 * pi2_dst,WORD32 dst_strd,WORD32 i4_frm_qstep,WORD32 * pi4_cbf)1075 static UWORD32 ihevce_compute_16x16HAD_using_8x8(
1076     WORD16 *pi2_8x8_had,
1077     WORD32 had8_strd,
1078     WORD16 *pi2_dst,
1079     WORD32 dst_strd,
1080     WORD32 i4_frm_qstep,
1081     WORD32 *pi4_cbf)
1082 {
1083     /* Qstep value is right shifted by 8 */
1084     WORD32 threshold = (i4_frm_qstep >> 8);
1085 
1086     /* Initialize pointers to 4 subblocks of 8x8 HAD buffer */
1087     WORD16 *pi2_y0 = pi2_8x8_had;
1088     WORD16 *pi2_y1 = pi2_8x8_had + 8;
1089     WORD16 *pi2_y2 = pi2_8x8_had + had8_strd * 8;
1090     WORD16 *pi2_y3 = pi2_8x8_had + had8_strd * 8 + 8;
1091 
1092     /* Initialize pointers to store 8x8 HAD output */
1093     WORD16 *pi2_dst0 = pi2_dst;
1094     WORD16 *pi2_dst1 = pi2_dst + 8;
1095     WORD16 *pi2_dst2 = pi2_dst + dst_strd * 8;
1096     WORD16 *pi2_dst3 = pi2_dst + dst_strd * 8 + 8;
1097 
1098     UWORD32 u4_satd = 0;
1099     WORD32 i;
1100 
1101     /*   Child HAD results combined as follows to get Parent result */
1102     /*  _                                                 _         */
1103     /* |  (y0 + y1) + (y2 + y3)    (y0 - y1) + (y2 - y3)   |        */
1104     /* |  (y0 + y1) - (y2 + y3)    (y0 - y1) - (y2 - y3)   |        */
1105     /* \-                                                 -/        */
1106     for(i = 0; i < 64; i++)
1107     {
1108         WORD32 src_idx = (i >> 3) * had8_strd + (i % 8);
1109         WORD32 dst_idx = (i >> 3) * dst_strd + (i % 8);
1110 
1111         WORD16 a0 = pi2_y0[src_idx];
1112         WORD16 a1 = pi2_y1[src_idx];
1113         WORD16 a2 = pi2_y2[src_idx];
1114         WORD16 a3 = pi2_y3[src_idx];
1115 
1116         WORD16 b0 = (a0 + a1) >> 1;
1117         WORD16 b1 = (a0 - a1) >> 1;
1118         WORD16 b2 = (a2 + a3) >> 1;
1119         WORD16 b3 = (a2 - a3) >> 1;
1120 
1121         pi2_dst0[dst_idx] = b0 + b2;
1122         pi2_dst1[dst_idx] = b1 + b3;
1123         pi2_dst2[dst_idx] = b0 - b2;
1124         pi2_dst3[dst_idx] = b1 - b3;
1125 
1126         /* Make the value of dst to zerp, if it falls below the dead-zone */
1127         if(ABS(pi2_dst0[dst_idx]) > threshold)
1128             *pi4_cbf = 1;
1129         if(ABS(pi2_dst1[dst_idx]) > threshold)
1130             *pi4_cbf = 1;
1131         if(ABS(pi2_dst2[dst_idx]) > threshold)
1132             *pi4_cbf = 1;
1133         if(ABS(pi2_dst3[dst_idx]) > threshold)
1134             *pi4_cbf = 1;
1135 
1136         u4_satd += ABS(pi2_dst0[dst_idx]);
1137         u4_satd += ABS(pi2_dst1[dst_idx]);
1138         u4_satd += ABS(pi2_dst2[dst_idx]);
1139         u4_satd += ABS(pi2_dst3[dst_idx]);
1140     }
1141 
1142     /* return 16x16 satd */
1143     return (u4_satd);
1144 }
1145 
1146 /**
1147 *******************************************************************************
1148 *
1149 * @brief
1150 *    Hadamard Transform for 16x16 block with 8x8 and 4x4 SATD updates.
1151 *    Uses recursive 8x8 had output to compute satd for 16x16 and its children
1152 *
1153 * @par Description:
1154 *
1155 * @param[in] pu1_origin
1156 *  UWORD8 pointer to the current block
1157 *
1158 * @param[in] src_strd
1159 *  WORD32 Source stride
1160 *
1161 * @param[in] pu1_pred
1162 *  UWORD8 pointer to the prediction block
1163 *
1164 * @param[in] pred_strd
1165 *  WORD32 Pred stride
1166 *
1167 * @param[out] pi2_dst
1168 *  WORD16 pointer to the transform output block
1169 *
1170 * @param[out] dst_strd
1171 *  WORD32 Destination stride
1172 *
1173 * @param[out] ppi4_hsad
1174 *   pointer to base pointers for storing hadmard sads of various
1175 *   block sizes (4x4 to 32x32)
1176 *
1177 * @param[in] pos_x_y_4x4
1178 *   Denotes packed x,y postion of current 4x4 block w.r.t to start of ctb/CU/MB
1179 *   Lower 16bits denote xpos and upper 16ypos of the 4x4block
1180 *
1181 * @param[in] num_4x4_in_row
1182 *   Denotes the number of current 4x4 blocks in a ctb/CU/MB
1183 *
1184 * @param[in] lambda
1185 *  lambda values is the cost factor calculated based on QP
1186 *
1187 * @param[in] lambda_q_shift
1188 *  lambda_q_shift used to reverse the lambda value back from q8 format
1189 *
1190 * @param[in] depth
1191 *  depth gives the current TU depth with respect to the CU
1192 *
1193 * @param[in] i4_frm_qstep
1194 *  frm_qstep value based on the which the threshold value is calculated
1195 *
1196 * @returns
1197 *
1198 * @remarks
1199 *
1200 *******************************************************************************
1201 */
1202 
ihevce_had_16x16_r(UWORD8 * pu1_src,WORD32 src_strd,UWORD8 * pu1_pred,WORD32 pred_strd,WORD16 * pi2_dst,WORD32 dst_strd,WORD32 ** ppi4_hsad,WORD32 ** ppi4_tu_split,WORD32 ** ppi4_tu_early_cbf,WORD32 pos_x_y_4x4,WORD32 num_4x4_in_row,WORD32 lambda,WORD32 lambda_q_shift,WORD32 i4_frm_qstep,WORD32 i4_cur_depth,WORD32 i4_max_depth,WORD32 i4_max_tr_size,WORD32 * pi4_tu_split_cost,void * pv_func_sel)1203 WORD32 ihevce_had_16x16_r(
1204     UWORD8 *pu1_src,
1205     WORD32 src_strd,
1206     UWORD8 *pu1_pred,
1207     WORD32 pred_strd,
1208     WORD16 *pi2_dst,
1209     WORD32 dst_strd,
1210     WORD32 **ppi4_hsad,
1211     WORD32 **ppi4_tu_split,
1212     WORD32 **ppi4_tu_early_cbf,
1213     WORD32 pos_x_y_4x4,
1214     WORD32 num_4x4_in_row,
1215     WORD32 lambda,
1216     WORD32 lambda_q_shift,
1217     WORD32 i4_frm_qstep,
1218     WORD32 i4_cur_depth,
1219     WORD32 i4_max_depth,
1220     WORD32 i4_max_tr_size,
1221     WORD32 *pi4_tu_split_cost,
1222     void *pv_func_sel)
1223 {
1224     WORD16 ai2_8x8_had[256];
1225     WORD32 *pi4_16x16_hsad;
1226     WORD32 *pi4_16x16_tu_split;
1227 
1228     WORD32 *pi4_16x16_tu_early_cbf;
1229 
1230     UWORD32 u4_satd = 0;
1231     WORD32 tu_split_flag = 0;
1232     WORD32 i4_early_cbf_flag = 0, early_cbf = 0;
1233     const UWORD8 u1_cur_tr_size = 16;
1234 
1235     /* cost_parent : Stores the cost of the parent HAD transform (16x16) */
1236     /* cost_child : Stores the cost of the child HAD transform (16x16) */
1237     WORD32 cost_parent = 0, cost_child = 0;
1238 
1239     /*best_cost returns the best cost at the end of the function */
1240     /*tu_split denoes whether the TU (16x16)is split or not */
1241     WORD32 best_cost = 0, best_cost_tu_split;
1242     WORD32 i;
1243 
1244     WORD16 *pi2_y0;
1245     UWORD8 *pu1_src0;
1246     UWORD8 *pu1_pred0;
1247     WORD32 pos_x_y_4x4_0;
1248 
1249     WORD32 pos_x = pos_x_y_4x4 & 0xFFFF;
1250     WORD32 pos_y = (pos_x_y_4x4 >> 16) & 0xFFFF;
1251 
1252     ASSERT(pos_x >= 0);
1253     ASSERT(pos_y >= 0);
1254 
1255     /* Initialize pointers to  store 16x16 SATDs */
1256     pi4_16x16_hsad = ppi4_hsad[HAD_16x16] + (pos_x >> 2) + (pos_y >> 2) * (num_4x4_in_row >> 2);
1257 
1258     pi4_16x16_tu_split =
1259         ppi4_tu_split[HAD_16x16] + (pos_x >> 2) + (pos_y >> 2) * (num_4x4_in_row >> 2);
1260 
1261     pi4_16x16_tu_early_cbf =
1262         ppi4_tu_early_cbf[HAD_16x16] + (pos_x >> 2) + (pos_y >> 2) * (num_4x4_in_row >> 2);
1263 
1264     /* -------- Compute four 8x8 HAD Transforms of 16x16 call--------- */
1265     for(i = 0; i < 4; i++)
1266     {
1267         pu1_src0 = pu1_src + (i & 0x01) * 8 + (i >> 1) * src_strd * 8;
1268         pu1_pred0 = pu1_pred + (i & 0x01) * 8 + (i >> 1) * pred_strd * 8;
1269         pi2_y0 = ai2_8x8_had + (i & 0x01) * 8 + (i >> 1) * 16 * 8;
1270         pos_x_y_4x4_0 = pos_x_y_4x4 + (i & 0x01) * 2 + (i >> 1) * (2 << 16);
1271 
1272         best_cost_tu_split = ihevce_had_8x8_using_4_4x4_r(
1273             pu1_src0,
1274             src_strd,
1275             pu1_pred0,
1276             pred_strd,
1277             pi2_y0,
1278             16,
1279             ppi4_hsad,
1280             ppi4_tu_split,
1281             ppi4_tu_early_cbf,
1282             pos_x_y_4x4_0,
1283             num_4x4_in_row,
1284             lambda,
1285             lambda_q_shift,
1286             i4_frm_qstep,
1287             i4_cur_depth + 1,
1288             i4_max_depth,
1289             i4_max_tr_size,
1290             pi4_tu_split_cost,
1291             pv_func_sel);
1292 
1293         /* Cost is shifted by two bits for Tu_split_flag and early cbf flag */
1294         best_cost = (best_cost_tu_split >> 2);
1295 
1296         /* Last but one bit stores the information regarding the TU_Split */
1297         tu_split_flag += (best_cost_tu_split & 0x3) >> 1;
1298 
1299         /* Last bit stores the information regarding the early_cbf */
1300         i4_early_cbf_flag += (best_cost_tu_split & 0x1);
1301 
1302         cost_child += best_cost;
1303 
1304         tu_split_flag <<= 1;
1305         i4_early_cbf_flag <<= 1;
1306     }
1307 
1308     /* -------- Compute 16x16 HAD Transform using 8x8 results ------------- */
1309     pi2_y0 = ai2_8x8_had;
1310 
1311     /* Threshold currently passed as "0" */
1312     u4_satd =
1313         ihevce_compute_16x16HAD_using_8x8(pi2_y0, 16, pi2_dst, dst_strd, i4_frm_qstep, &early_cbf);
1314 
1315     /* store the normalized satd */
1316     cost_parent = ((u4_satd + 4) >> 3);
1317 
1318     /* 4 TU_Split flags , 4 CBF Flags, extra 1 becoz of the 0.5 bits per bin is assumed */
1319     cost_child += ((4 + 4) * lambda) >> (lambda_q_shift + 1);
1320 
1321     i4_early_cbf_flag += early_cbf;
1322 
1323     /* Right now the depth is hard-coded to 4: The depth can be modified from the config file
1324     which decides the extent to which TU_REC needs to be done */
1325     if(i4_cur_depth < i4_max_depth)
1326     {
1327         if((cost_child < cost_parent) || (i4_max_tr_size < u1_cur_tr_size))
1328         {
1329             //cost_child -= ((4 + 4)  * lambda) >> (lambda_q_shift + 1);
1330             *pi4_tu_split_cost += ((4 + 4) * lambda) >> (lambda_q_shift + 1);
1331             tu_split_flag += 1;
1332             best_cost = cost_child;
1333         }
1334         else
1335         {
1336             //cost_parent -= ((1 + 1) * lambda) >>  (lambda_q_shift + 1);
1337             tu_split_flag += 0;
1338             best_cost = cost_parent;
1339         }
1340     }
1341     else
1342     {
1343         //cost_parent -= ((1 + 1) * lambda) >>  (lambda_q_shift + 1);
1344         tu_split_flag += 0;
1345         best_cost = cost_parent;
1346     }
1347 
1348     pi4_16x16_hsad[0] = best_cost;
1349     pi4_16x16_tu_split[0] = tu_split_flag;
1350     pi4_16x16_tu_early_cbf[0] = i4_early_cbf_flag;
1351 
1352     /*returning two values(best cost & tu_split_flag) as a single value*/
1353     return ((best_cost << 10) + (tu_split_flag << 5) + i4_early_cbf_flag);
1354 }
1355 
1356 //#endif
1357 /**
1358 *******************************************************************************
1359 *
1360 * @brief
1361 *   Computes 32x32 transform using children 16x16 hadamard results
1362 *
1363 * @par Description:
1364 *
1365 * @param[in] pi2_16x16_had
1366 *  WORD16 pointer to 16x16 hadamard buffer(y0, y1, y2, y3 hadmard in Zscan order)
1367 *
1368 * @param[in] had16_strd
1369 *  stride of 16x16 hadmard buffer pi2_y0, pi2_y1, pi2_y2, pi2_y3
1370 *
1371 * @param[out] pi2_dst
1372 *  destination buffer where 16x16 hadamard result is stored
1373 *
1374 * @param[in] dst_stride
1375 *  stride of destination block
1376 *
1377 * @param[in] i4_frm_qstep
1378 *  frm_qstep value based on the which the threshold value is calculated
1379 *
1380 * @returns
1381 *  32x32 Hadamard SATD
1382 * @remarks
1383 *
1384 *******************************************************************************
1385 */
1386 //#if COMPUTE_32x32_USING_16X16 == C
ihevce_compute_32x32HAD_using_16x16(WORD16 * pi2_16x16_had,WORD32 had16_strd,WORD16 * pi2_dst,WORD32 dst_strd,WORD32 i4_frm_qstep,WORD32 * pi4_cbf)1387 UWORD32 ihevce_compute_32x32HAD_using_16x16(
1388     WORD16 *pi2_16x16_had,
1389     WORD32 had16_strd,
1390     WORD16 *pi2_dst,
1391     WORD32 dst_strd,
1392     WORD32 i4_frm_qstep,
1393     WORD32 *pi4_cbf)
1394 {
1395     /* Qstep value is right shifted by 8 */
1396     WORD32 threshold = (i4_frm_qstep >> 8);
1397 
1398     /* Initialize pointers to 4 subblocks of 8x8 HAD buffer */
1399     WORD16 *pi2_y0 = pi2_16x16_had;
1400     WORD16 *pi2_y1 = pi2_16x16_had + 16;
1401     WORD16 *pi2_y2 = pi2_16x16_had + had16_strd * 16;
1402     WORD16 *pi2_y3 = pi2_16x16_had + had16_strd * 16 + 16;
1403 
1404     /* Initialize pointers to store 8x8 HAD output */
1405     WORD16 *pi2_dst0 = pi2_dst;
1406     WORD16 *pi2_dst1 = pi2_dst + 16;
1407     WORD16 *pi2_dst2 = pi2_dst + dst_strd * 16;
1408     WORD16 *pi2_dst3 = pi2_dst + dst_strd * 16 + 16;
1409 
1410     UWORD32 u4_satd = 0;
1411     WORD32 i;
1412 
1413     /*   Child HAD results combined as follows to get Parent result */
1414     /*  _                                                 _         */
1415     /* |  (y0 + y1) + (y2 + y3)    (y0 - y1) + (y2 - y3)   |        */
1416     /* |  (y0 + y1) - (y2 + y3)    (y0 - y1) - (y2 - y3)   |        */
1417     /* \-                                                 -/        */
1418     for(i = 0; i < 256; i++)
1419     {
1420         WORD32 src_idx = (i >> 4) * had16_strd + (i % 16);
1421         WORD32 dst_idx = (i >> 4) * dst_strd + (i % 16);
1422 
1423         WORD16 a0 = pi2_y0[src_idx] >> 2;
1424         WORD16 a1 = pi2_y1[src_idx] >> 2;
1425         WORD16 a2 = pi2_y2[src_idx] >> 2;
1426         WORD16 a3 = pi2_y3[src_idx] >> 2;
1427 
1428         WORD16 b0 = (a0 + a1);
1429         WORD16 b1 = (a0 - a1);
1430         WORD16 b2 = (a2 + a3);
1431         WORD16 b3 = (a2 - a3);
1432 
1433         pi2_dst0[dst_idx] = b0 + b2;
1434         pi2_dst1[dst_idx] = b1 + b3;
1435         pi2_dst2[dst_idx] = b0 - b2;
1436         pi2_dst3[dst_idx] = b1 - b3;
1437 
1438         /* Make the value of dst to zerp, if it falls below the dead-zone */
1439         if(ABS(pi2_dst0[dst_idx]) > threshold)
1440             *pi4_cbf = 1;
1441         if(ABS(pi2_dst1[dst_idx]) > threshold)
1442             *pi4_cbf = 1;
1443         if(ABS(pi2_dst2[dst_idx]) > threshold)
1444             *pi4_cbf = 1;
1445         if(ABS(pi2_dst3[dst_idx]) > threshold)
1446             *pi4_cbf = 1;
1447 
1448         u4_satd += ABS(pi2_dst0[dst_idx]);
1449         u4_satd += ABS(pi2_dst1[dst_idx]);
1450         u4_satd += ABS(pi2_dst2[dst_idx]);
1451         u4_satd += ABS(pi2_dst3[dst_idx]);
1452     }
1453 
1454     /* return 32x32 satd */
1455     return (u4_satd);
1456 }
1457 //#endif
1458 
1459 /**
1460 *******************************************************************************
1461 *
1462 * @brief
1463 *    Hadamard Transform for 32x32 block with 16x6, 8x8 and 4x4 SATD updates.
1464 *    Uses recursive 16x16 had output to compute satd for 32x32 and its children
1465 *
1466 * @par Description:
1467 *
1468 * @param[in] pu1_origin
1469 *  UWORD8 pointer to the current block
1470 *
1471 * @param[in] src_strd
1472 *  WORD32 Source stride
1473 *
1474 * @param[in] pu1_pred
1475 *  UWORD8 pointer to the prediction block
1476 *
1477 * @param[in] pred_strd
1478 *  WORD32 Pred stride
1479 *
1480 * @param[out] pi2_dst
1481 *  WORD16 pointer to the transform output block
1482 *
1483 * @param[out] dst_strd
1484 *  WORD32 Destination stride
1485 *
1486 * @param[out] ppi4_hsad
1487 *   pointer to base pointers for storing hadmard sads of various
1488 *   block sizes (4x4 to 32x32)
1489 *
1490 * @param[in] pos_x_y_4x4
1491 *   Denotes packed x,y postion of current 4x4 block w.r.t to start of ctb/CU/MB
1492 *   Lower 16bits denote xpos and upper 16ypos of the 4x4block
1493 *
1494 * @param[in] num_4x4_in_row
1495 *   Denotes the number of current 4x4 blocks in a ctb/CU/MB
1496 *
1497 * @param[in] lambda
1498 *  lambda values is the cost factor calculated based on QP
1499 *
1500 * @param[in] lambda_q_shift
1501 *  lambda_q_shift used to reverse the lambda value back from q8 format
1502 *
1503 * @param[in] depth
1504 *  depth gives the current TU depth with respect to the CU
1505 *
1506 * @param[in] i4_frm_qstep
1507 *  frm_qstep value based on the which the threshold value is calculated
1508 *
1509 *
1510 * @returns
1511 *
1512 * @remarks
1513 *
1514 *******************************************************************************
1515 */
ihevce_had_32x32_r(UWORD8 * pu1_src,WORD32 src_strd,UWORD8 * pu1_pred,WORD32 pred_strd,WORD16 * pi2_dst,WORD32 dst_strd,WORD32 ** ppi4_hsad,WORD32 ** ppi4_tu_split,WORD32 ** ppi4_tu_early_cbf,WORD32 pos_x_y_4x4,WORD32 num_4x4_in_row,WORD32 lambda,WORD32 lambda_q_shift,WORD32 i4_frm_qstep,WORD32 i4_cur_depth,WORD32 i4_max_depth,WORD32 i4_max_tr_size,WORD32 * pi4_tu_split_cost,me_func_selector_t * ps_func_selector)1516 void ihevce_had_32x32_r(
1517     UWORD8 *pu1_src,
1518     WORD32 src_strd,
1519     UWORD8 *pu1_pred,
1520     WORD32 pred_strd,
1521     WORD16 *pi2_dst,
1522     WORD32 dst_strd,
1523     WORD32 **ppi4_hsad,
1524     WORD32 **ppi4_tu_split,
1525     WORD32 **ppi4_tu_early_cbf,
1526     WORD32 pos_x_y_4x4,
1527     WORD32 num_4x4_in_row,
1528     WORD32 lambda,
1529     WORD32 lambda_q_shift,
1530     WORD32 i4_frm_qstep,
1531     WORD32 i4_cur_depth,
1532     WORD32 i4_max_depth,
1533     WORD32 i4_max_tr_size,
1534     WORD32 *pi4_tu_split_cost,
1535     me_func_selector_t *ps_func_selector)
1536 
1537 {
1538     WORD16 ai2_16x16_had[1024];
1539     WORD32 *pi4_32x32_hsad;
1540     WORD32 *pi4_32x32_tu_split;
1541     WORD32 *pi4_32x32_tu_early_cbf;
1542 
1543     WORD32 pos_x = pos_x_y_4x4 & 0xFFFF;
1544     WORD32 pos_y = (pos_x_y_4x4 >> 16) & 0xFFFF;
1545     WORD32 tu_split_flag = 0;
1546     const UWORD8 u1_cur_tr_size = 32;
1547     WORD32 i4_early_cbf_flag = 0, early_cbf = 0;
1548 
1549     /* cost_parent : Stores the cost of the parent HAD transform (16x16) */
1550     /* cost_child : Stores the cost of the child HAD transform (16x16) */
1551     WORD32 cost_child = 0, cost_parent = 0;
1552 
1553     /*retuned as the best cost for the entire TU (32x32) */
1554     WORD32 best_cost = 0;
1555     /*captures the best cost and tu_split at child level */
1556     WORD32 best_cost_tu_split;
1557 
1558     /* Initialize pointers to 4 8x8 blocks in 16x16 */
1559     WORD16 *pi2_y0 = ai2_16x16_had;
1560     WORD16 *pi2_y1 = ai2_16x16_had + 16;
1561     WORD16 *pi2_y2 = ai2_16x16_had + 32 * 16;
1562     WORD16 *pi2_y3 = ai2_16x16_had + 32 * 16 + 16;
1563 
1564     UWORD8 *pu1_src0 = pu1_src;
1565     UWORD8 *pu1_src1 = pu1_src + 16;
1566     UWORD8 *pu1_src2 = pu1_src + src_strd * 16;
1567     UWORD8 *pu1_src3 = pu1_src + src_strd * 16 + 16;
1568 
1569     UWORD8 *pu1_pred0 = pu1_pred;
1570     UWORD8 *pu1_pred1 = pu1_pred + 16;
1571     UWORD8 *pu1_pred2 = pu1_pred + pred_strd * 16;
1572     UWORD8 *pu1_pred3 = pu1_pred + pred_strd * 16 + 16;
1573 
1574     ASSERT(pos_x >= 0);
1575     ASSERT(pos_y >= 0);
1576 
1577     /* Initialize pointers to store 32x32 SATDs */
1578     pi4_32x32_hsad = ppi4_hsad[HAD_32x32] + (pos_x >> 3) + (pos_y >> 3) * (num_4x4_in_row >> 3);
1579 
1580     pi4_32x32_tu_split =
1581         ppi4_tu_split[HAD_32x32] + (pos_x >> 3) + (pos_y >> 3) * (num_4x4_in_row >> 3);
1582 
1583     pi4_32x32_tu_early_cbf =
1584         ppi4_tu_early_cbf[HAD_32x32] + (pos_x >> 3) + (pos_y >> 3) * (num_4x4_in_row >> 3);
1585 
1586     /* -------- Compute four 8x8 HAD Transforms of 16x16 call--------- */
1587     best_cost_tu_split = ps_func_selector->pf_had_16x16_r(
1588         pu1_src0,
1589         src_strd,
1590         pu1_pred0,
1591         pred_strd,
1592         pi2_y0,
1593         32,
1594         ppi4_hsad,
1595         ppi4_tu_split,
1596         ppi4_tu_early_cbf,
1597         pos_x_y_4x4,
1598         num_4x4_in_row,
1599         lambda,
1600         lambda_q_shift,
1601         i4_frm_qstep,
1602         i4_cur_depth + 1,
1603         i4_max_depth,
1604         i4_max_tr_size,
1605         pi4_tu_split_cost,
1606         NULL);
1607 
1608     /* cost is shifted by 10bits */
1609     best_cost = best_cost_tu_split >> 10;
1610 
1611     /* Tu split is present in the 6-10 bits */
1612     tu_split_flag += (best_cost_tu_split & 0x3E0) >> 5;
1613 
1614     /*Early CBF info is present in the last 5 bits */
1615     i4_early_cbf_flag += best_cost_tu_split & 0x1F;
1616 
1617     tu_split_flag <<= 5;
1618     i4_early_cbf_flag <<= 5;
1619 
1620     cost_child += best_cost;
1621 
1622     best_cost_tu_split = ps_func_selector->pf_had_16x16_r(
1623         pu1_src1,
1624         src_strd,
1625         pu1_pred1,
1626         pred_strd,
1627         pi2_y1,
1628         32,
1629         ppi4_hsad,
1630         ppi4_tu_split,
1631         ppi4_tu_early_cbf,
1632         pos_x_y_4x4 + 4,
1633         num_4x4_in_row,
1634         lambda,
1635         lambda_q_shift,
1636         i4_frm_qstep,
1637         i4_cur_depth + 1,
1638         i4_max_depth,
1639         i4_max_tr_size,
1640         pi4_tu_split_cost,
1641         NULL);
1642 
1643     /* cost is shifted by 10bits */
1644     best_cost = best_cost_tu_split >> 10;
1645 
1646     /* Tu split is present in the 6-10 bits */
1647     tu_split_flag += (best_cost_tu_split & 0x3E0) >> 5;
1648 
1649     /*Early CBF info is present in the last 5 bits */
1650     i4_early_cbf_flag += best_cost_tu_split & 0x1F;
1651 
1652     tu_split_flag <<= 5;
1653     i4_early_cbf_flag <<= 5;
1654 
1655     cost_child += best_cost;
1656 
1657     best_cost_tu_split = ps_func_selector->pf_had_16x16_r(
1658         pu1_src2,
1659         src_strd,
1660         pu1_pred2,
1661         pred_strd,
1662         pi2_y2,
1663         32,
1664         ppi4_hsad,
1665         ppi4_tu_split,
1666         ppi4_tu_early_cbf,
1667         pos_x_y_4x4 + (4 << 16),
1668         num_4x4_in_row,
1669         lambda,
1670         lambda_q_shift,
1671         i4_frm_qstep,
1672         i4_cur_depth + 1,
1673         i4_max_depth,
1674         i4_max_tr_size,
1675         pi4_tu_split_cost,
1676         NULL);
1677 
1678     /* cost is shifted by 10bits */
1679     best_cost = best_cost_tu_split >> 10;
1680 
1681     /* Tu split is present in the 6-10 bits */
1682     tu_split_flag += (best_cost_tu_split & 0x3E0) >> 5;
1683 
1684     /*Early CBF info is present in the last 5 bits */
1685     i4_early_cbf_flag += best_cost_tu_split & 0x1F;
1686 
1687     tu_split_flag <<= 5;
1688     i4_early_cbf_flag <<= 5;
1689 
1690     cost_child += best_cost;
1691 
1692     best_cost_tu_split = ps_func_selector->pf_had_16x16_r(
1693         pu1_src3,
1694         src_strd,
1695         pu1_pred3,
1696         pred_strd,
1697         pi2_y3,
1698         32,
1699         ppi4_hsad,
1700         ppi4_tu_split,
1701         ppi4_tu_early_cbf,
1702         pos_x_y_4x4 + (4 << 16) + 4,
1703         num_4x4_in_row,
1704         lambda,
1705         lambda_q_shift,
1706         i4_frm_qstep,
1707         i4_cur_depth + 1,
1708         i4_max_depth,
1709         i4_max_tr_size,
1710         pi4_tu_split_cost,
1711         NULL);
1712 
1713     /* cost is shifted by 10bits */
1714     best_cost = best_cost_tu_split >> 10;
1715 
1716     /* Tu split is present in the 6-10 bits */
1717     tu_split_flag += (best_cost_tu_split & 0x3E0) >> 5;
1718 
1719     /*Early CBF info is present in the last 5 bits */
1720     i4_early_cbf_flag += best_cost_tu_split & 0x1F;
1721 
1722     tu_split_flag <<= 1;
1723     i4_early_cbf_flag <<= 1;
1724 
1725     cost_child += best_cost;
1726 
1727     {
1728         UWORD32 u4_satd = 0;
1729 
1730         u4_satd = ps_func_selector->pf_compute_32x32HAD_using_16x16(
1731             pi2_y0, 32, pi2_dst, dst_strd, i4_frm_qstep, &early_cbf);
1732 
1733         cost_parent = ((u4_satd + 2) >> 2);
1734     }
1735 
1736     /* 4 TU_Split flags , 4 CBF Flags*/
1737     cost_child += ((4 + 4) * lambda) >> (lambda_q_shift + 1);
1738 
1739     i4_early_cbf_flag += early_cbf;
1740 
1741     /* 1 TU_SPlit flag, 1 CBF flag */
1742     //cost_parent += ((1 + 1)* lambda) >>  (lambda_q_shift + 1);
1743 
1744     if(i4_cur_depth < i4_max_depth)
1745     {
1746         if((cost_child < cost_parent) || (u1_cur_tr_size > i4_max_tr_size))
1747         {
1748             *pi4_tu_split_cost += ((4 + 4) * lambda) >> (lambda_q_shift + 1);
1749             best_cost = cost_child;
1750             tu_split_flag++;
1751         }
1752         else
1753         {
1754             tu_split_flag = 0;
1755             best_cost = cost_parent;
1756         }
1757     }
1758     else
1759     {
1760         tu_split_flag = 0;
1761         best_cost = cost_parent;
1762     }
1763 
1764     pi4_32x32_tu_split[0] = tu_split_flag;
1765 
1766     pi4_32x32_hsad[0] = best_cost;
1767 
1768     pi4_32x32_tu_early_cbf[0] = i4_early_cbf_flag;
1769 }
1770