• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /******************************************************************************
2  *
3  * Copyright (C) 2022 The Android Open Source Project
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at:
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  *
17  *****************************************************************************
18  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19  */
20 /**
21  *******************************************************************************
22  * @file
23  *  ih264_resi_trans_quant.c
24  *
25  * @brief
26  *  Contains function definitions single stage  forward transform for H.264
27  *  It will calculate the residue, do the cf and then do quantization
28  *
29  * @author
30  *  Ittiam
31  *
32  * @par List of Functions:
33  *  - ih264_resi_trans_quant_4x4()
34  *  - ih264_resi_trans_quant_chroma_4x4
35  *  - ih264_hadamard_quant_4x4
36  *  - ih264_hadamard_quant_2x2_uv
37  *  - ih264_resi_trans_quant_8x8
38  *
39  * @remarks
40  *******************************************************************************
41  */
42 /* System include files */
43 #include <stdbool.h>
44 #include <stddef.h>
45 
46 /* User include files */
47 #include "ih264_typedefs.h"
48 #include "ih264_defs.h"
49 #include "ih264_size_defs.h"
50 #include "ih264_macros.h"
51 #include "ih264_trans_macros.h"
52 #include "ih264_trans_data.h"
53 #include "ih264_structs.h"
54 #include "isvc_trans_quant_itrans_iquant.h"
55 
isvc_subtract_upsampled_res(WORD16 i2_residue,WORD16 i2_upsampled_res)56 static FORCEINLINE WORD16 isvc_subtract_upsampled_res(WORD16 i2_residue, WORD16 i2_upsampled_res)
57 {
58     return (CLIP3(-((WORD16) UINT8_MAX), ((WORD16) UINT8_MAX), i2_residue - i2_upsampled_res));
59 }
60 
61 /**
62  *******************************************************************************
63  *
64  * @brief
65  *   This function performs forward transform and quantization on a 4*4 block
66  *
67  * @par Description:
68  *   The function accepts source buffer and estimation buffer. From these, it
69  *   computes the residue. This residue is then transformed and quantized.
70  *   The transform and quantization are computed in place, using the residue
71  *   buffer for this.
72  *
73  * @param[in] pu1_src
74  *   Pointer to source sub-block
75  *
76  * @param[in] pu1_pred
77  *   Pointer to prediction sub-block
78  *
79  * @param[in] pi2_out
80  *   Pointer to residual sub-block
81  *
82  * @param[in] i4_src_stride
83  *   Source stride
84  *
85  * @param[in] i4_pred_stride
86  *   Prediction stride
87  *
88  * @param[in] dst_strd
89  *   Destination stride
90  *
91  * @param[in] u4_qbits
92  *    QP_BITS_h264_4x4 + floor(QP/6)
93  *
94  * @param[in] pu2_threshold_matrix
95  *   Pointer to Forward Quant Threshold Matrix
96  *
97  * @param[in] pu2_scale_matrix
98  *   Pointer to Forward Quant Scale Matrix
99  *
100  * @param[in] u4_round_factor
101  *   Quantization Round factor
102  *
103  * @param[out] pu1_nnz
104  *   Total non-zero coefficients in the current sub-block
105  *
106  * @returns
107  *
108  * @remarks
109  *   None
110  *
111  *******************************************************************************
112  */
void isvc_resi_trans_quant_4x4(buffer_container_t *ps_src, buffer_container_t *ps_pred,
                               buffer_container_t *ps_out, buffer_container_t *ps_upsampled_res,
                               resi_trans_quant_constants_t *ps_quant_constants, UWORD8 *pu1_nnz,
                               WORD16 *pi2_dc_out, UWORD8 u1_use_upsampled_res)
{
    UWORD32 i;
    WORD32 x0, x1, x2, x3, x4, x5, x6, x7;
    WORD32 i4_value;

    UWORD8 *pu1_src = ps_src->pv_data;
    UWORD8 *pu1_pred = ps_pred->pv_data;
    WORD16 *pi2_out = ps_out->pv_data;
    /* ps_upsampled_res is optional: it is only dereferenced when
     * u1_use_upsampled_res is non-zero */
    WORD16 *pi2_upsampled_res = ps_upsampled_res ? ps_upsampled_res->pv_data : NULL;
    WORD32 i4_src_stride = ps_src->i4_data_stride;
    WORD32 i4_pred_stride = ps_pred->i4_data_stride;
    WORD32 i4_upsampled_res_stride = ps_upsampled_res ? ps_upsampled_res->i4_data_stride : 0;
    WORD16 *pi2_out_tmp = pi2_out;
    UWORD32 u4_nonzero_coeff = 0;
    const UWORD16 *pu2_scale_matrix = ps_quant_constants->pu2_scale_matrix;
    const UWORD16 *pu2_threshold_matrix = ps_quant_constants->pu2_threshold_matrix;
    UWORD32 u4_qbits = ps_quant_constants->u4_qbits;
    UWORD32 u4_round_factor = ps_quant_constants->u4_round_factor;

    /* Pass 1: residue computation + horizontal 4-point transform, row by row.
     * The transform is done in place in the output buffer. */
    for(i = 0; i < SUB_BLK_WIDTH_4x4; i++)
    {
        /* computing prediction error (residue) */
        x4 = pu1_src[0] - pu1_pred[0];
        x5 = pu1_src[1] - pu1_pred[1];
        x6 = pu1_src[2] - pu1_pred[2];
        x7 = pu1_src[3] - pu1_pred[3];

        if(u1_use_upsampled_res)
        {
            /* SVC inter-layer residual prediction: code only the clipped
             * difference w.r.t. the upsampled base-layer residual */
            x4 = isvc_subtract_upsampled_res(x4, pi2_upsampled_res[0]);
            x5 = isvc_subtract_upsampled_res(x5, pi2_upsampled_res[1]);
            x6 = isvc_subtract_upsampled_res(x6, pi2_upsampled_res[2]);
            x7 = isvc_subtract_upsampled_res(x7, pi2_upsampled_res[3]);
        }

        /* Horizontal transform (H.264 4x4 forward core transform butterfly) */
        x0 = x4 + x7;
        x1 = x5 + x6;
        x2 = x5 - x6;
        x3 = x4 - x7;

        pi2_out_tmp[0] = x0 + x1;
        /* 'x3 * 2' instead of 'x3 << 1': left-shifting a negative value is
         * undefined behaviour in C; the multiply is bit-identical here */
        pi2_out_tmp[1] = (x3 * 2) + x2;
        pi2_out_tmp[2] = x0 - x1;
        pi2_out_tmp[3] = x3 - (x2 * 2);

        /* pointing to next row */
        pu1_src += i4_src_stride;
        pu1_pred += i4_pred_stride;
        pi2_out_tmp += 4;

        if(u1_use_upsampled_res)
        {
            /* Advance only when the buffer is in use: pointer arithmetic on
             * a NULL pointer (even adding 0) is undefined behaviour */
            pi2_upsampled_res += i4_upsampled_res_stride;
        }
    }

    pi2_out_tmp = pi2_out;

    /* Pass 2: vertical transform and in-place quantization, column by column */
    for(i = 0; i < SUB_BLK_WIDTH_4x4; i++)
    {
        x4 = pi2_out_tmp[0];
        x5 = pi2_out_tmp[4];
        x6 = pi2_out_tmp[8];
        x7 = pi2_out_tmp[12];

        x0 = x4 + x7;
        x1 = x5 + x6;
        x2 = x5 - x6;
        x3 = x4 - x7;

        /* quantization is done in place */

        i4_value = x0 + x1;

        if(i == 0)
        {
            /* Report the unquantized DC coefficient to the caller */
            (*pi2_dc_out) = i4_value;
        }

        FWD_QUANT(i4_value, pu2_threshold_matrix[0], pu2_scale_matrix[0], u4_round_factor, u4_qbits,
                  u4_nonzero_coeff);
        pi2_out_tmp[0] = i4_value;

        i4_value = (x3 * 2) + x2;
        FWD_QUANT(i4_value, pu2_threshold_matrix[4], pu2_scale_matrix[4], u4_round_factor, u4_qbits,
                  u4_nonzero_coeff);
        pi2_out_tmp[4] = i4_value;

        i4_value = x0 - x1;
        FWD_QUANT(i4_value, pu2_threshold_matrix[8], pu2_scale_matrix[8], u4_round_factor, u4_qbits,
                  u4_nonzero_coeff);
        pi2_out_tmp[8] = i4_value;

        i4_value = x3 - (x2 * 2);
        FWD_QUANT(i4_value, pu2_threshold_matrix[12], pu2_scale_matrix[12], u4_round_factor,
                  u4_qbits, u4_nonzero_coeff);
        pi2_out_tmp[12] = i4_value;

        pi2_out_tmp++;
        pu2_scale_matrix++;
        pu2_threshold_matrix++;
    }

    /* Return total nonzero coefficients in the current sub block */
    *pu1_nnz = u4_nonzero_coeff;
}
221 
222 /**
223  *******************************************************************************
224  *
225  * @brief
226  *   This function performs forward transform and quantization on a 4*4 chroma
227  *block with interleaved values
228  *
229  * @par Description:
230  *   The function accepts source buffer and estimation buffer. From these, it
231  *   computes the residue. This residue is then transformed and quantized.
232  *   The transform and quantization are computed in place, using the residue
233  *   buffer for this.
234  *
235  * @param[in] pu1_src
236  *   Pointer to source sub-block
237  *
238  * @param[in] pu1_pred
239  *   Pointer to prediction sub-block
240  *
241  * @param[in] pi2_out
242  *   Pointer to residual sub-block
243  *
244  * @param[in] i4_src_stride
245  *   Source stride
246  *
247  * @param[in] i4_pred_stride
248  *   Prediction stride
249  *
250  * @param[in] dst_strd
251  *   Destination stride
252  *
253  * @param[in] u4_qbits
254  *    QP_BITS_h264_4x4 + floor(QP/6)
255  *
256  * @param[in] pu2_threshold_matrix
257  *   Pointer to Forward Quant Threshold Matrix
258  *
259  * @param[in] pu2_scale_matrix
260  *   Pointer to Forward Quant Scale Matrix
261  *
262  * @param[in] u4_round_factor
263  *   Quantization Round factor
264  *
265  * @param[out] pu1_nnz
266  *   Total non-zero coefficients in the current sub-block
267  *
268  * @returns
269  *
270  * @remarks
271  *   None
272  *
273  *******************************************************************************
274  */
void isvc_resi_trans_quant_chroma_4x4(buffer_container_t *ps_src, buffer_container_t *ps_pred,
                                      buffer_container_t *ps_out,
                                      buffer_container_t *ps_upsampled_res,
                                      resi_trans_quant_constants_t *ps_quant_constants,
                                      UWORD8 *pu1_nnz, WORD16 *pi2_dc_out,
                                      UWORD8 u1_use_upsampled_res)
{
    UWORD32 i;
    WORD32 x0, x1, x2, x3, x4, x5, x6, x7;
    WORD32 i4_value;

    UWORD8 *pu1_src = ps_src->pv_data;
    UWORD8 *pu1_pred = ps_pred->pv_data;
    WORD16 *pi2_out = ps_out->pv_data;
    /* ps_upsampled_res is optional: it is only dereferenced when
     * u1_use_upsampled_res is non-zero */
    WORD16 *pi2_upsampled_res = ps_upsampled_res ? ps_upsampled_res->pv_data : NULL;
    WORD32 i4_src_stride = ps_src->i4_data_stride;
    WORD32 i4_pred_stride = ps_pred->i4_data_stride;
    WORD32 i4_upsampled_res_stride = ps_upsampled_res ? ps_upsampled_res->i4_data_stride : 0;
    WORD16 *pi2_out_tmp = pi2_out;
    UWORD32 u4_nonzero_coeff = 0;
    const UWORD16 *pu2_scale_matrix = ps_quant_constants->pu2_scale_matrix;
    const UWORD16 *pu2_threshold_matrix = ps_quant_constants->pu2_threshold_matrix;
    UWORD32 u4_qbits = ps_quant_constants->u4_qbits;
    UWORD32 u4_round_factor = ps_quant_constants->u4_round_factor;

    /* Pass 1: residue computation + horizontal transform. Source and
     * prediction are interleaved UV, so one plane is sampled with stride 2
     * (indices 0, 2, 4, 6). */
    for(i = 0; i < SUB_BLK_WIDTH_4x4; i++)
    {
        /* computing prediction error (residue) */
        x4 = pu1_src[0] - pu1_pred[0];
        x5 = pu1_src[2] - pu1_pred[2];
        x6 = pu1_src[4] - pu1_pred[4];
        x7 = pu1_src[6] - pu1_pred[6];

        if(u1_use_upsampled_res)
        {
            /* NOTE(review): the upsampled residual is read contiguously
             * (indices 0..3), i.e. assumed de-interleaved per plane —
             * verify against the callers that pack this buffer */
            x4 = isvc_subtract_upsampled_res(x4, pi2_upsampled_res[0]);
            x5 = isvc_subtract_upsampled_res(x5, pi2_upsampled_res[1]);
            x6 = isvc_subtract_upsampled_res(x6, pi2_upsampled_res[2]);
            x7 = isvc_subtract_upsampled_res(x7, pi2_upsampled_res[3]);
        }

        /* Horizontal transform (H.264 4x4 forward core transform butterfly) */
        x0 = x4 + x7;
        x1 = x5 + x6;
        x2 = x5 - x6;
        x3 = x4 - x7;

        pi2_out_tmp[0] = x0 + x1;
        /* 'x3 * 2' instead of 'x3 << 1': left-shifting a negative value is
         * undefined behaviour in C; the multiply is bit-identical here */
        pi2_out_tmp[1] = (x3 * 2) + x2;
        pi2_out_tmp[2] = x0 - x1;
        pi2_out_tmp[3] = x3 - (x2 * 2);

        /* pointing to next row */
        pu1_src += i4_src_stride;
        pu1_pred += i4_pred_stride;
        pi2_out_tmp += 4;

        if(u1_use_upsampled_res)
        {
            /* Advance only when the buffer is in use: pointer arithmetic on
             * a NULL pointer (even adding 0) is undefined behaviour */
            pi2_upsampled_res += i4_upsampled_res_stride;
        }
    }

    pi2_out_tmp = pi2_out;

    /* Pass 2: vertical transform and in-place quantization, column by column */
    for(i = 0; i < SUB_BLK_WIDTH_4x4; i++)
    {
        x4 = pi2_out_tmp[0];
        x5 = pi2_out_tmp[4];
        x6 = pi2_out_tmp[8];
        x7 = pi2_out_tmp[12];

        x0 = x4 + x7;
        x1 = x5 + x6;
        x2 = x5 - x6;
        x3 = x4 - x7;

        /* quantization is done in place */

        i4_value = x0 + x1;

        if(i == 0)
        {
            /* Report the unquantized DC coefficient to the caller */
            *pi2_dc_out = i4_value;
        }

        FWD_QUANT(i4_value, pu2_threshold_matrix[0], pu2_scale_matrix[0], u4_round_factor, u4_qbits,
                  u4_nonzero_coeff);
        pi2_out_tmp[0] = i4_value;

        i4_value = (x3 * 2) + x2;
        FWD_QUANT(i4_value, pu2_threshold_matrix[4], pu2_scale_matrix[4], u4_round_factor, u4_qbits,
                  u4_nonzero_coeff);
        pi2_out_tmp[4] = i4_value;

        i4_value = x0 - x1;
        FWD_QUANT(i4_value, pu2_threshold_matrix[8], pu2_scale_matrix[8], u4_round_factor, u4_qbits,
                  u4_nonzero_coeff);
        pi2_out_tmp[8] = i4_value;

        i4_value = x3 - (x2 * 2);
        FWD_QUANT(i4_value, pu2_threshold_matrix[12], pu2_scale_matrix[12], u4_round_factor,
                  u4_qbits, u4_nonzero_coeff);
        pi2_out_tmp[12] = i4_value;

        pi2_out_tmp++;
        pu2_scale_matrix++;
        pu2_threshold_matrix++;
    }

    /* Return total nonzero coefficients in the current sub block */
    *pu1_nnz = u4_nonzero_coeff;
}
383 
384 /**
385  *******************************************************************************
386  *
387  * @brief
388  *   This function performs forward hadamard transform and quantization on a 4*4
389  *block
390  *
391  * @par Description:
392  *   The function accepts source buffer and estimation buffer. From these, it
393  *   computes the residue. This residue is then transformed and quantized.
394  *   The transform and quantization are computed in place, using the residue
395  *   buffer for this.
396  *
397  * @param[in] pu1_src
398  *   Pointer to source sub-block
399  *
400  * @param[in] pu1_pred
401  *   Pointer to prediction sub-block
402  *
403  * @param[in] pi2_out
404  *   Pointer to residual sub-block
405  *
406  * @param[in] i4_src_stride
407  *   Source stride
408  *
409  * @param[in] i4_pred_stride
410  *   Prediction stride
411  *
412  * @param[in] dst_strd
413  *   Destination stride
414  *
415  * @param[in] u4_qbits
416  *    QP_BITS_h264_4x4 + floor(QP/6)
417  *
418  * @param[in] pu2_threshold_matrix
419  *   Pointer to Forward Quant Threshold Matrix
420  *
421  * @param[in] pu2_scale_matrix
422  *   Pointer to Forward Quant Scale Matrix
423  *
424  * @param[in] u4_round_factor
425  *   Quantization Round factor
426  *
427  * @param[out] pu1_nnz
428  *   Total non-zero coefficients in the current sub-block
429  *
430  * @returns
431  *
432  * @remarks
433  *   None
434  *
435  */
436 
/* Forward 4x4 Hadamard transform of the luma DC coefficients, followed by
 * in-place quantization. The DC scale/threshold (matrix index 0) is applied
 * to every coefficient. */
void isvc_hadamard_quant_4x4(WORD16 *pi2_src, WORD16 *pi2_dst,
                             resi_trans_quant_constants_t *ps_quant_constants, UWORD8 *pu1_nnz)
{
    WORD32 i4_idx;
    WORD32 i4_s0, i4_s1, i4_s2, i4_s3;
    WORD32 i4_b0, i4_b1, i4_b2, i4_b3;
    WORD32 i4_coeff;

    const UWORD16 *pu2_scale_matrix = ps_quant_constants->pu2_scale_matrix;
    const UWORD16 *pu2_threshold_matrix = ps_quant_constants->pu2_threshold_matrix;
    UWORD32 u4_qbits = ps_quant_constants->u4_qbits;
    UWORD32 u4_round_factor = ps_quant_constants->u4_round_factor;

    *pu1_nnz = 0;

    /* Horizontal butterfly, one row at a time */
    for(i4_idx = 0; i4_idx < SUB_BLK_WIDTH_4x4; i4_idx++)
    {
        i4_s0 = pi2_src[0];
        i4_s1 = pi2_src[1];
        i4_s2 = pi2_src[2];
        i4_s3 = pi2_src[3];

        i4_b0 = i4_s0 + i4_s3;
        i4_b1 = i4_s1 + i4_s2;
        i4_b2 = i4_s1 - i4_s2;
        i4_b3 = i4_s0 - i4_s3;

        pi2_dst[0] = i4_b0 + i4_b1;
        pi2_dst[1] = i4_b3 + i4_b2;
        pi2_dst[2] = i4_b0 - i4_b1;
        pi2_dst[3] = i4_b3 - i4_b2;

        pi2_src += 4;
        pi2_dst += 4;
    }

    /* Rewind to the start of the horizontally transformed block */
    pi2_dst -= (SUB_BLK_WIDTH_4x4 * 4);

    /* Vertical butterfly with halving, quantized in place column by column */
    for(i4_idx = 0; i4_idx < SUB_BLK_WIDTH_4x4; i4_idx++)
    {
        i4_s0 = pi2_dst[0];
        i4_s1 = pi2_dst[4];
        i4_s2 = pi2_dst[8];
        i4_s3 = pi2_dst[12];

        i4_b0 = i4_s0 + i4_s3;
        i4_b1 = i4_s1 + i4_s2;
        i4_b2 = i4_s1 - i4_s2;
        i4_b3 = i4_s0 - i4_s3;

        i4_coeff = (i4_b0 + i4_b1) >> 1;
        FWD_QUANT(i4_coeff, pu2_threshold_matrix[0], pu2_scale_matrix[0], u4_round_factor, u4_qbits,
                  pu1_nnz[0]);
        pi2_dst[0] = i4_coeff;

        i4_coeff = (i4_b3 + i4_b2) >> 1;
        FWD_QUANT(i4_coeff, pu2_threshold_matrix[0], pu2_scale_matrix[0], u4_round_factor, u4_qbits,
                  pu1_nnz[0]);
        pi2_dst[4] = i4_coeff;

        i4_coeff = (i4_b0 - i4_b1) >> 1;
        FWD_QUANT(i4_coeff, pu2_threshold_matrix[0], pu2_scale_matrix[0], u4_round_factor, u4_qbits,
                  pu1_nnz[0]);
        pi2_dst[8] = i4_coeff;

        i4_coeff = (i4_b3 - i4_b2) >> 1;
        FWD_QUANT(i4_coeff, pu2_threshold_matrix[0], pu2_scale_matrix[0], u4_round_factor, u4_qbits,
                  pu1_nnz[0]);
        pi2_dst[12] = i4_coeff;

        pi2_dst++;
    }
}
509 
510 /**
511  *******************************************************************************
512  *
513  * @brief
514  *   This function performs forward hadamard transform and quantization on a 2*2
515  *block for both U and V planes
516  *
517  * @par Description:
518  *   The function accepts source buffer and estimation buffer. From these, it
519  *   computes the residue. This residue is then transformed and quantized.
520  *   The transform and quantization are computed in place, using the residue
521  *   buffer for this.
522  *
523  * @param[in] pu1_src
524  *   Pointer to source sub-block
525  *
526  * @param[in] pu1_pred
527  *   Pointer to prediction sub-block
528  *
529  * @param[in] pi2_out
530  *   Pointer to residual sub-block
531  *
532  * @param[in] i4_src_stride
533  *   Source stride
534  *
535  * @param[in] i4_pred_stride
536  *   Prediction stride
537  *
538  * @param[in] dst_strd
539  *   Destination stride
540  *
541  * @param[in] u4_qbits
542  *    QP_BITS_h264_4x4 + floor(QP/6)
543  *
544  * @param[in] pu2_threshold_matrix
545  *   Pointer to Forward Quant Threshold Matrix
546  *
547  * @param[in] pu2_scale_matrix
548  *   Pointer to Forward Quant Scale Matrix
549  *
550  * @param[in] u4_round_factor
551  *   Quantization Round factor
552  *
553  * @param[out] pu1_nnz
554  *   Total non-zero coefficients in the current sub-block
555  *
556  * @returns
557  *
558  * @remarks
559  *   NNZ for dc is populated at 0 and 5th position of pu1_nnz
560  *
561  */
562 
/* Forward 2x2 Hadamard transform and quantization of the chroma DC
 * coefficients, for both planes. pu1_nnz[0]/pu1_nnz[1] receive the nonzero
 * counts for the first/second plane respectively. */
void isvc_hadamard_quant_2x2_uv(WORD16 *pi2_src, WORD16 *pi2_dst,
                                resi_trans_quant_constants_t *ps_quant_constants, UWORD8 *pu1_nnz)
{
    WORD32 i4_s0, i4_s1, i4_s2, i4_s3;
    WORD32 i4_h0, i4_h1, i4_h2, i4_h3;
    WORD32 i4_coeff, i4_plane;

    const UWORD16 *pu2_scale_matrix = ps_quant_constants->pu2_scale_matrix;
    const UWORD16 *pu2_threshold_matrix = ps_quant_constants->pu2_threshold_matrix;
    UWORD32 u4_qbits = ps_quant_constants->u4_qbits;
    UWORD32 u4_round_factor = ps_quant_constants->u4_round_factor;

    for(i4_plane = 0; i4_plane < 2; i4_plane++)
    {
        pu1_nnz[i4_plane] = 0;

        /* Horizontal 2-point butterflies on the two rows */
        i4_s0 = pi2_src[0];
        i4_s1 = pi2_src[1];
        i4_s2 = pi2_src[2];
        i4_s3 = pi2_src[3];

        i4_h0 = i4_s0 + i4_s1;
        i4_h1 = i4_s0 - i4_s1;
        i4_h2 = i4_s2 + i4_s3;
        i4_h3 = i4_s2 - i4_s3;

        /* Vertical butterflies, quantized and stored immediately.
         * Output order is 0, 2, 3, 1 as in the original implementation. */
        i4_coeff = (i4_h0 + i4_h2);
        FWD_QUANT(i4_coeff, pu2_threshold_matrix[0], pu2_scale_matrix[0], u4_round_factor, u4_qbits,
                  pu1_nnz[i4_plane]);
        pi2_dst[0] = i4_coeff;

        i4_coeff = (i4_h0 - i4_h2);
        FWD_QUANT(i4_coeff, pu2_threshold_matrix[0], pu2_scale_matrix[0], u4_round_factor, u4_qbits,
                  pu1_nnz[i4_plane]);
        pi2_dst[2] = i4_coeff;

        i4_coeff = (i4_h1 - i4_h3);
        FWD_QUANT(i4_coeff, pu2_threshold_matrix[0], pu2_scale_matrix[0], u4_round_factor, u4_qbits,
                  pu1_nnz[i4_plane]);
        pi2_dst[3] = i4_coeff;

        i4_coeff = (i4_h1 + i4_h3);
        FWD_QUANT(i4_coeff, pu2_threshold_matrix[0], pu2_scale_matrix[0], u4_round_factor, u4_qbits,
                  pu1_nnz[i4_plane]);
        pi2_dst[1] = i4_coeff;

        pi2_dst += 4;
        pi2_src += 4;
    }
}
614 
615 /*
616  *******************************************************************************
617  *
618  * @brief
619  *  This function performs Single stage forward transform CF8 and quantization
620  *on 8*8 blocks for h.264
621  *
622  * @par Description:
623  *  Performs single stage 8x8 forward transform CF8 after calculating the
624  *residue The result is then quantized
625  *
626  * @param[in] pu1_src
627  *  Input 8x8 pixels
628  *
629  * @param[in] pu1_pred
630  *  Input 8x8 pixels
631  *
632  * @param[in] pi1_out
633  * Output 8x8 pixels
634  *
635  * @param[in] u4_thresh
636  *  Threshold under which the coeffs are not quantized
637  *
638  *  @param[in] u4_qp_div
639  *  QP/6
640  *
641  *  @param[in] u4_qp_rem
642  *  QP%6
643  *
644  * @param[in] u2_src_stride
645  *  Source stride
646  *
647  * @param[in] i4_pred_stride
648  * stride for prediction buffer
649  *
650  *  @param[in] dst_strd
651  *  stride for destination buffer
652  *
653  *  @param[in] pu4_quant_mat
654  *  Pointer to the 4x4 quantization matrix
655  *
656  * @returns  Void
657  *
658  *
659  *******************************************************************************
660  */
void isvc_resi_trans_quant_8x8(buffer_container_t *ps_src, buffer_container_t *ps_pred,
                               buffer_container_t *ps_out, buffer_container_t *ps_upsampled_res,
                               resi_trans_quant_constants_t *ps_quant_constants, UWORD8 *pu1_nnz,
                               WORD16 *pi2_dc_out, UWORD8 u1_use_upsampled_res)
{
    UWORD32 i;
    WORD32 a0, a1, a2, a3, a4, a5, a6, a7;
    WORD32 r0, r1, r2, r3, r4, r5, r6, r7;

    UWORD8 *pu1_src = ps_src->pv_data;
    UWORD8 *pu1_pred = ps_pred->pv_data;
    WORD16 *pi2_out = ps_out->pv_data;
    /* ps_upsampled_res is optional: it is only dereferenced when
     * u1_use_upsampled_res is non-zero */
    WORD16 *pi2_upsampled_res = ps_upsampled_res ? ps_upsampled_res->pv_data : NULL;
    WORD32 i4_src_stride = ps_src->i4_data_stride;
    WORD32 i4_pred_stride = ps_pred->i4_data_stride;
    WORD32 i4_upsampled_res_stride = ps_upsampled_res ? ps_upsampled_res->i4_data_stride : 0;
    WORD16 *pi2_out_tmp = pi2_out;
    UWORD32 u4_nonzero_coeff = 0;
    const UWORD16 *pu2_scale_matrix = ps_quant_constants->pu2_scale_matrix;
    const UWORD16 *pu2_threshold_matrix = ps_quant_constants->pu2_threshold_matrix;
    UWORD32 u4_qbits = ps_quant_constants->u4_qbits;
    UWORD32 u4_round_factor = ps_quant_constants->u4_round_factor;

    /* Unlike the 4x4 variants, the 8x8 transform has no separate DC path */
    UNUSED(pi2_dc_out);

    /* Horizontal transform (H.264 8x8 forward transform, CF8).
     * The a's and r's are reused across stages to limit the variable count.
     * NOTE: '>> 1' / '>> 2' on possibly-negative values relies on the
     * implementation-defined arithmetic right shift, as the rest of this
     * codebase does. */
    for(i = 0; i < SUB_BLK_WIDTH_8x8; ++i)
    {
        /* residue = source - prediction, one row of 8 */
        r0 = pu1_src[0];
        r0 -= pu1_pred[0];
        r1 = pu1_src[1];
        r1 -= pu1_pred[1];
        r2 = pu1_src[2];
        r2 -= pu1_pred[2];
        r3 = pu1_src[3];
        r3 -= pu1_pred[3];
        r4 = pu1_src[4];
        r4 -= pu1_pred[4];
        r5 = pu1_src[5];
        r5 -= pu1_pred[5];
        r6 = pu1_src[6];
        r6 -= pu1_pred[6];
        r7 = pu1_src[7];
        r7 -= pu1_pred[7];

        if(u1_use_upsampled_res)
        {
            /* SVC inter-layer residual prediction: code only the clipped
             * difference w.r.t. the upsampled base-layer residual */
            r0 = isvc_subtract_upsampled_res(r0, pi2_upsampled_res[0]);
            r1 = isvc_subtract_upsampled_res(r1, pi2_upsampled_res[1]);
            r2 = isvc_subtract_upsampled_res(r2, pi2_upsampled_res[2]);
            r3 = isvc_subtract_upsampled_res(r3, pi2_upsampled_res[3]);
            r4 = isvc_subtract_upsampled_res(r4, pi2_upsampled_res[4]);
            r5 = isvc_subtract_upsampled_res(r5, pi2_upsampled_res[5]);
            r6 = isvc_subtract_upsampled_res(r6, pi2_upsampled_res[6]);
            r7 = isvc_subtract_upsampled_res(r7, pi2_upsampled_res[7]);
        }

        /* even part */
        a0 = r0 + r7;
        a1 = r1 + r6;
        a2 = r2 + r5;
        a3 = r3 + r4;

        a4 = a0 + a3;
        a5 = a1 + a2;
        a6 = a0 - a3;
        a7 = a1 - a2;

        pi2_out_tmp[0] = a4 + a5;

        pi2_out_tmp[2] = a6 + (a7 >> 1);
        pi2_out_tmp[4] = a4 - a5;
        pi2_out_tmp[6] = (a6 >> 1) - a7;

        /* odd part */
        a0 = r0 - r7;
        a1 = r1 - r6;
        a2 = r2 - r5;
        a3 = r3 - r4;

        a4 = a1 + a2 + ((a0 >> 1) + a0);
        a5 = a0 - a3 - ((a2 >> 1) + a2);
        a6 = a0 + a3 - ((a1 >> 1) + a1);
        a7 = a1 - a2 + ((a3 >> 1) + a3);

        pi2_out_tmp[1] = a4 + (a7 >> 2);
        pi2_out_tmp[3] = a5 + (a6 >> 2);
        pi2_out_tmp[5] = a6 - (a5 >> 2);
        pi2_out_tmp[7] = (a4 >> 2) - a7;

        pu1_src += i4_src_stride;
        pu1_pred += i4_pred_stride;
        pi2_out_tmp += 8;

        if(u1_use_upsampled_res)
        {
            /* Advance only when the buffer is in use: pointer arithmetic on
             * a NULL pointer (even adding 0) is undefined behaviour */
            pi2_upsampled_res += i4_upsampled_res_stride;
        }
    }

    /* Vertical transform and quantization, column by column */

    pi2_out_tmp = pi2_out;

    for(i = 0; i < SUB_BLK_WIDTH_8x8; ++i)
    {
        r0 = pi2_out_tmp[0];
        r1 = pi2_out_tmp[8];
        r2 = pi2_out_tmp[16];
        r3 = pi2_out_tmp[24];
        r4 = pi2_out_tmp[32];
        r5 = pi2_out_tmp[40];
        r6 = pi2_out_tmp[48];
        r7 = pi2_out_tmp[56];

        /* even part */
        a0 = r0 + r7;
        a1 = r1 + r6;
        a2 = r2 + r5;
        a3 = r3 + r4;

        a4 = a0 + a3;
        a5 = a1 + a2;
        a6 = a0 - a3;
        a7 = a1 - a2;

        /* odd part */
        a0 = r0 - r7;
        a1 = r1 - r6;
        a2 = r2 - r5;
        a3 = r3 - r4;

        r0 = a4 + a5;
        r2 = a6 + (a7 >> 1);
        r4 = a4 - a5;
        r6 = (a6 >> 1) - a7;

        a4 = a1 + a2 + ((a0 >> 1) + a0);
        a5 = a0 - a3 - ((a2 >> 1) + a2);
        a6 = a0 + a3 - ((a1 >> 1) + a1);
        a7 = a1 - a2 + ((a3 >> 1) + a3);

        r1 = a4 + (a7 >> 2);
        r3 = a5 + (a6 >> 2);
        r5 = a6 - (a5 >> 2);
        r7 = (a4 >> 2) - a7;

        /* quantize the column in place */
        FWD_QUANT(r0, pu2_threshold_matrix[0], pu2_scale_matrix[0], u4_round_factor, u4_qbits,
                  u4_nonzero_coeff);
        pi2_out_tmp[0] = r0;

        FWD_QUANT(r1, pu2_threshold_matrix[8], pu2_scale_matrix[8], u4_round_factor, u4_qbits,
                  u4_nonzero_coeff);
        pi2_out_tmp[8] = r1;

        FWD_QUANT(r2, pu2_threshold_matrix[16], pu2_scale_matrix[16], u4_round_factor, u4_qbits,
                  u4_nonzero_coeff);
        pi2_out_tmp[16] = r2;

        FWD_QUANT(r3, pu2_threshold_matrix[24], pu2_scale_matrix[24], u4_round_factor, u4_qbits,
                  u4_nonzero_coeff);
        pi2_out_tmp[24] = r3;

        FWD_QUANT(r4, pu2_threshold_matrix[32], pu2_scale_matrix[32], u4_round_factor, u4_qbits,
                  u4_nonzero_coeff);
        pi2_out_tmp[32] = r4;

        FWD_QUANT(r5, pu2_threshold_matrix[40], pu2_scale_matrix[40], u4_round_factor, u4_qbits,
                  u4_nonzero_coeff);
        pi2_out_tmp[40] = r5;

        FWD_QUANT(r6, pu2_threshold_matrix[48], pu2_scale_matrix[48], u4_round_factor, u4_qbits,
                  u4_nonzero_coeff);
        pi2_out_tmp[48] = r6;

        FWD_QUANT(r7, pu2_threshold_matrix[56], pu2_scale_matrix[56], u4_round_factor, u4_qbits,
                  u4_nonzero_coeff);
        pi2_out_tmp[56] = r7;

        pi2_out_tmp++;
        pu2_scale_matrix++;
        pu2_threshold_matrix++;
    }
    /* Return total nonzero coefficients in the current sub block */
    *pu1_nnz = u4_nonzero_coeff;
}
841