/****************************************************************************** * * Copyright (C) 2022 The Android Open Source Project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at: * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ***************************************************************************** * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore */ /** ******************************************************************************* * @file * ih264_resi_trans_quant.c * * @brief * Contains function definitions single stage forward transform for H.264 * It will calculate the residue, do the cf and then do quantization * * @author * Ittiam * * @par List of Functions: * - ih264_resi_trans_quant_4x4() * - ih264_resi_trans_quant_chroma_4x4 * - ih264_hadamard_quant_4x4 * - ih264_hadamard_quant_2x2_uv * - ih264_resi_trans_quant_8x8 * * @remarks ******************************************************************************* */ /* System include files */ #include #include /* User include files */ #include "ih264_typedefs.h" #include "ih264_defs.h" #include "ih264_size_defs.h" #include "ih264_macros.h" #include "ih264_trans_macros.h" #include "ih264_trans_data.h" #include "ih264_structs.h" #include "isvc_trans_quant_itrans_iquant.h" static FORCEINLINE WORD16 isvc_subtract_upsampled_res(WORD16 i2_residue, WORD16 i2_upsampled_res) { return (CLIP3(-((WORD16) UINT8_MAX), ((WORD16) UINT8_MAX), i2_residue - i2_upsampled_res)); } /** ******************************************************************************* * * @brief * This function performs forward transform and quantization on a 4*4 block * * @par Description: * The function accepts source buffer and estimation buffer. From these, it * computes the residue. This is residue is then transformed and quantized. * The transform and quantization are in placed computed. They use the residue * buffer for this. * * @param[in] pu1_src * Pointer to source sub-block * * @param[in] pu1_pred * Pointer to prediction sub-block * * @param[in] pi2_out * Pointer to residual sub-block * * @param[in] i4_src_stride * Source stride * * @param[in] i4_pred_stride * Prediction stride * * @param[in] dst_strd * Destination stride * * @param[in] u4_qbits * QP_BITS_h264_4x4 + floor(QP/6) * * @param[in] pu2_threshold_matrix * Pointer to Forward Quant Threshold Matrix * * @param[in] pu2_scale_matrix * Pointer to Forward Quant Scale Matrix * * @param[in] u4_round_factor * Quantization Round factor * * @param[out] pu1_nnz * Total non-zero coefficients in the current sub-block * * @returns * * @remarks * None * ******************************************************************************* */ void isvc_resi_trans_quant_4x4(buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_out, buffer_container_t *ps_upsampled_res, resi_trans_quant_constants_t *ps_quant_constants, UWORD8 *pu1_nnz, WORD16 *pi2_dc_out, UWORD8 u1_use_upsampled_res) { UWORD32 i; WORD32 x0, x1, x2, x3, x4, x5, x6, x7; WORD32 i4_value; UWORD8 *pu1_src = ps_src->pv_data; UWORD8 *pu1_pred = ps_pred->pv_data; WORD16 *pi2_out = ps_out->pv_data; WORD16 *pi2_upsampled_res = ps_upsampled_res ? ps_upsampled_res->pv_data : NULL; WORD32 i4_src_stride = ps_src->i4_data_stride; WORD32 i4_pred_stride = ps_pred->i4_data_stride; WORD32 i4_upsampled_res_stride = ps_upsampled_res ? ps_upsampled_res->i4_data_stride : 0; WORD16 *pi2_out_tmp = pi2_out; UWORD32 u4_nonzero_coeff = 0; const UWORD16 *pu2_scale_matrix = ps_quant_constants->pu2_scale_matrix; const UWORD16 *pu2_threshold_matrix = ps_quant_constants->pu2_threshold_matrix; UWORD32 u4_qbits = ps_quant_constants->u4_qbits; UWORD32 u4_round_factor = ps_quant_constants->u4_round_factor; for(i = 0; i < SUB_BLK_WIDTH_4x4; i++) { /* computing prediction error (residue) */ x4 = pu1_src[0] - pu1_pred[0]; x5 = pu1_src[1] - pu1_pred[1]; x6 = pu1_src[2] - pu1_pred[2]; x7 = pu1_src[3] - pu1_pred[3]; if(u1_use_upsampled_res) { x4 = isvc_subtract_upsampled_res(x4, pi2_upsampled_res[0]); x5 = isvc_subtract_upsampled_res(x5, pi2_upsampled_res[1]); x6 = isvc_subtract_upsampled_res(x6, pi2_upsampled_res[2]); x7 = isvc_subtract_upsampled_res(x7, pi2_upsampled_res[3]); } /* Horizontal transform */ x0 = x4 + x7; x1 = x5 + x6; x2 = x5 - x6; x3 = x4 - x7; pi2_out_tmp[0] = x0 + x1; pi2_out_tmp[1] = (x3 << 1) + x2; pi2_out_tmp[2] = x0 - x1; pi2_out_tmp[3] = x3 - (x2 << 1); /* pointing to next row; */ pu1_src += i4_src_stride; pu1_pred += i4_pred_stride; pi2_out_tmp += 4; pi2_upsampled_res += i4_upsampled_res_stride; } pi2_out_tmp = pi2_out; for(i = 0; i < SUB_BLK_WIDTH_4x4; i++) { /* Vertical transform and quantization */ x4 = pi2_out_tmp[0]; x5 = pi2_out_tmp[4]; x6 = pi2_out_tmp[8]; x7 = pi2_out_tmp[12]; x0 = x4 + x7; x1 = x5 + x6; x2 = x5 - x6; x3 = x4 - x7; /* quantization is done in place */ i4_value = x0 + x1; if(i == 0) { (*pi2_dc_out) = i4_value; } FWD_QUANT(i4_value, pu2_threshold_matrix[0], pu2_scale_matrix[0], u4_round_factor, u4_qbits, u4_nonzero_coeff); pi2_out_tmp[0] = i4_value; i4_value = (x3 << 1) + x2; FWD_QUANT(i4_value, pu2_threshold_matrix[4], pu2_scale_matrix[4], u4_round_factor, u4_qbits, u4_nonzero_coeff); pi2_out_tmp[4] = i4_value; i4_value = x0 - x1; FWD_QUANT(i4_value, pu2_threshold_matrix[8], pu2_scale_matrix[8], u4_round_factor, u4_qbits, u4_nonzero_coeff); pi2_out_tmp[8] = i4_value; i4_value = x3 - (x2 << 1); FWD_QUANT(i4_value, pu2_threshold_matrix[12], pu2_scale_matrix[12], u4_round_factor, u4_qbits, u4_nonzero_coeff); pi2_out_tmp[12] = i4_value; pi2_out_tmp++; pu2_scale_matrix++; pu2_threshold_matrix++; } /* Return total nonzero coefficients in the current sub block */ *pu1_nnz = u4_nonzero_coeff; } /** ******************************************************************************* * * @brief * This function performs forward transform and quantization on a 4*4 chroma *block with interleaved values * * @par Description: * The function accepts source buffer and estimation buffer. From these, it * computes the residue. This is residue is then transformed and quantized. * The transform and quantization are in placed computed. They use the residue * buffer for this. * * @param[in] pu1_src * Pointer to source sub-block * * @param[in] pu1_pred * Pointer to prediction sub-block * * @param[in] pi2_out * Pointer to residual sub-block * * @param[in] i4_src_stride * Source stride * * @param[in] i4_pred_stride * Prediction stride * * @param[in] dst_strd * Destination stride * * @param[in] u4_qbits * QP_BITS_h264_4x4 + floor(QP/6) * * @param[in] pu2_threshold_matrix * Pointer to Forward Quant Threshold Matrix * * @param[in] pu2_scale_matrix * Pointer to Forward Quant Scale Matrix * * @param[in] u4_round_factor * Quantization Round factor * * @param[out] pu1_nnz * Total non-zero coefficients in the current sub-block * * @returns * * @remarks * None * ******************************************************************************* */ void isvc_resi_trans_quant_chroma_4x4(buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_out, buffer_container_t *ps_upsampled_res, resi_trans_quant_constants_t *ps_quant_constants, UWORD8 *pu1_nnz, WORD16 *pi2_dc_out, UWORD8 u1_use_upsampled_res) { UWORD32 i; WORD32 x0, x1, x2, x3, x4, x5, x6, x7; WORD32 i4_value; UWORD8 *pu1_src = ps_src->pv_data; UWORD8 *pu1_pred = ps_pred->pv_data; WORD16 *pi2_out = ps_out->pv_data; WORD16 *pi2_upsampled_res = ps_upsampled_res ? ps_upsampled_res->pv_data : NULL; WORD32 i4_src_stride = ps_src->i4_data_stride; WORD32 i4_pred_stride = ps_pred->i4_data_stride; WORD32 i4_upsampled_res_stride = ps_upsampled_res ? ps_upsampled_res->i4_data_stride : 0; WORD16 *pi2_out_tmp = pi2_out; UWORD32 u4_nonzero_coeff = 0; const UWORD16 *pu2_scale_matrix = ps_quant_constants->pu2_scale_matrix; const UWORD16 *pu2_threshold_matrix = ps_quant_constants->pu2_threshold_matrix; UWORD32 u4_qbits = ps_quant_constants->u4_qbits; UWORD32 u4_round_factor = ps_quant_constants->u4_round_factor; for(i = 0; i < SUB_BLK_WIDTH_4x4; i++) { /* computing prediction error (residue) */ x4 = pu1_src[0] - pu1_pred[0]; x5 = pu1_src[2] - pu1_pred[2]; x6 = pu1_src[4] - pu1_pred[4]; x7 = pu1_src[6] - pu1_pred[6]; if(u1_use_upsampled_res) { x4 = isvc_subtract_upsampled_res(x4, pi2_upsampled_res[0]); x5 = isvc_subtract_upsampled_res(x5, pi2_upsampled_res[1]); x6 = isvc_subtract_upsampled_res(x6, pi2_upsampled_res[2]); x7 = isvc_subtract_upsampled_res(x7, pi2_upsampled_res[3]); } /* Horizontal transform */ x0 = x4 + x7; x1 = x5 + x6; x2 = x5 - x6; x3 = x4 - x7; pi2_out_tmp[0] = x0 + x1; pi2_out_tmp[1] = (x3 << 1) + x2; pi2_out_tmp[2] = x0 - x1; pi2_out_tmp[3] = x3 - (x2 << 1); /* pointing to next row; */ pu1_src += i4_src_stride; pu1_pred += i4_pred_stride; pi2_out_tmp += 4; pi2_upsampled_res += i4_upsampled_res_stride; } pi2_out_tmp = pi2_out; for(i = 0; i < SUB_BLK_WIDTH_4x4; i++) { /* Vertical transform and quantization */ x4 = pi2_out_tmp[0]; x5 = pi2_out_tmp[4]; x6 = pi2_out_tmp[8]; x7 = pi2_out_tmp[12]; x0 = x4 + x7; x1 = x5 + x6; x2 = x5 - x6; x3 = x4 - x7; /* quantization is done in place */ i4_value = x0 + x1; if(i == 0) { *pi2_dc_out = i4_value; } FWD_QUANT(i4_value, pu2_threshold_matrix[0], pu2_scale_matrix[0], u4_round_factor, u4_qbits, u4_nonzero_coeff); pi2_out_tmp[0] = i4_value; i4_value = (x3 << 1) + x2; FWD_QUANT(i4_value, pu2_threshold_matrix[4], pu2_scale_matrix[4], u4_round_factor, u4_qbits, u4_nonzero_coeff); pi2_out_tmp[4] = i4_value; i4_value = x0 - x1; FWD_QUANT(i4_value, pu2_threshold_matrix[8], pu2_scale_matrix[8], u4_round_factor, u4_qbits, u4_nonzero_coeff); pi2_out_tmp[8] = i4_value; i4_value = x3 - (x2 << 1); FWD_QUANT(i4_value, pu2_threshold_matrix[12], pu2_scale_matrix[12], u4_round_factor, u4_qbits, u4_nonzero_coeff); pi2_out_tmp[12] = i4_value; pi2_out_tmp++; pu2_scale_matrix++; pu2_threshold_matrix++; } /* Return total nonzero coefficients in the current sub block */ *pu1_nnz = u4_nonzero_coeff; } /** ******************************************************************************* * * @brief * This function performs forward hadamard transform and quantization on a 4*4 *block * * @par Description: * The function accepts source buffer and estimation buffer. From these, it * computes the residue. This is residue is then transformed and quantized. * The transform and quantization are in placed computed. They use the residue * buffer for this. * * @param[in] pu1_src * Pointer to source sub-block * * @param[in] pu1_pred * Pointer to prediction sub-block * * @param[in] pi2_out * Pointer to residual sub-block * * @param[in] i4_src_stride * Source stride * * @param[in] i4_pred_stride * Prediction stride * * @param[in] dst_strd * Destination stride * * @param[in] u4_qbits * QP_BITS_h264_4x4 + floor(QP/6) * * @param[in] pu2_threshold_matrix * Pointer to Forward Quant Threshold Matrix * * @param[in] pu2_scale_matrix * Pointer to Forward Quant Scale Matrix * * @param[in] u4_round_factor * Quantization Round factor * * @param[out] pu1_nnz * Total non-zero coefficients in the current sub-block * * @returns * * @remarks * None * */ void isvc_hadamard_quant_4x4(WORD16 *pi2_src, WORD16 *pi2_dst, resi_trans_quant_constants_t *ps_quant_constants, UWORD8 *pu1_nnz) { WORD32 i; WORD32 x0, x1, x2, x3, x4, x5, x6, x7, i4_value; const UWORD16 *pu2_scale_matrix = ps_quant_constants->pu2_scale_matrix; const UWORD16 *pu2_threshold_matrix = ps_quant_constants->pu2_threshold_matrix; UWORD32 u4_qbits = ps_quant_constants->u4_qbits; UWORD32 u4_round_factor = ps_quant_constants->u4_round_factor; *pu1_nnz = 0; for(i = 0; i < SUB_BLK_WIDTH_4x4; i++) { x4 = pi2_src[0]; x5 = pi2_src[1]; x6 = pi2_src[2]; x7 = pi2_src[3]; x0 = x4 + x7; x1 = x5 + x6; x2 = x5 - x6; x3 = x4 - x7; pi2_dst[0] = x0 + x1; pi2_dst[1] = x3 + x2; pi2_dst[2] = x0 - x1; pi2_dst[3] = x3 - x2; pi2_src += 4; pi2_dst += 4; } /* Vertical transform and quantization */ pi2_dst -= SUB_BLK_WIDTH_4x4 << 2; for(i = 0; i < SUB_BLK_WIDTH_4x4; i++) { x4 = pi2_dst[0]; x5 = pi2_dst[4]; x6 = pi2_dst[8]; x7 = pi2_dst[12]; x0 = x4 + x7; x1 = x5 + x6; x2 = x5 - x6; x3 = x4 - x7; i4_value = (x0 + x1) >> 1; FWD_QUANT(i4_value, pu2_threshold_matrix[0], pu2_scale_matrix[0], u4_round_factor, u4_qbits, pu1_nnz[0]); pi2_dst[0] = i4_value; i4_value = (x3 + x2) >> 1; FWD_QUANT(i4_value, pu2_threshold_matrix[0], pu2_scale_matrix[0], u4_round_factor, u4_qbits, pu1_nnz[0]); pi2_dst[4] = i4_value; i4_value = (x0 - x1) >> 1; FWD_QUANT(i4_value, pu2_threshold_matrix[0], pu2_scale_matrix[0], u4_round_factor, u4_qbits, pu1_nnz[0]); pi2_dst[8] = i4_value; i4_value = (x3 - x2) >> 1; FWD_QUANT(i4_value, pu2_threshold_matrix[0], pu2_scale_matrix[0], u4_round_factor, u4_qbits, pu1_nnz[0]); pi2_dst[12] = i4_value; pi2_dst++; } } /** ******************************************************************************* * * @brief * This function performs forward hadamard transform and quantization on a 2*2 *block for both U and V planes * * @par Description: * The function accepts source buffer and estimation buffer. From these, it * computes the residue. This is residue is then transformed and quantized. * The transform and quantization are in placed computed. They use the residue * buffer for this. * * @param[in] pu1_src * Pointer to source sub-block * * @param[in] pu1_pred * Pointer to prediction sub-block * * @param[in] pi2_out * Pointer to residual sub-block * * @param[in] i4_src_stride * Source stride * * @param[in] i4_pred_stride * Prediction stride * * @param[in] dst_strd * Destination stride * * @param[in] u4_qbits * QP_BITS_h264_4x4 + floor(QP/6) * * @param[in] pu2_threshold_matrix * Pointer to Forward Quant Threshold Matrix * * @param[in] pu2_scale_matrix * Pointer to Forward Quant Scale Matrix * * @param[in] u4_round_factor * Quantization Round factor * * @param[out] pu1_nnz * Total non-zero coefficients in the current sub-block * * @returns * * @remarks * NNZ for dc is populated at 0 and 5th position of pu1_nnz * */ void isvc_hadamard_quant_2x2_uv(WORD16 *pi2_src, WORD16 *pi2_dst, resi_trans_quant_constants_t *ps_quant_constants, UWORD8 *pu1_nnz) { WORD32 x0, x1, x2, x3, x4, x5, x6, x7; WORD32 i4_value, plane; const UWORD16 *pu2_scale_matrix = ps_quant_constants->pu2_scale_matrix; const UWORD16 *pu2_threshold_matrix = ps_quant_constants->pu2_threshold_matrix; UWORD32 u4_qbits = ps_quant_constants->u4_qbits; UWORD32 u4_round_factor = ps_quant_constants->u4_round_factor; for(plane = 0; plane < 2; plane++) { pu1_nnz[plane] = 0; /* Horizontal transform */ x4 = pi2_src[0]; x5 = pi2_src[1]; x6 = pi2_src[2]; x7 = pi2_src[3]; x0 = x4 + x5; x1 = x4 - x5; x2 = x6 + x7; x3 = x6 - x7; /* Vertical transform and quantization */ i4_value = (x0 + x2); FWD_QUANT(i4_value, pu2_threshold_matrix[0], pu2_scale_matrix[0], u4_round_factor, u4_qbits, pu1_nnz[plane]); pi2_dst[0] = i4_value; i4_value = (x0 - x2); FWD_QUANT(i4_value, pu2_threshold_matrix[0], pu2_scale_matrix[0], u4_round_factor, u4_qbits, pu1_nnz[plane]); pi2_dst[2] = i4_value; i4_value = (x1 - x3); FWD_QUANT(i4_value, pu2_threshold_matrix[0], pu2_scale_matrix[0], u4_round_factor, u4_qbits, pu1_nnz[plane]); pi2_dst[3] = i4_value; i4_value = (x1 + x3); FWD_QUANT(i4_value, pu2_threshold_matrix[0], pu2_scale_matrix[0], u4_round_factor, u4_qbits, pu1_nnz[plane]); pi2_dst[1] = i4_value; pi2_dst += 4; pi2_src += 4; } } /* ******************************************************************************* * * @brief * This function performs Single stage forward transform CF8 and quantization *on 8*8 blocks for h.264 * * @par Description: * Performs single stage 8x8 forward transform CF8 after calculating the *residue The result is then quantized * * @param[in] pu1_src * Input 8x8 pixels * * @param[in] pu1_pred * Input 8x8 pixels * * @param[in] pi1_out * Output 8x8 pixels * * @param[in] u4_thresh * Threshold under which the coeffs are not quantized * * @param[in] u4_qp_div * QP/6 * * @param[in] u4_qp_rem * QP%6 * * @param[in] u2_src_stride * Source stride * * @param[in] i4_pred_stride * stride for prediciton buffer * * @param[in] dst_strd * stride for destination buffer * * @param[in] pu4_quant_mat * Pointer to the 4x4 quantization matrix * * @returns Void * * ******************************************************************************* */ void isvc_resi_trans_quant_8x8(buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_out, buffer_container_t *ps_upsampled_res, resi_trans_quant_constants_t *ps_quant_constants, UWORD8 *pu1_nnz, WORD16 *pi2_dc_out, UWORD8 u1_use_upsampled_res) { UWORD32 i; WORD32 a0, a1, a2, a3, a4, a5, a6, a7; WORD32 r0, r1, r2, r3, r4, r5, r6, r7; UWORD8 *pu1_src = ps_src->pv_data; UWORD8 *pu1_pred = ps_pred->pv_data; WORD16 *pi2_out = ps_out->pv_data; WORD16 *pi2_upsampled_res = ps_upsampled_res ? ps_upsampled_res->pv_data : NULL; WORD32 i4_src_stride = ps_src->i4_data_stride; WORD32 i4_pred_stride = ps_pred->i4_data_stride; WORD32 i4_upsampled_res_stride = ps_upsampled_res ? ps_upsampled_res->i4_data_stride : 0; WORD16 *pi2_out_tmp = pi2_out; UWORD32 u4_nonzero_coeff = 0; const UWORD16 *pu2_scale_matrix = ps_quant_constants->pu2_scale_matrix; const UWORD16 *pu2_threshold_matrix = ps_quant_constants->pu2_threshold_matrix; UWORD32 u4_qbits = ps_quant_constants->u4_qbits; UWORD32 u4_round_factor = ps_quant_constants->u4_round_factor; UNUSED(pi2_dc_out); /*Horizontal transform */ /* we are going to use the a's and r's in a twisted way since */ /*i dont want to declare more variables */ for(i = 0; i < SUB_BLK_WIDTH_8x8; ++i) { r0 = pu1_src[0]; r0 -= pu1_pred[0]; r1 = pu1_src[1]; r1 -= pu1_pred[1]; r2 = pu1_src[2]; r2 -= pu1_pred[2]; r3 = pu1_src[3]; r3 -= pu1_pred[3]; r4 = pu1_src[4]; r4 -= pu1_pred[4]; r5 = pu1_src[5]; r5 -= pu1_pred[5]; r6 = pu1_src[6]; r6 -= pu1_pred[6]; r7 = pu1_src[7]; r7 -= pu1_pred[7]; if(u1_use_upsampled_res) { r0 = isvc_subtract_upsampled_res(r0, pi2_upsampled_res[0]); r1 = isvc_subtract_upsampled_res(r1, pi2_upsampled_res[1]); r2 = isvc_subtract_upsampled_res(r2, pi2_upsampled_res[2]); r3 = isvc_subtract_upsampled_res(r3, pi2_upsampled_res[3]); r4 = isvc_subtract_upsampled_res(r4, pi2_upsampled_res[4]); r5 = isvc_subtract_upsampled_res(r5, pi2_upsampled_res[5]); r6 = isvc_subtract_upsampled_res(r6, pi2_upsampled_res[6]); r7 = isvc_subtract_upsampled_res(r7, pi2_upsampled_res[7]); } a0 = r0 + r7; a1 = r1 + r6; a2 = r2 + r5; a3 = r3 + r4; a4 = a0 + a3; a5 = a1 + a2; a6 = a0 - a3; a7 = a1 - a2; pi2_out_tmp[0] = a4 + a5; pi2_out_tmp[2] = a6 + (a7 >> 1); pi2_out_tmp[4] = a4 - a5; pi2_out_tmp[6] = (a6 >> 1) - a7; a0 = r0 - r7; a1 = r1 - r6; a2 = r2 - r5; a3 = r3 - r4; a4 = a1 + a2 + ((a0 >> 1) + a0); a5 = a0 - a3 - ((a2 >> 1) + a2); a6 = a0 + a3 - ((a1 >> 1) + a1); a7 = a1 - a2 + ((a3 >> 1) + a3); pi2_out_tmp[1] = a4 + (a7 >> 2); pi2_out_tmp[3] = a5 + (a6 >> 2); pi2_out_tmp[5] = a6 - (a5 >> 2); pi2_out_tmp[7] = (a4 >> 2) - a7; pu1_src += i4_src_stride; pu1_pred += i4_pred_stride; pi2_out_tmp += 8; pi2_upsampled_res += i4_upsampled_res_stride; } /*vertical transform and quant */ pi2_out_tmp = pi2_out; for(i = 0; i < SUB_BLK_WIDTH_8x8; ++i) { r0 = pi2_out_tmp[0]; r1 = pi2_out_tmp[8]; r2 = pi2_out_tmp[16]; r3 = pi2_out_tmp[24]; r4 = pi2_out_tmp[32]; r5 = pi2_out_tmp[40]; r6 = pi2_out_tmp[48]; r7 = pi2_out_tmp[56]; a0 = r0 + r7; a1 = r1 + r6; a2 = r2 + r5; a3 = r3 + r4; a4 = a0 + a3; a5 = a1 + a2; a6 = a0 - a3; a7 = a1 - a2; a0 = r0 - r7; a1 = r1 - r6; a2 = r2 - r5; a3 = r3 - r4; r0 = a4 + a5; r2 = a6 + (a7 >> 1); r4 = a4 - a5; r6 = (a6 >> 1) - a7; a4 = a1 + a2 + ((a0 >> 1) + a0); a5 = a0 - a3 - ((a2 >> 1) + a2); a6 = a0 + a3 - ((a1 >> 1) + a1); a7 = a1 - a2 + ((a3 >> 1) + a3); r1 = a4 + (a7 >> 2); r3 = a5 + (a6 >> 2); r5 = a6 - (a5 >> 2); r7 = (a4 >> 2) - a7; FWD_QUANT(r0, pu2_threshold_matrix[0], pu2_scale_matrix[0], u4_round_factor, u4_qbits, u4_nonzero_coeff); pi2_out_tmp[0] = r0; FWD_QUANT(r1, pu2_threshold_matrix[8], pu2_scale_matrix[8], u4_round_factor, u4_qbits, u4_nonzero_coeff); pi2_out_tmp[8] = r1; FWD_QUANT(r2, pu2_threshold_matrix[16], pu2_scale_matrix[16], u4_round_factor, u4_qbits, u4_nonzero_coeff); pi2_out_tmp[16] = r2; FWD_QUANT(r3, pu2_threshold_matrix[24], pu2_scale_matrix[24], u4_round_factor, u4_qbits, u4_nonzero_coeff); pi2_out_tmp[24] = r3; FWD_QUANT(r4, pu2_threshold_matrix[32], pu2_scale_matrix[32], u4_round_factor, u4_qbits, u4_nonzero_coeff); pi2_out_tmp[32] = r4; FWD_QUANT(r5, pu2_threshold_matrix[40], pu2_scale_matrix[40], u4_round_factor, u4_qbits, u4_nonzero_coeff); pi2_out_tmp[40] = r5; FWD_QUANT(r6, pu2_threshold_matrix[48], pu2_scale_matrix[48], u4_round_factor, u4_qbits, u4_nonzero_coeff); pi2_out_tmp[48] = r6; FWD_QUANT(r7, pu2_threshold_matrix[56], pu2_scale_matrix[56], u4_round_factor, u4_qbits, u4_nonzero_coeff); pi2_out_tmp[56] = r7; pi2_out_tmp++; pu2_scale_matrix++; pu2_threshold_matrix++; } /* Return total nonzero coefficients in the current sub block */ *pu1_nnz = u4_nonzero_coeff; }