/******************************************************************************
 *
 * Copyright (C) 2022 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 */

/**
 *******************************************************************************
 * @file
 *  isvc_resi_trans_quant_neon.c
 *
 * @brief
 *  neon variants of forward transform and quantization functions
 *
 *******************************************************************************
 */

#include <arm_neon.h>
#include <string.h>

#include "ih264_typedefs.h"
#include "ih264_debug.h"
#include "ih264_defs.h"
#include "ih264_trans_macros.h"
#include "ih264_macros.h"
#include "ih264_platform_macros.h"
#include "ih264_trans_data.h"
#include "ih264_size_defs.h"
#include "isvc_structs.h"
#include "isvc_trans_quant_itrans_iquant.h"

void isvc_resi_trans_quant_4x4_neon(buffer_container_t *ps_src, buffer_container_t *ps_pred,
                                    buffer_container_t *ps_out,
                                    buffer_container_t *ps_upsampled_res,
                                    resi_trans_quant_constants_t *ps_quant_constants,
                                    UWORD8 *pu1_nnz, WORD16 *pi2_dc_out,
                                    UWORD8 u1_use_upsampled_res)
{
    UWORD8 *pu1_src = (UWORD8 *) ps_src->pv_data;
    UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
    WORD16 *pi2_out = (WORD16 *) ps_out->pv_data;
    WORD32 i4_src_stride = ps_src->i4_data_stride;
    WORD32 i4_pred_stride = ps_pred->i4_data_stride;
    WORD32 i4_out_stride = ps_out->i4_data_stride;
    const UWORD16 *pu2_scale_matrix = ps_quant_constants->pu2_scale_matrix;
    const UWORD16 *pu2_threshold_matrix = ps_quant_constants->pu2_threshold_matrix;
    UWORD32 u4_qbits = ps_quant_constants->u4_qbits;
    UWORD32 u4_round_factor = ps_quant_constants->u4_round_factor;

    uint8x8_t src0, src1, src2, src3;
    uint8x8_t pred0, pred1, pred2, pred3;
    uint8x8_t temp0_u8x8, temp1_u8x8;
    uint16x4_t temp0_u16x4, temp1_u16x4, temp2_u16x4, temp3_u16x4;
    uint16x4_t scale_mat0_16x4, scale_mat1_16x4, scale_mat2_16x4, scale_mat3_16x4;
    uint16x4_t threshold0_16x4, threshold1_16x4, threshold2_16x4, threshold3_16x4;
    uint16x4_t thresholdmask0_16x4, thresholdmask1_16x4, thresholdmask2_16x4,
        thresholdmask3_16x4;
    int16x4_t res0_16x4, res1_16x4, res2_16x4, res3_16x4;
    int16x4_t x0_16x4, x1_16x4, x2_16x4, x3_16x4;
    int16x4_t xx0_16x4, xx1_16x4, xx2_16x4, xx3_16x4;
    int16x4x2_t xx0_16x4x2, xx1_16x4x2;
    int16x4_t temp0_16x4, temp1_16x4, temp2_16x4, temp3_16x4;
    uint16x8_t res0_16x8, res1_16x8, res2_16x8, res3_16x8;
    uint16x8_t temp0_u16x8, temp1_u16x8;
    int32x2x2_t x0_32x2x2, x1_32x2x2;
    int32x4_t tx0_32x4, tx1_32x4, tx2_32x4, tx3_32x4;
    int32x4_t rnd_factor_32x4 = vdupq_n_s32(u4_round_factor);
    int32x4_t qbits_32x4 = vdupq_n_s32(u4_qbits);
    int16x4_t zeros_16x4 = vdup_n_s16(0);

    UNUSED(ps_upsampled_res);
    UNUSED(u1_use_upsampled_res);

    threshold0_16x4 = vld1_u16(pu2_threshold_matrix);
    threshold1_16x4 = vld1_u16(pu2_threshold_matrix + 4);
    threshold2_16x4 = vld1_u16(pu2_threshold_matrix + 8);
    threshold3_16x4 = vld1_u16(pu2_threshold_matrix + 12);

    scale_mat0_16x4 = vld1_u16(pu2_scale_matrix);
    scale_mat1_16x4 = vld1_u16(pu2_scale_matrix + 4);
    scale_mat2_16x4 = vld1_u16(pu2_scale_matrix + 8);
    scale_mat3_16x4 = vld1_u16(pu2_scale_matrix + 12);

    src0 = vld1_u8(&pu1_src[0 * i4_src_stride]);
    src1 = vld1_u8(&pu1_src[1 * i4_src_stride]);
    src2 = vld1_u8(&pu1_src[2 * i4_src_stride]);
    src3 = vld1_u8(&pu1_src[3 * i4_src_stride]);

    pred0 = vld1_u8(&pu1_pred[0 * i4_pred_stride]);
    pred1 = vld1_u8(&pu1_pred[1 * i4_pred_stride]);
    pred2 = vld1_u8(&pu1_pred[2 * i4_pred_stride]);
    pred3 = vld1_u8(&pu1_pred[3 * i4_pred_stride]);

    /* calculate res = src - pred */
    res0_16x8 = vsubl_u8(src0, pred0);
    res1_16x8 = vsubl_u8(src1, pred1);
    res2_16x8 = vsubl_u8(src2, pred2);
    res3_16x8 = vsubl_u8(src3, pred3);

    res0_16x4 = vreinterpret_s16_u16(vget_low_u16(res0_16x8));
    res1_16x4 = vreinterpret_s16_u16(vget_low_u16(res1_16x8));
    res2_16x4 = vreinterpret_s16_u16(vget_low_u16(res2_16x8));
    res3_16x4 = vreinterpret_s16_u16(vget_low_u16(res3_16x8));

    /* Perform Forward transform */
    /*-------------------------------------------------------------*/
    /* DCT [ Horizontal transformation ] */
    /*-------------------------------------------------------------*/
    /* Matrix transpose */
    /*
     * a0 a1 a2 a3
     * b0 b1 b2 b3
     * c0 c1 c2 c3
     * d0 d1 d2 d3
     */
    xx0_16x4x2 = vtrn_s16(res0_16x4, res1_16x4);
    xx1_16x4x2 = vtrn_s16(res2_16x4, res3_16x4);
    x0_32x2x2 =
        vtrn_s32(vreinterpret_s32_s16(xx0_16x4x2.val[0]), vreinterpret_s32_s16(xx1_16x4x2.val[0]));
    x1_32x2x2 =
        vtrn_s32(vreinterpret_s32_s16(xx0_16x4x2.val[1]), vreinterpret_s32_s16(xx1_16x4x2.val[1]));

    x0_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[0]);
    x1_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[0]);
    x2_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[1]);
    x3_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[1]);

    xx0_16x4 = vadd_s16(x0_16x4, x3_16x4);
    xx1_16x4 = vadd_s16(x1_16x4, x2_16x4);
    xx2_16x4 = vsub_s16(x1_16x4, x2_16x4);
    xx3_16x4 = vsub_s16(x0_16x4, x3_16x4);

    x0_16x4 = vadd_s16(xx0_16x4, xx1_16x4);
    temp0_16x4 = vshl_n_s16(xx3_16x4, 1);
    x1_16x4 = vadd_s16(xx2_16x4, temp0_16x4);
    x2_16x4 = vsub_s16(xx0_16x4, xx1_16x4);
    temp0_16x4 = vshl_n_s16(xx2_16x4, 1);
    x3_16x4 = vsub_s16(xx3_16x4, temp0_16x4);

    /* Matrix transpose */
    /*
     * a0 b0 c0 d0
     * a1 b1 c1 d1
     * a2 b2 c2 d2
     * a3 b3 c3 d3
     */
    xx0_16x4x2 = vtrn_s16(x0_16x4, x1_16x4);
    xx1_16x4x2 = vtrn_s16(x2_16x4, x3_16x4);
    x0_32x2x2 =
        vtrn_s32(vreinterpret_s32_s16(xx0_16x4x2.val[0]), vreinterpret_s32_s16(xx1_16x4x2.val[0]));
    x1_32x2x2 =
        vtrn_s32(vreinterpret_s32_s16(xx0_16x4x2.val[1]), vreinterpret_s32_s16(xx1_16x4x2.val[1]));

    x0_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[0]);
    x1_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[0]);
    x2_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[1]);
    x3_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[1]);

    /* Vertical Transformation */
    xx0_16x4 = vadd_s16(x0_16x4, x3_16x4);
    xx1_16x4 = vadd_s16(x1_16x4, x2_16x4);
    xx2_16x4 = vsub_s16(x1_16x4, x2_16x4);
    xx3_16x4 = vsub_s16(x0_16x4, x3_16x4);

    x0_16x4 = vadd_s16(xx0_16x4, xx1_16x4);
    temp0_16x4 = vshl_n_s16(xx3_16x4, 1);
    x1_16x4 = vadd_s16(temp0_16x4, xx2_16x4);
    x2_16x4 = vsub_s16(xx0_16x4, xx1_16x4);
    temp0_16x4 = vshl_n_s16(xx2_16x4, 1);
    x3_16x4 = vsub_s16(xx3_16x4, temp0_16x4);

    /* get the first 16 bits from the register */
    *pi2_dc_out = vget_lane_s16(x0_16x4, 0);

    xx0_16x4 = vabs_s16(x0_16x4);
    xx1_16x4 = vabs_s16(x1_16x4);
    xx2_16x4 = vabs_s16(x2_16x4);
    xx3_16x4 = vabs_s16(x3_16x4);

    /* compare with zero for getting sign */
    temp0_u16x4 = vcgt_s16(x0_16x4, zeros_16x4);
    temp1_u16x4 = vcgt_s16(x1_16x4, zeros_16x4);
    temp2_u16x4 = vcgt_s16(x2_16x4, zeros_16x4);
    temp3_u16x4 = vcgt_s16(x3_16x4, zeros_16x4);
    /* compare with zero for thresholding */
    thresholdmask0_16x4 = vcgt_s16(vreinterpret_s16_u16(threshold0_16x4), xx0_16x4);
    thresholdmask1_16x4 = vcgt_s16(vreinterpret_s16_u16(threshold1_16x4), xx1_16x4);
    thresholdmask2_16x4 = vcgt_s16(vreinterpret_s16_u16(threshold2_16x4), xx2_16x4);
    thresholdmask3_16x4 = vcgt_s16(vreinterpret_s16_u16(threshold3_16x4), xx3_16x4);

    /* Multiply abs values obtained with scaling matrix */
    tx0_32x4 = vmull_s16(xx0_16x4, vreinterpret_s16_u16(scale_mat0_16x4));
    tx1_32x4 = vmull_s16(xx1_16x4, vreinterpret_s16_u16(scale_mat1_16x4));
    tx2_32x4 = vmull_s16(xx2_16x4, vreinterpret_s16_u16(scale_mat2_16x4));
    tx3_32x4 = vmull_s16(xx3_16x4, vreinterpret_s16_u16(scale_mat3_16x4));

    tx0_32x4 = vaddq_s32(tx0_32x4, rnd_factor_32x4);
    tx1_32x4 = vaddq_s32(tx1_32x4, rnd_factor_32x4);
    tx2_32x4 = vaddq_s32(tx2_32x4, rnd_factor_32x4);
    tx3_32x4 = vaddq_s32(tx3_32x4, rnd_factor_32x4);

    qbits_32x4 = vnegq_s32(qbits_32x4);

    tx0_32x4 = vshlq_s32(tx0_32x4, qbits_32x4);
    tx1_32x4 = vshlq_s32(tx1_32x4, qbits_32x4);
    tx2_32x4 = vshlq_s32(tx2_32x4, qbits_32x4);
    tx3_32x4 = vshlq_s32(tx3_32x4, qbits_32x4);

    /* Conversion to 16-bit signed */
    temp0_16x4 = vmovn_s32(tx0_32x4);
    temp1_16x4 = vmovn_s32(tx1_32x4);
    temp2_16x4 = vmovn_s32(tx2_32x4);
    temp3_16x4 = vmovn_s32(tx3_32x4);

    x0_16x4 = vneg_s16(temp0_16x4);
    x1_16x4 = vneg_s16(temp1_16x4);
    x2_16x4 = vneg_s16(temp2_16x4);
    x3_16x4 = vneg_s16(temp3_16x4);

    /* Restore sign */
    x0_16x4 = vbsl_s16(temp0_u16x4, temp0_16x4, x0_16x4);
    x1_16x4 = vbsl_s16(temp1_u16x4, temp1_16x4, x1_16x4);
    x2_16x4 = vbsl_s16(temp2_u16x4, temp2_16x4, x2_16x4);
    x3_16x4 = vbsl_s16(temp3_u16x4, temp3_16x4, x3_16x4);

    xx0_16x4 = vbsl_s16(thresholdmask0_16x4, zeros_16x4, x0_16x4);
    xx1_16x4 = vbsl_s16(thresholdmask1_16x4, zeros_16x4, x1_16x4);
    xx2_16x4 = vbsl_s16(thresholdmask2_16x4, zeros_16x4, x2_16x4);
    xx3_16x4 = vbsl_s16(thresholdmask3_16x4, zeros_16x4, x3_16x4);

    /* Store Quantized outputs */
    vst1_s16(&pi2_out[0 * i4_out_stride], xx0_16x4);
    vst1_s16(&pi2_out[1 * i4_out_stride], xx1_16x4);
    vst1_s16(&pi2_out[2 * i4_out_stride], xx2_16x4);
    vst1_s16(&pi2_out[3 * i4_out_stride], xx3_16x4);

    /* NNZ calculation */
    temp0_u16x4 = vceq_s16(xx0_16x4, zeros_16x4);
    temp1_u16x4 = vceq_s16(xx1_16x4, zeros_16x4);
    temp2_u16x4 = vceq_s16(xx2_16x4, zeros_16x4);
    temp3_u16x4 = vceq_s16(xx3_16x4, zeros_16x4);

    temp0_u16x8 = vcombine_u16(temp0_u16x4, temp2_u16x4);
    temp1_u16x8 = vcombine_u16(temp1_u16x4, temp3_u16x4);

    /* Conversion to 8-bit unsigned */
    temp0_u8x8 = vmovn_u16(temp0_u16x8);
    temp1_u8x8 = vmovn_u16(temp1_u16x8);

    temp0_u8x8 = vshr_n_u8(temp0_u8x8, 7);
    temp1_u8x8 = vshr_n_u8(temp1_u8x8, 7);

    temp0_u8x8 = vadd_u8(temp0_u8x8, temp1_u8x8);
    temp0_u8x8 = vpadd_u8(temp0_u8x8, temp1_u8x8);
    temp0_u8x8 = vpadd_u8(temp0_u8x8, temp1_u8x8);
    temp0_u8x8 = vpadd_u8(temp0_u8x8, temp1_u8x8);

    *pu1_nnz = 16 - vget_lane_u8(temp0_u8x8, 0);
}
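
/*
 * Quantization overview for the routine above and the variants below: the
 * add/subtract/shift butterflies implement the H.264 4x4 forward core
 * transform W = Cf * X * Cf^T, with
 *
 *        | 1  1  1  1 |
 *   Cf = | 2  1 -1 -2 |
 *        | 1 -1 -1  1 |
 *        | 1 -2  2 -1 |
 *
 * Each coefficient is then quantized as
 *
 *   level = sign(W) * ((|W| * pu2_scale_matrix[i] + u4_round_factor) >> u4_qbits)
 *   level = 0  when |W| < pu2_threshold_matrix[i]
 *
 * pi2_dc_out receives the unquantized DC coefficient W(0, 0) and pu1_nnz the
 * count of non-zero quantized levels in the 4x4 block.
 */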

void isvc_resi_trans_quant_4x4_with_residual_sub_neon(
    buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_out,
    buffer_container_t *ps_upsampled_res, resi_trans_quant_constants_t *ps_quant_constants,
    UWORD8 *pu1_nnz, WORD16 *pi2_dc_out, UWORD8 u1_use_upsampled_res)
{
    UWORD8 *pu1_src = (UWORD8 *) ps_src->pv_data;
    UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
    WORD16 *pi2_out = (WORD16 *) ps_out->pv_data;
    WORD16 *pi2_upsampled_res = ps_upsampled_res ? (WORD16 *) ps_upsampled_res->pv_data : NULL;
    WORD32 i4_src_stride = ps_src->i4_data_stride;
    WORD32 i4_pred_stride = ps_pred->i4_data_stride;
    WORD32 i4_out_stride = ps_out->i4_data_stride;
    WORD32 i4_upsampled_res_stride = ps_upsampled_res ? ps_upsampled_res->i4_data_stride : 0;
    const UWORD16 *pu2_scale_matrix = ps_quant_constants->pu2_scale_matrix;
    const UWORD16 *pu2_threshold_matrix = ps_quant_constants->pu2_threshold_matrix;
    UWORD32 u4_qbits = ps_quant_constants->u4_qbits;
    UWORD32 u4_round_factor = ps_quant_constants->u4_round_factor;

    uint8x8_t src0, src1, src2, src3;
    uint8x8_t pred0, pred1, pred2, pred3;
    uint8x8_t temp0_u8x8, temp1_u8x8;
    uint16x4_t temp0_u16x4, temp1_u16x4, temp2_u16x4, temp3_u16x4;
    uint16x4_t scale_mat0_16x4, scale_mat1_16x4, scale_mat2_16x4, scale_mat3_16x4;
    uint16x4_t threshold0_16x4, threshold1_16x4, threshold2_16x4, threshold3_16x4;
    uint16x4_t thresholdmask0_16x4, thresholdmask1_16x4, thresholdmask2_16x4,
        thresholdmask3_16x4;
    int16x4_t upres0_16x4, upres1_16x4, upres2_16x4, upres3_16x4;
    int16x4_t res0_16x4, res1_16x4, res2_16x4, res3_16x4;
    int16x4_t x0_16x4, x1_16x4, x2_16x4, x3_16x4;
    int16x4_t xx0_16x4, xx1_16x4, xx2_16x4, xx3_16x4;
    int16x4x2_t xx0_16x4x2, xx1_16x4x2;
    int16x4_t temp0_16x4, temp1_16x4, temp2_16x4, temp3_16x4;
    uint16x8_t res0_16x8, res1_16x8, res2_16x8, res3_16x8;
    uint16x8_t temp0_u16x8, temp1_u16x8;
    int32x2x2_t x0_32x2x2, x1_32x2x2;
    int32x4_t tx0_32x4, tx1_32x4, tx2_32x4, tx3_32x4;
    int32x4_t rnd_factor_32x4 = vdupq_n_s32(u4_round_factor);
    int32x4_t qbits_32x4 = vdupq_n_s32(u4_qbits);
    int16x4_t zeros_16x4 = vdup_n_s16(0);
    int16x4_t pos_255_16x4 = vdup_n_s16(((WORD16) UINT8_MAX));
    int16x4_t neg_255_16x4 = vdup_n_s16(-((WORD16) UINT8_MAX));

    UNUSED(u1_use_upsampled_res);

    threshold0_16x4 = vld1_u16(pu2_threshold_matrix);
    threshold1_16x4 = vld1_u16(pu2_threshold_matrix + 4);
    threshold2_16x4 = vld1_u16(pu2_threshold_matrix + 8);
    threshold3_16x4 = vld1_u16(pu2_threshold_matrix + 12);

    scale_mat0_16x4 = vld1_u16(pu2_scale_matrix);
    scale_mat1_16x4 = vld1_u16(pu2_scale_matrix + 4);
    scale_mat2_16x4 = vld1_u16(pu2_scale_matrix + 8);
    scale_mat3_16x4 = vld1_u16(pu2_scale_matrix + 12);

    src0 = vld1_u8(&pu1_src[0 * i4_src_stride]);
    src1 = vld1_u8(&pu1_src[1 * i4_src_stride]);
    src2 = vld1_u8(&pu1_src[2 * i4_src_stride]);
    src3 = vld1_u8(&pu1_src[3 * i4_src_stride]);

    pred0 = vld1_u8(&pu1_pred[0 * i4_pred_stride]);
    pred1 = vld1_u8(&pu1_pred[1 * i4_pred_stride]);
    pred2 = vld1_u8(&pu1_pred[2 * i4_pred_stride]);
    pred3 = vld1_u8(&pu1_pred[3 * i4_pred_stride]);

    /* calculate res = src - pred */
    res0_16x8 = vsubl_u8(src0, pred0);
    res1_16x8 = vsubl_u8(src1, pred1);
    res2_16x8 = vsubl_u8(src2, pred2);
    res3_16x8 = vsubl_u8(src3, pred3);

    res0_16x4 = vreinterpret_s16_u16(vget_low_u16(res0_16x8));
    res1_16x4 = vreinterpret_s16_u16(vget_low_u16(res1_16x8));
    res2_16x4 = vreinterpret_s16_u16(vget_low_u16(res2_16x8));
    res3_16x4 = vreinterpret_s16_u16(vget_low_u16(res3_16x8));

    /* Load upsampled res */
    upres0_16x4 = vld1_s16(&pi2_upsampled_res[0 * i4_upsampled_res_stride]);
    upres1_16x4 = vld1_s16(&pi2_upsampled_res[1 * i4_upsampled_res_stride]);
    upres2_16x4 = vld1_s16(&pi2_upsampled_res[2 * i4_upsampled_res_stride]);
    upres3_16x4 = vld1_s16(&pi2_upsampled_res[3 * i4_upsampled_res_stride]);

    /* subtract upsampled res from (src - pred) to obtain final res */
    res0_16x4 = vsub_s16(res0_16x4, upres0_16x4);
    res1_16x4 = vsub_s16(res1_16x4, upres1_16x4);
    res2_16x4 = vsub_s16(res2_16x4, upres2_16x4);
    res3_16x4 = vsub_s16(res3_16x4, upres3_16x4);
    /* Saturate all values < -255 to -255 and retain the rest as is */
    res0_16x4 = vmax_s16(res0_16x4, neg_255_16x4);
    res1_16x4 = vmax_s16(res1_16x4, neg_255_16x4);
    res2_16x4 = vmax_s16(res2_16x4, neg_255_16x4);
    res3_16x4 = vmax_s16(res3_16x4, neg_255_16x4);

    /* Saturate all values > 255 to 255 and retain the rest as is */
    res0_16x4 = vmin_s16(res0_16x4, pos_255_16x4);
    res1_16x4 = vmin_s16(res1_16x4, pos_255_16x4);
    res2_16x4 = vmin_s16(res2_16x4, pos_255_16x4);
    res3_16x4 = vmin_s16(res3_16x4, pos_255_16x4);

    /* Perform Forward transform */
    /*-------------------------------------------------------------*/
    /* DCT [ Horizontal transformation ] */
    /*-------------------------------------------------------------*/
    /* Matrix transpose */
    /*
     * a0 a1 a2 a3
     * b0 b1 b2 b3
     * c0 c1 c2 c3
     * d0 d1 d2 d3
     */
    xx0_16x4x2 = vtrn_s16(res0_16x4, res1_16x4);
    xx1_16x4x2 = vtrn_s16(res2_16x4, res3_16x4);
    x0_32x2x2 =
        vtrn_s32(vreinterpret_s32_s16(xx0_16x4x2.val[0]), vreinterpret_s32_s16(xx1_16x4x2.val[0]));
    x1_32x2x2 =
        vtrn_s32(vreinterpret_s32_s16(xx0_16x4x2.val[1]), vreinterpret_s32_s16(xx1_16x4x2.val[1]));

    x0_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[0]);
    x1_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[0]);
    x2_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[1]);
    x3_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[1]);

    xx0_16x4 = vadd_s16(x0_16x4, x3_16x4);
    xx1_16x4 = vadd_s16(x1_16x4, x2_16x4);
    xx2_16x4 = vsub_s16(x1_16x4, x2_16x4);
    xx3_16x4 = vsub_s16(x0_16x4, x3_16x4);

    x0_16x4 = vadd_s16(xx0_16x4, xx1_16x4);
    temp0_16x4 = vshl_n_s16(xx3_16x4, 1);
    x1_16x4 = vadd_s16(xx2_16x4, temp0_16x4);
    x2_16x4 = vsub_s16(xx0_16x4, xx1_16x4);
    temp0_16x4 = vshl_n_s16(xx2_16x4, 1);
    x3_16x4 = vsub_s16(xx3_16x4, temp0_16x4);

    /* Matrix transpose */
    /*
     * a0 b0 c0 d0
     * a1 b1 c1 d1
     * a2 b2 c2 d2
     * a3 b3 c3 d3
     */
    xx0_16x4x2 = vtrn_s16(x0_16x4, x1_16x4);
    xx1_16x4x2 = vtrn_s16(x2_16x4, x3_16x4);
    x0_32x2x2 =
        vtrn_s32(vreinterpret_s32_s16(xx0_16x4x2.val[0]), vreinterpret_s32_s16(xx1_16x4x2.val[0]));
    x1_32x2x2 =
        vtrn_s32(vreinterpret_s32_s16(xx0_16x4x2.val[1]), vreinterpret_s32_s16(xx1_16x4x2.val[1]));

    x0_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[0]);
    x1_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[0]);
    x2_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[1]);
    x3_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[1]);

    /* Vertical Transformation */
    xx0_16x4 = vadd_s16(x0_16x4, x3_16x4);
    xx1_16x4 = vadd_s16(x1_16x4, x2_16x4);
    xx2_16x4 = vsub_s16(x1_16x4, x2_16x4);
    xx3_16x4 = vsub_s16(x0_16x4, x3_16x4);

    x0_16x4 = vadd_s16(xx0_16x4, xx1_16x4);
    temp0_16x4 = vshl_n_s16(xx3_16x4, 1);
    x1_16x4 = vadd_s16(temp0_16x4, xx2_16x4);
    x2_16x4 = vsub_s16(xx0_16x4, xx1_16x4);
    temp0_16x4 = vshl_n_s16(xx2_16x4, 1);
    x3_16x4 = vsub_s16(xx3_16x4, temp0_16x4);

    /* get the first 16 bits from the register */
    *pi2_dc_out = vget_lane_s16(x0_16x4, 0);

    xx0_16x4 = vabs_s16(x0_16x4);
    xx1_16x4 = vabs_s16(x1_16x4);
    xx2_16x4 = vabs_s16(x2_16x4);
    xx3_16x4 = vabs_s16(x3_16x4);

    /* compare with zero for getting sign */
    temp0_u16x4 = vcgt_s16(x0_16x4, zeros_16x4);
    temp1_u16x4 = vcgt_s16(x1_16x4, zeros_16x4);
    temp2_u16x4 = vcgt_s16(x2_16x4, zeros_16x4);
    temp3_u16x4 = vcgt_s16(x3_16x4, zeros_16x4);

    /* compare with zero for thresholding */
    thresholdmask0_16x4 = vcgt_s16(vreinterpret_s16_u16(threshold0_16x4), xx0_16x4);
    thresholdmask1_16x4 = vcgt_s16(vreinterpret_s16_u16(threshold1_16x4), xx1_16x4);
    thresholdmask2_16x4 = vcgt_s16(vreinterpret_s16_u16(threshold2_16x4), xx2_16x4);
    thresholdmask3_16x4 = vcgt_s16(vreinterpret_s16_u16(threshold3_16x4), xx3_16x4);

    /* Multiply abs values obtained with scaling matrix */
    tx0_32x4 = vmull_s16(xx0_16x4, vreinterpret_s16_u16(scale_mat0_16x4));
    tx1_32x4 = vmull_s16(xx1_16x4, vreinterpret_s16_u16(scale_mat1_16x4));
    tx2_32x4 = vmull_s16(xx2_16x4, vreinterpret_s16_u16(scale_mat2_16x4));
    tx3_32x4 = vmull_s16(xx3_16x4, vreinterpret_s16_u16(scale_mat3_16x4));

    tx0_32x4 = vaddq_s32(tx0_32x4, rnd_factor_32x4);
    tx1_32x4 = vaddq_s32(tx1_32x4, rnd_factor_32x4);
    tx2_32x4 = vaddq_s32(tx2_32x4, rnd_factor_32x4);
    tx3_32x4 = vaddq_s32(tx3_32x4, rnd_factor_32x4);

    qbits_32x4 = vnegq_s32(qbits_32x4);

    tx0_32x4 = vshlq_s32(tx0_32x4, qbits_32x4);
    tx1_32x4 = vshlq_s32(tx1_32x4, qbits_32x4);
    tx2_32x4 = vshlq_s32(tx2_32x4, qbits_32x4);
    tx3_32x4 = vshlq_s32(tx3_32x4, qbits_32x4);

    /* Conversion to 16-bit signed */
    temp0_16x4 = vmovn_s32(tx0_32x4);
    temp1_16x4 = vmovn_s32(tx1_32x4);
    temp2_16x4 = vmovn_s32(tx2_32x4);
    temp3_16x4 = vmovn_s32(tx3_32x4);

    x0_16x4 = vneg_s16(temp0_16x4);
    x1_16x4 = vneg_s16(temp1_16x4);
    x2_16x4 = vneg_s16(temp2_16x4);
    x3_16x4 = vneg_s16(temp3_16x4);

    /* Restore sign */
    x0_16x4 = vbsl_s16(temp0_u16x4, temp0_16x4, x0_16x4);
    x1_16x4 = vbsl_s16(temp1_u16x4, temp1_16x4, x1_16x4);
    x2_16x4 = vbsl_s16(temp2_u16x4, temp2_16x4, x2_16x4);
    x3_16x4 = vbsl_s16(temp3_u16x4, temp3_16x4, x3_16x4);

    xx0_16x4 = vbsl_s16(thresholdmask0_16x4, zeros_16x4, x0_16x4);
    xx1_16x4 = vbsl_s16(thresholdmask1_16x4, zeros_16x4, x1_16x4);
    xx2_16x4 = vbsl_s16(thresholdmask2_16x4, zeros_16x4, x2_16x4);
    xx3_16x4 = vbsl_s16(thresholdmask3_16x4, zeros_16x4, x3_16x4);

    /* Store Quantized outputs */
    vst1_s16(&pi2_out[0 * i4_out_stride], xx0_16x4);
    vst1_s16(&pi2_out[1 * i4_out_stride], xx1_16x4);
    vst1_s16(&pi2_out[2 * i4_out_stride], xx2_16x4);
    vst1_s16(&pi2_out[3 * i4_out_stride], xx3_16x4);

    /* NNZ calculation */
    temp0_u16x4 = vceq_s16(xx0_16x4, zeros_16x4);
    temp1_u16x4 = vceq_s16(xx1_16x4, zeros_16x4);
    temp2_u16x4 = vceq_s16(xx2_16x4, zeros_16x4);
    temp3_u16x4 = vceq_s16(xx3_16x4, zeros_16x4);

    temp0_u16x8 = vcombine_u16(temp0_u16x4, temp2_u16x4);
    temp1_u16x8 = vcombine_u16(temp1_u16x4, temp3_u16x4);

    /* Conversion to 8-bit unsigned */
    temp0_u8x8 = vmovn_u16(temp0_u16x8);
    temp1_u8x8 = vmovn_u16(temp1_u16x8);

    temp0_u8x8 = vshr_n_u8(temp0_u8x8, 7);
    temp1_u8x8 = vshr_n_u8(temp1_u8x8, 7);

    temp0_u8x8 = vadd_u8(temp0_u8x8, temp1_u8x8);
    temp0_u8x8 = vpadd_u8(temp0_u8x8, temp1_u8x8);
    temp0_u8x8 = vpadd_u8(temp0_u8x8, temp1_u8x8);
    temp0_u8x8 = vpadd_u8(temp0_u8x8, temp1_u8x8);

    *pu1_nnz = 16 - vget_lane_u8(temp0_u8x8, 0);
}

void isvc_resi_trans_quant_chroma_4x4_neon(buffer_container_t *ps_src,
                                           buffer_container_t *ps_pred,
                                           buffer_container_t *ps_out,
                                           buffer_container_t *ps_upsampled_res,
                                           resi_trans_quant_constants_t *ps_quant_constants,
                                           UWORD8 *pu1_nnz, WORD16 *pi2_dc_out,
                                           UWORD8 u1_use_upsampled_res)
{
    UWORD8 *pu1_src = (UWORD8 *) ps_src->pv_data;
    UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
    WORD16 *pi2_out = (WORD16 *) ps_out->pv_data;
    WORD32 i4_src_stride = ps_src->i4_data_stride;
    WORD32 i4_pred_stride = ps_pred->i4_data_stride;
    WORD32 i4_out_stride = ps_out->i4_data_stride;
    const UWORD16 *pu2_scale_matrix = ps_quant_constants->pu2_scale_matrix;
    const UWORD16 *pu2_threshold_matrix = ps_quant_constants->pu2_threshold_matrix;
    UWORD32 u4_qbits = ps_quant_constants->u4_qbits;
    UWORD32 u4_round_factor = ps_quant_constants->u4_round_factor;

    uint8x8_t src0, src1, src2, src3;
    uint8x8_t pred0, pred1, pred2, pred3;
    uint8x8x2_t tmp0, tmp1, tmp2, tmp3;
    uint8x8_t temp0_u8x8, temp1_u8x8;
    uint16x4_t temp0_u16x4, temp1_u16x4, temp2_u16x4, temp3_u16x4;
    uint16x4_t scale_mat0_16x4, scale_mat1_16x4, scale_mat2_16x4, scale_mat3_16x4;
    uint16x4_t threshold0_16x4, threshold1_16x4, threshold2_16x4, threshold3_16x4;
    uint16x4_t thresholdmask0_16x4, thresholdmask1_16x4, thresholdmask2_16x4,
        thresholdmask3_16x4;
    int16x4_t res0_16x4, res1_16x4, res2_16x4, res3_16x4;
    int16x4_t x0_16x4, x1_16x4, x2_16x4, x3_16x4;
    int16x4_t xx0_16x4, xx1_16x4, xx2_16x4, xx3_16x4;
    int16x4x2_t xx0_16x4x2, xx1_16x4x2;
    int16x4_t temp0_16x4, temp1_16x4, temp2_16x4, temp3_16x4;
    uint16x8_t res0_16x8, res1_16x8, res2_16x8, res3_16x8;
    uint16x8_t temp0_u16x8, temp1_u16x8;
    int32x2x2_t x0_32x2x2, x1_32x2x2;
    int32x4_t tx0_32x4, tx1_32x4, tx2_32x4, tx3_32x4;
    int32x4_t rnd_factor_32x4 = vdupq_n_s32(u4_round_factor);
    int32x4_t qbits_32x4 = vdupq_n_s32(u4_qbits);
    int16x4_t zeros_16x4 = vdup_n_s16(0);

    UNUSED(ps_upsampled_res);
    UNUSED(u1_use_upsampled_res);

    threshold0_16x4 = vld1_u16(pu2_threshold_matrix);
    threshold1_16x4 = vld1_u16(pu2_threshold_matrix + 4);
    threshold2_16x4 = vld1_u16(pu2_threshold_matrix + 8);
    threshold3_16x4 = vld1_u16(pu2_threshold_matrix + 12);

    scale_mat0_16x4 = vld1_u16(pu2_scale_matrix);
    scale_mat1_16x4 = vld1_u16(pu2_scale_matrix + 4);
    scale_mat2_16x4 = vld1_u16(pu2_scale_matrix + 8);
    scale_mat3_16x4 = vld1_u16(pu2_scale_matrix + 12);

    src0 = vld1_u8(&pu1_src[0 * i4_src_stride]);
    src1 = vld1_u8(&pu1_src[1 * i4_src_stride]);
    src2 = vld1_u8(&pu1_src[2 * i4_src_stride]);
    src3 = vld1_u8(&pu1_src[3 * i4_src_stride]);

    /* deinterleaving source buffer */
    tmp0 = vuzp_u8(src0, src0);
    tmp1 = vuzp_u8(src1, src1);
    tmp2 = vuzp_u8(src2, src2);
    tmp3 = vuzp_u8(src3, src3);

    src0 = tmp0.val[0];
    src1 = tmp1.val[0];
    src2 = tmp2.val[0];
    src3 = tmp3.val[0];

    pred0 = vld1_u8(&pu1_pred[0 * i4_pred_stride]);
    pred1 = vld1_u8(&pu1_pred[1 * i4_pred_stride]);
    pred2 = vld1_u8(&pu1_pred[2 * i4_pred_stride]);
    pred3 = vld1_u8(&pu1_pred[3 * i4_pred_stride]);

    /* deinterleaving pred buffer */
    tmp0 = vuzp_u8(pred0, pred0);
    tmp1 = vuzp_u8(pred1, pred1);
    tmp2 = vuzp_u8(pred2, pred2);
    tmp3 = vuzp_u8(pred3, pred3);

    pred0 = tmp0.val[0];
    pred1 = tmp1.val[0];
    pred2 = tmp2.val[0];
    pred3 = tmp3.val[0];

    /* calculate res = src - pred */
    res0_16x8 = vsubl_u8(src0, pred0);
    res1_16x8 = vsubl_u8(src1, pred1);
    res2_16x8 = vsubl_u8(src2, pred2);
    res3_16x8 = vsubl_u8(src3, pred3);

    res0_16x4 = vreinterpret_s16_u16(vget_low_u16(res0_16x8));
    res1_16x4 = vreinterpret_s16_u16(vget_low_u16(res1_16x8));
    res2_16x4 = vreinterpret_s16_u16(vget_low_u16(res2_16x8));
    res3_16x4 = vreinterpret_s16_u16(vget_low_u16(res3_16x8));

    /* Perform Forward transform */
    /*-------------------------------------------------------------*/
    /* DCT [ Horizontal transformation ] */
    /*-------------------------------------------------------------*/
    /* Matrix transpose */
    /*
     * a0 a1 a2 a3
     * b0 b1 b2 b3
     * c0 c1 c2 c3
     * d0 d1 d2 d3
     */
    xx0_16x4x2 = vtrn_s16(res0_16x4, res1_16x4);
    xx1_16x4x2 = vtrn_s16(res2_16x4, res3_16x4);
    x0_32x2x2 =
        vtrn_s32(vreinterpret_s32_s16(xx0_16x4x2.val[0]), vreinterpret_s32_s16(xx1_16x4x2.val[0]));
    x1_32x2x2 =
        vtrn_s32(vreinterpret_s32_s16(xx0_16x4x2.val[1]), vreinterpret_s32_s16(xx1_16x4x2.val[1]));

    x0_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[0]);
    x1_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[0]);
    x2_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[1]);
    x3_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[1]);

    xx0_16x4 = vadd_s16(x0_16x4, x3_16x4);
    xx1_16x4 = vadd_s16(x1_16x4, x2_16x4);
    xx2_16x4 = vsub_s16(x1_16x4, x2_16x4);
    xx3_16x4 = vsub_s16(x0_16x4, x3_16x4);

    x0_16x4 = vadd_s16(xx0_16x4, xx1_16x4);
    temp0_16x4 = vshl_n_s16(xx3_16x4, 1);
    x1_16x4 = vadd_s16(xx2_16x4, temp0_16x4);
    x2_16x4 = vsub_s16(xx0_16x4, xx1_16x4);
    temp0_16x4 = vshl_n_s16(xx2_16x4, 1);
    x3_16x4 = vsub_s16(xx3_16x4, temp0_16x4);

    /* Matrix transpose */
    /*
     * a0 b0 c0 d0
     * a1 b1 c1 d1
     * a2 b2 c2 d2
     * a3 b3 c3 d3
     */
    xx0_16x4x2 = vtrn_s16(x0_16x4, x1_16x4);
    xx1_16x4x2 = vtrn_s16(x2_16x4, x3_16x4);
    x0_32x2x2 =
        vtrn_s32(vreinterpret_s32_s16(xx0_16x4x2.val[0]), vreinterpret_s32_s16(xx1_16x4x2.val[0]));
    x1_32x2x2 =
        vtrn_s32(vreinterpret_s32_s16(xx0_16x4x2.val[1]), vreinterpret_s32_s16(xx1_16x4x2.val[1]));

    x0_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[0]);
    x1_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[0]);
    x2_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[1]);
    x3_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[1]);

    /* Vertical Transformation */
    xx0_16x4 = vadd_s16(x0_16x4, x3_16x4);
    xx1_16x4 = vadd_s16(x1_16x4, x2_16x4);
    xx2_16x4 = vsub_s16(x1_16x4, x2_16x4);
    xx3_16x4 = vsub_s16(x0_16x4, x3_16x4);

    x0_16x4 = vadd_s16(xx0_16x4, xx1_16x4);
    temp0_16x4 = vshl_n_s16(xx3_16x4, 1);
    x1_16x4 = vadd_s16(temp0_16x4, xx2_16x4);
    x2_16x4 = vsub_s16(xx0_16x4, xx1_16x4);
    temp0_16x4 = vshl_n_s16(xx2_16x4, 1);
    x3_16x4 = vsub_s16(xx3_16x4, temp0_16x4);

    /* get the first 16 bits from the register */
    *pi2_dc_out = vget_lane_s16(x0_16x4, 0);

    xx0_16x4 = vabs_s16(x0_16x4);
    xx1_16x4 = vabs_s16(x1_16x4);
    xx2_16x4 = vabs_s16(x2_16x4);
    xx3_16x4 = vabs_s16(x3_16x4);

    /* compare with zero for getting sign */
    temp0_u16x4 = vcgt_s16(x0_16x4, zeros_16x4);
    temp1_u16x4 = vcgt_s16(x1_16x4, zeros_16x4);
    temp2_u16x4 = vcgt_s16(x2_16x4, zeros_16x4);
    temp3_u16x4 = vcgt_s16(x3_16x4, zeros_16x4);

    /* compare with zero for thresholding */
    thresholdmask0_16x4 = vcgt_s16(vreinterpret_s16_u16(threshold0_16x4), xx0_16x4);
    thresholdmask1_16x4 = vcgt_s16(vreinterpret_s16_u16(threshold1_16x4), xx1_16x4);
    thresholdmask2_16x4 = vcgt_s16(vreinterpret_s16_u16(threshold2_16x4), xx2_16x4);
    thresholdmask3_16x4 = vcgt_s16(vreinterpret_s16_u16(threshold3_16x4), xx3_16x4);

    /* Multiply abs values obtained with scaling matrix */
    tx0_32x4 = vmull_s16(xx0_16x4, vreinterpret_s16_u16(scale_mat0_16x4));
    tx1_32x4 = vmull_s16(xx1_16x4, vreinterpret_s16_u16(scale_mat1_16x4));
    tx2_32x4 = vmull_s16(xx2_16x4, vreinterpret_s16_u16(scale_mat2_16x4));
    tx3_32x4 = vmull_s16(xx3_16x4, vreinterpret_s16_u16(scale_mat3_16x4));

    tx0_32x4 = vaddq_s32(tx0_32x4, rnd_factor_32x4);
    tx1_32x4 = vaddq_s32(tx1_32x4, rnd_factor_32x4);
    tx2_32x4 = vaddq_s32(tx2_32x4, rnd_factor_32x4);
    tx3_32x4 = vaddq_s32(tx3_32x4, rnd_factor_32x4);

    qbits_32x4 = vnegq_s32(qbits_32x4);

    tx0_32x4 = vshlq_s32(tx0_32x4, qbits_32x4);
    tx1_32x4 = vshlq_s32(tx1_32x4, qbits_32x4);
    tx2_32x4 = vshlq_s32(tx2_32x4, qbits_32x4);
    tx3_32x4 = vshlq_s32(tx3_32x4, qbits_32x4);

    /* Conversion to 16-bit signed */
    temp0_16x4 = vmovn_s32(tx0_32x4);
    temp1_16x4 = vmovn_s32(tx1_32x4);
    temp2_16x4 = vmovn_s32(tx2_32x4);
    temp3_16x4 = vmovn_s32(tx3_32x4);

    x0_16x4 = vneg_s16(temp0_16x4);
    x1_16x4 = vneg_s16(temp1_16x4);
    x2_16x4 = vneg_s16(temp2_16x4);
    x3_16x4 = vneg_s16(temp3_16x4);

    /* Restore sign */
    x0_16x4 = vbsl_s16(temp0_u16x4, temp0_16x4, x0_16x4);
    x1_16x4 = vbsl_s16(temp1_u16x4, temp1_16x4, x1_16x4);
    x2_16x4 = vbsl_s16(temp2_u16x4, temp2_16x4, x2_16x4);
    x3_16x4 = vbsl_s16(temp3_u16x4, temp3_16x4, x3_16x4);

    /* Thresholding */
    xx0_16x4 = vbsl_s16(thresholdmask0_16x4, zeros_16x4, x0_16x4);
    xx1_16x4 = vbsl_s16(thresholdmask1_16x4, zeros_16x4, x1_16x4);
    xx2_16x4 = vbsl_s16(thresholdmask2_16x4, zeros_16x4, x2_16x4);
    xx3_16x4 = vbsl_s16(thresholdmask3_16x4, zeros_16x4, x3_16x4);

    /* Store Quantized outputs */
    vst1_s16(&pi2_out[0 * i4_out_stride], xx0_16x4);
    vst1_s16(&pi2_out[1 * i4_out_stride], xx1_16x4);
    vst1_s16(&pi2_out[2 * i4_out_stride], xx2_16x4);
    vst1_s16(&pi2_out[3 * i4_out_stride], xx3_16x4);

    /* NNZ calculation */
    temp0_u16x4 = vceq_s16(xx0_16x4, zeros_16x4);
    temp1_u16x4 = vceq_s16(xx1_16x4, zeros_16x4);
    temp2_u16x4 = vceq_s16(xx2_16x4, zeros_16x4);
    temp3_u16x4 = vceq_s16(xx3_16x4, zeros_16x4);

    temp0_u16x8 = vcombine_u16(temp0_u16x4, temp2_u16x4);
    temp1_u16x8 = vcombine_u16(temp1_u16x4, temp3_u16x4);

    /* Conversion to 8-bit unsigned */
    temp0_u8x8 = vmovn_u16(temp0_u16x8);
    temp1_u8x8 = vmovn_u16(temp1_u16x8);

    temp0_u8x8 = vshr_n_u8(temp0_u8x8, 7);
    temp1_u8x8 = vshr_n_u8(temp1_u8x8, 7);

    temp0_u8x8 = vadd_u8(temp0_u8x8, temp1_u8x8);
    temp0_u8x8 = vpadd_u8(temp0_u8x8, temp1_u8x8);
    temp0_u8x8 = vpadd_u8(temp0_u8x8, temp1_u8x8);
    temp0_u8x8 = vpadd_u8(temp0_u8x8, temp1_u8x8);

    *pu1_nnz = 16 - vget_lane_u8(temp0_u8x8, 0);
}

void isvc_resi_trans_quant_chroma_4x4_with_residual_sub_neon(
    buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_out,
    buffer_container_t *ps_upsampled_res, resi_trans_quant_constants_t *ps_quant_constants,
    UWORD8 *pu1_nnz, WORD16 *pi2_dc_out, UWORD8 u1_use_upsampled_res)
{
    UWORD8 *pu1_src = (UWORD8 *) ps_src->pv_data;
    UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
    WORD16 *pi2_out = (WORD16 *) ps_out->pv_data;
    WORD16 *pi2_upsampled_res = ps_upsampled_res ? (WORD16 *) ps_upsampled_res->pv_data : NULL;
    WORD32 i4_src_stride = ps_src->i4_data_stride;
    WORD32 i4_pred_stride = ps_pred->i4_data_stride;
    WORD32 i4_out_stride = ps_out->i4_data_stride;
    WORD32 i4_upsampled_res_stride = ps_upsampled_res ? ps_upsampled_res->i4_data_stride : 0;
    const UWORD16 *pu2_scale_matrix = ps_quant_constants->pu2_scale_matrix;
    const UWORD16 *pu2_threshold_matrix = ps_quant_constants->pu2_threshold_matrix;
    UWORD32 u4_qbits = ps_quant_constants->u4_qbits;
    UWORD32 u4_round_factor = ps_quant_constants->u4_round_factor;

    uint8x8_t src0, src1, src2, src3;
    uint8x8_t pred0, pred1, pred2, pred3;
    uint8x8x2_t tmp0, tmp1, tmp2, tmp3;
    uint8x8_t temp0_u8x8, temp1_u8x8;
    uint16x4_t temp0_u16x4, temp1_u16x4, temp2_u16x4, temp3_u16x4;
    uint16x4_t scale_mat0_16x4, scale_mat1_16x4, scale_mat2_16x4, scale_mat3_16x4;
    uint16x4_t threshold0_16x4, threshold1_16x4, threshold2_16x4, threshold3_16x4;
    uint16x4_t thresholdmask0_16x4, thresholdmask1_16x4, thresholdmask2_16x4,
        thresholdmask3_16x4;
    int16x4_t upres0_16x4, upres1_16x4, upres2_16x4, upres3_16x4;
    int16x4_t res0_16x4, res1_16x4, res2_16x4, res3_16x4;
    int16x4_t x0_16x4, x1_16x4, x2_16x4, x3_16x4;
    int16x4_t xx0_16x4, xx1_16x4, xx2_16x4, xx3_16x4;
    int16x4x2_t xx0_16x4x2, xx1_16x4x2;
    int16x4_t temp0_16x4, temp1_16x4, temp2_16x4, temp3_16x4;
    uint16x8_t res0_16x8, res1_16x8, res2_16x8, res3_16x8;
    uint16x8_t temp0_u16x8, temp1_u16x8;
    int32x2x2_t x0_32x2x2, x1_32x2x2;
    int32x4_t tx0_32x4, tx1_32x4, tx2_32x4, tx3_32x4;
    int32x4_t rnd_factor_32x4 = vdupq_n_s32(u4_round_factor);
    int32x4_t qbits_32x4 = vdupq_n_s32(u4_qbits);
    int16x4_t zeros_16x4 = vdup_n_s16(0);
    int16x4_t pos_255_16x4 = vdup_n_s16(((WORD16) UINT8_MAX));
    int16x4_t neg_255_16x4 = vdup_n_s16(-((WORD16) UINT8_MAX));

    UNUSED(u1_use_upsampled_res);

    threshold0_16x4 = vld1_u16(pu2_threshold_matrix);
    threshold1_16x4 = vld1_u16(pu2_threshold_matrix + 4);
    threshold2_16x4 = vld1_u16(pu2_threshold_matrix + 8);
    threshold3_16x4 = vld1_u16(pu2_threshold_matrix + 12);

    scale_mat0_16x4 = vld1_u16(pu2_scale_matrix);
    scale_mat1_16x4 = vld1_u16(pu2_scale_matrix + 4);
    scale_mat2_16x4 = vld1_u16(pu2_scale_matrix + 8);
    scale_mat3_16x4 = vld1_u16(pu2_scale_matrix + 12);

    src0 = vld1_u8(&pu1_src[0 * i4_src_stride]);
    src1 = vld1_u8(&pu1_src[1 * i4_src_stride]);
    src2 = vld1_u8(&pu1_src[2 * i4_src_stride]);
    src3 = vld1_u8(&pu1_src[3 * i4_src_stride]);

    /* deinterleaving source buffer */
    tmp0 = vuzp_u8(src0, src0);
    tmp1 = vuzp_u8(src1, src1);
    tmp2 = vuzp_u8(src2, src2);
    tmp3 = vuzp_u8(src3, src3);

    src0 = tmp0.val[0];
    src1 = tmp1.val[0];
    src2 = tmp2.val[0];
    src3 = tmp3.val[0];

    pred0 = vld1_u8(&pu1_pred[0 * i4_pred_stride]);
    pred1 = vld1_u8(&pu1_pred[1 * i4_pred_stride]);
    pred2 = vld1_u8(&pu1_pred[2 * i4_pred_stride]);
    pred3 = vld1_u8(&pu1_pred[3 * i4_pred_stride]);

    /* deinterleaving pred buffer */
    tmp0 = vuzp_u8(pred0, pred0);
    tmp1 = vuzp_u8(pred1, pred1);
    tmp2 = vuzp_u8(pred2, pred2);
    tmp3 = vuzp_u8(pred3, pred3);

    pred0 = tmp0.val[0];
    pred1 = tmp1.val[0];
    pred2 = tmp2.val[0];
    pred3 = tmp3.val[0];

    /* calculate res = src - pred */
    res0_16x8 = vsubl_u8(src0, pred0);
    res1_16x8 = vsubl_u8(src1, pred1);
    res2_16x8 = vsubl_u8(src2, pred2);
    res3_16x8 = vsubl_u8(src3, pred3);

    res0_16x4 = vreinterpret_s16_u16(vget_low_u16(res0_16x8));
    res1_16x4 = vreinterpret_s16_u16(vget_low_u16(res1_16x8));
    res2_16x4 = vreinterpret_s16_u16(vget_low_u16(res2_16x8));
    res3_16x4 = vreinterpret_s16_u16(vget_low_u16(res3_16x8));

    /* Load upsampled res */
    upres0_16x4 = vld1_s16(&pi2_upsampled_res[0 * i4_upsampled_res_stride]);
    upres1_16x4 = vld1_s16(&pi2_upsampled_res[1 * i4_upsampled_res_stride]);
    upres2_16x4 = vld1_s16(&pi2_upsampled_res[2 * i4_upsampled_res_stride]);
    upres3_16x4 = vld1_s16(&pi2_upsampled_res[3 * i4_upsampled_res_stride]);

    /* subtract upsampled res from (src - pred) to obtain final res */
    res0_16x4 = vsub_s16(res0_16x4, upres0_16x4);
    res1_16x4 = vsub_s16(res1_16x4, upres1_16x4);
    res2_16x4 = vsub_s16(res2_16x4, upres2_16x4);
    res3_16x4 = vsub_s16(res3_16x4, upres3_16x4);

    /* Saturate all values < -255 to -255 and retain the rest as is */
    res0_16x4 = vmax_s16(res0_16x4, neg_255_16x4);
    res1_16x4 = vmax_s16(res1_16x4, neg_255_16x4);
    res2_16x4 = vmax_s16(res2_16x4, neg_255_16x4);
    res3_16x4 = vmax_s16(res3_16x4, neg_255_16x4);

    /* Saturate all values > 255 to 255 and retain the rest as is */
    res0_16x4 = vmin_s16(res0_16x4, pos_255_16x4);
    res1_16x4 = vmin_s16(res1_16x4, pos_255_16x4);
    res2_16x4 = vmin_s16(res2_16x4, pos_255_16x4);
    res3_16x4 = vmin_s16(res3_16x4, pos_255_16x4);

    /* Perform Forward transform */
    /*-------------------------------------------------------------*/
    /* DCT [ Horizontal transformation ] */
    /*-------------------------------------------------------------*/
    /* Matrix transpose */
    /*
     * a0 a1 a2 a3
     * b0 b1 b2 b3
     * c0 c1 c2 c3
     * d0 d1 d2 d3
     */
    xx0_16x4x2 = vtrn_s16(res0_16x4, res1_16x4);
    xx1_16x4x2 = vtrn_s16(res2_16x4, res3_16x4);
    x0_32x2x2 =
        vtrn_s32(vreinterpret_s32_s16(xx0_16x4x2.val[0]), vreinterpret_s32_s16(xx1_16x4x2.val[0]));
    x1_32x2x2 =
        vtrn_s32(vreinterpret_s32_s16(xx0_16x4x2.val[1]), vreinterpret_s32_s16(xx1_16x4x2.val[1]));

    x0_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[0]);
    x1_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[0]);
    x2_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[1]);
    x3_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[1]);

    xx0_16x4 = vadd_s16(x0_16x4, x3_16x4);
    xx1_16x4 = vadd_s16(x1_16x4, x2_16x4);
    xx2_16x4 = vsub_s16(x1_16x4, x2_16x4);
    xx3_16x4 = vsub_s16(x0_16x4, x3_16x4);

    x0_16x4 = vadd_s16(xx0_16x4, xx1_16x4);
    temp0_16x4 = vshl_n_s16(xx3_16x4, 1);
    x1_16x4 = vadd_s16(xx2_16x4, temp0_16x4);
    x2_16x4 = vsub_s16(xx0_16x4, xx1_16x4);
    temp0_16x4 = vshl_n_s16(xx2_16x4, 1);
    x3_16x4 = vsub_s16(xx3_16x4, temp0_16x4);

    /* Matrix transpose */
    /*
     * a0 b0 c0 d0
     * a1 b1 c1 d1
     * a2 b2 c2 d2
     * a3 b3 c3 d3
     */
    xx0_16x4x2 = vtrn_s16(x0_16x4, x1_16x4);
    xx1_16x4x2 = vtrn_s16(x2_16x4, x3_16x4);
    x0_32x2x2 =
        vtrn_s32(vreinterpret_s32_s16(xx0_16x4x2.val[0]), vreinterpret_s32_s16(xx1_16x4x2.val[0]));
    x1_32x2x2 =
        vtrn_s32(vreinterpret_s32_s16(xx0_16x4x2.val[1]), vreinterpret_s32_s16(xx1_16x4x2.val[1]));

    x0_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[0]);
    x1_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[0]);
    x2_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[1]);
    x3_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[1]);

    /* Vertical Transformation */
    xx0_16x4 = vadd_s16(x0_16x4, x3_16x4);
    xx1_16x4 = vadd_s16(x1_16x4, x2_16x4);
    xx2_16x4 = vsub_s16(x1_16x4, x2_16x4);
    xx3_16x4 = vsub_s16(x0_16x4, x3_16x4);

    x0_16x4 = vadd_s16(xx0_16x4, xx1_16x4);
    temp0_16x4 = vshl_n_s16(xx3_16x4, 1);
    x1_16x4 = vadd_s16(temp0_16x4, xx2_16x4);
    x2_16x4 = vsub_s16(xx0_16x4, xx1_16x4);
    temp0_16x4 = vshl_n_s16(xx2_16x4, 1);
    x3_16x4 = vsub_s16(xx3_16x4, temp0_16x4);

    /* get the first 16 bits from the register */
    *pi2_dc_out = vget_lane_s16(x0_16x4, 0);

    xx0_16x4 = vabs_s16(x0_16x4);
    xx1_16x4 = vabs_s16(x1_16x4);
    xx2_16x4 = vabs_s16(x2_16x4);
    xx3_16x4 = vabs_s16(x3_16x4);

    /* compare with zero for getting sign */
    temp0_u16x4 = vcgt_s16(x0_16x4, zeros_16x4);
    temp1_u16x4 = vcgt_s16(x1_16x4, zeros_16x4);
    temp2_u16x4 = vcgt_s16(x2_16x4, zeros_16x4);
    temp3_u16x4 = vcgt_s16(x3_16x4, zeros_16x4);

    thresholdmask0_16x4 = vcgt_s16(vreinterpret_s16_u16(threshold0_16x4), xx0_16x4);
    thresholdmask1_16x4 = vcgt_s16(vreinterpret_s16_u16(threshold1_16x4), xx1_16x4);
    thresholdmask2_16x4 = vcgt_s16(vreinterpret_s16_u16(threshold2_16x4), xx2_16x4);
    thresholdmask3_16x4 = vcgt_s16(vreinterpret_s16_u16(threshold3_16x4), xx3_16x4);

    /* Multiply abs values obtained with scaling matrix */
    tx0_32x4 = vmull_s16(xx0_16x4, vreinterpret_s16_u16(scale_mat0_16x4));
    tx1_32x4 = vmull_s16(xx1_16x4, vreinterpret_s16_u16(scale_mat1_16x4));
    tx2_32x4 = vmull_s16(xx2_16x4, vreinterpret_s16_u16(scale_mat2_16x4));
    tx3_32x4 = vmull_s16(xx3_16x4, vreinterpret_s16_u16(scale_mat3_16x4));

    tx0_32x4 = vaddq_s32(tx0_32x4, rnd_factor_32x4);
    tx1_32x4 = vaddq_s32(tx1_32x4, rnd_factor_32x4);
    tx2_32x4 = vaddq_s32(tx2_32x4, rnd_factor_32x4);
    tx3_32x4 = vaddq_s32(tx3_32x4, rnd_factor_32x4);

    qbits_32x4 = vnegq_s32(qbits_32x4);

    tx0_32x4 = vshlq_s32(tx0_32x4, qbits_32x4);
    tx1_32x4 = vshlq_s32(tx1_32x4, qbits_32x4);
    tx2_32x4 = vshlq_s32(tx2_32x4, qbits_32x4);
    tx3_32x4 = vshlq_s32(tx3_32x4, qbits_32x4);

    /* Conversion to 16-bit signed */
    temp0_16x4 = vmovn_s32(tx0_32x4);
    temp1_16x4 = vmovn_s32(tx1_32x4);
    temp2_16x4 = vmovn_s32(tx2_32x4);
    temp3_16x4 = vmovn_s32(tx3_32x4);

    x0_16x4 = vneg_s16(temp0_16x4);
    x1_16x4 = vneg_s16(temp1_16x4);
    x2_16x4 = vneg_s16(temp2_16x4);
    x3_16x4 = vneg_s16(temp3_16x4);

    /* Restore sign */
    x0_16x4 = vbsl_s16(temp0_u16x4, temp0_16x4, x0_16x4);
    x1_16x4 = vbsl_s16(temp1_u16x4, temp1_16x4, x1_16x4);
    x2_16x4 = vbsl_s16(temp2_u16x4, temp2_16x4, x2_16x4);
    x3_16x4 = vbsl_s16(temp3_u16x4, temp3_16x4, x3_16x4);

    xx0_16x4 = vbsl_s16(thresholdmask0_16x4, zeros_16x4, x0_16x4);
    xx1_16x4 = vbsl_s16(thresholdmask1_16x4, zeros_16x4, x1_16x4);
    xx2_16x4 = vbsl_s16(thresholdmask2_16x4, zeros_16x4, x2_16x4);
    xx3_16x4 = vbsl_s16(thresholdmask3_16x4, zeros_16x4, x3_16x4);

    /* Store Quantized outputs */
    vst1_s16(&pi2_out[0 * i4_out_stride], xx0_16x4);
    vst1_s16(&pi2_out[1 * i4_out_stride], xx1_16x4);
    vst1_s16(&pi2_out[2 * i4_out_stride], xx2_16x4);
    vst1_s16(&pi2_out[3 * i4_out_stride], xx3_16x4);

    /* NNZ calculation */
    temp0_u16x4 = vceq_s16(xx0_16x4, zeros_16x4);
    temp1_u16x4 = vceq_s16(xx1_16x4, zeros_16x4);
    temp2_u16x4 = vceq_s16(xx2_16x4, zeros_16x4);
    temp3_u16x4 = vceq_s16(xx3_16x4, zeros_16x4);

    temp0_u16x8 = vcombine_u16(temp0_u16x4, temp2_u16x4);
    temp1_u16x8 = vcombine_u16(temp1_u16x4, temp3_u16x4);

    /* Conversion to 8-bit unsigned */
    temp0_u8x8 = vmovn_u16(temp0_u16x8);
    temp1_u8x8 = vmovn_u16(temp1_u16x8);

    temp0_u8x8 = vshr_n_u8(temp0_u8x8, 7);
    temp1_u8x8 = vshr_n_u8(temp1_u8x8, 7);

    temp0_u8x8 = vadd_u8(temp0_u8x8, temp1_u8x8);
    temp0_u8x8 = vpadd_u8(temp0_u8x8, temp1_u8x8);
    temp0_u8x8 = vpadd_u8(temp0_u8x8, temp1_u8x8);
    temp0_u8x8 = vpadd_u8(temp0_u8x8, temp1_u8x8);

    *pu1_nnz = 16 - vget_lane_u8(temp0_u8x8, 0);
}
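
#if 0
/*
 * Illustrative scalar sketch (kept out of the build) of the luma path in
 * isvc_resi_trans_quant_4x4_neon above, assuming the same buffer_container_t
 * and resi_trans_quant_constants_t layouts. The function name and loop
 * structure are hypothetical reference material only; the NEON routines above
 * are the actual implementations.
 */
static void isvc_resi_trans_quant_4x4_scalar_sketch(buffer_container_t *ps_src,
                                                    buffer_container_t *ps_pred,
                                                    buffer_container_t *ps_out,
                                                    resi_trans_quant_constants_t *ps_quant_constants,
                                                    UWORD8 *pu1_nnz, WORD16 *pi2_dc_out)
{
    WORD32 i, j;
    WORD16 ai2_res[4][4], ai2_tmp[4][4];
    UWORD8 *pu1_src = (UWORD8 *) ps_src->pv_data;
    UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
    WORD16 *pi2_out = (WORD16 *) ps_out->pv_data;
    UWORD32 u4_nnz = 0;

    /* residual: res = src - pred */
    for(i = 0; i < 4; i++)
    {
        for(j = 0; j < 4; j++)
        {
            ai2_res[i][j] = (WORD16) (pu1_src[i * ps_src->i4_data_stride + j] -
                                      pu1_pred[i * ps_pred->i4_data_stride + j]);
        }
    }

    /* horizontal pass of the 4x4 core transform */
    for(i = 0; i < 4; i++)
    {
        WORD16 s0 = ai2_res[i][0] + ai2_res[i][3], s3 = ai2_res[i][0] - ai2_res[i][3];
        WORD16 s1 = ai2_res[i][1] + ai2_res[i][2], s2 = ai2_res[i][1] - ai2_res[i][2];

        ai2_tmp[i][0] = s0 + s1;
        ai2_tmp[i][1] = s2 + (s3 << 1);
        ai2_tmp[i][2] = s0 - s1;
        ai2_tmp[i][3] = s3 - (s2 << 1);
    }

    /* vertical pass */
    for(j = 0; j < 4; j++)
    {
        WORD16 s0 = ai2_tmp[0][j] + ai2_tmp[3][j], s3 = ai2_tmp[0][j] - ai2_tmp[3][j];
        WORD16 s1 = ai2_tmp[1][j] + ai2_tmp[2][j], s2 = ai2_tmp[1][j] - ai2_tmp[2][j];

        ai2_res[0][j] = s0 + s1;
        ai2_res[1][j] = s2 + (s3 << 1);
        ai2_res[2][j] = s0 - s1;
        ai2_res[3][j] = s3 - (s2 << 1);
    }

    /* unquantized DC coefficient */
    *pi2_dc_out = ai2_res[0][0];

    /* quantization with thresholding */
    for(i = 0; i < 4; i++)
    {
        for(j = 0; j < 4; j++)
        {
            WORD32 i4_coeff = ai2_res[i][j];
            WORD32 i4_abs = (i4_coeff < 0) ? -i4_coeff : i4_coeff;
            WORD32 i4_level = 0;

            if(i4_abs >= ps_quant_constants->pu2_threshold_matrix[i * 4 + j])
            {
                i4_level = (i4_abs * ps_quant_constants->pu2_scale_matrix[i * 4 + j] +
                            ps_quant_constants->u4_round_factor) >>
                           ps_quant_constants->u4_qbits;

                if(i4_coeff < 0)
                {
                    i4_level = -i4_level;
                }
            }

            pi2_out[i * ps_out->i4_data_stride + j] = (WORD16) i4_level;
            u4_nnz += (i4_level != 0);
        }
    }

    *pu1_nnz = (UWORD8) u4_nnz;
}
#endif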