/******************************************************************************
 *
 * Copyright (C) 2022 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 */
/**
 *******************************************************************************
 * @file
 *  isvc_resi_trans_quant_neon.c
 *
 * @brief
 *  Neon variants of forward transform and quantization functions
 *
 *******************************************************************************
 */

#include <arm_neon.h>
#include <string.h>

#include "ih264_typedefs.h"
#include "ih264_debug.h"
#include "ih264_defs.h"
#include "ih264_trans_macros.h"
#include "ih264_macros.h"
#include "ih264_platform_macros.h"
#include "ih264_trans_data.h"
#include "ih264_size_defs.h"
#include "isvc_structs.h"
#include "isvc_trans_quant_itrans_iquant.h"

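/*
 * The four kernels below share the same pipeline: compute the 4x4 residual
 * (src - pred, additionally minus an upsampled inter-layer residual in the
 * *_with_residual_sub variants), apply the H.264 4x4 forward core transform,
 * quantize with thresholding, and report the DC coefficient and the number of
 * non-zero coefficients. The chroma variants also de-interleave the Cb/Cr
 * samples before forming the residual; ps_upsampled_res and
 * u1_use_upsampled_res are only consumed by the *_with_residual_sub variants.
 */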
void isvc_resi_trans_quant_4x4_neon(buffer_container_t *ps_src, buffer_container_t *ps_pred,
                                    buffer_container_t *ps_out,
                                    buffer_container_t *ps_upsampled_res,
                                    resi_trans_quant_constants_t *ps_quant_constants,
                                    UWORD8 *pu1_nnz, WORD16 *pi2_dc_out,
                                    UWORD8 u1_use_upsampled_res)
{
    UWORD8 *pu1_src = (UWORD8 *) ps_src->pv_data;
    UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
    WORD16 *pi2_out = (WORD16 *) ps_out->pv_data;
    WORD32 i4_src_stride = ps_src->i4_data_stride;
    WORD32 i4_pred_stride = ps_pred->i4_data_stride;
    WORD32 i4_out_stride = ps_out->i4_data_stride;
    const UWORD16 *pu2_scale_matrix = ps_quant_constants->pu2_scale_matrix;
    const UWORD16 *pu2_threshold_matrix = ps_quant_constants->pu2_threshold_matrix;
    UWORD32 u4_qbits = ps_quant_constants->u4_qbits;
    UWORD32 u4_round_factor = ps_quant_constants->u4_round_factor;

    uint8x8_t src0, src1, src2, src3;
    uint8x8_t pred0, pred1, pred2, pred3;
    uint8x8_t temp0_u8x8, temp1_u8x8;
    uint16x4_t temp0_u16x4, temp1_u16x4, temp2_u16x4, temp3_u16x4;
    uint16x4_t scale_mat0_16x4, scale_mat1_16x4, scale_mat2_16x4, scale_mat3_16x4;
    uint16x4_t threshold0_16x4, threshold1_16x4, threshold2_16x4, threshold3_16x4;
    uint16x4_t thresholdmask0_16x4, thresholdmask1_16x4, thresholdmask2_16x4, thresholdmask3_16x4;
    int16x4_t res0_16x4, res1_16x4, res2_16x4, res3_16x4;
    int16x4_t x0_16x4, x1_16x4, x2_16x4, x3_16x4;
    int16x4_t xx0_16x4, xx1_16x4, xx2_16x4, xx3_16x4;
    int16x4x2_t xx0_16x4x2, xx1_16x4x2;
    int16x4_t temp0_16x4, temp1_16x4, temp2_16x4, temp3_16x4;
    uint16x8_t res0_16x8, res1_16x8, res2_16x8, res3_16x8;
    uint16x8_t temp0_u16x8, temp1_u16x8;
    int32x2x2_t x0_32x2x2, x1_32x2x2;
    int32x4_t tx0_32x4, tx1_32x4, tx2_32x4, tx3_32x4;

    int32x4_t rnd_factor_32x4 = vdupq_n_s32(u4_round_factor);
    int32x4_t qbits_32x4 = vdupq_n_s32(u4_qbits);
    int16x4_t zeros_16x4 = vdup_n_s16(0);

    UNUSED(ps_upsampled_res);
    UNUSED(u1_use_upsampled_res);

    threshold0_16x4 = vld1_u16(pu2_threshold_matrix);
    threshold1_16x4 = vld1_u16(pu2_threshold_matrix + 4);
    threshold2_16x4 = vld1_u16(pu2_threshold_matrix + 8);
    threshold3_16x4 = vld1_u16(pu2_threshold_matrix + 12);

    scale_mat0_16x4 = vld1_u16(pu2_scale_matrix);
    scale_mat1_16x4 = vld1_u16(pu2_scale_matrix + 4);
    scale_mat2_16x4 = vld1_u16(pu2_scale_matrix + 8);
    scale_mat3_16x4 = vld1_u16(pu2_scale_matrix + 12);

    src0 = vld1_u8(&pu1_src[0 * i4_src_stride]);
    src1 = vld1_u8(&pu1_src[1 * i4_src_stride]);
    src2 = vld1_u8(&pu1_src[2 * i4_src_stride]);
    src3 = vld1_u8(&pu1_src[3 * i4_src_stride]);

    pred0 = vld1_u8(&pu1_pred[0 * i4_pred_stride]);
    pred1 = vld1_u8(&pu1_pred[1 * i4_pred_stride]);
    pred2 = vld1_u8(&pu1_pred[2 * i4_pred_stride]);
    pred3 = vld1_u8(&pu1_pred[3 * i4_pred_stride]);

    /* calculate res = src - pred */
    res0_16x8 = vsubl_u8(src0, pred0);
    res1_16x8 = vsubl_u8(src1, pred1);
    res2_16x8 = vsubl_u8(src2, pred2);
    res3_16x8 = vsubl_u8(src3, pred3);

    res0_16x4 = vreinterpret_s16_u16(vget_low_u16(res0_16x8));
    res1_16x4 = vreinterpret_s16_u16(vget_low_u16(res1_16x8));
    res2_16x4 = vreinterpret_s16_u16(vget_low_u16(res2_16x8));
    res3_16x4 = vreinterpret_s16_u16(vget_low_u16(res3_16x8));

    /* Perform Forward transform */
    /*-------------------------------------------------------------*/
    /* DCT [ Horizontal transformation ]                           */
    /*-------------------------------------------------------------*/
    /* Matrix transpose */
    /*
     *  a0 a1 a2 a3
     *  b0 b1 b2 b3
     *  c0 c1 c2 c3
     *  d0 d1 d2 d3
     */

    xx0_16x4x2 = vtrn_s16(res0_16x4, res1_16x4);
    xx1_16x4x2 = vtrn_s16(res2_16x4, res3_16x4);
    x0_32x2x2 =
        vtrn_s32(vreinterpret_s32_s16(xx0_16x4x2.val[0]), vreinterpret_s32_s16(xx1_16x4x2.val[0]));
    x1_32x2x2 =
        vtrn_s32(vreinterpret_s32_s16(xx0_16x4x2.val[1]), vreinterpret_s32_s16(xx1_16x4x2.val[1]));

    x0_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[0]);
    x1_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[0]);
    x2_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[1]);
    x3_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[1]);

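    /*
     * The butterfly stages below compute one 1-D pass of the H.264 4x4
     * forward core transform, i.e. multiplying each 4-point vector by
     *   Cf = |  1  1  1  1 |
     *        |  2  1 -1 -2 |
     *        |  1 -1 -1  1 |
     *        |  1 -2  2 -1 |
     * Transposing first lets the pass be computed with plain vector
     * adds/subs, one row (and, after the second transpose, one column)
     * per lane.
     */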
    xx0_16x4 = vadd_s16(x0_16x4, x3_16x4);
    xx1_16x4 = vadd_s16(x1_16x4, x2_16x4);
    xx2_16x4 = vsub_s16(x1_16x4, x2_16x4);
    xx3_16x4 = vsub_s16(x0_16x4, x3_16x4);

    x0_16x4 = vadd_s16(xx0_16x4, xx1_16x4);
    temp0_16x4 = vshl_n_s16(xx3_16x4, 1);
    x1_16x4 = vadd_s16(xx2_16x4, temp0_16x4);

    x2_16x4 = vsub_s16(xx0_16x4, xx1_16x4);
    temp0_16x4 = vshl_n_s16(xx2_16x4, 1);
    x3_16x4 = vsub_s16(xx3_16x4, temp0_16x4);

    /* Matrix transpose */
    /*
     *  a0 b0 c0 d0
     *  a1 b1 c1 d1
     *  a2 b2 c2 d2
     *  a3 b3 c3 d3
     */

    xx0_16x4x2 = vtrn_s16(x0_16x4, x1_16x4);
    xx1_16x4x2 = vtrn_s16(x2_16x4, x3_16x4);
    x0_32x2x2 =
        vtrn_s32(vreinterpret_s32_s16(xx0_16x4x2.val[0]), vreinterpret_s32_s16(xx1_16x4x2.val[0]));
    x1_32x2x2 =
        vtrn_s32(vreinterpret_s32_s16(xx0_16x4x2.val[1]), vreinterpret_s32_s16(xx1_16x4x2.val[1]));

    x0_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[0]);
    x1_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[0]);
    x2_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[1]);
    x3_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[1]);

    /* Vertical Transformation */

    xx0_16x4 = vadd_s16(x0_16x4, x3_16x4);
    xx1_16x4 = vadd_s16(x1_16x4, x2_16x4);
    xx2_16x4 = vsub_s16(x1_16x4, x2_16x4);
    xx3_16x4 = vsub_s16(x0_16x4, x3_16x4);

    x0_16x4 = vadd_s16(xx0_16x4, xx1_16x4);
    temp0_16x4 = vshl_n_s16(xx3_16x4, 1);
    x1_16x4 = vadd_s16(temp0_16x4, xx2_16x4);

    x2_16x4 = vsub_s16(xx0_16x4, xx1_16x4);
    temp0_16x4 = vshl_n_s16(xx2_16x4, 1);
    x3_16x4 = vsub_s16(xx3_16x4, temp0_16x4);

    /* extract the DC coefficient (lane 0 of x0) */
    *pi2_dc_out = vget_lane_s16(x0_16x4, 0);

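    /*
     * Quantization: roughly the scalar equivalent of the vector code below,
     * per coefficient i,
     *
     *     sign  = (coef > 0) ? 1 : -1;
     *     level = (abs(coef) * pu2_scale_matrix[i] + u4_round_factor) >> u4_qbits;
     *     out   = (abs(coef) < pu2_threshold_matrix[i]) ? 0 : sign * level;
     *
     * The vector version builds the sign and threshold masks first, widens to
     * 32 bit for the multiply and rounding add, shifts down by u4_qbits,
     * restores the sign with vbsl, and finally applies the threshold mask.
     */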
    xx0_16x4 = vabs_s16(x0_16x4);
    xx1_16x4 = vabs_s16(x1_16x4);
    xx2_16x4 = vabs_s16(x2_16x4);
    xx3_16x4 = vabs_s16(x3_16x4);

    /* compare with zero for getting sign */
    temp0_u16x4 = vcgt_s16(x0_16x4, zeros_16x4);
    temp1_u16x4 = vcgt_s16(x1_16x4, zeros_16x4);
    temp2_u16x4 = vcgt_s16(x2_16x4, zeros_16x4);
    temp3_u16x4 = vcgt_s16(x3_16x4, zeros_16x4);

    /* compare abs values against the threshold matrix */
    thresholdmask0_16x4 = vcgt_s16(vreinterpret_s16_u16(threshold0_16x4), xx0_16x4);
    thresholdmask1_16x4 = vcgt_s16(vreinterpret_s16_u16(threshold1_16x4), xx1_16x4);
    thresholdmask2_16x4 = vcgt_s16(vreinterpret_s16_u16(threshold2_16x4), xx2_16x4);
    thresholdmask3_16x4 = vcgt_s16(vreinterpret_s16_u16(threshold3_16x4), xx3_16x4);

    /* Multiply abs values obtained with scaling matrix */
    tx0_32x4 = vmull_s16(xx0_16x4, vreinterpret_s16_u16(scale_mat0_16x4));
    tx1_32x4 = vmull_s16(xx1_16x4, vreinterpret_s16_u16(scale_mat1_16x4));
    tx2_32x4 = vmull_s16(xx2_16x4, vreinterpret_s16_u16(scale_mat2_16x4));
    tx3_32x4 = vmull_s16(xx3_16x4, vreinterpret_s16_u16(scale_mat3_16x4));

    tx0_32x4 = vaddq_s32(tx0_32x4, rnd_factor_32x4);
    tx1_32x4 = vaddq_s32(tx1_32x4, rnd_factor_32x4);
    tx2_32x4 = vaddq_s32(tx2_32x4, rnd_factor_32x4);
    tx3_32x4 = vaddq_s32(tx3_32x4, rnd_factor_32x4);

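    /* NEON has no variable-count right-shift intrinsic, so the shift count is
       negated and vshlq_s32 (which shifts right for negative counts) is used
       to divide by 2^u4_qbits */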
    qbits_32x4 = vnegq_s32(qbits_32x4);

    tx0_32x4 = vshlq_s32(tx0_32x4, qbits_32x4);
    tx1_32x4 = vshlq_s32(tx1_32x4, qbits_32x4);
    tx2_32x4 = vshlq_s32(tx2_32x4, qbits_32x4);
    tx3_32x4 = vshlq_s32(tx3_32x4, qbits_32x4);

    /* Conversion to 16 bit signed */
    temp0_16x4 = vmovn_s32(tx0_32x4);
    temp1_16x4 = vmovn_s32(tx1_32x4);
    temp2_16x4 = vmovn_s32(tx2_32x4);
    temp3_16x4 = vmovn_s32(tx3_32x4);

    x0_16x4 = vneg_s16(temp0_16x4);
    x1_16x4 = vneg_s16(temp1_16x4);
    x2_16x4 = vneg_s16(temp2_16x4);
    x3_16x4 = vneg_s16(temp3_16x4);

    /* Restore sign */
    x0_16x4 = vbsl_s16(temp0_u16x4, temp0_16x4, x0_16x4);
    x1_16x4 = vbsl_s16(temp1_u16x4, temp1_16x4, x1_16x4);
    x2_16x4 = vbsl_s16(temp2_u16x4, temp2_16x4, x2_16x4);
    x3_16x4 = vbsl_s16(temp3_u16x4, temp3_16x4, x3_16x4);

    /* Thresholding */
    xx0_16x4 = vbsl_s16(thresholdmask0_16x4, zeros_16x4, x0_16x4);
    xx1_16x4 = vbsl_s16(thresholdmask1_16x4, zeros_16x4, x1_16x4);
    xx2_16x4 = vbsl_s16(thresholdmask2_16x4, zeros_16x4, x2_16x4);
    xx3_16x4 = vbsl_s16(thresholdmask3_16x4, zeros_16x4, x3_16x4);

    /* Store Quantized outputs */
    vst1_s16(&pi2_out[0 * i4_out_stride], xx0_16x4);
    vst1_s16(&pi2_out[1 * i4_out_stride], xx1_16x4);
    vst1_s16(&pi2_out[2 * i4_out_stride], xx2_16x4);
    vst1_s16(&pi2_out[3 * i4_out_stride], xx3_16x4);

    /* NNZ calculation */
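    /*
     * Each vceq lane is 0xFFFF where the quantized level is zero. Narrowing
     * to bytes and shifting right by 7 turns every zero coefficient into a 1;
     * the add and the pairwise-add chain then accumulate the count of zero
     * coefficients into lane 0, so nnz = 16 - (number of zeros).
     */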

    temp0_u16x4 = vceq_s16(xx0_16x4, zeros_16x4);
    temp1_u16x4 = vceq_s16(xx1_16x4, zeros_16x4);
    temp2_u16x4 = vceq_s16(xx2_16x4, zeros_16x4);
    temp3_u16x4 = vceq_s16(xx3_16x4, zeros_16x4);

    temp0_u16x8 = vcombine_u16(temp0_u16x4, temp2_u16x4);
    temp1_u16x8 = vcombine_u16(temp1_u16x4, temp3_u16x4);

    /* Conversion to 8 bit unsigned */
    temp0_u8x8 = vmovn_u16(temp0_u16x8);
    temp1_u8x8 = vmovn_u16(temp1_u16x8);

    temp0_u8x8 = vshr_n_u8(temp0_u8x8, 7);
    temp1_u8x8 = vshr_n_u8(temp1_u8x8, 7);

    temp0_u8x8 = vadd_u8(temp0_u8x8, temp1_u8x8);
    temp0_u8x8 = vpadd_u8(temp0_u8x8, temp1_u8x8);
    temp0_u8x8 = vpadd_u8(temp0_u8x8, temp1_u8x8);
    temp0_u8x8 = vpadd_u8(temp0_u8x8, temp1_u8x8);

    *pu1_nnz = 16 - vget_lane_u8(temp0_u8x8, 0);
}

void isvc_resi_trans_quant_4x4_with_residual_sub_neon(
    buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_out,
    buffer_container_t *ps_upsampled_res, resi_trans_quant_constants_t *ps_quant_constants,
    UWORD8 *pu1_nnz, WORD16 *pi2_dc_out, UWORD8 u1_use_upsampled_res)
{
    UWORD8 *pu1_src = (UWORD8 *) ps_src->pv_data;
    UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
    WORD16 *pi2_out = (WORD16 *) ps_out->pv_data;
    WORD16 *pi2_upsampled_res = ps_upsampled_res ? (WORD16 *) ps_upsampled_res->pv_data : NULL;
    WORD32 i4_src_stride = ps_src->i4_data_stride;
    WORD32 i4_pred_stride = ps_pred->i4_data_stride;
    WORD32 i4_out_stride = ps_out->i4_data_stride;
    WORD32 i4_upsampled_res_stride = ps_upsampled_res ? ps_upsampled_res->i4_data_stride : 0;
    const UWORD16 *pu2_scale_matrix = ps_quant_constants->pu2_scale_matrix;
    const UWORD16 *pu2_threshold_matrix = ps_quant_constants->pu2_threshold_matrix;
    UWORD32 u4_qbits = ps_quant_constants->u4_qbits;
    UWORD32 u4_round_factor = ps_quant_constants->u4_round_factor;

    uint8x8_t src0, src1, src2, src3;
    uint8x8_t pred0, pred1, pred2, pred3;
    uint8x8_t temp0_u8x8, temp1_u8x8;
    uint16x4_t temp0_u16x4, temp1_u16x4, temp2_u16x4, temp3_u16x4;
    uint16x4_t scale_mat0_16x4, scale_mat1_16x4, scale_mat2_16x4, scale_mat3_16x4;
    uint16x4_t threshold0_16x4, threshold1_16x4, threshold2_16x4, threshold3_16x4;
    uint16x4_t thresholdmask0_16x4, thresholdmask1_16x4, thresholdmask2_16x4, thresholdmask3_16x4;
    int16x4_t upres0_16x4, upres1_16x4, upres2_16x4, upres3_16x4;
    int16x4_t res0_16x4, res1_16x4, res2_16x4, res3_16x4;
    int16x4_t x0_16x4, x1_16x4, x2_16x4, x3_16x4;
    int16x4_t xx0_16x4, xx1_16x4, xx2_16x4, xx3_16x4;
    int16x4x2_t xx0_16x4x2, xx1_16x4x2;
    int16x4_t temp0_16x4, temp1_16x4, temp2_16x4, temp3_16x4;
    uint16x8_t res0_16x8, res1_16x8, res2_16x8, res3_16x8;
    uint16x8_t temp0_u16x8, temp1_u16x8;
    int32x2x2_t x0_32x2x2, x1_32x2x2;
    int32x4_t tx0_32x4, tx1_32x4, tx2_32x4, tx3_32x4;

    int32x4_t rnd_factor_32x4 = vdupq_n_s32(u4_round_factor);
    int32x4_t qbits_32x4 = vdupq_n_s32(u4_qbits);
    int16x4_t zeros_16x4 = vdup_n_s16(0);
    int16x4_t pos_255_16x4 = vdup_n_s16(((WORD16) UINT8_MAX));
    int16x4_t neg_255_16x4 = vdup_n_s16(-((WORD16) UINT8_MAX));

    UNUSED(u1_use_upsampled_res);

    threshold0_16x4 = vld1_u16(pu2_threshold_matrix);
    threshold1_16x4 = vld1_u16(pu2_threshold_matrix + 4);
    threshold2_16x4 = vld1_u16(pu2_threshold_matrix + 8);
    threshold3_16x4 = vld1_u16(pu2_threshold_matrix + 12);

    scale_mat0_16x4 = vld1_u16(pu2_scale_matrix);
    scale_mat1_16x4 = vld1_u16(pu2_scale_matrix + 4);
    scale_mat2_16x4 = vld1_u16(pu2_scale_matrix + 8);
    scale_mat3_16x4 = vld1_u16(pu2_scale_matrix + 12);

    src0 = vld1_u8(&pu1_src[0 * i4_src_stride]);
    src1 = vld1_u8(&pu1_src[1 * i4_src_stride]);
    src2 = vld1_u8(&pu1_src[2 * i4_src_stride]);
    src3 = vld1_u8(&pu1_src[3 * i4_src_stride]);

    pred0 = vld1_u8(&pu1_pred[0 * i4_pred_stride]);
    pred1 = vld1_u8(&pu1_pred[1 * i4_pred_stride]);
    pred2 = vld1_u8(&pu1_pred[2 * i4_pred_stride]);
    pred3 = vld1_u8(&pu1_pred[3 * i4_pred_stride]);

    /* calculate res = src - pred */
    res0_16x8 = vsubl_u8(src0, pred0);
    res1_16x8 = vsubl_u8(src1, pred1);
    res2_16x8 = vsubl_u8(src2, pred2);
    res3_16x8 = vsubl_u8(src3, pred3);

    res0_16x4 = vreinterpret_s16_u16(vget_low_u16(res0_16x8));
    res1_16x4 = vreinterpret_s16_u16(vget_low_u16(res1_16x8));
    res2_16x4 = vreinterpret_s16_u16(vget_low_u16(res2_16x8));
    res3_16x4 = vreinterpret_s16_u16(vget_low_u16(res3_16x8));

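    /*
     * SVC inter-layer residual prediction: the upsampled reference-layer
     * residual is subtracted from (src - pred) so that only the difference is
     * transformed and coded. The result is then clamped to [-255, 255] so it
     * stays within the range of a pixel-domain residual.
     */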
    /* Load upsampled res */
    upres0_16x4 = vld1_s16(&pi2_upsampled_res[0 * i4_upsampled_res_stride]);
    upres1_16x4 = vld1_s16(&pi2_upsampled_res[1 * i4_upsampled_res_stride]);
    upres2_16x4 = vld1_s16(&pi2_upsampled_res[2 * i4_upsampled_res_stride]);
    upres3_16x4 = vld1_s16(&pi2_upsampled_res[3 * i4_upsampled_res_stride]);

    /* subtract upsampled res from (src - pred) to obtain final res */
    res0_16x4 = vsub_s16(res0_16x4, upres0_16x4);
    res1_16x4 = vsub_s16(res1_16x4, upres1_16x4);
    res2_16x4 = vsub_s16(res2_16x4, upres2_16x4);
    res3_16x4 = vsub_s16(res3_16x4, upres3_16x4);

    /* Saturate all values < -255 to -255 and retain the rest as it is */
    res0_16x4 = vmax_s16(res0_16x4, neg_255_16x4);
    res1_16x4 = vmax_s16(res1_16x4, neg_255_16x4);
    res2_16x4 = vmax_s16(res2_16x4, neg_255_16x4);
    res3_16x4 = vmax_s16(res3_16x4, neg_255_16x4);

    /* Saturate all values > 255 to 255 and retain the rest as it is */
    res0_16x4 = vmin_s16(res0_16x4, pos_255_16x4);
    res1_16x4 = vmin_s16(res1_16x4, pos_255_16x4);
    res2_16x4 = vmin_s16(res2_16x4, pos_255_16x4);
    res3_16x4 = vmin_s16(res3_16x4, pos_255_16x4);

    /* Perform Forward transform */
    /*-------------------------------------------------------------*/
    /* DCT [ Horizontal transformation ]                           */
    /*-------------------------------------------------------------*/
    /* Matrix transpose */
    /*
     *  a0 a1 a2 a3
     *  b0 b1 b2 b3
     *  c0 c1 c2 c3
     *  d0 d1 d2 d3
     */

    xx0_16x4x2 = vtrn_s16(res0_16x4, res1_16x4);
    xx1_16x4x2 = vtrn_s16(res2_16x4, res3_16x4);
    x0_32x2x2 =
        vtrn_s32(vreinterpret_s32_s16(xx0_16x4x2.val[0]), vreinterpret_s32_s16(xx1_16x4x2.val[0]));
    x1_32x2x2 =
        vtrn_s32(vreinterpret_s32_s16(xx0_16x4x2.val[1]), vreinterpret_s32_s16(xx1_16x4x2.val[1]));

    x0_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[0]);
    x1_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[0]);
    x2_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[1]);
    x3_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[1]);

    xx0_16x4 = vadd_s16(x0_16x4, x3_16x4);
    xx1_16x4 = vadd_s16(x1_16x4, x2_16x4);
    xx2_16x4 = vsub_s16(x1_16x4, x2_16x4);
    xx3_16x4 = vsub_s16(x0_16x4, x3_16x4);

    x0_16x4 = vadd_s16(xx0_16x4, xx1_16x4);
    temp0_16x4 = vshl_n_s16(xx3_16x4, 1);
    x1_16x4 = vadd_s16(xx2_16x4, temp0_16x4);

    x2_16x4 = vsub_s16(xx0_16x4, xx1_16x4);
    temp0_16x4 = vshl_n_s16(xx2_16x4, 1);
    x3_16x4 = vsub_s16(xx3_16x4, temp0_16x4);

    /* Matrix transpose */
    /*
     *  a0 b0 c0 d0
     *  a1 b1 c1 d1
     *  a2 b2 c2 d2
     *  a3 b3 c3 d3
     */

    xx0_16x4x2 = vtrn_s16(x0_16x4, x1_16x4);
    xx1_16x4x2 = vtrn_s16(x2_16x4, x3_16x4);
    x0_32x2x2 =
        vtrn_s32(vreinterpret_s32_s16(xx0_16x4x2.val[0]), vreinterpret_s32_s16(xx1_16x4x2.val[0]));
    x1_32x2x2 =
        vtrn_s32(vreinterpret_s32_s16(xx0_16x4x2.val[1]), vreinterpret_s32_s16(xx1_16x4x2.val[1]));

    x0_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[0]);
    x1_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[0]);
    x2_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[1]);
    x3_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[1]);

    /* Vertical Transformation */

    xx0_16x4 = vadd_s16(x0_16x4, x3_16x4);
    xx1_16x4 = vadd_s16(x1_16x4, x2_16x4);
    xx2_16x4 = vsub_s16(x1_16x4, x2_16x4);
    xx3_16x4 = vsub_s16(x0_16x4, x3_16x4);

    x0_16x4 = vadd_s16(xx0_16x4, xx1_16x4);
    temp0_16x4 = vshl_n_s16(xx3_16x4, 1);
    x1_16x4 = vadd_s16(temp0_16x4, xx2_16x4);

    x2_16x4 = vsub_s16(xx0_16x4, xx1_16x4);
    temp0_16x4 = vshl_n_s16(xx2_16x4, 1);
    x3_16x4 = vsub_s16(xx3_16x4, temp0_16x4);

    /* extract the DC coefficient (lane 0 of x0) */
    *pi2_dc_out = vget_lane_s16(x0_16x4, 0);

    xx0_16x4 = vabs_s16(x0_16x4);
    xx1_16x4 = vabs_s16(x1_16x4);
    xx2_16x4 = vabs_s16(x2_16x4);
    xx3_16x4 = vabs_s16(x3_16x4);

    /* compare with zero for getting sign */
    temp0_u16x4 = vcgt_s16(x0_16x4, zeros_16x4);
    temp1_u16x4 = vcgt_s16(x1_16x4, zeros_16x4);
    temp2_u16x4 = vcgt_s16(x2_16x4, zeros_16x4);
    temp3_u16x4 = vcgt_s16(x3_16x4, zeros_16x4);

    /* compare abs values against the threshold matrix */
    thresholdmask0_16x4 = vcgt_s16(vreinterpret_s16_u16(threshold0_16x4), xx0_16x4);
    thresholdmask1_16x4 = vcgt_s16(vreinterpret_s16_u16(threshold1_16x4), xx1_16x4);
    thresholdmask2_16x4 = vcgt_s16(vreinterpret_s16_u16(threshold2_16x4), xx2_16x4);
    thresholdmask3_16x4 = vcgt_s16(vreinterpret_s16_u16(threshold3_16x4), xx3_16x4);

    /* Multiply abs values obtained with scaling matrix */
    tx0_32x4 = vmull_s16(xx0_16x4, vreinterpret_s16_u16(scale_mat0_16x4));
    tx1_32x4 = vmull_s16(xx1_16x4, vreinterpret_s16_u16(scale_mat1_16x4));
    tx2_32x4 = vmull_s16(xx2_16x4, vreinterpret_s16_u16(scale_mat2_16x4));
    tx3_32x4 = vmull_s16(xx3_16x4, vreinterpret_s16_u16(scale_mat3_16x4));

    tx0_32x4 = vaddq_s32(tx0_32x4, rnd_factor_32x4);
    tx1_32x4 = vaddq_s32(tx1_32x4, rnd_factor_32x4);
    tx2_32x4 = vaddq_s32(tx2_32x4, rnd_factor_32x4);
    tx3_32x4 = vaddq_s32(tx3_32x4, rnd_factor_32x4);

    qbits_32x4 = vnegq_s32(qbits_32x4);

    tx0_32x4 = vshlq_s32(tx0_32x4, qbits_32x4);
    tx1_32x4 = vshlq_s32(tx1_32x4, qbits_32x4);
    tx2_32x4 = vshlq_s32(tx2_32x4, qbits_32x4);
    tx3_32x4 = vshlq_s32(tx3_32x4, qbits_32x4);

    /* Conversion to 16 bit signed */
    temp0_16x4 = vmovn_s32(tx0_32x4);
    temp1_16x4 = vmovn_s32(tx1_32x4);
    temp2_16x4 = vmovn_s32(tx2_32x4);
    temp3_16x4 = vmovn_s32(tx3_32x4);

    x0_16x4 = vneg_s16(temp0_16x4);
    x1_16x4 = vneg_s16(temp1_16x4);
    x2_16x4 = vneg_s16(temp2_16x4);
    x3_16x4 = vneg_s16(temp3_16x4);

    /* Restore sign */
    x0_16x4 = vbsl_s16(temp0_u16x4, temp0_16x4, x0_16x4);
    x1_16x4 = vbsl_s16(temp1_u16x4, temp1_16x4, x1_16x4);
    x2_16x4 = vbsl_s16(temp2_u16x4, temp2_16x4, x2_16x4);
    x3_16x4 = vbsl_s16(temp3_u16x4, temp3_16x4, x3_16x4);

    /* Thresholding */
    xx0_16x4 = vbsl_s16(thresholdmask0_16x4, zeros_16x4, x0_16x4);
    xx1_16x4 = vbsl_s16(thresholdmask1_16x4, zeros_16x4, x1_16x4);
    xx2_16x4 = vbsl_s16(thresholdmask2_16x4, zeros_16x4, x2_16x4);
    xx3_16x4 = vbsl_s16(thresholdmask3_16x4, zeros_16x4, x3_16x4);

    /* Store Quantized outputs */
    vst1_s16(&pi2_out[0 * i4_out_stride], xx0_16x4);
    vst1_s16(&pi2_out[1 * i4_out_stride], xx1_16x4);
    vst1_s16(&pi2_out[2 * i4_out_stride], xx2_16x4);
    vst1_s16(&pi2_out[3 * i4_out_stride], xx3_16x4);

    /* NNZ calculation */

    temp0_u16x4 = vceq_s16(xx0_16x4, zeros_16x4);
    temp1_u16x4 = vceq_s16(xx1_16x4, zeros_16x4);
    temp2_u16x4 = vceq_s16(xx2_16x4, zeros_16x4);
    temp3_u16x4 = vceq_s16(xx3_16x4, zeros_16x4);

    temp0_u16x8 = vcombine_u16(temp0_u16x4, temp2_u16x4);
    temp1_u16x8 = vcombine_u16(temp1_u16x4, temp3_u16x4);

    /* Conversion to 8 bit unsigned */
    temp0_u8x8 = vmovn_u16(temp0_u16x8);
    temp1_u8x8 = vmovn_u16(temp1_u16x8);

    temp0_u8x8 = vshr_n_u8(temp0_u8x8, 7);
    temp1_u8x8 = vshr_n_u8(temp1_u8x8, 7);

    temp0_u8x8 = vadd_u8(temp0_u8x8, temp1_u8x8);
    temp0_u8x8 = vpadd_u8(temp0_u8x8, temp1_u8x8);
    temp0_u8x8 = vpadd_u8(temp0_u8x8, temp1_u8x8);
    temp0_u8x8 = vpadd_u8(temp0_u8x8, temp1_u8x8);

    *pu1_nnz = 16 - vget_lane_u8(temp0_u8x8, 0);
}

void isvc_resi_trans_quant_chroma_4x4_neon(buffer_container_t *ps_src, buffer_container_t *ps_pred,
                                           buffer_container_t *ps_out,
                                           buffer_container_t *ps_upsampled_res,
                                           resi_trans_quant_constants_t *ps_quant_constants,
                                           UWORD8 *pu1_nnz, WORD16 *pi2_dc_out,
                                           UWORD8 u1_use_upsampled_res)
{
    UWORD8 *pu1_src = (UWORD8 *) ps_src->pv_data;
    UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
    WORD16 *pi2_out = (WORD16 *) ps_out->pv_data;
    WORD32 i4_src_stride = ps_src->i4_data_stride;
    WORD32 i4_pred_stride = ps_pred->i4_data_stride;
    WORD32 i4_out_stride = ps_out->i4_data_stride;
    const UWORD16 *pu2_scale_matrix = ps_quant_constants->pu2_scale_matrix;
    const UWORD16 *pu2_threshold_matrix = ps_quant_constants->pu2_threshold_matrix;
    UWORD32 u4_qbits = ps_quant_constants->u4_qbits;
    UWORD32 u4_round_factor = ps_quant_constants->u4_round_factor;

    uint8x8_t src0, src1, src2, src3;
    uint8x8_t pred0, pred1, pred2, pred3;
    uint8x8x2_t tmp0, tmp1, tmp2, tmp3;
    uint8x8_t temp0_u8x8, temp1_u8x8;
    uint16x4_t temp0_u16x4, temp1_u16x4, temp2_u16x4, temp3_u16x4;
    uint16x4_t scale_mat0_16x4, scale_mat1_16x4, scale_mat2_16x4, scale_mat3_16x4;
    uint16x4_t threshold0_16x4, threshold1_16x4, threshold2_16x4, threshold3_16x4;
    uint16x4_t thresholdmask0_16x4, thresholdmask1_16x4, thresholdmask2_16x4, thresholdmask3_16x4;
    int16x4_t res0_16x4, res1_16x4, res2_16x4, res3_16x4;
    int16x4_t x0_16x4, x1_16x4, x2_16x4, x3_16x4;
    int16x4_t xx0_16x4, xx1_16x4, xx2_16x4, xx3_16x4;
    int16x4x2_t xx0_16x4x2, xx1_16x4x2;
    int16x4_t temp0_16x4, temp1_16x4, temp2_16x4, temp3_16x4;
    uint16x8_t res0_16x8, res1_16x8, res2_16x8, res3_16x8;
    uint16x8_t temp0_u16x8, temp1_u16x8;
    int32x2x2_t x0_32x2x2, x1_32x2x2;
    int32x4_t tx0_32x4, tx1_32x4, tx2_32x4, tx3_32x4;

    int32x4_t rnd_factor_32x4 = vdupq_n_s32(u4_round_factor);
    int32x4_t qbits_32x4 = vdupq_n_s32(u4_qbits);
    int16x4_t zeros_16x4 = vdup_n_s16(0);

    UNUSED(ps_upsampled_res);
    UNUSED(u1_use_upsampled_res);

    threshold0_16x4 = vld1_u16(pu2_threshold_matrix);
    threshold1_16x4 = vld1_u16(pu2_threshold_matrix + 4);
    threshold2_16x4 = vld1_u16(pu2_threshold_matrix + 8);
    threshold3_16x4 = vld1_u16(pu2_threshold_matrix + 12);

    scale_mat0_16x4 = vld1_u16(pu2_scale_matrix);
    scale_mat1_16x4 = vld1_u16(pu2_scale_matrix + 4);
    scale_mat2_16x4 = vld1_u16(pu2_scale_matrix + 8);
    scale_mat3_16x4 = vld1_u16(pu2_scale_matrix + 12);

    src0 = vld1_u8(&pu1_src[0 * i4_src_stride]);
    src1 = vld1_u8(&pu1_src[1 * i4_src_stride]);
    src2 = vld1_u8(&pu1_src[2 * i4_src_stride]);
    src3 = vld1_u8(&pu1_src[3 * i4_src_stride]);

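    /*
     * Chroma samples are assumed to be interleaved (Cb/Cr pairs, as in
     * semi-planar layouts), so vuzp_u8 keeps every other byte; val[0] then
     * holds the samples of the selected plane, of which only the first four
     * are used by this 4x4 kernel.
     */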
    /* deinterleaving source buffer */
    tmp0 = vuzp_u8(src0, src0);
    tmp1 = vuzp_u8(src1, src1);
    tmp2 = vuzp_u8(src2, src2);
    tmp3 = vuzp_u8(src3, src3);

    src0 = tmp0.val[0];
    src1 = tmp1.val[0];
    src2 = tmp2.val[0];
    src3 = tmp3.val[0];

    pred0 = vld1_u8(&pu1_pred[0 * i4_pred_stride]);
    pred1 = vld1_u8(&pu1_pred[1 * i4_pred_stride]);
    pred2 = vld1_u8(&pu1_pred[2 * i4_pred_stride]);
    pred3 = vld1_u8(&pu1_pred[3 * i4_pred_stride]);

    /* deinterleaving pred buffer */
    tmp0 = vuzp_u8(pred0, pred0);
    tmp1 = vuzp_u8(pred1, pred1);
    tmp2 = vuzp_u8(pred2, pred2);
    tmp3 = vuzp_u8(pred3, pred3);

    pred0 = tmp0.val[0];
    pred1 = tmp1.val[0];
    pred2 = tmp2.val[0];
    pred3 = tmp3.val[0];

    /* calculate res = src - pred */
    res0_16x8 = vsubl_u8(src0, pred0);
    res1_16x8 = vsubl_u8(src1, pred1);
    res2_16x8 = vsubl_u8(src2, pred2);
    res3_16x8 = vsubl_u8(src3, pred3);

    res0_16x4 = vreinterpret_s16_u16(vget_low_u16(res0_16x8));
    res1_16x4 = vreinterpret_s16_u16(vget_low_u16(res1_16x8));
    res2_16x4 = vreinterpret_s16_u16(vget_low_u16(res2_16x8));
    res3_16x4 = vreinterpret_s16_u16(vget_low_u16(res3_16x8));

    /* Perform Forward transform */
    /*-------------------------------------------------------------*/
    /* DCT [ Horizontal transformation ]                           */
    /*-------------------------------------------------------------*/
    /* Matrix transpose */
    /*
     *  a0 a1 a2 a3
     *  b0 b1 b2 b3
     *  c0 c1 c2 c3
     *  d0 d1 d2 d3
     */

    xx0_16x4x2 = vtrn_s16(res0_16x4, res1_16x4);
    xx1_16x4x2 = vtrn_s16(res2_16x4, res3_16x4);
    x0_32x2x2 =
        vtrn_s32(vreinterpret_s32_s16(xx0_16x4x2.val[0]), vreinterpret_s32_s16(xx1_16x4x2.val[0]));
    x1_32x2x2 =
        vtrn_s32(vreinterpret_s32_s16(xx0_16x4x2.val[1]), vreinterpret_s32_s16(xx1_16x4x2.val[1]));

    x0_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[0]);
    x1_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[0]);
    x2_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[1]);
    x3_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[1]);

    xx0_16x4 = vadd_s16(x0_16x4, x3_16x4);
    xx1_16x4 = vadd_s16(x1_16x4, x2_16x4);
    xx2_16x4 = vsub_s16(x1_16x4, x2_16x4);
    xx3_16x4 = vsub_s16(x0_16x4, x3_16x4);

    x0_16x4 = vadd_s16(xx0_16x4, xx1_16x4);
    temp0_16x4 = vshl_n_s16(xx3_16x4, 1);
    x1_16x4 = vadd_s16(xx2_16x4, temp0_16x4);

    x2_16x4 = vsub_s16(xx0_16x4, xx1_16x4);
    temp0_16x4 = vshl_n_s16(xx2_16x4, 1);
    x3_16x4 = vsub_s16(xx3_16x4, temp0_16x4);

    /* Matrix transpose */
    /*
     *  a0 b0 c0 d0
     *  a1 b1 c1 d1
     *  a2 b2 c2 d2
     *  a3 b3 c3 d3
     */

    xx0_16x4x2 = vtrn_s16(x0_16x4, x1_16x4);
    xx1_16x4x2 = vtrn_s16(x2_16x4, x3_16x4);
    x0_32x2x2 =
        vtrn_s32(vreinterpret_s32_s16(xx0_16x4x2.val[0]), vreinterpret_s32_s16(xx1_16x4x2.val[0]));
    x1_32x2x2 =
        vtrn_s32(vreinterpret_s32_s16(xx0_16x4x2.val[1]), vreinterpret_s32_s16(xx1_16x4x2.val[1]));

    x0_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[0]);
    x1_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[0]);
    x2_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[1]);
    x3_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[1]);

    /* Vertical Transformation */

    xx0_16x4 = vadd_s16(x0_16x4, x3_16x4);
    xx1_16x4 = vadd_s16(x1_16x4, x2_16x4);
    xx2_16x4 = vsub_s16(x1_16x4, x2_16x4);
    xx3_16x4 = vsub_s16(x0_16x4, x3_16x4);

    x0_16x4 = vadd_s16(xx0_16x4, xx1_16x4);
    temp0_16x4 = vshl_n_s16(xx3_16x4, 1);
    x1_16x4 = vadd_s16(temp0_16x4, xx2_16x4);

    x2_16x4 = vsub_s16(xx0_16x4, xx1_16x4);
    temp0_16x4 = vshl_n_s16(xx2_16x4, 1);
    x3_16x4 = vsub_s16(xx3_16x4, temp0_16x4);

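    /* pi2_dc_out receives the unquantized DC term; for chroma, the caller is
       expected to gather the DC terms of the group of 4x4 blocks and run the
       separate 2x2 DC transform and quantization on them. */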
    /* extract the DC coefficient (lane 0 of x0) */
    *pi2_dc_out = vget_lane_s16(x0_16x4, 0);

    xx0_16x4 = vabs_s16(x0_16x4);
    xx1_16x4 = vabs_s16(x1_16x4);
    xx2_16x4 = vabs_s16(x2_16x4);
    xx3_16x4 = vabs_s16(x3_16x4);

    /* compare with zero for getting sign */
    temp0_u16x4 = vcgt_s16(x0_16x4, zeros_16x4);
    temp1_u16x4 = vcgt_s16(x1_16x4, zeros_16x4);
    temp2_u16x4 = vcgt_s16(x2_16x4, zeros_16x4);
    temp3_u16x4 = vcgt_s16(x3_16x4, zeros_16x4);

    /* compare abs values against the threshold matrix */
    thresholdmask0_16x4 = vcgt_s16(vreinterpret_s16_u16(threshold0_16x4), xx0_16x4);
    thresholdmask1_16x4 = vcgt_s16(vreinterpret_s16_u16(threshold1_16x4), xx1_16x4);
    thresholdmask2_16x4 = vcgt_s16(vreinterpret_s16_u16(threshold2_16x4), xx2_16x4);
    thresholdmask3_16x4 = vcgt_s16(vreinterpret_s16_u16(threshold3_16x4), xx3_16x4);

    /* Multiply abs values obtained with scaling matrix */
    tx0_32x4 = vmull_s16(xx0_16x4, vreinterpret_s16_u16(scale_mat0_16x4));
    tx1_32x4 = vmull_s16(xx1_16x4, vreinterpret_s16_u16(scale_mat1_16x4));
    tx2_32x4 = vmull_s16(xx2_16x4, vreinterpret_s16_u16(scale_mat2_16x4));
    tx3_32x4 = vmull_s16(xx3_16x4, vreinterpret_s16_u16(scale_mat3_16x4));

    tx0_32x4 = vaddq_s32(tx0_32x4, rnd_factor_32x4);
    tx1_32x4 = vaddq_s32(tx1_32x4, rnd_factor_32x4);
    tx2_32x4 = vaddq_s32(tx2_32x4, rnd_factor_32x4);
    tx3_32x4 = vaddq_s32(tx3_32x4, rnd_factor_32x4);

    qbits_32x4 = vnegq_s32(qbits_32x4);

    tx0_32x4 = vshlq_s32(tx0_32x4, qbits_32x4);
    tx1_32x4 = vshlq_s32(tx1_32x4, qbits_32x4);
    tx2_32x4 = vshlq_s32(tx2_32x4, qbits_32x4);
    tx3_32x4 = vshlq_s32(tx3_32x4, qbits_32x4);

    /* Conversion to 16 bit signed */
    temp0_16x4 = vmovn_s32(tx0_32x4);
    temp1_16x4 = vmovn_s32(tx1_32x4);
    temp2_16x4 = vmovn_s32(tx2_32x4);
    temp3_16x4 = vmovn_s32(tx3_32x4);

    x0_16x4 = vneg_s16(temp0_16x4);
    x1_16x4 = vneg_s16(temp1_16x4);
    x2_16x4 = vneg_s16(temp2_16x4);
    x3_16x4 = vneg_s16(temp3_16x4);

    /* Restore sign */
    x0_16x4 = vbsl_s16(temp0_u16x4, temp0_16x4, x0_16x4);
    x1_16x4 = vbsl_s16(temp1_u16x4, temp1_16x4, x1_16x4);
    x2_16x4 = vbsl_s16(temp2_u16x4, temp2_16x4, x2_16x4);
    x3_16x4 = vbsl_s16(temp3_u16x4, temp3_16x4, x3_16x4);

    /* Thresholding */
    xx0_16x4 = vbsl_s16(thresholdmask0_16x4, zeros_16x4, x0_16x4);
    xx1_16x4 = vbsl_s16(thresholdmask1_16x4, zeros_16x4, x1_16x4);
    xx2_16x4 = vbsl_s16(thresholdmask2_16x4, zeros_16x4, x2_16x4);
    xx3_16x4 = vbsl_s16(thresholdmask3_16x4, zeros_16x4, x3_16x4);

    /* Store Quantized outputs */
    vst1_s16(&pi2_out[0 * i4_out_stride], xx0_16x4);
    vst1_s16(&pi2_out[1 * i4_out_stride], xx1_16x4);
    vst1_s16(&pi2_out[2 * i4_out_stride], xx2_16x4);
    vst1_s16(&pi2_out[3 * i4_out_stride], xx3_16x4);

    /* NNZ calculation */

    temp0_u16x4 = vceq_s16(xx0_16x4, zeros_16x4);
    temp1_u16x4 = vceq_s16(xx1_16x4, zeros_16x4);
    temp2_u16x4 = vceq_s16(xx2_16x4, zeros_16x4);
    temp3_u16x4 = vceq_s16(xx3_16x4, zeros_16x4);

    temp0_u16x8 = vcombine_u16(temp0_u16x4, temp2_u16x4);
    temp1_u16x8 = vcombine_u16(temp1_u16x4, temp3_u16x4);

    /* Conversion to 8 bit unsigned */
    temp0_u8x8 = vmovn_u16(temp0_u16x8);
    temp1_u8x8 = vmovn_u16(temp1_u16x8);

    temp0_u8x8 = vshr_n_u8(temp0_u8x8, 7);
    temp1_u8x8 = vshr_n_u8(temp1_u8x8, 7);

    temp0_u8x8 = vadd_u8(temp0_u8x8, temp1_u8x8);
    temp0_u8x8 = vpadd_u8(temp0_u8x8, temp1_u8x8);
    temp0_u8x8 = vpadd_u8(temp0_u8x8, temp1_u8x8);
    temp0_u8x8 = vpadd_u8(temp0_u8x8, temp1_u8x8);

    *pu1_nnz = 16 - vget_lane_u8(temp0_u8x8, 0);
}

void isvc_resi_trans_quant_chroma_4x4_with_residual_sub_neon(
    buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_out,
    buffer_container_t *ps_upsampled_res, resi_trans_quant_constants_t *ps_quant_constants,
    UWORD8 *pu1_nnz, WORD16 *pi2_dc_out, UWORD8 u1_use_upsampled_res)
{
    UWORD8 *pu1_src = (UWORD8 *) ps_src->pv_data;
    UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
    WORD16 *pi2_out = (WORD16 *) ps_out->pv_data;
    WORD16 *pi2_upsampled_res = ps_upsampled_res ? (WORD16 *) ps_upsampled_res->pv_data : NULL;
    WORD32 i4_src_stride = ps_src->i4_data_stride;
    WORD32 i4_pred_stride = ps_pred->i4_data_stride;
    WORD32 i4_out_stride = ps_out->i4_data_stride;
    WORD32 i4_upsampled_res_stride = ps_upsampled_res ? ps_upsampled_res->i4_data_stride : 0;
    const UWORD16 *pu2_scale_matrix = ps_quant_constants->pu2_scale_matrix;
    const UWORD16 *pu2_threshold_matrix = ps_quant_constants->pu2_threshold_matrix;
    UWORD32 u4_qbits = ps_quant_constants->u4_qbits;
    UWORD32 u4_round_factor = ps_quant_constants->u4_round_factor;

    uint8x8_t src0, src1, src2, src3;
    uint8x8_t pred0, pred1, pred2, pred3;
    uint8x8x2_t tmp0, tmp1, tmp2, tmp3;
    uint8x8_t temp0_u8x8, temp1_u8x8;
    uint16x4_t temp0_u16x4, temp1_u16x4, temp2_u16x4, temp3_u16x4;
    uint16x4_t scale_mat0_16x4, scale_mat1_16x4, scale_mat2_16x4, scale_mat3_16x4;
    uint16x4_t threshold0_16x4, threshold1_16x4, threshold2_16x4, threshold3_16x4;
    uint16x4_t thresholdmask0_16x4, thresholdmask1_16x4, thresholdmask2_16x4, thresholdmask3_16x4;
    int16x4_t upres0_16x4, upres1_16x4, upres2_16x4, upres3_16x4;
    int16x4_t res0_16x4, res1_16x4, res2_16x4, res3_16x4;
    int16x4_t x0_16x4, x1_16x4, x2_16x4, x3_16x4;
    int16x4_t xx0_16x4, xx1_16x4, xx2_16x4, xx3_16x4;
    int16x4x2_t xx0_16x4x2, xx1_16x4x2;
    int16x4_t temp0_16x4, temp1_16x4, temp2_16x4, temp3_16x4;
    uint16x8_t res0_16x8, res1_16x8, res2_16x8, res3_16x8;
    uint16x8_t temp0_u16x8, temp1_u16x8;
    int32x2x2_t x0_32x2x2, x1_32x2x2;
    int32x4_t tx0_32x4, tx1_32x4, tx2_32x4, tx3_32x4;

    int32x4_t rnd_factor_32x4 = vdupq_n_s32(u4_round_factor);
    int32x4_t qbits_32x4 = vdupq_n_s32(u4_qbits);
    int16x4_t zeros_16x4 = vdup_n_s16(0);
    int16x4_t pos_255_16x4 = vdup_n_s16(((WORD16) UINT8_MAX));
    int16x4_t neg_255_16x4 = vdup_n_s16(-((WORD16) UINT8_MAX));

    UNUSED(u1_use_upsampled_res);

    threshold0_16x4 = vld1_u16(pu2_threshold_matrix);
    threshold1_16x4 = vld1_u16(pu2_threshold_matrix + 4);
    threshold2_16x4 = vld1_u16(pu2_threshold_matrix + 8);
    threshold3_16x4 = vld1_u16(pu2_threshold_matrix + 12);

    scale_mat0_16x4 = vld1_u16(pu2_scale_matrix);
    scale_mat1_16x4 = vld1_u16(pu2_scale_matrix + 4);
    scale_mat2_16x4 = vld1_u16(pu2_scale_matrix + 8);
    scale_mat3_16x4 = vld1_u16(pu2_scale_matrix + 12);

    src0 = vld1_u8(&pu1_src[0 * i4_src_stride]);
    src1 = vld1_u8(&pu1_src[1 * i4_src_stride]);
    src2 = vld1_u8(&pu1_src[2 * i4_src_stride]);
    src3 = vld1_u8(&pu1_src[3 * i4_src_stride]);

    /* deinterleaving source buffer */
    tmp0 = vuzp_u8(src0, src0);
    tmp1 = vuzp_u8(src1, src1);
    tmp2 = vuzp_u8(src2, src2);
    tmp3 = vuzp_u8(src3, src3);

    src0 = tmp0.val[0];
    src1 = tmp1.val[0];
    src2 = tmp2.val[0];
    src3 = tmp3.val[0];

    pred0 = vld1_u8(&pu1_pred[0 * i4_pred_stride]);
    pred1 = vld1_u8(&pu1_pred[1 * i4_pred_stride]);
    pred2 = vld1_u8(&pu1_pred[2 * i4_pred_stride]);
    pred3 = vld1_u8(&pu1_pred[3 * i4_pred_stride]);

    /* deinterleaving pred buffer */
    tmp0 = vuzp_u8(pred0, pred0);
    tmp1 = vuzp_u8(pred1, pred1);
    tmp2 = vuzp_u8(pred2, pred2);
    tmp3 = vuzp_u8(pred3, pred3);

    pred0 = tmp0.val[0];
    pred1 = tmp1.val[0];
    pred2 = tmp2.val[0];
    pred3 = tmp3.val[0];

    /* calculate res = src - pred */
    res0_16x8 = vsubl_u8(src0, pred0);
    res1_16x8 = vsubl_u8(src1, pred1);
    res2_16x8 = vsubl_u8(src2, pred2);
    res3_16x8 = vsubl_u8(src3, pred3);

    res0_16x4 = vreinterpret_s16_u16(vget_low_u16(res0_16x8));
    res1_16x4 = vreinterpret_s16_u16(vget_low_u16(res1_16x8));
    res2_16x4 = vreinterpret_s16_u16(vget_low_u16(res2_16x8));
    res3_16x4 = vreinterpret_s16_u16(vget_low_u16(res3_16x8));

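    /* Same inter-layer residual handling as the luma *_with_residual_sub
       kernel: subtract the upsampled reference-layer residual and clamp the
       result to [-255, 255] before the forward transform. */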
    /* Load upsampled res */
    upres0_16x4 = vld1_s16(&pi2_upsampled_res[0 * i4_upsampled_res_stride]);
    upres1_16x4 = vld1_s16(&pi2_upsampled_res[1 * i4_upsampled_res_stride]);
    upres2_16x4 = vld1_s16(&pi2_upsampled_res[2 * i4_upsampled_res_stride]);
    upres3_16x4 = vld1_s16(&pi2_upsampled_res[3 * i4_upsampled_res_stride]);

    /* subtract upsampled res from (src - pred) to obtain final res */
    res0_16x4 = vsub_s16(res0_16x4, upres0_16x4);
    res1_16x4 = vsub_s16(res1_16x4, upres1_16x4);
    res2_16x4 = vsub_s16(res2_16x4, upres2_16x4);
    res3_16x4 = vsub_s16(res3_16x4, upres3_16x4);

    /* Saturate all values < -255 to -255 and retain the rest as it is */
    res0_16x4 = vmax_s16(res0_16x4, neg_255_16x4);
    res1_16x4 = vmax_s16(res1_16x4, neg_255_16x4);
    res2_16x4 = vmax_s16(res2_16x4, neg_255_16x4);
    res3_16x4 = vmax_s16(res3_16x4, neg_255_16x4);

    /* Saturate all values > 255 to 255 and retain the rest as it is */
    res0_16x4 = vmin_s16(res0_16x4, pos_255_16x4);
    res1_16x4 = vmin_s16(res1_16x4, pos_255_16x4);
    res2_16x4 = vmin_s16(res2_16x4, pos_255_16x4);
    res3_16x4 = vmin_s16(res3_16x4, pos_255_16x4);

    /* Perform Forward transform */
    /*-------------------------------------------------------------*/
    /* DCT [ Horizontal transformation ]                           */
    /*-------------------------------------------------------------*/
    /* Matrix transpose */
    /*
     *  a0 a1 a2 a3
     *  b0 b1 b2 b3
     *  c0 c1 c2 c3
     *  d0 d1 d2 d3
     */

    xx0_16x4x2 = vtrn_s16(res0_16x4, res1_16x4);
    xx1_16x4x2 = vtrn_s16(res2_16x4, res3_16x4);
    x0_32x2x2 =
        vtrn_s32(vreinterpret_s32_s16(xx0_16x4x2.val[0]), vreinterpret_s32_s16(xx1_16x4x2.val[0]));
    x1_32x2x2 =
        vtrn_s32(vreinterpret_s32_s16(xx0_16x4x2.val[1]), vreinterpret_s32_s16(xx1_16x4x2.val[1]));

    x0_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[0]);
    x1_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[0]);
    x2_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[1]);
    x3_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[1]);

    xx0_16x4 = vadd_s16(x0_16x4, x3_16x4);
    xx1_16x4 = vadd_s16(x1_16x4, x2_16x4);
    xx2_16x4 = vsub_s16(x1_16x4, x2_16x4);
    xx3_16x4 = vsub_s16(x0_16x4, x3_16x4);

    x0_16x4 = vadd_s16(xx0_16x4, xx1_16x4);
    temp0_16x4 = vshl_n_s16(xx3_16x4, 1);
    x1_16x4 = vadd_s16(xx2_16x4, temp0_16x4);

    x2_16x4 = vsub_s16(xx0_16x4, xx1_16x4);
    temp0_16x4 = vshl_n_s16(xx2_16x4, 1);
    x3_16x4 = vsub_s16(xx3_16x4, temp0_16x4);

    /* Matrix transpose */
    /*
     *  a0 b0 c0 d0
     *  a1 b1 c1 d1
     *  a2 b2 c2 d2
     *  a3 b3 c3 d3
     */

    xx0_16x4x2 = vtrn_s16(x0_16x4, x1_16x4);
    xx1_16x4x2 = vtrn_s16(x2_16x4, x3_16x4);
    x0_32x2x2 =
        vtrn_s32(vreinterpret_s32_s16(xx0_16x4x2.val[0]), vreinterpret_s32_s16(xx1_16x4x2.val[0]));
    x1_32x2x2 =
        vtrn_s32(vreinterpret_s32_s16(xx0_16x4x2.val[1]), vreinterpret_s32_s16(xx1_16x4x2.val[1]));

    x0_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[0]);
    x1_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[0]);
    x2_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[1]);
    x3_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[1]);

    /* Vertical Transformation */

    xx0_16x4 = vadd_s16(x0_16x4, x3_16x4);
    xx1_16x4 = vadd_s16(x1_16x4, x2_16x4);
    xx2_16x4 = vsub_s16(x1_16x4, x2_16x4);
    xx3_16x4 = vsub_s16(x0_16x4, x3_16x4);

    x0_16x4 = vadd_s16(xx0_16x4, xx1_16x4);
    temp0_16x4 = vshl_n_s16(xx3_16x4, 1);
    x1_16x4 = vadd_s16(temp0_16x4, xx2_16x4);

    x2_16x4 = vsub_s16(xx0_16x4, xx1_16x4);
    temp0_16x4 = vshl_n_s16(xx2_16x4, 1);
    x3_16x4 = vsub_s16(xx3_16x4, temp0_16x4);

    /* extract the DC coefficient (lane 0 of x0) */
    *pi2_dc_out = vget_lane_s16(x0_16x4, 0);

    xx0_16x4 = vabs_s16(x0_16x4);
    xx1_16x4 = vabs_s16(x1_16x4);
    xx2_16x4 = vabs_s16(x2_16x4);
    xx3_16x4 = vabs_s16(x3_16x4);

    /* compare with zero for getting sign */
    temp0_u16x4 = vcgt_s16(x0_16x4, zeros_16x4);
    temp1_u16x4 = vcgt_s16(x1_16x4, zeros_16x4);
    temp2_u16x4 = vcgt_s16(x2_16x4, zeros_16x4);
    temp3_u16x4 = vcgt_s16(x3_16x4, zeros_16x4);

    /* compare abs values against the threshold matrix */
    thresholdmask0_16x4 = vcgt_s16(vreinterpret_s16_u16(threshold0_16x4), xx0_16x4);
    thresholdmask1_16x4 = vcgt_s16(vreinterpret_s16_u16(threshold1_16x4), xx1_16x4);
    thresholdmask2_16x4 = vcgt_s16(vreinterpret_s16_u16(threshold2_16x4), xx2_16x4);
    thresholdmask3_16x4 = vcgt_s16(vreinterpret_s16_u16(threshold3_16x4), xx3_16x4);

    /* Multiply abs values obtained with scaling matrix */
    tx0_32x4 = vmull_s16(xx0_16x4, vreinterpret_s16_u16(scale_mat0_16x4));
    tx1_32x4 = vmull_s16(xx1_16x4, vreinterpret_s16_u16(scale_mat1_16x4));
    tx2_32x4 = vmull_s16(xx2_16x4, vreinterpret_s16_u16(scale_mat2_16x4));
    tx3_32x4 = vmull_s16(xx3_16x4, vreinterpret_s16_u16(scale_mat3_16x4));

    tx0_32x4 = vaddq_s32(tx0_32x4, rnd_factor_32x4);
    tx1_32x4 = vaddq_s32(tx1_32x4, rnd_factor_32x4);
    tx2_32x4 = vaddq_s32(tx2_32x4, rnd_factor_32x4);
    tx3_32x4 = vaddq_s32(tx3_32x4, rnd_factor_32x4);

    qbits_32x4 = vnegq_s32(qbits_32x4);

    tx0_32x4 = vshlq_s32(tx0_32x4, qbits_32x4);
    tx1_32x4 = vshlq_s32(tx1_32x4, qbits_32x4);
    tx2_32x4 = vshlq_s32(tx2_32x4, qbits_32x4);
    tx3_32x4 = vshlq_s32(tx3_32x4, qbits_32x4);

    /* Conversion to 16 bit signed */
    temp0_16x4 = vmovn_s32(tx0_32x4);
    temp1_16x4 = vmovn_s32(tx1_32x4);
    temp2_16x4 = vmovn_s32(tx2_32x4);
    temp3_16x4 = vmovn_s32(tx3_32x4);

    x0_16x4 = vneg_s16(temp0_16x4);
    x1_16x4 = vneg_s16(temp1_16x4);
    x2_16x4 = vneg_s16(temp2_16x4);
    x3_16x4 = vneg_s16(temp3_16x4);

    /* Restore sign */
    x0_16x4 = vbsl_s16(temp0_u16x4, temp0_16x4, x0_16x4);
    x1_16x4 = vbsl_s16(temp1_u16x4, temp1_16x4, x1_16x4);
    x2_16x4 = vbsl_s16(temp2_u16x4, temp2_16x4, x2_16x4);
    x3_16x4 = vbsl_s16(temp3_u16x4, temp3_16x4, x3_16x4);

    /* Thresholding */
    xx0_16x4 = vbsl_s16(thresholdmask0_16x4, zeros_16x4, x0_16x4);
    xx1_16x4 = vbsl_s16(thresholdmask1_16x4, zeros_16x4, x1_16x4);
    xx2_16x4 = vbsl_s16(thresholdmask2_16x4, zeros_16x4, x2_16x4);
    xx3_16x4 = vbsl_s16(thresholdmask3_16x4, zeros_16x4, x3_16x4);

    /* Store Quantized outputs */
    vst1_s16(&pi2_out[0 * i4_out_stride], xx0_16x4);
    vst1_s16(&pi2_out[1 * i4_out_stride], xx1_16x4);
    vst1_s16(&pi2_out[2 * i4_out_stride], xx2_16x4);
    vst1_s16(&pi2_out[3 * i4_out_stride], xx3_16x4);

    /* NNZ calculation */

    temp0_u16x4 = vceq_s16(xx0_16x4, zeros_16x4);
    temp1_u16x4 = vceq_s16(xx1_16x4, zeros_16x4);
    temp2_u16x4 = vceq_s16(xx2_16x4, zeros_16x4);
    temp3_u16x4 = vceq_s16(xx3_16x4, zeros_16x4);

    temp0_u16x8 = vcombine_u16(temp0_u16x4, temp2_u16x4);
    temp1_u16x8 = vcombine_u16(temp1_u16x4, temp3_u16x4);

    /* Conversion to 8 bit unsigned */
    temp0_u8x8 = vmovn_u16(temp0_u16x8);
    temp1_u8x8 = vmovn_u16(temp1_u16x8);

    temp0_u8x8 = vshr_n_u8(temp0_u8x8, 7);
    temp1_u8x8 = vshr_n_u8(temp1_u8x8, 7);

    temp0_u8x8 = vadd_u8(temp0_u8x8, temp1_u8x8);
    temp0_u8x8 = vpadd_u8(temp0_u8x8, temp1_u8x8);
    temp0_u8x8 = vpadd_u8(temp0_u8x8, temp1_u8x8);
    temp0_u8x8 = vpadd_u8(temp0_u8x8, temp1_u8x8);

    *pu1_nnz = 16 - vget_lane_u8(temp0_u8x8, 0);
}