/******************************************************************************
 *
 * Copyright (C) 2022 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 */
/**
 *******************************************************************************
 * @file
 *  isvc_iquant_itrans_recon_neon.c
 *
 * @brief
 *  NEON variants of inverse transform and quantization functions
 *
 *******************************************************************************
 */
#include <arm_neon.h>

#include "ih264_typedefs.h"
#include "ih264_debug.h"
#include "ih264_defs.h"
#include "ih264_trans_macros.h"
#include "ih264_macros.h"
#include "ih264_platform_macros.h"
#include "ih264_trans_data.h"
#include "ih264_size_defs.h"
#include "isvc_structs.h"
#include "isvc_trans_quant_itrans_iquant.h"

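/*
 * All functions in this file implement the same pipeline on a 4x4 block:
 *   1. Inverse quantization: coeff * (iscal * weigh), rounded, scaled by
 *      qp / 6 and narrowed with a saturating shift.
 *   2. 4x4 inverse core transform: horizontal butterfly, transpose,
 *      vertical butterfly, then a rounding shift right by 6.
 *   3. Clamp the residual to [-255, 255], add the prediction, and saturate
 *      the result to 8-bit pixels.
 * Variants differ in whether the residual is also written out
 * (_with_res_output), accumulated with a residual prediction
 * (_with_res_accumulate), or applied to an interleaved chroma plane
 * (_chroma_).
 */
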
void isvc_iquant_itrans_recon_4x4_neon(buffer_container_t *ps_src, buffer_container_t *ps_pred,
                                       buffer_container_t *ps_res_pred, buffer_container_t *ps_res,
                                       buffer_container_t *ps_rec,
                                       iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants,
                                       WORD16 *pi2_tmp, WORD16 *pi2_dc_src, WORD32 i4_iq_start_idx,
                                       UWORD8 u1_res_accumulate)
{
    WORD16 *pi2_src = (WORD16 *) ps_src->pv_data;
    UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
    UWORD8 *pu1_out = (UWORD8 *) ps_rec->pv_data;
    WORD32 i4_pred_stride = ps_pred->i4_data_stride;
    WORD32 i4_out_stride = ps_rec->i4_data_stride;
    const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat;
    const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat;
    UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6;

    int16x4x4_t src_16x4x2;
    int16x4x4_t iscal_16x4x2;
    int16x4x4_t weigh_16x4x2;

    int16x4_t q0_16x4, q1_16x4, q2_16x4, q3_16x4;
    int32x4_t q0_32x4, q1_32x4, q2_32x4, q3_32x4;
    int16x4_t rq1_16x4, rq3_16x4;
    int16x4_t x0_16x4, x1_16x4, x2_16x4, x3_16x4;
    int16x4_t xx0_16x4, xx1_16x4, xx2_16x4, xx3_16x4;
    int16x4x2_t xx0_16x4x2, xx1_16x4x2;
    int32x2x2_t x0_32x2x2, x1_32x2x2;
    int16x4_t weigh0_16x4, weigh1_16x4, weigh2_16x4, weigh3_16x4;

    uint8x8_t pred0_in, pred1_in, pred2_in, pred3_in;
    int16x8_t pred0, pred1, pred2, pred3;
    int16x8_t resd01_in, resd23_in;
    int16x8_t pred01_in, pred23_in;
    uint8x8_t pred01_un, pred23_un;

    int16x8_t pos_255_16x8 = vdupq_n_s16(((WORD16) UINT8_MAX));
    int16x8_t neg_255_16x8 = vdupq_n_s16(-((WORD16) UINT8_MAX));
    int32x4_t qp_div_6_32x4 = vdupq_n_s32(u4_qp_div_6);

    WORD16 rnd_factor = (u4_qp_div_6 < 4) ? 1 << (3 - u4_qp_div_6) : 0;
    int32x4_t rnd_fact = vdupq_n_s32(rnd_factor);

    UNUSED(ps_res);
    UNUSED(ps_res_pred);
    UNUSED(u1_res_accumulate);

    src_16x4x2 = vld4_s16(pi2_src);
    iscal_16x4x2 = vld4_s16((const int16_t *) pu2_iscal_mat);
    weigh_16x4x2 = vld4_s16((const int16_t *) pu2_weigh_mat);

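    /* Inverse quantization: q = ((src * iscal * weigh) + rnd_factor) << (qp / 6),
     * narrowed with a saturating shift right by 4. rnd_factor rounds the net
     * right shift that occurs while u4_qp_div_6 < 4 */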
    weigh0_16x4 = vmul_s16(weigh_16x4x2.val[0], iscal_16x4x2.val[0]);
    weigh1_16x4 = vmul_s16(weigh_16x4x2.val[1], iscal_16x4x2.val[1]);
    weigh2_16x4 = vmul_s16(weigh_16x4x2.val[2], iscal_16x4x2.val[2]);
    weigh3_16x4 = vmul_s16(weigh_16x4x2.val[3], iscal_16x4x2.val[3]);

    q0_32x4 = vmull_s16(weigh0_16x4, src_16x4x2.val[0]);
    q1_32x4 = vmull_s16(weigh1_16x4, src_16x4x2.val[1]);
    q2_32x4 = vmull_s16(weigh2_16x4, src_16x4x2.val[2]);
    q3_32x4 = vmull_s16(weigh3_16x4, src_16x4x2.val[3]);

    q0_32x4 = vaddq_s32(q0_32x4, rnd_fact);
    q1_32x4 = vaddq_s32(q1_32x4, rnd_fact);
    q2_32x4 = vaddq_s32(q2_32x4, rnd_fact);
    q3_32x4 = vaddq_s32(q3_32x4, rnd_fact);

    q0_32x4 = vshlq_s32(q0_32x4, qp_div_6_32x4);
    q1_32x4 = vshlq_s32(q1_32x4, qp_div_6_32x4);
    q2_32x4 = vshlq_s32(q2_32x4, qp_div_6_32x4);
    q3_32x4 = vshlq_s32(q3_32x4, qp_div_6_32x4);

    q0_16x4 = vqshrn_n_s32(q0_32x4, 4);
    q1_16x4 = vqshrn_n_s32(q1_32x4, 4);
    q2_16x4 = vqshrn_n_s32(q2_32x4, 4);
    q3_16x4 = vqshrn_n_s32(q3_32x4, 4);

    if(i4_iq_start_idx == 1)
    {
        q0_16x4 = vset_lane_s16(pi2_dc_src[0], q0_16x4, 0);
    }

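    /* Horizontal inverse transform: one-dimensional butterfly applied to all
     * four rows at once (vld4 above de-interleaved the block so each vector
     * holds one column) */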
    rq1_16x4 = vshr_n_s16(q1_16x4, 1);
    rq3_16x4 = vshr_n_s16(q3_16x4, 1);

    x0_16x4 = vadd_s16(q0_16x4, q2_16x4);
    x1_16x4 = vsub_s16(q0_16x4, q2_16x4);
    x2_16x4 = vsub_s16(rq1_16x4, q3_16x4);
    x3_16x4 = vadd_s16(q1_16x4, rq3_16x4);

    xx0_16x4 = vadd_s16(x0_16x4, x3_16x4);
    xx1_16x4 = vadd_s16(x1_16x4, x2_16x4);
    xx2_16x4 = vsub_s16(x1_16x4, x2_16x4);
    xx3_16x4 = vsub_s16(x0_16x4, x3_16x4);

    /* Transpose rows 0 to 3 so the vertical pass reuses the same butterfly */
    xx0_16x4x2 = vtrn_s16(xx0_16x4, xx1_16x4);
    xx1_16x4x2 = vtrn_s16(xx2_16x4, xx3_16x4);
    x0_32x2x2 =
        vzip_s32(vreinterpret_s32_s16(xx0_16x4x2.val[0]), vreinterpret_s32_s16(xx1_16x4x2.val[0]));
    x1_32x2x2 =
        vzip_s32(vreinterpret_s32_s16(xx0_16x4x2.val[1]), vreinterpret_s32_s16(xx1_16x4x2.val[1]));

    x0_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[0]);
    x1_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[0]);
    x2_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[1]);
    x3_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[1]);

    /* Store Horz transform output into temp */
    vst1_s16(pi2_tmp, x0_16x4);
    vst1_s16(pi2_tmp + 4, x1_16x4);
    vst1_s16(pi2_tmp + 8, x2_16x4);
    vst1_s16(pi2_tmp + 12, x3_16x4);

    /* vertical inverse transform */
    rq1_16x4 = vshr_n_s16(x1_16x4, 1);
    rq3_16x4 = vshr_n_s16(x3_16x4, 1);

    xx0_16x4 = vadd_s16(x0_16x4, x2_16x4);
    xx1_16x4 = vsub_s16(x0_16x4, x2_16x4);
    xx2_16x4 = vsub_s16(rq1_16x4, x3_16x4);
    xx3_16x4 = vadd_s16(x1_16x4, rq3_16x4);

    x0_16x4 = vadd_s16(xx0_16x4, xx3_16x4);
    x1_16x4 = vadd_s16(xx1_16x4, xx2_16x4);
    x2_16x4 = vsub_s16(xx1_16x4, xx2_16x4);
    x3_16x4 = vsub_s16(xx0_16x4, xx3_16x4);

    x0_16x4 = vrshr_n_s16(x0_16x4, 6);
    x1_16x4 = vrshr_n_s16(x1_16x4, 6);
    x2_16x4 = vrshr_n_s16(x2_16x4, 6);
    x3_16x4 = vrshr_n_s16(x3_16x4, 6);

    resd01_in = vcombine_s16(x0_16x4, x1_16x4);
    resd23_in = vcombine_s16(x2_16x4, x3_16x4);

    /* Saturate all values < -255 to -255 and retain the rest as it is */
    resd01_in = vmaxq_s16(resd01_in, neg_255_16x8);
    resd23_in = vmaxq_s16(resd23_in, neg_255_16x8);

    /* Saturate all values > 255 to 255 and retain the rest as it is */
    resd01_in = vminq_s16(resd01_in, pos_255_16x8);
    resd23_in = vminq_s16(resd23_in, pos_255_16x8);

    /* Load pred */
    pred0_in = vld1_u8((uint8_t *) pu1_pred);
    pred1_in = vld1_u8((uint8_t *) pu1_pred + (i4_pred_stride));
    pred2_in = vld1_u8((uint8_t *) pu1_pred + (i4_pred_stride << 1));
    pred3_in = vld1_u8((uint8_t *) pu1_pred + (i4_pred_stride * 3));

    pred0 = vreinterpretq_s16_u16(vmovl_u8(pred0_in));
    pred1 = vreinterpretq_s16_u16(vmovl_u8(pred1_in));
    pred2 = vreinterpretq_s16_u16(vmovl_u8(pred2_in));
    pred3 = vreinterpretq_s16_u16(vmovl_u8(pred3_in));

    pred01_in = vcombine_s16(vget_low_s16(pred0), vget_low_s16(pred1));
    pred23_in = vcombine_s16(vget_low_s16(pred2), vget_low_s16(pred3));

    /* Out pixel = pred + res */
    pred01_in = vaddq_s16(pred01_in, resd01_in);
    pred23_in = vaddq_s16(pred23_in, resd23_in);

    /* Convert to 8 bit unsigned with saturation */
    pred01_un = vqmovun_s16(pred01_in);
    pred23_un = vqmovun_s16(pred23_in);

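    /* Each reconstructed row is 4 pixels, stored as one 32-bit lane:
     * rows 0-1 come from pred01_un, rows 2-3 from pred23_un */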
    vst1_lane_u32((uint32_t *) (pu1_out), vreinterpret_u32_u8(pred01_un), 0);
    vst1_lane_u32((uint32_t *) (pu1_out + i4_out_stride), vreinterpret_u32_u8(pred01_un), 1);
    vst1_lane_u32((uint32_t *) (pu1_out + (i4_out_stride << 1)), vreinterpret_u32_u8(pred23_un), 0);
    vst1_lane_u32((uint32_t *) (pu1_out + ((i4_out_stride << 1) + i4_out_stride)),
                  vreinterpret_u32_u8(pred23_un), 1);
}

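/* Same as above, but also writes the clamped residual block to ps_res */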
void isvc_iquant_itrans_recon_4x4_with_res_output_neon(
    buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_res_pred,
    buffer_container_t *ps_res, buffer_container_t *ps_rec,
    iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants, WORD16 *pi2_tmp, WORD16 *pi2_dc_src,
    WORD32 i4_iq_start_idx, UWORD8 u1_res_accumulate)
{
    WORD16 *pi2_src = (WORD16 *) ps_src->pv_data;
    WORD16 *pi2_res = (WORD16 *) ps_res->pv_data;
    UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
    UWORD8 *pu1_out = (UWORD8 *) ps_rec->pv_data;
    WORD32 i4_res_stride = ps_res->i4_data_stride;
    WORD32 i4_pred_stride = ps_pred->i4_data_stride;
    WORD32 i4_out_stride = ps_rec->i4_data_stride;
    const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat;
    const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat;
    UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6;

    int16x4x4_t src_16x4x2;
    int16x4x4_t iscal_16x4x2;
    int16x4x4_t weigh_16x4x2;

    int16x4_t q0_16x4, q1_16x4, q2_16x4, q3_16x4;
    int32x4_t q0_32x4, q1_32x4, q2_32x4, q3_32x4;
    int16x4_t rq1_16x4, rq3_16x4;
    int16x4_t x0_16x4, x1_16x4, x2_16x4, x3_16x4;
    int16x4_t xx0_16x4, xx1_16x4, xx2_16x4, xx3_16x4;
    int16x4x2_t xx0_16x4x2, xx1_16x4x2;
    int32x2x2_t x0_32x2x2, x1_32x2x2;
    int16x4_t weigh0_16x4, weigh1_16x4, weigh2_16x4, weigh3_16x4;

    uint8x8_t pred0_in, pred1_in, pred2_in, pred3_in;
    int16x8_t pred0, pred1, pred2, pred3;
    int16x8_t resd01_in, resd23_in;
    int16x8_t pred01_in, pred23_in;
    uint8x8_t pred01_un, pred23_un;

    int16x4_t pos_255_16x4 = vdup_n_s16(((WORD16) UINT8_MAX));
    int16x4_t neg_255_16x4 = vdup_n_s16(-((WORD16) UINT8_MAX));
    int32x4_t qp_div_6_32x4 = vdupq_n_s32(u4_qp_div_6);

    WORD16 rnd_factor = (u4_qp_div_6 < 4) ? 1 << (3 - u4_qp_div_6) : 0;
    int32x4_t rnd_fact = vdupq_n_s32(rnd_factor);

    UNUSED(ps_res_pred);
    UNUSED(u1_res_accumulate);

    src_16x4x2 = vld4_s16(pi2_src);
    iscal_16x4x2 = vld4_s16((const int16_t *) pu2_iscal_mat);
    weigh_16x4x2 = vld4_s16((const int16_t *) pu2_weigh_mat);

    weigh0_16x4 = vmul_s16(weigh_16x4x2.val[0], iscal_16x4x2.val[0]);
    weigh1_16x4 = vmul_s16(weigh_16x4x2.val[1], iscal_16x4x2.val[1]);
    weigh2_16x4 = vmul_s16(weigh_16x4x2.val[2], iscal_16x4x2.val[2]);
    weigh3_16x4 = vmul_s16(weigh_16x4x2.val[3], iscal_16x4x2.val[3]);

    q0_32x4 = vmull_s16(weigh0_16x4, src_16x4x2.val[0]);
    q1_32x4 = vmull_s16(weigh1_16x4, src_16x4x2.val[1]);
    q2_32x4 = vmull_s16(weigh2_16x4, src_16x4x2.val[2]);
    q3_32x4 = vmull_s16(weigh3_16x4, src_16x4x2.val[3]);

    q0_32x4 = vaddq_s32(q0_32x4, rnd_fact);
    q1_32x4 = vaddq_s32(q1_32x4, rnd_fact);
    q2_32x4 = vaddq_s32(q2_32x4, rnd_fact);
    q3_32x4 = vaddq_s32(q3_32x4, rnd_fact);

    q0_32x4 = vshlq_s32(q0_32x4, qp_div_6_32x4);
    q1_32x4 = vshlq_s32(q1_32x4, qp_div_6_32x4);
    q2_32x4 = vshlq_s32(q2_32x4, qp_div_6_32x4);
    q3_32x4 = vshlq_s32(q3_32x4, qp_div_6_32x4);

    q0_16x4 = vqshrn_n_s32(q0_32x4, 4);
    q1_16x4 = vqshrn_n_s32(q1_32x4, 4);
    q2_16x4 = vqshrn_n_s32(q2_32x4, 4);
    q3_16x4 = vqshrn_n_s32(q3_32x4, 4);

    if(i4_iq_start_idx == 1)
    {
        q0_16x4 = vset_lane_s16(pi2_dc_src[0], q0_16x4, 0);
    }

    rq1_16x4 = vshr_n_s16(q1_16x4, 1);
    rq3_16x4 = vshr_n_s16(q3_16x4, 1);

    x0_16x4 = vadd_s16(q0_16x4, q2_16x4);
    x1_16x4 = vsub_s16(q0_16x4, q2_16x4);
    x2_16x4 = vsub_s16(rq1_16x4, q3_16x4);
    x3_16x4 = vadd_s16(q1_16x4, rq3_16x4);

    xx0_16x4 = vadd_s16(x0_16x4, x3_16x4);
    xx1_16x4 = vadd_s16(x1_16x4, x2_16x4);
    xx2_16x4 = vsub_s16(x1_16x4, x2_16x4);
    xx3_16x4 = vsub_s16(x0_16x4, x3_16x4);

    /* Transpose rows 0 to 3 so the vertical pass reuses the same butterfly */
    xx0_16x4x2 = vtrn_s16(xx0_16x4, xx1_16x4);
    xx1_16x4x2 = vtrn_s16(xx2_16x4, xx3_16x4);
    x0_32x2x2 =
        vzip_s32(vreinterpret_s32_s16(xx0_16x4x2.val[0]), vreinterpret_s32_s16(xx1_16x4x2.val[0]));
    x1_32x2x2 =
        vzip_s32(vreinterpret_s32_s16(xx0_16x4x2.val[1]), vreinterpret_s32_s16(xx1_16x4x2.val[1]));

    x0_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[0]);
    x1_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[0]);
    x2_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[1]);
    x3_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[1]);

    /* Store Horz transform output into temp */
    vst1_s16(pi2_tmp, x0_16x4);
    vst1_s16(pi2_tmp + 4, x1_16x4);
    vst1_s16(pi2_tmp + 8, x2_16x4);
    vst1_s16(pi2_tmp + 12, x3_16x4);

    /* vertical inverse transform */
    rq1_16x4 = vshr_n_s16(x1_16x4, 1);
    rq3_16x4 = vshr_n_s16(x3_16x4, 1);

    xx0_16x4 = vadd_s16(x0_16x4, x2_16x4);
    xx1_16x4 = vsub_s16(x0_16x4, x2_16x4);
    xx2_16x4 = vsub_s16(rq1_16x4, x3_16x4);
    xx3_16x4 = vadd_s16(x1_16x4, rq3_16x4);

    x0_16x4 = vadd_s16(xx0_16x4, xx3_16x4);
    x1_16x4 = vadd_s16(xx1_16x4, xx2_16x4);
    x2_16x4 = vsub_s16(xx1_16x4, xx2_16x4);
    x3_16x4 = vsub_s16(xx0_16x4, xx3_16x4);

    x0_16x4 = vrshr_n_s16(x0_16x4, 6);
    x1_16x4 = vrshr_n_s16(x1_16x4, 6);
    x2_16x4 = vrshr_n_s16(x2_16x4, 6);
    x3_16x4 = vrshr_n_s16(x3_16x4, 6);

    /* Saturate all values < -255 to -255 and retain the rest as it is */
    x0_16x4 = vmax_s16(x0_16x4, neg_255_16x4);
    x1_16x4 = vmax_s16(x1_16x4, neg_255_16x4);
    x2_16x4 = vmax_s16(x2_16x4, neg_255_16x4);
    x3_16x4 = vmax_s16(x3_16x4, neg_255_16x4);

    /* Saturate all values > 255 to 255 and retain the rest as it is */
    x0_16x4 = vmin_s16(x0_16x4, pos_255_16x4);
    x1_16x4 = vmin_s16(x1_16x4, pos_255_16x4);
    x2_16x4 = vmin_s16(x2_16x4, pos_255_16x4);
    x3_16x4 = vmin_s16(x3_16x4, pos_255_16x4);

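    /* Unlike the plain variant, write the clamped residual rows to ps_res */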
    vst1_s16(pi2_res, x0_16x4);
    vst1_s16(pi2_res + i4_res_stride, x1_16x4);
    vst1_s16(pi2_res + (i4_res_stride << 1), x2_16x4);
    vst1_s16(pi2_res + (i4_res_stride << 1) + i4_res_stride, x3_16x4);

    resd01_in = vcombine_s16(x0_16x4, x1_16x4);
    resd23_in = vcombine_s16(x2_16x4, x3_16x4);

    /* Load pred */
    pred0_in = vld1_u8((uint8_t *) pu1_pred);
    pred1_in = vld1_u8((uint8_t *) pu1_pred + (i4_pred_stride));
    pred2_in = vld1_u8((uint8_t *) pu1_pred + (i4_pred_stride << 1));
    pred3_in = vld1_u8((uint8_t *) pu1_pred + (i4_pred_stride * 3));

    pred0 = vreinterpretq_s16_u16(vmovl_u8(pred0_in));
    pred1 = vreinterpretq_s16_u16(vmovl_u8(pred1_in));
    pred2 = vreinterpretq_s16_u16(vmovl_u8(pred2_in));
    pred3 = vreinterpretq_s16_u16(vmovl_u8(pred3_in));

    pred01_in = vcombine_s16(vget_low_s16(pred0), vget_low_s16(pred1));
    pred23_in = vcombine_s16(vget_low_s16(pred2), vget_low_s16(pred3));

    /* Out pixel = pred + res */
    pred01_in = vaddq_s16(pred01_in, resd01_in);
    pred23_in = vaddq_s16(pred23_in, resd23_in);

    /* Convert to 8 bit unsigned with saturation */
    pred01_un = vqmovun_s16(pred01_in);
    pred23_un = vqmovun_s16(pred23_in);

    vst1_lane_u32((uint32_t *) (pu1_out), vreinterpret_u32_u8(pred01_un), 0);
    vst1_lane_u32((uint32_t *) (pu1_out + i4_out_stride), vreinterpret_u32_u8(pred01_un), 1);
    vst1_lane_u32((uint32_t *) (pu1_out + (i4_out_stride << 1)), vreinterpret_u32_u8(pred23_un), 0);
    vst1_lane_u32((uint32_t *) (pu1_out + ((i4_out_stride << 1) + i4_out_stride)),
                  vreinterpret_u32_u8(pred23_un), 1);
}

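/* Variant that first adds the residual prediction from ps_res_pred, clamps,
 * writes the accumulated residual to ps_res, and then reconstructs */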
void isvc_iquant_itrans_recon_4x4_with_res_accumulate_neon(
    buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_res_pred,
    buffer_container_t *ps_res, buffer_container_t *ps_rec,
    iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants, WORD16 *pi2_tmp, WORD16 *pi2_dc_src,
    WORD32 i4_iq_start_idx, UWORD8 u1_res_accumulate)
{
    WORD16 *pi2_src = (WORD16 *) ps_src->pv_data;
    WORD16 *pi2_res = (WORD16 *) ps_res->pv_data;
    WORD16 *pi2_res_pred = (WORD16 *) ps_res_pred->pv_data;
    UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
    UWORD8 *pu1_out = (UWORD8 *) ps_rec->pv_data;
    WORD32 i4_res_stride = ps_res->i4_data_stride;
    WORD32 i4_res_pred_stride = ps_res_pred->i4_data_stride;
    WORD32 i4_pred_stride = ps_pred->i4_data_stride;
    WORD32 i4_out_stride = ps_rec->i4_data_stride;
    const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat;
    const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat;
    UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6;

    int16x4x4_t src_16x4x2;
    int16x4x4_t iscal_16x4x2;
    int16x4x4_t weigh_16x4x2;

    int16x4_t q0_16x4, q1_16x4, q2_16x4, q3_16x4;
    int32x4_t q0_32x4, q1_32x4, q2_32x4, q3_32x4;
    int16x4_t rq1_16x4, rq3_16x4;
    int16x4_t x0_16x4, x1_16x4, x2_16x4, x3_16x4;
    int16x4_t xx0_16x4, xx1_16x4, xx2_16x4, xx3_16x4;
    int16x4x2_t xx0_16x4x2, xx1_16x4x2;
    int32x2x2_t x0_32x2x2, x1_32x2x2;
    int16x4_t weigh0_16x4, weigh1_16x4, weigh2_16x4, weigh3_16x4;

    uint8x8_t pred0_in, pred1_in, pred2_in, pred3_in;
    int16x8_t pred0, pred1, pred2, pred3;
    int16x4_t resd0_in, resd1_in, resd2_in, resd3_in;
    int16x8_t resd01_in, resd23_in;
    int16x8_t pred01_in, pred23_in;
    uint8x8_t pred01_un, pred23_un;

    int32x4_t qp_div_6_32x4 = vdupq_n_s32(u4_qp_div_6);

    WORD16 rnd_factor = (u4_qp_div_6 < 4) ? 1 << (3 - u4_qp_div_6) : 0;
    int32x4_t rnd_fact = vdupq_n_s32(rnd_factor);
    int16x4_t pos_255 = vdup_n_s16(((WORD16) UINT8_MAX));
    int16x4_t neg_255 = vdup_n_s16(-((WORD16) UINT8_MAX));

    UNUSED(u1_res_accumulate);

    src_16x4x2 = vld4_s16(pi2_src);
    iscal_16x4x2 = vld4_s16((const int16_t *) pu2_iscal_mat);
    weigh_16x4x2 = vld4_s16((const int16_t *) pu2_weigh_mat);

    weigh0_16x4 = vmul_s16(weigh_16x4x2.val[0], iscal_16x4x2.val[0]);
    weigh1_16x4 = vmul_s16(weigh_16x4x2.val[1], iscal_16x4x2.val[1]);
    weigh2_16x4 = vmul_s16(weigh_16x4x2.val[2], iscal_16x4x2.val[2]);
    weigh3_16x4 = vmul_s16(weigh_16x4x2.val[3], iscal_16x4x2.val[3]);

    q0_32x4 = vmull_s16(weigh0_16x4, src_16x4x2.val[0]);
    q1_32x4 = vmull_s16(weigh1_16x4, src_16x4x2.val[1]);
    q2_32x4 = vmull_s16(weigh2_16x4, src_16x4x2.val[2]);
    q3_32x4 = vmull_s16(weigh3_16x4, src_16x4x2.val[3]);

    q0_32x4 = vaddq_s32(q0_32x4, rnd_fact);
    q1_32x4 = vaddq_s32(q1_32x4, rnd_fact);
    q2_32x4 = vaddq_s32(q2_32x4, rnd_fact);
    q3_32x4 = vaddq_s32(q3_32x4, rnd_fact);

    q0_32x4 = vshlq_s32(q0_32x4, qp_div_6_32x4);
    q1_32x4 = vshlq_s32(q1_32x4, qp_div_6_32x4);
    q2_32x4 = vshlq_s32(q2_32x4, qp_div_6_32x4);
    q3_32x4 = vshlq_s32(q3_32x4, qp_div_6_32x4);

    q0_16x4 = vqshrn_n_s32(q0_32x4, 4);
    q1_16x4 = vqshrn_n_s32(q1_32x4, 4);
    q2_16x4 = vqshrn_n_s32(q2_32x4, 4);
    q3_16x4 = vqshrn_n_s32(q3_32x4, 4);

    if(i4_iq_start_idx == 1)
    {
        q0_16x4 = vset_lane_s16(pi2_dc_src[0], q0_16x4, 0);
    }

    rq1_16x4 = vshr_n_s16(q1_16x4, 1);
    rq3_16x4 = vshr_n_s16(q3_16x4, 1);

    x0_16x4 = vadd_s16(q0_16x4, q2_16x4);
    x1_16x4 = vsub_s16(q0_16x4, q2_16x4);
    x2_16x4 = vsub_s16(rq1_16x4, q3_16x4);
    x3_16x4 = vadd_s16(q1_16x4, rq3_16x4);

    xx0_16x4 = vadd_s16(x0_16x4, x3_16x4);
    xx1_16x4 = vadd_s16(x1_16x4, x2_16x4);
    xx2_16x4 = vsub_s16(x1_16x4, x2_16x4);
    xx3_16x4 = vsub_s16(x0_16x4, x3_16x4);

    /* Transpose rows 0 to 3 so the vertical pass reuses the same butterfly */
    xx0_16x4x2 = vtrn_s16(xx0_16x4, xx1_16x4);
    xx1_16x4x2 = vtrn_s16(xx2_16x4, xx3_16x4);
    x0_32x2x2 =
        vzip_s32(vreinterpret_s32_s16(xx0_16x4x2.val[0]), vreinterpret_s32_s16(xx1_16x4x2.val[0]));
    x1_32x2x2 =
        vzip_s32(vreinterpret_s32_s16(xx0_16x4x2.val[1]), vreinterpret_s32_s16(xx1_16x4x2.val[1]));

    x0_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[0]);
    x1_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[0]);
    x2_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[1]);
    x3_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[1]);

    /* Store Horz transform output into temp */
    vst1_s16(pi2_tmp, x0_16x4);
    vst1_s16(pi2_tmp + 4, x1_16x4);
    vst1_s16(pi2_tmp + 8, x2_16x4);
    vst1_s16(pi2_tmp + 12, x3_16x4);

    /* vertical inverse transform */
    rq1_16x4 = vshr_n_s16(x1_16x4, 1);
    rq3_16x4 = vshr_n_s16(x3_16x4, 1);

    xx0_16x4 = vadd_s16(x0_16x4, x2_16x4);
    xx1_16x4 = vsub_s16(x0_16x4, x2_16x4);
    xx2_16x4 = vsub_s16(rq1_16x4, x3_16x4);
    xx3_16x4 = vadd_s16(x1_16x4, rq3_16x4);

    x0_16x4 = vadd_s16(xx0_16x4, xx3_16x4);
    x1_16x4 = vadd_s16(xx1_16x4, xx2_16x4);
    x2_16x4 = vsub_s16(xx1_16x4, xx2_16x4);
    x3_16x4 = vsub_s16(xx0_16x4, xx3_16x4);

    x0_16x4 = vrshr_n_s16(x0_16x4, 6);
    x1_16x4 = vrshr_n_s16(x1_16x4, 6);
    x2_16x4 = vrshr_n_s16(x2_16x4, 6);
    x3_16x4 = vrshr_n_s16(x3_16x4, 6);

    /* Accumulate: res = clip3(-255, 255, res_pred + res) */

    /* Load Res pred */
    resd0_in = vld1_s16((int16_t *) pi2_res_pred);
    resd1_in = vld1_s16((int16_t *) pi2_res_pred + i4_res_pred_stride);
    resd2_in = vld1_s16((int16_t *) pi2_res_pred + (i4_res_pred_stride * 2));
    resd3_in = vld1_s16((int16_t *) pi2_res_pred + (i4_res_pred_stride * 3));

    /* Add res pred to the res obtained */
    resd0_in = vadd_s16(resd0_in, x0_16x4);
    resd1_in = vadd_s16(resd1_in, x1_16x4);
    resd2_in = vadd_s16(resd2_in, x2_16x4);
    resd3_in = vadd_s16(resd3_in, x3_16x4);

    /* Saturate all values < -255 to -255 and retain the rest as it is */
    resd0_in = vmax_s16(resd0_in, neg_255);
    resd1_in = vmax_s16(resd1_in, neg_255);
    resd2_in = vmax_s16(resd2_in, neg_255);
    resd3_in = vmax_s16(resd3_in, neg_255);

    /* Saturate all values > 255 to 255 and retain the rest as it is */
    resd0_in = vmin_s16(resd0_in, pos_255);
    resd1_in = vmin_s16(resd1_in, pos_255);
    resd2_in = vmin_s16(resd2_in, pos_255);
    resd3_in = vmin_s16(resd3_in, pos_255);

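    /* Write back the accumulated, clamped residual rows */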
    vst1_s16(pi2_res, resd0_in);
    vst1_s16(pi2_res + i4_res_stride, resd1_in);
    vst1_s16(pi2_res + (i4_res_stride << 1), resd2_in);
    vst1_s16(pi2_res + (i4_res_stride << 1) + i4_res_stride, resd3_in);

    resd01_in = vcombine_s16(resd0_in, resd1_in);
    resd23_in = vcombine_s16(resd2_in, resd3_in);

    /* Load pred */
    pred0_in = vld1_u8((uint8_t *) pu1_pred);
    pred1_in = vld1_u8((uint8_t *) pu1_pred + (i4_pred_stride));
    pred2_in = vld1_u8((uint8_t *) pu1_pred + (i4_pred_stride << 1));
    pred3_in = vld1_u8((uint8_t *) pu1_pred + (i4_pred_stride * 3));

    pred0 = vreinterpretq_s16_u16(vmovl_u8(pred0_in));
    pred1 = vreinterpretq_s16_u16(vmovl_u8(pred1_in));
    pred2 = vreinterpretq_s16_u16(vmovl_u8(pred2_in));
    pred3 = vreinterpretq_s16_u16(vmovl_u8(pred3_in));

    pred01_in = vcombine_s16(vget_low_s16(pred0), vget_low_s16(pred1));
    pred23_in = vcombine_s16(vget_low_s16(pred2), vget_low_s16(pred3));

    /* Out pixel = pred + res */
    pred01_in = vaddq_s16(pred01_in, resd01_in);
    pred23_in = vaddq_s16(pred23_in, resd23_in);

    /* Convert to 8 bit unsigned with saturation */
    pred01_un = vqmovun_s16(pred01_in);
    pred23_un = vqmovun_s16(pred23_in);

    vst1_lane_u32((uint32_t *) (pu1_out), vreinterpret_u32_u8(pred01_un), 0);
    vst1_lane_u32((uint32_t *) (pu1_out + i4_out_stride), vreinterpret_u32_u8(pred01_un), 1);
    vst1_lane_u32((uint32_t *) (pu1_out + (i4_out_stride << 1)), vreinterpret_u32_u8(pred23_un), 0);
    vst1_lane_u32((uint32_t *) (pu1_out + ((i4_out_stride << 1) + i4_out_stride)),
                  vreinterpret_u32_u8(pred23_un), 1);
}

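/* Chroma variant: the block belongs to one plane of an interleaved chroma
 * buffer, so loads and stores touch alternate elements only, and the DC
 * coefficient always comes from pi2_dc_src */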
void isvc_iquant_itrans_recon_chroma_4x4_neon(
    buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_res_pred,
    buffer_container_t *ps_res, buffer_container_t *ps_rec,
    iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants, WORD16 *pi2_tmp, WORD16 *pi2_dc_src,
    WORD32 i4_iq_start_idx, UWORD8 u1_res_accumulate)
{
    WORD16 *pi2_src = (WORD16 *) ps_src->pv_data;
    UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
    UWORD8 *pu1_out = (UWORD8 *) ps_rec->pv_data;
    WORD32 i4_pred_stride = ps_pred->i4_data_stride;
    WORD32 i4_out_stride = ps_rec->i4_data_stride;
    const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat;
    const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat;
    UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6;

    WORD16 i2_rnd_factor = (u4_qp_div_6 < 4) ? 1 << (3 - u4_qp_div_6) : 0;

    int16x4x4_t src_16x4x2;
    int16x4x4_t iscal_16x4x2;
    int16x4x4_t weigh_16x4x2;

    int16x4_t q0_16x4, q1_16x4, q2_16x4, q3_16x4;
    int32x4_t q0_32x4, q1_32x4, q2_32x4, q3_32x4;
    int16x4_t rq1_16x4, rq3_16x4;
    int16x4_t x0_16x4, x1_16x4, x2_16x4, x3_16x4;
    int16x8_t x0_16x8, x1_16x8, x2_16x8, x3_16x8;
    int16x4_t xx0_16x4, xx1_16x4, xx2_16x4, xx3_16x4;
    int16x4x2_t xx0_16x4x2, xx1_16x4x2;
    int32x2x2_t x0_32x2x2, x1_32x2x2;
    int16x4_t weigh0_16x4, weigh1_16x4, weigh2_16x4, weigh3_16x4;

    uint8x8_t pred0_in, pred1_in, pred2_in, pred3_in;
    int16x8_t pred0, pred1, pred2, pred3;
    int16x8_t rec0, rec1, rec2, rec3;
    uint8x8_t rec0_un, rec1_un, rec2_un, rec3_un;
    uint8x8_t out0, out1, out2, out3;

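    /* Mask selecting every other byte: only this chroma plane's samples are
     * replaced, the interleaved plane is preserved */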
    uint8x8_t chroma_mask_8x8 = vreinterpret_u8_u16(vdup_n_u16(0x00ff));

    int16x4_t pos_255_16x4 = vdup_n_s16(((WORD16) UINT8_MAX));
    int16x4_t neg_255_16x4 = vdup_n_s16(-((WORD16) UINT8_MAX));
    int32x4_t qp_div_6_32x4 = vdupq_n_s32(u4_qp_div_6);
    int32x4_t rnd_fact = vdupq_n_s32(i2_rnd_factor);

    UNUSED(i4_iq_start_idx);
    UNUSED(ps_res);
    UNUSED(ps_res_pred);
    UNUSED(u1_res_accumulate);

    src_16x4x2 = vld4_s16(pi2_src);
    iscal_16x4x2 = vld4_s16((const int16_t *) pu2_iscal_mat);
    weigh_16x4x2 = vld4_s16((const int16_t *) pu2_weigh_mat);

    weigh0_16x4 = vmul_s16(weigh_16x4x2.val[0], iscal_16x4x2.val[0]);
    weigh1_16x4 = vmul_s16(weigh_16x4x2.val[1], iscal_16x4x2.val[1]);
    weigh2_16x4 = vmul_s16(weigh_16x4x2.val[2], iscal_16x4x2.val[2]);
    weigh3_16x4 = vmul_s16(weigh_16x4x2.val[3], iscal_16x4x2.val[3]);

    q0_32x4 = vmull_s16(weigh0_16x4, src_16x4x2.val[0]);
    q1_32x4 = vmull_s16(weigh1_16x4, src_16x4x2.val[1]);
    q2_32x4 = vmull_s16(weigh2_16x4, src_16x4x2.val[2]);
    q3_32x4 = vmull_s16(weigh3_16x4, src_16x4x2.val[3]);

    q0_32x4 = vaddq_s32(q0_32x4, rnd_fact);
    q1_32x4 = vaddq_s32(q1_32x4, rnd_fact);
    q2_32x4 = vaddq_s32(q2_32x4, rnd_fact);
    q3_32x4 = vaddq_s32(q3_32x4, rnd_fact);

    q0_32x4 = vshlq_s32(q0_32x4, qp_div_6_32x4);
    q1_32x4 = vshlq_s32(q1_32x4, qp_div_6_32x4);
    q2_32x4 = vshlq_s32(q2_32x4, qp_div_6_32x4);
    q3_32x4 = vshlq_s32(q3_32x4, qp_div_6_32x4);

    q0_16x4 = vqshrn_n_s32(q0_32x4, 4);
    q1_16x4 = vqshrn_n_s32(q1_32x4, 4);
    q2_16x4 = vqshrn_n_s32(q2_32x4, 4);
    q3_16x4 = vqshrn_n_s32(q3_32x4, 4);

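    /* The DC coefficient is handled separately for chroma; lane 0 is always
     * taken from pi2_dc_src */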
    q0_16x4 = vset_lane_s16(pi2_dc_src[0], q0_16x4, 0);

    rq1_16x4 = vshr_n_s16(q1_16x4, 1);
    rq3_16x4 = vshr_n_s16(q3_16x4, 1);

    x0_16x4 = vadd_s16(q0_16x4, q2_16x4);
    x1_16x4 = vsub_s16(q0_16x4, q2_16x4);
    x2_16x4 = vsub_s16(rq1_16x4, q3_16x4);
    x3_16x4 = vadd_s16(q1_16x4, rq3_16x4);

    xx0_16x4 = vadd_s16(x0_16x4, x3_16x4);
    xx1_16x4 = vadd_s16(x1_16x4, x2_16x4);
    xx2_16x4 = vsub_s16(x1_16x4, x2_16x4);
    xx3_16x4 = vsub_s16(x0_16x4, x3_16x4);

    /* Transpose rows 0 to 3 so the vertical pass reuses the same butterfly */
    xx0_16x4x2 = vtrn_s16(xx0_16x4, xx1_16x4);
    xx1_16x4x2 = vtrn_s16(xx2_16x4, xx3_16x4);
    x0_32x2x2 =
        vzip_s32(vreinterpret_s32_s16(xx0_16x4x2.val[0]), vreinterpret_s32_s16(xx1_16x4x2.val[0]));
    x1_32x2x2 =
        vzip_s32(vreinterpret_s32_s16(xx0_16x4x2.val[1]), vreinterpret_s32_s16(xx1_16x4x2.val[1]));

    x0_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[0]);
    x1_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[0]);
    x2_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[1]);
    x3_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[1]);

    /* Store Horz transform output into temp */
    vst1_s16(pi2_tmp, x0_16x4);
    vst1_s16(pi2_tmp + 4, x1_16x4);
    vst1_s16(pi2_tmp + 8, x2_16x4);
    vst1_s16(pi2_tmp + 12, x3_16x4);

    /* vertical inverse transform */
    rq1_16x4 = vshr_n_s16(x1_16x4, 1);
    rq3_16x4 = vshr_n_s16(x3_16x4, 1);

    xx0_16x4 = vadd_s16(x0_16x4, x2_16x4);
    xx1_16x4 = vsub_s16(x0_16x4, x2_16x4);
    xx2_16x4 = vsub_s16(rq1_16x4, x3_16x4);
    xx3_16x4 = vadd_s16(x1_16x4, rq3_16x4);

    x0_16x4 = vadd_s16(xx0_16x4, xx3_16x4);
    x1_16x4 = vadd_s16(xx1_16x4, xx2_16x4);
    x2_16x4 = vsub_s16(xx1_16x4, xx2_16x4);
    x3_16x4 = vsub_s16(xx0_16x4, xx3_16x4);

    x0_16x4 = vrshr_n_s16(x0_16x4, 6);
    x1_16x4 = vrshr_n_s16(x1_16x4, 6);
    x2_16x4 = vrshr_n_s16(x2_16x4, 6);
    x3_16x4 = vrshr_n_s16(x3_16x4, 6);

    /* Saturate all values < -255 to -255 and retain the rest as it is */
    x0_16x4 = vmax_s16(x0_16x4, neg_255_16x4);
    x1_16x4 = vmax_s16(x1_16x4, neg_255_16x4);
    x2_16x4 = vmax_s16(x2_16x4, neg_255_16x4);
    x3_16x4 = vmax_s16(x3_16x4, neg_255_16x4);

    /* Saturate all values > 255 to 255 and retain the rest as it is */
    x0_16x4 = vmin_s16(x0_16x4, pos_255_16x4);
    x1_16x4 = vmin_s16(x1_16x4, pos_255_16x4);
    x2_16x4 = vmin_s16(x2_16x4, pos_255_16x4);
    x3_16x4 = vmin_s16(x3_16x4, pos_255_16x4);

    x0_16x8 = vreinterpretq_s16_s32(vmovl_s16(x0_16x4));
    x1_16x8 = vreinterpretq_s16_s32(vmovl_s16(x1_16x4));
    x2_16x8 = vreinterpretq_s16_s32(vmovl_s16(x2_16x4));
    x3_16x8 = vreinterpretq_s16_s32(vmovl_s16(x3_16x4));

    pred0_in = vld1_u8((uint8_t *) pu1_pred);
    pred1_in = vld1_u8((uint8_t *) pu1_pred + (i4_pred_stride));
    pred2_in = vld1_u8((uint8_t *) pu1_pred + (i4_pred_stride << 1));
    pred3_in = vld1_u8((uint8_t *) pu1_pred + (i4_pred_stride * 3));

    pred0 = vreinterpretq_s16_u16(vmovl_u8(pred0_in));
    pred1 = vreinterpretq_s16_u16(vmovl_u8(pred1_in));
    pred2 = vreinterpretq_s16_u16(vmovl_u8(pred2_in));
    pred3 = vreinterpretq_s16_u16(vmovl_u8(pred3_in));

    /* Out pixel = pred + res */
    rec0 = vaddq_s16(pred0, x0_16x8);
    rec1 = vaddq_s16(pred1, x1_16x8);
    rec2 = vaddq_s16(pred2, x2_16x8);
    rec3 = vaddq_s16(pred3, x3_16x8);

    out0 = vld1_u8(pu1_out);
    out1 = vld1_u8(pu1_out + i4_out_stride);
    out2 = vld1_u8(pu1_out + i4_out_stride * 2);
    out3 = vld1_u8(pu1_out + i4_out_stride * 3);

    /* Convert to 8 bit unsigned with saturation */
    rec0_un = vqmovun_s16(rec0);
    rec1_un = vqmovun_s16(rec1);
    rec2_un = vqmovun_s16(rec2);
    rec3_un = vqmovun_s16(rec3);

    /* Store in alternate positions */
    out0 = vbsl_u8(chroma_mask_8x8, rec0_un, out0);
    out1 = vbsl_u8(chroma_mask_8x8, rec1_un, out1);
    out2 = vbsl_u8(chroma_mask_8x8, rec2_un, out2);
    out3 = vbsl_u8(chroma_mask_8x8, rec3_un, out3);

    vst1_u8((pu1_out), out0);
    vst1_u8((pu1_out + i4_out_stride), out1);
    vst1_u8((pu1_out + (i4_out_stride << 1)), out2);
    vst1_u8((pu1_out + ((i4_out_stride << 1) + i4_out_stride)), out3);
}

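/* Chroma variant that also writes the residual into alternate 16-bit lanes
 * of ps_res */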
void isvc_iquant_itrans_recon_chroma_4x4_with_res_output_neon(
    buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_res_pred,
    buffer_container_t *ps_res, buffer_container_t *ps_rec,
    iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants, WORD16 *pi2_tmp, WORD16 *pi2_dc_src,
    WORD32 i4_iq_start_idx, UWORD8 u1_res_accumulate)
{
    WORD16 *pi2_src = (WORD16 *) ps_src->pv_data;
    WORD16 *pi2_res = (WORD16 *) ps_res->pv_data;
    UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
    UWORD8 *pu1_out = (UWORD8 *) ps_rec->pv_data;
    WORD32 i4_res_stride = ps_res->i4_data_stride;
    WORD32 i4_pred_stride = ps_pred->i4_data_stride;
    WORD32 i4_out_stride = ps_rec->i4_data_stride;
    const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat;
    const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat;
    UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6;

    WORD16 i2_rnd_factor = (u4_qp_div_6 < 4) ? 1 << (3 - u4_qp_div_6) : 0;

    int16x4x4_t src_16x4x2;
    int16x4x4_t iscal_16x4x2;
    int16x4x4_t weigh_16x4x2;

    int16x4_t q0_16x4, q1_16x4, q2_16x4, q3_16x4;
    int32x4_t q0_32x4, q1_32x4, q2_32x4, q3_32x4;
    int16x4_t rq1_16x4, rq3_16x4;
    int16x4_t x0_16x4, x1_16x4, x2_16x4, x3_16x4;
    int16x8_t x0_16x8, x1_16x8, x2_16x8, x3_16x8;
    int16x4_t xx0_16x4, xx1_16x4, xx2_16x4, xx3_16x4;
    int16x4x2_t xx0_16x4x2, xx1_16x4x2;
    int32x2x2_t x0_32x2x2, x1_32x2x2;
    int16x4_t weigh0_16x4, weigh1_16x4, weigh2_16x4, weigh3_16x4;

    uint8x8_t pred0_in, pred1_in, pred2_in, pred3_in;
    int16x8_t pred0, pred1, pred2, pred3;
    int16x8_t rec0, rec1, rec2, rec3;
    uint8x8_t rec0_un, rec1_un, rec2_un, rec3_un;
    uint8x8_t out0, out1, out2, out3;
    int16x8_t resout0, resout1, resout2, resout3;

    uint8x8_t chroma_mask_8x8 = vreinterpret_u8_u16(vdup_n_u16(0x00ff));
    uint16x8_t chroma_mask_16x8 = {0xffff, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000};
    int32x4_t qp_div_6_32x4 = vdupq_n_s32(u4_qp_div_6);
    int32x4_t rnd_fact = vdupq_n_s32(i2_rnd_factor);
    int16x4_t pos_255_16x4 = vdup_n_s16(((WORD16) UINT8_MAX));
    int16x4_t neg_255_16x4 = vdup_n_s16(-((WORD16) UINT8_MAX));

    UNUSED(i4_iq_start_idx);
    UNUSED(ps_res_pred);
    UNUSED(u1_res_accumulate);

    src_16x4x2 = vld4_s16(pi2_src);
    iscal_16x4x2 = vld4_s16((const int16_t *) pu2_iscal_mat);
    weigh_16x4x2 = vld4_s16((const int16_t *) pu2_weigh_mat);

    weigh0_16x4 = vmul_s16(weigh_16x4x2.val[0], iscal_16x4x2.val[0]);
    weigh1_16x4 = vmul_s16(weigh_16x4x2.val[1], iscal_16x4x2.val[1]);
    weigh2_16x4 = vmul_s16(weigh_16x4x2.val[2], iscal_16x4x2.val[2]);
    weigh3_16x4 = vmul_s16(weigh_16x4x2.val[3], iscal_16x4x2.val[3]);

    q0_32x4 = vmull_s16(weigh0_16x4, src_16x4x2.val[0]);
    q1_32x4 = vmull_s16(weigh1_16x4, src_16x4x2.val[1]);
    q2_32x4 = vmull_s16(weigh2_16x4, src_16x4x2.val[2]);
    q3_32x4 = vmull_s16(weigh3_16x4, src_16x4x2.val[3]);

    q0_32x4 = vaddq_s32(q0_32x4, rnd_fact);
    q1_32x4 = vaddq_s32(q1_32x4, rnd_fact);
    q2_32x4 = vaddq_s32(q2_32x4, rnd_fact);
    q3_32x4 = vaddq_s32(q3_32x4, rnd_fact);

    q0_32x4 = vshlq_s32(q0_32x4, qp_div_6_32x4);
    q1_32x4 = vshlq_s32(q1_32x4, qp_div_6_32x4);
    q2_32x4 = vshlq_s32(q2_32x4, qp_div_6_32x4);
    q3_32x4 = vshlq_s32(q3_32x4, qp_div_6_32x4);

    q0_16x4 = vqshrn_n_s32(q0_32x4, 4);
    q1_16x4 = vqshrn_n_s32(q1_32x4, 4);
    q2_16x4 = vqshrn_n_s32(q2_32x4, 4);
    q3_16x4 = vqshrn_n_s32(q3_32x4, 4);

    q0_16x4 = vset_lane_s16(pi2_dc_src[0], q0_16x4, 0);

    rq1_16x4 = vshr_n_s16(q1_16x4, 1);
    rq3_16x4 = vshr_n_s16(q3_16x4, 1);

    x0_16x4 = vadd_s16(q0_16x4, q2_16x4);
    x1_16x4 = vsub_s16(q0_16x4, q2_16x4);
    x2_16x4 = vsub_s16(rq1_16x4, q3_16x4);
    x3_16x4 = vadd_s16(q1_16x4, rq3_16x4);

    xx0_16x4 = vadd_s16(x0_16x4, x3_16x4);
    xx1_16x4 = vadd_s16(x1_16x4, x2_16x4);
    xx2_16x4 = vsub_s16(x1_16x4, x2_16x4);
    xx3_16x4 = vsub_s16(x0_16x4, x3_16x4);

    /* Transpose rows 0 to 3 so the vertical pass reuses the same butterfly */
    xx0_16x4x2 = vtrn_s16(xx0_16x4, xx1_16x4);
    xx1_16x4x2 = vtrn_s16(xx2_16x4, xx3_16x4);
    x0_32x2x2 =
        vzip_s32(vreinterpret_s32_s16(xx0_16x4x2.val[0]), vreinterpret_s32_s16(xx1_16x4x2.val[0]));
    x1_32x2x2 =
        vzip_s32(vreinterpret_s32_s16(xx0_16x4x2.val[1]), vreinterpret_s32_s16(xx1_16x4x2.val[1]));

    x0_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[0]);
    x1_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[0]);
    x2_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[1]);
    x3_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[1]);

    /* Store Horz transform output into temp */
    vst1_s16(pi2_tmp, x0_16x4);
    vst1_s16(pi2_tmp + 4, x1_16x4);
    vst1_s16(pi2_tmp + 8, x2_16x4);
    vst1_s16(pi2_tmp + 12, x3_16x4);

    /* vertical inverse transform */
    rq1_16x4 = vshr_n_s16(x1_16x4, 1);
    rq3_16x4 = vshr_n_s16(x3_16x4, 1);

    xx0_16x4 = vadd_s16(x0_16x4, x2_16x4);
    xx1_16x4 = vsub_s16(x0_16x4, x2_16x4);
    xx2_16x4 = vsub_s16(rq1_16x4, x3_16x4);
    xx3_16x4 = vadd_s16(x1_16x4, rq3_16x4);

    x0_16x4 = vadd_s16(xx0_16x4, xx3_16x4);
    x1_16x4 = vadd_s16(xx1_16x4, xx2_16x4);
    x2_16x4 = vsub_s16(xx1_16x4, xx2_16x4);
    x3_16x4 = vsub_s16(xx0_16x4, xx3_16x4);

    x0_16x4 = vrshr_n_s16(x0_16x4, 6);
    x1_16x4 = vrshr_n_s16(x1_16x4, 6);
    x2_16x4 = vrshr_n_s16(x2_16x4, 6);
    x3_16x4 = vrshr_n_s16(x3_16x4, 6);

    /* Saturate all values < -255 to -255 and retain the rest as it is */
    x0_16x4 = vmax_s16(x0_16x4, neg_255_16x4);
    x1_16x4 = vmax_s16(x1_16x4, neg_255_16x4);
    x2_16x4 = vmax_s16(x2_16x4, neg_255_16x4);
    x3_16x4 = vmax_s16(x3_16x4, neg_255_16x4);

    /* Saturate all values > 255 to 255 and retain the rest as it is */
    x0_16x4 = vmin_s16(x0_16x4, pos_255_16x4);
    x1_16x4 = vmin_s16(x1_16x4, pos_255_16x4);
    x2_16x4 = vmin_s16(x2_16x4, pos_255_16x4);
    x3_16x4 = vmin_s16(x3_16x4, pos_255_16x4);

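    /* Read-modify-write the residual rows: vmovl widens the residuals into
     * the even 16-bit lanes, then vbsl merges them over this plane only */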
    resout0 = vld1q_s16(pi2_res);
    resout1 = vld1q_s16(pi2_res + i4_res_stride);
    resout2 = vld1q_s16(pi2_res + i4_res_stride * 2);
    resout3 = vld1q_s16(pi2_res + i4_res_stride * 3);

    x0_16x8 = vreinterpretq_s16_s32(vmovl_s16(x0_16x4));
    x1_16x8 = vreinterpretq_s16_s32(vmovl_s16(x1_16x4));
    x2_16x8 = vreinterpretq_s16_s32(vmovl_s16(x2_16x4));
    x3_16x8 = vreinterpretq_s16_s32(vmovl_s16(x3_16x4));

    /* Storing res in alternate positions */
    resout0 = vbslq_s16(chroma_mask_16x8, x0_16x8, resout0);
    resout1 = vbslq_s16(chroma_mask_16x8, x1_16x8, resout1);
    resout2 = vbslq_s16(chroma_mask_16x8, x2_16x8, resout2);
    resout3 = vbslq_s16(chroma_mask_16x8, x3_16x8, resout3);

    vst1q_s16(pi2_res, resout0);
    vst1q_s16(pi2_res + i4_res_stride, resout1);
    vst1q_s16(pi2_res + (i4_res_stride << 1), resout2);
    vst1q_s16(pi2_res + (i4_res_stride << 1) + i4_res_stride, resout3);

    pred0_in = vld1_u8((uint8_t *) pu1_pred);
    pred1_in = vld1_u8((uint8_t *) pu1_pred + (i4_pred_stride));
    pred2_in = vld1_u8((uint8_t *) pu1_pred + (i4_pred_stride << 1));
    pred3_in = vld1_u8((uint8_t *) pu1_pred + (i4_pred_stride * 3));

    pred0 = vreinterpretq_s16_u16(vmovl_u8(pred0_in));
    pred1 = vreinterpretq_s16_u16(vmovl_u8(pred1_in));
    pred2 = vreinterpretq_s16_u16(vmovl_u8(pred2_in));
    pred3 = vreinterpretq_s16_u16(vmovl_u8(pred3_in));

    /* Out pixel = pred + res */
    rec0 = vaddq_s16(pred0, x0_16x8);
    rec1 = vaddq_s16(pred1, x1_16x8);
    rec2 = vaddq_s16(pred2, x2_16x8);
    rec3 = vaddq_s16(pred3, x3_16x8);

    out0 = vld1_u8(pu1_out);
    out1 = vld1_u8(pu1_out + i4_out_stride);
    out2 = vld1_u8(pu1_out + i4_out_stride * 2);
    out3 = vld1_u8(pu1_out + i4_out_stride * 3);

    /* Convert to 8 bit unsigned with saturation */
    rec0_un = vqmovun_s16(rec0);
    rec1_un = vqmovun_s16(rec1);
    rec2_un = vqmovun_s16(rec2);
    rec3_un = vqmovun_s16(rec3);

    /* Store output pixels in alternate positions */
    out0 = vbsl_u8(chroma_mask_8x8, rec0_un, out0);
    out1 = vbsl_u8(chroma_mask_8x8, rec1_un, out1);
    out2 = vbsl_u8(chroma_mask_8x8, rec2_un, out2);
    out3 = vbsl_u8(chroma_mask_8x8, rec3_un, out3);

    vst1_u8((pu1_out), out0);
    vst1_u8((pu1_out + i4_out_stride), out1);
    vst1_u8((pu1_out + (i4_out_stride << 1)), out2);
    vst1_u8((pu1_out + ((i4_out_stride << 1) + i4_out_stride)), out3);
}

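/* Chroma variant with residual accumulation: adds the residual prediction
 * (alternate lanes), clamps, writes the result to ps_res, then reconstructs */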
void isvc_iquant_itrans_recon_chroma_4x4_with_res_accumulate_neon(
    buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_res_pred,
    buffer_container_t *ps_res, buffer_container_t *ps_rec,
    iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants, WORD16 *pi2_tmp, WORD16 *pi2_dc_src,
    WORD32 i4_iq_start_idx, UWORD8 u1_res_accumulate)
{
    WORD16 *pi2_src = (WORD16 *) ps_src->pv_data;
    WORD16 *pi2_res = (WORD16 *) ps_res->pv_data;
    WORD16 *pi2_res_pred = (WORD16 *) ps_res_pred->pv_data;
    UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
    UWORD8 *pu1_out = (UWORD8 *) ps_rec->pv_data;
    WORD32 i4_res_stride = ps_res->i4_data_stride;
    WORD32 i4_res_pred_stride = ps_res_pred->i4_data_stride;
    WORD32 i4_pred_stride = ps_pred->i4_data_stride;
    WORD32 i4_out_stride = ps_rec->i4_data_stride;
    const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat;
    const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat;
    UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6;

    WORD16 i2_rnd_factor = (u4_qp_div_6 < 4) ? 1 << (3 - u4_qp_div_6) : 0;

    int16x4x4_t src_16x4x2;
    int16x4x4_t iscal_16x4x2;
    int16x4x4_t weigh_16x4x2;

    int16x4_t q0_16x4, q1_16x4, q2_16x4, q3_16x4;
    int32x4_t q0_32x4, q1_32x4, q2_32x4, q3_32x4;
    int16x4_t rq1_16x4, rq3_16x4;
    int16x4_t x0_16x4, x1_16x4, x2_16x4, x3_16x4;
    int16x8_t x0_16x8, x1_16x8, x2_16x8, x3_16x8;
    int16x4_t xx0_16x4, xx1_16x4, xx2_16x4, xx3_16x4;
    int16x4x2_t xx0_16x4x2, xx1_16x4x2;
    int32x2x2_t x0_32x2x2, x1_32x2x2;
    int16x4_t weigh0_16x4, weigh1_16x4, weigh2_16x4, weigh3_16x4;

    uint8x8_t pred0_in, pred1_in, pred2_in, pred3_in;
    int16x8_t pred0, pred1, pred2, pred3;
    int16x8_t rec0, rec1, rec2, rec3;
    uint8x8_t rec0_un, rec1_un, rec2_un, rec3_un;
    int16x8_t resd0_in, resd1_in, resd2_in, resd3_in;
    int16x8_t resd1_in_mask, resd2_in_mask, resd3_in_mask;
    uint8x8_t out0, out1, out2, out3;
    int16x8_t resout0, resout1, resout2, resout3;
    int16x8_t pos_255 = vdupq_n_s16(((WORD16) UINT8_MAX));
    int16x8_t neg_255 = vdupq_n_s16(-((WORD16) UINT8_MAX));

    uint8x8_t chroma_mask_8x8 = vreinterpret_u8_u16(vdup_n_u16(0x00ff));
    uint16x8_t chroma_mask_16x8 = {0xffff, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000};

    int32x4_t qp_div_6_32x4 = vdupq_n_s32(u4_qp_div_6);
    int32x4_t rnd_fact = vdupq_n_s32(i2_rnd_factor);

    int16x8_t resd0_in_mask = {0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000};

    UNUSED(i4_iq_start_idx);
    UNUSED(u1_res_accumulate);

    resd1_in_mask = resd0_in_mask;
    resd2_in_mask = resd0_in_mask;
    resd3_in_mask = resd0_in_mask;

    src_16x4x2 = vld4_s16(pi2_src);
    iscal_16x4x2 = vld4_s16((const int16_t *) pu2_iscal_mat);
    weigh_16x4x2 = vld4_s16((const int16_t *) pu2_weigh_mat);

    weigh0_16x4 = vmul_s16(weigh_16x4x2.val[0], iscal_16x4x2.val[0]);
    weigh1_16x4 = vmul_s16(weigh_16x4x2.val[1], iscal_16x4x2.val[1]);
    weigh2_16x4 = vmul_s16(weigh_16x4x2.val[2], iscal_16x4x2.val[2]);
    weigh3_16x4 = vmul_s16(weigh_16x4x2.val[3], iscal_16x4x2.val[3]);

    q0_32x4 = vmull_s16(weigh0_16x4, src_16x4x2.val[0]);
    q1_32x4 = vmull_s16(weigh1_16x4, src_16x4x2.val[1]);
    q2_32x4 = vmull_s16(weigh2_16x4, src_16x4x2.val[2]);
    q3_32x4 = vmull_s16(weigh3_16x4, src_16x4x2.val[3]);

    q0_32x4 = vaddq_s32(q0_32x4, rnd_fact);
    q1_32x4 = vaddq_s32(q1_32x4, rnd_fact);
    q2_32x4 = vaddq_s32(q2_32x4, rnd_fact);
    q3_32x4 = vaddq_s32(q3_32x4, rnd_fact);

    q0_32x4 = vshlq_s32(q0_32x4, qp_div_6_32x4);
    q1_32x4 = vshlq_s32(q1_32x4, qp_div_6_32x4);
    q2_32x4 = vshlq_s32(q2_32x4, qp_div_6_32x4);
    q3_32x4 = vshlq_s32(q3_32x4, qp_div_6_32x4);

    q0_16x4 = vqshrn_n_s32(q0_32x4, 4);
    q1_16x4 = vqshrn_n_s32(q1_32x4, 4);
    q2_16x4 = vqshrn_n_s32(q2_32x4, 4);
    q3_16x4 = vqshrn_n_s32(q3_32x4, 4);

    q0_16x4 = vset_lane_s16(pi2_dc_src[0], q0_16x4, 0);

    rq1_16x4 = vshr_n_s16(q1_16x4, 1);
    rq3_16x4 = vshr_n_s16(q3_16x4, 1);

    x0_16x4 = vadd_s16(q0_16x4, q2_16x4);
    x1_16x4 = vsub_s16(q0_16x4, q2_16x4);
    x2_16x4 = vsub_s16(rq1_16x4, q3_16x4);
    x3_16x4 = vadd_s16(q1_16x4, rq3_16x4);

    xx0_16x4 = vadd_s16(x0_16x4, x3_16x4);
    xx1_16x4 = vadd_s16(x1_16x4, x2_16x4);
    xx2_16x4 = vsub_s16(x1_16x4, x2_16x4);
    xx3_16x4 = vsub_s16(x0_16x4, x3_16x4);

    /* Transpose rows 0 to 3 so the vertical pass reuses the same butterfly */
    xx0_16x4x2 = vtrn_s16(xx0_16x4, xx1_16x4);
    xx1_16x4x2 = vtrn_s16(xx2_16x4, xx3_16x4);
    x0_32x2x2 =
        vzip_s32(vreinterpret_s32_s16(xx0_16x4x2.val[0]), vreinterpret_s32_s16(xx1_16x4x2.val[0]));
    x1_32x2x2 =
        vzip_s32(vreinterpret_s32_s16(xx0_16x4x2.val[1]), vreinterpret_s32_s16(xx1_16x4x2.val[1]));

    x0_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[0]);
    x1_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[0]);
    x2_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[1]);
    x3_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[1]);

    /* Store Horz transform output into temp */
    vst1_s16(pi2_tmp, x0_16x4);
    vst1_s16(pi2_tmp + 4, x1_16x4);
    vst1_s16(pi2_tmp + 8, x2_16x4);
    vst1_s16(pi2_tmp + 12, x3_16x4);

    /* vertical inverse transform */
    rq1_16x4 = vshr_n_s16(x1_16x4, 1);
    rq3_16x4 = vshr_n_s16(x3_16x4, 1);

    xx0_16x4 = vadd_s16(x0_16x4, x2_16x4);
    xx1_16x4 = vsub_s16(x0_16x4, x2_16x4);
    xx2_16x4 = vsub_s16(rq1_16x4, x3_16x4);
    xx3_16x4 = vadd_s16(x1_16x4, rq3_16x4);

    x0_16x4 = vadd_s16(xx0_16x4, xx3_16x4);
    x1_16x4 = vadd_s16(xx1_16x4, xx2_16x4);
    x2_16x4 = vsub_s16(xx1_16x4, xx2_16x4);
    x3_16x4 = vsub_s16(xx0_16x4, xx3_16x4);

    x0_16x4 = vrshr_n_s16(x0_16x4, 6);
    x1_16x4 = vrshr_n_s16(x1_16x4, 6);
    x2_16x4 = vrshr_n_s16(x2_16x4, 6);
    x3_16x4 = vrshr_n_s16(x3_16x4, 6);

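    /* Residual prediction is interleaved; only alternate lanes belong to this
     * plane, so zero the other lanes before accumulating */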
    resd0_in = vld1q_s16((int16_t *) pi2_res_pred);
    resd1_in = vld1q_s16((int16_t *) pi2_res_pred + i4_res_pred_stride);
    resd2_in = vld1q_s16((int16_t *) pi2_res_pred + (i4_res_pred_stride * 2));
    resd3_in = vld1q_s16((int16_t *) pi2_res_pred + (i4_res_pred_stride * 3));

    /* Mask alternate values */
    resd0_in_mask = vbslq_s16(chroma_mask_16x8, resd0_in, resd0_in_mask);
    resd1_in_mask = vbslq_s16(chroma_mask_16x8, resd1_in, resd1_in_mask);
    resd2_in_mask = vbslq_s16(chroma_mask_16x8, resd2_in, resd2_in_mask);
    resd3_in_mask = vbslq_s16(chroma_mask_16x8, resd3_in, resd3_in_mask);

    x0_16x8 = vreinterpretq_s16_s32(vmovl_s16(x0_16x4));
    x1_16x8 = vreinterpretq_s16_s32(vmovl_s16(x1_16x4));
    x2_16x8 = vreinterpretq_s16_s32(vmovl_s16(x2_16x4));
    x3_16x8 = vreinterpretq_s16_s32(vmovl_s16(x3_16x4));

    resd0_in = vaddq_s16(resd0_in_mask, x0_16x8);
    resd1_in = vaddq_s16(resd1_in_mask, x1_16x8);
    resd2_in = vaddq_s16(resd2_in_mask, x2_16x8);
    resd3_in = vaddq_s16(resd3_in_mask, x3_16x8);

    /* Saturate all values < -255 to -255 and retain the rest as it is */
    resd0_in = vmaxq_s16(resd0_in, neg_255);
    resd1_in = vmaxq_s16(resd1_in, neg_255);
    resd2_in = vmaxq_s16(resd2_in, neg_255);
    resd3_in = vmaxq_s16(resd3_in, neg_255);

    /* Saturate all values > 255 to 255 and retain the rest as it is */
    resd0_in = vminq_s16(resd0_in, pos_255);
    resd1_in = vminq_s16(resd1_in, pos_255);
    resd2_in = vminq_s16(resd2_in, pos_255);
    resd3_in = vminq_s16(resd3_in, pos_255);

    resout0 = vld1q_s16(pi2_res);
    resout1 = vld1q_s16(pi2_res + i4_res_stride);
    resout2 = vld1q_s16(pi2_res + i4_res_stride * 2);
    resout3 = vld1q_s16(pi2_res + i4_res_stride * 3);

    /* Store res in alternate positions */
    resout0 = vbslq_s16(chroma_mask_16x8, resd0_in, resout0);
    resout1 = vbslq_s16(chroma_mask_16x8, resd1_in, resout1);
    resout2 = vbslq_s16(chroma_mask_16x8, resd2_in, resout2);
    resout3 = vbslq_s16(chroma_mask_16x8, resd3_in, resout3);

    vst1q_s16(pi2_res, resout0);
    vst1q_s16(pi2_res + i4_res_stride, resout1);
    vst1q_s16(pi2_res + (i4_res_stride << 1), resout2);
    vst1q_s16(pi2_res + (i4_res_stride << 1) + i4_res_stride, resout3);

    pred0_in = vld1_u8((uint8_t *) pu1_pred);
    pred1_in = vld1_u8((uint8_t *) pu1_pred + (i4_pred_stride));
    pred2_in = vld1_u8((uint8_t *) pu1_pred + (i4_pred_stride << 1));
    pred3_in = vld1_u8((uint8_t *) pu1_pred + (i4_pred_stride * 3));

    pred0 = vreinterpretq_s16_u16(vmovl_u8(pred0_in));
    pred1 = vreinterpretq_s16_u16(vmovl_u8(pred1_in));
    pred2 = vreinterpretq_s16_u16(vmovl_u8(pred2_in));
    pred3 = vreinterpretq_s16_u16(vmovl_u8(pred3_in));

    /* Out pixel = pred + res */
    rec0 = vaddq_s16(pred0, resout0);
    rec1 = vaddq_s16(pred1, resout1);
    rec2 = vaddq_s16(pred2, resout2);
    rec3 = vaddq_s16(pred3, resout3);

    out0 = vld1_u8(pu1_out);
    out1 = vld1_u8(pu1_out + i4_out_stride);
    out2 = vld1_u8(pu1_out + i4_out_stride * 2);
    out3 = vld1_u8(pu1_out + i4_out_stride * 3);

    /* Convert to 8 bit unsigned with saturation */
    rec0_un = vqmovun_s16(rec0);
    rec1_un = vqmovun_s16(rec1);
    rec2_un = vqmovun_s16(rec2);
    rec3_un = vqmovun_s16(rec3);

    /* Store output pixels in alternate positions */
    out0 = vbsl_u8(chroma_mask_8x8, rec0_un, out0);
    out1 = vbsl_u8(chroma_mask_8x8, rec1_un, out1);
    out2 = vbsl_u8(chroma_mask_8x8, rec2_un, out2);
    out3 = vbsl_u8(chroma_mask_8x8, rec3_un, out3);

    vst1_u8((pu1_out), out0);
    vst1_u8((pu1_out + i4_out_stride), out1);
    vst1_u8((pu1_out + (i4_out_stride << 1)), out2);
    vst1_u8((pu1_out + ((i4_out_stride << 1) + i4_out_stride)), out3);
}

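/* DC-only luma path: a block with a single DC coefficient inverse-transforms
 * to a constant residual, so the transform reduces to (dc + 32) >> 6 */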
void isvc_iquant_itrans_recon_4x4_dc_neon(buffer_container_t *ps_src, buffer_container_t *ps_pred,
                                          buffer_container_t *ps_res_pred,
                                          buffer_container_t *ps_res, buffer_container_t *ps_rec,
                                          iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants,
                                          WORD16 *pi2_tmp, WORD16 *pi2_dc_src,
                                          WORD32 i4_iq_start_idx, UWORD8 u1_res_accumulate)
{
    WORD16 *pi2_src = (WORD16 *) ps_src->pv_data;
    UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
    UWORD8 *pu1_out = (UWORD8 *) ps_rec->pv_data;
    WORD32 i4_pred_stride = ps_pred->i4_data_stride;
    WORD32 i4_out_stride = ps_rec->i4_data_stride;
    const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat;
    const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat;
    UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6;
    WORD16 rnd_fact = (u4_qp_div_6 < 4) ? 1 << (3 - u4_qp_div_6) : 0;

    WORD32 i4_iq_out_temp;
    int16x8_t temp_0;
    uint8x8_t pred0_in, pred1_in, pred2_in, pred3_in;
    int16x8_t pred0, pred1, pred2, pred3;

    UNUSED(pi2_tmp);
    UNUSED(ps_res);
    UNUSED(ps_res_pred);
    UNUSED(u1_res_accumulate);

1240     if(i4_iq_start_idx == 0)
1241     {
1242         i4_iq_out_temp = pi2_src[0];
1243         INV_QUANT(i4_iq_out_temp, pu2_iscal_mat[0], pu2_weigh_mat[0], u4_qp_div_6, rnd_fact, 4);
1244     }
1245     else
1246     {
1247         i4_iq_out_temp = pi2_dc_src[0];
1248     }
1249 
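    /* (dc + 32) >> 6 applies the final inverse-transform scaling with
     * rounding; a DC-only block has the same residual in all 16 positions,
     * so it is broadcast across the vector */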
    temp_0 = vdupq_n_s16((i4_iq_out_temp + 32) >> 6);

    pred0_in = vld1_u8(pu1_pred);
    pu1_pred = pu1_pred + i4_pred_stride;
    pred1_in = vld1_u8(pu1_pred);
    pu1_pred = pu1_pred + i4_pred_stride;
    pred2_in = vld1_u8(pu1_pred);
    pu1_pred = pu1_pred + i4_pred_stride;
    pred3_in = vld1_u8(pu1_pred);

    pred0 = vreinterpretq_s16_u16(vmovl_u8(pred0_in));
    pred1 = vreinterpretq_s16_u16(vmovl_u8(pred1_in));
    pred2 = vreinterpretq_s16_u16(vmovl_u8(pred2_in));
    pred3 = vreinterpretq_s16_u16(vmovl_u8(pred3_in));

    /* Out pixel = Res + pred */
    pred0 = vaddq_s16(pred0, temp_0);
    pred1 = vaddq_s16(pred1, temp_0);
    pred2 = vaddq_s16(pred2, temp_0);
    pred3 = vaddq_s16(pred3, temp_0);

    /* Convert to unsigned 8 bit with saturation */
    pred0_in = vqmovun_s16(pred0);
    pred1_in = vqmovun_s16(pred1);
    pred2_in = vqmovun_s16(pred2);
    pred3_in = vqmovun_s16(pred3);

    vst1_lane_u32((uint32_t *) (pu1_out), vreinterpret_u32_u8(pred0_in), 0);
    vst1_lane_u32((uint32_t *) (pu1_out + i4_out_stride), vreinterpret_u32_u8(pred1_in), 0);
    vst1_lane_u32((uint32_t *) (pu1_out + i4_out_stride * 2), vreinterpret_u32_u8(pred2_in), 0);
    vst1_lane_u32((uint32_t *) (pu1_out + i4_out_stride * 3), vreinterpret_u32_u8(pred3_in), 0);
}

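/*
 * DC-only 4x4 variant that, in addition to reconstructing, writes the
 * residual block out to ps_res (through isvc_get_residue), presumably so a
 * later SVC stage can reuse it, e.g. for inter-layer residual prediction
 * (an inference from the surrounding API, not verified here).
 */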
void isvc_iquant_itrans_recon_4x4_dc_with_res_output_neon(
    buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_res_pred,
    buffer_container_t *ps_res, buffer_container_t *ps_rec,
    iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants, WORD16 *pi2_tmp, WORD16 *pi2_dc_src,
    WORD32 i4_iq_start_idx, UWORD8 u1_res_accumulate)
{
    WORD16 *pi2_src = (WORD16 *) ps_src->pv_data;
    WORD16 *pi2_res = (WORD16 *) ps_res->pv_data;
    UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
    UWORD8 *pu1_out = (UWORD8 *) ps_rec->pv_data;
    WORD32 i4_res_stride = ps_res->i4_data_stride;
    WORD32 i4_pred_stride = ps_pred->i4_data_stride;
    WORD32 i4_out_stride = ps_rec->i4_data_stride;
    const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat;
    const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat;
    UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6;
    WORD16 rnd_fact = (u4_qp_div_6 < 4) ? 1 << (3 - u4_qp_div_6) : 0;

    WORD16 i2_it_out;
    WORD32 i4_iq_out_temp;
    int16x8_t temp_0;
    int16x4_t residue_res;
    uint8x8_t pred0_in, pred1_in, pred2_in, pred3_in;
    int16x8_t pred0, pred1, pred2, pred3;

    UNUSED(pi2_tmp);
    UNUSED(ps_res_pred);
    UNUSED(u1_res_accumulate);

    if(i4_iq_start_idx == 0)
    {
        i4_iq_out_temp = pi2_src[0];
        INV_QUANT(i4_iq_out_temp, pu2_iscal_mat[0], pu2_weigh_mat[0], u4_qp_div_6, rnd_fact, 4);
    }
    else
    {
        i4_iq_out_temp = pi2_dc_src[0];
    }

    i2_it_out = ((i4_iq_out_temp + 32) >> 6);
    temp_0 = vdupq_n_s16(i2_it_out);
    residue_res = vdup_n_s16(isvc_get_residue(i2_it_out, 0, 0));

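    /* The residual is constant across the block, so the same 4-lane vector
     * is stored to all four rows of the residual buffer */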
    vst1_s16(pi2_res, residue_res);
    vst1_s16(pi2_res + i4_res_stride, residue_res);
    vst1_s16(pi2_res + (i4_res_stride << 1), residue_res);
    vst1_s16(pi2_res + (i4_res_stride << 1) + i4_res_stride, residue_res);

    pred0_in = vld1_u8(pu1_pred);
    pu1_pred = pu1_pred + i4_pred_stride;
    pred1_in = vld1_u8(pu1_pred);
    pu1_pred = pu1_pred + i4_pred_stride;
    pred2_in = vld1_u8(pu1_pred);
    pu1_pred = pu1_pred + i4_pred_stride;
    pred3_in = vld1_u8(pu1_pred);

    pred0 = vreinterpretq_s16_u16(vmovl_u8(pred0_in));
    pred1 = vreinterpretq_s16_u16(vmovl_u8(pred1_in));
    pred2 = vreinterpretq_s16_u16(vmovl_u8(pred2_in));
    pred3 = vreinterpretq_s16_u16(vmovl_u8(pred3_in));

    /* Out pixel = Res + pred */
    pred0 = vaddq_s16(pred0, temp_0);
    pred1 = vaddq_s16(pred1, temp_0);
    pred2 = vaddq_s16(pred2, temp_0);
    pred3 = vaddq_s16(pred3, temp_0);

    /* Convert to unsigned 8 bit with saturation */
    pred0_in = vqmovun_s16(pred0);
    pred1_in = vqmovun_s16(pred1);
    pred2_in = vqmovun_s16(pred2);
    pred3_in = vqmovun_s16(pred3);

    vst1_lane_u32((uint32_t *) (pu1_out), vreinterpret_u32_u8(pred0_in), 0);
    vst1_lane_u32((uint32_t *) (pu1_out + i4_out_stride), vreinterpret_u32_u8(pred1_in), 0);
    vst1_lane_u32((uint32_t *) (pu1_out + i4_out_stride * 2), vreinterpret_u32_u8(pred2_in), 0);
    vst1_lane_u32((uint32_t *) (pu1_out + i4_out_stride * 3), vreinterpret_u32_u8(pred3_in), 0);
}

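/*
 * DC-only 4x4 variant with residual accumulation: the broadcast DC residual
 * is added to the incoming residual prediction (ps_res_pred), clamped to
 * [-255, 255] (the full range of a difference between two 8-bit pixels),
 * written back to ps_res, and then added to the prediction to form the
 * reconstructed pixels.
 */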
void isvc_iquant_itrans_recon_4x4_dc_with_res_accumulate_neon(
    buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_res_pred,
    buffer_container_t *ps_res, buffer_container_t *ps_rec,
    iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants, WORD16 *pi2_tmp, WORD16 *pi2_dc_src,
    WORD32 i4_iq_start_idx, UWORD8 u1_res_accumulate)
{
    WORD16 *pi2_src = (WORD16 *) ps_src->pv_data;
    WORD16 *pi2_res = (WORD16 *) ps_res->pv_data;
    WORD16 *pi2_res_pred = (WORD16 *) ps_res_pred->pv_data;
    UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
    UWORD8 *pu1_out = (UWORD8 *) ps_rec->pv_data;
    WORD32 i4_res_stride = ps_res->i4_data_stride;
    WORD32 i4_res_pred_stride = ps_res_pred->i4_data_stride;
    WORD32 i4_pred_stride = ps_pred->i4_data_stride;
    WORD32 i4_out_stride = ps_rec->i4_data_stride;
    const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat;
    const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat;
    UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6;
    WORD16 rnd_fact = (u4_qp_div_6 < 4) ? 1 << (3 - u4_qp_div_6) : 0;

    WORD32 i4_iq_out_temp;
    int16x4_t temp_0;
    uint8x8_t pred0_in, pred1_in, pred2_in, pred3_in;
    int16x8_t pred0, pred1, pred2, pred3;
    int16x8_t pred01_in, pred23_in;
    uint8x8_t pred01_un, pred23_un;

    int16x4_t resd0_in, resd1_in, resd2_in, resd3_in;
    int16x8_t resd01_in, resd23_in;
    int16x4_t pos_255 = vdup_n_s16(((WORD16) UINT8_MAX));
    int16x4_t neg_255 = vdup_n_s16(-((WORD16) UINT8_MAX));

    UNUSED(pi2_tmp);
    UNUSED(u1_res_accumulate);

    if(i4_iq_start_idx == 0)
    {
        i4_iq_out_temp = pi2_src[0];
        INV_QUANT(i4_iq_out_temp, pu2_iscal_mat[0], pu2_weigh_mat[0], u4_qp_div_6, rnd_fact, 4);
    }
    else
    {
        i4_iq_out_temp = pi2_dc_src[0];
    }

    temp_0 = vdup_n_s16((i4_iq_out_temp + 32) >> 6);

    resd0_in = vld1_s16((int16_t *) pi2_res_pred);
    resd1_in = vld1_s16((int16_t *) pi2_res_pred + i4_res_pred_stride);
    resd2_in = vld1_s16((int16_t *) pi2_res_pred + (i4_res_pred_stride * 2));
    resd3_in = vld1_s16((int16_t *) pi2_res_pred + (i4_res_pred_stride * 3));

    /* Add res pred to the res obtained */
    resd0_in = vadd_s16(resd0_in, temp_0);
    resd1_in = vadd_s16(resd1_in, temp_0);
    resd2_in = vadd_s16(resd2_in, temp_0);
    resd3_in = vadd_s16(resd3_in, temp_0);

    /* Saturate all values < -255 to -255 and retain the rest as it is */
    resd0_in = vmax_s16(resd0_in, neg_255);
    resd1_in = vmax_s16(resd1_in, neg_255);
    resd2_in = vmax_s16(resd2_in, neg_255);
    resd3_in = vmax_s16(resd3_in, neg_255);

    /* Saturate all values > 255 to 255 and retain the rest as it is */
    resd0_in = vmin_s16(resd0_in, pos_255);
    resd1_in = vmin_s16(resd1_in, pos_255);
    resd2_in = vmin_s16(resd2_in, pos_255);
    resd3_in = vmin_s16(resd3_in, pos_255);

    vst1_s16(pi2_res, resd0_in);
    vst1_s16(pi2_res + i4_res_stride, resd1_in);
    vst1_s16(pi2_res + (i4_res_stride << 1), resd2_in);
    vst1_s16(pi2_res + (i4_res_stride << 1) + i4_res_stride, resd3_in);

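    /* Pack pairs of 4-sample rows into 8-lane vectors so the final add and
     * saturating narrow are done in two operations instead of four */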
    resd01_in = vcombine_s16(resd0_in, resd1_in);
    resd23_in = vcombine_s16(resd2_in, resd3_in);

    pred0_in = vld1_u8(pu1_pred);
    pu1_pred = pu1_pred + i4_pred_stride;
    pred1_in = vld1_u8(pu1_pred);
    pu1_pred = pu1_pred + i4_pred_stride;
    pred2_in = vld1_u8(pu1_pred);
    pu1_pred = pu1_pred + i4_pred_stride;
    pred3_in = vld1_u8(pu1_pred);

    pred0 = vreinterpretq_s16_u16(vmovl_u8(pred0_in));
    pred1 = vreinterpretq_s16_u16(vmovl_u8(pred1_in));
    pred2 = vreinterpretq_s16_u16(vmovl_u8(pred2_in));
    pred3 = vreinterpretq_s16_u16(vmovl_u8(pred3_in));

    pred01_in = vcombine_s16(vget_low_s16(pred0), vget_low_s16(pred1));
    pred23_in = vcombine_s16(vget_low_s16(pred2), vget_low_s16(pred3));

    /* Out pixel = Res + pred */
    pred01_in = vaddq_s16(pred01_in, resd01_in);
    pred23_in = vaddq_s16(pred23_in, resd23_in);

    /* Convert to unsigned 8 bit with saturation */
    pred01_un = vqmovun_s16(pred01_in);
    pred23_un = vqmovun_s16(pred23_in);

    vst1_lane_u32((uint32_t *) (pu1_out), vreinterpret_u32_u8(pred01_un), 0);
    vst1_lane_u32((uint32_t *) (pu1_out + i4_out_stride), vreinterpret_u32_u8(pred01_un), 1);
    vst1_lane_u32((uint32_t *) (pu1_out + (i4_out_stride << 1)), vreinterpret_u32_u8(pred23_un), 0);
    vst1_lane_u32((uint32_t *) (pu1_out + ((i4_out_stride << 1) + i4_out_stride)),
                  vreinterpret_u32_u8(pred23_un), 1);
}

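/*
 * Chroma DC-only variant. Chroma appears to be stored with Cb and Cr
 * interleaved, so each 8-byte row load covers both components and only
 * alternate bytes belong to the one being reconstructed; vbsl with the mask
 * below merges the new pixels without disturbing the other component. The DC
 * value is always taken from pi2_dc_src (this path never reads pi2_src), the
 * chroma DC coefficients having been dequantized separately.
 */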
void isvc_iquant_itrans_recon_chroma_4x4_dc_neon(
    buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_res_pred,
    buffer_container_t *ps_res, buffer_container_t *ps_rec,
    iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants, WORD16 *pi2_tmp, WORD16 *pi2_dc_src,
    WORD32 i4_iq_start_idx, UWORD8 u1_res_accumulate)
{
    WORD16 *pi2_src = (WORD16 *) ps_src->pv_data;
    UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
    UWORD8 *pu1_out = (UWORD8 *) ps_rec->pv_data;
    WORD32 i4_pred_stride = ps_pred->i4_data_stride;
    WORD32 i4_out_stride = ps_rec->i4_data_stride;
    const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat;
    const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat;
    UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6;

    WORD32 i4_iq_out_temp;
    int16x8_t temp_0;
    uint8x8_t pred0_in, pred1_in, pred2_in, pred3_in;
    int16x8_t pred0, pred1, pred2, pred3;
    uint8x8_t i4_out_horz_8x8_r0, i4_out_horz_8x8_r1, i4_out_horz_8x8_r2, i4_out_horz_8x8_r3;
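    /* 0x00ff per 16-bit lane selects every even-indexed byte, i.e. the lanes
     * of one chroma component within an interleaved CbCr row */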
    uint8x8_t chroma_mask_8x8 = vreinterpret_u8_u16(vdup_n_u16(0x00ff));

    UNUSED(pi2_src);
    UNUSED(pu2_iscal_mat);
    UNUSED(pu2_weigh_mat);
    UNUSED(u4_qp_div_6);
    UNUSED(pi2_tmp);
    UNUSED(i4_iq_start_idx);
    UNUSED(ps_res);
    UNUSED(ps_res_pred);
    UNUSED(u1_res_accumulate);

    i4_iq_out_temp = pi2_dc_src[0];
    temp_0 = vdupq_n_s16((i4_iq_out_temp + 32) >> 6);

    pred0_in = vld1_u8(pu1_pred);
    pu1_pred = pu1_pred + i4_pred_stride;
    pred1_in = vld1_u8(pu1_pred);
    pu1_pred = pu1_pred + i4_pred_stride;
    pred2_in = vld1_u8(pu1_pred);
    pu1_pred = pu1_pred + i4_pred_stride;
    pred3_in = vld1_u8(pu1_pred);

    pred0 = vreinterpretq_s16_u16(vmovl_u8(pred0_in));
    pred1 = vreinterpretq_s16_u16(vmovl_u8(pred1_in));
    pred2 = vreinterpretq_s16_u16(vmovl_u8(pred2_in));
    pred3 = vreinterpretq_s16_u16(vmovl_u8(pred3_in));

    /* Out pixel = Res + pred */
    pred0 = vaddq_s16(pred0, temp_0);
    pred1 = vaddq_s16(pred1, temp_0);
    pred2 = vaddq_s16(pred2, temp_0);
    pred3 = vaddq_s16(pred3, temp_0);

    /* Convert to unsigned 8 bit with saturation */
    pred0_in = vqmovun_s16(pred0);
    pred1_in = vqmovun_s16(pred1);
    pred2_in = vqmovun_s16(pred2);
    pred3_in = vqmovun_s16(pred3);

    i4_out_horz_8x8_r0 = vld1_u8(pu1_out);
    i4_out_horz_8x8_r1 = vld1_u8(pu1_out + i4_out_stride);
    i4_out_horz_8x8_r2 = vld1_u8(pu1_out + i4_out_stride * 2);
    i4_out_horz_8x8_r3 = vld1_u8(pu1_out + i4_out_stride * 3);

    /* Store out pixels in alternate positions */
    i4_out_horz_8x8_r0 = vbsl_u8(chroma_mask_8x8, pred0_in, i4_out_horz_8x8_r0);
    i4_out_horz_8x8_r1 = vbsl_u8(chroma_mask_8x8, pred1_in, i4_out_horz_8x8_r1);
    i4_out_horz_8x8_r2 = vbsl_u8(chroma_mask_8x8, pred2_in, i4_out_horz_8x8_r2);
    i4_out_horz_8x8_r3 = vbsl_u8(chroma_mask_8x8, pred3_in, i4_out_horz_8x8_r3);

    vst1_u8((uint8_t *) (pu1_out), i4_out_horz_8x8_r0);
    vst1_u8((uint8_t *) (pu1_out + i4_out_stride), i4_out_horz_8x8_r1);
    vst1_u8((uint8_t *) (pu1_out + i4_out_stride * 2), i4_out_horz_8x8_r2);
    vst1_u8((uint8_t *) (pu1_out + i4_out_stride * 3), i4_out_horz_8x8_r3);
}

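/*
 * Chroma DC-only variant that also writes the residual block. Residuals are
 * 16-bit, so a 16-bit lane mask (chroma_mask_16x8 below) plays the role for
 * the residual stores that the byte mask plays for the 8-bit pixel stores.
 */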
void isvc_iquant_itrans_recon_chroma_4x4_dc_with_res_output_neon(
    buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_res_pred,
    buffer_container_t *ps_res, buffer_container_t *ps_rec,
    iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants, WORD16 *pi2_tmp, WORD16 *pi2_dc_src,
    WORD32 i4_iq_start_idx, UWORD8 u1_res_accumulate)
{
    WORD16 *pi2_src = (WORD16 *) ps_src->pv_data;
    WORD16 *pi2_res = (WORD16 *) ps_res->pv_data;
    UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
    UWORD8 *pu1_out = (UWORD8 *) ps_rec->pv_data;
    WORD32 i4_res_stride = ps_res->i4_data_stride;
    WORD32 i4_pred_stride = ps_pred->i4_data_stride;
    WORD32 i4_out_stride = ps_rec->i4_data_stride;
    const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat;
    const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat;
    UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6;

    WORD16 i2_it_out;
    WORD32 i4_iq_out_temp;
    int16x8_t temp_0, residue_res;
    uint8x8_t pred0_in, pred1_in, pred2_in, pred3_in;
    int16x8_t pred0, pred1, pred2, pred3;
    int16x8_t resout0, resout1, resout2, resout3;

    uint8x8_t i4_out_horz_8x8_r0, i4_out_horz_8x8_r1, i4_out_horz_8x8_r2, i4_out_horz_8x8_r3;
    uint8x8_t chroma_mask_8x8 = vreinterpret_u8_u16(vdup_n_u16(0x00ff));
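    /* Selects alternate 16-bit residual lanes, mirroring the interleaved
     * CbCr layout of the 8-bit pixel rows */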
    uint16x8_t chroma_mask_16x8 = {0xffff, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000};

    UNUSED(pi2_src);
    UNUSED(pu2_iscal_mat);
    UNUSED(pu2_weigh_mat);
    UNUSED(u4_qp_div_6);
    UNUSED(pi2_tmp);
    UNUSED(i4_iq_start_idx);
    UNUSED(ps_res_pred);
    UNUSED(u1_res_accumulate);

    i4_iq_out_temp = pi2_dc_src[0];

    i2_it_out = ((i4_iq_out_temp + 32) >> 6);
    temp_0 = vdupq_n_s16(i2_it_out);
    residue_res = vdupq_n_s16(isvc_get_residue(i2_it_out, 0, 0));

    resout0 = vld1q_s16(pi2_res);
    resout1 = vld1q_s16(pi2_res + i4_res_stride);
    resout2 = vld1q_s16(pi2_res + i4_res_stride * 2);
    resout3 = vld1q_s16(pi2_res + i4_res_stride * 3);

    /* Store res in alternate positions */
    resout0 = vbslq_s16(chroma_mask_16x8, residue_res, resout0);
    resout1 = vbslq_s16(chroma_mask_16x8, residue_res, resout1);
    resout2 = vbslq_s16(chroma_mask_16x8, residue_res, resout2);
    resout3 = vbslq_s16(chroma_mask_16x8, residue_res, resout3);

    vst1q_s16(pi2_res, resout0);
    vst1q_s16(pi2_res + i4_res_stride, resout1);
    vst1q_s16(pi2_res + (i4_res_stride << 1), resout2);
    vst1q_s16(pi2_res + (i4_res_stride << 1) + i4_res_stride, resout3);

    pred0_in = vld1_u8(pu1_pred);
    pu1_pred = pu1_pred + i4_pred_stride;
    pred1_in = vld1_u8(pu1_pred);
    pu1_pred = pu1_pred + i4_pred_stride;
    pred2_in = vld1_u8(pu1_pred);
    pu1_pred = pu1_pred + i4_pred_stride;
    pred3_in = vld1_u8(pu1_pred);

    pred0 = vreinterpretq_s16_u16(vmovl_u8(pred0_in));
    pred1 = vreinterpretq_s16_u16(vmovl_u8(pred1_in));
    pred2 = vreinterpretq_s16_u16(vmovl_u8(pred2_in));
    pred3 = vreinterpretq_s16_u16(vmovl_u8(pred3_in));

    /* Out pixel = Res + pred */
    pred0 = vaddq_s16(pred0, temp_0);
    pred1 = vaddq_s16(pred1, temp_0);
    pred2 = vaddq_s16(pred2, temp_0);
    pred3 = vaddq_s16(pred3, temp_0);

    /* Convert to unsigned 8 bit with saturation */
    pred0_in = vqmovun_s16(pred0);
    pred1_in = vqmovun_s16(pred1);
    pred2_in = vqmovun_s16(pred2);
    pred3_in = vqmovun_s16(pred3);

    /* Store out pixels in alternate positions */
    i4_out_horz_8x8_r0 = vld1_u8(pu1_out);
    i4_out_horz_8x8_r1 = vld1_u8(pu1_out + i4_out_stride);
    i4_out_horz_8x8_r2 = vld1_u8(pu1_out + i4_out_stride * 2);
    i4_out_horz_8x8_r3 = vld1_u8(pu1_out + i4_out_stride * 3);

    i4_out_horz_8x8_r0 = vbsl_u8(chroma_mask_8x8, pred0_in, i4_out_horz_8x8_r0);
    i4_out_horz_8x8_r1 = vbsl_u8(chroma_mask_8x8, pred1_in, i4_out_horz_8x8_r1);
    i4_out_horz_8x8_r2 = vbsl_u8(chroma_mask_8x8, pred2_in, i4_out_horz_8x8_r2);
    i4_out_horz_8x8_r3 = vbsl_u8(chroma_mask_8x8, pred3_in, i4_out_horz_8x8_r3);

    vst1_u8((uint8_t *) (pu1_out), i4_out_horz_8x8_r0);
    vst1_u8((uint8_t *) (pu1_out + i4_out_stride), i4_out_horz_8x8_r1);
    vst1_u8((uint8_t *) (pu1_out + i4_out_stride * 2), i4_out_horz_8x8_r2);
    vst1_u8((uint8_t *) (pu1_out + i4_out_stride * 3), i4_out_horz_8x8_r3);
}

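/*
 * Chroma DC-only variant with residual accumulation: combines the interleaved
 * CbCr masking of the chroma paths above with the residual-prediction add and
 * [-255, 255] clamp of the luma accumulate path.
 */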
void isvc_iquant_itrans_recon_chroma_4x4_dc_with_res_accumulate_neon(
    buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_res_pred,
    buffer_container_t *ps_res, buffer_container_t *ps_rec,
    iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants, WORD16 *pi2_tmp, WORD16 *pi2_dc_src,
    WORD32 i4_iq_start_idx, UWORD8 u1_res_accumulate)
{
    WORD16 *pi2_src = (WORD16 *) ps_src->pv_data;
    WORD16 *pi2_res = (WORD16 *) ps_res->pv_data;
    WORD16 *pi2_res_pred = (WORD16 *) ps_res_pred->pv_data;
    UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
    UWORD8 *pu1_out = (UWORD8 *) ps_rec->pv_data;
    WORD32 i4_res_stride = ps_res->i4_data_stride;
    WORD32 i4_res_pred_stride = ps_res_pred->i4_data_stride;
    WORD32 i4_pred_stride = ps_pred->i4_data_stride;
    WORD32 i4_out_stride = ps_rec->i4_data_stride;
    const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat;
    const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat;
    UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6;

    WORD32 i4_iq_out_temp;
    int16x8_t temp_0;
    uint8x8_t pred0_in, pred1_in, pred2_in, pred3_in;
    int16x8_t pred0, pred1, pred2, pred3;
    int16x8_t resd0_in, resd1_in, resd2_in, resd3_in;
    int16x8_t resout0, resout1, resout2, resout3;
    int16x8_t resd1_in_mask, resd2_in_mask, resd3_in_mask;
    uint8x8_t out0, out1, out2, out3;
    int16x8_t pos_255 = vdupq_n_s16(((WORD16) UINT8_MAX));
    int16x8_t neg_255 = vdupq_n_s16(-((WORD16) UINT8_MAX));
    uint8x8_t chroma_mask_8x8 = vreinterpret_u8_u16(vdup_n_u16(0x00ff));
    uint16x8_t chroma_mask_16x8 = {0xffff, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000};

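    /* Zero-filled vector: the residual prediction is blended into copies of
     * this through the chroma mask, so the inactive (other-component) lanes
     * stay zero before the DC residual is added */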
    int16x8_t resd0_in_mask = {0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000};

    UNUSED(pi2_src);
    UNUSED(pu2_iscal_mat);
    UNUSED(pu2_weigh_mat);
    UNUSED(u4_qp_div_6);
    UNUSED(pi2_tmp);
    UNUSED(i4_iq_start_idx);
    UNUSED(u1_res_accumulate);

    resd1_in_mask = resd0_in_mask;
    resd2_in_mask = resd0_in_mask;
    resd3_in_mask = resd0_in_mask;

    i4_iq_out_temp = pi2_dc_src[0];
    temp_0 = vdupq_n_s16((i4_iq_out_temp + 32) >> 6);

    resd0_in = vld1q_s16((int16_t *) pi2_res_pred);
    resd1_in = vld1q_s16((int16_t *) pi2_res_pred + i4_res_pred_stride);
    resd2_in = vld1q_s16((int16_t *) pi2_res_pred + (i4_res_pred_stride * 2));
    resd3_in = vld1q_s16((int16_t *) pi2_res_pred + (i4_res_pred_stride * 3));

    /* Mask alternate values of res pred */
    resd0_in_mask = vbslq_s16(chroma_mask_16x8, resd0_in, resd0_in_mask);
    resd1_in_mask = vbslq_s16(chroma_mask_16x8, resd1_in, resd1_in_mask);
    resd2_in_mask = vbslq_s16(chroma_mask_16x8, resd2_in, resd2_in_mask);
    resd3_in_mask = vbslq_s16(chroma_mask_16x8, resd3_in, resd3_in_mask);

    /* Add res pred to res obtained */
    resd0_in = vaddq_s16(resd0_in_mask, temp_0);
    resd1_in = vaddq_s16(resd1_in_mask, temp_0);
    resd2_in = vaddq_s16(resd2_in_mask, temp_0);
    resd3_in = vaddq_s16(resd3_in_mask, temp_0);

    /* Saturate all values < -255 to -255 and retain the rest as it is */
    resd0_in = vmaxq_s16(resd0_in, neg_255);
    resd1_in = vmaxq_s16(resd1_in, neg_255);
    resd2_in = vmaxq_s16(resd2_in, neg_255);
    resd3_in = vmaxq_s16(resd3_in, neg_255);

    /* Saturate all values > 255 to 255 and retain the rest as it is */
    resd0_in = vminq_s16(resd0_in, pos_255);
    resd1_in = vminq_s16(resd1_in, pos_255);
    resd2_in = vminq_s16(resd2_in, pos_255);
    resd3_in = vminq_s16(resd3_in, pos_255);

    resout0 = vld1q_s16(pi2_res);
    resout1 = vld1q_s16(pi2_res + i4_res_stride);
    resout2 = vld1q_s16(pi2_res + i4_res_stride * 2);
    resout3 = vld1q_s16(pi2_res + i4_res_stride * 3);

    /* Store res in alternate positions */
    resout0 = vbslq_s16(chroma_mask_16x8, resd0_in, resout0);
    resout1 = vbslq_s16(chroma_mask_16x8, resd1_in, resout1);
    resout2 = vbslq_s16(chroma_mask_16x8, resd2_in, resout2);
    resout3 = vbslq_s16(chroma_mask_16x8, resd3_in, resout3);

    vst1q_s16(pi2_res, resout0);
    vst1q_s16(pi2_res + i4_res_stride, resout1);
    vst1q_s16(pi2_res + (i4_res_stride << 1), resout2);
    vst1q_s16(pi2_res + (i4_res_stride << 1) + i4_res_stride, resout3);

    pred0_in = vld1_u8(pu1_pred);
    pu1_pred = pu1_pred + i4_pred_stride;
    pred1_in = vld1_u8(pu1_pred);
    pu1_pred = pu1_pred + i4_pred_stride;
    pred2_in = vld1_u8(pu1_pred);
    pu1_pred = pu1_pred + i4_pred_stride;
    pred3_in = vld1_u8(pu1_pred);

    pred0 = vreinterpretq_s16_u16(vmovl_u8(pred0_in));
    pred1 = vreinterpretq_s16_u16(vmovl_u8(pred1_in));
    pred2 = vreinterpretq_s16_u16(vmovl_u8(pred2_in));
    pred3 = vreinterpretq_s16_u16(vmovl_u8(pred3_in));

    /* Out pixel = Res + pred */
    pred0 = vaddq_s16(pred0, resout0);
    pred1 = vaddq_s16(pred1, resout1);
    pred2 = vaddq_s16(pred2, resout2);
    pred3 = vaddq_s16(pred3, resout3);

    /* Convert to unsigned 8 bit with saturation */
    pred0_in = vqmovun_s16(pred0);
    pred1_in = vqmovun_s16(pred1);
    pred2_in = vqmovun_s16(pred2);
    pred3_in = vqmovun_s16(pred3);

    out0 = vld1_u8(pu1_out);
    out1 = vld1_u8(pu1_out + i4_out_stride);
    out2 = vld1_u8(pu1_out + i4_out_stride * 2);
    out3 = vld1_u8(pu1_out + i4_out_stride * 3);

    /* Store out pixels in alternate positions */
    out0 = vbsl_u8(chroma_mask_8x8, pred0_in, out0);
    out1 = vbsl_u8(chroma_mask_8x8, pred1_in, out1);
    out2 = vbsl_u8(chroma_mask_8x8, pred2_in, out2);
    out3 = vbsl_u8(chroma_mask_8x8, pred3_in, out3);

    vst1_u8((uint8_t *) (pu1_out), out0);
    vst1_u8((uint8_t *) (pu1_out + i4_out_stride), out1);
    vst1_u8((uint8_t *) (pu1_out + i4_out_stride * 2), out2);
    vst1_u8((uint8_t *) (pu1_out + i4_out_stride * 3), out3);
}