1 /******************************************************************************
2  *
3  * Copyright (C) 2022 The Android Open Source Project
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at:
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  *
17  *****************************************************************************
18  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19  */
20 /**
21  *******************************************************************************
22  * @file
23  *  isvc_iquant_itrans_recon_neon.c
24  *
25  * @brief
26  *  neon variants of inverse transform and quantization functions
27  *
28  *******************************************************************************
29  */
30 #include <arm_neon.h>
31 
32 #include "ih264_typedefs.h"
33 #include "ih264_debug.h"
34 #include "ih264_defs.h"
35 #include "ih264_trans_macros.h"
36 #include "ih264_macros.h"
37 #include "ih264_platform_macros.h"
38 #include "ih264_trans_data.h"
39 #include "ih264_size_defs.h"
40 #include "isvc_structs.h"
41 #include "isvc_trans_quant_itrans_iquant.h"
42 
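/*
 * Scalar sketch of the inverse-quantisation step that each function below
 * vectorises with vmull_s16 / vaddq_s32 / vshlq_s32 / vqshrn_n_s32
 * (illustrative only; the final narrowing shift additionally saturates to
 * 16 bits):
 *
 *   rnd = (u4_qp_div_6 < 4) ? 1 << (3 - u4_qp_div_6) : 0;
 *   for(i = 0; i < 16; i++)
 *   {
 *       q[i] = (WORD16) (((pi2_src[i] * (pu2_weigh_mat[i] * pu2_iscal_mat[i])
 *                          + rnd) << u4_qp_div_6) >> 4);
 *   }
 *
 * When i4_iq_start_idx == 1, lane 0 (the DC coefficient) is instead taken
 * from pi2_dc_src. The dequantised coefficients then go through the 4x4
 * inverse transform butterflies (horizontal pass, transpose, vertical pass)
 * and a final rounding shift by 6.
 */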
43 void isvc_iquant_itrans_recon_4x4_neon(buffer_container_t *ps_src, buffer_container_t *ps_pred,
44                                        buffer_container_t *ps_res_pred, buffer_container_t *ps_res,
45                                        buffer_container_t *ps_rec,
46                                        iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants,
47                                        WORD16 *pi2_tmp, WORD16 *pi2_dc_src, WORD32 i4_iq_start_idx,
48                                        UWORD8 u1_res_accumulate)
49 {
50     WORD16 *pi2_src = (WORD16 *) ps_src->pv_data;
51     UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
52     UWORD8 *pu1_out = (UWORD8 *) ps_rec->pv_data;
53     WORD32 i4_pred_stride = ps_pred->i4_data_stride;
54     WORD32 i4_out_stride = ps_rec->i4_data_stride;
55     const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat;
56     const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat;
57     UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6;
58 
59     int16x4x4_t src_16x4x2;
60     int16x4x4_t iscal_16x4x2;
61     int16x4x4_t weigh_16x4x2;
62 
63     int16x4_t q0_16x4, q1_16x4, q2_16x4, q3_16x4;
64     int32x4_t q0_32x4, q1_32x4, q2_32x4, q3_32x4;
65     int16x4_t rq1_16x4, rq3_16x4;
66     int16x4_t x0_16x4, x1_16x4, x2_16x4, x3_16x4;
67     int16x4_t xx0_16x4, xx1_16x4, xx2_16x4, xx3_16x4;
68     int16x4x2_t xx0_16x4x2, xx1_16x4x2;
69     int32x2x2_t x0_32x2x2, x1_32x2x2;
70     int16x4_t weigh0_16x4, weigh1_16x4, weigh2_16x4, weigh3_16x4;
71 
72     uint8x8_t pred0_in, pred1_in, pred2_in, pred3_in;
73     int16x8_t pred0, pred1, pred2, pred3;
74     int16x8_t resd01_in, resd23_in;
75     int16x8_t pred01_in, pred23_in;
76     uint8x8_t pred01_un, pred23_un;
77 
78     int16x8_t pos_255_16x8 = vdupq_n_s16(((WORD16) UINT8_MAX));
79     int16x8_t neg_255_16x8 = vdupq_n_s16(-((WORD16) UINT8_MAX));
80     int32x4_t qp_div_6_32x4 = vdupq_n_s32(u4_qp_div_6);
81 
82     WORD16 rnd_factor = (u4_qp_div_6 < 4) ? 1 << (3 - u4_qp_div_6) : 0;
83     int32x4_t rnd_fact = vdupq_n_s32(rnd_factor);
84 
85     UNUSED(ps_res);
86     UNUSED(ps_res_pred);
87     UNUSED(u1_res_accumulate);
88 
89     src_16x4x2 = vld4_s16(pi2_src);
90     iscal_16x4x2 = vld4_s16((const int16_t *) pu2_iscal_mat);
91     weigh_16x4x2 = vld4_s16((const int16_t *) pu2_weigh_mat);
92 
93     weigh0_16x4 = vmul_s16(weigh_16x4x2.val[0], iscal_16x4x2.val[0]);
94     weigh1_16x4 = vmul_s16(weigh_16x4x2.val[1], iscal_16x4x2.val[1]);
95     weigh2_16x4 = vmul_s16(weigh_16x4x2.val[2], iscal_16x4x2.val[2]);
96     weigh3_16x4 = vmul_s16(weigh_16x4x2.val[3], iscal_16x4x2.val[3]);
97 
98     q0_32x4 = vmull_s16(weigh0_16x4, src_16x4x2.val[0]);
99     q1_32x4 = vmull_s16(weigh1_16x4, src_16x4x2.val[1]);
100     q2_32x4 = vmull_s16(weigh2_16x4, src_16x4x2.val[2]);
101     q3_32x4 = vmull_s16(weigh3_16x4, src_16x4x2.val[3]);
102 
103     q0_32x4 = vaddq_s32(q0_32x4, rnd_fact);
104     q1_32x4 = vaddq_s32(q1_32x4, rnd_fact);
105     q2_32x4 = vaddq_s32(q2_32x4, rnd_fact);
106     q3_32x4 = vaddq_s32(q3_32x4, rnd_fact);
107 
108     q0_32x4 = vshlq_s32(q0_32x4, qp_div_6_32x4);
109     q1_32x4 = vshlq_s32(q1_32x4, qp_div_6_32x4);
110     q2_32x4 = vshlq_s32(q2_32x4, qp_div_6_32x4);
111     q3_32x4 = vshlq_s32(q3_32x4, qp_div_6_32x4);
112 
113     q0_16x4 = vqshrn_n_s32(q0_32x4, 4);
114     q1_16x4 = vqshrn_n_s32(q1_32x4, 4);
115     q2_16x4 = vqshrn_n_s32(q2_32x4, 4);
116     q3_16x4 = vqshrn_n_s32(q3_32x4, 4);
117 
118     if(i4_iq_start_idx == 1)
119     {
120         q0_16x4 = vset_lane_s16(pi2_dc_src[0], q0_16x4, 0);
121     }
122 
123     rq1_16x4 = vshr_n_s16(q1_16x4, 1);
124     rq3_16x4 = vshr_n_s16(q3_16x4, 1);
125 
126     x0_16x4 = vadd_s16(q0_16x4, q2_16x4);
127     x1_16x4 = vsub_s16(q0_16x4, q2_16x4);
128     x2_16x4 = vsub_s16(rq1_16x4, q3_16x4);
129     x3_16x4 = vadd_s16(q1_16x4, rq3_16x4);
130 
131     xx0_16x4 = vadd_s16(x0_16x4, x3_16x4);
132     xx1_16x4 = vadd_s16(x1_16x4, x2_16x4);
133     xx2_16x4 = vsub_s16(x1_16x4, x2_16x4);
134     xx3_16x4 = vsub_s16(x0_16x4, x3_16x4);
135 
136     /* row 0 to row 3 */
137     xx0_16x4x2 = vtrn_s16(xx0_16x4, xx1_16x4);
138     xx1_16x4x2 = vtrn_s16(xx2_16x4, xx3_16x4);
139     x0_32x2x2 =
140         vzip_s32(vreinterpret_s32_s16(xx0_16x4x2.val[0]), vreinterpret_s32_s16(xx1_16x4x2.val[0]));
141     x1_32x2x2 =
142         vzip_s32(vreinterpret_s32_s16(xx0_16x4x2.val[1]), vreinterpret_s32_s16(xx1_16x4x2.val[1]));
143 
144     x0_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[0]);
145     x1_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[0]);
146     x2_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[1]);
147     x3_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[1]);
148 
149     /* Store Horz transform output into temp */
150     vst1_s16(pi2_tmp, x0_16x4);
151     vst1_s16(pi2_tmp + 4, x1_16x4);
152     vst1_s16(pi2_tmp + 8, x2_16x4);
153     vst1_s16(pi2_tmp + 12, x3_16x4);
154 
155     /* vertical inverse transform */
156     rq1_16x4 = vshr_n_s16(x1_16x4, 1);
157     rq3_16x4 = vshr_n_s16(x3_16x4, 1);
158 
159     xx0_16x4 = vadd_s16(x0_16x4, x2_16x4);
160     xx1_16x4 = vsub_s16(x0_16x4, x2_16x4);
161     xx2_16x4 = vsub_s16(rq1_16x4, x3_16x4);
162     xx3_16x4 = vadd_s16(x1_16x4, rq3_16x4);
163 
164     x0_16x4 = vadd_s16(xx0_16x4, xx3_16x4);
165     x1_16x4 = vadd_s16(xx1_16x4, xx2_16x4);
166     x2_16x4 = vsub_s16(xx1_16x4, xx2_16x4);
167     x3_16x4 = vsub_s16(xx0_16x4, xx3_16x4);
168 
169     x0_16x4 = vrshr_n_s16(x0_16x4, 6);
170     x1_16x4 = vrshr_n_s16(x1_16x4, 6);
171     x2_16x4 = vrshr_n_s16(x2_16x4, 6);
172     x3_16x4 = vrshr_n_s16(x3_16x4, 6);
173 
174     resd01_in = vcombine_s16(x0_16x4, x1_16x4);
175     resd23_in = vcombine_s16(x2_16x4, x3_16x4);
176 
177     /* Saturate all values < -255 to -255 and retain the rest as it is */
178     resd01_in = vmaxq_s16(resd01_in, neg_255_16x8);
179     resd23_in = vmaxq_s16(resd23_in, neg_255_16x8);
180 
181     /* Saturate all values > 255 to 255 and retain the rest as it is */
182     resd01_in = vminq_s16(resd01_in, pos_255_16x8);
183     resd23_in = vminq_s16(resd23_in, pos_255_16x8);
184 
185     /* Load pred */
186     pred0_in = vld1_u8((uint8_t *) pu1_pred);
187     pred1_in = vld1_u8((uint8_t *) pu1_pred + (i4_pred_stride));
188     pred2_in = vld1_u8((uint8_t *) pu1_pred + (i4_pred_stride << 1));
189     pred3_in = vld1_u8((uint8_t *) pu1_pred + (i4_pred_stride * 3));
190 
191     pred0 = vreinterpretq_s16_u16(vmovl_u8(pred0_in));
192     pred1 = vreinterpretq_s16_u16(vmovl_u8(pred1_in));
193     pred2 = vreinterpretq_s16_u16(vmovl_u8(pred2_in));
194     pred3 = vreinterpretq_s16_u16(vmovl_u8(pred3_in));
195 
196     pred01_in = vcombine_s16(vget_low_s16(pred0), vget_low_s16(pred1));
197     pred23_in = vcombine_s16(vget_low_s16(pred2), vget_low_s16(pred3));
198 
199     /* Out pixel = pred + res */
200     pred01_in = vaddq_s16(pred01_in, resd01_in);
201     pred23_in = vaddq_s16(pred23_in, resd23_in);
202 
203     /* Convert to 8 bit unsigned with saturation */
204     pred01_un = vqmovun_s16(pred01_in);
205     pred23_un = vqmovun_s16(pred23_in);
206 
207     vst1_lane_u32((uint32_t *) (pu1_out), vreinterpret_u32_u8(pred01_un), 0);
208     vst1_lane_u32((uint32_t *) (pu1_out + i4_out_stride), vreinterpret_u32_u8(pred01_un), 1);
209     vst1_lane_u32((uint32_t *) (pu1_out + (i4_out_stride << 1)), vreinterpret_u32_u8(pred23_un), 0);
210     vst1_lane_u32((uint32_t *) (pu1_out + ((i4_out_stride << 1) + i4_out_stride)),
211                   vreinterpret_u32_u8(pred23_un), 1);
212 }
213 
214 void isvc_iquant_itrans_recon_4x4_with_res_output_neon(
215     buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_res_pred,
216     buffer_container_t *ps_res, buffer_container_t *ps_rec,
217     iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants, WORD16 *pi2_tmp, WORD16 *pi2_dc_src,
218     WORD32 i4_iq_start_idx, UWORD8 u1_res_accumulate)
219 {
220     WORD16 *pi2_src = (WORD16 *) ps_src->pv_data;
221     WORD16 *pi2_res = (WORD16 *) ps_res->pv_data;
222     UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
223     UWORD8 *pu1_out = (UWORD8 *) ps_rec->pv_data;
224     WORD32 i4_res_stride = ps_res->i4_data_stride;
225     WORD32 i4_pred_stride = ps_pred->i4_data_stride;
226     WORD32 i4_out_stride = ps_rec->i4_data_stride;
227     const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat;
228     const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat;
229     UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6;
230 
231     int16x4x4_t src_16x4x2;
232     int16x4x4_t iscal_16x4x2;
233     int16x4x4_t weigh_16x4x2;
234 
235     int16x4_t q0_16x4, q1_16x4, q2_16x4, q3_16x4;
236     int32x4_t q0_32x4, q1_32x4, q2_32x4, q3_32x4;
237     int16x4_t rq1_16x4, rq3_16x4;
238     int16x4_t x0_16x4, x1_16x4, x2_16x4, x3_16x4;
239     int16x4_t xx0_16x4, xx1_16x4, xx2_16x4, xx3_16x4;
240     int16x4x2_t xx0_16x4x2, xx1_16x4x2;
241     int32x2x2_t x0_32x2x2, x1_32x2x2;
242     int16x4_t weigh0_16x4, weigh1_16x4, weigh2_16x4, weigh3_16x4;
243 
244     uint8x8_t pred0_in, pred1_in, pred2_in, pred3_in;
245     int16x8_t pred0, pred1, pred2, pred3;
246     int16x8_t resd01_in, resd23_in;
247     int16x8_t pred01_in, pred23_in;
248     uint8x8_t pred01_un, pred23_un;
249 
250     int16x4_t pos_255_16x4 = vdup_n_s16(((WORD16) UINT8_MAX));
251     int16x4_t neg_255_16x4 = vdup_n_s16(-((WORD16) UINT8_MAX));
252     int32x4_t qp_div_6_32x4 = vdupq_n_s32(u4_qp_div_6);
253 
254     WORD16 rnd_factor = (u4_qp_div_6 < 4) ? 1 << (3 - u4_qp_div_6) : 0;
255     int32x4_t rnd_fact = vdupq_n_s32(rnd_factor);
256 
257     UNUSED(ps_res_pred);
258     UNUSED(u1_res_accumulate);
259 
260     src_16x4x2 = vld4_s16(pi2_src);
261     iscal_16x4x2 = vld4_s16((const int16_t *) pu2_iscal_mat);
262     weigh_16x4x2 = vld4_s16((const int16_t *) pu2_weigh_mat);
263 
264     weigh0_16x4 = vmul_s16(weigh_16x4x2.val[0], iscal_16x4x2.val[0]);
265     weigh1_16x4 = vmul_s16(weigh_16x4x2.val[1], iscal_16x4x2.val[1]);
266     weigh2_16x4 = vmul_s16(weigh_16x4x2.val[2], iscal_16x4x2.val[2]);
267     weigh3_16x4 = vmul_s16(weigh_16x4x2.val[3], iscal_16x4x2.val[3]);
268 
269     q0_32x4 = vmull_s16(weigh0_16x4, src_16x4x2.val[0]);
270     q1_32x4 = vmull_s16(weigh1_16x4, src_16x4x2.val[1]);
271     q2_32x4 = vmull_s16(weigh2_16x4, src_16x4x2.val[2]);
272     q3_32x4 = vmull_s16(weigh3_16x4, src_16x4x2.val[3]);
273 
274     q0_32x4 = vaddq_s32(q0_32x4, rnd_fact);
275     q1_32x4 = vaddq_s32(q1_32x4, rnd_fact);
276     q2_32x4 = vaddq_s32(q2_32x4, rnd_fact);
277     q3_32x4 = vaddq_s32(q3_32x4, rnd_fact);
278 
279     q0_32x4 = vshlq_s32(q0_32x4, qp_div_6_32x4);
280     q1_32x4 = vshlq_s32(q1_32x4, qp_div_6_32x4);
281     q2_32x4 = vshlq_s32(q2_32x4, qp_div_6_32x4);
282     q3_32x4 = vshlq_s32(q3_32x4, qp_div_6_32x4);
283 
284     q0_16x4 = vqshrn_n_s32(q0_32x4, 4);
285     q1_16x4 = vqshrn_n_s32(q1_32x4, 4);
286     q2_16x4 = vqshrn_n_s32(q2_32x4, 4);
287     q3_16x4 = vqshrn_n_s32(q3_32x4, 4);
288 
289     if(i4_iq_start_idx == 1)
290     {
291         q0_16x4 = vset_lane_s16(pi2_dc_src[0], q0_16x4, 0);
292     }
293 
294     rq1_16x4 = vshr_n_s16(q1_16x4, 1);
295     rq3_16x4 = vshr_n_s16(q3_16x4, 1);
296 
297     x0_16x4 = vadd_s16(q0_16x4, q2_16x4);
298     x1_16x4 = vsub_s16(q0_16x4, q2_16x4);
299     x2_16x4 = vsub_s16(rq1_16x4, q3_16x4);
300     x3_16x4 = vadd_s16(q1_16x4, rq3_16x4);
301 
302     xx0_16x4 = vadd_s16(x0_16x4, x3_16x4);
303     xx1_16x4 = vadd_s16(x1_16x4, x2_16x4);
304     xx2_16x4 = vsub_s16(x1_16x4, x2_16x4);
305     xx3_16x4 = vsub_s16(x0_16x4, x3_16x4);
306 
307     /* row 0 to row 3 */
308     xx0_16x4x2 = vtrn_s16(xx0_16x4, xx1_16x4);
309     xx1_16x4x2 = vtrn_s16(xx2_16x4, xx3_16x4);
310     x0_32x2x2 =
311         vzip_s32(vreinterpret_s32_s16(xx0_16x4x2.val[0]), vreinterpret_s32_s16(xx1_16x4x2.val[0]));
312     x1_32x2x2 =
313         vzip_s32(vreinterpret_s32_s16(xx0_16x4x2.val[1]), vreinterpret_s32_s16(xx1_16x4x2.val[1]));
314 
315     x0_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[0]);
316     x1_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[0]);
317     x2_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[1]);
318     x3_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[1]);
319 
320     /* Store Horz transform output into temp */
321     vst1_s16(pi2_tmp, x0_16x4);
322     vst1_s16(pi2_tmp + 4, x1_16x4);
323     vst1_s16(pi2_tmp + 8, x2_16x4);
324     vst1_s16(pi2_tmp + 12, x3_16x4);
325 
326     /* vertical inverse transform */
327     rq1_16x4 = vshr_n_s16(x1_16x4, 1);
328     rq3_16x4 = vshr_n_s16(x3_16x4, 1);
329 
330     xx0_16x4 = vadd_s16(x0_16x4, x2_16x4);
331     xx1_16x4 = vsub_s16(x0_16x4, x2_16x4);
332     xx2_16x4 = vsub_s16(rq1_16x4, x3_16x4);
333     xx3_16x4 = vadd_s16(x1_16x4, rq3_16x4);
334 
335     x0_16x4 = vadd_s16(xx0_16x4, xx3_16x4);
336     x1_16x4 = vadd_s16(xx1_16x4, xx2_16x4);
337     x2_16x4 = vsub_s16(xx1_16x4, xx2_16x4);
338     x3_16x4 = vsub_s16(xx0_16x4, xx3_16x4);
339 
340     x0_16x4 = vrshr_n_s16(x0_16x4, 6);
341     x1_16x4 = vrshr_n_s16(x1_16x4, 6);
342     x2_16x4 = vrshr_n_s16(x2_16x4, 6);
343     x3_16x4 = vrshr_n_s16(x3_16x4, 6);
344 
345     /* Saturate all values < -255 to -255 and retain the rest as it is */
346     x0_16x4 = vmax_s16(x0_16x4, neg_255_16x4);
347     x1_16x4 = vmax_s16(x1_16x4, neg_255_16x4);
348     x2_16x4 = vmax_s16(x2_16x4, neg_255_16x4);
349     x3_16x4 = vmax_s16(x3_16x4, neg_255_16x4);
350 
351     /* Saturate all values > 255 to 255 and retain the rest as it is */
352     x0_16x4 = vmin_s16(x0_16x4, pos_255_16x4);
353     x1_16x4 = vmin_s16(x1_16x4, pos_255_16x4);
354     x2_16x4 = vmin_s16(x2_16x4, pos_255_16x4);
355     x3_16x4 = vmin_s16(x3_16x4, pos_255_16x4);
356 
357     vst1_s16(pi2_res, x0_16x4);
358     vst1_s16(pi2_res + i4_res_stride, x1_16x4);
359     vst1_s16(pi2_res + (i4_res_stride << 1), x2_16x4);
360     vst1_s16(pi2_res + (i4_res_stride << 1) + i4_res_stride, x3_16x4);
361 
362     resd01_in = vcombine_s16(x0_16x4, x1_16x4);
363     resd23_in = vcombine_s16(x2_16x4, x3_16x4);
364 
365     /* Load pred */
366     pred0_in = vld1_u8((uint8_t *) pu1_pred);
367     pred1_in = vld1_u8((uint8_t *) pu1_pred + (i4_pred_stride));
368     pred2_in = vld1_u8((uint8_t *) pu1_pred + (i4_pred_stride << 1));
369     pred3_in = vld1_u8((uint8_t *) pu1_pred + (i4_pred_stride * 3));
370 
371     pred0 = vreinterpretq_s16_u16(vmovl_u8(pred0_in));
372     pred1 = vreinterpretq_s16_u16(vmovl_u8(pred1_in));
373     pred2 = vreinterpretq_s16_u16(vmovl_u8(pred2_in));
374     pred3 = vreinterpretq_s16_u16(vmovl_u8(pred3_in));
375 
376     pred01_in = vcombine_s16(vget_low_s16(pred0), vget_low_s16(pred1));
377     pred23_in = vcombine_s16(vget_low_s16(pred2), vget_low_s16(pred3));
378 
379     /* Out pixel = pred + res */
380     pred01_in = vaddq_s16(pred01_in, resd01_in);
381     pred23_in = vaddq_s16(pred23_in, resd23_in);
382 
383     /* Convert to 8 bit unsigned with saturation */
384     pred01_un = vqmovun_s16(pred01_in);
385     pred23_un = vqmovun_s16(pred23_in);
386 
387     vst1_lane_u32((uint32_t *) (pu1_out), vreinterpret_u32_u8(pred01_un), 0);
388     vst1_lane_u32((uint32_t *) (pu1_out + i4_out_stride), vreinterpret_u32_u8(pred01_un), 1);
389     vst1_lane_u32((uint32_t *) (pu1_out + (i4_out_stride << 1)), vreinterpret_u32_u8(pred23_un), 0);
390     vst1_lane_u32((uint32_t *) (pu1_out + ((i4_out_stride << 1) + i4_out_stride)),
391                   vreinterpret_u32_u8(pred23_un), 1);
392 }
393 
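/*
 * The _with_res_accumulate variants below additionally add the residual
 * prediction in ps_res_pred to the freshly computed residual, clamp the sum
 * to [-255, 255], write the clamped residual back to ps_res, and only then
 * add it to the prediction for reconstruction (presumably for SVC
 * inter-layer residual prediction).
 */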
394 void isvc_iquant_itrans_recon_4x4_with_res_accumulate_neon(
395     buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_res_pred,
396     buffer_container_t *ps_res, buffer_container_t *ps_rec,
397     iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants, WORD16 *pi2_tmp, WORD16 *pi2_dc_src,
398     WORD32 i4_iq_start_idx, UWORD8 u1_res_accumulate)
399 {
400     WORD16 *pi2_src = (WORD16 *) ps_src->pv_data;
401     WORD16 *pi2_res = (WORD16 *) ps_res->pv_data;
402     WORD16 *pi2_res_pred = (WORD16 *) ps_res_pred->pv_data;
403     UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
404     UWORD8 *pu1_out = (UWORD8 *) ps_rec->pv_data;
405     WORD32 i4_res_stride = ps_res->i4_data_stride;
406     WORD32 i4_res_pred_stride = ps_res_pred->i4_data_stride;
407     WORD32 i4_pred_stride = ps_pred->i4_data_stride;
408     WORD32 i4_out_stride = ps_rec->i4_data_stride;
409     const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat;
410     const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat;
411     UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6;
412 
413     int16x4x4_t src_16x4x2;
414     int16x4x4_t iscal_16x4x2;
415     int16x4x4_t weigh_16x4x2;
416 
417     int16x4_t q0_16x4, q1_16x4, q2_16x4, q3_16x4;
418     int32x4_t q0_32x4, q1_32x4, q2_32x4, q3_32x4;
419     int16x4_t rq1_16x4, rq3_16x4;
420     int16x4_t x0_16x4, x1_16x4, x2_16x4, x3_16x4;
421     int16x4_t xx0_16x4, xx1_16x4, xx2_16x4, xx3_16x4;
422     int16x4x2_t xx0_16x4x2, xx1_16x4x2;
423     int32x2x2_t x0_32x2x2, x1_32x2x2;
424     int16x4_t weigh0_16x4, weigh1_16x4, weigh2_16x4, weigh3_16x4;
425 
426     uint8x8_t pred0_in, pred1_in, pred2_in, pred3_in;
427     int16x8_t pred0, pred1, pred2, pred3;
428     int16x4_t resd0_in, resd1_in, resd2_in, resd3_in;
429     int16x8_t resd01_in, resd23_in;
430     int16x8_t pred01_in, pred23_in;
431     uint8x8_t pred01_un, pred23_un;
432 
433     int32x4_t qp_div_6_32x4 = vdupq_n_s32(u4_qp_div_6);
434 
435     WORD16 rnd_factor = (u4_qp_div_6 < 4) ? 1 << (3 - u4_qp_div_6) : 0;
436     int32x4_t rnd_fact = vdupq_n_s32(rnd_factor);
437     int16x4_t pos_255 = vdup_n_s16(((WORD16) UINT8_MAX));
438     int16x4_t neg_255 = vdup_n_s16(-((WORD16) UINT8_MAX));
439 
440     UNUSED(u1_res_accumulate);
441 
442     src_16x4x2 = vld4_s16(pi2_src);
443     iscal_16x4x2 = vld4_s16((const int16_t *) pu2_iscal_mat);
444     weigh_16x4x2 = vld4_s16((const int16_t *) pu2_weigh_mat);
445 
446     weigh0_16x4 = vmul_s16(weigh_16x4x2.val[0], iscal_16x4x2.val[0]);
447     weigh1_16x4 = vmul_s16(weigh_16x4x2.val[1], iscal_16x4x2.val[1]);
448     weigh2_16x4 = vmul_s16(weigh_16x4x2.val[2], iscal_16x4x2.val[2]);
449     weigh3_16x4 = vmul_s16(weigh_16x4x2.val[3], iscal_16x4x2.val[3]);
450 
451     q0_32x4 = vmull_s16(weigh0_16x4, src_16x4x2.val[0]);
452     q1_32x4 = vmull_s16(weigh1_16x4, src_16x4x2.val[1]);
453     q2_32x4 = vmull_s16(weigh2_16x4, src_16x4x2.val[2]);
454     q3_32x4 = vmull_s16(weigh3_16x4, src_16x4x2.val[3]);
455 
456     q0_32x4 = vaddq_s32(q0_32x4, rnd_fact);
457     q1_32x4 = vaddq_s32(q1_32x4, rnd_fact);
458     q2_32x4 = vaddq_s32(q2_32x4, rnd_fact);
459     q3_32x4 = vaddq_s32(q3_32x4, rnd_fact);
460 
461     q0_32x4 = vshlq_s32(q0_32x4, qp_div_6_32x4);
462     q1_32x4 = vshlq_s32(q1_32x4, qp_div_6_32x4);
463     q2_32x4 = vshlq_s32(q2_32x4, qp_div_6_32x4);
464     q3_32x4 = vshlq_s32(q3_32x4, qp_div_6_32x4);
465 
466     q0_16x4 = vqshrn_n_s32(q0_32x4, 4);
467     q1_16x4 = vqshrn_n_s32(q1_32x4, 4);
468     q2_16x4 = vqshrn_n_s32(q2_32x4, 4);
469     q3_16x4 = vqshrn_n_s32(q3_32x4, 4);
470 
471     if(i4_iq_start_idx == 1)
472     {
473         q0_16x4 = vset_lane_s16(pi2_dc_src[0], q0_16x4, 0);
474     }
475 
476     rq1_16x4 = vshr_n_s16(q1_16x4, 1);
477     rq3_16x4 = vshr_n_s16(q3_16x4, 1);
478 
479     x0_16x4 = vadd_s16(q0_16x4, q2_16x4);
480     x1_16x4 = vsub_s16(q0_16x4, q2_16x4);
481     x2_16x4 = vsub_s16(rq1_16x4, q3_16x4);
482     x3_16x4 = vadd_s16(q1_16x4, rq3_16x4);
483 
484     xx0_16x4 = vadd_s16(x0_16x4, x3_16x4);
485     xx1_16x4 = vadd_s16(x1_16x4, x2_16x4);
486     xx2_16x4 = vsub_s16(x1_16x4, x2_16x4);
487     xx3_16x4 = vsub_s16(x0_16x4, x3_16x4);
488 
489     /* row 0 to row 3 */
490     xx0_16x4x2 = vtrn_s16(xx0_16x4, xx1_16x4);
491     xx1_16x4x2 = vtrn_s16(xx2_16x4, xx3_16x4);
492     x0_32x2x2 =
493         vzip_s32(vreinterpret_s32_s16(xx0_16x4x2.val[0]), vreinterpret_s32_s16(xx1_16x4x2.val[0]));
494     x1_32x2x2 =
495         vzip_s32(vreinterpret_s32_s16(xx0_16x4x2.val[1]), vreinterpret_s32_s16(xx1_16x4x2.val[1]));
496 
497     x0_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[0]);
498     x1_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[0]);
499     x2_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[1]);
500     x3_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[1]);
501 
502     /* Store Horz transform output into temp */
503     vst1_s16(pi2_tmp, x0_16x4);
504     vst1_s16(pi2_tmp + 4, x1_16x4);
505     vst1_s16(pi2_tmp + 8, x2_16x4);
506     vst1_s16(pi2_tmp + 12, x3_16x4);
507 
508     /* vertical inverse transform */
509     rq1_16x4 = vshr_n_s16(x1_16x4, 1);
510     rq3_16x4 = vshr_n_s16(x3_16x4, 1);
511 
512     xx0_16x4 = vadd_s16(x0_16x4, x2_16x4);
513     xx1_16x4 = vsub_s16(x0_16x4, x2_16x4);
514     xx2_16x4 = vsub_s16(rq1_16x4, x3_16x4);
515     xx3_16x4 = vadd_s16(x1_16x4, rq3_16x4);
516 
517     x0_16x4 = vadd_s16(xx0_16x4, xx3_16x4);
518     x1_16x4 = vadd_s16(xx1_16x4, xx2_16x4);
519     x2_16x4 = vsub_s16(xx1_16x4, xx2_16x4);
520     x3_16x4 = vsub_s16(xx0_16x4, xx3_16x4);
521 
522     x0_16x4 = vrshr_n_s16(x0_16x4, 6);
523     x1_16x4 = vrshr_n_s16(x1_16x4, 6);
524     x2_16x4 = vrshr_n_s16(x2_16x4, 6);
525     x3_16x4 = vrshr_n_s16(x3_16x4, 6);
526 
527     /* Accumulating Res */
528 
529     /* Load Res pred */
530     resd0_in = vld1_s16((int16_t *) pi2_res_pred);
531     resd1_in = vld1_s16((int16_t *) pi2_res_pred + i4_res_pred_stride);
532     resd2_in = vld1_s16((int16_t *) pi2_res_pred + (i4_res_pred_stride * 2));
533     resd3_in = vld1_s16((int16_t *) pi2_res_pred + (i4_res_pred_stride * 3));
534 
535     /* Add res pred to res obtained */
536     resd0_in = vadd_s16(resd0_in, x0_16x4);
537     resd1_in = vadd_s16(resd1_in, x1_16x4);
538     resd2_in = vadd_s16(resd2_in, x2_16x4);
539     resd3_in = vadd_s16(resd3_in, x3_16x4);
540 
541     /* Saturate all values < -255 to -255 and retain the rest as it is */
542     resd0_in = vmax_s16(resd0_in, neg_255);
543     resd1_in = vmax_s16(resd1_in, neg_255);
544     resd2_in = vmax_s16(resd2_in, neg_255);
545     resd3_in = vmax_s16(resd3_in, neg_255);
546 
547     /* Saturate all values > 255 to 255 and retain the rest as it is */
548     resd0_in = vmin_s16(resd0_in, pos_255);
549     resd1_in = vmin_s16(resd1_in, pos_255);
550     resd2_in = vmin_s16(resd2_in, pos_255);
551     resd3_in = vmin_s16(resd3_in, pos_255);
552 
553     vst1_s16(pi2_res, resd0_in);
554     vst1_s16(pi2_res + i4_res_stride, resd1_in);
555     vst1_s16(pi2_res + (i4_res_stride << 1), resd2_in);
556     vst1_s16(pi2_res + (i4_res_stride << 1) + i4_res_stride, resd3_in);
557 
558     resd01_in = vcombine_s16(resd0_in, resd1_in);
559     resd23_in = vcombine_s16(resd2_in, resd3_in);
560 
561     /* Load pred */
562     pred0_in = vld1_u8((uint8_t *) pu1_pred);
563     pred1_in = vld1_u8((uint8_t *) pu1_pred + (i4_pred_stride));
564     pred2_in = vld1_u8((uint8_t *) pu1_pred + (i4_pred_stride << 1));
565     pred3_in = vld1_u8((uint8_t *) pu1_pred + (i4_pred_stride * 3));
566 
567     pred0 = vreinterpretq_s16_u16(vmovl_u8(pred0_in));
568     pred1 = vreinterpretq_s16_u16(vmovl_u8(pred1_in));
569     pred2 = vreinterpretq_s16_u16(vmovl_u8(pred2_in));
570     pred3 = vreinterpretq_s16_u16(vmovl_u8(pred3_in));
571 
572     pred01_in = vcombine_s16(vget_low_s16(pred0), vget_low_s16(pred1));
573     pred23_in = vcombine_s16(vget_low_s16(pred2), vget_low_s16(pred3));
574 
575     /* Out pixel = pred + res */
576     pred01_in = vaddq_s16(pred01_in, resd01_in);
577     pred23_in = vaddq_s16(pred23_in, resd23_in);
578 
579     /* Convert to 8 bit unsigned with saturation */
580     pred01_un = vqmovun_s16(pred01_in);
581     pred23_un = vqmovun_s16(pred23_in);
582 
583     vst1_lane_u32((uint32_t *) (pu1_out), vreinterpret_u32_u8(pred01_un), 0);
584     vst1_lane_u32((uint32_t *) (pu1_out + i4_out_stride), vreinterpret_u32_u8(pred01_un), 1);
585     vst1_lane_u32((uint32_t *) (pu1_out + (i4_out_stride << 1)), vreinterpret_u32_u8(pred23_un), 0);
586     vst1_lane_u32((uint32_t *) (pu1_out + ((i4_out_stride << 1) + i4_out_stride)),
587                   vreinterpret_u32_u8(pred23_un), 1);
588 }
589 
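/*
 * The chroma variants below appear to assume interleaved (e.g. semi-planar
 * Cb/Cr) residual and reconstruction buffers: chroma_mask_8x8 and
 * chroma_mask_16x8 select every other lane so that only the plane being
 * processed is updated, while the other plane's samples already present in
 * pi2_res / pu1_out are preserved via vbsl.
 */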
590 void isvc_iquant_itrans_recon_chroma_4x4_with_res_output_neon(
591     buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_res_pred,
592     buffer_container_t *ps_res, buffer_container_t *ps_rec,
593     iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants, WORD16 *pi2_tmp, WORD16 *pi2_dc_src,
594     WORD32 i4_iq_start_idx, UWORD8 u1_res_accumulate)
595 {
596     WORD16 *pi2_src = (WORD16 *) ps_src->pv_data;
597     WORD16 *pi2_res = (WORD16 *) ps_res->pv_data;
598     UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
599     UWORD8 *pu1_out = (UWORD8 *) ps_rec->pv_data;
600     WORD32 i4_res_stride = ps_res->i4_data_stride;
601     WORD32 i4_pred_stride = ps_pred->i4_data_stride;
602     WORD32 i4_out_stride = ps_rec->i4_data_stride;
603     const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat;
604     const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat;
605     UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6;
606 
607     WORD16 i2_rnd_factor = (u4_qp_div_6 < 4) ? 1 << (3 - u4_qp_div_6) : 0;
608 
609     int16x4x4_t src_16x4x2;
610     int16x4x4_t iscal_16x4x2;
611     int16x4x4_t weigh_16x4x2;
612 
613     int16x4_t q0_16x4, q1_16x4, q2_16x4, q3_16x4;
614     int32x4_t q0_32x4, q1_32x4, q2_32x4, q3_32x4;
615     int16x4_t rq1_16x4, rq3_16x4;
616     int16x4_t x0_16x4, x1_16x4, x2_16x4, x3_16x4;
617     int16x8_t x0_16x8, x1_16x8, x2_16x8, x3_16x8;
618     int16x4_t xx0_16x4, xx1_16x4, xx2_16x4, xx3_16x4;
619     int16x4x2_t xx0_16x4x2, xx1_16x4x2;
620     int32x2x2_t x0_32x2x2, x1_32x2x2;
621     int16x4_t weigh0_16x4, weigh1_16x4, weigh2_16x4, weigh3_16x4;
622 
623     uint8x8_t pred0_in, pred1_in, pred2_in, pred3_in;
624     int16x8_t pred0, pred1, pred2, pred3;
625     int16x8_t rec0, rec1, rec2, rec3;
626     uint8x8_t rec0_un, rec1_un, rec2_un, rec3_un;
627     uint8x8_t out0, out1, out2, out3;
628     int16x8_t resout0, resout1, resout2, resout3;
629 
630     uint8x8_t chroma_mask_8x8 = vreinterpret_u8_u16(vdup_n_u16(0x00ff));
631     uint16x8_t chroma_mask_16x8 = {0xffff, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000};
632     int32x4_t qp_div_6_32x4 = vdupq_n_s32(u4_qp_div_6);
633     int32x4_t rnd_fact = vdupq_n_s32(i2_rnd_factor);
634     int16x4_t pos_255_16x4 = vdup_n_s16(((WORD16) UINT8_MAX));
635     int16x4_t neg_255_16x4 = vdup_n_s16(-((WORD16) UINT8_MAX));
636 
637     UNUSED(i4_iq_start_idx);
638     UNUSED(ps_res_pred);
639     UNUSED(u1_res_accumulate);
640 
641     src_16x4x2 = vld4_s16(pi2_src);
642     iscal_16x4x2 = vld4_s16((const int16_t *) pu2_iscal_mat);
643     weigh_16x4x2 = vld4_s16((const int16_t *) pu2_weigh_mat);
644 
645     weigh0_16x4 = vmul_s16(weigh_16x4x2.val[0], iscal_16x4x2.val[0]);
646     weigh1_16x4 = vmul_s16(weigh_16x4x2.val[1], iscal_16x4x2.val[1]);
647     weigh2_16x4 = vmul_s16(weigh_16x4x2.val[2], iscal_16x4x2.val[2]);
648     weigh3_16x4 = vmul_s16(weigh_16x4x2.val[3], iscal_16x4x2.val[3]);
649 
650     q0_32x4 = vmull_s16(weigh0_16x4, src_16x4x2.val[0]);
651     q1_32x4 = vmull_s16(weigh1_16x4, src_16x4x2.val[1]);
652     q2_32x4 = vmull_s16(weigh2_16x4, src_16x4x2.val[2]);
653     q3_32x4 = vmull_s16(weigh3_16x4, src_16x4x2.val[3]);
654 
655     q0_32x4 = vaddq_s32(q0_32x4, rnd_fact);
656     q1_32x4 = vaddq_s32(q1_32x4, rnd_fact);
657     q2_32x4 = vaddq_s32(q2_32x4, rnd_fact);
658     q3_32x4 = vaddq_s32(q3_32x4, rnd_fact);
659 
660     q0_32x4 = vshlq_s32(q0_32x4, qp_div_6_32x4);
661     q1_32x4 = vshlq_s32(q1_32x4, qp_div_6_32x4);
662     q2_32x4 = vshlq_s32(q2_32x4, qp_div_6_32x4);
663     q3_32x4 = vshlq_s32(q3_32x4, qp_div_6_32x4);
664 
665     q0_16x4 = vqshrn_n_s32(q0_32x4, 4);
666     q1_16x4 = vqshrn_n_s32(q1_32x4, 4);
667     q2_16x4 = vqshrn_n_s32(q2_32x4, 4);
668     q3_16x4 = vqshrn_n_s32(q3_32x4, 4);
669 
670     q0_16x4 = vset_lane_s16(pi2_dc_src[0], q0_16x4, 0);
671 
672     rq1_16x4 = vshr_n_s16(q1_16x4, 1);
673     rq3_16x4 = vshr_n_s16(q3_16x4, 1);
674 
675     x0_16x4 = vadd_s16(q0_16x4, q2_16x4);
676     x1_16x4 = vsub_s16(q0_16x4, q2_16x4);
677     x2_16x4 = vsub_s16(rq1_16x4, q3_16x4);
678     x3_16x4 = vadd_s16(q1_16x4, rq3_16x4);
679 
680     xx0_16x4 = vadd_s16(x0_16x4, x3_16x4);
681     xx1_16x4 = vadd_s16(x1_16x4, x2_16x4);
682     xx2_16x4 = vsub_s16(x1_16x4, x2_16x4);
683     xx3_16x4 = vsub_s16(x0_16x4, x3_16x4);
684 
685     /* row 0 to row 3 */
686     xx0_16x4x2 = vtrn_s16(xx0_16x4, xx1_16x4);
687     xx1_16x4x2 = vtrn_s16(xx2_16x4, xx3_16x4);
688     x0_32x2x2 =
689         vzip_s32(vreinterpret_s32_s16(xx0_16x4x2.val[0]), vreinterpret_s32_s16(xx1_16x4x2.val[0]));
690     x1_32x2x2 =
691         vzip_s32(vreinterpret_s32_s16(xx0_16x4x2.val[1]), vreinterpret_s32_s16(xx1_16x4x2.val[1]));
692 
693     x0_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[0]);
694     x1_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[0]);
695     x2_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[1]);
696     x3_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[1]);
697 
698     /* Store Horz transform output into temp */
699     vst1_s16(pi2_tmp, x0_16x4);
700     vst1_s16(pi2_tmp + 4, x1_16x4);
701     vst1_s16(pi2_tmp + 8, x2_16x4);
702     vst1_s16(pi2_tmp + 12, x3_16x4);
703 
704     /* vertical inverse transform */
705     rq1_16x4 = vshr_n_s16(x1_16x4, 1);
706     rq3_16x4 = vshr_n_s16(x3_16x4, 1);
707 
708     xx0_16x4 = vadd_s16(x0_16x4, x2_16x4);
709     xx1_16x4 = vsub_s16(x0_16x4, x2_16x4);
710     xx2_16x4 = vsub_s16(rq1_16x4, x3_16x4);
711     xx3_16x4 = vadd_s16(x1_16x4, rq3_16x4);
712 
713     x0_16x4 = vadd_s16(xx0_16x4, xx3_16x4);
714     x1_16x4 = vadd_s16(xx1_16x4, xx2_16x4);
715     x2_16x4 = vsub_s16(xx1_16x4, xx2_16x4);
716     x3_16x4 = vsub_s16(xx0_16x4, xx3_16x4);
717 
718     x0_16x4 = vrshr_n_s16(x0_16x4, 6);
719     x1_16x4 = vrshr_n_s16(x1_16x4, 6);
720     x2_16x4 = vrshr_n_s16(x2_16x4, 6);
721     x3_16x4 = vrshr_n_s16(x3_16x4, 6);
722 
723     /* Saturate all values < -255 to -255 and retain the rest as it is */
724     x0_16x4 = vmax_s16(x0_16x4, neg_255_16x4);
725     x1_16x4 = vmax_s16(x1_16x4, neg_255_16x4);
726     x2_16x4 = vmax_s16(x2_16x4, neg_255_16x4);
727     x3_16x4 = vmax_s16(x3_16x4, neg_255_16x4);
728 
729     /* Saturate all values > 255 to 255 and retain the rest as it is */
730     x0_16x4 = vmin_s16(x0_16x4, pos_255_16x4);
731     x1_16x4 = vmin_s16(x1_16x4, pos_255_16x4);
732     x2_16x4 = vmin_s16(x2_16x4, pos_255_16x4);
733     x3_16x4 = vmin_s16(x3_16x4, pos_255_16x4);
734 
735     resout0 = vld1q_s16(pi2_res);
736     resout1 = vld1q_s16(pi2_res + i4_res_stride);
737     resout2 = vld1q_s16(pi2_res + i4_res_stride * 2);
738     resout3 = vld1q_s16(pi2_res + i4_res_stride * 3);
739 
740     x0_16x8 = vreinterpretq_s16_s32(vmovl_s16(x0_16x4));
741     x1_16x8 = vreinterpretq_s16_s32(vmovl_s16(x1_16x4));
742     x2_16x8 = vreinterpretq_s16_s32(vmovl_s16(x2_16x4));
743     x3_16x8 = vreinterpretq_s16_s32(vmovl_s16(x3_16x4));
744 
745     /* Storing res in alternate positions */
746     resout0 = vbslq_s16(chroma_mask_16x8, x0_16x8, resout0);
747     resout1 = vbslq_s16(chroma_mask_16x8, x1_16x8, resout1);
748     resout2 = vbslq_s16(chroma_mask_16x8, x2_16x8, resout2);
749     resout3 = vbslq_s16(chroma_mask_16x8, x3_16x8, resout3);
750 
751     vst1q_s16(pi2_res, resout0);
752     vst1q_s16(pi2_res + i4_res_stride, resout1);
753     vst1q_s16(pi2_res + (i4_res_stride << 1), resout2);
754     vst1q_s16(pi2_res + (i4_res_stride << 1) + i4_res_stride, resout3);
755 
756     pred0_in = vld1_u8((uint8_t *) pu1_pred);
757     pred1_in = vld1_u8((uint8_t *) pu1_pred + (i4_pred_stride));
758     pred2_in = vld1_u8((uint8_t *) pu1_pred + (i4_pred_stride << 1));
759     pred3_in = vld1_u8((uint8_t *) pu1_pred + (i4_pred_stride * 3));
760 
761     pred0 = vreinterpretq_s16_u16(vmovl_u8(pred0_in));
762     pred1 = vreinterpretq_s16_u16(vmovl_u8(pred1_in));
763     pred2 = vreinterpretq_s16_u16(vmovl_u8(pred2_in));
764     pred3 = vreinterpretq_s16_u16(vmovl_u8(pred3_in));
765 
766     /* Out pixel = pred + res */
767     rec0 = vaddq_s16(pred0, x0_16x8);
768     rec1 = vaddq_s16(pred1, x1_16x8);
769     rec2 = vaddq_s16(pred2, x2_16x8);
770     rec3 = vaddq_s16(pred3, x3_16x8);
771 
772     out0 = vld1_u8(pu1_out);
773     out1 = vld1_u8(pu1_out + i4_out_stride);
774     out2 = vld1_u8(pu1_out + i4_out_stride * 2);
775     out3 = vld1_u8(pu1_out + i4_out_stride * 3);
776 
777     /* Convert to 8 bit unsigned with saturation */
778     rec0_un = vqmovun_s16(rec0);
779     rec1_un = vqmovun_s16(rec1);
780     rec2_un = vqmovun_s16(rec2);
781     rec3_un = vqmovun_s16(rec3);
782 
783     /* Store output pixels in alternate positions */
784     out0 = vbsl_u8(chroma_mask_8x8, rec0_un, out0);
785     out1 = vbsl_u8(chroma_mask_8x8, rec1_un, out1);
786     out2 = vbsl_u8(chroma_mask_8x8, rec2_un, out2);
787     out3 = vbsl_u8(chroma_mask_8x8, rec3_un, out3);
788 
789     vst1_u8((pu1_out), out0);
790     vst1_u8((pu1_out + i4_out_stride), out1);
791     vst1_u8((pu1_out + (i4_out_stride << 1)), out2);
792     vst1_u8((pu1_out + ((i4_out_stride << 1) + i4_out_stride)), out3);
793 }
794 
795 void isvc_iquant_itrans_recon_chroma_4x4_with_res_accumulate_neon(
796     buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_res_pred,
797     buffer_container_t *ps_res, buffer_container_t *ps_rec,
798     iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants, WORD16 *pi2_tmp, WORD16 *pi2_dc_src,
799     WORD32 i4_iq_start_idx, UWORD8 u1_res_accumulate)
800 {
801     WORD16 *pi2_src = (WORD16 *) ps_src->pv_data;
802     WORD16 *pi2_res = (WORD16 *) ps_res->pv_data;
803     WORD16 *pi2_res_pred = (WORD16 *) ps_res_pred->pv_data;
804     UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
805     UWORD8 *pu1_out = (UWORD8 *) ps_rec->pv_data;
806     WORD32 i4_res_stride = ps_res->i4_data_stride;
807     WORD32 i4_res_pred_stride = ps_res_pred->i4_data_stride;
808     WORD32 i4_pred_stride = ps_pred->i4_data_stride;
809     WORD32 i4_out_stride = ps_rec->i4_data_stride;
810     const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat;
811     const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat;
812     UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6;
813 
814     WORD16 i2_rnd_factor = (u4_qp_div_6 < 4) ? 1 << (3 - u4_qp_div_6) : 0;
815 
816     int16x4x4_t src_16x4x2;
817     int16x4x4_t iscal_16x4x2;
818     int16x4x4_t weigh_16x4x2;
819 
820     int16x4_t q0_16x4, q1_16x4, q2_16x4, q3_16x4;
821     int32x4_t q0_32x4, q1_32x4, q2_32x4, q3_32x4;
822     int16x4_t rq1_16x4, rq3_16x4;
823     int16x4_t x0_16x4, x1_16x4, x2_16x4, x3_16x4;
824     int16x8_t x0_16x8, x1_16x8, x2_16x8, x3_16x8;
825     int16x4_t xx0_16x4, xx1_16x4, xx2_16x4, xx3_16x4;
826     int16x4x2_t xx0_16x4x2, xx1_16x4x2;
827     int32x2x2_t x0_32x2x2, x1_32x2x2;
828     int16x4_t weigh0_16x4, weigh1_16x4, weigh2_16x4, weigh3_16x4;
829 
830     uint8x8_t pred0_in, pred1_in, pred2_in, pred3_in;
831     int16x8_t pred0, pred1, pred2, pred3;
832     int16x8_t rec0, rec1, rec2, rec3;
833     uint8x8_t rec0_un, rec1_un, rec2_un, rec3_un;
834     int16x8_t resd0_in, resd1_in, resd2_in, resd3_in;
835     int16x8_t resd1_in_mask, resd2_in_mask, resd3_in_mask;
836     uint8x8_t out0, out1, out2, out3;
837     int16x8_t resout0, resout1, resout2, resout3;
838     int16x8_t pos_255 = vdupq_n_s16(((WORD16) UINT8_MAX));
839     int16x8_t neg_255 = vdupq_n_s16(-((WORD16) UINT8_MAX));
840 
841     uint8x8_t chroma_mask_8x8 = vreinterpret_u8_u16(vdup_n_u16(0x00ff));
842     uint16x8_t chroma_mask_16x8 = {0xffff, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000};
843 
844     int32x4_t qp_div_6_32x4 = vdupq_n_s32(u4_qp_div_6);
845     int32x4_t rnd_fact = vdupq_n_s32(i2_rnd_factor);
846 
847     int16x8_t resd0_in_mask = {0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000};
848 
849     UNUSED(i4_iq_start_idx);
850     UNUSED(u1_res_accumulate);
851 
852     resd1_in_mask = resd0_in_mask;
853     resd2_in_mask = resd0_in_mask;
854     resd3_in_mask = resd0_in_mask;
855 
856     src_16x4x2 = vld4_s16(pi2_src);
857     iscal_16x4x2 = vld4_s16((const int16_t *) pu2_iscal_mat);
858     weigh_16x4x2 = vld4_s16((const int16_t *) pu2_weigh_mat);
859 
860     weigh0_16x4 = vmul_s16(weigh_16x4x2.val[0], iscal_16x4x2.val[0]);
861     weigh1_16x4 = vmul_s16(weigh_16x4x2.val[1], iscal_16x4x2.val[1]);
862     weigh2_16x4 = vmul_s16(weigh_16x4x2.val[2], iscal_16x4x2.val[2]);
863     weigh3_16x4 = vmul_s16(weigh_16x4x2.val[3], iscal_16x4x2.val[3]);
864 
865     q0_32x4 = vmull_s16(weigh0_16x4, src_16x4x2.val[0]);
866     q1_32x4 = vmull_s16(weigh1_16x4, src_16x4x2.val[1]);
867     q2_32x4 = vmull_s16(weigh2_16x4, src_16x4x2.val[2]);
868     q3_32x4 = vmull_s16(weigh3_16x4, src_16x4x2.val[3]);
869 
870     q0_32x4 = vaddq_s32(q0_32x4, rnd_fact);
871     q1_32x4 = vaddq_s32(q1_32x4, rnd_fact);
872     q2_32x4 = vaddq_s32(q2_32x4, rnd_fact);
873     q3_32x4 = vaddq_s32(q3_32x4, rnd_fact);
874 
875     q0_32x4 = vshlq_s32(q0_32x4, qp_div_6_32x4);
876     q1_32x4 = vshlq_s32(q1_32x4, qp_div_6_32x4);
877     q2_32x4 = vshlq_s32(q2_32x4, qp_div_6_32x4);
878     q3_32x4 = vshlq_s32(q3_32x4, qp_div_6_32x4);
879 
880     q0_16x4 = vqshrn_n_s32(q0_32x4, 4);
881     q1_16x4 = vqshrn_n_s32(q1_32x4, 4);
882     q2_16x4 = vqshrn_n_s32(q2_32x4, 4);
883     q3_16x4 = vqshrn_n_s32(q3_32x4, 4);
884 
885     q0_16x4 = vset_lane_s16(pi2_dc_src[0], q0_16x4, 0);
886 
887     rq1_16x4 = vshr_n_s16(q1_16x4, 1);
888     rq3_16x4 = vshr_n_s16(q3_16x4, 1);
889 
890     x0_16x4 = vadd_s16(q0_16x4, q2_16x4);
891     x1_16x4 = vsub_s16(q0_16x4, q2_16x4);
892     x2_16x4 = vsub_s16(rq1_16x4, q3_16x4);
893     x3_16x4 = vadd_s16(q1_16x4, rq3_16x4);
894 
895     xx0_16x4 = vadd_s16(x0_16x4, x3_16x4);
896     xx1_16x4 = vadd_s16(x1_16x4, x2_16x4);
897     xx2_16x4 = vsub_s16(x1_16x4, x2_16x4);
898     xx3_16x4 = vsub_s16(x0_16x4, x3_16x4);
899 
900     /* row 0 to row 3 */
901     xx0_16x4x2 = vtrn_s16(xx0_16x4, xx1_16x4);
902     xx1_16x4x2 = vtrn_s16(xx2_16x4, xx3_16x4);
903     x0_32x2x2 =
904         vzip_s32(vreinterpret_s32_s16(xx0_16x4x2.val[0]), vreinterpret_s32_s16(xx1_16x4x2.val[0]));
905     x1_32x2x2 =
906         vzip_s32(vreinterpret_s32_s16(xx0_16x4x2.val[1]), vreinterpret_s32_s16(xx1_16x4x2.val[1]));
907 
908     x0_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[0]);
909     x1_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[0]);
910     x2_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[1]);
911     x3_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[1]);
912 
913     /* Store Horz transform output into temp */
914     vst1_s16(pi2_tmp, x0_16x4);
915     vst1_s16(pi2_tmp + 4, x1_16x4);
916     vst1_s16(pi2_tmp + 8, x2_16x4);
917     vst1_s16(pi2_tmp + 12, x3_16x4);
918 
919     /* vertical inverse transform */
920     rq1_16x4 = vshr_n_s16(x1_16x4, 1);
921     rq3_16x4 = vshr_n_s16(x3_16x4, 1);
922 
923     xx0_16x4 = vadd_s16(x0_16x4, x2_16x4);
924     xx1_16x4 = vsub_s16(x0_16x4, x2_16x4);
925     xx2_16x4 = vsub_s16(rq1_16x4, x3_16x4);
926     xx3_16x4 = vadd_s16(x1_16x4, rq3_16x4);
927 
928     x0_16x4 = vadd_s16(xx0_16x4, xx3_16x4);
929     x1_16x4 = vadd_s16(xx1_16x4, xx2_16x4);
930     x2_16x4 = vsub_s16(xx1_16x4, xx2_16x4);
931     x3_16x4 = vsub_s16(xx0_16x4, xx3_16x4);
932 
933     x0_16x4 = vrshr_n_s16(x0_16x4, 6);
934     x1_16x4 = vrshr_n_s16(x1_16x4, 6);
935     x2_16x4 = vrshr_n_s16(x2_16x4, 6);
936     x3_16x4 = vrshr_n_s16(x3_16x4, 6);
937 
938     resd0_in = vld1q_s16((int16_t *) pi2_res_pred);
939     resd1_in = vld1q_s16((int16_t *) pi2_res_pred + i4_res_pred_stride);
940     resd2_in = vld1q_s16((int16_t *) pi2_res_pred + (i4_res_pred_stride * 2));
941     resd3_in = vld1q_s16((int16_t *) pi2_res_pred + (i4_res_pred_stride * 3));
942 
943     /* Mask alternate values */
944     resd0_in_mask = vbslq_s16(chroma_mask_16x8, resd0_in, resd0_in_mask);
945     resd1_in_mask = vbslq_s16(chroma_mask_16x8, resd1_in, resd1_in_mask);
946     resd2_in_mask = vbslq_s16(chroma_mask_16x8, resd2_in, resd2_in_mask);
947     resd3_in_mask = vbslq_s16(chroma_mask_16x8, resd3_in, resd3_in_mask);
948 
949     x0_16x8 = vreinterpretq_s16_s32(vmovl_s16(x0_16x4));
950     x1_16x8 = vreinterpretq_s16_s32(vmovl_s16(x1_16x4));
951     x2_16x8 = vreinterpretq_s16_s32(vmovl_s16(x2_16x4));
952     x3_16x8 = vreinterpretq_s16_s32(vmovl_s16(x3_16x4));
953 
954     resd0_in = vaddq_s16(resd0_in_mask, x0_16x8);
955     resd1_in = vaddq_s16(resd1_in_mask, x1_16x8);
956     resd2_in = vaddq_s16(resd2_in_mask, x2_16x8);
957     resd3_in = vaddq_s16(resd3_in_mask, x3_16x8);
958 
959     /* Saturate all values < -255 to -255 and retain the rest as it is */
960     resd0_in = vmaxq_s16(resd0_in, neg_255);
961     resd1_in = vmaxq_s16(resd1_in, neg_255);
962     resd2_in = vmaxq_s16(resd2_in, neg_255);
963     resd3_in = vmaxq_s16(resd3_in, neg_255);
964 
965     /* Saturate all values > 255 to 255 and retain the rest as it is */
966     resd0_in = vminq_s16(resd0_in, pos_255);
967     resd1_in = vminq_s16(resd1_in, pos_255);
968     resd2_in = vminq_s16(resd2_in, pos_255);
969     resd3_in = vminq_s16(resd3_in, pos_255);
970 
971     resout0 = vld1q_s16(pi2_res);
972     resout1 = vld1q_s16(pi2_res + i4_res_stride);
973     resout2 = vld1q_s16(pi2_res + i4_res_stride * 2);
974     resout3 = vld1q_s16(pi2_res + i4_res_stride * 3);
975 
976     /* Store res in alternate positions */
977     resout0 = vbslq_s16(chroma_mask_16x8, resd0_in, resout0);
978     resout1 = vbslq_s16(chroma_mask_16x8, resd1_in, resout1);
979     resout2 = vbslq_s16(chroma_mask_16x8, resd2_in, resout2);
980     resout3 = vbslq_s16(chroma_mask_16x8, resd3_in, resout3);
981 
982     vst1q_s16(pi2_res, resout0);
983     vst1q_s16(pi2_res + i4_res_stride, resout1);
984     vst1q_s16(pi2_res + (i4_res_stride << 1), resout2);
985     vst1q_s16(pi2_res + (i4_res_stride << 1) + i4_res_stride, resout3);
986 
987     pred0_in = vld1_u8((uint8_t *) pu1_pred);
988     pred1_in = vld1_u8((uint8_t *) pu1_pred + (i4_pred_stride));
989     pred2_in = vld1_u8((uint8_t *) pu1_pred + (i4_pred_stride << 1));
990     pred3_in = vld1_u8((uint8_t *) pu1_pred + (i4_pred_stride * 3));
991 
992     pred0 = vreinterpretq_s16_u16(vmovl_u8(pred0_in));
993     pred1 = vreinterpretq_s16_u16(vmovl_u8(pred1_in));
994     pred2 = vreinterpretq_s16_u16(vmovl_u8(pred2_in));
995     pred3 = vreinterpretq_s16_u16(vmovl_u8(pred3_in));
996 
997     /* Out pixel = pred + res */
998     rec0 = vaddq_s16(pred0, resout0);
999     rec1 = vaddq_s16(pred1, resout1);
1000     rec2 = vaddq_s16(pred2, resout2);
1001     rec3 = vaddq_s16(pred3, resout3);
1002 
1003     out0 = vld1_u8(pu1_out);
1004     out1 = vld1_u8(pu1_out + i4_out_stride);
1005     out2 = vld1_u8(pu1_out + i4_out_stride * 2);
1006     out3 = vld1_u8(pu1_out + i4_out_stride * 3);
1007 
1008     /* Convert to 8 bit unsigned with saturation */
1009     rec0_un = vqmovun_s16(rec0);
1010     rec1_un = vqmovun_s16(rec1);
1011     rec2_un = vqmovun_s16(rec2);
1012     rec3_un = vqmovun_s16(rec3);
1013 
1014     /* Store output pixels in alternate positions */
1015     out0 = vbsl_u8(chroma_mask_8x8, rec0_un, out0);
1016     out1 = vbsl_u8(chroma_mask_8x8, rec1_un, out1);
1017     out2 = vbsl_u8(chroma_mask_8x8, rec2_un, out2);
1018     out3 = vbsl_u8(chroma_mask_8x8, rec3_un, out3);
1019 
1020     vst1_u8((pu1_out), out0);
1021     vst1_u8((pu1_out + i4_out_stride), out1);
1022     vst1_u8((pu1_out + (i4_out_stride << 1)), out2);
1023     vst1_u8((pu1_out + ((i4_out_stride << 1) + i4_out_stride)), out3);
1024 }
1025 
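/*
 * The _dc_ variants below handle blocks where only the DC coefficient is
 * coded: the inverse transform collapses to a single constant, so they add
 * (i4_iq_out_temp + 32) >> 6 to every predicted sample instead of running
 * the full 4x4 butterfly.
 */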
1026 void isvc_iquant_itrans_recon_4x4_dc_neon(buffer_container_t *ps_src, buffer_container_t *ps_pred,
1027                                           buffer_container_t *ps_res_pred,
1028                                           buffer_container_t *ps_res, buffer_container_t *ps_rec,
1029                                           iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants,
1030                                           WORD16 *pi2_tmp, WORD16 *pi2_dc_src,
1031                                           WORD32 i4_iq_start_idx, UWORD8 u1_res_accumulate)
1032 {
1033     WORD16 *pi2_src = (WORD16 *) ps_src->pv_data;
1034     UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
1035     UWORD8 *pu1_out = (UWORD8 *) ps_rec->pv_data;
1036     WORD32 i4_pred_stride = ps_pred->i4_data_stride;
1037     WORD32 i4_out_stride = ps_rec->i4_data_stride;
1038     const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat;
1039     const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat;
1040     UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6;
1041     WORD16 rnd_fact = (u4_qp_div_6 < 4) ? 1 << (3 - u4_qp_div_6) : 0;
1042 
1043     WORD32 i4_iq_out_temp;
1044     int16x8_t temp_0;
1045     uint8x8_t pred0_in, pred1_in, pred2_in, pred3_in;
1046     int16x8_t pred0, pred1, pred2, pred3;
1047 
1048     UNUSED(pi2_tmp);
1049     UNUSED(ps_res);
1050     UNUSED(ps_res_pred);
1051     UNUSED(u1_res_accumulate);
1052 
1053     if(i4_iq_start_idx == 0)
1054     {
1055         i4_iq_out_temp = pi2_src[0];
1056         INV_QUANT(i4_iq_out_temp, pu2_iscal_mat[0], pu2_weigh_mat[0], u4_qp_div_6, rnd_fact, 4);
1057     }
1058     else
1059     {
1060         i4_iq_out_temp = pi2_dc_src[0];
1061     }
1062 
1063     temp_0 = vdupq_n_s16((i4_iq_out_temp + 32) >> 6);
1064 
1065     pred0_in = vld1_u8(pu1_pred);
1066     pu1_pred = pu1_pred + i4_pred_stride;
1067     pred1_in = vld1_u8(pu1_pred);
1068     pu1_pred = pu1_pred + i4_pred_stride;
1069     pred2_in = vld1_u8(pu1_pred);
1070     pu1_pred = pu1_pred + i4_pred_stride;
1071     pred3_in = vld1_u8(pu1_pred);
1072 
1073     pred0 = vreinterpretq_s16_u16(vmovl_u8(pred0_in));
1074     pred1 = vreinterpretq_s16_u16(vmovl_u8(pred1_in));
1075     pred2 = vreinterpretq_s16_u16(vmovl_u8(pred2_in));
1076     pred3 = vreinterpretq_s16_u16(vmovl_u8(pred3_in));
1077 
1078     /* Out pixel = Res + pred */
1079     pred0 = vaddq_s16(pred0, temp_0);
1080     pred1 = vaddq_s16(pred1, temp_0);
1081     pred2 = vaddq_s16(pred2, temp_0);
1082     pred3 = vaddq_s16(pred3, temp_0);
1083 
1084     /* Convert to unsigned 8 bit with saturation */
1085     pred0_in = vqmovun_s16(pred0);
1086     pred1_in = vqmovun_s16(pred1);
1087     pred2_in = vqmovun_s16(pred2);
1088     pred3_in = vqmovun_s16(pred3);
1089 
1090     vst1_lane_u32((uint32_t *) (pu1_out), vreinterpret_u32_u8(pred0_in), 0);
1091     vst1_lane_u32((uint32_t *) (pu1_out + i4_out_stride), vreinterpret_u32_u8(pred1_in), 0);
1092     vst1_lane_u32((uint32_t *) (pu1_out + i4_out_stride * 2), vreinterpret_u32_u8(pred2_in), 0);
1093     vst1_lane_u32((uint32_t *) (pu1_out + i4_out_stride * 3), vreinterpret_u32_u8(pred3_in), 0);
1094 }
1095 
1096 void isvc_iquant_itrans_recon_chroma_4x4_dc_with_res_output_neon(
1097     buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_res_pred,
1098     buffer_container_t *ps_res, buffer_container_t *ps_rec,
1099     iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants, WORD16 *pi2_tmp, WORD16 *pi2_dc_src,
1100     WORD32 i4_iq_start_idx, UWORD8 u1_res_accumulate)
1101 {
1102     WORD16 *pi2_src = (WORD16 *) ps_src->pv_data;
1103     WORD16 *pi2_res = (WORD16 *) ps_res->pv_data;
1104     UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
1105     UWORD8 *pu1_out = (UWORD8 *) ps_rec->pv_data;
1106     WORD32 i4_res_stride = ps_res->i4_data_stride;
1107     WORD32 i4_pred_stride = ps_pred->i4_data_stride;
1108     WORD32 i4_out_stride = ps_rec->i4_data_stride;
1109     const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat;
1110     const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat;
1111     UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6;
1112 
1113     WORD16 i2_it_out;
1114     WORD32 i4_iq_out_temp;
1115     int16x8_t temp_0, residue_res;
1116     uint8x8_t pred0_in, pred1_in, pred2_in, pred3_in;
1117     int16x8_t pred0, pred1, pred2, pred3;
1118     int16x8_t resout0, resout1, resout2, resout3;
1119 
1120     uint8x8_t i4_out_horz_8x8_r0, i4_out_horz_8x8_r1, i4_out_horz_8x8_r2, i4_out_horz_8x8_r3;
1121     uint8x8_t chroma_mask_8x8 = vreinterpret_u8_u16(vdup_n_u16(0x00ff));
1122     uint16x8_t chroma_mask_16x8 = {0xffff, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000};
1123 
1124     UNUSED(pi2_src);
1125     UNUSED(pu2_iscal_mat);
1126     UNUSED(pu2_weigh_mat);
1127     UNUSED(u4_qp_div_6);
1128     UNUSED(pi2_tmp);
1129     UNUSED(i4_iq_start_idx);
1130     UNUSED(ps_res_pred);
1131     UNUSED(u1_res_accumulate);
1132 
1133     i4_iq_out_temp = pi2_dc_src[0];
1134 
1135     i2_it_out = ((i4_iq_out_temp + 32) >> 6);
1136     temp_0 = vdupq_n_s16(i2_it_out);
1137     residue_res = vdupq_n_s16(isvc_get_residue(i2_it_out, 0, 0));
1138 
1139     resout0 = vld1q_s16(pi2_res);
1140     resout1 = vld1q_s16(pi2_res + i4_res_stride);
1141     resout2 = vld1q_s16(pi2_res + i4_res_stride * 2);
1142     resout3 = vld1q_s16(pi2_res + i4_res_stride * 3);
1143 
1144     /* Store res in alternate positions */
1145     resout0 = vbslq_s16(chroma_mask_16x8, residue_res, resout0);
1146     resout1 = vbslq_s16(chroma_mask_16x8, residue_res, resout1);
1147     resout2 = vbslq_s16(chroma_mask_16x8, residue_res, resout2);
1148     resout3 = vbslq_s16(chroma_mask_16x8, residue_res, resout3);
1149 
1150     vst1q_s16(pi2_res, resout0);
1151     vst1q_s16(pi2_res + i4_res_stride, resout1);
1152     vst1q_s16(pi2_res + (i4_res_stride << 1), resout2);
1153     vst1q_s16(pi2_res + (i4_res_stride << 1) + i4_res_stride, resout3);
1154 
1155     pred0_in = vld1_u8(pu1_pred);
1156     pu1_pred = pu1_pred + i4_pred_stride;
1157     pred1_in = vld1_u8(pu1_pred);
1158     pu1_pred = pu1_pred + i4_pred_stride;
1159     pred2_in = vld1_u8(pu1_pred);
1160     pu1_pred = pu1_pred + i4_pred_stride;
1161     pred3_in = vld1_u8(pu1_pred);
1162 
1163     pred0 = vreinterpretq_s16_u16(vmovl_u8(pred0_in));
1164     pred1 = vreinterpretq_s16_u16(vmovl_u8(pred1_in));
1165     pred2 = vreinterpretq_s16_u16(vmovl_u8(pred2_in));
1166     pred3 = vreinterpretq_s16_u16(vmovl_u8(pred3_in));
1167 
1168     /* Out pixel = Res + pred */
1169     pred0 = vaddq_s16(pred0, temp_0);
1170     pred1 = vaddq_s16(pred1, temp_0);
1171     pred2 = vaddq_s16(pred2, temp_0);
1172     pred3 = vaddq_s16(pred3, temp_0);
1173 
1174     /* Convert to unsigned 8 bit with saturation */
1175     pred0_in = vqmovun_s16(pred0);
1176     pred1_in = vqmovun_s16(pred1);
1177     pred2_in = vqmovun_s16(pred2);
1178     pred3_in = vqmovun_s16(pred3);
1179 
1180     /* Store out pixels in alternate positions */
1181     i4_out_horz_8x8_r0 = vld1_u8(pu1_out);
1182     i4_out_horz_8x8_r1 = vld1_u8(pu1_out + i4_out_stride);
1183     i4_out_horz_8x8_r2 = vld1_u8(pu1_out + i4_out_stride * 2);
1184     i4_out_horz_8x8_r3 = vld1_u8(pu1_out + i4_out_stride * 3);
1185 
1186     i4_out_horz_8x8_r0 = vbsl_u8(chroma_mask_8x8, pred0_in, i4_out_horz_8x8_r0);
1187     i4_out_horz_8x8_r1 = vbsl_u8(chroma_mask_8x8, pred1_in, i4_out_horz_8x8_r1);
1188     i4_out_horz_8x8_r2 = vbsl_u8(chroma_mask_8x8, pred2_in, i4_out_horz_8x8_r2);
1189     i4_out_horz_8x8_r3 = vbsl_u8(chroma_mask_8x8, pred3_in, i4_out_horz_8x8_r3);
1190 
1191     vst1_u8((uint8_t *) (pu1_out), i4_out_horz_8x8_r0);
1192     vst1_u8((uint8_t *) (pu1_out + i4_out_stride), i4_out_horz_8x8_r1);
1193     vst1_u8((uint8_t *) (pu1_out + i4_out_stride * 2), i4_out_horz_8x8_r2);
1194     vst1_u8((uint8_t *) (pu1_out + i4_out_stride * 3), i4_out_horz_8x8_r3);
1195 }
1196 
1197 void isvc_iquant_itrans_recon_chroma_4x4_dc_with_res_accumulate_neon(
1198     buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_res_pred,
1199     buffer_container_t *ps_res, buffer_container_t *ps_rec,
1200     iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants, WORD16 *pi2_tmp, WORD16 *pi2_dc_src,
1201     WORD32 i4_iq_start_idx, UWORD8 u1_res_accumulate)
1202 {
1203     WORD16 *pi2_src = (WORD16 *) ps_src->pv_data;
1204     WORD16 *pi2_res = (WORD16 *) ps_res->pv_data;
1205     WORD16 *pi2_res_pred = (WORD16 *) ps_res_pred->pv_data;
1206     UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
1207     UWORD8 *pu1_out = (UWORD8 *) ps_rec->pv_data;
1208     WORD32 i4_res_stride = ps_res->i4_data_stride;
1209     WORD32 i4_res_pred_stride = ps_res_pred->i4_data_stride;
1210     WORD32 i4_pred_stride = ps_pred->i4_data_stride;
1211     WORD32 i4_out_stride = ps_rec->i4_data_stride;
1212     const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat;
1213     const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat;
1214     UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6;
1215 
1216     WORD32 i4_iq_out_temp;
1217     int16x8_t temp_0;
1218     uint8x8_t pred0_in, pred1_in, pred2_in, pred3_in;
1219     int16x8_t pred0, pred1, pred2, pred3;
1220     int16x8_t resd0_in, resd1_in, resd2_in, resd3_in;
1221     int16x8_t resout0, resout1, resout2, resout3;
1222     int16x8_t resd1_in_mask, resd2_in_mask, resd3_in_mask;
1223     uint8x8_t out0, out1, out2, out3;
1224     int16x8_t pos_255 = vdupq_n_s16(((WORD16) UINT8_MAX));
1225     int16x8_t neg_255 = vdupq_n_s16(-((WORD16) UINT8_MAX));
1226     uint8x8_t chroma_mask_8x8 = vreinterpret_u8_u16(vdup_n_u16(0x00ff));
1227     uint16x8_t chroma_mask_16x8 = {0xffff, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000};
1228 
1229     int16x8_t resd0_in_mask = {0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000};
1230 
1231     UNUSED(pi2_src);
1232     UNUSED(pu2_iscal_mat);
1233     UNUSED(pu2_weigh_mat);
1234     UNUSED(u4_qp_div_6);
1235     UNUSED(pi2_tmp);
1236     UNUSED(i4_iq_start_idx);
1237     UNUSED(u1_res_accumulate);
1238 
1239     resd1_in_mask = resd0_in_mask;
1240     resd2_in_mask = resd0_in_mask;
1241     resd3_in_mask = resd0_in_mask;
1242 
1243     i4_iq_out_temp = pi2_dc_src[0];
1244     temp_0 = vdupq_n_s16((i4_iq_out_temp + 32) >> 6);
1245 
1246     resd0_in = vld1q_s16((int16_t *) pi2_res_pred);
1247     resd1_in = vld1q_s16((int16_t *) pi2_res_pred + i4_res_pred_stride);
1248     resd2_in = vld1q_s16((int16_t *) pi2_res_pred + (i4_res_pred_stride * 2));
1249     resd3_in = vld1q_s16((int16_t *) pi2_res_pred + (i4_res_pred_stride * 3));
1250 
1251     /* Mask alternate values of res pred */
1252     resd0_in_mask = vbslq_s16(chroma_mask_16x8, resd0_in, resd0_in_mask);
1253     resd1_in_mask = vbslq_s16(chroma_mask_16x8, resd1_in, resd1_in_mask);
1254     resd2_in_mask = vbslq_s16(chroma_mask_16x8, resd2_in, resd2_in_mask);
1255     resd3_in_mask = vbslq_s16(chroma_mask_16x8, resd3_in, resd3_in_mask);
1256 
1257     /* Add res pred to res obtained */
1258     resd0_in = vaddq_s16(resd0_in_mask, temp_0);
1259     resd1_in = vaddq_s16(resd1_in_mask, temp_0);
1260     resd2_in = vaddq_s16(resd2_in_mask, temp_0);
1261     resd3_in = vaddq_s16(resd3_in_mask, temp_0);
1262 
1263     /* Saturate all values < -255 to -255 and retain the rest as it is */
1264     resd0_in = vmaxq_s16(resd0_in, neg_255);
1265     resd1_in = vmaxq_s16(resd1_in, neg_255);
1266     resd2_in = vmaxq_s16(resd2_in, neg_255);
1267     resd3_in = vmaxq_s16(resd3_in, neg_255);
1268 
1269     /* Saturate all values > 255 to 255 and retain the rest as it is */
1270     resd0_in = vminq_s16(resd0_in, pos_255);
1271     resd1_in = vminq_s16(resd1_in, pos_255);
1272     resd2_in = vminq_s16(resd2_in, pos_255);
1273     resd3_in = vminq_s16(resd3_in, pos_255);
1274 
1275     resout0 = vld1q_s16(pi2_res);
1276     resout1 = vld1q_s16(pi2_res + i4_res_stride);
1277     resout2 = vld1q_s16(pi2_res + i4_res_stride * 2);
1278     resout3 = vld1q_s16(pi2_res + i4_res_stride * 3);
1279 
1280     /* Store res in alternate positions */
1281     resout0 = vbslq_s16(chroma_mask_16x8, resd0_in, resout0);
1282     resout1 = vbslq_s16(chroma_mask_16x8, resd1_in, resout1);
1283     resout2 = vbslq_s16(chroma_mask_16x8, resd2_in, resout2);
1284     resout3 = vbslq_s16(chroma_mask_16x8, resd3_in, resout3);
1285 
1286     vst1q_s16(pi2_res, resout0);
1287     vst1q_s16(pi2_res + i4_res_stride, resout1);
1288     vst1q_s16(pi2_res + (i4_res_stride << 1), resout2);
1289     vst1q_s16(pi2_res + (i4_res_stride << 1) + i4_res_stride, resout3);
1290 
1291     pred0_in = vld1_u8(pu1_pred);
1292     pu1_pred = pu1_pred + i4_pred_stride;
1293     pred1_in = vld1_u8(pu1_pred);
1294     pu1_pred = pu1_pred + i4_pred_stride;
1295     pred2_in = vld1_u8(pu1_pred);
1296     pu1_pred = pu1_pred + i4_pred_stride;
1297     pred3_in = vld1_u8(pu1_pred);
1298 
1299     pred0 = vreinterpretq_s16_u16(vmovl_u8(pred0_in));
1300     pred1 = vreinterpretq_s16_u16(vmovl_u8(pred1_in));
1301     pred2 = vreinterpretq_s16_u16(vmovl_u8(pred2_in));
1302     pred3 = vreinterpretq_s16_u16(vmovl_u8(pred3_in));
1303 
1304     /* Out pixel = Res + pred */
1305     pred0 = vaddq_s16(pred0, resout0);
1306     pred1 = vaddq_s16(pred1, resout1);
1307     pred2 = vaddq_s16(pred2, resout2);
1308     pred3 = vaddq_s16(pred3, resout3);
1309 
1310     /* Convert to unsigned 8 bit with saturation */
1311     pred0_in = vqmovun_s16(pred0);
1312     pred1_in = vqmovun_s16(pred1);
1313     pred2_in = vqmovun_s16(pred2);
1314     pred3_in = vqmovun_s16(pred3);
1315 
1316     out0 = vld1_u8(pu1_out);
1317     out1 = vld1_u8(pu1_out + i4_out_stride);
1318     out2 = vld1_u8(pu1_out + i4_out_stride * 2);
1319     out3 = vld1_u8(pu1_out + i4_out_stride * 3);
1320 
1321     /* Store out pixels in alternate positions */
1322     out0 = vbsl_u8(chroma_mask_8x8, pred0_in, out0);
1323     out1 = vbsl_u8(chroma_mask_8x8, pred1_in, out1);
1324     out2 = vbsl_u8(chroma_mask_8x8, pred2_in, out2);
1325     out3 = vbsl_u8(chroma_mask_8x8, pred3_in, out3);
1326 
1327     vst1_u8((uint8_t *) (pu1_out), out0);
1328     vst1_u8((uint8_t *) (pu1_out + i4_out_stride), out1);
1329     vst1_u8((uint8_t *) (pu1_out + i4_out_stride * 2), out2);
1330     vst1_u8((uint8_t *) (pu1_out + i4_out_stride * 3), out3);
1331 }
1332