1 /******************************************************************************
2 *
3 * Copyright (C) 2022 The Android Open Source Project
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at:
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 *
17 *****************************************************************************
18 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19 */
20 /**
21 *******************************************************************************
22 * @file
23 * isvc_iquant_itrans_recon_neon.c
24 *
25 * @brief
26 * neon variants of inverse transform and quantization functions
27 *
28 *******************************************************************************
29 */
30 #include <arm_neon.h>
31
32 #include "ih264_typedefs.h"
33 #include "ih264_debug.h"
34 #include "ih264_defs.h"
35 #include "ih264_trans_macros.h"
36 #include "ih264_macros.h"
37 #include "ih264_platform_macros.h"
38 #include "ih264_trans_data.h"
39 #include "ih264_size_defs.h"
40 #include "isvc_structs.h"
41 #include "isvc_trans_quant_itrans_iquant.h"
42
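/* Inverse quantizes a 4x4 block of coefficients, applies the two-pass 4x4
 * inverse transform (intermediate results staged in pi2_tmp), clamps the
 * residue to [-255, 255], adds it to the prediction from ps_pred and writes
 * the reconstructed 4x4 block to ps_rec. When i4_iq_start_idx == 1 the DC
 * coefficient is taken from pi2_dc_src instead of being inverse quantized.
 * ps_res, ps_res_pred and u1_res_accumulate are unused in this variant. */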
43 void isvc_iquant_itrans_recon_4x4_neon(buffer_container_t *ps_src, buffer_container_t *ps_pred,
44 buffer_container_t *ps_res_pred, buffer_container_t *ps_res,
45 buffer_container_t *ps_rec,
46 iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants,
47 WORD16 *pi2_tmp, WORD16 *pi2_dc_src, WORD32 i4_iq_start_idx,
48 UWORD8 u1_res_accumulate)
49 {
50 WORD16 *pi2_src = (WORD16 *) ps_src->pv_data;
51 UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
52 UWORD8 *pu1_out = (UWORD8 *) ps_rec->pv_data;
53 WORD32 i4_pred_stride = ps_pred->i4_data_stride;
54 WORD32 i4_out_stride = ps_rec->i4_data_stride;
55 const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat;
56 const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat;
57 UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6;
58
59 int16x4x4_t src_16x4x2;
60 int16x4x4_t iscal_16x4x2;
61 int16x4x4_t weigh_16x4x2;
62
63 int16x4_t q0_16x4, q1_16x4, q2_16x4, q3_16x4;
64 int32x4_t q0_32x4, q1_32x4, q2_32x4, q3_32x4;
65 int16x4_t rq1_16x4, rq3_16x4;
66 int16x4_t x0_16x4, x1_16x4, x2_16x4, x3_16x4;
67 int16x4_t xx0_16x4, xx1_16x4, xx2_16x4, xx3_16x4;
68 int16x4x2_t xx0_16x4x2, xx1_16x4x2;
69 int32x2x2_t x0_32x2x2, x1_32x2x2;
70 int16x4_t weigh0_16x4, weigh1_16x4, weigh2_16x4, weigh3_16x4;
71
72 uint8x8_t pred0_in, pred1_in, pred2_in, pred3_in;
73 int16x8_t pred0, pred1, pred2, pred3;
74 int16x8_t resd01_in, resd23_in;
75 int16x8_t pred01_in, pred23_in;
76 uint8x8_t pred01_un, pred23_un;
77
78 int16x8_t pos_255_16x8 = vdupq_n_s16(((WORD16) UINT8_MAX));
79 int16x8_t neg_255_16x8 = vdupq_n_s16(-((WORD16) UINT8_MAX));
80 int32x4_t qp_div_6_32x4 = vdupq_n_s32(u4_qp_div_6);
81
82 WORD16 rnd_factor = (u4_qp_div_6 < 4) ? 1 << (3 - u4_qp_div_6) : 0;
83 int32x4_t rnd_fact = vdupq_n_s32(rnd_factor);
84
85 UNUSED(ps_res);
86 UNUSED(ps_res_pred);
87 UNUSED(u1_res_accumulate);
88
89 src_16x4x2 = vld4_s16(pi2_src);
90 iscal_16x4x2 = vld4_s16((const int16_t *) pu2_iscal_mat);
91 weigh_16x4x2 = vld4_s16((const int16_t *) pu2_weigh_mat);
92
93 weigh0_16x4 = vmul_s16(weigh_16x4x2.val[0], iscal_16x4x2.val[0]);
94 weigh1_16x4 = vmul_s16(weigh_16x4x2.val[1], iscal_16x4x2.val[1]);
95 weigh2_16x4 = vmul_s16(weigh_16x4x2.val[2], iscal_16x4x2.val[2]);
96 weigh3_16x4 = vmul_s16(weigh_16x4x2.val[3], iscal_16x4x2.val[3]);
97
98 q0_32x4 = vmull_s16(weigh0_16x4, src_16x4x2.val[0]);
99 q1_32x4 = vmull_s16(weigh1_16x4, src_16x4x2.val[1]);
100 q2_32x4 = vmull_s16(weigh2_16x4, src_16x4x2.val[2]);
101 q3_32x4 = vmull_s16(weigh3_16x4, src_16x4x2.val[3]);
102
103 q0_32x4 = vaddq_s32(q0_32x4, rnd_fact);
104 q1_32x4 = vaddq_s32(q1_32x4, rnd_fact);
105 q2_32x4 = vaddq_s32(q2_32x4, rnd_fact);
106 q3_32x4 = vaddq_s32(q3_32x4, rnd_fact);
107
108 q0_32x4 = vshlq_s32(q0_32x4, qp_div_6_32x4);
109 q1_32x4 = vshlq_s32(q1_32x4, qp_div_6_32x4);
110 q2_32x4 = vshlq_s32(q2_32x4, qp_div_6_32x4);
111 q3_32x4 = vshlq_s32(q3_32x4, qp_div_6_32x4);
112
113 q0_16x4 = vqshrn_n_s32(q0_32x4, 4);
114 q1_16x4 = vqshrn_n_s32(q1_32x4, 4);
115 q2_16x4 = vqshrn_n_s32(q2_32x4, 4);
116 q3_16x4 = vqshrn_n_s32(q3_32x4, 4);
117
118 if(i4_iq_start_idx == 1)
119 {
120 q0_16x4 = vset_lane_s16(pi2_dc_src[0], q0_16x4, 0);
121 }
122
123 rq1_16x4 = vshr_n_s16(q1_16x4, 1);
124 rq3_16x4 = vshr_n_s16(q3_16x4, 1);
125
126 x0_16x4 = vadd_s16(q0_16x4, q2_16x4);
127 x1_16x4 = vsub_s16(q0_16x4, q2_16x4);
128 x2_16x4 = vsub_s16(rq1_16x4, q3_16x4);
129 x3_16x4 = vadd_s16(q1_16x4, rq3_16x4);
130
131 xx0_16x4 = vadd_s16(x0_16x4, x3_16x4);
132 xx1_16x4 = vadd_s16(x1_16x4, x2_16x4);
133 xx2_16x4 = vsub_s16(x1_16x4, x2_16x4);
134 xx3_16x4 = vsub_s16(x0_16x4, x3_16x4);
135
136 /* row 0 to row 3 */
137 xx0_16x4x2 = vtrn_s16(xx0_16x4, xx1_16x4);
138 xx1_16x4x2 = vtrn_s16(xx2_16x4, xx3_16x4);
139 x0_32x2x2 =
140 vzip_s32(vreinterpret_s32_s16(xx0_16x4x2.val[0]), vreinterpret_s32_s16(xx1_16x4x2.val[0]));
141 x1_32x2x2 =
142 vzip_s32(vreinterpret_s32_s16(xx0_16x4x2.val[1]), vreinterpret_s32_s16(xx1_16x4x2.val[1]));
143
144 x0_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[0]);
145 x1_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[0]);
146 x2_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[1]);
147 x3_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[1]);
148
149 /* Store Horz transform output into temp */
150 vst1_s16(pi2_tmp, x0_16x4);
151 vst1_s16(pi2_tmp + 4, x1_16x4);
152 vst1_s16(pi2_tmp + 8, x2_16x4);
153 vst1_s16(pi2_tmp + 12, x3_16x4);
154
155 /* vertical inverse transform */
156 rq1_16x4 = vshr_n_s16(x1_16x4, 1);
157 rq3_16x4 = vshr_n_s16(x3_16x4, 1);
158
159 xx0_16x4 = vadd_s16(x0_16x4, x2_16x4);
160 xx1_16x4 = vsub_s16(x0_16x4, x2_16x4);
161 xx2_16x4 = vsub_s16(rq1_16x4, x3_16x4);
162 xx3_16x4 = vadd_s16(x1_16x4, rq3_16x4);
163
164 x0_16x4 = vadd_s16(xx0_16x4, xx3_16x4);
165 x1_16x4 = vadd_s16(xx1_16x4, xx2_16x4);
166 x2_16x4 = vsub_s16(xx1_16x4, xx2_16x4);
167 x3_16x4 = vsub_s16(xx0_16x4, xx3_16x4);
168
169 x0_16x4 = vrshr_n_s16(x0_16x4, 6);
170 x1_16x4 = vrshr_n_s16(x1_16x4, 6);
171 x2_16x4 = vrshr_n_s16(x2_16x4, 6);
172 x3_16x4 = vrshr_n_s16(x3_16x4, 6);
173
174 resd01_in = vcombine_s16(x0_16x4, x1_16x4);
175 resd23_in = vcombine_s16(x2_16x4, x3_16x4);
176
177 /* Saturate all values < -255 to -255 and retain the rest as it is */
178 resd01_in = vmaxq_s16(resd01_in, neg_255_16x8);
179 resd23_in = vmaxq_s16(resd23_in, neg_255_16x8);
180
181 /* Saturate all values > 255 to 255 and retain the rest as it is */
182 resd01_in = vminq_s16(resd01_in, pos_255_16x8);
183 resd23_in = vminq_s16(resd23_in, pos_255_16x8);
184
185 /* Load pred */
186 pred0_in = vld1_u8((uint8_t *) pu1_pred);
187 pred1_in = vld1_u8((uint8_t *) pu1_pred + (i4_pred_stride));
188 pred2_in = vld1_u8((uint8_t *) pu1_pred + (i4_pred_stride << 1));
189 pred3_in = vld1_u8((uint8_t *) pu1_pred + (i4_pred_stride * 3));
190
191 pred0 = vreinterpretq_s16_u16(vmovl_u8(pred0_in));
192 pred1 = vreinterpretq_s16_u16(vmovl_u8(pred1_in));
193 pred2 = vreinterpretq_s16_u16(vmovl_u8(pred2_in));
194 pred3 = vreinterpretq_s16_u16(vmovl_u8(pred3_in));
195
196 pred01_in = vcombine_s16(vget_low_s16(pred0), vget_low_s16(pred1));
197 pred23_in = vcombine_s16(vget_low_s16(pred2), vget_low_s16(pred3));
198
199 /* Out pixel = pred + res */
200 pred01_in = vaddq_s16(pred01_in, resd01_in);
201 pred23_in = vaddq_s16(pred23_in, resd23_in);
202
203 /* Convert to 8 bit unsigned with saturation */
204 pred01_un = vqmovun_s16(pred01_in);
205 pred23_un = vqmovun_s16(pred23_in);
206
207 vst1_lane_u32((uint32_t *) (pu1_out), vreinterpret_u32_u8(pred01_un), 0);
208 vst1_lane_u32((uint32_t *) (pu1_out + i4_out_stride), vreinterpret_u32_u8(pred01_un), 1);
209 vst1_lane_u32((uint32_t *) (pu1_out + (i4_out_stride << 1)), vreinterpret_u32_u8(pred23_un), 0);
210 vst1_lane_u32((uint32_t *) (pu1_out + ((i4_out_stride << 1) + i4_out_stride)),
211 vreinterpret_u32_u8(pred23_un), 1);
212 }
213
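/* Same as isvc_iquant_itrans_recon_4x4_neon, but additionally writes the
 * clamped 4x4 residue to ps_res (stride i4_res_stride) before it is added to
 * the prediction to form the reconstructed block. */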
214 void isvc_iquant_itrans_recon_4x4_with_res_output_neon(
215 buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_res_pred,
216 buffer_container_t *ps_res, buffer_container_t *ps_rec,
217 iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants, WORD16 *pi2_tmp, WORD16 *pi2_dc_src,
218 WORD32 i4_iq_start_idx, UWORD8 u1_res_accumulate)
219 {
220 WORD16 *pi2_src = (WORD16 *) ps_src->pv_data;
221 WORD16 *pi2_res = (WORD16 *) ps_res->pv_data;
222 UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
223 UWORD8 *pu1_out = (UWORD8 *) ps_rec->pv_data;
224 WORD32 i4_res_stride = ps_res->i4_data_stride;
225 WORD32 i4_pred_stride = ps_pred->i4_data_stride;
226 WORD32 i4_out_stride = ps_rec->i4_data_stride;
227 const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat;
228 const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat;
229 UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6;
230
231 int16x4x4_t src_16x4x2;
232 int16x4x4_t iscal_16x4x2;
233 int16x4x4_t weigh_16x4x2;
234
235 int16x4_t q0_16x4, q1_16x4, q2_16x4, q3_16x4;
236 int32x4_t q0_32x4, q1_32x4, q2_32x4, q3_32x4;
237 int16x4_t rq1_16x4, rq3_16x4;
238 int16x4_t x0_16x4, x1_16x4, x2_16x4, x3_16x4;
239 int16x4_t xx0_16x4, xx1_16x4, xx2_16x4, xx3_16x4;
240 int16x4x2_t xx0_16x4x2, xx1_16x4x2;
241 int32x2x2_t x0_32x2x2, x1_32x2x2;
242 int16x4_t weigh0_16x4, weigh1_16x4, weigh2_16x4, weigh3_16x4;
243
244 uint8x8_t pred0_in, pred1_in, pred2_in, pred3_in;
245 int16x8_t pred0, pred1, pred2, pred3;
246 int16x8_t resd01_in, resd23_in;
247 int16x8_t pred01_in, pred23_in;
248 uint8x8_t pred01_un, pred23_un;
249
250 int16x4_t pos_255_16x4 = vdup_n_s16(((WORD16) UINT8_MAX));
251 int16x4_t neg_255_16x4 = vdup_n_s16(-((WORD16) UINT8_MAX));
252 int32x4_t qp_div_6_32x4 = vdupq_n_s32(u4_qp_div_6);
253
254 WORD16 rnd_factor = (u4_qp_div_6 < 4) ? 1 << (3 - u4_qp_div_6) : 0;
255 int32x4_t rnd_fact = vdupq_n_s32(rnd_factor);
256
257 UNUSED(ps_res_pred);
258 UNUSED(u1_res_accumulate);
259
260 src_16x4x2 = vld4_s16(pi2_src);
261 iscal_16x4x2 = vld4_s16((const int16_t *) pu2_iscal_mat);
262 weigh_16x4x2 = vld4_s16((const int16_t *) pu2_weigh_mat);
263
264 weigh0_16x4 = vmul_s16(weigh_16x4x2.val[0], iscal_16x4x2.val[0]);
265 weigh1_16x4 = vmul_s16(weigh_16x4x2.val[1], iscal_16x4x2.val[1]);
266 weigh2_16x4 = vmul_s16(weigh_16x4x2.val[2], iscal_16x4x2.val[2]);
267 weigh3_16x4 = vmul_s16(weigh_16x4x2.val[3], iscal_16x4x2.val[3]);
268
269 q0_32x4 = vmull_s16(weigh0_16x4, src_16x4x2.val[0]);
270 q1_32x4 = vmull_s16(weigh1_16x4, src_16x4x2.val[1]);
271 q2_32x4 = vmull_s16(weigh2_16x4, src_16x4x2.val[2]);
272 q3_32x4 = vmull_s16(weigh3_16x4, src_16x4x2.val[3]);
273
274 q0_32x4 = vaddq_s32(q0_32x4, rnd_fact);
275 q1_32x4 = vaddq_s32(q1_32x4, rnd_fact);
276 q2_32x4 = vaddq_s32(q2_32x4, rnd_fact);
277 q3_32x4 = vaddq_s32(q3_32x4, rnd_fact);
278
279 q0_32x4 = vshlq_s32(q0_32x4, qp_div_6_32x4);
280 q1_32x4 = vshlq_s32(q1_32x4, qp_div_6_32x4);
281 q2_32x4 = vshlq_s32(q2_32x4, qp_div_6_32x4);
282 q3_32x4 = vshlq_s32(q3_32x4, qp_div_6_32x4);
283
284 q0_16x4 = vqshrn_n_s32(q0_32x4, 4);
285 q1_16x4 = vqshrn_n_s32(q1_32x4, 4);
286 q2_16x4 = vqshrn_n_s32(q2_32x4, 4);
287 q3_16x4 = vqshrn_n_s32(q3_32x4, 4);
288
289 if(i4_iq_start_idx == 1)
290 {
291 q0_16x4 = vset_lane_s16(pi2_dc_src[0], q0_16x4, 0);
292 }
293
294 rq1_16x4 = vshr_n_s16(q1_16x4, 1);
295 rq3_16x4 = vshr_n_s16(q3_16x4, 1);
296
297 x0_16x4 = vadd_s16(q0_16x4, q2_16x4);
298 x1_16x4 = vsub_s16(q0_16x4, q2_16x4);
299 x2_16x4 = vsub_s16(rq1_16x4, q3_16x4);
300 x3_16x4 = vadd_s16(q1_16x4, rq3_16x4);
301
302 xx0_16x4 = vadd_s16(x0_16x4, x3_16x4);
303 xx1_16x4 = vadd_s16(x1_16x4, x2_16x4);
304 xx2_16x4 = vsub_s16(x1_16x4, x2_16x4);
305 xx3_16x4 = vsub_s16(x0_16x4, x3_16x4);
306
307 /* row 0 to row 3 */
308 xx0_16x4x2 = vtrn_s16(xx0_16x4, xx1_16x4);
309 xx1_16x4x2 = vtrn_s16(xx2_16x4, xx3_16x4);
310 x0_32x2x2 =
311 vzip_s32(vreinterpret_s32_s16(xx0_16x4x2.val[0]), vreinterpret_s32_s16(xx1_16x4x2.val[0]));
312 x1_32x2x2 =
313 vzip_s32(vreinterpret_s32_s16(xx0_16x4x2.val[1]), vreinterpret_s32_s16(xx1_16x4x2.val[1]));
314
315 x0_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[0]);
316 x1_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[0]);
317 x2_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[1]);
318 x3_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[1]);
319
320 /* Store Horz transform output into temp */
321 vst1_s16(pi2_tmp, x0_16x4);
322 vst1_s16(pi2_tmp + 4, x1_16x4);
323 vst1_s16(pi2_tmp + 8, x2_16x4);
324 vst1_s16(pi2_tmp + 12, x3_16x4);
325
326 /* vertical inverse transform */
327 rq1_16x4 = vshr_n_s16(x1_16x4, 1);
328 rq3_16x4 = vshr_n_s16(x3_16x4, 1);
329
330 xx0_16x4 = vadd_s16(x0_16x4, x2_16x4);
331 xx1_16x4 = vsub_s16(x0_16x4, x2_16x4);
332 xx2_16x4 = vsub_s16(rq1_16x4, x3_16x4);
333 xx3_16x4 = vadd_s16(x1_16x4, rq3_16x4);
334
335 x0_16x4 = vadd_s16(xx0_16x4, xx3_16x4);
336 x1_16x4 = vadd_s16(xx1_16x4, xx2_16x4);
337 x2_16x4 = vsub_s16(xx1_16x4, xx2_16x4);
338 x3_16x4 = vsub_s16(xx0_16x4, xx3_16x4);
339
340 x0_16x4 = vrshr_n_s16(x0_16x4, 6);
341 x1_16x4 = vrshr_n_s16(x1_16x4, 6);
342 x2_16x4 = vrshr_n_s16(x2_16x4, 6);
343 x3_16x4 = vrshr_n_s16(x3_16x4, 6);
344
345 /* Saturate all values < -255 to -255 and retain the rest as it is */
346 x0_16x4 = vmax_s16(x0_16x4, neg_255_16x4);
347 x1_16x4 = vmax_s16(x1_16x4, neg_255_16x4);
348 x2_16x4 = vmax_s16(x2_16x4, neg_255_16x4);
349 x3_16x4 = vmax_s16(x3_16x4, neg_255_16x4);
350
351 /* Saturate all values > 255 to 255 and retain the rest as it is */
352 x0_16x4 = vmin_s16(x0_16x4, pos_255_16x4);
353 x1_16x4 = vmin_s16(x1_16x4, pos_255_16x4);
354 x2_16x4 = vmin_s16(x2_16x4, pos_255_16x4);
355 x3_16x4 = vmin_s16(x3_16x4, pos_255_16x4);
356
357 vst1_s16(pi2_res, x0_16x4);
358 vst1_s16(pi2_res + i4_res_stride, x1_16x4);
359 vst1_s16(pi2_res + (i4_res_stride << 1), x2_16x4);
360 vst1_s16(pi2_res + (i4_res_stride << 1) + i4_res_stride, x3_16x4);
361
362 resd01_in = vcombine_s16(x0_16x4, x1_16x4);
363 resd23_in = vcombine_s16(x2_16x4, x3_16x4);
364
365 /* Load pred */
366 pred0_in = vld1_u8((uint8_t *) pu1_pred);
367 pred1_in = vld1_u8((uint8_t *) pu1_pred + (i4_pred_stride));
368 pred2_in = vld1_u8((uint8_t *) pu1_pred + (i4_pred_stride << 1));
369 pred3_in = vld1_u8((uint8_t *) pu1_pred + (i4_pred_stride * 3));
370
371 pred0 = vreinterpretq_s16_u16(vmovl_u8(pred0_in));
372 pred1 = vreinterpretq_s16_u16(vmovl_u8(pred1_in));
373 pred2 = vreinterpretq_s16_u16(vmovl_u8(pred2_in));
374 pred3 = vreinterpretq_s16_u16(vmovl_u8(pred3_in));
375
376 pred01_in = vcombine_s16(vget_low_s16(pred0), vget_low_s16(pred1));
377 pred23_in = vcombine_s16(vget_low_s16(pred2), vget_low_s16(pred3));
378
379 /* Out pixel = pred + res */
380 pred01_in = vaddq_s16(pred01_in, resd01_in);
381 pred23_in = vaddq_s16(pred23_in, resd23_in);
382
383 /* Convert to 8 bit unsigned with saturation */
384 pred01_un = vqmovun_s16(pred01_in);
385 pred23_un = vqmovun_s16(pred23_in);
386
387 vst1_lane_u32((uint32_t *) (pu1_out), vreinterpret_u32_u8(pred01_un), 0);
388 vst1_lane_u32((uint32_t *) (pu1_out + i4_out_stride), vreinterpret_u32_u8(pred01_un), 1);
389 vst1_lane_u32((uint32_t *) (pu1_out + (i4_out_stride << 1)), vreinterpret_u32_u8(pred23_un), 0);
390 vst1_lane_u32((uint32_t *) (pu1_out + ((i4_out_stride << 1) + i4_out_stride)),
391 vreinterpret_u32_u8(pred23_un), 1);
392 }
393
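/* Residue-accumulating variant: the inverse-transformed residue is added to
 * the residue prediction from ps_res_pred, clamped to [-255, 255], written to
 * ps_res, and then added to the prediction to produce the reconstructed 4x4
 * block in ps_rec. */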
394 void isvc_iquant_itrans_recon_4x4_with_res_accumulate_neon(
395 buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_res_pred,
396 buffer_container_t *ps_res, buffer_container_t *ps_rec,
397 iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants, WORD16 *pi2_tmp, WORD16 *pi2_dc_src,
398 WORD32 i4_iq_start_idx, UWORD8 u1_res_accumulate)
399 {
400 WORD16 *pi2_src = (WORD16 *) ps_src->pv_data;
401 WORD16 *pi2_res = (WORD16 *) ps_res->pv_data;
402 WORD16 *pi2_res_pred = (WORD16 *) ps_res_pred->pv_data;
403 UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
404 UWORD8 *pu1_out = (UWORD8 *) ps_rec->pv_data;
405 WORD32 i4_res_stride = ps_res->i4_data_stride;
406 WORD32 i4_res_pred_stride = ps_res_pred->i4_data_stride;
407 WORD32 i4_pred_stride = ps_pred->i4_data_stride;
408 WORD32 i4_out_stride = ps_rec->i4_data_stride;
409 const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat;
410 const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat;
411 UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6;
412
413 int16x4x4_t src_16x4x2;
414 int16x4x4_t iscal_16x4x2;
415 int16x4x4_t weigh_16x4x2;
416
417 int16x4_t q0_16x4, q1_16x4, q2_16x4, q3_16x4;
418 int32x4_t q0_32x4, q1_32x4, q2_32x4, q3_32x4;
419 int16x4_t rq1_16x4, rq3_16x4;
420 int16x4_t x0_16x4, x1_16x4, x2_16x4, x3_16x4;
421 int16x4_t xx0_16x4, xx1_16x4, xx2_16x4, xx3_16x4;
422 int16x4x2_t xx0_16x4x2, xx1_16x4x2;
423 int32x2x2_t x0_32x2x2, x1_32x2x2;
424 int16x4_t weigh0_16x4, weigh1_16x4, weigh2_16x4, weigh3_16x4;
425
426 uint8x8_t pred0_in, pred1_in, pred2_in, pred3_in;
427 int16x8_t pred0, pred1, pred2, pred3;
428 int16x4_t resd0_in, resd1_in, resd2_in, resd3_in;
429 int16x8_t resd01_in, resd23_in;
430 int16x8_t pred01_in, pred23_in;
431 uint8x8_t pred01_un, pred23_un;
432
433 int32x4_t qp_div_6_32x4 = vdupq_n_s32(u4_qp_div_6);
434
435 WORD16 rnd_factor = (u4_qp_div_6 < 4) ? 1 << (3 - u4_qp_div_6) : 0;
436 int32x4_t rnd_fact = vdupq_n_s32(rnd_factor);
437 int16x4_t pos_255 = vdup_n_s16(((WORD16) UINT8_MAX));
438 int16x4_t neg_255 = vdup_n_s16(-((WORD16) UINT8_MAX));
439
440 UNUSED(u1_res_accumulate);
441
442 src_16x4x2 = vld4_s16(pi2_src);
443 iscal_16x4x2 = vld4_s16((const int16_t *) pu2_iscal_mat);
444 weigh_16x4x2 = vld4_s16((const int16_t *) pu2_weigh_mat);
445
446 weigh0_16x4 = vmul_s16(weigh_16x4x2.val[0], iscal_16x4x2.val[0]);
447 weigh1_16x4 = vmul_s16(weigh_16x4x2.val[1], iscal_16x4x2.val[1]);
448 weigh2_16x4 = vmul_s16(weigh_16x4x2.val[2], iscal_16x4x2.val[2]);
449 weigh3_16x4 = vmul_s16(weigh_16x4x2.val[3], iscal_16x4x2.val[3]);
450
451 q0_32x4 = vmull_s16(weigh0_16x4, src_16x4x2.val[0]);
452 q1_32x4 = vmull_s16(weigh1_16x4, src_16x4x2.val[1]);
453 q2_32x4 = vmull_s16(weigh2_16x4, src_16x4x2.val[2]);
454 q3_32x4 = vmull_s16(weigh3_16x4, src_16x4x2.val[3]);
455
456 q0_32x4 = vaddq_s32(q0_32x4, rnd_fact);
457 q1_32x4 = vaddq_s32(q1_32x4, rnd_fact);
458 q2_32x4 = vaddq_s32(q2_32x4, rnd_fact);
459 q3_32x4 = vaddq_s32(q3_32x4, rnd_fact);
460
461 q0_32x4 = vshlq_s32(q0_32x4, qp_div_6_32x4);
462 q1_32x4 = vshlq_s32(q1_32x4, qp_div_6_32x4);
463 q2_32x4 = vshlq_s32(q2_32x4, qp_div_6_32x4);
464 q3_32x4 = vshlq_s32(q3_32x4, qp_div_6_32x4);
465
466 q0_16x4 = vqshrn_n_s32(q0_32x4, 4);
467 q1_16x4 = vqshrn_n_s32(q1_32x4, 4);
468 q2_16x4 = vqshrn_n_s32(q2_32x4, 4);
469 q3_16x4 = vqshrn_n_s32(q3_32x4, 4);
470
471 if(i4_iq_start_idx == 1)
472 {
473 q0_16x4 = vset_lane_s16(pi2_dc_src[0], q0_16x4, 0);
474 }
475
476 rq1_16x4 = vshr_n_s16(q1_16x4, 1);
477 rq3_16x4 = vshr_n_s16(q3_16x4, 1);
478
479 x0_16x4 = vadd_s16(q0_16x4, q2_16x4);
480 x1_16x4 = vsub_s16(q0_16x4, q2_16x4);
481 x2_16x4 = vsub_s16(rq1_16x4, q3_16x4);
482 x3_16x4 = vadd_s16(q1_16x4, rq3_16x4);
483
484 xx0_16x4 = vadd_s16(x0_16x4, x3_16x4);
485 xx1_16x4 = vadd_s16(x1_16x4, x2_16x4);
486 xx2_16x4 = vsub_s16(x1_16x4, x2_16x4);
487 xx3_16x4 = vsub_s16(x0_16x4, x3_16x4);
488
489 /* row 0 to row 3 */
490 xx0_16x4x2 = vtrn_s16(xx0_16x4, xx1_16x4);
491 xx1_16x4x2 = vtrn_s16(xx2_16x4, xx3_16x4);
492 x0_32x2x2 =
493 vzip_s32(vreinterpret_s32_s16(xx0_16x4x2.val[0]), vreinterpret_s32_s16(xx1_16x4x2.val[0]));
494 x1_32x2x2 =
495 vzip_s32(vreinterpret_s32_s16(xx0_16x4x2.val[1]), vreinterpret_s32_s16(xx1_16x4x2.val[1]));
496
497 x0_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[0]);
498 x1_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[0]);
499 x2_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[1]);
500 x3_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[1]);
501
502 /* Store Horz transform output into temp */
503 vst1_s16(pi2_tmp, x0_16x4);
504 vst1_s16(pi2_tmp + 4, x1_16x4);
505 vst1_s16(pi2_tmp + 8, x2_16x4);
506 vst1_s16(pi2_tmp + 12, x3_16x4);
507
508 /* vertical inverse transform */
509 rq1_16x4 = vshr_n_s16(x1_16x4, 1);
510 rq3_16x4 = vshr_n_s16(x3_16x4, 1);
511
512 xx0_16x4 = vadd_s16(x0_16x4, x2_16x4);
513 xx1_16x4 = vsub_s16(x0_16x4, x2_16x4);
514 xx2_16x4 = vsub_s16(rq1_16x4, x3_16x4);
515 xx3_16x4 = vadd_s16(x1_16x4, rq3_16x4);
516
517 x0_16x4 = vadd_s16(xx0_16x4, xx3_16x4);
518 x1_16x4 = vadd_s16(xx1_16x4, xx2_16x4);
519 x2_16x4 = vsub_s16(xx1_16x4, xx2_16x4);
520 x3_16x4 = vsub_s16(xx0_16x4, xx3_16x4);
521
522 x0_16x4 = vrshr_n_s16(x0_16x4, 6);
523 x1_16x4 = vrshr_n_s16(x1_16x4, 6);
524 x2_16x4 = vrshr_n_s16(x2_16x4, 6);
525 x3_16x4 = vrshr_n_s16(x3_16x4, 6);
526
527 /* Accumulating Res */
528
529 /* Load Res pred */
530 resd0_in = vld1_s16((int16_t *) pi2_res_pred);
531 resd1_in = vld1_s16((int16_t *) pi2_res_pred + i4_res_pred_stride);
532 resd2_in = vld1_s16((int16_t *) pi2_res_pred + (i4_res_pred_stride * 2));
533 resd3_in = vld1_s16((int16_t *) pi2_res_pred + (i4_res_pred_stride * 3));
534
535 /* Add res pred to the res obtained */
536 resd0_in = vadd_s16(resd0_in, x0_16x4);
537 resd1_in = vadd_s16(resd1_in, x1_16x4);
538 resd2_in = vadd_s16(resd2_in, x2_16x4);
539 resd3_in = vadd_s16(resd3_in, x3_16x4);
540
541 /* Saturate all values < -255 to -255 and retain the rest as it is */
542 resd0_in = vmax_s16(resd0_in, neg_255);
543 resd1_in = vmax_s16(resd1_in, neg_255);
544 resd2_in = vmax_s16(resd2_in, neg_255);
545 resd3_in = vmax_s16(resd3_in, neg_255);
546
547 /* Saturate all values > 255 to 255 and retain the rest as it is */
548 resd0_in = vmin_s16(resd0_in, pos_255);
549 resd1_in = vmin_s16(resd1_in, pos_255);
550 resd2_in = vmin_s16(resd2_in, pos_255);
551 resd3_in = vmin_s16(resd3_in, pos_255);
552
553 vst1_s16(pi2_res, resd0_in);
554 vst1_s16(pi2_res + i4_res_stride, resd1_in);
555 vst1_s16(pi2_res + (i4_res_stride << 1), resd2_in);
556 vst1_s16(pi2_res + (i4_res_stride << 1) + i4_res_stride, resd3_in);
557
558 resd01_in = vcombine_s16(resd0_in, resd1_in);
559 resd23_in = vcombine_s16(resd2_in, resd3_in);
560
561 /* Load pred */
562 pred0_in = vld1_u8((uint8_t *) pu1_pred);
563 pred1_in = vld1_u8((uint8_t *) pu1_pred + (i4_pred_stride));
564 pred2_in = vld1_u8((uint8_t *) pu1_pred + (i4_pred_stride << 1));
565 pred3_in = vld1_u8((uint8_t *) pu1_pred + (i4_pred_stride * 3));
566
567 pred0 = vreinterpretq_s16_u16(vmovl_u8(pred0_in));
568 pred1 = vreinterpretq_s16_u16(vmovl_u8(pred1_in));
569 pred2 = vreinterpretq_s16_u16(vmovl_u8(pred2_in));
570 pred3 = vreinterpretq_s16_u16(vmovl_u8(pred3_in));
571
572 pred01_in = vcombine_s16(vget_low_s16(pred0), vget_low_s16(pred1));
573 pred23_in = vcombine_s16(vget_low_s16(pred2), vget_low_s16(pred3));
574
575 /* Out pixel = pred + res */
576 pred01_in = vaddq_s16(pred01_in, resd01_in);
577 pred23_in = vaddq_s16(pred23_in, resd23_in);
578
579 /* Convert to 8 bit unsigned with saturation */
580 pred01_un = vqmovun_s16(pred01_in);
581 pred23_un = vqmovun_s16(pred23_in);
582
583 vst1_lane_u32((uint32_t *) (pu1_out), vreinterpret_u32_u8(pred01_un), 0);
584 vst1_lane_u32((uint32_t *) (pu1_out + i4_out_stride), vreinterpret_u32_u8(pred01_un), 1);
585 vst1_lane_u32((uint32_t *) (pu1_out + (i4_out_stride << 1)), vreinterpret_u32_u8(pred23_un), 0);
586 vst1_lane_u32((uint32_t *) (pu1_out + ((i4_out_stride << 1) + i4_out_stride)),
587 vreinterpret_u32_u8(pred23_un), 1);
588 }
589
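/* Chroma variant with residue output: the DC coefficient is always taken from
 * pi2_dc_src, and the clamped residue and the reconstructed samples are
 * written only to alternate positions of ps_res and ps_rec (one plane of the
 * interleaved chroma buffers), leaving the other plane untouched. */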
590 void isvc_iquant_itrans_recon_chroma_4x4_with_res_output_neon(
591 buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_res_pred,
592 buffer_container_t *ps_res, buffer_container_t *ps_rec,
593 iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants, WORD16 *pi2_tmp, WORD16 *pi2_dc_src,
594 WORD32 i4_iq_start_idx, UWORD8 u1_res_accumulate)
595 {
596 WORD16 *pi2_src = (WORD16 *) ps_src->pv_data;
597 WORD16 *pi2_res = (WORD16 *) ps_res->pv_data;
598 UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
599 UWORD8 *pu1_out = (UWORD8 *) ps_rec->pv_data;
600 WORD32 i4_res_stride = ps_res->i4_data_stride;
601 WORD32 i4_pred_stride = ps_pred->i4_data_stride;
602 WORD32 i4_out_stride = ps_rec->i4_data_stride;
603 const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat;
604 const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat;
605 UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6;
606
607 WORD16 i2_rnd_factor = (u4_qp_div_6 < 4) ? 1 << (3 - u4_qp_div_6) : 0;
608
609 int16x4x4_t src_16x4x2;
610 int16x4x4_t iscal_16x4x2;
611 int16x4x4_t weigh_16x4x2;
612
613 int16x4_t q0_16x4, q1_16x4, q2_16x4, q3_16x4;
614 int32x4_t q0_32x4, q1_32x4, q2_32x4, q3_32x4;
615 int16x4_t rq1_16x4, rq3_16x4;
616 int16x4_t x0_16x4, x1_16x4, x2_16x4, x3_16x4;
617 int16x8_t x0_16x8, x1_16x8, x2_16x8, x3_16x8;
618 int16x4_t xx0_16x4, xx1_16x4, xx2_16x4, xx3_16x4;
619 int16x4x2_t xx0_16x4x2, xx1_16x4x2;
620 int32x2x2_t x0_32x2x2, x1_32x2x2;
621 int16x4_t weigh0_16x4, weigh1_16x4, weigh2_16x4, weigh3_16x4;
622
623 uint8x8_t pred0_in, pred1_in, pred2_in, pred3_in;
624 int16x8_t pred0, pred1, pred2, pred3;
625 int16x8_t rec0, rec1, rec2, rec3;
626 uint8x8_t rec0_un, rec1_un, rec2_un, rec3_un;
627 uint8x8_t out0, out1, out2, out3;
628 int16x8_t resout0, resout1, resout2, resout3;
629
630 uint8x8_t chroma_mask_8x8 = vreinterpret_u8_u16(vdup_n_u16(0x00ff));
631 uint16x8_t chroma_mask_16x8 = {0xffff, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000};
632 int32x4_t qp_div_6_32x4 = vdupq_n_s32(u4_qp_div_6);
633 int32x4_t rnd_fact = vdupq_n_s32(i2_rnd_factor);
634 int16x4_t pos_255_16x4 = vdup_n_s16(((WORD16) UINT8_MAX));
635 int16x4_t neg_255_16x4 = vdup_n_s16(-((WORD16) UINT8_MAX));
636
637 UNUSED(i4_iq_start_idx);
638 UNUSED(ps_res_pred);
639 UNUSED(u1_res_accumulate);
640
641 src_16x4x2 = vld4_s16(pi2_src);
642 iscal_16x4x2 = vld4_s16((const int16_t *) pu2_iscal_mat);
643 weigh_16x4x2 = vld4_s16((const int16_t *) pu2_weigh_mat);
644
645 weigh0_16x4 = vmul_s16(weigh_16x4x2.val[0], iscal_16x4x2.val[0]);
646 weigh1_16x4 = vmul_s16(weigh_16x4x2.val[1], iscal_16x4x2.val[1]);
647 weigh2_16x4 = vmul_s16(weigh_16x4x2.val[2], iscal_16x4x2.val[2]);
648 weigh3_16x4 = vmul_s16(weigh_16x4x2.val[3], iscal_16x4x2.val[3]);
649
650 q0_32x4 = vmull_s16(weigh0_16x4, src_16x4x2.val[0]);
651 q1_32x4 = vmull_s16(weigh1_16x4, src_16x4x2.val[1]);
652 q2_32x4 = vmull_s16(weigh2_16x4, src_16x4x2.val[2]);
653 q3_32x4 = vmull_s16(weigh3_16x4, src_16x4x2.val[3]);
654
655 q0_32x4 = vaddq_s32(q0_32x4, rnd_fact);
656 q1_32x4 = vaddq_s32(q1_32x4, rnd_fact);
657 q2_32x4 = vaddq_s32(q2_32x4, rnd_fact);
658 q3_32x4 = vaddq_s32(q3_32x4, rnd_fact);
659
660 q0_32x4 = vshlq_s32(q0_32x4, qp_div_6_32x4);
661 q1_32x4 = vshlq_s32(q1_32x4, qp_div_6_32x4);
662 q2_32x4 = vshlq_s32(q2_32x4, qp_div_6_32x4);
663 q3_32x4 = vshlq_s32(q3_32x4, qp_div_6_32x4);
664
665 q0_16x4 = vqshrn_n_s32(q0_32x4, 4);
666 q1_16x4 = vqshrn_n_s32(q1_32x4, 4);
667 q2_16x4 = vqshrn_n_s32(q2_32x4, 4);
668 q3_16x4 = vqshrn_n_s32(q3_32x4, 4);
669
670 q0_16x4 = vset_lane_s16(pi2_dc_src[0], q0_16x4, 0);
671
672 rq1_16x4 = vshr_n_s16(q1_16x4, 1);
673 rq3_16x4 = vshr_n_s16(q3_16x4, 1);
674
675 x0_16x4 = vadd_s16(q0_16x4, q2_16x4);
676 x1_16x4 = vsub_s16(q0_16x4, q2_16x4);
677 x2_16x4 = vsub_s16(rq1_16x4, q3_16x4);
678 x3_16x4 = vadd_s16(q1_16x4, rq3_16x4);
679
680 xx0_16x4 = vadd_s16(x0_16x4, x3_16x4);
681 xx1_16x4 = vadd_s16(x1_16x4, x2_16x4);
682 xx2_16x4 = vsub_s16(x1_16x4, x2_16x4);
683 xx3_16x4 = vsub_s16(x0_16x4, x3_16x4);
684
685 /* row 0 to row 3 */
686 xx0_16x4x2 = vtrn_s16(xx0_16x4, xx1_16x4);
687 xx1_16x4x2 = vtrn_s16(xx2_16x4, xx3_16x4);
688 x0_32x2x2 =
689 vzip_s32(vreinterpret_s32_s16(xx0_16x4x2.val[0]), vreinterpret_s32_s16(xx1_16x4x2.val[0]));
690 x1_32x2x2 =
691 vzip_s32(vreinterpret_s32_s16(xx0_16x4x2.val[1]), vreinterpret_s32_s16(xx1_16x4x2.val[1]));
692
693 x0_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[0]);
694 x1_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[0]);
695 x2_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[1]);
696 x3_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[1]);
697
698 /* Store Horz transform output into temp */
699 vst1_s16(pi2_tmp, x0_16x4);
700 vst1_s16(pi2_tmp + 4, x1_16x4);
701 vst1_s16(pi2_tmp + 8, x2_16x4);
702 vst1_s16(pi2_tmp + 12, x3_16x4);
703
704 /* vertical inverse transform */
705 rq1_16x4 = vshr_n_s16(x1_16x4, 1);
706 rq3_16x4 = vshr_n_s16(x3_16x4, 1);
707
708 xx0_16x4 = vadd_s16(x0_16x4, x2_16x4);
709 xx1_16x4 = vsub_s16(x0_16x4, x2_16x4);
710 xx2_16x4 = vsub_s16(rq1_16x4, x3_16x4);
711 xx3_16x4 = vadd_s16(x1_16x4, rq3_16x4);
712
713 x0_16x4 = vadd_s16(xx0_16x4, xx3_16x4);
714 x1_16x4 = vadd_s16(xx1_16x4, xx2_16x4);
715 x2_16x4 = vsub_s16(xx1_16x4, xx2_16x4);
716 x3_16x4 = vsub_s16(xx0_16x4, xx3_16x4);
717
718 x0_16x4 = vrshr_n_s16(x0_16x4, 6);
719 x1_16x4 = vrshr_n_s16(x1_16x4, 6);
720 x2_16x4 = vrshr_n_s16(x2_16x4, 6);
721 x3_16x4 = vrshr_n_s16(x3_16x4, 6);
722
723 /* Saturate all values < -255 to -255 and retain the rest as it is */
724 x0_16x4 = vmax_s16(x0_16x4, neg_255_16x4);
725 x1_16x4 = vmax_s16(x1_16x4, neg_255_16x4);
726 x2_16x4 = vmax_s16(x2_16x4, neg_255_16x4);
727 x3_16x4 = vmax_s16(x3_16x4, neg_255_16x4);
728
729 /* Saturate all values > 255 to 255 and retain the rest as it is */
730 x0_16x4 = vmin_s16(x0_16x4, pos_255_16x4);
731 x1_16x4 = vmin_s16(x1_16x4, pos_255_16x4);
732 x2_16x4 = vmin_s16(x2_16x4, pos_255_16x4);
733 x3_16x4 = vmin_s16(x3_16x4, pos_255_16x4);
734
735 resout0 = vld1q_s16(pi2_res);
736 resout1 = vld1q_s16(pi2_res + i4_res_stride);
737 resout2 = vld1q_s16(pi2_res + i4_res_stride * 2);
738 resout3 = vld1q_s16(pi2_res + i4_res_stride * 3);
739
740 x0_16x8 = vreinterpretq_s16_s32(vmovl_s16(x0_16x4));
741 x1_16x8 = vreinterpretq_s16_s32(vmovl_s16(x1_16x4));
742 x2_16x8 = vreinterpretq_s16_s32(vmovl_s16(x2_16x4));
743 x3_16x8 = vreinterpretq_s16_s32(vmovl_s16(x3_16x4));
744
745 /* Storing res in alternate positions */
746 resout0 = vbslq_s16(chroma_mask_16x8, x0_16x8, resout0);
747 resout1 = vbslq_s16(chroma_mask_16x8, x1_16x8, resout1);
748 resout2 = vbslq_s16(chroma_mask_16x8, x2_16x8, resout2);
749 resout3 = vbslq_s16(chroma_mask_16x8, x3_16x8, resout3);
750
751 vst1q_s16(pi2_res, resout0);
752 vst1q_s16(pi2_res + i4_res_stride, resout1);
753 vst1q_s16(pi2_res + (i4_res_stride << 1), resout2);
754 vst1q_s16(pi2_res + (i4_res_stride << 1) + i4_res_stride, resout3);
755
756 pred0_in = vld1_u8((uint8_t *) pu1_pred);
757 pred1_in = vld1_u8((uint8_t *) pu1_pred + (i4_pred_stride));
758 pred2_in = vld1_u8((uint8_t *) pu1_pred + (i4_pred_stride << 1));
759 pred3_in = vld1_u8((uint8_t *) pu1_pred + (i4_pred_stride * 3));
760
761 pred0 = vreinterpretq_s16_u16(vmovl_u8(pred0_in));
762 pred1 = vreinterpretq_s16_u16(vmovl_u8(pred1_in));
763 pred2 = vreinterpretq_s16_u16(vmovl_u8(pred2_in));
764 pred3 = vreinterpretq_s16_u16(vmovl_u8(pred3_in));
765
766 /* Out pixel = pred + res */
767 rec0 = vaddq_s16(pred0, x0_16x8);
768 rec1 = vaddq_s16(pred1, x1_16x8);
769 rec2 = vaddq_s16(pred2, x2_16x8);
770 rec3 = vaddq_s16(pred3, x3_16x8);
771
772 out0 = vld1_u8(pu1_out);
773 out1 = vld1_u8(pu1_out + i4_out_stride);
774 out2 = vld1_u8(pu1_out + i4_out_stride * 2);
775 out3 = vld1_u8(pu1_out + i4_out_stride * 3);
776
777 /* Convert to 8 bit unsigned with saturation */
778 rec0_un = vqmovun_s16(rec0);
779 rec1_un = vqmovun_s16(rec1);
780 rec2_un = vqmovun_s16(rec2);
781 rec3_un = vqmovun_s16(rec3);
782
783 /* Store output pixels in alternate positions */
784 out0 = vbsl_u8(chroma_mask_8x8, rec0_un, out0);
785 out1 = vbsl_u8(chroma_mask_8x8, rec1_un, out1);
786 out2 = vbsl_u8(chroma_mask_8x8, rec2_un, out2);
787 out3 = vbsl_u8(chroma_mask_8x8, rec3_un, out3);
788
789 vst1_u8((pu1_out), out0);
790 vst1_u8((pu1_out + i4_out_stride), out1);
791 vst1_u8((pu1_out + (i4_out_stride << 1)), out2);
792 vst1_u8((pu1_out + ((i4_out_stride << 1) + i4_out_stride)), out3);
793 }
794
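/* Chroma variant with residue accumulation: the residue prediction from
 * ps_res_pred is masked to the current chroma plane, added to the
 * inverse-transformed residue, clamped to [-255, 255], written to alternate
 * positions of ps_res, and then added to the prediction to reconstruct the
 * alternate pixels of ps_rec. */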
795 void isvc_iquant_itrans_recon_chroma_4x4_with_res_accumulate_neon(
796 buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_res_pred,
797 buffer_container_t *ps_res, buffer_container_t *ps_rec,
798 iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants, WORD16 *pi2_tmp, WORD16 *pi2_dc_src,
799 WORD32 i4_iq_start_idx, UWORD8 u1_res_accumulate)
800 {
801 WORD16 *pi2_src = (WORD16 *) ps_src->pv_data;
802 WORD16 *pi2_res = (WORD16 *) ps_res->pv_data;
803 WORD16 *pi2_res_pred = (WORD16 *) ps_res_pred->pv_data;
804 UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
805 UWORD8 *pu1_out = (UWORD8 *) ps_rec->pv_data;
806 WORD32 i4_res_stride = ps_res->i4_data_stride;
807 WORD32 i4_res_pred_stride = ps_res_pred->i4_data_stride;
808 WORD32 i4_pred_stride = ps_pred->i4_data_stride;
809 WORD32 i4_out_stride = ps_rec->i4_data_stride;
810 const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat;
811 const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat;
812 UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6;
813
814 WORD16 i2_rnd_factor = (u4_qp_div_6 < 4) ? 1 << (3 - u4_qp_div_6) : 0;
815
816 int16x4x4_t src_16x4x2;
817 int16x4x4_t iscal_16x4x2;
818 int16x4x4_t weigh_16x4x2;
819
820 int16x4_t q0_16x4, q1_16x4, q2_16x4, q3_16x4;
821 int32x4_t q0_32x4, q1_32x4, q2_32x4, q3_32x4;
822 int16x4_t rq1_16x4, rq3_16x4;
823 int16x4_t x0_16x4, x1_16x4, x2_16x4, x3_16x4;
824 int16x8_t x0_16x8, x1_16x8, x2_16x8, x3_16x8;
825 int16x4_t xx0_16x4, xx1_16x4, xx2_16x4, xx3_16x4;
826 int16x4x2_t xx0_16x4x2, xx1_16x4x2;
827 int32x2x2_t x0_32x2x2, x1_32x2x2;
828 int16x4_t weigh0_16x4, weigh1_16x4, weigh2_16x4, weigh3_16x4;
829
830 uint8x8_t pred0_in, pred1_in, pred2_in, pred3_in;
831 int16x8_t pred0, pred1, pred2, pred3;
832 int16x8_t rec0, rec1, rec2, rec3;
833 uint8x8_t rec0_un, rec1_un, rec2_un, rec3_un;
834 int16x8_t resd0_in, resd1_in, resd2_in, resd3_in;
835 int16x8_t resd1_in_mask, resd2_in_mask, resd3_in_mask;
836 uint8x8_t out0, out1, out2, out3;
837 int16x8_t resout0, resout1, resout2, resout3;
838 int16x8_t pos_255 = vdupq_n_s16(((WORD16) UINT8_MAX));
839 int16x8_t neg_255 = vdupq_n_s16(-((WORD16) UINT8_MAX));
840
841 uint8x8_t chroma_mask_8x8 = vreinterpret_u8_u16(vdup_n_u16(0x00ff));
842 uint16x8_t chroma_mask_16x8 = {0xffff, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000};
843
844 int32x4_t qp_div_6_32x4 = vdupq_n_s32(u4_qp_div_6);
845 int32x4_t rnd_fact = vdupq_n_s32(i2_rnd_factor);
846
847 int16x8_t resd0_in_mask = {0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000};
848
849 UNUSED(i4_iq_start_idx);
850 UNUSED(u1_res_accumulate);
851
852 resd1_in_mask = resd0_in_mask;
853 resd2_in_mask = resd0_in_mask;
854 resd3_in_mask = resd0_in_mask;
855
856 src_16x4x2 = vld4_s16(pi2_src);
857 iscal_16x4x2 = vld4_s16((const int16_t *) pu2_iscal_mat);
858 weigh_16x4x2 = vld4_s16((const int16_t *) pu2_weigh_mat);
859
860 weigh0_16x4 = vmul_s16(weigh_16x4x2.val[0], iscal_16x4x2.val[0]);
861 weigh1_16x4 = vmul_s16(weigh_16x4x2.val[1], iscal_16x4x2.val[1]);
862 weigh2_16x4 = vmul_s16(weigh_16x4x2.val[2], iscal_16x4x2.val[2]);
863 weigh3_16x4 = vmul_s16(weigh_16x4x2.val[3], iscal_16x4x2.val[3]);
864
865 q0_32x4 = vmull_s16(weigh0_16x4, src_16x4x2.val[0]);
866 q1_32x4 = vmull_s16(weigh1_16x4, src_16x4x2.val[1]);
867 q2_32x4 = vmull_s16(weigh2_16x4, src_16x4x2.val[2]);
868 q3_32x4 = vmull_s16(weigh3_16x4, src_16x4x2.val[3]);
869
870 q0_32x4 = vaddq_s32(q0_32x4, rnd_fact);
871 q1_32x4 = vaddq_s32(q1_32x4, rnd_fact);
872 q2_32x4 = vaddq_s32(q2_32x4, rnd_fact);
873 q3_32x4 = vaddq_s32(q3_32x4, rnd_fact);
874
875 q0_32x4 = vshlq_s32(q0_32x4, qp_div_6_32x4);
876 q1_32x4 = vshlq_s32(q1_32x4, qp_div_6_32x4);
877 q2_32x4 = vshlq_s32(q2_32x4, qp_div_6_32x4);
878 q3_32x4 = vshlq_s32(q3_32x4, qp_div_6_32x4);
879
880 q0_16x4 = vqshrn_n_s32(q0_32x4, 4);
881 q1_16x4 = vqshrn_n_s32(q1_32x4, 4);
882 q2_16x4 = vqshrn_n_s32(q2_32x4, 4);
883 q3_16x4 = vqshrn_n_s32(q3_32x4, 4);
884
885 q0_16x4 = vset_lane_s16(pi2_dc_src[0], q0_16x4, 0);
886
887 rq1_16x4 = vshr_n_s16(q1_16x4, 1);
888 rq3_16x4 = vshr_n_s16(q3_16x4, 1);
889
890 x0_16x4 = vadd_s16(q0_16x4, q2_16x4);
891 x1_16x4 = vsub_s16(q0_16x4, q2_16x4);
892 x2_16x4 = vsub_s16(rq1_16x4, q3_16x4);
893 x3_16x4 = vadd_s16(q1_16x4, rq3_16x4);
894
895 xx0_16x4 = vadd_s16(x0_16x4, x3_16x4);
896 xx1_16x4 = vadd_s16(x1_16x4, x2_16x4);
897 xx2_16x4 = vsub_s16(x1_16x4, x2_16x4);
898 xx3_16x4 = vsub_s16(x0_16x4, x3_16x4);
899
900 /* row 0 to row 3 */
901 xx0_16x4x2 = vtrn_s16(xx0_16x4, xx1_16x4);
902 xx1_16x4x2 = vtrn_s16(xx2_16x4, xx3_16x4);
903 x0_32x2x2 =
904 vzip_s32(vreinterpret_s32_s16(xx0_16x4x2.val[0]), vreinterpret_s32_s16(xx1_16x4x2.val[0]));
905 x1_32x2x2 =
906 vzip_s32(vreinterpret_s32_s16(xx0_16x4x2.val[1]), vreinterpret_s32_s16(xx1_16x4x2.val[1]));
907
908 x0_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[0]);
909 x1_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[0]);
910 x2_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[1]);
911 x3_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[1]);
912
913 /* Store Horz transform output into temp */
914 vst1_s16(pi2_tmp, x0_16x4);
915 vst1_s16(pi2_tmp + 4, x1_16x4);
916 vst1_s16(pi2_tmp + 8, x2_16x4);
917 vst1_s16(pi2_tmp + 12, x3_16x4);
918
919 /* vertical inverse transform */
920 rq1_16x4 = vshr_n_s16(x1_16x4, 1);
921 rq3_16x4 = vshr_n_s16(x3_16x4, 1);
922
923 xx0_16x4 = vadd_s16(x0_16x4, x2_16x4);
924 xx1_16x4 = vsub_s16(x0_16x4, x2_16x4);
925 xx2_16x4 = vsub_s16(rq1_16x4, x3_16x4);
926 xx3_16x4 = vadd_s16(x1_16x4, rq3_16x4);
927
928 x0_16x4 = vadd_s16(xx0_16x4, xx3_16x4);
929 x1_16x4 = vadd_s16(xx1_16x4, xx2_16x4);
930 x2_16x4 = vsub_s16(xx1_16x4, xx2_16x4);
931 x3_16x4 = vsub_s16(xx0_16x4, xx3_16x4);
932
933 x0_16x4 = vrshr_n_s16(x0_16x4, 6);
934 x1_16x4 = vrshr_n_s16(x1_16x4, 6);
935 x2_16x4 = vrshr_n_s16(x2_16x4, 6);
936 x3_16x4 = vrshr_n_s16(x3_16x4, 6);
937
938 resd0_in = vld1q_s16((int16_t *) pi2_res_pred);
939 resd1_in = vld1q_s16((int16_t *) pi2_res_pred + i4_res_pred_stride);
940 resd2_in = vld1q_s16((int16_t *) pi2_res_pred + (i4_res_pred_stride * 2));
941 resd3_in = vld1q_s16((int16_t *) pi2_res_pred + (i4_res_pred_stride * 3));
942
943 /* Mask alternate values */
944 resd0_in_mask = vbslq_s16(chroma_mask_16x8, resd0_in, resd0_in_mask);
945 resd1_in_mask = vbslq_s16(chroma_mask_16x8, resd1_in, resd1_in_mask);
946 resd2_in_mask = vbslq_s16(chroma_mask_16x8, resd2_in, resd2_in_mask);
947 resd3_in_mask = vbslq_s16(chroma_mask_16x8, resd3_in, resd3_in_mask);
948
949 x0_16x8 = vreinterpretq_s16_s32(vmovl_s16(x0_16x4));
950 x1_16x8 = vreinterpretq_s16_s32(vmovl_s16(x1_16x4));
951 x2_16x8 = vreinterpretq_s16_s32(vmovl_s16(x2_16x4));
952 x3_16x8 = vreinterpretq_s16_s32(vmovl_s16(x3_16x4));
953
954 resd0_in = vaddq_s16(resd0_in_mask, x0_16x8);
955 resd1_in = vaddq_s16(resd1_in_mask, x1_16x8);
956 resd2_in = vaddq_s16(resd2_in_mask, x2_16x8);
957 resd3_in = vaddq_s16(resd3_in_mask, x3_16x8);
958
959 /* Saturate all values < -255 to -255 and retain the rest as it is */
960 resd0_in = vmaxq_s16(resd0_in, neg_255);
961 resd1_in = vmaxq_s16(resd1_in, neg_255);
962 resd2_in = vmaxq_s16(resd2_in, neg_255);
963 resd3_in = vmaxq_s16(resd3_in, neg_255);
964
965 /* Saturate all values > 255 to 255 and retain the rest as it is */
966 resd0_in = vminq_s16(resd0_in, pos_255);
967 resd1_in = vminq_s16(resd1_in, pos_255);
968 resd2_in = vminq_s16(resd2_in, pos_255);
969 resd3_in = vminq_s16(resd3_in, pos_255);
970
971 resout0 = vld1q_s16(pi2_res);
972 resout1 = vld1q_s16(pi2_res + i4_res_stride);
973 resout2 = vld1q_s16(pi2_res + i4_res_stride * 2);
974 resout3 = vld1q_s16(pi2_res + i4_res_stride * 3);
975
976 /* Store res in alternate positions */
977 resout0 = vbslq_s16(chroma_mask_16x8, resd0_in, resout0);
978 resout1 = vbslq_s16(chroma_mask_16x8, resd1_in, resout1);
979 resout2 = vbslq_s16(chroma_mask_16x8, resd2_in, resout2);
980 resout3 = vbslq_s16(chroma_mask_16x8, resd3_in, resout3);
981
982 vst1q_s16(pi2_res, resout0);
983 vst1q_s16(pi2_res + i4_res_stride, resout1);
984 vst1q_s16(pi2_res + (i4_res_stride << 1), resout2);
985 vst1q_s16(pi2_res + (i4_res_stride << 1) + i4_res_stride, resout3);
986
987 pred0_in = vld1_u8((uint8_t *) pu1_pred);
988 pred1_in = vld1_u8((uint8_t *) pu1_pred + (i4_pred_stride));
989 pred2_in = vld1_u8((uint8_t *) pu1_pred + (i4_pred_stride << 1));
990 pred3_in = vld1_u8((uint8_t *) pu1_pred + (i4_pred_stride * 3));
991
992 pred0 = vreinterpretq_s16_u16(vmovl_u8(pred0_in));
993 pred1 = vreinterpretq_s16_u16(vmovl_u8(pred1_in));
994 pred2 = vreinterpretq_s16_u16(vmovl_u8(pred2_in));
995 pred3 = vreinterpretq_s16_u16(vmovl_u8(pred3_in));
996
997 /* Out pixel = pred + res */
998 rec0 = vaddq_s16(pred0, resout0);
999 rec1 = vaddq_s16(pred1, resout1);
1000 rec2 = vaddq_s16(pred2, resout2);
1001 rec3 = vaddq_s16(pred3, resout3);
1002
1003 out0 = vld1_u8(pu1_out);
1004 out1 = vld1_u8(pu1_out + i4_out_stride);
1005 out2 = vld1_u8(pu1_out + i4_out_stride * 2);
1006 out3 = vld1_u8(pu1_out + i4_out_stride * 3);
1007
1008 /* Convert to 8 bit unsigned with saturation */
1009 rec0_un = vqmovun_s16(rec0);
1010 rec1_un = vqmovun_s16(rec1);
1011 rec2_un = vqmovun_s16(rec2);
1012 rec3_un = vqmovun_s16(rec3);
1013
1014 /* Store output pixels in alternate positions */
1015 out0 = vbsl_u8(chroma_mask_8x8, rec0_un, out0);
1016 out1 = vbsl_u8(chroma_mask_8x8, rec1_un, out1);
1017 out2 = vbsl_u8(chroma_mask_8x8, rec2_un, out2);
1018 out3 = vbsl_u8(chroma_mask_8x8, rec3_un, out3);
1019
1020 vst1_u8((pu1_out), out0);
1021 vst1_u8((pu1_out + i4_out_stride), out1);
1022 vst1_u8((pu1_out + (i4_out_stride << 1)), out2);
1023 vst1_u8((pu1_out + ((i4_out_stride << 1) + i4_out_stride)), out3);
1024 }
1025
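/* DC-only path: a single coefficient (inverse quantized from pi2_src when
 * i4_iq_start_idx == 0, otherwise taken from pi2_dc_src) is rounded,
 * broadcast and added to all 16 prediction pixels to form the reconstructed
 * 4x4 block. No residue buffers are read or written. */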
1026 void isvc_iquant_itrans_recon_4x4_dc_neon(buffer_container_t *ps_src, buffer_container_t *ps_pred,
1027 buffer_container_t *ps_res_pred,
1028 buffer_container_t *ps_res, buffer_container_t *ps_rec,
1029 iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants,
1030 WORD16 *pi2_tmp, WORD16 *pi2_dc_src,
1031 WORD32 i4_iq_start_idx, UWORD8 u1_res_accumulate)
1032 {
1033 WORD16 *pi2_src = (WORD16 *) ps_src->pv_data;
1034 UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
1035 UWORD8 *pu1_out = (UWORD8 *) ps_rec->pv_data;
1036 WORD32 i4_pred_stride = ps_pred->i4_data_stride;
1037 WORD32 i4_out_stride = ps_rec->i4_data_stride;
1038 const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat;
1039 const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat;
1040 UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6;
1041 WORD16 rnd_fact = (u4_qp_div_6 < 4) ? 1 << (3 - u4_qp_div_6) : 0;
1042
1043 WORD32 i4_iq_out_temp;
1044 int16x8_t temp_0;
1045 uint8x8_t pred0_in, pred1_in, pred2_in, pred3_in;
1046 int16x8_t pred0, pred1, pred2, pred3;
1047
1048 UNUSED(pi2_tmp);
1049 UNUSED(ps_res);
1050 UNUSED(ps_res_pred);
1051 UNUSED(u1_res_accumulate);
1052
1053 if(i4_iq_start_idx == 0)
1054 {
1055 i4_iq_out_temp = pi2_src[0];
1056 INV_QUANT(i4_iq_out_temp, pu2_iscal_mat[0], pu2_weigh_mat[0], u4_qp_div_6, rnd_fact, 4);
1057 }
1058 else
1059 {
1060 i4_iq_out_temp = pi2_dc_src[0];
1061 }
1062
1063 temp_0 = vdupq_n_s16((i4_iq_out_temp + 32) >> 6);
1064
1065 pred0_in = vld1_u8(pu1_pred);
1066 pu1_pred = pu1_pred + i4_pred_stride;
1067 pred1_in = vld1_u8(pu1_pred);
1068 pu1_pred = pu1_pred + i4_pred_stride;
1069 pred2_in = vld1_u8(pu1_pred);
1070 pu1_pred = pu1_pred + i4_pred_stride;
1071 pred3_in = vld1_u8(pu1_pred);
1072
1073 pred0 = vreinterpretq_s16_u16(vmovl_u8(pred0_in));
1074 pred1 = vreinterpretq_s16_u16(vmovl_u8(pred1_in));
1075 pred2 = vreinterpretq_s16_u16(vmovl_u8(pred2_in));
1076 pred3 = vreinterpretq_s16_u16(vmovl_u8(pred3_in));
1077
1078 /* Out pixel = Res + pred */
1079 pred0 = vaddq_s16(pred0, temp_0);
1080 pred1 = vaddq_s16(pred1, temp_0);
1081 pred2 = vaddq_s16(pred2, temp_0);
1082 pred3 = vaddq_s16(pred3, temp_0);
1083
1084 /* Convert to unsigned 8 bit with saturation */
1085 pred0_in = vqmovun_s16(pred0);
1086 pred1_in = vqmovun_s16(pred1);
1087 pred2_in = vqmovun_s16(pred2);
1088 pred3_in = vqmovun_s16(pred3);
1089
1090 vst1_lane_u32((uint32_t *) (pu1_out), vreinterpret_u32_u8(pred0_in), 0);
1091 vst1_lane_u32((uint32_t *) (pu1_out + i4_out_stride), vreinterpret_u32_u8(pred1_in), 0);
1092 vst1_lane_u32((uint32_t *) (pu1_out + i4_out_stride * 2), vreinterpret_u32_u8(pred2_in), 0);
1093 vst1_lane_u32((uint32_t *) (pu1_out + i4_out_stride * 3), vreinterpret_u32_u8(pred3_in), 0);
1094 }
1095
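/* DC-only chroma path with residue output: the rounded DC value from
 * pi2_dc_src is broadcast, the corresponding residue is stored to alternate
 * positions of ps_res, and the DC value is added to the prediction, with the
 * reconstructed pixels written to alternate positions of ps_rec. */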
1096 void isvc_iquant_itrans_recon_chroma_4x4_dc_with_res_output_neon(
1097 buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_res_pred,
1098 buffer_container_t *ps_res, buffer_container_t *ps_rec,
1099 iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants, WORD16 *pi2_tmp, WORD16 *pi2_dc_src,
1100 WORD32 i4_iq_start_idx, UWORD8 u1_res_accumulate)
1101 {
1102 WORD16 *pi2_src = (WORD16 *) ps_src->pv_data;
1103 WORD16 *pi2_res = (WORD16 *) ps_res->pv_data;
1104 UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
1105 UWORD8 *pu1_out = (UWORD8 *) ps_rec->pv_data;
1106 WORD32 i4_res_stride = ps_res->i4_data_stride;
1107 WORD32 i4_pred_stride = ps_pred->i4_data_stride;
1108 WORD32 i4_out_stride = ps_rec->i4_data_stride;
1109 const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat;
1110 const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat;
1111 UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6;
1112
1113 WORD16 i2_it_out;
1114 WORD32 i4_iq_out_temp;
1115 int16x8_t temp_0, residue_res;
1116 uint8x8_t pred0_in, pred1_in, pred2_in, pred3_in;
1117 int16x8_t pred0, pred1, pred2, pred3;
1118 int16x8_t resout0, resout1, resout2, resout3;
1119
1120 uint8x8_t i4_out_horz_8x8_r0, i4_out_horz_8x8_r1, i4_out_horz_8x8_r2, i4_out_horz_8x8_r3;
1121 uint8x8_t chroma_mask_8x8 = vreinterpret_u8_u16(vdup_n_u16(0x00ff));
1122 uint16x8_t chroma_mask_16x8 = {0xffff, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000};
1123
1124 UNUSED(pi2_src);
1125 UNUSED(pu2_iscal_mat);
1126 UNUSED(pu2_weigh_mat);
1127 UNUSED(u4_qp_div_6);
1128 UNUSED(pi2_tmp);
1129 UNUSED(i4_iq_start_idx);
1130 UNUSED(ps_res_pred);
1131 UNUSED(u1_res_accumulate);
1132
1133 i4_iq_out_temp = pi2_dc_src[0];
1134
1135 i2_it_out = ((i4_iq_out_temp + 32) >> 6);
1136 temp_0 = vdupq_n_s16(i2_it_out);
1137 residue_res = vdupq_n_s16(isvc_get_residue(i2_it_out, 0, 0));
1138
1139 resout0 = vld1q_s16(pi2_res);
1140 resout1 = vld1q_s16(pi2_res + i4_res_stride);
1141 resout2 = vld1q_s16(pi2_res + i4_res_stride * 2);
1142 resout3 = vld1q_s16(pi2_res + i4_res_stride * 3);
1143
1144 /* Store res in alternate positions */
1145 resout0 = vbslq_s16(chroma_mask_16x8, residue_res, resout0);
1146 resout1 = vbslq_s16(chroma_mask_16x8, residue_res, resout1);
1147 resout2 = vbslq_s16(chroma_mask_16x8, residue_res, resout2);
1148 resout3 = vbslq_s16(chroma_mask_16x8, residue_res, resout3);
1149
1150 vst1q_s16(pi2_res, resout0);
1151 vst1q_s16(pi2_res + i4_res_stride, resout1);
1152 vst1q_s16(pi2_res + (i4_res_stride << 1), resout2);
1153 vst1q_s16(pi2_res + (i4_res_stride << 1) + i4_res_stride, resout3);
1154
1155 pred0_in = vld1_u8(pu1_pred);
1156 pu1_pred = pu1_pred + i4_pred_stride;
1157 pred1_in = vld1_u8(pu1_pred);
1158 pu1_pred = pu1_pred + i4_pred_stride;
1159 pred2_in = vld1_u8(pu1_pred);
1160 pu1_pred = pu1_pred + i4_pred_stride;
1161 pred3_in = vld1_u8(pu1_pred);
1162
1163 pred0 = vreinterpretq_s16_u16(vmovl_u8(pred0_in));
1164 pred1 = vreinterpretq_s16_u16(vmovl_u8(pred1_in));
1165 pred2 = vreinterpretq_s16_u16(vmovl_u8(pred2_in));
1166 pred3 = vreinterpretq_s16_u16(vmovl_u8(pred3_in));
1167
1168 /* Out pixel = Res + pred */
1169 pred0 = vaddq_s16(pred0, temp_0);
1170 pred1 = vaddq_s16(pred1, temp_0);
1171 pred2 = vaddq_s16(pred2, temp_0);
1172 pred3 = vaddq_s16(pred3, temp_0);
1173
1174 /* Convert to unsigned 8 bit with saturation */
1175 pred0_in = vqmovun_s16(pred0);
1176 pred1_in = vqmovun_s16(pred1);
1177 pred2_in = vqmovun_s16(pred2);
1178 pred3_in = vqmovun_s16(pred3);
1179
1180 /* Store out pixels in alternate positions */
1181 i4_out_horz_8x8_r0 = vld1_u8(pu1_out);
1182 i4_out_horz_8x8_r1 = vld1_u8(pu1_out + i4_out_stride);
1183 i4_out_horz_8x8_r2 = vld1_u8(pu1_out + i4_out_stride * 2);
1184 i4_out_horz_8x8_r3 = vld1_u8(pu1_out + i4_out_stride * 3);
1185
1186 i4_out_horz_8x8_r0 = vbsl_u8(chroma_mask_8x8, pred0_in, i4_out_horz_8x8_r0);
1187 i4_out_horz_8x8_r1 = vbsl_u8(chroma_mask_8x8, pred1_in, i4_out_horz_8x8_r1);
1188 i4_out_horz_8x8_r2 = vbsl_u8(chroma_mask_8x8, pred2_in, i4_out_horz_8x8_r2);
1189 i4_out_horz_8x8_r3 = vbsl_u8(chroma_mask_8x8, pred3_in, i4_out_horz_8x8_r3);
1190
1191 vst1_u8((uint8_t *) (pu1_out), i4_out_horz_8x8_r0);
1192 vst1_u8((uint8_t *) (pu1_out + i4_out_stride), i4_out_horz_8x8_r1);
1193 vst1_u8((uint8_t *) (pu1_out + i4_out_stride * 2), i4_out_horz_8x8_r2);
1194 vst1_u8((uint8_t *) (pu1_out + i4_out_stride * 3), i4_out_horz_8x8_r3);
1195 }
1196
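/* DC-only chroma path with residue accumulation: the residue prediction is
 * masked to the current chroma plane, added to the broadcast DC value,
 * clamped to [-255, 255], stored to alternate positions of ps_res and added
 * to the prediction to reconstruct the alternate pixels of ps_rec. */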
1197 void isvc_iquant_itrans_recon_chroma_4x4_dc_with_res_accumulate_neon(
1198 buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_res_pred,
1199 buffer_container_t *ps_res, buffer_container_t *ps_rec,
1200 iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants, WORD16 *pi2_tmp, WORD16 *pi2_dc_src,
1201 WORD32 i4_iq_start_idx, UWORD8 u1_res_accumulate)
1202 {
1203 WORD16 *pi2_src = (WORD16 *) ps_src->pv_data;
1204 WORD16 *pi2_res = (WORD16 *) ps_res->pv_data;
1205 WORD16 *pi2_res_pred = (WORD16 *) ps_res_pred->pv_data;
1206 UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
1207 UWORD8 *pu1_out = (UWORD8 *) ps_rec->pv_data;
1208 WORD32 i4_res_stride = ps_res->i4_data_stride;
1209 WORD32 i4_res_pred_stride = ps_res_pred->i4_data_stride;
1210 WORD32 i4_pred_stride = ps_pred->i4_data_stride;
1211 WORD32 i4_out_stride = ps_rec->i4_data_stride;
1212 const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat;
1213 const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat;
1214 UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6;
1215
1216 WORD32 i4_iq_out_temp;
1217 int16x8_t temp_0;
1218 uint8x8_t pred0_in, pred1_in, pred2_in, pred3_in;
1219 int16x8_t pred0, pred1, pred2, pred3;
1220 int16x8_t resd0_in, resd1_in, resd2_in, resd3_in;
1221 int16x8_t resout0, resout1, resout2, resout3;
1222 int16x8_t resd1_in_mask, resd2_in_mask, resd3_in_mask;
1223 uint8x8_t out0, out1, out2, out3;
1224 int16x8_t pos_255 = vdupq_n_s16(((WORD16) UINT8_MAX));
1225 int16x8_t neg_255 = vdupq_n_s16(-((WORD16) UINT8_MAX));
1226 uint8x8_t chroma_mask_8x8 = vreinterpret_u8_u16(vdup_n_u16(0x00ff));
1227 uint16x8_t chroma_mask_16x8 = {0xffff, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000};
1228
1229 int16x8_t resd0_in_mask = {0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000};
1230
1231 UNUSED(pi2_src);
1232 UNUSED(pu2_iscal_mat);
1233 UNUSED(pu2_weigh_mat);
1234 UNUSED(u4_qp_div_6);
1235 UNUSED(pi2_tmp);
1236 UNUSED(i4_iq_start_idx);
1237 UNUSED(u1_res_accumulate);
1238
1239 resd1_in_mask = resd0_in_mask;
1240 resd2_in_mask = resd0_in_mask;
1241 resd3_in_mask = resd0_in_mask;
1242
1243 i4_iq_out_temp = pi2_dc_src[0];
1244 temp_0 = vdupq_n_s16((i4_iq_out_temp + 32) >> 6);
1245
1246 resd0_in = vld1q_s16((int16_t *) pi2_res_pred);
1247 resd1_in = vld1q_s16((int16_t *) pi2_res_pred + i4_res_pred_stride);
1248 resd2_in = vld1q_s16((int16_t *) pi2_res_pred + (i4_res_pred_stride * 2));
1249 resd3_in = vld1q_s16((int16_t *) pi2_res_pred + (i4_res_pred_stride * 3));
1250
1251 /* Mask alternate values of res pred */
1252 resd0_in_mask = vbslq_s16(chroma_mask_16x8, resd0_in, resd0_in_mask);
1253 resd1_in_mask = vbslq_s16(chroma_mask_16x8, resd1_in, resd1_in_mask);
1254 resd2_in_mask = vbslq_s16(chroma_mask_16x8, resd2_in, resd2_in_mask);
1255 resd3_in_mask = vbslq_s16(chroma_mask_16x8, resd3_in, resd3_in_mask);
1256
1257 /* Add res pred to res obtained */
1258 resd0_in = vaddq_s16(resd0_in_mask, temp_0);
1259 resd1_in = vaddq_s16(resd1_in_mask, temp_0);
1260 resd2_in = vaddq_s16(resd2_in_mask, temp_0);
1261 resd3_in = vaddq_s16(resd3_in_mask, temp_0);
1262
1263 /* Saturate all values < -255 to -255 and retain the rest as it is */
1264 resd0_in = vmaxq_s16(resd0_in, neg_255);
1265 resd1_in = vmaxq_s16(resd1_in, neg_255);
1266 resd2_in = vmaxq_s16(resd2_in, neg_255);
1267 resd3_in = vmaxq_s16(resd3_in, neg_255);
1268
1269 /* Saturate all values > 255 to 255 and retain the rest as it is */
1270 resd0_in = vminq_s16(resd0_in, pos_255);
1271 resd1_in = vminq_s16(resd1_in, pos_255);
1272 resd2_in = vminq_s16(resd2_in, pos_255);
1273 resd3_in = vminq_s16(resd3_in, pos_255);
1274
1275 resout0 = vld1q_s16(pi2_res);
1276 resout1 = vld1q_s16(pi2_res + i4_res_stride);
1277 resout2 = vld1q_s16(pi2_res + i4_res_stride * 2);
1278 resout3 = vld1q_s16(pi2_res + i4_res_stride * 3);
1279
1280 /* Store res in alternate positions */
1281 resout0 = vbslq_s16(chroma_mask_16x8, resd0_in, resout0);
1282 resout1 = vbslq_s16(chroma_mask_16x8, resd1_in, resout1);
1283 resout2 = vbslq_s16(chroma_mask_16x8, resd2_in, resout2);
1284 resout3 = vbslq_s16(chroma_mask_16x8, resd3_in, resout3);
1285
1286 vst1q_s16(pi2_res, resout0);
1287 vst1q_s16(pi2_res + i4_res_stride, resout1);
1288 vst1q_s16(pi2_res + (i4_res_stride << 1), resout2);
1289 vst1q_s16(pi2_res + (i4_res_stride << 1) + i4_res_stride, resout3);
1290
1291 pred0_in = vld1_u8(pu1_pred);
1292 pu1_pred = pu1_pred + i4_pred_stride;
1293 pred1_in = vld1_u8(pu1_pred);
1294 pu1_pred = pu1_pred + i4_pred_stride;
1295 pred2_in = vld1_u8(pu1_pred);
1296 pu1_pred = pu1_pred + i4_pred_stride;
1297 pred3_in = vld1_u8(pu1_pred);
1298
1299 pred0 = vreinterpretq_s16_u16(vmovl_u8(pred0_in));
1300 pred1 = vreinterpretq_s16_u16(vmovl_u8(pred1_in));
1301 pred2 = vreinterpretq_s16_u16(vmovl_u8(pred2_in));
1302 pred3 = vreinterpretq_s16_u16(vmovl_u8(pred3_in));
1303
1304 /* Out pixel = Res + pred */
1305 pred0 = vaddq_s16(pred0, resout0);
1306 pred1 = vaddq_s16(pred1, resout1);
1307 pred2 = vaddq_s16(pred2, resout2);
1308 pred3 = vaddq_s16(pred3, resout3);
1309
1310 /* Convert to unsigned 8 bit with saturation */
1311 pred0_in = vqmovun_s16(pred0);
1312 pred1_in = vqmovun_s16(pred1);
1313 pred2_in = vqmovun_s16(pred2);
1314 pred3_in = vqmovun_s16(pred3);
1315
1316 out0 = vld1_u8(pu1_out);
1317 out1 = vld1_u8(pu1_out + i4_out_stride);
1318 out2 = vld1_u8(pu1_out + i4_out_stride * 2);
1319 out3 = vld1_u8(pu1_out + i4_out_stride * 3);
1320
1321 /* Store out pixels in alternate positions */
1322 out0 = vbsl_u8(chroma_mask_8x8, pred0_in, out0);
1323 out1 = vbsl_u8(chroma_mask_8x8, pred1_in, out1);
1324 out2 = vbsl_u8(chroma_mask_8x8, pred2_in, out2);
1325 out3 = vbsl_u8(chroma_mask_8x8, pred3_in, out3);
1326
1327 vst1_u8((uint8_t *) (pu1_out), out0);
1328 vst1_u8((uint8_t *) (pu1_out + i4_out_stride), out1);
1329 vst1_u8((uint8_t *) (pu1_out + i4_out_stride * 2), out2);
1330 vst1_u8((uint8_t *) (pu1_out + i4_out_stride * 3), out3);
1331 }
1332