1 /******************************************************************************
2 *
3 * Copyright (C) 2022 The Android Open Source Project
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at:
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 *
17 *****************************************************************************
18 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19 */
20 /**
21 *******************************************************************************
22 * @file
23 *  isvc_iquant_itrans_recon_neon.c
24 *
25 * @brief
26 *  NEON variants of the inverse quantization, inverse transform and
27 *  reconstruction functions
28 *******************************************************************************
29 */
30 #include <arm_neon.h>
31
32 #include "ih264_typedefs.h"
33 #include "ih264_debug.h"
34 #include "ih264_defs.h"
35 #include "ih264_trans_macros.h"
36 #include "ih264_macros.h"
37 #include "ih264_platform_macros.h"
38 #include "ih264_trans_data.h"
39 #include "ih264_size_defs.h"
40 #include "isvc_structs.h"
41 #include "isvc_trans_quant_itrans_iquant.h"
42
43 void isvc_iquant_itrans_recon_4x4_neon(buffer_container_t *ps_src, buffer_container_t *ps_pred,
44 buffer_container_t *ps_res_pred, buffer_container_t *ps_res,
45 buffer_container_t *ps_rec,
46 iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants,
47 WORD16 *pi2_tmp, WORD16 *pi2_dc_src, WORD32 i4_iq_start_idx,
48 UWORD8 u1_res_accumulate)
49 {
50 WORD16 *pi2_src = (WORD16 *) ps_src->pv_data;
51 UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
52 UWORD8 *pu1_out = (UWORD8 *) ps_rec->pv_data;
53 WORD32 i4_pred_stride = ps_pred->i4_data_stride;
54 WORD32 i4_out_stride = ps_rec->i4_data_stride;
55 const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat;
56 const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat;
57 UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6;
58
59 int16x4x4_t src_16x4x2;
60 int16x4x4_t iscal_16x4x2;
61 int16x4x4_t weigh_16x4x2;
62
63 int16x4_t q0_16x4, q1_16x4, q2_16x4, q3_16x4;
64 int32x4_t q0_32x4, q1_32x4, q2_32x4, q3_32x4;
65 int16x4_t rq1_16x4, rq3_16x4;
66 int16x4_t x0_16x4, x1_16x4, x2_16x4, x3_16x4;
67 int16x4_t xx0_16x4, xx1_16x4, xx2_16x4, xx3_16x4;
68 int16x4x2_t xx0_16x4x2, xx1_16x4x2;
69 int32x2x2_t x0_32x2x2, x1_32x2x2;
70 int16x4_t weigh0_16x4, weigh1_16x4, weigh2_16x4, weigh3_16x4;
71
72 uint8x8_t pred0_in, pred1_in, pred2_in, pred3_in;
73 int16x8_t pred0, pred1, pred2, pred3;
74 int16x8_t resd01_in, resd23_in;
75 int16x8_t pred01_in, pred23_in;
76 uint8x8_t pred01_un, pred23_un;
77
78 int16x8_t pos_255_16x8 = vdupq_n_s16(((WORD16) UINT8_MAX));
79 int16x8_t neg_255_16x8 = vdupq_n_s16(-((WORD16) UINT8_MAX));
80 int32x4_t qp_div_6_32x4 = vdupq_n_s32(u4_qp_div_6);
81
82 WORD16 rnd_factor = (u4_qp_div_6 < 4) ? 1 << (3 - u4_qp_div_6) : 0;
83 int32x4_t rnd_fact = vdupq_n_s32(rnd_factor);
84
85 UNUSED(ps_res);
86 UNUSED(ps_res_pred);
87 UNUSED(u1_res_accumulate);
88
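/* Inverse quantization: de-interleave the coefficients and scaling matrices with vld4, */
/* multiply coeff * scale * weight, add the rounding term, apply the QP/6 left shift, */
/* then narrow back to 16 bits with a saturating >> 4 */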
89 src_16x4x2 = vld4_s16(pi2_src);
90 iscal_16x4x2 = vld4_s16((const int16_t *) pu2_iscal_mat);
91 weigh_16x4x2 = vld4_s16((const int16_t *) pu2_weigh_mat);
92
93 weigh0_16x4 = vmul_s16(weigh_16x4x2.val[0], iscal_16x4x2.val[0]);
94 weigh1_16x4 = vmul_s16(weigh_16x4x2.val[1], iscal_16x4x2.val[1]);
95 weigh2_16x4 = vmul_s16(weigh_16x4x2.val[2], iscal_16x4x2.val[2]);
96 weigh3_16x4 = vmul_s16(weigh_16x4x2.val[3], iscal_16x4x2.val[3]);
97
98 q0_32x4 = vmull_s16(weigh0_16x4, src_16x4x2.val[0]);
99 q1_32x4 = vmull_s16(weigh1_16x4, src_16x4x2.val[1]);
100 q2_32x4 = vmull_s16(weigh2_16x4, src_16x4x2.val[2]);
101 q3_32x4 = vmull_s16(weigh3_16x4, src_16x4x2.val[3]);
102
103 q0_32x4 = vaddq_s32(q0_32x4, rnd_fact);
104 q1_32x4 = vaddq_s32(q1_32x4, rnd_fact);
105 q2_32x4 = vaddq_s32(q2_32x4, rnd_fact);
106 q3_32x4 = vaddq_s32(q3_32x4, rnd_fact);
107
108 q0_32x4 = vshlq_s32(q0_32x4, qp_div_6_32x4);
109 q1_32x4 = vshlq_s32(q1_32x4, qp_div_6_32x4);
110 q2_32x4 = vshlq_s32(q2_32x4, qp_div_6_32x4);
111 q3_32x4 = vshlq_s32(q3_32x4, qp_div_6_32x4);
112
113 q0_16x4 = vqshrn_n_s32(q0_32x4, 4);
114 q1_16x4 = vqshrn_n_s32(q1_32x4, 4);
115 q2_16x4 = vqshrn_n_s32(q2_32x4, 4);
116 q3_16x4 = vqshrn_n_s32(q3_32x4, 4);
117
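/* For i4_iq_start_idx == 1 the DC term is taken from pi2_dc_src instead of the dequantized value above */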
118 if(i4_iq_start_idx == 1)
119 {
120 q0_16x4 = vset_lane_s16(pi2_dc_src[0], q0_16x4, 0);
121 }
122
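/* Horizontal 4x4 inverse transform: one butterfly per row, all four rows processed in parallel across lanes */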
123 rq1_16x4 = vshr_n_s16(q1_16x4, 1);
124 rq3_16x4 = vshr_n_s16(q3_16x4, 1);
125
126 x0_16x4 = vadd_s16(q0_16x4, q2_16x4);
127 x1_16x4 = vsub_s16(q0_16x4, q2_16x4);
128 x2_16x4 = vsub_s16(rq1_16x4, q3_16x4);
129 x3_16x4 = vadd_s16(q1_16x4, rq3_16x4);
130
131 xx0_16x4 = vadd_s16(x0_16x4, x3_16x4);
132 xx1_16x4 = vadd_s16(x1_16x4, x2_16x4);
133 xx2_16x4 = vsub_s16(x1_16x4, x2_16x4);
134 xx3_16x4 = vsub_s16(x0_16x4, x3_16x4);
135
136 /* Transpose the 4x4 block (rows 0 to 3) for the vertical pass */
137 xx0_16x4x2 = vtrn_s16(xx0_16x4, xx1_16x4);
138 xx1_16x4x2 = vtrn_s16(xx2_16x4, xx3_16x4);
139 x0_32x2x2 =
140 vzip_s32(vreinterpret_s32_s16(xx0_16x4x2.val[0]), vreinterpret_s32_s16(xx1_16x4x2.val[0]));
141 x1_32x2x2 =
142 vzip_s32(vreinterpret_s32_s16(xx0_16x4x2.val[1]), vreinterpret_s32_s16(xx1_16x4x2.val[1]));
143
144 x0_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[0]);
145 x1_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[0]);
146 x2_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[1]);
147 x3_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[1]);
148
149 /* Store Horz transform output into temp */
150 vst1_s16(pi2_tmp, x0_16x4);
151 vst1_s16(pi2_tmp + 4, x1_16x4);
152 vst1_s16(pi2_tmp + 8, x2_16x4);
153 vst1_s16(pi2_tmp + 12, x3_16x4);
154
155 /* vertical inverse transform */
156 rq1_16x4 = vshr_n_s16(x1_16x4, 1);
157 rq3_16x4 = vshr_n_s16(x3_16x4, 1);
158
159 xx0_16x4 = vadd_s16(x0_16x4, x2_16x4);
160 xx1_16x4 = vsub_s16(x0_16x4, x2_16x4);
161 xx2_16x4 = vsub_s16(rq1_16x4, x3_16x4);
162 xx3_16x4 = vadd_s16(x1_16x4, rq3_16x4);
163
164 x0_16x4 = vadd_s16(xx0_16x4, xx3_16x4);
165 x1_16x4 = vadd_s16(xx1_16x4, xx2_16x4);
166 x2_16x4 = vsub_s16(xx1_16x4, xx2_16x4);
167 x3_16x4 = vsub_s16(xx0_16x4, xx3_16x4);
168
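/* Final rounding of the vertical pass: (x + 32) >> 6 */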
169 x0_16x4 = vrshr_n_s16(x0_16x4, 6);
170 x1_16x4 = vrshr_n_s16(x1_16x4, 6);
171 x2_16x4 = vrshr_n_s16(x2_16x4, 6);
172 x3_16x4 = vrshr_n_s16(x3_16x4, 6);
173
174 resd01_in = vcombine_s16(x0_16x4, x1_16x4);
175 resd23_in = vcombine_s16(x2_16x4, x3_16x4);
176
177 /* Saturate all values < -255 to -255 and retain the rest as it is */
178 resd01_in = vmaxq_s16(resd01_in, neg_255_16x8);
179 resd23_in = vmaxq_s16(resd23_in, neg_255_16x8);
180
181 /* Saturate all values > 255 to 255 and retain the rest as it is */
182 resd01_in = vminq_s16(resd01_in, pos_255_16x8);
183 resd23_in = vminq_s16(resd23_in, pos_255_16x8);
184
185 /* Load pred */
186 pred0_in = vld1_u8((uint8_t *) pu1_pred);
187 pred1_in = vld1_u8((uint8_t *) pu1_pred + (i4_pred_stride));
188 pred2_in = vld1_u8((uint8_t *) pu1_pred + (i4_pred_stride << 1));
189 pred3_in = vld1_u8((uint8_t *) pu1_pred + (i4_pred_stride * 3));
190
191 pred0 = vreinterpretq_s16_u16(vmovl_u8(pred0_in));
192 pred1 = vreinterpretq_s16_u16(vmovl_u8(pred1_in));
193 pred2 = vreinterpretq_s16_u16(vmovl_u8(pred2_in));
194 pred3 = vreinterpretq_s16_u16(vmovl_u8(pred3_in));
195
196 pred01_in = vcombine_s16(vget_low_s16(pred0), vget_low_s16(pred1));
197 pred23_in = vcombine_s16(vget_low_s16(pred2), vget_low_s16(pred3));
198
199 /* Out pixel = pred + res */
200 pred01_in = vaddq_s16(pred01_in, resd01_in);
201 pred23_in = vaddq_s16(pred23_in, resd23_in);
202
203 /* Convert to 8 bit unsigned with saturation */
204 pred01_un = vqmovun_s16(pred01_in);
205 pred23_un = vqmovun_s16(pred23_in);
206
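/* Each reconstructed 4-pixel row is written with a single 32-bit lane store */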
207 vst1_lane_u32((uint32_t *) (pu1_out), vreinterpret_u32_u8(pred01_un), 0);
208 vst1_lane_u32((uint32_t *) (pu1_out + i4_out_stride), vreinterpret_u32_u8(pred01_un), 1);
209 vst1_lane_u32((uint32_t *) (pu1_out + (i4_out_stride << 1)), vreinterpret_u32_u8(pred23_un), 0);
210 vst1_lane_u32((uint32_t *) (pu1_out + ((i4_out_stride << 1) + i4_out_stride)),
211 vreinterpret_u32_u8(pred23_un), 1);
212 }
213
214 void isvc_iquant_itrans_recon_4x4_with_res_output_neon(
215 buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_res_pred,
216 buffer_container_t *ps_res, buffer_container_t *ps_rec,
217 iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants, WORD16 *pi2_tmp, WORD16 *pi2_dc_src,
218 WORD32 i4_iq_start_idx, UWORD8 u1_res_accumulate)
219 {
220 WORD16 *pi2_src = (WORD16 *) ps_src->pv_data;
221 WORD16 *pi2_res = (WORD16 *) ps_res->pv_data;
222 UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
223 UWORD8 *pu1_out = (UWORD8 *) ps_rec->pv_data;
224 WORD32 i4_res_stride = ps_res->i4_data_stride;
225 WORD32 i4_pred_stride = ps_pred->i4_data_stride;
226 WORD32 i4_out_stride = ps_rec->i4_data_stride;
227 const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat;
228 const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat;
229 UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6;
230
231 int16x4x4_t src_16x4x2;
232 int16x4x4_t iscal_16x4x2;
233 int16x4x4_t weigh_16x4x2;
234
235 int16x4_t q0_16x4, q1_16x4, q2_16x4, q3_16x4;
236 int32x4_t q0_32x4, q1_32x4, q2_32x4, q3_32x4;
237 int16x4_t rq1_16x4, rq3_16x4;
238 int16x4_t x0_16x4, x1_16x4, x2_16x4, x3_16x4;
239 int16x4_t xx0_16x4, xx1_16x4, xx2_16x4, xx3_16x4;
240 int16x4x2_t xx0_16x4x2, xx1_16x4x2;
241 int32x2x2_t x0_32x2x2, x1_32x2x2;
242 int16x4_t weigh0_16x4, weigh1_16x4, weigh2_16x4, weigh3_16x4;
243
244 uint8x8_t pred0_in, pred1_in, pred2_in, pred3_in;
245 int16x8_t pred0, pred1, pred2, pred3;
246 int16x8_t resd01_in, resd23_in;
247 int16x8_t pred01_in, pred23_in;
248 uint8x8_t pred01_un, pred23_un;
249
250 int16x4_t pos_255_16x4 = vdup_n_s16(((WORD16) UINT8_MAX));
251 int16x4_t neg_255_16x4 = vdup_n_s16(-((WORD16) UINT8_MAX));
252 int32x4_t qp_div_6_32x4 = vdupq_n_s32(u4_qp_div_6);
253
254 WORD16 rnd_factor = (u4_qp_div_6 < 4) ? 1 << (3 - u4_qp_div_6) : 0;
255 int32x4_t rnd_fact = vdupq_n_s32(rnd_factor);
256
257 UNUSED(ps_res_pred);
258 UNUSED(u1_res_accumulate);
259
260 src_16x4x2 = vld4_s16(pi2_src);
261 iscal_16x4x2 = vld4_s16((const int16_t *) pu2_iscal_mat);
262 weigh_16x4x2 = vld4_s16((const int16_t *) pu2_weigh_mat);
263
264 weigh0_16x4 = vmul_s16(weigh_16x4x2.val[0], iscal_16x4x2.val[0]);
265 weigh1_16x4 = vmul_s16(weigh_16x4x2.val[1], iscal_16x4x2.val[1]);
266 weigh2_16x4 = vmul_s16(weigh_16x4x2.val[2], iscal_16x4x2.val[2]);
267 weigh3_16x4 = vmul_s16(weigh_16x4x2.val[3], iscal_16x4x2.val[3]);
268
269 q0_32x4 = vmull_s16(weigh0_16x4, src_16x4x2.val[0]);
270 q1_32x4 = vmull_s16(weigh1_16x4, src_16x4x2.val[1]);
271 q2_32x4 = vmull_s16(weigh2_16x4, src_16x4x2.val[2]);
272 q3_32x4 = vmull_s16(weigh3_16x4, src_16x4x2.val[3]);
273
274 q0_32x4 = vaddq_s32(q0_32x4, rnd_fact);
275 q1_32x4 = vaddq_s32(q1_32x4, rnd_fact);
276 q2_32x4 = vaddq_s32(q2_32x4, rnd_fact);
277 q3_32x4 = vaddq_s32(q3_32x4, rnd_fact);
278
279 q0_32x4 = vshlq_s32(q0_32x4, qp_div_6_32x4);
280 q1_32x4 = vshlq_s32(q1_32x4, qp_div_6_32x4);
281 q2_32x4 = vshlq_s32(q2_32x4, qp_div_6_32x4);
282 q3_32x4 = vshlq_s32(q3_32x4, qp_div_6_32x4);
283
284 q0_16x4 = vqshrn_n_s32(q0_32x4, 4);
285 q1_16x4 = vqshrn_n_s32(q1_32x4, 4);
286 q2_16x4 = vqshrn_n_s32(q2_32x4, 4);
287 q3_16x4 = vqshrn_n_s32(q3_32x4, 4);
288
289 if(i4_iq_start_idx == 1)
290 {
291 q0_16x4 = vset_lane_s16(pi2_dc_src[0], q0_16x4, 0);
292 }
293
294 rq1_16x4 = vshr_n_s16(q1_16x4, 1);
295 rq3_16x4 = vshr_n_s16(q3_16x4, 1);
296
297 x0_16x4 = vadd_s16(q0_16x4, q2_16x4);
298 x1_16x4 = vsub_s16(q0_16x4, q2_16x4);
299 x2_16x4 = vsub_s16(rq1_16x4, q3_16x4);
300 x3_16x4 = vadd_s16(q1_16x4, rq3_16x4);
301
302 xx0_16x4 = vadd_s16(x0_16x4, x3_16x4);
303 xx1_16x4 = vadd_s16(x1_16x4, x2_16x4);
304 xx2_16x4 = vsub_s16(x1_16x4, x2_16x4);
305 xx3_16x4 = vsub_s16(x0_16x4, x3_16x4);
306
307 /* Transpose the 4x4 block (rows 0 to 3) for the vertical pass */
308 xx0_16x4x2 = vtrn_s16(xx0_16x4, xx1_16x4);
309 xx1_16x4x2 = vtrn_s16(xx2_16x4, xx3_16x4);
310 x0_32x2x2 =
311 vzip_s32(vreinterpret_s32_s16(xx0_16x4x2.val[0]), vreinterpret_s32_s16(xx1_16x4x2.val[0]));
312 x1_32x2x2 =
313 vzip_s32(vreinterpret_s32_s16(xx0_16x4x2.val[1]), vreinterpret_s32_s16(xx1_16x4x2.val[1]));
314
315 x0_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[0]);
316 x1_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[0]);
317 x2_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[1]);
318 x3_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[1]);
319
320 /* Store Horz transform output into temp */
321 vst1_s16(pi2_tmp, x0_16x4);
322 vst1_s16(pi2_tmp + 4, x1_16x4);
323 vst1_s16(pi2_tmp + 8, x2_16x4);
324 vst1_s16(pi2_tmp + 12, x3_16x4);
325
326 /* vertical inverse transform */
327 rq1_16x4 = vshr_n_s16(x1_16x4, 1);
328 rq3_16x4 = vshr_n_s16(x3_16x4, 1);
329
330 xx0_16x4 = vadd_s16(x0_16x4, x2_16x4);
331 xx1_16x4 = vsub_s16(x0_16x4, x2_16x4);
332 xx2_16x4 = vsub_s16(rq1_16x4, x3_16x4);
333 xx3_16x4 = vadd_s16(x1_16x4, rq3_16x4);
334
335 x0_16x4 = vadd_s16(xx0_16x4, xx3_16x4);
336 x1_16x4 = vadd_s16(xx1_16x4, xx2_16x4);
337 x2_16x4 = vsub_s16(xx1_16x4, xx2_16x4);
338 x3_16x4 = vsub_s16(xx0_16x4, xx3_16x4);
339
340 x0_16x4 = vrshr_n_s16(x0_16x4, 6);
341 x1_16x4 = vrshr_n_s16(x1_16x4, 6);
342 x2_16x4 = vrshr_n_s16(x2_16x4, 6);
343 x3_16x4 = vrshr_n_s16(x3_16x4, 6);
344
345 /* Saturate all values < -255 to -255 and retain the rest as it is */
346 x0_16x4 = vmax_s16(x0_16x4, neg_255_16x4);
347 x1_16x4 = vmax_s16(x1_16x4, neg_255_16x4);
348 x2_16x4 = vmax_s16(x2_16x4, neg_255_16x4);
349 x3_16x4 = vmax_s16(x3_16x4, neg_255_16x4);
350
351 /* Saturate all values > 255 to 255 and retain the rest as it is */
352 x0_16x4 = vmin_s16(x0_16x4, pos_255_16x4);
353 x1_16x4 = vmin_s16(x1_16x4, pos_255_16x4);
354 x2_16x4 = vmin_s16(x2_16x4, pos_255_16x4);
355 x3_16x4 = vmin_s16(x3_16x4, pos_255_16x4);
356
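/* Store the clamped 16-bit residual rows to the residual buffer */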
357 vst1_s16(pi2_res, x0_16x4);
358 vst1_s16(pi2_res + i4_res_stride, x1_16x4);
359 vst1_s16(pi2_res + (i4_res_stride << 1), x2_16x4);
360 vst1_s16(pi2_res + (i4_res_stride << 1) + i4_res_stride, x3_16x4);
361
362 resd01_in = vcombine_s16(x0_16x4, x1_16x4);
363 resd23_in = vcombine_s16(x2_16x4, x3_16x4);
364
365 /* Load pred */
366 pred0_in = vld1_u8((uint8_t *) pu1_pred);
367 pred1_in = vld1_u8((uint8_t *) pu1_pred + (i4_pred_stride));
368 pred2_in = vld1_u8((uint8_t *) pu1_pred + (i4_pred_stride << 1));
369 pred3_in = vld1_u8((uint8_t *) pu1_pred + (i4_pred_stride * 3));
370
371 pred0 = vreinterpretq_s16_u16(vmovl_u8(pred0_in));
372 pred1 = vreinterpretq_s16_u16(vmovl_u8(pred1_in));
373 pred2 = vreinterpretq_s16_u16(vmovl_u8(pred2_in));
374 pred3 = vreinterpretq_s16_u16(vmovl_u8(pred3_in));
375
376 pred01_in = vcombine_s16(vget_low_s16(pred0), vget_low_s16(pred1));
377 pred23_in = vcombine_s16(vget_low_s16(pred2), vget_low_s16(pred3));
378
379 /* Out pixel = pred + res */
380 pred01_in = vaddq_s16(pred01_in, resd01_in);
381 pred23_in = vaddq_s16(pred23_in, resd23_in);
382
383 /* Convert to 8 bit unsigned with saturation */
384 pred01_un = vqmovun_s16(pred01_in);
385 pred23_un = vqmovun_s16(pred23_in);
386
387 vst1_lane_u32((uint32_t *) (pu1_out), vreinterpret_u32_u8(pred01_un), 0);
388 vst1_lane_u32((uint32_t *) (pu1_out + i4_out_stride), vreinterpret_u32_u8(pred01_un), 1);
389 vst1_lane_u32((uint32_t *) (pu1_out + (i4_out_stride << 1)), vreinterpret_u32_u8(pred23_un), 0);
390 vst1_lane_u32((uint32_t *) (pu1_out + ((i4_out_stride << 1) + i4_out_stride)),
391 vreinterpret_u32_u8(pred23_un), 1);
392 }
393
394 void isvc_iquant_itrans_recon_4x4_with_res_accumulate_neon(
395 buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_res_pred,
396 buffer_container_t *ps_res, buffer_container_t *ps_rec,
397 iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants, WORD16 *pi2_tmp, WORD16 *pi2_dc_src,
398 WORD32 i4_iq_start_idx, UWORD8 u1_res_accumulate)
399 {
400 WORD16 *pi2_src = (WORD16 *) ps_src->pv_data;
401 WORD16 *pi2_res = (WORD16 *) ps_res->pv_data;
402 WORD16 *pi2_res_pred = (WORD16 *) ps_res_pred->pv_data;
403 UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
404 UWORD8 *pu1_out = (UWORD8 *) ps_rec->pv_data;
405 WORD32 i4_res_stride = ps_res->i4_data_stride;
406 WORD32 i4_res_pred_stride = ps_res_pred->i4_data_stride;
407 WORD32 i4_pred_stride = ps_pred->i4_data_stride;
408 WORD32 i4_out_stride = ps_rec->i4_data_stride;
409 const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat;
410 const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat;
411 UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6;
412
413 int16x4x4_t src_16x4x2;
414 int16x4x4_t iscal_16x4x2;
415 int16x4x4_t weigh_16x4x2;
416
417 int16x4_t q0_16x4, q1_16x4, q2_16x4, q3_16x4;
418 int32x4_t q0_32x4, q1_32x4, q2_32x4, q3_32x4;
419 int16x4_t rq1_16x4, rq3_16x4;
420 int16x4_t x0_16x4, x1_16x4, x2_16x4, x3_16x4;
421 int16x4_t xx0_16x4, xx1_16x4, xx2_16x4, xx3_16x4;
422 int16x4x2_t xx0_16x4x2, xx1_16x4x2;
423 int32x2x2_t x0_32x2x2, x1_32x2x2;
424 int16x4_t weigh0_16x4, weigh1_16x4, weigh2_16x4, weigh3_16x4;
425
426 uint8x8_t pred0_in, pred1_in, pred2_in, pred3_in;
427 int16x8_t pred0, pred1, pred2, pred3;
428 int16x4_t resd0_in, resd1_in, resd2_in, resd3_in;
429 int16x8_t resd01_in, resd23_in;
430 int16x8_t pred01_in, pred23_in;
431 uint8x8_t pred01_un, pred23_un;
432
433 int32x4_t qp_div_6_32x4 = vdupq_n_s32(u4_qp_div_6);
434
435 WORD16 rnd_factor = (u4_qp_div_6 < 4) ? 1 << (3 - u4_qp_div_6) : 0;
436 int32x4_t rnd_fact = vdupq_n_s32(rnd_factor);
437 int16x4_t pos_255 = vdup_n_s16(((WORD16) UINT8_MAX));
438 int16x4_t neg_255 = vdup_n_s16(-((WORD16) UINT8_MAX));
439
440 UNUSED(u1_res_accumulate);
441
442 src_16x4x2 = vld4_s16(pi2_src);
443 iscal_16x4x2 = vld4_s16((const int16_t *) pu2_iscal_mat);
444 weigh_16x4x2 = vld4_s16((const int16_t *) pu2_weigh_mat);
445
446 weigh0_16x4 = vmul_s16(weigh_16x4x2.val[0], iscal_16x4x2.val[0]);
447 weigh1_16x4 = vmul_s16(weigh_16x4x2.val[1], iscal_16x4x2.val[1]);
448 weigh2_16x4 = vmul_s16(weigh_16x4x2.val[2], iscal_16x4x2.val[2]);
449 weigh3_16x4 = vmul_s16(weigh_16x4x2.val[3], iscal_16x4x2.val[3]);
450
451 q0_32x4 = vmull_s16(weigh0_16x4, src_16x4x2.val[0]);
452 q1_32x4 = vmull_s16(weigh1_16x4, src_16x4x2.val[1]);
453 q2_32x4 = vmull_s16(weigh2_16x4, src_16x4x2.val[2]);
454 q3_32x4 = vmull_s16(weigh3_16x4, src_16x4x2.val[3]);
455
456 q0_32x4 = vaddq_s32(q0_32x4, rnd_fact);
457 q1_32x4 = vaddq_s32(q1_32x4, rnd_fact);
458 q2_32x4 = vaddq_s32(q2_32x4, rnd_fact);
459 q3_32x4 = vaddq_s32(q3_32x4, rnd_fact);
460
461 q0_32x4 = vshlq_s32(q0_32x4, qp_div_6_32x4);
462 q1_32x4 = vshlq_s32(q1_32x4, qp_div_6_32x4);
463 q2_32x4 = vshlq_s32(q2_32x4, qp_div_6_32x4);
464 q3_32x4 = vshlq_s32(q3_32x4, qp_div_6_32x4);
465
466 q0_16x4 = vqshrn_n_s32(q0_32x4, 4);
467 q1_16x4 = vqshrn_n_s32(q1_32x4, 4);
468 q2_16x4 = vqshrn_n_s32(q2_32x4, 4);
469 q3_16x4 = vqshrn_n_s32(q3_32x4, 4);
470
471 if(i4_iq_start_idx == 1)
472 {
473 q0_16x4 = vset_lane_s16(pi2_dc_src[0], q0_16x4, 0);
474 }
475
476 rq1_16x4 = vshr_n_s16(q1_16x4, 1);
477 rq3_16x4 = vshr_n_s16(q3_16x4, 1);
478
479 x0_16x4 = vadd_s16(q0_16x4, q2_16x4);
480 x1_16x4 = vsub_s16(q0_16x4, q2_16x4);
481 x2_16x4 = vsub_s16(rq1_16x4, q3_16x4);
482 x3_16x4 = vadd_s16(q1_16x4, rq3_16x4);
483
484 xx0_16x4 = vadd_s16(x0_16x4, x3_16x4);
485 xx1_16x4 = vadd_s16(x1_16x4, x2_16x4);
486 xx2_16x4 = vsub_s16(x1_16x4, x2_16x4);
487 xx3_16x4 = vsub_s16(x0_16x4, x3_16x4);
488
489 /* Transpose the 4x4 block (rows 0 to 3) for the vertical pass */
490 xx0_16x4x2 = vtrn_s16(xx0_16x4, xx1_16x4);
491 xx1_16x4x2 = vtrn_s16(xx2_16x4, xx3_16x4);
492 x0_32x2x2 =
493 vzip_s32(vreinterpret_s32_s16(xx0_16x4x2.val[0]), vreinterpret_s32_s16(xx1_16x4x2.val[0]));
494 x1_32x2x2 =
495 vzip_s32(vreinterpret_s32_s16(xx0_16x4x2.val[1]), vreinterpret_s32_s16(xx1_16x4x2.val[1]));
496
497 x0_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[0]);
498 x1_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[0]);
499 x2_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[1]);
500 x3_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[1]);
501
502 /* Store Horz transform output into temp */
503 vst1_s16(pi2_tmp, x0_16x4);
504 vst1_s16(pi2_tmp + 4, x1_16x4);
505 vst1_s16(pi2_tmp + 8, x2_16x4);
506 vst1_s16(pi2_tmp + 12, x3_16x4);
507
508 /* vertical inverse transform */
509 rq1_16x4 = vshr_n_s16(x1_16x4, 1);
510 rq3_16x4 = vshr_n_s16(x3_16x4, 1);
511
512 xx0_16x4 = vadd_s16(x0_16x4, x2_16x4);
513 xx1_16x4 = vsub_s16(x0_16x4, x2_16x4);
514 xx2_16x4 = vsub_s16(rq1_16x4, x3_16x4);
515 xx3_16x4 = vadd_s16(x1_16x4, rq3_16x4);
516
517 x0_16x4 = vadd_s16(xx0_16x4, xx3_16x4);
518 x1_16x4 = vadd_s16(xx1_16x4, xx2_16x4);
519 x2_16x4 = vsub_s16(xx1_16x4, xx2_16x4);
520 x3_16x4 = vsub_s16(xx0_16x4, xx3_16x4);
521
522 x0_16x4 = vrshr_n_s16(x0_16x4, 6);
523 x1_16x4 = vrshr_n_s16(x1_16x4, 6);
524 x2_16x4 = vrshr_n_s16(x2_16x4, 6);
525 x3_16x4 = vrshr_n_s16(x3_16x4, 6);
526
527 /* Accumulate residual: add the residual prediction to the residual computed above */
528
529 /* Load Res pred */
530 resd0_in = vld1_s16((int16_t *) pi2_res_pred);
531 resd1_in = vld1_s16((int16_t *) pi2_res_pred + i4_res_pred_stride);
532 resd2_in = vld1_s16((int16_t *) pi2_res_pred + (i4_res_pred_stride * 2));
533 resd3_in = vld1_s16((int16_t *) pi2_res_pred + (i4_res_pred_stride * 3));
534
535 /* Add res pred with res obtained */
536 resd0_in = vadd_s16(resd0_in, x0_16x4);
537 resd1_in = vadd_s16(resd1_in, x1_16x4);
538 resd2_in = vadd_s16(resd2_in, x2_16x4);
539 resd3_in = vadd_s16(resd3_in, x3_16x4);
540
541 /* Saturate all values < -255 to -255 and retain the rest as it is */
542 resd0_in = vmax_s16(resd0_in, neg_255);
543 resd1_in = vmax_s16(resd1_in, neg_255);
544 resd2_in = vmax_s16(resd2_in, neg_255);
545 resd3_in = vmax_s16(resd3_in, neg_255);
546
547 /* Saturate all values > 255 to 255 and retain the rest as it is */
548 resd0_in = vmin_s16(resd0_in, pos_255);
549 resd1_in = vmin_s16(resd1_in, pos_255);
550 resd2_in = vmin_s16(resd2_in, pos_255);
551 resd3_in = vmin_s16(resd3_in, pos_255);
552
553 vst1_s16(pi2_res, resd0_in);
554 vst1_s16(pi2_res + i4_res_stride, resd1_in);
555 vst1_s16(pi2_res + (i4_res_stride << 1), resd2_in);
556 vst1_s16(pi2_res + (i4_res_stride << 1) + i4_res_stride, resd3_in);
557
558 resd01_in = vcombine_s16(resd0_in, resd1_in);
559 resd23_in = vcombine_s16(resd2_in, resd3_in);
560
561 /* Load pred */
562 pred0_in = vld1_u8((uint8_t *) pu1_pred);
563 pred1_in = vld1_u8((uint8_t *) pu1_pred + (i4_pred_stride));
564 pred2_in = vld1_u8((uint8_t *) pu1_pred + (i4_pred_stride << 1));
565 pred3_in = vld1_u8((uint8_t *) pu1_pred + (i4_pred_stride * 3));
566
567 pred0 = vreinterpretq_s16_u16(vmovl_u8(pred0_in));
568 pred1 = vreinterpretq_s16_u16(vmovl_u8(pred1_in));
569 pred2 = vreinterpretq_s16_u16(vmovl_u8(pred2_in));
570 pred3 = vreinterpretq_s16_u16(vmovl_u8(pred3_in));
571
572 pred01_in = vcombine_s16(vget_low_s16(pred0), vget_low_s16(pred1));
573 pred23_in = vcombine_s16(vget_low_s16(pred2), vget_low_s16(pred3));
574
575 /* Out pixel = pred + res */
576 pred01_in = vaddq_s16(pred01_in, resd01_in);
577 pred23_in = vaddq_s16(pred23_in, resd23_in);
578
579 /* Convert to 8 bit unsigned with saturation */
580 pred01_un = vqmovun_s16(pred01_in);
581 pred23_un = vqmovun_s16(pred23_in);
582
583 vst1_lane_u32((uint32_t *) (pu1_out), vreinterpret_u32_u8(pred01_un), 0);
584 vst1_lane_u32((uint32_t *) (pu1_out + i4_out_stride), vreinterpret_u32_u8(pred01_un), 1);
585 vst1_lane_u32((uint32_t *) (pu1_out + (i4_out_stride << 1)), vreinterpret_u32_u8(pred23_un), 0);
586 vst1_lane_u32((uint32_t *) (pu1_out + ((i4_out_stride << 1) + i4_out_stride)),
587 vreinterpret_u32_u8(pred23_un), 1);
588 }
589
590 void isvc_iquant_itrans_recon_chroma_4x4_neon(
591 buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_res_pred,
592 buffer_container_t *ps_res, buffer_container_t *ps_rec,
593 iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants, WORD16 *pi2_tmp, WORD16 *pi2_dc_src,
594 WORD32 i4_iq_start_idx, UWORD8 u1_res_accumulate)
595 {
596 WORD16 *pi2_src = (WORD16 *) ps_src->pv_data;
597 UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
598 UWORD8 *pu1_out = (UWORD8 *) ps_rec->pv_data;
599 WORD32 i4_pred_stride = ps_pred->i4_data_stride;
600 WORD32 i4_out_stride = ps_rec->i4_data_stride;
601 const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat;
602 const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat;
603 UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6;
604
605 WORD16 i2_rnd_factor = (u4_qp_div_6 < 4) ? 1 << (3 - u4_qp_div_6) : 0;
606
607 int16x4x4_t src_16x4x2;
608 int16x4x4_t iscal_16x4x2;
609 int16x4x4_t weigh_16x4x2;
610
611 int16x4_t q0_16x4, q1_16x4, q2_16x4, q3_16x4;
612 int32x4_t q0_32x4, q1_32x4, q2_32x4, q3_32x4;
613 int16x4_t rq1_16x4, rq3_16x4;
614 int16x4_t x0_16x4, x1_16x4, x2_16x4, x3_16x4;
615 int16x8_t x0_16x8, x1_16x8, x2_16x8, x3_16x8;
616 int16x4_t xx0_16x4, xx1_16x4, xx2_16x4, xx3_16x4;
617 int16x4x2_t xx0_16x4x2, xx1_16x4x2;
618 int32x2x2_t x0_32x2x2, x1_32x2x2;
619 int16x4_t weigh0_16x4, weigh1_16x4, weigh2_16x4, weigh3_16x4;
620
621 uint8x8_t pred0_in, pred1_in, pred2_in, pred3_in;
622 int16x8_t pred0, pred1, pred2, pred3;
623 int16x8_t rec0, rec1, rec2, rec3;
624 uint8x8_t rec0_un, rec1_un, rec2_un, rec3_un;
625 uint8x8_t out0, out1, out2, out3;
626
627 uint8x8_t chroma_mask_8x8 = vreinterpret_u8_u16(vdup_n_u16(0x00ff));
628
629 int16x4_t pos_255_16x4 = vdup_n_s16(((WORD16) UINT8_MAX));
630 int16x4_t neg_255_16x4 = vdup_n_s16(-((WORD16) UINT8_MAX));
631 int32x4_t qp_div_6_32x4 = vdupq_n_s32(u4_qp_div_6);
632 int32x4_t rnd_fact = vdupq_n_s32(i2_rnd_factor);
633
634 UNUSED(i4_iq_start_idx);
635 UNUSED(ps_res);
636 UNUSED(ps_res_pred);
637 UNUSED(u1_res_accumulate);
638
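/* Chroma samples are interleaved with the other plane, so results are written only to alternate positions using the chroma mask defined above */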
639 src_16x4x2 = vld4_s16(pi2_src);
640 iscal_16x4x2 = vld4_s16((const int16_t *) pu2_iscal_mat);
641 weigh_16x4x2 = vld4_s16((const int16_t *) pu2_weigh_mat);
642
643 weigh0_16x4 = vmul_s16(weigh_16x4x2.val[0], iscal_16x4x2.val[0]);
644 weigh1_16x4 = vmul_s16(weigh_16x4x2.val[1], iscal_16x4x2.val[1]);
645 weigh2_16x4 = vmul_s16(weigh_16x4x2.val[2], iscal_16x4x2.val[2]);
646 weigh3_16x4 = vmul_s16(weigh_16x4x2.val[3], iscal_16x4x2.val[3]);
647
648 q0_32x4 = vmull_s16(weigh0_16x4, src_16x4x2.val[0]);
649 q1_32x4 = vmull_s16(weigh1_16x4, src_16x4x2.val[1]);
650 q2_32x4 = vmull_s16(weigh2_16x4, src_16x4x2.val[2]);
651 q3_32x4 = vmull_s16(weigh3_16x4, src_16x4x2.val[3]);
652
653 q0_32x4 = vaddq_s32(q0_32x4, rnd_fact);
654 q1_32x4 = vaddq_s32(q1_32x4, rnd_fact);
655 q2_32x4 = vaddq_s32(q2_32x4, rnd_fact);
656 q3_32x4 = vaddq_s32(q3_32x4, rnd_fact);
657
658 q0_32x4 = vshlq_s32(q0_32x4, qp_div_6_32x4);
659 q1_32x4 = vshlq_s32(q1_32x4, qp_div_6_32x4);
660 q2_32x4 = vshlq_s32(q2_32x4, qp_div_6_32x4);
661 q3_32x4 = vshlq_s32(q3_32x4, qp_div_6_32x4);
662
663 q0_16x4 = vqshrn_n_s32(q0_32x4, 4);
664 q1_16x4 = vqshrn_n_s32(q1_32x4, 4);
665 q2_16x4 = vqshrn_n_s32(q2_32x4, 4);
666 q3_16x4 = vqshrn_n_s32(q3_32x4, 4);
667
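/* The chroma DC coefficient always comes from pi2_dc_src; i4_iq_start_idx is unused here */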
668 q0_16x4 = vset_lane_s16(pi2_dc_src[0], q0_16x4, 0);
669
670 rq1_16x4 = vshr_n_s16(q1_16x4, 1);
671 rq3_16x4 = vshr_n_s16(q3_16x4, 1);
672
673 x0_16x4 = vadd_s16(q0_16x4, q2_16x4);
674 x1_16x4 = vsub_s16(q0_16x4, q2_16x4);
675 x2_16x4 = vsub_s16(rq1_16x4, q3_16x4);
676 x3_16x4 = vadd_s16(q1_16x4, rq3_16x4);
677
678 xx0_16x4 = vadd_s16(x0_16x4, x3_16x4);
679 xx1_16x4 = vadd_s16(x1_16x4, x2_16x4);
680 xx2_16x4 = vsub_s16(x1_16x4, x2_16x4);
681 xx3_16x4 = vsub_s16(x0_16x4, x3_16x4);
682
683 /* Transpose the 4x4 block (rows 0 to 3) for the vertical pass */
684 xx0_16x4x2 = vtrn_s16(xx0_16x4, xx1_16x4);
685 xx1_16x4x2 = vtrn_s16(xx2_16x4, xx3_16x4);
686 x0_32x2x2 =
687 vzip_s32(vreinterpret_s32_s16(xx0_16x4x2.val[0]), vreinterpret_s32_s16(xx1_16x4x2.val[0]));
688 x1_32x2x2 =
689 vzip_s32(vreinterpret_s32_s16(xx0_16x4x2.val[1]), vreinterpret_s32_s16(xx1_16x4x2.val[1]));
690
691 x0_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[0]);
692 x1_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[0]);
693 x2_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[1]);
694 x3_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[1]);
695
696 /* Store Horz transform output into temp */
697 vst1_s16(pi2_tmp, x0_16x4);
698 vst1_s16(pi2_tmp + 4, x1_16x4);
699 vst1_s16(pi2_tmp + 8, x2_16x4);
700 vst1_s16(pi2_tmp + 12, x3_16x4);
701
702 /* vertical inverse transform */
703 rq1_16x4 = vshr_n_s16(x1_16x4, 1);
704 rq3_16x4 = vshr_n_s16(x3_16x4, 1);
705
706 xx0_16x4 = vadd_s16(x0_16x4, x2_16x4);
707 xx1_16x4 = vsub_s16(x0_16x4, x2_16x4);
708 xx2_16x4 = vsub_s16(rq1_16x4, x3_16x4);
709 xx3_16x4 = vadd_s16(x1_16x4, rq3_16x4);
710
711 x0_16x4 = vadd_s16(xx0_16x4, xx3_16x4);
712 x1_16x4 = vadd_s16(xx1_16x4, xx2_16x4);
713 x2_16x4 = vsub_s16(xx1_16x4, xx2_16x4);
714 x3_16x4 = vsub_s16(xx0_16x4, xx3_16x4);
715
716 x0_16x4 = vrshr_n_s16(x0_16x4, 6);
717 x1_16x4 = vrshr_n_s16(x1_16x4, 6);
718 x2_16x4 = vrshr_n_s16(x2_16x4, 6);
719 x3_16x4 = vrshr_n_s16(x3_16x4, 6);
720
721 /* Saturate all values < -255 to -255 and retain the rest as it is */
722 x0_16x4 = vmax_s16(x0_16x4, neg_255_16x4);
723 x1_16x4 = vmax_s16(x1_16x4, neg_255_16x4);
724 x2_16x4 = vmax_s16(x2_16x4, neg_255_16x4);
725 x3_16x4 = vmax_s16(x3_16x4, neg_255_16x4);
726
727 /* Saturate all values > 255 to 255 and retain the rest as it is */
728 x0_16x4 = vmin_s16(x0_16x4, pos_255_16x4);
729 x1_16x4 = vmin_s16(x1_16x4, pos_255_16x4);
730 x2_16x4 = vmin_s16(x2_16x4, pos_255_16x4);
731 x3_16x4 = vmin_s16(x3_16x4, pos_255_16x4);
732
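/* Sign-extend the residuals to 32 bits so that, viewed as 16-bit lanes, they occupy alternate positions; the remaining lanes are discarded by the masked selects below */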
733 x0_16x8 = vreinterpretq_s16_s32(vmovl_s16(x0_16x4));
734 x1_16x8 = vreinterpretq_s16_s32(vmovl_s16(x1_16x4));
735 x2_16x8 = vreinterpretq_s16_s32(vmovl_s16(x2_16x4));
736 x3_16x8 = vreinterpretq_s16_s32(vmovl_s16(x3_16x4));
737
738 pred0_in = vld1_u8((uint8_t *) pu1_pred);
739 pred1_in = vld1_u8((uint8_t *) pu1_pred + (i4_pred_stride));
740 pred2_in = vld1_u8((uint8_t *) pu1_pred + (i4_pred_stride << 1));
741 pred3_in = vld1_u8((uint8_t *) pu1_pred + (i4_pred_stride * 3));
742
743 pred0 = vreinterpretq_s16_u16(vmovl_u8(pred0_in));
744 pred1 = vreinterpretq_s16_u16(vmovl_u8(pred1_in));
745 pred2 = vreinterpretq_s16_u16(vmovl_u8(pred2_in));
746 pred3 = vreinterpretq_s16_u16(vmovl_u8(pred3_in));
747
748 /* Out pixel = pred + res */
749 rec0 = vaddq_s16(pred0, x0_16x8);
750 rec1 = vaddq_s16(pred1, x1_16x8);
751 rec2 = vaddq_s16(pred2, x2_16x8);
752 rec3 = vaddq_s16(pred3, x3_16x8);
753
754 out0 = vld1_u8(pu1_out);
755 out1 = vld1_u8(pu1_out + i4_out_stride);
756 out2 = vld1_u8(pu1_out + i4_out_stride * 2);
757 out3 = vld1_u8(pu1_out + i4_out_stride * 3);
758
759 /* Convert to 8 bit unsigned with saturation */
760 rec0_un = vqmovun_s16(rec0);
761 rec1_un = vqmovun_s16(rec1);
762 rec2_un = vqmovun_s16(rec2);
763 rec3_un = vqmovun_s16(rec3);
764
765 /* Store in alternate positions */
766 out0 = vbsl_u8(chroma_mask_8x8, rec0_un, out0);
767 out1 = vbsl_u8(chroma_mask_8x8, rec1_un, out1);
768 out2 = vbsl_u8(chroma_mask_8x8, rec2_un, out2);
769 out3 = vbsl_u8(chroma_mask_8x8, rec3_un, out3);
770
771 vst1_u8((pu1_out), out0);
772 vst1_u8((pu1_out + i4_out_stride), out1);
773 vst1_u8((pu1_out + (i4_out_stride << 1)), out2);
774 vst1_u8((pu1_out + ((i4_out_stride << 1) + i4_out_stride)), out3);
775 }
776
777 void isvc_iquant_itrans_recon_chroma_4x4_with_res_output_neon(
778 buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_res_pred,
779 buffer_container_t *ps_res, buffer_container_t *ps_rec,
780 iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants, WORD16 *pi2_tmp, WORD16 *pi2_dc_src,
781 WORD32 i4_iq_start_idx, UWORD8 u1_res_accumulate)
782 {
783 WORD16 *pi2_src = (WORD16 *) ps_src->pv_data;
784 WORD16 *pi2_res = (WORD16 *) ps_res->pv_data;
785 UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
786 UWORD8 *pu1_out = (UWORD8 *) ps_rec->pv_data;
787 WORD32 i4_res_stride = ps_res->i4_data_stride;
788 WORD32 i4_pred_stride = ps_pred->i4_data_stride;
789 WORD32 i4_out_stride = ps_rec->i4_data_stride;
790 const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat;
791 const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat;
792 UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6;
793
794 WORD16 i2_rnd_factor = (u4_qp_div_6 < 4) ? 1 << (3 - u4_qp_div_6) : 0;
795
796 int16x4x4_t src_16x4x2;
797 int16x4x4_t iscal_16x4x2;
798 int16x4x4_t weigh_16x4x2;
799
800 int16x4_t q0_16x4, q1_16x4, q2_16x4, q3_16x4;
801 int32x4_t q0_32x4, q1_32x4, q2_32x4, q3_32x4;
802 int16x4_t rq1_16x4, rq3_16x4;
803 int16x4_t x0_16x4, x1_16x4, x2_16x4, x3_16x4;
804 int16x8_t x0_16x8, x1_16x8, x2_16x8, x3_16x8;
805 int16x4_t xx0_16x4, xx1_16x4, xx2_16x4, xx3_16x4;
806 int16x4x2_t xx0_16x4x2, xx1_16x4x2;
807 int32x2x2_t x0_32x2x2, x1_32x2x2;
808 int16x4_t weigh0_16x4, weigh1_16x4, weigh2_16x4, weigh3_16x4;
809
810 uint8x8_t pred0_in, pred1_in, pred2_in, pred3_in;
811 int16x8_t pred0, pred1, pred2, pred3;
812 int16x8_t rec0, rec1, rec2, rec3;
813 uint8x8_t rec0_un, rec1_un, rec2_un, rec3_un;
814 uint8x8_t out0, out1, out2, out3;
815 int16x8_t resout0, resout1, resout2, resout3;
816
817 uint8x8_t chroma_mask_8x8 = vreinterpret_u8_u16(vdup_n_u16(0x00ff));
818 uint16x8_t chroma_mask_16x8 = {0xffff, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000};
819 int32x4_t qp_div_6_32x4 = vdupq_n_s32(u4_qp_div_6);
820 int32x4_t rnd_fact = vdupq_n_s32(i2_rnd_factor);
821 int16x4_t pos_255_16x4 = vdup_n_s16(((WORD16) UINT8_MAX));
822 int16x4_t neg_255_16x4 = vdup_n_s16(-((WORD16) UINT8_MAX));
823
824 UNUSED(i4_iq_start_idx);
825 UNUSED(ps_res_pred);
826 UNUSED(u1_res_accumulate);
827
828 src_16x4x2 = vld4_s16(pi2_src);
829 iscal_16x4x2 = vld4_s16((const int16_t *) pu2_iscal_mat);
830 weigh_16x4x2 = vld4_s16((const int16_t *) pu2_weigh_mat);
831
832 weigh0_16x4 = vmul_s16(weigh_16x4x2.val[0], iscal_16x4x2.val[0]);
833 weigh1_16x4 = vmul_s16(weigh_16x4x2.val[1], iscal_16x4x2.val[1]);
834 weigh2_16x4 = vmul_s16(weigh_16x4x2.val[2], iscal_16x4x2.val[2]);
835 weigh3_16x4 = vmul_s16(weigh_16x4x2.val[3], iscal_16x4x2.val[3]);
836
837 q0_32x4 = vmull_s16(weigh0_16x4, src_16x4x2.val[0]);
838 q1_32x4 = vmull_s16(weigh1_16x4, src_16x4x2.val[1]);
839 q2_32x4 = vmull_s16(weigh2_16x4, src_16x4x2.val[2]);
840 q3_32x4 = vmull_s16(weigh3_16x4, src_16x4x2.val[3]);
841
842 q0_32x4 = vaddq_s32(q0_32x4, rnd_fact);
843 q1_32x4 = vaddq_s32(q1_32x4, rnd_fact);
844 q2_32x4 = vaddq_s32(q2_32x4, rnd_fact);
845 q3_32x4 = vaddq_s32(q3_32x4, rnd_fact);
846
847 q0_32x4 = vshlq_s32(q0_32x4, qp_div_6_32x4);
848 q1_32x4 = vshlq_s32(q1_32x4, qp_div_6_32x4);
849 q2_32x4 = vshlq_s32(q2_32x4, qp_div_6_32x4);
850 q3_32x4 = vshlq_s32(q3_32x4, qp_div_6_32x4);
851
852 q0_16x4 = vqshrn_n_s32(q0_32x4, 4);
853 q1_16x4 = vqshrn_n_s32(q1_32x4, 4);
854 q2_16x4 = vqshrn_n_s32(q2_32x4, 4);
855 q3_16x4 = vqshrn_n_s32(q3_32x4, 4);
856
857 q0_16x4 = vset_lane_s16(pi2_dc_src[0], q0_16x4, 0);
858
859 rq1_16x4 = vshr_n_s16(q1_16x4, 1);
860 rq3_16x4 = vshr_n_s16(q3_16x4, 1);
861
862 x0_16x4 = vadd_s16(q0_16x4, q2_16x4);
863 x1_16x4 = vsub_s16(q0_16x4, q2_16x4);
864 x2_16x4 = vsub_s16(rq1_16x4, q3_16x4);
865 x3_16x4 = vadd_s16(q1_16x4, rq3_16x4);
866
867 xx0_16x4 = vadd_s16(x0_16x4, x3_16x4);
868 xx1_16x4 = vadd_s16(x1_16x4, x2_16x4);
869 xx2_16x4 = vsub_s16(x1_16x4, x2_16x4);
870 xx3_16x4 = vsub_s16(x0_16x4, x3_16x4);
871
872 /* Transpose the 4x4 block (rows 0 to 3) for the vertical pass */
873 xx0_16x4x2 = vtrn_s16(xx0_16x4, xx1_16x4);
874 xx1_16x4x2 = vtrn_s16(xx2_16x4, xx3_16x4);
875 x0_32x2x2 =
876 vzip_s32(vreinterpret_s32_s16(xx0_16x4x2.val[0]), vreinterpret_s32_s16(xx1_16x4x2.val[0]));
877 x1_32x2x2 =
878 vzip_s32(vreinterpret_s32_s16(xx0_16x4x2.val[1]), vreinterpret_s32_s16(xx1_16x4x2.val[1]));
879
880 x0_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[0]);
881 x1_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[0]);
882 x2_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[1]);
883 x3_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[1]);
884
885 /* Store Horz transform output into temp */
886 vst1_s16(pi2_tmp, x0_16x4);
887 vst1_s16(pi2_tmp + 4, x1_16x4);
888 vst1_s16(pi2_tmp + 8, x2_16x4);
889 vst1_s16(pi2_tmp + 12, x3_16x4);
890
891 /* vertical inverse transform */
892 rq1_16x4 = vshr_n_s16(x1_16x4, 1);
893 rq3_16x4 = vshr_n_s16(x3_16x4, 1);
894
895 xx0_16x4 = vadd_s16(x0_16x4, x2_16x4);
896 xx1_16x4 = vsub_s16(x0_16x4, x2_16x4);
897 xx2_16x4 = vsub_s16(rq1_16x4, x3_16x4);
898 xx3_16x4 = vadd_s16(x1_16x4, rq3_16x4);
899
900 x0_16x4 = vadd_s16(xx0_16x4, xx3_16x4);
901 x1_16x4 = vadd_s16(xx1_16x4, xx2_16x4);
902 x2_16x4 = vsub_s16(xx1_16x4, xx2_16x4);
903 x3_16x4 = vsub_s16(xx0_16x4, xx3_16x4);
904
905 x0_16x4 = vrshr_n_s16(x0_16x4, 6);
906 x1_16x4 = vrshr_n_s16(x1_16x4, 6);
907 x2_16x4 = vrshr_n_s16(x2_16x4, 6);
908 x3_16x4 = vrshr_n_s16(x3_16x4, 6);
909
910 /* Saturate all values < -255 to -255 and retain the rest as it is */
911 x0_16x4 = vmax_s16(x0_16x4, neg_255_16x4);
912 x1_16x4 = vmax_s16(x1_16x4, neg_255_16x4);
913 x2_16x4 = vmax_s16(x2_16x4, neg_255_16x4);
914 x3_16x4 = vmax_s16(x3_16x4, neg_255_16x4);
915
916 /* Saturate all values > 255 to 255 and retain the rest as it is */
917 x0_16x4 = vmin_s16(x0_16x4, pos_255_16x4);
918 x1_16x4 = vmin_s16(x1_16x4, pos_255_16x4);
919 x2_16x4 = vmin_s16(x2_16x4, pos_255_16x4);
920 x3_16x4 = vmin_s16(x3_16x4, pos_255_16x4);
921
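/* Read-modify-write of the interleaved residual buffer: load existing rows, blend new residuals into alternate positions, store back */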
922 resout0 = vld1q_s16(pi2_res);
923 resout1 = vld1q_s16(pi2_res + i4_res_stride);
924 resout2 = vld1q_s16(pi2_res + i4_res_stride * 2);
925 resout3 = vld1q_s16(pi2_res + i4_res_stride * 3);
926
927 x0_16x8 = vreinterpretq_s16_s32(vmovl_s16(x0_16x4));
928 x1_16x8 = vreinterpretq_s16_s32(vmovl_s16(x1_16x4));
929 x2_16x8 = vreinterpretq_s16_s32(vmovl_s16(x2_16x4));
930 x3_16x8 = vreinterpretq_s16_s32(vmovl_s16(x3_16x4));
931
932 /* Storing res in alternate positions */
933 resout0 = vbslq_s16(chroma_mask_16x8, x0_16x8, resout0);
934 resout1 = vbslq_s16(chroma_mask_16x8, x1_16x8, resout1);
935 resout2 = vbslq_s16(chroma_mask_16x8, x2_16x8, resout2);
936 resout3 = vbslq_s16(chroma_mask_16x8, x3_16x8, resout3);
937
938 vst1q_s16(pi2_res, resout0);
939 vst1q_s16(pi2_res + i4_res_stride, resout1);
940 vst1q_s16(pi2_res + (i4_res_stride << 1), resout2);
941 vst1q_s16(pi2_res + (i4_res_stride << 1) + i4_res_stride, resout3);
942
943 pred0_in = vld1_u8((uint8_t *) pu1_pred);
944 pred1_in = vld1_u8((uint8_t *) pu1_pred + (i4_pred_stride));
945 pred2_in = vld1_u8((uint8_t *) pu1_pred + (i4_pred_stride << 1));
946 pred3_in = vld1_u8((uint8_t *) pu1_pred + (i4_pred_stride * 3));
947
948 pred0 = vreinterpretq_s16_u16(vmovl_u8(pred0_in));
949 pred1 = vreinterpretq_s16_u16(vmovl_u8(pred1_in));
950 pred2 = vreinterpretq_s16_u16(vmovl_u8(pred2_in));
951 pred3 = vreinterpretq_s16_u16(vmovl_u8(pred3_in));
952
953 /* Out pixel = pred + res */
954 rec0 = vaddq_s16(pred0, x0_16x8);
955 rec1 = vaddq_s16(pred1, x1_16x8);
956 rec2 = vaddq_s16(pred2, x2_16x8);
957 rec3 = vaddq_s16(pred3, x3_16x8);
958
959 out0 = vld1_u8(pu1_out);
960 out1 = vld1_u8(pu1_out + i4_out_stride);
961 out2 = vld1_u8(pu1_out + i4_out_stride * 2);
962 out3 = vld1_u8(pu1_out + i4_out_stride * 3);
963
964 /* Convert to 8 bit unsigned with saturation */
965 rec0_un = vqmovun_s16(rec0);
966 rec1_un = vqmovun_s16(rec1);
967 rec2_un = vqmovun_s16(rec2);
968 rec3_un = vqmovun_s16(rec3);
969
970 /* Store output pixels in alternate positions */
971 out0 = vbsl_u8(chroma_mask_8x8, rec0_un, out0);
972 out1 = vbsl_u8(chroma_mask_8x8, rec1_un, out1);
973 out2 = vbsl_u8(chroma_mask_8x8, rec2_un, out2);
974 out3 = vbsl_u8(chroma_mask_8x8, rec3_un, out3);
975
976 vst1_u8((pu1_out), out0);
977 vst1_u8((pu1_out + i4_out_stride), out1);
978 vst1_u8((pu1_out + (i4_out_stride << 1)), out2);
979 vst1_u8((pu1_out + ((i4_out_stride << 1) + i4_out_stride)), out3);
980 }
981
982 void isvc_iquant_itrans_recon_chroma_4x4_with_res_accumulate_neon(
983 buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_res_pred,
984 buffer_container_t *ps_res, buffer_container_t *ps_rec,
985 iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants, WORD16 *pi2_tmp, WORD16 *pi2_dc_src,
986 WORD32 i4_iq_start_idx, UWORD8 u1_res_accumulate)
987 {
988 WORD16 *pi2_src = (WORD16 *) ps_src->pv_data;
989 WORD16 *pi2_res = (WORD16 *) ps_res->pv_data;
990 WORD16 *pi2_res_pred = (WORD16 *) ps_res_pred->pv_data;
991 UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
992 UWORD8 *pu1_out = (UWORD8 *) ps_rec->pv_data;
993 WORD32 i4_res_stride = ps_res->i4_data_stride;
994 WORD32 i4_res_pred_stride = ps_res_pred->i4_data_stride;
995 WORD32 i4_pred_stride = ps_pred->i4_data_stride;
996 WORD32 i4_out_stride = ps_rec->i4_data_stride;
997 const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat;
998 const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat;
999 UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6;
1000
1001 WORD16 i2_rnd_factor = (u4_qp_div_6 < 4) ? 1 << (3 - u4_qp_div_6) : 0;
1002
1003 int16x4x4_t src_16x4x2;
1004 int16x4x4_t iscal_16x4x2;
1005 int16x4x4_t weigh_16x4x2;
1006
1007 int16x4_t q0_16x4, q1_16x4, q2_16x4, q3_16x4;
1008 int32x4_t q0_32x4, q1_32x4, q2_32x4, q3_32x4;
1009 int16x4_t rq1_16x4, rq3_16x4;
1010 int16x4_t x0_16x4, x1_16x4, x2_16x4, x3_16x4;
1011 int16x8_t x0_16x8, x1_16x8, x2_16x8, x3_16x8;
1012 int16x4_t xx0_16x4, xx1_16x4, xx2_16x4, xx3_16x4;
1013 int16x4x2_t xx0_16x4x2, xx1_16x4x2;
1014 int32x2x2_t x0_32x2x2, x1_32x2x2;
1015 int16x4_t weigh0_16x4, weigh1_16x4, weigh2_16x4, weigh3_16x4;
1016
1017 uint8x8_t pred0_in, pred1_in, pred2_in, pred3_in;
1018 int16x8_t pred0, pred1, pred2, pred3;
1019 int16x8_t rec0, rec1, rec2, rec3;
1020 uint8x8_t rec0_un, rec1_un, rec2_un, rec3_un;
1021 int16x8_t resd0_in, resd1_in, resd2_in, resd3_in;
1022 int16x8_t resd1_in_mask, resd2_in_mask, resd3_in_mask;
1023 uint8x8_t out0, out1, out2, out3;
1024 int16x8_t resout0, resout1, resout2, resout3;
1025 int16x8_t pos_255 = vdupq_n_s16(((WORD16) UINT8_MAX));
1026 int16x8_t neg_255 = vdupq_n_s16(-((WORD16) UINT8_MAX));
1027
1028 uint8x8_t chroma_mask_8x8 = vreinterpret_u8_u16(vdup_n_u16(0x00ff));
1029 uint16x8_t chroma_mask_16x8 = {0xffff, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000};
1030
1031 int32x4_t qp_div_6_32x4 = vdupq_n_s32(u4_qp_div_6);
1032 int32x4_t rnd_fact = vdupq_n_s32(i2_rnd_factor);
1033
1034 int16x8_t resd0_in_mask = {0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000};
1035
1036 UNUSED(i4_iq_start_idx);
1037 UNUSED(u1_res_accumulate);
1038
1039 resd1_in_mask = resd0_in_mask;
1040 resd2_in_mask = resd0_in_mask;
1041 resd3_in_mask = resd0_in_mask;
1042
1043 src_16x4x2 = vld4_s16(pi2_src);
1044 iscal_16x4x2 = vld4_s16((const int16_t *) pu2_iscal_mat);
1045 weigh_16x4x2 = vld4_s16((const int16_t *) pu2_weigh_mat);
1046
1047 weigh0_16x4 = vmul_s16(weigh_16x4x2.val[0], iscal_16x4x2.val[0]);
1048 weigh1_16x4 = vmul_s16(weigh_16x4x2.val[1], iscal_16x4x2.val[1]);
1049 weigh2_16x4 = vmul_s16(weigh_16x4x2.val[2], iscal_16x4x2.val[2]);
1050 weigh3_16x4 = vmul_s16(weigh_16x4x2.val[3], iscal_16x4x2.val[3]);
1051
1052 q0_32x4 = vmull_s16(weigh0_16x4, src_16x4x2.val[0]);
1053 q1_32x4 = vmull_s16(weigh1_16x4, src_16x4x2.val[1]);
1054 q2_32x4 = vmull_s16(weigh2_16x4, src_16x4x2.val[2]);
1055 q3_32x4 = vmull_s16(weigh3_16x4, src_16x4x2.val[3]);
1056
1057 q0_32x4 = vaddq_s32(q0_32x4, rnd_fact);
1058 q1_32x4 = vaddq_s32(q1_32x4, rnd_fact);
1059 q2_32x4 = vaddq_s32(q2_32x4, rnd_fact);
1060 q3_32x4 = vaddq_s32(q3_32x4, rnd_fact);
1061
1062 q0_32x4 = vshlq_s32(q0_32x4, qp_div_6_32x4);
1063 q1_32x4 = vshlq_s32(q1_32x4, qp_div_6_32x4);
1064 q2_32x4 = vshlq_s32(q2_32x4, qp_div_6_32x4);
1065 q3_32x4 = vshlq_s32(q3_32x4, qp_div_6_32x4);
1066
1067 q0_16x4 = vqshrn_n_s32(q0_32x4, 4);
1068 q1_16x4 = vqshrn_n_s32(q1_32x4, 4);
1069 q2_16x4 = vqshrn_n_s32(q2_32x4, 4);
1070 q3_16x4 = vqshrn_n_s32(q3_32x4, 4);
1071
1072 q0_16x4 = vset_lane_s16(pi2_dc_src[0], q0_16x4, 0);
1073
1074 rq1_16x4 = vshr_n_s16(q1_16x4, 1);
1075 rq3_16x4 = vshr_n_s16(q3_16x4, 1);
1076
1077 x0_16x4 = vadd_s16(q0_16x4, q2_16x4);
1078 x1_16x4 = vsub_s16(q0_16x4, q2_16x4);
1079 x2_16x4 = vsub_s16(rq1_16x4, q3_16x4);
1080 x3_16x4 = vadd_s16(q1_16x4, rq3_16x4);
1081
1082 xx0_16x4 = vadd_s16(x0_16x4, x3_16x4);
1083 xx1_16x4 = vadd_s16(x1_16x4, x2_16x4);
1084 xx2_16x4 = vsub_s16(x1_16x4, x2_16x4);
1085 xx3_16x4 = vsub_s16(x0_16x4, x3_16x4);
1086
1087 /* Transpose the 4x4 block (rows 0 to 3) for the vertical pass */
1088 xx0_16x4x2 = vtrn_s16(xx0_16x4, xx1_16x4);
1089 xx1_16x4x2 = vtrn_s16(xx2_16x4, xx3_16x4);
1090 x0_32x2x2 =
1091 vzip_s32(vreinterpret_s32_s16(xx0_16x4x2.val[0]), vreinterpret_s32_s16(xx1_16x4x2.val[0]));
1092 x1_32x2x2 =
1093 vzip_s32(vreinterpret_s32_s16(xx0_16x4x2.val[1]), vreinterpret_s32_s16(xx1_16x4x2.val[1]));
1094
1095 x0_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[0]);
1096 x1_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[0]);
1097 x2_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[1]);
1098 x3_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[1]);
1099
1100 /* Store Horz transform output into temp */
1101 vst1_s16(pi2_tmp, x0_16x4);
1102 vst1_s16(pi2_tmp + 4, x1_16x4);
1103 vst1_s16(pi2_tmp + 8, x2_16x4);
1104 vst1_s16(pi2_tmp + 12, x3_16x4);
1105
1106 /* vertical inverse transform */
1107 rq1_16x4 = vshr_n_s16(x1_16x4, 1);
1108 rq3_16x4 = vshr_n_s16(x3_16x4, 1);
1109
1110 xx0_16x4 = vadd_s16(x0_16x4, x2_16x4);
1111 xx1_16x4 = vsub_s16(x0_16x4, x2_16x4);
1112 xx2_16x4 = vsub_s16(rq1_16x4, x3_16x4);
1113 xx3_16x4 = vadd_s16(x1_16x4, rq3_16x4);
1114
1115 x0_16x4 = vadd_s16(xx0_16x4, xx3_16x4);
1116 x1_16x4 = vadd_s16(xx1_16x4, xx2_16x4);
1117 x2_16x4 = vsub_s16(xx1_16x4, xx2_16x4);
1118 x3_16x4 = vsub_s16(xx0_16x4, xx3_16x4);
1119
1120 x0_16x4 = vrshr_n_s16(x0_16x4, 6);
1121 x1_16x4 = vrshr_n_s16(x1_16x4, 6);
1122 x2_16x4 = vrshr_n_s16(x2_16x4, 6);
1123 x3_16x4 = vrshr_n_s16(x3_16x4, 6);
1124
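/* Load interleaved residual prediction rows; only alternate lanes are kept by the mask below */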
1125 resd0_in = vld1q_s16((int16_t *) pi2_res_pred);
1126 resd1_in = vld1q_s16((int16_t *) pi2_res_pred + i4_res_pred_stride);
1127 resd2_in = vld1q_s16((int16_t *) pi2_res_pred + (i4_res_pred_stride * 2));
1128 resd3_in = vld1q_s16((int16_t *) pi2_res_pred + (i4_res_pred_stride * 3));
1129
1130 /* Keep the residual prediction only in alternate lanes; the remaining lanes are zeroed */
1131 resd0_in_mask = vbslq_s16(chroma_mask_16x8, resd0_in, resd0_in_mask);
1132 resd1_in_mask = vbslq_s16(chroma_mask_16x8, resd1_in, resd1_in_mask);
1133 resd2_in_mask = vbslq_s16(chroma_mask_16x8, resd2_in, resd2_in_mask);
1134 resd3_in_mask = vbslq_s16(chroma_mask_16x8, resd3_in, resd3_in_mask);
1135
1136 x0_16x8 = vreinterpretq_s16_s32(vmovl_s16(x0_16x4));
1137 x1_16x8 = vreinterpretq_s16_s32(vmovl_s16(x1_16x4));
1138 x2_16x8 = vreinterpretq_s16_s32(vmovl_s16(x2_16x4));
1139 x3_16x8 = vreinterpretq_s16_s32(vmovl_s16(x3_16x4));
1140
1141 resd0_in = vaddq_s16(resd0_in_mask, x0_16x8);
1142 resd1_in = vaddq_s16(resd1_in_mask, x1_16x8);
1143 resd2_in = vaddq_s16(resd2_in_mask, x2_16x8);
1144 resd3_in = vaddq_s16(resd3_in_mask, x3_16x8);
1145
1146 /* Saturate all values < -255 to -255 and retain the rest as it is */
1147 resd0_in = vmaxq_s16(resd0_in, neg_255);
1148 resd1_in = vmaxq_s16(resd1_in, neg_255);
1149 resd2_in = vmaxq_s16(resd2_in, neg_255);
1150 resd3_in = vmaxq_s16(resd3_in, neg_255);
1151
1152 /* Saturate all values > 255 to 255 and retain the rest as it is */
1153 resd0_in = vminq_s16(resd0_in, pos_255);
1154 resd1_in = vminq_s16(resd1_in, pos_255);
1155 resd2_in = vminq_s16(resd2_in, pos_255);
1156 resd3_in = vminq_s16(resd3_in, pos_255);
1157
1158 resout0 = vld1q_s16(pi2_res);
1159 resout1 = vld1q_s16(pi2_res + i4_res_stride);
1160 resout2 = vld1q_s16(pi2_res + i4_res_stride * 2);
1161 resout3 = vld1q_s16(pi2_res + i4_res_stride * 3);
1162
1163 /* Store res in alternate positions */
1164 resout0 = vbslq_s16(chroma_mask_16x8, resd0_in, resout0);
1165 resout1 = vbslq_s16(chroma_mask_16x8, resd1_in, resout1);
1166 resout2 = vbslq_s16(chroma_mask_16x8, resd2_in, resout2);
1167 resout3 = vbslq_s16(chroma_mask_16x8, resd3_in, resout3);
1168
1169 vst1q_s16(pi2_res, resout0);
1170 vst1q_s16(pi2_res + i4_res_stride, resout1);
1171 vst1q_s16(pi2_res + (i4_res_stride << 1), resout2);
1172 vst1q_s16(pi2_res + (i4_res_stride << 1) + i4_res_stride, resout3);
1173
1174 pred0_in = vld1_u8((uint8_t *) pu1_pred);
1175 pred1_in = vld1_u8((uint8_t *) pu1_pred + (i4_pred_stride));
1176 pred2_in = vld1_u8((uint8_t *) pu1_pred + (i4_pred_stride << 1));
1177 pred3_in = vld1_u8((uint8_t *) pu1_pred + (i4_pred_stride * 3));
1178
1179 pred0 = vreinterpretq_s16_u16(vmovl_u8(pred0_in));
1180 pred1 = vreinterpretq_s16_u16(vmovl_u8(pred1_in));
1181 pred2 = vreinterpretq_s16_u16(vmovl_u8(pred2_in));
1182 pred3 = vreinterpretq_s16_u16(vmovl_u8(pred3_in));
1183
1184 /* Out pixel = pred + res */
1185 rec0 = vaddq_s16(pred0, resout0);
1186 rec1 = vaddq_s16(pred1, resout1);
1187 rec2 = vaddq_s16(pred2, resout2);
1188 rec3 = vaddq_s16(pred3, resout3);
1189
1190 out0 = vld1_u8(pu1_out);
1191 out1 = vld1_u8(pu1_out + i4_out_stride);
1192 out2 = vld1_u8(pu1_out + i4_out_stride * 2);
1193 out3 = vld1_u8(pu1_out + i4_out_stride * 3);
1194
1195 /* Convert to 8 bit unsigned with saturation */
1196 rec0_un = vqmovun_s16(rec0);
1197 rec1_un = vqmovun_s16(rec1);
1198 rec2_un = vqmovun_s16(rec2);
1199 rec3_un = vqmovun_s16(rec3);
1200
1201 /* Store output pixels in alternate positions */
1202 out0 = vbsl_u8(chroma_mask_8x8, rec0_un, out0);
1203 out1 = vbsl_u8(chroma_mask_8x8, rec1_un, out1);
1204 out2 = vbsl_u8(chroma_mask_8x8, rec2_un, out2);
1205 out3 = vbsl_u8(chroma_mask_8x8, rec3_un, out3);
1206
1207 vst1_u8((pu1_out), out0);
1208 vst1_u8((pu1_out + i4_out_stride), out1);
1209 vst1_u8((pu1_out + (i4_out_stride << 1)), out2);
1210 vst1_u8((pu1_out + ((i4_out_stride << 1) + i4_out_stride)), out3);
1211 }
1212
1213 void isvc_iquant_itrans_recon_4x4_dc_neon(buffer_container_t *ps_src, buffer_container_t *ps_pred,
1214 buffer_container_t *ps_res_pred,
1215 buffer_container_t *ps_res, buffer_container_t *ps_rec,
1216 iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants,
1217 WORD16 *pi2_tmp, WORD16 *pi2_dc_src,
1218 WORD32 i4_iq_start_idx, UWORD8 u1_res_accumulate)
1219 {
1220 WORD16 *pi2_src = (WORD16 *) ps_src->pv_data;
1221 UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
1222 UWORD8 *pu1_out = (UWORD8 *) ps_rec->pv_data;
1223 WORD32 i4_pred_stride = ps_pred->i4_data_stride;
1224 WORD32 i4_out_stride = ps_rec->i4_data_stride;
1225 const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat;
1226 const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat;
1227 UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6;
1228 WORD16 rnd_fact = (u4_qp_div_6 < 4) ? 1 << (3 - u4_qp_div_6) : 0;
1229
1230 WORD32 i4_iq_out_temp;
1231 int16x8_t temp_0;
1232 uint8x8_t pred0_in, pred1_in, pred2_in, pred3_in;
1233 int16x8_t pred0, pred1, pred2, pred3;
1234
1235 UNUSED(pi2_tmp);
1236 UNUSED(ps_res);
1237 UNUSED(ps_res_pred);
1238 UNUSED(u1_res_accumulate);
1239
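/* DC-only block: dequantize the single DC coefficient, or take it from pi2_dc_src when the DC was handled separately (i4_iq_start_idx == 1) */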
1240 if(i4_iq_start_idx == 0)
1241 {
1242 i4_iq_out_temp = pi2_src[0];
1243 INV_QUANT(i4_iq_out_temp, pu2_iscal_mat[0], pu2_weigh_mat[0], u4_qp_div_6, rnd_fact, 4);
1244 }
1245 else
1246 {
1247 i4_iq_out_temp = pi2_dc_src[0];
1248 }
1249
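/* With only the DC term, the inverse transform reduces to broadcasting (dc + 32) >> 6 */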
1250 temp_0 = vdupq_n_s16((i4_iq_out_temp + 32) >> 6);
1251
1252 pred0_in = vld1_u8(pu1_pred);
1253 pu1_pred = pu1_pred + i4_pred_stride;
1254 pred1_in = vld1_u8(pu1_pred);
1255 pu1_pred = pu1_pred + i4_pred_stride;
1256 pred2_in = vld1_u8(pu1_pred);
1257 pu1_pred = pu1_pred + i4_pred_stride;
1258 pred3_in = vld1_u8(pu1_pred);
1259
1260 pred0 = vreinterpretq_s16_u16(vmovl_u8(pred0_in));
1261 pred1 = vreinterpretq_s16_u16(vmovl_u8(pred1_in));
1262 pred2 = vreinterpretq_s16_u16(vmovl_u8(pred2_in));
1263 pred3 = vreinterpretq_s16_u16(vmovl_u8(pred3_in));
1264
1265 /* Out pixel = Res + pred */
1266 pred0 = vaddq_s16(pred0, temp_0);
1267 pred1 = vaddq_s16(pred1, temp_0);
1268 pred2 = vaddq_s16(pred2, temp_0);
1269 pred3 = vaddq_s16(pred3, temp_0);
1270
1271 /* Convert to unsigned 8 bit with saturation */
1272 pred0_in = vqmovun_s16(pred0);
1273 pred1_in = vqmovun_s16(pred1);
1274 pred2_in = vqmovun_s16(pred2);
1275 pred3_in = vqmovun_s16(pred3);
1276
1277 vst1_lane_u32((uint32_t *) (pu1_out), vreinterpret_u32_u8(pred0_in), 0);
1278 vst1_lane_u32((uint32_t *) (pu1_out + i4_out_stride), vreinterpret_u32_u8(pred1_in), 0);
1279 vst1_lane_u32((uint32_t *) (pu1_out + i4_out_stride * 2), vreinterpret_u32_u8(pred2_in), 0);
1280 vst1_lane_u32((uint32_t *) (pu1_out + i4_out_stride * 3), vreinterpret_u32_u8(pred3_in), 0);
1281 }
1282
1283 void isvc_iquant_itrans_recon_4x4_dc_with_res_output_neon(
1284 buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_res_pred,
1285 buffer_container_t *ps_res, buffer_container_t *ps_rec,
1286 iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants, WORD16 *pi2_tmp, WORD16 *pi2_dc_src,
1287 WORD32 i4_iq_start_idx, UWORD8 u1_res_accumulate)
1288 {
1289 WORD16 *pi2_src = (WORD16 *) ps_src->pv_data;
1290 WORD16 *pi2_res = (WORD16 *) ps_res->pv_data;
1291 UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
1292 UWORD8 *pu1_out = (UWORD8 *) ps_rec->pv_data;
1293 WORD32 i4_res_stride = ps_res->i4_data_stride;
1294 WORD32 i4_pred_stride = ps_pred->i4_data_stride;
1295 WORD32 i4_out_stride = ps_rec->i4_data_stride;
1296 const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat;
1297 const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat;
1298 UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6;
1299 WORD16 rnd_fact = (u4_qp_div_6 < 4) ? 1 << (3 - u4_qp_div_6) : 0;
1300
1301 WORD16 i2_it_out;
1302 WORD32 i4_iq_out_temp;
1303 int16x8_t temp_0;
1304 int16x4_t residue_res;
1305 uint8x8_t pred0_in, pred1_in, pred2_in, pred3_in;
1306 int16x8_t pred0, pred1, pred2, pred3;
1307
1308 UNUSED(pi2_tmp);
1309 UNUSED(ps_res_pred);
1310 UNUSED(u1_res_accumulate);
1311
1312 if(i4_iq_start_idx == 0)
1313 {
1314 i4_iq_out_temp = pi2_src[0];
1315 INV_QUANT(i4_iq_out_temp, pu2_iscal_mat[0], pu2_weigh_mat[0], u4_qp_div_6, rnd_fact, 4);
1316 }
1317 else
1318 {
1319 i4_iq_out_temp = pi2_dc_src[0];
1320 }
1321
1322 i2_it_out = ((i4_iq_out_temp + 32) >> 6);
1323 temp_0 = vdupq_n_s16(i2_it_out);
1324 residue_res = vdup_n_s16(isvc_get_residue(i2_it_out, 0, 0));
1325
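/* DC-only block: the same residual value is written to every sample of the 4x4 residual buffer */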
1326 vst1_s16(pi2_res, residue_res);
1327 vst1_s16(pi2_res + i4_res_stride, residue_res);
1328 vst1_s16(pi2_res + (i4_res_stride << 1), residue_res);
1329 vst1_s16(pi2_res + (i4_res_stride << 1) + i4_res_stride, residue_res);
1330
1331 pred0_in = vld1_u8(pu1_pred);
1332 pu1_pred = pu1_pred + i4_pred_stride;
1333 pred1_in = vld1_u8(pu1_pred);
1334 pu1_pred = pu1_pred + i4_pred_stride;
1335 pred2_in = vld1_u8(pu1_pred);
1336 pu1_pred = pu1_pred + i4_pred_stride;
1337 pred3_in = vld1_u8(pu1_pred);
1338
1339 pred0 = vreinterpretq_s16_u16(vmovl_u8(pred0_in));
1340 pred1 = vreinterpretq_s16_u16(vmovl_u8(pred1_in));
1341 pred2 = vreinterpretq_s16_u16(vmovl_u8(pred2_in));
1342 pred3 = vreinterpretq_s16_u16(vmovl_u8(pred3_in));
1343
1344 /* Out pixel = Res + pred */
1345 pred0 = vaddq_s16(pred0, temp_0);
1346 pred1 = vaddq_s16(pred1, temp_0);
1347 pred2 = vaddq_s16(pred2, temp_0);
1348 pred3 = vaddq_s16(pred3, temp_0);
1349
1350 /* Convert to unsigned 8 bit with saturation */
1351 pred0_in = vqmovun_s16(pred0);
1352 pred1_in = vqmovun_s16(pred1);
1353 pred2_in = vqmovun_s16(pred2);
1354 pred3_in = vqmovun_s16(pred3);
1355
1356 vst1_lane_u32((uint32_t *) (pu1_out), vreinterpret_u32_u8(pred0_in), 0);
1357 vst1_lane_u32((uint32_t *) (pu1_out + i4_out_stride), vreinterpret_u32_u8(pred1_in), 0);
1358 vst1_lane_u32((uint32_t *) (pu1_out + i4_out_stride * 2), vreinterpret_u32_u8(pred2_in), 0);
1359 vst1_lane_u32((uint32_t *) (pu1_out + i4_out_stride * 3), vreinterpret_u32_u8(pred3_in), 0);
1360 }
1361
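/**
 *******************************************************************************
 *
 * @brief
 *  DC-only inverse quant, inverse transform and reconstruction for a 4x4 luma
 *  block with residue accumulation: the residual prediction from ps_res_pred
 *  is added to the DC residue, the sum is clamped to [-255, 255] and written
 *  to ps_res, and the clamped residue is added to the prediction with
 *  unsigned 8-bit saturation to form the output.
 *
 *******************************************************************************
 */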
1362 void isvc_iquant_itrans_recon_4x4_dc_with_res_accumulate_neon(
1363 buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_res_pred,
1364 buffer_container_t *ps_res, buffer_container_t *ps_rec,
1365 iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants, WORD16 *pi2_tmp, WORD16 *pi2_dc_src,
1366 WORD32 i4_iq_start_idx, UWORD8 u1_res_accumulate)
1367 {
1368 WORD16 *pi2_src = (WORD16 *) ps_src->pv_data;
1369 WORD16 *pi2_res = (WORD16 *) ps_res->pv_data;
1370 WORD16 *pi2_res_pred = (WORD16 *) ps_res_pred->pv_data;
1371 UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
1372 UWORD8 *pu1_out = (UWORD8 *) ps_rec->pv_data;
1373 WORD32 i4_res_stride = ps_res->i4_data_stride;
1374 WORD32 i4_res_pred_stride = ps_res_pred->i4_data_stride;
1375 WORD32 i4_pred_stride = ps_pred->i4_data_stride;
1376 WORD32 i4_out_stride = ps_rec->i4_data_stride;
1377 const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat;
1378 const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat;
1379 UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6;
1380 WORD16 rnd_fact = (u4_qp_div_6 < 4) ? 1 << (3 - u4_qp_div_6) : 0;
1381
1382 WORD32 i4_iq_out_temp;
1383 int16x4_t temp_0;
1384 uint8x8_t pred0_in, pred1_in, pred2_in, pred3_in;
1385 int16x8_t pred0, pred1, pred2, pred3;
1386 int16x8_t pred01_in, pred23_in;
1387 uint8x8_t pred01_un, pred23_un;
1388
1389 int16x4_t resd0_in, resd1_in, resd2_in, resd3_in;
1390 int16x8_t resd01_in, resd23_in;
1391 int16x4_t pos_255 = vdup_n_s16(((WORD16) UINT8_MAX));
1392 int16x4_t neg_255 = vdup_n_s16(-((WORD16) UINT8_MAX));
1393
1394 UNUSED(pi2_tmp);
1395 UNUSED(u1_res_accumulate);
1396
1397 if(i4_iq_start_idx == 0)
1398 {
1399 i4_iq_out_temp = pi2_src[0];
1400 INV_QUANT(i4_iq_out_temp, pu2_iscal_mat[0], pu2_weigh_mat[0], u4_qp_div_6, rnd_fact, 4);
1401 }
1402 else
1403 {
1404 i4_iq_out_temp = pi2_dc_src[0];
1405 }
1406
1407 temp_0 = vdup_n_s16((i4_iq_out_temp + 32) >> 6);
1408
1409 resd0_in = vld1_s16((int16_t *) pi2_res_pred);
1410 resd1_in = vld1_s16((int16_t *) pi2_res_pred + i4_res_pred_stride);
1411 resd2_in = vld1_s16((int16_t *) pi2_res_pred + (i4_res_pred_stride * 2));
1412 resd3_in = vld1_s16((int16_t *) pi2_res_pred + (i4_res_pred_stride * 3));
1413
1414 /* Add res pred to the res obtained */
1415 resd0_in = vadd_s16(resd0_in, temp_0);
1416 resd1_in = vadd_s16(resd1_in, temp_0);
1417 resd2_in = vadd_s16(resd2_in, temp_0);
1418 resd3_in = vadd_s16(resd3_in, temp_0);
1419
1420 /* Saturate all values < -255 to -255 and retain the rest as it is */
1421 resd0_in = vmax_s16(resd0_in, neg_255);
1422 resd1_in = vmax_s16(resd1_in, neg_255);
1423 resd2_in = vmax_s16(resd2_in, neg_255);
1424 resd3_in = vmax_s16(resd3_in, neg_255);
1425
1426 /* Saturate all values > 255 to 255 and retain the rest as it is */
1427 resd0_in = vmin_s16(resd0_in, pos_255);
1428 resd1_in = vmin_s16(resd1_in, pos_255);
1429 resd2_in = vmin_s16(resd2_in, pos_255);
1430 resd3_in = vmin_s16(resd3_in, pos_255);
1431
1432 vst1_s16(pi2_res, resd0_in);
1433 vst1_s16(pi2_res + i4_res_stride, resd1_in);
1434 vst1_s16(pi2_res + (i4_res_stride << 1), resd2_in);
1435 vst1_s16(pi2_res + (i4_res_stride << 1) + i4_res_stride, resd3_in);
1436
1437 resd01_in = vcombine_s16(resd0_in, resd1_in);
1438 resd23_in = vcombine_s16(resd2_in, resd3_in);
1439
1440 pred0_in = vld1_u8(pu1_pred);
1441 pu1_pred = pu1_pred + i4_pred_stride;
1442 pred1_in = vld1_u8(pu1_pred);
1443 pu1_pred = pu1_pred + i4_pred_stride;
1444 pred2_in = vld1_u8(pu1_pred);
1445 pu1_pred = pu1_pred + i4_pred_stride;
1446 pred3_in = vld1_u8(pu1_pred);
1447
1448 pred0 = vreinterpretq_s16_u16(vmovl_u8(pred0_in));
1449 pred1 = vreinterpretq_s16_u16(vmovl_u8(pred1_in));
1450 pred2 = vreinterpretq_s16_u16(vmovl_u8(pred2_in));
1451 pred3 = vreinterpretq_s16_u16(vmovl_u8(pred3_in));
1452
1453 pred01_in = vcombine_s16(vget_low_s16(pred0), vget_low_s16(pred1));
1454 pred23_in = vcombine_s16(vget_low_s16(pred2), vget_low_s16(pred3));
1455
1456 /* Out pixel = Res + pred */
1457 pred01_in = vaddq_s16(pred01_in, resd01_in);
1458 pred23_in = vaddq_s16(pred23_in, resd23_in);
1459
1460 /* Convert to unsigned 8 bit with saturation */
1461 pred01_un = vqmovun_s16(pred01_in);
1462 pred23_un = vqmovun_s16(pred23_in);
1463
1464 vst1_lane_u32((uint32_t *) (pu1_out), vreinterpret_u32_u8(pred01_un), 0);
1465 vst1_lane_u32((uint32_t *) (pu1_out + i4_out_stride), vreinterpret_u32_u8(pred01_un), 1);
1466 vst1_lane_u32((uint32_t *) (pu1_out + (i4_out_stride << 1)), vreinterpret_u32_u8(pred23_un), 0);
1467 vst1_lane_u32((uint32_t *) (pu1_out + ((i4_out_stride << 1) + i4_out_stride)),
1468 vreinterpret_u32_u8(pred23_un), 1);
1469 }
1470
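/**
 *******************************************************************************
 *
 * @brief
 *  DC-only inverse transform and reconstruction for a 4x4 chroma block. The
 *  DC value is read from pi2_dc_src, added to the prediction with unsigned
 *  8-bit saturation, and the result is merged into alternate byte positions
 *  of the interleaved Cb/Cr output rows.
 *
 *******************************************************************************
 */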
1471 void isvc_iquant_itrans_recon_chroma_4x4_dc_neon(
1472 buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_res_pred,
1473 buffer_container_t *ps_res, buffer_container_t *ps_rec,
1474 iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants, WORD16 *pi2_tmp, WORD16 *pi2_dc_src,
1475 WORD32 i4_iq_start_idx, UWORD8 u1_res_accumulate)
1476 {
1477 WORD16 *pi2_src = (WORD16 *) ps_src->pv_data;
1478 UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
1479 UWORD8 *pu1_out = (UWORD8 *) ps_rec->pv_data;
1480 WORD32 i4_pred_stride = ps_pred->i4_data_stride;
1481 WORD32 i4_out_stride = ps_rec->i4_data_stride;
1482 const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat;
1483 const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat;
1484 UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6;
1485
1486 WORD32 i4_iq_out_temp;
1487 int16x8_t temp_0;
1488 uint8x8_t pred0_in, pred1_in, pred2_in, pred3_in;
1489 int16x8_t pred0, pred1, pred2, pred3;
1490 uint8x8_t i4_out_horz_8x8_r0, i4_out_horz_8x8_r1, i4_out_horz_8x8_r2, i4_out_horz_8x8_r3;
1491 uint8x8_t chroma_mask_8x8 = vreinterpret_u8_u16(vdup_n_u16(0x00ff));
1492
1493 UNUSED(pi2_src);
1494 UNUSED(pu2_iscal_mat);
1495 UNUSED(pu2_weigh_mat);
1496 UNUSED(u4_qp_div_6);
1497 UNUSED(pi2_tmp);
1498 UNUSED(i4_iq_start_idx);
1499 UNUSED(ps_res);
1500 UNUSED(ps_res_pred);
1501 UNUSED(u1_res_accumulate);
1502
1503 i4_iq_out_temp = pi2_dc_src[0];
1504 temp_0 = vdupq_n_s16((i4_iq_out_temp + 32) >> 6);
1505
1506 pred0_in = vld1_u8(pu1_pred);
1507 pu1_pred = pu1_pred + i4_pred_stride;
1508 pred1_in = vld1_u8(pu1_pred);
1509 pu1_pred = pu1_pred + i4_pred_stride;
1510 pred2_in = vld1_u8(pu1_pred);
1511 pu1_pred = pu1_pred + i4_pred_stride;
1512 pred3_in = vld1_u8(pu1_pred);
1513
1514 pred0 = vreinterpretq_s16_u16(vmovl_u8(pred0_in));
1515 pred1 = vreinterpretq_s16_u16(vmovl_u8(pred1_in));
1516 pred2 = vreinterpretq_s16_u16(vmovl_u8(pred2_in));
1517 pred3 = vreinterpretq_s16_u16(vmovl_u8(pred3_in));
1518
1519 /* Out pixel = Res + pred */
1520 pred0 = vaddq_s16(pred0, temp_0);
1521 pred1 = vaddq_s16(pred1, temp_0);
1522 pred2 = vaddq_s16(pred2, temp_0);
1523 pred3 = vaddq_s16(pred3, temp_0);
1524
1525 /* Convert to unsigned 8 bit with saturation */
1526 pred0_in = vqmovun_s16(pred0);
1527 pred1_in = vqmovun_s16(pred1);
1528 pred2_in = vqmovun_s16(pred2);
1529 pred3_in = vqmovun_s16(pred3);
1530
1531 i4_out_horz_8x8_r0 = vld1_u8(pu1_out);
1532 i4_out_horz_8x8_r1 = vld1_u8(pu1_out + i4_out_stride);
1533 i4_out_horz_8x8_r2 = vld1_u8(pu1_out + i4_out_stride * 2);
1534 i4_out_horz_8x8_r3 = vld1_u8(pu1_out + i4_out_stride * 3);
1535
1536 /* Store out pixels in alternate positions */
1537 i4_out_horz_8x8_r0 = vbsl_u8(chroma_mask_8x8, pred0_in, i4_out_horz_8x8_r0);
1538 i4_out_horz_8x8_r1 = vbsl_u8(chroma_mask_8x8, pred1_in, i4_out_horz_8x8_r1);
1539 i4_out_horz_8x8_r2 = vbsl_u8(chroma_mask_8x8, pred2_in, i4_out_horz_8x8_r2);
1540 i4_out_horz_8x8_r3 = vbsl_u8(chroma_mask_8x8, pred3_in, i4_out_horz_8x8_r3);
1541
1542 vst1_u8((uint8_t *) (pu1_out), i4_out_horz_8x8_r0);
1543 vst1_u8((uint8_t *) (pu1_out + i4_out_stride), i4_out_horz_8x8_r1);
1544 vst1_u8((uint8_t *) (pu1_out + i4_out_stride * 2), i4_out_horz_8x8_r2);
1545 vst1_u8((uint8_t *) (pu1_out + i4_out_stride * 3), i4_out_horz_8x8_r3);
1546 }
1547
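/**
 *******************************************************************************
 *
 * @brief
 *  DC-only inverse transform and reconstruction for a 4x4 chroma block, with
 *  the residue also written out: the constant residue is merged into
 *  alternate 16-bit positions of the interleaved residue rows in ps_res, and
 *  the reconstructed samples are merged into alternate byte positions of the
 *  interleaved output rows.
 *
 *******************************************************************************
 */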
1548 void isvc_iquant_itrans_recon_chroma_4x4_dc_with_res_output_neon(
1549 buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_res_pred,
1550 buffer_container_t *ps_res, buffer_container_t *ps_rec,
1551 iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants, WORD16 *pi2_tmp, WORD16 *pi2_dc_src,
1552 WORD32 i4_iq_start_idx, UWORD8 u1_res_accumulate)
1553 {
1554 WORD16 *pi2_src = (WORD16 *) ps_src->pv_data;
1555 WORD16 *pi2_res = (WORD16 *) ps_res->pv_data;
1556 UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
1557 UWORD8 *pu1_out = (UWORD8 *) ps_rec->pv_data;
1558 WORD32 i4_res_stride = ps_res->i4_data_stride;
1559 WORD32 i4_pred_stride = ps_pred->i4_data_stride;
1560 WORD32 i4_out_stride = ps_rec->i4_data_stride;
1561 const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat;
1562 const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat;
1563 UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6;
1564
1565 WORD16 i2_it_out;
1566 WORD32 i4_iq_out_temp;
1567 int16x8_t temp_0, residue_res;
1568 uint8x8_t pred0_in, pred1_in, pred2_in, pred3_in;
1569 int16x8_t pred0, pred1, pred2, pred3;
1570 int16x8_t resout0, resout1, resout2, resout3;
1571
1572 uint8x8_t i4_out_horz_8x8_r0, i4_out_horz_8x8_r1, i4_out_horz_8x8_r2, i4_out_horz_8x8_r3;
1573 uint8x8_t chroma_mask_8x8 = vreinterpret_u8_u16(vdup_n_u16(0x00ff));
1574 uint16x8_t chroma_mask_16x8 = {0xffff, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000};
1575
1576 UNUSED(pi2_src);
1577 UNUSED(pu2_iscal_mat);
1578 UNUSED(pu2_weigh_mat);
1579 UNUSED(u4_qp_div_6);
1580 UNUSED(pi2_tmp);
1581 UNUSED(i4_iq_start_idx);
1582 UNUSED(ps_res_pred);
1583 UNUSED(u1_res_accumulate);
1584
1585 i4_iq_out_temp = pi2_dc_src[0];
1586
1587 i2_it_out = ((i4_iq_out_temp + 32) >> 6);
1588 temp_0 = vdupq_n_s16(i2_it_out);
1589 residue_res = vdupq_n_s16(isvc_get_residue(i2_it_out, 0, 0));
1590
1591 resout0 = vld1q_s16(pi2_res);
1592 resout1 = vld1q_s16(pi2_res + i4_res_stride);
1593 resout2 = vld1q_s16(pi2_res + i4_res_stride * 2);
1594 resout3 = vld1q_s16(pi2_res + i4_res_stride * 3);
1595
1596 /* Store res in alternate positions */
1597 resout0 = vbslq_s16(chroma_mask_16x8, residue_res, resout0);
1598 resout1 = vbslq_s16(chroma_mask_16x8, residue_res, resout1);
1599 resout2 = vbslq_s16(chroma_mask_16x8, residue_res, resout2);
1600 resout3 = vbslq_s16(chroma_mask_16x8, residue_res, resout3);
1601
1602 vst1q_s16(pi2_res, resout0);
1603 vst1q_s16(pi2_res + i4_res_stride, resout1);
1604 vst1q_s16(pi2_res + (i4_res_stride << 1), resout2);
1605 vst1q_s16(pi2_res + (i4_res_stride << 1) + i4_res_stride, resout3);
1606
1607 pred0_in = vld1_u8(pu1_pred);
1608 pu1_pred = pu1_pred + i4_pred_stride;
1609 pred1_in = vld1_u8(pu1_pred);
1610 pu1_pred = pu1_pred + i4_pred_stride;
1611 pred2_in = vld1_u8(pu1_pred);
1612 pu1_pred = pu1_pred + i4_pred_stride;
1613 pred3_in = vld1_u8(pu1_pred);
1614
1615 pred0 = vreinterpretq_s16_u16(vmovl_u8(pred0_in));
1616 pred1 = vreinterpretq_s16_u16(vmovl_u8(pred1_in));
1617 pred2 = vreinterpretq_s16_u16(vmovl_u8(pred2_in));
1618 pred3 = vreinterpretq_s16_u16(vmovl_u8(pred3_in));
1619
1620 /* Out pixel = Res + pred */
1621 pred0 = vaddq_s16(pred0, temp_0);
1622 pred1 = vaddq_s16(pred1, temp_0);
1623 pred2 = vaddq_s16(pred2, temp_0);
1624 pred3 = vaddq_s16(pred3, temp_0);
1625
1626 /* Convert to unsigned 8 bit with saturation */
1627 pred0_in = vqmovun_s16(pred0);
1628 pred1_in = vqmovun_s16(pred1);
1629 pred2_in = vqmovun_s16(pred2);
1630 pred3_in = vqmovun_s16(pred3);
1631
1632 /* Store out pixels in alternate positions */
1633 i4_out_horz_8x8_r0 = vld1_u8(pu1_out);
1634 i4_out_horz_8x8_r1 = vld1_u8(pu1_out + i4_out_stride);
1635 i4_out_horz_8x8_r2 = vld1_u8(pu1_out + i4_out_stride * 2);
1636 i4_out_horz_8x8_r3 = vld1_u8(pu1_out + i4_out_stride * 3);
1637
1638 i4_out_horz_8x8_r0 = vbsl_u8(chroma_mask_8x8, pred0_in, i4_out_horz_8x8_r0);
1639 i4_out_horz_8x8_r1 = vbsl_u8(chroma_mask_8x8, pred1_in, i4_out_horz_8x8_r1);
1640 i4_out_horz_8x8_r2 = vbsl_u8(chroma_mask_8x8, pred2_in, i4_out_horz_8x8_r2);
1641 i4_out_horz_8x8_r3 = vbsl_u8(chroma_mask_8x8, pred3_in, i4_out_horz_8x8_r3);
1642
1643 vst1_u8((uint8_t *) (pu1_out), i4_out_horz_8x8_r0);
1644 vst1_u8((uint8_t *) (pu1_out + i4_out_stride), i4_out_horz_8x8_r1);
1645 vst1_u8((uint8_t *) (pu1_out + i4_out_stride * 2), i4_out_horz_8x8_r2);
1646 vst1_u8((uint8_t *) (pu1_out + i4_out_stride * 3), i4_out_horz_8x8_r3);
1647 }
1648
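/**
 *******************************************************************************
 *
 * @brief
 *  DC-only inverse transform and reconstruction for a 4x4 chroma block with
 *  residue accumulation: the residual prediction is masked to alternate
 *  16-bit positions, added to the DC residue, clamped to [-255, 255] and
 *  merged into the interleaved residue rows of ps_res; the merged residue is
 *  then added to the prediction with unsigned 8-bit saturation and written to
 *  alternate byte positions of the interleaved output rows.
 *
 *******************************************************************************
 */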
1649 void isvc_iquant_itrans_recon_chroma_4x4_dc_with_res_accumulate_neon(
1650 buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_res_pred,
1651 buffer_container_t *ps_res, buffer_container_t *ps_rec,
1652 iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants, WORD16 *pi2_tmp, WORD16 *pi2_dc_src,
1653 WORD32 i4_iq_start_idx, UWORD8 u1_res_accumulate)
1654 {
1655 WORD16 *pi2_src = (WORD16 *) ps_src->pv_data;
1656 WORD16 *pi2_res = (WORD16 *) ps_res->pv_data;
1657 WORD16 *pi2_res_pred = (WORD16 *) ps_res_pred->pv_data;
1658 UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
1659 UWORD8 *pu1_out = (UWORD8 *) ps_rec->pv_data;
1660 WORD32 i4_res_stride = ps_res->i4_data_stride;
1661 WORD32 i4_res_pred_stride = ps_res_pred->i4_data_stride;
1662 WORD32 i4_pred_stride = ps_pred->i4_data_stride;
1663 WORD32 i4_out_stride = ps_rec->i4_data_stride;
1664 const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat;
1665 const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat;
1666 UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6;
1667
1668 WORD32 i4_iq_out_temp;
1669 int16x8_t temp_0;
1670 uint8x8_t pred0_in, pred1_in, pred2_in, pred3_in;
1671 int16x8_t pred0, pred1, pred2, pred3;
1672 int16x8_t resd0_in, resd1_in, resd2_in, resd3_in;
1673 int16x8_t resout0, resout1, resout2, resout3;
1674 int16x8_t resd1_in_mask, resd2_in_mask, resd3_in_mask;
1675 uint8x8_t out0, out1, out2, out3;
1676 int16x8_t pos_255 = vdupq_n_s16(((WORD16) UINT8_MAX));
1677 int16x8_t neg_255 = vdupq_n_s16(-((WORD16) UINT8_MAX));
1678 uint8x8_t chroma_mask_8x8 = vreinterpret_u8_u16(vdup_n_u16(0x00ff));
1679 uint16x8_t chroma_mask_16x8 = {0xffff, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000};
1680
1681 int16x8_t resd0_in_mask = {0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000};
1682
1683 UNUSED(pi2_src);
1684 UNUSED(pu2_iscal_mat);
1685 UNUSED(pu2_weigh_mat);
1686 UNUSED(u4_qp_div_6);
1687 UNUSED(pi2_tmp);
1688 UNUSED(i4_iq_start_idx);
1689 UNUSED(u1_res_accumulate);
1690
1691 resd1_in_mask = resd0_in_mask;
1692 resd2_in_mask = resd0_in_mask;
1693 resd3_in_mask = resd0_in_mask;
1694
1695 i4_iq_out_temp = pi2_dc_src[0];
1696 temp_0 = vdupq_n_s16((i4_iq_out_temp + 32) >> 6);
1697
1698 resd0_in = vld1q_s16((int16_t *) pi2_res_pred);
1699 resd1_in = vld1q_s16((int16_t *) pi2_res_pred + i4_res_pred_stride);
1700 resd2_in = vld1q_s16((int16_t *) pi2_res_pred + (i4_res_pred_stride * 2));
1701 resd3_in = vld1q_s16((int16_t *) pi2_res_pred + (i4_res_pred_stride * 3));
1702
1703 /* Mask alternate values of res pred */
1704 resd0_in_mask = vbslq_s16(chroma_mask_16x8, resd0_in, resd0_in_mask);
1705 resd1_in_mask = vbslq_s16(chroma_mask_16x8, resd1_in, resd1_in_mask);
1706 resd2_in_mask = vbslq_s16(chroma_mask_16x8, resd2_in, resd2_in_mask);
1707 resd3_in_mask = vbslq_s16(chroma_mask_16x8, resd3_in, resd3_in_mask);
1708
1709 /* Add res pred to res obtained */
1710 resd0_in = vaddq_s16(resd0_in_mask, temp_0);
1711 resd1_in = vaddq_s16(resd1_in_mask, temp_0);
1712 resd2_in = vaddq_s16(resd2_in_mask, temp_0);
1713 resd3_in = vaddq_s16(resd3_in_mask, temp_0);
1714
1715 /* Saturate all values < -255 to -255 and retain the rest as it is */
1716 resd0_in = vmaxq_s16(resd0_in, neg_255);
1717 resd1_in = vmaxq_s16(resd1_in, neg_255);
1718 resd2_in = vmaxq_s16(resd2_in, neg_255);
1719 resd3_in = vmaxq_s16(resd3_in, neg_255);
1720
1721 /* Saturate all values > 255 to 255 and retain the rest as it is */
1722 resd0_in = vminq_s16(resd0_in, pos_255);
1723 resd1_in = vminq_s16(resd1_in, pos_255);
1724 resd2_in = vminq_s16(resd2_in, pos_255);
1725 resd3_in = vminq_s16(resd3_in, pos_255);
1726
1727 resout0 = vld1q_s16(pi2_res);
1728 resout1 = vld1q_s16(pi2_res + i4_res_stride);
1729 resout2 = vld1q_s16(pi2_res + i4_res_stride * 2);
1730 resout3 = vld1q_s16(pi2_res + i4_res_stride * 3);
1731
1732 /* Store res in alternate positions */
1733 resout0 = vbslq_s16(chroma_mask_16x8, resd0_in, resout0);
1734 resout1 = vbslq_s16(chroma_mask_16x8, resd1_in, resout1);
1735 resout2 = vbslq_s16(chroma_mask_16x8, resd2_in, resout2);
1736 resout3 = vbslq_s16(chroma_mask_16x8, resd3_in, resout3);
1737
1738 vst1q_s16(pi2_res, resout0);
1739 vst1q_s16(pi2_res + i4_res_stride, resout1);
1740 vst1q_s16(pi2_res + (i4_res_stride << 1), resout2);
1741 vst1q_s16(pi2_res + (i4_res_stride << 1) + i4_res_stride, resout3);
1742
1743 pred0_in = vld1_u8(pu1_pred);
1744 pu1_pred = pu1_pred + i4_pred_stride;
1745 pred1_in = vld1_u8(pu1_pred);
1746 pu1_pred = pu1_pred + i4_pred_stride;
1747 pred2_in = vld1_u8(pu1_pred);
1748 pu1_pred = pu1_pred + i4_pred_stride;
1749 pred3_in = vld1_u8(pu1_pred);
1750
1751 pred0 = vreinterpretq_s16_u16(vmovl_u8(pred0_in));
1752 pred1 = vreinterpretq_s16_u16(vmovl_u8(pred1_in));
1753 pred2 = vreinterpretq_s16_u16(vmovl_u8(pred2_in));
1754 pred3 = vreinterpretq_s16_u16(vmovl_u8(pred3_in));
1755
1756 /* Out pixel = Res + pred */
1757 pred0 = vaddq_s16(pred0, resout0);
1758 pred1 = vaddq_s16(pred1, resout1);
1759 pred2 = vaddq_s16(pred2, resout2);
1760 pred3 = vaddq_s16(pred3, resout3);
1761
1762 /* Convert to unsigned 8 bit with saturation */
1763 pred0_in = vqmovun_s16(pred0);
1764 pred1_in = vqmovun_s16(pred1);
1765 pred2_in = vqmovun_s16(pred2);
1766 pred3_in = vqmovun_s16(pred3);
1767
1768 out0 = vld1_u8(pu1_out);
1769 out1 = vld1_u8(pu1_out + i4_out_stride);
1770 out2 = vld1_u8(pu1_out + i4_out_stride * 2);
1771 out3 = vld1_u8(pu1_out + i4_out_stride * 3);
1772
1773 /* Store out pixels in alternate positions */
1774 out0 = vbsl_u8(chroma_mask_8x8, pred0_in, out0);
1775 out1 = vbsl_u8(chroma_mask_8x8, pred1_in, out1);
1776 out2 = vbsl_u8(chroma_mask_8x8, pred2_in, out2);
1777 out3 = vbsl_u8(chroma_mask_8x8, pred3_in, out3);
1778
1779 vst1_u8((uint8_t *) (pu1_out), out0);
1780 vst1_u8((uint8_t *) (pu1_out + i4_out_stride), out1);
1781 vst1_u8((uint8_t *) (pu1_out + i4_out_stride * 2), out2);
1782 vst1_u8((uint8_t *) (pu1_out + i4_out_stride * 3), out3);
1783 }
1784