/******************************************************************************
 *
 * Copyright (C) 2022 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 */
/**
 *******************************************************************************
 * @file
 *  isvc_resi_trans_quant_neon.c
 *
 * @brief
 *  neon variants of forward transform and quantization functions
 *
 *******************************************************************************
 */

#include <arm_neon.h>
#include <string.h>

#include "ih264_typedefs.h"
#include "ih264_debug.h"
#include "ih264_defs.h"
#include "ih264_trans_macros.h"
#include "ih264_macros.h"
#include "ih264_platform_macros.h"
#include "ih264_trans_data.h"
#include "ih264_size_defs.h"
#include "isvc_structs.h"
#include "isvc_trans_quant_itrans_iquant.h"

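/*
 * Computes res = src - pred for a 4x4 luma block, applies the 4x4 forward
 * transform to the residual and quantizes the transformed coefficients.
 * The unquantized DC coefficient is returned through pi2_dc_out and the
 * count of non-zero quantized coefficients through pu1_nnz. The
 * upsampled-residual arguments are unused in this variant.
 */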
void isvc_resi_trans_quant_4x4_neon(buffer_container_t *ps_src, buffer_container_t *ps_pred,
                                    buffer_container_t *ps_out,
                                    buffer_container_t *ps_upsampled_res,
                                    resi_trans_quant_constants_t *ps_quant_constants,
                                    UWORD8 *pu1_nnz, WORD16 *pi2_dc_out,
                                    UWORD8 u1_use_upsampled_res)
{
    UWORD8 *pu1_src = (UWORD8 *) ps_src->pv_data;
    UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
    WORD16 *pi2_out = (WORD16 *) ps_out->pv_data;
    WORD32 i4_src_stride = ps_src->i4_data_stride;
    WORD32 i4_pred_stride = ps_pred->i4_data_stride;
    WORD32 i4_out_stride = ps_out->i4_data_stride;
    const UWORD16 *pu2_scale_matrix = ps_quant_constants->pu2_scale_matrix;
    const UWORD16 *pu2_threshold_matrix = ps_quant_constants->pu2_threshold_matrix;
    UWORD32 u4_qbits = ps_quant_constants->u4_qbits;
    UWORD32 u4_round_factor = ps_quant_constants->u4_round_factor;

    uint8x8_t src0, src1, src2, src3;
    uint8x8_t pred0, pred1, pred2, pred3;
    uint8x8_t temp0_u8x8, temp1_u8x8;
    uint16x4_t temp0_u16x4, temp1_u16x4, temp2_u16x4, temp3_u16x4;
    uint16x4_t scale_mat0_16x4, scale_mat1_16x4, scale_mat2_16x4, scale_mat3_16x4;
    uint16x4_t threshold0_16x4, threshold1_16x4, threshold2_16x4, threshold3_16x4;
    uint16x4_t thresholdmask0_16x4, thresholdmask1_16x4, thresholdmask2_16x4, thresholdmask3_16x4;
    int16x4_t res0_16x4, res1_16x4, res2_16x4, res3_16x4;
    int16x4_t x0_16x4, x1_16x4, x2_16x4, x3_16x4;
    int16x4_t xx0_16x4, xx1_16x4, xx2_16x4, xx3_16x4;
    int16x4x2_t xx0_16x4x2, xx1_16x4x2;
    int16x4_t temp0_16x4, temp1_16x4, temp2_16x4, temp3_16x4;
    uint16x8_t res0_16x8, res1_16x8, res2_16x8, res3_16x8;
    uint16x8_t temp0_u16x8, temp1_u16x8;
    int32x2x2_t x0_32x2x2, x1_32x2x2;
    int32x4_t tx0_32x4, tx1_32x4, tx2_32x4, tx3_32x4;

    int32x4_t rnd_factor_32x4 = vdupq_n_s32(u4_round_factor);
    int32x4_t qbits_32x4 = vdupq_n_s32(u4_qbits);
    int16x4_t zeros_16x4 = vdup_n_s16(0);

    UNUSED(ps_upsampled_res);
    UNUSED(u1_use_upsampled_res);

    threshold0_16x4 = vld1_u16(pu2_threshold_matrix);
    threshold1_16x4 = vld1_u16(pu2_threshold_matrix + 4);
    threshold2_16x4 = vld1_u16(pu2_threshold_matrix + 8);
    threshold3_16x4 = vld1_u16(pu2_threshold_matrix + 12);

    scale_mat0_16x4 = vld1_u16(pu2_scale_matrix);
    scale_mat1_16x4 = vld1_u16(pu2_scale_matrix + 4);
    scale_mat2_16x4 = vld1_u16(pu2_scale_matrix + 8);
    scale_mat3_16x4 = vld1_u16(pu2_scale_matrix + 12);

    src0 = vld1_u8(&pu1_src[0 * i4_src_stride]);
    src1 = vld1_u8(&pu1_src[1 * i4_src_stride]);
    src2 = vld1_u8(&pu1_src[2 * i4_src_stride]);
    src3 = vld1_u8(&pu1_src[3 * i4_src_stride]);

    pred0 = vld1_u8(&pu1_pred[0 * i4_pred_stride]);
    pred1 = vld1_u8(&pu1_pred[1 * i4_pred_stride]);
    pred2 = vld1_u8(&pu1_pred[2 * i4_pred_stride]);
    pred3 = vld1_u8(&pu1_pred[3 * i4_pred_stride]);

    /* calculate res = src - pred */
    res0_16x8 = vsubl_u8(src0, pred0);
    res1_16x8 = vsubl_u8(src1, pred1);
    res2_16x8 = vsubl_u8(src2, pred2);
    res3_16x8 = vsubl_u8(src3, pred3);

    res0_16x4 = vreinterpret_s16_u16(vget_low_u16(res0_16x8));
    res1_16x4 = vreinterpret_s16_u16(vget_low_u16(res1_16x8));
    res2_16x4 = vreinterpret_s16_u16(vget_low_u16(res2_16x8));
    res3_16x4 = vreinterpret_s16_u16(vget_low_u16(res3_16x8));

    /* Perform Forward transform */
    /*-------------------------------------------------------------*/
    /* DCT [ Horizontal transformation ] */
    /*-------------------------------------------------------------*/
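    /*
     * Each 4-point stage below is the 4x4 forward transform butterfly: for one
     * row (or column) r0..r3 it produces
     *   out0 = (r0 + r3) + (r1 + r2)
     *   out1 = 2 * (r0 - r3) + (r1 - r2)
     *   out2 = (r0 + r3) - (r1 + r2)
     *   out3 = (r0 - r3) - 2 * (r1 - r2)
     * The transposes before and after the horizontal pass arrange the data so
     * the same 4-lane butterfly serves both rows and columns.
     */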
    /* Matrix transpose */
    /*
     *  a0 a1 a2 a3
     *  b0 b1 b2 b3
     *  c0 c1 c2 c3
     *  d0 d1 d2 d3
     */

    xx0_16x4x2 = vtrn_s16(res0_16x4, res1_16x4);
    xx1_16x4x2 = vtrn_s16(res2_16x4, res3_16x4);
    x0_32x2x2 =
        vtrn_s32(vreinterpret_s32_s16(xx0_16x4x2.val[0]), vreinterpret_s32_s16(xx1_16x4x2.val[0]));
    x1_32x2x2 =
        vtrn_s32(vreinterpret_s32_s16(xx0_16x4x2.val[1]), vreinterpret_s32_s16(xx1_16x4x2.val[1]));

    x0_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[0]);
    x1_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[0]);
    x2_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[1]);
    x3_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[1]);

    xx0_16x4 = vadd_s16(x0_16x4, x3_16x4);
    xx1_16x4 = vadd_s16(x1_16x4, x2_16x4);
    xx2_16x4 = vsub_s16(x1_16x4, x2_16x4);
    xx3_16x4 = vsub_s16(x0_16x4, x3_16x4);

    x0_16x4 = vadd_s16(xx0_16x4, xx1_16x4);
    temp0_16x4 = vshl_n_s16(xx3_16x4, 1);
    x1_16x4 = vadd_s16(xx2_16x4, temp0_16x4);

    x2_16x4 = vsub_s16(xx0_16x4, xx1_16x4);
    temp0_16x4 = vshl_n_s16(xx2_16x4, 1);
    x3_16x4 = vsub_s16(xx3_16x4, temp0_16x4);

    /* Matrix transpose */
    /*
     *  a0 b0 c0 d0
     *  a1 b1 c1 d1
     *  a2 b2 c2 d2
     *  a3 b3 c3 d3
     */

    xx0_16x4x2 = vtrn_s16(x0_16x4, x1_16x4);
    xx1_16x4x2 = vtrn_s16(x2_16x4, x3_16x4);
    x0_32x2x2 =
        vtrn_s32(vreinterpret_s32_s16(xx0_16x4x2.val[0]), vreinterpret_s32_s16(xx1_16x4x2.val[0]));
    x1_32x2x2 =
        vtrn_s32(vreinterpret_s32_s16(xx0_16x4x2.val[1]), vreinterpret_s32_s16(xx1_16x4x2.val[1]));

    x0_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[0]);
    x1_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[0]);
    x2_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[1]);
    x3_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[1]);

    /* Vertical Transformation */

    xx0_16x4 = vadd_s16(x0_16x4, x3_16x4);
    xx1_16x4 = vadd_s16(x1_16x4, x2_16x4);
    xx2_16x4 = vsub_s16(x1_16x4, x2_16x4);
    xx3_16x4 = vsub_s16(x0_16x4, x3_16x4);

    x0_16x4 = vadd_s16(xx0_16x4, xx1_16x4);
    temp0_16x4 = vshl_n_s16(xx3_16x4, 1);
    x1_16x4 = vadd_s16(temp0_16x4, xx2_16x4);

    x2_16x4 = vsub_s16(xx0_16x4, xx1_16x4);
    temp0_16x4 = vshl_n_s16(xx2_16x4, 1);
    x3_16x4 = vsub_s16(xx3_16x4, temp0_16x4);

    /* lane 0 holds the DC coefficient; return it unquantized via pi2_dc_out */
    *pi2_dc_out = vget_lane_s16(x0_16x4, 0);

    xx0_16x4 = vabs_s16(x0_16x4);
    xx1_16x4 = vabs_s16(x1_16x4);
    xx2_16x4 = vabs_s16(x2_16x4);
    xx3_16x4 = vabs_s16(x3_16x4);

    /* compare with zero for getting sign */
    temp0_u16x4 = vcgt_s16(x0_16x4, zeros_16x4);
    temp1_u16x4 = vcgt_s16(x1_16x4, zeros_16x4);
    temp2_u16x4 = vcgt_s16(x2_16x4, zeros_16x4);
    temp3_u16x4 = vcgt_s16(x3_16x4, zeros_16x4);

    /* compare abs values against the threshold matrix */
    thresholdmask0_16x4 = vcgt_s16(vreinterpret_s16_u16(threshold0_16x4), xx0_16x4);
    thresholdmask1_16x4 = vcgt_s16(vreinterpret_s16_u16(threshold1_16x4), xx1_16x4);
    thresholdmask2_16x4 = vcgt_s16(vreinterpret_s16_u16(threshold2_16x4), xx2_16x4);
    thresholdmask3_16x4 = vcgt_s16(vreinterpret_s16_u16(threshold3_16x4), xx3_16x4);

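    /*
     * Quantization: level = (|coef| * scale + round_factor) >> qbits, with the
     * sign restored afterwards and coefficients whose magnitude is below the
     * threshold matrix forced to zero. The right shift is implemented with
     * vshlq_s32 and a negated shift count.
     */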
    /* Multiply abs values obtained with scaling matrix */
    tx0_32x4 = vmull_s16(xx0_16x4, vreinterpret_s16_u16(scale_mat0_16x4));
    tx1_32x4 = vmull_s16(xx1_16x4, vreinterpret_s16_u16(scale_mat1_16x4));
    tx2_32x4 = vmull_s16(xx2_16x4, vreinterpret_s16_u16(scale_mat2_16x4));
    tx3_32x4 = vmull_s16(xx3_16x4, vreinterpret_s16_u16(scale_mat3_16x4));

    tx0_32x4 = vaddq_s32(tx0_32x4, rnd_factor_32x4);
    tx1_32x4 = vaddq_s32(tx1_32x4, rnd_factor_32x4);
    tx2_32x4 = vaddq_s32(tx2_32x4, rnd_factor_32x4);
    tx3_32x4 = vaddq_s32(tx3_32x4, rnd_factor_32x4);

    qbits_32x4 = vnegq_s32(qbits_32x4);

    tx0_32x4 = vshlq_s32(tx0_32x4, qbits_32x4);
    tx1_32x4 = vshlq_s32(tx1_32x4, qbits_32x4);
    tx2_32x4 = vshlq_s32(tx2_32x4, qbits_32x4);
    tx3_32x4 = vshlq_s32(tx3_32x4, qbits_32x4);

    /* Conversion to 16 bits signed */
    temp0_16x4 = vmovn_s32(tx0_32x4);
    temp1_16x4 = vmovn_s32(tx1_32x4);
    temp2_16x4 = vmovn_s32(tx2_32x4);
    temp3_16x4 = vmovn_s32(tx3_32x4);

    x0_16x4 = vneg_s16(temp0_16x4);
    x1_16x4 = vneg_s16(temp1_16x4);
    x2_16x4 = vneg_s16(temp2_16x4);
    x3_16x4 = vneg_s16(temp3_16x4);

    /* Restore sign */
    x0_16x4 = vbsl_s16(temp0_u16x4, temp0_16x4, x0_16x4);
    x1_16x4 = vbsl_s16(temp1_u16x4, temp1_16x4, x1_16x4);
    x2_16x4 = vbsl_s16(temp2_u16x4, temp2_16x4, x2_16x4);
    x3_16x4 = vbsl_s16(temp3_u16x4, temp3_16x4, x3_16x4);

    /* Thresholding */
    xx0_16x4 = vbsl_s16(thresholdmask0_16x4, zeros_16x4, x0_16x4);
    xx1_16x4 = vbsl_s16(thresholdmask1_16x4, zeros_16x4, x1_16x4);
    xx2_16x4 = vbsl_s16(thresholdmask2_16x4, zeros_16x4, x2_16x4);
    xx3_16x4 = vbsl_s16(thresholdmask3_16x4, zeros_16x4, x3_16x4);

    /* Store Quantized outputs */
    vst1_s16(&pi2_out[0 * i4_out_stride], xx0_16x4);
    vst1_s16(&pi2_out[1 * i4_out_stride], xx1_16x4);
    vst1_s16(&pi2_out[2 * i4_out_stride], xx2_16x4);
    vst1_s16(&pi2_out[3 * i4_out_stride], xx3_16x4);

    /* NNZ calculation */

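    /* Count zero coefficients: each vceq lane is all ones for a zero, the
     * narrowing and shift turn that into a per-coefficient 0/1 flag, and the
     * pairwise adds reduce the 16 flags into lane 0; NNZ = 16 - zero count. */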
    temp0_u16x4 = vceq_s16(xx0_16x4, zeros_16x4);
    temp1_u16x4 = vceq_s16(xx1_16x4, zeros_16x4);
    temp2_u16x4 = vceq_s16(xx2_16x4, zeros_16x4);
    temp3_u16x4 = vceq_s16(xx3_16x4, zeros_16x4);

    temp0_u16x8 = vcombine_u16(temp0_u16x4, temp2_u16x4);
    temp1_u16x8 = vcombine_u16(temp1_u16x4, temp3_u16x4);

    /* Conversion to 8 bit unsigned */
    temp0_u8x8 = vmovn_u16(temp0_u16x8);
    temp1_u8x8 = vmovn_u16(temp1_u16x8);

    temp0_u8x8 = vshr_n_u8(temp0_u8x8, 7);
    temp1_u8x8 = vshr_n_u8(temp1_u8x8, 7);

    temp0_u8x8 = vadd_u8(temp0_u8x8, temp1_u8x8);
    temp0_u8x8 = vpadd_u8(temp0_u8x8, temp1_u8x8);
    temp0_u8x8 = vpadd_u8(temp0_u8x8, temp1_u8x8);
    temp0_u8x8 = vpadd_u8(temp0_u8x8, temp1_u8x8);

    *pu1_nnz = 16 - vget_lane_u8(temp0_u8x8, 0);
}

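/*
 * Same as isvc_resi_trans_quant_4x4_neon, except that the upsampled
 * inter-layer residual is subtracted from (src - pred) before the forward
 * transform, with the result clamped to the range [-255, 255].
 */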
void isvc_resi_trans_quant_4x4_with_residual_sub_neon(
    buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_out,
    buffer_container_t *ps_upsampled_res, resi_trans_quant_constants_t *ps_quant_constants,
    UWORD8 *pu1_nnz, WORD16 *pi2_dc_out, UWORD8 u1_use_upsampled_res)
{
    UWORD8 *pu1_src = (UWORD8 *) ps_src->pv_data;
    UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
    WORD16 *pi2_out = (WORD16 *) ps_out->pv_data;
    WORD16 *pi2_upsampled_res = ps_upsampled_res ? (WORD16 *) ps_upsampled_res->pv_data : NULL;
    WORD32 i4_src_stride = ps_src->i4_data_stride;
    WORD32 i4_pred_stride = ps_pred->i4_data_stride;
    WORD32 i4_out_stride = ps_out->i4_data_stride;
    WORD32 i4_upsampled_res_stride = ps_upsampled_res ? ps_upsampled_res->i4_data_stride : 0;
    const UWORD16 *pu2_scale_matrix = ps_quant_constants->pu2_scale_matrix;
    const UWORD16 *pu2_threshold_matrix = ps_quant_constants->pu2_threshold_matrix;
    UWORD32 u4_qbits = ps_quant_constants->u4_qbits;
    UWORD32 u4_round_factor = ps_quant_constants->u4_round_factor;

    uint8x8_t src0, src1, src2, src3;
    uint8x8_t pred0, pred1, pred2, pred3;
    uint8x8_t temp0_u8x8, temp1_u8x8;
    uint16x4_t temp0_u16x4, temp1_u16x4, temp2_u16x4, temp3_u16x4;
    uint16x4_t scale_mat0_16x4, scale_mat1_16x4, scale_mat2_16x4, scale_mat3_16x4;
    uint16x4_t threshold0_16x4, threshold1_16x4, threshold2_16x4, threshold3_16x4;
    uint16x4_t thresholdmask0_16x4, thresholdmask1_16x4, thresholdmask2_16x4, thresholdmask3_16x4;
    int16x4_t upres0_16x4, upres1_16x4, upres2_16x4, upres3_16x4;
    int16x4_t res0_16x4, res1_16x4, res2_16x4, res3_16x4;
    int16x4_t x0_16x4, x1_16x4, x2_16x4, x3_16x4;
    int16x4_t xx0_16x4, xx1_16x4, xx2_16x4, xx3_16x4;
    int16x4x2_t xx0_16x4x2, xx1_16x4x2;
    int16x4_t temp0_16x4, temp1_16x4, temp2_16x4, temp3_16x4;
    uint16x8_t res0_16x8, res1_16x8, res2_16x8, res3_16x8;
    uint16x8_t temp0_u16x8, temp1_u16x8;
    int32x2x2_t x0_32x2x2, x1_32x2x2;
    int32x4_t tx0_32x4, tx1_32x4, tx2_32x4, tx3_32x4;

    int32x4_t rnd_factor_32x4 = vdupq_n_s32(u4_round_factor);
    int32x4_t qbits_32x4 = vdupq_n_s32(u4_qbits);
    int16x4_t zeros_16x4 = vdup_n_s16(0);
    int16x4_t pos_255_16x4 = vdup_n_s16(((WORD16) UINT8_MAX));
    int16x4_t neg_255_16x4 = vdup_n_s16(-((WORD16) UINT8_MAX));

    UNUSED(u1_use_upsampled_res);

    threshold0_16x4 = vld1_u16(pu2_threshold_matrix);
    threshold1_16x4 = vld1_u16(pu2_threshold_matrix + 4);
    threshold2_16x4 = vld1_u16(pu2_threshold_matrix + 8);
    threshold3_16x4 = vld1_u16(pu2_threshold_matrix + 12);

    scale_mat0_16x4 = vld1_u16(pu2_scale_matrix);
    scale_mat1_16x4 = vld1_u16(pu2_scale_matrix + 4);
    scale_mat2_16x4 = vld1_u16(pu2_scale_matrix + 8);
    scale_mat3_16x4 = vld1_u16(pu2_scale_matrix + 12);

    src0 = vld1_u8(&pu1_src[0 * i4_src_stride]);
    src1 = vld1_u8(&pu1_src[1 * i4_src_stride]);
    src2 = vld1_u8(&pu1_src[2 * i4_src_stride]);
    src3 = vld1_u8(&pu1_src[3 * i4_src_stride]);

    pred0 = vld1_u8(&pu1_pred[0 * i4_pred_stride]);
    pred1 = vld1_u8(&pu1_pred[1 * i4_pred_stride]);
    pred2 = vld1_u8(&pu1_pred[2 * i4_pred_stride]);
    pred3 = vld1_u8(&pu1_pred[3 * i4_pred_stride]);

    /* calculate res = src - pred */
    res0_16x8 = vsubl_u8(src0, pred0);
    res1_16x8 = vsubl_u8(src1, pred1);
    res2_16x8 = vsubl_u8(src2, pred2);
    res3_16x8 = vsubl_u8(src3, pred3);

    res0_16x4 = vreinterpret_s16_u16(vget_low_u16(res0_16x8));
    res1_16x4 = vreinterpret_s16_u16(vget_low_u16(res1_16x8));
    res2_16x4 = vreinterpret_s16_u16(vget_low_u16(res2_16x8));
    res3_16x4 = vreinterpret_s16_u16(vget_low_u16(res3_16x8));

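    /* The upsampled inter-layer residual is assumed to be a valid buffer in
     * this variant; each row is read as four contiguous WORD16 values */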
    /* Load upsampled res */
    upres0_16x4 = vld1_s16(&pi2_upsampled_res[0 * i4_upsampled_res_stride]);
    upres1_16x4 = vld1_s16(&pi2_upsampled_res[1 * i4_upsampled_res_stride]);
    upres2_16x4 = vld1_s16(&pi2_upsampled_res[2 * i4_upsampled_res_stride]);
    upres3_16x4 = vld1_s16(&pi2_upsampled_res[3 * i4_upsampled_res_stride]);

    /* subtract upsampled res from (src - pred) to obtain final res */
    res0_16x4 = vsub_s16(res0_16x4, upres0_16x4);
    res1_16x4 = vsub_s16(res1_16x4, upres1_16x4);
    res2_16x4 = vsub_s16(res2_16x4, upres2_16x4);
    res3_16x4 = vsub_s16(res3_16x4, upres3_16x4);

    /* Clamp all values below -255 to -255 and leave the rest unchanged */
    res0_16x4 = vmax_s16(res0_16x4, neg_255_16x4);
    res1_16x4 = vmax_s16(res1_16x4, neg_255_16x4);
    res2_16x4 = vmax_s16(res2_16x4, neg_255_16x4);
    res3_16x4 = vmax_s16(res3_16x4, neg_255_16x4);

    /* Clamp all values above 255 to 255 and leave the rest unchanged */
    res0_16x4 = vmin_s16(res0_16x4, pos_255_16x4);
    res1_16x4 = vmin_s16(res1_16x4, pos_255_16x4);
    res2_16x4 = vmin_s16(res2_16x4, pos_255_16x4);
    res3_16x4 = vmin_s16(res3_16x4, pos_255_16x4);

    /* Perform Forward transform */
    /*-------------------------------------------------------------*/
    /* DCT [ Horizontal transformation ] */
    /*-------------------------------------------------------------*/
    /* Matrix transpose */
    /*
     *  a0 a1 a2 a3
     *  b0 b1 b2 b3
     *  c0 c1 c2 c3
     *  d0 d1 d2 d3
     */

    xx0_16x4x2 = vtrn_s16(res0_16x4, res1_16x4);
    xx1_16x4x2 = vtrn_s16(res2_16x4, res3_16x4);
    x0_32x2x2 =
        vtrn_s32(vreinterpret_s32_s16(xx0_16x4x2.val[0]), vreinterpret_s32_s16(xx1_16x4x2.val[0]));
    x1_32x2x2 =
        vtrn_s32(vreinterpret_s32_s16(xx0_16x4x2.val[1]), vreinterpret_s32_s16(xx1_16x4x2.val[1]));

    x0_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[0]);
    x1_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[0]);
    x2_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[1]);
    x3_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[1]);

    xx0_16x4 = vadd_s16(x0_16x4, x3_16x4);
    xx1_16x4 = vadd_s16(x1_16x4, x2_16x4);
    xx2_16x4 = vsub_s16(x1_16x4, x2_16x4);
    xx3_16x4 = vsub_s16(x0_16x4, x3_16x4);

    x0_16x4 = vadd_s16(xx0_16x4, xx1_16x4);
    temp0_16x4 = vshl_n_s16(xx3_16x4, 1);
    x1_16x4 = vadd_s16(xx2_16x4, temp0_16x4);

    x2_16x4 = vsub_s16(xx0_16x4, xx1_16x4);
    temp0_16x4 = vshl_n_s16(xx2_16x4, 1);
    x3_16x4 = vsub_s16(xx3_16x4, temp0_16x4);

    /* Matrix transpose */
    /*
     *  a0 b0 c0 d0
     *  a1 b1 c1 d1
     *  a2 b2 c2 d2
     *  a3 b3 c3 d3
     */

    xx0_16x4x2 = vtrn_s16(x0_16x4, x1_16x4);
    xx1_16x4x2 = vtrn_s16(x2_16x4, x3_16x4);
    x0_32x2x2 =
        vtrn_s32(vreinterpret_s32_s16(xx0_16x4x2.val[0]), vreinterpret_s32_s16(xx1_16x4x2.val[0]));
    x1_32x2x2 =
        vtrn_s32(vreinterpret_s32_s16(xx0_16x4x2.val[1]), vreinterpret_s32_s16(xx1_16x4x2.val[1]));

    x0_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[0]);
    x1_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[0]);
    x2_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[1]);
    x3_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[1]);

    /* Vertical Transformation */

    xx0_16x4 = vadd_s16(x0_16x4, x3_16x4);
    xx1_16x4 = vadd_s16(x1_16x4, x2_16x4);
    xx2_16x4 = vsub_s16(x1_16x4, x2_16x4);
    xx3_16x4 = vsub_s16(x0_16x4, x3_16x4);

    x0_16x4 = vadd_s16(xx0_16x4, xx1_16x4);
    temp0_16x4 = vshl_n_s16(xx3_16x4, 1);
    x1_16x4 = vadd_s16(temp0_16x4, xx2_16x4);

    x2_16x4 = vsub_s16(xx0_16x4, xx1_16x4);
    temp0_16x4 = vshl_n_s16(xx2_16x4, 1);
    x3_16x4 = vsub_s16(xx3_16x4, temp0_16x4);

    /* lane 0 holds the DC coefficient; return it unquantized via pi2_dc_out */
    *pi2_dc_out = vget_lane_s16(x0_16x4, 0);

    xx0_16x4 = vabs_s16(x0_16x4);
    xx1_16x4 = vabs_s16(x1_16x4);
    xx2_16x4 = vabs_s16(x2_16x4);
    xx3_16x4 = vabs_s16(x3_16x4);

    /* compare with zero for getting sign */
    temp0_u16x4 = vcgt_s16(x0_16x4, zeros_16x4);
    temp1_u16x4 = vcgt_s16(x1_16x4, zeros_16x4);
    temp2_u16x4 = vcgt_s16(x2_16x4, zeros_16x4);
    temp3_u16x4 = vcgt_s16(x3_16x4, zeros_16x4);

    /* compare abs values against the threshold matrix */
    thresholdmask0_16x4 = vcgt_s16(vreinterpret_s16_u16(threshold0_16x4), xx0_16x4);
    thresholdmask1_16x4 = vcgt_s16(vreinterpret_s16_u16(threshold1_16x4), xx1_16x4);
    thresholdmask2_16x4 = vcgt_s16(vreinterpret_s16_u16(threshold2_16x4), xx2_16x4);
    thresholdmask3_16x4 = vcgt_s16(vreinterpret_s16_u16(threshold3_16x4), xx3_16x4);

    /* Multiply abs values obtained with scaling matrix */
    tx0_32x4 = vmull_s16(xx0_16x4, vreinterpret_s16_u16(scale_mat0_16x4));
    tx1_32x4 = vmull_s16(xx1_16x4, vreinterpret_s16_u16(scale_mat1_16x4));
    tx2_32x4 = vmull_s16(xx2_16x4, vreinterpret_s16_u16(scale_mat2_16x4));
    tx3_32x4 = vmull_s16(xx3_16x4, vreinterpret_s16_u16(scale_mat3_16x4));

    tx0_32x4 = vaddq_s32(tx0_32x4, rnd_factor_32x4);
    tx1_32x4 = vaddq_s32(tx1_32x4, rnd_factor_32x4);
    tx2_32x4 = vaddq_s32(tx2_32x4, rnd_factor_32x4);
    tx3_32x4 = vaddq_s32(tx3_32x4, rnd_factor_32x4);

    qbits_32x4 = vnegq_s32(qbits_32x4);

    tx0_32x4 = vshlq_s32(tx0_32x4, qbits_32x4);
    tx1_32x4 = vshlq_s32(tx1_32x4, qbits_32x4);
    tx2_32x4 = vshlq_s32(tx2_32x4, qbits_32x4);
    tx3_32x4 = vshlq_s32(tx3_32x4, qbits_32x4);

    /* Conversion to 16 bits signed */
    temp0_16x4 = vmovn_s32(tx0_32x4);
    temp1_16x4 = vmovn_s32(tx1_32x4);
    temp2_16x4 = vmovn_s32(tx2_32x4);
    temp3_16x4 = vmovn_s32(tx3_32x4);

    x0_16x4 = vneg_s16(temp0_16x4);
    x1_16x4 = vneg_s16(temp1_16x4);
    x2_16x4 = vneg_s16(temp2_16x4);
    x3_16x4 = vneg_s16(temp3_16x4);

    /* Restore sign */
    x0_16x4 = vbsl_s16(temp0_u16x4, temp0_16x4, x0_16x4);
    x1_16x4 = vbsl_s16(temp1_u16x4, temp1_16x4, x1_16x4);
    x2_16x4 = vbsl_s16(temp2_u16x4, temp2_16x4, x2_16x4);
    x3_16x4 = vbsl_s16(temp3_u16x4, temp3_16x4, x3_16x4);

    /* Thresholding */
    xx0_16x4 = vbsl_s16(thresholdmask0_16x4, zeros_16x4, x0_16x4);
    xx1_16x4 = vbsl_s16(thresholdmask1_16x4, zeros_16x4, x1_16x4);
    xx2_16x4 = vbsl_s16(thresholdmask2_16x4, zeros_16x4, x2_16x4);
    xx3_16x4 = vbsl_s16(thresholdmask3_16x4, zeros_16x4, x3_16x4);

    /* Store Quantized outputs */
    vst1_s16(&pi2_out[0 * i4_out_stride], xx0_16x4);
    vst1_s16(&pi2_out[1 * i4_out_stride], xx1_16x4);
    vst1_s16(&pi2_out[2 * i4_out_stride], xx2_16x4);
    vst1_s16(&pi2_out[3 * i4_out_stride], xx3_16x4);

    /* NNZ calculation */

    temp0_u16x4 = vceq_s16(xx0_16x4, zeros_16x4);
    temp1_u16x4 = vceq_s16(xx1_16x4, zeros_16x4);
    temp2_u16x4 = vceq_s16(xx2_16x4, zeros_16x4);
    temp3_u16x4 = vceq_s16(xx3_16x4, zeros_16x4);

    temp0_u16x8 = vcombine_u16(temp0_u16x4, temp2_u16x4);
    temp1_u16x8 = vcombine_u16(temp1_u16x4, temp3_u16x4);

    /* Conversion to 8 bit unsigned */
    temp0_u8x8 = vmovn_u16(temp0_u16x8);
    temp1_u8x8 = vmovn_u16(temp1_u16x8);

    temp0_u8x8 = vshr_n_u8(temp0_u8x8, 7);
    temp1_u8x8 = vshr_n_u8(temp1_u8x8, 7);

    temp0_u8x8 = vadd_u8(temp0_u8x8, temp1_u8x8);
    temp0_u8x8 = vpadd_u8(temp0_u8x8, temp1_u8x8);
    temp0_u8x8 = vpadd_u8(temp0_u8x8, temp1_u8x8);
    temp0_u8x8 = vpadd_u8(temp0_u8x8, temp1_u8x8);

    *pu1_nnz = 16 - vget_lane_u8(temp0_u8x8, 0);
}

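/*
 * Chroma variant of isvc_resi_trans_quant_4x4_neon: source and prediction
 * hold interleaved chroma samples, so both are deinterleaved with vuzp and
 * only the even-position samples of each row are transformed and quantized;
 * which chroma plane that is depends on the pointers passed by the caller.
 */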
void isvc_resi_trans_quant_chroma_4x4_neon(buffer_container_t *ps_src, buffer_container_t *ps_pred,
                                           buffer_container_t *ps_out,
                                           buffer_container_t *ps_upsampled_res,
                                           resi_trans_quant_constants_t *ps_quant_constants,
                                           UWORD8 *pu1_nnz, WORD16 *pi2_dc_out,
                                           UWORD8 u1_use_upsampled_res)
{
    UWORD8 *pu1_src = (UWORD8 *) ps_src->pv_data;
    UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
    WORD16 *pi2_out = (WORD16 *) ps_out->pv_data;
    WORD32 i4_src_stride = ps_src->i4_data_stride;
    WORD32 i4_pred_stride = ps_pred->i4_data_stride;
    WORD32 i4_out_stride = ps_out->i4_data_stride;
    const UWORD16 *pu2_scale_matrix = ps_quant_constants->pu2_scale_matrix;
    const UWORD16 *pu2_threshold_matrix = ps_quant_constants->pu2_threshold_matrix;
    UWORD32 u4_qbits = ps_quant_constants->u4_qbits;
    UWORD32 u4_round_factor = ps_quant_constants->u4_round_factor;

    uint8x8_t src0, src1, src2, src3;
    uint8x8_t pred0, pred1, pred2, pred3;
    uint8x8x2_t tmp0, tmp1, tmp2, tmp3;
    uint8x8_t temp0_u8x8, temp1_u8x8;
    uint16x4_t temp0_u16x4, temp1_u16x4, temp2_u16x4, temp3_u16x4;
    uint16x4_t scale_mat0_16x4, scale_mat1_16x4, scale_mat2_16x4, scale_mat3_16x4;
    uint16x4_t threshold0_16x4, threshold1_16x4, threshold2_16x4, threshold3_16x4;
    uint16x4_t thresholdmask0_16x4, thresholdmask1_16x4, thresholdmask2_16x4, thresholdmask3_16x4;
    int16x4_t res0_16x4, res1_16x4, res2_16x4, res3_16x4;
    int16x4_t x0_16x4, x1_16x4, x2_16x4, x3_16x4;
    int16x4_t xx0_16x4, xx1_16x4, xx2_16x4, xx3_16x4;
    int16x4x2_t xx0_16x4x2, xx1_16x4x2;
    int16x4_t temp0_16x4, temp1_16x4, temp2_16x4, temp3_16x4;
    uint16x8_t res0_16x8, res1_16x8, res2_16x8, res3_16x8;
    uint16x8_t temp0_u16x8, temp1_u16x8;
    int32x2x2_t x0_32x2x2, x1_32x2x2;
    int32x4_t tx0_32x4, tx1_32x4, tx2_32x4, tx3_32x4;

    int32x4_t rnd_factor_32x4 = vdupq_n_s32(u4_round_factor);
    int32x4_t qbits_32x4 = vdupq_n_s32(u4_qbits);
    int16x4_t zeros_16x4 = vdup_n_s16(0);

    UNUSED(ps_upsampled_res);
    UNUSED(u1_use_upsampled_res);

    threshold0_16x4 = vld1_u16(pu2_threshold_matrix);
    threshold1_16x4 = vld1_u16(pu2_threshold_matrix + 4);
    threshold2_16x4 = vld1_u16(pu2_threshold_matrix + 8);
    threshold3_16x4 = vld1_u16(pu2_threshold_matrix + 12);

    scale_mat0_16x4 = vld1_u16(pu2_scale_matrix);
    scale_mat1_16x4 = vld1_u16(pu2_scale_matrix + 4);
    scale_mat2_16x4 = vld1_u16(pu2_scale_matrix + 8);
    scale_mat3_16x4 = vld1_u16(pu2_scale_matrix + 12);

    src0 = vld1_u8(&pu1_src[0 * i4_src_stride]);
    src1 = vld1_u8(&pu1_src[1 * i4_src_stride]);
    src2 = vld1_u8(&pu1_src[2 * i4_src_stride]);
    src3 = vld1_u8(&pu1_src[3 * i4_src_stride]);

    /* deinterleaving source buffer */
    tmp0 = vuzp_u8(src0, src0);
    tmp1 = vuzp_u8(src1, src1);
    tmp2 = vuzp_u8(src2, src2);
    tmp3 = vuzp_u8(src3, src3);

    src0 = tmp0.val[0];
    src1 = tmp1.val[0];
    src2 = tmp2.val[0];
    src3 = tmp3.val[0];

    pred0 = vld1_u8(&pu1_pred[0 * i4_pred_stride]);
    pred1 = vld1_u8(&pu1_pred[1 * i4_pred_stride]);
    pred2 = vld1_u8(&pu1_pred[2 * i4_pred_stride]);
    pred3 = vld1_u8(&pu1_pred[3 * i4_pred_stride]);

    /* deinterleaving pred buffer */
    tmp0 = vuzp_u8(pred0, pred0);
    tmp1 = vuzp_u8(pred1, pred1);
    tmp2 = vuzp_u8(pred2, pred2);
    tmp3 = vuzp_u8(pred3, pred3);

    pred0 = tmp0.val[0];
    pred1 = tmp1.val[0];
    pred2 = tmp2.val[0];
    pred3 = tmp3.val[0];

    /* calculate res = src - pred */
    res0_16x8 = vsubl_u8(src0, pred0);
    res1_16x8 = vsubl_u8(src1, pred1);
    res2_16x8 = vsubl_u8(src2, pred2);
    res3_16x8 = vsubl_u8(src3, pred3);

    res0_16x4 = vreinterpret_s16_u16(vget_low_u16(res0_16x8));
    res1_16x4 = vreinterpret_s16_u16(vget_low_u16(res1_16x8));
    res2_16x4 = vreinterpret_s16_u16(vget_low_u16(res2_16x8));
    res3_16x4 = vreinterpret_s16_u16(vget_low_u16(res3_16x8));

    /* Perform Forward transform */
    /*-------------------------------------------------------------*/
    /* DCT [ Horizontal transformation ] */
    /*-------------------------------------------------------------*/
    /* Matrix transpose */
    /*
     *  a0 a1 a2 a3
     *  b0 b1 b2 b3
     *  c0 c1 c2 c3
     *  d0 d1 d2 d3
     */

    xx0_16x4x2 = vtrn_s16(res0_16x4, res1_16x4);
    xx1_16x4x2 = vtrn_s16(res2_16x4, res3_16x4);
    x0_32x2x2 =
        vtrn_s32(vreinterpret_s32_s16(xx0_16x4x2.val[0]), vreinterpret_s32_s16(xx1_16x4x2.val[0]));
    x1_32x2x2 =
        vtrn_s32(vreinterpret_s32_s16(xx0_16x4x2.val[1]), vreinterpret_s32_s16(xx1_16x4x2.val[1]));

    x0_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[0]);
    x1_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[0]);
    x2_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[1]);
    x3_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[1]);

    xx0_16x4 = vadd_s16(x0_16x4, x3_16x4);
    xx1_16x4 = vadd_s16(x1_16x4, x2_16x4);
    xx2_16x4 = vsub_s16(x1_16x4, x2_16x4);
    xx3_16x4 = vsub_s16(x0_16x4, x3_16x4);

    x0_16x4 = vadd_s16(xx0_16x4, xx1_16x4);
    temp0_16x4 = vshl_n_s16(xx3_16x4, 1);
    x1_16x4 = vadd_s16(xx2_16x4, temp0_16x4);

    x2_16x4 = vsub_s16(xx0_16x4, xx1_16x4);
    temp0_16x4 = vshl_n_s16(xx2_16x4, 1);
    x3_16x4 = vsub_s16(xx3_16x4, temp0_16x4);

    /* Matrix transpose */
    /*
     *  a0 b0 c0 d0
     *  a1 b1 c1 d1
     *  a2 b2 c2 d2
     *  a3 b3 c3 d3
     */

    xx0_16x4x2 = vtrn_s16(x0_16x4, x1_16x4);
    xx1_16x4x2 = vtrn_s16(x2_16x4, x3_16x4);
    x0_32x2x2 =
        vtrn_s32(vreinterpret_s32_s16(xx0_16x4x2.val[0]), vreinterpret_s32_s16(xx1_16x4x2.val[0]));
    x1_32x2x2 =
        vtrn_s32(vreinterpret_s32_s16(xx0_16x4x2.val[1]), vreinterpret_s32_s16(xx1_16x4x2.val[1]));

    x0_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[0]);
    x1_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[0]);
    x2_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[1]);
    x3_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[1]);

    /* Vertical Transformation */

    xx0_16x4 = vadd_s16(x0_16x4, x3_16x4);
    xx1_16x4 = vadd_s16(x1_16x4, x2_16x4);
    xx2_16x4 = vsub_s16(x1_16x4, x2_16x4);
    xx3_16x4 = vsub_s16(x0_16x4, x3_16x4);

    x0_16x4 = vadd_s16(xx0_16x4, xx1_16x4);
    temp0_16x4 = vshl_n_s16(xx3_16x4, 1);
    x1_16x4 = vadd_s16(temp0_16x4, xx2_16x4);

    x2_16x4 = vsub_s16(xx0_16x4, xx1_16x4);
    temp0_16x4 = vshl_n_s16(xx2_16x4, 1);
    x3_16x4 = vsub_s16(xx3_16x4, temp0_16x4);

    /* lane 0 holds the DC coefficient; return it unquantized via pi2_dc_out */
    *pi2_dc_out = vget_lane_s16(x0_16x4, 0);

    xx0_16x4 = vabs_s16(x0_16x4);
    xx1_16x4 = vabs_s16(x1_16x4);
    xx2_16x4 = vabs_s16(x2_16x4);
    xx3_16x4 = vabs_s16(x3_16x4);

    /* compare with zero for getting sign */
    temp0_u16x4 = vcgt_s16(x0_16x4, zeros_16x4);
    temp1_u16x4 = vcgt_s16(x1_16x4, zeros_16x4);
    temp2_u16x4 = vcgt_s16(x2_16x4, zeros_16x4);
    temp3_u16x4 = vcgt_s16(x3_16x4, zeros_16x4);

    /* compare abs values against the threshold matrix */
    thresholdmask0_16x4 = vcgt_s16(vreinterpret_s16_u16(threshold0_16x4), xx0_16x4);
    thresholdmask1_16x4 = vcgt_s16(vreinterpret_s16_u16(threshold1_16x4), xx1_16x4);
    thresholdmask2_16x4 = vcgt_s16(vreinterpret_s16_u16(threshold2_16x4), xx2_16x4);
    thresholdmask3_16x4 = vcgt_s16(vreinterpret_s16_u16(threshold3_16x4), xx3_16x4);

    /* Multiply abs values obtained with scaling matrix */
    tx0_32x4 = vmull_s16(xx0_16x4, vreinterpret_s16_u16(scale_mat0_16x4));
    tx1_32x4 = vmull_s16(xx1_16x4, vreinterpret_s16_u16(scale_mat1_16x4));
    tx2_32x4 = vmull_s16(xx2_16x4, vreinterpret_s16_u16(scale_mat2_16x4));
    tx3_32x4 = vmull_s16(xx3_16x4, vreinterpret_s16_u16(scale_mat3_16x4));

    tx0_32x4 = vaddq_s32(tx0_32x4, rnd_factor_32x4);
    tx1_32x4 = vaddq_s32(tx1_32x4, rnd_factor_32x4);
    tx2_32x4 = vaddq_s32(tx2_32x4, rnd_factor_32x4);
    tx3_32x4 = vaddq_s32(tx3_32x4, rnd_factor_32x4);

    qbits_32x4 = vnegq_s32(qbits_32x4);

    tx0_32x4 = vshlq_s32(tx0_32x4, qbits_32x4);
    tx1_32x4 = vshlq_s32(tx1_32x4, qbits_32x4);
    tx2_32x4 = vshlq_s32(tx2_32x4, qbits_32x4);
    tx3_32x4 = vshlq_s32(tx3_32x4, qbits_32x4);

    /* Conversion to 16 bits signed */
    temp0_16x4 = vmovn_s32(tx0_32x4);
    temp1_16x4 = vmovn_s32(tx1_32x4);
    temp2_16x4 = vmovn_s32(tx2_32x4);
    temp3_16x4 = vmovn_s32(tx3_32x4);

    x0_16x4 = vneg_s16(temp0_16x4);
    x1_16x4 = vneg_s16(temp1_16x4);
    x2_16x4 = vneg_s16(temp2_16x4);
    x3_16x4 = vneg_s16(temp3_16x4);

    /* Restore sign */
    x0_16x4 = vbsl_s16(temp0_u16x4, temp0_16x4, x0_16x4);
    x1_16x4 = vbsl_s16(temp1_u16x4, temp1_16x4, x1_16x4);
    x2_16x4 = vbsl_s16(temp2_u16x4, temp2_16x4, x2_16x4);
    x3_16x4 = vbsl_s16(temp3_u16x4, temp3_16x4, x3_16x4);

    /* Thresholding */
    xx0_16x4 = vbsl_s16(thresholdmask0_16x4, zeros_16x4, x0_16x4);
    xx1_16x4 = vbsl_s16(thresholdmask1_16x4, zeros_16x4, x1_16x4);
    xx2_16x4 = vbsl_s16(thresholdmask2_16x4, zeros_16x4, x2_16x4);
    xx3_16x4 = vbsl_s16(thresholdmask3_16x4, zeros_16x4, x3_16x4);

    /* Store Quantized outputs */
    vst1_s16(&pi2_out[0 * i4_out_stride], xx0_16x4);
    vst1_s16(&pi2_out[1 * i4_out_stride], xx1_16x4);
    vst1_s16(&pi2_out[2 * i4_out_stride], xx2_16x4);
    vst1_s16(&pi2_out[3 * i4_out_stride], xx3_16x4);

    /* NNZ calculation */

    temp0_u16x4 = vceq_s16(xx0_16x4, zeros_16x4);
    temp1_u16x4 = vceq_s16(xx1_16x4, zeros_16x4);
    temp2_u16x4 = vceq_s16(xx2_16x4, zeros_16x4);
    temp3_u16x4 = vceq_s16(xx3_16x4, zeros_16x4);

    temp0_u16x8 = vcombine_u16(temp0_u16x4, temp2_u16x4);
    temp1_u16x8 = vcombine_u16(temp1_u16x4, temp3_u16x4);

    /* Conversion to 8 bit unsigned */
    temp0_u8x8 = vmovn_u16(temp0_u16x8);
    temp1_u8x8 = vmovn_u16(temp1_u16x8);

    temp0_u8x8 = vshr_n_u8(temp0_u8x8, 7);
    temp1_u8x8 = vshr_n_u8(temp1_u8x8, 7);

    temp0_u8x8 = vadd_u8(temp0_u8x8, temp1_u8x8);
    temp0_u8x8 = vpadd_u8(temp0_u8x8, temp1_u8x8);
    temp0_u8x8 = vpadd_u8(temp0_u8x8, temp1_u8x8);
    temp0_u8x8 = vpadd_u8(temp0_u8x8, temp1_u8x8);

    *pu1_nnz = 16 - vget_lane_u8(temp0_u8x8, 0);
}

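/*
 * Chroma variant of isvc_resi_trans_quant_4x4_with_residual_sub_neon:
 * interleaved chroma source and prediction are deinterleaved, the upsampled
 * inter-layer residual is subtracted from (src - pred), and the result is
 * clamped to [-255, 255] before the forward transform and quantization.
 */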
void isvc_resi_trans_quant_chroma_4x4_with_residual_sub_neon(
    buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_out,
    buffer_container_t *ps_upsampled_res, resi_trans_quant_constants_t *ps_quant_constants,
    UWORD8 *pu1_nnz, WORD16 *pi2_dc_out, UWORD8 u1_use_upsampled_res)
{
    UWORD8 *pu1_src = (UWORD8 *) ps_src->pv_data;
    UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
    WORD16 *pi2_out = (WORD16 *) ps_out->pv_data;
    WORD16 *pi2_upsampled_res = ps_upsampled_res ? (WORD16 *) ps_upsampled_res->pv_data : NULL;
    WORD32 i4_src_stride = ps_src->i4_data_stride;
    WORD32 i4_pred_stride = ps_pred->i4_data_stride;
    WORD32 i4_out_stride = ps_out->i4_data_stride;
    WORD32 i4_upsampled_res_stride = ps_upsampled_res ? ps_upsampled_res->i4_data_stride : 0;
    const UWORD16 *pu2_scale_matrix = ps_quant_constants->pu2_scale_matrix;
    const UWORD16 *pu2_threshold_matrix = ps_quant_constants->pu2_threshold_matrix;
    UWORD32 u4_qbits = ps_quant_constants->u4_qbits;
    UWORD32 u4_round_factor = ps_quant_constants->u4_round_factor;

    uint8x8_t src0, src1, src2, src3;
    uint8x8_t pred0, pred1, pred2, pred3;
    uint8x8x2_t tmp0, tmp1, tmp2, tmp3;
    uint8x8_t temp0_u8x8, temp1_u8x8;
    uint16x4_t temp0_u16x4, temp1_u16x4, temp2_u16x4, temp3_u16x4;
    uint16x4_t scale_mat0_16x4, scale_mat1_16x4, scale_mat2_16x4, scale_mat3_16x4;
    uint16x4_t threshold0_16x4, threshold1_16x4, threshold2_16x4, threshold3_16x4;
    uint16x4_t thresholdmask0_16x4, thresholdmask1_16x4, thresholdmask2_16x4, thresholdmask3_16x4;
    int16x4_t upres0_16x4, upres1_16x4, upres2_16x4, upres3_16x4;
    int16x4_t res0_16x4, res1_16x4, res2_16x4, res3_16x4;
    int16x4_t x0_16x4, x1_16x4, x2_16x4, x3_16x4;
    int16x4_t xx0_16x4, xx1_16x4, xx2_16x4, xx3_16x4;
    int16x4x2_t xx0_16x4x2, xx1_16x4x2;
    int16x4_t temp0_16x4, temp1_16x4, temp2_16x4, temp3_16x4;
    uint16x8_t res0_16x8, res1_16x8, res2_16x8, res3_16x8;
    uint16x8_t temp0_u16x8, temp1_u16x8;
    int32x2x2_t x0_32x2x2, x1_32x2x2;
    int32x4_t tx0_32x4, tx1_32x4, tx2_32x4, tx3_32x4;

    int32x4_t rnd_factor_32x4 = vdupq_n_s32(u4_round_factor);
    int32x4_t qbits_32x4 = vdupq_n_s32(u4_qbits);
    int16x4_t zeros_16x4 = vdup_n_s16(0);
    int16x4_t pos_255_16x4 = vdup_n_s16(((WORD16) UINT8_MAX));
    int16x4_t neg_255_16x4 = vdup_n_s16(-((WORD16) UINT8_MAX));

    UNUSED(u1_use_upsampled_res);

    threshold0_16x4 = vld1_u16(pu2_threshold_matrix);
    threshold1_16x4 = vld1_u16(pu2_threshold_matrix + 4);
    threshold2_16x4 = vld1_u16(pu2_threshold_matrix + 8);
    threshold3_16x4 = vld1_u16(pu2_threshold_matrix + 12);

    scale_mat0_16x4 = vld1_u16(pu2_scale_matrix);
    scale_mat1_16x4 = vld1_u16(pu2_scale_matrix + 4);
    scale_mat2_16x4 = vld1_u16(pu2_scale_matrix + 8);
    scale_mat3_16x4 = vld1_u16(pu2_scale_matrix + 12);

    src0 = vld1_u8(&pu1_src[0 * i4_src_stride]);
    src1 = vld1_u8(&pu1_src[1 * i4_src_stride]);
    src2 = vld1_u8(&pu1_src[2 * i4_src_stride]);
    src3 = vld1_u8(&pu1_src[3 * i4_src_stride]);

    /* deinterleaving source buffer */
    tmp0 = vuzp_u8(src0, src0);
    tmp1 = vuzp_u8(src1, src1);
    tmp2 = vuzp_u8(src2, src2);
    tmp3 = vuzp_u8(src3, src3);

    src0 = tmp0.val[0];
    src1 = tmp1.val[0];
    src2 = tmp2.val[0];
    src3 = tmp3.val[0];

    pred0 = vld1_u8(&pu1_pred[0 * i4_pred_stride]);
    pred1 = vld1_u8(&pu1_pred[1 * i4_pred_stride]);
    pred2 = vld1_u8(&pu1_pred[2 * i4_pred_stride]);
    pred3 = vld1_u8(&pu1_pred[3 * i4_pred_stride]);

    /* deinterleaving pred buffer */
    tmp0 = vuzp_u8(pred0, pred0);
    tmp1 = vuzp_u8(pred1, pred1);
    tmp2 = vuzp_u8(pred2, pred2);
    tmp3 = vuzp_u8(pred3, pred3);

    pred0 = tmp0.val[0];
    pred1 = tmp1.val[0];
    pred2 = tmp2.val[0];
    pred3 = tmp3.val[0];

    /* calculate res = src - pred */
    res0_16x8 = vsubl_u8(src0, pred0);
    res1_16x8 = vsubl_u8(src1, pred1);
    res2_16x8 = vsubl_u8(src2, pred2);
    res3_16x8 = vsubl_u8(src3, pred3);

    res0_16x4 = vreinterpret_s16_u16(vget_low_u16(res0_16x8));
    res1_16x4 = vreinterpret_s16_u16(vget_low_u16(res1_16x8));
    res2_16x4 = vreinterpret_s16_u16(vget_low_u16(res2_16x8));
    res3_16x4 = vreinterpret_s16_u16(vget_low_u16(res3_16x8));

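    /* As in the luma variant, pi2_upsampled_res is assumed valid here; the
     * residual rows are read as four contiguous WORD16 values without any
     * deinterleaving */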
    /* Load upsampled res */
    upres0_16x4 = vld1_s16(&pi2_upsampled_res[0 * i4_upsampled_res_stride]);
    upres1_16x4 = vld1_s16(&pi2_upsampled_res[1 * i4_upsampled_res_stride]);
    upres2_16x4 = vld1_s16(&pi2_upsampled_res[2 * i4_upsampled_res_stride]);
    upres3_16x4 = vld1_s16(&pi2_upsampled_res[3 * i4_upsampled_res_stride]);

    /* subtract upsampled res from (src - pred) to obtain final res */
    res0_16x4 = vsub_s16(res0_16x4, upres0_16x4);
    res1_16x4 = vsub_s16(res1_16x4, upres1_16x4);
    res2_16x4 = vsub_s16(res2_16x4, upres2_16x4);
    res3_16x4 = vsub_s16(res3_16x4, upres3_16x4);

    /* Clamp all values below -255 to -255 and leave the rest unchanged */
    res0_16x4 = vmax_s16(res0_16x4, neg_255_16x4);
    res1_16x4 = vmax_s16(res1_16x4, neg_255_16x4);
    res2_16x4 = vmax_s16(res2_16x4, neg_255_16x4);
    res3_16x4 = vmax_s16(res3_16x4, neg_255_16x4);

    /* Clamp all values above 255 to 255 and leave the rest unchanged */
    res0_16x4 = vmin_s16(res0_16x4, pos_255_16x4);
    res1_16x4 = vmin_s16(res1_16x4, pos_255_16x4);
    res2_16x4 = vmin_s16(res2_16x4, pos_255_16x4);
    res3_16x4 = vmin_s16(res3_16x4, pos_255_16x4);

    /* Perform Forward transform */
    /*-------------------------------------------------------------*/
    /* DCT [ Horizontal transformation ] */
    /*-------------------------------------------------------------*/
    /* Matrix transpose */
    /*
     *  a0 a1 a2 a3
     *  b0 b1 b2 b3
     *  c0 c1 c2 c3
     *  d0 d1 d2 d3
     */

    xx0_16x4x2 = vtrn_s16(res0_16x4, res1_16x4);
    xx1_16x4x2 = vtrn_s16(res2_16x4, res3_16x4);
    x0_32x2x2 =
        vtrn_s32(vreinterpret_s32_s16(xx0_16x4x2.val[0]), vreinterpret_s32_s16(xx1_16x4x2.val[0]));
    x1_32x2x2 =
        vtrn_s32(vreinterpret_s32_s16(xx0_16x4x2.val[1]), vreinterpret_s32_s16(xx1_16x4x2.val[1]));

    x0_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[0]);
    x1_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[0]);
    x2_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[1]);
    x3_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[1]);

    xx0_16x4 = vadd_s16(x0_16x4, x3_16x4);
    xx1_16x4 = vadd_s16(x1_16x4, x2_16x4);
    xx2_16x4 = vsub_s16(x1_16x4, x2_16x4);
    xx3_16x4 = vsub_s16(x0_16x4, x3_16x4);

    x0_16x4 = vadd_s16(xx0_16x4, xx1_16x4);
    temp0_16x4 = vshl_n_s16(xx3_16x4, 1);
    x1_16x4 = vadd_s16(xx2_16x4, temp0_16x4);

    x2_16x4 = vsub_s16(xx0_16x4, xx1_16x4);
    temp0_16x4 = vshl_n_s16(xx2_16x4, 1);
    x3_16x4 = vsub_s16(xx3_16x4, temp0_16x4);

    /* Matrix transpose */
    /*
     *  a0 b0 c0 d0
     *  a1 b1 c1 d1
     *  a2 b2 c2 d2
     *  a3 b3 c3 d3
     */

    xx0_16x4x2 = vtrn_s16(x0_16x4, x1_16x4);
    xx1_16x4x2 = vtrn_s16(x2_16x4, x3_16x4);
    x0_32x2x2 =
        vtrn_s32(vreinterpret_s32_s16(xx0_16x4x2.val[0]), vreinterpret_s32_s16(xx1_16x4x2.val[0]));
    x1_32x2x2 =
        vtrn_s32(vreinterpret_s32_s16(xx0_16x4x2.val[1]), vreinterpret_s32_s16(xx1_16x4x2.val[1]));

    x0_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[0]);
    x1_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[0]);
    x2_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[1]);
    x3_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[1]);

    /* Vertical Transformation */

    xx0_16x4 = vadd_s16(x0_16x4, x3_16x4);
    xx1_16x4 = vadd_s16(x1_16x4, x2_16x4);
    xx2_16x4 = vsub_s16(x1_16x4, x2_16x4);
    xx3_16x4 = vsub_s16(x0_16x4, x3_16x4);

    x0_16x4 = vadd_s16(xx0_16x4, xx1_16x4);
    temp0_16x4 = vshl_n_s16(xx3_16x4, 1);
    x1_16x4 = vadd_s16(temp0_16x4, xx2_16x4);

    x2_16x4 = vsub_s16(xx0_16x4, xx1_16x4);
    temp0_16x4 = vshl_n_s16(xx2_16x4, 1);
    x3_16x4 = vsub_s16(xx3_16x4, temp0_16x4);

    /* lane 0 holds the DC coefficient; return it unquantized via pi2_dc_out */
    *pi2_dc_out = vget_lane_s16(x0_16x4, 0);

    xx0_16x4 = vabs_s16(x0_16x4);
    xx1_16x4 = vabs_s16(x1_16x4);
    xx2_16x4 = vabs_s16(x2_16x4);
    xx3_16x4 = vabs_s16(x3_16x4);

    /* compare with zero for getting sign */
    temp0_u16x4 = vcgt_s16(x0_16x4, zeros_16x4);
    temp1_u16x4 = vcgt_s16(x1_16x4, zeros_16x4);
    temp2_u16x4 = vcgt_s16(x2_16x4, zeros_16x4);
    temp3_u16x4 = vcgt_s16(x3_16x4, zeros_16x4);

    /* compare abs values against the threshold matrix */
    thresholdmask0_16x4 = vcgt_s16(vreinterpret_s16_u16(threshold0_16x4), xx0_16x4);
    thresholdmask1_16x4 = vcgt_s16(vreinterpret_s16_u16(threshold1_16x4), xx1_16x4);
    thresholdmask2_16x4 = vcgt_s16(vreinterpret_s16_u16(threshold2_16x4), xx2_16x4);
    thresholdmask3_16x4 = vcgt_s16(vreinterpret_s16_u16(threshold3_16x4), xx3_16x4);

    /* Multiply abs values obtained with scaling matrix */
    tx0_32x4 = vmull_s16(xx0_16x4, vreinterpret_s16_u16(scale_mat0_16x4));
    tx1_32x4 = vmull_s16(xx1_16x4, vreinterpret_s16_u16(scale_mat1_16x4));
    tx2_32x4 = vmull_s16(xx2_16x4, vreinterpret_s16_u16(scale_mat2_16x4));
    tx3_32x4 = vmull_s16(xx3_16x4, vreinterpret_s16_u16(scale_mat3_16x4));

    tx0_32x4 = vaddq_s32(tx0_32x4, rnd_factor_32x4);
    tx1_32x4 = vaddq_s32(tx1_32x4, rnd_factor_32x4);
    tx2_32x4 = vaddq_s32(tx2_32x4, rnd_factor_32x4);
    tx3_32x4 = vaddq_s32(tx3_32x4, rnd_factor_32x4);

    qbits_32x4 = vnegq_s32(qbits_32x4);

    tx0_32x4 = vshlq_s32(tx0_32x4, qbits_32x4);
    tx1_32x4 = vshlq_s32(tx1_32x4, qbits_32x4);
    tx2_32x4 = vshlq_s32(tx2_32x4, qbits_32x4);
    tx3_32x4 = vshlq_s32(tx3_32x4, qbits_32x4);

    /* Conversion to 16 bits signed */
    temp0_16x4 = vmovn_s32(tx0_32x4);
    temp1_16x4 = vmovn_s32(tx1_32x4);
    temp2_16x4 = vmovn_s32(tx2_32x4);
    temp3_16x4 = vmovn_s32(tx3_32x4);

    x0_16x4 = vneg_s16(temp0_16x4);
    x1_16x4 = vneg_s16(temp1_16x4);
    x2_16x4 = vneg_s16(temp2_16x4);
    x3_16x4 = vneg_s16(temp3_16x4);

    /* Restore sign */
    x0_16x4 = vbsl_s16(temp0_u16x4, temp0_16x4, x0_16x4);
    x1_16x4 = vbsl_s16(temp1_u16x4, temp1_16x4, x1_16x4);
    x2_16x4 = vbsl_s16(temp2_u16x4, temp2_16x4, x2_16x4);
    x3_16x4 = vbsl_s16(temp3_u16x4, temp3_16x4, x3_16x4);

    /* Thresholding */
    xx0_16x4 = vbsl_s16(thresholdmask0_16x4, zeros_16x4, x0_16x4);
    xx1_16x4 = vbsl_s16(thresholdmask1_16x4, zeros_16x4, x1_16x4);
    xx2_16x4 = vbsl_s16(thresholdmask2_16x4, zeros_16x4, x2_16x4);
    xx3_16x4 = vbsl_s16(thresholdmask3_16x4, zeros_16x4, x3_16x4);

    /* Store Quantized outputs */
    vst1_s16(&pi2_out[0 * i4_out_stride], xx0_16x4);
    vst1_s16(&pi2_out[1 * i4_out_stride], xx1_16x4);
    vst1_s16(&pi2_out[2 * i4_out_stride], xx2_16x4);
    vst1_s16(&pi2_out[3 * i4_out_stride], xx3_16x4);

    /* NNZ calculation */

    temp0_u16x4 = vceq_s16(xx0_16x4, zeros_16x4);
    temp1_u16x4 = vceq_s16(xx1_16x4, zeros_16x4);
    temp2_u16x4 = vceq_s16(xx2_16x4, zeros_16x4);
    temp3_u16x4 = vceq_s16(xx3_16x4, zeros_16x4);

    temp0_u16x8 = vcombine_u16(temp0_u16x4, temp2_u16x4);
    temp1_u16x8 = vcombine_u16(temp1_u16x4, temp3_u16x4);

    /* Conversion to 8 bit unsigned */
    temp0_u8x8 = vmovn_u16(temp0_u16x8);
    temp1_u8x8 = vmovn_u16(temp1_u16x8);

    temp0_u8x8 = vshr_n_u8(temp0_u8x8, 7);
    temp1_u8x8 = vshr_n_u8(temp1_u8x8, 7);

    temp0_u8x8 = vadd_u8(temp0_u8x8, temp1_u8x8);
    temp0_u8x8 = vpadd_u8(temp0_u8x8, temp1_u8x8);
    temp0_u8x8 = vpadd_u8(temp0_u8x8, temp1_u8x8);
    temp0_u8x8 = vpadd_u8(temp0_u8x8, temp1_u8x8);

    *pu1_nnz = 16 - vget_lane_u8(temp0_u8x8, 0);
}