1 /******************************************************************************
2 *
3 * Copyright (C) 2022 The Android Open Source Project
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at:
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 *
17 *****************************************************************************
18 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19 */
20 /**
21 *******************************************************************************
22 * @file
23 * isvc_iquant_itrans_recon_sse42.c
24 *
25 * @brief
26 * Contains function definitions for inverse quantization, inverse
27 * transform and reconstruction
28 *
29 * @author
30 * Mohit [100664]
31 *
32 * @par List of Functions:
 *  - isvc_iquant_itrans_recon_4x4_sse42()
 *  - isvc_iquant_itrans_recon_res_4x4_sse42()
 *  - isvc_iquant_itrans_recon_res_4x4_with_res_acc_sse42()
 *  - isvc_iquant_itrans_recon_res_chroma_4x4_sse42()
 *  - isvc_iquant_itrans_recon_res_chroma_4x4_with_res_acc_sse42()
35 *
36 * @remarks
37 * None
38 *
39 *******************************************************************************
40 */
41 #include <immintrin.h>
42
43 #include "ih264_typedefs.h"
44 #include "ih264_debug.h"
45 #include "ih264_defs.h"
46 #include "ih264_trans_macros.h"
47 #include "ih264_macros.h"
48 #include "ih264_platform_macros.h"
49 #include "ih264_trans_data.h"
50 #include "ih264_size_defs.h"
51 #include "isvc_structs.h"
52 #include "isvc_trans_quant_itrans_iquant.h"
53
54 /*
55 ********************************************************************************
56 *
 * @brief This function reconstructs a 4x4 sub-block from the quantized residue
 * and the prediction buffer
 *
 * @par Description:
 * The quantized residue is first inverse quantized and then inverse
 * transformed. The inverse transformed residue is added to the prediction
 * buffer to reconstruct the output
 *
 * @param[in] ps_src
 *  Buffer container holding the quantized 4x4 block
 *
 * @param[in] ps_pred
 *  Buffer container holding the 4x4 prediction block
 *
 * @param[in] ps_res_pred
 *  Buffer container holding the residual prediction (unused in this variant)
 *
 * @param[in] ps_res
 *  Buffer container for the residual output (unused in this variant)
 *
 * @param[out] ps_rec
 *  Buffer container for the reconstructed 4x4 block
 *
 * @param[in] ps_iq_it_res_rec_constants
 *  Holds the inverse scaling matrix, the dequant matrix and Floor(qp / 6)
 *
 * @param[in] pi2_tmp
 *  Temporary buffer of size 1 x 16
 *
 * @param[in] pi2_dc_src
 *  Pointer to the dequantized DC coefficient, used when i4_iq_start_idx is 1
 *
 * @param[in] i4_iq_start_idx
 *  Start index for inverse quantization; 1 implies the DC coefficient is
 *  taken from pi2_dc_src
 *
 * @param[in] u1_res_accumulate
 *  Residual accumulation flag; must be 0 for this variant
 *
 * @returns none
 *
 * @remarks none
98 *
99 *******************************************************************************
100 */
101
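/*
 * For readability, a minimal scalar sketch of what the SIMD path below
 * computes per coefficient (illustrative only, not part of the build; names
 * mirror the arguments of the function that follows):
 *
 *     WORD32 q = pi2_src[i] * pu2_iscal_mat[i] * pu2_weigh_mat[i];
 *
 *     if(u4_qp_div_6 >= 4)
 *         q <<= (u4_qp_div_6 - 4);
 *     else
 *         q = (q + (1 << (3 - u4_qp_div_6))) >> (4 - u4_qp_div_6);
 *
 * The dequantized block then goes through the H.264 4x4 inverse transform
 * butterfly (rows first, then columns), a (x + 32) >> 6 rounding shift,
 * addition of the prediction samples and a clip to [0, 255].
 */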
void isvc_iquant_itrans_recon_4x4_sse42(buffer_container_t *ps_src, buffer_container_t *ps_pred,
                                        buffer_container_t *ps_res_pred, buffer_container_t *ps_res,
                                        buffer_container_t *ps_rec,
                                        iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants,
                                        WORD16 *pi2_tmp, WORD16 *pi2_dc_src, WORD32 i4_iq_start_idx,
                                        UWORD8 u1_res_accumulate)
108 {
109 WORD16 *pi2_src = (WORD16 *) ps_src->pv_data;
110 WORD16 *pi2_tmp_ptr = pi2_tmp;
111 UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
112 UWORD8 *pu1_out = (UWORD8 *) ps_rec->pv_data;
113 WORD32 i4_src_stride = ps_src->i4_data_stride;
114 WORD32 i4_pred_stride = ps_pred->i4_data_stride;
115 WORD32 i4_out_stride = ps_rec->i4_data_stride;
116 const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat;
117 const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat;
118 UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6;
119 UWORD32 *pu4_out = (UWORD32 *) pu1_out;
120 __m128i src_r0_r1, src_r2_r3;
121 __m128i src_r0, src_r1, src_r2, src_r3;
122 __m128i scalemat_r0_r1, scalemat_r2_r3;
123 __m128i pred_r0, pred_r1, pred_r2, pred_r3;
124 __m128i sign_reg, dequant_r0_r1, dequant_r2_r3;
125 /* all bits reset to zero */
126 __m128i zero_8x16b = _mm_setzero_si128();
127 __m128i neg_255_8x16b = _mm_set1_epi16(-((WORD16) UINT8_MAX));
128 __m128i pos_255_8x16b = _mm_set1_epi16(((WORD16) UINT8_MAX));
129 __m128i temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
130 __m128i resq_r0, resq_r1, resq_r2, resq_r3;
131 __m128i add_rshift = _mm_set1_epi32((u4_qp_div_6 < 4) ? (1 << (3 - u4_qp_div_6)) : 0);
132 __m128i value_32 = _mm_set1_epi32(32);
133
134 ASSERT(4 == i4_src_stride);
135 ASSERT(0 == u1_res_accumulate);
136
137 UNUSED(i4_src_stride);
138 UNUSED(ps_res);
139 UNUSED(ps_res_pred);
140 UNUSED(u1_res_accumulate);
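
    /* This variant only reconstructs; it neither emits a residual block nor */
    /* accumulates a residual prediction, hence ps_res, ps_res_pred and      */
    /* u1_res_accumulate are unused                                          */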
141
    /*************************************************************/
    /* Dequantization of coefficients                            */
    /*************************************************************/
146
147 /* a00 a01 a02 a03 a10 a11 a12 a13 -- the source
148 matrix 0th,1st row */
149 src_r0_r1 = _mm_loadu_si128((__m128i *) (pi2_src));
150
151 /* a20 a21 a22 a23 a30 a31 a32 a33 -- the
152 source matrix 2nd,3rd row */
153 src_r2_r3 = _mm_loadu_si128((__m128i *) (pi2_src + 8));
154
155 /* b00 b01 b02 b03 b10 b11 b12 b13 -- the
156 scaling matrix 0th,1st row */
157 scalemat_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_iscal_mat));
158
    /* b20 b21 b22 b23 b30 b31 b32 b33 -- the
       scaling matrix 2nd,3rd row */
161 scalemat_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_iscal_mat + 8));
162
163 /* q00 q01 q02 q03 q10 q11
164 q12 q13 -- all 16 bits */
165 dequant_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_weigh_mat));
166
167 /* q20 q21 q22 q23 q30 q31
168 q32 q33 -- all 16 bits */
169 dequant_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_weigh_mat + 8));
170
171 /* b00*q00 b01*q01 b02*q02 b03*q03 b10*q10 b11*q11
172 b12*q12 b13*q13 -- 16 bit result */
173 temp0 = _mm_mullo_epi16(scalemat_r0_r1, dequant_r0_r1);
174
175 /* b20*q20 b21*q21 b22*q22 b23*q23 b30*q30 b31*q31
176 b32*q32 b33*q33 -- 16 bit result */
177 temp1 = _mm_mullo_epi16(scalemat_r2_r3, dequant_r2_r3);
178
179 /* b00*q00 0 b01*q01 0 b02*q02 0 b03*q03 0 -- 16 bit long */
180 temp4 = _mm_unpacklo_epi16(temp0, zero_8x16b);
181
182 /* b10*q10 0 b11*q11 0 b12*q12 0 b13*q13 0 -- 16 bit long */
183 temp5 = _mm_unpackhi_epi16(temp0, zero_8x16b);
184
    /* b20*q20 0 b21*q21 0 b22*q22 0 b23*q23 0 -- 16 bit long */
186 temp6 = _mm_unpacklo_epi16(temp1, zero_8x16b);
187
    /* b30*q30 0 b31*q31 0 b32*q32 0 b33*q33 0 -- 16 bit long */
189 temp7 = _mm_unpackhi_epi16(temp1, zero_8x16b);
190
191 /* a00 0 a01 0 a02 0 a03 0 -- 16 bit long */
192 src_r0 = _mm_unpacklo_epi16(src_r0_r1, zero_8x16b);
193 /* a10 0 a11 0 a12 0 a13 0 -- 16 bit long */
194 src_r1 = _mm_unpackhi_epi16(src_r0_r1, zero_8x16b);
195 /* a20 0 a21 0 a22 0 a23 0 -- 16 bit long */
196 src_r2 = _mm_unpacklo_epi16(src_r2_r3, zero_8x16b);
197 /* a30 0 a31 0 a32 0 a33 0 -- 16 bit long */
198 src_r3 = _mm_unpackhi_epi16(src_r2_r3, zero_8x16b);
199
200 temp4 = _mm_madd_epi16(src_r0, temp4);
201 temp5 = _mm_madd_epi16(src_r1, temp5);
202 temp6 = _mm_madd_epi16(src_r2, temp6);
203 temp7 = _mm_madd_epi16(src_r3, temp7);
204
205 if(u4_qp_div_6 >= 4)
206 {
207 resq_r0 = _mm_slli_epi32(temp4, u4_qp_div_6 - 4);
208 resq_r1 = _mm_slli_epi32(temp5, u4_qp_div_6 - 4);
209 resq_r2 = _mm_slli_epi32(temp6, u4_qp_div_6 - 4);
210 resq_r3 = _mm_slli_epi32(temp7, u4_qp_div_6 - 4);
211 }
212 else
213 {
214 temp4 = _mm_add_epi32(temp4, add_rshift);
215 temp5 = _mm_add_epi32(temp5, add_rshift);
216 temp6 = _mm_add_epi32(temp6, add_rshift);
217 temp7 = _mm_add_epi32(temp7, add_rshift);
218 resq_r0 = _mm_srai_epi32(temp4, 4 - u4_qp_div_6);
219 resq_r1 = _mm_srai_epi32(temp5, 4 - u4_qp_div_6);
220 resq_r2 = _mm_srai_epi32(temp6, 4 - u4_qp_div_6);
221 resq_r3 = _mm_srai_epi32(temp7, 4 - u4_qp_div_6);
222 }
223
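    /* When the DC coefficient is decoded through a separate DC transform  */
    /* path (i4_iq_start_idx == 1, e.g. intra 16x16 luma), the already     */
    /* dequantized DC value is picked up from pi2_dc_src instead           */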
224 if(i4_iq_start_idx == 1) resq_r0 = _mm_insert_epi32(resq_r0, (WORD32) pi2_dc_src[0], 0);
225 /* Perform Inverse transform */
226 /*-------------------------------------------------------------*/
227 /* IDCT [ Horizontal transformation ] */
228 /*-------------------------------------------------------------*/
229 // Matrix transpose
230 /*
231 * a0 a1 a2 a3
232 * b0 b1 b2 b3
233 * c0 c1 c2 c3
234 * d0 d1 d2 d3
235 */
236
237 /* a0 b0 a1 b1 */
238 temp1 = _mm_unpacklo_epi32(resq_r0, resq_r1);
239 /* c0 d0 c1 d1 */
240 temp3 = _mm_unpacklo_epi32(resq_r2, resq_r3);
241 /* a2 b2 a3 b3 */
242 temp2 = _mm_unpackhi_epi32(resq_r0, resq_r1);
243 /* c2 d2 c3 d3 */
244 temp4 = _mm_unpackhi_epi32(resq_r2, resq_r3);
245 /* a0 b0 c0 d0 */
246 resq_r0 = _mm_unpacklo_epi64(temp1, temp3);
247 /* a1 b1 c1 d1 */
248 resq_r1 = _mm_unpackhi_epi64(temp1, temp3);
249 /* a2 b2 c2 d2 */
250 resq_r2 = _mm_unpacklo_epi64(temp2, temp4);
251 /* a3 b3 c3 d3 */
252 resq_r3 = _mm_unpackhi_epi64(temp2, temp4);
253 /* Transform starts -- horizontal transform */
254 /*------------------------------------------------------------------*/
255 /* z0 = w0 + w2 */
256 temp0 = _mm_add_epi32(resq_r0, resq_r2);
257 /* z1 = w0 - w2 */
258 temp1 = _mm_sub_epi32(resq_r0, resq_r2);
259 /* z2 = (w1 >> 1) - w3 */
260 temp2 = _mm_srai_epi32(resq_r1, 1);
261 temp2 = _mm_sub_epi32(temp2, resq_r3);
262 /* z3 = w1 + (w3 >> 1) */
263 temp3 = _mm_srai_epi32(resq_r3, 1);
264 temp3 = _mm_add_epi32(temp3, resq_r1);
265 /*----------------------------------------------------------*/
266 /* x0 = z0 + z3 */
267 resq_r0 = _mm_add_epi32(temp0, temp3);
268 /* x1 = z1 + z2 */
269 resq_r1 = _mm_add_epi32(temp1, temp2);
270 /* x2 = z1 - z2 */
271 resq_r2 = _mm_sub_epi32(temp1, temp2);
272 /* x3 = z0 - z3 */
273 resq_r3 = _mm_sub_epi32(temp0, temp3);
274
275 // Matrix transpose
276 /*
277 * a0 b0 c0 d0
278 * a1 b1 c1 d1
279 * a2 b2 c2 d2
280 * a3 b3 c3 d3
281 */
282
283 /* a0 a1 b0 b1 */
284 temp1 = _mm_unpacklo_epi32(resq_r0, resq_r1);
285 /* a2 a3 b2 b3 */
286 temp3 = _mm_unpacklo_epi32(resq_r2, resq_r3);
287 /* c0 c1 d0 d1 */
288 temp2 = _mm_unpackhi_epi32(resq_r0, resq_r1);
289 /* c2 c3 d2 d3 */
290 temp4 = _mm_unpackhi_epi32(resq_r2, resq_r3);
291 /* a0 a1 a2 a3 */
292 resq_r0 = _mm_unpacklo_epi64(temp1, temp3);
293 /* b0 b1 b2 b3 */
294 resq_r1 = _mm_unpackhi_epi64(temp1, temp3);
295 /* c0 c1 c2 c3 */
296 resq_r2 = _mm_unpacklo_epi64(temp2, temp4);
297 /* d0 d1 d2 d3 */
298 resq_r3 = _mm_unpackhi_epi64(temp2, temp4);
299 /* Transform ends -- horizontal transform */
300
301 temp0 = _mm_packs_epi32(resq_r0, resq_r1);
302 temp1 = _mm_packs_epi32(resq_r2, resq_r3);
303
304 _mm_storeu_si128((__m128i *) (&pi2_tmp_ptr[0]), temp0);
305 _mm_storeu_si128((__m128i *) (&pi2_tmp_ptr[2 * 4]), temp1);
306
307 /* Load pred buffer */
308 pred_r0 = _mm_loadl_epi64((__m128i *) (&pu1_pred[0]));
309 pred_r1 = _mm_loadl_epi64((__m128i *) (&pu1_pred[i4_pred_stride]));
310 pred_r2 = _mm_loadl_epi64((__m128i *) (&pu1_pred[2 * i4_pred_stride]));
311 pred_r3 = _mm_loadl_epi64((__m128i *) (&pu1_pred[3 * i4_pred_stride]));
312
313 pred_r0 = _mm_cvtepu8_epi16(pred_r0);
314 pred_r1 = _mm_cvtepu8_epi16(pred_r1);
315 pred_r2 = _mm_cvtepu8_epi16(pred_r2);
316 pred_r3 = _mm_cvtepu8_epi16(pred_r3);
317
318 pred_r0 = _mm_unpacklo_epi64(pred_r0, pred_r1);
319 pred_r1 = _mm_unpacklo_epi64(pred_r2, pred_r3);
320
321 /*--------------------------------------------------------------*/
322 /* IDCT [ Vertical transformation] and Xij = (xij + 32)>>6 */
323 /* */
324 /* Add the prediction and store it back to same buffer */
325 /*--------------------------------------------------------------*/
326 /* z0j = y0j + y2j */
327 temp0 = _mm_add_epi32(resq_r0, resq_r2);
328 /* z1j = y0j - y2j */
329 temp1 = _mm_sub_epi32(resq_r0, resq_r2);
330 /* z2j = (y1j>>1) - y3j */
331 temp2 = _mm_srai_epi32(resq_r1, 1);
332 temp2 = _mm_sub_epi32(temp2, resq_r3);
333 /* z3j = y1j + (y3j>>1) */
334 temp3 = _mm_srai_epi32(resq_r3, 1);
335 temp3 = _mm_add_epi32(temp3, resq_r1);
336
337 /* x0j = z0j + z3j */
338 temp4 = _mm_add_epi32(temp0, temp3);
339 temp4 = _mm_add_epi32(temp4, value_32);
340 temp4 = _mm_srai_epi32(temp4, 6);
341 /* x1j = z1j + z2j */
342 temp5 = _mm_add_epi32(temp1, temp2);
343 temp5 = _mm_add_epi32(temp5, value_32);
344 temp5 = _mm_srai_epi32(temp5, 6);
345 /* x2j = z1j - z2j */
346 temp6 = _mm_sub_epi32(temp1, temp2);
347 temp6 = _mm_add_epi32(temp6, value_32);
348 temp6 = _mm_srai_epi32(temp6, 6);
349 /* x3j = z0j - z3j */
350 temp7 = _mm_sub_epi32(temp0, temp3);
351 temp7 = _mm_add_epi32(temp7, value_32);
352 temp7 = _mm_srai_epi32(temp7, 6);
353
354 /* 32-bit to 16-bit conversion */
355 temp0 = _mm_packs_epi32(temp4, temp5);
356 temp1 = _mm_packs_epi32(temp6, temp7);
357
358 /* Saturate all values < -255 to -255 and retain the rest as it is */
359 temp4 = _mm_max_epi16(temp0, neg_255_8x16b);
360 /* Saturate all values > 255 to 255 and retain the rest as it is */
361 temp4 = _mm_min_epi16(temp4, pos_255_8x16b);
362
363 /* Saturate all values < -255 to -255 and retain the rest as it is */
364 temp5 = _mm_max_epi16(temp1, neg_255_8x16b);
365 /* Saturate all values > 255 to 255 and retain the rest as it is */
366 temp5 = _mm_min_epi16(temp5, pos_255_8x16b);
367
368 temp0 = _mm_add_epi16(temp4, pred_r0);
369 temp1 = _mm_add_epi16(temp5, pred_r1);
370
371 /*------------------------------------------------------------------*/
372 /* Clipping the results to 8 bits */
373 sign_reg = _mm_cmpgt_epi16(temp0, zero_8x16b);
374 temp0 = _mm_and_si128(temp0, sign_reg);
375 sign_reg = _mm_cmpgt_epi16(temp1, zero_8x16b);
376 temp1 = _mm_and_si128(temp1, sign_reg);
377
378 resq_r0 = _mm_packus_epi16(temp0, temp1);
379 resq_r1 = _mm_srli_si128(resq_r0, 4);
380 resq_r2 = _mm_srli_si128(resq_r1, 4);
381 resq_r3 = _mm_srli_si128(resq_r2, 4);
382
383 *pu4_out = _mm_cvtsi128_si32(resq_r0);
384 pu1_out += i4_out_stride;
385 pu4_out = (UWORD32 *) (pu1_out);
386 *(pu4_out) = _mm_cvtsi128_si32(resq_r1);
387 pu1_out += i4_out_stride;
388 pu4_out = (UWORD32 *) (pu1_out);
389 *(pu4_out) = _mm_cvtsi128_si32(resq_r2);
390 pu1_out += i4_out_stride;
391 pu4_out = (UWORD32 *) (pu1_out);
392 *(pu4_out) = _mm_cvtsi128_si32(resq_r3);
393 }
394
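/*
 ********************************************************************************
 *
 * @brief This function performs inverse quantization, inverse transform and
 * reconstruction of a 4x4 block, and additionally outputs the residue
 *
 * @par Description:
 * Identical to isvc_iquant_itrans_recon_4x4_sse42, except that the inverse
 * transformed residue, clipped to [-255, 255], is also written to ps_res
 * before the prediction is added. Residual accumulation is not supported
 * (u1_res_accumulate must be 0)
 *
 * @returns none
 *
 *******************************************************************************
 */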
void isvc_iquant_itrans_recon_res_4x4_sse42(buffer_container_t *ps_src, buffer_container_t *ps_pred,
                                            buffer_container_t *ps_res_pred,
                                            buffer_container_t *ps_res, buffer_container_t *ps_rec,
                                            iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants,
                                            WORD16 *pi2_tmp, WORD16 *pi2_dc_src,
                                            WORD32 i4_iq_start_idx, UWORD8 u1_res_accumulate)
401 {
402 WORD16 *pi2_src = (WORD16 *) ps_src->pv_data;
403 WORD16 *pi2_tmp_ptr = pi2_tmp;
404 WORD16 *pi2_res = (WORD16 *) ps_res->pv_data;
405 UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
406 UWORD8 *pu1_out = (UWORD8 *) ps_rec->pv_data;
407 WORD32 i4_src_stride = ps_src->i4_data_stride;
408 WORD32 i4_res_stride = ps_res->i4_data_stride;
409 WORD32 i4_pred_stride = ps_pred->i4_data_stride;
410 WORD32 i4_out_stride = ps_rec->i4_data_stride;
411 const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat;
412 const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat;
413 UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6;
414 UWORD32 *pu4_out = (UWORD32 *) pu1_out;
415 __m128i src_r0_r1, src_r2_r3;
416 __m128i src_r0, src_r1, src_r2, src_r3;
417 __m128i scalemat_r0_r1, scalemat_r2_r3;
418 __m128i pred_r0, pred_r1, pred_r2, pred_r3;
419 __m128i sign_reg, dequant_r0_r1, dequant_r2_r3;
420 /* all bits reset to zero */
421 __m128i zero_8x16b = _mm_setzero_si128();
422 __m128i neg_255_8x16b = _mm_set1_epi16(-((WORD16) UINT8_MAX));
423 __m128i pos_255_8x16b = _mm_set1_epi16(((WORD16) UINT8_MAX));
424 __m128i temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
425 __m128i resq_r0, resq_r1, resq_r2, resq_r3;
426 __m128i add_rshift = _mm_set1_epi32((u4_qp_div_6 < 4) ? (1 << (3 - u4_qp_div_6)) : 0);
427 __m128i value_32 = _mm_set1_epi32(32);
428
429 ASSERT(4 == i4_src_stride);
430 ASSERT(0 == u1_res_accumulate);
431
432 UNUSED(i4_src_stride);
433 UNUSED(ps_res_pred);
434 UNUSED(u1_res_accumulate);
435
    /*************************************************************/
    /* Dequantization of coefficients                            */
    /*************************************************************/
440
441 /* a00 a01 a02 a03 a10 a11 a12 a13 -- the source
442 matrix 0th,1st row */
443 src_r0_r1 = _mm_loadu_si128((__m128i *) (pi2_src));
444
445 /* a20 a21 a22 a23 a30 a31 a32 a33 -- the
446 source matrix 2nd,3rd row */
447 src_r2_r3 = _mm_loadu_si128((__m128i *) (pi2_src + 8));
448
449 /* b00 b01 b02 b03 b10 b11 b12 b13 -- the
450 scaling matrix 0th,1st row */
451 scalemat_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_iscal_mat));
452
    /* b20 b21 b22 b23 b30 b31 b32 b33 -- the
       scaling matrix 2nd,3rd row */
455 scalemat_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_iscal_mat + 8));
456
457 /* q00 q01 q02 q03 q10 q11
458 q12 q13 -- all 16 bits */
459 dequant_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_weigh_mat));
460
461 /* q20 q21 q22 q23 q30 q31
462 q32 q33 -- all 16 bits */
463 dequant_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_weigh_mat + 8));
464
465 /* b00*q00 b01*q01 b02*q02 b03*q03 b10*q10 b11*q11
466 b12*q12 b13*q13 -- 16 bit result */
467 temp0 = _mm_mullo_epi16(scalemat_r0_r1, dequant_r0_r1);
468
469 /* b20*q20 b21*q21 b22*q22 b23*q23 b30*q30 b31*q31
470 b32*q32 b33*q33 -- 16 bit result */
471 temp1 = _mm_mullo_epi16(scalemat_r2_r3, dequant_r2_r3);
472
473 /* b00*q00 0 b01*q01 0 b02*q02 0 b03*q03 0 -- 16 bit long */
474 temp4 = _mm_unpacklo_epi16(temp0, zero_8x16b);
475
476 /* b10*q10 0 b11*q11 0 b12*q12 0 b13*q13 0 -- 16 bit long */
477 temp5 = _mm_unpackhi_epi16(temp0, zero_8x16b);
478
    /* b20*q20 0 b21*q21 0 b22*q22 0 b23*q23 0 -- 16 bit long */
480 temp6 = _mm_unpacklo_epi16(temp1, zero_8x16b);
481
    /* b30*q30 0 b31*q31 0 b32*q32 0 b33*q33 0 -- 16 bit long */
483 temp7 = _mm_unpackhi_epi16(temp1, zero_8x16b);
484
485 /* a00 0 a01 0 a02 0 a03 0 -- 16 bit long */
486 src_r0 = _mm_unpacklo_epi16(src_r0_r1, zero_8x16b);
487 /* a10 0 a11 0 a12 0 a13 0 -- 16 bit long */
488 src_r1 = _mm_unpackhi_epi16(src_r0_r1, zero_8x16b);
489 /* a20 0 a21 0 a22 0 a23 0 -- 16 bit long */
490 src_r2 = _mm_unpacklo_epi16(src_r2_r3, zero_8x16b);
491 /* a30 0 a31 0 a32 0 a33 0 -- 16 bit long */
492 src_r3 = _mm_unpackhi_epi16(src_r2_r3, zero_8x16b);
493
494 temp4 = _mm_madd_epi16(src_r0, temp4);
495 temp5 = _mm_madd_epi16(src_r1, temp5);
496 temp6 = _mm_madd_epi16(src_r2, temp6);
497 temp7 = _mm_madd_epi16(src_r3, temp7);
498
499 if(u4_qp_div_6 >= 4)
500 {
501 resq_r0 = _mm_slli_epi32(temp4, u4_qp_div_6 - 4);
502 resq_r1 = _mm_slli_epi32(temp5, u4_qp_div_6 - 4);
503 resq_r2 = _mm_slli_epi32(temp6, u4_qp_div_6 - 4);
504 resq_r3 = _mm_slli_epi32(temp7, u4_qp_div_6 - 4);
505 }
506 else
507 {
508 temp4 = _mm_add_epi32(temp4, add_rshift);
509 temp5 = _mm_add_epi32(temp5, add_rshift);
510 temp6 = _mm_add_epi32(temp6, add_rshift);
511 temp7 = _mm_add_epi32(temp7, add_rshift);
512 resq_r0 = _mm_srai_epi32(temp4, 4 - u4_qp_div_6);
513 resq_r1 = _mm_srai_epi32(temp5, 4 - u4_qp_div_6);
514 resq_r2 = _mm_srai_epi32(temp6, 4 - u4_qp_div_6);
515 resq_r3 = _mm_srai_epi32(temp7, 4 - u4_qp_div_6);
516 }
517
518 if(i4_iq_start_idx == 1) resq_r0 = _mm_insert_epi32(resq_r0, (WORD32) pi2_dc_src[0], 0);
519 /* Perform Inverse transform */
520 /*-------------------------------------------------------------*/
521 /* IDCT [ Horizontal transformation ] */
522 /*-------------------------------------------------------------*/
523 // Matrix transpose
524 /*
525 * a0 a1 a2 a3
526 * b0 b1 b2 b3
527 * c0 c1 c2 c3
528 * d0 d1 d2 d3
529 */
530
531 /* a0 b0 a1 b1 */
532 temp1 = _mm_unpacklo_epi32(resq_r0, resq_r1);
533 /* c0 d0 c1 d1 */
534 temp3 = _mm_unpacklo_epi32(resq_r2, resq_r3);
535 /* a2 b2 a3 b3 */
536 temp2 = _mm_unpackhi_epi32(resq_r0, resq_r1);
537 /* c2 d2 c3 d3 */
538 temp4 = _mm_unpackhi_epi32(resq_r2, resq_r3);
539 /* a0 b0 c0 d0 */
540 resq_r0 = _mm_unpacklo_epi64(temp1, temp3);
541 /* a1 b1 c1 d1 */
542 resq_r1 = _mm_unpackhi_epi64(temp1, temp3);
543 /* a2 b2 c2 d2 */
544 resq_r2 = _mm_unpacklo_epi64(temp2, temp4);
545 /* a3 b3 c3 d3 */
546 resq_r3 = _mm_unpackhi_epi64(temp2, temp4);
547 /* Transform starts -- horizontal transform */
548 /*------------------------------------------------------------------*/
549 /* z0 = w0 + w2 */
550 temp0 = _mm_add_epi32(resq_r0, resq_r2);
551 /* z1 = w0 - w2 */
552 temp1 = _mm_sub_epi32(resq_r0, resq_r2);
553 /* z2 = (w1 >> 1) - w3 */
554 temp2 = _mm_srai_epi32(resq_r1, 1);
555 temp2 = _mm_sub_epi32(temp2, resq_r3);
556 /* z3 = w1 + (w3 >> 1) */
557 temp3 = _mm_srai_epi32(resq_r3, 1);
558 temp3 = _mm_add_epi32(temp3, resq_r1);
559 /*----------------------------------------------------------*/
560 /* x0 = z0 + z3 */
561 resq_r0 = _mm_add_epi32(temp0, temp3);
562 /* x1 = z1 + z2 */
563 resq_r1 = _mm_add_epi32(temp1, temp2);
564 /* x2 = z1 - z2 */
565 resq_r2 = _mm_sub_epi32(temp1, temp2);
566 /* x3 = z0 - z3 */
567 resq_r3 = _mm_sub_epi32(temp0, temp3);
568
569 // Matrix transpose
570 /*
571 * a0 b0 c0 d0
572 * a1 b1 c1 d1
573 * a2 b2 c2 d2
574 * a3 b3 c3 d3
575 */
576
577 /* a0 a1 b0 b1 */
578 temp1 = _mm_unpacklo_epi32(resq_r0, resq_r1);
579 /* a2 a3 b2 b3 */
580 temp3 = _mm_unpacklo_epi32(resq_r2, resq_r3);
581 /* c0 c1 d0 d1 */
582 temp2 = _mm_unpackhi_epi32(resq_r0, resq_r1);
583 /* c2 c3 d2 d3 */
584 temp4 = _mm_unpackhi_epi32(resq_r2, resq_r3);
585 /* a0 a1 a2 a3 */
586 resq_r0 = _mm_unpacklo_epi64(temp1, temp3);
587 /* b0 b1 b2 b3 */
588 resq_r1 = _mm_unpackhi_epi64(temp1, temp3);
589 /* c0 c1 c2 c3 */
590 resq_r2 = _mm_unpacklo_epi64(temp2, temp4);
591 /* d0 d1 d2 d3 */
592 resq_r3 = _mm_unpackhi_epi64(temp2, temp4);
593 /* Transform ends -- horizontal transform */
594
595 temp0 = _mm_packs_epi32(resq_r0, resq_r1);
596 temp1 = _mm_packs_epi32(resq_r2, resq_r3);
597
598 _mm_storeu_si128((__m128i *) (&pi2_tmp_ptr[0]), temp0);
599 _mm_storeu_si128((__m128i *) (&pi2_tmp_ptr[2 * 4]), temp1);
600
601 /* Load pred buffer */
602 pred_r0 = _mm_loadl_epi64((__m128i *) (&pu1_pred[0]));
603 pred_r1 = _mm_loadl_epi64((__m128i *) (&pu1_pred[i4_pred_stride]));
604 pred_r2 = _mm_loadl_epi64((__m128i *) (&pu1_pred[2 * i4_pred_stride]));
605 pred_r3 = _mm_loadl_epi64((__m128i *) (&pu1_pred[3 * i4_pred_stride]));
606
607 pred_r0 = _mm_cvtepu8_epi16(pred_r0);
608 pred_r1 = _mm_cvtepu8_epi16(pred_r1);
609 pred_r2 = _mm_cvtepu8_epi16(pred_r2);
610 pred_r3 = _mm_cvtepu8_epi16(pred_r3);
611
612 /*--------------------------------------------------------------*/
613 /* IDCT [ Vertical transformation] and Xij = (xij + 32)>>6 */
614 /* */
615 /* Add the prediction and store it back to same buffer */
616 /*--------------------------------------------------------------*/
617 /* z0j = y0j + y2j */
618 temp0 = _mm_add_epi32(resq_r0, resq_r2);
619 /* z1j = y0j - y2j */
620 temp1 = _mm_sub_epi32(resq_r0, resq_r2);
621 /* z2j = (y1j>>1) - y3j */
622 temp2 = _mm_srai_epi32(resq_r1, 1);
623 temp2 = _mm_sub_epi32(temp2, resq_r3);
624 /* z3j = y1j + (y3j>>1) */
625 temp3 = _mm_srai_epi32(resq_r3, 1);
626 temp3 = _mm_add_epi32(temp3, resq_r1);
627
628 /* x0j = z0j + z3j */
629 temp4 = _mm_add_epi32(temp0, temp3);
630 temp4 = _mm_add_epi32(temp4, value_32);
631 temp4 = _mm_srai_epi32(temp4, 6);
632 /* x1j = z1j + z2j */
633 temp5 = _mm_add_epi32(temp1, temp2);
634 temp5 = _mm_add_epi32(temp5, value_32);
635 temp5 = _mm_srai_epi32(temp5, 6);
636 /* x2j = z1j - z2j */
637 temp6 = _mm_sub_epi32(temp1, temp2);
638 temp6 = _mm_add_epi32(temp6, value_32);
639 temp6 = _mm_srai_epi32(temp6, 6);
640 /* x3j = z0j - z3j */
641 temp7 = _mm_sub_epi32(temp0, temp3);
642 temp7 = _mm_add_epi32(temp7, value_32);
643 temp7 = _mm_srai_epi32(temp7, 6);
644
645 /* 32-bit to 16-bit conversion */
646 temp0 = _mm_packs_epi32(temp4, temp5);
647 temp1 = _mm_packs_epi32(temp6, temp7);
648
649 /* Saturate all values < -255 to -255 and retain the rest as it is */
650 temp0 = _mm_max_epi16(temp0, neg_255_8x16b);
651 /* Saturate all values > 255 to 255 and retain the rest as it is */
652 temp0 = _mm_min_epi16(temp0, pos_255_8x16b);
653
654 /* Saturate all values < -255 to -255 and retain the rest as it is */
655 temp1 = _mm_max_epi16(temp1, neg_255_8x16b);
656 /* Saturate all values > 255 to 255 and retain the rest as it is */
657 temp1 = _mm_min_epi16(temp1, pos_255_8x16b);
658
659 _mm_storel_epi64((__m128i *) (&pi2_res[0]), temp0);
660 _mm_storel_epi64((__m128i *) (&pi2_res[2 * i4_res_stride]), temp1);
661
662 temp4 = _mm_add_epi16(temp0, pred_r0);
663 temp0 = _mm_srli_si128(temp0, 8);
664 _mm_storel_epi64((__m128i *) (&pi2_res[i4_res_stride]), temp0);
665
666 temp6 = _mm_add_epi16(temp1, pred_r2);
667 temp1 = _mm_srli_si128(temp1, 8);
668 _mm_storel_epi64((__m128i *) (&pi2_res[3 * i4_res_stride]), temp1);
669
670 temp5 = _mm_add_epi16(temp0, pred_r1);
671 temp7 = _mm_add_epi16(temp1, pred_r3);
672
673 temp4 = _mm_cvtepi16_epi32(temp4);
674 temp5 = _mm_cvtepi16_epi32(temp5);
675 temp6 = _mm_cvtepi16_epi32(temp6);
676 temp7 = _mm_cvtepi16_epi32(temp7);
677
678 /* 32-bit to 16-bit conversion */
679 temp0 = _mm_packs_epi32(temp4, temp5);
680 temp1 = _mm_packs_epi32(temp6, temp7);
681 /*------------------------------------------------------------------*/
682 /* Clipping the results to 8 bits */
683 sign_reg = _mm_cmpgt_epi16(temp0, zero_8x16b);
684 temp0 = _mm_and_si128(temp0, sign_reg);
685 sign_reg = _mm_cmpgt_epi16(temp1, zero_8x16b);
686 temp1 = _mm_and_si128(temp1, sign_reg);
687
688 resq_r0 = _mm_packus_epi16(temp0, temp1);
689 resq_r1 = _mm_srli_si128(resq_r0, 4);
690 resq_r2 = _mm_srli_si128(resq_r1, 4);
691 resq_r3 = _mm_srli_si128(resq_r2, 4);
692
693 *pu4_out = _mm_cvtsi128_si32(resq_r0);
694 pu1_out += i4_out_stride;
695 pu4_out = (UWORD32 *) (pu1_out);
696 *(pu4_out) = _mm_cvtsi128_si32(resq_r1);
697 pu1_out += i4_out_stride;
698 pu4_out = (UWORD32 *) (pu1_out);
699 *(pu4_out) = _mm_cvtsi128_si32(resq_r2);
700 pu1_out += i4_out_stride;
701 pu4_out = (UWORD32 *) (pu1_out);
702 *(pu4_out) = _mm_cvtsi128_si32(resq_r3);
703 }
704
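/*
 ********************************************************************************
 *
 * @brief This function performs inverse quantization, inverse transform and
 * reconstruction of a 4x4 block with residual accumulation
 *
 * @par Description:
 * The inverse transformed residue is added to the residual prediction from
 * ps_res_pred, saturated to [-255, 255] and written to ps_res. The accumulated
 * residue is then added to the prediction and clipped to [0, 255] to form the
 * reconstructed output (u1_res_accumulate must be 1)
 *
 * @returns none
 *
 *******************************************************************************
 */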
void isvc_iquant_itrans_recon_res_4x4_with_res_acc_sse42(
    buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_res_pred,
    buffer_container_t *ps_res, buffer_container_t *ps_rec,
    iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants, WORD16 *pi2_tmp, WORD16 *pi2_dc_src,
    WORD32 i4_iq_start_idx, UWORD8 u1_res_accumulate)
710 {
711 WORD16 *pi2_src = (WORD16 *) ps_src->pv_data;
712 WORD16 *pi2_tmp_ptr = pi2_tmp;
713 WORD16 *pi2_res = (WORD16 *) ps_res->pv_data;
714 WORD16 *pi2_res_pred = (WORD16 *) ps_res_pred->pv_data;
715 UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
716 UWORD8 *pu1_out = (UWORD8 *) ps_rec->pv_data;
717 WORD32 i4_src_stride = ps_src->i4_data_stride;
718 WORD32 i4_res_stride = ps_res->i4_data_stride;
719 WORD32 i4_res_pred_stride = ps_res_pred->i4_data_stride;
720 WORD32 i4_pred_stride = ps_pred->i4_data_stride;
721 WORD32 i4_out_stride = ps_rec->i4_data_stride;
722 const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat;
723 const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat;
724 UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6;
725 UWORD32 *pu4_out = (UWORD32 *) pu1_out;
726 __m128i src_r0_r1, src_r2_r3;
727 __m128i src_r0, src_r1, src_r2, src_r3;
728 __m128i scalemat_r0_r1, scalemat_r2_r3;
729 __m128i pred_r0, pred_r1, pred_r2, pred_r3;
730 __m128i res_pred_r0, res_pred_r1, res_pred_r2, res_pred_r3;
731 __m128i res_r0, res_r1, res_r2, res_r3;
732 __m128i sign_reg, dequant_r0_r1, dequant_r2_r3;
733 /* all bits reset to zero */
734 __m128i zero_8x16b = _mm_setzero_si128();
735 __m128i neg_255_8x16b = _mm_set1_epi16(-((WORD16) UINT8_MAX));
736 __m128i pos_255_8x16b = _mm_set1_epi16(((WORD16) UINT8_MAX));
737 __m128i temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
738 __m128i resq_r0, resq_r1, resq_r2, resq_r3;
739 __m128i add_rshift = _mm_set1_epi32((u4_qp_div_6 < 4) ? (1 << (3 - u4_qp_div_6)) : 0);
740 __m128i value_32 = _mm_set1_epi32(32);
741
742 ASSERT(4 == i4_src_stride);
743 ASSERT(1 == u1_res_accumulate);
744
745 UNUSED(i4_src_stride);
747 UNUSED(u1_res_accumulate);
748
    /*************************************************************/
    /* Dequantization of coefficients                            */
    /*************************************************************/
753
754 /* a00 a01 a02 a03 a10 a11 a12 a13 -- the source
755 matrix 0th,1st row */
756 src_r0_r1 = _mm_loadu_si128((__m128i *) (pi2_src));
757
758 /* a20 a21 a22 a23 a30 a31 a32 a33 -- the
759 source matrix 2nd,3rd row */
760 src_r2_r3 = _mm_loadu_si128((__m128i *) (pi2_src + 8));
761
762 /* b00 b01 b02 b03 b10 b11 b12 b13 -- the
763 scaling matrix 0th,1st row */
764 scalemat_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_iscal_mat));
765
    /* b20 b21 b22 b23 b30 b31 b32 b33 -- the
       scaling matrix 2nd,3rd row */
768 scalemat_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_iscal_mat + 8));
769
770 /* q00 q01 q02 q03 q10 q11
771 q12 q13 -- all 16 bits */
772 dequant_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_weigh_mat));
773
774 /* q20 q21 q22 q23 q30 q31
775 q32 q33 -- all 16 bits */
776 dequant_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_weigh_mat + 8));
777
778 /* b00*q00 b01*q01 b02*q02 b03*q03 b10*q10 b11*q11
779 b12*q12 b13*q13 -- 16 bit result */
780 temp0 = _mm_mullo_epi16(scalemat_r0_r1, dequant_r0_r1);
781
782 /* b20*q20 b21*q21 b22*q22 b23*q23 b30*q30 b31*q31
783 b32*q32 b33*q33 -- 16 bit result */
784 temp1 = _mm_mullo_epi16(scalemat_r2_r3, dequant_r2_r3);
785
786 /* b00*q00 0 b01*q01 0 b02*q02 0 b03*q03 0 -- 16 bit long */
787 temp4 = _mm_unpacklo_epi16(temp0, zero_8x16b);
788
789 /* b10*q10 0 b11*q11 0 b12*q12 0 b13*q13 0 -- 16 bit long */
790 temp5 = _mm_unpackhi_epi16(temp0, zero_8x16b);
791
    /* b20*q20 0 b21*q21 0 b22*q22 0 b23*q23 0 -- 16 bit long */
793 temp6 = _mm_unpacklo_epi16(temp1, zero_8x16b);
794
    /* b30*q30 0 b31*q31 0 b32*q32 0 b33*q33 0 -- 16 bit long */
796 temp7 = _mm_unpackhi_epi16(temp1, zero_8x16b);
797
798 /* a00 0 a01 0 a02 0 a03 0 -- 16 bit long */
799 src_r0 = _mm_unpacklo_epi16(src_r0_r1, zero_8x16b);
800 /* a10 0 a11 0 a12 0 a13 0 -- 16 bit long */
801 src_r1 = _mm_unpackhi_epi16(src_r0_r1, zero_8x16b);
802 /* a20 0 a21 0 a22 0 a23 0 -- 16 bit long */
803 src_r2 = _mm_unpacklo_epi16(src_r2_r3, zero_8x16b);
804 /* a30 0 a31 0 a32 0 a33 0 -- 16 bit long */
805 src_r3 = _mm_unpackhi_epi16(src_r2_r3, zero_8x16b);
806
807 temp4 = _mm_madd_epi16(src_r0, temp4);
808 temp5 = _mm_madd_epi16(src_r1, temp5);
809 temp6 = _mm_madd_epi16(src_r2, temp6);
810 temp7 = _mm_madd_epi16(src_r3, temp7);
811
812 if(u4_qp_div_6 >= 4)
813 {
814 resq_r0 = _mm_slli_epi32(temp4, u4_qp_div_6 - 4);
815 resq_r1 = _mm_slli_epi32(temp5, u4_qp_div_6 - 4);
816 resq_r2 = _mm_slli_epi32(temp6, u4_qp_div_6 - 4);
817 resq_r3 = _mm_slli_epi32(temp7, u4_qp_div_6 - 4);
818 }
819 else
820 {
821 temp4 = _mm_add_epi32(temp4, add_rshift);
822 temp5 = _mm_add_epi32(temp5, add_rshift);
823 temp6 = _mm_add_epi32(temp6, add_rshift);
824 temp7 = _mm_add_epi32(temp7, add_rshift);
825 resq_r0 = _mm_srai_epi32(temp4, 4 - u4_qp_div_6);
826 resq_r1 = _mm_srai_epi32(temp5, 4 - u4_qp_div_6);
827 resq_r2 = _mm_srai_epi32(temp6, 4 - u4_qp_div_6);
828 resq_r3 = _mm_srai_epi32(temp7, 4 - u4_qp_div_6);
829 }
830
831 if(i4_iq_start_idx == 1) resq_r0 = _mm_insert_epi32(resq_r0, (WORD32) pi2_dc_src[0], 0);
832 /* Perform Inverse transform */
833 /*-------------------------------------------------------------*/
834 /* IDCT [ Horizontal transformation ] */
835 /*-------------------------------------------------------------*/
836 // Matrix transpose
837 /*
838 * a0 a1 a2 a3
839 * b0 b1 b2 b3
840 * c0 c1 c2 c3
841 * d0 d1 d2 d3
842 */
843
844 /* a0 b0 a1 b1 */
845 temp1 = _mm_unpacklo_epi32(resq_r0, resq_r1);
846 /* c0 d0 c1 d1 */
847 temp3 = _mm_unpacklo_epi32(resq_r2, resq_r3);
848 /* a2 b2 a3 b3 */
849 temp2 = _mm_unpackhi_epi32(resq_r0, resq_r1);
850 /* c2 d2 c3 d3 */
851 temp4 = _mm_unpackhi_epi32(resq_r2, resq_r3);
852 /* a0 b0 c0 d0 */
853 resq_r0 = _mm_unpacklo_epi64(temp1, temp3);
854 /* a1 b1 c1 d1 */
855 resq_r1 = _mm_unpackhi_epi64(temp1, temp3);
856 /* a2 b2 c2 d2 */
857 resq_r2 = _mm_unpacklo_epi64(temp2, temp4);
858 /* a3 b3 c3 d3 */
859 resq_r3 = _mm_unpackhi_epi64(temp2, temp4);
860 /* Transform starts -- horizontal transform */
861 /*------------------------------------------------------------------*/
862 /* z0 = w0 + w2 */
863 temp0 = _mm_add_epi32(resq_r0, resq_r2);
864 /* z1 = w0 - w2 */
865 temp1 = _mm_sub_epi32(resq_r0, resq_r2);
866 /* z2 = (w1 >> 1) - w3 */
867 temp2 = _mm_srai_epi32(resq_r1, 1);
868 temp2 = _mm_sub_epi32(temp2, resq_r3);
869 /* z3 = w1 + (w3 >> 1) */
870 temp3 = _mm_srai_epi32(resq_r3, 1);
871 temp3 = _mm_add_epi32(temp3, resq_r1);
872 /*----------------------------------------------------------*/
873 /* x0 = z0 + z3 */
874 resq_r0 = _mm_add_epi32(temp0, temp3);
875 /* x1 = z1 + z2 */
876 resq_r1 = _mm_add_epi32(temp1, temp2);
877 /* x2 = z1 - z2 */
878 resq_r2 = _mm_sub_epi32(temp1, temp2);
879 /* x3 = z0 - z3 */
880 resq_r3 = _mm_sub_epi32(temp0, temp3);
881
882 // Matrix transpose
883 /*
884 * a0 b0 c0 d0
885 * a1 b1 c1 d1
886 * a2 b2 c2 d2
887 * a3 b3 c3 d3
888 */
889
890 /* a0 a1 b0 b1 */
891 temp1 = _mm_unpacklo_epi32(resq_r0, resq_r1);
892 /* a2 a3 b2 b3 */
893 temp3 = _mm_unpacklo_epi32(resq_r2, resq_r3);
894 /* c0 c1 d0 d1 */
895 temp2 = _mm_unpackhi_epi32(resq_r0, resq_r1);
896 /* c2 c3 d2 d3 */
897 temp4 = _mm_unpackhi_epi32(resq_r2, resq_r3);
898 /* a0 a1 a2 a3 */
899 resq_r0 = _mm_unpacklo_epi64(temp1, temp3);
900 /* b0 b1 b2 b3 */
901 resq_r1 = _mm_unpackhi_epi64(temp1, temp3);
902 /* c0 c1 c2 c3 */
903 resq_r2 = _mm_unpacklo_epi64(temp2, temp4);
904 /* d0 d1 d2 d3 */
905 resq_r3 = _mm_unpackhi_epi64(temp2, temp4);
906 /* Transform ends -- horizontal transform */
907
908 temp0 = _mm_packs_epi32(resq_r0, resq_r1);
909 temp1 = _mm_packs_epi32(resq_r2, resq_r3);
910
911 _mm_storeu_si128((__m128i *) (&pi2_tmp_ptr[0]), temp0);
912 _mm_storeu_si128((__m128i *) (&pi2_tmp_ptr[2 * 4]), temp1);
913
914 /* Load pred buffer */
915 pred_r0 = _mm_loadl_epi64((__m128i *) (&pu1_pred[0]));
916 pred_r1 = _mm_loadl_epi64((__m128i *) (&pu1_pred[i4_pred_stride]));
917 pred_r2 = _mm_loadl_epi64((__m128i *) (&pu1_pred[2 * i4_pred_stride]));
918 pred_r3 = _mm_loadl_epi64((__m128i *) (&pu1_pred[3 * i4_pred_stride]));
919
920 pred_r0 = _mm_cvtepu8_epi16(pred_r0);
921 pred_r1 = _mm_cvtepu8_epi16(pred_r1);
922 pred_r2 = _mm_cvtepu8_epi16(pred_r2);
923 pred_r3 = _mm_cvtepu8_epi16(pred_r3);
924
925 /*--------------------------------------------------------------*/
926 /* IDCT [ Vertical transformation] and Xij = (xij + 32)>>6 */
927 /* */
928 /* Add the prediction and store it back to same buffer */
929 /*--------------------------------------------------------------*/
930 /* z0j = y0j + y2j */
931 temp0 = _mm_add_epi32(resq_r0, resq_r2);
932 /* z1j = y0j - y2j */
933 temp1 = _mm_sub_epi32(resq_r0, resq_r2);
934 /* z2j = (y1j>>1) - y3j */
935 temp2 = _mm_srai_epi32(resq_r1, 1);
936 temp2 = _mm_sub_epi32(temp2, resq_r3);
937 /* z3j = y1j + (y3j>>1) */
938 temp3 = _mm_srai_epi32(resq_r3, 1);
939 temp3 = _mm_add_epi32(temp3, resq_r1);
940
941 /* x0j = z0j + z3j */
942 temp4 = _mm_add_epi32(temp0, temp3);
943 temp4 = _mm_add_epi32(temp4, value_32);
944 temp4 = _mm_srai_epi32(temp4, 6);
945 res_r0 = temp4;
946 /* x1j = z1j + z2j */
947 temp5 = _mm_add_epi32(temp1, temp2);
948 temp5 = _mm_add_epi32(temp5, value_32);
949 temp5 = _mm_srai_epi32(temp5, 6);
950 res_r1 = temp5;
951 /* x2j = z1j - z2j */
952 temp6 = _mm_sub_epi32(temp1, temp2);
953 temp6 = _mm_add_epi32(temp6, value_32);
954 temp6 = _mm_srai_epi32(temp6, 6);
955 res_r2 = temp6;
956 /* x3j = z0j - z3j */
957 temp7 = _mm_sub_epi32(temp0, temp3);
958 temp7 = _mm_add_epi32(temp7, value_32);
959 temp7 = _mm_srai_epi32(temp7, 6);
960 res_r3 = temp7;
961
962 /* Accumulating res */
963 res_pred_r0 = _mm_loadl_epi64((__m128i *) &pi2_res_pred[0]);
964 res_pred_r1 = _mm_loadl_epi64((__m128i *) &pi2_res_pred[i4_res_pred_stride]);
965 res_pred_r2 = _mm_loadl_epi64((__m128i *) &pi2_res_pred[2 * i4_res_pred_stride]);
966 res_pred_r3 = _mm_loadl_epi64((__m128i *) &pi2_res_pred[3 * i4_res_pred_stride]);
967
968 res_pred_r0 = _mm_cvtepi16_epi32(res_pred_r0);
969 res_pred_r1 = _mm_cvtepi16_epi32(res_pred_r1);
970 res_pred_r2 = _mm_cvtepi16_epi32(res_pred_r2);
971 res_pred_r3 = _mm_cvtepi16_epi32(res_pred_r3);
972
973 temp0 = _mm_add_epi32(res_r0, res_pred_r0);
974 temp1 = _mm_add_epi32(res_r1, res_pred_r1);
975 temp2 = _mm_add_epi32(res_r2, res_pred_r2);
976 temp3 = _mm_add_epi32(res_r3, res_pred_r3);
977
978 temp0 = _mm_packs_epi32(temp0, temp1);
979 temp1 = _mm_packs_epi32(temp2, temp3);
980
981 /* Saturate all values < -255 to -255 and retain the rest as it is */
982 temp0 = _mm_max_epi16(temp0, neg_255_8x16b);
983 /* Saturate all values > 255 to 255 and retain the rest as it is */
984 temp0 = _mm_min_epi16(temp0, pos_255_8x16b);
985
986 /* Saturate all values < -255 to -255 and retain the rest as it is */
987 temp1 = _mm_max_epi16(temp1, neg_255_8x16b);
988 /* Saturate all values > 255 to 255 and retain the rest as it is */
989 temp1 = _mm_min_epi16(temp1, pos_255_8x16b);
990
991 _mm_storel_epi64((__m128i *) (&pi2_res[0]), temp0);
992 _mm_storel_epi64((__m128i *) (&pi2_res[2 * i4_res_stride]), temp1);
993
994 temp4 = _mm_add_epi16(temp0, pred_r0);
995 temp0 = _mm_srli_si128(temp0, 8);
996 _mm_storel_epi64((__m128i *) (&pi2_res[i4_res_stride]), temp0);
997
998 temp6 = _mm_add_epi16(temp1, pred_r2);
999 temp1 = _mm_srli_si128(temp1, 8);
1000 _mm_storel_epi64((__m128i *) (&pi2_res[3 * i4_res_stride]), temp1);
1001
1002 temp5 = _mm_add_epi16(temp0, pred_r1);
1003 temp7 = _mm_add_epi16(temp1, pred_r3);
1004
1005 temp4 = _mm_cvtepi16_epi32(temp4);
1006 temp5 = _mm_cvtepi16_epi32(temp5);
1007 temp6 = _mm_cvtepi16_epi32(temp6);
1008 temp7 = _mm_cvtepi16_epi32(temp7);
1009
1010 /* 32-bit to 16-bit conversion */
1011 temp0 = _mm_packs_epi32(temp4, temp5);
1012 temp1 = _mm_packs_epi32(temp6, temp7);
1013 /*------------------------------------------------------------------*/
1014 /* Clipping the results to 8 bits */
1015 sign_reg = _mm_cmpgt_epi16(temp0, zero_8x16b);
1016 temp0 = _mm_and_si128(temp0, sign_reg);
1017 sign_reg = _mm_cmpgt_epi16(temp1, zero_8x16b);
1018 temp1 = _mm_and_si128(temp1, sign_reg);
1019
1020 resq_r0 = _mm_packus_epi16(temp0, temp1);
1021 resq_r1 = _mm_srli_si128(resq_r0, 4);
1022 resq_r2 = _mm_srli_si128(resq_r1, 4);
1023 resq_r3 = _mm_srli_si128(resq_r2, 4);
1024
1025 *pu4_out = _mm_cvtsi128_si32(resq_r0);
1026 pu1_out += i4_out_stride;
1027 pu4_out = (UWORD32 *) (pu1_out);
1028 *(pu4_out) = _mm_cvtsi128_si32(resq_r1);
1029 pu1_out += i4_out_stride;
1030 pu4_out = (UWORD32 *) (pu1_out);
1031 *(pu4_out) = _mm_cvtsi128_si32(resq_r2);
1032 pu1_out += i4_out_stride;
1033 pu4_out = (UWORD32 *) (pu1_out);
1034 *(pu4_out) = _mm_cvtsi128_si32(resq_r3);
1035 }
1036
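/*
 ********************************************************************************
 *
 * @brief This function performs inverse quantization, inverse transform and
 * reconstruction of a 4x4 chroma block, and additionally outputs the residue
 *
 * @par Description:
 * The DC coefficient is always taken from pi2_dc_src, since chroma DC is
 * dequantized and transformed separately. The residue and reconstruction
 * buffers hold interleaved Cb/Cr samples, so only alternate samples are
 * updated and the other plane is preserved through masking
 * (u1_res_accumulate must be 0)
 *
 * @returns none
 *
 *******************************************************************************
 */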
void isvc_iquant_itrans_recon_res_chroma_4x4_sse42(
    buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_res_pred,
    buffer_container_t *ps_res, buffer_container_t *ps_rec,
    iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants, WORD16 *pi2_tmp, WORD16 *pi2_dc_src,
    WORD32 i4_iq_start_idx, UWORD8 u1_res_accumulate)
1042 {
1043 WORD16 *pi2_src = (WORD16 *) ps_src->pv_data;
1044 WORD16 *pi2_res = (WORD16 *) ps_res->pv_data;
1045 WORD16 *pi2_res_ptr = pi2_res;
1046 UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
1047 UWORD8 *pu1_out = (UWORD8 *) ps_rec->pv_data;
1048 WORD32 i4_src_stride = ps_src->i4_data_stride;
1049 WORD32 i4_res_stride = ps_res->i4_data_stride;
1050 WORD32 i4_pred_stride = ps_pred->i4_data_stride;
1051 WORD32 i4_out_stride = ps_rec->i4_data_stride;
1052 const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat;
1053 const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat;
1054 UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6;
1055 __m128i src_r0_r1, src_r2_r3;
1056 __m128i src_r0, src_r1, src_r2, src_r3;
1057 __m128i scalemat_r0_r1, scalemat_r2_r3;
1058 __m128i pred_r0, pred_r1, pred_r2, pred_r3;
1059 __m128i sign_reg, dequant_r0_r1, dequant_r2_r3;
1060 /* all bits reset to zero */
1061 __m128i zero_8x16b = _mm_setzero_si128();
1062 __m128i neg_255_8x16b = _mm_set1_epi16(-((WORD16) UINT8_MAX));
1063 __m128i pos_255_8x16b = _mm_set1_epi16(((WORD16) UINT8_MAX));
1064 __m128i temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
1065 __m128i resq_r0, resq_r1, resq_r2, resq_r3;
1066 __m128i add_rshift = _mm_set1_epi32((u4_qp_div_6 < 4) ? (1 << (3 - u4_qp_div_6)) : 0);
1067 __m128i value_32 = _mm_set1_epi32(32);
1068 __m128i chroma_mask = _mm_set1_epi16(0xFF);
1069 __m128i out_r0, out_r1, out_r2, out_r3;
1070 __m128i res_r0, res_r1, res_r2, res_r3;
1071
1072 ASSERT(4 == i4_src_stride);
1073 ASSERT(0 == u1_res_accumulate);
1074
1075 UNUSED(i4_src_stride);
1076 UNUSED(u1_res_accumulate);
1077 UNUSED(ps_res_pred);
1078 UNUSED(i4_iq_start_idx);
1079
    /*************************************************************/
    /* Dequantization of coefficients                            */
    /*************************************************************/
1084 /* a00 a01 a02 a03 a10 a11 a12 a13 -- the source
1085 matrix 0th,1st row */
1086 src_r0_r1 = _mm_loadu_si128((__m128i *) (pi2_src));
1087
1088 /* a20 a21 a22 a23 a30 a31 a32 a33 -- the
1089 source matrix 2nd,3rd row */
1090 src_r2_r3 = _mm_loadu_si128((__m128i *) (pi2_src + 8));
1091
1092 /* b00 b01 b02 b03 b10 b11 b12 b13 -- the
1093 scaling matrix 0th,1st row */
1094 scalemat_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_iscal_mat));
1095
    /* b20 b21 b22 b23 b30 b31 b32 b33 -- the
       scaling matrix 2nd,3rd row */
1098 scalemat_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_iscal_mat + 8));
1099
1100 /* q00 q01 q02 q03 q10 q11
1101 q12 q13 -- all 16 bits */
1102 dequant_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_weigh_mat));
1103
1104 /* q20 q21 q22 q23 q30 q31
1105 q32 q33 -- all 16 bits */
1106 dequant_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_weigh_mat + 8));
1107
1108 temp0 = _mm_mullo_epi16(scalemat_r0_r1,
1109 dequant_r0_r1); // b00*q00 b01*q01 b02*q02 b03*q03 b10*q10 b11*q11
1110 // b12*q12 b13*q13 -- 16 bit result
1111
1112 temp1 = _mm_mullo_epi16(scalemat_r2_r3, dequant_r2_r3);
1113
1114 /* b00*q00 0 b01*q01 0 b02*q02 0 b03*q03 0 -- 16 bit long */
1115 temp4 = _mm_unpacklo_epi16(temp0, zero_8x16b);
1116
1117 /* b10*q10 0 b11*q11 0 b12*q12 0 b13*q13 0 -- 16 bit long */
1118 temp5 = _mm_unpackhi_epi16(temp0, zero_8x16b);
1119
    /* b20*q20 0 b21*q21 0 b22*q22 0 b23*q23 0 -- 16 bit long */
1121 temp6 = _mm_unpacklo_epi16(temp1, zero_8x16b);
1122
    /* b30*q30 0 b31*q31 0 b32*q32 0 b33*q33 0 -- 16 bit long */
1124 temp7 = _mm_unpackhi_epi16(temp1, zero_8x16b);
1125
1126 /* a00 0 a01 0 a02 0 a03 0 -- 16 bit long */
1127 src_r0 = _mm_unpacklo_epi16(src_r0_r1, zero_8x16b);
1128 /* a10 0 a11 0 a12 0 a13 0 -- 16 bit long */
1129 src_r1 = _mm_unpackhi_epi16(src_r0_r1, zero_8x16b);
1130 /* a20 0 a21 0 a22 0 a23 0 -- 16 bit long */
1131 src_r2 = _mm_unpacklo_epi16(src_r2_r3, zero_8x16b);
1132 /* a30 0 a31 0 a32 0 a33 0 -- 16 bit long */
1133 src_r3 = _mm_unpackhi_epi16(src_r2_r3, zero_8x16b);
1134
1135 temp4 = _mm_madd_epi16(src_r0, temp4);
1136 temp5 = _mm_madd_epi16(src_r1, temp5);
1137 temp6 = _mm_madd_epi16(src_r2, temp6);
1138 temp7 = _mm_madd_epi16(src_r3, temp7);
1139
1140 if(u4_qp_div_6 >= 4)
1141 {
1142 resq_r0 = _mm_slli_epi32(temp4, u4_qp_div_6 - 4);
1143 resq_r1 = _mm_slli_epi32(temp5, u4_qp_div_6 - 4);
1144 resq_r2 = _mm_slli_epi32(temp6, u4_qp_div_6 - 4);
1145 resq_r3 = _mm_slli_epi32(temp7, u4_qp_div_6 - 4);
1146 }
1147 else
1148 {
1149 temp4 = _mm_add_epi32(temp4, add_rshift);
1150 temp5 = _mm_add_epi32(temp5, add_rshift);
1151 temp6 = _mm_add_epi32(temp6, add_rshift);
1152 temp7 = _mm_add_epi32(temp7, add_rshift);
1153 resq_r0 = _mm_srai_epi32(temp4, 4 - u4_qp_div_6);
1154 resq_r1 = _mm_srai_epi32(temp5, 4 - u4_qp_div_6);
1155 resq_r2 = _mm_srai_epi32(temp6, 4 - u4_qp_div_6);
1156 resq_r3 = _mm_srai_epi32(temp7, 4 - u4_qp_div_6);
1157 }
1158
1159 resq_r0 = _mm_insert_epi32(resq_r0, (WORD32) pi2_dc_src[0], 0);
1160 /* Perform Inverse transform */
1161 /*-------------------------------------------------------------*/
1162 /* IDCT [ Horizontal transformation ] */
1163 /*-------------------------------------------------------------*/
1164 // Matrix transpose
1165 /*
1166 * a0 a1 a2 a3
1167 * b0 b1 b2 b3
1168 * c0 c1 c2 c3
1169 * d0 d1 d2 d3
1170 */
1171 /* a0 b0 a1 b1 */
1172 temp1 = _mm_unpacklo_epi32(resq_r0, resq_r1);
1173 /* c0 d0 c1 d1 */
1174 temp3 = _mm_unpacklo_epi32(resq_r2, resq_r3);
1175 /* a2 b2 a3 b3 */
1176 temp2 = _mm_unpackhi_epi32(resq_r0, resq_r1);
1177 /* c2 d2 c3 d3 */
1178 temp4 = _mm_unpackhi_epi32(resq_r2, resq_r3);
1179 /* a0 b0 c0 d0 */
1180 resq_r0 = _mm_unpacklo_epi64(temp1, temp3);
1181 /* a1 b1 c1 d1 */
1182 resq_r1 = _mm_unpackhi_epi64(temp1, temp3);
1183 /* a2 b2 c2 d2 */
1184 resq_r2 = _mm_unpacklo_epi64(temp2, temp4);
1185 /* a3 b3 c3 d3 */
1186 resq_r3 = _mm_unpackhi_epi64(temp2, temp4);
1187 /* Transform starts -- horizontal transform */
1188
1189 /*------------------------------------------------------------------*/
1190 /* z0 = w0 + w2 */
1191 temp0 = _mm_add_epi32(resq_r0, resq_r2);
1192 /* z1 = w0 - w2 */
1193 temp1 = _mm_sub_epi32(resq_r0, resq_r2);
1194 /* z2 = (w1 >> 1) - w3 */
1195 temp2 = _mm_srai_epi32(resq_r1, 1);
1196 temp2 = _mm_sub_epi32(temp2, resq_r3);
1197 /* z3 = w1 + (w3 >> 1) */
1198 temp3 = _mm_srai_epi32(resq_r3, 1);
1199 temp3 = _mm_add_epi32(temp3, resq_r1);
1200 /*----------------------------------------------------------*/
1201 /* x0 = z0 + z3 */
1202 resq_r0 = _mm_add_epi32(temp0, temp3);
1203 /* x1 = z1 + z2 */
1204 resq_r1 = _mm_add_epi32(temp1, temp2);
1205 /* x2 = z1 - z2 */
1206 resq_r2 = _mm_sub_epi32(temp1, temp2);
1207 /* x3 = z0 - z3 */
1208 resq_r3 = _mm_sub_epi32(temp0, temp3);
1209 // Matrix transpose
1210 /*
1211 * a0 b0 c0 d0
1212 * a1 b1 c1 d1
1213 * a2 b2 c2 d2
1214 * a3 b3 c3 d3
1215 */
1216 /* a0 a1 b0 b1 */
1217 temp1 = _mm_unpacklo_epi32(resq_r0, resq_r1);
1218 /* a2 a3 b2 b3 */
1219 temp3 = _mm_unpacklo_epi32(resq_r2, resq_r3);
1220 /* c0 c1 d0 d1 */
1221 temp2 = _mm_unpackhi_epi32(resq_r0, resq_r1);
1222 /* c2 c3 d2 d3 */
1223 temp4 = _mm_unpackhi_epi32(resq_r2, resq_r3);
1224 /* a0 a1 a2 a3 */
1225 resq_r0 = _mm_unpacklo_epi64(temp1, temp3);
1226 /* b0 b1 b2 b3 */
1227 resq_r1 = _mm_unpackhi_epi64(temp1, temp3);
1228 /* c0 c1 c2 c3 */
1229 resq_r2 = _mm_unpacklo_epi64(temp2, temp4);
1230 /* d0 d1 d2 d3 */
1231 resq_r3 = _mm_unpackhi_epi64(temp2, temp4);
1232 /* Transform ends -- horizontal transform */
1233
1234 temp0 = _mm_packs_epi32(resq_r0, resq_r1);
1235 temp1 = _mm_packs_epi32(resq_r2, resq_r3);
1236
1237 _mm_storeu_si128((__m128i *) (&pi2_tmp[0]), temp0);
1238 _mm_storeu_si128((__m128i *) (&pi2_tmp[2 * 4]), temp1);
1239
1240 /* Load pred buffer */
1241 pred_r0 = _mm_loadl_epi64((__m128i *) (&pu1_pred[0]));
1242 pred_r1 = _mm_loadl_epi64((__m128i *) (&pu1_pred[i4_pred_stride]));
1243 pred_r2 = _mm_loadl_epi64((__m128i *) (&pu1_pred[2 * i4_pred_stride]));
1244 pred_r3 = _mm_loadl_epi64((__m128i *) (&pu1_pred[3 * i4_pred_stride]));
1245
1246 pred_r0 = _mm_and_si128(pred_r0, chroma_mask);
1247 pred_r1 = _mm_and_si128(pred_r1, chroma_mask);
1248 pred_r2 = _mm_and_si128(pred_r2, chroma_mask);
1249 pred_r3 = _mm_and_si128(pred_r3, chroma_mask);
1250
1251 pred_r0 = _mm_cvtepu16_epi32(pred_r0);
1252 pred_r1 = _mm_cvtepu16_epi32(pred_r1);
1253 pred_r2 = _mm_cvtepu16_epi32(pred_r2);
1254 pred_r3 = _mm_cvtepu16_epi32(pred_r3);
1255
1256 /*--------------------------------------------------------------*/
1257 /* IDCT [ Vertical transformation] and Xij = (xij + 32)>>6 */
1258 /* */
1259 /* Add the prediction and store it back to same buffer */
1260 /*--------------------------------------------------------------*/
1261 /* z0j = y0j + y2j */
1262 temp0 = _mm_add_epi32(resq_r0, resq_r2);
1263 /* z1j = y0j - y2j */
1264 temp1 = _mm_sub_epi32(resq_r0, resq_r2);
1265 /* z2j = (y1j>>1) - y3j */
1266 temp2 = _mm_srai_epi32(resq_r1, 1);
1267 temp2 = _mm_sub_epi32(temp2, resq_r3);
1268 /* z3j = y1j + (y3j>>1) */
1269 temp3 = _mm_srai_epi32(resq_r3, 1);
1270 temp3 = _mm_add_epi32(temp3, resq_r1);
1271
1272 /* x0j = z0j + z3j */
1273 temp4 = _mm_add_epi32(temp0, temp3);
1274 temp4 = _mm_add_epi32(temp4, value_32);
1275 temp4 = _mm_srai_epi32(temp4, 6);
1276 /* x1j = z1j + z2j */
1277 temp5 = _mm_add_epi32(temp1, temp2);
1278 temp5 = _mm_add_epi32(temp5, value_32);
1279 temp5 = _mm_srai_epi32(temp5, 6);
1280 /* x2j = z1j - z2j */
1281 temp6 = _mm_sub_epi32(temp1, temp2);
1282 temp6 = _mm_add_epi32(temp6, value_32);
1283 temp6 = _mm_srai_epi32(temp6, 6);
1284 /* x3j = z0j - z3j */
1285 temp7 = _mm_sub_epi32(temp0, temp3);
1286 temp7 = _mm_add_epi32(temp7, value_32);
1287 temp7 = _mm_srai_epi32(temp7, 6);
1288
1289 /* 32-bit to 16-bit conversion */
1290 temp0 = _mm_packs_epi32(temp4, temp5);
1291 temp1 = _mm_packs_epi32(temp6, temp7);
1292
1293 /* Saturate all values < -255 to -255 and retain the rest as it is */
1294 temp0 = _mm_max_epi16(temp0, neg_255_8x16b);
1295 /* Saturate all values > 255 to 255 and retain the rest as it is */
1296 temp0 = _mm_min_epi16(temp0, pos_255_8x16b);
1297
1298 /* Saturate all values < -255 to -255 and retain the rest as it is */
1299 temp1 = _mm_max_epi16(temp1, neg_255_8x16b);
1300 /* Saturate all values > 255 to 255 and retain the rest as it is */
1301 temp1 = _mm_min_epi16(temp1, pos_255_8x16b);
1302
1303 chroma_mask = _mm_set1_epi32(0xffff0000);
1304 out_r0 = _mm_loadu_si128((__m128i *) (&pi2_res_ptr[0 * i4_res_stride]));
1305 out_r1 = _mm_loadu_si128((__m128i *) (&pi2_res_ptr[1 * i4_res_stride]));
1306 out_r2 = _mm_loadu_si128((__m128i *) (&pi2_res_ptr[2 * i4_res_stride]));
1307 out_r3 = _mm_loadu_si128((__m128i *) (&pi2_res_ptr[3 * i4_res_stride]));
1308
1309 out_r0 = _mm_and_si128(out_r0, chroma_mask);
1310 out_r1 = _mm_and_si128(out_r1, chroma_mask);
1311 out_r2 = _mm_and_si128(out_r2, chroma_mask);
1312 out_r3 = _mm_and_si128(out_r3, chroma_mask);
1313
1314 res_r0 = _mm_cvtepu16_epi32(temp0);
1315 res_r2 = _mm_cvtepu16_epi32(temp1);
1316 res_r1 = _mm_srli_si128(temp0, 8);
1317 res_r3 = _mm_srli_si128(temp1, 8);
1318 res_r1 = _mm_cvtepu16_epi32(res_r1);
1319 res_r3 = _mm_cvtepu16_epi32(res_r3);
1320
1321 out_r0 = _mm_add_epi16(out_r0, res_r0);
1322 out_r1 = _mm_add_epi16(out_r1, res_r1);
1323 out_r2 = _mm_add_epi16(out_r2, res_r2);
1324 out_r3 = _mm_add_epi16(out_r3, res_r3);
1325
1326 _mm_storeu_si128((__m128i *) (&pi2_res_ptr[0 * i4_res_stride]), out_r0);
1327 _mm_storeu_si128((__m128i *) (&pi2_res_ptr[1 * i4_res_stride]), out_r1);
1328 _mm_storeu_si128((__m128i *) (&pi2_res_ptr[2 * i4_res_stride]), out_r2);
1329 _mm_storeu_si128((__m128i *) (&pi2_res_ptr[3 * i4_res_stride]), out_r3);
1330
1331 resq_r0 = _mm_add_epi16(pred_r0, res_r0);
1332 resq_r1 = _mm_add_epi16(pred_r1, res_r1);
1333 resq_r2 = _mm_add_epi16(pred_r2, res_r2);
1334 resq_r3 = _mm_add_epi16(pred_r3, res_r3);
1335
1336 temp0 = _mm_packus_epi32(resq_r0, resq_r1);
1337 temp1 = _mm_packus_epi32(resq_r2, resq_r3);
1338
1339 /*------------------------------------------------------------------*/
1340 /* Clipping the results to 8 bits */
1341 sign_reg = _mm_cmpgt_epi16(temp0, zero_8x16b);
1342 temp0 = _mm_and_si128(temp0, sign_reg);
1343 sign_reg = _mm_cmpgt_epi16(temp1, zero_8x16b);
1344 temp1 = _mm_and_si128(temp1, sign_reg);
1345
1346 resq_r0 = _mm_packus_epi16(temp0, temp1);
1347 resq_r1 = _mm_srli_si128(resq_r0, 4);
1348 resq_r2 = _mm_srli_si128(resq_r1, 4);
1349 resq_r3 = _mm_srli_si128(resq_r2, 4);
1350
1351 resq_r0 = _mm_cvtepu8_epi16(resq_r0);
1352 resq_r1 = _mm_cvtepu8_epi16(resq_r1);
1353 resq_r2 = _mm_cvtepu8_epi16(resq_r2);
1354 resq_r3 = _mm_cvtepu8_epi16(resq_r3);
1355
1356 chroma_mask = _mm_set1_epi16(0xff00);
1357 out_r0 = _mm_loadl_epi64((__m128i *) (&pu1_out[0]));
1358 out_r1 = _mm_loadl_epi64((__m128i *) (&pu1_out[i4_out_stride]));
1359 out_r2 = _mm_loadl_epi64((__m128i *) (&pu1_out[2 * i4_out_stride]));
1360 out_r3 = _mm_loadl_epi64((__m128i *) (&pu1_out[3 * i4_out_stride]));
1361
1362 out_r0 = _mm_and_si128(out_r0, chroma_mask);
1363 out_r1 = _mm_and_si128(out_r1, chroma_mask);
1364 out_r2 = _mm_and_si128(out_r2, chroma_mask);
1365 out_r3 = _mm_and_si128(out_r3, chroma_mask);
1366
1367 out_r0 = _mm_add_epi8(out_r0, resq_r0);
1368 out_r1 = _mm_add_epi8(out_r1, resq_r1);
1369 out_r2 = _mm_add_epi8(out_r2, resq_r2);
1370 out_r3 = _mm_add_epi8(out_r3, resq_r3);
1371
1372 _mm_storel_epi64((__m128i *) (&pu1_out[0]), out_r0);
1373 _mm_storel_epi64((__m128i *) (&pu1_out[i4_out_stride]), out_r1);
1374 _mm_storel_epi64((__m128i *) (&pu1_out[2 * i4_out_stride]), out_r2);
1375 _mm_storel_epi64((__m128i *) (&pu1_out[3 * i4_out_stride]), out_r3);
1376 }
1377
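/*
 ********************************************************************************
 *
 * @brief This function performs inverse quantization, inverse transform and
 * reconstruction of a 4x4 chroma block with residual accumulation
 *
 * @par Description:
 * Chroma counterpart of the residual accumulation path: the inverse
 * transformed residue is added to the residual prediction from ps_res_pred,
 * saturated to [-255, 255] and written to the interleaved residue buffer,
 * after which the prediction is added and the result clipped to form the
 * interleaved reconstructed output (u1_res_accumulate must be 1)
 *
 * @returns none
 *
 *******************************************************************************
 */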
void isvc_iquant_itrans_recon_res_chroma_4x4_with_res_acc_sse42(
    buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_res_pred,
    buffer_container_t *ps_res, buffer_container_t *ps_rec,
    iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants, WORD16 *pi2_tmp, WORD16 *pi2_dc_src,
    WORD32 i4_iq_start_idx, UWORD8 u1_res_accumulate)
1383 {
1384 WORD16 *pi2_src = (WORD16 *) ps_src->pv_data;
1385 WORD16 *pi2_res = (WORD16 *) ps_res->pv_data;
1386 WORD16 *pi2_res_pred = (WORD16 *) ps_res_pred->pv_data;
1387 UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
1388 UWORD8 *pu1_out = (UWORD8 *) ps_rec->pv_data;
1389 WORD32 i4_src_stride = ps_src->i4_data_stride;
1390 WORD32 i4_res_stride = ps_res->i4_data_stride;
1391 WORD32 i4_res_pred_stride = ps_res_pred->i4_data_stride;
1392 WORD32 i4_pred_stride = ps_pred->i4_data_stride;
1393 WORD32 i4_out_stride = ps_rec->i4_data_stride;
1394 const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat;
1395 const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat;
1396 UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6;
1397 __m128i src_r0_r1, src_r2_r3;
1398 __m128i src_r0, src_r1, src_r2, src_r3;
1399 __m128i scalemat_r0_r1, scalemat_r2_r3;
1400 __m128i pred_r0, pred_r1, pred_r2, pred_r3;
1401 __m128i res_pred_r0, res_pred_r1, res_pred_r2, res_pred_r3;
1402 __m128i res_r0, res_r1, res_r2, res_r3;
1403 __m128i dequant_r0_r1, dequant_r2_r3;
1404 /* all bits reset to zero */
1405 __m128i zero_8x16b = _mm_setzero_si128();
1406 __m128i reg_chroma = _mm_set1_epi32(0xFFFF);
1407 __m128i neg_255_8x16b = _mm_set1_epi16(-((WORD16) UINT8_MAX));
1408 __m128i pos_255_8x16b = _mm_set1_epi16(((WORD16) UINT8_MAX));
1409 __m128i temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
1410 __m128i resq_r0, resq_r1, resq_r2, resq_r3;
1411 __m128i add_rshift = _mm_set1_epi32((u4_qp_div_6 < 4) ? (1 << (3 - u4_qp_div_6)) : 0);
1412 __m128i value_32 = _mm_set1_epi32(32);
1413 __m128i chroma_mask = _mm_set1_epi16(0xFF);
1414 __m128i out_r0, out_r1, out_r2, out_r3;
1415 __m128i mask_r0;
1416
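    /* This variant expects a packed 4x4 source block (coefficient stride of 4)
       and is only used when residue accumulation is enabled, as asserted below */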
1417 ASSERT(4 == i4_src_stride);
1418 ASSERT(1 == u1_res_accumulate);
1419
1420 UNUSED(i4_src_stride);
1421 UNUSED(u1_res_accumulate);
1422 UNUSED(i4_iq_start_idx);
1423
1424 /*************************************************************/
1425     /* Dequantization of coefficients: level * iscale * weight,  */
1426     /* scaled up or rounded down according to qp / 6             */
1427 /*************************************************************/
1428 /* a00 a01 a02 a03 a10 a11 a12 a13 -- the source
1429 matrix 0th,1st row */
1430 src_r0_r1 = _mm_loadu_si128((__m128i *) (pi2_src));
1431
1432 /* a20 a21 a22 a23 a30 a31 a32 a33 -- the
1433 source matrix 2nd,3rd row */
1434 src_r2_r3 = _mm_loadu_si128((__m128i *) (pi2_src + 8));
1435
1436 /* b00 b01 b02 b03 b10 b11 b12 b13 -- the
1437 scaling matrix 0th,1st row */
1438 scalemat_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_iscal_mat));
1439
1440     /* b20 b21 b22 b23 b30 b31 b32 b33 -- the
1441     scaling matrix 2nd,3rd row */
1442 scalemat_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_iscal_mat + 8));
1443
1444 /* q00 q01 q02 q03 q10 q11
1445 q12 q13 -- all 16 bits */
1446 dequant_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_weigh_mat));
1447
1448 /* q20 q21 q22 q23 q30 q31
1449 q32 q33 -- all 16 bits */
1450 dequant_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_weigh_mat + 8));
1451
1452 temp0 = _mm_mullo_epi16(scalemat_r0_r1,
1453 dequant_r0_r1); // b00*q00 b01*q01 b02*q02 b03*q03 b10*q10 b11*q11
1454 // b12*q12 b13*q13 -- 16 bit result
1455
1456 temp1 = _mm_mullo_epi16(scalemat_r2_r3, dequant_r2_r3);
1457
1458 /* b00*q00 0 b01*q01 0 b02*q02 0 b03*q03 0 -- 16 bit long */
1459 temp4 = _mm_unpacklo_epi16(temp0, zero_8x16b);
1460
1461 /* b10*q10 0 b11*q11 0 b12*q12 0 b13*q13 0 -- 16 bit long */
1462 temp5 = _mm_unpackhi_epi16(temp0, zero_8x16b);
1463
1464     /* b20*q20 0 b21*q21 0 b22*q22 0 b23*q23 0 -- 16 bit long */
1465 temp6 = _mm_unpacklo_epi16(temp1, zero_8x16b);
1466
1467     /* b30*q30 0 b31*q31 0 b32*q32 0 b33*q33 0 -- 16 bit long */
1468 temp7 = _mm_unpackhi_epi16(temp1, zero_8x16b);
1469
1470 /* a00 0 a01 0 a02 0 a03 0 -- 16 bit long */
1471 src_r0 = _mm_unpacklo_epi16(src_r0_r1, zero_8x16b);
1472 /* a10 0 a11 0 a12 0 a13 0 -- 16 bit long */
1473 src_r1 = _mm_unpackhi_epi16(src_r0_r1, zero_8x16b);
1474 /* a20 0 a21 0 a22 0 a23 0 -- 16 bit long */
1475 src_r2 = _mm_unpacklo_epi16(src_r2_r3, zero_8x16b);
1476 /* a30 0 a31 0 a32 0 a33 0 -- 16 bit long */
1477 src_r3 = _mm_unpackhi_epi16(src_r2_r3, zero_8x16b);
1478
1479 temp4 = _mm_madd_epi16(src_r0, temp4);
1480 temp5 = _mm_madd_epi16(src_r1, temp5);
1481 temp6 = _mm_madd_epi16(src_r2, temp6);
1482 temp7 = _mm_madd_epi16(src_r3, temp7);
1483
1484 if(u4_qp_div_6 >= 4)
1485 {
1486 resq_r0 = _mm_slli_epi32(temp4, u4_qp_div_6 - 4);
1487 resq_r1 = _mm_slli_epi32(temp5, u4_qp_div_6 - 4);
1488 resq_r2 = _mm_slli_epi32(temp6, u4_qp_div_6 - 4);
1489 resq_r3 = _mm_slli_epi32(temp7, u4_qp_div_6 - 4);
1490 }
1491 else
1492 {
1493 temp4 = _mm_add_epi32(temp4, add_rshift);
1494 temp5 = _mm_add_epi32(temp5, add_rshift);
1495 temp6 = _mm_add_epi32(temp6, add_rshift);
1496 temp7 = _mm_add_epi32(temp7, add_rshift);
1497 resq_r0 = _mm_srai_epi32(temp4, 4 - u4_qp_div_6);
1498 resq_r1 = _mm_srai_epi32(temp5, 4 - u4_qp_div_6);
1499 resq_r2 = _mm_srai_epi32(temp6, 4 - u4_qp_div_6);
1500 resq_r3 = _mm_srai_epi32(temp7, 4 - u4_qp_div_6);
1501 }
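    /*
     * Illustrative scalar equivalent of the dequantization above (sketch only,
     * not part of the SIMD path):
     *
     *   for(i = 0; i < 16; i++)
     *   {
     *       WORD32 val = pi2_src[i] * pu2_iscal_mat[i] * pu2_weigh_mat[i];
     *
     *       if(u4_qp_div_6 >= 4)
     *           val <<= (u4_qp_div_6 - 4);
     *       else
     *           val = (val + (1 << (3 - u4_qp_div_6))) >> (4 - u4_qp_div_6);
     *   }
     */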
1502
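    /* Overwrite the DC term with the value supplied in pi2_dc_src; the chroma
       DC coefficient is not dequantized here */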
1503 resq_r0 = _mm_insert_epi32(resq_r0, (WORD32) pi2_dc_src[0], 0);
1504 /* Perform Inverse transform */
1505 /*-------------------------------------------------------------*/
1506 /* IDCT [ Horizontal transformation ] */
1507 /*-------------------------------------------------------------*/
1508 // Matrix transpose
1509 /*
1510 * a0 a1 a2 a3
1511 * b0 b1 b2 b3
1512 * c0 c1 c2 c3
1513 * d0 d1 d2 d3
1514 */
1515 /* a0 b0 a1 b1 */
1516 temp1 = _mm_unpacklo_epi32(resq_r0, resq_r1);
1517 /* c0 d0 c1 d1 */
1518 temp3 = _mm_unpacklo_epi32(resq_r2, resq_r3);
1519 /* a2 b2 a3 b3 */
1520 temp2 = _mm_unpackhi_epi32(resq_r0, resq_r1);
1521 /* c2 d2 c3 d3 */
1522 temp4 = _mm_unpackhi_epi32(resq_r2, resq_r3);
1523 /* a0 b0 c0 d0 */
1524 resq_r0 = _mm_unpacklo_epi64(temp1, temp3);
1525 /* a1 b1 c1 d1 */
1526 resq_r1 = _mm_unpackhi_epi64(temp1, temp3);
1527 /* a2 b2 c2 d2 */
1528 resq_r2 = _mm_unpacklo_epi64(temp2, temp4);
1529 /* a3 b3 c3 d3 */
1530 resq_r3 = _mm_unpackhi_epi64(temp2, temp4);
1531 /* Transform starts -- horizontal transform */
1532
1533 /*------------------------------------------------------------------*/
1534 /* z0 = w0 + w2 */
1535 temp0 = _mm_add_epi32(resq_r0, resq_r2);
1536 /* z1 = w0 - w2 */
1537 temp1 = _mm_sub_epi32(resq_r0, resq_r2);
1538 /* z2 = (w1 >> 1) - w3 */
1539 temp2 = _mm_srai_epi32(resq_r1, 1);
1540 temp2 = _mm_sub_epi32(temp2, resq_r3);
1541 /* z3 = w1 + (w3 >> 1) */
1542 temp3 = _mm_srai_epi32(resq_r3, 1); //(w3>>1) + w1
1543 temp3 = _mm_add_epi32(temp3, resq_r1);
1544 /*----------------------------------------------------------*/
1545 /* x0 = z0 + z3 */
1546 resq_r0 = _mm_add_epi32(temp0, temp3);
1547 /* x1 = z1 + z2 */
1548 resq_r1 = _mm_add_epi32(temp1, temp2);
1549 /* x2 = z1 - z2 */
1550 resq_r2 = _mm_sub_epi32(temp1, temp2);
1551 /* x3 = z0 - z3 */
1552 resq_r3 = _mm_sub_epi32(temp0, temp3);
1553 // Matrix transpose
1554 /*
1555 * a0 b0 c0 d0
1556 * a1 b1 c1 d1
1557 * a2 b2 c2 d2
1558 * a3 b3 c3 d3
1559 */
1560 /* a0 a1 b0 b1 */
1561 temp1 = _mm_unpacklo_epi32(resq_r0, resq_r1);
1562 /* a2 a3 b2 b3 */
1563 temp3 = _mm_unpacklo_epi32(resq_r2, resq_r3);
1564 /* c0 c1 d0 d1 */
1565 temp2 = _mm_unpackhi_epi32(resq_r0, resq_r1);
1566 /* c2 c3 d2 d3 */
1567 temp4 = _mm_unpackhi_epi32(resq_r2, resq_r3);
1568 /* a0 a1 a2 a3 */
1569 resq_r0 = _mm_unpacklo_epi64(temp1, temp3);
1570 /* b0 b1 b2 b3 */
1571 resq_r1 = _mm_unpackhi_epi64(temp1, temp3);
1572 /* c0 c1 c2 c3 */
1573 resq_r2 = _mm_unpacklo_epi64(temp2, temp4);
1574 /* d0 d1 d2 d3 */
1575 resq_r3 = _mm_unpackhi_epi64(temp2, temp4);
1576 /* Transform ends -- horizontal transform */
1577
1578 temp0 = _mm_packs_epi32(resq_r0, resq_r1);
1579 temp1 = _mm_packs_epi32(resq_r2, resq_r3);
1580
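    /* Store the horizontally transformed residue to the scratch buffer */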
1581 _mm_storeu_si128((__m128i *) (&pi2_tmp[0]), temp0);
1582 _mm_storeu_si128((__m128i *) (&pi2_tmp[2 * 4]), temp1);
1583
1584 /* Load pred buffer */
1585 pred_r0 = _mm_loadl_epi64((__m128i *) (&pu1_pred[0]));
1586 pred_r1 = _mm_loadl_epi64((__m128i *) (&pu1_pred[i4_pred_stride]));
1587 pred_r2 = _mm_loadl_epi64((__m128i *) (&pu1_pred[2 * i4_pred_stride]));
1588 pred_r3 = _mm_loadl_epi64((__m128i *) (&pu1_pred[3 * i4_pred_stride]));
1589
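    /* Mask alternate pred values from the interleaved pred buf */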
1590 pred_r0 = _mm_and_si128(pred_r0, chroma_mask);
1591 pred_r1 = _mm_and_si128(pred_r1, chroma_mask);
1592 pred_r2 = _mm_and_si128(pred_r2, chroma_mask);
1593 pred_r3 = _mm_and_si128(pred_r3, chroma_mask);
1594
1595 /*--------------------------------------------------------------*/
1596 /* IDCT [ Vertical transformation] and Xij = (xij + 32)>>6 */
1597 /* */
1598 /* Add the prediction and store it back to same buffer */
1599 /*--------------------------------------------------------------*/
1600 /* z0j = y0j + y2j */
1601 temp0 = _mm_add_epi32(resq_r0, resq_r2);
1602 /* z1j = y0j - y2j */
1603 temp1 = _mm_sub_epi32(resq_r0, resq_r2);
1604 /* z2j = (y1j>>1) - y3j */
1605 temp2 = _mm_srai_epi32(resq_r1, 1);
1606 temp2 = _mm_sub_epi32(temp2, resq_r3);
1607 /* z3j = y1j + (y3j>>1) */
1608 temp3 = _mm_srai_epi32(resq_r3, 1);
1609 temp3 = _mm_add_epi32(temp3, resq_r1);
1610
1611 /* x0j = z0j + z3j */
1612 temp4 = _mm_add_epi32(temp0, temp3);
1613 temp4 = _mm_add_epi32(temp4, value_32);
1614 temp4 = _mm_srai_epi32(temp4, 6);
1615 res_r0 = temp4;
1616 /* x1j = z1j + z2j */
1617 temp5 = _mm_add_epi32(temp1, temp2);
1618 temp5 = _mm_add_epi32(temp5, value_32);
1619 temp5 = _mm_srai_epi32(temp5, 6);
1620 res_r1 = temp5;
1621 /* x2j = z1j - z2j */
1622 temp6 = _mm_sub_epi32(temp1, temp2);
1623 temp6 = _mm_add_epi32(temp6, value_32);
1624 temp6 = _mm_srai_epi32(temp6, 6);
1625 res_r2 = temp6;
1626 /* x3j = z0j - z3j */
1627 temp7 = _mm_sub_epi32(temp0, temp3);
1628 temp7 = _mm_add_epi32(temp7, value_32);
1629 temp7 = _mm_srai_epi32(temp7, 6);
1630 res_r3 = temp7;
1631
1632 res_pred_r0 = _mm_loadu_si128((__m128i *) &pi2_res_pred[0 * i4_res_pred_stride]);
1633 res_pred_r1 = _mm_loadu_si128((__m128i *) &pi2_res_pred[1 * i4_res_pred_stride]);
1634 res_pred_r2 = _mm_loadu_si128((__m128i *) &pi2_res_pred[2 * i4_res_pred_stride]);
1635 res_pred_r3 = _mm_loadu_si128((__m128i *) &pi2_res_pred[3 * i4_res_pred_stride]);
1636
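    /* Mask res pred and retain alternate values of the interleaved buffer */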
1637 res_pred_r0 = _mm_and_si128(res_pred_r0, reg_chroma);
1638 res_pred_r1 = _mm_and_si128(res_pred_r1, reg_chroma);
1639 res_pred_r2 = _mm_and_si128(res_pred_r2, reg_chroma);
1640 res_pred_r3 = _mm_and_si128(res_pred_r3, reg_chroma);
1641
1642 temp0 = _mm_packs_epi32(res_r0, res_r1);
1643 temp1 = _mm_packs_epi32(res_r2, res_r3);
1644
1645 res_r0 = _mm_cvtepu16_epi32(temp0);
1646 res_r2 = _mm_cvtepu16_epi32(temp1);
1647 res_r1 = _mm_srli_si128(temp0, 8);
1648 res_r3 = _mm_srli_si128(temp1, 8);
1649 res_r1 = _mm_cvtepu16_epi32(res_r1);
1650 res_r3 = _mm_cvtepu16_epi32(res_r3);
1651
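    /* Add res pred to the res obtained from inv transform */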
1652 res_r0 = _mm_add_epi16(res_pred_r0, res_r0);
1653 res_r1 = _mm_add_epi16(res_pred_r1, res_r1);
1654 res_r2 = _mm_add_epi16(res_pred_r2, res_r2);
1655 res_r3 = _mm_add_epi16(res_pred_r3, res_r3);
1656
1657 temp0 = _mm_packus_epi32(res_r0, res_r1);
1658 temp1 = _mm_packus_epi32(res_r2, res_r3);
1659
1660 /* Saturate all values < -255 to -255 and retain the rest as it is */
1661 temp0 = _mm_max_epi16(temp0, neg_255_8x16b);
1662 /* Saturate all values > 255 to 255 and retain the rest as it is */
1663 temp0 = _mm_min_epi16(temp0, pos_255_8x16b);
1664
1665 /* Saturate all values < -255 to -255 and retain the rest as it is */
1666 temp1 = _mm_max_epi16(temp1, neg_255_8x16b);
1667 /* Saturate all values > 255 to 255 and retain the rest as it is */
1668 temp1 = _mm_min_epi16(temp1, pos_255_8x16b);
1669
1670 res_r0 = _mm_cvtepu16_epi32(temp0);
1671 res_r1 = _mm_srli_si128(temp0, 8);
1672 res_r1 = _mm_cvtepu16_epi32(res_r1);
1673
1674 res_r2 = _mm_cvtepu16_epi32(temp1);
1675 res_r3 = _mm_srli_si128(temp1, 8);
1676 res_r3 = _mm_cvtepu16_epi32(res_r3);
1677
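    /* Mask the loaded res in order to save the U/V res data computed in
       this function call without thrashing the U/V res data that was saved
       during an earlier function call */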
1678 chroma_mask = _mm_set1_epi32(0xffff0000);
1679 out_r0 = _mm_loadu_si128((__m128i *) (&pi2_res[0 * i4_res_stride]));
1680 out_r1 = _mm_loadu_si128((__m128i *) (&pi2_res[1 * i4_res_stride]));
1681 out_r2 = _mm_loadu_si128((__m128i *) (&pi2_res[2 * i4_res_stride]));
1682 out_r3 = _mm_loadu_si128((__m128i *) (&pi2_res[3 * i4_res_stride]));
1683
1684 out_r0 = _mm_and_si128(out_r0, chroma_mask);
1685 out_r1 = _mm_and_si128(out_r1, chroma_mask);
1686 out_r2 = _mm_and_si128(out_r2, chroma_mask);
1687 out_r3 = _mm_and_si128(out_r3, chroma_mask);
1688
1689 out_r0 = _mm_add_epi16(out_r0, res_r0);
1690 out_r1 = _mm_add_epi16(out_r1, res_r1);
1691 out_r2 = _mm_add_epi16(out_r2, res_r2);
1692 out_r3 = _mm_add_epi16(out_r3, res_r3);
1693
1694 _mm_storeu_si128((__m128i *) (&pi2_res[0 * i4_res_stride]), out_r0);
1695 _mm_storeu_si128((__m128i *) (&pi2_res[1 * i4_res_stride]), out_r1);
1696 _mm_storeu_si128((__m128i *) (&pi2_res[2 * i4_res_stride]), out_r2);
1697 _mm_storeu_si128((__m128i *) (&pi2_res[3 * i4_res_stride]), out_r3);
1698
1699 pred_r0 = _mm_cvtepu16_epi32(pred_r0);
1700 pred_r1 = _mm_cvtepu16_epi32(pred_r1);
1701 pred_r2 = _mm_cvtepu16_epi32(pred_r2);
1702 pred_r3 = _mm_cvtepu16_epi32(pred_r3);
1703
1704 resq_r0 = _mm_add_epi16(pred_r0, res_r0);
1705 resq_r1 = _mm_add_epi16(pred_r1, res_r1);
1706 resq_r2 = _mm_add_epi16(pred_r2, res_r2);
1707 resq_r3 = _mm_add_epi16(pred_r3, res_r3);
1708
1709 temp0 = _mm_packus_epi32(resq_r0, resq_r1);
1710 temp1 = _mm_packus_epi32(resq_r2, resq_r3);
1711
1712 /* Clipping the results to 8 bits */
1713 mask_r0 = _mm_cmpgt_epi16(temp0, zero_8x16b);
1714 temp0 = _mm_and_si128(temp0, mask_r0);
1715 mask_r0 = _mm_cmpgt_epi16(temp1, zero_8x16b);
1716 temp1 = _mm_and_si128(temp1, mask_r0);
1717
1718 resq_r0 = _mm_packus_epi16(temp0, temp1);
1719 resq_r1 = _mm_srli_si128(resq_r0, 4);
1720 resq_r2 = _mm_srli_si128(resq_r1, 4);
1721 resq_r3 = _mm_srli_si128(resq_r2, 4);
1722
1723 resq_r0 = _mm_cvtepu8_epi16(resq_r0);
1724 resq_r1 = _mm_cvtepu8_epi16(resq_r1);
1725 resq_r2 = _mm_cvtepu8_epi16(resq_r2);
1726 resq_r3 = _mm_cvtepu8_epi16(resq_r3);
1727
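    /* Mask the interleaved out buf in order to save the U/V out pixel computed in
       this function call without thrashing the U/V out pixel that was saved
       during an earlier function call */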
1728 chroma_mask = _mm_set1_epi16(0xFF00);
1729 out_r0 = _mm_loadl_epi64((__m128i *) (&pu1_out[0 * i4_out_stride]));
1730 out_r1 = _mm_loadl_epi64((__m128i *) (&pu1_out[1 * i4_out_stride]));
1731 out_r2 = _mm_loadl_epi64((__m128i *) (&pu1_out[2 * i4_out_stride]));
1732 out_r3 = _mm_loadl_epi64((__m128i *) (&pu1_out[3 * i4_out_stride]));
1733
1734 out_r0 = _mm_and_si128(out_r0, chroma_mask);
1735 out_r1 = _mm_and_si128(out_r1, chroma_mask);
1736 out_r2 = _mm_and_si128(out_r2, chroma_mask);
1737 out_r3 = _mm_and_si128(out_r3, chroma_mask);
1738
1739 out_r0 = _mm_add_epi8(out_r0, resq_r0);
1740 out_r1 = _mm_add_epi8(out_r1, resq_r1);
1741 out_r2 = _mm_add_epi8(out_r2, resq_r2);
1742 out_r3 = _mm_add_epi8(out_r3, resq_r3);
1743
1744 _mm_storel_epi64((__m128i *) (&pu1_out[0 * i4_out_stride]), out_r0);
1745 _mm_storel_epi64((__m128i *) (&pu1_out[1 * i4_out_stride]), out_r1);
1746 _mm_storel_epi64((__m128i *) (&pu1_out[2 * i4_out_stride]), out_r2);
1747 _mm_storel_epi64((__m128i *) (&pu1_out[3 * i4_out_stride]), out_r3);
1748 }
1749
1750 void isvc_iquant_itrans_recon_dc_4x4_sse42(buffer_container_t *ps_src, buffer_container_t *ps_pred,
1751 buffer_container_t *ps_res_pred,
1752 buffer_container_t *ps_res, buffer_container_t *ps_rec,
1753 iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants,
1754 WORD16 *pi2_tmp, WORD16 *pi2_dc_src,
1755 WORD32 i4_iq_start_idx, UWORD8 u1_res_accumulate)
1756 {
1757 UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
1758 UWORD8 *pu1_out = (UWORD8 *) ps_rec->pv_data;
1759 WORD32 i4_pred_stride = ps_pred->i4_data_stride;
1760 WORD32 i4_out_stride = ps_rec->i4_data_stride;
1761 const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat;
1762 const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat;
1763 UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6;
1764 UWORD32 *pu4_out = (UWORD32 *) pu1_out;
1765 WORD32 q0 = ((WORD16 *) (ps_src->pv_data))[0];
1766 WORD16 i_macro, rnd_fact = (u4_qp_div_6 < 4) ? 1 << (3 - u4_qp_div_6) : 0;
1767
1768 __m128i pred_r0, pred_r1, pred_r2, pred_r3;
1769 __m128i sign_reg;
1770 /* all bits reset to zero */
1771 __m128i zero_8x16b = _mm_setzero_si128();
1772 __m128i temp4, temp5, temp6, temp7;
1773 __m128i value_add;
1774
1775 ASSERT(0 == u1_res_accumulate);
1776
1777 UNUSED(pi2_tmp);
1778 UNUSED(ps_res);
1779 UNUSED(ps_res_pred);
1780 UNUSED(u1_res_accumulate);
1781
1782 INV_QUANT(q0, pu2_iscal_mat[0], pu2_weigh_mat[0], u4_qp_div_6, rnd_fact, 4);
1783
1784 /* Restoring dc value for intra case */
1785 if(i4_iq_start_idx != 0)
1786 {
1787 q0 = pi2_dc_src[0];
1788 }
1789
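    /*
     * With only the DC coefficient present, every inverse-transformed sample
     * equals (q0 + 32) >> 6, so reconstruction reduces to adding one constant
     * to the prediction block. Illustrative scalar form (sketch only):
     *
     *   for(i = 0; i < 4; i++)
     *       for(j = 0; j < 4; j++)
     *       {
     *           WORD32 val = pu1_pred[i * i4_pred_stride + j] + i_macro;
     *
     *           pu1_out[i * i4_out_stride + j] = (val < 0) ? 0 : ((val > 255) ? 255 : val);
     *       }
     */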
1790 i_macro = ((q0 + 32) >> 6);
1791
1792 value_add = _mm_set1_epi16(i_macro);
1793
1794 zero_8x16b = _mm_setzero_si128();
1795
1796 /* Load pred buffer */
1797
1798 /* p00 p01 p02 p03 0 0 0 0 -- all 8 bits */
1799 pred_r0 = _mm_loadl_epi64((__m128i *) (&pu1_pred[0]));
1800
1801 /* p10 p11 p12 p13 0 0 0 0 -- all 8 bits */
1802 pred_r1 = _mm_loadl_epi64((__m128i *) (&pu1_pred[i4_pred_stride]));
1803
1804 /* p20 p21 p22 p23 0 0 0 0 -- all 8 bits */
1805 pred_r2 = _mm_loadl_epi64((__m128i *) (&pu1_pred[2 * i4_pred_stride]));
1806
1807 /* p30 p31 p32 p33 0 0 0 0 -- all 8 bits */
1808 pred_r3 = _mm_loadl_epi64((__m128i *) (&pu1_pred[3 * i4_pred_stride]));
1809
1810 pred_r0 = _mm_cvtepu8_epi16(pred_r0);
1811 pred_r1 = _mm_cvtepu8_epi16(pred_r1);
1812 pred_r2 = _mm_cvtepu8_epi16(pred_r2);
1813 pred_r3 = _mm_cvtepu8_epi16(pred_r3);
1814
1815 pred_r0 = _mm_unpacklo_epi64(pred_r0, pred_r1);
1816 pred_r2 = _mm_unpacklo_epi64(pred_r2, pred_r3);
1817
1818 temp4 = _mm_add_epi16(value_add, pred_r0);
1819 temp5 = _mm_add_epi16(value_add, pred_r2);
1820 /*------------------------------------------------------------------*/
1821 /* Clipping the results to 8 bits */
1822 sign_reg = _mm_cmpgt_epi16(temp4, zero_8x16b);
1823 temp4 = _mm_and_si128(temp4, sign_reg);
1824 sign_reg = _mm_cmpgt_epi16(temp5, zero_8x16b);
1825 temp5 = _mm_and_si128(temp5, sign_reg);
1826
1827 temp4 = _mm_packus_epi16(temp4, temp5);
1828 temp5 = _mm_srli_si128(temp4, 4);
1829 temp6 = _mm_srli_si128(temp5, 4);
1830 temp7 = _mm_srli_si128(temp6, 4);
1831
1832 *pu4_out = _mm_cvtsi128_si32(temp4);
1833 pu1_out += i4_out_stride;
1834 pu4_out = (UWORD32 *) (pu1_out);
1835 *(pu4_out) = _mm_cvtsi128_si32(temp5);
1836 pu1_out += i4_out_stride;
1837 pu4_out = (UWORD32 *) (pu1_out);
1838 *(pu4_out) = _mm_cvtsi128_si32(temp6);
1839 pu1_out += i4_out_stride;
1840 pu4_out = (UWORD32 *) (pu1_out);
1841 *(pu4_out) = _mm_cvtsi128_si32(temp7);
1842 }
1843
1844 void isvc_iquant_itrans_recon_res_chroma_4x4_dc_sse42(
1845 buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_res_pred,
1846 buffer_container_t *ps_res, buffer_container_t *ps_rec,
1847 iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants, WORD16 *pi2_tmp, WORD16 *pi2_dc_src,
1848 WORD32 i4_iq_start_idx, UWORD8 u1_res_accumulate)
1849 {
1850 WORD16 *pi2_src = (WORD16 *) ps_src->pv_data;
1851 WORD16 *pi2_res = (WORD16 *) ps_res->pv_data;
1852 WORD16 *pi2_res_ptr = pi2_res;
1853 UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
1854 UWORD8 *pu1_out = (UWORD8 *) ps_rec->pv_data;
1855 WORD32 i4_res_stride = ps_res->i4_data_stride;
1856 WORD32 i4_pred_stride = ps_pred->i4_data_stride;
1857 WORD32 i4_out_stride = ps_rec->i4_data_stride;
1858 const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat;
1859 const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat;
1860 UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6;
1861 /* DC value won't be dequantized for chroma
1862 inverse transform */
1863 WORD16 q0 = pi2_dc_src[0];
1864 WORD16 i_macro = ((q0 + 32) >> 6);
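    /* DC-only path: the inverse transform yields a single residual value,
       derived from i_macro, which is broadcast to all 16 samples of the block */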
1865
1866 __m128i pred_r0, pred_r1, pred_r2, pred_r3, sign_reg;
1867 /* all bits reset to zero */
1868 __m128i zero_8x16b = _mm_setzero_si128();
1869 __m128i chroma_mask = _mm_set1_epi16(0xFF);
1870 __m128i value_add = _mm_set1_epi16(isvc_get_residue(i_macro, 0, 0));
1871 __m128i out_r0, out_r1, out_r2, out_r3;
1872
1873 ASSERT(0 == u1_res_accumulate);
1874
1875 UNUSED(pi2_src);
1876 UNUSED(pu2_iscal_mat);
1877 UNUSED(pu2_weigh_mat);
1878 UNUSED(u4_qp_div_6);
1879 UNUSED(pi2_tmp);
1880 UNUSED(ps_res_pred);
1881 UNUSED(i4_iq_start_idx);
1882 UNUSED(u1_res_accumulate);
1883
1884 /* Load pred buffer */
1885 pred_r0 = _mm_loadl_epi64((__m128i *) (&pu1_pred[0]));
1886
1887 pred_r1 = _mm_loadl_epi64((__m128i *) (&pu1_pred[i4_pred_stride]));
1888
1889 pred_r2 = _mm_loadl_epi64((__m128i *) (&pu1_pred[2 * i4_pred_stride]));
1890
1891 pred_r3 = _mm_loadl_epi64((__m128i *) (&pu1_pred[3 * i4_pred_stride]));
1892
1893 /* Mask alternate pred values from the interleaved pred buf */
1894 pred_r0 = _mm_and_si128(pred_r0, chroma_mask);
1895 pred_r1 = _mm_and_si128(pred_r1, chroma_mask);
1896 pred_r2 = _mm_and_si128(pred_r2, chroma_mask);
1897 pred_r3 = _mm_and_si128(pred_r3, chroma_mask);
1898
1899 /* Pack the first four 16 bit values of 2 regs into a single reg*/
1900 pred_r0 = _mm_unpacklo_epi64(pred_r0, pred_r1);
1901 pred_r2 = _mm_unpacklo_epi64(pred_r2, pred_r3);
1902
1903 /* Compute out pixel by adding res to pred */
1904 pred_r0 = _mm_add_epi16(value_add, pred_r0);
1905 pred_r2 = _mm_add_epi16(value_add, pred_r2);
1906
1907 /* Convert res from 16 bits to 32 bits */
1908 value_add = _mm_cvtepu16_epi32(value_add);
1909
1910 out_r0 = _mm_loadu_si128((__m128i *) (&pi2_res_ptr[0 * i4_res_stride]));
1911 out_r1 = _mm_loadu_si128((__m128i *) (&pi2_res_ptr[1 * i4_res_stride]));
1912 out_r2 = _mm_loadu_si128((__m128i *) (&pi2_res_ptr[2 * i4_res_stride]));
1913 out_r3 = _mm_loadu_si128((__m128i *) (&pi2_res_ptr[3 * i4_res_stride]));
1914
1915 /* Mask the loaded res in order to save the U/V res data computed in
1916 this function call without thrashing the U/V res data that was saved
1917 during an earlier function call */
1918 chroma_mask = _mm_set1_epi32(0xffff0000);
1919 out_r0 = _mm_and_si128(out_r0, chroma_mask);
1920 out_r1 = _mm_and_si128(out_r1, chroma_mask);
1921 out_r2 = _mm_and_si128(out_r2, chroma_mask);
1922 out_r3 = _mm_and_si128(out_r3, chroma_mask);
1923
1924 /* Save the res in alternate locations */
1925 out_r0 = _mm_add_epi16(out_r0, value_add);
1926 out_r1 = _mm_add_epi16(out_r1, value_add);
1927 out_r2 = _mm_add_epi16(out_r2, value_add);
1928 out_r3 = _mm_add_epi16(out_r3, value_add);
1929
1930 _mm_storeu_si128((__m128i *) (&pi2_res_ptr[0 * i4_res_stride]), out_r0);
1931 _mm_storeu_si128((__m128i *) (&pi2_res_ptr[1 * i4_res_stride]), out_r1);
1932 _mm_storeu_si128((__m128i *) (&pi2_res_ptr[2 * i4_res_stride]), out_r2);
1933 _mm_storeu_si128((__m128i *) (&pi2_res_ptr[3 * i4_res_stride]), out_r3);
1934 /*------------------------------------------------------------------*/
1935 /* Clipping the results to 8 bits */
1936 sign_reg = _mm_cmpgt_epi16(pred_r0, zero_8x16b);
1937 pred_r0 = _mm_and_si128(pred_r0, sign_reg);
1938 sign_reg = _mm_cmpgt_epi16(pred_r2, zero_8x16b);
1939 pred_r2 = _mm_and_si128(pred_r2, sign_reg);
1940
1941 pred_r0 = _mm_packus_epi16(pred_r0, pred_r2);
1942 pred_r1 = _mm_srli_si128(pred_r0, 4);
1943 pred_r2 = _mm_srli_si128(pred_r1, 4);
1944 pred_r3 = _mm_srli_si128(pred_r2, 4);
1945
1946 /* p00 p01 p02 p03 -- all 16 bits */
1947 pred_r0 = _mm_unpacklo_epi8(pred_r0, zero_8x16b);
1948 /* p10 p11 p12 p13 -- all 16 bits */
1949 pred_r1 = _mm_unpacklo_epi8(pred_r1, zero_8x16b);
1950 /* p20 p21 p22 p23 -- all 16 bits */
1951 pred_r2 = _mm_unpacklo_epi8(pred_r2, zero_8x16b);
1952 /* p30 p31 p32 p33 -- all 16 bits */
1953 pred_r3 = _mm_unpacklo_epi8(pred_r3, zero_8x16b);
1954
1955 /* Load interleaved out buffer */
1956 out_r0 = _mm_loadl_epi64((__m128i *) (&pu1_out[0]));
1957 out_r1 = _mm_loadl_epi64((__m128i *) (&pu1_out[i4_out_stride]));
1958 out_r2 = _mm_loadl_epi64((__m128i *) (&pu1_out[2 * i4_out_stride]));
1959 out_r3 = _mm_loadl_epi64((__m128i *) (&pu1_out[3 * i4_out_stride]));
1960
1961 /* Mask the interleaved out buf in order to save the U/V out pixel computed in
1962 this function call without thrashing the U/V out pixel that was saved
1963 during an earlier function call */
1964 chroma_mask = _mm_set1_epi16(0xFF00);
1965
1966 out_r0 = _mm_and_si128(out_r0, chroma_mask);
1967 out_r1 = _mm_and_si128(out_r1, chroma_mask);
1968 out_r2 = _mm_and_si128(out_r2, chroma_mask);
1969 out_r3 = _mm_and_si128(out_r3, chroma_mask);
1970
1971 /* Save the out pixels in alternate locations */
1972 out_r0 = _mm_add_epi8(out_r0, pred_r0);
1973 out_r1 = _mm_add_epi8(out_r1, pred_r1);
1974 out_r2 = _mm_add_epi8(out_r2, pred_r2);
1975 out_r3 = _mm_add_epi8(out_r3, pred_r3);
1976
1977 _mm_storel_epi64((__m128i *) (&pu1_out[0]), out_r0);
1978 _mm_storel_epi64((__m128i *) (&pu1_out[i4_out_stride]), out_r1);
1979 _mm_storel_epi64((__m128i *) (&pu1_out[2 * i4_out_stride]), out_r2);
1980 _mm_storel_epi64((__m128i *) (&pu1_out[3 * i4_out_stride]), out_r3);
1981 }
1982
1983 void isvc_iquant_itrans_recon_res_chroma_4x4_dc_with_res_acc_sse42(
1984 buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_res_pred,
1985 buffer_container_t *ps_res, buffer_container_t *ps_rec,
1986 iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants, WORD16 *pi2_tmp, WORD16 *pi2_dc_src,
1987 WORD32 i4_iq_start_idx, UWORD8 u1_res_accumulate)
1988 {
1989 WORD16 *pi2_src = (WORD16 *) ps_src->pv_data;
1990 WORD16 *pi2_res = (WORD16 *) ps_res->pv_data;
1991 WORD16 *pi2_res_pred = (WORD16 *) ps_res_pred->pv_data;
1992 UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
1993 UWORD8 *pu1_out = (UWORD8 *) ps_rec->pv_data;
1994 WORD32 i4_res_stride = ps_res->i4_data_stride;
1995 WORD32 i4_res_pred_stride = ps_res_pred->i4_data_stride;
1996 WORD32 i4_pred_stride = ps_pred->i4_data_stride;
1997 WORD32 i4_out_stride = ps_rec->i4_data_stride;
1998 const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat;
1999 const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat;
2000 UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6;
2001 /* DC value won't be dequantized for chroma
2002 inverse transform */
2003 WORD16 q0 = pi2_dc_src[0];
2004 WORD16 i_macro = ((q0 + 32) >> 6);
2005
2006 __m128i pred_r0, pred_r1, pred_r2, pred_r3;
2007 /* all bits reset to zero */
2008 __m128i zero_8x16b = _mm_setzero_si128();
2009 __m128i chroma_mask = _mm_set1_epi16(0xFF);
2010 __m128i reg_chroma = _mm_set_epi16(0, 0xFFFF, 0, 0xFFFF, 0, 0xFFFF, 0, 0xFFFF);
2011 __m128i value_add = _mm_set1_epi16(i_macro);
2012 __m128i out_r0, out_r1, out_r2, out_r3;
2013 __m128i res_r0, res_r1, res_r2, res_r3;
2014 __m128i res_pred_r0, res_pred_r1, res_pred_r2, res_pred_r3;
2015 __m128i temp0, temp1;
2016 __m128i neg_255_8x16b = _mm_set1_epi16(-((WORD16) UINT8_MAX));
2017 __m128i pos_255_8x16b = _mm_set1_epi16(((WORD16) UINT8_MAX));
2018
2019 ASSERT(1 == u1_res_accumulate);
2020
2021 UNUSED(pi2_src);
2022 UNUSED(pu2_iscal_mat);
2023 UNUSED(pu2_weigh_mat);
2024 UNUSED(u4_qp_div_6);
2025 UNUSED(pi2_tmp);
2026 UNUSED(i4_iq_start_idx);
2027 UNUSED(u1_res_accumulate);
2028
2029 /* Load pred buffer */
2030 pred_r0 = _mm_loadl_epi64((__m128i *) (&pu1_pred[0]));
2031
2032 pred_r1 = _mm_loadl_epi64((__m128i *) (&pu1_pred[i4_pred_stride]));
2033
2034 pred_r2 = _mm_loadl_epi64((__m128i *) (&pu1_pred[2 * i4_pred_stride]));
2035
2036 pred_r3 = _mm_loadl_epi64((__m128i *) (&pu1_pred[3 * i4_pred_stride]));
2037 /* Mask alternate pred values from the interleaved pred buf */
2038 pred_r0 = _mm_and_si128(pred_r0, chroma_mask);
2039 pred_r1 = _mm_and_si128(pred_r1, chroma_mask);
2040 pred_r2 = _mm_and_si128(pred_r2, chroma_mask);
2041 pred_r3 = _mm_and_si128(pred_r3, chroma_mask);
2042
2043 /* Pack the first four 16 bit values of 2 regs into a single reg*/
2044 pred_r0 = _mm_unpacklo_epi64(pred_r0, pred_r1);
2045 pred_r2 = _mm_unpacklo_epi64(pred_r2, pred_r3);
2046
2047 /* Accumulating res */
2048
2049 /* load res pred buffer */
2050 res_pred_r0 = _mm_loadu_si128((__m128i *) &pi2_res_pred[0 * i4_res_pred_stride]);
2051 res_pred_r1 = _mm_loadu_si128((__m128i *) &pi2_res_pred[1 * i4_res_pred_stride]);
2052 res_pred_r2 = _mm_loadu_si128((__m128i *) &pi2_res_pred[2 * i4_res_pred_stride]);
2053 res_pred_r3 = _mm_loadu_si128((__m128i *) &pi2_res_pred[3 * i4_res_pred_stride]);
2054
2055 /* Mask res pred and retain alternate values */
2056 res_pred_r0 = _mm_and_si128(res_pred_r0, reg_chroma);
2057 res_pred_r1 = _mm_and_si128(res_pred_r1, reg_chroma);
2058 res_pred_r2 = _mm_and_si128(res_pred_r2, reg_chroma);
2059 res_pred_r3 = _mm_and_si128(res_pred_r3, reg_chroma);
2060
2061 /* Convert to 32 bits */
2062 res_r0 = _mm_cvtepu16_epi32(value_add);
2063 res_r2 = _mm_cvtepu16_epi32(value_add);
2064 res_r1 = _mm_cvtepu16_epi32(value_add);
2065 res_r3 = _mm_cvtepu16_epi32(value_add);
2066
2067 /* Add res pred to the res obtained from inv transform */
2068 res_r0 = _mm_add_epi16(res_pred_r0, res_r0);
2069 res_r1 = _mm_add_epi16(res_pred_r1, res_r1);
2070 res_r2 = _mm_add_epi16(res_pred_r2, res_r2);
2071 res_r3 = _mm_add_epi16(res_pred_r3, res_r3);
2072
2073 /* Convert 32 bit res of the format [a0 0 a1 0 a2 0 a3 0] to
2074     16 bits of the format [a0 a1 a2 a3] using hadd [a0 + 0,
2075     a1 + 0, a2 + 0, a3 + 0]. To be optimized */
2076 temp0 = _mm_hadd_epi16(res_r0, res_r1);
2077 temp1 = _mm_hadd_epi16(res_r2, res_r3);
2078
2079 /* Saturate all values < -255 to -255 and retain the rest as it is */
2080 temp0 = _mm_max_epi16(temp0, neg_255_8x16b);
2081 /* Saturate all values > 255 to 255 and retain the rest as it is */
2082 temp0 = _mm_min_epi16(temp0, pos_255_8x16b);
2083
2084 /* Saturate all values < -255 to -255 and retain the rest as it is */
2085 temp1 = _mm_max_epi16(temp1, neg_255_8x16b);
2086 /* Saturate all values > 255 to 255 and retain the rest as it is */
2087 temp1 = _mm_min_epi16(temp1, pos_255_8x16b);
2088
2089 /* Compute out pixel by adding res to pred */
2090 pred_r0 = _mm_add_epi16(temp0, pred_r0);
2091 pred_r2 = _mm_add_epi16(temp1, pred_r2);
2092
2093 res_r0 = _mm_cvtepu16_epi32(temp0);
2094 res_r2 = _mm_cvtepu16_epi32(temp1);
2095 res_r1 = _mm_srli_si128(temp0, 8);
2096 res_r3 = _mm_srli_si128(temp1, 8);
2097 res_r1 = _mm_cvtepu16_epi32(res_r1);
2098 res_r3 = _mm_cvtepu16_epi32(res_r3);
2099
2100 /* Load res buffer */
2101 out_r0 = _mm_loadu_si128((__m128i *) (&pi2_res[0 * i4_res_stride]));
2102 out_r1 = _mm_loadu_si128((__m128i *) (&pi2_res[1 * i4_res_stride]));
2103 out_r2 = _mm_loadu_si128((__m128i *) (&pi2_res[2 * i4_res_stride]));
2104 out_r3 = _mm_loadu_si128((__m128i *) (&pi2_res[3 * i4_res_stride]));
2105
2106 /* Mask the loaded res in order to save the U/V res data computed in
2107 this function call without thrashing the U/V res data that was saved
2108 during an earlier function call */
2109 chroma_mask = _mm_set1_epi32(0xffff0000);
2110
2111 out_r0 = _mm_and_si128(out_r0, chroma_mask);
2112 out_r1 = _mm_and_si128(out_r1, chroma_mask);
2113 out_r2 = _mm_and_si128(out_r2, chroma_mask);
2114 out_r3 = _mm_and_si128(out_r3, chroma_mask);
2115
2116 /* Save the res in alternate locations */
2117 out_r0 = _mm_add_epi16(out_r0, res_r0);
2118 out_r1 = _mm_add_epi16(out_r1, res_r1);
2119 out_r2 = _mm_add_epi16(out_r2, res_r2);
2120 out_r3 = _mm_add_epi16(out_r3, res_r3);
2121
2122 _mm_storeu_si128((__m128i *) (&pi2_res[0 * i4_res_stride]), out_r0);
2123 _mm_storeu_si128((__m128i *) (&pi2_res[1 * i4_res_stride]), out_r1);
2124 _mm_storeu_si128((__m128i *) (&pi2_res[2 * i4_res_stride]), out_r2);
2125 _mm_storeu_si128((__m128i *) (&pi2_res[3 * i4_res_stride]), out_r3);
2126 /*------------------------------------------------------------------*/
2127 /* Clipping the results to 8 bits */
2128 pred_r0 = _mm_packus_epi16(pred_r0, pred_r2);
2129 pred_r1 = _mm_srli_si128(pred_r0, 4);
2130 pred_r2 = _mm_srli_si128(pred_r1, 4);
2131 pred_r3 = _mm_srli_si128(pred_r2, 4);
2132
2133 /* p00 p01 p02 p03 -- all 16 bits */
2134 pred_r0 = _mm_unpacklo_epi8(pred_r0, zero_8x16b);
2135 /* p10 p11 p12 p13 -- all 16 bits */
2136 pred_r1 = _mm_unpacklo_epi8(pred_r1, zero_8x16b);
2137 /* p20 p21 p22 p23 -- all 16 bits */
2138 pred_r2 = _mm_unpacklo_epi8(pred_r2, zero_8x16b);
2139 /* p30 p31 p32 p33 -- all 16 bits */
2140 pred_r3 = _mm_unpacklo_epi8(pred_r3, zero_8x16b);
2141
2142 /* Load interleaved out buffer */
2143 out_r0 = _mm_loadl_epi64((__m128i *) (&pu1_out[0]));
2144 out_r1 = _mm_loadl_epi64((__m128i *) (&pu1_out[i4_out_stride]));
2145 out_r2 = _mm_loadl_epi64((__m128i *) (&pu1_out[2 * i4_out_stride]));
2146 out_r3 = _mm_loadl_epi64((__m128i *) (&pu1_out[3 * i4_out_stride]));
2147
2148 /* Mask the interleaved out buf in order to save the U/V out pixel computed in
2149 this function call without thrashing the U/V out pixel that was saved
2150 during an earlier function call */
2151 chroma_mask = _mm_set1_epi16(0xFF00);
2152
2153 out_r0 = _mm_and_si128(out_r0, chroma_mask);
2154 out_r1 = _mm_and_si128(out_r1, chroma_mask);
2155 out_r2 = _mm_and_si128(out_r2, chroma_mask);
2156 out_r3 = _mm_and_si128(out_r3, chroma_mask);
2157
2158 /* Save the out pixels in alternate locations */
2159 out_r0 = _mm_add_epi8(out_r0, pred_r0);
2160 out_r1 = _mm_add_epi8(out_r1, pred_r1);
2161 out_r2 = _mm_add_epi8(out_r2, pred_r2);
2162 out_r3 = _mm_add_epi8(out_r3, pred_r3);
2163
2164 _mm_storel_epi64((__m128i *) (&pu1_out[0]), out_r0);
2165 _mm_storel_epi64((__m128i *) (&pu1_out[i4_out_stride]), out_r1);
2166 _mm_storel_epi64((__m128i *) (&pu1_out[2 * i4_out_stride]), out_r2);
2167 _mm_storel_epi64((__m128i *) (&pu1_out[3 * i4_out_stride]), out_r3);
2168 }
2169