1 /******************************************************************************
2 *
3 * Copyright (C) 2022 The Android Open Source Project
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at:
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 *
17 *****************************************************************************
18 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19 */
20 /**
21 *******************************************************************************
22 * @file
23 * isvc_iquant_itrans_recon_sse42.c
24 *
25 * @brief
26 * Contains function definitions for inverse quantization, inverse
27 * transform and reconstruction
28 *
29 * @author
30 * Mohit [100664]
31 *
32 * @par List of Functions:
33 * - isvc_iquant_itrans_recon_4x4_sse42()
34 * - isvc_iquant_itrans_recon_chroma_4x4_sse42()
35 *
36 * @remarks
37 * None
38 *
39 *******************************************************************************
40 */
41 #include <immintrin.h>
42
43 #include "ih264_typedefs.h"
44 #include "ih264_debug.h"
45 #include "ih264_defs.h"
46 #include "ih264_trans_macros.h"
47 #include "ih264_macros.h"
48 #include "ih264_platform_macros.h"
49 #include "ih264_trans_data.h"
50 #include "ih264_size_defs.h"
51 #include "isvc_structs.h"
52 #include "isvc_trans_quant_itrans_iquant.h"
53
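/*
 * Informal overview (editorial sketch; clip_u8 and itrans4x4 are notation
 * used only in this comment, not functions defined in this project): each
 * kernel in this file computes, per 4x4 block,
 *
 *     recon = clip_u8(pred + itrans4x4(iquant(src)))
 *
 * where iquant() multiplies the coefficients by the scaling list and the
 * norm-adjust weights and applies the 2^(qp/6) scale, and itrans4x4() is the
 * H.264 4x4 inverse core transform followed by the (x + 32) >> 6 rounding.
 */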
54 /*
55 ********************************************************************************
56 *
57 * @brief This function reconstructs a 4x4 sub block from quantized residue and
58 * prediction buffer
59 *
60 * @par Description:
61 * The quantized residue is first inverse quantized, then inverse transformed.
62 * This inverse transformed content is added to the prediction buffer to recon-
63 * struct the end output
64 *
65 * @param[in] pi2_src
66 * quantized 4x4 block
67 *
68 * @param[in] pu1_pred
69 * prediction 4x4 block
70 *
71 * @param[out] pu1_out
72 * reconstructed 4x4 block
73 *
74 * @param[in] src_strd
75 * quantization buffer stride
76 *
77 * @param[in] i4_pred_stride
78 * Prediction buffer stride
79 *
80 * @param[in] i4_out_stride
81 * recon buffer Stride
82 *
83 * @param[in] pu2_scaling_list
84 * pointer to scaling list
85 *
86 * @param[in] pu2_norm_adjust
87 * pointer to inverse scale matrix
88 *
89 * @param[in] u4_qp_div_6
90 * Floor (qp/6)
91 *
92 * @param[in] pi4_tmp
93 * temporary buffer of size 1*16
94 *
95 * @returns none
96 *
97 * @remarks none
98 *
99 *******************************************************************************
100 */
101
102 void isvc_iquant_itrans_recon_4x4_sse42(buffer_container_t *ps_src, buffer_container_t *ps_pred,
103 buffer_container_t *ps_res_pred, buffer_container_t *ps_res,
104 buffer_container_t *ps_rec,
105 iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants,
106 WORD16 *pi2_tmp, WORD16 *pi2_dc_src, WORD32 i4_iq_start_idx,
107 UWORD8 u1_res_accumulate)
108 {
109 WORD16 *pi2_src = (WORD16 *) ps_src->pv_data;
110 WORD16 *pi2_tmp_ptr = pi2_tmp;
111 UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
112 UWORD8 *pu1_out = (UWORD8 *) ps_rec->pv_data;
113 WORD32 i4_src_stride = ps_src->i4_data_stride;
114 WORD32 i4_pred_stride = ps_pred->i4_data_stride;
115 WORD32 i4_out_stride = ps_rec->i4_data_stride;
116 const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat;
117 const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat;
118 UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6;
119 UWORD32 *pu4_out = (UWORD32 *) pu1_out;
120 __m128i src_r0_r1, src_r2_r3;
121 __m128i src_r0, src_r1, src_r2, src_r3;
122 __m128i scalemat_r0_r1, scalemat_r2_r3;
123 __m128i pred_r0, pred_r1, pred_r2, pred_r3;
124 __m128i sign_reg, dequant_r0_r1, dequant_r2_r3;
125 /* all bits reset to zero */
126 __m128i zero_8x16b = _mm_setzero_si128();
127 __m128i neg_255_8x16b = _mm_set1_epi16(-((WORD16) UINT8_MAX));
128 __m128i pos_255_8x16b = _mm_set1_epi16(((WORD16) UINT8_MAX));
129 __m128i temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
130 __m128i resq_r0, resq_r1, resq_r2, resq_r3;
131 __m128i add_rshift = _mm_set1_epi32((u4_qp_div_6 < 4) ? (1 << (3 - u4_qp_div_6)) : 0);
132 __m128i value_32 = _mm_set1_epi32(32);
133
134 ASSERT(4 == i4_src_stride);
135 ASSERT(0 == u1_res_accumulate);
136
137 UNUSED(i4_src_stride);
138 UNUSED(ps_res);
139 UNUSED(ps_res_pred);
140 UNUSED(u1_res_accumulate);
141
142 /*************************************************************/
143 /* Dequantization of coefficients using SSE4.2 integer      */
144 /* operations                                                */
145 /*************************************************************/
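/* Scalar sketch of the dequant step implemented with intrinsics below
   (pi4_dequant[] is a hypothetical output array, for illustration only):

       for(i = 0; i < 16; i++)
       {
           WORD32 q = pi2_src[i] * pu2_iscal_mat[i] * pu2_weigh_mat[i];

           if(u4_qp_div_6 >= 4)
               pi4_dequant[i] = q << (u4_qp_div_6 - 4);
           else
               pi4_dequant[i] = (q + (1 << (3 - u4_qp_div_6))) >> (4 - u4_qp_div_6);
       }

   When i4_iq_start_idx == 1, element 0 is then overwritten with the DC value
   from pi2_dc_src[0]. */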
146
147 /* a00 a01 a02 a03 a10 a11 a12 a13 -- the source
148 matrix 0th,1st row */
149 src_r0_r1 = _mm_loadu_si128((__m128i *) (pi2_src));
150
151 /* a20 a21 a22 a23 a30 a31 a32 a33 -- the
152 source matrix 2nd,3rd row */
153 src_r2_r3 = _mm_loadu_si128((__m128i *) (pi2_src + 8));
154
155 /* b00 b01 b02 b03 b10 b11 b12 b13 -- the
156 scaling matrix 0th,1st row */
157 scalemat_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_iscal_mat));
158
159 /* b20 b21 b22 b23 b30 b31 b32 b33 -- the
160 scaling matrix 2nd,3rd row */
161 scalemat_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_iscal_mat + 8));
162
163 /* q00 q01 q02 q03 q10 q11
164 q12 q13 -- all 16 bits */
165 dequant_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_weigh_mat));
166
167 /* q20 q21 q22 q23 q30 q31
168 q32 q33 -- all 16 bits */
169 dequant_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_weigh_mat + 8));
170
171 /* b00*q00 b01*q01 b02*q02 b03*q03 b10*q10 b11*q11
172 b12*q12 b13*q13 -- 16 bit result */
173 temp0 = _mm_mullo_epi16(scalemat_r0_r1, dequant_r0_r1);
174
175 /* b20*q20 b21*q21 b22*q22 b23*q23 b30*q30 b31*q31
176 b32*q32 b33*q33 -- 16 bit result */
177 temp1 = _mm_mullo_epi16(scalemat_r2_r3, dequant_r2_r3);
178
179 /* b00*q00 0 b01*q01 0 b02*q02 0 b03*q03 0 -- 16 bit long */
180 temp4 = _mm_unpacklo_epi16(temp0, zero_8x16b);
181
182 /* b10*q10 0 b11*q11 0 b12*q12 0 b13*q13 0 -- 16 bit long */
183 temp5 = _mm_unpackhi_epi16(temp0, zero_8x16b);
184
185 /* b20*q20 0 b21*q21 0 b22*q22 0 b23*q23 0 -- 16 bit long */
186 temp6 = _mm_unpacklo_epi16(temp1, zero_8x16b);
187 
188 /* b30*q30 0 b31*q31 0 b32*q32 0 b33*q33 0 -- 16 bit long */
189 temp7 = _mm_unpackhi_epi16(temp1, zero_8x16b);
190
191 /* a00 0 a01 0 a02 0 a03 0 -- 16 bit long */
192 src_r0 = _mm_unpacklo_epi16(src_r0_r1, zero_8x16b);
193 /* a10 0 a11 0 a12 0 a13 0 -- 16 bit long */
194 src_r1 = _mm_unpackhi_epi16(src_r0_r1, zero_8x16b);
195 /* a20 0 a21 0 a22 0 a23 0 -- 16 bit long */
196 src_r2 = _mm_unpacklo_epi16(src_r2_r3, zero_8x16b);
197 /* a30 0 a31 0 a32 0 a33 0 -- 16 bit long */
198 src_r3 = _mm_unpackhi_epi16(src_r2_r3, zero_8x16b);
199
200 temp4 = _mm_madd_epi16(src_r0, temp4);
201 temp5 = _mm_madd_epi16(src_r1, temp5);
202 temp6 = _mm_madd_epi16(src_r2, temp6);
203 temp7 = _mm_madd_epi16(src_r3, temp7);
204
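/* Scale by 2^(u4_qp_div_6 - 4): a plain left shift when u4_qp_div_6 >= 4,
   otherwise a rounded right shift by (4 - u4_qp_div_6) using add_rshift */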
205 if(u4_qp_div_6 >= 4)
206 {
207 resq_r0 = _mm_slli_epi32(temp4, u4_qp_div_6 - 4);
208 resq_r1 = _mm_slli_epi32(temp5, u4_qp_div_6 - 4);
209 resq_r2 = _mm_slli_epi32(temp6, u4_qp_div_6 - 4);
210 resq_r3 = _mm_slli_epi32(temp7, u4_qp_div_6 - 4);
211 }
212 else
213 {
214 temp4 = _mm_add_epi32(temp4, add_rshift);
215 temp5 = _mm_add_epi32(temp5, add_rshift);
216 temp6 = _mm_add_epi32(temp6, add_rshift);
217 temp7 = _mm_add_epi32(temp7, add_rshift);
218 resq_r0 = _mm_srai_epi32(temp4, 4 - u4_qp_div_6);
219 resq_r1 = _mm_srai_epi32(temp5, 4 - u4_qp_div_6);
220 resq_r2 = _mm_srai_epi32(temp6, 4 - u4_qp_div_6);
221 resq_r3 = _mm_srai_epi32(temp7, 4 - u4_qp_div_6);
222 }
223
224 if(i4_iq_start_idx == 1) resq_r0 = _mm_insert_epi32(resq_r0, (WORD32) pi2_dc_src[0], 0);
225 /* Perform Inverse transform */
226 /*-------------------------------------------------------------*/
227 /* IDCT [ Horizontal transformation ] */
228 /*-------------------------------------------------------------*/
229 // Matrix transpose
230 /*
231 * a0 a1 a2 a3
232 * b0 b1 b2 b3
233 * c0 c1 c2 c3
234 * d0 d1 d2 d3
235 */
236
237 /* a0 b0 a1 b1 */
238 temp1 = _mm_unpacklo_epi32(resq_r0, resq_r1);
239 /* c0 d0 c1 d1 */
240 temp3 = _mm_unpacklo_epi32(resq_r2, resq_r3);
241 /* a2 b2 a3 b3 */
242 temp2 = _mm_unpackhi_epi32(resq_r0, resq_r1);
243 /* c2 d2 c3 d3 */
244 temp4 = _mm_unpackhi_epi32(resq_r2, resq_r3);
245 /* a0 b0 c0 d0 */
246 resq_r0 = _mm_unpacklo_epi64(temp1, temp3);
247 /* a1 b1 c1 d1 */
248 resq_r1 = _mm_unpackhi_epi64(temp1, temp3);
249 /* a2 b2 c2 d2 */
250 resq_r2 = _mm_unpacklo_epi64(temp2, temp4);
251 /* a3 b3 c3 d3 */
252 resq_r3 = _mm_unpackhi_epi64(temp2, temp4);
253 /* Transform starts -- horizontal transform */
254 /*------------------------------------------------------------------*/
255 /* z0 = w0 + w2 */
256 temp0 = _mm_add_epi32(resq_r0, resq_r2);
257 /* z1 = w0 - w2 */
258 temp1 = _mm_sub_epi32(resq_r0, resq_r2);
259 /* z2 = (w1 >> 1) - w3 */
260 temp2 = _mm_srai_epi32(resq_r1, 1);
261 temp2 = _mm_sub_epi32(temp2, resq_r3);
262 /* z3 = w1 + (w3 >> 1) */
263 temp3 = _mm_srai_epi32(resq_r3, 1);
264 temp3 = _mm_add_epi32(temp3, resq_r1);
265 /*----------------------------------------------------------*/
266 /* x0 = z0 + z3 */
267 resq_r0 = _mm_add_epi32(temp0, temp3);
268 /* x1 = z1 + z2 */
269 resq_r1 = _mm_add_epi32(temp1, temp2);
270 /* x2 = z1 - z2 */
271 resq_r2 = _mm_sub_epi32(temp1, temp2);
272 /* x3 = z0 - z3 */
273 resq_r3 = _mm_sub_epi32(temp0, temp3);
274
275 // Matrix transpose
276 /*
277 * a0 b0 c0 d0
278 * a1 b1 c1 d1
279 * a2 b2 c2 d2
280 * a3 b3 c3 d3
281 */
282
283 /* a0 a1 b0 b1 */
284 temp1 = _mm_unpacklo_epi32(resq_r0, resq_r1);
285 /* a2 a3 b2 b3 */
286 temp3 = _mm_unpacklo_epi32(resq_r2, resq_r3);
287 /* c0 c1 d0 d1 */
288 temp2 = _mm_unpackhi_epi32(resq_r0, resq_r1);
289 /* c2 c3 d2 d3 */
290 temp4 = _mm_unpackhi_epi32(resq_r2, resq_r3);
291 /* a0 a1 a2 a3 */
292 resq_r0 = _mm_unpacklo_epi64(temp1, temp3);
293 /* b0 b1 b2 b3 */
294 resq_r1 = _mm_unpackhi_epi64(temp1, temp3);
295 /* c0 c1 c2 c3 */
296 resq_r2 = _mm_unpacklo_epi64(temp2, temp4);
297 /* d0 d1 d2 d3 */
298 resq_r3 = _mm_unpackhi_epi64(temp2, temp4);
299 /* Transform ends -- horizontal transform */
300
301 temp0 = _mm_packs_epi32(resq_r0, resq_r1);
302 temp1 = _mm_packs_epi32(resq_r2, resq_r3);
303
304 _mm_storeu_si128((__m128i *) (&pi2_tmp_ptr[0]), temp0);
305 _mm_storeu_si128((__m128i *) (&pi2_tmp_ptr[2 * 4]), temp1);
306
307 /* Load pred buffer */
308 pred_r0 = _mm_loadl_epi64((__m128i *) (&pu1_pred[0]));
309 pred_r1 = _mm_loadl_epi64((__m128i *) (&pu1_pred[i4_pred_stride]));
310 pred_r2 = _mm_loadl_epi64((__m128i *) (&pu1_pred[2 * i4_pred_stride]));
311 pred_r3 = _mm_loadl_epi64((__m128i *) (&pu1_pred[3 * i4_pred_stride]));
312
313 pred_r0 = _mm_cvtepu8_epi16(pred_r0);
314 pred_r1 = _mm_cvtepu8_epi16(pred_r1);
315 pred_r2 = _mm_cvtepu8_epi16(pred_r2);
316 pred_r3 = _mm_cvtepu8_epi16(pred_r3);
317
318 pred_r0 = _mm_unpacklo_epi64(pred_r0, pred_r1);
319 pred_r1 = _mm_unpacklo_epi64(pred_r2, pred_r3);
320
321 /*--------------------------------------------------------------*/
322 /* IDCT [ Vertical transformation] and Xij = (xij + 32)>>6 */
323 /* */
324 /* Add the prediction and store it back to same buffer */
325 /*--------------------------------------------------------------*/
326 /* z0j = y0j + y2j */
327 temp0 = _mm_add_epi32(resq_r0, resq_r2);
328 /* z1j = y0j - y2j */
329 temp1 = _mm_sub_epi32(resq_r0, resq_r2);
330 /* z2j = (y1j>>1) - y3j */
331 temp2 = _mm_srai_epi32(resq_r1, 1);
332 temp2 = _mm_sub_epi32(temp2, resq_r3);
333 /* z3j = y1j + (y3j>>1) */
334 temp3 = _mm_srai_epi32(resq_r3, 1);
335 temp3 = _mm_add_epi32(temp3, resq_r1);
336
337 /* x0j = z0j + z3j */
338 temp4 = _mm_add_epi32(temp0, temp3);
339 temp4 = _mm_add_epi32(temp4, value_32);
340 temp4 = _mm_srai_epi32(temp4, 6);
341 /* x1j = z1j + z2j */
342 temp5 = _mm_add_epi32(temp1, temp2);
343 temp5 = _mm_add_epi32(temp5, value_32);
344 temp5 = _mm_srai_epi32(temp5, 6);
345 /* x2j = z1j - z2j */
346 temp6 = _mm_sub_epi32(temp1, temp2);
347 temp6 = _mm_add_epi32(temp6, value_32);
348 temp6 = _mm_srai_epi32(temp6, 6);
349 /* x3j = z0j - z3j */
350 temp7 = _mm_sub_epi32(temp0, temp3);
351 temp7 = _mm_add_epi32(temp7, value_32);
352 temp7 = _mm_srai_epi32(temp7, 6);
353
354 /* 32-bit to 16-bit conversion */
355 temp0 = _mm_packs_epi32(temp4, temp5);
356 temp1 = _mm_packs_epi32(temp6, temp7);
357
358 /* Saturate all values < -255 to -255 and retain the rest as it is */
359 temp4 = _mm_max_epi16(temp0, neg_255_8x16b);
360 /* Saturate all values > 255 to 255 and retain the rest as it is */
361 temp4 = _mm_min_epi16(temp4, pos_255_8x16b);
362
363 /* Saturate all values < -255 to -255 and retain the rest as it is */
364 temp5 = _mm_max_epi16(temp1, neg_255_8x16b);
365 /* Saturate all values > 255 to 255 and retain the rest as it is */
366 temp5 = _mm_min_epi16(temp5, pos_255_8x16b);
367
368 temp0 = _mm_add_epi16(temp4, pred_r0);
369 temp1 = _mm_add_epi16(temp5, pred_r1);
370
371 /*------------------------------------------------------------------*/
372 /* Clipping the results to 8 bits */
373 sign_reg = _mm_cmpgt_epi16(temp0, zero_8x16b);
374 temp0 = _mm_and_si128(temp0, sign_reg);
375 sign_reg = _mm_cmpgt_epi16(temp1, zero_8x16b);
376 temp1 = _mm_and_si128(temp1, sign_reg);
377
378 resq_r0 = _mm_packus_epi16(temp0, temp1);
379 resq_r1 = _mm_srli_si128(resq_r0, 4);
380 resq_r2 = _mm_srli_si128(resq_r1, 4);
381 resq_r3 = _mm_srli_si128(resq_r2, 4);
382
383 *pu4_out = _mm_cvtsi128_si32(resq_r0);
384 pu1_out += i4_out_stride;
385 pu4_out = (UWORD32 *) (pu1_out);
386 *(pu4_out) = _mm_cvtsi128_si32(resq_r1);
387 pu1_out += i4_out_stride;
388 pu4_out = (UWORD32 *) (pu1_out);
389 *(pu4_out) = _mm_cvtsi128_si32(resq_r2);
390 pu1_out += i4_out_stride;
391 pu4_out = (UWORD32 *) (pu1_out);
392 *(pu4_out) = _mm_cvtsi128_si32(resq_r3);
393 }
394
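/*
 ********************************************************************************
 *
 * @brief Variant of isvc_iquant_itrans_recon_4x4_sse42 that, in addition to
 * reconstruction, writes the inverse transformed residue (clamped to
 * [-255, 255]) to the ps_res buffer
 *
 * @remarks Residue accumulation is not handled here; see the
 * _with_res_acc_ variant below (editorial summary based on the function body)
 *
 *******************************************************************************
 */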
395 void isvc_iquant_itrans_recon_res_4x4_sse42(buffer_container_t *ps_src, buffer_container_t *ps_pred,
396 buffer_container_t *ps_res_pred,
397 buffer_container_t *ps_res, buffer_container_t *ps_rec,
398 iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants,
399 WORD16 *pi2_tmp, WORD16 *pi2_dc_src,
400 WORD32 i4_iq_start_idx, UWORD8 u1_res_accumulate)
401 {
402 WORD16 *pi2_src = (WORD16 *) ps_src->pv_data;
403 WORD16 *pi2_tmp_ptr = pi2_tmp;
404 WORD16 *pi2_res = (WORD16 *) ps_res->pv_data;
405 UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
406 UWORD8 *pu1_out = (UWORD8 *) ps_rec->pv_data;
407 WORD32 i4_src_stride = ps_src->i4_data_stride;
408 WORD32 i4_res_stride = ps_res->i4_data_stride;
409 WORD32 i4_pred_stride = ps_pred->i4_data_stride;
410 WORD32 i4_out_stride = ps_rec->i4_data_stride;
411 const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat;
412 const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat;
413 UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6;
414 UWORD32 *pu4_out = (UWORD32 *) pu1_out;
415 __m128i src_r0_r1, src_r2_r3;
416 __m128i src_r0, src_r1, src_r2, src_r3;
417 __m128i scalemat_r0_r1, scalemat_r2_r3;
418 __m128i pred_r0, pred_r1, pred_r2, pred_r3;
419 __m128i sign_reg, dequant_r0_r1, dequant_r2_r3;
420 /* all bits reset to zero */
421 __m128i zero_8x16b = _mm_setzero_si128();
422 __m128i neg_255_8x16b = _mm_set1_epi16(-((WORD16) UINT8_MAX));
423 __m128i pos_255_8x16b = _mm_set1_epi16(((WORD16) UINT8_MAX));
424 __m128i temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
425 __m128i resq_r0, resq_r1, resq_r2, resq_r3;
426 __m128i add_rshift = _mm_set1_epi32((u4_qp_div_6 < 4) ? (1 << (3 - u4_qp_div_6)) : 0);
427 __m128i value_32 = _mm_set1_epi32(32);
428
429 ASSERT(4 == i4_src_stride);
430 ASSERT(0 == u1_res_accumulate);
431
432 UNUSED(i4_src_stride);
433 UNUSED(ps_res_pred);
434 UNUSED(u1_res_accumulate);
435
436 /*************************************************************/
437 /* Dequantization of coefficients using SSE4.2 integer      */
438 /* operations                                                */
439 /*************************************************************/
440
441 /* a00 a01 a02 a03 a10 a11 a12 a13 -- the source
442 matrix 0th,1st row */
443 src_r0_r1 = _mm_loadu_si128((__m128i *) (pi2_src));
444
445 /* a20 a21 a22 a23 a30 a31 a32 a33 -- the
446 source matrix 2nd,3rd row */
447 src_r2_r3 = _mm_loadu_si128((__m128i *) (pi2_src + 8));
448
449 /* b00 b01 b02 b03 b10 b11 b12 b13 -- the
450 scaling matrix 0th,1st row */
451 scalemat_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_iscal_mat));
452
453 /* b20 b21 b22 b23 b30 b31 b32 b33 -- the
454 scaling matrix 2nd,3rd row */
455 scalemat_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_iscal_mat + 8));
456
457 /* q00 q01 q02 q03 q10 q11
458 q12 q13 -- all 16 bits */
459 dequant_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_weigh_mat));
460
461 /* q20 q21 q22 q23 q30 q31
462 q32 q33 -- all 16 bits */
463 dequant_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_weigh_mat + 8));
464
465 /* b00*q00 b01*q01 b02*q02 b03*q03 b10*q10 b11*q11
466 b12*q12 b13*q13 -- 16 bit result */
467 temp0 = _mm_mullo_epi16(scalemat_r0_r1, dequant_r0_r1);
468
469 /* b20*q20 b21*q21 b22*q22 b23*q23 b30*q30 b31*q31
470 b32*q32 b33*q33 -- 16 bit result */
471 temp1 = _mm_mullo_epi16(scalemat_r2_r3, dequant_r2_r3);
472
473 /* b00*q00 0 b01*q01 0 b02*q02 0 b03*q03 0 -- 16 bit long */
474 temp4 = _mm_unpacklo_epi16(temp0, zero_8x16b);
475
476 /* b10*q10 0 b11*q11 0 b12*q12 0 b13*q13 0 -- 16 bit long */
477 temp5 = _mm_unpackhi_epi16(temp0, zero_8x16b);
478
479 /* b20*q20 0 b21*q21 0 b22*q22 0 b23*q23 0 -- 16 bit long */
480 temp6 = _mm_unpacklo_epi16(temp1, zero_8x16b);
481 
482 /* b30*q30 0 b31*q31 0 b32*q32 0 b33*q33 0 -- 16 bit long */
483 temp7 = _mm_unpackhi_epi16(temp1, zero_8x16b);
484
485 /* a00 0 a01 0 a02 0 a03 0 -- 16 bit long */
486 src_r0 = _mm_unpacklo_epi16(src_r0_r1, zero_8x16b);
487 /* a10 0 a11 0 a12 0 a13 0 -- 16 bit long */
488 src_r1 = _mm_unpackhi_epi16(src_r0_r1, zero_8x16b);
489 /* a20 0 a21 0 a22 0 a23 0 -- 16 bit long */
490 src_r2 = _mm_unpacklo_epi16(src_r2_r3, zero_8x16b);
491 /* a30 0 a31 0 a32 0 a33 0 -- 16 bit long */
492 src_r3 = _mm_unpackhi_epi16(src_r2_r3, zero_8x16b);
493
494 temp4 = _mm_madd_epi16(src_r0, temp4);
495 temp5 = _mm_madd_epi16(src_r1, temp5);
496 temp6 = _mm_madd_epi16(src_r2, temp6);
497 temp7 = _mm_madd_epi16(src_r3, temp7);
498
499 if(u4_qp_div_6 >= 4)
500 {
501 resq_r0 = _mm_slli_epi32(temp4, u4_qp_div_6 - 4);
502 resq_r1 = _mm_slli_epi32(temp5, u4_qp_div_6 - 4);
503 resq_r2 = _mm_slli_epi32(temp6, u4_qp_div_6 - 4);
504 resq_r3 = _mm_slli_epi32(temp7, u4_qp_div_6 - 4);
505 }
506 else
507 {
508 temp4 = _mm_add_epi32(temp4, add_rshift);
509 temp5 = _mm_add_epi32(temp5, add_rshift);
510 temp6 = _mm_add_epi32(temp6, add_rshift);
511 temp7 = _mm_add_epi32(temp7, add_rshift);
512 resq_r0 = _mm_srai_epi32(temp4, 4 - u4_qp_div_6);
513 resq_r1 = _mm_srai_epi32(temp5, 4 - u4_qp_div_6);
514 resq_r2 = _mm_srai_epi32(temp6, 4 - u4_qp_div_6);
515 resq_r3 = _mm_srai_epi32(temp7, 4 - u4_qp_div_6);
516 }
517
518 if(i4_iq_start_idx == 1) resq_r0 = _mm_insert_epi32(resq_r0, (WORD32) pi2_dc_src[0], 0);
519 /* Perform Inverse transform */
520 /*-------------------------------------------------------------*/
521 /* IDCT [ Horizontal transformation ] */
522 /*-------------------------------------------------------------*/
523 // Matrix transpose
524 /*
525 * a0 a1 a2 a3
526 * b0 b1 b2 b3
527 * c0 c1 c2 c3
528 * d0 d1 d2 d3
529 */
530
531 /* a0 b0 a1 b1 */
532 temp1 = _mm_unpacklo_epi32(resq_r0, resq_r1);
533 /* c0 d0 c1 d1 */
534 temp3 = _mm_unpacklo_epi32(resq_r2, resq_r3);
535 /* a2 b2 a3 b3 */
536 temp2 = _mm_unpackhi_epi32(resq_r0, resq_r1);
537 /* c2 d2 c3 d3 */
538 temp4 = _mm_unpackhi_epi32(resq_r2, resq_r3);
539 /* a0 b0 c0 d0 */
540 resq_r0 = _mm_unpacklo_epi64(temp1, temp3);
541 /* a1 b1 c1 d1 */
542 resq_r1 = _mm_unpackhi_epi64(temp1, temp3);
543 /* a2 b2 c2 d2 */
544 resq_r2 = _mm_unpacklo_epi64(temp2, temp4);
545 /* a3 b3 c3 d3 */
546 resq_r3 = _mm_unpackhi_epi64(temp2, temp4);
547 /* Transform starts -- horizontal transform */
548 /*------------------------------------------------------------------*/
549 /* z0 = w0 + w2 */
550 temp0 = _mm_add_epi32(resq_r0, resq_r2);
551 /* z1 = w0 - w2 */
552 temp1 = _mm_sub_epi32(resq_r0, resq_r2);
553 /* z2 = (w1 >> 1) - w3 */
554 temp2 = _mm_srai_epi32(resq_r1, 1);
555 temp2 = _mm_sub_epi32(temp2, resq_r3);
556 /* z3 = w1 + (w3 >> 1) */
557 temp3 = _mm_srai_epi32(resq_r3, 1);
558 temp3 = _mm_add_epi32(temp3, resq_r1);
559 /*----------------------------------------------------------*/
560 /* x0 = z0 + z3 */
561 resq_r0 = _mm_add_epi32(temp0, temp3);
562 /* x1 = z1 + z2 */
563 resq_r1 = _mm_add_epi32(temp1, temp2);
564 /* x2 = z1 - z2 */
565 resq_r2 = _mm_sub_epi32(temp1, temp2);
566 /* x3 = z0 - z3 */
567 resq_r3 = _mm_sub_epi32(temp0, temp3);
568
569 // Matrix transpose
570 /*
571 * a0 b0 c0 d0
572 * a1 b1 c1 d1
573 * a2 b2 c2 d2
574 * a3 b3 c3 d3
575 */
576
577 /* a0 a1 b0 b1 */
578 temp1 = _mm_unpacklo_epi32(resq_r0, resq_r1);
579 /* a2 a3 b2 b3 */
580 temp3 = _mm_unpacklo_epi32(resq_r2, resq_r3);
581 /* c0 c1 d0 d1 */
582 temp2 = _mm_unpackhi_epi32(resq_r0, resq_r1);
583 /* c2 c3 d2 d3 */
584 temp4 = _mm_unpackhi_epi32(resq_r2, resq_r3);
585 /* a0 a1 a2 a3 */
586 resq_r0 = _mm_unpacklo_epi64(temp1, temp3);
587 /* b0 b1 b2 b3 */
588 resq_r1 = _mm_unpackhi_epi64(temp1, temp3);
589 /* c0 c1 c2 c3 */
590 resq_r2 = _mm_unpacklo_epi64(temp2, temp4);
591 /* d0 d1 d2 d3 */
592 resq_r3 = _mm_unpackhi_epi64(temp2, temp4);
593 /* Transform ends -- horizontal transform */
594
595 temp0 = _mm_packs_epi32(resq_r0, resq_r1);
596 temp1 = _mm_packs_epi32(resq_r2, resq_r3);
597
598 _mm_storeu_si128((__m128i *) (&pi2_tmp_ptr[0]), temp0);
599 _mm_storeu_si128((__m128i *) (&pi2_tmp_ptr[2 * 4]), temp1);
600
601 /* Load pred buffer */
602 pred_r0 = _mm_loadl_epi64((__m128i *) (&pu1_pred[0]));
603 pred_r1 = _mm_loadl_epi64((__m128i *) (&pu1_pred[i4_pred_stride]));
604 pred_r2 = _mm_loadl_epi64((__m128i *) (&pu1_pred[2 * i4_pred_stride]));
605 pred_r3 = _mm_loadl_epi64((__m128i *) (&pu1_pred[3 * i4_pred_stride]));
606
607 pred_r0 = _mm_cvtepu8_epi16(pred_r0);
608 pred_r1 = _mm_cvtepu8_epi16(pred_r1);
609 pred_r2 = _mm_cvtepu8_epi16(pred_r2);
610 pred_r3 = _mm_cvtepu8_epi16(pred_r3);
611
612 /*--------------------------------------------------------------*/
613 /* IDCT [ Vertical transformation] and Xij = (xij + 32)>>6 */
614 /* */
615 /* Add the prediction and store it back to same buffer */
616 /*--------------------------------------------------------------*/
617 /* z0j = y0j + y2j */
618 temp0 = _mm_add_epi32(resq_r0, resq_r2);
619 /* z1j = y0j - y2j */
620 temp1 = _mm_sub_epi32(resq_r0, resq_r2);
621 /* z2j = (y1j>>1) - y3j */
622 temp2 = _mm_srai_epi32(resq_r1, 1);
623 temp2 = _mm_sub_epi32(temp2, resq_r3);
624 /* z3j = y1j + (y3j>>1) */
625 temp3 = _mm_srai_epi32(resq_r3, 1);
626 temp3 = _mm_add_epi32(temp3, resq_r1);
627
628 /* x0j = z0j + z3j */
629 temp4 = _mm_add_epi32(temp0, temp3);
630 temp4 = _mm_add_epi32(temp4, value_32);
631 temp4 = _mm_srai_epi32(temp4, 6);
632 /* x1j = z1j + z2j */
633 temp5 = _mm_add_epi32(temp1, temp2);
634 temp5 = _mm_add_epi32(temp5, value_32);
635 temp5 = _mm_srai_epi32(temp5, 6);
636 /* x2j = z1j - z2j */
637 temp6 = _mm_sub_epi32(temp1, temp2);
638 temp6 = _mm_add_epi32(temp6, value_32);
639 temp6 = _mm_srai_epi32(temp6, 6);
640 /* x3j = z0j - z3j */
641 temp7 = _mm_sub_epi32(temp0, temp3);
642 temp7 = _mm_add_epi32(temp7, value_32);
643 temp7 = _mm_srai_epi32(temp7, 6);
644
645 /* 32-bit to 16-bit conversion */
646 temp0 = _mm_packs_epi32(temp4, temp5);
647 temp1 = _mm_packs_epi32(temp6, temp7);
648
649 /* Saturate all values < -255 to -255 and retain the rest as it is */
650 temp0 = _mm_max_epi16(temp0, neg_255_8x16b);
651 /* Saturate all values > 255 to 255 and retain the rest as it is */
652 temp0 = _mm_min_epi16(temp0, pos_255_8x16b);
653
654 /* Saturate all values < -255 to -255 and retain the rest as it is */
655 temp1 = _mm_max_epi16(temp1, neg_255_8x16b);
656 /* Saturate all values > 255 to 255 and retain the rest as it is */
657 temp1 = _mm_min_epi16(temp1, pos_255_8x16b);
658
659 _mm_storel_epi64((__m128i *) (&pi2_res[0]), temp0);
660 _mm_storel_epi64((__m128i *) (&pi2_res[2 * i4_res_stride]), temp1);
661
662 temp4 = _mm_add_epi16(temp0, pred_r0);
663 temp0 = _mm_srli_si128(temp0, 8);
664 _mm_storel_epi64((__m128i *) (&pi2_res[i4_res_stride]), temp0);
665
666 temp6 = _mm_add_epi16(temp1, pred_r2);
667 temp1 = _mm_srli_si128(temp1, 8);
668 _mm_storel_epi64((__m128i *) (&pi2_res[3 * i4_res_stride]), temp1);
669
670 temp5 = _mm_add_epi16(temp0, pred_r1);
671 temp7 = _mm_add_epi16(temp1, pred_r3);
672
673 temp4 = _mm_cvtepi16_epi32(temp4);
674 temp5 = _mm_cvtepi16_epi32(temp5);
675 temp6 = _mm_cvtepi16_epi32(temp6);
676 temp7 = _mm_cvtepi16_epi32(temp7);
677
678 /* 32-bit to 16-bit conversion */
679 temp0 = _mm_packs_epi32(temp4, temp5);
680 temp1 = _mm_packs_epi32(temp6, temp7);
681 /*------------------------------------------------------------------*/
682 /* Clipping the results to 8 bits */
683 sign_reg = _mm_cmpgt_epi16(temp0, zero_8x16b);
684 temp0 = _mm_and_si128(temp0, sign_reg);
685 sign_reg = _mm_cmpgt_epi16(temp1, zero_8x16b);
686 temp1 = _mm_and_si128(temp1, sign_reg);
687
688 resq_r0 = _mm_packus_epi16(temp0, temp1);
689 resq_r1 = _mm_srli_si128(resq_r0, 4);
690 resq_r2 = _mm_srli_si128(resq_r1, 4);
691 resq_r3 = _mm_srli_si128(resq_r2, 4);
692
693 *pu4_out = _mm_cvtsi128_si32(resq_r0);
694 pu1_out += i4_out_stride;
695 pu4_out = (UWORD32 *) (pu1_out);
696 *(pu4_out) = _mm_cvtsi128_si32(resq_r1);
697 pu1_out += i4_out_stride;
698 pu4_out = (UWORD32 *) (pu1_out);
699 *(pu4_out) = _mm_cvtsi128_si32(resq_r2);
700 pu1_out += i4_out_stride;
701 pu4_out = (UWORD32 *) (pu1_out);
702 *(pu4_out) = _mm_cvtsi128_si32(resq_r3);
703 }
704
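/*
 ********************************************************************************
 *
 * @brief Variant of isvc_iquant_itrans_recon_res_4x4_sse42 for the residue
 * accumulation path: the residual prediction read from ps_res_pred is added
 * to the inverse transformed residue, the sum is clamped to [-255, 255] and
 * written to ps_res, and the prediction is then added to form the recon
 * (editorial summary based on the function body)
 *
 *******************************************************************************
 */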
705 void isvc_iquant_itrans_recon_res_4x4_with_res_acc_sse42(
706 buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_res_pred,
707 buffer_container_t *ps_res, buffer_container_t *ps_rec,
708 iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants, WORD16 *pi2_tmp, WORD16 *pi2_dc_src,
709 WORD32 i4_iq_start_idx, UWORD8 u1_res_accumulate)
710 {
711 WORD16 *pi2_src = (WORD16 *) ps_src->pv_data;
712 WORD16 *pi2_tmp_ptr = pi2_tmp;
713 WORD16 *pi2_res = (WORD16 *) ps_res->pv_data;
714 WORD16 *pi2_res_pred = (WORD16 *) ps_res_pred->pv_data;
715 UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
716 UWORD8 *pu1_out = (UWORD8 *) ps_rec->pv_data;
717 WORD32 i4_src_stride = ps_src->i4_data_stride;
718 WORD32 i4_res_stride = ps_res->i4_data_stride;
719 WORD32 i4_res_pred_stride = ps_res_pred->i4_data_stride;
720 WORD32 i4_pred_stride = ps_pred->i4_data_stride;
721 WORD32 i4_out_stride = ps_rec->i4_data_stride;
722 const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat;
723 const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat;
724 UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6;
725 UWORD32 *pu4_out = (UWORD32 *) pu1_out;
726 __m128i src_r0_r1, src_r2_r3;
727 __m128i src_r0, src_r1, src_r2, src_r3;
728 __m128i scalemat_r0_r1, scalemat_r2_r3;
729 __m128i pred_r0, pred_r1, pred_r2, pred_r3;
730 __m128i res_pred_r0, res_pred_r1, res_pred_r2, res_pred_r3;
731 __m128i res_r0, res_r1, res_r2, res_r3;
732 __m128i sign_reg, dequant_r0_r1, dequant_r2_r3;
733 /* all bits reset to zero */
734 __m128i zero_8x16b = _mm_setzero_si128();
735 __m128i neg_255_8x16b = _mm_set1_epi16(-((WORD16) UINT8_MAX));
736 __m128i pos_255_8x16b = _mm_set1_epi16(((WORD16) UINT8_MAX));
737 __m128i temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
738 __m128i resq_r0, resq_r1, resq_r2, resq_r3;
739 __m128i add_rshift = _mm_set1_epi32((u4_qp_div_6 < 4) ? (1 << (3 - u4_qp_div_6)) : 0);
740 __m128i value_32 = _mm_set1_epi32(32);
741
742 ASSERT(4 == i4_src_stride);
743 ASSERT(1 == u1_res_accumulate);
744
745 UNUSED(i4_src_stride);
747 UNUSED(u1_res_accumulate);
748
749 /*************************************************************/
750 /* Dequantization of coefficients using SSE4.2 integer      */
751 /* operations                                                */
752 /*************************************************************/
753
754 /* a00 a01 a02 a03 a10 a11 a12 a13 -- the source
755 matrix 0th,1st row */
756 src_r0_r1 = _mm_loadu_si128((__m128i *) (pi2_src));
757
758 /* a20 a21 a22 a23 a30 a31 a32 a33 -- the
759 source matrix 2nd,3rd row */
760 src_r2_r3 = _mm_loadu_si128((__m128i *) (pi2_src + 8));
761
762 /* b00 b01 b02 b03 b10 b11 b12 b13 -- the
763 scaling matrix 0th,1st row */
764 scalemat_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_iscal_mat));
765
766 /* b20 b21 b22 b23 b30 b31 b32 b33 -- the
767 scaling matrix 2nd,3rd row */
768 scalemat_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_iscal_mat + 8));
769
770 /* q00 q01 q02 q03 q10 q11
771 q12 q13 -- all 16 bits */
772 dequant_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_weigh_mat));
773
774 /* q20 q21 q22 q23 q30 q31
775 q32 q33 -- all 16 bits */
776 dequant_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_weigh_mat + 8));
777
778 /* b00*q00 b01*q01 b02*q02 b03*q03 b10*q10 b11*q11
779 b12*q12 b13*q13 -- 16 bit result */
780 temp0 = _mm_mullo_epi16(scalemat_r0_r1, dequant_r0_r1);
781
782 /* b20*q20 b21*q21 b22*q22 b23*q23 b30*q30 b31*q31
783 b32*q32 b33*q33 -- 16 bit result */
784 temp1 = _mm_mullo_epi16(scalemat_r2_r3, dequant_r2_r3);
785
786 /* b00*q00 0 b01*q01 0 b02*q02 0 b03*q03 0 -- 16 bit long */
787 temp4 = _mm_unpacklo_epi16(temp0, zero_8x16b);
788
789 /* b10*q10 0 b11*q11 0 b12*q12 0 b13*q13 0 -- 16 bit long */
790 temp5 = _mm_unpackhi_epi16(temp0, zero_8x16b);
791
792 /* b20*q20 0 b21*q21 0 b22*q22 0 b23*q23 0 -- 16 bit long */
793 temp6 = _mm_unpacklo_epi16(temp1, zero_8x16b);
794 
795 /* b30*q30 0 b31*q31 0 b32*q32 0 b33*q33 0 -- 16 bit long */
796 temp7 = _mm_unpackhi_epi16(temp1, zero_8x16b);
797
798 /* a00 0 a01 0 a02 0 a03 0 -- 16 bit long */
799 src_r0 = _mm_unpacklo_epi16(src_r0_r1, zero_8x16b);
800 /* a10 0 a11 0 a12 0 a13 0 -- 16 bit long */
801 src_r1 = _mm_unpackhi_epi16(src_r0_r1, zero_8x16b);
802 /* a20 0 a21 0 a22 0 a23 0 -- 16 bit long */
803 src_r2 = _mm_unpacklo_epi16(src_r2_r3, zero_8x16b);
804 /* a30 0 a31 0 a32 0 a33 0 -- 16 bit long */
805 src_r3 = _mm_unpackhi_epi16(src_r2_r3, zero_8x16b);
806
807 temp4 = _mm_madd_epi16(src_r0, temp4);
808 temp5 = _mm_madd_epi16(src_r1, temp5);
809 temp6 = _mm_madd_epi16(src_r2, temp6);
810 temp7 = _mm_madd_epi16(src_r3, temp7);
811
812 if(u4_qp_div_6 >= 4)
813 {
814 resq_r0 = _mm_slli_epi32(temp4, u4_qp_div_6 - 4);
815 resq_r1 = _mm_slli_epi32(temp5, u4_qp_div_6 - 4);
816 resq_r2 = _mm_slli_epi32(temp6, u4_qp_div_6 - 4);
817 resq_r3 = _mm_slli_epi32(temp7, u4_qp_div_6 - 4);
818 }
819 else
820 {
821 temp4 = _mm_add_epi32(temp4, add_rshift);
822 temp5 = _mm_add_epi32(temp5, add_rshift);
823 temp6 = _mm_add_epi32(temp6, add_rshift);
824 temp7 = _mm_add_epi32(temp7, add_rshift);
825 resq_r0 = _mm_srai_epi32(temp4, 4 - u4_qp_div_6);
826 resq_r1 = _mm_srai_epi32(temp5, 4 - u4_qp_div_6);
827 resq_r2 = _mm_srai_epi32(temp6, 4 - u4_qp_div_6);
828 resq_r3 = _mm_srai_epi32(temp7, 4 - u4_qp_div_6);
829 }
830
831 if(i4_iq_start_idx == 1) resq_r0 = _mm_insert_epi32(resq_r0, (WORD32) pi2_dc_src[0], 0);
832 /* Perform Inverse transform */
833 /*-------------------------------------------------------------*/
834 /* IDCT [ Horizontal transformation ] */
835 /*-------------------------------------------------------------*/
836 // Matrix transpose
837 /*
838 * a0 a1 a2 a3
839 * b0 b1 b2 b3
840 * c0 c1 c2 c3
841 * d0 d1 d2 d3
842 */
843
844 /* a0 b0 a1 b1 */
845 temp1 = _mm_unpacklo_epi32(resq_r0, resq_r1);
846 /* c0 d0 c1 d1 */
847 temp3 = _mm_unpacklo_epi32(resq_r2, resq_r3);
848 /* a2 b2 a3 b3 */
849 temp2 = _mm_unpackhi_epi32(resq_r0, resq_r1);
850 /* c2 d2 c3 d3 */
851 temp4 = _mm_unpackhi_epi32(resq_r2, resq_r3);
852 /* a0 b0 c0 d0 */
853 resq_r0 = _mm_unpacklo_epi64(temp1, temp3);
854 /* a1 b1 c1 d1 */
855 resq_r1 = _mm_unpackhi_epi64(temp1, temp3);
856 /* a2 b2 c2 d2 */
857 resq_r2 = _mm_unpacklo_epi64(temp2, temp4);
858 /* a3 b3 c3 d3 */
859 resq_r3 = _mm_unpackhi_epi64(temp2, temp4);
860 /* Transform starts -- horizontal transform */
861 /*------------------------------------------------------------------*/
862 /* z0 = w0 + w2 */
863 temp0 = _mm_add_epi32(resq_r0, resq_r2);
864 /* z1 = w0 - w2 */
865 temp1 = _mm_sub_epi32(resq_r0, resq_r2);
866 /* z2 = (w1 >> 1) - w3 */
867 temp2 = _mm_srai_epi32(resq_r1, 1);
868 temp2 = _mm_sub_epi32(temp2, resq_r3);
869 /* z3 = w1 + (w3 >> 1) */
870 temp3 = _mm_srai_epi32(resq_r3, 1);
871 temp3 = _mm_add_epi32(temp3, resq_r1);
872 /*----------------------------------------------------------*/
873 /* x0 = z0 + z3 */
874 resq_r0 = _mm_add_epi32(temp0, temp3);
875 /* x1 = z1 + z2 */
876 resq_r1 = _mm_add_epi32(temp1, temp2);
877 /* x2 = z1 - z2 */
878 resq_r2 = _mm_sub_epi32(temp1, temp2);
879 /* x3 = z0 - z3 */
880 resq_r3 = _mm_sub_epi32(temp0, temp3);
881
882 // Matrix transpose
883 /*
884 * a0 b0 c0 d0
885 * a1 b1 c1 d1
886 * a2 b2 c2 d2
887 * a3 b3 c3 d3
888 */
889
890 /* a0 a1 b0 b1 */
891 temp1 = _mm_unpacklo_epi32(resq_r0, resq_r1);
892 /* a2 a3 b2 b3 */
893 temp3 = _mm_unpacklo_epi32(resq_r2, resq_r3);
894 /* c0 c1 d0 d1 */
895 temp2 = _mm_unpackhi_epi32(resq_r0, resq_r1);
896 /* c2 c3 d2 d3 */
897 temp4 = _mm_unpackhi_epi32(resq_r2, resq_r3);
898 /* a0 a1 a2 a3 */
899 resq_r0 = _mm_unpacklo_epi64(temp1, temp3);
900 /* b0 b1 b2 b3 */
901 resq_r1 = _mm_unpackhi_epi64(temp1, temp3);
902 /* c0 c1 c2 c3 */
903 resq_r2 = _mm_unpacklo_epi64(temp2, temp4);
904 /* d0 d1 d2 d3 */
905 resq_r3 = _mm_unpackhi_epi64(temp2, temp4);
906 /* Transform ends -- horizontal transform */
907
908 temp0 = _mm_packs_epi32(resq_r0, resq_r1);
909 temp1 = _mm_packs_epi32(resq_r2, resq_r3);
910
911 _mm_storeu_si128((__m128i *) (&pi2_tmp_ptr[0]), temp0);
912 _mm_storeu_si128((__m128i *) (&pi2_tmp_ptr[2 * 4]), temp1);
913
914 /* Load pred buffer */
915 pred_r0 = _mm_loadl_epi64((__m128i *) (&pu1_pred[0]));
916 pred_r1 = _mm_loadl_epi64((__m128i *) (&pu1_pred[i4_pred_stride]));
917 pred_r2 = _mm_loadl_epi64((__m128i *) (&pu1_pred[2 * i4_pred_stride]));
918 pred_r3 = _mm_loadl_epi64((__m128i *) (&pu1_pred[3 * i4_pred_stride]));
919
920 pred_r0 = _mm_cvtepu8_epi16(pred_r0);
921 pred_r1 = _mm_cvtepu8_epi16(pred_r1);
922 pred_r2 = _mm_cvtepu8_epi16(pred_r2);
923 pred_r3 = _mm_cvtepu8_epi16(pred_r3);
924
925 /*--------------------------------------------------------------*/
926 /* IDCT [ Vertical transformation] and Xij = (xij + 32)>>6 */
927 /* */
928 /* Add the prediction and store it back to same buffer */
929 /*--------------------------------------------------------------*/
930 /* z0j = y0j + y2j */
931 temp0 = _mm_add_epi32(resq_r0, resq_r2);
932 /* z1j = y0j - y2j */
933 temp1 = _mm_sub_epi32(resq_r0, resq_r2);
934 /* z2j = (y1j>>1) - y3j */
935 temp2 = _mm_srai_epi32(resq_r1, 1);
936 temp2 = _mm_sub_epi32(temp2, resq_r3);
937 /* z3j = y1j + (y3j>>1) */
938 temp3 = _mm_srai_epi32(resq_r3, 1);
939 temp3 = _mm_add_epi32(temp3, resq_r1);
940
941 /* x0j = z0j + z3j */
942 temp4 = _mm_add_epi32(temp0, temp3);
943 temp4 = _mm_add_epi32(temp4, value_32);
944 temp4 = _mm_srai_epi32(temp4, 6);
945 res_r0 = temp4;
946 /* x1j = z1j + z2j */
947 temp5 = _mm_add_epi32(temp1, temp2);
948 temp5 = _mm_add_epi32(temp5, value_32);
949 temp5 = _mm_srai_epi32(temp5, 6);
950 res_r1 = temp5;
951 /* x2j = z1j - z2j */
952 temp6 = _mm_sub_epi32(temp1, temp2);
953 temp6 = _mm_add_epi32(temp6, value_32);
954 temp6 = _mm_srai_epi32(temp6, 6);
955 res_r2 = temp6;
956 /* x3j = z0j - z3j */
957 temp7 = _mm_sub_epi32(temp0, temp3);
958 temp7 = _mm_add_epi32(temp7, value_32);
959 temp7 = _mm_srai_epi32(temp7, 6);
960 res_r3 = temp7;
961
962 /* Accumulating res */
963 res_pred_r0 = _mm_loadl_epi64((__m128i *) &pi2_res_pred[0]);
964 res_pred_r1 = _mm_loadl_epi64((__m128i *) &pi2_res_pred[i4_res_pred_stride]);
965 res_pred_r2 = _mm_loadl_epi64((__m128i *) &pi2_res_pred[2 * i4_res_pred_stride]);
966 res_pred_r3 = _mm_loadl_epi64((__m128i *) &pi2_res_pred[3 * i4_res_pred_stride]);
967
968 res_pred_r0 = _mm_cvtepi16_epi32(res_pred_r0);
969 res_pred_r1 = _mm_cvtepi16_epi32(res_pred_r1);
970 res_pred_r2 = _mm_cvtepi16_epi32(res_pred_r2);
971 res_pred_r3 = _mm_cvtepi16_epi32(res_pred_r3);
972
973 temp0 = _mm_add_epi32(res_r0, res_pred_r0);
974 temp1 = _mm_add_epi32(res_r1, res_pred_r1);
975 temp2 = _mm_add_epi32(res_r2, res_pred_r2);
976 temp3 = _mm_add_epi32(res_r3, res_pred_r3);
977
978 temp0 = _mm_packs_epi32(temp0, temp1);
979 temp1 = _mm_packs_epi32(temp2, temp3);
980
981 /* Saturate all values < -255 to -255 and retain the rest as it is */
982 temp0 = _mm_max_epi16(temp0, neg_255_8x16b);
983 /* Saturate all values > 255 to 255 and retain the rest as it is */
984 temp0 = _mm_min_epi16(temp0, pos_255_8x16b);
985
986 /* Saturate all values < -255 to -255 and retain the rest as it is */
987 temp1 = _mm_max_epi16(temp1, neg_255_8x16b);
988 /* Saturate all values > 255 to 255 and retain the rest as it is */
989 temp1 = _mm_min_epi16(temp1, pos_255_8x16b);
990
991 _mm_storel_epi64((__m128i *) (&pi2_res[0]), temp0);
992 _mm_storel_epi64((__m128i *) (&pi2_res[2 * i4_res_stride]), temp1);
993
994 temp4 = _mm_add_epi16(temp0, pred_r0);
995 temp0 = _mm_srli_si128(temp0, 8);
996 _mm_storel_epi64((__m128i *) (&pi2_res[i4_res_stride]), temp0);
997
998 temp6 = _mm_add_epi16(temp1, pred_r2);
999 temp1 = _mm_srli_si128(temp1, 8);
1000 _mm_storel_epi64((__m128i *) (&pi2_res[3 * i4_res_stride]), temp1);
1001
1002 temp5 = _mm_add_epi16(temp0, pred_r1);
1003 temp7 = _mm_add_epi16(temp1, pred_r3);
1004
1005 temp4 = _mm_cvtepi16_epi32(temp4);
1006 temp5 = _mm_cvtepi16_epi32(temp5);
1007 temp6 = _mm_cvtepi16_epi32(temp6);
1008 temp7 = _mm_cvtepi16_epi32(temp7);
1009
1010 /* 32-bit to 16-bit conversion */
1011 temp0 = _mm_packs_epi32(temp4, temp5);
1012 temp1 = _mm_packs_epi32(temp6, temp7);
1013 /*------------------------------------------------------------------*/
1014 /* Clipping the results to 8 bits */
1015 sign_reg = _mm_cmpgt_epi16(temp0, zero_8x16b);
1016 temp0 = _mm_and_si128(temp0, sign_reg);
1017 sign_reg = _mm_cmpgt_epi16(temp1, zero_8x16b);
1018 temp1 = _mm_and_si128(temp1, sign_reg);
1019
1020 resq_r0 = _mm_packus_epi16(temp0, temp1);
1021 resq_r1 = _mm_srli_si128(resq_r0, 4);
1022 resq_r2 = _mm_srli_si128(resq_r1, 4);
1023 resq_r3 = _mm_srli_si128(resq_r2, 4);
1024
1025 *pu4_out = _mm_cvtsi128_si32(resq_r0);
1026 pu1_out += i4_out_stride;
1027 pu4_out = (UWORD32 *) (pu1_out);
1028 *(pu4_out) = _mm_cvtsi128_si32(resq_r1);
1029 pu1_out += i4_out_stride;
1030 pu4_out = (UWORD32 *) (pu1_out);
1031 *(pu4_out) = _mm_cvtsi128_si32(resq_r2);
1032 pu1_out += i4_out_stride;
1033 pu4_out = (UWORD32 *) (pu1_out);
1034 *(pu4_out) = _mm_cvtsi128_si32(resq_r3);
1035 }
1036
1037 /*
1038 ********************************************************************************
1039 *
1040 * @brief This function reconstructs a 4x4 sub block from quantized chroma
1041 * residue and prediction buffer
1042 *
1043 * @par Description:
1044 * The quantized residue is first inverse quantized, then inverse transformed.
1045 * This inverse transformed content is added to the prediction buffer to recon-
1046 * struct the end output
1047 *
1048 * @param[in] pi2_src
1049 * quantized 4x4 block
1050 *
1051 * @param[in] pu1_pred
1052 * prediction 4x4 block
1053 *
1054 * @param[out] pu1_out
1055 * reconstructed 4x4 block
1056 *
1057 * @param[in] src_strd
1058 * quantization buffer stride
1059 *
1060 * @param[in] i4_pred_stride
1061 * Prediction buffer stride
1062 *
1063 * @param[in] i4_out_stride
1064 * recon buffer Stride
1065 *
1066 * @param[in] pu2_scaling_list
1067 * pointer to scaling list
1068 *
1069 * @param[in] pu2_norm_adjust
1070 * pointer to inverse scale matrix
1071 *
1072 * @param[in] u4_qp_div_6
1073 * Floor (qp/6)
1074 *
1075 * @param[in] pi4_tmp
1076 * temporary buffer of size 1*16
1077 *
1078 * @returns none
1079 *
1080 * @remarks none
1081 *
1082 *******************************************************************************
1083 */
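/*
 * Chroma-specific notes (editorial, based on the code below): the DC
 * coefficient is always taken from pi2_dc_src[0], since the 2x2 chroma DC
 * transform is handled outside this kernel, and the pred/recon buffers hold
 * interleaved Cb/Cr samples, so loads keep only the even bytes (mask 0x00FF)
 * and stores preserve the other component's bytes (mask 0xFF00). Per CbCr
 * byte pair, the write-back is roughly (out16 and recon_u8 are illustrative
 * names only):
 *
 *     out16 = (UWORD16) ((out16 & 0xFF00) | recon_u8);
 */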
1084 void isvc_iquant_itrans_recon_chroma_4x4_sse42(
1085 buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_res_pred,
1086 buffer_container_t *ps_res, buffer_container_t *ps_rec,
1087 iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants, WORD16 *pi2_tmp, WORD16 *pi2_dc_src,
1088 WORD32 i4_iq_start_idx, UWORD8 u1_res_accumulate)
1089 {
1090 WORD16 *pi2_src = (WORD16 *) ps_src->pv_data;
1091 UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
1092 UWORD8 *pu1_out = (UWORD8 *) ps_rec->pv_data;
1093 WORD32 i4_src_stride = ps_src->i4_data_stride;
1094 WORD32 i4_pred_stride = ps_pred->i4_data_stride;
1095 WORD32 i4_out_stride = ps_rec->i4_data_stride;
1096 const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat;
1097 const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat;
1098 UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6;
1099 __m128i src_r0_r1, src_r2_r3;
1100 __m128i src_r0, src_r1, src_r2, src_r3;
1101 __m128i scalemat_r0_r1, scalemat_r2_r3;
1102 __m128i pred_r0, pred_r1, pred_r2, pred_r3;
1103 __m128i sign_reg, dequant_r0_r1, dequant_r2_r3;
1104 /* all bits reset to zero */
1105 __m128i zero_8x16b = _mm_setzero_si128();
1106 __m128i neg_255_8x16b = _mm_set1_epi16(-((WORD16) UINT8_MAX));
1107 __m128i pos_255_8x16b = _mm_set1_epi16(((WORD16) UINT8_MAX));
1108 __m128i temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
1109 __m128i resq_r0, resq_r1, resq_r2, resq_r3;
1110 __m128i add_rshift = _mm_set1_epi32((u4_qp_div_6 < 4) ? (1 << (3 - u4_qp_div_6)) : 0);
1111 __m128i value_32 = _mm_set1_epi32(32);
1112 __m128i chroma_mask = _mm_set1_epi16(0xFF);
1113 __m128i out_r0, out_r1, out_r2, out_r3;
1114
1115 ASSERT(4 == i4_src_stride);
1116 ASSERT(0 == u1_res_accumulate);
1117
1118 UNUSED(i4_src_stride);
1119 UNUSED(u1_res_accumulate);
1120 UNUSED(ps_res);
1121 UNUSED(ps_res_pred);
1122 UNUSED(i4_iq_start_idx);
1123
1124 /*************************************************************/
1125 /* Dequantization of coefficients using SSE4.2 integer      */
1126 /* operations                                                */
1127 /*************************************************************/
1128 /* a00 a01 a02 a03 a10 a11 a12 a13 -- the source
1129 matrix 0th,1st row */
1130 src_r0_r1 = _mm_loadu_si128((__m128i *) (pi2_src));
1131
1132 /* a20 a21 a22 a23 a30 a31 a32 a33 -- the
1133 source matrix 2nd,3rd row */
1134 src_r2_r3 = _mm_loadu_si128((__m128i *) (pi2_src + 8));
1135
1136 /* b00 b01 b02 b03 b10 b11 b12 b13 -- the
1137 scaling matrix 0th,1st row */
1138 scalemat_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_iscal_mat));
1139
1140 /* b20 b21 b22 b23 b30 b31 b32 b33 -- the
1141 scaling matrix 2nd,3rd row */
1142 scalemat_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_iscal_mat + 8));
1143
1144 /* q00 q01 q02 q03 q10 q11
1145 q12 q13 -- all 16 bits */
1146 dequant_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_weigh_mat));
1147
1148 /* q20 q21 q22 q23 q30 q31
1149 q32 q33 -- all 16 bits */
1150 dequant_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_weigh_mat + 8));
1151
1152 temp0 = _mm_mullo_epi16(scalemat_r0_r1,
1153 dequant_r0_r1); // b00*q00 b01*q01 b02*q02 b03*q03 b10*q10 b11*q11
1154 // b12*q12 b13*q13 -- 16 bit result
1155
1156 temp1 = _mm_mullo_epi16(scalemat_r2_r3, dequant_r2_r3);
1157
1158 /* b00*q00 0 b01*q01 0 b02*q02 0 b03*q03 0 -- 16 bit long */
1159 temp4 = _mm_unpacklo_epi16(temp0, zero_8x16b);
1160
1161 /* b10*q10 0 b11*q11 0 b12*q12 0 b13*q13 0 -- 16 bit long */
1162 temp5 = _mm_unpackhi_epi16(temp0, zero_8x16b);
1163
1164 /* b20*q20 0 b21*q21 0 b22*q22 0 b23*q23 0 -- 16 bit long */
1165 temp6 = _mm_unpacklo_epi16(temp1, zero_8x16b);
1166 
1167 /* b30*q30 0 b31*q31 0 b32*q32 0 b33*q33 0 -- 16 bit long */
1168 temp7 = _mm_unpackhi_epi16(temp1, zero_8x16b);
1169
1170 /* a00 0 a01 0 a02 0 a03 0 -- 16 bit long */
1171 src_r0 = _mm_unpacklo_epi16(src_r0_r1, zero_8x16b);
1172 /* a10 0 a11 0 a12 0 a13 0 -- 16 bit long */
1173 src_r1 = _mm_unpackhi_epi16(src_r0_r1, zero_8x16b);
1174 /* a20 0 a21 0 a22 0 a23 0 -- 16 bit long */
1175 src_r2 = _mm_unpacklo_epi16(src_r2_r3, zero_8x16b);
1176 /* a30 0 a31 0 a32 0 a33 0 -- 16 bit long */
1177 src_r3 = _mm_unpackhi_epi16(src_r2_r3, zero_8x16b);
1178
1179 temp4 = _mm_madd_epi16(src_r0, temp4);
1180 temp5 = _mm_madd_epi16(src_r1, temp5);
1181 temp6 = _mm_madd_epi16(src_r2, temp6);
1182 temp7 = _mm_madd_epi16(src_r3, temp7);
1183
1184 if(u4_qp_div_6 >= 4)
1185 {
1186 resq_r0 = _mm_slli_epi32(temp4, u4_qp_div_6 - 4);
1187 resq_r1 = _mm_slli_epi32(temp5, u4_qp_div_6 - 4);
1188 resq_r2 = _mm_slli_epi32(temp6, u4_qp_div_6 - 4);
1189 resq_r3 = _mm_slli_epi32(temp7, u4_qp_div_6 - 4);
1190 }
1191 else
1192 {
1193 temp4 = _mm_add_epi32(temp4, add_rshift);
1194 temp5 = _mm_add_epi32(temp5, add_rshift);
1195 temp6 = _mm_add_epi32(temp6, add_rshift);
1196 temp7 = _mm_add_epi32(temp7, add_rshift);
1197 resq_r0 = _mm_srai_epi32(temp4, 4 - u4_qp_div_6);
1198 resq_r1 = _mm_srai_epi32(temp5, 4 - u4_qp_div_6);
1199 resq_r2 = _mm_srai_epi32(temp6, 4 - u4_qp_div_6);
1200 resq_r3 = _mm_srai_epi32(temp7, 4 - u4_qp_div_6);
1201 }
1202
1203 resq_r0 = _mm_insert_epi32(resq_r0, (WORD32) pi2_dc_src[0], 0);
1204 /* Perform Inverse transform */
1205 /*-------------------------------------------------------------*/
1206 /* IDCT [ Horizontal transformation ] */
1207 /*-------------------------------------------------------------*/
1208 // Matrix transpose
1209 /*
1210 * a0 a1 a2 a3
1211 * b0 b1 b2 b3
1212 * c0 c1 c2 c3
1213 * d0 d1 d2 d3
1214 */
1215 /* a0 b0 a1 b1 */
1216 temp1 = _mm_unpacklo_epi32(resq_r0, resq_r1);
1217 /* c0 d0 c1 d1 */
1218 temp3 = _mm_unpacklo_epi32(resq_r2, resq_r3);
1219 /* a2 b2 a3 b3 */
1220 temp2 = _mm_unpackhi_epi32(resq_r0, resq_r1);
1221 /* c2 d2 c3 d3 */
1222 temp4 = _mm_unpackhi_epi32(resq_r2, resq_r3);
1223 /* a0 b0 c0 d0 */
1224 resq_r0 = _mm_unpacklo_epi64(temp1, temp3);
1225 /* a1 b1 c1 d1 */
1226 resq_r1 = _mm_unpackhi_epi64(temp1, temp3);
1227 /* a2 b2 c2 d2 */
1228 resq_r2 = _mm_unpacklo_epi64(temp2, temp4);
1229 /* a3 b3 c3 d3 */
1230 resq_r3 = _mm_unpackhi_epi64(temp2, temp4);
1231 /* Transform starts -- horizontal transform */
1232
1233 /*------------------------------------------------------------------*/
1234 /* z0 = w0 + w2 */
1235 temp0 = _mm_add_epi32(resq_r0, resq_r2);
1236 /* z1 = w0 - w2 */
1237 temp1 = _mm_sub_epi32(resq_r0, resq_r2);
1238 /* z2 = (w1 >> 1) - w3 */
1239 temp2 = _mm_srai_epi32(resq_r1, 1);
1240 temp2 = _mm_sub_epi32(temp2, resq_r3);
1241 /* z3 = w1 + (w3 >> 1) */
1242 temp3 = _mm_srai_epi32(resq_r3, 1); //(w3>>1) + w1
1243 temp3 = _mm_add_epi32(temp3, resq_r1);
1244 /*----------------------------------------------------------*/
1245 /* x0 = z0 + z3 */
1246 resq_r0 = _mm_add_epi32(temp0, temp3);
1247 /* x1 = z1 + z2 */
1248 resq_r1 = _mm_add_epi32(temp1, temp2);
1249 /* x2 = z1 - z2 */
1250 resq_r2 = _mm_sub_epi32(temp1, temp2);
1251 /* x3 = z0 - z3 */
1252 resq_r3 = _mm_sub_epi32(temp0, temp3);
1253 // Matrix transpose
1254 /*
1255 * a0 b0 c0 d0
1256 * a1 b1 c1 d1
1257 * a2 b2 c2 d2
1258 * a3 b3 c3 d3
1259 */
1260 /* a0 a1 b0 b1 */
1261 temp1 = _mm_unpacklo_epi32(resq_r0, resq_r1);
1262 /* a2 a3 b2 b3 */
1263 temp3 = _mm_unpacklo_epi32(resq_r2, resq_r3);
1264 /* c0 c1 d0 d1 */
1265 temp2 = _mm_unpackhi_epi32(resq_r0, resq_r1);
1266 /* c2 c3 d2 d3 */
1267 temp4 = _mm_unpackhi_epi32(resq_r2, resq_r3);
1268 /* a0 a1 a2 a3 */
1269 resq_r0 = _mm_unpacklo_epi64(temp1, temp3);
1270 /* b0 b1 b2 b3 */
1271 resq_r1 = _mm_unpackhi_epi64(temp1, temp3);
1272 /* c0 c1 c2 c3 */
1273 resq_r2 = _mm_unpacklo_epi64(temp2, temp4);
1274 /* d0 d1 d2 d3 */
1275 resq_r3 = _mm_unpackhi_epi64(temp2, temp4);
1276 /* Transform ends -- horizontal transform */
1277
1278 temp0 = _mm_packs_epi32(resq_r0, resq_r1);
1279 temp1 = _mm_packs_epi32(resq_r2, resq_r3);
1280
1281 _mm_storeu_si128((__m128i *) (&pi2_tmp[0]), temp0);
1282 _mm_storeu_si128((__m128i *) (&pi2_tmp[2 * 4]), temp1);
1283
1284 /* Load pred buffer */
1285 pred_r0 = _mm_loadl_epi64((__m128i *) (&pu1_pred[0]));
1286 pred_r1 = _mm_loadl_epi64((__m128i *) (&pu1_pred[i4_pred_stride]));
1287 pred_r2 = _mm_loadl_epi64((__m128i *) (&pu1_pred[2 * i4_pred_stride]));
1288 pred_r3 = _mm_loadl_epi64((__m128i *) (&pu1_pred[3 * i4_pred_stride]));
1289
1290 pred_r0 = _mm_and_si128(pred_r0, chroma_mask);
1291 pred_r1 = _mm_and_si128(pred_r1, chroma_mask);
1292 pred_r2 = _mm_and_si128(pred_r2, chroma_mask);
1293 pred_r3 = _mm_and_si128(pred_r3, chroma_mask);
1294
1295 pred_r0 = _mm_unpacklo_epi64(pred_r0, pred_r1);
1296 pred_r1 = _mm_unpacklo_epi64(pred_r2, pred_r3);
1297
1298 /*--------------------------------------------------------------*/
1299 /* IDCT [ Vertical transformation] and Xij = (xij + 32)>>6 */
1300 /* */
1301 /* Add the prediction and store it back to same buffer */
1302 /*--------------------------------------------------------------*/
1303 /* z0j = y0j + y2j */
1304 temp0 = _mm_add_epi32(resq_r0, resq_r2);
1305 /* z1j = y0j - y2j */
1306 temp1 = _mm_sub_epi32(resq_r0, resq_r2);
1307 /* z2j = (y1j>>1) - y3j */
1308 temp2 = _mm_srai_epi32(resq_r1, 1);
1309 temp2 = _mm_sub_epi32(temp2, resq_r3);
1310 /* z3j = y1j + (y3j>>1) */
1311 temp3 = _mm_srai_epi32(resq_r3, 1);
1312 temp3 = _mm_add_epi32(temp3, resq_r1);
1313
1314 /* x0j = z0j + z3j */
1315 temp4 = _mm_add_epi32(temp0, temp3);
1316 temp4 = _mm_add_epi32(temp4, value_32);
1317 temp4 = _mm_srai_epi32(temp4, 6);
1318 /* x1j = z1j + z2j */
1319 temp5 = _mm_add_epi32(temp1, temp2);
1320 temp5 = _mm_add_epi32(temp5, value_32);
1321 temp5 = _mm_srai_epi32(temp5, 6);
1322 /* x2j = z1j - z2j */
1323 temp6 = _mm_sub_epi32(temp1, temp2);
1324 temp6 = _mm_add_epi32(temp6, value_32);
1325 temp6 = _mm_srai_epi32(temp6, 6);
1326 /* x3j = z0j - z3j */
1327 temp7 = _mm_sub_epi32(temp0, temp3);
1328 temp7 = _mm_add_epi32(temp7, value_32);
1329 temp7 = _mm_srai_epi32(temp7, 6);
1330
1331 /* 32-bit to 16-bit conversion */
1332 temp0 = _mm_packs_epi32(temp4, temp5);
1333 temp1 = _mm_packs_epi32(temp6, temp7);
1334
1335 /* Saturate all values < -255 to -255 and retain the rest as it is */
1336 temp4 = _mm_max_epi16(temp0, neg_255_8x16b);
1337 /* Saturate all values > 255 to 255 and retain the rest as it is */
1338 temp4 = _mm_min_epi16(temp4, pos_255_8x16b);
1339
1340 /* Saturate all values < -255 to -255 and retain the rest as it is */
1341 temp5 = _mm_max_epi16(temp1, neg_255_8x16b);
1342 /* Saturate all values > 255 to 255 and retain the rest as it is */
1343 temp5 = _mm_min_epi16(temp5, pos_255_8x16b);
1344
1345 temp0 = _mm_add_epi16(temp4, pred_r0);
1346 temp1 = _mm_add_epi16(temp5, pred_r1);
1347
1348 /*------------------------------------------------------------------*/
1349 /* Clipping the results to 8 bits */
1350 sign_reg = _mm_cmpgt_epi16(temp0, zero_8x16b);
1351 temp0 = _mm_and_si128(temp0, sign_reg);
1352 sign_reg = _mm_cmpgt_epi16(temp1, zero_8x16b);
1353 temp1 = _mm_and_si128(temp1, sign_reg);
1354
1355 resq_r0 = _mm_packus_epi16(temp0, temp1);
1356 resq_r1 = _mm_srli_si128(resq_r0, 4);
1357 resq_r2 = _mm_srli_si128(resq_r1, 4);
1358 resq_r3 = _mm_srli_si128(resq_r2, 4);
1359
1360 resq_r0 = _mm_cvtepu8_epi16(resq_r0);
1361 resq_r1 = _mm_cvtepu8_epi16(resq_r1);
1362 resq_r2 = _mm_cvtepu8_epi16(resq_r2);
1363 resq_r3 = _mm_cvtepu8_epi16(resq_r3);
1364
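/* Write back only this chroma component: keep the other component's bytes of
   the interleaved CbCr rows (mask 0xFF00) and add in the reconstructed
   samples, which sit in the low byte of each 16-bit lane after cvtepu8 */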
1365 chroma_mask = _mm_set1_epi16(0xFF00);
1366 out_r0 = _mm_loadl_epi64((__m128i *) (&pu1_out[0]));
1367 out_r1 = _mm_loadl_epi64((__m128i *) (&pu1_out[i4_out_stride]));
1368 out_r2 = _mm_loadl_epi64((__m128i *) (&pu1_out[2 * i4_out_stride]));
1369 out_r3 = _mm_loadl_epi64((__m128i *) (&pu1_out[3 * i4_out_stride]));
1370
1371 out_r0 = _mm_and_si128(out_r0, chroma_mask);
1372 out_r1 = _mm_and_si128(out_r1, chroma_mask);
1373 out_r2 = _mm_and_si128(out_r2, chroma_mask);
1374 out_r3 = _mm_and_si128(out_r3, chroma_mask);
1375
1376 out_r0 = _mm_add_epi8(out_r0, resq_r0);
1377 out_r1 = _mm_add_epi8(out_r1, resq_r1);
1378 out_r2 = _mm_add_epi8(out_r2, resq_r2);
1379 out_r3 = _mm_add_epi8(out_r3, resq_r3);
1380
1381 _mm_storel_epi64((__m128i *) (&pu1_out[0]), out_r0);
1382 _mm_storel_epi64((__m128i *) (&pu1_out[i4_out_stride]), out_r1);
1383 _mm_storel_epi64((__m128i *) (&pu1_out[2 * i4_out_stride]), out_r2);
1384 _mm_storel_epi64((__m128i *) (&pu1_out[3 * i4_out_stride]), out_r3);
1385 }
1386
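/*
 * Chroma variant that also outputs the inverse transformed residue to the
 * ps_res buffer (pi2_res / i4_res_stride below), analogous to
 * isvc_iquant_itrans_recon_res_4x4_sse42 for the interleaved CbCr plane
 * (editorial summary based on the declarations in the function body)
 */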
1387 void isvc_iquant_itrans_recon_res_chroma_4x4_sse42(
1388 buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_res_pred,
1389 buffer_container_t *ps_res, buffer_container_t *ps_rec,
1390 iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants, WORD16 *pi2_tmp, WORD16 *pi2_dc_src,
1391 WORD32 i4_iq_start_idx, UWORD8 u1_res_accumulate)
1392 {
1393 WORD16 *pi2_src = (WORD16 *) ps_src->pv_data;
1394 WORD16 *pi2_res = (WORD16 *) ps_res->pv_data;
1395 WORD16 *pi2_res_ptr = pi2_res;
1396 UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
1397 UWORD8 *pu1_out = (UWORD8 *) ps_rec->pv_data;
1398 WORD32 i4_src_stride = ps_src->i4_data_stride;
1399 WORD32 i4_res_stride = ps_res->i4_data_stride;
1400 WORD32 i4_pred_stride = ps_pred->i4_data_stride;
1401 WORD32 i4_out_stride = ps_rec->i4_data_stride;
1402 const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat;
1403 const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat;
1404 UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6;
1405 __m128i src_r0_r1, src_r2_r3;
1406 __m128i src_r0, src_r1, src_r2, src_r3;
1407 __m128i scalemat_r0_r1, scalemat_r2_r3;
1408 __m128i pred_r0, pred_r1, pred_r2, pred_r3;
1409 __m128i sign_reg, dequant_r0_r1, dequant_r2_r3;
1410 /* all bits reset to zero */
1411 __m128i zero_8x16b = _mm_setzero_si128();
1412 __m128i neg_255_8x16b = _mm_set1_epi16(-((WORD16) UINT8_MAX));
1413 __m128i pos_255_8x16b = _mm_set1_epi16(((WORD16) UINT8_MAX));
1414 __m128i temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
1415 __m128i resq_r0, resq_r1, resq_r2, resq_r3;
1416 __m128i add_rshift = _mm_set1_epi32((u4_qp_div_6 < 4) ? (1 << (3 - u4_qp_div_6)) : 0);
1417 __m128i value_32 = _mm_set1_epi32(32);
1418 __m128i chroma_mask = _mm_set1_epi16(0xFF);
1419 __m128i out_r0, out_r1, out_r2, out_r3;
1420 __m128i res_r0, res_r1, res_r2, res_r3;
1421
1422 ASSERT(4 == i4_src_stride);
1423 ASSERT(0 == u1_res_accumulate);
1424
1425 UNUSED(i4_src_stride);
1426 UNUSED(u1_res_accumulate);
1427 UNUSED(ps_res_pred);
1428 UNUSED(i4_iq_start_idx);
1429
1430 /*************************************************************/
1431 /* Dequantization of coefficients using SSE4.2 integer      */
1432 /* operations                                                */
1433 /*************************************************************/
1434 /* a00 a01 a02 a03 a10 a11 a12 a13 -- the source
1435 matrix 0th,1st row */
1436 src_r0_r1 = _mm_loadu_si128((__m128i *) (pi2_src));
1437
1438 /* a20 a21 a22 a23 a30 a31 a32 a33 -- the
1439 source matrix 2nd,3rd row */
1440 src_r2_r3 = _mm_loadu_si128((__m128i *) (pi2_src + 8));
1441
1442 /* b00 b01 b02 b03 b10 b11 b12 b13 -- the
1443 scaling matrix 0th,1st row */
1444 scalemat_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_iscal_mat));
1445
1446     /* b20 b21 b22 b23 b30 b31 b32 b33 -- the
1447     scaling matrix 2nd,3rd row */
1448 scalemat_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_iscal_mat + 8));
1449
1450 /* q00 q01 q02 q03 q10 q11
1451 q12 q13 -- all 16 bits */
1452 dequant_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_weigh_mat));
1453
1454 /* q20 q21 q22 q23 q30 q31
1455 q32 q33 -- all 16 bits */
1456 dequant_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_weigh_mat + 8));
1457
1458 temp0 = _mm_mullo_epi16(scalemat_r0_r1,
1459 dequant_r0_r1); // b00*q00 b01*q01 b02*q02 b03*q03 b10*q10 b11*q11
1460 // b12*q12 b13*q13 -- 16 bit result
1461
1462 temp1 = _mm_mullo_epi16(scalemat_r2_r3, dequant_r2_r3);
1463
1464 /* b00*q00 0 b01*q01 0 b02*q02 0 b03*q03 0 -- 16 bit long */
1465 temp4 = _mm_unpacklo_epi16(temp0, zero_8x16b);
1466
1467 /* b10*q10 0 b11*q11 0 b12*q12 0 b13*q13 0 -- 16 bit long */
1468 temp5 = _mm_unpackhi_epi16(temp0, zero_8x16b);
1469
1470     /* b20*q20 0 b21*q21 0 b22*q22 0 b23*q23 0 -- 16 bit long */
1471 temp6 = _mm_unpacklo_epi16(temp1, zero_8x16b);
1472
1473     /* b30*q30 0 b31*q31 0 b32*q32 0 b33*q33 0 -- 16 bit long */
1474 temp7 = _mm_unpackhi_epi16(temp1, zero_8x16b);
1475
1476 /* a00 0 a01 0 a02 0 a03 0 -- 16 bit long */
1477 src_r0 = _mm_unpacklo_epi16(src_r0_r1, zero_8x16b);
1478 /* a10 0 a11 0 a12 0 a13 0 -- 16 bit long */
1479 src_r1 = _mm_unpackhi_epi16(src_r0_r1, zero_8x16b);
1480 /* a20 0 a21 0 a22 0 a23 0 -- 16 bit long */
1481 src_r2 = _mm_unpacklo_epi16(src_r2_r3, zero_8x16b);
1482 /* a30 0 a31 0 a32 0 a33 0 -- 16 bit long */
1483 src_r3 = _mm_unpackhi_epi16(src_r2_r3, zero_8x16b);
1484
1485 temp4 = _mm_madd_epi16(src_r0, temp4);
1486 temp5 = _mm_madd_epi16(src_r1, temp5);
1487 temp6 = _mm_madd_epi16(src_r2, temp6);
1488 temp7 = _mm_madd_epi16(src_r3, temp7);
1489
1490 if(u4_qp_div_6 >= 4)
1491 {
1492 resq_r0 = _mm_slli_epi32(temp4, u4_qp_div_6 - 4);
1493 resq_r1 = _mm_slli_epi32(temp5, u4_qp_div_6 - 4);
1494 resq_r2 = _mm_slli_epi32(temp6, u4_qp_div_6 - 4);
1495 resq_r3 = _mm_slli_epi32(temp7, u4_qp_div_6 - 4);
1496 }
1497 else
1498 {
1499 temp4 = _mm_add_epi32(temp4, add_rshift);
1500 temp5 = _mm_add_epi32(temp5, add_rshift);
1501 temp6 = _mm_add_epi32(temp6, add_rshift);
1502 temp7 = _mm_add_epi32(temp7, add_rshift);
1503 resq_r0 = _mm_srai_epi32(temp4, 4 - u4_qp_div_6);
1504 resq_r1 = _mm_srai_epi32(temp5, 4 - u4_qp_div_6);
1505 resq_r2 = _mm_srai_epi32(temp6, 4 - u4_qp_div_6);
1506 resq_r3 = _mm_srai_epi32(temp7, 4 - u4_qp_div_6);
1507 }
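
    /* The branches above implement the dequant scaling: coeff * iscal * weigh */
    /* is shifted left by (qp/6 - 4) when qp/6 >= 4, otherwise it is rounded   */
    /* with add_rshift and shifted right by (4 - qp/6)                         */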
1508
1509 resq_r0 = _mm_insert_epi32(resq_r0, (WORD32) pi2_dc_src[0], 0);
1510 /* Perform Inverse transform */
1511 /*-------------------------------------------------------------*/
1512 /* IDCT [ Horizontal transformation ] */
1513 /*-------------------------------------------------------------*/
1514 // Matrix transpose
1515 /*
1516 * a0 a1 a2 a3
1517 * b0 b1 b2 b3
1518 * c0 c1 c2 c3
1519 * d0 d1 d2 d3
1520 */
1521 /* a0 b0 a1 b1 */
1522 temp1 = _mm_unpacklo_epi32(resq_r0, resq_r1);
1523 /* c0 d0 c1 d1 */
1524 temp3 = _mm_unpacklo_epi32(resq_r2, resq_r3);
1525 /* a2 b2 a3 b3 */
1526 temp2 = _mm_unpackhi_epi32(resq_r0, resq_r1);
1527 /* c2 d2 c3 d3 */
1528 temp4 = _mm_unpackhi_epi32(resq_r2, resq_r3);
1529 /* a0 b0 c0 d0 */
1530 resq_r0 = _mm_unpacklo_epi64(temp1, temp3);
1531 /* a1 b1 c1 d1 */
1532 resq_r1 = _mm_unpackhi_epi64(temp1, temp3);
1533 /* a2 b2 c2 d2 */
1534 resq_r2 = _mm_unpacklo_epi64(temp2, temp4);
1535 /* a3 b3 c3 d3 */
1536 resq_r3 = _mm_unpackhi_epi64(temp2, temp4);
1537 /* Transform starts -- horizontal transform */
1538
1539 /*------------------------------------------------------------------*/
1540 /* z0 = w0 + w2 */
1541 temp0 = _mm_add_epi32(resq_r0, resq_r2);
1542 /* z1 = w0 - w2 */
1543 temp1 = _mm_sub_epi32(resq_r0, resq_r2);
1544 /* z2 = (w1 >> 1) - w3 */
1545 temp2 = _mm_srai_epi32(resq_r1, 1);
1546 temp2 = _mm_sub_epi32(temp2, resq_r3);
1547 /* z3 = w1 + (w3 >> 1) */
1548 temp3 = _mm_srai_epi32(resq_r3, 1);
1549 temp3 = _mm_add_epi32(temp3, resq_r1);
1550 /*----------------------------------------------------------*/
1551 /* x0 = z0 + z3 */
1552 resq_r0 = _mm_add_epi32(temp0, temp3);
1553 /* x1 = z1 + z2 */
1554 resq_r1 = _mm_add_epi32(temp1, temp2);
1555 /* x2 = z1 - z2 */
1556 resq_r2 = _mm_sub_epi32(temp1, temp2);
1557 /* x3 = z0 - z3 */
1558 resq_r3 = _mm_sub_epi32(temp0, temp3);
1559 // Matrix transpose
1560 /*
1561 * a0 b0 c0 d0
1562 * a1 b1 c1 d1
1563 * a2 b2 c2 d2
1564 * a3 b3 c3 d3
1565 */
1566 /* a0 a1 b0 b1 */
1567 temp1 = _mm_unpacklo_epi32(resq_r0, resq_r1);
1568 /* a2 a3 b2 b3 */
1569 temp3 = _mm_unpacklo_epi32(resq_r2, resq_r3);
1570 /* c0 c1 d0 d1 */
1571 temp2 = _mm_unpackhi_epi32(resq_r0, resq_r1);
1572 /* c2 c3 d2 d3 */
1573 temp4 = _mm_unpackhi_epi32(resq_r2, resq_r3);
1574 /* a0 a1 a2 a3 */
1575 resq_r0 = _mm_unpacklo_epi64(temp1, temp3);
1576 /* b0 b1 b2 b3 */
1577 resq_r1 = _mm_unpackhi_epi64(temp1, temp3);
1578 /* c0 c1 c2 c3 */
1579 resq_r2 = _mm_unpacklo_epi64(temp2, temp4);
1580 /* d0 d1 d2 d3 */
1581 resq_r3 = _mm_unpackhi_epi64(temp2, temp4);
1582 /* Transform ends -- horizontal transform */
1583
1584 temp0 = _mm_packs_epi32(resq_r0, resq_r1);
1585 temp1 = _mm_packs_epi32(resq_r2, resq_r3);
1586
1587 _mm_storeu_si128((__m128i *) (&pi2_tmp[0]), temp0);
1588 _mm_storeu_si128((__m128i *) (&pi2_tmp[2 * 4]), temp1);
1589
1590 /* Load pred buffer */
1591 pred_r0 = _mm_loadl_epi64((__m128i *) (&pu1_pred[0]));
1592 pred_r1 = _mm_loadl_epi64((__m128i *) (&pu1_pred[i4_pred_stride]));
1593 pred_r2 = _mm_loadl_epi64((__m128i *) (&pu1_pred[2 * i4_pred_stride]));
1594 pred_r3 = _mm_loadl_epi64((__m128i *) (&pu1_pred[3 * i4_pred_stride]));
1595
1596 pred_r0 = _mm_and_si128(pred_r0, chroma_mask);
1597 pred_r1 = _mm_and_si128(pred_r1, chroma_mask);
1598 pred_r2 = _mm_and_si128(pred_r2, chroma_mask);
1599 pred_r3 = _mm_and_si128(pred_r3, chroma_mask);
1600
1601 pred_r0 = _mm_cvtepu16_epi32(pred_r0);
1602 pred_r1 = _mm_cvtepu16_epi32(pred_r1);
1603 pred_r2 = _mm_cvtepu16_epi32(pred_r2);
1604 pred_r3 = _mm_cvtepu16_epi32(pred_r3);
1605
1606 /*--------------------------------------------------------------*/
1607 /* IDCT [ Vertical transformation] and Xij = (xij + 32)>>6 */
1608 /* */
1609     /* Store the residue and add the prediction to form the recon */
1610 /*--------------------------------------------------------------*/
1611 /* z0j = y0j + y2j */
1612 temp0 = _mm_add_epi32(resq_r0, resq_r2);
1613 /* z1j = y0j - y2j */
1614 temp1 = _mm_sub_epi32(resq_r0, resq_r2);
1615 /* z2j = (y1j>>1) - y3j */
1616 temp2 = _mm_srai_epi32(resq_r1, 1);
1617 temp2 = _mm_sub_epi32(temp2, resq_r3);
1618 /* z3j = y1j + (y3j>>1) */
1619 temp3 = _mm_srai_epi32(resq_r3, 1);
1620 temp3 = _mm_add_epi32(temp3, resq_r1);
1621
1622 /* x0j = z0j + z3j */
1623 temp4 = _mm_add_epi32(temp0, temp3);
1624 temp4 = _mm_add_epi32(temp4, value_32);
1625 temp4 = _mm_srai_epi32(temp4, 6);
1626 /* x1j = z1j + z2j */
1627 temp5 = _mm_add_epi32(temp1, temp2);
1628 temp5 = _mm_add_epi32(temp5, value_32);
1629 temp5 = _mm_srai_epi32(temp5, 6);
1630 /* x2j = z1j - z2j */
1631 temp6 = _mm_sub_epi32(temp1, temp2);
1632 temp6 = _mm_add_epi32(temp6, value_32);
1633 temp6 = _mm_srai_epi32(temp6, 6);
1634 /* x3j = z0j - z3j */
1635 temp7 = _mm_sub_epi32(temp0, temp3);
1636 temp7 = _mm_add_epi32(temp7, value_32);
1637 temp7 = _mm_srai_epi32(temp7, 6);
1638
1639 /* 32-bit to 16-bit conversion */
1640 temp0 = _mm_packs_epi32(temp4, temp5);
1641 temp1 = _mm_packs_epi32(temp6, temp7);
1642
1643 /* Saturate all values < -255 to -255 and retain the rest as it is */
1644 temp0 = _mm_max_epi16(temp0, neg_255_8x16b);
1645 /* Saturate all values > 255 to 255 and retain the rest as it is */
1646 temp0 = _mm_min_epi16(temp0, pos_255_8x16b);
1647
1648 /* Saturate all values < -255 to -255 and retain the rest as it is */
1649 temp1 = _mm_max_epi16(temp1, neg_255_8x16b);
1650 /* Saturate all values > 255 to 255 and retain the rest as it is */
1651 temp1 = _mm_min_epi16(temp1, pos_255_8x16b);
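
    /* With 8-bit prediction and 8-bit reconstruction, a residual never needs */
    /* a magnitude above 255, so the [-255, 255] clamp above is lossless      */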
1652
1653 chroma_mask = _mm_set1_epi32(0xffff0000);
1654 out_r0 = _mm_loadu_si128((__m128i *) (&pi2_res_ptr[0 * i4_res_stride]));
1655 out_r1 = _mm_loadu_si128((__m128i *) (&pi2_res_ptr[1 * i4_res_stride]));
1656 out_r2 = _mm_loadu_si128((__m128i *) (&pi2_res_ptr[2 * i4_res_stride]));
1657 out_r3 = _mm_loadu_si128((__m128i *) (&pi2_res_ptr[3 * i4_res_stride]));
1658
1659 out_r0 = _mm_and_si128(out_r0, chroma_mask);
1660 out_r1 = _mm_and_si128(out_r1, chroma_mask);
1661 out_r2 = _mm_and_si128(out_r2, chroma_mask);
1662 out_r3 = _mm_and_si128(out_r3, chroma_mask);
1663
1664 res_r0 = _mm_cvtepu16_epi32(temp0);
1665 res_r2 = _mm_cvtepu16_epi32(temp1);
1666 res_r1 = _mm_srli_si128(temp0, 8);
1667 res_r3 = _mm_srli_si128(temp1, 8);
1668 res_r1 = _mm_cvtepu16_epi32(res_r1);
1669 res_r3 = _mm_cvtepu16_epi32(res_r3);
1670
1671 out_r0 = _mm_add_epi16(out_r0, res_r0);
1672 out_r1 = _mm_add_epi16(out_r1, res_r1);
1673 out_r2 = _mm_add_epi16(out_r2, res_r2);
1674 out_r3 = _mm_add_epi16(out_r3, res_r3);
1675
1676 _mm_storeu_si128((__m128i *) (&pi2_res_ptr[0 * i4_res_stride]), out_r0);
1677 _mm_storeu_si128((__m128i *) (&pi2_res_ptr[1 * i4_res_stride]), out_r1);
1678 _mm_storeu_si128((__m128i *) (&pi2_res_ptr[2 * i4_res_stride]), out_r2);
1679 _mm_storeu_si128((__m128i *) (&pi2_res_ptr[3 * i4_res_stride]), out_r3);
1680
1681 resq_r0 = _mm_add_epi16(pred_r0, res_r0);
1682 resq_r1 = _mm_add_epi16(pred_r1, res_r1);
1683 resq_r2 = _mm_add_epi16(pred_r2, res_r2);
1684 resq_r3 = _mm_add_epi16(pred_r3, res_r3);
1685
1686 temp0 = _mm_packus_epi32(resq_r0, resq_r1);
1687 temp1 = _mm_packus_epi32(resq_r2, resq_r3);
1688
1689 /*------------------------------------------------------------------*/
1690 /* Clipping the results to 8 bits */
1691 sign_reg = _mm_cmpgt_epi16(temp0, zero_8x16b);
1692 temp0 = _mm_and_si128(temp0, sign_reg);
1693 sign_reg = _mm_cmpgt_epi16(temp1, zero_8x16b);
1694 temp1 = _mm_and_si128(temp1, sign_reg);
1695
1696 resq_r0 = _mm_packus_epi16(temp0, temp1);
1697 resq_r1 = _mm_srli_si128(resq_r0, 4);
1698 resq_r2 = _mm_srli_si128(resq_r1, 4);
1699 resq_r3 = _mm_srli_si128(resq_r2, 4);
1700
1701 resq_r0 = _mm_cvtepu8_epi16(resq_r0);
1702 resq_r1 = _mm_cvtepu8_epi16(resq_r1);
1703 resq_r2 = _mm_cvtepu8_epi16(resq_r2);
1704 resq_r3 = _mm_cvtepu8_epi16(resq_r3);
1705
1706 chroma_mask = _mm_set1_epi16(0xff00);
1707 out_r0 = _mm_loadl_epi64((__m128i *) (&pu1_out[0]));
1708 out_r1 = _mm_loadl_epi64((__m128i *) (&pu1_out[i4_out_stride]));
1709 out_r2 = _mm_loadl_epi64((__m128i *) (&pu1_out[2 * i4_out_stride]));
1710 out_r3 = _mm_loadl_epi64((__m128i *) (&pu1_out[3 * i4_out_stride]));
1711
1712 out_r0 = _mm_and_si128(out_r0, chroma_mask);
1713 out_r1 = _mm_and_si128(out_r1, chroma_mask);
1714 out_r2 = _mm_and_si128(out_r2, chroma_mask);
1715 out_r3 = _mm_and_si128(out_r3, chroma_mask);
1716
1717 out_r0 = _mm_add_epi8(out_r0, resq_r0);
1718 out_r1 = _mm_add_epi8(out_r1, resq_r1);
1719 out_r2 = _mm_add_epi8(out_r2, resq_r2);
1720 out_r3 = _mm_add_epi8(out_r3, resq_r3);
1721
1722 _mm_storel_epi64((__m128i *) (&pu1_out[0]), out_r0);
1723 _mm_storel_epi64((__m128i *) (&pu1_out[i4_out_stride]), out_r1);
1724 _mm_storel_epi64((__m128i *) (&pu1_out[2 * i4_out_stride]), out_r2);
1725 _mm_storel_epi64((__m128i *) (&pu1_out[3 * i4_out_stride]), out_r3);
1726 }
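
/* A minimal scalar sketch of the per-block math the SSE4.2 kernels above
 * implement: inverse quantization, the two 4-point butterfly passes,
 * (x + 32) >> 6 rounding, and reconstruction with clipping to [0, 255].
 * It is illustrative only and excluded from the build; the helper name and
 * the omission of interleaved-chroma handling and residual accumulation are
 * simplifications, not part of the library API. */
#if 0
static void isvc_iquant_itrans_recon_4x4_scalar_sketch(
    const WORD16 *pi2_src, const UWORD16 *pu2_iscal_mat, const UWORD16 *pu2_weigh_mat,
    UWORD32 u4_qp_div_6, WORD16 i2_dc, const UWORD8 *pu1_pred, WORD32 i4_pred_stride,
    UWORD8 *pu1_out, WORD32 i4_out_stride)
{
    WORD32 i, j;
    WORD32 ai4_w[16], ai4_tmp[16];

    /* Inverse quantization: coeff * iscal * weigh, scaled by qp / 6 */
    for(i = 0; i < 16; i++)
    {
        WORD32 i4_q = pi2_src[i] * pu2_iscal_mat[i] * pu2_weigh_mat[i];

        ai4_w[i] = (u4_qp_div_6 >= 4) ? (i4_q << (u4_qp_div_6 - 4))
                                      : ((i4_q + (1 << (3 - u4_qp_div_6))) >> (4 - u4_qp_div_6));
    }

    /* DC is taken from pi2_dc_src in the kernels above */
    ai4_w[0] = i2_dc;

    /* Horizontal pass of the 4-point inverse transform */
    for(i = 0; i < 4; i++)
    {
        WORD32 *pi4_row = &ai4_w[4 * i];
        WORD32 z0 = pi4_row[0] + pi4_row[2];
        WORD32 z1 = pi4_row[0] - pi4_row[2];
        WORD32 z2 = (pi4_row[1] >> 1) - pi4_row[3];
        WORD32 z3 = pi4_row[1] + (pi4_row[3] >> 1);

        ai4_tmp[4 * i + 0] = z0 + z3;
        ai4_tmp[4 * i + 1] = z1 + z2;
        ai4_tmp[4 * i + 2] = z1 - z2;
        ai4_tmp[4 * i + 3] = z0 - z3;
    }

    /* Vertical pass, rounding and reconstruction with clipping to [0, 255] */
    for(j = 0; j < 4; j++)
    {
        WORD32 z0 = ai4_tmp[0 + j] + ai4_tmp[8 + j];
        WORD32 z1 = ai4_tmp[0 + j] - ai4_tmp[8 + j];
        WORD32 z2 = (ai4_tmp[4 + j] >> 1) - ai4_tmp[12 + j];
        WORD32 z3 = ai4_tmp[4 + j] + (ai4_tmp[12 + j] >> 1);
        WORD32 ai4_x[4];

        ai4_x[0] = z0 + z3;
        ai4_x[1] = z1 + z2;
        ai4_x[2] = z1 - z2;
        ai4_x[3] = z0 - z3;

        for(i = 0; i < 4; i++)
        {
            WORD32 i4_res = (ai4_x[i] + 32) >> 6;
            WORD32 i4_rec = pu1_pred[i * i4_pred_stride + j] + i4_res;

            pu1_out[i * i4_out_stride + j] =
                (UWORD8) ((i4_rec < 0) ? 0 : ((i4_rec > 255) ? 255 : i4_rec));
        }
    }
}
#endif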
1727
1728 void isvc_iquant_itrans_recon_res_chroma_4x4_with_res_acc_sse42(
1729 buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_res_pred,
1730 buffer_container_t *ps_res, buffer_container_t *ps_rec,
1731 iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants, WORD16 *pi2_tmp, WORD16 *pi2_dc_src,
1732 WORD32 i4_iq_start_idx, UWORD8 u1_res_accumulate)
1733 {
1734 WORD16 *pi2_src = (WORD16 *) ps_src->pv_data;
1735 WORD16 *pi2_res = (WORD16 *) ps_res->pv_data;
1736 WORD16 *pi2_res_pred = (WORD16 *) ps_res_pred->pv_data;
1737 UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
1738 UWORD8 *pu1_out = (UWORD8 *) ps_rec->pv_data;
1739 WORD32 i4_src_stride = ps_src->i4_data_stride;
1740 WORD32 i4_res_stride = ps_res->i4_data_stride;
1741 WORD32 i4_res_pred_stride = ps_res_pred->i4_data_stride;
1742 WORD32 i4_pred_stride = ps_pred->i4_data_stride;
1743 WORD32 i4_out_stride = ps_rec->i4_data_stride;
1744 const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat;
1745 const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat;
1746 UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6;
1747 __m128i src_r0_r1, src_r2_r3;
1748 __m128i src_r0, src_r1, src_r2, src_r3;
1749 __m128i scalemat_r0_r1, scalemat_r2_r3;
1750 __m128i pred_r0, pred_r1, pred_r2, pred_r3;
1751 __m128i res_pred_r0, res_pred_r1, res_pred_r2, res_pred_r3;
1752 __m128i res_r0, res_r1, res_r2, res_r3;
1753 __m128i dequant_r0_r1, dequant_r2_r3;
1754 /* all bits reset to zero */
1755 __m128i zero_8x16b = _mm_setzero_si128();
1756 __m128i reg_chroma = _mm_set1_epi32(0xFFFF);
1757 __m128i neg_255_8x16b = _mm_set1_epi16(-((WORD16) UINT8_MAX));
1758 __m128i pos_255_8x16b = _mm_set1_epi16(((WORD16) UINT8_MAX));
1759 __m128i temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
1760 __m128i resq_r0, resq_r1, resq_r2, resq_r3;
1761 __m128i add_rshift = _mm_set1_epi32((u4_qp_div_6 < 4) ? (1 << (3 - u4_qp_div_6)) : 0);
1762 __m128i value_32 = _mm_set1_epi32(32);
1763 __m128i chroma_mask = _mm_set1_epi16(0xFF);
1764 __m128i out_r0, out_r1, out_r2, out_r3;
1765 __m128i mask_r0;
1766
1767 ASSERT(4 == i4_src_stride);
1768 ASSERT(1 == u1_res_accumulate);
1769
1770 UNUSED(i4_src_stride);
1771 UNUSED(u1_res_accumulate);
1772 UNUSED(i4_iq_start_idx);
1773
1774 /*************************************************************/
1775     /* Inverse quantization of the 4x4 coefficients, done below  */
1776     /* with SSE4.2 SIMD operations                                */
1777 /*************************************************************/
1778 /* a00 a01 a02 a03 a10 a11 a12 a13 -- the source
1779 matrix 0th,1st row */
1780 src_r0_r1 = _mm_loadu_si128((__m128i *) (pi2_src));
1781
1782 /* a20 a21 a22 a23 a30 a31 a32 a33 -- the
1783 source matrix 2nd,3rd row */
1784 src_r2_r3 = _mm_loadu_si128((__m128i *) (pi2_src + 8));
1785
1786 /* b00 b01 b02 b03 b10 b11 b12 b13 -- the
1787 scaling matrix 0th,1st row */
1788 scalemat_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_iscal_mat));
1789
1790     /* b20 b21 b22 b23 b30 b31 b32 b33 -- the
1791     scaling matrix 2nd,3rd row */
1792 scalemat_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_iscal_mat + 8));
1793
1794 /* q00 q01 q02 q03 q10 q11
1795 q12 q13 -- all 16 bits */
1796 dequant_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_weigh_mat));
1797
1798 /* q20 q21 q22 q23 q30 q31
1799 q32 q33 -- all 16 bits */
1800 dequant_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_weigh_mat + 8));
1801
1802 temp0 = _mm_mullo_epi16(scalemat_r0_r1,
1803 dequant_r0_r1); // b00*q00 b01*q01 b02*q02 b03*q03 b10*q10 b11*q11
1804 // b12*q12 b13*q13 -- 16 bit result
1805
1806 temp1 = _mm_mullo_epi16(scalemat_r2_r3, dequant_r2_r3);
1807
1808 /* b00*q00 0 b01*q01 0 b02*q02 0 b03*q03 0 -- 16 bit long */
1809 temp4 = _mm_unpacklo_epi16(temp0, zero_8x16b);
1810
1811 /* b10*q10 0 b11*q11 0 b12*q12 0 b13*q13 0 -- 16 bit long */
1812 temp5 = _mm_unpackhi_epi16(temp0, zero_8x16b);
1813
1814     /* b20*q20 0 b21*q21 0 b22*q22 0 b23*q23 0 -- 16 bit long */
1815 temp6 = _mm_unpacklo_epi16(temp1, zero_8x16b);
1816
1817     /* b30*q30 0 b31*q31 0 b32*q32 0 b33*q33 0 -- 16 bit long */
1818 temp7 = _mm_unpackhi_epi16(temp1, zero_8x16b);
1819
1820 /* a00 0 a01 0 a02 0 a03 0 -- 16 bit long */
1821 src_r0 = _mm_unpacklo_epi16(src_r0_r1, zero_8x16b);
1822 /* a10 0 a11 0 a12 0 a13 0 -- 16 bit long */
1823 src_r1 = _mm_unpackhi_epi16(src_r0_r1, zero_8x16b);
1824 /* a20 0 a21 0 a22 0 a23 0 -- 16 bit long */
1825 src_r2 = _mm_unpacklo_epi16(src_r2_r3, zero_8x16b);
1826 /* a30 0 a31 0 a32 0 a33 0 -- 16 bit long */
1827 src_r3 = _mm_unpackhi_epi16(src_r2_r3, zero_8x16b);
1828
1829 temp4 = _mm_madd_epi16(src_r0, temp4);
1830 temp5 = _mm_madd_epi16(src_r1, temp5);
1831 temp6 = _mm_madd_epi16(src_r2, temp6);
1832 temp7 = _mm_madd_epi16(src_r3, temp7);
1833
1834 if(u4_qp_div_6 >= 4)
1835 {
1836 resq_r0 = _mm_slli_epi32(temp4, u4_qp_div_6 - 4);
1837 resq_r1 = _mm_slli_epi32(temp5, u4_qp_div_6 - 4);
1838 resq_r2 = _mm_slli_epi32(temp6, u4_qp_div_6 - 4);
1839 resq_r3 = _mm_slli_epi32(temp7, u4_qp_div_6 - 4);
1840 }
1841 else
1842 {
1843 temp4 = _mm_add_epi32(temp4, add_rshift);
1844 temp5 = _mm_add_epi32(temp5, add_rshift);
1845 temp6 = _mm_add_epi32(temp6, add_rshift);
1846 temp7 = _mm_add_epi32(temp7, add_rshift);
1847 resq_r0 = _mm_srai_epi32(temp4, 4 - u4_qp_div_6);
1848 resq_r1 = _mm_srai_epi32(temp5, 4 - u4_qp_div_6);
1849 resq_r2 = _mm_srai_epi32(temp6, 4 - u4_qp_div_6);
1850 resq_r3 = _mm_srai_epi32(temp7, 4 - u4_qp_div_6);
1851 }
1852
1853 resq_r0 = _mm_insert_epi32(resq_r0, (WORD32) pi2_dc_src[0], 0);
1854 /* Perform Inverse transform */
1855 /*-------------------------------------------------------------*/
1856 /* IDCT [ Horizontal transformation ] */
1857 /*-------------------------------------------------------------*/
1858 // Matrix transpose
1859 /*
1860 * a0 a1 a2 a3
1861 * b0 b1 b2 b3
1862 * c0 c1 c2 c3
1863 * d0 d1 d2 d3
1864 */
1865 /* a0 b0 a1 b1 */
1866 temp1 = _mm_unpacklo_epi32(resq_r0, resq_r1);
1867 /* c0 d0 c1 d1 */
1868 temp3 = _mm_unpacklo_epi32(resq_r2, resq_r3);
1869 /* a2 b2 a3 b3 */
1870 temp2 = _mm_unpackhi_epi32(resq_r0, resq_r1);
1871 /* c2 d2 c3 d3 */
1872 temp4 = _mm_unpackhi_epi32(resq_r2, resq_r3);
1873 /* a0 b0 c0 d0 */
1874 resq_r0 = _mm_unpacklo_epi64(temp1, temp3);
1875 /* a1 b1 c1 d1 */
1876 resq_r1 = _mm_unpackhi_epi64(temp1, temp3);
1877 /* a2 b2 c2 d2 */
1878 resq_r2 = _mm_unpacklo_epi64(temp2, temp4);
1879 /* a3 b3 c3 d3 */
1880 resq_r3 = _mm_unpackhi_epi64(temp2, temp4);
1881 /* Transform starts -- horizontal transform */
1882
1883 /*------------------------------------------------------------------*/
1884 /* z0 = w0 + w2 */
1885 temp0 = _mm_add_epi32(resq_r0, resq_r2);
1886 /* z1 = w0 - w2 */
1887 temp1 = _mm_sub_epi32(resq_r0, resq_r2);
1888 /* z2 = (w1 >> 1) - w3 */
1889 temp2 = _mm_srai_epi32(resq_r1, 1);
1890 temp2 = _mm_sub_epi32(temp2, resq_r3);
1891 /* z3 = w1 + (w3 >> 1) */
1892 temp3 = _mm_srai_epi32(resq_r3, 1); //(w3>>1) + w1
1893 temp3 = _mm_add_epi32(temp3, resq_r1);
1894 /*----------------------------------------------------------*/
1895 /* x0 = z0 + z3 */
1896 resq_r0 = _mm_add_epi32(temp0, temp3);
1897 /* x1 = z1 + z2 */
1898 resq_r1 = _mm_add_epi32(temp1, temp2);
1899 /* x2 = z1 - z2 */
1900 resq_r2 = _mm_sub_epi32(temp1, temp2);
1901 /* x3 = z0 - z3 */
1902 resq_r3 = _mm_sub_epi32(temp0, temp3);
1903 // Matrix transpose
1904 /*
1905 * a0 b0 c0 d0
1906 * a1 b1 c1 d1
1907 * a2 b2 c2 d2
1908 * a3 b3 c3 d3
1909 */
1910 /* a0 a1 b0 b1 */
1911 temp1 = _mm_unpacklo_epi32(resq_r0, resq_r1);
1912 /* a2 a3 b2 b3 */
1913 temp3 = _mm_unpacklo_epi32(resq_r2, resq_r3);
1914 /* c0 c1 d0 d1 */
1915 temp2 = _mm_unpackhi_epi32(resq_r0, resq_r1);
1916 /* c2 c3 d2 d3 */
1917 temp4 = _mm_unpackhi_epi32(resq_r2, resq_r3);
1918 /* a0 a1 a2 a3 */
1919 resq_r0 = _mm_unpacklo_epi64(temp1, temp3);
1920 /* b0 b1 b2 b3 */
1921 resq_r1 = _mm_unpackhi_epi64(temp1, temp3);
1922 /* c0 c1 c2 c3 */
1923 resq_r2 = _mm_unpacklo_epi64(temp2, temp4);
1924 /* d0 d1 d2 d3 */
1925 resq_r3 = _mm_unpackhi_epi64(temp2, temp4);
1926 /* Transform ends -- horizontal transform */
1927
1928 temp0 = _mm_packs_epi32(resq_r0, resq_r1);
1929 temp1 = _mm_packs_epi32(resq_r2, resq_r3);
1930
1931 _mm_storeu_si128((__m128i *) (&pi2_tmp[0]), temp0);
1932 _mm_storeu_si128((__m128i *) (&pi2_tmp[2 * 4]), temp1);
1933
1934 /* Load pred buffer */
1935 pred_r0 = _mm_loadl_epi64((__m128i *) (&pu1_pred[0]));
1936 pred_r1 = _mm_loadl_epi64((__m128i *) (&pu1_pred[i4_pred_stride]));
1937 pred_r2 = _mm_loadl_epi64((__m128i *) (&pu1_pred[2 * i4_pred_stride]));
1938 pred_r3 = _mm_loadl_epi64((__m128i *) (&pu1_pred[3 * i4_pred_stride]));
1939
1940 pred_r0 = _mm_and_si128(pred_r0, chroma_mask);
1941 pred_r1 = _mm_and_si128(pred_r1, chroma_mask);
1942 pred_r2 = _mm_and_si128(pred_r2, chroma_mask);
1943 pred_r3 = _mm_and_si128(pred_r3, chroma_mask);
1944
1945 /*--------------------------------------------------------------*/
1946 /* IDCT [ Vertical transformation] and Xij = (xij + 32)>>6 */
1947 /* */
1948     /* Store the residue and add the prediction to form the recon */
1949 /*--------------------------------------------------------------*/
1950 /* z0j = y0j + y2j */
1951 temp0 = _mm_add_epi32(resq_r0, resq_r2);
1952 /* z1j = y0j - y2j */
1953 temp1 = _mm_sub_epi32(resq_r0, resq_r2);
1954 /* z2j = (y1j>>1) - y3j */
1955 temp2 = _mm_srai_epi32(resq_r1, 1);
1956 temp2 = _mm_sub_epi32(temp2, resq_r3);
1957 /* z3j = y1j + (y3j>>1) */
1958 temp3 = _mm_srai_epi32(resq_r3, 1);
1959 temp3 = _mm_add_epi32(temp3, resq_r1);
1960
1961 /* x0j = z0j + z3j */
1962 temp4 = _mm_add_epi32(temp0, temp3);
1963 temp4 = _mm_add_epi32(temp4, value_32);
1964 temp4 = _mm_srai_epi32(temp4, 6);
1965 res_r0 = temp4;
1966 /* x1j = z1j + z2j */
1967 temp5 = _mm_add_epi32(temp1, temp2);
1968 temp5 = _mm_add_epi32(temp5, value_32);
1969 temp5 = _mm_srai_epi32(temp5, 6);
1970 res_r1 = temp5;
1971 /* x2j = z1j - z2j */
1972 temp6 = _mm_sub_epi32(temp1, temp2);
1973 temp6 = _mm_add_epi32(temp6, value_32);
1974 temp6 = _mm_srai_epi32(temp6, 6);
1975 res_r2 = temp6;
1976 /* x3j = z0j - z3j */
1977 temp7 = _mm_sub_epi32(temp0, temp3);
1978 temp7 = _mm_add_epi32(temp7, value_32);
1979 temp7 = _mm_srai_epi32(temp7, 6);
1980 res_r3 = temp7;
1981
1982 res_pred_r0 = _mm_loadu_si128((__m128i *) &pi2_res_pred[0 * i4_res_pred_stride]);
1983 res_pred_r1 = _mm_loadu_si128((__m128i *) &pi2_res_pred[1 * i4_res_pred_stride]);
1984 res_pred_r2 = _mm_loadu_si128((__m128i *) &pi2_res_pred[2 * i4_res_pred_stride]);
1985 res_pred_r3 = _mm_loadu_si128((__m128i *) &pi2_res_pred[3 * i4_res_pred_stride]);
1986
1987 res_pred_r0 = _mm_and_si128(res_pred_r0, reg_chroma);
1988 res_pred_r1 = _mm_and_si128(res_pred_r1, reg_chroma);
1989 res_pred_r2 = _mm_and_si128(res_pred_r2, reg_chroma);
1990 res_pred_r3 = _mm_and_si128(res_pred_r3, reg_chroma);
1991
1992 temp0 = _mm_packs_epi32(res_r0, res_r1);
1993 temp1 = _mm_packs_epi32(res_r2, res_r3);
1994
1995 res_r0 = _mm_cvtepu16_epi32(temp0);
1996 res_r2 = _mm_cvtepu16_epi32(temp1);
1997 res_r1 = _mm_srli_si128(temp0, 8);
1998 res_r3 = _mm_srli_si128(temp1, 8);
1999 res_r1 = _mm_cvtepu16_epi32(res_r1);
2000 res_r3 = _mm_cvtepu16_epi32(res_r3);
2001
2002 res_r0 = _mm_add_epi16(res_pred_r0, res_r0);
2003 res_r1 = _mm_add_epi16(res_pred_r1, res_r1);
2004 res_r2 = _mm_add_epi16(res_pred_r2, res_r2);
2005 res_r3 = _mm_add_epi16(res_pred_r3, res_r3);
2006
2007 temp0 = _mm_packus_epi32(res_r0, res_r1);
2008 temp1 = _mm_packus_epi32(res_r2, res_r3);
2009
2010 /* Saturate all values < -255 to -255 and retain the rest as it is */
2011 temp0 = _mm_max_epi16(temp0, neg_255_8x16b);
2012 /* Saturate all values > 255 to 255 and retain the rest as it is */
2013 temp0 = _mm_min_epi16(temp0, pos_255_8x16b);
2014
2015 /* Saturate all values < -255 to -255 and retain the rest as it is */
2016 temp1 = _mm_max_epi16(temp1, neg_255_8x16b);
2017 /* Saturate all values > 255 to 255 and retain the rest as it is */
2018 temp1 = _mm_min_epi16(temp1, pos_255_8x16b);
2019
2020 res_r0 = _mm_cvtepu16_epi32(temp0);
2021 res_r1 = _mm_srli_si128(temp0, 8);
2022 res_r1 = _mm_cvtepu16_epi32(res_r1);
2023
2024 res_r2 = _mm_cvtepu16_epi32(temp1);
2025 res_r3 = _mm_srli_si128(temp1, 8);
2026 res_r3 = _mm_cvtepu16_epi32(res_r3);
2027
2028 chroma_mask = _mm_set1_epi32(0xffff0000);
2029 out_r0 = _mm_loadu_si128((__m128i *) (&pi2_res[0 * i4_res_stride]));
2030 out_r1 = _mm_loadu_si128((__m128i *) (&pi2_res[1 * i4_res_stride]));
2031 out_r2 = _mm_loadu_si128((__m128i *) (&pi2_res[2 * i4_res_stride]));
2032 out_r3 = _mm_loadu_si128((__m128i *) (&pi2_res[3 * i4_res_stride]));
2033
2034 out_r0 = _mm_and_si128(out_r0, chroma_mask);
2035 out_r1 = _mm_and_si128(out_r1, chroma_mask);
2036 out_r2 = _mm_and_si128(out_r2, chroma_mask);
2037 out_r3 = _mm_and_si128(out_r3, chroma_mask);
2038
2039 out_r0 = _mm_add_epi16(out_r0, res_r0);
2040 out_r1 = _mm_add_epi16(out_r1, res_r1);
2041 out_r2 = _mm_add_epi16(out_r2, res_r2);
2042 out_r3 = _mm_add_epi16(out_r3, res_r3);
2043
2044 _mm_storeu_si128((__m128i *) (&pi2_res[0 * i4_res_stride]), out_r0);
2045 _mm_storeu_si128((__m128i *) (&pi2_res[1 * i4_res_stride]), out_r1);
2046 _mm_storeu_si128((__m128i *) (&pi2_res[2 * i4_res_stride]), out_r2);
2047 _mm_storeu_si128((__m128i *) (&pi2_res[3 * i4_res_stride]), out_r3);
2048
2049 pred_r0 = _mm_cvtepu16_epi32(pred_r0);
2050 pred_r1 = _mm_cvtepu16_epi32(pred_r1);
2051 pred_r2 = _mm_cvtepu16_epi32(pred_r2);
2052 pred_r3 = _mm_cvtepu16_epi32(pred_r3);
2053
2054 resq_r0 = _mm_add_epi16(pred_r0, res_r0);
2055 resq_r1 = _mm_add_epi16(pred_r1, res_r1);
2056 resq_r2 = _mm_add_epi16(pred_r2, res_r2);
2057 resq_r3 = _mm_add_epi16(pred_r3, res_r3);
2058
2059 temp0 = _mm_packus_epi32(resq_r0, resq_r1);
2060 temp1 = _mm_packus_epi32(resq_r2, resq_r3);
2061
2062 /* Clipping the results to 8 bits */
2063 mask_r0 = _mm_cmpgt_epi16(temp0, zero_8x16b);
2064 temp0 = _mm_and_si128(temp0, mask_r0);
2065 mask_r0 = _mm_cmpgt_epi16(temp1, zero_8x16b);
2066 temp1 = _mm_and_si128(temp1, mask_r0);
2067
2068 resq_r0 = _mm_packus_epi16(temp0, temp1);
2069 resq_r1 = _mm_srli_si128(resq_r0, 4);
2070 resq_r2 = _mm_srli_si128(resq_r1, 4);
2071 resq_r3 = _mm_srli_si128(resq_r2, 4);
2072
2073 resq_r0 = _mm_cvtepu8_epi16(resq_r0);
2074 resq_r1 = _mm_cvtepu8_epi16(resq_r1);
2075 resq_r2 = _mm_cvtepu8_epi16(resq_r2);
2076 resq_r3 = _mm_cvtepu8_epi16(resq_r3);
2077
2078 chroma_mask = _mm_set1_epi16(0xFF00);
2079 out_r0 = _mm_loadl_epi64((__m128i *) (&pu1_out[0 * i4_out_stride]));
2080 out_r1 = _mm_loadl_epi64((__m128i *) (&pu1_out[1 * i4_out_stride]));
2081 out_r2 = _mm_loadl_epi64((__m128i *) (&pu1_out[2 * i4_out_stride]));
2082 out_r3 = _mm_loadl_epi64((__m128i *) (&pu1_out[3 * i4_out_stride]));
2083
2084 out_r0 = _mm_and_si128(out_r0, chroma_mask);
2085 out_r1 = _mm_and_si128(out_r1, chroma_mask);
2086 out_r2 = _mm_and_si128(out_r2, chroma_mask);
2087 out_r3 = _mm_and_si128(out_r3, chroma_mask);
2088
2089 out_r0 = _mm_add_epi8(out_r0, resq_r0);
2090 out_r1 = _mm_add_epi8(out_r1, resq_r1);
2091 out_r2 = _mm_add_epi8(out_r2, resq_r2);
2092 out_r3 = _mm_add_epi8(out_r3, resq_r3);
2093
2094 _mm_storel_epi64((__m128i *) (&pu1_out[0 * i4_out_stride]), out_r0);
2095 _mm_storel_epi64((__m128i *) (&pu1_out[1 * i4_out_stride]), out_r1);
2096 _mm_storel_epi64((__m128i *) (&pu1_out[2 * i4_out_stride]), out_r2);
2097 _mm_storel_epi64((__m128i *) (&pu1_out[3 * i4_out_stride]), out_r3);
2098 }
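
/* A minimal scalar sketch (illustrative only, excluded from the build) of the
 * residual-accumulation rule used by the *_with_res_acc_* kernels above: the
 * incoming residual prediction is added to the freshly computed residual, the
 * sum is clamped to [-255, 255] (the same clamp the SIMD performs with
 * neg_255_8x16b / pos_255_8x16b), stored to the residual buffer, and only then
 * added to the prediction. The helper name is an assumption made for brevity. */
#if 0
static UWORD8 isvc_recon_with_res_acc_sketch(UWORD8 u1_pred, WORD32 i4_res, WORD16 i2_res_pred,
                                             WORD16 *pi2_res_out)
{
    WORD32 i4_acc = i4_res + i2_res_pred;
    WORD32 i4_rec;

    /* Clamp the accumulated residual to the range an 8-bit recon can require */
    if(i4_acc < -255) i4_acc = -255;
    if(i4_acc > 255) i4_acc = 255;

    *pi2_res_out = (WORD16) i4_acc;

    /* Reconstruction is the prediction plus the clamped residual, clipped to 8 bits */
    i4_rec = u1_pred + i4_acc;

    return (UWORD8) ((i4_rec < 0) ? 0 : ((i4_rec > 255) ? 255 : i4_rec));
}
#endif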
2099
2100 void isvc_iquant_itrans_recon_dc_4x4_sse42(buffer_container_t *ps_src, buffer_container_t *ps_pred,
2101 buffer_container_t *ps_res_pred,
2102 buffer_container_t *ps_res, buffer_container_t *ps_rec,
2103 iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants,
2104 WORD16 *pi2_tmp, WORD16 *pi2_dc_src,
2105 WORD32 i4_iq_start_idx, UWORD8 u1_res_accumulate)
2106 {
2107 UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
2108 UWORD8 *pu1_out = (UWORD8 *) ps_rec->pv_data;
2109 WORD32 i4_pred_stride = ps_pred->i4_data_stride;
2110 WORD32 i4_out_stride = ps_rec->i4_data_stride;
2111 const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat;
2112 const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat;
2113 UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6;
2114 UWORD32 *pu4_out = (UWORD32 *) pu1_out;
2115 WORD32 q0 = ((WORD16 *) (ps_src->pv_data))[0];
2116 WORD16 i_macro, rnd_fact = (u4_qp_div_6 < 4) ? 1 << (3 - u4_qp_div_6) : 0;
2117
2118 __m128i pred_r0, pred_r1, pred_r2, pred_r3;
2119 __m128i sign_reg;
2120 /* all bits reset to zero */
2121 __m128i zero_8x16b = _mm_setzero_si128();
2122 __m128i temp4, temp5, temp6, temp7;
2123 __m128i value_add;
2124
2125 ASSERT(0 == u1_res_accumulate);
2126
2127 UNUSED(pi2_tmp);
2128 UNUSED(ps_res);
2129 UNUSED(ps_res_pred);
2130 UNUSED(u1_res_accumulate);
2131
2132 INV_QUANT(q0, pu2_iscal_mat[0], pu2_weigh_mat[0], u4_qp_div_6, rnd_fact, 4);
2133
2134 /* Restoring dc value for intra case */
2135 if(i4_iq_start_idx != 0)
2136 {
2137 q0 = pi2_dc_src[0];
2138 }
2139
2140 i_macro = ((q0 + 32) >> 6);
2141
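    /* With only the DC coefficient present, every output of the 4x4 inverse  */
    /* transform equals (q0 + 32) >> 6, so reconstruction reduces to adding   */
    /* i_macro to each prediction sample                                       */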
2142 value_add = _mm_set1_epi16(i_macro);
2143
2144 zero_8x16b = _mm_setzero_si128();
2145
2146 /* Load pred buffer */
2147
2148 /* p00 p01 p02 p03 0 0 0 0 -- all 8 bits */
2149 pred_r0 = _mm_loadl_epi64((__m128i *) (&pu1_pred[0]));
2150
2151 /* p10 p11 p12 p13 0 0 0 0 -- all 8 bits */
2152 pred_r1 = _mm_loadl_epi64((__m128i *) (&pu1_pred[i4_pred_stride]));
2153
2154 /* p20 p21 p22 p23 0 0 0 0 -- all 8 bits */
2155 pred_r2 = _mm_loadl_epi64((__m128i *) (&pu1_pred[2 * i4_pred_stride]));
2156
2157 /* p30 p31 p32 p33 0 0 0 0 -- all 8 bits */
2158 pred_r3 = _mm_loadl_epi64((__m128i *) (&pu1_pred[3 * i4_pred_stride]));
2159
2160 pred_r0 = _mm_cvtepu8_epi16(pred_r0);
2161 pred_r1 = _mm_cvtepu8_epi16(pred_r1);
2162 pred_r2 = _mm_cvtepu8_epi16(pred_r2);
2163 pred_r3 = _mm_cvtepu8_epi16(pred_r3);
2164
2165 pred_r0 = _mm_unpacklo_epi64(pred_r0, pred_r1);
2166 pred_r2 = _mm_unpacklo_epi64(pred_r2, pred_r3);
2167
2168 temp4 = _mm_add_epi16(value_add, pred_r0);
2169 temp5 = _mm_add_epi16(value_add, pred_r2);
2170 /*------------------------------------------------------------------*/
2171 /* Clipping the results to 8 bits */
2172 sign_reg = _mm_cmpgt_epi16(temp4, zero_8x16b);
2173 temp4 = _mm_and_si128(temp4, sign_reg);
2174 sign_reg = _mm_cmpgt_epi16(temp5, zero_8x16b);
2175 temp5 = _mm_and_si128(temp5, sign_reg);
2176
2177 temp4 = _mm_packus_epi16(temp4, temp5);
2178 temp5 = _mm_srli_si128(temp4, 4);
2179 temp6 = _mm_srli_si128(temp5, 4);
2180 temp7 = _mm_srli_si128(temp6, 4);
2181
2182 *pu4_out = _mm_cvtsi128_si32(temp4);
2183 pu1_out += i4_out_stride;
2184 pu4_out = (UWORD32 *) (pu1_out);
2185 *(pu4_out) = _mm_cvtsi128_si32(temp5);
2186 pu1_out += i4_out_stride;
2187 pu4_out = (UWORD32 *) (pu1_out);
2188 *(pu4_out) = _mm_cvtsi128_si32(temp6);
2189 pu1_out += i4_out_stride;
2190 pu4_out = (UWORD32 *) (pu1_out);
2191 *(pu4_out) = _mm_cvtsi128_si32(temp7);
2192 }
2193
2194 void isvc_iquant_itrans_recon_res_dc_4x4_sse42(
2195 buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_res_pred,
2196 buffer_container_t *ps_res, buffer_container_t *ps_rec,
2197 iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants, WORD16 *pi2_tmp, WORD16 *pi2_dc_src,
2198 WORD32 i4_iq_start_idx, UWORD8 u1_res_accumulate)
2199 {
2200 WORD16 *pi2_res = (WORD16 *) ps_res->pv_data;
2201 WORD16 *pi2_res_ptr = pi2_res;
2202 UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
2203 UWORD8 *pu1_out = (UWORD8 *) ps_rec->pv_data;
2204 WORD32 i4_res_stride = ps_res->i4_data_stride;
2205 WORD32 i4_pred_stride = ps_pred->i4_data_stride;
2206 WORD32 i4_out_stride = ps_rec->i4_data_stride;
2207 const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat;
2208 const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat;
2209 UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6;
2210 UWORD32 *pu4_out = (UWORD32 *) pu1_out;
2211 WORD32 q0 = ((WORD16 *) (ps_src->pv_data))[0];
2212 WORD16 i_macro, rnd_fact = (u4_qp_div_6 < 4) ? 1 << (3 - u4_qp_div_6) : 0;
2213
2214 __m128i pred_r0, pred_r1, pred_r2, pred_r3;
2215 __m128i sign_reg;
2216 /* all bits reset to zero */
2217 __m128i zero_8x16b = _mm_setzero_si128();
2218 __m128i temp4, temp5, temp6, temp7;
2219 __m128i value_add;
2220
2221 ASSERT(0 == u1_res_accumulate);
2222
2223 UNUSED(pi2_tmp);
2224 UNUSED(ps_res_pred);
2225 UNUSED(u1_res_accumulate);
2226
2227 INV_QUANT(q0, pu2_iscal_mat[0], pu2_weigh_mat[0], u4_qp_div_6, rnd_fact, 4);
2228
2229 /* Restoring dc value for intra case */
2230 if(i4_iq_start_idx != 0) q0 = pi2_dc_src[0];
2231
2232 i_macro = ((q0 + 32) >> 6);
2233
2234 value_add = _mm_set1_epi16(isvc_get_residue(i_macro, 0, 0));
2235
2236 zero_8x16b = _mm_setzero_si128();
2237
2238 /* Load pred buffer */
2239
2240 /* p00 p01 p02 p03 0 0 0 0 -- all 8 bits */
2241 pred_r0 = _mm_loadl_epi64((__m128i *) (&pu1_pred[0]));
2242
2243 /* p10 p11 p12 p13 0 0 0 0 -- all 8 bits */
2244 pred_r1 = _mm_loadl_epi64((__m128i *) (&pu1_pred[i4_pred_stride]));
2245
2246 /* p20 p21 p22 p23 0 0 0 0 -- all 8 bits */
2247 pred_r2 = _mm_loadl_epi64((__m128i *) (&pu1_pred[2 * i4_pred_stride]));
2248
2249 /* p30 p31 p32 p33 0 0 0 0 -- all 8 bits */
2250 pred_r3 = _mm_loadl_epi64((__m128i *) (&pu1_pred[3 * i4_pred_stride]));
2251
2252 pred_r0 = _mm_cvtepu8_epi16(pred_r0);
2253 pred_r1 = _mm_cvtepu8_epi16(pred_r1);
2254 pred_r2 = _mm_cvtepu8_epi16(pred_r2);
2255 pred_r3 = _mm_cvtepu8_epi16(pred_r3);
2256
2257 pred_r0 = _mm_unpacklo_epi64(pred_r0, pred_r1);
2258 pred_r2 = _mm_unpacklo_epi64(pred_r2, pred_r3);
2259
2260 temp4 = _mm_add_epi16(value_add, pred_r0);
2261 temp5 = _mm_add_epi16(value_add, pred_r2);
2262
2263 _mm_storeu_si128((__m128i *) (&pi2_res_ptr[0]), value_add);
2264 _mm_storeu_si128((__m128i *) (&pi2_res_ptr[i4_res_stride]), value_add);
2265 _mm_storeu_si128((__m128i *) (&pi2_res_ptr[2 * i4_res_stride]), value_add);
2266 _mm_storeu_si128((__m128i *) (&pi2_res_ptr[3 * i4_res_stride]), value_add);
2267 /*------------------------------------------------------------------*/
2268 /* Clipping the results to 8 bits */
2269 sign_reg = _mm_cmpgt_epi16(temp4, zero_8x16b);
2270 temp4 = _mm_and_si128(temp4, sign_reg);
2271 sign_reg = _mm_cmpgt_epi16(temp5, zero_8x16b);
2272 temp5 = _mm_and_si128(temp5, sign_reg);
2273
2274 temp4 = _mm_packus_epi16(temp4, temp5);
2275 temp5 = _mm_srli_si128(temp4, 4);
2276 temp6 = _mm_srli_si128(temp5, 4);
2277 temp7 = _mm_srli_si128(temp6, 4);
2278
2279 *pu4_out = _mm_cvtsi128_si32(temp4);
2280 pu1_out += i4_out_stride;
2281 pu4_out = (UWORD32 *) (pu1_out);
2282 *(pu4_out) = _mm_cvtsi128_si32(temp5);
2283 pu1_out += i4_out_stride;
2284 pu4_out = (UWORD32 *) (pu1_out);
2285 *(pu4_out) = _mm_cvtsi128_si32(temp6);
2286 pu1_out += i4_out_stride;
2287 pu4_out = (UWORD32 *) (pu1_out);
2288 *(pu4_out) = _mm_cvtsi128_si32(temp7);
2289 }
2290
2291 void isvc_iquant_itrans_recon_res_dc_with_res_acc_4x4_sse42(
2292 buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_res_pred,
2293 buffer_container_t *ps_res, buffer_container_t *ps_rec,
2294 iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants, WORD16 *pi2_tmp, WORD16 *pi2_dc_src,
2295 WORD32 i4_iq_start_idx, UWORD8 u1_res_accumulate)
2296 {
2297 WORD16 *pi2_res = (WORD16 *) ps_res->pv_data;
2298 WORD16 *pi2_res_ptr = pi2_res;
2299 WORD16 *pi2_res_pred = (WORD16 *) ps_res_pred->pv_data;
2300 WORD16 *pi2_res_pred_ptr = pi2_res_pred;
2301 UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
2302 UWORD8 *pu1_out = (UWORD8 *) ps_rec->pv_data;
2303 WORD32 i4_res_stride = ps_res->i4_data_stride;
2304 WORD32 i4_res_pred_stride = ps_res_pred->i4_data_stride;
2305 WORD32 i4_pred_stride = ps_pred->i4_data_stride;
2306 WORD32 i4_out_stride = ps_rec->i4_data_stride;
2307 const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat;
2308 const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat;
2309 UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6;
2310 UWORD32 *pu4_out = (UWORD32 *) pu1_out;
2311 WORD32 q0 = ((WORD16 *) (ps_src->pv_data))[0];
2312 WORD16 i_macro, rnd_fact = (u4_qp_div_6 < 4) ? 1 << (3 - u4_qp_div_6) : 0;
2313
2314 __m128i pred_r0, pred_r1, pred_r2, pred_r3;
2315 __m128i sign_reg;
2316 /* all bits reset to zero */
2317 __m128i zero_8x16b = _mm_setzero_si128();
2318 __m128i temp4, temp5, temp6, temp7;
2319 __m128i value_add;
2320 __m128i res_pred_r0, res_pred_r1, res_pred_r2, res_pred_r3;
2321 __m128i temp0, temp1;
2322 __m128i neg_255_8x16b = _mm_set1_epi16(-((WORD16) UINT8_MAX));
2323 __m128i pos_255_8x16b = _mm_set1_epi16(((WORD16) UINT8_MAX));
2324
2325 ASSERT(1 == u1_res_accumulate);
2326
2327 UNUSED(pi2_tmp);
2328 UNUSED(u1_res_accumulate);
2329
2330 INV_QUANT(q0, pu2_iscal_mat[0], pu2_weigh_mat[0], u4_qp_div_6, rnd_fact, 4);
2331
2332 /* Restoring dc value for intra case */
2333 if(i4_iq_start_idx != 0) q0 = pi2_dc_src[0];
2334
2335 i_macro = ((q0 + 32) >> 6);
2336
2337 value_add = _mm_set1_epi16(i_macro);
2338
2339 zero_8x16b = _mm_setzero_si128();
2340
2341 /* Load pred buffer */
2342
2343 /* p00 p01 p02 p03 0 0 0 0 -- all 8 bits */
2344 pred_r0 = _mm_loadl_epi64((__m128i *) (&pu1_pred[0]));
2345
2346 /* p10 p11 p12 p13 0 0 0 0 -- all 8 bits */
2347 pred_r1 = _mm_loadl_epi64((__m128i *) (&pu1_pred[i4_pred_stride]));
2348
2349 /* p20 p21 p22 p23 0 0 0 0 -- all 8 bits */
2350 pred_r2 = _mm_loadl_epi64((__m128i *) (&pu1_pred[2 * i4_pred_stride]));
2351
2352 /* p30 p31 p32 p33 0 0 0 0 -- all 8 bits */
2353 pred_r3 = _mm_loadl_epi64((__m128i *) (&pu1_pred[3 * i4_pred_stride]));
2354
2355 pred_r0 = _mm_cvtepu8_epi16(pred_r0);
2356 pred_r1 = _mm_cvtepu8_epi16(pred_r1);
2357 pred_r2 = _mm_cvtepu8_epi16(pred_r2);
2358 pred_r3 = _mm_cvtepu8_epi16(pred_r3);
2359
2360 pred_r0 = _mm_unpacklo_epi64(pred_r0, pred_r1);
2361 pred_r2 = _mm_unpacklo_epi64(pred_r2, pred_r3);
2362
2363 /* Accumulating res */
2364 res_pred_r0 = _mm_loadl_epi64((__m128i *) &pi2_res_pred_ptr[0]);
2365 res_pred_r1 = _mm_loadl_epi64((__m128i *) &pi2_res_pred_ptr[i4_res_pred_stride]);
2366 res_pred_r2 = _mm_loadl_epi64((__m128i *) &pi2_res_pred_ptr[2 * i4_res_pred_stride]);
2367 res_pred_r3 = _mm_loadl_epi64((__m128i *) &pi2_res_pred_ptr[3 * i4_res_pred_stride]);
2368
2369 res_pred_r0 = _mm_unpacklo_epi64(res_pred_r0, res_pred_r1);
2370 res_pred_r1 = _mm_unpacklo_epi64(res_pred_r2, res_pred_r3);
2371
2372 temp0 = _mm_add_epi16(value_add, res_pred_r0);
2373 temp1 = _mm_add_epi16(value_add, res_pred_r1);
2374
2375 /* Saturate all values < -255 to -255 and retain the rest as it is */
2376 temp0 = _mm_max_epi16(temp0, neg_255_8x16b);
2377 /* Saturate all values > 255 to 255 and retain the rest as it is */
2378 temp0 = _mm_min_epi16(temp0, pos_255_8x16b);
2379
2380 /* Saturate all values < -255 to -255 and retain the rest as it is */
2381 temp1 = _mm_max_epi16(temp1, neg_255_8x16b);
2382 /* Saturate all values > 255 to 255 and retain the rest as it is */
2383 temp1 = _mm_min_epi16(temp1, pos_255_8x16b);
2384
2385 _mm_storeu_si128((__m128i *) (&pi2_res_ptr[0]), temp0);
2386 _mm_storeu_si128((__m128i *) (&pi2_res_ptr[2 * i4_res_stride]), temp1);
2387
2388 temp4 = _mm_add_epi16(temp0, pred_r0);
2389 temp5 = _mm_add_epi16(temp1, pred_r2);
2390
2391 temp0 = _mm_srli_si128(temp0, 8);
2392 temp1 = _mm_srli_si128(temp1, 8);
2393
2394 _mm_storeu_si128((__m128i *) (&pi2_res_ptr[i4_res_stride]), temp0);
2395 _mm_storeu_si128((__m128i *) (&pi2_res_ptr[3 * i4_res_stride]), temp1);
2396
2397 /*------------------------------------------------------------------*/
2398 /* Clipping the results to 8 bits */
2399 sign_reg = _mm_cmpgt_epi16(temp4, zero_8x16b);
2400 temp4 = _mm_and_si128(temp4, sign_reg);
2401 sign_reg = _mm_cmpgt_epi16(temp5, zero_8x16b);
2402 temp5 = _mm_and_si128(temp5, sign_reg);
2403
2404 temp4 = _mm_packus_epi16(temp4, temp5);
2405 temp5 = _mm_srli_si128(temp4, 4);
2406 temp6 = _mm_srli_si128(temp5, 4);
2407 temp7 = _mm_srli_si128(temp6, 4);
2408
2409 *pu4_out = _mm_cvtsi128_si32(temp4);
2410 pu1_out += i4_out_stride;
2411 pu4_out = (UWORD32 *) (pu1_out);
2412 *(pu4_out) = _mm_cvtsi128_si32(temp5);
2413 pu1_out += i4_out_stride;
2414 pu4_out = (UWORD32 *) (pu1_out);
2415 *(pu4_out) = _mm_cvtsi128_si32(temp6);
2416 pu1_out += i4_out_stride;
2417 pu4_out = (UWORD32 *) (pu1_out);
2418 *(pu4_out) = _mm_cvtsi128_si32(temp7);
2419 }
2420
2421 void isvc_iquant_itrans_recon_chroma_4x4_dc_sse42(
2422 buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_res_pred,
2423 buffer_container_t *ps_res, buffer_container_t *ps_rec,
2424 iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants, WORD16 *pi2_tmp, WORD16 *pi2_dc_src,
2425 WORD32 i4_iq_start_idx, UWORD8 u1_res_accumulate)
2426 {
2427 WORD16 *pi2_src = (WORD16 *) ps_src->pv_data;
2428 UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
2429 UWORD8 *pu1_out = (UWORD8 *) ps_rec->pv_data;
2430 WORD32 i4_pred_stride = ps_pred->i4_data_stride;
2431 WORD32 i4_out_stride = ps_rec->i4_data_stride;
2432 const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat;
2433 const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat;
2434 UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6;
2435 /* DC value won't be dequantized for chroma
2436 inverse transform */
2437 WORD16 q0 = pi2_dc_src[0];
2438 WORD16 i_macro = ((q0 + 32) >> 6);
2439
2440 __m128i pred_r0, pred_r1, pred_r2, pred_r3;
2441 /* all bits reset to zero */
2442 __m128i zero_8x16b = _mm_setzero_si128();
2443 __m128i chroma_mask = _mm_set1_epi16(0xFF);
2444 __m128i value_add = _mm_set1_epi16(i_macro);
2445 __m128i out_r0, out_r1, out_r2, out_r3;
2446
2447 ASSERT(0 == u1_res_accumulate);
2448
2449 UNUSED(pi2_src);
2450 UNUSED(pu2_iscal_mat);
2451 UNUSED(pu2_weigh_mat);
2452 UNUSED(u4_qp_div_6);
2453 UNUSED(pi2_tmp);
2454 UNUSED(ps_res_pred);
2455 UNUSED(ps_res);
2456 UNUSED(i4_iq_start_idx);
2457 UNUSED(u1_res_accumulate);
2458
2459 /* Load pred buffer */
2460 pred_r0 = _mm_loadl_epi64((__m128i *) (&pu1_pred[0]));
2461
2462 pred_r1 = _mm_loadl_epi64((__m128i *) (&pu1_pred[i4_pred_stride]));
2463
2464 pred_r2 = _mm_loadl_epi64((__m128i *) (&pu1_pred[2 * i4_pred_stride]));
2465
2466 pred_r3 = _mm_loadl_epi64((__m128i *) (&pu1_pred[3 * i4_pred_stride]));
2467
2468 /* Mask alternate pred values from the interleaved pred buf */
2469 pred_r0 = _mm_and_si128(pred_r0, chroma_mask);
2470 pred_r1 = _mm_and_si128(pred_r1, chroma_mask);
2471 pred_r2 = _mm_and_si128(pred_r2, chroma_mask);
2472 pred_r3 = _mm_and_si128(pred_r3, chroma_mask);
2473
2474 /* Pack the first four 16 bit values of 2 regs into a single reg*/
2475 pred_r0 = _mm_unpacklo_epi64(pred_r0, pred_r1);
2476 pred_r2 = _mm_unpacklo_epi64(pred_r2, pred_r3);
2477
2478 /* Compute out pixel by adding res to pred */
2479 pred_r0 = _mm_add_epi16(value_add, pred_r0);
2480 pred_r2 = _mm_add_epi16(value_add, pred_r2);
2481 /*------------------------------------------------------------------*/
2482 /* Clipping the results to 8 bits */
2483 pred_r0 = _mm_packus_epi16(pred_r0, pred_r2);
2484 pred_r1 = _mm_srli_si128(pred_r0, 4);
2485 pred_r2 = _mm_srli_si128(pred_r1, 4);
2486 pred_r3 = _mm_srli_si128(pred_r2, 4);
2487
2488 /* p00 p01 p02 p03 -- all 16 bits */
2489 pred_r0 = _mm_unpacklo_epi8(pred_r0, zero_8x16b);
2490 /* p10 p11 p12 p13 -- all 16 bits */
2491 pred_r1 = _mm_unpacklo_epi8(pred_r1, zero_8x16b);
2492 /* p20 p21 p22 p23 -- all 16 bits */
2493 pred_r2 = _mm_unpacklo_epi8(pred_r2, zero_8x16b);
2494 /* p30 p31 p32 p33 -- all 16 bits */
2495 pred_r3 = _mm_unpacklo_epi8(pred_r3, zero_8x16b);
2496
2497 /* Load interleaved out buffer */
2498 out_r0 = _mm_loadl_epi64((__m128i *) (&pu1_out[0]));
2499 out_r1 = _mm_loadl_epi64((__m128i *) (&pu1_out[i4_out_stride]));
2500 out_r2 = _mm_loadl_epi64((__m128i *) (&pu1_out[2 * i4_out_stride]));
2501 out_r3 = _mm_loadl_epi64((__m128i *) (&pu1_out[3 * i4_out_stride]));
2502
2503 /* Mask the interleaved out buf in order to save the U/V out pixel computed in
2504 this function call without thrashing the U/V out pixel that was saved
2505 during an earlier function call */
2506 chroma_mask = _mm_set1_epi16(0xFF00);
2507
2508 out_r0 = _mm_and_si128(out_r0, chroma_mask);
2509 out_r1 = _mm_and_si128(out_r1, chroma_mask);
2510 out_r2 = _mm_and_si128(out_r2, chroma_mask);
2511 out_r3 = _mm_and_si128(out_r3, chroma_mask);
2512
2513 /* Save the out pixels in alternate locations */
2514 out_r0 = _mm_add_epi8(out_r0, pred_r0);
2515 out_r1 = _mm_add_epi8(out_r1, pred_r1);
2516 out_r2 = _mm_add_epi8(out_r2, pred_r2);
2517 out_r3 = _mm_add_epi8(out_r3, pred_r3);
2518
2519 _mm_storel_epi64((__m128i *) (&pu1_out[0]), out_r0);
2520 _mm_storel_epi64((__m128i *) (&pu1_out[i4_out_stride]), out_r1);
2521 _mm_storel_epi64((__m128i *) (&pu1_out[2 * i4_out_stride]), out_r2);
2522 _mm_storel_epi64((__m128i *) (&pu1_out[3 * i4_out_stride]), out_r3);
2523 }
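
/* A minimal scalar sketch (illustrative only, excluded from the build) of how
 * the chroma kernels above operate on interleaved (semi-planar U/V) buffers:
 * only every other byte belongs to the plane being processed, so the partner
 * plane's samples are masked out on load and left untouched on store. The
 * helper name and the DC-only simplification are assumptions made for brevity. */
#if 0
static void isvc_chroma_dc_recon_interleaved_sketch(const UWORD8 *pu1_pred, WORD32 i4_pred_stride,
                                                    UWORD8 *pu1_out, WORD32 i4_out_stride,
                                                    WORD16 i2_dc_res)
{
    WORD32 i, j;

    for(i = 0; i < 4; i++)
    {
        for(j = 0; j < 4; j++)
        {
            /* A stride of 2 within the row skips the interleaved partner plane */
            WORD32 i4_rec = pu1_pred[i * i4_pred_stride + 2 * j] + i2_dc_res;

            pu1_out[i * i4_out_stride + 2 * j] =
                (UWORD8) ((i4_rec < 0) ? 0 : ((i4_rec > 255) ? 255 : i4_rec));
        }
    }
}
#endif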
2524
2525 void isvc_iquant_itrans_recon_res_chroma_4x4_dc_sse42(
2526 buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_res_pred,
2527 buffer_container_t *ps_res, buffer_container_t *ps_rec,
2528 iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants, WORD16 *pi2_tmp, WORD16 *pi2_dc_src,
2529 WORD32 i4_iq_start_idx, UWORD8 u1_res_accumulate)
2530 {
2531 WORD16 *pi2_src = (WORD16 *) ps_src->pv_data;
2532 WORD16 *pi2_res = (WORD16 *) ps_res->pv_data;
2533 WORD16 *pi2_res_ptr = pi2_res;
2534 UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
2535 UWORD8 *pu1_out = (UWORD8 *) ps_rec->pv_data;
2536 WORD32 i4_res_stride = ps_res->i4_data_stride;
2537 WORD32 i4_pred_stride = ps_pred->i4_data_stride;
2538 WORD32 i4_out_stride = ps_rec->i4_data_stride;
2539 const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat;
2540 const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat;
2541 UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6;
2542 /* DC value won't be dequantized for chroma
2543 inverse transform */
2544 WORD16 q0 = pi2_dc_src[0];
2545 WORD16 i_macro = ((q0 + 32) >> 6);
2546
2547 __m128i pred_r0, pred_r1, pred_r2, pred_r3, sign_reg;
2548 /* all bits reset to zero */
2549 __m128i zero_8x16b = _mm_setzero_si128();
2550 __m128i chroma_mask = _mm_set1_epi16(0xFF);
2551 __m128i value_add = _mm_set1_epi16(isvc_get_residue(i_macro, 0, 0));
2552 __m128i out_r0, out_r1, out_r2, out_r3;
2553
2554 ASSERT(0 == u1_res_accumulate);
2555
2556 UNUSED(pi2_src);
2557 UNUSED(pu2_iscal_mat);
2558 UNUSED(pu2_weigh_mat);
2559 UNUSED(u4_qp_div_6);
2560 UNUSED(pi2_tmp);
2561 UNUSED(ps_res_pred);
2562 UNUSED(i4_iq_start_idx);
2563 UNUSED(u1_res_accumulate);
2564
2565 /* Load pred buffer */
2566 pred_r0 = _mm_loadl_epi64((__m128i *) (&pu1_pred[0]));
2567
2568 pred_r1 = _mm_loadl_epi64((__m128i *) (&pu1_pred[i4_pred_stride]));
2569
2570 pred_r2 = _mm_loadl_epi64((__m128i *) (&pu1_pred[2 * i4_pred_stride]));
2571
2572 pred_r3 = _mm_loadl_epi64((__m128i *) (&pu1_pred[3 * i4_pred_stride]));
2573
2574 /* Mask alternate pred values from the interleaved pred buf */
2575 pred_r0 = _mm_and_si128(pred_r0, chroma_mask);
2576 pred_r1 = _mm_and_si128(pred_r1, chroma_mask);
2577 pred_r2 = _mm_and_si128(pred_r2, chroma_mask);
2578 pred_r3 = _mm_and_si128(pred_r3, chroma_mask);
2579
2580 /* Pack the first four 16 bit values of 2 regs into a single reg*/
2581 pred_r0 = _mm_unpacklo_epi64(pred_r0, pred_r1);
2582 pred_r2 = _mm_unpacklo_epi64(pred_r2, pred_r3);
2583
2584 /* Compute out pixel by adding res to pred */
2585 pred_r0 = _mm_add_epi16(value_add, pred_r0);
2586 pred_r2 = _mm_add_epi16(value_add, pred_r2);
2587
2588 /* Convert res from 16 bits to 32 bits */
2589 value_add = _mm_cvtepu16_epi32(value_add);
2590
2591 out_r0 = _mm_loadu_si128((__m128i *) (&pi2_res_ptr[0 * i4_res_stride]));
2592 out_r1 = _mm_loadu_si128((__m128i *) (&pi2_res_ptr[1 * i4_res_stride]));
2593 out_r2 = _mm_loadu_si128((__m128i *) (&pi2_res_ptr[2 * i4_res_stride]));
2594 out_r3 = _mm_loadu_si128((__m128i *) (&pi2_res_ptr[3 * i4_res_stride]));
2595
2596 /* Mask the loaded res in order to save the U/V res data computed in
2597 this function call without thrashing the U/V res data that was saved
2598 during an earlier function call */
2599 chroma_mask = _mm_set1_epi32(0xffff0000);
2600 out_r0 = _mm_and_si128(out_r0, chroma_mask);
2601 out_r1 = _mm_and_si128(out_r1, chroma_mask);
2602 out_r2 = _mm_and_si128(out_r2, chroma_mask);
2603 out_r3 = _mm_and_si128(out_r3, chroma_mask);
2604
2605 /* Save the res in alternate locations */
2606 out_r0 = _mm_add_epi16(out_r0, value_add);
2607 out_r1 = _mm_add_epi16(out_r1, value_add);
2608 out_r2 = _mm_add_epi16(out_r2, value_add);
2609 out_r3 = _mm_add_epi16(out_r3, value_add);
2610
2611 _mm_storeu_si128((__m128i *) (&pi2_res_ptr[0 * i4_res_stride]), out_r0);
2612 _mm_storeu_si128((__m128i *) (&pi2_res_ptr[1 * i4_res_stride]), out_r1);
2613 _mm_storeu_si128((__m128i *) (&pi2_res_ptr[2 * i4_res_stride]), out_r2);
2614 _mm_storeu_si128((__m128i *) (&pi2_res_ptr[3 * i4_res_stride]), out_r3);
2615 /*------------------------------------------------------------------*/
2616 /* Clipping the results to 8 bits */
2617 sign_reg = _mm_cmpgt_epi16(pred_r0, zero_8x16b);
2618 pred_r0 = _mm_and_si128(pred_r0, sign_reg);
2619 sign_reg = _mm_cmpgt_epi16(pred_r2, zero_8x16b);
2620 pred_r2 = _mm_and_si128(pred_r2, sign_reg);
2621
2622 pred_r0 = _mm_packus_epi16(pred_r0, pred_r2);
2623 pred_r1 = _mm_srli_si128(pred_r0, 4);
2624 pred_r2 = _mm_srli_si128(pred_r1, 4);
2625 pred_r3 = _mm_srli_si128(pred_r2, 4);
2626
2627 /* p00 p01 p02 p03 -- all 16 bits */
2628 pred_r0 = _mm_unpacklo_epi8(pred_r0, zero_8x16b);
2629 /* p10 p11 p12 p13 -- all 16 bits */
2630 pred_r1 = _mm_unpacklo_epi8(pred_r1, zero_8x16b);
2631 /* p20 p21 p22 p23 -- all 16 bits */
2632 pred_r2 = _mm_unpacklo_epi8(pred_r2, zero_8x16b);
2633 /* p30 p31 p32 p33 -- all 16 bits */
2634 pred_r3 = _mm_unpacklo_epi8(pred_r3, zero_8x16b);
2635
2636 /* Load interleaved out buffer */
2637 out_r0 = _mm_loadl_epi64((__m128i *) (&pu1_out[0]));
2638 out_r1 = _mm_loadl_epi64((__m128i *) (&pu1_out[i4_out_stride]));
2639 out_r2 = _mm_loadl_epi64((__m128i *) (&pu1_out[2 * i4_out_stride]));
2640 out_r3 = _mm_loadl_epi64((__m128i *) (&pu1_out[3 * i4_out_stride]));
2641
2642 /* Mask the interleaved out buf in order to save the U/V out pixel computed in
2643 this function call without thrashing the U/V out pixel that was saved
2644 during an earlier function call */
2645 chroma_mask = _mm_set1_epi16(0xFF00);
2646
2647 out_r0 = _mm_and_si128(out_r0, chroma_mask);
2648 out_r1 = _mm_and_si128(out_r1, chroma_mask);
2649 out_r2 = _mm_and_si128(out_r2, chroma_mask);
2650 out_r3 = _mm_and_si128(out_r3, chroma_mask);
2651
2652 /* Save the out pixels in alternate locations */
2653 out_r0 = _mm_add_epi8(out_r0, pred_r0);
2654 out_r1 = _mm_add_epi8(out_r1, pred_r1);
2655 out_r2 = _mm_add_epi8(out_r2, pred_r2);
2656 out_r3 = _mm_add_epi8(out_r3, pred_r3);
2657
2658 _mm_storel_epi64((__m128i *) (&pu1_out[0]), out_r0);
2659 _mm_storel_epi64((__m128i *) (&pu1_out[i4_out_stride]), out_r1);
2660 _mm_storel_epi64((__m128i *) (&pu1_out[2 * i4_out_stride]), out_r2);
2661 _mm_storel_epi64((__m128i *) (&pu1_out[3 * i4_out_stride]), out_r3);
2662 }
2663
2664 void isvc_iquant_itrans_recon_res_chroma_4x4_dc_with_res_acc_sse42(
2665 buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_res_pred,
2666 buffer_container_t *ps_res, buffer_container_t *ps_rec,
2667 iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants, WORD16 *pi2_tmp, WORD16 *pi2_dc_src,
2668 WORD32 i4_iq_start_idx, UWORD8 u1_res_accumulate)
2669 {
2670 WORD16 *pi2_src = (WORD16 *) ps_src->pv_data;
2671 WORD16 *pi2_res = (WORD16 *) ps_res->pv_data;
2672 WORD16 *pi2_res_pred = (WORD16 *) ps_res_pred->pv_data;
2673 UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
2674 UWORD8 *pu1_out = (UWORD8 *) ps_rec->pv_data;
2675 WORD32 i4_res_stride = ps_res->i4_data_stride;
2676 WORD32 i4_res_pred_stride = ps_res_pred->i4_data_stride;
2677 WORD32 i4_pred_stride = ps_pred->i4_data_stride;
2678 WORD32 i4_out_stride = ps_rec->i4_data_stride;
2679 const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat;
2680 const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat;
2681 UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6;
2682 /* DC value won't be dequantized for chroma
2683 inverse transform */
2684 WORD16 q0 = pi2_dc_src[0];
2685 WORD16 i_macro = ((q0 + 32) >> 6);
2686
2687 __m128i pred_r0, pred_r1, pred_r2, pred_r3;
2688 /* all bits reset to zero */
2689 __m128i zero_8x16b = _mm_setzero_si128();
2690 __m128i chroma_mask = _mm_set1_epi16(0xFF);
2691 __m128i reg_chroma = _mm_set_epi16(0, 0xFFFF, 0, 0xFFFF, 0, 0xFFFF, 0, 0xFFFF);
2692 __m128i value_add = _mm_set1_epi16(i_macro);
2693 __m128i out_r0, out_r1, out_r2, out_r3;
2694 __m128i res_r0, res_r1, res_r2, res_r3;
2695 __m128i res_pred_r0, res_pred_r1, res_pred_r2, res_pred_r3;
2696 __m128i temp0, temp1;
2697 __m128i neg_255_8x16b = _mm_set1_epi16(-((WORD16) UINT8_MAX));
2698 __m128i pos_255_8x16b = _mm_set1_epi16(((WORD16) UINT8_MAX));
2699
2700 ASSERT(1 == u1_res_accumulate);
2701
2702 UNUSED(pi2_src);
2703 UNUSED(pu2_iscal_mat);
2704 UNUSED(pu2_weigh_mat);
2705 UNUSED(u4_qp_div_6);
2706 UNUSED(pi2_tmp);
2707 UNUSED(i4_iq_start_idx);
2708 UNUSED(u1_res_accumulate);
2709
2710 /* Load pred buffer */
2711 pred_r0 = _mm_loadl_epi64((__m128i *) (&pu1_pred[0]));
2712
2713 pred_r1 = _mm_loadl_epi64((__m128i *) (&pu1_pred[i4_pred_stride]));
2714
2715 pred_r2 = _mm_loadl_epi64((__m128i *) (&pu1_pred[2 * i4_pred_stride]));
2716
2717 pred_r3 = _mm_loadl_epi64((__m128i *) (&pu1_pred[3 * i4_pred_stride]));
2718 /* Mask alternate pred values from the interleaved pred buf */
2719 pred_r0 = _mm_and_si128(pred_r0, chroma_mask);
2720 pred_r1 = _mm_and_si128(pred_r1, chroma_mask);
2721 pred_r2 = _mm_and_si128(pred_r2, chroma_mask);
2722 pred_r3 = _mm_and_si128(pred_r3, chroma_mask);
2723
2724 /* Pack the first four 16 bit values of 2 regs into a single reg*/
2725 pred_r0 = _mm_unpacklo_epi64(pred_r0, pred_r1);
2726 pred_r2 = _mm_unpacklo_epi64(pred_r2, pred_r3);
2727
2728 /* Accumulating res */
2729
2730 /* load res pred buffer */
2731 res_pred_r0 = _mm_loadu_si128((__m128i *) &pi2_res_pred[0 * i4_res_pred_stride]);
2732 res_pred_r1 = _mm_loadu_si128((__m128i *) &pi2_res_pred[1 * i4_res_pred_stride]);
2733 res_pred_r2 = _mm_loadu_si128((__m128i *) &pi2_res_pred[2 * i4_res_pred_stride]);
2734 res_pred_r3 = _mm_loadu_si128((__m128i *) &pi2_res_pred[3 * i4_res_pred_stride]);
2735
2736 /* Mask res pred and retain alternate values */
2737 res_pred_r0 = _mm_and_si128(res_pred_r0, reg_chroma);
2738 res_pred_r1 = _mm_and_si128(res_pred_r1, reg_chroma);
2739 res_pred_r2 = _mm_and_si128(res_pred_r2, reg_chroma);
2740 res_pred_r3 = _mm_and_si128(res_pred_r3, reg_chroma);
2741
2742 /* Convert to 32 bits */
2743 res_r0 = _mm_cvtepu16_epi32(value_add);
2744 res_r2 = _mm_cvtepu16_epi32(value_add);
2745 res_r1 = _mm_cvtepu16_epi32(value_add);
2746 res_r3 = _mm_cvtepu16_epi32(value_add);
2747
2748 /* Add res pred to the res obtained from inv transform */
2749 res_r0 = _mm_add_epi16(res_pred_r0, res_r0);
2750 res_r1 = _mm_add_epi16(res_pred_r1, res_r1);
2751 res_r2 = _mm_add_epi16(res_pred_r2, res_r2);
2752 res_r3 = _mm_add_epi16(res_pred_r3, res_r3);
2753
    /* Collapse the 32 bit res of the form [a0 0 a1 0 a2 0 a3 0] into
       16 bit values [a0 a1 a2 a3] using hadd (each pair sums to a + 0).
       To be optimized */
2757 temp0 = _mm_hadd_epi16(res_r0, res_r1);
2758 temp1 = _mm_hadd_epi16(res_r2, res_r3);
2759
2760 /* Saturate all values < -255 to -255 and retain the rest as it is */
2761 temp0 = _mm_max_epi16(temp0, neg_255_8x16b);
2762 /* Saturate all values > 255 to 255 and retain the rest as it is */
2763 temp0 = _mm_min_epi16(temp0, pos_255_8x16b);
2764
2765 /* Saturate all values < -255 to -255 and retain the rest as it is */
2766 temp1 = _mm_max_epi16(temp1, neg_255_8x16b);
2767 /* Saturate all values > 255 to 255 and retain the rest as it is */
2768 temp1 = _mm_min_epi16(temp1, pos_255_8x16b);
2769
2770 /* Compute out pixel by adding res to pred */
2771 pred_r0 = _mm_add_epi16(temp0, pred_r0);
2772 pred_r2 = _mm_add_epi16(temp1, pred_r2);
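    /* Per sample: out pixel = pred + saturated res; the final clip to
       [0, 255] happens later via packus */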
2773
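    /* Zero extend the saturated 16 bit residuals into 32 bit lanes
       ([r0 0 r1 0 ...] as 16 bit values) so they can be merged into the
       interleaved res buffer below */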
2774 res_r0 = _mm_cvtepu16_epi32(temp0);
2775 res_r2 = _mm_cvtepu16_epi32(temp1);
2776 res_r1 = _mm_srli_si128(temp0, 8);
2777 res_r3 = _mm_srli_si128(temp1, 8);
2778 res_r1 = _mm_cvtepu16_epi32(res_r1);
2779 res_r3 = _mm_cvtepu16_epi32(res_r3);
2780
2781 /* Load res buffer */
2782 out_r0 = _mm_loadu_si128((__m128i *) (&pi2_res[0 * i4_res_stride]));
2783 out_r1 = _mm_loadu_si128((__m128i *) (&pi2_res[1 * i4_res_stride]));
2784 out_r2 = _mm_loadu_si128((__m128i *) (&pi2_res[2 * i4_res_stride]));
2785 out_r3 = _mm_loadu_si128((__m128i *) (&pi2_res[3 * i4_res_stride]));
2786
    /* Mask the loaded res so that the res computed for this plane in this
       call is merged in without overwriting the other plane's res stored
       by an earlier call */
2790 chroma_mask = _mm_set1_epi32(0xffff0000);
2791
2792 out_r0 = _mm_and_si128(out_r0, chroma_mask);
2793 out_r1 = _mm_and_si128(out_r1, chroma_mask);
2794 out_r2 = _mm_and_si128(out_r2, chroma_mask);
2795 out_r3 = _mm_and_si128(out_r3, chroma_mask);
2796
2797 /* Save the res in alternate locations */
2798 out_r0 = _mm_add_epi16(out_r0, res_r0);
2799 out_r1 = _mm_add_epi16(out_r1, res_r1);
2800 out_r2 = _mm_add_epi16(out_r2, res_r2);
2801 out_r3 = _mm_add_epi16(out_r3, res_r3);
2802
2803 _mm_storeu_si128((__m128i *) (&pi2_res[0 * i4_res_stride]), out_r0);
2804 _mm_storeu_si128((__m128i *) (&pi2_res[1 * i4_res_stride]), out_r1);
2805 _mm_storeu_si128((__m128i *) (&pi2_res[2 * i4_res_stride]), out_r2);
2806 _mm_storeu_si128((__m128i *) (&pi2_res[3 * i4_res_stride]), out_r3);
2807 /*------------------------------------------------------------------*/
2808 /* Clipping the results to 8 bits */
2809 pred_r0 = _mm_packus_epi16(pred_r0, pred_r2);
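    /* packus has packed rows 0-3 as four bytes each; shift to bring each
       row down to the lowest four bytes */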
2810 pred_r1 = _mm_srli_si128(pred_r0, 4);
2811 pred_r2 = _mm_srli_si128(pred_r1, 4);
2812 pred_r3 = _mm_srli_si128(pred_r2, 4);
2813
2814 /* p00 p01 p02 p03 -- all 16 bits */
2815 pred_r0 = _mm_unpacklo_epi8(pred_r0, zero_8x16b);
2816 /* p10 p11 p12 p13 -- all 16 bits */
2817 pred_r1 = _mm_unpacklo_epi8(pred_r1, zero_8x16b);
2818 /* p20 p21 p22 p23 -- all 16 bits */
2819 pred_r2 = _mm_unpacklo_epi8(pred_r2, zero_8x16b);
2820 /* p30 p31 p32 p33 -- all 16 bits */
2821 pred_r3 = _mm_unpacklo_epi8(pred_r3, zero_8x16b);
2822
2823 /* Load interleaved out buffer */
2824 out_r0 = _mm_loadl_epi64((__m128i *) (&pu1_out[0]));
2825 out_r1 = _mm_loadl_epi64((__m128i *) (&pu1_out[i4_out_stride]));
2826 out_r2 = _mm_loadl_epi64((__m128i *) (&pu1_out[2 * i4_out_stride]));
2827 out_r3 = _mm_loadl_epi64((__m128i *) (&pu1_out[3 * i4_out_stride]));
2828
    /* Mask the interleaved out buf so that the out pixels computed for this
       plane in this call are merged in without overwriting the other
       plane's pixels stored by an earlier call */
2832 chroma_mask = _mm_set1_epi16(0xFF00);
2833
2834 out_r0 = _mm_and_si128(out_r0, chroma_mask);
2835 out_r1 = _mm_and_si128(out_r1, chroma_mask);
2836 out_r2 = _mm_and_si128(out_r2, chroma_mask);
2837 out_r3 = _mm_and_si128(out_r3, chroma_mask);
2838
2839 /* Save the out pixels in alternate locations */
2840 out_r0 = _mm_add_epi8(out_r0, pred_r0);
2841 out_r1 = _mm_add_epi8(out_r1, pred_r1);
2842 out_r2 = _mm_add_epi8(out_r2, pred_r2);
2843 out_r3 = _mm_add_epi8(out_r3, pred_r3);
2844
2845 _mm_storel_epi64((__m128i *) (&pu1_out[0]), out_r0);
2846 _mm_storel_epi64((__m128i *) (&pu1_out[i4_out_stride]), out_r1);
2847 _mm_storel_epi64((__m128i *) (&pu1_out[2 * i4_out_stride]), out_r2);
2848 _mm_storel_epi64((__m128i *) (&pu1_out[3 * i4_out_stride]), out_r3);
2849 }
2850