1 /******************************************************************************
2  *
3  * Copyright (C) 2022 The Android Open Source Project
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at:
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  *
17  *****************************************************************************
18  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19  */
20 /**
21  *******************************************************************************
22  * @file
23  *  isvc_iquant_itrans_recon_sse42.c
24  *
25  * @brief
26  *  Contains function definitions for inverse quantization, inverse
27  * transform and reconstruction
28  *
29  * @author
30  *  Mohit [100664]
31  *
32  * @par List of Functions:
33  *  - isvc_iquant_itrans_recon_4x4_sse42()
34  *  - isvc_iquant_itrans_recon_chroma_4x4_sse42()
35  *
36  * @remarks
37  *  None
38  *
39  *******************************************************************************
40  */
41 #include <immintrin.h>
42 
43 #include "ih264_typedefs.h"
44 #include "ih264_debug.h"
45 #include "ih264_defs.h"
46 #include "ih264_trans_macros.h"
47 #include "ih264_macros.h"
48 #include "ih264_platform_macros.h"
49 #include "ih264_trans_data.h"
50 #include "ih264_size_defs.h"
51 #include "isvc_structs.h"
52 #include "isvc_trans_quant_itrans_iquant.h"
53 
54 /*
55  ********************************************************************************
56  *
57  * @brief This function reconstructs a 4x4 sub block from quantized residue and
58  * prediction buffer
59  *
60  * @par Description:
61  *  The quantized residue is first inverse quantized, then inverse transformed.
62  *  This inverse transformed content is added to the prediction buffer to
63  *  reconstruct the output block
64  *
65  * @param[in] pi2_src
66  *  quantized 4x4 block
67  *
68  * @param[in] pu1_pred
69  *  prediction 4x4 block
70  *
71  * @param[out] pu1_out
72  *  reconstructed 4x4 block
73  *
74  * @param[in] src_strd
75  *  quantization buffer stride
76  *
77  * @param[in] i4_pred_stride
78  *  Prediction buffer stride
79  *
80  * @param[in] i4_out_stride
81  *  recon buffer stride
82  *
83  * @param[in] pu2_scaling_list
84  *  pointer to scaling list
85  *
86  * @param[in] pu2_norm_adjust
87  *  pointer to inverse scale matrix
88  *
89  * @param[in] u4_qp_div_6
90  *  Floor (qp/6)
91  *
92  * @param[in] pi4_tmp
93  * temporary buffer of size 1*16
94  *
95  * @returns none
96  *
97  * @remarks none
98  *
99  *******************************************************************************
100  */
101 
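/*
 * Editorial sketch (not part of the original sources): the SSE4.2 routine below is a
 * vectorised form of the scalar flow shown here -- inverse quantisation of each
 * coefficient, the 4x4 inverse core transform, rounding with (x + 32) >> 6 and a
 * clipped add of the prediction. The DC-bypass (i4_iq_start_idx) and residue paths
 * are omitted, and the sketch is kept under #if 0 so it does not affect the build.
 */
#if 0
static void iquant_itrans_recon_4x4_scalar_sketch(const WORD16 *pi2_src, const UWORD8 *pu1_pred,
                                                  UWORD8 *pu1_out, WORD32 i4_pred_stride,
                                                  WORD32 i4_out_stride,
                                                  const UWORD16 *pu2_iscal_mat,
                                                  const UWORD16 *pu2_weigh_mat, UWORD32 u4_qp_div_6)
{
    WORD32 i, j;
    WORD32 ai4_coeff[16], ai4_tmp[16];

    /* Inverse quantisation: coeff = src * iscal * weigh, scaled by 2^(qp/6 - 4) */
    for(i = 0; i < 16; i++)
    {
        WORD32 i4_val = pi2_src[i] * pu2_iscal_mat[i] * pu2_weigh_mat[i];

        if(u4_qp_div_6 >= 4)
        {
            ai4_coeff[i] = i4_val << (u4_qp_div_6 - 4);
        }
        else
        {
            ai4_coeff[i] = (i4_val + (1 << (3 - u4_qp_div_6))) >> (4 - u4_qp_div_6);
        }
    }

    /* Horizontal pass of the inverse transform, one row at a time */
    for(i = 0; i < 4; i++)
    {
        WORD32 *pi4_row = &ai4_coeff[4 * i];
        WORD32 z0 = pi4_row[0] + pi4_row[2];
        WORD32 z1 = pi4_row[0] - pi4_row[2];
        WORD32 z2 = (pi4_row[1] >> 1) - pi4_row[3];
        WORD32 z3 = pi4_row[1] + (pi4_row[3] >> 1);

        ai4_tmp[4 * i + 0] = z0 + z3;
        ai4_tmp[4 * i + 1] = z1 + z2;
        ai4_tmp[4 * i + 2] = z1 - z2;
        ai4_tmp[4 * i + 3] = z0 - z3;
    }

    /* Vertical pass, rounding and reconstruction with the prediction */
    for(j = 0; j < 4; j++)
    {
        WORD32 z0 = ai4_tmp[0 + j] + ai4_tmp[8 + j];
        WORD32 z1 = ai4_tmp[0 + j] - ai4_tmp[8 + j];
        WORD32 z2 = (ai4_tmp[4 + j] >> 1) - ai4_tmp[12 + j];
        WORD32 z3 = ai4_tmp[4 + j] + (ai4_tmp[12 + j] >> 1);
        WORD32 ai4_x[4];

        ai4_x[0] = (z0 + z3 + 32) >> 6;
        ai4_x[1] = (z1 + z2 + 32) >> 6;
        ai4_x[2] = (z1 - z2 + 32) >> 6;
        ai4_x[3] = (z0 - z3 + 32) >> 6;

        for(i = 0; i < 4; i++)
        {
            WORD32 i4_rec = ai4_x[i] + pu1_pred[i * i4_pred_stride + j];

            i4_rec = (i4_rec < 0) ? 0 : ((i4_rec > 255) ? 255 : i4_rec);
            pu1_out[i * i4_out_stride + j] = (UWORD8) i4_rec;
        }
    }
}
#endif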
102 void isvc_iquant_itrans_recon_4x4_sse42(buffer_container_t *ps_src, buffer_container_t *ps_pred,
103                                         buffer_container_t *ps_res_pred, buffer_container_t *ps_res,
104                                         buffer_container_t *ps_rec,
105                                         iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants,
106                                         WORD16 *pi2_tmp, WORD16 *pi2_dc_src, WORD32 i4_iq_start_idx,
107                                         UWORD8 u1_res_accumulate)
108 {
109     WORD16 *pi2_src = (WORD16 *) ps_src->pv_data;
110     WORD16 *pi2_tmp_ptr = pi2_tmp;
111     UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
112     UWORD8 *pu1_out = (UWORD8 *) ps_rec->pv_data;
113     WORD32 i4_src_stride = ps_src->i4_data_stride;
114     WORD32 i4_pred_stride = ps_pred->i4_data_stride;
115     WORD32 i4_out_stride = ps_rec->i4_data_stride;
116     const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat;
117     const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat;
118     UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6;
119     UWORD32 *pu4_out = (UWORD32 *) pu1_out;
120     __m128i src_r0_r1, src_r2_r3;
121     __m128i src_r0, src_r1, src_r2, src_r3;
122     __m128i scalemat_r0_r1, scalemat_r2_r3;
123     __m128i pred_r0, pred_r1, pred_r2, pred_r3;
124     __m128i sign_reg, dequant_r0_r1, dequant_r2_r3;
125     /* all bits reset to zero */
126     __m128i zero_8x16b = _mm_setzero_si128();
127     __m128i neg_255_8x16b = _mm_set1_epi16(-((WORD16) UINT8_MAX));
128     __m128i pos_255_8x16b = _mm_set1_epi16(((WORD16) UINT8_MAX));
129     __m128i temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
130     __m128i resq_r0, resq_r1, resq_r2, resq_r3;
131     __m128i add_rshift = _mm_set1_epi32((u4_qp_div_6 < 4) ? (1 << (3 - u4_qp_div_6)) : 0);
132     __m128i value_32 = _mm_set1_epi32(32);
133 
134     ASSERT(4 == i4_src_stride);
135     ASSERT(0 == u1_res_accumulate);
136 
137     UNUSED(i4_src_stride);
138     UNUSED(ps_res);
139     UNUSED(ps_res_pred);
140     UNUSED(u1_res_accumulate);
141 
142     /*************************************************************/
143     /* Dequantization of coefficients, implemented below with    */
144     /* SSE4.2 integer operations                                  */
145     /*************************************************************/
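    /*
     * Per coefficient (see the scalar sketch above this function): the value is
     * multiplied by pu2_iscal_mat[i] * pu2_weigh_mat[i] and scaled by
     * 2^(u4_qp_div_6 - 4), with rounding when u4_qp_div_6 < 4.
     */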
146 
147     /* a00 a01 a02 a03 a10 a11 a12 a13 -- the source
148      matrix 0th,1st row */
149     src_r0_r1 = _mm_loadu_si128((__m128i *) (pi2_src));
150 
151     /* a20 a21 a22 a23 a30 a31 a32 a33 -- the
152       source matrix 2nd,3rd row */
153     src_r2_r3 = _mm_loadu_si128((__m128i *) (pi2_src + 8));
154 
155     /* b00 b01 b02 b03 b10 b11 b12 b13 -- the
156      scaling matrix 0th,1st row */
157     scalemat_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_iscal_mat));
158 
159     /* b20 b21 b22 b23 b30 b31 b32 b33 -- the
160      scaling matrix 2nd,3rd row */
161     scalemat_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_iscal_mat + 8));
162 
163     /* q00 q01 q02 q03 q10 q11
164      q12 q13 -- all 16 bits */
165     dequant_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_weigh_mat));
166 
167     /* q20 q21 q22 q23 q30 q31
168      q32 q33 -- all 16 bits */
169     dequant_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_weigh_mat + 8));
170 
171     /* b00*q00 b01*q01 b02*q02 b03*q03 b10*q10 b11*q11
172      b12*q12 b13*q13 -- 16 bit result */
173     temp0 = _mm_mullo_epi16(scalemat_r0_r1, dequant_r0_r1);
174 
175     /* b20*q20 b21*q21 b22*q22 b23*q23 b30*q30 b31*q31
176      b32*q32 b33*q33 -- 16 bit result */
177     temp1 = _mm_mullo_epi16(scalemat_r2_r3, dequant_r2_r3);
178 
179     /* b00*q00 0 b01*q01 0 b02*q02 0 b03*q03 0 -- 16 bit long */
180     temp4 = _mm_unpacklo_epi16(temp0, zero_8x16b);
181 
182     /* b10*q10 0 b11*q11 0 b12*q12 0 b13*q13 0 -- 16 bit long */
183     temp5 = _mm_unpackhi_epi16(temp0, zero_8x16b);
184 
185     /* b20*q20 0 b21*q21 0 b22*q22 0 b23*q23 0 -- 16 bit long */
186     temp6 = _mm_unpacklo_epi16(temp1, zero_8x16b);
187 
188     /* b30*q30 0 b31*q31 0 b32*q32 0 b33*q33 0 -- 16 bit long */
189     temp7 = _mm_unpackhi_epi16(temp1, zero_8x16b);
190 
191     /* a00 0 a01 0 a02 0 a03 0 -- 16 bit long */
192     src_r0 = _mm_unpacklo_epi16(src_r0_r1, zero_8x16b);
193     /* a10 0 a11 0 a12 0 a13 0 -- 16 bit long */
194     src_r1 = _mm_unpackhi_epi16(src_r0_r1, zero_8x16b);
195     /* a20 0 a21 0 a22 0 a23 0 -- 16 bit long */
196     src_r2 = _mm_unpacklo_epi16(src_r2_r3, zero_8x16b);
197     /* a30 0 a31 0 a32 0 a33 0 -- 16 bit long */
198     src_r3 = _mm_unpackhi_epi16(src_r2_r3, zero_8x16b);
199 
200     temp4 = _mm_madd_epi16(src_r0, temp4);
201     temp5 = _mm_madd_epi16(src_r1, temp5);
202     temp6 = _mm_madd_epi16(src_r2, temp6);
203     temp7 = _mm_madd_epi16(src_r3, temp7);
204 
205     if(u4_qp_div_6 >= 4)
206     {
207         resq_r0 = _mm_slli_epi32(temp4, u4_qp_div_6 - 4);
208         resq_r1 = _mm_slli_epi32(temp5, u4_qp_div_6 - 4);
209         resq_r2 = _mm_slli_epi32(temp6, u4_qp_div_6 - 4);
210         resq_r3 = _mm_slli_epi32(temp7, u4_qp_div_6 - 4);
211     }
212     else
213     {
214         temp4 = _mm_add_epi32(temp4, add_rshift);
215         temp5 = _mm_add_epi32(temp5, add_rshift);
216         temp6 = _mm_add_epi32(temp6, add_rshift);
217         temp7 = _mm_add_epi32(temp7, add_rshift);
218         resq_r0 = _mm_srai_epi32(temp4, 4 - u4_qp_div_6);
219         resq_r1 = _mm_srai_epi32(temp5, 4 - u4_qp_div_6);
220         resq_r2 = _mm_srai_epi32(temp6, 4 - u4_qp_div_6);
221         resq_r3 = _mm_srai_epi32(temp7, 4 - u4_qp_div_6);
222     }
223 
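    /*
     * When i4_iq_start_idx == 1 the DC coefficient has already been handled by the
     * separate DC transform path, so lane 0 is overwritten with the dequantised DC
     * value supplied through pi2_dc_src.
     */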
224     if(i4_iq_start_idx == 1) resq_r0 = _mm_insert_epi32(resq_r0, (WORD32) pi2_dc_src[0], 0);
225     /* Perform Inverse transform */
226     /*-------------------------------------------------------------*/
227     /* IDCT [ Horizontal transformation ]                          */
228     /*-------------------------------------------------------------*/
229     // Matrix transpose
230     /*
231      *  a0 a1 a2 a3
232      *  b0 b1 b2 b3
233      *  c0 c1 c2 c3
234      *  d0 d1 d2 d3
235      */
236 
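    /*
     * Transposing first places element k of every row into one register, so each
     * stage of the horizontal butterfly below operates on all four rows at once.
     */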
237     /* a0 b0 a1 b1 */
238     temp1 = _mm_unpacklo_epi32(resq_r0, resq_r1);
239     /* c0 d0 c1 d1 */
240     temp3 = _mm_unpacklo_epi32(resq_r2, resq_r3);
241     /* a2 b2 a3 b3 */
242     temp2 = _mm_unpackhi_epi32(resq_r0, resq_r1);
243     /* c2 d2 c3 d3 */
244     temp4 = _mm_unpackhi_epi32(resq_r2, resq_r3);
245     /* a0 b0 c0 d0 */
246     resq_r0 = _mm_unpacklo_epi64(temp1, temp3);
247     /* a1 b1 c1 d1 */
248     resq_r1 = _mm_unpackhi_epi64(temp1, temp3);
249     /* a2 b2 c2 d2 */
250     resq_r2 = _mm_unpacklo_epi64(temp2, temp4);
251     /* a3 b3 c3 d3 */
252     resq_r3 = _mm_unpackhi_epi64(temp2, temp4);
253     /* Transform starts -- horizontal transform */
254     /*------------------------------------------------------------------*/
255     /* z0 = w0 + w2                                             */
256     temp0 = _mm_add_epi32(resq_r0, resq_r2);
257     /* z1 = w0 - w2                                             */
258     temp1 = _mm_sub_epi32(resq_r0, resq_r2);
259     /* z2 = (w1 >> 1) - w3                                      */
260     temp2 = _mm_srai_epi32(resq_r1, 1);
261     temp2 = _mm_sub_epi32(temp2, resq_r3);
262     /* z3 = w1 + (w3 >> 1)                                      */
263     temp3 = _mm_srai_epi32(resq_r3, 1);
264     temp3 = _mm_add_epi32(temp3, resq_r1);
265     /*----------------------------------------------------------*/
266     /* x0 = z0 + z3                                             */
267     resq_r0 = _mm_add_epi32(temp0, temp3);
268     /* x1 = z1 + z2                                             */
269     resq_r1 = _mm_add_epi32(temp1, temp2);
270     /* x2 = z1 - z2                                             */
271     resq_r2 = _mm_sub_epi32(temp1, temp2);
272     /* x3 = z0 - z3                                             */
273     resq_r3 = _mm_sub_epi32(temp0, temp3);
274 
275     // Matrix transpose
276     /*
277      *  a0 b0 c0 d0
278      *  a1 b1 c1 d1
279      *  a2 b2 c2 d2
280      *  a3 b3 c3 d3
281      */
282 
283     /* a0 a1 b0 b1 */
284     temp1 = _mm_unpacklo_epi32(resq_r0, resq_r1);
285     /* a2 a3 b2 b3 */
286     temp3 = _mm_unpacklo_epi32(resq_r2, resq_r3);
287     /* c0 c1 d0 d1 */
288     temp2 = _mm_unpackhi_epi32(resq_r0, resq_r1);
289     /* c2 c3 d2 d3 */
290     temp4 = _mm_unpackhi_epi32(resq_r2, resq_r3);
291     /* a0 a1 a2 a3 */
292     resq_r0 = _mm_unpacklo_epi64(temp1, temp3);
293     /* b0 b1 b2 b3 */
294     resq_r1 = _mm_unpackhi_epi64(temp1, temp3);
295     /* c0 c1 c2 c3 */
296     resq_r2 = _mm_unpacklo_epi64(temp2, temp4);
297     /* d0 d1 d2 d3 */
298     resq_r3 = _mm_unpackhi_epi64(temp2, temp4);
299     /* Transform ends -- horizontal transform */
300 
301     temp0 = _mm_packs_epi32(resq_r0, resq_r1);
302     temp1 = _mm_packs_epi32(resq_r2, resq_r3);
303 
304     _mm_storeu_si128((__m128i *) (&pi2_tmp_ptr[0]), temp0);
305     _mm_storeu_si128((__m128i *) (&pi2_tmp_ptr[2 * 4]), temp1);
306 
307     /* Load pred buffer */
308     pred_r0 = _mm_loadl_epi64((__m128i *) (&pu1_pred[0]));
309     pred_r1 = _mm_loadl_epi64((__m128i *) (&pu1_pred[i4_pred_stride]));
310     pred_r2 = _mm_loadl_epi64((__m128i *) (&pu1_pred[2 * i4_pred_stride]));
311     pred_r3 = _mm_loadl_epi64((__m128i *) (&pu1_pred[3 * i4_pred_stride]));
312 
313     pred_r0 = _mm_cvtepu8_epi16(pred_r0);
314     pred_r1 = _mm_cvtepu8_epi16(pred_r1);
315     pred_r2 = _mm_cvtepu8_epi16(pred_r2);
316     pred_r3 = _mm_cvtepu8_epi16(pred_r3);
317 
318     pred_r0 = _mm_unpacklo_epi64(pred_r0, pred_r1);
319     pred_r1 = _mm_unpacklo_epi64(pred_r2, pred_r3);
320 
321     /*--------------------------------------------------------------*/
322     /* IDCT [ Vertical transformation] and Xij = (xij + 32)>>6      */
323     /*                                                              */
324     /* Add the prediction and store it back to same buffer          */
325     /*--------------------------------------------------------------*/
326     /* z0j = y0j + y2j                                                        */
327     temp0 = _mm_add_epi32(resq_r0, resq_r2);
328     /* z1j = y0j - y2j                                                        */
329     temp1 = _mm_sub_epi32(resq_r0, resq_r2);
330     /* z2j = (y1j>>1) - y3j */
331     temp2 = _mm_srai_epi32(resq_r1, 1);
332     temp2 = _mm_sub_epi32(temp2, resq_r3);
333     /* z3j = y1j + (y3j>>1) */
334     temp3 = _mm_srai_epi32(resq_r3, 1);
335     temp3 = _mm_add_epi32(temp3, resq_r1);
336 
337     /* x0j = z0j + z3j                                                        */
338     temp4 = _mm_add_epi32(temp0, temp3);
339     temp4 = _mm_add_epi32(temp4, value_32);
340     temp4 = _mm_srai_epi32(temp4, 6);
341     /* x1j = z1j + z2j                                                        */
342     temp5 = _mm_add_epi32(temp1, temp2);
343     temp5 = _mm_add_epi32(temp5, value_32);
344     temp5 = _mm_srai_epi32(temp5, 6);
345     /* x2j = z1j - z2j                                                        */
346     temp6 = _mm_sub_epi32(temp1, temp2);
347     temp6 = _mm_add_epi32(temp6, value_32);
348     temp6 = _mm_srai_epi32(temp6, 6);
349     /* x3j = z0j - z3j                                                        */
350     temp7 = _mm_sub_epi32(temp0, temp3);
351     temp7 = _mm_add_epi32(temp7, value_32);
352     temp7 = _mm_srai_epi32(temp7, 6);
353 
354     /* 32-bit to 16-bit conversion */
355     temp0 = _mm_packs_epi32(temp4, temp5);
356     temp1 = _mm_packs_epi32(temp6, temp7);
357 
358     /* Saturate all values < -255 to -255 and retain the rest as it is */
359     temp4 = _mm_max_epi16(temp0, neg_255_8x16b);
360     /* Saturate all values > 255 to 255 and retain the rest as it is */
361     temp4 = _mm_min_epi16(temp4, pos_255_8x16b);
362 
363     /* Saturate all values < -255 to -255 and retain the rest as it is */
364     temp5 = _mm_max_epi16(temp1, neg_255_8x16b);
365     /* Saturate all values > 255 to 255 and retain the rest as it is */
366     temp5 = _mm_min_epi16(temp5, pos_255_8x16b);
367 
368     temp0 = _mm_add_epi16(temp4, pred_r0);
369     temp1 = _mm_add_epi16(temp5, pred_r1);
370 
371     /*------------------------------------------------------------------*/
372     /* Clipping the results to 8 bits */
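    /* Non-positive sums are zeroed by the compare-and-mask below; _mm_packus_epi16
       further down saturates the upper bound to 255 */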
373     sign_reg = _mm_cmpgt_epi16(temp0, zero_8x16b);
374     temp0 = _mm_and_si128(temp0, sign_reg);
375     sign_reg = _mm_cmpgt_epi16(temp1, zero_8x16b);
376     temp1 = _mm_and_si128(temp1, sign_reg);
377 
378     resq_r0 = _mm_packus_epi16(temp0, temp1);
379     resq_r1 = _mm_srli_si128(resq_r0, 4);
380     resq_r2 = _mm_srli_si128(resq_r1, 4);
381     resq_r3 = _mm_srli_si128(resq_r2, 4);
382 
383     *pu4_out = _mm_cvtsi128_si32(resq_r0);
384     pu1_out += i4_out_stride;
385     pu4_out = (UWORD32 *) (pu1_out);
386     *(pu4_out) = _mm_cvtsi128_si32(resq_r1);
387     pu1_out += i4_out_stride;
388     pu4_out = (UWORD32 *) (pu1_out);
389     *(pu4_out) = _mm_cvtsi128_si32(resq_r2);
390     pu1_out += i4_out_stride;
391     pu4_out = (UWORD32 *) (pu1_out);
392     *(pu4_out) = _mm_cvtsi128_si32(resq_r3);
393 }
394 
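/*
 * Variant of isvc_iquant_itrans_recon_4x4_sse42() that additionally stores the
 * inverse-transformed residue, clipped to [-255, 255], to ps_res (stride
 * i4_res_stride) before the prediction is added. Residue accumulation is not
 * performed here (u1_res_accumulate must be 0).
 */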
395 void isvc_iquant_itrans_recon_res_4x4_sse42(buffer_container_t *ps_src, buffer_container_t *ps_pred,
396                                             buffer_container_t *ps_res_pred,
397                                             buffer_container_t *ps_res, buffer_container_t *ps_rec,
398                                             iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants,
399                                             WORD16 *pi2_tmp, WORD16 *pi2_dc_src,
400                                             WORD32 i4_iq_start_idx, UWORD8 u1_res_accumulate)
401 {
402     WORD16 *pi2_src = (WORD16 *) ps_src->pv_data;
403     WORD16 *pi2_tmp_ptr = pi2_tmp;
404     WORD16 *pi2_res = (WORD16 *) ps_res->pv_data;
405     UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
406     UWORD8 *pu1_out = (UWORD8 *) ps_rec->pv_data;
407     WORD32 i4_src_stride = ps_src->i4_data_stride;
408     WORD32 i4_res_stride = ps_res->i4_data_stride;
409     WORD32 i4_pred_stride = ps_pred->i4_data_stride;
410     WORD32 i4_out_stride = ps_rec->i4_data_stride;
411     const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat;
412     const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat;
413     UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6;
414     UWORD32 *pu4_out = (UWORD32 *) pu1_out;
415     __m128i src_r0_r1, src_r2_r3;
416     __m128i src_r0, src_r1, src_r2, src_r3;
417     __m128i scalemat_r0_r1, scalemat_r2_r3;
418     __m128i pred_r0, pred_r1, pred_r2, pred_r3;
419     __m128i sign_reg, dequant_r0_r1, dequant_r2_r3;
420     /* all bits reset to zero */
421     __m128i zero_8x16b = _mm_setzero_si128();
422     __m128i neg_255_8x16b = _mm_set1_epi16(-((WORD16) UINT8_MAX));
423     __m128i pos_255_8x16b = _mm_set1_epi16(((WORD16) UINT8_MAX));
424     __m128i temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
425     __m128i resq_r0, resq_r1, resq_r2, resq_r3;
426     __m128i add_rshift = _mm_set1_epi32((u4_qp_div_6 < 4) ? (1 << (3 - u4_qp_div_6)) : 0);
427     __m128i value_32 = _mm_set1_epi32(32);
428 
429     ASSERT(4 == i4_src_stride);
430     ASSERT(0 == u1_res_accumulate);
431 
432     UNUSED(i4_src_stride);
433     UNUSED(ps_res_pred);
434     UNUSED(u1_res_accumulate);
435 
436     /*************************************************************/
437     /* Dequantization of coefficients, implemented below with    */
438     /* SSE4.2 integer operations                                  */
439     /*************************************************************/
440 
441     /* a00 a01 a02 a03 a10 a11 a12 a13 -- the source
442     matrix 0th,1st row */
443     src_r0_r1 = _mm_loadu_si128((__m128i *) (pi2_src));
444 
445     /* a20 a21 a22 a23 a30 a31 a32 a33 -- the
446     source matrix 2nd,3rd row */
447     src_r2_r3 = _mm_loadu_si128((__m128i *) (pi2_src + 8));
448 
449     /* b00 b01 b02 b03 b10 b11 b12 b13 -- the
450     scaling matrix 0th,1st row */
451     scalemat_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_iscal_mat));
452 
453     /* b20 b21 b22 b23 b30 b31 b32 b33 -- the
454     scaling matrix 2nd,3rd row */
455     scalemat_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_iscal_mat + 8));
456 
457     /* q00 q01 q02 q03 q10 q11
458     q12 q13 -- all 16 bits */
459     dequant_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_weigh_mat));
460 
461     /* q20 q21 q22 q23 q30 q31
462     q32 q33 -- all 16 bits */
463     dequant_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_weigh_mat + 8));
464 
465     /* b00*q00 b01*q01 b02*q02 b03*q03 b10*q10 b11*q11
466     b12*q12 b13*q13 -- 16 bit result */
467     temp0 = _mm_mullo_epi16(scalemat_r0_r1, dequant_r0_r1);
468 
469     /* b20*q20 b21*q21 b22*q22 b23*q23 b30*q30 b31*q31
470     b32*q32 b33*q33 -- 16 bit result */
471     temp1 = _mm_mullo_epi16(scalemat_r2_r3, dequant_r2_r3);
472 
473     /* b00*q00 0 b01*q01 0 b02*q02 0 b03*q03 0 -- 16 bit long */
474     temp4 = _mm_unpacklo_epi16(temp0, zero_8x16b);
475 
476     /* b10*q10 0 b11*q11 0 b12*q12 0 b13*q13 0 -- 16 bit long */
477     temp5 = _mm_unpackhi_epi16(temp0, zero_8x16b);
478 
479     /* b20*q20 0 b21*q21 0 b22*q22 0 b23*q23 0 -- 16 bit long */
480     temp6 = _mm_unpacklo_epi16(temp1, zero_8x16b);
481 
482     /* b30*q30 0 b31*q31 0 b32*q32 0 b33*q33 0 -- 16 bit long */
483     temp7 = _mm_unpackhi_epi16(temp1, zero_8x16b);
484 
485     /* a00 0 a01 0 a02 0 a03 0 -- 16 bit long */
486     src_r0 = _mm_unpacklo_epi16(src_r0_r1, zero_8x16b);
487     /* a10 0 a11 0 a12 0 a13 0 -- 16 bit long */
488     src_r1 = _mm_unpackhi_epi16(src_r0_r1, zero_8x16b);
489     /* a20 0 a21 0 a22 0 a23 0 -- 16 bit long */
490     src_r2 = _mm_unpacklo_epi16(src_r2_r3, zero_8x16b);
491     /* a30 0 a31 0 a32 0 a33 0 -- 16 bit long */
492     src_r3 = _mm_unpackhi_epi16(src_r2_r3, zero_8x16b);
493 
494     temp4 = _mm_madd_epi16(src_r0, temp4);
495     temp5 = _mm_madd_epi16(src_r1, temp5);
496     temp6 = _mm_madd_epi16(src_r2, temp6);
497     temp7 = _mm_madd_epi16(src_r3, temp7);
498 
499     if(u4_qp_div_6 >= 4)
500     {
501         resq_r0 = _mm_slli_epi32(temp4, u4_qp_div_6 - 4);
502         resq_r1 = _mm_slli_epi32(temp5, u4_qp_div_6 - 4);
503         resq_r2 = _mm_slli_epi32(temp6, u4_qp_div_6 - 4);
504         resq_r3 = _mm_slli_epi32(temp7, u4_qp_div_6 - 4);
505     }
506     else
507     {
508         temp4 = _mm_add_epi32(temp4, add_rshift);
509         temp5 = _mm_add_epi32(temp5, add_rshift);
510         temp6 = _mm_add_epi32(temp6, add_rshift);
511         temp7 = _mm_add_epi32(temp7, add_rshift);
512         resq_r0 = _mm_srai_epi32(temp4, 4 - u4_qp_div_6);
513         resq_r1 = _mm_srai_epi32(temp5, 4 - u4_qp_div_6);
514         resq_r2 = _mm_srai_epi32(temp6, 4 - u4_qp_div_6);
515         resq_r3 = _mm_srai_epi32(temp7, 4 - u4_qp_div_6);
516     }
517 
518     if(i4_iq_start_idx == 1) resq_r0 = _mm_insert_epi32(resq_r0, (WORD32) pi2_dc_src[0], 0);
519     /* Perform Inverse transform */
520     /*-------------------------------------------------------------*/
521     /* IDCT [ Horizontal transformation ]                          */
522     /*-------------------------------------------------------------*/
523     // Matrix transpose
524     /*
525      *  a0 a1 a2 a3
526      *  b0 b1 b2 b3
527      *  c0 c1 c2 c3
528      *  d0 d1 d2 d3
529      */
530 
531     /* a0 b0 a1 b1 */
532     temp1 = _mm_unpacklo_epi32(resq_r0, resq_r1);
533     /* c0 d0 c1 d1 */
534     temp3 = _mm_unpacklo_epi32(resq_r2, resq_r3);
535     /* a2 b2 a3 b3 */
536     temp2 = _mm_unpackhi_epi32(resq_r0, resq_r1);
537     /* c2 d2 c3 d3 */
538     temp4 = _mm_unpackhi_epi32(resq_r2, resq_r3);
539     /* a0 b0 c0 d0 */
540     resq_r0 = _mm_unpacklo_epi64(temp1, temp3);
541     /* a1 b1 c1 d1 */
542     resq_r1 = _mm_unpackhi_epi64(temp1, temp3);
543     /* a2 b2 c2 d2 */
544     resq_r2 = _mm_unpacklo_epi64(temp2, temp4);
545     /* a3 b3 c3 d3 */
546     resq_r3 = _mm_unpackhi_epi64(temp2, temp4);
547     /* Transform starts -- horizontal transform */
548     /*------------------------------------------------------------------*/
549     /* z0 = w0 + w2                                             */
550     temp0 = _mm_add_epi32(resq_r0, resq_r2);
551     /* z1 = w0 - w2                                             */
552     temp1 = _mm_sub_epi32(resq_r0, resq_r2);
553     /* z2 = (w1 >> 1) - w3                                      */
554     temp2 = _mm_srai_epi32(resq_r1, 1);
555     temp2 = _mm_sub_epi32(temp2, resq_r3);
556     /* z3 = w1 + (w3 >> 1)                                      */
557     temp3 = _mm_srai_epi32(resq_r3, 1);
558     temp3 = _mm_add_epi32(temp3, resq_r1);
559     /*----------------------------------------------------------*/
560     /* x0 = z0 + z3                                             */
561     resq_r0 = _mm_add_epi32(temp0, temp3);
562     /* x1 = z1 + z2                                             */
563     resq_r1 = _mm_add_epi32(temp1, temp2);
564     /* x2 = z1 - z2                                             */
565     resq_r2 = _mm_sub_epi32(temp1, temp2);
566     /* x3 = z0 - z3                                             */
567     resq_r3 = _mm_sub_epi32(temp0, temp3);
568 
569     // Matrix transpose
570     /*
571      *  a0 b0 c0 d0
572      *  a1 b1 c1 d1
573      *  a2 b2 c2 d2
574      *  a3 b3 c3 d3
575      */
576 
577     /* a0 a1 b0 b1 */
578     temp1 = _mm_unpacklo_epi32(resq_r0, resq_r1);
579     /* a2 a3 b2 b3 */
580     temp3 = _mm_unpacklo_epi32(resq_r2, resq_r3);
581     /* c0 c1 d0 d1 */
582     temp2 = _mm_unpackhi_epi32(resq_r0, resq_r1);
583     /* c2 c3 d2 d3 */
584     temp4 = _mm_unpackhi_epi32(resq_r2, resq_r3);
585     /* a0 a1 a2 a3 */
586     resq_r0 = _mm_unpacklo_epi64(temp1, temp3);
587     /* b0 b1 b2 b3 */
588     resq_r1 = _mm_unpackhi_epi64(temp1, temp3);
589     /* c0 c1 c2 c3 */
590     resq_r2 = _mm_unpacklo_epi64(temp2, temp4);
591     /* d0 d1 d2 d3 */
592     resq_r3 = _mm_unpackhi_epi64(temp2, temp4);
593     /* Transform ends -- horizontal transform */
594 
595     temp0 = _mm_packs_epi32(resq_r0, resq_r1);
596     temp1 = _mm_packs_epi32(resq_r2, resq_r3);
597 
598     _mm_storeu_si128((__m128i *) (&pi2_tmp_ptr[0]), temp0);
599     _mm_storeu_si128((__m128i *) (&pi2_tmp_ptr[2 * 4]), temp1);
600 
601     /* Load pred buffer */
602     pred_r0 = _mm_loadl_epi64((__m128i *) (&pu1_pred[0]));
603     pred_r1 = _mm_loadl_epi64((__m128i *) (&pu1_pred[i4_pred_stride]));
604     pred_r2 = _mm_loadl_epi64((__m128i *) (&pu1_pred[2 * i4_pred_stride]));
605     pred_r3 = _mm_loadl_epi64((__m128i *) (&pu1_pred[3 * i4_pred_stride]));
606 
607     pred_r0 = _mm_cvtepu8_epi16(pred_r0);
608     pred_r1 = _mm_cvtepu8_epi16(pred_r1);
609     pred_r2 = _mm_cvtepu8_epi16(pred_r2);
610     pred_r3 = _mm_cvtepu8_epi16(pred_r3);
611 
612     /*--------------------------------------------------------------*/
613     /* IDCT [ Vertical transformation] and Xij = (xij + 32)>>6      */
614     /*                                                              */
615     /* Add the prediction and store it back to same buffer          */
616     /*--------------------------------------------------------------*/
617     /* z0j = y0j + y2j                                                        */
618     temp0 = _mm_add_epi32(resq_r0, resq_r2);
619     /* z1j = y0j - y2j                                                        */
620     temp1 = _mm_sub_epi32(resq_r0, resq_r2);
621     /* z2j = (y1j>>1) - y3j */
622     temp2 = _mm_srai_epi32(resq_r1, 1);
623     temp2 = _mm_sub_epi32(temp2, resq_r3);
624     /* z3j = y1j + (y3j>>1) */
625     temp3 = _mm_srai_epi32(resq_r3, 1);
626     temp3 = _mm_add_epi32(temp3, resq_r1);
627 
628     /* x0j = z0j + z3j                                                        */
629     temp4 = _mm_add_epi32(temp0, temp3);
630     temp4 = _mm_add_epi32(temp4, value_32);
631     temp4 = _mm_srai_epi32(temp4, 6);
632     /* x1j = z1j + z2j                                                        */
633     temp5 = _mm_add_epi32(temp1, temp2);
634     temp5 = _mm_add_epi32(temp5, value_32);
635     temp5 = _mm_srai_epi32(temp5, 6);
636     /* x2j = z1j - z2j                                                        */
637     temp6 = _mm_sub_epi32(temp1, temp2);
638     temp6 = _mm_add_epi32(temp6, value_32);
639     temp6 = _mm_srai_epi32(temp6, 6);
640     /* x3j = z0j - z3j                                                        */
641     temp7 = _mm_sub_epi32(temp0, temp3);
642     temp7 = _mm_add_epi32(temp7, value_32);
643     temp7 = _mm_srai_epi32(temp7, 6);
644 
645     /* 32-bit to 16-bit conversion */
646     temp0 = _mm_packs_epi32(temp4, temp5);
647     temp1 = _mm_packs_epi32(temp6, temp7);
648 
649     /* Saturate all values < -255 to -255 and retain the rest as it is */
650     temp0 = _mm_max_epi16(temp0, neg_255_8x16b);
651     /* Saturate all values > 255 to 255 and retain the rest as it is */
652     temp0 = _mm_min_epi16(temp0, pos_255_8x16b);
653 
654     /* Saturate all values < -255 to -255 and retain the rest as it is */
655     temp1 = _mm_max_epi16(temp1, neg_255_8x16b);
656     /* Saturate all values > 255 to 255 and retain the rest as it is */
657     temp1 = _mm_min_epi16(temp1, pos_255_8x16b);
658 
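    /*
     * temp0 holds residue rows 0-1 and temp1 rows 2-3 (four 16-bit values each).
     * The low halves are stored first; each register is then shifted down by 8
     * bytes to reach rows 1 and 3, with the prediction added along the way.
     */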
659     _mm_storel_epi64((__m128i *) (&pi2_res[0]), temp0);
660     _mm_storel_epi64((__m128i *) (&pi2_res[2 * i4_res_stride]), temp1);
661 
662     temp4 = _mm_add_epi16(temp0, pred_r0);
663     temp0 = _mm_srli_si128(temp0, 8);
664     _mm_storel_epi64((__m128i *) (&pi2_res[i4_res_stride]), temp0);
665 
666     temp6 = _mm_add_epi16(temp1, pred_r2);
667     temp1 = _mm_srli_si128(temp1, 8);
668     _mm_storel_epi64((__m128i *) (&pi2_res[3 * i4_res_stride]), temp1);
669 
670     temp5 = _mm_add_epi16(temp0, pred_r1);
671     temp7 = _mm_add_epi16(temp1, pred_r3);
672 
673     temp4 = _mm_cvtepi16_epi32(temp4);
674     temp5 = _mm_cvtepi16_epi32(temp5);
675     temp6 = _mm_cvtepi16_epi32(temp6);
676     temp7 = _mm_cvtepi16_epi32(temp7);
677 
678     /* 32-bit to 16-bit conversion */
679     temp0 = _mm_packs_epi32(temp4, temp5);
680     temp1 = _mm_packs_epi32(temp6, temp7);
681     /*------------------------------------------------------------------*/
682     /* Clipping the results to 8 bits */
683     sign_reg = _mm_cmpgt_epi16(temp0, zero_8x16b);
684     temp0 = _mm_and_si128(temp0, sign_reg);
685     sign_reg = _mm_cmpgt_epi16(temp1, zero_8x16b);
686     temp1 = _mm_and_si128(temp1, sign_reg);
687 
688     resq_r0 = _mm_packus_epi16(temp0, temp1);
689     resq_r1 = _mm_srli_si128(resq_r0, 4);
690     resq_r2 = _mm_srli_si128(resq_r1, 4);
691     resq_r3 = _mm_srli_si128(resq_r2, 4);
692 
693     *pu4_out = _mm_cvtsi128_si32(resq_r0);
694     pu1_out += i4_out_stride;
695     pu4_out = (UWORD32 *) (pu1_out);
696     *(pu4_out) = _mm_cvtsi128_si32(resq_r1);
697     pu1_out += i4_out_stride;
698     pu4_out = (UWORD32 *) (pu1_out);
699     *(pu4_out) = _mm_cvtsi128_si32(resq_r2);
700     pu1_out += i4_out_stride;
701     pu4_out = (UWORD32 *) (pu1_out);
702     *(pu4_out) = _mm_cvtsi128_si32(resq_r3);
703 }
704 
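/*
 * Variant used when a residual prediction has to be accumulated: the
 * inverse-transformed residue is added to the residue prediction from ps_res_pred
 * (stride i4_res_pred_stride), clipped to [-255, 255], stored to ps_res and then
 * added to the prediction to form the reconstruction (u1_res_accumulate must be 1).
 */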
705 void isvc_iquant_itrans_recon_res_4x4_with_res_acc_sse42(
706     buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_res_pred,
707     buffer_container_t *ps_res, buffer_container_t *ps_rec,
708     iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants, WORD16 *pi2_tmp, WORD16 *pi2_dc_src,
709     WORD32 i4_iq_start_idx, UWORD8 u1_res_accumulate)
710 {
711     WORD16 *pi2_src = (WORD16 *) ps_src->pv_data;
712     WORD16 *pi2_tmp_ptr = pi2_tmp;
713     WORD16 *pi2_res = (WORD16 *) ps_res->pv_data;
714     WORD16 *pi2_res_pred = (WORD16 *) ps_res_pred->pv_data;
715     UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
716     UWORD8 *pu1_out = (UWORD8 *) ps_rec->pv_data;
717     WORD32 i4_src_stride = ps_src->i4_data_stride;
718     WORD32 i4_res_stride = ps_res->i4_data_stride;
719     WORD32 i4_res_pred_stride = ps_res_pred->i4_data_stride;
720     WORD32 i4_pred_stride = ps_pred->i4_data_stride;
721     WORD32 i4_out_stride = ps_rec->i4_data_stride;
722     const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat;
723     const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat;
724     UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6;
725     UWORD32 *pu4_out = (UWORD32 *) pu1_out;
726     __m128i src_r0_r1, src_r2_r3;
727     __m128i src_r0, src_r1, src_r2, src_r3;
728     __m128i scalemat_r0_r1, scalemat_r2_r3;
729     __m128i pred_r0, pred_r1, pred_r2, pred_r3;
730     __m128i res_pred_r0, res_pred_r1, res_pred_r2, res_pred_r3;
731     __m128i res_r0, res_r1, res_r2, res_r3;
732     __m128i sign_reg, dequant_r0_r1, dequant_r2_r3;
733     /* all bits reset to zero */
734     __m128i zero_8x16b = _mm_setzero_si128();
735     __m128i neg_255_8x16b = _mm_set1_epi16(-((WORD16) UINT8_MAX));
736     __m128i pos_255_8x16b = _mm_set1_epi16(((WORD16) UINT8_MAX));
737     __m128i temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
738     __m128i resq_r0, resq_r1, resq_r2, resq_r3;
739     __m128i add_rshift = _mm_set1_epi32((u4_qp_div_6 < 4) ? (1 << (3 - u4_qp_div_6)) : 0);
740     __m128i value_32 = _mm_set1_epi32(32);
741 
742     ASSERT(4 == i4_src_stride);
743     ASSERT(1 == u1_res_accumulate);
744 
745     UNUSED(i4_src_stride);
746     UNUSED(ps_res_pred);
747     UNUSED(u1_res_accumulate);
748 
749     /*************************************************************/
750     /* Dequantization of coefficients, implemented below with    */
751     /* SSE4.2 integer operations                                  */
752     /*************************************************************/
753 
754     /* a00 a01 a02 a03 a10 a11 a12 a13 -- the source
755      matrix 0th,1st row */
756     src_r0_r1 = _mm_loadu_si128((__m128i *) (pi2_src));
757 
758     /* a20 a21 a22 a23 a30 a31 a32 a33 -- the
759       source matrix 2nd,3rd row */
760     src_r2_r3 = _mm_loadu_si128((__m128i *) (pi2_src + 8));
761 
762     /* b00 b01 b02 b03 b10 b11 b12 b13 -- the
763      scaling matrix 0th,1st row */
764     scalemat_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_iscal_mat));
765 
766     /* b20 b21 b22 b23 b30 b31 b32 b33 -- the
767      scaling matrix 2nd,3rd row */
768     scalemat_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_iscal_mat + 8));
769 
770     /* q00 q01 q02 q03 q10 q11
771      q12 q13 -- all 16 bits */
772     dequant_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_weigh_mat));
773 
774     /* q20 q21 q22 q23 q30 q31
775      q32 q33 -- all 16 bits */
776     dequant_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_weigh_mat + 8));
777 
778     /* b00*q00 b01*q01 b02*q02 b03*q03 b10*q10 b11*q11
779      b12*q12 b13*q13 -- 16 bit result */
780     temp0 = _mm_mullo_epi16(scalemat_r0_r1, dequant_r0_r1);
781 
782     /* b20*q20 b21*q21 b22*q22 b23*q23 b30*q30 b31*q31
783      b32*q32 b33*q33 -- 16 bit result */
784     temp1 = _mm_mullo_epi16(scalemat_r2_r3, dequant_r2_r3);
785 
786     /* b00*q00 0 b01*q01 0 b02*q02 0 b03*q03 0 -- 16 bit long */
787     temp4 = _mm_unpacklo_epi16(temp0, zero_8x16b);
788 
789     /* b10*q10 0 b11*q11 0 b12*q12 0 b13*q13 0 -- 16 bit long */
790     temp5 = _mm_unpackhi_epi16(temp0, zero_8x16b);
791 
792     /* b20*q20 0 b21*q21 0 b22*q22 0 b23*q23 0 -- 16 bit long */
793     temp6 = _mm_unpacklo_epi16(temp1, zero_8x16b);
794 
795     /* b30*q30 0 b31*q31 0 b32*q32 0 b33*q33 0 -- 16 bit long */
796     temp7 = _mm_unpackhi_epi16(temp1, zero_8x16b);
797 
798     /* a00 0 a01 0 a02 0 a03 0 -- 16 bit long */
799     src_r0 = _mm_unpacklo_epi16(src_r0_r1, zero_8x16b);
800     /* a10 0 a11 0 a12 0 a13 0 -- 16 bit long */
801     src_r1 = _mm_unpackhi_epi16(src_r0_r1, zero_8x16b);
802     /* a20 0 a21 0 a22 0 a23 0 -- 16 bit long */
803     src_r2 = _mm_unpacklo_epi16(src_r2_r3, zero_8x16b);
804     /* a30 0 a31 0 a32 0 a33 0 -- 16 bit long */
805     src_r3 = _mm_unpackhi_epi16(src_r2_r3, zero_8x16b);
806 
807     temp4 = _mm_madd_epi16(src_r0, temp4);
808     temp5 = _mm_madd_epi16(src_r1, temp5);
809     temp6 = _mm_madd_epi16(src_r2, temp6);
810     temp7 = _mm_madd_epi16(src_r3, temp7);
811 
812     if(u4_qp_div_6 >= 4)
813     {
814         resq_r0 = _mm_slli_epi32(temp4, u4_qp_div_6 - 4);
815         resq_r1 = _mm_slli_epi32(temp5, u4_qp_div_6 - 4);
816         resq_r2 = _mm_slli_epi32(temp6, u4_qp_div_6 - 4);
817         resq_r3 = _mm_slli_epi32(temp7, u4_qp_div_6 - 4);
818     }
819     else
820     {
821         temp4 = _mm_add_epi32(temp4, add_rshift);
822         temp5 = _mm_add_epi32(temp5, add_rshift);
823         temp6 = _mm_add_epi32(temp6, add_rshift);
824         temp7 = _mm_add_epi32(temp7, add_rshift);
825         resq_r0 = _mm_srai_epi32(temp4, 4 - u4_qp_div_6);
826         resq_r1 = _mm_srai_epi32(temp5, 4 - u4_qp_div_6);
827         resq_r2 = _mm_srai_epi32(temp6, 4 - u4_qp_div_6);
828         resq_r3 = _mm_srai_epi32(temp7, 4 - u4_qp_div_6);
829     }
830 
831     if(i4_iq_start_idx == 1) resq_r0 = _mm_insert_epi32(resq_r0, (WORD32) pi2_dc_src[0], 0);
832     /* Perform Inverse transform */
833     /*-------------------------------------------------------------*/
834     /* IDCT [ Horizontal transformation ]                          */
835     /*-------------------------------------------------------------*/
836     // Matrix transpose
837     /*
838      *  a0 a1 a2 a3
839      *  b0 b1 b2 b3
840      *  c0 c1 c2 c3
841      *  d0 d1 d2 d3
842      */
843 
844     /* a0 b0 a1 b1 */
845     temp1 = _mm_unpacklo_epi32(resq_r0, resq_r1);
846     /* c0 d0 c1 d1 */
847     temp3 = _mm_unpacklo_epi32(resq_r2, resq_r3);
848     /* a2 b2 a3 b3 */
849     temp2 = _mm_unpackhi_epi32(resq_r0, resq_r1);
850     /* c2 d2 c3 d3 */
851     temp4 = _mm_unpackhi_epi32(resq_r2, resq_r3);
852     /* a0 b0 c0 d0 */
853     resq_r0 = _mm_unpacklo_epi64(temp1, temp3);
854     /* a1 b1 c1 d1 */
855     resq_r1 = _mm_unpackhi_epi64(temp1, temp3);
856     /* a2 b2 c2 d2 */
857     resq_r2 = _mm_unpacklo_epi64(temp2, temp4);
858     /* a3 b3 c3 d3 */
859     resq_r3 = _mm_unpackhi_epi64(temp2, temp4);
860     /* Transform starts -- horizontal transform */
861     /*------------------------------------------------------------------*/
862     /* z0 = w0 + w2                                             */
863     temp0 = _mm_add_epi32(resq_r0, resq_r2);
864     /* z1 = w0 - w2                                             */
865     temp1 = _mm_sub_epi32(resq_r0, resq_r2);
866     /* z2 = (w1 >> 1) - w3                                      */
867     temp2 = _mm_srai_epi32(resq_r1, 1);
868     temp2 = _mm_sub_epi32(temp2, resq_r3);
869     /* z3 = w1 + (w3 >> 1)                                      */
870     temp3 = _mm_srai_epi32(resq_r3, 1);
871     temp3 = _mm_add_epi32(temp3, resq_r1);
872     /*----------------------------------------------------------*/
873     /* x0 = z0 + z3                                             */
874     resq_r0 = _mm_add_epi32(temp0, temp3);
875     /* x1 = z1 + z2                                             */
876     resq_r1 = _mm_add_epi32(temp1, temp2);
877     /* x2 = z1 - z2                                             */
878     resq_r2 = _mm_sub_epi32(temp1, temp2);
879     /* x3 = z0 - z3                                             */
880     resq_r3 = _mm_sub_epi32(temp0, temp3);
881 
882     // Matrix transpose
883     /*
884      *  a0 b0 c0 d0
885      *  a1 b1 c1 d1
886      *  a2 b2 c2 d2
887      *  a3 b3 c3 d3
888      */
889 
890     /* a0 a1 b0 b1 */
891     temp1 = _mm_unpacklo_epi32(resq_r0, resq_r1);
892     /* a2 a3 b2 b3 */
893     temp3 = _mm_unpacklo_epi32(resq_r2, resq_r3);
894     /* c0 c1 d0 d1 */
895     temp2 = _mm_unpackhi_epi32(resq_r0, resq_r1);
896     /* c2 c3 d2 d3 */
897     temp4 = _mm_unpackhi_epi32(resq_r2, resq_r3);
898     /* a0 a1 a2 a3 */
899     resq_r0 = _mm_unpacklo_epi64(temp1, temp3);
900     /* b0 b1 b2 b3 */
901     resq_r1 = _mm_unpackhi_epi64(temp1, temp3);
902     /* c0 c1 c2 c3 */
903     resq_r2 = _mm_unpacklo_epi64(temp2, temp4);
904     /* d0 d1 d2 d3 */
905     resq_r3 = _mm_unpackhi_epi64(temp2, temp4);
906     /* Transform ends -- horizontal transform */
907 
908     temp0 = _mm_packs_epi32(resq_r0, resq_r1);
909     temp1 = _mm_packs_epi32(resq_r2, resq_r3);
910 
911     _mm_storeu_si128((__m128i *) (&pi2_tmp_ptr[0]), temp0);
912     _mm_storeu_si128((__m128i *) (&pi2_tmp_ptr[2 * 4]), temp1);
913 
914     /* Load pred buffer */
915     pred_r0 = _mm_loadl_epi64((__m128i *) (&pu1_pred[0]));
916     pred_r1 = _mm_loadl_epi64((__m128i *) (&pu1_pred[i4_pred_stride]));
917     pred_r2 = _mm_loadl_epi64((__m128i *) (&pu1_pred[2 * i4_pred_stride]));
918     pred_r3 = _mm_loadl_epi64((__m128i *) (&pu1_pred[3 * i4_pred_stride]));
919 
920     pred_r0 = _mm_cvtepu8_epi16(pred_r0);
921     pred_r1 = _mm_cvtepu8_epi16(pred_r1);
922     pred_r2 = _mm_cvtepu8_epi16(pred_r2);
923     pred_r3 = _mm_cvtepu8_epi16(pred_r3);
924 
925     /*--------------------------------------------------------------*/
926     /* IDCT [ Vertical transformation] and Xij = (xij + 32)>>6      */
927     /*                                                              */
928     /* Add the prediction and store it back to same buffer          */
929     /*--------------------------------------------------------------*/
930     /* z0j = y0j + y2j                                                        */
931     temp0 = _mm_add_epi32(resq_r0, resq_r2);
932     /* z1j = y0j - y2j                                                        */
933     temp1 = _mm_sub_epi32(resq_r0, resq_r2);
934     /* z2j = (y1j>>1) - y3j */
935     temp2 = _mm_srai_epi32(resq_r1, 1);
936     temp2 = _mm_sub_epi32(temp2, resq_r3);
937     /* z3j = y1j + (y3j>>1) */
938     temp3 = _mm_srai_epi32(resq_r3, 1);
939     temp3 = _mm_add_epi32(temp3, resq_r1);
940 
941     /* x0j = z0j + z3j                                                        */
942     temp4 = _mm_add_epi32(temp0, temp3);
943     temp4 = _mm_add_epi32(temp4, value_32);
944     temp4 = _mm_srai_epi32(temp4, 6);
945     res_r0 = temp4;
946     /* x1j = z1j + z2j                                                        */
947     temp5 = _mm_add_epi32(temp1, temp2);
948     temp5 = _mm_add_epi32(temp5, value_32);
949     temp5 = _mm_srai_epi32(temp5, 6);
950     res_r1 = temp5;
951     /* x2j = z1j - z2j                                                        */
952     temp6 = _mm_sub_epi32(temp1, temp2);
953     temp6 = _mm_add_epi32(temp6, value_32);
954     temp6 = _mm_srai_epi32(temp6, 6);
955     res_r2 = temp6;
956     /* x3j = z0j - z3j                                                        */
957     temp7 = _mm_sub_epi32(temp0, temp3);
958     temp7 = _mm_add_epi32(temp7, value_32);
959     temp7 = _mm_srai_epi32(temp7, 6);
960     res_r3 = temp7;
961 
962     /* Accumulating res */
963     res_pred_r0 = _mm_loadl_epi64((__m128i *) &pi2_res_pred[0]);
964     res_pred_r1 = _mm_loadl_epi64((__m128i *) &pi2_res_pred[i4_res_pred_stride]);
965     res_pred_r2 = _mm_loadl_epi64((__m128i *) &pi2_res_pred[2 * i4_res_pred_stride]);
966     res_pred_r3 = _mm_loadl_epi64((__m128i *) &pi2_res_pred[3 * i4_res_pred_stride]);
967 
968     res_pred_r0 = _mm_cvtepi16_epi32(res_pred_r0);
969     res_pred_r1 = _mm_cvtepi16_epi32(res_pred_r1);
970     res_pred_r2 = _mm_cvtepi16_epi32(res_pred_r2);
971     res_pred_r3 = _mm_cvtepi16_epi32(res_pred_r3);
972 
973     temp0 = _mm_add_epi32(res_r0, res_pred_r0);
974     temp1 = _mm_add_epi32(res_r1, res_pred_r1);
975     temp2 = _mm_add_epi32(res_r2, res_pred_r2);
976     temp3 = _mm_add_epi32(res_r3, res_pred_r3);
977 
978     temp0 = _mm_packs_epi32(temp0, temp1);
979     temp1 = _mm_packs_epi32(temp2, temp3);
980 
981     /* Saturate all values < -255 to -255 and retain the rest as it is */
982     temp0 = _mm_max_epi16(temp0, neg_255_8x16b);
983     /* Saturate all values > 255 to 255 and retain the rest as it is */
984     temp0 = _mm_min_epi16(temp0, pos_255_8x16b);
985 
986     /* Saturate all values < -255 to -255 and retain the rest as it is */
987     temp1 = _mm_max_epi16(temp1, neg_255_8x16b);
988     /* Saturate all values > 255 to 255 and retain the rest as it is */
989     temp1 = _mm_min_epi16(temp1, pos_255_8x16b);
990 
991     _mm_storel_epi64((__m128i *) (&pi2_res[0]), temp0);
992     _mm_storel_epi64((__m128i *) (&pi2_res[2 * i4_res_stride]), temp1);
993 
994     temp4 = _mm_add_epi16(temp0, pred_r0);
995     temp0 = _mm_srli_si128(temp0, 8);
996     _mm_storel_epi64((__m128i *) (&pi2_res[i4_res_stride]), temp0);
997 
998     temp6 = _mm_add_epi16(temp1, pred_r2);
999     temp1 = _mm_srli_si128(temp1, 8);
1000     _mm_storel_epi64((__m128i *) (&pi2_res[3 * i4_res_stride]), temp1);
1001 
1002     temp5 = _mm_add_epi16(temp0, pred_r1);
1003     temp7 = _mm_add_epi16(temp1, pred_r3);
1004 
1005     temp4 = _mm_cvtepi16_epi32(temp4);
1006     temp5 = _mm_cvtepi16_epi32(temp5);
1007     temp6 = _mm_cvtepi16_epi32(temp6);
1008     temp7 = _mm_cvtepi16_epi32(temp7);
1009 
1010     /* 32-bit to 16-bit conversion */
1011     temp0 = _mm_packs_epi32(temp4, temp5);
1012     temp1 = _mm_packs_epi32(temp6, temp7);
1013     /*------------------------------------------------------------------*/
1014     /* Clipping the results to 8 bits */
1015     sign_reg = _mm_cmpgt_epi16(temp0, zero_8x16b);
1016     temp0 = _mm_and_si128(temp0, sign_reg);
1017     sign_reg = _mm_cmpgt_epi16(temp1, zero_8x16b);
1018     temp1 = _mm_and_si128(temp1, sign_reg);
1019 
1020     resq_r0 = _mm_packus_epi16(temp0, temp1);
1021     resq_r1 = _mm_srli_si128(resq_r0, 4);
1022     resq_r2 = _mm_srli_si128(resq_r1, 4);
1023     resq_r3 = _mm_srli_si128(resq_r2, 4);
1024 
1025     *pu4_out = _mm_cvtsi128_si32(resq_r0);
1026     pu1_out += i4_out_stride;
1027     pu4_out = (UWORD32 *) (pu1_out);
1028     *(pu4_out) = _mm_cvtsi128_si32(resq_r1);
1029     pu1_out += i4_out_stride;
1030     pu4_out = (UWORD32 *) (pu1_out);
1031     *(pu4_out) = _mm_cvtsi128_si32(resq_r2);
1032     pu1_out += i4_out_stride;
1033     pu4_out = (UWORD32 *) (pu1_out);
1034     *(pu4_out) = _mm_cvtsi128_si32(resq_r3);
1035 }
1036 
1037 /*
1038  ********************************************************************************
1039  *
1040  * @brief This function reconstructs a 4x4 sub block from quantized chroma
1041  * residue and prediction buffer
1042  *
1043  * @par Description:
1044  *  The quantized residue is first inverse quantized, then inverse transformed.
1045  *  This inverse transformed content is added to the prediction buffer to
1046  *  reconstruct the output block
1047  *
1048  * @param[in] pi2_src
1049  *  quantized 4x4 block
1050  *
1051  * @param[in] pu1_pred
1052  *  prediction 4x4 block
1053  *
1054  * @param[out] pu1_out
1055  *  reconstructed 4x4 block
1056  *
1057  * @param[in] src_strd
1058  *  quantization buffer stride
1059  *
1060  * @param[in] i4_pred_stride
1061  *  Prediction buffer stride
1062  *
1063  * @param[in] i4_out_stride
1064  *  recon buffer stride
1065  *
1066  * @param[in] pu2_scaling_list
1067  *  pointer to scaling list
1068  *
1069  * @param[in] pu2_norm_adjust
1070  *  pointer to inverse scale matrix
1071  *
1072  * @param[in] u4_qp_div_6
1073  *  Floor (qp/6)
1074  *
1075  * @param[in] pi4_tmp
1076  * temporary buffer of size 1*16
1077  *
1078  * @returns none
1079  *
1080  * @remarks none
1081  *
1082  *******************************************************************************
1083  */
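/*
 * Chroma note: the prediction and output buffers hold interleaved Cb/Cr samples, so
 * only alternate bytes are used (selected with chroma_mask below), and the DC
 * coefficient is always taken from pi2_dc_src, the chroma DC having been handled by
 * a separate path.
 */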
1084 void isvc_iquant_itrans_recon_chroma_4x4_sse42(
1085     buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_res_pred,
1086     buffer_container_t *ps_res, buffer_container_t *ps_rec,
1087     iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants, WORD16 *pi2_tmp, WORD16 *pi2_dc_src,
1088     WORD32 i4_iq_start_idx, UWORD8 u1_res_accumulate)
1089 {
1090     WORD16 *pi2_src = (WORD16 *) ps_src->pv_data;
1091     UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
1092     UWORD8 *pu1_out = (UWORD8 *) ps_rec->pv_data;
1093     WORD32 i4_src_stride = ps_src->i4_data_stride;
1094     WORD32 i4_pred_stride = ps_pred->i4_data_stride;
1095     WORD32 i4_out_stride = ps_rec->i4_data_stride;
1096     const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat;
1097     const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat;
1098     UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6;
1099     __m128i src_r0_r1, src_r2_r3;
1100     __m128i src_r0, src_r1, src_r2, src_r3;
1101     __m128i scalemat_r0_r1, scalemat_r2_r3;
1102     __m128i pred_r0, pred_r1, pred_r2, pred_r3;
1103     __m128i sign_reg, dequant_r0_r1, dequant_r2_r3;
1104     /* all bits reset to zero */
1105     __m128i zero_8x16b = _mm_setzero_si128();
1106     __m128i neg_255_8x16b = _mm_set1_epi16(-((WORD16) UINT8_MAX));
1107     __m128i pos_255_8x16b = _mm_set1_epi16(((WORD16) UINT8_MAX));
1108     __m128i temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
1109     __m128i resq_r0, resq_r1, resq_r2, resq_r3;
1110     __m128i add_rshift = _mm_set1_epi32((u4_qp_div_6 < 4) ? (1 << (3 - u4_qp_div_6)) : 0);
1111     __m128i value_32 = _mm_set1_epi32(32);
1112     __m128i chroma_mask = _mm_set1_epi16(0xFF);
1113     __m128i out_r0, out_r1, out_r2, out_r3;
1114 
1115     ASSERT(4 == i4_src_stride);
1116     ASSERT(0 == u1_res_accumulate);
1117 
1118     UNUSED(i4_src_stride);
1119     UNUSED(u1_res_accumulate);
1120     UNUSED(ps_res);
1121     UNUSED(ps_res_pred);
1122     UNUSED(i4_iq_start_idx);
1123 
1124     /*************************************************************/
1125     /* Dequantization of coefficients, implemented below with    */
1126     /* SSE4.2 integer operations                                  */
1127     /*************************************************************/
1128     /* a00 a01 a02 a03 a10 a11 a12 a13 -- the source
1129      matrix 0th,1st row */
1130     src_r0_r1 = _mm_loadu_si128((__m128i *) (pi2_src));
1131 
1132     /* a20 a21 a22 a23 a30 a31 a32 a33 -- the
1133       source matrix 2nd,3rd row */
1134     src_r2_r3 = _mm_loadu_si128((__m128i *) (pi2_src + 8));
1135 
1136     /* b00 b01 b02 b03 b10 b11 b12 b13 -- the
1137      scaling matrix 0th,1st row */
1138     scalemat_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_iscal_mat));
1139 
1140     /* b20 b21 b22 b23 b30 b31 b32 b33 -- the
1141      scaling matrix 2nd,3rd row */
1142     scalemat_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_iscal_mat + 8));
1143 
1144     /* q00 q01 q02 q03 q10 q11
1145      q12 q13 -- all 16 bits */
1146     dequant_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_weigh_mat));
1147 
1148     /* q20 q21 q22 q23 q30 q31
1149      q32 q33 -- all 16 bits */
1150     dequant_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_weigh_mat + 8));
1151 
1152     temp0 = _mm_mullo_epi16(scalemat_r0_r1,
1153                             dequant_r0_r1);  // b00*q00 b01*q01 b02*q02 b03*q03 b10*q10 b11*q11
1154                                              // b12*q12 b13*q13 -- 16 bit result
1155 
1156     temp1 = _mm_mullo_epi16(scalemat_r2_r3, dequant_r2_r3);
1157 
1158     /* b00*q00 0 b01*q01 0 b02*q02 0 b03*q03 0 -- 16 bit long */
1159     temp4 = _mm_unpacklo_epi16(temp0, zero_8x16b);
1160 
1161     /* b10*q10 0 b11*q11 0 b12*q12 0 b13*q13 0 -- 16 bit long */
1162     temp5 = _mm_unpackhi_epi16(temp0, zero_8x16b);
1163 
1164     /* b20*q20 0 b21*q21 0 b22*q22 0 b23*q23 0 -- 16 bit long */
1165     temp6 = _mm_unpacklo_epi16(temp1, zero_8x16b);
1166 
1167     /* b30*q30 0 b31*q31 0 b32*q32 0 b33*q33 0 -- 16 bit long */
1168     temp7 = _mm_unpackhi_epi16(temp1, zero_8x16b);
1169 
1170     /* a00 0 a01 0 a02 0 a03 0 -- 16 bit long */
1171     src_r0 = _mm_unpacklo_epi16(src_r0_r1, zero_8x16b);
1172     /* a10 0 a11 0 a12 0 a13 0 -- 16 bit long */
1173     src_r1 = _mm_unpackhi_epi16(src_r0_r1, zero_8x16b);
1174     /* a20 0 a21 0 a22 0 a23 0 -- 16 bit long */
1175     src_r2 = _mm_unpacklo_epi16(src_r2_r3, zero_8x16b);
1176     /* a30 0 a31 0 a32 0 a33 0 -- 16 bit long */
1177     src_r3 = _mm_unpackhi_epi16(src_r2_r3, zero_8x16b);
1178 
1179     temp4 = _mm_madd_epi16(src_r0, temp4);
1180     temp5 = _mm_madd_epi16(src_r1, temp5);
1181     temp6 = _mm_madd_epi16(src_r2, temp6);
1182     temp7 = _mm_madd_epi16(src_r3, temp7);
1183 
1184     if(u4_qp_div_6 >= 4)
1185     {
1186         resq_r0 = _mm_slli_epi32(temp4, u4_qp_div_6 - 4);
1187         resq_r1 = _mm_slli_epi32(temp5, u4_qp_div_6 - 4);
1188         resq_r2 = _mm_slli_epi32(temp6, u4_qp_div_6 - 4);
1189         resq_r3 = _mm_slli_epi32(temp7, u4_qp_div_6 - 4);
1190     }
1191     else
1192     {
1193         temp4 = _mm_add_epi32(temp4, add_rshift);
1194         temp5 = _mm_add_epi32(temp5, add_rshift);
1195         temp6 = _mm_add_epi32(temp6, add_rshift);
1196         temp7 = _mm_add_epi32(temp7, add_rshift);
1197         resq_r0 = _mm_srai_epi32(temp4, 4 - u4_qp_div_6);
1198         resq_r1 = _mm_srai_epi32(temp5, 4 - u4_qp_div_6);
1199         resq_r2 = _mm_srai_epi32(temp6, 4 - u4_qp_div_6);
1200         resq_r3 = _mm_srai_epi32(temp7, 4 - u4_qp_div_6);
1201     }
1202 
1203     resq_r0 = _mm_insert_epi32(resq_r0, (WORD32) pi2_dc_src[0], 0);
1204     /* Perform Inverse transform */
1205     /*-------------------------------------------------------------*/
1206     /* IDCT [ Horizontal transformation ]                          */
1207     /*-------------------------------------------------------------*/
1208     // Matrix transpose
1209     /*
1210      *  a0 a1 a2 a3
1211      *  b0 b1 b2 b3
1212      *  c0 c1 c2 c3
1213      *  d0 d1 d2 d3
1214      */
1215     /* a0 b0 a1 b1 */
1216     temp1 = _mm_unpacklo_epi32(resq_r0, resq_r1);
1217     /* c0 d0 c1 d1 */
1218     temp3 = _mm_unpacklo_epi32(resq_r2, resq_r3);
1219     /* a2 b2 a3 b3 */
1220     temp2 = _mm_unpackhi_epi32(resq_r0, resq_r1);
1221     /* c2 d2 c3 d3 */
1222     temp4 = _mm_unpackhi_epi32(resq_r2, resq_r3);
1223     /* a0 b0 c0 d0 */
1224     resq_r0 = _mm_unpacklo_epi64(temp1, temp3);
1225     /* a1 b1 c1 d1 */
1226     resq_r1 = _mm_unpackhi_epi64(temp1, temp3);
1227     /* a2 b2 c2 d2 */
1228     resq_r2 = _mm_unpacklo_epi64(temp2, temp4);
1229     /* a3 b3 c3 d3 */
1230     resq_r3 = _mm_unpackhi_epi64(temp2, temp4);
1231     /* Transform starts -- horizontal transform */
1232 
1233     /*------------------------------------------------------------------*/
1234     /* z0 = w0 + w2                                             */
1235     temp0 = _mm_add_epi32(resq_r0, resq_r2);
1236     /* z1 = w0 - w2                                             */
1237     temp1 = _mm_sub_epi32(resq_r0, resq_r2);
1238     /* z2 = (w1 >> 1) - w3                                      */
1239     temp2 = _mm_srai_epi32(resq_r1, 1);
1240     temp2 = _mm_sub_epi32(temp2, resq_r3);
1241     /* z3 = w1 + (w3 >> 1)                                      */
1242     temp3 = _mm_srai_epi32(resq_r3, 1);  //(w3>>1) + w1
1243     temp3 = _mm_add_epi32(temp3, resq_r1);
1244     /*----------------------------------------------------------*/
1245     /* x0 = z0 + z3                                             */
1246     resq_r0 = _mm_add_epi32(temp0, temp3);
1247     /* x1 = z1 + z2                                             */
1248     resq_r1 = _mm_add_epi32(temp1, temp2);
1249     /* x2 = z1 - z2                                             */
1250     resq_r2 = _mm_sub_epi32(temp1, temp2);
1251     /* x3 = z0 - z3                                             */
1252     resq_r3 = _mm_sub_epi32(temp0, temp3);
1253     // Matrix transpose
1254     /*
1255      *  a0 b0 c0 d0
1256      *  a1 b1 c1 d1
1257      *  a2 b2 c2 d2
1258      *  a3 b3 c3 d3
1259      */
1260     /* a0 a1 b0 b1 */
1261     temp1 = _mm_unpacklo_epi32(resq_r0, resq_r1);
1262     /* a2 a3 b2 b3 */
1263     temp3 = _mm_unpacklo_epi32(resq_r2, resq_r3);
1264     /* c0 c1 d0 d1 */
1265     temp2 = _mm_unpackhi_epi32(resq_r0, resq_r1);
1266     /* c2 c3 d2 d3 */
1267     temp4 = _mm_unpackhi_epi32(resq_r2, resq_r3);
1268     /* a0 a1 a2 a3 */
1269     resq_r0 = _mm_unpacklo_epi64(temp1, temp3);
1270     /* b0 b1 b2 b3 */
1271     resq_r1 = _mm_unpackhi_epi64(temp1, temp3);
1272     /* c0 c1 c2 c3 */
1273     resq_r2 = _mm_unpacklo_epi64(temp2, temp4);
1274     /* d0 d1 d2 d3 */
1275     resq_r3 = _mm_unpackhi_epi64(temp2, temp4);
1276     /* Transform ends -- horizontal transform */
1277 
1278     temp0 = _mm_packs_epi32(resq_r0, resq_r1);
1279     temp1 = _mm_packs_epi32(resq_r2, resq_r3);
1280 
1281     _mm_storeu_si128((__m128i *) (&pi2_tmp[0]), temp0);
1282     _mm_storeu_si128((__m128i *) (&pi2_tmp[2 * 4]), temp1);
1283 
1284     /* Load pred buffer */
1285     pred_r0 = _mm_loadl_epi64((__m128i *) (&pu1_pred[0]));
1286     pred_r1 = _mm_loadl_epi64((__m128i *) (&pu1_pred[i4_pred_stride]));
1287     pred_r2 = _mm_loadl_epi64((__m128i *) (&pu1_pred[2 * i4_pred_stride]));
1288     pred_r3 = _mm_loadl_epi64((__m128i *) (&pu1_pred[3 * i4_pred_stride]));
1289 
1290     pred_r0 = _mm_and_si128(pred_r0, chroma_mask);
1291     pred_r1 = _mm_and_si128(pred_r1, chroma_mask);
1292     pred_r2 = _mm_and_si128(pred_r2, chroma_mask);
1293     pred_r3 = _mm_and_si128(pred_r3, chroma_mask);
1294 
1295     pred_r0 = _mm_unpacklo_epi64(pred_r0, pred_r1);
1296     pred_r1 = _mm_unpacklo_epi64(pred_r2, pred_r3);
1297 
1298     /*--------------------------------------------------------------*/
1299     /* IDCT [ Vertical transformation] and Xij = (xij + 32)>>6      */
1300     /*                                                              */
1301     /* Add the prediction and store it back to same buffer          */
1302     /*--------------------------------------------------------------*/
1303     /* z0j = y0j + y2j                                         */
1304     temp0 = _mm_add_epi32(resq_r0, resq_r2);
1305     /* z1j = y0j - y2j                                                        */
1306     temp1 = _mm_sub_epi32(resq_r0, resq_r2);
1307     /* z2j = (y1j>>1) - y3j */
1308     temp2 = _mm_srai_epi32(resq_r1, 1);
1309     temp2 = _mm_sub_epi32(temp2, resq_r3);
1310     /* z3j = y1j + (y3j>>1) */
1311     temp3 = _mm_srai_epi32(resq_r3, 1);
1312     temp3 = _mm_add_epi32(temp3, resq_r1);
1313 
1314     /* x0j = z0j + z3j                                                        */
1315     temp4 = _mm_add_epi32(temp0, temp3);
1316     temp4 = _mm_add_epi32(temp4, value_32);
1317     temp4 = _mm_srai_epi32(temp4, 6);
1318     /* x1j = z1j + z2j                                                        */
1319     temp5 = _mm_add_epi32(temp1, temp2);
1320     temp5 = _mm_add_epi32(temp5, value_32);
1321     temp5 = _mm_srai_epi32(temp5, 6);
1322     /* x2j = z1j - z2j                                                        */
1323     temp6 = _mm_sub_epi32(temp1, temp2);
1324     temp6 = _mm_add_epi32(temp6, value_32);
1325     temp6 = _mm_srai_epi32(temp6, 6);
1326     /* x3j = z0j - z3j                                                        */
1327     temp7 = _mm_sub_epi32(temp0, temp3);
1328     temp7 = _mm_add_epi32(temp7, value_32);
1329     temp7 = _mm_srai_epi32(temp7, 6);
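    /*
     * Second-pass normalisation: as in the reference C code, each column
     * result is rounded with Xij = (xij + 32) >> 6, a rounded division by 64
     * that removes the gain accumulated over the two transform passes.
     */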
1330 
1331     /* 32-bit to 16-bit conversion */
1332     temp0 = _mm_packs_epi32(temp4, temp5);
1333     temp1 = _mm_packs_epi32(temp6, temp7);
1334 
1335     /* Saturate all values < -255 to -255 and retain the rest as it is */
1336     temp4 = _mm_max_epi16(temp0, neg_255_8x16b);
1337     /* Saturate all values > 255 to 255 and retain the rest as it is */
1338     temp4 = _mm_min_epi16(temp4, pos_255_8x16b);
1339 
1340     /* Saturate all values < -255 to -255 and retain the rest as it is */
1341     temp5 = _mm_max_epi16(temp1, neg_255_8x16b);
1342     /* Saturate all values > 255 to 255 and retain the rest as it is */
1343     temp5 = _mm_min_epi16(temp5, pos_255_8x16b);
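    /*
     * Clamping the residuals to [-255, 255] guarantees that adding an 8-bit
     * prediction sample cannot overflow the signed 16-bit lanes before the
     * final unsigned-8-bit pack below.
     */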
1344 
1345     temp0 = _mm_add_epi16(temp4, pred_r0);
1346     temp1 = _mm_add_epi16(temp5, pred_r1);
1347 
1348     /*------------------------------------------------------------------*/
1349     /* Clipping the results to 8 bits */
1350     sign_reg = _mm_cmpgt_epi16(temp0, zero_8x16b);
1351     temp0 = _mm_and_si128(temp0, sign_reg);
1352     sign_reg = _mm_cmpgt_epi16(temp1, zero_8x16b);
1353     temp1 = _mm_and_si128(temp1, sign_reg);
1354 
1355     resq_r0 = _mm_packus_epi16(temp0, temp1);
1356     resq_r1 = _mm_srli_si128(resq_r0, 4);
1357     resq_r2 = _mm_srli_si128(resq_r1, 4);
1358     resq_r3 = _mm_srli_si128(resq_r2, 4);
1359 
1360     resq_r0 = _mm_cvtepu8_epi16(resq_r0);
1361     resq_r1 = _mm_cvtepu8_epi16(resq_r1);
1362     resq_r2 = _mm_cvtepu8_epi16(resq_r2);
1363     resq_r3 = _mm_cvtepu8_epi16(resq_r3);
1364 
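    /*
     * The chroma recon buffer stores Cb and Cr interleaved, so the 0xFF00 mask
     * below keeps the other component's bytes of the loaded output rows, and
     * the reconstructed samples (already clamped to [0, 255] by the pack above)
     * are added into the low bytes. Roughly, per pixel (sketch only):
     *
     *     pu1_out[2 * x] = (UWORD8) (pred + res);
     */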
1365     chroma_mask = _mm_set1_epi16(0xFF00);
1366     out_r0 = _mm_loadl_epi64((__m128i *) (&pu1_out[0]));
1367     out_r1 = _mm_loadl_epi64((__m128i *) (&pu1_out[i4_out_stride]));
1368     out_r2 = _mm_loadl_epi64((__m128i *) (&pu1_out[2 * i4_out_stride]));
1369     out_r3 = _mm_loadl_epi64((__m128i *) (&pu1_out[3 * i4_out_stride]));
1370 
1371     out_r0 = _mm_and_si128(out_r0, chroma_mask);
1372     out_r1 = _mm_and_si128(out_r1, chroma_mask);
1373     out_r2 = _mm_and_si128(out_r2, chroma_mask);
1374     out_r3 = _mm_and_si128(out_r3, chroma_mask);
1375 
1376     out_r0 = _mm_add_epi8(out_r0, resq_r0);
1377     out_r1 = _mm_add_epi8(out_r1, resq_r1);
1378     out_r2 = _mm_add_epi8(out_r2, resq_r2);
1379     out_r3 = _mm_add_epi8(out_r3, resq_r3);
1380 
1381     _mm_storel_epi64((__m128i *) (&pu1_out[0]), out_r0);
1382     _mm_storel_epi64((__m128i *) (&pu1_out[i4_out_stride]), out_r1);
1383     _mm_storel_epi64((__m128i *) (&pu1_out[2 * i4_out_stride]), out_r2);
1384     _mm_storel_epi64((__m128i *) (&pu1_out[3 * i4_out_stride]), out_r3);
1385 }
1386 
1387 void isvc_iquant_itrans_recon_res_chroma_4x4_sse42(
1388     buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_res_pred,
1389     buffer_container_t *ps_res, buffer_container_t *ps_rec,
1390     iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants, WORD16 *pi2_tmp, WORD16 *pi2_dc_src,
1391     WORD32 i4_iq_start_idx, UWORD8 u1_res_accumulate)
1392 {
1393     WORD16 *pi2_src = (WORD16 *) ps_src->pv_data;
1394     WORD16 *pi2_res = (WORD16 *) ps_res->pv_data;
1395     WORD16 *pi2_res_ptr = pi2_res;
1396     UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
1397     UWORD8 *pu1_out = (UWORD8 *) ps_rec->pv_data;
1398     WORD32 i4_src_stride = ps_src->i4_data_stride;
1399     WORD32 i4_res_stride = ps_res->i4_data_stride;
1400     WORD32 i4_pred_stride = ps_pred->i4_data_stride;
1401     WORD32 i4_out_stride = ps_rec->i4_data_stride;
1402     const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat;
1403     const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat;
1404     UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6;
1405     __m128i src_r0_r1, src_r2_r3;
1406     __m128i src_r0, src_r1, src_r2, src_r3;
1407     __m128i scalemat_r0_r1, scalemat_r2_r3;
1408     __m128i pred_r0, pred_r1, pred_r2, pred_r3;
1409     __m128i sign_reg, dequant_r0_r1, dequant_r2_r3;
1410     /* all bits reset to zero */
1411     __m128i zero_8x16b = _mm_setzero_si128();
1412     __m128i neg_255_8x16b = _mm_set1_epi16(-((WORD16) UINT8_MAX));
1413     __m128i pos_255_8x16b = _mm_set1_epi16(((WORD16) UINT8_MAX));
1414     __m128i temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
1415     __m128i resq_r0, resq_r1, resq_r2, resq_r3;
1416     __m128i add_rshift = _mm_set1_epi32((u4_qp_div_6 < 4) ? (1 << (3 - u4_qp_div_6)) : 0);
1417     __m128i value_32 = _mm_set1_epi32(32);
1418     __m128i chroma_mask = _mm_set1_epi16(0xFF);
1419     __m128i out_r0, out_r1, out_r2, out_r3;
1420     __m128i res_r0, res_r1, res_r2, res_r3;
1421 
1422     ASSERT(4 == i4_src_stride);
1423     ASSERT(0 == u1_res_accumulate);
1424 
1425     UNUSED(i4_src_stride);
1426     UNUSED(u1_res_accumulate);
1427     UNUSED(ps_res_pred);
1428     UNUSED(i4_iq_start_idx);
1429 
1430     /*************************************************************/
1431     /* Dequantization of coefficients, implemented here with    */
1432     /* SSE4.2 SIMD operations                                    */
1433     /*************************************************************/
1434     /* a00 a01 a02 a03 a10 a11 a12 a13 -- the source
1435     matrix 0th,1st row */
1436     src_r0_r1 = _mm_loadu_si128((__m128i *) (pi2_src));
1437 
1438     /* a20 a21 a22 a23 a30 a31 a32 a33 -- the
1439     source matrix 2nd,3rd row */
1440     src_r2_r3 = _mm_loadu_si128((__m128i *) (pi2_src + 8));
1441 
1442     /* b00 b01 b02 b03 b10 b11 b12 b13 -- the
1443     scaling matrix 0th,1st row */
1444     scalemat_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_iscal_mat));
1445 
1446     /* b20 b21 b22 b23 b30 b31 b32 b33 -- the
1447     scaling matrix 2nd,3rd row */
1448     scalemat_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_iscal_mat + 8));
1449 
1450     /* q00 q01 q02 q03 q10 q11
1451     q12 q13 -- all 16 bits */
1452     dequant_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_weigh_mat));
1453 
1454     /* q20 q21 q22 q23 q30 q31
1455     q32 q33 -- all 16 bits */
1456     dequant_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_weigh_mat + 8));
1457 
1458     temp0 = _mm_mullo_epi16(scalemat_r0_r1,
1459                             dequant_r0_r1);  // b00*q00 b01*q01 b02*q02 b03*q03 b10*q10 b11*q11
1460                                              // b12*q12 b13*q13 -- 16 bit result
1461 
1462     temp1 = _mm_mullo_epi16(scalemat_r2_r3, dequant_r2_r3);
1463 
1464     /* b00*q00 0 b01*q01 0 b02*q02 0 b03*q03 0 -- 16 bit long */
1465     temp4 = _mm_unpacklo_epi16(temp0, zero_8x16b);
1466 
1467     /* b10*q10 0 b11*q11 0 b12*q12 0 b13*q13 0 -- 16 bit long */
1468     temp5 = _mm_unpackhi_epi16(temp0, zero_8x16b);
1469 
1470     /* b20*q20 0 b21*q21 0 b22*q22 0 b23*q23 0 -- 16 bit long */
1471     temp6 = _mm_unpacklo_epi16(temp1, zero_8x16b);
1472 
1473     /* b30*q30 0 b31*q31 0 b32*q32 0 b33*q33 0 -- 16 bit long */
1474     temp7 = _mm_unpackhi_epi16(temp1, zero_8x16b);
1475 
1476     /* a00 0 a01 0 a02 0 a03 0 -- 16 bit long */
1477     src_r0 = _mm_unpacklo_epi16(src_r0_r1, zero_8x16b);
1478     /* a10 0 a11 0 a12 0 a13 0 -- 16 bit long */
1479     src_r1 = _mm_unpackhi_epi16(src_r0_r1, zero_8x16b);
1480     /* a20 0 a21 0 a22 0 a23 0 -- 16 bit long */
1481     src_r2 = _mm_unpacklo_epi16(src_r2_r3, zero_8x16b);
1482     /* a30 0 a31 0 a32 0 a33 0 -- 16 bit long */
1483     src_r3 = _mm_unpackhi_epi16(src_r2_r3, zero_8x16b);
1484 
1485     temp4 = _mm_madd_epi16(src_r0, temp4);
1486     temp5 = _mm_madd_epi16(src_r1, temp5);
1487     temp6 = _mm_madd_epi16(src_r2, temp6);
1488     temp7 = _mm_madd_epi16(src_r3, temp7);
1489 
1490     if(u4_qp_div_6 >= 4)
1491     {
1492         resq_r0 = _mm_slli_epi32(temp4, u4_qp_div_6 - 4);
1493         resq_r1 = _mm_slli_epi32(temp5, u4_qp_div_6 - 4);
1494         resq_r2 = _mm_slli_epi32(temp6, u4_qp_div_6 - 4);
1495         resq_r3 = _mm_slli_epi32(temp7, u4_qp_div_6 - 4);
1496     }
1497     else
1498     {
1499         temp4 = _mm_add_epi32(temp4, add_rshift);
1500         temp5 = _mm_add_epi32(temp5, add_rshift);
1501         temp6 = _mm_add_epi32(temp6, add_rshift);
1502         temp7 = _mm_add_epi32(temp7, add_rshift);
1503         resq_r0 = _mm_srai_epi32(temp4, 4 - u4_qp_div_6);
1504         resq_r1 = _mm_srai_epi32(temp5, 4 - u4_qp_div_6);
1505         resq_r2 = _mm_srai_epi32(temp6, 4 - u4_qp_div_6);
1506         resq_r3 = _mm_srai_epi32(temp7, 4 - u4_qp_div_6);
1507     }
1508 
1509     resq_r0 = _mm_insert_epi32(resq_r0, (WORD32) pi2_dc_src[0], 0);
1510     /* Perform Inverse transform */
1511     /*-------------------------------------------------------------*/
1512     /* IDCT [ Horizontal transformation ]                          */
1513     /*-------------------------------------------------------------*/
1514     // Matrix transpose
1515     /*
1516      *  a0 a1 a2 a3
1517      *  b0 b1 b2 b3
1518      *  c0 c1 c2 c3
1519      *  d0 d1 d2 d3
1520      */
1521     /* a0 b0 a1 b1 */
1522     temp1 = _mm_unpacklo_epi32(resq_r0, resq_r1);
1523     /* c0 d0 c1 d1 */
1524     temp3 = _mm_unpacklo_epi32(resq_r2, resq_r3);
1525     /* a2 b2 a3 b3 */
1526     temp2 = _mm_unpackhi_epi32(resq_r0, resq_r1);
1527     /* c2 d2 c3 d3 */
1528     temp4 = _mm_unpackhi_epi32(resq_r2, resq_r3);
1529     /* a0 b0 c0 d0 */
1530     resq_r0 = _mm_unpacklo_epi64(temp1, temp3);
1531     /* a1 b1 c1 d1 */
1532     resq_r1 = _mm_unpackhi_epi64(temp1, temp3);
1533     /* a2 b2 c2 d2 */
1534     resq_r2 = _mm_unpacklo_epi64(temp2, temp4);
1535     /* a3 b3 c3 d3 */
1536     resq_r3 = _mm_unpackhi_epi64(temp2, temp4);
1537     /* Transform starts -- horizontal transform */
1538 
1539     /*------------------------------------------------------------------*/
1540     /* z0 = w0 + w2                                             */
1541     temp0 = _mm_add_epi32(resq_r0, resq_r2);
1542     /* z1 = w0 - w2                                             */
1543     temp1 = _mm_sub_epi32(resq_r0, resq_r2);
1544     /* z2 = (w1 >> 1) - w3                                      */
1545     temp2 = _mm_srai_epi32(resq_r1, 1);
1546     temp2 = _mm_sub_epi32(temp2, resq_r3);
1547     /* z3 = w1 + (w3 >> 1)                                      */
1548     temp3 = _mm_srai_epi32(resq_r3, 1);
1549     temp3 = _mm_add_epi32(temp3, resq_r1);
1550     /*----------------------------------------------------------*/
1551     /* x0 = z0 + z3                                             */
1552     resq_r0 = _mm_add_epi32(temp0, temp3);
1553     /* x1 = z1 + z2                                             */
1554     resq_r1 = _mm_add_epi32(temp1, temp2);
1555     /* x2 = z1 - z2                                             */
1556     resq_r2 = _mm_sub_epi32(temp1, temp2);
1557     /* x3 = z0 - z3                                             */
1558     resq_r3 = _mm_sub_epi32(temp0, temp3);
1559     // Matrix transpose
1560     /*
1561      *  a0 b0 c0 d0
1562      *  a1 b1 c1 d1
1563      *  a2 b2 c2 d2
1564      *  a3 b3 c3 d3
1565      */
1566     /* a0 a1 b0 b1 */
1567     temp1 = _mm_unpacklo_epi32(resq_r0, resq_r1);
1568     /* a2 a3 b2 b3 */
1569     temp3 = _mm_unpacklo_epi32(resq_r2, resq_r3);
1570     /* c0 c1 d0 d1 */
1571     temp2 = _mm_unpackhi_epi32(resq_r0, resq_r1);
1572     /* c2 c3 d2 d3 */
1573     temp4 = _mm_unpackhi_epi32(resq_r2, resq_r3);
1574     /* a0 a1 a2 a3 */
1575     resq_r0 = _mm_unpacklo_epi64(temp1, temp3);
1576     /* b0 b1 b2 b3 */
1577     resq_r1 = _mm_unpackhi_epi64(temp1, temp3);
1578     /* c0 c1 c2 c3 */
1579     resq_r2 = _mm_unpacklo_epi64(temp2, temp4);
1580     /* d0 d1 d2 d3 */
1581     resq_r3 = _mm_unpackhi_epi64(temp2, temp4);
1582     /* Transform ends -- horizontal transform */
1583 
1584     temp0 = _mm_packs_epi32(resq_r0, resq_r1);
1585     temp1 = _mm_packs_epi32(resq_r2, resq_r3);
1586 
1587     _mm_storeu_si128((__m128i *) (&pi2_tmp[0]), temp0);
1588     _mm_storeu_si128((__m128i *) (&pi2_tmp[2 * 4]), temp1);
1589 
1590     /* Load pred buffer */
1591     pred_r0 = _mm_loadl_epi64((__m128i *) (&pu1_pred[0]));
1592     pred_r1 = _mm_loadl_epi64((__m128i *) (&pu1_pred[i4_pred_stride]));
1593     pred_r2 = _mm_loadl_epi64((__m128i *) (&pu1_pred[2 * i4_pred_stride]));
1594     pred_r3 = _mm_loadl_epi64((__m128i *) (&pu1_pred[3 * i4_pred_stride]));
1595 
1596     pred_r0 = _mm_and_si128(pred_r0, chroma_mask);
1597     pred_r1 = _mm_and_si128(pred_r1, chroma_mask);
1598     pred_r2 = _mm_and_si128(pred_r2, chroma_mask);
1599     pred_r3 = _mm_and_si128(pred_r3, chroma_mask);
1600 
1601     pred_r0 = _mm_cvtepu16_epi32(pred_r0);
1602     pred_r1 = _mm_cvtepu16_epi32(pred_r1);
1603     pred_r2 = _mm_cvtepu16_epi32(pred_r2);
1604     pred_r3 = _mm_cvtepu16_epi32(pred_r3);
1605 
1606     /*--------------------------------------------------------------*/
1607     /* IDCT [ Vertical transformation] and Xij = (xij + 32)>>6      */
1608     /*                                                              */
1609     /* Add the prediction and store it back to same buffer          */
1610     /*--------------------------------------------------------------*/
1611     /* z0j = y0j + y2j                                         */
1612     temp0 = _mm_add_epi32(resq_r0, resq_r2);
1613     /* z1j = y0j - y2j                                                        */
1614     temp1 = _mm_sub_epi32(resq_r0, resq_r2);
1615     /* z2j = (y1j>>1) - y3j */
1616     temp2 = _mm_srai_epi32(resq_r1, 1);
1617     temp2 = _mm_sub_epi32(temp2, resq_r3);
1618     /* z3j = y1j + (y3j>>1) */
1619     temp3 = _mm_srai_epi32(resq_r3, 1);
1620     temp3 = _mm_add_epi32(temp3, resq_r1);
1621 
1622     /* x0j = z0j + z3j                                                        */
1623     temp4 = _mm_add_epi32(temp0, temp3);
1624     temp4 = _mm_add_epi32(temp4, value_32);
1625     temp4 = _mm_srai_epi32(temp4, 6);
1626     /* x1j = z1j + z2j                                                        */
1627     temp5 = _mm_add_epi32(temp1, temp2);
1628     temp5 = _mm_add_epi32(temp5, value_32);
1629     temp5 = _mm_srai_epi32(temp5, 6);
1630     /* x2j = z1j - z2j                                                        */
1631     temp6 = _mm_sub_epi32(temp1, temp2);
1632     temp6 = _mm_add_epi32(temp6, value_32);
1633     temp6 = _mm_srai_epi32(temp6, 6);
1634     /* x3j = z0j - z3j                                                        */
1635     temp7 = _mm_sub_epi32(temp0, temp3);
1636     temp7 = _mm_add_epi32(temp7, value_32);
1637     temp7 = _mm_srai_epi32(temp7, 6);
1638 
1639     /* 32-bit to 16-bit conversion */
1640     temp0 = _mm_packs_epi32(temp4, temp5);
1641     temp1 = _mm_packs_epi32(temp6, temp7);
1642 
1643     /* Saturate all values < -255 to -255 and retain the rest as it is */
1644     temp0 = _mm_max_epi16(temp0, neg_255_8x16b);
1645     /* Saturate all values > 255 to 255 and retain the rest as it is */
1646     temp0 = _mm_min_epi16(temp0, pos_255_8x16b);
1647 
1648     /* Saturate all values < -255 to -255 and retain the rest as it is */
1649     temp1 = _mm_max_epi16(temp1, neg_255_8x16b);
1650     /* Saturate all values > 255 to 255 and retain the rest as it is */
1651     temp1 = _mm_min_epi16(temp1, pos_255_8x16b);
1652 
1653     chroma_mask = _mm_set1_epi32(0xffff0000);
1654     out_r0 = _mm_loadu_si128((__m128i *) (&pi2_res_ptr[0 * i4_res_stride]));
1655     out_r1 = _mm_loadu_si128((__m128i *) (&pi2_res_ptr[1 * i4_res_stride]));
1656     out_r2 = _mm_loadu_si128((__m128i *) (&pi2_res_ptr[2 * i4_res_stride]));
1657     out_r3 = _mm_loadu_si128((__m128i *) (&pi2_res_ptr[3 * i4_res_stride]));
1658 
1659     out_r0 = _mm_and_si128(out_r0, chroma_mask);
1660     out_r1 = _mm_and_si128(out_r1, chroma_mask);
1661     out_r2 = _mm_and_si128(out_r2, chroma_mask);
1662     out_r3 = _mm_and_si128(out_r3, chroma_mask);
1663 
1664     res_r0 = _mm_cvtepu16_epi32(temp0);
1665     res_r2 = _mm_cvtepu16_epi32(temp1);
1666     res_r1 = _mm_srli_si128(temp0, 8);
1667     res_r3 = _mm_srli_si128(temp1, 8);
1668     res_r1 = _mm_cvtepu16_epi32(res_r1);
1669     res_r3 = _mm_cvtepu16_epi32(res_r3);
1670 
1671     out_r0 = _mm_add_epi16(out_r0, res_r0);
1672     out_r1 = _mm_add_epi16(out_r1, res_r1);
1673     out_r2 = _mm_add_epi16(out_r2, res_r2);
1674     out_r3 = _mm_add_epi16(out_r3, res_r3);
1675 
1676     _mm_storeu_si128((__m128i *) (&pi2_res_ptr[0 * i4_res_stride]), out_r0);
1677     _mm_storeu_si128((__m128i *) (&pi2_res_ptr[1 * i4_res_stride]), out_r1);
1678     _mm_storeu_si128((__m128i *) (&pi2_res_ptr[2 * i4_res_stride]), out_r2);
1679     _mm_storeu_si128((__m128i *) (&pi2_res_ptr[3 * i4_res_stride]), out_r3);
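    /*
     * The 16-bit residual buffer is interleaved the same way as the chroma
     * recon buffer: the 0xffff0000 mask above preserves the other component's
     * residual in the high half of each 32-bit lane while the low 16 bits are
     * overwritten with the residuals computed here.
     */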
1680 
1681     resq_r0 = _mm_add_epi16(pred_r0, res_r0);
1682     resq_r1 = _mm_add_epi16(pred_r1, res_r1);
1683     resq_r2 = _mm_add_epi16(pred_r2, res_r2);
1684     resq_r3 = _mm_add_epi16(pred_r3, res_r3);
1685 
1686     temp0 = _mm_packus_epi32(resq_r0, resq_r1);
1687     temp1 = _mm_packus_epi32(resq_r2, resq_r3);
1688 
1689     /*------------------------------------------------------------------*/
1690     /* Clipping the results to 8 bits */
1691     sign_reg = _mm_cmpgt_epi16(temp0, zero_8x16b);
1692     temp0 = _mm_and_si128(temp0, sign_reg);
1693     sign_reg = _mm_cmpgt_epi16(temp1, zero_8x16b);
1694     temp1 = _mm_and_si128(temp1, sign_reg);
1695 
1696     resq_r0 = _mm_packus_epi16(temp0, temp1);
1697     resq_r1 = _mm_srli_si128(resq_r0, 4);
1698     resq_r2 = _mm_srli_si128(resq_r1, 4);
1699     resq_r3 = _mm_srli_si128(resq_r2, 4);
1700 
1701     resq_r0 = _mm_cvtepu8_epi16(resq_r0);
1702     resq_r1 = _mm_cvtepu8_epi16(resq_r1);
1703     resq_r2 = _mm_cvtepu8_epi16(resq_r2);
1704     resq_r3 = _mm_cvtepu8_epi16(resq_r3);
1705 
1706     chroma_mask = _mm_set1_epi16(0xff00);
1707     out_r0 = _mm_loadl_epi64((__m128i *) (&pu1_out[0]));
1708     out_r1 = _mm_loadl_epi64((__m128i *) (&pu1_out[i4_out_stride]));
1709     out_r2 = _mm_loadl_epi64((__m128i *) (&pu1_out[2 * i4_out_stride]));
1710     out_r3 = _mm_loadl_epi64((__m128i *) (&pu1_out[3 * i4_out_stride]));
1711 
1712     out_r0 = _mm_and_si128(out_r0, chroma_mask);
1713     out_r1 = _mm_and_si128(out_r1, chroma_mask);
1714     out_r2 = _mm_and_si128(out_r2, chroma_mask);
1715     out_r3 = _mm_and_si128(out_r3, chroma_mask);
1716 
1717     out_r0 = _mm_add_epi8(out_r0, resq_r0);
1718     out_r1 = _mm_add_epi8(out_r1, resq_r1);
1719     out_r2 = _mm_add_epi8(out_r2, resq_r2);
1720     out_r3 = _mm_add_epi8(out_r3, resq_r3);
1721 
1722     _mm_storel_epi64((__m128i *) (&pu1_out[0]), out_r0);
1723     _mm_storel_epi64((__m128i *) (&pu1_out[i4_out_stride]), out_r1);
1724     _mm_storel_epi64((__m128i *) (&pu1_out[2 * i4_out_stride]), out_r2);
1725     _mm_storel_epi64((__m128i *) (&pu1_out[3 * i4_out_stride]), out_r3);
1726 }
1727 
1728 void isvc_iquant_itrans_recon_res_chroma_4x4_with_res_acc_sse42(
1729     buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_res_pred,
1730     buffer_container_t *ps_res, buffer_container_t *ps_rec,
1731     iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants, WORD16 *pi2_tmp, WORD16 *pi2_dc_src,
1732     WORD32 i4_iq_start_idx, UWORD8 u1_res_accumulate)
1733 {
1734     WORD16 *pi2_src = (WORD16 *) ps_src->pv_data;
1735     WORD16 *pi2_res = (WORD16 *) ps_res->pv_data;
1736     WORD16 *pi2_res_pred = (WORD16 *) ps_res_pred->pv_data;
1737     UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
1738     UWORD8 *pu1_out = (UWORD8 *) ps_rec->pv_data;
1739     WORD32 i4_src_stride = ps_src->i4_data_stride;
1740     WORD32 i4_res_stride = ps_res->i4_data_stride;
1741     WORD32 i4_res_pred_stride = ps_res_pred->i4_data_stride;
1742     WORD32 i4_pred_stride = ps_pred->i4_data_stride;
1743     WORD32 i4_out_stride = ps_rec->i4_data_stride;
1744     const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat;
1745     const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat;
1746     UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6;
1747     __m128i src_r0_r1, src_r2_r3;
1748     __m128i src_r0, src_r1, src_r2, src_r3;
1749     __m128i scalemat_r0_r1, scalemat_r2_r3;
1750     __m128i pred_r0, pred_r1, pred_r2, pred_r3;
1751     __m128i res_pred_r0, res_pred_r1, res_pred_r2, res_pred_r3;
1752     __m128i res_r0, res_r1, res_r2, res_r3;
1753     __m128i dequant_r0_r1, dequant_r2_r3;
1754     /* all bits reset to zero */
1755     __m128i zero_8x16b = _mm_setzero_si128();
1756     __m128i reg_chroma = _mm_set1_epi32(0xFFFF);
1757     __m128i neg_255_8x16b = _mm_set1_epi16(-((WORD16) UINT8_MAX));
1758     __m128i pos_255_8x16b = _mm_set1_epi16(((WORD16) UINT8_MAX));
1759     __m128i temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
1760     __m128i resq_r0, resq_r1, resq_r2, resq_r3;
1761     __m128i add_rshift = _mm_set1_epi32((u4_qp_div_6 < 4) ? (1 << (3 - u4_qp_div_6)) : 0);
1762     __m128i value_32 = _mm_set1_epi32(32);
1763     __m128i chroma_mask = _mm_set1_epi16(0xFF);
1764     __m128i out_r0, out_r1, out_r2, out_r3;
1765     __m128i mask_r0;
1766 
1767     ASSERT(4 == i4_src_stride);
1768     ASSERT(1 == u1_res_accumulate);
1769 
1770     UNUSED(i4_src_stride);
1771     UNUSED(u1_res_accumulate);
1772     UNUSED(i4_iq_start_idx);
1773 
1774     /*************************************************************/
1775     /* Dequantization of coefficients, implemented here with    */
1776     /* SSE4.2 SIMD operations                                    */
1777     /*************************************************************/
1778     /* a00 a01 a02 a03 a10 a11 a12 a13 -- the source
1779     matrix 0th,1st row */
1780     src_r0_r1 = _mm_loadu_si128((__m128i *) (pi2_src));
1781 
1782     /* a20 a21 a22 a23 a30 a31 a32 a33 -- the
1783     source matrix 2nd,3rd row */
1784     src_r2_r3 = _mm_loadu_si128((__m128i *) (pi2_src + 8));
1785 
1786     /* b00 b01 b02 b03 b10 b11 b12 b13 -- the
1787     scaling matrix 0th,1st row */
1788     scalemat_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_iscal_mat));
1789 
1790     /* b20 b21 b22 b23 b30 b31 b32 b33 -- the
1791     scaling matrix 2nd,3rd row */
1792     scalemat_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_iscal_mat + 8));
1793 
1794     /* q00 q01 q02 q03 q10 q11
1795     q12 q13 -- all 16 bits */
1796     dequant_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_weigh_mat));
1797 
1798     /* q20 q21 q22 q23 q30 q31
1799     q32 q33 -- all 16 bits */
1800     dequant_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_weigh_mat + 8));
1801 
1802     temp0 = _mm_mullo_epi16(scalemat_r0_r1,
1803                             dequant_r0_r1);  // b00*q00 b01*q01 b02*q02 b03*q03 b10*q10 b11*q11
1804                                              // b12*q12 b13*q13 -- 16 bit result
1805 
1806     temp1 = _mm_mullo_epi16(scalemat_r2_r3, dequant_r2_r3);
1807 
1808     /* b00*q00 0 b01*q01 0 b02*q02 0 b03*q03 0 -- 16 bit long */
1809     temp4 = _mm_unpacklo_epi16(temp0, zero_8x16b);
1810 
1811     /* b10*q10 0 b11*q11 0 b12*q12 0 b13*q13 0 -- 16 bit long */
1812     temp5 = _mm_unpackhi_epi16(temp0, zero_8x16b);
1813 
1814     /* b20*q20 0 b21*q21 0 b22*q22 0 b23*q23 0 -- 16 bit long */
1815     temp6 = _mm_unpacklo_epi16(temp1, zero_8x16b);
1816 
1817     /* b30*q30 0 b31*q31 0 b32*q32 0 b33*q33 0 -- 16 bit long */
1818     temp7 = _mm_unpackhi_epi16(temp1, zero_8x16b);
1819 
1820     /* a00 0 a01 0 a02 0 a03 0 -- 16 bit long */
1821     src_r0 = _mm_unpacklo_epi16(src_r0_r1, zero_8x16b);
1822     /* a10 0 a11 0 a12 0 a13 0 -- 16 bit long */
1823     src_r1 = _mm_unpackhi_epi16(src_r0_r1, zero_8x16b);
1824     /* a20 0 a21 0 a22 0 a23 0 -- 16 bit long */
1825     src_r2 = _mm_unpacklo_epi16(src_r2_r3, zero_8x16b);
1826     /* a30 0 a31 0 a32 0 a33 0 -- 16 bit long */
1827     src_r3 = _mm_unpackhi_epi16(src_r2_r3, zero_8x16b);
1828 
1829     temp4 = _mm_madd_epi16(src_r0, temp4);
1830     temp5 = _mm_madd_epi16(src_r1, temp5);
1831     temp6 = _mm_madd_epi16(src_r2, temp6);
1832     temp7 = _mm_madd_epi16(src_r3, temp7);
1833 
1834     if(u4_qp_div_6 >= 4)
1835     {
1836         resq_r0 = _mm_slli_epi32(temp4, u4_qp_div_6 - 4);
1837         resq_r1 = _mm_slli_epi32(temp5, u4_qp_div_6 - 4);
1838         resq_r2 = _mm_slli_epi32(temp6, u4_qp_div_6 - 4);
1839         resq_r3 = _mm_slli_epi32(temp7, u4_qp_div_6 - 4);
1840     }
1841     else
1842     {
1843         temp4 = _mm_add_epi32(temp4, add_rshift);
1844         temp5 = _mm_add_epi32(temp5, add_rshift);
1845         temp6 = _mm_add_epi32(temp6, add_rshift);
1846         temp7 = _mm_add_epi32(temp7, add_rshift);
1847         resq_r0 = _mm_srai_epi32(temp4, 4 - u4_qp_div_6);
1848         resq_r1 = _mm_srai_epi32(temp5, 4 - u4_qp_div_6);
1849         resq_r2 = _mm_srai_epi32(temp6, 4 - u4_qp_div_6);
1850         resq_r3 = _mm_srai_epi32(temp7, 4 - u4_qp_div_6);
1851     }
1852 
1853     resq_r0 = _mm_insert_epi32(resq_r0, (WORD32) pi2_dc_src[0], 0);
1854     /* Perform Inverse transform */
1855     /*-------------------------------------------------------------*/
1856     /* IDCT [ Horizontal transformation ]                          */
1857     /*-------------------------------------------------------------*/
1858     // Matrix transpose
1859     /*
1860      *  a0 a1 a2 a3
1861      *  b0 b1 b2 b3
1862      *  c0 c1 c2 c3
1863      *  d0 d1 d2 d3
1864      */
1865     /* a0 b0 a1 b1 */
1866     temp1 = _mm_unpacklo_epi32(resq_r0, resq_r1);
1867     /* c0 d0 c1 d1 */
1868     temp3 = _mm_unpacklo_epi32(resq_r2, resq_r3);
1869     /* a2 b2 a3 b3 */
1870     temp2 = _mm_unpackhi_epi32(resq_r0, resq_r1);
1871     /* c2 d2 c3 d3 */
1872     temp4 = _mm_unpackhi_epi32(resq_r2, resq_r3);
1873     /* a0 b0 c0 d0 */
1874     resq_r0 = _mm_unpacklo_epi64(temp1, temp3);
1875     /* a1 b1 c1 d1 */
1876     resq_r1 = _mm_unpackhi_epi64(temp1, temp3);
1877     /* a2 b2 c2 d2 */
1878     resq_r2 = _mm_unpacklo_epi64(temp2, temp4);
1879     /* a3 b3 c3 d3 */
1880     resq_r3 = _mm_unpackhi_epi64(temp2, temp4);
1881     /* Transform starts -- horizontal transform */
1882 
1883     /*------------------------------------------------------------------*/
1884     /* z0 = w0 + w2                                             */
1885     temp0 = _mm_add_epi32(resq_r0, resq_r2);
1886     /* z1 = w0 - w2                                             */
1887     temp1 = _mm_sub_epi32(resq_r0, resq_r2);
1888     /* z2 = (w1 >> 1) - w3                                      */
1889     temp2 = _mm_srai_epi32(resq_r1, 1);
1890     temp2 = _mm_sub_epi32(temp2, resq_r3);
1891     /* z3 = w1 + (w3 >> 1)                                      */
1892     temp3 = _mm_srai_epi32(resq_r3, 1);  //(w3>>1) + w1
1893     temp3 = _mm_add_epi32(temp3, resq_r1);
1894     /*----------------------------------------------------------*/
1895     /* x0 = z0 + z3                                             */
1896     resq_r0 = _mm_add_epi32(temp0, temp3);
1897     /* x1 = z1 + z2                                             */
1898     resq_r1 = _mm_add_epi32(temp1, temp2);
1899     /* x2 = z1 - z2                                             */
1900     resq_r2 = _mm_sub_epi32(temp1, temp2);
1901     /* x3 = z0 - z3                                             */
1902     resq_r3 = _mm_sub_epi32(temp0, temp3);
1903     // Matrix transpose
1904     /*
1905      *  a0 b0 c0 d0
1906      *  a1 b1 c1 d1
1907      *  a2 b2 c2 d2
1908      *  a3 b3 c3 d3
1909      */
1910     /* a0 a1 b0 b1 */
1911     temp1 = _mm_unpacklo_epi32(resq_r0, resq_r1);
1912     /* a2 a3 b2 b3 */
1913     temp3 = _mm_unpacklo_epi32(resq_r2, resq_r3);
1914     /* c0 c1 d0 d1 */
1915     temp2 = _mm_unpackhi_epi32(resq_r0, resq_r1);
1916     /* c2 c3 d2 d3 */
1917     temp4 = _mm_unpackhi_epi32(resq_r2, resq_r3);
1918     /* a0 a1 a2 a3 */
1919     resq_r0 = _mm_unpacklo_epi64(temp1, temp3);
1920     /* b0 b1 b2 b3 */
1921     resq_r1 = _mm_unpackhi_epi64(temp1, temp3);
1922     /* c0 c1 c2 c3 */
1923     resq_r2 = _mm_unpacklo_epi64(temp2, temp4);
1924     /* d0 d1 d2 d3 */
1925     resq_r3 = _mm_unpackhi_epi64(temp2, temp4);
1926     /* Transform ends -- horizontal transform */
1927 
1928     temp0 = _mm_packs_epi32(resq_r0, resq_r1);
1929     temp1 = _mm_packs_epi32(resq_r2, resq_r3);
1930 
1931     _mm_storeu_si128((__m128i *) (&pi2_tmp[0]), temp0);
1932     _mm_storeu_si128((__m128i *) (&pi2_tmp[2 * 4]), temp1);
1933 
1934     /* Load pred buffer */
1935     pred_r0 = _mm_loadl_epi64((__m128i *) (&pu1_pred[0]));
1936     pred_r1 = _mm_loadl_epi64((__m128i *) (&pu1_pred[i4_pred_stride]));
1937     pred_r2 = _mm_loadl_epi64((__m128i *) (&pu1_pred[2 * i4_pred_stride]));
1938     pred_r3 = _mm_loadl_epi64((__m128i *) (&pu1_pred[3 * i4_pred_stride]));
1939 
1940     pred_r0 = _mm_and_si128(pred_r0, chroma_mask);
1941     pred_r1 = _mm_and_si128(pred_r1, chroma_mask);
1942     pred_r2 = _mm_and_si128(pred_r2, chroma_mask);
1943     pred_r3 = _mm_and_si128(pred_r3, chroma_mask);
1944 
1945     /*--------------------------------------------------------------*/
1946     /* IDCT [ Vertical transformation] and Xij = (xij + 32)>>6      */
1947     /*                                                              */
1948     /* Add the prediction and store it back to same buffer          */
1949     /*--------------------------------------------------------------*/
1950     /* z0j = y0j + y2j                                         */
1951     temp0 = _mm_add_epi32(resq_r0, resq_r2);
1952     /* z1j = y0j - y2j                                                        */
1953     temp1 = _mm_sub_epi32(resq_r0, resq_r2);
1954     /* z2j = (y1j>>1) - y3j */
1955     temp2 = _mm_srai_epi32(resq_r1, 1);
1956     temp2 = _mm_sub_epi32(temp2, resq_r3);
1957     /* z3j = y1j + (y3j>>1) */
1958     temp3 = _mm_srai_epi32(resq_r3, 1);
1959     temp3 = _mm_add_epi32(temp3, resq_r1);
1960 
1961     /* x0j = z0j + z3j                                                        */
1962     temp4 = _mm_add_epi32(temp0, temp3);
1963     temp4 = _mm_add_epi32(temp4, value_32);
1964     temp4 = _mm_srai_epi32(temp4, 6);
1965     res_r0 = temp4;
1966     /* x1j = z1j + z2j                                                        */
1967     temp5 = _mm_add_epi32(temp1, temp2);
1968     temp5 = _mm_add_epi32(temp5, value_32);
1969     temp5 = _mm_srai_epi32(temp5, 6);
1970     res_r1 = temp5;
1971     /* x2j = z1j - z2j                                                        */
1972     temp6 = _mm_sub_epi32(temp1, temp2);
1973     temp6 = _mm_add_epi32(temp6, value_32);
1974     temp6 = _mm_srai_epi32(temp6, 6);
1975     res_r2 = temp6;
1976     /* x3j = z0j - z3j                                                        */
1977     temp7 = _mm_sub_epi32(temp0, temp3);
1978     temp7 = _mm_add_epi32(temp7, value_32);
1979     temp7 = _mm_srai_epi32(temp7, 6);
1980     res_r3 = temp7;
1981 
1982     res_pred_r0 = _mm_loadu_si128((__m128i *) &pi2_res_pred[0 * i4_res_pred_stride]);
1983     res_pred_r1 = _mm_loadu_si128((__m128i *) &pi2_res_pred[1 * i4_res_pred_stride]);
1984     res_pred_r2 = _mm_loadu_si128((__m128i *) &pi2_res_pred[2 * i4_res_pred_stride]);
1985     res_pred_r3 = _mm_loadu_si128((__m128i *) &pi2_res_pred[3 * i4_res_pred_stride]);
1986 
1987     res_pred_r0 = _mm_and_si128(res_pred_r0, reg_chroma);
1988     res_pred_r1 = _mm_and_si128(res_pred_r1, reg_chroma);
1989     res_pred_r2 = _mm_and_si128(res_pred_r2, reg_chroma);
1990     res_pred_r3 = _mm_and_si128(res_pred_r3, reg_chroma);
1991 
1992     temp0 = _mm_packs_epi32(res_r0, res_r1);
1993     temp1 = _mm_packs_epi32(res_r2, res_r3);
1994 
1995     res_r0 = _mm_cvtepu16_epi32(temp0);
1996     res_r2 = _mm_cvtepu16_epi32(temp1);
1997     res_r1 = _mm_srli_si128(temp0, 8);
1998     res_r3 = _mm_srli_si128(temp1, 8);
1999     res_r1 = _mm_cvtepu16_epi32(res_r1);
2000     res_r3 = _mm_cvtepu16_epi32(res_r3);
2001 
2002     res_r0 = _mm_add_epi16(res_pred_r0, res_r0);
2003     res_r1 = _mm_add_epi16(res_pred_r1, res_r1);
2004     res_r2 = _mm_add_epi16(res_pred_r2, res_r2);
2005     res_r3 = _mm_add_epi16(res_pred_r3, res_r3);
2006 
2007     temp0 = _mm_packus_epi32(res_r0, res_r1);
2008     temp1 = _mm_packus_epi32(res_r2, res_r3);
2009 
2010     /* Saturate all values < -255 to -255 and retain the rest as it is */
2011     temp0 = _mm_max_epi16(temp0, neg_255_8x16b);
2012     /* Saturate all values > 255 to 255 and retain the rest as it is */
2013     temp0 = _mm_min_epi16(temp0, pos_255_8x16b);
2014 
2015     /* Saturate all values < -255 to -255 and retain the rest as it is */
2016     temp1 = _mm_max_epi16(temp1, neg_255_8x16b);
2017     /* Saturate all values > 255 to 255 and retain the rest as it is */
2018     temp1 = _mm_min_epi16(temp1, pos_255_8x16b);
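    /*
     * SVC residual accumulation: the residual reconstructed in this layer is
     * added to the residual predicted from the reference layer and, as the
     * comments above note, saturated to [-255, 255] before being written back
     * and used for reconstruction. Conceptually (sketch only, CLIP3 standing
     * for the usual clamp-to-range helper):
     *
     *     res = CLIP3(-255, 255, res_pred + ((xij + 32) >> 6));
     *     out = CLIP3(0, 255, pred + res);
     */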
2019 
2020     res_r0 = _mm_cvtepu16_epi32(temp0);
2021     res_r1 = _mm_srli_si128(temp0, 8);
2022     res_r1 = _mm_cvtepu16_epi32(res_r1);
2023 
2024     res_r2 = _mm_cvtepu16_epi32(temp1);
2025     res_r3 = _mm_srli_si128(temp1, 8);
2026     res_r3 = _mm_cvtepu16_epi32(res_r3);
2027 
2028     chroma_mask = _mm_set1_epi32(0xffff0000);
2029     out_r0 = _mm_loadu_si128((__m128i *) (&pi2_res[0 * i4_res_stride]));
2030     out_r1 = _mm_loadu_si128((__m128i *) (&pi2_res[1 * i4_res_stride]));
2031     out_r2 = _mm_loadu_si128((__m128i *) (&pi2_res[2 * i4_res_stride]));
2032     out_r3 = _mm_loadu_si128((__m128i *) (&pi2_res[3 * i4_res_stride]));
2033 
2034     out_r0 = _mm_and_si128(out_r0, chroma_mask);
2035     out_r1 = _mm_and_si128(out_r1, chroma_mask);
2036     out_r2 = _mm_and_si128(out_r2, chroma_mask);
2037     out_r3 = _mm_and_si128(out_r3, chroma_mask);
2038 
2039     out_r0 = _mm_add_epi16(out_r0, res_r0);
2040     out_r1 = _mm_add_epi16(out_r1, res_r1);
2041     out_r2 = _mm_add_epi16(out_r2, res_r2);
2042     out_r3 = _mm_add_epi16(out_r3, res_r3);
2043 
2044     _mm_storeu_si128((__m128i *) (&pi2_res[0 * i4_res_stride]), out_r0);
2045     _mm_storeu_si128((__m128i *) (&pi2_res[1 * i4_res_stride]), out_r1);
2046     _mm_storeu_si128((__m128i *) (&pi2_res[2 * i4_res_stride]), out_r2);
2047     _mm_storeu_si128((__m128i *) (&pi2_res[3 * i4_res_stride]), out_r3);
2048 
2049     pred_r0 = _mm_cvtepu16_epi32(pred_r0);
2050     pred_r1 = _mm_cvtepu16_epi32(pred_r1);
2051     pred_r2 = _mm_cvtepu16_epi32(pred_r2);
2052     pred_r3 = _mm_cvtepu16_epi32(pred_r3);
2053 
2054     resq_r0 = _mm_add_epi16(pred_r0, res_r0);
2055     resq_r1 = _mm_add_epi16(pred_r1, res_r1);
2056     resq_r2 = _mm_add_epi16(pred_r2, res_r2);
2057     resq_r3 = _mm_add_epi16(pred_r3, res_r3);
2058 
2059     temp0 = _mm_packus_epi32(resq_r0, resq_r1);
2060     temp1 = _mm_packus_epi32(resq_r2, resq_r3);
2061 
2062     /* Clipping the results to 8 bits */
2063     mask_r0 = _mm_cmpgt_epi16(temp0, zero_8x16b);
2064     temp0 = _mm_and_si128(temp0, mask_r0);
2065     mask_r0 = _mm_cmpgt_epi16(temp1, zero_8x16b);
2066     temp1 = _mm_and_si128(temp1, mask_r0);
2067 
2068     resq_r0 = _mm_packus_epi16(temp0, temp1);
2069     resq_r1 = _mm_srli_si128(resq_r0, 4);
2070     resq_r2 = _mm_srli_si128(resq_r1, 4);
2071     resq_r3 = _mm_srli_si128(resq_r2, 4);
2072 
2073     resq_r0 = _mm_cvtepu8_epi16(resq_r0);
2074     resq_r1 = _mm_cvtepu8_epi16(resq_r1);
2075     resq_r2 = _mm_cvtepu8_epi16(resq_r2);
2076     resq_r3 = _mm_cvtepu8_epi16(resq_r3);
2077 
2078     chroma_mask = _mm_set1_epi16(0xFF00);
2079     out_r0 = _mm_loadl_epi64((__m128i *) (&pu1_out[0 * i4_out_stride]));
2080     out_r1 = _mm_loadl_epi64((__m128i *) (&pu1_out[1 * i4_out_stride]));
2081     out_r2 = _mm_loadl_epi64((__m128i *) (&pu1_out[2 * i4_out_stride]));
2082     out_r3 = _mm_loadl_epi64((__m128i *) (&pu1_out[3 * i4_out_stride]));
2083 
2084     out_r0 = _mm_and_si128(out_r0, chroma_mask);
2085     out_r1 = _mm_and_si128(out_r1, chroma_mask);
2086     out_r2 = _mm_and_si128(out_r2, chroma_mask);
2087     out_r3 = _mm_and_si128(out_r3, chroma_mask);
2088 
2089     out_r0 = _mm_add_epi8(out_r0, resq_r0);
2090     out_r1 = _mm_add_epi8(out_r1, resq_r1);
2091     out_r2 = _mm_add_epi8(out_r2, resq_r2);
2092     out_r3 = _mm_add_epi8(out_r3, resq_r3);
2093 
2094     _mm_storel_epi64((__m128i *) (&pu1_out[0 * i4_out_stride]), out_r0);
2095     _mm_storel_epi64((__m128i *) (&pu1_out[1 * i4_out_stride]), out_r1);
2096     _mm_storel_epi64((__m128i *) (&pu1_out[2 * i4_out_stride]), out_r2);
2097     _mm_storel_epi64((__m128i *) (&pu1_out[3 * i4_out_stride]), out_r3);
2098 }
2099 
2100 void isvc_iquant_itrans_recon_dc_4x4_sse42(buffer_container_t *ps_src, buffer_container_t *ps_pred,
2101                                            buffer_container_t *ps_res_pred,
2102                                            buffer_container_t *ps_res, buffer_container_t *ps_rec,
2103                                            iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants,
2104                                            WORD16 *pi2_tmp, WORD16 *pi2_dc_src,
2105                                            WORD32 i4_iq_start_idx, UWORD8 u1_res_accumulate)
2106 {
2107     UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
2108     UWORD8 *pu1_out = (UWORD8 *) ps_rec->pv_data;
2109     WORD32 i4_pred_stride = ps_pred->i4_data_stride;
2110     WORD32 i4_out_stride = ps_rec->i4_data_stride;
2111     const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat;
2112     const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat;
2113     UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6;
2114     UWORD32 *pu4_out = (UWORD32 *) pu1_out;
2115     WORD32 q0 = ((WORD16 *) (ps_src->pv_data))[0];
2116     WORD16 i_macro, rnd_fact = (u4_qp_div_6 < 4) ? 1 << (3 - u4_qp_div_6) : 0;
2117 
2118     __m128i pred_r0, pred_r1, pred_r2, pred_r3;
2119     __m128i sign_reg;
2120     /* all bits reset to zero */
2121     __m128i zero_8x16b = _mm_setzero_si128();
2122     __m128i temp4, temp5, temp6, temp7;
2123     __m128i value_add;
2124 
2125     ASSERT(0 == u1_res_accumulate);
2126 
2127     UNUSED(pi2_tmp);
2128     UNUSED(ps_res);
2129     UNUSED(ps_res_pred);
2130     UNUSED(u1_res_accumulate);
2131 
2132     INV_QUANT(q0, pu2_iscal_mat[0], pu2_weigh_mat[0], u4_qp_div_6, rnd_fact, 4);
2133 
2134     /* Restoring dc value for intra case */
2135     if(i4_iq_start_idx != 0)
2136     {
2137         q0 = pi2_dc_src[0];
2138     }
2139 
2140     i_macro = ((q0 + 32) >> 6);
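    /*
     * DC-only block: with a single non-zero (DC) level the whole 4x4 inverse
     * transform collapses to one value, i_macro, added to every pixel. A rough
     * scalar equivalent of the rest of this function (sketch only, CLIP3
     * standing for the usual clamp-to-range helper):
     *
     *     for(i = 0; i < 4; i++)
     *         for(j = 0; j < 4; j++)
     *             pu1_out[i * i4_out_stride + j] =
     *                 (UWORD8) CLIP3(0, 255, pu1_pred[i * i4_pred_stride + j] + i_macro);
     */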
2141 
2142     value_add = _mm_set1_epi16(i_macro);
2143 
2144     zero_8x16b = _mm_setzero_si128();
2145 
2146     /* Load pred buffer */
2147 
2148     /* p00 p01 p02 p03 0 0 0 0 -- all 8 bits */
2149     pred_r0 = _mm_loadl_epi64((__m128i *) (&pu1_pred[0]));
2150 
2151     /* p10 p11 p12 p13 0 0 0 0 -- all 8 bits */
2152     pred_r1 = _mm_loadl_epi64((__m128i *) (&pu1_pred[i4_pred_stride]));
2153 
2154     /* p20 p21 p22 p23 0 0 0 0 -- all 8 bits */
2155     pred_r2 = _mm_loadl_epi64((__m128i *) (&pu1_pred[2 * i4_pred_stride]));
2156 
2157     /* p30 p31 p32 p33 0 0 0 0 -- all 8 bits */
2158     pred_r3 = _mm_loadl_epi64((__m128i *) (&pu1_pred[3 * i4_pred_stride]));
2159 
2160     pred_r0 = _mm_cvtepu8_epi16(pred_r0);
2161     pred_r1 = _mm_cvtepu8_epi16(pred_r1);
2162     pred_r2 = _mm_cvtepu8_epi16(pred_r2);
2163     pred_r3 = _mm_cvtepu8_epi16(pred_r3);
2164 
2165     pred_r0 = _mm_unpacklo_epi64(pred_r0, pred_r1);
2166     pred_r2 = _mm_unpacklo_epi64(pred_r2, pred_r3);
2167 
2168     temp4 = _mm_add_epi16(value_add, pred_r0);
2169     temp5 = _mm_add_epi16(value_add, pred_r2);
2170     /*------------------------------------------------------------------*/
2171     /* Clipping the results to 8 bits */
2172     sign_reg = _mm_cmpgt_epi16(temp4, zero_8x16b);
2173     temp4 = _mm_and_si128(temp4, sign_reg);
2174     sign_reg = _mm_cmpgt_epi16(temp5, zero_8x16b);
2175     temp5 = _mm_and_si128(temp5, sign_reg);
2176 
2177     temp4 = _mm_packus_epi16(temp4, temp5);
2178     temp5 = _mm_srli_si128(temp4, 4);
2179     temp6 = _mm_srli_si128(temp5, 4);
2180     temp7 = _mm_srli_si128(temp6, 4);
2181 
2182     *pu4_out = _mm_cvtsi128_si32(temp4);
2183     pu1_out += i4_out_stride;
2184     pu4_out = (UWORD32 *) (pu1_out);
2185     *(pu4_out) = _mm_cvtsi128_si32(temp5);
2186     pu1_out += i4_out_stride;
2187     pu4_out = (UWORD32 *) (pu1_out);
2188     *(pu4_out) = _mm_cvtsi128_si32(temp6);
2189     pu1_out += i4_out_stride;
2190     pu4_out = (UWORD32 *) (pu1_out);
2191     *(pu4_out) = _mm_cvtsi128_si32(temp7);
2192 }
2193 
2194 void isvc_iquant_itrans_recon_res_dc_4x4_sse42(
2195     buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_res_pred,
2196     buffer_container_t *ps_res, buffer_container_t *ps_rec,
2197     iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants, WORD16 *pi2_tmp, WORD16 *pi2_dc_src,
2198     WORD32 i4_iq_start_idx, UWORD8 u1_res_accumulate)
2199 {
2200     WORD16 *pi2_res = (WORD16 *) ps_res->pv_data;
2201     WORD16 *pi2_res_ptr = pi2_res;
2202     UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
2203     UWORD8 *pu1_out = (UWORD8 *) ps_rec->pv_data;
2204     WORD32 i4_res_stride = ps_res->i4_data_stride;
2205     WORD32 i4_pred_stride = ps_pred->i4_data_stride;
2206     WORD32 i4_out_stride = ps_rec->i4_data_stride;
2207     const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat;
2208     const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat;
2209     UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6;
2210     UWORD32 *pu4_out = (UWORD32 *) pu1_out;
2211     WORD32 q0 = ((WORD16 *) (ps_src->pv_data))[0];
2212     WORD16 i_macro, rnd_fact = (u4_qp_div_6 < 4) ? 1 << (3 - u4_qp_div_6) : 0;
2213 
2214     __m128i pred_r0, pred_r1, pred_r2, pred_r3;
2215     __m128i sign_reg;
2216     /* all bits reset to zero */
2217     __m128i zero_8x16b = _mm_setzero_si128();
2218     __m128i temp4, temp5, temp6, temp7;
2219     __m128i value_add;
2220 
2221     ASSERT(0 == u1_res_accumulate);
2222 
2223     UNUSED(pi2_tmp);
2224     UNUSED(ps_res_pred);
2225     UNUSED(u1_res_accumulate);
2226 
2227     INV_QUANT(q0, pu2_iscal_mat[0], pu2_weigh_mat[0], u4_qp_div_6, rnd_fact, 4);
2228 
2229     /* Restoring dc value for intra case */
2230     if(i4_iq_start_idx != 0) q0 = pi2_dc_src[0];
2231 
2232     i_macro = ((q0 + 32) >> 6);
2233 
2234     value_add = _mm_set1_epi16(isvc_get_residue(i_macro, 0, 0));
2235 
2236     zero_8x16b = _mm_setzero_si128();
2237 
2238     /* Load pred buffer */
2239 
2240     /* p00 p01 p02 p03 0 0 0 0 -- all 8 bits */
2241     pred_r0 = _mm_loadl_epi64((__m128i *) (&pu1_pred[0]));
2242 
2243     /* p10 p11 p12 p13 0 0 0 0 -- all 8 bits */
2244     pred_r1 = _mm_loadl_epi64((__m128i *) (&pu1_pred[i4_pred_stride]));
2245 
2246     /* p20 p21 p22 p23 0 0 0 0 -- all 8 bits */
2247     pred_r2 = _mm_loadl_epi64((__m128i *) (&pu1_pred[2 * i4_pred_stride]));
2248 
2249     /* p30 p31 p32 p33 0 0 0 0 -- all 8 bits */
2250     pred_r3 = _mm_loadl_epi64((__m128i *) (&pu1_pred[3 * i4_pred_stride]));
2251 
2252     pred_r0 = _mm_cvtepu8_epi16(pred_r0);
2253     pred_r1 = _mm_cvtepu8_epi16(pred_r1);
2254     pred_r2 = _mm_cvtepu8_epi16(pred_r2);
2255     pred_r3 = _mm_cvtepu8_epi16(pred_r3);
2256 
2257     pred_r0 = _mm_unpacklo_epi64(pred_r0, pred_r1);
2258     pred_r2 = _mm_unpacklo_epi64(pred_r2, pred_r3);
2259 
2260     temp4 = _mm_add_epi16(value_add, pred_r0);
2261     temp5 = _mm_add_epi16(value_add, pred_r2);
2262 
2263     _mm_storeu_si128((__m128i *) (&pi2_res_ptr[0]), value_add);
2264     _mm_storeu_si128((__m128i *) (&pi2_res_ptr[i4_res_stride]), value_add);
2265     _mm_storeu_si128((__m128i *) (&pi2_res_ptr[2 * i4_res_stride]), value_add);
2266     _mm_storeu_si128((__m128i *) (&pi2_res_ptr[3 * i4_res_stride]), value_add);
2267     /*------------------------------------------------------------------*/
2268     /* Clipping the results to 8 bits */
2269     sign_reg = _mm_cmpgt_epi16(temp4, zero_8x16b);
2270     temp4 = _mm_and_si128(temp4, sign_reg);
2271     sign_reg = _mm_cmpgt_epi16(temp5, zero_8x16b);
2272     temp5 = _mm_and_si128(temp5, sign_reg);
2273 
2274     temp4 = _mm_packus_epi16(temp4, temp5);
2275     temp5 = _mm_srli_si128(temp4, 4);
2276     temp6 = _mm_srli_si128(temp5, 4);
2277     temp7 = _mm_srli_si128(temp6, 4);
2278 
2279     *pu4_out = _mm_cvtsi128_si32(temp4);
2280     pu1_out += i4_out_stride;
2281     pu4_out = (UWORD32 *) (pu1_out);
2282     *(pu4_out) = _mm_cvtsi128_si32(temp5);
2283     pu1_out += i4_out_stride;
2284     pu4_out = (UWORD32 *) (pu1_out);
2285     *(pu4_out) = _mm_cvtsi128_si32(temp6);
2286     pu1_out += i4_out_stride;
2287     pu4_out = (UWORD32 *) (pu1_out);
2288     *(pu4_out) = _mm_cvtsi128_si32(temp7);
2289 }
2290 
2291 void isvc_iquant_itrans_recon_res_dc_with_res_acc_4x4_sse42(
2292     buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_res_pred,
2293     buffer_container_t *ps_res, buffer_container_t *ps_rec,
2294     iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants, WORD16 *pi2_tmp, WORD16 *pi2_dc_src,
2295     WORD32 i4_iq_start_idx, UWORD8 u1_res_accumulate)
2296 {
2297     WORD16 *pi2_res = (WORD16 *) ps_res->pv_data;
2298     WORD16 *pi2_res_ptr = pi2_res;
2299     WORD16 *pi2_res_pred = (WORD16 *) ps_res_pred->pv_data;
2300     WORD16 *pi2_res_pred_ptr = pi2_res_pred;
2301     UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
2302     UWORD8 *pu1_out = (UWORD8 *) ps_rec->pv_data;
2303     WORD32 i4_res_stride = ps_res->i4_data_stride;
2304     WORD32 i4_res_pred_stride = ps_res_pred->i4_data_stride;
2305     WORD32 i4_pred_stride = ps_pred->i4_data_stride;
2306     WORD32 i4_out_stride = ps_rec->i4_data_stride;
2307     const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat;
2308     const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat;
2309     UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6;
2310     UWORD32 *pu4_out = (UWORD32 *) pu1_out;
2311     WORD32 q0 = ((WORD16 *) (ps_src->pv_data))[0];
2312     WORD16 i_macro, rnd_fact = (u4_qp_div_6 < 4) ? 1 << (3 - u4_qp_div_6) : 0;
2313 
2314     __m128i pred_r0, pred_r1, pred_r2, pred_r3;
2315     __m128i sign_reg;
2316     /* all bits reset to zero */
2317     __m128i zero_8x16b = _mm_setzero_si128();
2318     __m128i temp4, temp5, temp6, temp7;
2319     __m128i value_add;
2320     __m128i res_pred_r0, res_pred_r1, res_pred_r2, res_pred_r3;
2321     __m128i temp0, temp1;
2322     __m128i neg_255_8x16b = _mm_set1_epi16(-((WORD16) UINT8_MAX));
2323     __m128i pos_255_8x16b = _mm_set1_epi16(((WORD16) UINT8_MAX));
2324 
2325     ASSERT(1 == u1_res_accumulate);
2326 
2327     UNUSED(pi2_tmp);
2328     UNUSED(u1_res_accumulate);
2329 
2330     INV_QUANT(q0, pu2_iscal_mat[0], pu2_weigh_mat[0], u4_qp_div_6, rnd_fact, 4);
2331 
2332     /* Restoring dc value for intra case */
2333     if(i4_iq_start_idx != 0) q0 = pi2_dc_src[0];
2334 
2335     i_macro = ((q0 + 32) >> 6);
2336 
2337     value_add = _mm_set1_epi16(i_macro);
2338 
2339     zero_8x16b = _mm_setzero_si128();
2340 
2341     /* Load pred buffer */
2342 
2343     /* p00 p01 p02 p03 0 0 0 0 -- all 8 bits */
2344     pred_r0 = _mm_loadl_epi64((__m128i *) (&pu1_pred[0]));
2345 
2346     /* p10 p11 p12 p13 0 0 0 0 -- all 8 bits */
2347     pred_r1 = _mm_loadl_epi64((__m128i *) (&pu1_pred[i4_pred_stride]));
2348 
2349     /* p20 p21 p22 p23 0 0 0 0 -- all 8 bits */
2350     pred_r2 = _mm_loadl_epi64((__m128i *) (&pu1_pred[2 * i4_pred_stride]));
2351 
2352     /* p30 p31 p32 p33 0 0 0 0 -- all 8 bits */
2353     pred_r3 = _mm_loadl_epi64((__m128i *) (&pu1_pred[3 * i4_pred_stride]));
2354 
2355     pred_r0 = _mm_cvtepu8_epi16(pred_r0);
2356     pred_r1 = _mm_cvtepu8_epi16(pred_r1);
2357     pred_r2 = _mm_cvtepu8_epi16(pred_r2);
2358     pred_r3 = _mm_cvtepu8_epi16(pred_r3);
2359 
2360     pred_r0 = _mm_unpacklo_epi64(pred_r0, pred_r1);
2361     pred_r2 = _mm_unpacklo_epi64(pred_r2, pred_r3);
2362 
2363     /* Accumulating res */
2364     res_pred_r0 = _mm_loadl_epi64((__m128i *) &pi2_res_pred_ptr[0]);
2365     res_pred_r1 = _mm_loadl_epi64((__m128i *) &pi2_res_pred_ptr[i4_res_pred_stride]);
2366     res_pred_r2 = _mm_loadl_epi64((__m128i *) &pi2_res_pred_ptr[2 * i4_res_pred_stride]);
2367     res_pred_r3 = _mm_loadl_epi64((__m128i *) &pi2_res_pred_ptr[3 * i4_res_pred_stride]);
2368 
2369     res_pred_r0 = _mm_unpacklo_epi64(res_pred_r0, res_pred_r1);
2370     res_pred_r1 = _mm_unpacklo_epi64(res_pred_r2, res_pred_r3);
2371 
2372     temp0 = _mm_add_epi16(value_add, res_pred_r0);
2373     temp1 = _mm_add_epi16(value_add, res_pred_r1);
2374 
2375     /* Saturate all values < -255 to -255 and retain the rest as it is */
2376     temp0 = _mm_max_epi16(temp0, neg_255_8x16b);
2377     /* Saturate all values > 255 to 255 and retain the rest as it is */
2378     temp0 = _mm_min_epi16(temp0, pos_255_8x16b);
2379 
2380     /* Saturate all values < -255 to -255 and retain the rest as it is */
2381     temp1 = _mm_max_epi16(temp1, neg_255_8x16b);
2382     /* Saturate all values > 255 to 255 and retain the rest as it is */
2383     temp1 = _mm_min_epi16(temp1, pos_255_8x16b);
2384 
2385     _mm_storeu_si128((__m128i *) (&pi2_res_ptr[0]), temp0);
2386     _mm_storeu_si128((__m128i *) (&pi2_res_ptr[2 * i4_res_stride]), temp1);
2387 
2388     temp4 = _mm_add_epi16(temp0, pred_r0);
2389     temp5 = _mm_add_epi16(temp1, pred_r2);
2390 
2391     temp0 = _mm_srli_si128(temp0, 8);
2392     temp1 = _mm_srli_si128(temp1, 8);
2393 
2394     _mm_storeu_si128((__m128i *) (&pi2_res_ptr[i4_res_stride]), temp0);
2395     _mm_storeu_si128((__m128i *) (&pi2_res_ptr[3 * i4_res_stride]), temp1);
2396 
2397     /*------------------------------------------------------------------*/
2398     /* Clipping the results to 8 bits */
2399     sign_reg = _mm_cmpgt_epi16(temp4, zero_8x16b);
2400     temp4 = _mm_and_si128(temp4, sign_reg);
2401     sign_reg = _mm_cmpgt_epi16(temp5, zero_8x16b);
2402     temp5 = _mm_and_si128(temp5, sign_reg);
2403 
2404     temp4 = _mm_packus_epi16(temp4, temp5);
2405     temp5 = _mm_srli_si128(temp4, 4);
2406     temp6 = _mm_srli_si128(temp5, 4);
2407     temp7 = _mm_srli_si128(temp6, 4);
2408 
2409     *pu4_out = _mm_cvtsi128_si32(temp4);
2410     pu1_out += i4_out_stride;
2411     pu4_out = (UWORD32 *) (pu1_out);
2412     *(pu4_out) = _mm_cvtsi128_si32(temp5);
2413     pu1_out += i4_out_stride;
2414     pu4_out = (UWORD32 *) (pu1_out);
2415     *(pu4_out) = _mm_cvtsi128_si32(temp6);
2416     pu1_out += i4_out_stride;
2417     pu4_out = (UWORD32 *) (pu1_out);
2418     *(pu4_out) = _mm_cvtsi128_si32(temp7);
2419 }
2420 
2421 void isvc_iquant_itrans_recon_chroma_4x4_dc_sse42(
2422     buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_res_pred,
2423     buffer_container_t *ps_res, buffer_container_t *ps_rec,
2424     iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants, WORD16 *pi2_tmp, WORD16 *pi2_dc_src,
2425     WORD32 i4_iq_start_idx, UWORD8 u1_res_accumulate)
2426 {
2427     WORD16 *pi2_src = (WORD16 *) ps_src->pv_data;
2428     UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
2429     UWORD8 *pu1_out = (UWORD8 *) ps_rec->pv_data;
2430     WORD32 i4_pred_stride = ps_pred->i4_data_stride;
2431     WORD32 i4_out_stride = ps_rec->i4_data_stride;
2432     const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat;
2433     const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat;
2434     UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6;
2435     /* DC value won't be dequantized for chroma
2436     inverse transform */
2437     WORD16 q0 = pi2_dc_src[0];
2438     WORD16 i_macro = ((q0 + 32) >> 6);
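    /*
     * For chroma blocks the DC level reaches this function through pi2_dc_src,
     * having been handled by the separate 2x2 chroma DC path, so no INV_QUANT
     * is applied here; only the (q0 + 32) >> 6 normalisation above is needed
     * before the value is added to every predicted sample of this component.
     */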
2439 
2440     __m128i pred_r0, pred_r1, pred_r2, pred_r3;
2441     /* all bits reset to zero */
2442     __m128i zero_8x16b = _mm_setzero_si128();
2443     __m128i chroma_mask = _mm_set1_epi16(0xFF);
2444     __m128i value_add = _mm_set1_epi16(i_macro);
2445     __m128i out_r0, out_r1, out_r2, out_r3;
2446 
2447     ASSERT(0 == u1_res_accumulate);
2448 
2449     UNUSED(pi2_src);
2450     UNUSED(pu2_iscal_mat);
2451     UNUSED(pu2_weigh_mat);
2452     UNUSED(u4_qp_div_6);
2453     UNUSED(pi2_tmp);
2454     UNUSED(ps_res_pred);
2455     UNUSED(ps_res);
2456     UNUSED(i4_iq_start_idx);
2457     UNUSED(u1_res_accumulate);
2458 
2459     /* Load pred buffer */
2460     pred_r0 = _mm_loadl_epi64((__m128i *) (&pu1_pred[0]));
2461 
2462     pred_r1 = _mm_loadl_epi64((__m128i *) (&pu1_pred[i4_pred_stride]));
2463 
2464     pred_r2 = _mm_loadl_epi64((__m128i *) (&pu1_pred[2 * i4_pred_stride]));
2465 
2466     pred_r3 = _mm_loadl_epi64((__m128i *) (&pu1_pred[3 * i4_pred_stride]));
2467 
2468     /* Mask alternate pred values from the interleaved pred buf */
2469     pred_r0 = _mm_and_si128(pred_r0, chroma_mask);
2470     pred_r1 = _mm_and_si128(pred_r1, chroma_mask);
2471     pred_r2 = _mm_and_si128(pred_r2, chroma_mask);
2472     pred_r3 = _mm_and_si128(pred_r3, chroma_mask);
2473 
2474     /* Pack the first four 16 bit values of 2 regs into a single reg*/
2475     pred_r0 = _mm_unpacklo_epi64(pred_r0, pred_r1);
2476     pred_r2 = _mm_unpacklo_epi64(pred_r2, pred_r3);
2477 
2478     /* Compute out pixel by adding res to pred */
2479     pred_r0 = _mm_add_epi16(value_add, pred_r0);
2480     pred_r2 = _mm_add_epi16(value_add, pred_r2);
2481     /*------------------------------------------------------------------*/
2482     /* Clipping the results to 8 bits */
2483     pred_r0 = _mm_packus_epi16(pred_r0, pred_r2);
2484     pred_r1 = _mm_srli_si128(pred_r0, 4);
2485     pred_r2 = _mm_srli_si128(pred_r1, 4);
2486     pred_r3 = _mm_srli_si128(pred_r2, 4);
2487 
2488     /* p00 p01 p02 p03 -- all 16 bits */
2489     pred_r0 = _mm_unpacklo_epi8(pred_r0, zero_8x16b);
2490     /* p10 p11 p12 p13 -- all 16 bits */
2491     pred_r1 = _mm_unpacklo_epi8(pred_r1, zero_8x16b);
2492     /* p20 p21 p22 p23 -- all 16 bits */
2493     pred_r2 = _mm_unpacklo_epi8(pred_r2, zero_8x16b);
2494     /* p30 p31 p32 p33 -- all 16 bits */
2495     pred_r3 = _mm_unpacklo_epi8(pred_r3, zero_8x16b);
2496 
2497     /* Load interleaved out buffer */
2498     out_r0 = _mm_loadl_epi64((__m128i *) (&pu1_out[0]));
2499     out_r1 = _mm_loadl_epi64((__m128i *) (&pu1_out[i4_out_stride]));
2500     out_r2 = _mm_loadl_epi64((__m128i *) (&pu1_out[2 * i4_out_stride]));
2501     out_r3 = _mm_loadl_epi64((__m128i *) (&pu1_out[3 * i4_out_stride]));
2502 
2503     /* Mask the interleaved out buf in order to save the U/V out pixel computed in
2504     this function call without thrashing the U/V out pixel that was saved
2505     during an earlier function call */
2506     chroma_mask = _mm_set1_epi16(0xFF00);
2507 
2508     out_r0 = _mm_and_si128(out_r0, chroma_mask);
2509     out_r1 = _mm_and_si128(out_r1, chroma_mask);
2510     out_r2 = _mm_and_si128(out_r2, chroma_mask);
2511     out_r3 = _mm_and_si128(out_r3, chroma_mask);
2512 
2513     /* Save the out pixels in alternate locations */
2514     out_r0 = _mm_add_epi8(out_r0, pred_r0);
2515     out_r1 = _mm_add_epi8(out_r1, pred_r1);
2516     out_r2 = _mm_add_epi8(out_r2, pred_r2);
2517     out_r3 = _mm_add_epi8(out_r3, pred_r3);
2518 
2519     _mm_storel_epi64((__m128i *) (&pu1_out[0]), out_r0);
2520     _mm_storel_epi64((__m128i *) (&pu1_out[i4_out_stride]), out_r1);
2521     _mm_storel_epi64((__m128i *) (&pu1_out[2 * i4_out_stride]), out_r2);
2522     _mm_storel_epi64((__m128i *) (&pu1_out[3 * i4_out_stride]), out_r3);
2523 }
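/* Illustrative scalar sketch of the DC-only interleaved-chroma reconstruction
 * performed by isvc_iquant_itrans_recon_chroma_4x4_dc_sse42() above. It is not
 * part of the original implementation and is excluded from the build; the
 * function name is hypothetical. It assumes pu1_pred and pu1_out hold
 * interleaved Cb/Cr samples with the component being processed at even byte
 * offsets, as implied by the 0x00FF/0xFF00 masks used above. */
#if 0
static void isvc_recon_chroma_4x4_dc_scalar_sketch(WORD16 i2_dc_value, UWORD8 *pu1_pred,
                                                   UWORD8 *pu1_out, WORD32 i4_pred_stride,
                                                   WORD32 i4_out_stride)
{
    WORD32 i, j;
    /* With only the DC coefficient present, every residual sample of the 4x4
     * block equals (dc + 32) >> 6 */
    WORD32 i4_res = (i2_dc_value + 32) >> 6;

    for(i = 0; i < 4; i++)
    {
        for(j = 0; j < 4; j++)
        {
            WORD32 i4_rec = pu1_pred[i * i4_pred_stride + 2 * j] + i4_res;

            /* Clip to [0, 255]; the other component's bytes are left untouched */
            if(i4_rec < 0) i4_rec = 0;
            if(i4_rec > 255) i4_rec = 255;
            pu1_out[i * i4_out_stride + 2 * j] = (UWORD8) i4_rec;
        }
    }
}
#endif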
2524 
2525 void isvc_iquant_itrans_recon_res_chroma_4x4_dc_sse42(
2526     buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_res_pred,
2527     buffer_container_t *ps_res, buffer_container_t *ps_rec,
2528     iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants, WORD16 *pi2_tmp, WORD16 *pi2_dc_src,
2529     WORD32 i4_iq_start_idx, UWORD8 u1_res_accumulate)
2530 {
2531     WORD16 *pi2_src = (WORD16 *) ps_src->pv_data;
2532     WORD16 *pi2_res = (WORD16 *) ps_res->pv_data;
2533     WORD16 *pi2_res_ptr = pi2_res;
2534     UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
2535     UWORD8 *pu1_out = (UWORD8 *) ps_rec->pv_data;
2536     WORD32 i4_res_stride = ps_res->i4_data_stride;
2537     WORD32 i4_pred_stride = ps_pred->i4_data_stride;
2538     WORD32 i4_out_stride = ps_rec->i4_data_stride;
2539     const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat;
2540     const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat;
2541     UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6;
2542     /* DC value won't be dequantized for chroma
2543     inverse transform */
2544     WORD16 q0 = pi2_dc_src[0];
2545     WORD16 i_macro = ((q0 + 32) >> 6);
2546 
2547     __m128i pred_r0, pred_r1, pred_r2, pred_r3, sign_reg;
2548     /* all bits reset to zero */
2549     __m128i zero_8x16b = _mm_setzero_si128();
2550     __m128i chroma_mask = _mm_set1_epi16(0xFF);
2551     __m128i value_add = _mm_set1_epi16(isvc_get_residue(i_macro, 0, 0));
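    /* The broadcast value is routed through isvc_get_residue() with a zero
     * residual prediction, in line with the u1_res_accumulate == 0 assertion
     * below (inference from the call site; the helper is defined elsewhere) */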
2552     __m128i out_r0, out_r1, out_r2, out_r3;
2553 
2554     ASSERT(0 == u1_res_accumulate);
2555 
2556     UNUSED(pi2_src);
2557     UNUSED(pu2_iscal_mat);
2558     UNUSED(pu2_weigh_mat);
2559     UNUSED(u4_qp_div_6);
2560     UNUSED(pi2_tmp);
2561     UNUSED(ps_res_pred);
2562     UNUSED(i4_iq_start_idx);
2563     UNUSED(u1_res_accumulate);
2564 
2565     /* Load pred buffer */
2566     pred_r0 = _mm_loadl_epi64((__m128i *) (&pu1_pred[0]));
2567 
2568     pred_r1 = _mm_loadl_epi64((__m128i *) (&pu1_pred[i4_pred_stride]));
2569 
2570     pred_r2 = _mm_loadl_epi64((__m128i *) (&pu1_pred[2 * i4_pred_stride]));
2571 
2572     pred_r3 = _mm_loadl_epi64((__m128i *) (&pu1_pred[3 * i4_pred_stride]));
2573 
2574     /* Mask alternate pred values from the interleaved pred buf */
2575     pred_r0 = _mm_and_si128(pred_r0, chroma_mask);
2576     pred_r1 = _mm_and_si128(pred_r1, chroma_mask);
2577     pred_r2 = _mm_and_si128(pred_r2, chroma_mask);
2578     pred_r3 = _mm_and_si128(pred_r3, chroma_mask);
2579 
2580     /* Pack the first four 16 bit values of 2 regs into a single reg*/
2581     pred_r0 = _mm_unpacklo_epi64(pred_r0, pred_r1);
2582     pred_r2 = _mm_unpacklo_epi64(pred_r2, pred_r3);
2583 
2584     /* Compute out pixel by adding res to pred */
2585     pred_r0 = _mm_add_epi16(value_add, pred_r0);
2586     pred_r2 = _mm_add_epi16(value_add, pred_r2);
2587 
2588     /* Convert res from 16 bits to 32 bits  */
2589     value_add = _mm_cvtepu16_epi32(value_add);
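    /* After zero extension each 32 bit lane holds the residue in its low 16
     * bits and zero in its high 16 bits, so the add below fills the even 16 bit
     * slots of the interleaved residual rows while the masked odd slots keep
     * the data written by the earlier call for the other chroma component */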
2590 
2591     out_r0 = _mm_loadu_si128((__m128i *) (&pi2_res_ptr[0 * i4_res_stride]));
2592     out_r1 = _mm_loadu_si128((__m128i *) (&pi2_res_ptr[1 * i4_res_stride]));
2593     out_r2 = _mm_loadu_si128((__m128i *) (&pi2_res_ptr[2 * i4_res_stride]));
2594     out_r3 = _mm_loadu_si128((__m128i *) (&pi2_res_ptr[3 * i4_res_stride]));
2595 
2596     /* Mask the loaded res in order to save the U/V res data computed in
2597     this function call without thrashing the U/V res data that was saved
2598     during an earlier function call */
2599     chroma_mask = _mm_set1_epi32(0xffff0000);
2600     out_r0 = _mm_and_si128(out_r0, chroma_mask);
2601     out_r1 = _mm_and_si128(out_r1, chroma_mask);
2602     out_r2 = _mm_and_si128(out_r2, chroma_mask);
2603     out_r3 = _mm_and_si128(out_r3, chroma_mask);
2604 
2605     /* Save the res in alternate locations */
2606     out_r0 = _mm_add_epi16(out_r0, value_add);
2607     out_r1 = _mm_add_epi16(out_r1, value_add);
2608     out_r2 = _mm_add_epi16(out_r2, value_add);
2609     out_r3 = _mm_add_epi16(out_r3, value_add);
2610 
2611     _mm_storeu_si128((__m128i *) (&pi2_res_ptr[0 * i4_res_stride]), out_r0);
2612     _mm_storeu_si128((__m128i *) (&pi2_res_ptr[1 * i4_res_stride]), out_r1);
2613     _mm_storeu_si128((__m128i *) (&pi2_res_ptr[2 * i4_res_stride]), out_r2);
2614     _mm_storeu_si128((__m128i *) (&pi2_res_ptr[3 * i4_res_stride]), out_r3);
2615     /*------------------------------------------------------------------*/
2616     /* Clipping the results to 8 bits */
2617     sign_reg = _mm_cmpgt_epi16(pred_r0, zero_8x16b);
2618     pred_r0 = _mm_and_si128(pred_r0, sign_reg);
2619     sign_reg = _mm_cmpgt_epi16(pred_r2, zero_8x16b);
2620     pred_r2 = _mm_and_si128(pred_r2, sign_reg);
2621 
2622     pred_r0 = _mm_packus_epi16(pred_r0, pred_r2);
2623     pred_r1 = _mm_srli_si128(pred_r0, 4);
2624     pred_r2 = _mm_srli_si128(pred_r1, 4);
2625     pred_r3 = _mm_srli_si128(pred_r2, 4);
2626 
2627     /* p00 p01 p02 p03 -- all 16 bits */
2628     pred_r0 = _mm_unpacklo_epi8(pred_r0, zero_8x16b);
2629     /* p10 p11 p12 p13 -- all 16 bits */
2630     pred_r1 = _mm_unpacklo_epi8(pred_r1, zero_8x16b);
2631     /* p20 p21 p22 p23 -- all 16 bits */
2632     pred_r2 = _mm_unpacklo_epi8(pred_r2, zero_8x16b);
2633     /* p30 p31 p32 p33 -- all 16 bits */
2634     pred_r3 = _mm_unpacklo_epi8(pred_r3, zero_8x16b);
2635 
2636     /* Load interleaved out buffer */
2637     out_r0 = _mm_loadl_epi64((__m128i *) (&pu1_out[0]));
2638     out_r1 = _mm_loadl_epi64((__m128i *) (&pu1_out[i4_out_stride]));
2639     out_r2 = _mm_loadl_epi64((__m128i *) (&pu1_out[2 * i4_out_stride]));
2640     out_r3 = _mm_loadl_epi64((__m128i *) (&pu1_out[3 * i4_out_stride]));
2641 
2642     /* Mask the interleaved out buf in order to save the U/V out pixel computed in
2643     this function call without thrashing the U/V out pixel that was saved
2644     during an earlier function call */
2645     chroma_mask = _mm_set1_epi16(0xFF00);
2646 
2647     out_r0 = _mm_and_si128(out_r0, chroma_mask);
2648     out_r1 = _mm_and_si128(out_r1, chroma_mask);
2649     out_r2 = _mm_and_si128(out_r2, chroma_mask);
2650     out_r3 = _mm_and_si128(out_r3, chroma_mask);
2651 
2652     /* Save the out pixels in alternate locations */
2653     out_r0 = _mm_add_epi8(out_r0, pred_r0);
2654     out_r1 = _mm_add_epi8(out_r1, pred_r1);
2655     out_r2 = _mm_add_epi8(out_r2, pred_r2);
2656     out_r3 = _mm_add_epi8(out_r3, pred_r3);
2657 
2658     _mm_storel_epi64((__m128i *) (&pu1_out[0]), out_r0);
2659     _mm_storel_epi64((__m128i *) (&pu1_out[i4_out_stride]), out_r1);
2660     _mm_storel_epi64((__m128i *) (&pu1_out[2 * i4_out_stride]), out_r2);
2661     _mm_storel_epi64((__m128i *) (&pu1_out[3 * i4_out_stride]), out_r3);
2662 }
2663 
2664 void isvc_iquant_itrans_recon_res_chroma_4x4_dc_with_res_acc_sse42(
2665     buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_res_pred,
2666     buffer_container_t *ps_res, buffer_container_t *ps_rec,
2667     iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants, WORD16 *pi2_tmp, WORD16 *pi2_dc_src,
2668     WORD32 i4_iq_start_idx, UWORD8 u1_res_accumulate)
2669 {
2670     WORD16 *pi2_src = (WORD16 *) ps_src->pv_data;
2671     WORD16 *pi2_res = (WORD16 *) ps_res->pv_data;
2672     WORD16 *pi2_res_pred = (WORD16 *) ps_res_pred->pv_data;
2673     UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
2674     UWORD8 *pu1_out = (UWORD8 *) ps_rec->pv_data;
2675     WORD32 i4_res_stride = ps_res->i4_data_stride;
2676     WORD32 i4_res_pred_stride = ps_res_pred->i4_data_stride;
2677     WORD32 i4_pred_stride = ps_pred->i4_data_stride;
2678     WORD32 i4_out_stride = ps_rec->i4_data_stride;
2679     const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat;
2680     const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat;
2681     UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6;
2682     /* DC value won't be dequantized for chroma
2683     inverse transform */
2684     WORD16 q0 = pi2_dc_src[0];
2685     WORD16 i_macro = ((q0 + 32) >> 6);
2686 
2687     __m128i pred_r0, pred_r1, pred_r2, pred_r3;
2688     /* all bits reset to zero */
2689     __m128i zero_8x16b = _mm_setzero_si128();
2690     __m128i chroma_mask = _mm_set1_epi16(0xFF);
2691     __m128i reg_chroma = _mm_set_epi16(0, 0xFFFF, 0, 0xFFFF, 0, 0xFFFF, 0, 0xFFFF);
2692     __m128i value_add = _mm_set1_epi16(i_macro);
2693     __m128i out_r0, out_r1, out_r2, out_r3;
2694     __m128i res_r0, res_r1, res_r2, res_r3;
2695     __m128i res_pred_r0, res_pred_r1, res_pred_r2, res_pred_r3;
2696     __m128i temp0, temp1;
2697     __m128i neg_255_8x16b = _mm_set1_epi16(-((WORD16) UINT8_MAX));
2698     __m128i pos_255_8x16b = _mm_set1_epi16(((WORD16) UINT8_MAX));
2699 
2700     ASSERT(1 == u1_res_accumulate);
2701 
2702     UNUSED(pi2_src);
2703     UNUSED(pu2_iscal_mat);
2704     UNUSED(pu2_weigh_mat);
2705     UNUSED(u4_qp_div_6);
2706     UNUSED(pi2_tmp);
2707     UNUSED(i4_iq_start_idx);
2708     UNUSED(u1_res_accumulate);
2709 
2710     /* Load pred buffer */
2711     pred_r0 = _mm_loadl_epi64((__m128i *) (&pu1_pred[0]));
2712 
2713     pred_r1 = _mm_loadl_epi64((__m128i *) (&pu1_pred[i4_pred_stride]));
2714 
2715     pred_r2 = _mm_loadl_epi64((__m128i *) (&pu1_pred[2 * i4_pred_stride]));
2716 
2717     pred_r3 = _mm_loadl_epi64((__m128i *) (&pu1_pred[3 * i4_pred_stride]));
2718     /* Mask alternate pred values from the interleaved pred buf */
2719     pred_r0 = _mm_and_si128(pred_r0, chroma_mask);
2720     pred_r1 = _mm_and_si128(pred_r1, chroma_mask);
2721     pred_r2 = _mm_and_si128(pred_r2, chroma_mask);
2722     pred_r3 = _mm_and_si128(pred_r3, chroma_mask);
2723 
2724     /* Pack the first four 16 bit values of 2 regs into a single reg*/
2725     pred_r0 = _mm_unpacklo_epi64(pred_r0, pred_r1);
2726     pred_r2 = _mm_unpacklo_epi64(pred_r2, pred_r3);
2727 
2728     /* Accumulating res */
2729 
2730     /* load res pred buffer */
2731     res_pred_r0 = _mm_loadu_si128((__m128i *) &pi2_res_pred[0 * i4_res_pred_stride]);
2732     res_pred_r1 = _mm_loadu_si128((__m128i *) &pi2_res_pred[1 * i4_res_pred_stride]);
2733     res_pred_r2 = _mm_loadu_si128((__m128i *) &pi2_res_pred[2 * i4_res_pred_stride]);
2734     res_pred_r3 = _mm_loadu_si128((__m128i *) &pi2_res_pred[3 * i4_res_pred_stride]);
2735 
2736     /* Mask res pred and retain only the alternate (even) 16 bit values, i.e. the
2737     residual prediction of the chroma plane processed in this call */
2737     res_pred_r0 = _mm_and_si128(res_pred_r0, reg_chroma);
2738     res_pred_r1 = _mm_and_si128(res_pred_r1, reg_chroma);
2739     res_pred_r2 = _mm_and_si128(res_pred_r2, reg_chroma);
2740     res_pred_r3 = _mm_and_si128(res_pred_r3, reg_chroma);
2741 
2742     /* Zero extend the broadcast DC residue from 16 bits to 32 bits */
2743     res_r0 = _mm_cvtepu16_epi32(value_add);
2744     res_r2 = _mm_cvtepu16_epi32(value_add);
2745     res_r1 = _mm_cvtepu16_epi32(value_add);
2746     res_r3 = _mm_cvtepu16_epi32(value_add);
2747 
2748     /* Add res pred to the res obtained from inv transform */
2749     res_r0 = _mm_add_epi16(res_pred_r0, res_r0);
2750     res_r1 = _mm_add_epi16(res_pred_r1, res_r1);
2751     res_r2 = _mm_add_epi16(res_pred_r2, res_r2);
2752     res_r3 = _mm_add_epi16(res_pred_r3, res_r3);
2753 
2754     /* Convert the 32 bit res of the form [a0 0 a1 0 a2 0 a3 0] to
2755     16 bits of the form [a0 a1 a2 a3] using hadd, which sums adjacent
2756     16 bit pairs [a0 + 0, a1 + 0, a2 + 0, a3 + 0]. To be optimized */
2757     temp0 = _mm_hadd_epi16(res_r0, res_r1);
2758     temp1 = _mm_hadd_epi16(res_r2, res_r3);
2759 
2760     /* Saturate all values < -255 to -255 and retain the rest as it is */
2761     temp0 = _mm_max_epi16(temp0, neg_255_8x16b);
2762     /* Saturate all values > 255 to 255 and retain the rest as it is */
2763     temp0 = _mm_min_epi16(temp0, pos_255_8x16b);
2764 
2765     /* Saturate all values < -255 to -255 and retain the rest as it is */
2766     temp1 = _mm_max_epi16(temp1, neg_255_8x16b);
2767     /* Saturate all values > 255 to 255 and retain the rest as it is */
2768     temp1 = _mm_min_epi16(temp1, pos_255_8x16b);
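    /* The accumulated residual (DC residue plus residual prediction) is thus
     * clamped to [-255, 255], the widest range a residual of 8 bit content can
     * take, before it is stored and used for reconstruction below */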
2769 
2770     /* Compute out pixel by adding res to pred */
2771     pred_r0 = _mm_add_epi16(temp0, pred_r0);
2772     pred_r2 = _mm_add_epi16(temp1, pred_r2);
2773 
2774     res_r0 = _mm_cvtepu16_epi32(temp0);
2775     res_r2 = _mm_cvtepu16_epi32(temp1);
2776     res_r1 = _mm_srli_si128(temp0, 8);
2777     res_r3 = _mm_srli_si128(temp1, 8);
2778     res_r1 = _mm_cvtepu16_epi32(res_r1);
2779     res_r3 = _mm_cvtepu16_epi32(res_r3);
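    /* As in the non-accumulating variant, zero extension places each clamped
     * residue in the low 16 bits of a 32 bit lane, ready to be merged into the
     * even slots of the interleaved residual buffer below */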
2780 
2781     /* Load res buffer */
2782     out_r0 = _mm_loadu_si128((__m128i *) (&pi2_res[0 * i4_res_stride]));
2783     out_r1 = _mm_loadu_si128((__m128i *) (&pi2_res[1 * i4_res_stride]));
2784     out_r2 = _mm_loadu_si128((__m128i *) (&pi2_res[2 * i4_res_stride]));
2785     out_r3 = _mm_loadu_si128((__m128i *) (&pi2_res[3 * i4_res_stride]));
2786 
2787     /* Mask the loaded res in order to save the U/V res data computed in
2788     this function call without thrashing the U/V res data that was saved
2789     during an earlier function call */
2790     chroma_mask = _mm_set1_epi32(0xffff0000);
2791 
2792     out_r0 = _mm_and_si128(out_r0, chroma_mask);
2793     out_r1 = _mm_and_si128(out_r1, chroma_mask);
2794     out_r2 = _mm_and_si128(out_r2, chroma_mask);
2795     out_r3 = _mm_and_si128(out_r3, chroma_mask);
2796 
2797     /* Save the res in alternate locations */
2798     out_r0 = _mm_add_epi16(out_r0, res_r0);
2799     out_r1 = _mm_add_epi16(out_r1, res_r1);
2800     out_r2 = _mm_add_epi16(out_r2, res_r2);
2801     out_r3 = _mm_add_epi16(out_r3, res_r3);
2802 
2803     _mm_storeu_si128((__m128i *) (&pi2_res[0 * i4_res_stride]), out_r0);
2804     _mm_storeu_si128((__m128i *) (&pi2_res[1 * i4_res_stride]), out_r1);
2805     _mm_storeu_si128((__m128i *) (&pi2_res[2 * i4_res_stride]), out_r2);
2806     _mm_storeu_si128((__m128i *) (&pi2_res[3 * i4_res_stride]), out_r3);
2807     /*------------------------------------------------------------------*/
2808     /* Clipping the results to 8 bits */
2809     pred_r0 = _mm_packus_epi16(pred_r0, pred_r2);
2810     pred_r1 = _mm_srli_si128(pred_r0, 4);
2811     pred_r2 = _mm_srli_si128(pred_r1, 4);
2812     pred_r3 = _mm_srli_si128(pred_r2, 4);
2813 
2814     /* p00 p01 p02 p03 -- all 16 bits */
2815     pred_r0 = _mm_unpacklo_epi8(pred_r0, zero_8x16b);
2816     /* p10 p11 p12 p13 -- all 16 bits */
2817     pred_r1 = _mm_unpacklo_epi8(pred_r1, zero_8x16b);
2818     /* p20 p21 p22 p23 -- all 16 bits */
2819     pred_r2 = _mm_unpacklo_epi8(pred_r2, zero_8x16b);
2820     /* p30 p31 p32 p33 -- all 16 bits */
2821     pred_r3 = _mm_unpacklo_epi8(pred_r3, zero_8x16b);
2822 
2823     /* Load interleaved out buffer */
2824     out_r0 = _mm_loadl_epi64((__m128i *) (&pu1_out[0]));
2825     out_r1 = _mm_loadl_epi64((__m128i *) (&pu1_out[i4_out_stride]));
2826     out_r2 = _mm_loadl_epi64((__m128i *) (&pu1_out[2 * i4_out_stride]));
2827     out_r3 = _mm_loadl_epi64((__m128i *) (&pu1_out[3 * i4_out_stride]));
2828 
2829     /* Mask the interleaved out buf in order to save the U/V out pixel computed in
2830     this function call without thrashing the U/V out pixel that was saved
2831     during an earlier function call */
2832     chroma_mask = _mm_set1_epi16(0xFF00);
2833 
2834     out_r0 = _mm_and_si128(out_r0, chroma_mask);
2835     out_r1 = _mm_and_si128(out_r1, chroma_mask);
2836     out_r2 = _mm_and_si128(out_r2, chroma_mask);
2837     out_r3 = _mm_and_si128(out_r3, chroma_mask);
2838 
2839     /* Save the out pixels in alternate locations */
2840     out_r0 = _mm_add_epi8(out_r0, pred_r0);
2841     out_r1 = _mm_add_epi8(out_r1, pred_r1);
2842     out_r2 = _mm_add_epi8(out_r2, pred_r2);
2843     out_r3 = _mm_add_epi8(out_r3, pred_r3);
2844 
2845     _mm_storel_epi64((__m128i *) (&pu1_out[0]), out_r0);
2846     _mm_storel_epi64((__m128i *) (&pu1_out[i4_out_stride]), out_r1);
2847     _mm_storel_epi64((__m128i *) (&pu1_out[2 * i4_out_stride]), out_r2);
2848     _mm_storel_epi64((__m128i *) (&pu1_out[3 * i4_out_stride]), out_r3);
2849 }
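/* Illustrative scalar sketch of the residual-accumulating DC-only chroma path
 * implemented by isvc_iquant_itrans_recon_res_chroma_4x4_dc_with_res_acc_sse42()
 * above. It is not part of the original implementation and is excluded from the
 * build; the function name is hypothetical. Interleaved buffers are assumed to
 * hold the component being processed at even element offsets, as implied by the
 * masks used above. */
#if 0
static void isvc_recon_res_chroma_4x4_dc_acc_scalar_sketch(
    WORD16 i2_dc_value, UWORD8 *pu1_pred, WORD16 *pi2_res_pred, WORD16 *pi2_res,
    UWORD8 *pu1_out, WORD32 i4_pred_stride, WORD32 i4_res_pred_stride, WORD32 i4_res_stride,
    WORD32 i4_out_stride)
{
    WORD32 i, j;
    WORD32 i4_dc_res = (i2_dc_value + 32) >> 6;

    for(i = 0; i < 4; i++)
    {
        for(j = 0; j < 4; j++)
        {
            /* Accumulate this layer's DC residue with the residual prediction
             * and clamp to the 8 bit residual range before storing it */
            WORD32 i4_res = i4_dc_res + pi2_res_pred[i * i4_res_pred_stride + 2 * j];
            WORD32 i4_rec;

            if(i4_res < -255) i4_res = -255;
            if(i4_res > 255) i4_res = 255;
            pi2_res[i * i4_res_stride + 2 * j] = (WORD16) i4_res;

            /* Reconstruct with the clamped residual and clip to [0, 255] */
            i4_rec = pu1_pred[i * i4_pred_stride + 2 * j] + i4_res;
            if(i4_rec < 0) i4_rec = 0;
            if(i4_rec > 255) i4_rec = 255;
            pu1_out[i * i4_out_stride + 2 * j] = (UWORD8) i4_rec;
        }
    }
}
#endif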
2850