/******************************************************************************
 *
 * Copyright (C) 2022 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 */
/**
 *******************************************************************************
 * @file
 *  isvc_iquant_itrans_recon_dc_ssse3.c
 *
 * @brief
 *  Contains function definitions for inverse quantization, inverse transform
 *  and reconstruction for DC-only coefficient blocks
 *
 * @author
 *  Mohit [100664]
 *
 * @par List of Functions:
 *  - isvc_iquant_itrans_recon_4x4_dc_ssse3()
 *  - isvc_iquant_itrans_recon_8x8_dc_ssse3()
 *  - isvc_iquant_itrans_recon_chroma_4x4_dc_ssse3()
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */
#include <immintrin.h>

#include "ih264_typedefs.h"
#include "ih264_debug.h"
#include "ih264_defs.h"
#include "ih264_trans_macros.h"
#include "ih264_macros.h"
#include "ih264_platform_macros.h"
#include "ih264_trans_data.h"
#include "ih264_size_defs.h"
#include "isvc_structs.h"
#include "isvc_trans_quant_itrans_iquant.h"

/*
 ********************************************************************************
 *
 * @brief This function reconstructs a 4x4 sub-block from the quantized residue
 * and prediction buffer for a DC-only input pattern, i.e. only the (0,0)
 * element of the input 4x4 block is non-zero. For the complete function, refer
 * to isvc_iquant_itrans_recon_ssse3.c
 *
 * @par Description:
 *  The quantized residue is first inverse quantized, then inverse transformed.
 *  This inverse transformed content is added to the prediction buffer to
 *  reconstruct the end output
 *
 * @param[in] ps_src
 *  Buffer containing the quantized 4x4 block
 *
 * @param[in] ps_pred
 *  Buffer containing the 4x4 prediction block
 *
 * @param[in] ps_res_pred
 *  Buffer containing the predicted residue (unused in this variant)
 *
 * @param[in] ps_res
 *  Buffer for the output residue (unused in this variant)
 *
 * @param[out] ps_rec
 *  Buffer for the reconstructed 4x4 block
 *
 * @param[in] ps_iq_it_res_rec_constants
 *  Inverse quantization constants (scaling list, inverse scale matrix and
 *  Floor(qp / 6))
 *
 * @param[in] pi2_tmp
 *  Temporary buffer of size 1 * 16
 *
 * @param[in] pi2_dc_src
 *  Pointer to the transformed DC value used for intra blocks
 *
 * @param[in] i4_iq_start_idx
 *  Start index for inverse quantization; when non-zero, the DC value is taken
 *  from pi2_dc_src instead of being dequantized
 *
 * @param[in] u1_res_accumulate
 *  Flag to enable residue accumulation (not implemented in this variant)
 *
 * @returns none
 *
 * @remarks none
 *
 *******************************************************************************
 */
void isvc_iquant_itrans_recon_4x4_dc_ssse3(buffer_container_t *ps_src, buffer_container_t *ps_pred,
                                           buffer_container_t *ps_res_pred,
                                           buffer_container_t *ps_res, buffer_container_t *ps_rec,
                                           iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants,
                                           WORD16 *pi2_tmp, WORD16 *pi2_dc_src,
                                           WORD32 i4_iq_start_idx, UWORD8 u1_res_accumulate)
{
    WORD16 *pi2_src = ps_src->pv_data;
    WORD16 *pi2_res = ps_res->pv_data;
    WORD16 *pi2_res_pred = ps_res_pred->pv_data;
    UWORD8 *pu1_pred = ps_pred->pv_data;
    UWORD8 *pu1_out = ps_rec->pv_data;
    WORD32 i4_src_stride = ps_src->i4_data_stride;
    WORD32 i4_res_stride = ps_res->i4_data_stride;
    WORD32 i4_res_pred_stride = ps_res_pred->i4_data_stride;
    WORD32 i4_pred_stride = ps_pred->i4_data_stride;
    WORD32 i4_out_stride = ps_rec->i4_data_stride;
    const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat;
    const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat;
    UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6;
    UWORD32 *pu4_out = (UWORD32 *) pu1_out;
    WORD32 q0 = pi2_src[0];
    WORD16 i_macro, rnd_fact = (u4_qp_div_6 < 4) ? 1 << (3 - u4_qp_div_6) : 0;

    __m128i predload_r, pred_r0, pred_r1, pred_r2, pred_r3;
    __m128i sign_reg;
    __m128i zero_8x16b = _mm_setzero_si128();  // all bits reset to zero
    __m128i temp4, temp5, temp6, temp7;
    __m128i value_add;

    UNUSED(pi2_tmp);
    UNUSED(u1_res_accumulate);
    UNUSED(i4_src_stride);
    UNUSED(i4_res_stride);
    UNUSED(i4_res_pred_stride);
    UNUSED(pi2_res);
    UNUSED(pi2_res_pred);
    UNUSED(i4_iq_start_idx);

    /* TODO: Residue accumulation is not implemented in this variant */
    ASSERT(0);

    INV_QUANT(q0, pu2_iscal_mat[0], pu2_weigh_mat[0], u4_qp_div_6, rnd_fact, 4);

    if(i4_iq_start_idx != 0) q0 = pi2_dc_src[0];  // Restoring DC value for the intra case

    i_macro = ((q0 + 32) >> 6);

    value_add = _mm_set1_epi16(i_macro);

    zero_8x16b = _mm_setzero_si128();  // all bits reset to zero
    // Load pred buffer
    predload_r = _mm_loadl_epi64((__m128i *) (&pu1_pred[0]));  // p00 p01 p02 p03 0 0 0 0 0
                                                               // 0 0 0 -- all 8 bits
    pred_r0 = _mm_unpacklo_epi8(predload_r, zero_8x16b);  // p00 p01 p02 p03 0 0 0 0 -- all 16 bits
    predload_r =
        _mm_loadl_epi64((__m128i *) (&pu1_pred[i4_pred_stride]));  // p10 p11 p12 p13 0 0 0 0 0 0
                                                                   // 0 0 -- all 8 bits
    pred_r1 = _mm_unpacklo_epi8(predload_r, zero_8x16b);  // p10 p11 p12 p13 0 0 0 0 -- all 16 bits
    predload_r =
        _mm_loadl_epi64((__m128i *) (&pu1_pred[2 * i4_pred_stride]));  // p20 p21 p22 p23 0 0 0 0
                                                                       // 0 0 0 0 -- all 8 bits
    pred_r2 = _mm_unpacklo_epi8(predload_r, zero_8x16b);  // p20 p21 p22 p23 0 0 0 0 -- all 16 bits
    predload_r =
        _mm_loadl_epi64((__m128i *) (&pu1_pred[3 * i4_pred_stride]));  // p30 p31 p32 p33 0 0 0 0
                                                                       // 0 0 0 0 -- all 8 bits
    pred_r3 = _mm_unpacklo_epi8(predload_r, zero_8x16b);  // p30 p31 p32 p33 0 0 0 0 -- all 16 bits

    pred_r0 = _mm_unpacklo_epi64(pred_r0, pred_r1);  // p00 p01 p02 p03 p10 p11 p12 p13
    pred_r2 = _mm_unpacklo_epi64(pred_r2, pred_r3);  // p20 p21 p22 p23 p30 p31 p32 p33

    temp4 = _mm_add_epi16(value_add, pred_r0);
    temp5 = _mm_add_epi16(value_add, pred_r2);
    /*------------------------------------------------------------------*/
    // Clipping the results to 8 bits
    sign_reg = _mm_cmpgt_epi16(temp4, zero_8x16b);  // sign check
    temp4 = _mm_and_si128(temp4, sign_reg);
    sign_reg = _mm_cmpgt_epi16(temp5, zero_8x16b);  // sign check
    temp5 = _mm_and_si128(temp5, sign_reg);

    temp4 = _mm_packus_epi16(temp4, temp5);
    temp5 = _mm_srli_si128(temp4, 4);
    temp6 = _mm_srli_si128(temp5, 4);
    temp7 = _mm_srli_si128(temp6, 4);

    *pu4_out = _mm_cvtsi128_si32(temp4);
    pu1_out += i4_out_stride;
    pu4_out = (UWORD32 *) (pu1_out);
    *(pu4_out) = _mm_cvtsi128_si32(temp5);
    pu1_out += i4_out_stride;
    pu4_out = (UWORD32 *) (pu1_out);
    *(pu4_out) = _mm_cvtsi128_si32(temp6);
    pu1_out += i4_out_stride;
    pu4_out = (UWORD32 *) (pu1_out);
    *(pu4_out) = _mm_cvtsi128_si32(temp7);
}
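
/* Editorial sketch, not part of the original file: a plain-C reference for the
 * DC-only 4x4 luma path above, under the assumption that reconstruction here
 * amounts to "dequantize the single DC coefficient, apply the final
 * (x + 32) >> 6 rounding of the inverse transform, add that constant to every
 * prediction sample and clip to [0, 255]". The helper name and its flattened
 * parameter list are hypothetical; the real entry point takes the
 * buffer_container_t arguments shown above and ignores the residue buffers,
 * just as this sketch does. */
static void isvc_iquant_itrans_recon_4x4_dc_scalar_sketch(
    const WORD16 *pi2_src, const UWORD8 *pu1_pred, UWORD8 *pu1_out, WORD32 i4_pred_stride,
    WORD32 i4_out_stride, const UWORD16 *pu2_iscal_mat, const UWORD16 *pu2_weigh_mat,
    UWORD32 u4_qp_div_6, const WORD16 *pi2_dc_src, WORD32 i4_iq_start_idx)
{
    WORD32 i, j;
    WORD32 q0 = pi2_src[0];
    WORD16 rnd_fact = (u4_qp_div_6 < 4) ? 1 << (3 - u4_qp_div_6) : 0;
    WORD32 i_macro;

    /* Inverse quantization of the lone non-zero coefficient */
    INV_QUANT(q0, pu2_iscal_mat[0], pu2_weigh_mat[0], u4_qp_div_6, rnd_fact, 4);

    /* For intra blocks the DC value comes from pi2_dc_src, as in the SIMD path */
    if(i4_iq_start_idx != 0) q0 = pi2_dc_src[0];

    /* With a DC-only input, the inverse transform collapses to one rounded constant */
    i_macro = (q0 + 32) >> 6;

    for(i = 0; i < 4; i++)
    {
        for(j = 0; j < 4; j++)
        {
            WORD32 val = pu1_pred[i * i4_pred_stride + j] + i_macro;

            val = (val < 0) ? 0 : ((val > 255) ? 255 : val);
            pu1_out[i * i4_out_stride + j] = (UWORD8) val;
        }
    }
}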

/**
 *******************************************************************************
 *
 * @brief
 *  This function performs inverse quantization and inverse transform of type
 *  Ci8 for an 8x8 block for a DC-only input pattern, i.e. only the (0,0)
 *  element of the input 8x8 block is non-zero. For the complete function,
 *  refer to isvc_iquant_itrans_recon_ssse3.c
 *
 * @par Description:
 *  Performs inverse transform Ci8 and adds the residue to get the
 *  reconstructed block
 *
 * @param[in] ps_src
 *  Buffer containing the input 8x8 coefficients
 *
 * @param[in] ps_pred
 *  Buffer containing the 8x8 prediction block
 *
 * @param[in] ps_res_pred
 *  Buffer containing the predicted residue (unused in this variant)
 *
 * @param[in] ps_res
 *  Buffer for the output residue (unused in this variant)
 *
 * @param[out] ps_rec
 *  Buffer for the reconstructed 8x8 block
 *
 * @param[in] ps_iq_it_res_rec_constants
 *  Inverse quantization constants (scaling list, inverse scale matrix and
 *  Floor(qp / 6))
 *
 * @param[in] pi2_tmp
 *  Temporary buffer of size 1 * 64
 *
 * @param[in] pi2_dc_src
 *  Pointer to the transformed DC value (unused in this variant)
 *
 * @param[in] i4_iq_start_idx
 *  Start index for inverse quantization (unused in this variant)
 *
 * @param[in] u1_res_accumulate
 *  Flag to enable residue accumulation (not implemented in this variant)
 *
 * @returns  Void
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */

void isvc_iquant_itrans_recon_8x8_dc_ssse3(buffer_container_t *ps_src, buffer_container_t *ps_pred,
                                           buffer_container_t *ps_res_pred,
                                           buffer_container_t *ps_res, buffer_container_t *ps_rec,
                                           iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants,
                                           WORD16 *pi2_tmp, WORD16 *pi2_dc_src,
                                           WORD32 i4_iq_start_idx, UWORD8 u1_res_accumulate)
{
    WORD16 *pi2_src = ps_src->pv_data;
    WORD16 *pi2_res = ps_res->pv_data;
    WORD16 *pi2_res_pred = ps_res_pred->pv_data;
    UWORD8 *pu1_pred = ps_pred->pv_data;
    UWORD8 *pu1_out = ps_rec->pv_data;
    WORD32 i4_src_stride = ps_src->i4_data_stride;
    WORD32 i4_res_stride = ps_res->i4_data_stride;
    WORD32 i4_res_pred_stride = ps_res_pred->i4_data_stride;
    WORD32 i4_pred_stride = ps_pred->i4_data_stride;
    WORD32 i4_out_stride = ps_rec->i4_data_stride;
    const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat;
    const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat;
    UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6;
    WORD32 q0 = pi2_src[0];
    WORD16 i_macro, rnd_fact = (u4_qp_div_6 < 6) ? 1 << (5 - u4_qp_div_6) : 0;

    __m128i predload_r, pred_r0, pred_r1, pred_r2, pred_r3, pred_r4, pred_r5, pred_r6, pred_r7;
    __m128i sign_reg;
    __m128i zero_8x16b = _mm_setzero_si128();  // all bits reset to zero
    __m128i temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
    __m128i value_add;

    UNUSED(pi2_tmp);
    UNUSED(pi2_dc_src);
    UNUSED(u1_res_accumulate);
    UNUSED(i4_src_stride);
    UNUSED(i4_res_stride);
    UNUSED(i4_res_pred_stride);
    UNUSED(pi2_res);
    UNUSED(pi2_res_pred);
    UNUSED(i4_iq_start_idx);

    /* TODO: Residue accumulation is not implemented in this variant */
    ASSERT(0);

    INV_QUANT(q0, pu2_iscal_mat[0], pu2_weigh_mat[0], u4_qp_div_6, rnd_fact, 6);
    i_macro = ((q0 + 32) >> 6);

    value_add = _mm_set1_epi16(i_macro);

    // Load pred buffer row 0
    predload_r =
        _mm_loadl_epi64((__m128i *) (&pu1_pred[0]));      // p0 p1 p2 p3 p4 p5 p6 p7 0 0 0 0 0 0 0 0
                                                          // -- all 8 bits
    pred_r0 = _mm_unpacklo_epi8(predload_r, zero_8x16b);  // p0 p1 p2 p3 p4 p5 p6 p7 -- all 16 bits
    // Load pred buffer row 1
    predload_r =
        _mm_loadl_epi64((__m128i *) (&pu1_pred[i4_pred_stride]));  // p0 p1 p2 p3 p4 p5 p6 p7 0 0
                                                                   // 0 0 0 0 0 0 -- all 8 bits
    pred_r1 = _mm_unpacklo_epi8(predload_r, zero_8x16b);  // p0 p1 p2 p3 p4 p5 p6 p7 -- all 16 bits
    // Load pred buffer row 2
    predload_r = _mm_loadl_epi64(
        (__m128i *) (&pu1_pred[2 * i4_pred_stride]));     // p0 p1 p2 p3 p4 p5 p6 p7 0 0
                                                          // 0 0 0 0 0 0 -- all 8 bits
    pred_r2 = _mm_unpacklo_epi8(predload_r, zero_8x16b);  // p0 p1 p2 p3 p4 p5 p6 p7 -- all 16 bits
    // Load pred buffer row 3
    predload_r = _mm_loadl_epi64(
        (__m128i *) (&pu1_pred[3 * i4_pred_stride]));     // p0 p1 p2 p3 p4 p5 p6 p7 0 0
                                                          // 0 0 0 0 0 0 -- all 8 bits
    pred_r3 = _mm_unpacklo_epi8(predload_r, zero_8x16b);  // p0 p1 p2 p3 p4 p5 p6 p7 -- all 16 bits
    // Load pred buffer row 4
    predload_r = _mm_loadl_epi64(
        (__m128i *) (&pu1_pred[4 * i4_pred_stride]));     // p0 p1 p2 p3 p4 p5 p6 p7 0 0
                                                          // 0 0 0 0 0 0 -- all 8 bits
    pred_r4 = _mm_unpacklo_epi8(predload_r, zero_8x16b);  // p0 p1 p2 p3 p4 p5 p6 p7 -- all 16 bits
    // Load pred buffer row 5
    predload_r =
        _mm_loadl_epi64((__m128i *) (&pu1_pred[5 * i4_pred_stride]));  // p0 p1 p2 p3 p4 p5 p6 p7 0
                                                                       // 0 0 0 0 0 0 0 -- all 8 bits
    pred_r5 = _mm_unpacklo_epi8(predload_r, zero_8x16b);  // p0 p1 p2 p3 p4 p5 p6 p7 -- all 16 bits
    // Load pred buffer row 6
    predload_r = _mm_loadl_epi64(
        (__m128i *) (&pu1_pred[6 * i4_pred_stride]));     // p0 p1 p2 p3 p4 p5 p6 p7 0 0
                                                          // 0 0 0 0 0 0 -- all 8 bits
    pred_r6 = _mm_unpacklo_epi8(predload_r, zero_8x16b);  // p0 p1 p2 p3 p4 p5 p6 p7 -- all 16 bits
    // Load pred buffer row 7
    predload_r = _mm_loadl_epi64(
        (__m128i *) (&pu1_pred[7 * i4_pred_stride]));     // p0 p1 p2 p3 p4 p5 p6 p7 0 0
                                                          // 0 0 0 0 0 0 -- all 8 bits
    pred_r7 = _mm_unpacklo_epi8(predload_r, zero_8x16b);  // p0 p1 p2 p3 p4 p5 p6 p7 -- all 16 bits

    temp1 = _mm_add_epi16(value_add, pred_r0);

    temp2 = _mm_add_epi16(value_add, pred_r1);

    temp3 = _mm_add_epi16(value_add, pred_r2);

    temp4 = _mm_add_epi16(value_add, pred_r3);

    temp5 = _mm_add_epi16(value_add, pred_r4);

    temp6 = _mm_add_epi16(value_add, pred_r5);

    temp7 = _mm_add_epi16(value_add, pred_r6);

    temp8 = _mm_add_epi16(value_add, pred_r7);
    /*------------------------------------------------------------------*/
    // Clipping the results to 8 bits
    sign_reg = _mm_cmpgt_epi16(temp1, zero_8x16b);  // sign check
    temp1 = _mm_and_si128(temp1, sign_reg);
    sign_reg = _mm_cmpgt_epi16(temp2, zero_8x16b);  // sign check
    temp2 = _mm_and_si128(temp2, sign_reg);
    sign_reg = _mm_cmpgt_epi16(temp3, zero_8x16b);  // sign check
    temp3 = _mm_and_si128(temp3, sign_reg);
    sign_reg = _mm_cmpgt_epi16(temp4, zero_8x16b);  // sign check
    temp4 = _mm_and_si128(temp4, sign_reg);
    sign_reg = _mm_cmpgt_epi16(temp5, zero_8x16b);  // sign check
    temp5 = _mm_and_si128(temp5, sign_reg);
    sign_reg = _mm_cmpgt_epi16(temp6, zero_8x16b);  // sign check
    temp6 = _mm_and_si128(temp6, sign_reg);
    sign_reg = _mm_cmpgt_epi16(temp7, zero_8x16b);  // sign check
    temp7 = _mm_and_si128(temp7, sign_reg);
    sign_reg = _mm_cmpgt_epi16(temp8, zero_8x16b);  // sign check
    temp8 = _mm_and_si128(temp8, sign_reg);

    temp1 = _mm_packus_epi16(temp1, zero_8x16b);
    temp2 = _mm_packus_epi16(temp2, zero_8x16b);
    temp3 = _mm_packus_epi16(temp3, zero_8x16b);
    temp4 = _mm_packus_epi16(temp4, zero_8x16b);
    temp5 = _mm_packus_epi16(temp5, zero_8x16b);
    temp6 = _mm_packus_epi16(temp6, zero_8x16b);
    temp7 = _mm_packus_epi16(temp7, zero_8x16b);
    temp8 = _mm_packus_epi16(temp8, zero_8x16b);

    _mm_storel_epi64((__m128i *) (&pu1_out[0]), temp1);
    _mm_storel_epi64((__m128i *) (&pu1_out[i4_out_stride]), temp2);
    _mm_storel_epi64((__m128i *) (&pu1_out[2 * i4_out_stride]), temp3);
    _mm_storel_epi64((__m128i *) (&pu1_out[3 * i4_out_stride]), temp4);
    _mm_storel_epi64((__m128i *) (&pu1_out[4 * i4_out_stride]), temp5);
    _mm_storel_epi64((__m128i *) (&pu1_out[5 * i4_out_stride]), temp6);
    _mm_storel_epi64((__m128i *) (&pu1_out[6 * i4_out_stride]), temp7);
    _mm_storel_epi64((__m128i *) (&pu1_out[7 * i4_out_stride]), temp8);
}
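
/* Editorial sketch, not part of the original file: the per-row scalar
 * equivalent of the 8x8 DC-only reconstruction above. With only the (0,0)
 * coefficient non-zero, the inverse transform reduces to the constant
 * (q0 + 32) >> 6, so each output sample is the corresponding prediction sample
 * plus that constant, clipped to 8 bits. The cmpgt/and plus packus sequence
 * above performs the same clipping in SIMD. The helper name is hypothetical. */
static void isvc_recon_8x8_dc_row_scalar_sketch(const UWORD8 *pu1_pred_row, UWORD8 *pu1_out_row,
                                                WORD32 i_macro)
{
    WORD32 j;

    for(j = 0; j < 8; j++)
    {
        WORD32 val = pu1_pred_row[j] + i_macro;

        pu1_out_row[j] = (UWORD8) ((val < 0) ? 0 : ((val > 255) ? 255 : val));
    }
}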

/*
 ********************************************************************************
 *
 * @brief This function reconstructs a 4x4 sub-block from the quantized chroma
 * residue and prediction buffer for a DC-only input pattern
 *
 * @par Description:
 *  The quantized residue is first inverse quantized, then inverse transformed.
 *  This inverse transformed content is added to the prediction buffer to
 *  reconstruct the end output
 *
 * @param[in] ps_src
 *  Buffer containing the quantized 4x4 block
 *
 * @param[in] ps_pred
 *  Buffer containing the 4x4 prediction block (interleaved chroma)
 *
 * @param[in] ps_res_pred
 *  Buffer containing the predicted residue (unused in this variant)
 *
 * @param[in] ps_res
 *  Buffer for the output residue (unused in this variant)
 *
 * @param[out] ps_rec
 *  Buffer for the reconstructed 4x4 block (interleaved chroma)
 *
 * @param[in] ps_iq_it_res_rec_constants
 *  Inverse quantization constants (scaling list, inverse scale matrix and
 *  Floor(qp / 6)); unused here since the chroma DC value is not dequantized
 *
 * @param[in] pi2_tmp
 *  Temporary buffer of size 1 * 16
 *
 * @param[in] pi2_dc_src
 *  Pointer to the transformed DC value used for reconstruction
 *
 * @param[in] i4_iq_start_idx
 *  Start index for inverse quantization (unused in this variant)
 *
 * @param[in] u1_res_accumulate
 *  Flag to enable residue accumulation (not implemented in this variant)
 *
 * @returns none
 *
 * @remarks none
 *
 *******************************************************************************
 */
void isvc_iquant_itrans_recon_chroma_4x4_dc_ssse3(
    buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_res_pred,
    buffer_container_t *ps_res, buffer_container_t *ps_rec,
    iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants, WORD16 *pi2_tmp, WORD16 *pi2_dc_src,
    WORD32 i4_iq_start_idx, UWORD8 u1_res_accumulate)
{
    WORD16 *pi2_src = ps_src->pv_data;
    WORD16 *pi2_res = ps_res->pv_data;
    WORD16 *pi2_res_pred = ps_res_pred->pv_data;
    UWORD8 *pu1_pred = ps_pred->pv_data;
    UWORD8 *pu1_out = ps_rec->pv_data;
    WORD32 i4_src_stride = ps_src->i4_data_stride;
    WORD32 i4_res_stride = ps_res->i4_data_stride;
    WORD32 i4_res_pred_stride = ps_res_pred->i4_data_stride;
    WORD32 i4_pred_stride = ps_pred->i4_data_stride;
    WORD32 i4_out_stride = ps_rec->i4_data_stride;
    const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat;
    const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat;
    UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6;
    WORD16 q0 = pi2_dc_src[0];  // DC value won't be dequantized for chroma
                                // inverse transform
    WORD16 i_macro = ((q0 + 32) >> 6);

    __m128i pred_r0, pred_r1, pred_r2, pred_r3, sign_reg;
    __m128i zero_8x16b = _mm_setzero_si128();  // all bits reset to zero
    __m128i chroma_mask = _mm_set1_epi16(0xFF);
    __m128i value_add = _mm_set1_epi16(i_macro);
    __m128i out_r0, out_r1, out_r2, out_r3;

    UNUSED(pi2_src);
    UNUSED(pu2_iscal_mat);
    UNUSED(pu2_weigh_mat);
    UNUSED(u4_qp_div_6);
    UNUSED(pi2_tmp);
    UNUSED(u1_res_accumulate);
    UNUSED(i4_src_stride);
    UNUSED(i4_res_stride);
    UNUSED(i4_res_pred_stride);
    UNUSED(pi2_res);
    UNUSED(pi2_res_pred);
    UNUSED(i4_iq_start_idx);

    /* TODO: Residue accumulation is not implemented in this variant */
    ASSERT(0);

    // Load pred buffer
    pred_r0 = _mm_loadl_epi64((__m128i *) (&pu1_pred[0]));  // p00 p01 p02 p03 0 0 0 0 0
                                                            // 0 0 0 -- all 8 bits
    pred_r1 = _mm_loadl_epi64((__m128i *) (&pu1_pred[i4_pred_stride]));  // p10 p11 p12 p13 0 0 0 0
                                                                         // 0 0 0 0 -- all 8 bits
    pred_r2 =
        _mm_loadl_epi64((__m128i *) (&pu1_pred[2 * i4_pred_stride]));  // p20 p21 p22 p23 0 0 0 0
                                                                       // 0 0 0 0 -- all 8 bits
    pred_r3 =
        _mm_loadl_epi64((__m128i *) (&pu1_pred[3 * i4_pred_stride]));  // p30 p31 p32 p33 0 0 0 0
                                                                       // 0 0 0 0 -- all 8 bits

    pred_r0 = _mm_and_si128(pred_r0, chroma_mask);
    pred_r1 = _mm_and_si128(pred_r1, chroma_mask);
    pred_r2 = _mm_and_si128(pred_r2, chroma_mask);
    pred_r3 = _mm_and_si128(pred_r3, chroma_mask);

    pred_r0 = _mm_unpacklo_epi64(pred_r0, pred_r1);  // p00 p01 p02 p03 p10 p11 p12 p13
    pred_r2 = _mm_unpacklo_epi64(pred_r2, pred_r3);  // p20 p21 p22 p23 p30 p31 p32 p33

    pred_r0 = _mm_add_epi16(value_add, pred_r0);
    pred_r2 = _mm_add_epi16(value_add, pred_r2);

    /*------------------------------------------------------------------*/
    // Clipping the results to 8 bits
    sign_reg = _mm_cmpgt_epi16(pred_r0, zero_8x16b);  // sign check
    pred_r0 = _mm_and_si128(pred_r0, sign_reg);
    sign_reg = _mm_cmpgt_epi16(pred_r2, zero_8x16b);
    pred_r2 = _mm_and_si128(pred_r2, sign_reg);

    pred_r0 = _mm_packus_epi16(pred_r0, pred_r2);
    pred_r1 = _mm_srli_si128(pred_r0, 4);
    pred_r2 = _mm_srli_si128(pred_r1, 4);
    pred_r3 = _mm_srli_si128(pred_r2, 4);

    pred_r0 = _mm_unpacklo_epi8(pred_r0, zero_8x16b);  // p00 p01 p02 p03 -- all 16 bits
    pred_r1 = _mm_unpacklo_epi8(pred_r1, zero_8x16b);  // p10 p11 p12 p13 -- all 16 bits
    pred_r2 = _mm_unpacklo_epi8(pred_r2, zero_8x16b);  // p20 p21 p22 p23 -- all 16 bits
    pred_r3 = _mm_unpacklo_epi8(pred_r3, zero_8x16b);  // p30 p31 p32 p33 -- all 16 bits

    chroma_mask = _mm_set1_epi16(0xFF00);
    out_r0 = _mm_loadl_epi64((__m128i *) (&pu1_out[0]));
    out_r1 = _mm_loadl_epi64((__m128i *) (&pu1_out[i4_out_stride]));
    out_r2 = _mm_loadl_epi64((__m128i *) (&pu1_out[2 * i4_out_stride]));
    out_r3 = _mm_loadl_epi64((__m128i *) (&pu1_out[3 * i4_out_stride]));

    out_r0 = _mm_and_si128(out_r0, chroma_mask);
    out_r1 = _mm_and_si128(out_r1, chroma_mask);
    out_r2 = _mm_and_si128(out_r2, chroma_mask);
    out_r3 = _mm_and_si128(out_r3, chroma_mask);

    out_r0 = _mm_add_epi8(out_r0, pred_r0);
    out_r1 = _mm_add_epi8(out_r1, pred_r1);
    out_r2 = _mm_add_epi8(out_r2, pred_r2);
    out_r3 = _mm_add_epi8(out_r3, pred_r3);

    _mm_storel_epi64((__m128i *) (&pu1_out[0]), out_r0);
    _mm_storel_epi64((__m128i *) (&pu1_out[i4_out_stride]), out_r1);
    _mm_storel_epi64((__m128i *) (&pu1_out[2 * i4_out_stride]), out_r2);
    _mm_storel_epi64((__m128i *) (&pu1_out[3 * i4_out_stride]), out_r3);
}
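
/* Editorial sketch, not part of the original file: a scalar view of the chroma
 * DC-only path above, under the assumption that the chroma prediction and
 * recon buffers hold interleaved chroma samples. One plane is reconstructed
 * per call, so only every second byte of each output row is written; the
 * co-located samples of the other plane (preserved through the 0xFF00 mask in
 * the SIMD code) are left unchanged. Here i_macro would be
 * (pi2_dc_src[0] + 32) >> 6, i.e. the DC value without dequantization, exactly
 * as computed above. The helper name is hypothetical. */
static void isvc_recon_chroma_4x4_dc_scalar_sketch(const UWORD8 *pu1_pred, UWORD8 *pu1_out,
                                                   WORD32 i4_pred_stride, WORD32 i4_out_stride,
                                                   WORD32 i_macro)
{
    WORD32 i, j;

    for(i = 0; i < 4; i++)
    {
        for(j = 0; j < 4; j++)
        {
            WORD32 val = pu1_pred[i * i4_pred_stride + 2 * j] + i_macro;

            val = (val < 0) ? 0 : ((val > 255) ? 255 : val);
            /* Write the current plane only; the interleaved sample of the
             * other plane at offset 2 * j + 1 stays as it is */
            pu1_out[i * i4_out_stride + 2 * j] = (UWORD8) val;
        }
    }
}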