/******************************************************************************
 *
 * Copyright (C) 2022 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 */
/**
 *******************************************************************************
 * @file
 *  isvc_iquant_itrans_recon_ssse3.c
 *
 * @brief
 *  Contains function definitions for inverse quantization, inverse
 *  transform and reconstruction
 *
 * @author
 *  Mohit [100664]
 *
 * @par List of Functions:
 *  - isvc_iquant_itrans_recon_4x4_ssse3()
 *  - isvc_iquant_itrans_recon_8x8_ssse3()
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */
#include <immintrin.h>

#include "ih264_typedefs.h"
#include "ih264_debug.h"
#include "ih264_defs.h"
#include "ih264_trans_macros.h"
#include "ih264_macros.h"
#include "ih264_platform_macros.h"
#include "ih264_trans_data.h"
#include "ih264_size_defs.h"
#include "isvc_structs.h"
#include "isvc_trans_quant_itrans_iquant.h"

/*
 ********************************************************************************
 *
 * @brief This function reconstructs a 4x4 sub block from quantized residue and
 * prediction buffer
 *
 * @par Description:
 *  The quantized residue is first inverse quantized, then inverse transformed.
 *  This inverse transformed content is added to the prediction buffer to
 *  reconstruct the end output
 *
 * @param[in] ps_src
 *  Buffer container for the quantized 4x4 block
 *
 * @param[in] ps_pred
 *  Buffer container for the 4x4 prediction block
 *
 * @param[in] ps_res_pred
 *  Buffer container for the residual prediction block (unused here)
 *
 * @param[in] ps_res
 *  Buffer container for the output residue (unused here)
 *
 * @param[out] ps_rec
 *  Buffer container for the reconstructed 4x4 block
 *
 * @param[in] ps_iq_it_res_rec_constants
 *  Inverse quantization constants - inverse scale matrix (pu2_iscal_mat),
 *  weighing matrix (pu2_weigh_mat) and Floor(qp / 6) (u4_qp_div_6)
 *
 * @param[in] pi2_tmp
 *  Temporary buffer of size 1 * 16 (unused here)
 *
 * @param[in] pi2_dc_src
 *  Pointer to the DC coefficient source (unused here)
 *
 * @param[in] i4_iq_start_idx
 *  Index of the first coefficient to be inverse quantized; 1 implies the DC
 *  coefficient is taken from the source as-is
 *
 * @param[in] u1_res_accumulate
 *  Flag to enable residue accumulation (unused here)
 *
 * @returns none
 *
 * @remarks none
 *
 *******************************************************************************
 */
void isvc_iquant_itrans_recon_4x4_ssse3(buffer_container_t *ps_src, buffer_container_t *ps_pred,
                                        buffer_container_t *ps_res_pred, buffer_container_t *ps_res,
                                        buffer_container_t *ps_rec,
                                        iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants,
                                        WORD16 *pi2_tmp, WORD16 *pi2_dc_src, WORD32 i4_iq_start_idx,
                                        UWORD8 u1_res_accumulate)
{
    WORD16 *pi2_src = ps_src->pv_data;
    WORD16 *pi2_res = ps_res->pv_data;
    WORD16 *pi2_res_pred = ps_res_pred->pv_data;
    UWORD8 *pu1_pred = ps_pred->pv_data;
    UWORD8 *pu1_out = ps_rec->pv_data;
    WORD32 i4_src_stride = ps_src->i4_data_stride;
    WORD32 i4_res_stride = ps_res->i4_data_stride;
    WORD32 i4_res_pred_stride = ps_res_pred->i4_data_stride;
    WORD32 i4_pred_stride = ps_pred->i4_data_stride;
    WORD32 i4_out_stride = ps_rec->i4_data_stride;
    const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat;
    const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat;
    UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6;
    UWORD32 *pu4_out = (UWORD32 *) pu1_out;
    __m128i src_r0_r1, src_r2_r3;
    __m128i src_r0, src_r1, src_r2, src_r3;
    __m128i scalemat_r0_r1, scalemat_r2_r3, predload_r;
    __m128i pred_r0, pred_r1, pred_r2, pred_r3;
    __m128i sign_reg, dequant_r0_r1, dequant_r2_r3;
    __m128i zero_8x16b = _mm_setzero_si128();  // all bits reset to zero
    __m128i temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
    __m128i resq_r0, resq_r1, resq_r2, resq_r3;
    __m128i add_rshift = _mm_set1_epi32((u4_qp_div_6 < 4) ? (1 << (3 - u4_qp_div_6)) : 0);
    __m128i value_32 = _mm_set1_epi32(32);

    UNUSED(pi2_tmp);
    UNUSED(pi2_dc_src);
    UNUSED(u1_res_accumulate);
    UNUSED(i4_src_stride);
    UNUSED(i4_res_stride);
    UNUSED(i4_res_pred_stride);
    UNUSED(pi2_res);
    UNUSED(pi2_res_pred);

    /* TODO: Implement residue accumulation */
    ASSERT(0);

    /*************************************************************/
    /* Dequantization of coefficients                            */
    /*************************************************************/
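    /* Scalar view of the dequant below (illustrative, from a reading of the
     * code): resq[i] = src[i] * iscal[i] * weigh[i], scaled by 2^(qp/6 - 4);
     * when qp/6 < 4 the right shift is rounded via add_rshift. */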
    src_r0_r1 = _mm_loadu_si128((__m128i *) (pi2_src));  // a00 a01 a02 a03 a10 a11 a12 a13 -- the
                                                         // source matrix 0th,1st row
    src_r2_r3 = _mm_loadu_si128((__m128i *) (pi2_src + 8));  // a20 a21 a22 a23 a30 a31 a32 a33 --
                                                             // the source matrix 2nd,3rd row
    scalemat_r0_r1 =
        _mm_loadu_si128((__m128i *) (pu2_iscal_mat));  // b00 b01 b02 b03 b10 b11 b12 b13 -- the
                                                       // scaling matrix 0th,1st row
    scalemat_r2_r3 =
        _mm_loadu_si128((__m128i *) (pu2_iscal_mat + 8));  // b20 b21 b22 b23 b30 b31 b32 b33 --
                                                           // the scaling matrix 2nd,3rd row
    dequant_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_weigh_mat));  // q00 q01 q02 q03 q10 q11
                                                                   // q12 q13 -- all 16 bits
    dequant_r2_r3 = _mm_loadu_si128(
        (__m128i *) (pu2_weigh_mat + 8));  // q20 q21 q22 q23 q30 q31 q32 q33 -- all 16 bits

    temp0 = _mm_mullo_epi16(scalemat_r0_r1,
                            dequant_r0_r1);  // b00*q00 b01*q01 b02*q02 b03*q03 b10*q10 b11*q11
                                             // b12*q12 b13*q13 -- 16 bit result
    temp1 = _mm_mullo_epi16(scalemat_r2_r3,
                            dequant_r2_r3);  // b20*q20 b21*q21 b22*q22 b23*q23 b30*q30 b31*q31
                                             // b32*q32 b33*q33 -- 16 bit result

    temp4 =
        _mm_unpacklo_epi16(temp0,
                           zero_8x16b);  // b00*q00 0 b01*q01 0 b02*q02 0 b03*q03 0 -- 16 bit long
    temp5 =
        _mm_unpackhi_epi16(temp0,
                           zero_8x16b);  // b10*q10 0 b11*q11 0 b12*q12 0 b13*q13 0 -- 16 bit long
    temp6 =
        _mm_unpacklo_epi16(temp1,
                           zero_8x16b);  // b20*q20 0 b21*q21 0 b22*q22 0 b23*q23 0 -- 16 bit long
    temp7 =
        _mm_unpackhi_epi16(temp1,
                           zero_8x16b);  // b30*q30 0 b31*q31 0 b32*q32 0 b33*q33 0 -- 16 bit long

    src_r0 = _mm_unpacklo_epi16(src_r0_r1, zero_8x16b);  // a00 0 a01 0 a02 0 a03 0 -- 16 bit long
    src_r1 = _mm_unpackhi_epi16(src_r0_r1, zero_8x16b);  // a10 0 a11 0 a12 0 a13 0 -- 16 bit long
    src_r2 = _mm_unpacklo_epi16(src_r2_r3, zero_8x16b);  // a20 0 a21 0 a22 0 a23 0 -- 16 bit long
    src_r3 = _mm_unpackhi_epi16(src_r2_r3, zero_8x16b);  // a30 0 a31 0 a32 0 a33 0 -- 16 bit long

    temp4 = _mm_madd_epi16(src_r0, temp4);  // a00*b00*q00 a01*b01*q01 a02*b02*q02
                                            // a03*b03*q03 -- 32 bits long
    temp5 = _mm_madd_epi16(src_r1, temp5);
    temp6 = _mm_madd_epi16(src_r2, temp6);
    temp7 = _mm_madd_epi16(src_r3, temp7);

    if(u4_qp_div_6 >= 4)
    {
        resq_r0 = _mm_slli_epi32(temp4, u4_qp_div_6 - 4);
        resq_r1 = _mm_slli_epi32(temp5, u4_qp_div_6 - 4);
        resq_r2 = _mm_slli_epi32(temp6, u4_qp_div_6 - 4);
        resq_r3 = _mm_slli_epi32(temp7, u4_qp_div_6 - 4);
    }
    else
    {
        temp4 = _mm_add_epi32(temp4, add_rshift);
        temp5 = _mm_add_epi32(temp5, add_rshift);
        temp6 = _mm_add_epi32(temp6, add_rshift);
        temp7 = _mm_add_epi32(temp7, add_rshift);
        resq_r0 = _mm_srai_epi32(temp4, 4 - u4_qp_div_6);
        resq_r1 = _mm_srai_epi32(temp5, 4 - u4_qp_div_6);
        resq_r2 = _mm_srai_epi32(temp6, 4 - u4_qp_div_6);
        resq_r3 = _mm_srai_epi32(temp7, 4 - u4_qp_div_6);
    }

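    /* When i4_iq_start_idx == 1, the DC coefficient bypasses inverse
     * quantization: pi2_src[0] is inserted into the low 16 bits of lane 0,
     * and the second insert fills the upper 16 bits with the sign, i.e. a
     * manual sign extension to 32 bits. */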
    if(i4_iq_start_idx == 1)
    {
        resq_r0 = _mm_insert_epi16(resq_r0, (WORD32) pi2_src[0], 0);
        if(pi2_src[0] >= 0)
            resq_r0 = _mm_insert_epi16(resq_r0, 0, 1);
        else
            resq_r0 = _mm_insert_epi16(resq_r0, -1, 1);
    }
    /* Perform Inverse transform */
    /*-------------------------------------------------------------*/
    /* IDCT [ Horizontal transformation ]                          */
    /*-------------------------------------------------------------*/
    // Matrix transpose
    /*
     *  a0 a1 a2 a3
     *  b0 b1 b2 b3
     *  c0 c1 c2 c3
     *  d0 d1 d2 d3
     */
    temp1 = _mm_unpacklo_epi32(resq_r0, resq_r1);  // a0 b0 a1 b1
    temp3 = _mm_unpacklo_epi32(resq_r2, resq_r3);  // c0 d0 c1 d1
    temp2 = _mm_unpackhi_epi32(resq_r0, resq_r1);  // a2 b2 a3 b3
    temp4 = _mm_unpackhi_epi32(resq_r2, resq_r3);  // c2 d2 c3 d3
    resq_r0 = _mm_unpacklo_epi64(temp1, temp3);    // a0 b0 c0 d0
    resq_r1 = _mm_unpackhi_epi64(temp1, temp3);    // a1 b1 c1 d1
    resq_r2 = _mm_unpacklo_epi64(temp2, temp4);    // a2 b2 c2 d2
    resq_r3 = _mm_unpackhi_epi64(temp2, temp4);    // a3 b3 c3 d3
    // Transform starts -- horizontal transform
    /*------------------------------------------------------------------*/
    /* z0 = w0 + w2                                             */
    temp0 = _mm_add_epi32(resq_r0, resq_r2);
    /* z1 = w0 - w2                                             */
    temp1 = _mm_sub_epi32(resq_r0, resq_r2);
    /* z2 = (w1 >> 1) - w3                                      */
    temp2 = _mm_srai_epi32(resq_r1, 1);     //(w1>>1)
    temp2 = _mm_sub_epi32(temp2, resq_r3);  //(w1>>1) - w3
    /* z3 = w1 + (w3 >> 1)                                      */
    temp3 = _mm_srai_epi32(resq_r3, 1);  //(w3>>1)
    temp3 = _mm_add_epi32(temp3, resq_r1);  // w1 + (w3>>1)
    /*----------------------------------------------------------*/
    /* x0 = z0 + z3                                             */
    resq_r0 = _mm_add_epi32(temp0, temp3);
    /* x1 = z1 + z2                                             */
    resq_r1 = _mm_add_epi32(temp1, temp2);
    /* x2 = z1 - z2                                             */
    resq_r2 = _mm_sub_epi32(temp1, temp2);
    /* x3 = z0 - z3                                             */
    resq_r3 = _mm_sub_epi32(temp0, temp3);
    // Matrix transpose
    /*
     *  a0 b0 c0 d0
     *  a1 b1 c1 d1
     *  a2 b2 c2 d2
     *  a3 b3 c3 d3
     */
    temp1 = _mm_unpacklo_epi32(resq_r0, resq_r1);  // a0 a1 b0 b1
    temp3 = _mm_unpacklo_epi32(resq_r2, resq_r3);  // a2 a3 b2 b3
    temp2 = _mm_unpackhi_epi32(resq_r0, resq_r1);  // c0 c1 d0 d1
    temp4 = _mm_unpackhi_epi32(resq_r2, resq_r3);  // c2 c3 d2 d3
    resq_r0 = _mm_unpacklo_epi64(temp1, temp3);    // a0 a1 a2 a3
    resq_r1 = _mm_unpackhi_epi64(temp1, temp3);    // b0 b1 b2 b3
    resq_r2 = _mm_unpacklo_epi64(temp2, temp4);    // c0 c1 c2 c3
    resq_r3 = _mm_unpackhi_epi64(temp2, temp4);    // d0 d1 d2 d3
    // Transform ends -- horizontal transform

    zero_8x16b = _mm_setzero_si128();  // all bits reset to zero
    // Load pred buffer
    predload_r = _mm_loadl_epi64((__m128i *) (&pu1_pred[0]));  // p00 p01 p02 p03 0 0 0 0 0
                                                               // 0 0 0 -- all 8 bits
    pred_r0 = _mm_unpacklo_epi8(predload_r, zero_8x16b);  // p00 p01 p02 p03 0 0 0 0 -- all 16 bits

    predload_r =
        _mm_loadl_epi64((__m128i *) (&pu1_pred[i4_pred_stride]));  // p10 p11 p12 p13 0 0 0 0 0 0
                                                                   // 0 0 -- all 8 bits
    pred_r1 = _mm_unpacklo_epi8(predload_r, zero_8x16b);  // p10 p11 p12 p13 0 0 0 0 -- all 16 bits

    predload_r =
        _mm_loadl_epi64((__m128i *) (&pu1_pred[2 * i4_pred_stride]));  // p20 p21 p22 p23 0 0 0 0
                                                                       // 0 0 0 0 -- all 8 bits
    pred_r2 = _mm_unpacklo_epi8(predload_r, zero_8x16b);  // p20 p21 p22 p23 0 0 0 0 -- all 16 bits

    predload_r =
        _mm_loadl_epi64((__m128i *) (&pu1_pred[3 * i4_pred_stride]));  // p30 p31 p32 p33 0 0 0 0
                                                                       // 0 0 0 0 -- all 8 bits
    pred_r3 = _mm_unpacklo_epi8(predload_r, zero_8x16b);  // p30 p31 p32 p33 0 0 0 0 -- all 16 bits
    pred_r0 = _mm_unpacklo_epi16(pred_r0, zero_8x16b);  // p00 p01 p02 p03 -- 32 bits zero extended
    pred_r1 = _mm_unpacklo_epi16(pred_r1, zero_8x16b);  // p10 p11 p12 p13 -- 32 bits zero extended
    pred_r2 = _mm_unpacklo_epi16(pred_r2, zero_8x16b);  // p20 p21 p22 p23 -- 32 bits zero extended
    pred_r3 = _mm_unpacklo_epi16(pred_r3, zero_8x16b);  // p30 p31 p32 p33 -- 32 bits zero extended

    /*--------------------------------------------------------------*/
    /* IDCT [ Vertical transformation] and Xij = (xij + 32)>>6      */
    /*                                                              */
    /* Add the prediction and store it back to same buffer          */
    /*--------------------------------------------------------------*/
    /* z0j = y0j + y2j                                              */
    temp0 = _mm_add_epi32(resq_r0, resq_r2);
    /* z1j = y0j - y2j                                              */
    temp1 = _mm_sub_epi32(resq_r0, resq_r2);
    /* z2j = (y1j >> 1) - y3j                                       */
    temp2 = _mm_srai_epi32(resq_r1, 1);  //(y1j>>1)
    temp2 = _mm_sub_epi32(temp2, resq_r3);
    /* z3j = y1j + (y3j >> 1)                                       */
    temp3 = _mm_srai_epi32(resq_r3, 1);  //(y3j>>1)
    temp3 = _mm_add_epi32(temp3, resq_r1);

    /* x0j = z0j + z3j                                              */
    temp4 = _mm_add_epi32(temp0, temp3);
    temp4 = _mm_add_epi32(temp4, value_32);
    temp4 = _mm_srai_epi32(temp4, 6);
    temp4 = _mm_add_epi32(temp4, pred_r0);
    /* x1j = z1j + z2j                                              */
    temp5 = _mm_add_epi32(temp1, temp2);
    temp5 = _mm_add_epi32(temp5, value_32);
    temp5 = _mm_srai_epi32(temp5, 6);
    temp5 = _mm_add_epi32(temp5, pred_r1);
    /* x2j = z1j - z2j                                              */
    temp6 = _mm_sub_epi32(temp1, temp2);
    temp6 = _mm_add_epi32(temp6, value_32);
    temp6 = _mm_srai_epi32(temp6, 6);
    temp6 = _mm_add_epi32(temp6, pred_r2);
    /* x3j = z0j - z3j                                              */
    temp7 = _mm_sub_epi32(temp0, temp3);
    temp7 = _mm_add_epi32(temp7, value_32);
    temp7 = _mm_srai_epi32(temp7, 6);
    temp7 = _mm_add_epi32(temp7, pred_r3);

    // 32-bit to 16-bit conversion
    temp0 = _mm_packs_epi32(temp4, temp5);
    temp1 = _mm_packs_epi32(temp6, temp7);
    /*------------------------------------------------------------------*/
    // Clipping the results to 8 bits
    sign_reg = _mm_cmpgt_epi16(temp0, zero_8x16b);  // sign check
    temp0 = _mm_and_si128(temp0, sign_reg);
    sign_reg = _mm_cmpgt_epi16(temp1, zero_8x16b);
    temp1 = _mm_and_si128(temp1, sign_reg);
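    /* The cmpgt/and pairs zero out negative sums; _mm_packus_epi16 below then
     * saturates the high end to 255, completing the clip to [0, 255]. */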

    resq_r0 = _mm_packus_epi16(temp0, temp1);
    resq_r1 = _mm_srli_si128(resq_r0, 4);
    resq_r2 = _mm_srli_si128(resq_r1, 4);
    resq_r3 = _mm_srli_si128(resq_r2, 4);

    *pu4_out = _mm_cvtsi128_si32(resq_r0);
    pu1_out += i4_out_stride;
    pu4_out = (UWORD32 *) (pu1_out);
    *(pu4_out) = _mm_cvtsi128_si32(resq_r1);
    pu1_out += i4_out_stride;
    pu4_out = (UWORD32 *) (pu1_out);
    *(pu4_out) = _mm_cvtsi128_si32(resq_r2);
    pu1_out += i4_out_stride;
    pu4_out = (UWORD32 *) (pu1_out);
    *(pu4_out) = _mm_cvtsi128_si32(resq_r3);
}
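
/* For reference, a minimal scalar sketch of the same iquant + itrans + recon
 * path (ref_clip_u8 and ref_iquant_itrans_recon_4x4 are hypothetical helpers,
 * not part of this library's API; the DC special case, residue accumulation
 * and the 16-bit intermediate behaviour of the SIMD version are ignored).
 * Kept under #if 0 so it is not compiled.
 */
#if 0
static UWORD8 ref_clip_u8(WORD32 i4_x)
{
    return (UWORD8) ((i4_x < 0) ? 0 : ((i4_x > 255) ? 255 : i4_x));
}

static void ref_iquant_itrans_recon_4x4(const WORD16 *pi2_src, const UWORD8 *pu1_pred,
                                        UWORD8 *pu1_out, const UWORD16 *pu2_iscal_mat,
                                        const UWORD16 *pu2_weigh_mat, UWORD32 u4_qp_div_6,
                                        WORD32 i4_pred_stride, WORD32 i4_out_stride)
{
    WORD32 i, j;
    WORD32 ai4_tmp[16];

    /* Inverse quantization: coeff * iscal * weigh, scaled by 2^(qp/6 - 4) */
    for(i = 0; i < 16; i++)
    {
        WORD32 i4_val = pi2_src[i] * pu2_iscal_mat[i] * pu2_weigh_mat[i];

        ai4_tmp[i] = (u4_qp_div_6 >= 4)
                         ? (i4_val << (u4_qp_div_6 - 4))
                         : ((i4_val + (1 << (3 - u4_qp_div_6))) >> (4 - u4_qp_div_6));
    }

    /* Horizontal pass of the 4x4 inverse transform */
    for(i = 0; i < 4; i++)
    {
        WORD32 *pi4_row = ai4_tmp + 4 * i;
        WORD32 z0 = pi4_row[0] + pi4_row[2];
        WORD32 z1 = pi4_row[0] - pi4_row[2];
        WORD32 z2 = (pi4_row[1] >> 1) - pi4_row[3];
        WORD32 z3 = pi4_row[1] + (pi4_row[3] >> 1);

        pi4_row[0] = z0 + z3;
        pi4_row[1] = z1 + z2;
        pi4_row[2] = z1 - z2;
        pi4_row[3] = z0 - z3;
    }

    /* Vertical pass, then round with (x + 32) >> 6, add prediction and clip */
    for(j = 0; j < 4; j++)
    {
        WORD32 z0 = ai4_tmp[0 + j] + ai4_tmp[8 + j];
        WORD32 z1 = ai4_tmp[0 + j] - ai4_tmp[8 + j];
        WORD32 z2 = (ai4_tmp[4 + j] >> 1) - ai4_tmp[12 + j];
        WORD32 z3 = ai4_tmp[4 + j] + (ai4_tmp[12 + j] >> 1);

        pu1_out[0 * i4_out_stride + j] =
            ref_clip_u8(((z0 + z3 + 32) >> 6) + pu1_pred[0 * i4_pred_stride + j]);
        pu1_out[1 * i4_out_stride + j] =
            ref_clip_u8(((z1 + z2 + 32) >> 6) + pu1_pred[1 * i4_pred_stride + j]);
        pu1_out[2 * i4_out_stride + j] =
            ref_clip_u8(((z1 - z2 + 32) >> 6) + pu1_pred[2 * i4_pred_stride + j]);
        pu1_out[3 * i4_out_stride + j] =
            ref_clip_u8(((z0 - z3 + 32) >> 6) + pu1_pred[3 * i4_pred_stride + j]);
    }
}
#endif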

/**
 *******************************************************************************
 *
 * @brief
 *  This function performs inverse quantization and inverse transform (Ci8)
 *  on an 8x8 block
 *
 * @par Description:
 *  Performs the inverse transform Ci8 and adds the prediction to get the
 *  reconstructed block
 *
 * @param[in] ps_src
 *  Buffer container for the input 8x8 coefficients
 *
 * @param[in] ps_pred
 *  Buffer container for the 8x8 prediction block
 *
 * @param[in] ps_res_pred
 *  Buffer container for the residual prediction block (unused here)
 *
 * @param[in] ps_res
 *  Buffer container for the output residue (unused here)
 *
 * @param[out] ps_rec
 *  Buffer container for the reconstructed 8x8 block
 *
 * @param[in] ps_iq_it_res_rec_constants
 *  Inverse quantization constants - inverse scale matrix (pu2_iscal_mat),
 *  weighing matrix (pu2_weigh_mat) and Floor(qp / 6) (u4_qp_div_6)
 *
 * @param[in] pi2_tmp
 *  Temporary buffer of size 1 * 64 (unused here)
 *
 * @param[in] pi2_dc_src
 *  Pointer to the DC coefficient source (unused here)
 *
 * @param[in] i4_iq_start_idx
 *  Index of the first coefficient to be inverse quantized (unused here)
 *
 * @param[in] u1_res_accumulate
 *  Flag to enable residue accumulation (unused here)
 *
 * @returns  Void
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */

void isvc_iquant_itrans_recon_8x8_ssse3(buffer_container_t *ps_src, buffer_container_t *ps_pred,
                                        buffer_container_t *ps_res_pred, buffer_container_t *ps_res,
                                        buffer_container_t *ps_rec,
                                        iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants,
                                        WORD16 *pi2_tmp, WORD16 *pi2_dc_src, WORD32 i4_iq_start_idx,
                                        UWORD8 u1_res_accumulate)
{
    WORD16 *pi2_src = ps_src->pv_data;
    WORD16 *pi2_res = ps_res->pv_data;
    WORD16 *pi2_res_pred = ps_res_pred->pv_data;
    UWORD8 *pu1_pred = ps_pred->pv_data;
    UWORD8 *pu1_out = ps_rec->pv_data;
    WORD32 i4_src_stride = ps_src->i4_data_stride;
    WORD32 i4_res_stride = ps_res->i4_data_stride;
    WORD32 i4_res_pred_stride = ps_res_pred->i4_data_stride;
    WORD32 i4_pred_stride = ps_pred->i4_data_stride;
    WORD32 i4_out_stride = ps_rec->i4_data_stride;
    const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat;
    const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat;
    UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6;
    __m128i src_r0;
    __m128i scalemat_r0;
    __m128i zero_8x16b = _mm_setzero_si128();  // all bits reset to zero
    // __m128i one_8x16b = _mm_set1_epi8(255); // all bits set to 1
    // __m128i one_zero_mask = _mm_unpacklo_epi16(one_8x16b, zero_8x16b); // 1 0 1
    // 0 1 0 1 0 --- 16 bits size
    __m128i value_32 = _mm_set1_epi32(32);
    __m128i add_rshift = _mm_set1_epi32((u4_qp_div_6 < 6) ? (1 << (5 - u4_qp_div_6)) : 0);
    __m128i dequant_r0;
    __m128i predload_r;
    __m128i pred_r0_1, pred_r1_1, pred_r2_1, pred_r3_1, pred_r4_1, pred_r5_1, pred_r6_1, pred_r7_1;
    __m128i sign_reg;
    __m128i src_r0_1, src_r0_2;
    __m128i scalemat_r0_1, scalemat_r0_2;
    __m128i temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
    __m128i temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17, temp18, temp19, temp20;
    // To store dequantization results
    __m128i resq_r0_1, resq_r0_2, resq_r1_1, resq_r1_2, resq_r2_1, resq_r2_2, resq_r3_1, resq_r3_2,
        resq_r4_1, resq_r4_2, resq_r5_1, resq_r5_2, resq_r6_1, resq_r6_2, resq_r7_1, resq_r7_2;

    UNUSED(pi2_tmp);
    UNUSED(i4_iq_start_idx);
    UNUSED(pi2_dc_src);
    UNUSED(u1_res_accumulate);
    UNUSED(i4_src_stride);
    UNUSED(i4_res_stride);
    UNUSED(i4_res_pred_stride);
    UNUSED(pi2_res);
    UNUSED(pi2_res_pred);

    /* TODO: Implement residue accumulation */
    ASSERT(0);

    /*************************************************************/
    /* Dequantization of coefficients                            */
    /*************************************************************/

    // Row 0 processing
    src_r0 = _mm_loadu_si128((__m128i *) (pi2_src));  // a00 a01 a02 a03 a04 a05 a06 a07 -- the
                                                      // source matrix 0th row
    scalemat_r0 = _mm_loadu_si128((__m128i *) (pu2_iscal_mat));  // b00 b01 b02 b03 b04 b05 b06 b07
                                                                 // -- the scaling matrix 0th row
    dequant_r0 = _mm_loadu_si128((__m128i *) (&pu2_weigh_mat[0]));  // q0 q1 q2 q3 q4 q5 q6
                                                                    // q7 -- all 16 bits
    src_r0_1 = _mm_unpacklo_epi16(src_r0, zero_8x16b);  // a00 0 a01 0 a02 0 a03 0 -- 16 bit long
    src_r0_2 = _mm_unpackhi_epi16(src_r0, zero_8x16b);  // a04 0 a05 0 a06 0 a07 0 -- 16 bit long
    temp10 = _mm_mullo_epi16(scalemat_r0,
                             dequant_r0);  // b00*q0 b01*q1 b02*q2 b03*q3 b04*q4
                                           // b05*q5 b06*q6 b07*q7 -- 16 bit result
    scalemat_r0_1 =
        _mm_unpacklo_epi16(temp10,
                           zero_8x16b);  // b00*q0 0 b01*q1 0 b02*q2 0 b03*q3 0 -- 16 bit long
    scalemat_r0_2 =
        _mm_unpackhi_epi16(temp10,
                           zero_8x16b);  // b04*q4 0 b05*q5 0 b06*q6 0 b07*q7 0 -- 16 bit long

    temp5 = _mm_madd_epi16(src_r0_1,
                           scalemat_r0_1);  // a00*b00*q0 a01*b01*q1 a02*b02*q2
                                            // a03*b03*q3 -- 32 bits long
    temp7 = _mm_madd_epi16(src_r0_2,
                           scalemat_r0_2);  // a04*b04*q4 a05*b05*q5 a06*b06*q6
                                            // a07*b07*q7 -- 32 bits long

    if(u4_qp_div_6 >= 6)
    {
        resq_r0_1 = _mm_slli_epi32(temp5, u4_qp_div_6 - 6);
        resq_r0_2 = _mm_slli_epi32(temp7, u4_qp_div_6 - 6);
    }
    else
    {
        temp5 = _mm_add_epi32(temp5, add_rshift);
        temp7 = _mm_add_epi32(temp7, add_rshift);
        resq_r0_1 = _mm_srai_epi32(temp5, 6 - u4_qp_div_6);
        resq_r0_2 = _mm_srai_epi32(temp7, 6 - u4_qp_div_6);
    }
    resq_r0_1 =
        _mm_packs_epi32(resq_r0_1,
                        resq_r0_2);  // a00*b00*q0 a01*b01*q1 a02*b02*q2 a03*b03*q3 a04*b04*q4
                                     // a05*b05*q5 a06*b06*q6 a07*b07*q7 -- 16 bit long
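    /* Rows 1-7 below repeat the same per-row dequant; relative to the 4x4
     * path the scale here is 2^(qp/6 - 6), with a rounded right shift when
     * qp/6 < 6 (as set up in add_rshift above). */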
    // Row 1 processing
    src_r0 = _mm_loadu_si128((__m128i *) (pi2_src + 8));  // a00 a01 a02 a03 a04 a05 a06 a07 --
                                                          // the source matrix 1st row
    scalemat_r0 =
        _mm_loadu_si128((__m128i *) (pu2_iscal_mat + 8));  // b00 b01 b02 b03 b04 b05 b06 b07
                                                           // -- the scaling matrix 1st row
    dequant_r0 = _mm_loadu_si128((__m128i *) (&pu2_weigh_mat[8]));  // q0 q1 q2 q3 q4 q5 q6
                                                                    // q7 -- all 16 bits
    src_r0_1 = _mm_unpacklo_epi16(src_r0, zero_8x16b);  // a00 0 a01 0 a02 0 a03 0 -- 16 bit long
    src_r0_2 = _mm_unpackhi_epi16(src_r0, zero_8x16b);  // a04 0 a05 0 a06 0 a07 0 -- 16 bit long
    temp10 = _mm_mullo_epi16(scalemat_r0,
                             dequant_r0);  // b00*q0 b01*q1 b02*q2 b03*q3 b04*q4
                                           // b05*q5 b06*q6 b07*q7 -- 16 bit result
    scalemat_r0_1 =
        _mm_unpacklo_epi16(temp10,
                           zero_8x16b);  // b00*q0 0 b01*q1 0 b02*q2 0 b03*q3 0 -- 16 bit long
    scalemat_r0_2 =
        _mm_unpackhi_epi16(temp10,
                           zero_8x16b);  // b04*q4 0 b05*q5 0 b06*q6 0 b07*q7 0 -- 16 bit long
    temp5 = _mm_madd_epi16(src_r0_1,
                           scalemat_r0_1);  // a00*b00*q0 a01*b01*q1 a02*b02*q2
                                            // a03*b03*q3 -- 32 bits long
    temp7 = _mm_madd_epi16(src_r0_2,
                           scalemat_r0_2);  // a04*b04*q4 a05*b05*q5 a06*b06*q6
                                            // a07*b07*q7 -- 32 bits long
    if(u4_qp_div_6 >= 6)
    {
        resq_r1_1 = _mm_slli_epi32(temp5, u4_qp_div_6 - 6);
        resq_r1_2 = _mm_slli_epi32(temp7, u4_qp_div_6 - 6);
    }
    else
    {
        temp5 = _mm_add_epi32(temp5, add_rshift);
        temp7 = _mm_add_epi32(temp7, add_rshift);
        resq_r1_1 = _mm_srai_epi32(temp5, 6 - u4_qp_div_6);
        resq_r1_2 = _mm_srai_epi32(temp7, 6 - u4_qp_div_6);
    }
    resq_r1_1 =
        _mm_packs_epi32(resq_r1_1,
                        resq_r1_2);  // a00*b00*q0 a01*b01*q1 a02*b02*q2 a03*b03*q3 a04*b04*q4
                                     // a05*b05*q5 a06*b06*q6 a07*b07*q7 -- 16 bit long
    // Row 2 processing
    src_r0 = _mm_loadu_si128((__m128i *) (pi2_src + 16));  // a00 a01 a02 a03 a04 a05 a06 a07 --
                                                           // the source matrix 2nd row
    scalemat_r0 =
        _mm_loadu_si128((__m128i *) (pu2_iscal_mat + 16));  // b00 b01 b02 b03 b04 b05 b06 b07
                                                            // -- the scaling matrix 2nd row
    dequant_r0 = _mm_loadu_si128((__m128i *) (&pu2_weigh_mat[16]));  // q0 q1 q2 q3 q4 q5
                                                                     // q6 q7 -- all 16 bits
    src_r0_1 = _mm_unpacklo_epi16(src_r0, zero_8x16b);  // a00 0 a01 0 a02 0 a03 0 -- 16 bit long
    src_r0_2 = _mm_unpackhi_epi16(src_r0, zero_8x16b);  // a04 0 a05 0 a06 0 a07 0 -- 16 bit long
    temp10 = _mm_mullo_epi16(scalemat_r0,
                             dequant_r0);  // b00*q0 b01*q1 b02*q2 b03*q3 b04*q4
                                           // b05*q5 b06*q6 b07*q7 -- 16 bit result
    scalemat_r0_1 =
        _mm_unpacklo_epi16(temp10,
                           zero_8x16b);  // b00*q0 0 b01*q1 0 b02*q2 0 b03*q3 0 -- 16 bit long
    scalemat_r0_2 =
        _mm_unpackhi_epi16(temp10,
                           zero_8x16b);  // b04*q4 0 b05*q5 0 b06*q6 0 b07*q7 0 -- 16 bit long
    temp5 = _mm_madd_epi16(src_r0_1,
                           scalemat_r0_1);  // a00*b00*q0 a01*b01*q1 a02*b02*q2
                                            // a03*b03*q3 -- 32 bits long
    temp7 = _mm_madd_epi16(src_r0_2,
                           scalemat_r0_2);  // a04*b04*q4 a05*b05*q5 a06*b06*q6
                                            // a07*b07*q7 -- 32 bits long
    if(u4_qp_div_6 >= 6)
    {
        resq_r2_1 = _mm_slli_epi32(temp5, u4_qp_div_6 - 6);
        resq_r2_2 = _mm_slli_epi32(temp7, u4_qp_div_6 - 6);
    }
    else
    {
        temp5 = _mm_add_epi32(temp5, add_rshift);
        temp7 = _mm_add_epi32(temp7, add_rshift);
        resq_r2_1 = _mm_srai_epi32(temp5, 6 - u4_qp_div_6);
        resq_r2_2 = _mm_srai_epi32(temp7, 6 - u4_qp_div_6);
    }
    resq_r2_1 =
        _mm_packs_epi32(resq_r2_1,
                        resq_r2_2);  // a00*b00*q0 a01*b01*q1 a02*b02*q2 a03*b03*q3 a04*b04*q4
                                     // a05*b05*q5 a06*b06*q6 a07*b07*q7 -- 16 bit long
    // Row 3 processing
    src_r0 = _mm_loadu_si128((__m128i *) (pi2_src + 24));  // a00 a01 a02 a03 a04 a05 a06 a07 --
                                                           // the source matrix 3rd row
    scalemat_r0 =
        _mm_loadu_si128((__m128i *) (pu2_iscal_mat + 24));  // b00 b01 b02 b03 b04 b05 b06 b07
                                                            // -- the scaling matrix 3rd row
    dequant_r0 = _mm_loadu_si128((__m128i *) (&pu2_weigh_mat[24]));  // q0 q1 q2 q3 q4 q5
                                                                     // q6 q7 -- all 16 bits
    src_r0_1 = _mm_unpacklo_epi16(src_r0, zero_8x16b);  // a00 0 a01 0 a02 0 a03 0 -- 16 bit long
    src_r0_2 = _mm_unpackhi_epi16(src_r0, zero_8x16b);  // a04 0 a05 0 a06 0 a07 0 -- 16 bit long
    temp10 = _mm_mullo_epi16(scalemat_r0,
                             dequant_r0);  // b00*q0 b01*q1 b02*q2 b03*q3 b04*q4
                                           // b05*q5 b06*q6 b07*q7 -- 16 bit result
    scalemat_r0_1 =
        _mm_unpacklo_epi16(temp10,
                           zero_8x16b);  // b00*q0 0 b01*q1 0 b02*q2 0 b03*q3 0 -- 16 bit long
    scalemat_r0_2 =
        _mm_unpackhi_epi16(temp10,
                           zero_8x16b);  // b04*q4 0 b05*q5 0 b06*q6 0 b07*q7 0 -- 16 bit long
    temp5 = _mm_madd_epi16(src_r0_1,
                           scalemat_r0_1);  // a00*b00*q0 a01*b01*q1 a02*b02*q2
                                            // a03*b03*q3 -- 32 bits long
    temp7 = _mm_madd_epi16(src_r0_2,
                           scalemat_r0_2);  // a04*b04*q4 a05*b05*q5 a06*b06*q6
                                            // a07*b07*q7 -- 32 bits long
    if(u4_qp_div_6 >= 6)
    {
        resq_r3_1 = _mm_slli_epi32(temp5, u4_qp_div_6 - 6);
        resq_r3_2 = _mm_slli_epi32(temp7, u4_qp_div_6 - 6);
    }
    else
    {
        temp5 = _mm_add_epi32(temp5, add_rshift);
        temp7 = _mm_add_epi32(temp7, add_rshift);
        resq_r3_1 = _mm_srai_epi32(temp5, 6 - u4_qp_div_6);
        resq_r3_2 = _mm_srai_epi32(temp7, 6 - u4_qp_div_6);
    }
    resq_r3_1 =
        _mm_packs_epi32(resq_r3_1,
                        resq_r3_2);  // a00*b00*q0 a01*b01*q1 a02*b02*q2 a03*b03*q3 a04*b04*q4
                                     // a05*b05*q5 a06*b06*q6 a07*b07*q7 -- 16 bit long
    // Row 4 processing
    src_r0 = _mm_loadu_si128((__m128i *) (pi2_src + 32));  // a00 a01 a02 a03 a04 a05 a06 a07 --
                                                           // the source matrix 4th row
    scalemat_r0 =
        _mm_loadu_si128((__m128i *) (pu2_iscal_mat + 32));  // b00 b01 b02 b03 b04 b05 b06 b07
                                                            // -- the scaling matrix 4th row
    dequant_r0 = _mm_loadu_si128((__m128i *) (&pu2_weigh_mat[32]));  // q0 q1 q2 q3 q4 q5
                                                                     // q6 q7 -- all 16 bits
    src_r0_1 = _mm_unpacklo_epi16(src_r0, zero_8x16b);  // a00 0 a01 0 a02 0 a03 0 -- 16 bit long
    src_r0_2 = _mm_unpackhi_epi16(src_r0, zero_8x16b);  // a04 0 a05 0 a06 0 a07 0 -- 16 bit long
    temp10 = _mm_mullo_epi16(scalemat_r0,
                             dequant_r0);  // b00*q0 b01*q1 b02*q2 b03*q3 b04*q4
                                           // b05*q5 b06*q6 b07*q7 -- 16 bit result
    scalemat_r0_1 =
        _mm_unpacklo_epi16(temp10,
                           zero_8x16b);  // b00*q0 0 b01*q1 0 b02*q2 0 b03*q3 0 -- 16 bit long
    scalemat_r0_2 =
        _mm_unpackhi_epi16(temp10,
                           zero_8x16b);  // b04*q4 0 b05*q5 0 b06*q6 0 b07*q7 0 -- 16 bit long
    temp5 = _mm_madd_epi16(src_r0_1,
                           scalemat_r0_1);  // a00*b00*q0 a01*b01*q1 a02*b02*q2
                                            // a03*b03*q3 -- 32 bits long
    temp7 = _mm_madd_epi16(src_r0_2,
                           scalemat_r0_2);  // a04*b04*q4 a05*b05*q5 a06*b06*q6
                                            // a07*b07*q7 -- 32 bits long
    if(u4_qp_div_6 >= 6)
    {
        resq_r4_1 = _mm_slli_epi32(temp5, u4_qp_div_6 - 6);
        resq_r4_2 = _mm_slli_epi32(temp7, u4_qp_div_6 - 6);
    }
    else
    {
        temp5 = _mm_add_epi32(temp5, add_rshift);
        temp7 = _mm_add_epi32(temp7, add_rshift);
        resq_r4_1 = _mm_srai_epi32(temp5, 6 - u4_qp_div_6);
        resq_r4_2 = _mm_srai_epi32(temp7, 6 - u4_qp_div_6);
    }
    resq_r4_1 =
        _mm_packs_epi32(resq_r4_1,
                        resq_r4_2);  // a00*b00*q0 a01*b01*q1 a02*b02*q2 a03*b03*q3 a04*b04*q4
                                     // a05*b05*q5 a06*b06*q6 a07*b07*q7 -- 16 bit long
    // Row 5 processing
    src_r0 = _mm_loadu_si128((__m128i *) (pi2_src + 40));  // a00 a01 a02 a03 a04 a05 a06 a07 --
                                                           // the source matrix 5th row
    scalemat_r0 =
        _mm_loadu_si128((__m128i *) (pu2_iscal_mat + 40));  // b00 b01 b02 b03 b04 b05 b06 b07
                                                            // -- the scaling matrix 5th row
    dequant_r0 = _mm_loadu_si128((__m128i *) (&pu2_weigh_mat[40]));  // q0 q1 q2 q3 q4 q5
                                                                     // q6 q7 -- all 16 bits
    src_r0_1 = _mm_unpacklo_epi16(src_r0, zero_8x16b);  // a00 0 a01 0 a02 0 a03 0 -- 16 bit long
    src_r0_2 = _mm_unpackhi_epi16(src_r0, zero_8x16b);  // a04 0 a05 0 a06 0 a07 0 -- 16 bit long
    temp10 = _mm_mullo_epi16(scalemat_r0,
                             dequant_r0);  // b00*q0 b01*q1 b02*q2 b03*q3 b04*q4
                                           // b05*q5 b06*q6 b07*q7 -- 16 bit result
    scalemat_r0_1 =
        _mm_unpacklo_epi16(temp10,
                           zero_8x16b);  // b00*q0 0 b01*q1 0 b02*q2 0 b03*q3 0 -- 16 bit long
    scalemat_r0_2 =
        _mm_unpackhi_epi16(temp10,
                           zero_8x16b);  // b04*q4 0 b05*q5 0 b06*q6 0 b07*q7 0 -- 16 bit long
    temp5 = _mm_madd_epi16(src_r0_1,
                           scalemat_r0_1);  // a00*b00*q0 a01*b01*q1 a02*b02*q2
                                            // a03*b03*q3 -- 32 bits long
    temp7 = _mm_madd_epi16(src_r0_2,
                           scalemat_r0_2);  // a04*b04*q4 a05*b05*q5 a06*b06*q6
                                            // a07*b07*q7 -- 32 bits long
    if(u4_qp_div_6 >= 6)
    {
        resq_r5_1 = _mm_slli_epi32(temp5, u4_qp_div_6 - 6);
        resq_r5_2 = _mm_slli_epi32(temp7, u4_qp_div_6 - 6);
        // resq_r5_1 = _mm_and_si128(resq_r5_1,one_zero_mask);
        // resq_r5_2 = _mm_and_si128(resq_r5_2,one_zero_mask);
    }
    else
    {
        temp5 = _mm_add_epi32(temp5, add_rshift);
        temp7 = _mm_add_epi32(temp7, add_rshift);
        resq_r5_1 = _mm_srai_epi32(temp5, 6 - u4_qp_div_6);
        resq_r5_2 = _mm_srai_epi32(temp7, 6 - u4_qp_div_6);
    }
    resq_r5_1 =
        _mm_packs_epi32(resq_r5_1,
                        resq_r5_2);  // a00*b00*q0 a01*b01*q1 a02*b02*q2 a03*b03*q3 a04*b04*q4
                                     // a05*b05*q5 a06*b06*q6 a07*b07*q7 -- 16 bit long
    // Row 6 processing
    src_r0 = _mm_loadu_si128((__m128i *) (pi2_src + 48));  // a00 a01 a02 a03 a04 a05 a06 a07 --
                                                           // the source matrix 6th row
    scalemat_r0 =
        _mm_loadu_si128((__m128i *) (pu2_iscal_mat + 48));  // b00 b01 b02 b03 b04 b05 b06 b07
                                                            // -- the scaling matrix 6th row
    dequant_r0 = _mm_loadu_si128((__m128i *) (&pu2_weigh_mat[48]));  // q0 q1 q2 q3 q4 q5
                                                                     // q6 q7 -- all 16 bits
    src_r0_1 = _mm_unpacklo_epi16(src_r0, zero_8x16b);  // a00 0 a01 0 a02 0 a03 0 -- 16 bit long
    src_r0_2 = _mm_unpackhi_epi16(src_r0, zero_8x16b);  // a04 0 a05 0 a06 0 a07 0 -- 16 bit long
    temp10 = _mm_mullo_epi16(scalemat_r0,
                             dequant_r0);  // b00*q0 b01*q1 b02*q2 b03*q3 b04*q4
                                           // b05*q5 b06*q6 b07*q7 -- 16 bit result
    scalemat_r0_1 =
        _mm_unpacklo_epi16(temp10,
                           zero_8x16b);  // b00*q0 0 b01*q1 0 b02*q2 0 b03*q3 0 -- 16 bit long
    scalemat_r0_2 =
        _mm_unpackhi_epi16(temp10,
                           zero_8x16b);  // b04*q4 0 b05*q5 0 b06*q6 0 b07*q7 0 -- 16 bit long
    temp5 = _mm_madd_epi16(src_r0_1,
                           scalemat_r0_1);  // a00*b00*q0 a01*b01*q1 a02*b02*q2
                                            // a03*b03*q3 -- 32 bits long
    temp7 = _mm_madd_epi16(src_r0_2,
                           scalemat_r0_2);  // a04*b04*q4 a05*b05*q5 a06*b06*q6
                                            // a07*b07*q7 -- 32 bits long
    if(u4_qp_div_6 >= 6)
    {
        resq_r6_1 = _mm_slli_epi32(temp5, u4_qp_div_6 - 6);
        resq_r6_2 = _mm_slli_epi32(temp7, u4_qp_div_6 - 6);
        // resq_r6_1 = _mm_and_si128(resq_r6_1,one_zero_mask);
        // resq_r6_2 = _mm_and_si128(resq_r6_2,one_zero_mask);
    }
    else
    {
        temp5 = _mm_add_epi32(temp5, add_rshift);
        temp7 = _mm_add_epi32(temp7, add_rshift);
        resq_r6_1 = _mm_srai_epi32(temp5, 6 - u4_qp_div_6);
        resq_r6_2 = _mm_srai_epi32(temp7, 6 - u4_qp_div_6);
        // resq_r6_1 = _mm_and_si128(resq_r6_1,one_zero_mask);
        // resq_r6_2 = _mm_and_si128(resq_r6_2,one_zero_mask);
    }
    resq_r6_1 =
        _mm_packs_epi32(resq_r6_1,
                        resq_r6_2);  // a00*b00*q0 a01*b01*q1 a02*b02*q2 a03*b03*q3 a04*b04*q4
                                     // a05*b05*q5 a06*b06*q6 a07*b07*q7 -- 16 bit long
    // Row 7 processing
    src_r0 = _mm_loadu_si128((__m128i *) (pi2_src + 56));  // a00 a01 a02 a03 a04 a05 a06 a07 --
                                                           // the source matrix 7th row
    scalemat_r0 =
        _mm_loadu_si128((__m128i *) (pu2_iscal_mat + 56));  // b00 b01 b02 b03 b04 b05 b06 b07
                                                            // -- the scaling matrix 7th row
    dequant_r0 = _mm_loadu_si128((__m128i *) (&pu2_weigh_mat[56]));  // q0 q1 q2 q3 q4 q5
                                                                     // q6 q7 -- all 16 bits
    src_r0_1 = _mm_unpacklo_epi16(src_r0, zero_8x16b);  // a00 0 a01 0 a02 0 a03 0 -- 16 bit long
    src_r0_2 = _mm_unpackhi_epi16(src_r0, zero_8x16b);  // a04 0 a05 0 a06 0 a07 0 -- 16 bit long
    temp10 = _mm_mullo_epi16(scalemat_r0,
                             dequant_r0);  // b00*q0 b01*q1 b02*q2 b03*q3 b04*q4
                                           // b05*q5 b06*q6 b07*q7 -- 16 bit result
    scalemat_r0_1 =
        _mm_unpacklo_epi16(temp10,
                           zero_8x16b);  // b00*q0 0 b01*q1 0 b02*q2 0 b03*q3 0 -- 16 bit long
    scalemat_r0_2 =
        _mm_unpackhi_epi16(temp10,
                           zero_8x16b);  // b04*q4 0 b05*q5 0 b06*q6 0 b07*q7 0 -- 16 bit long
    temp5 = _mm_madd_epi16(src_r0_1,
                           scalemat_r0_1);  // a00*b00*q0 a01*b01*q1 a02*b02*q2
                                            // a03*b03*q3 -- 32 bits long
    temp7 = _mm_madd_epi16(src_r0_2,
                           scalemat_r0_2);  // a04*b04*q4 a05*b05*q5 a06*b06*q6
                                            // a07*b07*q7 -- 32 bits long
    if(u4_qp_div_6 >= 6)
    {
        resq_r7_1 = _mm_slli_epi32(temp5, u4_qp_div_6 - 6);
        resq_r7_2 = _mm_slli_epi32(temp7, u4_qp_div_6 - 6);
    }
    else
    {
        temp5 = _mm_add_epi32(temp5, add_rshift);
        temp7 = _mm_add_epi32(temp7, add_rshift);
        resq_r7_1 = _mm_srai_epi32(temp5, 6 - u4_qp_div_6);
        resq_r7_2 = _mm_srai_epi32(temp7, 6 - u4_qp_div_6);
    }
    resq_r7_1 =
        _mm_packs_epi32(resq_r7_1,
                        resq_r7_2);  // a00*b00*q0 a01*b01*q1 a02*b02*q2 a03*b03*q3 a04*b04*q4
                                     // a05*b05*q5 a06*b06*q6 a07*b07*q7 -- 16 bit long
    /* Perform Inverse transform */
    /*--------------------------------------------------------------------*/
    /* IDCT [ Horizontal transformation ]                                 */
    /*--------------------------------------------------------------------*/
    // Matrix transpose
    /*
     *  a0 a1 a2 a3 a4 a5 a6 a7
     *  b0 b1 b2 b3 b4 b5 b6 b7
     *  c0 c1 c2 c3 c4 c5 c6 c7
     *  d0 d1 d2 d3 d4 d5 d6 d7
     */
    temp1 = _mm_unpacklo_epi16(resq_r0_1, resq_r1_1);  // a0 b0 a1 b1 a2 b2 a3 b3
    temp3 = _mm_unpacklo_epi16(resq_r2_1, resq_r3_1);  // c0 d0 c1 d1 c2 d2 c3 d3
    temp2 = _mm_unpackhi_epi16(resq_r0_1, resq_r1_1);  // a4 b4 a5 b5 a6 b6 a7 b7
    temp4 = _mm_unpackhi_epi16(resq_r2_1, resq_r3_1);  // c4 d4 c5 d5 c6 d6 c7 d7
    resq_r0_1 = _mm_unpacklo_epi32(temp1, temp3);      // a0 b0 c0 d0 a1 b1 c1 d1
    resq_r1_1 = _mm_unpackhi_epi32(temp1, temp3);      // a2 b2 c2 d2 a3 b3 c3 d3
    resq_r2_1 = _mm_unpacklo_epi32(temp2, temp4);      // a4 b4 c4 d4 a5 b5 c5 d5
    resq_r3_1 = _mm_unpackhi_epi32(temp2, temp4);      // a6 b6 c6 d6 a7 b7 c7 d7
    /*
     * e0 e1 e2 e3 e4 e5 e6 e7
     * f0 f1 f2 f3 f4 f5 f6 f7
     * g0 g1 g2 g3 g4 g5 g6 g7
     * h0 h1 h2 h3 h4 h5 h6 h7
     */
    temp1 = _mm_unpacklo_epi16(resq_r4_1, resq_r5_1);  // e0 f0 e1 f1 e2 f2 e3 f3
    temp3 = _mm_unpacklo_epi16(resq_r6_1, resq_r7_1);  // g0 h0 g1 h1 g2 h2 g3 h3
    temp2 = _mm_unpackhi_epi16(resq_r4_1, resq_r5_1);  // e4 f4 e5 f5 e6 f6 e7 f7
    temp4 = _mm_unpackhi_epi16(resq_r6_1, resq_r7_1);  // g4 h4 g5 h5 g6 h6 g7 h7
    resq_r4_1 = _mm_unpacklo_epi32(temp1, temp3);      // e0 f0 g0 h0 e1 f1 g1 h1
    resq_r5_1 = _mm_unpackhi_epi32(temp1, temp3);      // e2 f2 g2 h2 e3 f3 g3 h3
    resq_r6_1 = _mm_unpacklo_epi32(temp2, temp4);      // e4 f4 g4 h4 e5 f5 g5 h5
    resq_r7_1 = _mm_unpackhi_epi32(temp2, temp4);      // e6 f6 g6 h6 e7 f7 g7 h7
    /*
     * a0 b0 c0 d0 a1 b1 c1 d1
     * a2 b2 c2 d2 a3 b3 c3 d3
     * a4 b4 c4 d4 a5 b5 c5 d5
     * a6 b6 c6 d6 a7 b7 c7 d7
     * e0 f0 g0 h0 e1 f1 g1 h1
     * e2 f2 g2 h2 e3 f3 g3 h3
     * e4 f4 g4 h4 e5 f5 g5 h5
     * e6 f6 g6 h6 e7 f7 g7 h7
     */
    resq_r0_2 = _mm_unpacklo_epi64(resq_r0_1, resq_r4_1);  // a0 b0 c0 d0 e0 f0 g0 h0
    resq_r1_2 = _mm_unpackhi_epi64(resq_r0_1, resq_r4_1);  // a1 b1 c1 d1 e1 f1 g1 h1
    resq_r2_2 = _mm_unpacklo_epi64(resq_r1_1, resq_r5_1);  // a2 b2 c2 d2 e2 f2 g2 h2
    resq_r3_2 = _mm_unpackhi_epi64(resq_r1_1, resq_r5_1);  // a3 b3 c3 d3 e3 f3 g3 h3
    resq_r4_2 = _mm_unpacklo_epi64(resq_r2_1, resq_r6_1);  // a4 b4 c4 d4 e4 f4 g4 h4
    resq_r5_2 = _mm_unpackhi_epi64(resq_r2_1, resq_r6_1);  // a5 b5 c5 d5 e5 f5 g5 h5
    resq_r6_2 = _mm_unpacklo_epi64(resq_r3_1, resq_r7_1);  // a6 b6 c6 d6 e6 f6 g6 h6
    resq_r7_2 = _mm_unpackhi_epi64(resq_r3_1, resq_r7_1);  // a7 b7 c7 d7 e7 f7 g7 h7

    sign_reg = _mm_cmpgt_epi16(zero_8x16b, resq_r1_2);
    resq_r1_1 = _mm_unpacklo_epi16(resq_r1_2, sign_reg);  // a1 b1 c1 d1 -- 32 bit
    resq_r1_2 = _mm_unpackhi_epi16(resq_r1_2, sign_reg);  // e1 f1 g1 h1 -- 32 bit
    sign_reg = _mm_cmpgt_epi16(zero_8x16b, resq_r3_2);
    resq_r3_1 = _mm_unpacklo_epi16(resq_r3_2, sign_reg);  // a3 b3 c3 d3 -- 32 bit
    resq_r3_2 = _mm_unpackhi_epi16(resq_r3_2, sign_reg);  // e3 f3 g3 h3 -- 32 bit
    sign_reg = _mm_cmpgt_epi16(zero_8x16b, resq_r5_2);
    resq_r5_1 = _mm_unpacklo_epi16(resq_r5_2, sign_reg);  // a5 b5 c5 d5 -- 32 bit
    resq_r5_2 = _mm_unpackhi_epi16(resq_r5_2, sign_reg);  // e5 f5 g5 h5 -- 32 bit
    sign_reg = _mm_cmpgt_epi16(zero_8x16b, resq_r7_2);
    resq_r7_1 = _mm_unpacklo_epi16(resq_r7_2, sign_reg);  // a7 b7 c7 d7 -- 32 bit
    resq_r7_2 = _mm_unpackhi_epi16(resq_r7_2, sign_reg);  // e7 f7 g7 h7 -- 32 bit
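    /* Rows 1, 3, 5 and 7 are widened to 32 bits above (sign mask via cmpgt)
     * since the odd-coefficient butterfly below accumulates several terms
     * and could overflow 16 bits before being packed back. */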
    // Transform starts -- horizontal transform
    /*------------------------------------------------------------------*/
    /* y0 = w0 + w4                                                     */
    temp1 = _mm_add_epi16(resq_r0_2, resq_r4_2);
    /* y2 = w0 - w4                                                     */
    temp3 = _mm_sub_epi16(resq_r0_2, resq_r4_2);
    /* y1 = -w3 + w5 - w7 - (w7 >> 1)                                   */
    temp2 = _mm_sub_epi32(resq_r5_1, resq_r3_1);  //-w3+w5
    temp10 = _mm_sub_epi32(resq_r5_2, resq_r3_2);
    temp4 = _mm_sub_epi32(temp2, resq_r7_1);  //-w3+w5-w7
    temp12 = _mm_sub_epi32(temp10, resq_r7_2);
    temp5 = _mm_srai_epi32(resq_r7_1, 1);  // w7>>1
    temp13 = _mm_srai_epi32(resq_r7_2, 1);
    temp2 = _mm_sub_epi32(temp4, temp5);  //-w3+w5-w7-(w7>>1)
    temp10 = _mm_sub_epi32(temp12, temp13);
    temp2 = _mm_packs_epi32(temp2, temp10);
    /* y3 = w1 + w7 - w3 - (w3 >> 1)                                    */
    temp4 = _mm_add_epi32(resq_r1_1, resq_r7_1);  // w1+w7
    temp12 = _mm_add_epi32(resq_r1_2, resq_r7_2);
    temp4 = _mm_sub_epi32(temp4, resq_r3_1);  // w1+w7-w3
    temp12 = _mm_sub_epi32(temp12, resq_r3_2);
    temp5 = _mm_srai_epi32(resq_r3_1, 1);  // w3>>1
    temp13 = _mm_srai_epi32(resq_r3_2, 1);
    temp4 = _mm_sub_epi32(temp4, temp5);  // w1+w7-w3-(w3>>1)
    temp12 = _mm_sub_epi32(temp12, temp13);
    temp4 = _mm_packs_epi32(temp4, temp12);
    /* y4 = (w2 >> 1) - w6                                              */
    temp5 = _mm_srai_epi16(resq_r2_2, 1);     // w2>>1
    temp5 = _mm_sub_epi16(temp5, resq_r6_2);  //(w2>>1)-w6
    /* y5 = -w1 + w7 + w5 + (w5 >> 1)                                   */
    temp6 = _mm_sub_epi32(resq_r7_1, resq_r1_1);  // w7-w1
    temp14 = _mm_sub_epi32(resq_r7_2, resq_r1_2);
    temp6 = _mm_add_epi32(temp6, resq_r5_1);  // w7-w1+w5
    temp14 = _mm_add_epi32(temp14, resq_r5_2);
    temp7 = _mm_srai_epi32(resq_r5_1, 1);  // w5>>1
    temp15 = _mm_srai_epi32(resq_r5_2, 1);
    temp6 = _mm_add_epi32(temp6, temp7);  // w7-w1+w5+(w5>>1)
    temp14 = _mm_add_epi32(temp14, temp15);
    temp6 = _mm_packs_epi32(temp6, temp14);
    /* y6 = w2 + (w6 >> 1)                                              */
    temp7 = _mm_srai_epi16(resq_r6_2, 1);     // w6>>1
    temp7 = _mm_add_epi16(temp7, resq_r2_2);  //(w6>>1)+w2
    /* y7 = w3 + w5 + w1 + (w1 >> 1)                                    */
    temp8 = _mm_add_epi32(resq_r3_1, resq_r5_1);  // w3+w5
    temp16 = _mm_add_epi32(resq_r3_2, resq_r5_2);
    temp8 = _mm_add_epi32(temp8, resq_r1_1);  // w3+w5+w1
    temp16 = _mm_add_epi32(temp16, resq_r1_2);
    temp17 = _mm_srai_epi32(resq_r1_1, 1);  // w1>>1
    temp18 = _mm_srai_epi32(resq_r1_2, 1);
    temp8 = _mm_add_epi32(temp8, temp17);  // w3+w5+w1+(w1>>1)
    temp16 = _mm_add_epi32(temp16, temp18);
    temp8 = _mm_packs_epi32(temp8, temp16);
    /*------------------------------------------------------------------*/
    /*------------------------------------------------------------------*/
    /* z0 = y0 + y6                                                     */
    resq_r0_1 = _mm_add_epi16(temp1, temp7);
    /* z1 = y1 + (y7 >> 2)                                              */
    resq_r1_1 = _mm_srai_epi16(temp8, 2);
    resq_r1_1 = _mm_add_epi16(resq_r1_1, temp2);
    /* z2 = y2 + y4                                                     */
    resq_r2_1 = _mm_add_epi16(temp3, temp5);
    /* z3 = y3 + (y5 >> 2)                                              */
    resq_r3_1 = _mm_srai_epi16(temp6, 2);
    resq_r3_1 = _mm_add_epi16(resq_r3_1, temp4);
    /* z4 = y2 - y4                                                     */
    resq_r4_1 = _mm_sub_epi16(temp3, temp5);
    /* z5 = (y3 >> 2) - y5                                              */
    resq_r5_1 = _mm_srai_epi16(temp4, 2);
    resq_r5_1 = _mm_sub_epi16(resq_r5_1, temp6);
    /* z6 = y0 - y6                                                     */
    resq_r6_1 = _mm_sub_epi16(temp1, temp7);
    /* z7 = y7 - (y1 >> 2)                                              */
    resq_r7_1 = _mm_srai_epi16(temp2, 2);
    resq_r7_1 = _mm_sub_epi16(temp8, resq_r7_1);
    /*------------------------------------------------------------------*/
    /*------------------------------------------------------------------*/
    /* x0 = z0 + z7                                                     */
    temp1 = _mm_add_epi16(resq_r0_1, resq_r7_1);
    /* x1 = z2 + z5                                                     */
    temp2 = _mm_add_epi16(resq_r2_1, resq_r5_1);
    /* x2 = z4 + z3                                                     */
    temp3 = _mm_add_epi16(resq_r4_1, resq_r3_1);
    /* x3 = z6 + z1                                                     */
    temp4 = _mm_add_epi16(resq_r6_1, resq_r1_1);
    /* x4 = z6 - z1                                                     */
    temp5 = _mm_sub_epi16(resq_r6_1, resq_r1_1);
    /* x5 = z4 - z3                                                     */
    temp6 = _mm_sub_epi16(resq_r4_1, resq_r3_1);
    /* x6 = z2 - z5                                                     */
    temp7 = _mm_sub_epi16(resq_r2_1, resq_r5_1);
    /* x7 = z0 - z7                                                     */
    temp8 = _mm_sub_epi16(resq_r0_1, resq_r7_1);
    /*------------------------------------------------------------------*/
    // Matrix transpose
    /*
     *  a0 b0 c0 d0 e0 f0 g0 h0
     *  a1 b1 c1 d1 e1 f1 g1 h1
     *  a2 b2 c2 d2 e2 f2 g2 h2
     *  a3 b3 c3 d3 e3 f3 g3 h3
     */
    temp17 = _mm_unpacklo_epi16(temp1, temp2);  // a0 a1 b0 b1 c0 c1 d0 d1
    temp19 = _mm_unpacklo_epi16(temp3, temp4);  // a2 a3 b2 b3 c2 c3 d2 d3
    temp18 = _mm_unpackhi_epi16(temp1, temp2);  // e0 e1 f0 f1 g0 g1 h0 h1
    temp20 = _mm_unpackhi_epi16(temp3, temp4);  // e2 e3 f2 f3 g2 g3 h2 h3

    resq_r0_1 = _mm_unpacklo_epi32(temp17, temp19);  // a0 a1 a2 a3 b0 b1 b2 b3
    resq_r1_1 = _mm_unpackhi_epi32(temp17, temp19);  // c0 c1 c2 c3 d0 d1 d2 d3
    resq_r2_1 = _mm_unpacklo_epi32(temp18, temp20);  // e0 e1 e2 e3 f0 f1 f2 f3
    resq_r3_1 = _mm_unpackhi_epi32(temp18, temp20);  // g0 g1 g2 g3 h0 h1 h2 h3
    /*
     *  a4 b4 c4 d4 e4 f4 g4 h4
     *  a5 b5 c5 d5 e5 f5 g5 h5
     *  a6 b6 c6 d6 e6 f6 g6 h6
     *  a7 b7 c7 d7 e7 f7 g7 h7
     */
    temp17 = _mm_unpacklo_epi16(temp5, temp6);  // a4 a5 b4 b5 c4 c5 d4 d5
    temp19 = _mm_unpacklo_epi16(temp7, temp8);  // a6 a7 b6 b7 c6 c7 d6 d7
    temp18 = _mm_unpackhi_epi16(temp5, temp6);  // e4 e5 f4 f5 g4 g5 h4 h5
    temp20 = _mm_unpackhi_epi16(temp7, temp8);  // e6 e7 f6 f7 g6 g7 h6 h7

    resq_r4_1 = _mm_unpacklo_epi32(temp17, temp19);  // a4 a5 a6 a7 b4 b5 b6 b7
    resq_r5_1 = _mm_unpackhi_epi32(temp17, temp19);  // c4 c5 c6 c7 d4 d5 d6 d7
    resq_r6_1 = _mm_unpacklo_epi32(temp18, temp20);  // e4 e5 e6 e7 f4 f5 f6 f7
    resq_r7_1 = _mm_unpackhi_epi32(temp18, temp20);  // g4 g5 g6 g7 h4 h5 h6 h7
    /*  a0 a1 a2 a3 b0 b1 b2 b3
     *  c0 c1 c2 c3 d0 d1 d2 d3
     *  e0 e1 e2 e3 f0 f1 f2 f3
     *  g0 g1 g2 g3 h0 h1 h2 h3
     *  a4 a5 a6 a7 b4 b5 b6 b7
     *  c4 c5 c6 c7 d4 d5 d6 d7
     *  e4 e5 e6 e7 f4 f5 f6 f7
     *  g4 g5 g6 g7 h4 h5 h6 h7
     */
    resq_r0_2 = _mm_unpacklo_epi64(resq_r0_1, resq_r4_1);  // a0 a1 a2 a3 a4 a5 a6 a7
    resq_r1_2 = _mm_unpackhi_epi64(resq_r0_1, resq_r4_1);  // b0 b1 b2 b3 b4 b5 b6 b7
    resq_r2_2 = _mm_unpacklo_epi64(resq_r1_1, resq_r5_1);  // c0 c1 c2 c3 c4 c5 c6 c7
    resq_r3_2 = _mm_unpackhi_epi64(resq_r1_1, resq_r5_1);  // d0 d1 d2 d3 d4 d5 d6 d7
    resq_r4_2 = _mm_unpacklo_epi64(resq_r2_1, resq_r6_1);  // e0 e1 e2 e3 e4 e5 e6 e7
    resq_r5_2 = _mm_unpackhi_epi64(resq_r2_1, resq_r6_1);  // f0 f1 f2 f3 f4 f5 f6 f7
    resq_r6_2 = _mm_unpacklo_epi64(resq_r3_1, resq_r7_1);  // g0 g1 g2 g3 g4 g5 g6 g7
    resq_r7_2 = _mm_unpackhi_epi64(resq_r3_1, resq_r7_1);  // h0 h1 h2 h3 h4 h5 h6 h7

1022     sign_reg = _mm_cmpgt_epi16(zero_8x16b, resq_r1_2);
1023     resq_r1_1 = _mm_unpacklo_epi16(resq_r1_2, sign_reg);  // a1 b1 c1 d1 -- 32 bit
1024     resq_r1_2 = _mm_unpackhi_epi16(resq_r1_2, sign_reg);  // e1 f1 g1 h1 -- 32 bit
1025     sign_reg = _mm_cmpgt_epi16(zero_8x16b, resq_r3_2);
1026     resq_r3_1 = _mm_unpacklo_epi16(resq_r3_2, sign_reg);  // a3 b3 c3 d3 -- 32 bit
1027     resq_r3_2 = _mm_unpackhi_epi16(resq_r3_2, sign_reg);  // e3 f3 g3 h3 -- 32 bit
1028     sign_reg = _mm_cmpgt_epi16(zero_8x16b, resq_r5_2);
1029     resq_r5_1 = _mm_unpacklo_epi16(resq_r5_2, sign_reg);  // a5 b5 c5 d5 -- 32 bit
1030     resq_r5_2 = _mm_unpackhi_epi16(resq_r5_2, sign_reg);  // e5 f5 g5 h5 -- 32 bit
1031     sign_reg = _mm_cmpgt_epi16(zero_8x16b, resq_r7_2);
1032     resq_r7_1 = _mm_unpacklo_epi16(resq_r7_2, sign_reg);  // a7 b7 c7 d7 -- 32 bit
1033     resq_r7_2 = _mm_unpackhi_epi16(resq_r7_2, sign_reg);  // e7 f7 g7 h7 -- 32 bit
1034 
1035     zero_8x16b = _mm_setzero_si128();  // all bits reset to zero
1036     // Load pred buffer row 0
1037     predload_r =
1038         _mm_loadl_epi64((__m128i *) (&pu1_pred[0]));  // p0 p1 p2 p3 p4 p5 p6 p7 0 0 0 0 0 0 0 0
1039                                                       // -- all 8 bits
1040     pred_r0_1 =
1041         _mm_unpacklo_epi8(predload_r, zero_8x16b);  // p0 p1 p2 p3 p4 p5 p6 p7 -- all 16 bits
1042     // Load pred buffer row 1
1043     predload_r =
1044         _mm_loadl_epi64((__m128i *) (&pu1_pred[i4_pred_stride]));  // p0 p1 p2 p3 p4 p5 p6 p7 0 0
1045                                                                    // 0 0 0 0 0 0 -- all 8 bits
1046     pred_r1_1 =
1047         _mm_unpacklo_epi8(predload_r, zero_8x16b);  // p0 p1 p2 p3 p4 p5 p6 p7 -- all 16 bits
1048     // Load pred buffer row 2
1049     predload_r = _mm_loadl_epi64(
1050         (__m128i *) (&pu1_pred[2 * i4_pred_stride]));  // p0 p1 p2 p3 p4 p5 p6 p7 0 0
1051                                                        // 0 0 0 0 0 0 -- all 8 bits
1052     pred_r2_1 =
1053         _mm_unpacklo_epi8(predload_r, zero_8x16b);  // p0 p1 p2 p3 p4 p5 p6 p7 -- all 16 bits
1054     // Load pred buffer row 3
1055     predload_r = _mm_loadl_epi64(
1056         (__m128i *) (&pu1_pred[3 * i4_pred_stride]));  // p0 p1 p2 p3 p4 p5 p6 p7 0 0
1057                                                        // 0 0 0 0 0 0 -- all 8 bits
1058     pred_r3_1 =
1059         _mm_unpacklo_epi8(predload_r, zero_8x16b);  // p0 p1 p2 p3 p4 p5 p6 p7 -- all 16 bits
1060     // Load pred buffer row 4
1061     predload_r = _mm_loadl_epi64(
1062         (__m128i *) (&pu1_pred[4 * i4_pred_stride]));  // p0 p1 p2 p3 p4 p5 p6 p7 0 0
1063                                                        // 0 0 0 0 0 0 -- all 8 bits
1064     pred_r4_1 =
1065         _mm_unpacklo_epi8(predload_r, zero_8x16b);  // p0 p1 p2 p3 p4 p5 p6 p7 -- all 16 bits
1066     // Load pred buffer row 5
1067     predload_r =
1068         _mm_loadl_epi64((__m128i *) (&pu1_pred[5 * i4_pred_stride]));  // p0 p1 p2 p3 p4 p5 p6 p7 0
1069                                                                        // 0 0 0 0 0 0 0 -- all 8 bits
1070     pred_r5_1 =
1071         _mm_unpacklo_epi8(predload_r, zero_8x16b);  // p0 p1 p2 p3 p4 p5 p6 p7 -- all 16 bits
1072     // Load pred buffer row 6
1073     predload_r = _mm_loadl_epi64(
1074         (__m128i *) (&pu1_pred[6 * i4_pred_stride]));  // p0 p1 p2 p3 p4 p5 p6 p7 0 0
1075                                                        // 0 0 0 0 0 0 -- all 8 bits
1076     pred_r6_1 =
1077         _mm_unpacklo_epi8(predload_r, zero_8x16b);  // p0 p1 p2 p3 p4 p5 p6 p7 -- all 16 bits
1078     // Load pred buffer row 7
1079     predload_r = _mm_loadl_epi64(
1080         (__m128i *) (&pu1_pred[7 * i4_pred_stride]));  // p0 p1 p2 p3 p4 p5 p6 p7 0 0
1081                                                        // 0 0 0 0 0 0 -- all 8 bits
1082     pred_r7_1 =
1083         _mm_unpacklo_epi8(predload_r, zero_8x16b);  // p0 p1 p2 p3 p4 p5 p6 p7 -- all 16 bits
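    /*
     * Each of the eight loads above reads one 8-byte prediction row into the
     * low half of an XMM register; _mm_unpacklo_epi8 with zero_8x16b then
     * zero-extends the pixels to 16 bits so they can be added to the 16-bit
     * transform output. Scalar equivalent for row i (illustrative only):
     *
     *     for(j = 0; j < 8; j++)
     *         pred_r[i][j] = (WORD16) pu1_pred[i * i4_pred_stride + j];
     */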
1084 
1085     /*--------------------------------------------------------------------*/
1086     /* IDCT [Vertical transformation] and Xij = (xij + 32) >> 6            */
1087     /*                                                                    */
1088     /* The prediction is then added and the result is stored to the       */
1089     /* recon buffer pu1_out                                               */
1090     /*--------------------------------------------------------------------*/
1091 
1092     /* y0j = w0j + w4j                                                     */
1093     temp1 = _mm_add_epi16(resq_r0_2, resq_r4_2);
1094     /* y2j = w0j - w4j                                                      */
1095     temp3 = _mm_sub_epi16(resq_r0_2, resq_r4_2);
1096     /* y1j = -w3j + w5j - w7j - (w7j >> 1)                                   */
1097     temp2 = _mm_sub_epi32(resq_r5_1, resq_r3_1);  //-w3+w5
1098     temp10 = _mm_sub_epi32(resq_r5_2, resq_r3_2);
1099     temp4 = _mm_sub_epi32(temp2, resq_r7_1);  //-w3+w5-w7
1100     temp12 = _mm_sub_epi32(temp10, resq_r7_2);
1101     temp5 = _mm_srai_epi32(resq_r7_1, 1);  // w7>>1
1102     temp13 = _mm_srai_epi32(resq_r7_2, 1);
1103     temp2 = _mm_sub_epi32(temp4, temp5);  //-w3+w5-w7 -(w7>>1)
1104     temp10 = _mm_sub_epi32(temp12, temp13);
1105     temp2 = _mm_packs_epi32(temp2, temp10);
1106     /* y3j = w1j + w7j - w3j - (w3j >> 1)                                    */
1107     temp4 = _mm_add_epi32(resq_r1_1, resq_r7_1);  // w1+w7
1108     temp12 = _mm_add_epi32(resq_r1_2, resq_r7_2);
1109     temp4 = _mm_sub_epi32(temp4, resq_r3_1);  // w1+w7-w3
1110     temp12 = _mm_sub_epi32(temp12, resq_r3_2);
1111     temp5 = _mm_srai_epi32(resq_r3_1, 1);  // w3>>1
1112     temp13 = _mm_srai_epi32(resq_r3_2, 1);
1113     temp4 = _mm_sub_epi32(temp4, temp5);  // w1+w7-w3-(w3>>1)
1114     temp12 = _mm_sub_epi32(temp12, temp13);
1115     temp4 = _mm_packs_epi32(temp4, temp12);
1116     /* y4j = (w2j >> 1) - w6j                                              */
1117     temp5 = _mm_srai_epi16(resq_r2_2, 1);     // w2>>1
1118     temp5 = _mm_sub_epi16(temp5, resq_r6_2);  //(w2>>1)-w6
1119     /* y5j = -w1j + w7j + w5j + (w5j >> 1)                                   */
1120     temp6 = _mm_sub_epi32(resq_r7_1, resq_r1_1);  // w7-w1
1121     temp14 = _mm_sub_epi32(resq_r7_2, resq_r1_2);
1122     temp6 = _mm_add_epi32(temp6, resq_r5_1);  // w7-w1+w5
1123     temp14 = _mm_add_epi32(temp14, resq_r5_2);
1124     temp7 = _mm_srai_epi32(resq_r5_1, 1);  // w5>>1
1125     temp15 = _mm_srai_epi32(resq_r5_2, 1);
1126     temp6 = _mm_add_epi32(temp6, temp7);  // w7-w1+w5+(w5>>1)
1127     temp14 = _mm_add_epi32(temp14, temp15);
1128     temp6 = _mm_packs_epi32(temp6, temp14);
1129     /* y6j = w2j + (w6j >> 1)                                              */
1130     temp7 = _mm_srai_epi16(resq_r6_2, 1);     // w6>>1
1131     temp7 = _mm_add_epi16(temp7, resq_r2_2);  //(w6>>1)+w2
1132     /* y7j = w3j + w5j + w1j + (w1j >> 1)                                    */
1133     temp8 = _mm_add_epi32(resq_r3_1, resq_r5_1);  // w3+w5
1134     temp16 = _mm_add_epi32(resq_r3_2, resq_r5_2);
1135     temp8 = _mm_add_epi32(temp8, resq_r1_1);  // w3+w5+w1
1136     temp16 = _mm_add_epi32(temp16, resq_r1_2);
1137     temp17 = _mm_srai_epi32(resq_r1_1, 1);  // w1>>1
1138     temp18 = _mm_srai_epi32(resq_r1_2, 1);
1139     temp8 = _mm_add_epi32(temp8, temp17);  // w3+w5+w1+(w1>>1)
1140     temp16 = _mm_add_epi32(temp16, temp18);
1141     temp8 = _mm_packs_epi32(temp8, temp16);
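    /*
     * Stage-1 summary, collected from the per-line comments above (scalar
     * form, illustrative only):
     *
     *     y0 = w0 + w4;                    y4 = (w2 >> 1) - w6;
     *     y2 = w0 - w4;                    y6 = w2 + (w6 >> 1);
     *     y1 = -w3 + w5 - w7 - (w7 >> 1);  y5 = -w1 + w7 + w5 + (w5 >> 1);
     *     y3 =  w1 + w7 - w3 - (w3 >> 1);  y7 =  w3 + w5 + w1 + (w1 >> 1);
     */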
1142     /*------------------------------------------------------------------*/
1143     /*------------------------------------------------------------------*/
1144     /* z0j = y0j + y6j                                                        */
1145     resq_r0_1 = _mm_add_epi16(temp1, temp7);
1146     /* z1j = y1j + (y7j >> 2)                                                */
1147     resq_r1_1 = _mm_srai_epi16(temp8, 2);
1148     resq_r1_1 = _mm_add_epi16(resq_r1_1, temp2);
1149     /* z2j = y2j + y4j                                                        */
1150     resq_r2_1 = _mm_add_epi16(temp3, temp5);
1151     /* z3j = y3j + (y5j >> 2)                                                */
1152     resq_r3_1 = _mm_srai_epi16(temp6, 2);
1153     resq_r3_1 = _mm_add_epi16(resq_r3_1, temp4);
1154     /* z4j = y2j - y4j                                                        */
1155     resq_r4_1 = _mm_sub_epi16(temp3, temp5);
1156     /* z5j = (y3j >> 2) - y5j                                                 */
1157     resq_r5_1 = _mm_srai_epi16(temp4, 2);
1158     resq_r5_1 = _mm_sub_epi16(resq_r5_1, temp6);
1159     /* z6j = y0j - y6j                                                     */
1160     resq_r6_1 = _mm_sub_epi16(temp1, temp7);
1161     /* z7j = y7j - (y1j >> 2)                                                 */
1162     resq_r7_1 = _mm_srai_epi16(temp2, 2);
1163     resq_r7_1 = _mm_sub_epi16(temp8, resq_r7_1);
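    /*
     * Stage-2 summary (scalar form, illustrative only):
     *
     *     z0 = y0 + y6;           z4 = y2 - y4;
     *     z1 = y1 + (y7 >> 2);    z5 = (y3 >> 2) - y5;
     *     z2 = y2 + y4;           z6 = y0 - y6;
     *     z3 = y3 + (y5 >> 2);    z7 = y7 - (y1 >> 2);
     */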
1164     /*------------------------------------------------------------------*/
1165 
1166     /*------------------------------------------------------------------*/
1167     /* x0j = z0j + z7j                                                        */
1168     temp1 = _mm_add_epi16(resq_r0_1, resq_r7_1);
1169     sign_reg = _mm_cmpgt_epi16(zero_8x16b, temp1);
1170     temp10 = _mm_unpacklo_epi16(temp1, sign_reg);
1171     temp11 = _mm_unpackhi_epi16(temp1, sign_reg);
1172     temp10 = _mm_add_epi32(temp10, value_32);
1173     temp11 = _mm_add_epi32(temp11, value_32);
1174     temp10 = _mm_srai_epi32(temp10, 6);
1175     temp11 = _mm_srai_epi32(temp11, 6);
1176     temp10 = _mm_packs_epi32(temp10, temp11);
1177     temp1 = _mm_add_epi16(temp10, pred_r0_1);
1178     /* x1j = z2j + z5j                                                        */
1179     temp2 = _mm_add_epi16(resq_r2_1, resq_r5_1);
1180     sign_reg = _mm_cmpgt_epi16(zero_8x16b, temp2);
1181     temp10 = _mm_unpacklo_epi16(temp2, sign_reg);
1182     temp11 = _mm_unpackhi_epi16(temp2, sign_reg);
1183     temp10 = _mm_add_epi32(temp10, value_32);
1184     temp11 = _mm_add_epi32(temp11, value_32);
1185     temp10 = _mm_srai_epi32(temp10, 6);
1186     temp11 = _mm_srai_epi32(temp11, 6);
1187     temp10 = _mm_packs_epi32(temp10, temp11);
1188     temp2 = _mm_add_epi16(temp10, pred_r1_1);
1189     /* x2j = z4j + z3j                                                        */
1190     temp3 = _mm_add_epi16(resq_r4_1, resq_r3_1);
1191     sign_reg = _mm_cmpgt_epi16(zero_8x16b, temp3);
1192     temp10 = _mm_unpacklo_epi16(temp3, sign_reg);
1193     temp11 = _mm_unpackhi_epi16(temp3, sign_reg);
1194     temp10 = _mm_add_epi32(temp10, value_32);
1195     temp11 = _mm_add_epi32(temp11, value_32);
1196     temp10 = _mm_srai_epi32(temp10, 6);
1197     temp11 = _mm_srai_epi32(temp11, 6);
1198     temp10 = _mm_packs_epi32(temp10, temp11);
1199     temp3 = _mm_add_epi16(temp10, pred_r2_1);
1200     /* x3j = z6j + z1j                                                        */
1201     temp4 = _mm_add_epi16(resq_r6_1, resq_r1_1);
1202     sign_reg = _mm_cmpgt_epi16(zero_8x16b, temp4);
1203     temp10 = _mm_unpacklo_epi16(temp4, sign_reg);
1204     temp11 = _mm_unpackhi_epi16(temp4, sign_reg);
1205     temp10 = _mm_add_epi32(temp10, value_32);
1206     temp11 = _mm_add_epi32(temp11, value_32);
1207     temp10 = _mm_srai_epi32(temp10, 6);
1208     temp11 = _mm_srai_epi32(temp11, 6);
1209     temp10 = _mm_packs_epi32(temp10, temp11);
1210     temp4 = _mm_add_epi16(temp10, pred_r3_1);
1211     /* x4j = z6j - z1j                                                        */
1212     temp5 = _mm_sub_epi16(resq_r6_1, resq_r1_1);
1213     sign_reg = _mm_cmpgt_epi16(zero_8x16b, temp5);
1214     temp10 = _mm_unpacklo_epi16(temp5, sign_reg);
1215     temp11 = _mm_unpackhi_epi16(temp5, sign_reg);
1216     temp10 = _mm_add_epi32(temp10, value_32);
1217     temp11 = _mm_add_epi32(temp11, value_32);
1218     temp10 = _mm_srai_epi32(temp10, 6);
1219     temp11 = _mm_srai_epi32(temp11, 6);
1220     temp10 = _mm_packs_epi32(temp10, temp11);
1221     temp5 = _mm_add_epi16(temp10, pred_r4_1);
1222     /* x5j = z4j - z3j                                                        */
1223     temp6 = _mm_sub_epi16(resq_r4_1, resq_r3_1);
1224     sign_reg = _mm_cmpgt_epi16(zero_8x16b, temp6);
1225     temp10 = _mm_unpacklo_epi16(temp6, sign_reg);
1226     temp11 = _mm_unpackhi_epi16(temp6, sign_reg);
1227     temp10 = _mm_add_epi32(temp10, value_32);
1228     temp11 = _mm_add_epi32(temp11, value_32);
1229     temp10 = _mm_srai_epi32(temp10, 6);
1230     temp11 = _mm_srai_epi32(temp11, 6);
1231     temp10 = _mm_packs_epi32(temp10, temp11);
1232     temp6 = _mm_add_epi16(temp10, pred_r5_1);
1233     /* x6j = z2j - z5j                                                        */
1234     temp7 = _mm_sub_epi16(resq_r2_1, resq_r5_1);
1235     sign_reg = _mm_cmpgt_epi16(zero_8x16b, temp7);
1236     temp10 = _mm_unpacklo_epi16(temp7, sign_reg);
1237     temp11 = _mm_unpackhi_epi16(temp7, sign_reg);
1238     temp10 = _mm_add_epi32(temp10, value_32);
1239     temp11 = _mm_add_epi32(temp11, value_32);
1240     temp10 = _mm_srai_epi32(temp10, 6);
1241     temp11 = _mm_srai_epi32(temp11, 6);
1242     temp10 = _mm_packs_epi32(temp10, temp11);
1243     temp7 = _mm_add_epi16(temp10, pred_r6_1);
1244     /* x7j = z0j - z7j                                                        */
1245     temp8 = _mm_sub_epi16(resq_r0_1, resq_r7_1);
1246     sign_reg = _mm_cmpgt_epi16(zero_8x16b, temp8);
1247     temp10 = _mm_unpacklo_epi16(temp8, sign_reg);
1248     temp11 = _mm_unpackhi_epi16(temp8, sign_reg);
1249     temp10 = _mm_add_epi32(temp10, value_32);
1250     temp11 = _mm_add_epi32(temp11, value_32);
1251     temp10 = _mm_srai_epi32(temp10, 6);
1252     temp11 = _mm_srai_epi32(temp11, 6);
1253     temp10 = _mm_packs_epi32(temp10, temp11);
1254     temp8 = _mm_add_epi16(temp10, pred_r7_1);
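    /*
     * Stage-3 summary: each x term is a butterfly of two z terms, rounded
     * and scaled by (x + 32) >> 6 at 32-bit precision, then added to the
     * corresponding prediction row (scalar form, illustrative only):
     *
     *     x0 = z0 + z7;  x1 = z2 + z5;  x2 = z4 + z3;  x3 = z6 + z1;
     *     x4 = z6 - z1;  x5 = z4 - z3;  x6 = z2 - z5;  x7 = z0 - z7;
     *     out_row[i] = pred_row[i] + ((x[i] + 32) >> 6);
     */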
1255     /*------------------------------------------------------------------*/
1256     // Clip the results to [0, 255]: negative lanes are masked to zero here; the _mm_packus_epi16 calls below saturate values above 255
1257     sign_reg = _mm_cmpgt_epi16(temp1, zero_8x16b);  // sign check
1258     temp1 = _mm_and_si128(temp1, sign_reg);
1259     sign_reg = _mm_cmpgt_epi16(temp2, zero_8x16b);  // sign check
1260     temp2 = _mm_and_si128(temp2, sign_reg);
1261     sign_reg = _mm_cmpgt_epi16(temp3, zero_8x16b);  // sign check
1262     temp3 = _mm_and_si128(temp3, sign_reg);
1263     sign_reg = _mm_cmpgt_epi16(temp4, zero_8x16b);  // sign check
1264     temp4 = _mm_and_si128(temp4, sign_reg);
1265     sign_reg = _mm_cmpgt_epi16(temp5, zero_8x16b);  // sign check
1266     temp5 = _mm_and_si128(temp5, sign_reg);
1267     sign_reg = _mm_cmpgt_epi16(temp6, zero_8x16b);  // sign check
1268     temp6 = _mm_and_si128(temp6, sign_reg);
1269     sign_reg = _mm_cmpgt_epi16(temp7, zero_8x16b);  // sign check
1270     temp7 = _mm_and_si128(temp7, sign_reg);
1271     sign_reg = _mm_cmpgt_epi16(temp8, zero_8x16b);  // sign check
1272     temp8 = _mm_and_si128(temp8, sign_reg);
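    /*
     * The cmpgt/and pairs above compute max(x, 0) without a dedicated
     * instruction: the compare produces an all-ones mask only for strictly
     * positive lanes, so the AND clears every negative lane (zero lanes are
     * unaffected either way). Scalar equivalent (illustrative only):
     *
     *     x = (x > 0) ? x : 0;
     */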
1273 
1274     resq_r0_2 = _mm_packus_epi16(temp1, zero_8x16b);
1275     resq_r1_2 = _mm_packus_epi16(temp2, zero_8x16b);
1276     resq_r2_2 = _mm_packus_epi16(temp3, zero_8x16b);
1277     resq_r3_2 = _mm_packus_epi16(temp4, zero_8x16b);
1278     resq_r4_2 = _mm_packus_epi16(temp5, zero_8x16b);
1279     resq_r5_2 = _mm_packus_epi16(temp6, zero_8x16b);
1280     resq_r6_2 = _mm_packus_epi16(temp7, zero_8x16b);
1281     resq_r7_2 = _mm_packus_epi16(temp8, zero_8x16b);
1282 
1283     _mm_storel_epi64((__m128i *) (&pu1_out[0]), resq_r0_2);
1284     _mm_storel_epi64((__m128i *) (&pu1_out[i4_out_stride]), resq_r1_2);
1285     _mm_storel_epi64((__m128i *) (&pu1_out[2 * i4_out_stride]), resq_r2_2);
1286     _mm_storel_epi64((__m128i *) (&pu1_out[3 * i4_out_stride]), resq_r3_2);
1287     _mm_storel_epi64((__m128i *) (&pu1_out[4 * i4_out_stride]), resq_r4_2);
1288     _mm_storel_epi64((__m128i *) (&pu1_out[5 * i4_out_stride]), resq_r5_2);
1289     _mm_storel_epi64((__m128i *) (&pu1_out[6 * i4_out_stride]), resq_r6_2);
1290     _mm_storel_epi64((__m128i *) (&pu1_out[7 * i4_out_stride]), resq_r7_2);
1291 }
1292
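/*
 * Usage sketch (hypothetical): this assumes the 8x8 entry point shares the
 * argument layout of the file's 4x4 variant, and that buffer_container_t
 * exposes a data pointer and stride as its first two members; check
 * isvc_structs.h before relying on either assumption. All names below
 * (pi2_coeffs, pu1_pred, pu1_rec, pred_stride, rec_stride, s_iq_consts,
 * pi2_tmp) are placeholders:
 *
 *     buffer_container_t s_src = {pi2_coeffs, 8};
 *     buffer_container_t s_pred = {pu1_pred, pred_stride};
 *     buffer_container_t s_rec = {pu1_rec, rec_stride};
 *
 *     isvc_iquant_itrans_recon_8x8_ssse3(&s_src, &s_pred, NULL, NULL, &s_rec,
 *                                        &s_iq_consts, pi2_tmp, NULL, 0, 0);
 */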