/******************************************************************************
 *
 * Copyright (C) 2022 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 */
/**
 *******************************************************************************
 * @file
 *  isvc_iquant_itrans_recon_ssse3.c
 *
 * @brief
 *  Contains function definitions for inverse quantization, inverse
 *  transform and reconstruction
 *
 * @author
 *  Mohit [100664]
 *
 * @par List of Functions:
 *  - isvc_iquant_itrans_recon_4x4_ssse3()
 *  - isvc_iquant_itrans_recon_8x8_ssse3()
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */
#include <immintrin.h>

#include "ih264_typedefs.h"
#include "ih264_debug.h"
#include "ih264_defs.h"
#include "ih264_trans_macros.h"
#include "ih264_macros.h"
#include "ih264_platform_macros.h"
#include "ih264_trans_data.h"
#include "ih264_size_defs.h"
#include "isvc_structs.h"
#include "isvc_trans_quant_itrans_iquant.h"
/*
 ********************************************************************************
 *
 * @brief This function reconstructs a 4x4 sub block from the quantized residue
 * and the prediction buffer
 *
 * @par Description:
 *  The quantized residue is first inverse quantized, then inverse transformed.
 *  The inverse transformed content is added to the prediction buffer to
 *  reconstruct the final output
 *
 * @param[in] ps_src
 *  Buffer container holding the quantized 4x4 block
 *
 * @param[in] ps_pred
 *  Buffer container holding the 4x4 prediction block
 *
 * @param[in] ps_res_pred
 *  Buffer container for the residue prediction (unused in this variant)
 *
 * @param[in] ps_res
 *  Buffer container for the output residue (unused in this variant)
 *
 * @param[out] ps_rec
 *  Buffer container for the reconstructed 4x4 block
 *
 * @param[in] ps_iq_it_res_rec_constants
 *  Pointer to the inverse scaling matrix, the dequant weight matrix and
 *  Floor(qp / 6)
 *
 * @param[in] pi2_tmp
 *  Temporary buffer of size 1*16 (unused in this variant)
 *
 * @param[in] pi2_dc_src
 *  Pointer to the DC coefficient source (unused in this variant)
 *
 * @param[in] i4_iq_start_idx
 *  0 for inter blocks; 1 for intra blocks, whose DC coefficient bypasses
 *  inverse quantization
 *
 * @param[in] u1_res_accumulate
 *  Flag to enable residue accumulation (unused in this variant)
 *
 * @returns none
 *
 * @remarks none
 *
 *******************************************************************************
 */
void isvc_iquant_itrans_recon_4x4_ssse3(buffer_container_t *ps_src, buffer_container_t *ps_pred,
                                        buffer_container_t *ps_res_pred, buffer_container_t *ps_res,
                                        buffer_container_t *ps_rec,
                                        iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants,
                                        WORD16 *pi2_tmp, WORD16 *pi2_dc_src, WORD32 i4_iq_start_idx,
                                        UWORD8 u1_res_accumulate)
{
    WORD16 *pi2_src = ps_src->pv_data;
    WORD16 *pi2_res = ps_res->pv_data;
    WORD16 *pi2_res_pred = ps_res_pred->pv_data;
    UWORD8 *pu1_pred = ps_pred->pv_data;
    UWORD8 *pu1_out = ps_rec->pv_data;
    WORD32 i4_src_stride = ps_src->i4_data_stride;
    WORD32 i4_res_stride = ps_res->i4_data_stride;
    WORD32 i4_res_pred_stride = ps_res_pred->i4_data_stride;
    WORD32 i4_pred_stride = ps_pred->i4_data_stride;
    WORD32 i4_out_stride = ps_rec->i4_data_stride;
    const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat;
    const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat;
    UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6;
    UWORD32 *pu4_out = (UWORD32 *) pu1_out;
    __m128i src_r0_r1, src_r2_r3;
    __m128i src_r0, src_r1, src_r2, src_r3;
    __m128i scalemat_r0_r1, scalemat_r2_r3, predload_r;
    __m128i pred_r0, pred_r1, pred_r2, pred_r3;
    __m128i sign_reg, dequant_r0_r1, dequant_r2_r3;
    __m128i zero_8x16b = _mm_setzero_si128();  // all bits reset to zero
    __m128i temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
    __m128i resq_r0, resq_r1, resq_r2, resq_r3;
    __m128i add_rshift = _mm_set1_epi32((u4_qp_div_6 < 4) ? (1 << (3 - u4_qp_div_6)) : 0);
    __m128i value_32 = _mm_set1_epi32(32);

    UNUSED(pi2_tmp);
    UNUSED(pi2_dc_src);
    UNUSED(u1_res_accumulate);
    UNUSED(i4_src_stride);
    UNUSED(i4_res_stride);
    UNUSED(i4_res_pred_stride);
    UNUSED(pi2_res);
    UNUSED(pi2_res_pred);

    /* TODO: Implement residue accumulation */
    ASSERT(0);

    /*************************************************************/
    /* Dequantization of coefficients. Will be replaced by SIMD */
    /* operations on platform                                    */
    /*************************************************************/
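    /* Scalar reference for the SIMD dequant below -- an illustrative sketch
     * mirroring the vector arithmetic, not the production code path:
     *
     *     WORD32 i4_w = pi2_src[i] * pu2_iscal_mat[i] * pu2_weigh_mat[i];
     *
     *     if(u4_qp_div_6 >= 4)
     *         i4_w <<= (u4_qp_div_6 - 4);
     *     else
     *         i4_w = (i4_w + (1 << (3 - u4_qp_div_6))) >> (4 - u4_qp_div_6);
     */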
    src_r0_r1 = _mm_loadu_si128((__m128i *) (pi2_src));  // a00 a01 a02 a03 a10 a11 a12 a13 -- the
                                                         // source matrix 0th,1st row
    src_r2_r3 = _mm_loadu_si128((__m128i *) (pi2_src + 8));  // a20 a21 a22 a23 a30 a31 a32 a33 --
                                                             // the source matrix 2nd,3rd row
    scalemat_r0_r1 =
        _mm_loadu_si128((__m128i *) (pu2_iscal_mat));  // b00 b01 b02 b03 b10 b11 b12 b13 -- the
                                                       // scaling matrix 0th,1st row
    scalemat_r2_r3 =
        _mm_loadu_si128((__m128i *) (pu2_iscal_mat + 8));  // b20 b21 b22 b23 b30 b31 b32 b33 --
                                                           // the scaling matrix 2nd,3rd row
    dequant_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_weigh_mat));  // q00 q01 q02 q03 q10 q11
                                                                   // q12 q13 -- all 16 bits
    dequant_r2_r3 = _mm_loadu_si128(
        (__m128i *) (pu2_weigh_mat + 8));  // q20 q21 q22 q23 q30 q31 q32 q33 -- all 16 bits

    temp0 = _mm_mullo_epi16(scalemat_r0_r1,
                            dequant_r0_r1);  // b00*q00 b01*q01 b02*q02 b03*q03 b10*q10 b11*q11
                                             // b12*q12 b13*q13 -- 16 bit result
    temp1 = _mm_mullo_epi16(scalemat_r2_r3,
                            dequant_r2_r3);  // b20*q20 b21*q21 b22*q22 b23*q23 b30*q30 b31*q31
                                             // b32*q32 b33*q33 -- 16 bit result

    temp4 =
        _mm_unpacklo_epi16(temp0,
                           zero_8x16b);  // b00*q00 0 b01*q01 0 b02*q02 0 b03*q03 0 -- 16 bit long
    temp5 =
        _mm_unpackhi_epi16(temp0,
                           zero_8x16b);  // b10*q10 0 b11*q11 0 b12*q12 0 b13*q13 0 -- 16 bit long
    temp6 =
        _mm_unpacklo_epi16(temp1,
                           zero_8x16b);  // b20*q20 0 b21*q21 0 b22*q22 0 b23*q23 0 -- 16 bit long
    temp7 =
        _mm_unpackhi_epi16(temp1,
                           zero_8x16b);  // b30*q30 0 b31*q31 0 b32*q32 0 b33*q33 0 -- 16 bit long

    src_r0 = _mm_unpacklo_epi16(src_r0_r1, zero_8x16b);  // a00 0 a01 0 a02 0 a03 0 -- 16 bit long
    src_r1 = _mm_unpackhi_epi16(src_r0_r1, zero_8x16b);  // a10 0 a11 0 a12 0 a13 0 -- 16 bit long
    src_r2 = _mm_unpacklo_epi16(src_r2_r3, zero_8x16b);  // a20 0 a21 0 a22 0 a23 0 -- 16 bit long
    src_r3 = _mm_unpackhi_epi16(src_r2_r3, zero_8x16b);  // a30 0 a31 0 a32 0 a33 0 -- 16 bit long

    temp4 = _mm_madd_epi16(src_r0, temp4);  // a00*b00*q00 a01*b01*q01 a02*b02*q02
                                            // a03*b03*q03 -- 32 bits long
    temp5 = _mm_madd_epi16(src_r1, temp5);
    temp6 = _mm_madd_epi16(src_r2, temp6);
    temp7 = _mm_madd_epi16(src_r3, temp7);

    if(u4_qp_div_6 >= 4)
    {
        resq_r0 = _mm_slli_epi32(temp4, u4_qp_div_6 - 4);
        resq_r1 = _mm_slli_epi32(temp5, u4_qp_div_6 - 4);
        resq_r2 = _mm_slli_epi32(temp6, u4_qp_div_6 - 4);
        resq_r3 = _mm_slli_epi32(temp7, u4_qp_div_6 - 4);
    }
    else
    {
        temp4 = _mm_add_epi32(temp4, add_rshift);
        temp5 = _mm_add_epi32(temp5, add_rshift);
        temp6 = _mm_add_epi32(temp6, add_rshift);
        temp7 = _mm_add_epi32(temp7, add_rshift);
        resq_r0 = _mm_srai_epi32(temp4, 4 - u4_qp_div_6);
        resq_r1 = _mm_srai_epi32(temp5, 4 - u4_qp_div_6);
        resq_r2 = _mm_srai_epi32(temp6, 4 - u4_qp_div_6);
        resq_r3 = _mm_srai_epi32(temp7, 4 - u4_qp_div_6);
    }

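    /* Note (added for clarity): for intra blocks the DC coefficient bypasses
     * the inverse quantization above; pi2_src[0] is written into the first
     * 32-bit lane as two 16-bit halves -- the value, then its sign extension. */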
    if(i4_iq_start_idx == 1)
    {
        resq_r0 = _mm_insert_epi16(resq_r0, (WORD32) pi2_src[0], 0);
        if(pi2_src[0] >= 0)
            resq_r0 = _mm_insert_epi16(resq_r0, 0, 1);
        else
            resq_r0 = _mm_insert_epi16(resq_r0, -1, 1);
    }
    /* Perform Inverse transform */
    /*-------------------------------------------------------------*/
    /* IDCT [ Horizontal transformation ]                          */
    /*-------------------------------------------------------------*/
    // Matrix transpose
    /*
     *  a0 a1 a2 a3
     *  b0 b1 b2 b3
     *  c0 c1 c2 c3
     *  d0 d1 d2 d3
     */
    temp1 = _mm_unpacklo_epi32(resq_r0, resq_r1);  // a0 b0 a1 b1
    temp3 = _mm_unpacklo_epi32(resq_r2, resq_r3);  // c0 d0 c1 d1
    temp2 = _mm_unpackhi_epi32(resq_r0, resq_r1);  // a2 b2 a3 b3
    temp4 = _mm_unpackhi_epi32(resq_r2, resq_r3);  // c2 d2 c3 d3
    resq_r0 = _mm_unpacklo_epi64(temp1, temp3);    // a0 b0 c0 d0
    resq_r1 = _mm_unpackhi_epi64(temp1, temp3);    // a1 b1 c1 d1
    resq_r2 = _mm_unpacklo_epi64(temp2, temp4);    // a2 b2 c2 d2
    resq_r3 = _mm_unpackhi_epi64(temp2, temp4);    // a3 b3 c3 d3
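    /* The row pass below applies the H.264 4x4 inverse core transform, whose
     * (unnormalized) matrix form is, as a sketch:
     *
     *     | 1    1     1    1/2 |
     *     | 1    1/2  -1   -1   |
     *     | 1   -1/2  -1    1   |
     *     | 1   -1     1   -1/2 |
     */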
    // Transform starts -- horizontal transform
    /*------------------------------------------------------------------*/
    /* z0 = w0 + w2 */
    temp0 = _mm_add_epi32(resq_r0, resq_r2);
    /* z1 = w0 - w2 */
    temp1 = _mm_sub_epi32(resq_r0, resq_r2);
    /* z2 = (w1 >> 1) - w3 */
    temp2 = _mm_srai_epi32(resq_r1, 1);     //(w1>>1)
    temp2 = _mm_sub_epi32(temp2, resq_r3);  //(w1>>1) - w3
    /* z3 = w1 + (w3 >> 1) */
    temp3 = _mm_srai_epi32(resq_r3, 1);     //(w3>>1)
    temp3 = _mm_add_epi32(temp3, resq_r1);  // w1 + (w3>>1)
    /*----------------------------------------------------------*/
    /* x0 = z0 + z3 */
    resq_r0 = _mm_add_epi32(temp0, temp3);
    /* x1 = z1 + z2 */
    resq_r1 = _mm_add_epi32(temp1, temp2);
    /* x2 = z1 - z2 */
    resq_r2 = _mm_sub_epi32(temp1, temp2);
    /* x3 = z0 - z3 */
    resq_r3 = _mm_sub_epi32(temp0, temp3);
    // Matrix transpose
    /*
     *  a0 b0 c0 d0
     *  a1 b1 c1 d1
     *  a2 b2 c2 d2
     *  a3 b3 c3 d3
     */
    temp1 = _mm_unpacklo_epi32(resq_r0, resq_r1);  // a0 a1 b0 b1
    temp3 = _mm_unpacklo_epi32(resq_r2, resq_r3);  // a2 a3 b2 b3
    temp2 = _mm_unpackhi_epi32(resq_r0, resq_r1);  // c0 c1 d0 d1
    temp4 = _mm_unpackhi_epi32(resq_r2, resq_r3);  // c2 c3 d2 d3
    resq_r0 = _mm_unpacklo_epi64(temp1, temp3);    // a0 a1 a2 a3
    resq_r1 = _mm_unpackhi_epi64(temp1, temp3);    // b0 b1 b2 b3
    resq_r2 = _mm_unpacklo_epi64(temp2, temp4);    // c0 c1 c2 c3
    resq_r3 = _mm_unpackhi_epi64(temp2, temp4);    // d0 d1 d2 d3
    // Transform ends -- horizontal transform

    zero_8x16b = _mm_setzero_si128();  // all bits reset to zero
    // Load pred buffer
    predload_r = _mm_loadl_epi64((__m128i *) (&pu1_pred[0]));  // p00 p01 p02 p03 0 0 0 0 0
                                                               // 0 0 0 -- all 8 bits
    pred_r0 = _mm_unpacklo_epi8(predload_r, zero_8x16b);  // p00 p01 p02 p03 0 0 0 0 -- all 16 bits

    predload_r =
        _mm_loadl_epi64((__m128i *) (&pu1_pred[i4_pred_stride]));  // p10 p11 p12 p13 0 0 0 0 0 0
                                                                   // 0 0 -- all 8 bits
    pred_r1 = _mm_unpacklo_epi8(predload_r, zero_8x16b);  // p10 p11 p12 p13 0 0 0 0 -- all 16 bits

    predload_r =
        _mm_loadl_epi64((__m128i *) (&pu1_pred[2 * i4_pred_stride]));  // p20 p21 p22 p23 0 0 0 0
                                                                       // 0 0 0 0 -- all 8 bits
    pred_r2 = _mm_unpacklo_epi8(predload_r, zero_8x16b);  // p20 p21 p22 p23 0 0 0 0 -- all 16 bits

    predload_r =
        _mm_loadl_epi64((__m128i *) (&pu1_pred[3 * i4_pred_stride]));  // p30 p31 p32 p33 0 0 0 0
                                                                       // 0 0 0 0 -- all 8 bits
    pred_r3 = _mm_unpacklo_epi8(predload_r, zero_8x16b);  // p30 p31 p32 p33 0 0 0 0 -- all 16 bits
    pred_r0 = _mm_unpacklo_epi16(pred_r0, zero_8x16b);  // p00 p01 p02 p03 -- 32 bits zero extended
    pred_r1 = _mm_unpacklo_epi16(pred_r1, zero_8x16b);  // p10 p11 p12 p13 -- 32 bits zero extended
    pred_r2 = _mm_unpacklo_epi16(pred_r2, zero_8x16b);  // p20 p21 p22 p23 -- 32 bits zero extended
    pred_r3 = _mm_unpacklo_epi16(pred_r3, zero_8x16b);  // p30 p31 p32 p33 -- 32 bits zero extended

    /*--------------------------------------------------------------*/
    /* IDCT [ Vertical transformation] and Xij = (xij + 32)>>6      */
    /*                                                              */
    /* Add the prediction and store it back to same buffer          */
    /*--------------------------------------------------------------*/
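    /* Per-sample scalar equivalent of the rounding and reconstruction below
     * (sketch; CLIP_U8 denotes clamping to [0, 255]):
     *
     *     out[i][j] = CLIP_U8(((x[i][j] + 32) >> 6) + pred[i][j]);
     */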
    /* z0j = y0j + y2j */
    temp0 = _mm_add_epi32(resq_r0, resq_r2);
    /* z1j = y0j - y2j */
    temp1 = _mm_sub_epi32(resq_r0, resq_r2);
    /* z2j = (y1j>>1) - y3j */
    temp2 = _mm_srai_epi32(resq_r1, 1);  //(y1j>>1)
    temp2 = _mm_sub_epi32(temp2, resq_r3);
    /* z3j = y1j + (y3j>>1) */
    temp3 = _mm_srai_epi32(resq_r3, 1);  //(y3j>>1)
    temp3 = _mm_add_epi32(temp3, resq_r1);

    /* x0j = z0j + z3j */
    temp4 = _mm_add_epi32(temp0, temp3);
    temp4 = _mm_add_epi32(temp4, value_32);
    temp4 = _mm_srai_epi32(temp4, 6);
    temp4 = _mm_add_epi32(temp4, pred_r0);
    /* x1j = z1j + z2j */
    temp5 = _mm_add_epi32(temp1, temp2);
    temp5 = _mm_add_epi32(temp5, value_32);
    temp5 = _mm_srai_epi32(temp5, 6);
    temp5 = _mm_add_epi32(temp5, pred_r1);
    /* x2j = z1j - z2j */
    temp6 = _mm_sub_epi32(temp1, temp2);
    temp6 = _mm_add_epi32(temp6, value_32);
    temp6 = _mm_srai_epi32(temp6, 6);
    temp6 = _mm_add_epi32(temp6, pred_r2);
    /* x3j = z0j - z3j */
    temp7 = _mm_sub_epi32(temp0, temp3);
    temp7 = _mm_add_epi32(temp7, value_32);
    temp7 = _mm_srai_epi32(temp7, 6);
    temp7 = _mm_add_epi32(temp7, pred_r3);

    // 32-bit to 16-bit conversion
    temp0 = _mm_packs_epi32(temp4, temp5);
    temp1 = _mm_packs_epi32(temp6, temp7);
    /*------------------------------------------------------------------*/
    // Clipping the results to 8 bits
    sign_reg = _mm_cmpgt_epi16(temp0, zero_8x16b);  // sign check
    temp0 = _mm_and_si128(temp0, sign_reg);
    sign_reg = _mm_cmpgt_epi16(temp1, zero_8x16b);
    temp1 = _mm_and_si128(temp1, sign_reg);

    resq_r0 = _mm_packus_epi16(temp0, temp1);
    resq_r1 = _mm_srli_si128(resq_r0, 4);
    resq_r2 = _mm_srli_si128(resq_r1, 4);
    resq_r3 = _mm_srli_si128(resq_r2, 4);

    *pu4_out = _mm_cvtsi128_si32(resq_r0);
    pu1_out += i4_out_stride;
    pu4_out = (UWORD32 *) (pu1_out);
    *(pu4_out) = _mm_cvtsi128_si32(resq_r1);
    pu1_out += i4_out_stride;
    pu4_out = (UWORD32 *) (pu1_out);
    *(pu4_out) = _mm_cvtsi128_si32(resq_r2);
    pu1_out += i4_out_stride;
    pu4_out = (UWORD32 *) (pu1_out);
    *(pu4_out) = _mm_cvtsi128_si32(resq_r3);
}

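/* Illustrative call sketch (variable names are hypothetical, not from this
 * file; the field names match the accesses above):
 *
 *     buffer_container_t s_src, s_pred, s_res_pred, s_res, s_rec;
 *     iq_it_res_rec_constants_t s_consts;
 *
 *     s_src.pv_data = pi2_quant_coeffs;
 *     s_src.i4_data_stride = 4;
 *     s_pred.pv_data = pu1_pred_4x4;
 *     s_pred.i4_data_stride = i4_pred_strd;
 *     s_rec.pv_data = pu1_recon_4x4;
 *     s_rec.i4_data_stride = i4_rec_strd;
 *     s_consts.pu2_iscal_mat = pu2_iscale_mat;
 *     s_consts.pu2_weigh_mat = pu2_weight_mat;
 *     s_consts.u4_qp_div_6 = u4_qp / 6;
 *
 *     isvc_iquant_itrans_recon_4x4_ssse3(&s_src, &s_pred, &s_res_pred, &s_res,
 *                                        &s_rec, &s_consts, pi2_tmp, pi2_dc,
 *                                        0, 0);
 */
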
/**
 *******************************************************************************
 *
 * @brief
 *  This function performs inverse quant and inverse transform type Ci8 for an
 *  8x8 block
 *
 * @par Description:
 *  Performs the inverse transform Ci8 and adds the residue to get the
 *  reconstructed block
 *
 * @param[in] ps_src
 *  Buffer container holding the input 8x8 coefficients
 *
 * @param[in] ps_pred
 *  Buffer container holding the 8x8 prediction block
 *
 * @param[in] ps_res_pred
 *  Buffer container for the residue prediction (unused in this variant)
 *
 * @param[in] ps_res
 *  Buffer container for the output residue (unused in this variant)
 *
 * @param[out] ps_rec
 *  Buffer container for the reconstructed 8x8 block
 *
 * @param[in] ps_iq_it_res_rec_constants
 *  Pointer to the inverse scaling matrix, the dequant weight matrix and
 *  Floor(qp / 6)
 *
 * @param[in] pi2_tmp
 *  Temporary buffer of size 1*64 (unused in this variant)
 *
 * @param[in] pi2_dc_src
 *  Pointer to the DC coefficient source (unused in this variant)
 *
 * @param[in] i4_iq_start_idx
 *  Start index for inverse quantization (unused in this variant)
 *
 * @param[in] u1_res_accumulate
 *  Flag to enable residue accumulation (unused in this variant)
 *
 * @returns Void
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */

void isvc_iquant_itrans_recon_8x8_ssse3(buffer_container_t *ps_src, buffer_container_t *ps_pred,
                                        buffer_container_t *ps_res_pred, buffer_container_t *ps_res,
                                        buffer_container_t *ps_rec,
                                        iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants,
                                        WORD16 *pi2_tmp, WORD16 *pi2_dc_src, WORD32 i4_iq_start_idx,
                                        UWORD8 u1_res_accumulate)
{
    WORD16 *pi2_src = ps_src->pv_data;
    WORD16 *pi2_res = ps_res->pv_data;
    WORD16 *pi2_res_pred = ps_res_pred->pv_data;
    UWORD8 *pu1_pred = ps_pred->pv_data;
    UWORD8 *pu1_out = ps_rec->pv_data;
    WORD32 i4_src_stride = ps_src->i4_data_stride;
    WORD32 i4_res_stride = ps_res->i4_data_stride;
    WORD32 i4_res_pred_stride = ps_res_pred->i4_data_stride;
    WORD32 i4_pred_stride = ps_pred->i4_data_stride;
    WORD32 i4_out_stride = ps_rec->i4_data_stride;
    const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat;
    const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat;
    UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6;
    __m128i src_r0;
    __m128i scalemat_r0;
    __m128i zero_8x16b = _mm_setzero_si128();  // all bits reset to zero
    __m128i value_32 = _mm_set1_epi32(32);
    __m128i add_rshift = _mm_set1_epi32((u4_qp_div_6 < 6) ? (1 << (5 - u4_qp_div_6)) : 0);
    __m128i dequant_r0;
    __m128i predload_r;
    __m128i pred_r0_1, pred_r1_1, pred_r2_1, pred_r3_1, pred_r4_1, pred_r5_1, pred_r6_1, pred_r7_1;
    __m128i sign_reg;
    __m128i src_r0_1, src_r0_2;
    __m128i scalemat_r0_1, scalemat_r0_2;
    __m128i temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
    __m128i temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17, temp18, temp19, temp20;
    // To store dequantization results
    __m128i resq_r0_1, resq_r0_2, resq_r1_1, resq_r1_2, resq_r2_1, resq_r2_2, resq_r3_1, resq_r3_2,
        resq_r4_1, resq_r4_2, resq_r5_1, resq_r5_2, resq_r6_1, resq_r6_2, resq_r7_1, resq_r7_2;

    UNUSED(pi2_tmp);
    UNUSED(i4_iq_start_idx);
    UNUSED(pi2_dc_src);
    UNUSED(u1_res_accumulate);
    UNUSED(i4_src_stride);
    UNUSED(i4_res_stride);
    UNUSED(i4_res_pred_stride);
    UNUSED(pi2_res);
    UNUSED(pi2_res_pred);

    /* TODO: Implement residue accumulation */
    ASSERT(0);

    /*************************************************************/
    /* Dequantization of coefficients. Will be replaced by SIMD */
    /* operations on platform. Note : DC coeff is not scaled     */
    /*************************************************************/
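    /* Scalar reference for the per-row SIMD dequant below -- an illustrative
     * sketch mirroring the vector arithmetic, not the production code path:
     *
     *     WORD32 i4_w = pi2_src[i] * pu2_iscal_mat[i] * pu2_weigh_mat[i];
     *
     *     if(u4_qp_div_6 >= 6)
     *         i4_w <<= (u4_qp_div_6 - 6);
     *     else
     *         i4_w = (i4_w + (1 << (5 - u4_qp_div_6))) >> (6 - u4_qp_div_6);
     */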

    // Row 0 processing
    src_r0 = _mm_loadu_si128((__m128i *) (pi2_src));  // a00 a01 a02 a03 a04 a05 a06 a07 -- the
                                                      // source matrix 0th row
    scalemat_r0 = _mm_loadu_si128((__m128i *) (pu2_iscal_mat));  // b00 b01 b02 b03 b04 b05 b06 b07
                                                                 // -- the scaling matrix 0th row
    dequant_r0 = _mm_loadu_si128((__m128i *) (&pu2_weigh_mat[0]));  // q0 q1 q2 q3 q4 q5 q6
                                                                    // q7 -- all 16 bits
    src_r0_1 = _mm_unpacklo_epi16(src_r0, zero_8x16b);  // a00 0 a01 0 a02 0 a03 0 -- 16 bit long
    src_r0_2 = _mm_unpackhi_epi16(src_r0, zero_8x16b);  // a04 0 a05 0 a06 0 a07 0 -- 16 bit long
    temp10 = _mm_mullo_epi16(scalemat_r0,
                             dequant_r0);  // b00*q0 b01*q1 b02*q2 b03*q3 b04*q4
                                           // b05*q5 b06*q6 b07*q7 -- 16 bit result
    scalemat_r0_1 =
        _mm_unpacklo_epi16(temp10,
                           zero_8x16b);  // b00*q0 0 b01*q1 0 b02*q2 0 b03*q3 0 -- 16 bit long
    scalemat_r0_2 =
        _mm_unpackhi_epi16(temp10,
                           zero_8x16b);  // b04*q4 0 b05*q5 0 b06*q6 0 b07*q7 0 -- 16 bit long

    temp5 = _mm_madd_epi16(src_r0_1,
                           scalemat_r0_1);  // a00*b00*q0 a01*b01*q1 a02*b02*q2
                                            // a03*b03*q3 -- 32 bits long
    temp7 = _mm_madd_epi16(src_r0_2,
                           scalemat_r0_2);  // a04*b04*q4 a05*b05*q5 a06*b06*q6
                                            // a07*b07*q7 -- 32 bits long

    if(u4_qp_div_6 >= 6)
    {
        resq_r0_1 = _mm_slli_epi32(temp5, u4_qp_div_6 - 6);
        resq_r0_2 = _mm_slli_epi32(temp7, u4_qp_div_6 - 6);
    }
    else
    {
        temp5 = _mm_add_epi32(temp5, add_rshift);
        temp7 = _mm_add_epi32(temp7, add_rshift);
        resq_r0_1 = _mm_srai_epi32(temp5, 6 - u4_qp_div_6);
        resq_r0_2 = _mm_srai_epi32(temp7, 6 - u4_qp_div_6);
    }
    resq_r0_1 =
        _mm_packs_epi32(resq_r0_1,
                        resq_r0_2);  // a00*b00*q0 a01*b01*q1 a02*b02*q2 a03*b03*q3 a04*b04*q4
                                     // a05*b05*q5 a06*b06*q6 a07*b07*q7 -- 16 bit long
    // Row 1 processing
    src_r0 = _mm_loadu_si128((__m128i *) (pi2_src + 8));  // a00 a01 a02 a03 a04 a05 a06 a07 --
                                                          // the source matrix 1st row
    scalemat_r0 =
        _mm_loadu_si128((__m128i *) (pu2_iscal_mat + 8));  // b00 b01 b02 b03 b04 b05 b06 b07
                                                           // -- the scaling matrix 1st row
    dequant_r0 = _mm_loadu_si128((__m128i *) (&pu2_weigh_mat[8]));  // q0 q1 q2 q3 q4 q5 q6
                                                                    // q7 -- all 16 bits
    src_r0_1 = _mm_unpacklo_epi16(src_r0, zero_8x16b);  // a00 0 a01 0 a02 0 a03 0 -- 16 bit long
    src_r0_2 = _mm_unpackhi_epi16(src_r0, zero_8x16b);  // a04 0 a05 0 a06 0 a07 0 -- 16 bit long
    temp10 = _mm_mullo_epi16(scalemat_r0,
                             dequant_r0);  // b00*q0 b01*q1 b02*q2 b03*q3 b04*q4
                                           // b05*q5 b06*q6 b07*q7 -- 16 bit result
    scalemat_r0_1 =
        _mm_unpacklo_epi16(temp10,
                           zero_8x16b);  // b00*q0 0 b01*q1 0 b02*q2 0 b03*q3 0 -- 16 bit long
    scalemat_r0_2 =
        _mm_unpackhi_epi16(temp10,
                           zero_8x16b);  // b04*q4 0 b05*q5 0 b06*q6 0 b07*q7 0 -- 16 bit long
    temp5 = _mm_madd_epi16(src_r0_1,
                           scalemat_r0_1);  // a00*b00*q0 a01*b01*q1 a02*b02*q2
                                            // a03*b03*q3 -- 32 bits long
    temp7 = _mm_madd_epi16(src_r0_2,
                           scalemat_r0_2);  // a04*b04*q4 a05*b05*q5 a06*b06*q6
                                            // a07*b07*q7 -- 32 bits long
    if(u4_qp_div_6 >= 6)
    {
        resq_r1_1 = _mm_slli_epi32(temp5, u4_qp_div_6 - 6);
        resq_r1_2 = _mm_slli_epi32(temp7, u4_qp_div_6 - 6);
    }
    else
    {
        temp5 = _mm_add_epi32(temp5, add_rshift);
        temp7 = _mm_add_epi32(temp7, add_rshift);
        resq_r1_1 = _mm_srai_epi32(temp5, 6 - u4_qp_div_6);
        resq_r1_2 = _mm_srai_epi32(temp7, 6 - u4_qp_div_6);
    }
    resq_r1_1 =
        _mm_packs_epi32(resq_r1_1,
                        resq_r1_2);  // a00*b00*q0 a01*b01*q1 a02*b02*q2 a03*b03*q3 a04*b04*q4
                                     // a05*b05*q5 a06*b06*q6 a07*b07*q7 -- 16 bit long
    // Row 2 processing
    src_r0 = _mm_loadu_si128((__m128i *) (pi2_src + 16));  // a00 a01 a02 a03 a04 a05 a06 a07 --
                                                           // the source matrix 2nd row
    scalemat_r0 =
        _mm_loadu_si128((__m128i *) (pu2_iscal_mat + 16));  // b00 b01 b02 b03 b04 b05 b06 b07
                                                            // -- the scaling matrix 2nd row
    dequant_r0 = _mm_loadu_si128((__m128i *) (&pu2_weigh_mat[16]));  // q0 q1 q2 q3 q4 q5
                                                                     // q6 q7 -- all 16 bits
    src_r0_1 = _mm_unpacklo_epi16(src_r0, zero_8x16b);  // a00 0 a01 0 a02 0 a03 0 -- 16 bit long
    src_r0_2 = _mm_unpackhi_epi16(src_r0, zero_8x16b);  // a04 0 a05 0 a06 0 a07 0 -- 16 bit long
    temp10 = _mm_mullo_epi16(scalemat_r0,
                             dequant_r0);  // b00*q0 b01*q1 b02*q2 b03*q3 b04*q4
                                           // b05*q5 b06*q6 b07*q7 -- 16 bit result
    scalemat_r0_1 =
        _mm_unpacklo_epi16(temp10,
                           zero_8x16b);  // b00*q0 0 b01*q1 0 b02*q2 0 b03*q3 0 -- 16 bit long
    scalemat_r0_2 =
        _mm_unpackhi_epi16(temp10,
                           zero_8x16b);  // b04*q4 0 b05*q5 0 b06*q6 0 b07*q7 0 -- 16 bit long
    temp5 = _mm_madd_epi16(src_r0_1,
                           scalemat_r0_1);  // a00*b00*q0 a01*b01*q1 a02*b02*q2
                                            // a03*b03*q3 -- 32 bits long
    temp7 = _mm_madd_epi16(src_r0_2,
                           scalemat_r0_2);  // a04*b04*q4 a05*b05*q5 a06*b06*q6
                                            // a07*b07*q7 -- 32 bits long
    if(u4_qp_div_6 >= 6)
    {
        resq_r2_1 = _mm_slli_epi32(temp5, u4_qp_div_6 - 6);
        resq_r2_2 = _mm_slli_epi32(temp7, u4_qp_div_6 - 6);
    }
    else
    {
        temp5 = _mm_add_epi32(temp5, add_rshift);
        temp7 = _mm_add_epi32(temp7, add_rshift);
        resq_r2_1 = _mm_srai_epi32(temp5, 6 - u4_qp_div_6);
        resq_r2_2 = _mm_srai_epi32(temp7, 6 - u4_qp_div_6);
    }
    resq_r2_1 =
        _mm_packs_epi32(resq_r2_1,
                        resq_r2_2);  // a00*b00*q0 a01*b01*q1 a02*b02*q2 a03*b03*q3 a04*b04*q4
                                     // a05*b05*q5 a06*b06*q6 a07*b07*q7 -- 16 bit long
    // Row 3 processing
    src_r0 = _mm_loadu_si128((__m128i *) (pi2_src + 24));  // a00 a01 a02 a03 a04 a05 a06 a07 --
                                                           // the source matrix 3rd row
    scalemat_r0 =
        _mm_loadu_si128((__m128i *) (pu2_iscal_mat + 24));  // b00 b01 b02 b03 b04 b05 b06 b07
                                                            // -- the scaling matrix 3rd row
    dequant_r0 = _mm_loadu_si128((__m128i *) (&pu2_weigh_mat[24]));  // q0 q1 q2 q3 q4 q5
                                                                     // q6 q7 -- all 16 bits
    src_r0_1 = _mm_unpacklo_epi16(src_r0, zero_8x16b);  // a00 0 a01 0 a02 0 a03 0 -- 16 bit long
    src_r0_2 = _mm_unpackhi_epi16(src_r0, zero_8x16b);  // a04 0 a05 0 a06 0 a07 0 -- 16 bit long
    temp10 = _mm_mullo_epi16(scalemat_r0,
                             dequant_r0);  // b00*q0 b01*q1 b02*q2 b03*q3 b04*q4
                                           // b05*q5 b06*q6 b07*q7 -- 16 bit result
    scalemat_r0_1 =
        _mm_unpacklo_epi16(temp10,
                           zero_8x16b);  // b00*q0 0 b01*q1 0 b02*q2 0 b03*q3 0 -- 16 bit long
    scalemat_r0_2 =
        _mm_unpackhi_epi16(temp10,
                           zero_8x16b);  // b04*q4 0 b05*q5 0 b06*q6 0 b07*q7 0 -- 16 bit long
    temp5 = _mm_madd_epi16(src_r0_1,
                           scalemat_r0_1);  // a00*b00*q0 a01*b01*q1 a02*b02*q2
                                            // a03*b03*q3 -- 32 bits long
    temp7 = _mm_madd_epi16(src_r0_2,
                           scalemat_r0_2);  // a04*b04*q4 a05*b05*q5 a06*b06*q6
                                            // a07*b07*q7 -- 32 bits long
    if(u4_qp_div_6 >= 6)
    {
        resq_r3_1 = _mm_slli_epi32(temp5, u4_qp_div_6 - 6);
        resq_r3_2 = _mm_slli_epi32(temp7, u4_qp_div_6 - 6);
    }
    else
    {
        temp5 = _mm_add_epi32(temp5, add_rshift);
        temp7 = _mm_add_epi32(temp7, add_rshift);
        resq_r3_1 = _mm_srai_epi32(temp5, 6 - u4_qp_div_6);
        resq_r3_2 = _mm_srai_epi32(temp7, 6 - u4_qp_div_6);
    }
    resq_r3_1 =
        _mm_packs_epi32(resq_r3_1,
                        resq_r3_2);  // a00*b00*q0 a01*b01*q1 a02*b02*q2 a03*b03*q3 a04*b04*q4
                                     // a05*b05*q5 a06*b06*q6 a07*b07*q7 -- 16 bit long
    // Row 4 processing
    src_r0 = _mm_loadu_si128((__m128i *) (pi2_src + 32));  // a00 a01 a02 a03 a04 a05 a06 a07 --
                                                           // the source matrix 4th row
    scalemat_r0 =
        _mm_loadu_si128((__m128i *) (pu2_iscal_mat + 32));  // b00 b01 b02 b03 b04 b05 b06 b07
                                                            // -- the scaling matrix 4th row
    dequant_r0 = _mm_loadu_si128((__m128i *) (&pu2_weigh_mat[32]));  // q0 q1 q2 q3 q4 q5
                                                                     // q6 q7 -- all 16 bits
    src_r0_1 = _mm_unpacklo_epi16(src_r0, zero_8x16b);  // a00 0 a01 0 a02 0 a03 0 -- 16 bit long
    src_r0_2 = _mm_unpackhi_epi16(src_r0, zero_8x16b);  // a04 0 a05 0 a06 0 a07 0 -- 16 bit long
    temp10 = _mm_mullo_epi16(scalemat_r0,
                             dequant_r0);  // b00*q0 b01*q1 b02*q2 b03*q3 b04*q4
                                           // b05*q5 b06*q6 b07*q7 -- 16 bit result
    scalemat_r0_1 =
        _mm_unpacklo_epi16(temp10,
                           zero_8x16b);  // b00*q0 0 b01*q1 0 b02*q2 0 b03*q3 0 -- 16 bit long
    scalemat_r0_2 =
        _mm_unpackhi_epi16(temp10,
                           zero_8x16b);  // b04*q4 0 b05*q5 0 b06*q6 0 b07*q7 0 -- 16 bit long
    temp5 = _mm_madd_epi16(src_r0_1,
                           scalemat_r0_1);  // a00*b00*q0 a01*b01*q1 a02*b02*q2
                                            // a03*b03*q3 -- 32 bits long
    temp7 = _mm_madd_epi16(src_r0_2,
                           scalemat_r0_2);  // a04*b04*q4 a05*b05*q5 a06*b06*q6
                                            // a07*b07*q7 -- 32 bits long
    if(u4_qp_div_6 >= 6)
    {
        resq_r4_1 = _mm_slli_epi32(temp5, u4_qp_div_6 - 6);
        resq_r4_2 = _mm_slli_epi32(temp7, u4_qp_div_6 - 6);
    }
    else
    {
        temp5 = _mm_add_epi32(temp5, add_rshift);
        temp7 = _mm_add_epi32(temp7, add_rshift);
        resq_r4_1 = _mm_srai_epi32(temp5, 6 - u4_qp_div_6);
        resq_r4_2 = _mm_srai_epi32(temp7, 6 - u4_qp_div_6);
    }
    resq_r4_1 =
        _mm_packs_epi32(resq_r4_1,
                        resq_r4_2);  // a00*b00*q0 a01*b01*q1 a02*b02*q2 a03*b03*q3 a04*b04*q4
                                     // a05*b05*q5 a06*b06*q6 a07*b07*q7 -- 16 bit long
    // Row 5 processing
    src_r0 = _mm_loadu_si128((__m128i *) (pi2_src + 40));  // a00 a01 a02 a03 a04 a05 a06 a07 --
                                                           // the source matrix 5th row
    scalemat_r0 =
        _mm_loadu_si128((__m128i *) (pu2_iscal_mat + 40));  // b00 b01 b02 b03 b04 b05 b06 b07
                                                            // -- the scaling matrix 5th row
    dequant_r0 = _mm_loadu_si128((__m128i *) (&pu2_weigh_mat[40]));  // q0 q1 q2 q3 q4 q5
                                                                     // q6 q7 -- all 16 bits
    src_r0_1 = _mm_unpacklo_epi16(src_r0, zero_8x16b);  // a00 0 a01 0 a02 0 a03 0 -- 16 bit long
    src_r0_2 = _mm_unpackhi_epi16(src_r0, zero_8x16b);  // a04 0 a05 0 a06 0 a07 0 -- 16 bit long
    temp10 = _mm_mullo_epi16(scalemat_r0,
                             dequant_r0);  // b00*q0 b01*q1 b02*q2 b03*q3 b04*q4
                                           // b05*q5 b06*q6 b07*q7 -- 16 bit result
    scalemat_r0_1 =
        _mm_unpacklo_epi16(temp10,
                           zero_8x16b);  // b00*q0 0 b01*q1 0 b02*q2 0 b03*q3 0 -- 16 bit long
    scalemat_r0_2 =
        _mm_unpackhi_epi16(temp10,
                           zero_8x16b);  // b04*q4 0 b05*q5 0 b06*q6 0 b07*q7 0 -- 16 bit long
    temp5 = _mm_madd_epi16(src_r0_1,
                           scalemat_r0_1);  // a00*b00*q0 a01*b01*q1 a02*b02*q2
                                            // a03*b03*q3 -- 32 bits long
    temp7 = _mm_madd_epi16(src_r0_2,
                           scalemat_r0_2);  // a04*b04*q4 a05*b05*q5 a06*b06*q6
                                            // a07*b07*q7 -- 32 bits long
    if(u4_qp_div_6 >= 6)
    {
        resq_r5_1 = _mm_slli_epi32(temp5, u4_qp_div_6 - 6);
        resq_r5_2 = _mm_slli_epi32(temp7, u4_qp_div_6 - 6);
    }
    else
    {
        temp5 = _mm_add_epi32(temp5, add_rshift);
        temp7 = _mm_add_epi32(temp7, add_rshift);
        resq_r5_1 = _mm_srai_epi32(temp5, 6 - u4_qp_div_6);
        resq_r5_2 = _mm_srai_epi32(temp7, 6 - u4_qp_div_6);
    }
    resq_r5_1 =
        _mm_packs_epi32(resq_r5_1,
                        resq_r5_2);  // a00*b00*q0 a01*b01*q1 a02*b02*q2 a03*b03*q3 a04*b04*q4
                                     // a05*b05*q5 a06*b06*q6 a07*b07*q7 -- 16 bit long
    // Row 6 processing
    src_r0 = _mm_loadu_si128((__m128i *) (pi2_src + 48));  // a00 a01 a02 a03 a04 a05 a06 a07 --
                                                           // the source matrix 6th row
    scalemat_r0 =
        _mm_loadu_si128((__m128i *) (pu2_iscal_mat + 48));  // b00 b01 b02 b03 b04 b05 b06 b07
                                                            // -- the scaling matrix 6th row
    dequant_r0 = _mm_loadu_si128((__m128i *) (&pu2_weigh_mat[48]));  // q0 q1 q2 q3 q4 q5
                                                                     // q6 q7 -- all 16 bits
    src_r0_1 = _mm_unpacklo_epi16(src_r0, zero_8x16b);  // a00 0 a01 0 a02 0 a03 0 -- 16 bit long
    src_r0_2 = _mm_unpackhi_epi16(src_r0, zero_8x16b);  // a04 0 a05 0 a06 0 a07 0 -- 16 bit long
    temp10 = _mm_mullo_epi16(scalemat_r0,
                             dequant_r0);  // b00*q0 b01*q1 b02*q2 b03*q3 b04*q4
                                           // b05*q5 b06*q6 b07*q7 -- 16 bit result
    scalemat_r0_1 =
        _mm_unpacklo_epi16(temp10,
                           zero_8x16b);  // b00*q0 0 b01*q1 0 b02*q2 0 b03*q3 0 -- 16 bit long
    scalemat_r0_2 =
        _mm_unpackhi_epi16(temp10,
                           zero_8x16b);  // b04*q4 0 b05*q5 0 b06*q6 0 b07*q7 0 -- 16 bit long
    temp5 = _mm_madd_epi16(src_r0_1,
                           scalemat_r0_1);  // a00*b00*q0 a01*b01*q1 a02*b02*q2
                                            // a03*b03*q3 -- 32 bits long
    temp7 = _mm_madd_epi16(src_r0_2,
                           scalemat_r0_2);  // a04*b04*q4 a05*b05*q5 a06*b06*q6
                                            // a07*b07*q7 -- 32 bits long
    if(u4_qp_div_6 >= 6)
    {
        resq_r6_1 = _mm_slli_epi32(temp5, u4_qp_div_6 - 6);
        resq_r6_2 = _mm_slli_epi32(temp7, u4_qp_div_6 - 6);
    }
    else
    {
        temp5 = _mm_add_epi32(temp5, add_rshift);
        temp7 = _mm_add_epi32(temp7, add_rshift);
        resq_r6_1 = _mm_srai_epi32(temp5, 6 - u4_qp_div_6);
        resq_r6_2 = _mm_srai_epi32(temp7, 6 - u4_qp_div_6);
    }
    resq_r6_1 =
        _mm_packs_epi32(resq_r6_1,
                        resq_r6_2);  // a00*b00*q0 a01*b01*q1 a02*b02*q2 a03*b03*q3 a04*b04*q4
                                     // a05*b05*q5 a06*b06*q6 a07*b07*q7 -- 16 bit long
    // Row 7 processing
    src_r0 = _mm_loadu_si128((__m128i *) (pi2_src + 56));  // a00 a01 a02 a03 a04 a05 a06 a07 --
                                                           // the source matrix 7th row
    scalemat_r0 =
        _mm_loadu_si128((__m128i *) (pu2_iscal_mat + 56));  // b00 b01 b02 b03 b04 b05 b06 b07
                                                            // -- the scaling matrix 7th row
    dequant_r0 = _mm_loadu_si128((__m128i *) (&pu2_weigh_mat[56]));  // q0 q1 q2 q3 q4 q5
                                                                     // q6 q7 -- all 16 bits
    src_r0_1 = _mm_unpacklo_epi16(src_r0, zero_8x16b);  // a00 0 a01 0 a02 0 a03 0 -- 16 bit long
    src_r0_2 = _mm_unpackhi_epi16(src_r0, zero_8x16b);  // a04 0 a05 0 a06 0 a07 0 -- 16 bit long
    temp10 = _mm_mullo_epi16(scalemat_r0,
                             dequant_r0);  // b00*q0 b01*q1 b02*q2 b03*q3 b04*q4
                                           // b05*q5 b06*q6 b07*q7 -- 16 bit result
    scalemat_r0_1 =
        _mm_unpacklo_epi16(temp10,
                           zero_8x16b);  // b00*q0 0 b01*q1 0 b02*q2 0 b03*q3 0 -- 16 bit long
    scalemat_r0_2 =
        _mm_unpackhi_epi16(temp10,
                           zero_8x16b);  // b04*q4 0 b05*q5 0 b06*q6 0 b07*q7 0 -- 16 bit long
    temp5 = _mm_madd_epi16(src_r0_1,
                           scalemat_r0_1);  // a00*b00*q0 a01*b01*q1 a02*b02*q2
                                            // a03*b03*q3 -- 32 bits long
    temp7 = _mm_madd_epi16(src_r0_2,
                           scalemat_r0_2);  // a04*b04*q4 a05*b05*q5 a06*b06*q6
                                            // a07*b07*q7 -- 32 bits long
    if(u4_qp_div_6 >= 6)
    {
        resq_r7_1 = _mm_slli_epi32(temp5, u4_qp_div_6 - 6);
        resq_r7_2 = _mm_slli_epi32(temp7, u4_qp_div_6 - 6);
    }
    else
    {
        temp5 = _mm_add_epi32(temp5, add_rshift);
        temp7 = _mm_add_epi32(temp7, add_rshift);
        resq_r7_1 = _mm_srai_epi32(temp5, 6 - u4_qp_div_6);
        resq_r7_2 = _mm_srai_epi32(temp7, 6 - u4_qp_div_6);
    }
    resq_r7_1 =
        _mm_packs_epi32(resq_r7_1,
                        resq_r7_2);  // a00*b00*q0 a01*b01*q1 a02*b02*q2 a03*b03*q3 a04*b04*q4
                                     // a05*b05*q5 a06*b06*q6 a07*b07*q7 -- 16 bit long
    /* Perform Inverse transform */
    /*--------------------------------------------------------------------*/
    /* IDCT [ Horizontal transformation ]                                  */
    /*--------------------------------------------------------------------*/
    // Matrix transpose
    /*
     *  a0 a1 a2 a3 a4 a5 a6 a7
     *  b0 b1 b2 b3 b4 b5 b6 b7
     *  c0 c1 c2 c3 c4 c5 c6 c7
     *  d0 d1 d2 d3 d4 d5 d6 d7
     */
    temp1 = _mm_unpacklo_epi16(resq_r0_1, resq_r1_1);  // a0 b0 a1 b1 a2 b2 a3 b3
    temp3 = _mm_unpacklo_epi16(resq_r2_1, resq_r3_1);  // c0 d0 c1 d1 c2 d2 c3 d3
    temp2 = _mm_unpackhi_epi16(resq_r0_1, resq_r1_1);  // a4 b4 a5 b5 a6 b6 a7 b7
    temp4 = _mm_unpackhi_epi16(resq_r2_1, resq_r3_1);  // c4 d4 c5 d5 c6 d6 c7 d7
    resq_r0_1 = _mm_unpacklo_epi32(temp1, temp3);      // a0 b0 c0 d0 a1 b1 c1 d1
    resq_r1_1 = _mm_unpackhi_epi32(temp1, temp3);      // a2 b2 c2 d2 a3 b3 c3 d3
    resq_r2_1 = _mm_unpacklo_epi32(temp2, temp4);      // a4 b4 c4 d4 a5 b5 c5 d5
    resq_r3_1 = _mm_unpackhi_epi32(temp2, temp4);      // a6 b6 c6 d6 a7 b7 c7 d7
    /*
     *  e0 e1 e2 e3 e4 e5 e6 e7
     *  f0 f1 f2 f3 f4 f5 f6 f7
     *  g0 g1 g2 g3 g4 g5 g6 g7
     *  h0 h1 h2 h3 h4 h5 h6 h7
     */
    temp1 = _mm_unpacklo_epi16(resq_r4_1, resq_r5_1);  // e0 f0 e1 f1 e2 f2 e3 f3
    temp3 = _mm_unpacklo_epi16(resq_r6_1, resq_r7_1);  // g0 h0 g1 h1 g2 h2 g3 h3
    temp2 = _mm_unpackhi_epi16(resq_r4_1, resq_r5_1);  // e4 f4 e5 f5 e6 f6 e7 f7
    temp4 = _mm_unpackhi_epi16(resq_r6_1, resq_r7_1);  // g4 h4 g5 h5 g6 h6 g7 h7
    resq_r4_1 = _mm_unpacklo_epi32(temp1, temp3);      // e0 f0 g0 h0 e1 f1 g1 h1
    resq_r5_1 = _mm_unpackhi_epi32(temp1, temp3);      // e2 f2 g2 h2 e3 f3 g3 h3
    resq_r6_1 = _mm_unpacklo_epi32(temp2, temp4);      // e4 f4 g4 h4 e5 f5 g5 h5
    resq_r7_1 = _mm_unpackhi_epi32(temp2, temp4);      // e6 f6 g6 h6 e7 f7 g7 h7
    /*
     *  a0 b0 c0 d0 a1 b1 c1 d1
     *  a2 b2 c2 d2 a3 b3 c3 d3
     *  a4 b4 c4 d4 a5 b5 c5 d5
     *  a6 b6 c6 d6 a7 b7 c7 d7
     *  e0 f0 g0 h0 e1 f1 g1 h1
     *  e2 f2 g2 h2 e3 f3 g3 h3
     *  e4 f4 g4 h4 e5 f5 g5 h5
     *  e6 f6 g6 h6 e7 f7 g7 h7
     */
    resq_r0_2 = _mm_unpacklo_epi64(resq_r0_1, resq_r4_1);  // a0 b0 c0 d0 e0 f0 g0 h0
    resq_r1_2 = _mm_unpackhi_epi64(resq_r0_1, resq_r4_1);  // a1 b1 c1 d1 e1 f1 g1 h1
    resq_r2_2 = _mm_unpacklo_epi64(resq_r1_1, resq_r5_1);  // a2 b2 c2 d2 e2 f2 g2 h2
    resq_r3_2 = _mm_unpackhi_epi64(resq_r1_1, resq_r5_1);  // a3 b3 c3 d3 e3 f3 g3 h3
    resq_r4_2 = _mm_unpacklo_epi64(resq_r2_1, resq_r6_1);  // a4 b4 c4 d4 e4 f4 g4 h4
    resq_r5_2 = _mm_unpackhi_epi64(resq_r2_1, resq_r6_1);  // a5 b5 c5 d5 e5 f5 g5 h5
    resq_r6_2 = _mm_unpacklo_epi64(resq_r3_1, resq_r7_1);  // a6 b6 c6 d6 e6 f6 g6 h6
    resq_r7_2 = _mm_unpackhi_epi64(resq_r3_1, resq_r7_1);  // a7 b7 c7 d7 e7 f7 g7 h7

    sign_reg = _mm_cmpgt_epi16(zero_8x16b, resq_r1_2);
    resq_r1_1 = _mm_unpacklo_epi16(resq_r1_2, sign_reg);  // a1 b1 c1 d1 -- 32 bit
    resq_r1_2 = _mm_unpackhi_epi16(resq_r1_2, sign_reg);  // e1 f1 g1 h1 -- 32 bit
    sign_reg = _mm_cmpgt_epi16(zero_8x16b, resq_r3_2);
    resq_r3_1 = _mm_unpacklo_epi16(resq_r3_2, sign_reg);  // a3 b3 c3 d3 -- 32 bit
    resq_r3_2 = _mm_unpackhi_epi16(resq_r3_2, sign_reg);  // e3 f3 g3 h3 -- 32 bit
    sign_reg = _mm_cmpgt_epi16(zero_8x16b, resq_r5_2);
    resq_r5_1 = _mm_unpacklo_epi16(resq_r5_2, sign_reg);  // a5 b5 c5 d5 -- 32 bit
    resq_r5_2 = _mm_unpackhi_epi16(resq_r5_2, sign_reg);  // e5 f5 g5 h5 -- 32 bit
    sign_reg = _mm_cmpgt_epi16(zero_8x16b, resq_r7_2);
    resq_r7_1 = _mm_unpacklo_epi16(resq_r7_2, sign_reg);  // a7 b7 c7 d7 -- 32 bit
    resq_r7_2 = _mm_unpackhi_epi16(resq_r7_2, sign_reg);  // e7 f7 g7 h7 -- 32 bit
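    /* The three-stage butterfly below matches the H.264 high-profile 8x8
     * inverse transform: y-terms from the input row w, z-terms from y, and
     * the outputs as sums/differences of z pairs (sketch):
     *
     *     x0 = z0 + z7, x1 = z2 + z5, x2 = z4 + z3, x3 = z6 + z1,
     *     x4 = z6 - z1, x5 = z4 - z3, x6 = z2 - z5, x7 = z0 - z7
     */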
    // Transform starts -- horizontal transform
    /*------------------------------------------------------------------*/
    /* y0 = w0 + w4 */
    temp1 = _mm_add_epi16(resq_r0_2, resq_r4_2);
    /* y2 = w0 - w4 */
    temp3 = _mm_sub_epi16(resq_r0_2, resq_r4_2);
    /* y1 = -w3 + w5 - w7 - (w7 >> 1) */
    temp2 = _mm_sub_epi32(resq_r5_1, resq_r3_1);  //-w3+w5
    temp10 = _mm_sub_epi32(resq_r5_2, resq_r3_2);
    temp4 = _mm_sub_epi32(temp2, resq_r7_1);  //-w3+w5-w7
    temp12 = _mm_sub_epi32(temp10, resq_r7_2);
    temp5 = _mm_srai_epi32(resq_r7_1, 1);  // w7>>1
    temp13 = _mm_srai_epi32(resq_r7_2, 1);
    temp2 = _mm_sub_epi32(temp4, temp5);  //-w3+w5-w7-(w7>>1)
    temp10 = _mm_sub_epi32(temp12, temp13);
    temp2 = _mm_packs_epi32(temp2, temp10);
    /* y3 = w1 + w7 - w3 - (w3 >> 1) */
    temp4 = _mm_add_epi32(resq_r1_1, resq_r7_1);  // w1+w7
    temp12 = _mm_add_epi32(resq_r1_2, resq_r7_2);
    temp4 = _mm_sub_epi32(temp4, resq_r3_1);  // w1+w7-w3
    temp12 = _mm_sub_epi32(temp12, resq_r3_2);
    temp5 = _mm_srai_epi32(resq_r3_1, 1);  // w3>>1
    temp13 = _mm_srai_epi32(resq_r3_2, 1);
    temp4 = _mm_sub_epi32(temp4, temp5);  // w1+w7-w3-(w3>>1)
    temp12 = _mm_sub_epi32(temp12, temp13);
    temp4 = _mm_packs_epi32(temp4, temp12);
    /* y4 = (w2 >> 1) - w6 */
    temp5 = _mm_srai_epi16(resq_r2_2, 1);     // w2>>1
    temp5 = _mm_sub_epi16(temp5, resq_r6_2);  //(w2>>1)-w6
    /* y5 = -w1 + w7 + w5 + (w5 >> 1) */
    temp6 = _mm_sub_epi32(resq_r7_1, resq_r1_1);  // w7-w1
    temp14 = _mm_sub_epi32(resq_r7_2, resq_r1_2);
    temp6 = _mm_add_epi32(temp6, resq_r5_1);  // w7-w1+w5
    temp14 = _mm_add_epi32(temp14, resq_r5_2);
    temp7 = _mm_srai_epi32(resq_r5_1, 1);  // w5>>1
    temp15 = _mm_srai_epi32(resq_r5_2, 1);
    temp6 = _mm_add_epi32(temp6, temp7);  // w7-w1+w5+(w5>>1)
    temp14 = _mm_add_epi32(temp14, temp15);
    temp6 = _mm_packs_epi32(temp6, temp14);
    /* y6 = w2 + (w6 >> 1) */
    temp7 = _mm_srai_epi16(resq_r6_2, 1);     // w6>>1
    temp7 = _mm_add_epi16(temp7, resq_r2_2);  //(w6>>1)+w2
    /* y7 = w3 + w5 + w1 + (w1 >> 1) */
    temp8 = _mm_add_epi32(resq_r3_1, resq_r5_1);  // w3+w5
    temp16 = _mm_add_epi32(resq_r3_2, resq_r5_2);
    temp8 = _mm_add_epi32(temp8, resq_r1_1);  // w3+w5+w1
    temp16 = _mm_add_epi32(temp16, resq_r1_2);
    temp17 = _mm_srai_epi32(resq_r1_1, 1);  // w1>>1
    temp18 = _mm_srai_epi32(resq_r1_2, 1);
    temp8 = _mm_add_epi32(temp8, temp17);  // w3+w5+w1+(w1>>1)
    temp16 = _mm_add_epi32(temp16, temp18);
    temp8 = _mm_packs_epi32(temp8, temp16);
    /*------------------------------------------------------------------*/
    /*------------------------------------------------------------------*/
    /* z0 = y0 + y6 */
    resq_r0_1 = _mm_add_epi16(temp1, temp7);
    /* z1 = y1 + (y7 >> 2) */
    resq_r1_1 = _mm_srai_epi16(temp8, 2);
    resq_r1_1 = _mm_add_epi16(resq_r1_1, temp2);
    /* z2 = y2 + y4 */
    resq_r2_1 = _mm_add_epi16(temp3, temp5);
    /* z3 = y3 + (y5 >> 2) */
    resq_r3_1 = _mm_srai_epi16(temp6, 2);
    resq_r3_1 = _mm_add_epi16(resq_r3_1, temp4);
    /* z4 = y2 - y4 */
    resq_r4_1 = _mm_sub_epi16(temp3, temp5);
    /* z5 = (y3 >> 2) - y5 */
    resq_r5_1 = _mm_srai_epi16(temp4, 2);
    resq_r5_1 = _mm_sub_epi16(resq_r5_1, temp6);
    /* z6 = y0 - y6 */
    resq_r6_1 = _mm_sub_epi16(temp1, temp7);
    /* z7 = y7 - (y1 >> 2) */
    resq_r7_1 = _mm_srai_epi16(temp2, 2);
    resq_r7_1 = _mm_sub_epi16(temp8, resq_r7_1);
    /*------------------------------------------------------------------*/
    /*------------------------------------------------------------------*/
    /* x0 = z0 + z7 */
    temp1 = _mm_add_epi16(resq_r0_1, resq_r7_1);
    /* x1 = z2 + z5 */
    temp2 = _mm_add_epi16(resq_r2_1, resq_r5_1);
    /* x2 = z4 + z3 */
    temp3 = _mm_add_epi16(resq_r4_1, resq_r3_1);
    /* x3 = z6 + z1 */
    temp4 = _mm_add_epi16(resq_r6_1, resq_r1_1);
    /* x4 = z6 - z1 */
    temp5 = _mm_sub_epi16(resq_r6_1, resq_r1_1);
    /* x5 = z4 - z3 */
    temp6 = _mm_sub_epi16(resq_r4_1, resq_r3_1);
    /* x6 = z2 - z5 */
    temp7 = _mm_sub_epi16(resq_r2_1, resq_r5_1);
    /* x7 = z0 - z7 */
    temp8 = _mm_sub_epi16(resq_r0_1, resq_r7_1);
    /*------------------------------------------------------------------*/
    // Matrix transpose
    /*
     *  a0 b0 c0 d0 e0 f0 g0 h0
     *  a1 b1 c1 d1 e1 f1 g1 h1
     *  a2 b2 c2 d2 e2 f2 g2 h2
     *  a3 b3 c3 d3 e3 f3 g3 h3
     */
    temp17 = _mm_unpacklo_epi16(temp1, temp2);  // a0 a1 b0 b1 c0 c1 d0 d1
    temp19 = _mm_unpacklo_epi16(temp3, temp4);  // a2 a3 b2 b3 c2 c3 d2 d3
    temp18 = _mm_unpackhi_epi16(temp1, temp2);  // e0 e1 f0 f1 g0 g1 h0 h1
    temp20 = _mm_unpackhi_epi16(temp3, temp4);  // e2 e3 f2 f3 g2 g3 h2 h3

    resq_r0_1 = _mm_unpacklo_epi32(temp17, temp19);  // a0 a1 a2 a3 b0 b1 b2 b3
    resq_r1_1 = _mm_unpackhi_epi32(temp17, temp19);  // c0 c1 c2 c3 d0 d1 d2 d3
    resq_r2_1 = _mm_unpacklo_epi32(temp18, temp20);  // e0 e1 e2 e3 f0 f1 f2 f3
    resq_r3_1 = _mm_unpackhi_epi32(temp18, temp20);  // g0 g1 g2 g3 h0 h1 h2 h3
    /*
     *  a4 b4 c4 d4 e4 f4 g4 h4
     *  a5 b5 c5 d5 e5 f5 g5 h5
     *  a6 b6 c6 d6 e6 f6 g6 h6
     *  a7 b7 c7 d7 e7 f7 g7 h7
     */
    temp17 = _mm_unpacklo_epi16(temp5, temp6);  // a4 a5 b4 b5 c4 c5 d4 d5
    temp19 = _mm_unpacklo_epi16(temp7, temp8);  // a6 a7 b6 b7 c6 c7 d6 d7
    temp18 = _mm_unpackhi_epi16(temp5, temp6);  // e4 e5 f4 f5 g4 g5 h4 h5
    temp20 = _mm_unpackhi_epi16(temp7, temp8);  // e6 e7 f6 f7 g6 g7 h6 h7

    resq_r4_1 = _mm_unpacklo_epi32(temp17, temp19);  // a4 a5 a6 a7 b4 b5 b6 b7
    resq_r5_1 = _mm_unpackhi_epi32(temp17, temp19);  // c4 c5 c6 c7 d4 d5 d6 d7
    resq_r6_1 = _mm_unpacklo_epi32(temp18, temp20);  // e4 e5 e6 e7 f4 f5 f6 f7
    resq_r7_1 = _mm_unpackhi_epi32(temp18, temp20);  // g4 g5 g6 g7 h4 h5 h6 h7
    /*  a0 a1 a2 a3 b0 b1 b2 b3
     *  c0 c1 c2 c3 d0 d1 d2 d3
     *  e0 e1 e2 e3 f0 f1 f2 f3
     *  g0 g1 g2 g3 h0 h1 h2 h3
     *  a4 a5 a6 a7 b4 b5 b6 b7
     *  c4 c5 c6 c7 d4 d5 d6 d7
     *  e4 e5 e6 e7 f4 f5 f6 f7
     *  g4 g5 g6 g7 h4 h5 h6 h7
     */
    resq_r0_2 = _mm_unpacklo_epi64(resq_r0_1, resq_r4_1);  // a0 a1 a2 a3 a4 a5 a6 a7
    resq_r1_2 = _mm_unpackhi_epi64(resq_r0_1, resq_r4_1);  // b0 b1 b2 b3 b4 b5 b6 b7
    resq_r2_2 = _mm_unpacklo_epi64(resq_r1_1, resq_r5_1);  // c0 c1 c2 c3 c4 c5 c6 c7
    resq_r3_2 = _mm_unpackhi_epi64(resq_r1_1, resq_r5_1);  // d0 d1 d2 d3 d4 d5 d6 d7
    resq_r4_2 = _mm_unpacklo_epi64(resq_r2_1, resq_r6_1);  // e0 e1 e2 e3 e4 e5 e6 e7
    resq_r5_2 = _mm_unpackhi_epi64(resq_r2_1, resq_r6_1);  // f0 f1 f2 f3 f4 f5 f6 f7
    resq_r6_2 = _mm_unpacklo_epi64(resq_r3_1, resq_r7_1);  // g0 g1 g2 g3 g4 g5 g6 g7
    resq_r7_2 = _mm_unpackhi_epi64(resq_r3_1, resq_r7_1);  // h0 h1 h2 h3 h4 h5 h6 h7

    sign_reg = _mm_cmpgt_epi16(zero_8x16b, resq_r1_2);
    resq_r1_1 = _mm_unpacklo_epi16(resq_r1_2, sign_reg);  // a1 b1 c1 d1 -- 32 bit
    resq_r1_2 = _mm_unpackhi_epi16(resq_r1_2, sign_reg);  // e1 f1 g1 h1 -- 32 bit
    sign_reg = _mm_cmpgt_epi16(zero_8x16b, resq_r3_2);
    resq_r3_1 = _mm_unpacklo_epi16(resq_r3_2, sign_reg);  // a3 b3 c3 d3 -- 32 bit
    resq_r3_2 = _mm_unpackhi_epi16(resq_r3_2, sign_reg);  // e3 f3 g3 h3 -- 32 bit
    sign_reg = _mm_cmpgt_epi16(zero_8x16b, resq_r5_2);
    resq_r5_1 = _mm_unpacklo_epi16(resq_r5_2, sign_reg);  // a5 b5 c5 d5 -- 32 bit
    resq_r5_2 = _mm_unpackhi_epi16(resq_r5_2, sign_reg);  // e5 f5 g5 h5 -- 32 bit
    sign_reg = _mm_cmpgt_epi16(zero_8x16b, resq_r7_2);
    resq_r7_1 = _mm_unpacklo_epi16(resq_r7_2, sign_reg);  // a7 b7 c7 d7 -- 32 bit
    resq_r7_2 = _mm_unpackhi_epi16(resq_r7_2, sign_reg);  // e7 f7 g7 h7 -- 32 bit

    zero_8x16b = _mm_setzero_si128();  // all bits reset to zero
    // Load pred buffer row 0
    predload_r =
        _mm_loadl_epi64((__m128i *) (&pu1_pred[0]));  // p0 p1 p2 p3 p4 p5 p6 p7 0 0 0 0 0 0 0 0
                                                      // -- all 8 bits
    pred_r0_1 =
        _mm_unpacklo_epi8(predload_r, zero_8x16b);  // p0 p1 p2 p3 p4 p5 p6 p7 -- all 16 bits
    // Load pred buffer row 1
    predload_r =
        _mm_loadl_epi64((__m128i *) (&pu1_pred[i4_pred_stride]));  // p0 p1 p2 p3 p4 p5 p6 p7 0 0
                                                                   // 0 0 0 0 0 0 -- all 8 bits
    pred_r1_1 =
        _mm_unpacklo_epi8(predload_r, zero_8x16b);  // p0 p1 p2 p3 p4 p5 p6 p7 -- all 16 bits
    // Load pred buffer row 2
    predload_r = _mm_loadl_epi64(
        (__m128i *) (&pu1_pred[2 * i4_pred_stride]));  // p0 p1 p2 p3 p4 p5 p6 p7 0 0
                                                       // 0 0 0 0 0 0 -- all 8 bits
    pred_r2_1 =
        _mm_unpacklo_epi8(predload_r, zero_8x16b);  // p0 p1 p2 p3 p4 p5 p6 p7 -- all 16 bits
    // Load pred buffer row 3
    predload_r = _mm_loadl_epi64(
        (__m128i *) (&pu1_pred[3 * i4_pred_stride]));  // p0 p1 p2 p3 p4 p5 p6 p7 0 0
                                                       // 0 0 0 0 0 0 -- all 8 bits
    pred_r3_1 =
        _mm_unpacklo_epi8(predload_r, zero_8x16b);  // p0 p1 p2 p3 p4 p5 p6 p7 -- all 16 bits
    // Load pred buffer row 4
    predload_r = _mm_loadl_epi64(
        (__m128i *) (&pu1_pred[4 * i4_pred_stride]));  // p0 p1 p2 p3 p4 p5 p6 p7 0 0
                                                       // 0 0 0 0 0 0 -- all 8 bits
    pred_r4_1 =
        _mm_unpacklo_epi8(predload_r, zero_8x16b);  // p0 p1 p2 p3 p4 p5 p6 p7 -- all 16 bits
    // Load pred buffer row 5
    predload_r = _mm_loadl_epi64(
        (__m128i *) (&pu1_pred[5 * i4_pred_stride]));  // p0 p1 p2 p3 p4 p5 p6 p7 0 0
                                                       // 0 0 0 0 0 0 -- all 8 bits
    pred_r5_1 =
        _mm_unpacklo_epi8(predload_r, zero_8x16b);  // p0 p1 p2 p3 p4 p5 p6 p7 -- all 16 bits
    // Load pred buffer row 6
    predload_r = _mm_loadl_epi64(
        (__m128i *) (&pu1_pred[6 * i4_pred_stride]));  // p0 p1 p2 p3 p4 p5 p6 p7 0 0
                                                       // 0 0 0 0 0 0 -- all 8 bits
    pred_r6_1 =
        _mm_unpacklo_epi8(predload_r, zero_8x16b);  // p0 p1 p2 p3 p4 p5 p6 p7 -- all 16 bits
    // Load pred buffer row 7
    predload_r = _mm_loadl_epi64(
        (__m128i *) (&pu1_pred[7 * i4_pred_stride]));  // p0 p1 p2 p3 p4 p5 p6 p7 0 0
                                                       // 0 0 0 0 0 0 -- all 8 bits
    pred_r7_1 =
        _mm_unpacklo_epi8(predload_r, zero_8x16b);  // p0 p1 p2 p3 p4 p5 p6 p7 -- all 16 bits

    /*--------------------------------------------------------------------*/
    /* IDCT [ Vertical transformation] and Xij = (xij + 32)>>6            */
    /*                                                                    */
    /* Add the prediction and store it to the recon buffer                */
    /*--------------------------------------------------------------------*/

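    /* As in the 4x4 path, the per-sample scalar equivalent is (sketch;
     * CLIP_U8 denotes clamping to [0, 255]):
     *
     *     out[i][j] = CLIP_U8(((x[i][j] + 32) >> 6) + pred[i][j]);
     */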
    /* y0j = w0j + w4j */
    temp1 = _mm_add_epi16(resq_r0_2, resq_r4_2);
    /* y2j = w0j - w4j */
    temp3 = _mm_sub_epi16(resq_r0_2, resq_r4_2);
    /* y1j = -w3j + w5j - w7j - (w7j >> 1) */
    temp2 = _mm_sub_epi32(resq_r5_1, resq_r3_1);  //-w3+w5
    temp10 = _mm_sub_epi32(resq_r5_2, resq_r3_2);
    temp4 = _mm_sub_epi32(temp2, resq_r7_1);  //-w3+w5-w7
    temp12 = _mm_sub_epi32(temp10, resq_r7_2);
    temp5 = _mm_srai_epi32(resq_r7_1, 1);  // w7>>1
    temp13 = _mm_srai_epi32(resq_r7_2, 1);
    temp2 = _mm_sub_epi32(temp4, temp5);  //-w3+w5-w7-(w7>>1)
    temp10 = _mm_sub_epi32(temp12, temp13);
    temp2 = _mm_packs_epi32(temp2, temp10);
    /* y3j = w1j + w7j - w3j - (w3j >> 1) */
    temp4 = _mm_add_epi32(resq_r1_1, resq_r7_1);  // w1+w7
    temp12 = _mm_add_epi32(resq_r1_2, resq_r7_2);
    temp4 = _mm_sub_epi32(temp4, resq_r3_1);  // w1+w7-w3
    temp12 = _mm_sub_epi32(temp12, resq_r3_2);
    temp5 = _mm_srai_epi32(resq_r3_1, 1);  // w3>>1
    temp13 = _mm_srai_epi32(resq_r3_2, 1);
    temp4 = _mm_sub_epi32(temp4, temp5);  // w1+w7-w3-(w3>>1)
    temp12 = _mm_sub_epi32(temp12, temp13);
    temp4 = _mm_packs_epi32(temp4, temp12);
    /* y4j = (w2j >> 1) - w6j */
    temp5 = _mm_srai_epi16(resq_r2_2, 1);     // w2>>1
    temp5 = _mm_sub_epi16(temp5, resq_r6_2);  //(w2>>1)-w6
    /* y5j = -w1j + w7j + w5j + (w5j >> 1) */
    temp6 = _mm_sub_epi32(resq_r7_1, resq_r1_1);  // w7-w1
    temp14 = _mm_sub_epi32(resq_r7_2, resq_r1_2);
    temp6 = _mm_add_epi32(temp6, resq_r5_1);  // w7-w1+w5
    temp14 = _mm_add_epi32(temp14, resq_r5_2);
    temp7 = _mm_srai_epi32(resq_r5_1, 1);  // w5>>1
    temp15 = _mm_srai_epi32(resq_r5_2, 1);
    temp6 = _mm_add_epi32(temp6, temp7);  // w7-w1+w5+(w5>>1)
    temp14 = _mm_add_epi32(temp14, temp15);
    temp6 = _mm_packs_epi32(temp6, temp14);
    /* y6j = w2j + (w6j >> 1) */
    temp7 = _mm_srai_epi16(resq_r6_2, 1);     // w6>>1
    temp7 = _mm_add_epi16(temp7, resq_r2_2);  //(w6>>1)+w2
    /* y7j = w3j + w5j + w1j + (w1j >> 1) */
    temp8 = _mm_add_epi32(resq_r3_1, resq_r5_1);  // w3+w5
    temp16 = _mm_add_epi32(resq_r3_2, resq_r5_2);
    temp8 = _mm_add_epi32(temp8, resq_r1_1);  // w3+w5+w1
    temp16 = _mm_add_epi32(temp16, resq_r1_2);
    temp17 = _mm_srai_epi32(resq_r1_1, 1);  // w1>>1
    temp18 = _mm_srai_epi32(resq_r1_2, 1);
    temp8 = _mm_add_epi32(temp8, temp17);  // w3+w5+w1+(w1>>1)
    temp16 = _mm_add_epi32(temp16, temp18);
    temp8 = _mm_packs_epi32(temp8, temp16);
    /*------------------------------------------------------------------*/
    /*------------------------------------------------------------------*/
    /* z0j = y0j + y6j */
    resq_r0_1 = _mm_add_epi16(temp1, temp7);
    /* z1j = y1j + (y7j >> 2) */
    resq_r1_1 = _mm_srai_epi16(temp8, 2);
    resq_r1_1 = _mm_add_epi16(resq_r1_1, temp2);
    /* z2j = y2j + y4j */
    resq_r2_1 = _mm_add_epi16(temp3, temp5);
    /* z3j = y3j + (y5j >> 2) */
    resq_r3_1 = _mm_srai_epi16(temp6, 2);
    resq_r3_1 = _mm_add_epi16(resq_r3_1, temp4);
    /* z4j = y2j - y4j */
    resq_r4_1 = _mm_sub_epi16(temp3, temp5);
    /* z5j = (y3j >> 2) - y5j */
    resq_r5_1 = _mm_srai_epi16(temp4, 2);
    resq_r5_1 = _mm_sub_epi16(resq_r5_1, temp6);
    /* z6j = y0j - y6j */
    resq_r6_1 = _mm_sub_epi16(temp1, temp7);
    /* z7j = y7j - (y1j >> 2) */
    resq_r7_1 = _mm_srai_epi16(temp2, 2);
    resq_r7_1 = _mm_sub_epi16(temp8, resq_r7_1);
    /*------------------------------------------------------------------*/

    /*------------------------------------------------------------------*/
    /* x0j = z0j + z7j */
    temp1 = _mm_add_epi16(resq_r0_1, resq_r7_1);
    sign_reg = _mm_cmpgt_epi16(zero_8x16b, temp1);
    temp10 = _mm_unpacklo_epi16(temp1, sign_reg);
    temp11 = _mm_unpackhi_epi16(temp1, sign_reg);
    temp10 = _mm_add_epi32(temp10, value_32);
    temp11 = _mm_add_epi32(temp11, value_32);
    temp10 = _mm_srai_epi32(temp10, 6);
    temp11 = _mm_srai_epi32(temp11, 6);
    temp10 = _mm_packs_epi32(temp10, temp11);
    temp1 = _mm_add_epi16(temp10, pred_r0_1);
    /* x1j = z2j + z5j */
    temp2 = _mm_add_epi16(resq_r2_1, resq_r5_1);
    sign_reg = _mm_cmpgt_epi16(zero_8x16b, temp2);
    temp10 = _mm_unpacklo_epi16(temp2, sign_reg);
    temp11 = _mm_unpackhi_epi16(temp2, sign_reg);
    temp10 = _mm_add_epi32(temp10, value_32);
    temp11 = _mm_add_epi32(temp11, value_32);
    temp10 = _mm_srai_epi32(temp10, 6);
    temp11 = _mm_srai_epi32(temp11, 6);
    temp10 = _mm_packs_epi32(temp10, temp11);
    temp2 = _mm_add_epi16(temp10, pred_r1_1);
    /* x2j = z4j + z3j */
    temp3 = _mm_add_epi16(resq_r4_1, resq_r3_1);
    sign_reg = _mm_cmpgt_epi16(zero_8x16b, temp3);
    temp10 = _mm_unpacklo_epi16(temp3, sign_reg);
    temp11 = _mm_unpackhi_epi16(temp3, sign_reg);
    temp10 = _mm_add_epi32(temp10, value_32);
    temp11 = _mm_add_epi32(temp11, value_32);
    temp10 = _mm_srai_epi32(temp10, 6);
    temp11 = _mm_srai_epi32(temp11, 6);
    temp10 = _mm_packs_epi32(temp10, temp11);
    temp3 = _mm_add_epi16(temp10, pred_r2_1);
    /* x3j = z6j + z1j */
    temp4 = _mm_add_epi16(resq_r6_1, resq_r1_1);
    sign_reg = _mm_cmpgt_epi16(zero_8x16b, temp4);
    temp10 = _mm_unpacklo_epi16(temp4, sign_reg);
    temp11 = _mm_unpackhi_epi16(temp4, sign_reg);
    temp10 = _mm_add_epi32(temp10, value_32);
    temp11 = _mm_add_epi32(temp11, value_32);
    temp10 = _mm_srai_epi32(temp10, 6);
    temp11 = _mm_srai_epi32(temp11, 6);
    temp10 = _mm_packs_epi32(temp10, temp11);
    temp4 = _mm_add_epi16(temp10, pred_r3_1);
    /* x4j = z6j - z1j */
    temp5 = _mm_sub_epi16(resq_r6_1, resq_r1_1);
    sign_reg = _mm_cmpgt_epi16(zero_8x16b, temp5);
    temp10 = _mm_unpacklo_epi16(temp5, sign_reg);
    temp11 = _mm_unpackhi_epi16(temp5, sign_reg);
    temp10 = _mm_add_epi32(temp10, value_32);
    temp11 = _mm_add_epi32(temp11, value_32);
    temp10 = _mm_srai_epi32(temp10, 6);
    temp11 = _mm_srai_epi32(temp11, 6);
    temp10 = _mm_packs_epi32(temp10, temp11);
    temp5 = _mm_add_epi16(temp10, pred_r4_1);
    /* x5j = z4j - z3j */
    temp6 = _mm_sub_epi16(resq_r4_1, resq_r3_1);
    sign_reg = _mm_cmpgt_epi16(zero_8x16b, temp6);
    temp10 = _mm_unpacklo_epi16(temp6, sign_reg);
    temp11 = _mm_unpackhi_epi16(temp6, sign_reg);
    temp10 = _mm_add_epi32(temp10, value_32);
    temp11 = _mm_add_epi32(temp11, value_32);
    temp10 = _mm_srai_epi32(temp10, 6);
    temp11 = _mm_srai_epi32(temp11, 6);
    temp10 = _mm_packs_epi32(temp10, temp11);
    temp6 = _mm_add_epi16(temp10, pred_r5_1);
    /* x6j = z2j - z5j */
    temp7 = _mm_sub_epi16(resq_r2_1, resq_r5_1);
    sign_reg = _mm_cmpgt_epi16(zero_8x16b, temp7);
    temp10 = _mm_unpacklo_epi16(temp7, sign_reg);
    temp11 = _mm_unpackhi_epi16(temp7, sign_reg);
    temp10 = _mm_add_epi32(temp10, value_32);
    temp11 = _mm_add_epi32(temp11, value_32);
    temp10 = _mm_srai_epi32(temp10, 6);
    temp11 = _mm_srai_epi32(temp11, 6);
    temp10 = _mm_packs_epi32(temp10, temp11);
    temp7 = _mm_add_epi16(temp10, pred_r6_1);
    /* x7j = z0j - z7j */
    temp8 = _mm_sub_epi16(resq_r0_1, resq_r7_1);
    sign_reg = _mm_cmpgt_epi16(zero_8x16b, temp8);
    temp10 = _mm_unpacklo_epi16(temp8, sign_reg);
    temp11 = _mm_unpackhi_epi16(temp8, sign_reg);
    temp10 = _mm_add_epi32(temp10, value_32);
    temp11 = _mm_add_epi32(temp11, value_32);
    temp10 = _mm_srai_epi32(temp10, 6);
    temp11 = _mm_srai_epi32(temp11, 6);
    temp10 = _mm_packs_epi32(temp10, temp11);
    temp8 = _mm_add_epi16(temp10, pred_r7_1);
    /*------------------------------------------------------------------*/
    // Clipping the results to 8 bits
    sign_reg = _mm_cmpgt_epi16(temp1, zero_8x16b);  // sign check
    temp1 = _mm_and_si128(temp1, sign_reg);
    sign_reg = _mm_cmpgt_epi16(temp2, zero_8x16b);  // sign check
    temp2 = _mm_and_si128(temp2, sign_reg);
    sign_reg = _mm_cmpgt_epi16(temp3, zero_8x16b);  // sign check
    temp3 = _mm_and_si128(temp3, sign_reg);
    sign_reg = _mm_cmpgt_epi16(temp4, zero_8x16b);  // sign check
    temp4 = _mm_and_si128(temp4, sign_reg);
    sign_reg = _mm_cmpgt_epi16(temp5, zero_8x16b);  // sign check
    temp5 = _mm_and_si128(temp5, sign_reg);
    sign_reg = _mm_cmpgt_epi16(temp6, zero_8x16b);  // sign check
    temp6 = _mm_and_si128(temp6, sign_reg);
    sign_reg = _mm_cmpgt_epi16(temp7, zero_8x16b);  // sign check
    temp7 = _mm_and_si128(temp7, sign_reg);
    sign_reg = _mm_cmpgt_epi16(temp8, zero_8x16b);  // sign check
    temp8 = _mm_and_si128(temp8, sign_reg);

    resq_r0_2 = _mm_packus_epi16(temp1, zero_8x16b);
    resq_r1_2 = _mm_packus_epi16(temp2, zero_8x16b);
    resq_r2_2 = _mm_packus_epi16(temp3, zero_8x16b);
    resq_r3_2 = _mm_packus_epi16(temp4, zero_8x16b);
    resq_r4_2 = _mm_packus_epi16(temp5, zero_8x16b);
    resq_r5_2 = _mm_packus_epi16(temp6, zero_8x16b);
    resq_r6_2 = _mm_packus_epi16(temp7, zero_8x16b);
    resq_r7_2 = _mm_packus_epi16(temp8, zero_8x16b);

    _mm_storel_epi64((__m128i *) (&pu1_out[0]), resq_r0_2);
    _mm_storel_epi64((__m128i *) (&pu1_out[i4_out_stride]), resq_r1_2);
    _mm_storel_epi64((__m128i *) (&pu1_out[2 * i4_out_stride]), resq_r2_2);
    _mm_storel_epi64((__m128i *) (&pu1_out[3 * i4_out_stride]), resq_r3_2);
    _mm_storel_epi64((__m128i *) (&pu1_out[4 * i4_out_stride]), resq_r4_2);
    _mm_storel_epi64((__m128i *) (&pu1_out[5 * i4_out_stride]), resq_r5_2);
    _mm_storel_epi64((__m128i *) (&pu1_out[6 * i4_out_stride]), resq_r6_2);
    _mm_storel_epi64((__m128i *) (&pu1_out[7 * i4_out_stride]), resq_r7_2);
}