1 /******************************************************************************
2 *
3 * Copyright (C) 2015 The Android Open Source Project
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at:
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 *
17 *****************************************************************************
18 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19 */
20
21 /**
22 *******************************************************************************
23 * @file
24 * impeg2_itrans_recon_x86_intr.c
25 *
26 * @brief
27 * Contains function definitions for inverse quantization, inverse
28 * transform and reconstruction
29 *
30 * @author
31 * 100470
32 * 100592 (edited by)
33 *
34 * @par List of Functions:
35 * - impeg2_itrans_recon_8x8_sse42()
36 *
37 * @remarks
38 * None
39 *
40 *******************************************************************************
41 */
42 #include <stdio.h>
43 #include <string.h>
44 #include "iv_datatypedef.h"
45 #include "impeg2_macros.h"
46 #include "impeg2_defs.h"
47 #include "impeg2_globals.h"
48
49 #include <immintrin.h>
50 #include <emmintrin.h>
51 #include <smmintrin.h>
52 #include <tmmintrin.h>
53
54
55 /**
56 *******************************************************************************
57 *
58 * @brief
59 * This function performs inverse quantization, inverse transform and
60 * reconstruction for an 8x8 input block
61 *
62 * @par Description:
63 * Performs inverse quantization, inverse transform, adds the
64 * prediction data and clips the output to 8 bit
65 *
66 * @param[in] pi2_src
67 * Input 8x8 coefficients
68 *
69 * @param[in] pi2_tmp
70 * Temporary 8x8 buffer for storing inverse transform 1st stage
71 * output; passed to UNUSED() at the top of this implementation
72 *
73 * @param[in] pu1_pred
74 * Prediction 8x8 block
75 *
76 * @param[in] pi2_dequant_coeff
77 * Dequant Coeffs
78 *
79 * @param[out] pu1_dst
80 * Output 8x8 block
81 *
82 * @param[in] src_strd
83 * Input stride
84 *
85 * @param[in] qp_div
86 * Quantization parameter / 6
87 *
88 * @param[in] qp_rem
89 * Quantization parameter % 6
90 *
91 * @param[in] pred_strd
92 * Prediction stride
93 *
94 * @param[in] dst_strd
95 * Output Stride
96 *
97 * @param[in] zero_cols
98 * Zero columns in pi2_src
99 *
100 * @returns Void
101 *
102 * @remarks
103 * None
104 *
105 *******************************************************************************
106 */
107
108
impeg2_idct_recon_sse42(WORD16 * pi2_src,WORD16 * pi2_tmp,UWORD8 * pu1_pred,UWORD8 * pu1_dst,WORD32 src_strd,WORD32 pred_strd,WORD32 dst_strd,WORD32 zero_cols,WORD32 zero_rows)109 void impeg2_idct_recon_sse42(WORD16 *pi2_src,
110 WORD16 *pi2_tmp,
111 UWORD8 *pu1_pred,
112 UWORD8 *pu1_dst,
113 WORD32 src_strd,
114 WORD32 pred_strd,
115 WORD32 dst_strd,
116 WORD32 zero_cols,
117 WORD32 zero_rows)
118 {
119 __m128i m_temp_reg_0;
120 __m128i m_temp_reg_1;
121 __m128i m_temp_reg_2;
122 __m128i m_temp_reg_3;
123 __m128i m_temp_reg_5;
124 __m128i m_temp_reg_6;
125 __m128i m_temp_reg_7;
126 __m128i m_temp_reg_4;
127 __m128i m_temp_reg_10;
128 __m128i m_temp_reg_11;
129 __m128i m_temp_reg_12;
130 __m128i m_temp_reg_13;
131 __m128i m_temp_reg_14;
132 __m128i m_temp_reg_15;
133 __m128i m_temp_reg_16;
134 __m128i m_temp_reg_17;
135 __m128i m_temp_reg_20;
136 __m128i m_temp_reg_21;
137 __m128i m_temp_reg_22;
138 __m128i m_temp_reg_23;
139 __m128i m_temp_reg_24;
140 __m128i m_temp_reg_25;
141 __m128i m_temp_reg_26;
142 __m128i m_temp_reg_27;
143 __m128i m_temp_reg_30;
144 __m128i m_temp_reg_31;
145 __m128i m_temp_reg_32;
146 __m128i m_temp_reg_33;
147 __m128i m_temp_reg_34;
148 __m128i m_temp_reg_35;
149 __m128i m_temp_reg_36;
150 __m128i m_temp_reg_37;
151 __m128i m_temp_reg_40;
152 __m128i m_temp_reg_41;
153 __m128i m_temp_reg_42;
154 __m128i m_temp_reg_43;
155 __m128i m_temp_reg_44;
156 __m128i m_temp_reg_45;
157 __m128i m_temp_reg_46;
158 __m128i m_temp_reg_47;
159 __m128i m_temp_reg_50;
160 __m128i m_temp_reg_51;
161 __m128i m_temp_reg_52;
162 __m128i m_temp_reg_53;
163 __m128i m_temp_reg_54;
164 __m128i m_temp_reg_55;
165 __m128i m_temp_reg_56;
166 __m128i m_temp_reg_57;
167 __m128i m_temp_reg_60;
168 __m128i m_temp_reg_61;
169 __m128i m_temp_reg_62;
170 __m128i m_temp_reg_63;
171 __m128i m_temp_reg_64;
172 __m128i m_temp_reg_65;
173 __m128i m_temp_reg_66;
174 __m128i m_temp_reg_67;
175 __m128i m_temp_reg_70;
176 __m128i m_temp_reg_71;
177 __m128i m_temp_reg_72;
178 __m128i m_temp_reg_73;
179 __m128i m_temp_reg_74;
180 __m128i m_temp_reg_75;
181 __m128i m_temp_reg_76;
182 __m128i m_temp_reg_77;
183 __m128i m_coeff1, m_coeff2, m_coeff3, m_coeff4;
184
185 WORD32 check_row_stage_1; /* Lokesh */
186 WORD32 check_row_stage_2; /* Lokesh */
187
188 __m128i m_rdng_factor;
189 WORD32 i4_shift = IDCT_STG1_SHIFT;
190 UNUSED(pi2_tmp);
191 check_row_stage_1 = ((zero_rows & 0xF0) != 0xF0) ? 1 : 0;
192 check_row_stage_2 = ((zero_cols & 0xF0) != 0xF0) ? 1 : 0;
193
194 m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src);
195 pi2_src += src_strd;
196 m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src);
197 pi2_src += src_strd;
198 m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_src);
199 pi2_src += src_strd;
200 m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_src);
201 pi2_src += src_strd;
202
203 m_temp_reg_74 = _mm_loadu_si128((__m128i *)pi2_src);
204 pi2_src += src_strd;
205 m_temp_reg_75 = _mm_loadu_si128((__m128i *)pi2_src);
206 pi2_src += src_strd;
207 m_temp_reg_76 = _mm_loadu_si128((__m128i *)pi2_src);
208 pi2_src += src_strd;
209 m_temp_reg_77 = _mm_loadu_si128((__m128i *)pi2_src);
210
211 if(!check_row_stage_2)
212 {
213 if(!check_row_stage_1)
214 {
215 /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
216 /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
217 {
218 //Interleaving 0,4 row in 0 , 1 Rishab
219 /*coef2 for m_temp_reg_12 and m_temp_reg_13 , coef1 for m_temp_reg_10 and m_temp_reg_11*/
220 m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[3][0]);
221 m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[0][0]);
222
223 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74);
224
225 m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
226 m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
227
228 }
229
230
231 /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
232 /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
233 /* as upper 8 bytes are zeros so m_temp_reg_15 and m_temp_reg_17 are not used*/
234 {
235
236 m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83
237 m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36
238
239 /* Combining instructions to eliminate them based on zero_rows : Lokesh */
240 //Interleaving 2,6 row in 4, 5 Rishab
241 m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
242
243 m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1);
244 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
245
246
247 /* Loading coeff for computing o0, o1, o2 and o3 in the next block */
248
249 m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[2][0]);
250 m_coeff4 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[3][0]);
251
252 m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[0][0]);
253 m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[1][0]);
254
255
256
257 /* e */
258
259 /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
260 /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
261 /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
262 /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
263 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
264 m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
265
266 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
267 m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
268
269 }
270
271 /* o */
272 {
273
274 /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
275 {
276
277 m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
278 //o0:1B*89+3B*75,5B*50+7B*18
279 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
280
281 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
282 m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
283
284
285
286 /* Column 0 of destination computed here */
287 /* It is stored in m_temp_reg_50 */
288 /* Column 7 of destination computed here */
289 /* It is stored in m_temp_reg_57 */
290 /* Upper 8 bytes of both registers are zero due to zero_cols*/
291
292
293
294 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
295 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
296
297 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
298 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
299
300 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
301 m_temp_reg_63 = _mm_setzero_si128();
302 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
303
304 //o1:1B*75-3B*18,5B*89+7B*50
305 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
306
307 m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
308 m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
309
310 /* Loading coeff for computing o2 in the next block */
311
312 m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[4][0]);
313 m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[5][0]);
314
315 /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */
316
317
318
319 /* Column 1 of destination computed here */
320 /* It is stored in m_temp_reg_51 */
321 /* Column 6 of destination computed here */
322 /* It is stored in m_temp_reg_56 */
323
324 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
325 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
326
327 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
328 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
329
330 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
331 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
332
333 //o2:1B*50-3B*89,5B*18+7B*75
334 m_temp_reg_34 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
335
336 m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
337 m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
338
339
340 /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
341
342 /* Loading coeff for computing o3 in the next block */
343
344 m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[6][0]);
345 m_coeff4 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[7][0]);
346
347
348
349 /* Column 2 of destination computed here */
350 /* It is stored in m_temp_reg_52 */
351 /* Column 5 of destination computed here */
352 /* It is stored in m_temp_reg_55 */
353
354 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
355 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
356
357 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
358 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
359
360 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
361 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
362
363 //o3:1B*18-3B*50,5B*75-7B*89
364 m_temp_reg_36 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
365
366 m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
367 m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
368
369
370
371 /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
372
373
374
375 /* Column 3 of destination computed here */
376 /* It is stored in m_temp_reg_53 */
377 /* Column 4 of destination computed here */
378 /* It is stored in m_temp_reg_54 */
379
380 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
381 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
382
383 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
384 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
385
386 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
387 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
388
389
390 m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
391 m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
392 }
393 }
394
395 /* Transpose of the destination 8x8 matrix done here */
396 /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */
397 /* respectively */
398 {
399 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
400 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
401 m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
402 m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
403
404 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
405 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
406
407 m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
408 m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
409
410 m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
411 m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
412 m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
413 m_temp_reg_53 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
414
415 m_temp_reg_54 = _mm_setzero_si128();
416 m_temp_reg_55 = _mm_setzero_si128();
417 m_temp_reg_56 = _mm_setzero_si128();
418 m_temp_reg_57 = _mm_setzero_si128();
419 }
420 }
421 else
422 {
423 /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
424 /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
425 {
426 //Interleaving 0,4 row in 0 , 1 Rishab
427 /*coef2 for m_temp_reg_12 and m_temp_reg_13 , coef1 for m_temp_reg_10 and m_temp_reg_11*/
428 m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[3][0]);
429 m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[0][0]);
430
431 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74);
432
433 m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
434 m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
435
436 }
437
438
439 /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
440 /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
441 /* as upper 8 bytes are zeros so m_temp_reg_15 and m_temp_reg_17 are not used*/
442 {
443
444 m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83
445 m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36
446
447 /* Combining instructions to eliminate them based on zero_rows : Lokesh */
448 //Interleaving 2,6 row in 4, 5 Rishab
449 m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
450
451 m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1);
452 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
453
454
455 /* Loading coeff for computing o0, o1, o2 and o3 in the next block */
456
457 m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[2][0]);
458 m_coeff4 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[3][0]);
459
460 m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[0][0]);
461 m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[1][0]);
462
463
464
465 /* e */
466
467 /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
468 /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
469 /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
470 /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
471 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
472 m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
473
474 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
475 m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
476
477 }
478
479 /* o */
480 {
481
482 /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
483 {
484
485 m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
486 m_temp_reg_64 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77);
487 //o0:1B*89+3B*75,5B*50+7B*18
488 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
489 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2);
490
491 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
492 m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
493
494 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
495
496
497
498 /* Column 0 of destination computed here */
499 /* It is stored in m_temp_reg_50 */
500 /* Column 7 of destination computed here */
501 /* It is stored in m_temp_reg_57 */
502 /* Upper 8 bytes of both registers are zero due to zero_cols*/
503
504
505
506 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
507 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
508
509 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
510 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
511
512 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
513 m_temp_reg_63 = _mm_setzero_si128();
514 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
515
516 //o1:1B*75-3B*18,5B*89+7B*50
517 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
518 m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4);
519
520 m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
521 m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
522
523 /* Loading coeff for computing o2 in the next block */
524
525 m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[4][0]);
526 m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[5][0]);
527
528 /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */
529 m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_26);
530
531
532
533 /* Column 1 of destination computed here */
534 /* It is stored in m_temp_reg_51 */
535 /* Column 6 of destination computed here */
536 /* It is stored in m_temp_reg_56 */
537
538 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
539 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
540
541 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
542 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
543
544 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
545 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
546
547 //o2:1B*50-3B*89,5B*18+7B*75
548 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
549 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2);
550
551 m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
552 m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
553
554
555 /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
556
557 /* Loading coeff for computing o3 in the next block */
558
559 m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[6][0]);
560 m_coeff4 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[7][0]);
561
562 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
563
564
565 /* Column 2 of destination computed here */
566 /* It is stored in m_temp_reg_52 */
567 /* Column 5 of destination computed here */
568 /* It is stored in m_temp_reg_55 */
569
570 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
571 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
572
573 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
574 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
575
576 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
577 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
578
579 //o3:1B*18-3B*50,5B*75-7B*89
580 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
581 m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4);
582
583 m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
584 m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
585
586
587
588 /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
589
590 m_temp_reg_36 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_26);
591
592
593 /* Column 3 of destination computed here */
594 /* It is stored in m_temp_reg_53 */
595 /* Column 4 of destination computed here */
596 /* It is stored in m_temp_reg_54 */
597
598 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
599 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
600
601 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
602 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
603
604 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
605 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
606
607
608 m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
609 m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
610 }
611 }
612
613 /* Transpose of the destination 8x8 matrix done here */
614 /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */
615 /* respectively */
616 {
617 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
618 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
619 m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
620 m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
621
622 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
623 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
624 m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
625 m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
626
627 m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
628 m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
629 m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
630 m_temp_reg_53 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
631
632 m_temp_reg_54 = _mm_setzero_si128();
633 m_temp_reg_55 = _mm_setzero_si128();
634 m_temp_reg_56 = _mm_setzero_si128();
635 m_temp_reg_57 = _mm_setzero_si128();
636 }
637 }
638
639 /* Stage 2 */
640 i4_shift = IDCT_STG2_SHIFT;
641 {
642 /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
643 /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
644 {
645 m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q11[0][0]); //add
646 m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q11[3][0]); //sub
647
648 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_54);
649 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_54);
650
651 m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
652 m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
653 m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
654 m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
655
656
657 m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q11[1][0]);
658 m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q11[2][0]);
659 }
660
661
662 /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
663 /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
664 {
665
666 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_56);
667 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_56);
668
669
670 m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
671 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
672 m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
673 m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
674
675 /* Loading coeff for computing o0 in the next block */
676 m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[0][0]);
677
678
679 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_51, m_temp_reg_53);
680 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_51, m_temp_reg_53);
681
682
683
684 /* e */
685
686 /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
687 /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
688 /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
689 /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
690 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
691 m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
692
693 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
694 m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
695
696 m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17);
697 m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17);
698
699 m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15);
700 m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15);
701
702 }
703
704 /* o */
705 {
706
707 /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
708 {
709 //o0:1B*89+3B*75,1T*89+3T*75
710 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
711 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
712
713 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
714 m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
715 /* Loading coeff for computing o1 in the next block */
716 m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[2][0]);
717
718
719
720 /* Column 0 of destination computed here */
721 /* It is stored in m_temp_reg_50 */
722 /* Column 7 of destination computed here */
723 /* It is stored in m_temp_reg_57 */
724
725 m_temp_reg_2 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
726 m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
727
728 m_temp_reg_3 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
729 m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
730
731 m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
732 m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
733 m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
734 m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
735
736 //o1:1B*75-3B*18,1T*75-3T*18
737 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_0, m_coeff3);
738 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
739
740 m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
741 m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
742 m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
743 m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
744
745 m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
746 m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
747
748
749 /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */
750
751
752 /* Loading coeff for computing o2 in the next block */
753 m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[4][0]);
754
755
756
757 /* Column 1 of destination computed here */
758 /* It is stored in m_temp_reg_51 */
759 /* Column 6 of destination computed here */
760 /* It is stored in m_temp_reg_56 */
761
762 m_temp_reg_2 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
763 m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
764
765 m_temp_reg_3 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33);
766 m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_33);
767
768 m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
769 m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
770 m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
771 m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
772
773 //o2:1B*50-3B*89,5T*18+7T*75.
774 m_temp_reg_34 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
775 m_temp_reg_35 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
776
777 m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
778 m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
779 m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
780 m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
781
782 m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
783 m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
784
785
786 /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
787
788 /* Loading coeff for computing o3 in the next block */
789
790 m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[6][0]);
791
792
793 /* Column 2 of destination computed here */
794 /* It is stored in m_temp_reg_52 */
795 /* Column 5 of destination computed here */
796 /* It is stored in m_temp_reg_55 */
797
798 m_temp_reg_2 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
799 m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
800
801 m_temp_reg_3 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35);
802 m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_35);
803
804 m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
805 m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
806 m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
807 m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
808
809 //o3:1B*18-3B*50,1T*18-3T*50
810 m_temp_reg_36 = _mm_madd_epi16(m_temp_reg_0, m_coeff3);
811 m_temp_reg_37 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
812
813 m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
814 m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
815 m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
816 m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
817
818
819 m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
820 m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
821
822
823
824 /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
825
826
827 /* Column 3 of destination computed here */
828 /* It is stored in m_temp_reg_53 */
829 /* Column 4 of destination computed here */
830 /* It is stored in m_temp_reg_54 */
831
832 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
833 m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
834
835 m_temp_reg_21 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37);
836 m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37);
837
838 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_rdng_factor);
839 m_temp_reg_21 = _mm_add_epi32(m_temp_reg_21, m_rdng_factor);
840 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_rdng_factor);
841 m_temp_reg_23 = _mm_add_epi32(m_temp_reg_23, m_rdng_factor);
842
843 m_temp_reg_20 = _mm_srai_epi32(m_temp_reg_20, i4_shift);
844 m_temp_reg_21 = _mm_srai_epi32(m_temp_reg_21, i4_shift);
845 m_temp_reg_22 = _mm_srai_epi32(m_temp_reg_22, i4_shift);
846 m_temp_reg_23 = _mm_srai_epi32(m_temp_reg_23, i4_shift);
847
848 m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21);
849 m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23);
850 }
851 }
852
853 /* Transpose of the destination 8x8 matrix done here */
854 /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */
855 /* respectively */
856 {
857 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
858 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
859 m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51);
860 m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53);
861 m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
862 m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
863 m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15);
864 m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15);
865
866 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
867 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
868 m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55);
869 m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57);
870 m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
871 m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
872 m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17);
873 m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17);
874 m_temp_reg_10 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
875 m_temp_reg_11 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
876 m_temp_reg_12 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
877 m_temp_reg_13 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
878
879 m_temp_reg_14 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6);
880 m_temp_reg_15 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6);
881 m_temp_reg_16 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7);
882 m_temp_reg_17 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7);
883 }
884
885 /* Recon and store */
886 {
887 m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_pred);
888 pu1_pred += pred_strd;
889 m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pu1_pred);
890 pu1_pred += pred_strd;
891 m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pu1_pred);
892 pu1_pred += pred_strd;
893 m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pu1_pred);
894 pu1_pred += pred_strd;
895 m_temp_reg_4 = _mm_loadl_epi64((__m128i *)pu1_pred);
896 pu1_pred += pred_strd;
897 m_temp_reg_5 = _mm_loadl_epi64((__m128i *)pu1_pred);
898 pu1_pred += pred_strd;
899 m_temp_reg_6 = _mm_loadl_epi64((__m128i *)pu1_pred);
900 pu1_pred += pred_strd;
901 m_temp_reg_7 = _mm_loadl_epi64((__m128i *)pu1_pred);
902
903 m_temp_reg_50 = _mm_setzero_si128();
904 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, m_temp_reg_50);
905 m_temp_reg_1 = _mm_unpacklo_epi8(m_temp_reg_1, m_temp_reg_50);
906 m_temp_reg_2 = _mm_unpacklo_epi8(m_temp_reg_2, m_temp_reg_50);
907 m_temp_reg_3 = _mm_unpacklo_epi8(m_temp_reg_3, m_temp_reg_50);
908 m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_4, m_temp_reg_50);
909 m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_5, m_temp_reg_50);
910 m_temp_reg_6 = _mm_unpacklo_epi8(m_temp_reg_6, m_temp_reg_50);
911 m_temp_reg_7 = _mm_unpacklo_epi8(m_temp_reg_7, m_temp_reg_50);
912
913 m_temp_reg_50 = _mm_add_epi16(m_temp_reg_10, m_temp_reg_0);
914 m_temp_reg_51 = _mm_add_epi16(m_temp_reg_11, m_temp_reg_1);
915 m_temp_reg_52 = _mm_add_epi16(m_temp_reg_12, m_temp_reg_2);
916 m_temp_reg_53 = _mm_add_epi16(m_temp_reg_13, m_temp_reg_3);
917 m_temp_reg_54 = _mm_add_epi16(m_temp_reg_14, m_temp_reg_4);
918 m_temp_reg_55 = _mm_add_epi16(m_temp_reg_15, m_temp_reg_5);
919 m_temp_reg_56 = _mm_add_epi16(m_temp_reg_16, m_temp_reg_6);
920 m_temp_reg_57 = _mm_add_epi16(m_temp_reg_17, m_temp_reg_7);
921
922 m_temp_reg_50 = _mm_packus_epi16(m_temp_reg_50, m_temp_reg_50);
923 m_temp_reg_51 = _mm_packus_epi16(m_temp_reg_51, m_temp_reg_51);
924 m_temp_reg_52 = _mm_packus_epi16(m_temp_reg_52, m_temp_reg_52);
925 m_temp_reg_53 = _mm_packus_epi16(m_temp_reg_53, m_temp_reg_53);
926 m_temp_reg_54 = _mm_packus_epi16(m_temp_reg_54, m_temp_reg_54);
927 m_temp_reg_55 = _mm_packus_epi16(m_temp_reg_55, m_temp_reg_55);
928 m_temp_reg_56 = _mm_packus_epi16(m_temp_reg_56, m_temp_reg_56);
929 m_temp_reg_57 = _mm_packus_epi16(m_temp_reg_57, m_temp_reg_57);
930
931 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_50);
932 pu1_dst += dst_strd;
933 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_51);
934 pu1_dst += dst_strd;
935 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_52);
936 pu1_dst += dst_strd;
937 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_53);
938 pu1_dst += dst_strd;
939 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_54);
940 pu1_dst += dst_strd;
941 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_55);
942 pu1_dst += dst_strd;
943 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_56);
944 pu1_dst += dst_strd;
945 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_57);
946 pu1_dst += dst_strd;
947 }
948 }
949 }
950 else
951
952 {
953
954 /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
955 /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
956 if(!check_row_stage_1)
957 {
958 /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
959 /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
960 {
961 //Interleaving 0,4 row in 0 , 1 Rishab
962 /*coef2 for m_temp_reg_12 and m_temp_reg_13 , coef1 for m_temp_reg_10 and m_temp_reg_11*/
963 m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[3][0]);
964 m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[0][0]);
965
966 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74);
967 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_74);
968
969 m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
970 m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
971
972
973 m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
974 m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
975 }
976
977
978 /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
979 /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
980 {
981
982 m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83
983 m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36
984
985 /* Combining instructions to eliminate them based on zero_rows : Lokesh */
986 //Interleaving 2,6 row in 4, 5 Rishab
987 m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
988 m_temp_reg_5 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76);
989
990 m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1);
991 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
992
993 m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_5, m_coeff1);
994 m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_5, m_coeff2);
995
996
997
998 /* Loading coeff for computing o0, o1, o2 and o3 in the next block */
999
1000 m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[2][0]);
1001 //m_coeff4 = _mm_loadu_si128((__m128i *) &gai2_impeg2_idct_odd_8_q15[3][0]);
1002
1003 m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[0][0]);
1004 //m_coeff2 = _mm_loadu_si128((__m128i *) &gai2_impeg2_idct_odd_8_q15[1][0]);
1005
1006 }
1007
1008 /* e */
1009 {
1010 /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
1011 /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
1012 /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
1013 /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
1014 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
1015 m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
1016
1017 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
1018 m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
1019
1020 m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17);
1021 m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17);
1022
1023 m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15);
1024 m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15);
1025
1026 }
1027
1028 /* o */
1029 {
1030
1031 /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
1032 {
1033
1034 m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
1035 m_temp_reg_61 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73);
1036 //o0:1B*89+3B*75,1T*89+3T*75
1037 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
1038 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_61, m_coeff1);
1039
1040 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
1041 m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
1042
1043 }
1044
1045 /* Column 0 of destination computed here */
1046 /* It is stored in m_temp_reg_50 */
1047 /* Column 7 of destination computed here */
1048 /* It is stored in m_temp_reg_57 */
1049 {
1050
1051
1052 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
1053 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
1054
1055 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
1056 m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
1057
1058 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
1059 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
1060 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
1061 m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
1062
1063 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
1064 m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
1065 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
1066 m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
1067
1068 //o1:1B*75-3B*18,1T*75-3T*18
1069 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
1070 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_61, m_coeff3);
1071
1072 m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
1073 m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
1074
1075 /* Loading coeff for computing o2 in the next block */
1076
1077 m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[4][0]);
1078
1079 }
1080
1081 /* Column 1 of destination computed here */
1082 /* It is stored in m_temp_reg_51 */
1083 /* Column 6 of destination computed here */
1084 /* It is stored in m_temp_reg_56 */
1085 {
1086 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
1087 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
1088
1089 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33);
1090 m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_33);
1091
1092 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
1093 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
1094 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
1095 m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
1096
1097 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
1098 m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
1099 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
1100 m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
1101
1102 //o2:1B*50-3B*89,1T*50-3T*89
1103 m_temp_reg_34 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
1104 m_temp_reg_35 = _mm_madd_epi16(m_temp_reg_61, m_coeff1);
1105
1106 m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
1107 m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
1108
1109
1110 /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
1111
1112
1113 /* Loading coeff for computing o3 in the next block */
1114
1115 m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[6][0]);
1116
1117 }
1118
1119 /* Column 2 of destination computed here */
1120 /* It is stored in m_temp_reg_52 */
1121 /* Column 5 of destination computed here */
1122 /* It is stored in m_temp_reg_55 */
1123 {
1124 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
1125 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
1126
1127 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35);
1128 m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_35);
1129
1130 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
1131 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
1132 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
1133 m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
1134
1135 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
1136 m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
1137 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
1138 m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
1139
1140 //o3:1B*18-3B*50,1T*18-3T*50
1141 m_temp_reg_36 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
1142 m_temp_reg_37 = _mm_madd_epi16(m_temp_reg_61, m_coeff3);
1143
1144 m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
1145 m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
1146
1147
1148
1149 /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
1150
1151
1152 }
1153
1154 /* Column 3 of destination computed here */
1155 /* It is stored in m_temp_reg_53 */
1156 /* Column 4 of destination computed here */
1157 /* It is stored in m_temp_reg_54 */
1158 {
1159 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
1160 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
1161
1162 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37);
1163 m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37);
1164
1165 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
1166 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
1167 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
1168 m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
1169
1170 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
1171 m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
1172 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
1173 m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
1174
1175 m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
1176 m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
1177 }
1178 }
1179
1180 /* Transpose of the destination 8x8 matrix done here */
1181 /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */
1182 /* respectively */
1183 {
1184
1185
1186 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
1187 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
1188 m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51);
1189 m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53);
1190 m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
1191 m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
1192 m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15);
1193 m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15);
1194
1195 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
1196 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
1197 m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55);
1198 m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57);
1199 m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
1200 m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
1201 m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17);
1202 m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17);
1203
1204 m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
1205 m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
1206 m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
1207 m_temp_reg_53 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
1208
1209 m_temp_reg_54 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6);
1210 m_temp_reg_55 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6);
1211 m_temp_reg_56 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7);
1212 m_temp_reg_57 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7);
1213 }
1214 }
1215 else
1216 {
1217
1218 /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
1219 /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
1220 {
1221 //Interleaving 0,4 row in 0 , 1 Rishab
1222 /*coef2 for m_temp_reg_12 and m_temp_reg_13 , coef1 for m_temp_reg_10 and m_temp_reg_11*/
1223 m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[3][0]);
1224 m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[0][0]);
1225
1226 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74);
1227 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_74);
1228
1229 m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
1230 m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
1231
1232
1233 m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
1234 m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
1235 }
1236
1237
1238 /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
1239 /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
1240 {
1241
1242 m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83
1243 m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36
1244
1245 /* Combining instructions to eliminate them based on zero_rows : Lokesh */
1246 //Interleaving 2,6 row in 4, 5 Rishab
1247 m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
1248 m_temp_reg_5 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76);
1249
1250 m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1);
1251 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
1252
1253 m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_5, m_coeff1);
1254 m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_5, m_coeff2);
1255
1256
1257
1258 /* Loading coeff for computing o0, o1, o2 and o3 in the next block */
1259
1260 m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[2][0]);
1261 m_coeff4 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[3][0]);
1262
1263 m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[0][0]);
1264 m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[1][0]);
1265
1266 }
1267
1268 /* e */
1269 {
1270 /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
1271 /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
1272 /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
1273 /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
1274 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
1275 m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
1276
1277 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
1278 m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
1279
1280 m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17);
1281 m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17);
1282
1283 m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15);
1284 m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15);
1285
1286 }
1287
1288 /* o */
1289 {
1290
1291 /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
1292 {
1293
1294 m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
1295 m_temp_reg_61 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73);
1296 m_temp_reg_64 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77);
1297 m_temp_reg_65 = _mm_unpackhi_epi16(m_temp_reg_75, m_temp_reg_77);
1298 //o0:1B*89+3B*75,1T*89+3T*75,5B*50+7B*18,5T*50+7T*18
1299 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
1300 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_61, m_coeff1);
1301 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2);
1302 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_65, m_coeff2);
1303
1304
1305 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
1306 m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
1307
1308 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
1309 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25);
1310 }
1311
1312 /* Column 0 of destination computed here */
1313 /* It is stored in m_temp_reg_50 */
1314 /* Column 7 of destination computed here */
1315 /* It is stored in m_temp_reg_57 */
1316 {
1317
1318
1319 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
1320 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
1321
1322 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
1323 m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
1324
1325 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
1326 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
1327 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
1328 m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
1329
1330 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
1331 m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
1332 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
1333 m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
1334
1335 //o1:1B*75-3B*18,1T*75-3T*18,5B*89+7B*50,5T*89+7T*50
1336 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
1337 m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4);
1338 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_61, m_coeff3);
1339 m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_65, m_coeff4);
1340
1341 m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
1342 m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
1343
1344 /* Loading coeff for computing o2 in the next block */
1345
1346 m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[4][0]);
1347 m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[5][0]);
1348
1349 /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */
1350 m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_26);
1351 m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_27);
1352 }
1353
1354 /* Column 1 of destination computed here */
1355 /* It is stored in m_temp_reg_51 */
1356 /* Column 6 of destination computed here */
1357 /* It is stored in m_temp_reg_56 */
1358 {
1359 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
1360 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
1361
1362 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33);
1363 m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_33);
1364
1365 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
1366 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
1367 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
1368 m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
1369
1370 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
1371 m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
1372 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
1373 m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
1374
1375 //o2:1B*50-3B*89,1T*50-3T*89,5B*18+7B*75,5T*18+7T*75
1376 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
1377 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2);
1378 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_61, m_coeff1);
1379 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_65, m_coeff2);
1380
1381 m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
1382 m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
1383
1384
1385 /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
1386
1387
1388 /* Loading coeff for computing o3 in the next block */
1389
1390 m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[6][0]);
1391 m_coeff4 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[7][0]);
1392
1393 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
1394 m_temp_reg_35 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25);
1395 }
1396
1397 /* Column 2 of destination computed here */
1398 /* It is stored in m_temp_reg_52 */
1399 /* Column 5 of destination computed here */
1400 /* It is stored in m_temp_reg_55 */
1401 {
1402 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
1403 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
1404
1405 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35);
1406 m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_35);
1407
1408 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
1409 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
1410 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
1411 m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
1412
1413 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
1414 m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
1415 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
1416 m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
1417
1418 //o3:1B*18-3B*50,1T*18-3T*50,5B*75-7B*89,5T*75-7T*89
1419 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
1420 m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4);
1421 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_61, m_coeff3);
1422 m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_65, m_coeff4);
1423
1424 m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
1425 m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
1426
1427
1428
1429 /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
1430
1431
1432 m_temp_reg_36 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_26);
1433 m_temp_reg_37 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_27);
1434 }
1435
1436 /* Column 3 of destination computed here */
1437 /* It is stored in m_temp_reg_53 */
1438 /* Column 4 of destination computed here */
1439 /* It is stored in m_temp_reg_54 */
1440 {
1441 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
1442 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
1443
1444 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37);
1445 m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37);
1446
1447 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
1448 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
1449 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
1450 m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
1451
1452 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
1453 m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
1454 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
1455 m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
1456
1457 m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
1458 m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
1459 }
1460 }
1461
1462 /* Transpose of the destination 8x8 matrix done here */
1463 /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */
1464 /* respectively */
1465 {
1466
1467
1468 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
1469 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
1470 m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51);
1471 m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53);
1472 m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
1473 m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
1474 m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15);
1475 m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15);
1476
1477 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
1478 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
1479 m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55);
1480 m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57);
1481 m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
1482 m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
1483 m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17);
1484 m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17);
1485
1486 m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
1487 m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
1488 m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
1489 m_temp_reg_53 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
1490
1491 m_temp_reg_54 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6);
1492 m_temp_reg_55 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6);
1493 m_temp_reg_56 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7);
1494 m_temp_reg_57 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7);
1495 }
1496 }
1497 /* Stage 2 */
1498
1499 i4_shift = IDCT_STG2_SHIFT;
1500
1501 {
1502
1503 /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
1504 /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
1505 {
1506 m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q11[0][0]); //add
1507 m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q11[3][0]); //sub
1508
1509 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_54);
1510 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_54);
1511
1512 m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
1513 m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
1514 m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
1515 m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
1516
1517
1518 m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q11[1][0]);
1519 m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q11[2][0]);
1520 }
1521
1522
1523 /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
1524 /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
1525 {
1526 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_56);
1527 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_56);
1528
1529
1530 m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
1531 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
1532 m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
1533 m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
1534
1535 /* Loading coeff for computing o0 in the next block */
1536 m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[0][0]);
1537 m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[1][0]);
1538
1539
1540 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_51, m_temp_reg_53);
1541 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_51, m_temp_reg_53);
1542 }
1543
1544 /* e */
1545 {
1546 /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
1547 /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
1548 /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
1549 /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
1550 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
1551 m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
1552
1553 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
1554 m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
1555
1556 m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17);
1557 m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17);
1558
1559 m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15);
1560 m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15);
1561
1562 }
1563
1564 /* o */
1565 {
1566 m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_55, m_temp_reg_57);
1567 m_temp_reg_5 = _mm_unpackhi_epi16(m_temp_reg_55, m_temp_reg_57);
1568
1569 /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
1570 {
1571 //o0:1B*89+3B*75,1T*89+3T*75,5B*50+7B*18,5T*50+7T*18
1572 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
1573 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
1574 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
1575 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_5, m_coeff2);
1576
1577 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
1578 m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
1579 /* Loading coeff for computing o1 in the next block */
1580 m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[2][0]);
1581 m_coeff4 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[3][0]);
1582
1583 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
1584 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25);
1585 }
1586
1587 /* Column 0 of destination computed here */
1588 /* It is stored in m_temp_reg_50 */
1589 /* Column 7 of destination computed here */
1590 /* It is stored in m_temp_reg_57 */
1591 {
1592 m_temp_reg_2 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
1593 m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
1594
1595 m_temp_reg_3 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
1596 m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
1597
1598 m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
1599 m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
1600 m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
1601 m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
1602
1603 m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
1604 m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
1605 m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
1606 m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
1607
1608 //o1:1B*75-3B*18,1T*75-3T*18,5B*89+7B*50,5T*89+7T*50
1609 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff3);
1610 m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_4, m_coeff4);
1611 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
1612 m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_5, m_coeff4);
1613
1614 m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
1615 m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
1616
1617
1618 /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */
1619
1620
1621 /* Loading coeff for computing o2 in the next block */
1622 m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[4][0]);
1623 m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[5][0]);
1624
1625 m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_26);
1626 m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_27);
1627 }
1628
1629 /* Column 1 of destination computed here */
1630 /* It is stored in m_temp_reg_51 */
1631 /* Column 6 of destination computed here */
1632 /* It is stored in m_temp_reg_56 */
1633 {
1634 m_temp_reg_2 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
1635 m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
1636
1637 m_temp_reg_3 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33);
1638 m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_33);
1639
1640 m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
1641 m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
1642 m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
1643 m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
1644
1645 m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
1646 m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
1647 m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
1648 m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
1649
1650 //o2:1B*50-3B*89,1T*50-3T*89,5B*18+7B*75,5T*18+7T*75
1651 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
1652 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
1653 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
1654 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_5, m_coeff2);
1655
1656 m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
1657 m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
1658
1659
1660 /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
1661
1662 /* Loading coeff for computing o3 in the next block */
1663
1664 m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[6][0]);
1665 m_coeff4 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[7][0]);
1666
1667 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
1668 m_temp_reg_35 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25);
1669 }
1670
1671 /* Column 2 of destination computed here */
1672 /* It is stored in m_temp_reg_52 */
1673 /* Column 5 of destination computed here */
1674 /* It is stored in m_temp_reg_55 */
1675 {
1676 m_temp_reg_2 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
1677 m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
1678
1679 m_temp_reg_3 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35);
1680 m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_35);
1681
1682 m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
1683 m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
1684 m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
1685 m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
1686
1687 m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
1688 m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
1689 m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
1690 m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
1691
1692 //o3:1B*18-3B*50,1T*18-3T*50,5B*75-7B*89,5T*75-7T*89
1693 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff3);
1694 m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_4, m_coeff4);
1695 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
1696 m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_5, m_coeff4);
1697
1698 m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
1699 m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
1700
1701
1702
1703 /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
1704
1705
1706 m_temp_reg_36 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_26);
1707 m_temp_reg_37 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_27);
1708 }
1709
1710 /* Column 3 of destination computed here */
1711 /* It is stored in m_temp_reg_53 */
1712 /* Column 4 of destination computed here */
1713 /* It is stored in m_temp_reg_54 */
1714 {
1715 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
1716 m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
1717
1718 m_temp_reg_21 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37);
1719 m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37);
1720
1721 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_rdng_factor);
1722 m_temp_reg_21 = _mm_add_epi32(m_temp_reg_21, m_rdng_factor);
1723 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_rdng_factor);
1724 m_temp_reg_23 = _mm_add_epi32(m_temp_reg_23, m_rdng_factor);
1725
1726 m_temp_reg_20 = _mm_srai_epi32(m_temp_reg_20, i4_shift);
1727 m_temp_reg_21 = _mm_srai_epi32(m_temp_reg_21, i4_shift);
1728 m_temp_reg_22 = _mm_srai_epi32(m_temp_reg_22, i4_shift);
1729 m_temp_reg_23 = _mm_srai_epi32(m_temp_reg_23, i4_shift);
1730
1731 m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21);
1732 m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23);
1733 }
1734 }
1735
1736 /* Transpose of the destination 8x8 matrix done here */
1737 /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */
1738 /* respectively */
1739 {
1740 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
1741 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
1742 m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51);
1743 m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53);
1744 m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
1745 m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
1746 m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15);
1747 m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15);
1748
1749 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
1750 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
1751 m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55);
1752 m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57);
1753 m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
1754 m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
1755 m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17);
1756 m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17);
1757 m_temp_reg_10 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
1758 m_temp_reg_11 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
1759 m_temp_reg_12 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
1760 m_temp_reg_13 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
1761
1762 m_temp_reg_14 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6);
1763 m_temp_reg_15 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6);
1764 m_temp_reg_16 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7);
1765 m_temp_reg_17 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7);
1766 }
1767
1768 /* Recon and store */
1769 {
1770 m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_pred);
1771 pu1_pred += pred_strd;
1772 m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pu1_pred);
1773 pu1_pred += pred_strd;
1774 m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pu1_pred);
1775 pu1_pred += pred_strd;
1776 m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pu1_pred);
1777 pu1_pred += pred_strd;
1778 m_temp_reg_4 = _mm_loadl_epi64((__m128i *)pu1_pred);
1779 pu1_pred += pred_strd;
1780 m_temp_reg_5 = _mm_loadl_epi64((__m128i *)pu1_pred);
1781 pu1_pred += pred_strd;
1782 m_temp_reg_6 = _mm_loadl_epi64((__m128i *)pu1_pred);
1783 pu1_pred += pred_strd;
1784 m_temp_reg_7 = _mm_loadl_epi64((__m128i *)pu1_pred);
1785
1786
1787 m_temp_reg_50 = _mm_setzero_si128();
1788 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, m_temp_reg_50);
1789 m_temp_reg_1 = _mm_unpacklo_epi8(m_temp_reg_1, m_temp_reg_50);
1790 m_temp_reg_2 = _mm_unpacklo_epi8(m_temp_reg_2, m_temp_reg_50);
1791 m_temp_reg_3 = _mm_unpacklo_epi8(m_temp_reg_3, m_temp_reg_50);
1792 m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_4, m_temp_reg_50);
1793 m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_5, m_temp_reg_50);
1794 m_temp_reg_6 = _mm_unpacklo_epi8(m_temp_reg_6, m_temp_reg_50);
1795 m_temp_reg_7 = _mm_unpacklo_epi8(m_temp_reg_7, m_temp_reg_50);
1796
1797 m_temp_reg_50 = _mm_add_epi16(m_temp_reg_10, m_temp_reg_0);
1798 m_temp_reg_51 = _mm_add_epi16(m_temp_reg_11, m_temp_reg_1);
1799 m_temp_reg_52 = _mm_add_epi16(m_temp_reg_12, m_temp_reg_2);
1800 m_temp_reg_53 = _mm_add_epi16(m_temp_reg_13, m_temp_reg_3);
1801 m_temp_reg_54 = _mm_add_epi16(m_temp_reg_14, m_temp_reg_4);
1802 m_temp_reg_55 = _mm_add_epi16(m_temp_reg_15, m_temp_reg_5);
1803 m_temp_reg_56 = _mm_add_epi16(m_temp_reg_16, m_temp_reg_6);
1804 m_temp_reg_57 = _mm_add_epi16(m_temp_reg_17, m_temp_reg_7);
1805
1806 m_temp_reg_50 = _mm_packus_epi16(m_temp_reg_50, m_temp_reg_50);
1807 m_temp_reg_51 = _mm_packus_epi16(m_temp_reg_51, m_temp_reg_51);
1808 m_temp_reg_52 = _mm_packus_epi16(m_temp_reg_52, m_temp_reg_52);
1809 m_temp_reg_53 = _mm_packus_epi16(m_temp_reg_53, m_temp_reg_53);
1810 m_temp_reg_54 = _mm_packus_epi16(m_temp_reg_54, m_temp_reg_54);
1811 m_temp_reg_55 = _mm_packus_epi16(m_temp_reg_55, m_temp_reg_55);
1812 m_temp_reg_56 = _mm_packus_epi16(m_temp_reg_56, m_temp_reg_56);
1813 m_temp_reg_57 = _mm_packus_epi16(m_temp_reg_57, m_temp_reg_57);
1814
1815 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_50);
1816 pu1_dst += dst_strd;
1817 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_51);
1818 pu1_dst += dst_strd;
1819 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_52);
1820 pu1_dst += dst_strd;
1821 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_53);
1822 pu1_dst += dst_strd;
1823 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_54);
1824 pu1_dst += dst_strd;
1825 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_55);
1826 pu1_dst += dst_strd;
1827 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_56);
1828 pu1_dst += dst_strd;
1829 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_57);
1830 pu1_dst += dst_strd;
1831
1832 }
1833
1834
1835 }
1836
1837
1838 }
1839 }
1840
/**
 *******************************************************************************
 *
 * @brief
 *  Inverse transform of a block in which only the first coefficient is used,
 *  with a per-sample stage 2 additive term, followed by reconstruction of
 *  an 8x8 block
 *
 * @par Description:
 *  pi2_src[0] is multiplied by gai2_impeg2_idct_q15[0], rounded and shifted
 *  down by IDCT_STG1_SHIFT, and then multiplied by gai2_impeg2_idct_q11[0].
 *  For every sample of the 8x8 block the corresponding entry of
 *  gai2_impeg2_mismatch_stg2_additive is added to that value, the sum is
 *  rounded with IDCT_STG2_ROUND and shifted down by IDCT_STG2_SHIFT, the
 *  prediction sample is added, and the result is packed to 8 bit with
 *  saturation before being written to the destination.
 *
 * @param[in] pi2_src
 *  Input coefficients; only the first entry is read
 *
 * @param[in] pi2_tmp
 *  Unused
 *
 * @param[in] pu1_pred
 *  Prediction block; 8 bytes are read from each of 8 rows
 *
 * @param[out] pu1_dst
 *  Destination block; 8 bytes are written to each of 8 rows
 *
 * @param[in] src_strd
 *  Unused
 *
 * @param[in] pred_strd
 *  Offset in bytes between consecutive rows of pu1_pred
 *
 * @param[in] dst_strd
 *  Offset in bytes between consecutive rows of pu1_dst
 *
 * @param[in] zero_cols
 *  Unused
 *
 * @param[in] zero_rows
 *  Unused
 *
 * @returns  None
 *
 *******************************************************************************
 */
void impeg2_idct_recon_dc_mismatch_sse42(WORD16 *pi2_src,
                                         WORD16 *pi2_tmp,
                                         UWORD8 *pu1_pred,
                                         UWORD8 *pu1_dst,
                                         WORD32 src_strd,
                                         WORD32 pred_strd,
                                         WORD32 dst_strd,
                                         WORD32 zero_cols,
                                         WORD32 zero_rows)
{
    WORD32 i4_dc;
    WORD32 i4_row;
    __m128i m_dc_4x32b;
    __m128i m_stg2_round = _mm_set1_epi32(IDCT_STG2_ROUND);

    UNUSED(pi2_tmp);
    UNUSED(src_strd);
    UNUSED(zero_cols);
    UNUSED(zero_rows);

    /* Stage 1 is completed in scalar code; of stage 2 only the multiply is */
    /* done here, since the additive term differs for every sample          */
    i4_dc = pi2_src[0] * gai2_impeg2_idct_q15[0];
    i4_dc = (i4_dc + IDCT_STG1_ROUND) >> IDCT_STG1_SHIFT;
    i4_dc = i4_dc * gai2_impeg2_idct_q11[0];
    m_dc_4x32b = _mm_set1_epi32(i4_dc);

    /* One iteration per row of the 8x8 block */
    for(i4_row = 0; i4_row < 8; i4_row++)
    {
        __m128i m_additive, m_pred;
        __m128i m_pred_lo, m_pred_hi;
        __m128i m_res_lo, m_res_hi;

        /* Eight 16 bit additive terms and eight prediction bytes of the row */
        m_additive = _mm_loadu_si128(
                        (__m128i *)(gai2_impeg2_mismatch_stg2_additive + 8 * i4_row));
        m_pred = _mm_loadl_epi64((__m128i *)(pu1_pred + i4_row * pred_strd));

        /* Prediction: 8 bit -> 16 bit -> two vectors of four 32 bit lanes */
        m_pred = _mm_cvtepu8_epi16(m_pred);
        m_pred_lo = _mm_cvtepu16_epi32(m_pred);
        m_pred_hi = _mm_cvtepu16_epi32(_mm_srli_si128(m_pred, 8));

        /* Additive terms: sign extended to two vectors of four 32 bit lanes */
        m_res_lo = _mm_cvtepi16_epi32(m_additive);
        m_res_hi = _mm_cvtepi16_epi32(_mm_srli_si128(m_additive, 8));

        /* (value + additive + round) >> shift */
        m_res_lo = _mm_add_epi32(m_res_lo, m_dc_4x32b);
        m_res_hi = _mm_add_epi32(m_res_hi, m_dc_4x32b);
        m_res_lo = _mm_add_epi32(m_res_lo, m_stg2_round);
        m_res_hi = _mm_add_epi32(m_res_hi, m_stg2_round);
        m_res_lo = _mm_srai_epi32(m_res_lo, IDCT_STG2_SHIFT);
        m_res_hi = _mm_srai_epi32(m_res_hi, IDCT_STG2_SHIFT);

        /* Add prediction */
        m_res_lo = _mm_add_epi32(m_res_lo, m_pred_lo);
        m_res_hi = _mm_add_epi32(m_res_hi, m_pred_hi);

        /* 32 bit -> 16 bit -> 8 bit with saturation; only the lower 8 bytes */
        /* of the final pack are stored                                      */
        m_res_lo = _mm_packus_epi32(m_res_lo, m_res_hi);
        m_res_lo = _mm_packus_epi16(m_res_lo, m_res_lo);

        _mm_storel_epi64((__m128i *)(pu1_dst + i4_row * dst_strd), m_res_lo);
    }
}
2075
/**
 *******************************************************************************
 *
 * @brief
 *  Inverse transform of a block in which only the first coefficient is used,
 *  followed by reconstruction of an 8x8 block
 *
 * @par Description:
 *  pi2_src[0] is multiplied by gai2_impeg2_idct_q15[0], rounded and shifted
 *  down by IDCT_STG1_SHIFT, multiplied by gai2_impeg2_idct_q11[0], and
 *  rounded and shifted down by IDCT_STG2_SHIFT. The resulting single value
 *  is added to every prediction sample of the 8x8 block and the sums are
 *  packed to 8 bit with saturation before being written to the destination.
 *
 * @param[in] pi2_src
 *  Input coefficients; only the first entry is read
 *
 * @param[in] pi2_tmp
 *  Unused
 *
 * @param[in] pu1_pred
 *  Prediction block; 8 bytes are read from each of 8 rows
 *
 * @param[out] pu1_dst
 *  Destination block; 8 bytes are written to each of 8 rows
 *
 * @param[in] src_strd
 *  Unused
 *
 * @param[in] pred_strd
 *  Offset in bytes between consecutive rows of pu1_pred
 *
 * @param[in] dst_strd
 *  Offset in bytes between consecutive rows of pu1_dst
 *
 * @param[in] zero_cols
 *  Unused
 *
 * @param[in] zero_rows
 *  Unused
 *
 * @returns  None
 *
 *******************************************************************************
 */
void impeg2_idct_recon_dc_sse42(WORD16 *pi2_src,
                                WORD16 *pi2_tmp,
                                UWORD8 *pu1_pred,
                                UWORD8 *pu1_dst,
                                WORD32 src_strd,
                                WORD32 pred_strd,
                                WORD32 dst_strd,
                                WORD32 zero_cols,
                                WORD32 zero_rows)
{
    WORD32 i4_dc;
    WORD32 i4_pair;
    __m128i m_dc_4x32b;

    UNUSED(pi2_tmp);
    UNUSED(src_strd);
    UNUSED(zero_cols);
    UNUSED(zero_rows);

    /* Both transform stages collapse to scalar arithmetic on one value */
    i4_dc = pi2_src[0] * gai2_impeg2_idct_q15[0];
    i4_dc = (i4_dc + IDCT_STG1_ROUND) >> IDCT_STG1_SHIFT;
    i4_dc = i4_dc * gai2_impeg2_idct_q11[0];
    i4_dc = (i4_dc + IDCT_STG2_ROUND) >> IDCT_STG2_SHIFT;

    m_dc_4x32b = _mm_set1_epi32(i4_dc);

    /* The block is handled two rows at a time: rows 0-1, 2-3, 4-5, 6-7 */
    for(i4_pair = 0; i4_pair < 4; i4_pair++)
    {
        __m128i m_pred_r0, m_pred_r1;
        __m128i m_r0_lo, m_r0_hi, m_r1_lo, m_r1_hi;

        /* Both prediction rows are read before either output row is written */
        m_pred_r0 = _mm_loadl_epi64((__m128i *)pu1_pred);
        m_pred_r1 = _mm_loadl_epi64((__m128i *)(pu1_pred + pred_strd));

        /* 8 bit -> 16 bit */
        m_pred_r0 = _mm_cvtepu8_epi16(m_pred_r0);
        m_pred_r1 = _mm_cvtepu8_epi16(m_pred_r1);

        /* 16 bit -> two vectors of four 32 bit lanes per row */
        m_r0_lo = _mm_cvtepu16_epi32(m_pred_r0);
        m_r0_hi = _mm_cvtepu16_epi32(_mm_srli_si128(m_pred_r0, 8));
        m_r1_lo = _mm_cvtepu16_epi32(m_pred_r1);
        m_r1_hi = _mm_cvtepu16_epi32(_mm_srli_si128(m_pred_r1, 8));

        /* Add the residual value to every prediction sample */
        m_r0_lo = _mm_add_epi32(m_r0_lo, m_dc_4x32b);
        m_r0_hi = _mm_add_epi32(m_r0_hi, m_dc_4x32b);
        m_r1_lo = _mm_add_epi32(m_r1_lo, m_dc_4x32b);
        m_r1_hi = _mm_add_epi32(m_r1_hi, m_dc_4x32b);

        /* 32 bit -> 16 bit -> 8 bit with saturation; only the lower 8 bytes */
        /* of each final pack are stored                                     */
        m_r0_lo = _mm_packus_epi32(m_r0_lo, m_r0_hi);
        m_r1_lo = _mm_packus_epi32(m_r1_lo, m_r1_hi);
        m_r0_lo = _mm_packus_epi16(m_r0_lo, m_r0_lo);
        m_r1_lo = _mm_packus_epi16(m_r1_lo, m_r1_lo);

        _mm_storel_epi64((__m128i *)pu1_dst, m_r0_lo);
        _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), m_r1_lo);

        /* Step to the next row pair; not done after the last pair so the */
        /* pointers are never moved beyond the rows that are accessed     */
        if(i4_pair < 3)
        {
            pu1_pred += 2 * pred_strd;
            pu1_dst += 2 * dst_strd;
        }
    }
}
2206