1 /******************************************************************************
2 *
3 * Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at:
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 *
17 ******************************************************************************/
18 /**
19 *******************************************************************************
20 * @file
21 * ihevc_32x32_itrans_recon_x86_intr.c
22 *
23 * @brief
24 * Contains function definitions for inverse quantization, inverse
25 * transform and reconstruction
26 *
27 * @author
28 * 100470
29 *
30 * @par List of Functions:
31 * - ihevc_itrans_recon_32x32_sse42()
32 *
33 * @remarks
34 * None
35 *
36 *******************************************************************************
37 */
38 #include <stdio.h>
39 #include <string.h>
40 #include "ihevc_typedefs.h"
41 #include "ihevc_platform_macros.h"
42 #include "ihevc_macros.h"
43 #include "ihevc_defs.h"
44 #include "ihevc_trans_tables.h"
45 #include "ihevc_iquant_itrans_recon.h"
46 #include "ihevc_func_selector.h"
47 #include "ihevc_trans_macros.h"
48
49 #include <emmintrin.h>
50 #include <smmintrin.h>
51 #include <tmmintrin.h>
52
53 /**
54 *******************************************************************************
55 *
56 * @brief
57 * This function performs inverse quantization, inverse transform and
58 * reconstruction for 32x32 input block
59 *
60 * @par Description:
61 * Performs inverse quantization , inverse transform and adds the
62 * prediction data and clips output to 8 bit
63 *
64 * @param[in] pi2_src
65 * Input 32x32 coefficients
66 *
67 * @param[in] pi2_tmp
68 * Temporary buffer for scratch data and inverse transform 1st stage
69 * output (the code offsets into it by 1024 and 2048 WORD16 elements)
70 *
71 * @param[in] pu1_pred
72 * Prediction 32x32 block
73 *
74 * @param[in] pi2_dequant_coeff
75 * Dequant Coeffs
76 *
77 * @param[out] pu1_dst
78 * Output 32x32 block
79 *
80 * @param[in] qp_div
81 * Quantization parameter / 6
82 *
83 * @param[in] qp_rem
84 * Quantization parameter % 6
85 *
86 * @param[in] src_strd
87 * Input stride
88 *
89 * @param[in] pred_strd
90 * Prediction stride
91 *
92 * @param[in] dst_strd
93 * Output Stride
94 *
95 * @param[in] zero_cols
96 * Zero columns in pi2_src
97 *
98 * @returns Void
99 *
100 * @remarks
101 * None
102 *
103 *******************************************************************************
104 */
105 /**/
106
ihevc_itrans_recon_32x32_sse42(WORD16 * pi2_src,WORD16 * pi2_tmp,UWORD8 * pu1_pred,UWORD8 * pu1_dst,WORD32 src_strd,WORD32 pred_strd,WORD32 dst_strd,WORD32 zero_cols,WORD32 zero_rows)107 void ihevc_itrans_recon_32x32_sse42(WORD16 *pi2_src,
108 WORD16 *pi2_tmp,
109 UWORD8 *pu1_pred,
110 UWORD8 *pu1_dst,
111 WORD32 src_strd,
112 WORD32 pred_strd,
113 WORD32 dst_strd,
114 WORD32 zero_cols,
115 WORD32 zero_rows)
116 {
117 /* Inverse Transform */
118
119 WORD32 j;
120
121
122 WORD16 *pi2_tmp_orig;
123
124
125 WORD16 *o_temp_ptr;
126 WORD16 *temp_ptr;
127
128 __m128i m_temp_reg_0;
129 __m128i m_temp_reg_1;
130 __m128i m_temp_reg_2;
131 __m128i m_temp_reg_3;
132 __m128i m_temp_reg_4;
133 __m128i m_temp_reg_5;
134 __m128i m_temp_reg_6;
135 __m128i m_temp_reg_7;
136 __m128i m_temp_reg_10;
137 __m128i m_temp_reg_11;
138 __m128i m_temp_reg_12;
139 __m128i m_temp_reg_13;
140 __m128i m_temp_reg_14;
141 __m128i m_temp_reg_15;
142 __m128i m_temp_reg_16;
143 __m128i m_temp_reg_17;
144 __m128i m_temp_reg_18;
145 __m128i m_temp_reg_19;
146 __m128i m_temp_reg_20;
147 __m128i m_temp_reg_21;
148 __m128i m_temp_reg_22;
149 __m128i m_temp_reg_23;
150 __m128i m_temp_reg_30;
151 __m128i m_temp_reg_31;
152 __m128i m_temp_reg_32;
153 __m128i m_temp_reg_33;
154 __m128i m_temp_reg_34;
155 __m128i m_temp_reg_35;
156 __m128i m_temp_reg_36;
157 __m128i m_temp_reg_37;
158 __m128i m_temp_reg_40;
159 __m128i m_temp_reg_41;
160 __m128i m_temp_reg_42;
161 __m128i m_temp_reg_43;
162 __m128i m_temp_reg_44;
163 __m128i m_temp_reg_45;
164 __m128i m_temp_reg_46;
165 __m128i m_temp_reg_47;
166
167 __m128i m_temp_reg_70;
168 __m128i m_temp_reg_71;
169 __m128i m_temp_reg_72;
170 __m128i m_temp_reg_73;
171 __m128i m_temp_reg_74;
172 __m128i m_temp_reg_75;
173 __m128i m_temp_reg_76;
174 __m128i m_temp_reg_77;
175
176 __m128i m_temp_reg_80;
177 __m128i m_temp_reg_81;
178 __m128i m_temp_reg_82;
179 __m128i m_temp_reg_83;
180 __m128i m_temp_reg_84;
181 __m128i m_temp_reg_85;
182 __m128i m_temp_reg_86;
183 __m128i m_temp_reg_87;
184
185 __m128i m_temp_reg_90;
186 __m128i m_temp_reg_91;
187 __m128i m_temp_reg_92;
188 __m128i m_temp_reg_93;
189 __m128i m_temp_reg_94;
190 __m128i m_temp_reg_95;
191 __m128i m_temp_reg_96;
192 __m128i m_temp_reg_97;
193
194 __m128i m_rdng_factor;
195 __m128i m_count;
196 __m128i m_coeff1, m_coeff2, m_coeff3, m_coeff4;
197 __m128i m_coeff5, m_coeff6, m_coeff7, m_coeff8;
198
199 __m128i temp1, temp2, temp3, temp4;
200 __m128i temp5, temp6, temp7, temp8;
201
202 __m128i all_zero_reg;
203 WORD32 i;
204
205 /*Lokesh*/
206 WORD32 zero_last24_cols_stg1;
207 WORD32 zero_last24_rows_stg1;
208 WORD32 zero_last28_rows_stg1;
209
210 WORD32 zero_last28_rows_stg2;
211 WORD32 zero_last24_rows_stg2;
212
213 WORD32 trans_size_stg1;
214
215 WORD32 i4_shift = IT_SHIFT_STAGE_1;
216 WORD32 trans_size = TRANS_SIZE_32;
217
218
219 /* Last 24 (or 28) cols/rows of the 32x32 block are skipped based on the below flags : Lokesh */
220 zero_last24_cols_stg1 = ((zero_cols & 0xFFFFFF00) == 0xFFFFFF00) ? 1 : 0;
221 zero_last24_rows_stg1 = ((zero_rows & 0xFFFFFF00) == 0xFFFFFF00) ? 1 : 0;
222 zero_last28_rows_stg1 = ((zero_rows & 0xFFFFFFF0) == 0xFFFFFFF0) ? 1 : 0;
223
224 zero_last28_rows_stg2 = ((zero_cols & 0xFFFFFFF0) == 0xFFFFFFF0) ? 1 : 0;
225 zero_last24_rows_stg2 = zero_last24_cols_stg1;
226
227 if((zero_last28_rows_stg2) || (zero_last24_cols_stg1))
228 {
229 trans_size_stg1 = 8;
230
231 }
232 else
233 {
234 trans_size_stg1 = 32;
235 }
236
237 all_zero_reg = _mm_setzero_si128();
238
239 o_temp_ptr = pi2_tmp;
240 temp_ptr = (pi2_tmp + 1024);
241
242 pi2_tmp += 2048;
243 pi2_tmp_orig = pi2_tmp;
244
245 for(i = 0; i < trans_size_stg1; i += 8)
246 {
247
248 {
249 WORD16 *pi2_tmp_src = pi2_src;
250
251 m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
252 pi2_tmp_src += (src_strd << 1);
253 m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
254 pi2_tmp_src += (src_strd << 1);
255 m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
256 pi2_tmp_src += (src_strd << 1);
257 m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
258 pi2_tmp_src += (src_strd << 1);
259 m_temp_reg_74 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
260 pi2_tmp_src += (src_strd << 1);
261 m_temp_reg_75 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
262 pi2_tmp_src += (src_strd << 1);
263 m_temp_reg_76 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
264 pi2_tmp_src += (src_strd << 1);
265 m_temp_reg_77 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
266 pi2_tmp_src += (src_strd << 1);
267
268 m_temp_reg_80 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
269 pi2_tmp_src += (src_strd << 1);
270 m_temp_reg_81 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
271 pi2_tmp_src += (src_strd << 1);
272 m_temp_reg_82 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
273 pi2_tmp_src += (src_strd << 1);
274 m_temp_reg_83 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
275 pi2_tmp_src += (src_strd << 1);
276 m_temp_reg_84 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
277 pi2_tmp_src += (src_strd << 1);
278 m_temp_reg_85 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
279 pi2_tmp_src += (src_strd << 1);
280 m_temp_reg_86 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
281 pi2_tmp_src += (src_strd << 1);
282 m_temp_reg_87 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
283 }
284
285 if(zero_last28_rows_stg1)
286 {
287 /* eeo */
288 /* eeeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */
289 /* eeeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */
290 {
291 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64
292
293 m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg);
294
295 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
296
297 /* eeeo[0]= m_temp_reg_20 */
298 /* eeeo[1]= m_temp_reg_21 */
299 /* eeee[0]= m_temp_reg_22 */
300 /* eeee[1]= m_temp_reg_23 */
301
302 /* eee[0] = eeee[0] + eeeo[0]; */
303 m_temp_reg_40 = m_temp_reg_14;
304
305 /* eee[3] = eeee[0] - eeeo[0]; */
306 m_temp_reg_43 = m_temp_reg_14;
307
308 /* eee[2] = eeee[1] - eeeo[1]; */
309 m_temp_reg_42 = m_temp_reg_14; //m_temp_reg_16;
310
311 /* eee[1] = eeee[1] + eeeo[1];*/
312 m_temp_reg_41 = m_temp_reg_14; //m_temp_reg_16;
313
314 m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8);
315
316 m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg);
317
318 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
319
320 /* eeeo[0]= m_temp_reg_20 */
321 /* eeeo[1]= m_temp_reg_21 */
322 /* eeee[0]= m_temp_reg_22 */
323 /* eeee[1]= m_temp_reg_23 */
324
325 /* eee[0] = eeee[0] + eeeo[0]; */
326 m_temp_reg_44 = m_temp_reg_14;
327
328 /* eee[3] = eeee[0] - eeeo[0]; */
329 m_temp_reg_47 = m_temp_reg_14;
330
331 /* eee[2] = eeee[1] - eeeo[1]; */
332 m_temp_reg_46 = m_temp_reg_14; //m_temp_reg_16;
333
334 /* eee[1] = eeee[1] + eeeo[1];*/
335 m_temp_reg_45 = m_temp_reg_14; //m_temp_reg_16;
336
337
338 }
339 /* eo */
340 {
341 WORD16 *pi2_scratch = o_temp_ptr;
342
343 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90
344 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87
345 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[6][0]); //80
346 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[7][0]); //70
347 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[2][0]); //57
348 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[19][0]); //43
349 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[3][0]); //25
350 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[5][0]); //9
351
352 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, all_zero_reg);
353
354 m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8);
355
356 /* eo0[0-3] */
357 {
358 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
359
360 m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_71, all_zero_reg);
361
362 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
363 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
364
365 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
366 pi2_scratch += 8;
367 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
368 pi2_scratch += 8;
369
370 }
371
372 /* eo0[4-7] */
373 {
374 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
375
376 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
377 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
378
379 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
380 pi2_scratch += 8;
381 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
382 pi2_scratch += 8;
383
384 }
385 /* eo1[0-3] */
386 {
387 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff2);
388
389 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_30);
390 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_30);
391
392 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
393 pi2_scratch += 8;
394 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
395 pi2_scratch += 8;
396
397 }
398
399 /* eo1[4-7] */
400 {
401 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff2);
402
403 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_30);
404 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_30);
405
406 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
407 pi2_scratch += 8;
408 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
409 pi2_scratch += 8;
410
411 }
412
413 /* eo2[0-3] */
414 {
415 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
416
417 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
418 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
419
420 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
421 pi2_scratch += 8;
422 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
423 pi2_scratch += 8;
424
425 }
426
427 /* eo2[4-7] */
428 {
429 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
430
431 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
432 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
433
434 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
435 pi2_scratch += 8;
436 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
437 pi2_scratch += 8;
438
439 }
440
441 /**************************************************************************/
442
443
444 /* eo3[0-3] */
445 {
446 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff4);
447
448 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_30);
449 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_30);
450
451 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
452 pi2_scratch += 8;
453 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
454 pi2_scratch += 8;
455
456 }
457
458 /* eo3[4-7] */
459 {
460 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff4);
461
462 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_30);
463 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_30);
464
465 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
466 pi2_scratch += 8;
467 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
468 pi2_scratch += 8;
469
470 }
471
472
473 /* eo4[0-3] */
474 {
475 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
476
477 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_30);
478 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_30);
479
480 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
481 pi2_scratch += 8;
482 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
483 pi2_scratch += 8;
484
485 }
486 /* eo4[4-7] */
487 {
488 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
489
490 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_30);
491 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_30);
492
493 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
494 pi2_scratch += 8;
495 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
496 pi2_scratch += 8;
497
498 }
499
500 /***********************************************************************/
501
502 /* eo5[0-3] */
503 {
504 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff6);
505
506 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
507 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
508
509 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
510 pi2_scratch += 8;
511 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
512 pi2_scratch += 8;
513
514 }
515
516
517 /* eo5[4-7] */
518 {
519 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff6);
520
521 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
522 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
523
524 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
525 pi2_scratch += 8;
526 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
527 pi2_scratch += 8;
528
529 }
530
531 /* eo6[0-3] */
532 {
533 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff7);
534
535 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_30);
536 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_30);
537
538 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
539 pi2_scratch += 8;
540 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
541 pi2_scratch += 8;
542
543 }
544
545
546 /* eo6[4-7] */
547 {
548 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff7);
549
550 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_30);
551 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_30);
552
553 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
554 pi2_scratch += 8;
555 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
556 pi2_scratch += 8;
557
558 }
559
560
561 /* eo7[0-3] */
562 {
563 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff8);
564
565 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
566 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
567
568 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
569 pi2_scratch += 8;
570 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
571 pi2_scratch += 8;
572
573 }
574
575
576 /* eo7[4-7] */
577 {
578 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff8);
579
580 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
581 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
582
583 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
584 pi2_scratch += 8;
585 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
586 pi2_scratch += 8;
587
588 }
589
590 }
591 }
592 else if(zero_last24_rows_stg1)
593 {
594 {
595 /* eeo */
596 /* eeeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */
597 /* eeeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */
598
599 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[0][0]); //83 36
600 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[1][0]); //36 -83
601
602 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64 64
603
604 m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg);
605
606 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
607
608 /* eeeo[0]= m_temp_reg_20 */
609 /* eeeo[1]= m_temp_reg_21 */
610 /* eeee[0]= m_temp_reg_22 */
611 /* eeee[1]= m_temp_reg_23 */
612
613 /* eee[0] = eeee[0] + eeeo[0]; */
614 m_temp_reg_40 = m_temp_reg_14;
615
616 /* eee[3] = eeee[0] - eeeo[0]; */
617 m_temp_reg_43 = m_temp_reg_14;
618
619 /* eee[2] = eeee[1] - eeeo[1]; */
620 m_temp_reg_42 = m_temp_reg_14; //m_temp_reg_16;
621
622 /* eee[1] = eeee[1] + eeeo[1];*/
623 m_temp_reg_41 = m_temp_reg_14; //m_temp_reg_16;
624
625 /* for row 4 to 7 */
626
627 m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8);
628
629 m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg);
630
631 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
632
633 /* eeeo[0]= m_temp_reg_20 */
634 /* eeeo[1]= m_temp_reg_21 */
635 /* eeee[0]= m_temp_reg_22 */
636 /* eeee[1]= m_temp_reg_23 */
637
638 /* eee[0] = eeee[0] + eeeo[0]; */
639 m_temp_reg_44 = m_temp_reg_14;
640
641 /* eee[3] = eeee[0] - eeeo[0]; */
642 m_temp_reg_47 = m_temp_reg_14;
643
644 /* eee[2] = eeee[1] - eeeo[1]; */
645 m_temp_reg_46 = m_temp_reg_14; //m_temp_reg_16;
646
647 /* eee[1] = eeee[1] + eeeo[1];*/
648 m_temp_reg_45 = m_temp_reg_14; //m_temp_reg_16;
649
650
651 // eeo[]
652 /* for(k = 0; k < 4; k++) */
653
654 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[4][0]); //89 75
655 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[6][0]); //75
656 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[9][0]); //18
657 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[5][0]); //50 18
658
659 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_72, all_zero_reg);
660
661 m_temp_reg_72 = _mm_srli_si128(m_temp_reg_72, 8);
662
663 m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_72, all_zero_reg);
664
665 m_temp_reg_33 = _mm_setzero_si128();
666
667 /* eeo */
668 {
669 /* eeo0[0-3] */
670 {
671 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
672
673 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
674 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
675
676 m_temp_reg_90 = m_temp_reg_34;
677 m_temp_reg_97 = m_temp_reg_35;
678 }
679 /* eeo0[4-7] */
680 {
681 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
682
683 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
684 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
685
686 m_temp_reg_91 = m_temp_reg_34;
687 m_temp_reg_96 = m_temp_reg_35;
688
689 }
690
691 /* eeo1[0-3] */
692 {
693 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff2);
694
695 /* e[1][0-3] stored in pi2_tmp[2][0-7] */
696 /* e[6][0-3] stored in pi2_tmp[2][8-15] */
697 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_30);
698 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_30);
699
700 m_temp_reg_92 = m_temp_reg_34;
701 m_temp_reg_95 = m_temp_reg_35;
702
703 }
704
705 /* eo1[4-7] */
706 {
707 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff2);
708
709 /* e[1][4-7] stored in pi2_tmp[3][0-7] */
710 /* e[6][4-7] stored in pi2_tmp[3][8-15] */
711 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_30);
712 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_30);
713
714 m_temp_reg_93 = m_temp_reg_34;
715 m_temp_reg_94 = m_temp_reg_35;
716
717
718 }
719
720 /* eo2[0-3] */
721 {
722 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff4);
723
724 /* e[2][0-3] stored in pi2_tmp[4][0-7] */
725 /* e[5][0-3] stored in pi2_tmp[4][8-15] */
726 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
727 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
728
729 temp1 = m_temp_reg_34;
730 temp7 = m_temp_reg_35;
731
732 }
733
734 /* eo2[4-7] */
735 {
736 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff4);
737
738 /* e[2][4-7] stored in pi2_tmp[5][0-7] */
739 /* e[5][4-7] stored in pi2_tmp[5][8-15] */
740 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
741 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
742
743 temp2 = m_temp_reg_34;
744 temp6 = m_temp_reg_35;
745
746 }
747
748 /* eo3[0-3] */
749 {
750 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
751
752 /* e[3][0-3] stored in pi2_tmp[6][0-7] */
753 /* e[4][0-3] stored in pi2_tmp[6][8-15] */
754 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_30);
755 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_30);
756
757 temp3 = m_temp_reg_34;
758 temp5 = m_temp_reg_35;
759
760 }
761
762
763 /* eo3[4-7] */
764 {
765 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
766
767 /* e[3][4-7] stored in pi2_tmp[7][0-7] */
768 /* e[4][4-7] stored in pi2_tmp[7][8-15] */
769 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_30);
770 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_30);
771
772 temp4 = m_temp_reg_34;
773 temp8 = m_temp_reg_35;
774
775
776 }
777 /* All values of ee[] array in pi2_temp */
778
779 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90 87
780 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[1][0]); //80 70
781 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[2][0]); //57 43
782 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[3][0]); //25 9
783
784 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
785
786 m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8);
787 m_temp_reg_73 = _mm_srli_si128(m_temp_reg_73, 8);
788
789 }
790 }
791 /* eo */
792 {
793
794 WORD16 *pi2_scratch = o_temp_ptr;
795
796 /* eo0[0-3] */
797 {
798 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
799
800 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_90, m_temp_reg_30);
801 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_90, m_temp_reg_30);
802
803 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
804 pi2_scratch += 8;
805 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
806 pi2_scratch += 8;
807
808 }
809
810
811 /* eo0[4-7] */
812 {
813 m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
814
815 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
816
817 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_91, m_temp_reg_30);
818 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_91, m_temp_reg_30);
819
820 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
821 pi2_scratch += 8;
822 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
823 pi2_scratch += 8;
824
825 }
826
827 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87 57
828
829 /* eo1[0-3] */
830 {
831 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
832
833 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_92, m_temp_reg_30);
834 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_92, m_temp_reg_30);
835
836 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
837 pi2_scratch += 8;
838 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
839 pi2_scratch += 8;
840
841 }
842
843
844 /* eo1[4-7] */
845 {
846 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
847
848 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_93, m_temp_reg_30);
849 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_93, m_temp_reg_30);
850
851 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
852 pi2_scratch += 8;
853 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
854 pi2_scratch += 8;
855
856 }
857
858 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[8][0]); //80 9
859
860 /* eo2[0-3] */
861 {
862
863 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
864
865 m_temp_reg_34 = _mm_add_epi32(temp1, m_temp_reg_30);
866 m_temp_reg_35 = _mm_sub_epi32(temp1, m_temp_reg_30);
867
868 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
869 pi2_scratch += 8;
870 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
871 pi2_scratch += 8;
872
873 }
874
875 /* eo2[4-7] */
876 {
877
878 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
879
880 m_temp_reg_34 = _mm_add_epi32(temp2, m_temp_reg_30);
881 m_temp_reg_35 = _mm_sub_epi32(temp2, m_temp_reg_30);
882
883 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
884 pi2_scratch += 8;
885 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
886 pi2_scratch += 8;
887
888 }
889
890 /**************************************************************************/
891
892
893
894 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[12][0]); //70 -43
895
896 /* eo3[0-3] */
897 {
898
899 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
900
901 m_temp_reg_34 = _mm_add_epi32(temp3, m_temp_reg_30);
902 m_temp_reg_35 = _mm_sub_epi32(temp3, m_temp_reg_30);
903
904 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
905 pi2_scratch += 8;
906 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
907 pi2_scratch += 8;
908
909 }
910
911
912 /* eo3[4-7] */
913 {
914
915 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
916
917 m_temp_reg_34 = _mm_add_epi32(temp4, m_temp_reg_30);
918 m_temp_reg_35 = _mm_sub_epi32(temp4, m_temp_reg_30);
919
920 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
921 pi2_scratch += 8;
922 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
923 pi2_scratch += 8;
924
925 }
926
927 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[16][0]); //57 -80
928
929 /* eo4[0-3] */
930 {
931 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
932
933 m_temp_reg_34 = _mm_add_epi32(temp5, m_temp_reg_30);
934 m_temp_reg_35 = _mm_sub_epi32(temp5, m_temp_reg_30);
935
936 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
937 pi2_scratch += 8;
938 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
939 pi2_scratch += 8;
940
941 }
942 /* eo4[4-7] */
943 {
944 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
945
946 m_temp_reg_34 = _mm_add_epi32(temp8, m_temp_reg_30);
947 m_temp_reg_35 = _mm_sub_epi32(temp8, m_temp_reg_30);
948
949 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
950 pi2_scratch += 8;
951 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
952 pi2_scratch += 8;
953
954 }
955
956 /***********************************************************************/
957
958 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[20][0]); //43 -90
959
960 /* eo5[0-3] */
961 {
962
963 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
964
965 m_temp_reg_34 = _mm_add_epi32(temp7, m_temp_reg_30);
966 m_temp_reg_35 = _mm_sub_epi32(temp7, m_temp_reg_30);
967
968 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
969 pi2_scratch += 8;
970 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
971 pi2_scratch += 8;
972
973 }
974
975
976 /* eo5[4-7] */
977 {
978 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
979
980 m_temp_reg_34 = _mm_add_epi32(temp6, m_temp_reg_30);
981 m_temp_reg_35 = _mm_sub_epi32(temp6, m_temp_reg_30);
982
983 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
984 pi2_scratch += 8;
985 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
986 pi2_scratch += 8;
987
988 }
989
990 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[24][0]); //25 -70
991
992 /* eo6[0-3] */
993 {
994 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
995
996 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_95, m_temp_reg_30);
997 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_95, m_temp_reg_30);
998
999 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1000 pi2_scratch += 8;
1001 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1002 pi2_scratch += 8;
1003
1004 }
1005
1006
1007 /* eo6[4-7] */
1008 {
1009
1010 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
1011
1012 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_94, m_temp_reg_30);
1013 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_94, m_temp_reg_30);
1014
1015 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1016 pi2_scratch += 8;
1017 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1018 pi2_scratch += 8;
1019
1020 }
1021
1022 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[28][0]); //9 -25
1023
1024 /* eo7[0-3] */
1025 {
1026
1027 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1028
1029 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_97, m_temp_reg_30);
1030 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_97, m_temp_reg_30);
1031
1032 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1033 pi2_scratch += 8;
1034 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1035 pi2_scratch += 8;
1036
1037 }
1038
1039
1040 /* eo7[4-7] */
1041 {
1042 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
1043
1044 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_96, m_temp_reg_30);
1045 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_96, m_temp_reg_30);
1046
1047 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1048 pi2_scratch += 8;
1049 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1050 pi2_scratch += 8;
1051
1052 }
1053
1054 }
1055
1056 }
1057 else
1058 {
1059
1060 {
1061 /* eeo */
1062 /* eeeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */
1063 /* eeeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */
1064
1065 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[0][0]); //83 36
1066 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[1][0]); //36 -83
1067
1068 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64 64
1069 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[3][0]); //64 -64
1070
1071 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_84);
1072
1073 m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_80);
1074
1075 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); /* eeeo[0] */
1076 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); /* eeeo[1] */
1077
1078 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff3); /* eeee[0] */
1079 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff4); /* eeee[1] */
1080
1081
1082 /* eeeo[0]= m_temp_reg_20 */
1083 /* eeeo[1]= m_temp_reg_21 */
1084 /* eeee[0]= m_temp_reg_22 */
1085 /* eeee[1]= m_temp_reg_23 */
1086
1087 /* eee[0] = eeee[0] + eeeo[0]; */
1088 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20); /* eeeo[0] */
1089
1090 /* eee[3] = eeee[0] - eeeo[0]; */
1091 m_temp_reg_43 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20); /* eeeo[1] */
1092
1093 /* eee[2] = eeee[1] - eeeo[1]; */
1094 m_temp_reg_42 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_22); /* eeee[1] */
1095
1096 /* eee[1] = eeee[1] + eeeo[1];*/
1097 m_temp_reg_41 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_22); /* eeee[0] */
1098
1099 /* for row 4 to 7 */
1100
1101 m_temp_reg_74 = _mm_srli_si128(m_temp_reg_74, 8);
1102 m_temp_reg_84 = _mm_srli_si128(m_temp_reg_84, 8);
1103
1104 /* Interleaving row 8 and row 24*/
1105 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_84);
1106
1107 m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8);
1108 m_temp_reg_80 = _mm_srli_si128(m_temp_reg_80, 8);
1109
1110 m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_80);
1111
1112 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); /* eeeo[0] */
1113 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); /* eeeo[1] */
1114
1115 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff3); /* eeee[0] */
1116 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff4); /* eeee[1] */
1117
1118
1119 /* eeeo[0]= m_temp_reg_20 */
1120 /* eeeo[1]= m_temp_reg_22 */
1121 /* eeee[0]= m_temp_reg_21 */
1122 /* eeee[1]= m_temp_reg_23 */
1123
1124 /* eee[0] = eeee[0] + eeeo[0]; */
1125 m_temp_reg_44 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20); /* eeeo[0] */
1126
1127 /* eee[3] = eeee[0] - eeeo[0]; */
1128 m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20); /* eeeo[1] */
1129
1130 /* eee[2] = eeee[1] - eeeo[1]; */
1131 m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_22); /* eeee[1] */
1132
1133 /* eee[1] = eeee[1] + eeeo[1];*/
1134 m_temp_reg_45 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_22); /* eeee[0] */
1135
1136
1137 // eeo[]
1138 /* for(k = 0; k < 4; k++) */
1139
1140 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[4][0]); //89 75
1141 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[5][0]); //50 18
1142
1143 /* eeo */
1144 {
1145 /* eeo0[0-3] */
1146 {
1147 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
1148 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_82, m_temp_reg_86);
1149
1150 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1151 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
1152
1153 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
1154
1155 m_temp_reg_90 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
1156 m_temp_reg_97 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
1157
1158 }
1159
1160 m_temp_reg_72 = _mm_srli_si128(m_temp_reg_72, 8);
1161 m_temp_reg_76 = _mm_srli_si128(m_temp_reg_76, 8);
1162 m_temp_reg_82 = _mm_srli_si128(m_temp_reg_82, 8);
1163 m_temp_reg_86 = _mm_srli_si128(m_temp_reg_86, 8);
1164
1165 /* eeo0[4-7] */
1166 {
1167 m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
1168 m_temp_reg_15 = _mm_unpacklo_epi16(m_temp_reg_82, m_temp_reg_86);
1169
1170 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
1171 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
1172
1173 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
1174
1175 m_temp_reg_91 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
1176 m_temp_reg_96 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
1177
1178 }
1179
1180
1181 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[6][0]); //75 -18
1182 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[7][0]); //89 50
1183
1184 /* eeo1[0-3] */
1185 {
1186 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
1187 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4);
1188
1189 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_30);
1190 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_30);
1191
1192 m_temp_reg_92 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_31);
1193 m_temp_reg_95 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_31);
1194
1195 }
1196
1197 /* eeo1[4-7] */
1198 {
1199
1200 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
1201 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff4);
1202
1203 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_30);
1204 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_30);
1205
1206 m_temp_reg_93 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_31);
1207 m_temp_reg_94 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_31);
1208
1209
1210 }
1211
1212 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[8][0]); //50 -89
1213 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[9][0]); //18 75
1214
1215 /* eeo2[0-3] */
1216 {
1217
1218 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
1219 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4);
1220
1221 /* ee[2][0-3] = eee[2] + eeo[2], kept in temp1 */
1222 /* ee[5][0-3] = eee[2] - eeo[2], kept in temp7 */
1223
1224 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
1225 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
1226
1227 temp1 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_31);
1228 temp7 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_31);
1229
1230 }
1231
1232 /* eeo2[4-7] */
1233 {
1234
1235 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
1236 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff4);
1237
1238 /* ee[2][4-7] = eee[2] + eeo[2], kept in temp2 */
1239 /* ee[5][4-7] = eee[2] - eeo[2], kept in temp6 */
1240
1241 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
1242 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
1243
1244 temp2 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_31);
1245 temp6 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_31);
1246
1247 }
1248
1249 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[10][0]); //18 -50
1250 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[11][0]); //75 -89
1251
1252 /* eeo3[0-3] */
1253 {
1254
1255 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
1256 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4);
1257
1258 /* ee[3][0-3] = eee[3] + eeo[3], kept in temp3 */
1259 /* ee[4][0-3] = eee[3] - eeo[3], kept in temp5 */
1260
1261 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_30);
1262 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_30);
1263
1264 temp3 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_31);
1265 temp5 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_31);
1266
1267
1268 }
1269
1270 /* eeo3[4-7] */
1271 {
1272
1273 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
1274 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff4);
1275
1276 /* ee[3][4-7] = eee[3] + eeo[3], kept in temp4 */
1277 /* ee[4][4-7] = eee[3] - eeo[3], kept in temp8 */
1278
1279 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_30);
1280 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_30);
1281 temp4 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_31);
1282 temp8 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_31);
1283
1284 }
1285
1286
1287 /* All values of ee[] are now held in m_temp_reg_90..97 and temp1..temp8 */
1288
1289 /* for(k = 0; k < 8; k++) */
1290 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90 87
1291 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[1][0]); //80 70
1292 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[2][0]); //57 43
1293 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[3][0]); //25 9
1294 }
1295 }
1296 /* eo */
1297 {
1298
1299 WORD16 *pi2_scratch = o_temp_ptr;
1300
1301 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
1302 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77);
1303 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_81, m_temp_reg_83);
1304 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_85, m_temp_reg_87);
1305
1306 m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8);
1307 m_temp_reg_73 = _mm_srli_si128(m_temp_reg_73, 8);
1308 m_temp_reg_75 = _mm_srli_si128(m_temp_reg_75, 8);
1309 m_temp_reg_77 = _mm_srli_si128(m_temp_reg_77, 8);
1310
1311 m_temp_reg_81 = _mm_srli_si128(m_temp_reg_81, 8);
1312 m_temp_reg_83 = _mm_srli_si128(m_temp_reg_83, 8);
1313 m_temp_reg_85 = _mm_srli_si128(m_temp_reg_85, 8);
1314 m_temp_reg_87 = _mm_srli_si128(m_temp_reg_87, 8);
1315
1316 /* eo0[0-3] */
1317 {
1318 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1319 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
1320
1321 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
1322
1323 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
1324 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
1325
1326 m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
1327
1328 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
1329
1330 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_90, m_temp_reg_30);
1331 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_90, m_temp_reg_30);
1332
1333 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1334 pi2_scratch += 8;
1335 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1336 pi2_scratch += 8;
1337
1338 }
1339 /* eo0[4-7] */
1340 {
1341 m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
1342 m_temp_reg_15 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77);
1343 m_temp_reg_16 = _mm_unpacklo_epi16(m_temp_reg_81, m_temp_reg_83);
1344 m_temp_reg_17 = _mm_unpacklo_epi16(m_temp_reg_85, m_temp_reg_87);
1345
1346 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
1347 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
1348
1349 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
1350
1351 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
1352 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
1353
1354 m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
1355
1356 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
1357
1358 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_91, m_temp_reg_30);
1359 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_91, m_temp_reg_30);
1360
1361 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1362 pi2_scratch += 8;
1363 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1364 pi2_scratch += 8;
1365
1366 }
1367
1368 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87 57
1369 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[5][0]); //0 -43
1370 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[6][0]); //80 90
1371 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[7][0]); //70 25
1372
1373 /* eo1[0-3] */
1374 {
1375
1376 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1377 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
1378
1379 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
1380
1381 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
1382 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
1383
1384 m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
1385
1386 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_32);
1387
1388 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_92, m_temp_reg_30);
1389 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_92, m_temp_reg_30);
1390
1391 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1392 pi2_scratch += 8;
1393 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1394 pi2_scratch += 8;
1395
1396 }
1397
1398 /* eo1[4-7] */
1399 {
1400 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
1401 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
1402
1403 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
1404
1405 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
1406 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
1407
1408 m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
1409
1410 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_32);
1411
1412 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_93, m_temp_reg_30);
1413 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_93, m_temp_reg_30);
1414
1415 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1416 pi2_scratch += 8;
1417 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1418 pi2_scratch += 8;
1419
1420 }
1421
1422 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[8][0]); //80 9
1423 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[9][0]); //70 87
1424 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[10][0]); //-25 57
1425 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[11][0]); //90 43
1426
1427 /* eo2[0-3] */
1428 {
1429 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1430 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
1431
1432 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_31);
1433
1434 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
1435 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
1436
1437 m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
1438
1439 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
1440
1441 m_temp_reg_34 = _mm_add_epi32(temp1, m_temp_reg_30);
1442 m_temp_reg_35 = _mm_sub_epi32(temp1, m_temp_reg_30);
1443
1444 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1445 pi2_scratch += 8;
1446 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1447 pi2_scratch += 8;
1448
1449 }
1450
1451
1452 /* eo2[4-7] */
1453 {
1454
1455 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
1456 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
1457
1458 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_31);
1459
1460 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
1461 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
1462
1463 m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
1464
1465 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
1466
1467 m_temp_reg_34 = _mm_add_epi32(temp2, m_temp_reg_30);
1468 m_temp_reg_35 = _mm_sub_epi32(temp2, m_temp_reg_30);
1469
1470 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1471 pi2_scratch += 8;
1472 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1473 pi2_scratch += 8;
1474
1475 }
1476 /**************************************************************************/
1477
1478 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[12][0]); //70 -43
1479 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[13][0]); //-87 9
1480 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[14][0]); //90 25
1481 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[15][0]); //80 57
1482
1483 /* eo3[0-3] */
1484 {
1485 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1486 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
1487
1488 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
1489
1490 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
1491 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
1492
1493 m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_32, m_temp_reg_33);
1494
1495 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
1496
1497 m_temp_reg_34 = _mm_add_epi32(temp3, m_temp_reg_30);
1498 m_temp_reg_35 = _mm_sub_epi32(temp3, m_temp_reg_30);
1499
1500 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1501 pi2_scratch += 8;
1502 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1503 pi2_scratch += 8;
1504
1505 }
1506
1507
1508 /* eo3[4-7] */
1509 {
1510 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
1511 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
1512
1513 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
1514
1515 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
1516 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
1517
1518 m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_32, m_temp_reg_33);
1519
1520 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
1521
1522 m_temp_reg_34 = _mm_add_epi32(temp4, m_temp_reg_30);
1523 m_temp_reg_35 = _mm_sub_epi32(temp4, m_temp_reg_30);
1524
1525 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1526 pi2_scratch += 8;
1527 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1528 pi2_scratch += 8;
1529
1530 }
1531
1532 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[16][0]); //57 -80
1533 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[17][0]); //-25 90
1534 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[18][0]); //9 87
1535 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[19][0]); //43 70
1536
1537 /* eo4[0-3] */
1538 {
1539
1540 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1541 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
1542
1543 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
1544
1545 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
1546 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
1547
1548 m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_33, m_temp_reg_32);
1549
1550 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
1551
1552 m_temp_reg_34 = _mm_add_epi32(temp5, m_temp_reg_30);
1553 m_temp_reg_35 = _mm_sub_epi32(temp5, m_temp_reg_30);
1554
1555 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1556 pi2_scratch += 8;
1557 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1558 pi2_scratch += 8;
1559
1560 }
1561
1562
1563 /* eo4[4-7] */
1564 {
1565 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
1566 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
1567
1568 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
1569
1570 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
1571 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
1572
1573 m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_33, m_temp_reg_32);
1574
1575 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
1576
1577 m_temp_reg_34 = _mm_add_epi32(temp8, m_temp_reg_30);
1578 m_temp_reg_35 = _mm_sub_epi32(temp8, m_temp_reg_30);
1579
1580 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1581 pi2_scratch += 8;
1582 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1583 pi2_scratch += 8;
1584
1585 }
1586
1587 /***********************************************************************/
1588
1589 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[20][0]); //43 -90
1590 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[21][0]); //57 25
1591 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[22][0]); //-87 70
1592 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[23][0]); //9 -80
1593
1594 /* eo5[0-3] */
1595 {
1596 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1597 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
1598
1599 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
1600
1601 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
1602 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
1603
1604 m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
1605
1606 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
1607
1608 m_temp_reg_34 = _mm_add_epi32(temp7, m_temp_reg_30);
1609 m_temp_reg_35 = _mm_sub_epi32(temp7, m_temp_reg_30);
1610
1611 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1612 pi2_scratch += 8;
1613 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1614 pi2_scratch += 8;
1615
1616 }
1617
1618
1619 /* eo5[4-7] */
1620 {
1621 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
1622 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
1623
1624 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
1625
1626 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
1627 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
1628
1629 m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
1630
1631 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
1632
1633 m_temp_reg_34 = _mm_add_epi32(temp6, m_temp_reg_30);
1634 m_temp_reg_35 = _mm_sub_epi32(temp6, m_temp_reg_30);
1635
1636 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1637 pi2_scratch += 8;
1638 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1639 pi2_scratch += 8;
1640
1641 }
1642
1643 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[24][0]); //25 -70
1644 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[25][0]); //90 -80
1645 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[26][0]); //43 9
1646 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[27][0]); //-57 87
1647
1648 /* eo6[0-3] */
1649 {
1650
1651 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1652 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
1653
1654 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
1655
1656 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
1657 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
1658
1659 m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
1660
1661 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
1662
1663 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_95, m_temp_reg_30);
1664 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_95, m_temp_reg_30);
1665
1666 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1667 pi2_scratch += 8;
1668 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1669 pi2_scratch += 8;
1670
1671 }
1672
1673
1674 /* eo6[4-7] */
1675 {
1676 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
1677 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
1678
1679 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
1680
1681 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
1682 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
1683
1684 m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
1685
1686 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
1687
1688 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_94, m_temp_reg_30);
1689 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_94, m_temp_reg_30);
1690
1691 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1692 pi2_scratch += 8;
1693 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1694 pi2_scratch += 8;
1695
1696 }
1697
1698 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[28][0]); //9 -25
1699 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[29][0]); //43 -57
1700 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[30][0]); //70 -80
1701 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[31][0]); //87 -90
1702
1703 /* eo7[0-3] */
1704 {
1705
1706 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1707 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
1708
1709 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
1710
1711 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
1712 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
1713
1714 m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
1715
1716 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
1717
1718 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_97, m_temp_reg_30);
1719 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_97, m_temp_reg_30);
1720
1721 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1722 pi2_scratch += 8;
1723 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1724 pi2_scratch += 8;
1725
1726 }
1727
1728
1729 /* eo7[4-7] */
1730 {
1731
1732 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
1733 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
1734
1735 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
1736
1737 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
1738 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
1739
1740 m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
1741
1742 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
1743
1744 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_96, m_temp_reg_30);
1745 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_96, m_temp_reg_30);
1746
1747 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1748 pi2_scratch += 8;
1749 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1750 pi2_scratch += 8;
1751
1752 }
1753
1754 }
1755
1756 }
1757 /* All e[] are done */
1758 /****************************/
1759
1760 {
1761
1762 WORD16 *pi2_tmp_src = pi2_src + src_strd;
1763
1764 m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
1765 pi2_tmp_src += (src_strd << 1);
1766 m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
1767 pi2_tmp_src += (src_strd << 1);
1768 m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
1769 pi2_tmp_src += (src_strd << 1);
1770 m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
1771 pi2_tmp_src += (src_strd << 1);
1772 m_temp_reg_74 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
1773 pi2_tmp_src += (src_strd << 1);
1774 m_temp_reg_75 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
1775 pi2_tmp_src += (src_strd << 1);
1776 m_temp_reg_76 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
1777 pi2_tmp_src += (src_strd << 1);
1778 m_temp_reg_77 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
1779 pi2_tmp_src += (src_strd << 1);
1780
1781 m_temp_reg_80 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
1782 pi2_tmp_src += (src_strd << 1);
1783 m_temp_reg_81 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
1784 pi2_tmp_src += (src_strd << 1);
1785 m_temp_reg_82 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
1786 pi2_tmp_src += (src_strd << 1);
1787 m_temp_reg_83 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
1788 pi2_tmp_src += (src_strd << 1);
1789 m_temp_reg_84 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
1790 pi2_tmp_src += (src_strd << 1);
1791 m_temp_reg_85 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
1792 pi2_tmp_src += (src_strd << 1);
1793 m_temp_reg_86 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
1794 pi2_tmp_src += (src_strd << 1);
1795 m_temp_reg_87 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
1796 }
1797
1798 if(zero_last28_rows_stg1)
1799 {
1800 /* o & stage 1 out */
1801 {
1802 WORD32 j;
1803 WORD16 *pi2_src_scratch = o_temp_ptr;
1804 WORD16 *pi2_dst_scratch = temp_ptr;
1805 WORD32 out_stride = (trans_size << 1);
1806 WORD32 in_stride = trans_size;
1807
1808 for(j = 0; j < 2; j++)
1809 {
1810 if(j)
1811 {
1812 m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8);
1813 m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8);
1814 }
1815
1816 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71);
1817
1818 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]);
1819
1820 /* o0[0-3] */
1821 {
1822 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1823
1824 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
1825 pi2_src_scratch += in_stride;
1826
1827 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
1828 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
1829
1830 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
1831 m_count = _mm_cvtsi32_si128(i4_shift);
1832 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
1833 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
1834
1835 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1836 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1837 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1838 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1839
1840 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1841
1842 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1843 pi2_dst_scratch += out_stride;
1844
1845 }
1846
1847 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]);
1848
1849 /* o1[0-3] */
1850 {
1851
1852 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1853
1854 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
1855 pi2_src_scratch += in_stride;
1856
1857 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
1858 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
1859
1860 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
1861 m_count = _mm_cvtsi32_si128(i4_shift);
1862 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
1863 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
1864
1865 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1866 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1867 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1868 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1869
1870 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1871
1872 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1873 pi2_dst_scratch += out_stride;
1874
1875 }
1876
1877 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]);
1878
1879 /* o2[0-3] */
1880 {
1881
1882 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1883
1884 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
1885 pi2_src_scratch += in_stride;
1886
1887 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
1888 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
1889
1890 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
1891 m_count = _mm_cvtsi32_si128(i4_shift);
1892 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
1893 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
1894
1895 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1896 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1897 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1898 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1899
1900 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1901
1902 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1903 pi2_dst_scratch += out_stride;
1904
1905 }
1906
1907 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]);
1908
1909 /* o3[0-3] */
1910 {
1911 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1912
1913 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
1914 pi2_src_scratch += in_stride;
1915
1916 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
1917 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
1918
1919 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
1920 m_count = _mm_cvtsi32_si128(i4_shift);
1921 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
1922 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
1923
1924 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1925 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1926 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1927 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1928
1929 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1930
1931 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1932 pi2_dst_scratch += out_stride;
1933
1934 }
1935
1936 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]);
1937
1938 /* o4[0-3] */
1939 {
1940 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1941
1942 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
1943 pi2_src_scratch += in_stride;
1944
1945 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
1946 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
1947
1948 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
1949 m_count = _mm_cvtsi32_si128(i4_shift);
1950 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
1951 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
1952
1953 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1954 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1955 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1956 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1957
1958 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1959
1960 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1961 pi2_dst_scratch += out_stride;
1962
1963 }
1964
1965 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]);
1966
1967 /* o5[0-3] */
1968 {
1969
1970 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1971
1972 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
1973 pi2_src_scratch += in_stride;
1974
1975 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
1976 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
1977
1978 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
1979 m_count = _mm_cvtsi32_si128(i4_shift);
1980 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
1981 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
1982
1983 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1984 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1985 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1986 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1987
1988 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1989
1990 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1991 pi2_dst_scratch += out_stride;
1992
1993 }
1994
1995 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]);
1996
1997 /* o6[0-3] */
1998 {
1999 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2000
2001 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2002 pi2_src_scratch += in_stride;
2003
2004 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2005 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2006
2007 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2008 m_count = _mm_cvtsi32_si128(i4_shift);
2009 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2010 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2011
2012 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2013 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2014 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2015 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2016
2017 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2018
2019 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2020 pi2_dst_scratch += out_stride;
2021
2022 }
2023
2024 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]);
2025
2026 /* o7[0-3] */
2027 {
2028
2029 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2030
2031 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2032 pi2_src_scratch += 8;
2033
2034 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2035 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2036
2037 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2038 m_count = _mm_cvtsi32_si128(i4_shift);
2039 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2040 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2041
2042 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2043 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2044 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2045 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2046
2047 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2048
2049 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2050 pi2_dst_scratch += 8;
2051
2052 }
2053
2054 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]);
2055
2056 /* o8[0-3] */
2057 {
2058 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2059
2060 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2061 pi2_src_scratch -= in_stride;
2062
2063 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2064 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2065
2066 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2067 m_count = _mm_cvtsi32_si128(i4_shift);
2068 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2069 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2070
2071 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2072 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2073 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2074 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2075
2076 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2077
2078 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2079 pi2_dst_scratch -= out_stride;
2080 }
2081
2082 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]);
2083
2084 /* o9[0-3] */
2085 {
2086 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2087
2088 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2089 pi2_src_scratch -= in_stride;
2090
2091 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2092 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2093
2094 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2095 m_count = _mm_cvtsi32_si128(i4_shift);
2096 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2097 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2098
2099 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2100 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2101 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2102 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2103
2104 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2105
2106 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2107 pi2_dst_scratch -= out_stride;
2108 }
2109
2110 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]);
2111
2112 /* o10[0-3] */
2113 {
2114 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2115
2116 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2117 pi2_src_scratch -= in_stride;
2118
2119 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2120 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2121
2122 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2123 m_count = _mm_cvtsi32_si128(i4_shift);
2124 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2125 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2126
2127 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2128 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2129 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2130 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2131
2132 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2133
2134 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2135 pi2_dst_scratch -= out_stride;
2136 }
2137
2138 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]);
2139
2140 /* o11[0-3] */
2141 {
2142 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2143
2144 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2145 pi2_src_scratch -= in_stride;
2146
2147 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2148 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2149
2150 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2151 m_count = _mm_cvtsi32_si128(i4_shift);
2152 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2153 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2154
2155 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2156 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2157 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2158 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2159
2160 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2161
2162 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2163 pi2_dst_scratch -= out_stride;
2164
2165 }
2166
2167 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]);
2168
2169 /* o12[0-3] */
2170 {
2171 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2172
2173 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2174 pi2_src_scratch -= in_stride;
2175
2176 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2177 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2178
2179 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2180 m_count = _mm_cvtsi32_si128(i4_shift);
2181 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2182 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2183
2184 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2185 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2186 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2187 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2188
2189 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2190
2191 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2192 pi2_dst_scratch -= out_stride;
2193
2194 }
2195
2196 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]);
2197
2198 /* o13[0-3] */
2199 {
2200 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2201
2202 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2203 pi2_src_scratch -= in_stride;
2204
2205 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2206 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2207
2208 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2209 m_count = _mm_cvtsi32_si128(i4_shift);
2210 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2211 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2212
2213 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2214 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2215 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2216 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2217
2218 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2219
2220 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2221 pi2_dst_scratch -= out_stride;
2222 }
2223
2224 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]);
2225
2226 /* o14[0-3] */
2227 {
2228 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2229
2230 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2231 pi2_src_scratch -= in_stride;
2232
2233 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2234 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2235
2236 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2237 m_count = _mm_cvtsi32_si128(i4_shift);
2238 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2239 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2240
2241 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2242 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2243 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2244 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2245
2246 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2247
2248 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2249 pi2_dst_scratch -= out_stride;
2250
2251 }
2252
2253 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]);
2254
2255 /* o15[0-3] */
2256 {
2257 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2258
2259 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2260 pi2_src_scratch += 8;
2261
2262 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2263 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2264
2265 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2266 m_count = _mm_cvtsi32_si128(i4_shift);
2267 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2268 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2269
2270 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2271 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2272 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2273 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2274
2275 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2276
2277 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2278 pi2_dst_scratch += 8;
2279 }
2280
2281 }
2282 }
2283 }
2284 else if(zero_last24_rows_stg1)
2285 {
2286 /* o & stage 1 out */
2287 {
2288 WORD32 j;
2289
2290 WORD16 *pi2_src_scratch = o_temp_ptr;
2291 WORD16 *pi2_dst_scratch = temp_ptr;
2292 WORD32 out_stride = (trans_size << 1);
2293
2294 WORD32 in_stride = trans_size;
2295
2296 for(j = 0; j < 2; j++)
2297 {
2298 if(j)
2299 {
2300 m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8);
2301 m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8);
2302 m_temp_reg_72 = _mm_srli_si128(m_temp_reg_72, 8);
2303 m_temp_reg_73 = _mm_srli_si128(m_temp_reg_73, 8);
2304 }
2305
2306 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 interleaved
2307 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 interleaved
2308
2309 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]);
2310 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[1][0]);
2311
2312 /* o0[0-3] */
2313 {
2314
2315 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2316 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2317
2318 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2319
2320 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2321 pi2_src_scratch += in_stride;
2322
2323 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2324 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2325
2326 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2327 m_count = _mm_cvtsi32_si128(i4_shift);
2328 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2329 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2330
2331 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2332 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2333 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2334 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2335
2336 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2337
2338 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2339 pi2_dst_scratch += out_stride;
2340
2341 }
2342
2343 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]);
2344 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[9][0]);
2345
2346 /* o1[0-3] */
2347 {
2348 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2349 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2350
2351 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2352
2353 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2354 pi2_src_scratch += in_stride;
2355
2356 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2357 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2358
2359 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2360 m_count = _mm_cvtsi32_si128(i4_shift);
2361 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2362 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2363
2364 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2365 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2366 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2367 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2368
2369 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2370
2371 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2372 pi2_dst_scratch += out_stride;
2373
2374 }
2375
2376 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]);
2377 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[17][0]);
2378
2379 /* o2[0-3] */
2380 {
2381 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2382 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2383
2384 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
2385
2386 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2387 pi2_src_scratch += in_stride;
2388
2389 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2390 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2391
2392 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2393 m_count = _mm_cvtsi32_si128(i4_shift);
2394 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2395 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2396
2397 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2398 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2399 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2400 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2401
2402 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2403
2404 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2405 pi2_dst_scratch += out_stride;
2406
2407 }
2408
2409 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]);
2410 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[25][0]);
2411
2412 /* o3[0-3] */
2413 {
2414 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2415 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2416
2417 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
2418
2419 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2420 pi2_src_scratch += in_stride;
2421
2422 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2423 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2424
2425 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2426 m_count = _mm_cvtsi32_si128(i4_shift);
2427 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2428 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2429
2430 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2431 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2432 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2433 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2434
2435 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2436
2437 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2438 pi2_dst_scratch += out_stride;
2439
2440 }
2441
2442 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]);
2443 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[33][0]);
2444
2445 /* o4[0-3] */
2446 {
2447 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2448 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2449
2450 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2451
2452 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2453 pi2_src_scratch += in_stride;
2454
2455 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2456 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2457
2458 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2459 m_count = _mm_cvtsi32_si128(i4_shift);
2460 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2461 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2462
2463 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2464 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2465 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2466 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2467
2468 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2469
2470 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2471 pi2_dst_scratch += out_stride;
2472
2473 }
2474
2475 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]);
2476 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[41][0]);
2477
2478 /* o5[0-3] */
2479 {
2480 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2481 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2482
2483 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2484
2485 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2486 pi2_src_scratch += in_stride;
2487
2488 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2489 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2490
2491 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2492 m_count = _mm_cvtsi32_si128(i4_shift);
2493 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2494 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2495
2496 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2497 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2498 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2499 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2500
2501 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2502
2503 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2504 pi2_dst_scratch += out_stride;
2505
2506 }
2507
2508 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]);
2509 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[49][0]);
2510
2511 /* o6[0-3] */
2512 {
2513 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2514 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2515
2516 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2517
2518 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2519 pi2_src_scratch += in_stride;
2520
2521 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2522 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2523
2524 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2525 m_count = _mm_cvtsi32_si128(i4_shift);
2526 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2527 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2528
2529 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2530 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2531 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2532 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2533
2534 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2535
2536 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2537 pi2_dst_scratch += out_stride;
2538
2539 }
2540
2541 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]);
2542 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[57][0]);
2543
2544 /* o7[0-3] */
2545 {
2546 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2547 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2548
2549 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2550
2551 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2552 pi2_src_scratch += 8;
2553
2554 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2555 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2556
2557 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2558 m_count = _mm_cvtsi32_si128(i4_shift);
2559 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2560 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2561
2562 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2563 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2564 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2565 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2566
2567 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2568
2569 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2570 pi2_dst_scratch += 8;
2571
2572 }
2573
2574 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]);
2575 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[65][0]);
2576
2577 /* o8[0-3] */
2578 {
2579 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2580 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2581
2582 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2583
2584 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2585 pi2_src_scratch -= in_stride;
2586
2587 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2588 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2589
2590 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2591 m_count = _mm_cvtsi32_si128(i4_shift);
2592 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2593 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2594
2595 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2596 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2597 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2598 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2599
2600 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2601
2602 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2603 pi2_dst_scratch -= out_stride;
2604 }
2605
2606 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]);
2607 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[73][0]);
2608
2609 /* o9[0-3] */
2610 {
2611 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2612 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2613
2614 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2615
2616 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2617 pi2_src_scratch -= in_stride;
2618
2619 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2620 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2621
2622 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2623 m_count = _mm_cvtsi32_si128(i4_shift);
2624 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2625 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2626
2627 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2628 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2629 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2630 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2631
2632 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2633
2634 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2635 pi2_dst_scratch -= out_stride;
2636 }
2637
2638 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]);
2639 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[81][0]);
2640
2641 /* o10[0-3] */
2642 {
2643 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2644 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2645
2646 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2647
2648 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2649 pi2_src_scratch -= in_stride;
2650
2651 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2652 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2653
2654 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2655 m_count = _mm_cvtsi32_si128(i4_shift);
2656 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2657 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2658
2659 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2660 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2661 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2662 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2663
2664 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2665
2666 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2667 pi2_dst_scratch -= out_stride;
2668 }
2669
2670 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]);
2671 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[89][0]);
2672
2673 /* o11[0-3] */
2674 {
2675
2676 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2677 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2678
2679 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2680
2681 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2682 pi2_src_scratch -= in_stride;
2683
2684 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2685 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2686
2687 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2688 m_count = _mm_cvtsi32_si128(i4_shift);
2689 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2690 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2691
2692 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2693 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2694 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2695 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2696
2697 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2698
2699 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2700 pi2_dst_scratch -= out_stride;
2701
2702 }
2703
2704 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]);
2705 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[97][0]);
2706
2707 /* o12[0-3] */
2708 {
2709 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2710 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2711
2712 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2713
2714 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2715 pi2_src_scratch -= in_stride;
2716
2717 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2718 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2719
2720 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2721 m_count = _mm_cvtsi32_si128(i4_shift);
2722 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2723 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2724
2725 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2726 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2727 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2728 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2729
2730 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2731
2732 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2733 pi2_dst_scratch -= out_stride;
2734
2735 }
2736
2737 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]);
2738 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[105][0]);
2739
2740 /* o13[0-3] */
2741 {
2742 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2743 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2744
2745 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2746
2747 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2748 pi2_src_scratch -= in_stride;
2749
2750 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2751 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2752
2753 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2754 m_count = _mm_cvtsi32_si128(i4_shift);
2755 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2756 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2757
2758 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2759 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2760 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2761 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2762
2763 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2764
2765 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2766 pi2_dst_scratch -= out_stride;
2767 }
2768
2769 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]);
2770 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[113][0]);
2771
2772 /* o14[0-3] */
2773 {
2774 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2775 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2776
2777 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2778
2779 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2780 pi2_src_scratch -= in_stride;
2781
2782 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2783 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2784
2785 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2786 m_count = _mm_cvtsi32_si128(i4_shift);
2787 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2788 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2789
2790 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2791 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2792 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2793 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2794
2795 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2796
2797 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2798 pi2_dst_scratch -= out_stride;
2799
2800 }
2801
2802 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]);
2803 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[121][0]);
2804
2805 /* o15[0-3] */
2806 {
2807 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2808 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2809
2810 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2811
2812 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2813 pi2_src_scratch += 8;
2814
2815 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2816 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2817
2818 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2819 m_count = _mm_cvtsi32_si128(i4_shift);
2820 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2821 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2822
2823 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2824 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2825 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2826 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2827
2828 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2829
2830 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2831 pi2_dst_scratch += 8;
2832 }
2833
2834 }
2835 }
2836 }
2837 else
2838 {
2839 /* o & stage 1 out */
2840 {
2841 WORD32 j;
2842
2843 WORD16 *pi2_src_scratch = o_temp_ptr;
2844 WORD16 *pi2_dst_scratch = temp_ptr;
2845 WORD32 out_stride = (trans_size << 1);
2846
2847 WORD32 in_stride = trans_size;
2848
2849
2850 for(j = 0; j < 2; j++)
2851 {
2852 if(j)
2853 {
2854 m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8);
2855 m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8);
2856 m_temp_reg_72 = _mm_srli_si128(m_temp_reg_72, 8);
2857 m_temp_reg_73 = _mm_srli_si128(m_temp_reg_73, 8);
2858 m_temp_reg_74 = _mm_srli_si128(m_temp_reg_74, 8);
2859 m_temp_reg_75 = _mm_srli_si128(m_temp_reg_75, 8);
2860 m_temp_reg_76 = _mm_srli_si128(m_temp_reg_76, 8);
2861 m_temp_reg_77 = _mm_srli_si128(m_temp_reg_77, 8);
2862
2863 m_temp_reg_80 = _mm_srli_si128(m_temp_reg_80, 8);
2864 m_temp_reg_81 = _mm_srli_si128(m_temp_reg_81, 8);
2865 m_temp_reg_82 = _mm_srli_si128(m_temp_reg_82, 8);
2866 m_temp_reg_83 = _mm_srli_si128(m_temp_reg_83, 8);
2867 m_temp_reg_84 = _mm_srli_si128(m_temp_reg_84, 8);
2868 m_temp_reg_85 = _mm_srli_si128(m_temp_reg_85, 8);
2869 m_temp_reg_86 = _mm_srli_si128(m_temp_reg_86, 8);
2870 m_temp_reg_87 = _mm_srli_si128(m_temp_reg_87, 8);
2871 }
2872
2873 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]);
2874 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[1][0]);
2875 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[2][0]);
2876 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[3][0]);
2877 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[4][0]);
2878 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[5][0]);
2879 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[6][0]);
2880 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[7][0]);
2881
2882 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 interleaved
2883 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 interleaved
2884 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 interleaved
2885 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 interleaved
2886 temp1 = _mm_unpacklo_epi16(m_temp_reg_80, m_temp_reg_81); //row 17 and row 19 interleaved
2887 temp2 = _mm_unpacklo_epi16(m_temp_reg_82, m_temp_reg_83); //row 21 and row 23 interleaved
2888 temp3 = _mm_unpacklo_epi16(m_temp_reg_84, m_temp_reg_85); //row 25 and row 27 interleaved
2889 temp4 = _mm_unpacklo_epi16(m_temp_reg_86, m_temp_reg_87); //row 29 and row 31 interleaved
2890
2891
2892 /* o0[0-3] */
2893 {
2894 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2895 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2896 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
2897 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
2898
2899 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2900 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
2901
2902 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
2903
2904 m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
2905 m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
2906 m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
2907 m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
2908
2909 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
2910 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
2911
2912 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
2913
2914 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
2915
2916 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2917 pi2_src_scratch += in_stride;
2918
2919 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2920 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2921
2922 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2923 m_count = _mm_cvtsi32_si128(i4_shift);
2924 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2925 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2926
2927 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2928 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2929 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2930 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2931
2932 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2933
2934 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2935 pi2_dst_scratch += out_stride;
2936
2937 }
2938
2939 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]);
2940 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[9][0]);
2941 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[10][0]);
2942 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[11][0]);
2943 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[12][0]);
2944 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[13][0]);
2945 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[14][0]);
2946 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[15][0]);
2947
2948
2949 /* o1[0-3] */
2950 {
2951 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2952 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2953 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
2954 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
2955
2956 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2957 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
2958
2959 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_20);
2960
2961 m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
2962 m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
2963 m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
2964 m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
2965
2966 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
2967 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
2968
2969 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
2970
2971 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
2972
2973 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2974 pi2_src_scratch += in_stride;
2975
2976 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2977 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2978
2979 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2980 m_count = _mm_cvtsi32_si128(i4_shift);
2981 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2982 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2983
2984 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2985 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2986 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2987 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2988
2989 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2990
2991 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2992 pi2_dst_scratch += out_stride;
2993
2994 }
2995
2996 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]);
2997 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[17][0]);
2998 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[18][0]);
2999 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[19][0]);
3000 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[20][0]);
3001 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[21][0]);
3002 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[22][0]);
3003 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[23][0]);
3004
3005 /* o2[0-3] */
3006 {
3007 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
3008 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
3009 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
3010 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
3011
3012 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
3013 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
3014
3015 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
3016
3017 m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
3018 m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
3019 m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
3020 m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
3021
3022 m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_41);
3023 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
3024
3025 m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_42);
3026
3027 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
3028
3029 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
3030 pi2_src_scratch += in_stride;
3031
3032 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
3033 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
3034
3035 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
3036 m_count = _mm_cvtsi32_si128(i4_shift);
3037 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
3038 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
3039
3040 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3041 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3042 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3043 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3044
3045 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3046
3047 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3048 pi2_dst_scratch += out_stride;
3049
3050 }
3051
3052
3053 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]);
3054 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[25][0]);
3055 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[26][0]);
3056 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[27][0]);
3057 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[28][0]);
3058 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[29][0]);
3059 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[30][0]);
3060 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[31][0]);
3061
3062 /* o3[0-3] */
3063 {
3064 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
3065 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
3066 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
3067 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
3068
3069 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
3070 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
3071
3072 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
3073
3074 m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
3075 m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
3076 m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
3077 m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
3078
3079 m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_40);
3080 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
3081
3082 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
3083
3084 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
3085
3086 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
3087 pi2_src_scratch += in_stride;
3088
3089 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
3090 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
3091
3092 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
3093 m_count = _mm_cvtsi32_si128(i4_shift);
3094 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
3095 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
3096
3097 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3098 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3099 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3100 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3101
3102 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3103
3104 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3105 pi2_dst_scratch += out_stride;
3106
3107 }
3108
3109 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]);
3110 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[33][0]);
3111 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[34][0]);
3112 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[35][0]);
3113 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[36][0]);
3114 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[37][0]);
3115 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[38][0]);
3116 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[39][0]);
3117
3118 /* o4[0-3] */
3119 {
3120 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
3121 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
3122 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
3123 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
3124
3125 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
3126 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
3127
3128 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
3129
3130 m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
3131 m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
3132 m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
3133 m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
3134
3135 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
3136 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
3137
3138 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
3139
3140 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
3141
3142 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
3143 pi2_src_scratch += in_stride;
3144
3145 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
3146 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
3147
3148 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
3149 m_count = _mm_cvtsi32_si128(i4_shift);
3150 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
3151 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
3152
3153 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3154 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3155 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3156 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3157
3158 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3159
3160 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3161 pi2_dst_scratch += out_stride;
3162
3163 }
3164
3165
3166 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]);
3167 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[41][0]);
3168 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[42][0]);
3169 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[43][0]);
3170 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[44][0]);
3171 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[45][0]);
3172 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[46][0]);
3173 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[47][0]);
3174
3175 /* o5[0-3] */
3176 {
3177 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
3178 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
3179 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
3180 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
3181
3182 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
3183 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
3184
3185 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
3186
3187 m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
3188 m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
3189 m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
3190 m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
3191
3192 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
3193 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
3194
3195 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
3196
3197 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
3198
3199 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
3200 pi2_src_scratch += in_stride;
3201
3202 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
3203 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
3204
3205 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
3206 m_count = _mm_cvtsi32_si128(i4_shift);
3207 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
3208 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
3209
3210 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3211 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3212 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3213 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3214
3215 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3216
3217 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3218 pi2_dst_scratch += out_stride;
3219
3220 }
3221
3222 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]);
3223 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[49][0]);
3224 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[50][0]);
3225 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[51][0]);
3226 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[52][0]);
3227 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[53][0]);
3228 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[54][0]);
3229 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[55][0]);
3230
3231
3232 /* o6[0-3] */
3233 {
3234 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
3235 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
3236 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
3237 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
3238
3239 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
3240 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
3241
3242 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
3243
3244 m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
3245 m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
3246 m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
3247 m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
3248
3249 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
3250 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
3251
3252 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
3253
3254 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
3255
3256 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
3257 pi2_src_scratch += in_stride;
3258
3259 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
3260 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
3261
3262 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
3263 m_count = _mm_cvtsi32_si128(i4_shift);
3264 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
3265 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
3266
3267 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3268 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3269 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3270 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3271
3272 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3273
3274 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3275 pi2_dst_scratch += out_stride;
3276
3277 }
3278
3279 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]);
3280 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[57][0]);
3281 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[58][0]);
3282 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[59][0]);
3283 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[60][0]);
3284 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[61][0]);
3285 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[62][0]);
3286 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[63][0]);
3287
3288 /* o7[0-3] */
3289 {
3290 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
3291 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
3292 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
3293 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
3294
3295 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
3296 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
3297
3298 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
3299
3300 m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
3301 m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
3302 m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
3303 m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
3304
3305 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
3306 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
3307
3308 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
3309
3310 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
3311
3312 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
3313 pi2_src_scratch += 8;
3314
3315 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
3316 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
3317
3318 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
3319 m_count = _mm_cvtsi32_si128(i4_shift);
3320 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
3321 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
3322
3323 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3324 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3325 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3326 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3327
3328 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3329
3330 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3331 pi2_dst_scratch += 8;
3332
3333 }
3334
3335 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]);
3336 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[65][0]);
3337 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[66][0]);
3338 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[67][0]);
3339 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[68][0]);
3340 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[69][0]);
3341 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[70][0]);
3342 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[71][0]);
3343
3344
3345 /* o8[0-3] */
3346 {
3347
3348 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
3349 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
3350 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
3351 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
3352
3353 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
3354 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
3355
3356 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
3357
3358 m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
3359 m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
3360 m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
3361 m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
3362
3363 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
3364 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
3365
3366 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
3367
3368 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
3369
3370 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
3371 pi2_src_scratch -= in_stride;
3372
3373 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
3374 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
3375
3376 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
3377 m_count = _mm_cvtsi32_si128(i4_shift);
3378 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
3379 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
3380
3381 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3382 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3383 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3384 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3385
3386 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3387
3388 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3389 pi2_dst_scratch -= out_stride;
3390 }
3391
3392 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]);
3393 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[73][0]);
3394 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[74][0]);
3395 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[75][0]);
3396 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[76][0]);
3397 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[77][0]);
3398 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[78][0]);
3399 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[79][0]);
3400
3401
3402 /* o9[0-3] */
3403 {
3404 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
3405 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
3406 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
3407 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
3408
3409 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
3410 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
3411
3412 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
3413
3414 m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
3415 m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
3416 m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
3417 m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
3418
3419 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
3420 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
3421
3422 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
3423
3424 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
3425
3426 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
3427 pi2_src_scratch -= in_stride;
3428
3429 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
3430 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
3431
3432 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
3433 m_count = _mm_cvtsi32_si128(i4_shift);
3434 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
3435 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
3436
3437 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3438 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3439 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3440 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3441
3442 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3443
3444 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3445 pi2_dst_scratch -= out_stride;
3446 }
3447
3448 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]);
3449 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[81][0]);
3450 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[82][0]);
3451 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[83][0]);
3452 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[84][0]);
3453 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[85][0]);
3454 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[86][0]);
3455 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[87][0]);
3456
3457 /* o10[0-3] */
3458 {
3459 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
3460 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
3461 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
3462 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
3463
3464 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
3465 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
3466
3467 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
3468
3469 m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
3470 m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
3471 m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
3472 m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
3473
3474 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
3475 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
3476
3477 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
3478
3479 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
3480
3481 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
3482 pi2_src_scratch -= in_stride;
3483
3484 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
3485 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
3486
3487 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
3488 m_count = _mm_cvtsi32_si128(i4_shift);
3489 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
3490 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
3491
3492 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3493 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3494 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3495 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3496
3497 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3498
3499 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3500 pi2_dst_scratch -= out_stride;
3501 }
3502
3503 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]);
3504 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[89][0]);
3505 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[90][0]);
3506 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[91][0]);
3507 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[92][0]);
3508 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[93][0]);
3509 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[94][0]);
3510 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[95][0]);
3511
3512 /* o11[0-3] */
3513 {
3514 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
3515 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
3516 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
3517 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
3518
3519 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
3520 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
3521
3522 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
3523
3524 m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
3525 m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
3526 m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
3527 m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
3528
3529 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
3530 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
3531
3532 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
3533
3534 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
3535
3536 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
3537 pi2_src_scratch -= in_stride;
3538
3539 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
3540 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
3541
3542 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
3543 m_count = _mm_cvtsi32_si128(i4_shift);
3544 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
3545 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
3546
3547 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3548 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3549 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3550 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3551
3552 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3553
3554 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3555 pi2_dst_scratch -= out_stride;
3556
3557 }
3558
3559 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]);
3560 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[97][0]);
3561 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[98][0]);
3562 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[99][0]);
3563 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[100][0]);
3564 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[101][0]);
3565 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[102][0]);
3566 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[103][0]);
3567
3568
3569 /* o12[0-3] */
3570 {
3571 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
3572 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
3573 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
3574 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
3575
3576 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
3577 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
3578
3579 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
3580
3581 m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
3582 m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
3583 m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
3584 m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
3585
3586 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
3587 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
3588
3589 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
3590
3591 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
3592
3593 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
3594 pi2_src_scratch -= in_stride;
3595
3596 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
3597 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
3598
3599 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
3600 m_count = _mm_cvtsi32_si128(i4_shift);
3601 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
3602 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
3603
3604 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3605 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3606 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3607 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3608
3609 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3610
3611 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3612 pi2_dst_scratch -= out_stride;
3613
3614 }
3615
3616 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]);
3617 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[105][0]);
3618 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[106][0]);
3619 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[107][0]);
3620 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[108][0]);
3621 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[109][0]);
3622 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[110][0]);
3623 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[111][0]);
3624
3625
3626 /* o13[0-3] */
3627 {
3628 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
3629 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
3630 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
3631 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
3632
3633 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
3634 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
3635
3636 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
3637
3638 m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
3639 m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
3640 m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
3641 m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
3642
3643 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
3644 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
3645
3646 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
3647
3648 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
3649
3650 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
3651 pi2_src_scratch -= in_stride;
3652
3653 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
3654 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
3655
3656 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
3657 m_count = _mm_cvtsi32_si128(i4_shift);
3658 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
3659 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
3660
3661 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3662 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3663 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3664 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3665
3666 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3667
3668 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3669 pi2_dst_scratch -= out_stride;
3670 }
3671
3672 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]);
3673 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[113][0]);
3674 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[114][0]);
3675 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[115][0]);
3676 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[116][0]);
3677 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[117][0]);
3678 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[118][0]);
3679 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[119][0]);
3680
3681
3682 /* o14[0-3] */
3683 {
3684 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
3685 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
3686 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
3687 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
3688
3689 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
3690 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
3691
3692 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
3693
3694 m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
3695 m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
3696 m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
3697 m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
3698
3699 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
3700 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
3701
3702 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
3703
3704 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
3705
3706 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
3707 pi2_src_scratch -= in_stride;
3708
3709 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
3710 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
3711
3712 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
3713 m_count = _mm_cvtsi32_si128(i4_shift);
3714 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
3715 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
3716
3717 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3718 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3719 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3720 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3721
3722 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3723
3724 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3725 pi2_dst_scratch -= out_stride;
3726
3727 }
3728
3729 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]);
3730 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[121][0]);
3731 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[122][0]);
3732 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[123][0]);
3733 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[124][0]);
3734 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[125][0]);
3735 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[126][0]);
3736 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[127][0]);
3737
3738 /* o15[0-3] */
3739 {
3740 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
3741 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
3742 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
3743 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
3744
3745 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
3746 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
3747
3748 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
3749
3750 m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
3751 m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
3752 m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
3753 m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
3754
3755 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
3756 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
3757
3758 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
3759
3760 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
3761
3762 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
3763 pi2_src_scratch += 8;
3764
3765 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
3766 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
3767
3768 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
3769 m_count = _mm_cvtsi32_si128(i4_shift);
3770 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
3771 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
3772
3773 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3774 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3775 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3776 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3777
3778 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3779
3780 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3781 pi2_dst_scratch += 8;
3782 }
3783
3784 }
3785 }
3786 }
3787 /* Transpose */
3788 {
3789 WORD16 *pi2_src_scratch = temp_ptr;
3790 WORD16 *pi2_dst_scratch = pi2_tmp;
3791 WORD32 in_stride = (trans_size << 1);
3792
3793 for(j = 0; j < 2; j++)
3794 {
3795 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
3796 pi2_src_scratch += in_stride;
3797 m_temp_reg_31 = _mm_load_si128((__m128i *)pi2_src_scratch);
3798 pi2_src_scratch += in_stride;
3799 m_temp_reg_32 = _mm_load_si128((__m128i *)pi2_src_scratch);
3800 pi2_src_scratch += in_stride;
3801 m_temp_reg_33 = _mm_load_si128((__m128i *)pi2_src_scratch);
3802 pi2_src_scratch += in_stride;
3803 m_temp_reg_34 = _mm_load_si128((__m128i *)pi2_src_scratch);
3804 pi2_src_scratch += in_stride;
3805 m_temp_reg_35 = _mm_load_si128((__m128i *)pi2_src_scratch);
3806 pi2_src_scratch += in_stride;
3807 m_temp_reg_36 = _mm_load_si128((__m128i *)pi2_src_scratch);
3808 pi2_src_scratch += in_stride;
3809 m_temp_reg_37 = _mm_load_si128((__m128i *)pi2_src_scratch);
3810 pi2_src_scratch += 8;
3811
3812 m_temp_reg_70 = _mm_load_si128((__m128i *)pi2_src_scratch);
3813 pi2_src_scratch -= in_stride;
3814 m_temp_reg_71 = _mm_load_si128((__m128i *)pi2_src_scratch);
3815 pi2_src_scratch -= in_stride;
3816 m_temp_reg_72 = _mm_load_si128((__m128i *)pi2_src_scratch);
3817 pi2_src_scratch -= in_stride;
3818 m_temp_reg_73 = _mm_load_si128((__m128i *)pi2_src_scratch);
3819 pi2_src_scratch -= in_stride;
3820 m_temp_reg_74 = _mm_load_si128((__m128i *)pi2_src_scratch);
3821 pi2_src_scratch -= in_stride;
3822 m_temp_reg_75 = _mm_load_si128((__m128i *)pi2_src_scratch);
3823 pi2_src_scratch -= in_stride;
3824 m_temp_reg_76 = _mm_load_si128((__m128i *)pi2_src_scratch);
3825 pi2_src_scratch -= in_stride;
3826 m_temp_reg_77 = _mm_load_si128((__m128i *)pi2_src_scratch);
3827 pi2_src_scratch += 8;
3828
3829
3830 m_temp_reg_40 = _mm_unpacklo_epi16(m_temp_reg_30, m_temp_reg_31);
3831 m_temp_reg_41 = _mm_unpackhi_epi16(m_temp_reg_31, m_temp_reg_30);
3832
3833 m_temp_reg_42 = _mm_unpacklo_epi16(m_temp_reg_32, m_temp_reg_33);
3834 m_temp_reg_43 = _mm_unpackhi_epi16(m_temp_reg_33, m_temp_reg_32);
3835
3836 m_temp_reg_44 = _mm_unpacklo_epi16(m_temp_reg_34, m_temp_reg_35);
3837 m_temp_reg_45 = _mm_unpackhi_epi16(m_temp_reg_35, m_temp_reg_34);
3838
3839 m_temp_reg_46 = _mm_unpacklo_epi16(m_temp_reg_36, m_temp_reg_37);
3840 m_temp_reg_47 = _mm_unpackhi_epi16(m_temp_reg_37, m_temp_reg_36);
3841
3842 m_temp_reg_80 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71);
3843 m_temp_reg_81 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_70);
3844
3845 m_temp_reg_82 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73);
3846 m_temp_reg_83 = _mm_unpackhi_epi16(m_temp_reg_73, m_temp_reg_72);
3847
3848 m_temp_reg_84 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75);
3849 m_temp_reg_85 = _mm_unpackhi_epi16(m_temp_reg_75, m_temp_reg_74);
3850
3851 m_temp_reg_86 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77);
3852 m_temp_reg_87 = _mm_unpackhi_epi16(m_temp_reg_77, m_temp_reg_76);
3853
3854 /****************/
3855
3856 m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_40, m_temp_reg_42);
3857 m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_40, m_temp_reg_42);
3858
3859 m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_44, m_temp_reg_46);
3860 m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_44, m_temp_reg_46);
3861
3862 m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_80, m_temp_reg_82);
3863 m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_80, m_temp_reg_82);
3864
3865 m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_84, m_temp_reg_86);
3866 m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_84, m_temp_reg_86);
3867
3868 m_temp_reg_90 = _mm_unpacklo_epi32(m_temp_reg_43, m_temp_reg_41);
3869 m_temp_reg_91 = _mm_unpackhi_epi32(m_temp_reg_43, m_temp_reg_41);
3870
3871 m_temp_reg_92 = _mm_unpacklo_epi32(m_temp_reg_47, m_temp_reg_45);
3872 m_temp_reg_93 = _mm_unpackhi_epi32(m_temp_reg_47, m_temp_reg_45);
3873
3874 m_temp_reg_94 = _mm_unpacklo_epi32(m_temp_reg_83, m_temp_reg_81);
3875 m_temp_reg_95 = _mm_unpackhi_epi32(m_temp_reg_83, m_temp_reg_81);
3876
3877 m_temp_reg_96 = _mm_unpacklo_epi32(m_temp_reg_87, m_temp_reg_85);
3878 m_temp_reg_97 = _mm_unpackhi_epi32(m_temp_reg_87, m_temp_reg_85);
3879
3880 /******************/
3881
3882 m_temp_reg_30 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_2);
3883 m_temp_reg_31 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_2);
3884
3885 m_temp_reg_32 = _mm_unpacklo_epi64(m_temp_reg_92, m_temp_reg_90);
3886 m_temp_reg_33 = _mm_unpackhi_epi64(m_temp_reg_92, m_temp_reg_90);
3887
3888 m_temp_reg_34 = _mm_unpacklo_epi64(m_temp_reg_4, m_temp_reg_6);
3889 m_temp_reg_35 = _mm_unpackhi_epi64(m_temp_reg_4, m_temp_reg_6);
3890
3891 m_temp_reg_36 = _mm_unpacklo_epi64(m_temp_reg_96, m_temp_reg_94);
3892 m_temp_reg_37 = _mm_unpackhi_epi64(m_temp_reg_96, m_temp_reg_94);
3893
3894 m_temp_reg_80 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_3);
3895 m_temp_reg_81 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_3);
3896
3897 m_temp_reg_82 = _mm_unpacklo_epi64(m_temp_reg_93, m_temp_reg_91);
3898 m_temp_reg_83 = _mm_unpackhi_epi64(m_temp_reg_93, m_temp_reg_91);
3899
3900 m_temp_reg_84 = _mm_unpacklo_epi64(m_temp_reg_5, m_temp_reg_7);
3901 m_temp_reg_85 = _mm_unpackhi_epi64(m_temp_reg_5, m_temp_reg_7);
3902
3903 m_temp_reg_86 = _mm_unpacklo_epi64(m_temp_reg_97, m_temp_reg_95);
3904 m_temp_reg_87 = _mm_unpackhi_epi64(m_temp_reg_97, m_temp_reg_95);
3905
3906 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size), m_temp_reg_30);
3907 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 8), m_temp_reg_34);
3908 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 16), m_temp_reg_36);
3909 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 24), m_temp_reg_32);
3910
3911 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size), m_temp_reg_31);
3912 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 8), m_temp_reg_35);
3913 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 16), m_temp_reg_37);
3914 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 24), m_temp_reg_33);
3915
3916 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size), m_temp_reg_80);
3917 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 8), m_temp_reg_84);
3918 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 16), m_temp_reg_86);
3919 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 24), m_temp_reg_82);
3920
3921 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size), m_temp_reg_81);
3922 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 8), m_temp_reg_85);
3923 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 16), m_temp_reg_87);
3924 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 24), m_temp_reg_83);
3925
3926 pi2_dst_scratch += 4 * trans_size;
3927 }
3928 }
3929 pi2_src += 8;
3930 // pi2_dequant_coeff +=8;
3931 pi2_tmp += 8 * trans_size;
3932 zero_cols = zero_cols >> 1;
3933 }
3934
3935 if(trans_size_stg1 != TRANS_SIZE_32)
3936 {
3937 m_temp_reg_10 = _mm_setzero_si128();
3938
3939 for(i = trans_size_stg1; i < 32; i += 8)
3940 {
3941 WORD16 *pi2_dst_scratch = pi2_tmp;
3942
3943 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size), m_temp_reg_10);
3944 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 8), m_temp_reg_10);
3945 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 16), m_temp_reg_10);
3946 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 24), m_temp_reg_10);
3947
3948 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size), m_temp_reg_10);
3949 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 8), m_temp_reg_10);
3950 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 16), m_temp_reg_10);
3951 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 24), m_temp_reg_10);
3952
3953 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size), m_temp_reg_10);
3954 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 8), m_temp_reg_10);
3955 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 16), m_temp_reg_10);
3956 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 24), m_temp_reg_10);
3957
3958 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size), m_temp_reg_10);
3959 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 8), m_temp_reg_10);
3960 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 16), m_temp_reg_10);
3961 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 24), m_temp_reg_10);
3962
3963 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 4 * trans_size), m_temp_reg_10);
3964 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 4 * trans_size + 8), m_temp_reg_10);
3965 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 4 * trans_size + 16), m_temp_reg_10);
3966 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 4 * trans_size + 24), m_temp_reg_10);
3967
3968 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 5 * trans_size), m_temp_reg_10);
3969 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 5 * trans_size + 8), m_temp_reg_10);
3970 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 5 * trans_size + 16), m_temp_reg_10);
3971 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 5 * trans_size + 24), m_temp_reg_10);
3972
3973 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 6 * trans_size), m_temp_reg_10);
3974 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 6 * trans_size + 8), m_temp_reg_10);
3975 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 6 * trans_size + 16), m_temp_reg_10);
3976 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 6 * trans_size + 24), m_temp_reg_10);
3977
3978 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 7 * trans_size), m_temp_reg_10);
3979 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 7 * trans_size + 8), m_temp_reg_10);
3980 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 7 * trans_size + 16), m_temp_reg_10);
3981 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 7 * trans_size + 24), m_temp_reg_10);
3982
3983 pi2_tmp += 8 * trans_size;
3984 }
3985 }
3986
3987 pi2_tmp = pi2_tmp_orig;
3988
3989 /* Inverse Transform 2nd stage */
3990
3991
3992 for(j = 0; j < trans_size; j += 4)
3993 {
3994 i4_shift = IT_SHIFT_STAGE_2;
3995
3996 /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
3997 if(zero_last28_rows_stg2)
3998 {
3999 {
4000
4001 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90 87
4002 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87
4003 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[6][0]); //80
4004 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[7][0]); //70
4005 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[2][0]); //57
4006 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[19][0]); //43
4007 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[3][0]); //25
4008 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[5][0]); //9
4009
4010 m_temp_reg_10 = _mm_loadu_si128((__m128i *)&pi2_tmp[2 * trans_size]);
4011
4012 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_10, all_zero_reg);
4013
4014 /* eo0[0-3] */
4015 {
4016 m_temp_reg_90 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4017
4018 }
4019 /* eo1[0-3] */
4020 {
4021 m_temp_reg_91 = _mm_madd_epi16(m_temp_reg_10, m_coeff2);
4022
4023 }
4024 /* eo2[0-3] */
4025 {
4026 m_temp_reg_92 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
4027 }
4028
4029 /* eo3[0-3] */
4030 {
4031 m_temp_reg_93 = _mm_madd_epi16(m_temp_reg_10, m_coeff4);
4032 }
4033 /* eo4[0-3] */
4034 {
4035 m_temp_reg_94 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
4036 }
4037
4038 /* eo5[0-3] */
4039 {
4040 m_temp_reg_95 = _mm_madd_epi16(m_temp_reg_10, m_coeff6);
4041 }
4042
4043 /* eo6[0-3] */
4044 {
4045 m_temp_reg_96 = _mm_madd_epi16(m_temp_reg_10, m_coeff7);
4046 }
4047 /* eo7[0-3] */
4048 {
4049 m_temp_reg_97 = _mm_madd_epi16(m_temp_reg_10, m_coeff8);
4050 }
4051 }
4052
4053 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64
4054
4055 m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[0 * trans_size]);
4056
4057 m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg);
4058
4059 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
4060
4061 m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
4062
4063 /* e[]*/
4064
4065 temp1 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_90); /* ee[0] */
4066 temp2 = _mm_sub_epi32(m_temp_reg_14, m_temp_reg_90); /* ee[15] */
4067
4068 temp3 = _mm_add_epi32(m_temp_reg_16, m_temp_reg_91); /* ee[1] */
4069 temp4 = _mm_sub_epi32(m_temp_reg_16, m_temp_reg_91); /* ee[14] */
4070
4071 temp5 = _mm_add_epi32(m_temp_reg_16, m_temp_reg_92); /* ee[2] */
4072 temp6 = _mm_sub_epi32(m_temp_reg_16, m_temp_reg_92); /* ee[13] */
4073
4074 temp7 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_93); /* ee[3] */
4075 temp8 = _mm_sub_epi32(m_temp_reg_14, m_temp_reg_93); /* ee[12] */
4076
4077 m_temp_reg_90 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_94); /* ee[4] */
4078 m_temp_reg_91 = _mm_sub_epi32(m_temp_reg_14, m_temp_reg_94); /* ee[11] */
4079
4080 m_temp_reg_92 = _mm_add_epi32(m_temp_reg_16, m_temp_reg_95); /* ee[5] */
4081 m_temp_reg_93 = _mm_sub_epi32(m_temp_reg_16, m_temp_reg_95); /* ee[10] */
4082
4083 m_temp_reg_94 = _mm_add_epi32(m_temp_reg_16, m_temp_reg_96); /* ee[6] */
4084 m_temp_reg_95 = _mm_sub_epi32(m_temp_reg_16, m_temp_reg_96); /* ee[9] */
4085
4086 m_temp_reg_96 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_97); /* ee[7] */
4087 m_temp_reg_97 = _mm_sub_epi32(m_temp_reg_14, m_temp_reg_97); /* ee[8] */
4088
4089 /*o[k]*/
4090 {
4091
4092 WORD16 *pi2_dst_scratch = temp_ptr;
4093 WORD32 out_stride = 8;
4094
4095 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]);
4096
4097 m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[trans_size]);
4098 m_temp_reg_71 = _mm_loadu_si128((__m128i *)&pi2_tmp[3 * trans_size]);
4099
4100 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 interleaved
4101
4102
4103 /* o0[0-3] */
4104 {
4105 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4106
4107 m_temp_reg_31 = _mm_sub_epi32(temp1, m_temp_reg_20);
4108 m_temp_reg_30 = _mm_add_epi32(temp1, m_temp_reg_20);
4109
4110 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4111 m_count = _mm_cvtsi32_si128(i4_shift);
4112 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4113 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4114
4115 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4116 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4117 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4118 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4119
4120 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4121
4122 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4123 pi2_dst_scratch += out_stride;
4124
4125 }
4126
4127 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]);
4128
4129 /* o1[0-3] */
4130 {
4131 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4132
4133 m_temp_reg_31 = _mm_sub_epi32(temp3, m_temp_reg_20);
4134 m_temp_reg_30 = _mm_add_epi32(temp3, m_temp_reg_20);
4135
4136 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4137 m_count = _mm_cvtsi32_si128(i4_shift);
4138 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4139 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4140
4141 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4142 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4143 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4144 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4145
4146 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4147
4148 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4149 pi2_dst_scratch += out_stride;
4150
4151 }
4152
4153 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]);
4154
4155 /* o2[0-3] */
4156 {
4157 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4158
4159 m_temp_reg_31 = _mm_sub_epi32(temp5, m_temp_reg_20);
4160 m_temp_reg_30 = _mm_add_epi32(temp5, m_temp_reg_20);
4161
4162 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4163 m_count = _mm_cvtsi32_si128(i4_shift);
4164 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4165 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4166
4167 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4168 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4169 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4170 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4171
4172 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4173
4174 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4175 pi2_dst_scratch += out_stride;
4176
4177 }
4178
4179 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]);
4180
4181 /* o3[0-3] */
4182 {
4183 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4184
4185 m_temp_reg_31 = _mm_sub_epi32(temp7, m_temp_reg_20);
4186 m_temp_reg_30 = _mm_add_epi32(temp7, m_temp_reg_20);
4187
4188 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4189 m_count = _mm_cvtsi32_si128(i4_shift);
4190 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4191 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4192
4193 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4194 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4195 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4196 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4197
4198 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4199
4200 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4201 pi2_dst_scratch += out_stride;
4202
4203 }
4204
4205 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]);
4206
4207 /* o4[0-3] */
4208 {
4209 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4210
4211 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_90, m_temp_reg_20);
4212 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_90, m_temp_reg_20);
4213
4214 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4215 m_count = _mm_cvtsi32_si128(i4_shift);
4216 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4217 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4218
4219 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4220 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4221 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4222 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4223
4224 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4225
4226 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4227 pi2_dst_scratch += out_stride;
4228
4229 }
4230
4231 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]);
4232
4233 /* o5[0-3] */
4234 {
4235 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4236
4237 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_92, m_temp_reg_20);
4238 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_92, m_temp_reg_20);
4239
4240 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4241 m_count = _mm_cvtsi32_si128(i4_shift);
4242 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4243 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4244
4245 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4246 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4247 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4248 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4249
4250 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4251
4252 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4253 pi2_dst_scratch += out_stride;
4254
4255 }
4256
4257 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]);
4258
4259 /* o6[0-3] */
4260 {
4261 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4262
4263 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_94, m_temp_reg_20);
4264 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_94, m_temp_reg_20);
4265
4266 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4267 m_count = _mm_cvtsi32_si128(i4_shift);
4268 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4269 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4270
4271 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4272 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4273 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4274 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4275
4276 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4277
4278 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4279 pi2_dst_scratch += out_stride;
4280
4281 }
4282
4283 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]);
4284
4285 /* o7[0-3] */
4286 {
4287 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4288
4289 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_96, m_temp_reg_20);
4290 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_96, m_temp_reg_20);
4291
4292 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4293 m_count = _mm_cvtsi32_si128(i4_shift);
4294 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4295 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4296
4297 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4298 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4299 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4300 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4301
4302 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4303
4304 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4305 pi2_dst_scratch += 8;
4306
4307 }
4308
4309 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]);
4310
4311 /* o8[0-3] */
4312 {
4313 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4314
4315 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_97, m_temp_reg_20);
4316 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_97, m_temp_reg_20);
4317
4318 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4319 m_count = _mm_cvtsi32_si128(i4_shift);
4320 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4321 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4322
4323 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4324 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4325 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4326 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4327
4328 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4329
4330 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4331 pi2_dst_scratch += out_stride;
4332 }
4333
4334 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]);
4335
4336 /* o9[0-3] */
4337 {
4338 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4339
4340 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_95, m_temp_reg_20);
4341 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_95, m_temp_reg_20);
4342
4343 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4344 m_count = _mm_cvtsi32_si128(i4_shift);
4345 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4346 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4347
4348 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4349 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4350 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4351 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4352
4353 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4354
4355 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4356 pi2_dst_scratch += out_stride;
4357
4358 }
4359
4360 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]);
4361
4362 /* o10[0-3] */
4363 {
4364 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4365
4366 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_93, m_temp_reg_20);
4367 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_93, m_temp_reg_20);
4368
4369 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4370 m_count = _mm_cvtsi32_si128(i4_shift);
4371 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4372 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4373
4374 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4375 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4376 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4377 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4378
4379 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4380
4381 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4382 pi2_dst_scratch += out_stride;
4383 }
4384
4385 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]);
4386
4387 /* o11[0-3] */
4388 {
4389 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4390
4391 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_91, m_temp_reg_20);
4392 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_91, m_temp_reg_20);
4393
4394 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4395 m_count = _mm_cvtsi32_si128(i4_shift);
4396 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4397 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4398
4399 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4400 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4401 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4402 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4403
4404 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4405
4406 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4407 pi2_dst_scratch += out_stride;
4408
4409 }
4410
4411 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]);
4412
4413 /* o12[0-3] */
4414 {
4415 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4416
4417 m_temp_reg_31 = _mm_add_epi32(temp8, m_temp_reg_20);
4418 m_temp_reg_30 = _mm_sub_epi32(temp8, m_temp_reg_20);
4419
4420 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4421 m_count = _mm_cvtsi32_si128(i4_shift);
4422 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4423 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4424
4425 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4426 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4427 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4428 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4429
4430 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4431
4432 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4433 pi2_dst_scratch += out_stride;
4434
4435 }
4436
4437 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]);
4438
4439 /* o13[0-3] */
4440 {
4441 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4442
4443 m_temp_reg_31 = _mm_add_epi32(temp6, m_temp_reg_20);
4444 m_temp_reg_30 = _mm_sub_epi32(temp6, m_temp_reg_20);
4445
4446 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4447 m_count = _mm_cvtsi32_si128(i4_shift);
4448 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4449 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4450
4451 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4452 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4453 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4454 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4455
4456 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4457
4458 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4459 pi2_dst_scratch += out_stride;
4460 }
4461
4462 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]);
4463
4464 /* o14[0-3] */
4465 {
4466 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4467
4468 m_temp_reg_31 = _mm_add_epi32(temp4, m_temp_reg_20);
4469 m_temp_reg_30 = _mm_sub_epi32(temp4, m_temp_reg_20);
4470
4471 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4472 m_count = _mm_cvtsi32_si128(i4_shift);
4473 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4474 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4475
4476 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4477 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4478 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4479 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4480
4481 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4482
4483 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4484 pi2_dst_scratch += out_stride;
4485
4486 }
4487
4488 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]);
4489
4490 /* o15[0-3] */
4491 {
4492 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4493
4494 m_temp_reg_31 = _mm_add_epi32(temp2, m_temp_reg_20);
4495 m_temp_reg_30 = _mm_sub_epi32(temp2, m_temp_reg_20);
4496
4497 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4498 m_count = _mm_cvtsi32_si128(i4_shift);
4499 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4500 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4501
4502 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4503 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4504 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4505 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4506
4507 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4508
4509 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4510 pi2_dst_scratch += 8;
4511 }
4512
4513 }
4514
4515 }
4516 else if(zero_last24_rows_stg2)
4517 {
4518 /* eo */
4519 {
4520 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90 87
4521
4522 m_temp_reg_10 = _mm_loadu_si128((__m128i *)&pi2_tmp[2 * trans_size]);
4523 m_temp_reg_11 = _mm_loadu_si128((__m128i *)&pi2_tmp[6 * trans_size]);
4524
4525 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_10, m_temp_reg_11);
4526
4527
4528 /* eo0[0-3] */
4529 {
4530 m_temp_reg_90 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4531
4532 }
4533
4534 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87 57
4535
4536 /* eo1[0-3] */
4537 {
4538 m_temp_reg_91 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4539
4540 }
4541 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[8][0]); //80 9
4542
4543 /* eo2[0-3] */
4544 {
4545 m_temp_reg_92 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4546
4547 }
4548
4549 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[12][0]); //70 -43
4550
4551 /* eo3[0-3] */
4552 {
4553
4554 m_temp_reg_93 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4555
4556 }
4557
4558 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[16][0]); //57 -80
4559
4560 /* eo4[0-3] */
4561 {
4562 m_temp_reg_94 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4563
4564 }
4565
4566 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[20][0]); //43 -90
4567
4568 /* eo5[0-3] */
4569 {
4570 m_temp_reg_95 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4571 }
4572
4573 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[24][0]); //25 -70
4574 /* eo6[0-3] */
4575 {
4576 m_temp_reg_96 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4577 }
4578
4579 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[28][0]); //9 -25
4580 /* eo7[0-3] */
4581 {
4582 m_temp_reg_97 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4583
4584 }
4585
4586 }
4587
4588 /* eeo */
4589 {
4590
4591 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[4][0]); //89 75
4592 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[6][0]); //75
4593 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[9][0]); //18
4594 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[8][0]); //50
4595
4596 m_temp_reg_72 = _mm_loadu_si128((__m128i *)&pi2_tmp[4 * trans_size]);
4597
4598 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_72, all_zero_reg);
4599
4600 /* eeo0[0-3] */
4601 {
4602 temp1 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4603
4604 }
4605
4606 /* eeo1[0-3] */
4607 {
4608 temp2 = _mm_madd_epi16(m_temp_reg_10, m_coeff2);
4609
4610 }
4611
4612 /* eeo2[0-3] */
4613 {
4614 temp3 = _mm_madd_epi16(m_temp_reg_10, m_coeff4);
4615
4616 }
4617
4618
4619 /* eeo3[0-3] */
4620 {
4621 temp4 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
4622
4623 }
4624
4625 }
4626
4627 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[0][0]); //83
4628 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[1][0]); //36
4629 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64
4630
4631 m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[0 * trans_size]);
4632
4633 //m_temp_reg_1 = _mm_cvtepi16_epi32(m_temp_reg_70);
4634 m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg);
4635
4636 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
4637 m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
4638
4639 m_temp_reg_70 = _mm_add_epi32(m_temp_reg_14, temp1); /* ee[0] */
4640 m_temp_reg_71 = _mm_sub_epi32(m_temp_reg_14, temp1); /* ee[7] */
4641
4642 m_temp_reg_72 = _mm_add_epi32(m_temp_reg_16, temp2); /* ee[1] */
4643 m_temp_reg_73 = _mm_sub_epi32(m_temp_reg_16, temp2); /* ee[6] */
4644
4645 m_temp_reg_74 = _mm_add_epi32(m_temp_reg_16, temp3); /* ee[2] */
4646 m_temp_reg_75 = _mm_sub_epi32(m_temp_reg_16, temp3); /* ee[5] */
4647
4648 m_temp_reg_76 = _mm_add_epi32(m_temp_reg_14, temp4); /* ee[3] */
4649 m_temp_reg_77 = _mm_sub_epi32(m_temp_reg_14, temp4); /* ee[4] */
4650
4651 /* e[]*/
4652
4653 temp1 = _mm_add_epi32(m_temp_reg_70, m_temp_reg_90); /* ee[0] */
4654 temp2 = _mm_sub_epi32(m_temp_reg_70, m_temp_reg_90); /* ee[15] */
4655
4656 temp3 = _mm_add_epi32(m_temp_reg_72, m_temp_reg_91); /* ee[1] */
4657 temp4 = _mm_sub_epi32(m_temp_reg_72, m_temp_reg_91); /* ee[14] */
4658
4659 temp5 = _mm_add_epi32(m_temp_reg_74, m_temp_reg_92); /* ee[2] */
4660 temp6 = _mm_sub_epi32(m_temp_reg_74, m_temp_reg_92); /* ee[13] */
4661
4662 temp7 = _mm_add_epi32(m_temp_reg_76, m_temp_reg_93); /* ee[3] */
4663 temp8 = _mm_sub_epi32(m_temp_reg_76, m_temp_reg_93); /* ee[12] */
4664
4665 m_temp_reg_90 = _mm_add_epi32(m_temp_reg_77, m_temp_reg_94); /* ee[4] */
4666 m_temp_reg_91 = _mm_sub_epi32(m_temp_reg_77, m_temp_reg_94); /* ee[11] */
4667
4668 m_temp_reg_92 = _mm_add_epi32(m_temp_reg_75, m_temp_reg_95); /* ee[5] */
4669 m_temp_reg_93 = _mm_sub_epi32(m_temp_reg_75, m_temp_reg_95); /* ee[10] */
4670
4671 m_temp_reg_94 = _mm_add_epi32(m_temp_reg_73, m_temp_reg_96); /* ee[6] */
4672 m_temp_reg_95 = _mm_sub_epi32(m_temp_reg_73, m_temp_reg_96); /* ee[9] */
4673
4674 m_temp_reg_96 = _mm_add_epi32(m_temp_reg_71, m_temp_reg_97); /* ee[7] */
4675 m_temp_reg_97 = _mm_sub_epi32(m_temp_reg_71, m_temp_reg_97); /* ee[8] */
4676
4677 /*o[k] */
4678 {
4679
4680 WORD16 *pi2_dst_scratch = temp_ptr;
4681 WORD32 out_stride = 8;
4682
4683 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]);
4684 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[1][0]);
4685
4686 m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[trans_size]);
4687 m_temp_reg_71 = _mm_loadu_si128((__m128i *)&pi2_tmp[3 * trans_size]);
4688 m_temp_reg_72 = _mm_loadu_si128((__m128i *)&pi2_tmp[5 * trans_size]);
4689 m_temp_reg_73 = _mm_loadu_si128((__m128i *)&pi2_tmp[7 * trans_size]);
4690
4691 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71);
4692 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73);
4693
4694 /* o0[0-3] */
4695 {
4696 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4697 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
4698
4699 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
4700
4701 m_temp_reg_31 = _mm_sub_epi32(temp1, m_temp_reg_20);
4702 m_temp_reg_30 = _mm_add_epi32(temp1, m_temp_reg_20);
4703
4704 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4705 m_count = _mm_cvtsi32_si128(i4_shift);
4706 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4707 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4708
4709 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4710 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4711 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4712 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4713
4714 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4715
4716 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4717 pi2_dst_scratch += out_stride;
4718
4719 }
4720
4721
4722 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]);
4723 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[9][0]);
4724
4725 /* o1[0-3] */
4726 {
4727 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4728 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
4729
4730 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
4731
4732 m_temp_reg_31 = _mm_sub_epi32(temp3, m_temp_reg_20);
4733 m_temp_reg_30 = _mm_add_epi32(temp3, m_temp_reg_20);
4734
4735 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4736 m_count = _mm_cvtsi32_si128(i4_shift);
4737 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4738 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4739
4740 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4741 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4742 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4743 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4744
4745 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4746
4747 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4748 pi2_dst_scratch += out_stride;
4749
4750 }
4751
4752 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]);
4753 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[17][0]);
4754
4755 /* o2[0-3] */
4756 {
4757 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4758 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
4759
4760 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
4761
4762 m_temp_reg_31 = _mm_add_epi32(temp5, m_temp_reg_20);
4763 m_temp_reg_30 = _mm_sub_epi32(temp5, m_temp_reg_20);
4764
4765 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4766 m_count = _mm_cvtsi32_si128(i4_shift);
4767 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4768 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4769
4770 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4771 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4772 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4773 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4774
4775 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4776
4777 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4778 pi2_dst_scratch += out_stride;
4779
4780 }
4781
4782 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]);
4783 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[25][0]);
4784
4785 /* o3[0-3] */
4786 {
4787 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4788 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
4789
4790 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
4791
4792 m_temp_reg_31 = _mm_add_epi32(temp7, m_temp_reg_20);
4793 m_temp_reg_30 = _mm_sub_epi32(temp7, m_temp_reg_20);
4794
4795 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4796 m_count = _mm_cvtsi32_si128(i4_shift);
4797 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4798 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4799
4800 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4801 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4802 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4803 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4804
4805 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4806
4807 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4808 pi2_dst_scratch += out_stride;
4809
4810 }
4811
4812 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]);
4813 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[33][0]);
4814
4815 /* o4[0-3] */
4816 {
4817 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4818 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
4819
4820 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
4821
4822 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_90, m_temp_reg_20);
4823 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_90, m_temp_reg_20);
4824
4825 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4826 m_count = _mm_cvtsi32_si128(i4_shift);
4827 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4828 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4829
4830 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4831 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4832 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4833 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4834
4835 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4836
4837 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4838 pi2_dst_scratch += out_stride;
4839
4840 }
4841
4842 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]);
4843 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[41][0]);
4844
4845 /* o5[0-3] */
4846 {
4847 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4848 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
4849
4850 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
4851
4852 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_92, m_temp_reg_20);
4853 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_92, m_temp_reg_20);
4854
4855 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4856 m_count = _mm_cvtsi32_si128(i4_shift);
4857 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4858 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4859
4860 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4861 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4862 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4863 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4864
4865 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4866
4867 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4868 pi2_dst_scratch += out_stride;
4869
4870 }
4871
4872 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]);
4873 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[49][0]);
4874
4875 /* o6[0-3] */
4876 {
4877 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4878 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
4879
4880 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
4881
4882 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_94, m_temp_reg_20);
4883 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_94, m_temp_reg_20);
4884
4885 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4886 m_count = _mm_cvtsi32_si128(i4_shift);
4887 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4888 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4889
4890 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4891 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4892 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4893 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4894
4895 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4896
4897 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4898 pi2_dst_scratch += out_stride;
4899
4900 }
4901
4902 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]);
4903 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[57][0]);
4904
4905 /* o7[0-3] */
4906 {
4907 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4908 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
4909
4910 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
4911
4912 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_96, m_temp_reg_20);
4913 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_96, m_temp_reg_20);
4914
4915 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4916 m_count = _mm_cvtsi32_si128(i4_shift);
4917 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4918 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4919
4920 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4921 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4922 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4923 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4924
4925 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4926
4927 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4928 pi2_dst_scratch += 8;
4929
4930 }
4931
4932 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]);
4933 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[65][0]);
4934
4935 /* o8[0-3] */
4936 {
4937 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4938 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
4939
4940 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
4941
4942 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_97, m_temp_reg_20);
4943 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_97, m_temp_reg_20);
4944
4945 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4946 m_count = _mm_cvtsi32_si128(i4_shift);
4947 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4948 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4949
4950 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4951 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4952 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4953 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4954
4955 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4956
4957 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4958 pi2_dst_scratch += out_stride;
4959 }
4960
4961 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]);
4962 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[73][0]);
4963
4964 /* o9[0-3] */
4965 {
4966 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4967 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
4968
4969 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
4970
4971 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_95, m_temp_reg_20);
4972 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_95, m_temp_reg_20);
4973
4974 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4975 m_count = _mm_cvtsi32_si128(i4_shift);
4976 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4977 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4978
4979 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4980 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4981 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4982 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4983
4984 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4985
4986 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4987 pi2_dst_scratch += out_stride;
4988 }
4989
4990 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]);
4991 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[81][0]);
4992
4993 /* o10[0-3] */
4994 {
4995 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4996 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
4997
4998 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
4999
5000 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_93, m_temp_reg_20);
5001 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_93, m_temp_reg_20);
5002
5003 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
5004 m_count = _mm_cvtsi32_si128(i4_shift);
5005 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
5006 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
5007
5008 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
5009 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
5010 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
5011 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
5012
5013 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
5014
5015 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
5016 pi2_dst_scratch += out_stride;
5017 }
5018
5019 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]);
5020 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[89][0]);
5021
5022 /* o11[0-3] */
5023 {
5024 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5025 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5026
5027 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
5028
5029 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_91, m_temp_reg_20);
5030 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_91, m_temp_reg_20);
5031
5032 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
5033 m_count = _mm_cvtsi32_si128(i4_shift);
5034 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
5035 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
5036
5037 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
5038 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
5039 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
5040 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
5041
5042 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
5043
5044 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
5045 pi2_dst_scratch += out_stride;
5046
5047 }
5048
5049 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]);
5050 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[97][0]);
5051
5052 /* o12[0-3] */
5053 {
5054 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5055 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5056
5057 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
5058
5059 m_temp_reg_31 = _mm_add_epi32(temp8, m_temp_reg_20);
5060 m_temp_reg_30 = _mm_sub_epi32(temp8, m_temp_reg_20);
5061
5062 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
5063 m_count = _mm_cvtsi32_si128(i4_shift);
5064 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
5065 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
5066
5067 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
5068 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
5069 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
5070 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
5071
5072 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
5073
5074 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
5075 pi2_dst_scratch += out_stride;
5076
5077 }
5078
5079 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]);
5080 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[105][0]);
5081
5082 /* o13[0-3] */
5083 {
5084 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5085 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5086
5087 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
5088
5089 m_temp_reg_31 = _mm_add_epi32(temp6, m_temp_reg_20);
5090 m_temp_reg_30 = _mm_sub_epi32(temp6, m_temp_reg_20);
5091
5092 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
5093 m_count = _mm_cvtsi32_si128(i4_shift);
5094 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
5095 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
5096
5097 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
5098 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
5099 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
5100 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
5101
5102 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
5103
5104 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
5105 pi2_dst_scratch += out_stride;
5106 }
5107
5108 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]);
5109 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[113][0]);
5110
5111 /* o14[0-3] */
5112 {
5113 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5114 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5115
5116 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
5117
5118 m_temp_reg_31 = _mm_add_epi32(temp4, m_temp_reg_20);
5119 m_temp_reg_30 = _mm_sub_epi32(temp4, m_temp_reg_20);
5120
5121 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
5122 m_count = _mm_cvtsi32_si128(i4_shift);
5123 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
5124 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
5125
5126 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
5127 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
5128 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
5129 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
5130
5131 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
5132
5133 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
5134 pi2_dst_scratch += out_stride;
5135 }
5136
5137 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]);
5138 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[121][0]);
5139
5140 /* o15[0-3] */
5141 {
5142 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5143 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5144
5145 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
5146
5147 m_temp_reg_31 = _mm_add_epi32(temp2, m_temp_reg_20);
5148 m_temp_reg_30 = _mm_sub_epi32(temp2, m_temp_reg_20);
5149
5150 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
5151 m_count = _mm_cvtsi32_si128(i4_shift);
5152 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
5153 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
5154
5155 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
5156 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
5157 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
5158 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
5159
5160 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
5161
5162 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
5163 pi2_dst_scratch += 8;
5164 }
5165
5166 }
5167 }
5168 else
5169 {
5170 /* eo */
5171 {
5172
5173 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90 87
5174 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[1][0]); //80 70
5175 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[2][0]); //57 43
5176 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[3][0]); //25 9
5177
5178
5179 m_temp_reg_10 = _mm_loadu_si128((__m128i *)&pi2_tmp[2 * trans_size]);
5180 m_temp_reg_11 = _mm_loadu_si128((__m128i *)&pi2_tmp[6 * trans_size]);
5181 m_temp_reg_12 = _mm_loadu_si128((__m128i *)&pi2_tmp[10 * trans_size]);
5182 m_temp_reg_13 = _mm_loadu_si128((__m128i *)&pi2_tmp[14 * trans_size]);
5183 m_temp_reg_18 = _mm_loadu_si128((__m128i *)&pi2_tmp[18 * trans_size]);
5184 m_temp_reg_19 = _mm_loadu_si128((__m128i *)&pi2_tmp[22 * trans_size]);
5185 m_temp_reg_20 = _mm_loadu_si128((__m128i *)&pi2_tmp[26 * trans_size]);
5186 m_temp_reg_21 = _mm_loadu_si128((__m128i *)&pi2_tmp[30 * trans_size]);
5187
5188 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_10, m_temp_reg_11);
5189 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_12, m_temp_reg_13);
5190 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_18, m_temp_reg_19);
5191 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_20, m_temp_reg_21);
5192
5193 /* eo0[0-3] */
5194 {
5195 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5196 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5197
5198 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
5199
5200 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
5201 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
5202
5203 m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
5204
5205 m_temp_reg_90 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
5206
5207 }
5208
5209 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87 57
5210 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[5][0]); //0 -43
5211 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[6][0]); //80 90
5212 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[7][0]); //70 25
5213
5214 /* eo1[0-3] */
5215 {
5216 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5217 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5218
5219 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
5220
5221 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
5222 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
5223
5224 m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
5225
5226 m_temp_reg_91 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_32);
5227
5228 }
5229
5230 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[8][0]); //80 9
5231 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[9][0]); //70 87
5232 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[10][0]); //-25 57
5233 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[11][0]); //90 43
5234
5235 /* eo2[0-3] */
5236 {
5237 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5238 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5239
5240 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_31);
5241
5242 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
5243 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
5244
5245 m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
5246
5247 m_temp_reg_92 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
5248
5249 }
5250
5251 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[12][0]); //70 -43
5252 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[13][0]); //-87 9
5253 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[14][0]); //90 25
5254 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[15][0]); //80 57
5255
5256 /* eo3[0-3] */
5257 {
5258 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5259 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5260
5261 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
5262
5263 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
5264 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
5265
5266 m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_32, m_temp_reg_33);
5267
5268 m_temp_reg_93 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
5269
5270 }
5271
5272 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[16][0]); //57 -80
5273 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[17][0]); //-25 90
5274 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[18][0]); //9 87
5275 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[19][0]); //43 70
5276
5277
5278 /* eo4[0-3] */
5279 {
5280 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5281 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5282
5283 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
5284
5285 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
5286 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
5287
5288 m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_33, m_temp_reg_32);
5289
5290 m_temp_reg_94 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
5291
5292 }
5293
5294 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[20][0]); //43 -90
5295 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[21][0]); //57 25
5296 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[22][0]); //-87 70
5297 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[23][0]); //9 -80
5298
5299 /* eo5[0-3] */
5300 {
5301 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5302 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5303
5304 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
5305
5306 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
5307 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
5308
5309 m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
5310
5311 m_temp_reg_95 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
5312 }
5313
5314 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[24][0]); //25 -70
5315 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[25][0]); //90 -80
5316 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[26][0]); //43 9
5317 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[27][0]); //-57 87
5318
5319 /* eo6[0-3] */
5320 {
5321 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5322 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5323
5324 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
5325
5326 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
5327 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
5328
5329 m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
5330
5331 m_temp_reg_96 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
5332
5333 }
5334
5335 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[28][0]); //9 -25
5336 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[29][0]); //43 -57
5337 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[30][0]); //70 -80
5338 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[31][0]); //87 -90
5339
5340 /* eo7[0-3] */
5341 {
5342 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5343 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5344
5345 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
5346
5347 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
5348 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
5349
5350 m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
5351
5352 m_temp_reg_97 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
5353
5354
5355 }
5356
5357 }
5358
5359 /* eeo */
5360 {
5361 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[4][0]); //89 75
5362 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[5][0]); //50 18
5363
5364 m_temp_reg_72 = _mm_loadu_si128((__m128i *)&pi2_tmp[4 * trans_size]);
5365 m_temp_reg_76 = _mm_loadu_si128((__m128i *)&pi2_tmp[12 * trans_size]);
5366 m_temp_reg_82 = _mm_loadu_si128((__m128i *)&pi2_tmp[20 * trans_size]);
5367 m_temp_reg_86 = _mm_loadu_si128((__m128i *)&pi2_tmp[28 * trans_size]);
5368
5369 /* eeo0[0-3] */
5370 {
5371
5372 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
5373 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_82, m_temp_reg_86);
5374
5375 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5376 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5377
5378 temp1 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
5379
5380 }
5381
5382 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[6][0]); //75 -18
5383 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[7][0]); //89 50
5384
5385 /* eeo1[0-3] */
5386 {
5387 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
5388 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4);
5389
5390 temp2 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_31);
5391
5392 }
5393
5394 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[8][0]); //50 -89
5395 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[9][0]); //18 75
5396
5397 /* eeo2[0-3] */
5398 {
5399 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
5400 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4);
5401
5402 temp3 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
5403
5404 }
5405
5406 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[10][0]); //18 -50
5407 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[11][0]); //75 -89
5408
5409 /* eeo3[0-3] */
5410 {
5411 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
5412 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4);
5413
5414 temp4 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
5415
5416 }
5417
5418
5419 }
5420
5421 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[0][0]); //83 36
5422 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[1][0]); //36 -83
5423
5424 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64 64
5425 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[3][0]); //64 -64
5426
5427 m_temp_reg_74 = _mm_loadu_si128((__m128i *)&pi2_tmp[8 * trans_size]);
5428 m_temp_reg_84 = _mm_loadu_si128((__m128i *)&pi2_tmp[24 * trans_size]);
5429
5430 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_84);
5431
5432 m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[0 * trans_size]);
5433 m_temp_reg_80 = _mm_loadu_si128((__m128i *)&pi2_tmp[16 * trans_size]);
5434
5435 m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_80);
5436
5437 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); /* eeeo[0] */
5438 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); /* eeeo[1] */
5439
5440 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff3); /* eeee[0] */
5441 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff4); /* eeee[1] */
5442
5443 /* eeeo[0]= m_temp_reg_20 */
5444 /* eeeo[1]= m_temp_reg_22 */
5445 /* eeee[0]= m_temp_reg_21 */
5446 /* eeee[1]= m_temp_reg_23 */
5447
5448 /* eee[0] = eeee[0] + eeeo[0]; */
5449 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20); /* eeeo[0] */
5450
5451 /* eee[3] = eeee[0] - eeeo[0]; */
5452 m_temp_reg_43 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20); /* eeeo[1] */
5453
5454 /* eee[2] = eeee[1] - eeeo[1]; */
5455 m_temp_reg_42 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_22); /* eeee[1] */
5456
5457 /* eee[1] = eeee[1] + eeeo[1];*/
5458 m_temp_reg_41 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_22); /* eeee[0] */
5459
5460 m_temp_reg_70 = _mm_add_epi32(m_temp_reg_40, temp1); /* ee[0] */
5461 m_temp_reg_71 = _mm_sub_epi32(m_temp_reg_40, temp1); /* ee[7] */
5462
5463 m_temp_reg_72 = _mm_add_epi32(m_temp_reg_41, temp2); /* ee[1] */
5464 m_temp_reg_73 = _mm_sub_epi32(m_temp_reg_41, temp2); /* ee[6] */
5465
5466 m_temp_reg_74 = _mm_add_epi32(m_temp_reg_42, temp3); /* ee[2] */
5467 m_temp_reg_75 = _mm_sub_epi32(m_temp_reg_42, temp3); /* ee[5] */
5468
5469 m_temp_reg_76 = _mm_add_epi32(m_temp_reg_43, temp4); /* ee[3] */
5470 m_temp_reg_77 = _mm_sub_epi32(m_temp_reg_43, temp4); /* ee[4] */
5471
5472 /* e[]*/
5473
5474 temp1 = _mm_add_epi32(m_temp_reg_70, m_temp_reg_90); /* ee[0] */
5475 temp2 = _mm_sub_epi32(m_temp_reg_70, m_temp_reg_90); /* ee[15] */
5476
5477 temp3 = _mm_add_epi32(m_temp_reg_72, m_temp_reg_91); /* ee[1] */
5478 temp4 = _mm_sub_epi32(m_temp_reg_72, m_temp_reg_91); /* ee[14] */
5479
5480 temp5 = _mm_add_epi32(m_temp_reg_74, m_temp_reg_92); /* ee[2] */
5481 temp6 = _mm_sub_epi32(m_temp_reg_74, m_temp_reg_92); /* ee[13] */
5482
5483 temp7 = _mm_add_epi32(m_temp_reg_76, m_temp_reg_93); /* ee[3] */
5484 temp8 = _mm_sub_epi32(m_temp_reg_76, m_temp_reg_93); /* ee[12] */
5485
5486 m_temp_reg_90 = _mm_add_epi32(m_temp_reg_77, m_temp_reg_94); /* ee[4] */
5487 m_temp_reg_91 = _mm_sub_epi32(m_temp_reg_77, m_temp_reg_94); /* ee[11] */
5488
5489 m_temp_reg_92 = _mm_add_epi32(m_temp_reg_75, m_temp_reg_95); /* ee[5] */
5490 m_temp_reg_93 = _mm_sub_epi32(m_temp_reg_75, m_temp_reg_95); /* ee[10] */
5491
5492 m_temp_reg_94 = _mm_add_epi32(m_temp_reg_73, m_temp_reg_96); /* ee[6] */
5493 m_temp_reg_95 = _mm_sub_epi32(m_temp_reg_73, m_temp_reg_96); /* ee[9] */
5494
5495 m_temp_reg_96 = _mm_add_epi32(m_temp_reg_71, m_temp_reg_97); /* ee[7] */
5496 m_temp_reg_97 = _mm_sub_epi32(m_temp_reg_71, m_temp_reg_97); /* ee[8] */
5497
5498 /*o[k] */
5499 {
5500
5501 WORD16 *pi2_dst_scratch = temp_ptr;
5502 WORD32 out_stride = 8;
5503
5504 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]);
5505 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[1][0]);
5506 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[2][0]);
5507 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[3][0]);
5508 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[4][0]);
5509 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[5][0]);
5510 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[6][0]);
5511 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[7][0]);
5512
5513
5514 m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[trans_size]);
5515 m_temp_reg_71 = _mm_loadu_si128((__m128i *)&pi2_tmp[3 * trans_size]);
5516 m_temp_reg_72 = _mm_loadu_si128((__m128i *)&pi2_tmp[5 * trans_size]);
5517 m_temp_reg_73 = _mm_loadu_si128((__m128i *)&pi2_tmp[7 * trans_size]);
5518 m_temp_reg_74 = _mm_loadu_si128((__m128i *)&pi2_tmp[9 * trans_size]);
5519 m_temp_reg_75 = _mm_loadu_si128((__m128i *)&pi2_tmp[11 * trans_size]);
5520 m_temp_reg_76 = _mm_loadu_si128((__m128i *)&pi2_tmp[13 * trans_size]);
5521 m_temp_reg_77 = _mm_loadu_si128((__m128i *)&pi2_tmp[15 * trans_size]);
5522
5523 m_temp_reg_80 = _mm_loadu_si128((__m128i *)&pi2_tmp[17 * trans_size]);
5524 m_temp_reg_81 = _mm_loadu_si128((__m128i *)&pi2_tmp[19 * trans_size]);
5525 m_temp_reg_82 = _mm_loadu_si128((__m128i *)&pi2_tmp[21 * trans_size]);
5526 m_temp_reg_83 = _mm_loadu_si128((__m128i *)&pi2_tmp[23 * trans_size]);
5527 m_temp_reg_84 = _mm_loadu_si128((__m128i *)&pi2_tmp[25 * trans_size]);
5528 m_temp_reg_85 = _mm_loadu_si128((__m128i *)&pi2_tmp[27 * trans_size]);
5529 m_temp_reg_86 = _mm_loadu_si128((__m128i *)&pi2_tmp[29 * trans_size]);
5530 m_temp_reg_87 = _mm_loadu_si128((__m128i *)&pi2_tmp[31 * trans_size]);
5531
5532 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 interleaved
5533 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 interleaved
5534 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 interleaved
5535 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 interleaved
5536 m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_80, m_temp_reg_81); //row 17 and row 19 interleaved
5537 m_temp_reg_15 = _mm_unpacklo_epi16(m_temp_reg_82, m_temp_reg_83); //row 21 and row 23 interleaved
5538 m_temp_reg_16 = _mm_unpacklo_epi16(m_temp_reg_84, m_temp_reg_85); //row 25 and row 27 interleaved
5539 m_temp_reg_17 = _mm_unpacklo_epi16(m_temp_reg_86, m_temp_reg_87); //row 29 and row 31 interleaved
5540
5541 /* o0[0-3] */
5542 {
5543 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5544 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5545 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
5546 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
5547
5548 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
5549 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
5550
5551 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
5552
5553 m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
5554 m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
5555 m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
5556 m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
5557
5558 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
5559 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
5560
5561 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
5562
5563 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
5564
5565 m_temp_reg_31 = _mm_sub_epi32(temp1, m_temp_reg_20);
5566 m_temp_reg_30 = _mm_add_epi32(temp1, m_temp_reg_20);
5567
5568 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
5569 m_count = _mm_cvtsi32_si128(i4_shift);
5570 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
5571 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
5572
5573 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
5574 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
5575 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
5576 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
5577
5578 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
5579
5580 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
5581 pi2_dst_scratch += out_stride;
5582
5583 }
5584
5585 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]);
5586 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[9][0]);
5587 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[10][0]);
5588 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[11][0]);
5589 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[12][0]);
5590 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[13][0]);
5591 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[14][0]);
5592 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[15][0]);
5593
5594 /* o1[0-3] */
5595 {
5596 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5597 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5598 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
5599 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
5600
5601 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
5602 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
5603
5604 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_20);
5605
5606 m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
5607 m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
5608 m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
5609 m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
5610
5611 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
5612 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
5613
5614 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
5615
5616 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
5617
5618 m_temp_reg_31 = _mm_add_epi32(temp3, m_temp_reg_20);
5619 m_temp_reg_30 = _mm_sub_epi32(temp3, m_temp_reg_20);
5620
5621 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
5622 m_count = _mm_cvtsi32_si128(i4_shift);
5623 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
5624 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
5625
5626 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
5627 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
5628 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
5629 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
5630
5631 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
5632
5633 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
5634 pi2_dst_scratch += out_stride;
5635
5636 }
5637
5638 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]);
5639 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[17][0]);
5640 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[18][0]);
5641 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[19][0]);
5642 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[20][0]);
5643 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[21][0]);
5644 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[22][0]);
5645 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[23][0]);
5646
5647 /* o2[0-3] */
5648 {
5649 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5650 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5651 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
5652 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
5653
5654 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
5655 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
5656
5657 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
5658
5659 m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
5660 m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
5661 m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
5662 m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
5663
5664 m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_41);
5665 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
5666
5667 m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_42);
5668
5669 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
5670
5671 m_temp_reg_31 = _mm_add_epi32(temp5, m_temp_reg_20);
5672 m_temp_reg_30 = _mm_sub_epi32(temp5, m_temp_reg_20);
5673
5674 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
5675 m_count = _mm_cvtsi32_si128(i4_shift);
5676 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
5677 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
5678
5679 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
5680 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
5681 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
5682 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
5683
5684 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
5685
5686 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
5687 pi2_dst_scratch += out_stride;
5688
5689 }
5690
5691 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]);
5692 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[25][0]);
5693 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[26][0]);
5694 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[27][0]);
5695 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[28][0]);
5696 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[29][0]);
5697 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[30][0]);
5698 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[31][0]);
5699
5700 /* o3[0-3] */
5701 {
5702 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5703 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5704 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
5705 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
5706
5707 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
5708 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
5709
5710 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
5711
5712 m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
5713 m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
5714 m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
5715 m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
5716
5717 m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_40);
5718 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
5719
5720 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
5721
5722 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
5723
5724 m_temp_reg_31 = _mm_add_epi32(temp7, m_temp_reg_20);
5725 m_temp_reg_30 = _mm_sub_epi32(temp7, m_temp_reg_20);
5726
5727 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
5728 m_count = _mm_cvtsi32_si128(i4_shift);
5729 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
5730 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
5731
5732 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
5733 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
5734 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
5735 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
5736
5737 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
5738
5739 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
5740 pi2_dst_scratch += out_stride;
5741
5742 }
5743
5744 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]);
5745 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[33][0]);
5746 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[34][0]);
5747 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[35][0]);
5748 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[36][0]);
5749 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[37][0]);
5750 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[38][0]);
5751 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[39][0]);
5752
5753 /* o4[0-3] */
5754 {
5755 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5756 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5757 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
5758 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
5759
5760 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
5761 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
5762
5763 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
5764
5765 m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
5766 m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
5767 m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
5768 m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
5769
5770 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
5771 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
5772
5773 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
5774
5775 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
5776
5777 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_90, m_temp_reg_20);
5778 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_90, m_temp_reg_20);
5779 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
5780 m_count = _mm_cvtsi32_si128(i4_shift);
5781 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
5782 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
5783
5784 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
5785 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
5786 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
5787 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
5788
5789 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
5790
5791 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
5792 pi2_dst_scratch += out_stride;
5793
5794 }
5795
5796 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]);
5797 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[41][0]);
5798 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[42][0]);
5799 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[43][0]);
5800 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[44][0]);
5801 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[45][0]);
5802 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[46][0]);
5803 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[47][0]);
5804
5805 /* o5[0-3] */
5806 {
5807 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5808 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5809 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
5810 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
5811
5812 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
5813 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
5814
5815 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
5816
5817 m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
5818 m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
5819 m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
5820 m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
5821
5822 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
5823 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
5824
5825 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
5826
5827 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
5828
5829 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_92, m_temp_reg_20);
5830 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_92, m_temp_reg_20);
5831
5832 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
5833 m_count = _mm_cvtsi32_si128(i4_shift);
5834 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
5835 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
5836
5837 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
5838 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
5839 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
5840 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
5841
5842 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
5843
5844 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
5845 pi2_dst_scratch += out_stride;
5846
5847 }
5848
5849 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]);
5850 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[49][0]);
5851 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[50][0]);
5852 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[51][0]);
5853 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[52][0]);
5854 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[53][0]);
5855 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[54][0]);
5856 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[55][0]);
5857
5858 /* o6[0-3] */
5859 {
5860 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5861 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5862 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
5863 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
5864
5865 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
5866 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
5867
5868 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
5869
5870 m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
5871 m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
5872 m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
5873 m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
5874
5875 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
5876 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
5877
5878 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
5879
5880 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
5881
5882 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_94, m_temp_reg_20);
5883 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_94, m_temp_reg_20);
5884
5885 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
5886 m_count = _mm_cvtsi32_si128(i4_shift);
5887 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
5888 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
5889
5890 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
5891 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
5892 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
5893 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
5894
5895 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
5896
5897 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
5898 pi2_dst_scratch += out_stride;
5899
5900 }
5901
5902 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]);
5903 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[57][0]);
5904 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[58][0]);
5905 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[59][0]);
5906 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[60][0]);
5907 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[61][0]);
5908 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[62][0]);
5909 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[63][0]);
5910
5911 /* o7[0-3] */
5912 {
5913 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5914 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5915 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
5916 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
5917
5918 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
5919 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
5920
5921 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
5922
5923 m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
5924 m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
5925 m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
5926 m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
5927
5928 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
5929 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
5930
5931 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
5932
5933 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
5934
5935 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_96, m_temp_reg_20);
5936 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_96, m_temp_reg_20);
5937
5938 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
5939 m_count = _mm_cvtsi32_si128(i4_shift);
5940 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
5941 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
5942
5943 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
5944 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
5945 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
5946 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
5947
5948 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
5949
5950 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
5951 pi2_dst_scratch += 8;
5952
5953 }
5954
5955 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]);
5956 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[65][0]);
5957 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[66][0]);
5958 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[67][0]);
5959 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[68][0]);
5960 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[69][0]);
5961 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[70][0]);
5962 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[71][0]);
5963
5964 /* o8[0-3] */
5965 {
5966 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5967 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5968 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
5969 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
5970
5971 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
5972 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
5973
5974 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
5975
5976 m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
5977 m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
5978 m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
5979 m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
5980
5981 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
5982 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
5983
5984 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
5985
5986 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
5987
5988 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_97, m_temp_reg_20);
5989 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_97, m_temp_reg_20);
5990
5991 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
5992 m_count = _mm_cvtsi32_si128(i4_shift);
5993 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
5994 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
5995
5996 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
5997 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
5998 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
5999 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
6000
6001 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
6002
6003 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
6004 pi2_dst_scratch += out_stride;
6005 }
6006
6007 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]);
6008 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[73][0]);
6009 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[74][0]);
6010 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[75][0]);
6011 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[76][0]);
6012 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[77][0]);
6013 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[78][0]);
6014 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[79][0]);
6015
6016 /* o9[0-3] */
6017 {
6018 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
6019 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
6020 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
6021 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
6022
6023 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
6024 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
6025
6026 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
6027
6028 m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
6029 m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
6030 m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
6031 m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
6032
6033 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
6034 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
6035
6036 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
6037
6038 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
6039
6040 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_95, m_temp_reg_20);
6041 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_95, m_temp_reg_20);
6042
6043 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
6044 m_count = _mm_cvtsi32_si128(i4_shift);
6045 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
6046 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
6047
6048 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
6049 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
6050 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
6051 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
6052
6053 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
6054
6055 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
6056 pi2_dst_scratch += out_stride;
6057 }
6058
6059 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]);
6060 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[81][0]);
6061 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[82][0]);
6062 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[83][0]);
6063 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[84][0]);
6064 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[85][0]);
6065 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[86][0]);
6066 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[87][0]);
6067
6068 /* o10[0-3] */
6069 {
6070 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
6071 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
6072 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
6073 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
6074
6075 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
6076 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
6077
6078 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
6079
6080 m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
6081 m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
6082 m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
6083 m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
6084
6085 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
6086 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
6087
6088 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
6089
6090 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
6091
6092 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_93, m_temp_reg_20);
6093 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_93, m_temp_reg_20);
6094
6095 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
6096 m_count = _mm_cvtsi32_si128(i4_shift);
6097 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
6098 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
6099
6100 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
6101 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
6102 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
6103 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
6104
6105 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
6106
6107 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
6108 pi2_dst_scratch += out_stride;
6109 }
6110
6111
6112 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]);
6113 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[89][0]);
6114 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[90][0]);
6115 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[91][0]);
6116 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[92][0]);
6117 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[93][0]);
6118 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[94][0]);
6119 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[95][0]);
6120
6121 /* o11[0-3] */
6122 {
6123 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
6124 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
6125 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
6126 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
6127
6128 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
6129 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
6130
6131 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
6132
6133 m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
6134 m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
6135 m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
6136 m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
6137
6138 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
6139 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
6140
6141 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
6142
6143 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
6144
6145 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_91, m_temp_reg_20);
6146 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_91, m_temp_reg_20);
6147
6148 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
6149 m_count = _mm_cvtsi32_si128(i4_shift);
6150 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
6151 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
6152
6153 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
6154 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
6155 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
6156 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
6157
6158 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
6159
6160 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
6161 pi2_dst_scratch += out_stride;
6162
6163 }
6164
6165 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]);
6166 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[97][0]);
6167 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[98][0]);
6168 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[99][0]);
6169 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[100][0]);
6170 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[101][0]);
6171 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[102][0]);
6172 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[103][0]);
6173
6174 /* o12[0-3] */
6175 {
6176 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
6177 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
6178 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
6179 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
6180
6181 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
6182 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
6183
6184 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
6185
6186 m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
6187 m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
6188 m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
6189 m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
6190
6191 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
6192 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
6193
6194 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
6195
6196 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
6197
6198 m_temp_reg_31 = _mm_add_epi32(temp8, m_temp_reg_20);
6199 m_temp_reg_30 = _mm_sub_epi32(temp8, m_temp_reg_20);
6200
6201 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
6202 m_count = _mm_cvtsi32_si128(i4_shift);
6203 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
6204 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
6205
6206 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
6207 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
6208 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
6209 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
6210
6211 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
6212
6213 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
6214 pi2_dst_scratch += out_stride;
6215
6216 }
6217
6218 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]);
6219 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[105][0]);
6220 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[106][0]);
6221 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[107][0]);
6222 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[108][0]);
6223 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[109][0]);
6224 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[110][0]);
6225 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[111][0]);
6226
6227 /* o13[0-3] */
6228 {
6229 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
6230 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
6231 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
6232 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
6233
6234 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
6235 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
6236
6237 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
6238
6239 m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
6240 m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
6241 m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
6242 m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
6243
6244 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
6245 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
6246
6247 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
6248
6249 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
6250
6251 m_temp_reg_31 = _mm_add_epi32(temp6, m_temp_reg_20);
6252 m_temp_reg_30 = _mm_sub_epi32(temp6, m_temp_reg_20);
6253
6254 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
6255 m_count = _mm_cvtsi32_si128(i4_shift);
6256 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
6257 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
6258
6259 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
6260 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
6261 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
6262 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
6263
6264 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
6265
6266 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
6267 pi2_dst_scratch += out_stride;
6268 }
6269
6270 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]);
6271 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[113][0]);
6272 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[114][0]);
6273 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[115][0]);
6274 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[116][0]);
6275 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[117][0]);
6276 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[118][0]);
6277 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[119][0]);
6278
6279 /* o14[0-3] */
6280 {
6281 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
6282 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
6283 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
6284 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
6285
6286 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
6287 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
6288
6289 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
6290
6291 m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
6292 m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
6293 m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
6294 m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
6295
6296 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
6297 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
6298
6299 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
6300
6301 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
6302
6303 m_temp_reg_31 = _mm_add_epi32(temp4, m_temp_reg_20);
6304 m_temp_reg_30 = _mm_sub_epi32(temp4, m_temp_reg_20);
6305
6306 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
6307 m_count = _mm_cvtsi32_si128(i4_shift);
6308 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
6309 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
6310
6311 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
6312 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
6313 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
6314 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
6315
6316 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
6317
6318 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
6319 pi2_dst_scratch += out_stride;
6320
6321 }
6322
6323 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]);
6324 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[121][0]);
6325 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[122][0]);
6326 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[123][0]);
6327 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[124][0]);
6328 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[125][0]);
6329 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[126][0]);
6330 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[127][0]);
6331
6332 /* o15[0-3] */
6333 {
6334 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
6335 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
6336 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
6337 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
6338
6339 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
6340 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
6341
6342 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
6343
6344 m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
6345 m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
6346 m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
6347 m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
6348
6349 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
6350 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
6351
6352 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
6353
6354 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
6355
6356 m_temp_reg_31 = _mm_add_epi32(temp2, m_temp_reg_20);
6357 m_temp_reg_30 = _mm_sub_epi32(temp2, m_temp_reg_20);
6358
6359 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
6360 m_count = _mm_cvtsi32_si128(i4_shift);
6361 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
6362 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
6363
6364 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
6365 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
6366 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
6367 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
6368
6369 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
6370
6371 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
6372 pi2_dst_scratch += 8;
6373 }
6374
6375 }
6376 }
6377
6378 /* Transpose */
6379 {
6380
6381 WORD16 *pi2_src_scratch = temp_ptr;
6382 WORD32 out_stride = dst_strd;
6383 WORD32 in_stride = 8;
6384
6385 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
6386 pi2_src_scratch += in_stride;
6387 m_temp_reg_31 = _mm_load_si128((__m128i *)pi2_src_scratch);
6388 pi2_src_scratch += in_stride;
6389 m_temp_reg_32 = _mm_load_si128((__m128i *)pi2_src_scratch);
6390 pi2_src_scratch += in_stride;
6391 m_temp_reg_33 = _mm_load_si128((__m128i *)pi2_src_scratch);
6392 pi2_src_scratch += in_stride;
6393 m_temp_reg_34 = _mm_load_si128((__m128i *)pi2_src_scratch);
6394 pi2_src_scratch += in_stride;
6395 m_temp_reg_35 = _mm_load_si128((__m128i *)pi2_src_scratch);
6396 pi2_src_scratch += in_stride;
6397 m_temp_reg_36 = _mm_load_si128((__m128i *)pi2_src_scratch);
6398 pi2_src_scratch += in_stride;
6399 m_temp_reg_37 = _mm_load_si128((__m128i *)pi2_src_scratch);
6400 pi2_src_scratch += 8;
6401
6402 m_temp_reg_70 = _mm_load_si128((__m128i *)pi2_src_scratch);
6403 pi2_src_scratch += in_stride;
6404 m_temp_reg_71 = _mm_load_si128((__m128i *)pi2_src_scratch);
6405 pi2_src_scratch += in_stride;
6406 m_temp_reg_72 = _mm_load_si128((__m128i *)pi2_src_scratch);
6407 pi2_src_scratch += in_stride;
6408 m_temp_reg_73 = _mm_load_si128((__m128i *)pi2_src_scratch);
6409 pi2_src_scratch += in_stride;
6410 m_temp_reg_74 = _mm_load_si128((__m128i *)pi2_src_scratch);
6411 pi2_src_scratch += in_stride;
6412 m_temp_reg_75 = _mm_load_si128((__m128i *)pi2_src_scratch);
6413 pi2_src_scratch += in_stride;
6414 m_temp_reg_76 = _mm_load_si128((__m128i *)pi2_src_scratch);
6415 pi2_src_scratch += in_stride;
6416 m_temp_reg_77 = _mm_load_si128((__m128i *)pi2_src_scratch);
6417 pi2_src_scratch += 8;
6418
6419
6420 m_temp_reg_40 = _mm_unpacklo_epi16(m_temp_reg_30, m_temp_reg_31);
6421 m_temp_reg_41 = _mm_unpackhi_epi16(m_temp_reg_31, m_temp_reg_30);
6422
6423 m_temp_reg_42 = _mm_unpacklo_epi16(m_temp_reg_32, m_temp_reg_33);
6424 m_temp_reg_43 = _mm_unpackhi_epi16(m_temp_reg_33, m_temp_reg_32);
6425
6426 m_temp_reg_44 = _mm_unpacklo_epi16(m_temp_reg_34, m_temp_reg_35);
6427 m_temp_reg_45 = _mm_unpackhi_epi16(m_temp_reg_35, m_temp_reg_34);
6428
6429 m_temp_reg_46 = _mm_unpacklo_epi16(m_temp_reg_36, m_temp_reg_37);
6430 m_temp_reg_47 = _mm_unpackhi_epi16(m_temp_reg_37, m_temp_reg_36);
6431
6432 m_temp_reg_80 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71);
6433 m_temp_reg_81 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_70);
6434
6435 m_temp_reg_82 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73);
6436 m_temp_reg_83 = _mm_unpackhi_epi16(m_temp_reg_73, m_temp_reg_72);
6437
6438 m_temp_reg_84 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75);
6439 m_temp_reg_85 = _mm_unpackhi_epi16(m_temp_reg_75, m_temp_reg_74);
6440
6441 m_temp_reg_86 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77);
6442 m_temp_reg_87 = _mm_unpackhi_epi16(m_temp_reg_77, m_temp_reg_76);
6443
6444
6445 m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_40, m_temp_reg_42);
6446 m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_40, m_temp_reg_42);
6447
6448 m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_44, m_temp_reg_46);
6449 m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_44, m_temp_reg_46);
6450
6451 m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_80, m_temp_reg_82);
6452 m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_80, m_temp_reg_82);
6453
6454 m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_84, m_temp_reg_86);
6455 m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_84, m_temp_reg_86);
6456
6457 m_temp_reg_90 = _mm_unpacklo_epi32(m_temp_reg_43, m_temp_reg_41);
6458 m_temp_reg_91 = _mm_unpackhi_epi32(m_temp_reg_43, m_temp_reg_41);
6459
6460 m_temp_reg_92 = _mm_unpacklo_epi32(m_temp_reg_47, m_temp_reg_45);
6461 m_temp_reg_93 = _mm_unpackhi_epi32(m_temp_reg_47, m_temp_reg_45);
6462
6463 m_temp_reg_94 = _mm_unpacklo_epi32(m_temp_reg_83, m_temp_reg_81);
6464 m_temp_reg_95 = _mm_unpackhi_epi32(m_temp_reg_83, m_temp_reg_81);
6465
6466 m_temp_reg_96 = _mm_unpacklo_epi32(m_temp_reg_87, m_temp_reg_85);
6467 m_temp_reg_97 = _mm_unpackhi_epi32(m_temp_reg_87, m_temp_reg_85);
6468
6469
6470 m_temp_reg_30 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_2); // row0 = 0-7
6471 m_temp_reg_31 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_2); // row1 = 0-7
6472
6473 m_temp_reg_32 = _mm_unpacklo_epi64(m_temp_reg_92, m_temp_reg_90); // row0=24-31
6474 m_temp_reg_33 = _mm_unpackhi_epi64(m_temp_reg_92, m_temp_reg_90); // row1=24-31
6475
6476 m_temp_reg_34 = _mm_unpacklo_epi64(m_temp_reg_4, m_temp_reg_6); // row0=8-15
6477 m_temp_reg_35 = _mm_unpackhi_epi64(m_temp_reg_4, m_temp_reg_6); // row1=8-15
6478
6479 m_temp_reg_36 = _mm_unpacklo_epi64(m_temp_reg_96, m_temp_reg_94); // row0=16-23
6480 m_temp_reg_37 = _mm_unpackhi_epi64(m_temp_reg_96, m_temp_reg_94); // row1=16-23
6481
6482 m_temp_reg_80 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_3); // row2 =0-7
6483 m_temp_reg_81 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_3); // row3 =0-7
6484
6485 m_temp_reg_82 = _mm_unpacklo_epi64(m_temp_reg_93, m_temp_reg_91); // row2=24-31
6486 m_temp_reg_83 = _mm_unpackhi_epi64(m_temp_reg_93, m_temp_reg_91); // row3=24-31
6487
6488 m_temp_reg_84 = _mm_unpacklo_epi64(m_temp_reg_5, m_temp_reg_7); // row2=8-15
6489 m_temp_reg_85 = _mm_unpackhi_epi64(m_temp_reg_5, m_temp_reg_7); // row3=8-15
6490
6491 m_temp_reg_86 = _mm_unpacklo_epi64(m_temp_reg_97, m_temp_reg_95); // row2=16-23
6492 m_temp_reg_87 = _mm_unpackhi_epi64(m_temp_reg_97, m_temp_reg_95); // row3=16-23
6493
6494 m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred);
6495
6496 //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_20);
6497 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
6498
6499 m_temp_reg_40 = _mm_add_epi16(m_temp_reg_30, m_temp_reg_0);
6500 m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
6501
6502 //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_0);
6503 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
6504
6505 m_temp_reg_44 = _mm_add_epi16(m_temp_reg_34, m_temp_reg_0);
6506 m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
6507
6508 _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
6509
6510 m_temp_reg_20 = _mm_loadu_si128((__m128i *)(pu1_pred + 16));
6511
6512 //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_20);
6513 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
6514
6515 m_temp_reg_40 = _mm_add_epi16(m_temp_reg_36, m_temp_reg_0);
6516 m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
6517
6518 //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_0);
6519 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
6520
6521 m_temp_reg_44 = _mm_add_epi16(m_temp_reg_32, m_temp_reg_0);
6522 m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
6523
6524 _mm_storeu_si128((__m128i *)(pu1_dst + 16), m_temp_reg_20);
6525 pu1_dst += out_stride;
6526 pu1_pred += pred_strd;
6527
6528
6529 m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred);
6530
6531 //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_20);
6532 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
6533
6534 m_temp_reg_40 = _mm_add_epi16(m_temp_reg_31, m_temp_reg_0);
6535 m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
6536
6537 //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_0);
6538 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
6539
6540 m_temp_reg_44 = _mm_add_epi16(m_temp_reg_35, m_temp_reg_0);
6541 m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
6542
6543 _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
6544
6545 m_temp_reg_20 = _mm_loadu_si128((__m128i *)(pu1_pred + 16));
6546
6547 //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_20);
6548 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
6549
6550 m_temp_reg_40 = _mm_add_epi16(m_temp_reg_37, m_temp_reg_0);
6551 m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
6552
6553 //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_0);
6554 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
6555
6556 m_temp_reg_44 = _mm_add_epi16(m_temp_reg_33, m_temp_reg_0);
6557 m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
6558
6559 _mm_storeu_si128((__m128i *)(pu1_dst + 16), m_temp_reg_20);
6560 pu1_dst += out_stride;
6561 pu1_pred += pred_strd;
6562
6563 m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred);
6564
6565 //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_20);
6566 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
6567
6568 m_temp_reg_40 = _mm_add_epi16(m_temp_reg_80, m_temp_reg_0);
6569 m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
6570
6571 //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_0);
6572 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
6573
6574 m_temp_reg_44 = _mm_add_epi16(m_temp_reg_84, m_temp_reg_0);
6575 m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
6576
6577 _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
6578
6579 m_temp_reg_20 = _mm_loadu_si128((__m128i *)(pu1_pred + 16));
6580
6581 //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_20);
6582 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
6583
6584 m_temp_reg_40 = _mm_add_epi16(m_temp_reg_86, m_temp_reg_0);
6585 m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
6586
6587 //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_0);
6588 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
6589
6590 m_temp_reg_44 = _mm_add_epi16(m_temp_reg_82, m_temp_reg_0);
6591 m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
6592
6593 _mm_storeu_si128((__m128i *)(pu1_dst + 16), m_temp_reg_20);
6594 pu1_dst += out_stride;
6595 pu1_pred += pred_strd;
6596
6597
6598 m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred);
6599
6600 //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_20);
6601 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
6602
6603 m_temp_reg_40 = _mm_add_epi16(m_temp_reg_81, m_temp_reg_0);
6604 m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
6605
6606 //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_0);
6607 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
6608
6609 m_temp_reg_44 = _mm_add_epi16(m_temp_reg_85, m_temp_reg_0);
6610 m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
6611
6612 _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
6613
6614 m_temp_reg_20 = _mm_loadu_si128((__m128i *)(pu1_pred + 16));
6615
6616 //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_20);
6617 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
6618
6619 m_temp_reg_40 = _mm_add_epi16(m_temp_reg_87, m_temp_reg_0);
6620 m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
6621
6622 //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_0);
6623 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
6624
6625 m_temp_reg_44 = _mm_add_epi16(m_temp_reg_83, m_temp_reg_0);
6626 m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
6627
6628 _mm_storeu_si128((__m128i *)(pu1_dst + 16), m_temp_reg_20);
6629 pu1_dst += out_stride;
6630 pu1_pred += pred_strd;
6631
6632 }
6633 pi2_tmp += 4;
6634 }
6635 }
6636
6637