1 /******************************************************************************
2 *
3 * Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at:
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 *
17 ******************************************************************************/
18 /**
19 *******************************************************************************
20 * @file
21 * ihevc_itrans_recon_x86_intr.c
22 *
23 * @brief
24 * Contains function definitions for inverse quantization, inverse
25 * transform and reconstruction
26 *
27 * @author
28 * 100470
29 * 100592 (edited by)
30 *
31 * @par List of Functions:
32 * - ihevc_itrans_recon_4x4_ttype1_sse42()
33 * - ihevc_itrans_recon_4x4_sse42()
34 * - ihevc_itrans_recon_8x8_sse42()
35 *
36 * @remarks
37 * None
38 *
39 *******************************************************************************
40 */
41 #include <stdio.h>
42 #include <string.h>
43 #include "ihevc_typedefs.h"
44 #include "ihevc_macros.h"
45 #include "ihevc_platform_macros.h"
46 #include "ihevc_defs.h"
47 #include "ihevc_trans_tables.h"
48 #include "ihevc_iquant_itrans_recon.h"
49 #include "ihevc_func_selector.h"
50 #include "ihevc_trans_macros.h"
51
52 #include <immintrin.h>
53 #include <emmintrin.h>
54 #include <smmintrin.h>
55 #include <tmmintrin.h>
56
57 /**
58 *******************************************************************************
59 *
60 * @brief
61 * This function performs inverse quantization, inverse transform
62 * type1(DST) and reconstruction for 4x4 input block
63 *
64 * @par Description:
65 * Performs inverse quantization , inverse transform type 1 and adds
66 * prediction data and clips output to 8 bit
67 *
68 * @param[in] pi2_src
69 * Input 4x4 coefficients
70 *
71 * @param[in] pi2_tmp
72 * Temporary 4x4 buffer for storing inverse
73 * transform 1st stage output
74 *
75 * @param[in] pu1_pred
76 * Prediction 4x4 block
77 *
78 * @param[in] pi2_dequant_coeff
79 * Dequant Coeffs
80 *
81 * @param[out] pu1_dst
82 * Output 4x4 block
83 *
84 * @param[in] qp_div
85 * Quantization parameter / 6
86 *
87 * @param[in] qp_rem
88 * Quantization parameter % 6
89 *
90 * @param[in] src_strd
91 * Input stride
92 *
93 * @param[in] pred_strd
94 * Prediction stride
95 *
96 * @param[in] dst_strd
97 * Output Stride
98 *
99 * @param[in] zero_cols
100 * Zero columns in pi2_src
101 *
102 * @returns Void
103 *
104 * @remarks
105 * None
106 *
107 *******************************************************************************
108 */
109
110
ihevc_itrans_recon_4x4_ttype1_sse42(WORD16 * pi2_src,WORD16 * pi2_tmp,UWORD8 * pu1_pred,UWORD8 * pu1_dst,WORD32 src_strd,WORD32 pred_strd,WORD32 dst_strd,WORD32 zero_cols,WORD32 zero_rows)111 void ihevc_itrans_recon_4x4_ttype1_sse42(WORD16 *pi2_src,
112 WORD16 *pi2_tmp,
113 UWORD8 *pu1_pred,
114 UWORD8 *pu1_dst,
115 WORD32 src_strd,
116 WORD32 pred_strd,
117 WORD32 dst_strd,
118 WORD32 zero_cols,
119 WORD32 zero_rows)
120 {
121 __m128i m_temp_reg_0;
122 __m128i m_temp_reg_1;
123 __m128i m_temp_reg_2;
124 __m128i m_temp_reg_3;
125 __m128i m_temp_reg_4;
126 __m128i m_temp_reg_10;
127 __m128i m_temp_reg_11;
128 __m128i m_temp_reg_12;
129 __m128i m_temp_reg_13;
130 __m128i m_temp_reg_14;
131 __m128i m_temp_reg_20;
132 __m128i m_temp_reg_21;
133 __m128i m_temp_reg_22;
134 __m128i m_temp_reg_23;
135 __m128i m_temp_reg_24;
136 __m128i m_temp_reg_25;
137 __m128i m_temp_reg_30;
138 __m128i m_temp_reg_31;
139 __m128i m_temp_reg_32;
140 __m128i m_temp_reg_33;
141 __m128i m_temp_reg_34;
142 __m128i m_temp_reg_35;
143 __m128i m_temp_reg_36;
144 __m128i m_coeff1, m_coeff2, m_coeff3;
145 __m128i m_rdng_factor;
146 __m128i m_count;
147
148 WORD32 i4_shift = IT_SHIFT_STAGE_1;
149 UNUSED(zero_rows);
150 UNUSED(zero_cols);
151 UNUSED(pi2_tmp);
152
153 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai4_ihevc_trans_4_ttype1[2][0]); //74
154
155 m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pi2_src);
156 pi2_src += src_strd;
157 m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pi2_src);
158 pi2_src += src_strd;
159 m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pi2_src);
160 pi2_src += src_strd;
161 m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pi2_src);
162
163 m_temp_reg_0 = _mm_cvtepi16_epi32(m_temp_reg_0);
164 m_temp_reg_2 = _mm_cvtepi16_epi32(m_temp_reg_2);
165
166 m_temp_reg_1 = _mm_cvtepi16_epi32(m_temp_reg_1);
167 m_temp_reg_3 = _mm_cvtepi16_epi32(m_temp_reg_3);
168
169 /* c[4] in m_temp_reg_14 */
170 /* c[4] = src[0] - src[2] + src[3] */
171 {
172 m_temp_reg_14 = _mm_sub_epi32(m_temp_reg_0, m_temp_reg_2);
173 }
174
175 /* c[3] in m_temp_reg_13 */
176 {
177 m_temp_reg_13 = _mm_mullo_epi32(m_temp_reg_1, m_coeff3);
178 }
179
180 /* c[0] in m_temp_reg_10 */
181 {
182 m_temp_reg_10 = _mm_add_epi32(m_temp_reg_0, m_temp_reg_2);
183 }
184
185 /* c[1] in m_temp_reg_11 */
186 {
187 m_temp_reg_11 = _mm_add_epi32(m_temp_reg_2, m_temp_reg_3);
188 }
189
190 /* c[2] in m_temp_reg_12 */
191 {
192 m_temp_reg_12 = _mm_sub_epi32(m_temp_reg_0, m_temp_reg_3);
193 }
194
195 /* c[4] in m_temp_reg_14 */
196 /* c[4] = src[0] - src[2] + src[3] */
197 {
198 m_temp_reg_14 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_3);
199 }
200
201 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai4_ihevc_trans_4_ttype1[1][0]); //29
202 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai4_ihevc_trans_4_ttype1[0][0]); //55
203
204 /* Stage 1 outputs stored in m_temp_reg_20-23 */
205 {
206 m_temp_reg_30 = _mm_mullo_epi32(m_temp_reg_10, m_coeff1); //29*c0
207 m_temp_reg_31 = _mm_mullo_epi32(m_temp_reg_11, m_coeff2); //55*c1
208
209 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
210
211 m_temp_reg_32 = _mm_mullo_epi32(m_temp_reg_11, m_coeff1); //29*c1
212 m_temp_reg_33 = _mm_mullo_epi32(m_temp_reg_12, m_coeff2); //55*c2
213
214 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
215
216 m_temp_reg_34 = _mm_mullo_epi32(m_temp_reg_10, m_coeff2); //55*c0
217 m_temp_reg_35 = _mm_mullo_epi32(m_temp_reg_12, m_coeff1); //29*c2
218 m_temp_reg_36 = _mm_mullo_epi32(m_temp_reg_14, m_coeff3); //74*c4
219
220 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
221 m_count = _mm_cvtsi32_si128(i4_shift);
222
223 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
224 m_temp_reg_4 = _mm_add_epi32(m_rdng_factor, m_temp_reg_13);
225 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_4);
226
227 m_temp_reg_21 = _mm_sub_epi32(m_temp_reg_33, m_temp_reg_32);
228 m_temp_reg_21 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_4);
229
230 m_temp_reg_23 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_35);
231 m_temp_reg_4 = _mm_sub_epi32(m_rdng_factor, m_temp_reg_13);
232 m_temp_reg_23 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_4);
233
234 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_36, m_rdng_factor);
235
236 m_temp_reg_20 = _mm_sra_epi32(m_temp_reg_20, m_count);
237 m_temp_reg_21 = _mm_sra_epi32(m_temp_reg_21, m_count);
238 m_temp_reg_23 = _mm_sra_epi32(m_temp_reg_23, m_count);
239 m_temp_reg_22 = _mm_sra_epi32(m_temp_reg_22, m_count);
240
241 m_temp_reg_20 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21);
242 m_temp_reg_21 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23);
243 m_temp_reg_22 = _mm_srli_si128(m_temp_reg_20, 8);
244 m_temp_reg_23 = _mm_srli_si128(m_temp_reg_21, 8);
245
246 m_temp_reg_24 = _mm_unpacklo_epi16(m_temp_reg_20, m_temp_reg_22);
247 m_temp_reg_25 = _mm_unpacklo_epi16(m_temp_reg_21, m_temp_reg_23);
248
249 m_temp_reg_20 = _mm_unpacklo_epi32(m_temp_reg_24, m_temp_reg_25);
250 m_temp_reg_21 = _mm_unpackhi_epi32(m_temp_reg_24, m_temp_reg_25);
251
252 }
253
254 /* Stage 2 */
255 {
256 i4_shift = IT_SHIFT_STAGE_2;
257
258 m_temp_reg_22 = _mm_srli_si128(m_temp_reg_20, 8);
259 m_temp_reg_20 = _mm_cvtepi16_epi32(m_temp_reg_20);
260 m_temp_reg_23 = _mm_srli_si128(m_temp_reg_21, 8);
261 m_temp_reg_21 = _mm_cvtepi16_epi32(m_temp_reg_21);
262 m_temp_reg_22 = _mm_cvtepi16_epi32(m_temp_reg_22);
263 m_temp_reg_23 = _mm_cvtepi16_epi32(m_temp_reg_23);
264
265 /* c[4] stored in m_temp_reg_4 */
266 {
267 m_temp_reg_4 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
268 }
269
270 /* c[3] stored in m_temp_reg_3 */
271 {
272 m_temp_reg_3 = _mm_mullo_epi32(m_temp_reg_22, m_coeff3);
273 }
274
275 /* c[0] stored in m_temp_reg_0 */
276 {
277 m_temp_reg_0 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
278 }
279
280 /* c[1] stored in m_temp_reg_1 */
281 {
282 m_temp_reg_1 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_21);
283 }
284
285 /* c[2] stored in m_temp_reg_2 */
286 {
287 m_temp_reg_2 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_23);
288 }
289
290 /* c[4] stored in m_temp_reg_4 */
291 {
292 m_temp_reg_4 = _mm_add_epi32(m_temp_reg_4, m_temp_reg_23);
293 }
294
295 /* Stage 2 output generation */
296 {
297 m_temp_reg_30 = _mm_mullo_epi32(m_temp_reg_0, m_coeff1); //29*c0
298 m_temp_reg_31 = _mm_mullo_epi32(m_temp_reg_1, m_coeff2); //55*c1
299
300 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
301
302 m_temp_reg_32 = _mm_mullo_epi32(m_temp_reg_1, m_coeff1); //29*c1
303 m_temp_reg_33 = _mm_mullo_epi32(m_temp_reg_2, m_coeff2); //55*c2
304
305 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
306
307 m_temp_reg_34 = _mm_mullo_epi32(m_temp_reg_0, m_coeff2); //55*c0
308 m_temp_reg_35 = _mm_mullo_epi32(m_temp_reg_2, m_coeff1); //29*c2
309 m_temp_reg_36 = _mm_mullo_epi32(m_temp_reg_4, m_coeff3); //74*c4
310
311 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
312 m_count = _mm_cvtsi32_si128(i4_shift);
313
314 m_temp_reg_4 = _mm_add_epi32(m_rdng_factor, m_temp_reg_3);
315 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
316 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_4);
317
318 m_temp_reg_21 = _mm_sub_epi32(m_temp_reg_33, m_temp_reg_32);
319 m_temp_reg_21 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_4);
320
321 m_temp_reg_4 = _mm_sub_epi32(m_rdng_factor, m_temp_reg_3);
322 m_temp_reg_23 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_35);
323 m_temp_reg_23 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_4);
324
325 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_36, m_rdng_factor);
326
327 m_temp_reg_20 = _mm_sra_epi32(m_temp_reg_20, m_count);
328 m_temp_reg_21 = _mm_sra_epi32(m_temp_reg_21, m_count);
329 m_temp_reg_23 = _mm_sra_epi32(m_temp_reg_23, m_count);
330 m_temp_reg_22 = _mm_sra_epi32(m_temp_reg_22, m_count);
331
332 m_temp_reg_20 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21);
333 m_temp_reg_21 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23);
334 m_temp_reg_22 = _mm_srli_si128(m_temp_reg_20, 8);
335 m_temp_reg_23 = _mm_srli_si128(m_temp_reg_21, 8);
336
337 m_temp_reg_24 = _mm_unpacklo_epi16(m_temp_reg_20, m_temp_reg_22);
338 m_temp_reg_25 = _mm_unpacklo_epi16(m_temp_reg_21, m_temp_reg_23);
339
340 m_temp_reg_20 = _mm_unpacklo_epi32(m_temp_reg_24, m_temp_reg_25);
341 m_temp_reg_21 = _mm_unpackhi_epi32(m_temp_reg_24, m_temp_reg_25);
342 }
343
344 /* Recon and store */
345 {
346 WORD32 *pi4_dst = (WORD32 *)pu1_dst;
347
348 m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_pred);
349 pu1_pred += pred_strd;
350 m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pu1_pred);
351 pu1_pred += pred_strd;
352 m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pu1_pred);
353 pu1_pred += pred_strd;
354 m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pu1_pred);
355
356 m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_0);
357 m_temp_reg_1 = _mm_cvtepu8_epi16(m_temp_reg_1);
358 m_temp_reg_2 = _mm_cvtepu8_epi16(m_temp_reg_2);
359 m_temp_reg_3 = _mm_cvtepu8_epi16(m_temp_reg_3);
360 m_temp_reg_0 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_1);
361 m_temp_reg_1 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_3);
362
363 m_temp_reg_20 = _mm_add_epi16(m_temp_reg_20, m_temp_reg_0);
364 m_temp_reg_21 = _mm_add_epi16(m_temp_reg_21, m_temp_reg_1);
365
366 m_temp_reg_0 = _mm_packus_epi16(m_temp_reg_20, m_temp_reg_21);
367
368 *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_0);
369 m_temp_reg_1 = _mm_srli_si128(m_temp_reg_0, 4);
370 m_temp_reg_2 = _mm_srli_si128(m_temp_reg_0, 8);
371 m_temp_reg_3 = _mm_srli_si128(m_temp_reg_0, 12);
372 pu1_dst += dst_strd;
373 pi4_dst = (WORD32 *)(pu1_dst);
374
375 *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_1);
376 pu1_dst += dst_strd;
377 pi4_dst = (WORD32 *)(pu1_dst);
378
379 *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_2);
380 pu1_dst += dst_strd;
381 pi4_dst = (WORD32 *)(pu1_dst);
382
383 *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_3);
384 }
385 }
386 }
387
388 /**
389 *******************************************************************************
390 *
391 * @brief
392 * This function performs inverse quantization, inverse transform
393 * (DCT) and reconstruction for 4x4 input block
394 *
395 * @par Description:
396 * Performs inverse quantization , inverse transform and adds
397 * prediction data and clips output to 8 bit
398 *
399 * @param[in] pi2_src
400 * Input 4x4 coefficients
401 *
402 * @param[in] pi2_tmp
403 * Temporary 4x4 buffer for storing inverse
404 * transform 1st stage output
405 *
406 * @param[in] pu1_pred
407 * Prediction 4x4 block
408 *
409 * @param[in] pi2_dequant_coeff
410 * Dequant Coeffs
411 *
412 * @param[out] pu1_dst
413 * Output 4x4 block
414 *
415 * @param[in] qp_div
416 * Quantization parameter / 6
417 *
418 * @param[in] qp_rem
419 * Quantization parameter % 6
420 *
421 * @param[in] src_strd
422 * Input stride
423 *
424 * @param[in] pred_strd
425 * Prediction stride
426 *
427 * @param[in] dst_strd
428 * Output Stride
429 *
430 * @param[in] zero_cols
431 * Zero columns in pi2_src
432 *
433 * @returns Void
434 *
435 * @remarks
436 * None
437 *
438 *******************************************************************************
439 */
440
ihevc_itrans_recon_4x4_sse42(WORD16 * pi2_src,WORD16 * pi2_tmp,UWORD8 * pu1_pred,UWORD8 * pu1_dst,WORD32 src_strd,WORD32 pred_strd,WORD32 dst_strd,WORD32 zero_cols,WORD32 zero_rows)441 void ihevc_itrans_recon_4x4_sse42(WORD16 *pi2_src,
442 WORD16 *pi2_tmp,
443 UWORD8 *pu1_pred,
444 UWORD8 *pu1_dst,
445 WORD32 src_strd,
446 WORD32 pred_strd,
447 WORD32 dst_strd,
448 WORD32 zero_cols,
449 WORD32 zero_rows)
450 {
451
452
453 __m128i m_temp_reg_0;
454 __m128i m_temp_reg_1;
455 __m128i m_temp_reg_2;
456 __m128i m_temp_reg_3;
457 __m128i m_temp_reg_10;
458 __m128i m_temp_reg_11;
459 __m128i m_temp_reg_12;
460 __m128i m_temp_reg_13;
461 __m128i m_temp_reg_14;
462 __m128i m_temp_reg_15;
463 __m128i m_temp_reg_20;
464 __m128i m_temp_reg_21;
465 __m128i m_temp_reg_22;
466 __m128i m_temp_reg_23;
467 __m128i m_temp_reg_24;
468 __m128i m_temp_reg_25;
469 __m128i m_temp_reg_30;
470 __m128i m_temp_reg_31;
471 __m128i m_temp_reg_33;
472 __m128i m_temp_reg_34;
473 __m128i m_coeff1, m_coeff3;
474 __m128i m_rdng_factor;
475 __m128i m_count;
476
477
478 WORD32 i4_shift = IT_SHIFT_STAGE_1;
479 UNUSED(zero_rows);
480 UNUSED(zero_cols);
481 UNUSED(pi2_tmp);
482
483
484 m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pi2_src);
485 pi2_src += src_strd;
486 m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pi2_src);
487 pi2_src += src_strd;
488 m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pi2_src);
489 pi2_src += src_strd;
490 m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pi2_src);
491
492 m_temp_reg_0 = _mm_cvtepi16_epi32(m_temp_reg_0);
493 m_temp_reg_2 = _mm_cvtepi16_epi32(m_temp_reg_2);
494
495 m_temp_reg_1 = _mm_cvtepi16_epi32(m_temp_reg_1);
496 m_temp_reg_3 = _mm_cvtepi16_epi32(m_temp_reg_3);
497
498
499 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai4_ihevc_trans_4_ttype0[0][0]); //36
500 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai4_ihevc_trans_4_ttype0[2][0]); //83
501
502 /* e */
503 {
504 m_temp_reg_10 = _mm_slli_epi32(m_temp_reg_0, 6);
505 m_temp_reg_11 = _mm_slli_epi32(m_temp_reg_2, 6);
506 }
507
508 /* o */
509 {
510 m_temp_reg_12 = _mm_mullo_epi32(m_temp_reg_1, m_coeff1); //src[1]*36
511 m_temp_reg_13 = _mm_mullo_epi32(m_temp_reg_3, m_coeff3); //src[3]*83
512 m_temp_reg_14 = _mm_mullo_epi32(m_temp_reg_1, m_coeff3); //src[1]*83
513 m_temp_reg_15 = _mm_mullo_epi32(m_temp_reg_3, m_coeff1); //src[3]*36
514 }
515
516 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
517
518 /* e1 stored in m_temp_reg_31 */
519 {
520 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_11);
521 }
522
523 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
524
525 /* e0 stored in m_temp_reg_30 */
526 {
527 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_11);
528 }
529
530 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
531 m_count = _mm_cvtsi32_si128(i4_shift);
532
533 /* o1 stored in m_temp_reg_33 */
534 {
535 m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_13);
536 }
537
538 /* e1 + add */
539 {
540 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
541 }
542
543 /* e0 + add */
544 {
545 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
546 }
547
548 /* o0 stored in m_temp_reg_34 */
549 {
550 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_15);
551 }
552
553 /* Stage 1 outputs */
554 {
555 m_temp_reg_21 = _mm_add_epi32(m_temp_reg_31, m_temp_reg_33);
556 m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_31, m_temp_reg_33);
557
558 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_34);
559 m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_34);
560
561
562 m_temp_reg_21 = _mm_sra_epi32(m_temp_reg_21, m_count);
563 m_temp_reg_20 = _mm_sra_epi32(m_temp_reg_20, m_count);
564 m_temp_reg_22 = _mm_sra_epi32(m_temp_reg_22, m_count);
565 m_temp_reg_23 = _mm_sra_epi32(m_temp_reg_23, m_count);
566
567 m_temp_reg_20 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21);
568 m_temp_reg_21 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23);
569 m_temp_reg_22 = _mm_srli_si128(m_temp_reg_20, 8);
570 m_temp_reg_23 = _mm_srli_si128(m_temp_reg_21, 8);
571
572 m_temp_reg_24 = _mm_unpacklo_epi16(m_temp_reg_20, m_temp_reg_22);
573 m_temp_reg_25 = _mm_unpacklo_epi16(m_temp_reg_21, m_temp_reg_23);
574
575 m_temp_reg_20 = _mm_unpacklo_epi32(m_temp_reg_24, m_temp_reg_25);
576 m_temp_reg_21 = _mm_unpackhi_epi32(m_temp_reg_24, m_temp_reg_25);
577 }
578
579 /* Stage 2 */
580 {
581 i4_shift = IT_SHIFT_STAGE_2;
582
583 m_temp_reg_22 = _mm_srli_si128(m_temp_reg_20, 8);
584 m_temp_reg_23 = _mm_srli_si128(m_temp_reg_21, 8);
585
586 m_temp_reg_20 = _mm_cvtepi16_epi32(m_temp_reg_20);
587 m_temp_reg_21 = _mm_cvtepi16_epi32(m_temp_reg_21);
588
589 m_temp_reg_22 = _mm_cvtepi16_epi32(m_temp_reg_22);
590 m_temp_reg_23 = _mm_cvtepi16_epi32(m_temp_reg_23);
591
592 /* e */
593 {
594 m_temp_reg_10 = _mm_slli_epi32(m_temp_reg_20, 6);
595 }
596
597 /* o */
598 {
599 m_temp_reg_12 = _mm_mullo_epi32(m_temp_reg_22, m_coeff1); //src[1]*36
600 m_temp_reg_14 = _mm_mullo_epi32(m_temp_reg_22, m_coeff3); //src[1]*83
601 m_temp_reg_13 = _mm_mullo_epi32(m_temp_reg_23, m_coeff3); //src[3]*83
602 m_temp_reg_15 = _mm_mullo_epi32(m_temp_reg_23, m_coeff1); //src[3]*36
603 }
604
605 /* e */
606 {
607 m_temp_reg_11 = _mm_slli_epi32(m_temp_reg_21, 6);
608 }
609
610 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
611
612 /* e1 stored in m_temp_reg_31 */
613 {
614 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_11);
615 }
616
617 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
618
619 /* e0 stored in m_temp_reg_30 */
620 {
621 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_11);
622 }
623
624 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
625 m_count = _mm_cvtsi32_si128(i4_shift);
626
627 /* o1 stored in m_temp_reg_33 */
628 {
629 m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_13);
630 }
631
632 /* e1 + add */
633 {
634 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
635 }
636
637 /* e0 + add */
638 {
639 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
640 }
641
642 /* o0 stored in m_temp_reg_34 */
643 {
644 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_15);
645 }
646
647 /* Stage 2 outputs */
648 {
649 m_temp_reg_21 = _mm_add_epi32(m_temp_reg_31, m_temp_reg_33);
650 m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_31, m_temp_reg_33);
651 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_34);
652 m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_34);
653
654 m_temp_reg_21 = _mm_sra_epi32(m_temp_reg_21, m_count);
655 m_temp_reg_22 = _mm_sra_epi32(m_temp_reg_22, m_count);
656 m_temp_reg_20 = _mm_sra_epi32(m_temp_reg_20, m_count);
657 m_temp_reg_23 = _mm_sra_epi32(m_temp_reg_23, m_count);
658
659 m_temp_reg_20 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21);
660 m_temp_reg_21 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23);
661 m_temp_reg_22 = _mm_srli_si128(m_temp_reg_20, 8);
662 m_temp_reg_23 = _mm_srli_si128(m_temp_reg_21, 8);
663
664 m_temp_reg_24 = _mm_unpacklo_epi16(m_temp_reg_20, m_temp_reg_22);
665 m_temp_reg_25 = _mm_unpacklo_epi16(m_temp_reg_21, m_temp_reg_23);
666
667 m_temp_reg_20 = _mm_unpacklo_epi32(m_temp_reg_24, m_temp_reg_25);
668 m_temp_reg_21 = _mm_unpackhi_epi32(m_temp_reg_24, m_temp_reg_25);
669 }
670
671 /* Recon and store */
672 {
673 UWORD32 *pu4_dst = (UWORD32 *)pu1_dst;
674
675 m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_pred);
676 pu1_pred += pred_strd;
677 m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pu1_pred);
678 pu1_pred += pred_strd;
679 m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pu1_pred);
680 pu1_pred += pred_strd;
681 m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pu1_pred);
682
683 m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_0);
684 m_temp_reg_1 = _mm_cvtepu8_epi16(m_temp_reg_1);
685 m_temp_reg_0 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_1);
686 m_temp_reg_2 = _mm_cvtepu8_epi16(m_temp_reg_2);
687 m_temp_reg_3 = _mm_cvtepu8_epi16(m_temp_reg_3);
688 m_temp_reg_1 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_3);
689
690 m_temp_reg_20 = _mm_add_epi16(m_temp_reg_20, m_temp_reg_0);
691 m_temp_reg_21 = _mm_add_epi16(m_temp_reg_21, m_temp_reg_1);
692
693 m_temp_reg_0 = _mm_packus_epi16(m_temp_reg_20, m_temp_reg_21);
694
695 *pu4_dst = _mm_cvtsi128_si32(m_temp_reg_0);
696 m_temp_reg_1 = _mm_srli_si128(m_temp_reg_0, 4);
697 m_temp_reg_2 = _mm_srli_si128(m_temp_reg_0, 8);
698 m_temp_reg_3 = _mm_srli_si128(m_temp_reg_0, 12);
699 pu1_dst += dst_strd;
700 pu4_dst = (UWORD32 *)(pu1_dst);
701
702 *pu4_dst = _mm_cvtsi128_si32(m_temp_reg_1);
703 pu1_dst += dst_strd;
704 pu4_dst = (UWORD32 *)(pu1_dst);
705
706 *pu4_dst = _mm_cvtsi128_si32(m_temp_reg_2);
707 pu1_dst += dst_strd;
708 pu4_dst = (UWORD32 *)(pu1_dst);
709
710 *pu4_dst = _mm_cvtsi128_si32(m_temp_reg_3);
711 }
712 }
713 }
714
715
716
717 /**
718 *******************************************************************************
719 *
720 * @brief
721 * This function performs inverse quantization, inverse transform and
* reconstruction for 8x8 input block
723 *
724 * @par Description:
725 * Performs inverse quantization , inverse transform and adds the
726 * prediction data and clips output to 8 bit
727 *
728 * @param[in] pi2_src
729 * Input 8x8 coefficients
730 *
731 * @param[in] pi2_tmp
732 * Temporary 8x8 buffer for storing inverse
733 * transform 1st stage output
734 *
735 * @param[in] pu1_pred
736 * Prediction 8x8 block
737 *
738 * @param[in] pi2_dequant_coeff
739 * Dequant Coeffs
740 *
741 * @param[out] pu1_dst
742 * Output 8x8 block
743 *
744 * @param[in] src_strd
745 * Input stride
746 *
747 * @param[in] qp_div
748 * Quantization parameter / 6
749 *
750 * @param[in] qp_rem
751 * Quantization parameter % 6
752 *
753 * @param[in] pred_strd
754 * Prediction stride
755 *
756 * @param[in] dst_strd
757 * Output Stride
758 *
759 * @param[in] zero_cols
760 * Zero columns in pi2_src
761 *
762 * @returns Void
763 *
764 * @remarks
765 * None
766 *
767 *******************************************************************************
768 */
769
770
ihevc_itrans_recon_8x8_sse42(WORD16 * pi2_src,WORD16 * pi2_tmp,UWORD8 * pu1_pred,UWORD8 * pu1_dst,WORD32 src_strd,WORD32 pred_strd,WORD32 dst_strd,WORD32 zero_cols,WORD32 zero_rows)771 void ihevc_itrans_recon_8x8_sse42(WORD16 *pi2_src,
772 WORD16 *pi2_tmp,
773 UWORD8 *pu1_pred,
774 UWORD8 *pu1_dst,
775 WORD32 src_strd,
776 WORD32 pred_strd,
777 WORD32 dst_strd,
778 WORD32 zero_cols,
779 WORD32 zero_rows)
780 {
781 __m128i m_temp_reg_0;
782 __m128i m_temp_reg_1;
783 __m128i m_temp_reg_2;
784 __m128i m_temp_reg_3;
785 __m128i m_temp_reg_5;
786 __m128i m_temp_reg_6;
787 __m128i m_temp_reg_7;
788 __m128i m_temp_reg_4;
789 __m128i m_temp_reg_10;
790 __m128i m_temp_reg_11;
791 __m128i m_temp_reg_12;
792 __m128i m_temp_reg_13;
793 __m128i m_temp_reg_14;
794 __m128i m_temp_reg_15;
795 __m128i m_temp_reg_16;
796 __m128i m_temp_reg_17;
797 __m128i m_temp_reg_20;
798 __m128i m_temp_reg_21;
799 __m128i m_temp_reg_22;
800 __m128i m_temp_reg_23;
801 __m128i m_temp_reg_24;
802 __m128i m_temp_reg_25;
803 __m128i m_temp_reg_26;
804 __m128i m_temp_reg_27;
805 __m128i m_temp_reg_30;
806 __m128i m_temp_reg_31;
807 __m128i m_temp_reg_32;
808 __m128i m_temp_reg_33;
809 __m128i m_temp_reg_34;
810 __m128i m_temp_reg_35;
811 __m128i m_temp_reg_36;
812 __m128i m_temp_reg_37;
813 __m128i m_temp_reg_40;
814 __m128i m_temp_reg_41;
815 __m128i m_temp_reg_42;
816 __m128i m_temp_reg_43;
817 __m128i m_temp_reg_44;
818 __m128i m_temp_reg_45;
819 __m128i m_temp_reg_46;
820 __m128i m_temp_reg_47;
821 __m128i m_temp_reg_50;
822 __m128i m_temp_reg_51;
823 __m128i m_temp_reg_52;
824 __m128i m_temp_reg_53;
825 __m128i m_temp_reg_54;
826 __m128i m_temp_reg_55;
827 __m128i m_temp_reg_56;
828 __m128i m_temp_reg_57;
829 __m128i m_temp_reg_60;
830 __m128i m_temp_reg_61;
831 __m128i m_temp_reg_62;
832 __m128i m_temp_reg_63;
833 __m128i m_temp_reg_64;
834 __m128i m_temp_reg_65;
835 __m128i m_temp_reg_66;
836 __m128i m_temp_reg_67;
837 __m128i m_temp_reg_70;
838 __m128i m_temp_reg_71;
839 __m128i m_temp_reg_72;
840 __m128i m_temp_reg_73;
841 __m128i m_temp_reg_74;
842 __m128i m_temp_reg_75;
843 __m128i m_temp_reg_76;
844 __m128i m_temp_reg_77;
845 __m128i m_coeff1, m_coeff2, m_coeff3, m_coeff4;
846
847 WORD32 check_row_stage_1; /* Lokesh */
848 WORD32 check_row_stage_2; /* Lokesh */
849
850 __m128i m_rdng_factor;
851 WORD32 i4_shift = IT_SHIFT_STAGE_1;
852 UNUSED(pi2_tmp);
853 check_row_stage_1 = ((zero_rows & 0xF0) != 0xF0) ? 1 : 0;
854 check_row_stage_2 = ((zero_cols & 0xF0) != 0xF0) ? 1 : 0;
855
856 m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src);
857 pi2_src += src_strd;
858 m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src);
859 pi2_src += src_strd;
860 m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_src);
861 pi2_src += src_strd;
862 m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_src);
863 pi2_src += src_strd;
864
865 m_temp_reg_74 = _mm_loadu_si128((__m128i *)pi2_src);
866 pi2_src += src_strd;
867 m_temp_reg_75 = _mm_loadu_si128((__m128i *)pi2_src);
868 pi2_src += src_strd;
869 m_temp_reg_76 = _mm_loadu_si128((__m128i *)pi2_src);
870 pi2_src += src_strd;
871 m_temp_reg_77 = _mm_loadu_si128((__m128i *)pi2_src);
872
873 if(!check_row_stage_2)
874 {
875 if(!check_row_stage_1)
876 {
877 /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
878 /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
879 {
880 //Interleaving 0,4 row in 0 , 1 Rishab
881 /*coef2 for m_temp_reg_12 and m_temp_reg_13 , coef1 for m_temp_reg_10 and m_temp_reg_11*/
882 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[3][0]);
883 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[0][0]);
884
885 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74);
886
887 m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
888 m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
889
890 }
891
892
893 /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
894 /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
895 /* as upper 8 bytes are zeros so m_temp_reg_15 and m_temp_reg_17 are not used*/
896 {
897
898 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83
899 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36
900
901 /* Combining instructions to eliminate them based on zero_rows : Lokesh */
902 //Interleaving 2,6 row in 4, 5 Rishab
903 m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
904
905 m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1);
906 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
907
908
909 /* Loading coeff for computing o0, o1, o2 and o3 in the next block */
910
911 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[2][0]);
912 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[3][0]);
913
914 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[0][0]);
915 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[1][0]);
916
917
918
919 /* e */
920
921 /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
922 /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
923 /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
924 /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
925 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
926 m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
927
928 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
929 m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
930
931 }
932
933 /* o */
934 {
935
936 /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
937 {
938
939 m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
940 //o0:1B*89+3B*75,5B*50+7B*18
941 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
942
943 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
944 m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
945
946
947
948 /* Column 0 of destination computed here */
949 /* It is stored in m_temp_reg_50 */
950 /* Column 7 of destination computed here */
951 /* It is stored in m_temp_reg_57 */
952 /* Upper 8 bytes of both registers are zero due to zero_cols*/
953
954
955
956 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
957 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
958
959 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
960 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
961
962 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
963 m_temp_reg_63 = _mm_setzero_si128();
964 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
965
966 //o1:1B*75-3B*18,5B*89+7B*50
967 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
968
969 m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
970 m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
971
972 /* Loading coeff for computing o2 in the next block */
973
974 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[4][0]);
975 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[5][0]);
976
977 /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */
978
979
980
981 /* Column 1 of destination computed here */
982 /* It is stored in m_temp_reg_51 */
983 /* Column 6 of destination computed here */
984 /* It is stored in m_temp_reg_56 */
985
986 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
987 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
988
989 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
990 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
991
992 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
993 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
994
995 //o2:1B*50-3B*89,5B*18+7B*75
996 m_temp_reg_34 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
997
998 m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
999 m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
1000
1001
1002 /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
1003
1004 /* Loading coeff for computing o3 in the next block */
1005
1006 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[6][0]);
1007 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[7][0]);
1008
1009
1010
1011 /* Column 2 of destination computed here */
1012 /* It is stored in m_temp_reg_52 */
1013 /* Column 5 of destination computed here */
1014 /* It is stored in m_temp_reg_55 */
1015
1016 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
1017 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
1018
1019 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
1020 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
1021
1022 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
1023 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
1024
1025 //o3:1B*18-3B*50,5B*75-7B*89
1026 m_temp_reg_36 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
1027
1028 m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
1029 m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
1030
1031
1032
1033 /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
1034
1035
1036
1037 /* Column 3 of destination computed here */
1038 /* It is stored in m_temp_reg_53 */
1039 /* Column 4 of destination computed here */
1040 /* It is stored in m_temp_reg_54 */
1041
1042 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
1043 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
1044
1045 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
1046 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
1047
1048 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
1049 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
1050
1051
1052 m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
1053 m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
1054 }
1055 }
1056
1057 /* Transpose of the destination 8x8 matrix done here */
1058 /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */
1059 /* respectively */
1060 {
1061 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
1062 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
1063 m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
1064 m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
1065
1066 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
1067 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
1068
1069 m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
1070 m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
1071
1072 m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
1073 m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
1074 m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
1075 m_temp_reg_53 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
1076
1077 m_temp_reg_54 = _mm_setzero_si128();
1078 m_temp_reg_55 = _mm_setzero_si128();
1079 m_temp_reg_56 = _mm_setzero_si128();
1080 m_temp_reg_57 = _mm_setzero_si128();
1081 }
1082 }
1083 else
1084 {
1085 /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
1086 /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
1087 {
1088 //Interleaving 0,4 row in 0 , 1 Rishab
1089 /*coef2 for m_temp_reg_12 and m_temp_reg_13 , coef1 for m_temp_reg_10 and m_temp_reg_11*/
1090 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[3][0]);
1091 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[0][0]);
1092
1093 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74);
1094
1095 m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
1096 m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
1097
1098 }
1099
1100
1101 /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
1102 /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
1103 /* as upper 8 bytes are zeros so m_temp_reg_15 and m_temp_reg_17 are not used*/
1104 {
1105
1106 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83
1107 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36
1108
1109 /* Combining instructions to eliminate them based on zero_rows : Lokesh */
1110 //Interleaving 2,6 row in 4, 5 Rishab
1111 m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
1112
1113 m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1);
1114 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
1115
1116
1117 /* Loading coeff for computing o0, o1, o2 and o3 in the next block */
1118
1119 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[2][0]);
1120 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[3][0]);
1121
1122 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[0][0]);
1123 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[1][0]);
1124
1125
1126
1127 /* e */
1128
1129 /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
1130 /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
1131 /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
1132 /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
1133 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
1134 m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
1135
1136 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
1137 m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
1138
1139 }
1140
1141 /* o */
1142 {
1143
1144 /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
1145 {
1146
1147 m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
1148 m_temp_reg_64 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77);
1149 //o0:1B*89+3B*75,5B*50+7B*18
1150 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
1151 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2);
1152
1153 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
1154 m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
1155
1156 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
1157
1158
1159
1160 /* Column 0 of destination computed here */
1161 /* It is stored in m_temp_reg_50 */
1162 /* Column 7 of destination computed here */
1163 /* It is stored in m_temp_reg_57 */
1164 /* Upper 8 bytes of both registers are zero due to zero_cols*/
1165
1166
1167
1168 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
1169 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
1170
1171 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
1172 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
1173
1174 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
1175 m_temp_reg_63 = _mm_setzero_si128();
1176 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
1177
1178 //o1:1B*75-3B*18,5B*89+7B*50
1179 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
1180 m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4);
1181
1182 m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
1183 m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
1184
1185 /* Loading coeff for computing o2 in the next block */
1186
1187 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[4][0]);
1188 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[5][0]);
1189
1190 /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */
1191 m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_26);
1192
1193
1194
1195 /* Column 1 of destination computed here */
1196 /* It is stored in m_temp_reg_51 */
1197 /* Column 6 of destination computed here */
1198 /* It is stored in m_temp_reg_56 */
1199
1200 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
1201 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
1202
1203 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
1204 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
1205
1206 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
1207 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
1208
1209 //o2:1B*50-3B*89,5B*18+7B*75
1210 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
1211 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2);
1212
1213 m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
1214 m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
1215
1216
1217 /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
1218
1219 /* Loading coeff for computing o3 in the next block */
1220
1221 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[6][0]);
1222 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[7][0]);
1223
1224 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
1225
1226
1227 /* Column 2 of destination computed here */
1228 /* It is stored in m_temp_reg_52 */
1229 /* Column 5 of destination computed here */
1230 /* It is stored in m_temp_reg_55 */
1231
1232 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
1233 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
1234
1235 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
1236 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
1237
1238 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
1239 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
1240
1241 //o3:1B*18-3B*50,5B*75-7B*89
1242 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
1243 m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4);
1244
1245 m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
1246 m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
1247
1248
1249
1250 /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
1251
1252 m_temp_reg_36 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_26);
1253
1254
1255 /* Column 3 of destination computed here */
1256 /* It is stored in m_temp_reg_53 */
1257 /* Column 4 of destination computed here */
1258 /* It is stored in m_temp_reg_54 */
1259
1260 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
1261 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
1262
1263 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
1264 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
1265
1266 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
1267 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
1268
1269
1270 m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
1271 m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
1272 }
1273 }
1274
1275 /* Transpose of the destination 8x8 matrix done here */
1276 /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */
1277 /* respectively */
1278 {
1279 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
1280 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
1281 m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
1282 m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
1283
1284 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
1285 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
1286 m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
1287 m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
1288
1289 m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
1290 m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
1291 m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
1292 m_temp_reg_53 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
1293
1294 m_temp_reg_54 = _mm_setzero_si128();
1295 m_temp_reg_55 = _mm_setzero_si128();
1296 m_temp_reg_56 = _mm_setzero_si128();
1297 m_temp_reg_57 = _mm_setzero_si128();
1298 }
1299 }
1300
1301 /* Stage 2 */
1302 i4_shift = IT_SHIFT_STAGE_2;
1303 {
1304 /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
1305 /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
1306 {
1307 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[0][0]); //add
1308 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[3][0]); //sub
1309
1310 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_54);
1311 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_54);
1312
1313 m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
1314 m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
1315 m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
1316 m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
1317
1318
1319 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[1][0]);
1320 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[2][0]);
1321 }
1322
1323
1324 /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
1325 /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
1326 {
1327
1328 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_56);
1329 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_56);
1330
1331
1332 m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
1333 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
1334 m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
1335 m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
1336
1337 /* Loading coeff for computing o0 in the next block */
1338 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[0][0]);
1339
1340
1341 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_51, m_temp_reg_53);
1342 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_51, m_temp_reg_53);
1343
1344
1345
1346 /* e */
1347
1348 /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
1349 /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
1350 /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
1351 /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
1352 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
1353 m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
1354
1355 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
1356 m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
1357
1358 m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17);
1359 m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17);
1360
1361 m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15);
1362 m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15);
1363
1364 }
1365
1366 /* o */
1367 {
1368
1369 /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
1370 {
1371 //o0:1B*89+3B*75,1T*89+3T*75
1372 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
1373 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
1374
1375 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
1376 m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
1377 /* Loading coeff for computing o1 in the next block */
1378 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[2][0]);
1379
1380
1381
1382 /* Column 0 of destination computed here */
1383 /* It is stored in m_temp_reg_50 */
1384 /* Column 7 of destination computed here */
1385 /* It is stored in m_temp_reg_57 */
1386
1387 m_temp_reg_2 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
1388 m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
1389
1390 m_temp_reg_3 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
1391 m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
1392
1393 m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
1394 m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
1395 m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
1396 m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
1397
1398 //o1:1B*75-3B*18,1T*75-3T*18
1399 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_0, m_coeff3);
1400 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
1401
1402 m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
1403 m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
1404 m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
1405 m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
1406
1407 m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
1408 m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
1409
1410
1411 /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */
1412
1413
1414 /* Loading coeff for computing o2 in the next block */
1415 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[4][0]);
1416
1417
1418
1419 /* Column 1 of destination computed here */
1420 /* It is stored in m_temp_reg_51 */
1421 /* Column 6 of destination computed here */
1422 /* It is stored in m_temp_reg_56 */
1423
1424 m_temp_reg_2 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
1425 m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
1426
1427 m_temp_reg_3 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33);
1428 m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_33);
1429
1430 m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
1431 m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
1432 m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
1433 m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
1434
1435 //o2:1B*50-3B*89,1T*50-3T*89
1436 m_temp_reg_34 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
1437 m_temp_reg_35 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
1438
1439 m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
1440 m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
1441 m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
1442 m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
1443
1444 m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
1445 m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
1446
1447
1448 /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
1449
1450 /* Loading coeff for computing o3 in the next block */
1451
1452 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[6][0]);
1453
1454
1455 /* Column 2 of destination computed here */
1456 /* It is stored in m_temp_reg_52 */
1457 /* Column 5 of destination computed here */
1458 /* It is stored in m_temp_reg_55 */
1459
1460 m_temp_reg_2 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
1461 m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
1462
1463 m_temp_reg_3 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35);
1464 m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_35);
1465
1466 m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
1467 m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
1468 m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
1469 m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
1470
1471 //o3:1B*18-3B*50,1T*18-3T*50
1472 m_temp_reg_36 = _mm_madd_epi16(m_temp_reg_0, m_coeff3);
1473 m_temp_reg_37 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
1474
1475 m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
1476 m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
1477 m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
1478 m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
1479
1480
1481 m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
1482 m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
1483
1484
1485
1486 /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
1487
1488
1489 /* Column 3 of destination computed here */
1490 /* It is stored in m_temp_reg_53 */
1491 /* Column 4 of destination computed here */
1492 /* It is stored in m_temp_reg_54 */
1493
1494 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
1495 m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
1496
1497 m_temp_reg_21 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37);
1498 m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37);
1499
1500 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_rdng_factor);
1501 m_temp_reg_21 = _mm_add_epi32(m_temp_reg_21, m_rdng_factor);
1502 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_rdng_factor);
1503 m_temp_reg_23 = _mm_add_epi32(m_temp_reg_23, m_rdng_factor);
1504
1505 m_temp_reg_20 = _mm_srai_epi32(m_temp_reg_20, i4_shift);
1506 m_temp_reg_21 = _mm_srai_epi32(m_temp_reg_21, i4_shift);
1507 m_temp_reg_22 = _mm_srai_epi32(m_temp_reg_22, i4_shift);
1508 m_temp_reg_23 = _mm_srai_epi32(m_temp_reg_23, i4_shift);
1509
1510 m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21);
1511 m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23);
1512 }
1513 }
1514
1515 /* Transpose of the destination 8x8 matrix done here */
1516 /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */
1517 /* respectively */
1518 {
1519 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
1520 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
1521 m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51);
1522 m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53);
1523 m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
1524 m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
1525 m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15);
1526 m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15);
1527
1528 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
1529 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
1530 m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55);
1531 m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57);
1532 m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
1533 m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
1534 m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17);
1535 m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17);
1536 m_temp_reg_10 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
1537 m_temp_reg_11 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
1538 m_temp_reg_12 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
1539 m_temp_reg_13 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
1540
1541 m_temp_reg_14 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6);
1542 m_temp_reg_15 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6);
1543 m_temp_reg_16 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7);
1544 m_temp_reg_17 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7);
1545 }
1546
1547 /* Recon and store */
1548 {
1549 m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_pred);
1550 pu1_pred += pred_strd;
1551 m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pu1_pred);
1552 pu1_pred += pred_strd;
1553 m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pu1_pred);
1554 pu1_pred += pred_strd;
1555 m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pu1_pred);
1556 pu1_pred += pred_strd;
1557 m_temp_reg_4 = _mm_loadl_epi64((__m128i *)pu1_pred);
1558 pu1_pred += pred_strd;
1559 m_temp_reg_5 = _mm_loadl_epi64((__m128i *)pu1_pred);
1560 pu1_pred += pred_strd;
1561 m_temp_reg_6 = _mm_loadl_epi64((__m128i *)pu1_pred);
1562 pu1_pred += pred_strd;
1563 m_temp_reg_7 = _mm_loadl_epi64((__m128i *)pu1_pred);
1564
1565 m_temp_reg_50 = _mm_setzero_si128();
1566 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, m_temp_reg_50);
1567 m_temp_reg_1 = _mm_unpacklo_epi8(m_temp_reg_1, m_temp_reg_50);
1568 m_temp_reg_2 = _mm_unpacklo_epi8(m_temp_reg_2, m_temp_reg_50);
1569 m_temp_reg_3 = _mm_unpacklo_epi8(m_temp_reg_3, m_temp_reg_50);
1570 m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_4, m_temp_reg_50);
1571 m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_5, m_temp_reg_50);
1572 m_temp_reg_6 = _mm_unpacklo_epi8(m_temp_reg_6, m_temp_reg_50);
1573 m_temp_reg_7 = _mm_unpacklo_epi8(m_temp_reg_7, m_temp_reg_50);
1574
1575 m_temp_reg_50 = _mm_add_epi16(m_temp_reg_10, m_temp_reg_0);
1576 m_temp_reg_51 = _mm_add_epi16(m_temp_reg_11, m_temp_reg_1);
1577 m_temp_reg_52 = _mm_add_epi16(m_temp_reg_12, m_temp_reg_2);
1578 m_temp_reg_53 = _mm_add_epi16(m_temp_reg_13, m_temp_reg_3);
1579 m_temp_reg_54 = _mm_add_epi16(m_temp_reg_14, m_temp_reg_4);
1580 m_temp_reg_55 = _mm_add_epi16(m_temp_reg_15, m_temp_reg_5);
1581 m_temp_reg_56 = _mm_add_epi16(m_temp_reg_16, m_temp_reg_6);
1582 m_temp_reg_57 = _mm_add_epi16(m_temp_reg_17, m_temp_reg_7);
1583
1584 m_temp_reg_50 = _mm_packus_epi16(m_temp_reg_50, m_temp_reg_50);
1585 m_temp_reg_51 = _mm_packus_epi16(m_temp_reg_51, m_temp_reg_51);
1586 m_temp_reg_52 = _mm_packus_epi16(m_temp_reg_52, m_temp_reg_52);
1587 m_temp_reg_53 = _mm_packus_epi16(m_temp_reg_53, m_temp_reg_53);
1588 m_temp_reg_54 = _mm_packus_epi16(m_temp_reg_54, m_temp_reg_54);
1589 m_temp_reg_55 = _mm_packus_epi16(m_temp_reg_55, m_temp_reg_55);
1590 m_temp_reg_56 = _mm_packus_epi16(m_temp_reg_56, m_temp_reg_56);
1591 m_temp_reg_57 = _mm_packus_epi16(m_temp_reg_57, m_temp_reg_57);
1592
1593 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_50);
1594 pu1_dst += dst_strd;
1595 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_51);
1596 pu1_dst += dst_strd;
1597 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_52);
1598 pu1_dst += dst_strd;
1599 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_53);
1600 pu1_dst += dst_strd;
1601 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_54);
1602 pu1_dst += dst_strd;
1603 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_55);
1604 pu1_dst += dst_strd;
1605 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_56);
1606 pu1_dst += dst_strd;
1607 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_57);
1608 pu1_dst += dst_strd;
1609 }
1610 }
1611 }
1612 else
1613
1614 {
1615
1616 /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
1617 /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
1618 if(!check_row_stage_1)
1619 {
1620 /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
1621 /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
1622 {
1623 //Interleaving 0,4 row in 0 , 1 Rishab
1624 /*coef2 for m_temp_reg_12 and m_temp_reg_13 , coef1 for m_temp_reg_10 and m_temp_reg_11*/
1625 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[3][0]);
1626 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[0][0]);
1627
1628 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74);
1629 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_74);
1630
1631 m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
1632 m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
1633
1634
1635 m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
1636 m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
1637 }
1638
1639
1640 /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
1641 /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
1642 {
1643
1644 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83
1645 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36
1646
1647 /* Combining instructions to eliminate them based on zero_rows : Lokesh */
1648 //Interleaving 2,6 row in 4, 5 Rishab
1649 m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
1650 m_temp_reg_5 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76);
1651
1652 m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1);
1653 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
1654
1655 m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_5, m_coeff1);
1656 m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_5, m_coeff2);
1657
1658
1659
1660 /* Loading coeff for computing o0, o1, o2 and o3 in the next block */
1661
1662 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[2][0]);
1663 //m_coeff4 = _mm_loadu_si128((__m128i *) &g_ai2_ihevc_trans_intr_odd_8[3][0]);
1664
1665 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[0][0]);
1666 //m_coeff2 = _mm_loadu_si128((__m128i *) &g_ai2_ihevc_trans_intr_odd_8[1][0]);
1667
1668 }
1669
1670 /* e */
1671 {
1672 /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
1673 /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
1674 /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
1675 /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
1676 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
1677 m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
1678
1679 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
1680 m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
1681
1682 m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17);
1683 m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17);
1684
1685 m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15);
1686 m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15);
1687
1688 }
1689
1690 /* o */
1691 {
1692
1693 /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
1694 {
1695
1696 m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
1697 m_temp_reg_61 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73);
1698 //o0:1B*89+3B*75,1T*89+3T*75
1699 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
1700 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_61, m_coeff1);
1701
1702 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
1703 m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
1704
1705 }
1706
1707 /* Column 0 of destination computed here */
1708 /* It is stored in m_temp_reg_50 */
1709 /* Column 7 of destination computed here */
1710 /* It is stored in m_temp_reg_57 */
1711 {
1712
1713
1714 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
1715 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
1716
1717 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
1718 m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
1719
1720 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
1721 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
1722 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
1723 m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
1724
1725 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
1726 m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
1727 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
1728 m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
1729
1730 //o1:1B*75-3B*18,1T*75-3T*18
1731 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
1732 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_61, m_coeff3);
1733
1734 m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
1735 m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
1736
1737 /* Loading coeff for computing o2 in the next block */
1738
1739 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[4][0]);
1740
1741 }
1742
1743 /* Column 1 of destination computed here */
1744 /* It is stored in m_temp_reg_51 */
1745 /* Column 6 of destination computed here */
1746 /* It is stored in m_temp_reg_56 */
1747 {
1748 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
1749 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
1750
1751 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33);
1752 m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_33);
1753
1754 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
1755 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
1756 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
1757 m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
1758
1759 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
1760 m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
1761 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
1762 m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
1763
1764 //o2:1B*50-3B*89,1T*50-3T*89
1765 m_temp_reg_34 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
1766 m_temp_reg_35 = _mm_madd_epi16(m_temp_reg_61, m_coeff1);
1767
1768 m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
1769 m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
1770
1771
1772 /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
1773
1774
1775 /* Loading coeff for computing o3 in the next block */
1776
1777 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[6][0]);
1778
1779 }
1780
1781 /* Column 2 of destination computed here */
1782 /* It is stored in m_temp_reg_52 */
1783 /* Column 5 of destination computed here */
1784 /* It is stored in m_temp_reg_55 */
1785 {
1786 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
1787 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
1788
1789 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35);
1790 m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_35);
1791
1792 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
1793 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
1794 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
1795 m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
1796
1797 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
1798 m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
1799 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
1800 m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
1801
1802 //o3:1B*18-3B*50,1T*18-3T*50
1803 m_temp_reg_36 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
1804 m_temp_reg_37 = _mm_madd_epi16(m_temp_reg_61, m_coeff3);
1805
1806 m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
1807 m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
1808
1809
1810
1811 /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
1812
1813
1814 }
1815
1816 /* Column 3 of destination computed here */
1817 /* It is stored in m_temp_reg_53 */
1818 /* Column 4 of destination computed here */
1819 /* It is stored in m_temp_reg_54 */
1820 {
1821 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
1822 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
1823
1824 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37);
1825 m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37);
1826
1827 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
1828 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
1829 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
1830 m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
1831
1832 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
1833 m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
1834 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
1835 m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
1836
1837 m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
1838 m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
1839 }
1840 }
1841
1842 /* Transpose of the destination 8x8 matrix done here */
1843 /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */
1844 /* respectively */
1845 {
1846
1847
1848 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
1849 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
1850 m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51);
1851 m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53);
1852 m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
1853 m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
1854 m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15);
1855 m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15);
1856
1857 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
1858 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
1859 m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55);
1860 m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57);
1861 m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
1862 m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
1863 m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17);
1864 m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17);
1865
1866 m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
1867 m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
1868 m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
1869 m_temp_reg_53 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
1870
1871 m_temp_reg_54 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6);
1872 m_temp_reg_55 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6);
1873 m_temp_reg_56 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7);
1874 m_temp_reg_57 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7);
1875 }
1876 }
1877 else
1878 {
1879
1880 /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
1881 /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
1882 {
1883 //Interleaving 0,4 row in 0 , 1 Rishab
1884 /*coef2 for m_temp_reg_12 and m_temp_reg_13 , coef1 for m_temp_reg_10 and m_temp_reg_11*/
1885 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[3][0]);
1886 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[0][0]);
1887
1888 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74);
1889 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_74);
1890
1891 m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
1892 m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
1893
1894
1895 m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
1896 m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
1897 }
1898
1899
1900 /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
1901 /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
1902 {
1903
1904 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83
1905 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36
1906
1907 /* Combining instructions to eliminate them based on zero_rows : Lokesh */
1908 //Interleaving 2,6 row in 4, 5 Rishab
1909 m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
1910 m_temp_reg_5 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76);
1911
1912 m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1);
1913 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
1914
1915 m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_5, m_coeff1);
1916 m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_5, m_coeff2);
1917
1918
1919
1920 /* Loading coeff for computing o0, o1, o2 and o3 in the next block */
1921
1922 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[2][0]);
1923 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[3][0]);
1924
1925 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[0][0]);
1926 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[1][0]);
1927
1928 }
1929
1930 /* e */
1931 {
1932 /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
1933 /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
1934 /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
1935 /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
1936 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
1937 m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
1938
1939 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
1940 m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
1941
1942 m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17);
1943 m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17);
1944
1945 m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15);
1946 m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15);
1947
1948 }
1949
1950 /* o */
1951 {
1952
1953 /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
1954 {
1955
1956 m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
1957 m_temp_reg_61 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73);
1958 m_temp_reg_64 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77);
1959 m_temp_reg_65 = _mm_unpackhi_epi16(m_temp_reg_75, m_temp_reg_77);
1960 //o0:1B*89+3B*75,1T*89+3T*75,5B*50+7B*18,5T*50+7T*18
1961 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
1962 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_61, m_coeff1);
1963 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2);
1964 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_65, m_coeff2);
1965
1966
1967 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
1968 m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
1969
1970 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
1971 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25);
1972 }
1973
1974 /* Column 0 of destination computed here */
1975 /* It is stored in m_temp_reg_50 */
1976 /* Column 7 of destination computed here */
1977 /* It is stored in m_temp_reg_57 */
1978 {
1979
1980
1981 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
1982 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
1983
1984 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
1985 m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
1986
1987 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
1988 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
1989 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
1990 m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
1991
1992 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
1993 m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
1994 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
1995 m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
1996
1997 //o1:1B*75-3B*18,1T*75-3T*18,5B*89+7B*50,5T*89+7T*50
1998 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
1999 m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4);
2000 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_61, m_coeff3);
2001 m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_65, m_coeff4);
2002
2003 m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
2004 m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
2005
2006 /* Loading coeff for computing o2 in the next block */
2007
2008 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[4][0]);
2009 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[5][0]);
2010
2011 /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */
2012 m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_26);
2013 m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_27);
2014 }
2015
2016 /* Column 1 of destination computed here */
2017 /* It is stored in m_temp_reg_51 */
2018 /* Column 6 of destination computed here */
2019 /* It is stored in m_temp_reg_56 */
2020 {
2021 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
2022 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
2023
2024 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33);
2025 m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_33);
2026
2027 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
2028 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
2029 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
2030 m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
2031
2032 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
2033 m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
2034 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
2035 m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
2036
2037 //o2:1B*50-3B*89,1T*50-3T*89,5B*18+7B*75,5T*18+7T*75
2038 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
2039 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2);
2040 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_61, m_coeff1);
2041 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_65, m_coeff2);
2042
2043 m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
2044 m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
2045
2046
2047 /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
2048
2049
2050 /* Loading coeff for computing o3 in the next block */
2051
2052 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[6][0]);
2053 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[7][0]);
2054
2055 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
2056 m_temp_reg_35 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25);
2057 }
2058
2059 /* Column 2 of destination computed here */
2060 /* It is stored in m_temp_reg_52 */
2061 /* Column 5 of destination computed here */
2062 /* It is stored in m_temp_reg_55 */
2063 {
2064 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
2065 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
2066
2067 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35);
2068 m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_35);
2069
2070 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
2071 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
2072 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
2073 m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
2074
2075 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
2076 m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
2077 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
2078 m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
2079
2080 //o3:1B*18-3B*50,1T*18-3T*50,5B*75-7B*89,5T*75-7T*89
2081 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
2082 m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4);
2083 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_61, m_coeff3);
2084 m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_65, m_coeff4);
2085
2086 m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
2087 m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
2088
2089
2090
2091 /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
2092
2093
2094 m_temp_reg_36 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_26);
2095 m_temp_reg_37 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_27);
2096 }
2097
2098 /* Column 3 of destination computed here */
2099 /* It is stored in m_temp_reg_53 */
2100 /* Column 4 of destination computed here */
2101 /* It is stored in m_temp_reg_54 */
2102 {
2103 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
2104 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
2105
2106 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37);
2107 m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37);
2108
2109 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
2110 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
2111 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
2112 m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
2113
2114 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
2115 m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
2116 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
2117 m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
2118
2119 m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
2120 m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
2121 }
2122 }
2123
2124 /* Transpose of the destination 8x8 matrix done here */
2125 /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */
2126 /* respectively */
2127 {
2128
2129
2130 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
2131 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
2132 m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51);
2133 m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53);
2134 m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
2135 m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
2136 m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15);
2137 m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15);
2138
2139 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
2140 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
2141 m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55);
2142 m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57);
2143 m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
2144 m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
2145 m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17);
2146 m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17);
2147
2148 m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
2149 m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
2150 m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
2151 m_temp_reg_53 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
2152
2153 m_temp_reg_54 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6);
2154 m_temp_reg_55 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6);
2155 m_temp_reg_56 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7);
2156 m_temp_reg_57 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7);
2157 }
2158 }
2159 /* Stage 2 */
2160
2161 i4_shift = IT_SHIFT_STAGE_2;
2162
2163 {
2164
2165 /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
2166 /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
2167 {
2168 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[0][0]); //add
2169 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[3][0]); //sub
2170
2171 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_54);
2172 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_54);
2173
2174 m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
2175 m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
2176 m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
2177 m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
2178
2179
2180 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[1][0]);
2181 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[2][0]);
2182 }
2183
2184
2185 /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
2186 /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
2187 {
2188 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_56);
2189 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_56);
2190
2191
2192 m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
2193 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
2194 m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
2195 m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
2196
2197 /* Loading coeff for computing o0 in the next block */
2198 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[0][0]);
2199 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[1][0]);
2200
2201
2202 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_51, m_temp_reg_53);
2203 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_51, m_temp_reg_53);
2204 }
2205
2206 /* e */
2207 {
2208 /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
2209 /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
2210 /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
2211 /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
2212 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
2213 m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
2214
2215 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
2216 m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
2217
2218 m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17);
2219 m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17);
2220
2221 m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15);
2222 m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15);
2223
2224 }
2225
2226 /* o */
2227 {
2228 m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_55, m_temp_reg_57);
2229 m_temp_reg_5 = _mm_unpackhi_epi16(m_temp_reg_55, m_temp_reg_57);
2230
2231 /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
2232 {
2233 //o0:1B*89+3B*75,1T*89+3T*75,5B*50+7B*18,5T*50+7T*18
2234 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
2235 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
2236 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
2237 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_5, m_coeff2);
2238
2239 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2240 m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
2241 /* Loading coeff for computing o1 in the next block */
2242 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[2][0]);
2243 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[3][0]);
2244
2245 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
2246 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25);
2247 }
2248
2249 /* Column 0 of destination computed here */
2250 /* It is stored in m_temp_reg_50 */
2251 /* Column 7 of destination computed here */
2252 /* It is stored in m_temp_reg_57 */
2253 {
2254 m_temp_reg_2 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
2255 m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
2256
2257 m_temp_reg_3 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
2258 m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
2259
2260 m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
2261 m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
2262 m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
2263 m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
2264
2265 m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
2266 m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
2267 m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
2268 m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
2269
2270 //o1:1B*75-3B*18,1T*75-3T*18,5B*89+7B*50,5T*89+7T*50
2271 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff3);
2272 m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_4, m_coeff4);
2273 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
2274 m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_5, m_coeff4);
2275
2276 m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
2277 m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
2278
2279
2280 /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */
2281
2282
2283 /* Loading coeff for computing o2 in the next block */
2284 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[4][0]);
2285 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[5][0]);
2286
2287 m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_26);
2288 m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_27);
2289 }
2290
2291 /* Column 1 of destination computed here */
2292 /* It is stored in m_temp_reg_51 */
2293 /* Column 6 of destination computed here */
2294 /* It is stored in m_temp_reg_56 */
2295 {
2296 m_temp_reg_2 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
2297 m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
2298
2299 m_temp_reg_3 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33);
2300 m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_33);
2301
2302 m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
2303 m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
2304 m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
2305 m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
2306
2307 m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
2308 m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
2309 m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
2310 m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
2311
2312 //o2:1B*50-3B*89,1T*50-3T*89,5B*18+7B*75,5T*18+7T*75
2313 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
2314 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
2315 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
2316 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_5, m_coeff2);
2317
2318 m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
2319 m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
2320
2321
2322 /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
2323
2324 /* Loading coeff for computing o3 in the next block */
2325
2326 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[6][0]);
2327 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[7][0]);
2328
2329 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
2330 m_temp_reg_35 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25);
2331 }
2332
2333 /* Column 2 of destination computed here */
2334 /* It is stored in m_temp_reg_52 */
2335 /* Column 5 of destination computed here */
2336 /* It is stored in m_temp_reg_55 */
2337 {
2338 m_temp_reg_2 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
2339 m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
2340
2341 m_temp_reg_3 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35);
2342 m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_35);
2343
2344 m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
2345 m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
2346 m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
2347 m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
2348
2349 m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
2350 m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
2351 m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
2352 m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
2353
2354 //o3:1B*18-3B*50,1T*18-3T*50,5B*75-7B*89,5T*75-7T*89
2355 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff3);
2356 m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_4, m_coeff4);
2357 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
2358 m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_5, m_coeff4);
2359
2360 m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
2361 m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
2362
2363
2364
2365 /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
2366
2367
2368 m_temp_reg_36 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_26);
2369 m_temp_reg_37 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_27);
2370 }
2371
2372 /* Column 3 of destination computed here */
2373 /* It is stored in m_temp_reg_53 */
2374 /* Column 4 of destination computed here */
2375 /* It is stored in m_temp_reg_54 */
2376 {
2377 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
2378 m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
2379
2380 m_temp_reg_21 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37);
2381 m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37);
2382
2383 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_rdng_factor);
2384 m_temp_reg_21 = _mm_add_epi32(m_temp_reg_21, m_rdng_factor);
2385 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_rdng_factor);
2386 m_temp_reg_23 = _mm_add_epi32(m_temp_reg_23, m_rdng_factor);
2387
2388 m_temp_reg_20 = _mm_srai_epi32(m_temp_reg_20, i4_shift);
2389 m_temp_reg_21 = _mm_srai_epi32(m_temp_reg_21, i4_shift);
2390 m_temp_reg_22 = _mm_srai_epi32(m_temp_reg_22, i4_shift);
2391 m_temp_reg_23 = _mm_srai_epi32(m_temp_reg_23, i4_shift);
2392
2393 m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21);
2394 m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23);
2395 }
2396 }
2397
2398 /* Transpose of the destination 8x8 matrix done here */
2399 /* and ultimately stored in registers m_temp_reg_10 to m_temp_reg_17 */
2400 /* respectively */
2401 {
2402 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
2403 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
2404 m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51);
2405 m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53);
2406 m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
2407 m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
2408 m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15);
2409 m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15);
2410
2411 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
2412 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
2413 m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55);
2414 m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57);
2415 m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
2416 m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
2417 m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17);
2418 m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17);
2419 m_temp_reg_10 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
2420 m_temp_reg_11 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
2421 m_temp_reg_12 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
2422 m_temp_reg_13 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
2423
2424 m_temp_reg_14 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6);
2425 m_temp_reg_15 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6);
2426 m_temp_reg_16 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7);
2427 m_temp_reg_17 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7);
2428 }
2429
2430 /* Recon and store */
2431 {
2432 m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_pred);
2433 pu1_pred += pred_strd;
2434 m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pu1_pred);
2435 pu1_pred += pred_strd;
2436 m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pu1_pred);
2437 pu1_pred += pred_strd;
2438 m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pu1_pred);
2439 pu1_pred += pred_strd;
2440 m_temp_reg_4 = _mm_loadl_epi64((__m128i *)pu1_pred);
2441 pu1_pred += pred_strd;
2442 m_temp_reg_5 = _mm_loadl_epi64((__m128i *)pu1_pred);
2443 pu1_pred += pred_strd;
2444 m_temp_reg_6 = _mm_loadl_epi64((__m128i *)pu1_pred);
2445 pu1_pred += pred_strd;
2446 m_temp_reg_7 = _mm_loadl_epi64((__m128i *)pu1_pred);
2447
2448
2449 m_temp_reg_50 = _mm_setzero_si128();
2450 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, m_temp_reg_50);
2451 m_temp_reg_1 = _mm_unpacklo_epi8(m_temp_reg_1, m_temp_reg_50);
2452 m_temp_reg_2 = _mm_unpacklo_epi8(m_temp_reg_2, m_temp_reg_50);
2453 m_temp_reg_3 = _mm_unpacklo_epi8(m_temp_reg_3, m_temp_reg_50);
2454 m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_4, m_temp_reg_50);
2455 m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_5, m_temp_reg_50);
2456 m_temp_reg_6 = _mm_unpacklo_epi8(m_temp_reg_6, m_temp_reg_50);
2457 m_temp_reg_7 = _mm_unpacklo_epi8(m_temp_reg_7, m_temp_reg_50);
2458
2459 m_temp_reg_50 = _mm_add_epi16(m_temp_reg_10, m_temp_reg_0);
2460 m_temp_reg_51 = _mm_add_epi16(m_temp_reg_11, m_temp_reg_1);
2461 m_temp_reg_52 = _mm_add_epi16(m_temp_reg_12, m_temp_reg_2);
2462 m_temp_reg_53 = _mm_add_epi16(m_temp_reg_13, m_temp_reg_3);
2463 m_temp_reg_54 = _mm_add_epi16(m_temp_reg_14, m_temp_reg_4);
2464 m_temp_reg_55 = _mm_add_epi16(m_temp_reg_15, m_temp_reg_5);
2465 m_temp_reg_56 = _mm_add_epi16(m_temp_reg_16, m_temp_reg_6);
2466 m_temp_reg_57 = _mm_add_epi16(m_temp_reg_17, m_temp_reg_7);
2467
2468 m_temp_reg_50 = _mm_packus_epi16(m_temp_reg_50, m_temp_reg_50);
2469 m_temp_reg_51 = _mm_packus_epi16(m_temp_reg_51, m_temp_reg_51);
2470 m_temp_reg_52 = _mm_packus_epi16(m_temp_reg_52, m_temp_reg_52);
2471 m_temp_reg_53 = _mm_packus_epi16(m_temp_reg_53, m_temp_reg_53);
2472 m_temp_reg_54 = _mm_packus_epi16(m_temp_reg_54, m_temp_reg_54);
2473 m_temp_reg_55 = _mm_packus_epi16(m_temp_reg_55, m_temp_reg_55);
2474 m_temp_reg_56 = _mm_packus_epi16(m_temp_reg_56, m_temp_reg_56);
2475 m_temp_reg_57 = _mm_packus_epi16(m_temp_reg_57, m_temp_reg_57);
2476
2477 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_50);
2478 pu1_dst += dst_strd;
2479 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_51);
2480 pu1_dst += dst_strd;
2481 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_52);
2482 pu1_dst += dst_strd;
2483 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_53);
2484 pu1_dst += dst_strd;
2485 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_54);
2486 pu1_dst += dst_strd;
2487 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_55);
2488 pu1_dst += dst_strd;
2489 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_56);
2490 pu1_dst += dst_strd;
2491 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_57);
2492 pu1_dst += dst_strd;
2493
2494 }
2495
2496
2497 }
2498
2499
2500 }
2501 }
2502