1 /******************************************************************************
2 *
3 * Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at:
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 *
17 ******************************************************************************/
18 /**
19 *******************************************************************************
20 * @file
21 * ihevc_itrans_recon_atom_intr.c
22 *
23 * @brief
24 * Contains function definitions for inverse quantization, inverse
25 * transform and reconstruction
26 *
27 * @author
28 * 100470
29 * 100592 (edited by)
30 *
31 * @par List of Functions:
32 * - ihevc_itrans_recon_4x4_ttype1_ssse3()
33 * - ihevc_itrans_recon_4x4_ssse3()
34 * - ihevc_itrans_recon_8x8_ssse3()
35 *
36 * @remarks
37 * None
38 *
39 *******************************************************************************
40 */
41 #include <stdio.h>
42 #include <string.h>
43 #include "ihevc_typedefs.h"
44 #include "ihevc_platform_macros.h"
45 #include "ihevc_macros.h"
46 #include "ihevc_defs.h"
47 #include "ihevc_func_selector.h"
48 #include "ihevc_trans_tables.h"
49 #include "ihevc_iquant_itrans_recon.h"
50 #include "ihevc_trans_macros.h"
51
52
53 #include <immintrin.h>
54 #include <emmintrin.h>
55
56 #include <tmmintrin.h>
57
58
59 /**
60 *******************************************************************************
61 *
62 * @brief
63 * This function performs inverse quantization, inverse transform
64 * type1(DST) and reconstruction for 4x4 input block
65 *
66 * @par Description:
67 * Performs inverse quantization , inverse transform type 1 and adds
68 * prediction data and clips output to 8 bit
69 *
70 * @param[in] pi2_src
71 * Input 4x4 coefficients
72 *
73 * @param[in] pi2_tmp
74 * Temporary 4x4 buffer for storing inverse
75 * transform 1st stage output
76 *
77 * @param[in] pu1_pred
78 * Prediction 4x4 block
79 *
80 * @param[in] pi2_dequant_coeff
81 * Dequant Coeffs
82 *
83 * @param[out] pu1_dst
84 * Output 4x4 block
85 *
86 * @param[in] qp_div
87 * Quantization parameter / 6
88 *
89 * @param[in] qp_rem
90 * Quantization parameter % 6
91 *
92 * @param[in] src_strd
93 * Input stride
94 *
95 * @param[in] pred_strd
96 * Prediction stride
97 *
98 * @param[in] dst_strd
99 * Output Stride
100 *
101 * @param[in] zero_cols
102 * Zero columns in pi2_src
103 *
104 * @returns Void
105 *
106 * @remarks
107 * None
108 *
109 *******************************************************************************
110 */
111
ihevc_itrans_recon_4x4_ttype1_ssse3(WORD16 * pi2_src,WORD16 * pi2_tmp,UWORD8 * pu1_pred,UWORD8 * pu1_dst,WORD32 src_strd,WORD32 pred_strd,WORD32 dst_strd,WORD32 zero_cols,WORD32 zero_rows)112 void ihevc_itrans_recon_4x4_ttype1_ssse3(WORD16 *pi2_src,
113 WORD16 *pi2_tmp,
114 UWORD8 *pu1_pred,
115 UWORD8 *pu1_dst,
116 WORD32 src_strd,
117 WORD32 pred_strd,
118 WORD32 dst_strd,
119 WORD32 zero_cols,
120 WORD32 zero_rows)
121 {
122 __m128i m_temp_reg_0;
123 __m128i m_temp_reg_1;
124 __m128i m_temp_reg_2;
125 __m128i m_temp_reg_3;
126 __m128i m_temp_reg_4;
127 __m128i m_temp_reg_10;
128 __m128i m_temp_reg_11;
129 __m128i m_temp_reg_12;
130 __m128i m_temp_reg_13;
131 __m128i m_temp_reg_14;
132 __m128i m_temp_reg_20;
133 __m128i m_temp_reg_21;
134 __m128i m_temp_reg_22;
135 __m128i m_temp_reg_23;
136 __m128i m_temp_reg_24;
137 __m128i m_temp_reg_25;
138 __m128i m_temp_reg_30;
139 __m128i m_temp_reg_31;
140 __m128i m_temp_reg_32;
141 __m128i m_temp_reg_33;
142 __m128i m_temp_reg_34;
143 __m128i m_temp_reg_35;
144 __m128i m_temp_reg_36;
145 __m128i m_rdng_factor;
146 __m128i m_count;
147
148 __m128i m_ge_zero16b_flag_row0;
149 __m128i m_ge_zero16b_flag_row1;
150 __m128i m_ge_zero16b_flag_row2;
151 __m128i m_ge_zero16b_flag_row3;
152
153 __m128i m_zero = _mm_setzero_si128();
154
155 WORD32 i4_shift = IT_SHIFT_STAGE_1;
156 UNUSED(zero_cols);
157 UNUSED(zero_rows);
158 UNUSED(pi2_tmp);
159 m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pi2_src);
160 pi2_src += src_strd;
161 m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pi2_src);
162 pi2_src += src_strd;
163 m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pi2_src);
164 pi2_src += src_strd;
165 m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pi2_src);
166
167 m_ge_zero16b_flag_row0 = _mm_cmpgt_epi16(m_zero, m_temp_reg_0);
168 m_ge_zero16b_flag_row1 = _mm_cmpgt_epi16(m_zero, m_temp_reg_1);
169 m_ge_zero16b_flag_row2 = _mm_cmpgt_epi16(m_zero, m_temp_reg_2);
170 m_ge_zero16b_flag_row3 = _mm_cmpgt_epi16(m_zero, m_temp_reg_3);
171
172 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_0, m_ge_zero16b_flag_row0);
173 m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_1, m_ge_zero16b_flag_row1);
174 m_temp_reg_2 = _mm_unpacklo_epi16(m_temp_reg_2, m_ge_zero16b_flag_row2);
175 m_temp_reg_3 = _mm_unpacklo_epi16(m_temp_reg_3, m_ge_zero16b_flag_row3);
176
177 /*m_temp_reg_0 = _mm_cvtepi16_epi32(m_temp_reg_0);
178 m_temp_reg_2 = _mm_cvtepi16_epi32(m_temp_reg_2);
179
180 m_temp_reg_1 = _mm_cvtepi16_epi32(m_temp_reg_1);
181 m_temp_reg_3 = _mm_cvtepi16_epi32(m_temp_reg_3);*/
182
183 /* c[4] in m_temp_reg_14 */
184 /* c[4] = src[0] - src[2] + src[3] */
185 {
186 m_temp_reg_14 = _mm_sub_epi32(m_temp_reg_0, m_temp_reg_2);
187 }
188
189 /* c[3] in m_temp_reg_13 */
190 {
191 m_temp_reg_20 = _mm_slli_epi32(m_temp_reg_1, 6);
192 m_temp_reg_21 = _mm_slli_epi32(m_temp_reg_1, 3);
193 m_temp_reg_22 = _mm_slli_epi32(m_temp_reg_1, 1);
194 m_temp_reg_23 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
195 m_temp_reg_13 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
196 //m_temp_reg_13 = _mm_mullo_epi32(m_temp_reg_1, m_coeff3);
197 }
198
199 /* c[0] in m_temp_reg_10 */
200 {
201 m_temp_reg_10 = _mm_add_epi32(m_temp_reg_0, m_temp_reg_2);
202 }
203
204 /* c[1] in m_temp_reg_11 */
205 {
206 m_temp_reg_11 = _mm_add_epi32(m_temp_reg_2, m_temp_reg_3);
207 }
208
209 /* c[2] in m_temp_reg_12 */
210 {
211 m_temp_reg_12 = _mm_sub_epi32(m_temp_reg_0, m_temp_reg_3);
212 }
213
214 /* c[4] in m_temp_reg_14 */
215 /* c[4] = src[0] - src[2] + src[3] */
216 {
217 m_temp_reg_14 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_3);
218 }
219
220 /* Stage 1 outputs stored in m_temp_reg_20-23 */
221 {
222 m_temp_reg_20 = _mm_slli_epi32(m_temp_reg_10, 5);
223 m_temp_reg_21 = _mm_slli_epi32(m_temp_reg_10, 1);
224 m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_10);
225 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_21);
226 //m_temp_reg_30 = _mm_mullo_epi32(m_temp_reg_10, m_coeff1);//29*c0
227
228 m_temp_reg_20 = _mm_slli_epi32(m_temp_reg_11, 6);
229 m_temp_reg_21 = _mm_slli_epi32(m_temp_reg_11, 3);
230 m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_11);
231 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_21);
232 //m_temp_reg_31 = _mm_mullo_epi32(m_temp_reg_11, m_coeff2);//55*c1
233
234 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
235
236 m_temp_reg_20 = _mm_slli_epi32(m_temp_reg_11, 5);
237 m_temp_reg_21 = _mm_slli_epi32(m_temp_reg_11, 1);
238 m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_11);
239 m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_21);
240 //m_temp_reg_32 = _mm_mullo_epi32(m_temp_reg_11, m_coeff1);//29*c1
241
242 m_temp_reg_20 = _mm_slli_epi32(m_temp_reg_12, 6);
243 m_temp_reg_21 = _mm_slli_epi32(m_temp_reg_12, 3);
244 m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_12);
245 m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_21);
246 //m_temp_reg_33 = _mm_mullo_epi32(m_temp_reg_12, m_coeff2);//55*c2
247
248 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
249
250 m_temp_reg_20 = _mm_slli_epi32(m_temp_reg_10, 6);
251 m_temp_reg_21 = _mm_slli_epi32(m_temp_reg_10, 3);
252 m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_10);
253 m_temp_reg_34 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_21);
254 //m_temp_reg_34 = _mm_mullo_epi32(m_temp_reg_10, m_coeff2);//55*c0
255
256 m_temp_reg_20 = _mm_slli_epi32(m_temp_reg_12, 5);
257 m_temp_reg_21 = _mm_slli_epi32(m_temp_reg_12, 1);
258 m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_12);
259 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_21);
260 //m_temp_reg_35 = _mm_mullo_epi32(m_temp_reg_12, m_coeff1);//29*c2
261
262 m_temp_reg_20 = _mm_slli_epi32(m_temp_reg_14, 6);
263 m_temp_reg_21 = _mm_slli_epi32(m_temp_reg_14, 3);
264 m_temp_reg_22 = _mm_slli_epi32(m_temp_reg_14, 1);
265 m_temp_reg_23 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
266 m_temp_reg_36 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
267 //m_temp_reg_36 = _mm_mullo_epi32(m_temp_reg_14, m_coeff3);//74*c4
268
269 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
270 m_count = _mm_cvtsi32_si128(i4_shift);
271
272 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
273 m_temp_reg_4 = _mm_add_epi32(m_rdng_factor, m_temp_reg_13);
274 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_4);
275
276 m_temp_reg_21 = _mm_sub_epi32(m_temp_reg_33, m_temp_reg_32);
277 m_temp_reg_21 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_4);
278
279 m_temp_reg_23 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_35);
280 m_temp_reg_4 = _mm_sub_epi32(m_rdng_factor, m_temp_reg_13);
281 m_temp_reg_23 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_4);
282
283 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_36, m_rdng_factor);
284
285 m_temp_reg_20 = _mm_sra_epi32(m_temp_reg_20, m_count);
286 m_temp_reg_21 = _mm_sra_epi32(m_temp_reg_21, m_count);
287 m_temp_reg_23 = _mm_sra_epi32(m_temp_reg_23, m_count);
288 m_temp_reg_22 = _mm_sra_epi32(m_temp_reg_22, m_count);
289
290 m_temp_reg_20 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21);
291 m_temp_reg_21 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23);
292 m_temp_reg_22 = _mm_srli_si128(m_temp_reg_20, 8);
293 m_temp_reg_23 = _mm_srli_si128(m_temp_reg_21, 8);
294
295 m_temp_reg_24 = _mm_unpacklo_epi16(m_temp_reg_20, m_temp_reg_22);
296 m_temp_reg_25 = _mm_unpacklo_epi16(m_temp_reg_21, m_temp_reg_23);
297
298 m_temp_reg_30 = _mm_unpacklo_epi32(m_temp_reg_24, m_temp_reg_25);
299 m_temp_reg_31 = _mm_unpackhi_epi32(m_temp_reg_24, m_temp_reg_25);
300
301 }
302
303 /* Stage 2 */
304 {
305 i4_shift = IT_SHIFT_STAGE_2;
306
307 /*m_temp_reg_22 = _mm_srli_si128(m_temp_reg_20, 8);
308 m_temp_reg_20 = _mm_cvtepi16_epi32(m_temp_reg_20);
309 m_temp_reg_23 = _mm_srli_si128(m_temp_reg_21, 8);
310 m_temp_reg_21 = _mm_cvtepi16_epi32(m_temp_reg_21);
311 m_temp_reg_22 = _mm_cvtepi16_epi32(m_temp_reg_22);
312 m_temp_reg_23 = _mm_cvtepi16_epi32(m_temp_reg_23);*/
313
314 m_ge_zero16b_flag_row0 = _mm_cmpgt_epi16(m_zero, m_temp_reg_30);
315 m_ge_zero16b_flag_row1 = _mm_cmpgt_epi16(m_zero, m_temp_reg_31);
316
317 m_temp_reg_20 = _mm_unpacklo_epi16(m_temp_reg_30, m_ge_zero16b_flag_row0);
318 m_temp_reg_21 = _mm_unpacklo_epi16(m_temp_reg_31, m_ge_zero16b_flag_row1);
319 m_temp_reg_22 = _mm_unpackhi_epi16(m_temp_reg_30, m_ge_zero16b_flag_row0);
320 m_temp_reg_23 = _mm_unpackhi_epi16(m_temp_reg_31, m_ge_zero16b_flag_row1);
321
322
323 /* c[4] stored in m_temp_reg_4 */
324 {
325 m_temp_reg_4 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
326 }
327
328 /* c[3] stored in m_temp_reg_3 */
329 {
330 m_temp_reg_10 = _mm_slli_epi32(m_temp_reg_22, 6);
331 m_temp_reg_11 = _mm_slli_epi32(m_temp_reg_22, 3);
332 m_temp_reg_12 = _mm_slli_epi32(m_temp_reg_22, 1);
333 m_temp_reg_13 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_11);
334 m_temp_reg_3 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_13);
335 //m_temp_reg_3 = _mm_mullo_epi32(m_temp_reg_22, m_coeff3);
336 }
337
338 /* c[0] stored in m_temp_reg_0 */
339 {
340 m_temp_reg_0 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
341 }
342
343 /* c[1] stored in m_temp_reg_1 */
344 {
345 m_temp_reg_1 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_21);
346 }
347
348 /* c[2] stored in m_temp_reg_2 */
349 {
350 m_temp_reg_2 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_23);
351 }
352
353 /* c[4] stored in m_temp_reg_4 */
354 {
355 m_temp_reg_4 = _mm_add_epi32(m_temp_reg_4, m_temp_reg_23);
356 }
357
358 /* Stage 2 output generation */
359 {
360 m_temp_reg_10 = _mm_slli_epi32(m_temp_reg_0, 5);
361 m_temp_reg_11 = _mm_slli_epi32(m_temp_reg_0, 1);
362 m_temp_reg_13 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_0);
363 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_11);
364 //m_temp_reg_30 = _mm_mullo_epi32(m_temp_reg_0, m_coeff1);//29*c0
365
366 m_temp_reg_10 = _mm_slli_epi32(m_temp_reg_1, 6);
367 m_temp_reg_11 = _mm_slli_epi32(m_temp_reg_1, 3);
368 m_temp_reg_13 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_1);
369 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_11);
370 //m_temp_reg_31 = _mm_mullo_epi32(m_temp_reg_1, m_coeff2);//55*c1
371
372 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
373
374 m_temp_reg_10 = _mm_slli_epi32(m_temp_reg_1, 5);
375 m_temp_reg_11 = _mm_slli_epi32(m_temp_reg_1, 1);
376 m_temp_reg_13 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_1);
377 m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_11);
378 //m_temp_reg_32 = _mm_mullo_epi32(m_temp_reg_1, m_coeff1);//29*c1
379
380 m_temp_reg_10 = _mm_slli_epi32(m_temp_reg_2, 6);
381 m_temp_reg_11 = _mm_slli_epi32(m_temp_reg_2, 3);
382 m_temp_reg_13 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_2);
383 m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_11);
384 //m_temp_reg_33 = _mm_mullo_epi32(m_temp_reg_2, m_coeff2);//55*c2
385
386 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
387
388 m_temp_reg_10 = _mm_slli_epi32(m_temp_reg_0, 6);
389 m_temp_reg_11 = _mm_slli_epi32(m_temp_reg_0, 3);
390 m_temp_reg_13 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_0);
391 m_temp_reg_34 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_11);
392 //m_temp_reg_34 = _mm_mullo_epi32(m_temp_reg_0, m_coeff2);//55*c0
393
394 m_temp_reg_10 = _mm_slli_epi32(m_temp_reg_2, 5);
395 m_temp_reg_11 = _mm_slli_epi32(m_temp_reg_2, 1);
396 m_temp_reg_13 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_2);
397 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_11);
398 //m_temp_reg_35 = _mm_mullo_epi32(m_temp_reg_2, m_coeff1);//29*c2
399
400 m_temp_reg_10 = _mm_slli_epi32(m_temp_reg_4, 6);
401 m_temp_reg_11 = _mm_slli_epi32(m_temp_reg_4, 3);
402 m_temp_reg_12 = _mm_slli_epi32(m_temp_reg_4, 1);
403 m_temp_reg_13 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_11);
404 m_temp_reg_36 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_13);
405 //m_temp_reg_36 = _mm_mullo_epi32(m_temp_reg_4, m_coeff3);//74*c4
406
407 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
408 m_count = _mm_cvtsi32_si128(i4_shift);
409
410 m_temp_reg_4 = _mm_add_epi32(m_rdng_factor, m_temp_reg_3);
411 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
412 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_4);
413
414 m_temp_reg_21 = _mm_sub_epi32(m_temp_reg_33, m_temp_reg_32);
415 m_temp_reg_21 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_4);
416
417 m_temp_reg_4 = _mm_sub_epi32(m_rdng_factor, m_temp_reg_3);
418 m_temp_reg_23 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_35);
419 m_temp_reg_23 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_4);
420
421 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_36, m_rdng_factor);
422
423 m_temp_reg_20 = _mm_sra_epi32(m_temp_reg_20, m_count);
424 m_temp_reg_21 = _mm_sra_epi32(m_temp_reg_21, m_count);
425 m_temp_reg_23 = _mm_sra_epi32(m_temp_reg_23, m_count);
426 m_temp_reg_22 = _mm_sra_epi32(m_temp_reg_22, m_count);
427
428 m_temp_reg_20 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21);
429 m_temp_reg_21 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23);
430 m_temp_reg_22 = _mm_srli_si128(m_temp_reg_20, 8);
431 m_temp_reg_23 = _mm_srli_si128(m_temp_reg_21, 8);
432
433 m_temp_reg_24 = _mm_unpacklo_epi16(m_temp_reg_20, m_temp_reg_22);
434 m_temp_reg_25 = _mm_unpacklo_epi16(m_temp_reg_21, m_temp_reg_23);
435
436 m_temp_reg_20 = _mm_unpacklo_epi32(m_temp_reg_24, m_temp_reg_25);
437 m_temp_reg_21 = _mm_unpackhi_epi32(m_temp_reg_24, m_temp_reg_25);
438 }
439
440 /* Recon and store */
441 {
442 WORD32 *pi4_dst = (WORD32 *)pu1_dst;
443
444 m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_pred);
445 pu1_pred += pred_strd;
446 m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pu1_pred);
447 pu1_pred += pred_strd;
448 m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pu1_pred);
449 pu1_pred += pred_strd;
450 m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pu1_pred);
451
452 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, m_zero);
453 m_temp_reg_1 = _mm_unpacklo_epi8(m_temp_reg_1, m_zero);
454 m_temp_reg_2 = _mm_unpacklo_epi8(m_temp_reg_2, m_zero);
455 m_temp_reg_3 = _mm_unpacklo_epi8(m_temp_reg_3, m_zero);
456
457 /*m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_0);
458 m_temp_reg_1 = _mm_cvtepu8_epi16(m_temp_reg_1);
459 m_temp_reg_2 = _mm_cvtepu8_epi16(m_temp_reg_2);
460 m_temp_reg_3 = _mm_cvtepu8_epi16(m_temp_reg_3);*/
461
462 m_temp_reg_0 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_1);
463 m_temp_reg_1 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_3);
464
465 m_temp_reg_20 = _mm_add_epi16(m_temp_reg_20, m_temp_reg_0);
466 m_temp_reg_21 = _mm_add_epi16(m_temp_reg_21, m_temp_reg_1);
467
468 m_temp_reg_0 = _mm_packus_epi16(m_temp_reg_20, m_temp_reg_21);
469
470 *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_0);
471 m_temp_reg_1 = _mm_srli_si128(m_temp_reg_0, 4);
472 m_temp_reg_2 = _mm_srli_si128(m_temp_reg_0, 8);
473 m_temp_reg_3 = _mm_srli_si128(m_temp_reg_0, 12);
474 pu1_dst += dst_strd;
475 pi4_dst = (WORD32 *)(pu1_dst);
476
477 *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_1);
478 pu1_dst += dst_strd;
479 pi4_dst = (WORD32 *)(pu1_dst);
480
481 *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_2);
482 pu1_dst += dst_strd;
483 pi4_dst = (WORD32 *)(pu1_dst);
484
485 *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_3);
486 }
487 }
488 }
489
490
491 /**
492 *******************************************************************************
493 *
494 * @brief
495 * This function performs inverse quantization, inverse transform
496 * (DCT) and reconstruction for 4x4 input block
497 *
498 * @par Description:
499 * Performs inverse quantization , inverse transform and adds
500 * prediction data and clips output to 8 bit
501 *
502 * @param[in] pi2_src
503 * Input 4x4 coefficients
504 *
505 * @param[in] pi2_tmp
506 * Temporary 4x4 buffer for storing inverse
507 * transform 1st stage output
508 *
509 * @param[in] pu1_pred
510 * Prediction 4x4 block
511 *
512 * @param[in] pi2_dequant_coeff
513 * Dequant Coeffs
514 *
515 * @param[out] pu1_dst
516 * Output 4x4 block
517 *
518 * @param[in] qp_div
519 * Quantization parameter / 6
520 *
521 * @param[in] qp_rem
522 * Quantization parameter % 6
523 *
524 * @param[in] src_strd
525 * Input stride
526 *
527 * @param[in] pred_strd
528 * Prediction stride
529 *
530 * @param[in] dst_strd
531 * Output Stride
532 *
533 * @param[in] zero_cols
534 * Zero columns in pi2_src
535 *
536 * @returns Void
537 *
538 * @remarks
539 * None
540 *
541 *******************************************************************************
542 */
543
ihevc_itrans_recon_4x4_ssse3(WORD16 * pi2_src,WORD16 * pi2_tmp,UWORD8 * pu1_pred,UWORD8 * pu1_dst,WORD32 src_strd,WORD32 pred_strd,WORD32 dst_strd,WORD32 zero_cols,WORD32 zero_rows)544 void ihevc_itrans_recon_4x4_ssse3(WORD16 *pi2_src,
545 WORD16 *pi2_tmp,
546 UWORD8 *pu1_pred,
547 UWORD8 *pu1_dst,
548 WORD32 src_strd,
549 WORD32 pred_strd,
550 WORD32 dst_strd,
551 WORD32 zero_cols,
552 WORD32 zero_rows)
553 {
554 __m128i m_temp_reg_0;
555 __m128i m_temp_reg_1;
556 __m128i m_temp_reg_2;
557 __m128i m_temp_reg_3;
558 __m128i m_temp_reg_4;
559 __m128i m_temp_reg_10;
560 __m128i m_temp_reg_11;
561 __m128i m_temp_reg_12;
562 __m128i m_temp_reg_13;
563 __m128i m_temp_reg_14;
564 __m128i m_temp_reg_15;
565 __m128i m_temp_reg_20;
566 __m128i m_temp_reg_21;
567 __m128i m_temp_reg_22;
568 __m128i m_temp_reg_23;
569 __m128i m_temp_reg_24;
570 __m128i m_temp_reg_25;
571 __m128i m_temp_reg_30;
572 __m128i m_temp_reg_31;
573 __m128i m_temp_reg_33;
574 __m128i m_temp_reg_34;
575 __m128i m_rdng_factor;
576 __m128i m_count;
577
578 __m128i m_ge_zero16b_flag_row0;
579 __m128i m_ge_zero16b_flag_row1;
580 __m128i m_ge_zero16b_flag_row2;
581 __m128i m_ge_zero16b_flag_row3;
582
583 __m128i m_zero = _mm_setzero_si128();
584
585 WORD32 i4_shift = IT_SHIFT_STAGE_1;
586 UNUSED(zero_rows);
587 UNUSED(zero_cols);
588 UNUSED(pi2_tmp);
589
590 m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pi2_src);
591 pi2_src += src_strd;
592 m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pi2_src);
593 pi2_src += src_strd;
594 m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pi2_src);
595 pi2_src += src_strd;
596 m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pi2_src);
597
598 m_ge_zero16b_flag_row0 = _mm_cmpgt_epi16(m_zero, m_temp_reg_0);
599 m_ge_zero16b_flag_row1 = _mm_cmpgt_epi16(m_zero, m_temp_reg_1);
600 m_ge_zero16b_flag_row2 = _mm_cmpgt_epi16(m_zero, m_temp_reg_2);
601 m_ge_zero16b_flag_row3 = _mm_cmpgt_epi16(m_zero, m_temp_reg_3);
602
603 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_0, m_ge_zero16b_flag_row0);
604 m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_1, m_ge_zero16b_flag_row1);
605 m_temp_reg_2 = _mm_unpacklo_epi16(m_temp_reg_2, m_ge_zero16b_flag_row2);
606 m_temp_reg_3 = _mm_unpacklo_epi16(m_temp_reg_3, m_ge_zero16b_flag_row3);
607
608 /*m_temp_reg_0 = _mm_cvtepi16_epi32(m_temp_reg_0);
609 m_temp_reg_2 = _mm_cvtepi16_epi32(m_temp_reg_2);
610
611 m_temp_reg_1 = _mm_cvtepi16_epi32(m_temp_reg_1);
612 m_temp_reg_3 = _mm_cvtepi16_epi32(m_temp_reg_3);*/
613
614 /* e */
615 {
616 m_temp_reg_10 = _mm_slli_epi32(m_temp_reg_0, 6);
617 m_temp_reg_11 = _mm_slli_epi32(m_temp_reg_2, 6);
618 }
619
620 /* o */
621 {
622 m_temp_reg_20 = _mm_slli_epi32(m_temp_reg_1, 5);
623 m_temp_reg_21 = _mm_slli_epi32(m_temp_reg_1, 2);
624 m_temp_reg_12 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
625 //m_temp_reg_12 = _mm_mullo_epi32(m_temp_reg_1, m_coeff1);//src[1]*36
626
627 m_temp_reg_20 = _mm_slli_epi32(m_temp_reg_3, 6);
628 m_temp_reg_21 = _mm_slli_epi32(m_temp_reg_3, 4);
629 m_temp_reg_22 = _mm_slli_epi32(m_temp_reg_3, 1);
630 m_temp_reg_23 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_3);
631 m_temp_reg_24 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_22);
632 m_temp_reg_13 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_24);
633 //m_temp_reg_13 = _mm_mullo_epi32(m_temp_reg_3, m_coeff3);//src[3]*83
634
635 m_temp_reg_20 = _mm_slli_epi32(m_temp_reg_1, 6);
636 m_temp_reg_21 = _mm_slli_epi32(m_temp_reg_1, 4);
637 m_temp_reg_22 = _mm_slli_epi32(m_temp_reg_1, 1);
638 m_temp_reg_23 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_1);
639 m_temp_reg_24 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_22);
640 m_temp_reg_14 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_24);
641 //m_temp_reg_14 = _mm_mullo_epi32(m_temp_reg_1, m_coeff3);//src[1]*83
642
643 m_temp_reg_20 = _mm_slli_epi32(m_temp_reg_3, 5);
644 m_temp_reg_21 = _mm_slli_epi32(m_temp_reg_3, 2);
645 m_temp_reg_15 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
646 //m_temp_reg_15 = _mm_mullo_epi32(m_temp_reg_3, m_coeff1);//src[3]*36
647 }
648
649 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
650
651 /* e1 stored in m_temp_reg_31 */
652 {
653 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_11);
654 }
655
656 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
657
658 /* e0 stored in m_temp_reg_30 */
659 {
660 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_11);
661 }
662
663 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
664 m_count = _mm_cvtsi32_si128(i4_shift);
665
666 /* o1 stored in m_temp_reg_33 */
667 {
668 m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_13);
669 }
670
671 /* e1 + add */
672 {
673 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
674 }
675
676 /* e0 + add */
677 {
678 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
679 }
680
681 /* o0 stored in m_temp_reg_34 */
682 {
683 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_15);
684 }
685
686 /* Stage 1 outputs */
687 {
688 m_temp_reg_21 = _mm_add_epi32(m_temp_reg_31, m_temp_reg_33);
689 m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_31, m_temp_reg_33);
690
691 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_34);
692 m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_34);
693
694
695 m_temp_reg_21 = _mm_sra_epi32(m_temp_reg_21, m_count);
696 m_temp_reg_20 = _mm_sra_epi32(m_temp_reg_20, m_count);
697 m_temp_reg_22 = _mm_sra_epi32(m_temp_reg_22, m_count);
698 m_temp_reg_23 = _mm_sra_epi32(m_temp_reg_23, m_count);
699
700 m_temp_reg_20 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21);
701 m_temp_reg_21 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23);
702 m_temp_reg_22 = _mm_srli_si128(m_temp_reg_20, 8);
703 m_temp_reg_23 = _mm_srli_si128(m_temp_reg_21, 8);
704
705 m_temp_reg_24 = _mm_unpacklo_epi16(m_temp_reg_20, m_temp_reg_22);
706 m_temp_reg_25 = _mm_unpacklo_epi16(m_temp_reg_21, m_temp_reg_23);
707
708 m_temp_reg_30 = _mm_unpacklo_epi32(m_temp_reg_24, m_temp_reg_25);
709 m_temp_reg_31 = _mm_unpackhi_epi32(m_temp_reg_24, m_temp_reg_25);
710 }
711
712 /* Stage 2 */
713 {
714 i4_shift = IT_SHIFT_STAGE_2;
715
716 /*m_temp_reg_22 = _mm_srli_si128(m_temp_reg_20, 8);
717 m_temp_reg_23 = _mm_srli_si128(m_temp_reg_21, 8);*/
718
719 m_ge_zero16b_flag_row0 = _mm_cmpgt_epi16(m_zero, m_temp_reg_30);
720 m_ge_zero16b_flag_row1 = _mm_cmpgt_epi16(m_zero, m_temp_reg_31);
721
722 m_temp_reg_20 = _mm_unpacklo_epi16(m_temp_reg_30, m_ge_zero16b_flag_row0);
723 m_temp_reg_21 = _mm_unpacklo_epi16(m_temp_reg_31, m_ge_zero16b_flag_row1);
724 m_temp_reg_22 = _mm_unpackhi_epi16(m_temp_reg_30, m_ge_zero16b_flag_row0);
725 m_temp_reg_23 = _mm_unpackhi_epi16(m_temp_reg_31, m_ge_zero16b_flag_row1);
726
727 /*m_temp_reg_20 = _mm_cvtepi16_epi32(m_temp_reg_20);
728 m_temp_reg_21 = _mm_cvtepi16_epi32(m_temp_reg_21);
729
730 m_temp_reg_22 = _mm_cvtepi16_epi32(m_temp_reg_22);
731 m_temp_reg_23 = _mm_cvtepi16_epi32(m_temp_reg_23);*/
732
733 /* e */
734 {
735 m_temp_reg_10 = _mm_slli_epi32(m_temp_reg_20, 6);
736 }
737
738 /* o */
739 /*{
740 m_temp_reg_12 = _mm_mullo_epi32(m_temp_reg_22, m_coeff1);//src[1]*36
741 m_temp_reg_14 = _mm_mullo_epi32(m_temp_reg_22, m_coeff3);//src[1]*83
742 m_temp_reg_13 = _mm_mullo_epi32(m_temp_reg_23, m_coeff3);//src[3]*83
743 m_temp_reg_15 = _mm_mullo_epi32(m_temp_reg_23, m_coeff1);//src[3]*36
744 }*/
745 {
746 m_temp_reg_0 = _mm_slli_epi32(m_temp_reg_22, 5);
747 m_temp_reg_1 = _mm_slli_epi32(m_temp_reg_22, 2);
748 m_temp_reg_12 = _mm_add_epi32(m_temp_reg_0, m_temp_reg_1);
749 //m_temp_reg_12 = _mm_mullo_epi32(m_temp_reg_1, m_coeff1);//src[1]*36
750
751 m_temp_reg_0 = _mm_slli_epi32(m_temp_reg_23, 6);
752 m_temp_reg_1 = _mm_slli_epi32(m_temp_reg_23, 4);
753 m_temp_reg_2 = _mm_slli_epi32(m_temp_reg_23, 1);
754 m_temp_reg_3 = _mm_add_epi32(m_temp_reg_0, m_temp_reg_23);
755 m_temp_reg_4 = _mm_add_epi32(m_temp_reg_1, m_temp_reg_2);
756 m_temp_reg_13 = _mm_add_epi32(m_temp_reg_3, m_temp_reg_4);
757 //m_temp_reg_13 = _mm_mullo_epi32(m_temp_reg_3, m_coeff3);//src[3]*83
758
759 m_temp_reg_0 = _mm_slli_epi32(m_temp_reg_22, 6);
760 m_temp_reg_1 = _mm_slli_epi32(m_temp_reg_22, 4);
761 m_temp_reg_2 = _mm_slli_epi32(m_temp_reg_22, 1);
762 m_temp_reg_3 = _mm_add_epi32(m_temp_reg_0, m_temp_reg_22);
763 m_temp_reg_4 = _mm_add_epi32(m_temp_reg_1, m_temp_reg_2);
764 m_temp_reg_14 = _mm_add_epi32(m_temp_reg_3, m_temp_reg_4);
765 //m_temp_reg_14 = _mm_mullo_epi32(m_temp_reg_1, m_coeff3);//src[1]*83
766
767 m_temp_reg_0 = _mm_slli_epi32(m_temp_reg_23, 5);
768 m_temp_reg_1 = _mm_slli_epi32(m_temp_reg_23, 2);
769 m_temp_reg_15 = _mm_add_epi32(m_temp_reg_0, m_temp_reg_1);
770 //m_temp_reg_15 = _mm_mullo_epi32(m_temp_reg_3, m_coeff1);//src[3]*36
771 }
772
773 /* e */
774 {
775 m_temp_reg_11 = _mm_slli_epi32(m_temp_reg_21, 6);
776 }
777
778 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
779
780 /* e1 stored in m_temp_reg_31 */
781 {
782 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_11);
783 }
784
785 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
786
787 /* e0 stored in m_temp_reg_30 */
788 {
789 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_11);
790 }
791
792 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
793 m_count = _mm_cvtsi32_si128(i4_shift);
794
795 /* o1 stored in m_temp_reg_33 */
796 {
797 m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_13);
798 }
799
800 /* e1 + add */
801 {
802 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
803 }
804
805 /* e0 + add */
806 {
807 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
808 }
809
810 /* o0 stored in m_temp_reg_34 */
811 {
812 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_15);
813 }
814
815 /* Stage 2 outputs */
816 {
817 m_temp_reg_21 = _mm_add_epi32(m_temp_reg_31, m_temp_reg_33);
818 m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_31, m_temp_reg_33);
819 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_34);
820 m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_34);
821
822 m_temp_reg_21 = _mm_sra_epi32(m_temp_reg_21, m_count);
823 m_temp_reg_22 = _mm_sra_epi32(m_temp_reg_22, m_count);
824 m_temp_reg_20 = _mm_sra_epi32(m_temp_reg_20, m_count);
825 m_temp_reg_23 = _mm_sra_epi32(m_temp_reg_23, m_count);
826
827 m_temp_reg_20 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21);
828 m_temp_reg_21 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23);
829 m_temp_reg_22 = _mm_srli_si128(m_temp_reg_20, 8);
830 m_temp_reg_23 = _mm_srli_si128(m_temp_reg_21, 8);
831
832 m_temp_reg_24 = _mm_unpacklo_epi16(m_temp_reg_20, m_temp_reg_22);
833 m_temp_reg_25 = _mm_unpacklo_epi16(m_temp_reg_21, m_temp_reg_23);
834
835 m_temp_reg_20 = _mm_unpacklo_epi32(m_temp_reg_24, m_temp_reg_25);
836 m_temp_reg_21 = _mm_unpackhi_epi32(m_temp_reg_24, m_temp_reg_25);
837 }
838
839 /* Recon and store */
840 {
841 UWORD32 *pu4_dst = (UWORD32 *)pu1_dst;
842
843 m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_pred);
844 pu1_pred += pred_strd;
845 m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pu1_pred);
846 pu1_pred += pred_strd;
847 m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pu1_pred);
848 pu1_pred += pred_strd;
849 m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pu1_pred);
850
851 //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_0);
852 //m_temp_reg_1 = _mm_cvtepu8_epi16(m_temp_reg_1);
853
854 //m_temp_reg_2 = _mm_cvtepu8_epi16(m_temp_reg_2);
855 //m_temp_reg_3 = _mm_cvtepu8_epi16(m_temp_reg_3);
856
857 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, m_zero);
858 m_temp_reg_1 = _mm_unpacklo_epi8(m_temp_reg_1, m_zero);
859 m_temp_reg_2 = _mm_unpacklo_epi8(m_temp_reg_2, m_zero);
860 m_temp_reg_3 = _mm_unpacklo_epi8(m_temp_reg_3, m_zero);
861
862 m_temp_reg_0 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_1);
863 m_temp_reg_1 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_3);
864
865 m_temp_reg_20 = _mm_add_epi16(m_temp_reg_20, m_temp_reg_0);
866 m_temp_reg_21 = _mm_add_epi16(m_temp_reg_21, m_temp_reg_1);
867
868 m_temp_reg_0 = _mm_packus_epi16(m_temp_reg_20, m_temp_reg_21);
869
870 *pu4_dst = _mm_cvtsi128_si32(m_temp_reg_0);
871 m_temp_reg_1 = _mm_srli_si128(m_temp_reg_0, 4);
872 m_temp_reg_2 = _mm_srli_si128(m_temp_reg_0, 8);
873 m_temp_reg_3 = _mm_srli_si128(m_temp_reg_0, 12);
874 pu1_dst += dst_strd;
875 pu4_dst = (UWORD32 *)(pu1_dst);
876
877 *pu4_dst = _mm_cvtsi128_si32(m_temp_reg_1);
878 pu1_dst += dst_strd;
879 pu4_dst = (UWORD32 *)(pu1_dst);
880
881 *pu4_dst = _mm_cvtsi128_si32(m_temp_reg_2);
882 pu1_dst += dst_strd;
883 pu4_dst = (UWORD32 *)(pu1_dst);
884
885 *pu4_dst = _mm_cvtsi128_si32(m_temp_reg_3);
886 }
887 }
888 }
889
890
891
892 /**
893 *******************************************************************************
894 *
895 * @brief
896 * This function performs inverse quantization, inverse transform and
897 * reconstruction for 8x8 input block
898 *
899 * @par Description:
900 * Performs inverse quantization , inverse transform and adds the
901 * prediction data and clips output to 8 bit
902 *
903 * @param[in] pi2_src
904 * Input 8x8 coefficients
905 *
906 * @param[in] pi2_tmp
907 * Temporary 8x8 buffer for storing inverse
908 * transform 1st stage output
909 *
910 * @param[in] pu1_pred
911 * Prediction 8x8 block
912 *
913 * @param[in] pi2_dequant_coeff
914 * Dequant Coeffs
915 *
916 * @param[out] pu1_dst
917 * Output 8x8 block
918 *
919 * @param[in] src_strd
920 * Input stride
921 *
922 * @param[in] qp_div
923 * Quantization parameter / 6
924 *
925 * @param[in] qp_rem
926 * Quantization parameter % 6
927 *
928 * @param[in] pred_strd
929 * Prediction stride
930 *
931 * @param[in] dst_strd
932 * Output Stride
933 *
934 * @param[in] zero_cols
935 * Zero columns in pi2_src
936 *
937 * @returns Void
938 *
939 * @remarks
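940 * The signature has no pi2_dequant_coeff, qp_div or qp_rem parameter; it takes zero_rows (zero rows in pi2_src), not listed above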
941 *
942 *******************************************************************************
943 */
944
945
ihevc_itrans_recon_8x8_ssse3(WORD16 * pi2_src,WORD16 * pi2_tmp,UWORD8 * pu1_pred,UWORD8 * pu1_dst,WORD32 src_strd,WORD32 pred_strd,WORD32 dst_strd,WORD32 zero_cols,WORD32 zero_rows)946 void ihevc_itrans_recon_8x8_ssse3(WORD16 *pi2_src,
947 WORD16 *pi2_tmp,
948 UWORD8 *pu1_pred,
949 UWORD8 *pu1_dst,
950 WORD32 src_strd,
951 WORD32 pred_strd,
952 WORD32 dst_strd,
953 WORD32 zero_cols,
954 WORD32 zero_rows)
955 {
956 __m128i m_temp_reg_0;
957 __m128i m_temp_reg_1;
958 __m128i m_temp_reg_2;
959 __m128i m_temp_reg_3;
960 __m128i m_temp_reg_5;
961 __m128i m_temp_reg_6;
962 __m128i m_temp_reg_7;
963 __m128i m_temp_reg_4;
964 __m128i m_temp_reg_10;
965 __m128i m_temp_reg_11;
966 __m128i m_temp_reg_12;
967 __m128i m_temp_reg_13;
968 __m128i m_temp_reg_14;
969 __m128i m_temp_reg_15;
970 __m128i m_temp_reg_16;
971 __m128i m_temp_reg_17;
972 __m128i m_temp_reg_20;
973 __m128i m_temp_reg_21;
974 __m128i m_temp_reg_22;
975 __m128i m_temp_reg_23;
976 __m128i m_temp_reg_24;
977 __m128i m_temp_reg_25;
978 __m128i m_temp_reg_26;
979 __m128i m_temp_reg_27;
980 __m128i m_temp_reg_30;
981 __m128i m_temp_reg_31;
982 __m128i m_temp_reg_32;
983 __m128i m_temp_reg_33;
984 __m128i m_temp_reg_34;
985 __m128i m_temp_reg_35;
986 __m128i m_temp_reg_36;
987 __m128i m_temp_reg_37;
988 __m128i m_temp_reg_40;
989 __m128i m_temp_reg_41;
990 __m128i m_temp_reg_42;
991 __m128i m_temp_reg_43;
992 __m128i m_temp_reg_44;
993 __m128i m_temp_reg_45;
994 __m128i m_temp_reg_46;
995 __m128i m_temp_reg_47;
996 __m128i m_temp_reg_50;
997 __m128i m_temp_reg_51;
998 __m128i m_temp_reg_52;
999 __m128i m_temp_reg_53;
1000 __m128i m_temp_reg_54;
1001 __m128i m_temp_reg_55;
1002 __m128i m_temp_reg_56;
1003 __m128i m_temp_reg_57;
1004 __m128i m_temp_reg_60;
1005 __m128i m_temp_reg_61;
1006 __m128i m_temp_reg_62;
1007 __m128i m_temp_reg_63;
1008 __m128i m_temp_reg_64;
1009 __m128i m_temp_reg_65;
1010 __m128i m_temp_reg_66;
1011 __m128i m_temp_reg_67;
1012 __m128i m_temp_reg_70;
1013 __m128i m_temp_reg_71;
1014 __m128i m_temp_reg_72;
1015 __m128i m_temp_reg_73;
1016 __m128i m_temp_reg_74;
1017 __m128i m_temp_reg_75;
1018 __m128i m_temp_reg_76;
1019 __m128i m_temp_reg_77;
1020 __m128i m_coeff1, m_coeff2, m_coeff3, m_coeff4;
1021
1022 WORD32 check_row_stage_1; /* Lokesh */
1023 WORD32 check_row_stage_2; /* Lokesh */
1024
1025 __m128i m_rdng_factor;
1026 //__m128i m_count;
1027 WORD32 i4_shift = IT_SHIFT_STAGE_1;
1028 UNUSED(zero_rows);
1029 UNUSED(zero_cols);
1030 UNUSED(pi2_tmp);
1031
1032 check_row_stage_1 = ((zero_rows & 0xF0) != 0xF0) ? 1 : 0;
1033 check_row_stage_2 = ((zero_cols & 0xF0) != 0xF0) ? 1 : 0;
1034
1035 m_temp_reg_70 = _mm_load_si128((__m128i *)pi2_src);
1036 pi2_src += src_strd;
1037 m_temp_reg_71 = _mm_load_si128((__m128i *)pi2_src);
1038 pi2_src += src_strd;
1039 m_temp_reg_72 = _mm_load_si128((__m128i *)pi2_src);
1040 pi2_src += src_strd;
1041 m_temp_reg_73 = _mm_load_si128((__m128i *)pi2_src);
1042 pi2_src += src_strd;
1043
1044 m_temp_reg_74 = _mm_load_si128((__m128i *)pi2_src);
1045 pi2_src += src_strd;
1046 m_temp_reg_75 = _mm_load_si128((__m128i *)pi2_src);
1047 pi2_src += src_strd;
1048 m_temp_reg_76 = _mm_load_si128((__m128i *)pi2_src);
1049 pi2_src += src_strd;
1050 m_temp_reg_77 = _mm_load_si128((__m128i *)pi2_src);
1051
1052 if(!check_row_stage_2)
1053 {
1054 if(!check_row_stage_1)
1055 {
1056 /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
1057 /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
1058 {
1059 //Interleaving 0,4 row in 0 , 1 Rishab
1060 /*coef2 for m_temp_reg_12 and m_temp_reg_13 , coef1 for m_temp_reg_10 and m_temp_reg_11*/
1061 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[3][0]);
1062 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[0][0]);
1063
1064 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74);
1065
1066 m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
1067 m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
1068
1069 }
1070
1071
1072 /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
1073 /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
1074 /* as upper 8 bytes are zeros so m_temp_reg_15 and m_temp_reg_17 are not used*/
1075 {
1076
1077 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83
1078 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36
1079
1080 /* Combining instructions to eliminate them based on zero_rows : Lokesh */
1081 //Interleaving 2,6 row in 4, 5 Rishab
1082 m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
1083
1084 m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1);
1085 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
1086
1087
1088 /* Loading coeff for computing o0, o1, o2 and o3 in the next block */
1089
1090 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[2][0]);
1091 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[3][0]);
1092
1093 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[0][0]);
1094 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[1][0]);
1095
1096
1097
1098 /* e */
1099
1100 /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
1101 /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
1102 /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
1103 /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
1104 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
1105 m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
1106
1107 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
1108 m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
1109
1110 }
1111
1112 /* o */
1113 {
1114
1115 /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
1116 {
1117
1118 m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
1119 //o0:1B*89+3B*75,5B*50+7B*18
1120 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
1121
1122 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
1123 m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
1124
1125
1126
1127 /* Column 0 of destination computed here */
1128 /* It is stored in m_temp_reg_50 */
1129 /* Column 7 of destination computed here */
1130 /* It is stored in m_temp_reg_57 */
1131 /* Upper 8 bytes of both registers are zero due to zero_cols*/
1132
1133
1134
1135 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
1136 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
1137
1138 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
1139 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
1140
1141 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
1142 m_temp_reg_63 = _mm_setzero_si128();
1143 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
1144
1145 //o1:1B*75-3B*18,5B*89+7B*50
1146 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
1147
1148 m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
1149 m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
1150
1151 /* Loading coeff for computing o2 in the next block */
1152
1153 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[4][0]);
1154 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[5][0]);
1155
1156 /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */
1157
1158
1159
1160 /* Column 1 of destination computed here */
1161 /* It is stored in m_temp_reg_51 */
1162 /* Column 6 of destination computed here */
1163 /* It is stored in m_temp_reg_56 */
1164
1165 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
1166 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
1167
1168 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
1169 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
1170
1171 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
1172 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
1173
1174 //o2:1B*50-3B*89,5B*18+7B*75
1175 m_temp_reg_34 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
1176
1177 m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
1178 m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
1179
1180
1181 /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
1182
1183 /* Loading coeff for computing o3 in the next block */
1184
1185 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[6][0]);
1186 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[7][0]);
1187
1188
1189
1190 /* Column 2 of destination computed here */
1191 /* It is stored in m_temp_reg_52 */
1192 /* Column 5 of destination computed here */
1193 /* It is stored in m_temp_reg_55 */
1194
1195 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
1196 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
1197
1198 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
1199 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
1200
1201 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
1202 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
1203
1204 //o3:1B*18-3B*50,5B*75-7B*89
1205 m_temp_reg_36 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
1206
1207 m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
1208 m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
1209
1210
1211
1212 /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
1213
1214
1215
1216 /* Column 3 of destination computed here */
1217 /* It is stored in m_temp_reg_53 */
1218 /* Column 4 of destination computed here */
1219 /* It is stored in m_temp_reg_54 */
1220
1221 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
1222 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
1223
1224 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
1225 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
1226
1227 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
1228 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
1229
1230
1231 m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
1232 m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
1233 }
1234 }
1235
1236 /* Transpose of the destination 8x8 matrix done here */
1237 /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */
1238 /* respectively */
1239 {
1240 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
1241 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
1242 //m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51);
1243 //m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53);
1244 m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
1245 m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
1246 //m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15);
1247 //m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15);
1248
1249 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
1250 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
1251 //m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55);
1252 //m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57);
1253 m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
1254 m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
1255 //m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17);
1256 //m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17);
1257
1258 m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
1259 m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
1260 m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
1261 m_temp_reg_53 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
1262
1263 /*m_temp_reg_54 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6);
1264 m_temp_reg_55 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6);
1265 m_temp_reg_56 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7);
1266 m_temp_reg_57 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7);
1267 */
1268 m_temp_reg_54 = _mm_setzero_si128();
1269 m_temp_reg_55 = _mm_setzero_si128();
1270 m_temp_reg_56 = _mm_setzero_si128();
1271 m_temp_reg_57 = _mm_setzero_si128();
1272 }
1273 }
1274 else
1275 {
1276 /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
1277 /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
1278 {
1279 //Interleaving 0,4 row in 0 , 1 Rishab
1280 /*coef2 for m_temp_reg_12 and m_temp_reg_13 , coef1 for m_temp_reg_10 and m_temp_reg_11*/
1281 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[3][0]);
1282 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[0][0]);
1283
1284 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74);
1285
1286 m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
1287 m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
1288
1289 }
1290
1291
1292 /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
1293 /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
1294 /* as upper 8 bytes are zeros so m_temp_reg_15 and m_temp_reg_17 are not used*/
1295 {
1296
1297 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83
1298 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36
1299
1300 /* Combining instructions to eliminate them based on zero_rows : Lokesh */
1301 //Interleaving 2,6 row in 4, 5 Rishab
1302 m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
1303
1304 m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1);
1305 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
1306
1307
1308 /* Loading coeff for computing o0, o1, o2 and o3 in the next block */
1309
1310 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[2][0]);
1311 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[3][0]);
1312
1313 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[0][0]);
1314 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[1][0]);
1315
1316
1317
1318 /* e */
1319
1320 /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
1321 /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
1322 /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
1323 /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
1324 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
1325 m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
1326
1327 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
1328 m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
1329
1330 }
1331
1332 /* o */
1333 {
1334
1335 /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
1336 {
1337
1338 m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
1339 m_temp_reg_64 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77);
1340 //o0:1B*89+3B*75,5B*50+7B*18
1341 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
1342 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2);
1343
1344 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
1345 m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
1346
1347 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
1348
1349
1350
1351 /* Column 0 of destination computed here */
1352 /* It is stored in m_temp_reg_50 */
1353 /* Column 7 of destination computed here */
1354 /* It is stored in m_temp_reg_57 */
1355 /* Upper 8 bytes of both registers are zero due to zero_cols*/
1356
1357
1358
1359 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
1360 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
1361
1362 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
1363 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
1364
1365 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
1366 m_temp_reg_63 = _mm_setzero_si128();
1367 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
1368
1369 //o1:1B*75-3B*18,5B*89+7B*50
1370 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
1371 m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4);
1372
1373 m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
1374 m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
1375
1376 /* Loading coeff for computing o2 in the next block */
1377
1378 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[4][0]);
1379 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[5][0]);
1380
1381 /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */
1382 m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_26);
1383
1384
1385
1386 /* Column 1 of destination computed here */
1387 /* It is stored in m_temp_reg_51 */
1388 /* Column 6 of destination computed here */
1389 /* It is stored in m_temp_reg_56 */
1390
1391 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
1392 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
1393
1394 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
1395 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
1396
1397 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
1398 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
1399
1400 //o2:1B*50-3B*89,5B*18+7B*75
1401 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
1402 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2);
1403
1404 m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
1405 m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
1406
1407
1408 /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
1409
1410 /* Loading coeff for computing o3 in the next block */
1411
1412 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[6][0]);
1413 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[7][0]);
1414
1415 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
1416
1417
1418 /* Column 2 of destination computed here */
1419 /* It is stored in m_temp_reg_52 */
1420 /* Column 5 of destination computed here */
1421 /* It is stored in m_temp_reg_55 */
1422
1423 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
1424 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
1425
1426 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
1427 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
1428
1429 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
1430 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
1431
1432 //o3:1B*18-3B*50,5B*75-7B*89
1433 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
1434 m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4);
1435
1436 m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
1437 m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
1438
1439
1440
1441 /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
1442
1443 m_temp_reg_36 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_26);
1444
1445
1446 /* Column 3 of destination computed here */
1447 /* It is stored in m_temp_reg_53 */
1448 /* Column 4 of destination computed here */
1449 /* It is stored in m_temp_reg_54 */
1450
1451 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
1452 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
1453
1454 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
1455 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
1456
1457 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
1458 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
1459
1460
1461 m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
1462 m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
1463 }
1464 }
1465
1466 /* Transpose of the destination 8x8 matrix done here */
1467 /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */
1468 /* respectively */
1469 {
1470 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
1471 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
1472 //m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51);
1473 //m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53);
1474 m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
1475 m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
1476 //m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15);
1477 //m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15);
1478
1479 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
1480 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
1481 //m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55);
1482 //m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57);
1483 m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
1484 m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
1485 //m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17);
1486 //m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17);
1487
1488 m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
1489 m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
1490 m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
1491 m_temp_reg_53 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
1492
1493 /*m_temp_reg_54 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6);
1494 m_temp_reg_55 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6);
1495 m_temp_reg_56 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7);
1496 m_temp_reg_57 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7);
1497 */
1498 m_temp_reg_54 = _mm_setzero_si128();
1499 m_temp_reg_55 = _mm_setzero_si128();
1500 m_temp_reg_56 = _mm_setzero_si128();
1501 m_temp_reg_57 = _mm_setzero_si128();
1502 }
1503 }
1504
1505 /* Stage 2 */
1506 i4_shift = IT_SHIFT_STAGE_2;
1507 {
1508 /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
1509 /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
1510 {
1511 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[0][0]); //add
1512 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[3][0]); //sub
1513
1514 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_54);
1515 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_54);
1516
1517 m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
1518 m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
1519 m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
1520 m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
1521
1522
1523 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[1][0]);
1524 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[2][0]);
1525 }
1526
1527
1528 /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
1529 /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
1530 {
1531
1532 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_56);
1533 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_56);
1534
1535
1536 m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
1537 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
1538 m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
1539 m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
1540
1541 /* Loading coeff for computing o0 in the next block */
1542 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[0][0]);
1543
1544
1545 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_51, m_temp_reg_53);
1546 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_51, m_temp_reg_53);
1547
1548
1549
1550 /* e */
1551
1552 /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
1553 /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
1554 /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
1555 /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
1556 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
1557 m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
1558
1559 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
1560 m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
1561
1562 m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17);
1563 m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17);
1564
1565 m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15);
1566 m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15);
1567
1568 }
1569
1570 /* o */
1571 {
1572
1573 //m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_55,m_temp_reg_57);
1574 //m_temp_reg_5 = _mm_unpackhi_epi16(m_temp_reg_55,m_temp_reg_57);
1575
1576 /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
1577 {
1578 //o0:1B*89+3B*75,1T*89+3T*75
1579 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
1580 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
1581
1582 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
1583 m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
1584 /* Loading coeff for computing o1 in the next block */
1585 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[2][0]);
1586
1587
1588
1589 /* Column 0 of destination computed here */
1590 /* It is stored in m_temp_reg_50 */
1591 /* Column 7 of destination computed here */
1592 /* It is stored in m_temp_reg_57 */
1593
1594 m_temp_reg_2 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
1595 m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
1596
1597 m_temp_reg_3 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
1598 m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
1599
1600 m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
1601 m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
1602 m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
1603 m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
1604
1605 //o1:1B*75-3B*18,1T*75-3T*18
1606 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_0, m_coeff3);
1607 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
1608
1609 m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
1610 m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
1611 m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
1612 m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
1613
1614 m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
1615 m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
1616
1617
1618 /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */
1619
1620
1621 /* Loading coeff for computing o2 in the next block */
1622 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[4][0]);
1623
1624
1625
1626 /* Column 1 of destination computed here */
1627 /* It is stored in m_temp_reg_51 */
1628 /* Column 6 of destination computed here */
1629 /* It is stored in m_temp_reg_56 */
1630
1631 m_temp_reg_2 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
1632 m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
1633
1634 m_temp_reg_3 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33);
1635 m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_33);
1636
1637 m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
1638 m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
1639 m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
1640 m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
1641
1642 //o2:1B*50-3B*89,5T*18+7T*75.
1643 m_temp_reg_34 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
1644 m_temp_reg_35 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
1645
1646 m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
1647 m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
1648 m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
1649 m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
1650
1651 m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
1652 m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
1653
1654
1655 /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
1656
1657 /* Loading coeff for computing o3 in the next block */
1658
1659 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[6][0]);
1660
1661
1662 /* Column 2 of destination computed here */
1663 /* It is stored in m_temp_reg_52 */
1664 /* Column 5 of destination computed here */
1665 /* It is stored in m_temp_reg_55 */
1666
1667 m_temp_reg_2 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
1668 m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
1669
1670 m_temp_reg_3 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35);
1671 m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_35);
1672
1673 m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
1674 m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
1675 m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
1676 m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
1677
1678 //o3:1B*18-3B*50,1T*18-3T*50
1679 m_temp_reg_36 = _mm_madd_epi16(m_temp_reg_0, m_coeff3);
1680 m_temp_reg_37 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
1681
1682 m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
1683 m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
1684 m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
1685 m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
1686
1687
1688 m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
1689 m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
1690
1691
1692
1693 /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
1694
1695
1696 /* Column 3 of destination computed here */
1697 /* It is stored in m_temp_reg_53 */
1698 /* Column 4 of destination computed here */
1699 /* It is stored in m_temp_reg_54 */
1700
1701 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
1702 m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
1703
1704 m_temp_reg_21 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37);
1705 m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37);
1706
1707 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_rdng_factor);
1708 m_temp_reg_21 = _mm_add_epi32(m_temp_reg_21, m_rdng_factor);
1709 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_rdng_factor);
1710 m_temp_reg_23 = _mm_add_epi32(m_temp_reg_23, m_rdng_factor);
1711
1712 m_temp_reg_20 = _mm_srai_epi32(m_temp_reg_20, i4_shift);
1713 m_temp_reg_21 = _mm_srai_epi32(m_temp_reg_21, i4_shift);
1714 m_temp_reg_22 = _mm_srai_epi32(m_temp_reg_22, i4_shift);
1715 m_temp_reg_23 = _mm_srai_epi32(m_temp_reg_23, i4_shift);
1716
1717 m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21);
1718 m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23);
1719 }
1720 }
1721
1722 /* Transpose of the destination 8x8 matrix done here */
1723 /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */
1724 /* respectively */
1725 {
1726 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
1727 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
1728 m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51);
1729 m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53);
1730 m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
1731 m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
1732 m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15);
1733 m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15);
1734
1735 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
1736 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
1737 m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55);
1738 m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57);
1739 m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
1740 m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
1741 m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17);
1742 m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17);
1743 m_temp_reg_10 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
1744 m_temp_reg_11 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
1745 m_temp_reg_12 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
1746 m_temp_reg_13 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
1747
1748 m_temp_reg_14 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6);
1749 m_temp_reg_15 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6);
1750 m_temp_reg_16 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7);
1751 m_temp_reg_17 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7);
1752 }
1753
1754 /* Recon and store */
1755 {
1756 m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_pred);
1757 pu1_pred += pred_strd;
1758 m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pu1_pred);
1759 pu1_pred += pred_strd;
1760 m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pu1_pred);
1761 pu1_pred += pred_strd;
1762 m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pu1_pred);
1763 pu1_pred += pred_strd;
1764 m_temp_reg_4 = _mm_loadl_epi64((__m128i *)pu1_pred);
1765 pu1_pred += pred_strd;
1766 m_temp_reg_5 = _mm_loadl_epi64((__m128i *)pu1_pred);
1767 pu1_pred += pred_strd;
1768 m_temp_reg_6 = _mm_loadl_epi64((__m128i *)pu1_pred);
1769 pu1_pred += pred_strd;
1770 m_temp_reg_7 = _mm_loadl_epi64((__m128i *)pu1_pred);
1771
1772 m_temp_reg_50 = _mm_setzero_si128();
1773 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, m_temp_reg_50);
1774 m_temp_reg_1 = _mm_unpacklo_epi8(m_temp_reg_1, m_temp_reg_50);
1775 m_temp_reg_2 = _mm_unpacklo_epi8(m_temp_reg_2, m_temp_reg_50);
1776 m_temp_reg_3 = _mm_unpacklo_epi8(m_temp_reg_3, m_temp_reg_50);
1777 m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_4, m_temp_reg_50);
1778 m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_5, m_temp_reg_50);
1779 m_temp_reg_6 = _mm_unpacklo_epi8(m_temp_reg_6, m_temp_reg_50);
1780 m_temp_reg_7 = _mm_unpacklo_epi8(m_temp_reg_7, m_temp_reg_50);
1781
1782 m_temp_reg_50 = _mm_add_epi16(m_temp_reg_10, m_temp_reg_0);
1783 m_temp_reg_51 = _mm_add_epi16(m_temp_reg_11, m_temp_reg_1);
1784 m_temp_reg_52 = _mm_add_epi16(m_temp_reg_12, m_temp_reg_2);
1785 m_temp_reg_53 = _mm_add_epi16(m_temp_reg_13, m_temp_reg_3);
1786 m_temp_reg_54 = _mm_add_epi16(m_temp_reg_14, m_temp_reg_4);
1787 m_temp_reg_55 = _mm_add_epi16(m_temp_reg_15, m_temp_reg_5);
1788 m_temp_reg_56 = _mm_add_epi16(m_temp_reg_16, m_temp_reg_6);
1789 m_temp_reg_57 = _mm_add_epi16(m_temp_reg_17, m_temp_reg_7);
1790
1791 m_temp_reg_50 = _mm_packus_epi16(m_temp_reg_50, m_temp_reg_50);
1792 m_temp_reg_51 = _mm_packus_epi16(m_temp_reg_51, m_temp_reg_51);
1793 m_temp_reg_52 = _mm_packus_epi16(m_temp_reg_52, m_temp_reg_52);
1794 m_temp_reg_53 = _mm_packus_epi16(m_temp_reg_53, m_temp_reg_53);
1795 m_temp_reg_54 = _mm_packus_epi16(m_temp_reg_54, m_temp_reg_54);
1796 m_temp_reg_55 = _mm_packus_epi16(m_temp_reg_55, m_temp_reg_55);
1797 m_temp_reg_56 = _mm_packus_epi16(m_temp_reg_56, m_temp_reg_56);
1798 m_temp_reg_57 = _mm_packus_epi16(m_temp_reg_57, m_temp_reg_57);
1799
1800 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_50);
1801 pu1_dst += dst_strd;
1802 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_51);
1803 pu1_dst += dst_strd;
1804 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_52);
1805 pu1_dst += dst_strd;
1806 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_53);
1807 pu1_dst += dst_strd;
1808 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_54);
1809 pu1_dst += dst_strd;
1810 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_55);
1811 pu1_dst += dst_strd;
1812 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_56);
1813 pu1_dst += dst_strd;
1814 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_57);
1815 pu1_dst += dst_strd;
1816 }
1817 }
1818 }
1819 else
1820
1821 {
1822
1823 /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
1824 /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
1825 if(!check_row_stage_1)
1826 {
1827 /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
1828 /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
1829 {
1830 //Interleaving 0,4 row in 0 , 1 Rishab
1831 /*coef2 for m_temp_reg_12 and m_temp_reg_13 , coef1 for m_temp_reg_10 and m_temp_reg_11*/
1832 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[3][0]);
1833 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[0][0]);
1834
1835 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74);
1836 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_74);
1837
1838 m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
1839 m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
1840
1841
1842 m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
1843 m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
1844 }
1845
1846
1847 /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
1848 /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
1849 {
1850
1851 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83
1852 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36
1853
1854 /* Combining instructions to eliminate them based on zero_rows : Lokesh */
1855 //Interleaving 2,6 row in 4, 5 Rishab
1856 m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
1857 m_temp_reg_5 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76);
1858
1859 m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1);
1860 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
1861
1862 m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_5, m_coeff1);
1863 m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_5, m_coeff2);
1864
1865
1866
1867 /* Loading coeff for computing o0 and o1 in the next block (o2 and o3 coeffs are loaded later) */
1868
1869 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[2][0]);
1870 //m_coeff4 = _mm_loadu_si128((__m128i *) &g_ai2_ihevc_trans_intr_odd_8[3][0]);
1871
1872 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[0][0]);
1873 //m_coeff2 = _mm_loadu_si128((__m128i *) &g_ai2_ihevc_trans_intr_odd_8[1][0]);
1874
1875 }
1876
1877 /* e */
1878 {
1879 /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
1880 /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
1881 /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
1882 /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
1883 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
1884 m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
1885
1886 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
1887 m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
1888
1889 m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17);
1890 m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17);
1891
1892 m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15);
1893 m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15);
1894
1895 }
1896
1897 /* o */
1898 {
1899
1900 /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
1901 {
1902
1903 m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
1904 m_temp_reg_61 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73);
1905 // m_temp_reg_64 = _mm_unpacklo_epi16(m_temp_reg_75,m_temp_reg_77);
1906 // m_temp_reg_65 = _mm_unpackhi_epi16(m_temp_reg_75,m_temp_reg_77);
1907 //o0:1B*89+3B*75,1T*89+3T*75
1908 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
1909 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_61, m_coeff1);
1910 //m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2);
1911 //m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_65, m_coeff2);
1912
1913
1914 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
1915 m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
1916
1917 //m_temp_reg_30 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
1918 //m_temp_reg_31 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25);
1919 }
1920
1921 /* Column 0 of destination computed here */
1922 /* It is stored in m_temp_reg_50 */
1923 /* Column 7 of destination computed here */
1924 /* It is stored in m_temp_reg_57 */
1925 {
1926
1927
1928 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
1929 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
1930
1931 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
1932 m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
1933
1934 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
1935 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
1936 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
1937 m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
1938
1939 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
1940 m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
1941 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
1942 m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
1943
1944 //o1:1B*75-3B*18,1T*75-3T*18
1945 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
1946 //m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4);
1947 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_61, m_coeff3);
1948 //m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_65, m_coeff4);
1949
1950 m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
1951 m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
1952
1953 /* Loading coeff for computing o2 in the next block */
1954
1955 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[4][0]);
1956
1957 /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */
1958 //m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_26);
1959 //m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_27);
1960 }
1961
1962 /* Column 1 of destination computed here */
1963 /* It is stored in m_temp_reg_51 */
1964 /* Column 6 of destination computed here */
1965 /* It is stored in m_temp_reg_56 */
1966 {
1967 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
1968 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
1969
1970 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33);
1971 m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_33);
1972
1973 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
1974 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
1975 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
1976 m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
1977
1978 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
1979 m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
1980 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
1981 m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
1982
1983 //o2:1B*50-3B*89,1T*50-3T*89
1984 m_temp_reg_34 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
1985 m_temp_reg_35 = _mm_madd_epi16(m_temp_reg_61, m_coeff1);
1986
1987 m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
1988 m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
1989
1990
1991 /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
1992
1993
1994 /* Loading coeff for computing o3 in the next block */
1995
1996 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[6][0]);
1997
1998 //m_temp_reg_34 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
1999 //m_temp_reg_35 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25);
2000 }
2001
2002 /* Column 2 of destination computed here */
2003 /* It is stored in m_temp_reg_52 */
2004 /* Column 5 of destination computed here */
2005 /* It is stored in m_temp_reg_55 */
2006 {
2007 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
2008 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
2009
2010 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35);
2011 m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_35);
2012
2013 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
2014 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
2015 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
2016 m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
2017
2018 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
2019 m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
2020 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
2021 m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
2022
2023 //o3:1B*18-3B*50,1T*18-3T*50
2024 m_temp_reg_36 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
2025 m_temp_reg_37 = _mm_madd_epi16(m_temp_reg_61, m_coeff3);
2026
2027 m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
2028 m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
2029
2030
2031
2032 /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
2033
2034
2035 }
2036
2037 /* Column 3 of destination computed here */
2038 /* It is stored in m_temp_reg_53 */
2039 /* Column 4 of destination computed here */
2040 /* It is stored in m_temp_reg_54 */
2041 {
2042 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
2043 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
2044
2045 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37);
2046 m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37);
2047
2048 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
2049 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
2050 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
2051 m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
2052
2053 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
2054 m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
2055 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
2056 m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
2057
2058 m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
2059 m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
2060 }
2061 }
2062
2063 /* Transpose of the destination 8x8 matrix done here */
2064 /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */
2065 /* respectively */
2066 {
2067
2068
2069 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
2070 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
2071 m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51);
2072 m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53);
2073 m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
2074 m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
2075 m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15);
2076 m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15);
2077
2078 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
2079 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
2080 m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55);
2081 m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57);
2082 m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
2083 m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
2084 m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17);
2085 m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17);
2086
2087 m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
2088 m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
2089 m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
2090 m_temp_reg_53 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
2091
2092 m_temp_reg_54 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6);
2093 m_temp_reg_55 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6);
2094 m_temp_reg_56 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7);
2095 m_temp_reg_57 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7);
2096 }
2097 }
2098 else
2099 {
2100
2101 /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
2102 /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
2103 {
2104 //Interleaving 0,4 row in 0 , 1 Rishab
2105 /*coef2 for m_temp_reg_12 and m_temp_reg_13 , coef1 for m_temp_reg_10 and m_temp_reg_11*/
2106 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[3][0]);
2107 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[0][0]);
2108
2109 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74);
2110 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_74);
2111
2112 m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
2113 m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
2114
2115
2116 m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
2117 m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
2118 }
2119
2120
2121 /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
2122 /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
2123 {
2124
2125 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83
2126 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36
2127
2128 /* Combining instructions to eliminate them based on zero_rows : Lokesh */
2129 //Interleaving 2,6 row in 4, 5 Rishab
2130 m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
2131 m_temp_reg_5 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76);
2132
2133 m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1);
2134 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
2135
2136 m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_5, m_coeff1);
2137 m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_5, m_coeff2);
2138
2139
2140
2141 /* Loading coeff for computing o0 and o1 in the next block (o2 and o3 coeffs are loaded later) */
2142
2143 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[2][0]);
2144 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[3][0]);
2145
2146 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[0][0]);
2147 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[1][0]);
2148
2149 }
2150
2151 /* e */
2152 {
2153 /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
2154 /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
2155 /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
2156 /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
2157 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
2158 m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
2159
2160 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
2161 m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
2162
2163 m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17);
2164 m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17);
2165
2166 m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15);
2167 m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15);
2168
2169 }
2170
2171 /* o */
2172 {
2173
2174 /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
2175 {
2176
2177 m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
2178 m_temp_reg_61 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73);
2179 m_temp_reg_64 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77);
2180 m_temp_reg_65 = _mm_unpackhi_epi16(m_temp_reg_75, m_temp_reg_77);
2181 //o0:1B*89+3B*75,1T*89+3T*75,5B*50+7B*18,5T*50+7T*18
2182 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
2183 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_61, m_coeff1);
2184 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2);
2185 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_65, m_coeff2);
2186
2187
2188 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2189 m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
2190
2191 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
2192 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25);
2193 }
2194
2195 /* Column 0 of destination computed here */
2196 /* It is stored in m_temp_reg_50 */
2197 /* Column 7 of destination computed here */
2198 /* It is stored in m_temp_reg_57 */
2199 {
2200
2201
2202 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
2203 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
2204
2205 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
2206 m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
2207
2208 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
2209 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
2210 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
2211 m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
2212
2213 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
2214 m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
2215 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
2216 m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
2217
2218 //o1:1B*75-3B*18,1T*75-3T*18,5B*89+7B*50,5T*89+7T*50
2219 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
2220 m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4);
2221 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_61, m_coeff3);
2222 m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_65, m_coeff4);
2223
2224 m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
2225 m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
2226
2227 /* Loading coeff for computing o2 in the next block */
2228
2229 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[4][0]);
2230 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[5][0]);
2231
2232 /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */
2233 m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_26);
2234 m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_27);
2235 }
2236
2237 /* Column 1 of destination computed here */
2238 /* It is stored in m_temp_reg_51 */
2239 /* Column 6 of destination computed here */
2240 /* It is stored in m_temp_reg_56 */
2241 {
2242 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
2243 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
2244
2245 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33);
2246 m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_33);
2247
2248 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
2249 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
2250 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
2251 m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
2252
2253 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
2254 m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
2255 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
2256 m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
2257
2258 //o2:1B*50-3B*89,1T*50-3T*89,5B*18+7B*75,5T*18+7T*75
2259 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
2260 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2);
2261 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_61, m_coeff1);
2262 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_65, m_coeff2);
2263
2264 m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
2265 m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
2266
2267
2268 /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
2269
2270
2271 /* Loading coeff for computing o3 in the next block */
2272
2273 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[6][0]);
2274 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[7][0]);
2275
2276 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
2277 m_temp_reg_35 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25);
2278 }
2279
2280 /* Column 2 of destination computed here */
2281 /* It is stored in m_temp_reg_52 */
2282 /* Column 5 of destination computed here */
2283 /* It is stored in m_temp_reg_55 */
2284 {
2285 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
2286 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
2287
2288 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35);
2289 m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_35);
2290
2291 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
2292 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
2293 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
2294 m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
2295
2296 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
2297 m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
2298 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
2299 m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
2300
2301 //o3:1B*18-3B*50,1T*18-3T*50,5B*75-7B*89,5T*75-7T*89
2302 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
2303 m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4);
2304 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_61, m_coeff3);
2305 m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_65, m_coeff4);
2306
2307 m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
2308 m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
2309
2310
2311
2312 /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
2313
2314
2315 m_temp_reg_36 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_26);
2316 m_temp_reg_37 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_27);
2317 }
2318
2319 /* Column 3 of destination computed here */
2320 /* It is stored in m_temp_reg_53 */
2321 /* Column 4 of destination computed here */
2322 /* It is stored in m_temp_reg_54 */
2323 {
2324 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
2325 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
2326
2327 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37);
2328 m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37);
2329
2330 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
2331 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
2332 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
2333 m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
2334
2335 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
2336 m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
2337 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
2338 m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
2339
2340 m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
2341 m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
2342 }
2343 }
2344
2345 /* Transpose of the destination 8x8 matrix done here */
2346 /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */
2347 /* respectively */
2348 {
2349
2350
2351 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
2352 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
2353 m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51);
2354 m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53);
2355 m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
2356 m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
2357 m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15);
2358 m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15);
2359
2360 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
2361 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
2362 m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55);
2363 m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57);
2364 m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
2365 m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
2366 m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17);
2367 m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17);
2368
2369 m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
2370 m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
2371 m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
2372 m_temp_reg_53 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
2373
2374 m_temp_reg_54 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6);
2375 m_temp_reg_55 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6);
2376 m_temp_reg_56 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7);
2377 m_temp_reg_57 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7);
2378 }
2379 }
2380 /* Stage 2 */
2381
2382 i4_shift = IT_SHIFT_STAGE_2;
2383
2384 {
2385
2386 /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
2387 /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
2388 {
2389 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[0][0]); //add
2390 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[3][0]); //sub
2391
2392 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_54);
2393 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_54);
2394
2395 m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
2396 m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
2397 m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
2398 m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
2399
2400
2401 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[1][0]);
2402 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[2][0]);
2403 }
2404
2405
2406 /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
2407 /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
2408 {
2409 //m_temp_reg_66 = _mm_mullo_epi32(m_temp_reg_2, m_coeff1);
2410 //m_temp_reg_64 = _mm_mullo_epi32(m_temp_reg_0, m_coeff2);
2411 //m_temp_reg_62 = _mm_mullo_epi32(m_temp_reg_2, m_coeff2);
2412 //m_temp_reg_60 = _mm_mullo_epi32(m_temp_reg_0, m_coeff1);
2413 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_56);
2414 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_56);
2415
2416
2417 m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
2418 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
2419 m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
2420 m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
2421
2422 //m_temp_reg_16 = _mm_sub_epi32(m_temp_reg_64, m_temp_reg_66);
2423 //m_temp_reg_14 = _mm_add_epi32(m_temp_reg_60, m_temp_reg_62);
2424
2425 /* Loading coeff for computing o0 in the next block */
2426 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[0][0]);
2427 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[1][0]);
2428
2429
2430 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_51, m_temp_reg_53);
2431 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_51, m_temp_reg_53);
2432 /*m_temp_reg_3 = _mm_srli_si128(m_temp_reg_53, 8);
2433 m_temp_reg_1 = _mm_cvtepi16_epi32(m_temp_reg_1);
2434 m_temp_reg_3 = _mm_cvtepi16_epi32(m_temp_reg_3);
2435 */
2436
2437 }
2438
2439 /* e */
2440 {
2441 /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
2442 /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
2443 /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
2444 /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
2445 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
2446 m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
2447
2448 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
2449 m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
2450
2451 m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17);
2452 m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17);
2453
2454 m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15);
2455 m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15);
2456
2457 }
2458
2459 /* o */
2460 {
2461 /* m_temp_reg_4 = _mm_cvtepi16_epi32(m_temp_reg_55);
2462 m_temp_reg_5 = _mm_srli_si128(m_temp_reg_55, 8);
2463 m_temp_reg_6 = _mm_cvtepi16_epi32(m_temp_reg_57);
2464 m_temp_reg_7 = _mm_srli_si128(m_temp_reg_57, 8);
2465 m_temp_reg_5 = _mm_cvtepi16_epi32(m_temp_reg_5);
2466 m_temp_reg_7 = _mm_cvtepi16_epi32(m_temp_reg_7);
2467 */
2468 m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_55, m_temp_reg_57);
2469 m_temp_reg_5 = _mm_unpackhi_epi16(m_temp_reg_55, m_temp_reg_57);
2470
2471 /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
2472 {
2473 //o0:1B*89+3B*75,1T*89+3T*75,5B*50+7B*18,5T*50+7T*18
2474 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
2475 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
2476 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
2477 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_5, m_coeff2);
2478
2479 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2480 m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
2481 /* Loading coeff for computing o1 in the next block */
2482 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[2][0]);
2483 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[3][0]);
2484
2485 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
2486 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25);
2487 }
2488
2489 /* Column 0 of destination computed here */
2490 /* It is stored in m_temp_reg_50 */
2491 /* Column 7 of destination computed here */
2492 /* It is stored in m_temp_reg_57 */
2493 {
2494 m_temp_reg_2 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
2495 m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
2496
2497 m_temp_reg_3 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
2498 m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
2499
2500 m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
2501 m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
2502 m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
2503 m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
2504
2505 m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
2506 m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
2507 m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
2508 m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
2509
2510 //o1:1B*75-3B*18,1T*75-3T*18,5B*89+7B*50,5T*89+7T*50
2511 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff3);
2512 m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_4, m_coeff4);
2513 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
2514 m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_5, m_coeff4);
2515
2516 m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
2517 m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
2518
2519
2520 /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */
2521
2522
2523 /* Loading coeff for computing o2 in the next block */
2524 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[4][0]);
2525 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[5][0]);
2526
2527 m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_26);
2528 m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_27);
2529 }
2530
2531 /* Column 1 of destination computed here */
2532 /* It is stored in m_temp_reg_51 */
2533 /* Column 6 of destination computed here */
2534 /* It is stored in m_temp_reg_56 */
2535 {
2536 m_temp_reg_2 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
2537 m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
2538
2539 m_temp_reg_3 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33);
2540 m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_33);
2541
2542 m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
2543 m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
2544 m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
2545 m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
2546
2547 m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
2548 m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
2549 m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
2550 m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
2551
2552 //o2:1B*50-3B*89,1T*50-3T*89,5B*18+7B*75,5T*18+7T*75
2553 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
2554 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
2555 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
2556 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_5, m_coeff2);
2557
2558 m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
2559 m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
2560
2561
2562 /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
2563
2564 /* Loading coeff for computing o3 in the next block */
2565
2566 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[6][0]);
2567 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[7][0]);
2568
2569 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
2570 m_temp_reg_35 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25);
2571 }
2572
2573 /* Column 2 of destination computed here */
2574 /* It is stored in m_temp_reg_52 */
2575 /* Column 5 of destination computed here */
2576 /* It is stored in m_temp_reg_55 */
2577 {
2578 m_temp_reg_2 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
2579 m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
2580
2581 m_temp_reg_3 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35);
2582 m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_35);
2583
2584 m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
2585 m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
2586 m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
2587 m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
2588
2589 m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
2590 m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
2591 m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
2592 m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
2593
2594 //o3:1B*18-3B*50,1T*18-3T*50,5B*75-7B*89,5T*75-7T*89
2595 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff3);
2596 m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_4, m_coeff4);
2597 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
2598 m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_5, m_coeff4);
2599
2600 m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
2601 m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
2602
2603
2604
2605 /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
2606
2607
2608 m_temp_reg_36 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_26);
2609 m_temp_reg_37 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_27);
2610 }
2611
2612 /* Column 3 of destination computed here */
2613 /* It is stored in m_temp_reg_53 */
2614 /* Column 4 of destination computed here */
2615 /* It is stored in m_temp_reg_54 */
2616 {
2617 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
2618 m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
2619
2620 m_temp_reg_21 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37);
2621 m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37);
2622
2623 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_rdng_factor);
2624 m_temp_reg_21 = _mm_add_epi32(m_temp_reg_21, m_rdng_factor);
2625 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_rdng_factor);
2626 m_temp_reg_23 = _mm_add_epi32(m_temp_reg_23, m_rdng_factor);
2627
2628 m_temp_reg_20 = _mm_srai_epi32(m_temp_reg_20, i4_shift);
2629 m_temp_reg_21 = _mm_srai_epi32(m_temp_reg_21, i4_shift);
2630 m_temp_reg_22 = _mm_srai_epi32(m_temp_reg_22, i4_shift);
2631 m_temp_reg_23 = _mm_srai_epi32(m_temp_reg_23, i4_shift);
2632
2633 m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21);
2634 m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23);
2635 }
2636 }
2637
2638 /* Transpose of the destination 8x8 matrix (columns in m_temp_reg_50 to */
2639 /* m_temp_reg_57) done here; the transposed rows 0 to 7 are ultimately */
2640 /* stored in registers m_temp_reg_10 to m_temp_reg_17 respectively */
2641 {
2642 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
2643 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
2644 m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51);
2645 m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53);
2646 m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
2647 m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
2648 m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15);
2649 m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15);
2650
2651 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
2652 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
2653 m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55);
2654 m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57);
2655 m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
2656 m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
2657 m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17);
2658 m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17);
2659 m_temp_reg_10 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
2660 m_temp_reg_11 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
2661 m_temp_reg_12 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
2662 m_temp_reg_13 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
2663
2664 m_temp_reg_14 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6);
2665 m_temp_reg_15 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6);
2666 m_temp_reg_16 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7);
2667 m_temp_reg_17 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7);
2668 }
2669
2670 /* Recon and store */
2671 {
2672 m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_pred);
2673 pu1_pred += pred_strd;
2674 m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pu1_pred);
2675 pu1_pred += pred_strd;
2676 m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pu1_pred);
2677 pu1_pred += pred_strd;
2678 m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pu1_pred);
2679 pu1_pred += pred_strd;
2680 m_temp_reg_4 = _mm_loadl_epi64((__m128i *)pu1_pred);
2681 pu1_pred += pred_strd;
2682 m_temp_reg_5 = _mm_loadl_epi64((__m128i *)pu1_pred);
2683 pu1_pred += pred_strd;
2684 m_temp_reg_6 = _mm_loadl_epi64((__m128i *)pu1_pred);
2685 pu1_pred += pred_strd;
2686 m_temp_reg_7 = _mm_loadl_epi64((__m128i *)pu1_pred);
2687
2688
2689 m_temp_reg_50 = _mm_setzero_si128();
2690 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, m_temp_reg_50);
2691 m_temp_reg_1 = _mm_unpacklo_epi8(m_temp_reg_1, m_temp_reg_50);
2692 m_temp_reg_2 = _mm_unpacklo_epi8(m_temp_reg_2, m_temp_reg_50);
2693 m_temp_reg_3 = _mm_unpacklo_epi8(m_temp_reg_3, m_temp_reg_50);
2694 m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_4, m_temp_reg_50);
2695 m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_5, m_temp_reg_50);
2696 m_temp_reg_6 = _mm_unpacklo_epi8(m_temp_reg_6, m_temp_reg_50);
2697 m_temp_reg_7 = _mm_unpacklo_epi8(m_temp_reg_7, m_temp_reg_50);
2698
2699 m_temp_reg_50 = _mm_add_epi16(m_temp_reg_10, m_temp_reg_0);
2700 m_temp_reg_51 = _mm_add_epi16(m_temp_reg_11, m_temp_reg_1);
2701 m_temp_reg_52 = _mm_add_epi16(m_temp_reg_12, m_temp_reg_2);
2702 m_temp_reg_53 = _mm_add_epi16(m_temp_reg_13, m_temp_reg_3);
2703 m_temp_reg_54 = _mm_add_epi16(m_temp_reg_14, m_temp_reg_4);
2704 m_temp_reg_55 = _mm_add_epi16(m_temp_reg_15, m_temp_reg_5);
2705 m_temp_reg_56 = _mm_add_epi16(m_temp_reg_16, m_temp_reg_6);
2706 m_temp_reg_57 = _mm_add_epi16(m_temp_reg_17, m_temp_reg_7);
2707
2708 m_temp_reg_50 = _mm_packus_epi16(m_temp_reg_50, m_temp_reg_50);
2709 m_temp_reg_51 = _mm_packus_epi16(m_temp_reg_51, m_temp_reg_51);
2710 m_temp_reg_52 = _mm_packus_epi16(m_temp_reg_52, m_temp_reg_52);
2711 m_temp_reg_53 = _mm_packus_epi16(m_temp_reg_53, m_temp_reg_53);
2712 m_temp_reg_54 = _mm_packus_epi16(m_temp_reg_54, m_temp_reg_54);
2713 m_temp_reg_55 = _mm_packus_epi16(m_temp_reg_55, m_temp_reg_55);
2714 m_temp_reg_56 = _mm_packus_epi16(m_temp_reg_56, m_temp_reg_56);
2715 m_temp_reg_57 = _mm_packus_epi16(m_temp_reg_57, m_temp_reg_57);
2716
2717 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_50);
2718 pu1_dst += dst_strd;
2719 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_51);
2720 pu1_dst += dst_strd;
2721 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_52);
2722 pu1_dst += dst_strd;
2723 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_53);
2724 pu1_dst += dst_strd;
2725 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_54);
2726 pu1_dst += dst_strd;
2727 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_55);
2728 pu1_dst += dst_strd;
2729 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_56);
2730 pu1_dst += dst_strd;
2731 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_57);
2732 pu1_dst += dst_strd;
2733
2734 }
2735
2736
2737 }
2738
2739
2740 }
2741 }
2742
2743