1 /******************************************************************************
2 *
3 * Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at:
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 *
17 ******************************************************************************/
18 /**
19 *******************************************************************************
20 * @file
21 * ihevc_itrans_recon_32x32.c
22 *
23 * @brief
24 * Contains function definitions for inverse transform and reconstruction 32x32
25 *
26 *
27 * @author
28 * 100470
29 *
30 * @par List of Functions:
31 * - ihevc_itrans_recon_32x32()
32 *
33 * @remarks
34 * None
35 *
36 *******************************************************************************
37 */
38 #include <stdio.h>
39 #include <string.h>
40 #include "ihevc_typedefs.h"
41 #include "ihevc_macros.h"
42 #include "ihevc_platform_macros.h"
43 #include "ihevc_defs.h"
44 #include "ihevc_trans_tables.h"
45 #include "ihevc_itrans_recon.h"
46 #include "ihevc_func_selector.h"
47 #include "ihevc_trans_macros.h"
48
49
50 /**
51 *******************************************************************************
52 *
53 * @brief
54 * This function performs Inverse transform and reconstruction for 32x32
55 * input block
56 *
57 * @par Description:
58 * Performs inverse transform and adds the prediction data and clips output
59 * to 8 bit
60 *
61 * @param[in] pi2_src
62 * Input 32x32 coefficients
63 *
64 * @param[in] pi2_tmp
65 * Temporary 32x32 buffer for storing inverse
66 *
67 * transform
68 * 1st stage output
69 *
70 * @param[in] pu1_pred
71 * Prediction 32x32 block
72 *
73 * @param[out] pu1_dst
74 * Output 32x32 block
75 *
76 * @param[in] src_strd
77 * Input stride
78 *
79 * @param[in] pred_strd
80 * Prediction stride
81 *
82 * @param[in] dst_strd
83 * Output Stride
84 *
85 * @param[in] shift
86 * Output shift
87 *
88 * @param[in] zero_cols
89 * Zero columns in pi2_src
90 *
91 * @returns Void
92 *
93 * @remarks
94 * None
95 *
96 *******************************************************************************
97 */
98
ihevc_itrans_recon_32x32(WORD16 * pi2_src,WORD16 * pi2_tmp,UWORD8 * pu1_pred,UWORD8 * pu1_dst,WORD32 src_strd,WORD32 pred_strd,WORD32 dst_strd,WORD32 zero_cols,WORD32 zero_rows)99 void ihevc_itrans_recon_32x32(WORD16 *pi2_src,
100 WORD16 *pi2_tmp,
101 UWORD8 *pu1_pred,
102 UWORD8 *pu1_dst,
103 WORD32 src_strd,
104 WORD32 pred_strd,
105 WORD32 dst_strd,
106 WORD32 zero_cols,
107 WORD32 zero_rows)
108 {
109 WORD32 j, k;
110 WORD32 e[16], o[16];
111 WORD32 ee[8], eo[8];
112 WORD32 eee[4], eeo[4];
113 WORD32 eeee[2], eeeo[2];
114 WORD32 add;
115 WORD32 shift;
116 WORD16 *pi2_tmp_orig;
117 WORD32 trans_size;
118 WORD32 zero_rows_2nd_stage = zero_cols;
119 WORD32 row_limit_2nd_stage;
120
121 trans_size = TRANS_SIZE_32;
122 pi2_tmp_orig = pi2_tmp;
123
124 if((zero_cols & 0xFFFFFFF0) == 0xFFFFFFF0)
125 row_limit_2nd_stage = 4;
126 else if((zero_cols & 0xFFFFFF00) == 0xFFFFFF00)
127 row_limit_2nd_stage = 8;
128 else
129 row_limit_2nd_stage = TRANS_SIZE_32;
130
131 if((zero_rows & 0xFFFFFFF0) == 0xFFFFFFF0) /* First 4 rows of input are non-zero */
132 {
133 /************************************************************************************************/
134 /**********************************START - IT_RECON_32x32****************************************/
135 /************************************************************************************************/
136 /* Inverse Transform 1st stage */
137 shift = IT_SHIFT_STAGE_1;
138 add = 1 << (shift - 1);
139
140 for(j = 0; j < row_limit_2nd_stage; j++)
141 {
142 /* Checking for Zero Cols */
143 if((zero_cols & 1) == 1)
144 {
145 memset(pi2_tmp, 0, trans_size * sizeof(WORD16));
146 }
147 else
148 {
149 /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
150 for(k = 0; k < 16; k++)
151 {
152 o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_src[src_strd]
153 + g_ai2_ihevc_trans_32[3][k]
154 * pi2_src[3 * src_strd];
155 }
156 for(k = 0; k < 8; k++)
157 {
158 eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_src[2 * src_strd];
159 }
160 // for(k = 0; k < 4; k++)
161 {
162 eeo[0] = 0;
163 eeo[1] = 0;
164 eeo[2] = 0;
165 eeo[3] = 0;
166 }
167 eeeo[0] = 0;
168 eeeo[1] = 0;
169 eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_src[0];
170 eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_src[0];
171
172 /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
173 eee[0] = eeee[0] + eeeo[0];
174 eee[3] = eeee[0] - eeeo[0];
175 eee[1] = eeee[1] + eeeo[1];
176 eee[2] = eeee[1] - eeeo[1];
177 for(k = 0; k < 4; k++)
178 {
179 ee[k] = eee[k] + eeo[k];
180 ee[k + 4] = eee[3 - k] - eeo[3 - k];
181 }
182 for(k = 0; k < 8; k++)
183 {
184 e[k] = ee[k] + eo[k];
185 e[k + 8] = ee[7 - k] - eo[7 - k];
186 }
187 for(k = 0; k < 16; k++)
188 {
189 pi2_tmp[k] =
190 CLIP_S16(((e[k] + o[k] + add) >> shift));
191 pi2_tmp[k + 16] =
192 CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift));
193 }
194 }
195 pi2_src++;
196 pi2_tmp += trans_size;
197 zero_cols = zero_cols >> 1;
198 }
199
200 pi2_tmp = pi2_tmp_orig;
201
202 /* Inverse Transform 2nd stage */
203 shift = IT_SHIFT_STAGE_2;
204 add = 1 << (shift - 1);
205 if((zero_rows_2nd_stage & 0xFFFFFFF0) == 0xFFFFFFF0) /* First 4 rows of output of 1st stage are non-zero */
206 {
207 for(j = 0; j < trans_size; j++)
208 {
209 /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
210 for(k = 0; k < 16; k++)
211 {
212 o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_tmp[trans_size]
213 + g_ai2_ihevc_trans_32[3][k]
214 * pi2_tmp[3 * trans_size];
215 }
216 for(k = 0; k < 8; k++)
217 {
218 eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_tmp[2 * trans_size];
219 }
220 // for(k = 0; k < 4; k++)
221 {
222 eeo[0] = 0;
223 eeo[1] = 0;
224 eeo[2] = 0;
225 eeo[3] = 0;
226 }
227 eeeo[0] = 0;
228 eeeo[1] = 0;
229 eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_tmp[0];
230 eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_tmp[0];
231
232 /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
233 eee[0] = eeee[0] + eeeo[0];
234 eee[3] = eeee[0] - eeeo[0];
235 eee[1] = eeee[1] + eeeo[1];
236 eee[2] = eeee[1] - eeeo[1];
237 for(k = 0; k < 4; k++)
238 {
239 ee[k] = eee[k] + eeo[k];
240 ee[k + 4] = eee[3 - k] - eeo[3 - k];
241 }
242 for(k = 0; k < 8; k++)
243 {
244 e[k] = ee[k] + eo[k];
245 e[k + 8] = ee[7 - k] - eo[7 - k];
246 }
247 for(k = 0; k < 16; k++)
248 {
249 WORD32 itrans_out;
250 itrans_out =
251 CLIP_S16(((e[k] + o[k] + add) >> shift));
252 pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
253 itrans_out =
254 CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift));
255 pu1_dst[k + 16] = CLIP_U8((itrans_out + pu1_pred[k + 16]));
256 }
257 pi2_tmp++;
258 pu1_pred += pred_strd;
259 pu1_dst += dst_strd;
260 }
261 }
262 else if((zero_rows_2nd_stage & 0xFFFFFF00) == 0xFFFFFF00) /* First 8 rows of output of 1st stage are non-zero */
263 {
264 for(j = 0; j < trans_size; j++)
265 {
266 /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
267 for(k = 0; k < 16; k++)
268 {
269 o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_tmp[trans_size]
270 + g_ai2_ihevc_trans_32[3][k]
271 * pi2_tmp[3 * trans_size]
272 + g_ai2_ihevc_trans_32[5][k]
273 * pi2_tmp[5 * trans_size]
274 + g_ai2_ihevc_trans_32[7][k]
275 * pi2_tmp[7 * trans_size];
276 }
277 for(k = 0; k < 8; k++)
278 {
279 eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_tmp[2 * trans_size]
280 + g_ai2_ihevc_trans_32[6][k]
281 * pi2_tmp[6 * trans_size];
282 }
283 for(k = 0; k < 4; k++)
284 {
285 eeo[k] = g_ai2_ihevc_trans_32[4][k] * pi2_tmp[4 * trans_size];
286 }
287 eeeo[0] = 0;
288 eeeo[1] = 0;
289 eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_tmp[0];
290 eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_tmp[0];
291
292 /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
293 eee[0] = eeee[0] + eeeo[0];
294 eee[3] = eeee[0] - eeeo[0];
295 eee[1] = eeee[1] + eeeo[1];
296 eee[2] = eeee[1] - eeeo[1];
297 for(k = 0; k < 4; k++)
298 {
299 ee[k] = eee[k] + eeo[k];
300 ee[k + 4] = eee[3 - k] - eeo[3 - k];
301 }
302 for(k = 0; k < 8; k++)
303 {
304 e[k] = ee[k] + eo[k];
305 e[k + 8] = ee[7 - k] - eo[7 - k];
306 }
307 for(k = 0; k < 16; k++)
308 {
309 WORD32 itrans_out;
310 itrans_out =
311 CLIP_S16(((e[k] + o[k] + add) >> shift));
312 pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
313 itrans_out =
314 CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift));
315 pu1_dst[k + 16] = CLIP_U8((itrans_out + pu1_pred[k + 16]));
316 }
317 pi2_tmp++;
318 pu1_pred += pred_strd;
319 pu1_dst += dst_strd;
320 }
321 }
322 else /* All rows of output of 1st stage are non-zero */
323 {
324 for(j = 0; j < trans_size; j++)
325 {
326 /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
327 for(k = 0; k < 16; k++)
328 {
329 o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_tmp[trans_size]
330 + g_ai2_ihevc_trans_32[3][k]
331 * pi2_tmp[3 * trans_size]
332 + g_ai2_ihevc_trans_32[5][k]
333 * pi2_tmp[5 * trans_size]
334 + g_ai2_ihevc_trans_32[7][k]
335 * pi2_tmp[7 * trans_size]
336 + g_ai2_ihevc_trans_32[9][k]
337 * pi2_tmp[9 * trans_size]
338 + g_ai2_ihevc_trans_32[11][k]
339 * pi2_tmp[11 * trans_size]
340 + g_ai2_ihevc_trans_32[13][k]
341 * pi2_tmp[13 * trans_size]
342 + g_ai2_ihevc_trans_32[15][k]
343 * pi2_tmp[15 * trans_size]
344 + g_ai2_ihevc_trans_32[17][k]
345 * pi2_tmp[17 * trans_size]
346 + g_ai2_ihevc_trans_32[19][k]
347 * pi2_tmp[19 * trans_size]
348 + g_ai2_ihevc_trans_32[21][k]
349 * pi2_tmp[21 * trans_size]
350 + g_ai2_ihevc_trans_32[23][k]
351 * pi2_tmp[23 * trans_size]
352 + g_ai2_ihevc_trans_32[25][k]
353 * pi2_tmp[25 * trans_size]
354 + g_ai2_ihevc_trans_32[27][k]
355 * pi2_tmp[27 * trans_size]
356 + g_ai2_ihevc_trans_32[29][k]
357 * pi2_tmp[29 * trans_size]
358 + g_ai2_ihevc_trans_32[31][k]
359 * pi2_tmp[31 * trans_size];
360 }
361 for(k = 0; k < 8; k++)
362 {
363 eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_tmp[2 * trans_size]
364 + g_ai2_ihevc_trans_32[6][k]
365 * pi2_tmp[6 * trans_size]
366 + g_ai2_ihevc_trans_32[10][k]
367 * pi2_tmp[10 * trans_size]
368 + g_ai2_ihevc_trans_32[14][k]
369 * pi2_tmp[14 * trans_size]
370 + g_ai2_ihevc_trans_32[18][k]
371 * pi2_tmp[18 * trans_size]
372 + g_ai2_ihevc_trans_32[22][k]
373 * pi2_tmp[22 * trans_size]
374 + g_ai2_ihevc_trans_32[26][k]
375 * pi2_tmp[26 * trans_size]
376 + g_ai2_ihevc_trans_32[30][k]
377 * pi2_tmp[30 * trans_size];
378 }
379 for(k = 0; k < 4; k++)
380 {
381 eeo[k] = g_ai2_ihevc_trans_32[4][k] * pi2_tmp[4 * trans_size]
382 + g_ai2_ihevc_trans_32[12][k]
383 * pi2_tmp[12 * trans_size]
384 + g_ai2_ihevc_trans_32[20][k]
385 * pi2_tmp[20 * trans_size]
386 + g_ai2_ihevc_trans_32[28][k]
387 * pi2_tmp[28 * trans_size];
388 }
389 eeeo[0] =
390 g_ai2_ihevc_trans_32[8][0] * pi2_tmp[8 * trans_size]
391 + g_ai2_ihevc_trans_32[24][0]
392 * pi2_tmp[24
393 * trans_size];
394 eeeo[1] =
395 g_ai2_ihevc_trans_32[8][1] * pi2_tmp[8 * trans_size]
396 + g_ai2_ihevc_trans_32[24][1]
397 * pi2_tmp[24
398 * trans_size];
399 eeee[0] =
400 g_ai2_ihevc_trans_32[0][0] * pi2_tmp[0]
401 + g_ai2_ihevc_trans_32[16][0]
402 * pi2_tmp[16
403 * trans_size];
404 eeee[1] =
405 g_ai2_ihevc_trans_32[0][1] * pi2_tmp[0]
406 + g_ai2_ihevc_trans_32[16][1]
407 * pi2_tmp[16
408 * trans_size];
409
410 /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
411 eee[0] = eeee[0] + eeeo[0];
412 eee[3] = eeee[0] - eeeo[0];
413 eee[1] = eeee[1] + eeeo[1];
414 eee[2] = eeee[1] - eeeo[1];
415 for(k = 0; k < 4; k++)
416 {
417 ee[k] = eee[k] + eeo[k];
418 ee[k + 4] = eee[3 - k] - eeo[3 - k];
419 }
420 for(k = 0; k < 8; k++)
421 {
422 e[k] = ee[k] + eo[k];
423 e[k + 8] = ee[7 - k] - eo[7 - k];
424 }
425 for(k = 0; k < 16; k++)
426 {
427 WORD32 itrans_out;
428 itrans_out =
429 CLIP_S16(((e[k] + o[k] + add) >> shift));
430 pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
431 itrans_out =
432 CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift));
433 pu1_dst[k + 16] = CLIP_U8((itrans_out + pu1_pred[k + 16]));
434 }
435 pi2_tmp++;
436 pu1_pred += pred_strd;
437 pu1_dst += dst_strd;
438 }
439 }
440 /************************************************************************************************/
441 /************************************END - IT_RECON_32x32****************************************/
442 /************************************************************************************************/
443 }
444 else if((zero_rows & 0xFFFFFF00) == 0xFFFFFF00) /* First 8 rows of input are non-zero */
445 {
446 /************************************************************************************************/
447 /**********************************START - IT_RECON_32x32****************************************/
448 /************************************************************************************************/
449 /* Inverse Transform 1st stage */
450 shift = IT_SHIFT_STAGE_1;
451 add = 1 << (shift - 1);
452
453 for(j = 0; j < row_limit_2nd_stage; j++)
454 {
455 /* Checking for Zero Cols */
456 if((zero_cols & 1) == 1)
457 {
458 memset(pi2_tmp, 0, trans_size * sizeof(WORD16));
459 }
460 else
461 {
462 /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
463 for(k = 0; k < 16; k++)
464 {
465 o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_src[src_strd]
466 + g_ai2_ihevc_trans_32[3][k]
467 * pi2_src[3 * src_strd]
468 + g_ai2_ihevc_trans_32[5][k]
469 * pi2_src[5 * src_strd]
470 + g_ai2_ihevc_trans_32[7][k]
471 * pi2_src[7 * src_strd];
472 }
473 for(k = 0; k < 8; k++)
474 {
475 eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_src[2 * src_strd]
476 + g_ai2_ihevc_trans_32[6][k]
477 * pi2_src[6 * src_strd];
478 }
479 for(k = 0; k < 4; k++)
480 {
481 eeo[k] = g_ai2_ihevc_trans_32[4][k] * pi2_src[4 * src_strd];
482 }
483 eeeo[0] = 0;
484 eeeo[1] = 0;
485 eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_src[0];
486 eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_src[0];
487
488 /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
489 eee[0] = eeee[0] + eeeo[0];
490 eee[3] = eeee[0] - eeeo[0];
491 eee[1] = eeee[1] + eeeo[1];
492 eee[2] = eeee[1] - eeeo[1];
493 for(k = 0; k < 4; k++)
494 {
495 ee[k] = eee[k] + eeo[k];
496 ee[k + 4] = eee[3 - k] - eeo[3 - k];
497 }
498 for(k = 0; k < 8; k++)
499 {
500 e[k] = ee[k] + eo[k];
501 e[k + 8] = ee[7 - k] - eo[7 - k];
502 }
503 for(k = 0; k < 16; k++)
504 {
505 pi2_tmp[k] =
506 CLIP_S16(((e[k] + o[k] + add) >> shift));
507 pi2_tmp[k + 16] =
508 CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift));
509 }
510 }
511 pi2_src++;
512 pi2_tmp += trans_size;
513 zero_cols = zero_cols >> 1;
514 }
515
516 pi2_tmp = pi2_tmp_orig;
517
518 /* Inverse Transform 2nd stage */
519 shift = IT_SHIFT_STAGE_2;
520 add = 1 << (shift - 1);
521 if((zero_rows_2nd_stage & 0xFFFFFFF0) == 0xFFFFFFF0) /* First 4 rows of output of 1st stage are non-zero */
522 {
523 for(j = 0; j < trans_size; j++)
524 {
525 /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
526 for(k = 0; k < 16; k++)
527 {
528 o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_tmp[trans_size]
529 + g_ai2_ihevc_trans_32[3][k]
530 * pi2_tmp[3 * trans_size];
531 }
532 for(k = 0; k < 8; k++)
533 {
534 eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_tmp[2 * trans_size];
535 }
536 // for(k = 0; k < 4; k++)
537 {
538 eeo[0] = 0;
539 eeo[1] = 0;
540 eeo[2] = 0;
541 eeo[3] = 0;
542 }
543 eeeo[0] = 0;
544 eeeo[1] = 0;
545 eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_tmp[0];
546 eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_tmp[0];
547
548 /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
549 eee[0] = eeee[0] + eeeo[0];
550 eee[3] = eeee[0] - eeeo[0];
551 eee[1] = eeee[1] + eeeo[1];
552 eee[2] = eeee[1] - eeeo[1];
553 for(k = 0; k < 4; k++)
554 {
555 ee[k] = eee[k] + eeo[k];
556 ee[k + 4] = eee[3 - k] - eeo[3 - k];
557 }
558 for(k = 0; k < 8; k++)
559 {
560 e[k] = ee[k] + eo[k];
561 e[k + 8] = ee[7 - k] - eo[7 - k];
562 }
563 for(k = 0; k < 16; k++)
564 {
565 WORD32 itrans_out;
566 itrans_out =
567 CLIP_S16(((e[k] + o[k] + add) >> shift));
568 pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
569 itrans_out =
570 CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift));
571 pu1_dst[k + 16] = CLIP_U8((itrans_out + pu1_pred[k + 16]));
572 }
573 pi2_tmp++;
574 pu1_pred += pred_strd;
575 pu1_dst += dst_strd;
576 }
577 }
578 else if((zero_rows_2nd_stage & 0xFFFFFF00) == 0xFFFFFF00) /* First 8 rows of output of 1st stage are non-zero */
579 {
580 for(j = 0; j < trans_size; j++)
581 {
582 /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
583 for(k = 0; k < 16; k++)
584 {
585 o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_tmp[trans_size]
586 + g_ai2_ihevc_trans_32[3][k]
587 * pi2_tmp[3 * trans_size]
588 + g_ai2_ihevc_trans_32[5][k]
589 * pi2_tmp[5 * trans_size]
590 + g_ai2_ihevc_trans_32[7][k]
591 * pi2_tmp[7 * trans_size];
592 }
593 for(k = 0; k < 8; k++)
594 {
595 eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_tmp[2 * trans_size]
596 + g_ai2_ihevc_trans_32[6][k]
597 * pi2_tmp[6 * trans_size];
598 }
599 for(k = 0; k < 4; k++)
600 {
601 eeo[k] = g_ai2_ihevc_trans_32[4][k] * pi2_tmp[4 * trans_size];
602 }
603 eeeo[0] = 0;
604 eeeo[1] = 0;
605 eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_tmp[0];
606 eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_tmp[0];
607
608 /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
609 eee[0] = eeee[0] + eeeo[0];
610 eee[3] = eeee[0] - eeeo[0];
611 eee[1] = eeee[1] + eeeo[1];
612 eee[2] = eeee[1] - eeeo[1];
613 for(k = 0; k < 4; k++)
614 {
615 ee[k] = eee[k] + eeo[k];
616 ee[k + 4] = eee[3 - k] - eeo[3 - k];
617 }
618 for(k = 0; k < 8; k++)
619 {
620 e[k] = ee[k] + eo[k];
621 e[k + 8] = ee[7 - k] - eo[7 - k];
622 }
623 for(k = 0; k < 16; k++)
624 {
625 WORD32 itrans_out;
626 itrans_out =
627 CLIP_S16(((e[k] + o[k] + add) >> shift));
628 pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
629 itrans_out =
630 CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift));
631 pu1_dst[k + 16] = CLIP_U8((itrans_out + pu1_pred[k + 16]));
632 }
633 pi2_tmp++;
634 pu1_pred += pred_strd;
635 pu1_dst += dst_strd;
636 }
637 }
638 else /* All rows of output of 1st stage are non-zero */
639 {
640 for(j = 0; j < trans_size; j++)
641 {
642 /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
643 for(k = 0; k < 16; k++)
644 {
645 o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_tmp[trans_size]
646 + g_ai2_ihevc_trans_32[3][k]
647 * pi2_tmp[3 * trans_size]
648 + g_ai2_ihevc_trans_32[5][k]
649 * pi2_tmp[5 * trans_size]
650 + g_ai2_ihevc_trans_32[7][k]
651 * pi2_tmp[7 * trans_size]
652 + g_ai2_ihevc_trans_32[9][k]
653 * pi2_tmp[9 * trans_size]
654 + g_ai2_ihevc_trans_32[11][k]
655 * pi2_tmp[11 * trans_size]
656 + g_ai2_ihevc_trans_32[13][k]
657 * pi2_tmp[13 * trans_size]
658 + g_ai2_ihevc_trans_32[15][k]
659 * pi2_tmp[15 * trans_size]
660 + g_ai2_ihevc_trans_32[17][k]
661 * pi2_tmp[17 * trans_size]
662 + g_ai2_ihevc_trans_32[19][k]
663 * pi2_tmp[19 * trans_size]
664 + g_ai2_ihevc_trans_32[21][k]
665 * pi2_tmp[21 * trans_size]
666 + g_ai2_ihevc_trans_32[23][k]
667 * pi2_tmp[23 * trans_size]
668 + g_ai2_ihevc_trans_32[25][k]
669 * pi2_tmp[25 * trans_size]
670 + g_ai2_ihevc_trans_32[27][k]
671 * pi2_tmp[27 * trans_size]
672 + g_ai2_ihevc_trans_32[29][k]
673 * pi2_tmp[29 * trans_size]
674 + g_ai2_ihevc_trans_32[31][k]
675 * pi2_tmp[31 * trans_size];
676 }
677 for(k = 0; k < 8; k++)
678 {
679 eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_tmp[2 * trans_size]
680 + g_ai2_ihevc_trans_32[6][k]
681 * pi2_tmp[6 * trans_size]
682 + g_ai2_ihevc_trans_32[10][k]
683 * pi2_tmp[10 * trans_size]
684 + g_ai2_ihevc_trans_32[14][k]
685 * pi2_tmp[14 * trans_size]
686 + g_ai2_ihevc_trans_32[18][k]
687 * pi2_tmp[18 * trans_size]
688 + g_ai2_ihevc_trans_32[22][k]
689 * pi2_tmp[22 * trans_size]
690 + g_ai2_ihevc_trans_32[26][k]
691 * pi2_tmp[26 * trans_size]
692 + g_ai2_ihevc_trans_32[30][k]
693 * pi2_tmp[30 * trans_size];
694 }
695 for(k = 0; k < 4; k++)
696 {
697 eeo[k] = g_ai2_ihevc_trans_32[4][k] * pi2_tmp[4 * trans_size]
698 + g_ai2_ihevc_trans_32[12][k]
699 * pi2_tmp[12 * trans_size]
700 + g_ai2_ihevc_trans_32[20][k]
701 * pi2_tmp[20 * trans_size]
702 + g_ai2_ihevc_trans_32[28][k]
703 * pi2_tmp[28 * trans_size];
704 }
705 eeeo[0] =
706 g_ai2_ihevc_trans_32[8][0] * pi2_tmp[8 * trans_size]
707 + g_ai2_ihevc_trans_32[24][0]
708 * pi2_tmp[24
709 * trans_size];
710 eeeo[1] =
711 g_ai2_ihevc_trans_32[8][1] * pi2_tmp[8 * trans_size]
712 + g_ai2_ihevc_trans_32[24][1]
713 * pi2_tmp[24
714 * trans_size];
715 eeee[0] =
716 g_ai2_ihevc_trans_32[0][0] * pi2_tmp[0]
717 + g_ai2_ihevc_trans_32[16][0]
718 * pi2_tmp[16
719 * trans_size];
720 eeee[1] =
721 g_ai2_ihevc_trans_32[0][1] * pi2_tmp[0]
722 + g_ai2_ihevc_trans_32[16][1]
723 * pi2_tmp[16
724 * trans_size];
725
726 /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
727 eee[0] = eeee[0] + eeeo[0];
728 eee[3] = eeee[0] - eeeo[0];
729 eee[1] = eeee[1] + eeeo[1];
730 eee[2] = eeee[1] - eeeo[1];
731 for(k = 0; k < 4; k++)
732 {
733 ee[k] = eee[k] + eeo[k];
734 ee[k + 4] = eee[3 - k] - eeo[3 - k];
735 }
736 for(k = 0; k < 8; k++)
737 {
738 e[k] = ee[k] + eo[k];
739 e[k + 8] = ee[7 - k] - eo[7 - k];
740 }
741 for(k = 0; k < 16; k++)
742 {
743 WORD32 itrans_out;
744 itrans_out =
745 CLIP_S16(((e[k] + o[k] + add) >> shift));
746 pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
747 itrans_out =
748 CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift));
749 pu1_dst[k + 16] = CLIP_U8((itrans_out + pu1_pred[k + 16]));
750 }
751 pi2_tmp++;
752 pu1_pred += pred_strd;
753 pu1_dst += dst_strd;
754 }
755 }
756 /************************************************************************************************/
757 /************************************END - IT_RECON_32x32****************************************/
758 /************************************************************************************************/
759 }
760 else /* All rows of input are non-zero */
761 {
762 /************************************************************************************************/
763 /**********************************START - IT_RECON_32x32****************************************/
764 /************************************************************************************************/
765 /* Inverse Transform 1st stage */
766 shift = IT_SHIFT_STAGE_1;
767 add = 1 << (shift - 1);
768
769 for(j = 0; j < row_limit_2nd_stage; j++)
770 {
771 /* Checking for Zero Cols */
772 if((zero_cols & 1) == 1)
773 {
774 memset(pi2_tmp, 0, trans_size * sizeof(WORD16));
775 }
776 else
777 {
778 /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
779 for(k = 0; k < 16; k++)
780 {
781 o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_src[src_strd]
782 + g_ai2_ihevc_trans_32[3][k]
783 * pi2_src[3 * src_strd]
784 + g_ai2_ihevc_trans_32[5][k]
785 * pi2_src[5 * src_strd]
786 + g_ai2_ihevc_trans_32[7][k]
787 * pi2_src[7 * src_strd]
788 + g_ai2_ihevc_trans_32[9][k]
789 * pi2_src[9 * src_strd]
790 + g_ai2_ihevc_trans_32[11][k]
791 * pi2_src[11 * src_strd]
792 + g_ai2_ihevc_trans_32[13][k]
793 * pi2_src[13 * src_strd]
794 + g_ai2_ihevc_trans_32[15][k]
795 * pi2_src[15 * src_strd]
796 + g_ai2_ihevc_trans_32[17][k]
797 * pi2_src[17 * src_strd]
798 + g_ai2_ihevc_trans_32[19][k]
799 * pi2_src[19 * src_strd]
800 + g_ai2_ihevc_trans_32[21][k]
801 * pi2_src[21 * src_strd]
802 + g_ai2_ihevc_trans_32[23][k]
803 * pi2_src[23 * src_strd]
804 + g_ai2_ihevc_trans_32[25][k]
805 * pi2_src[25 * src_strd]
806 + g_ai2_ihevc_trans_32[27][k]
807 * pi2_src[27 * src_strd]
808 + g_ai2_ihevc_trans_32[29][k]
809 * pi2_src[29 * src_strd]
810 + g_ai2_ihevc_trans_32[31][k]
811 * pi2_src[31 * src_strd];
812 }
813 for(k = 0; k < 8; k++)
814 {
815 eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_src[2 * src_strd]
816 + g_ai2_ihevc_trans_32[6][k]
817 * pi2_src[6 * src_strd]
818 + g_ai2_ihevc_trans_32[10][k]
819 * pi2_src[10 * src_strd]
820 + g_ai2_ihevc_trans_32[14][k]
821 * pi2_src[14 * src_strd]
822 + g_ai2_ihevc_trans_32[18][k]
823 * pi2_src[18 * src_strd]
824 + g_ai2_ihevc_trans_32[22][k]
825 * pi2_src[22 * src_strd]
826 + g_ai2_ihevc_trans_32[26][k]
827 * pi2_src[26 * src_strd]
828 + g_ai2_ihevc_trans_32[30][k]
829 * pi2_src[30 * src_strd];
830 }
831 for(k = 0; k < 4; k++)
832 {
833 eeo[k] = g_ai2_ihevc_trans_32[4][k] * pi2_src[4 * src_strd]
834 + g_ai2_ihevc_trans_32[12][k]
835 * pi2_src[12 * src_strd]
836 + g_ai2_ihevc_trans_32[20][k]
837 * pi2_src[20 * src_strd]
838 + g_ai2_ihevc_trans_32[28][k]
839 * pi2_src[28 * src_strd];
840 }
841 eeeo[0] = g_ai2_ihevc_trans_32[8][0] * pi2_src[8 * src_strd]
842 + g_ai2_ihevc_trans_32[24][0]
843 * pi2_src[24 * src_strd];
844 eeeo[1] = g_ai2_ihevc_trans_32[8][1] * pi2_src[8 * src_strd]
845 + g_ai2_ihevc_trans_32[24][1]
846 * pi2_src[24 * src_strd];
847 eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_src[0]
848 + g_ai2_ihevc_trans_32[16][0]
849 * pi2_src[16 * src_strd];
850 eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_src[0]
851 + g_ai2_ihevc_trans_32[16][1]
852 * pi2_src[16 * src_strd];
853
854 /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
855 eee[0] = eeee[0] + eeeo[0];
856 eee[3] = eeee[0] - eeeo[0];
857 eee[1] = eeee[1] + eeeo[1];
858 eee[2] = eeee[1] - eeeo[1];
859 for(k = 0; k < 4; k++)
860 {
861 ee[k] = eee[k] + eeo[k];
862 ee[k + 4] = eee[3 - k] - eeo[3 - k];
863 }
864 for(k = 0; k < 8; k++)
865 {
866 e[k] = ee[k] + eo[k];
867 e[k + 8] = ee[7 - k] - eo[7 - k];
868 }
869 for(k = 0; k < 16; k++)
870 {
871 pi2_tmp[k] =
872 CLIP_S16(((e[k] + o[k] + add) >> shift));
873 pi2_tmp[k + 16] =
874 CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift));
875 }
876 }
877 pi2_src++;
878 pi2_tmp += trans_size;
879 zero_cols = zero_cols >> 1;
880 }
881
882 pi2_tmp = pi2_tmp_orig;
883
884 /* Inverse Transform 2nd stage */
885 shift = IT_SHIFT_STAGE_2;
886 add = 1 << (shift - 1);
887 if((zero_rows_2nd_stage & 0xFFFFFFF0) == 0xFFFFFFF0) /* First 4 rows of output of 1st stage are non-zero */
888 {
889 for(j = 0; j < trans_size; j++)
890 {
891 /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
892 for(k = 0; k < 16; k++)
893 {
894 o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_tmp[trans_size]
895 + g_ai2_ihevc_trans_32[3][k]
896 * pi2_tmp[3 * trans_size];
897 }
898 for(k = 0; k < 8; k++)
899 {
900 eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_tmp[2 * trans_size];
901 }
902 // for(k = 0; k < 4; k++)
903 {
904 eeo[0] = 0;
905 eeo[1] = 0;
906 eeo[2] = 0;
907 eeo[3] = 0;
908 }
909 eeeo[0] = 0;
910 eeeo[1] = 0;
911 eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_tmp[0];
912 eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_tmp[0];
913
914 /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
915 eee[0] = eeee[0] + eeeo[0];
916 eee[3] = eeee[0] - eeeo[0];
917 eee[1] = eeee[1] + eeeo[1];
918 eee[2] = eeee[1] - eeeo[1];
919 for(k = 0; k < 4; k++)
920 {
921 ee[k] = eee[k] + eeo[k];
922 ee[k + 4] = eee[3 - k] - eeo[3 - k];
923 }
924 for(k = 0; k < 8; k++)
925 {
926 e[k] = ee[k] + eo[k];
927 e[k + 8] = ee[7 - k] - eo[7 - k];
928 }
929 for(k = 0; k < 16; k++)
930 {
931 WORD32 itrans_out;
932 itrans_out =
933 CLIP_S16(((e[k] + o[k] + add) >> shift));
934 pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
935 itrans_out =
936 CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift));
937 pu1_dst[k + 16] = CLIP_U8((itrans_out + pu1_pred[k + 16]));
938 }
939 pi2_tmp++;
940 pu1_pred += pred_strd;
941 pu1_dst += dst_strd;
942 }
943 }
944 else if((zero_rows_2nd_stage & 0xFFFFFF00) == 0xFFFFFF00) /* First 8 rows of output of 1st stage are non-zero */
945 {
946 for(j = 0; j < trans_size; j++)
947 {
948 /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
949 for(k = 0; k < 16; k++)
950 {
951 o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_tmp[trans_size]
952 + g_ai2_ihevc_trans_32[3][k]
953 * pi2_tmp[3 * trans_size]
954 + g_ai2_ihevc_trans_32[5][k]
955 * pi2_tmp[5 * trans_size]
956 + g_ai2_ihevc_trans_32[7][k]
957 * pi2_tmp[7 * trans_size];
958 }
959 for(k = 0; k < 8; k++)
960 {
961 eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_tmp[2 * trans_size]
962 + g_ai2_ihevc_trans_32[6][k]
963 * pi2_tmp[6 * trans_size];
964 }
965 for(k = 0; k < 4; k++)
966 {
967 eeo[k] = g_ai2_ihevc_trans_32[4][k] * pi2_tmp[4 * trans_size];
968 }
969 eeeo[0] = 0;
970 eeeo[1] = 0;
971 eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_tmp[0];
972 eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_tmp[0];
973
974 /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
975 eee[0] = eeee[0] + eeeo[0];
976 eee[3] = eeee[0] - eeeo[0];
977 eee[1] = eeee[1] + eeeo[1];
978 eee[2] = eeee[1] - eeeo[1];
979 for(k = 0; k < 4; k++)
980 {
981 ee[k] = eee[k] + eeo[k];
982 ee[k + 4] = eee[3 - k] - eeo[3 - k];
983 }
984 for(k = 0; k < 8; k++)
985 {
986 e[k] = ee[k] + eo[k];
987 e[k + 8] = ee[7 - k] - eo[7 - k];
988 }
989 for(k = 0; k < 16; k++)
990 {
991 WORD32 itrans_out;
992 itrans_out =
993 CLIP_S16(((e[k] + o[k] + add) >> shift));
994 pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
995 itrans_out =
996 CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift));
997 pu1_dst[k + 16] = CLIP_U8((itrans_out + pu1_pred[k + 16]));
998 }
999 pi2_tmp++;
1000 pu1_pred += pred_strd;
1001 pu1_dst += dst_strd;
1002 }
1003 }
1004 else /* All rows of output of 1st stage are non-zero */
1005 {
1006 for(j = 0; j < trans_size; j++)
1007 {
1008 /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
1009 for(k = 0; k < 16; k++)
1010 {
1011 o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_tmp[trans_size]
1012 + g_ai2_ihevc_trans_32[3][k]
1013 * pi2_tmp[3 * trans_size]
1014 + g_ai2_ihevc_trans_32[5][k]
1015 * pi2_tmp[5 * trans_size]
1016 + g_ai2_ihevc_trans_32[7][k]
1017 * pi2_tmp[7 * trans_size]
1018 + g_ai2_ihevc_trans_32[9][k]
1019 * pi2_tmp[9 * trans_size]
1020 + g_ai2_ihevc_trans_32[11][k]
1021 * pi2_tmp[11 * trans_size]
1022 + g_ai2_ihevc_trans_32[13][k]
1023 * pi2_tmp[13 * trans_size]
1024 + g_ai2_ihevc_trans_32[15][k]
1025 * pi2_tmp[15 * trans_size]
1026 + g_ai2_ihevc_trans_32[17][k]
1027 * pi2_tmp[17 * trans_size]
1028 + g_ai2_ihevc_trans_32[19][k]
1029 * pi2_tmp[19 * trans_size]
1030 + g_ai2_ihevc_trans_32[21][k]
1031 * pi2_tmp[21 * trans_size]
1032 + g_ai2_ihevc_trans_32[23][k]
1033 * pi2_tmp[23 * trans_size]
1034 + g_ai2_ihevc_trans_32[25][k]
1035 * pi2_tmp[25 * trans_size]
1036 + g_ai2_ihevc_trans_32[27][k]
1037 * pi2_tmp[27 * trans_size]
1038 + g_ai2_ihevc_trans_32[29][k]
1039 * pi2_tmp[29 * trans_size]
1040 + g_ai2_ihevc_trans_32[31][k]
1041 * pi2_tmp[31 * trans_size];
1042 }
1043 for(k = 0; k < 8; k++)
1044 {
1045 eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_tmp[2 * trans_size]
1046 + g_ai2_ihevc_trans_32[6][k]
1047 * pi2_tmp[6 * trans_size]
1048 + g_ai2_ihevc_trans_32[10][k]
1049 * pi2_tmp[10 * trans_size]
1050 + g_ai2_ihevc_trans_32[14][k]
1051 * pi2_tmp[14 * trans_size]
1052 + g_ai2_ihevc_trans_32[18][k]
1053 * pi2_tmp[18 * trans_size]
1054 + g_ai2_ihevc_trans_32[22][k]
1055 * pi2_tmp[22 * trans_size]
1056 + g_ai2_ihevc_trans_32[26][k]
1057 * pi2_tmp[26 * trans_size]
1058 + g_ai2_ihevc_trans_32[30][k]
1059 * pi2_tmp[30 * trans_size];
1060 }
1061 for(k = 0; k < 4; k++)
1062 {
1063 eeo[k] = g_ai2_ihevc_trans_32[4][k] * pi2_tmp[4 * trans_size]
1064 + g_ai2_ihevc_trans_32[12][k]
1065 * pi2_tmp[12 * trans_size]
1066 + g_ai2_ihevc_trans_32[20][k]
1067 * pi2_tmp[20 * trans_size]
1068 + g_ai2_ihevc_trans_32[28][k]
1069 * pi2_tmp[28 * trans_size];
1070 }
1071 eeeo[0] =
1072 g_ai2_ihevc_trans_32[8][0] * pi2_tmp[8 * trans_size]
1073 + g_ai2_ihevc_trans_32[24][0]
1074 * pi2_tmp[24
1075 * trans_size];
1076 eeeo[1] =
1077 g_ai2_ihevc_trans_32[8][1] * pi2_tmp[8 * trans_size]
1078 + g_ai2_ihevc_trans_32[24][1]
1079 * pi2_tmp[24
1080 * trans_size];
1081 eeee[0] =
1082 g_ai2_ihevc_trans_32[0][0] * pi2_tmp[0]
1083 + g_ai2_ihevc_trans_32[16][0]
1084 * pi2_tmp[16
1085 * trans_size];
1086 eeee[1] =
1087 g_ai2_ihevc_trans_32[0][1] * pi2_tmp[0]
1088 + g_ai2_ihevc_trans_32[16][1]
1089 * pi2_tmp[16
1090 * trans_size];
1091
1092 /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
1093 eee[0] = eeee[0] + eeeo[0];
1094 eee[3] = eeee[0] - eeeo[0];
1095 eee[1] = eeee[1] + eeeo[1];
1096 eee[2] = eeee[1] - eeeo[1];
1097 for(k = 0; k < 4; k++)
1098 {
1099 ee[k] = eee[k] + eeo[k];
1100 ee[k + 4] = eee[3 - k] - eeo[3 - k];
1101 }
1102 for(k = 0; k < 8; k++)
1103 {
1104 e[k] = ee[k] + eo[k];
1105 e[k + 8] = ee[7 - k] - eo[7 - k];
1106 }
1107 for(k = 0; k < 16; k++)
1108 {
1109 WORD32 itrans_out;
1110 itrans_out =
1111 CLIP_S16(((e[k] + o[k] + add) >> shift));
1112 pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
1113 itrans_out =
1114 CLIP_S16(((e[15 - k] - o[15 - k] + add) >> shift));
1115 pu1_dst[k + 16] = CLIP_U8((itrans_out + pu1_pred[k + 16]));
1116 }
1117 pi2_tmp++;
1118 pu1_pred += pred_strd;
1119 pu1_dst += dst_strd;
1120 }
1121 }
1122 /************************************************************************************************/
1123 /************************************END - IT_RECON_32x32****************************************/
1124 /************************************************************************************************/
1125 }
1126 }
1127
1128