1 /******************************************************************************
2 *
3 * Copyright (C) 2018 The Android Open Source Project
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at:
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 *
17 *****************************************************************************
18 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19 */
20 /**
21 *******************************************************************************
22 * @file
23 * ihevc_resi_trans.c
24 *
25 * @brief
26 * Contains function definitions for residual and forward transform
27 *
28 * @author
29 * 100470
30 *
31 * @par List of Functions:
32 * - ihevc_resi_trans_4x4_ttype1()
33 * - ihevc_resi_trans_4x4()
34 * - ihevc_resi_trans_8x8()
35 * - ihevc_resi_trans_16x16()
36 * - ihevc_resi_trans_32x32()
37 *
38 * @remarks
39 * None
40 *
41 *******************************************************************************
42 */
43 #include <stdio.h>
44 #include <string.h>
45 #include <stdlib.h>
46 #include "ihevc_typedefs.h"
47 #include "ihevc_macros.h"
48 #include "ihevc_platform_macros.h"
49 #include "ihevc_defs.h"
50 #include "ihevc_trans_tables.h"
51 #include "ihevc_resi_trans.h"
52 #include "ihevc_func_selector.h"
53 #include "ihevc_trans_macros.h"
54
55 /**
56 *******************************************************************************
57 *
58 * @brief
59 * This function performs residue calculation and forward transform type 1
60 * on input pixels
61 *
62 * @par Description:
63 * Performs residue calculation by subtracting source and prediction and
64 * followed by forward transform
65 *
66 * @param[in] pu1_src
67 * Input 4x4 pixels
68 *
69 * @param[in] pu1_pred
70 * Prediction data
71 *
72 * @param[in] pi2_tmp
73 * Temporary buffer of size 4x4
74 *
75 * @param[out] pi2_dst
76 * Output 4x4 coefficients
77 *
78 * @param[in] src_strd
79 * Input stride
80 *
81 * @param[in] pred_strd
82 * Prediction Stride
83 *
84 * @param[in] dst_strd_chr_flag
85 * Output Stride and Chroma Flag packed in the MS and LS 16-bit
86 *
87 *
88 * @returns Void
89 *
90 * @remarks
91 * None
92 *
93 *******************************************************************************
94 */
95
ihevc_resi_trans_4x4_ttype1(UWORD8 * pu1_src,UWORD8 * pu1_pred,WORD32 * pi4_temp,WORD16 * pi2_dst,WORD32 src_strd,WORD32 pred_strd,WORD32 dst_strd_chr_flag)96 UWORD32 ihevc_resi_trans_4x4_ttype1(UWORD8 *pu1_src,
97 UWORD8 *pu1_pred,
98 WORD32 *pi4_temp,
99 WORD16 *pi2_dst,
100 WORD32 src_strd,
101 WORD32 pred_strd,
102 WORD32 dst_strd_chr_flag)
103 {
104 WORD32 i, c[4];
105 WORD32 add, shift;
106 WORD32 trans_size;
107 WORD32 *pi4_tmp_orig;
108 WORD16 *pi2_dst_orig;
109 UWORD32 u4_blk_sad = 0;
110 // WORD32 chroma_flag;
111 WORD32 dst_strd;
112
113 // chroma_flag = dst_strd_chr_flag & 1;
114 dst_strd = dst_strd_chr_flag >> 16;
115
116 pi2_dst_orig = pi2_dst;
117 pi4_tmp_orig = pi4_temp;
118 trans_size = TRANS_SIZE_4;
119
120 /* Residue + Forward Transform 1st stage */
121 shift = 1; // log2(iWidth) - 1 + g_uiBitIncrement
122 add = 1 << (shift - 1);
123
124 for(i = 0; i < trans_size; i++)
125 {
126 WORD32 resi_tmp_1, resi_tmp_2, resi_tmp_3;
127
128 // Intermediate Variables
129 resi_tmp_1 = pu1_src[0] - pu1_pred[0];
130 resi_tmp_2 = pu1_src[3] - pu1_pred[3];
131 c[0] = resi_tmp_1 + resi_tmp_2;
132 u4_blk_sad += abs(resi_tmp_1) + abs(resi_tmp_2);
133
134 resi_tmp_1 = pu1_src[1] - pu1_pred[1];
135 resi_tmp_2 = pu1_src[3] - pu1_pred[3];
136 c[1] = resi_tmp_1 + resi_tmp_2;
137 u4_blk_sad += abs(resi_tmp_1);
138
139 resi_tmp_1 = pu1_src[0] - pu1_pred[0];
140 resi_tmp_2 = pu1_src[1] - pu1_pred[1];
141 c[2] = resi_tmp_1 - resi_tmp_2;
142
143 resi_tmp_1 = pu1_src[2] - pu1_pred[2];
144 c[3] = 74 * resi_tmp_1;
145 u4_blk_sad += abs(resi_tmp_1);
146
147 pi4_temp[0] = (29 * c[0] + 55 * c[1] + c[3] + add) >> shift;
148
149 resi_tmp_1 = pu1_src[0] - pu1_pred[0];
150 resi_tmp_2 = pu1_src[1] - pu1_pred[1];
151 resi_tmp_3 = pu1_src[3] - pu1_pred[3];
152 pi4_temp[trans_size] =
153 (74 * (resi_tmp_1 + resi_tmp_2 - resi_tmp_3) + add)
154 >> shift;
155 pi4_temp[2 * trans_size] = (29 * c[2] + 55 * c[0] - c[3] + add) >> shift;
156 pi4_temp[3 * trans_size] = (55 * c[2] - 29 * c[1] + c[3] + add) >> shift;
157
158 pu1_src += src_strd;
159 pu1_pred += pred_strd;
160 pi4_temp++;
161 }
162
163 pi4_temp = pi4_tmp_orig;
164
165 /* Forward transform 2nd stage */
166 shift = 8; // log2(iHeight) + 6
167 add = 1 << (shift - 1);
168
169 for(i = 0; i < TRANS_SIZE_4; i++)
170 {
171 // Intermediate Variables
172 c[0] = pi4_temp[0] + pi4_temp[3];
173 c[1] = pi4_temp[1] + pi4_temp[3];
174 c[2] = pi4_temp[0] - pi4_temp[1];
175 c[3] = 74 * pi4_temp[2];
176
177 pi2_dst[0] = (29 * c[0] + 55 * c[1] + c[3] + add) >> shift;
178 pi2_dst[dst_strd] = (74 * (pi4_temp[0] + pi4_temp[1] - pi4_temp[3]) + add)
179 >> shift;
180 pi2_dst[2 * dst_strd] = (29 * c[2] + 55 * c[0] - c[3] + add) >> shift;
181 pi2_dst[3 * dst_strd] = (55 * c[2] - 29 * c[1] + c[3] + add) >> shift;
182
183 pi4_temp += trans_size;
184 pi2_dst++;
185 }
186
187 return u4_blk_sad;
188 }
189
190 /**
191 *******************************************************************************
192 *
193 * @brief
194 * This function performs residue calculation and forward transform on
195 * input pixels
196 *
197 * @par Description:
198 * Performs residue calculation by subtracting source and prediction and
199 * followed by forward transform
200 *
201 * @param[in] pu1_src
202 * Input 4x4 pixels
203 *
204 * @param[in] pu1_pred
205 * Prediction data
206 *
207 * @param[in] pi2_tmp
208 * Temporary buffer of size 4x4
209 *
210 * @param[out] pi2_dst
211 * Output 4x4 coefficients
212 *
213 * @param[in] src_strd
214 * Input stride
215 *
216 * @param[in] pred_strd
217 * Prediction Stride
218 *
219 * @param[in] dst_strd_chr_flag
220 * Output Stride and Chroma Flag packed in the MS and LS 16-bit
221 *
222 * @returns Void
223 *
224 * @remarks
225 * None
226 *
227 *******************************************************************************
228 */
229
ihevc_resi_trans_4x4(UWORD8 * pu1_src,UWORD8 * pu1_pred,WORD32 * pi4_temp,WORD16 * pi2_dst,WORD32 src_strd,WORD32 pred_strd,WORD32 dst_strd_chr_flag)230 UWORD32 ihevc_resi_trans_4x4(UWORD8 *pu1_src,
231 UWORD8 *pu1_pred,
232 WORD32 *pi4_temp,
233 WORD16 *pi2_dst,
234 WORD32 src_strd,
235 WORD32 pred_strd,
236 WORD32 dst_strd_chr_flag)
237 {
238 WORD32 i;
239 WORD32 e[2], o[2];
240 WORD32 add, shift;
241 WORD32 trans_size;
242 WORD32 *pi4_tmp_orig;
243 WORD16 *pi2_dst_orig;
244 UWORD32 u4_blk_sad=0;
245 WORD32 chroma_flag;
246 WORD32 dst_strd;
247
248 chroma_flag = dst_strd_chr_flag & 1;
249 dst_strd = dst_strd_chr_flag >> 16;
250
251 pi2_dst_orig = pi2_dst;
252 pi4_tmp_orig = pi4_temp;
253 trans_size = TRANS_SIZE_4;
254
255 /* Residue + Forward Transform 1st stage */
256 shift = 1; // log2(iWidth) - 1 + g_uiBitIncrement
257 add = 1 << (shift - 1);
258
259 for(i = 0; i < trans_size; i++)
260 {
261 WORD32 resi_tmp_1, resi_tmp_2;
262
263 /* e and o */
264 resi_tmp_1 = pu1_src[0 + 0*chroma_flag] - pu1_pred[0 + 0*chroma_flag];
265 resi_tmp_2 = pu1_src[3 + 3*chroma_flag] - pu1_pred[3 + 3*chroma_flag];
266 e[0] = resi_tmp_1 + resi_tmp_2;
267 o[0] = resi_tmp_1 - resi_tmp_2;
268 u4_blk_sad += abs(resi_tmp_1);
269 u4_blk_sad += abs(resi_tmp_2);
270
271 resi_tmp_1 = pu1_src[1 + 1*chroma_flag] - pu1_pred[1 + 1*chroma_flag];
272 resi_tmp_2 = pu1_src[2 + 2*chroma_flag] - pu1_pred[2 + 2*chroma_flag];
273 e[1] = resi_tmp_1 + resi_tmp_2;
274 o[1] = resi_tmp_1 - resi_tmp_2;
275 u4_blk_sad += abs(resi_tmp_1);
276 u4_blk_sad += abs(resi_tmp_2);
277
278 pi4_temp[0] = (g_ai2_ihevc_trans_4[0][0] * e[0]
279 + g_ai2_ihevc_trans_4[0][1] * e[1]);// + add) >> shift;
280 pi4_temp[2 * trans_size] = (g_ai2_ihevc_trans_4[2][0] * e[0]
281 + g_ai2_ihevc_trans_4[2][1] * e[1]);// + add) >> shift;
282 pi4_temp[trans_size] = (g_ai2_ihevc_trans_4[1][0] * o[0]
283 + g_ai2_ihevc_trans_4[1][1] * o[1]);// + add) >> shift;
284 pi4_temp[3 * trans_size] = (g_ai2_ihevc_trans_4[3][0] * o[0]
285 + g_ai2_ihevc_trans_4[3][1] * o[1]);// + add) >> shift;
286
287 pu1_src += src_strd;
288 pu1_pred += pred_strd;
289 pi4_temp++;
290 }
291
292 pi4_temp = pi4_tmp_orig;
293 /* Forward Transform 2nd stage */
294 shift = 9; // log2(iHeight) + 6
295 add = 1 << (shift - 1);
296
297 for(i = 0; i < trans_size; i++)
298 {
299
300 /* e and o */
301 e[0] = pi4_temp[0] + pi4_temp[3];
302 o[0] = pi4_temp[0] - pi4_temp[3];
303 e[1] = pi4_temp[1] + pi4_temp[2];
304 o[1] = pi4_temp[1] - pi4_temp[2];
305
306 pi2_dst[0] = (g_ai2_ihevc_trans_4[0][0] * e[0]
307 + g_ai2_ihevc_trans_4[0][1] * e[1] + add) >> shift;
308 pi2_dst[2 * dst_strd] = (g_ai2_ihevc_trans_4[2][0] * e[0]
309 + g_ai2_ihevc_trans_4[2][1] * e[1] + add) >> shift;
310 pi2_dst[dst_strd] = (g_ai2_ihevc_trans_4[1][0] * o[0]
311 + g_ai2_ihevc_trans_4[1][1] * o[1] + add) >> shift;
312 pi2_dst[3 * dst_strd] = (g_ai2_ihevc_trans_4[3][0] * o[0]
313 + g_ai2_ihevc_trans_4[3][1] * o[1] + add) >> shift;
314
315 pi4_temp += trans_size;
316 pi2_dst++;
317 }
318
319 return u4_blk_sad;
320 }
321
/**
 *******************************************************************************
 *
 * @brief
 *  This function performs residue calculation and forward transform on
 *  a 4x4 block whose source samples are 16-bit
 *
 * @par Description:
 *  Performs residue calculation by subtracting prediction from source and
 *  follows it with a two stage forward transform. Each stage rounds and
 *  shifts its results; the first stage stores them in the 16-bit buffer
 *  pi2_tmp (transposed), the second stage writes pi2_dst.
 *
 * @param[in] pi2_src
 *  Input 4x4 samples (16-bit)
 *
 * @param[in] pu1_pred
 *  Prediction data
 *
 * @param[in] pi2_tmp
 *  Temporary buffer of size 4x4
 *
 * @param[out] pi2_dst
 *  Output 4x4 coefficients
 *
 * @param[in] src_strd
 *  Input stride
 *
 * @param[in] pred_strd
 *  Prediction Stride
 *
 * @param[in] dst_strd
 *  Output Stride
 *
 * @returns Void
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */
void ihevc_resi_trans_4x4_16bit(WORD16 *pi2_src,
                                UWORD8 *pu1_pred,
                                WORD16 *pi2_tmp,
                                WORD16 *pi2_dst,
                                WORD32 src_strd,
                                WORD32 pred_strd,
                                WORD32 dst_strd)
{
    WORD32 i;
    WORD32 e[2], o[2];
    WORD32 add, shift;
    WORD32 trans_size = TRANS_SIZE_4;
    WORD16 *pi2_tmp_orig = pi2_tmp;

    /* Residue + Forward Transform 1st stage */
    shift = 1; // log2(iWidth) - 1 + g_uiBitIncrement
    add = 1 << (shift - 1);

    for(i = 0; i < trans_size; i++)
    {
        WORD32 resi_tmp_1, resi_tmp_2;

        /* e and o */
        resi_tmp_1 = pi2_src[0] - pu1_pred[0];
        resi_tmp_2 = pi2_src[3] - pu1_pred[3];
        e[0] = resi_tmp_1 + resi_tmp_2;
        o[0] = resi_tmp_1 - resi_tmp_2;

        resi_tmp_1 = pi2_src[1] - pu1_pred[1];
        resi_tmp_2 = pi2_src[2] - pu1_pred[2];
        e[1] = resi_tmp_1 + resi_tmp_2;
        o[1] = resi_tmp_1 - resi_tmp_2;

        /* Output k of row i is stored at [k * trans_size + i] (transposed) */
        pi2_tmp[0] = (g_ai2_ihevc_trans_4[0][0] * e[0]
                        + g_ai2_ihevc_trans_4[0][1] * e[1] + add) >> shift;
        pi2_tmp[2 * trans_size] = (g_ai2_ihevc_trans_4[2][0] * e[0]
                        + g_ai2_ihevc_trans_4[2][1] * e[1] + add) >> shift;
        pi2_tmp[trans_size] = (g_ai2_ihevc_trans_4[1][0] * o[0]
                        + g_ai2_ihevc_trans_4[1][1] * o[1] + add) >> shift;
        pi2_tmp[3 * trans_size] = (g_ai2_ihevc_trans_4[3][0] * o[0]
                        + g_ai2_ihevc_trans_4[3][1] * o[1] + add) >> shift;

        pi2_src += src_strd;
        pu1_pred += pred_strd;
        pi2_tmp++;
    }

    pi2_tmp = pi2_tmp_orig;

    /* Forward Transform 2nd stage */
    shift = 8; // log2(iHeight) + 6
    add = 1 << (shift - 1);

    for(i = 0; i < trans_size; i++)
    {
        /* e and o */
        e[0] = pi2_tmp[0] + pi2_tmp[3];
        o[0] = pi2_tmp[0] - pi2_tmp[3];
        e[1] = pi2_tmp[1] + pi2_tmp[2];
        o[1] = pi2_tmp[1] - pi2_tmp[2];

        pi2_dst[0] = (g_ai2_ihevc_trans_4[0][0] * e[0]
                        + g_ai2_ihevc_trans_4[0][1] * e[1] + add) >> shift;
        pi2_dst[2 * dst_strd] = (g_ai2_ihevc_trans_4[2][0] * e[0]
                        + g_ai2_ihevc_trans_4[2][1] * e[1] + add) >> shift;
        pi2_dst[dst_strd] = (g_ai2_ihevc_trans_4[1][0] * o[0]
                        + g_ai2_ihevc_trans_4[1][1] * o[1] + add) >> shift;
        pi2_dst[3 * dst_strd] = (g_ai2_ihevc_trans_4[3][0] * o[0]
                        + g_ai2_ihevc_trans_4[3][1] * o[1] + add) >> shift;

        pi2_tmp += trans_size;
        pi2_dst++;
    }
}
401 /**
402 *******************************************************************************
403 *
404 * @brief
405 * This function performs residue calculation and forward transform on
406 * input pixels
407 *
408 * @par Description:
409 * Performs residue calculation by subtracting source and prediction and
410 * followed by forward transform
411 *
412 * @param[in] pu1_src
413 * Input 8x8 pixels
414 *
415 * @param[in] pu1_pred
416 * Prediction data
417 *
418 * @param[in] pi2_tmp
419 * Temporary buffer of size 8x8
420 *
421 * @param[out] pi2_dst
422 * Output 8x8 coefficients
423 *
424 * @param[in] src_strd
425 * Input stride
426 *
427 * @param[in] pred_strd
428 * Prediction Stride
429 *
430 * @param[in] dst_strd_chr_flag
431 * Output Stride and Chroma Flag packed in the MS and LS 16-bit
432 *
433 * @returns Void
434 *
435 * @remarks
436 * None
437 *
438 *******************************************************************************
439 */
440
ihevc_resi_trans_8x8(UWORD8 * pu1_src,UWORD8 * pu1_pred,WORD32 * pi4_temp,WORD16 * pi2_dst,WORD32 src_strd,WORD32 pred_strd,WORD32 dst_strd_chr_flag)441 UWORD32 ihevc_resi_trans_8x8(UWORD8 *pu1_src,
442 UWORD8 *pu1_pred,
443 WORD32 *pi4_temp,
444 WORD16 *pi2_dst,
445 WORD32 src_strd,
446 WORD32 pred_strd,
447 WORD32 dst_strd_chr_flag)
448 {
449 WORD32 i, k;
450 WORD32 e[4], o[4];
451 WORD32 ee[2], eo[2];
452 WORD32 add, shift;
453 WORD32 trans_size;
454 WORD32 *pi4_tmp_orig;
455 // WORD16 *pi2_tmp;
456 WORD16 *pi2_dst_orig;
457 UWORD32 u4_blk_sad=0;
458 WORD32 chroma_flag;
459 WORD32 dst_strd;
460
461 chroma_flag = dst_strd_chr_flag & 1;
462 dst_strd = dst_strd_chr_flag >> 16;
463
464 pi2_dst_orig = pi2_dst;
465 pi4_tmp_orig = pi4_temp;
466 trans_size = TRANS_SIZE_8;
467 /* Residue + Forward Transform 1st stage */
468 shift = 2; // log2(iWidth) - 1 + g_uiBitIncrement
469 add = 1 << (shift - 1);
470
471 for(i = 0; i < trans_size; i++)
472 {
473 WORD32 resi_tmp_1, resi_tmp_2;
474
475 /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
476 /* e and o*/
477 for(k = 0; k < 4; k++)
478 {
479 resi_tmp_1 = pu1_src[k*(1+chroma_flag)] - pu1_pred[k*(1+chroma_flag)];
480 resi_tmp_2 = pu1_src[(7-k)*(1+chroma_flag)] - pu1_pred[(7-k)*(1+chroma_flag)];
481 e[k] = resi_tmp_1 + resi_tmp_2;
482 o[k] = resi_tmp_1 - resi_tmp_2;
483 u4_blk_sad += abs(resi_tmp_1) + abs(resi_tmp_2);
484 }
485 /* ee and eo */
486 ee[0] = e[0] + e[3];
487 eo[0] = e[0] - e[3];
488 ee[1] = e[1] + e[2];
489 eo[1] = e[1] - e[2];
490
491 pi4_temp[0] = (g_ai2_ihevc_trans_8[0][0] * ee[0]
492 + g_ai2_ihevc_trans_8[0][1] * ee[1]);// + add) >> shift;
493 pi4_temp[4 * trans_size] = (g_ai2_ihevc_trans_8[4][0] * ee[0]
494 + g_ai2_ihevc_trans_8[4][1] * ee[1]);// + add) >> shift;
495 pi4_temp[2 * trans_size] = (g_ai2_ihevc_trans_8[2][0] * eo[0]
496 + g_ai2_ihevc_trans_8[2][1] * eo[1]);// + add) >> shift;
497 pi4_temp[6 * trans_size] = (g_ai2_ihevc_trans_8[6][0] * eo[0]
498 + g_ai2_ihevc_trans_8[6][1] * eo[1]);// + add) >> shift;
499
500 pi4_temp[trans_size] = (g_ai2_ihevc_trans_8[1][0] * o[0]
501 + g_ai2_ihevc_trans_8[1][1] * o[1]
502 + g_ai2_ihevc_trans_8[1][2] * o[2]
503 + g_ai2_ihevc_trans_8[1][3] * o[3]);// + add) >> shift;
504 pi4_temp[3 * trans_size] = (g_ai2_ihevc_trans_8[3][0] * o[0]
505 + g_ai2_ihevc_trans_8[3][1] * o[1]
506 + g_ai2_ihevc_trans_8[3][2] * o[2]
507 + g_ai2_ihevc_trans_8[3][3] * o[3]);// + add) >> shift;
508 pi4_temp[5 * trans_size] = (g_ai2_ihevc_trans_8[5][0] * o[0]
509 + g_ai2_ihevc_trans_8[5][1] * o[1]
510 + g_ai2_ihevc_trans_8[5][2] * o[2]
511 + g_ai2_ihevc_trans_8[5][3] * o[3]);// + add) >> shift;
512 pi4_temp[7 * trans_size] = (g_ai2_ihevc_trans_8[7][0] * o[0]
513 + g_ai2_ihevc_trans_8[7][1] * o[1]
514 + g_ai2_ihevc_trans_8[7][2] * o[2]
515 + g_ai2_ihevc_trans_8[7][3] * o[3]);// + add) >> shift;
516
517 pu1_src += src_strd;
518 pu1_pred += pred_strd;
519 pi4_temp++;
520 }
521
522 pi4_temp = pi4_tmp_orig;
523 /* Forward Transform 2nd stage */
524 shift = 11; // log2(iHeight) + 6
525 add = 1 << (shift - 1);
526
527 for(i = 0; i < trans_size; i++)
528 {
529 /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
530 /* e and o*/
531 for(k = 0; k < 4; k++)
532 {
533 e[k] = pi4_temp[k] + pi4_temp[7 - k];
534 o[k] = pi4_temp[k] - pi4_temp[7 - k];
535 }
536 /* ee and eo */
537 ee[0] = e[0] + e[3];
538 eo[0] = e[0] - e[3];
539 ee[1] = e[1] + e[2];
540 eo[1] = e[1] - e[2];
541
542 pi2_dst[0] = (g_ai2_ihevc_trans_8[0][0] * ee[0]
543 + g_ai2_ihevc_trans_8[0][1] * ee[1] + add) >> shift;
544 pi2_dst[4 * dst_strd] = (g_ai2_ihevc_trans_8[4][0] * ee[0]
545 + g_ai2_ihevc_trans_8[4][1] * ee[1] + add) >> shift;
546 pi2_dst[2 * dst_strd] = (g_ai2_ihevc_trans_8[2][0] * eo[0]
547 + g_ai2_ihevc_trans_8[2][1] * eo[1] + add) >> shift;
548 pi2_dst[6 * dst_strd] = (g_ai2_ihevc_trans_8[6][0] * eo[0]
549 + g_ai2_ihevc_trans_8[6][1] * eo[1] + add) >> shift;
550
551 pi2_dst[dst_strd] = (g_ai2_ihevc_trans_8[1][0] * o[0]
552 + g_ai2_ihevc_trans_8[1][1] * o[1]
553 + g_ai2_ihevc_trans_8[1][2] * o[2]
554 + g_ai2_ihevc_trans_8[1][3] * o[3] + add) >> shift;
555 pi2_dst[3 * dst_strd] = (g_ai2_ihevc_trans_8[3][0] * o[0]
556 + g_ai2_ihevc_trans_8[3][1] * o[1]
557 + g_ai2_ihevc_trans_8[3][2] * o[2]
558 + g_ai2_ihevc_trans_8[3][3] * o[3] + add) >> shift;
559 pi2_dst[5 * dst_strd] = (g_ai2_ihevc_trans_8[5][0] * o[0]
560 + g_ai2_ihevc_trans_8[5][1] * o[1]
561 + g_ai2_ihevc_trans_8[5][2] * o[2]
562 + g_ai2_ihevc_trans_8[5][3] * o[3] + add) >> shift;
563 pi2_dst[7 * dst_strd] = (g_ai2_ihevc_trans_8[7][0] * o[0]
564 + g_ai2_ihevc_trans_8[7][1] * o[1]
565 + g_ai2_ihevc_trans_8[7][2] * o[2]
566 + g_ai2_ihevc_trans_8[7][3] * o[3] + add) >> shift;
567
568 pi4_temp += trans_size;
569 pi2_dst++;
570 }
571
572 return u4_blk_sad;
573 }
574
/**
 *******************************************************************************
 *
 * @brief
 *  This function performs residue calculation and forward transform on
 *  an 8x8 block whose source samples are 16-bit
 *
 * @par Description:
 *  Performs residue calculation by subtracting prediction from source and
 *  follows it with a two stage forward transform. Each stage rounds and
 *  shifts its results; the first stage stores them in the 16-bit buffer
 *  pi2_tmp (transposed), the second stage writes pi2_dst.
 *
 * @param[in] pi2_src
 *  Input 8x8 samples (16-bit)
 *
 * @param[in] pu1_pred
 *  Prediction data
 *
 * @param[in] pi2_tmp
 *  Temporary buffer of size 8x8
 *
 * @param[out] pi2_dst
 *  Output 8x8 coefficients
 *
 * @param[in] src_strd
 *  Input stride
 *
 * @param[in] pred_strd
 *  Prediction Stride
 *
 * @param[in] dst_strd
 *  Output Stride
 *
 * @returns Void
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */
void ihevc_resi_trans_8x8_16bit(WORD16 *pi2_src,
                                UWORD8 *pu1_pred,
                                WORD16 *pi2_tmp,
                                WORD16 *pi2_dst,
                                WORD32 src_strd,
                                WORD32 pred_strd,
                                WORD32 dst_strd)
{
    WORD32 i, k;
    WORD32 e[4], o[4];
    WORD32 ee[2], eo[2];
    WORD32 add, shift;
    WORD32 trans_size = TRANS_SIZE_8;
    WORD16 *pi2_tmp_orig = pi2_tmp;

    /* Residue + Forward Transform 1st stage */
    shift = 2; // log2(iWidth) - 1 + g_uiBitIncrement
    add = 1 << (shift - 1);

    for(i = 0; i < trans_size; i++)
    {
        /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
        /* e and o*/
        for(k = 0; k < 4; k++)
        {
            WORD32 resi_tmp_1 = pi2_src[k] - pu1_pred[k];
            WORD32 resi_tmp_2 = pi2_src[7 - k] - pu1_pred[7 - k];

            e[k] = resi_tmp_1 + resi_tmp_2;
            o[k] = resi_tmp_1 - resi_tmp_2;
        }
        /* ee and eo */
        ee[0] = e[0] + e[3];
        eo[0] = e[0] - e[3];
        ee[1] = e[1] + e[2];
        eo[1] = e[1] - e[2];

        /* Output k of row i is stored at [k * trans_size + i] (transposed) */
        pi2_tmp[0] = (g_ai2_ihevc_trans_8[0][0] * ee[0]
                        + g_ai2_ihevc_trans_8[0][1] * ee[1] + add) >> shift;
        pi2_tmp[4 * trans_size] = (g_ai2_ihevc_trans_8[4][0] * ee[0]
                        + g_ai2_ihevc_trans_8[4][1] * ee[1] + add) >> shift;
        pi2_tmp[2 * trans_size] = (g_ai2_ihevc_trans_8[2][0] * eo[0]
                        + g_ai2_ihevc_trans_8[2][1] * eo[1] + add) >> shift;
        pi2_tmp[6 * trans_size] = (g_ai2_ihevc_trans_8[6][0] * eo[0]
                        + g_ai2_ihevc_trans_8[6][1] * eo[1] + add) >> shift;

        /* Odd outputs 1, 3, 5, 7 */
        for(k = 1; k < 8; k += 2)
        {
            pi2_tmp[k * trans_size] = (g_ai2_ihevc_trans_8[k][0] * o[0]
                            + g_ai2_ihevc_trans_8[k][1] * o[1]
                            + g_ai2_ihevc_trans_8[k][2] * o[2]
                            + g_ai2_ihevc_trans_8[k][3] * o[3] + add) >> shift;
        }

        pi2_src += src_strd;
        pu1_pred += pred_strd;
        pi2_tmp++;
    }

    pi2_tmp = pi2_tmp_orig;

    /* Forward Transform 2nd stage */
    shift = 9; // log2(iHeight) + 6
    add = 1 << (shift - 1);

    for(i = 0; i < trans_size; i++)
    {
        /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
        /* e and o*/
        for(k = 0; k < 4; k++)
        {
            e[k] = pi2_tmp[k] + pi2_tmp[7 - k];
            o[k] = pi2_tmp[k] - pi2_tmp[7 - k];
        }
        /* ee and eo */
        ee[0] = e[0] + e[3];
        eo[0] = e[0] - e[3];
        ee[1] = e[1] + e[2];
        eo[1] = e[1] - e[2];

        pi2_dst[0] = (g_ai2_ihevc_trans_8[0][0] * ee[0]
                        + g_ai2_ihevc_trans_8[0][1] * ee[1] + add) >> shift;
        pi2_dst[4 * dst_strd] = (g_ai2_ihevc_trans_8[4][0] * ee[0]
                        + g_ai2_ihevc_trans_8[4][1] * ee[1] + add) >> shift;
        pi2_dst[2 * dst_strd] = (g_ai2_ihevc_trans_8[2][0] * eo[0]
                        + g_ai2_ihevc_trans_8[2][1] * eo[1] + add) >> shift;
        pi2_dst[6 * dst_strd] = (g_ai2_ihevc_trans_8[6][0] * eo[0]
                        + g_ai2_ihevc_trans_8[6][1] * eo[1] + add) >> shift;

        /* Odd outputs 1, 3, 5, 7 */
        for(k = 1; k < 8; k += 2)
        {
            pi2_dst[k * dst_strd] = (g_ai2_ihevc_trans_8[k][0] * o[0]
                            + g_ai2_ihevc_trans_8[k][1] * o[1]
                            + g_ai2_ihevc_trans_8[k][2] * o[2]
                            + g_ai2_ihevc_trans_8[k][3] * o[3] + add) >> shift;
        }

        pi2_tmp += trans_size;
        pi2_dst++;
    }
}
698 /**
699 *******************************************************************************
700 *
701 * @brief
702 * This function performs residue calculation and forward transform on
703 * input pixels
704 *
705 * @par Description:
706 * Performs residue calculation by subtracting source and prediction and
707 * followed by forward transform
708 *
709 * @param[in] pu1_src
710 * Input 16x16 pixels
711 *
712 * @param[in] pu1_pred
713 * Prediction data
714 *
715 * @param[in] pi2_tmp
716 * Temporary buffer of size 16x16
717 *
718 * @param[out] pi2_dst
719 * Output 16x16 coefficients
720 *
721 * @param[in] src_strd
722 * Input stride
723 *
724 * @param[in] pred_strd
725 * Prediction Stride
726 *
727 * @param[in] dst_strd_chr_flag
728 * Output Stride and Chroma Flag packed in the MS and LS 16-bit
729 *
730 * @returns Void
731 *
732 * @remarks
733 * None
734 *
735 *******************************************************************************
736 */
737
ihevc_resi_trans_16x16(UWORD8 * pu1_src,UWORD8 * pu1_pred,WORD32 * pi4_temp,WORD16 * pi2_dst,WORD32 src_strd,WORD32 pred_strd,WORD32 dst_strd_chr_flag)738 UWORD32 ihevc_resi_trans_16x16(UWORD8 *pu1_src,
739 UWORD8 *pu1_pred,
740 WORD32 *pi4_temp,
741 WORD16 *pi2_dst,
742 WORD32 src_strd,
743 WORD32 pred_strd,
744 WORD32 dst_strd_chr_flag)
745 {
746 WORD32 i, k;
747 WORD32 e[8], o[8];
748 WORD32 ee[4], eo[4];
749 WORD32 eee[2], eeo[2];
750 WORD32 add, shift;
751 WORD32 trans_size;
752 WORD32 *pi4_tmp_orig;
753 WORD16 *pi2_dst_orig;
754 UWORD32 u4_blk_sad = 0;
755 WORD32 chroma_flag;
756 WORD32 dst_strd;
757
758 chroma_flag = dst_strd_chr_flag & 1;
759 dst_strd = dst_strd_chr_flag >> 16;
760
761 pi2_dst_orig = pi2_dst;
762 pi4_tmp_orig = pi4_temp;
763 trans_size = TRANS_SIZE_16;
764 /* Residue + Forward Transform 1st stage */
765 shift = 3; // log2(iWidth) - 1 + g_uiBitIncrement
766 add = 1 << (shift - 1);
767
768 for(i = 0; i < trans_size; i++)
769 {
770 WORD32 resi_tmp_1, resi_tmp_2;
771 /* e and o*/
772 for(k = 0; k < 8; k++)
773 {
774 resi_tmp_1 = pu1_src[k*(1+chroma_flag)] - pu1_pred[k*(1+chroma_flag)];
775 resi_tmp_2 = pu1_src[(15-k)*(1+chroma_flag)] - pu1_pred[(15-k)*(1+chroma_flag)];
776 e[k] = resi_tmp_1 + resi_tmp_2;
777 o[k] = resi_tmp_1 - resi_tmp_2;
778 u4_blk_sad += abs(resi_tmp_1) + abs(resi_tmp_2);
779 }
780 /* ee and eo */
781 for(k = 0; k < 4; k++)
782 {
783 ee[k] = e[k] + e[7 - k];
784 eo[k] = e[k] - e[7 - k];
785 }
786 /* eee and eeo */
787 eee[0] = ee[0] + ee[3];
788 eeo[0] = ee[0] - ee[3];
789 eee[1] = ee[1] + ee[2];
790 eeo[1] = ee[1] - ee[2];
791
792 pi4_temp[0] = (g_ai2_ihevc_trans_16[0][0] * eee[0]
793 + g_ai2_ihevc_trans_16[0][1] * eee[1]);// + add) >> shift;
794 pi4_temp[8 * trans_size] = (g_ai2_ihevc_trans_16[8][0] * eee[0]
795 + g_ai2_ihevc_trans_16[8][1] * eee[1]);// + add) >> shift;
796 pi4_temp[4 * trans_size] = (g_ai2_ihevc_trans_16[4][0] * eeo[0]
797 + g_ai2_ihevc_trans_16[4][1] * eeo[1]);// + add) >> shift;
798 pi4_temp[12 * trans_size] = (g_ai2_ihevc_trans_16[12][0] * eeo[0]
799 + g_ai2_ihevc_trans_16[12][1] * eeo[1]);// + add) >> shift;
800
801 for(k = 2; k < 16; k += 4)
802 {
803 pi4_temp[k * trans_size] = (g_ai2_ihevc_trans_16[k][0] * eo[0]
804 + g_ai2_ihevc_trans_16[k][1] * eo[1]
805 + g_ai2_ihevc_trans_16[k][2] * eo[2]
806 + g_ai2_ihevc_trans_16[k][3] * eo[3]);// + add)>> shift;
807
808 }
809
810 for(k = 1; k < 16; k += 2)
811 {
812 pi4_temp[k * trans_size] = (g_ai2_ihevc_trans_16[k][0] * o[0]
813 + g_ai2_ihevc_trans_16[k][1] * o[1]
814 + g_ai2_ihevc_trans_16[k][2] * o[2]
815 + g_ai2_ihevc_trans_16[k][3] * o[3]
816 + g_ai2_ihevc_trans_16[k][4] * o[4]
817 + g_ai2_ihevc_trans_16[k][5] * o[5]
818 + g_ai2_ihevc_trans_16[k][6] * o[6]
819 + g_ai2_ihevc_trans_16[k][7] * o[7]);// + add) >> shift;
820 }
821 pu1_src += src_strd;
822 pu1_pred += pred_strd;
823 pi4_temp++;
824 }
825
826 pi4_temp = pi4_tmp_orig;
827 /* Forward Transform 2nd stage */
828 shift = 13; // log2(iHeight) + 6
829 add = 1 << (shift - 1);
830
831 for(i = 0; i < TRANS_SIZE_16; i++)
832 {
833 /* e and o*/
834 for(k = 0; k < 8; k++)
835 {
836 e[k] = pi4_temp[k] + pi4_temp[15 - k];
837 o[k] = pi4_temp[k] - pi4_temp[15 - k];
838 }
839 /* ee and eo */
840 for(k = 0; k < 4; k++)
841 {
842 ee[k] = e[k] + e[7 - k];
843 eo[k] = e[k] - e[7 - k];
844 }
845 /* eee and eeo */
846 eee[0] = ee[0] + ee[3];
847 eeo[0] = ee[0] - ee[3];
848 eee[1] = ee[1] + ee[2];
849 eeo[1] = ee[1] - ee[2];
850
851 pi2_dst[0] = (g_ai2_ihevc_trans_16[0][0] * eee[0]
852 + g_ai2_ihevc_trans_16[0][1] * eee[1] + add) >> shift;
853 pi2_dst[8 * dst_strd] = (g_ai2_ihevc_trans_16[8][0] * eee[0]
854 + g_ai2_ihevc_trans_16[8][1] * eee[1] + add) >> shift;
855 pi2_dst[4 * dst_strd] = (g_ai2_ihevc_trans_16[4][0] * eeo[0]
856 + g_ai2_ihevc_trans_16[4][1] * eeo[1] + add) >> shift;
857 pi2_dst[12 * dst_strd] = (g_ai2_ihevc_trans_16[12][0] * eeo[0]
858 + g_ai2_ihevc_trans_16[12][1] * eeo[1] + add) >> shift;
859
860 for(k = 2; k < 16; k += 4)
861 {
862 pi2_dst[k * dst_strd] = (g_ai2_ihevc_trans_16[k][0] * eo[0]
863 + g_ai2_ihevc_trans_16[k][1] * eo[1]
864 + g_ai2_ihevc_trans_16[k][2] * eo[2]
865 + g_ai2_ihevc_trans_16[k][3] * eo[3] + add)
866 >> shift;
867 }
868
869 for(k = 1; k < 16; k += 2)
870 {
871 pi2_dst[k * dst_strd] = (g_ai2_ihevc_trans_16[k][0] * o[0]
872 + g_ai2_ihevc_trans_16[k][1] * o[1]
873 + g_ai2_ihevc_trans_16[k][2] * o[2]
874 + g_ai2_ihevc_trans_16[k][3] * o[3]
875 + g_ai2_ihevc_trans_16[k][4] * o[4]
876 + g_ai2_ihevc_trans_16[k][5] * o[5]
877 + g_ai2_ihevc_trans_16[k][6] * o[6]
878 + g_ai2_ihevc_trans_16[k][7] * o[7] + add) >> shift;
879 }
880
881 pi4_temp += trans_size;
882 pi2_dst++;
883 }
884
885 return u4_blk_sad;
886 }
887
888
/**
 *******************************************************************************
 *
 * @brief
 *  This function performs residue calculation and forward transform on
 *  a 16x16 block whose source samples are 16-bit
 *
 * @par Description:
 *  Performs residue calculation by subtracting prediction from source and
 *  follows it with a two stage forward transform. Each stage rounds and
 *  shifts its results; the first stage stores them in the 16-bit buffer
 *  pi2_tmp (transposed), the second stage writes pi2_dst.
 *
 * @param[in] pi2_src
 *  Input 16x16 samples (16-bit)
 *
 * @param[in] pu1_pred
 *  Prediction data
 *
 * @param[in] pi2_tmp
 *  Temporary buffer of size 16x16
 *
 * @param[out] pi2_dst
 *  Output 16x16 coefficients
 *
 * @param[in] src_strd
 *  Input stride
 *
 * @param[in] pred_strd
 *  Prediction Stride
 *
 * @param[in] dst_strd
 *  Output Stride
 *
 * @returns Void
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */
void ihevc_resi_trans_16x16_16bit(WORD16 *pi2_src,
                                  UWORD8 *pu1_pred,
                                  WORD16 *pi2_tmp,
                                  WORD16 *pi2_dst,
                                  WORD32 src_strd,
                                  WORD32 pred_strd,
                                  WORD32 dst_strd)
{
    WORD32 i, k;
    WORD32 e[8], o[8];
    WORD32 ee[4], eo[4];
    WORD32 eee[2], eeo[2];
    WORD32 add, shift;
    WORD32 trans_size = TRANS_SIZE_16;
    WORD16 *pi2_tmp_orig = pi2_tmp;

    /* Residue + Forward Transform 1st stage */
    shift = 3; // log2(iWidth) - 1 + g_uiBitIncrement
    add = 1 << (shift - 1);

    for(i = 0; i < trans_size; i++)
    {
        /* e and o*/
        for(k = 0; k < 8; k++)
        {
            WORD32 resi_tmp_1 = pi2_src[k] - pu1_pred[k];
            WORD32 resi_tmp_2 = pi2_src[15 - k] - pu1_pred[15 - k];

            e[k] = resi_tmp_1 + resi_tmp_2;
            o[k] = resi_tmp_1 - resi_tmp_2;
        }
        /* ee and eo */
        for(k = 0; k < 4; k++)
        {
            ee[k] = e[k] + e[7 - k];
            eo[k] = e[k] - e[7 - k];
        }
        /* eee and eeo */
        eee[0] = ee[0] + ee[3];
        eeo[0] = ee[0] - ee[3];
        eee[1] = ee[1] + ee[2];
        eeo[1] = ee[1] - ee[2];

        /* Output k of row i is stored at [k * trans_size + i] (transposed) */
        pi2_tmp[0] = (g_ai2_ihevc_trans_16[0][0] * eee[0]
                        + g_ai2_ihevc_trans_16[0][1] * eee[1] + add) >> shift;
        pi2_tmp[8 * trans_size] = (g_ai2_ihevc_trans_16[8][0] * eee[0]
                        + g_ai2_ihevc_trans_16[8][1] * eee[1] + add) >> shift;
        pi2_tmp[4 * trans_size] = (g_ai2_ihevc_trans_16[4][0] * eeo[0]
                        + g_ai2_ihevc_trans_16[4][1] * eeo[1] + add) >> shift;
        pi2_tmp[12 * trans_size] = (g_ai2_ihevc_trans_16[12][0] * eeo[0]
                        + g_ai2_ihevc_trans_16[12][1] * eeo[1] + add) >> shift;

        /* Outputs 2, 6, 10, 14 */
        for(k = 2; k < 16; k += 4)
        {
            pi2_tmp[k * trans_size] = (g_ai2_ihevc_trans_16[k][0] * eo[0]
                            + g_ai2_ihevc_trans_16[k][1] * eo[1]
                            + g_ai2_ihevc_trans_16[k][2] * eo[2]
                            + g_ai2_ihevc_trans_16[k][3] * eo[3] + add)
                            >> shift;
        }

        /* Odd outputs */
        for(k = 1; k < 16; k += 2)
        {
            pi2_tmp[k * trans_size] = (g_ai2_ihevc_trans_16[k][0] * o[0]
                            + g_ai2_ihevc_trans_16[k][1] * o[1]
                            + g_ai2_ihevc_trans_16[k][2] * o[2]
                            + g_ai2_ihevc_trans_16[k][3] * o[3]
                            + g_ai2_ihevc_trans_16[k][4] * o[4]
                            + g_ai2_ihevc_trans_16[k][5] * o[5]
                            + g_ai2_ihevc_trans_16[k][6] * o[6]
                            + g_ai2_ihevc_trans_16[k][7] * o[7] + add) >> shift;
        }

        pi2_src += src_strd;
        pu1_pred += pred_strd;
        pi2_tmp++;
    }

    pi2_tmp = pi2_tmp_orig;

    /* Forward Transform 2nd stage */
    shift = 10; // log2(iHeight) + 6
    add = 1 << (shift - 1);

    for(i = 0; i < trans_size; i++)
    {
        /* e and o*/
        for(k = 0; k < 8; k++)
        {
            e[k] = pi2_tmp[k] + pi2_tmp[15 - k];
            o[k] = pi2_tmp[k] - pi2_tmp[15 - k];
        }
        /* ee and eo */
        for(k = 0; k < 4; k++)
        {
            ee[k] = e[k] + e[7 - k];
            eo[k] = e[k] - e[7 - k];
        }
        /* eee and eeo */
        eee[0] = ee[0] + ee[3];
        eeo[0] = ee[0] - ee[3];
        eee[1] = ee[1] + ee[2];
        eeo[1] = ee[1] - ee[2];

        pi2_dst[0] = (g_ai2_ihevc_trans_16[0][0] * eee[0]
                        + g_ai2_ihevc_trans_16[0][1] * eee[1] + add) >> shift;
        pi2_dst[8 * dst_strd] = (g_ai2_ihevc_trans_16[8][0] * eee[0]
                        + g_ai2_ihevc_trans_16[8][1] * eee[1] + add) >> shift;
        pi2_dst[4 * dst_strd] = (g_ai2_ihevc_trans_16[4][0] * eeo[0]
                        + g_ai2_ihevc_trans_16[4][1] * eeo[1] + add) >> shift;
        pi2_dst[12 * dst_strd] = (g_ai2_ihevc_trans_16[12][0] * eeo[0]
                        + g_ai2_ihevc_trans_16[12][1] * eeo[1] + add) >> shift;

        /* Outputs 2, 6, 10, 14 */
        for(k = 2; k < 16; k += 4)
        {
            pi2_dst[k * dst_strd] = (g_ai2_ihevc_trans_16[k][0] * eo[0]
                            + g_ai2_ihevc_trans_16[k][1] * eo[1]
                            + g_ai2_ihevc_trans_16[k][2] * eo[2]
                            + g_ai2_ihevc_trans_16[k][3] * eo[3] + add)
                            >> shift;
        }

        /* Odd outputs */
        for(k = 1; k < 16; k += 2)
        {
            pi2_dst[k * dst_strd] = (g_ai2_ihevc_trans_16[k][0] * o[0]
                            + g_ai2_ihevc_trans_16[k][1] * o[1]
                            + g_ai2_ihevc_trans_16[k][2] * o[2]
                            + g_ai2_ihevc_trans_16[k][3] * o[3]
                            + g_ai2_ihevc_trans_16[k][4] * o[4]
                            + g_ai2_ihevc_trans_16[k][5] * o[5]
                            + g_ai2_ihevc_trans_16[k][6] * o[6]
                            + g_ai2_ihevc_trans_16[k][7] * o[7] + add) >> shift;
        }

        pi2_tmp += trans_size;
        pi2_dst++;
    }
}
1029
1030 /**
1031 *******************************************************************************
1032 *
1033 * @brief
1034 * This function performs residue calculation and forward transform on
1035 * input pixels
1036 *
1037 * @par Description:
1038 * Performs residue calculation by subtracting source and prediction and
1039 * followed by forward transform
1040 *
1041 * @param[in] pu1_src
1042 * Input 32x32 pixels
1043 *
1044 * @param[in] pu1_pred
1045 * Prediction data
1046 *
1047 * @param[in] pi2_tmp
1048 * Temporary buffer of size 32x32
1049 *
1050 * @param[out] pi2_dst
1051 * Output 32x32 coefficients
1052 *
1053 * @param[in] src_strd
1054 * Input stride
1055 *
1056 * @param[in] pred_strd
1057 * Prediction Stride
1058 *
1059 * @param[in] dst_strd_chr_flag
1060 * Output Stride and Chroma Flag packed in the MS and LS 16-bit
1061 *
1062 * @returns Void
1063 *
1064 * @remarks
1065 * None
1066 *
1067 *******************************************************************************
1068 */
1069
ihevc_resi_trans_32x32(UWORD8 * pu1_src,UWORD8 * pu1_pred,WORD32 * pi4_temp,WORD16 * pi2_dst,WORD32 src_strd,WORD32 pred_strd,WORD32 dst_strd_chr_flag)1070 UWORD32 ihevc_resi_trans_32x32(UWORD8 *pu1_src,
1071 UWORD8 *pu1_pred,
1072 WORD32 *pi4_temp,
1073 WORD16 *pi2_dst,
1074 WORD32 src_strd,
1075 WORD32 pred_strd,
1076 WORD32 dst_strd_chr_flag)
1077 {
1078 WORD32 i, k;
1079 WORD32 e[16], o[16];
1080 WORD32 ee[8], eo[8];
1081 WORD32 eee[4], eeo[4];
1082 WORD32 eeee[2], eeeo[2];
1083 WORD32 add, shift;
1084 WORD32 trans_size;
1085 WORD32 *pi4_tmp_orig;
1086 WORD16 *pi2_dst_orig;
1087 UWORD32 u4_blk_sad = 0 ;
1088 WORD32 chroma_flag;
1089 WORD32 dst_strd;
1090
1091 chroma_flag = dst_strd_chr_flag & 1;
1092 dst_strd = dst_strd_chr_flag >> 16;
1093
1094 pi2_dst_orig = pi2_dst;
1095 pi4_tmp_orig = pi4_temp;
1096 trans_size = TRANS_SIZE_32;
1097 /* Residue + Forward Transform 1st stage */
1098 /* Made to zero to match with intrinsics */
1099 shift = 0; // 4 : log2(iWidth) - 1 + g_uiBitIncrement
1100 add = 0 ; //1 << (shift - 1);
1101
1102 for(i = 0; i < trans_size; i++)
1103 {
1104 WORD32 resi_tmp_1, resi_tmp_2;
1105 /* e and o*/
1106 for(k = 0; k < 16; k++)
1107 {
1108 resi_tmp_1 = pu1_src[k] - pu1_pred[k];
1109 resi_tmp_2 = pu1_src[31 - k] - pu1_pred[31 - k];
1110 e[k] = resi_tmp_1 + resi_tmp_2;
1111 o[k] = resi_tmp_1 - resi_tmp_2;
1112 u4_blk_sad += abs(resi_tmp_1) + abs(resi_tmp_2);
1113 }
1114 /* ee and eo */
1115 for(k = 0; k < 8; k++)
1116 {
1117 ee[k] = e[k] + e[15 - k];
1118 eo[k] = e[k] - e[15 - k];
1119 }
1120 /* eee and eeo */
1121 for(k = 0; k < 4; k++)
1122 {
1123 eee[k] = ee[k] + ee[7 - k];
1124 eeo[k] = ee[k] - ee[7 - k];
1125 }
1126 /* eeee and eeeo */
1127 eeee[0] = eee[0] + eee[3];
1128 eeeo[0] = eee[0] - eee[3];
1129 eeee[1] = eee[1] + eee[2];
1130 eeeo[1] = eee[1] - eee[2];
1131
1132 pi4_temp[0] = (g_ai2_ihevc_trans_32[0][0] * eeee[0]
1133 + g_ai2_ihevc_trans_32[0][1] * eeee[1]);// + add) >> shift;
1134 pi4_temp[16 * trans_size] = (g_ai2_ihevc_trans_32[16][0] * eeee[0]
1135 + g_ai2_ihevc_trans_32[16][1] * eeee[1]);// + add) >> shift;
1136 pi4_temp[8 * trans_size] = (g_ai2_ihevc_trans_32[8][0] * eeeo[0]
1137 + g_ai2_ihevc_trans_32[8][1] * eeeo[1]);// + add) >> shift;
1138 pi4_temp[24 * trans_size] = (g_ai2_ihevc_trans_32[24][0] * eeeo[0]
1139 + g_ai2_ihevc_trans_32[24][1] * eeeo[1]);// + add) >> shift;
1140 for(k = 4; k < 32; k += 8)
1141 {
1142 pi4_temp[k * trans_size] = (g_ai2_ihevc_trans_32[k][0] * eeo[0]
1143 + g_ai2_ihevc_trans_32[k][1] * eeo[1]
1144 + g_ai2_ihevc_trans_32[k][2] * eeo[2]
1145 + g_ai2_ihevc_trans_32[k][3] * eeo[3]);// + add)>> shift;
1146 }
1147 for(k = 2; k < 32; k += 4)
1148 {
1149 pi4_temp[k * trans_size] = (g_ai2_ihevc_trans_32[k][0] * eo[0]
1150 + g_ai2_ihevc_trans_32[k][1] * eo[1]
1151 + g_ai2_ihevc_trans_32[k][2] * eo[2]
1152 + g_ai2_ihevc_trans_32[k][3] * eo[3]
1153 + g_ai2_ihevc_trans_32[k][4] * eo[4]
1154 + g_ai2_ihevc_trans_32[k][5] * eo[5]
1155 + g_ai2_ihevc_trans_32[k][6] * eo[6]
1156 + g_ai2_ihevc_trans_32[k][7] * eo[7]);// + add)>> shift;
1157 }
1158 for(k = 1; k < 32; k += 2)
1159 {
1160 pi4_temp[k * trans_size] = (g_ai2_ihevc_trans_32[k][0] * o[0]
1161 + g_ai2_ihevc_trans_32[k][1] * o[1]
1162 + g_ai2_ihevc_trans_32[k][2] * o[2]
1163 + g_ai2_ihevc_trans_32[k][3] * o[3]
1164 + g_ai2_ihevc_trans_32[k][4] * o[4]
1165 + g_ai2_ihevc_trans_32[k][5] * o[5]
1166 + g_ai2_ihevc_trans_32[k][6] * o[6]
1167 + g_ai2_ihevc_trans_32[k][7] * o[7]
1168 + g_ai2_ihevc_trans_32[k][8] * o[8]
1169 + g_ai2_ihevc_trans_32[k][9] * o[9]
1170 + g_ai2_ihevc_trans_32[k][10] * o[10]
1171 + g_ai2_ihevc_trans_32[k][11] * o[11]
1172 + g_ai2_ihevc_trans_32[k][12] * o[12]
1173 + g_ai2_ihevc_trans_32[k][13] * o[13]
1174 + g_ai2_ihevc_trans_32[k][14] * o[14]
1175 + g_ai2_ihevc_trans_32[k][15] * o[15]);// + add) >> shift;
1176 }
1177 pu1_src += src_strd;
1178 pu1_pred += pred_strd;
1179 pi4_temp++;
1180 }
1181
1182 pi4_temp = pi4_tmp_orig;
1183 /* Forward Transform 2nd stage */
1184 shift = 15; // log2(iHeight) + 6
1185 add = 1 << (shift - 1);
1186
1187 for(i = 0; i < TRANS_SIZE_32; i++)
1188 {
1189 /* e and o*/
1190 for(k = 0; k < 16; k++)
1191 {
1192 e[k] = pi4_temp[k] + pi4_temp[31 - k];
1193 o[k] = pi4_temp[k] - pi4_temp[31 - k];
1194 }
1195 /* ee and eo */
1196 for(k = 0; k < 8; k++)
1197 {
1198 ee[k] = e[k] + e[15 - k];
1199 eo[k] = e[k] - e[15 - k];
1200 }
1201 /* eee and eeo */
1202 for(k = 0; k < 4; k++)
1203 {
1204 eee[k] = ee[k] + ee[7 - k];
1205 eeo[k] = ee[k] - ee[7 - k];
1206 }
1207 /* eeee and eeeo */
1208 eeee[0] = eee[0] + eee[3];
1209 eeeo[0] = eee[0] - eee[3];
1210 eeee[1] = eee[1] + eee[2];
1211 eeeo[1] = eee[1] - eee[2];
1212
1213 pi2_dst[0] = (g_ai2_ihevc_trans_32[0][0] * eeee[0]
1214 + g_ai2_ihevc_trans_32[0][1] * eeee[1] + add) >> shift;
1215 pi2_dst[16 * dst_strd] = (g_ai2_ihevc_trans_32[16][0] * eeee[0]
1216 + g_ai2_ihevc_trans_32[16][1] * eeee[1] + add) >> shift;
1217 pi2_dst[8 * dst_strd] = (g_ai2_ihevc_trans_32[8][0] * eeeo[0]
1218 + g_ai2_ihevc_trans_32[8][1] * eeeo[1] + add) >> shift;
1219 pi2_dst[24 * dst_strd] = (g_ai2_ihevc_trans_32[24][0] * eeeo[0]
1220 + g_ai2_ihevc_trans_32[24][1] * eeeo[1] + add) >> shift;
1221 for(k = 4; k < 32; k += 8)
1222 {
1223 pi2_dst[k * dst_strd] = (g_ai2_ihevc_trans_32[k][0] * eeo[0]
1224 + g_ai2_ihevc_trans_32[k][1] * eeo[1]
1225 + g_ai2_ihevc_trans_32[k][2] * eeo[2]
1226 + g_ai2_ihevc_trans_32[k][3] * eeo[3] + add)
1227 >> shift;
1228 }
1229 for(k = 2; k < 32; k += 4)
1230 {
1231 pi2_dst[k * dst_strd] = (g_ai2_ihevc_trans_32[k][0] * eo[0]
1232 + g_ai2_ihevc_trans_32[k][1] * eo[1]
1233 + g_ai2_ihevc_trans_32[k][2] * eo[2]
1234 + g_ai2_ihevc_trans_32[k][3] * eo[3]
1235 + g_ai2_ihevc_trans_32[k][4] * eo[4]
1236 + g_ai2_ihevc_trans_32[k][5] * eo[5]
1237 + g_ai2_ihevc_trans_32[k][6] * eo[6]
1238 + g_ai2_ihevc_trans_32[k][7] * eo[7] + add)
1239 >> shift;
1240 }
1241 for(k = 1; k < 32; k += 2)
1242 {
1243 pi2_dst[k * dst_strd] = (g_ai2_ihevc_trans_32[k][0] * o[0]
1244 + g_ai2_ihevc_trans_32[k][1] * o[1]
1245 + g_ai2_ihevc_trans_32[k][2] * o[2]
1246 + g_ai2_ihevc_trans_32[k][3] * o[3]
1247 + g_ai2_ihevc_trans_32[k][4] * o[4]
1248 + g_ai2_ihevc_trans_32[k][5] * o[5]
1249 + g_ai2_ihevc_trans_32[k][6] * o[6]
1250 + g_ai2_ihevc_trans_32[k][7] * o[7]
1251 + g_ai2_ihevc_trans_32[k][8] * o[8]
1252 + g_ai2_ihevc_trans_32[k][9] * o[9]
1253 + g_ai2_ihevc_trans_32[k][10] * o[10]
1254 + g_ai2_ihevc_trans_32[k][11] * o[11]
1255 + g_ai2_ihevc_trans_32[k][12] * o[12]
1256 + g_ai2_ihevc_trans_32[k][13] * o[13]
1257 + g_ai2_ihevc_trans_32[k][14] * o[14]
1258 + g_ai2_ihevc_trans_32[k][15] * o[15] + add)
1259 >> shift;
1260 }
1261
1262 pi4_temp += trans_size;
1263 pi2_dst++;
1264 }
1265
1266 return u4_blk_sad;
1267 }
1268
1269
1270
/**
*******************************************************************************
*
* @brief
*  This function performs residue calculation and forward transform on
*  16-bit input samples
*
* @par Description:
*  Performs residue calculation by subtracting prediction from source,
*  followed by a two stage forward transform (each input row first, then
*  each row of the intermediate buffer)
*
* @param[in] pi2_src
*  Input 32x32 samples (16-bit)
*
* @param[in] pu1_pred
*  Prediction data
*
* @param[in] pi2_tmp
*  Temporary buffer of 32x32 WORD16 elements; receives the first stage
*  output
*
* @param[out] pi2_dst
*  Output 32x32 coefficients
*
* @param[in] src_strd
*  Input stride, in WORD16 elements
*
* @param[in] pred_strd
*  Prediction Stride
*
* @param[in] dst_strd
*  Output Stride
*
* @returns Void
*
* @remarks
*  First stage output is rounded with a shift of 4 and second stage output
*  with a shift of 11; both are stored as WORD16
*
*******************************************************************************
*/
void ihevc_resi_trans_32x32_16bit(WORD16 *pi2_src,
                                  UWORD8 *pu1_pred,
                                  WORD16 *pi2_tmp,
                                  WORD16 *pi2_dst,
                                  WORD32 src_strd,
                                  WORD32 pred_strd,
                                  WORD32 dst_strd)
{
    WORD32 i, j, k;
    WORD32 e[16], o[16];
    WORD32 ee[8], eo[8];
    WORD32 eee[4], eeo[4];
    WORD32 eeee[2], eeeo[2];
    WORD32 sum;
    WORD32 add, shift;
    WORD32 trans_size;
    WORD16 *pi2_tmp_orig;

    pi2_tmp_orig = pi2_tmp;
    trans_size = TRANS_SIZE_32;

    /* Residue + Forward Transform 1st stage */
    shift = 4; // log2(iWidth) - 1 + g_uiBitIncrement
    add = 1 << (shift - 1);

    for(i = 0; i < trans_size; i++)
    {
        /* e and o */
        for(k = 0; k < 16; k++)
        {
            WORD32 resi_tmp_1 = pi2_src[k] - pu1_pred[k];
            WORD32 resi_tmp_2 = pi2_src[31 - k] - pu1_pred[31 - k];

            e[k] = resi_tmp_1 + resi_tmp_2;
            o[k] = resi_tmp_1 - resi_tmp_2;
        }
        /* ee and eo */
        for(k = 0; k < 8; k++)
        {
            ee[k] = e[k] + e[15 - k];
            eo[k] = e[k] - e[15 - k];
        }
        /* eee and eeo */
        for(k = 0; k < 4; k++)
        {
            eee[k] = ee[k] + ee[7 - k];
            eeo[k] = ee[k] - ee[7 - k];
        }
        /* eeee and eeeo */
        eeee[0] = eee[0] + eee[3];
        eeeo[0] = eee[0] - eee[3];
        eeee[1] = eee[1] + eee[2];
        eeeo[1] = eee[1] - eee[2];

        /* Outputs 0, 16 come from eeee and outputs 8, 24 from eeeo */
        for(k = 0; k < 32; k += 16)
        {
            pi2_tmp[k * trans_size] = (WORD16)(
                            (g_ai2_ihevc_trans_32[k][0] * eeee[0]
                            + g_ai2_ihevc_trans_32[k][1] * eeee[1] + add)
                            >> shift);
            pi2_tmp[(k + 8) * trans_size] = (WORD16)(
                            (g_ai2_ihevc_trans_32[k + 8][0] * eeeo[0]
                            + g_ai2_ihevc_trans_32[k + 8][1] * eeeo[1] + add)
                            >> shift);
        }
        /* Outputs 4, 12, 20, 28 : 4 taps on eeo */
        for(k = 4; k < 32; k += 8)
        {
            sum = 0;
            for(j = 0; j < 4; j++)
            {
                sum += g_ai2_ihevc_trans_32[k][j] * eeo[j];
            }
            pi2_tmp[k * trans_size] = (WORD16)((sum + add) >> shift);
        }
        /* Outputs 2, 6, ..., 30 : 8 taps on eo */
        for(k = 2; k < 32; k += 4)
        {
            sum = 0;
            for(j = 0; j < 8; j++)
            {
                sum += g_ai2_ihevc_trans_32[k][j] * eo[j];
            }
            pi2_tmp[k * trans_size] = (WORD16)((sum + add) >> shift);
        }
        /* Odd outputs : 16 taps on o */
        for(k = 1; k < 32; k += 2)
        {
            sum = 0;
            for(j = 0; j < 16; j++)
            {
                sum += g_ai2_ihevc_trans_32[k][j] * o[j];
            }
            pi2_tmp[k * trans_size] = (WORD16)((sum + add) >> shift);
        }

        pi2_src += src_strd;
        pu1_pred += pred_strd;
        /* Each input row fills one column of the temporary buffer */
        pi2_tmp++;
    }

    pi2_tmp = pi2_tmp_orig;
    /* Forward Transform 2nd stage */
    shift = 11; // log2(iHeight) + 6
    add = 1 << (shift - 1);

    for(i = 0; i < trans_size; i++)
    {
        /* e and o */
        for(k = 0; k < 16; k++)
        {
            e[k] = pi2_tmp[k] + pi2_tmp[31 - k];
            o[k] = pi2_tmp[k] - pi2_tmp[31 - k];
        }
        /* ee and eo */
        for(k = 0; k < 8; k++)
        {
            ee[k] = e[k] + e[15 - k];
            eo[k] = e[k] - e[15 - k];
        }
        /* eee and eeo */
        for(k = 0; k < 4; k++)
        {
            eee[k] = ee[k] + ee[7 - k];
            eeo[k] = ee[k] - ee[7 - k];
        }
        /* eeee and eeeo */
        eeee[0] = eee[0] + eee[3];
        eeeo[0] = eee[0] - eee[3];
        eeee[1] = eee[1] + eee[2];
        eeeo[1] = eee[1] - eee[2];

        /* Outputs 0, 16 come from eeee and outputs 8, 24 from eeeo */
        for(k = 0; k < 32; k += 16)
        {
            pi2_dst[k * dst_strd] = (WORD16)(
                            (g_ai2_ihevc_trans_32[k][0] * eeee[0]
                            + g_ai2_ihevc_trans_32[k][1] * eeee[1] + add)
                            >> shift);
            pi2_dst[(k + 8) * dst_strd] = (WORD16)(
                            (g_ai2_ihevc_trans_32[k + 8][0] * eeeo[0]
                            + g_ai2_ihevc_trans_32[k + 8][1] * eeeo[1] + add)
                            >> shift);
        }
        /* Outputs 4, 12, 20, 28 : 4 taps on eeo */
        for(k = 4; k < 32; k += 8)
        {
            sum = 0;
            for(j = 0; j < 4; j++)
            {
                sum += g_ai2_ihevc_trans_32[k][j] * eeo[j];
            }
            pi2_dst[k * dst_strd] = (WORD16)((sum + add) >> shift);
        }
        /* Outputs 2, 6, ..., 30 : 8 taps on eo */
        for(k = 2; k < 32; k += 4)
        {
            sum = 0;
            for(j = 0; j < 8; j++)
            {
                sum += g_ai2_ihevc_trans_32[k][j] * eo[j];
            }
            pi2_dst[k * dst_strd] = (WORD16)((sum + add) >> shift);
        }
        /* Odd outputs : 16 taps on o */
        for(k = 1; k < 32; k += 2)
        {
            sum = 0;
            for(j = 0; j < 16; j++)
            {
                sum += g_ai2_ihevc_trans_32[k][j] * o[j];
            }
            pi2_dst[k * dst_strd] = (WORD16)((sum + add) >> shift);
        }

        pi2_tmp += trans_size;
        /* Each intermediate row fills one column of the output */
        pi2_dst++;
    }
}
1462
1463