• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /******************************************************************************
2  *
3  * Copyright (C) 2018 The Android Open Source Project
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at:
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  *
17  *****************************************************************************
18  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19 */
20 /**
21  *******************************************************************************
22  * @file
23  *  ihevc_resi_trans.c
24  *
25  * @brief
26  *  Contains function definitions for residual and  forward transform
27  *
28  * @author
29  *  100470
30  *
31  * @par List of Functions:
32  *  - ihevc_resi_trans_4x4_ttype1()
33  *  - ihevc_resi_trans_4x4()
34  *  - ihevc_resi_trans_8x8()
35  *  - ihevc_resi_trans_16x16()
36  *  - ihevc_resi_trans_32x32()
37  *
38  * @remarks
39  *  None
40  *
41  *******************************************************************************
42  */
43 #include <stdio.h>
44 #include <string.h>
45 #include <stdlib.h>
46 #include "ihevc_typedefs.h"
47 #include "ihevc_macros.h"
48 #include "ihevc_platform_macros.h"
49 #include "ihevc_defs.h"
50 #include "ihevc_trans_tables.h"
51 #include "ihevc_resi_trans.h"
52 #include "ihevc_func_selector.h"
53 #include "ihevc_trans_macros.h"
54 
55 /**
56  *******************************************************************************
57  *
58  * @brief
59  *  This function performs residue calculation and forward  transform type 1
60  * on input pixels
61  *
62  * @par Description:
63  *  Performs residue calculation by subtracting source and  prediction and
64  * followed by forward transform
65  *
66  * @param[in] pu1_src
67  *  Input 4x4 pixels
68  *
69  * @param[in] pu1_pred
70  *  Prediction data
71  *
72  * @param[in] pi4_temp
73  *  Temporary 32-bit buffer of size 4x4
74  *
75  * @param[out] pi2_dst
76  *  Output 4x4 coefficients
77  *
78  * @param[in] src_strd
79  *  Input stride
80  *
81  * @param[in] pred_strd
82  *  Prediction Stride
83  *
84  * @param[in] dst_strd_chr_flag
85  *  Output Stride and Chroma Flag packed in the MS and LS 16-bit
86  *
87  *
88  * @returns  Sum of absolute differences between source and prediction over the 4x4 block
89  *
90  * @remarks
91  *  None
92  *
93  *******************************************************************************
94  */
95 
ihevc_resi_trans_4x4_ttype1(UWORD8 * pu1_src,UWORD8 * pu1_pred,WORD32 * pi4_temp,WORD16 * pi2_dst,WORD32 src_strd,WORD32 pred_strd,WORD32 dst_strd_chr_flag)96 UWORD32 ihevc_resi_trans_4x4_ttype1(UWORD8 *pu1_src,
97                                  UWORD8 *pu1_pred,
98                                     WORD32 *pi4_temp,
99                                  WORD16 *pi2_dst,
100                                  WORD32 src_strd,
101                                  WORD32 pred_strd,
102                                     WORD32 dst_strd_chr_flag)
103 {
104     WORD32 i, c[4];
105     WORD32 add, shift;
106     WORD32 trans_size;
107     WORD32 *pi4_tmp_orig;
108     WORD16 *pi2_dst_orig;
109     UWORD32  u4_blk_sad = 0;
110  //   WORD32 chroma_flag;
111     WORD32 dst_strd;
112 
113  //   chroma_flag = dst_strd_chr_flag & 1;
114     dst_strd = dst_strd_chr_flag >> 16;
115 
116     pi2_dst_orig = pi2_dst;
117     pi4_tmp_orig = pi4_temp;
118     trans_size = TRANS_SIZE_4;
119 
120     /* Residue + Forward Transform 1st stage */
121     shift = 1; // log2(iWidth) - 1 + g_uiBitIncrement
122     add = 1 << (shift - 1);
123 
124     for(i = 0; i < trans_size; i++)
125     {
126         WORD32 resi_tmp_1, resi_tmp_2, resi_tmp_3;
127 
128         // Intermediate Variables
129         resi_tmp_1 = pu1_src[0] - pu1_pred[0];
130         resi_tmp_2 = pu1_src[3] - pu1_pred[3];
131         c[0] = resi_tmp_1 + resi_tmp_2;
132         u4_blk_sad += abs(resi_tmp_1) + abs(resi_tmp_2);
133 
134         resi_tmp_1 = pu1_src[1] - pu1_pred[1];
135         resi_tmp_2 = pu1_src[3] - pu1_pred[3];
136         c[1] = resi_tmp_1 + resi_tmp_2;
137         u4_blk_sad += abs(resi_tmp_1);
138 
139         resi_tmp_1 = pu1_src[0] - pu1_pred[0];
140         resi_tmp_2 = pu1_src[1] - pu1_pred[1];
141         c[2] = resi_tmp_1 - resi_tmp_2;
142 
143         resi_tmp_1 = pu1_src[2] - pu1_pred[2];
144         c[3] = 74 * resi_tmp_1;
145         u4_blk_sad += abs(resi_tmp_1);
146 
147         pi4_temp[0] = (29 * c[0] + 55 * c[1] + c[3] + add) >> shift;
148 
149         resi_tmp_1 = pu1_src[0] - pu1_pred[0];
150         resi_tmp_2 = pu1_src[1] - pu1_pred[1];
151         resi_tmp_3 = pu1_src[3] - pu1_pred[3];
152         pi4_temp[trans_size] =
153                         (74 * (resi_tmp_1 + resi_tmp_2 - resi_tmp_3) + add)
154                                         >> shift;
155         pi4_temp[2 * trans_size] = (29 * c[2] + 55 * c[0] - c[3] + add) >> shift;
156         pi4_temp[3 * trans_size] = (55 * c[2] - 29 * c[1] + c[3] + add) >> shift;
157 
158         pu1_src += src_strd;
159         pu1_pred += pred_strd;
160         pi4_temp++;
161     }
162 
163     pi4_temp = pi4_tmp_orig;
164 
165     /* Forward transform 2nd stage */
166     shift = 8; // log2(iHeight) + 6
167     add = 1 << (shift - 1);
168 
169     for(i = 0; i < TRANS_SIZE_4; i++)
170     {
171         // Intermediate Variables
172         c[0] = pi4_temp[0] + pi4_temp[3];
173         c[1] = pi4_temp[1] + pi4_temp[3];
174         c[2] = pi4_temp[0] - pi4_temp[1];
175         c[3] = 74 * pi4_temp[2];
176 
177         pi2_dst[0] = (29 * c[0] + 55 * c[1] + c[3] + add) >> shift;
178         pi2_dst[dst_strd] = (74 * (pi4_temp[0] + pi4_temp[1] - pi4_temp[3]) + add)
179                         >> shift;
180         pi2_dst[2 * dst_strd] = (29 * c[2] + 55 * c[0] - c[3] + add) >> shift;
181         pi2_dst[3 * dst_strd] = (55 * c[2] - 29 * c[1] + c[3] + add) >> shift;
182 
183         pi4_temp += trans_size;
184         pi2_dst++;
185     }
186 
187     return u4_blk_sad;
188 }
189 
190 /**
191  *******************************************************************************
192  *
193  * @brief
194  *  This function performs residue calculation and forward  transform on
195  * input pixels
196  *
197  * @par Description:
198  *  Performs residue calculation by subtracting source and  prediction and
199  * followed by forward transform
200  *
201  * @param[in] pu1_src
202  *  Input 4x4 pixels
203  *
204  * @param[in] pu1_pred
205  *  Prediction data
206  *
207  * @param[in] pi4_temp
208  *  Temporary 32-bit buffer of size 4x4
209  *
210  * @param[out] pi2_dst
211  *  Output 4x4 coefficients
212  *
213  * @param[in] src_strd
214  *  Input stride
215  *
216  * @param[in] pred_strd
217  *  Prediction Stride
218  *
219  * @param[in] dst_strd_chr_flag
220  *  Output Stride and Chroma Flag packed in the MS and LS 16-bit
221  *
222  * @returns  Sum of absolute differences between source and prediction over the 4x4 block
223  *
224  * @remarks
225  *  None
226  *
227  *******************************************************************************
228  */
229 
ihevc_resi_trans_4x4(UWORD8 * pu1_src,UWORD8 * pu1_pred,WORD32 * pi4_temp,WORD16 * pi2_dst,WORD32 src_strd,WORD32 pred_strd,WORD32 dst_strd_chr_flag)230 UWORD32 ihevc_resi_trans_4x4(UWORD8 *pu1_src,
231                           UWORD8 *pu1_pred,
232                           WORD32 *pi4_temp,
233                           WORD16 *pi2_dst,
234                           WORD32 src_strd,
235                           WORD32 pred_strd,
236                           WORD32 dst_strd_chr_flag)
237 {
238     WORD32 i;
239     WORD32 e[2], o[2];
240     WORD32 add, shift;
241     WORD32 trans_size;
242     WORD32 *pi4_tmp_orig;
243     WORD16 *pi2_dst_orig;
244     UWORD32 u4_blk_sad=0;
245     WORD32 chroma_flag;
246     WORD32 dst_strd;
247 
248     chroma_flag = dst_strd_chr_flag & 1;
249     dst_strd = dst_strd_chr_flag >> 16;
250 
251     pi2_dst_orig = pi2_dst;
252     pi4_tmp_orig = pi4_temp;
253     trans_size = TRANS_SIZE_4;
254 
255     /* Residue + Forward Transform 1st stage */
256     shift = 1; // log2(iWidth) - 1 + g_uiBitIncrement
257     add = 1 << (shift - 1);
258 
259     for(i = 0; i < trans_size; i++)
260     {
261         WORD32 resi_tmp_1, resi_tmp_2;
262 
263         /* e and o */
264         resi_tmp_1 = pu1_src[0 + 0*chroma_flag] - pu1_pred[0 + 0*chroma_flag];
265         resi_tmp_2 = pu1_src[3 + 3*chroma_flag] - pu1_pred[3 + 3*chroma_flag];
266         e[0] = resi_tmp_1 + resi_tmp_2;
267         o[0] = resi_tmp_1 - resi_tmp_2;
268         u4_blk_sad += abs(resi_tmp_1);
269         u4_blk_sad += abs(resi_tmp_2);
270 
271         resi_tmp_1 = pu1_src[1 + 1*chroma_flag] - pu1_pred[1 + 1*chroma_flag];
272         resi_tmp_2 = pu1_src[2 + 2*chroma_flag] - pu1_pred[2 + 2*chroma_flag];
273         e[1] = resi_tmp_1 + resi_tmp_2;
274         o[1] = resi_tmp_1 - resi_tmp_2;
275         u4_blk_sad += abs(resi_tmp_1);
276         u4_blk_sad += abs(resi_tmp_2);
277 
278         pi4_temp[0] = (g_ai2_ihevc_trans_4[0][0] * e[0]
279                         + g_ai2_ihevc_trans_4[0][1] * e[1]);// + add) >> shift;
280         pi4_temp[2 * trans_size] = (g_ai2_ihevc_trans_4[2][0] * e[0]
281                         + g_ai2_ihevc_trans_4[2][1] * e[1]);// + add) >> shift;
282         pi4_temp[trans_size] = (g_ai2_ihevc_trans_4[1][0] * o[0]
283                         + g_ai2_ihevc_trans_4[1][1] * o[1]);// + add) >> shift;
284         pi4_temp[3 * trans_size] = (g_ai2_ihevc_trans_4[3][0] * o[0]
285                         + g_ai2_ihevc_trans_4[3][1] * o[1]);// + add) >> shift;
286 
287         pu1_src += src_strd;
288         pu1_pred += pred_strd;
289         pi4_temp++;
290     }
291 
292     pi4_temp = pi4_tmp_orig;
293     /* Forward Transform 2nd stage */
294     shift = 9; // log2(iHeight) + 6
295     add = 1 << (shift - 1);
296 
297     for(i = 0; i < trans_size; i++)
298     {
299 
300         /* e and o */
301         e[0] = pi4_temp[0] + pi4_temp[3];
302         o[0] = pi4_temp[0] - pi4_temp[3];
303         e[1] = pi4_temp[1] + pi4_temp[2];
304         o[1] = pi4_temp[1] - pi4_temp[2];
305 
306         pi2_dst[0] = (g_ai2_ihevc_trans_4[0][0] * e[0]
307                         + g_ai2_ihevc_trans_4[0][1] * e[1] + add) >> shift;
308         pi2_dst[2 * dst_strd] = (g_ai2_ihevc_trans_4[2][0] * e[0]
309                         + g_ai2_ihevc_trans_4[2][1] * e[1] + add) >> shift;
310         pi2_dst[dst_strd] = (g_ai2_ihevc_trans_4[1][0] * o[0]
311                         + g_ai2_ihevc_trans_4[1][1] * o[1] + add) >> shift;
312         pi2_dst[3 * dst_strd] = (g_ai2_ihevc_trans_4[3][0] * o[0]
313                         + g_ai2_ihevc_trans_4[3][1] * o[1] + add) >> shift;
314 
315         pi4_temp += trans_size;
316         pi2_dst++;
317     }
318 
319     return u4_blk_sad;
320 }
321 
/**
 *******************************************************************************
 *
 * @brief
 *  Residue calculation and 4x4 forward transform for 16-bit source samples
 *
 * @par Description:
 *  Subtracts the 8-bit prediction from the 16-bit source, then applies the
 *  4x4 forward transform in two passes. Both passes round and shift
 *  (by 1 and by 8 respectively); the intermediate values are kept in a
 *  16-bit temporary buffer.
 *
 * @param[in] pi2_src
 *  Input 4x4 samples (16-bit)
 *
 * @param[in] pu1_pred
 *  Prediction data (8-bit)
 *
 * @param[in] pi2_tmp
 *  Temporary buffer, 4x4 16-bit entries
 *
 * @param[out] pi2_dst
 *  Output 4x4 coefficients
 *
 * @param[in] src_strd
 *  Source stride, in 16-bit elements
 *
 * @param[in] pred_strd
 *  Prediction stride
 *
 * @param[in] dst_strd
 *  Output stride (plain value, not packed with a chroma flag)
 *
 * @returns  Void
 *
 *******************************************************************************
 */
void ihevc_resi_trans_4x4_16bit(WORD16 *pi2_src,
                                UWORD8 *pu1_pred,
                                WORD16 *pi2_tmp,
                                WORD16 *pi2_dst,
                                WORD32 src_strd,
                                WORD32 pred_strd,
                                WORD32 dst_strd)
{
    const WORD32 size = TRANS_SIZE_4;
    WORD32 line;

    /* Pass 1: residue of each input row -> column `line` of pi2_tmp */
    {
        const WORD32 sh = 1; /* log2(width) - 1 (+ bit increment) */
        const WORD32 rnd = 1 << (sh - 1);

        for(line = 0; line < size; line++)
        {
            const WORD16 *pi2_s = pi2_src + line * src_strd;
            const UWORD8 *pu1_p = pu1_pred + line * pred_strd;
            WORD16 *pi2_col = pi2_tmp + line;

            const WORD32 d0 = pi2_s[0] - pu1_p[0];
            const WORD32 d1 = pi2_s[1] - pu1_p[1];
            const WORD32 d2 = pi2_s[2] - pu1_p[2];
            const WORD32 d3 = pi2_s[3] - pu1_p[3];

            /* Butterfly: sums feed the even rows, differences the odd rows */
            const WORD32 even0 = d0 + d3;
            const WORD32 odd0 = d0 - d3;
            const WORD32 even1 = d1 + d2;
            const WORD32 odd1 = d1 - d2;

            pi2_col[0] = (g_ai2_ihevc_trans_4[0][0] * even0
                            + g_ai2_ihevc_trans_4[0][1] * even1 + rnd) >> sh;
            pi2_col[size] = (g_ai2_ihevc_trans_4[1][0] * odd0
                            + g_ai2_ihevc_trans_4[1][1] * odd1 + rnd) >> sh;
            pi2_col[2 * size] = (g_ai2_ihevc_trans_4[2][0] * even0
                            + g_ai2_ihevc_trans_4[2][1] * even1 + rnd) >> sh;
            pi2_col[3 * size] = (g_ai2_ihevc_trans_4[3][0] * odd0
                            + g_ai2_ihevc_trans_4[3][1] * odd1 + rnd) >> sh;
        }
    }

    /* Pass 2: each row of pi2_tmp -> column `line` of the destination */
    {
        const WORD32 sh = 8; /* log2(height) + 6 */
        const WORD32 rnd = 1 << (sh - 1);

        for(line = 0; line < size; line++)
        {
            const WORD16 *pi2_in = pi2_tmp + line * size;
            WORD16 *pi2_col = pi2_dst + line;

            const WORD32 even0 = pi2_in[0] + pi2_in[3];
            const WORD32 odd0 = pi2_in[0] - pi2_in[3];
            const WORD32 even1 = pi2_in[1] + pi2_in[2];
            const WORD32 odd1 = pi2_in[1] - pi2_in[2];

            pi2_col[0] = (g_ai2_ihevc_trans_4[0][0] * even0
                            + g_ai2_ihevc_trans_4[0][1] * even1 + rnd) >> sh;
            pi2_col[dst_strd] = (g_ai2_ihevc_trans_4[1][0] * odd0
                            + g_ai2_ihevc_trans_4[1][1] * odd1 + rnd) >> sh;
            pi2_col[2 * dst_strd] = (g_ai2_ihevc_trans_4[2][0] * even0
                            + g_ai2_ihevc_trans_4[2][1] * even1 + rnd) >> sh;
            pi2_col[3 * dst_strd] = (g_ai2_ihevc_trans_4[3][0] * odd0
                            + g_ai2_ihevc_trans_4[3][1] * odd1 + rnd) >> sh;
        }
    }
}
401 /**
402  *******************************************************************************
403  *
404  * @brief
405  *  This function performs residue calculation and forward  transform on
406  * input pixels
407  *
408  * @par Description:
409  *  Performs residue calculation by subtracting source and  prediction and
410  * followed by forward transform
411  *
412  * @param[in] pu1_src
413  *  Input 8x8 pixels
414  *
415  * @param[in] pu1_pred
416  *  Prediction data
417  *
418  * @param[in] pi4_temp
419  *  Temporary 32-bit buffer of size 8x8
420  *
421  * @param[out] pi2_dst
422  *  Output 8x8 coefficients
423  *
424  * @param[in] src_strd
425  *  Input stride
426  *
427  * @param[in] pred_strd
428  *  Prediction Stride
429  *
430  * @param[in] dst_strd_chr_flag
431  *  Output Stride and Chroma Flag packed in the MS and LS 16-bit
432  *
433  * @returns  Sum of absolute differences between source and prediction over the 8x8 block
434  *
435  * @remarks
436  *  None
437  *
438  *******************************************************************************
439  */
440 
ihevc_resi_trans_8x8(UWORD8 * pu1_src,UWORD8 * pu1_pred,WORD32 * pi4_temp,WORD16 * pi2_dst,WORD32 src_strd,WORD32 pred_strd,WORD32 dst_strd_chr_flag)441 UWORD32 ihevc_resi_trans_8x8(UWORD8 *pu1_src,
442                           UWORD8 *pu1_pred,
443                           WORD32 *pi4_temp,
444                           WORD16 *pi2_dst,
445                           WORD32 src_strd,
446                           WORD32 pred_strd,
447                           WORD32 dst_strd_chr_flag)
448 {
449     WORD32 i, k;
450     WORD32 e[4], o[4];
451     WORD32 ee[2], eo[2];
452     WORD32 add, shift;
453     WORD32 trans_size;
454     WORD32 *pi4_tmp_orig;
455 //    WORD16 *pi2_tmp;
456     WORD16 *pi2_dst_orig;
457     UWORD32 u4_blk_sad=0;
458     WORD32 chroma_flag;
459     WORD32 dst_strd;
460 
461     chroma_flag = dst_strd_chr_flag & 1;
462     dst_strd = dst_strd_chr_flag >> 16;
463 
464     pi2_dst_orig = pi2_dst;
465     pi4_tmp_orig = pi4_temp;
466     trans_size = TRANS_SIZE_8;
467     /* Residue + Forward Transform 1st stage */
468     shift = 2; // log2(iWidth) - 1 + g_uiBitIncrement
469     add = 1 << (shift - 1);
470 
471     for(i = 0; i < trans_size; i++)
472     {
473         WORD32 resi_tmp_1, resi_tmp_2;
474 
475         /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
476         /* e and o*/
477         for(k = 0; k < 4; k++)
478         {
479             resi_tmp_1 = pu1_src[k*(1+chroma_flag)] - pu1_pred[k*(1+chroma_flag)];
480             resi_tmp_2 = pu1_src[(7-k)*(1+chroma_flag)] - pu1_pred[(7-k)*(1+chroma_flag)];
481             e[k] = resi_tmp_1 + resi_tmp_2;
482             o[k] = resi_tmp_1 - resi_tmp_2;
483             u4_blk_sad += abs(resi_tmp_1) + abs(resi_tmp_2);
484         }
485         /* ee and eo */
486         ee[0] = e[0] + e[3];
487         eo[0] = e[0] - e[3];
488         ee[1] = e[1] + e[2];
489         eo[1] = e[1] - e[2];
490 
491         pi4_temp[0] = (g_ai2_ihevc_trans_8[0][0] * ee[0]
492                         + g_ai2_ihevc_trans_8[0][1] * ee[1]);// + add) >> shift;
493         pi4_temp[4 * trans_size] = (g_ai2_ihevc_trans_8[4][0] * ee[0]
494                         + g_ai2_ihevc_trans_8[4][1] * ee[1]);// + add) >> shift;
495         pi4_temp[2 * trans_size] = (g_ai2_ihevc_trans_8[2][0] * eo[0]
496                         + g_ai2_ihevc_trans_8[2][1] * eo[1]);// + add) >> shift;
497         pi4_temp[6 * trans_size] = (g_ai2_ihevc_trans_8[6][0] * eo[0]
498                         + g_ai2_ihevc_trans_8[6][1] * eo[1]);// + add) >> shift;
499 
500         pi4_temp[trans_size] = (g_ai2_ihevc_trans_8[1][0] * o[0]
501                         + g_ai2_ihevc_trans_8[1][1] * o[1]
502                         + g_ai2_ihevc_trans_8[1][2] * o[2]
503                         + g_ai2_ihevc_trans_8[1][3] * o[3]);// + add) >> shift;
504         pi4_temp[3 * trans_size] = (g_ai2_ihevc_trans_8[3][0] * o[0]
505                         + g_ai2_ihevc_trans_8[3][1] * o[1]
506                         + g_ai2_ihevc_trans_8[3][2] * o[2]
507                         + g_ai2_ihevc_trans_8[3][3] * o[3]);// + add) >> shift;
508         pi4_temp[5 * trans_size] = (g_ai2_ihevc_trans_8[5][0] * o[0]
509                         + g_ai2_ihevc_trans_8[5][1] * o[1]
510                         + g_ai2_ihevc_trans_8[5][2] * o[2]
511                         + g_ai2_ihevc_trans_8[5][3] * o[3]);// + add) >> shift;
512         pi4_temp[7 * trans_size] = (g_ai2_ihevc_trans_8[7][0] * o[0]
513                         + g_ai2_ihevc_trans_8[7][1] * o[1]
514                         + g_ai2_ihevc_trans_8[7][2] * o[2]
515                         + g_ai2_ihevc_trans_8[7][3] * o[3]);// + add) >> shift;
516 
517         pu1_src += src_strd;
518         pu1_pred += pred_strd;
519         pi4_temp++;
520     }
521 
522     pi4_temp = pi4_tmp_orig;
523     /* Forward Transform 2nd stage */
524     shift = 11; // log2(iHeight) + 6
525     add = 1 << (shift - 1);
526 
527     for(i = 0; i < trans_size; i++)
528     {
529         /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
530         /* e and o*/
531         for(k = 0; k < 4; k++)
532         {
533             e[k] = pi4_temp[k] + pi4_temp[7 - k];
534             o[k] = pi4_temp[k] - pi4_temp[7 - k];
535         }
536         /* ee and eo */
537         ee[0] = e[0] + e[3];
538         eo[0] = e[0] - e[3];
539         ee[1] = e[1] + e[2];
540         eo[1] = e[1] - e[2];
541 
542         pi2_dst[0] = (g_ai2_ihevc_trans_8[0][0] * ee[0]
543                         + g_ai2_ihevc_trans_8[0][1] * ee[1] + add) >> shift;
544         pi2_dst[4 * dst_strd] = (g_ai2_ihevc_trans_8[4][0] * ee[0]
545                         + g_ai2_ihevc_trans_8[4][1] * ee[1] + add) >> shift;
546         pi2_dst[2 * dst_strd] = (g_ai2_ihevc_trans_8[2][0] * eo[0]
547                         + g_ai2_ihevc_trans_8[2][1] * eo[1] + add) >> shift;
548         pi2_dst[6 * dst_strd] = (g_ai2_ihevc_trans_8[6][0] * eo[0]
549                         + g_ai2_ihevc_trans_8[6][1] * eo[1] + add) >> shift;
550 
551         pi2_dst[dst_strd] = (g_ai2_ihevc_trans_8[1][0] * o[0]
552                         + g_ai2_ihevc_trans_8[1][1] * o[1]
553                         + g_ai2_ihevc_trans_8[1][2] * o[2]
554                         + g_ai2_ihevc_trans_8[1][3] * o[3] + add) >> shift;
555         pi2_dst[3 * dst_strd] = (g_ai2_ihevc_trans_8[3][0] * o[0]
556                         + g_ai2_ihevc_trans_8[3][1] * o[1]
557                         + g_ai2_ihevc_trans_8[3][2] * o[2]
558                         + g_ai2_ihevc_trans_8[3][3] * o[3] + add) >> shift;
559         pi2_dst[5 * dst_strd] = (g_ai2_ihevc_trans_8[5][0] * o[0]
560                         + g_ai2_ihevc_trans_8[5][1] * o[1]
561                         + g_ai2_ihevc_trans_8[5][2] * o[2]
562                         + g_ai2_ihevc_trans_8[5][3] * o[3] + add) >> shift;
563         pi2_dst[7 * dst_strd] = (g_ai2_ihevc_trans_8[7][0] * o[0]
564                         + g_ai2_ihevc_trans_8[7][1] * o[1]
565                         + g_ai2_ihevc_trans_8[7][2] * o[2]
566                         + g_ai2_ihevc_trans_8[7][3] * o[3] + add) >> shift;
567 
568         pi4_temp += trans_size;
569         pi2_dst++;
570     }
571 
572     return u4_blk_sad;
573 }
574 
/**
 *******************************************************************************
 *
 * @brief
 *  Residue calculation and 8x8 forward transform for 16-bit source samples
 *
 * @par Description:
 *  Subtracts the 8-bit prediction from the 16-bit source, then applies the
 *  8x8 forward transform in two passes. Both passes round and shift
 *  (by 2 and by 9 respectively); the intermediate values are kept in a
 *  16-bit temporary buffer.
 *
 * @param[in] pi2_src
 *  Input 8x8 samples (16-bit)
 *
 * @param[in] pu1_pred
 *  Prediction data (8-bit)
 *
 * @param[in] pi2_tmp
 *  Temporary buffer, 8x8 16-bit entries
 *
 * @param[out] pi2_dst
 *  Output 8x8 coefficients
 *
 * @param[in] src_strd
 *  Source stride, in 16-bit elements
 *
 * @param[in] pred_strd
 *  Prediction stride
 *
 * @param[in] dst_strd
 *  Output stride (plain value, not packed with a chroma flag)
 *
 * @returns  Void
 *
 *******************************************************************************
 */
void ihevc_resi_trans_8x8_16bit(WORD16 *pi2_src,
                                UWORD8 *pu1_pred,
                                WORD16 *pi2_tmp,
                                WORD16 *pi2_dst,
                                WORD32 src_strd,
                                WORD32 pred_strd,
                                WORD32 dst_strd)
{
    const WORD32 size = TRANS_SIZE_8;
    WORD32 even[4], odd[4];
    WORD32 even_sum[2], even_dif[2];
    WORD32 line, n;

    /* Pass 1: residue of each input row -> column `line` of pi2_tmp */
    {
        const WORD32 sh = 2; /* log2(width) - 1 (+ bit increment) */
        const WORD32 rnd = 1 << (sh - 1);

        for(line = 0; line < size; line++)
        {
            const WORD16 *pi2_s = pi2_src + line * src_strd;
            const UWORD8 *pu1_p = pu1_pred + line * pred_strd;
            WORD16 *pi2_col = pi2_tmp + line;

            /* Mirror-pair butterfly on the residuals */
            for(n = 0; n < 4; n++)
            {
                const WORD32 left = pi2_s[n] - pu1_p[n];
                const WORD32 right = pi2_s[7 - n] - pu1_p[7 - n];

                even[n] = left + right;
                odd[n] = left - right;
            }
            even_sum[0] = even[0] + even[3];
            even_dif[0] = even[0] - even[3];
            even_sum[1] = even[1] + even[2];
            even_dif[1] = even[1] - even[2];

            /* Rows 0 and 4 */
            for(n = 0; n < 8; n += 4)
            {
                pi2_col[n * size] = (g_ai2_ihevc_trans_8[n][0] * even_sum[0]
                                + g_ai2_ihevc_trans_8[n][1] * even_sum[1]
                                + rnd) >> sh;
            }
            /* Rows 2 and 6 */
            for(n = 2; n < 8; n += 4)
            {
                pi2_col[n * size] = (g_ai2_ihevc_trans_8[n][0] * even_dif[0]
                                + g_ai2_ihevc_trans_8[n][1] * even_dif[1]
                                + rnd) >> sh;
            }
            /* Rows 1, 3, 5 and 7 */
            for(n = 1; n < 8; n += 2)
            {
                pi2_col[n * size] = (g_ai2_ihevc_trans_8[n][0] * odd[0]
                                + g_ai2_ihevc_trans_8[n][1] * odd[1]
                                + g_ai2_ihevc_trans_8[n][2] * odd[2]
                                + g_ai2_ihevc_trans_8[n][3] * odd[3]
                                + rnd) >> sh;
            }
        }
    }

    /* Pass 2: each row of pi2_tmp -> column `line` of the destination */
    {
        const WORD32 sh = 9; /* log2(height) + 6 */
        const WORD32 rnd = 1 << (sh - 1);

        for(line = 0; line < size; line++)
        {
            const WORD16 *pi2_in = pi2_tmp + line * size;
            WORD16 *pi2_col = pi2_dst + line;

            for(n = 0; n < 4; n++)
            {
                even[n] = pi2_in[n] + pi2_in[7 - n];
                odd[n] = pi2_in[n] - pi2_in[7 - n];
            }
            even_sum[0] = even[0] + even[3];
            even_dif[0] = even[0] - even[3];
            even_sum[1] = even[1] + even[2];
            even_dif[1] = even[1] - even[2];

            /* Rows 0 and 4 */
            for(n = 0; n < 8; n += 4)
            {
                pi2_col[n * dst_strd] = (g_ai2_ihevc_trans_8[n][0] * even_sum[0]
                                + g_ai2_ihevc_trans_8[n][1] * even_sum[1]
                                + rnd) >> sh;
            }
            /* Rows 2 and 6 */
            for(n = 2; n < 8; n += 4)
            {
                pi2_col[n * dst_strd] = (g_ai2_ihevc_trans_8[n][0] * even_dif[0]
                                + g_ai2_ihevc_trans_8[n][1] * even_dif[1]
                                + rnd) >> sh;
            }
            /* Rows 1, 3, 5 and 7 */
            for(n = 1; n < 8; n += 2)
            {
                pi2_col[n * dst_strd] = (g_ai2_ihevc_trans_8[n][0] * odd[0]
                                + g_ai2_ihevc_trans_8[n][1] * odd[1]
                                + g_ai2_ihevc_trans_8[n][2] * odd[2]
                                + g_ai2_ihevc_trans_8[n][3] * odd[3]
                                + rnd) >> sh;
            }
        }
    }
}
698 /**
699  *******************************************************************************
700  *
701  * @brief
702  *  This function performs residue calculation and forward  transform on
703  * input pixels
704  *
705  * @par Description:
706  *  Performs residue calculation by subtracting source and  prediction and
707  * followed by forward transform
708  *
709  * @param[in] pu1_src
710  *  Input 16x16 pixels
711  *
712  * @param[in] pu1_pred
713  *  Prediction data
714  *
715  * @param[in] pi2_tmp
716  *  Temporary buffer of size 16x16
717  *
718  * @param[out] pi2_dst
719  *  Output 16x16 coefficients
720  *
721  * @param[in] src_strd
722  *  Input stride
723  *
724  * @param[in] pred_strd
725  *  Prediction Stride
726  *
727  * @param[in] dst_strd_chr_flag
728  *  Output Stride and Chroma Flag packed in the MS and LS 16-bit
729  *
730  * @returns  Void
731  *
732  * @remarks
733  *  None
734  *
735  *******************************************************************************
736  */
737 
ihevc_resi_trans_16x16(UWORD8 * pu1_src,UWORD8 * pu1_pred,WORD32 * pi4_temp,WORD16 * pi2_dst,WORD32 src_strd,WORD32 pred_strd,WORD32 dst_strd_chr_flag)738 UWORD32 ihevc_resi_trans_16x16(UWORD8 *pu1_src,
739                             UWORD8 *pu1_pred,
740                             WORD32 *pi4_temp,
741                             WORD16 *pi2_dst,
742                             WORD32 src_strd,
743                             WORD32 pred_strd,
744                             WORD32 dst_strd_chr_flag)
745 {
746     WORD32 i, k;
747     WORD32 e[8], o[8];
748     WORD32 ee[4], eo[4];
749     WORD32 eee[2], eeo[2];
750     WORD32 add, shift;
751     WORD32 trans_size;
752     WORD32 *pi4_tmp_orig;
753     WORD16 *pi2_dst_orig;
754     UWORD32 u4_blk_sad = 0;
755     WORD32 chroma_flag;
756     WORD32 dst_strd;
757 
758     chroma_flag = dst_strd_chr_flag & 1;
759     dst_strd = dst_strd_chr_flag >> 16;
760 
761     pi2_dst_orig = pi2_dst;
762     pi4_tmp_orig = pi4_temp;
763     trans_size = TRANS_SIZE_16;
764     /* Residue + Forward Transform 1st stage */
765     shift = 3; // log2(iWidth) - 1 + g_uiBitIncrement
766     add = 1 << (shift - 1);
767 
768     for(i = 0; i < trans_size; i++)
769     {
770         WORD32 resi_tmp_1, resi_tmp_2;
771         /* e and o*/
772         for(k = 0; k < 8; k++)
773         {
774             resi_tmp_1 = pu1_src[k*(1+chroma_flag)] - pu1_pred[k*(1+chroma_flag)];
775             resi_tmp_2 = pu1_src[(15-k)*(1+chroma_flag)] - pu1_pred[(15-k)*(1+chroma_flag)];
776             e[k] = resi_tmp_1 + resi_tmp_2;
777             o[k] = resi_tmp_1 - resi_tmp_2;
778             u4_blk_sad += abs(resi_tmp_1) + abs(resi_tmp_2);
779         }
780         /* ee and eo */
781         for(k = 0; k < 4; k++)
782         {
783             ee[k] = e[k] + e[7 - k];
784             eo[k] = e[k] - e[7 - k];
785         }
786         /* eee and eeo */
787         eee[0] = ee[0] + ee[3];
788         eeo[0] = ee[0] - ee[3];
789         eee[1] = ee[1] + ee[2];
790         eeo[1] = ee[1] - ee[2];
791 
792         pi4_temp[0] = (g_ai2_ihevc_trans_16[0][0] * eee[0]
793                         + g_ai2_ihevc_trans_16[0][1] * eee[1]);// + add) >> shift;
794         pi4_temp[8 * trans_size] = (g_ai2_ihevc_trans_16[8][0] * eee[0]
795                         + g_ai2_ihevc_trans_16[8][1] * eee[1]);// + add) >> shift;
796         pi4_temp[4 * trans_size] = (g_ai2_ihevc_trans_16[4][0] * eeo[0]
797                         + g_ai2_ihevc_trans_16[4][1] * eeo[1]);// + add) >> shift;
798         pi4_temp[12 * trans_size] = (g_ai2_ihevc_trans_16[12][0] * eeo[0]
799                         + g_ai2_ihevc_trans_16[12][1] * eeo[1]);// + add) >> shift;
800 
801         for(k = 2; k < 16; k += 4)
802         {
803             pi4_temp[k * trans_size] = (g_ai2_ihevc_trans_16[k][0] * eo[0]
804                             + g_ai2_ihevc_trans_16[k][1] * eo[1]
805                             + g_ai2_ihevc_trans_16[k][2] * eo[2]
806                             + g_ai2_ihevc_trans_16[k][3] * eo[3]);// + add)>> shift;
807 
808         }
809 
810         for(k = 1; k < 16; k += 2)
811         {
812             pi4_temp[k * trans_size] = (g_ai2_ihevc_trans_16[k][0] * o[0]
813                             + g_ai2_ihevc_trans_16[k][1] * o[1]
814                             + g_ai2_ihevc_trans_16[k][2] * o[2]
815                             + g_ai2_ihevc_trans_16[k][3] * o[3]
816                             + g_ai2_ihevc_trans_16[k][4] * o[4]
817                             + g_ai2_ihevc_trans_16[k][5] * o[5]
818                             + g_ai2_ihevc_trans_16[k][6] * o[6]
819                             + g_ai2_ihevc_trans_16[k][7] * o[7]);// + add) >> shift;
820         }
821         pu1_src += src_strd;
822         pu1_pred += pred_strd;
823         pi4_temp++;
824     }
825 
826     pi4_temp = pi4_tmp_orig;
827     /* Forward Transform 2nd stage */
828     shift = 13; // log2(iHeight) + 6
829     add = 1 << (shift - 1);
830 
831     for(i = 0; i < TRANS_SIZE_16; i++)
832     {
833         /* e and o*/
834         for(k = 0; k < 8; k++)
835         {
836             e[k] = pi4_temp[k] + pi4_temp[15 - k];
837             o[k] = pi4_temp[k] - pi4_temp[15 - k];
838         }
839         /* ee and eo */
840         for(k = 0; k < 4; k++)
841         {
842             ee[k] = e[k] + e[7 - k];
843             eo[k] = e[k] - e[7 - k];
844         }
845         /* eee and eeo */
846         eee[0] = ee[0] + ee[3];
847         eeo[0] = ee[0] - ee[3];
848         eee[1] = ee[1] + ee[2];
849         eeo[1] = ee[1] - ee[2];
850 
851         pi2_dst[0] = (g_ai2_ihevc_trans_16[0][0] * eee[0]
852                         + g_ai2_ihevc_trans_16[0][1] * eee[1] + add) >> shift;
853         pi2_dst[8 * dst_strd] = (g_ai2_ihevc_trans_16[8][0] * eee[0]
854                         + g_ai2_ihevc_trans_16[8][1] * eee[1] + add) >> shift;
855         pi2_dst[4 * dst_strd] = (g_ai2_ihevc_trans_16[4][0] * eeo[0]
856                         + g_ai2_ihevc_trans_16[4][1] * eeo[1] + add) >> shift;
857         pi2_dst[12 * dst_strd] = (g_ai2_ihevc_trans_16[12][0] * eeo[0]
858                         + g_ai2_ihevc_trans_16[12][1] * eeo[1] + add) >> shift;
859 
860         for(k = 2; k < 16; k += 4)
861         {
862             pi2_dst[k * dst_strd] = (g_ai2_ihevc_trans_16[k][0] * eo[0]
863                             + g_ai2_ihevc_trans_16[k][1] * eo[1]
864                             + g_ai2_ihevc_trans_16[k][2] * eo[2]
865                             + g_ai2_ihevc_trans_16[k][3] * eo[3] + add)
866                             >> shift;
867         }
868 
869         for(k = 1; k < 16; k += 2)
870         {
871             pi2_dst[k * dst_strd] = (g_ai2_ihevc_trans_16[k][0] * o[0]
872                             + g_ai2_ihevc_trans_16[k][1] * o[1]
873                             + g_ai2_ihevc_trans_16[k][2] * o[2]
874                             + g_ai2_ihevc_trans_16[k][3] * o[3]
875                             + g_ai2_ihevc_trans_16[k][4] * o[4]
876                             + g_ai2_ihevc_trans_16[k][5] * o[5]
877                             + g_ai2_ihevc_trans_16[k][6] * o[6]
878                             + g_ai2_ihevc_trans_16[k][7] * o[7] + add) >> shift;
879         }
880 
881         pi4_temp += trans_size;
882         pi2_dst++;
883     }
884 
885     return u4_blk_sad;
886 }
887 
888 
/**
 *******************************************************************************
 *
 * @brief
 *  This function performs residue calculation and forward transform on a
 * 16x16 block whose source samples are 16-bit
 *
 * @par Description:
 *  Performs residue calculation by subtracting prediction from source,
 * applies the 16-point forward transform to every row of the residue
 * (stage 1) and then to every row of the transposed intermediate result
 * (stage 2)
 *
 * @param[in] pi2_src
 *  Input 16x16 samples (16-bit)
 *
 * @param[in] pu1_pred
 *  Prediction data (8-bit)
 *
 * @param[in] pi2_tmp
 *  Temporary buffer of size 16x16 (WORD16); receives the stage 1 output
 *  stored transposed
 *
 * @param[out] pi2_dst
 *  Output 16x16 coefficients
 *
 * @param[in] src_strd
 *  Input stride, in WORD16 elements
 *
 * @param[in] pred_strd
 *  Prediction stride
 *
 * @param[in] dst_strd
 *  Output stride, in WORD16 elements
 *
 * @returns  Void
 *
 * @remarks
 *  Intermediate and final values are stored to WORD16 without saturation
 *
 *******************************************************************************
 */
void ihevc_resi_trans_16x16_16bit(WORD16 *pi2_src,
                            UWORD8 *pu1_pred,
                            WORD16 *pi2_tmp,
                            WORD16 *pi2_dst,
                            WORD32 src_strd,
                            WORD32 pred_strd,
                            WORD32 dst_strd)
{
    WORD32 i, k;
    WORD32 e[8], o[8];
    WORD32 ee[4], eo[4];
    WORD32 eee[2], eeo[2];
    WORD32 add, shift;
    WORD32 trans_size;
    WORD16 *pi2_tmp_orig;

    /* Remember the start of the temporary buffer for the 2nd stage */
    pi2_tmp_orig = pi2_tmp;
    trans_size = TRANS_SIZE_16;
    /* Residue + Forward Transform 1st stage */
    shift = 3; // log2(iWidth) - 1 + g_uiBitIncrement
    add = 1 << (shift - 1);

    for(i = 0; i < trans_size; i++)
    {
        WORD32 resi_tmp_1, resi_tmp_2;
        /* e and o : sums and differences of mirrored residue samples */
        for(k = 0; k < 8; k++)
        {
            resi_tmp_1 = pi2_src[k] - pu1_pred[k];
            resi_tmp_2 = pi2_src[15 - k] - pu1_pred[15 - k];
            e[k] = resi_tmp_1 + resi_tmp_2;
            o[k] = resi_tmp_1 - resi_tmp_2;
        }
        /* ee and eo */
        for(k = 0; k < 4; k++)
        {
            ee[k] = e[k] + e[7 - k];
            eo[k] = e[k] - e[7 - k];
        }
        /* eee and eeo */
        eee[0] = ee[0] + ee[3];
        eeo[0] = ee[0] - ee[3];
        eee[1] = ee[1] + ee[2];
        eeo[1] = ee[1] - ee[2];

        /* Coefficient k of input row i is written to pi2_tmp[k * 16 + i], */
        /* i.e. the intermediate block is stored transposed                */
        pi2_tmp[0] = (g_ai2_ihevc_trans_16[0][0] * eee[0]
                        + g_ai2_ihevc_trans_16[0][1] * eee[1] + add) >> shift;
        pi2_tmp[8 * trans_size] = (g_ai2_ihevc_trans_16[8][0] * eee[0]
                        + g_ai2_ihevc_trans_16[8][1] * eee[1] + add) >> shift;
        pi2_tmp[4 * trans_size] = (g_ai2_ihevc_trans_16[4][0] * eeo[0]
                        + g_ai2_ihevc_trans_16[4][1] * eeo[1] + add) >> shift;
        pi2_tmp[12 * trans_size] = (g_ai2_ihevc_trans_16[12][0] * eeo[0]
                        + g_ai2_ihevc_trans_16[12][1] * eeo[1] + add) >> shift;

        /* Rows 2, 6, 10, 14 of the transform matrix use eo */
        for(k = 2; k < 16; k += 4)
        {
            pi2_tmp[k * trans_size] = (g_ai2_ihevc_trans_16[k][0] * eo[0]
                            + g_ai2_ihevc_trans_16[k][1] * eo[1]
                            + g_ai2_ihevc_trans_16[k][2] * eo[2]
                            + g_ai2_ihevc_trans_16[k][3] * eo[3] + add)
                            >> shift;
        }

        /* Odd rows of the transform matrix use o */
        for(k = 1; k < 16; k += 2)
        {
            pi2_tmp[k * trans_size] = (g_ai2_ihevc_trans_16[k][0] * o[0]
                            + g_ai2_ihevc_trans_16[k][1] * o[1]
                            + g_ai2_ihevc_trans_16[k][2] * o[2]
                            + g_ai2_ihevc_trans_16[k][3] * o[3]
                            + g_ai2_ihevc_trans_16[k][4] * o[4]
                            + g_ai2_ihevc_trans_16[k][5] * o[5]
                            + g_ai2_ihevc_trans_16[k][6] * o[6]
                            + g_ai2_ihevc_trans_16[k][7] * o[7] + add) >> shift;
        }
        pi2_src += src_strd;
        pu1_pred += pred_strd;
        pi2_tmp++;
    }

    pi2_tmp = pi2_tmp_orig;
    /* Forward Transform 2nd stage */
    shift = 10; // log2(iHeight) + 6
    add = 1 << (shift - 1);

    for(i = 0; i < TRANS_SIZE_16; i++)
    {
        /* e and o*/
        for(k = 0; k < 8; k++)
        {
            e[k] = pi2_tmp[k] + pi2_tmp[15 - k];
            o[k] = pi2_tmp[k] - pi2_tmp[15 - k];
        }
        /* ee and eo */
        for(k = 0; k < 4; k++)
        {
            ee[k] = e[k] + e[7 - k];
            eo[k] = e[k] - e[7 - k];
        }
        /* eee and eeo */
        eee[0] = ee[0] + ee[3];
        eeo[0] = ee[0] - ee[3];
        eee[1] = ee[1] + ee[2];
        eeo[1] = ee[1] - ee[2];

        /* Coefficient k of intermediate row i goes to pi2_dst[k * dst_strd + i] */
        pi2_dst[0] = (g_ai2_ihevc_trans_16[0][0] * eee[0]
                        + g_ai2_ihevc_trans_16[0][1] * eee[1] + add) >> shift;
        pi2_dst[8 * dst_strd] = (g_ai2_ihevc_trans_16[8][0] * eee[0]
                        + g_ai2_ihevc_trans_16[8][1] * eee[1] + add) >> shift;
        pi2_dst[4 * dst_strd] = (g_ai2_ihevc_trans_16[4][0] * eeo[0]
                        + g_ai2_ihevc_trans_16[4][1] * eeo[1] + add) >> shift;
        pi2_dst[12 * dst_strd] = (g_ai2_ihevc_trans_16[12][0] * eeo[0]
                        + g_ai2_ihevc_trans_16[12][1] * eeo[1] + add) >> shift;

        for(k = 2; k < 16; k += 4)
        {
            pi2_dst[k * dst_strd] = (g_ai2_ihevc_trans_16[k][0] * eo[0]
                            + g_ai2_ihevc_trans_16[k][1] * eo[1]
                            + g_ai2_ihevc_trans_16[k][2] * eo[2]
                            + g_ai2_ihevc_trans_16[k][3] * eo[3] + add)
                            >> shift;
        }

        for(k = 1; k < 16; k += 2)
        {
            pi2_dst[k * dst_strd] = (g_ai2_ihevc_trans_16[k][0] * o[0]
                            + g_ai2_ihevc_trans_16[k][1] * o[1]
                            + g_ai2_ihevc_trans_16[k][2] * o[2]
                            + g_ai2_ihevc_trans_16[k][3] * o[3]
                            + g_ai2_ihevc_trans_16[k][4] * o[4]
                            + g_ai2_ihevc_trans_16[k][5] * o[5]
                            + g_ai2_ihevc_trans_16[k][6] * o[6]
                            + g_ai2_ihevc_trans_16[k][7] * o[7] + add) >> shift;
        }

        pi2_tmp += trans_size;
        pi2_dst++;
    }
}
1029 
1030 /**
1031  *******************************************************************************
1032  *
1033  * @brief
1034  *  This function performs residue calculation and forward  transform on
1035  * input pixels
1036  *
1037  * @par Description:
1038  *  Performs residue calculation by subtracting source and  prediction and
1039  * followed by forward transform
1040  *
1041  * @param[in] pu1_src
1042  *  Input 32x32 pixels
1043  *
1044  * @param[in] pu1_pred
1045  *  Prediction data
1046  *
1047  * @param[in] pi2_tmp
1048  *  Temporary buffer of size 32x32
1049  *
1050  * @param[out] pi2_dst
1051  *  Output 32x32 coefficients
1052  *
1053  * @param[in] src_strd
1054  *  Input stride
1055  *
1056  * @param[in] pred_strd
1057  *  Prediction Stride
1058  *
1059  * @param[in] dst_strd_chr_flag
1060  *  Output Stride and Chroma Flag packed in the MS and LS 16-bit
1061  *
1062  * @returns  Void
1063  *
1064  * @remarks
1065  *  None
1066  *
1067  *******************************************************************************
1068  */
1069 
ihevc_resi_trans_32x32(UWORD8 * pu1_src,UWORD8 * pu1_pred,WORD32 * pi4_temp,WORD16 * pi2_dst,WORD32 src_strd,WORD32 pred_strd,WORD32 dst_strd_chr_flag)1070 UWORD32 ihevc_resi_trans_32x32(UWORD8 *pu1_src,
1071                             UWORD8 *pu1_pred,
1072                             WORD32 *pi4_temp,
1073                             WORD16 *pi2_dst,
1074                             WORD32 src_strd,
1075                             WORD32 pred_strd,
1076                             WORD32 dst_strd_chr_flag)
1077 {
1078     WORD32 i, k;
1079     WORD32 e[16], o[16];
1080     WORD32 ee[8], eo[8];
1081     WORD32 eee[4], eeo[4];
1082     WORD32 eeee[2], eeeo[2];
1083     WORD32 add, shift;
1084     WORD32 trans_size;
1085     WORD32 *pi4_tmp_orig;
1086     WORD16 *pi2_dst_orig;
1087     UWORD32 u4_blk_sad = 0 ;
1088     WORD32 chroma_flag;
1089     WORD32 dst_strd;
1090 
1091     chroma_flag = dst_strd_chr_flag & 1;
1092     dst_strd = dst_strd_chr_flag >> 16;
1093 
1094     pi2_dst_orig = pi2_dst;
1095     pi4_tmp_orig = pi4_temp;
1096     trans_size = TRANS_SIZE_32;
1097     /* Residue + Forward Transform 1st stage */
1098     /* Made to zero to match with intrinsics */
1099     shift = 0; // 4 : log2(iWidth) - 1 + g_uiBitIncrement
1100     add = 0 ; //1 << (shift - 1);
1101 
1102     for(i = 0; i < trans_size; i++)
1103     {
1104         WORD32 resi_tmp_1, resi_tmp_2;
1105         /* e and o*/
1106         for(k = 0; k < 16; k++)
1107         {
1108             resi_tmp_1 = pu1_src[k] - pu1_pred[k];
1109             resi_tmp_2 = pu1_src[31 - k] - pu1_pred[31 - k];
1110             e[k] = resi_tmp_1 + resi_tmp_2;
1111             o[k] = resi_tmp_1 - resi_tmp_2;
1112             u4_blk_sad += abs(resi_tmp_1) + abs(resi_tmp_2);
1113         }
1114         /* ee and eo */
1115         for(k = 0; k < 8; k++)
1116         {
1117             ee[k] = e[k] + e[15 - k];
1118             eo[k] = e[k] - e[15 - k];
1119         }
1120         /* eee and eeo */
1121         for(k = 0; k < 4; k++)
1122         {
1123             eee[k] = ee[k] + ee[7 - k];
1124             eeo[k] = ee[k] - ee[7 - k];
1125         }
1126         /* eeee and eeeo */
1127         eeee[0] = eee[0] + eee[3];
1128         eeeo[0] = eee[0] - eee[3];
1129         eeee[1] = eee[1] + eee[2];
1130         eeeo[1] = eee[1] - eee[2];
1131 
1132         pi4_temp[0] = (g_ai2_ihevc_trans_32[0][0] * eeee[0]
1133                         + g_ai2_ihevc_trans_32[0][1] * eeee[1]);// + add) >> shift;
1134         pi4_temp[16 * trans_size] = (g_ai2_ihevc_trans_32[16][0] * eeee[0]
1135                         + g_ai2_ihevc_trans_32[16][1] * eeee[1]);// + add) >> shift;
1136         pi4_temp[8 * trans_size] = (g_ai2_ihevc_trans_32[8][0] * eeeo[0]
1137                         + g_ai2_ihevc_trans_32[8][1] * eeeo[1]);// + add) >> shift;
1138         pi4_temp[24 * trans_size] = (g_ai2_ihevc_trans_32[24][0] * eeeo[0]
1139                         + g_ai2_ihevc_trans_32[24][1] * eeeo[1]);// + add) >> shift;
1140         for(k = 4; k < 32; k += 8)
1141         {
1142             pi4_temp[k * trans_size] = (g_ai2_ihevc_trans_32[k][0] * eeo[0]
1143                             + g_ai2_ihevc_trans_32[k][1] * eeo[1]
1144                             + g_ai2_ihevc_trans_32[k][2] * eeo[2]
1145                             + g_ai2_ihevc_trans_32[k][3] * eeo[3]);// + add)>> shift;
1146         }
1147         for(k = 2; k < 32; k += 4)
1148         {
1149             pi4_temp[k * trans_size] = (g_ai2_ihevc_trans_32[k][0] * eo[0]
1150                             + g_ai2_ihevc_trans_32[k][1] * eo[1]
1151                             + g_ai2_ihevc_trans_32[k][2] * eo[2]
1152                             + g_ai2_ihevc_trans_32[k][3] * eo[3]
1153                             + g_ai2_ihevc_trans_32[k][4] * eo[4]
1154                             + g_ai2_ihevc_trans_32[k][5] * eo[5]
1155                             + g_ai2_ihevc_trans_32[k][6] * eo[6]
1156                             + g_ai2_ihevc_trans_32[k][7] * eo[7]);// + add)>> shift;
1157         }
1158         for(k = 1; k < 32; k += 2)
1159         {
1160             pi4_temp[k * trans_size] = (g_ai2_ihevc_trans_32[k][0] * o[0]
1161                             + g_ai2_ihevc_trans_32[k][1] * o[1]
1162                             + g_ai2_ihevc_trans_32[k][2] * o[2]
1163                             + g_ai2_ihevc_trans_32[k][3] * o[3]
1164                             + g_ai2_ihevc_trans_32[k][4] * o[4]
1165                             + g_ai2_ihevc_trans_32[k][5] * o[5]
1166                             + g_ai2_ihevc_trans_32[k][6] * o[6]
1167                             + g_ai2_ihevc_trans_32[k][7] * o[7]
1168                             + g_ai2_ihevc_trans_32[k][8] * o[8]
1169                             + g_ai2_ihevc_trans_32[k][9] * o[9]
1170                             + g_ai2_ihevc_trans_32[k][10] * o[10]
1171                             + g_ai2_ihevc_trans_32[k][11] * o[11]
1172                             + g_ai2_ihevc_trans_32[k][12] * o[12]
1173                             + g_ai2_ihevc_trans_32[k][13] * o[13]
1174                             + g_ai2_ihevc_trans_32[k][14] * o[14]
1175                             + g_ai2_ihevc_trans_32[k][15] * o[15]);// + add) >> shift;
1176         }
1177         pu1_src += src_strd;
1178         pu1_pred += pred_strd;
1179         pi4_temp++;
1180     }
1181 
1182     pi4_temp = pi4_tmp_orig;
1183     /* Forward Transform 2nd stage */
1184     shift = 15; // log2(iHeight) + 6
1185     add = 1 << (shift - 1);
1186 
1187     for(i = 0; i < TRANS_SIZE_32; i++)
1188     {
1189         /* e and o*/
1190         for(k = 0; k < 16; k++)
1191         {
1192             e[k] = pi4_temp[k] + pi4_temp[31 - k];
1193             o[k] = pi4_temp[k] - pi4_temp[31 - k];
1194         }
1195         /* ee and eo */
1196         for(k = 0; k < 8; k++)
1197         {
1198             ee[k] = e[k] + e[15 - k];
1199             eo[k] = e[k] - e[15 - k];
1200         }
1201         /* eee and eeo */
1202         for(k = 0; k < 4; k++)
1203         {
1204             eee[k] = ee[k] + ee[7 - k];
1205             eeo[k] = ee[k] - ee[7 - k];
1206         }
1207         /* eeee and eeeo */
1208         eeee[0] = eee[0] + eee[3];
1209         eeeo[0] = eee[0] - eee[3];
1210         eeee[1] = eee[1] + eee[2];
1211         eeeo[1] = eee[1] - eee[2];
1212 
1213         pi2_dst[0] = (g_ai2_ihevc_trans_32[0][0] * eeee[0]
1214                         + g_ai2_ihevc_trans_32[0][1] * eeee[1] + add) >> shift;
1215         pi2_dst[16 * dst_strd] = (g_ai2_ihevc_trans_32[16][0] * eeee[0]
1216                         + g_ai2_ihevc_trans_32[16][1] * eeee[1] + add) >> shift;
1217         pi2_dst[8 * dst_strd] = (g_ai2_ihevc_trans_32[8][0] * eeeo[0]
1218                         + g_ai2_ihevc_trans_32[8][1] * eeeo[1] + add) >> shift;
1219         pi2_dst[24 * dst_strd] = (g_ai2_ihevc_trans_32[24][0] * eeeo[0]
1220                         + g_ai2_ihevc_trans_32[24][1] * eeeo[1] + add) >> shift;
1221         for(k = 4; k < 32; k += 8)
1222         {
1223             pi2_dst[k * dst_strd] = (g_ai2_ihevc_trans_32[k][0] * eeo[0]
1224                             + g_ai2_ihevc_trans_32[k][1] * eeo[1]
1225                             + g_ai2_ihevc_trans_32[k][2] * eeo[2]
1226                             + g_ai2_ihevc_trans_32[k][3] * eeo[3] + add)
1227                             >> shift;
1228         }
1229         for(k = 2; k < 32; k += 4)
1230         {
1231             pi2_dst[k * dst_strd] = (g_ai2_ihevc_trans_32[k][0] * eo[0]
1232                             + g_ai2_ihevc_trans_32[k][1] * eo[1]
1233                             + g_ai2_ihevc_trans_32[k][2] * eo[2]
1234                             + g_ai2_ihevc_trans_32[k][3] * eo[3]
1235                             + g_ai2_ihevc_trans_32[k][4] * eo[4]
1236                             + g_ai2_ihevc_trans_32[k][5] * eo[5]
1237                             + g_ai2_ihevc_trans_32[k][6] * eo[6]
1238                             + g_ai2_ihevc_trans_32[k][7] * eo[7] + add)
1239                             >> shift;
1240         }
1241         for(k = 1; k < 32; k += 2)
1242         {
1243             pi2_dst[k * dst_strd] = (g_ai2_ihevc_trans_32[k][0] * o[0]
1244                             + g_ai2_ihevc_trans_32[k][1] * o[1]
1245                             + g_ai2_ihevc_trans_32[k][2] * o[2]
1246                             + g_ai2_ihevc_trans_32[k][3] * o[3]
1247                             + g_ai2_ihevc_trans_32[k][4] * o[4]
1248                             + g_ai2_ihevc_trans_32[k][5] * o[5]
1249                             + g_ai2_ihevc_trans_32[k][6] * o[6]
1250                             + g_ai2_ihevc_trans_32[k][7] * o[7]
1251                             + g_ai2_ihevc_trans_32[k][8] * o[8]
1252                             + g_ai2_ihevc_trans_32[k][9] * o[9]
1253                             + g_ai2_ihevc_trans_32[k][10] * o[10]
1254                             + g_ai2_ihevc_trans_32[k][11] * o[11]
1255                             + g_ai2_ihevc_trans_32[k][12] * o[12]
1256                             + g_ai2_ihevc_trans_32[k][13] * o[13]
1257                             + g_ai2_ihevc_trans_32[k][14] * o[14]
1258                             + g_ai2_ihevc_trans_32[k][15] * o[15] + add)
1259                             >> shift;
1260         }
1261 
1262         pi4_temp += trans_size;
1263         pi2_dst++;
1264     }
1265 
1266     return u4_blk_sad;
1267 }
1268 
1269 
1270 
/**
 *******************************************************************************
 *
 * @brief
 *  This function performs residue calculation and forward transform on a
 * 32x32 block whose source samples are 16-bit
 *
 * @par Description:
 *  Performs residue calculation by subtracting prediction from source,
 * applies the 32-point forward transform to every row of the residue
 * (stage 1) and then to every row of the transposed intermediate result
 * (stage 2)
 *
 * @param[in] pi2_src
 *  Input 32x32 samples (16-bit)
 *
 * @param[in] pu1_pred
 *  Prediction data (8-bit)
 *
 * @param[in] pi2_tmp
 *  Temporary buffer of size 32x32 (WORD16); receives the stage 1 output
 *  stored transposed
 *
 * @param[out] pi2_dst
 *  Output 32x32 coefficients
 *
 * @param[in] src_strd
 *  Input stride, in WORD16 elements
 *
 * @param[in] pred_strd
 *  Prediction stride
 *
 * @param[in] dst_strd
 *  Output stride, in WORD16 elements
 *
 * @returns  Void
 *
 * @remarks
 *  Intermediate and final values are stored to WORD16 without saturation
 *
 *******************************************************************************
 */
void ihevc_resi_trans_32x32_16bit(WORD16 *pi2_src,
                            UWORD8 *pu1_pred,
                            WORD16 *pi2_tmp,
                            WORD16 *pi2_dst,
                            WORD32 src_strd,
                            WORD32 pred_strd,
                            WORD32 dst_strd)
{
    WORD32 i, k;
    WORD32 e[16], o[16];
    WORD32 ee[8], eo[8];
    WORD32 eee[4], eeo[4];
    WORD32 eeee[2], eeeo[2];
    WORD32 add, shift;
    WORD32 trans_size;
    WORD16 *pi2_tmp_orig;

    /* Remember the start of the temporary buffer for the 2nd stage */
    pi2_tmp_orig = pi2_tmp;
    trans_size = TRANS_SIZE_32;
    /* Residue + Forward Transform 1st stage */
    shift = 4; // log2(iWidth) - 1 + g_uiBitIncrement
    add = 1 << (shift - 1);

    for(i = 0; i < trans_size; i++)
    {
        WORD32 resi_tmp_1, resi_tmp_2;
        /* e and o : sums and differences of mirrored residue samples */
        for(k = 0; k < 16; k++)
        {
            resi_tmp_1 = pi2_src[k] - pu1_pred[k];
            resi_tmp_2 = pi2_src[31 - k] - pu1_pred[31 - k];
            e[k] = resi_tmp_1 + resi_tmp_2;
            o[k] = resi_tmp_1 - resi_tmp_2;
        }
        /* ee and eo */
        for(k = 0; k < 8; k++)
        {
            ee[k] = e[k] + e[15 - k];
            eo[k] = e[k] - e[15 - k];
        }
        /* eee and eeo */
        for(k = 0; k < 4; k++)
        {
            eee[k] = ee[k] + ee[7 - k];
            eeo[k] = ee[k] - ee[7 - k];
        }
        /* eeee and eeeo */
        eeee[0] = eee[0] + eee[3];
        eeeo[0] = eee[0] - eee[3];
        eeee[1] = eee[1] + eee[2];
        eeeo[1] = eee[1] - eee[2];

        /* Coefficient k of input row i is written to pi2_tmp[k * 32 + i], */
        /* i.e. the intermediate block is stored transposed                */
        pi2_tmp[0] = (g_ai2_ihevc_trans_32[0][0] * eeee[0]
                        + g_ai2_ihevc_trans_32[0][1] * eeee[1] + add) >> shift;
        pi2_tmp[16 * trans_size] = (g_ai2_ihevc_trans_32[16][0] * eeee[0]
                        + g_ai2_ihevc_trans_32[16][1] * eeee[1] + add) >> shift;
        pi2_tmp[8 * trans_size] = (g_ai2_ihevc_trans_32[8][0] * eeeo[0]
                        + g_ai2_ihevc_trans_32[8][1] * eeeo[1] + add) >> shift;
        pi2_tmp[24 * trans_size] = (g_ai2_ihevc_trans_32[24][0] * eeeo[0]
                        + g_ai2_ihevc_trans_32[24][1] * eeeo[1] + add) >> shift;
        /* Rows 4, 12, 20, 28 of the transform matrix use eeo */
        for(k = 4; k < 32; k += 8)
        {
            pi2_tmp[k * trans_size] = (g_ai2_ihevc_trans_32[k][0] * eeo[0]
                            + g_ai2_ihevc_trans_32[k][1] * eeo[1]
                            + g_ai2_ihevc_trans_32[k][2] * eeo[2]
                            + g_ai2_ihevc_trans_32[k][3] * eeo[3] + add)
                            >> shift;
        }
        /* Rows 2, 6, ..., 30 of the transform matrix use eo */
        for(k = 2; k < 32; k += 4)
        {
            pi2_tmp[k * trans_size] = (g_ai2_ihevc_trans_32[k][0] * eo[0]
                            + g_ai2_ihevc_trans_32[k][1] * eo[1]
                            + g_ai2_ihevc_trans_32[k][2] * eo[2]
                            + g_ai2_ihevc_trans_32[k][3] * eo[3]
                            + g_ai2_ihevc_trans_32[k][4] * eo[4]
                            + g_ai2_ihevc_trans_32[k][5] * eo[5]
                            + g_ai2_ihevc_trans_32[k][6] * eo[6]
                            + g_ai2_ihevc_trans_32[k][7] * eo[7] + add)
                            >> shift;
        }
        /* Odd rows of the transform matrix use o */
        for(k = 1; k < 32; k += 2)
        {
            pi2_tmp[k * trans_size] = (g_ai2_ihevc_trans_32[k][0] * o[0]
                            + g_ai2_ihevc_trans_32[k][1] * o[1]
                            + g_ai2_ihevc_trans_32[k][2] * o[2]
                            + g_ai2_ihevc_trans_32[k][3] * o[3]
                            + g_ai2_ihevc_trans_32[k][4] * o[4]
                            + g_ai2_ihevc_trans_32[k][5] * o[5]
                            + g_ai2_ihevc_trans_32[k][6] * o[6]
                            + g_ai2_ihevc_trans_32[k][7] * o[7]
                            + g_ai2_ihevc_trans_32[k][8] * o[8]
                            + g_ai2_ihevc_trans_32[k][9] * o[9]
                            + g_ai2_ihevc_trans_32[k][10] * o[10]
                            + g_ai2_ihevc_trans_32[k][11] * o[11]
                            + g_ai2_ihevc_trans_32[k][12] * o[12]
                            + g_ai2_ihevc_trans_32[k][13] * o[13]
                            + g_ai2_ihevc_trans_32[k][14] * o[14]
                            + g_ai2_ihevc_trans_32[k][15] * o[15] + add)
                            >> shift;
        }
        pi2_src += src_strd;
        pu1_pred += pred_strd;
        pi2_tmp++;
    }

    pi2_tmp = pi2_tmp_orig;
    /* Forward Transform 2nd stage */
    shift = 11; // log2(iHeight) + 6
    add = 1 << (shift - 1);

    for(i = 0; i < TRANS_SIZE_32; i++)
    {
        /* e and o*/
        for(k = 0; k < 16; k++)
        {
            e[k] = pi2_tmp[k] + pi2_tmp[31 - k];
            o[k] = pi2_tmp[k] - pi2_tmp[31 - k];
        }
        /* ee and eo */
        for(k = 0; k < 8; k++)
        {
            ee[k] = e[k] + e[15 - k];
            eo[k] = e[k] - e[15 - k];
        }
        /* eee and eeo */
        for(k = 0; k < 4; k++)
        {
            eee[k] = ee[k] + ee[7 - k];
            eeo[k] = ee[k] - ee[7 - k];
        }
        /* eeee and eeeo */
        eeee[0] = eee[0] + eee[3];
        eeeo[0] = eee[0] - eee[3];
        eeee[1] = eee[1] + eee[2];
        eeeo[1] = eee[1] - eee[2];

        /* Coefficient k of intermediate row i goes to pi2_dst[k * dst_strd + i] */
        pi2_dst[0] = (g_ai2_ihevc_trans_32[0][0] * eeee[0]
                        + g_ai2_ihevc_trans_32[0][1] * eeee[1] + add) >> shift;
        pi2_dst[16 * dst_strd] = (g_ai2_ihevc_trans_32[16][0] * eeee[0]
                        + g_ai2_ihevc_trans_32[16][1] * eeee[1] + add) >> shift;
        pi2_dst[8 * dst_strd] = (g_ai2_ihevc_trans_32[8][0] * eeeo[0]
                        + g_ai2_ihevc_trans_32[8][1] * eeeo[1] + add) >> shift;
        pi2_dst[24 * dst_strd] = (g_ai2_ihevc_trans_32[24][0] * eeeo[0]
                        + g_ai2_ihevc_trans_32[24][1] * eeeo[1] + add) >> shift;
        for(k = 4; k < 32; k += 8)
        {
            pi2_dst[k * dst_strd] = (g_ai2_ihevc_trans_32[k][0] * eeo[0]
                            + g_ai2_ihevc_trans_32[k][1] * eeo[1]
                            + g_ai2_ihevc_trans_32[k][2] * eeo[2]
                            + g_ai2_ihevc_trans_32[k][3] * eeo[3] + add)
                            >> shift;
        }
        for(k = 2; k < 32; k += 4)
        {
            pi2_dst[k * dst_strd] = (g_ai2_ihevc_trans_32[k][0] * eo[0]
                            + g_ai2_ihevc_trans_32[k][1] * eo[1]
                            + g_ai2_ihevc_trans_32[k][2] * eo[2]
                            + g_ai2_ihevc_trans_32[k][3] * eo[3]
                            + g_ai2_ihevc_trans_32[k][4] * eo[4]
                            + g_ai2_ihevc_trans_32[k][5] * eo[5]
                            + g_ai2_ihevc_trans_32[k][6] * eo[6]
                            + g_ai2_ihevc_trans_32[k][7] * eo[7] + add)
                            >> shift;
        }
        for(k = 1; k < 32; k += 2)
        {
            pi2_dst[k * dst_strd] = (g_ai2_ihevc_trans_32[k][0] * o[0]
                            + g_ai2_ihevc_trans_32[k][1] * o[1]
                            + g_ai2_ihevc_trans_32[k][2] * o[2]
                            + g_ai2_ihevc_trans_32[k][3] * o[3]
                            + g_ai2_ihevc_trans_32[k][4] * o[4]
                            + g_ai2_ihevc_trans_32[k][5] * o[5]
                            + g_ai2_ihevc_trans_32[k][6] * o[6]
                            + g_ai2_ihevc_trans_32[k][7] * o[7]
                            + g_ai2_ihevc_trans_32[k][8] * o[8]
                            + g_ai2_ihevc_trans_32[k][9] * o[9]
                            + g_ai2_ihevc_trans_32[k][10] * o[10]
                            + g_ai2_ihevc_trans_32[k][11] * o[11]
                            + g_ai2_ihevc_trans_32[k][12] * o[12]
                            + g_ai2_ihevc_trans_32[k][13] * o[13]
                            + g_ai2_ihevc_trans_32[k][14] * o[14]
                            + g_ai2_ihevc_trans_32[k][15] * o[15] + add)
                            >> shift;
        }

        pi2_tmp += trans_size;
        pi2_dst++;
    }
}
1462 
1463