1 /* ------------------------------------------------------------------
2 * Copyright (C) 1998-2009 PacketVideo
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
13 * express or implied.
14 * See the License for the specific language governing permissions
15 * and limitations under the License.
16 * -------------------------------------------------------------------
17 */
18 #include "mp4def.h"
19 #include "idct.h"
20 #include "motion_comp.h"
21
22 #ifdef FAST_IDCT
23
24 /****************************************************************
25 * vca_idct.c : created 6/1/99 for several options
26 * of hard-coded reduced idct function (using nz_coefs)
27 ******************************************************************/
28
29 /*****************************************************/
30 //pretested version
/*
 * No-op member of the idctrowN() family: performs no computation and does
 * not read or write blk, pred or dst.  It takes the same parameter list as
 * the other idctrowN() routines in this file.
 */
void idctrow0(int16 *blk, uint8 *pred, uint8 *dst, int width)
{
    /* Every parameter is intentionally unused (OSCL_UNUSED_ARG presumably
     * silences the corresponding compiler warnings). */
    OSCL_UNUSED_ARG(pred);
    OSCL_UNUSED_ARG(dst);
    OSCL_UNUSED_ARG(width);
    OSCL_UNUSED_ARG(blk);
}
/*
 * No-op member of the idctcolN() family: the column referenced by blk is
 * left untouched.
 */
void idctcol0(int16 *blk)
{
    /* Parameter intentionally unused. */
    OSCL_UNUSED_ARG(blk);
}
44
/*
 * Row pass that reads only the first coefficient of each of the 8 rows.
 * Each row's value (blk[0] + 32) >> 6 is added to all 8 prediction bytes of
 * that row, every sum is passed through CLIP_RESULT, and the 8 results are
 * written to dst.
 *
 * blk   - coefficients, 8 per row; the first coefficient of every row is
 *         zeroed once it has been read.
 * pred  - prediction bytes; 8 are read per row, rows are 16 bytes apart.
 * dst   - output; 8 bytes are written per row, rows are `width` bytes apart.
 * width - distance in bytes between consecutive output rows.
 *
 * Rows are addressed by index instead of biasing the pointers to before the
 * start of their buffers, and bytes are packed in unsigned arithmetic so
 * that shifting a value above 127 left by 24 cannot overflow a signed int.
 *
 * NOTE(review): pred and dst are accessed as uint32 words with the first
 * sample in the low byte; this presumably relies on 4-byte alignment and a
 * little-endian target -- confirm against the callers.
 * NOTE(review): CLIP_RESULT is defined outside this file; presumably it
 * clamps its argument to 0..255 -- confirm.
 */
void idctrow1(int16 *blk, uint8 *pred, uint8 *dst, int width)
{
    int tmp;
    int row, half;
    uint32 pred_word, dst_word;
    int res, res2;

    for (row = 0; row < 8; row++)
    {
        int16 *coef = blk + 8 * row;
        uint8 *pred_row = pred + 16 * row;
        uint8 *dst_row = dst + row * width;

        tmp = (coef[0] + 32) >> 6;
        coef[0] = 0;

        /* The same value is added to both 4-byte halves of the row. */
        for (half = 0; half < 8; half += 4)
        {
            pred_word = *((uint32*)(pred_row + half)); /* read 4 bytes from pred */

            res = tmp + (pred_word & 0xFF);
            CLIP_RESULT(res);
            res2 = tmp + ((pred_word >> 8) & 0xFF);
            CLIP_RESULT(res2);
            dst_word = ((uint32)res2 << 8) | (uint32)res;

            res = tmp + ((pred_word >> 16) & 0xFF);
            CLIP_RESULT(res);
            dst_word |= ((uint32)res << 16);

            res = tmp + ((pred_word >> 24) & 0xFF);
            CLIP_RESULT(res);
            dst_word |= ((uint32)res << 24);

            *((uint32*)(dst_row + half)) = dst_word; /* save 4 bytes to dst */
        }
    }
}
94
/*
 * Column pass that reads only the first coefficient: all 8 entries of the
 * column (blk[0], blk[8], ..., blk[56]) are set to blk[0] scaled by 8.
 *
 * The scaling is written as a multiplication because left-shifting a
 * negative signed value is undefined behaviour in C.
 */
void idctcol1(int16 *blk)
{
    const int16 scaled = (int16)(blk[0] * 8);
    int row;

    for (row = 0; row < 8; row++)
    {
        blk[8 * row] = scaled;
    }
}
101
/*
 * Row pass that reads only the first two coefficients of each of the 8
 * rows.  The 8 results of each row are combined with the prediction bytes
 * through the ADD_AND_CLIPn macros and written to dst.
 *
 * blk   - coefficients, 8 per row; the two coefficients read from every row
 *         are zeroed afterwards.
 * pred  - prediction bytes; 8 are read per row, rows are 16 bytes apart.
 * dst   - output; 8 bytes are written per row, rows are `width` bytes apart.
 * width - distance in bytes between consecutive output rows.
 *
 * Rows are addressed by index instead of biasing the pointers to before the
 * start of their buffers; the coefficient is scaled by multiplication (a
 * left shift of a negative value is undefined); bytes are packed in
 * unsigned arithmetic so `<< 24` cannot overflow a signed int.
 *
 * NOTE(review): ADD_AND_CLIP1..4 are defined outside this file.  pred_word
 * is never read explicitly here, so the macros presumably pick it up by
 * name (adding byte 1..4 of it and clipping) -- do not rename it without
 * checking the macro definitions.
 * NOTE(review): the uint32 accesses presumably rely on 4-byte alignment and
 * a little-endian target -- confirm against the callers.
 */
void idctrow2(int16 *blk, uint8 *pred, uint8 *dst, int width)
{
    int32 x0, x1, x2, x4, x5;
    int row;
    uint32 pred_word, dst_word;
    int res, res2;

    for (row = 0; row < 8; row++)
    {
        int16 *coef = blk + 8 * row;
        uint8 *pred_row = pred + 16 * row;
        uint8 *dst_row = dst + row * width;

        /* shortcut */
        x4 = coef[1];
        coef[1] = 0;
        /* 8192 is half of 1 << 14, the shift applied in the fourth stage */
        x0 = ((int32)coef[0] * 256) + 8192;
        coef[0] = 0;

        /* first stage */
        x5 = (W7 * x4 + 4) >> 3;
        x4 = (W1 * x4 + 4) >> 3;

        /* third stage */
        x2 = (181 * (x4 + x5) + 128) >> 8;
        x1 = (181 * (x4 - x5) + 128) >> 8;

        /* fourth stage, samples 0..3 */
        pred_word = *((uint32*)pred_row); /* read 4 bytes from pred */
        res = (x0 + x4) >> 14;
        ADD_AND_CLIP1(res);
        res2 = (x0 + x2) >> 14;
        ADD_AND_CLIP2(res2);
        dst_word = ((uint32)res2 << 8) | (uint32)res;
        res = (x0 + x1) >> 14;
        ADD_AND_CLIP3(res);
        dst_word |= ((uint32)res << 16);
        res = (x0 + x5) >> 14;
        ADD_AND_CLIP4(res);
        dst_word |= ((uint32)res << 24);
        *((uint32*)dst_row) = dst_word; /* save 4 bytes to dst */

        /* fourth stage, samples 4..7 */
        pred_word = *((uint32*)(pred_row + 4)); /* read 4 bytes from pred */
        res = (x0 - x5) >> 14;
        ADD_AND_CLIP1(res);
        res2 = (x0 - x1) >> 14;
        ADD_AND_CLIP2(res2);
        dst_word = ((uint32)res2 << 8) | (uint32)res;
        res = (x0 - x2) >> 14;
        ADD_AND_CLIP3(res);
        dst_word |= ((uint32)res << 16);
        res = (x0 - x4) >> 14;
        ADD_AND_CLIP4(res);
        dst_word |= ((uint32)res << 24);
        *((uint32*)(dst_row + 4)) = dst_word; /* save 4 bytes to dst */
    }
}
162
/*
 * Column pass that reads only the first two entries of the column (blk[0]
 * and blk[8]) and overwrites all 8 entries (blk[0], blk[8], ..., blk[56])
 * with the results.
 *
 * The first entry is scaled by multiplication rather than `<< 11` because
 * left-shifting a negative signed value is undefined behaviour in C.
 */
void idctcol2(int16 *blk)
{
    int32 x0, x1, x3, x5, x7;

    x1 = blk[8];
    /* 128 is half of 1 << 8, the shift applied when the results are stored */
    x0 = ((int32)blk[0] * 2048) + 128;

    x7 = W7 * x1;
    x1 = W1 * x1;

    x3 = x7;
    x5 = (181 * (x1 - x7) + 128) >> 8;
    x7 = (181 * (x1 + x7) + 128) >> 8;

    /* Sums fill entries 0..3, the matching differences fill 7..4. */
    blk[0] = (x0 + x1) >> 8;
    blk[56] = (x0 - x1) >> 8;

    blk[8] = (x0 + x7) >> 8;
    blk[48] = (x0 - x7) >> 8;

    blk[16] = (x0 + x5) >> 8;
    blk[40] = (x0 - x5) >> 8;

    blk[24] = (x0 + x3) >> 8;
    blk[32] = (x0 - x3) >> 8;
}
189
/*
 * Row pass that reads only the first three coefficients of each of the 8
 * rows.  The 8 results of each row are combined with the prediction bytes
 * through the ADD_AND_CLIPn macros and written to dst.
 *
 * blk   - coefficients, 8 per row; the three coefficients read from every
 *         row are zeroed afterwards.
 * pred  - prediction bytes; 8 are read per row, rows are 16 bytes apart.
 * dst   - output; 8 bytes are written per row, rows are `width` bytes apart.
 * width - distance in bytes between consecutive output rows.
 *
 * Rows are addressed by index instead of biasing the pointers to before the
 * start of their buffers; the coefficient is scaled by multiplication (a
 * left shift of a negative value is undefined); bytes are packed in
 * unsigned arithmetic so `<< 24` cannot overflow a signed int.
 *
 * NOTE(review): ADD_AND_CLIP1..4 are defined outside this file.  pred_word
 * is never read explicitly here, so the macros presumably pick it up by
 * name (adding byte 1..4 of it and clipping) -- do not rename it without
 * checking the macro definitions.
 * NOTE(review): the uint32 accesses presumably rely on 4-byte alignment and
 * a little-endian target -- confirm against the callers.
 */
void idctrow3(int16 *blk, uint8 *pred, uint8 *dst, int width)
{
    int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
    int row;
    uint32 pred_word, dst_word;
    int res, res2;

    for (row = 0; row < 8; row++)
    {
        int16 *coef = blk + 8 * row;
        uint8 *pred_row = pred + 16 * row;
        uint8 *dst_row = dst + row * width;

        x2 = coef[2];
        coef[2] = 0;
        x1 = coef[1];
        coef[1] = 0;
        /* 8192 is half of 1 << 14, the shift applied to the final sums */
        x0 = ((int32)coef[0] * 256) + 8192;
        coef[0] = 0;

        /* even part: derived from coefficients 0 and 2 */
        x4 = x0;
        x6 = (W6 * x2 + 4) >> 3;
        x2 = (W2 * x2 + 4) >> 3;
        x8 = x0 - x2;
        x0 += x2;
        x2 = x8;
        x8 = x4 - x6;
        x4 += x6;
        x6 = x8;

        /* odd part: derived from coefficient 1 */
        x7 = (W7 * x1 + 4) >> 3;
        x1 = (W1 * x1 + 4) >> 3;
        x3 = x7;
        x5 = (181 * (x1 - x7) + 128) >> 8;
        x7 = (181 * (x1 + x7) + 128) >> 8;

        /* samples 0..3 */
        pred_word = *((uint32*)pred_row); /* read 4 bytes from pred */
        res = (x0 + x1) >> 14;
        ADD_AND_CLIP1(res);
        res2 = (x4 + x7) >> 14;
        ADD_AND_CLIP2(res2);
        dst_word = ((uint32)res2 << 8) | (uint32)res;
        res = (x6 + x5) >> 14;
        ADD_AND_CLIP3(res);
        dst_word |= ((uint32)res << 16);
        res = (x2 + x3) >> 14;
        ADD_AND_CLIP4(res);
        dst_word |= ((uint32)res << 24);
        *((uint32*)dst_row) = dst_word; /* save 4 bytes to dst */

        /* samples 4..7 */
        pred_word = *((uint32*)(pred_row + 4)); /* read 4 bytes from pred */
        res = (x2 - x3) >> 14;
        ADD_AND_CLIP1(res);
        res2 = (x6 - x5) >> 14;
        ADD_AND_CLIP2(res2);
        dst_word = ((uint32)res2 << 8) | (uint32)res;
        res = (x4 - x7) >> 14;
        ADD_AND_CLIP3(res);
        dst_word |= ((uint32)res << 16);
        res = (x0 - x1) >> 14;
        ADD_AND_CLIP4(res);
        dst_word |= ((uint32)res << 24);
        *((uint32*)(dst_row + 4)) = dst_word; /* save 4 bytes to dst */
    }
}
261
/*
 * Column pass that reads only the first three entries of the column
 * (blk[0], blk[8], blk[16]) and overwrites all 8 entries (blk[0], blk[8],
 * ..., blk[56]) with the results.
 *
 * The first entry is scaled by multiplication rather than `<< 11` because
 * left-shifting a negative signed value is undefined behaviour in C.
 */
void idctcol3(int16 *blk)
{
    int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;

    x2 = blk[16];
    x1 = blk[8];
    /* 128 is half of 1 << 8, the shift applied when the results are stored */
    x0 = ((int32)blk[0] * 2048) + 128;

    /* even part: derived from entries 0 and 2 */
    x4 = x0;
    x6 = W6 * x2;
    x2 = W2 * x2;
    x8 = x0 - x2;
    x0 += x2;
    x2 = x8;
    x8 = x4 - x6;
    x4 += x6;
    x6 = x8;

    /* odd part: derived from entry 1 */
    x7 = W7 * x1;
    x1 = W1 * x1;
    x3 = x7;
    x5 = (181 * (x1 - x7) + 128) >> 8;
    x7 = (181 * (x1 + x7) + 128) >> 8;

    /* Sums fill entries 0..3, the matching differences fill 7..4. */
    blk[0] = (x0 + x1) >> 8;
    blk[56] = (x0 - x1) >> 8;

    blk[8] = (x4 + x7) >> 8;
    blk[48] = (x4 - x7) >> 8;

    blk[16] = (x6 + x5) >> 8;
    blk[40] = (x6 - x5) >> 8;

    blk[24] = (x2 + x3) >> 8;
    blk[32] = (x2 - x3) >> 8;
}
297
298
/*
 * Row pass that reads only the first four coefficients of each of the 8
 * rows.  The 8 results of each row are combined with the prediction bytes
 * through the ADD_AND_CLIPn macros and written to dst.
 *
 * blk   - coefficients, 8 per row; the four coefficients read from every
 *         row are zeroed afterwards.
 * pred  - prediction bytes; 8 are read per row, rows are 16 bytes apart.
 * dst   - output; 8 bytes are written per row, rows are `width` bytes apart.
 * width - distance in bytes between consecutive output rows.
 *
 * Rows are addressed by index instead of biasing the pointers to before the
 * start of their buffers; the coefficient is scaled by multiplication (a
 * left shift of a negative value is undefined); bytes are packed in
 * unsigned arithmetic so `<< 24` cannot overflow a signed int.
 *
 * NOTE(review): ADD_AND_CLIP1..4 are defined outside this file.  pred_word
 * is never read explicitly here, so the macros presumably pick it up by
 * name (adding byte 1..4 of it and clipping) -- do not rename it without
 * checking the macro definitions.
 * NOTE(review): the uint32 accesses presumably rely on 4-byte alignment and
 * a little-endian target -- confirm against the callers.
 */
void idctrow4(int16 *blk, uint8 *pred, uint8 *dst, int width)
{
    int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
    int row;
    uint32 pred_word, dst_word;
    int res, res2;

    for (row = 0; row < 8; row++)
    {
        int16 *coef = blk + 8 * row;
        uint8 *pred_row = pred + 16 * row;
        uint8 *dst_row = dst + row * width;

        x2 = coef[2];
        coef[2] = 0;
        x1 = coef[1];
        coef[1] = 0;
        x3 = coef[3];
        coef[3] = 0;
        /* 8192 is half of 1 << 14, the shift applied to the final sums */
        x0 = ((int32)coef[0] * 256) + 8192;
        coef[0] = 0;

        /* even part: derived from coefficients 0 and 2 */
        x4 = x0;
        x6 = (W6 * x2 + 4) >> 3;
        x2 = (W2 * x2 + 4) >> 3;
        x8 = x0 - x2;
        x0 += x2;
        x2 = x8;
        x8 = x4 - x6;
        x4 += x6;
        x6 = x8;

        /* odd part: derived from coefficients 1 and 3 */
        x7 = (W7 * x1 + 4) >> 3;
        x1 = (W1 * x1 + 4) >> 3;
        x5 = (W3 * x3 + 4) >> 3;
        x3 = (- W5 * x3 + 4) >> 3;
        x8 = x1 - x5;
        x1 += x5;
        x5 = x8;
        x8 = x7 - x3;
        x3 += x7;
        x7 = (181 * (x5 + x8) + 128) >> 8;
        x5 = (181 * (x5 - x8) + 128) >> 8;

        /* samples 0..3 */
        pred_word = *((uint32*)pred_row); /* read 4 bytes from pred */
        res = (x0 + x1) >> 14;
        ADD_AND_CLIP1(res);
        res2 = (x4 + x7) >> 14;
        ADD_AND_CLIP2(res2);
        dst_word = ((uint32)res2 << 8) | (uint32)res;
        res = (x6 + x5) >> 14;
        ADD_AND_CLIP3(res);
        dst_word |= ((uint32)res << 16);
        res = (x2 + x3) >> 14;
        ADD_AND_CLIP4(res);
        dst_word |= ((uint32)res << 24);
        *((uint32*)dst_row) = dst_word; /* save 4 bytes to dst */

        /* samples 4..7 */
        pred_word = *((uint32*)(pred_row + 4)); /* read 4 bytes from pred */
        res = (x2 - x3) >> 14;
        ADD_AND_CLIP1(res);
        res2 = (x6 - x5) >> 14;
        ADD_AND_CLIP2(res2);
        dst_word = ((uint32)res2 << 8) | (uint32)res;
        res = (x4 - x7) >> 14;
        ADD_AND_CLIP3(res);
        dst_word |= ((uint32)res << 16);
        res = (x0 - x1) >> 14;
        ADD_AND_CLIP4(res);
        dst_word |= ((uint32)res << 24);
        *((uint32*)(dst_row + 4)) = dst_word; /* save 4 bytes to dst */
    }
}
375
/*
 * Column pass that reads only the first four entries of the column
 * (blk[0], blk[8], blk[16], blk[24]) and overwrites all 8 entries (blk[0],
 * blk[8], ..., blk[56]) with the results.
 *
 * The first entry is scaled by multiplication rather than `<< 11` because
 * left-shifting a negative signed value is undefined behaviour in C.
 */
void idctcol4(int16 *blk)
{
    int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;

    x2 = blk[16];
    x1 = blk[8];
    x3 = blk[24];
    /* 128 is half of 1 << 8, the shift applied when the results are stored */
    x0 = ((int32)blk[0] * 2048) + 128;

    /* even part: derived from entries 0 and 2 */
    x4 = x0;
    x6 = W6 * x2;
    x2 = W2 * x2;
    x8 = x0 - x2;
    x0 += x2;
    x2 = x8;
    x8 = x4 - x6;
    x4 += x6;
    x6 = x8;

    /* odd part: derived from entries 1 and 3 */
    x7 = W7 * x1;
    x1 = W1 * x1;
    x5 = W3 * x3;
    x3 = -W5 * x3;
    x8 = x1 - x5;
    x1 += x5;
    x5 = x8;
    x8 = x7 - x3;
    x3 += x7;
    x7 = (181 * (x5 + x8) + 128) >> 8;
    x5 = (181 * (x5 - x8) + 128) >> 8;

    /* Sums fill entries 0..3, the matching differences fill 7..4. */
    blk[0] = (x0 + x1) >> 8;
    blk[56] = (x0 - x1) >> 8;

    blk[8] = (x4 + x7) >> 8;
    blk[48] = (x4 - x7) >> 8;

    blk[16] = (x6 + x5) >> 8;
    blk[40] = (x6 - x5) >> 8;

    blk[24] = (x2 + x3) >> 8;
    blk[32] = (x2 - x3) >> 8;
}
418
/*
 * No-op member of the idctrowN_intra() family: performs no computation and
 * does not read or write blk or comp.  It takes the same parameter list as
 * the other idctrowN_intra() routines in this file.
 */
void idctrow0_intra(int16 *blk, PIXEL *comp, int width)
{
    /* Every parameter is intentionally unused (OSCL_UNUSED_ARG presumably
     * silences the corresponding compiler warnings). */
    OSCL_UNUSED_ARG(width);
    OSCL_UNUSED_ARG(comp);
    OSCL_UNUSED_ARG(blk);
}
426
/*
 * Intra row pass that reads only the first coefficient of each of the 8
 * rows.  Each row's value (blk[0] + 32) >> 6 is passed through CLIP_RESULT
 * and replicated into all 8 output samples of that row; no prediction is
 * added.
 *
 * blk   - coefficients, B_SIZE per row; the first coefficient of every row
 *         is zeroed once it has been read.
 * comp  - output; two 32-bit words are written per row, rows are `width`
 *         elements apart.
 * width - distance between consecutive output rows, in PIXEL elements.
 *
 * Output rows are addressed by index instead of first moving comp to before
 * the start of its buffer, which is undefined pointer arithmetic.
 *
 * NOTE(review): the stores go through uint32*, which presumably relies on
 * PIXEL being a one-byte type and comp being 4-byte aligned -- confirm.
 * NOTE(review): CLIP_RESULT is defined outside this file; presumably it
 * clamps its argument to 0..255 -- confirm.
 */
void idctrow1_intra(int16 *blk, PIXEL *comp, int width)
{
    int32 tmp;
    int row;
    uint32 word;

    for (row = 0; row < 8; row++)
    {
        PIXEL *out = comp + row * width;

        /* shortcut */
        tmp = ((blk[0] + 32) >> 6);
        blk[0] = 0;
        CLIP_RESULT(tmp);

        /* replicate the clipped sample into all four bytes of a word */
        word = (tmp << 8) | tmp;
        word = (word << 16) | word;

        *((uint32*)out) = word;
        *((uint32*)(out + 4)) = word;

        blk += B_SIZE;
    }
}
455
/*
 * Intra row pass that reads only the first two coefficients of each of the
 * 8 rows.  The 8 results of each row are passed through CLIP_RESULT and
 * written straight to comp; no prediction is added.
 *
 * blk   - coefficients, B_SIZE per row; the two coefficients read from
 *         every row are zeroed afterwards.
 * comp  - output; two 32-bit words are written per row, rows are `width`
 *         elements apart.
 * width - distance between consecutive output rows, in PIXEL elements.
 *
 * Output rows are addressed by index instead of first moving comp to before
 * the start of its buffer; the coefficient is scaled by multiplication (a
 * left shift of a negative value is undefined); the output word is packed
 * in unsigned arithmetic so `<< 24` cannot overflow a signed int32.
 *
 * NOTE(review): the stores go through a 32-bit pointer, which presumably
 * relies on PIXEL being a one-byte type, comp being 4-byte aligned and a
 * little-endian target -- confirm.
 * NOTE(review): CLIP_RESULT is defined outside this file; presumably it
 * clamps its argument to 0..255 -- confirm.
 */
void idctrow2_intra(int16 *blk, PIXEL *comp, int width)
{
    int32 x0, x1, x2, x4, x5, temp;
    int row;
    uint32 word;

    for (row = 0; row < 8; row++)
    {
        PIXEL *out = comp + row * width;

        /* shortcut */
        x4 = blk[1];
        blk[1] = 0;
        /* 8192 is half of 1 << 14, the shift applied in the fourth stage */
        x0 = ((int32)blk[0] * 256) + 8192;
        blk[0] = 0;

        /* first stage */
        x5 = (W7 * x4 + 4) >> 3;
        x4 = (W1 * x4 + 4) >> 3;

        /* third stage */
        x2 = (181 * (x4 + x5) + 128) >> 8;
        x1 = (181 * (x4 - x5) + 128) >> 8;

        /* fourth stage, samples 0..3 */
        temp = ((x0 + x4) >> 14);
        CLIP_RESULT(temp);
        word = (uint32)temp;
        temp = ((x0 + x2) >> 14);
        CLIP_RESULT(temp);
        word |= ((uint32)temp << 8);
        temp = ((x0 + x1) >> 14);
        CLIP_RESULT(temp);
        word |= ((uint32)temp << 16);
        temp = ((x0 + x5) >> 14);
        CLIP_RESULT(temp);
        word |= ((uint32)temp << 24);
        *((uint32*)out) = word;

        /* fourth stage, samples 4..7 */
        temp = ((x0 - x5) >> 14);
        CLIP_RESULT(temp);
        word = (uint32)temp;
        temp = ((x0 - x1) >> 14);
        CLIP_RESULT(temp);
        word |= ((uint32)temp << 8);
        temp = ((x0 - x2) >> 14);
        CLIP_RESULT(temp);
        word |= ((uint32)temp << 16);
        temp = ((x0 - x4) >> 14);
        CLIP_RESULT(temp);
        word |= ((uint32)temp << 24);
        *((uint32*)(out + 4)) = word;

        blk += B_SIZE;
    }
}
512
/*
 * Intra row pass that reads only the first three coefficients of each of
 * the 8 rows.  The 8 results of each row are passed through CLIP_RESULT and
 * written straight to comp; no prediction is added.
 *
 * blk   - coefficients, B_SIZE per row; the three coefficients read from
 *         every row are zeroed afterwards.
 * comp  - output; two 32-bit words are written per row, rows are `width`
 *         elements apart.
 * width - distance between consecutive output rows, in PIXEL elements.
 *
 * Output rows are addressed by index instead of first moving comp to before
 * the start of its buffer; the coefficient is scaled by multiplication (a
 * left shift of a negative value is undefined); the output word is packed
 * in unsigned arithmetic so `<< 24` cannot overflow a signed int32.
 *
 * NOTE(review): the stores go through a 32-bit pointer, which presumably
 * relies on PIXEL being a one-byte type, comp being 4-byte aligned and a
 * little-endian target -- confirm.
 * NOTE(review): CLIP_RESULT is defined outside this file; presumably it
 * clamps its argument to 0..255 -- confirm.
 */
void idctrow3_intra(int16 *blk, PIXEL *comp, int width)
{
    int32 x0, x1, x2, x3, x4, x5, x6, x7, x8, temp;
    int row;
    uint32 word;

    for (row = 0; row < 8; row++)
    {
        PIXEL *out = comp + row * width;

        x2 = blk[2];
        blk[2] = 0;
        x1 = blk[1];
        blk[1] = 0;
        /* 8192 is half of 1 << 14, the shift applied to the final sums */
        x0 = ((int32)blk[0] * 256) + 8192;
        blk[0] = 0;

        /* even part: derived from coefficients 0 and 2 */
        x4 = x0;
        x6 = (W6 * x2 + 4) >> 3;
        x2 = (W2 * x2 + 4) >> 3;
        x8 = x0 - x2;
        x0 += x2;
        x2 = x8;
        x8 = x4 - x6;
        x4 += x6;
        x6 = x8;

        /* odd part: derived from coefficient 1 */
        x7 = (W7 * x1 + 4) >> 3;
        x1 = (W1 * x1 + 4) >> 3;
        x3 = x7;
        x5 = (181 * (x1 - x7) + 128) >> 8;
        x7 = (181 * (x1 + x7) + 128) >> 8;

        /* samples 0..3 */
        temp = ((x0 + x1) >> 14);
        CLIP_RESULT(temp);
        word = (uint32)temp;
        temp = ((x4 + x7) >> 14);
        CLIP_RESULT(temp);
        word |= ((uint32)temp << 8);
        temp = ((x6 + x5) >> 14);
        CLIP_RESULT(temp);
        word |= ((uint32)temp << 16);
        temp = ((x2 + x3) >> 14);
        CLIP_RESULT(temp);
        word |= ((uint32)temp << 24);
        *((uint32*)out) = word;

        /* samples 4..7 */
        temp = ((x2 - x3) >> 14);
        CLIP_RESULT(temp);
        word = (uint32)temp;
        temp = ((x6 - x5) >> 14);
        CLIP_RESULT(temp);
        word |= ((uint32)temp << 8);
        temp = ((x4 - x7) >> 14);
        CLIP_RESULT(temp);
        word |= ((uint32)temp << 16);
        temp = ((x0 - x1) >> 14);
        CLIP_RESULT(temp);
        word |= ((uint32)temp << 24);
        *((uint32*)(out + 4)) = word;

        blk += B_SIZE;
    }
}
585
/*
 * Intra row pass that reads only the first four coefficients of each of the
 * 8 rows.  The 8 results of each row are passed through CLIP_RESULT and
 * written straight to comp; no prediction is added.
 *
 * blk   - coefficients, B_SIZE per row; the four coefficients read from
 *         every row are zeroed afterwards.
 * comp  - output; two 32-bit words are written per row, rows are `width`
 *         elements apart.
 * width - distance between consecutive output rows, in PIXEL elements.
 *
 * Output rows are addressed by index instead of first moving comp to before
 * the start of its buffer; the coefficient is scaled by multiplication (a
 * left shift of a negative value is undefined); the output word is packed
 * in unsigned arithmetic so `<< 24` cannot overflow a signed int32.
 *
 * NOTE(review): the stores go through a 32-bit pointer, which presumably
 * relies on PIXEL being a one-byte type, comp being 4-byte aligned and a
 * little-endian target -- confirm.
 * NOTE(review): CLIP_RESULT is defined outside this file; presumably it
 * clamps its argument to 0..255 -- confirm.
 */
void idctrow4_intra(int16 *blk, PIXEL *comp, int width)
{
    int32 x0, x1, x2, x3, x4, x5, x6, x7, x8, temp;
    int row;
    uint32 word;

    for (row = 0; row < 8; row++)
    {
        PIXEL *out = comp + row * width;

        x2 = blk[2];
        blk[2] = 0;
        x1 = blk[1];
        blk[1] = 0;
        x3 = blk[3];
        blk[3] = 0;
        /* 8192 is half of 1 << 14, the shift applied to the final sums */
        x0 = ((int32)blk[0] * 256) + 8192;
        blk[0] = 0;

        /* even part: derived from coefficients 0 and 2 */
        x4 = x0;
        x6 = (W6 * x2 + 4) >> 3;
        x2 = (W2 * x2 + 4) >> 3;
        x8 = x0 - x2;
        x0 += x2;
        x2 = x8;
        x8 = x4 - x6;
        x4 += x6;
        x6 = x8;

        /* odd part: derived from coefficients 1 and 3 */
        x7 = (W7 * x1 + 4) >> 3;
        x1 = (W1 * x1 + 4) >> 3;
        x5 = (W3 * x3 + 4) >> 3;
        x3 = (- W5 * x3 + 4) >> 3;
        x8 = x1 - x5;
        x1 += x5;
        x5 = x8;
        x8 = x7 - x3;
        x3 += x7;
        x7 = (181 * (x5 + x8) + 128) >> 8;
        x5 = (181 * (x5 - x8) + 128) >> 8;

        /* samples 0..3 */
        temp = ((x0 + x1) >> 14);
        CLIP_RESULT(temp);
        word = (uint32)temp;
        temp = ((x4 + x7) >> 14);
        CLIP_RESULT(temp);
        word |= ((uint32)temp << 8);
        temp = ((x6 + x5) >> 14);
        CLIP_RESULT(temp);
        word |= ((uint32)temp << 16);
        temp = ((x2 + x3) >> 14);
        CLIP_RESULT(temp);
        word |= ((uint32)temp << 24);
        *((uint32*)out) = word;

        /* samples 4..7 */
        temp = ((x2 - x3) >> 14);
        CLIP_RESULT(temp);
        word = (uint32)temp;
        temp = ((x6 - x5) >> 14);
        CLIP_RESULT(temp);
        word |= ((uint32)temp << 8);
        temp = ((x4 - x7) >> 14);
        CLIP_RESULT(temp);
        word |= ((uint32)temp << 16);
        temp = ((x0 - x1) >> 14);
        CLIP_RESULT(temp);
        word |= ((uint32)temp << 24);
        *((uint32*)(out + 4)) = word;

        blk += B_SIZE;
    }
}
666
667 #endif
668
669