1 /* ------------------------------------------------------------------
2 * Copyright (C) 1998-2009 PacketVideo
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
13 * express or implied.
14 * See the License for the specific language governing permissions
15 * and limitations under the License.
16 * -------------------------------------------------------------------
17 */
18 #include "mp4def.h"
19 #include "idct.h"
20 #include "motion_comp.h"
21
22 #ifdef FAST_IDCT
23
24 /****************************************************************
25 * vca_idct.c : created 6/1/99 for several options
26 * of hard-coded reduced idct function (using nz_coefs)
27 ******************************************************************/
28
29 /*****************************************************/
30 //pretested version
idctrow0(int16 *,uint8 *,uint8 *,int)31 void idctrow0(int16 *, uint8 *, uint8 *, int)
32 {
33 return ;
34 }
idctcol0(int16 *)35 void idctcol0(int16 *)
36 {
37 return ;
38 }
39
40 __attribute__((no_sanitize("signed-integer-overflow")))
idctrow1(int16 * blk,uint8 * pred,uint8 * dst,int width)41 void idctrow1(int16 *blk, uint8 *pred, uint8 *dst, int width)
42 {
43 /* shortcut */
44 int tmp;
45 int i = 8;
46 uint32 pred_word, dst_word;
47 int res, res2;
48
49 /* preset the offset, such that we can take advantage pre-offset addressing mode */
50 width -= 4;
51 dst -= width;
52 pred -= 12;
53 blk -= 8;
54
55 while (i--)
56 {
57 tmp = (*(blk += 8) + 32) >> 6;
58 *blk = 0;
59
60 pred_word = *((uint32*)(pred += 12)); /* read 4 bytes from pred */
61 res = tmp + (pred_word & 0xFF);
62 CLIP_RESULT(res);
63 res2 = tmp + ((pred_word >> 8) & 0xFF);
64 CLIP_RESULT(res2);
65 dst_word = (res2 << 8) | res;
66 res = tmp + ((pred_word >> 16) & 0xFF);
67 CLIP_RESULT(res);
68 dst_word |= (res << 16);
69 res = tmp + ((pred_word >> 24) & 0xFF);
70 CLIP_RESULT(res);
71 dst_word |= (res << 24);
72 *((uint32*)(dst += width)) = dst_word; /* save 4 bytes to dst */
73
74 pred_word = *((uint32*)(pred += 4)); /* read 4 bytes from pred */
75 res = tmp + (pred_word & 0xFF);
76 CLIP_RESULT(res);
77 res2 = tmp + ((pred_word >> 8) & 0xFF);
78 CLIP_RESULT(res2);
79 dst_word = (res2 << 8) | res;
80 res = tmp + ((pred_word >> 16) & 0xFF);
81 CLIP_RESULT(res);
82 dst_word |= (res << 16);
83 res = tmp + ((pred_word >> 24) & 0xFF);
84 CLIP_RESULT(res);
85 dst_word |= (res << 24);
86 *((uint32*)(dst += 4)) = dst_word; /* save 4 bytes to dst */
87 }
88 return;
89 }
90
idctcol1(int16 * blk)91 void idctcol1(int16 *blk)
92 { /* shortcut */
93 blk[0] = blk[8] = blk[16] = blk[24] = blk[32] = blk[40] = blk[48] = blk[56] =
94 blk[0] << 3;
95 return;
96 }
97
98 __attribute__((no_sanitize("signed-integer-overflow")))
idctrow2(int16 * blk,uint8 * pred,uint8 * dst,int width)99 void idctrow2(int16 *blk, uint8 *pred, uint8 *dst, int width)
100 {
101 int32 x0, x1, x2, x4, x5;
102 int i = 8;
103 uint32 pred_word, dst_word;
104 int res, res2;
105
106 /* preset the offset, such that we can take advantage pre-offset addressing mode */
107 width -= 4;
108 dst -= width;
109 pred -= 12;
110 blk -= 8;
111
112 while (i--)
113 {
114 /* shortcut */
115 x4 = blk[9];
116 blk[9] = 0;
117 x0 = ((*(blk += 8)) << 8) + 8192;
118 *blk = 0; /* for proper rounding in the fourth stage */
119
120 /* first stage */
121 x5 = (W7 * x4 + 4) >> 3;
122 x4 = (W1 * x4 + 4) >> 3;
123
124 /* third stage */
125 x2 = (181 * (x4 + x5) + 128) >> 8;
126 x1 = (181 * (x4 - x5) + 128) >> 8;
127
128 /* fourth stage */
129 pred_word = *((uint32*)(pred += 12)); /* read 4 bytes from pred */
130 res = (x0 + x4) >> 14;
131 ADD_AND_CLIP1(res);
132 res2 = (x0 + x2) >> 14;
133 ADD_AND_CLIP2(res2);
134 dst_word = (res2 << 8) | res;
135 res = (x0 + x1) >> 14;
136 ADD_AND_CLIP3(res);
137 dst_word |= (res << 16);
138 res = (x0 + x5) >> 14;
139 ADD_AND_CLIP4(res);
140 dst_word |= (res << 24);
141 *((uint32*)(dst += width)) = dst_word; /* save 4 bytes to dst */
142
143 pred_word = *((uint32*)(pred += 4)); /* read 4 bytes from pred */
144 res = (x0 - x5) >> 14;
145 ADD_AND_CLIP1(res);
146 res2 = (x0 - x1) >> 14;
147 ADD_AND_CLIP2(res2);
148 dst_word = (res2 << 8) | res;
149 res = (x0 - x2) >> 14;
150 ADD_AND_CLIP3(res);
151 dst_word |= (res << 16);
152 res = (x0 - x4) >> 14;
153 ADD_AND_CLIP4(res);
154 dst_word |= (res << 24);
155 *((uint32*)(dst += 4)) = dst_word; /* save 4 bytes to dst */
156 }
157 return ;
158 }
159
160 __attribute__((no_sanitize("signed-integer-overflow")))
idctcol2(int16 * blk)161 void idctcol2(int16 *blk)
162 {
163 int32 x0, x1, x3, x5, x7;//, x8;
164
165 x1 = blk[8];
166 x0 = ((int32)blk[0] << 11) + 128;
167 /* both upper and lower*/
168
169 x7 = W7 * x1;
170 x1 = W1 * x1;
171
172 x3 = x7;
173 x5 = (181 * (x1 - x7) + 128) >> 8;
174 x7 = (181 * (x1 + x7) + 128) >> 8;
175
176 blk[0] = (x0 + x1) >> 8;
177 blk[8] = (x0 + x7) >> 8;
178 blk[16] = (x0 + x5) >> 8;
179 blk[24] = (x0 + x3) >> 8;
180 blk[56] = (x0 - x1) >> 8;
181 blk[48] = (x0 - x7) >> 8;
182 blk[40] = (x0 - x5) >> 8;
183 blk[32] = (x0 - x3) >> 8;
184
185 return ;
186 }
187
188 __attribute__((no_sanitize("signed-integer-overflow")))
idctrow3(int16 * blk,uint8 * pred,uint8 * dst,int width)189 void idctrow3(int16 *blk, uint8 *pred, uint8 *dst, int width)
190 {
191 int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
192 int i = 8;
193 uint32 pred_word, dst_word;
194 int res, res2;
195
196 /* preset the offset, such that we can take advantage pre-offset addressing mode */
197 width -= 4;
198 dst -= width;
199 pred -= 12;
200 blk -= 8;
201
202 while (i--)
203 {
204 x2 = blk[10];
205 blk[10] = 0;
206 x1 = blk[9];
207 blk[9] = 0;
208 x0 = ((*(blk += 8)) << 8) + 8192;
209 *blk = 0; /* for proper rounding in the fourth stage */
210 /* both upper and lower*/
211 /* both x2orx6 and x0orx4 */
212
213 x4 = x0;
214 x6 = (W6 * x2 + 4) >> 3;
215 x2 = (W2 * x2 + 4) >> 3;
216 x8 = x0 - x2;
217 x0 += x2;
218 x2 = x8;
219 x8 = x4 - x6;
220 x4 += x6;
221 x6 = x8;
222
223 x7 = (W7 * x1 + 4) >> 3;
224 x1 = (W1 * x1 + 4) >> 3;
225 x3 = x7;
226 x5 = (181 * (x1 - x7) + 128) >> 8;
227 x7 = (181 * (x1 + x7) + 128) >> 8;
228
229 pred_word = *((uint32*)(pred += 12)); /* read 4 bytes from pred */
230 res = (x0 + x1) >> 14;
231 ADD_AND_CLIP1(res);
232 res2 = (x4 + x7) >> 14;
233 ADD_AND_CLIP2(res2);
234 dst_word = (res2 << 8) | res;
235 res = (x6 + x5) >> 14;
236 ADD_AND_CLIP3(res);
237 dst_word |= (res << 16);
238 res = (x2 + x3) >> 14;
239 ADD_AND_CLIP4(res);
240 dst_word |= (res << 24);
241 *((uint32*)(dst += width)) = dst_word; /* save 4 bytes to dst */
242
243 pred_word = *((uint32*)(pred += 4)); /* read 4 bytes from pred */
244 res = (x2 - x3) >> 14;
245 ADD_AND_CLIP1(res);
246 res2 = (x6 - x5) >> 14;
247 ADD_AND_CLIP2(res2);
248 dst_word = (res2 << 8) | res;
249 res = (x4 - x7) >> 14;
250 ADD_AND_CLIP3(res);
251 dst_word |= (res << 16);
252 res = (x0 - x1) >> 14;
253 ADD_AND_CLIP4(res);
254 dst_word |= (res << 24);
255 *((uint32*)(dst += 4)) = dst_word; /* save 4 bytes to dst */
256 }
257
258 return ;
259 }
260
261 __attribute__((no_sanitize("signed-integer-overflow")))
idctcol3(int16 * blk)262 void idctcol3(int16 *blk)
263 {
264 int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
265
266 x2 = blk[16];
267 x1 = blk[8];
268 x0 = ((int32)blk[0] << 11) + 128;
269
270 x4 = x0;
271 x6 = W6 * x2;
272 x2 = W2 * x2;
273 x8 = x0 - x2;
274 x0 += x2;
275 x2 = x8;
276 x8 = x4 - x6;
277 x4 += x6;
278 x6 = x8;
279
280 x7 = W7 * x1;
281 x1 = W1 * x1;
282 x3 = x7;
283 x5 = (181 * (x1 - x7) + 128) >> 8;
284 x7 = (181 * (x1 + x7) + 128) >> 8;
285
286 blk[0] = (x0 + x1) >> 8;
287 blk[8] = (x4 + x7) >> 8;
288 blk[16] = (x6 + x5) >> 8;
289 blk[24] = (x2 + x3) >> 8;
290 blk[56] = (x0 - x1) >> 8;
291 blk[48] = (x4 - x7) >> 8;
292 blk[40] = (x6 - x5) >> 8;
293 blk[32] = (x2 - x3) >> 8;
294
295 return;
296 }
297
298
299 __attribute__((no_sanitize("signed-integer-overflow")))
idctrow4(int16 * blk,uint8 * pred,uint8 * dst,int width)300 void idctrow4(int16 *blk, uint8 *pred, uint8 *dst, int width)
301 {
302 int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
303 int i = 8;
304 uint32 pred_word, dst_word;
305 int res, res2;
306
307 /* preset the offset, such that we can take advantage pre-offset addressing mode */
308 width -= 4;
309 dst -= width;
310 pred -= 12;
311 blk -= 8;
312
313 while (i--)
314 {
315 x2 = blk[10];
316 blk[10] = 0;
317 x1 = blk[9];
318 blk[9] = 0;
319 x3 = blk[11];
320 blk[11] = 0;
321 x0 = ((*(blk += 8)) << 8) + 8192;
322 *blk = 0; /* for proper rounding in the fourth stage */
323
324 x4 = x0;
325 x6 = (W6 * x2 + 4) >> 3;
326 x2 = (W2 * x2 + 4) >> 3;
327 x8 = x0 - x2;
328 x0 += x2;
329 x2 = x8;
330 x8 = x4 - x6;
331 x4 += x6;
332 x6 = x8;
333
334 x7 = (W7 * x1 + 4) >> 3;
335 x1 = (W1 * x1 + 4) >> 3;
336 x5 = (W3 * x3 + 4) >> 3;
337 x3 = (- W5 * x3 + 4) >> 3;
338 x8 = x1 - x5;
339 x1 += x5;
340 x5 = x8;
341 x8 = x7 - x3;
342 x3 += x7;
343 x7 = (181 * (x5 + x8) + 128) >> 8;
344 x5 = (181 * (x5 - x8) + 128) >> 8;
345
346 pred_word = *((uint32*)(pred += 12)); /* read 4 bytes from pred */
347 res = (x0 + x1) >> 14;
348 ADD_AND_CLIP1(res);
349 res2 = (x4 + x7) >> 14;
350 ADD_AND_CLIP2(res2);
351 dst_word = (res2 << 8) | res;
352 res = (x6 + x5) >> 14;
353 ADD_AND_CLIP3(res);
354 dst_word |= (res << 16);
355 res = (x2 + x3) >> 14;
356 ADD_AND_CLIP4(res);
357 dst_word |= (res << 24);
358 *((uint32*)(dst += width)) = dst_word; /* save 4 bytes to dst */
359
360 pred_word = *((uint32*)(pred += 4)); /* read 4 bytes from pred */
361 res = (x2 - x3) >> 14;
362 ADD_AND_CLIP1(res);
363 res2 = (x6 - x5) >> 14;
364 ADD_AND_CLIP2(res2);
365 dst_word = (res2 << 8) | res;
366 res = (x4 - x7) >> 14;
367 ADD_AND_CLIP3(res);
368 dst_word |= (res << 16);
369 res = (x0 - x1) >> 14;
370 ADD_AND_CLIP4(res);
371 dst_word |= (res << 24);
372 *((uint32*)(dst += 4)) = dst_word; /* save 4 bytes to dst */
373 }
374 return ;
375 }
376
377 __attribute__((no_sanitize("signed-integer-overflow")))
idctcol4(int16 * blk)378 void idctcol4(int16 *blk)
379 {
380 int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
381 x2 = blk[16];
382 x1 = blk[8];
383 x3 = blk[24];
384 x0 = ((int32)blk[0] << 11) + 128;
385
386 x4 = x0;
387 x6 = W6 * x2;
388 x2 = W2 * x2;
389 x8 = x0 - x2;
390 x0 += x2;
391 x2 = x8;
392 x8 = x4 - x6;
393 x4 += x6;
394 x6 = x8;
395
396 x7 = W7 * x1;
397 x1 = W1 * x1;
398 x5 = W3 * x3;
399 x3 = -W5 * x3;
400 x8 = x1 - x5;
401 x1 += x5;
402 x5 = x8;
403 x8 = x7 - x3;
404 x3 += x7;
405 x7 = (181 * (x5 + x8) + 128) >> 8;
406 x5 = (181 * (x5 - x8) + 128) >> 8;
407
408
409 blk[0] = (x0 + x1) >> 8;
410 blk[8] = (x4 + x7) >> 8;
411 blk[16] = (x6 + x5) >> 8;
412 blk[24] = (x2 + x3) >> 8;
413 blk[56] = (x0 - x1) >> 8;
414 blk[48] = (x4 - x7) >> 8;
415 blk[40] = (x6 - x5) >> 8;
416 blk[32] = (x2 - x3) >> 8;
417
418 return ;
419 }
420
idctrow0_intra(int16 *,PIXEL *,int)421 void idctrow0_intra(int16 *, PIXEL *, int)
422 {
423 return ;
424 }
425
idctrow1_intra(int16 * blk,PIXEL * comp,int width)426 void idctrow1_intra(int16 *blk, PIXEL *comp, int width)
427 {
428 /* shortcut */
429 int32 tmp;
430 int i = 8;
431 int offset = width;
432 uint32 word;
433
434 comp -= offset;
435 while (i--)
436 {
437 tmp = ((blk[0] + 32) >> 6);
438 blk[0] = 0;
439 CLIP_RESULT(tmp)
440
441 word = (tmp << 8) | tmp;
442 word = (word << 16) | word;
443
444 *((uint32*)(comp += offset)) = word;
445 *((uint32*)(comp + 4)) = word;
446
447
448
449
450 blk += B_SIZE;
451 }
452 return;
453 }
454
455 __attribute__((no_sanitize("signed-integer-overflow")))
idctrow2_intra(int16 * blk,PIXEL * comp,int width)456 void idctrow2_intra(int16 *blk, PIXEL *comp, int width)
457 {
458 int32 x0, x1, x2, x4, x5, temp;
459 int i = 8;
460 int offset = width;
461 int32 word;
462
463 comp -= offset;
464 while (i--)
465 {
466 /* shortcut */
467 x4 = blk[1];
468 blk[1] = 0;
469 x0 = ((int32)blk[0] << 8) + 8192;
470 blk[0] = 0; /* for proper rounding in the fourth stage */
471
472 /* first stage */
473 x5 = (W7 * x4 + 4) >> 3;
474 x4 = (W1 * x4 + 4) >> 3;
475
476 /* third stage */
477 x2 = (181 * (x4 + x5) + 128) >> 8;
478 x1 = (181 * (x4 - x5) + 128) >> 8;
479
480 /* fourth stage */
481 word = ((x0 + x4) >> 14);
482 CLIP_RESULT(word)
483
484 temp = ((x0 + x2) >> 14);
485 CLIP_RESULT(temp)
486 word = word | (temp << 8);
487 temp = ((x0 + x1) >> 14);
488 CLIP_RESULT(temp)
489 word = word | (temp << 16);
490 temp = ((x0 + x5) >> 14);
491 CLIP_RESULT(temp)
492 word = word | (temp << 24);
493 *((int32*)(comp += offset)) = word;
494
495 word = ((x0 - x5) >> 14);
496 CLIP_RESULT(word)
497 temp = ((x0 - x1) >> 14);
498 CLIP_RESULT(temp)
499 word = word | (temp << 8);
500 temp = ((x0 - x2) >> 14);
501 CLIP_RESULT(temp)
502 word = word | (temp << 16);
503 temp = ((x0 - x4) >> 14);
504 CLIP_RESULT(temp)
505 word = word | (temp << 24);
506 *((int32*)(comp + 4)) = word;
507
508 blk += B_SIZE;
509 }
510 return ;
511 }
512
513 __attribute__((no_sanitize("signed-integer-overflow")))
idctrow3_intra(int16 * blk,PIXEL * comp,int width)514 void idctrow3_intra(int16 *blk, PIXEL *comp, int width)
515 {
516 int32 x0, x1, x2, x3, x4, x5, x6, x7, x8, temp;
517 int i = 8;
518 int offset = width;
519 int32 word;
520
521 comp -= offset;
522
523 while (i--)
524 {
525 x2 = blk[2];
526 blk[2] = 0;
527 x1 = blk[1];
528 blk[1] = 0;
529 x0 = ((int32)blk[0] << 8) + 8192;
530 blk[0] = 0;/* for proper rounding in the fourth stage */
531 /* both upper and lower*/
532 /* both x2orx6 and x0orx4 */
533
534 x4 = x0;
535 x6 = (W6 * x2 + 4) >> 3;
536 x2 = (W2 * x2 + 4) >> 3;
537 x8 = x0 - x2;
538 x0 += x2;
539 x2 = x8;
540 x8 = x4 - x6;
541 x4 += x6;
542 x6 = x8;
543
544 x7 = (W7 * x1 + 4) >> 3;
545 x1 = (W1 * x1 + 4) >> 3;
546 x3 = x7;
547 x5 = (181 * (x1 - x7) + 128) >> 8;
548 x7 = (181 * (x1 + x7) + 128) >> 8;
549
550 word = ((x0 + x1) >> 14);
551 CLIP_RESULT(word)
552 temp = ((x4 + x7) >> 14);
553 CLIP_RESULT(temp)
554 word = word | (temp << 8);
555
556
557 temp = ((x6 + x5) >> 14);
558 CLIP_RESULT(temp)
559 word = word | (temp << 16);
560
561 temp = ((x2 + x3) >> 14);
562 CLIP_RESULT(temp)
563 word = word | (temp << 24);
564 *((int32*)(comp += offset)) = word;
565
566 word = ((x2 - x3) >> 14);
567 CLIP_RESULT(word)
568
569 temp = ((x6 - x5) >> 14);
570 CLIP_RESULT(temp)
571 word = word | (temp << 8);
572
573 temp = ((x4 - x7) >> 14);
574 CLIP_RESULT(temp)
575 word = word | (temp << 16);
576
577 temp = ((x0 - x1) >> 14);
578 CLIP_RESULT(temp)
579 word = word | (temp << 24);
580 *((int32*)(comp + 4)) = word;
581
582 blk += B_SIZE;
583 }
584 return ;
585 }
586
587 __attribute__((no_sanitize("signed-integer-overflow")))
idctrow4_intra(int16 * blk,PIXEL * comp,int width)588 void idctrow4_intra(int16 *blk, PIXEL *comp, int width)
589 {
590 int32 x0, x1, x2, x3, x4, x5, x6, x7, x8, temp;
591 int i = 8;
592 int offset = width;
593 int32 word;
594
595 comp -= offset;
596
597 while (i--)
598 {
599 x2 = blk[2];
600 blk[2] = 0;
601 x1 = blk[1];
602 blk[1] = 0;
603 x3 = blk[3];
604 blk[3] = 0;
605 x0 = ((int32)blk[0] << 8) + 8192;
606 blk[0] = 0;/* for proper rounding in the fourth stage */
607
608 x4 = x0;
609 x6 = (W6 * x2 + 4) >> 3;
610 x2 = (W2 * x2 + 4) >> 3;
611 x8 = x0 - x2;
612 x0 += x2;
613 x2 = x8;
614 x8 = x4 - x6;
615 x4 += x6;
616 x6 = x8;
617
618 x7 = (W7 * x1 + 4) >> 3;
619 x1 = (W1 * x1 + 4) >> 3;
620 x5 = (W3 * x3 + 4) >> 3;
621 x3 = (- W5 * x3 + 4) >> 3;
622 x8 = x1 - x5;
623 x1 += x5;
624 x5 = x8;
625 x8 = x7 - x3;
626 x3 += x7;
627 x7 = (181 * (x5 + x8) + 128) >> 8;
628 x5 = (181 * (x5 - x8) + 128) >> 8;
629
630 word = ((x0 + x1) >> 14);
631 CLIP_RESULT(word)
632
633 temp = ((x4 + x7) >> 14);
634 CLIP_RESULT(temp)
635 word = word | (temp << 8);
636
637
638 temp = ((x6 + x5) >> 14);
639 CLIP_RESULT(temp)
640 word = word | (temp << 16);
641
642 temp = ((x2 + x3) >> 14);
643 CLIP_RESULT(temp)
644 word = word | (temp << 24);
645 *((int32*)(comp += offset)) = word;
646
647 word = ((x2 - x3) >> 14);
648 CLIP_RESULT(word)
649
650 temp = ((x6 - x5) >> 14);
651 CLIP_RESULT(temp)
652 word = word | (temp << 8);
653
654 temp = ((x4 - x7) >> 14);
655 CLIP_RESULT(temp)
656 word = word | (temp << 16);
657
658 temp = ((x0 - x1) >> 14);
659 CLIP_RESULT(temp)
660 word = word | (temp << 24);
661 *((int32*)(comp + 4)) = word;
662
663 blk += B_SIZE;
664 }
665
666 return ;
667 }
668
669 #endif
670
671