1 /* ------------------------------------------------------------------
2 * Copyright (C) 1998-2009 PacketVideo
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
13 * express or implied.
14 * See the License for the specific language governing permissions
15 * and limitations under the License.
16 * -------------------------------------------------------------------
17 */
18 /*
19
20 ------------------------------------------------------------------------------
21 REVISION HISTORY
22 Who: Date: July/2001
23 Description: 1. Optimized BlockIDCT bitmap checking.
24 2. Rearranged functions.
25 3. Do column IDCT first, then row IDCT.
26 4. Combine motion comp and IDCT, require
27 two sets of row IDCTs one for INTRA
28 and one for INTER.
29 5. Add AAN IDCT
30
31 Who: Date: 8/16/01
32 1. Increase the input precision to 8 bits, i.e. change RDCTBITS
33 to 11, have to comment out all in-line assembly since 16 bit
34 multiplication doesn't work. Try to use different precision with
35 32 bit mult. but hasn't finished. Turns out that without in-line
36 assembly the performance doesn't change much (only 1%).
37 Who: Date: 9/04/05
38 1. Replace AAN IDCT with Chen's IDCT to accommodate 16 bit data type.
39
40 */
41 #include "mp4def.h"
42 #include "mp4enc_lib.h"
43 #include "mp4lib_int.h"
44 #include "dct.h"
45
/*
 * ADD_CLIP: add the pixel at *rec to tmp, saturate the sum to [0, mask],
 * store it back through rec and advance rec by one.
 * Expects `tmp`, `rec` and `mask` to be in scope where the macro is used.
 */
#define ADD_CLIP                                          \
{                                                         \
    tmp = *rec + tmp;                                     \
    if ((UInt)tmp > mask) tmp = mask & (~(tmp >> 31));    \
    *rec++ = tmp;                                         \
}

/*
 * INTRA_CLIP: same as ADD_CLIP but without adding the existing pixel;
 * tmp is saturated to [0, mask], stored through rec, and rec advances.
 */
#define INTRA_CLIP                                        \
{                                                         \
    if ((UInt)tmp > mask) tmp = mask & (~(tmp >> 31));    \
    *rec++ = tmp;                                         \
}


/*
 * CLIP_RESULT(x): saturate the signed value x to 0..255 in place.
 * The unsigned compare catches both negative and too-large values; the
 * (x >> 31) term then selects 0 for negatives and 0xFF otherwise
 * (relies on an arithmetic right shift of negative values).
 */
#define CLIP_RESULT(x)      if ((UInt)(x) > 0xFF) { (x) = 0xFF & (~((x) >> 31)); }

/*
 * ADD_AND_CLIPn(x): add byte n-1 (counting from the least significant byte)
 * of the in-scope `pred_word` to x, then saturate x to 0..255.
 * Each macro expands to two complete statements.
 */
#define ADD_AND_CLIP1(x)    x += (pred_word & 0xFF); CLIP_RESULT(x);
#define ADD_AND_CLIP2(x)    x += ((pred_word >> 8) & 0xFF); CLIP_RESULT(x);
#define ADD_AND_CLIP3(x)    x += ((pred_word >> 16) & 0xFF); CLIP_RESULT(x);
#define ADD_AND_CLIP4(x)    x += ((pred_word >> 24) & 0xFF); CLIP_RESULT(x);
63
64
/*
 * Column IDCT variant that performs no work: the block is left untouched.
 * blk is referenced only through OSCL_UNUSED_ARG (presumably to silence
 * unused-parameter warnings).
 */
void idct_col0(Short *blk)
{
    OSCL_UNUSED_ARG(blk);
}
71
/*
 * Column IDCT for a column where only blk[0] is used: every entry of the
 * column (blk[0], blk[8], ..., blk[56]) is set to blk[0] * 8.
 *
 * blk : points at the top entry of the column; entries are 8 Shorts apart.
 */
void idct_col1(Short *blk)
{
    /* "* 8" instead of "<< 3": left-shifting a negative value is undefined in C */
    const Short scaled_dc = (Short)(blk[0] * 8);
    int row;

    for (row = 0; row < 8; row++)
    {
        blk[8 * row] = scaled_dc;
    }
}
78
79 /* Ignoring overflows as idct function expects and uses overflows */
80 __attribute__((no_sanitize("signed-integer-overflow")))
idct_col2(Short * blk)81 void idct_col2(Short *blk)
82 {
83 int32 x0, x1, x3, x5, x7;//, x8;
84
85 x1 = blk[8];
86 x0 = ((int32)blk[0] << 11) + 128;
87 /* both upper and lower*/
88
89 x7 = W7 * x1;
90 x1 = W1 * x1;
91
92 x3 = x7;
93 x5 = (181 * (x1 - x7) + 128) >> 8;
94 x7 = (181 * (x1 + x7) + 128) >> 8;
95
96 blk[0] = (x0 + x1) >> 8;
97 blk[8] = (x0 + x7) >> 8;
98 blk[16] = (x0 + x5) >> 8;
99 blk[24] = (x0 + x3) >> 8;
100 blk[56] = (x0 - x1) >> 8;
101 blk[48] = (x0 - x7) >> 8;
102 blk[40] = (x0 - x5) >> 8;
103 blk[32] = (x0 - x3) >> 8;
104 return ;
105 }
106
107 /* Ignoring overflows as idct function expects and uses overflows */
108 __attribute__((no_sanitize("signed-integer-overflow")))
idct_col3(Short * blk)109 void idct_col3(Short *blk)
110 {
111 int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
112
113 x2 = blk[16];
114 x1 = blk[8];
115 x0 = ((int32)blk[0] << 11) + 128;
116
117 x4 = x0;
118 x6 = W6 * x2;
119 x2 = W2 * x2;
120 x8 = x0 - x2;
121 x0 += x2;
122 x2 = x8;
123 x8 = x4 - x6;
124 x4 += x6;
125 x6 = x8;
126
127 x7 = W7 * x1;
128 x1 = W1 * x1;
129 x3 = x7;
130 x5 = (181 * (x1 - x7) + 128) >> 8;
131 x7 = (181 * (x1 + x7) + 128) >> 8;
132
133 blk[0] = (x0 + x1) >> 8;
134 blk[8] = (x4 + x7) >> 8;
135 blk[16] = (x6 + x5) >> 8;
136 blk[24] = (x2 + x3) >> 8;
137 blk[56] = (x0 - x1) >> 8;
138 blk[48] = (x4 - x7) >> 8;
139 blk[40] = (x6 - x5) >> 8;
140 blk[32] = (x2 - x3) >> 8;
141 return ;
142 }
143
144 /* Ignoring overflows as idct function expects and uses overflows */
145 __attribute__((no_sanitize("signed-integer-overflow")))
idct_col4(Short * blk)146 void idct_col4(Short *blk)
147 {
148 int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
149 x2 = blk[16];
150 x1 = blk[8];
151 x3 = blk[24];
152 x0 = ((int32)blk[0] << 11) + 128;
153
154 x4 = x0;
155 x6 = W6 * x2;
156 x2 = W2 * x2;
157 x8 = x0 - x2;
158 x0 += x2;
159 x2 = x8;
160 x8 = x4 - x6;
161 x4 += x6;
162 x6 = x8;
163
164 x7 = W7 * x1;
165 x1 = W1 * x1;
166 x5 = W3 * x3;
167 x3 = -W5 * x3;
168 x8 = x1 - x5;
169 x1 += x5;
170 x5 = x8;
171 x8 = x7 - x3;
172 x3 += x7;
173 x7 = (181 * (x5 + x8) + 128) >> 8;
174 x5 = (181 * (x5 - x8) + 128) >> 8;
175
176
177 blk[0] = (x0 + x1) >> 8;
178 blk[8] = (x4 + x7) >> 8;
179 blk[16] = (x6 + x5) >> 8;
180 blk[24] = (x2 + x3) >> 8;
181 blk[56] = (x0 - x1) >> 8;
182 blk[48] = (x4 - x7) >> 8;
183 blk[40] = (x6 - x5) >> 8;
184 blk[32] = (x2 - x3) >> 8;
185 return ;
186 }
187
188 #ifndef SMALL_DCT
189 /* Ignoring overflows as idct function expects and uses overflows */
190 __attribute__((no_sanitize("signed-integer-overflow")))
idct_col0x40(Short * blk)191 void idct_col0x40(Short *blk)
192 {
193 int32 x1, x3, x5, x7;//, x8;
194
195 x1 = blk[8];
196 /* both upper and lower*/
197
198 x7 = W7 * x1;
199 x1 = W1 * x1;
200
201 x3 = x7;
202 x5 = (181 * (x1 - x7) + 128) >> 8;
203 x7 = (181 * (x1 + x7) + 128) >> 8;
204
205 blk[0] = (128 + x1) >> 8;
206 blk[8] = (128 + x7) >> 8;
207 blk[16] = (128 + x5) >> 8;
208 blk[24] = (128 + x3) >> 8;
209 blk[56] = (128 - x1) >> 8;
210 blk[48] = (128 - x7) >> 8;
211 blk[40] = (128 - x5) >> 8;
212 blk[32] = (128 - x3) >> 8;
213
214 return ;
215 }
216
/*
 * Column IDCT that reads only blk[16] (the third entry of the column).
 * The constant 128 rounds the final ">> 8". The result is mirrored:
 * outputs k and 7-k receive the same value.
 * All eight entries blk[0], blk[8], ..., blk[56] are overwritten.
 */
void idct_col0x20(Short *blk)
{
    const int32 c2 = blk[16];
    const int32 w2 = W2 * c2;
    const int32 w6 = W6 * c2;

    const Short outer     = (128 + w2) >> 8;   /* rows 0 and 7 */
    const Short mid_outer = (128 + w6) >> 8;   /* rows 1 and 6 */
    const Short mid_inner = (128 - w6) >> 8;   /* rows 2 and 5 */
    const Short inner     = (128 - w2) >> 8;   /* rows 3 and 4 */

    blk[0]  = outer;
    blk[8]  = mid_outer;
    blk[16] = mid_inner;
    blk[24] = inner;
    blk[32] = inner;
    blk[40] = mid_inner;
    blk[48] = mid_outer;
    blk[56] = outer;
}
240
241 /* Ignoring overflows as idct function expects and uses overflows */
242 __attribute__((no_sanitize("signed-integer-overflow")))
idct_col0x10(Short * blk)243 void idct_col0x10(Short *blk)
244 {
245 int32 x1, x3, x5, x7;
246
247 x3 = blk[24];
248 x1 = W3 * x3;
249 x3 = W5 * x3;
250
251 x7 = (181 * (x3 - x1) + 128) >> 8;
252 x5 = (-181 * (x1 + x3) + 128) >> 8;
253
254
255 blk[0] = (128 + x1) >> 8;
256 blk[8] = (128 + x7) >> 8;
257 blk[16] = (128 + x5) >> 8;
258 blk[24] = (128 - x3) >> 8;
259 blk[56] = (128 - x1) >> 8;
260 blk[48] = (128 - x7) >> 8;
261 blk[40] = (128 - x5) >> 8;
262 blk[32] = (128 + x3) >> 8;
263
264 return ;
265 }
266
267 #endif /* SMALL_DCT */
268
269 /* Ignoring overflows as idct function expects and uses overflows */
270 __attribute__((no_sanitize("signed-integer-overflow")))
idct_col(Short * blk)271 void idct_col(Short *blk)
272 {
273 int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
274
275 x1 = (int32)blk[32] << 11;
276 x2 = blk[48];
277 x3 = blk[16];
278 x4 = blk[8];
279 x5 = blk[56];
280 x6 = blk[40];
281 x7 = blk[24];
282 x0 = ((int32)blk[0] << 11) + 128;
283
284 /* first stage */
285 x8 = W7 * (x4 + x5);
286 x4 = x8 + (W1 - W7) * x4;
287 x5 = x8 - (W1 + W7) * x5;
288 x8 = W3 * (x6 + x7);
289 x6 = x8 - (W3 - W5) * x6;
290 x7 = x8 - (W3 + W5) * x7;
291
292 /* second stage */
293 x8 = x0 + x1;
294 x0 -= x1;
295 x1 = W6 * (x3 + x2);
296 x2 = x1 - (W2 + W6) * x2;
297 x3 = x1 + (W2 - W6) * x3;
298 x1 = x4 + x6;
299 x4 -= x6;
300 x6 = x5 + x7;
301 x5 -= x7;
302
303 /* third stage */
304 x7 = x8 + x3;
305 x8 -= x3;
306 x3 = x0 + x2;
307 x0 -= x2;
308 x2 = (181 * (x4 + x5) + 128) >> 8;
309 x4 = (181 * (x4 - x5) + 128) >> 8;
310
311 /* fourth stage */
312 blk[0] = (x7 + x1) >> 8;
313 blk[8] = (x3 + x2) >> 8;
314 blk[16] = (x0 + x4) >> 8;
315 blk[24] = (x8 + x6) >> 8;
316 blk[32] = (x8 - x6) >> 8;
317 blk[40] = (x0 - x4) >> 8;
318 blk[48] = (x3 - x2) >> 8;
319 blk[56] = (x7 - x1) >> 8;
320
321 return ;
322 }
323
324 /* This function should not be called at all ****/
idct_row0Inter(Short * srce,UChar * rec,Int lx)325 void idct_row0Inter(Short *srce, UChar *rec, Int lx)
326 {
327 OSCL_UNUSED_ARG(srce);
328
329 OSCL_UNUSED_ARG(rec);
330
331 OSCL_UNUSED_ARG(lx);
332
333 return;
334 }
335
/*
 * Inter row pass for rows where only the first coefficient is used.
 *
 * For each of the 8 rows, (coefficient 0 + 32) >> 6 is added to the 8 pixels
 * already stored in rec and each sum is saturated to 0..255. The consumed
 * coefficient is reset to 0.
 *
 * blk : coefficients, 8 per row; only the first entry of each row is touched.
 * rec : on entry the prediction pixels, on exit the reconstructed pixels.
 * lx  : distance in bytes between consecutive rows of rec.
 *
 * Pixels are accessed one byte at a time, so rec needs no particular
 * alignment and the result does not depend on byte order. (The packed 32-bit
 * form this replaces also left-shifted values up to 255 by 24 in a signed
 * int, which is undefined.)
 */
void idct_row1Inter(Short *blk, UChar *rec, Int lx)
{
    int row, col;

    for (row = 0; row < 8; row++)
    {
        Short *coef = blk + 8 * row;
        UChar *line = rec + row * lx;
        const int dc = (coef[0] + 32) >> 6;

        coef[0] = 0;

        for (col = 0; col < 8; col++)
        {
            int pix = dc + line[col];
            CLIP_RESULT(pix);
            line[col] = (UChar)pix;
        }
    }
}
382
383 /* Ignoring overflows as idct function expects and uses overflows */
384 __attribute__((no_sanitize("signed-integer-overflow")))
idct_row2Inter(Short * blk,UChar * rec,Int lx)385 void idct_row2Inter(Short *blk, UChar *rec, Int lx)
386 {
387 int32 x0, x1, x2, x4, x5;
388 int i = 8;
389 uint32 pred_word, dst_word;
390 int res, res2;
391
392 /* preset the offset, such that we can take advantage pre-offset addressing mode */
393 rec -= lx;
394 blk -= 8;
395
396 while (i--)
397 {
398 /* shortcut */
399 x4 = blk[9];
400 blk[9] = 0;
401 x0 = ((*(blk += 8)) << 8) + 8192;
402 *blk = 0; /* for proper rounding in the fourth stage */
403
404 /* first stage */
405 x5 = (W7 * x4 + 4) >> 3;
406 x4 = (W1 * x4 + 4) >> 3;
407
408 /* third stage */
409 x2 = (181 * (x4 + x5) + 128) >> 8;
410 x1 = (181 * (x4 - x5) + 128) >> 8;
411
412 /* fourth stage */
413 pred_word = *((uint32*)(rec += lx)); /* read 4 bytes from pred */
414 res = (x0 + x4) >> 14;
415 ADD_AND_CLIP1(res);
416 res2 = (x0 + x2) >> 14;
417 ADD_AND_CLIP2(res2);
418 dst_word = (res2 << 8) | res;
419 res = (x0 + x1) >> 14;
420 ADD_AND_CLIP3(res);
421 dst_word |= (res << 16);
422 res = (x0 + x5) >> 14;
423 ADD_AND_CLIP4(res);
424 dst_word |= (res << 24);
425 *((uint32*)rec) = dst_word; /* save 4 bytes to dst */
426
427 pred_word = *((uint32*)(rec + 4)); /* read 4 bytes from pred */
428 res = (x0 - x5) >> 14;
429 ADD_AND_CLIP1(res);
430 res2 = (x0 - x1) >> 14;
431 ADD_AND_CLIP2(res2);
432 dst_word = (res2 << 8) | res;
433 res = (x0 - x2) >> 14;
434 ADD_AND_CLIP3(res);
435 dst_word |= (res << 16);
436 res = (x0 - x4) >> 14;
437 ADD_AND_CLIP4(res);
438 dst_word |= (res << 24);
439 *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
440 }
441 return ;
442 }
443
444 /* Ignoring overflows as idct function expects and uses overflows */
445 __attribute__((no_sanitize("signed-integer-overflow")))
idct_row3Inter(Short * blk,UChar * rec,Int lx)446 void idct_row3Inter(Short *blk, UChar *rec, Int lx)
447 {
448 int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
449 int i = 8;
450 uint32 pred_word, dst_word;
451 int res, res2;
452
453 /* preset the offset, such that we can take advantage pre-offset addressing mode */
454 rec -= lx;
455 blk -= 8;
456
457 while (i--)
458 {
459 x2 = blk[10];
460 blk[10] = 0;
461 x1 = blk[9];
462 blk[9] = 0;
463 x0 = ((*(blk += 8)) << 8) + 8192;
464 *blk = 0; /* for proper rounding in the fourth stage */
465 /* both upper and lower*/
466 /* both x2orx6 and x0orx4 */
467
468 x4 = x0;
469 x6 = (W6 * x2 + 4) >> 3;
470 x2 = (W2 * x2 + 4) >> 3;
471 x8 = x0 - x2;
472 x0 += x2;
473 x2 = x8;
474 x8 = x4 - x6;
475 x4 += x6;
476 x6 = x8;
477
478 x7 = (W7 * x1 + 4) >> 3;
479 x1 = (W1 * x1 + 4) >> 3;
480 x3 = x7;
481 x5 = (181 * (x1 - x7) + 128) >> 8;
482 x7 = (181 * (x1 + x7) + 128) >> 8;
483
484 pred_word = *((uint32*)(rec += lx)); /* read 4 bytes from pred */
485 res = (x0 + x1) >> 14;
486 ADD_AND_CLIP1(res);
487 res2 = (x4 + x7) >> 14;
488 ADD_AND_CLIP2(res2);
489 dst_word = (res2 << 8) | res;
490 res = (x6 + x5) >> 14;
491 ADD_AND_CLIP3(res);
492 dst_word |= (res << 16);
493 res = (x2 + x3) >> 14;
494 ADD_AND_CLIP4(res);
495 dst_word |= (res << 24);
496 *((uint32*)rec) = dst_word; /* save 4 bytes to dst */
497
498 pred_word = *((uint32*)(rec + 4)); /* read 4 bytes from pred */
499 res = (x2 - x3) >> 14;
500 ADD_AND_CLIP1(res);
501 res2 = (x6 - x5) >> 14;
502 ADD_AND_CLIP2(res2);
503 dst_word = (res2 << 8) | res;
504 res = (x4 - x7) >> 14;
505 ADD_AND_CLIP3(res);
506 dst_word |= (res << 16);
507 res = (x0 - x1) >> 14;
508 ADD_AND_CLIP4(res);
509 dst_word |= (res << 24);
510 *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
511 }
512
513 return ;
514 }
515
516 /* Ignoring overflows as idct function expects and uses overflows */
517 __attribute__((no_sanitize("signed-integer-overflow")))
idct_row4Inter(Short * blk,UChar * rec,Int lx)518 void idct_row4Inter(Short *blk, UChar *rec, Int lx)
519 {
520 int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
521 int i = 8;
522 uint32 pred_word, dst_word;
523 int res, res2;
524
525 /* preset the offset, such that we can take advantage pre-offset addressing mode */
526 rec -= lx;
527 blk -= 8;
528
529 while (i--)
530 {
531 x2 = blk[10];
532 blk[10] = 0;
533 x1 = blk[9];
534 blk[9] = 0;
535 x3 = blk[11];
536 blk[11] = 0;
537 x0 = ((*(blk += 8)) << 8) + 8192;
538 *blk = 0; /* for proper rounding in the fourth stage */
539
540 x4 = x0;
541 x6 = (W6 * x2 + 4) >> 3;
542 x2 = (W2 * x2 + 4) >> 3;
543 x8 = x0 - x2;
544 x0 += x2;
545 x2 = x8;
546 x8 = x4 - x6;
547 x4 += x6;
548 x6 = x8;
549
550 x7 = (W7 * x1 + 4) >> 3;
551 x1 = (W1 * x1 + 4) >> 3;
552 x5 = (W3 * x3 + 4) >> 3;
553 x3 = (- W5 * x3 + 4) >> 3;
554 x8 = x1 - x5;
555 x1 += x5;
556 x5 = x8;
557 x8 = x7 - x3;
558 x3 += x7;
559 x7 = (181 * (x5 + x8) + 128) >> 8;
560 x5 = (181 * (x5 - x8) + 128) >> 8;
561
562 pred_word = *((uint32*)(rec += lx)); /* read 4 bytes from pred */
563 res = (x0 + x1) >> 14;
564 ADD_AND_CLIP1(res);
565 res2 = (x4 + x7) >> 14;
566 ADD_AND_CLIP2(res2);
567 dst_word = (res2 << 8) | res;
568 res = (x6 + x5) >> 14;
569 ADD_AND_CLIP3(res);
570 dst_word |= (res << 16);
571 res = (x2 + x3) >> 14;
572 ADD_AND_CLIP4(res);
573 dst_word |= (res << 24);
574 *((uint32*)rec) = dst_word; /* save 4 bytes to dst */
575
576 pred_word = *((uint32*)(rec + 4)); /* read 4 bytes from pred */
577 res = (x2 - x3) >> 14;
578 ADD_AND_CLIP1(res);
579 res2 = (x6 - x5) >> 14;
580 ADD_AND_CLIP2(res2);
581 dst_word = (res2 << 8) | res;
582 res = (x4 - x7) >> 14;
583 ADD_AND_CLIP3(res);
584 dst_word |= (res << 16);
585 res = (x0 - x1) >> 14;
586 ADD_AND_CLIP4(res);
587 dst_word |= (res << 24);
588 *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
589 }
590 return ;
591 }
592
593 #ifndef SMALL_DCT
594 /* Ignoring overflows as idct function expects and uses overflows */
595 __attribute__((no_sanitize("signed-integer-overflow")))
idct_row0x40Inter(Short * blk,UChar * rec,Int lx)596 void idct_row0x40Inter(Short *blk, UChar *rec, Int lx)
597 {
598 int32 x1, x2, x4, x5;
599 int i = 8;
600 uint32 pred_word, dst_word;
601 int res, res2;
602
603 /* preset the offset, such that we can take advantage pre-offset addressing mode */
604 rec -= lx;
605
606 while (i--)
607 {
608 /* shortcut */
609 x4 = blk[1];
610 blk[1] = 0;
611 blk += 8; /* for proper rounding in the fourth stage */
612
613 /* first stage */
614 x5 = (W7 * x4 + 4) >> 3;
615 x4 = (W1 * x4 + 4) >> 3;
616
617 /* third stage */
618 x2 = (181 * (x4 + x5) + 128) >> 8;
619 x1 = (181 * (x4 - x5) + 128) >> 8;
620
621 /* fourth stage */
622 pred_word = *((uint32*)(rec += lx)); /* read 4 bytes from pred */
623 res = (8192 + x4) >> 14;
624 ADD_AND_CLIP1(res);
625 res2 = (8192 + x2) >> 14;
626 ADD_AND_CLIP2(res2);
627 dst_word = (res2 << 8) | res;
628 res = (8192 + x1) >> 14;
629 ADD_AND_CLIP3(res);
630 dst_word |= (res << 16);
631 res = (8192 + x5) >> 14;
632 ADD_AND_CLIP4(res);
633 dst_word |= (res << 24);
634 *((uint32*)rec) = dst_word; /* save 4 bytes to dst */
635
636 pred_word = *((uint32*)(rec + 4)); /* read 4 bytes from pred */
637 res = (8192 - x5) >> 14;
638 ADD_AND_CLIP1(res);
639 res2 = (8192 - x1) >> 14;
640 ADD_AND_CLIP2(res2);
641 dst_word = (res2 << 8) | res;
642 res = (8192 - x2) >> 14;
643 ADD_AND_CLIP3(res);
644 dst_word |= (res << 16);
645 res = (8192 - x4) >> 14;
646 ADD_AND_CLIP4(res);
647 dst_word |= (res << 24);
648 *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
649 }
650 return ;
651 }
652
/*
 * Inter row IDCT that reads only coefficient 2 of each row.
 * The constant 8192 rounds the final ">> 14". The residual is mirrored:
 * pixels k and 7-k receive the same residual value.
 *
 * For each of the 8 rows the residual is computed, added to the 8 pixels
 * already stored in rec, and saturated to 0..255. The consumed coefficient is
 * reset to 0.
 *
 * blk : coefficients, 8 per row.
 * rec : on entry the prediction pixels, on exit the reconstructed pixels.
 * lx  : distance in bytes between consecutive rows of rec.
 *
 * Pixels are accessed one byte at a time, so rec needs no particular
 * alignment and the result does not depend on byte order. (The packed 32-bit
 * form this replaces also left-shifted values up to 255 by 24 in a signed
 * int, which is undefined.)
 */
void idct_row0x20Inter(Short *blk, UChar *rec, Int lx)
{
    int32 out[8];
    int32 c2, w2, w6;
    int row, col;

    for (row = 0; row < 8; row++)
    {
        Short *coef = blk + 8 * row;
        UChar *line = rec + row * lx;

        c2 = coef[2];
        coef[2] = 0;

        w6 = (W6 * c2 + 4) >> 3;
        w2 = (W2 * c2 + 4) >> 3;

        out[0] = out[7] = (8192 + w2) >> 14;
        out[1] = out[6] = (8192 + w6) >> 14;
        out[2] = out[5] = (8192 - w6) >> 14;
        out[3] = out[4] = (8192 - w2) >> 14;

        /* add the prediction and saturate */
        for (col = 0; col < 8; col++)
        {
            int pix = (int)out[col] + line[col];
            CLIP_RESULT(pix);
            line[col] = (UChar)pix;
        }
    }
}
708
709 /* Ignoring overflows as idct function expects and uses overflows */
710 __attribute__((no_sanitize("signed-integer-overflow")))
idct_row0x10Inter(Short * blk,UChar * rec,Int lx)711 void idct_row0x10Inter(Short *blk, UChar *rec, Int lx)
712 {
713 int32 x1, x3, x5, x7;
714 int i = 8;
715 uint32 pred_word, dst_word;
716 int res, res2;
717
718 /* preset the offset, such that we can take advantage pre-offset addressing mode */
719 rec -= lx;
720
721 while (i--)
722 {
723 x3 = blk[3];
724 blk[3] = 0;
725 blk += 8;
726
727 x1 = (W3 * x3 + 4) >> 3;
728 x3 = (-W5 * x3 + 4) >> 3;
729
730 x7 = (-181 * (x3 + x1) + 128) >> 8;
731 x5 = (181 * (x3 - x1) + 128) >> 8;
732
733 pred_word = *((uint32*)(rec += lx)); /* read 4 bytes from pred */
734 res = (8192 + x1) >> 14;
735 ADD_AND_CLIP1(res);
736 res2 = (8192 + x7) >> 14;
737 ADD_AND_CLIP2(res2);
738 dst_word = (res2 << 8) | res;
739 res = (8192 + x5) >> 14;
740 ADD_AND_CLIP3(res);
741 dst_word |= (res << 16);
742 res = (8192 + x3) >> 14;
743 ADD_AND_CLIP4(res);
744 dst_word |= (res << 24);
745 *((uint32*)rec) = dst_word; /* save 4 bytes to dst */
746
747 pred_word = *((uint32*)(rec + 4)); /* read 4 bytes from pred */
748 res = (8192 - x3) >> 14;
749 ADD_AND_CLIP1(res);
750 res2 = (8192 - x5) >> 14;
751 ADD_AND_CLIP2(res2);
752 dst_word = (res2 << 8) | res;
753 res = (8192 - x7) >> 14;
754 ADD_AND_CLIP3(res);
755 dst_word |= (res << 16);
756 res = (8192 - x1) >> 14;
757 ADD_AND_CLIP4(res);
758 dst_word |= (res << 24);
759 *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
760 }
761 return ;
762 }
763
764 #endif /* SMALL_DCT */
765
766 /* Ignoring overflows as idct function expects and uses overflows */
767 __attribute__((no_sanitize("signed-integer-overflow")))
idct_rowInter(Short * blk,UChar * rec,Int lx)768 void idct_rowInter(Short *blk, UChar *rec, Int lx)
769 {
770 int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
771 int i = 8;
772 uint32 pred_word, dst_word;
773 int res, res2;
774
775 /* preset the offset, such that we can take advantage pre-offset addressing mode */
776 rec -= lx;
777 blk -= 8;
778
779 while (i--)
780 {
781 x1 = (int32)blk[12] << 8;
782 blk[12] = 0;
783 x2 = blk[14];
784 blk[14] = 0;
785 x3 = blk[10];
786 blk[10] = 0;
787 x4 = blk[9];
788 blk[9] = 0;
789 x5 = blk[15];
790 blk[15] = 0;
791 x6 = blk[13];
792 blk[13] = 0;
793 x7 = blk[11];
794 blk[11] = 0;
795 x0 = ((*(blk += 8)) << 8) + 8192;
796 *blk = 0; /* for proper rounding in the fourth stage */
797
798 /* first stage */
799 x8 = W7 * (x4 + x5) + 4;
800 x4 = (x8 + (W1 - W7) * x4) >> 3;
801 x5 = (x8 - (W1 + W7) * x5) >> 3;
802 x8 = W3 * (x6 + x7) + 4;
803 x6 = (x8 - (W3 - W5) * x6) >> 3;
804 x7 = (x8 - (W3 + W5) * x7) >> 3;
805
806 /* second stage */
807 x8 = x0 + x1;
808 x0 -= x1;
809 x1 = W6 * (x3 + x2) + 4;
810 x2 = (x1 - (W2 + W6) * x2) >> 3;
811 x3 = (x1 + (W2 - W6) * x3) >> 3;
812 x1 = x4 + x6;
813 x4 -= x6;
814 x6 = x5 + x7;
815 x5 -= x7;
816
817 /* third stage */
818 x7 = x8 + x3;
819 x8 -= x3;
820 x3 = x0 + x2;
821 x0 -= x2;
822 x2 = (181 * (x4 + x5) + 128) >> 8;
823 x4 = (181 * (x4 - x5) + 128) >> 8;
824
825 /* fourth stage */
826 pred_word = *((uint32*)(rec += lx)); /* read 4 bytes from pred */
827
828 res = (x7 + x1) >> 14;
829 ADD_AND_CLIP1(res);
830 res2 = (x3 + x2) >> 14;
831 ADD_AND_CLIP2(res2);
832 dst_word = (res2 << 8) | res;
833 res = (x0 + x4) >> 14;
834 ADD_AND_CLIP3(res);
835 dst_word |= (res << 16);
836 res = (x8 + x6) >> 14;
837 ADD_AND_CLIP4(res);
838 dst_word |= (res << 24);
839 *((uint32*)rec) = dst_word; /* save 4 bytes to dst */
840
841 pred_word = *((uint32*)(rec + 4)); /* read 4 bytes from pred */
842
843 res = (x8 - x6) >> 14;
844 ADD_AND_CLIP1(res);
845 res2 = (x0 - x4) >> 14;
846 ADD_AND_CLIP2(res2);
847 dst_word = (res2 << 8) | res;
848 res = (x3 - x2) >> 14;
849 ADD_AND_CLIP3(res);
850 dst_word |= (res << 16);
851 res = (x7 - x1) >> 14;
852 ADD_AND_CLIP4(res);
853 dst_word |= (res << 24);
854 *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
855 }
856 return;
857 }
858
/*
 * Intra row-IDCT variant that does nothing; none of its arguments are used.
 */
void idct_row0Intra(Short *srce, UChar *rec, Int lx)
{
    OSCL_UNUSED_ARG(srce);
    OSCL_UNUSED_ARG(rec);
    OSCL_UNUSED_ARG(lx);
}
869
/*
 * Intra row pass for rows where only the first coefficient is used.
 *
 * For each of the 8 rows, (coefficient 0 + 32) >> 6 is saturated to 0..255
 * and written to all 8 pixels of the row. The consumed coefficient is reset
 * to 0.
 *
 * blk : coefficients, 8 per row; only the first entry of each row is touched.
 * rec : destination pixels (overwritten).
 * lx  : distance in bytes between consecutive rows of rec.
 *
 * Pixels are stored one byte at a time, so rec needs no particular alignment.
 * (Replicating the byte with "tmp << 16" in a signed 32-bit value, as the
 * packed form did, overflows for values >= 0x8000 and is undefined.)
 */
void idct_row1Intra(Short *blk, UChar *rec, Int lx)
{
    int row, col;

    for (row = 0; row < 8; row++)
    {
        Short *coef = blk + 8 * row;
        UChar *line = rec + row * lx;
        int32 dc = (coef[0] + 32) >> 6;

        coef[0] = 0;
        CLIP_RESULT(dc);

        for (col = 0; col < 8; col++)
        {
            line[col] = (UChar)dc;
        }
    }
}
890
891 /* Ignoring overflows as idct function expects and uses overflows */
892 __attribute__((no_sanitize("signed-integer-overflow")))
idct_row2Intra(Short * blk,UChar * rec,Int lx)893 void idct_row2Intra(Short *blk, UChar *rec, Int lx)
894 {
895 int32 x0, x1, x2, x4, x5;
896 int res, res2;
897 uint32 dst_word;
898 int i = 8;
899
900 rec -= lx;
901 blk -= 8;
902 while (i--)
903 {
904 /* shortcut */
905 x4 = blk[9];
906 blk[9] = 0;
907 x0 = ((*(blk += 8)) << 8) + 8192;
908 *blk = 0; /* for proper rounding in the fourth stage */
909
910 /* first stage */
911 x5 = (W7 * x4 + 4) >> 3;
912 x4 = (W1 * x4 + 4) >> 3;
913
914 /* third stage */
915 x2 = (181 * (x4 + x5) + 128) >> 8;
916 x1 = (181 * (x4 - x5) + 128) >> 8;
917
918 /* fourth stage */
919 res = ((x0 + x4) >> 14);
920 CLIP_RESULT(res)
921 res2 = ((x0 + x2) >> 14);
922 CLIP_RESULT(res2)
923 dst_word = (res2 << 8) | res;
924 res = ((x0 + x1) >> 14);
925 CLIP_RESULT(res)
926 dst_word |= (res << 16);
927 res = ((x0 + x5) >> 14);
928 CLIP_RESULT(res)
929 dst_word |= (res << 24);
930 *((uint32*)(rec += lx)) = dst_word;
931
932 res = ((x0 - x5) >> 14);
933 CLIP_RESULT(res)
934 res2 = ((x0 - x1) >> 14);
935 CLIP_RESULT(res2)
936 dst_word = (res2 << 8) | res;
937 res = ((x0 - x2) >> 14);
938 CLIP_RESULT(res)
939 dst_word |= (res << 16);
940 res = ((x0 - x4) >> 14);
941 CLIP_RESULT(res)
942 dst_word |= (res << 24);
943 *((uint32*)(rec + 4)) = dst_word;
944 }
945 return ;
946 }
947
948 /* Ignoring overflows as idct function expects and uses overflows */
949 __attribute__((no_sanitize("signed-integer-overflow")))
idct_row3Intra(Short * blk,UChar * rec,Int lx)950 void idct_row3Intra(Short *blk, UChar *rec, Int lx)
951 {
952 int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
953 int res, res2;
954 uint32 dst_word;
955 int i = 8;
956
957 rec -= lx;
958 blk -= 8;
959 while (i--)
960 {
961 x2 = blk[10];
962 blk[10] = 0;
963 x1 = blk[9];
964 blk[9] = 0;
965 x0 = ((*(blk += 8)) << 8) + 8192;
966 *blk = 0;/* for proper rounding in the fourth stage */
967 /* both upper and lower*/
968 /* both x2orx6 and x0orx4 */
969
970 x4 = x0;
971 x6 = (W6 * x2 + 4) >> 3;
972 x2 = (W2 * x2 + 4) >> 3;
973 x8 = x0 - x2;
974 x0 += x2;
975 x2 = x8;
976 x8 = x4 - x6;
977 x4 += x6;
978 x6 = x8;
979
980 x7 = (W7 * x1 + 4) >> 3;
981 x1 = (W1 * x1 + 4) >> 3;
982 x3 = x7;
983 x5 = (181 * (x1 - x7) + 128) >> 8;
984 x7 = (181 * (x1 + x7) + 128) >> 8;
985
986 res = ((x0 + x1) >> 14);
987 CLIP_RESULT(res)
988 res2 = ((x4 + x7) >> 14);
989 CLIP_RESULT(res2)
990 dst_word = (res2 << 8) | res;
991 res = ((x6 + x5) >> 14);
992 CLIP_RESULT(res)
993 dst_word |= (res << 16);
994 res = ((x2 + x3) >> 14);
995 CLIP_RESULT(res)
996 dst_word |= (res << 24);
997 *((uint32*)(rec += lx)) = dst_word;
998
999 res = ((x2 - x3) >> 14);
1000 CLIP_RESULT(res)
1001 res2 = ((x6 - x5) >> 14);
1002 CLIP_RESULT(res2)
1003 dst_word = (res2 << 8) | res;
1004 res = ((x4 - x7) >> 14);
1005 CLIP_RESULT(res)
1006 dst_word |= (res << 16);
1007 res = ((x0 - x1) >> 14);
1008 CLIP_RESULT(res)
1009 dst_word |= (res << 24);
1010 *((uint32*)(rec + 4)) = dst_word;
1011
1012 }
1013 return ;
1014 }
1015
1016 /* Ignoring overflows as idct function expects and uses overflows */
1017 __attribute__((no_sanitize("signed-integer-overflow")))
idct_row4Intra(Short * blk,UChar * rec,Int lx)1018 void idct_row4Intra(Short *blk, UChar *rec, Int lx)
1019 {
1020 int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
1021 int res, res2;
1022 uint32 dst_word;
1023 int i = 8;
1024
1025 rec -= lx;
1026 blk -= 8;
1027 while (i--)
1028 {
1029 x2 = blk[10];
1030 blk[10] = 0;
1031 x1 = blk[9];
1032 blk[9] = 0;
1033 x3 = blk[11];
1034 blk[11] = 0;
1035 x0 = ((*(blk += 8)) << 8) + 8192;
1036 *blk = 0; /* for proper rounding in the fourth stage */
1037
1038 x4 = x0;
1039 x6 = (W6 * x2 + 4) >> 3;
1040 x2 = (W2 * x2 + 4) >> 3;
1041 x8 = x0 - x2;
1042 x0 += x2;
1043 x2 = x8;
1044 x8 = x4 - x6;
1045 x4 += x6;
1046 x6 = x8;
1047
1048 x7 = (W7 * x1 + 4) >> 3;
1049 x1 = (W1 * x1 + 4) >> 3;
1050 x5 = (W3 * x3 + 4) >> 3;
1051 x3 = (- W5 * x3 + 4) >> 3;
1052 x8 = x1 - x5;
1053 x1 += x5;
1054 x5 = x8;
1055 x8 = x7 - x3;
1056 x3 += x7;
1057 x7 = (181 * (x5 + x8) + 128) >> 8;
1058 x5 = (181 * (x5 - x8) + 128) >> 8;
1059
1060 res = ((x0 + x1) >> 14);
1061 CLIP_RESULT(res)
1062 res2 = ((x4 + x7) >> 14);
1063 CLIP_RESULT(res2)
1064 dst_word = (res2 << 8) | res;
1065 res = ((x6 + x5) >> 14);
1066 CLIP_RESULT(res)
1067 dst_word |= (res << 16);
1068 res = ((x2 + x3) >> 14);
1069 CLIP_RESULT(res)
1070 dst_word |= (res << 24);
1071 *((uint32*)(rec += lx)) = dst_word;
1072
1073 res = ((x2 - x3) >> 14);
1074 CLIP_RESULT(res)
1075 res2 = ((x6 - x5) >> 14);
1076 CLIP_RESULT(res2)
1077 dst_word = (res2 << 8) | res;
1078 res = ((x4 - x7) >> 14);
1079 CLIP_RESULT(res)
1080 dst_word |= (res << 16);
1081 res = ((x0 - x1) >> 14);
1082 CLIP_RESULT(res)
1083 dst_word |= (res << 24);
1084 *((uint32*)(rec + 4)) = dst_word;
1085 }
1086
1087 return ;
1088 }
1089
1090 #ifndef SMALL_DCT
1091 /* Ignoring overflows as idct function expects and uses overflows */
1092 __attribute__((no_sanitize("signed-integer-overflow")))
idct_row0x40Intra(Short * blk,UChar * rec,Int lx)1093 void idct_row0x40Intra(Short *blk, UChar *rec, Int lx)
1094 {
1095 int32 x1, x2, x4, x5;
1096 int res, res2;
1097 uint32 dst_word;
1098 int i = 8;
1099
1100 rec -= lx;
1101
1102 while (i--)
1103 {
1104 /* shortcut */
1105 x4 = blk[1];
1106 blk[1] = 0;
1107 blk += 8;
1108
1109 /* first stage */
1110 x5 = (W7 * x4 + 4) >> 3;
1111 x4 = (W1 * x4 + 4) >> 3;
1112
1113 /* third stage */
1114 x2 = (181 * (x4 + x5) + 128) >> 8;
1115 x1 = (181 * (x4 - x5) + 128) >> 8;
1116
1117 /* fourth stage */
1118 res = ((8192 + x4) >> 14);
1119 CLIP_RESULT(res)
1120 res2 = ((8192 + x2) >> 14);
1121 CLIP_RESULT(res2)
1122 dst_word = (res2 << 8) | res;
1123 res = ((8192 + x1) >> 14);
1124 CLIP_RESULT(res)
1125 dst_word |= (res << 16);
1126 res = ((8192 + x5) >> 14);
1127 CLIP_RESULT(res)
1128 dst_word |= (res << 24);
1129 *((uint32*)(rec += lx)) = dst_word;
1130
1131 res = ((8192 - x5) >> 14);
1132 CLIP_RESULT(res)
1133 res2 = ((8192 - x1) >> 14);
1134 CLIP_RESULT(res2)
1135 dst_word = (res2 << 8) | res;
1136 res = ((8192 - x2) >> 14);
1137 CLIP_RESULT(res)
1138 dst_word |= (res << 16);
1139 res = ((8192 - x4) >> 14);
1140 CLIP_RESULT(res)
1141 dst_word |= (res << 24);
1142 *((uint32*)(rec + 4)) = dst_word;
1143
1144 }
1145 return ;
1146 }
1147
/*
 * Intra row IDCT that reads only coefficient 2 of each row.
 * The constant 8192 rounds the final ">> 14". The output is mirrored:
 * pixels k and 7-k receive the same value.
 *
 * For each of the 8 rows the output values are computed, saturated to
 * 0..255 and written to rec. The consumed coefficient is reset to 0.
 *
 * blk : coefficients, 8 per row.
 * rec : destination pixels (overwritten).
 * lx  : distance in bytes between consecutive rows of rec.
 *
 * Pixels are stored one byte at a time, so rec needs no particular alignment
 * and the result does not depend on byte order. (The packed 32-bit form this
 * replaces also left-shifted values up to 255 by 24 in a signed int, which is
 * undefined.)
 */
void idct_row0x20Intra(Short *blk, UChar *rec, Int lx)
{
    int32 out[8];
    int32 c2, w2, w6;
    int row, col;

    for (row = 0; row < 8; row++)
    {
        Short *coef = blk + 8 * row;
        UChar *line = rec + row * lx;

        c2 = coef[2];
        coef[2] = 0;

        w6 = (W6 * c2 + 4) >> 3;
        w2 = (W2 * c2 + 4) >> 3;

        out[0] = out[7] = (8192 + w2) >> 14;
        out[1] = out[6] = (8192 + w6) >> 14;
        out[2] = out[5] = (8192 - w6) >> 14;
        out[3] = out[4] = (8192 - w2) >> 14;

        /* saturate and store */
        for (col = 0; col < 8; col++)
        {
            int pix = (int)out[col];
            CLIP_RESULT(pix);
            line[col] = (UChar)pix;
        }
    }
}
1200
1201 /* Ignoring overflows as idct function expects and uses overflows */
1202 __attribute__((no_sanitize("signed-integer-overflow")))
idct_row0x10Intra(Short * blk,UChar * rec,Int lx)1203 void idct_row0x10Intra(Short *blk, UChar *rec, Int lx)
1204 {
1205 int32 x1, x3, x5, x7;
1206 int res, res2;
1207 uint32 dst_word;
1208 int i = 8;
1209
1210 rec -= lx;
1211 while (i--)
1212 {
1213 x3 = blk[3];
1214 blk[3] = 0 ;
1215 blk += 8;
1216
1217 x1 = (W3 * x3 + 4) >> 3;
1218 x3 = (W5 * x3 + 4) >> 3;
1219
1220 x7 = (181 * (x3 - x1) + 128) >> 8;
1221 x5 = (-181 * (x1 + x3) + 128) >> 8;
1222
1223 res = ((8192 + x1) >> 14);
1224 CLIP_RESULT(res)
1225 res2 = ((8192 + x7) >> 14);
1226 CLIP_RESULT(res2)
1227 dst_word = (res2 << 8) | res;
1228 res = ((8192 + x5) >> 14);
1229 CLIP_RESULT(res)
1230 dst_word |= (res << 16);
1231 res = ((8192 - x3) >> 14);
1232 CLIP_RESULT(res)
1233 dst_word |= (res << 24);
1234 *((uint32*)(rec += lx)) = dst_word;
1235
1236 res = ((8192 + x3) >> 14);
1237 CLIP_RESULT(res)
1238 res2 = ((8192 - x5) >> 14);
1239 CLIP_RESULT(res2)
1240 dst_word = (res2 << 8) | res;
1241 res = ((8192 - x7) >> 14);
1242 CLIP_RESULT(res)
1243 dst_word |= (res << 16);
1244 res = ((8192 - x1) >> 14);
1245 CLIP_RESULT(res)
1246 dst_word |= (res << 24);
1247 *((uint32*)(rec + 4)) = dst_word;
1248
1249 }
1250
1251 return ;
1252 }
1253
1254 #endif /* SMALL_DCT */
1255 /* Ignoring overflows as idct function expects and uses overflows */
1256 __attribute__((no_sanitize("signed-integer-overflow")))
idct_rowIntra(Short * blk,UChar * rec,Int lx)1257 void idct_rowIntra(Short *blk, UChar *rec, Int lx)
1258 {
1259 int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
1260 int i = 8;
1261 int res, res2;
1262 uint32 dst_word;
1263
1264 blk -= 8;
1265 rec -= lx;
1266
1267 while (i--)
1268 {
1269 x1 = (int32)blk[12] << 8;
1270 blk[12] = 0;
1271 x2 = blk[14];
1272 blk[14] = 0;
1273 x3 = blk[10];
1274 blk[10] = 0;
1275 x4 = blk[9];
1276 blk[9] = 0;
1277 x5 = blk[15];
1278 blk[15] = 0;
1279 x6 = blk[13];
1280 blk[13] = 0;
1281 x7 = blk[11];
1282 blk[11] = 0;
1283 x0 = ((*(blk += 8)) << 8) + 8192;
1284 *blk = 0; /* for proper rounding in the fourth stage */
1285
1286 /* first stage */
1287 x8 = W7 * (x4 + x5) + 4;
1288 x4 = (x8 + (W1 - W7) * x4) >> 3;
1289 x5 = (x8 - (W1 + W7) * x5) >> 3;
1290 x8 = W3 * (x6 + x7) + 4;
1291 x6 = (x8 - (W3 - W5) * x6) >> 3;
1292 x7 = (x8 - (W3 + W5) * x7) >> 3;
1293
1294 /* second stage */
1295 x8 = x0 + x1;
1296 x0 -= x1;
1297 x1 = W6 * (x3 + x2) + 4;
1298 x2 = (x1 - (W2 + W6) * x2) >> 3;
1299 x3 = (x1 + (W2 - W6) * x3) >> 3;
1300 x1 = x4 + x6;
1301 x4 -= x6;
1302 x6 = x5 + x7;
1303 x5 -= x7;
1304
1305 /* third stage */
1306 x7 = x8 + x3;
1307 x8 -= x3;
1308 x3 = x0 + x2;
1309 x0 -= x2;
1310 x2 = (181 * (x4 + x5) + 128) >> 8;
1311 x4 = (181 * (x4 - x5) + 128) >> 8;
1312
1313 /* fourth stage */
1314 res = ((x7 + x1) >> 14);
1315 CLIP_RESULT(res)
1316 res2 = ((x3 + x2) >> 14);
1317 CLIP_RESULT(res2)
1318 dst_word = res | (res2 << 8);
1319 res = ((x0 + x4) >> 14);
1320 CLIP_RESULT(res)
1321 dst_word |= (res << 16);
1322 res = ((x8 + x6) >> 14);
1323 CLIP_RESULT(res)
1324 dst_word |= (res << 24);
1325 *((uint32*)(rec += lx)) = dst_word;
1326
1327 res = ((x8 - x6) >> 14);
1328 CLIP_RESULT(res)
1329 res2 = ((x0 - x4) >> 14);
1330 CLIP_RESULT(res2)
1331 dst_word = res | (res2 << 8);
1332 res = ((x3 - x2) >> 14);
1333 CLIP_RESULT(res)
1334 dst_word |= (res << 16);
1335 res = ((x7 - x1) >> 14);
1336 CLIP_RESULT(res)
1337 dst_word |= (res << 24);
1338 *((uint32*)(rec + 4)) = dst_word;
1339 }
1340 return;
1341 }
1342
1343
1344 /* This function should not be called at all ****/
idct_row0zmv(Short * srce,UChar * rec,UChar * pred,Int lx)1345 void idct_row0zmv(Short *srce, UChar *rec, UChar *pred, Int lx)
1346 {
1347 OSCL_UNUSED_ARG(srce);
1348 OSCL_UNUSED_ARG(rec);
1349 OSCL_UNUSED_ARG(pred);
1350 OSCL_UNUSED_ARG(lx);
1351
1352 return;
1353 }
1354
/*
 * Inter row pass for blocks in which only coefficient 0 of each row is read.
 *
 * For each of the 8 rows: takes coefficient 0 (and clears it), rounds it to
 * (c + 32) >> 6, adds that value to each of the 8 prediction bytes of the
 * row, clips every sum to [0, 255] with CLIP_RESULT and stores the 8 bytes
 * as two 32-bit words at rec + row * lx.
 *
 *   blk  : coefficient block, 8 rows of 8 Shorts; entry 0 of every row is zeroed.
 *   rec  : destination; 8 bytes are written per row.
 *   pred : prediction; 8 bytes are read per row, row stride 16 bytes.
 *   lx   : destination row stride in bytes.
 *
 * Rows are addressed from the base pointers, so no pointer is formed before
 * the start of blk, rec or pred. The prediction byte is converted to int
 * before the addition, and the result bytes are packed in unsigned
 * arithmetic, because shifting an int holding 128..255 left by 24 overflows.
 *
 * NOTE(review): the word accesses assume rec and pred are 4-byte aligned --
 * confirm against the callers.
 */
void idct_row1zmv(Short *blk, UChar *rec, UChar *pred, Int lx)
{
    int row, half, shift;

    for (row = 0; row < 8; row++)
    {
        Short *coef = blk + 8 * row;
        UChar *src = pred + 16 * row;
        UChar *dst = rec + row * lx;
        int tmp = (coef[0] + 32) >> 6;

        coef[0] = 0;

        /* two words of four bytes each */
        for (half = 0; half < 2; half++)
        {
            uint32 pred_word = *((uint32*)(src + 4 * half)); /* read 4 bytes from pred */
            uint32 dst_word = 0;

            for (shift = 0; shift < 32; shift += 8)
            {
                int res = tmp + (int)((pred_word >> shift) & 0xFF);
                CLIP_RESULT(res);
                dst_word |= (uint32)res << shift;
            }
            *((uint32*)(dst + 4 * half)) = dst_word; /* save 4 bytes to dst */
        }
    }
}
1402
1403 /* Ignoring overflows as idct function expects and uses overflows */
1404 __attribute__((no_sanitize("signed-integer-overflow")))
idct_row2zmv(Short * blk,UChar * rec,UChar * pred,Int lx)1405 void idct_row2zmv(Short *blk, UChar *rec, UChar *pred, Int lx)
1406 {
1407 int32 x0, x1, x2, x4, x5;
1408 int i = 8;
1409 uint32 pred_word, dst_word;
1410 int res, res2;
1411
1412 /* preset the offset, such that we can take advantage pre-offset addressing mode */
1413 rec -= lx;
1414 pred -= 16;
1415 blk -= 8;
1416
1417 while (i--)
1418 {
1419 /* shortcut */
1420 x4 = blk[9];
1421 blk[9] = 0;
1422 x0 = ((*(blk += 8)) << 8) + 8192;
1423 *blk = 0; /* for proper rounding in the fourth stage */
1424
1425 /* first stage */
1426 x5 = (W7 * x4 + 4) >> 3;
1427 x4 = (W1 * x4 + 4) >> 3;
1428
1429 /* third stage */
1430 x2 = (181 * (x4 + x5) + 128) >> 8;
1431 x1 = (181 * (x4 - x5) + 128) >> 8;
1432
1433 /* fourth stage */
1434 pred_word = *((uint32*)(pred += 16)); /* read 4 bytes from pred */
1435 res = (x0 + x4) >> 14;
1436 ADD_AND_CLIP1(res);
1437 res2 = (x0 + x2) >> 14;
1438 ADD_AND_CLIP2(res2);
1439 dst_word = (res2 << 8) | res;
1440 res = (x0 + x1) >> 14;
1441 ADD_AND_CLIP3(res);
1442 dst_word |= (res << 16);
1443 res = (x0 + x5) >> 14;
1444 ADD_AND_CLIP4(res);
1445 dst_word |= (res << 24);
1446 *((uint32*)(rec += lx)) = dst_word; /* save 4 bytes to dst */
1447
1448 pred_word = *((uint32*)(pred + 4)); /* read 4 bytes from pred */
1449 res = (x0 - x5) >> 14;
1450 ADD_AND_CLIP1(res);
1451 res2 = (x0 - x1) >> 14;
1452 ADD_AND_CLIP2(res2);
1453 dst_word = (res2 << 8) | res;
1454 res = (x0 - x2) >> 14;
1455 ADD_AND_CLIP3(res);
1456 dst_word |= (res << 16);
1457 res = (x0 - x4) >> 14;
1458 ADD_AND_CLIP4(res);
1459 dst_word |= (res << 24);
1460 *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
1461 }
1462 return ;
1463 }
1464
1465 /* Ignoring overflows as idct function expects and uses overflows */
1466 __attribute__((no_sanitize("signed-integer-overflow")))
idct_row3zmv(Short * blk,UChar * rec,UChar * pred,Int lx)1467 void idct_row3zmv(Short *blk, UChar *rec, UChar *pred, Int lx)
1468 {
1469 int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
1470 int i = 8;
1471 uint32 pred_word, dst_word;
1472 int res, res2;
1473
1474 /* preset the offset, such that we can take advantage pre-offset addressing mode */
1475 rec -= lx;
1476 pred -= 16;
1477 blk -= 8;
1478
1479 while (i--)
1480 {
1481 x2 = blk[10];
1482 blk[10] = 0;
1483 x1 = blk[9];
1484 blk[9] = 0;
1485 x0 = ((*(blk += 8)) << 8) + 8192;
1486 *blk = 0; /* for proper rounding in the fourth stage */
1487 /* both upper and lower*/
1488 /* both x2orx6 and x0orx4 */
1489
1490 x4 = x0;
1491 x6 = (W6 * x2 + 4) >> 3;
1492 x2 = (W2 * x2 + 4) >> 3;
1493 x8 = x0 - x2;
1494 x0 += x2;
1495 x2 = x8;
1496 x8 = x4 - x6;
1497 x4 += x6;
1498 x6 = x8;
1499
1500 x7 = (W7 * x1 + 4) >> 3;
1501 x1 = (W1 * x1 + 4) >> 3;
1502 x3 = x7;
1503 x5 = (181 * (x1 - x7) + 128) >> 8;
1504 x7 = (181 * (x1 + x7) + 128) >> 8;
1505
1506 pred_word = *((uint32*)(pred += 16)); /* read 4 bytes from pred */
1507 res = (x0 + x1) >> 14;
1508 ADD_AND_CLIP1(res);
1509 res2 = (x4 + x7) >> 14;
1510 ADD_AND_CLIP2(res2);
1511 dst_word = (res2 << 8) | res;
1512 res = (x6 + x5) >> 14;
1513 ADD_AND_CLIP3(res);
1514 dst_word |= (res << 16);
1515 res = (x2 + x3) >> 14;
1516 ADD_AND_CLIP4(res);
1517 dst_word |= (res << 24);
1518 *((uint32*)(rec += lx)) = dst_word; /* save 4 bytes to dst */
1519
1520 pred_word = *((uint32*)(pred + 4)); /* read 4 bytes from pred */
1521 res = (x2 - x3) >> 14;
1522 ADD_AND_CLIP1(res);
1523 res2 = (x6 - x5) >> 14;
1524 ADD_AND_CLIP2(res2);
1525 dst_word = (res2 << 8) | res;
1526 res = (x4 - x7) >> 14;
1527 ADD_AND_CLIP3(res);
1528 dst_word |= (res << 16);
1529 res = (x0 - x1) >> 14;
1530 ADD_AND_CLIP4(res);
1531 dst_word |= (res << 24);
1532 *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
1533 }
1534
1535 return ;
1536 }
1537
1538 /* Ignoring overflows as idct function expects and uses overflows */
1539 __attribute__((no_sanitize("signed-integer-overflow")))
idct_row4zmv(Short * blk,UChar * rec,UChar * pred,Int lx)1540 void idct_row4zmv(Short *blk, UChar *rec, UChar *pred, Int lx)
1541 {
1542 int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
1543 int i = 8;
1544 uint32 pred_word, dst_word;
1545 int res, res2;
1546
1547 /* preset the offset, such that we can take advantage pre-offset addressing mode */
1548 rec -= lx;
1549 pred -= 16;
1550 blk -= 8;
1551
1552 while (i--)
1553 {
1554 x2 = blk[10];
1555 blk[10] = 0;
1556 x1 = blk[9];
1557 blk[9] = 0;
1558 x3 = blk[11];
1559 blk[11] = 0;
1560 x0 = ((*(blk += 8)) << 8) + 8192;
1561 *blk = 0; /* for proper rounding in the fourth stage */
1562
1563 x4 = x0;
1564 x6 = (W6 * x2 + 4) >> 3;
1565 x2 = (W2 * x2 + 4) >> 3;
1566 x8 = x0 - x2;
1567 x0 += x2;
1568 x2 = x8;
1569 x8 = x4 - x6;
1570 x4 += x6;
1571 x6 = x8;
1572
1573 x7 = (W7 * x1 + 4) >> 3;
1574 x1 = (W1 * x1 + 4) >> 3;
1575 x5 = (W3 * x3 + 4) >> 3;
1576 x3 = (- W5 * x3 + 4) >> 3;
1577 x8 = x1 - x5;
1578 x1 += x5;
1579 x5 = x8;
1580 x8 = x7 - x3;
1581 x3 += x7;
1582 x7 = (181 * (x5 + x8) + 128) >> 8;
1583 x5 = (181 * (x5 - x8) + 128) >> 8;
1584
1585 pred_word = *((uint32*)(pred += 16)); /* read 4 bytes from pred */
1586 res = (x0 + x1) >> 14;
1587 ADD_AND_CLIP1(res);
1588 res2 = (x4 + x7) >> 14;
1589 ADD_AND_CLIP2(res2);
1590 dst_word = (res2 << 8) | res;
1591 res = (x6 + x5) >> 14;
1592 ADD_AND_CLIP3(res);
1593 dst_word |= (res << 16);
1594 res = (x2 + x3) >> 14;
1595 ADD_AND_CLIP4(res);
1596 dst_word |= (res << 24);
1597 *((uint32*)(rec += lx)) = dst_word; /* save 4 bytes to dst */
1598
1599 pred_word = *((uint32*)(pred + 4)); /* read 4 bytes from pred */
1600 res = (x2 - x3) >> 14;
1601 ADD_AND_CLIP1(res);
1602 res2 = (x6 - x5) >> 14;
1603 ADD_AND_CLIP2(res2);
1604 dst_word = (res2 << 8) | res;
1605 res = (x4 - x7) >> 14;
1606 ADD_AND_CLIP3(res);
1607 dst_word |= (res << 16);
1608 res = (x0 - x1) >> 14;
1609 ADD_AND_CLIP4(res);
1610 dst_word |= (res << 24);
1611 *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
1612 }
1613 return ;
1614 }
1615
1616 #ifndef SMALL_DCT
1617 /* Ignoring overflows as idct function expects and uses overflows */
1618 __attribute__((no_sanitize("signed-integer-overflow")))
idct_row0x40zmv(Short * blk,UChar * rec,UChar * pred,Int lx)1619 void idct_row0x40zmv(Short *blk, UChar *rec, UChar *pred, Int lx)
1620 {
1621 int32 x1, x2, x4, x5;
1622 int i = 8;
1623 uint32 pred_word, dst_word;
1624 int res, res2;
1625
1626 /* preset the offset, such that we can take advantage pre-offset addressing mode */
1627 rec -= lx;
1628 pred -= 16;
1629
1630 while (i--)
1631 {
1632 /* shortcut */
1633 x4 = blk[1];
1634 blk[1] = 0;
1635 blk += 8; /* for proper rounding in the fourth stage */
1636
1637 /* first stage */
1638 x5 = (W7 * x4 + 4) >> 3;
1639 x4 = (W1 * x4 + 4) >> 3;
1640
1641 /* third stage */
1642 x2 = (181 * (x4 + x5) + 128) >> 8;
1643 x1 = (181 * (x4 - x5) + 128) >> 8;
1644
1645 /* fourth stage */
1646 pred_word = *((uint32*)(pred += 16)); /* read 4 bytes from pred */
1647 res = (8192 + x4) >> 14;
1648 ADD_AND_CLIP1(res);
1649 res2 = (8192 + x2) >> 14;
1650 ADD_AND_CLIP2(res2);
1651 dst_word = (res2 << 8) | res;
1652 res = (8192 + x1) >> 14;
1653 ADD_AND_CLIP3(res);
1654 dst_word |= (res << 16);
1655 res = (8192 + x5) >> 14;
1656 ADD_AND_CLIP4(res);
1657 dst_word |= (res << 24);
1658 *((uint32*)(rec += lx)) = dst_word; /* save 4 bytes to dst */
1659
1660 pred_word = *((uint32*)(pred + 4)); /* read 4 bytes from pred */
1661 res = (8192 - x5) >> 14;
1662 ADD_AND_CLIP1(res);
1663 res2 = (8192 - x1) >> 14;
1664 ADD_AND_CLIP2(res2);
1665 dst_word = (res2 << 8) | res;
1666 res = (8192 - x2) >> 14;
1667 ADD_AND_CLIP3(res);
1668 dst_word |= (res << 16);
1669 res = (8192 - x4) >> 14;
1670 ADD_AND_CLIP4(res);
1671 dst_word |= (res << 24);
1672 *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
1673 }
1674 return ;
1675 }
1676
/*
 * Inter row pass for blocks in which only coefficient 2 of each row is read
 * (coefficient 0 is not read; the constant 8192 stands in for the DC term).
 *
 * For each of the 8 rows: reads (and clears) coefficient 2, derives the four
 * distinct output values, combines each of the eight results with the
 * prediction through ADD_AND_CLIP1..4 and stores them as two 32-bit words at
 * rec + row * lx. The second word uses the same four values in reverse order.
 *
 *   blk  : coefficient block, 8 rows of 8 Shorts; entry 2 of every row is zeroed.
 *   rec  : destination; 8 bytes are written per row.
 *   pred : prediction; 8 bytes are read per row, row stride 16 bytes.
 *   lx   : destination row stride in bytes.
 *
 * ADD_AND_CLIPn are defined elsewhere in this file; presumably they add byte
 * n of the local `pred_word` and saturate to [0, 255] -- the variable name
 * must therefore stay `pred_word`.
 *
 * Bytes are packed in unsigned arithmetic because shifting an int holding
 * 128..255 left by 24 overflows. Rows are addressed from the base pointers,
 * so no pointer is formed before the start of a buffer.
 *
 * NOTE(review): the word accesses assume rec and pred are 4-byte aligned,
 * and the sample-to-byte order presumably targets little-endian hosts.
 */
void idct_row0x20zmv(Short *blk, UChar *rec, UChar *pred, Int lx)
{
    int32 x0, x2, x4, x6;
    uint32 pred_word, dst_word;
    int res, res2;
    int row;

    for (row = 0; row < 8; row++)
    {
        Short *coef = blk + 8 * row;
        UChar *src = pred + 16 * row;
        UChar *dst = rec + row * lx;

        x2 = coef[2];
        coef[2] = 0;

        /* both upper and lower */
        x6 = (W6 * x2 + 4) >> 3;
        x2 = (W2 * x2 + 4) >> 3;
        x0 = 8192 + x2;
        x2 = 8192 - x2;
        x4 = 8192 + x6;
        x6 = 8192 - x6;

        /* samples 0..3 */
        pred_word = *((uint32*)src); /* read 4 bytes from pred */
        res = x0 >> 14;
        ADD_AND_CLIP1(res);
        res2 = x4 >> 14;
        ADD_AND_CLIP2(res2);
        dst_word = ((uint32)res2 << 8) | (uint32)res;
        res = x6 >> 14;
        ADD_AND_CLIP3(res);
        dst_word |= (uint32)res << 16;
        res = x2 >> 14;
        ADD_AND_CLIP4(res);
        dst_word |= (uint32)res << 24;
        *((uint32*)dst) = dst_word; /* save 4 bytes to dst */

        /* samples 4..7 */
        pred_word = *((uint32*)(src + 4)); /* read 4 bytes from pred */
        res = x2 >> 14;
        ADD_AND_CLIP1(res);
        res2 = x6 >> 14;
        ADD_AND_CLIP2(res2);
        dst_word = ((uint32)res2 << 8) | (uint32)res;
        res = x4 >> 14;
        ADD_AND_CLIP3(res);
        dst_word |= (uint32)res << 16;
        res = x0 >> 14;
        ADD_AND_CLIP4(res);
        dst_word |= (uint32)res << 24;
        *((uint32*)(dst + 4)) = dst_word; /* save 4 bytes to dst */
    }
}
1733
1734 /* Ignoring overflows as idct function expects and uses overflows */
1735 __attribute__((no_sanitize("signed-integer-overflow")))
idct_row0x10zmv(Short * blk,UChar * rec,UChar * pred,Int lx)1736 void idct_row0x10zmv(Short *blk, UChar *rec, UChar *pred, Int lx)
1737 {
1738 int32 x1, x3, x5, x7;
1739 int i = 8;
1740 uint32 pred_word, dst_word;
1741 int res, res2;
1742
1743 /* preset the offset, such that we can take advantage pre-offset addressing mode */
1744 rec -= lx;
1745 pred -= 16;
1746
1747 while (i--)
1748 {
1749 x3 = blk[3];
1750 blk[3] = 0;
1751 blk += 8;
1752
1753 x1 = (W3 * x3 + 4) >> 3;
1754 x3 = (-W5 * x3 + 4) >> 3;
1755
1756 x7 = (-181 * (x3 + x1) + 128) >> 8;
1757 x5 = (181 * (x3 - x1) + 128) >> 8;
1758
1759 pred_word = *((uint32*)(pred += 16)); /* read 4 bytes from pred */
1760 res = (8192 + x1) >> 14;
1761 ADD_AND_CLIP1(res);
1762 res2 = (8192 + x7) >> 14;
1763 ADD_AND_CLIP2(res2);
1764 dst_word = (res2 << 8) | res;
1765 res = (8192 + x5) >> 14;
1766 ADD_AND_CLIP3(res);
1767 dst_word |= (res << 16);
1768 res = (8192 + x3) >> 14;
1769 ADD_AND_CLIP4(res);
1770 dst_word |= (res << 24);
1771 *((uint32*)(rec += lx)) = dst_word; /* save 4 bytes to dst */
1772
1773 pred_word = *((uint32*)(pred + 4)); /* read 4 bytes from pred */
1774 res = (8192 - x3) >> 14;
1775 ADD_AND_CLIP1(res);
1776 res2 = (8192 - x5) >> 14;
1777 ADD_AND_CLIP2(res2);
1778 dst_word = (res2 << 8) | res;
1779 res = (8192 - x7) >> 14;
1780 ADD_AND_CLIP3(res);
1781 dst_word |= (res << 16);
1782 res = (8192 - x1) >> 14;
1783 ADD_AND_CLIP4(res);
1784 dst_word |= (res << 24);
1785 *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
1786 }
1787 return ;
1788 }
1789
1790 #endif /* SMALL_DCT */
1791
1792 /* Ignoring overflows as idct function expects and uses overflows */
1793 __attribute__((no_sanitize("signed-integer-overflow")))
idct_rowzmv(Short * blk,UChar * rec,UChar * pred,Int lx)1794 void idct_rowzmv(Short *blk, UChar *rec, UChar *pred, Int lx)
1795 {
1796 int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
1797 int i = 8;
1798 uint32 pred_word, dst_word;
1799 int res, res2;
1800
1801 /* preset the offset, such that we can take advantage pre-offset addressing mode */
1802 rec -= lx;
1803 pred -= 16;
1804 blk -= 8;
1805
1806 while (i--)
1807 {
1808 x1 = (int32)blk[12] << 8;
1809 blk[12] = 0;
1810 x2 = blk[14];
1811 blk[14] = 0;
1812 x3 = blk[10];
1813 blk[10] = 0;
1814 x4 = blk[9];
1815 blk[9] = 0;
1816 x5 = blk[15];
1817 blk[15] = 0;
1818 x6 = blk[13];
1819 blk[13] = 0;
1820 x7 = blk[11];
1821 blk[11] = 0;
1822 x0 = ((*(blk += 8)) << 8) + 8192;
1823 *blk = 0; /* for proper rounding in the fourth stage */
1824
1825 /* first stage */
1826 x8 = W7 * (x4 + x5) + 4;
1827 x4 = (x8 + (W1 - W7) * x4) >> 3;
1828 x5 = (x8 - (W1 + W7) * x5) >> 3;
1829 x8 = W3 * (x6 + x7) + 4;
1830 x6 = (x8 - (W3 - W5) * x6) >> 3;
1831 x7 = (x8 - (W3 + W5) * x7) >> 3;
1832
1833 /* second stage */
1834 x8 = x0 + x1;
1835 x0 -= x1;
1836 x1 = W6 * (x3 + x2) + 4;
1837 x2 = (x1 - (W2 + W6) * x2) >> 3;
1838 x3 = (x1 + (W2 - W6) * x3) >> 3;
1839 x1 = x4 + x6;
1840 x4 -= x6;
1841 x6 = x5 + x7;
1842 x5 -= x7;
1843
1844 /* third stage */
1845 x7 = x8 + x3;
1846 x8 -= x3;
1847 x3 = x0 + x2;
1848 x0 -= x2;
1849 x2 = (181 * (x4 + x5) + 128) >> 8;
1850 x4 = (181 * (x4 - x5) + 128) >> 8;
1851
1852 /* fourth stage */
1853 pred_word = *((uint32*)(pred += 16)); /* read 4 bytes from pred */
1854
1855 res = (x7 + x1) >> 14;
1856 ADD_AND_CLIP1(res);
1857 res2 = (x3 + x2) >> 14;
1858 ADD_AND_CLIP2(res2);
1859 dst_word = (res2 << 8) | res;
1860 res = (x0 + x4) >> 14;
1861 ADD_AND_CLIP3(res);
1862 dst_word |= (res << 16);
1863 res = (x8 + x6) >> 14;
1864 ADD_AND_CLIP4(res);
1865 dst_word |= (res << 24);
1866 *((uint32*)(rec += lx)) = dst_word; /* save 4 bytes to dst */
1867
1868 pred_word = *((uint32*)(pred + 4)); /* read 4 bytes from pred */
1869
1870 res = (x8 - x6) >> 14;
1871 ADD_AND_CLIP1(res);
1872 res2 = (x0 - x4) >> 14;
1873 ADD_AND_CLIP2(res2);
1874 dst_word = (res2 << 8) | res;
1875 res = (x3 - x2) >> 14;
1876 ADD_AND_CLIP3(res);
1877 dst_word |= (res << 16);
1878 res = (x7 - x1) >> 14;
1879 ADD_AND_CLIP4(res);
1880 dst_word |= (res << 24);
1881 *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
1882 }
1883 return;
1884 }
1885
1886 /*----------------------------------------------------------------------------
1887 ; End Function: idctcol
1888 ----------------------------------------------------------------------------*/
1889 /* ======================================================================== */
1890 /* Function : BlockIDCTMotionComp */
1891 /* Date : 10/16/2000 */
1892 /* Purpose : fast IDCT routine */
1893 /* In/out : */
1894 /* Int* coeff_in Dequantized coefficient
1895 Int block_out output IDCT coefficient
1896 Int maxval clip value */
1897 /* Modified : 7/31/01, add checking for all-zero and DC-only block. */
1898 /* do 8 columns at a time */
1899 /* 8/2/01, do column first then row-IDCT. */
1900 /* 8/2/01, remove clipping (included in motion comp). */
1901 /* 8/7/01, combine with motion comp. */
1902 /* 8/8/01, use AAN IDCT */
1903 /* 9/4/05, use Chen's IDCT and 16 bit block */
1904 /* ======================================================================== */
BlockIDCTMotionComp(Short * block,UChar * bitmapcol,UChar bitmaprow,Int dctMode,UChar * rec,UChar * pred,Int lx_intra)1905 void BlockIDCTMotionComp(Short *block, UChar *bitmapcol, UChar bitmaprow,
1906 Int dctMode, UChar *rec, UChar *pred, Int lx_intra)
1907 {
1908 Int i;
1909 Int tmp, tmp2;
1910 ULong tmp4;
1911 Int bmap;
1912 Short *ptr = block;
1913 UChar *endcol;
1914 UInt mask = 0xFF;
1915 Int lx = lx_intra >> 1;
1916 Int intra = (lx_intra & 1);
1917
1918 /* all-zero block */
1919 if (dctMode == 0 || bitmaprow == 0)
1920 {
1921 if (intra)
1922 {
1923 *((ULong*)rec) = *((ULong*)(rec + 4)) = 0;
1924 *((ULong*)(rec += lx)) = 0;
1925 *((ULong*)(rec + 4)) = 0;
1926 *((ULong*)(rec += lx)) = 0;
1927 *((ULong*)(rec + 4)) = 0;
1928 *((ULong*)(rec += lx)) = 0;
1929 *((ULong*)(rec + 4)) = 0;
1930 *((ULong*)(rec += lx)) = 0;
1931 *((ULong*)(rec + 4)) = 0;
1932 *((ULong*)(rec += lx)) = 0;
1933 *((ULong*)(rec + 4)) = 0;
1934 *((ULong*)(rec += lx)) = 0;
1935 *((ULong*)(rec + 4)) = 0;
1936 *((ULong*)(rec += lx)) = 0;
1937 *((ULong*)(rec + 4)) = 0;
1938 return ;
1939 }
1940 else /* copy from previous frame */
1941 {
1942 *((ULong*)rec) = *((ULong*)pred);
1943 *((ULong*)(rec + 4)) = *((ULong*)(pred + 4));
1944 *((ULong*)(rec += lx)) = *((ULong*)(pred += 16));
1945 *((ULong*)(rec + 4)) = *((ULong*)(pred + 4));
1946 *((ULong*)(rec += lx)) = *((ULong*)(pred += 16));
1947 *((ULong*)(rec + 4)) = *((ULong*)(pred + 4));
1948 *((ULong*)(rec += lx)) = *((ULong*)(pred += 16));
1949 *((ULong*)(rec + 4)) = *((ULong*)(pred + 4));
1950 *((ULong*)(rec += lx)) = *((ULong*)(pred += 16));
1951 *((ULong*)(rec + 4)) = *((ULong*)(pred + 4));
1952 *((ULong*)(rec += lx)) = *((ULong*)(pred += 16));
1953 *((ULong*)(rec + 4)) = *((ULong*)(pred + 4));
1954 *((ULong*)(rec += lx)) = *((ULong*)(pred += 16));
1955 *((ULong*)(rec + 4)) = *((ULong*)(pred + 4));
1956 *((ULong*)(rec += lx)) = *((ULong*)(pred += 16));
1957 *((ULong*)(rec + 4)) = *((ULong*)(pred + 4));
1958 return ;
1959 }
1960 }
1961
1962 /* Test for DC only block */
1963 if (dctMode == 1 || (bitmaprow == 0x80 && bitmapcol[0] == 0x80))
1964 {
1965 i = ((block[0] << 3) + 32) >> 6;
1966 block[0] = 0;
1967 if (intra)
1968 {
1969 if ((UInt)i > mask) i = mask & (~(i >> 31));
1970
1971 tmp = i | (i << 8);
1972 tmp |= (tmp << 16);
1973
1974 *((ULong*)rec) = *((ULong*)(rec + 4)) = tmp;
1975 *((ULong*)(rec += lx)) = tmp;
1976 *((ULong*)(rec + 4)) = tmp;
1977 *((ULong*)(rec += lx)) = tmp;
1978 *((ULong*)(rec + 4)) = tmp;
1979 *((ULong*)(rec += lx)) = tmp;
1980 *((ULong*)(rec + 4)) = tmp;
1981 *((ULong*)(rec += lx)) = tmp;
1982 *((ULong*)(rec + 4)) = tmp;
1983 *((ULong*)(rec += lx)) = tmp;
1984 *((ULong*)(rec + 4)) = tmp;
1985 *((ULong*)(rec += lx)) = tmp;
1986 *((ULong*)(rec + 4)) = tmp;
1987 *((ULong*)(rec += lx)) = tmp;
1988 *((ULong*)(rec + 4)) = tmp;
1989
1990 return ;
1991 }
1992 else
1993 {
1994 endcol = rec + (lx << 3);
1995 do
1996 {
1997 tmp4 = *((ULong*)pred);
1998 tmp2 = tmp4 & 0xFF;
1999 tmp2 += i;
2000 if ((UInt)tmp2 > mask) tmp2 = mask & (~(tmp2 >> 31));
2001 tmp = (tmp4 >> 8) & 0xFF;
2002 tmp += i;
2003 if ((UInt)tmp > mask) tmp = mask & (~(tmp >> 31));
2004 tmp2 |= (tmp << 8);
2005 tmp = (tmp4 >> 16) & 0xFF;
2006 tmp += i;
2007 if ((UInt)tmp > mask) tmp = mask & (~(tmp >> 31));
2008 tmp2 |= (tmp << 16);
2009 tmp = (tmp4 >> 24) & 0xFF;
2010 tmp += i;
2011 if ((UInt)tmp > mask) tmp = mask & (~(tmp >> 31));
2012 tmp2 |= (tmp << 24);
2013 *((ULong*)rec) = tmp2;
2014
2015 tmp4 = *((ULong*)(pred + 4));
2016 tmp2 = tmp4 & 0xFF;
2017 tmp2 += i;
2018 if ((UInt)tmp2 > mask) tmp2 = mask & (~(tmp2 >> 31));
2019 tmp = (tmp4 >> 8) & 0xFF;
2020 tmp += i;
2021 if ((UInt)tmp > mask) tmp = mask & (~(tmp >> 31));
2022 tmp2 |= (tmp << 8);
2023 tmp = (tmp4 >> 16) & 0xFF;
2024 tmp += i;
2025 if ((UInt)tmp > mask) tmp = mask & (~(tmp >> 31));
2026 tmp2 |= (tmp << 16);
2027 tmp = (tmp4 >> 24) & 0xFF;
2028 tmp += i;
2029 if ((UInt)tmp > mask) tmp = mask & (~(tmp >> 31));
2030 tmp2 |= (tmp << 24);
2031 *((ULong*)(rec + 4)) = tmp2;
2032
2033 rec += lx;
2034 pred += 16;
2035 }
2036 while (rec < endcol);
2037 return ;
2038 }
2039 }
2040
2041 for (i = 0; i < dctMode; i++)
2042 {
2043 bmap = (Int)bitmapcol[i];
2044 if (bmap)
2045 {
2046 if ((bmap&0xf) == 0)
2047 (*(idctcolVCA[bmap>>4]))(ptr);
2048 else
2049 idct_col(ptr);
2050 }
2051 ptr++;
2052 }
2053
2054 if ((bitmaprow&0xf) == 0)
2055 {
2056 if (intra)
2057 (*(idctrowVCAIntra[(Int)(bitmaprow>>4)]))(block, rec, lx);
2058 else
2059 (*(idctrowVCAzmv[(Int)(bitmaprow>>4)]))(block, rec, pred, lx);
2060 }
2061 else
2062 {
2063 if (intra)
2064 idct_rowIntra(block, rec, lx);
2065 else
2066 idct_rowzmv(block, rec, pred, lx);
2067 }
2068 }
2069