• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * This file is part of the Independent JPEG Group's software.
3  *
4  * The authors make NO WARRANTY or representation, either express or implied,
5  * with respect to this software, its quality, accuracy, merchantability, or
6  * fitness for a particular purpose.  This software is provided "AS IS", and
7  * you, its user, assume the entire risk as to its quality and accuracy.
8  *
9  * This software is copyright (C) 1991, 1992, Thomas G. Lane.
10  * All Rights Reserved except as specified below.
11  *
12  * Permission is hereby granted to use, copy, modify, and distribute this
13  * software (or portions thereof) for any purpose, without fee, subject to
14  * these conditions:
15  * (1) If any part of the source code for this software is distributed, then
16  * this README file must be included, with this copyright and no-warranty
17  * notice unaltered; and any additions, deletions, or changes to the original
18  * files must be clearly indicated in accompanying documentation.
19  * (2) If only executable code is distributed, then the accompanying
20  * documentation must state that "this software is based in part on the work
21  * of the Independent JPEG Group".
22  * (3) Permission for use of this software is granted only if the user accepts
23  * full responsibility for any undesirable consequences; the authors accept
24  * NO LIABILITY for damages of any kind.
25  *
26  * These conditions apply to any software derived from or based on the IJG
27  * code, not just to the unmodified library.  If you use our work, you ought
28  * to acknowledge us.
29  *
30  * Permission is NOT granted for the use of any IJG author's name or company
31  * name in advertising or publicity relating to this software or products
32  * derived from it.  This software may be referred to only as "the Independent
33  * JPEG Group's software".
34  *
35  * We specifically permit and encourage the use of this software as the basis
36  * of commercial products, provided that all warranty or liability claims are
37  * assumed by the product vendor.
38  *
39  * This file contains the basic inverse-DCT transformation subroutine.
40  *
41  * This implementation is based on an algorithm described in
42  *   C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT
43  *   Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics,
44  *   Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991.
45  * The primary algorithm described there uses 11 multiplies and 29 adds.
46  * We use their alternate method with 12 multiplies and 32 adds.
47  * The advantage of this method is that no data path contains more than one
48  * multiplication; this allows a very simple and accurate implementation in
49  * scaled fixed-point arithmetic, with a minimal number of shifts.
50  *
51  * I've made lots of modifications to attempt to take advantage of the
52  * sparse nature of the DCT matrices we're getting.  Although the logic
53  * is cumbersome, it's straightforward and the resulting code is much
54  * faster.
55  *
56  * A better way to do this would be to pass in the DCT block as a sparse
57  * matrix, perhaps with the different cases encoded.
58  */
59 
60 /**
61  * @file
62  * Independent JPEG Group's LLM idct.
63  */
64 
65 #include <stddef.h>
66 #include <stdint.h>
67 
68 #include "libavutil/intreadwrite.h"
69 
70 #include "dct.h"
71 #include "idctdsp.h"
72 
73 #define EIGHT_BIT_SAMPLES /* build for 8-bit samples: selects PASS1_BITS = 2 and permits the 16-bit MULTIPLY variants */
74 
75 #define DCTSIZE 8 /* width and height of a coefficient block; also the row stride inside a DCTBLOCK */
76 #define DCTSIZE2 64 /* coefficients per block (DCTSIZE * DCTSIZE) */
77 
78 #define GLOBAL /* expands to nothing; not referenced in the visible part of this file */
79 
80 #define RIGHT_SHIFT(x, n) ((x) >> (n)) /* plain >>; DESCALE assumes it rounds towards minus infinity for negative x */
81 
82 typedef int16_t DCTBLOCK[DCTSIZE2]; /* one block of 64 16-bit coefficients, rows DCTSIZE entries apart */
83 
84 #define CONST_BITS 13 /* fractional bits carried by the fixed-point multiplier constants */
85 
86 /*
87  * This routine is specialized to the case DCTSIZE = 8.
88  */
89 
90 #if DCTSIZE != 8
91   Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */
92 #endif
93 
94 
95 /*
96  * A 2-D IDCT can be done by 1-D IDCT on each row followed by 1-D IDCT
97  * on each column.  Direct algorithms are also available, but they are
98  * much more complex and seem not to be any faster when reduced to code.
99  *
100  * The poop on this scaling stuff is as follows:
101  *
102  * Each 1-D IDCT step produces outputs which are a factor of sqrt(N)
103  * larger than the true IDCT outputs.  The final outputs are therefore
104  * a factor of N larger than desired; since N=8 this can be cured by
105  * a simple right shift at the end of the algorithm.  The advantage of
106  * this arrangement is that we save two multiplications per 1-D IDCT,
107  * because the y0 and y4 inputs need not be divided by sqrt(N).
108  *
109  * We have to do addition and subtraction of the integer inputs, which
110  * is no problem, and multiplication by fractional constants, which is
111  * a problem to do in integer arithmetic.  We multiply all the constants
112  * by CONST_SCALE and convert them to integer constants (thus retaining
113  * CONST_BITS bits of precision in the constants).  After doing a
114  * multiplication we have to divide the product by CONST_SCALE, with proper
115  * rounding, to produce the correct output.  This division can be done
116  * cheaply as a right shift of CONST_BITS bits.  We postpone shifting
117  * as long as possible so that partial sums can be added together with
118  * full fractional precision.
119  *
120  * The outputs of the first pass are scaled up by PASS1_BITS bits so that
121  * they are represented to better-than-integral precision.  These outputs
122  * require BITS_IN_JSAMPLE + PASS1_BITS + 3 bits; this fits in a 16-bit word
123  * with the recommended scaling.  (To scale up 12-bit sample data further, an
124  * intermediate int32 array would be needed.)
125  *
126  * To avoid overflow of the 32-bit intermediate results in pass 2, we must
127  * have BITS_IN_JSAMPLE + CONST_BITS + PASS1_BITS <= 26.  Error analysis
128  * shows that the values given below are the most effective.
129  */
130 
131 #ifdef EIGHT_BIT_SAMPLES
132 #define PASS1_BITS  2   /* extra fractional bits kept in the pass-1 (row) results */
133 #else
134 #define PASS1_BITS  1   /* lose a little precision to avoid overflow */
135 #endif
136 
137 #define ONE         ((int32_t) 1) /* the value 1 typed as int32_t; used to build powers of two by shifting */
138 
139 #define CONST_SCALE (ONE << CONST_BITS) /* 2^CONST_BITS (8192 for CONST_BITS == 13): the fixed-point representation of 1.0 */
140 
141 /* Convert a positive real constant to an integer scaled by CONST_SCALE.
142  * IMPORTANT: if your compiler doesn't do this arithmetic at compile time,
143  * you will pay a significant penalty in run time.  In that case, figure
144  * the correct integer constant values and insert them by hand.
145  */
146 
147 /* FIX is no longer used: the FIX_* constants defined further down are its precomputed results for CONST_BITS == 13. */
148 #define FIX(x)  ((int32_t) ((x) * CONST_SCALE + 0.5)) /* the + 0.5 rounds to nearest before the truncating cast, which is why only positive constants are supported */
149 
150 /* Descale and correctly round an int32_t value that's scaled by N bits.
151  * We assume RIGHT_SHIFT rounds towards minus infinity, so adding
152  * the fudge factor is correct for either sign of X.
153  */
154 
155 #define DESCALE(x,n)  RIGHT_SHIFT((x) + (ONE << ((n)-1)), n) /* fudge factor is 2^(n-1), half a unit of the result; needs n >= 1, and n is expanded twice */
156 
157 /* Multiply an int32_t variable by an int32_t constant to yield an int32_t result.
158  * For 8-bit samples with the recommended scaling, all the variable
159  * and constant values involved are no more than 16 bits wide, so a
160  * 16x16->32 bit multiply can be used instead of a full 32x32 multiply;
161  * this provides a useful speedup on many machines.
162  * There is no way to specify a 16x16->32 multiply in portable C, but
163  * some C compilers will do the right thing if you provide the correct
164  * combination of casts.
165  * NB: for 12-bit samples, a full 32-bit multiplication will be needed.
166  */
167 
168 #ifdef EIGHT_BIT_SAMPLES        /* the 16-bit variants are only offered for 8-bit samples */
169 #ifdef SHORTxSHORT_32           /* may work if 'int' is 32 bits */
170 #define MULTIPLY(var,const)  (((int16_t) (var)) * ((int16_t) (const))) /* both operands are cast to int16_t before multiplying */
171 #endif
172 #ifdef SHORTxLCONST_32          /* known to work with Microsoft C 6.0 */
173 #define MULTIPLY(var,const)  (((int16_t) (var)) * ((int32_t) (const))) /* only the variable operand is cast to int16_t */
174 #endif
175 #endif
176 
177 #ifndef MULTIPLY                /* default definition: taken when neither SHORTx* switch selected a variant above */
178 #define MULTIPLY(var,const)  ((var) * (const)) /* multiplies the operands as given; 'const' is just a macro parameter name here */
179 #endif
180 
181 
182 /*
183   Precomputed FIX() results: each FIX_a_b is round(a.b * CONST_SCALE) for CONST_BITS == 13, so regenerate them if CONST_BITS changes.
184   Unlike our decoder, where the FIXes are approximated, exact values are needed here or successive P-frames will drift too much with reference-frame coding.
185 */
186 #define FIX_0_211164243 1730
187 #define FIX_0_275899380 2260
188 #define FIX_0_298631336 2446
189 #define FIX_0_390180644 3196
190 #define FIX_0_509795579 4176
191 #define FIX_0_541196100 4433
192 #define FIX_0_601344887 4926
193 #define FIX_0_765366865 6270
194 #define FIX_0_785694958 6436
195 #define FIX_0_899976223 7373
196 #define FIX_1_061594337 8697
197 #define FIX_1_111140466 9102
198 #define FIX_1_175875602 9633
199 #define FIX_1_306562965 10703
200 #define FIX_1_387039845 11363
201 #define FIX_1_451774981 11893
202 #define FIX_1_501321110 12299
203 #define FIX_1_662939225 13623
204 #define FIX_1_847759065 15137
205 #define FIX_1_961570560 16069
206 #define FIX_2_053119869 16819
207 #define FIX_2_172734803 17799
208 #define FIX_2_562915447 20995
209 #define FIX_3_072711026 25172
210 
211 /*
212  * Perform the inverse DCT on one block of coefficients.
213  */
214 
215 void ff_j_rev_dct(DCTBLOCK data)
216 {
217   int32_t tmp0, tmp1, tmp2, tmp3;
218   int32_t tmp10, tmp11, tmp12, tmp13;
219   int32_t z1, z2, z3, z4, z5;
220   int32_t d0, d1, d2, d3, d4, d5, d6, d7;
221   register int16_t *dataptr;
222   int rowctr;
223 
224   /* Pass 1: process rows. */
225   /* Note results are scaled up by sqrt(8) compared to a true IDCT; */
226   /* furthermore, we scale the results by 2**PASS1_BITS. */
227 
228   dataptr = data;
229 
230   for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--) {
231     /* Due to quantization, we will usually find that many of the input
232      * coefficients are zero, especially the AC terms.  We can exploit this
233      * by short-circuiting the IDCT calculation for any row in which all
234      * the AC terms are zero.  In that case each output is equal to the
235      * DC coefficient (with scale factor as needed).
236      * With typical images and quantization tables, half or more of the
237      * row DCT calculations can be simplified this way.
238      */
239 
240     register uint8_t *idataptr = (uint8_t*)dataptr;
241 
242     /* WARNING: we do the same permutation as MMX idct to simplify the
243        video core */
244     d0 = dataptr[0];
245     d2 = dataptr[1];
246     d4 = dataptr[2];
247     d6 = dataptr[3];
248     d1 = dataptr[4];
249     d3 = dataptr[5];
250     d5 = dataptr[6];
251     d7 = dataptr[7];
252 
253     if ((d1 | d2 | d3 | d4 | d5 | d6 | d7) == 0) {
254       /* AC terms all zero */
255       if (d0) {
256           /* Compute a 32 bit value to assign. */
257           int16_t dcval = (int16_t) (d0 * (1 << PASS1_BITS));
258           register int v = (dcval & 0xffff) | ((dcval * (1 << 16)) & 0xffff0000);
259 
260           AV_WN32A(&idataptr[ 0], v);
261           AV_WN32A(&idataptr[ 4], v);
262           AV_WN32A(&idataptr[ 8], v);
263           AV_WN32A(&idataptr[12], v);
264       }
265 
266       dataptr += DCTSIZE;       /* advance pointer to next row */
267       continue;
268     }
269 
270     /* Even part: reverse the even part of the forward DCT. */
271     /* The rotator is sqrt(2)*c(-6). */
272 {
273     if (d6) {
274             if (d2) {
275                     /* d0 != 0, d2 != 0, d4 != 0, d6 != 0 */
276                     z1 = MULTIPLY(d2 + d6, FIX_0_541196100);
277                     tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065);
278                     tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865);
279 
280                     tmp0 = (d0 + d4) * CONST_SCALE;
281                     tmp1 = (d0 - d4) * CONST_SCALE;
282 
283                     tmp10 = tmp0 + tmp3;
284                     tmp13 = tmp0 - tmp3;
285                     tmp11 = tmp1 + tmp2;
286                     tmp12 = tmp1 - tmp2;
287             } else {
288                     /* d0 != 0, d2 == 0, d4 != 0, d6 != 0 */
289                     tmp2 = MULTIPLY(-d6, FIX_1_306562965);
290                     tmp3 = MULTIPLY(d6, FIX_0_541196100);
291 
292                     tmp0 = (d0 + d4) * CONST_SCALE;
293                     tmp1 = (d0 - d4) * CONST_SCALE;
294 
295                     tmp10 = tmp0 + tmp3;
296                     tmp13 = tmp0 - tmp3;
297                     tmp11 = tmp1 + tmp2;
298                     tmp12 = tmp1 - tmp2;
299             }
300     } else {
301             if (d2) {
302                     /* d0 != 0, d2 != 0, d4 != 0, d6 == 0 */
303                     tmp2 = MULTIPLY(d2, FIX_0_541196100);
304                     tmp3 = MULTIPLY(d2, FIX_1_306562965);
305 
306                     tmp0 = (d0 + d4) * CONST_SCALE;
307                     tmp1 = (d0 - d4) * CONST_SCALE;
308 
309                     tmp10 = tmp0 + tmp3;
310                     tmp13 = tmp0 - tmp3;
311                     tmp11 = tmp1 + tmp2;
312                     tmp12 = tmp1 - tmp2;
313             } else {
314                     /* d0 != 0, d2 == 0, d4 != 0, d6 == 0 */
315                     tmp10 = tmp13 = (d0 + d4) * CONST_SCALE;
316                     tmp11 = tmp12 = (d0 - d4) * CONST_SCALE;
317             }
318       }
319 
320     /* Odd part per figure 8; the matrix is unitary and hence its
321      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
322      */
323 
324     if (d7) {
325         if (d5) {
326             if (d3) {
327                 if (d1) {
328                     /* d1 != 0, d3 != 0, d5 != 0, d7 != 0 */
329                     z1 = d7 + d1;
330                     z2 = d5 + d3;
331                     z3 = d7 + d3;
332                     z4 = d5 + d1;
333                     z5 = MULTIPLY(z3 + z4, FIX_1_175875602);
334 
335                     tmp0 = MULTIPLY(d7, FIX_0_298631336);
336                     tmp1 = MULTIPLY(d5, FIX_2_053119869);
337                     tmp2 = MULTIPLY(d3, FIX_3_072711026);
338                     tmp3 = MULTIPLY(d1, FIX_1_501321110);
339                     z1 = MULTIPLY(-z1, FIX_0_899976223);
340                     z2 = MULTIPLY(-z2, FIX_2_562915447);
341                     z3 = MULTIPLY(-z3, FIX_1_961570560);
342                     z4 = MULTIPLY(-z4, FIX_0_390180644);
343 
344                     z3 += z5;
345                     z4 += z5;
346 
347                     tmp0 += z1 + z3;
348                     tmp1 += z2 + z4;
349                     tmp2 += z2 + z3;
350                     tmp3 += z1 + z4;
351                 } else {
352                     /* d1 == 0, d3 != 0, d5 != 0, d7 != 0 */
353                     z2 = d5 + d3;
354                     z3 = d7 + d3;
355                     z5 = MULTIPLY(z3 + d5, FIX_1_175875602);
356 
357                     tmp0 = MULTIPLY(d7, FIX_0_298631336);
358                     tmp1 = MULTIPLY(d5, FIX_2_053119869);
359                     tmp2 = MULTIPLY(d3, FIX_3_072711026);
360                     z1 = MULTIPLY(-d7, FIX_0_899976223);
361                     z2 = MULTIPLY(-z2, FIX_2_562915447);
362                     z3 = MULTIPLY(-z3, FIX_1_961570560);
363                     z4 = MULTIPLY(-d5, FIX_0_390180644);
364 
365                     z3 += z5;
366                     z4 += z5;
367 
368                     tmp0 += z1 + z3;
369                     tmp1 += z2 + z4;
370                     tmp2 += z2 + z3;
371                     tmp3 = z1 + z4;
372                 }
373             } else {
374                 if (d1) {
375                     /* d1 != 0, d3 == 0, d5 != 0, d7 != 0 */
376                     z1 = d7 + d1;
377                     z4 = d5 + d1;
378                     z5 = MULTIPLY(d7 + z4, FIX_1_175875602);
379 
380                     tmp0 = MULTIPLY(d7, FIX_0_298631336);
381                     tmp1 = MULTIPLY(d5, FIX_2_053119869);
382                     tmp3 = MULTIPLY(d1, FIX_1_501321110);
383                     z1 = MULTIPLY(-z1, FIX_0_899976223);
384                     z2 = MULTIPLY(-d5, FIX_2_562915447);
385                     z3 = MULTIPLY(-d7, FIX_1_961570560);
386                     z4 = MULTIPLY(-z4, FIX_0_390180644);
387 
388                     z3 += z5;
389                     z4 += z5;
390 
391                     tmp0 += z1 + z3;
392                     tmp1 += z2 + z4;
393                     tmp2 = z2 + z3;
394                     tmp3 += z1 + z4;
395                 } else {
396                     /* d1 == 0, d3 == 0, d5 != 0, d7 != 0 */
397                     tmp0 = MULTIPLY(-d7, FIX_0_601344887);
398                     z1 = MULTIPLY(-d7, FIX_0_899976223);
399                     z3 = MULTIPLY(-d7, FIX_1_961570560);
400                     tmp1 = MULTIPLY(-d5, FIX_0_509795579);
401                     z2 = MULTIPLY(-d5, FIX_2_562915447);
402                     z4 = MULTIPLY(-d5, FIX_0_390180644);
403                     z5 = MULTIPLY(d5 + d7, FIX_1_175875602);
404 
405                     z3 += z5;
406                     z4 += z5;
407 
408                     tmp0 += z3;
409                     tmp1 += z4;
410                     tmp2 = z2 + z3;
411                     tmp3 = z1 + z4;
412                 }
413             }
414         } else {
415             if (d3) {
416                 if (d1) {
417                     /* d1 != 0, d3 != 0, d5 == 0, d7 != 0 */
418                     z1 = d7 + d1;
419                     z3 = d7 + d3;
420                     z5 = MULTIPLY(z3 + d1, FIX_1_175875602);
421 
422                     tmp0 = MULTIPLY(d7, FIX_0_298631336);
423                     tmp2 = MULTIPLY(d3, FIX_3_072711026);
424                     tmp3 = MULTIPLY(d1, FIX_1_501321110);
425                     z1 = MULTIPLY(-z1, FIX_0_899976223);
426                     z2 = MULTIPLY(-d3, FIX_2_562915447);
427                     z3 = MULTIPLY(-z3, FIX_1_961570560);
428                     z4 = MULTIPLY(-d1, FIX_0_390180644);
429 
430                     z3 += z5;
431                     z4 += z5;
432 
433                     tmp0 += z1 + z3;
434                     tmp1 = z2 + z4;
435                     tmp2 += z2 + z3;
436                     tmp3 += z1 + z4;
437                 } else {
438                     /* d1 == 0, d3 != 0, d5 == 0, d7 != 0 */
439                     z3 = d7 + d3;
440 
441                     tmp0 = MULTIPLY(-d7, FIX_0_601344887);
442                     z1 = MULTIPLY(-d7, FIX_0_899976223);
443                     tmp2 = MULTIPLY(d3, FIX_0_509795579);
444                     z2 = MULTIPLY(-d3, FIX_2_562915447);
445                     z5 = MULTIPLY(z3, FIX_1_175875602);
446                     z3 = MULTIPLY(-z3, FIX_0_785694958);
447 
448                     tmp0 += z3;
449                     tmp1 = z2 + z5;
450                     tmp2 += z3;
451                     tmp3 = z1 + z5;
452                 }
453             } else {
454                 if (d1) {
455                     /* d1 != 0, d3 == 0, d5 == 0, d7 != 0 */
456                     z1 = d7 + d1;
457                     z5 = MULTIPLY(z1, FIX_1_175875602);
458 
459                     z1 = MULTIPLY(z1, FIX_0_275899380);
460                     z3 = MULTIPLY(-d7, FIX_1_961570560);
461                     tmp0 = MULTIPLY(-d7, FIX_1_662939225);
462                     z4 = MULTIPLY(-d1, FIX_0_390180644);
463                     tmp3 = MULTIPLY(d1, FIX_1_111140466);
464 
465                     tmp0 += z1;
466                     tmp1 = z4 + z5;
467                     tmp2 = z3 + z5;
468                     tmp3 += z1;
469                 } else {
470                     /* d1 == 0, d3 == 0, d5 == 0, d7 != 0 */
471                     tmp0 = MULTIPLY(-d7, FIX_1_387039845);
472                     tmp1 = MULTIPLY(d7, FIX_1_175875602);
473                     tmp2 = MULTIPLY(-d7, FIX_0_785694958);
474                     tmp3 = MULTIPLY(d7, FIX_0_275899380);
475                 }
476             }
477         }
478     } else {
479         if (d5) {
480             if (d3) {
481                 if (d1) {
482                     /* d1 != 0, d3 != 0, d5 != 0, d7 == 0 */
483                     z2 = d5 + d3;
484                     z4 = d5 + d1;
485                     z5 = MULTIPLY(d3 + z4, FIX_1_175875602);
486 
487                     tmp1 = MULTIPLY(d5, FIX_2_053119869);
488                     tmp2 = MULTIPLY(d3, FIX_3_072711026);
489                     tmp3 = MULTIPLY(d1, FIX_1_501321110);
490                     z1 = MULTIPLY(-d1, FIX_0_899976223);
491                     z2 = MULTIPLY(-z2, FIX_2_562915447);
492                     z3 = MULTIPLY(-d3, FIX_1_961570560);
493                     z4 = MULTIPLY(-z4, FIX_0_390180644);
494 
495                     z3 += z5;
496                     z4 += z5;
497 
498                     tmp0 = z1 + z3;
499                     tmp1 += z2 + z4;
500                     tmp2 += z2 + z3;
501                     tmp3 += z1 + z4;
502                 } else {
503                     /* d1 == 0, d3 != 0, d5 != 0, d7 == 0 */
504                     z2 = d5 + d3;
505 
506                     z5 = MULTIPLY(z2, FIX_1_175875602);
507                     tmp1 = MULTIPLY(d5, FIX_1_662939225);
508                     z4 = MULTIPLY(-d5, FIX_0_390180644);
509                     z2 = MULTIPLY(-z2, FIX_1_387039845);
510                     tmp2 = MULTIPLY(d3, FIX_1_111140466);
511                     z3 = MULTIPLY(-d3, FIX_1_961570560);
512 
513                     tmp0 = z3 + z5;
514                     tmp1 += z2;
515                     tmp2 += z2;
516                     tmp3 = z4 + z5;
517                 }
518             } else {
519                 if (d1) {
520                     /* d1 != 0, d3 == 0, d5 != 0, d7 == 0 */
521                     z4 = d5 + d1;
522 
523                     z5 = MULTIPLY(z4, FIX_1_175875602);
524                     z1 = MULTIPLY(-d1, FIX_0_899976223);
525                     tmp3 = MULTIPLY(d1, FIX_0_601344887);
526                     tmp1 = MULTIPLY(-d5, FIX_0_509795579);
527                     z2 = MULTIPLY(-d5, FIX_2_562915447);
528                     z4 = MULTIPLY(z4, FIX_0_785694958);
529 
530                     tmp0 = z1 + z5;
531                     tmp1 += z4;
532                     tmp2 = z2 + z5;
533                     tmp3 += z4;
534                 } else {
535                     /* d1 == 0, d3 == 0, d5 != 0, d7 == 0 */
536                     tmp0 = MULTIPLY(d5, FIX_1_175875602);
537                     tmp1 = MULTIPLY(d5, FIX_0_275899380);
538                     tmp2 = MULTIPLY(-d5, FIX_1_387039845);
539                     tmp3 = MULTIPLY(d5, FIX_0_785694958);
540                 }
541             }
542         } else {
543             if (d3) {
544                 if (d1) {
545                     /* d1 != 0, d3 != 0, d5 == 0, d7 == 0 */
546                     z5 = d1 + d3;
547                     tmp3 = MULTIPLY(d1, FIX_0_211164243);
548                     tmp2 = MULTIPLY(-d3, FIX_1_451774981);
549                     z1 = MULTIPLY(d1, FIX_1_061594337);
550                     z2 = MULTIPLY(-d3, FIX_2_172734803);
551                     z4 = MULTIPLY(z5, FIX_0_785694958);
552                     z5 = MULTIPLY(z5, FIX_1_175875602);
553 
554                     tmp0 = z1 - z4;
555                     tmp1 = z2 + z4;
556                     tmp2 += z5;
557                     tmp3 += z5;
558                 } else {
559                     /* d1 == 0, d3 != 0, d5 == 0, d7 == 0 */
560                     tmp0 = MULTIPLY(-d3, FIX_0_785694958);
561                     tmp1 = MULTIPLY(-d3, FIX_1_387039845);
562                     tmp2 = MULTIPLY(-d3, FIX_0_275899380);
563                     tmp3 = MULTIPLY(d3, FIX_1_175875602);
564                 }
565             } else {
566                 if (d1) {
567                     /* d1 != 0, d3 == 0, d5 == 0, d7 == 0 */
568                     tmp0 = MULTIPLY(d1, FIX_0_275899380);
569                     tmp1 = MULTIPLY(d1, FIX_0_785694958);
570                     tmp2 = MULTIPLY(d1, FIX_1_175875602);
571                     tmp3 = MULTIPLY(d1, FIX_1_387039845);
572                 } else {
573                     /* d1 == 0, d3 == 0, d5 == 0, d7 == 0 */
574                     tmp0 = tmp1 = tmp2 = tmp3 = 0;
575                 }
576             }
577         }
578     }
579 }
580     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
581 
582     dataptr[0] = (int16_t) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
583     dataptr[7] = (int16_t) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
584     dataptr[1] = (int16_t) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
585     dataptr[6] = (int16_t) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
586     dataptr[2] = (int16_t) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
587     dataptr[5] = (int16_t) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
588     dataptr[3] = (int16_t) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
589     dataptr[4] = (int16_t) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
590 
591     dataptr += DCTSIZE;         /* advance pointer to next row */
592   }
593 
594   /* Pass 2: process columns. */
595   /* Note that we must descale the results by a factor of 8 == 2**3, */
596   /* and also undo the PASS1_BITS scaling. */
597 
598   dataptr = data;
599   for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--) {
600     /* Columns of zeroes can be exploited in the same way as we did with rows.
601      * However, the row calculation has created many nonzero AC terms, so the
602      * simplification applies less often (typically 5% to 10% of the time).
603      * On machines with very fast multiplication, it's possible that the
604      * test takes more time than it's worth.  In that case this section
605      * may be commented out.
606      */
607 
608     d0 = dataptr[DCTSIZE*0];
609     d1 = dataptr[DCTSIZE*1];
610     d2 = dataptr[DCTSIZE*2];
611     d3 = dataptr[DCTSIZE*3];
612     d4 = dataptr[DCTSIZE*4];
613     d5 = dataptr[DCTSIZE*5];
614     d6 = dataptr[DCTSIZE*6];
615     d7 = dataptr[DCTSIZE*7];
616 
617     /* Even part: reverse the even part of the forward DCT. */
618     /* The rotator is sqrt(2)*c(-6). */
619     if (d6) {
620             if (d2) {
621                     /* d0 != 0, d2 != 0, d4 != 0, d6 != 0 */
622                     z1 = MULTIPLY(d2 + d6, FIX_0_541196100);
623                     tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065);
624                     tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865);
625 
626                     tmp0 = (d0 + d4) * CONST_SCALE;
627                     tmp1 = (d0 - d4) * CONST_SCALE;
628 
629                     tmp10 = tmp0 + tmp3;
630                     tmp13 = tmp0 - tmp3;
631                     tmp11 = tmp1 + tmp2;
632                     tmp12 = tmp1 - tmp2;
633             } else {
634                     /* d0 != 0, d2 == 0, d4 != 0, d6 != 0 */
635                     tmp2 = MULTIPLY(-d6, FIX_1_306562965);
636                     tmp3 = MULTIPLY(d6, FIX_0_541196100);
637 
638                     tmp0 = (d0 + d4) * CONST_SCALE;
639                     tmp1 = (d0 - d4) * CONST_SCALE;
640 
641                     tmp10 = tmp0 + tmp3;
642                     tmp13 = tmp0 - tmp3;
643                     tmp11 = tmp1 + tmp2;
644                     tmp12 = tmp1 - tmp2;
645             }
646     } else {
647             if (d2) {
648                     /* d0 != 0, d2 != 0, d4 != 0, d6 == 0 */
649                     tmp2 = MULTIPLY(d2, FIX_0_541196100);
650                     tmp3 = MULTIPLY(d2, FIX_1_306562965);
651 
652                     tmp0 = (d0 + d4) * CONST_SCALE;
653                     tmp1 = (d0 - d4) * CONST_SCALE;
654 
655                     tmp10 = tmp0 + tmp3;
656                     tmp13 = tmp0 - tmp3;
657                     tmp11 = tmp1 + tmp2;
658                     tmp12 = tmp1 - tmp2;
659             } else {
660                     /* d0 != 0, d2 == 0, d4 != 0, d6 == 0 */
661                     tmp10 = tmp13 = (d0 + d4) * CONST_SCALE;
662                     tmp11 = tmp12 = (d0 - d4) * CONST_SCALE;
663             }
664     }
665 
666     /* Odd part per figure 8; the matrix is unitary and hence its
667      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
668      */
669     if (d7) {
670         if (d5) {
671             if (d3) {
672                 if (d1) {
673                     /* d1 != 0, d3 != 0, d5 != 0, d7 != 0 */
674                     z1 = d7 + d1;
675                     z2 = d5 + d3;
676                     z3 = d7 + d3;
677                     z4 = d5 + d1;
678                     z5 = MULTIPLY(z3 + z4, FIX_1_175875602);
679 
680                     tmp0 = MULTIPLY(d7, FIX_0_298631336);
681                     tmp1 = MULTIPLY(d5, FIX_2_053119869);
682                     tmp2 = MULTIPLY(d3, FIX_3_072711026);
683                     tmp3 = MULTIPLY(d1, FIX_1_501321110);
684                     z1 = MULTIPLY(-z1, FIX_0_899976223);
685                     z2 = MULTIPLY(-z2, FIX_2_562915447);
686                     z3 = MULTIPLY(-z3, FIX_1_961570560);
687                     z4 = MULTIPLY(-z4, FIX_0_390180644);
688 
689                     z3 += z5;
690                     z4 += z5;
691 
692                     tmp0 += z1 + z3;
693                     tmp1 += z2 + z4;
694                     tmp2 += z2 + z3;
695                     tmp3 += z1 + z4;
696                 } else {
697                     /* d1 == 0, d3 != 0, d5 != 0, d7 != 0 */
698                     z2 = d5 + d3;
699                     z3 = d7 + d3;
700                     z5 = MULTIPLY(z3 + d5, FIX_1_175875602);
701 
702                     tmp0 = MULTIPLY(d7, FIX_0_298631336);
703                     tmp1 = MULTIPLY(d5, FIX_2_053119869);
704                     tmp2 = MULTIPLY(d3, FIX_3_072711026);
705                     z1 = MULTIPLY(-d7, FIX_0_899976223);
706                     z2 = MULTIPLY(-z2, FIX_2_562915447);
707                     z3 = MULTIPLY(-z3, FIX_1_961570560);
708                     z4 = MULTIPLY(-d5, FIX_0_390180644);
709 
710                     z3 += z5;
711                     z4 += z5;
712 
713                     tmp0 += z1 + z3;
714                     tmp1 += z2 + z4;
715                     tmp2 += z2 + z3;
716                     tmp3 = z1 + z4;
717                 }
718             } else {
719                 if (d1) {
720                     /* d1 != 0, d3 == 0, d5 != 0, d7 != 0 */
721                     z1 = d7 + d1;
722                     z3 = d7;
723                     z4 = d5 + d1;
724                     z5 = MULTIPLY(z3 + z4, FIX_1_175875602);
725 
726                     tmp0 = MULTIPLY(d7, FIX_0_298631336);
727                     tmp1 = MULTIPLY(d5, FIX_2_053119869);
728                     tmp3 = MULTIPLY(d1, FIX_1_501321110);
729                     z1 = MULTIPLY(-z1, FIX_0_899976223);
730                     z2 = MULTIPLY(-d5, FIX_2_562915447);
731                     z3 = MULTIPLY(-d7, FIX_1_961570560);
732                     z4 = MULTIPLY(-z4, FIX_0_390180644);
733 
734                     z3 += z5;
735                     z4 += z5;
736 
737                     tmp0 += z1 + z3;
738                     tmp1 += z2 + z4;
739                     tmp2 = z2 + z3;
740                     tmp3 += z1 + z4;
741                 } else {
742                     /* d1 == 0, d3 == 0, d5 != 0, d7 != 0 */
743                     tmp0 = MULTIPLY(-d7, FIX_0_601344887);
744                     z1 = MULTIPLY(-d7, FIX_0_899976223);
745                     z3 = MULTIPLY(-d7, FIX_1_961570560);
746                     tmp1 = MULTIPLY(-d5, FIX_0_509795579);
747                     z2 = MULTIPLY(-d5, FIX_2_562915447);
748                     z4 = MULTIPLY(-d5, FIX_0_390180644);
749                     z5 = MULTIPLY(d5 + d7, FIX_1_175875602);
750 
751                     z3 += z5;
752                     z4 += z5;
753 
754                     tmp0 += z3;
755                     tmp1 += z4;
756                     tmp2 = z2 + z3;
757                     tmp3 = z1 + z4;
758                 }
759             }
760         } else {
761             if (d3) {
762                 if (d1) {
763                     /* d1 != 0, d3 != 0, d5 == 0, d7 != 0 */
764                     z1 = d7 + d1;
765                     z3 = d7 + d3;
766                     z5 = MULTIPLY(z3 + d1, FIX_1_175875602);
767 
768                     tmp0 = MULTIPLY(d7, FIX_0_298631336);
769                     tmp2 = MULTIPLY(d3, FIX_3_072711026);
770                     tmp3 = MULTIPLY(d1, FIX_1_501321110);
771                     z1 = MULTIPLY(-z1, FIX_0_899976223);
772                     z2 = MULTIPLY(-d3, FIX_2_562915447);
773                     z3 = MULTIPLY(-z3, FIX_1_961570560);
774                     z4 = MULTIPLY(-d1, FIX_0_390180644);
775 
776                     z3 += z5;
777                     z4 += z5;
778 
779                     tmp0 += z1 + z3;
780                     tmp1 = z2 + z4;
781                     tmp2 += z2 + z3;
782                     tmp3 += z1 + z4;
783                 } else {
784                     /* d1 == 0, d3 != 0, d5 == 0, d7 != 0 */
785                     z3 = d7 + d3;
786 
787                     tmp0 = MULTIPLY(-d7, FIX_0_601344887);
788                     z1 = MULTIPLY(-d7, FIX_0_899976223);
789                     tmp2 = MULTIPLY(d3, FIX_0_509795579);
790                     z2 = MULTIPLY(-d3, FIX_2_562915447);
791                     z5 = MULTIPLY(z3, FIX_1_175875602);
792                     z3 = MULTIPLY(-z3, FIX_0_785694958);
793 
794                     tmp0 += z3;
795                     tmp1 = z2 + z5;
796                     tmp2 += z3;
797                     tmp3 = z1 + z5;
798                 }
799             } else {
800                 if (d1) {
801                     /* d1 != 0, d3 == 0, d5 == 0, d7 != 0 */
802                     z1 = d7 + d1;
803                     z5 = MULTIPLY(z1, FIX_1_175875602);
804 
805                     z1 = MULTIPLY(z1, FIX_0_275899380);
806                     z3 = MULTIPLY(-d7, FIX_1_961570560);
807                     tmp0 = MULTIPLY(-d7, FIX_1_662939225);
808                     z4 = MULTIPLY(-d1, FIX_0_390180644);
809                     tmp3 = MULTIPLY(d1, FIX_1_111140466);
810 
811                     tmp0 += z1;
812                     tmp1 = z4 + z5;
813                     tmp2 = z3 + z5;
814                     tmp3 += z1;
815                 } else {
816                     /* d1 == 0, d3 == 0, d5 == 0, d7 != 0 */
817                     tmp0 = MULTIPLY(-d7, FIX_1_387039845);
818                     tmp1 = MULTIPLY(d7, FIX_1_175875602);
819                     tmp2 = MULTIPLY(-d7, FIX_0_785694958);
820                     tmp3 = MULTIPLY(d7, FIX_0_275899380);
821                 }
822             }
823         }
824     } else {
825         if (d5) {
826             if (d3) {
827                 if (d1) {
828                     /* d1 != 0, d3 != 0, d5 != 0, d7 == 0 */
829                     z2 = d5 + d3;
830                     z4 = d5 + d1;
831                     z5 = MULTIPLY(d3 + z4, FIX_1_175875602);
832 
833                     tmp1 = MULTIPLY(d5, FIX_2_053119869);
834                     tmp2 = MULTIPLY(d3, FIX_3_072711026);
835                     tmp3 = MULTIPLY(d1, FIX_1_501321110);
836                     z1 = MULTIPLY(-d1, FIX_0_899976223);
837                     z2 = MULTIPLY(-z2, FIX_2_562915447);
838                     z3 = MULTIPLY(-d3, FIX_1_961570560);
839                     z4 = MULTIPLY(-z4, FIX_0_390180644);
840 
841                     z3 += z5;
842                     z4 += z5;
843 
844                     tmp0 = z1 + z3;
845                     tmp1 += z2 + z4;
846                     tmp2 += z2 + z3;
847                     tmp3 += z1 + z4;
848                 } else {
849                     /* d1 == 0, d3 != 0, d5 != 0, d7 == 0 */
850                     z2 = d5 + d3;
851 
852                     z5 = MULTIPLY(z2, FIX_1_175875602);
853                     tmp1 = MULTIPLY(d5, FIX_1_662939225);
854                     z4 = MULTIPLY(-d5, FIX_0_390180644);
855                     z2 = MULTIPLY(-z2, FIX_1_387039845);
856                     tmp2 = MULTIPLY(d3, FIX_1_111140466);
857                     z3 = MULTIPLY(-d3, FIX_1_961570560);
858 
859                     tmp0 = z3 + z5;
860                     tmp1 += z2;
861                     tmp2 += z2;
862                     tmp3 = z4 + z5;
863                 }
864             } else {
865                 if (d1) {
866                     /* d1 != 0, d3 == 0, d5 != 0, d7 == 0 */
867                     z4 = d5 + d1;
868 
869                     z5 = MULTIPLY(z4, FIX_1_175875602);
870                     z1 = MULTIPLY(-d1, FIX_0_899976223);
871                     tmp3 = MULTIPLY(d1, FIX_0_601344887);
872                     tmp1 = MULTIPLY(-d5, FIX_0_509795579);
873                     z2 = MULTIPLY(-d5, FIX_2_562915447);
874                     z4 = MULTIPLY(z4, FIX_0_785694958);
875 
876                     tmp0 = z1 + z5;
877                     tmp1 += z4;
878                     tmp2 = z2 + z5;
879                     tmp3 += z4;
880                 } else {
881                     /* d1 == 0, d3 == 0, d5 != 0, d7 == 0 */
882                     tmp0 = MULTIPLY(d5, FIX_1_175875602);
883                     tmp1 = MULTIPLY(d5, FIX_0_275899380);
884                     tmp2 = MULTIPLY(-d5, FIX_1_387039845);
885                     tmp3 = MULTIPLY(d5, FIX_0_785694958);
886                 }
887             }
888         } else {
889             if (d3) {
890                 if (d1) {
891                     /* d1 != 0, d3 != 0, d5 == 0, d7 == 0 */
892                     z5 = d1 + d3;
893                     tmp3 = MULTIPLY(d1, FIX_0_211164243);
894                     tmp2 = MULTIPLY(-d3, FIX_1_451774981);
895                     z1 = MULTIPLY(d1, FIX_1_061594337);
896                     z2 = MULTIPLY(-d3, FIX_2_172734803);
897                     z4 = MULTIPLY(z5, FIX_0_785694958);
898                     z5 = MULTIPLY(z5, FIX_1_175875602);
899 
900                     tmp0 = z1 - z4;
901                     tmp1 = z2 + z4;
902                     tmp2 += z5;
903                     tmp3 += z5;
904                 } else {
905                     /* d1 == 0, d3 != 0, d5 == 0, d7 == 0 */
906                     tmp0 = MULTIPLY(-d3, FIX_0_785694958);
907                     tmp1 = MULTIPLY(-d3, FIX_1_387039845);
908                     tmp2 = MULTIPLY(-d3, FIX_0_275899380);
909                     tmp3 = MULTIPLY(d3, FIX_1_175875602);
910                 }
911             } else {
912                 if (d1) {
913                     /* d1 != 0, d3 == 0, d5 == 0, d7 == 0 */
914                     tmp0 = MULTIPLY(d1, FIX_0_275899380);
915                     tmp1 = MULTIPLY(d1, FIX_0_785694958);
916                     tmp2 = MULTIPLY(d1, FIX_1_175875602);
917                     tmp3 = MULTIPLY(d1, FIX_1_387039845);
918                 } else {
919                     /* d1 == 0, d3 == 0, d5 == 0, d7 == 0 */
920                     tmp0 = tmp1 = tmp2 = tmp3 = 0;
921                 }
922             }
923         }
924     }
925 
926     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
927 
928     dataptr[DCTSIZE*0] = (int16_t) DESCALE(tmp10 + tmp3,
929                                            CONST_BITS+PASS1_BITS+3);
930     dataptr[DCTSIZE*7] = (int16_t) DESCALE(tmp10 - tmp3,
931                                            CONST_BITS+PASS1_BITS+3);
932     dataptr[DCTSIZE*1] = (int16_t) DESCALE(tmp11 + tmp2,
933                                            CONST_BITS+PASS1_BITS+3);
934     dataptr[DCTSIZE*6] = (int16_t) DESCALE(tmp11 - tmp2,
935                                            CONST_BITS+PASS1_BITS+3);
936     dataptr[DCTSIZE*2] = (int16_t) DESCALE(tmp12 + tmp1,
937                                            CONST_BITS+PASS1_BITS+3);
938     dataptr[DCTSIZE*5] = (int16_t) DESCALE(tmp12 - tmp1,
939                                            CONST_BITS+PASS1_BITS+3);
940     dataptr[DCTSIZE*3] = (int16_t) DESCALE(tmp13 + tmp0,
941                                            CONST_BITS+PASS1_BITS+3);
942     dataptr[DCTSIZE*4] = (int16_t) DESCALE(tmp13 - tmp0,
943                                            CONST_BITS+PASS1_BITS+3);
944 
945     dataptr++;                  /* advance pointer to next column */
946   }
947 }
948 
949 #undef DCTSIZE
950 #define DCTSIZE 4
951 #define DCTSTRIDE 8
952 
ff_j_rev_dct4(DCTBLOCK data)953 void ff_j_rev_dct4(DCTBLOCK data)
954 {
955   int32_t tmp0, tmp1, tmp2, tmp3;
956   int32_t tmp10, tmp11, tmp12, tmp13;
957   int32_t z1;
958   int32_t d0, d2, d4, d6;
959   register int16_t *dataptr;
960   int rowctr;
961 
962   /* Pass 1: process rows. */
963   /* Note results are scaled up by sqrt(8) compared to a true IDCT; */
964   /* furthermore, we scale the results by 2**PASS1_BITS. */
965 
966   data[0] += 4;
967 
968   dataptr = data;
969 
970   for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--) {
971     /* Due to quantization, we will usually find that many of the input
972      * coefficients are zero, especially the AC terms.  We can exploit this
973      * by short-circuiting the IDCT calculation for any row in which all
974      * the AC terms are zero.  In that case each output is equal to the
975      * DC coefficient (with scale factor as needed).
976      * With typical images and quantization tables, half or more of the
977      * row DCT calculations can be simplified this way.
978      */
979 
980     register uint8_t *idataptr = (uint8_t*)dataptr;
981 
982     d0 = dataptr[0];
983     d2 = dataptr[1];
984     d4 = dataptr[2];
985     d6 = dataptr[3];
986 
987     if ((d2 | d4 | d6) == 0) {
988       /* AC terms all zero */
989       if (d0) {
990           /* Compute a 32 bit value to assign. */
991           int16_t dcval = (int16_t) (d0 << PASS1_BITS);
992           register int v = (dcval & 0xffff) | ((dcval << 16) & 0xffff0000);
993 
994           AV_WN32A(&idataptr[0], v);
995           AV_WN32A(&idataptr[4], v);
996       }
997 
998       dataptr += DCTSTRIDE;     /* advance pointer to next row */
999       continue;
1000     }
1001 
1002     /* Even part: reverse the even part of the forward DCT. */
1003     /* The rotator is sqrt(2)*c(-6). */
1004     if (d6) {
1005             if (d2) {
1006                     /* d0 != 0, d2 != 0, d4 != 0, d6 != 0 */
1007                     z1 = MULTIPLY(d2 + d6, FIX_0_541196100);
1008                     tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065);
1009                     tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865);
1010 
1011                     tmp0 = (d0 + d4) << CONST_BITS;
1012                     tmp1 = (d0 - d4) << CONST_BITS;
1013 
1014                     tmp10 = tmp0 + tmp3;
1015                     tmp13 = tmp0 - tmp3;
1016                     tmp11 = tmp1 + tmp2;
1017                     tmp12 = tmp1 - tmp2;
1018             } else {
1019                     /* d0 != 0, d2 == 0, d4 != 0, d6 != 0 */
1020                     tmp2 = MULTIPLY(-d6, FIX_1_306562965);
1021                     tmp3 = MULTIPLY(d6, FIX_0_541196100);
1022 
1023                     tmp0 = (d0 + d4) << CONST_BITS;
1024                     tmp1 = (d0 - d4) << CONST_BITS;
1025 
1026                     tmp10 = tmp0 + tmp3;
1027                     tmp13 = tmp0 - tmp3;
1028                     tmp11 = tmp1 + tmp2;
1029                     tmp12 = tmp1 - tmp2;
1030             }
1031     } else {
1032             if (d2) {
1033                     /* d0 != 0, d2 != 0, d4 != 0, d6 == 0 */
1034                     tmp2 = MULTIPLY(d2, FIX_0_541196100);
1035                     tmp3 = MULTIPLY(d2, FIX_1_306562965);
1036 
1037                     tmp0 = (d0 + d4) << CONST_BITS;
1038                     tmp1 = (d0 - d4) << CONST_BITS;
1039 
1040                     tmp10 = tmp0 + tmp3;
1041                     tmp13 = tmp0 - tmp3;
1042                     tmp11 = tmp1 + tmp2;
1043                     tmp12 = tmp1 - tmp2;
1044             } else {
1045                     /* d0 != 0, d2 == 0, d4 != 0, d6 == 0 */
1046                     tmp10 = tmp13 = (d0 + d4) << CONST_BITS;
1047                     tmp11 = tmp12 = (d0 - d4) << CONST_BITS;
1048             }
1049       }
1050 
1051     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
1052 
1053     dataptr[0] = (int16_t) DESCALE(tmp10, CONST_BITS-PASS1_BITS);
1054     dataptr[1] = (int16_t) DESCALE(tmp11, CONST_BITS-PASS1_BITS);
1055     dataptr[2] = (int16_t) DESCALE(tmp12, CONST_BITS-PASS1_BITS);
1056     dataptr[3] = (int16_t) DESCALE(tmp13, CONST_BITS-PASS1_BITS);
1057 
1058     dataptr += DCTSTRIDE;       /* advance pointer to next row */
1059   }
1060 
1061   /* Pass 2: process columns. */
1062   /* Note that we must descale the results by a factor of 8 == 2**3, */
1063   /* and also undo the PASS1_BITS scaling. */
1064 
1065   dataptr = data;
1066   for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--) {
1067     /* Columns of zeroes can be exploited in the same way as we did with rows.
1068      * However, the row calculation has created many nonzero AC terms, so the
1069      * simplification applies less often (typically 5% to 10% of the time).
1070      * On machines with very fast multiplication, it's possible that the
1071      * test takes more time than it's worth.  In that case this section
1072      * may be commented out.
1073      */
1074 
1075     d0 = dataptr[DCTSTRIDE*0];
1076     d2 = dataptr[DCTSTRIDE*1];
1077     d4 = dataptr[DCTSTRIDE*2];
1078     d6 = dataptr[DCTSTRIDE*3];
1079 
1080     /* Even part: reverse the even part of the forward DCT. */
1081     /* The rotator is sqrt(2)*c(-6). */
1082     if (d6) {
1083             if (d2) {
1084                     /* d0 != 0, d2 != 0, d4 != 0, d6 != 0 */
1085                     z1 = MULTIPLY(d2 + d6, FIX_0_541196100);
1086                     tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065);
1087                     tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865);
1088 
1089                     tmp0 = (d0 + d4) << CONST_BITS;
1090                     tmp1 = (d0 - d4) << CONST_BITS;
1091 
1092                     tmp10 = tmp0 + tmp3;
1093                     tmp13 = tmp0 - tmp3;
1094                     tmp11 = tmp1 + tmp2;
1095                     tmp12 = tmp1 - tmp2;
1096             } else {
1097                     /* d0 != 0, d2 == 0, d4 != 0, d6 != 0 */
1098                     tmp2 = MULTIPLY(-d6, FIX_1_306562965);
1099                     tmp3 = MULTIPLY(d6, FIX_0_541196100);
1100 
1101                     tmp0 = (d0 + d4) << CONST_BITS;
1102                     tmp1 = (d0 - d4) << CONST_BITS;
1103 
1104                     tmp10 = tmp0 + tmp3;
1105                     tmp13 = tmp0 - tmp3;
1106                     tmp11 = tmp1 + tmp2;
1107                     tmp12 = tmp1 - tmp2;
1108             }
1109     } else {
1110             if (d2) {
1111                     /* d0 != 0, d2 != 0, d4 != 0, d6 == 0 */
1112                     tmp2 = MULTIPLY(d2, FIX_0_541196100);
1113                     tmp3 = MULTIPLY(d2, FIX_1_306562965);
1114 
1115                     tmp0 = (d0 + d4) << CONST_BITS;
1116                     tmp1 = (d0 - d4) << CONST_BITS;
1117 
1118                     tmp10 = tmp0 + tmp3;
1119                     tmp13 = tmp0 - tmp3;
1120                     tmp11 = tmp1 + tmp2;
1121                     tmp12 = tmp1 - tmp2;
1122             } else {
1123                     /* d0 != 0, d2 == 0, d4 != 0, d6 == 0 */
1124                     tmp10 = tmp13 = (d0 + d4) << CONST_BITS;
1125                     tmp11 = tmp12 = (d0 - d4) << CONST_BITS;
1126             }
1127     }
1128 
1129     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
1130 
1131     dataptr[DCTSTRIDE*0] = tmp10 >> (CONST_BITS+PASS1_BITS+3);
1132     dataptr[DCTSTRIDE*1] = tmp11 >> (CONST_BITS+PASS1_BITS+3);
1133     dataptr[DCTSTRIDE*2] = tmp12 >> (CONST_BITS+PASS1_BITS+3);
1134     dataptr[DCTSTRIDE*3] = tmp13 >> (CONST_BITS+PASS1_BITS+3);
1135 
1136     dataptr++;                  /* advance pointer to next column */
1137   }
1138 }
1139 
/*
 * In-place 2x2 inverse transform.
 *
 * data: coefficient block.  Only entries 0 and 1 of the first two rows
 *       are read and written; the second row starts DCTSTRIDE entries
 *       after the first.
 *
 * The DC term is biased by 4 first, then each output is the sum or
 * difference of the row sums/differences, shifted right by 3.
 */
void ff_j_rev_dct2(DCTBLOCK data)
{
    int16_t *const top = data;
    int16_t *const bot = data + DCTSTRIDE;
    int top_sum, top_dif, bot_sum, bot_dif;

    top[0] += 4;

    /* Horizontal step: sum and difference within each row */
    top_sum = top[0] + top[1];
    top_dif = top[0] - top[1];
    bot_sum = bot[0] + bot[1];
    bot_dif = bot[0] - bot[1];

    /* Vertical step: combine the two rows and drop the scale factor */
    top[0] = (top_sum + bot_sum) >> 3;
    top[1] = (top_dif + bot_dif) >> 3;
    bot[0] = (top_sum - bot_sum) >> 3;
    bot[1] = (top_dif - bot_dif) >> 3;
}
1154 
/*
 * In-place 1x1 inverse transform: only data[0] is touched.  It is
 * replaced by (data[0] + 4) >> 3.
 */
void ff_j_rev_dct1(DCTBLOCK data)
{
    const int dc = data[0];

    data[0] = (dc + 4) >> 3;
}
1158 
1159 #undef FIX
1160 #undef CONST_BITS
1161 
/*
 * Run ff_j_rev_dct() on block in place, then hand the transformed block
 * to ff_put_pixels_clamped_c() together with dest and line_size.
 *
 * dest:      destination passed through to ff_put_pixels_clamped_c()
 * line_size: passed through unchanged
 *            (NOTE(review): presumably the byte stride between
 *            destination rows -- confirm against the helper)
 * block:     coefficient block; overwritten by the inverse transform
 */
void ff_jref_idct_put(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
{
    /* Transform first: the pixel helper consumes the transformed block. */
    ff_j_rev_dct(block);

    ff_put_pixels_clamped_c(block, dest, line_size);
}
1167 
/*
 * Run ff_j_rev_dct() on block in place, then hand the transformed block
 * to ff_add_pixels_clamped_c() together with dest and line_size.
 *
 * dest:      destination passed through to ff_add_pixels_clamped_c()
 * line_size: passed through unchanged
 *            (NOTE(review): presumably the byte stride between
 *            destination rows -- confirm against the helper)
 * block:     coefficient block; overwritten by the inverse transform
 */
void ff_jref_idct_add(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
{
    /* Transform first: the pixel helper consumes the transformed block. */
    ff_j_rev_dct(block);

    ff_add_pixels_clamped_c(block, dest, line_size);
}
1173