• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * jidctint.c
3  *
4  * This file was part of the Independent JPEG Group's software:
5  * Copyright (C) 1991-1998, Thomas G. Lane.
6  * Modification developed 2002-2018 by Guido Vollbeding.
7  * libjpeg-turbo Modifications:
8  * Copyright (C) 2015, 2020, D. R. Commander.
9  * For conditions of distribution and use, see the accompanying README.ijg
10  * file.
11  *
12  * This file contains a slower but more accurate integer implementation of the
13  * inverse DCT (Discrete Cosine Transform).  In the IJG code, this routine
14  * must also perform dequantization of the input coefficients.
15  *
16  * A 2-D IDCT can be done by 1-D IDCT on each column followed by 1-D IDCT
17  * on each row (or vice versa, but it's more convenient to emit a row at
18  * a time).  Direct algorithms are also available, but they are much more
19  * complex and seem not to be any faster when reduced to code.
20  *
21  * This implementation is based on an algorithm described in
22  *   C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT
23  *   Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics,
24  *   Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991.
25  * The primary algorithm described there uses 11 multiplies and 29 adds.
26  * We use their alternate method with 12 multiplies and 32 adds.
27  * The advantage of this method is that no data path contains more than one
28  * multiplication; this allows a very simple and accurate implementation in
29  * scaled fixed-point arithmetic, with a minimal number of shifts.
30  *
31  * We also provide IDCT routines with various output sample block sizes for
32  * direct resolution reduction or enlargement without additional resampling:
33  * NxN (N=1...16) pixels for one 8x8 input DCT block.
34  *
35  * For N<8 we simply take the corresponding low-frequency coefficients of
36  * the 8x8 input DCT block and apply an NxN point IDCT on the sub-block
37  * to yield the downscaled outputs.
38  * This can be seen as direct low-pass downsampling from the DCT domain
39  * point of view rather than the usual spatial domain point of view,
40  * yielding significant computational savings and results at least
41  * as good as common bilinear (averaging) spatial downsampling.
42  *
43  * For N>8 we apply a partial NxN IDCT on the 8 input coefficients as
44  * lower frequencies and higher frequencies assumed to be zero.
45  * It turns out that the computational effort is similar to the 8x8 IDCT
46  * regarding the output size.
47  * Furthermore, the scaling and descaling is the same for all IDCT sizes.
48  *
49  * CAUTION: We rely on the FIX() macro except for the N=1,2,4,8 cases
50  * since there would be too many additional constants to pre-calculate.
51  */
52 
53 #define JPEG_INTERNALS
54 #include "jinclude.h"
55 #include "jpeglib.h"
56 #include "jdct.h"               /* Private declarations for DCT subsystem */
57 
58 #ifdef DCT_ISLOW_SUPPORTED
59 
60 
61 /*
62  * This module is specialized to the case DCTSIZE = 8.
63  */
64 
65 #if DCTSIZE != 8
66   Sorry, this code only copes with 8x8 DCT blocks. /* deliberate syntax err */
67 #endif
68 
69 
70 /*
71  * The poop on this scaling stuff is as follows:
72  *
73  * Each 1-D IDCT step produces outputs which are a factor of sqrt(N)
74  * larger than the true IDCT outputs.  The final outputs are therefore
75  * a factor of N larger than desired; since N=8 this can be cured by
76  * a simple right shift at the end of the algorithm.  The advantage of
77  * this arrangement is that we save two multiplications per 1-D IDCT,
78  * because the y0 and y4 inputs need not be divided by sqrt(N).
79  *
80  * We have to do addition and subtraction of the integer inputs, which
81  * is no problem, and multiplication by fractional constants, which is
82  * a problem to do in integer arithmetic.  We multiply all the constants
83  * by CONST_SCALE and convert them to integer constants (thus retaining
84  * CONST_BITS bits of precision in the constants).  After doing a
85  * multiplication we have to divide the product by CONST_SCALE, with proper
86  * rounding, to produce the correct output.  This division can be done
87  * cheaply as a right shift of CONST_BITS bits.  We postpone shifting
88  * as long as possible so that partial sums can be added together with
89  * full fractional precision.
90  *
91  * The outputs of the first pass are scaled up by PASS1_BITS bits so that
92  * they are represented to better-than-integral precision.  These outputs
93  * require BITS_IN_JSAMPLE + PASS1_BITS + 3 bits; this fits in a 16-bit word
94  * with the recommended scaling.  (To scale up 12-bit sample data further, an
95  * intermediate JLONG array would be needed.)
96  *
97  * To avoid overflow of the 32-bit intermediate results in pass 2, we must
98  * have BITS_IN_JSAMPLE + CONST_BITS + PASS1_BITS <= 26.  Error analysis
99  * shows that the values given below are the most effective.
100  */
101 
102 #if BITS_IN_JSAMPLE == 8
103 #define CONST_BITS  13
104 #define PASS1_BITS  2
105 #else
106 #define CONST_BITS  13
107 #define PASS1_BITS  1           /* lose a little precision to avoid overflow */
108 #endif
109 
110 /* Some C compilers fail to reduce "FIX(constant)" at compile time, thus
111  * causing a lot of useless floating-point operations at run time.
112  * To get around this we use the following pre-calculated constants.
113  * If you change CONST_BITS you may want to add appropriate values.
114  * (With a reasonable C compiler, you can just rely on the FIX() macro...)
115  */
116 
117 #if CONST_BITS == 13
118 #define FIX_0_298631336  ((JLONG)2446)          /* FIX(0.298631336) */
119 #define FIX_0_390180644  ((JLONG)3196)          /* FIX(0.390180644) */
120 #define FIX_0_541196100  ((JLONG)4433)          /* FIX(0.541196100) */
121 #define FIX_0_765366865  ((JLONG)6270)          /* FIX(0.765366865) */
122 #define FIX_0_899976223  ((JLONG)7373)          /* FIX(0.899976223) */
123 #define FIX_1_175875602  ((JLONG)9633)          /* FIX(1.175875602) */
124 #define FIX_1_501321110  ((JLONG)12299)         /* FIX(1.501321110) */
125 #define FIX_1_847759065  ((JLONG)15137)         /* FIX(1.847759065) */
126 #define FIX_1_961570560  ((JLONG)16069)         /* FIX(1.961570560) */
127 #define FIX_2_053119869  ((JLONG)16819)         /* FIX(2.053119869) */
128 #define FIX_2_562915447  ((JLONG)20995)         /* FIX(2.562915447) */
129 #define FIX_3_072711026  ((JLONG)25172)         /* FIX(3.072711026) */
130 #else
131 #define FIX_0_298631336  FIX(0.298631336)
132 #define FIX_0_390180644  FIX(0.390180644)
133 #define FIX_0_541196100  FIX(0.541196100)
134 #define FIX_0_765366865  FIX(0.765366865)
135 #define FIX_0_899976223  FIX(0.899976223)
136 #define FIX_1_175875602  FIX(1.175875602)
137 #define FIX_1_501321110  FIX(1.501321110)
138 #define FIX_1_847759065  FIX(1.847759065)
139 #define FIX_1_961570560  FIX(1.961570560)
140 #define FIX_2_053119869  FIX(2.053119869)
141 #define FIX_2_562915447  FIX(2.562915447)
142 #define FIX_3_072711026  FIX(3.072711026)
143 #endif
144 
145 
146 /* Multiply an JLONG variable by an JLONG constant to yield an JLONG result.
147  * For 8-bit samples with the recommended scaling, all the variable
148  * and constant values involved are no more than 16 bits wide, so a
149  * 16x16->32 bit multiply can be used instead of a full 32x32 multiply.
150  * For 12-bit samples, a full 32-bit multiplication will be needed.
151  */
152 
153 #if BITS_IN_JSAMPLE == 8
154 #define MULTIPLY(var, const)  MULTIPLY16C16(var, const)
155 #else
156 #define MULTIPLY(var, const)  ((var) * (const))
157 #endif
158 
159 
160 /* Dequantize a coefficient by multiplying it by the multiplier-table
161  * entry; produce an int result.  In this module, both inputs and result
162  * are 16 bits or less, so either int or short multiply will work.
163  */
164 
165 #define DEQUANTIZE(coef, quantval)  (((ISLOW_MULT_TYPE)(coef)) * (quantval))
166 
167 
168 /*
169  * Perform dequantization and inverse DCT on one block of coefficients.
170  */
171 
172 GLOBAL(void)
173 jpeg_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
174                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
175                 JDIMENSION output_col)
176 {
177   JLONG tmp0, tmp1, tmp2, tmp3;
178   JLONG tmp10, tmp11, tmp12, tmp13;
179   JLONG z1, z2, z3, z4, z5;
180   JCOEFPTR inptr;
181   ISLOW_MULT_TYPE *quantptr;
182   int *wsptr;
183   JSAMPROW outptr;
184   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
185   int ctr;
186   int workspace[DCTSIZE2];      /* buffers data between passes */
187   SHIFT_TEMPS
188 
189   /* Pass 1: process columns from input, store into work array. */
190   /* Note results are scaled up by sqrt(8) compared to a true IDCT; */
191   /* furthermore, we scale the results by 2**PASS1_BITS. */
192 
193   inptr = coef_block;
194   quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
195   wsptr = workspace;
196   for (ctr = DCTSIZE; ctr > 0; ctr--) {
197     /* Due to quantization, we will usually find that many of the input
198      * coefficients are zero, especially the AC terms.  We can exploit this
199      * by short-circuiting the IDCT calculation for any column in which all
200      * the AC terms are zero.  In that case each output is equal to the
201      * DC coefficient (with scale factor as needed).
202      * With typical images and quantization tables, half or more of the
203      * column DCT calculations can be simplified this way.
204      */
205 
206     if (inptr[DCTSIZE * 1] == 0 && inptr[DCTSIZE * 2] == 0 &&
207         inptr[DCTSIZE * 3] == 0 && inptr[DCTSIZE * 4] == 0 &&
208         inptr[DCTSIZE * 5] == 0 && inptr[DCTSIZE * 6] == 0 &&
209         inptr[DCTSIZE * 7] == 0) {
210       /* AC terms all zero */
211       int dcval = LEFT_SHIFT(DEQUANTIZE(inptr[DCTSIZE * 0],
212                              quantptr[DCTSIZE * 0]), PASS1_BITS);
213 
214       wsptr[DCTSIZE * 0] = dcval;
215       wsptr[DCTSIZE * 1] = dcval;
216       wsptr[DCTSIZE * 2] = dcval;
217       wsptr[DCTSIZE * 3] = dcval;
218       wsptr[DCTSIZE * 4] = dcval;
219       wsptr[DCTSIZE * 5] = dcval;
220       wsptr[DCTSIZE * 6] = dcval;
221       wsptr[DCTSIZE * 7] = dcval;
222 
223       inptr++;                  /* advance pointers to next column */
224       quantptr++;
225       wsptr++;
226       continue;
227     }
228 
229     /* Even part: reverse the even part of the forward DCT. */
230     /* The rotator is sqrt(2)*c(-6). */
231 
232     z2 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
233     z3 = DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6]);
234 
235     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
236     tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065);
237     tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865);
238 
239     z2 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
240     z3 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
241 
242     tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS);
243     tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS);
244 
245     tmp10 = tmp0 + tmp3;
246     tmp13 = tmp0 - tmp3;
247     tmp11 = tmp1 + tmp2;
248     tmp12 = tmp1 - tmp2;
249 
250     /* Odd part per figure 8; the matrix is unitary and hence its
251      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
252      */
253 
254     tmp0 = DEQUANTIZE(inptr[DCTSIZE * 7], quantptr[DCTSIZE * 7]);
255     tmp1 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
256     tmp2 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
257     tmp3 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
258 
259     z1 = tmp0 + tmp3;
260     z2 = tmp1 + tmp2;
261     z3 = tmp0 + tmp2;
262     z4 = tmp1 + tmp3;
263     z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */
264 
265     tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
266     tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
267     tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
268     tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
269     z1 = MULTIPLY(z1, -FIX_0_899976223); /* sqrt(2) * ( c7-c3) */
270     z2 = MULTIPLY(z2, -FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
271     z3 = MULTIPLY(z3, -FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
272     z4 = MULTIPLY(z4, -FIX_0_390180644); /* sqrt(2) * ( c5-c3) */
273 
274     z3 += z5;
275     z4 += z5;
276 
277     tmp0 += z1 + z3;
278     tmp1 += z2 + z4;
279     tmp2 += z2 + z3;
280     tmp3 += z1 + z4;
281 
282     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
283 
284     wsptr[DCTSIZE * 0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS - PASS1_BITS);
285     wsptr[DCTSIZE * 7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS - PASS1_BITS);
286     wsptr[DCTSIZE * 1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS - PASS1_BITS);
287     wsptr[DCTSIZE * 6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS - PASS1_BITS);
288     wsptr[DCTSIZE * 2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS - PASS1_BITS);
289     wsptr[DCTSIZE * 5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS - PASS1_BITS);
290     wsptr[DCTSIZE * 3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS - PASS1_BITS);
291     wsptr[DCTSIZE * 4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS - PASS1_BITS);
292 
293     inptr++;                    /* advance pointers to next column */
294     quantptr++;
295     wsptr++;
296   }
297 
298   /* Pass 2: process rows from work array, store into output array. */
299   /* Note that we must descale the results by a factor of 8 == 2**3, */
300   /* and also undo the PASS1_BITS scaling. */
301 
302   wsptr = workspace;
303   for (ctr = 0; ctr < DCTSIZE; ctr++) {
304     outptr = output_buf[ctr] + output_col;
305     /* Rows of zeroes can be exploited in the same way as we did with columns.
306      * However, the column calculation has created many nonzero AC terms, so
307      * the simplification applies less often (typically 5% to 10% of the time).
308      * On machines with very fast multiplication, it's possible that the
309      * test takes more time than it's worth.  In that case this section
310      * may be commented out.
311      */
312 
313 #ifndef NO_ZERO_ROW_TEST
314     if (wsptr[1] == 0 && wsptr[2] == 0 && wsptr[3] == 0 && wsptr[4] == 0 &&
315         wsptr[5] == 0 && wsptr[6] == 0 && wsptr[7] == 0) {
316       /* AC terms all zero */
317       JSAMPLE dcval = range_limit[(int)DESCALE((JLONG)wsptr[0],
318                                                PASS1_BITS + 3) & RANGE_MASK];
319 
320       outptr[0] = dcval;
321       outptr[1] = dcval;
322       outptr[2] = dcval;
323       outptr[3] = dcval;
324       outptr[4] = dcval;
325       outptr[5] = dcval;
326       outptr[6] = dcval;
327       outptr[7] = dcval;
328 
329       wsptr += DCTSIZE;         /* advance pointer to next row */
330       continue;
331     }
332 #endif
333 
334     /* Even part: reverse the even part of the forward DCT. */
335     /* The rotator is sqrt(2)*c(-6). */
336 
337     z2 = (JLONG)wsptr[2];
338     z3 = (JLONG)wsptr[6];
339 
340     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
341     tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065);
342     tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865);
343 
344     tmp0 = LEFT_SHIFT((JLONG)wsptr[0] + (JLONG)wsptr[4], CONST_BITS);
345     tmp1 = LEFT_SHIFT((JLONG)wsptr[0] - (JLONG)wsptr[4], CONST_BITS);
346 
347     tmp10 = tmp0 + tmp3;
348     tmp13 = tmp0 - tmp3;
349     tmp11 = tmp1 + tmp2;
350     tmp12 = tmp1 - tmp2;
351 
352     /* Odd part per figure 8; the matrix is unitary and hence its
353      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
354      */
355 
356     tmp0 = (JLONG)wsptr[7];
357     tmp1 = (JLONG)wsptr[5];
358     tmp2 = (JLONG)wsptr[3];
359     tmp3 = (JLONG)wsptr[1];
360 
361     z1 = tmp0 + tmp3;
362     z2 = tmp1 + tmp2;
363     z3 = tmp0 + tmp2;
364     z4 = tmp1 + tmp3;
365     z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */
366 
367     tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
368     tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
369     tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
370     tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
371     z1 = MULTIPLY(z1, -FIX_0_899976223); /* sqrt(2) * ( c7-c3) */
372     z2 = MULTIPLY(z2, -FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
373     z3 = MULTIPLY(z3, -FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
374     z4 = MULTIPLY(z4, -FIX_0_390180644); /* sqrt(2) * ( c5-c3) */
375 
376     z3 += z5;
377     z4 += z5;
378 
379     tmp0 += z1 + z3;
380     tmp1 += z2 + z4;
381     tmp2 += z2 + z3;
382     tmp3 += z1 + z4;
383 
384     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
385 
386     outptr[0] = range_limit[(int)DESCALE(tmp10 + tmp3,
387                                          CONST_BITS + PASS1_BITS + 3) &
388                             RANGE_MASK];
389     outptr[7] = range_limit[(int)DESCALE(tmp10 - tmp3,
390                                          CONST_BITS + PASS1_BITS + 3) &
391                             RANGE_MASK];
392     outptr[1] = range_limit[(int)DESCALE(tmp11 + tmp2,
393                                          CONST_BITS + PASS1_BITS + 3) &
394                             RANGE_MASK];
395     outptr[6] = range_limit[(int)DESCALE(tmp11 - tmp2,
396                                          CONST_BITS + PASS1_BITS + 3) &
397                             RANGE_MASK];
398     outptr[2] = range_limit[(int)DESCALE(tmp12 + tmp1,
399                                          CONST_BITS + PASS1_BITS + 3) &
400                             RANGE_MASK];
401     outptr[5] = range_limit[(int)DESCALE(tmp12 - tmp1,
402                                          CONST_BITS + PASS1_BITS + 3) &
403                             RANGE_MASK];
404     outptr[3] = range_limit[(int)DESCALE(tmp13 + tmp0,
405                                          CONST_BITS + PASS1_BITS + 3) &
406                             RANGE_MASK];
407     outptr[4] = range_limit[(int)DESCALE(tmp13 - tmp0,
408                                          CONST_BITS + PASS1_BITS + 3) &
409                             RANGE_MASK];
410 
411     wsptr += DCTSIZE;           /* advance pointer to next row */
412   }
413 }
414 
415 #ifdef IDCT_SCALING_SUPPORTED
416 
417 
418 /*
419  * Perform dequantization and inverse DCT on one block of coefficients,
420  * producing a reduced-size 7x7 output block.
421  *
422  * Optimized algorithm with 12 multiplications in the 1-D kernel.
423  * cK represents sqrt(2) * cos(K*pi/14).
424  */
425 
426 GLOBAL(void)
jpeg_idct_7x7(j_decompress_ptr cinfo,jpeg_component_info * compptr,JCOEFPTR coef_block,JSAMPARRAY output_buf,JDIMENSION output_col)427 jpeg_idct_7x7(j_decompress_ptr cinfo, jpeg_component_info *compptr,
428               JCOEFPTR coef_block, JSAMPARRAY output_buf,
429               JDIMENSION output_col)
430 {
431   JLONG tmp0, tmp1, tmp2, tmp10, tmp11, tmp12, tmp13;
432   JLONG z1, z2, z3;
433   JCOEFPTR inptr;
434   ISLOW_MULT_TYPE *quantptr;
435   int *wsptr;
436   JSAMPROW outptr;
437   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
438   int ctr;
439   int workspace[7 * 7];         /* buffers data between passes */
440   SHIFT_TEMPS
441 
442   /* Pass 1: process columns from input, store into work array. */
443 
444   inptr = coef_block;
445   quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
446   wsptr = workspace;
447   for (ctr = 0; ctr < 7; ctr++, inptr++, quantptr++, wsptr++) {
448     /* Even part */
449 
450     tmp13 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
451     tmp13 = LEFT_SHIFT(tmp13, CONST_BITS);
452     /* Add fudge factor here for final descale. */
453     tmp13 += ONE << (CONST_BITS - PASS1_BITS - 1);
454 
455     z1 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
456     z2 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
457     z3 = DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6]);
458 
459     tmp10 = MULTIPLY(z2 - z3, FIX(0.881747734));     /* c4 */
460     tmp12 = MULTIPLY(z1 - z2, FIX(0.314692123));     /* c6 */
461     tmp11 = tmp10 + tmp12 + tmp13 - MULTIPLY(z2, FIX(1.841218003)); /* c2+c4-c6 */
462     tmp0 = z1 + z3;
463     z2 -= tmp0;
464     tmp0 = MULTIPLY(tmp0, FIX(1.274162392)) + tmp13; /* c2 */
465     tmp10 += tmp0 - MULTIPLY(z3, FIX(0.077722536));  /* c2-c4-c6 */
466     tmp12 += tmp0 - MULTIPLY(z1, FIX(2.470602249));  /* c2+c4+c6 */
467     tmp13 += MULTIPLY(z2, FIX(1.414213562));         /* c0 */
468 
469     /* Odd part */
470 
471     z1 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
472     z2 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
473     z3 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
474 
475     tmp1 = MULTIPLY(z1 + z2, FIX(0.935414347));      /* (c3+c1-c5)/2 */
476     tmp2 = MULTIPLY(z1 - z2, FIX(0.170262339));      /* (c3+c5-c1)/2 */
477     tmp0 = tmp1 - tmp2;
478     tmp1 += tmp2;
479     tmp2 = MULTIPLY(z2 + z3, -FIX(1.378756276));     /* -c1 */
480     tmp1 += tmp2;
481     z2 = MULTIPLY(z1 + z3, FIX(0.613604268));        /* c5 */
482     tmp0 += z2;
483     tmp2 += z2 + MULTIPLY(z3, FIX(1.870828693));     /* c3+c1-c5 */
484 
485     /* Final output stage */
486 
487     wsptr[7 * 0] = (int)RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS - PASS1_BITS);
488     wsptr[7 * 6] = (int)RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS - PASS1_BITS);
489     wsptr[7 * 1] = (int)RIGHT_SHIFT(tmp11 + tmp1, CONST_BITS - PASS1_BITS);
490     wsptr[7 * 5] = (int)RIGHT_SHIFT(tmp11 - tmp1, CONST_BITS - PASS1_BITS);
491     wsptr[7 * 2] = (int)RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS - PASS1_BITS);
492     wsptr[7 * 4] = (int)RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS - PASS1_BITS);
493     wsptr[7 * 3] = (int)RIGHT_SHIFT(tmp13, CONST_BITS - PASS1_BITS);
494   }
495 
496   /* Pass 2: process 7 rows from work array, store into output array. */
497 
498   wsptr = workspace;
499   for (ctr = 0; ctr < 7; ctr++) {
500     outptr = output_buf[ctr] + output_col;
501 
502     /* Even part */
503 
504     /* Add fudge factor here for final descale. */
505     tmp13 = (JLONG)wsptr[0] + (ONE << (PASS1_BITS + 2));
506     tmp13 = LEFT_SHIFT(tmp13, CONST_BITS);
507 
508     z1 = (JLONG)wsptr[2];
509     z2 = (JLONG)wsptr[4];
510     z3 = (JLONG)wsptr[6];
511 
512     tmp10 = MULTIPLY(z2 - z3, FIX(0.881747734));     /* c4 */
513     tmp12 = MULTIPLY(z1 - z2, FIX(0.314692123));     /* c6 */
514     tmp11 = tmp10 + tmp12 + tmp13 - MULTIPLY(z2, FIX(1.841218003)); /* c2+c4-c6 */
515     tmp0 = z1 + z3;
516     z2 -= tmp0;
517     tmp0 = MULTIPLY(tmp0, FIX(1.274162392)) + tmp13; /* c2 */
518     tmp10 += tmp0 - MULTIPLY(z3, FIX(0.077722536));  /* c2-c4-c6 */
519     tmp12 += tmp0 - MULTIPLY(z1, FIX(2.470602249));  /* c2+c4+c6 */
520     tmp13 += MULTIPLY(z2, FIX(1.414213562));         /* c0 */
521 
522     /* Odd part */
523 
524     z1 = (JLONG)wsptr[1];
525     z2 = (JLONG)wsptr[3];
526     z3 = (JLONG)wsptr[5];
527 
528     tmp1 = MULTIPLY(z1 + z2, FIX(0.935414347));      /* (c3+c1-c5)/2 */
529     tmp2 = MULTIPLY(z1 - z2, FIX(0.170262339));      /* (c3+c5-c1)/2 */
530     tmp0 = tmp1 - tmp2;
531     tmp1 += tmp2;
532     tmp2 = MULTIPLY(z2 + z3, -FIX(1.378756276));     /* -c1 */
533     tmp1 += tmp2;
534     z2 = MULTIPLY(z1 + z3, FIX(0.613604268));        /* c5 */
535     tmp0 += z2;
536     tmp2 += z2 + MULTIPLY(z3, FIX(1.870828693));     /* c3+c1-c5 */
537 
538     /* Final output stage */
539 
540     outptr[0] = range_limit[(int)RIGHT_SHIFT(tmp10 + tmp0,
541                                              CONST_BITS + PASS1_BITS + 3) &
542                             RANGE_MASK];
543     outptr[6] = range_limit[(int)RIGHT_SHIFT(tmp10 - tmp0,
544                                              CONST_BITS + PASS1_BITS + 3) &
545                             RANGE_MASK];
546     outptr[1] = range_limit[(int)RIGHT_SHIFT(tmp11 + tmp1,
547                                              CONST_BITS + PASS1_BITS + 3) &
548                             RANGE_MASK];
549     outptr[5] = range_limit[(int)RIGHT_SHIFT(tmp11 - tmp1,
550                                              CONST_BITS + PASS1_BITS + 3) &
551                             RANGE_MASK];
552     outptr[2] = range_limit[(int)RIGHT_SHIFT(tmp12 + tmp2,
553                                              CONST_BITS + PASS1_BITS + 3) &
554                             RANGE_MASK];
555     outptr[4] = range_limit[(int)RIGHT_SHIFT(tmp12 - tmp2,
556                                              CONST_BITS + PASS1_BITS + 3) &
557                             RANGE_MASK];
558     outptr[3] = range_limit[(int)RIGHT_SHIFT(tmp13,
559                                              CONST_BITS + PASS1_BITS + 3) &
560                             RANGE_MASK];
561 
562     wsptr += 7;         /* advance pointer to next row */
563   }
564 }
565 
566 
567 /*
568  * Perform dequantization and inverse DCT on one block of coefficients,
569  * producing a reduced-size 6x6 output block.
570  *
571  * Optimized algorithm with 3 multiplications in the 1-D kernel.
572  * cK represents sqrt(2) * cos(K*pi/12).
573  */
574 
575 GLOBAL(void)
jpeg_idct_6x6(j_decompress_ptr cinfo,jpeg_component_info * compptr,JCOEFPTR coef_block,JSAMPARRAY output_buf,JDIMENSION output_col)576 jpeg_idct_6x6(j_decompress_ptr cinfo, jpeg_component_info *compptr,
577               JCOEFPTR coef_block, JSAMPARRAY output_buf,
578               JDIMENSION output_col)
579 {
580   JLONG tmp0, tmp1, tmp2, tmp10, tmp11, tmp12;
581   JLONG z1, z2, z3;
582   JCOEFPTR inptr;
583   ISLOW_MULT_TYPE *quantptr;
584   int *wsptr;
585   JSAMPROW outptr;
586   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
587   int ctr;
588   int workspace[6 * 6];         /* buffers data between passes */
589   SHIFT_TEMPS
590 
591   /* Pass 1: process columns from input, store into work array. */
592 
593   inptr = coef_block;
594   quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
595   wsptr = workspace;
596   for (ctr = 0; ctr < 6; ctr++, inptr++, quantptr++, wsptr++) {
597     /* Even part */
598 
599     tmp0 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
600     tmp0 = LEFT_SHIFT(tmp0, CONST_BITS);
601     /* Add fudge factor here for final descale. */
602     tmp0 += ONE << (CONST_BITS - PASS1_BITS - 1);
603     tmp2 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
604     tmp10 = MULTIPLY(tmp2, FIX(0.707106781));   /* c4 */
605     tmp1 = tmp0 + tmp10;
606     tmp11 = RIGHT_SHIFT(tmp0 - tmp10 - tmp10, CONST_BITS - PASS1_BITS);
607     tmp10 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
608     tmp0 = MULTIPLY(tmp10, FIX(1.224744871));   /* c2 */
609     tmp10 = tmp1 + tmp0;
610     tmp12 = tmp1 - tmp0;
611 
612     /* Odd part */
613 
614     z1 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
615     z2 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
616     z3 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
617     tmp1 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */
618     tmp0 = tmp1 + LEFT_SHIFT(z1 + z2, CONST_BITS);
619     tmp2 = tmp1 + LEFT_SHIFT(z3 - z2, CONST_BITS);
620     tmp1 = LEFT_SHIFT(z1 - z2 - z3, PASS1_BITS);
621 
622     /* Final output stage */
623 
624     wsptr[6 * 0] = (int)RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS - PASS1_BITS);
625     wsptr[6 * 5] = (int)RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS - PASS1_BITS);
626     wsptr[6 * 1] = (int)(tmp11 + tmp1);
627     wsptr[6 * 4] = (int)(tmp11 - tmp1);
628     wsptr[6 * 2] = (int)RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS - PASS1_BITS);
629     wsptr[6 * 3] = (int)RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS - PASS1_BITS);
630   }
631 
632   /* Pass 2: process 6 rows from work array, store into output array. */
633 
634   wsptr = workspace;
635   for (ctr = 0; ctr < 6; ctr++) {
636     outptr = output_buf[ctr] + output_col;
637 
638     /* Even part */
639 
640     /* Add fudge factor here for final descale. */
641     tmp0 = (JLONG)wsptr[0] + (ONE << (PASS1_BITS + 2));
642     tmp0 = LEFT_SHIFT(tmp0, CONST_BITS);
643     tmp2 = (JLONG)wsptr[4];
644     tmp10 = MULTIPLY(tmp2, FIX(0.707106781));   /* c4 */
645     tmp1 = tmp0 + tmp10;
646     tmp11 = tmp0 - tmp10 - tmp10;
647     tmp10 = (JLONG)wsptr[2];
648     tmp0 = MULTIPLY(tmp10, FIX(1.224744871));   /* c2 */
649     tmp10 = tmp1 + tmp0;
650     tmp12 = tmp1 - tmp0;
651 
652     /* Odd part */
653 
654     z1 = (JLONG)wsptr[1];
655     z2 = (JLONG)wsptr[3];
656     z3 = (JLONG)wsptr[5];
657     tmp1 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */
658     tmp0 = tmp1 + LEFT_SHIFT(z1 + z2, CONST_BITS);
659     tmp2 = tmp1 + LEFT_SHIFT(z3 - z2, CONST_BITS);
660     tmp1 = LEFT_SHIFT(z1 - z2 - z3, CONST_BITS);
661 
662     /* Final output stage */
663 
664     outptr[0] = range_limit[(int)RIGHT_SHIFT(tmp10 + tmp0,
665                                              CONST_BITS + PASS1_BITS + 3) &
666                             RANGE_MASK];
667     outptr[5] = range_limit[(int)RIGHT_SHIFT(tmp10 - tmp0,
668                                              CONST_BITS + PASS1_BITS + 3) &
669                             RANGE_MASK];
670     outptr[1] = range_limit[(int)RIGHT_SHIFT(tmp11 + tmp1,
671                                              CONST_BITS + PASS1_BITS + 3) &
672                             RANGE_MASK];
673     outptr[4] = range_limit[(int)RIGHT_SHIFT(tmp11 - tmp1,
674                                              CONST_BITS + PASS1_BITS + 3) &
675                             RANGE_MASK];
676     outptr[2] = range_limit[(int)RIGHT_SHIFT(tmp12 + tmp2,
677                                              CONST_BITS + PASS1_BITS + 3) &
678                             RANGE_MASK];
679     outptr[3] = range_limit[(int)RIGHT_SHIFT(tmp12 - tmp2,
680                                              CONST_BITS + PASS1_BITS + 3) &
681                             RANGE_MASK];
682 
683     wsptr += 6;         /* advance pointer to next row */
684   }
685 }
686 
687 
688 /*
689  * Perform dequantization and inverse DCT on one block of coefficients,
690  * producing a reduced-size 5x5 output block.
691  *
692  * Optimized algorithm with 5 multiplications in the 1-D kernel.
693  * cK represents sqrt(2) * cos(K*pi/10).
694  */
695 
696 GLOBAL(void)
jpeg_idct_5x5(j_decompress_ptr cinfo,jpeg_component_info * compptr,JCOEFPTR coef_block,JSAMPARRAY output_buf,JDIMENSION output_col)697 jpeg_idct_5x5(j_decompress_ptr cinfo, jpeg_component_info *compptr,
698               JCOEFPTR coef_block, JSAMPARRAY output_buf,
699               JDIMENSION output_col)
700 {
701   JLONG tmp0, tmp1, tmp10, tmp11, tmp12;
702   JLONG z1, z2, z3;
703   JCOEFPTR inptr;
704   ISLOW_MULT_TYPE *quantptr;
705   int *wsptr;
706   JSAMPROW outptr;
707   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
708   int ctr;
709   int workspace[5 * 5];         /* buffers data between passes */
710   SHIFT_TEMPS
711 
712   /* Pass 1: process columns from input, store into work array. */
713 
714   inptr = coef_block;
715   quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
716   wsptr = workspace;
717   for (ctr = 0; ctr < 5; ctr++, inptr++, quantptr++, wsptr++) {
718     /* Even part */
719 
720     tmp12 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
721     tmp12 = LEFT_SHIFT(tmp12, CONST_BITS);
722     /* Add fudge factor here for final descale. */
723     tmp12 += ONE << (CONST_BITS - PASS1_BITS - 1);
724     tmp0 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
725     tmp1 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
726     z1 = MULTIPLY(tmp0 + tmp1, FIX(0.790569415)); /* (c2+c4)/2 */
727     z2 = MULTIPLY(tmp0 - tmp1, FIX(0.353553391)); /* (c2-c4)/2 */
728     z3 = tmp12 + z2;
729     tmp10 = z3 + z1;
730     tmp11 = z3 - z1;
731     tmp12 -= LEFT_SHIFT(z2, 2);
732 
733     /* Odd part */
734 
735     z2 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
736     z3 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
737 
738     z1 = MULTIPLY(z2 + z3, FIX(0.831253876));     /* c3 */
739     tmp0 = z1 + MULTIPLY(z2, FIX(0.513743148));   /* c1-c3 */
740     tmp1 = z1 - MULTIPLY(z3, FIX(2.176250899));   /* c1+c3 */
741 
742     /* Final output stage */
743 
744     wsptr[5 * 0] = (int)RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS - PASS1_BITS);
745     wsptr[5 * 4] = (int)RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS - PASS1_BITS);
746     wsptr[5 * 1] = (int)RIGHT_SHIFT(tmp11 + tmp1, CONST_BITS - PASS1_BITS);
747     wsptr[5 * 3] = (int)RIGHT_SHIFT(tmp11 - tmp1, CONST_BITS - PASS1_BITS);
748     wsptr[5 * 2] = (int)RIGHT_SHIFT(tmp12, CONST_BITS - PASS1_BITS);
749   }
750 
751   /* Pass 2: process 5 rows from work array, store into output array. */
752 
753   wsptr = workspace;
754   for (ctr = 0; ctr < 5; ctr++) {
755     outptr = output_buf[ctr] + output_col;
756 
757     /* Even part */
758 
759     /* Add fudge factor here for final descale. */
760     tmp12 = (JLONG)wsptr[0] + (ONE << (PASS1_BITS + 2));
761     tmp12 = LEFT_SHIFT(tmp12, CONST_BITS);
762     tmp0 = (JLONG)wsptr[2];
763     tmp1 = (JLONG)wsptr[4];
764     z1 = MULTIPLY(tmp0 + tmp1, FIX(0.790569415)); /* (c2+c4)/2 */
765     z2 = MULTIPLY(tmp0 - tmp1, FIX(0.353553391)); /* (c2-c4)/2 */
766     z3 = tmp12 + z2;
767     tmp10 = z3 + z1;
768     tmp11 = z3 - z1;
769     tmp12 -= LEFT_SHIFT(z2, 2);
770 
771     /* Odd part */
772 
773     z2 = (JLONG)wsptr[1];
774     z3 = (JLONG)wsptr[3];
775 
776     z1 = MULTIPLY(z2 + z3, FIX(0.831253876));     /* c3 */
777     tmp0 = z1 + MULTIPLY(z2, FIX(0.513743148));   /* c1-c3 */
778     tmp1 = z1 - MULTIPLY(z3, FIX(2.176250899));   /* c1+c3 */
779 
780     /* Final output stage */
781 
782     outptr[0] = range_limit[(int)RIGHT_SHIFT(tmp10 + tmp0,
783                                              CONST_BITS + PASS1_BITS + 3) &
784                             RANGE_MASK];
785     outptr[4] = range_limit[(int)RIGHT_SHIFT(tmp10 - tmp0,
786                                              CONST_BITS + PASS1_BITS + 3) &
787                             RANGE_MASK];
788     outptr[1] = range_limit[(int)RIGHT_SHIFT(tmp11 + tmp1,
789                                              CONST_BITS + PASS1_BITS + 3) &
790                             RANGE_MASK];
791     outptr[3] = range_limit[(int)RIGHT_SHIFT(tmp11 - tmp1,
792                                              CONST_BITS + PASS1_BITS + 3) &
793                             RANGE_MASK];
794     outptr[2] = range_limit[(int)RIGHT_SHIFT(tmp12,
795                                              CONST_BITS + PASS1_BITS + 3) &
796                             RANGE_MASK];
797 
798     wsptr += 5;         /* advance pointer to next row */
799   }
800 }
801 
802 
803 /*
804  * Perform dequantization and inverse DCT on one block of coefficients,
805  * producing a reduced-size 3x3 output block.
806  *
807  * Optimized algorithm with 2 multiplications in the 1-D kernel.
808  * cK represents sqrt(2) * cos(K*pi/6).
809  */
810 
811 GLOBAL(void)
jpeg_idct_3x3(j_decompress_ptr cinfo,jpeg_component_info * compptr,JCOEFPTR coef_block,JSAMPARRAY output_buf,JDIMENSION output_col)812 jpeg_idct_3x3(j_decompress_ptr cinfo, jpeg_component_info *compptr,
813               JCOEFPTR coef_block, JSAMPARRAY output_buf,
814               JDIMENSION output_col)
815 {
816   JLONG tmp0, tmp2, tmp10, tmp12;
817   JCOEFPTR inptr;
818   ISLOW_MULT_TYPE *quantptr;
819   int *wsptr;
820   JSAMPROW outptr;
821   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
822   int ctr;
823   int workspace[3 * 3];         /* buffers data between passes */
824   SHIFT_TEMPS
825 
826   /* Pass 1: process columns from input, store into work array. */
827 
828   inptr = coef_block;
829   quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
830   wsptr = workspace;
831   for (ctr = 0; ctr < 3; ctr++, inptr++, quantptr++, wsptr++) {
832     /* Even part */
833 
834     tmp0 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
835     tmp0 = LEFT_SHIFT(tmp0, CONST_BITS);
836     /* Add fudge factor here for final descale. */
837     tmp0 += ONE << (CONST_BITS - PASS1_BITS - 1);
838     tmp2 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
839     tmp12 = MULTIPLY(tmp2, FIX(0.707106781)); /* c2 */
840     tmp10 = tmp0 + tmp12;
841     tmp2 = tmp0 - tmp12 - tmp12;
842 
843     /* Odd part */
844 
845     tmp12 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
846     tmp0 = MULTIPLY(tmp12, FIX(1.224744871)); /* c1 */
847 
848     /* Final output stage */
849 
850     wsptr[3 * 0] = (int)RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS - PASS1_BITS);
851     wsptr[3 * 2] = (int)RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS - PASS1_BITS);
852     wsptr[3 * 1] = (int)RIGHT_SHIFT(tmp2, CONST_BITS - PASS1_BITS);
853   }
854 
855   /* Pass 2: process 3 rows from work array, store into output array. */
856 
857   wsptr = workspace;
858   for (ctr = 0; ctr < 3; ctr++) {
859     outptr = output_buf[ctr] + output_col;
860 
861     /* Even part */
862 
863     /* Add fudge factor here for final descale. */
864     tmp0 = (JLONG)wsptr[0] + (ONE << (PASS1_BITS + 2));
865     tmp0 = LEFT_SHIFT(tmp0, CONST_BITS);
866     tmp2 = (JLONG)wsptr[2];
867     tmp12 = MULTIPLY(tmp2, FIX(0.707106781)); /* c2 */
868     tmp10 = tmp0 + tmp12;
869     tmp2 = tmp0 - tmp12 - tmp12;
870 
871     /* Odd part */
872 
873     tmp12 = (JLONG)wsptr[1];
874     tmp0 = MULTIPLY(tmp12, FIX(1.224744871)); /* c1 */
875 
876     /* Final output stage */
877 
878     outptr[0] = range_limit[(int)RIGHT_SHIFT(tmp10 + tmp0,
879                                              CONST_BITS + PASS1_BITS + 3) &
880                             RANGE_MASK];
881     outptr[2] = range_limit[(int)RIGHT_SHIFT(tmp10 - tmp0,
882                                              CONST_BITS + PASS1_BITS + 3) &
883                             RANGE_MASK];
884     outptr[1] = range_limit[(int)RIGHT_SHIFT(tmp2,
885                                              CONST_BITS + PASS1_BITS + 3) &
886                             RANGE_MASK];
887 
888     wsptr += 3;         /* advance pointer to next row */
889   }
890 }
891 
892 
893 /*
894  * Perform dequantization and inverse DCT on one block of coefficients,
895  * producing a 9x9 output block.
896  *
897  * Optimized algorithm with 10 multiplications in the 1-D kernel.
898  * cK represents sqrt(2) * cos(K*pi/18).
899  */
900 
901 GLOBAL(void)
jpeg_idct_9x9(j_decompress_ptr cinfo,jpeg_component_info * compptr,JCOEFPTR coef_block,JSAMPARRAY output_buf,JDIMENSION output_col)902 jpeg_idct_9x9(j_decompress_ptr cinfo, jpeg_component_info *compptr,
903               JCOEFPTR coef_block, JSAMPARRAY output_buf,
904               JDIMENSION output_col)
905 {
906   JLONG tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13, tmp14;
907   JLONG z1, z2, z3, z4;
908   JCOEFPTR inptr;
909   ISLOW_MULT_TYPE *quantptr;
910   int *wsptr;
911   JSAMPROW outptr;
912   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
913   int ctr;
914   int workspace[8 * 9];         /* buffers data between passes */
915   SHIFT_TEMPS
916 
917   /* Pass 1: process columns from input, store into work array. */
918 
919   inptr = coef_block;
920   quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
921   wsptr = workspace;
922   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
923     /* Even part */
924 
925     tmp0 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
926     tmp0 = LEFT_SHIFT(tmp0, CONST_BITS);
927     /* Add fudge factor here for final descale. */
928     tmp0 += ONE << (CONST_BITS - PASS1_BITS - 1);
929 
930     z1 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
931     z2 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
932     z3 = DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6]);
933 
934     tmp3 = MULTIPLY(z3, FIX(0.707106781));      /* c6 */
935     tmp1 = tmp0 + tmp3;
936     tmp2 = tmp0 - tmp3 - tmp3;
937 
938     tmp0 = MULTIPLY(z1 - z2, FIX(0.707106781)); /* c6 */
939     tmp11 = tmp2 + tmp0;
940     tmp14 = tmp2 - tmp0 - tmp0;
941 
942     tmp0 = MULTIPLY(z1 + z2, FIX(1.328926049)); /* c2 */
943     tmp2 = MULTIPLY(z1, FIX(1.083350441));      /* c4 */
944     tmp3 = MULTIPLY(z2, FIX(0.245575608));      /* c8 */
945 
946     tmp10 = tmp1 + tmp0 - tmp3;
947     tmp12 = tmp1 - tmp0 + tmp2;
948     tmp13 = tmp1 - tmp2 + tmp3;
949 
950     /* Odd part */
951 
952     z1 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
953     z2 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
954     z3 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
955     z4 = DEQUANTIZE(inptr[DCTSIZE * 7], quantptr[DCTSIZE * 7]);
956 
957     z2 = MULTIPLY(z2, -FIX(1.224744871));            /* -c3 */
958 
959     tmp2 = MULTIPLY(z1 + z3, FIX(0.909038955));      /* c5 */
960     tmp3 = MULTIPLY(z1 + z4, FIX(0.483689525));      /* c7 */
961     tmp0 = tmp2 + tmp3 - z2;
962     tmp1 = MULTIPLY(z3 - z4, FIX(1.392728481));      /* c1 */
963     tmp2 += z2 - tmp1;
964     tmp3 += z2 + tmp1;
965     tmp1 = MULTIPLY(z1 - z3 - z4, FIX(1.224744871)); /* c3 */
966 
967     /* Final output stage */
968 
969     wsptr[8 * 0] = (int)RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS - PASS1_BITS);
970     wsptr[8 * 8] = (int)RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS - PASS1_BITS);
971     wsptr[8 * 1] = (int)RIGHT_SHIFT(tmp11 + tmp1, CONST_BITS - PASS1_BITS);
972     wsptr[8 * 7] = (int)RIGHT_SHIFT(tmp11 - tmp1, CONST_BITS - PASS1_BITS);
973     wsptr[8 * 2] = (int)RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS - PASS1_BITS);
974     wsptr[8 * 6] = (int)RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS - PASS1_BITS);
975     wsptr[8 * 3] = (int)RIGHT_SHIFT(tmp13 + tmp3, CONST_BITS - PASS1_BITS);
976     wsptr[8 * 5] = (int)RIGHT_SHIFT(tmp13 - tmp3, CONST_BITS - PASS1_BITS);
977     wsptr[8 * 4] = (int)RIGHT_SHIFT(tmp14, CONST_BITS - PASS1_BITS);
978   }
979 
980   /* Pass 2: process 9 rows from work array, store into output array. */
981 
982   wsptr = workspace;
983   for (ctr = 0; ctr < 9; ctr++) {
984     outptr = output_buf[ctr] + output_col;
985 
986     /* Even part */
987 
988     /* Add fudge factor here for final descale. */
989     tmp0 = (JLONG)wsptr[0] + (ONE << (PASS1_BITS + 2));
990     tmp0 = LEFT_SHIFT(tmp0, CONST_BITS);
991 
992     z1 = (JLONG)wsptr[2];
993     z2 = (JLONG)wsptr[4];
994     z3 = (JLONG)wsptr[6];
995 
996     tmp3 = MULTIPLY(z3, FIX(0.707106781));      /* c6 */
997     tmp1 = tmp0 + tmp3;
998     tmp2 = tmp0 - tmp3 - tmp3;
999 
1000     tmp0 = MULTIPLY(z1 - z2, FIX(0.707106781)); /* c6 */
1001     tmp11 = tmp2 + tmp0;
1002     tmp14 = tmp2 - tmp0 - tmp0;
1003 
1004     tmp0 = MULTIPLY(z1 + z2, FIX(1.328926049)); /* c2 */
1005     tmp2 = MULTIPLY(z1, FIX(1.083350441));      /* c4 */
1006     tmp3 = MULTIPLY(z2, FIX(0.245575608));      /* c8 */
1007 
1008     tmp10 = tmp1 + tmp0 - tmp3;
1009     tmp12 = tmp1 - tmp0 + tmp2;
1010     tmp13 = tmp1 - tmp2 + tmp3;
1011 
1012     /* Odd part */
1013 
1014     z1 = (JLONG)wsptr[1];
1015     z2 = (JLONG)wsptr[3];
1016     z3 = (JLONG)wsptr[5];
1017     z4 = (JLONG)wsptr[7];
1018 
1019     z2 = MULTIPLY(z2, -FIX(1.224744871));            /* -c3 */
1020 
1021     tmp2 = MULTIPLY(z1 + z3, FIX(0.909038955));      /* c5 */
1022     tmp3 = MULTIPLY(z1 + z4, FIX(0.483689525));      /* c7 */
1023     tmp0 = tmp2 + tmp3 - z2;
1024     tmp1 = MULTIPLY(z3 - z4, FIX(1.392728481));      /* c1 */
1025     tmp2 += z2 - tmp1;
1026     tmp3 += z2 + tmp1;
1027     tmp1 = MULTIPLY(z1 - z3 - z4, FIX(1.224744871)); /* c3 */
1028 
1029     /* Final output stage */
1030 
1031     outptr[0] = range_limit[(int)RIGHT_SHIFT(tmp10 + tmp0,
1032                                              CONST_BITS + PASS1_BITS + 3) &
1033                             RANGE_MASK];
1034     outptr[8] = range_limit[(int)RIGHT_SHIFT(tmp10 - tmp0,
1035                                              CONST_BITS + PASS1_BITS + 3) &
1036                             RANGE_MASK];
1037     outptr[1] = range_limit[(int)RIGHT_SHIFT(tmp11 + tmp1,
1038                                              CONST_BITS + PASS1_BITS + 3) &
1039                             RANGE_MASK];
1040     outptr[7] = range_limit[(int)RIGHT_SHIFT(tmp11 - tmp1,
1041                                              CONST_BITS + PASS1_BITS + 3) &
1042                             RANGE_MASK];
1043     outptr[2] = range_limit[(int)RIGHT_SHIFT(tmp12 + tmp2,
1044                                              CONST_BITS + PASS1_BITS + 3) &
1045                             RANGE_MASK];
1046     outptr[6] = range_limit[(int)RIGHT_SHIFT(tmp12 - tmp2,
1047                                              CONST_BITS + PASS1_BITS + 3) &
1048                             RANGE_MASK];
1049     outptr[3] = range_limit[(int)RIGHT_SHIFT(tmp13 + tmp3,
1050                                              CONST_BITS + PASS1_BITS + 3) &
1051                             RANGE_MASK];
1052     outptr[5] = range_limit[(int)RIGHT_SHIFT(tmp13 - tmp3,
1053                                              CONST_BITS + PASS1_BITS + 3) &
1054                             RANGE_MASK];
1055     outptr[4] = range_limit[(int)RIGHT_SHIFT(tmp14,
1056                                              CONST_BITS + PASS1_BITS + 3) &
1057                             RANGE_MASK];
1058 
1059     wsptr += 8;         /* advance pointer to next row */
1060   }
1061 }
1062 
1063 
1064 /*
1065  * Perform dequantization and inverse DCT on one block of coefficients,
1066  * producing a 10x10 output block.
1067  *
1068  * Optimized algorithm with 12 multiplications in the 1-D kernel.
1069  * cK represents sqrt(2) * cos(K*pi/20).
1070  */
1071 
1072 GLOBAL(void)
jpeg_idct_10x10(j_decompress_ptr cinfo,jpeg_component_info * compptr,JCOEFPTR coef_block,JSAMPARRAY output_buf,JDIMENSION output_col)1073 jpeg_idct_10x10(j_decompress_ptr cinfo, jpeg_component_info *compptr,
1074                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
1075                 JDIMENSION output_col)
1076 {
1077   JLONG tmp10, tmp11, tmp12, tmp13, tmp14;
1078   JLONG tmp20, tmp21, tmp22, tmp23, tmp24;
1079   JLONG z1, z2, z3, z4, z5;
1080   JCOEFPTR inptr;
1081   ISLOW_MULT_TYPE *quantptr;
1082   int *wsptr;
1083   JSAMPROW outptr;
1084   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
1085   int ctr;
1086   int workspace[8 * 10];        /* buffers data between passes */
1087   SHIFT_TEMPS
1088 
1089   /* Pass 1: process columns from input, store into work array. */
1090 
1091   inptr = coef_block;
1092   quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
1093   wsptr = workspace;
1094   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
1095     /* Even part */
1096 
1097     z3 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
1098     z3 = LEFT_SHIFT(z3, CONST_BITS);
1099     /* Add fudge factor here for final descale. */
1100     z3 += ONE << (CONST_BITS - PASS1_BITS - 1);
1101     z4 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
1102     z1 = MULTIPLY(z4, FIX(1.144122806));         /* c4 */
1103     z2 = MULTIPLY(z4, FIX(0.437016024));         /* c8 */
1104     tmp10 = z3 + z1;
1105     tmp11 = z3 - z2;
1106 
1107     tmp22 = RIGHT_SHIFT(z3 - LEFT_SHIFT(z1 - z2, 1),
1108                         CONST_BITS - PASS1_BITS); /* c0 = (c4-c8)*2 */
1109 
1110     z2 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
1111     z3 = DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6]);
1112 
1113     z1 = MULTIPLY(z2 + z3, FIX(0.831253876));    /* c6 */
1114     tmp12 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c2-c6 */
1115     tmp13 = z1 - MULTIPLY(z3, FIX(2.176250899)); /* c2+c6 */
1116 
1117     tmp20 = tmp10 + tmp12;
1118     tmp24 = tmp10 - tmp12;
1119     tmp21 = tmp11 + tmp13;
1120     tmp23 = tmp11 - tmp13;
1121 
1122     /* Odd part */
1123 
1124     z1 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
1125     z2 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
1126     z3 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
1127     z4 = DEQUANTIZE(inptr[DCTSIZE * 7], quantptr[DCTSIZE * 7]);
1128 
1129     tmp11 = z2 + z4;
1130     tmp13 = z2 - z4;
1131 
1132     tmp12 = MULTIPLY(tmp13, FIX(0.309016994));        /* (c3-c7)/2 */
1133     z5 = LEFT_SHIFT(z3, CONST_BITS);
1134 
1135     z2 = MULTIPLY(tmp11, FIX(0.951056516));           /* (c3+c7)/2 */
1136     z4 = z5 + tmp12;
1137 
1138     tmp10 = MULTIPLY(z1, FIX(1.396802247)) + z2 + z4; /* c1 */
1139     tmp14 = MULTIPLY(z1, FIX(0.221231742)) - z2 + z4; /* c9 */
1140 
1141     z2 = MULTIPLY(tmp11, FIX(0.587785252));           /* (c1-c9)/2 */
1142     z4 = z5 - tmp12 - LEFT_SHIFT(tmp13, CONST_BITS - 1);
1143 
1144     tmp12 = LEFT_SHIFT(z1 - tmp13 - z3, PASS1_BITS);
1145 
1146     tmp11 = MULTIPLY(z1, FIX(1.260073511)) - z2 - z4; /* c3 */
1147     tmp13 = MULTIPLY(z1, FIX(0.642039522)) - z2 + z4; /* c7 */
1148 
1149     /* Final output stage */
1150 
1151     wsptr[8 * 0] = (int)RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS - PASS1_BITS);
1152     wsptr[8 * 9] = (int)RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS - PASS1_BITS);
1153     wsptr[8 * 1] = (int)RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS - PASS1_BITS);
1154     wsptr[8 * 8] = (int)RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS - PASS1_BITS);
1155     wsptr[8 * 2] = (int)(tmp22 + tmp12);
1156     wsptr[8 * 7] = (int)(tmp22 - tmp12);
1157     wsptr[8 * 3] = (int)RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS - PASS1_BITS);
1158     wsptr[8 * 6] = (int)RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS - PASS1_BITS);
1159     wsptr[8 * 4] = (int)RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS - PASS1_BITS);
1160     wsptr[8 * 5] = (int)RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS - PASS1_BITS);
1161   }
1162 
1163   /* Pass 2: process 10 rows from work array, store into output array. */
1164 
1165   wsptr = workspace;
1166   for (ctr = 0; ctr < 10; ctr++) {
1167     outptr = output_buf[ctr] + output_col;
1168 
1169     /* Even part */
1170 
1171     /* Add fudge factor here for final descale. */
1172     z3 = (JLONG)wsptr[0] + (ONE << (PASS1_BITS + 2));
1173     z3 = LEFT_SHIFT(z3, CONST_BITS);
1174     z4 = (JLONG)wsptr[4];
1175     z1 = MULTIPLY(z4, FIX(1.144122806));         /* c4 */
1176     z2 = MULTIPLY(z4, FIX(0.437016024));         /* c8 */
1177     tmp10 = z3 + z1;
1178     tmp11 = z3 - z2;
1179 
1180     tmp22 = z3 - LEFT_SHIFT(z1 - z2, 1);         /* c0 = (c4-c8)*2 */
1181 
1182     z2 = (JLONG)wsptr[2];
1183     z3 = (JLONG)wsptr[6];
1184 
1185     z1 = MULTIPLY(z2 + z3, FIX(0.831253876));    /* c6 */
1186     tmp12 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c2-c6 */
1187     tmp13 = z1 - MULTIPLY(z3, FIX(2.176250899)); /* c2+c6 */
1188 
1189     tmp20 = tmp10 + tmp12;
1190     tmp24 = tmp10 - tmp12;
1191     tmp21 = tmp11 + tmp13;
1192     tmp23 = tmp11 - tmp13;
1193 
1194     /* Odd part */
1195 
1196     z1 = (JLONG)wsptr[1];
1197     z2 = (JLONG)wsptr[3];
1198     z3 = (JLONG)wsptr[5];
1199     z3 = LEFT_SHIFT(z3, CONST_BITS);
1200     z4 = (JLONG)wsptr[7];
1201 
1202     tmp11 = z2 + z4;
1203     tmp13 = z2 - z4;
1204 
1205     tmp12 = MULTIPLY(tmp13, FIX(0.309016994));        /* (c3-c7)/2 */
1206 
1207     z2 = MULTIPLY(tmp11, FIX(0.951056516));           /* (c3+c7)/2 */
1208     z4 = z3 + tmp12;
1209 
1210     tmp10 = MULTIPLY(z1, FIX(1.396802247)) + z2 + z4; /* c1 */
1211     tmp14 = MULTIPLY(z1, FIX(0.221231742)) - z2 + z4; /* c9 */
1212 
1213     z2 = MULTIPLY(tmp11, FIX(0.587785252));           /* (c1-c9)/2 */
1214     z4 = z3 - tmp12 - LEFT_SHIFT(tmp13, CONST_BITS - 1);
1215 
1216     tmp12 = LEFT_SHIFT(z1 - tmp13, CONST_BITS) - z3;
1217 
1218     tmp11 = MULTIPLY(z1, FIX(1.260073511)) - z2 - z4; /* c3 */
1219     tmp13 = MULTIPLY(z1, FIX(0.642039522)) - z2 + z4; /* c7 */
1220 
1221     /* Final output stage */
1222 
1223     outptr[0] = range_limit[(int)RIGHT_SHIFT(tmp20 + tmp10,
1224                                              CONST_BITS + PASS1_BITS + 3) &
1225                             RANGE_MASK];
1226     outptr[9] = range_limit[(int)RIGHT_SHIFT(tmp20 - tmp10,
1227                                              CONST_BITS + PASS1_BITS + 3) &
1228                             RANGE_MASK];
1229     outptr[1] = range_limit[(int)RIGHT_SHIFT(tmp21 + tmp11,
1230                                              CONST_BITS + PASS1_BITS + 3) &
1231                             RANGE_MASK];
1232     outptr[8] = range_limit[(int)RIGHT_SHIFT(tmp21 - tmp11,
1233                                              CONST_BITS + PASS1_BITS + 3) &
1234                             RANGE_MASK];
1235     outptr[2] = range_limit[(int)RIGHT_SHIFT(tmp22 + tmp12,
1236                                              CONST_BITS + PASS1_BITS + 3) &
1237                             RANGE_MASK];
1238     outptr[7] = range_limit[(int)RIGHT_SHIFT(tmp22 - tmp12,
1239                                              CONST_BITS + PASS1_BITS + 3) &
1240                             RANGE_MASK];
1241     outptr[3] = range_limit[(int)RIGHT_SHIFT(tmp23 + tmp13,
1242                                              CONST_BITS + PASS1_BITS + 3) &
1243                             RANGE_MASK];
1244     outptr[6] = range_limit[(int)RIGHT_SHIFT(tmp23 - tmp13,
1245                                              CONST_BITS + PASS1_BITS + 3) &
1246                             RANGE_MASK];
1247     outptr[4] = range_limit[(int)RIGHT_SHIFT(tmp24 + tmp14,
1248                                              CONST_BITS + PASS1_BITS + 3) &
1249                             RANGE_MASK];
1250     outptr[5] = range_limit[(int)RIGHT_SHIFT(tmp24 - tmp14,
1251                                              CONST_BITS + PASS1_BITS + 3) &
1252                             RANGE_MASK];
1253 
1254     wsptr += 8;         /* advance pointer to next row */
1255   }
1256 }
1257 
1258 
1259 /*
1260  * Perform dequantization and inverse DCT on one block of coefficients,
1261  * producing an 11x11 output block.
1262  *
1263  * Optimized algorithm with 24 multiplications in the 1-D kernel.
1264  * cK represents sqrt(2) * cos(K*pi/22).
1265  */
1266 
1267 GLOBAL(void)
jpeg_idct_11x11(j_decompress_ptr cinfo,jpeg_component_info * compptr,JCOEFPTR coef_block,JSAMPARRAY output_buf,JDIMENSION output_col)1268 jpeg_idct_11x11(j_decompress_ptr cinfo, jpeg_component_info *compptr,
1269                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
1270                 JDIMENSION output_col)
1271 {
1272   JLONG tmp10, tmp11, tmp12, tmp13, tmp14;
1273   JLONG tmp20, tmp21, tmp22, tmp23, tmp24, tmp25;
1274   JLONG z1, z2, z3, z4;
1275   JCOEFPTR inptr;
1276   ISLOW_MULT_TYPE *quantptr;
1277   int *wsptr;
1278   JSAMPROW outptr;
1279   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
1280   int ctr;
1281   int workspace[8 * 11];        /* buffers data between passes */
1282   SHIFT_TEMPS
1283 
1284   /* Pass 1: process columns from input, store into work array. */
1285 
1286   inptr = coef_block;
1287   quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
1288   wsptr = workspace;
1289   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
1290     /* Even part */
1291 
1292     tmp10 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
1293     tmp10 = LEFT_SHIFT(tmp10, CONST_BITS);
1294     /* Add fudge factor here for final descale. */
1295     tmp10 += ONE << (CONST_BITS - PASS1_BITS - 1);
1296 
1297     z1 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
1298     z2 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
1299     z3 = DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6]);
1300 
1301     tmp20 = MULTIPLY(z2 - z3, FIX(2.546640132));     /* c2+c4 */
1302     tmp23 = MULTIPLY(z2 - z1, FIX(0.430815045));     /* c2-c6 */
1303     z4 = z1 + z3;
1304     tmp24 = MULTIPLY(z4, -FIX(1.155664402));         /* -(c2-c10) */
1305     z4 -= z2;
1306     tmp25 = tmp10 + MULTIPLY(z4, FIX(1.356927976));  /* c2 */
1307     tmp21 = tmp20 + tmp23 + tmp25 -
1308             MULTIPLY(z2, FIX(1.821790775));          /* c2+c4+c10-c6 */
1309     tmp20 += tmp25 + MULTIPLY(z3, FIX(2.115825087)); /* c4+c6 */
1310     tmp23 += tmp25 - MULTIPLY(z1, FIX(1.513598477)); /* c6+c8 */
1311     tmp24 += tmp25;
1312     tmp22 = tmp24 - MULTIPLY(z3, FIX(0.788749120));  /* c8+c10 */
1313     tmp24 += MULTIPLY(z2, FIX(1.944413522)) -        /* c2+c8 */
1314              MULTIPLY(z1, FIX(1.390975730));         /* c4+c10 */
1315     tmp25 = tmp10 - MULTIPLY(z4, FIX(1.414213562));  /* c0 */
1316 
1317     /* Odd part */
1318 
1319     z1 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
1320     z2 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
1321     z3 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
1322     z4 = DEQUANTIZE(inptr[DCTSIZE * 7], quantptr[DCTSIZE * 7]);
1323 
1324     tmp11 = z1 + z2;
1325     tmp14 = MULTIPLY(tmp11 + z3 + z4, FIX(0.398430003)); /* c9 */
1326     tmp11 = MULTIPLY(tmp11, FIX(0.887983902));           /* c3-c9 */
1327     tmp12 = MULTIPLY(z1 + z3, FIX(0.670361295));         /* c5-c9 */
1328     tmp13 = tmp14 + MULTIPLY(z1 + z4, FIX(0.366151574)); /* c7-c9 */
1329     tmp10 = tmp11 + tmp12 + tmp13 -
1330             MULTIPLY(z1, FIX(0.923107866));              /* c7+c5+c3-c1-2*c9 */
1331     z1    = tmp14 - MULTIPLY(z2 + z3, FIX(1.163011579)); /* c7+c9 */
1332     tmp11 += z1 + MULTIPLY(z2, FIX(2.073276588));        /* c1+c7+3*c9-c3 */
1333     tmp12 += z1 - MULTIPLY(z3, FIX(1.192193623));        /* c3+c5-c7-c9 */
1334     z1    = MULTIPLY(z2 + z4, -FIX(1.798248910));        /* -(c1+c9) */
1335     tmp11 += z1;
1336     tmp13 += z1 + MULTIPLY(z4, FIX(2.102458632));        /* c1+c5+c9-c7 */
1337     tmp14 += MULTIPLY(z2, -FIX(1.467221301)) +           /* -(c5+c9) */
1338              MULTIPLY(z3, FIX(1.001388905)) -            /* c1-c9 */
1339              MULTIPLY(z4, FIX(1.684843907));             /* c3+c9 */
1340 
1341     /* Final output stage */
1342 
1343     wsptr[8 * 0]  = (int)RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS - PASS1_BITS);
1344     wsptr[8 * 10] = (int)RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS - PASS1_BITS);
1345     wsptr[8 * 1]  = (int)RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS - PASS1_BITS);
1346     wsptr[8 * 9]  = (int)RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS - PASS1_BITS);
1347     wsptr[8 * 2]  = (int)RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS - PASS1_BITS);
1348     wsptr[8 * 8]  = (int)RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS - PASS1_BITS);
1349     wsptr[8 * 3]  = (int)RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS - PASS1_BITS);
1350     wsptr[8 * 7]  = (int)RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS - PASS1_BITS);
1351     wsptr[8 * 4]  = (int)RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS - PASS1_BITS);
1352     wsptr[8 * 6]  = (int)RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS - PASS1_BITS);
1353     wsptr[8 * 5]  = (int)RIGHT_SHIFT(tmp25, CONST_BITS - PASS1_BITS);
1354   }
1355 
1356   /* Pass 2: process 11 rows from work array, store into output array. */
1357 
1358   wsptr = workspace;
1359   for (ctr = 0; ctr < 11; ctr++) {
1360     outptr = output_buf[ctr] + output_col;
1361 
1362     /* Even part */
1363 
1364     /* Add fudge factor here for final descale. */
1365     tmp10 = (JLONG)wsptr[0] + (ONE << (PASS1_BITS + 2));
1366     tmp10 = LEFT_SHIFT(tmp10, CONST_BITS);
1367 
1368     z1 = (JLONG)wsptr[2];
1369     z2 = (JLONG)wsptr[4];
1370     z3 = (JLONG)wsptr[6];
1371 
1372     tmp20 = MULTIPLY(z2 - z3, FIX(2.546640132));     /* c2+c4 */
1373     tmp23 = MULTIPLY(z2 - z1, FIX(0.430815045));     /* c2-c6 */
1374     z4 = z1 + z3;
1375     tmp24 = MULTIPLY(z4, -FIX(1.155664402));         /* -(c2-c10) */
1376     z4 -= z2;
1377     tmp25 = tmp10 + MULTIPLY(z4, FIX(1.356927976));  /* c2 */
1378     tmp21 = tmp20 + tmp23 + tmp25 -
1379             MULTIPLY(z2, FIX(1.821790775));          /* c2+c4+c10-c6 */
1380     tmp20 += tmp25 + MULTIPLY(z3, FIX(2.115825087)); /* c4+c6 */
1381     tmp23 += tmp25 - MULTIPLY(z1, FIX(1.513598477)); /* c6+c8 */
1382     tmp24 += tmp25;
1383     tmp22 = tmp24 - MULTIPLY(z3, FIX(0.788749120));  /* c8+c10 */
1384     tmp24 += MULTIPLY(z2, FIX(1.944413522)) -        /* c2+c8 */
1385              MULTIPLY(z1, FIX(1.390975730));         /* c4+c10 */
1386     tmp25 = tmp10 - MULTIPLY(z4, FIX(1.414213562));  /* c0 */
1387 
1388     /* Odd part */
1389 
1390     z1 = (JLONG)wsptr[1];
1391     z2 = (JLONG)wsptr[3];
1392     z3 = (JLONG)wsptr[5];
1393     z4 = (JLONG)wsptr[7];
1394 
1395     tmp11 = z1 + z2;
1396     tmp14 = MULTIPLY(tmp11 + z3 + z4, FIX(0.398430003)); /* c9 */
1397     tmp11 = MULTIPLY(tmp11, FIX(0.887983902));           /* c3-c9 */
1398     tmp12 = MULTIPLY(z1 + z3, FIX(0.670361295));         /* c5-c9 */
1399     tmp13 = tmp14 + MULTIPLY(z1 + z4, FIX(0.366151574)); /* c7-c9 */
1400     tmp10 = tmp11 + tmp12 + tmp13 -
1401             MULTIPLY(z1, FIX(0.923107866));              /* c7+c5+c3-c1-2*c9 */
1402     z1    = tmp14 - MULTIPLY(z2 + z3, FIX(1.163011579)); /* c7+c9 */
1403     tmp11 += z1 + MULTIPLY(z2, FIX(2.073276588));        /* c1+c7+3*c9-c3 */
1404     tmp12 += z1 - MULTIPLY(z3, FIX(1.192193623));        /* c3+c5-c7-c9 */
1405     z1    = MULTIPLY(z2 + z4, -FIX(1.798248910));        /* -(c1+c9) */
1406     tmp11 += z1;
1407     tmp13 += z1 + MULTIPLY(z4, FIX(2.102458632));        /* c1+c5+c9-c7 */
1408     tmp14 += MULTIPLY(z2, -FIX(1.467221301)) +           /* -(c5+c9) */
1409              MULTIPLY(z3, FIX(1.001388905)) -            /* c1-c9 */
1410              MULTIPLY(z4, FIX(1.684843907));             /* c3+c9 */
1411 
1412     /* Final output stage */
1413 
1414     outptr[0]  = range_limit[(int)RIGHT_SHIFT(tmp20 + tmp10,
1415                                               CONST_BITS + PASS1_BITS + 3) &
1416                              RANGE_MASK];
1417     outptr[10] = range_limit[(int)RIGHT_SHIFT(tmp20 - tmp10,
1418                                               CONST_BITS + PASS1_BITS + 3) &
1419                              RANGE_MASK];
1420     outptr[1]  = range_limit[(int)RIGHT_SHIFT(tmp21 + tmp11,
1421                                               CONST_BITS + PASS1_BITS + 3) &
1422                              RANGE_MASK];
1423     outptr[9]  = range_limit[(int)RIGHT_SHIFT(tmp21 - tmp11,
1424                                               CONST_BITS + PASS1_BITS + 3) &
1425                              RANGE_MASK];
1426     outptr[2]  = range_limit[(int)RIGHT_SHIFT(tmp22 + tmp12,
1427                                               CONST_BITS + PASS1_BITS + 3) &
1428                              RANGE_MASK];
1429     outptr[8]  = range_limit[(int)RIGHT_SHIFT(tmp22 - tmp12,
1430                                               CONST_BITS + PASS1_BITS + 3) &
1431                              RANGE_MASK];
1432     outptr[3]  = range_limit[(int)RIGHT_SHIFT(tmp23 + tmp13,
1433                                               CONST_BITS + PASS1_BITS + 3) &
1434                              RANGE_MASK];
1435     outptr[7]  = range_limit[(int)RIGHT_SHIFT(tmp23 - tmp13,
1436                                               CONST_BITS + PASS1_BITS + 3) &
1437                              RANGE_MASK];
1438     outptr[4]  = range_limit[(int)RIGHT_SHIFT(tmp24 + tmp14,
1439                                               CONST_BITS + PASS1_BITS + 3) &
1440                              RANGE_MASK];
1441     outptr[6]  = range_limit[(int)RIGHT_SHIFT(tmp24 - tmp14,
1442                                               CONST_BITS + PASS1_BITS + 3) &
1443                              RANGE_MASK];
1444     outptr[5]  = range_limit[(int)RIGHT_SHIFT(tmp25,
1445                                               CONST_BITS + PASS1_BITS + 3) &
1446                              RANGE_MASK];
1447 
1448     wsptr += 8;         /* advance pointer to next row */
1449   }
1450 }
1451 
1452 
1453 /*
1454  * Perform dequantization and inverse DCT on one block of coefficients,
1455  * producing a 12x12 output block.
1456  *
1457  * Optimized algorithm with 15 multiplications in the 1-D kernel.
1458  * cK represents sqrt(2) * cos(K*pi/24).
1459  */
1460 
1461 GLOBAL(void)
jpeg_idct_12x12(j_decompress_ptr cinfo,jpeg_component_info * compptr,JCOEFPTR coef_block,JSAMPARRAY output_buf,JDIMENSION output_col)1462 jpeg_idct_12x12(j_decompress_ptr cinfo, jpeg_component_info *compptr,
1463                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
1464                 JDIMENSION output_col)
1465 {
1466   JLONG tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
1467   JLONG tmp20, tmp21, tmp22, tmp23, tmp24, tmp25;
1468   JLONG z1, z2, z3, z4;
1469   JCOEFPTR inptr;
1470   ISLOW_MULT_TYPE *quantptr;
1471   int *wsptr;
1472   JSAMPROW outptr;
1473   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
1474   int ctr;
1475   int workspace[8 * 12];        /* buffers data between passes */
1476   SHIFT_TEMPS
1477 
1478   /* Pass 1: process columns from input, store into work array. */
1479 
1480   inptr = coef_block;
1481   quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
1482   wsptr = workspace;
1483   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
1484     /* Even part */
1485 
1486     z3 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
1487     z3 = LEFT_SHIFT(z3, CONST_BITS);
1488     /* Add fudge factor here for final descale. */
1489     z3 += ONE << (CONST_BITS - PASS1_BITS - 1);
1490 
1491     z4 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
1492     z4 = MULTIPLY(z4, FIX(1.224744871)); /* c4 */
1493 
1494     tmp10 = z3 + z4;
1495     tmp11 = z3 - z4;
1496 
1497     z1 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
1498     z4 = MULTIPLY(z1, FIX(1.366025404)); /* c2 */
1499     z1 = LEFT_SHIFT(z1, CONST_BITS);
1500     z2 = DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6]);
1501     z2 = LEFT_SHIFT(z2, CONST_BITS);
1502 
1503     tmp12 = z1 - z2;
1504 
1505     tmp21 = z3 + tmp12;
1506     tmp24 = z3 - tmp12;
1507 
1508     tmp12 = z4 + z2;
1509 
1510     tmp20 = tmp10 + tmp12;
1511     tmp25 = tmp10 - tmp12;
1512 
1513     tmp12 = z4 - z1 - z2;
1514 
1515     tmp22 = tmp11 + tmp12;
1516     tmp23 = tmp11 - tmp12;
1517 
1518     /* Odd part */
1519 
1520     z1 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
1521     z2 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
1522     z3 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
1523     z4 = DEQUANTIZE(inptr[DCTSIZE * 7], quantptr[DCTSIZE * 7]);
1524 
1525     tmp11 = MULTIPLY(z2, FIX(1.306562965));                  /* c3 */
1526     tmp14 = MULTIPLY(z2, -FIX_0_541196100);                  /* -c9 */
1527 
1528     tmp10 = z1 + z3;
1529     tmp15 = MULTIPLY(tmp10 + z4, FIX(0.860918669));          /* c7 */
1530     tmp12 = tmp15 + MULTIPLY(tmp10, FIX(0.261052384));       /* c5-c7 */
1531     tmp10 = tmp12 + tmp11 + MULTIPLY(z1, FIX(0.280143716));  /* c1-c5 */
1532     tmp13 = MULTIPLY(z3 + z4, -FIX(1.045510580));            /* -(c7+c11) */
1533     tmp12 += tmp13 + tmp14 - MULTIPLY(z3, FIX(1.478575242)); /* c1+c5-c7-c11 */
1534     tmp13 += tmp15 - tmp11 + MULTIPLY(z4, FIX(1.586706681)); /* c1+c11 */
1535     tmp15 += tmp14 - MULTIPLY(z1, FIX(0.676326758)) -        /* c7-c11 */
1536              MULTIPLY(z4, FIX(1.982889723));                 /* c5+c7 */
1537 
1538     z1 -= z4;
1539     z2 -= z3;
1540     z3 = MULTIPLY(z1 + z2, FIX_0_541196100);                 /* c9 */
1541     tmp11 = z3 + MULTIPLY(z1, FIX_0_765366865);              /* c3-c9 */
1542     tmp14 = z3 - MULTIPLY(z2, FIX_1_847759065);              /* c3+c9 */
1543 
1544     /* Final output stage */
1545 
1546     wsptr[8 * 0]  = (int)RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS - PASS1_BITS);
1547     wsptr[8 * 11] = (int)RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS - PASS1_BITS);
1548     wsptr[8 * 1]  = (int)RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS - PASS1_BITS);
1549     wsptr[8 * 10] = (int)RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS - PASS1_BITS);
1550     wsptr[8 * 2]  = (int)RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS - PASS1_BITS);
1551     wsptr[8 * 9]  = (int)RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS - PASS1_BITS);
1552     wsptr[8 * 3]  = (int)RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS - PASS1_BITS);
1553     wsptr[8 * 8]  = (int)RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS - PASS1_BITS);
1554     wsptr[8 * 4]  = (int)RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS - PASS1_BITS);
1555     wsptr[8 * 7]  = (int)RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS - PASS1_BITS);
1556     wsptr[8 * 5]  = (int)RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS - PASS1_BITS);
1557     wsptr[8 * 6]  = (int)RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS - PASS1_BITS);
1558   }
1559 
1560   /* Pass 2: process 12 rows from work array, store into output array. */
1561 
1562   wsptr = workspace;
1563   for (ctr = 0; ctr < 12; ctr++) {
1564     outptr = output_buf[ctr] + output_col;
1565 
1566     /* Even part */
1567 
1568     /* Add fudge factor here for final descale. */
1569     z3 = (JLONG)wsptr[0] + (ONE << (PASS1_BITS + 2));
1570     z3 = LEFT_SHIFT(z3, CONST_BITS);
1571 
1572     z4 = (JLONG)wsptr[4];
1573     z4 = MULTIPLY(z4, FIX(1.224744871)); /* c4 */
1574 
1575     tmp10 = z3 + z4;
1576     tmp11 = z3 - z4;
1577 
1578     z1 = (JLONG)wsptr[2];
1579     z4 = MULTIPLY(z1, FIX(1.366025404)); /* c2 */
1580     z1 = LEFT_SHIFT(z1, CONST_BITS);
1581     z2 = (JLONG)wsptr[6];
1582     z2 = LEFT_SHIFT(z2, CONST_BITS);
1583 
1584     tmp12 = z1 - z2;
1585 
1586     tmp21 = z3 + tmp12;
1587     tmp24 = z3 - tmp12;
1588 
1589     tmp12 = z4 + z2;
1590 
1591     tmp20 = tmp10 + tmp12;
1592     tmp25 = tmp10 - tmp12;
1593 
1594     tmp12 = z4 - z1 - z2;
1595 
1596     tmp22 = tmp11 + tmp12;
1597     tmp23 = tmp11 - tmp12;
1598 
1599     /* Odd part */
1600 
1601     z1 = (JLONG)wsptr[1];
1602     z2 = (JLONG)wsptr[3];
1603     z3 = (JLONG)wsptr[5];
1604     z4 = (JLONG)wsptr[7];
1605 
1606     tmp11 = MULTIPLY(z2, FIX(1.306562965));                  /* c3 */
1607     tmp14 = MULTIPLY(z2, -FIX_0_541196100);                  /* -c9 */
1608 
1609     tmp10 = z1 + z3;
1610     tmp15 = MULTIPLY(tmp10 + z4, FIX(0.860918669));          /* c7 */
1611     tmp12 = tmp15 + MULTIPLY(tmp10, FIX(0.261052384));       /* c5-c7 */
1612     tmp10 = tmp12 + tmp11 + MULTIPLY(z1, FIX(0.280143716));  /* c1-c5 */
1613     tmp13 = MULTIPLY(z3 + z4, -FIX(1.045510580));            /* -(c7+c11) */
1614     tmp12 += tmp13 + tmp14 - MULTIPLY(z3, FIX(1.478575242)); /* c1+c5-c7-c11 */
1615     tmp13 += tmp15 - tmp11 + MULTIPLY(z4, FIX(1.586706681)); /* c1+c11 */
1616     tmp15 += tmp14 - MULTIPLY(z1, FIX(0.676326758)) -        /* c7-c11 */
1617              MULTIPLY(z4, FIX(1.982889723));                 /* c5+c7 */
1618 
1619     z1 -= z4;
1620     z2 -= z3;
1621     z3 = MULTIPLY(z1 + z2, FIX_0_541196100);                 /* c9 */
1622     tmp11 = z3 + MULTIPLY(z1, FIX_0_765366865);              /* c3-c9 */
1623     tmp14 = z3 - MULTIPLY(z2, FIX_1_847759065);              /* c3+c9 */
1624 
1625     /* Final output stage */
1626 
1627     outptr[0]  = range_limit[(int)RIGHT_SHIFT(tmp20 + tmp10,
1628                                               CONST_BITS + PASS1_BITS + 3) &
1629                              RANGE_MASK];
1630     outptr[11] = range_limit[(int)RIGHT_SHIFT(tmp20 - tmp10,
1631                                               CONST_BITS + PASS1_BITS + 3) &
1632                              RANGE_MASK];
1633     outptr[1]  = range_limit[(int)RIGHT_SHIFT(tmp21 + tmp11,
1634                                               CONST_BITS + PASS1_BITS + 3) &
1635                              RANGE_MASK];
1636     outptr[10] = range_limit[(int)RIGHT_SHIFT(tmp21 - tmp11,
1637                                               CONST_BITS + PASS1_BITS + 3) &
1638                              RANGE_MASK];
1639     outptr[2]  = range_limit[(int)RIGHT_SHIFT(tmp22 + tmp12,
1640                                               CONST_BITS + PASS1_BITS + 3) &
1641                              RANGE_MASK];
1642     outptr[9]  = range_limit[(int)RIGHT_SHIFT(tmp22 - tmp12,
1643                                               CONST_BITS + PASS1_BITS + 3) &
1644                              RANGE_MASK];
1645     outptr[3]  = range_limit[(int)RIGHT_SHIFT(tmp23 + tmp13,
1646                                               CONST_BITS + PASS1_BITS + 3) &
1647                              RANGE_MASK];
1648     outptr[8]  = range_limit[(int)RIGHT_SHIFT(tmp23 - tmp13,
1649                                               CONST_BITS + PASS1_BITS + 3) &
1650                              RANGE_MASK];
1651     outptr[4]  = range_limit[(int)RIGHT_SHIFT(tmp24 + tmp14,
1652                                               CONST_BITS + PASS1_BITS + 3) &
1653                              RANGE_MASK];
1654     outptr[7]  = range_limit[(int)RIGHT_SHIFT(tmp24 - tmp14,
1655                                               CONST_BITS + PASS1_BITS + 3) &
1656                              RANGE_MASK];
1657     outptr[5]  = range_limit[(int)RIGHT_SHIFT(tmp25 + tmp15,
1658                                               CONST_BITS + PASS1_BITS + 3) &
1659                              RANGE_MASK];
1660     outptr[6]  = range_limit[(int)RIGHT_SHIFT(tmp25 - tmp15,
1661                                               CONST_BITS + PASS1_BITS + 3) &
1662                              RANGE_MASK];
1663 
1664     wsptr += 8;         /* advance pointer to next row */
1665   }
1666 }
1667 
1668 
1669 /*
1670  * Perform dequantization and inverse DCT on one block of coefficients,
1671  * producing a 13x13 output block.
1672  *
1673  * Optimized algorithm with 29 multiplications in the 1-D kernel.
1674  * cK represents sqrt(2) * cos(K*pi/26).
1675  */
1676 
1677 GLOBAL(void)
jpeg_idct_13x13(j_decompress_ptr cinfo,jpeg_component_info * compptr,JCOEFPTR coef_block,JSAMPARRAY output_buf,JDIMENSION output_col)1678 jpeg_idct_13x13(j_decompress_ptr cinfo, jpeg_component_info *compptr,
1679                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
1680                 JDIMENSION output_col)
1681 {
1682   JLONG tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
1683   JLONG tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26;
1684   JLONG z1, z2, z3, z4;
1685   JCOEFPTR inptr;
1686   ISLOW_MULT_TYPE *quantptr;
1687   int *wsptr;
1688   JSAMPROW outptr;
1689   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
1690   int ctr;
1691   int workspace[8 * 13];        /* buffers data between passes */
1692   SHIFT_TEMPS
1693 
1694   /* Pass 1: process columns from input, store into work array. */
1695 
1696   inptr = coef_block;
1697   quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
1698   wsptr = workspace;
1699   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
1700     /* Even part */
1701 
1702     z1 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
1703     z1 = LEFT_SHIFT(z1, CONST_BITS);
1704     /* Add fudge factor here for final descale. */
1705     z1 += ONE << (CONST_BITS - PASS1_BITS - 1);
1706 
1707     z2 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
1708     z3 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
1709     z4 = DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6]);
1710 
1711     tmp10 = z3 + z4;
1712     tmp11 = z3 - z4;
1713 
1714     tmp12 = MULTIPLY(tmp10, FIX(1.155388986));                /* (c4+c6)/2 */
1715     tmp13 = MULTIPLY(tmp11, FIX(0.096834934)) + z1;           /* (c4-c6)/2 */
1716 
1717     tmp20 = MULTIPLY(z2, FIX(1.373119086)) + tmp12 + tmp13;   /* c2 */
1718     tmp22 = MULTIPLY(z2, FIX(0.501487041)) - tmp12 + tmp13;   /* c10 */
1719 
1720     tmp12 = MULTIPLY(tmp10, FIX(0.316450131));                /* (c8-c12)/2 */
1721     tmp13 = MULTIPLY(tmp11, FIX(0.486914739)) + z1;           /* (c8+c12)/2 */
1722 
1723     tmp21 = MULTIPLY(z2, FIX(1.058554052)) - tmp12 + tmp13;   /* c6 */
1724     tmp25 = MULTIPLY(z2, -FIX(1.252223920)) + tmp12 + tmp13;  /* c4 */
1725 
1726     tmp12 = MULTIPLY(tmp10, FIX(0.435816023));                /* (c2-c10)/2 */
1727     tmp13 = MULTIPLY(tmp11, FIX(0.937303064)) - z1;           /* (c2+c10)/2 */
1728 
1729     tmp23 = MULTIPLY(z2, -FIX(0.170464608)) - tmp12 - tmp13;  /* c12 */
1730     tmp24 = MULTIPLY(z2, -FIX(0.803364869)) + tmp12 - tmp13;  /* c8 */
1731 
1732     tmp26 = MULTIPLY(tmp11 - z2, FIX(1.414213562)) + z1;      /* c0 */
1733 
1734     /* Odd part */
1735 
1736     z1 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
1737     z2 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
1738     z3 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
1739     z4 = DEQUANTIZE(inptr[DCTSIZE * 7], quantptr[DCTSIZE * 7]);
1740 
1741     tmp11 = MULTIPLY(z1 + z2, FIX(1.322312651));     /* c3 */
1742     tmp12 = MULTIPLY(z1 + z3, FIX(1.163874945));     /* c5 */
1743     tmp15 = z1 + z4;
1744     tmp13 = MULTIPLY(tmp15, FIX(0.937797057));       /* c7 */
1745     tmp10 = tmp11 + tmp12 + tmp13 -
1746             MULTIPLY(z1, FIX(2.020082300));          /* c7+c5+c3-c1 */
1747     tmp14 = MULTIPLY(z2 + z3, -FIX(0.338443458));    /* -c11 */
1748     tmp11 += tmp14 + MULTIPLY(z2, FIX(0.837223564)); /* c5+c9+c11-c3 */
1749     tmp12 += tmp14 - MULTIPLY(z3, FIX(1.572116027)); /* c1+c5-c9-c11 */
1750     tmp14 = MULTIPLY(z2 + z4, -FIX(1.163874945));    /* -c5 */
1751     tmp11 += tmp14;
1752     tmp13 += tmp14 + MULTIPLY(z4, FIX(2.205608352)); /* c3+c5+c9-c7 */
1753     tmp14 = MULTIPLY(z3 + z4, -FIX(0.657217813));    /* -c9 */
1754     tmp12 += tmp14;
1755     tmp13 += tmp14;
1756     tmp15 = MULTIPLY(tmp15, FIX(0.338443458));       /* c11 */
1757     tmp14 = tmp15 + MULTIPLY(z1, FIX(0.318774355)) - /* c9-c11 */
1758             MULTIPLY(z2, FIX(0.466105296));          /* c1-c7 */
1759     z1    = MULTIPLY(z3 - z2, FIX(0.937797057));     /* c7 */
1760     tmp14 += z1;
1761     tmp15 += z1 + MULTIPLY(z3, FIX(0.384515595)) -   /* c3-c7 */
1762              MULTIPLY(z4, FIX(1.742345811));         /* c1+c11 */
1763 
1764     /* Final output stage */
1765 
1766     wsptr[8 * 0]  = (int)RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS - PASS1_BITS);
1767     wsptr[8 * 12] = (int)RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS - PASS1_BITS);
1768     wsptr[8 * 1]  = (int)RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS - PASS1_BITS);
1769     wsptr[8 * 11] = (int)RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS - PASS1_BITS);
1770     wsptr[8 * 2]  = (int)RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS - PASS1_BITS);
1771     wsptr[8 * 10] = (int)RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS - PASS1_BITS);
1772     wsptr[8 * 3]  = (int)RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS - PASS1_BITS);
1773     wsptr[8 * 9]  = (int)RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS - PASS1_BITS);
1774     wsptr[8 * 4]  = (int)RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS - PASS1_BITS);
1775     wsptr[8 * 8]  = (int)RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS - PASS1_BITS);
1776     wsptr[8 * 5]  = (int)RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS - PASS1_BITS);
1777     wsptr[8 * 7]  = (int)RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS - PASS1_BITS);
1778     wsptr[8 * 6]  = (int)RIGHT_SHIFT(tmp26, CONST_BITS - PASS1_BITS);
1779   }
1780 
1781   /* Pass 2: process 13 rows from work array, store into output array. */
1782 
1783   wsptr = workspace;
1784   for (ctr = 0; ctr < 13; ctr++) {
1785     outptr = output_buf[ctr] + output_col;
1786 
1787     /* Even part */
1788 
1789     /* Add fudge factor here for final descale. */
1790     z1 = (JLONG)wsptr[0] + (ONE << (PASS1_BITS + 2));
1791     z1 = LEFT_SHIFT(z1, CONST_BITS);
1792 
1793     z2 = (JLONG)wsptr[2];
1794     z3 = (JLONG)wsptr[4];
1795     z4 = (JLONG)wsptr[6];
1796 
1797     tmp10 = z3 + z4;
1798     tmp11 = z3 - z4;
1799 
1800     tmp12 = MULTIPLY(tmp10, FIX(1.155388986));                /* (c4+c6)/2 */
1801     tmp13 = MULTIPLY(tmp11, FIX(0.096834934)) + z1;           /* (c4-c6)/2 */
1802 
1803     tmp20 = MULTIPLY(z2, FIX(1.373119086)) + tmp12 + tmp13;   /* c2 */
1804     tmp22 = MULTIPLY(z2, FIX(0.501487041)) - tmp12 + tmp13;   /* c10 */
1805 
1806     tmp12 = MULTIPLY(tmp10, FIX(0.316450131));                /* (c8-c12)/2 */
1807     tmp13 = MULTIPLY(tmp11, FIX(0.486914739)) + z1;           /* (c8+c12)/2 */
1808 
1809     tmp21 = MULTIPLY(z2, FIX(1.058554052)) - tmp12 + tmp13;   /* c6 */
1810     tmp25 = MULTIPLY(z2, -FIX(1.252223920)) + tmp12 + tmp13;  /* c4 */
1811 
1812     tmp12 = MULTIPLY(tmp10, FIX(0.435816023));                /* (c2-c10)/2 */
1813     tmp13 = MULTIPLY(tmp11, FIX(0.937303064)) - z1;           /* (c2+c10)/2 */
1814 
1815     tmp23 = MULTIPLY(z2, -FIX(0.170464608)) - tmp12 - tmp13;  /* c12 */
1816     tmp24 = MULTIPLY(z2, -FIX(0.803364869)) + tmp12 - tmp13;  /* c8 */
1817 
1818     tmp26 = MULTIPLY(tmp11 - z2, FIX(1.414213562)) + z1;      /* c0 */
1819 
1820     /* Odd part */
1821 
1822     z1 = (JLONG)wsptr[1];
1823     z2 = (JLONG)wsptr[3];
1824     z3 = (JLONG)wsptr[5];
1825     z4 = (JLONG)wsptr[7];
1826 
1827     tmp11 = MULTIPLY(z1 + z2, FIX(1.322312651));     /* c3 */
1828     tmp12 = MULTIPLY(z1 + z3, FIX(1.163874945));     /* c5 */
1829     tmp15 = z1 + z4;
1830     tmp13 = MULTIPLY(tmp15, FIX(0.937797057));       /* c7 */
1831     tmp10 = tmp11 + tmp12 + tmp13 -
1832             MULTIPLY(z1, FIX(2.020082300));          /* c7+c5+c3-c1 */
1833     tmp14 = MULTIPLY(z2 + z3, -FIX(0.338443458));    /* -c11 */
1834     tmp11 += tmp14 + MULTIPLY(z2, FIX(0.837223564)); /* c5+c9+c11-c3 */
1835     tmp12 += tmp14 - MULTIPLY(z3, FIX(1.572116027)); /* c1+c5-c9-c11 */
1836     tmp14 = MULTIPLY(z2 + z4, -FIX(1.163874945));    /* -c5 */
1837     tmp11 += tmp14;
1838     tmp13 += tmp14 + MULTIPLY(z4, FIX(2.205608352)); /* c3+c5+c9-c7 */
1839     tmp14 = MULTIPLY(z3 + z4, -FIX(0.657217813));    /* -c9 */
1840     tmp12 += tmp14;
1841     tmp13 += tmp14;
1842     tmp15 = MULTIPLY(tmp15, FIX(0.338443458));       /* c11 */
1843     tmp14 = tmp15 + MULTIPLY(z1, FIX(0.318774355)) - /* c9-c11 */
1844             MULTIPLY(z2, FIX(0.466105296));          /* c1-c7 */
1845     z1    = MULTIPLY(z3 - z2, FIX(0.937797057));     /* c7 */
1846     tmp14 += z1;
1847     tmp15 += z1 + MULTIPLY(z3, FIX(0.384515595)) -   /* c3-c7 */
1848              MULTIPLY(z4, FIX(1.742345811));         /* c1+c11 */
1849 
1850     /* Final output stage */
1851 
1852     outptr[0]  = range_limit[(int)RIGHT_SHIFT(tmp20 + tmp10,
1853                                               CONST_BITS + PASS1_BITS + 3) &
1854                              RANGE_MASK];
1855     outptr[12] = range_limit[(int)RIGHT_SHIFT(tmp20 - tmp10,
1856                                               CONST_BITS + PASS1_BITS + 3) &
1857                              RANGE_MASK];
1858     outptr[1]  = range_limit[(int)RIGHT_SHIFT(tmp21 + tmp11,
1859                                               CONST_BITS + PASS1_BITS + 3) &
1860                              RANGE_MASK];
1861     outptr[11] = range_limit[(int)RIGHT_SHIFT(tmp21 - tmp11,
1862                                               CONST_BITS + PASS1_BITS + 3) &
1863                              RANGE_MASK];
1864     outptr[2]  = range_limit[(int)RIGHT_SHIFT(tmp22 + tmp12,
1865                                               CONST_BITS + PASS1_BITS + 3) &
1866                              RANGE_MASK];
1867     outptr[10] = range_limit[(int)RIGHT_SHIFT(tmp22 - tmp12,
1868                                               CONST_BITS + PASS1_BITS + 3) &
1869                              RANGE_MASK];
1870     outptr[3]  = range_limit[(int)RIGHT_SHIFT(tmp23 + tmp13,
1871                                               CONST_BITS + PASS1_BITS + 3) &
1872                              RANGE_MASK];
1873     outptr[9]  = range_limit[(int)RIGHT_SHIFT(tmp23 - tmp13,
1874                                               CONST_BITS + PASS1_BITS + 3) &
1875                              RANGE_MASK];
1876     outptr[4]  = range_limit[(int)RIGHT_SHIFT(tmp24 + tmp14,
1877                                               CONST_BITS + PASS1_BITS + 3) &
1878                              RANGE_MASK];
1879     outptr[8]  = range_limit[(int)RIGHT_SHIFT(tmp24 - tmp14,
1880                                               CONST_BITS + PASS1_BITS + 3) &
1881                              RANGE_MASK];
1882     outptr[5]  = range_limit[(int)RIGHT_SHIFT(tmp25 + tmp15,
1883                                               CONST_BITS + PASS1_BITS + 3) &
1884                              RANGE_MASK];
1885     outptr[7]  = range_limit[(int)RIGHT_SHIFT(tmp25 - tmp15,
1886                                               CONST_BITS + PASS1_BITS + 3) &
1887                              RANGE_MASK];
1888     outptr[6]  = range_limit[(int)RIGHT_SHIFT(tmp26,
1889                                               CONST_BITS + PASS1_BITS + 3) &
1890                              RANGE_MASK];
1891 
1892     wsptr += 8;         /* advance pointer to next row */
1893   }
1894 }
1895 
1896 
1897 /*
1898  * Perform dequantization and inverse DCT on one block of coefficients,
1899  * producing a 14x14 output block.
1900  *
1901  * Optimized algorithm with 20 multiplications in the 1-D kernel.
1902  * cK represents sqrt(2) * cos(K*pi/28).
1903  */
1904 
1905 GLOBAL(void)
jpeg_idct_14x14(j_decompress_ptr cinfo,jpeg_component_info * compptr,JCOEFPTR coef_block,JSAMPARRAY output_buf,JDIMENSION output_col)1906 jpeg_idct_14x14(j_decompress_ptr cinfo, jpeg_component_info *compptr,
1907                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
1908                 JDIMENSION output_col)
1909 {
1910   JLONG tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
1911   JLONG tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26;
1912   JLONG z1, z2, z3, z4;
1913   JCOEFPTR inptr;
1914   ISLOW_MULT_TYPE *quantptr;
1915   int *wsptr;
1916   JSAMPROW outptr;
1917   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
1918   int ctr;
1919   int workspace[8 * 14];        /* buffers data between passes */
1920   SHIFT_TEMPS
1921 
1922   /* Pass 1: process columns from input, store into work array. */
1923 
1924   inptr = coef_block;
1925   quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
1926   wsptr = workspace;
1927   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
1928     /* Even part */
1929 
1930     z1 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
1931     z1 = LEFT_SHIFT(z1, CONST_BITS);
1932     /* Add fudge factor here for final descale. */
1933     z1 += ONE << (CONST_BITS - PASS1_BITS - 1);
1934     z4 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
1935     z2 = MULTIPLY(z4, FIX(1.274162392));         /* c4 */
1936     z3 = MULTIPLY(z4, FIX(0.314692123));         /* c12 */
1937     z4 = MULTIPLY(z4, FIX(0.881747734));         /* c8 */
1938 
1939     tmp10 = z1 + z2;
1940     tmp11 = z1 + z3;
1941     tmp12 = z1 - z4;
1942 
1943     tmp23 = RIGHT_SHIFT(z1 - LEFT_SHIFT(z2 + z3 - z4, 1),
1944                         CONST_BITS - PASS1_BITS); /* c0 = (c4+c12-c8)*2 */
1945 
1946     z1 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
1947     z2 = DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6]);
1948 
1949     z3 = MULTIPLY(z1 + z2, FIX(1.105676686));    /* c6 */
1950 
1951     tmp13 = z3 + MULTIPLY(z1, FIX(0.273079590)); /* c2-c6 */
1952     tmp14 = z3 - MULTIPLY(z2, FIX(1.719280954)); /* c6+c10 */
1953     tmp15 = MULTIPLY(z1, FIX(0.613604268)) -     /* c10 */
1954             MULTIPLY(z2, FIX(1.378756276));      /* c2 */
1955 
1956     tmp20 = tmp10 + tmp13;
1957     tmp26 = tmp10 - tmp13;
1958     tmp21 = tmp11 + tmp14;
1959     tmp25 = tmp11 - tmp14;
1960     tmp22 = tmp12 + tmp15;
1961     tmp24 = tmp12 - tmp15;
1962 
1963     /* Odd part */
1964 
1965     z1 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
1966     z2 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
1967     z3 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
1968     z4 = DEQUANTIZE(inptr[DCTSIZE * 7], quantptr[DCTSIZE * 7]);
1969     tmp13 = LEFT_SHIFT(z4, CONST_BITS);
1970 
1971     tmp14 = z1 + z3;
1972     tmp11 = MULTIPLY(z1 + z2, FIX(1.334852607));           /* c3 */
1973     tmp12 = MULTIPLY(tmp14, FIX(1.197448846));             /* c5 */
1974     tmp10 = tmp11 + tmp12 + tmp13 - MULTIPLY(z1, FIX(1.126980169)); /* c3+c5-c1 */
1975     tmp14 = MULTIPLY(tmp14, FIX(0.752406978));             /* c9 */
1976     tmp16 = tmp14 - MULTIPLY(z1, FIX(1.061150426));        /* c9+c11-c13 */
1977     z1    -= z2;
1978     tmp15 = MULTIPLY(z1, FIX(0.467085129)) - tmp13;        /* c11 */
1979     tmp16 += tmp15;
1980     z1    += z4;
1981     z4    = MULTIPLY(z2 + z3, -FIX(0.158341681)) - tmp13;  /* -c13 */
1982     tmp11 += z4 - MULTIPLY(z2, FIX(0.424103948));          /* c3-c9-c13 */
1983     tmp12 += z4 - MULTIPLY(z3, FIX(2.373959773));          /* c3+c5-c13 */
1984     z4    = MULTIPLY(z3 - z2, FIX(1.405321284));           /* c1 */
1985     tmp14 += z4 + tmp13 - MULTIPLY(z3, FIX(1.6906431334)); /* c1+c9-c11 */
1986     tmp15 += z4 + MULTIPLY(z2, FIX(0.674957567));          /* c1+c11-c5 */
1987 
1988     tmp13 = LEFT_SHIFT(z1 - z3, PASS1_BITS);
1989 
1990     /* Final output stage */
1991 
1992     wsptr[8 * 0]  = (int)RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS - PASS1_BITS);
1993     wsptr[8 * 13] = (int)RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS - PASS1_BITS);
1994     wsptr[8 * 1]  = (int)RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS - PASS1_BITS);
1995     wsptr[8 * 12] = (int)RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS - PASS1_BITS);
1996     wsptr[8 * 2]  = (int)RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS - PASS1_BITS);
1997     wsptr[8 * 11] = (int)RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS - PASS1_BITS);
1998     wsptr[8 * 3]  = (int)(tmp23 + tmp13);
1999     wsptr[8 * 10] = (int)(tmp23 - tmp13);
2000     wsptr[8 * 4]  = (int)RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS - PASS1_BITS);
2001     wsptr[8 * 9]  = (int)RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS - PASS1_BITS);
2002     wsptr[8 * 5]  = (int)RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS - PASS1_BITS);
2003     wsptr[8 * 8]  = (int)RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS - PASS1_BITS);
2004     wsptr[8 * 6]  = (int)RIGHT_SHIFT(tmp26 + tmp16, CONST_BITS - PASS1_BITS);
2005     wsptr[8 * 7]  = (int)RIGHT_SHIFT(tmp26 - tmp16, CONST_BITS - PASS1_BITS);
2006   }
2007 
2008   /* Pass 2: process 14 rows from work array, store into output array. */
2009 
2010   wsptr = workspace;
2011   for (ctr = 0; ctr < 14; ctr++) {
2012     outptr = output_buf[ctr] + output_col;
2013 
2014     /* Even part */
2015 
2016     /* Add fudge factor here for final descale. */
2017     z1 = (JLONG)wsptr[0] + (ONE << (PASS1_BITS + 2));
2018     z1 = LEFT_SHIFT(z1, CONST_BITS);
2019     z4 = (JLONG)wsptr[4];
2020     z2 = MULTIPLY(z4, FIX(1.274162392));         /* c4 */
2021     z3 = MULTIPLY(z4, FIX(0.314692123));         /* c12 */
2022     z4 = MULTIPLY(z4, FIX(0.881747734));         /* c8 */
2023 
2024     tmp10 = z1 + z2;
2025     tmp11 = z1 + z3;
2026     tmp12 = z1 - z4;
2027 
2028     tmp23 = z1 - LEFT_SHIFT(z2 + z3 - z4, 1);    /* c0 = (c4+c12-c8)*2 */
2029 
2030     z1 = (JLONG)wsptr[2];
2031     z2 = (JLONG)wsptr[6];
2032 
2033     z3 = MULTIPLY(z1 + z2, FIX(1.105676686));    /* c6 */
2034 
2035     tmp13 = z3 + MULTIPLY(z1, FIX(0.273079590)); /* c2-c6 */
2036     tmp14 = z3 - MULTIPLY(z2, FIX(1.719280954)); /* c6+c10 */
2037     tmp15 = MULTIPLY(z1, FIX(0.613604268)) -     /* c10 */
2038             MULTIPLY(z2, FIX(1.378756276));      /* c2 */
2039 
2040     tmp20 = tmp10 + tmp13;
2041     tmp26 = tmp10 - tmp13;
2042     tmp21 = tmp11 + tmp14;
2043     tmp25 = tmp11 - tmp14;
2044     tmp22 = tmp12 + tmp15;
2045     tmp24 = tmp12 - tmp15;
2046 
2047     /* Odd part */
2048 
2049     z1 = (JLONG)wsptr[1];
2050     z2 = (JLONG)wsptr[3];
2051     z3 = (JLONG)wsptr[5];
2052     z4 = (JLONG)wsptr[7];
2053     z4 = LEFT_SHIFT(z4, CONST_BITS);
2054 
2055     tmp14 = z1 + z3;
2056     tmp11 = MULTIPLY(z1 + z2, FIX(1.334852607));           /* c3 */
2057     tmp12 = MULTIPLY(tmp14, FIX(1.197448846));             /* c5 */
2058     tmp10 = tmp11 + tmp12 + z4 - MULTIPLY(z1, FIX(1.126980169)); /* c3+c5-c1 */
2059     tmp14 = MULTIPLY(tmp14, FIX(0.752406978));             /* c9 */
2060     tmp16 = tmp14 - MULTIPLY(z1, FIX(1.061150426));        /* c9+c11-c13 */
2061     z1    -= z2;
2062     tmp15 = MULTIPLY(z1, FIX(0.467085129)) - z4;           /* c11 */
2063     tmp16 += tmp15;
2064     tmp13 = MULTIPLY(z2 + z3, -FIX(0.158341681)) - z4;     /* -c13 */
2065     tmp11 += tmp13 - MULTIPLY(z2, FIX(0.424103948));       /* c3-c9-c13 */
2066     tmp12 += tmp13 - MULTIPLY(z3, FIX(2.373959773));       /* c3+c5-c13 */
2067     tmp13 = MULTIPLY(z3 - z2, FIX(1.405321284));           /* c1 */
2068     tmp14 += tmp13 + z4 - MULTIPLY(z3, FIX(1.6906431334)); /* c1+c9-c11 */
2069     tmp15 += tmp13 + MULTIPLY(z2, FIX(0.674957567));       /* c1+c11-c5 */
2070 
2071     tmp13 = LEFT_SHIFT(z1 - z3, CONST_BITS) + z4;
2072 
2073     /* Final output stage */
2074 
2075     outptr[0]  = range_limit[(int)RIGHT_SHIFT(tmp20 + tmp10,
2076                                               CONST_BITS + PASS1_BITS + 3) &
2077                              RANGE_MASK];
2078     outptr[13] = range_limit[(int)RIGHT_SHIFT(tmp20 - tmp10,
2079                                               CONST_BITS + PASS1_BITS + 3) &
2080                              RANGE_MASK];
2081     outptr[1]  = range_limit[(int)RIGHT_SHIFT(tmp21 + tmp11,
2082                                               CONST_BITS + PASS1_BITS + 3) &
2083                              RANGE_MASK];
2084     outptr[12] = range_limit[(int)RIGHT_SHIFT(tmp21 - tmp11,
2085                                               CONST_BITS + PASS1_BITS + 3) &
2086                              RANGE_MASK];
2087     outptr[2]  = range_limit[(int)RIGHT_SHIFT(tmp22 + tmp12,
2088                                               CONST_BITS + PASS1_BITS + 3) &
2089                              RANGE_MASK];
2090     outptr[11] = range_limit[(int)RIGHT_SHIFT(tmp22 - tmp12,
2091                                               CONST_BITS + PASS1_BITS + 3) &
2092                              RANGE_MASK];
2093     outptr[3]  = range_limit[(int)RIGHT_SHIFT(tmp23 + tmp13,
2094                                               CONST_BITS + PASS1_BITS + 3) &
2095                              RANGE_MASK];
2096     outptr[10] = range_limit[(int)RIGHT_SHIFT(tmp23 - tmp13,
2097                                               CONST_BITS + PASS1_BITS + 3) &
2098                              RANGE_MASK];
2099     outptr[4]  = range_limit[(int)RIGHT_SHIFT(tmp24 + tmp14,
2100                                               CONST_BITS + PASS1_BITS + 3) &
2101                              RANGE_MASK];
2102     outptr[9]  = range_limit[(int)RIGHT_SHIFT(tmp24 - tmp14,
2103                                               CONST_BITS + PASS1_BITS + 3) &
2104                              RANGE_MASK];
2105     outptr[5]  = range_limit[(int)RIGHT_SHIFT(tmp25 + tmp15,
2106                                               CONST_BITS + PASS1_BITS + 3) &
2107                              RANGE_MASK];
2108     outptr[8]  = range_limit[(int)RIGHT_SHIFT(tmp25 - tmp15,
2109                                               CONST_BITS + PASS1_BITS + 3) &
2110                              RANGE_MASK];
2111     outptr[6]  = range_limit[(int)RIGHT_SHIFT(tmp26 + tmp16,
2112                                               CONST_BITS + PASS1_BITS + 3) &
2113                              RANGE_MASK];
2114     outptr[7]  = range_limit[(int)RIGHT_SHIFT(tmp26 - tmp16,
2115                                               CONST_BITS + PASS1_BITS + 3) &
2116                              RANGE_MASK];
2117 
2118     wsptr += 8;         /* advance pointer to next row */
2119   }
2120 }
2121 
2122 
2123 /*
2124  * Perform dequantization and inverse DCT on one block of coefficients,
2125  * producing a 15x15 output block.
2126  *
2127  * Optimized algorithm with 22 multiplications in the 1-D kernel.
2128  * cK represents sqrt(2) * cos(K*pi/30).
2129  */
2130 
2131 GLOBAL(void)
jpeg_idct_15x15(j_decompress_ptr cinfo,jpeg_component_info * compptr,JCOEFPTR coef_block,JSAMPARRAY output_buf,JDIMENSION output_col)2132 jpeg_idct_15x15(j_decompress_ptr cinfo, jpeg_component_info *compptr,
2133                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
2134                 JDIMENSION output_col)
2135 {
2136   JLONG tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
2137   JLONG tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
2138   JLONG z1, z2, z3, z4;
2139   JCOEFPTR inptr;
2140   ISLOW_MULT_TYPE *quantptr;
2141   int *wsptr;
2142   JSAMPROW outptr;
2143   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
2144   int ctr;
2145   int workspace[8 * 15];        /* buffers data between passes */
2146   SHIFT_TEMPS
2147 
2148   /* Pass 1: process columns from input, store into work array. */
2149 
2150   inptr = coef_block;
2151   quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
2152   wsptr = workspace;
2153   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
2154     /* Even part */
2155 
2156     z1 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
2157     z1 = LEFT_SHIFT(z1, CONST_BITS);
2158     /* Add fudge factor here for final descale. */
2159     z1 += ONE << (CONST_BITS - PASS1_BITS - 1);
2160 
2161     z2 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
2162     z3 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
2163     z4 = DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6]);
2164 
2165     tmp10 = MULTIPLY(z4, FIX(0.437016024)); /* c12 */
2166     tmp11 = MULTIPLY(z4, FIX(1.144122806)); /* c6 */
2167 
2168     tmp12 = z1 - tmp10;
2169     tmp13 = z1 + tmp11;
2170     z1 -= LEFT_SHIFT(tmp11 - tmp10, 1);     /* c0 = (c6-c12)*2 */
2171 
2172     z4 = z2 - z3;
2173     z3 += z2;
2174     tmp10 = MULTIPLY(z3, FIX(1.337628990)); /* (c2+c4)/2 */
2175     tmp11 = MULTIPLY(z4, FIX(0.045680613)); /* (c2-c4)/2 */
2176     z2 = MULTIPLY(z2, FIX(1.439773946));    /* c4+c14 */
2177 
2178     tmp20 = tmp13 + tmp10 + tmp11;
2179     tmp23 = tmp12 - tmp10 + tmp11 + z2;
2180 
2181     tmp10 = MULTIPLY(z3, FIX(0.547059574)); /* (c8+c14)/2 */
2182     tmp11 = MULTIPLY(z4, FIX(0.399234004)); /* (c8-c14)/2 */
2183 
2184     tmp25 = tmp13 - tmp10 - tmp11;
2185     tmp26 = tmp12 + tmp10 - tmp11 - z2;
2186 
2187     tmp10 = MULTIPLY(z3, FIX(0.790569415)); /* (c6+c12)/2 */
2188     tmp11 = MULTIPLY(z4, FIX(0.353553391)); /* (c6-c12)/2 */
2189 
2190     tmp21 = tmp12 + tmp10 + tmp11;
2191     tmp24 = tmp13 - tmp10 + tmp11;
2192     tmp11 += tmp11;
2193     tmp22 = z1 + tmp11;                     /* c10 = c6-c12 */
2194     tmp27 = z1 - tmp11 - tmp11;             /* c0 = (c6-c12)*2 */
2195 
2196     /* Odd part */
2197 
2198     z1 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
2199     z2 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
2200     z4 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
2201     z3 = MULTIPLY(z4, FIX(1.224744871));                    /* c5 */
2202     z4 = DEQUANTIZE(inptr[DCTSIZE * 7], quantptr[DCTSIZE * 7]);
2203 
2204     tmp13 = z2 - z4;
2205     tmp15 = MULTIPLY(z1 + tmp13, FIX(0.831253876));         /* c9 */
2206     tmp11 = tmp15 + MULTIPLY(z1, FIX(0.513743148));         /* c3-c9 */
2207     tmp14 = tmp15 - MULTIPLY(tmp13, FIX(2.176250899));      /* c3+c9 */
2208 
2209     tmp13 = MULTIPLY(z2, -FIX(0.831253876));                /* -c9 */
2210     tmp15 = MULTIPLY(z2, -FIX(1.344997024));                /* -c3 */
2211     z2 = z1 - z4;
2212     tmp12 = z3 + MULTIPLY(z2, FIX(1.406466353));            /* c1 */
2213 
2214     tmp10 = tmp12 + MULTIPLY(z4, FIX(2.457431844)) - tmp15; /* c1+c7 */
2215     tmp16 = tmp12 - MULTIPLY(z1, FIX(1.112434820)) + tmp13; /* c1-c13 */
2216     tmp12 = MULTIPLY(z2, FIX(1.224744871)) - z3;            /* c5 */
2217     z2 = MULTIPLY(z1 + z4, FIX(0.575212477));               /* c11 */
2218     tmp13 += z2 + MULTIPLY(z1, FIX(0.475753014)) - z3;      /* c7-c11 */
2219     tmp15 += z2 - MULTIPLY(z4, FIX(0.869244010)) + z3;      /* c11+c13 */
2220 
2221     /* Final output stage */
2222 
2223     wsptr[8 * 0]  = (int)RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS - PASS1_BITS);
2224     wsptr[8 * 14] = (int)RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS - PASS1_BITS);
2225     wsptr[8 * 1]  = (int)RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS - PASS1_BITS);
2226     wsptr[8 * 13] = (int)RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS - PASS1_BITS);
2227     wsptr[8 * 2]  = (int)RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS - PASS1_BITS);
2228     wsptr[8 * 12] = (int)RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS - PASS1_BITS);
2229     wsptr[8 * 3]  = (int)RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS - PASS1_BITS);
2230     wsptr[8 * 11] = (int)RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS - PASS1_BITS);
2231     wsptr[8 * 4]  = (int)RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS - PASS1_BITS);
2232     wsptr[8 * 10] = (int)RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS - PASS1_BITS);
2233     wsptr[8 * 5]  = (int)RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS - PASS1_BITS);
2234     wsptr[8 * 9]  = (int)RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS - PASS1_BITS);
2235     wsptr[8 * 6]  = (int)RIGHT_SHIFT(tmp26 + tmp16, CONST_BITS - PASS1_BITS);
2236     wsptr[8 * 8]  = (int)RIGHT_SHIFT(tmp26 - tmp16, CONST_BITS - PASS1_BITS);
2237     wsptr[8 * 7]  = (int)RIGHT_SHIFT(tmp27, CONST_BITS - PASS1_BITS);
2238   }
2239 
2240   /* Pass 2: process 15 rows from work array, store into output array. */
2241 
2242   wsptr = workspace;
2243   for (ctr = 0; ctr < 15; ctr++) {
2244     outptr = output_buf[ctr] + output_col;
2245 
2246     /* Even part */
2247 
2248     /* Add fudge factor here for final descale. */
2249     z1 = (JLONG)wsptr[0] + (ONE << (PASS1_BITS + 2));
2250     z1 = LEFT_SHIFT(z1, CONST_BITS);
2251 
2252     z2 = (JLONG)wsptr[2];
2253     z3 = (JLONG)wsptr[4];
2254     z4 = (JLONG)wsptr[6];
2255 
2256     tmp10 = MULTIPLY(z4, FIX(0.437016024)); /* c12 */
2257     tmp11 = MULTIPLY(z4, FIX(1.144122806)); /* c6 */
2258 
2259     tmp12 = z1 - tmp10;
2260     tmp13 = z1 + tmp11;
2261     z1 -= LEFT_SHIFT(tmp11 - tmp10, 1);     /* c0 = (c6-c12)*2 */
2262 
2263     z4 = z2 - z3;
2264     z3 += z2;
2265     tmp10 = MULTIPLY(z3, FIX(1.337628990)); /* (c2+c4)/2 */
2266     tmp11 = MULTIPLY(z4, FIX(0.045680613)); /* (c2-c4)/2 */
2267     z2 = MULTIPLY(z2, FIX(1.439773946));    /* c4+c14 */
2268 
2269     tmp20 = tmp13 + tmp10 + tmp11;
2270     tmp23 = tmp12 - tmp10 + tmp11 + z2;
2271 
2272     tmp10 = MULTIPLY(z3, FIX(0.547059574)); /* (c8+c14)/2 */
2273     tmp11 = MULTIPLY(z4, FIX(0.399234004)); /* (c8-c14)/2 */
2274 
2275     tmp25 = tmp13 - tmp10 - tmp11;
2276     tmp26 = tmp12 + tmp10 - tmp11 - z2;
2277 
2278     tmp10 = MULTIPLY(z3, FIX(0.790569415)); /* (c6+c12)/2 */
2279     tmp11 = MULTIPLY(z4, FIX(0.353553391)); /* (c6-c12)/2 */
2280 
2281     tmp21 = tmp12 + tmp10 + tmp11;
2282     tmp24 = tmp13 - tmp10 + tmp11;
2283     tmp11 += tmp11;
2284     tmp22 = z1 + tmp11;                     /* c10 = c6-c12 */
2285     tmp27 = z1 - tmp11 - tmp11;             /* c0 = (c6-c12)*2 */
2286 
2287     /* Odd part */
2288 
2289     z1 = (JLONG)wsptr[1];
2290     z2 = (JLONG)wsptr[3];
2291     z4 = (JLONG)wsptr[5];
2292     z3 = MULTIPLY(z4, FIX(1.224744871));                    /* c5 */
2293     z4 = (JLONG)wsptr[7];
2294 
2295     tmp13 = z2 - z4;
2296     tmp15 = MULTIPLY(z1 + tmp13, FIX(0.831253876));         /* c9 */
2297     tmp11 = tmp15 + MULTIPLY(z1, FIX(0.513743148));         /* c3-c9 */
2298     tmp14 = tmp15 - MULTIPLY(tmp13, FIX(2.176250899));      /* c3+c9 */
2299 
2300     tmp13 = MULTIPLY(z2, -FIX(0.831253876));                /* -c9 */
2301     tmp15 = MULTIPLY(z2, -FIX(1.344997024));                /* -c3 */
2302     z2 = z1 - z4;
2303     tmp12 = z3 + MULTIPLY(z2, FIX(1.406466353));            /* c1 */
2304 
2305     tmp10 = tmp12 + MULTIPLY(z4, FIX(2.457431844)) - tmp15; /* c1+c7 */
2306     tmp16 = tmp12 - MULTIPLY(z1, FIX(1.112434820)) + tmp13; /* c1-c13 */
2307     tmp12 = MULTIPLY(z2, FIX(1.224744871)) - z3;            /* c5 */
2308     z2 = MULTIPLY(z1 + z4, FIX(0.575212477));               /* c11 */
2309     tmp13 += z2 + MULTIPLY(z1, FIX(0.475753014)) - z3;      /* c7-c11 */
2310     tmp15 += z2 - MULTIPLY(z4, FIX(0.869244010)) + z3;      /* c11+c13 */
2311 
2312     /* Final output stage */
2313 
2314     outptr[0]  = range_limit[(int)RIGHT_SHIFT(tmp20 + tmp10,
2315                                               CONST_BITS + PASS1_BITS + 3) &
2316                              RANGE_MASK];
2317     outptr[14] = range_limit[(int)RIGHT_SHIFT(tmp20 - tmp10,
2318                                               CONST_BITS + PASS1_BITS + 3) &
2319                              RANGE_MASK];
2320     outptr[1]  = range_limit[(int)RIGHT_SHIFT(tmp21 + tmp11,
2321                                               CONST_BITS + PASS1_BITS + 3) &
2322                              RANGE_MASK];
2323     outptr[13] = range_limit[(int)RIGHT_SHIFT(tmp21 - tmp11,
2324                                               CONST_BITS + PASS1_BITS + 3) &
2325                              RANGE_MASK];
2326     outptr[2]  = range_limit[(int)RIGHT_SHIFT(tmp22 + tmp12,
2327                                               CONST_BITS + PASS1_BITS + 3) &
2328                              RANGE_MASK];
2329     outptr[12] = range_limit[(int)RIGHT_SHIFT(tmp22 - tmp12,
2330                                               CONST_BITS + PASS1_BITS + 3) &
2331                              RANGE_MASK];
2332     outptr[3]  = range_limit[(int)RIGHT_SHIFT(tmp23 + tmp13,
2333                                               CONST_BITS + PASS1_BITS + 3) &
2334                              RANGE_MASK];
2335     outptr[11] = range_limit[(int)RIGHT_SHIFT(tmp23 - tmp13,
2336                                               CONST_BITS + PASS1_BITS + 3) &
2337                              RANGE_MASK];
2338     outptr[4]  = range_limit[(int)RIGHT_SHIFT(tmp24 + tmp14,
2339                                               CONST_BITS + PASS1_BITS + 3) &
2340                              RANGE_MASK];
2341     outptr[10] = range_limit[(int)RIGHT_SHIFT(tmp24 - tmp14,
2342                                               CONST_BITS + PASS1_BITS + 3) &
2343                              RANGE_MASK];
2344     outptr[5]  = range_limit[(int)RIGHT_SHIFT(tmp25 + tmp15,
2345                                               CONST_BITS + PASS1_BITS + 3) &
2346                              RANGE_MASK];
2347     outptr[9]  = range_limit[(int)RIGHT_SHIFT(tmp25 - tmp15,
2348                                               CONST_BITS + PASS1_BITS + 3) &
2349                              RANGE_MASK];
2350     outptr[6]  = range_limit[(int)RIGHT_SHIFT(tmp26 + tmp16,
2351                                               CONST_BITS + PASS1_BITS + 3) &
2352                              RANGE_MASK];
2353     outptr[8]  = range_limit[(int)RIGHT_SHIFT(tmp26 - tmp16,
2354                                               CONST_BITS + PASS1_BITS + 3) &
2355                              RANGE_MASK];
2356     outptr[7]  = range_limit[(int)RIGHT_SHIFT(tmp27,
2357                                               CONST_BITS + PASS1_BITS + 3) &
2358                              RANGE_MASK];
2359 
2360     wsptr += 8;         /* advance pointer to next row */
2361   }
2362 }
2363 
2364 
2365 /*
2366  * Perform dequantization and inverse DCT on one block of coefficients,
2367  * producing a 16x16 output block.
2368  *
2369  * Optimized algorithm with 28 multiplications in the 1-D kernel.
2370  * cK represents sqrt(2) * cos(K*pi/32).
2371  */
2372 
2373 GLOBAL(void)
jpeg_idct_16x16(j_decompress_ptr cinfo,jpeg_component_info * compptr,JCOEFPTR coef_block,JSAMPARRAY output_buf,JDIMENSION output_col)2374 jpeg_idct_16x16(j_decompress_ptr cinfo, jpeg_component_info *compptr,
2375                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
2376                 JDIMENSION output_col)
2377 {
2378   JLONG tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13;
2379   JLONG tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
2380   JLONG z1, z2, z3, z4;
2381   JCOEFPTR inptr;
2382   ISLOW_MULT_TYPE *quantptr;
2383   int *wsptr;
2384   JSAMPROW outptr;
2385   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
2386   int ctr;
2387   int workspace[8 * 16];        /* buffers data between passes */
2388   SHIFT_TEMPS
2389 
2390   /* Pass 1: process columns from input, store into work array. */
2391 
2392   inptr = coef_block;
2393   quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
2394   wsptr = workspace;
2395   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
2396     /* Even part */
2397 
2398     tmp0 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
2399     tmp0 = LEFT_SHIFT(tmp0, CONST_BITS);
2400     /* Add fudge factor here for final descale. */
2401     tmp0 += ONE << (CONST_BITS - PASS1_BITS - 1);
2402 
2403     z1 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
2404     tmp1 = MULTIPLY(z1, FIX(1.306562965));      /* c4[16] = c2[8] */
2405     tmp2 = MULTIPLY(z1, FIX_0_541196100);       /* c12[16] = c6[8] */
2406 
2407     tmp10 = tmp0 + tmp1;
2408     tmp11 = tmp0 - tmp1;
2409     tmp12 = tmp0 + tmp2;
2410     tmp13 = tmp0 - tmp2;
2411 
2412     z1 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
2413     z2 = DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6]);
2414     z3 = z1 - z2;
2415     z4 = MULTIPLY(z3, FIX(0.275899379));        /* c14[16] = c7[8] */
2416     z3 = MULTIPLY(z3, FIX(1.387039845));        /* c2[16] = c1[8] */
2417 
2418     tmp0 = z3 + MULTIPLY(z2, FIX_2_562915447);  /* (c6+c2)[16] = (c3+c1)[8] */
2419     tmp1 = z4 + MULTIPLY(z1, FIX_0_899976223);  /* (c6-c14)[16] = (c3-c7)[8] */
2420     tmp2 = z3 - MULTIPLY(z1, FIX(0.601344887)); /* (c2-c10)[16] = (c1-c5)[8] */
2421     tmp3 = z4 - MULTIPLY(z2, FIX(0.509795579)); /* (c10-c14)[16] = (c5-c7)[8] */
2422 
2423     tmp20 = tmp10 + tmp0;
2424     tmp27 = tmp10 - tmp0;
2425     tmp21 = tmp12 + tmp1;
2426     tmp26 = tmp12 - tmp1;
2427     tmp22 = tmp13 + tmp2;
2428     tmp25 = tmp13 - tmp2;
2429     tmp23 = tmp11 + tmp3;
2430     tmp24 = tmp11 - tmp3;
2431 
2432     /* Odd part */
2433 
2434     z1 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
2435     z2 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
2436     z3 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
2437     z4 = DEQUANTIZE(inptr[DCTSIZE * 7], quantptr[DCTSIZE * 7]);
2438 
2439     tmp11 = z1 + z3;
2440 
2441     tmp1  = MULTIPLY(z1 + z2, FIX(1.353318001));   /* c3 */
2442     tmp2  = MULTIPLY(tmp11,   FIX(1.247225013));   /* c5 */
2443     tmp3  = MULTIPLY(z1 + z4, FIX(1.093201867));   /* c7 */
2444     tmp10 = MULTIPLY(z1 - z4, FIX(0.897167586));   /* c9 */
2445     tmp11 = MULTIPLY(tmp11,   FIX(0.666655658));   /* c11 */
2446     tmp12 = MULTIPLY(z1 - z2, FIX(0.410524528));   /* c13 */
2447     tmp0  = tmp1 + tmp2 + tmp3 -
2448             MULTIPLY(z1, FIX(2.286341144));        /* c7+c5+c3-c1 */
2449     tmp13 = tmp10 + tmp11 + tmp12 -
2450             MULTIPLY(z1, FIX(1.835730603));        /* c9+c11+c13-c15 */
2451     z1    = MULTIPLY(z2 + z3, FIX(0.138617169));   /* c15 */
2452     tmp1  += z1 + MULTIPLY(z2, FIX(0.071888074));  /* c9+c11-c3-c15 */
2453     tmp2  += z1 - MULTIPLY(z3, FIX(1.125726048));  /* c5+c7+c15-c3 */
2454     z1    = MULTIPLY(z3 - z2, FIX(1.407403738));   /* c1 */
2455     tmp11 += z1 - MULTIPLY(z3, FIX(0.766367282));  /* c1+c11-c9-c13 */
2456     tmp12 += z1 + MULTIPLY(z2, FIX(1.971951411));  /* c1+c5+c13-c7 */
2457     z2    += z4;
2458     z1    = MULTIPLY(z2, -FIX(0.666655658));       /* -c11 */
2459     tmp1  += z1;
2460     tmp3  += z1 + MULTIPLY(z4, FIX(1.065388962));  /* c3+c11+c15-c7 */
2461     z2    = MULTIPLY(z2, -FIX(1.247225013));       /* -c5 */
2462     tmp10 += z2 + MULTIPLY(z4, FIX(3.141271809));  /* c1+c5+c9-c13 */
2463     tmp12 += z2;
2464     z2    = MULTIPLY(z3 + z4, -FIX(1.353318001));  /* -c3 */
2465     tmp2  += z2;
2466     tmp3  += z2;
2467     z2    = MULTIPLY(z4 - z3, FIX(0.410524528));   /* c13 */
2468     tmp10 += z2;
2469     tmp11 += z2;
2470 
2471     /* Final output stage */
2472 
2473     wsptr[8 * 0]  = (int)RIGHT_SHIFT(tmp20 + tmp0,  CONST_BITS - PASS1_BITS);
2474     wsptr[8 * 15] = (int)RIGHT_SHIFT(tmp20 - tmp0,  CONST_BITS - PASS1_BITS);
2475     wsptr[8 * 1]  = (int)RIGHT_SHIFT(tmp21 + tmp1,  CONST_BITS - PASS1_BITS);
2476     wsptr[8 * 14] = (int)RIGHT_SHIFT(tmp21 - tmp1,  CONST_BITS - PASS1_BITS);
2477     wsptr[8 * 2]  = (int)RIGHT_SHIFT(tmp22 + tmp2,  CONST_BITS - PASS1_BITS);
2478     wsptr[8 * 13] = (int)RIGHT_SHIFT(tmp22 - tmp2,  CONST_BITS - PASS1_BITS);
2479     wsptr[8 * 3]  = (int)RIGHT_SHIFT(tmp23 + tmp3,  CONST_BITS - PASS1_BITS);
2480     wsptr[8 * 12] = (int)RIGHT_SHIFT(tmp23 - tmp3,  CONST_BITS - PASS1_BITS);
2481     wsptr[8 * 4]  = (int)RIGHT_SHIFT(tmp24 + tmp10, CONST_BITS - PASS1_BITS);
2482     wsptr[8 * 11] = (int)RIGHT_SHIFT(tmp24 - tmp10, CONST_BITS - PASS1_BITS);
2483     wsptr[8 * 5]  = (int)RIGHT_SHIFT(tmp25 + tmp11, CONST_BITS - PASS1_BITS);
2484     wsptr[8 * 10] = (int)RIGHT_SHIFT(tmp25 - tmp11, CONST_BITS - PASS1_BITS);
2485     wsptr[8 * 6]  = (int)RIGHT_SHIFT(tmp26 + tmp12, CONST_BITS - PASS1_BITS);
2486     wsptr[8 * 9]  = (int)RIGHT_SHIFT(tmp26 - tmp12, CONST_BITS - PASS1_BITS);
2487     wsptr[8 * 7]  = (int)RIGHT_SHIFT(tmp27 + tmp13, CONST_BITS - PASS1_BITS);
2488     wsptr[8 * 8]  = (int)RIGHT_SHIFT(tmp27 - tmp13, CONST_BITS - PASS1_BITS);
2489   }
2490 
2491   /* Pass 2: process 16 rows from work array, store into output array. */
2492 
2493   wsptr = workspace;
2494   for (ctr = 0; ctr < 16; ctr++) {
2495     outptr = output_buf[ctr] + output_col;
2496 
2497     /* Even part */
2498 
2499     /* Add fudge factor here for final descale. */
2500     tmp0 = (JLONG)wsptr[0] + (ONE << (PASS1_BITS + 2));
2501     tmp0 = LEFT_SHIFT(tmp0, CONST_BITS);
2502 
2503     z1 = (JLONG)wsptr[4];
2504     tmp1 = MULTIPLY(z1, FIX(1.306562965));      /* c4[16] = c2[8] */
2505     tmp2 = MULTIPLY(z1, FIX_0_541196100);       /* c12[16] = c6[8] */
2506 
2507     tmp10 = tmp0 + tmp1;
2508     tmp11 = tmp0 - tmp1;
2509     tmp12 = tmp0 + tmp2;
2510     tmp13 = tmp0 - tmp2;
2511 
2512     z1 = (JLONG)wsptr[2];
2513     z2 = (JLONG)wsptr[6];
2514     z3 = z1 - z2;
2515     z4 = MULTIPLY(z3, FIX(0.275899379));        /* c14[16] = c7[8] */
2516     z3 = MULTIPLY(z3, FIX(1.387039845));        /* c2[16] = c1[8] */
2517 
2518     tmp0 = z3 + MULTIPLY(z2, FIX_2_562915447);  /* (c6+c2)[16] = (c3+c1)[8] */
2519     tmp1 = z4 + MULTIPLY(z1, FIX_0_899976223);  /* (c6-c14)[16] = (c3-c7)[8] */
2520     tmp2 = z3 - MULTIPLY(z1, FIX(0.601344887)); /* (c2-c10)[16] = (c1-c5)[8] */
2521     tmp3 = z4 - MULTIPLY(z2, FIX(0.509795579)); /* (c10-c14)[16] = (c5-c7)[8] */
2522 
2523     tmp20 = tmp10 + tmp0;
2524     tmp27 = tmp10 - tmp0;
2525     tmp21 = tmp12 + tmp1;
2526     tmp26 = tmp12 - tmp1;
2527     tmp22 = tmp13 + tmp2;
2528     tmp25 = tmp13 - tmp2;
2529     tmp23 = tmp11 + tmp3;
2530     tmp24 = tmp11 - tmp3;
2531 
2532     /* Odd part */
2533 
2534     z1 = (JLONG)wsptr[1];
2535     z2 = (JLONG)wsptr[3];
2536     z3 = (JLONG)wsptr[5];
2537     z4 = (JLONG)wsptr[7];
2538 
2539     tmp11 = z1 + z3;
2540 
2541     tmp1  = MULTIPLY(z1 + z2, FIX(1.353318001));   /* c3 */
2542     tmp2  = MULTIPLY(tmp11,   FIX(1.247225013));   /* c5 */
2543     tmp3  = MULTIPLY(z1 + z4, FIX(1.093201867));   /* c7 */
2544     tmp10 = MULTIPLY(z1 - z4, FIX(0.897167586));   /* c9 */
2545     tmp11 = MULTIPLY(tmp11,   FIX(0.666655658));   /* c11 */
2546     tmp12 = MULTIPLY(z1 - z2, FIX(0.410524528));   /* c13 */
2547     tmp0  = tmp1 + tmp2 + tmp3 -
2548             MULTIPLY(z1, FIX(2.286341144));        /* c7+c5+c3-c1 */
2549     tmp13 = tmp10 + tmp11 + tmp12 -
2550             MULTIPLY(z1, FIX(1.835730603));        /* c9+c11+c13-c15 */
2551     z1    = MULTIPLY(z2 + z3, FIX(0.138617169));   /* c15 */
2552     tmp1  += z1 + MULTIPLY(z2, FIX(0.071888074));  /* c9+c11-c3-c15 */
2553     tmp2  += z1 - MULTIPLY(z3, FIX(1.125726048));  /* c5+c7+c15-c3 */
2554     z1    = MULTIPLY(z3 - z2, FIX(1.407403738));   /* c1 */
2555     tmp11 += z1 - MULTIPLY(z3, FIX(0.766367282));  /* c1+c11-c9-c13 */
2556     tmp12 += z1 + MULTIPLY(z2, FIX(1.971951411));  /* c1+c5+c13-c7 */
2557     z2    += z4;
2558     z1    = MULTIPLY(z2, -FIX(0.666655658));       /* -c11 */
2559     tmp1  += z1;
2560     tmp3  += z1 + MULTIPLY(z4, FIX(1.065388962));  /* c3+c11+c15-c7 */
2561     z2    = MULTIPLY(z2, -FIX(1.247225013));       /* -c5 */
2562     tmp10 += z2 + MULTIPLY(z4, FIX(3.141271809));  /* c1+c5+c9-c13 */
2563     tmp12 += z2;
2564     z2    = MULTIPLY(z3 + z4, -FIX(1.353318001));  /* -c3 */
2565     tmp2  += z2;
2566     tmp3  += z2;
2567     z2    = MULTIPLY(z4 - z3, FIX(0.410524528));   /* c13 */
2568     tmp10 += z2;
2569     tmp11 += z2;
2570 
2571     /* Final output stage */
2572 
2573     outptr[0]  = range_limit[(int)RIGHT_SHIFT(tmp20 + tmp0,
2574                                               CONST_BITS + PASS1_BITS + 3) &
2575                              RANGE_MASK];
2576     outptr[15] = range_limit[(int)RIGHT_SHIFT(tmp20 - tmp0,
2577                                               CONST_BITS + PASS1_BITS + 3) &
2578                              RANGE_MASK];
2579     outptr[1]  = range_limit[(int)RIGHT_SHIFT(tmp21 + tmp1,
2580                                               CONST_BITS + PASS1_BITS + 3) &
2581                              RANGE_MASK];
2582     outptr[14] = range_limit[(int)RIGHT_SHIFT(tmp21 - tmp1,
2583                                               CONST_BITS + PASS1_BITS + 3) &
2584                              RANGE_MASK];
2585     outptr[2]  = range_limit[(int)RIGHT_SHIFT(tmp22 + tmp2,
2586                                               CONST_BITS + PASS1_BITS + 3) &
2587                              RANGE_MASK];
2588     outptr[13] = range_limit[(int)RIGHT_SHIFT(tmp22 - tmp2,
2589                                               CONST_BITS + PASS1_BITS + 3) &
2590                              RANGE_MASK];
2591     outptr[3]  = range_limit[(int)RIGHT_SHIFT(tmp23 + tmp3,
2592                                               CONST_BITS + PASS1_BITS + 3) &
2593                              RANGE_MASK];
2594     outptr[12] = range_limit[(int)RIGHT_SHIFT(tmp23 - tmp3,
2595                                               CONST_BITS + PASS1_BITS + 3) &
2596                              RANGE_MASK];
2597     outptr[4]  = range_limit[(int)RIGHT_SHIFT(tmp24 + tmp10,
2598                                               CONST_BITS + PASS1_BITS + 3) &
2599                              RANGE_MASK];
2600     outptr[11] = range_limit[(int)RIGHT_SHIFT(tmp24 - tmp10,
2601                                               CONST_BITS + PASS1_BITS + 3) &
2602                              RANGE_MASK];
2603     outptr[5]  = range_limit[(int)RIGHT_SHIFT(tmp25 + tmp11,
2604                                               CONST_BITS + PASS1_BITS + 3) &
2605                              RANGE_MASK];
2606     outptr[10] = range_limit[(int)RIGHT_SHIFT(tmp25 - tmp11,
2607                                               CONST_BITS + PASS1_BITS + 3) &
2608                              RANGE_MASK];
2609     outptr[6]  = range_limit[(int)RIGHT_SHIFT(tmp26 + tmp12,
2610                                               CONST_BITS + PASS1_BITS + 3) &
2611                              RANGE_MASK];
2612     outptr[9]  = range_limit[(int)RIGHT_SHIFT(tmp26 - tmp12,
2613                                               CONST_BITS + PASS1_BITS + 3) &
2614                              RANGE_MASK];
2615     outptr[7]  = range_limit[(int)RIGHT_SHIFT(tmp27 + tmp13,
2616                                               CONST_BITS + PASS1_BITS + 3) &
2617                              RANGE_MASK];
2618     outptr[8]  = range_limit[(int)RIGHT_SHIFT(tmp27 - tmp13,
2619                                               CONST_BITS + PASS1_BITS + 3) &
2620                              RANGE_MASK];
2621 
2622     wsptr += 8;         /* advance pointer to next row */
2623   }
2624 }
2625 
2626 #endif /* IDCT_SCALING_SUPPORTED */
2627 #endif /* DCT_ISLOW_SUPPORTED */
2628