• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include <assert.h>
12 #include <math.h>
13 
14 #include "./vpx_config.h"
15 #include "./vp9_rtcd.h"
16 
17 #include "vp9/common/vp9_blockd.h"
18 #include "vp9/common/vp9_idct.h"
19 #include "vp9/common/vp9_systemdependent.h"
20 
fdct_round_shift(tran_high_t input)21 static INLINE tran_high_t fdct_round_shift(tran_high_t input) {
22   tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
23   // TODO(debargha, peter.derivaz): Find new bounds for this assert
24   // and make the bounds consts.
25   // assert(INT16_MIN <= rv && rv <= INT16_MAX);
26   return rv;
27 }
28 
fdct4(const tran_low_t * input,tran_low_t * output)29 static void fdct4(const tran_low_t *input, tran_low_t *output) {
30   tran_high_t step[4];
31   tran_high_t temp1, temp2;
32 
33   step[0] = input[0] + input[3];
34   step[1] = input[1] + input[2];
35   step[2] = input[1] - input[2];
36   step[3] = input[0] - input[3];
37 
38   temp1 = (step[0] + step[1]) * cospi_16_64;
39   temp2 = (step[0] - step[1]) * cospi_16_64;
40   output[0] = fdct_round_shift(temp1);
41   output[2] = fdct_round_shift(temp2);
42   temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;
43   temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;
44   output[1] = fdct_round_shift(temp1);
45   output[3] = fdct_round_shift(temp2);
46 }
47 
vp9_fdct4x4_1_c(const int16_t * input,tran_low_t * output,int stride)48 void vp9_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride) {
49   int r, c;
50   tran_low_t sum = 0;
51   for (r = 0; r < 4; ++r)
52     for (c = 0; c < 4; ++c)
53       sum += input[r * stride + c];
54 
55   output[0] = sum << 1;
56   output[1] = 0;
57 }
58 
vp9_fdct4x4_c(const int16_t * input,tran_low_t * output,int stride)59 void vp9_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) {
60   // The 2D transform is done with two passes which are actually pretty
61   // similar. In the first one, we transform the columns and transpose
62   // the results. In the second one, we transform the rows. To achieve that,
63   // as the first pass results are transposed, we transpose the columns (that
64   // is the transposed rows) and transpose the results (so that it goes back
65   // in normal/row positions).
66   int pass;
67   // We need an intermediate buffer between passes.
68   tran_low_t intermediate[4 * 4];
69   const int16_t *in_pass0 = input;
70   const tran_low_t *in = NULL;
71   tran_low_t *out = intermediate;
72   // Do the two transform/transpose passes
73   for (pass = 0; pass < 2; ++pass) {
74     tran_high_t input[4];      // canbe16
75     tran_high_t step[4];       // canbe16
76     tran_high_t temp1, temp2;  // needs32
77     int i;
78     for (i = 0; i < 4; ++i) {
79       // Load inputs.
80       if (0 == pass) {
81         input[0] = in_pass0[0 * stride] * 16;
82         input[1] = in_pass0[1 * stride] * 16;
83         input[2] = in_pass0[2 * stride] * 16;
84         input[3] = in_pass0[3 * stride] * 16;
85         if (i == 0 && input[0]) {
86           input[0] += 1;
87         }
88       } else {
89         input[0] = in[0 * 4];
90         input[1] = in[1 * 4];
91         input[2] = in[2 * 4];
92         input[3] = in[3 * 4];
93       }
94       // Transform.
95       step[0] = input[0] + input[3];
96       step[1] = input[1] + input[2];
97       step[2] = input[1] - input[2];
98       step[3] = input[0] - input[3];
99       temp1 = (step[0] + step[1]) * cospi_16_64;
100       temp2 = (step[0] - step[1]) * cospi_16_64;
101       out[0] = fdct_round_shift(temp1);
102       out[2] = fdct_round_shift(temp2);
103       temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;
104       temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;
105       out[1] = fdct_round_shift(temp1);
106       out[3] = fdct_round_shift(temp2);
107       // Do next column (which is a transposed row in second/horizontal pass)
108       in_pass0++;
109       in++;
110       out += 4;
111     }
112     // Setup in/out for next pass.
113     in = intermediate;
114     out = output;
115   }
116 
117   {
118     int i, j;
119     for (i = 0; i < 4; ++i) {
120       for (j = 0; j < 4; ++j)
121         output[j + i * 4] = (output[j + i * 4] + 1) >> 2;
122     }
123   }
124 }
125 
fadst4(const tran_low_t * input,tran_low_t * output)126 static void fadst4(const tran_low_t *input, tran_low_t *output) {
127   tran_high_t x0, x1, x2, x3;
128   tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
129 
130   x0 = input[0];
131   x1 = input[1];
132   x2 = input[2];
133   x3 = input[3];
134 
135   if (!(x0 | x1 | x2 | x3)) {
136     output[0] = output[1] = output[2] = output[3] = 0;
137     return;
138   }
139 
140   s0 = sinpi_1_9 * x0;
141   s1 = sinpi_4_9 * x0;
142   s2 = sinpi_2_9 * x1;
143   s3 = sinpi_1_9 * x1;
144   s4 = sinpi_3_9 * x2;
145   s5 = sinpi_4_9 * x3;
146   s6 = sinpi_2_9 * x3;
147   s7 = x0 + x1 - x3;
148 
149   x0 = s0 + s2 + s5;
150   x1 = sinpi_3_9 * s7;
151   x2 = s1 - s3 + s6;
152   x3 = s4;
153 
154   s0 = x0 + x3;
155   s1 = x1;
156   s2 = x2 - x3;
157   s3 = x2 - x0 + x3;
158 
159   // 1-D transform scaling factor is sqrt(2).
160   output[0] = fdct_round_shift(s0);
161   output[1] = fdct_round_shift(s1);
162   output[2] = fdct_round_shift(s2);
163   output[3] = fdct_round_shift(s3);
164 }
165 
166 static const transform_2d FHT_4[] = {
167   { fdct4,  fdct4  },  // DCT_DCT  = 0
168   { fadst4, fdct4  },  // ADST_DCT = 1
169   { fdct4,  fadst4 },  // DCT_ADST = 2
170   { fadst4, fadst4 }   // ADST_ADST = 3
171 };
172 
vp9_fht4x4_c(const int16_t * input,tran_low_t * output,int stride,int tx_type)173 void vp9_fht4x4_c(const int16_t *input, tran_low_t *output,
174                   int stride, int tx_type) {
175   if (tx_type == DCT_DCT) {
176     vp9_fdct4x4_c(input, output, stride);
177   } else {
178     tran_low_t out[4 * 4];
179     tran_low_t *outptr = &out[0];
180     int i, j;
181     tran_low_t temp_in[4], temp_out[4];
182     const transform_2d ht = FHT_4[tx_type];
183 
184     // Columns
185     for (i = 0; i < 4; ++i) {
186       for (j = 0; j < 4; ++j)
187         temp_in[j] = input[j * stride + i] * 16;
188       if (i == 0 && temp_in[0])
189         temp_in[0] += 1;
190       ht.cols(temp_in, temp_out);
191       for (j = 0; j < 4; ++j)
192         outptr[j * 4 + i] = temp_out[j];
193     }
194 
195     // Rows
196     for (i = 0; i < 4; ++i) {
197       for (j = 0; j < 4; ++j)
198         temp_in[j] = out[j + i * 4];
199       ht.rows(temp_in, temp_out);
200       for (j = 0; j < 4; ++j)
201         output[j + i * 4] = (temp_out[j] + 1) >> 2;
202     }
203   }
204 }
205 
fdct8(const tran_low_t * input,tran_low_t * output)206 static void fdct8(const tran_low_t *input, tran_low_t *output) {
207   tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16
208   tran_high_t t0, t1, t2, t3;                  // needs32
209   tran_high_t x0, x1, x2, x3;                  // canbe16
210 
211   // stage 1
212   s0 = input[0] + input[7];
213   s1 = input[1] + input[6];
214   s2 = input[2] + input[5];
215   s3 = input[3] + input[4];
216   s4 = input[3] - input[4];
217   s5 = input[2] - input[5];
218   s6 = input[1] - input[6];
219   s7 = input[0] - input[7];
220 
221   // fdct4(step, step);
222   x0 = s0 + s3;
223   x1 = s1 + s2;
224   x2 = s1 - s2;
225   x3 = s0 - s3;
226   t0 = (x0 + x1) * cospi_16_64;
227   t1 = (x0 - x1) * cospi_16_64;
228   t2 =  x2 * cospi_24_64 + x3 *  cospi_8_64;
229   t3 = -x2 * cospi_8_64  + x3 * cospi_24_64;
230   output[0] = fdct_round_shift(t0);
231   output[2] = fdct_round_shift(t2);
232   output[4] = fdct_round_shift(t1);
233   output[6] = fdct_round_shift(t3);
234 
235   // Stage 2
236   t0 = (s6 - s5) * cospi_16_64;
237   t1 = (s6 + s5) * cospi_16_64;
238   t2 = fdct_round_shift(t0);
239   t3 = fdct_round_shift(t1);
240 
241   // Stage 3
242   x0 = s4 + t2;
243   x1 = s4 - t2;
244   x2 = s7 - t3;
245   x3 = s7 + t3;
246 
247   // Stage 4
248   t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;
249   t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
250   t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
251   t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
252   output[1] = fdct_round_shift(t0);
253   output[3] = fdct_round_shift(t2);
254   output[5] = fdct_round_shift(t1);
255   output[7] = fdct_round_shift(t3);
256 }
257 
vp9_fdct8x8_1_c(const int16_t * input,tran_low_t * output,int stride)258 void vp9_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride) {
259   int r, c;
260   tran_low_t sum = 0;
261   for (r = 0; r < 8; ++r)
262     for (c = 0; c < 8; ++c)
263       sum += input[r * stride + c];
264 
265   output[0] = sum;
266   output[1] = 0;
267 }
268 
vp9_fdct8x8_c(const int16_t * input,tran_low_t * final_output,int stride)269 void vp9_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) {
270   int i, j;
271   tran_low_t intermediate[64];
272 
273   // Transform columns
274   {
275     tran_low_t *output = intermediate;
276     tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16
277     tran_high_t t0, t1, t2, t3;                  // needs32
278     tran_high_t x0, x1, x2, x3;                  // canbe16
279 
280     int i;
281     for (i = 0; i < 8; i++) {
282       // stage 1
283       s0 = (input[0 * stride] + input[7 * stride]) * 4;
284       s1 = (input[1 * stride] + input[6 * stride]) * 4;
285       s2 = (input[2 * stride] + input[5 * stride]) * 4;
286       s3 = (input[3 * stride] + input[4 * stride]) * 4;
287       s4 = (input[3 * stride] - input[4 * stride]) * 4;
288       s5 = (input[2 * stride] - input[5 * stride]) * 4;
289       s6 = (input[1 * stride] - input[6 * stride]) * 4;
290       s7 = (input[0 * stride] - input[7 * stride]) * 4;
291 
292       // fdct4(step, step);
293       x0 = s0 + s3;
294       x1 = s1 + s2;
295       x2 = s1 - s2;
296       x3 = s0 - s3;
297       t0 = (x0 + x1) * cospi_16_64;
298       t1 = (x0 - x1) * cospi_16_64;
299       t2 =  x2 * cospi_24_64 + x3 *  cospi_8_64;
300       t3 = -x2 * cospi_8_64  + x3 * cospi_24_64;
301       output[0 * 8] = fdct_round_shift(t0);
302       output[2 * 8] = fdct_round_shift(t2);
303       output[4 * 8] = fdct_round_shift(t1);
304       output[6 * 8] = fdct_round_shift(t3);
305 
306       // Stage 2
307       t0 = (s6 - s5) * cospi_16_64;
308       t1 = (s6 + s5) * cospi_16_64;
309       t2 = fdct_round_shift(t0);
310       t3 = fdct_round_shift(t1);
311 
312       // Stage 3
313       x0 = s4 + t2;
314       x1 = s4 - t2;
315       x2 = s7 - t3;
316       x3 = s7 + t3;
317 
318       // Stage 4
319       t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;
320       t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
321       t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
322       t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
323       output[1 * 8] = fdct_round_shift(t0);
324       output[3 * 8] = fdct_round_shift(t2);
325       output[5 * 8] = fdct_round_shift(t1);
326       output[7 * 8] = fdct_round_shift(t3);
327       input++;
328       output++;
329     }
330   }
331 
332   // Rows
333   for (i = 0; i < 8; ++i) {
334     fdct8(&intermediate[i * 8], &final_output[i * 8]);
335     for (j = 0; j < 8; ++j)
336       final_output[j + i * 8] /= 2;
337   }
338 }
339 
vp9_fdct16x16_1_c(const int16_t * input,tran_low_t * output,int stride)340 void vp9_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride) {
341   int r, c;
342   tran_low_t sum = 0;
343   for (r = 0; r < 16; ++r)
344     for (c = 0; c < 16; ++c)
345       sum += input[r * stride + c];
346 
347   output[0] = sum >> 1;
348   output[1] = 0;
349 }
350 
vp9_fdct16x16_c(const int16_t * input,tran_low_t * output,int stride)351 void vp9_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) {
352   // The 2D transform is done with two passes which are actually pretty
353   // similar. In the first one, we transform the columns and transpose
354   // the results. In the second one, we transform the rows. To achieve that,
355   // as the first pass results are transposed, we transpose the columns (that
356   // is the transposed rows) and transpose the results (so that it goes back
357   // in normal/row positions).
358   int pass;
359   // We need an intermediate buffer between passes.
360   tran_low_t intermediate[256];
361   const int16_t *in_pass0 = input;
362   const tran_low_t *in = NULL;
363   tran_low_t *out = intermediate;
364   // Do the two transform/transpose passes
365   for (pass = 0; pass < 2; ++pass) {
366     tran_high_t step1[8];      // canbe16
367     tran_high_t step2[8];      // canbe16
368     tran_high_t step3[8];      // canbe16
369     tran_high_t input[8];      // canbe16
370     tran_high_t temp1, temp2;  // needs32
371     int i;
372     for (i = 0; i < 16; i++) {
373       if (0 == pass) {
374         // Calculate input for the first 8 results.
375         input[0] = (in_pass0[0 * stride] + in_pass0[15 * stride]) * 4;
376         input[1] = (in_pass0[1 * stride] + in_pass0[14 * stride]) * 4;
377         input[2] = (in_pass0[2 * stride] + in_pass0[13 * stride]) * 4;
378         input[3] = (in_pass0[3 * stride] + in_pass0[12 * stride]) * 4;
379         input[4] = (in_pass0[4 * stride] + in_pass0[11 * stride]) * 4;
380         input[5] = (in_pass0[5 * stride] + in_pass0[10 * stride]) * 4;
381         input[6] = (in_pass0[6 * stride] + in_pass0[ 9 * stride]) * 4;
382         input[7] = (in_pass0[7 * stride] + in_pass0[ 8 * stride]) * 4;
383         // Calculate input for the next 8 results.
384         step1[0] = (in_pass0[7 * stride] - in_pass0[ 8 * stride]) * 4;
385         step1[1] = (in_pass0[6 * stride] - in_pass0[ 9 * stride]) * 4;
386         step1[2] = (in_pass0[5 * stride] - in_pass0[10 * stride]) * 4;
387         step1[3] = (in_pass0[4 * stride] - in_pass0[11 * stride]) * 4;
388         step1[4] = (in_pass0[3 * stride] - in_pass0[12 * stride]) * 4;
389         step1[5] = (in_pass0[2 * stride] - in_pass0[13 * stride]) * 4;
390         step1[6] = (in_pass0[1 * stride] - in_pass0[14 * stride]) * 4;
391         step1[7] = (in_pass0[0 * stride] - in_pass0[15 * stride]) * 4;
392       } else {
393         // Calculate input for the first 8 results.
394         input[0] = ((in[0 * 16] + 1) >> 2) + ((in[15 * 16] + 1) >> 2);
395         input[1] = ((in[1 * 16] + 1) >> 2) + ((in[14 * 16] + 1) >> 2);
396         input[2] = ((in[2 * 16] + 1) >> 2) + ((in[13 * 16] + 1) >> 2);
397         input[3] = ((in[3 * 16] + 1) >> 2) + ((in[12 * 16] + 1) >> 2);
398         input[4] = ((in[4 * 16] + 1) >> 2) + ((in[11 * 16] + 1) >> 2);
399         input[5] = ((in[5 * 16] + 1) >> 2) + ((in[10 * 16] + 1) >> 2);
400         input[6] = ((in[6 * 16] + 1) >> 2) + ((in[ 9 * 16] + 1) >> 2);
401         input[7] = ((in[7 * 16] + 1) >> 2) + ((in[ 8 * 16] + 1) >> 2);
402         // Calculate input for the next 8 results.
403         step1[0] = ((in[7 * 16] + 1) >> 2) - ((in[ 8 * 16] + 1) >> 2);
404         step1[1] = ((in[6 * 16] + 1) >> 2) - ((in[ 9 * 16] + 1) >> 2);
405         step1[2] = ((in[5 * 16] + 1) >> 2) - ((in[10 * 16] + 1) >> 2);
406         step1[3] = ((in[4 * 16] + 1) >> 2) - ((in[11 * 16] + 1) >> 2);
407         step1[4] = ((in[3 * 16] + 1) >> 2) - ((in[12 * 16] + 1) >> 2);
408         step1[5] = ((in[2 * 16] + 1) >> 2) - ((in[13 * 16] + 1) >> 2);
409         step1[6] = ((in[1 * 16] + 1) >> 2) - ((in[14 * 16] + 1) >> 2);
410         step1[7] = ((in[0 * 16] + 1) >> 2) - ((in[15 * 16] + 1) >> 2);
411       }
412       // Work on the first eight values; fdct8(input, even_results);
413       {
414         tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16
415         tran_high_t t0, t1, t2, t3;                  // needs32
416         tran_high_t x0, x1, x2, x3;                  // canbe16
417 
418         // stage 1
419         s0 = input[0] + input[7];
420         s1 = input[1] + input[6];
421         s2 = input[2] + input[5];
422         s3 = input[3] + input[4];
423         s4 = input[3] - input[4];
424         s5 = input[2] - input[5];
425         s6 = input[1] - input[6];
426         s7 = input[0] - input[7];
427 
428         // fdct4(step, step);
429         x0 = s0 + s3;
430         x1 = s1 + s2;
431         x2 = s1 - s2;
432         x3 = s0 - s3;
433         t0 = (x0 + x1) * cospi_16_64;
434         t1 = (x0 - x1) * cospi_16_64;
435         t2 = x3 * cospi_8_64  + x2 * cospi_24_64;
436         t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
437         out[0] = fdct_round_shift(t0);
438         out[4] = fdct_round_shift(t2);
439         out[8] = fdct_round_shift(t1);
440         out[12] = fdct_round_shift(t3);
441 
442         // Stage 2
443         t0 = (s6 - s5) * cospi_16_64;
444         t1 = (s6 + s5) * cospi_16_64;
445         t2 = fdct_round_shift(t0);
446         t3 = fdct_round_shift(t1);
447 
448         // Stage 3
449         x0 = s4 + t2;
450         x1 = s4 - t2;
451         x2 = s7 - t3;
452         x3 = s7 + t3;
453 
454         // Stage 4
455         t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;
456         t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
457         t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
458         t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
459         out[2] = fdct_round_shift(t0);
460         out[6] = fdct_round_shift(t2);
461         out[10] = fdct_round_shift(t1);
462         out[14] = fdct_round_shift(t3);
463       }
464       // Work on the next eight values; step1 -> odd_results
465       {
466         // step 2
467         temp1 = (step1[5] - step1[2]) * cospi_16_64;
468         temp2 = (step1[4] - step1[3]) * cospi_16_64;
469         step2[2] = fdct_round_shift(temp1);
470         step2[3] = fdct_round_shift(temp2);
471         temp1 = (step1[4] + step1[3]) * cospi_16_64;
472         temp2 = (step1[5] + step1[2]) * cospi_16_64;
473         step2[4] = fdct_round_shift(temp1);
474         step2[5] = fdct_round_shift(temp2);
475         // step 3
476         step3[0] = step1[0] + step2[3];
477         step3[1] = step1[1] + step2[2];
478         step3[2] = step1[1] - step2[2];
479         step3[3] = step1[0] - step2[3];
480         step3[4] = step1[7] - step2[4];
481         step3[5] = step1[6] - step2[5];
482         step3[6] = step1[6] + step2[5];
483         step3[7] = step1[7] + step2[4];
484         // step 4
485         temp1 = step3[1] *  -cospi_8_64 + step3[6] * cospi_24_64;
486         temp2 = step3[2] * cospi_24_64 + step3[5] *  cospi_8_64;
487         step2[1] = fdct_round_shift(temp1);
488         step2[2] = fdct_round_shift(temp2);
489         temp1 = step3[2] * cospi_8_64 - step3[5] * cospi_24_64;
490         temp2 = step3[1] * cospi_24_64 + step3[6] *  cospi_8_64;
491         step2[5] = fdct_round_shift(temp1);
492         step2[6] = fdct_round_shift(temp2);
493         // step 5
494         step1[0] = step3[0] + step2[1];
495         step1[1] = step3[0] - step2[1];
496         step1[2] = step3[3] + step2[2];
497         step1[3] = step3[3] - step2[2];
498         step1[4] = step3[4] - step2[5];
499         step1[5] = step3[4] + step2[5];
500         step1[6] = step3[7] - step2[6];
501         step1[7] = step3[7] + step2[6];
502         // step 6
503         temp1 = step1[0] * cospi_30_64 + step1[7] *  cospi_2_64;
504         temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
505         out[1] = fdct_round_shift(temp1);
506         out[9] = fdct_round_shift(temp2);
507         temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
508         temp2 = step1[3] *  cospi_6_64 + step1[4] * cospi_26_64;
509         out[5] = fdct_round_shift(temp1);
510         out[13] = fdct_round_shift(temp2);
511         temp1 = step1[3] * -cospi_26_64 + step1[4] *  cospi_6_64;
512         temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
513         out[3] = fdct_round_shift(temp1);
514         out[11] = fdct_round_shift(temp2);
515         temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
516         temp2 = step1[0] *  -cospi_2_64 + step1[7] * cospi_30_64;
517         out[7] = fdct_round_shift(temp1);
518         out[15] = fdct_round_shift(temp2);
519       }
520       // Do next column (which is a transposed row in second/horizontal pass)
521       in++;
522       in_pass0++;
523       out += 16;
524     }
525     // Setup in/out for next pass.
526     in = intermediate;
527     out = output;
528   }
529 }
530 
fadst8(const tran_low_t * input,tran_low_t * output)531 static void fadst8(const tran_low_t *input, tran_low_t *output) {
532   tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
533 
534   tran_high_t x0 = input[7];
535   tran_high_t x1 = input[0];
536   tran_high_t x2 = input[5];
537   tran_high_t x3 = input[2];
538   tran_high_t x4 = input[3];
539   tran_high_t x5 = input[4];
540   tran_high_t x6 = input[1];
541   tran_high_t x7 = input[6];
542 
543   // stage 1
544   s0 = cospi_2_64  * x0 + cospi_30_64 * x1;
545   s1 = cospi_30_64 * x0 - cospi_2_64  * x1;
546   s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
547   s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
548   s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
549   s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
550   s6 = cospi_26_64 * x6 + cospi_6_64  * x7;
551   s7 = cospi_6_64  * x6 - cospi_26_64 * x7;
552 
553   x0 = fdct_round_shift(s0 + s4);
554   x1 = fdct_round_shift(s1 + s5);
555   x2 = fdct_round_shift(s2 + s6);
556   x3 = fdct_round_shift(s3 + s7);
557   x4 = fdct_round_shift(s0 - s4);
558   x5 = fdct_round_shift(s1 - s5);
559   x6 = fdct_round_shift(s2 - s6);
560   x7 = fdct_round_shift(s3 - s7);
561 
562   // stage 2
563   s0 = x0;
564   s1 = x1;
565   s2 = x2;
566   s3 = x3;
567   s4 = cospi_8_64  * x4 + cospi_24_64 * x5;
568   s5 = cospi_24_64 * x4 - cospi_8_64  * x5;
569   s6 = - cospi_24_64 * x6 + cospi_8_64  * x7;
570   s7 =   cospi_8_64  * x6 + cospi_24_64 * x7;
571 
572   x0 = s0 + s2;
573   x1 = s1 + s3;
574   x2 = s0 - s2;
575   x3 = s1 - s3;
576   x4 = fdct_round_shift(s4 + s6);
577   x5 = fdct_round_shift(s5 + s7);
578   x6 = fdct_round_shift(s4 - s6);
579   x7 = fdct_round_shift(s5 - s7);
580 
581   // stage 3
582   s2 = cospi_16_64 * (x2 + x3);
583   s3 = cospi_16_64 * (x2 - x3);
584   s6 = cospi_16_64 * (x6 + x7);
585   s7 = cospi_16_64 * (x6 - x7);
586 
587   x2 = fdct_round_shift(s2);
588   x3 = fdct_round_shift(s3);
589   x6 = fdct_round_shift(s6);
590   x7 = fdct_round_shift(s7);
591 
592   output[0] =   x0;
593   output[1] = - x4;
594   output[2] =   x6;
595   output[3] = - x2;
596   output[4] =   x3;
597   output[5] = - x7;
598   output[6] =   x5;
599   output[7] = - x1;
600 }
601 
602 static const transform_2d FHT_8[] = {
603   { fdct8,  fdct8  },  // DCT_DCT  = 0
604   { fadst8, fdct8  },  // ADST_DCT = 1
605   { fdct8,  fadst8 },  // DCT_ADST = 2
606   { fadst8, fadst8 }   // ADST_ADST = 3
607 };
608 
vp9_fht8x8_c(const int16_t * input,tran_low_t * output,int stride,int tx_type)609 void vp9_fht8x8_c(const int16_t *input, tran_low_t *output,
610                   int stride, int tx_type) {
611   if (tx_type == DCT_DCT) {
612     vp9_fdct8x8_c(input, output, stride);
613   } else {
614     tran_low_t out[64];
615     tran_low_t *outptr = &out[0];
616     int i, j;
617     tran_low_t temp_in[8], temp_out[8];
618     const transform_2d ht = FHT_8[tx_type];
619 
620     // Columns
621     for (i = 0; i < 8; ++i) {
622       for (j = 0; j < 8; ++j)
623         temp_in[j] = input[j * stride + i] * 4;
624       ht.cols(temp_in, temp_out);
625       for (j = 0; j < 8; ++j)
626         outptr[j * 8 + i] = temp_out[j];
627     }
628 
629     // Rows
630     for (i = 0; i < 8; ++i) {
631       for (j = 0; j < 8; ++j)
632         temp_in[j] = out[j + i * 8];
633       ht.rows(temp_in, temp_out);
634       for (j = 0; j < 8; ++j)
635         output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
636     }
637   }
638 }
639 
640 /* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per
641    pixel. */
vp9_fwht4x4_c(const int16_t * input,tran_low_t * output,int stride)642 void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride) {
643   int i;
644   tran_high_t a1, b1, c1, d1, e1;
645   const int16_t *ip_pass0 = input;
646   const tran_low_t *ip = NULL;
647   tran_low_t *op = output;
648 
649   for (i = 0; i < 4; i++) {
650     a1 = ip_pass0[0 * stride];
651     b1 = ip_pass0[1 * stride];
652     c1 = ip_pass0[2 * stride];
653     d1 = ip_pass0[3 * stride];
654 
655     a1 += b1;
656     d1 = d1 - c1;
657     e1 = (a1 - d1) >> 1;
658     b1 = e1 - b1;
659     c1 = e1 - c1;
660     a1 -= c1;
661     d1 += b1;
662     op[0] = a1;
663     op[4] = c1;
664     op[8] = d1;
665     op[12] = b1;
666 
667     ip_pass0++;
668     op++;
669   }
670   ip = output;
671   op = output;
672 
673   for (i = 0; i < 4; i++) {
674     a1 = ip[0];
675     b1 = ip[1];
676     c1 = ip[2];
677     d1 = ip[3];
678 
679     a1 += b1;
680     d1 -= c1;
681     e1 = (a1 - d1) >> 1;
682     b1 = e1 - b1;
683     c1 = e1 - c1;
684     a1 -= c1;
685     d1 += b1;
686     op[0] = a1 * UNIT_QUANT_FACTOR;
687     op[1] = c1 * UNIT_QUANT_FACTOR;
688     op[2] = d1 * UNIT_QUANT_FACTOR;
689     op[3] = b1 * UNIT_QUANT_FACTOR;
690 
691     ip += 4;
692     op += 4;
693   }
694 }
695 
696 // Rewrote to use same algorithm as others.
fdct16(const tran_low_t in[16],tran_low_t out[16])697 static void fdct16(const tran_low_t in[16], tran_low_t out[16]) {
698   tran_high_t step1[8];      // canbe16
699   tran_high_t step2[8];      // canbe16
700   tran_high_t step3[8];      // canbe16
701   tran_high_t input[8];      // canbe16
702   tran_high_t temp1, temp2;  // needs32
703 
704   // step 1
705   input[0] = in[0] + in[15];
706   input[1] = in[1] + in[14];
707   input[2] = in[2] + in[13];
708   input[3] = in[3] + in[12];
709   input[4] = in[4] + in[11];
710   input[5] = in[5] + in[10];
711   input[6] = in[6] + in[ 9];
712   input[7] = in[7] + in[ 8];
713 
714   step1[0] = in[7] - in[ 8];
715   step1[1] = in[6] - in[ 9];
716   step1[2] = in[5] - in[10];
717   step1[3] = in[4] - in[11];
718   step1[4] = in[3] - in[12];
719   step1[5] = in[2] - in[13];
720   step1[6] = in[1] - in[14];
721   step1[7] = in[0] - in[15];
722 
723   // fdct8(step, step);
724   {
725     tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16
726     tran_high_t t0, t1, t2, t3;                  // needs32
727     tran_high_t x0, x1, x2, x3;                  // canbe16
728 
729     // stage 1
730     s0 = input[0] + input[7];
731     s1 = input[1] + input[6];
732     s2 = input[2] + input[5];
733     s3 = input[3] + input[4];
734     s4 = input[3] - input[4];
735     s5 = input[2] - input[5];
736     s6 = input[1] - input[6];
737     s7 = input[0] - input[7];
738 
739     // fdct4(step, step);
740     x0 = s0 + s3;
741     x1 = s1 + s2;
742     x2 = s1 - s2;
743     x3 = s0 - s3;
744     t0 = (x0 + x1) * cospi_16_64;
745     t1 = (x0 - x1) * cospi_16_64;
746     t2 = x3 * cospi_8_64  + x2 * cospi_24_64;
747     t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
748     out[0] = fdct_round_shift(t0);
749     out[4] = fdct_round_shift(t2);
750     out[8] = fdct_round_shift(t1);
751     out[12] = fdct_round_shift(t3);
752 
753     // Stage 2
754     t0 = (s6 - s5) * cospi_16_64;
755     t1 = (s6 + s5) * cospi_16_64;
756     t2 = fdct_round_shift(t0);
757     t3 = fdct_round_shift(t1);
758 
759     // Stage 3
760     x0 = s4 + t2;
761     x1 = s4 - t2;
762     x2 = s7 - t3;
763     x3 = s7 + t3;
764 
765     // Stage 4
766     t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;
767     t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
768     t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
769     t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
770     out[2] = fdct_round_shift(t0);
771     out[6] = fdct_round_shift(t2);
772     out[10] = fdct_round_shift(t1);
773     out[14] = fdct_round_shift(t3);
774   }
775 
776   // step 2
777   temp1 = (step1[5] - step1[2]) * cospi_16_64;
778   temp2 = (step1[4] - step1[3]) * cospi_16_64;
779   step2[2] = fdct_round_shift(temp1);
780   step2[3] = fdct_round_shift(temp2);
781   temp1 = (step1[4] + step1[3]) * cospi_16_64;
782   temp2 = (step1[5] + step1[2]) * cospi_16_64;
783   step2[4] = fdct_round_shift(temp1);
784   step2[5] = fdct_round_shift(temp2);
785 
786   // step 3
787   step3[0] = step1[0] + step2[3];
788   step3[1] = step1[1] + step2[2];
789   step3[2] = step1[1] - step2[2];
790   step3[3] = step1[0] - step2[3];
791   step3[4] = step1[7] - step2[4];
792   step3[5] = step1[6] - step2[5];
793   step3[6] = step1[6] + step2[5];
794   step3[7] = step1[7] + step2[4];
795 
796   // step 4
797   temp1 = step3[1] *  -cospi_8_64 + step3[6] * cospi_24_64;
798   temp2 = step3[2] * cospi_24_64 + step3[5] *  cospi_8_64;
799   step2[1] = fdct_round_shift(temp1);
800   step2[2] = fdct_round_shift(temp2);
801   temp1 = step3[2] * cospi_8_64 - step3[5] * cospi_24_64;
802   temp2 = step3[1] * cospi_24_64 + step3[6] *  cospi_8_64;
803   step2[5] = fdct_round_shift(temp1);
804   step2[6] = fdct_round_shift(temp2);
805 
806   // step 5
807   step1[0] = step3[0] + step2[1];
808   step1[1] = step3[0] - step2[1];
809   step1[2] = step3[3] + step2[2];
810   step1[3] = step3[3] - step2[2];
811   step1[4] = step3[4] - step2[5];
812   step1[5] = step3[4] + step2[5];
813   step1[6] = step3[7] - step2[6];
814   step1[7] = step3[7] + step2[6];
815 
816   // step 6
817   temp1 = step1[0] * cospi_30_64 + step1[7] *  cospi_2_64;
818   temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
819   out[1] = fdct_round_shift(temp1);
820   out[9] = fdct_round_shift(temp2);
821 
822   temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
823   temp2 = step1[3] *  cospi_6_64 + step1[4] * cospi_26_64;
824   out[5] = fdct_round_shift(temp1);
825   out[13] = fdct_round_shift(temp2);
826 
827   temp1 = step1[3] * -cospi_26_64 + step1[4] *  cospi_6_64;
828   temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
829   out[3] = fdct_round_shift(temp1);
830   out[11] = fdct_round_shift(temp2);
831 
832   temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
833   temp2 = step1[0] *  -cospi_2_64 + step1[7] * cospi_30_64;
834   out[7] = fdct_round_shift(temp1);
835   out[15] = fdct_round_shift(temp2);
836 }
837 
fadst16(const tran_low_t * input,tran_low_t * output)838 static void fadst16(const tran_low_t *input, tran_low_t *output) {
839   tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
840   tran_high_t s9, s10, s11, s12, s13, s14, s15;
841 
842   tran_high_t x0 = input[15];
843   tran_high_t x1 = input[0];
844   tran_high_t x2 = input[13];
845   tran_high_t x3 = input[2];
846   tran_high_t x4 = input[11];
847   tran_high_t x5 = input[4];
848   tran_high_t x6 = input[9];
849   tran_high_t x7 = input[6];
850   tran_high_t x8 = input[7];
851   tran_high_t x9 = input[8];
852   tran_high_t x10 = input[5];
853   tran_high_t x11 = input[10];
854   tran_high_t x12 = input[3];
855   tran_high_t x13 = input[12];
856   tran_high_t x14 = input[1];
857   tran_high_t x15 = input[14];
858 
859   // stage 1
860   s0 = x0 * cospi_1_64  + x1 * cospi_31_64;
861   s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
862   s2 = x2 * cospi_5_64  + x3 * cospi_27_64;
863   s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
864   s4 = x4 * cospi_9_64  + x5 * cospi_23_64;
865   s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
866   s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
867   s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
868   s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
869   s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
870   s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
871   s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
872   s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
873   s13 = x12 * cospi_7_64  - x13 * cospi_25_64;
874   s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
875   s15 = x14 * cospi_3_64  - x15 * cospi_29_64;
876 
877   x0 = fdct_round_shift(s0 + s8);
878   x1 = fdct_round_shift(s1 + s9);
879   x2 = fdct_round_shift(s2 + s10);
880   x3 = fdct_round_shift(s3 + s11);
881   x4 = fdct_round_shift(s4 + s12);
882   x5 = fdct_round_shift(s5 + s13);
883   x6 = fdct_round_shift(s6 + s14);
884   x7 = fdct_round_shift(s7 + s15);
885   x8  = fdct_round_shift(s0 - s8);
886   x9  = fdct_round_shift(s1 - s9);
887   x10 = fdct_round_shift(s2 - s10);
888   x11 = fdct_round_shift(s3 - s11);
889   x12 = fdct_round_shift(s4 - s12);
890   x13 = fdct_round_shift(s5 - s13);
891   x14 = fdct_round_shift(s6 - s14);
892   x15 = fdct_round_shift(s7 - s15);
893 
894   // stage 2
895   s0 = x0;
896   s1 = x1;
897   s2 = x2;
898   s3 = x3;
899   s4 = x4;
900   s5 = x5;
901   s6 = x6;
902   s7 = x7;
903   s8 =    x8 * cospi_4_64   + x9 * cospi_28_64;
904   s9 =    x8 * cospi_28_64  - x9 * cospi_4_64;
905   s10 =   x10 * cospi_20_64 + x11 * cospi_12_64;
906   s11 =   x10 * cospi_12_64 - x11 * cospi_20_64;
907   s12 = - x12 * cospi_28_64 + x13 * cospi_4_64;
908   s13 =   x12 * cospi_4_64  + x13 * cospi_28_64;
909   s14 = - x14 * cospi_12_64 + x15 * cospi_20_64;
910   s15 =   x14 * cospi_20_64 + x15 * cospi_12_64;
911 
912   x0 = s0 + s4;
913   x1 = s1 + s5;
914   x2 = s2 + s6;
915   x3 = s3 + s7;
916   x4 = s0 - s4;
917   x5 = s1 - s5;
918   x6 = s2 - s6;
919   x7 = s3 - s7;
920   x8 = fdct_round_shift(s8 + s12);
921   x9 = fdct_round_shift(s9 + s13);
922   x10 = fdct_round_shift(s10 + s14);
923   x11 = fdct_round_shift(s11 + s15);
924   x12 = fdct_round_shift(s8 - s12);
925   x13 = fdct_round_shift(s9 - s13);
926   x14 = fdct_round_shift(s10 - s14);
927   x15 = fdct_round_shift(s11 - s15);
928 
929   // stage 3
930   s0 = x0;
931   s1 = x1;
932   s2 = x2;
933   s3 = x3;
934   s4 = x4 * cospi_8_64  + x5 * cospi_24_64;
935   s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
936   s6 = - x6 * cospi_24_64 + x7 * cospi_8_64;
937   s7 =   x6 * cospi_8_64  + x7 * cospi_24_64;
938   s8 = x8;
939   s9 = x9;
940   s10 = x10;
941   s11 = x11;
942   s12 = x12 * cospi_8_64  + x13 * cospi_24_64;
943   s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
944   s14 = - x14 * cospi_24_64 + x15 * cospi_8_64;
945   s15 =   x14 * cospi_8_64  + x15 * cospi_24_64;
946 
947   x0 = s0 + s2;
948   x1 = s1 + s3;
949   x2 = s0 - s2;
950   x3 = s1 - s3;
951   x4 = fdct_round_shift(s4 + s6);
952   x5 = fdct_round_shift(s5 + s7);
953   x6 = fdct_round_shift(s4 - s6);
954   x7 = fdct_round_shift(s5 - s7);
955   x8 = s8 + s10;
956   x9 = s9 + s11;
957   x10 = s8 - s10;
958   x11 = s9 - s11;
959   x12 = fdct_round_shift(s12 + s14);
960   x13 = fdct_round_shift(s13 + s15);
961   x14 = fdct_round_shift(s12 - s14);
962   x15 = fdct_round_shift(s13 - s15);
963 
964   // stage 4
965   s2 = (- cospi_16_64) * (x2 + x3);
966   s3 = cospi_16_64 * (x2 - x3);
967   s6 = cospi_16_64 * (x6 + x7);
968   s7 = cospi_16_64 * (- x6 + x7);
969   s10 = cospi_16_64 * (x10 + x11);
970   s11 = cospi_16_64 * (- x10 + x11);
971   s14 = (- cospi_16_64) * (x14 + x15);
972   s15 = cospi_16_64 * (x14 - x15);
973 
974   x2 = fdct_round_shift(s2);
975   x3 = fdct_round_shift(s3);
976   x6 = fdct_round_shift(s6);
977   x7 = fdct_round_shift(s7);
978   x10 = fdct_round_shift(s10);
979   x11 = fdct_round_shift(s11);
980   x14 = fdct_round_shift(s14);
981   x15 = fdct_round_shift(s15);
982 
983   output[0] = x0;
984   output[1] = - x8;
985   output[2] = x12;
986   output[3] = - x4;
987   output[4] = x6;
988   output[5] = x14;
989   output[6] = x10;
990   output[7] = x2;
991   output[8] = x3;
992   output[9] =  x11;
993   output[10] = x15;
994   output[11] = x7;
995   output[12] = x5;
996   output[13] = - x13;
997   output[14] = x9;
998   output[15] = - x1;
999 }
1000 
1001 static const transform_2d FHT_16[] = {
1002   { fdct16,  fdct16  },  // DCT_DCT  = 0
1003   { fadst16, fdct16  },  // ADST_DCT = 1
1004   { fdct16,  fadst16 },  // DCT_ADST = 2
1005   { fadst16, fadst16 }   // ADST_ADST = 3
1006 };
1007 
vp9_fht16x16_c(const int16_t * input,tran_low_t * output,int stride,int tx_type)1008 void vp9_fht16x16_c(const int16_t *input, tran_low_t *output,
1009                     int stride, int tx_type) {
1010   if (tx_type == DCT_DCT) {
1011     vp9_fdct16x16_c(input, output, stride);
1012   } else {
1013     tran_low_t out[256];
1014     tran_low_t *outptr = &out[0];
1015     int i, j;
1016     tran_low_t temp_in[16], temp_out[16];
1017     const transform_2d ht = FHT_16[tx_type];
1018 
1019     // Columns
1020     for (i = 0; i < 16; ++i) {
1021       for (j = 0; j < 16; ++j)
1022         temp_in[j] = input[j * stride + i] * 4;
1023       ht.cols(temp_in, temp_out);
1024       for (j = 0; j < 16; ++j)
1025         outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
1026     }
1027 
1028     // Rows
1029     for (i = 0; i < 16; ++i) {
1030       for (j = 0; j < 16; ++j)
1031         temp_in[j] = out[j + i * 16];
1032       ht.rows(temp_in, temp_out);
1033       for (j = 0; j < 16; ++j)
1034         output[j + i * 16] = temp_out[j];
1035     }
1036   }
1037 }
1038 
dct_32_round(tran_high_t input)1039 static INLINE tran_high_t dct_32_round(tran_high_t input) {
1040   tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
1041   // TODO(debargha, peter.derivaz): Find new bounds for this assert,
1042   // and make the bounds consts.
1043   // assert(-131072 <= rv && rv <= 131071);
1044   return rv;
1045 }
1046 
half_round_shift(tran_high_t input)1047 static INLINE tran_high_t half_round_shift(tran_high_t input) {
1048   tran_high_t rv = (input + 1 + (input < 0)) >> 2;
1049   return rv;
1050 }
1051 
fdct32(const tran_high_t * input,tran_high_t * output,int round)1052 static void fdct32(const tran_high_t *input, tran_high_t *output, int round) {
1053   tran_high_t step[32];
1054   // Stage 1
1055   step[0] = input[0] + input[(32 - 1)];
1056   step[1] = input[1] + input[(32 - 2)];
1057   step[2] = input[2] + input[(32 - 3)];
1058   step[3] = input[3] + input[(32 - 4)];
1059   step[4] = input[4] + input[(32 - 5)];
1060   step[5] = input[5] + input[(32 - 6)];
1061   step[6] = input[6] + input[(32 - 7)];
1062   step[7] = input[7] + input[(32 - 8)];
1063   step[8] = input[8] + input[(32 - 9)];
1064   step[9] = input[9] + input[(32 - 10)];
1065   step[10] = input[10] + input[(32 - 11)];
1066   step[11] = input[11] + input[(32 - 12)];
1067   step[12] = input[12] + input[(32 - 13)];
1068   step[13] = input[13] + input[(32 - 14)];
1069   step[14] = input[14] + input[(32 - 15)];
1070   step[15] = input[15] + input[(32 - 16)];
1071   step[16] = -input[16] + input[(32 - 17)];
1072   step[17] = -input[17] + input[(32 - 18)];
1073   step[18] = -input[18] + input[(32 - 19)];
1074   step[19] = -input[19] + input[(32 - 20)];
1075   step[20] = -input[20] + input[(32 - 21)];
1076   step[21] = -input[21] + input[(32 - 22)];
1077   step[22] = -input[22] + input[(32 - 23)];
1078   step[23] = -input[23] + input[(32 - 24)];
1079   step[24] = -input[24] + input[(32 - 25)];
1080   step[25] = -input[25] + input[(32 - 26)];
1081   step[26] = -input[26] + input[(32 - 27)];
1082   step[27] = -input[27] + input[(32 - 28)];
1083   step[28] = -input[28] + input[(32 - 29)];
1084   step[29] = -input[29] + input[(32 - 30)];
1085   step[30] = -input[30] + input[(32 - 31)];
1086   step[31] = -input[31] + input[(32 - 32)];
1087 
1088   // Stage 2
1089   output[0] = step[0] + step[16 - 1];
1090   output[1] = step[1] + step[16 - 2];
1091   output[2] = step[2] + step[16 - 3];
1092   output[3] = step[3] + step[16 - 4];
1093   output[4] = step[4] + step[16 - 5];
1094   output[5] = step[5] + step[16 - 6];
1095   output[6] = step[6] + step[16 - 7];
1096   output[7] = step[7] + step[16 - 8];
1097   output[8] = -step[8] + step[16 - 9];
1098   output[9] = -step[9] + step[16 - 10];
1099   output[10] = -step[10] + step[16 - 11];
1100   output[11] = -step[11] + step[16 - 12];
1101   output[12] = -step[12] + step[16 - 13];
1102   output[13] = -step[13] + step[16 - 14];
1103   output[14] = -step[14] + step[16 - 15];
1104   output[15] = -step[15] + step[16 - 16];
1105 
1106   output[16] = step[16];
1107   output[17] = step[17];
1108   output[18] = step[18];
1109   output[19] = step[19];
1110 
1111   output[20] = dct_32_round((-step[20] + step[27]) * cospi_16_64);
1112   output[21] = dct_32_round((-step[21] + step[26]) * cospi_16_64);
1113   output[22] = dct_32_round((-step[22] + step[25]) * cospi_16_64);
1114   output[23] = dct_32_round((-step[23] + step[24]) * cospi_16_64);
1115 
1116   output[24] = dct_32_round((step[24] + step[23]) * cospi_16_64);
1117   output[25] = dct_32_round((step[25] + step[22]) * cospi_16_64);
1118   output[26] = dct_32_round((step[26] + step[21]) * cospi_16_64);
1119   output[27] = dct_32_round((step[27] + step[20]) * cospi_16_64);
1120 
1121   output[28] = step[28];
1122   output[29] = step[29];
1123   output[30] = step[30];
1124   output[31] = step[31];
1125 
1126   // dump the magnitude by 4, hence the intermediate values are within
1127   // the range of 16 bits.
1128   if (round) {
1129     output[0] = half_round_shift(output[0]);
1130     output[1] = half_round_shift(output[1]);
1131     output[2] = half_round_shift(output[2]);
1132     output[3] = half_round_shift(output[3]);
1133     output[4] = half_round_shift(output[4]);
1134     output[5] = half_round_shift(output[5]);
1135     output[6] = half_round_shift(output[6]);
1136     output[7] = half_round_shift(output[7]);
1137     output[8] = half_round_shift(output[8]);
1138     output[9] = half_round_shift(output[9]);
1139     output[10] = half_round_shift(output[10]);
1140     output[11] = half_round_shift(output[11]);
1141     output[12] = half_round_shift(output[12]);
1142     output[13] = half_round_shift(output[13]);
1143     output[14] = half_round_shift(output[14]);
1144     output[15] = half_round_shift(output[15]);
1145 
1146     output[16] = half_round_shift(output[16]);
1147     output[17] = half_round_shift(output[17]);
1148     output[18] = half_round_shift(output[18]);
1149     output[19] = half_round_shift(output[19]);
1150     output[20] = half_round_shift(output[20]);
1151     output[21] = half_round_shift(output[21]);
1152     output[22] = half_round_shift(output[22]);
1153     output[23] = half_round_shift(output[23]);
1154     output[24] = half_round_shift(output[24]);
1155     output[25] = half_round_shift(output[25]);
1156     output[26] = half_round_shift(output[26]);
1157     output[27] = half_round_shift(output[27]);
1158     output[28] = half_round_shift(output[28]);
1159     output[29] = half_round_shift(output[29]);
1160     output[30] = half_round_shift(output[30]);
1161     output[31] = half_round_shift(output[31]);
1162   }
1163 
1164   // Stage 3
1165   step[0] = output[0] + output[(8 - 1)];
1166   step[1] = output[1] + output[(8 - 2)];
1167   step[2] = output[2] + output[(8 - 3)];
1168   step[3] = output[3] + output[(8 - 4)];
1169   step[4] = -output[4] + output[(8 - 5)];
1170   step[5] = -output[5] + output[(8 - 6)];
1171   step[6] = -output[6] + output[(8 - 7)];
1172   step[7] = -output[7] + output[(8 - 8)];
1173   step[8] = output[8];
1174   step[9] = output[9];
1175   step[10] = dct_32_round((-output[10] + output[13]) * cospi_16_64);
1176   step[11] = dct_32_round((-output[11] + output[12]) * cospi_16_64);
1177   step[12] = dct_32_round((output[12] + output[11]) * cospi_16_64);
1178   step[13] = dct_32_round((output[13] + output[10]) * cospi_16_64);
1179   step[14] = output[14];
1180   step[15] = output[15];
1181 
1182   step[16] = output[16] + output[23];
1183   step[17] = output[17] + output[22];
1184   step[18] = output[18] + output[21];
1185   step[19] = output[19] + output[20];
1186   step[20] = -output[20] + output[19];
1187   step[21] = -output[21] + output[18];
1188   step[22] = -output[22] + output[17];
1189   step[23] = -output[23] + output[16];
1190   step[24] = -output[24] + output[31];
1191   step[25] = -output[25] + output[30];
1192   step[26] = -output[26] + output[29];
1193   step[27] = -output[27] + output[28];
1194   step[28] = output[28] + output[27];
1195   step[29] = output[29] + output[26];
1196   step[30] = output[30] + output[25];
1197   step[31] = output[31] + output[24];
1198 
1199   // Stage 4
1200   output[0] = step[0] + step[3];
1201   output[1] = step[1] + step[2];
1202   output[2] = -step[2] + step[1];
1203   output[3] = -step[3] + step[0];
1204   output[4] = step[4];
1205   output[5] = dct_32_round((-step[5] + step[6]) * cospi_16_64);
1206   output[6] = dct_32_round((step[6] + step[5]) * cospi_16_64);
1207   output[7] = step[7];
1208   output[8] = step[8] + step[11];
1209   output[9] = step[9] + step[10];
1210   output[10] = -step[10] + step[9];
1211   output[11] = -step[11] + step[8];
1212   output[12] = -step[12] + step[15];
1213   output[13] = -step[13] + step[14];
1214   output[14] = step[14] + step[13];
1215   output[15] = step[15] + step[12];
1216 
1217   output[16] = step[16];
1218   output[17] = step[17];
1219   output[18] = dct_32_round(step[18] * -cospi_8_64 + step[29] * cospi_24_64);
1220   output[19] = dct_32_round(step[19] * -cospi_8_64 + step[28] * cospi_24_64);
1221   output[20] = dct_32_round(step[20] * -cospi_24_64 + step[27] * -cospi_8_64);
1222   output[21] = dct_32_round(step[21] * -cospi_24_64 + step[26] * -cospi_8_64);
1223   output[22] = step[22];
1224   output[23] = step[23];
1225   output[24] = step[24];
1226   output[25] = step[25];
1227   output[26] = dct_32_round(step[26] * cospi_24_64 + step[21] * -cospi_8_64);
1228   output[27] = dct_32_round(step[27] * cospi_24_64 + step[20] * -cospi_8_64);
1229   output[28] = dct_32_round(step[28] * cospi_8_64 + step[19] * cospi_24_64);
1230   output[29] = dct_32_round(step[29] * cospi_8_64 + step[18] * cospi_24_64);
1231   output[30] = step[30];
1232   output[31] = step[31];
1233 
1234   // Stage 5
1235   step[0] = dct_32_round((output[0] + output[1]) * cospi_16_64);
1236   step[1] = dct_32_round((-output[1] + output[0]) * cospi_16_64);
1237   step[2] = dct_32_round(output[2] * cospi_24_64 + output[3] * cospi_8_64);
1238   step[3] = dct_32_round(output[3] * cospi_24_64 - output[2] * cospi_8_64);
1239   step[4] = output[4] + output[5];
1240   step[5] = -output[5] + output[4];
1241   step[6] = -output[6] + output[7];
1242   step[7] = output[7] + output[6];
1243   step[8] = output[8];
1244   step[9] = dct_32_round(output[9] * -cospi_8_64 + output[14] * cospi_24_64);
1245   step[10] = dct_32_round(output[10] * -cospi_24_64 + output[13] * -cospi_8_64);
1246   step[11] = output[11];
1247   step[12] = output[12];
1248   step[13] = dct_32_round(output[13] * cospi_24_64 + output[10] * -cospi_8_64);
1249   step[14] = dct_32_round(output[14] * cospi_8_64 + output[9] * cospi_24_64);
1250   step[15] = output[15];
1251 
1252   step[16] = output[16] + output[19];
1253   step[17] = output[17] + output[18];
1254   step[18] = -output[18] + output[17];
1255   step[19] = -output[19] + output[16];
1256   step[20] = -output[20] + output[23];
1257   step[21] = -output[21] + output[22];
1258   step[22] = output[22] + output[21];
1259   step[23] = output[23] + output[20];
1260   step[24] = output[24] + output[27];
1261   step[25] = output[25] + output[26];
1262   step[26] = -output[26] + output[25];
1263   step[27] = -output[27] + output[24];
1264   step[28] = -output[28] + output[31];
1265   step[29] = -output[29] + output[30];
1266   step[30] = output[30] + output[29];
1267   step[31] = output[31] + output[28];
1268 
1269   // Stage 6
1270   output[0] = step[0];
1271   output[1] = step[1];
1272   output[2] = step[2];
1273   output[3] = step[3];
1274   output[4] = dct_32_round(step[4] * cospi_28_64 + step[7] * cospi_4_64);
1275   output[5] = dct_32_round(step[5] * cospi_12_64 + step[6] * cospi_20_64);
1276   output[6] = dct_32_round(step[6] * cospi_12_64 + step[5] * -cospi_20_64);
1277   output[7] = dct_32_round(step[7] * cospi_28_64 + step[4] * -cospi_4_64);
1278   output[8] = step[8] + step[9];
1279   output[9] = -step[9] + step[8];
1280   output[10] = -step[10] + step[11];
1281   output[11] = step[11] + step[10];
1282   output[12] = step[12] + step[13];
1283   output[13] = -step[13] + step[12];
1284   output[14] = -step[14] + step[15];
1285   output[15] = step[15] + step[14];
1286 
1287   output[16] = step[16];
1288   output[17] = dct_32_round(step[17] * -cospi_4_64 + step[30] * cospi_28_64);
1289   output[18] = dct_32_round(step[18] * -cospi_28_64 + step[29] * -cospi_4_64);
1290   output[19] = step[19];
1291   output[20] = step[20];
1292   output[21] = dct_32_round(step[21] * -cospi_20_64 + step[26] * cospi_12_64);
1293   output[22] = dct_32_round(step[22] * -cospi_12_64 + step[25] * -cospi_20_64);
1294   output[23] = step[23];
1295   output[24] = step[24];
1296   output[25] = dct_32_round(step[25] * cospi_12_64 + step[22] * -cospi_20_64);
1297   output[26] = dct_32_round(step[26] * cospi_20_64 + step[21] * cospi_12_64);
1298   output[27] = step[27];
1299   output[28] = step[28];
1300   output[29] = dct_32_round(step[29] * cospi_28_64 + step[18] * -cospi_4_64);
1301   output[30] = dct_32_round(step[30] * cospi_4_64 + step[17] * cospi_28_64);
1302   output[31] = step[31];
1303 
1304   // Stage 7
1305   step[0] = output[0];
1306   step[1] = output[1];
1307   step[2] = output[2];
1308   step[3] = output[3];
1309   step[4] = output[4];
1310   step[5] = output[5];
1311   step[6] = output[6];
1312   step[7] = output[7];
1313   step[8] = dct_32_round(output[8] * cospi_30_64 + output[15] * cospi_2_64);
1314   step[9] = dct_32_round(output[9] * cospi_14_64 + output[14] * cospi_18_64);
1315   step[10] = dct_32_round(output[10] * cospi_22_64 + output[13] * cospi_10_64);
1316   step[11] = dct_32_round(output[11] * cospi_6_64 + output[12] * cospi_26_64);
1317   step[12] = dct_32_round(output[12] * cospi_6_64 + output[11] * -cospi_26_64);
1318   step[13] = dct_32_round(output[13] * cospi_22_64 + output[10] * -cospi_10_64);
1319   step[14] = dct_32_round(output[14] * cospi_14_64 + output[9] * -cospi_18_64);
1320   step[15] = dct_32_round(output[15] * cospi_30_64 + output[8] * -cospi_2_64);
1321 
1322   step[16] = output[16] + output[17];
1323   step[17] = -output[17] + output[16];
1324   step[18] = -output[18] + output[19];
1325   step[19] = output[19] + output[18];
1326   step[20] = output[20] + output[21];
1327   step[21] = -output[21] + output[20];
1328   step[22] = -output[22] + output[23];
1329   step[23] = output[23] + output[22];
1330   step[24] = output[24] + output[25];
1331   step[25] = -output[25] + output[24];
1332   step[26] = -output[26] + output[27];
1333   step[27] = output[27] + output[26];
1334   step[28] = output[28] + output[29];
1335   step[29] = -output[29] + output[28];
1336   step[30] = -output[30] + output[31];
1337   step[31] = output[31] + output[30];
1338 
1339   // Final stage --- outputs indices are bit-reversed.
1340   output[0]  = step[0];
1341   output[16] = step[1];
1342   output[8]  = step[2];
1343   output[24] = step[3];
1344   output[4]  = step[4];
1345   output[20] = step[5];
1346   output[12] = step[6];
1347   output[28] = step[7];
1348   output[2]  = step[8];
1349   output[18] = step[9];
1350   output[10] = step[10];
1351   output[26] = step[11];
1352   output[6]  = step[12];
1353   output[22] = step[13];
1354   output[14] = step[14];
1355   output[30] = step[15];
1356 
1357   output[1]  = dct_32_round(step[16] * cospi_31_64 + step[31] * cospi_1_64);
1358   output[17] = dct_32_round(step[17] * cospi_15_64 + step[30] * cospi_17_64);
1359   output[9]  = dct_32_round(step[18] * cospi_23_64 + step[29] * cospi_9_64);
1360   output[25] = dct_32_round(step[19] * cospi_7_64 + step[28] * cospi_25_64);
1361   output[5]  = dct_32_round(step[20] * cospi_27_64 + step[27] * cospi_5_64);
1362   output[21] = dct_32_round(step[21] * cospi_11_64 + step[26] * cospi_21_64);
1363   output[13] = dct_32_round(step[22] * cospi_19_64 + step[25] * cospi_13_64);
1364   output[29] = dct_32_round(step[23] * cospi_3_64 + step[24] * cospi_29_64);
1365   output[3]  = dct_32_round(step[24] * cospi_3_64 + step[23] * -cospi_29_64);
1366   output[19] = dct_32_round(step[25] * cospi_19_64 + step[22] * -cospi_13_64);
1367   output[11] = dct_32_round(step[26] * cospi_11_64 + step[21] * -cospi_21_64);
1368   output[27] = dct_32_round(step[27] * cospi_27_64 + step[20] * -cospi_5_64);
1369   output[7]  = dct_32_round(step[28] * cospi_7_64 + step[19] * -cospi_25_64);
1370   output[23] = dct_32_round(step[29] * cospi_23_64 + step[18] * -cospi_9_64);
1371   output[15] = dct_32_round(step[30] * cospi_15_64 + step[17] * -cospi_17_64);
1372   output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64);
1373 }
1374 
vp9_fdct32x32_1_c(const int16_t * input,tran_low_t * output,int stride)1375 void vp9_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride) {
1376   int r, c;
1377   tran_low_t sum = 0;
1378   for (r = 0; r < 32; ++r)
1379     for (c = 0; c < 32; ++c)
1380       sum += input[r * stride + c];
1381 
1382   output[0] = sum >> 3;
1383   output[1] = 0;
1384 }
1385 
vp9_fdct32x32_c(const int16_t * input,tran_low_t * out,int stride)1386 void vp9_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {
1387   int i, j;
1388   tran_high_t output[32 * 32];
1389 
1390   // Columns
1391   for (i = 0; i < 32; ++i) {
1392     tran_high_t temp_in[32], temp_out[32];
1393     for (j = 0; j < 32; ++j)
1394       temp_in[j] = input[j * stride + i] * 4;
1395     fdct32(temp_in, temp_out, 0);
1396     for (j = 0; j < 32; ++j)
1397       output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
1398   }
1399 
1400   // Rows
1401   for (i = 0; i < 32; ++i) {
1402     tran_high_t temp_in[32], temp_out[32];
1403     for (j = 0; j < 32; ++j)
1404       temp_in[j] = output[j + i * 32];
1405     fdct32(temp_in, temp_out, 0);
1406     for (j = 0; j < 32; ++j)
1407       out[j + i * 32] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
1408   }
1409 }
1410 
1411 // Note that although we use dct_32_round in dct32 computation flow,
1412 // this 2d fdct32x32 for rate-distortion optimization loop is operating
1413 // within 16 bits precision.
vp9_fdct32x32_rd_c(const int16_t * input,tran_low_t * out,int stride)1414 void vp9_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, int stride) {
1415   int i, j;
1416   tran_high_t output[32 * 32];
1417 
1418   // Columns
1419   for (i = 0; i < 32; ++i) {
1420     tran_high_t temp_in[32], temp_out[32];
1421     for (j = 0; j < 32; ++j)
1422       temp_in[j] = input[j * stride + i] * 4;
1423     fdct32(temp_in, temp_out, 0);
1424     for (j = 0; j < 32; ++j)
1425       // TODO(cd): see quality impact of only doing
1426       //           output[j * 32 + i] = (temp_out[j] + 1) >> 2;
1427       //           PS: also change code in vp9/encoder/x86/vp9_dct_sse2.c
1428       output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
1429   }
1430 
1431   // Rows
1432   for (i = 0; i < 32; ++i) {
1433     tran_high_t temp_in[32], temp_out[32];
1434     for (j = 0; j < 32; ++j)
1435       temp_in[j] = output[j + i * 32];
1436     fdct32(temp_in, temp_out, 1);
1437     for (j = 0; j < 32; ++j)
1438       out[j + i * 32] = temp_out[j];
1439   }
1440 }
1441 
1442 #if CONFIG_VP9_HIGHBITDEPTH
vp9_high_fdct4x4_c(const int16_t * input,tran_low_t * output,int stride)1443 void vp9_high_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) {
1444   vp9_fdct4x4_c(input, output, stride);
1445 }
1446 
vp9_high_fht4x4_c(const int16_t * input,tran_low_t * output,int stride,int tx_type)1447 void vp9_high_fht4x4_c(const int16_t *input, tran_low_t *output,
1448                        int stride, int tx_type) {
1449   vp9_fht4x4_c(input, output, stride, tx_type);
1450 }
1451 
vp9_high_fdct8x8_1_c(const int16_t * input,tran_low_t * final_output,int stride)1452 void vp9_high_fdct8x8_1_c(const int16_t *input, tran_low_t *final_output,
1453                           int stride) {
1454   vp9_fdct8x8_1_c(input, final_output, stride);
1455 }
1456 
vp9_high_fdct8x8_c(const int16_t * input,tran_low_t * final_output,int stride)1457 void vp9_high_fdct8x8_c(const int16_t *input, tran_low_t *final_output,
1458                         int stride) {
1459   vp9_fdct8x8_c(input, final_output, stride);
1460 }
1461 
vp9_high_fdct16x16_1_c(const int16_t * input,tran_low_t * output,int stride)1462 void vp9_high_fdct16x16_1_c(const int16_t *input, tran_low_t *output,
1463                             int stride) {
1464   vp9_fdct16x16_1_c(input, output, stride);
1465 }
1466 
vp9_high_fdct16x16_c(const int16_t * input,tran_low_t * output,int stride)1467 void vp9_high_fdct16x16_c(const int16_t *input, tran_low_t *output,
1468                           int stride) {
1469   vp9_fdct16x16_c(input, output, stride);
1470 }
1471 
vp9_high_fht8x8_c(const int16_t * input,tran_low_t * output,int stride,int tx_type)1472 void vp9_high_fht8x8_c(const int16_t *input, tran_low_t *output,
1473                   int stride, int tx_type) {
1474   vp9_fht8x8_c(input, output, stride, tx_type);
1475 }
1476 
vp9_high_fwht4x4_c(const int16_t * input,tran_low_t * output,int stride)1477 void vp9_high_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride) {
1478   vp9_fwht4x4_c(input, output, stride);
1479 }
1480 
vp9_high_fht16x16_c(const int16_t * input,tran_low_t * output,int stride,int tx_type)1481 void vp9_high_fht16x16_c(const int16_t *input, tran_low_t *output,
1482                     int stride, int tx_type) {
1483   vp9_fht16x16_c(input, output, stride, tx_type);
1484 }
1485 
vp9_high_fdct32x32_1_c(const int16_t * input,tran_low_t * out,int stride)1486 void vp9_high_fdct32x32_1_c(const int16_t *input, tran_low_t *out, int stride) {
1487   vp9_fdct32x32_1_c(input, out, stride);
1488 }
1489 
vp9_high_fdct32x32_c(const int16_t * input,tran_low_t * out,int stride)1490 void vp9_high_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {
1491   vp9_fdct32x32_c(input, out, stride);
1492 }
1493 
vp9_high_fdct32x32_rd_c(const int16_t * input,tran_low_t * out,int stride)1494 void vp9_high_fdct32x32_rd_c(const int16_t *input, tran_low_t *out,
1495                              int stride) {
1496   vp9_fdct32x32_rd_c(input, out, stride);
1497 }
1498 #endif  // CONFIG_VP9_HIGHBITDEPTH
1499