• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include <math.h>
12 #include <string.h>
13 
14 #include "vpx_dsp/inv_txfm.h"
15 
/* Full 2-D inverse 4x4 Walsh-Hadamard transform with reconstruction:
 * transforms all 16 coefficients in `input` and adds the result onto the
 * 4x4 pixel block at `dest` (row pitch `stride`), clipping to pixel range. */
void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
/* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
   0.5 shifts per pixel. */
  int i;
  tran_low_t output[16];  // holds the row-pass result between the two passes
  tran_high_t a1, b1, c1, d1, e1;
  const tran_low_t *ip = input;
  tran_low_t *op = output;

  // Pass 1: 1-D inverse WHT on each row of coefficients.
  for (i = 0; i < 4; i++) {
    // Undo the encoder's UNIT_QUANT_SHIFT pre-scaling on load.
    a1 = ip[0] >> UNIT_QUANT_SHIFT;
    c1 = ip[1] >> UNIT_QUANT_SHIFT;
    d1 = ip[2] >> UNIT_QUANT_SHIFT;
    b1 = ip[3] >> UNIT_QUANT_SHIFT;
    // Reversible lifting butterfly; statement order is significant.
    a1 += c1;
    d1 -= b1;
    e1 = (a1 - d1) >> 1;
    b1 = e1 - b1;
    c1 = e1 - c1;
    a1 -= b1;
    d1 += c1;
    op[0] = WRAPLOW(a1, 8);
    op[1] = WRAPLOW(b1, 8);
    op[2] = WRAPLOW(c1, 8);
    op[3] = WRAPLOW(d1, 8);
    ip += 4;
    op += 4;
  }

  // Pass 2: same 1-D butterfly down each column, then add/clip into dest.
  ip = output;
  for (i = 0; i < 4; i++) {
    a1 = ip[4 * 0];
    c1 = ip[4 * 1];
    d1 = ip[4 * 2];
    b1 = ip[4 * 3];
    a1 += c1;
    d1 -= b1;
    e1 = (a1 - d1) >> 1;
    b1 = e1 - b1;
    c1 = e1 - c1;
    a1 -= b1;
    d1 += c1;
    dest[stride * 0] = clip_pixel_add(dest[stride * 0], a1);
    dest[stride * 1] = clip_pixel_add(dest[stride * 1], b1);
    dest[stride * 2] = clip_pixel_add(dest[stride * 2], c1);
    dest[stride * 3] = clip_pixel_add(dest[stride * 3], d1);

    ip++;
    dest++;
  }
}
67 
/* DC-only inverse 4x4 Walsh-Hadamard transform with reconstruction:
 * only in[0] is assumed non-zero, so the row pass is computed once and
 * the column pass distributes the two resulting values over each column. */
void vpx_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int dest_stride) {
  tran_low_t row[4];
  tran_high_t dc, half;
  int col;

  // Row pass: the single DC coefficient spreads into one row.
  dc = in[0] >> UNIT_QUANT_SHIFT;
  half = dc >> 1;
  row[0] = WRAPLOW(dc - half, 8);
  row[1] = row[2] = row[3] = WRAPLOW(half, 8);

  // Column pass: each column sees its row value split the same way.
  for (col = 0; col < 4; ++col) {
    const tran_high_t lo = row[col] >> 1;
    const tran_high_t hi = row[col] - lo;
    dest[col + dest_stride * 0] =
        clip_pixel_add(dest[col + dest_stride * 0], hi);
    dest[col + dest_stride * 1] =
        clip_pixel_add(dest[col + dest_stride * 1], lo);
    dest[col + dest_stride * 2] =
        clip_pixel_add(dest[col + dest_stride * 2], lo);
    dest[col + dest_stride * 3] =
        clip_pixel_add(dest[col + dest_stride * 3], lo);
  }
}
93 
/* 1-D 4-point inverse DCT. Reads 4 coefficients from `input` and writes
 * 4 samples to `output`. Products are accumulated in tran_high_t and
 * brought back to tran_low_t with dct_const_round_shift + WRAPLOW. */
void idct4_c(const tran_low_t *input, tran_low_t *output) {
  tran_low_t step[4];
  tran_high_t temp1, temp2;
  // stage 1: even butterfly on inputs 0/2, odd rotation on inputs 1/3.
  temp1 = (input[0] + input[2]) * cospi_16_64;
  temp2 = (input[0] - input[2]) * cospi_16_64;
  step[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
  temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
  step[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step[3] = WRAPLOW(dct_const_round_shift(temp2), 8);

  // stage 2: combine even and odd halves.
  output[0] = WRAPLOW(step[0] + step[3], 8);
  output[1] = WRAPLOW(step[1] + step[2], 8);
  output[2] = WRAPLOW(step[1] - step[2], 8);
  output[3] = WRAPLOW(step[0] - step[3], 8);
}
113 
/* 2-D 4x4 inverse DCT over all 16 coefficients; the transformed block is
 * rounded (final shift of 4), added to the pixels at `dest` (row pitch
 * `stride`) and clipped. */
void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  tran_low_t intermediate[4 * 4];
  tran_low_t col_in[4], col_out[4];
  int r, c;

  // Pass 1: 1-D transform of every row into the intermediate buffer.
  for (r = 0; r < 4; ++r) {
    idct4_c(input + 4 * r, intermediate + 4 * r);
  }

  // Pass 2: 1-D transform of every column, then round, add and clip.
  for (c = 0; c < 4; ++c) {
    for (r = 0; r < 4; ++r) col_in[r] = intermediate[4 * r + c];
    idct4_c(col_in, col_out);
    for (r = 0; r < 4; ++r) {
      dest[r * stride + c] = clip_pixel_add(dest[r * stride + c],
                                            ROUND_POWER_OF_TWO(col_out[r], 4));
    }
  }
}
138 
/* DC-only 4x4 inverse DCT: only input[0] is assumed non-zero, so the
 * whole 2-D transform reduces to a single constant that is added to every
 * pixel of the 4x4 block. */
void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest,
                         int dest_stride) {
  int r, c;
  tran_high_t add;
  tran_low_t dc = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
  // Two cospi_16_64 rotations: one per 1-D pass of the separable transform.
  dc = WRAPLOW(dct_const_round_shift(dc * cospi_16_64), 8);
  add = ROUND_POWER_OF_TWO(dc, 4);

  for (r = 0; r < 4; ++r) {
    for (c = 0; c < 4; ++c) dest[c] = clip_pixel_add(dest[c], add);
    dest += dest_stride;
  }
}
155 
/* 1-D 8-point inverse DCT. The even half (inputs 0,2,4,6) is handled by
 * reusing idct4_c; the odd half (inputs 1,3,5,7) goes through its own
 * butterfly stages. */
void idct8_c(const tran_low_t *input, tran_low_t *output) {
  tran_low_t step1[8], step2[8];
  tran_high_t temp1, temp2;
  // stage 1: load even inputs for the idct4 sub-transform, rotate odd inputs.
  step1[0] = input[0];
  step1[2] = input[4];
  step1[1] = input[2];
  step1[3] = input[6];
  temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
  temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
  step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
  temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);

  // stage 2 & stage 3 - even half (in-place 4-point inverse DCT)
  idct4_c(step1, step1);

  // stage 2 - odd half
  step2[4] = WRAPLOW(step1[4] + step1[5], 8);
  step2[5] = WRAPLOW(step1[4] - step1[5], 8);
  step2[6] = WRAPLOW(-step1[6] + step1[7], 8);
  step2[7] = WRAPLOW(step1[6] + step1[7], 8);

  // stage 3 - odd half
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
  step1[7] = step2[7];

  // stage 4: combine even (0..3) and odd (4..7) halves.
  output[0] = WRAPLOW(step1[0] + step1[7], 8);
  output[1] = WRAPLOW(step1[1] + step1[6], 8);
  output[2] = WRAPLOW(step1[2] + step1[5], 8);
  output[3] = WRAPLOW(step1[3] + step1[4], 8);
  output[4] = WRAPLOW(step1[3] - step1[4], 8);
  output[5] = WRAPLOW(step1[2] - step1[5], 8);
  output[6] = WRAPLOW(step1[1] - step1[6], 8);
  output[7] = WRAPLOW(step1[0] - step1[7], 8);
}
200 
/* 2-D 8x8 inverse DCT over all 64 coefficients; the transformed block is
 * rounded (final shift of 5), added to the pixels at `dest` (row pitch
 * `stride`) and clipped. */
void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  tran_low_t intermediate[8 * 8];
  tran_low_t col_in[8], col_out[8];
  int r, c;

  // Pass 1: 1-D transform of every row into the intermediate buffer.
  for (r = 0; r < 8; ++r) {
    idct8_c(input + 8 * r, intermediate + 8 * r);
  }

  // Pass 2: 1-D transform of every column, then round, add and clip.
  for (c = 0; c < 8; ++c) {
    for (r = 0; r < 8; ++r) col_in[r] = intermediate[8 * r + c];
    idct8_c(col_in, col_out);
    for (r = 0; r < 8; ++r) {
      dest[r * stride + c] = clip_pixel_add(dest[r * stride + c],
                                            ROUND_POWER_OF_TWO(col_out[r], 5));
    }
  }
}
225 
/* DC-only 8x8 inverse DCT: only input[0] is assumed non-zero, so a single
 * constant is added to every pixel of the 8x8 block. */
void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  int r, c;
  tran_high_t add;
  tran_low_t dc = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
  // Two cospi_16_64 rotations: one per 1-D pass of the separable transform.
  dc = WRAPLOW(dct_const_round_shift(dc * cospi_16_64), 8);
  add = ROUND_POWER_OF_TWO(dc, 5);

  for (r = 0; r < 8; ++r) {
    for (c = 0; c < 8; ++c) dest[c] = clip_pixel_add(dest[c], add);
    dest += stride;
  }
}
238 
/* 1-D 4-point inverse ADST (asymmetric DST), built on the sinpi_x_9
 * constant table. All-zero input short-circuits to all-zero output. */
void iadst4_c(const tran_low_t *input, tran_low_t *output) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;

  tran_low_t x0 = input[0];
  tran_low_t x1 = input[1];
  tran_low_t x2 = input[2];
  tran_low_t x3 = input[3];

  // Fast path: nothing to transform when every coefficient is zero.
  if (!(x0 | x1 | x2 | x3)) {
    output[0] = output[1] = output[2] = output[3] = 0;
    return;
  }

  s0 = sinpi_1_9 * x0;
  s1 = sinpi_2_9 * x0;
  s2 = sinpi_3_9 * x1;
  s3 = sinpi_4_9 * x2;
  s4 = sinpi_1_9 * x2;
  s5 = sinpi_2_9 * x3;
  s6 = sinpi_4_9 * x3;
  s7 = x0 - x2 + x3;

  s0 = s0 + s3 + s5;
  s1 = s1 - s4 - s6;
  s3 = s2;
  s2 = sinpi_3_9 * s7;

  // 1-D transform scaling factor is sqrt(2).
  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
  // + 1b (addition) = 29b.
  // Hence the output bit depth is 15b.
  output[0] = WRAPLOW(dct_const_round_shift(s0 + s3), 8);
  output[1] = WRAPLOW(dct_const_round_shift(s1 + s3), 8);
  output[2] = WRAPLOW(dct_const_round_shift(s2), 8);
  output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3), 8);
}
275 
/* 1-D 8-point inverse ADST. Inputs are read in the permuted order the
 * forward transform produced; the result is written back with a final
 * permutation and sign flips (see the output[] stores at the bottom). */
void iadst8_c(const tran_low_t *input, tran_low_t *output) {
  int s0, s1, s2, s3, s4, s5, s6, s7;

  tran_high_t x0 = input[7];
  tran_high_t x1 = input[0];
  tran_high_t x2 = input[5];
  tran_high_t x3 = input[2];
  tran_high_t x4 = input[3];
  tran_high_t x5 = input[4];
  tran_high_t x6 = input[1];
  tran_high_t x7 = input[6];

  // Fast path: nothing to transform when every coefficient is zero.
  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
    output[0] = output[1] = output[2] = output[3] = output[4]
              = output[5] = output[6] = output[7] = 0;
    return;
  }

  // stage 1
  s0 = (int)(cospi_2_64  * x0 + cospi_30_64 * x1);
  s1 = (int)(cospi_30_64 * x0 - cospi_2_64  * x1);
  s2 = (int)(cospi_10_64 * x2 + cospi_22_64 * x3);
  s3 = (int)(cospi_22_64 * x2 - cospi_10_64 * x3);
  s4 = (int)(cospi_18_64 * x4 + cospi_14_64 * x5);
  s5 = (int)(cospi_14_64 * x4 - cospi_18_64 * x5);
  s6 = (int)(cospi_26_64 * x6 + cospi_6_64  * x7);
  s7 = (int)(cospi_6_64  * x6 - cospi_26_64 * x7);

  x0 = WRAPLOW(dct_const_round_shift(s0 + s4), 8);
  x1 = WRAPLOW(dct_const_round_shift(s1 + s5), 8);
  x2 = WRAPLOW(dct_const_round_shift(s2 + s6), 8);
  x3 = WRAPLOW(dct_const_round_shift(s3 + s7), 8);
  x4 = WRAPLOW(dct_const_round_shift(s0 - s4), 8);
  x5 = WRAPLOW(dct_const_round_shift(s1 - s5), 8);
  x6 = WRAPLOW(dct_const_round_shift(s2 - s6), 8);
  x7 = WRAPLOW(dct_const_round_shift(s3 - s7), 8);

  // stage 2
  s0 = (int)x0;
  s1 = (int)x1;
  s2 = (int)x2;
  s3 = (int)x3;
  s4 = (int)(cospi_8_64 * x4 + cospi_24_64 * x5);
  s5 = (int)(cospi_24_64 * x4 - cospi_8_64 * x5);
  s6 = (int)(-cospi_24_64 * x6 + cospi_8_64 * x7);
  s7 = (int)(cospi_8_64 * x6 + cospi_24_64 * x7);

  // s0..s3 carry no cospi scaling here, so they wrap without the
  // dct_const_round_shift that the rotated s4..s7 terms need.
  x0 = WRAPLOW(s0 + s2, 8);
  x1 = WRAPLOW(s1 + s3, 8);
  x2 = WRAPLOW(s0 - s2, 8);
  x3 = WRAPLOW(s1 - s3, 8);
  x4 = WRAPLOW(dct_const_round_shift(s4 + s6), 8);
  x5 = WRAPLOW(dct_const_round_shift(s5 + s7), 8);
  x6 = WRAPLOW(dct_const_round_shift(s4 - s6), 8);
  x7 = WRAPLOW(dct_const_round_shift(s5 - s7), 8);

  // stage 3
  s2 = (int)(cospi_16_64 * (x2 + x3));
  s3 = (int)(cospi_16_64 * (x2 - x3));
  s6 = (int)(cospi_16_64 * (x6 + x7));
  s7 = (int)(cospi_16_64 * (x6 - x7));

  x2 = WRAPLOW(dct_const_round_shift(s2), 8);
  x3 = WRAPLOW(dct_const_round_shift(s3), 8);
  x6 = WRAPLOW(dct_const_round_shift(s6), 8);
  x7 = WRAPLOW(dct_const_round_shift(s7), 8);

  // Final output permutation with sign flips on odd positions.
  output[0] = WRAPLOW(x0, 8);
  output[1] = WRAPLOW(-x4, 8);
  output[2] = WRAPLOW(x6, 8);
  output[3] = WRAPLOW(-x2, 8);
  output[4] = WRAPLOW(x3, 8);
  output[5] = WRAPLOW(-x7, 8);
  output[6] = WRAPLOW(x5, 8);
  output[7] = WRAPLOW(-x1, 8);
}
352 
/* 2-D 8x8 inverse DCT for blocks whose non-zero coefficients all lie in
 * the first 4 rows (eob <= 12 case). Only those rows are transformed in
 * pass 1; the remaining rows stay zero via the buffer initializer. */
void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  tran_low_t intermediate[8 * 8] = { 0 };
  tran_low_t col_in[8], col_out[8];
  int r, c;

  // Pass 1: only the first 4 rows can hold non-zero coefficients.
  for (r = 0; r < 4; ++r) {
    idct8_c(input + 8 * r, intermediate + 8 * r);
  }

  // Pass 2: 1-D transform of every column, then round, add and clip.
  for (c = 0; c < 8; ++c) {
    for (r = 0; r < 8; ++r) col_in[r] = intermediate[8 * r + c];
    idct8_c(col_in, col_out);
    for (r = 0; r < 8; ++r) {
      dest[r * stride + c] = clip_pixel_add(dest[r * stride + c],
                                            ROUND_POWER_OF_TWO(col_out[r], 5));
    }
  }
}
378 
/* 1-D 16-point inverse DCT: seven butterfly/rotation stages over two
 * scratch arrays that ping-pong between stages. */
void idct16_c(const tran_low_t *input, tran_low_t *output) {
  tran_low_t step1[16], step2[16];
  tran_high_t temp1, temp2;

  // stage 1: load coefficients in bit-reversed order
  // (0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15); the n/2 spelling of the
  // indices is kept from the original source.
  step1[0] = input[0/2];
  step1[1] = input[16/2];
  step1[2] = input[8/2];
  step1[3] = input[24/2];
  step1[4] = input[4/2];
  step1[5] = input[20/2];
  step1[6] = input[12/2];
  step1[7] = input[28/2];
  step1[8] = input[2/2];
  step1[9] = input[18/2];
  step1[10] = input[10/2];
  step1[11] = input[26/2];
  step1[12] = input[6/2];
  step1[13] = input[22/2];
  step1[14] = input[14/2];
  step1[15] = input[30/2];

  // stage 2: pass the even half through, rotate the odd half (8..15).
  step2[0] = step1[0];
  step2[1] = step1[1];
  step2[2] = step1[2];
  step2[3] = step1[3];
  step2[4] = step1[4];
  step2[5] = step1[5];
  step2[6] = step1[6];
  step2[7] = step1[7];

  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
  step2[8] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[15] = WRAPLOW(dct_const_round_shift(temp2), 8);

  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
  step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);

  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);

  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
  step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);

  // stage 3: rotate 4..7, butterfly-combine 8..15.
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[2];
  step1[3] = step2[3];

  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
  step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);

  step1[8] = WRAPLOW(step2[8] + step2[9], 8);
  step1[9] = WRAPLOW(step2[8] - step2[9], 8);
  step1[10] = WRAPLOW(-step2[10] + step2[11], 8);
  step1[11] = WRAPLOW(step2[10] + step2[11], 8);
  step1[12] = WRAPLOW(step2[12] + step2[13], 8);
  step1[13] = WRAPLOW(step2[12] - step2[13], 8);
  step1[14] = WRAPLOW(-step2[14] + step2[15], 8);
  step1[15] = WRAPLOW(step2[14] + step2[15], 8);

  // stage 4
  temp1 = (step1[0] + step1[1]) * cospi_16_64;
  temp2 = (step1[0] - step1[1]) * cospi_16_64;
  step2[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
  step2[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
  step2[4] = WRAPLOW(step1[4] + step1[5], 8);
  step2[5] = WRAPLOW(step1[4] - step1[5], 8);
  step2[6] = WRAPLOW(-step1[6] + step1[7], 8);
  step2[7] = WRAPLOW(step1[6] + step1[7], 8);

  step2[8] = step1[8];
  step2[15] = step1[15];
  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
  step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
  step2[11] = step1[11];
  step2[12] = step1[12];

  // stage 5
  step1[0] = WRAPLOW(step2[0] + step2[3], 8);
  step1[1] = WRAPLOW(step2[1] + step2[2], 8);
  step1[2] = WRAPLOW(step2[1] - step2[2], 8);
  step1[3] = WRAPLOW(step2[0] - step2[3], 8);
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
  step1[7] = step2[7];

  step1[8] = WRAPLOW(step2[8] + step2[11], 8);
  step1[9] = WRAPLOW(step2[9] + step2[10], 8);
  step1[10] = WRAPLOW(step2[9] - step2[10], 8);
  step1[11] = WRAPLOW(step2[8] - step2[11], 8);
  step1[12] = WRAPLOW(-step2[12] + step2[15], 8);
  step1[13] = WRAPLOW(-step2[13] + step2[14], 8);
  step1[14] = WRAPLOW(step2[13] + step2[14], 8);
  step1[15] = WRAPLOW(step2[12] + step2[15], 8);

  // stage 6
  step2[0] = WRAPLOW(step1[0] + step1[7], 8);
  step2[1] = WRAPLOW(step1[1] + step1[6], 8);
  step2[2] = WRAPLOW(step1[2] + step1[5], 8);
  step2[3] = WRAPLOW(step1[3] + step1[4], 8);
  step2[4] = WRAPLOW(step1[3] - step1[4], 8);
  step2[5] = WRAPLOW(step1[2] - step1[5], 8);
  step2[6] = WRAPLOW(step1[1] - step1[6], 8);
  step2[7] = WRAPLOW(step1[0] - step1[7], 8);
  step2[8] = step1[8];
  step2[9] = step1[9];
  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
  temp2 = (step1[10] + step1[13]) * cospi_16_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
  temp2 = (step1[11] + step1[12]) * cospi_16_64;
  step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
  step2[14] = step1[14];
  step2[15] = step1[15];

  // stage 7: final even/odd recombination.
  output[0] = WRAPLOW(step2[0] + step2[15], 8);
  output[1] = WRAPLOW(step2[1] + step2[14], 8);
  output[2] = WRAPLOW(step2[2] + step2[13], 8);
  output[3] = WRAPLOW(step2[3] + step2[12], 8);
  output[4] = WRAPLOW(step2[4] + step2[11], 8);
  output[5] = WRAPLOW(step2[5] + step2[10], 8);
  output[6] = WRAPLOW(step2[6] + step2[9], 8);
  output[7] = WRAPLOW(step2[7] + step2[8], 8);
  output[8] = WRAPLOW(step2[7] - step2[8], 8);
  output[9] = WRAPLOW(step2[6] - step2[9], 8);
  output[10] = WRAPLOW(step2[5] - step2[10], 8);
  output[11] = WRAPLOW(step2[4] - step2[11], 8);
  output[12] = WRAPLOW(step2[3] - step2[12], 8);
  output[13] = WRAPLOW(step2[2] - step2[13], 8);
  output[14] = WRAPLOW(step2[1] - step2[14], 8);
  output[15] = WRAPLOW(step2[0] - step2[15], 8);
}
543 
/* 2-D 16x16 inverse DCT over all 256 coefficients; the transformed block
 * is rounded (final shift of 6), added to the pixels at `dest` (row pitch
 * `stride`) and clipped. */
void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest,
                             int stride) {
  tran_low_t intermediate[16 * 16];
  tran_low_t col_in[16], col_out[16];
  int r, c;

  // Pass 1: 1-D transform of every row into the intermediate buffer.
  for (r = 0; r < 16; ++r) {
    idct16_c(input + 16 * r, intermediate + 16 * r);
  }

  // Pass 2: 1-D transform of every column, then round, add and clip.
  for (c = 0; c < 16; ++c) {
    for (r = 0; r < 16; ++r) col_in[r] = intermediate[16 * r + c];
    idct16_c(col_in, col_out);
    for (r = 0; r < 16; ++r) {
      dest[r * stride + c] = clip_pixel_add(dest[r * stride + c],
                                            ROUND_POWER_OF_TWO(col_out[r], 6));
    }
  }
}
569 
/* 1-D 16-point inverse ADST. Inputs are read in the permuted order the
 * forward transform produced; the result is written back with a final
 * permutation and sign flips (see the output[] stores at the bottom). */
void iadst16_c(const tran_low_t *input, tran_low_t *output) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
  tran_high_t s9, s10, s11, s12, s13, s14, s15;

  tran_high_t x0 = input[15];
  tran_high_t x1 = input[0];
  tran_high_t x2 = input[13];
  tran_high_t x3 = input[2];
  tran_high_t x4 = input[11];
  tran_high_t x5 = input[4];
  tran_high_t x6 = input[9];
  tran_high_t x7 = input[6];
  tran_high_t x8 = input[7];
  tran_high_t x9 = input[8];
  tran_high_t x10 = input[5];
  tran_high_t x11 = input[10];
  tran_high_t x12 = input[3];
  tran_high_t x13 = input[12];
  tran_high_t x14 = input[1];
  tran_high_t x15 = input[14];

  // Fast path: nothing to transform when every coefficient is zero.
  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
           | x9 | x10 | x11 | x12 | x13 | x14 | x15)) {
    output[0] = output[1] = output[2] = output[3] = output[4]
              = output[5] = output[6] = output[7] = output[8]
              = output[9] = output[10] = output[11] = output[12]
              = output[13] = output[14] = output[15] = 0;
    return;
  }

  // stage 1
  s0 = x0 * cospi_1_64  + x1 * cospi_31_64;
  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
  s2 = x2 * cospi_5_64  + x3 * cospi_27_64;
  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
  s4 = x4 * cospi_9_64  + x5 * cospi_23_64;
  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
  s13 = x12 * cospi_7_64  - x13 * cospi_25_64;
  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
  s15 = x14 * cospi_3_64  - x15 * cospi_29_64;

  x0 = WRAPLOW(dct_const_round_shift(s0 + s8), 8);
  x1 = WRAPLOW(dct_const_round_shift(s1 + s9), 8);
  x2 = WRAPLOW(dct_const_round_shift(s2 + s10), 8);
  x3 = WRAPLOW(dct_const_round_shift(s3 + s11), 8);
  x4 = WRAPLOW(dct_const_round_shift(s4 + s12), 8);
  x5 = WRAPLOW(dct_const_round_shift(s5 + s13), 8);
  x6 = WRAPLOW(dct_const_round_shift(s6 + s14), 8);
  x7 = WRAPLOW(dct_const_round_shift(s7 + s15), 8);
  x8 = WRAPLOW(dct_const_round_shift(s0 - s8), 8);
  x9 = WRAPLOW(dct_const_round_shift(s1 - s9), 8);
  x10 = WRAPLOW(dct_const_round_shift(s2 - s10), 8);
  x11 = WRAPLOW(dct_const_round_shift(s3 - s11), 8);
  x12 = WRAPLOW(dct_const_round_shift(s4 - s12), 8);
  x13 = WRAPLOW(dct_const_round_shift(s5 - s13), 8);
  x14 = WRAPLOW(dct_const_round_shift(s6 - s14), 8);
  x15 = WRAPLOW(dct_const_round_shift(s7 - s15), 8);

  // stage 2
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4;
  s5 = x5;
  s6 = x6;
  s7 = x7;
  s8 =    x8 * cospi_4_64   + x9 * cospi_28_64;
  s9 =    x8 * cospi_28_64  - x9 * cospi_4_64;
  s10 =   x10 * cospi_20_64 + x11 * cospi_12_64;
  s11 =   x10 * cospi_12_64 - x11 * cospi_20_64;
  s12 = - x12 * cospi_28_64 + x13 * cospi_4_64;
  s13 =   x12 * cospi_4_64  + x13 * cospi_28_64;
  s14 = - x14 * cospi_12_64 + x15 * cospi_20_64;
  s15 =   x14 * cospi_20_64 + x15 * cospi_12_64;

  // Unrotated terms (s0..s7) wrap directly; rotated terms need the
  // dct_const_round_shift to undo the cospi scaling.
  x0 = WRAPLOW(s0 + s4, 8);
  x1 = WRAPLOW(s1 + s5, 8);
  x2 = WRAPLOW(s2 + s6, 8);
  x3 = WRAPLOW(s3 + s7, 8);
  x4 = WRAPLOW(s0 - s4, 8);
  x5 = WRAPLOW(s1 - s5, 8);
  x6 = WRAPLOW(s2 - s6, 8);
  x7 = WRAPLOW(s3 - s7, 8);
  x8 = WRAPLOW(dct_const_round_shift(s8 + s12), 8);
  x9 = WRAPLOW(dct_const_round_shift(s9 + s13), 8);
  x10 = WRAPLOW(dct_const_round_shift(s10 + s14), 8);
  x11 = WRAPLOW(dct_const_round_shift(s11 + s15), 8);
  x12 = WRAPLOW(dct_const_round_shift(s8 - s12), 8);
  x13 = WRAPLOW(dct_const_round_shift(s9 - s13), 8);
  x14 = WRAPLOW(dct_const_round_shift(s10 - s14), 8);
  x15 = WRAPLOW(dct_const_round_shift(s11 - s15), 8);

  // stage 3
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4 * cospi_8_64  + x5 * cospi_24_64;
  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
  s6 = - x6 * cospi_24_64 + x7 * cospi_8_64;
  s7 =   x6 * cospi_8_64  + x7 * cospi_24_64;
  s8 = x8;
  s9 = x9;
  s10 = x10;
  s11 = x11;
  s12 = x12 * cospi_8_64  + x13 * cospi_24_64;
  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
  s14 = - x14 * cospi_24_64 + x15 * cospi_8_64;
  s15 =   x14 * cospi_8_64  + x15 * cospi_24_64;

  // check_range guards the unrotated sums before wrapping.
  x0 = WRAPLOW(check_range(s0 + s2), 8);
  x1 = WRAPLOW(check_range(s1 + s3), 8);
  x2 = WRAPLOW(check_range(s0 - s2), 8);
  x3 = WRAPLOW(check_range(s1 - s3), 8);
  x4 = WRAPLOW(dct_const_round_shift(s4 + s6), 8);
  x5 = WRAPLOW(dct_const_round_shift(s5 + s7), 8);
  x6 = WRAPLOW(dct_const_round_shift(s4 - s6), 8);
  x7 = WRAPLOW(dct_const_round_shift(s5 - s7), 8);
  x8 = WRAPLOW(check_range(s8 + s10), 8);
  x9 = WRAPLOW(check_range(s9 + s11), 8);
  x10 = WRAPLOW(check_range(s8 - s10), 8);
  x11 = WRAPLOW(check_range(s9 - s11), 8);
  x12 = WRAPLOW(dct_const_round_shift(s12 + s14), 8);
  x13 = WRAPLOW(dct_const_round_shift(s13 + s15), 8);
  x14 = WRAPLOW(dct_const_round_shift(s12 - s14), 8);
  x15 = WRAPLOW(dct_const_round_shift(s13 - s15), 8);

  // stage 4
  s2 = (- cospi_16_64) * (x2 + x3);
  s3 = cospi_16_64 * (x2 - x3);
  s6 = cospi_16_64 * (x6 + x7);
  s7 = cospi_16_64 * (- x6 + x7);
  s10 = cospi_16_64 * (x10 + x11);
  s11 = cospi_16_64 * (- x10 + x11);
  s14 = (- cospi_16_64) * (x14 + x15);
  s15 = cospi_16_64 * (x14 - x15);

  x2 = WRAPLOW(dct_const_round_shift(s2), 8);
  x3 = WRAPLOW(dct_const_round_shift(s3), 8);
  x6 = WRAPLOW(dct_const_round_shift(s6), 8);
  x7 = WRAPLOW(dct_const_round_shift(s7), 8);
  x10 = WRAPLOW(dct_const_round_shift(s10), 8);
  x11 = WRAPLOW(dct_const_round_shift(s11), 8);
  x14 = WRAPLOW(dct_const_round_shift(s14), 8);
  x15 = WRAPLOW(dct_const_round_shift(s15), 8);

  // Final output permutation with sign flips.
  output[0] = WRAPLOW(x0, 8);
  output[1] = WRAPLOW(-x8, 8);
  output[2] = WRAPLOW(x12, 8);
  output[3] = WRAPLOW(-x4, 8);
  output[4] = WRAPLOW(x6, 8);
  output[5] = WRAPLOW(x14, 8);
  output[6] = WRAPLOW(x10, 8);
  output[7] = WRAPLOW(x2, 8);
  output[8] = WRAPLOW(x3, 8);
  output[9] = WRAPLOW(x11, 8);
  output[10] = WRAPLOW(x15, 8);
  output[11] = WRAPLOW(x7, 8);
  output[12] = WRAPLOW(x5, 8);
  output[13] = WRAPLOW(-x13, 8);
  output[14] = WRAPLOW(x9, 8);
  output[15] = WRAPLOW(-x1, 8);
}
741 
/* 2-D 16x16 inverse DCT for blocks whose non-zero coefficients all lie in
 * the upper-left 4x4 area (eob <= 10 case). Only the first 4 rows are
 * transformed in pass 1; the rest stay zero via the buffer initializer. */
void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest,
                            int stride) {
  tran_low_t intermediate[16 * 16] = { 0 };
  tran_low_t col_in[16], col_out[16];
  int r, c;

  // Pass 1: only the first 4 rows can hold non-zero coefficients.
  for (r = 0; r < 4; ++r) {
    idct16_c(input + 16 * r, intermediate + 16 * r);
  }

  // Pass 2: 1-D transform of every column, then round, add and clip.
  for (c = 0; c < 16; ++c) {
    for (r = 0; r < 16; ++r) col_in[r] = intermediate[16 * r + c];
    idct16_c(col_in, col_out);
    for (r = 0; r < 16; ++r) {
      dest[r * stride + c] = clip_pixel_add(dest[r * stride + c],
                                            ROUND_POWER_OF_TWO(col_out[r], 6));
    }
  }
}
768 
/* DC-only 16x16 inverse DCT: only input[0] is assumed non-zero, so a
 * single constant is added to every pixel of the 16x16 block. */
void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  int r, c;
  tran_high_t add;
  tran_low_t dc = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
  // Two cospi_16_64 rotations: one per 1-D pass of the separable transform.
  dc = WRAPLOW(dct_const_round_shift(dc * cospi_16_64), 8);
  add = ROUND_POWER_OF_TWO(dc, 6);

  for (r = 0; r < 16; ++r) {
    for (c = 0; c < 16; ++c) dest[c] = clip_pixel_add(dest[c], add);
    dest += stride;
  }
}
781 
idct32_c(const tran_low_t * input,tran_low_t * output)782 void idct32_c(const tran_low_t *input, tran_low_t *output) {
783   tran_low_t step1[32], step2[32];
784   tran_high_t temp1, temp2;
785 
786   // stage 1
787   step1[0] = input[0];
788   step1[1] = input[16];
789   step1[2] = input[8];
790   step1[3] = input[24];
791   step1[4] = input[4];
792   step1[5] = input[20];
793   step1[6] = input[12];
794   step1[7] = input[28];
795   step1[8] = input[2];
796   step1[9] = input[18];
797   step1[10] = input[10];
798   step1[11] = input[26];
799   step1[12] = input[6];
800   step1[13] = input[22];
801   step1[14] = input[14];
802   step1[15] = input[30];
803 
804   temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
805   temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
806   step1[16] = WRAPLOW(dct_const_round_shift(temp1), 8);
807   step1[31] = WRAPLOW(dct_const_round_shift(temp2), 8);
808 
809   temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
810   temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
811   step1[17] = WRAPLOW(dct_const_round_shift(temp1), 8);
812   step1[30] = WRAPLOW(dct_const_round_shift(temp2), 8);
813 
814   temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
815   temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
816   step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8);
817   step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8);
818 
819   temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
820   temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
821   step1[19] = WRAPLOW(dct_const_round_shift(temp1), 8);
822   step1[28] = WRAPLOW(dct_const_round_shift(temp2), 8);
823 
824   temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
825   temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
826   step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8);
827   step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8);
828 
829   temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
830   temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
831   step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
832   step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
833 
834   temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
835   temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
836   step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8);
837   step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8);
838 
839   temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
840   temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
841   step1[23] = WRAPLOW(dct_const_round_shift(temp1), 8);
842   step1[24] = WRAPLOW(dct_const_round_shift(temp2), 8);
843 
844   // stage 2
845   step2[0] = step1[0];
846   step2[1] = step1[1];
847   step2[2] = step1[2];
848   step2[3] = step1[3];
849   step2[4] = step1[4];
850   step2[5] = step1[5];
851   step2[6] = step1[6];
852   step2[7] = step1[7];
853 
854   temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
855   temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
856   step2[8] = WRAPLOW(dct_const_round_shift(temp1), 8);
857   step2[15] = WRAPLOW(dct_const_round_shift(temp2), 8);
858 
859   temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
860   temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
861   step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
862   step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
863 
864   temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
865   temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
866   step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
867   step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
868 
869   temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
870   temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
871   step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
872   step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
873 
874   step2[16] = WRAPLOW(step1[16] + step1[17], 8);
875   step2[17] = WRAPLOW(step1[16] - step1[17], 8);
876   step2[18] = WRAPLOW(-step1[18] + step1[19], 8);
877   step2[19] = WRAPLOW(step1[18] + step1[19], 8);
878   step2[20] = WRAPLOW(step1[20] + step1[21], 8);
879   step2[21] = WRAPLOW(step1[20] - step1[21], 8);
880   step2[22] = WRAPLOW(-step1[22] + step1[23], 8);
881   step2[23] = WRAPLOW(step1[22] + step1[23], 8);
882   step2[24] = WRAPLOW(step1[24] + step1[25], 8);
883   step2[25] = WRAPLOW(step1[24] - step1[25], 8);
884   step2[26] = WRAPLOW(-step1[26] + step1[27], 8);
885   step2[27] = WRAPLOW(step1[26] + step1[27], 8);
886   step2[28] = WRAPLOW(step1[28] + step1[29], 8);
887   step2[29] = WRAPLOW(step1[28] - step1[29], 8);
888   step2[30] = WRAPLOW(-step1[30] + step1[31], 8);
889   step2[31] = WRAPLOW(step1[30] + step1[31], 8);
890 
891   // stage 3
892   step1[0] = step2[0];
893   step1[1] = step2[1];
894   step1[2] = step2[2];
895   step1[3] = step2[3];
896 
897   temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
898   temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
899   step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8);
900   step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8);
901   temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
902   temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
903   step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
904   step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
905 
906   step1[8] = WRAPLOW(step2[8] + step2[9], 8);
907   step1[9] = WRAPLOW(step2[8] - step2[9], 8);
908   step1[10] = WRAPLOW(-step2[10] + step2[11], 8);
909   step1[11] = WRAPLOW(step2[10] + step2[11], 8);
910   step1[12] = WRAPLOW(step2[12] + step2[13], 8);
911   step1[13] = WRAPLOW(step2[12] - step2[13], 8);
912   step1[14] = WRAPLOW(-step2[14] + step2[15], 8);
913   step1[15] = WRAPLOW(step2[14] + step2[15], 8);
914 
915   step1[16] = step2[16];
916   step1[31] = step2[31];
917   temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
918   temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
919   step1[17] = WRAPLOW(dct_const_round_shift(temp1), 8);
920   step1[30] = WRAPLOW(dct_const_round_shift(temp2), 8);
921   temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
922   temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
923   step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8);
924   step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8);
925   step1[19] = step2[19];
926   step1[20] = step2[20];
927   temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
928   temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
929   step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
930   step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
931   temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
932   temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
933   step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8);
934   step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8);
935   step1[23] = step2[23];
936   step1[24] = step2[24];
937   step1[27] = step2[27];
938   step1[28] = step2[28];
939 
940   // stage 4
941   temp1 = (step1[0] + step1[1]) * cospi_16_64;
942   temp2 = (step1[0] - step1[1]) * cospi_16_64;
943   step2[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
944   step2[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
945   temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
946   temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
947   step2[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
948   step2[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
949   step2[4] = WRAPLOW(step1[4] + step1[5], 8);
950   step2[5] = WRAPLOW(step1[4] - step1[5], 8);
951   step2[6] = WRAPLOW(-step1[6] + step1[7], 8);
952   step2[7] = WRAPLOW(step1[6] + step1[7], 8);
953 
954   step2[8] = step1[8];
955   step2[15] = step1[15];
956   temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
957   temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
958   step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
959   step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
960   temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
961   temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
962   step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
963   step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
964   step2[11] = step1[11];
965   step2[12] = step1[12];
966 
967   step2[16] = WRAPLOW(step1[16] + step1[19], 8);
968   step2[17] = WRAPLOW(step1[17] + step1[18], 8);
969   step2[18] = WRAPLOW(step1[17] - step1[18], 8);
970   step2[19] = WRAPLOW(step1[16] - step1[19], 8);
971   step2[20] = WRAPLOW(-step1[20] + step1[23], 8);
972   step2[21] = WRAPLOW(-step1[21] + step1[22], 8);
973   step2[22] = WRAPLOW(step1[21] + step1[22], 8);
974   step2[23] = WRAPLOW(step1[20] + step1[23], 8);
975 
976   step2[24] = WRAPLOW(step1[24] + step1[27], 8);
977   step2[25] = WRAPLOW(step1[25] + step1[26], 8);
978   step2[26] = WRAPLOW(step1[25] - step1[26], 8);
979   step2[27] = WRAPLOW(step1[24] - step1[27], 8);
980   step2[28] = WRAPLOW(-step1[28] + step1[31], 8);
981   step2[29] = WRAPLOW(-step1[29] + step1[30], 8);
982   step2[30] = WRAPLOW(step1[29] + step1[30], 8);
983   step2[31] = WRAPLOW(step1[28] + step1[31], 8);
984 
985   // stage 5
986   step1[0] = WRAPLOW(step2[0] + step2[3], 8);
987   step1[1] = WRAPLOW(step2[1] + step2[2], 8);
988   step1[2] = WRAPLOW(step2[1] - step2[2], 8);
989   step1[3] = WRAPLOW(step2[0] - step2[3], 8);
990   step1[4] = step2[4];
991   temp1 = (step2[6] - step2[5]) * cospi_16_64;
992   temp2 = (step2[5] + step2[6]) * cospi_16_64;
993   step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
994   step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
995   step1[7] = step2[7];
996 
997   step1[8] = WRAPLOW(step2[8] + step2[11], 8);
998   step1[9] = WRAPLOW(step2[9] + step2[10], 8);
999   step1[10] = WRAPLOW(step2[9] - step2[10], 8);
1000   step1[11] = WRAPLOW(step2[8] - step2[11], 8);
1001   step1[12] = WRAPLOW(-step2[12] + step2[15], 8);
1002   step1[13] = WRAPLOW(-step2[13] + step2[14], 8);
1003   step1[14] = WRAPLOW(step2[13] + step2[14], 8);
1004   step1[15] = WRAPLOW(step2[12] + step2[15], 8);
1005 
1006   step1[16] = step2[16];
1007   step1[17] = step2[17];
1008   temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
1009   temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
1010   step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8);
1011   step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8);
1012   temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
1013   temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
1014   step1[19] = WRAPLOW(dct_const_round_shift(temp1), 8);
1015   step1[28] = WRAPLOW(dct_const_round_shift(temp2), 8);
1016   temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
1017   temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
1018   step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8);
1019   step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8);
1020   temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
1021   temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
1022   step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
1023   step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
1024   step1[22] = step2[22];
1025   step1[23] = step2[23];
1026   step1[24] = step2[24];
1027   step1[25] = step2[25];
1028   step1[30] = step2[30];
1029   step1[31] = step2[31];
1030 
1031   // stage 6
1032   step2[0] = WRAPLOW(step1[0] + step1[7], 8);
1033   step2[1] = WRAPLOW(step1[1] + step1[6], 8);
1034   step2[2] = WRAPLOW(step1[2] + step1[5], 8);
1035   step2[3] = WRAPLOW(step1[3] + step1[4], 8);
1036   step2[4] = WRAPLOW(step1[3] - step1[4], 8);
1037   step2[5] = WRAPLOW(step1[2] - step1[5], 8);
1038   step2[6] = WRAPLOW(step1[1] - step1[6], 8);
1039   step2[7] = WRAPLOW(step1[0] - step1[7], 8);
1040   step2[8] = step1[8];
1041   step2[9] = step1[9];
1042   temp1 = (-step1[10] + step1[13]) * cospi_16_64;
1043   temp2 = (step1[10] + step1[13]) * cospi_16_64;
1044   step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
1045   step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
1046   temp1 = (-step1[11] + step1[12]) * cospi_16_64;
1047   temp2 = (step1[11] + step1[12]) * cospi_16_64;
1048   step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
1049   step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
1050   step2[14] = step1[14];
1051   step2[15] = step1[15];
1052 
1053   step2[16] = WRAPLOW(step1[16] + step1[23], 8);
1054   step2[17] = WRAPLOW(step1[17] + step1[22], 8);
1055   step2[18] = WRAPLOW(step1[18] + step1[21], 8);
1056   step2[19] = WRAPLOW(step1[19] + step1[20], 8);
1057   step2[20] = WRAPLOW(step1[19] - step1[20], 8);
1058   step2[21] = WRAPLOW(step1[18] - step1[21], 8);
1059   step2[22] = WRAPLOW(step1[17] - step1[22], 8);
1060   step2[23] = WRAPLOW(step1[16] - step1[23], 8);
1061 
1062   step2[24] = WRAPLOW(-step1[24] + step1[31], 8);
1063   step2[25] = WRAPLOW(-step1[25] + step1[30], 8);
1064   step2[26] = WRAPLOW(-step1[26] + step1[29], 8);
1065   step2[27] = WRAPLOW(-step1[27] + step1[28], 8);
1066   step2[28] = WRAPLOW(step1[27] + step1[28], 8);
1067   step2[29] = WRAPLOW(step1[26] + step1[29], 8);
1068   step2[30] = WRAPLOW(step1[25] + step1[30], 8);
1069   step2[31] = WRAPLOW(step1[24] + step1[31], 8);
1070 
1071   // stage 7
1072   step1[0] = WRAPLOW(step2[0] + step2[15], 8);
1073   step1[1] = WRAPLOW(step2[1] + step2[14], 8);
1074   step1[2] = WRAPLOW(step2[2] + step2[13], 8);
1075   step1[3] = WRAPLOW(step2[3] + step2[12], 8);
1076   step1[4] = WRAPLOW(step2[4] + step2[11], 8);
1077   step1[5] = WRAPLOW(step2[5] + step2[10], 8);
1078   step1[6] = WRAPLOW(step2[6] + step2[9], 8);
1079   step1[7] = WRAPLOW(step2[7] + step2[8], 8);
1080   step1[8] = WRAPLOW(step2[7] - step2[8], 8);
1081   step1[9] = WRAPLOW(step2[6] - step2[9], 8);
1082   step1[10] = WRAPLOW(step2[5] - step2[10], 8);
1083   step1[11] = WRAPLOW(step2[4] - step2[11], 8);
1084   step1[12] = WRAPLOW(step2[3] - step2[12], 8);
1085   step1[13] = WRAPLOW(step2[2] - step2[13], 8);
1086   step1[14] = WRAPLOW(step2[1] - step2[14], 8);
1087   step1[15] = WRAPLOW(step2[0] - step2[15], 8);
1088 
1089   step1[16] = step2[16];
1090   step1[17] = step2[17];
1091   step1[18] = step2[18];
1092   step1[19] = step2[19];
1093   temp1 = (-step2[20] + step2[27]) * cospi_16_64;
1094   temp2 = (step2[20] + step2[27]) * cospi_16_64;
1095   step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8);
1096   step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8);
1097   temp1 = (-step2[21] + step2[26]) * cospi_16_64;
1098   temp2 = (step2[21] + step2[26]) * cospi_16_64;
1099   step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
1100   step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
1101   temp1 = (-step2[22] + step2[25]) * cospi_16_64;
1102   temp2 = (step2[22] + step2[25]) * cospi_16_64;
1103   step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8);
1104   step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8);
1105   temp1 = (-step2[23] + step2[24]) * cospi_16_64;
1106   temp2 = (step2[23] + step2[24]) * cospi_16_64;
1107   step1[23] = WRAPLOW(dct_const_round_shift(temp1), 8);
1108   step1[24] = WRAPLOW(dct_const_round_shift(temp2), 8);
1109   step1[28] = step2[28];
1110   step1[29] = step2[29];
1111   step1[30] = step2[30];
1112   step1[31] = step2[31];
1113 
1114   // final stage
1115   output[0] = WRAPLOW(step1[0] + step1[31], 8);
1116   output[1] = WRAPLOW(step1[1] + step1[30], 8);
1117   output[2] = WRAPLOW(step1[2] + step1[29], 8);
1118   output[3] = WRAPLOW(step1[3] + step1[28], 8);
1119   output[4] = WRAPLOW(step1[4] + step1[27], 8);
1120   output[5] = WRAPLOW(step1[5] + step1[26], 8);
1121   output[6] = WRAPLOW(step1[6] + step1[25], 8);
1122   output[7] = WRAPLOW(step1[7] + step1[24], 8);
1123   output[8] = WRAPLOW(step1[8] + step1[23], 8);
1124   output[9] = WRAPLOW(step1[9] + step1[22], 8);
1125   output[10] = WRAPLOW(step1[10] + step1[21], 8);
1126   output[11] = WRAPLOW(step1[11] + step1[20], 8);
1127   output[12] = WRAPLOW(step1[12] + step1[19], 8);
1128   output[13] = WRAPLOW(step1[13] + step1[18], 8);
1129   output[14] = WRAPLOW(step1[14] + step1[17], 8);
1130   output[15] = WRAPLOW(step1[15] + step1[16], 8);
1131   output[16] = WRAPLOW(step1[15] - step1[16], 8);
1132   output[17] = WRAPLOW(step1[14] - step1[17], 8);
1133   output[18] = WRAPLOW(step1[13] - step1[18], 8);
1134   output[19] = WRAPLOW(step1[12] - step1[19], 8);
1135   output[20] = WRAPLOW(step1[11] - step1[20], 8);
1136   output[21] = WRAPLOW(step1[10] - step1[21], 8);
1137   output[22] = WRAPLOW(step1[9] - step1[22], 8);
1138   output[23] = WRAPLOW(step1[8] - step1[23], 8);
1139   output[24] = WRAPLOW(step1[7] - step1[24], 8);
1140   output[25] = WRAPLOW(step1[6] - step1[25], 8);
1141   output[26] = WRAPLOW(step1[5] - step1[26], 8);
1142   output[27] = WRAPLOW(step1[4] - step1[27], 8);
1143   output[28] = WRAPLOW(step1[3] - step1[28], 8);
1144   output[29] = WRAPLOW(step1[2] - step1[29], 8);
1145   output[30] = WRAPLOW(step1[1] - step1[30], 8);
1146   output[31] = WRAPLOW(step1[0] - step1[31], 8);
1147 }
1148 
// Full 32x32 inverse DCT: transform rows, then columns, then add the
// rounded residual to 'dest' with pixel clipping.  All-zero rows are
// detected up front so the expensive 1-D transform can be skipped.
void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
                              int stride) {
  tran_low_t out[32 * 32];
  tran_low_t *outptr = out;
  int i, j;
  tran_low_t temp_in[32], temp_out[32];

  // Rows
  for (i = 0; i < 32; ++i) {
    // OR all 32 coefficients of the row together to test for an all-zero
    // row.  Accumulate in tran_low_t, not int16_t: with
    // CONFIG_VP9_HIGHBITDEPTH tran_low_t is 32 bits wide, and narrowing the
    // OR to 16 bits could drop the only non-zero bits of a coefficient,
    // wrongly zeroing the whole row.
    tran_low_t zero_coeff = 0;
    for (j = 0; j < 32; ++j)
      zero_coeff |= input[j];

    if (zero_coeff)
      idct32_c(input, outptr);
    else
      memset(outptr, 0, sizeof(tran_low_t) * 32);
    input += 32;
    outptr += 32;
  }

  // Columns
  for (i = 0; i < 32; ++i) {
    for (j = 0; j < 32; ++j)
      temp_in[j] = out[j * 32 + i];
    idct32_c(temp_in, temp_out);
    for (j = 0; j < 32; ++j) {
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
    }
  }
}
1187 
// 32x32 inverse DCT for blocks with at most 34 non-zero coefficients.
// All energy lives in the upper-left 8x8 corner, so only eight rows need
// a row transform; the column pass and final add/clip cover the full block.
void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest,
                            int stride) {
  // Zero-filled scratch: rows 8..31 are never written by the row pass.
  tran_low_t out[32 * 32] = {0};
  tran_low_t col_in[32], col_out[32];
  int r, c;

  // Row transforms: only the first 8 rows can hold non-zero coefficients.
  for (r = 0; r < 8; ++r)
    idct32_c(input + 32 * r, out + 32 * r);

  // Column transforms, then rounded, clipped accumulation into 'dest'.
  for (c = 0; c < 32; ++c) {
    for (r = 0; r < 32; ++r)
      col_in[r] = out[32 * r + c];
    idct32_c(col_in, col_out);
    for (r = 0; r < 32; ++r) {
      uint8_t *px = dest + r * stride + c;
      *px = clip_pixel_add(*px, ROUND_POWER_OF_TWO(col_out[r], 6));
    }
  }
}
1214 
// 32x32 inverse DCT for a DC-only block: both 1-D passes collapse to a
// cospi_16_64 scale of input[0], so compute the constant residual once and
// add it to every pixel with clipping.
void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  int r, c;
  tran_high_t dc;
  // Apply the row-pass and column-pass gains (with intermediate rounding),
  // then the final inverse-transform rounding shift.
  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
  dc = ROUND_POWER_OF_TWO(out, 6);

  for (r = 0; r < 32; ++r, dest += stride) {
    for (c = 0; c < 32; ++c)
      dest[c] = clip_pixel_add(dest[c], dc);
  }
}
1229 
1230 #if CONFIG_VP9_HIGHBITDEPTH
// High-bit-depth 4x4 inverse Walsh-Hadamard transform; the reconstructed
// residual is added to the (uint16_t) destination with bit-depth clipping.
void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
                                 int stride, int bd) {
  /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
     0.5 shifts per pixel. */
  int i;
  tran_low_t output[16];  // row-transformed 4x4 coefficients
  tran_high_t a1, b1, c1, d1, e1;
  const tran_low_t *ip = input;
  tran_low_t *op = output;
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  // Pass 1: inverse WHT on each row, written to 'output'.
  for (i = 0; i < 4; i++) {
    // Note the input permutation: coefficients 1 and 3 load into c1 and b1.
    a1 = ip[0] >> UNIT_QUANT_SHIFT;
    c1 = ip[1] >> UNIT_QUANT_SHIFT;
    d1 = ip[2] >> UNIT_QUANT_SHIFT;
    b1 = ip[3] >> UNIT_QUANT_SHIFT;
    a1 += c1;
    d1 -= b1;
    e1 = (a1 - d1) >> 1;  // shared lifting term for the two middle outputs
    b1 = e1 - b1;
    c1 = e1 - c1;
    a1 -= b1;
    d1 += c1;
    op[0] = WRAPLOW(a1, bd);
    op[1] = WRAPLOW(b1, bd);
    op[2] = WRAPLOW(c1, bd);
    op[3] = WRAPLOW(d1, bd);
    ip += 4;
    op += 4;
  }

  // Pass 2: inverse WHT down each column; results are added to the
  // destination pixels with bit-depth-aware clipping.
  ip = output;
  for (i = 0; i < 4; i++) {
    a1 = ip[4 * 0];
    c1 = ip[4 * 1];
    d1 = ip[4 * 2];
    b1 = ip[4 * 3];
    a1 += c1;
    d1 -= b1;
    e1 = (a1 - d1) >> 1;
    b1 = e1 - b1;
    c1 = e1 - c1;
    a1 -= b1;
    d1 += c1;
    dest[stride * 0] = highbd_clip_pixel_add(dest[stride * 0], a1, bd);
    dest[stride * 1] = highbd_clip_pixel_add(dest[stride * 1], b1, bd);
    dest[stride * 2] = highbd_clip_pixel_add(dest[stride * 2], c1, bd);
    dest[stride * 3] = highbd_clip_pixel_add(dest[stride * 3], d1, bd);

    ip++;
    dest++;
  }
}
1284 
// High-bit-depth 4x4 inverse WHT for a DC-only block.  The DC value is
// split into a rounded-up part for position 0 and a half part shared by
// the remaining three positions, in both passes.
void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8,
                                int dest_stride, int bd) {
  int i;
  tran_high_t dc, half;
  tran_low_t row[4];
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
  // bd may be unused when WRAPLOW compiles to a bit-depth-independent form;
  // the cast keeps that configuration warning-free.
  (void) bd;

  // Row pass: only the DC coefficient is present.
  dc = in[0] >> UNIT_QUANT_SHIFT;
  half = dc >> 1;
  dc -= half;
  row[0] = WRAPLOW(dc, bd);
  row[1] = row[2] = row[3] = WRAPLOW(half, bd);

  // Column pass: apply the same split down each column, then add to the
  // destination with bit-depth-aware clipping.
  for (i = 0; i < 4; i++) {
    const tran_high_t e1 = row[i] >> 1;
    const tran_high_t a1 = row[i] - e1;
    dest[dest_stride * 0] =
        highbd_clip_pixel_add(dest[dest_stride * 0], a1, bd);
    dest[dest_stride * 1] =
        highbd_clip_pixel_add(dest[dest_stride * 1], e1, bd);
    dest[dest_stride * 2] =
        highbd_clip_pixel_add(dest[dest_stride * 2], e1, bd);
    dest[dest_stride * 3] =
        highbd_clip_pixel_add(dest[dest_stride * 3], e1, bd);
    dest++;
  }
}
1317 
// High-bit-depth 4-point 1-D inverse DCT: a cospi_16_64 butterfly on the
// even inputs (0, 2), a cospi_24/8 rotation on the odd inputs (1, 3), and
// a final recombination stage.
void vpx_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_high_t even0, even1, odd0, odd1;
  tran_low_t s0, s1, s2, s3;
  // bd may be unused when the wrap/round helpers compile to
  // bit-depth-independent forms; the cast silences that warning.
  (void) bd;

  // stage 1
  even0 = (input[0] + input[2]) * cospi_16_64;
  even1 = (input[0] - input[2]) * cospi_16_64;
  s0 = WRAPLOW(highbd_dct_const_round_shift(even0, bd), bd);
  s1 = WRAPLOW(highbd_dct_const_round_shift(even1, bd), bd);
  odd0 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
  odd1 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
  s2 = WRAPLOW(highbd_dct_const_round_shift(odd0, bd), bd);
  s3 = WRAPLOW(highbd_dct_const_round_shift(odd1, bd), bd);

  // stage 2
  output[0] = WRAPLOW(s0 + s3, bd);
  output[1] = WRAPLOW(s1 + s2, bd);
  output[2] = WRAPLOW(s1 - s2, bd);
  output[3] = WRAPLOW(s0 - s3, bd);
}
1338 
// High-bit-depth 4x4 inverse DCT: row pass, column pass, then rounded
// accumulation into the (uint16_t) destination with bit-depth clipping.
void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
                                 int stride, int bd) {
  tran_low_t out[4 * 4];
  tran_low_t col_in[4], col_out[4];
  int r, c;
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  // Pass 1: 1-D inverse DCT on each row.
  for (r = 0; r < 4; ++r)
    vpx_highbd_idct4_c(input + 4 * r, out + 4 * r, bd);

  // Pass 2: 1-D inverse DCT down each column, then round, add and clip.
  for (c = 0; c < 4; ++c) {
    for (r = 0; r < 4; ++r)
      col_in[r] = out[4 * r + c];
    vpx_highbd_idct4_c(col_in, col_out, bd);
    for (r = 0; r < 4; ++r) {
      dest[r * stride + c] = highbd_clip_pixel_add(
          dest[r * stride + c], ROUND_POWER_OF_TWO(col_out[r], 4), bd);
    }
  }
}
1365 
// High-bit-depth 4x4 inverse DCT for a DC-only block: both 1-D passes
// reduce to a cospi_16_64 scale, so the constant residual is computed once
// and added to all 16 pixels.
void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8,
                                int dest_stride, int bd) {
  int r, c;
  tran_high_t dc;
  tran_low_t out = WRAPLOW(
      highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
  dc = ROUND_POWER_OF_TWO(out, 4);

  for (r = 0; r < 4; r++) {
    for (c = 0; c < 4; c++)
      dest[c] = highbd_clip_pixel_add(dest[c], dc, bd);
    dest += dest_stride;
  }
}
1385 
// High-bit-depth 8-point 1-D inverse DCT.  The even half (inputs 0,2,4,6)
// reuses the 4-point transform; the odd half (inputs 1,3,5,7) goes through
// two butterfly stages before the final recombination.
void vpx_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_low_t step1[8], step2[8];
  tran_high_t temp1, temp2;
  // stage 1 -- even inputs pass through (note the 0,4,2,6 load order, which
  // matches the 4-point transform's expected layout); odd inputs are rotated
  // by the cospi_28/4 and cospi_12/20 pairs.
  step1[0] = input[0];
  step1[2] = input[4];
  step1[1] = input[2];
  step1[3] = input[6];
  temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
  temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
  step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
  temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);

  // stage 2 & stage 3 - even half: the 4-point transform runs in place on
  // step1[0..3].
  vpx_highbd_idct4_c(step1, step1, bd);

  // stage 2 - odd half
  step2[4] = WRAPLOW(step1[4] + step1[5], bd);
  step2[5] = WRAPLOW(step1[4] - step1[5], bd);
  step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
  step2[7] = WRAPLOW(step1[6] + step1[7], bd);

  // stage 3 - odd half: middle pair rotated by cospi_16_64.
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  step1[7] = step2[7];

  // stage 4: final butterfly joining the even and odd halves.
  output[0] = WRAPLOW(step1[0] + step1[7], bd);
  output[1] = WRAPLOW(step1[1] + step1[6], bd);
  output[2] = WRAPLOW(step1[2] + step1[5], bd);
  output[3] = WRAPLOW(step1[3] + step1[4], bd);
  output[4] = WRAPLOW(step1[3] - step1[4], bd);
  output[5] = WRAPLOW(step1[2] - step1[5], bd);
  output[6] = WRAPLOW(step1[1] - step1[6], bd);
  output[7] = WRAPLOW(step1[0] - step1[7], bd);
}
1430 
// High-bit-depth full 8x8 inverse DCT: row pass, column pass, then rounded
// accumulation into the (uint16_t) destination with bit-depth clipping.
void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
                                 int stride, int bd) {
  tran_low_t out[8 * 8];
  tran_low_t col_in[8], col_out[8];
  int r, c;
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  // Pass 1: 1-D inverse DCT on each of the eight rows.
  for (r = 0; r < 8; ++r)
    vpx_highbd_idct8_c(input + 8 * r, out + 8 * r, bd);

  // Pass 2: 1-D inverse DCT down each column, then round, add and clip.
  for (c = 0; c < 8; ++c) {
    for (r = 0; r < 8; ++r)
      col_in[r] = out[8 * r + c];
    vpx_highbd_idct8_c(col_in, col_out, bd);
    for (r = 0; r < 8; ++r) {
      dest[r * stride + c] = highbd_clip_pixel_add(
          dest[r * stride + c], ROUND_POWER_OF_TWO(col_out[r], 5), bd);
    }
  }
}
1457 
// High-bit-depth 8x8 inverse DCT for a DC-only block: the constant residual
// is computed once and added to all 64 pixels with bit-depth clipping.
void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8,
                                int stride, int bd) {
  int r, c;
  tran_high_t dc;
  tran_low_t out = WRAPLOW(
      highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
  dc = ROUND_POWER_OF_TWO(out, 5);
  for (r = 0; r < 8; ++r, dest += stride) {
    for (c = 0; c < 8; ++c)
      dest[c] = highbd_clip_pixel_add(dest[c], dc, bd);
  }
}
1473 
// High-bit-depth 4-point 1-D inverse ADST built on sinpi rotations.
void vpx_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;

  tran_low_t x0 = input[0];
  tran_low_t x1 = input[1];
  tran_low_t x2 = input[2];
  tran_low_t x3 = input[3];
  // bd may be unused when the wrap/round helpers compile to
  // bit-depth-independent forms; the cast silences that warning.
  (void) bd;

  // All-zero input maps to all-zero output; skip the arithmetic.
  if (!(x0 | x1 | x2 | x3)) {
    memset(output, 0, 4 * sizeof(*output));
    return;
  }

  s0 = sinpi_1_9 * x0;
  s1 = sinpi_2_9 * x0;
  s2 = sinpi_3_9 * x1;
  s3 = sinpi_4_9 * x2;
  s4 = sinpi_1_9 * x2;
  s5 = sinpi_2_9 * x3;
  s6 = sinpi_4_9 * x3;
  s7 = (tran_high_t)(x0 - x2 + x3);

  s0 = s0 + s3 + s5;
  s1 = s1 - s4 - s6;
  s3 = s2;              // preserve sinpi_3_9 * x1 before s2 is overwritten
  s2 = sinpi_3_9 * s7;

  // 1-D transform scaling factor is sqrt(2).
  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
  // + 1b (addition) = 29b.
  // Hence the output bit depth is 15b.
  output[0] = WRAPLOW(highbd_dct_const_round_shift(s0 + s3, bd), bd);
  output[1] = WRAPLOW(highbd_dct_const_round_shift(s1 + s3, bd), bd);
  output[2] = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd);
  output[3] = WRAPLOW(highbd_dct_const_round_shift(s0 + s1 - s3, bd), bd);
}
1511 
// High-bit-depth 8-point 1-D inverse ADST: three butterfly/rotation stages
// followed by a sign-and-permutation output step.
void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;

  // Inputs are consumed in a permuted order specific to the ADST lattice.
  tran_low_t x0 = input[7];
  tran_low_t x1 = input[0];
  tran_low_t x2 = input[5];
  tran_low_t x3 = input[2];
  tran_low_t x4 = input[3];
  tran_low_t x5 = input[4];
  tran_low_t x6 = input[1];
  tran_low_t x7 = input[6];
  // bd may be unused when the wrap/round helpers compile to
  // bit-depth-independent forms; the cast silences that warning.
  (void) bd;

  // All-zero input maps to all-zero output; skip the arithmetic.
  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
    memset(output, 0, 8 * sizeof(*output));
    return;
  }

  // stage 1: four cospi rotations on input pairs.
  s0 = cospi_2_64  * x0 + cospi_30_64 * x1;
  s1 = cospi_30_64 * x0 - cospi_2_64  * x1;
  s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
  s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
  s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
  s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
  s6 = cospi_26_64 * x6 + cospi_6_64  * x7;
  s7 = cospi_6_64  * x6 - cospi_26_64 * x7;

  x0 = WRAPLOW(highbd_dct_const_round_shift(s0 + s4, bd), bd);
  x1 = WRAPLOW(highbd_dct_const_round_shift(s1 + s5, bd), bd);
  x2 = WRAPLOW(highbd_dct_const_round_shift(s2 + s6, bd), bd);
  x3 = WRAPLOW(highbd_dct_const_round_shift(s3 + s7, bd), bd);
  x4 = WRAPLOW(highbd_dct_const_round_shift(s0 - s4, bd), bd);
  x5 = WRAPLOW(highbd_dct_const_round_shift(s1 - s5, bd), bd);
  x6 = WRAPLOW(highbd_dct_const_round_shift(s2 - s6, bd), bd);
  x7 = WRAPLOW(highbd_dct_const_round_shift(s3 - s7, bd), bd);

  // stage 2: x0..x3 pass through; x4..x7 are rotated by cospi_8/24.
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 =  cospi_8_64  * x4 + cospi_24_64 * x5;
  s5 =  cospi_24_64 * x4 - cospi_8_64  * x5;
  s6 = -cospi_24_64 * x6 + cospi_8_64  * x7;
  s7 =  cospi_8_64  * x6 + cospi_24_64 * x7;

  x0 = WRAPLOW(s0 + s2, bd);
  x1 = WRAPLOW(s1 + s3, bd);
  x2 = WRAPLOW(s0 - s2, bd);
  x3 = WRAPLOW(s1 - s3, bd);
  x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s6, bd), bd);
  x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s7, bd), bd);
  x6 = WRAPLOW(highbd_dct_const_round_shift(s4 - s6, bd), bd);
  x7 = WRAPLOW(highbd_dct_const_round_shift(s5 - s7, bd), bd);

  // stage 3: cospi_16_64 rotations on the two middle pairs.
  s2 = cospi_16_64 * (x2 + x3);
  s3 = cospi_16_64 * (x2 - x3);
  s6 = cospi_16_64 * (x6 + x7);
  s7 = cospi_16_64 * (x6 - x7);

  x2 = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd);
  x3 = WRAPLOW(highbd_dct_const_round_shift(s3, bd), bd);
  x6 = WRAPLOW(highbd_dct_const_round_shift(s6, bd), bd);
  x7 = WRAPLOW(highbd_dct_const_round_shift(s7, bd), bd);

  // Output permutation with alternating sign flips.
  output[0] = WRAPLOW(x0, bd);
  output[1] = WRAPLOW(-x4, bd);
  output[2] = WRAPLOW(x6, bd);
  output[3] = WRAPLOW(-x2, bd);
  output[4] = WRAPLOW(x3, bd);
  output[5] = WRAPLOW(-x7, bd);
  output[6] = WRAPLOW(x5, bd);
  output[7] = WRAPLOW(-x1, bd);
}
1588 
// High-bit-depth 8x8 inverse DCT for blocks with at most 10 non-zero
// coefficients: only the first four rows can hold energy, so the row pass
// is limited to them; the column pass covers all eight columns.
void vpx_highbd_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest8,
                                 int stride, int bd) {
  // Zero-filled scratch: rows 4..7 are never written by the row pass.
  tran_low_t out[8 * 8] = { 0 };
  tran_low_t col_in[8], col_out[8];
  int r, c;
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  // Row pass: only the first four rows contain non-zero coefficients.
  for (r = 0; r < 4; ++r)
    vpx_highbd_idct8_c(input + 8 * r, out + 8 * r, bd);

  // Column pass, then rounded, clipped accumulation into the destination.
  for (c = 0; c < 8; ++c) {
    for (r = 0; r < 8; ++r)
      col_in[r] = out[8 * r + c];
    vpx_highbd_idct8_c(col_in, col_out, bd);
    for (r = 0; r < 8; ++r) {
      dest[r * stride + c] = highbd_clip_pixel_add(
          dest[r * stride + c], ROUND_POWER_OF_TWO(col_out[r], 5), bd);
    }
  }
}
1615 
/* 16-point 1-D inverse DCT, high-bitdepth reference path.
 * Reads 16 coefficients from |input| and writes 16 values to |output|.
 * The work is a 7-stage butterfly: each stage either copies terms, forms
 * clamped sum/difference pairs (WRAPLOW), or rotates a pair by a cospi
 * constant followed by a rounding shift (highbd_dct_const_round_shift).
 * |bd| is consumed inside those macros; the (void) cast below silences
 * unused-parameter warnings in configurations where the macros ignore it. */
void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_low_t step1[16], step2[16];
  tran_high_t temp1, temp2;
  (void) bd;

  // stage 1
  // Input reordering.  The n/2 indices keep this table textually parallel
  // to the 32-point version (which uses input[0], input[16], input[8], ...);
  // here they select the same pattern halved.
  step1[0] = input[0/2];
  step1[1] = input[16/2];
  step1[2] = input[8/2];
  step1[3] = input[24/2];
  step1[4] = input[4/2];
  step1[5] = input[20/2];
  step1[6] = input[12/2];
  step1[7] = input[28/2];
  step1[8] = input[2/2];
  step1[9] = input[18/2];
  step1[10] = input[10/2];
  step1[11] = input[26/2];
  step1[12] = input[6/2];
  step1[13] = input[22/2];
  step1[14] = input[14/2];
  step1[15] = input[30/2];

  // stage 2
  // Lower half passes through; upper half (8..15) is rotated pairwise by
  // the odd cospi constants.
  step2[0] = step1[0];
  step2[1] = step1[1];
  step2[2] = step1[2];
  step2[3] = step1[3];
  step2[4] = step1[4];
  step2[5] = step1[5];
  step2[6] = step1[6];
  step2[7] = step1[7];

  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
  step2[8] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[15] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);

  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
  step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);

  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);

  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
  step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);

  // stage 3
  // Rotate 4..7; butterfly (sum/difference) the 8..15 pairs from stage 2.
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[2];
  step1[3] = step2[3];

  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
  step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);

  step1[8] = WRAPLOW(step2[8] + step2[9], bd);
  step1[9] = WRAPLOW(step2[8] - step2[9], bd);
  step1[10] = WRAPLOW(-step2[10] + step2[11], bd);
  step1[11] = WRAPLOW(step2[10] + step2[11], bd);
  step1[12] = WRAPLOW(step2[12] + step2[13], bd);
  step1[13] = WRAPLOW(step2[12] - step2[13], bd);
  step1[14] = WRAPLOW(-step2[14] + step2[15], bd);
  step1[15] = WRAPLOW(step2[14] + step2[15], bd);

  // stage 4
  // DC/Nyquist rotation on 0/1, rotation on 2/3, butterflies on 4..7,
  // and rotations on the 9/14 and 10/13 pairs.
  temp1 = (step1[0] + step1[1]) * cospi_16_64;
  temp2 = (step1[0] - step1[1]) * cospi_16_64;
  step2[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
  step2[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  step2[4] = WRAPLOW(step1[4] + step1[5], bd);
  step2[5] = WRAPLOW(step1[4] - step1[5], bd);
  step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
  step2[7] = WRAPLOW(step1[6] + step1[7], bd);

  step2[8] = step1[8];
  step2[15] = step1[15];
  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
  step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  step2[11] = step1[11];
  step2[12] = step1[12];

  // stage 5
  step1[0] = WRAPLOW(step2[0] + step2[3], bd);
  step1[1] = WRAPLOW(step2[1] + step2[2], bd);
  step1[2] = WRAPLOW(step2[1] - step2[2], bd);
  step1[3] = WRAPLOW(step2[0] - step2[3], bd);
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  step1[7] = step2[7];

  step1[8] = WRAPLOW(step2[8] + step2[11], bd);
  step1[9] = WRAPLOW(step2[9] + step2[10], bd);
  step1[10] = WRAPLOW(step2[9] - step2[10], bd);
  step1[11] = WRAPLOW(step2[8] - step2[11], bd);
  step1[12] = WRAPLOW(-step2[12] + step2[15], bd);
  step1[13] = WRAPLOW(-step2[13] + step2[14], bd);
  step1[14] = WRAPLOW(step2[13] + step2[14], bd);
  step1[15] = WRAPLOW(step2[12] + step2[15], bd);

  // stage 6
  step2[0] = WRAPLOW(step1[0] + step1[7], bd);
  step2[1] = WRAPLOW(step1[1] + step1[6], bd);
  step2[2] = WRAPLOW(step1[2] + step1[5], bd);
  step2[3] = WRAPLOW(step1[3] + step1[4], bd);
  step2[4] = WRAPLOW(step1[3] - step1[4], bd);
  step2[5] = WRAPLOW(step1[2] - step1[5], bd);
  step2[6] = WRAPLOW(step1[1] - step1[6], bd);
  step2[7] = WRAPLOW(step1[0] - step1[7], bd);
  step2[8] = step1[8];
  step2[9] = step1[9];
  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
  temp2 = (step1[10] + step1[13]) * cospi_16_64;
  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
  temp2 = (step1[11] + step1[12]) * cospi_16_64;
  step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  step2[14] = step1[14];
  step2[15] = step1[15];

  // stage 7
  // Final mirrored butterfly: output[k] and output[15-k] are the sum and
  // difference of the same stage-6 pair.
  output[0] = WRAPLOW(step2[0] + step2[15], bd);
  output[1] = WRAPLOW(step2[1] + step2[14], bd);
  output[2] = WRAPLOW(step2[2] + step2[13], bd);
  output[3] = WRAPLOW(step2[3] + step2[12], bd);
  output[4] = WRAPLOW(step2[4] + step2[11], bd);
  output[5] = WRAPLOW(step2[5] + step2[10], bd);
  output[6] = WRAPLOW(step2[6] + step2[9], bd);
  output[7] = WRAPLOW(step2[7] + step2[8], bd);
  output[8] = WRAPLOW(step2[7] - step2[8], bd);
  output[9] = WRAPLOW(step2[6] - step2[9], bd);
  output[10] = WRAPLOW(step2[5] - step2[10], bd);
  output[11] = WRAPLOW(step2[4] - step2[11], bd);
  output[12] = WRAPLOW(step2[3] - step2[12], bd);
  output[13] = WRAPLOW(step2[2] - step2[13], bd);
  output[14] = WRAPLOW(step2[1] - step2[14], bd);
  output[15] = WRAPLOW(step2[0] - step2[15], bd);
}
1781 
/* High-bitdepth 16x16 inverse transform + reconstruction for the general
 * case (up to all 256 coefficients non-zero).  Results are rounded by
 * 1/64, added to *dest8 and clipped to |bd| bits. */
void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
                                    int stride, int bd) {
  int row, col;
  tran_low_t buffer[16 * 16];
  tran_low_t col_in[16], col_out[16];
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  // Row pass: one 1-D transform per row of coefficients.
  for (row = 0; row < 16; ++row) {
    vpx_highbd_idct16_c(input + 16 * row, buffer + 16 * row, bd);
  }

  // Column pass, then round, add and clip into the destination.
  for (col = 0; col < 16; ++col) {
    for (row = 0; row < 16; ++row) col_in[row] = buffer[row * 16 + col];
    vpx_highbd_idct16_c(col_in, col_out, bd);
    for (row = 0; row < 16; ++row) {
      dest[row * stride + col] = highbd_clip_pixel_add(
          dest[row * stride + col], ROUND_POWER_OF_TWO(col_out[row], 6), bd);
    }
  }
}
1808 
/* 16-point 1-D inverse ADST (asymmetric DST), high-bitdepth reference path.
 * Consumes |input| in the interleaved order below and writes 16 values to
 * |output|.  If every coefficient is zero the output is zeroed directly.
 * Four butterfly stages follow; rotations use cospi constants with a
 * rounding shift, plain sums/differences are clamped through WRAPLOW.
 * |bd| is consumed inside those macros; the (void) cast silences
 * unused-parameter warnings in configurations where the macros ignore it. */
void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
  tran_high_t s9, s10, s11, s12, s13, s14, s15;

  // ADST input permutation: odd source indices descending into even x's,
  // even source indices ascending into odd x's.
  tran_low_t x0 = input[15];
  tran_low_t x1 = input[0];
  tran_low_t x2 = input[13];
  tran_low_t x3 = input[2];
  tran_low_t x4 = input[11];
  tran_low_t x5 = input[4];
  tran_low_t x6 = input[9];
  tran_low_t x7 = input[6];
  tran_low_t x8 = input[7];
  tran_low_t x9 = input[8];
  tran_low_t x10 = input[5];
  tran_low_t x11 = input[10];
  tran_low_t x12 = input[3];
  tran_low_t x13 = input[12];
  tran_low_t x14 = input[1];
  tran_low_t x15 = input[14];
  (void) bd;

  // Fast path: all-zero input transforms to all-zero output.
  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
           | x9 | x10 | x11 | x12 | x13 | x14 | x15)) {
    memset(output, 0, 16 * sizeof(*output));
    return;
  }

  // stage 1
  // Rotate each adjacent pair by an odd cospi angle, then butterfly the
  // halves (k with k+8) with a rounding shift.
  s0 = x0 * cospi_1_64  + x1 * cospi_31_64;
  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
  s2 = x2 * cospi_5_64  + x3 * cospi_27_64;
  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
  s4 = x4 * cospi_9_64  + x5 * cospi_23_64;
  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
  s13 = x12 * cospi_7_64  - x13 * cospi_25_64;
  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
  s15 = x14 * cospi_3_64  - x15 * cospi_29_64;

  x0 = WRAPLOW(highbd_dct_const_round_shift(s0 + s8, bd), bd);
  x1 = WRAPLOW(highbd_dct_const_round_shift(s1 + s9, bd), bd);
  x2 = WRAPLOW(highbd_dct_const_round_shift(s2 + s10, bd), bd);
  x3 = WRAPLOW(highbd_dct_const_round_shift(s3 + s11, bd), bd);
  x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s12, bd), bd);
  x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s13, bd), bd);
  x6 = WRAPLOW(highbd_dct_const_round_shift(s6 + s14, bd), bd);
  x7 = WRAPLOW(highbd_dct_const_round_shift(s7 + s15, bd), bd);
  x8  = WRAPLOW(highbd_dct_const_round_shift(s0 - s8, bd), bd);
  x9  = WRAPLOW(highbd_dct_const_round_shift(s1 - s9, bd), bd);
  x10 = WRAPLOW(highbd_dct_const_round_shift(s2 - s10, bd), bd);
  x11 = WRAPLOW(highbd_dct_const_round_shift(s3 - s11, bd), bd);
  x12 = WRAPLOW(highbd_dct_const_round_shift(s4 - s12, bd), bd);
  x13 = WRAPLOW(highbd_dct_const_round_shift(s5 - s13, bd), bd);
  x14 = WRAPLOW(highbd_dct_const_round_shift(s6 - s14, bd), bd);
  x15 = WRAPLOW(highbd_dct_const_round_shift(s7 - s15, bd), bd);

  // stage 2
  // First half passes through; second half is rotated by 4/28 and 20/12
  // angles, then butterflied (k with k+4 in each half).
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4;
  s5 = x5;
  s6 = x6;
  s7 = x7;
  s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
  s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
  s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
  s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
  s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
  s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
  s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
  s15 = x14 * cospi_20_64 + x15 * cospi_12_64;

  x0 = WRAPLOW(s0 + s4, bd);
  x1 = WRAPLOW(s1 + s5, bd);
  x2 = WRAPLOW(s2 + s6, bd);
  x3 = WRAPLOW(s3 + s7, bd);
  x4 = WRAPLOW(s0 - s4, bd);
  x5 = WRAPLOW(s1 - s5, bd);
  x6 = WRAPLOW(s2 - s6, bd);
  x7 = WRAPLOW(s3 - s7, bd);
  x8 = WRAPLOW(highbd_dct_const_round_shift(s8 + s12, bd), bd);
  x9 = WRAPLOW(highbd_dct_const_round_shift(s9 + s13, bd), bd);
  x10 = WRAPLOW(highbd_dct_const_round_shift(s10 + s14, bd), bd);
  x11 = WRAPLOW(highbd_dct_const_round_shift(s11 + s15, bd), bd);
  x12 = WRAPLOW(highbd_dct_const_round_shift(s8 - s12, bd), bd);
  x13 = WRAPLOW(highbd_dct_const_round_shift(s9 - s13, bd), bd);
  x14 = WRAPLOW(highbd_dct_const_round_shift(s10 - s14, bd), bd);
  x15 = WRAPLOW(highbd_dct_const_round_shift(s11 - s15, bd), bd);

  // stage 3
  // 8/24-angle rotations on the 4..7 and 12..15 quartets, then
  // butterflies within each quartet.
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
  s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
  s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
  s8 = x8;
  s9 = x9;
  s10 = x10;
  s11 = x11;
  s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
  s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
  s15 = x14 * cospi_8_64 + x15 * cospi_24_64;

  x0 = WRAPLOW(s0 + s2, bd);
  x1 = WRAPLOW(s1 + s3, bd);
  x2 = WRAPLOW(s0 - s2, bd);
  x3 = WRAPLOW(s1 - s3, bd);
  x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s6, bd), bd);
  x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s7, bd), bd);
  x6 = WRAPLOW(highbd_dct_const_round_shift(s4 - s6, bd), bd);
  x7 = WRAPLOW(highbd_dct_const_round_shift(s5 - s7, bd), bd);
  x8 = WRAPLOW(s8 + s10, bd);
  x9 = WRAPLOW(s9 + s11, bd);
  x10 = WRAPLOW(s8 - s10, bd);
  x11 = WRAPLOW(s9 - s11, bd);
  x12 = WRAPLOW(highbd_dct_const_round_shift(s12 + s14, bd), bd);
  x13 = WRAPLOW(highbd_dct_const_round_shift(s13 + s15, bd), bd);
  x14 = WRAPLOW(highbd_dct_const_round_shift(s12 - s14, bd), bd);
  x15 = WRAPLOW(highbd_dct_const_round_shift(s13 - s15, bd), bd);

  // stage 4
  // Final +/-cospi_16 rotations on selected pairs (note the negated
  // forms on s2/s14 and the sign-flipped operands on s7/s11).
  s2 = (- cospi_16_64) * (x2 + x3);
  s3 = cospi_16_64 * (x2 - x3);
  s6 = cospi_16_64 * (x6 + x7);
  s7 = cospi_16_64 * (-x6 + x7);
  s10 = cospi_16_64 * (x10 + x11);
  s11 = cospi_16_64 * (-x10 + x11);
  s14 = (- cospi_16_64) * (x14 + x15);
  s15 = cospi_16_64 * (x14 - x15);

  x2 = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd);
  x3 = WRAPLOW(highbd_dct_const_round_shift(s3, bd), bd);
  x6 = WRAPLOW(highbd_dct_const_round_shift(s6, bd), bd);
  x7 = WRAPLOW(highbd_dct_const_round_shift(s7, bd), bd);
  x10 = WRAPLOW(highbd_dct_const_round_shift(s10, bd), bd);
  x11 = WRAPLOW(highbd_dct_const_round_shift(s11, bd), bd);
  x14 = WRAPLOW(highbd_dct_const_round_shift(s14, bd), bd);
  x15 = WRAPLOW(highbd_dct_const_round_shift(s15, bd), bd);

  // Output permutation with sign flips on positions 1, 3, 13 and 15.
  output[0] = WRAPLOW(x0, bd);
  output[1] = WRAPLOW(-x8, bd);
  output[2] = WRAPLOW(x12, bd);
  output[3] = WRAPLOW(-x4, bd);
  output[4] = WRAPLOW(x6, bd);
  output[5] = WRAPLOW(x14, bd);
  output[6] = WRAPLOW(x10, bd);
  output[7] = WRAPLOW(x2, bd);
  output[8] = WRAPLOW(x3, bd);
  output[9] = WRAPLOW(x11, bd);
  output[10] = WRAPLOW(x15, bd);
  output[11] = WRAPLOW(x7, bd);
  output[12] = WRAPLOW(x5, bd);
  output[13] = WRAPLOW(-x13, bd);
  output[14] = WRAPLOW(x9, bd);
  output[15] = WRAPLOW(-x1, bd);
}
1978 
/* High-bitdepth 16x16 inverse transform + reconstruction for blocks whose
 * non-zero coefficients all lie in the upper-left 4x4 area (eob <= 10),
 * so only the first four rows need a row transform. */
void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8,
                                   int stride, int bd) {
  int row, col;
  tran_low_t buffer[16 * 16] = { 0 };  // rows 4..15 remain all-zero
  tran_low_t col_in[16], col_out[16];
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  // Row pass: only the top four rows can carry non-zero coefficients.
  for (row = 0; row < 4; ++row) {
    vpx_highbd_idct16_c(input + 16 * row, buffer + 16 * row, bd);
  }

  // Column pass over all sixteen columns, then round, add and clip.
  for (col = 0; col < 16; ++col) {
    for (row = 0; row < 16; ++row) col_in[row] = buffer[row * 16 + col];
    vpx_highbd_idct16_c(col_in, col_out, bd);
    for (row = 0; row < 16; ++row) {
      dest[row * stride + col] = highbd_clip_pixel_add(
          dest[row * stride + col], ROUND_POWER_OF_TWO(col_out[row], 6), bd);
    }
  }
}
2006 
/* High-bitdepth 16x16 inverse transform for a DC-only block: both 1-D
 * passes collapse to a cospi_16 rotation of input[0], so a single rounded
 * value is added (clipped to |bd| bits) to every destination pixel. */
void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest8,
                                  int stride, int bd) {
  int r, c;
  tran_high_t dc_value;
  uint16_t *dst = CONVERT_TO_SHORTPTR(dest8);
  // Apply the DC butterfly once per dimension (rows, then columns).
  tran_low_t dc = WRAPLOW(
      highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
  dc = WRAPLOW(highbd_dct_const_round_shift(dc * cospi_16_64, bd), bd);
  dc_value = ROUND_POWER_OF_TWO(dc, 6);  // final 1/64 rounding

  for (r = 0; r < 16; ++r, dst += stride) {
    for (c = 0; c < 16; ++c)
      dst[c] = highbd_clip_pixel_add(dst[c], dc_value, bd);
  }
}
2023 
highbd_idct32_c(const tran_low_t * input,tran_low_t * output,int bd)2024 static void highbd_idct32_c(const tran_low_t *input,
2025                             tran_low_t *output, int bd) {
2026   tran_low_t step1[32], step2[32];
2027   tran_high_t temp1, temp2;
2028   (void) bd;
2029 
2030   // stage 1
2031   step1[0] = input[0];
2032   step1[1] = input[16];
2033   step1[2] = input[8];
2034   step1[3] = input[24];
2035   step1[4] = input[4];
2036   step1[5] = input[20];
2037   step1[6] = input[12];
2038   step1[7] = input[28];
2039   step1[8] = input[2];
2040   step1[9] = input[18];
2041   step1[10] = input[10];
2042   step1[11] = input[26];
2043   step1[12] = input[6];
2044   step1[13] = input[22];
2045   step1[14] = input[14];
2046   step1[15] = input[30];
2047 
2048   temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
2049   temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
2050   step1[16] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
2051   step1[31] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
2052 
2053   temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
2054   temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
2055   step1[17] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
2056   step1[30] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
2057 
2058   temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
2059   temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
2060   step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
2061   step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
2062 
2063   temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
2064   temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
2065   step1[19] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
2066   step1[28] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
2067 
2068   temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
2069   temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
2070   step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
2071   step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
2072 
2073   temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
2074   temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
2075   step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
2076   step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
2077 
2078   temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
2079   temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
2080   step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
2081   step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
2082 
2083   temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
2084   temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
2085   step1[23] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
2086   step1[24] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
2087 
2088   // stage 2
2089   step2[0] = step1[0];
2090   step2[1] = step1[1];
2091   step2[2] = step1[2];
2092   step2[3] = step1[3];
2093   step2[4] = step1[4];
2094   step2[5] = step1[5];
2095   step2[6] = step1[6];
2096   step2[7] = step1[7];
2097 
2098   temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
2099   temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
2100   step2[8] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
2101   step2[15] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
2102 
2103   temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
2104   temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
2105   step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
2106   step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
2107 
2108   temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
2109   temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
2110   step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
2111   step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
2112 
2113   temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
2114   temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
2115   step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
2116   step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
2117 
2118   step2[16] = WRAPLOW(step1[16] + step1[17], bd);
2119   step2[17] = WRAPLOW(step1[16] - step1[17], bd);
2120   step2[18] = WRAPLOW(-step1[18] + step1[19], bd);
2121   step2[19] = WRAPLOW(step1[18] + step1[19], bd);
2122   step2[20] = WRAPLOW(step1[20] + step1[21], bd);
2123   step2[21] = WRAPLOW(step1[20] - step1[21], bd);
2124   step2[22] = WRAPLOW(-step1[22] + step1[23], bd);
2125   step2[23] = WRAPLOW(step1[22] + step1[23], bd);
2126   step2[24] = WRAPLOW(step1[24] + step1[25], bd);
2127   step2[25] = WRAPLOW(step1[24] - step1[25], bd);
2128   step2[26] = WRAPLOW(-step1[26] + step1[27], bd);
2129   step2[27] = WRAPLOW(step1[26] + step1[27], bd);
2130   step2[28] = WRAPLOW(step1[28] + step1[29], bd);
2131   step2[29] = WRAPLOW(step1[28] - step1[29], bd);
2132   step2[30] = WRAPLOW(-step1[30] + step1[31], bd);
2133   step2[31] = WRAPLOW(step1[30] + step1[31], bd);
2134 
2135   // stage 3
2136   step1[0] = step2[0];
2137   step1[1] = step2[1];
2138   step1[2] = step2[2];
2139   step1[3] = step2[3];
2140 
2141   temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
2142   temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
2143   step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
2144   step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
2145   temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
2146   temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
2147   step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
2148   step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
2149 
2150   step1[8] = WRAPLOW(step2[8] + step2[9], bd);
2151   step1[9] = WRAPLOW(step2[8] - step2[9], bd);
2152   step1[10] = WRAPLOW(-step2[10] + step2[11], bd);
2153   step1[11] = WRAPLOW(step2[10] + step2[11], bd);
2154   step1[12] = WRAPLOW(step2[12] + step2[13], bd);
2155   step1[13] = WRAPLOW(step2[12] - step2[13], bd);
2156   step1[14] = WRAPLOW(-step2[14] + step2[15], bd);
2157   step1[15] = WRAPLOW(step2[14] + step2[15], bd);
2158 
2159   step1[16] = step2[16];
2160   step1[31] = step2[31];
2161   temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
2162   temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
2163   step1[17] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
2164   step1[30] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
2165   temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
2166   temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
2167   step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
2168   step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
2169   step1[19] = step2[19];
2170   step1[20] = step2[20];
2171   temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
2172   temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
2173   step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
2174   step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
2175   temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
2176   temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
2177   step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
2178   step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
2179   step1[23] = step2[23];
2180   step1[24] = step2[24];
2181   step1[27] = step2[27];
2182   step1[28] = step2[28];
2183 
2184   // stage 4
2185   temp1 = (step1[0] + step1[1]) * cospi_16_64;
2186   temp2 = (step1[0] - step1[1]) * cospi_16_64;
2187   step2[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
2188   step2[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
2189   temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
2190   temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
2191   step2[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
2192   step2[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
2193   step2[4] = WRAPLOW(step1[4] + step1[5], bd);
2194   step2[5] = WRAPLOW(step1[4] - step1[5], bd);
2195   step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
2196   step2[7] = WRAPLOW(step1[6] + step1[7], bd);
2197 
2198   step2[8] = step1[8];
2199   step2[15] = step1[15];
2200   temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
2201   temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
2202   step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
2203   step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
2204   temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
2205   temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
2206   step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
2207   step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
2208   step2[11] = step1[11];
2209   step2[12] = step1[12];
2210 
2211   step2[16] = WRAPLOW(step1[16] + step1[19], bd);
2212   step2[17] = WRAPLOW(step1[17] + step1[18], bd);
2213   step2[18] = WRAPLOW(step1[17] - step1[18], bd);
2214   step2[19] = WRAPLOW(step1[16] - step1[19], bd);
2215   step2[20] = WRAPLOW(-step1[20] + step1[23], bd);
2216   step2[21] = WRAPLOW(-step1[21] + step1[22], bd);
2217   step2[22] = WRAPLOW(step1[21] + step1[22], bd);
2218   step2[23] = WRAPLOW(step1[20] + step1[23], bd);
2219 
2220   step2[24] = WRAPLOW(step1[24] + step1[27], bd);
2221   step2[25] = WRAPLOW(step1[25] + step1[26], bd);
2222   step2[26] = WRAPLOW(step1[25] - step1[26], bd);
2223   step2[27] = WRAPLOW(step1[24] - step1[27], bd);
2224   step2[28] = WRAPLOW(-step1[28] + step1[31], bd);
2225   step2[29] = WRAPLOW(-step1[29] + step1[30], bd);
2226   step2[30] = WRAPLOW(step1[29] + step1[30], bd);
2227   step2[31] = WRAPLOW(step1[28] + step1[31], bd);
2228 
2229   // stage 5
2230   step1[0] = WRAPLOW(step2[0] + step2[3], bd);
2231   step1[1] = WRAPLOW(step2[1] + step2[2], bd);
2232   step1[2] = WRAPLOW(step2[1] - step2[2], bd);
2233   step1[3] = WRAPLOW(step2[0] - step2[3], bd);
2234   step1[4] = step2[4];
2235   temp1 = (step2[6] - step2[5]) * cospi_16_64;
2236   temp2 = (step2[5] + step2[6]) * cospi_16_64;
2237   step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
2238   step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
2239   step1[7] = step2[7];
2240 
2241   step1[8] = WRAPLOW(step2[8] + step2[11], bd);
2242   step1[9] = WRAPLOW(step2[9] + step2[10], bd);
2243   step1[10] = WRAPLOW(step2[9] - step2[10], bd);
2244   step1[11] = WRAPLOW(step2[8] - step2[11], bd);
2245   step1[12] = WRAPLOW(-step2[12] + step2[15], bd);
2246   step1[13] = WRAPLOW(-step2[13] + step2[14], bd);
2247   step1[14] = WRAPLOW(step2[13] + step2[14], bd);
2248   step1[15] = WRAPLOW(step2[12] + step2[15], bd);
2249 
2250   step1[16] = step2[16];
2251   step1[17] = step2[17];
2252   temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
2253   temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
2254   step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
2255   step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
2256   temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
2257   temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
2258   step1[19] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
2259   step1[28] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
2260   temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
2261   temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
2262   step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
2263   step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
2264   temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
2265   temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
2266   step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
2267   step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
2268   step1[22] = step2[22];
2269   step1[23] = step2[23];
2270   step1[24] = step2[24];
2271   step1[25] = step2[25];
2272   step1[30] = step2[30];
2273   step1[31] = step2[31];
2274 
2275   // stage 6
2276   step2[0] = WRAPLOW(step1[0] + step1[7], bd);
2277   step2[1] = WRAPLOW(step1[1] + step1[6], bd);
2278   step2[2] = WRAPLOW(step1[2] + step1[5], bd);
2279   step2[3] = WRAPLOW(step1[3] + step1[4], bd);
2280   step2[4] = WRAPLOW(step1[3] - step1[4], bd);
2281   step2[5] = WRAPLOW(step1[2] - step1[5], bd);
2282   step2[6] = WRAPLOW(step1[1] - step1[6], bd);
2283   step2[7] = WRAPLOW(step1[0] - step1[7], bd);
2284   step2[8] = step1[8];
2285   step2[9] = step1[9];
2286   temp1 = (-step1[10] + step1[13]) * cospi_16_64;
2287   temp2 = (step1[10] + step1[13]) * cospi_16_64;
2288   step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
2289   step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
2290   temp1 = (-step1[11] + step1[12]) * cospi_16_64;
2291   temp2 = (step1[11] + step1[12]) * cospi_16_64;
2292   step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
2293   step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
2294   step2[14] = step1[14];
2295   step2[15] = step1[15];
2296 
2297   step2[16] = WRAPLOW(step1[16] + step1[23], bd);
2298   step2[17] = WRAPLOW(step1[17] + step1[22], bd);
2299   step2[18] = WRAPLOW(step1[18] + step1[21], bd);
2300   step2[19] = WRAPLOW(step1[19] + step1[20], bd);
2301   step2[20] = WRAPLOW(step1[19] - step1[20], bd);
2302   step2[21] = WRAPLOW(step1[18] - step1[21], bd);
2303   step2[22] = WRAPLOW(step1[17] - step1[22], bd);
2304   step2[23] = WRAPLOW(step1[16] - step1[23], bd);
2305 
2306   step2[24] = WRAPLOW(-step1[24] + step1[31], bd);
2307   step2[25] = WRAPLOW(-step1[25] + step1[30], bd);
2308   step2[26] = WRAPLOW(-step1[26] + step1[29], bd);
2309   step2[27] = WRAPLOW(-step1[27] + step1[28], bd);
2310   step2[28] = WRAPLOW(step1[27] + step1[28], bd);
2311   step2[29] = WRAPLOW(step1[26] + step1[29], bd);
2312   step2[30] = WRAPLOW(step1[25] + step1[30], bd);
2313   step2[31] = WRAPLOW(step1[24] + step1[31], bd);
2314 
2315   // stage 7
2316   step1[0] = WRAPLOW(step2[0] + step2[15], bd);
2317   step1[1] = WRAPLOW(step2[1] + step2[14], bd);
2318   step1[2] = WRAPLOW(step2[2] + step2[13], bd);
2319   step1[3] = WRAPLOW(step2[3] + step2[12], bd);
2320   step1[4] = WRAPLOW(step2[4] + step2[11], bd);
2321   step1[5] = WRAPLOW(step2[5] + step2[10], bd);
2322   step1[6] = WRAPLOW(step2[6] + step2[9], bd);
2323   step1[7] = WRAPLOW(step2[7] + step2[8], bd);
2324   step1[8] = WRAPLOW(step2[7] - step2[8], bd);
2325   step1[9] = WRAPLOW(step2[6] - step2[9], bd);
2326   step1[10] = WRAPLOW(step2[5] - step2[10], bd);
2327   step1[11] = WRAPLOW(step2[4] - step2[11], bd);
2328   step1[12] = WRAPLOW(step2[3] - step2[12], bd);
2329   step1[13] = WRAPLOW(step2[2] - step2[13], bd);
2330   step1[14] = WRAPLOW(step2[1] - step2[14], bd);
2331   step1[15] = WRAPLOW(step2[0] - step2[15], bd);
2332 
2333   step1[16] = step2[16];
2334   step1[17] = step2[17];
2335   step1[18] = step2[18];
2336   step1[19] = step2[19];
2337   temp1 = (-step2[20] + step2[27]) * cospi_16_64;
2338   temp2 = (step2[20] + step2[27]) * cospi_16_64;
2339   step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
2340   step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
2341   temp1 = (-step2[21] + step2[26]) * cospi_16_64;
2342   temp2 = (step2[21] + step2[26]) * cospi_16_64;
2343   step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
2344   step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
2345   temp1 = (-step2[22] + step2[25]) * cospi_16_64;
2346   temp2 = (step2[22] + step2[25]) * cospi_16_64;
2347   step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
2348   step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
2349   temp1 = (-step2[23] + step2[24]) * cospi_16_64;
2350   temp2 = (step2[23] + step2[24]) * cospi_16_64;
2351   step1[23] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
2352   step1[24] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
2353   step1[28] = step2[28];
2354   step1[29] = step2[29];
2355   step1[30] = step2[30];
2356   step1[31] = step2[31];
2357 
2358   // final stage
2359   output[0] = WRAPLOW(step1[0] + step1[31], bd);
2360   output[1] = WRAPLOW(step1[1] + step1[30], bd);
2361   output[2] = WRAPLOW(step1[2] + step1[29], bd);
2362   output[3] = WRAPLOW(step1[3] + step1[28], bd);
2363   output[4] = WRAPLOW(step1[4] + step1[27], bd);
2364   output[5] = WRAPLOW(step1[5] + step1[26], bd);
2365   output[6] = WRAPLOW(step1[6] + step1[25], bd);
2366   output[7] = WRAPLOW(step1[7] + step1[24], bd);
2367   output[8] = WRAPLOW(step1[8] + step1[23], bd);
2368   output[9] = WRAPLOW(step1[9] + step1[22], bd);
2369   output[10] = WRAPLOW(step1[10] + step1[21], bd);
2370   output[11] = WRAPLOW(step1[11] + step1[20], bd);
2371   output[12] = WRAPLOW(step1[12] + step1[19], bd);
2372   output[13] = WRAPLOW(step1[13] + step1[18], bd);
2373   output[14] = WRAPLOW(step1[14] + step1[17], bd);
2374   output[15] = WRAPLOW(step1[15] + step1[16], bd);
2375   output[16] = WRAPLOW(step1[15] - step1[16], bd);
2376   output[17] = WRAPLOW(step1[14] - step1[17], bd);
2377   output[18] = WRAPLOW(step1[13] - step1[18], bd);
2378   output[19] = WRAPLOW(step1[12] - step1[19], bd);
2379   output[20] = WRAPLOW(step1[11] - step1[20], bd);
2380   output[21] = WRAPLOW(step1[10] - step1[21], bd);
2381   output[22] = WRAPLOW(step1[9] - step1[22], bd);
2382   output[23] = WRAPLOW(step1[8] - step1[23], bd);
2383   output[24] = WRAPLOW(step1[7] - step1[24], bd);
2384   output[25] = WRAPLOW(step1[6] - step1[25], bd);
2385   output[26] = WRAPLOW(step1[5] - step1[26], bd);
2386   output[27] = WRAPLOW(step1[4] - step1[27], bd);
2387   output[28] = WRAPLOW(step1[3] - step1[28], bd);
2388   output[29] = WRAPLOW(step1[2] - step1[29], bd);
2389   output[30] = WRAPLOW(step1[1] - step1[30], bd);
2390   output[31] = WRAPLOW(step1[0] - step1[31], bd);
2391 }
2392 
vpx_highbd_idct32x32_1024_add_c(const tran_low_t * input,uint8_t * dest8,int stride,int bd)2393 void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8,
2394                                      int stride, int bd) {
2395   tran_low_t out[32 * 32];
2396   tran_low_t *outptr = out;
2397   int i, j;
2398   tran_low_t temp_in[32], temp_out[32];
2399   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
2400 
2401   // Rows
2402   for (i = 0; i < 32; ++i) {
2403     tran_low_t zero_coeff[16];
2404     for (j = 0; j < 16; ++j)
2405       zero_coeff[j] = input[2 * j] | input[2 * j + 1];
2406     for (j = 0; j < 8; ++j)
2407       zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
2408     for (j = 0; j < 4; ++j)
2409       zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
2410     for (j = 0; j < 2; ++j)
2411       zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
2412 
2413     if (zero_coeff[0] | zero_coeff[1])
2414       highbd_idct32_c(input, outptr, bd);
2415     else
2416       memset(outptr, 0, sizeof(tran_low_t) * 32);
2417     input += 32;
2418     outptr += 32;
2419   }
2420 
2421   // Columns
2422   for (i = 0; i < 32; ++i) {
2423     for (j = 0; j < 32; ++j)
2424       temp_in[j] = out[j * 32 + i];
2425     highbd_idct32_c(temp_in, temp_out, bd);
2426     for (j = 0; j < 32; ++j) {
2427       dest[j * stride + i] = highbd_clip_pixel_add(
2428           dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
2429     }
2430   }
2431 }
2432 
void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8,
                                   int stride, int bd) {
  tran_low_t out[32 * 32] = {0};
  int r, c;
  tran_low_t col_in[32], col_out[32];
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  // Pass 1 (rows): per the caller's contract, only the upper-left 8x8 of the
  // coefficient block is non-zero, so transform just the first 8 rows.  The
  // remaining rows of |out| keep their zero initialization.
  for (r = 0; r < 8; ++r)
    highbd_idct32_c(input + r * 32, out + r * 32, bd);

  // Pass 2 (columns): full 1-D transform per column, then round, clip to
  // the bit depth, and add into the destination.
  for (c = 0; c < 32; ++c) {
    for (r = 0; r < 32; ++r) col_in[r] = out[r * 32 + c];
    highbd_idct32_c(col_in, col_out, bd);
    for (r = 0; r < 32; ++r) {
      dest[r * stride + c] = highbd_clip_pixel_add(
          dest[r * stride + c], ROUND_POWER_OF_TWO(col_out[r], 6), bd);
    }
  }
}
2459 
vpx_highbd_idct32x32_1_add_c(const tran_low_t * input,uint8_t * dest8,int stride,int bd)2460 void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8,
2461                                   int stride, int bd) {
2462   int i, j;
2463   int a1;
2464   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
2465 
2466   tran_low_t out = WRAPLOW(
2467       highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
2468   out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
2469   a1 = ROUND_POWER_OF_TWO(out, 6);
2470 
2471   for (j = 0; j < 32; ++j) {
2472     for (i = 0; i < 32; ++i)
2473       dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
2474     dest += stride;
2475   }
2476 }
2477 #endif  // CONFIG_VP9_HIGHBITDEPTH
2478