1 /*
2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include <math.h>
12 #include <stdlib.h>
13 #include <string.h>
14
15 #include "./vpx_dsp_rtcd.h"
16 #include "vpx_dsp/inv_txfm.h"
17
vpx_iwht4x4_16_add_c(const tran_low_t * input,uint8_t * dest,int stride)18 void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
19 /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
20 0.5 shifts per pixel. */
21 int i;
22 tran_low_t output[16];
23 tran_high_t a1, b1, c1, d1, e1;
24 const tran_low_t *ip = input;
25 tran_low_t *op = output;
26
27 for (i = 0; i < 4; i++) {
28 a1 = ip[0] >> UNIT_QUANT_SHIFT;
29 c1 = ip[1] >> UNIT_QUANT_SHIFT;
30 d1 = ip[2] >> UNIT_QUANT_SHIFT;
31 b1 = ip[3] >> UNIT_QUANT_SHIFT;
32 a1 += c1;
33 d1 -= b1;
34 e1 = (a1 - d1) >> 1;
35 b1 = e1 - b1;
36 c1 = e1 - c1;
37 a1 -= b1;
38 d1 += c1;
39 op[0] = WRAPLOW(a1);
40 op[1] = WRAPLOW(b1);
41 op[2] = WRAPLOW(c1);
42 op[3] = WRAPLOW(d1);
43 ip += 4;
44 op += 4;
45 }
46
47 ip = output;
48 for (i = 0; i < 4; i++) {
49 a1 = ip[4 * 0];
50 c1 = ip[4 * 1];
51 d1 = ip[4 * 2];
52 b1 = ip[4 * 3];
53 a1 += c1;
54 d1 -= b1;
55 e1 = (a1 - d1) >> 1;
56 b1 = e1 - b1;
57 c1 = e1 - c1;
58 a1 -= b1;
59 d1 += c1;
60 dest[stride * 0] = clip_pixel_add(dest[stride * 0], WRAPLOW(a1));
61 dest[stride * 1] = clip_pixel_add(dest[stride * 1], WRAPLOW(b1));
62 dest[stride * 2] = clip_pixel_add(dest[stride * 2], WRAPLOW(c1));
63 dest[stride * 3] = clip_pixel_add(dest[stride * 3], WRAPLOW(d1));
64
65 ip++;
66 dest++;
67 }
68 }
69
vpx_iwht4x4_1_add_c(const tran_low_t * in,uint8_t * dest,int stride)70 void vpx_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int stride) {
71 int i;
72 tran_high_t a1, e1;
73 tran_low_t tmp[4];
74 const tran_low_t *ip = in;
75 tran_low_t *op = tmp;
76
77 a1 = ip[0] >> UNIT_QUANT_SHIFT;
78 e1 = a1 >> 1;
79 a1 -= e1;
80 op[0] = WRAPLOW(a1);
81 op[1] = op[2] = op[3] = WRAPLOW(e1);
82
83 ip = tmp;
84 for (i = 0; i < 4; i++) {
85 e1 = ip[0] >> 1;
86 a1 = ip[0] - e1;
87 dest[stride * 0] = clip_pixel_add(dest[stride * 0], a1);
88 dest[stride * 1] = clip_pixel_add(dest[stride * 1], e1);
89 dest[stride * 2] = clip_pixel_add(dest[stride * 2], e1);
90 dest[stride * 3] = clip_pixel_add(dest[stride * 3], e1);
91 ip++;
92 dest++;
93 }
94 }
95
void iadst4_c(const tran_low_t *input, tran_low_t *output) {
  // 1-D 4-point inverse transform built on the sinpi_k_9 constants. Reads
  // input[0..3] and writes output[0..3].
  tran_high_t p1_in0, p2_in0, p3_in1, p4_in2, p1_in2, p2_in3, p4_in3;
  tran_high_t mix, sum0, sum1, rot;
  const tran_low_t in0 = input[0];
  const tran_low_t in1 = input[1];
  const tran_low_t in2 = input[2];
  const tran_low_t in3 = input[3];

  // An all-zero input produces an all-zero output; skip the arithmetic.
  if ((in0 | in1 | in2 | in3) == 0) {
    memset(output, 0, 4 * sizeof(*output));
    return;
  }

  // 32-bit result is enough for the following multiplications.
  p1_in0 = sinpi_1_9 * in0;
  p2_in0 = sinpi_2_9 * in0;
  p3_in1 = sinpi_3_9 * in1;
  p4_in2 = sinpi_4_9 * in2;
  p1_in2 = sinpi_1_9 * in2;
  p2_in3 = sinpi_2_9 * in3;
  p4_in3 = sinpi_4_9 * in3;
  mix = WRAPLOW(in0 - in2 + in3);

  sum0 = p1_in0 + p4_in2 + p2_in3;
  sum1 = p2_in0 - p1_in2 - p4_in3;
  rot = sinpi_3_9 * mix;

  // 1-D transform scaling factor is sqrt(2).
  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
  // + 1b (addition) = 29b.
  // Hence the output bit depth is 15b.
  output[0] = WRAPLOW(dct_const_round_shift(sum0 + p3_in1));
  output[1] = WRAPLOW(dct_const_round_shift(sum1 + p3_in1));
  output[2] = WRAPLOW(dct_const_round_shift(rot));
  output[3] = WRAPLOW(dct_const_round_shift(sum0 + sum1 - p3_in1));
}
132
void idct4_c(const tran_low_t *input, tran_low_t *output) {
  // 1-D 4-point inverse DCT butterfly. Inputs are narrowed to int16_t before
  // use, and stage-1 results are stored as int16_t.
  int16_t even_sum, even_diff, odd_lo, odd_hi;
  tran_high_t acc;

  // stage 1: even pair (inputs 0 and 2), then odd pair (inputs 1 and 3).
  acc = ((int16_t)input[0] + (int16_t)input[2]) * cospi_16_64;
  even_sum = WRAPLOW(dct_const_round_shift(acc));
  acc = ((int16_t)input[0] - (int16_t)input[2]) * cospi_16_64;
  even_diff = WRAPLOW(dct_const_round_shift(acc));
  acc = (int16_t)input[1] * cospi_24_64 - (int16_t)input[3] * cospi_8_64;
  odd_lo = WRAPLOW(dct_const_round_shift(acc));
  acc = (int16_t)input[1] * cospi_8_64 + (int16_t)input[3] * cospi_24_64;
  odd_hi = WRAPLOW(dct_const_round_shift(acc));

  // stage 2: combine even and odd halves.
  output[0] = WRAPLOW(even_sum + odd_hi);
  output[1] = WRAPLOW(even_diff + odd_lo);
  output[2] = WRAPLOW(even_diff - odd_lo);
  output[3] = WRAPLOW(even_sum - odd_hi);
}
153
void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  // 2-D 4x4 inverse DCT: idct4_c() over each row, then over each column. The
  // column results are rounded by 4 bits and added to |dest| through
  // clip_pixel_add().
  tran_low_t row_pass[4 * 4];
  tran_low_t col_in[4], col_out[4];
  int r, c;

  // Rows
  for (r = 0; r < 4; ++r) idct4_c(input + 4 * r, row_pass + 4 * r);

  // Columns
  for (c = 0; c < 4; ++c) {
    for (r = 0; r < 4; ++r) col_in[r] = row_pass[r * 4 + c];
    idct4_c(col_in, col_out);
    for (r = 0; r < 4; ++r) {
      const int pos = r * stride + c;
      dest[pos] = clip_pixel_add(dest[pos], ROUND_POWER_OF_TWO(col_out[r], 4));
    }
  }
}
177
vpx_idct4x4_1_add_c(const tran_low_t * input,uint8_t * dest,int stride)178 void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
179 int i;
180 tran_high_t a1;
181 tran_low_t out =
182 WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64));
183
184 out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
185 a1 = ROUND_POWER_OF_TWO(out, 4);
186
187 for (i = 0; i < 4; i++) {
188 dest[0] = clip_pixel_add(dest[0], a1);
189 dest[1] = clip_pixel_add(dest[1], a1);
190 dest[2] = clip_pixel_add(dest[2], a1);
191 dest[3] = clip_pixel_add(dest[3], a1);
192 dest += stride;
193 }
194 }
195
void iadst8_c(const tran_low_t *input, tran_low_t *output) {
  // 1-D 8-point inverse transform (ADST, going by the function name) computed
  // in three butterfly stages. x[] holds the working values and s[] the
  // per-stage intermediates, which are deliberately truncated to int.
  int s[8];
  tran_high_t x[8];
  tran_high_t any_nonzero = 0;
  int k;

  // Load in permuted order: even slots read from the top of |input| going
  // down (7, 5, 3, 1), odd slots read from the bottom going up (0, 2, 4, 6).
  for (k = 0; k < 4; ++k) {
    x[2 * k] = input[7 - 2 * k];
    x[2 * k + 1] = input[2 * k];
  }

  // An all-zero input produces an all-zero output; skip the arithmetic.
  for (k = 0; k < 8; ++k) any_nonzero |= x[k];
  if (!any_nonzero) {
    memset(output, 0, 8 * sizeof(*output));
    return;
  }

  // stage 1
  s[0] = (int)(cospi_2_64 * x[0] + cospi_30_64 * x[1]);
  s[1] = (int)(cospi_30_64 * x[0] - cospi_2_64 * x[1]);
  s[2] = (int)(cospi_10_64 * x[2] + cospi_22_64 * x[3]);
  s[3] = (int)(cospi_22_64 * x[2] - cospi_10_64 * x[3]);
  s[4] = (int)(cospi_18_64 * x[4] + cospi_14_64 * x[5]);
  s[5] = (int)(cospi_14_64 * x[4] - cospi_18_64 * x[5]);
  s[6] = (int)(cospi_26_64 * x[6] + cospi_6_64 * x[7]);
  s[7] = (int)(cospi_6_64 * x[6] - cospi_26_64 * x[7]);

  // Sum/difference of entries four apart.
  for (k = 0; k < 4; ++k) {
    x[k] = WRAPLOW(dct_const_round_shift(s[k] + s[k + 4]));
    x[k + 4] = WRAPLOW(dct_const_round_shift(s[k] - s[k + 4]));
  }

  // stage 2
  s[0] = (int)x[0];
  s[1] = (int)x[1];
  s[2] = (int)x[2];
  s[3] = (int)x[3];
  s[4] = (int)(cospi_8_64 * x[4] + cospi_24_64 * x[5]);
  s[5] = (int)(cospi_24_64 * x[4] - cospi_8_64 * x[5]);
  s[6] = (int)(-cospi_24_64 * x[6] + cospi_8_64 * x[7]);
  s[7] = (int)(cospi_8_64 * x[6] + cospi_24_64 * x[7]);

  // Sum/difference of entries two apart; only the upper half was multiplied
  // by constants above, so only it needs the rounding shift.
  for (k = 0; k < 2; ++k) {
    x[k] = WRAPLOW(s[k] + s[k + 2]);
    x[k + 2] = WRAPLOW(s[k] - s[k + 2]);
    x[k + 4] = WRAPLOW(dct_const_round_shift(s[k + 4] + s[k + 6]));
    x[k + 6] = WRAPLOW(dct_const_round_shift(s[k + 4] - s[k + 6]));
  }

  // stage 3
  s[2] = (int)(cospi_16_64 * (x[2] + x[3]));
  s[3] = (int)(cospi_16_64 * (x[2] - x[3]));
  s[6] = (int)(cospi_16_64 * (x[6] + x[7]));
  s[7] = (int)(cospi_16_64 * (x[6] - x[7]));

  x[2] = WRAPLOW(dct_const_round_shift(s[2]));
  x[3] = WRAPLOW(dct_const_round_shift(s[3]));
  x[6] = WRAPLOW(dct_const_round_shift(s[6]));
  x[7] = WRAPLOW(dct_const_round_shift(s[7]));

  // Final permutation; odd-indexed outputs are negated.
  output[0] = WRAPLOW(x[0]);
  output[1] = WRAPLOW(-x[4]);
  output[2] = WRAPLOW(x[6]);
  output[3] = WRAPLOW(-x[2]);
  output[4] = WRAPLOW(x[3]);
  output[5] = WRAPLOW(-x[7]);
  output[6] = WRAPLOW(x[5]);
  output[7] = WRAPLOW(-x[1]);
}
270
void idct8_c(const tran_low_t *input, tran_low_t *output) {
  // 1-D 8-point inverse DCT in four butterfly stages. Intermediate values are
  // kept in int16_t arrays that alternate roles between stages.
  int16_t a[8], b[8];
  tran_high_t rot_lo, rot_hi;
  int k;

  // stage 1: even inputs pass through (reordered); odd inputs are rotated.
  a[0] = (int16_t)input[0];
  a[1] = (int16_t)input[2];
  a[2] = (int16_t)input[4];
  a[3] = (int16_t)input[6];
  rot_lo = (int16_t)input[1] * cospi_28_64 - (int16_t)input[7] * cospi_4_64;
  rot_hi = (int16_t)input[1] * cospi_4_64 + (int16_t)input[7] * cospi_28_64;
  a[4] = WRAPLOW(dct_const_round_shift(rot_lo));
  a[7] = WRAPLOW(dct_const_round_shift(rot_hi));
  rot_lo = (int16_t)input[5] * cospi_12_64 - (int16_t)input[3] * cospi_20_64;
  rot_hi = (int16_t)input[5] * cospi_20_64 + (int16_t)input[3] * cospi_12_64;
  a[5] = WRAPLOW(dct_const_round_shift(rot_lo));
  a[6] = WRAPLOW(dct_const_round_shift(rot_hi));

  // stage 2
  rot_lo = (a[0] + a[2]) * cospi_16_64;
  rot_hi = (a[0] - a[2]) * cospi_16_64;
  b[0] = WRAPLOW(dct_const_round_shift(rot_lo));
  b[1] = WRAPLOW(dct_const_round_shift(rot_hi));
  rot_lo = a[1] * cospi_24_64 - a[3] * cospi_8_64;
  rot_hi = a[1] * cospi_8_64 + a[3] * cospi_24_64;
  b[2] = WRAPLOW(dct_const_round_shift(rot_lo));
  b[3] = WRAPLOW(dct_const_round_shift(rot_hi));
  b[4] = WRAPLOW(a[4] + a[5]);
  b[5] = WRAPLOW(a[4] - a[5]);
  b[6] = WRAPLOW(-a[6] + a[7]);
  b[7] = WRAPLOW(a[6] + a[7]);

  // stage 3
  a[0] = WRAPLOW(b[0] + b[3]);
  a[1] = WRAPLOW(b[1] + b[2]);
  a[2] = WRAPLOW(b[1] - b[2]);
  a[3] = WRAPLOW(b[0] - b[3]);
  a[4] = b[4];
  rot_lo = (b[6] - b[5]) * cospi_16_64;
  rot_hi = (b[5] + b[6]) * cospi_16_64;
  a[5] = WRAPLOW(dct_const_round_shift(rot_lo));
  a[6] = WRAPLOW(dct_const_round_shift(rot_hi));
  a[7] = b[7];

  // stage 4: mirrored sum/difference (k paired with 7 - k).
  for (k = 0; k < 4; ++k) {
    output[k] = WRAPLOW(a[k] + a[7 - k]);
    output[7 - k] = WRAPLOW(a[k] - a[7 - k]);
  }
}
325
void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  // 2-D 8x8 inverse DCT over all 64 coefficients: idct8_c() on every row,
  // then on every column. Column results are rounded by 5 bits and added to
  // |dest| through clip_pixel_add().
  tran_low_t row_pass[8 * 8];
  tran_low_t col_in[8], col_out[8];
  int r, c;

  // First transform rows
  for (r = 0; r < 8; ++r) idct8_c(input + 8 * r, row_pass + 8 * r);

  // Then transform columns
  for (c = 0; c < 8; ++c) {
    for (r = 0; r < 8; ++r) col_in[r] = row_pass[r * 8 + c];
    idct8_c(col_in, col_out);
    for (r = 0; r < 8; ++r) {
      const int pos = r * stride + c;
      dest[pos] = clip_pixel_add(dest[pos], ROUND_POWER_OF_TWO(col_out[r], 5));
    }
  }
}
349
void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  // Reduced 8x8 inverse DCT: only the first four input rows are transformed,
  // and the remaining rows of the intermediate buffer stay zero from its
  // initializer. The column pass and output rounding (5 bits) are the same as
  // in the full 8x8 transform.
  tran_low_t row_pass[8 * 8] = { 0 };
  tran_low_t col_in[8], col_out[8];
  int r, c;

  // First transform rows.
  // Only the first 4 rows have non-zero coefficients.
  for (r = 0; r < 4; ++r) idct8_c(input + 8 * r, row_pass + 8 * r);

  // Then transform columns
  for (c = 0; c < 8; ++c) {
    for (r = 0; r < 8; ++r) col_in[r] = row_pass[r * 8 + c];
    idct8_c(col_in, col_out);
    for (r = 0; r < 8; ++r) {
      const int pos = r * stride + c;
      dest[pos] = clip_pixel_add(dest[pos], ROUND_POWER_OF_TWO(col_out[r], 5));
    }
  }
}
374
vpx_idct8x8_1_add_c(const tran_low_t * input,uint8_t * dest,int stride)375 void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
376 int i, j;
377 tran_high_t a1;
378 tran_low_t out =
379 WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64));
380
381 out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
382 a1 = ROUND_POWER_OF_TWO(out, 5);
383 for (j = 0; j < 8; ++j) {
384 for (i = 0; i < 8; ++i) dest[i] = clip_pixel_add(dest[i], a1);
385 dest += stride;
386 }
387 }
388
void iadst16_c(const tran_low_t *input, tran_low_t *output) {
  // 1-D 16-point inverse transform (ADST, going by the function name) computed
  // in four butterfly stages. x[] holds the working values and s[] the
  // per-stage intermediates.
  tran_high_t s[16];
  tran_high_t x[16];
  tran_high_t any_nonzero = 0;
  int k;

  // Load in permuted order: even slots read from the top of |input| going
  // down (15, 13, ..., 1), odd slots from the bottom going up (0, 2, ..., 14).
  for (k = 0; k < 8; ++k) {
    x[2 * k] = input[15 - 2 * k];
    x[2 * k + 1] = input[2 * k];
  }

  // An all-zero input produces an all-zero output; skip the arithmetic.
  for (k = 0; k < 16; ++k) any_nonzero |= x[k];
  if (!any_nonzero) {
    memset(output, 0, 16 * sizeof(*output));
    return;
  }

  // stage 1
  s[0] = x[0] * cospi_1_64 + x[1] * cospi_31_64;
  s[1] = x[0] * cospi_31_64 - x[1] * cospi_1_64;
  s[2] = x[2] * cospi_5_64 + x[3] * cospi_27_64;
  s[3] = x[2] * cospi_27_64 - x[3] * cospi_5_64;
  s[4] = x[4] * cospi_9_64 + x[5] * cospi_23_64;
  s[5] = x[4] * cospi_23_64 - x[5] * cospi_9_64;
  s[6] = x[6] * cospi_13_64 + x[7] * cospi_19_64;
  s[7] = x[6] * cospi_19_64 - x[7] * cospi_13_64;
  s[8] = x[8] * cospi_17_64 + x[9] * cospi_15_64;
  s[9] = x[8] * cospi_15_64 - x[9] * cospi_17_64;
  s[10] = x[10] * cospi_21_64 + x[11] * cospi_11_64;
  s[11] = x[10] * cospi_11_64 - x[11] * cospi_21_64;
  s[12] = x[12] * cospi_25_64 + x[13] * cospi_7_64;
  s[13] = x[12] * cospi_7_64 - x[13] * cospi_25_64;
  s[14] = x[14] * cospi_29_64 + x[15] * cospi_3_64;
  s[15] = x[14] * cospi_3_64 - x[15] * cospi_29_64;

  // Sum/difference of entries eight apart.
  for (k = 0; k < 8; ++k) {
    x[k] = WRAPLOW(dct_const_round_shift(s[k] + s[k + 8]));
    x[k + 8] = WRAPLOW(dct_const_round_shift(s[k] - s[k + 8]));
  }

  // stage 2
  for (k = 0; k < 8; ++k) s[k] = x[k];
  s[8] = x[8] * cospi_4_64 + x[9] * cospi_28_64;
  s[9] = x[8] * cospi_28_64 - x[9] * cospi_4_64;
  s[10] = x[10] * cospi_20_64 + x[11] * cospi_12_64;
  s[11] = x[10] * cospi_12_64 - x[11] * cospi_20_64;
  s[12] = -x[12] * cospi_28_64 + x[13] * cospi_4_64;
  s[13] = x[12] * cospi_4_64 + x[13] * cospi_28_64;
  s[14] = -x[14] * cospi_12_64 + x[15] * cospi_20_64;
  s[15] = x[14] * cospi_20_64 + x[15] * cospi_12_64;

  // Sum/difference of entries four apart. The lower half was only copied, so
  // it needs no rounding shift; the upper half was multiplied and does.
  for (k = 0; k < 4; ++k) {
    x[k] = WRAPLOW(s[k] + s[k + 4]);
    x[k + 4] = WRAPLOW(s[k] - s[k + 4]);
    x[k + 8] = WRAPLOW(dct_const_round_shift(s[k + 8] + s[k + 12]));
    x[k + 12] = WRAPLOW(dct_const_round_shift(s[k + 8] - s[k + 12]));
  }

  // stage 3
  for (k = 0; k < 4; ++k) s[k] = x[k];
  s[4] = x[4] * cospi_8_64 + x[5] * cospi_24_64;
  s[5] = x[4] * cospi_24_64 - x[5] * cospi_8_64;
  s[6] = -x[6] * cospi_24_64 + x[7] * cospi_8_64;
  s[7] = x[6] * cospi_8_64 + x[7] * cospi_24_64;
  for (k = 8; k < 12; ++k) s[k] = x[k];
  s[12] = x[12] * cospi_8_64 + x[13] * cospi_24_64;
  s[13] = x[12] * cospi_24_64 - x[13] * cospi_8_64;
  s[14] = -x[14] * cospi_24_64 + x[15] * cospi_8_64;
  s[15] = x[14] * cospi_8_64 + x[15] * cospi_24_64;

  // Sum/difference of entries two apart within each group of four.
  for (k = 0; k < 2; ++k) {
    x[k] = WRAPLOW(s[k] + s[k + 2]);
    x[k + 2] = WRAPLOW(s[k] - s[k + 2]);
    x[k + 4] = WRAPLOW(dct_const_round_shift(s[k + 4] + s[k + 6]));
    x[k + 6] = WRAPLOW(dct_const_round_shift(s[k + 4] - s[k + 6]));
    x[k + 8] = WRAPLOW(s[k + 8] + s[k + 10]);
    x[k + 10] = WRAPLOW(s[k + 8] - s[k + 10]);
    x[k + 12] = WRAPLOW(dct_const_round_shift(s[k + 12] + s[k + 14]));
    x[k + 14] = WRAPLOW(dct_const_round_shift(s[k + 12] - s[k + 14]));
  }

  // stage 4
  s[2] = (-cospi_16_64) * (x[2] + x[3]);
  s[3] = cospi_16_64 * (x[2] - x[3]);
  s[6] = cospi_16_64 * (x[6] + x[7]);
  s[7] = cospi_16_64 * (-x[6] + x[7]);
  s[10] = cospi_16_64 * (x[10] + x[11]);
  s[11] = cospi_16_64 * (-x[10] + x[11]);
  s[14] = (-cospi_16_64) * (x[14] + x[15]);
  s[15] = cospi_16_64 * (x[14] - x[15]);

  x[2] = WRAPLOW(dct_const_round_shift(s[2]));
  x[3] = WRAPLOW(dct_const_round_shift(s[3]));
  x[6] = WRAPLOW(dct_const_round_shift(s[6]));
  x[7] = WRAPLOW(dct_const_round_shift(s[7]));
  x[10] = WRAPLOW(dct_const_round_shift(s[10]));
  x[11] = WRAPLOW(dct_const_round_shift(s[11]));
  x[14] = WRAPLOW(dct_const_round_shift(s[14]));
  x[15] = WRAPLOW(dct_const_round_shift(s[15]));

  // Final permutation; outputs 1, 3, 13 and 15 are negated.
  output[0] = WRAPLOW(x[0]);
  output[1] = WRAPLOW(-x[8]);
  output[2] = WRAPLOW(x[12]);
  output[3] = WRAPLOW(-x[4]);
  output[4] = WRAPLOW(x[6]);
  output[5] = WRAPLOW(x[14]);
  output[6] = WRAPLOW(x[10]);
  output[7] = WRAPLOW(x[2]);
  output[8] = WRAPLOW(x[3]);
  output[9] = WRAPLOW(x[11]);
  output[10] = WRAPLOW(x[15]);
  output[11] = WRAPLOW(x[7]);
  output[12] = WRAPLOW(x[5]);
  output[13] = WRAPLOW(-x[13]);
  output[14] = WRAPLOW(x[9]);
  output[15] = WRAPLOW(-x[1]);
}
556
void idct16_c(const tran_low_t *input, tran_low_t *output) {
  // 1-D 16-point inverse DCT in seven butterfly stages. Intermediate values
  // are kept in two int16_t arrays (a and b) that alternate as source and
  // destination from one stage to the next.
  int16_t a[16], b[16];
  tran_high_t rot_lo, rot_hi;
  int k;

  // stage 1: load the inputs in permuted order, narrowing each to int16_t.
  a[0] = (int16_t)input[0];
  a[1] = (int16_t)input[8];
  a[2] = (int16_t)input[4];
  a[3] = (int16_t)input[12];
  a[4] = (int16_t)input[2];
  a[5] = (int16_t)input[10];
  a[6] = (int16_t)input[6];
  a[7] = (int16_t)input[14];
  a[8] = (int16_t)input[1];
  a[9] = (int16_t)input[9];
  a[10] = (int16_t)input[5];
  a[11] = (int16_t)input[13];
  a[12] = (int16_t)input[3];
  a[13] = (int16_t)input[11];
  a[14] = (int16_t)input[7];
  a[15] = (int16_t)input[15];

  // stage 2: lower half passes through, upper half is rotated in pairs.
  for (k = 0; k < 8; ++k) b[k] = a[k];

  rot_lo = a[8] * cospi_30_64 - a[15] * cospi_2_64;
  rot_hi = a[8] * cospi_2_64 + a[15] * cospi_30_64;
  b[8] = WRAPLOW(dct_const_round_shift(rot_lo));
  b[15] = WRAPLOW(dct_const_round_shift(rot_hi));

  rot_lo = a[9] * cospi_14_64 - a[14] * cospi_18_64;
  rot_hi = a[9] * cospi_18_64 + a[14] * cospi_14_64;
  b[9] = WRAPLOW(dct_const_round_shift(rot_lo));
  b[14] = WRAPLOW(dct_const_round_shift(rot_hi));

  rot_lo = a[10] * cospi_22_64 - a[13] * cospi_10_64;
  rot_hi = a[10] * cospi_10_64 + a[13] * cospi_22_64;
  b[10] = WRAPLOW(dct_const_round_shift(rot_lo));
  b[13] = WRAPLOW(dct_const_round_shift(rot_hi));

  rot_lo = a[11] * cospi_6_64 - a[12] * cospi_26_64;
  rot_hi = a[11] * cospi_26_64 + a[12] * cospi_6_64;
  b[11] = WRAPLOW(dct_const_round_shift(rot_lo));
  b[12] = WRAPLOW(dct_const_round_shift(rot_hi));

  // stage 3
  for (k = 0; k < 4; ++k) a[k] = b[k];

  rot_lo = b[4] * cospi_28_64 - b[7] * cospi_4_64;
  rot_hi = b[4] * cospi_4_64 + b[7] * cospi_28_64;
  a[4] = WRAPLOW(dct_const_round_shift(rot_lo));
  a[7] = WRAPLOW(dct_const_round_shift(rot_hi));
  rot_lo = b[5] * cospi_12_64 - b[6] * cospi_20_64;
  rot_hi = b[5] * cospi_20_64 + b[6] * cospi_12_64;
  a[5] = WRAPLOW(dct_const_round_shift(rot_lo));
  a[6] = WRAPLOW(dct_const_round_shift(rot_hi));

  a[8] = WRAPLOW(b[8] + b[9]);
  a[9] = WRAPLOW(b[8] - b[9]);
  a[10] = WRAPLOW(-b[10] + b[11]);
  a[11] = WRAPLOW(b[10] + b[11]);
  a[12] = WRAPLOW(b[12] + b[13]);
  a[13] = WRAPLOW(b[12] - b[13]);
  a[14] = WRAPLOW(-b[14] + b[15]);
  a[15] = WRAPLOW(b[14] + b[15]);

  // stage 4
  rot_lo = (a[0] + a[1]) * cospi_16_64;
  rot_hi = (a[0] - a[1]) * cospi_16_64;
  b[0] = WRAPLOW(dct_const_round_shift(rot_lo));
  b[1] = WRAPLOW(dct_const_round_shift(rot_hi));
  rot_lo = a[2] * cospi_24_64 - a[3] * cospi_8_64;
  rot_hi = a[2] * cospi_8_64 + a[3] * cospi_24_64;
  b[2] = WRAPLOW(dct_const_round_shift(rot_lo));
  b[3] = WRAPLOW(dct_const_round_shift(rot_hi));
  b[4] = WRAPLOW(a[4] + a[5]);
  b[5] = WRAPLOW(a[4] - a[5]);
  b[6] = WRAPLOW(-a[6] + a[7]);
  b[7] = WRAPLOW(a[6] + a[7]);

  b[8] = a[8];
  b[15] = a[15];
  rot_lo = -a[9] * cospi_8_64 + a[14] * cospi_24_64;
  rot_hi = a[9] * cospi_24_64 + a[14] * cospi_8_64;
  b[9] = WRAPLOW(dct_const_round_shift(rot_lo));
  b[14] = WRAPLOW(dct_const_round_shift(rot_hi));
  rot_lo = -a[10] * cospi_24_64 - a[13] * cospi_8_64;
  rot_hi = -a[10] * cospi_8_64 + a[13] * cospi_24_64;
  b[10] = WRAPLOW(dct_const_round_shift(rot_lo));
  b[13] = WRAPLOW(dct_const_round_shift(rot_hi));
  b[11] = a[11];
  b[12] = a[12];

  // stage 5
  a[0] = WRAPLOW(b[0] + b[3]);
  a[1] = WRAPLOW(b[1] + b[2]);
  a[2] = WRAPLOW(b[1] - b[2]);
  a[3] = WRAPLOW(b[0] - b[3]);
  a[4] = b[4];
  rot_lo = (b[6] - b[5]) * cospi_16_64;
  rot_hi = (b[5] + b[6]) * cospi_16_64;
  a[5] = WRAPLOW(dct_const_round_shift(rot_lo));
  a[6] = WRAPLOW(dct_const_round_shift(rot_hi));
  a[7] = b[7];

  a[8] = WRAPLOW(b[8] + b[11]);
  a[9] = WRAPLOW(b[9] + b[10]);
  a[10] = WRAPLOW(b[9] - b[10]);
  a[11] = WRAPLOW(b[8] - b[11]);
  a[12] = WRAPLOW(-b[12] + b[15]);
  a[13] = WRAPLOW(-b[13] + b[14]);
  a[14] = WRAPLOW(b[13] + b[14]);
  a[15] = WRAPLOW(b[12] + b[15]);

  // stage 6: mirrored sum/difference on the lower half (k paired with 7 - k).
  for (k = 0; k < 4; ++k) {
    b[k] = WRAPLOW(a[k] + a[7 - k]);
    b[7 - k] = WRAPLOW(a[k] - a[7 - k]);
  }
  b[8] = a[8];
  b[9] = a[9];
  rot_lo = (-a[10] + a[13]) * cospi_16_64;
  rot_hi = (a[10] + a[13]) * cospi_16_64;
  b[10] = WRAPLOW(dct_const_round_shift(rot_lo));
  b[13] = WRAPLOW(dct_const_round_shift(rot_hi));
  rot_lo = (-a[11] + a[12]) * cospi_16_64;
  rot_hi = (a[11] + a[12]) * cospi_16_64;
  b[11] = WRAPLOW(dct_const_round_shift(rot_lo));
  b[12] = WRAPLOW(dct_const_round_shift(rot_hi));
  b[14] = a[14];
  b[15] = a[15];

  // stage 7: mirrored sum/difference across the full width (k with 15 - k).
  for (k = 0; k < 8; ++k) {
    output[k] = WRAPLOW(b[k] + b[15 - k]);
    output[15 - k] = WRAPLOW(b[k] - b[15 - k]);
  }
}
721
void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest,
                             int stride) {
  // 2-D 16x16 inverse DCT over all 256 coefficients: idct16_c() on every row,
  // then on every column. Column results are rounded by 6 bits and added to
  // |dest| through clip_pixel_add().
  tran_low_t row_pass[16 * 16];
  tran_low_t col_in[16], col_out[16];
  int r, c;

  // First transform rows
  for (r = 0; r < 16; ++r) idct16_c(input + 16 * r, row_pass + 16 * r);

  // Then transform columns
  for (c = 0; c < 16; ++c) {
    for (r = 0; r < 16; ++r) col_in[r] = row_pass[r * 16 + c];
    idct16_c(col_in, col_out);
    for (r = 0; r < 16; ++r) {
      const int pos = r * stride + c;
      dest[pos] = clip_pixel_add(dest[pos], ROUND_POWER_OF_TWO(col_out[r], 6));
    }
  }
}
746
void vpx_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest,
                            int stride) {
  // Reduced 16x16 inverse DCT: only the first eight input rows are
  // transformed, and the remaining rows of the intermediate buffer stay zero
  // from its initializer. The column pass and output rounding (6 bits) are
  // the same as in the full 16x16 transform.
  tran_low_t row_pass[16 * 16] = { 0 };
  tran_low_t col_in[16], col_out[16];
  int r, c;

  // First transform rows. Since all non-zero dct coefficients are in
  // upper-left 8x8 area, we only need to calculate first 8 rows here.
  for (r = 0; r < 8; ++r) idct16_c(input + 16 * r, row_pass + 16 * r);

  // Then transform columns
  for (c = 0; c < 16; ++c) {
    for (r = 0; r < 16; ++r) col_in[r] = row_pass[r * 16 + c];
    idct16_c(col_in, col_out);
    for (r = 0; r < 16; ++r) {
      const int pos = r * stride + c;
      dest[pos] = clip_pixel_add(dest[pos], ROUND_POWER_OF_TWO(col_out[r], 6));
    }
  }
}
772
void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest,
                            int stride) {
  // Reduced 16x16 inverse DCT: only the first four input rows are
  // transformed, and the remaining rows of the intermediate buffer stay zero
  // from its initializer. The column pass and output rounding (6 bits) are
  // the same as in the full 16x16 transform.
  tran_low_t row_pass[16 * 16] = { 0 };
  tran_low_t col_in[16], col_out[16];
  int r, c;

  // First transform rows. Since all non-zero dct coefficients are in
  // upper-left 4x4 area, we only need to calculate first 4 rows here.
  for (r = 0; r < 4; ++r) idct16_c(input + 16 * r, row_pass + 16 * r);

  // Then transform columns
  for (c = 0; c < 16; ++c) {
    for (r = 0; r < 16; ++r) col_in[r] = row_pass[r * 16 + c];
    idct16_c(col_in, col_out);
    for (r = 0; r < 16; ++r) {
      const int pos = r * stride + c;
      dest[pos] = clip_pixel_add(dest[pos], ROUND_POWER_OF_TWO(col_out[r], 6));
    }
  }
}
798
vpx_idct16x16_1_add_c(const tran_low_t * input,uint8_t * dest,int stride)799 void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
800 int i, j;
801 tran_high_t a1;
802 tran_low_t out =
803 WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64));
804
805 out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
806 a1 = ROUND_POWER_OF_TWO(out, 6);
807 for (j = 0; j < 16; ++j) {
808 for (i = 0; i < 16; ++i) dest[i] = clip_pixel_add(dest[i], a1);
809 dest += stride;
810 }
811 }
812
idct32_c(const tran_low_t * input,tran_low_t * output)813 void idct32_c(const tran_low_t *input, tran_low_t *output) {
814 int16_t step1[32], step2[32];
815 tran_high_t temp1, temp2;
816
817 // stage 1
818 step1[0] = (int16_t)input[0];
819 step1[1] = (int16_t)input[16];
820 step1[2] = (int16_t)input[8];
821 step1[3] = (int16_t)input[24];
822 step1[4] = (int16_t)input[4];
823 step1[5] = (int16_t)input[20];
824 step1[6] = (int16_t)input[12];
825 step1[7] = (int16_t)input[28];
826 step1[8] = (int16_t)input[2];
827 step1[9] = (int16_t)input[18];
828 step1[10] = (int16_t)input[10];
829 step1[11] = (int16_t)input[26];
830 step1[12] = (int16_t)input[6];
831 step1[13] = (int16_t)input[22];
832 step1[14] = (int16_t)input[14];
833 step1[15] = (int16_t)input[30];
834
835 temp1 = (int16_t)input[1] * cospi_31_64 - (int16_t)input[31] * cospi_1_64;
836 temp2 = (int16_t)input[1] * cospi_1_64 + (int16_t)input[31] * cospi_31_64;
837 step1[16] = WRAPLOW(dct_const_round_shift(temp1));
838 step1[31] = WRAPLOW(dct_const_round_shift(temp2));
839
840 temp1 = (int16_t)input[17] * cospi_15_64 - (int16_t)input[15] * cospi_17_64;
841 temp2 = (int16_t)input[17] * cospi_17_64 + (int16_t)input[15] * cospi_15_64;
842 step1[17] = WRAPLOW(dct_const_round_shift(temp1));
843 step1[30] = WRAPLOW(dct_const_round_shift(temp2));
844
845 temp1 = (int16_t)input[9] * cospi_23_64 - (int16_t)input[23] * cospi_9_64;
846 temp2 = (int16_t)input[9] * cospi_9_64 + (int16_t)input[23] * cospi_23_64;
847 step1[18] = WRAPLOW(dct_const_round_shift(temp1));
848 step1[29] = WRAPLOW(dct_const_round_shift(temp2));
849
850 temp1 = (int16_t)input[25] * cospi_7_64 - (int16_t)input[7] * cospi_25_64;
851 temp2 = (int16_t)input[25] * cospi_25_64 + (int16_t)input[7] * cospi_7_64;
852 step1[19] = WRAPLOW(dct_const_round_shift(temp1));
853 step1[28] = WRAPLOW(dct_const_round_shift(temp2));
854
855 temp1 = (int16_t)input[5] * cospi_27_64 - (int16_t)input[27] * cospi_5_64;
856 temp2 = (int16_t)input[5] * cospi_5_64 + (int16_t)input[27] * cospi_27_64;
857 step1[20] = WRAPLOW(dct_const_round_shift(temp1));
858 step1[27] = WRAPLOW(dct_const_round_shift(temp2));
859
860 temp1 = (int16_t)input[21] * cospi_11_64 - (int16_t)input[11] * cospi_21_64;
861 temp2 = (int16_t)input[21] * cospi_21_64 + (int16_t)input[11] * cospi_11_64;
862 step1[21] = WRAPLOW(dct_const_round_shift(temp1));
863 step1[26] = WRAPLOW(dct_const_round_shift(temp2));
864
865 temp1 = (int16_t)input[13] * cospi_19_64 - (int16_t)input[19] * cospi_13_64;
866 temp2 = (int16_t)input[13] * cospi_13_64 + (int16_t)input[19] * cospi_19_64;
867 step1[22] = WRAPLOW(dct_const_round_shift(temp1));
868 step1[25] = WRAPLOW(dct_const_round_shift(temp2));
869
870 temp1 = (int16_t)input[29] * cospi_3_64 - (int16_t)input[3] * cospi_29_64;
871 temp2 = (int16_t)input[29] * cospi_29_64 + (int16_t)input[3] * cospi_3_64;
872 step1[23] = WRAPLOW(dct_const_round_shift(temp1));
873 step1[24] = WRAPLOW(dct_const_round_shift(temp2));
874
875 // stage 2
876 step2[0] = step1[0];
877 step2[1] = step1[1];
878 step2[2] = step1[2];
879 step2[3] = step1[3];
880 step2[4] = step1[4];
881 step2[5] = step1[5];
882 step2[6] = step1[6];
883 step2[7] = step1[7];
884
885 temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
886 temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
887 step2[8] = WRAPLOW(dct_const_round_shift(temp1));
888 step2[15] = WRAPLOW(dct_const_round_shift(temp2));
889
890 temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
891 temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
892 step2[9] = WRAPLOW(dct_const_round_shift(temp1));
893 step2[14] = WRAPLOW(dct_const_round_shift(temp2));
894
895 temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
896 temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
897 step2[10] = WRAPLOW(dct_const_round_shift(temp1));
898 step2[13] = WRAPLOW(dct_const_round_shift(temp2));
899
900 temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
901 temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
902 step2[11] = WRAPLOW(dct_const_round_shift(temp1));
903 step2[12] = WRAPLOW(dct_const_round_shift(temp2));
904
905 step2[16] = WRAPLOW(step1[16] + step1[17]);
906 step2[17] = WRAPLOW(step1[16] - step1[17]);
907 step2[18] = WRAPLOW(-step1[18] + step1[19]);
908 step2[19] = WRAPLOW(step1[18] + step1[19]);
909 step2[20] = WRAPLOW(step1[20] + step1[21]);
910 step2[21] = WRAPLOW(step1[20] - step1[21]);
911 step2[22] = WRAPLOW(-step1[22] + step1[23]);
912 step2[23] = WRAPLOW(step1[22] + step1[23]);
913 step2[24] = WRAPLOW(step1[24] + step1[25]);
914 step2[25] = WRAPLOW(step1[24] - step1[25]);
915 step2[26] = WRAPLOW(-step1[26] + step1[27]);
916 step2[27] = WRAPLOW(step1[26] + step1[27]);
917 step2[28] = WRAPLOW(step1[28] + step1[29]);
918 step2[29] = WRAPLOW(step1[28] - step1[29]);
919 step2[30] = WRAPLOW(-step1[30] + step1[31]);
920 step2[31] = WRAPLOW(step1[30] + step1[31]);
921
922 // stage 3
923 step1[0] = step2[0];
924 step1[1] = step2[1];
925 step1[2] = step2[2];
926 step1[3] = step2[3];
927
928 temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
929 temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
930 step1[4] = WRAPLOW(dct_const_round_shift(temp1));
931 step1[7] = WRAPLOW(dct_const_round_shift(temp2));
932 temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
933 temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
934 step1[5] = WRAPLOW(dct_const_round_shift(temp1));
935 step1[6] = WRAPLOW(dct_const_round_shift(temp2));
936
937 step1[8] = WRAPLOW(step2[8] + step2[9]);
938 step1[9] = WRAPLOW(step2[8] - step2[9]);
939 step1[10] = WRAPLOW(-step2[10] + step2[11]);
940 step1[11] = WRAPLOW(step2[10] + step2[11]);
941 step1[12] = WRAPLOW(step2[12] + step2[13]);
942 step1[13] = WRAPLOW(step2[12] - step2[13]);
943 step1[14] = WRAPLOW(-step2[14] + step2[15]);
944 step1[15] = WRAPLOW(step2[14] + step2[15]);
945
946 step1[16] = step2[16];
947 step1[31] = step2[31];
948 temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
949 temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
950 step1[17] = WRAPLOW(dct_const_round_shift(temp1));
951 step1[30] = WRAPLOW(dct_const_round_shift(temp2));
952 temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
953 temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
954 step1[18] = WRAPLOW(dct_const_round_shift(temp1));
955 step1[29] = WRAPLOW(dct_const_round_shift(temp2));
956 step1[19] = step2[19];
957 step1[20] = step2[20];
958 temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
959 temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
960 step1[21] = WRAPLOW(dct_const_round_shift(temp1));
961 step1[26] = WRAPLOW(dct_const_round_shift(temp2));
962 temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
963 temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
964 step1[22] = WRAPLOW(dct_const_round_shift(temp1));
965 step1[25] = WRAPLOW(dct_const_round_shift(temp2));
966 step1[23] = step2[23];
967 step1[24] = step2[24];
968 step1[27] = step2[27];
969 step1[28] = step2[28];
970
971 // stage 4
972 temp1 = (step1[0] + step1[1]) * cospi_16_64;
973 temp2 = (step1[0] - step1[1]) * cospi_16_64;
974 step2[0] = WRAPLOW(dct_const_round_shift(temp1));
975 step2[1] = WRAPLOW(dct_const_round_shift(temp2));
976 temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
977 temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
978 step2[2] = WRAPLOW(dct_const_round_shift(temp1));
979 step2[3] = WRAPLOW(dct_const_round_shift(temp2));
980 step2[4] = WRAPLOW(step1[4] + step1[5]);
981 step2[5] = WRAPLOW(step1[4] - step1[5]);
982 step2[6] = WRAPLOW(-step1[6] + step1[7]);
983 step2[7] = WRAPLOW(step1[6] + step1[7]);
984
985 step2[8] = step1[8];
986 step2[15] = step1[15];
987 temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
988 temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
989 step2[9] = WRAPLOW(dct_const_round_shift(temp1));
990 step2[14] = WRAPLOW(dct_const_round_shift(temp2));
991 temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
992 temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
993 step2[10] = WRAPLOW(dct_const_round_shift(temp1));
994 step2[13] = WRAPLOW(dct_const_round_shift(temp2));
995 step2[11] = step1[11];
996 step2[12] = step1[12];
997
998 step2[16] = WRAPLOW(step1[16] + step1[19]);
999 step2[17] = WRAPLOW(step1[17] + step1[18]);
1000 step2[18] = WRAPLOW(step1[17] - step1[18]);
1001 step2[19] = WRAPLOW(step1[16] - step1[19]);
1002 step2[20] = WRAPLOW(-step1[20] + step1[23]);
1003 step2[21] = WRAPLOW(-step1[21] + step1[22]);
1004 step2[22] = WRAPLOW(step1[21] + step1[22]);
1005 step2[23] = WRAPLOW(step1[20] + step1[23]);
1006
1007 step2[24] = WRAPLOW(step1[24] + step1[27]);
1008 step2[25] = WRAPLOW(step1[25] + step1[26]);
1009 step2[26] = WRAPLOW(step1[25] - step1[26]);
1010 step2[27] = WRAPLOW(step1[24] - step1[27]);
1011 step2[28] = WRAPLOW(-step1[28] + step1[31]);
1012 step2[29] = WRAPLOW(-step1[29] + step1[30]);
1013 step2[30] = WRAPLOW(step1[29] + step1[30]);
1014 step2[31] = WRAPLOW(step1[28] + step1[31]);
1015
1016 // stage 5
1017 step1[0] = WRAPLOW(step2[0] + step2[3]);
1018 step1[1] = WRAPLOW(step2[1] + step2[2]);
1019 step1[2] = WRAPLOW(step2[1] - step2[2]);
1020 step1[3] = WRAPLOW(step2[0] - step2[3]);
1021 step1[4] = step2[4];
1022 temp1 = (step2[6] - step2[5]) * cospi_16_64;
1023 temp2 = (step2[5] + step2[6]) * cospi_16_64;
1024 step1[5] = WRAPLOW(dct_const_round_shift(temp1));
1025 step1[6] = WRAPLOW(dct_const_round_shift(temp2));
1026 step1[7] = step2[7];
1027
1028 step1[8] = WRAPLOW(step2[8] + step2[11]);
1029 step1[9] = WRAPLOW(step2[9] + step2[10]);
1030 step1[10] = WRAPLOW(step2[9] - step2[10]);
1031 step1[11] = WRAPLOW(step2[8] - step2[11]);
1032 step1[12] = WRAPLOW(-step2[12] + step2[15]);
1033 step1[13] = WRAPLOW(-step2[13] + step2[14]);
1034 step1[14] = WRAPLOW(step2[13] + step2[14]);
1035 step1[15] = WRAPLOW(step2[12] + step2[15]);
1036
1037 step1[16] = step2[16];
1038 step1[17] = step2[17];
1039 temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
1040 temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
1041 step1[18] = WRAPLOW(dct_const_round_shift(temp1));
1042 step1[29] = WRAPLOW(dct_const_round_shift(temp2));
1043 temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
1044 temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
1045 step1[19] = WRAPLOW(dct_const_round_shift(temp1));
1046 step1[28] = WRAPLOW(dct_const_round_shift(temp2));
1047 temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
1048 temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
1049 step1[20] = WRAPLOW(dct_const_round_shift(temp1));
1050 step1[27] = WRAPLOW(dct_const_round_shift(temp2));
1051 temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
1052 temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
1053 step1[21] = WRAPLOW(dct_const_round_shift(temp1));
1054 step1[26] = WRAPLOW(dct_const_round_shift(temp2));
1055 step1[22] = step2[22];
1056 step1[23] = step2[23];
1057 step1[24] = step2[24];
1058 step1[25] = step2[25];
1059 step1[30] = step2[30];
1060 step1[31] = step2[31];
1061
1062 // stage 6
1063 step2[0] = WRAPLOW(step1[0] + step1[7]);
1064 step2[1] = WRAPLOW(step1[1] + step1[6]);
1065 step2[2] = WRAPLOW(step1[2] + step1[5]);
1066 step2[3] = WRAPLOW(step1[3] + step1[4]);
1067 step2[4] = WRAPLOW(step1[3] - step1[4]);
1068 step2[5] = WRAPLOW(step1[2] - step1[5]);
1069 step2[6] = WRAPLOW(step1[1] - step1[6]);
1070 step2[7] = WRAPLOW(step1[0] - step1[7]);
1071 step2[8] = step1[8];
1072 step2[9] = step1[9];
1073 temp1 = (-step1[10] + step1[13]) * cospi_16_64;
1074 temp2 = (step1[10] + step1[13]) * cospi_16_64;
1075 step2[10] = WRAPLOW(dct_const_round_shift(temp1));
1076 step2[13] = WRAPLOW(dct_const_round_shift(temp2));
1077 temp1 = (-step1[11] + step1[12]) * cospi_16_64;
1078 temp2 = (step1[11] + step1[12]) * cospi_16_64;
1079 step2[11] = WRAPLOW(dct_const_round_shift(temp1));
1080 step2[12] = WRAPLOW(dct_const_round_shift(temp2));
1081 step2[14] = step1[14];
1082 step2[15] = step1[15];
1083
1084 step2[16] = WRAPLOW(step1[16] + step1[23]);
1085 step2[17] = WRAPLOW(step1[17] + step1[22]);
1086 step2[18] = WRAPLOW(step1[18] + step1[21]);
1087 step2[19] = WRAPLOW(step1[19] + step1[20]);
1088 step2[20] = WRAPLOW(step1[19] - step1[20]);
1089 step2[21] = WRAPLOW(step1[18] - step1[21]);
1090 step2[22] = WRAPLOW(step1[17] - step1[22]);
1091 step2[23] = WRAPLOW(step1[16] - step1[23]);
1092
1093 step2[24] = WRAPLOW(-step1[24] + step1[31]);
1094 step2[25] = WRAPLOW(-step1[25] + step1[30]);
1095 step2[26] = WRAPLOW(-step1[26] + step1[29]);
1096 step2[27] = WRAPLOW(-step1[27] + step1[28]);
1097 step2[28] = WRAPLOW(step1[27] + step1[28]);
1098 step2[29] = WRAPLOW(step1[26] + step1[29]);
1099 step2[30] = WRAPLOW(step1[25] + step1[30]);
1100 step2[31] = WRAPLOW(step1[24] + step1[31]);
1101
1102 // stage 7
1103 step1[0] = WRAPLOW(step2[0] + step2[15]);
1104 step1[1] = WRAPLOW(step2[1] + step2[14]);
1105 step1[2] = WRAPLOW(step2[2] + step2[13]);
1106 step1[3] = WRAPLOW(step2[3] + step2[12]);
1107 step1[4] = WRAPLOW(step2[4] + step2[11]);
1108 step1[5] = WRAPLOW(step2[5] + step2[10]);
1109 step1[6] = WRAPLOW(step2[6] + step2[9]);
1110 step1[7] = WRAPLOW(step2[7] + step2[8]);
1111 step1[8] = WRAPLOW(step2[7] - step2[8]);
1112 step1[9] = WRAPLOW(step2[6] - step2[9]);
1113 step1[10] = WRAPLOW(step2[5] - step2[10]);
1114 step1[11] = WRAPLOW(step2[4] - step2[11]);
1115 step1[12] = WRAPLOW(step2[3] - step2[12]);
1116 step1[13] = WRAPLOW(step2[2] - step2[13]);
1117 step1[14] = WRAPLOW(step2[1] - step2[14]);
1118 step1[15] = WRAPLOW(step2[0] - step2[15]);
1119
1120 step1[16] = step2[16];
1121 step1[17] = step2[17];
1122 step1[18] = step2[18];
1123 step1[19] = step2[19];
1124 temp1 = (-step2[20] + step2[27]) * cospi_16_64;
1125 temp2 = (step2[20] + step2[27]) * cospi_16_64;
1126 step1[20] = WRAPLOW(dct_const_round_shift(temp1));
1127 step1[27] = WRAPLOW(dct_const_round_shift(temp2));
1128 temp1 = (-step2[21] + step2[26]) * cospi_16_64;
1129 temp2 = (step2[21] + step2[26]) * cospi_16_64;
1130 step1[21] = WRAPLOW(dct_const_round_shift(temp1));
1131 step1[26] = WRAPLOW(dct_const_round_shift(temp2));
1132 temp1 = (-step2[22] + step2[25]) * cospi_16_64;
1133 temp2 = (step2[22] + step2[25]) * cospi_16_64;
1134 step1[22] = WRAPLOW(dct_const_round_shift(temp1));
1135 step1[25] = WRAPLOW(dct_const_round_shift(temp2));
1136 temp1 = (-step2[23] + step2[24]) * cospi_16_64;
1137 temp2 = (step2[23] + step2[24]) * cospi_16_64;
1138 step1[23] = WRAPLOW(dct_const_round_shift(temp1));
1139 step1[24] = WRAPLOW(dct_const_round_shift(temp2));
1140 step1[28] = step2[28];
1141 step1[29] = step2[29];
1142 step1[30] = step2[30];
1143 step1[31] = step2[31];
1144
1145 // final stage
1146 output[0] = WRAPLOW(step1[0] + step1[31]);
1147 output[1] = WRAPLOW(step1[1] + step1[30]);
1148 output[2] = WRAPLOW(step1[2] + step1[29]);
1149 output[3] = WRAPLOW(step1[3] + step1[28]);
1150 output[4] = WRAPLOW(step1[4] + step1[27]);
1151 output[5] = WRAPLOW(step1[5] + step1[26]);
1152 output[6] = WRAPLOW(step1[6] + step1[25]);
1153 output[7] = WRAPLOW(step1[7] + step1[24]);
1154 output[8] = WRAPLOW(step1[8] + step1[23]);
1155 output[9] = WRAPLOW(step1[9] + step1[22]);
1156 output[10] = WRAPLOW(step1[10] + step1[21]);
1157 output[11] = WRAPLOW(step1[11] + step1[20]);
1158 output[12] = WRAPLOW(step1[12] + step1[19]);
1159 output[13] = WRAPLOW(step1[13] + step1[18]);
1160 output[14] = WRAPLOW(step1[14] + step1[17]);
1161 output[15] = WRAPLOW(step1[15] + step1[16]);
1162 output[16] = WRAPLOW(step1[15] - step1[16]);
1163 output[17] = WRAPLOW(step1[14] - step1[17]);
1164 output[18] = WRAPLOW(step1[13] - step1[18]);
1165 output[19] = WRAPLOW(step1[12] - step1[19]);
1166 output[20] = WRAPLOW(step1[11] - step1[20]);
1167 output[21] = WRAPLOW(step1[10] - step1[21]);
1168 output[22] = WRAPLOW(step1[9] - step1[22]);
1169 output[23] = WRAPLOW(step1[8] - step1[23]);
1170 output[24] = WRAPLOW(step1[7] - step1[24]);
1171 output[25] = WRAPLOW(step1[6] - step1[25]);
1172 output[26] = WRAPLOW(step1[5] - step1[26]);
1173 output[27] = WRAPLOW(step1[4] - step1[27]);
1174 output[28] = WRAPLOW(step1[3] - step1[28]);
1175 output[29] = WRAPLOW(step1[2] - step1[29]);
1176 output[30] = WRAPLOW(step1[1] - step1[30]);
1177 output[31] = WRAPLOW(step1[0] - step1[31]);
1178 }
1179
vpx_idct32x32_1024_add_c(const tran_low_t * input,uint8_t * dest,int stride)1180 void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
1181 int stride) {
1182 int i, j;
1183 tran_low_t out[32 * 32];
1184 tran_low_t *outptr = out;
1185 tran_low_t temp_in[32], temp_out[32];
1186
1187 // Rows
1188 for (i = 0; i < 32; ++i) {
1189 int16_t zero_coeff = 0;
1190 for (j = 0; j < 32; ++j) zero_coeff |= input[j];
1191
1192 if (zero_coeff)
1193 idct32_c(input, outptr);
1194 else
1195 memset(outptr, 0, sizeof(tran_low_t) * 32);
1196 input += 32;
1197 outptr += 32;
1198 }
1199
1200 // Columns
1201 for (i = 0; i < 32; ++i) {
1202 for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
1203 idct32_c(temp_in, temp_out);
1204 for (j = 0; j < 32; ++j) {
1205 dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
1206 ROUND_POWER_OF_TWO(temp_out[j], 6));
1207 }
1208 }
1209 }
1210
// 32x32 inverse transform for blocks whose non-zero coefficients are confined
// to the upper-left 16x16 corner. Only the first 16 rows are transformed; the
// remaining rows of the intermediate buffer keep their zero initialisation.
// The 6-bit-rounded result is added to |dest| through clip_pixel_add().
void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest,
                             int stride) {
  tran_low_t row_pass[32 * 32] = { 0 };
  tran_low_t col_in[32], col_out[32];
  int r, c;

  // Row pass over the first 16 rows only.
  for (r = 0; r < 16; ++r) {
    idct32_c(input + 32 * r, row_pass + 32 * r);
  }

  // Column pass over all 32 columns.
  for (c = 0; c < 32; ++c) {
    for (r = 0; r < 32; ++r) col_in[r] = row_pass[r * 32 + c];
    idct32_c(col_in, col_out);
    for (r = 0; r < 32; ++r) {
      uint8_t *const px = &dest[r * stride + c];
      *px = clip_pixel_add(*px, ROUND_POWER_OF_TWO(col_out[r], 6));
    }
  }
}
1236
// 32x32 inverse transform for blocks whose non-zero coefficients are confined
// to the upper-left 8x8 corner. Only the first 8 rows are transformed; the
// remaining rows of the intermediate buffer keep their zero initialisation.
// The 6-bit-rounded result is added to |dest| through clip_pixel_add().
void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest,
                            int stride) {
  tran_low_t row_pass[32 * 32] = { 0 };
  tran_low_t col_in[32], col_out[32];
  int r, c;

  // Row pass over the first 8 rows only.
  for (r = 0; r < 8; ++r) {
    idct32_c(input + 32 * r, row_pass + 32 * r);
  }

  // Column pass over all 32 columns.
  for (c = 0; c < 32; ++c) {
    for (r = 0; r < 32; ++r) col_in[r] = row_pass[r * 32 + c];
    idct32_c(col_in, col_out);
    for (r = 0; r < 32; ++r) {
      uint8_t *const px = &dest[r * stride + c];
      *px = clip_pixel_add(*px, ROUND_POWER_OF_TWO(col_out[r], 6));
    }
  }
}
1262
vpx_idct32x32_1_add_c(const tran_low_t * input,uint8_t * dest,int stride)1263 void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
1264 int i, j;
1265 tran_high_t a1;
1266 tran_low_t out =
1267 WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64));
1268
1269 out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
1270 a1 = ROUND_POWER_OF_TWO(out, 6);
1271
1272 for (j = 0; j < 32; ++j) {
1273 for (i = 0; i < 32; ++i) dest[i] = clip_pixel_add(dest[i], a1);
1274 dest += stride;
1275 }
1276 }
1277
1278 #if CONFIG_VP9_HIGHBITDEPTH
1279
// Exclusive bound on the magnitude of a high-bitdepth transform coefficient:
// detect_invalid_highbd_input() reports any value whose magnitude is at or
// above this as invalid. The 25 bits are budgeted as:
// 12 signal input bits + 7 2D forward transform amplify bits + 5 1D inverse
// transform amplify bits + 1 bit for contingency in rounding and quantizing
#define HIGHBD_VALID_TXFM_MAGNITUDE_RANGE (1 << 25)
1283
detect_invalid_highbd_input(const tran_low_t * input,int size)1284 static INLINE int detect_invalid_highbd_input(const tran_low_t *input,
1285 int size) {
1286 int i;
1287 for (i = 0; i < size; ++i)
1288 if (abs(input[i]) >= HIGHBD_VALID_TXFM_MAGNITUDE_RANGE) return 1;
1289 return 0;
1290 }
1291
vpx_highbd_iwht4x4_16_add_c(const tran_low_t * input,uint16_t * dest,int stride,int bd)1292 void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint16_t *dest,
1293 int stride, int bd) {
1294 /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
1295 0.5 shifts per pixel. */
1296 int i;
1297 tran_low_t output[16];
1298 tran_high_t a1, b1, c1, d1, e1;
1299 const tran_low_t *ip = input;
1300 tran_low_t *op = output;
1301
1302 for (i = 0; i < 4; i++) {
1303 a1 = ip[0] >> UNIT_QUANT_SHIFT;
1304 c1 = ip[1] >> UNIT_QUANT_SHIFT;
1305 d1 = ip[2] >> UNIT_QUANT_SHIFT;
1306 b1 = ip[3] >> UNIT_QUANT_SHIFT;
1307 a1 += c1;
1308 d1 -= b1;
1309 e1 = (a1 - d1) >> 1;
1310 b1 = e1 - b1;
1311 c1 = e1 - c1;
1312 a1 -= b1;
1313 d1 += c1;
1314 op[0] = HIGHBD_WRAPLOW(a1, bd);
1315 op[1] = HIGHBD_WRAPLOW(b1, bd);
1316 op[2] = HIGHBD_WRAPLOW(c1, bd);
1317 op[3] = HIGHBD_WRAPLOW(d1, bd);
1318 ip += 4;
1319 op += 4;
1320 }
1321
1322 ip = output;
1323 for (i = 0; i < 4; i++) {
1324 a1 = ip[4 * 0];
1325 c1 = ip[4 * 1];
1326 d1 = ip[4 * 2];
1327 b1 = ip[4 * 3];
1328 a1 += c1;
1329 d1 -= b1;
1330 e1 = (a1 - d1) >> 1;
1331 b1 = e1 - b1;
1332 c1 = e1 - c1;
1333 a1 -= b1;
1334 d1 += c1;
1335 dest[stride * 0] =
1336 highbd_clip_pixel_add(dest[stride * 0], HIGHBD_WRAPLOW(a1, bd), bd);
1337 dest[stride * 1] =
1338 highbd_clip_pixel_add(dest[stride * 1], HIGHBD_WRAPLOW(b1, bd), bd);
1339 dest[stride * 2] =
1340 highbd_clip_pixel_add(dest[stride * 2], HIGHBD_WRAPLOW(c1, bd), bd);
1341 dest[stride * 3] =
1342 highbd_clip_pixel_add(dest[stride * 3], HIGHBD_WRAPLOW(d1, bd), bd);
1343
1344 ip++;
1345 dest++;
1346 }
1347 }
1348
vpx_highbd_iwht4x4_1_add_c(const tran_low_t * in,uint16_t * dest,int stride,int bd)1349 void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint16_t *dest,
1350 int stride, int bd) {
1351 int i;
1352 tran_high_t a1, e1;
1353 tran_low_t tmp[4];
1354 const tran_low_t *ip = in;
1355 tran_low_t *op = tmp;
1356 (void)bd;
1357
1358 a1 = ip[0] >> UNIT_QUANT_SHIFT;
1359 e1 = a1 >> 1;
1360 a1 -= e1;
1361 op[0] = HIGHBD_WRAPLOW(a1, bd);
1362 op[1] = op[2] = op[3] = HIGHBD_WRAPLOW(e1, bd);
1363
1364 ip = tmp;
1365 for (i = 0; i < 4; i++) {
1366 e1 = ip[0] >> 1;
1367 a1 = ip[0] - e1;
1368 dest[stride * 0] = highbd_clip_pixel_add(dest[stride * 0], a1, bd);
1369 dest[stride * 1] = highbd_clip_pixel_add(dest[stride * 1], e1, bd);
1370 dest[stride * 2] = highbd_clip_pixel_add(dest[stride * 2], e1, bd);
1371 dest[stride * 3] = highbd_clip_pixel_add(dest[stride * 3], e1, bd);
1372 ip++;
1373 dest++;
1374 }
1375 }
1376
// High-bitdepth 4-point 1-D inverse ADST. Reads four coefficients from |input|
// and writes four results to |output|. |output| is zero-filled when the input
// fails detect_invalid_highbd_input() or when all four coefficients are zero.
void vpx_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd) {
  const tran_low_t in0 = input[0];
  const tran_low_t in1 = input[1];
  const tran_low_t in2 = input[2];
  const tran_low_t in3 = input[3];
  tran_high_t sum_a, sum_b, mid, cross, diff;
  (void)bd;

  if (detect_invalid_highbd_input(input, 4)) {
#if CONFIG_COEFFICIENT_RANGE_CHECKING
    assert(0 && "invalid highbd txfm input");
#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
    memset(output, 0, sizeof(*output) * 4);
    return;
  }

  // Nothing to transform: emit zeros without doing the multiplies.
  if ((in0 | in1 | in2 | in3) == 0) {
    memset(output, 0, 4 * sizeof(*output));
    return;
  }

  // Every product is formed in tran_high_t.
  sum_a = (tran_high_t)sinpi_1_9 * in0 + (tran_high_t)sinpi_4_9 * in2 +
          (tran_high_t)sinpi_2_9 * in3;
  sum_b = (tran_high_t)sinpi_2_9 * in0 - (tran_high_t)sinpi_1_9 * in2 -
          (tran_high_t)sinpi_4_9 * in3;
  mid = (tran_high_t)sinpi_3_9 * in1;
  diff = (tran_high_t)HIGHBD_WRAPLOW(in0 - in2 + in3, bd);
  cross = sinpi_3_9 * diff;

  // Note carried over from the original code:
  // 1-D transform scaling factor is sqrt(2).
  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
  // + 1b (addition) = 29b.
  // Hence the output bit depth is 15b.
  output[0] = HIGHBD_WRAPLOW(dct_const_round_shift(sum_a + mid), bd);
  output[1] = HIGHBD_WRAPLOW(dct_const_round_shift(sum_b + mid), bd);
  output[2] = HIGHBD_WRAPLOW(dct_const_round_shift(cross), bd);
  output[3] = HIGHBD_WRAPLOW(dct_const_round_shift(sum_a + sum_b - mid), bd);
}
1421
// High-bitdepth 4-point 1-D inverse DCT. Reads four coefficients from |input|
// and writes four results to |output|; |output| is zero-filled when the input
// fails detect_invalid_highbd_input().
// All four inputs are consumed before the first output is written, so the
// function may be called with |input| and |output| pointing at the same
// buffer (vpx_highbd_idct8_c does this).
void vpx_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_low_t even0, even1, odd0, odd1;
  tran_high_t acc;
  (void)bd;

  if (detect_invalid_highbd_input(input, 4)) {
#if CONFIG_COEFFICIENT_RANGE_CHECKING
    assert(0 && "invalid highbd txfm input");
#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
    memset(output, 0, sizeof(*output) * 4);
    return;
  }

  // stage 1: even half from input[0]/input[2], odd half from input[1]/input[3]
  acc = (input[0] + input[2]) * (tran_high_t)cospi_16_64;
  even0 = HIGHBD_WRAPLOW(dct_const_round_shift(acc), bd);
  acc = (input[0] - input[2]) * (tran_high_t)cospi_16_64;
  even1 = HIGHBD_WRAPLOW(dct_const_round_shift(acc), bd);

  acc =
      input[1] * (tran_high_t)cospi_24_64 - input[3] * (tran_high_t)cospi_8_64;
  odd0 = HIGHBD_WRAPLOW(dct_const_round_shift(acc), bd);
  acc =
      input[1] * (tran_high_t)cospi_8_64 + input[3] * (tran_high_t)cospi_24_64;
  odd1 = HIGHBD_WRAPLOW(dct_const_round_shift(acc), bd);

  // stage 2: final butterfly
  output[0] = HIGHBD_WRAPLOW(even0 + odd1, bd);
  output[1] = HIGHBD_WRAPLOW(even1 + odd0, bd);
  output[2] = HIGHBD_WRAPLOW(even1 - odd0, bd);
  output[3] = HIGHBD_WRAPLOW(even0 - odd1, bd);
}
1453
// High-bitdepth 4x4 inverse DCT using all 16 coefficients. Applies
// vpx_highbd_idct4_c() to each row, then to each column, and adds the
// 4-bit-rounded result to |dest| through highbd_clip_pixel_add().
void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint16_t *dest,
                                 int stride, int bd) {
  tran_low_t row_pass[4 * 4];
  tran_low_t col_in[4], col_out[4];
  int r, c;

  // Row pass
  for (r = 0; r < 4; ++r) {
    vpx_highbd_idct4_c(input + 4 * r, row_pass + 4 * r, bd);
  }

  // Column pass
  for (c = 0; c < 4; ++c) {
    for (r = 0; r < 4; ++r) col_in[r] = row_pass[r * 4 + c];
    vpx_highbd_idct4_c(col_in, col_out, bd);
    for (r = 0; r < 4; ++r) {
      uint16_t *const px = &dest[r * stride + c];
      *px = highbd_clip_pixel_add(*px, ROUND_POWER_OF_TWO(col_out[r], 4), bd);
    }
  }
}
1478
vpx_highbd_idct4x4_1_add_c(const tran_low_t * input,uint16_t * dest,int stride,int bd)1479 void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint16_t *dest,
1480 int stride, int bd) {
1481 int i;
1482 tran_high_t a1;
1483 tran_low_t out = HIGHBD_WRAPLOW(
1484 dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd);
1485
1486 out =
1487 HIGHBD_WRAPLOW(dct_const_round_shift(out * (tran_high_t)cospi_16_64), bd);
1488 a1 = ROUND_POWER_OF_TWO(out, 4);
1489
1490 for (i = 0; i < 4; i++) {
1491 dest[0] = highbd_clip_pixel_add(dest[0], a1, bd);
1492 dest[1] = highbd_clip_pixel_add(dest[1], a1, bd);
1493 dest[2] = highbd_clip_pixel_add(dest[2], a1, bd);
1494 dest[3] = highbd_clip_pixel_add(dest[3], a1, bd);
1495 dest += stride;
1496 }
1497 }
1498
// High-bitdepth 8-point 1-D inverse ADST. Reads eight coefficients from
// |input| and writes eight results to |output|. |output| is zero-filled when
// the input fails detect_invalid_highbd_input() or when all eight
// coefficients are zero.
void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
  // The inputs are loaded in a permuted order (7, 0, 5, 2, 3, 4, 1, 6) so
  // that each stage-1 rotation below works on an adjacent (x[2k], x[2k+1])
  // pair.
  tran_low_t x0 = input[7];
  tran_low_t x1 = input[0];
  tran_low_t x2 = input[5];
  tran_low_t x3 = input[2];
  tran_low_t x4 = input[3];
  tran_low_t x5 = input[4];
  tran_low_t x6 = input[1];
  tran_low_t x7 = input[6];
  (void)bd;

  // Out-of-range coefficients: assert when range checking is enabled,
  // otherwise fall through to an all-zero output.
  if (detect_invalid_highbd_input(input, 8)) {
#if CONFIG_COEFFICIENT_RANGE_CHECKING
    assert(0 && "invalid highbd txfm input");
#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
    memset(output, 0, sizeof(*output) * 8);
    return;
  }

  // All-zero input: skip the arithmetic and emit zeros.
  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
    memset(output, 0, 8 * sizeof(*output));
    return;
  }

  // stage 1
  // Four pairwise rotations; every product is widened to tran_high_t first.
  s0 = (tran_high_t)cospi_2_64 * x0 + (tran_high_t)cospi_30_64 * x1;
  s1 = (tran_high_t)cospi_30_64 * x0 - (tran_high_t)cospi_2_64 * x1;
  s2 = (tran_high_t)cospi_10_64 * x2 + (tran_high_t)cospi_22_64 * x3;
  s3 = (tran_high_t)cospi_22_64 * x2 - (tran_high_t)cospi_10_64 * x3;
  s4 = (tran_high_t)cospi_18_64 * x4 + (tran_high_t)cospi_14_64 * x5;
  s5 = (tran_high_t)cospi_14_64 * x4 - (tran_high_t)cospi_18_64 * x5;
  s6 = (tran_high_t)cospi_26_64 * x6 + (tran_high_t)cospi_6_64 * x7;
  s7 = (tran_high_t)cospi_6_64 * x6 - (tran_high_t)cospi_26_64 * x7;

  // Butterfly sk with s(k+4), then round and wrap back into tran_low_t.
  x0 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s4), bd);
  x1 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s5), bd);
  x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 + s6), bd);
  x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 + s7), bd);
  x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 - s4), bd);
  x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 - s5), bd);
  x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 - s6), bd);
  x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 - s7), bd);

  // stage 2
  // x0..x3 pass through unscaled; x4..x7 are rotated by cospi_8_64 /
  // cospi_24_64.
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = (tran_high_t)cospi_8_64 * x4 + (tran_high_t)cospi_24_64 * x5;
  s5 = (tran_high_t)cospi_24_64 * x4 - (tran_high_t)cospi_8_64 * x5;
  s6 = (tran_high_t)(-cospi_24_64) * x6 + (tran_high_t)cospi_8_64 * x7;
  s7 = (tran_high_t)cospi_8_64 * x6 + (tran_high_t)cospi_24_64 * x7;

  // Only the rotated half (s4..s7) carries the multiplier scale, so only it
  // goes through dct_const_round_shift().
  x0 = HIGHBD_WRAPLOW(s0 + s2, bd);
  x1 = HIGHBD_WRAPLOW(s1 + s3, bd);
  x2 = HIGHBD_WRAPLOW(s0 - s2, bd);
  x3 = HIGHBD_WRAPLOW(s1 - s3, bd);
  x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s6), bd);
  x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s7), bd);
  x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s6), bd);
  x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s7), bd);

  // stage 3
  // Sum/difference of (x2, x3) and (x6, x7), each scaled by cospi_16_64;
  // x0, x1, x4 and x5 are left as they are.
  s2 = (tran_high_t)cospi_16_64 * (x2 + x3);
  s3 = (tran_high_t)cospi_16_64 * (x2 - x3);
  s6 = (tran_high_t)cospi_16_64 * (x6 + x7);
  s7 = (tran_high_t)cospi_16_64 * (x6 - x7);

  x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd);
  x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3), bd);
  x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6), bd);
  x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7), bd);

  // Output permutation; every odd-indexed output is negated.
  output[0] = HIGHBD_WRAPLOW(x0, bd);
  output[1] = HIGHBD_WRAPLOW(-x4, bd);
  output[2] = HIGHBD_WRAPLOW(x6, bd);
  output[3] = HIGHBD_WRAPLOW(-x2, bd);
  output[4] = HIGHBD_WRAPLOW(x3, bd);
  output[5] = HIGHBD_WRAPLOW(-x7, bd);
  output[6] = HIGHBD_WRAPLOW(x5, bd);
  output[7] = HIGHBD_WRAPLOW(-x1, bd);
}
1582
// High-bitdepth 8-point 1-D inverse DCT. Reads eight coefficients from
// |input| and writes eight results to |output|; |output| is zero-filled when
// the input fails detect_invalid_highbd_input(). The even-indexed
// coefficients are handled by vpx_highbd_idct4_c(), the odd-indexed ones by
// the butterflies below.
void vpx_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_low_t step1[8], step2[8];
  tran_high_t temp1, temp2;

  // Out-of-range coefficients: assert when range checking is enabled,
  // otherwise fall through to an all-zero output.
  if (detect_invalid_highbd_input(input, 8)) {
#if CONFIG_COEFFICIENT_RANGE_CHECKING
    assert(0 && "invalid highbd txfm input");
#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
    memset(output, 0, sizeof(*output) * 8);
    return;
  }

  // stage 1
  // Even-indexed inputs are placed in step1[0..3] in the order (0, 2, 4, 6)
  // -> slots (0, 1, 2, 3), i.e. as a 4-point coefficient vector.
  step1[0] = input[0];
  step1[2] = input[4];
  step1[1] = input[2];
  step1[3] = input[6];
  // Odd-indexed inputs: rotate (1, 7) and (5, 3) into step1[4..7].
  temp1 =
      input[1] * (tran_high_t)cospi_28_64 - input[7] * (tran_high_t)cospi_4_64;
  temp2 =
      input[1] * (tran_high_t)cospi_4_64 + input[7] * (tran_high_t)cospi_28_64;
  step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  temp1 =
      input[5] * (tran_high_t)cospi_12_64 - input[3] * (tran_high_t)cospi_20_64;
  temp2 =
      input[5] * (tran_high_t)cospi_20_64 + input[3] * (tran_high_t)cospi_12_64;
  step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);

  // stage 2 & stage 3 - even half
  // In-place call: input and output are both step1. This relies on
  // vpx_highbd_idct4_c() reading all four inputs before it writes any output.
  // Only step1[0..3] are touched; step1[4..7] are preserved for the odd half.
  vpx_highbd_idct4_c(step1, step1, bd);

  // stage 2 - odd half
  step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
  step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
  step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
  step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);

  // stage 3 - odd half
  // Elements 4 and 7 pass through; 5 and 6 become the cospi_16_64-scaled
  // difference and sum of step2[5] and step2[6].
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * (tran_high_t)cospi_16_64;
  temp2 = (step2[5] + step2[6]) * (tran_high_t)cospi_16_64;
  step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  step1[7] = step2[7];

  // stage 4
  // Final butterfly: output[k] = even[k] + odd[7 - k] for k < 4, and the
  // mirrored differences for k >= 4.
  output[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
  output[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
  output[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
  output[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
  output[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
  output[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
  output[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
  output[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
}
1640
// High-bitdepth 8x8 inverse DCT using all 64 coefficients. Applies
// vpx_highbd_idct8_c() to each row, then to each column, and adds the
// 5-bit-rounded result to |dest| through highbd_clip_pixel_add().
void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint16_t *dest,
                                 int stride, int bd) {
  tran_low_t row_pass[8 * 8];
  tran_low_t col_in[8], col_out[8];
  int r, c;

  // Row pass
  for (r = 0; r < 8; ++r) {
    vpx_highbd_idct8_c(input + 8 * r, row_pass + 8 * r, bd);
  }

  // Column pass
  for (c = 0; c < 8; ++c) {
    for (r = 0; r < 8; ++r) col_in[r] = row_pass[r * 8 + c];
    vpx_highbd_idct8_c(col_in, col_out, bd);
    for (r = 0; r < 8; ++r) {
      uint16_t *const px = &dest[r * stride + c];
      *px = highbd_clip_pixel_add(*px, ROUND_POWER_OF_TWO(col_out[r], 5), bd);
    }
  }
}
1665
// High-bitdepth 8x8 inverse DCT for blocks whose non-zero coefficients all
// lie in the first four rows. Only those rows are transformed; the remaining
// rows of the intermediate buffer keep their zero initialisation. The
// 5-bit-rounded result is added to |dest| through highbd_clip_pixel_add().
void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint16_t *dest,
                                 int stride, int bd) {
  tran_low_t row_pass[8 * 8] = { 0 };
  tran_low_t col_in[8], col_out[8];
  int r, c;

  // Row pass over the first 4 rows only.
  for (r = 0; r < 4; ++r) {
    vpx_highbd_idct8_c(input + 8 * r, row_pass + 8 * r, bd);
  }

  // Column pass over all 8 columns.
  for (c = 0; c < 8; ++c) {
    for (r = 0; r < 8; ++r) col_in[r] = row_pass[r * 8 + c];
    vpx_highbd_idct8_c(col_in, col_out, bd);
    for (r = 0; r < 8; ++r) {
      uint16_t *const px = &dest[r * stride + c];
      *px = highbd_clip_pixel_add(*px, ROUND_POWER_OF_TWO(col_out[r], 5), bd);
    }
  }
}
1691
vpx_highbd_idct8x8_1_add_c(const tran_low_t * input,uint16_t * dest,int stride,int bd)1692 void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint16_t *dest,
1693 int stride, int bd) {
1694 int i, j;
1695 tran_high_t a1;
1696 tran_low_t out = HIGHBD_WRAPLOW(
1697 dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd);
1698
1699 out =
1700 HIGHBD_WRAPLOW(dct_const_round_shift(out * (tran_high_t)cospi_16_64), bd);
1701 a1 = ROUND_POWER_OF_TWO(out, 5);
1702 for (j = 0; j < 8; ++j) {
1703 for (i = 0; i < 8; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
1704 dest += stride;
1705 }
1706 }
1707
// One-dimensional 16-point inverse ADST ("iadst16") for high-bitdepth data.
//
// input:  16 coefficients (read-only).
// output: 16 results; written on every path, including the early returns.
// bd:     bit depth, passed to HIGHBD_WRAPLOW.
//
// If detect_invalid_highbd_input() flags the input, or if all 16 inputs are
// zero, `output` is zero-filled and the function returns early. Otherwise the
// values go through four butterfly stages. Products are formed in
// tran_high_t; results that involved a multiplication are narrowed with
// dct_const_round_shift() before HIGHBD_WRAPLOW(), plain sums/differences
// only go through HIGHBD_WRAPLOW().
void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
  tran_high_t s9, s10, s11, s12, s13, s14, s15;
  // Input permutation: x[2k] = input[15 - 2k], x[2k + 1] = input[2k].
  tran_low_t x0 = input[15];
  tran_low_t x1 = input[0];
  tran_low_t x2 = input[13];
  tran_low_t x3 = input[2];
  tran_low_t x4 = input[11];
  tran_low_t x5 = input[4];
  tran_low_t x6 = input[9];
  tran_low_t x7 = input[6];
  tran_low_t x8 = input[7];
  tran_low_t x9 = input[8];
  tran_low_t x10 = input[5];
  tran_low_t x11 = input[10];
  tran_low_t x12 = input[3];
  tran_low_t x13 = input[12];
  tran_low_t x14 = input[1];
  tran_low_t x15 = input[14];
  // NOTE(review): presumably silences an unused-parameter warning in builds
  // where HIGHBD_WRAPLOW ignores its bd argument - confirm against the macro.
  (void)bd;

  // Reject input flagged as invalid: assert in range-checking builds,
  // otherwise emit an all-zero result.
  if (detect_invalid_highbd_input(input, 16)) {
#if CONFIG_COEFFICIENT_RANGE_CHECKING
    assert(0 && "invalid highbd txfm input");
#endif // CONFIG_COEFFICIENT_RANGE_CHECKING
    memset(output, 0, sizeof(*output) * 16);
    return;
  }

  // Shortcut: an all-zero input produces an all-zero output.
  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 |
        x13 | x14 | x15)) {
    memset(output, 0, 16 * sizeof(*output));
    return;
  }

  // stage 1
  // Rotate each adjacent pair (x[2k], x[2k + 1]) by a pair of odd-indexed
  // cospi constants.
  s0 = x0 * (tran_high_t)cospi_1_64 + x1 * (tran_high_t)cospi_31_64;
  s1 = x0 * (tran_high_t)cospi_31_64 - x1 * (tran_high_t)cospi_1_64;
  s2 = x2 * (tran_high_t)cospi_5_64 + x3 * (tran_high_t)cospi_27_64;
  s3 = x2 * (tran_high_t)cospi_27_64 - x3 * (tran_high_t)cospi_5_64;
  s4 = x4 * (tran_high_t)cospi_9_64 + x5 * (tran_high_t)cospi_23_64;
  s5 = x4 * (tran_high_t)cospi_23_64 - x5 * (tran_high_t)cospi_9_64;
  s6 = x6 * (tran_high_t)cospi_13_64 + x7 * (tran_high_t)cospi_19_64;
  s7 = x6 * (tran_high_t)cospi_19_64 - x7 * (tran_high_t)cospi_13_64;
  s8 = x8 * (tran_high_t)cospi_17_64 + x9 * (tran_high_t)cospi_15_64;
  s9 = x8 * (tran_high_t)cospi_15_64 - x9 * (tran_high_t)cospi_17_64;
  s10 = x10 * (tran_high_t)cospi_21_64 + x11 * (tran_high_t)cospi_11_64;
  s11 = x10 * (tran_high_t)cospi_11_64 - x11 * (tran_high_t)cospi_21_64;
  s12 = x12 * (tran_high_t)cospi_25_64 + x13 * (tran_high_t)cospi_7_64;
  s13 = x12 * (tran_high_t)cospi_7_64 - x13 * (tran_high_t)cospi_25_64;
  s14 = x14 * (tran_high_t)cospi_29_64 + x15 * (tran_high_t)cospi_3_64;
  s15 = x14 * (tran_high_t)cospi_3_64 - x15 * (tran_high_t)cospi_29_64;

  // Butterfly between s[k] and s[k + 8]: sums land in x0..x7, differences in
  // x8..x15, each round-shifted back to coefficient precision.
  x0 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s8), bd);
  x1 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s9), bd);
  x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 + s10), bd);
  x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 + s11), bd);
  x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s12), bd);
  x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s13), bd);
  x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6 + s14), bd);
  x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7 + s15), bd);
  x8 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 - s8), bd);
  x9 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 - s9), bd);
  x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 - s10), bd);
  x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 - s11), bd);
  x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s12), bd);
  x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s13), bd);
  x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s6 - s14), bd);
  x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s7 - s15), bd);

  // stage 2
  // The lower half (0..7) passes through unscaled; the upper half (8..15) is
  // rotated in adjacent pairs.
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4;
  s5 = x5;
  s6 = x6;
  s7 = x7;
  s8 = x8 * (tran_high_t)cospi_4_64 + x9 * (tran_high_t)cospi_28_64;
  s9 = x8 * (tran_high_t)cospi_28_64 - x9 * (tran_high_t)cospi_4_64;
  s10 = x10 * (tran_high_t)cospi_20_64 + x11 * (tran_high_t)cospi_12_64;
  s11 = x10 * (tran_high_t)cospi_12_64 - x11 * (tran_high_t)cospi_20_64;
  s12 = -x12 * (tran_high_t)cospi_28_64 + x13 * (tran_high_t)cospi_4_64;
  s13 = x12 * (tran_high_t)cospi_4_64 + x13 * (tran_high_t)cospi_28_64;
  s14 = -x14 * (tran_high_t)cospi_12_64 + x15 * (tran_high_t)cospi_20_64;
  s15 = x14 * (tran_high_t)cospi_20_64 + x15 * (tran_high_t)cospi_12_64;

  // Butterfly between s[k] and s[k + 4] within each half. Only the upper half
  // was multiplied, so only it needs dct_const_round_shift().
  x0 = HIGHBD_WRAPLOW(s0 + s4, bd);
  x1 = HIGHBD_WRAPLOW(s1 + s5, bd);
  x2 = HIGHBD_WRAPLOW(s2 + s6, bd);
  x3 = HIGHBD_WRAPLOW(s3 + s7, bd);
  x4 = HIGHBD_WRAPLOW(s0 - s4, bd);
  x5 = HIGHBD_WRAPLOW(s1 - s5, bd);
  x6 = HIGHBD_WRAPLOW(s2 - s6, bd);
  x7 = HIGHBD_WRAPLOW(s3 - s7, bd);
  x8 = HIGHBD_WRAPLOW(dct_const_round_shift(s8 + s12), bd);
  x9 = HIGHBD_WRAPLOW(dct_const_round_shift(s9 + s13), bd);
  x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s10 + s14), bd);
  x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s11 + s15), bd);
  x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s8 - s12), bd);
  x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s9 - s13), bd);
  x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s10 - s14), bd);
  x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s11 - s15), bd);

  // stage 3
  // In each group of eight, the first four values pass through and the last
  // four are rotated by cospi_8_64 / cospi_24_64.
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4 * (tran_high_t)cospi_8_64 + x5 * (tran_high_t)cospi_24_64;
  s5 = x4 * (tran_high_t)cospi_24_64 - x5 * (tran_high_t)cospi_8_64;
  s6 = -x6 * (tran_high_t)cospi_24_64 + x7 * (tran_high_t)cospi_8_64;
  s7 = x6 * (tran_high_t)cospi_8_64 + x7 * (tran_high_t)cospi_24_64;
  s8 = x8;
  s9 = x9;
  s10 = x10;
  s11 = x11;
  s12 = x12 * (tran_high_t)cospi_8_64 + x13 * (tran_high_t)cospi_24_64;
  s13 = x12 * (tran_high_t)cospi_24_64 - x13 * (tran_high_t)cospi_8_64;
  s14 = -x14 * (tran_high_t)cospi_24_64 + x15 * (tran_high_t)cospi_8_64;
  s15 = x14 * (tran_high_t)cospi_8_64 + x15 * (tran_high_t)cospi_24_64;

  // Butterfly between s[k] and s[k + 2] within each group of four; the
  // rotated groups are round-shifted, the pass-through groups are not.
  x0 = HIGHBD_WRAPLOW(s0 + s2, bd);
  x1 = HIGHBD_WRAPLOW(s1 + s3, bd);
  x2 = HIGHBD_WRAPLOW(s0 - s2, bd);
  x3 = HIGHBD_WRAPLOW(s1 - s3, bd);
  x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s6), bd);
  x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s7), bd);
  x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s6), bd);
  x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s7), bd);
  x8 = HIGHBD_WRAPLOW(s8 + s10, bd);
  x9 = HIGHBD_WRAPLOW(s9 + s11, bd);
  x10 = HIGHBD_WRAPLOW(s8 - s10, bd);
  x11 = HIGHBD_WRAPLOW(s9 - s11, bd);
  x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s12 + s14), bd);
  x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s13 + s15), bd);
  x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s12 - s14), bd);
  x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s13 - s15), bd);

  // stage 4
  // Only the pairs (2,3), (6,7), (10,11) and (14,15) are touched: each pair's
  // sum and difference is scaled by +/-cospi_16_64. Note the sign pattern
  // differs between the (2,3)/(14,15) pairs and the (6,7)/(10,11) pairs.
  s2 = (tran_high_t)(-cospi_16_64) * (x2 + x3);
  s3 = (tran_high_t)cospi_16_64 * (x2 - x3);
  s6 = (tran_high_t)cospi_16_64 * (x6 + x7);
  s7 = (tran_high_t)cospi_16_64 * (-x6 + x7);
  s10 = (tran_high_t)cospi_16_64 * (x10 + x11);
  s11 = (tran_high_t)cospi_16_64 * (-x10 + x11);
  s14 = (tran_high_t)(-cospi_16_64) * (x14 + x15);
  s15 = (tran_high_t)cospi_16_64 * (x14 - x15);

  x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd);
  x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3), bd);
  x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6), bd);
  x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7), bd);
  x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s10), bd);
  x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s11), bd);
  x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s14), bd);
  x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s15), bd);

  // Output permutation; x8, x4, x13 and x1 are negated on the way out.
  output[0] = HIGHBD_WRAPLOW(x0, bd);
  output[1] = HIGHBD_WRAPLOW(-x8, bd);
  output[2] = HIGHBD_WRAPLOW(x12, bd);
  output[3] = HIGHBD_WRAPLOW(-x4, bd);
  output[4] = HIGHBD_WRAPLOW(x6, bd);
  output[5] = HIGHBD_WRAPLOW(x14, bd);
  output[6] = HIGHBD_WRAPLOW(x10, bd);
  output[7] = HIGHBD_WRAPLOW(x2, bd);
  output[8] = HIGHBD_WRAPLOW(x3, bd);
  output[9] = HIGHBD_WRAPLOW(x11, bd);
  output[10] = HIGHBD_WRAPLOW(x15, bd);
  output[11] = HIGHBD_WRAPLOW(x7, bd);
  output[12] = HIGHBD_WRAPLOW(x5, bd);
  output[13] = HIGHBD_WRAPLOW(-x13, bd);
  output[14] = HIGHBD_WRAPLOW(x9, bd);
  output[15] = HIGHBD_WRAPLOW(-x1, bd);
}
1884
// One-dimensional 16-point inverse DCT ("idct16") for high-bitdepth data.
//
// input:  16 coefficients (read-only).
// output: 16 results; zero-filled if detect_invalid_highbd_input() flags the
//         input.
// bd:     bit depth, passed to HIGHBD_WRAPLOW.
//
// The transform runs in seven stages that alternate between the step1[] and
// step2[] scratch arrays: each stage reads one array and writes the other.
// Values produced by a multiplication with a cospi constant are computed in
// tran_high_t and narrowed with dct_const_round_shift() + HIGHBD_WRAPLOW();
// plain sums/differences only go through HIGHBD_WRAPLOW().
void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_low_t step1[16], step2[16];
  tran_high_t temp1, temp2;
  // NOTE(review): presumably silences an unused-parameter warning in builds
  // where HIGHBD_WRAPLOW ignores its bd argument - confirm against the macro.
  (void)bd;

  // Reject input flagged as invalid: assert in range-checking builds,
  // otherwise emit an all-zero result.
  if (detect_invalid_highbd_input(input, 16)) {
#if CONFIG_COEFFICIENT_RANGE_CHECKING
    assert(0 && "invalid highbd txfm input");
#endif // CONFIG_COEFFICIENT_RANGE_CHECKING
    memset(output, 0, sizeof(*output) * 16);
    return;
  }

  // stage 1
  // Reorder the input. The index expressions evaluate to
  // 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15, i.e. the 4-bit
  // bit-reversal permutation. They are written as even numbers divided by
  // two; the numerators match the input indices used by stage 1 of the
  // 32-point transform later in this file.
  step1[0] = input[0 / 2];
  step1[1] = input[16 / 2];
  step1[2] = input[8 / 2];
  step1[3] = input[24 / 2];
  step1[4] = input[4 / 2];
  step1[5] = input[20 / 2];
  step1[6] = input[12 / 2];
  step1[7] = input[28 / 2];
  step1[8] = input[2 / 2];
  step1[9] = input[18 / 2];
  step1[10] = input[10 / 2];
  step1[11] = input[26 / 2];
  step1[12] = input[6 / 2];
  step1[13] = input[22 / 2];
  step1[14] = input[14 / 2];
  step1[15] = input[30 / 2];

  // stage 2
  // Lanes 0..7 are copied unchanged.
  step2[0] = step1[0];
  step2[1] = step1[1];
  step2[2] = step1[2];
  step2[3] = step1[3];
  step2[4] = step1[4];
  step2[5] = step1[5];
  step2[6] = step1[6];
  step2[7] = step1[7];

  // Lanes 8..15 are rotated in mirrored pairs (8,15), (9,14), (10,13),
  // (11,12), each with its own pair of cospi constants.
  temp1 =
      step1[8] * (tran_high_t)cospi_30_64 - step1[15] * (tran_high_t)cospi_2_64;
  temp2 =
      step1[8] * (tran_high_t)cospi_2_64 + step1[15] * (tran_high_t)cospi_30_64;
  step2[8] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[15] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);

  temp1 = step1[9] * (tran_high_t)cospi_14_64 -
          step1[14] * (tran_high_t)cospi_18_64;
  temp2 = step1[9] * (tran_high_t)cospi_18_64 +
          step1[14] * (tran_high_t)cospi_14_64;
  step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);

  temp1 = step1[10] * (tran_high_t)cospi_22_64 -
          step1[13] * (tran_high_t)cospi_10_64;
  temp2 = step1[10] * (tran_high_t)cospi_10_64 +
          step1[13] * (tran_high_t)cospi_22_64;
  step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);

  temp1 = step1[11] * (tran_high_t)cospi_6_64 -
          step1[12] * (tran_high_t)cospi_26_64;
  temp2 = step1[11] * (tran_high_t)cospi_26_64 +
          step1[12] * (tran_high_t)cospi_6_64;
  step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);

  // stage 3
  // Lanes 0..3 are copied unchanged.
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[2];
  step1[3] = step2[3];

  // Lanes 4..7: rotate the mirrored pairs (4,7) and (5,6).
  temp1 =
      step2[4] * (tran_high_t)cospi_28_64 - step2[7] * (tran_high_t)cospi_4_64;
  temp2 =
      step2[4] * (tran_high_t)cospi_4_64 + step2[7] * (tran_high_t)cospi_28_64;
  step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  temp1 =
      step2[5] * (tran_high_t)cospi_12_64 - step2[6] * (tran_high_t)cospi_20_64;
  temp2 =
      step2[5] * (tran_high_t)cospi_20_64 + step2[6] * (tran_high_t)cospi_12_64;
  step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);

  // Lanes 8..15: add/subtract butterflies on adjacent pairs. Note that the
  // position of the difference alternates between pairs.
  step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd);
  step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd);
  step1[10] = HIGHBD_WRAPLOW(-step2[10] + step2[11], bd);
  step1[11] = HIGHBD_WRAPLOW(step2[10] + step2[11], bd);
  step1[12] = HIGHBD_WRAPLOW(step2[12] + step2[13], bd);
  step1[13] = HIGHBD_WRAPLOW(step2[12] - step2[13], bd);
  step1[14] = HIGHBD_WRAPLOW(-step2[14] + step2[15], bd);
  step1[15] = HIGHBD_WRAPLOW(step2[14] + step2[15], bd);

  // stage 4
  // Lanes 0,1: sum and difference scaled by cospi_16_64.
  temp1 = (step1[0] + step1[1]) * (tran_high_t)cospi_16_64;
  temp2 = (step1[0] - step1[1]) * (tran_high_t)cospi_16_64;
  step2[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  // Lanes 2,3: rotation by cospi_24_64 / cospi_8_64.
  temp1 =
      step1[2] * (tran_high_t)cospi_24_64 - step1[3] * (tran_high_t)cospi_8_64;
  temp2 =
      step1[2] * (tran_high_t)cospi_8_64 + step1[3] * (tran_high_t)cospi_24_64;
  step2[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  // Lanes 4..7: add/subtract butterflies on adjacent pairs.
  step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
  step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
  step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
  step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);

  // Lanes 8..15: 8, 11, 12 and 15 are copied; the pairs (9,14) and (10,13)
  // are rotated by cospi_8_64 / cospi_24_64.
  step2[8] = step1[8];
  step2[15] = step1[15];
  temp1 = -step1[9] * (tran_high_t)cospi_8_64 +
          step1[14] * (tran_high_t)cospi_24_64;
  temp2 =
      step1[9] * (tran_high_t)cospi_24_64 + step1[14] * (tran_high_t)cospi_8_64;
  step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  temp1 = -step1[10] * (tran_high_t)cospi_24_64 -
          step1[13] * (tran_high_t)cospi_8_64;
  temp2 = -step1[10] * (tran_high_t)cospi_8_64 +
          step1[13] * (tran_high_t)cospi_24_64;
  step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  step2[11] = step1[11];
  step2[12] = step1[12];

  // stage 5
  // Lanes 0..3: butterflies on the mirrored pairs (0,3) and (1,2).
  step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[3], bd);
  step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[2], bd);
  step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd);
  step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd);
  // Lanes 4..7: 4 and 7 are copied; 5 and 6 become the scaled difference and
  // sum of the old lanes 5 and 6.
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * (tran_high_t)cospi_16_64;
  temp2 = (step2[5] + step2[6]) * (tran_high_t)cospi_16_64;
  step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  step1[7] = step2[7];

  // Lanes 8..15: butterflies on mirrored pairs within each group of four.
  step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd);
  step1[9] = HIGHBD_WRAPLOW(step2[9] + step2[10], bd);
  step1[10] = HIGHBD_WRAPLOW(step2[9] - step2[10], bd);
  step1[11] = HIGHBD_WRAPLOW(step2[8] - step2[11], bd);
  step1[12] = HIGHBD_WRAPLOW(-step2[12] + step2[15], bd);
  step1[13] = HIGHBD_WRAPLOW(-step2[13] + step2[14], bd);
  step1[14] = HIGHBD_WRAPLOW(step2[13] + step2[14], bd);
  step1[15] = HIGHBD_WRAPLOW(step2[12] + step2[15], bd);

  // stage 6
  // Lanes 0..7: butterflies on the mirrored pairs (k, 7 - k).
  step2[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
  step2[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
  step2[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
  step2[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
  step2[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
  step2[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
  step2[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
  step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
  // Lanes 8..15: 8, 9, 14 and 15 are copied; the pairs (10,13) and (11,12)
  // become their difference and sum scaled by cospi_16_64.
  step2[8] = step1[8];
  step2[9] = step1[9];
  temp1 = (-step1[10] + step1[13]) * (tran_high_t)cospi_16_64;
  temp2 = (step1[10] + step1[13]) * (tran_high_t)cospi_16_64;
  step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  temp1 = (-step1[11] + step1[12]) * (tran_high_t)cospi_16_64;
  temp2 = (step1[11] + step1[12]) * (tran_high_t)cospi_16_64;
  step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  step2[14] = step1[14];
  step2[15] = step1[15];

  // stage 7
  // Final butterfly across the two halves: for k = 0..7,
  // output[k] = step2[k] + step2[15 - k] and
  // output[15 - k] = step2[k] - step2[15 - k].
  output[0] = HIGHBD_WRAPLOW(step2[0] + step2[15], bd);
  output[1] = HIGHBD_WRAPLOW(step2[1] + step2[14], bd);
  output[2] = HIGHBD_WRAPLOW(step2[2] + step2[13], bd);
  output[3] = HIGHBD_WRAPLOW(step2[3] + step2[12], bd);
  output[4] = HIGHBD_WRAPLOW(step2[4] + step2[11], bd);
  output[5] = HIGHBD_WRAPLOW(step2[5] + step2[10], bd);
  output[6] = HIGHBD_WRAPLOW(step2[6] + step2[9], bd);
  output[7] = HIGHBD_WRAPLOW(step2[7] + step2[8], bd);
  output[8] = HIGHBD_WRAPLOW(step2[7] - step2[8], bd);
  output[9] = HIGHBD_WRAPLOW(step2[6] - step2[9], bd);
  output[10] = HIGHBD_WRAPLOW(step2[5] - step2[10], bd);
  output[11] = HIGHBD_WRAPLOW(step2[4] - step2[11], bd);
  output[12] = HIGHBD_WRAPLOW(step2[3] - step2[12], bd);
  output[13] = HIGHBD_WRAPLOW(step2[2] - step2[13], bd);
  output[14] = HIGHBD_WRAPLOW(step2[1] - step2[14], bd);
  output[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd);
}
2076
// Full 16x16 high-bitdepth inverse DCT, added into `dest`.
//
// All 16 input rows are run through vpx_highbd_idct16_c, then every column of
// that intermediate result is transformed the same way. Each column output is
// passed through ROUND_POWER_OF_TWO(., 6) and combined with the existing
// pixel by highbd_clip_pixel_add.
//
// input:  16x16 coefficients, row-major with 16 values per row.
// dest:   destination pixels; consecutive rows are `stride` elements apart.
// bd:     bit depth, forwarded to vpx_highbd_idct16_c and
//         highbd_clip_pixel_add.
void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint16_t *dest,
                                    int stride, int bd) {
  // Every row is written by the row pass below, so no initializer is needed.
  tran_low_t row_pass[16 * 16];
  tran_low_t column[16], column_out[16];
  int row, col;

  // Row pass.
  for (row = 0; row < 16; ++row) {
    vpx_highbd_idct16_c(&input[16 * row], &row_pass[16 * row], bd);
  }

  // Column pass: gather one column, transform it, and accumulate into dest.
  for (col = 0; col < 16; ++col) {
    for (row = 0; row < 16; ++row) column[row] = row_pass[16 * row + col];
    vpx_highbd_idct16_c(column, column_out, bd);
    for (row = 0; row < 16; ++row) {
      uint16_t *const pixel = &dest[row * stride + col];
      *pixel = highbd_clip_pixel_add(
          *pixel, ROUND_POWER_OF_TWO(column_out[row], 6), bd);
    }
  }
}
2101
// 16x16 high-bitdepth inverse DCT that transforms only the first eight
// coefficient rows, then adds the result into `dest`.
//
// The caller is expected to guarantee that all non-zero coefficients lie in
// the upper-left 8x8 area, so rows 8..15 of the input are never read.
//
// input:  coefficients, row-major with 16 values per row; rows 0..7 are read.
// dest:   destination pixels; consecutive rows are `stride` elements apart.
// bd:     bit depth, forwarded to vpx_highbd_idct16_c and
//         highbd_clip_pixel_add.
void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint16_t *dest,
                                   int stride, int bd) {
  // Rows 8..15 are never written by the row pass and stay zero from the
  // initializer.
  tran_low_t row_pass[16 * 16] = { 0 };
  tran_low_t column[16], column_out[16];
  int row, col;

  // Row pass over the first eight rows only.
  for (row = 0; row < 8; ++row) {
    vpx_highbd_idct16_c(&input[16 * row], &row_pass[16 * row], bd);
  }

  // Column pass: gather one column, transform it, and accumulate into dest.
  for (col = 0; col < 16; ++col) {
    for (row = 0; row < 16; ++row) column[row] = row_pass[16 * row + col];
    vpx_highbd_idct16_c(column, column_out, bd);
    for (row = 0; row < 16; ++row) {
      const int pos = row * stride + col;
      dest[pos] = highbd_clip_pixel_add(
          dest[pos], ROUND_POWER_OF_TWO(column_out[row], 6), bd);
    }
  }
}
2129
// 16x16 high-bitdepth inverse DCT that transforms only the first four
// coefficient rows, then adds the result into `dest`.
//
// The caller is expected to guarantee that all non-zero coefficients lie in
// the upper-left 4x4 area, so rows 4..15 of the input are never read.
//
// input:  coefficients, row-major with 16 values per row; rows 0..3 are read.
// dest:   destination pixels; consecutive rows are `stride` elements apart.
// bd:     bit depth, forwarded to vpx_highbd_idct16_c and
//         highbd_clip_pixel_add.
void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint16_t *dest,
                                   int stride, int bd) {
  // Rows 4..15 are never written by the row pass and stay zero from the
  // initializer.
  tran_low_t row_pass[16 * 16] = { 0 };
  tran_low_t column[16], column_out[16];
  int row, col;

  // Row pass over the first four rows only.
  for (row = 0; row < 4; ++row) {
    vpx_highbd_idct16_c(&input[16 * row], &row_pass[16 * row], bd);
  }

  // Column pass: gather one column, transform it, and accumulate into dest.
  for (col = 0; col < 16; ++col) {
    for (row = 0; row < 16; ++row) column[row] = row_pass[16 * row + col];
    vpx_highbd_idct16_c(column, column_out, bd);
    for (row = 0; row < 16; ++row) {
      uint16_t *const pixel = &dest[row * stride + col];
      *pixel = highbd_clip_pixel_add(
          *pixel, ROUND_POWER_OF_TWO(column_out[row], 6), bd);
    }
  }
}
2155
vpx_highbd_idct16x16_1_add_c(const tran_low_t * input,uint16_t * dest,int stride,int bd)2156 void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint16_t *dest,
2157 int stride, int bd) {
2158 int i, j;
2159 tran_high_t a1;
2160 tran_low_t out = HIGHBD_WRAPLOW(
2161 dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd);
2162
2163 out =
2164 HIGHBD_WRAPLOW(dct_const_round_shift(out * (tran_high_t)cospi_16_64), bd);
2165 a1 = ROUND_POWER_OF_TWO(out, 6);
2166 for (j = 0; j < 16; ++j) {
2167 for (i = 0; i < 16; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
2168 dest += stride;
2169 }
2170 }
2171
highbd_idct32_c(const tran_low_t * input,tran_low_t * output,int bd)2172 static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output,
2173 int bd) {
2174 tran_low_t step1[32], step2[32];
2175 tran_high_t temp1, temp2;
2176 (void)bd;
2177
2178 if (detect_invalid_highbd_input(input, 32)) {
2179 #if CONFIG_COEFFICIENT_RANGE_CHECKING
2180 assert(0 && "invalid highbd txfm input");
2181 #endif // CONFIG_COEFFICIENT_RANGE_CHECKING
2182 memset(output, 0, sizeof(*output) * 32);
2183 return;
2184 }
2185
2186 // stage 1
2187 step1[0] = input[0];
2188 step1[1] = input[16];
2189 step1[2] = input[8];
2190 step1[3] = input[24];
2191 step1[4] = input[4];
2192 step1[5] = input[20];
2193 step1[6] = input[12];
2194 step1[7] = input[28];
2195 step1[8] = input[2];
2196 step1[9] = input[18];
2197 step1[10] = input[10];
2198 step1[11] = input[26];
2199 step1[12] = input[6];
2200 step1[13] = input[22];
2201 step1[14] = input[14];
2202 step1[15] = input[30];
2203
2204 temp1 =
2205 input[1] * (tran_high_t)cospi_31_64 - input[31] * (tran_high_t)cospi_1_64;
2206 temp2 =
2207 input[1] * (tran_high_t)cospi_1_64 + input[31] * (tran_high_t)cospi_31_64;
2208 step1[16] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2209 step1[31] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2210
2211 temp1 = input[17] * (tran_high_t)cospi_15_64 -
2212 input[15] * (tran_high_t)cospi_17_64;
2213 temp2 = input[17] * (tran_high_t)cospi_17_64 +
2214 input[15] * (tran_high_t)cospi_15_64;
2215 step1[17] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2216 step1[30] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2217
2218 temp1 =
2219 input[9] * (tran_high_t)cospi_23_64 - input[23] * (tran_high_t)cospi_9_64;
2220 temp2 =
2221 input[9] * (tran_high_t)cospi_9_64 + input[23] * (tran_high_t)cospi_23_64;
2222 step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2223 step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2224
2225 temp1 =
2226 input[25] * (tran_high_t)cospi_7_64 - input[7] * (tran_high_t)cospi_25_64;
2227 temp2 =
2228 input[25] * (tran_high_t)cospi_25_64 + input[7] * (tran_high_t)cospi_7_64;
2229 step1[19] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2230 step1[28] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2231
2232 temp1 =
2233 input[5] * (tran_high_t)cospi_27_64 - input[27] * (tran_high_t)cospi_5_64;
2234 temp2 =
2235 input[5] * (tran_high_t)cospi_5_64 + input[27] * (tran_high_t)cospi_27_64;
2236 step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2237 step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2238
2239 temp1 = input[21] * (tran_high_t)cospi_11_64 -
2240 input[11] * (tran_high_t)cospi_21_64;
2241 temp2 = input[21] * (tran_high_t)cospi_21_64 +
2242 input[11] * (tran_high_t)cospi_11_64;
2243 step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2244 step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2245
2246 temp1 = input[13] * (tran_high_t)cospi_19_64 -
2247 input[19] * (tran_high_t)cospi_13_64;
2248 temp2 = input[13] * (tran_high_t)cospi_13_64 +
2249 input[19] * (tran_high_t)cospi_19_64;
2250 step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2251 step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2252
2253 temp1 =
2254 input[29] * (tran_high_t)cospi_3_64 - input[3] * (tran_high_t)cospi_29_64;
2255 temp2 =
2256 input[29] * (tran_high_t)cospi_29_64 + input[3] * (tran_high_t)cospi_3_64;
2257 step1[23] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2258 step1[24] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2259
2260 // stage 2
2261 step2[0] = step1[0];
2262 step2[1] = step1[1];
2263 step2[2] = step1[2];
2264 step2[3] = step1[3];
2265 step2[4] = step1[4];
2266 step2[5] = step1[5];
2267 step2[6] = step1[6];
2268 step2[7] = step1[7];
2269
2270 temp1 =
2271 step1[8] * (tran_high_t)cospi_30_64 - step1[15] * (tran_high_t)cospi_2_64;
2272 temp2 =
2273 step1[8] * (tran_high_t)cospi_2_64 + step1[15] * (tran_high_t)cospi_30_64;
2274 step2[8] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2275 step2[15] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2276
2277 temp1 = step1[9] * (tran_high_t)cospi_14_64 -
2278 step1[14] * (tran_high_t)cospi_18_64;
2279 temp2 = step1[9] * (tran_high_t)cospi_18_64 +
2280 step1[14] * (tran_high_t)cospi_14_64;
2281 step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2282 step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2283
2284 temp1 = step1[10] * (tran_high_t)cospi_22_64 -
2285 step1[13] * (tran_high_t)cospi_10_64;
2286 temp2 = step1[10] * (tran_high_t)cospi_10_64 +
2287 step1[13] * (tran_high_t)cospi_22_64;
2288 step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2289 step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2290
2291 temp1 = step1[11] * (tran_high_t)cospi_6_64 -
2292 step1[12] * (tran_high_t)cospi_26_64;
2293 temp2 = step1[11] * (tran_high_t)cospi_26_64 +
2294 step1[12] * (tran_high_t)cospi_6_64;
2295 step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2296 step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2297
2298 step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[17], bd);
2299 step2[17] = HIGHBD_WRAPLOW(step1[16] - step1[17], bd);
2300 step2[18] = HIGHBD_WRAPLOW(-step1[18] + step1[19], bd);
2301 step2[19] = HIGHBD_WRAPLOW(step1[18] + step1[19], bd);
2302 step2[20] = HIGHBD_WRAPLOW(step1[20] + step1[21], bd);
2303 step2[21] = HIGHBD_WRAPLOW(step1[20] - step1[21], bd);
2304 step2[22] = HIGHBD_WRAPLOW(-step1[22] + step1[23], bd);
2305 step2[23] = HIGHBD_WRAPLOW(step1[22] + step1[23], bd);
2306 step2[24] = HIGHBD_WRAPLOW(step1[24] + step1[25], bd);
2307 step2[25] = HIGHBD_WRAPLOW(step1[24] - step1[25], bd);
2308 step2[26] = HIGHBD_WRAPLOW(-step1[26] + step1[27], bd);
2309 step2[27] = HIGHBD_WRAPLOW(step1[26] + step1[27], bd);
2310 step2[28] = HIGHBD_WRAPLOW(step1[28] + step1[29], bd);
2311 step2[29] = HIGHBD_WRAPLOW(step1[28] - step1[29], bd);
2312 step2[30] = HIGHBD_WRAPLOW(-step1[30] + step1[31], bd);
2313 step2[31] = HIGHBD_WRAPLOW(step1[30] + step1[31], bd);
2314
2315 // stage 3
2316 step1[0] = step2[0];
2317 step1[1] = step2[1];
2318 step1[2] = step2[2];
2319 step1[3] = step2[3];
2320
2321 temp1 =
2322 step2[4] * (tran_high_t)cospi_28_64 - step2[7] * (tran_high_t)cospi_4_64;
2323 temp2 =
2324 step2[4] * (tran_high_t)cospi_4_64 + step2[7] * (tran_high_t)cospi_28_64;
2325 step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2326 step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2327 temp1 =
2328 step2[5] * (tran_high_t)cospi_12_64 - step2[6] * (tran_high_t)cospi_20_64;
2329 temp2 =
2330 step2[5] * (tran_high_t)cospi_20_64 + step2[6] * (tran_high_t)cospi_12_64;
2331 step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2332 step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2333
2334 step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd);
2335 step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd);
2336 step1[10] = HIGHBD_WRAPLOW(-step2[10] + step2[11], bd);
2337 step1[11] = HIGHBD_WRAPLOW(step2[10] + step2[11], bd);
2338 step1[12] = HIGHBD_WRAPLOW(step2[12] + step2[13], bd);
2339 step1[13] = HIGHBD_WRAPLOW(step2[12] - step2[13], bd);
2340 step1[14] = HIGHBD_WRAPLOW(-step2[14] + step2[15], bd);
2341 step1[15] = HIGHBD_WRAPLOW(step2[14] + step2[15], bd);
2342
2343 step1[16] = step2[16];
2344 step1[31] = step2[31];
2345 temp1 = -step2[17] * (tran_high_t)cospi_4_64 +
2346 step2[30] * (tran_high_t)cospi_28_64;
2347 temp2 = step2[17] * (tran_high_t)cospi_28_64 +
2348 step2[30] * (tran_high_t)cospi_4_64;
2349 step1[17] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2350 step1[30] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2351 temp1 = -step2[18] * (tran_high_t)cospi_28_64 -
2352 step2[29] * (tran_high_t)cospi_4_64;
2353 temp2 = -step2[18] * (tran_high_t)cospi_4_64 +
2354 step2[29] * (tran_high_t)cospi_28_64;
2355 step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2356 step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2357 step1[19] = step2[19];
2358 step1[20] = step2[20];
2359 temp1 = -step2[21] * (tran_high_t)cospi_20_64 +
2360 step2[26] * (tran_high_t)cospi_12_64;
2361 temp2 = step2[21] * (tran_high_t)cospi_12_64 +
2362 step2[26] * (tran_high_t)cospi_20_64;
2363 step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2364 step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2365 temp1 = -step2[22] * (tran_high_t)cospi_12_64 -
2366 step2[25] * (tran_high_t)cospi_20_64;
2367 temp2 = -step2[22] * (tran_high_t)cospi_20_64 +
2368 step2[25] * (tran_high_t)cospi_12_64;
2369 step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2370 step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2371 step1[23] = step2[23];
2372 step1[24] = step2[24];
2373 step1[27] = step2[27];
2374 step1[28] = step2[28];
2375
2376 // stage 4
2377 temp1 = (step1[0] + step1[1]) * (tran_high_t)cospi_16_64;
2378 temp2 = (step1[0] - step1[1]) * (tran_high_t)cospi_16_64;
2379 step2[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2380 step2[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2381 temp1 =
2382 step1[2] * (tran_high_t)cospi_24_64 - step1[3] * (tran_high_t)cospi_8_64;
2383 temp2 =
2384 step1[2] * (tran_high_t)cospi_8_64 + step1[3] * (tran_high_t)cospi_24_64;
2385 step2[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2386 step2[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2387 step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
2388 step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
2389 step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
2390 step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);
2391
2392 step2[8] = step1[8];
2393 step2[15] = step1[15];
2394 temp1 = -step1[9] * (tran_high_t)cospi_8_64 +
2395 step1[14] * (tran_high_t)cospi_24_64;
2396 temp2 =
2397 step1[9] * (tran_high_t)cospi_24_64 + step1[14] * (tran_high_t)cospi_8_64;
2398 step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2399 step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2400 temp1 = -step1[10] * (tran_high_t)cospi_24_64 -
2401 step1[13] * (tran_high_t)cospi_8_64;
2402 temp2 = -step1[10] * (tran_high_t)cospi_8_64 +
2403 step1[13] * (tran_high_t)cospi_24_64;
2404 step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2405 step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2406 step2[11] = step1[11];
2407 step2[12] = step1[12];
2408
2409 step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[19], bd);
2410 step2[17] = HIGHBD_WRAPLOW(step1[17] + step1[18], bd);
2411 step2[18] = HIGHBD_WRAPLOW(step1[17] - step1[18], bd);
2412 step2[19] = HIGHBD_WRAPLOW(step1[16] - step1[19], bd);
2413 step2[20] = HIGHBD_WRAPLOW(-step1[20] + step1[23], bd);
2414 step2[21] = HIGHBD_WRAPLOW(-step1[21] + step1[22], bd);
2415 step2[22] = HIGHBD_WRAPLOW(step1[21] + step1[22], bd);
2416 step2[23] = HIGHBD_WRAPLOW(step1[20] + step1[23], bd);
2417
2418 step2[24] = HIGHBD_WRAPLOW(step1[24] + step1[27], bd);
2419 step2[25] = HIGHBD_WRAPLOW(step1[25] + step1[26], bd);
2420 step2[26] = HIGHBD_WRAPLOW(step1[25] - step1[26], bd);
2421 step2[27] = HIGHBD_WRAPLOW(step1[24] - step1[27], bd);
2422 step2[28] = HIGHBD_WRAPLOW(-step1[28] + step1[31], bd);
2423 step2[29] = HIGHBD_WRAPLOW(-step1[29] + step1[30], bd);
2424 step2[30] = HIGHBD_WRAPLOW(step1[29] + step1[30], bd);
2425 step2[31] = HIGHBD_WRAPLOW(step1[28] + step1[31], bd);
2426
2427 // stage 5
2428 step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[3], bd);
2429 step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[2], bd);
2430 step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd);
2431 step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd);
2432 step1[4] = step2[4];
2433 temp1 = (step2[6] - step2[5]) * (tran_high_t)cospi_16_64;
2434 temp2 = (step2[5] + step2[6]) * (tran_high_t)cospi_16_64;
2435 step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2436 step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2437 step1[7] = step2[7];
2438
2439 step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd);
2440 step1[9] = HIGHBD_WRAPLOW(step2[9] + step2[10], bd);
2441 step1[10] = HIGHBD_WRAPLOW(step2[9] - step2[10], bd);
2442 step1[11] = HIGHBD_WRAPLOW(step2[8] - step2[11], bd);
2443 step1[12] = HIGHBD_WRAPLOW(-step2[12] + step2[15], bd);
2444 step1[13] = HIGHBD_WRAPLOW(-step2[13] + step2[14], bd);
2445 step1[14] = HIGHBD_WRAPLOW(step2[13] + step2[14], bd);
2446 step1[15] = HIGHBD_WRAPLOW(step2[12] + step2[15], bd);
2447
2448 step1[16] = step2[16];
2449 step1[17] = step2[17];
2450 temp1 = -step2[18] * (tran_high_t)cospi_8_64 +
2451 step2[29] * (tran_high_t)cospi_24_64;
2452 temp2 = step2[18] * (tran_high_t)cospi_24_64 +
2453 step2[29] * (tran_high_t)cospi_8_64;
2454 step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2455 step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2456 temp1 = -step2[19] * (tran_high_t)cospi_8_64 +
2457 step2[28] * (tran_high_t)cospi_24_64;
2458 temp2 = step2[19] * (tran_high_t)cospi_24_64 +
2459 step2[28] * (tran_high_t)cospi_8_64;
2460 step1[19] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2461 step1[28] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2462 temp1 = -step2[20] * (tran_high_t)cospi_24_64 -
2463 step2[27] * (tran_high_t)cospi_8_64;
2464 temp2 = -step2[20] * (tran_high_t)cospi_8_64 +
2465 step2[27] * (tran_high_t)cospi_24_64;
2466 step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2467 step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2468 temp1 = -step2[21] * (tran_high_t)cospi_24_64 -
2469 step2[26] * (tran_high_t)cospi_8_64;
2470 temp2 = -step2[21] * (tran_high_t)cospi_8_64 +
2471 step2[26] * (tran_high_t)cospi_24_64;
2472 step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2473 step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2474 step1[22] = step2[22];
2475 step1[23] = step2[23];
2476 step1[24] = step2[24];
2477 step1[25] = step2[25];
2478 step1[30] = step2[30];
2479 step1[31] = step2[31];
2480
2481 // stage 6
2482 step2[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
2483 step2[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
2484 step2[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
2485 step2[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
2486 step2[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
2487 step2[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
2488 step2[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
2489 step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
2490 step2[8] = step1[8];
2491 step2[9] = step1[9];
2492 temp1 = (-step1[10] + step1[13]) * (tran_high_t)cospi_16_64;
2493 temp2 = (step1[10] + step1[13]) * (tran_high_t)cospi_16_64;
2494 step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2495 step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2496 temp1 = (-step1[11] + step1[12]) * (tran_high_t)cospi_16_64;
2497 temp2 = (step1[11] + step1[12]) * (tran_high_t)cospi_16_64;
2498 step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2499 step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2500 step2[14] = step1[14];
2501 step2[15] = step1[15];
2502
2503 step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[23], bd);
2504 step2[17] = HIGHBD_WRAPLOW(step1[17] + step1[22], bd);
2505 step2[18] = HIGHBD_WRAPLOW(step1[18] + step1[21], bd);
2506 step2[19] = HIGHBD_WRAPLOW(step1[19] + step1[20], bd);
2507 step2[20] = HIGHBD_WRAPLOW(step1[19] - step1[20], bd);
2508 step2[21] = HIGHBD_WRAPLOW(step1[18] - step1[21], bd);
2509 step2[22] = HIGHBD_WRAPLOW(step1[17] - step1[22], bd);
2510 step2[23] = HIGHBD_WRAPLOW(step1[16] - step1[23], bd);
2511
2512 step2[24] = HIGHBD_WRAPLOW(-step1[24] + step1[31], bd);
2513 step2[25] = HIGHBD_WRAPLOW(-step1[25] + step1[30], bd);
2514 step2[26] = HIGHBD_WRAPLOW(-step1[26] + step1[29], bd);
2515 step2[27] = HIGHBD_WRAPLOW(-step1[27] + step1[28], bd);
2516 step2[28] = HIGHBD_WRAPLOW(step1[27] + step1[28], bd);
2517 step2[29] = HIGHBD_WRAPLOW(step1[26] + step1[29], bd);
2518 step2[30] = HIGHBD_WRAPLOW(step1[25] + step1[30], bd);
2519 step2[31] = HIGHBD_WRAPLOW(step1[24] + step1[31], bd);
2520
2521 // stage 7
2522 step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[15], bd);
2523 step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[14], bd);
2524 step1[2] = HIGHBD_WRAPLOW(step2[2] + step2[13], bd);
2525 step1[3] = HIGHBD_WRAPLOW(step2[3] + step2[12], bd);
2526 step1[4] = HIGHBD_WRAPLOW(step2[4] + step2[11], bd);
2527 step1[5] = HIGHBD_WRAPLOW(step2[5] + step2[10], bd);
2528 step1[6] = HIGHBD_WRAPLOW(step2[6] + step2[9], bd);
2529 step1[7] = HIGHBD_WRAPLOW(step2[7] + step2[8], bd);
2530 step1[8] = HIGHBD_WRAPLOW(step2[7] - step2[8], bd);
2531 step1[9] = HIGHBD_WRAPLOW(step2[6] - step2[9], bd);
2532 step1[10] = HIGHBD_WRAPLOW(step2[5] - step2[10], bd);
2533 step1[11] = HIGHBD_WRAPLOW(step2[4] - step2[11], bd);
2534 step1[12] = HIGHBD_WRAPLOW(step2[3] - step2[12], bd);
2535 step1[13] = HIGHBD_WRAPLOW(step2[2] - step2[13], bd);
2536 step1[14] = HIGHBD_WRAPLOW(step2[1] - step2[14], bd);
2537 step1[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd);
2538
2539 step1[16] = step2[16];
2540 step1[17] = step2[17];
2541 step1[18] = step2[18];
2542 step1[19] = step2[19];
2543 temp1 = (-step2[20] + step2[27]) * (tran_high_t)cospi_16_64;
2544 temp2 = (step2[20] + step2[27]) * (tran_high_t)cospi_16_64;
2545 step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2546 step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2547 temp1 = (-step2[21] + step2[26]) * (tran_high_t)cospi_16_64;
2548 temp2 = (step2[21] + step2[26]) * (tran_high_t)cospi_16_64;
2549 step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2550 step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2551 temp1 = (-step2[22] + step2[25]) * (tran_high_t)cospi_16_64;
2552 temp2 = (step2[22] + step2[25]) * (tran_high_t)cospi_16_64;
2553 step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2554 step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2555 temp1 = (-step2[23] + step2[24]) * (tran_high_t)cospi_16_64;
2556 temp2 = (step2[23] + step2[24]) * (tran_high_t)cospi_16_64;
2557 step1[23] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2558 step1[24] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2559 step1[28] = step2[28];
2560 step1[29] = step2[29];
2561 step1[30] = step2[30];
2562 step1[31] = step2[31];
2563
2564 // final stage
2565 output[0] = HIGHBD_WRAPLOW(step1[0] + step1[31], bd);
2566 output[1] = HIGHBD_WRAPLOW(step1[1] + step1[30], bd);
2567 output[2] = HIGHBD_WRAPLOW(step1[2] + step1[29], bd);
2568 output[3] = HIGHBD_WRAPLOW(step1[3] + step1[28], bd);
2569 output[4] = HIGHBD_WRAPLOW(step1[4] + step1[27], bd);
2570 output[5] = HIGHBD_WRAPLOW(step1[5] + step1[26], bd);
2571 output[6] = HIGHBD_WRAPLOW(step1[6] + step1[25], bd);
2572 output[7] = HIGHBD_WRAPLOW(step1[7] + step1[24], bd);
2573 output[8] = HIGHBD_WRAPLOW(step1[8] + step1[23], bd);
2574 output[9] = HIGHBD_WRAPLOW(step1[9] + step1[22], bd);
2575 output[10] = HIGHBD_WRAPLOW(step1[10] + step1[21], bd);
2576 output[11] = HIGHBD_WRAPLOW(step1[11] + step1[20], bd);
2577 output[12] = HIGHBD_WRAPLOW(step1[12] + step1[19], bd);
2578 output[13] = HIGHBD_WRAPLOW(step1[13] + step1[18], bd);
2579 output[14] = HIGHBD_WRAPLOW(step1[14] + step1[17], bd);
2580 output[15] = HIGHBD_WRAPLOW(step1[15] + step1[16], bd);
2581 output[16] = HIGHBD_WRAPLOW(step1[15] - step1[16], bd);
2582 output[17] = HIGHBD_WRAPLOW(step1[14] - step1[17], bd);
2583 output[18] = HIGHBD_WRAPLOW(step1[13] - step1[18], bd);
2584 output[19] = HIGHBD_WRAPLOW(step1[12] - step1[19], bd);
2585 output[20] = HIGHBD_WRAPLOW(step1[11] - step1[20], bd);
2586 output[21] = HIGHBD_WRAPLOW(step1[10] - step1[21], bd);
2587 output[22] = HIGHBD_WRAPLOW(step1[9] - step1[22], bd);
2588 output[23] = HIGHBD_WRAPLOW(step1[8] - step1[23], bd);
2589 output[24] = HIGHBD_WRAPLOW(step1[7] - step1[24], bd);
2590 output[25] = HIGHBD_WRAPLOW(step1[6] - step1[25], bd);
2591 output[26] = HIGHBD_WRAPLOW(step1[5] - step1[26], bd);
2592 output[27] = HIGHBD_WRAPLOW(step1[4] - step1[27], bd);
2593 output[28] = HIGHBD_WRAPLOW(step1[3] - step1[28], bd);
2594 output[29] = HIGHBD_WRAPLOW(step1[2] - step1[29], bd);
2595 output[30] = HIGHBD_WRAPLOW(step1[1] - step1[30], bd);
2596 output[31] = HIGHBD_WRAPLOW(step1[0] - step1[31], bd);
2597 }
2598
vpx_highbd_idct32x32_1024_add_c(const tran_low_t * input,uint16_t * dest,int stride,int bd)2599 void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint16_t *dest,
2600 int stride, int bd) {
2601 int i, j;
2602 tran_low_t out[32 * 32];
2603 tran_low_t *outptr = out;
2604 tran_low_t temp_in[32], temp_out[32];
2605
2606 // Rows
2607 for (i = 0; i < 32; ++i) {
2608 tran_low_t zero_coeff = 0;
2609 for (j = 0; j < 32; ++j) zero_coeff |= input[j];
2610
2611 if (zero_coeff)
2612 highbd_idct32_c(input, outptr, bd);
2613 else
2614 memset(outptr, 0, sizeof(tran_low_t) * 32);
2615 input += 32;
2616 outptr += 32;
2617 }
2618
2619 // Columns
2620 for (i = 0; i < 32; ++i) {
2621 for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
2622 highbd_idct32_c(temp_in, temp_out, bd);
2623 for (j = 0; j < 32; ++j) {
2624 dest[j * stride + i] = highbd_clip_pixel_add(
2625 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
2626 }
2627 }
2628 }
2629
// 32x32 high-bitdepth inverse transform for blocks where only the upper-left
// 16x16 coefficients are non-zero (caller's contract, per the note below);
// the result is added onto dest.
//
// input  : 32x32 coefficients; only the first 16 rows are read.
// dest   : destination samples, walked row by row using stride.
// stride : distance, in uint16_t elements, between destination rows.
// bd     : bit depth, forwarded to highbd_idct32_c() and
//          highbd_clip_pixel_add().
void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint16_t *dest,
                                    int stride, int bd) {
  // Rows 16..31 of the intermediate block are never written by the row pass,
  // so they must start out as zero.
  tran_low_t row_pass[32 * 32] = { 0 };
  tran_low_t col_in[32], col_out[32];
  int row, col, k;

  // Row pass.
  // Only upper-left 16x16 has non-zero coeff
  for (row = 0; row < 16; ++row) {
    highbd_idct32_c(input + 32 * row, row_pass + 32 * row, bd);
  }

  // Column pass: transform each column, round away 6 bits and fold the result
  // into the matching destination column.
  for (col = 0; col < 32; ++col) {
    uint16_t *px = dest + col;

    for (k = 0; k < 32; ++k) col_in[k] = row_pass[k * 32 + col];

    highbd_idct32_c(col_in, col_out, bd);

    for (k = 0; k < 32; ++k) {
      *px = highbd_clip_pixel_add(*px, ROUND_POWER_OF_TWO(col_out[k], 6), bd);
      px += stride;
    }
  }
}
2657
// 32x32 high-bitdepth inverse transform for blocks where only the upper-left
// 8x8 coefficients are non-zero (caller's contract, per the note below); the
// result is added onto dest.
//
// input  : 32x32 coefficients; only the first 8 rows are read.
// dest   : destination samples, addressed as dest[row * stride + col].
// stride : distance, in uint16_t elements, between destination rows.
// bd     : bit depth, forwarded to highbd_idct32_c() and
//          highbd_clip_pixel_add().
void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint16_t *dest,
                                   int stride, int bd) {
  // Rows 8..31 of the intermediate block are never written by the row pass,
  // so they must start out as zero.
  tran_low_t row_pass[32 * 32] = { 0 };
  tran_low_t col_in[32], col_out[32];
  int row, col, k;

  // Row pass.
  // Only upper-left 8x8 has non-zero coeff
  for (row = 0; row < 8; ++row) {
    highbd_idct32_c(input + 32 * row, row_pass + 32 * row, bd);
  }

  // Column pass: transform each column, round away 6 bits and fold the result
  // into the matching destination column.
  for (col = 0; col < 32; ++col) {
    for (k = 0; k < 32; ++k) col_in[k] = row_pass[k * 32 + col];

    highbd_idct32_c(col_in, col_out, bd);

    for (k = 0; k < 32; ++k) {
      const int pos = k * stride + col;
      dest[pos] = highbd_clip_pixel_add(dest[pos],
                                        ROUND_POWER_OF_TWO(col_out[k], 6), bd);
    }
  }
}
2683
vpx_highbd_idct32x32_1_add_c(const tran_low_t * input,uint16_t * dest,int stride,int bd)2684 void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint16_t *dest,
2685 int stride, int bd) {
2686 int i, j;
2687 int a1;
2688 tran_low_t out = HIGHBD_WRAPLOW(
2689 dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd);
2690
2691 out =
2692 HIGHBD_WRAPLOW(dct_const_round_shift(out * (tran_high_t)cospi_16_64), bd);
2693 a1 = ROUND_POWER_OF_TWO(out, 6);
2694
2695 for (j = 0; j < 32; ++j) {
2696 for (i = 0; i < 32; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
2697 dest += stride;
2698 }
2699 }
2700
2701 #endif // CONFIG_VP9_HIGHBITDEPTH
2702