1 /*
2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include <math.h>
12 #include <stdlib.h>
13 #include <string.h>
14
15 #include "./vpx_dsp_rtcd.h"
16 #include "vpx_dsp/inv_txfm.h"
17
// Inverse 4x4 Walsh-Hadamard transform (all 16 coefficients), used for the
// lossless coding mode.  The transformed residual is added to the 4x4
// destination block with clipping to the valid pixel range.
void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
     0.5 shifts per pixel. */
  int i;
  tran_low_t output[16];  // holds the result of the row pass
  tran_high_t a1, b1, c1, d1, e1;
  const tran_low_t *ip = input;
  tran_low_t *op = output;

  // Row pass: one 4-point inverse WHT per row.  Inputs carry an extra
  // UNIT_QUANT_SHIFT of precision which is removed up front.
  for (i = 0; i < 4; i++) {
    a1 = ip[0] >> UNIT_QUANT_SHIFT;
    c1 = ip[1] >> UNIT_QUANT_SHIFT;
    d1 = ip[2] >> UNIT_QUANT_SHIFT;
    b1 = ip[3] >> UNIT_QUANT_SHIFT;
    a1 += c1;
    d1 -= b1;
    e1 = (a1 - d1) >> 1;  // shared intermediate of the reversible butterfly
    b1 = e1 - b1;
    c1 = e1 - c1;
    a1 -= b1;
    d1 += c1;
    op[0] = WRAPLOW(a1);
    op[1] = WRAPLOW(b1);
    op[2] = WRAPLOW(c1);
    op[3] = WRAPLOW(d1);
    ip += 4;
    op += 4;
  }

  // Column pass: the same butterfly applied down each column of the row-pass
  // result, with the output added directly into the destination pixels.
  ip = output;
  for (i = 0; i < 4; i++) {
    a1 = ip[4 * 0];
    c1 = ip[4 * 1];
    d1 = ip[4 * 2];
    b1 = ip[4 * 3];
    a1 += c1;
    d1 -= b1;
    e1 = (a1 - d1) >> 1;
    b1 = e1 - b1;
    c1 = e1 - c1;
    a1 -= b1;
    d1 += c1;
    dest[stride * 0] = clip_pixel_add(dest[stride * 0], WRAPLOW(a1));
    dest[stride * 1] = clip_pixel_add(dest[stride * 1], WRAPLOW(b1));
    dest[stride * 2] = clip_pixel_add(dest[stride * 2], WRAPLOW(c1));
    dest[stride * 3] = clip_pixel_add(dest[stride * 3], WRAPLOW(d1));

    ip++;
    dest++;
  }
}
69
// Inverse 4x4 Walsh-Hadamard transform when only the DC coefficient is
// non-zero (lossless mode).  Adds the reconstructed residual to dest.
void vpx_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int stride) {
  int col;
  tran_high_t a1, e1;
  tran_low_t row_out[4];

  // Row pass collapses to a single split of the DC term.
  a1 = in[0] >> UNIT_QUANT_SHIFT;
  e1 = a1 >> 1;
  a1 -= e1;
  row_out[0] = WRAPLOW(a1);
  row_out[1] = row_out[2] = row_out[3] = WRAPLOW(e1);

  // Column pass: split each row-pass value the same way and add the four
  // resulting samples down the column.
  for (col = 0; col < 4; ++col) {
    e1 = row_out[col] >> 1;
    a1 = row_out[col] - e1;
    dest[col + stride * 0] = clip_pixel_add(dest[col + stride * 0], a1);
    dest[col + stride * 1] = clip_pixel_add(dest[col + stride * 1], e1);
    dest[col + stride * 2] = clip_pixel_add(dest[col + stride * 2], e1);
    dest[col + stride * 3] = clip_pixel_add(dest[col + stride * 3], e1);
  }
}
95
// 4-point inverse ADST (asymmetric discrete sine transform), the 1-D kernel
// used by hybrid 4x4 inverse transforms.
void iadst4_c(const tran_low_t *input, tran_low_t *output) {
  tran_high_t t0, t1, t2, t3, t4, t5, t6, t7;
  tran_low_t a = input[0];
  tran_low_t b = input[1];
  tran_low_t c = input[2];
  tran_low_t d = input[3];

  // All-zero input yields an all-zero output; skip the arithmetic entirely.
  if (!(a | b | c | d)) {
    memset(output, 0, 4 * sizeof(*output));
    return;
  }

  t0 = sinpi_1_9 * a;
  t1 = sinpi_2_9 * a;
  t2 = sinpi_3_9 * b;
  t3 = sinpi_4_9 * c;
  t4 = sinpi_1_9 * c;
  t5 = sinpi_2_9 * d;
  t6 = sinpi_4_9 * d;
  t7 = WRAPLOW(a - c + d);

  t0 = t0 + t3 + t5;
  t1 = t1 - t4 - t6;
  t3 = t2;
  t2 = sinpi_3_9 * t7;

  // 1-D transform scaling factor is sqrt(2).
  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
  // + 1b (addition) = 29b, so the rounded output fits in 15 bits.
  output[0] = WRAPLOW(dct_const_round_shift(t0 + t3));
  output[1] = WRAPLOW(dct_const_round_shift(t1 + t3));
  output[2] = WRAPLOW(dct_const_round_shift(t2));
  output[3] = WRAPLOW(dct_const_round_shift(t0 + t1 - t3));
}
131
// 4-point inverse DCT, the 1-D kernel used by the 4x4 inverse transform.
void idct4_c(const tran_low_t *input, tran_low_t *output) {
  tran_high_t t1, t2;
  tran_low_t s[4];

  // Stage 1: rotations on the even (0,2) and odd (1,3) coefficient pairs.
  t1 = (input[0] + input[2]) * cospi_16_64;
  t2 = (input[0] - input[2]) * cospi_16_64;
  s[0] = WRAPLOW(dct_const_round_shift(t1));
  s[1] = WRAPLOW(dct_const_round_shift(t2));
  t1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
  t2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
  s[2] = WRAPLOW(dct_const_round_shift(t1));
  s[3] = WRAPLOW(dct_const_round_shift(t2));

  // Stage 2: final butterfly combining the two halves.
  output[0] = WRAPLOW(s[0] + s[3]);
  output[1] = WRAPLOW(s[1] + s[2]);
  output[2] = WRAPLOW(s[1] - s[2]);
  output[3] = WRAPLOW(s[0] - s[3]);
}
152
// 2-D 4x4 inverse DCT (all 16 coefficients), row/column separable.
// The result is rounded, shifted down by 4, and added to dest with clipping.
void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  int row, col, k;
  tran_low_t intermediate[4 * 4];  // row-pass output, row-major
  tran_low_t col_in[4], col_out[4];

  // Pass 1: 4-point IDCT over each row of coefficients.
  for (row = 0; row < 4; ++row) {
    idct4_c(&input[row * 4], &intermediate[row * 4]);
  }

  // Pass 2: 4-point IDCT down each column, then round/shift and accumulate
  // into the destination pixels.
  for (col = 0; col < 4; ++col) {
    for (k = 0; k < 4; ++k) col_in[k] = intermediate[k * 4 + col];
    idct4_c(col_in, col_out);
    for (k = 0; k < 4; ++k) {
      dest[k * stride + col] = clip_pixel_add(
          dest[k * stride + col], ROUND_POWER_OF_TWO(col_out[k], 4));
    }
  }
}
176
// 4x4 inverse DCT fast path for a DC-only coefficient block: the whole
// residual is one constant, added uniformly to the 4x4 destination.
void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  int r, c;
  tran_high_t dc;
  tran_low_t t = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));

  // Two cospi_16_64 rotations (row and column pass) applied to the DC term.
  t = WRAPLOW(dct_const_round_shift(t * cospi_16_64));
  dc = ROUND_POWER_OF_TWO(t, 4);

  for (r = 0; r < 4; r++) {
    for (c = 0; c < 4; c++) dest[c] = clip_pixel_add(dest[c], dc);
    dest += stride;
  }
}
193
// 8-point inverse ADST (asymmetric discrete sine transform), the 1-D kernel
// used by hybrid 8x8 inverse transforms.  The scattered input[] indexing
// implements the input permutation the ADST flow graph expects.
// NOTE(review): intermediates are truncated to int via explicit casts —
// presumably to mirror the fixed-width SIMD implementations; confirm before
// changing.
void iadst8_c(const tran_low_t *input, tran_low_t *output) {
  int s0, s1, s2, s3, s4, s5, s6, s7;
  tran_high_t x0 = input[7];
  tran_high_t x1 = input[0];
  tran_high_t x2 = input[5];
  tran_high_t x3 = input[2];
  tran_high_t x4 = input[3];
  tran_high_t x5 = input[4];
  tran_high_t x6 = input[1];
  tran_high_t x7 = input[6];

  // All-zero input produces all-zero output; skip the arithmetic.
  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
    memset(output, 0, 8 * sizeof(*output));
    return;
  }

  // stage 1: butterflies with odd cospi rotation pairs.
  s0 = (int)(cospi_2_64 * x0 + cospi_30_64 * x1);
  s1 = (int)(cospi_30_64 * x0 - cospi_2_64 * x1);
  s2 = (int)(cospi_10_64 * x2 + cospi_22_64 * x3);
  s3 = (int)(cospi_22_64 * x2 - cospi_10_64 * x3);
  s4 = (int)(cospi_18_64 * x4 + cospi_14_64 * x5);
  s5 = (int)(cospi_14_64 * x4 - cospi_18_64 * x5);
  s6 = (int)(cospi_26_64 * x6 + cospi_6_64 * x7);
  s7 = (int)(cospi_6_64 * x6 - cospi_26_64 * x7);

  x0 = WRAPLOW(dct_const_round_shift(s0 + s4));
  x1 = WRAPLOW(dct_const_round_shift(s1 + s5));
  x2 = WRAPLOW(dct_const_round_shift(s2 + s6));
  x3 = WRAPLOW(dct_const_round_shift(s3 + s7));
  x4 = WRAPLOW(dct_const_round_shift(s0 - s4));
  x5 = WRAPLOW(dct_const_round_shift(s1 - s5));
  x6 = WRAPLOW(dct_const_round_shift(s2 - s6));
  x7 = WRAPLOW(dct_const_round_shift(s3 - s7));

  // stage 2: first half passes through; second half rotates by cospi_8/24.
  s0 = (int)x0;
  s1 = (int)x1;
  s2 = (int)x2;
  s3 = (int)x3;
  s4 = (int)(cospi_8_64 * x4 + cospi_24_64 * x5);
  s5 = (int)(cospi_24_64 * x4 - cospi_8_64 * x5);
  s6 = (int)(-cospi_24_64 * x6 + cospi_8_64 * x7);
  s7 = (int)(cospi_8_64 * x6 + cospi_24_64 * x7);

  // Pass-through pairs need no rounding shift; rotated pairs do.
  x0 = WRAPLOW(s0 + s2);
  x1 = WRAPLOW(s1 + s3);
  x2 = WRAPLOW(s0 - s2);
  x3 = WRAPLOW(s1 - s3);
  x4 = WRAPLOW(dct_const_round_shift(s4 + s6));
  x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
  x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
  x7 = WRAPLOW(dct_const_round_shift(s5 - s7));

  // stage 3: final cospi_16_64 rotations on the remaining pairs.
  s2 = (int)(cospi_16_64 * (x2 + x3));
  s3 = (int)(cospi_16_64 * (x2 - x3));
  s6 = (int)(cospi_16_64 * (x6 + x7));
  s7 = (int)(cospi_16_64 * (x6 - x7));

  x2 = WRAPLOW(dct_const_round_shift(s2));
  x3 = WRAPLOW(dct_const_round_shift(s3));
  x6 = WRAPLOW(dct_const_round_shift(s6));
  x7 = WRAPLOW(dct_const_round_shift(s7));

  // Output permutation with sign flips, per the ADST flow graph.
  output[0] = WRAPLOW(x0);
  output[1] = WRAPLOW(-x4);
  output[2] = WRAPLOW(x6);
  output[3] = WRAPLOW(-x2);
  output[4] = WRAPLOW(x3);
  output[5] = WRAPLOW(-x7);
  output[6] = WRAPLOW(x5);
  output[7] = WRAPLOW(-x1);
}
268
// 8-point inverse DCT, the 1-D kernel used by the 8x8 inverse transform.
// Implemented as a four-stage butterfly network; each rotation output is
// rounded (dct_const_round_shift) and wrapped to the tran_low_t range.
void idct8_c(const tran_low_t *input, tran_low_t *output) {
  tran_low_t step1[8], step2[8];
  tran_high_t temp1, temp2;

  // stage 1: reorder even coefficients; rotate the odd ones.
  step1[0] = input[0];
  step1[2] = input[4];
  step1[1] = input[2];
  step1[3] = input[6];
  temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
  temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
  step1[4] = WRAPLOW(dct_const_round_shift(temp1));
  step1[7] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
  temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
  step1[6] = WRAPLOW(dct_const_round_shift(temp2));

  // stage 2: 4-point IDCT on the even half; butterflies on the odd half.
  temp1 = (step1[0] + step1[2]) * cospi_16_64;
  temp2 = (step1[0] - step1[2]) * cospi_16_64;
  step2[0] = WRAPLOW(dct_const_round_shift(temp1));
  step2[1] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = step1[1] * cospi_24_64 - step1[3] * cospi_8_64;
  temp2 = step1[1] * cospi_8_64 + step1[3] * cospi_24_64;
  step2[2] = WRAPLOW(dct_const_round_shift(temp1));
  step2[3] = WRAPLOW(dct_const_round_shift(temp2));
  step2[4] = WRAPLOW(step1[4] + step1[5]);
  step2[5] = WRAPLOW(step1[4] - step1[5]);
  step2[6] = WRAPLOW(-step1[6] + step1[7]);
  step2[7] = WRAPLOW(step1[6] + step1[7]);

  // stage 3: combine even half; rotate the middle odd pair by cospi_16_64.
  step1[0] = WRAPLOW(step2[0] + step2[3]);
  step1[1] = WRAPLOW(step2[1] + step2[2]);
  step1[2] = WRAPLOW(step2[1] - step2[2]);
  step1[3] = WRAPLOW(step2[0] - step2[3]);
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
  step1[7] = step2[7];

  // stage 4: final butterfly producing the 8 output samples.
  output[0] = WRAPLOW(step1[0] + step1[7]);
  output[1] = WRAPLOW(step1[1] + step1[6]);
  output[2] = WRAPLOW(step1[2] + step1[5]);
  output[3] = WRAPLOW(step1[3] + step1[4]);
  output[4] = WRAPLOW(step1[3] - step1[4]);
  output[5] = WRAPLOW(step1[2] - step1[5]);
  output[6] = WRAPLOW(step1[1] - step1[6]);
  output[7] = WRAPLOW(step1[0] - step1[7]);
}
323
// 2-D 8x8 inverse DCT (all 64 coefficients), row/column separable.
// The result is rounded, shifted down by 5, and added to dest with clipping.
void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  int row, col, k;
  tran_low_t intermediate[8 * 8];  // row-pass output, row-major
  tran_low_t col_in[8], col_out[8];

  // Pass 1: 8-point IDCT over each row of coefficients.
  for (row = 0; row < 8; ++row) {
    idct8_c(&input[row * 8], &intermediate[row * 8]);
  }

  // Pass 2: 8-point IDCT down each column, then round/shift and accumulate
  // into the destination pixels.
  for (col = 0; col < 8; ++col) {
    for (k = 0; k < 8; ++k) col_in[k] = intermediate[k * 8 + col];
    idct8_c(col_in, col_out);
    for (k = 0; k < 8; ++k) {
      dest[k * stride + col] = clip_pixel_add(
          dest[k * stride + col], ROUND_POWER_OF_TWO(col_out[k], 5));
    }
  }
}
347
// 2-D 8x8 inverse DCT fast path when at most 12 coefficients are non-zero:
// all of them lie in the first 4 rows, so only those rows are transformed.
void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  int row, col, k;
  tran_low_t intermediate[8 * 8] = { 0 };  // rows 4..7 stay zero
  tran_low_t col_in[8], col_out[8];

  // Pass 1: only the first 4 rows carry non-zero coefficients.
  for (row = 0; row < 4; ++row) {
    idct8_c(&input[row * 8], &intermediate[row * 8]);
  }

  // Pass 2: full 8-point IDCT down every column, then round/shift and add.
  for (col = 0; col < 8; ++col) {
    for (k = 0; k < 8; ++k) col_in[k] = intermediate[k * 8 + col];
    idct8_c(col_in, col_out);
    for (k = 0; k < 8; ++k) {
      dest[k * stride + col] = clip_pixel_add(
          dest[k * stride + col], ROUND_POWER_OF_TWO(col_out[k], 5));
    }
  }
}
372
// 8x8 inverse DCT fast path for a DC-only coefficient block: the whole
// residual is one constant, added uniformly to the 8x8 destination.
void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  int r, c;
  tran_high_t dc;
  tran_low_t t = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));

  // Two cospi_16_64 rotations (row and column pass) applied to the DC term.
  t = WRAPLOW(dct_const_round_shift(t * cospi_16_64));
  dc = ROUND_POWER_OF_TWO(t, 5);

  for (r = 0; r < 8; ++r) {
    for (c = 0; c < 8; ++c) dest[c] = clip_pixel_add(dest[c], dc);
    dest += stride;
  }
}
385
// 16-point inverse ADST (asymmetric discrete sine transform), the 1-D kernel
// used by hybrid 16x16 inverse transforms.  The scattered input[] indexing
// implements the input permutation the ADST flow graph expects.
void iadst16_c(const tran_low_t *input, tran_low_t *output) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
  tran_high_t s9, s10, s11, s12, s13, s14, s15;
  tran_high_t x0 = input[15];
  tran_high_t x1 = input[0];
  tran_high_t x2 = input[13];
  tran_high_t x3 = input[2];
  tran_high_t x4 = input[11];
  tran_high_t x5 = input[4];
  tran_high_t x6 = input[9];
  tran_high_t x7 = input[6];
  tran_high_t x8 = input[7];
  tran_high_t x9 = input[8];
  tran_high_t x10 = input[5];
  tran_high_t x11 = input[10];
  tran_high_t x12 = input[3];
  tran_high_t x13 = input[12];
  tran_high_t x14 = input[1];
  tran_high_t x15 = input[14];

  // All-zero input produces all-zero output; skip the arithmetic.
  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 |
        x13 | x14 | x15)) {
    memset(output, 0, 16 * sizeof(*output));
    return;
  }

  // stage 1: butterflies with odd cospi rotation pairs.
  s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
  s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
  s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
  s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
  s15 = x14 * cospi_3_64 - x15 * cospi_29_64;

  x0 = WRAPLOW(dct_const_round_shift(s0 + s8));
  x1 = WRAPLOW(dct_const_round_shift(s1 + s9));
  x2 = WRAPLOW(dct_const_round_shift(s2 + s10));
  x3 = WRAPLOW(dct_const_round_shift(s3 + s11));
  x4 = WRAPLOW(dct_const_round_shift(s4 + s12));
  x5 = WRAPLOW(dct_const_round_shift(s5 + s13));
  x6 = WRAPLOW(dct_const_round_shift(s6 + s14));
  x7 = WRAPLOW(dct_const_round_shift(s7 + s15));
  x8 = WRAPLOW(dct_const_round_shift(s0 - s8));
  x9 = WRAPLOW(dct_const_round_shift(s1 - s9));
  x10 = WRAPLOW(dct_const_round_shift(s2 - s10));
  x11 = WRAPLOW(dct_const_round_shift(s3 - s11));
  x12 = WRAPLOW(dct_const_round_shift(s4 - s12));
  x13 = WRAPLOW(dct_const_round_shift(s5 - s13));
  x14 = WRAPLOW(dct_const_round_shift(s6 - s14));
  x15 = WRAPLOW(dct_const_round_shift(s7 - s15));

  // stage 2: first half passes through; second half rotates by cospi_4/28
  // and cospi_12/20 pairs.
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4;
  s5 = x5;
  s6 = x6;
  s7 = x7;
  s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
  s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
  s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
  s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
  s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
  s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
  s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
  s15 = x14 * cospi_20_64 + x15 * cospi_12_64;

  // Pass-through pairs need no rounding shift; rotated pairs do.
  x0 = WRAPLOW(s0 + s4);
  x1 = WRAPLOW(s1 + s5);
  x2 = WRAPLOW(s2 + s6);
  x3 = WRAPLOW(s3 + s7);
  x4 = WRAPLOW(s0 - s4);
  x5 = WRAPLOW(s1 - s5);
  x6 = WRAPLOW(s2 - s6);
  x7 = WRAPLOW(s3 - s7);
  x8 = WRAPLOW(dct_const_round_shift(s8 + s12));
  x9 = WRAPLOW(dct_const_round_shift(s9 + s13));
  x10 = WRAPLOW(dct_const_round_shift(s10 + s14));
  x11 = WRAPLOW(dct_const_round_shift(s11 + s15));
  x12 = WRAPLOW(dct_const_round_shift(s8 - s12));
  x13 = WRAPLOW(dct_const_round_shift(s9 - s13));
  x14 = WRAPLOW(dct_const_round_shift(s10 - s14));
  x15 = WRAPLOW(dct_const_round_shift(s11 - s15));

  // stage 3: cospi_8/24 rotations on the quarter groups.
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
  s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
  s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
  s8 = x8;
  s9 = x9;
  s10 = x10;
  s11 = x11;
  s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
  s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
  s15 = x14 * cospi_8_64 + x15 * cospi_24_64;

  x0 = WRAPLOW(s0 + s2);
  x1 = WRAPLOW(s1 + s3);
  x2 = WRAPLOW(s0 - s2);
  x3 = WRAPLOW(s1 - s3);
  x4 = WRAPLOW(dct_const_round_shift(s4 + s6));
  x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
  x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
  x7 = WRAPLOW(dct_const_round_shift(s5 - s7));
  x8 = WRAPLOW(s8 + s10);
  x9 = WRAPLOW(s9 + s11);
  x10 = WRAPLOW(s8 - s10);
  x11 = WRAPLOW(s9 - s11);
  x12 = WRAPLOW(dct_const_round_shift(s12 + s14));
  x13 = WRAPLOW(dct_const_round_shift(s13 + s15));
  x14 = WRAPLOW(dct_const_round_shift(s12 - s14));
  x15 = WRAPLOW(dct_const_round_shift(s13 - s15));

  // stage 4: final cospi_16_64 rotations on the remaining pairs.
  s2 = (-cospi_16_64) * (x2 + x3);
  s3 = cospi_16_64 * (x2 - x3);
  s6 = cospi_16_64 * (x6 + x7);
  s7 = cospi_16_64 * (-x6 + x7);
  s10 = cospi_16_64 * (x10 + x11);
  s11 = cospi_16_64 * (-x10 + x11);
  s14 = (-cospi_16_64) * (x14 + x15);
  s15 = cospi_16_64 * (x14 - x15);

  x2 = WRAPLOW(dct_const_round_shift(s2));
  x3 = WRAPLOW(dct_const_round_shift(s3));
  x6 = WRAPLOW(dct_const_round_shift(s6));
  x7 = WRAPLOW(dct_const_round_shift(s7));
  x10 = WRAPLOW(dct_const_round_shift(s10));
  x11 = WRAPLOW(dct_const_round_shift(s11));
  x14 = WRAPLOW(dct_const_round_shift(s14));
  x15 = WRAPLOW(dct_const_round_shift(s15));

  // Output permutation with sign flips, per the ADST flow graph.
  output[0] = WRAPLOW(x0);
  output[1] = WRAPLOW(-x8);
  output[2] = WRAPLOW(x12);
  output[3] = WRAPLOW(-x4);
  output[4] = WRAPLOW(x6);
  output[5] = WRAPLOW(x14);
  output[6] = WRAPLOW(x10);
  output[7] = WRAPLOW(x2);
  output[8] = WRAPLOW(x3);
  output[9] = WRAPLOW(x11);
  output[10] = WRAPLOW(x15);
  output[11] = WRAPLOW(x7);
  output[12] = WRAPLOW(x5);
  output[13] = WRAPLOW(-x13);
  output[14] = WRAPLOW(x9);
  output[15] = WRAPLOW(-x1);
}
553
// 16-point inverse DCT, the 1-D kernel used by the 16x16 inverse transform.
// Implemented as a seven-stage butterfly network; each rotation output is
// rounded (dct_const_round_shift) and wrapped to the tran_low_t range.
void idct16_c(const tran_low_t *input, tran_low_t *output) {
  tran_low_t step1[16], step2[16];
  tran_high_t temp1, temp2;

  // stage 1: bit-reversal-style reordering of the input coefficients.
  // The "n / 2" indices preserve the original 32-point-derived numbering.
  step1[0] = input[0 / 2];
  step1[1] = input[16 / 2];
  step1[2] = input[8 / 2];
  step1[3] = input[24 / 2];
  step1[4] = input[4 / 2];
  step1[5] = input[20 / 2];
  step1[6] = input[12 / 2];
  step1[7] = input[28 / 2];
  step1[8] = input[2 / 2];
  step1[9] = input[18 / 2];
  step1[10] = input[10 / 2];
  step1[11] = input[26 / 2];
  step1[12] = input[6 / 2];
  step1[13] = input[22 / 2];
  step1[14] = input[14 / 2];
  step1[15] = input[30 / 2];

  // stage 2: even half passes through; odd half gets its first rotations.
  step2[0] = step1[0];
  step2[1] = step1[1];
  step2[2] = step1[2];
  step2[3] = step1[3];
  step2[4] = step1[4];
  step2[5] = step1[5];
  step2[6] = step1[6];
  step2[7] = step1[7];

  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
  step2[8] = WRAPLOW(dct_const_round_shift(temp1));
  step2[15] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
  step2[14] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
  step2[13] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
  step2[12] = WRAPLOW(dct_const_round_shift(temp2));

  // stage 3: rotations for indices 4-7; butterflies for 8-15.
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[2];
  step1[3] = step2[3];

  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
  step1[4] = WRAPLOW(dct_const_round_shift(temp1));
  step1[7] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
  step1[6] = WRAPLOW(dct_const_round_shift(temp2));

  step1[8] = WRAPLOW(step2[8] + step2[9]);
  step1[9] = WRAPLOW(step2[8] - step2[9]);
  step1[10] = WRAPLOW(-step2[10] + step2[11]);
  step1[11] = WRAPLOW(step2[10] + step2[11]);
  step1[12] = WRAPLOW(step2[12] + step2[13]);
  step1[13] = WRAPLOW(step2[12] - step2[13]);
  step1[14] = WRAPLOW(-step2[14] + step2[15]);
  step1[15] = WRAPLOW(step2[14] + step2[15]);

  // stage 4: 4-point-IDCT-style rotations on 0-3; butterflies on 4-7;
  // cospi_8/24 rotations on the middle of the odd half.
  temp1 = (step1[0] + step1[1]) * cospi_16_64;
  temp2 = (step1[0] - step1[1]) * cospi_16_64;
  step2[0] = WRAPLOW(dct_const_round_shift(temp1));
  step2[1] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
  step2[2] = WRAPLOW(dct_const_round_shift(temp1));
  step2[3] = WRAPLOW(dct_const_round_shift(temp2));
  step2[4] = WRAPLOW(step1[4] + step1[5]);
  step2[5] = WRAPLOW(step1[4] - step1[5]);
  step2[6] = WRAPLOW(-step1[6] + step1[7]);
  step2[7] = WRAPLOW(step1[6] + step1[7]);

  step2[8] = step1[8];
  step2[15] = step1[15];
  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
  step2[14] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
  step2[11] = step1[11];
  step2[12] = step1[12];

  // stage 5: combine the even half; cospi_16_64 rotation on 5/6;
  // butterflies on the odd half.
  step1[0] = WRAPLOW(step2[0] + step2[3]);
  step1[1] = WRAPLOW(step2[1] + step2[2]);
  step1[2] = WRAPLOW(step2[1] - step2[2]);
  step1[3] = WRAPLOW(step2[0] - step2[3]);
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
  step1[7] = step2[7];

  step1[8] = WRAPLOW(step2[8] + step2[11]);
  step1[9] = WRAPLOW(step2[9] + step2[10]);
  step1[10] = WRAPLOW(step2[9] - step2[10]);
  step1[11] = WRAPLOW(step2[8] - step2[11]);
  step1[12] = WRAPLOW(-step2[12] + step2[15]);
  step1[13] = WRAPLOW(-step2[13] + step2[14]);
  step1[14] = WRAPLOW(step2[13] + step2[14]);
  step1[15] = WRAPLOW(step2[12] + step2[15]);

  // stage 6: 8-point combination of the even half; cospi_16_64 rotations
  // on the inner odd pairs.
  step2[0] = WRAPLOW(step1[0] + step1[7]);
  step2[1] = WRAPLOW(step1[1] + step1[6]);
  step2[2] = WRAPLOW(step1[2] + step1[5]);
  step2[3] = WRAPLOW(step1[3] + step1[4]);
  step2[4] = WRAPLOW(step1[3] - step1[4]);
  step2[5] = WRAPLOW(step1[2] - step1[5]);
  step2[6] = WRAPLOW(step1[1] - step1[6]);
  step2[7] = WRAPLOW(step1[0] - step1[7]);
  step2[8] = step1[8];
  step2[9] = step1[9];
  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
  temp2 = (step1[10] + step1[13]) * cospi_16_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
  temp2 = (step1[11] + step1[12]) * cospi_16_64;
  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
  step2[12] = WRAPLOW(dct_const_round_shift(temp2));
  step2[14] = step1[14];
  step2[15] = step1[15];

  // stage 7: final butterfly producing the 16 output samples.
  output[0] = WRAPLOW(step2[0] + step2[15]);
  output[1] = WRAPLOW(step2[1] + step2[14]);
  output[2] = WRAPLOW(step2[2] + step2[13]);
  output[3] = WRAPLOW(step2[3] + step2[12]);
  output[4] = WRAPLOW(step2[4] + step2[11]);
  output[5] = WRAPLOW(step2[5] + step2[10]);
  output[6] = WRAPLOW(step2[6] + step2[9]);
  output[7] = WRAPLOW(step2[7] + step2[8]);
  output[8] = WRAPLOW(step2[7] - step2[8]);
  output[9] = WRAPLOW(step2[6] - step2[9]);
  output[10] = WRAPLOW(step2[5] - step2[10]);
  output[11] = WRAPLOW(step2[4] - step2[11]);
  output[12] = WRAPLOW(step2[3] - step2[12]);
  output[13] = WRAPLOW(step2[2] - step2[13]);
  output[14] = WRAPLOW(step2[1] - step2[14]);
  output[15] = WRAPLOW(step2[0] - step2[15]);
}
718
// 2-D 16x16 inverse DCT (all 256 coefficients), row/column separable.
// The result is rounded, shifted down by 6, and added to dest with clipping.
void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest,
                             int stride) {
  int row, col, k;
  tran_low_t intermediate[16 * 16];  // row-pass output, row-major
  tran_low_t col_in[16], col_out[16];

  // Pass 1: 16-point IDCT over each row of coefficients.
  for (row = 0; row < 16; ++row) {
    idct16_c(&input[row * 16], &intermediate[row * 16]);
  }

  // Pass 2: 16-point IDCT down each column, then round/shift and accumulate
  // into the destination pixels.
  for (col = 0; col < 16; ++col) {
    for (k = 0; k < 16; ++k) col_in[k] = intermediate[k * 16 + col];
    idct16_c(col_in, col_out);
    for (k = 0; k < 16; ++k) {
      dest[k * stride + col] = clip_pixel_add(
          dest[k * stride + col], ROUND_POWER_OF_TWO(col_out[k], 6));
    }
  }
}
743
// 2-D 16x16 inverse DCT fast path when at most 38 coefficients are non-zero:
// they all lie in the upper-left 8x8 area, so only the first 8 rows need the
// row-pass transform.
void vpx_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest,
                            int stride) {
  int row, col, k;
  tran_low_t intermediate[16 * 16] = { 0 };  // rows 8..15 stay zero
  tran_low_t col_in[16], col_out[16];

  // Pass 1: only the first 8 rows carry non-zero coefficients.
  for (row = 0; row < 8; ++row) {
    idct16_c(&input[row * 16], &intermediate[row * 16]);
  }

  // Pass 2: full 16-point IDCT down every column, then round/shift and add.
  for (col = 0; col < 16; ++col) {
    for (k = 0; k < 16; ++k) col_in[k] = intermediate[k * 16 + col];
    idct16_c(col_in, col_out);
    for (k = 0; k < 16; ++k) {
      dest[k * stride + col] = clip_pixel_add(
          dest[k * stride + col], ROUND_POWER_OF_TWO(col_out[k], 6));
    }
  }
}
769
// 2-D 16x16 inverse DCT fast path when at most 10 coefficients are non-zero:
// they all lie in the upper-left 4x4 area, so only the first 4 rows need the
// row-pass transform.
void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest,
                            int stride) {
  int row, col, k;
  tran_low_t intermediate[16 * 16] = { 0 };  // rows 4..15 stay zero
  tran_low_t col_in[16], col_out[16];

  // Pass 1: only the first 4 rows carry non-zero coefficients.
  for (row = 0; row < 4; ++row) {
    idct16_c(&input[row * 16], &intermediate[row * 16]);
  }

  // Pass 2: full 16-point IDCT down every column, then round/shift and add.
  for (col = 0; col < 16; ++col) {
    for (k = 0; k < 16; ++k) col_in[k] = intermediate[k * 16 + col];
    idct16_c(col_in, col_out);
    for (k = 0; k < 16; ++k) {
      dest[k * stride + col] = clip_pixel_add(
          dest[k * stride + col], ROUND_POWER_OF_TWO(col_out[k], 6));
    }
  }
}
795
// 16x16 inverse DCT fast path for a DC-only coefficient block: the whole
// residual is one constant, added uniformly to the 16x16 destination.
void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  int r, c;
  tran_high_t dc;
  tran_low_t t = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));

  // Two cospi_16_64 rotations (row and column pass) applied to the DC term.
  t = WRAPLOW(dct_const_round_shift(t * cospi_16_64));
  dc = ROUND_POWER_OF_TWO(t, 6);

  for (r = 0; r < 16; ++r) {
    for (c = 0; c < 16; ++c) dest[c] = clip_pixel_add(dest[c], dc);
    dest += stride;
  }
}
808
idct32_c(const tran_low_t * input,tran_low_t * output)809 void idct32_c(const tran_low_t *input, tran_low_t *output) {
810 tran_low_t step1[32], step2[32];
811 tran_high_t temp1, temp2;
812
813 // stage 1
814 step1[0] = input[0];
815 step1[1] = input[16];
816 step1[2] = input[8];
817 step1[3] = input[24];
818 step1[4] = input[4];
819 step1[5] = input[20];
820 step1[6] = input[12];
821 step1[7] = input[28];
822 step1[8] = input[2];
823 step1[9] = input[18];
824 step1[10] = input[10];
825 step1[11] = input[26];
826 step1[12] = input[6];
827 step1[13] = input[22];
828 step1[14] = input[14];
829 step1[15] = input[30];
830
831 temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
832 temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
833 step1[16] = WRAPLOW(dct_const_round_shift(temp1));
834 step1[31] = WRAPLOW(dct_const_round_shift(temp2));
835
836 temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
837 temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
838 step1[17] = WRAPLOW(dct_const_round_shift(temp1));
839 step1[30] = WRAPLOW(dct_const_round_shift(temp2));
840
841 temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
842 temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
843 step1[18] = WRAPLOW(dct_const_round_shift(temp1));
844 step1[29] = WRAPLOW(dct_const_round_shift(temp2));
845
846 temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
847 temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
848 step1[19] = WRAPLOW(dct_const_round_shift(temp1));
849 step1[28] = WRAPLOW(dct_const_round_shift(temp2));
850
851 temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
852 temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
853 step1[20] = WRAPLOW(dct_const_round_shift(temp1));
854 step1[27] = WRAPLOW(dct_const_round_shift(temp2));
855
856 temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
857 temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
858 step1[21] = WRAPLOW(dct_const_round_shift(temp1));
859 step1[26] = WRAPLOW(dct_const_round_shift(temp2));
860
861 temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
862 temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
863 step1[22] = WRAPLOW(dct_const_round_shift(temp1));
864 step1[25] = WRAPLOW(dct_const_round_shift(temp2));
865
866 temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
867 temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
868 step1[23] = WRAPLOW(dct_const_round_shift(temp1));
869 step1[24] = WRAPLOW(dct_const_round_shift(temp2));
870
871 // stage 2
872 step2[0] = step1[0];
873 step2[1] = step1[1];
874 step2[2] = step1[2];
875 step2[3] = step1[3];
876 step2[4] = step1[4];
877 step2[5] = step1[5];
878 step2[6] = step1[6];
879 step2[7] = step1[7];
880
881 temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
882 temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
883 step2[8] = WRAPLOW(dct_const_round_shift(temp1));
884 step2[15] = WRAPLOW(dct_const_round_shift(temp2));
885
886 temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
887 temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
888 step2[9] = WRAPLOW(dct_const_round_shift(temp1));
889 step2[14] = WRAPLOW(dct_const_round_shift(temp2));
890
891 temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
892 temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
893 step2[10] = WRAPLOW(dct_const_round_shift(temp1));
894 step2[13] = WRAPLOW(dct_const_round_shift(temp2));
895
896 temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
897 temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
898 step2[11] = WRAPLOW(dct_const_round_shift(temp1));
899 step2[12] = WRAPLOW(dct_const_round_shift(temp2));
900
901 step2[16] = WRAPLOW(step1[16] + step1[17]);
902 step2[17] = WRAPLOW(step1[16] - step1[17]);
903 step2[18] = WRAPLOW(-step1[18] + step1[19]);
904 step2[19] = WRAPLOW(step1[18] + step1[19]);
905 step2[20] = WRAPLOW(step1[20] + step1[21]);
906 step2[21] = WRAPLOW(step1[20] - step1[21]);
907 step2[22] = WRAPLOW(-step1[22] + step1[23]);
908 step2[23] = WRAPLOW(step1[22] + step1[23]);
909 step2[24] = WRAPLOW(step1[24] + step1[25]);
910 step2[25] = WRAPLOW(step1[24] - step1[25]);
911 step2[26] = WRAPLOW(-step1[26] + step1[27]);
912 step2[27] = WRAPLOW(step1[26] + step1[27]);
913 step2[28] = WRAPLOW(step1[28] + step1[29]);
914 step2[29] = WRAPLOW(step1[28] - step1[29]);
915 step2[30] = WRAPLOW(-step1[30] + step1[31]);
916 step2[31] = WRAPLOW(step1[30] + step1[31]);
917
918 // stage 3
919 step1[0] = step2[0];
920 step1[1] = step2[1];
921 step1[2] = step2[2];
922 step1[3] = step2[3];
923
924 temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
925 temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
926 step1[4] = WRAPLOW(dct_const_round_shift(temp1));
927 step1[7] = WRAPLOW(dct_const_round_shift(temp2));
928 temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
929 temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
930 step1[5] = WRAPLOW(dct_const_round_shift(temp1));
931 step1[6] = WRAPLOW(dct_const_round_shift(temp2));
932
933 step1[8] = WRAPLOW(step2[8] + step2[9]);
934 step1[9] = WRAPLOW(step2[8] - step2[9]);
935 step1[10] = WRAPLOW(-step2[10] + step2[11]);
936 step1[11] = WRAPLOW(step2[10] + step2[11]);
937 step1[12] = WRAPLOW(step2[12] + step2[13]);
938 step1[13] = WRAPLOW(step2[12] - step2[13]);
939 step1[14] = WRAPLOW(-step2[14] + step2[15]);
940 step1[15] = WRAPLOW(step2[14] + step2[15]);
941
942 step1[16] = step2[16];
943 step1[31] = step2[31];
944 temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
945 temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
946 step1[17] = WRAPLOW(dct_const_round_shift(temp1));
947 step1[30] = WRAPLOW(dct_const_round_shift(temp2));
948 temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
949 temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
950 step1[18] = WRAPLOW(dct_const_round_shift(temp1));
951 step1[29] = WRAPLOW(dct_const_round_shift(temp2));
952 step1[19] = step2[19];
953 step1[20] = step2[20];
954 temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
955 temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
956 step1[21] = WRAPLOW(dct_const_round_shift(temp1));
957 step1[26] = WRAPLOW(dct_const_round_shift(temp2));
958 temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
959 temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
960 step1[22] = WRAPLOW(dct_const_round_shift(temp1));
961 step1[25] = WRAPLOW(dct_const_round_shift(temp2));
962 step1[23] = step2[23];
963 step1[24] = step2[24];
964 step1[27] = step2[27];
965 step1[28] = step2[28];
966
967 // stage 4
968 temp1 = (step1[0] + step1[1]) * cospi_16_64;
969 temp2 = (step1[0] - step1[1]) * cospi_16_64;
970 step2[0] = WRAPLOW(dct_const_round_shift(temp1));
971 step2[1] = WRAPLOW(dct_const_round_shift(temp2));
972 temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
973 temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
974 step2[2] = WRAPLOW(dct_const_round_shift(temp1));
975 step2[3] = WRAPLOW(dct_const_round_shift(temp2));
976 step2[4] = WRAPLOW(step1[4] + step1[5]);
977 step2[5] = WRAPLOW(step1[4] - step1[5]);
978 step2[6] = WRAPLOW(-step1[6] + step1[7]);
979 step2[7] = WRAPLOW(step1[6] + step1[7]);
980
981 step2[8] = step1[8];
982 step2[15] = step1[15];
983 temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
984 temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
985 step2[9] = WRAPLOW(dct_const_round_shift(temp1));
986 step2[14] = WRAPLOW(dct_const_round_shift(temp2));
987 temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
988 temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
989 step2[10] = WRAPLOW(dct_const_round_shift(temp1));
990 step2[13] = WRAPLOW(dct_const_round_shift(temp2));
991 step2[11] = step1[11];
992 step2[12] = step1[12];
993
994 step2[16] = WRAPLOW(step1[16] + step1[19]);
995 step2[17] = WRAPLOW(step1[17] + step1[18]);
996 step2[18] = WRAPLOW(step1[17] - step1[18]);
997 step2[19] = WRAPLOW(step1[16] - step1[19]);
998 step2[20] = WRAPLOW(-step1[20] + step1[23]);
999 step2[21] = WRAPLOW(-step1[21] + step1[22]);
1000 step2[22] = WRAPLOW(step1[21] + step1[22]);
1001 step2[23] = WRAPLOW(step1[20] + step1[23]);
1002
1003 step2[24] = WRAPLOW(step1[24] + step1[27]);
1004 step2[25] = WRAPLOW(step1[25] + step1[26]);
1005 step2[26] = WRAPLOW(step1[25] - step1[26]);
1006 step2[27] = WRAPLOW(step1[24] - step1[27]);
1007 step2[28] = WRAPLOW(-step1[28] + step1[31]);
1008 step2[29] = WRAPLOW(-step1[29] + step1[30]);
1009 step2[30] = WRAPLOW(step1[29] + step1[30]);
1010 step2[31] = WRAPLOW(step1[28] + step1[31]);
1011
1012 // stage 5
1013 step1[0] = WRAPLOW(step2[0] + step2[3]);
1014 step1[1] = WRAPLOW(step2[1] + step2[2]);
1015 step1[2] = WRAPLOW(step2[1] - step2[2]);
1016 step1[3] = WRAPLOW(step2[0] - step2[3]);
1017 step1[4] = step2[4];
1018 temp1 = (step2[6] - step2[5]) * cospi_16_64;
1019 temp2 = (step2[5] + step2[6]) * cospi_16_64;
1020 step1[5] = WRAPLOW(dct_const_round_shift(temp1));
1021 step1[6] = WRAPLOW(dct_const_round_shift(temp2));
1022 step1[7] = step2[7];
1023
1024 step1[8] = WRAPLOW(step2[8] + step2[11]);
1025 step1[9] = WRAPLOW(step2[9] + step2[10]);
1026 step1[10] = WRAPLOW(step2[9] - step2[10]);
1027 step1[11] = WRAPLOW(step2[8] - step2[11]);
1028 step1[12] = WRAPLOW(-step2[12] + step2[15]);
1029 step1[13] = WRAPLOW(-step2[13] + step2[14]);
1030 step1[14] = WRAPLOW(step2[13] + step2[14]);
1031 step1[15] = WRAPLOW(step2[12] + step2[15]);
1032
1033 step1[16] = step2[16];
1034 step1[17] = step2[17];
1035 temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
1036 temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
1037 step1[18] = WRAPLOW(dct_const_round_shift(temp1));
1038 step1[29] = WRAPLOW(dct_const_round_shift(temp2));
1039 temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
1040 temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
1041 step1[19] = WRAPLOW(dct_const_round_shift(temp1));
1042 step1[28] = WRAPLOW(dct_const_round_shift(temp2));
1043 temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
1044 temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
1045 step1[20] = WRAPLOW(dct_const_round_shift(temp1));
1046 step1[27] = WRAPLOW(dct_const_round_shift(temp2));
1047 temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
1048 temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
1049 step1[21] = WRAPLOW(dct_const_round_shift(temp1));
1050 step1[26] = WRAPLOW(dct_const_round_shift(temp2));
1051 step1[22] = step2[22];
1052 step1[23] = step2[23];
1053 step1[24] = step2[24];
1054 step1[25] = step2[25];
1055 step1[30] = step2[30];
1056 step1[31] = step2[31];
1057
1058 // stage 6
1059 step2[0] = WRAPLOW(step1[0] + step1[7]);
1060 step2[1] = WRAPLOW(step1[1] + step1[6]);
1061 step2[2] = WRAPLOW(step1[2] + step1[5]);
1062 step2[3] = WRAPLOW(step1[3] + step1[4]);
1063 step2[4] = WRAPLOW(step1[3] - step1[4]);
1064 step2[5] = WRAPLOW(step1[2] - step1[5]);
1065 step2[6] = WRAPLOW(step1[1] - step1[6]);
1066 step2[7] = WRAPLOW(step1[0] - step1[7]);
1067 step2[8] = step1[8];
1068 step2[9] = step1[9];
1069 temp1 = (-step1[10] + step1[13]) * cospi_16_64;
1070 temp2 = (step1[10] + step1[13]) * cospi_16_64;
1071 step2[10] = WRAPLOW(dct_const_round_shift(temp1));
1072 step2[13] = WRAPLOW(dct_const_round_shift(temp2));
1073 temp1 = (-step1[11] + step1[12]) * cospi_16_64;
1074 temp2 = (step1[11] + step1[12]) * cospi_16_64;
1075 step2[11] = WRAPLOW(dct_const_round_shift(temp1));
1076 step2[12] = WRAPLOW(dct_const_round_shift(temp2));
1077 step2[14] = step1[14];
1078 step2[15] = step1[15];
1079
1080 step2[16] = WRAPLOW(step1[16] + step1[23]);
1081 step2[17] = WRAPLOW(step1[17] + step1[22]);
1082 step2[18] = WRAPLOW(step1[18] + step1[21]);
1083 step2[19] = WRAPLOW(step1[19] + step1[20]);
1084 step2[20] = WRAPLOW(step1[19] - step1[20]);
1085 step2[21] = WRAPLOW(step1[18] - step1[21]);
1086 step2[22] = WRAPLOW(step1[17] - step1[22]);
1087 step2[23] = WRAPLOW(step1[16] - step1[23]);
1088
1089 step2[24] = WRAPLOW(-step1[24] + step1[31]);
1090 step2[25] = WRAPLOW(-step1[25] + step1[30]);
1091 step2[26] = WRAPLOW(-step1[26] + step1[29]);
1092 step2[27] = WRAPLOW(-step1[27] + step1[28]);
1093 step2[28] = WRAPLOW(step1[27] + step1[28]);
1094 step2[29] = WRAPLOW(step1[26] + step1[29]);
1095 step2[30] = WRAPLOW(step1[25] + step1[30]);
1096 step2[31] = WRAPLOW(step1[24] + step1[31]);
1097
1098 // stage 7
1099 step1[0] = WRAPLOW(step2[0] + step2[15]);
1100 step1[1] = WRAPLOW(step2[1] + step2[14]);
1101 step1[2] = WRAPLOW(step2[2] + step2[13]);
1102 step1[3] = WRAPLOW(step2[3] + step2[12]);
1103 step1[4] = WRAPLOW(step2[4] + step2[11]);
1104 step1[5] = WRAPLOW(step2[5] + step2[10]);
1105 step1[6] = WRAPLOW(step2[6] + step2[9]);
1106 step1[7] = WRAPLOW(step2[7] + step2[8]);
1107 step1[8] = WRAPLOW(step2[7] - step2[8]);
1108 step1[9] = WRAPLOW(step2[6] - step2[9]);
1109 step1[10] = WRAPLOW(step2[5] - step2[10]);
1110 step1[11] = WRAPLOW(step2[4] - step2[11]);
1111 step1[12] = WRAPLOW(step2[3] - step2[12]);
1112 step1[13] = WRAPLOW(step2[2] - step2[13]);
1113 step1[14] = WRAPLOW(step2[1] - step2[14]);
1114 step1[15] = WRAPLOW(step2[0] - step2[15]);
1115
1116 step1[16] = step2[16];
1117 step1[17] = step2[17];
1118 step1[18] = step2[18];
1119 step1[19] = step2[19];
1120 temp1 = (-step2[20] + step2[27]) * cospi_16_64;
1121 temp2 = (step2[20] + step2[27]) * cospi_16_64;
1122 step1[20] = WRAPLOW(dct_const_round_shift(temp1));
1123 step1[27] = WRAPLOW(dct_const_round_shift(temp2));
1124 temp1 = (-step2[21] + step2[26]) * cospi_16_64;
1125 temp2 = (step2[21] + step2[26]) * cospi_16_64;
1126 step1[21] = WRAPLOW(dct_const_round_shift(temp1));
1127 step1[26] = WRAPLOW(dct_const_round_shift(temp2));
1128 temp1 = (-step2[22] + step2[25]) * cospi_16_64;
1129 temp2 = (step2[22] + step2[25]) * cospi_16_64;
1130 step1[22] = WRAPLOW(dct_const_round_shift(temp1));
1131 step1[25] = WRAPLOW(dct_const_round_shift(temp2));
1132 temp1 = (-step2[23] + step2[24]) * cospi_16_64;
1133 temp2 = (step2[23] + step2[24]) * cospi_16_64;
1134 step1[23] = WRAPLOW(dct_const_round_shift(temp1));
1135 step1[24] = WRAPLOW(dct_const_round_shift(temp2));
1136 step1[28] = step2[28];
1137 step1[29] = step2[29];
1138 step1[30] = step2[30];
1139 step1[31] = step2[31];
1140
1141 // final stage
1142 output[0] = WRAPLOW(step1[0] + step1[31]);
1143 output[1] = WRAPLOW(step1[1] + step1[30]);
1144 output[2] = WRAPLOW(step1[2] + step1[29]);
1145 output[3] = WRAPLOW(step1[3] + step1[28]);
1146 output[4] = WRAPLOW(step1[4] + step1[27]);
1147 output[5] = WRAPLOW(step1[5] + step1[26]);
1148 output[6] = WRAPLOW(step1[6] + step1[25]);
1149 output[7] = WRAPLOW(step1[7] + step1[24]);
1150 output[8] = WRAPLOW(step1[8] + step1[23]);
1151 output[9] = WRAPLOW(step1[9] + step1[22]);
1152 output[10] = WRAPLOW(step1[10] + step1[21]);
1153 output[11] = WRAPLOW(step1[11] + step1[20]);
1154 output[12] = WRAPLOW(step1[12] + step1[19]);
1155 output[13] = WRAPLOW(step1[13] + step1[18]);
1156 output[14] = WRAPLOW(step1[14] + step1[17]);
1157 output[15] = WRAPLOW(step1[15] + step1[16]);
1158 output[16] = WRAPLOW(step1[15] - step1[16]);
1159 output[17] = WRAPLOW(step1[14] - step1[17]);
1160 output[18] = WRAPLOW(step1[13] - step1[18]);
1161 output[19] = WRAPLOW(step1[12] - step1[19]);
1162 output[20] = WRAPLOW(step1[11] - step1[20]);
1163 output[21] = WRAPLOW(step1[10] - step1[21]);
1164 output[22] = WRAPLOW(step1[9] - step1[22]);
1165 output[23] = WRAPLOW(step1[8] - step1[23]);
1166 output[24] = WRAPLOW(step1[7] - step1[24]);
1167 output[25] = WRAPLOW(step1[6] - step1[25]);
1168 output[26] = WRAPLOW(step1[5] - step1[26]);
1169 output[27] = WRAPLOW(step1[4] - step1[27]);
1170 output[28] = WRAPLOW(step1[3] - step1[28]);
1171 output[29] = WRAPLOW(step1[2] - step1[29]);
1172 output[30] = WRAPLOW(step1[1] - step1[30]);
1173 output[31] = WRAPLOW(step1[0] - step1[31]);
1174 }
1175
// Full 32x32 inverse DCT: run the 1-D idct32 over every row, then over every
// column, and accumulate the rounded result into `dest` with pixel clipping.
void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
                              int stride) {
  int r, c;
  tran_low_t buf[32 * 32];
  tran_low_t *row_out = buf;
  tran_low_t col_in[32], col_out[32];

  // Row pass: skip the 1-D transform for all-zero rows and just clear the
  // corresponding intermediate row.
  for (r = 0; r < 32; ++r) {
    // NOTE(review): the accumulator is int16_t while tran_low_t may be wider;
    // OR-ing truncates high bits. Preserved from the original — confirm
    // coefficients here always fit 16 bits before changing.
    int16_t any_nonzero = 0;
    for (c = 0; c < 32; ++c) any_nonzero |= input[c];

    if (any_nonzero) {
      idct32_c(input, row_out);
    } else {
      memset(row_out, 0, sizeof(tran_low_t) * 32);
    }
    input += 32;
    row_out += 32;
  }

  // Column pass: transform each column of the intermediate buffer, round by
  // a shift of 6, then add to the destination with clipping.
  for (c = 0; c < 32; ++c) {
    for (r = 0; r < 32; ++r) col_in[r] = buf[r * 32 + c];
    idct32_c(col_in, col_out);
    for (r = 0; r < 32; ++r) {
      dest[r * stride + c] = clip_pixel_add(dest[r * stride + c],
                                            ROUND_POWER_OF_TWO(col_out[r], 6));
    }
  }
}
1206
// Partial 32x32 inverse DCT for blocks where only the upper-left 16x16 region
// holds non-zero coefficients: the row pass runs on the first 16 rows only,
// the remaining intermediate rows stay zero from the array initializer.
void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest,
                             int stride) {
  int r, c;
  tran_low_t buf[32 * 32] = { 0 };
  tran_low_t *row_out = buf;
  tran_low_t col_in[32], col_out[32];

  // Row pass over the 16 potentially non-zero rows.
  for (r = 0; r < 16; ++r) {
    idct32_c(input, row_out);
    input += 32;
    row_out += 32;
  }

  // Full-height column pass, then round (shift 6), add, and clip.
  for (c = 0; c < 32; ++c) {
    for (r = 0; r < 32; ++r) col_in[r] = buf[r * 32 + c];
    idct32_c(col_in, col_out);
    for (r = 0; r < 32; ++r) {
      dest[r * stride + c] = clip_pixel_add(dest[r * stride + c],
                                            ROUND_POWER_OF_TWO(col_out[r], 6));
    }
  }
}
1232
// Partial 32x32 inverse DCT for blocks where only the upper-left 8x8 region
// holds non-zero coefficients: the row pass runs on the first 8 rows only,
// the remaining intermediate rows stay zero from the array initializer.
void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest,
                            int stride) {
  int r, c;
  tran_low_t buf[32 * 32] = { 0 };
  tran_low_t *row_out = buf;
  tran_low_t col_in[32], col_out[32];

  // Row pass over the 8 potentially non-zero rows.
  for (r = 0; r < 8; ++r) {
    idct32_c(input, row_out);
    input += 32;
    row_out += 32;
  }

  // Full-height column pass, then round (shift 6), add, and clip.
  for (c = 0; c < 32; ++c) {
    for (r = 0; r < 32; ++r) col_in[r] = buf[r * 32 + c];
    idct32_c(col_in, col_out);
    for (r = 0; r < 32; ++r) {
      dest[r * stride + c] = clip_pixel_add(dest[r * stride + c],
                                            ROUND_POWER_OF_TWO(col_out[r], 6));
    }
  }
}
1258
// DC-only 32x32 inverse DCT: the single DC coefficient is scaled through the
// two cospi_16_64 rotations, rounded (shift 6), and added to every pixel.
void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  int r, c;
  tran_high_t dc;
  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));

  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
  dc = ROUND_POWER_OF_TWO(out, 6);

  for (r = 0; r < 32; ++r) {
    for (c = 0; c < 32; ++c) dest[c] = clip_pixel_add(dest[c], dc);
    dest += stride;
  }
}
1272
1273 #if CONFIG_VP9_HIGHBITDEPTH
1274
1275 // 12 signal input bits + 7 2D forward transform amplify bits + 5 1D inverse
1276 // transform amplify bits + 1 bit for contingency in rounding and quantizing
1277 #define HIGHBD_VALID_TXFM_MAGNITUDE_RANGE (1 << 25)
1278
detect_invalid_highbd_input(const tran_low_t * input,int size)1279 static INLINE int detect_invalid_highbd_input(const tran_low_t *input,
1280 int size) {
1281 int i;
1282 for (i = 0; i < size; ++i)
1283 if (abs(input[i]) >= HIGHBD_VALID_TXFM_MAGNITUDE_RANGE) return 1;
1284 return 0;
1285 }
1286
// 4x4 inverse Walsh-Hadamard transform (high-bitdepth): row pass into a
// temporary buffer, column pass accumulated into `dest` with clipping for
// bit depth `bd`. The lifting-step order below is bit-exact-critical.
void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint16_t *dest,
                                 int stride, int bd) {
  /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
     0.5 shifts per pixel. */
  int i;
  tran_low_t output[16];  // intermediate result of the row pass
  tran_high_t a1, b1, c1, d1, e1;
  const tran_low_t *ip = input;
  tran_low_t *op = output;

  // Row pass: 1-D inverse WHT on each of the 4 rows via a lifting network.
  for (i = 0; i < 4; i++) {
    // Undo the encoder's uniform scaling before the lifting steps.
    a1 = ip[0] >> UNIT_QUANT_SHIFT;
    c1 = ip[1] >> UNIT_QUANT_SHIFT;
    d1 = ip[2] >> UNIT_QUANT_SHIFT;
    b1 = ip[3] >> UNIT_QUANT_SHIFT;
    a1 += c1;
    d1 -= b1;
    e1 = (a1 - d1) >> 1;  // shared lifting term used by both b1 and c1
    b1 = e1 - b1;
    c1 = e1 - c1;
    a1 -= b1;
    d1 += c1;
    op[0] = HIGHBD_WRAPLOW(a1, bd);
    op[1] = HIGHBD_WRAPLOW(b1, bd);
    op[2] = HIGHBD_WRAPLOW(c1, bd);
    op[3] = HIGHBD_WRAPLOW(d1, bd);
    ip += 4;
    op += 4;
  }

  // Column pass: same lifting network down each column; results are added
  // into the destination and clipped for bit depth bd.
  ip = output;
  for (i = 0; i < 4; i++) {
    a1 = ip[4 * 0];
    c1 = ip[4 * 1];
    d1 = ip[4 * 2];
    b1 = ip[4 * 3];
    a1 += c1;
    d1 -= b1;
    e1 = (a1 - d1) >> 1;
    b1 = e1 - b1;
    c1 = e1 - c1;
    a1 -= b1;
    d1 += c1;
    dest[stride * 0] =
        highbd_clip_pixel_add(dest[stride * 0], HIGHBD_WRAPLOW(a1, bd), bd);
    dest[stride * 1] =
        highbd_clip_pixel_add(dest[stride * 1], HIGHBD_WRAPLOW(b1, bd), bd);
    dest[stride * 2] =
        highbd_clip_pixel_add(dest[stride * 2], HIGHBD_WRAPLOW(c1, bd), bd);
    dest[stride * 3] =
        highbd_clip_pixel_add(dest[stride * 3], HIGHBD_WRAPLOW(d1, bd), bd);

    ip++;
    dest++;
  }
}
1343
// DC-only 4x4 inverse Walsh-Hadamard transform (high-bitdepth): only in[0]
// is non-zero, so the row pass produces a single row [a, e, e, e] and the
// column pass applies the same split to each column.
void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint16_t *dest,
                                int stride, int bd) {
  int col;
  tran_high_t dc, half;
  tran_low_t row[4];
  (void)bd;  // bd may go unused depending on HIGHBD_WRAPLOW's configuration

  // 1-D inverse WHT of the DC-only row.
  dc = in[0] >> UNIT_QUANT_SHIFT;
  half = dc >> 1;
  dc -= half;
  row[0] = HIGHBD_WRAPLOW(dc, bd);
  row[1] = row[2] = row[3] = HIGHBD_WRAPLOW(half, bd);

  // Column pass: every column sees the same DC-only profile.
  for (col = 0; col < 4; ++col) {
    half = row[col] >> 1;
    dc = row[col] - half;
    dest[stride * 0 + col] =
        highbd_clip_pixel_add(dest[stride * 0 + col], dc, bd);
    dest[stride * 1 + col] =
        highbd_clip_pixel_add(dest[stride * 1 + col], half, bd);
    dest[stride * 2 + col] =
        highbd_clip_pixel_add(dest[stride * 2 + col], half, bd);
    dest[stride * 3 + col] =
        highbd_clip_pixel_add(dest[stride * 3 + col], half, bd);
  }
}
1371
// 4-point inverse ADST (high-bitdepth). The s-register reuse below is
// order-critical; do not reorder statements.
void vpx_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
  tran_low_t x0 = input[0];
  tran_low_t x1 = input[1];
  tran_low_t x2 = input[2];
  tran_low_t x3 = input[3];
  (void)bd;  // bd may go unused depending on HIGHBD_WRAPLOW's configuration

  // Out-of-range coefficients: zero the output (assert in checking builds).
  if (detect_invalid_highbd_input(input, 4)) {
#if CONFIG_COEFFICIENT_RANGE_CHECKING
    assert(0 && "invalid highbd txfm input");
#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
    memset(output, 0, sizeof(*output) * 4);
    return;
  }

  // Fast path: all-zero input yields all-zero output.
  if (!(x0 | x1 | x2 | x3)) {
    memset(output, 0, 4 * sizeof(*output));
    return;
  }

  s0 = sinpi_1_9 * x0;
  s1 = sinpi_2_9 * x0;
  s2 = sinpi_3_9 * x1;
  s3 = sinpi_4_9 * x2;
  s4 = sinpi_1_9 * x2;
  s5 = sinpi_2_9 * x3;
  s6 = sinpi_4_9 * x3;
  s7 = (tran_high_t)HIGHBD_WRAPLOW(x0 - x2 + x3, bd);

  s0 = s0 + s3 + s5;
  s1 = s1 - s4 - s6;
  s3 = s2;  // preserve sinpi_3_9 * x1 before s2 is overwritten below
  s2 = sinpi_3_9 * s7;

  // 1-D transform scaling factor is sqrt(2).
  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
  // + 1b (addition) = 29b.
  // Hence the output bit depth is 15b.
  output[0] = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s3), bd);
  output[1] = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s3), bd);
  output[2] = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd);
  output[3] = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s1 - s3), bd);
}
1416
// 4-point inverse DCT (high-bitdepth): stage 1 rotates the even (0,2) and
// odd (1,3) input pairs; stage 2 recombines them into the output butterfly.
void vpx_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_low_t step[4];
  tran_high_t even0, even1, odd0, odd1;
  (void)bd;

  // Out-of-range coefficients: zero the output (assert in checking builds).
  if (detect_invalid_highbd_input(input, 4)) {
#if CONFIG_COEFFICIENT_RANGE_CHECKING
    assert(0 && "invalid highbd txfm input");
#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
    memset(output, 0, sizeof(*output) * 4);
    return;
  }

  // Stage 1: even pair through the cospi_16 rotation, odd pair through the
  // cospi_24/cospi_8 rotation.
  even0 = (input[0] + input[2]) * cospi_16_64;
  even1 = (input[0] - input[2]) * cospi_16_64;
  step[0] = HIGHBD_WRAPLOW(dct_const_round_shift(even0), bd);
  step[1] = HIGHBD_WRAPLOW(dct_const_round_shift(even1), bd);
  odd0 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
  odd1 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
  step[2] = HIGHBD_WRAPLOW(dct_const_round_shift(odd0), bd);
  step[3] = HIGHBD_WRAPLOW(dct_const_round_shift(odd1), bd);

  // Stage 2: output butterfly.
  output[0] = HIGHBD_WRAPLOW(step[0] + step[3], bd);
  output[1] = HIGHBD_WRAPLOW(step[1] + step[2], bd);
  output[2] = HIGHBD_WRAPLOW(step[1] - step[2], bd);
  output[3] = HIGHBD_WRAPLOW(step[0] - step[3], bd);
}
1446
// Full 4x4 inverse DCT (high-bitdepth): row pass into a temporary buffer,
// column pass with final rounding (shift 4) added into `dest` with clipping.
void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint16_t *dest,
                                 int stride, int bd) {
  int r, c;
  tran_low_t buf[4 * 4];
  tran_low_t *row_out = buf;
  tran_low_t col_in[4], col_out[4];

  // Row pass.
  for (r = 0; r < 4; ++r) {
    vpx_highbd_idct4_c(input, row_out, bd);
    input += 4;
    row_out += 4;
  }

  // Column pass, then round, add, and clip for bit depth bd.
  for (c = 0; c < 4; ++c) {
    for (r = 0; r < 4; ++r) col_in[r] = buf[r * 4 + c];
    vpx_highbd_idct4_c(col_in, col_out, bd);
    for (r = 0; r < 4; ++r) {
      dest[r * stride + c] = highbd_clip_pixel_add(
          dest[r * stride + c], ROUND_POWER_OF_TWO(col_out[r], 4), bd);
    }
  }
}
1471
// DC-only 4x4 inverse DCT (high-bitdepth): scale the DC coefficient through
// two cospi_16_64 rotations, round (shift 4), and add it to every pixel.
void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint16_t *dest,
                                int stride, int bd) {
  int r, c;
  tran_high_t dc;
  tran_low_t out =
      HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);

  out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
  dc = ROUND_POWER_OF_TWO(out, 4);

  for (r = 0; r < 4; ++r) {
    for (c = 0; c < 4; ++c) dest[c] = highbd_clip_pixel_add(dest[c], dc, bd);
    dest += stride;
  }
}
1490
// 8-point inverse ADST (high-bitdepth). Note the permuted input reads: the
// ADST basis is applied to a fixed reordering of the coefficients. The
// three-stage butterfly network below is bit-exact-critical.
void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
  // Input permutation fixed by the transform definition.
  tran_low_t x0 = input[7];
  tran_low_t x1 = input[0];
  tran_low_t x2 = input[5];
  tran_low_t x3 = input[2];
  tran_low_t x4 = input[3];
  tran_low_t x5 = input[4];
  tran_low_t x6 = input[1];
  tran_low_t x7 = input[6];
  (void)bd;  // bd may go unused depending on HIGHBD_WRAPLOW's configuration

  // Out-of-range coefficients: zero the output (assert in checking builds).
  if (detect_invalid_highbd_input(input, 8)) {
#if CONFIG_COEFFICIENT_RANGE_CHECKING
    assert(0 && "invalid highbd txfm input");
#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
    memset(output, 0, sizeof(*output) * 8);
    return;
  }

  // Fast path: all-zero input yields all-zero output.
  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
    memset(output, 0, 8 * sizeof(*output));
    return;
  }

  // stage 1: rotations by the odd cospi constants.
  s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
  s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
  s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
  s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
  s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
  s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
  s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
  s7 = cospi_6_64 * x6 - cospi_26_64 * x7;

  x0 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s4), bd);
  x1 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s5), bd);
  x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 + s6), bd);
  x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 + s7), bd);
  x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 - s4), bd);
  x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 - s5), bd);
  x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 - s6), bd);
  x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 - s7), bd);

  // stage 2: first half passes through, second half rotated by cospi_8/24.
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
  s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
  s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
  s7 = cospi_8_64 * x6 + cospi_24_64 * x7;

  x0 = HIGHBD_WRAPLOW(s0 + s2, bd);
  x1 = HIGHBD_WRAPLOW(s1 + s3, bd);
  x2 = HIGHBD_WRAPLOW(s0 - s2, bd);
  x3 = HIGHBD_WRAPLOW(s1 - s3, bd);
  x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s6), bd);
  x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s7), bd);
  x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s6), bd);
  x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s7), bd);

  // stage 3: final cospi_16 rotations on the (2,3) and (6,7) pairs.
  s2 = cospi_16_64 * (x2 + x3);
  s3 = cospi_16_64 * (x2 - x3);
  s6 = cospi_16_64 * (x6 + x7);
  s7 = cospi_16_64 * (x6 - x7);

  x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd);
  x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3), bd);
  x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6), bd);
  x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7), bd);

  // Output permutation with alternating sign flips.
  output[0] = HIGHBD_WRAPLOW(x0, bd);
  output[1] = HIGHBD_WRAPLOW(-x4, bd);
  output[2] = HIGHBD_WRAPLOW(x6, bd);
  output[3] = HIGHBD_WRAPLOW(-x2, bd);
  output[4] = HIGHBD_WRAPLOW(x3, bd);
  output[5] = HIGHBD_WRAPLOW(-x7, bd);
  output[6] = HIGHBD_WRAPLOW(x5, bd);
  output[7] = HIGHBD_WRAPLOW(-x1, bd);
}
1574
// 8-point inverse DCT (high-bitdepth). The even half reuses the 4-point
// inverse DCT in place; the odd half is computed with explicit butterflies.
void vpx_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_low_t step1[8], step2[8];
  tran_high_t temp1, temp2;

  // Out-of-range coefficients: zero the output (assert in checking builds).
  if (detect_invalid_highbd_input(input, 8)) {
#if CONFIG_COEFFICIENT_RANGE_CHECKING
    assert(0 && "invalid highbd txfm input");
#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
    memset(output, 0, sizeof(*output) * 8);
    return;
  }

  // stage 1: gather even-indexed inputs for the 4-point transform and
  // rotate the odd-indexed inputs by cospi_28/4 and cospi_12/20.
  step1[0] = input[0];
  step1[2] = input[4];
  step1[1] = input[2];
  step1[3] = input[6];
  temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
  temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
  step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
  temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
  step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);

  // stage 2 & stage 3 - even half (4-point inverse DCT, in place on step1)
  vpx_highbd_idct4_c(step1, step1, bd);

  // stage 2 - odd half: add/sub butterflies
  step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
  step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
  step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
  step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);

  // stage 3 - odd half: cospi_16 rotation on the middle pair
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  step1[7] = step2[7];

  // stage 4: combine even and odd halves into the output butterfly
  output[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
  output[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
  output[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
  output[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
  output[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
  output[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
  output[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
  output[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
}
1628
// Full 8x8 inverse DCT (high-bitdepth): row pass into a temporary buffer,
// column pass with final rounding (shift 5) added into `dest` with clipping.
void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint16_t *dest,
                                 int stride, int bd) {
  int r, c;
  tran_low_t buf[8 * 8];
  tran_low_t *row_out = buf;
  tran_low_t col_in[8], col_out[8];

  // Row pass.
  for (r = 0; r < 8; ++r) {
    vpx_highbd_idct8_c(input, row_out, bd);
    input += 8;
    row_out += 8;
  }

  // Column pass, then round, add, and clip for bit depth bd.
  for (c = 0; c < 8; ++c) {
    for (r = 0; r < 8; ++r) col_in[r] = buf[r * 8 + c];
    vpx_highbd_idct8_c(col_in, col_out, bd);
    for (r = 0; r < 8; ++r) {
      dest[r * stride + c] = highbd_clip_pixel_add(
          dest[r * stride + c], ROUND_POWER_OF_TWO(col_out[r], 5), bd);
    }
  }
}
1653
// Partial 8x8 inverse DCT (high-bitdepth) for blocks where only the first 4
// rows carry non-zero coefficients; remaining intermediate rows stay zero
// from the array initializer.
void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint16_t *dest,
                                 int stride, int bd) {
  int r, c;
  tran_low_t buf[8 * 8] = { 0 };
  tran_low_t *row_out = buf;
  tran_low_t col_in[8], col_out[8];

  // Row pass over the 4 potentially non-zero rows.
  for (r = 0; r < 4; ++r) {
    vpx_highbd_idct8_c(input, row_out, bd);
    input += 8;
    row_out += 8;
  }

  // Full-height column pass, then round (shift 5), add, and clip.
  for (c = 0; c < 8; ++c) {
    for (r = 0; r < 8; ++r) col_in[r] = buf[r * 8 + c];
    vpx_highbd_idct8_c(col_in, col_out, bd);
    for (r = 0; r < 8; ++r) {
      dest[r * stride + c] = highbd_clip_pixel_add(
          dest[r * stride + c], ROUND_POWER_OF_TWO(col_out[r], 5), bd);
    }
  }
}
1679
// DC-only 8x8 inverse DCT (high-bitdepth): scale the DC coefficient through
// two cospi_16_64 rotations, round (shift 5), and add it to every pixel.
void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint16_t *dest,
                                int stride, int bd) {
  int r, c;
  tran_high_t dc;
  tran_low_t out =
      HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);

  out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
  dc = ROUND_POWER_OF_TWO(out, 5);
  for (r = 0; r < 8; ++r) {
    for (c = 0; c < 8; ++c) dest[c] = highbd_clip_pixel_add(dest[c], dc, bd);
    dest += stride;
  }
}
1694
// 16-point inverse ADST for the high-bitdepth path. Reads 16 coefficients
// from |input| and writes the 16 inverse-transformed values to |output|.
// |bd| is the bit depth consumed by HIGHBD_WRAPLOW; the explicit (void)bd
// below silences the unused-parameter warning in configurations where that
// macro does not reference it.
void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
  tran_high_t s9, s10, s11, s12, s13, s14, s15;
  // Load coefficients in the interleaved order the ADST butterflies expect:
  // even-numbered x's take the odd input indices in descending order
  // (15, 13, ..., 1); odd-numbered x's take the even indices in ascending
  // order (0, 2, ..., 14).
  tran_low_t x0 = input[15];
  tran_low_t x1 = input[0];
  tran_low_t x2 = input[13];
  tran_low_t x3 = input[2];
  tran_low_t x4 = input[11];
  tran_low_t x5 = input[4];
  tran_low_t x6 = input[9];
  tran_low_t x7 = input[6];
  tran_low_t x8 = input[7];
  tran_low_t x9 = input[8];
  tran_low_t x10 = input[5];
  tran_low_t x11 = input[10];
  tran_low_t x12 = input[3];
  tran_low_t x13 = input[12];
  tran_low_t x14 = input[1];
  tran_low_t x15 = input[14];
  (void)bd;

  // Out-of-range coefficients: zero the output (and assert in
  // range-checking builds) rather than risk overflow further down.
  if (detect_invalid_highbd_input(input, 16)) {
#if CONFIG_COEFFICIENT_RANGE_CHECKING
    assert(0 && "invalid highbd txfm input");
#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
    memset(output, 0, sizeof(*output) * 16);
    return;
  }

  // All-zero input transforms to all zeros; skip the arithmetic entirely.
  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 |
        x13 | x14 | x15)) {
    memset(output, 0, 16 * sizeof(*output));
    return;
  }

  // stage 1: rotate each (x2k, x2k+1) pair by its per-pair cospi constants;
  // products are held at tran_high_t precision until rounded below.
  s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
  s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
  s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
  s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
  s15 = x14 * cospi_3_64 - x15 * cospi_29_64;

  // Cross-combine the two halves; dct_const_round_shift rounds the widened
  // products back to coefficient precision, HIGHBD_WRAPLOW wraps to |bd|.
  x0 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s8), bd);
  x1 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s9), bd);
  x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 + s10), bd);
  x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 + s11), bd);
  x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s12), bd);
  x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s13), bd);
  x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6 + s14), bd);
  x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7 + s15), bd);
  x8 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 - s8), bd);
  x9 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 - s9), bd);
  x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 - s10), bd);
  x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 - s11), bd);
  x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s12), bd);
  x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s13), bd);
  x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s6 - s14), bd);
  x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s7 - s15), bd);

  // stage 2: the first eight terms pass through unscaled, so their
  // butterflies below need no rounding; only x8..x15 are rotated here.
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4;
  s5 = x5;
  s6 = x6;
  s7 = x7;
  s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
  s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
  s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
  s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
  s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
  s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
  s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
  s15 = x14 * cospi_20_64 + x15 * cospi_12_64;

  x0 = HIGHBD_WRAPLOW(s0 + s4, bd);
  x1 = HIGHBD_WRAPLOW(s1 + s5, bd);
  x2 = HIGHBD_WRAPLOW(s2 + s6, bd);
  x3 = HIGHBD_WRAPLOW(s3 + s7, bd);
  x4 = HIGHBD_WRAPLOW(s0 - s4, bd);
  x5 = HIGHBD_WRAPLOW(s1 - s5, bd);
  x6 = HIGHBD_WRAPLOW(s2 - s6, bd);
  x7 = HIGHBD_WRAPLOW(s3 - s7, bd);
  x8 = HIGHBD_WRAPLOW(dct_const_round_shift(s8 + s12), bd);
  x9 = HIGHBD_WRAPLOW(dct_const_round_shift(s9 + s13), bd);
  x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s10 + s14), bd);
  x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s11 + s15), bd);
  x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s8 - s12), bd);
  x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s9 - s13), bd);
  x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s10 - s14), bd);
  x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s11 - s15), bd);

  // stage 3: rotate the 4..7 and 12..15 quads; the rest pass through.
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
  s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
  s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
  s8 = x8;
  s9 = x9;
  s10 = x10;
  s11 = x11;
  s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
  s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
  s15 = x14 * cospi_8_64 + x15 * cospi_24_64;

  x0 = HIGHBD_WRAPLOW(s0 + s2, bd);
  x1 = HIGHBD_WRAPLOW(s1 + s3, bd);
  x2 = HIGHBD_WRAPLOW(s0 - s2, bd);
  x3 = HIGHBD_WRAPLOW(s1 - s3, bd);
  x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s6), bd);
  x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s7), bd);
  x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s6), bd);
  x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s7), bd);
  x8 = HIGHBD_WRAPLOW(s8 + s10, bd);
  x9 = HIGHBD_WRAPLOW(s9 + s11, bd);
  x10 = HIGHBD_WRAPLOW(s8 - s10, bd);
  x11 = HIGHBD_WRAPLOW(s9 - s11, bd);
  x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s12 + s14), bd);
  x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s13 + s15), bd);
  x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s12 - s14), bd);
  x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s13 - s15), bd);

  // stage 4: final +/-pi/4 rotations (all by cospi_16_64) on the four
  // remaining pairs; the other eight values are already final.
  s2 = (-cospi_16_64) * (x2 + x3);
  s3 = cospi_16_64 * (x2 - x3);
  s6 = cospi_16_64 * (x6 + x7);
  s7 = cospi_16_64 * (-x6 + x7);
  s10 = cospi_16_64 * (x10 + x11);
  s11 = cospi_16_64 * (-x10 + x11);
  s14 = (-cospi_16_64) * (x14 + x15);
  s15 = cospi_16_64 * (x14 - x15);

  x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd);
  x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3), bd);
  x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6), bd);
  x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7), bd);
  x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s10), bd);
  x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s11), bd);
  x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s14), bd);
  x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s15), bd);

  // Write the results in the ADST output permutation, with sign flips on
  // the terms stored to outputs 1, 3, 13 and 15.
  output[0] = HIGHBD_WRAPLOW(x0, bd);
  output[1] = HIGHBD_WRAPLOW(-x8, bd);
  output[2] = HIGHBD_WRAPLOW(x12, bd);
  output[3] = HIGHBD_WRAPLOW(-x4, bd);
  output[4] = HIGHBD_WRAPLOW(x6, bd);
  output[5] = HIGHBD_WRAPLOW(x14, bd);
  output[6] = HIGHBD_WRAPLOW(x10, bd);
  output[7] = HIGHBD_WRAPLOW(x2, bd);
  output[8] = HIGHBD_WRAPLOW(x3, bd);
  output[9] = HIGHBD_WRAPLOW(x11, bd);
  output[10] = HIGHBD_WRAPLOW(x15, bd);
  output[11] = HIGHBD_WRAPLOW(x7, bd);
  output[12] = HIGHBD_WRAPLOW(x5, bd);
  output[13] = HIGHBD_WRAPLOW(-x13, bd);
  output[14] = HIGHBD_WRAPLOW(x9, bd);
  output[15] = HIGHBD_WRAPLOW(-x1, bd);
}
1871
// 16-point inverse DCT for the high-bitdepth path. Reads 16 coefficients
// from |input| and writes the inverse-transformed values to |output| via a
// 7-stage butterfly network. Each rotation is a pair of cospi multiplies at
// tran_high_t precision followed by dct_const_round_shift and a wrap to the
// bit depth; (void)bd covers configurations where HIGHBD_WRAPLOW ignores
// |bd|.
void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_low_t step1[16], step2[16];
  tran_high_t temp1, temp2;
  (void)bd;

  // Out-of-range coefficients: zero the output (and assert in
  // range-checking builds) instead of risking overflow below.
  if (detect_invalid_highbd_input(input, 16)) {
#if CONFIG_COEFFICIENT_RANGE_CHECKING
    assert(0 && "invalid highbd txfm input");
#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
    memset(output, 0, sizeof(*output) * 16);
    return;
  }

  // stage 1
  // Load the coefficients in bit-reversed order (0, 8, 4, 12, 2, 10, 6, 14,
  // 1, 9, 5, 13, 3, 11, 7, 15). The indices are spelled n/2 so the table
  // visibly mirrors the even-index load sequence of the 32-point transform.
  step1[0] = input[0 / 2];
  step1[1] = input[16 / 2];
  step1[2] = input[8 / 2];
  step1[3] = input[24 / 2];
  step1[4] = input[4 / 2];
  step1[5] = input[20 / 2];
  step1[6] = input[12 / 2];
  step1[7] = input[28 / 2];
  step1[8] = input[2 / 2];
  step1[9] = input[18 / 2];
  step1[10] = input[10 / 2];
  step1[11] = input[26 / 2];
  step1[12] = input[6 / 2];
  step1[13] = input[22 / 2];
  step1[14] = input[14 / 2];
  step1[15] = input[30 / 2];

  // stage 2: rotate the odd half (8..15); the even half passes through.
  step2[0] = step1[0];
  step2[1] = step1[1];
  step2[2] = step1[2];
  step2[3] = step1[3];
  step2[4] = step1[4];
  step2[5] = step1[5];
  step2[6] = step1[6];
  step2[7] = step1[7];

  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
  step2[8] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[15] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);

  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
  step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);

  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
  step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);

  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
  step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);

  // stage 3: rotate 4..7; add/sub butterflies on 8..15.
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[2];
  step1[3] = step2[3];

  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
  step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
  step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);

  step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd);
  step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd);
  step1[10] = HIGHBD_WRAPLOW(-step2[10] + step2[11], bd);
  step1[11] = HIGHBD_WRAPLOW(step2[10] + step2[11], bd);
  step1[12] = HIGHBD_WRAPLOW(step2[12] + step2[13], bd);
  step1[13] = HIGHBD_WRAPLOW(step2[12] - step2[13], bd);
  step1[14] = HIGHBD_WRAPLOW(-step2[14] + step2[15], bd);
  step1[15] = HIGHBD_WRAPLOW(step2[14] + step2[15], bd);

  // stage 4: rotate 0..3 (DC pair and the 2/3 pair); butterflies on 4..7;
  // rotate the 9/14 and 10/13 pairs of the odd half.
  temp1 = (step1[0] + step1[1]) * cospi_16_64;
  temp2 = (step1[0] - step1[1]) * cospi_16_64;
  step2[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
  step2[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
  step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
  step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
  step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);

  step2[8] = step1[8];
  step2[15] = step1[15];
  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
  step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
  step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  step2[11] = step1[11];
  step2[12] = step1[12];

  // stage 5: butterflies on 0..3 and 8..15; cospi_16_64 rotation on 5/6.
  step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[3], bd);
  step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[2], bd);
  step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd);
  step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd);
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  step1[7] = step2[7];

  step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd);
  step1[9] = HIGHBD_WRAPLOW(step2[9] + step2[10], bd);
  step1[10] = HIGHBD_WRAPLOW(step2[9] - step2[10], bd);
  step1[11] = HIGHBD_WRAPLOW(step2[8] - step2[11], bd);
  step1[12] = HIGHBD_WRAPLOW(-step2[12] + step2[15], bd);
  step1[13] = HIGHBD_WRAPLOW(-step2[13] + step2[14], bd);
  step1[14] = HIGHBD_WRAPLOW(step2[13] + step2[14], bd);
  step1[15] = HIGHBD_WRAPLOW(step2[12] + step2[15], bd);

  // stage 6: merge the even eight; cospi_16_64 rotations on 10/13 and 11/12.
  step2[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
  step2[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
  step2[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
  step2[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
  step2[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
  step2[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
  step2[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
  step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
  step2[8] = step1[8];
  step2[9] = step1[9];
  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
  temp2 = (step1[10] + step1[13]) * cospi_16_64;
  step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
  temp2 = (step1[11] + step1[12]) * cospi_16_64;
  step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  step2[14] = step1[14];
  step2[15] = step1[15];

  // stage 7: final symmetric butterflies produce the 16 outputs.
  output[0] = HIGHBD_WRAPLOW(step2[0] + step2[15], bd);
  output[1] = HIGHBD_WRAPLOW(step2[1] + step2[14], bd);
  output[2] = HIGHBD_WRAPLOW(step2[2] + step2[13], bd);
  output[3] = HIGHBD_WRAPLOW(step2[3] + step2[12], bd);
  output[4] = HIGHBD_WRAPLOW(step2[4] + step2[11], bd);
  output[5] = HIGHBD_WRAPLOW(step2[5] + step2[10], bd);
  output[6] = HIGHBD_WRAPLOW(step2[6] + step2[9], bd);
  output[7] = HIGHBD_WRAPLOW(step2[7] + step2[8], bd);
  output[8] = HIGHBD_WRAPLOW(step2[7] - step2[8], bd);
  output[9] = HIGHBD_WRAPLOW(step2[6] - step2[9], bd);
  output[10] = HIGHBD_WRAPLOW(step2[5] - step2[10], bd);
  output[11] = HIGHBD_WRAPLOW(step2[4] - step2[11], bd);
  output[12] = HIGHBD_WRAPLOW(step2[3] - step2[12], bd);
  output[13] = HIGHBD_WRAPLOW(step2[2] - step2[13], bd);
  output[14] = HIGHBD_WRAPLOW(step2[1] - step2[14], bd);
  output[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd);
}
2045
// Full 16x16 high-bitdepth inverse DCT: one 1-D pass over rows, one over
// columns, then a round by 6 bits and a clamped add into |dest|.
void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint16_t *dest,
                                    int stride, int bd) {
  tran_low_t out[16 * 16];
  tran_low_t col_in[16], col_out[16];
  int row, col;

  // Pass 1: transform each of the 16 rows into the intermediate buffer.
  for (row = 0; row < 16; ++row) {
    vpx_highbd_idct16_c(input + row * 16, out + row * 16, bd);
  }

  // Pass 2: gather each column, transform it, then round and accumulate
  // into the destination with clamping to the bit depth.
  for (col = 0; col < 16; ++col) {
    for (row = 0; row < 16; ++row) col_in[row] = out[row * 16 + col];
    vpx_highbd_idct16_c(col_in, col_out, bd);
    for (row = 0; row < 16; ++row) {
      uint16_t *const px = dest + row * stride + col;
      *px = highbd_clip_pixel_add(*px, ROUND_POWER_OF_TWO(col_out[row], 6), bd);
    }
  }
}
2070
// 16x16 high-bitdepth inverse DCT specialized for at most 38 non-zero
// coefficients, all confined to the upper-left 8x8 area of |input|.
void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint16_t *dest,
                                   int stride, int bd) {
  tran_low_t out[16 * 16] = { 0 };
  tran_low_t col_in[16], col_out[16];
  int row, col;

  // Row pass: only the first 8 rows can hold non-zero coefficients, so only
  // they are transformed; the rest of |out| stays zero from initialization.
  for (row = 0; row < 8; ++row) {
    vpx_highbd_idct16_c(input + row * 16, out + row * 16, bd);
  }

  // Column pass over all 16 columns, rounding by 6 bits and accumulating
  // into the destination with clamping to the bit depth.
  for (col = 0; col < 16; ++col) {
    for (row = 0; row < 16; ++row) col_in[row] = out[row * 16 + col];
    vpx_highbd_idct16_c(col_in, col_out, bd);
    for (row = 0; row < 16; ++row) {
      dest[row * stride + col] = highbd_clip_pixel_add(
          dest[row * stride + col], ROUND_POWER_OF_TWO(col_out[row], 6), bd);
    }
  }
}
2098
// 16x16 high-bitdepth inverse DCT specialized for at most 10 non-zero
// coefficients, all confined to the upper-left 4x4 area of |input|.
void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint16_t *dest,
                                   int stride, int bd) {
  tran_low_t out[16 * 16] = { 0 };
  tran_low_t col_in[16], col_out[16];
  int row, col;

  // Row pass: only the first 4 rows can hold non-zero coefficients, so only
  // they are transformed; the rest of |out| stays zero from initialization.
  for (row = 0; row < 4; ++row) {
    vpx_highbd_idct16_c(input + row * 16, out + row * 16, bd);
  }

  // Column pass: transform each column, then walk down the destination
  // column adding the rounded result with clamping to the bit depth.
  for (col = 0; col < 16; ++col) {
    uint16_t *d = dest + col;
    for (row = 0; row < 16; ++row) col_in[row] = out[row * 16 + col];
    vpx_highbd_idct16_c(col_in, col_out, bd);
    for (row = 0; row < 16; ++row) {
      *d = highbd_clip_pixel_add(*d, ROUND_POWER_OF_TWO(col_out[row], 6), bd);
      d += stride;
    }
  }
}
2124
// 16x16 high-bitdepth inverse DCT for a DC-only block: scale the single
// coefficient by cospi_16_64 twice (once per 1-D pass), round by 6 bits,
// and add the resulting constant to every pixel of the block.
void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint16_t *dest,
                                  int stride, int bd) {
  int r, c;
  tran_high_t offset;
  tran_low_t dc =
      HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);

  dc = HIGHBD_WRAPLOW(dct_const_round_shift(dc * cospi_16_64), bd);
  offset = ROUND_POWER_OF_TWO(dc, 6);

  // Spread the DC offset across the whole block with per-pixel clamping.
  for (r = 0; r < 16; ++r) {
    for (c = 0; c < 16; ++c) {
      dest[r * stride + c] =
          highbd_clip_pixel_add(dest[r * stride + c], offset, bd);
    }
  }
}
2139
highbd_idct32_c(const tran_low_t * input,tran_low_t * output,int bd)2140 static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output,
2141 int bd) {
2142 tran_low_t step1[32], step2[32];
2143 tran_high_t temp1, temp2;
2144 (void)bd;
2145
2146 if (detect_invalid_highbd_input(input, 32)) {
2147 #if CONFIG_COEFFICIENT_RANGE_CHECKING
2148 assert(0 && "invalid highbd txfm input");
2149 #endif // CONFIG_COEFFICIENT_RANGE_CHECKING
2150 memset(output, 0, sizeof(*output) * 32);
2151 return;
2152 }
2153
2154 // stage 1
2155 step1[0] = input[0];
2156 step1[1] = input[16];
2157 step1[2] = input[8];
2158 step1[3] = input[24];
2159 step1[4] = input[4];
2160 step1[5] = input[20];
2161 step1[6] = input[12];
2162 step1[7] = input[28];
2163 step1[8] = input[2];
2164 step1[9] = input[18];
2165 step1[10] = input[10];
2166 step1[11] = input[26];
2167 step1[12] = input[6];
2168 step1[13] = input[22];
2169 step1[14] = input[14];
2170 step1[15] = input[30];
2171
2172 temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
2173 temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
2174 step1[16] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2175 step1[31] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2176
2177 temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
2178 temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
2179 step1[17] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2180 step1[30] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2181
2182 temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
2183 temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
2184 step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2185 step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2186
2187 temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
2188 temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
2189 step1[19] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2190 step1[28] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2191
2192 temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
2193 temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
2194 step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2195 step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2196
2197 temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
2198 temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
2199 step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2200 step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2201
2202 temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
2203 temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
2204 step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2205 step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2206
2207 temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
2208 temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
2209 step1[23] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2210 step1[24] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2211
2212 // stage 2
2213 step2[0] = step1[0];
2214 step2[1] = step1[1];
2215 step2[2] = step1[2];
2216 step2[3] = step1[3];
2217 step2[4] = step1[4];
2218 step2[5] = step1[5];
2219 step2[6] = step1[6];
2220 step2[7] = step1[7];
2221
2222 temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
2223 temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
2224 step2[8] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2225 step2[15] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2226
2227 temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
2228 temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
2229 step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2230 step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2231
2232 temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
2233 temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
2234 step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2235 step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2236
2237 temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
2238 temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
2239 step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2240 step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2241
2242 step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[17], bd);
2243 step2[17] = HIGHBD_WRAPLOW(step1[16] - step1[17], bd);
2244 step2[18] = HIGHBD_WRAPLOW(-step1[18] + step1[19], bd);
2245 step2[19] = HIGHBD_WRAPLOW(step1[18] + step1[19], bd);
2246 step2[20] = HIGHBD_WRAPLOW(step1[20] + step1[21], bd);
2247 step2[21] = HIGHBD_WRAPLOW(step1[20] - step1[21], bd);
2248 step2[22] = HIGHBD_WRAPLOW(-step1[22] + step1[23], bd);
2249 step2[23] = HIGHBD_WRAPLOW(step1[22] + step1[23], bd);
2250 step2[24] = HIGHBD_WRAPLOW(step1[24] + step1[25], bd);
2251 step2[25] = HIGHBD_WRAPLOW(step1[24] - step1[25], bd);
2252 step2[26] = HIGHBD_WRAPLOW(-step1[26] + step1[27], bd);
2253 step2[27] = HIGHBD_WRAPLOW(step1[26] + step1[27], bd);
2254 step2[28] = HIGHBD_WRAPLOW(step1[28] + step1[29], bd);
2255 step2[29] = HIGHBD_WRAPLOW(step1[28] - step1[29], bd);
2256 step2[30] = HIGHBD_WRAPLOW(-step1[30] + step1[31], bd);
2257 step2[31] = HIGHBD_WRAPLOW(step1[30] + step1[31], bd);
2258
2259 // stage 3
2260 step1[0] = step2[0];
2261 step1[1] = step2[1];
2262 step1[2] = step2[2];
2263 step1[3] = step2[3];
2264
2265 temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
2266 temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
2267 step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2268 step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2269 temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
2270 temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
2271 step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2272 step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2273
2274 step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd);
2275 step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd);
2276 step1[10] = HIGHBD_WRAPLOW(-step2[10] + step2[11], bd);
2277 step1[11] = HIGHBD_WRAPLOW(step2[10] + step2[11], bd);
2278 step1[12] = HIGHBD_WRAPLOW(step2[12] + step2[13], bd);
2279 step1[13] = HIGHBD_WRAPLOW(step2[12] - step2[13], bd);
2280 step1[14] = HIGHBD_WRAPLOW(-step2[14] + step2[15], bd);
2281 step1[15] = HIGHBD_WRAPLOW(step2[14] + step2[15], bd);
2282
2283 step1[16] = step2[16];
2284 step1[31] = step2[31];
2285 temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
2286 temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
2287 step1[17] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2288 step1[30] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2289 temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
2290 temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
2291 step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2292 step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2293 step1[19] = step2[19];
2294 step1[20] = step2[20];
2295 temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
2296 temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
2297 step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2298 step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2299 temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
2300 temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
2301 step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2302 step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2303 step1[23] = step2[23];
2304 step1[24] = step2[24];
2305 step1[27] = step2[27];
2306 step1[28] = step2[28];
2307
2308 // stage 4
2309 temp1 = (step1[0] + step1[1]) * cospi_16_64;
2310 temp2 = (step1[0] - step1[1]) * cospi_16_64;
2311 step2[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2312 step2[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2313 temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
2314 temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
2315 step2[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2316 step2[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2317 step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
2318 step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
2319 step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
2320 step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);
2321
2322 step2[8] = step1[8];
2323 step2[15] = step1[15];
2324 temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
2325 temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
2326 step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2327 step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2328 temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
2329 temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
2330 step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2331 step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2332 step2[11] = step1[11];
2333 step2[12] = step1[12];
2334
2335 step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[19], bd);
2336 step2[17] = HIGHBD_WRAPLOW(step1[17] + step1[18], bd);
2337 step2[18] = HIGHBD_WRAPLOW(step1[17] - step1[18], bd);
2338 step2[19] = HIGHBD_WRAPLOW(step1[16] - step1[19], bd);
2339 step2[20] = HIGHBD_WRAPLOW(-step1[20] + step1[23], bd);
2340 step2[21] = HIGHBD_WRAPLOW(-step1[21] + step1[22], bd);
2341 step2[22] = HIGHBD_WRAPLOW(step1[21] + step1[22], bd);
2342 step2[23] = HIGHBD_WRAPLOW(step1[20] + step1[23], bd);
2343
2344 step2[24] = HIGHBD_WRAPLOW(step1[24] + step1[27], bd);
2345 step2[25] = HIGHBD_WRAPLOW(step1[25] + step1[26], bd);
2346 step2[26] = HIGHBD_WRAPLOW(step1[25] - step1[26], bd);
2347 step2[27] = HIGHBD_WRAPLOW(step1[24] - step1[27], bd);
2348 step2[28] = HIGHBD_WRAPLOW(-step1[28] + step1[31], bd);
2349 step2[29] = HIGHBD_WRAPLOW(-step1[29] + step1[30], bd);
2350 step2[30] = HIGHBD_WRAPLOW(step1[29] + step1[30], bd);
2351 step2[31] = HIGHBD_WRAPLOW(step1[28] + step1[31], bd);
2352
2353 // stage 5
2354 step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[3], bd);
2355 step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[2], bd);
2356 step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd);
2357 step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd);
2358 step1[4] = step2[4];
2359 temp1 = (step2[6] - step2[5]) * cospi_16_64;
2360 temp2 = (step2[5] + step2[6]) * cospi_16_64;
2361 step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2362 step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2363 step1[7] = step2[7];
2364
2365 step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd);
2366 step1[9] = HIGHBD_WRAPLOW(step2[9] + step2[10], bd);
2367 step1[10] = HIGHBD_WRAPLOW(step2[9] - step2[10], bd);
2368 step1[11] = HIGHBD_WRAPLOW(step2[8] - step2[11], bd);
2369 step1[12] = HIGHBD_WRAPLOW(-step2[12] + step2[15], bd);
2370 step1[13] = HIGHBD_WRAPLOW(-step2[13] + step2[14], bd);
2371 step1[14] = HIGHBD_WRAPLOW(step2[13] + step2[14], bd);
2372 step1[15] = HIGHBD_WRAPLOW(step2[12] + step2[15], bd);
2373
2374 step1[16] = step2[16];
2375 step1[17] = step2[17];
2376 temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
2377 temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
2378 step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2379 step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2380 temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
2381 temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
2382 step1[19] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2383 step1[28] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2384 temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
2385 temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
2386 step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2387 step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2388 temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
2389 temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
2390 step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2391 step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2392 step1[22] = step2[22];
2393 step1[23] = step2[23];
2394 step1[24] = step2[24];
2395 step1[25] = step2[25];
2396 step1[30] = step2[30];
2397 step1[31] = step2[31];
2398
2399 // stage 6
2400 step2[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
2401 step2[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
2402 step2[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
2403 step2[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
2404 step2[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
2405 step2[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
2406 step2[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
2407 step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
2408 step2[8] = step1[8];
2409 step2[9] = step1[9];
2410 temp1 = (-step1[10] + step1[13]) * cospi_16_64;
2411 temp2 = (step1[10] + step1[13]) * cospi_16_64;
2412 step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2413 step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2414 temp1 = (-step1[11] + step1[12]) * cospi_16_64;
2415 temp2 = (step1[11] + step1[12]) * cospi_16_64;
2416 step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2417 step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2418 step2[14] = step1[14];
2419 step2[15] = step1[15];
2420
2421 step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[23], bd);
2422 step2[17] = HIGHBD_WRAPLOW(step1[17] + step1[22], bd);
2423 step2[18] = HIGHBD_WRAPLOW(step1[18] + step1[21], bd);
2424 step2[19] = HIGHBD_WRAPLOW(step1[19] + step1[20], bd);
2425 step2[20] = HIGHBD_WRAPLOW(step1[19] - step1[20], bd);
2426 step2[21] = HIGHBD_WRAPLOW(step1[18] - step1[21], bd);
2427 step2[22] = HIGHBD_WRAPLOW(step1[17] - step1[22], bd);
2428 step2[23] = HIGHBD_WRAPLOW(step1[16] - step1[23], bd);
2429
2430 step2[24] = HIGHBD_WRAPLOW(-step1[24] + step1[31], bd);
2431 step2[25] = HIGHBD_WRAPLOW(-step1[25] + step1[30], bd);
2432 step2[26] = HIGHBD_WRAPLOW(-step1[26] + step1[29], bd);
2433 step2[27] = HIGHBD_WRAPLOW(-step1[27] + step1[28], bd);
2434 step2[28] = HIGHBD_WRAPLOW(step1[27] + step1[28], bd);
2435 step2[29] = HIGHBD_WRAPLOW(step1[26] + step1[29], bd);
2436 step2[30] = HIGHBD_WRAPLOW(step1[25] + step1[30], bd);
2437 step2[31] = HIGHBD_WRAPLOW(step1[24] + step1[31], bd);
2438
2439 // stage 7
2440 step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[15], bd);
2441 step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[14], bd);
2442 step1[2] = HIGHBD_WRAPLOW(step2[2] + step2[13], bd);
2443 step1[3] = HIGHBD_WRAPLOW(step2[3] + step2[12], bd);
2444 step1[4] = HIGHBD_WRAPLOW(step2[4] + step2[11], bd);
2445 step1[5] = HIGHBD_WRAPLOW(step2[5] + step2[10], bd);
2446 step1[6] = HIGHBD_WRAPLOW(step2[6] + step2[9], bd);
2447 step1[7] = HIGHBD_WRAPLOW(step2[7] + step2[8], bd);
2448 step1[8] = HIGHBD_WRAPLOW(step2[7] - step2[8], bd);
2449 step1[9] = HIGHBD_WRAPLOW(step2[6] - step2[9], bd);
2450 step1[10] = HIGHBD_WRAPLOW(step2[5] - step2[10], bd);
2451 step1[11] = HIGHBD_WRAPLOW(step2[4] - step2[11], bd);
2452 step1[12] = HIGHBD_WRAPLOW(step2[3] - step2[12], bd);
2453 step1[13] = HIGHBD_WRAPLOW(step2[2] - step2[13], bd);
2454 step1[14] = HIGHBD_WRAPLOW(step2[1] - step2[14], bd);
2455 step1[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd);
2456
2457 step1[16] = step2[16];
2458 step1[17] = step2[17];
2459 step1[18] = step2[18];
2460 step1[19] = step2[19];
2461 temp1 = (-step2[20] + step2[27]) * cospi_16_64;
2462 temp2 = (step2[20] + step2[27]) * cospi_16_64;
2463 step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2464 step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2465 temp1 = (-step2[21] + step2[26]) * cospi_16_64;
2466 temp2 = (step2[21] + step2[26]) * cospi_16_64;
2467 step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2468 step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2469 temp1 = (-step2[22] + step2[25]) * cospi_16_64;
2470 temp2 = (step2[22] + step2[25]) * cospi_16_64;
2471 step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2472 step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2473 temp1 = (-step2[23] + step2[24]) * cospi_16_64;
2474 temp2 = (step2[23] + step2[24]) * cospi_16_64;
2475 step1[23] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2476 step1[24] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2477 step1[28] = step2[28];
2478 step1[29] = step2[29];
2479 step1[30] = step2[30];
2480 step1[31] = step2[31];
2481
2482 // final stage
2483 output[0] = HIGHBD_WRAPLOW(step1[0] + step1[31], bd);
2484 output[1] = HIGHBD_WRAPLOW(step1[1] + step1[30], bd);
2485 output[2] = HIGHBD_WRAPLOW(step1[2] + step1[29], bd);
2486 output[3] = HIGHBD_WRAPLOW(step1[3] + step1[28], bd);
2487 output[4] = HIGHBD_WRAPLOW(step1[4] + step1[27], bd);
2488 output[5] = HIGHBD_WRAPLOW(step1[5] + step1[26], bd);
2489 output[6] = HIGHBD_WRAPLOW(step1[6] + step1[25], bd);
2490 output[7] = HIGHBD_WRAPLOW(step1[7] + step1[24], bd);
2491 output[8] = HIGHBD_WRAPLOW(step1[8] + step1[23], bd);
2492 output[9] = HIGHBD_WRAPLOW(step1[9] + step1[22], bd);
2493 output[10] = HIGHBD_WRAPLOW(step1[10] + step1[21], bd);
2494 output[11] = HIGHBD_WRAPLOW(step1[11] + step1[20], bd);
2495 output[12] = HIGHBD_WRAPLOW(step1[12] + step1[19], bd);
2496 output[13] = HIGHBD_WRAPLOW(step1[13] + step1[18], bd);
2497 output[14] = HIGHBD_WRAPLOW(step1[14] + step1[17], bd);
2498 output[15] = HIGHBD_WRAPLOW(step1[15] + step1[16], bd);
2499 output[16] = HIGHBD_WRAPLOW(step1[15] - step1[16], bd);
2500 output[17] = HIGHBD_WRAPLOW(step1[14] - step1[17], bd);
2501 output[18] = HIGHBD_WRAPLOW(step1[13] - step1[18], bd);
2502 output[19] = HIGHBD_WRAPLOW(step1[12] - step1[19], bd);
2503 output[20] = HIGHBD_WRAPLOW(step1[11] - step1[20], bd);
2504 output[21] = HIGHBD_WRAPLOW(step1[10] - step1[21], bd);
2505 output[22] = HIGHBD_WRAPLOW(step1[9] - step1[22], bd);
2506 output[23] = HIGHBD_WRAPLOW(step1[8] - step1[23], bd);
2507 output[24] = HIGHBD_WRAPLOW(step1[7] - step1[24], bd);
2508 output[25] = HIGHBD_WRAPLOW(step1[6] - step1[25], bd);
2509 output[26] = HIGHBD_WRAPLOW(step1[5] - step1[26], bd);
2510 output[27] = HIGHBD_WRAPLOW(step1[4] - step1[27], bd);
2511 output[28] = HIGHBD_WRAPLOW(step1[3] - step1[28], bd);
2512 output[29] = HIGHBD_WRAPLOW(step1[2] - step1[29], bd);
2513 output[30] = HIGHBD_WRAPLOW(step1[1] - step1[30], bd);
2514 output[31] = HIGHBD_WRAPLOW(step1[0] - step1[31], bd);
2515 }
2516
// Full 32x32 high-bitdepth inverse DCT: runs the 1-D 32-point transform on
// every row, then on every column, and adds the clipped result into dest.
void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint16_t *dest,
                                     int stride, int bd) {
  int r, c;
  tran_low_t intermediate[32 * 32];
  tran_low_t *row_out = intermediate;
  tran_low_t col_in[32], col_out[32];

  // Row pass. A row that is entirely zero transforms to zero, so skip the
  // transform for such rows and just clear the output instead.
  for (r = 0; r < 32; ++r) {
    tran_low_t any_nonzero = 0;
    for (c = 0; c < 32; ++c) any_nonzero |= input[c];

    if (any_nonzero) {
      highbd_idct32_c(input, row_out, bd);
    } else {
      memset(row_out, 0, sizeof(tran_low_t) * 32);
    }
    input += 32;
    row_out += 32;
  }

  // Column pass: gather each column, transform it, then round (by 6 bits)
  // and accumulate into the destination with pixel-range clipping.
  for (c = 0; c < 32; ++c) {
    for (r = 0; r < 32; ++r) col_in[r] = intermediate[r * 32 + c];
    highbd_idct32_c(col_in, col_out, bd);
    for (r = 0; r < 32; ++r) {
      dest[r * stride + c] = highbd_clip_pixel_add(
          dest[r * stride + c], ROUND_POWER_OF_TWO(col_out[r], 6), bd);
    }
  }
}
2547
// High-bitdepth 32x32 inverse DCT specialized for at most 135 non-zero
// coefficients, which are confined to the upper-left 16x16 quadrant.
void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint16_t *dest,
                                    int stride, int bd) {
  int r, c;
  tran_low_t buf[32 * 32] = { 0 };
  tran_low_t *row_out = buf;
  tran_low_t col_in[32], col_out[32];

  // Row pass: only the first 16 rows can contain non-zero coefficients;
  // the remaining rows of buf stay zero from the initializer.
  for (r = 0; r < 16; ++r) {
    highbd_idct32_c(input, row_out, bd);
    input += 32;
    row_out += 32;
  }

  // Column pass: transform each column, then round (by 6 bits) and
  // accumulate into the destination with pixel-range clipping.
  for (c = 0; c < 32; ++c) {
    for (r = 0; r < 32; ++r) col_in[r] = buf[r * 32 + c];
    highbd_idct32_c(col_in, col_out, bd);
    for (r = 0; r < 32; ++r) {
      dest[r * stride + c] = highbd_clip_pixel_add(
          dest[r * stride + c], ROUND_POWER_OF_TWO(col_out[r], 6), bd);
    }
  }
}
2575
// High-bitdepth 32x32 inverse DCT specialized for at most 34 non-zero
// coefficients, which are confined to the upper-left 8x8 block.
void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint16_t *dest,
                                   int stride, int bd) {
  int r, c;
  tran_low_t buf[32 * 32] = { 0 };
  tran_low_t *row_out = buf;
  tran_low_t col_in[32], col_out[32];

  // Row pass: only the first 8 rows can contain non-zero coefficients;
  // the remaining rows of buf stay zero from the initializer.
  for (r = 0; r < 8; ++r) {
    highbd_idct32_c(input, row_out, bd);
    input += 32;
    row_out += 32;
  }

  // Column pass: walk down each destination column with a stride pointer,
  // rounding (by 6 bits) and accumulating with pixel-range clipping.
  for (c = 0; c < 32; ++c) {
    uint16_t *dst_col = dest;
    for (r = 0; r < 32; ++r) col_in[r] = buf[r * 32 + c];
    highbd_idct32_c(col_in, col_out, bd);
    for (r = 0; r < 32; ++r) {
      dst_col[c] = highbd_clip_pixel_add(dst_col[c],
                                         ROUND_POWER_OF_TWO(col_out[r], 6), bd);
      dst_col += stride;
    }
  }
}
2601
// DC-only high-bitdepth 32x32 inverse DCT: when only the DC coefficient is
// non-zero the transform output is a single constant, so compute it once
// and add it to every pixel of the 32x32 destination block.
void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint16_t *dest,
                                  int stride, int bd) {
  int r, c;
  int dc_value;
  // Two 1-D passes over the DC term collapse to two multiplies by
  // cospi_16_64 with rounding after each.
  tran_low_t t =
      HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);

  t = HIGHBD_WRAPLOW(dct_const_round_shift(t * cospi_16_64), bd);
  dc_value = ROUND_POWER_OF_TWO(t, 6);

  for (r = 0; r < 32; ++r) {
    for (c = 0; c < 32; ++c) {
      dest[c] = highbd_clip_pixel_add(dest[c], dc_value, bd);
    }
    dest += stride;
  }
}
2617
2618 #endif // CONFIG_VP9_HIGHBITDEPTH
2619