/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <math.h>
#include <string.h>

#include "vpx_dsp/inv_txfm.h"
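
/* Notes on conventions used throughout this file (added commentary; the
 * authoritative definitions live in vpx_dsp/inv_txfm.h and
 * vpx_dsp/txfm_common.h):
 * - cospi_N_64 is cos(N * pi / 64) in Q14 fixed point, so each multiply
 *   adds 14 fractional bits that dct_const_round_shift() rounds away again
 *   (a rounding shift right by DCT_CONST_BITS == 14).
 * - WRAPLOW(x, 8) narrows an intermediate tran_high_t back toward the
 *   tran_low_t range; with CONFIG_EMULATE_HARDWARE it wraps the way a
 *   fixed-width hardware adder would, otherwise it is essentially a cast.
 * - Every NxN inverse transform below runs N 1-D transforms over the rows,
 *   then N over the columns, and a final ROUND_POWER_OF_TWO() removes the
 *   scaling introduced by the corresponding forward transform.
 */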
15

void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
     0.5 shifts per pixel. */
  int i;
  tran_low_t output[16];
  tran_high_t a1, b1, c1, d1, e1;
  const tran_low_t *ip = input;
  tran_low_t *op = output;

  for (i = 0; i < 4; i++) {
    a1 = ip[0] >> UNIT_QUANT_SHIFT;
    c1 = ip[1] >> UNIT_QUANT_SHIFT;
    d1 = ip[2] >> UNIT_QUANT_SHIFT;
    b1 = ip[3] >> UNIT_QUANT_SHIFT;
    a1 += c1;
    d1 -= b1;
    e1 = (a1 - d1) >> 1;
    b1 = e1 - b1;
    c1 = e1 - c1;
    a1 -= b1;
    d1 += c1;
    op[0] = WRAPLOW(a1, 8);
    op[1] = WRAPLOW(b1, 8);
    op[2] = WRAPLOW(c1, 8);
    op[3] = WRAPLOW(d1, 8);
    ip += 4;
    op += 4;
  }

  ip = output;
  for (i = 0; i < 4; i++) {
    a1 = ip[4 * 0];
    c1 = ip[4 * 1];
    d1 = ip[4 * 2];
    b1 = ip[4 * 3];
    a1 += c1;
    d1 -= b1;
    e1 = (a1 - d1) >> 1;
    b1 = e1 - b1;
    c1 = e1 - c1;
    a1 -= b1;
    d1 += c1;
    dest[stride * 0] = clip_pixel_add(dest[stride * 0], a1);
    dest[stride * 1] = clip_pixel_add(dest[stride * 1], b1);
    dest[stride * 2] = clip_pixel_add(dest[stride * 2], c1);
    dest[stride * 3] = clip_pixel_add(dest[stride * 3], d1);

    ip++;
    dest++;
  }
}
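
/* Illustrative only: a minimal sketch of how a caller might drive the 4x4
 * inverse WHT above. The buffer contents and the helper name are made up;
 * in the codec the coefficients come from the dequantizer and `dest` points
 * into the reconstruction frame. Kept under #if 0 so it is never compiled.
 */
#if 0
static void iwht4x4_example(void) {
  tran_low_t coeffs[16] = { 0 };      /* dequantized transform coefficients */
  uint8_t recon[4 * 4] = { 0 };       /* 4x4 prediction block to add into */
  coeffs[0] = 4 << UNIT_QUANT_SHIFT;  /* lone DC value, pre-scaled so the
                                         >> UNIT_QUANT_SHIFT above recovers 4 */
  vpx_iwht4x4_16_add_c(coeffs, recon, 4 /* stride in pixels */);
}
#endif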

void vpx_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int dest_stride) {
  int i;
  tran_high_t a1, e1;
  tran_low_t tmp[4];
  const tran_low_t *ip = in;
  tran_low_t *op = tmp;

  a1 = ip[0] >> UNIT_QUANT_SHIFT;
  e1 = a1 >> 1;
  a1 -= e1;
  op[0] = WRAPLOW(a1, 8);
  op[1] = op[2] = op[3] = WRAPLOW(e1, 8);

  ip = tmp;
  for (i = 0; i < 4; i++) {
    e1 = ip[0] >> 1;
    a1 = ip[0] - e1;
    dest[dest_stride * 0] = clip_pixel_add(dest[dest_stride * 0], a1);
    dest[dest_stride * 1] = clip_pixel_add(dest[dest_stride * 1], e1);
    dest[dest_stride * 2] = clip_pixel_add(dest[dest_stride * 2], e1);
    dest[dest_stride * 3] = clip_pixel_add(dest[dest_stride * 3], e1);
    ip++;
    dest++;
  }
}

void idct4_c(const tran_low_t *input, tran_low_t *output) {
  tran_low_t step[4];
  tran_high_t temp1, temp2;
  // stage 1
  temp1 = (input[0] + input[2]) * cospi_16_64;
  temp2 = (input[0] - input[2]) * cospi_16_64;
  step[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
  temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
  step[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step[3] = WRAPLOW(dct_const_round_shift(temp2), 8);

  // stage 2
  output[0] = WRAPLOW(step[0] + step[3], 8);
  output[1] = WRAPLOW(step[1] + step[2], 8);
  output[2] = WRAPLOW(step[1] - step[2], 8);
  output[3] = WRAPLOW(step[0] - step[3], 8);
}
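
/* Added note: stage 1 above is the standard 4-point IDCT butterfly. In
 * exact arithmetic (recalling cospi_16_64 ~ cos(pi/4), cospi_24_64 ~
 * cos(3*pi/8) and cospi_8_64 ~ cos(pi/8) == sin(3*pi/8)):
 *   step[0] = (x0 + x2) * cos(pi/4)
 *   step[1] = (x0 - x2) * cos(pi/4)
 *   step[2] = x1 * cos(3*pi/8) - x3 * sin(3*pi/8)
 *   step[3] = x1 * sin(3*pi/8) + x3 * cos(3*pi/8)
 * i.e. a scaled sum/difference of the even inputs plus a rotation of the
 * odd inputs; stage 2 recombines the two halves with one more butterfly.
 */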

void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  tran_low_t out[4 * 4];
  tran_low_t *outptr = out;
  int i, j;
  tran_low_t temp_in[4], temp_out[4];

  // Rows
  for (i = 0; i < 4; ++i) {
    idct4_c(input, outptr);
    input += 4;
    outptr += 4;
  }

  // Columns
  for (i = 0; i < 4; ++i) {
    for (j = 0; j < 4; ++j)
      temp_in[j] = out[j * 4 + i];
    idct4_c(temp_in, temp_out);
    for (j = 0; j < 4; ++j) {
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                            ROUND_POWER_OF_TWO(temp_out[j], 4));
    }
  }
}
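
/* Added note: the final ROUND_POWER_OF_TWO(..., 4) undoes the scaling left
 * over from the forward 4x4 DCT. The shift grows with transform size in the
 * functions below (4 for 4x4, 5 for 8x8, 6 for 16x16 and 32x32), matching
 * how the forward transforms in this library scale their outputs.
 */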

void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest,
                         int dest_stride) {
  int i;
  tran_high_t a1;
  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
  a1 = ROUND_POWER_OF_TWO(out, 4);

  for (i = 0; i < 4; i++) {
    dest[0] = clip_pixel_add(dest[0], a1);
    dest[1] = clip_pixel_add(dest[1], a1);
    dest[2] = clip_pixel_add(dest[2], a1);
    dest[3] = clip_pixel_add(dest[3], a1);
    dest += dest_stride;
  }
}
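
/* Added note: when only the DC coefficient is non-zero, every output pixel
 * receives the same value, so the _1_add variants skip the row/column passes
 * entirely. The two cospi_16_64 multiplies with rounding are the DC paths of
 * the two 1-D transforms (cos(pi/4) twice, i.e. roughly input[0] / 2 before
 * the final shift).
 */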

void idct8_c(const tran_low_t *input, tran_low_t *output) {
  tran_low_t step1[8], step2[8];
  tran_high_t temp1, temp2;
  // stage 1
  step1[0] = input[0];
  step1[2] = input[4];
  step1[1] = input[2];
  step1[3] = input[6];
  temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
  temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
  step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
  temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);

  // stage 2 & stage 3 - even half
  idct4_c(step1, step1);

  // stage 2 - odd half
  step2[4] = WRAPLOW(step1[4] + step1[5], 8);
  step2[5] = WRAPLOW(step1[4] - step1[5], 8);
  step2[6] = WRAPLOW(-step1[6] + step1[7], 8);
  step2[7] = WRAPLOW(step1[6] + step1[7], 8);

  // stage 3 - odd half
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
  step1[7] = step2[7];

  // stage 4
  output[0] = WRAPLOW(step1[0] + step1[7], 8);
  output[1] = WRAPLOW(step1[1] + step1[6], 8);
  output[2] = WRAPLOW(step1[2] + step1[5], 8);
  output[3] = WRAPLOW(step1[3] + step1[4], 8);
  output[4] = WRAPLOW(step1[3] - step1[4], 8);
  output[5] = WRAPLOW(step1[2] - step1[5], 8);
  output[6] = WRAPLOW(step1[1] - step1[6], 8);
  output[7] = WRAPLOW(step1[0] - step1[7], 8);
}
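
/* Added note: idct8_c is the usual even/odd decomposition. The even-index
 * inputs (0, 4, 2, 6, loaded into step1[0..3] above) are handled by reusing
 * idct4_c in place, while the odd-index inputs go through two butterfly
 * stages of their own; stage 4 merges the halves. idct16_c and idct32_c
 * below follow the same recursive pattern, spelled out stage by stage
 * instead of via function calls.
 */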

void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  tran_low_t out[8 * 8];
  tran_low_t *outptr = out;
  int i, j;
  tran_low_t temp_in[8], temp_out[8];

  // First transform rows
  for (i = 0; i < 8; ++i) {
    idct8_c(input, outptr);
    input += 8;
    outptr += 8;
  }

  // Then transform columns
  for (i = 0; i < 8; ++i) {
    for (j = 0; j < 8; ++j)
      temp_in[j] = out[j * 8 + i];
    idct8_c(temp_in, temp_out);
    for (j = 0; j < 8; ++j) {
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                            ROUND_POWER_OF_TWO(temp_out[j], 5));
    }
  }
}

void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  int i, j;
  tran_high_t a1;
  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
  a1 = ROUND_POWER_OF_TWO(out, 5);
  for (j = 0; j < 8; ++j) {
    for (i = 0; i < 8; ++i)
      dest[i] = clip_pixel_add(dest[i], a1);
    dest += stride;
  }
}

void iadst4_c(const tran_low_t *input, tran_low_t *output) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;

  tran_low_t x0 = input[0];
  tran_low_t x1 = input[1];
  tran_low_t x2 = input[2];
  tran_low_t x3 = input[3];

  if (!(x0 | x1 | x2 | x3)) {
    output[0] = output[1] = output[2] = output[3] = 0;
    return;
  }

  s0 = sinpi_1_9 * x0;
  s1 = sinpi_2_9 * x0;
  s2 = sinpi_3_9 * x1;
  s3 = sinpi_4_9 * x2;
  s4 = sinpi_1_9 * x2;
  s5 = sinpi_2_9 * x3;
  s6 = sinpi_4_9 * x3;
  s7 = x0 - x2 + x3;

  s0 = s0 + s3 + s5;
  s1 = s1 - s4 - s6;
  s3 = s2;
  s2 = sinpi_3_9 * s7;

  // 1-D transform scaling factor is sqrt(2).
  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
  // + 1b (addition) = 29b.
  // Hence the output bit depth is 15b.
  output[0] = WRAPLOW(dct_const_round_shift(s0 + s3), 8);
  output[1] = WRAPLOW(dct_const_round_shift(s1 + s3), 8);
  output[2] = WRAPLOW(dct_const_round_shift(s2), 8);
  output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3), 8);
}
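
/* Added note: iadst4_c inverts the 4-point ADST, whose basis uses the
 * sin(k*pi/9) constants sinpi_N_9 (also Q14) instead of the DCT cosines;
 * the s7 = x0 - x2 + x3 term is the extra linear combination that keeps the
 * 4-point ADST to eight multiplies. The all-zero early-out is a cheap
 * shortcut for the common case of a fully quantized-away row or column.
 */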

void iadst8_c(const tran_low_t *input, tran_low_t *output) {
  int s0, s1, s2, s3, s4, s5, s6, s7;

  tran_high_t x0 = input[7];
  tran_high_t x1 = input[0];
  tran_high_t x2 = input[5];
  tran_high_t x3 = input[2];
  tran_high_t x4 = input[3];
  tran_high_t x5 = input[4];
  tran_high_t x6 = input[1];
  tran_high_t x7 = input[6];

  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
    output[0] = output[1] = output[2] = output[3] = output[4]
              = output[5] = output[6] = output[7] = 0;
    return;
  }

  // stage 1
  s0 = (int)(cospi_2_64 * x0 + cospi_30_64 * x1);
  s1 = (int)(cospi_30_64 * x0 - cospi_2_64 * x1);
  s2 = (int)(cospi_10_64 * x2 + cospi_22_64 * x3);
  s3 = (int)(cospi_22_64 * x2 - cospi_10_64 * x3);
  s4 = (int)(cospi_18_64 * x4 + cospi_14_64 * x5);
  s5 = (int)(cospi_14_64 * x4 - cospi_18_64 * x5);
  s6 = (int)(cospi_26_64 * x6 + cospi_6_64 * x7);
  s7 = (int)(cospi_6_64 * x6 - cospi_26_64 * x7);

  x0 = WRAPLOW(dct_const_round_shift(s0 + s4), 8);
  x1 = WRAPLOW(dct_const_round_shift(s1 + s5), 8);
  x2 = WRAPLOW(dct_const_round_shift(s2 + s6), 8);
  x3 = WRAPLOW(dct_const_round_shift(s3 + s7), 8);
  x4 = WRAPLOW(dct_const_round_shift(s0 - s4), 8);
  x5 = WRAPLOW(dct_const_round_shift(s1 - s5), 8);
  x6 = WRAPLOW(dct_const_round_shift(s2 - s6), 8);
  x7 = WRAPLOW(dct_const_round_shift(s3 - s7), 8);

  // stage 2
  s0 = (int)x0;
  s1 = (int)x1;
  s2 = (int)x2;
  s3 = (int)x3;
  s4 = (int)(cospi_8_64 * x4 + cospi_24_64 * x5);
  s5 = (int)(cospi_24_64 * x4 - cospi_8_64 * x5);
  s6 = (int)(-cospi_24_64 * x6 + cospi_8_64 * x7);
  s7 = (int)(cospi_8_64 * x6 + cospi_24_64 * x7);

  x0 = WRAPLOW(s0 + s2, 8);
  x1 = WRAPLOW(s1 + s3, 8);
  x2 = WRAPLOW(s0 - s2, 8);
  x3 = WRAPLOW(s1 - s3, 8);
  x4 = WRAPLOW(dct_const_round_shift(s4 + s6), 8);
  x5 = WRAPLOW(dct_const_round_shift(s5 + s7), 8);
  x6 = WRAPLOW(dct_const_round_shift(s4 - s6), 8);
  x7 = WRAPLOW(dct_const_round_shift(s5 - s7), 8);

  // stage 3
  s2 = (int)(cospi_16_64 * (x2 + x3));
  s3 = (int)(cospi_16_64 * (x2 - x3));
  s6 = (int)(cospi_16_64 * (x6 + x7));
  s7 = (int)(cospi_16_64 * (x6 - x7));

  x2 = WRAPLOW(dct_const_round_shift(s2), 8);
  x3 = WRAPLOW(dct_const_round_shift(s3), 8);
  x6 = WRAPLOW(dct_const_round_shift(s6), 8);
  x7 = WRAPLOW(dct_const_round_shift(s7), 8);

  output[0] = WRAPLOW(x0, 8);
  output[1] = WRAPLOW(-x4, 8);
  output[2] = WRAPLOW(x6, 8);
  output[3] = WRAPLOW(-x2, 8);
  output[4] = WRAPLOW(x3, 8);
  output[5] = WRAPLOW(-x7, 8);
  output[6] = WRAPLOW(x5, 8);
  output[7] = WRAPLOW(-x1, 8);
}
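
/* Added note: the shuffled loads at the top of iadst8_c (x0 = input[7],
 * x1 = input[0], ...) and the negated terms in the final writes implement
 * the input permutation and sign pattern that let the 8-point ADST reuse
 * the same butterfly-plus-rotation structure as the DCT code above.
 */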

void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  tran_low_t out[8 * 8] = { 0 };
  tran_low_t *outptr = out;
  int i, j;
  tran_low_t temp_in[8], temp_out[8];

  // First transform rows.
  // Only the first 4 rows have non-zero coefficients.
  for (i = 0; i < 4; ++i) {
    idct8_c(input, outptr);
    input += 8;
    outptr += 8;
  }

  // Then transform columns.
  for (i = 0; i < 8; ++i) {
    for (j = 0; j < 8; ++j)
      temp_in[j] = out[j * 8 + i];
    idct8_c(temp_in, temp_out);
    for (j = 0; j < 8; ++j) {
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                            ROUND_POWER_OF_TWO(temp_out[j], 5));
    }
  }
}
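
/* Added note: the _12 suffix (like _10, _34 and _1024 below) is the largest
 * end-of-block value this variant is specialized for. With eob <= 12 the
 * coefficient scan order confines the non-zero coefficients to the
 * upper-left 4x4 corner, so `out` is zero-initialized and only the first
 * four rows get a row transform before the usual full column pass.
 */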

void idct16_c(const tran_low_t *input, tran_low_t *output) {
  tran_low_t step1[16], step2[16];
  tran_high_t temp1, temp2;

  // stage 1
  step1[0] = input[0/2];
  step1[1] = input[16/2];
  step1[2] = input[8/2];
  step1[3] = input[24/2];
  step1[4] = input[4/2];
  step1[5] = input[20/2];
  step1[6] = input[12/2];
  step1[7] = input[28/2];
  step1[8] = input[2/2];
  step1[9] = input[18/2];
  step1[10] = input[10/2];
  step1[11] = input[26/2];
  step1[12] = input[6/2];
  step1[13] = input[22/2];
  step1[14] = input[14/2];
  step1[15] = input[30/2];

  // stage 2
  step2[0] = step1[0];
  step2[1] = step1[1];
  step2[2] = step1[2];
  step2[3] = step1[3];
  step2[4] = step1[4];
  step2[5] = step1[5];
  step2[6] = step1[6];
  step2[7] = step1[7];

  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
  step2[8] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[15] = WRAPLOW(dct_const_round_shift(temp2), 8);

  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
  step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);

  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);

  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
  step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);

  // stage 3
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[2];
  step1[3] = step2[3];

  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
  step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);

  step1[8] = WRAPLOW(step2[8] + step2[9], 8);
  step1[9] = WRAPLOW(step2[8] - step2[9], 8);
  step1[10] = WRAPLOW(-step2[10] + step2[11], 8);
  step1[11] = WRAPLOW(step2[10] + step2[11], 8);
  step1[12] = WRAPLOW(step2[12] + step2[13], 8);
  step1[13] = WRAPLOW(step2[12] - step2[13], 8);
  step1[14] = WRAPLOW(-step2[14] + step2[15], 8);
  step1[15] = WRAPLOW(step2[14] + step2[15], 8);

  // stage 4
  temp1 = (step1[0] + step1[1]) * cospi_16_64;
  temp2 = (step1[0] - step1[1]) * cospi_16_64;
  step2[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
  step2[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
  step2[4] = WRAPLOW(step1[4] + step1[5], 8);
  step2[5] = WRAPLOW(step1[4] - step1[5], 8);
  step2[6] = WRAPLOW(-step1[6] + step1[7], 8);
  step2[7] = WRAPLOW(step1[6] + step1[7], 8);

  step2[8] = step1[8];
  step2[15] = step1[15];
  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
  step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
  step2[11] = step1[11];
  step2[12] = step1[12];

  // stage 5
  step1[0] = WRAPLOW(step2[0] + step2[3], 8);
  step1[1] = WRAPLOW(step2[1] + step2[2], 8);
  step1[2] = WRAPLOW(step2[1] - step2[2], 8);
  step1[3] = WRAPLOW(step2[0] - step2[3], 8);
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
  step1[7] = step2[7];

  step1[8] = WRAPLOW(step2[8] + step2[11], 8);
  step1[9] = WRAPLOW(step2[9] + step2[10], 8);
  step1[10] = WRAPLOW(step2[9] - step2[10], 8);
  step1[11] = WRAPLOW(step2[8] - step2[11], 8);
  step1[12] = WRAPLOW(-step2[12] + step2[15], 8);
  step1[13] = WRAPLOW(-step2[13] + step2[14], 8);
  step1[14] = WRAPLOW(step2[13] + step2[14], 8);
  step1[15] = WRAPLOW(step2[12] + step2[15], 8);

  // stage 6
  step2[0] = WRAPLOW(step1[0] + step1[7], 8);
  step2[1] = WRAPLOW(step1[1] + step1[6], 8);
  step2[2] = WRAPLOW(step1[2] + step1[5], 8);
  step2[3] = WRAPLOW(step1[3] + step1[4], 8);
  step2[4] = WRAPLOW(step1[3] - step1[4], 8);
  step2[5] = WRAPLOW(step1[2] - step1[5], 8);
  step2[6] = WRAPLOW(step1[1] - step1[6], 8);
  step2[7] = WRAPLOW(step1[0] - step1[7], 8);
  step2[8] = step1[8];
  step2[9] = step1[9];
  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
  temp2 = (step1[10] + step1[13]) * cospi_16_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
  temp2 = (step1[11] + step1[12]) * cospi_16_64;
  step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
  step2[14] = step1[14];
  step2[15] = step1[15];

  // stage 7
  output[0] = WRAPLOW(step2[0] + step2[15], 8);
  output[1] = WRAPLOW(step2[1] + step2[14], 8);
  output[2] = WRAPLOW(step2[2] + step2[13], 8);
  output[3] = WRAPLOW(step2[3] + step2[12], 8);
  output[4] = WRAPLOW(step2[4] + step2[11], 8);
  output[5] = WRAPLOW(step2[5] + step2[10], 8);
  output[6] = WRAPLOW(step2[6] + step2[9], 8);
  output[7] = WRAPLOW(step2[7] + step2[8], 8);
  output[8] = WRAPLOW(step2[7] - step2[8], 8);
  output[9] = WRAPLOW(step2[6] - step2[9], 8);
  output[10] = WRAPLOW(step2[5] - step2[10], 8);
  output[11] = WRAPLOW(step2[4] - step2[11], 8);
  output[12] = WRAPLOW(step2[3] - step2[12], 8);
  output[13] = WRAPLOW(step2[2] - step2[13], 8);
  output[14] = WRAPLOW(step2[1] - step2[14], 8);
  output[15] = WRAPLOW(step2[0] - step2[15], 8);
}
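
/* Added note: idct16_c loads the even-index inputs into step1[0..7] and the
 * odd-index inputs into step1[8..15]; the N/2 indexing keeps the numerators
 * identical to the 32-point load table in idct32_c below, presumably to make
 * the recursive even/odd structure easy to compare across sizes. Stages 2-6
 * then mirror the 8-point IDCT on the even half while the odd half goes
 * through its own rotation/butterfly stages; stage 7 merges the two.
 */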

void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest,
                             int stride) {
  tran_low_t out[16 * 16];
  tran_low_t *outptr = out;
  int i, j;
  tran_low_t temp_in[16], temp_out[16];

  // First transform rows
  for (i = 0; i < 16; ++i) {
    idct16_c(input, outptr);
    input += 16;
    outptr += 16;
  }

  // Then transform columns
  for (i = 0; i < 16; ++i) {
    for (j = 0; j < 16; ++j)
      temp_in[j] = out[j * 16 + i];
    idct16_c(temp_in, temp_out);
    for (j = 0; j < 16; ++j) {
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
    }
  }
}

void iadst16_c(const tran_low_t *input, tran_low_t *output) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
  tran_high_t s9, s10, s11, s12, s13, s14, s15;

  tran_high_t x0 = input[15];
  tran_high_t x1 = input[0];
  tran_high_t x2 = input[13];
  tran_high_t x3 = input[2];
  tran_high_t x4 = input[11];
  tran_high_t x5 = input[4];
  tran_high_t x6 = input[9];
  tran_high_t x7 = input[6];
  tran_high_t x8 = input[7];
  tran_high_t x9 = input[8];
  tran_high_t x10 = input[5];
  tran_high_t x11 = input[10];
  tran_high_t x12 = input[3];
  tran_high_t x13 = input[12];
  tran_high_t x14 = input[1];
  tran_high_t x15 = input[14];

  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
        | x9 | x10 | x11 | x12 | x13 | x14 | x15)) {
    output[0] = output[1] = output[2] = output[3] = output[4]
              = output[5] = output[6] = output[7] = output[8]
              = output[9] = output[10] = output[11] = output[12]
              = output[13] = output[14] = output[15] = 0;
    return;
  }

  // stage 1
  s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
  s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
  s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
  s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
  s15 = x14 * cospi_3_64 - x15 * cospi_29_64;

  x0 = WRAPLOW(dct_const_round_shift(s0 + s8), 8);
  x1 = WRAPLOW(dct_const_round_shift(s1 + s9), 8);
  x2 = WRAPLOW(dct_const_round_shift(s2 + s10), 8);
  x3 = WRAPLOW(dct_const_round_shift(s3 + s11), 8);
  x4 = WRAPLOW(dct_const_round_shift(s4 + s12), 8);
  x5 = WRAPLOW(dct_const_round_shift(s5 + s13), 8);
  x6 = WRAPLOW(dct_const_round_shift(s6 + s14), 8);
  x7 = WRAPLOW(dct_const_round_shift(s7 + s15), 8);
  x8 = WRAPLOW(dct_const_round_shift(s0 - s8), 8);
  x9 = WRAPLOW(dct_const_round_shift(s1 - s9), 8);
  x10 = WRAPLOW(dct_const_round_shift(s2 - s10), 8);
  x11 = WRAPLOW(dct_const_round_shift(s3 - s11), 8);
  x12 = WRAPLOW(dct_const_round_shift(s4 - s12), 8);
  x13 = WRAPLOW(dct_const_round_shift(s5 - s13), 8);
  x14 = WRAPLOW(dct_const_round_shift(s6 - s14), 8);
  x15 = WRAPLOW(dct_const_round_shift(s7 - s15), 8);

  // stage 2
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4;
  s5 = x5;
  s6 = x6;
  s7 = x7;
  s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
  s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
  s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
  s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
  s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
  s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
  s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
  s15 = x14 * cospi_20_64 + x15 * cospi_12_64;

  x0 = WRAPLOW(s0 + s4, 8);
  x1 = WRAPLOW(s1 + s5, 8);
  x2 = WRAPLOW(s2 + s6, 8);
  x3 = WRAPLOW(s3 + s7, 8);
  x4 = WRAPLOW(s0 - s4, 8);
  x5 = WRAPLOW(s1 - s5, 8);
  x6 = WRAPLOW(s2 - s6, 8);
  x7 = WRAPLOW(s3 - s7, 8);
  x8 = WRAPLOW(dct_const_round_shift(s8 + s12), 8);
  x9 = WRAPLOW(dct_const_round_shift(s9 + s13), 8);
  x10 = WRAPLOW(dct_const_round_shift(s10 + s14), 8);
  x11 = WRAPLOW(dct_const_round_shift(s11 + s15), 8);
  x12 = WRAPLOW(dct_const_round_shift(s8 - s12), 8);
  x13 = WRAPLOW(dct_const_round_shift(s9 - s13), 8);
  x14 = WRAPLOW(dct_const_round_shift(s10 - s14), 8);
  x15 = WRAPLOW(dct_const_round_shift(s11 - s15), 8);

  // stage 3
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
  s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
  s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
  s8 = x8;
  s9 = x9;
  s10 = x10;
  s11 = x11;
  s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
  s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
  s15 = x14 * cospi_8_64 + x15 * cospi_24_64;

  x0 = WRAPLOW(check_range(s0 + s2), 8);
  x1 = WRAPLOW(check_range(s1 + s3), 8);
  x2 = WRAPLOW(check_range(s0 - s2), 8);
  x3 = WRAPLOW(check_range(s1 - s3), 8);
  x4 = WRAPLOW(dct_const_round_shift(s4 + s6), 8);
  x5 = WRAPLOW(dct_const_round_shift(s5 + s7), 8);
  x6 = WRAPLOW(dct_const_round_shift(s4 - s6), 8);
  x7 = WRAPLOW(dct_const_round_shift(s5 - s7), 8);
  x8 = WRAPLOW(check_range(s8 + s10), 8);
  x9 = WRAPLOW(check_range(s9 + s11), 8);
  x10 = WRAPLOW(check_range(s8 - s10), 8);
  x11 = WRAPLOW(check_range(s9 - s11), 8);
  x12 = WRAPLOW(dct_const_round_shift(s12 + s14), 8);
  x13 = WRAPLOW(dct_const_round_shift(s13 + s15), 8);
  x14 = WRAPLOW(dct_const_round_shift(s12 - s14), 8);
  x15 = WRAPLOW(dct_const_round_shift(s13 - s15), 8);

  // stage 4
  s2 = (-cospi_16_64) * (x2 + x3);
  s3 = cospi_16_64 * (x2 - x3);
  s6 = cospi_16_64 * (x6 + x7);
  s7 = cospi_16_64 * (-x6 + x7);
  s10 = cospi_16_64 * (x10 + x11);
  s11 = cospi_16_64 * (-x10 + x11);
  s14 = (-cospi_16_64) * (x14 + x15);
  s15 = cospi_16_64 * (x14 - x15);

  x2 = WRAPLOW(dct_const_round_shift(s2), 8);
  x3 = WRAPLOW(dct_const_round_shift(s3), 8);
  x6 = WRAPLOW(dct_const_round_shift(s6), 8);
  x7 = WRAPLOW(dct_const_round_shift(s7), 8);
  x10 = WRAPLOW(dct_const_round_shift(s10), 8);
  x11 = WRAPLOW(dct_const_round_shift(s11), 8);
  x14 = WRAPLOW(dct_const_round_shift(s14), 8);
  x15 = WRAPLOW(dct_const_round_shift(s15), 8);

  output[0] = WRAPLOW(x0, 8);
  output[1] = WRAPLOW(-x8, 8);
  output[2] = WRAPLOW(x12, 8);
  output[3] = WRAPLOW(-x4, 8);
  output[4] = WRAPLOW(x6, 8);
  output[5] = WRAPLOW(x14, 8);
  output[6] = WRAPLOW(x10, 8);
  output[7] = WRAPLOW(x2, 8);
  output[8] = WRAPLOW(x3, 8);
  output[9] = WRAPLOW(x11, 8);
  output[10] = WRAPLOW(x15, 8);
  output[11] = WRAPLOW(x7, 8);
  output[12] = WRAPLOW(x5, 8);
  output[13] = WRAPLOW(-x13, 8);
  output[14] = WRAPLOW(x9, 8);
  output[15] = WRAPLOW(-x1, 8);
}
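
/* Added note: iadst16_c is the only function in this file that wraps its
 * plain add/subtract stages in check_range(); with
 * CONFIG_COEFFICIENT_RANGE_CHECKING enabled that macro asserts the
 * intermediate value still fits the expected coefficient range, and
 * otherwise it simply passes the value through.
 */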

void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest,
                            int stride) {
  tran_low_t out[16 * 16] = { 0 };
  tran_low_t *outptr = out;
  int i, j;
  tran_low_t temp_in[16], temp_out[16];

  // First transform rows. Since all non-zero DCT coefficients are in the
  // upper-left 4x4 area, only the first 4 rows need to be calculated here.
  for (i = 0; i < 4; ++i) {
    idct16_c(input, outptr);
    input += 16;
    outptr += 16;
  }

  // Then transform columns
  for (i = 0; i < 16; ++i) {
    for (j = 0; j < 16; ++j)
      temp_in[j] = out[j * 16 + i];
    idct16_c(temp_in, temp_out);
    for (j = 0; j < 16; ++j) {
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
    }
  }
}

void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  int i, j;
  tran_high_t a1;
  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
  a1 = ROUND_POWER_OF_TWO(out, 6);
  for (j = 0; j < 16; ++j) {
    for (i = 0; i < 16; ++i)
      dest[i] = clip_pixel_add(dest[i], a1);
    dest += stride;
  }
}

void idct32_c(const tran_low_t *input, tran_low_t *output) {
  tran_low_t step1[32], step2[32];
  tran_high_t temp1, temp2;

  // stage 1
  step1[0] = input[0];
  step1[1] = input[16];
  step1[2] = input[8];
  step1[3] = input[24];
  step1[4] = input[4];
  step1[5] = input[20];
  step1[6] = input[12];
  step1[7] = input[28];
  step1[8] = input[2];
  step1[9] = input[18];
  step1[10] = input[10];
  step1[11] = input[26];
  step1[12] = input[6];
  step1[13] = input[22];
  step1[14] = input[14];
  step1[15] = input[30];

  temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
  temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
  step1[16] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[31] = WRAPLOW(dct_const_round_shift(temp2), 8);

  temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
  temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
  step1[17] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[30] = WRAPLOW(dct_const_round_shift(temp2), 8);

  temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
  temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
  step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8);

  temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
  temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
  step1[19] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[28] = WRAPLOW(dct_const_round_shift(temp2), 8);

  temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
  temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
  step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8);

  temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
  temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
  step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);

  temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
  temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
  step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8);

  temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
  temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
  step1[23] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[24] = WRAPLOW(dct_const_round_shift(temp2), 8);

  // stage 2
  step2[0] = step1[0];
  step2[1] = step1[1];
  step2[2] = step1[2];
  step2[3] = step1[3];
  step2[4] = step1[4];
  step2[5] = step1[5];
  step2[6] = step1[6];
  step2[7] = step1[7];

  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
  step2[8] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[15] = WRAPLOW(dct_const_round_shift(temp2), 8);

  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
  step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);

  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);

  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
  step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);

  step2[16] = WRAPLOW(step1[16] + step1[17], 8);
  step2[17] = WRAPLOW(step1[16] - step1[17], 8);
  step2[18] = WRAPLOW(-step1[18] + step1[19], 8);
  step2[19] = WRAPLOW(step1[18] + step1[19], 8);
  step2[20] = WRAPLOW(step1[20] + step1[21], 8);
  step2[21] = WRAPLOW(step1[20] - step1[21], 8);
  step2[22] = WRAPLOW(-step1[22] + step1[23], 8);
  step2[23] = WRAPLOW(step1[22] + step1[23], 8);
  step2[24] = WRAPLOW(step1[24] + step1[25], 8);
  step2[25] = WRAPLOW(step1[24] - step1[25], 8);
  step2[26] = WRAPLOW(-step1[26] + step1[27], 8);
  step2[27] = WRAPLOW(step1[26] + step1[27], 8);
  step2[28] = WRAPLOW(step1[28] + step1[29], 8);
  step2[29] = WRAPLOW(step1[28] - step1[29], 8);
  step2[30] = WRAPLOW(-step1[30] + step1[31], 8);
  step2[31] = WRAPLOW(step1[30] + step1[31], 8);

  // stage 3
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[2];
  step1[3] = step2[3];

  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
  step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);

  step1[8] = WRAPLOW(step2[8] + step2[9], 8);
  step1[9] = WRAPLOW(step2[8] - step2[9], 8);
  step1[10] = WRAPLOW(-step2[10] + step2[11], 8);
  step1[11] = WRAPLOW(step2[10] + step2[11], 8);
  step1[12] = WRAPLOW(step2[12] + step2[13], 8);
  step1[13] = WRAPLOW(step2[12] - step2[13], 8);
  step1[14] = WRAPLOW(-step2[14] + step2[15], 8);
  step1[15] = WRAPLOW(step2[14] + step2[15], 8);

  step1[16] = step2[16];
  step1[31] = step2[31];
  temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
  temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
  step1[17] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[30] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
  temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
  step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8);
  step1[19] = step2[19];
  step1[20] = step2[20];
  temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
  temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
  step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
  temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
  step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8);
  step1[23] = step2[23];
  step1[24] = step2[24];
  step1[27] = step2[27];
  step1[28] = step2[28];

  // stage 4
  temp1 = (step1[0] + step1[1]) * cospi_16_64;
  temp2 = (step1[0] - step1[1]) * cospi_16_64;
  step2[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
  step2[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
  step2[4] = WRAPLOW(step1[4] + step1[5], 8);
  step2[5] = WRAPLOW(step1[4] - step1[5], 8);
  step2[6] = WRAPLOW(-step1[6] + step1[7], 8);
  step2[7] = WRAPLOW(step1[6] + step1[7], 8);

  step2[8] = step1[8];
  step2[15] = step1[15];
  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
  step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
  step2[11] = step1[11];
  step2[12] = step1[12];

  step2[16] = WRAPLOW(step1[16] + step1[19], 8);
  step2[17] = WRAPLOW(step1[17] + step1[18], 8);
  step2[18] = WRAPLOW(step1[17] - step1[18], 8);
  step2[19] = WRAPLOW(step1[16] - step1[19], 8);
  step2[20] = WRAPLOW(-step1[20] + step1[23], 8);
  step2[21] = WRAPLOW(-step1[21] + step1[22], 8);
  step2[22] = WRAPLOW(step1[21] + step1[22], 8);
  step2[23] = WRAPLOW(step1[20] + step1[23], 8);

  step2[24] = WRAPLOW(step1[24] + step1[27], 8);
  step2[25] = WRAPLOW(step1[25] + step1[26], 8);
  step2[26] = WRAPLOW(step1[25] - step1[26], 8);
  step2[27] = WRAPLOW(step1[24] - step1[27], 8);
  step2[28] = WRAPLOW(-step1[28] + step1[31], 8);
  step2[29] = WRAPLOW(-step1[29] + step1[30], 8);
  step2[30] = WRAPLOW(step1[29] + step1[30], 8);
  step2[31] = WRAPLOW(step1[28] + step1[31], 8);

  // stage 5
  step1[0] = WRAPLOW(step2[0] + step2[3], 8);
  step1[1] = WRAPLOW(step2[1] + step2[2], 8);
  step1[2] = WRAPLOW(step2[1] - step2[2], 8);
  step1[3] = WRAPLOW(step2[0] - step2[3], 8);
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
  step1[7] = step2[7];

  step1[8] = WRAPLOW(step2[8] + step2[11], 8);
  step1[9] = WRAPLOW(step2[9] + step2[10], 8);
  step1[10] = WRAPLOW(step2[9] - step2[10], 8);
  step1[11] = WRAPLOW(step2[8] - step2[11], 8);
  step1[12] = WRAPLOW(-step2[12] + step2[15], 8);
  step1[13] = WRAPLOW(-step2[13] + step2[14], 8);
  step1[14] = WRAPLOW(step2[13] + step2[14], 8);
  step1[15] = WRAPLOW(step2[12] + step2[15], 8);

  step1[16] = step2[16];
  step1[17] = step2[17];
  temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
  temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
  step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
  temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
  step1[19] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[28] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
  temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
  step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
  temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
  step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
  step1[22] = step2[22];
  step1[23] = step2[23];
  step1[24] = step2[24];
  step1[25] = step2[25];
  step1[30] = step2[30];
  step1[31] = step2[31];

  // stage 6
  step2[0] = WRAPLOW(step1[0] + step1[7], 8);
  step2[1] = WRAPLOW(step1[1] + step1[6], 8);
  step2[2] = WRAPLOW(step1[2] + step1[5], 8);
  step2[3] = WRAPLOW(step1[3] + step1[4], 8);
  step2[4] = WRAPLOW(step1[3] - step1[4], 8);
  step2[5] = WRAPLOW(step1[2] - step1[5], 8);
  step2[6] = WRAPLOW(step1[1] - step1[6], 8);
  step2[7] = WRAPLOW(step1[0] - step1[7], 8);
  step2[8] = step1[8];
  step2[9] = step1[9];
  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
  temp2 = (step1[10] + step1[13]) * cospi_16_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
  temp2 = (step1[11] + step1[12]) * cospi_16_64;
  step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
  step2[14] = step1[14];
  step2[15] = step1[15];

  step2[16] = WRAPLOW(step1[16] + step1[23], 8);
  step2[17] = WRAPLOW(step1[17] + step1[22], 8);
  step2[18] = WRAPLOW(step1[18] + step1[21], 8);
  step2[19] = WRAPLOW(step1[19] + step1[20], 8);
  step2[20] = WRAPLOW(step1[19] - step1[20], 8);
  step2[21] = WRAPLOW(step1[18] - step1[21], 8);
  step2[22] = WRAPLOW(step1[17] - step1[22], 8);
  step2[23] = WRAPLOW(step1[16] - step1[23], 8);

  step2[24] = WRAPLOW(-step1[24] + step1[31], 8);
  step2[25] = WRAPLOW(-step1[25] + step1[30], 8);
  step2[26] = WRAPLOW(-step1[26] + step1[29], 8);
  step2[27] = WRAPLOW(-step1[27] + step1[28], 8);
  step2[28] = WRAPLOW(step1[27] + step1[28], 8);
  step2[29] = WRAPLOW(step1[26] + step1[29], 8);
  step2[30] = WRAPLOW(step1[25] + step1[30], 8);
  step2[31] = WRAPLOW(step1[24] + step1[31], 8);

  // stage 7
  step1[0] = WRAPLOW(step2[0] + step2[15], 8);
  step1[1] = WRAPLOW(step2[1] + step2[14], 8);
  step1[2] = WRAPLOW(step2[2] + step2[13], 8);
  step1[3] = WRAPLOW(step2[3] + step2[12], 8);
  step1[4] = WRAPLOW(step2[4] + step2[11], 8);
  step1[5] = WRAPLOW(step2[5] + step2[10], 8);
  step1[6] = WRAPLOW(step2[6] + step2[9], 8);
  step1[7] = WRAPLOW(step2[7] + step2[8], 8);
  step1[8] = WRAPLOW(step2[7] - step2[8], 8);
  step1[9] = WRAPLOW(step2[6] - step2[9], 8);
  step1[10] = WRAPLOW(step2[5] - step2[10], 8);
  step1[11] = WRAPLOW(step2[4] - step2[11], 8);
  step1[12] = WRAPLOW(step2[3] - step2[12], 8);
  step1[13] = WRAPLOW(step2[2] - step2[13], 8);
  step1[14] = WRAPLOW(step2[1] - step2[14], 8);
  step1[15] = WRAPLOW(step2[0] - step2[15], 8);

  step1[16] = step2[16];
  step1[17] = step2[17];
  step1[18] = step2[18];
  step1[19] = step2[19];
  temp1 = (-step2[20] + step2[27]) * cospi_16_64;
  temp2 = (step2[20] + step2[27]) * cospi_16_64;
  step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = (-step2[21] + step2[26]) * cospi_16_64;
  temp2 = (step2[21] + step2[26]) * cospi_16_64;
  step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = (-step2[22] + step2[25]) * cospi_16_64;
  temp2 = (step2[22] + step2[25]) * cospi_16_64;
  step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = (-step2[23] + step2[24]) * cospi_16_64;
  temp2 = (step2[23] + step2[24]) * cospi_16_64;
  step1[23] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[24] = WRAPLOW(dct_const_round_shift(temp2), 8);
  step1[28] = step2[28];
  step1[29] = step2[29];
  step1[30] = step2[30];
  step1[31] = step2[31];

  // final stage
  output[0] = WRAPLOW(step1[0] + step1[31], 8);
  output[1] = WRAPLOW(step1[1] + step1[30], 8);
  output[2] = WRAPLOW(step1[2] + step1[29], 8);
  output[3] = WRAPLOW(step1[3] + step1[28], 8);
  output[4] = WRAPLOW(step1[4] + step1[27], 8);
  output[5] = WRAPLOW(step1[5] + step1[26], 8);
  output[6] = WRAPLOW(step1[6] + step1[25], 8);
  output[7] = WRAPLOW(step1[7] + step1[24], 8);
  output[8] = WRAPLOW(step1[8] + step1[23], 8);
  output[9] = WRAPLOW(step1[9] + step1[22], 8);
  output[10] = WRAPLOW(step1[10] + step1[21], 8);
  output[11] = WRAPLOW(step1[11] + step1[20], 8);
  output[12] = WRAPLOW(step1[12] + step1[19], 8);
  output[13] = WRAPLOW(step1[13] + step1[18], 8);
  output[14] = WRAPLOW(step1[14] + step1[17], 8);
  output[15] = WRAPLOW(step1[15] + step1[16], 8);
  output[16] = WRAPLOW(step1[15] - step1[16], 8);
  output[17] = WRAPLOW(step1[14] - step1[17], 8);
  output[18] = WRAPLOW(step1[13] - step1[18], 8);
  output[19] = WRAPLOW(step1[12] - step1[19], 8);
  output[20] = WRAPLOW(step1[11] - step1[20], 8);
  output[21] = WRAPLOW(step1[10] - step1[21], 8);
  output[22] = WRAPLOW(step1[9] - step1[22], 8);
  output[23] = WRAPLOW(step1[8] - step1[23], 8);
  output[24] = WRAPLOW(step1[7] - step1[24], 8);
  output[25] = WRAPLOW(step1[6] - step1[25], 8);
  output[26] = WRAPLOW(step1[5] - step1[26], 8);
  output[27] = WRAPLOW(step1[4] - step1[27], 8);
  output[28] = WRAPLOW(step1[3] - step1[28], 8);
  output[29] = WRAPLOW(step1[2] - step1[29], 8);
  output[30] = WRAPLOW(step1[1] - step1[30], 8);
  output[31] = WRAPLOW(step1[0] - step1[31], 8);
}

void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
                              int stride) {
  tran_low_t out[32 * 32];
  tran_low_t *outptr = out;
  int i, j;
  tran_low_t temp_in[32], temp_out[32];

  // Rows
  for (i = 0; i < 32; ++i) {
    int16_t zero_coeff[16];
    for (j = 0; j < 16; ++j)
      zero_coeff[j] = input[2 * j] | input[2 * j + 1];
    for (j = 0; j < 8; ++j)
      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
    for (j = 0; j < 4; ++j)
      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
    for (j = 0; j < 2; ++j)
      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];

    if (zero_coeff[0] | zero_coeff[1])
      idct32_c(input, outptr);
    else
      memset(outptr, 0, sizeof(tran_low_t) * 32);
    input += 32;
    outptr += 32;
  }

  // Columns
  for (i = 0; i < 32; ++i) {
    for (j = 0; j < 32; ++j)
      temp_in[j] = out[j * 32 + i];
    idct32_c(temp_in, temp_out);
    for (j = 0; j < 32; ++j) {
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
    }
  }
}
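
/* Added note: the zero_coeff[] logic above is a pairwise OR-reduction over
 * the 32 coefficients of a row (16, then 8, 4 and 2 partial ORs), so
 * zero_coeff[0] | zero_coeff[1] is non-zero exactly when some coefficient
 * in the row is non-zero. Rows that are entirely zero, which is common for
 * 32x32 blocks, then cost one memset instead of a full idct32_c call.
 */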

void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest,
                            int stride) {
  tran_low_t out[32 * 32] = { 0 };
  tran_low_t *outptr = out;
  int i, j;
  tran_low_t temp_in[32], temp_out[32];

  // Rows
  // Only the upper-left 8x8 block has non-zero coefficients.
  for (i = 0; i < 8; ++i) {
    idct32_c(input, outptr);
    input += 32;
    outptr += 32;
  }

  // Columns
  for (i = 0; i < 32; ++i) {
    for (j = 0; j < 32; ++j)
      temp_in[j] = out[j * 32 + i];
    idct32_c(temp_in, temp_out);
    for (j = 0; j < 32; ++j) {
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
    }
  }
}

void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  int i, j;
  tran_high_t a1;

  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
  a1 = ROUND_POWER_OF_TWO(out, 6);

  for (j = 0; j < 32; ++j) {
    for (i = 0; i < 32; ++i)
      dest[i] = clip_pixel_add(dest[i], a1);
    dest += stride;
  }
}
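
/* Added note: everything below mirrors the 8-bit functions above for
 * CONFIG_VP9_HIGHBITDEPTH builds. The differences are mechanical: `dest8`
 * is reinterpreted as a uint16_t pixel pointer via CONVERT_TO_SHORTPTR(),
 * rounding goes through highbd_dct_const_round_shift(..., bd), clipping
 * through highbd_clip_pixel_add(..., bd), and WRAPLOW receives the actual
 * bit depth `bd` (8, 10 or 12) instead of the constant 8.
 */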
1229
1230 #if CONFIG_VP9_HIGHBITDEPTH
vpx_highbd_iwht4x4_16_add_c(const tran_low_t * input,uint8_t * dest8,int stride,int bd)1231 void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
1232 int stride, int bd) {
1233 /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
1234 0.5 shifts per pixel. */
1235 int i;
1236 tran_low_t output[16];
1237 tran_high_t a1, b1, c1, d1, e1;
1238 const tran_low_t *ip = input;
1239 tran_low_t *op = output;
1240 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
1241
1242 for (i = 0; i < 4; i++) {
1243 a1 = ip[0] >> UNIT_QUANT_SHIFT;
1244 c1 = ip[1] >> UNIT_QUANT_SHIFT;
1245 d1 = ip[2] >> UNIT_QUANT_SHIFT;
1246 b1 = ip[3] >> UNIT_QUANT_SHIFT;
1247 a1 += c1;
1248 d1 -= b1;
1249 e1 = (a1 - d1) >> 1;
1250 b1 = e1 - b1;
1251 c1 = e1 - c1;
1252 a1 -= b1;
1253 d1 += c1;
1254 op[0] = WRAPLOW(a1, bd);
1255 op[1] = WRAPLOW(b1, bd);
1256 op[2] = WRAPLOW(c1, bd);
1257 op[3] = WRAPLOW(d1, bd);
1258 ip += 4;
1259 op += 4;
1260 }
1261
1262 ip = output;
1263 for (i = 0; i < 4; i++) {
1264 a1 = ip[4 * 0];
1265 c1 = ip[4 * 1];
1266 d1 = ip[4 * 2];
1267 b1 = ip[4 * 3];
1268 a1 += c1;
1269 d1 -= b1;
1270 e1 = (a1 - d1) >> 1;
1271 b1 = e1 - b1;
1272 c1 = e1 - c1;
1273 a1 -= b1;
1274 d1 += c1;
1275 dest[stride * 0] = highbd_clip_pixel_add(dest[stride * 0], a1, bd);
1276 dest[stride * 1] = highbd_clip_pixel_add(dest[stride * 1], b1, bd);
1277 dest[stride * 2] = highbd_clip_pixel_add(dest[stride * 2], c1, bd);
1278 dest[stride * 3] = highbd_clip_pixel_add(dest[stride * 3], d1, bd);
1279
1280 ip++;
1281 dest++;
1282 }
1283 }
1284
vpx_highbd_iwht4x4_1_add_c(const tran_low_t * in,uint8_t * dest8,int dest_stride,int bd)1285 void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8,
1286 int dest_stride, int bd) {
1287 int i;
1288 tran_high_t a1, e1;
1289 tran_low_t tmp[4];
1290 const tran_low_t *ip = in;
1291 tran_low_t *op = tmp;
1292 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
1293 (void) bd;
1294
1295 a1 = ip[0] >> UNIT_QUANT_SHIFT;
1296 e1 = a1 >> 1;
1297 a1 -= e1;
1298 op[0] = WRAPLOW(a1, bd);
1299 op[1] = op[2] = op[3] = WRAPLOW(e1, bd);
1300
1301 ip = tmp;
1302 for (i = 0; i < 4; i++) {
1303 e1 = ip[0] >> 1;
1304 a1 = ip[0] - e1;
1305 dest[dest_stride * 0] = highbd_clip_pixel_add(
1306 dest[dest_stride * 0], a1, bd);
1307 dest[dest_stride * 1] = highbd_clip_pixel_add(
1308 dest[dest_stride * 1], e1, bd);
1309 dest[dest_stride * 2] = highbd_clip_pixel_add(
1310 dest[dest_stride * 2], e1, bd);
1311 dest[dest_stride * 3] = highbd_clip_pixel_add(
1312 dest[dest_stride * 3], e1, bd);
1313 ip++;
1314 dest++;
1315 }
1316 }
1317
vpx_highbd_idct4_c(const tran_low_t * input,tran_low_t * output,int bd)1318 void vpx_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) {
1319 tran_low_t step[4];
1320 tran_high_t temp1, temp2;
1321 (void) bd;
1322 // stage 1
1323 temp1 = (input[0] + input[2]) * cospi_16_64;
1324 temp2 = (input[0] - input[2]) * cospi_16_64;
1325 step[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
1326 step[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
1327 temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
1328 temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
1329 step[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
1330 step[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
1331
1332 // stage 2
1333 output[0] = WRAPLOW(step[0] + step[3], bd);
1334 output[1] = WRAPLOW(step[1] + step[2], bd);
1335 output[2] = WRAPLOW(step[1] - step[2], bd);
1336 output[3] = WRAPLOW(step[0] - step[3], bd);
1337 }
1338
vpx_highbd_idct4x4_16_add_c(const tran_low_t * input,uint8_t * dest8,int stride,int bd)1339 void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
1340 int stride, int bd) {
1341 tran_low_t out[4 * 4];
1342 tran_low_t *outptr = out;
1343 int i, j;
1344 tran_low_t temp_in[4], temp_out[4];
1345 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
1346
1347 // Rows
1348 for (i = 0; i < 4; ++i) {
1349 vpx_highbd_idct4_c(input, outptr, bd);
1350 input += 4;
1351 outptr += 4;
1352 }
1353
1354 // Columns
1355 for (i = 0; i < 4; ++i) {
1356 for (j = 0; j < 4; ++j)
1357 temp_in[j] = out[j * 4 + i];
1358 vpx_highbd_idct4_c(temp_in, temp_out, bd);
1359 for (j = 0; j < 4; ++j) {
1360 dest[j * stride + i] = highbd_clip_pixel_add(
1361 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
1362 }
1363 }
1364 }
1365
vpx_highbd_idct4x4_1_add_c(const tran_low_t * input,uint8_t * dest8,int dest_stride,int bd)1366 void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8,
1367 int dest_stride, int bd) {
1368 int i;
1369 tran_high_t a1;
1370 tran_low_t out = WRAPLOW(
1371 highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
1372 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
1373
1374 out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
1375 a1 = ROUND_POWER_OF_TWO(out, 4);
1376
1377 for (i = 0; i < 4; i++) {
1378 dest[0] = highbd_clip_pixel_add(dest[0], a1, bd);
1379 dest[1] = highbd_clip_pixel_add(dest[1], a1, bd);
1380 dest[2] = highbd_clip_pixel_add(dest[2], a1, bd);
1381 dest[3] = highbd_clip_pixel_add(dest[3], a1, bd);
1382 dest += dest_stride;
1383 }
1384 }
1385
vpx_highbd_idct8_c(const tran_low_t * input,tran_low_t * output,int bd)1386 void vpx_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) {
1387 tran_low_t step1[8], step2[8];
1388 tran_high_t temp1, temp2;
1389 // stage 1
1390 step1[0] = input[0];
1391 step1[2] = input[4];
1392 step1[1] = input[2];
1393 step1[3] = input[6];
1394 temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
1395 temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
1396 step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
1397 step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
1398 temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
1399 temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
1400 step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
1401 step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
1402
1403 // stage 2 & stage 3 - even half
1404 vpx_highbd_idct4_c(step1, step1, bd);
1405
1406 // stage 2 - odd half
1407 step2[4] = WRAPLOW(step1[4] + step1[5], bd);
1408 step2[5] = WRAPLOW(step1[4] - step1[5], bd);
1409 step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
1410 step2[7] = WRAPLOW(step1[6] + step1[7], bd);
1411
1412 // stage 3 - odd half
1413 step1[4] = step2[4];
1414 temp1 = (step2[6] - step2[5]) * cospi_16_64;
1415 temp2 = (step2[5] + step2[6]) * cospi_16_64;
1416 step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
1417 step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
1418 step1[7] = step2[7];
1419
1420 // stage 4
1421 output[0] = WRAPLOW(step1[0] + step1[7], bd);
1422 output[1] = WRAPLOW(step1[1] + step1[6], bd);
1423 output[2] = WRAPLOW(step1[2] + step1[5], bd);
1424 output[3] = WRAPLOW(step1[3] + step1[4], bd);
1425 output[4] = WRAPLOW(step1[3] - step1[4], bd);
1426 output[5] = WRAPLOW(step1[2] - step1[5], bd);
1427 output[6] = WRAPLOW(step1[1] - step1[6], bd);
1428 output[7] = WRAPLOW(step1[0] - step1[7], bd);
1429 }
1430
vpx_highbd_idct8x8_64_add_c(const tran_low_t * input,uint8_t * dest8,int stride,int bd)1431 void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
1432 int stride, int bd) {
1433 tran_low_t out[8 * 8];
1434 tran_low_t *outptr = out;
1435 int i, j;
1436 tran_low_t temp_in[8], temp_out[8];
1437 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
1438
1439 // First transform rows.
1440 for (i = 0; i < 8; ++i) {
1441 vpx_highbd_idct8_c(input, outptr, bd);
1442 input += 8;
1443 outptr += 8;
1444 }
1445
1446 // Then transform columns.
1447 for (i = 0; i < 8; ++i) {
1448 for (j = 0; j < 8; ++j)
1449 temp_in[j] = out[j * 8 + i];
1450 vpx_highbd_idct8_c(temp_in, temp_out, bd);
1451 for (j = 0; j < 8; ++j) {
1452 dest[j * stride + i] = highbd_clip_pixel_add(
1453 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
1454 }
1455 }
1456 }
1457
vpx_highbd_idct8x8_1_add_c(const tran_low_t * input,uint8_t * dest8,int stride,int bd)1458 void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8,
1459 int stride, int bd) {
1460 int i, j;
1461 tran_high_t a1;
1462 tran_low_t out = WRAPLOW(
1463 highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
1464 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
1465 out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
1466 a1 = ROUND_POWER_OF_TWO(out, 5);
1467 for (j = 0; j < 8; ++j) {
1468 for (i = 0; i < 8; ++i)
1469 dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
1470 dest += stride;
1471 }
1472 }
1473
vpx_highbd_iadst4_c(const tran_low_t * input,tran_low_t * output,int bd)1474 void vpx_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd) {
1475 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
1476
1477 tran_low_t x0 = input[0];
1478 tran_low_t x1 = input[1];
1479 tran_low_t x2 = input[2];
1480 tran_low_t x3 = input[3];
1481 (void) bd;
1482
1483 if (!(x0 | x1 | x2 | x3)) {
1484 memset(output, 0, 4 * sizeof(*output));
1485 return;
1486 }
1487
1488 s0 = sinpi_1_9 * x0;
1489 s1 = sinpi_2_9 * x0;
1490 s2 = sinpi_3_9 * x1;
1491 s3 = sinpi_4_9 * x2;
1492 s4 = sinpi_1_9 * x2;
1493 s5 = sinpi_2_9 * x3;
1494 s6 = sinpi_4_9 * x3;
1495 s7 = (tran_high_t)(x0 - x2 + x3);
1496
1497 s0 = s0 + s3 + s5;
1498 s1 = s1 - s4 - s6;
1499 s3 = s2;
1500 s2 = sinpi_3_9 * s7;
1501
1502 // 1-D transform scaling factor is sqrt(2).
1503 // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
1504 // + 1b (addition) = 29b.
1505 // Hence the output bit depth is 15b.
1506 output[0] = WRAPLOW(highbd_dct_const_round_shift(s0 + s3, bd), bd);
1507 output[1] = WRAPLOW(highbd_dct_const_round_shift(s1 + s3, bd), bd);
1508 output[2] = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd);
1509 output[3] = WRAPLOW(highbd_dct_const_round_shift(s0 + s1 - s3, bd), bd);
1510 }
1511
void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;

  tran_low_t x0 = input[7];
  tran_low_t x1 = input[0];
  tran_low_t x2 = input[5];
  tran_low_t x3 = input[2];
  tran_low_t x4 = input[3];
  tran_low_t x5 = input[4];
  tran_low_t x6 = input[1];
  tran_low_t x7 = input[6];
  (void) bd;

  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
    memset(output, 0, 8 * sizeof(*output));
    return;
  }

  // stage 1
  s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
  s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
  s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
  s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
  s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
  s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
  s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
  s7 = cospi_6_64 * x6 - cospi_26_64 * x7;

  x0 = WRAPLOW(highbd_dct_const_round_shift(s0 + s4, bd), bd);
  x1 = WRAPLOW(highbd_dct_const_round_shift(s1 + s5, bd), bd);
  x2 = WRAPLOW(highbd_dct_const_round_shift(s2 + s6, bd), bd);
  x3 = WRAPLOW(highbd_dct_const_round_shift(s3 + s7, bd), bd);
  x4 = WRAPLOW(highbd_dct_const_round_shift(s0 - s4, bd), bd);
  x5 = WRAPLOW(highbd_dct_const_round_shift(s1 - s5, bd), bd);
  x6 = WRAPLOW(highbd_dct_const_round_shift(s2 - s6, bd), bd);
  x7 = WRAPLOW(highbd_dct_const_round_shift(s3 - s7, bd), bd);

  // stage 2
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
  s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
  s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
  s7 = cospi_8_64 * x6 + cospi_24_64 * x7;

  x0 = WRAPLOW(s0 + s2, bd);
  x1 = WRAPLOW(s1 + s3, bd);
  x2 = WRAPLOW(s0 - s2, bd);
  x3 = WRAPLOW(s1 - s3, bd);
  x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s6, bd), bd);
  x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s7, bd), bd);
  x6 = WRAPLOW(highbd_dct_const_round_shift(s4 - s6, bd), bd);
  x7 = WRAPLOW(highbd_dct_const_round_shift(s5 - s7, bd), bd);

  // stage 3
  s2 = cospi_16_64 * (x2 + x3);
  s3 = cospi_16_64 * (x2 - x3);
  s6 = cospi_16_64 * (x6 + x7);
  s7 = cospi_16_64 * (x6 - x7);

  x2 = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd);
  x3 = WRAPLOW(highbd_dct_const_round_shift(s3, bd), bd);
  x6 = WRAPLOW(highbd_dct_const_round_shift(s6, bd), bd);
  x7 = WRAPLOW(highbd_dct_const_round_shift(s7, bd), bd);

  output[0] = WRAPLOW(x0, bd);
  output[1] = WRAPLOW(-x4, bd);
  output[2] = WRAPLOW(x6, bd);
  output[3] = WRAPLOW(-x2, bd);
  output[4] = WRAPLOW(x3, bd);
  output[5] = WRAPLOW(-x7, bd);
  output[6] = WRAPLOW(x5, bd);
  output[7] = WRAPLOW(-x1, bd);
}

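// Variant for the case of at most 10 non-zero coefficients (hence the _10
// suffix); they all lie in the top-left 4x4 of the 8x8 block, so only 4
// row transforms are needed.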
void vpx_highbd_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest8,
                                 int stride, int bd) {
  tran_low_t out[8 * 8] = { 0 };
  tran_low_t *outptr = out;
  int i, j;
  tran_low_t temp_in[8], temp_out[8];
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  // First transform rows.
  // Only the first 4 rows have non-zero coefficients.
  for (i = 0; i < 4; ++i) {
    vpx_highbd_idct8_c(input, outptr, bd);
    input += 8;
    outptr += 8;
  }
  // Then transform columns.
  for (i = 0; i < 8; ++i) {
    for (j = 0; j < 8; ++j)
      temp_in[j] = out[j * 8 + i];
    vpx_highbd_idct8_c(temp_in, temp_out, bd);
    for (j = 0; j < 8; ++j) {
      dest[j * stride + i] = highbd_clip_pixel_add(
          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
    }
  }
}

void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_low_t step1[16], step2[16];
  tran_high_t temp1, temp2;
  (void) bd;

  // stage 1
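  // The n/2 index expressions preserve the coefficient ordering of the
  // 32-point transform below; the divisions fold to constants at compile
  // time.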
  step1[0] = input[0/2];
  step1[1] = input[16/2];
  step1[2] = input[8/2];
  step1[3] = input[24/2];
  step1[4] = input[4/2];
  step1[5] = input[20/2];
  step1[6] = input[12/2];
  step1[7] = input[28/2];
  step1[8] = input[2/2];
  step1[9] = input[18/2];
  step1[10] = input[10/2];
  step1[11] = input[26/2];
  step1[12] = input[6/2];
  step1[13] = input[22/2];
  step1[14] = input[14/2];
  step1[15] = input[30/2];

  // stage 2
  step2[0] = step1[0];
  step2[1] = step1[1];
  step2[2] = step1[2];
  step2[3] = step1[3];
  step2[4] = step1[4];
  step2[5] = step1[5];
  step2[6] = step1[6];
  step2[7] = step1[7];

  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
  step2[8] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[15] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);

  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
  step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);

  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);

  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
  step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);

  // stage 3
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[2];
  step1[3] = step2[3];

  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
  step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);

  step1[8] = WRAPLOW(step2[8] + step2[9], bd);
  step1[9] = WRAPLOW(step2[8] - step2[9], bd);
  step1[10] = WRAPLOW(-step2[10] + step2[11], bd);
  step1[11] = WRAPLOW(step2[10] + step2[11], bd);
  step1[12] = WRAPLOW(step2[12] + step2[13], bd);
  step1[13] = WRAPLOW(step2[12] - step2[13], bd);
  step1[14] = WRAPLOW(-step2[14] + step2[15], bd);
  step1[15] = WRAPLOW(step2[14] + step2[15], bd);

  // stage 4
  temp1 = (step1[0] + step1[1]) * cospi_16_64;
  temp2 = (step1[0] - step1[1]) * cospi_16_64;
  step2[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
  step2[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  step2[4] = WRAPLOW(step1[4] + step1[5], bd);
  step2[5] = WRAPLOW(step1[4] - step1[5], bd);
  step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
  step2[7] = WRAPLOW(step1[6] + step1[7], bd);

  step2[8] = step1[8];
  step2[15] = step1[15];
  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
  step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  step2[11] = step1[11];
  step2[12] = step1[12];

  // stage 5
  step1[0] = WRAPLOW(step2[0] + step2[3], bd);
  step1[1] = WRAPLOW(step2[1] + step2[2], bd);
  step1[2] = WRAPLOW(step2[1] - step2[2], bd);
  step1[3] = WRAPLOW(step2[0] - step2[3], bd);
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  step1[7] = step2[7];

  step1[8] = WRAPLOW(step2[8] + step2[11], bd);
  step1[9] = WRAPLOW(step2[9] + step2[10], bd);
  step1[10] = WRAPLOW(step2[9] - step2[10], bd);
  step1[11] = WRAPLOW(step2[8] - step2[11], bd);
  step1[12] = WRAPLOW(-step2[12] + step2[15], bd);
  step1[13] = WRAPLOW(-step2[13] + step2[14], bd);
  step1[14] = WRAPLOW(step2[13] + step2[14], bd);
  step1[15] = WRAPLOW(step2[12] + step2[15], bd);

  // stage 6
  step2[0] = WRAPLOW(step1[0] + step1[7], bd);
  step2[1] = WRAPLOW(step1[1] + step1[6], bd);
  step2[2] = WRAPLOW(step1[2] + step1[5], bd);
  step2[3] = WRAPLOW(step1[3] + step1[4], bd);
  step2[4] = WRAPLOW(step1[3] - step1[4], bd);
  step2[5] = WRAPLOW(step1[2] - step1[5], bd);
  step2[6] = WRAPLOW(step1[1] - step1[6], bd);
  step2[7] = WRAPLOW(step1[0] - step1[7], bd);
  step2[8] = step1[8];
  step2[9] = step1[9];
  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
  temp2 = (step1[10] + step1[13]) * cospi_16_64;
  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
  temp2 = (step1[11] + step1[12]) * cospi_16_64;
  step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  step2[14] = step1[14];
  step2[15] = step1[15];

  // stage 7
  output[0] = WRAPLOW(step2[0] + step2[15], bd);
  output[1] = WRAPLOW(step2[1] + step2[14], bd);
  output[2] = WRAPLOW(step2[2] + step2[13], bd);
  output[3] = WRAPLOW(step2[3] + step2[12], bd);
  output[4] = WRAPLOW(step2[4] + step2[11], bd);
  output[5] = WRAPLOW(step2[5] + step2[10], bd);
  output[6] = WRAPLOW(step2[6] + step2[9], bd);
  output[7] = WRAPLOW(step2[7] + step2[8], bd);
  output[8] = WRAPLOW(step2[7] - step2[8], bd);
  output[9] = WRAPLOW(step2[6] - step2[9], bd);
  output[10] = WRAPLOW(step2[5] - step2[10], bd);
  output[11] = WRAPLOW(step2[4] - step2[11], bd);
  output[12] = WRAPLOW(step2[3] - step2[12], bd);
  output[13] = WRAPLOW(step2[2] - step2[13], bd);
  output[14] = WRAPLOW(step2[1] - step2[14], bd);
  output[15] = WRAPLOW(step2[0] - step2[15], bd);
}

void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
                                    int stride, int bd) {
  tran_low_t out[16 * 16];
  tran_low_t *outptr = out;
  int i, j;
  tran_low_t temp_in[16], temp_out[16];
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  // First transform rows.
  for (i = 0; i < 16; ++i) {
    vpx_highbd_idct16_c(input, outptr, bd);
    input += 16;
    outptr += 16;
  }

  // Then transform columns.
  for (i = 0; i < 16; ++i) {
    for (j = 0; j < 16; ++j)
      temp_in[j] = out[j * 16 + i];
    vpx_highbd_idct16_c(temp_in, temp_out, bd);
    for (j = 0; j < 16; ++j) {
      dest[j * stride + i] = highbd_clip_pixel_add(
          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
    }
  }
}

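// 16-point inverse ADST; as in the 8-point version, the inputs are read in
// an interleaved order ({15, 0, 13, 2, ...}) to suit the butterflies below.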
void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output,
                          int bd) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
  tran_high_t s9, s10, s11, s12, s13, s14, s15;

  tran_low_t x0 = input[15];
  tran_low_t x1 = input[0];
  tran_low_t x2 = input[13];
  tran_low_t x3 = input[2];
  tran_low_t x4 = input[11];
  tran_low_t x5 = input[4];
  tran_low_t x6 = input[9];
  tran_low_t x7 = input[6];
  tran_low_t x8 = input[7];
  tran_low_t x9 = input[8];
  tran_low_t x10 = input[5];
  tran_low_t x11 = input[10];
  tran_low_t x12 = input[3];
  tran_low_t x13 = input[12];
  tran_low_t x14 = input[1];
  tran_low_t x15 = input[14];
  (void) bd;

  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
        | x9 | x10 | x11 | x12 | x13 | x14 | x15)) {
    memset(output, 0, 16 * sizeof(*output));
    return;
  }

  // stage 1
  s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
  s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
  s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
  s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
  s15 = x14 * cospi_3_64 - x15 * cospi_29_64;

  x0 = WRAPLOW(highbd_dct_const_round_shift(s0 + s8, bd), bd);
  x1 = WRAPLOW(highbd_dct_const_round_shift(s1 + s9, bd), bd);
  x2 = WRAPLOW(highbd_dct_const_round_shift(s2 + s10, bd), bd);
  x3 = WRAPLOW(highbd_dct_const_round_shift(s3 + s11, bd), bd);
  x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s12, bd), bd);
  x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s13, bd), bd);
  x6 = WRAPLOW(highbd_dct_const_round_shift(s6 + s14, bd), bd);
  x7 = WRAPLOW(highbd_dct_const_round_shift(s7 + s15, bd), bd);
  x8 = WRAPLOW(highbd_dct_const_round_shift(s0 - s8, bd), bd);
  x9 = WRAPLOW(highbd_dct_const_round_shift(s1 - s9, bd), bd);
  x10 = WRAPLOW(highbd_dct_const_round_shift(s2 - s10, bd), bd);
  x11 = WRAPLOW(highbd_dct_const_round_shift(s3 - s11, bd), bd);
  x12 = WRAPLOW(highbd_dct_const_round_shift(s4 - s12, bd), bd);
  x13 = WRAPLOW(highbd_dct_const_round_shift(s5 - s13, bd), bd);
  x14 = WRAPLOW(highbd_dct_const_round_shift(s6 - s14, bd), bd);
  x15 = WRAPLOW(highbd_dct_const_round_shift(s7 - s15, bd), bd);

  // stage 2
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4;
  s5 = x5;
  s6 = x6;
  s7 = x7;
  s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
  s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
  s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
  s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
  s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
  s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
  s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
  s15 = x14 * cospi_20_64 + x15 * cospi_12_64;

  x0 = WRAPLOW(s0 + s4, bd);
  x1 = WRAPLOW(s1 + s5, bd);
  x2 = WRAPLOW(s2 + s6, bd);
  x3 = WRAPLOW(s3 + s7, bd);
  x4 = WRAPLOW(s0 - s4, bd);
  x5 = WRAPLOW(s1 - s5, bd);
  x6 = WRAPLOW(s2 - s6, bd);
  x7 = WRAPLOW(s3 - s7, bd);
  x8 = WRAPLOW(highbd_dct_const_round_shift(s8 + s12, bd), bd);
  x9 = WRAPLOW(highbd_dct_const_round_shift(s9 + s13, bd), bd);
  x10 = WRAPLOW(highbd_dct_const_round_shift(s10 + s14, bd), bd);
  x11 = WRAPLOW(highbd_dct_const_round_shift(s11 + s15, bd), bd);
  x12 = WRAPLOW(highbd_dct_const_round_shift(s8 - s12, bd), bd);
  x13 = WRAPLOW(highbd_dct_const_round_shift(s9 - s13, bd), bd);
  x14 = WRAPLOW(highbd_dct_const_round_shift(s10 - s14, bd), bd);
  x15 = WRAPLOW(highbd_dct_const_round_shift(s11 - s15, bd), bd);

  // stage 3
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
  s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
  s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
  s8 = x8;
  s9 = x9;
  s10 = x10;
  s11 = x11;
  s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
  s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
  s15 = x14 * cospi_8_64 + x15 * cospi_24_64;

  x0 = WRAPLOW(s0 + s2, bd);
  x1 = WRAPLOW(s1 + s3, bd);
  x2 = WRAPLOW(s0 - s2, bd);
  x3 = WRAPLOW(s1 - s3, bd);
  x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s6, bd), bd);
  x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s7, bd), bd);
  x6 = WRAPLOW(highbd_dct_const_round_shift(s4 - s6, bd), bd);
  x7 = WRAPLOW(highbd_dct_const_round_shift(s5 - s7, bd), bd);
  x8 = WRAPLOW(s8 + s10, bd);
  x9 = WRAPLOW(s9 + s11, bd);
  x10 = WRAPLOW(s8 - s10, bd);
  x11 = WRAPLOW(s9 - s11, bd);
  x12 = WRAPLOW(highbd_dct_const_round_shift(s12 + s14, bd), bd);
  x13 = WRAPLOW(highbd_dct_const_round_shift(s13 + s15, bd), bd);
  x14 = WRAPLOW(highbd_dct_const_round_shift(s12 - s14, bd), bd);
  x15 = WRAPLOW(highbd_dct_const_round_shift(s13 - s15, bd), bd);

  // stage 4
  s2 = -cospi_16_64 * (x2 + x3);
  s3 = cospi_16_64 * (x2 - x3);
  s6 = cospi_16_64 * (x6 + x7);
  s7 = cospi_16_64 * (-x6 + x7);
  s10 = cospi_16_64 * (x10 + x11);
  s11 = cospi_16_64 * (-x10 + x11);
  s14 = -cospi_16_64 * (x14 + x15);
  s15 = cospi_16_64 * (x14 - x15);

  x2 = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd);
  x3 = WRAPLOW(highbd_dct_const_round_shift(s3, bd), bd);
  x6 = WRAPLOW(highbd_dct_const_round_shift(s6, bd), bd);
  x7 = WRAPLOW(highbd_dct_const_round_shift(s7, bd), bd);
  x10 = WRAPLOW(highbd_dct_const_round_shift(s10, bd), bd);
  x11 = WRAPLOW(highbd_dct_const_round_shift(s11, bd), bd);
  x14 = WRAPLOW(highbd_dct_const_round_shift(s14, bd), bd);
  x15 = WRAPLOW(highbd_dct_const_round_shift(s15, bd), bd);

  output[0] = WRAPLOW(x0, bd);
  output[1] = WRAPLOW(-x8, bd);
  output[2] = WRAPLOW(x12, bd);
  output[3] = WRAPLOW(-x4, bd);
  output[4] = WRAPLOW(x6, bd);
  output[5] = WRAPLOW(x14, bd);
  output[6] = WRAPLOW(x10, bd);
  output[7] = WRAPLOW(x2, bd);
  output[8] = WRAPLOW(x3, bd);
  output[9] = WRAPLOW(x11, bd);
  output[10] = WRAPLOW(x15, bd);
  output[11] = WRAPLOW(x7, bd);
  output[12] = WRAPLOW(x5, bd);
  output[13] = WRAPLOW(-x13, bd);
  output[14] = WRAPLOW(x9, bd);
  output[15] = WRAPLOW(-x1, bd);
}

void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8,
                                   int stride, int bd) {
  tran_low_t out[16 * 16] = { 0 };
  tran_low_t *outptr = out;
  int i, j;
  tran_low_t temp_in[16], temp_out[16];
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  // First transform rows. Since all non-zero DCT coefficients are in the
  // upper-left 4x4 area, we only need to calculate the first 4 rows here.
  for (i = 0; i < 4; ++i) {
    vpx_highbd_idct16_c(input, outptr, bd);
    input += 16;
    outptr += 16;
  }

  // Then transform columns.
  for (i = 0; i < 16; ++i) {
    for (j = 0; j < 16; ++j)
      temp_in[j] = out[j * 16 + i];
    vpx_highbd_idct16_c(temp_in, temp_out, bd);
    for (j = 0; j < 16; ++j) {
      dest[j * stride + i] = highbd_clip_pixel_add(
          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
    }
  }
}

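// DC-only 16x16 path: identical to the 8x8 DC case except for the final
// rounding shift of 6, which matches the larger transform size.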
void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest8,
                                  int stride, int bd) {
  int i, j;
  tran_high_t a1;
  tran_low_t out = WRAPLOW(
      highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
  a1 = ROUND_POWER_OF_TWO(out, 6);
  for (j = 0; j < 16; ++j) {
    for (i = 0; i < 16; ++i)
      dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
    dest += stride;
  }
}

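// 32-point 1-D inverse DCT. Static because it is only referenced by the
// 32x32 wrappers below.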
static void highbd_idct32_c(const tran_low_t *input,
                            tran_low_t *output, int bd) {
  tran_low_t step1[32], step2[32];
  tran_high_t temp1, temp2;
  (void) bd;

  // stage 1
  step1[0] = input[0];
  step1[1] = input[16];
  step1[2] = input[8];
  step1[3] = input[24];
  step1[4] = input[4];
  step1[5] = input[20];
  step1[6] = input[12];
  step1[7] = input[28];
  step1[8] = input[2];
  step1[9] = input[18];
  step1[10] = input[10];
  step1[11] = input[26];
  step1[12] = input[6];
  step1[13] = input[22];
  step1[14] = input[14];
  step1[15] = input[30];

  temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
  temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
  step1[16] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[31] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);

  temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
  temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
  step1[17] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[30] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);

  temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
  temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
  step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);

  temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
  temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
  step1[19] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[28] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);

  temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
  temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
  step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);

  temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
  temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
  step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);

  temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
  temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
  step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);

  temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
  temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
  step1[23] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[24] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);

  // stage 2
  step2[0] = step1[0];
  step2[1] = step1[1];
  step2[2] = step1[2];
  step2[3] = step1[3];
  step2[4] = step1[4];
  step2[5] = step1[5];
  step2[6] = step1[6];
  step2[7] = step1[7];

  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
  step2[8] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[15] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);

  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
  step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);

  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);

  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
  step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);

  step2[16] = WRAPLOW(step1[16] + step1[17], bd);
  step2[17] = WRAPLOW(step1[16] - step1[17], bd);
  step2[18] = WRAPLOW(-step1[18] + step1[19], bd);
  step2[19] = WRAPLOW(step1[18] + step1[19], bd);
  step2[20] = WRAPLOW(step1[20] + step1[21], bd);
  step2[21] = WRAPLOW(step1[20] - step1[21], bd);
  step2[22] = WRAPLOW(-step1[22] + step1[23], bd);
  step2[23] = WRAPLOW(step1[22] + step1[23], bd);
  step2[24] = WRAPLOW(step1[24] + step1[25], bd);
  step2[25] = WRAPLOW(step1[24] - step1[25], bd);
  step2[26] = WRAPLOW(-step1[26] + step1[27], bd);
  step2[27] = WRAPLOW(step1[26] + step1[27], bd);
  step2[28] = WRAPLOW(step1[28] + step1[29], bd);
  step2[29] = WRAPLOW(step1[28] - step1[29], bd);
  step2[30] = WRAPLOW(-step1[30] + step1[31], bd);
  step2[31] = WRAPLOW(step1[30] + step1[31], bd);

  // stage 3
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[2];
  step1[3] = step2[3];

  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
  step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);

  step1[8] = WRAPLOW(step2[8] + step2[9], bd);
  step1[9] = WRAPLOW(step2[8] - step2[9], bd);
  step1[10] = WRAPLOW(-step2[10] + step2[11], bd);
  step1[11] = WRAPLOW(step2[10] + step2[11], bd);
  step1[12] = WRAPLOW(step2[12] + step2[13], bd);
  step1[13] = WRAPLOW(step2[12] - step2[13], bd);
  step1[14] = WRAPLOW(-step2[14] + step2[15], bd);
  step1[15] = WRAPLOW(step2[14] + step2[15], bd);

  step1[16] = step2[16];
  step1[31] = step2[31];
  temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
  temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
  step1[17] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[30] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
  temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
  step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  step1[19] = step2[19];
  step1[20] = step2[20];
  temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
  temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
  step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
  temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
  step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  step1[23] = step2[23];
  step1[24] = step2[24];
  step1[27] = step2[27];
  step1[28] = step2[28];

  // stage 4
  temp1 = (step1[0] + step1[1]) * cospi_16_64;
  temp2 = (step1[0] - step1[1]) * cospi_16_64;
  step2[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
  step2[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  step2[4] = WRAPLOW(step1[4] + step1[5], bd);
  step2[5] = WRAPLOW(step1[4] - step1[5], bd);
  step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
  step2[7] = WRAPLOW(step1[6] + step1[7], bd);

  step2[8] = step1[8];
  step2[15] = step1[15];
  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
  step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  step2[11] = step1[11];
  step2[12] = step1[12];

  step2[16] = WRAPLOW(step1[16] + step1[19], bd);
  step2[17] = WRAPLOW(step1[17] + step1[18], bd);
  step2[18] = WRAPLOW(step1[17] - step1[18], bd);
  step2[19] = WRAPLOW(step1[16] - step1[19], bd);
  step2[20] = WRAPLOW(-step1[20] + step1[23], bd);
  step2[21] = WRAPLOW(-step1[21] + step1[22], bd);
  step2[22] = WRAPLOW(step1[21] + step1[22], bd);
  step2[23] = WRAPLOW(step1[20] + step1[23], bd);

  step2[24] = WRAPLOW(step1[24] + step1[27], bd);
  step2[25] = WRAPLOW(step1[25] + step1[26], bd);
  step2[26] = WRAPLOW(step1[25] - step1[26], bd);
  step2[27] = WRAPLOW(step1[24] - step1[27], bd);
  step2[28] = WRAPLOW(-step1[28] + step1[31], bd);
  step2[29] = WRAPLOW(-step1[29] + step1[30], bd);
  step2[30] = WRAPLOW(step1[29] + step1[30], bd);
  step2[31] = WRAPLOW(step1[28] + step1[31], bd);

  // stage 5
  step1[0] = WRAPLOW(step2[0] + step2[3], bd);
  step1[1] = WRAPLOW(step2[1] + step2[2], bd);
  step1[2] = WRAPLOW(step2[1] - step2[2], bd);
  step1[3] = WRAPLOW(step2[0] - step2[3], bd);
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  step1[7] = step2[7];

  step1[8] = WRAPLOW(step2[8] + step2[11], bd);
  step1[9] = WRAPLOW(step2[9] + step2[10], bd);
  step1[10] = WRAPLOW(step2[9] - step2[10], bd);
  step1[11] = WRAPLOW(step2[8] - step2[11], bd);
  step1[12] = WRAPLOW(-step2[12] + step2[15], bd);
  step1[13] = WRAPLOW(-step2[13] + step2[14], bd);
  step1[14] = WRAPLOW(step2[13] + step2[14], bd);
  step1[15] = WRAPLOW(step2[12] + step2[15], bd);

  step1[16] = step2[16];
  step1[17] = step2[17];
  temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
  temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
  step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
  temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
  step1[19] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[28] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
  temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
  step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
  temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
  step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  step1[22] = step2[22];
  step1[23] = step2[23];
  step1[24] = step2[24];
  step1[25] = step2[25];
  step1[30] = step2[30];
  step1[31] = step2[31];

  // stage 6
  step2[0] = WRAPLOW(step1[0] + step1[7], bd);
  step2[1] = WRAPLOW(step1[1] + step1[6], bd);
  step2[2] = WRAPLOW(step1[2] + step1[5], bd);
  step2[3] = WRAPLOW(step1[3] + step1[4], bd);
  step2[4] = WRAPLOW(step1[3] - step1[4], bd);
  step2[5] = WRAPLOW(step1[2] - step1[5], bd);
  step2[6] = WRAPLOW(step1[1] - step1[6], bd);
  step2[7] = WRAPLOW(step1[0] - step1[7], bd);
  step2[8] = step1[8];
  step2[9] = step1[9];
  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
  temp2 = (step1[10] + step1[13]) * cospi_16_64;
  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
  temp2 = (step1[11] + step1[12]) * cospi_16_64;
  step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  step2[14] = step1[14];
  step2[15] = step1[15];

  step2[16] = WRAPLOW(step1[16] + step1[23], bd);
  step2[17] = WRAPLOW(step1[17] + step1[22], bd);
  step2[18] = WRAPLOW(step1[18] + step1[21], bd);
  step2[19] = WRAPLOW(step1[19] + step1[20], bd);
  step2[20] = WRAPLOW(step1[19] - step1[20], bd);
  step2[21] = WRAPLOW(step1[18] - step1[21], bd);
  step2[22] = WRAPLOW(step1[17] - step1[22], bd);
  step2[23] = WRAPLOW(step1[16] - step1[23], bd);

  step2[24] = WRAPLOW(-step1[24] + step1[31], bd);
  step2[25] = WRAPLOW(-step1[25] + step1[30], bd);
  step2[26] = WRAPLOW(-step1[26] + step1[29], bd);
  step2[27] = WRAPLOW(-step1[27] + step1[28], bd);
  step2[28] = WRAPLOW(step1[27] + step1[28], bd);
  step2[29] = WRAPLOW(step1[26] + step1[29], bd);
  step2[30] = WRAPLOW(step1[25] + step1[30], bd);
  step2[31] = WRAPLOW(step1[24] + step1[31], bd);

  // stage 7
  step1[0] = WRAPLOW(step2[0] + step2[15], bd);
  step1[1] = WRAPLOW(step2[1] + step2[14], bd);
  step1[2] = WRAPLOW(step2[2] + step2[13], bd);
  step1[3] = WRAPLOW(step2[3] + step2[12], bd);
  step1[4] = WRAPLOW(step2[4] + step2[11], bd);
  step1[5] = WRAPLOW(step2[5] + step2[10], bd);
  step1[6] = WRAPLOW(step2[6] + step2[9], bd);
  step1[7] = WRAPLOW(step2[7] + step2[8], bd);
  step1[8] = WRAPLOW(step2[7] - step2[8], bd);
  step1[9] = WRAPLOW(step2[6] - step2[9], bd);
  step1[10] = WRAPLOW(step2[5] - step2[10], bd);
  step1[11] = WRAPLOW(step2[4] - step2[11], bd);
  step1[12] = WRAPLOW(step2[3] - step2[12], bd);
  step1[13] = WRAPLOW(step2[2] - step2[13], bd);
  step1[14] = WRAPLOW(step2[1] - step2[14], bd);
  step1[15] = WRAPLOW(step2[0] - step2[15], bd);

  step1[16] = step2[16];
  step1[17] = step2[17];
  step1[18] = step2[18];
  step1[19] = step2[19];
  temp1 = (-step2[20] + step2[27]) * cospi_16_64;
  temp2 = (step2[20] + step2[27]) * cospi_16_64;
  step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  temp1 = (-step2[21] + step2[26]) * cospi_16_64;
  temp2 = (step2[21] + step2[26]) * cospi_16_64;
  step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  temp1 = (-step2[22] + step2[25]) * cospi_16_64;
  temp2 = (step2[22] + step2[25]) * cospi_16_64;
  step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  temp1 = (-step2[23] + step2[24]) * cospi_16_64;
  temp2 = (step2[23] + step2[24]) * cospi_16_64;
  step1[23] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[24] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  step1[28] = step2[28];
  step1[29] = step2[29];
  step1[30] = step2[30];
  step1[31] = step2[31];

  // final stage
  output[0] = WRAPLOW(step1[0] + step1[31], bd);
  output[1] = WRAPLOW(step1[1] + step1[30], bd);
  output[2] = WRAPLOW(step1[2] + step1[29], bd);
  output[3] = WRAPLOW(step1[3] + step1[28], bd);
  output[4] = WRAPLOW(step1[4] + step1[27], bd);
  output[5] = WRAPLOW(step1[5] + step1[26], bd);
  output[6] = WRAPLOW(step1[6] + step1[25], bd);
  output[7] = WRAPLOW(step1[7] + step1[24], bd);
  output[8] = WRAPLOW(step1[8] + step1[23], bd);
  output[9] = WRAPLOW(step1[9] + step1[22], bd);
  output[10] = WRAPLOW(step1[10] + step1[21], bd);
  output[11] = WRAPLOW(step1[11] + step1[20], bd);
  output[12] = WRAPLOW(step1[12] + step1[19], bd);
  output[13] = WRAPLOW(step1[13] + step1[18], bd);
  output[14] = WRAPLOW(step1[14] + step1[17], bd);
  output[15] = WRAPLOW(step1[15] + step1[16], bd);
  output[16] = WRAPLOW(step1[15] - step1[16], bd);
  output[17] = WRAPLOW(step1[14] - step1[17], bd);
  output[18] = WRAPLOW(step1[13] - step1[18], bd);
  output[19] = WRAPLOW(step1[12] - step1[19], bd);
  output[20] = WRAPLOW(step1[11] - step1[20], bd);
  output[21] = WRAPLOW(step1[10] - step1[21], bd);
  output[22] = WRAPLOW(step1[9] - step1[22], bd);
  output[23] = WRAPLOW(step1[8] - step1[23], bd);
  output[24] = WRAPLOW(step1[7] - step1[24], bd);
  output[25] = WRAPLOW(step1[6] - step1[25], bd);
  output[26] = WRAPLOW(step1[5] - step1[26], bd);
  output[27] = WRAPLOW(step1[4] - step1[27], bd);
  output[28] = WRAPLOW(step1[3] - step1[28], bd);
  output[29] = WRAPLOW(step1[2] - step1[29], bd);
  output[30] = WRAPLOW(step1[1] - step1[30], bd);
  output[31] = WRAPLOW(step1[0] - step1[31], bd);
}

void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8,
                                     int stride, int bd) {
  tran_low_t out[32 * 32];
  tran_low_t *outptr = out;
  int i, j;
  tran_low_t temp_in[32], temp_out[32];
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  // Rows
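  // OR-reduce each row of 32 coefficients pairwise (32 -> 16 -> 8 -> 4 -> 2)
  // so that an all-zero row can be detected cheaply and skipped.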
  for (i = 0; i < 32; ++i) {
    tran_low_t zero_coeff[16];
    for (j = 0; j < 16; ++j)
      zero_coeff[j] = input[2 * j] | input[2 * j + 1];
    for (j = 0; j < 8; ++j)
      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
    for (j = 0; j < 4; ++j)
      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
    for (j = 0; j < 2; ++j)
      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];

    if (zero_coeff[0] | zero_coeff[1])
      highbd_idct32_c(input, outptr, bd);
    else
      memset(outptr, 0, sizeof(tran_low_t) * 32);
    input += 32;
    outptr += 32;
  }

  // Columns
  for (i = 0; i < 32; ++i) {
    for (j = 0; j < 32; ++j)
      temp_in[j] = out[j * 32 + i];
    highbd_idct32_c(temp_in, temp_out, bd);
    for (j = 0; j < 32; ++j) {
      dest[j * stride + i] = highbd_clip_pixel_add(
          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
    }
  }
}

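// Variant for the case of at most 34 non-zero coefficients (hence the _34
// suffix); they all lie in the upper-left 8x8 area, so only 8 row
// transforms are needed.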
void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8,
                                   int stride, int bd) {
  tran_low_t out[32 * 32] = { 0 };
  tran_low_t *outptr = out;
  int i, j;
  tran_low_t temp_in[32], temp_out[32];
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  // Rows
  // Only the upper-left 8x8 block has non-zero coefficients.
  for (i = 0; i < 8; ++i) {
    highbd_idct32_c(input, outptr, bd);
    input += 32;
    outptr += 32;
  }
  // Columns
  for (i = 0; i < 32; ++i) {
    for (j = 0; j < 32; ++j)
      temp_in[j] = out[j * 32 + i];
    highbd_idct32_c(temp_in, temp_out, bd);
    for (j = 0; j < 32; ++j) {
      dest[j * stride + i] = highbd_clip_pixel_add(
          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
    }
  }
}

void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8,
                                  int stride, int bd) {
  int i, j;
  int a1;
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  tran_low_t out = WRAPLOW(
      highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
  out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
  a1 = ROUND_POWER_OF_TWO(out, 6);

  for (j = 0; j < 32; ++j) {
    for (i = 0; i < 32; ++i)
      dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
    dest += stride;
  }
}
#endif  // CONFIG_VP9_HIGHBITDEPTH
