1 /*
2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include <assert.h>
12 #include <math.h>
13
14 #include "./vpx_config.h"
15 #include "./vp9_rtcd.h"
16 #include "vp9/common/vp9_systemdependent.h"
17 #include "vp9/common/vp9_blockd.h"
18 #include "vp9/common/vp9_common.h"
19 #include "vp9/common/vp9_idct.h"
20
vp9_iwht4x4_16_add_c(const int16_t * input,uint8_t * dest,int stride)21 void vp9_iwht4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride) {
22 /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
23 0.5 shifts per pixel. */
24 int i;
25 int16_t output[16];
26 int a1, b1, c1, d1, e1;
27 const int16_t *ip = input;
28 int16_t *op = output;
29
30 for (i = 0; i < 4; i++) {
31 a1 = ip[0] >> UNIT_QUANT_SHIFT;
32 c1 = ip[1] >> UNIT_QUANT_SHIFT;
33 d1 = ip[2] >> UNIT_QUANT_SHIFT;
34 b1 = ip[3] >> UNIT_QUANT_SHIFT;
35 a1 += c1;
36 d1 -= b1;
37 e1 = (a1 - d1) >> 1;
38 b1 = e1 - b1;
39 c1 = e1 - c1;
40 a1 -= b1;
41 d1 += c1;
42 op[0] = a1;
43 op[1] = b1;
44 op[2] = c1;
45 op[3] = d1;
46 ip += 4;
47 op += 4;
48 }
49
50 ip = output;
51 for (i = 0; i < 4; i++) {
52 a1 = ip[4 * 0];
53 c1 = ip[4 * 1];
54 d1 = ip[4 * 2];
55 b1 = ip[4 * 3];
56 a1 += c1;
57 d1 -= b1;
58 e1 = (a1 - d1) >> 1;
59 b1 = e1 - b1;
60 c1 = e1 - c1;
61 a1 -= b1;
62 d1 += c1;
63 dest[stride * 0] = clip_pixel(dest[stride * 0] + a1);
64 dest[stride * 1] = clip_pixel(dest[stride * 1] + b1);
65 dest[stride * 2] = clip_pixel(dest[stride * 2] + c1);
66 dest[stride * 3] = clip_pixel(dest[stride * 3] + d1);
67
68 ip++;
69 dest++;
70 }
71 }
72
vp9_iwht4x4_1_add_c(const int16_t * in,uint8_t * dest,int dest_stride)73 void vp9_iwht4x4_1_add_c(const int16_t *in, uint8_t *dest, int dest_stride) {
74 int i;
75 int a1, e1;
76 int16_t tmp[4];
77 const int16_t *ip = in;
78 int16_t *op = tmp;
79
80 a1 = ip[0] >> UNIT_QUANT_SHIFT;
81 e1 = a1 >> 1;
82 a1 -= e1;
83 op[0] = a1;
84 op[1] = op[2] = op[3] = e1;
85
86 ip = tmp;
87 for (i = 0; i < 4; i++) {
88 e1 = ip[0] >> 1;
89 a1 = ip[0] - e1;
90 dest[dest_stride * 0] = clip_pixel(dest[dest_stride * 0] + a1);
91 dest[dest_stride * 1] = clip_pixel(dest[dest_stride * 1] + e1);
92 dest[dest_stride * 2] = clip_pixel(dest[dest_stride * 2] + e1);
93 dest[dest_stride * 3] = clip_pixel(dest[dest_stride * 3] + e1);
94 ip++;
95 dest++;
96 }
97 }
98
idct4_1d(const int16_t * input,int16_t * output)99 static void idct4_1d(const int16_t *input, int16_t *output) {
100 int16_t step[4];
101 int temp1, temp2;
102 // stage 1
103 temp1 = (input[0] + input[2]) * cospi_16_64;
104 temp2 = (input[0] - input[2]) * cospi_16_64;
105 step[0] = dct_const_round_shift(temp1);
106 step[1] = dct_const_round_shift(temp2);
107 temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
108 temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
109 step[2] = dct_const_round_shift(temp1);
110 step[3] = dct_const_round_shift(temp2);
111
112 // stage 2
113 output[0] = step[0] + step[3];
114 output[1] = step[1] + step[2];
115 output[2] = step[1] - step[2];
116 output[3] = step[0] - step[3];
117 }
118
void vp9_idct4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride) {
  int16_t out[4 * 4];
  int16_t col_in[4], col_out[4];
  int r, c;

  // Row pass: one 4-point IDCT per coefficient row.
  for (r = 0; r < 4; ++r)
    idct4_1d(input + 4 * r, out + 4 * r);

  // Column pass: 4-point IDCT per column, then round (shift by 4) and
  // accumulate into the prediction with pixel clipping.
  for (c = 0; c < 4; ++c) {
    for (r = 0; r < 4; ++r)
      col_in[r] = out[r * 4 + c];
    idct4_1d(col_in, col_out);
    for (r = 0; r < 4; ++r)
      dest[r * stride + c] = clip_pixel(ROUND_POWER_OF_TWO(col_out[r], 4)
                                        + dest[r * stride + c]);
  }
}
142
vp9_idct4x4_1_add_c(const int16_t * input,uint8_t * dest,int dest_stride)143 void vp9_idct4x4_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride) {
144 int i;
145 int a1;
146 int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
147 out = dct_const_round_shift(out * cospi_16_64);
148 a1 = ROUND_POWER_OF_TWO(out, 4);
149
150 for (i = 0; i < 4; i++) {
151 dest[0] = clip_pixel(dest[0] + a1);
152 dest[1] = clip_pixel(dest[1] + a1);
153 dest[2] = clip_pixel(dest[2] + a1);
154 dest[3] = clip_pixel(dest[3] + a1);
155 dest += dest_stride;
156 }
157 }
158
// 8-point 1-D inverse DCT. The even half reuses idct4_1d in place on
// step1[0..3]; the odd half is two rotations plus butterflies. The exact
// order of dct_const_round_shift calls fixes the bit-exact output.
static void idct8_1d(const int16_t *input, int16_t *output) {
  int16_t step1[8], step2[8];
  int temp1, temp2;
  // stage 1: gather even inputs (0,4,2,6) for the 4-point sub-transform
  // and rotate the odd inputs (1,7) and (5,3).
  step1[0] = input[0];
  step1[2] = input[4];
  step1[1] = input[2];
  step1[3] = input[6];
  temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
  temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
  step1[4] = dct_const_round_shift(temp1);
  step1[7] = dct_const_round_shift(temp2);
  temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
  temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
  step1[5] = dct_const_round_shift(temp1);
  step1[6] = dct_const_round_shift(temp2);

  // stage 2 & stage 3 - even half (in-place 4-point IDCT)
  idct4_1d(step1, step1);

  // stage 2 - odd half: butterflies on (4,5) and (6,7)
  step2[4] = step1[4] + step1[5];
  step2[5] = step1[4] - step1[5];
  step2[6] = -step1[6] + step1[7];
  step2[7] = step1[6] + step1[7];

  // stage 3 - odd half: half-pel rotation of the middle pair
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = dct_const_round_shift(temp1);
  step1[6] = dct_const_round_shift(temp2);
  step1[7] = step2[7];

  // stage 4: final butterfly combining even and odd halves
  output[0] = step1[0] + step1[7];
  output[1] = step1[1] + step1[6];
  output[2] = step1[2] + step1[5];
  output[3] = step1[3] + step1[4];
  output[4] = step1[3] - step1[4];
  output[5] = step1[2] - step1[5];
  output[6] = step1[1] - step1[6];
  output[7] = step1[0] - step1[7];
}
203
void vp9_idct8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride) {
  int16_t out[8 * 8];
  int16_t col_in[8], col_out[8];
  int r, c;

  // Row pass: one 8-point IDCT per coefficient row.
  for (r = 0; r < 8; ++r)
    idct8_1d(input + 8 * r, out + 8 * r);

  // Column pass: 8-point IDCT per column, then round (shift by 5) and
  // accumulate into the prediction with pixel clipping.
  for (c = 0; c < 8; ++c) {
    for (r = 0; r < 8; ++r)
      col_in[r] = out[r * 8 + c];
    idct8_1d(col_in, col_out);
    for (r = 0; r < 8; ++r)
      dest[r * stride + c] = clip_pixel(ROUND_POWER_OF_TWO(col_out[r], 5)
                                        + dest[r * stride + c]);
  }
}
227
vp9_idct8x8_1_add_c(const int16_t * input,uint8_t * dest,int stride)228 void vp9_idct8x8_1_add_c(const int16_t *input, uint8_t *dest, int stride) {
229 int i, j;
230 int a1;
231 int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
232 out = dct_const_round_shift(out * cospi_16_64);
233 a1 = ROUND_POWER_OF_TWO(out, 5);
234 for (j = 0; j < 8; ++j) {
235 for (i = 0; i < 8; ++i)
236 dest[i] = clip_pixel(dest[i] + a1);
237 dest += stride;
238 }
239 }
240
// 4-point 1-D inverse ADST built from sinpi(k*pi/9) multipliers.
// Fixed-point rounding order is bit-exact-critical; do not reorder.
static void iadst4_1d(const int16_t *input, int16_t *output) {
  int s0, s1, s2, s3, s4, s5, s6, s7;

  int x0 = input[0];
  int x1 = input[1];
  int x2 = input[2];
  int x3 = input[3];

  // All-zero input maps to all-zero output; skip the arithmetic.
  if (!(x0 | x1 | x2 | x3)) {
    output[0] = output[1] = output[2] = output[3] = 0;
    return;
  }

  // Products of the inputs with the sinpi basis constants.
  s0 = sinpi_1_9 * x0;
  s1 = sinpi_2_9 * x0;
  s2 = sinpi_3_9 * x1;
  s3 = sinpi_4_9 * x2;
  s4 = sinpi_1_9 * x2;
  s5 = sinpi_2_9 * x3;
  s6 = sinpi_4_9 * x3;
  s7 = x0 - x2 + x3;  // combined term, multiplied by sinpi_3_9 below

  x0 = s0 + s3 + s5;
  x1 = s1 - s4 - s6;
  x2 = sinpi_3_9 * s7;
  x3 = s2;

  s0 = x0 + x3;
  s1 = x1 + x3;
  s2 = x2;
  s3 = x0 + x1 - x3;

  // 1-D transform scaling factor is sqrt(2).
  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
  // + 1b (addition) = 29b.
  // Hence the output bit depth is 15b.
  output[0] = dct_const_round_shift(s0);
  output[1] = dct_const_round_shift(s1);
  output[2] = dct_const_round_shift(s2);
  output[3] = dct_const_round_shift(s3);
}
282
vp9_iht4x4_16_add_c(const int16_t * input,uint8_t * dest,int stride,int tx_type)283 void vp9_iht4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride,
284 int tx_type) {
285 const transform_2d IHT_4[] = {
286 { idct4_1d, idct4_1d }, // DCT_DCT = 0
287 { iadst4_1d, idct4_1d }, // ADST_DCT = 1
288 { idct4_1d, iadst4_1d }, // DCT_ADST = 2
289 { iadst4_1d, iadst4_1d } // ADST_ADST = 3
290 };
291
292 int i, j;
293 int16_t out[4 * 4];
294 int16_t *outptr = out;
295 int16_t temp_in[4], temp_out[4];
296
297 // inverse transform row vectors
298 for (i = 0; i < 4; ++i) {
299 IHT_4[tx_type].rows(input, outptr);
300 input += 4;
301 outptr += 4;
302 }
303
304 // inverse transform column vectors
305 for (i = 0; i < 4; ++i) {
306 for (j = 0; j < 4; ++j)
307 temp_in[j] = out[j * 4 + i];
308 IHT_4[tx_type].cols(temp_in, temp_out);
309 for (j = 0; j < 4; ++j)
310 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
311 + dest[j * stride + i]);
312 }
313 }
// 8-point 1-D inverse ADST. Inputs are read in a fixed permutation and
// run through three rotation/butterfly stages; the dct_const_round_shift
// placement is bit-exact-critical, so the statement order must not change.
static void iadst8_1d(const int16_t *input, int16_t *output) {
  int s0, s1, s2, s3, s4, s5, s6, s7;

  // Input permutation required by the ADST flow graph.
  int x0 = input[7];
  int x1 = input[0];
  int x2 = input[5];
  int x3 = input[2];
  int x4 = input[3];
  int x5 = input[4];
  int x6 = input[1];
  int x7 = input[6];

  // All-zero input maps to all-zero output; skip the arithmetic.
  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
    output[0] = output[1] = output[2] = output[3] = output[4]
              = output[5] = output[6] = output[7] = 0;
    return;
  }

  // stage 1: four cospi rotations on input pairs
  s0 = cospi_2_64  * x0 + cospi_30_64 * x1;
  s1 = cospi_30_64 * x0 - cospi_2_64  * x1;
  s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
  s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
  s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
  s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
  s6 = cospi_26_64 * x6 + cospi_6_64  * x7;
  s7 = cospi_6_64  * x6 - cospi_26_64 * x7;

  x0 = dct_const_round_shift(s0 + s4);
  x1 = dct_const_round_shift(s1 + s5);
  x2 = dct_const_round_shift(s2 + s6);
  x3 = dct_const_round_shift(s3 + s7);
  x4 = dct_const_round_shift(s0 - s4);
  x5 = dct_const_round_shift(s1 - s5);
  x6 = dct_const_round_shift(s2 - s6);
  x7 = dct_const_round_shift(s3 - s7);

  // stage 2: pass-through on 0..3, rotations on 4..7
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = cospi_8_64  * x4 + cospi_24_64 * x5;
  s5 = cospi_24_64 * x4 - cospi_8_64  * x5;
  s6 = -cospi_24_64 * x6 + cospi_8_64  * x7;
  s7 = cospi_8_64  * x6 + cospi_24_64 * x7;

  x0 = s0 + s2;
  x1 = s1 + s3;
  x2 = s0 - s2;
  x3 = s1 - s3;
  x4 = dct_const_round_shift(s4 + s6);
  x5 = dct_const_round_shift(s5 + s7);
  x6 = dct_const_round_shift(s4 - s6);
  x7 = dct_const_round_shift(s5 - s7);

  // stage 3: half-pel rotations on the middle pairs
  s2 = cospi_16_64 * (x2 + x3);
  s3 = cospi_16_64 * (x2 - x3);
  s6 = cospi_16_64 * (x6 + x7);
  s7 = cospi_16_64 * (x6 - x7);

  x2 = dct_const_round_shift(s2);
  x3 = dct_const_round_shift(s3);
  x6 = dct_const_round_shift(s6);
  x7 = dct_const_round_shift(s7);

  // Output permutation with alternating sign flips.
  output[0] = x0;
  output[1] = -x4;
  output[2] = x6;
  output[3] = -x2;
  output[4] = x3;
  output[5] = -x7;
  output[6] = x5;
  output[7] = -x1;
}
390
// Row (.rows) / column (.cols) 1-D transform pairs for the 8x8 hybrid
// transform, indexed by tx_type.
static const transform_2d IHT_8[] = {
  { idct8_1d,  idct8_1d  },  // DCT_DCT  = 0
  { iadst8_1d, idct8_1d  },  // ADST_DCT = 1
  { idct8_1d,  iadst8_1d },  // DCT_ADST = 2
  { iadst8_1d, iadst8_1d }   // ADST_ADST = 3
};
397
vp9_iht8x8_64_add_c(const int16_t * input,uint8_t * dest,int stride,int tx_type)398 void vp9_iht8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride,
399 int tx_type) {
400 int i, j;
401 int16_t out[8 * 8];
402 int16_t *outptr = out;
403 int16_t temp_in[8], temp_out[8];
404 const transform_2d ht = IHT_8[tx_type];
405
406 // inverse transform row vectors
407 for (i = 0; i < 8; ++i) {
408 ht.rows(input, outptr);
409 input += 8;
410 outptr += 8;
411 }
412
413 // inverse transform column vectors
414 for (i = 0; i < 8; ++i) {
415 for (j = 0; j < 8; ++j)
416 temp_in[j] = out[j * 8 + i];
417 ht.cols(temp_in, temp_out);
418 for (j = 0; j < 8; ++j)
419 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
420 + dest[j * stride + i]);
421 }
422 }
423
void vp9_idct8x8_10_add_c(const int16_t *input, uint8_t *dest, int stride) {
  int16_t out[8 * 8] = { 0 };
  int16_t col_in[8], col_out[8];
  int r, c;

  // Row pass: only the first 4 rows hold non-zero coefficients, so the
  // remaining rows of the intermediate buffer stay zero.
  for (r = 0; r < 4; ++r)
    idct8_1d(input + 8 * r, out + 8 * r);

  // Column pass over all 8 columns, then round (shift by 5) and add into
  // the prediction.
  for (c = 0; c < 8; ++c) {
    for (r = 0; r < 8; ++r)
      col_in[r] = out[r * 8 + c];
    idct8_1d(col_in, col_out);
    for (r = 0; r < 8; ++r)
      dest[r * stride + c] = clip_pixel(ROUND_POWER_OF_TWO(col_out[r], 5)
                                        + dest[r * stride + c]);
  }
}
448
// 16-point 1-D inverse DCT, seven butterfly/rotation stages. The input
// indices are written as N/2 to mirror the even-index ordering of the
// 32-point transform. Statement order is bit-exact-critical.
static void idct16_1d(const int16_t *input, int16_t *output) {
  int16_t step1[16], step2[16];
  int temp1, temp2;

  // stage 1: bit-reversal-style reordering of the inputs
  step1[0] = input[0/2];
  step1[1] = input[16/2];
  step1[2] = input[8/2];
  step1[3] = input[24/2];
  step1[4] = input[4/2];
  step1[5] = input[20/2];
  step1[6] = input[12/2];
  step1[7] = input[28/2];
  step1[8] = input[2/2];
  step1[9] = input[18/2];
  step1[10] = input[10/2];
  step1[11] = input[26/2];
  step1[12] = input[6/2];
  step1[13] = input[22/2];
  step1[14] = input[14/2];
  step1[15] = input[30/2];

  // stage 2: pass-through on 0..7, rotations on the odd half 8..15
  step2[0] = step1[0];
  step2[1] = step1[1];
  step2[2] = step1[2];
  step2[3] = step1[3];
  step2[4] = step1[4];
  step2[5] = step1[5];
  step2[6] = step1[6];
  step2[7] = step1[7];

  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
  step2[8] = dct_const_round_shift(temp1);
  step2[15] = dct_const_round_shift(temp2);

  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
  step2[9] = dct_const_round_shift(temp1);
  step2[14] = dct_const_round_shift(temp2);

  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
  step2[10] = dct_const_round_shift(temp1);
  step2[13] = dct_const_round_shift(temp2);

  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
  step2[11] = dct_const_round_shift(temp1);
  step2[12] = dct_const_round_shift(temp2);

  // stage 3: rotations on 4..7, butterflies on 8..15
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[2];
  step1[3] = step2[3];

  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
  step1[4] = dct_const_round_shift(temp1);
  step1[7] = dct_const_round_shift(temp2);
  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
  step1[5] = dct_const_round_shift(temp1);
  step1[6] = dct_const_round_shift(temp2);

  step1[8] = step2[8] + step2[9];
  step1[9] = step2[8] - step2[9];
  step1[10] = -step2[10] + step2[11];
  step1[11] = step2[10] + step2[11];
  step1[12] = step2[12] + step2[13];
  step1[13] = step2[12] - step2[13];
  step1[14] = -step2[14] + step2[15];
  step1[15] = step2[14] + step2[15];

  // stage 4: even-half rotations, odd-half cross rotations
  temp1 = (step1[0] + step1[1]) * cospi_16_64;
  temp2 = (step1[0] - step1[1]) * cospi_16_64;
  step2[0] = dct_const_round_shift(temp1);
  step2[1] = dct_const_round_shift(temp2);
  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
  step2[2] = dct_const_round_shift(temp1);
  step2[3] = dct_const_round_shift(temp2);
  step2[4] = step1[4] + step1[5];
  step2[5] = step1[4] - step1[5];
  step2[6] = -step1[6] + step1[7];
  step2[7] = step1[6] + step1[7];

  step2[8] = step1[8];
  step2[15] = step1[15];
  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
  step2[9] = dct_const_round_shift(temp1);
  step2[14] = dct_const_round_shift(temp2);
  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
  step2[10] = dct_const_round_shift(temp1);
  step2[13] = dct_const_round_shift(temp2);
  step2[11] = step1[11];
  step2[12] = step1[12];

  // stage 5: butterflies plus half-pel rotation of (5,6)
  step1[0] = step2[0] + step2[3];
  step1[1] = step2[1] + step2[2];
  step1[2] = step2[1] - step2[2];
  step1[3] = step2[0] - step2[3];
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = dct_const_round_shift(temp1);
  step1[6] = dct_const_round_shift(temp2);
  step1[7] = step2[7];

  step1[8] = step2[8] + step2[11];
  step1[9] = step2[9] + step2[10];
  step1[10] = step2[9] - step2[10];
  step1[11] = step2[8] - step2[11];
  step1[12] = -step2[12] + step2[15];
  step1[13] = -step2[13] + step2[14];
  step1[14] = step2[13] + step2[14];
  step1[15] = step2[12] + step2[15];

  // stage 6: even-half butterflies, half-pel rotations on (10,13)/(11,12)
  step2[0] = step1[0] + step1[7];
  step2[1] = step1[1] + step1[6];
  step2[2] = step1[2] + step1[5];
  step2[3] = step1[3] + step1[4];
  step2[4] = step1[3] - step1[4];
  step2[5] = step1[2] - step1[5];
  step2[6] = step1[1] - step1[6];
  step2[7] = step1[0] - step1[7];
  step2[8] = step1[8];
  step2[9] = step1[9];
  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
  temp2 = (step1[10] + step1[13]) * cospi_16_64;
  step2[10] = dct_const_round_shift(temp1);
  step2[13] = dct_const_round_shift(temp2);
  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
  temp2 = (step1[11] + step1[12]) * cospi_16_64;
  step2[11] = dct_const_round_shift(temp1);
  step2[12] = dct_const_round_shift(temp2);
  step2[14] = step1[14];
  step2[15] = step1[15];

  // stage 7: final butterfly combining both halves
  output[0] = step2[0] + step2[15];
  output[1] = step2[1] + step2[14];
  output[2] = step2[2] + step2[13];
  output[3] = step2[3] + step2[12];
  output[4] = step2[4] + step2[11];
  output[5] = step2[5] + step2[10];
  output[6] = step2[6] + step2[9];
  output[7] = step2[7] + step2[8];
  output[8] = step2[7] - step2[8];
  output[9] = step2[6] - step2[9];
  output[10] = step2[5] - step2[10];
  output[11] = step2[4] - step2[11];
  output[12] = step2[3] - step2[12];
  output[13] = step2[2] - step2[13];
  output[14] = step2[1] - step2[14];
  output[15] = step2[0] - step2[15];
}
613
void vp9_idct16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride) {
  int16_t out[16 * 16];
  int16_t col_in[16], col_out[16];
  int r, c;

  // Row pass: one 16-point IDCT per coefficient row.
  for (r = 0; r < 16; ++r)
    idct16_1d(input + 16 * r, out + 16 * r);

  // Column pass: 16-point IDCT per column, then round (shift by 6) and
  // accumulate into the prediction with pixel clipping.
  for (c = 0; c < 16; ++c) {
    for (r = 0; r < 16; ++r)
      col_in[r] = out[r * 16 + c];
    idct16_1d(col_in, col_out);
    for (r = 0; r < 16; ++r)
      dest[r * stride + c] = clip_pixel(ROUND_POWER_OF_TWO(col_out[r], 6)
                                        + dest[r * stride + c]);
  }
}
637
// 16-point 1-D inverse ADST. Inputs are read in a fixed permutation and
// run through four rotation/butterfly stages; the dct_const_round_shift
// placement is bit-exact-critical, so the statement order must not change.
static void iadst16_1d(const int16_t *input, int16_t *output) {
  int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;

  // Input permutation required by the ADST flow graph.
  int x0 = input[15];
  int x1 = input[0];
  int x2 = input[13];
  int x3 = input[2];
  int x4 = input[11];
  int x5 = input[4];
  int x6 = input[9];
  int x7 = input[6];
  int x8 = input[7];
  int x9 = input[8];
  int x10 = input[5];
  int x11 = input[10];
  int x12 = input[3];
  int x13 = input[12];
  int x14 = input[1];
  int x15 = input[14];

  // All-zero input maps to all-zero output; skip the arithmetic.
  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
           | x9 | x10 | x11 | x12 | x13 | x14 | x15)) {
    output[0] = output[1] = output[2] = output[3] = output[4]
              = output[5] = output[6] = output[7] = output[8]
              = output[9] = output[10] = output[11] = output[12]
              = output[13] = output[14] = output[15] = 0;
    return;
  }

  // stage 1: eight cospi rotations on input pairs
  s0 = x0 * cospi_1_64  + x1 * cospi_31_64;
  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
  s2 = x2 * cospi_5_64  + x3 * cospi_27_64;
  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
  s4 = x4 * cospi_9_64  + x5 * cospi_23_64;
  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
  s13 = x12 * cospi_7_64  - x13 * cospi_25_64;
  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
  s15 = x14 * cospi_3_64  - x15 * cospi_29_64;

  x0 = dct_const_round_shift(s0 + s8);
  x1 = dct_const_round_shift(s1 + s9);
  x2 = dct_const_round_shift(s2 + s10);
  x3 = dct_const_round_shift(s3 + s11);
  x4 = dct_const_round_shift(s4 + s12);
  x5 = dct_const_round_shift(s5 + s13);
  x6 = dct_const_round_shift(s6 + s14);
  x7 = dct_const_round_shift(s7 + s15);
  x8 = dct_const_round_shift(s0 - s8);
  x9 = dct_const_round_shift(s1 - s9);
  x10 = dct_const_round_shift(s2 - s10);
  x11 = dct_const_round_shift(s3 - s11);
  x12 = dct_const_round_shift(s4 - s12);
  x13 = dct_const_round_shift(s5 - s13);
  x14 = dct_const_round_shift(s6 - s14);
  x15 = dct_const_round_shift(s7 - s15);

  // stage 2: pass-through on 0..7, rotations on 8..15
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4;
  s5 = x5;
  s6 = x6;
  s7 = x7;
  s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
  s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
  s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
  s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
  s12 = - x12 * cospi_28_64 + x13 * cospi_4_64;
  s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
  s14 = - x14 * cospi_12_64 + x15 * cospi_20_64;
  s15 = x14 * cospi_20_64 + x15 * cospi_12_64;

  x0 = s0 + s4;
  x1 = s1 + s5;
  x2 = s2 + s6;
  x3 = s3 + s7;
  x4 = s0 - s4;
  x5 = s1 - s5;
  x6 = s2 - s6;
  x7 = s3 - s7;
  x8 = dct_const_round_shift(s8 + s12);
  x9 = dct_const_round_shift(s9 + s13);
  x10 = dct_const_round_shift(s10 + s14);
  x11 = dct_const_round_shift(s11 + s15);
  x12 = dct_const_round_shift(s8 - s12);
  x13 = dct_const_round_shift(s9 - s13);
  x14 = dct_const_round_shift(s10 - s14);
  x15 = dct_const_round_shift(s11 - s15);

  // stage 3: cospi_8/24 rotations on the quarter groups
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
  s6 = - x6 * cospi_24_64 + x7 * cospi_8_64;
  s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
  s8 = x8;
  s9 = x9;
  s10 = x10;
  s11 = x11;
  s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
  s14 = - x14 * cospi_24_64 + x15 * cospi_8_64;
  s15 = x14 * cospi_8_64 + x15 * cospi_24_64;

  x0 = s0 + s2;
  x1 = s1 + s3;
  x2 = s0 - s2;
  x3 = s1 - s3;
  x4 = dct_const_round_shift(s4 + s6);
  x5 = dct_const_round_shift(s5 + s7);
  x6 = dct_const_round_shift(s4 - s6);
  x7 = dct_const_round_shift(s5 - s7);
  x8 = s8 + s10;
  x9 = s9 + s11;
  x10 = s8 - s10;
  x11 = s9 - s11;
  x12 = dct_const_round_shift(s12 + s14);
  x13 = dct_const_round_shift(s13 + s15);
  x14 = dct_const_round_shift(s12 - s14);
  x15 = dct_const_round_shift(s13 - s15);

  // stage 4: half-pel rotations with embedded sign flips
  s2 = (- cospi_16_64) * (x2 + x3);
  s3 = cospi_16_64 * (x2 - x3);
  s6 = cospi_16_64 * (x6 + x7);
  s7 = cospi_16_64 * (- x6 + x7);
  s10 = cospi_16_64 * (x10 + x11);
  s11 = cospi_16_64 * (- x10 + x11);
  s14 = (- cospi_16_64) * (x14 + x15);
  s15 = cospi_16_64 * (x14 - x15);

  x2 = dct_const_round_shift(s2);
  x3 = dct_const_round_shift(s3);
  x6 = dct_const_round_shift(s6);
  x7 = dct_const_round_shift(s7);
  x10 = dct_const_round_shift(s10);
  x11 = dct_const_round_shift(s11);
  x14 = dct_const_round_shift(s14);
  x15 = dct_const_round_shift(s15);

  // Output permutation with alternating sign flips.
  output[0] = x0;
  output[1] = -x8;
  output[2] = x12;
  output[3] = -x4;
  output[4] = x6;
  output[5] = x14;
  output[6] = x10;
  output[7] = x2;
  output[8] = x3;
  output[9] = x11;
  output[10] = x15;
  output[11] = x7;
  output[12] = x5;
  output[13] = -x13;
  output[14] = x9;
  output[15] = -x1;
}
808
// Row (.rows) / column (.cols) 1-D transform pairs for the 16x16 hybrid
// transform, indexed by tx_type.
static const transform_2d IHT_16[] = {
  { idct16_1d,  idct16_1d  },  // DCT_DCT  = 0
  { iadst16_1d, idct16_1d  },  // ADST_DCT = 1
  { idct16_1d,  iadst16_1d },  // DCT_ADST = 2
  { iadst16_1d, iadst16_1d }   // ADST_ADST = 3
};
815
vp9_iht16x16_256_add_c(const int16_t * input,uint8_t * dest,int stride,int tx_type)816 void vp9_iht16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride,
817 int tx_type) {
818 int i, j;
819 int16_t out[16 * 16];
820 int16_t *outptr = out;
821 int16_t temp_in[16], temp_out[16];
822 const transform_2d ht = IHT_16[tx_type];
823
824 // Rows
825 for (i = 0; i < 16; ++i) {
826 ht.rows(input, outptr);
827 input += 16;
828 outptr += 16;
829 }
830
831 // Columns
832 for (i = 0; i < 16; ++i) {
833 for (j = 0; j < 16; ++j)
834 temp_in[j] = out[j * 16 + i];
835 ht.cols(temp_in, temp_out);
836 for (j = 0; j < 16; ++j)
837 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
838 + dest[j * stride + i]); }
839 }
840
void vp9_idct16x16_10_add_c(const int16_t *input, uint8_t *dest, int stride) {
  int16_t out[16 * 16] = { 0 };
  int16_t col_in[16], col_out[16];
  int r, c;

  // Row pass: all non-zero dct coefficients lie in the upper-left 4x4
  // area, so only the first 4 rows need computing; the rest stay zero.
  for (r = 0; r < 4; ++r)
    idct16_1d(input + 16 * r, out + 16 * r);

  // Column pass over all 16 columns, then round (shift by 6) and add
  // into the prediction.
  for (c = 0; c < 16; ++c) {
    for (r = 0; r < 16; ++r)
      col_in[r] = out[r * 16 + c];
    idct16_1d(col_in, col_out);
    for (r = 0; r < 16; ++r)
      dest[r * stride + c] = clip_pixel(ROUND_POWER_OF_TWO(col_out[r], 6)
                                        + dest[r * stride + c]);
  }
}
865
vp9_idct16x16_1_add_c(const int16_t * input,uint8_t * dest,int stride)866 void vp9_idct16x16_1_add_c(const int16_t *input, uint8_t *dest, int stride) {
867 int i, j;
868 int a1;
869 int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
870 out = dct_const_round_shift(out * cospi_16_64);
871 a1 = ROUND_POWER_OF_TWO(out, 6);
872 for (j = 0; j < 16; ++j) {
873 for (i = 0; i < 16; ++i)
874 dest[i] = clip_pixel(dest[i] + a1);
875 dest += stride;
876 }
877 }
878
idct32_1d(const int16_t * input,int16_t * output)879 static void idct32_1d(const int16_t *input, int16_t *output) {
880 int16_t step1[32], step2[32];
881 int temp1, temp2;
882
883 // stage 1
884 step1[0] = input[0];
885 step1[1] = input[16];
886 step1[2] = input[8];
887 step1[3] = input[24];
888 step1[4] = input[4];
889 step1[5] = input[20];
890 step1[6] = input[12];
891 step1[7] = input[28];
892 step1[8] = input[2];
893 step1[9] = input[18];
894 step1[10] = input[10];
895 step1[11] = input[26];
896 step1[12] = input[6];
897 step1[13] = input[22];
898 step1[14] = input[14];
899 step1[15] = input[30];
900
901 temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
902 temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
903 step1[16] = dct_const_round_shift(temp1);
904 step1[31] = dct_const_round_shift(temp2);
905
906 temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
907 temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
908 step1[17] = dct_const_round_shift(temp1);
909 step1[30] = dct_const_round_shift(temp2);
910
911 temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
912 temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
913 step1[18] = dct_const_round_shift(temp1);
914 step1[29] = dct_const_round_shift(temp2);
915
916 temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
917 temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
918 step1[19] = dct_const_round_shift(temp1);
919 step1[28] = dct_const_round_shift(temp2);
920
921 temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
922 temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
923 step1[20] = dct_const_round_shift(temp1);
924 step1[27] = dct_const_round_shift(temp2);
925
926 temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
927 temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
928 step1[21] = dct_const_round_shift(temp1);
929 step1[26] = dct_const_round_shift(temp2);
930
931 temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
932 temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
933 step1[22] = dct_const_round_shift(temp1);
934 step1[25] = dct_const_round_shift(temp2);
935
936 temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
937 temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
938 step1[23] = dct_const_round_shift(temp1);
939 step1[24] = dct_const_round_shift(temp2);
940
941 // stage 2
942 step2[0] = step1[0];
943 step2[1] = step1[1];
944 step2[2] = step1[2];
945 step2[3] = step1[3];
946 step2[4] = step1[4];
947 step2[5] = step1[5];
948 step2[6] = step1[6];
949 step2[7] = step1[7];
950
951 temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
952 temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
953 step2[8] = dct_const_round_shift(temp1);
954 step2[15] = dct_const_round_shift(temp2);
955
956 temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
957 temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
958 step2[9] = dct_const_round_shift(temp1);
959 step2[14] = dct_const_round_shift(temp2);
960
961 temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
962 temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
963 step2[10] = dct_const_round_shift(temp1);
964 step2[13] = dct_const_round_shift(temp2);
965
966 temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
967 temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
968 step2[11] = dct_const_round_shift(temp1);
969 step2[12] = dct_const_round_shift(temp2);
970
971 step2[16] = step1[16] + step1[17];
972 step2[17] = step1[16] - step1[17];
973 step2[18] = -step1[18] + step1[19];
974 step2[19] = step1[18] + step1[19];
975 step2[20] = step1[20] + step1[21];
976 step2[21] = step1[20] - step1[21];
977 step2[22] = -step1[22] + step1[23];
978 step2[23] = step1[22] + step1[23];
979 step2[24] = step1[24] + step1[25];
980 step2[25] = step1[24] - step1[25];
981 step2[26] = -step1[26] + step1[27];
982 step2[27] = step1[26] + step1[27];
983 step2[28] = step1[28] + step1[29];
984 step2[29] = step1[28] - step1[29];
985 step2[30] = -step1[30] + step1[31];
986 step2[31] = step1[30] + step1[31];
987
988 // stage 3
989 step1[0] = step2[0];
990 step1[1] = step2[1];
991 step1[2] = step2[2];
992 step1[3] = step2[3];
993
994 temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
995 temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
996 step1[4] = dct_const_round_shift(temp1);
997 step1[7] = dct_const_round_shift(temp2);
998 temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
999 temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
1000 step1[5] = dct_const_round_shift(temp1);
1001 step1[6] = dct_const_round_shift(temp2);
1002
1003 step1[8] = step2[8] + step2[9];
1004 step1[9] = step2[8] - step2[9];
1005 step1[10] = -step2[10] + step2[11];
1006 step1[11] = step2[10] + step2[11];
1007 step1[12] = step2[12] + step2[13];
1008 step1[13] = step2[12] - step2[13];
1009 step1[14] = -step2[14] + step2[15];
1010 step1[15] = step2[14] + step2[15];
1011
1012 step1[16] = step2[16];
1013 step1[31] = step2[31];
1014 temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
1015 temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
1016 step1[17] = dct_const_round_shift(temp1);
1017 step1[30] = dct_const_round_shift(temp2);
1018 temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
1019 temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
1020 step1[18] = dct_const_round_shift(temp1);
1021 step1[29] = dct_const_round_shift(temp2);
1022 step1[19] = step2[19];
1023 step1[20] = step2[20];
1024 temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
1025 temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
1026 step1[21] = dct_const_round_shift(temp1);
1027 step1[26] = dct_const_round_shift(temp2);
1028 temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
1029 temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
1030 step1[22] = dct_const_round_shift(temp1);
1031 step1[25] = dct_const_round_shift(temp2);
1032 step1[23] = step2[23];
1033 step1[24] = step2[24];
1034 step1[27] = step2[27];
1035 step1[28] = step2[28];
1036
1037 // stage 4
1038 temp1 = (step1[0] + step1[1]) * cospi_16_64;
1039 temp2 = (step1[0] - step1[1]) * cospi_16_64;
1040 step2[0] = dct_const_round_shift(temp1);
1041 step2[1] = dct_const_round_shift(temp2);
1042 temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
1043 temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
1044 step2[2] = dct_const_round_shift(temp1);
1045 step2[3] = dct_const_round_shift(temp2);
1046 step2[4] = step1[4] + step1[5];
1047 step2[5] = step1[4] - step1[5];
1048 step2[6] = -step1[6] + step1[7];
1049 step2[7] = step1[6] + step1[7];
1050
1051 step2[8] = step1[8];
1052 step2[15] = step1[15];
1053 temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
1054 temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
1055 step2[9] = dct_const_round_shift(temp1);
1056 step2[14] = dct_const_round_shift(temp2);
1057 temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
1058 temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
1059 step2[10] = dct_const_round_shift(temp1);
1060 step2[13] = dct_const_round_shift(temp2);
1061 step2[11] = step1[11];
1062 step2[12] = step1[12];
1063
1064 step2[16] = step1[16] + step1[19];
1065 step2[17] = step1[17] + step1[18];
1066 step2[18] = step1[17] - step1[18];
1067 step2[19] = step1[16] - step1[19];
1068 step2[20] = -step1[20] + step1[23];
1069 step2[21] = -step1[21] + step1[22];
1070 step2[22] = step1[21] + step1[22];
1071 step2[23] = step1[20] + step1[23];
1072
1073 step2[24] = step1[24] + step1[27];
1074 step2[25] = step1[25] + step1[26];
1075 step2[26] = step1[25] - step1[26];
1076 step2[27] = step1[24] - step1[27];
1077 step2[28] = -step1[28] + step1[31];
1078 step2[29] = -step1[29] + step1[30];
1079 step2[30] = step1[29] + step1[30];
1080 step2[31] = step1[28] + step1[31];
1081
1082 // stage 5
1083 step1[0] = step2[0] + step2[3];
1084 step1[1] = step2[1] + step2[2];
1085 step1[2] = step2[1] - step2[2];
1086 step1[3] = step2[0] - step2[3];
1087 step1[4] = step2[4];
1088 temp1 = (step2[6] - step2[5]) * cospi_16_64;
1089 temp2 = (step2[5] + step2[6]) * cospi_16_64;
1090 step1[5] = dct_const_round_shift(temp1);
1091 step1[6] = dct_const_round_shift(temp2);
1092 step1[7] = step2[7];
1093
1094 step1[8] = step2[8] + step2[11];
1095 step1[9] = step2[9] + step2[10];
1096 step1[10] = step2[9] - step2[10];
1097 step1[11] = step2[8] - step2[11];
1098 step1[12] = -step2[12] + step2[15];
1099 step1[13] = -step2[13] + step2[14];
1100 step1[14] = step2[13] + step2[14];
1101 step1[15] = step2[12] + step2[15];
1102
1103 step1[16] = step2[16];
1104 step1[17] = step2[17];
1105 temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
1106 temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
1107 step1[18] = dct_const_round_shift(temp1);
1108 step1[29] = dct_const_round_shift(temp2);
1109 temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
1110 temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
1111 step1[19] = dct_const_round_shift(temp1);
1112 step1[28] = dct_const_round_shift(temp2);
1113 temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
1114 temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
1115 step1[20] = dct_const_round_shift(temp1);
1116 step1[27] = dct_const_round_shift(temp2);
1117 temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
1118 temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
1119 step1[21] = dct_const_round_shift(temp1);
1120 step1[26] = dct_const_round_shift(temp2);
1121 step1[22] = step2[22];
1122 step1[23] = step2[23];
1123 step1[24] = step2[24];
1124 step1[25] = step2[25];
1125 step1[30] = step2[30];
1126 step1[31] = step2[31];
1127
1128 // stage 6
1129 step2[0] = step1[0] + step1[7];
1130 step2[1] = step1[1] + step1[6];
1131 step2[2] = step1[2] + step1[5];
1132 step2[3] = step1[3] + step1[4];
1133 step2[4] = step1[3] - step1[4];
1134 step2[5] = step1[2] - step1[5];
1135 step2[6] = step1[1] - step1[6];
1136 step2[7] = step1[0] - step1[7];
1137 step2[8] = step1[8];
1138 step2[9] = step1[9];
1139 temp1 = (-step1[10] + step1[13]) * cospi_16_64;
1140 temp2 = (step1[10] + step1[13]) * cospi_16_64;
1141 step2[10] = dct_const_round_shift(temp1);
1142 step2[13] = dct_const_round_shift(temp2);
1143 temp1 = (-step1[11] + step1[12]) * cospi_16_64;
1144 temp2 = (step1[11] + step1[12]) * cospi_16_64;
1145 step2[11] = dct_const_round_shift(temp1);
1146 step2[12] = dct_const_round_shift(temp2);
1147 step2[14] = step1[14];
1148 step2[15] = step1[15];
1149
1150 step2[16] = step1[16] + step1[23];
1151 step2[17] = step1[17] + step1[22];
1152 step2[18] = step1[18] + step1[21];
1153 step2[19] = step1[19] + step1[20];
1154 step2[20] = step1[19] - step1[20];
1155 step2[21] = step1[18] - step1[21];
1156 step2[22] = step1[17] - step1[22];
1157 step2[23] = step1[16] - step1[23];
1158
1159 step2[24] = -step1[24] + step1[31];
1160 step2[25] = -step1[25] + step1[30];
1161 step2[26] = -step1[26] + step1[29];
1162 step2[27] = -step1[27] + step1[28];
1163 step2[28] = step1[27] + step1[28];
1164 step2[29] = step1[26] + step1[29];
1165 step2[30] = step1[25] + step1[30];
1166 step2[31] = step1[24] + step1[31];
1167
1168 // stage 7
1169 step1[0] = step2[0] + step2[15];
1170 step1[1] = step2[1] + step2[14];
1171 step1[2] = step2[2] + step2[13];
1172 step1[3] = step2[3] + step2[12];
1173 step1[4] = step2[4] + step2[11];
1174 step1[5] = step2[5] + step2[10];
1175 step1[6] = step2[6] + step2[9];
1176 step1[7] = step2[7] + step2[8];
1177 step1[8] = step2[7] - step2[8];
1178 step1[9] = step2[6] - step2[9];
1179 step1[10] = step2[5] - step2[10];
1180 step1[11] = step2[4] - step2[11];
1181 step1[12] = step2[3] - step2[12];
1182 step1[13] = step2[2] - step2[13];
1183 step1[14] = step2[1] - step2[14];
1184 step1[15] = step2[0] - step2[15];
1185
1186 step1[16] = step2[16];
1187 step1[17] = step2[17];
1188 step1[18] = step2[18];
1189 step1[19] = step2[19];
1190 temp1 = (-step2[20] + step2[27]) * cospi_16_64;
1191 temp2 = (step2[20] + step2[27]) * cospi_16_64;
1192 step1[20] = dct_const_round_shift(temp1);
1193 step1[27] = dct_const_round_shift(temp2);
1194 temp1 = (-step2[21] + step2[26]) * cospi_16_64;
1195 temp2 = (step2[21] + step2[26]) * cospi_16_64;
1196 step1[21] = dct_const_round_shift(temp1);
1197 step1[26] = dct_const_round_shift(temp2);
1198 temp1 = (-step2[22] + step2[25]) * cospi_16_64;
1199 temp2 = (step2[22] + step2[25]) * cospi_16_64;
1200 step1[22] = dct_const_round_shift(temp1);
1201 step1[25] = dct_const_round_shift(temp2);
1202 temp1 = (-step2[23] + step2[24]) * cospi_16_64;
1203 temp2 = (step2[23] + step2[24]) * cospi_16_64;
1204 step1[23] = dct_const_round_shift(temp1);
1205 step1[24] = dct_const_round_shift(temp2);
1206 step1[28] = step2[28];
1207 step1[29] = step2[29];
1208 step1[30] = step2[30];
1209 step1[31] = step2[31];
1210
1211 // final stage
1212 output[0] = step1[0] + step1[31];
1213 output[1] = step1[1] + step1[30];
1214 output[2] = step1[2] + step1[29];
1215 output[3] = step1[3] + step1[28];
1216 output[4] = step1[4] + step1[27];
1217 output[5] = step1[5] + step1[26];
1218 output[6] = step1[6] + step1[25];
1219 output[7] = step1[7] + step1[24];
1220 output[8] = step1[8] + step1[23];
1221 output[9] = step1[9] + step1[22];
1222 output[10] = step1[10] + step1[21];
1223 output[11] = step1[11] + step1[20];
1224 output[12] = step1[12] + step1[19];
1225 output[13] = step1[13] + step1[18];
1226 output[14] = step1[14] + step1[17];
1227 output[15] = step1[15] + step1[16];
1228 output[16] = step1[15] - step1[16];
1229 output[17] = step1[14] - step1[17];
1230 output[18] = step1[13] - step1[18];
1231 output[19] = step1[12] - step1[19];
1232 output[20] = step1[11] - step1[20];
1233 output[21] = step1[10] - step1[21];
1234 output[22] = step1[9] - step1[22];
1235 output[23] = step1[8] - step1[23];
1236 output[24] = step1[7] - step1[24];
1237 output[25] = step1[6] - step1[25];
1238 output[26] = step1[5] - step1[26];
1239 output[27] = step1[4] - step1[27];
1240 output[28] = step1[3] - step1[28];
1241 output[29] = step1[2] - step1[29];
1242 output[30] = step1[1] - step1[30];
1243 output[31] = step1[0] - step1[31];
1244 }
1245
void vp9_idct32x32_1024_add_c(const int16_t *input, uint8_t *dest, int stride) {
  // Full 32x32 inverse DCT: a 1-D transform over every row, then over every
  // column; results are rounded down to pixel range and added to the
  // prediction already in |dest|.
  int16_t out[32 * 32];
  int16_t col_in[32], col_out[32];
  int16_t *row_out = out;
  int i, j;

  // Rows. An all-zero coefficient row produces an all-zero output row, so
  // detect that case with a pairwise OR-reduction tree and skip the 1-D
  // transform for it.
  for (i = 0; i < 32; ++i) {
    int16_t nz[16];
    int len;
    for (j = 0; j < 16; ++j)
      nz[j] = input[2 * j] | input[2 * j + 1];
    for (len = 8; len >= 2; len >>= 1)
      for (j = 0; j < len; ++j)
        nz[j] = nz[2 * j] | nz[2 * j + 1];

    if (nz[0] | nz[1])
      idct32_1d(input, row_out);
    else
      vpx_memset(row_out, 0, sizeof(int16_t) * 32);
    input += 32;
    row_out += 32;
  }

  // Columns: gather, transform, then round/clip/accumulate into |dest|.
  for (i = 0; i < 32; ++i) {
    for (j = 0; j < 32; ++j)
      col_in[j] = out[j * 32 + i];
    idct32_1d(col_in, col_out);
    for (j = 0; j < 32; ++j)
      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(col_out[j], 6)
                                        + dest[j * stride + i]);
  }
}
1282
void vp9_idct32x32_34_add_c(const int16_t *input, uint8_t *dest, int stride) {
  // 32x32 inverse DCT for the sparse case (eob <= 34): the caller guarantees
  // all non-zero coefficients lie in the upper-left 8x8 corner, so only the
  // first 8 rows need a row transform; the rest of |out| stays zero.
  int16_t out[32 * 32] = {0};
  int16_t col_in[32], col_out[32];
  int16_t *row_out = out;
  int i, j;

  // Rows: transform only the 8 rows that can hold non-zero coefficients.
  for (i = 0; i < 8; ++i, input += 32, row_out += 32)
    idct32_1d(input, row_out);

  // Columns: full 1-D transform per column, then round/clip/add to |dest|.
  for (i = 0; i < 32; ++i) {
    for (j = 0; j < 32; ++j)
      col_in[j] = out[j * 32 + i];
    idct32_1d(col_in, col_out);
    for (j = 0; j < 32; ++j)
      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(col_out[j], 6)
                                        + dest[j * stride + i]);
  }
}
1307
vp9_idct32x32_1_add_c(const int16_t * input,uint8_t * dest,int stride)1308 void vp9_idct32x32_1_add_c(const int16_t *input, uint8_t *dest, int stride) {
1309 int i, j;
1310 int a1;
1311
1312 int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
1313 out = dct_const_round_shift(out * cospi_16_64);
1314 a1 = ROUND_POWER_OF_TWO(out, 6);
1315
1316 for (j = 0; j < 32; ++j) {
1317 for (i = 0; i < 32; ++i)
1318 dest[i] = clip_pixel(dest[i] + a1);
1319 dest += stride;
1320 }
1321 }
1322
1323 // idct
void vp9_idct4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob) {
  // 4x4 idct dispatcher: a single coefficient (eob <= 1) can only be the DC
  // term, so take the cheap DC-only path; otherwise run the full transform.
  if (eob <= 1)
    vp9_idct4x4_1_add(input, dest, stride);
  else
    vp9_idct4x4_16_add(input, dest, stride);
}
1330
1331
void vp9_iwht4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob) {
  // 4x4 inverse Walsh-Hadamard dispatcher: DC-only blocks (eob <= 1) use the
  // single-coefficient shortcut; anything else runs the full transform.
  if (eob <= 1)
    vp9_iwht4x4_1_add(input, dest, stride);
  else
    vp9_iwht4x4_16_add(input, dest, stride);
}
1338
void vp9_idct8x8_add(const int16_t *input, uint8_t *dest, int stride, int eob) {
  // 8x8 idct dispatcher, choosing an implementation by how many coefficients
  // are present (eob). When dc is 1, input[0] is already the reconstructed
  // value (no dequantization needed) and dc is counted in eobs, so eobs >= 1.
  // TODO(yunqingwang): "eobs = 1" case is also handled in vp9_short_idct8x8_c.
  // Combine that with code here.
  if (eob == 0)
    return;  // no coefficients: nothing to add

  if (eob == 1) {
    // DC only DCT coefficient.
    vp9_idct8x8_1_add(input, dest, stride);
  } else if (eob <= 10) {
    vp9_idct8x8_10_add(input, dest, stride);
  } else {
    vp9_idct8x8_64_add(input, dest, stride);
  }
}
1357
void vp9_idct16x16_add(const int16_t *input, uint8_t *dest, int stride,
                       int eob) {
  // 16x16 idct dispatcher: eob separates the DC-only, sparse (<= 10 coeffs)
  // and full-transform cases so cheap inputs take cheap paths.
  if (eob == 0)
    return;  // all-zero block

  if (eob == 1) {
    // DC only DCT coefficient.
    vp9_idct16x16_1_add(input, dest, stride);
  } else if (eob <= 10) {
    vp9_idct16x16_10_add(input, dest, stride);
  } else {
    vp9_idct16x16_256_add(input, dest, stride);
  }
}
1372
void vp9_idct32x32_add(const int16_t *input, uint8_t *dest, int stride,
                       int eob) {
  // 32x32 idct dispatcher keyed on eob: DC-only, upper-left-8x8-only
  // (eob <= 34), or the full 1024-coefficient transform.
  if (eob == 0)
    return;  // all-zero block

  if (eob == 1) {
    vp9_idct32x32_1_add(input, dest, stride);
  } else if (eob <= 34) {
    // Non-zero coefficients confined to the upper-left 8x8 corner.
    vp9_idct32x32_34_add(input, dest, stride);
  } else {
    vp9_idct32x32_1024_add(input, dest, stride);
  }
}
1385
1386 // iht
void vp9_iht4x4_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
                    int stride, int eob) {
  // 4x4 hybrid-transform dispatcher: non-DCT types always run the full
  // hybrid transform; pure DCT_DCT defers to the eob-aware idct path.
  if (tx_type != DCT_DCT)
    vp9_iht4x4_16_add(input, dest, stride, tx_type);
  else
    vp9_idct4x4_add(input, dest, stride, eob);
}
1394
void vp9_iht8x8_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
                    int stride, int eob) {
  // 8x8 hybrid-transform dispatcher: DCT_DCT goes through the eob-aware idct
  // path; other transform types run the full 8x8 hybrid transform whenever
  // any coefficient is present.
  if (tx_type == DCT_DCT) {
    vp9_idct8x8_add(input, dest, stride, eob);
  } else if (eob > 0) {
    vp9_iht8x8_64_add(input, dest, stride, tx_type);
  }
}
1405
void vp9_iht16x16_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
                      int stride, int eob) {
  // 16x16 hybrid-transform dispatcher: DCT_DCT defers to the eob-aware idct
  // path; other transform types run the full 16x16 hybrid transform whenever
  // any coefficient is present.
  if (tx_type == DCT_DCT) {
    vp9_idct16x16_add(input, dest, stride, eob);
  } else if (eob > 0) {
    vp9_iht16x16_256_add(input, dest, stride, tx_type);
  }
}
1416