1 /*
2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include <assert.h>
12 #include <math.h>
13
14 #include "./vpx_config.h"
15 #include "./vp9_rtcd.h"
16
17 #include "vp9/common/vp9_blockd.h"
18 #include "vp9/common/vp9_idct.h"
19 #include "vp9/common/vp9_systemdependent.h"
20
fdct_round_shift(tran_high_t input)21 static INLINE tran_high_t fdct_round_shift(tran_high_t input) {
22 tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
23 // TODO(debargha, peter.derivaz): Find new bounds for this assert
24 // and make the bounds consts.
25 // assert(INT16_MIN <= rv && rv <= INT16_MAX);
26 return rv;
27 }
28
fdct4(const tran_low_t * input,tran_low_t * output)29 static void fdct4(const tran_low_t *input, tran_low_t *output) {
30 tran_high_t step[4];
31 tran_high_t temp1, temp2;
32
33 step[0] = input[0] + input[3];
34 step[1] = input[1] + input[2];
35 step[2] = input[1] - input[2];
36 step[3] = input[0] - input[3];
37
38 temp1 = (step[0] + step[1]) * cospi_16_64;
39 temp2 = (step[0] - step[1]) * cospi_16_64;
40 output[0] = fdct_round_shift(temp1);
41 output[2] = fdct_round_shift(temp2);
42 temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;
43 temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;
44 output[1] = fdct_round_shift(temp1);
45 output[3] = fdct_round_shift(temp2);
46 }
47
vp9_fdct4x4_1_c(const int16_t * input,tran_low_t * output,int stride)48 void vp9_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride) {
49 int r, c;
50 tran_low_t sum = 0;
51 for (r = 0; r < 4; ++r)
52 for (c = 0; c < 4; ++c)
53 sum += input[r * stride + c];
54
55 output[0] = sum << 1;
56 output[1] = 0;
57 }
58
vp9_fdct4x4_c(const int16_t * input,tran_low_t * output,int stride)59 void vp9_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) {
60 // The 2D transform is done with two passes which are actually pretty
61 // similar. In the first one, we transform the columns and transpose
62 // the results. In the second one, we transform the rows. To achieve that,
63 // as the first pass results are transposed, we transpose the columns (that
64 // is the transposed rows) and transpose the results (so that it goes back
65 // in normal/row positions).
66 int pass;
67 // We need an intermediate buffer between passes.
68 tran_low_t intermediate[4 * 4];
69 const int16_t *in_pass0 = input;
70 const tran_low_t *in = NULL;
71 tran_low_t *out = intermediate;
72 // Do the two transform/transpose passes
73 for (pass = 0; pass < 2; ++pass) {
74 tran_high_t input[4]; // canbe16
75 tran_high_t step[4]; // canbe16
76 tran_high_t temp1, temp2; // needs32
77 int i;
78 for (i = 0; i < 4; ++i) {
79 // Load inputs.
80 if (0 == pass) {
81 input[0] = in_pass0[0 * stride] * 16;
82 input[1] = in_pass0[1 * stride] * 16;
83 input[2] = in_pass0[2 * stride] * 16;
84 input[3] = in_pass0[3 * stride] * 16;
85 if (i == 0 && input[0]) {
86 input[0] += 1;
87 }
88 } else {
89 input[0] = in[0 * 4];
90 input[1] = in[1 * 4];
91 input[2] = in[2 * 4];
92 input[3] = in[3 * 4];
93 }
94 // Transform.
95 step[0] = input[0] + input[3];
96 step[1] = input[1] + input[2];
97 step[2] = input[1] - input[2];
98 step[3] = input[0] - input[3];
99 temp1 = (step[0] + step[1]) * cospi_16_64;
100 temp2 = (step[0] - step[1]) * cospi_16_64;
101 out[0] = fdct_round_shift(temp1);
102 out[2] = fdct_round_shift(temp2);
103 temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;
104 temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;
105 out[1] = fdct_round_shift(temp1);
106 out[3] = fdct_round_shift(temp2);
107 // Do next column (which is a transposed row in second/horizontal pass)
108 in_pass0++;
109 in++;
110 out += 4;
111 }
112 // Setup in/out for next pass.
113 in = intermediate;
114 out = output;
115 }
116
117 {
118 int i, j;
119 for (i = 0; i < 4; ++i) {
120 for (j = 0; j < 4; ++j)
121 output[j + i * 4] = (output[j + i * 4] + 1) >> 2;
122 }
123 }
124 }
125
fadst4(const tran_low_t * input,tran_low_t * output)126 static void fadst4(const tran_low_t *input, tran_low_t *output) {
127 tran_high_t x0, x1, x2, x3;
128 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
129
130 x0 = input[0];
131 x1 = input[1];
132 x2 = input[2];
133 x3 = input[3];
134
135 if (!(x0 | x1 | x2 | x3)) {
136 output[0] = output[1] = output[2] = output[3] = 0;
137 return;
138 }
139
140 s0 = sinpi_1_9 * x0;
141 s1 = sinpi_4_9 * x0;
142 s2 = sinpi_2_9 * x1;
143 s3 = sinpi_1_9 * x1;
144 s4 = sinpi_3_9 * x2;
145 s5 = sinpi_4_9 * x3;
146 s6 = sinpi_2_9 * x3;
147 s7 = x0 + x1 - x3;
148
149 x0 = s0 + s2 + s5;
150 x1 = sinpi_3_9 * s7;
151 x2 = s1 - s3 + s6;
152 x3 = s4;
153
154 s0 = x0 + x3;
155 s1 = x1;
156 s2 = x2 - x3;
157 s3 = x2 - x0 + x3;
158
159 // 1-D transform scaling factor is sqrt(2).
160 output[0] = fdct_round_shift(s0);
161 output[1] = fdct_round_shift(s1);
162 output[2] = fdct_round_shift(s2);
163 output[3] = fdct_round_shift(s3);
164 }
165
166 static const transform_2d FHT_4[] = {
167 { fdct4, fdct4 }, // DCT_DCT = 0
168 { fadst4, fdct4 }, // ADST_DCT = 1
169 { fdct4, fadst4 }, // DCT_ADST = 2
170 { fadst4, fadst4 } // ADST_ADST = 3
171 };
172
vp9_fht4x4_c(const int16_t * input,tran_low_t * output,int stride,int tx_type)173 void vp9_fht4x4_c(const int16_t *input, tran_low_t *output,
174 int stride, int tx_type) {
175 if (tx_type == DCT_DCT) {
176 vp9_fdct4x4_c(input, output, stride);
177 } else {
178 tran_low_t out[4 * 4];
179 tran_low_t *outptr = &out[0];
180 int i, j;
181 tran_low_t temp_in[4], temp_out[4];
182 const transform_2d ht = FHT_4[tx_type];
183
184 // Columns
185 for (i = 0; i < 4; ++i) {
186 for (j = 0; j < 4; ++j)
187 temp_in[j] = input[j * stride + i] * 16;
188 if (i == 0 && temp_in[0])
189 temp_in[0] += 1;
190 ht.cols(temp_in, temp_out);
191 for (j = 0; j < 4; ++j)
192 outptr[j * 4 + i] = temp_out[j];
193 }
194
195 // Rows
196 for (i = 0; i < 4; ++i) {
197 for (j = 0; j < 4; ++j)
198 temp_in[j] = out[j + i * 4];
199 ht.rows(temp_in, temp_out);
200 for (j = 0; j < 4; ++j)
201 output[j + i * 4] = (temp_out[j] + 1) >> 2;
202 }
203 }
204 }
205
fdct8(const tran_low_t * input,tran_low_t * output)206 static void fdct8(const tran_low_t *input, tran_low_t *output) {
207 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16
208 tran_high_t t0, t1, t2, t3; // needs32
209 tran_high_t x0, x1, x2, x3; // canbe16
210
211 // stage 1
212 s0 = input[0] + input[7];
213 s1 = input[1] + input[6];
214 s2 = input[2] + input[5];
215 s3 = input[3] + input[4];
216 s4 = input[3] - input[4];
217 s5 = input[2] - input[5];
218 s6 = input[1] - input[6];
219 s7 = input[0] - input[7];
220
221 // fdct4(step, step);
222 x0 = s0 + s3;
223 x1 = s1 + s2;
224 x2 = s1 - s2;
225 x3 = s0 - s3;
226 t0 = (x0 + x1) * cospi_16_64;
227 t1 = (x0 - x1) * cospi_16_64;
228 t2 = x2 * cospi_24_64 + x3 * cospi_8_64;
229 t3 = -x2 * cospi_8_64 + x3 * cospi_24_64;
230 output[0] = fdct_round_shift(t0);
231 output[2] = fdct_round_shift(t2);
232 output[4] = fdct_round_shift(t1);
233 output[6] = fdct_round_shift(t3);
234
235 // Stage 2
236 t0 = (s6 - s5) * cospi_16_64;
237 t1 = (s6 + s5) * cospi_16_64;
238 t2 = fdct_round_shift(t0);
239 t3 = fdct_round_shift(t1);
240
241 // Stage 3
242 x0 = s4 + t2;
243 x1 = s4 - t2;
244 x2 = s7 - t3;
245 x3 = s7 + t3;
246
247 // Stage 4
248 t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
249 t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
250 t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
251 t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
252 output[1] = fdct_round_shift(t0);
253 output[3] = fdct_round_shift(t2);
254 output[5] = fdct_round_shift(t1);
255 output[7] = fdct_round_shift(t3);
256 }
257
vp9_fdct8x8_1_c(const int16_t * input,tran_low_t * output,int stride)258 void vp9_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride) {
259 int r, c;
260 tran_low_t sum = 0;
261 for (r = 0; r < 8; ++r)
262 for (c = 0; c < 8; ++c)
263 sum += input[r * stride + c];
264
265 output[0] = sum;
266 output[1] = 0;
267 }
268
vp9_fdct8x8_c(const int16_t * input,tran_low_t * final_output,int stride)269 void vp9_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) {
270 int i, j;
271 tran_low_t intermediate[64];
272
273 // Transform columns
274 {
275 tran_low_t *output = intermediate;
276 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16
277 tran_high_t t0, t1, t2, t3; // needs32
278 tran_high_t x0, x1, x2, x3; // canbe16
279
280 int i;
281 for (i = 0; i < 8; i++) {
282 // stage 1
283 s0 = (input[0 * stride] + input[7 * stride]) * 4;
284 s1 = (input[1 * stride] + input[6 * stride]) * 4;
285 s2 = (input[2 * stride] + input[5 * stride]) * 4;
286 s3 = (input[3 * stride] + input[4 * stride]) * 4;
287 s4 = (input[3 * stride] - input[4 * stride]) * 4;
288 s5 = (input[2 * stride] - input[5 * stride]) * 4;
289 s6 = (input[1 * stride] - input[6 * stride]) * 4;
290 s7 = (input[0 * stride] - input[7 * stride]) * 4;
291
292 // fdct4(step, step);
293 x0 = s0 + s3;
294 x1 = s1 + s2;
295 x2 = s1 - s2;
296 x3 = s0 - s3;
297 t0 = (x0 + x1) * cospi_16_64;
298 t1 = (x0 - x1) * cospi_16_64;
299 t2 = x2 * cospi_24_64 + x3 * cospi_8_64;
300 t3 = -x2 * cospi_8_64 + x3 * cospi_24_64;
301 output[0 * 8] = fdct_round_shift(t0);
302 output[2 * 8] = fdct_round_shift(t2);
303 output[4 * 8] = fdct_round_shift(t1);
304 output[6 * 8] = fdct_round_shift(t3);
305
306 // Stage 2
307 t0 = (s6 - s5) * cospi_16_64;
308 t1 = (s6 + s5) * cospi_16_64;
309 t2 = fdct_round_shift(t0);
310 t3 = fdct_round_shift(t1);
311
312 // Stage 3
313 x0 = s4 + t2;
314 x1 = s4 - t2;
315 x2 = s7 - t3;
316 x3 = s7 + t3;
317
318 // Stage 4
319 t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
320 t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
321 t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
322 t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
323 output[1 * 8] = fdct_round_shift(t0);
324 output[3 * 8] = fdct_round_shift(t2);
325 output[5 * 8] = fdct_round_shift(t1);
326 output[7 * 8] = fdct_round_shift(t3);
327 input++;
328 output++;
329 }
330 }
331
332 // Rows
333 for (i = 0; i < 8; ++i) {
334 fdct8(&intermediate[i * 8], &final_output[i * 8]);
335 for (j = 0; j < 8; ++j)
336 final_output[j + i * 8] /= 2;
337 }
338 }
339
vp9_fdct16x16_1_c(const int16_t * input,tran_low_t * output,int stride)340 void vp9_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride) {
341 int r, c;
342 tran_low_t sum = 0;
343 for (r = 0; r < 16; ++r)
344 for (c = 0; c < 16; ++c)
345 sum += input[r * stride + c];
346
347 output[0] = sum >> 1;
348 output[1] = 0;
349 }
350
vp9_fdct16x16_c(const int16_t * input,tran_low_t * output,int stride)351 void vp9_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) {
352 // The 2D transform is done with two passes which are actually pretty
353 // similar. In the first one, we transform the columns and transpose
354 // the results. In the second one, we transform the rows. To achieve that,
355 // as the first pass results are transposed, we transpose the columns (that
356 // is the transposed rows) and transpose the results (so that it goes back
357 // in normal/row positions).
358 int pass;
359 // We need an intermediate buffer between passes.
360 tran_low_t intermediate[256];
361 const int16_t *in_pass0 = input;
362 const tran_low_t *in = NULL;
363 tran_low_t *out = intermediate;
364 // Do the two transform/transpose passes
365 for (pass = 0; pass < 2; ++pass) {
366 tran_high_t step1[8]; // canbe16
367 tran_high_t step2[8]; // canbe16
368 tran_high_t step3[8]; // canbe16
369 tran_high_t input[8]; // canbe16
370 tran_high_t temp1, temp2; // needs32
371 int i;
372 for (i = 0; i < 16; i++) {
373 if (0 == pass) {
374 // Calculate input for the first 8 results.
375 input[0] = (in_pass0[0 * stride] + in_pass0[15 * stride]) * 4;
376 input[1] = (in_pass0[1 * stride] + in_pass0[14 * stride]) * 4;
377 input[2] = (in_pass0[2 * stride] + in_pass0[13 * stride]) * 4;
378 input[3] = (in_pass0[3 * stride] + in_pass0[12 * stride]) * 4;
379 input[4] = (in_pass0[4 * stride] + in_pass0[11 * stride]) * 4;
380 input[5] = (in_pass0[5 * stride] + in_pass0[10 * stride]) * 4;
381 input[6] = (in_pass0[6 * stride] + in_pass0[ 9 * stride]) * 4;
382 input[7] = (in_pass0[7 * stride] + in_pass0[ 8 * stride]) * 4;
383 // Calculate input for the next 8 results.
384 step1[0] = (in_pass0[7 * stride] - in_pass0[ 8 * stride]) * 4;
385 step1[1] = (in_pass0[6 * stride] - in_pass0[ 9 * stride]) * 4;
386 step1[2] = (in_pass0[5 * stride] - in_pass0[10 * stride]) * 4;
387 step1[3] = (in_pass0[4 * stride] - in_pass0[11 * stride]) * 4;
388 step1[4] = (in_pass0[3 * stride] - in_pass0[12 * stride]) * 4;
389 step1[5] = (in_pass0[2 * stride] - in_pass0[13 * stride]) * 4;
390 step1[6] = (in_pass0[1 * stride] - in_pass0[14 * stride]) * 4;
391 step1[7] = (in_pass0[0 * stride] - in_pass0[15 * stride]) * 4;
392 } else {
393 // Calculate input for the first 8 results.
394 input[0] = ((in[0 * 16] + 1) >> 2) + ((in[15 * 16] + 1) >> 2);
395 input[1] = ((in[1 * 16] + 1) >> 2) + ((in[14 * 16] + 1) >> 2);
396 input[2] = ((in[2 * 16] + 1) >> 2) + ((in[13 * 16] + 1) >> 2);
397 input[3] = ((in[3 * 16] + 1) >> 2) + ((in[12 * 16] + 1) >> 2);
398 input[4] = ((in[4 * 16] + 1) >> 2) + ((in[11 * 16] + 1) >> 2);
399 input[5] = ((in[5 * 16] + 1) >> 2) + ((in[10 * 16] + 1) >> 2);
400 input[6] = ((in[6 * 16] + 1) >> 2) + ((in[ 9 * 16] + 1) >> 2);
401 input[7] = ((in[7 * 16] + 1) >> 2) + ((in[ 8 * 16] + 1) >> 2);
402 // Calculate input for the next 8 results.
403 step1[0] = ((in[7 * 16] + 1) >> 2) - ((in[ 8 * 16] + 1) >> 2);
404 step1[1] = ((in[6 * 16] + 1) >> 2) - ((in[ 9 * 16] + 1) >> 2);
405 step1[2] = ((in[5 * 16] + 1) >> 2) - ((in[10 * 16] + 1) >> 2);
406 step1[3] = ((in[4 * 16] + 1) >> 2) - ((in[11 * 16] + 1) >> 2);
407 step1[4] = ((in[3 * 16] + 1) >> 2) - ((in[12 * 16] + 1) >> 2);
408 step1[5] = ((in[2 * 16] + 1) >> 2) - ((in[13 * 16] + 1) >> 2);
409 step1[6] = ((in[1 * 16] + 1) >> 2) - ((in[14 * 16] + 1) >> 2);
410 step1[7] = ((in[0 * 16] + 1) >> 2) - ((in[15 * 16] + 1) >> 2);
411 }
412 // Work on the first eight values; fdct8(input, even_results);
413 {
414 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16
415 tran_high_t t0, t1, t2, t3; // needs32
416 tran_high_t x0, x1, x2, x3; // canbe16
417
418 // stage 1
419 s0 = input[0] + input[7];
420 s1 = input[1] + input[6];
421 s2 = input[2] + input[5];
422 s3 = input[3] + input[4];
423 s4 = input[3] - input[4];
424 s5 = input[2] - input[5];
425 s6 = input[1] - input[6];
426 s7 = input[0] - input[7];
427
428 // fdct4(step, step);
429 x0 = s0 + s3;
430 x1 = s1 + s2;
431 x2 = s1 - s2;
432 x3 = s0 - s3;
433 t0 = (x0 + x1) * cospi_16_64;
434 t1 = (x0 - x1) * cospi_16_64;
435 t2 = x3 * cospi_8_64 + x2 * cospi_24_64;
436 t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
437 out[0] = fdct_round_shift(t0);
438 out[4] = fdct_round_shift(t2);
439 out[8] = fdct_round_shift(t1);
440 out[12] = fdct_round_shift(t3);
441
442 // Stage 2
443 t0 = (s6 - s5) * cospi_16_64;
444 t1 = (s6 + s5) * cospi_16_64;
445 t2 = fdct_round_shift(t0);
446 t3 = fdct_round_shift(t1);
447
448 // Stage 3
449 x0 = s4 + t2;
450 x1 = s4 - t2;
451 x2 = s7 - t3;
452 x3 = s7 + t3;
453
454 // Stage 4
455 t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
456 t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
457 t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
458 t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
459 out[2] = fdct_round_shift(t0);
460 out[6] = fdct_round_shift(t2);
461 out[10] = fdct_round_shift(t1);
462 out[14] = fdct_round_shift(t3);
463 }
464 // Work on the next eight values; step1 -> odd_results
465 {
466 // step 2
467 temp1 = (step1[5] - step1[2]) * cospi_16_64;
468 temp2 = (step1[4] - step1[3]) * cospi_16_64;
469 step2[2] = fdct_round_shift(temp1);
470 step2[3] = fdct_round_shift(temp2);
471 temp1 = (step1[4] + step1[3]) * cospi_16_64;
472 temp2 = (step1[5] + step1[2]) * cospi_16_64;
473 step2[4] = fdct_round_shift(temp1);
474 step2[5] = fdct_round_shift(temp2);
475 // step 3
476 step3[0] = step1[0] + step2[3];
477 step3[1] = step1[1] + step2[2];
478 step3[2] = step1[1] - step2[2];
479 step3[3] = step1[0] - step2[3];
480 step3[4] = step1[7] - step2[4];
481 step3[5] = step1[6] - step2[5];
482 step3[6] = step1[6] + step2[5];
483 step3[7] = step1[7] + step2[4];
484 // step 4
485 temp1 = step3[1] * -cospi_8_64 + step3[6] * cospi_24_64;
486 temp2 = step3[2] * cospi_24_64 + step3[5] * cospi_8_64;
487 step2[1] = fdct_round_shift(temp1);
488 step2[2] = fdct_round_shift(temp2);
489 temp1 = step3[2] * cospi_8_64 - step3[5] * cospi_24_64;
490 temp2 = step3[1] * cospi_24_64 + step3[6] * cospi_8_64;
491 step2[5] = fdct_round_shift(temp1);
492 step2[6] = fdct_round_shift(temp2);
493 // step 5
494 step1[0] = step3[0] + step2[1];
495 step1[1] = step3[0] - step2[1];
496 step1[2] = step3[3] + step2[2];
497 step1[3] = step3[3] - step2[2];
498 step1[4] = step3[4] - step2[5];
499 step1[5] = step3[4] + step2[5];
500 step1[6] = step3[7] - step2[6];
501 step1[7] = step3[7] + step2[6];
502 // step 6
503 temp1 = step1[0] * cospi_30_64 + step1[7] * cospi_2_64;
504 temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
505 out[1] = fdct_round_shift(temp1);
506 out[9] = fdct_round_shift(temp2);
507 temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
508 temp2 = step1[3] * cospi_6_64 + step1[4] * cospi_26_64;
509 out[5] = fdct_round_shift(temp1);
510 out[13] = fdct_round_shift(temp2);
511 temp1 = step1[3] * -cospi_26_64 + step1[4] * cospi_6_64;
512 temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
513 out[3] = fdct_round_shift(temp1);
514 out[11] = fdct_round_shift(temp2);
515 temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
516 temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64;
517 out[7] = fdct_round_shift(temp1);
518 out[15] = fdct_round_shift(temp2);
519 }
520 // Do next column (which is a transposed row in second/horizontal pass)
521 in++;
522 in_pass0++;
523 out += 16;
524 }
525 // Setup in/out for next pass.
526 in = intermediate;
527 out = output;
528 }
529 }
530
fadst8(const tran_low_t * input,tran_low_t * output)531 static void fadst8(const tran_low_t *input, tran_low_t *output) {
532 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
533
534 tran_high_t x0 = input[7];
535 tran_high_t x1 = input[0];
536 tran_high_t x2 = input[5];
537 tran_high_t x3 = input[2];
538 tran_high_t x4 = input[3];
539 tran_high_t x5 = input[4];
540 tran_high_t x6 = input[1];
541 tran_high_t x7 = input[6];
542
543 // stage 1
544 s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
545 s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
546 s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
547 s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
548 s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
549 s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
550 s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
551 s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
552
553 x0 = fdct_round_shift(s0 + s4);
554 x1 = fdct_round_shift(s1 + s5);
555 x2 = fdct_round_shift(s2 + s6);
556 x3 = fdct_round_shift(s3 + s7);
557 x4 = fdct_round_shift(s0 - s4);
558 x5 = fdct_round_shift(s1 - s5);
559 x6 = fdct_round_shift(s2 - s6);
560 x7 = fdct_round_shift(s3 - s7);
561
562 // stage 2
563 s0 = x0;
564 s1 = x1;
565 s2 = x2;
566 s3 = x3;
567 s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
568 s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
569 s6 = - cospi_24_64 * x6 + cospi_8_64 * x7;
570 s7 = cospi_8_64 * x6 + cospi_24_64 * x7;
571
572 x0 = s0 + s2;
573 x1 = s1 + s3;
574 x2 = s0 - s2;
575 x3 = s1 - s3;
576 x4 = fdct_round_shift(s4 + s6);
577 x5 = fdct_round_shift(s5 + s7);
578 x6 = fdct_round_shift(s4 - s6);
579 x7 = fdct_round_shift(s5 - s7);
580
581 // stage 3
582 s2 = cospi_16_64 * (x2 + x3);
583 s3 = cospi_16_64 * (x2 - x3);
584 s6 = cospi_16_64 * (x6 + x7);
585 s7 = cospi_16_64 * (x6 - x7);
586
587 x2 = fdct_round_shift(s2);
588 x3 = fdct_round_shift(s3);
589 x6 = fdct_round_shift(s6);
590 x7 = fdct_round_shift(s7);
591
592 output[0] = x0;
593 output[1] = - x4;
594 output[2] = x6;
595 output[3] = - x2;
596 output[4] = x3;
597 output[5] = - x7;
598 output[6] = x5;
599 output[7] = - x1;
600 }
601
602 static const transform_2d FHT_8[] = {
603 { fdct8, fdct8 }, // DCT_DCT = 0
604 { fadst8, fdct8 }, // ADST_DCT = 1
605 { fdct8, fadst8 }, // DCT_ADST = 2
606 { fadst8, fadst8 } // ADST_ADST = 3
607 };
608
vp9_fht8x8_c(const int16_t * input,tran_low_t * output,int stride,int tx_type)609 void vp9_fht8x8_c(const int16_t *input, tran_low_t *output,
610 int stride, int tx_type) {
611 if (tx_type == DCT_DCT) {
612 vp9_fdct8x8_c(input, output, stride);
613 } else {
614 tran_low_t out[64];
615 tran_low_t *outptr = &out[0];
616 int i, j;
617 tran_low_t temp_in[8], temp_out[8];
618 const transform_2d ht = FHT_8[tx_type];
619
620 // Columns
621 for (i = 0; i < 8; ++i) {
622 for (j = 0; j < 8; ++j)
623 temp_in[j] = input[j * stride + i] * 4;
624 ht.cols(temp_in, temp_out);
625 for (j = 0; j < 8; ++j)
626 outptr[j * 8 + i] = temp_out[j];
627 }
628
629 // Rows
630 for (i = 0; i < 8; ++i) {
631 for (j = 0; j < 8; ++j)
632 temp_in[j] = out[j + i * 8];
633 ht.rows(temp_in, temp_out);
634 for (j = 0; j < 8; ++j)
635 output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
636 }
637 }
638 }
639
640 /* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per
641 pixel. */
vp9_fwht4x4_c(const int16_t * input,tran_low_t * output,int stride)642 void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride) {
643 int i;
644 tran_high_t a1, b1, c1, d1, e1;
645 const int16_t *ip_pass0 = input;
646 const tran_low_t *ip = NULL;
647 tran_low_t *op = output;
648
649 for (i = 0; i < 4; i++) {
650 a1 = ip_pass0[0 * stride];
651 b1 = ip_pass0[1 * stride];
652 c1 = ip_pass0[2 * stride];
653 d1 = ip_pass0[3 * stride];
654
655 a1 += b1;
656 d1 = d1 - c1;
657 e1 = (a1 - d1) >> 1;
658 b1 = e1 - b1;
659 c1 = e1 - c1;
660 a1 -= c1;
661 d1 += b1;
662 op[0] = a1;
663 op[4] = c1;
664 op[8] = d1;
665 op[12] = b1;
666
667 ip_pass0++;
668 op++;
669 }
670 ip = output;
671 op = output;
672
673 for (i = 0; i < 4; i++) {
674 a1 = ip[0];
675 b1 = ip[1];
676 c1 = ip[2];
677 d1 = ip[3];
678
679 a1 += b1;
680 d1 -= c1;
681 e1 = (a1 - d1) >> 1;
682 b1 = e1 - b1;
683 c1 = e1 - c1;
684 a1 -= c1;
685 d1 += b1;
686 op[0] = a1 * UNIT_QUANT_FACTOR;
687 op[1] = c1 * UNIT_QUANT_FACTOR;
688 op[2] = d1 * UNIT_QUANT_FACTOR;
689 op[3] = b1 * UNIT_QUANT_FACTOR;
690
691 ip += 4;
692 op += 4;
693 }
694 }
695
696 // Rewrote to use same algorithm as others.
fdct16(const tran_low_t in[16],tran_low_t out[16])697 static void fdct16(const tran_low_t in[16], tran_low_t out[16]) {
698 tran_high_t step1[8]; // canbe16
699 tran_high_t step2[8]; // canbe16
700 tran_high_t step3[8]; // canbe16
701 tran_high_t input[8]; // canbe16
702 tran_high_t temp1, temp2; // needs32
703
704 // step 1
705 input[0] = in[0] + in[15];
706 input[1] = in[1] + in[14];
707 input[2] = in[2] + in[13];
708 input[3] = in[3] + in[12];
709 input[4] = in[4] + in[11];
710 input[5] = in[5] + in[10];
711 input[6] = in[6] + in[ 9];
712 input[7] = in[7] + in[ 8];
713
714 step1[0] = in[7] - in[ 8];
715 step1[1] = in[6] - in[ 9];
716 step1[2] = in[5] - in[10];
717 step1[3] = in[4] - in[11];
718 step1[4] = in[3] - in[12];
719 step1[5] = in[2] - in[13];
720 step1[6] = in[1] - in[14];
721 step1[7] = in[0] - in[15];
722
723 // fdct8(step, step);
724 {
725 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16
726 tran_high_t t0, t1, t2, t3; // needs32
727 tran_high_t x0, x1, x2, x3; // canbe16
728
729 // stage 1
730 s0 = input[0] + input[7];
731 s1 = input[1] + input[6];
732 s2 = input[2] + input[5];
733 s3 = input[3] + input[4];
734 s4 = input[3] - input[4];
735 s5 = input[2] - input[5];
736 s6 = input[1] - input[6];
737 s7 = input[0] - input[7];
738
739 // fdct4(step, step);
740 x0 = s0 + s3;
741 x1 = s1 + s2;
742 x2 = s1 - s2;
743 x3 = s0 - s3;
744 t0 = (x0 + x1) * cospi_16_64;
745 t1 = (x0 - x1) * cospi_16_64;
746 t2 = x3 * cospi_8_64 + x2 * cospi_24_64;
747 t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
748 out[0] = fdct_round_shift(t0);
749 out[4] = fdct_round_shift(t2);
750 out[8] = fdct_round_shift(t1);
751 out[12] = fdct_round_shift(t3);
752
753 // Stage 2
754 t0 = (s6 - s5) * cospi_16_64;
755 t1 = (s6 + s5) * cospi_16_64;
756 t2 = fdct_round_shift(t0);
757 t3 = fdct_round_shift(t1);
758
759 // Stage 3
760 x0 = s4 + t2;
761 x1 = s4 - t2;
762 x2 = s7 - t3;
763 x3 = s7 + t3;
764
765 // Stage 4
766 t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
767 t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
768 t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
769 t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
770 out[2] = fdct_round_shift(t0);
771 out[6] = fdct_round_shift(t2);
772 out[10] = fdct_round_shift(t1);
773 out[14] = fdct_round_shift(t3);
774 }
775
776 // step 2
777 temp1 = (step1[5] - step1[2]) * cospi_16_64;
778 temp2 = (step1[4] - step1[3]) * cospi_16_64;
779 step2[2] = fdct_round_shift(temp1);
780 step2[3] = fdct_round_shift(temp2);
781 temp1 = (step1[4] + step1[3]) * cospi_16_64;
782 temp2 = (step1[5] + step1[2]) * cospi_16_64;
783 step2[4] = fdct_round_shift(temp1);
784 step2[5] = fdct_round_shift(temp2);
785
786 // step 3
787 step3[0] = step1[0] + step2[3];
788 step3[1] = step1[1] + step2[2];
789 step3[2] = step1[1] - step2[2];
790 step3[3] = step1[0] - step2[3];
791 step3[4] = step1[7] - step2[4];
792 step3[5] = step1[6] - step2[5];
793 step3[6] = step1[6] + step2[5];
794 step3[7] = step1[7] + step2[4];
795
796 // step 4
797 temp1 = step3[1] * -cospi_8_64 + step3[6] * cospi_24_64;
798 temp2 = step3[2] * cospi_24_64 + step3[5] * cospi_8_64;
799 step2[1] = fdct_round_shift(temp1);
800 step2[2] = fdct_round_shift(temp2);
801 temp1 = step3[2] * cospi_8_64 - step3[5] * cospi_24_64;
802 temp2 = step3[1] * cospi_24_64 + step3[6] * cospi_8_64;
803 step2[5] = fdct_round_shift(temp1);
804 step2[6] = fdct_round_shift(temp2);
805
806 // step 5
807 step1[0] = step3[0] + step2[1];
808 step1[1] = step3[0] - step2[1];
809 step1[2] = step3[3] + step2[2];
810 step1[3] = step3[3] - step2[2];
811 step1[4] = step3[4] - step2[5];
812 step1[5] = step3[4] + step2[5];
813 step1[6] = step3[7] - step2[6];
814 step1[7] = step3[7] + step2[6];
815
816 // step 6
817 temp1 = step1[0] * cospi_30_64 + step1[7] * cospi_2_64;
818 temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
819 out[1] = fdct_round_shift(temp1);
820 out[9] = fdct_round_shift(temp2);
821
822 temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
823 temp2 = step1[3] * cospi_6_64 + step1[4] * cospi_26_64;
824 out[5] = fdct_round_shift(temp1);
825 out[13] = fdct_round_shift(temp2);
826
827 temp1 = step1[3] * -cospi_26_64 + step1[4] * cospi_6_64;
828 temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
829 out[3] = fdct_round_shift(temp1);
830 out[11] = fdct_round_shift(temp2);
831
832 temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
833 temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64;
834 out[7] = fdct_round_shift(temp1);
835 out[15] = fdct_round_shift(temp2);
836 }
837
fadst16(const tran_low_t * input,tran_low_t * output)838 static void fadst16(const tran_low_t *input, tran_low_t *output) {
839 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
840 tran_high_t s9, s10, s11, s12, s13, s14, s15;
841
842 tran_high_t x0 = input[15];
843 tran_high_t x1 = input[0];
844 tran_high_t x2 = input[13];
845 tran_high_t x3 = input[2];
846 tran_high_t x4 = input[11];
847 tran_high_t x5 = input[4];
848 tran_high_t x6 = input[9];
849 tran_high_t x7 = input[6];
850 tran_high_t x8 = input[7];
851 tran_high_t x9 = input[8];
852 tran_high_t x10 = input[5];
853 tran_high_t x11 = input[10];
854 tran_high_t x12 = input[3];
855 tran_high_t x13 = input[12];
856 tran_high_t x14 = input[1];
857 tran_high_t x15 = input[14];
858
859 // stage 1
860 s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
861 s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
862 s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
863 s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
864 s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
865 s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
866 s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
867 s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
868 s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
869 s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
870 s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
871 s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
872 s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
873 s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
874 s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
875 s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
876
877 x0 = fdct_round_shift(s0 + s8);
878 x1 = fdct_round_shift(s1 + s9);
879 x2 = fdct_round_shift(s2 + s10);
880 x3 = fdct_round_shift(s3 + s11);
881 x4 = fdct_round_shift(s4 + s12);
882 x5 = fdct_round_shift(s5 + s13);
883 x6 = fdct_round_shift(s6 + s14);
884 x7 = fdct_round_shift(s7 + s15);
885 x8 = fdct_round_shift(s0 - s8);
886 x9 = fdct_round_shift(s1 - s9);
887 x10 = fdct_round_shift(s2 - s10);
888 x11 = fdct_round_shift(s3 - s11);
889 x12 = fdct_round_shift(s4 - s12);
890 x13 = fdct_round_shift(s5 - s13);
891 x14 = fdct_round_shift(s6 - s14);
892 x15 = fdct_round_shift(s7 - s15);
893
894 // stage 2
895 s0 = x0;
896 s1 = x1;
897 s2 = x2;
898 s3 = x3;
899 s4 = x4;
900 s5 = x5;
901 s6 = x6;
902 s7 = x7;
903 s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
904 s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
905 s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
906 s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
907 s12 = - x12 * cospi_28_64 + x13 * cospi_4_64;
908 s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
909 s14 = - x14 * cospi_12_64 + x15 * cospi_20_64;
910 s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
911
912 x0 = s0 + s4;
913 x1 = s1 + s5;
914 x2 = s2 + s6;
915 x3 = s3 + s7;
916 x4 = s0 - s4;
917 x5 = s1 - s5;
918 x6 = s2 - s6;
919 x7 = s3 - s7;
920 x8 = fdct_round_shift(s8 + s12);
921 x9 = fdct_round_shift(s9 + s13);
922 x10 = fdct_round_shift(s10 + s14);
923 x11 = fdct_round_shift(s11 + s15);
924 x12 = fdct_round_shift(s8 - s12);
925 x13 = fdct_round_shift(s9 - s13);
926 x14 = fdct_round_shift(s10 - s14);
927 x15 = fdct_round_shift(s11 - s15);
928
929 // stage 3
930 s0 = x0;
931 s1 = x1;
932 s2 = x2;
933 s3 = x3;
934 s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
935 s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
936 s6 = - x6 * cospi_24_64 + x7 * cospi_8_64;
937 s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
938 s8 = x8;
939 s9 = x9;
940 s10 = x10;
941 s11 = x11;
942 s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
943 s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
944 s14 = - x14 * cospi_24_64 + x15 * cospi_8_64;
945 s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
946
947 x0 = s0 + s2;
948 x1 = s1 + s3;
949 x2 = s0 - s2;
950 x3 = s1 - s3;
951 x4 = fdct_round_shift(s4 + s6);
952 x5 = fdct_round_shift(s5 + s7);
953 x6 = fdct_round_shift(s4 - s6);
954 x7 = fdct_round_shift(s5 - s7);
955 x8 = s8 + s10;
956 x9 = s9 + s11;
957 x10 = s8 - s10;
958 x11 = s9 - s11;
959 x12 = fdct_round_shift(s12 + s14);
960 x13 = fdct_round_shift(s13 + s15);
961 x14 = fdct_round_shift(s12 - s14);
962 x15 = fdct_round_shift(s13 - s15);
963
964 // stage 4
965 s2 = (- cospi_16_64) * (x2 + x3);
966 s3 = cospi_16_64 * (x2 - x3);
967 s6 = cospi_16_64 * (x6 + x7);
968 s7 = cospi_16_64 * (- x6 + x7);
969 s10 = cospi_16_64 * (x10 + x11);
970 s11 = cospi_16_64 * (- x10 + x11);
971 s14 = (- cospi_16_64) * (x14 + x15);
972 s15 = cospi_16_64 * (x14 - x15);
973
974 x2 = fdct_round_shift(s2);
975 x3 = fdct_round_shift(s3);
976 x6 = fdct_round_shift(s6);
977 x7 = fdct_round_shift(s7);
978 x10 = fdct_round_shift(s10);
979 x11 = fdct_round_shift(s11);
980 x14 = fdct_round_shift(s14);
981 x15 = fdct_round_shift(s15);
982
983 output[0] = x0;
984 output[1] = - x8;
985 output[2] = x12;
986 output[3] = - x4;
987 output[4] = x6;
988 output[5] = x14;
989 output[6] = x10;
990 output[7] = x2;
991 output[8] = x3;
992 output[9] = x11;
993 output[10] = x15;
994 output[11] = x7;
995 output[12] = x5;
996 output[13] = - x13;
997 output[14] = x9;
998 output[15] = - x1;
999 }
1000
1001 static const transform_2d FHT_16[] = {
1002 { fdct16, fdct16 }, // DCT_DCT = 0
1003 { fadst16, fdct16 }, // ADST_DCT = 1
1004 { fdct16, fadst16 }, // DCT_ADST = 2
1005 { fadst16, fadst16 } // ADST_ADST = 3
1006 };
1007
vp9_fht16x16_c(const int16_t * input,tran_low_t * output,int stride,int tx_type)1008 void vp9_fht16x16_c(const int16_t *input, tran_low_t *output,
1009 int stride, int tx_type) {
1010 if (tx_type == DCT_DCT) {
1011 vp9_fdct16x16_c(input, output, stride);
1012 } else {
1013 tran_low_t out[256];
1014 tran_low_t *outptr = &out[0];
1015 int i, j;
1016 tran_low_t temp_in[16], temp_out[16];
1017 const transform_2d ht = FHT_16[tx_type];
1018
1019 // Columns
1020 for (i = 0; i < 16; ++i) {
1021 for (j = 0; j < 16; ++j)
1022 temp_in[j] = input[j * stride + i] * 4;
1023 ht.cols(temp_in, temp_out);
1024 for (j = 0; j < 16; ++j)
1025 outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
1026 }
1027
1028 // Rows
1029 for (i = 0; i < 16; ++i) {
1030 for (j = 0; j < 16; ++j)
1031 temp_in[j] = out[j + i * 16];
1032 ht.rows(temp_in, temp_out);
1033 for (j = 0; j < 16; ++j)
1034 output[j + i * 16] = temp_out[j];
1035 }
1036 }
1037 }
1038
dct_32_round(tran_high_t input)1039 static INLINE tran_high_t dct_32_round(tran_high_t input) {
1040 tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
1041 // TODO(debargha, peter.derivaz): Find new bounds for this assert,
1042 // and make the bounds consts.
1043 // assert(-131072 <= rv && rv <= 131071);
1044 return rv;
1045 }
1046
half_round_shift(tran_high_t input)1047 static INLINE tran_high_t half_round_shift(tran_high_t input) {
1048 tran_high_t rv = (input + 1 + (input < 0)) >> 2;
1049 return rv;
1050 }
1051
fdct32(const tran_high_t * input,tran_high_t * output,int round)1052 static void fdct32(const tran_high_t *input, tran_high_t *output, int round) {
1053 tran_high_t step[32];
1054 // Stage 1
1055 step[0] = input[0] + input[(32 - 1)];
1056 step[1] = input[1] + input[(32 - 2)];
1057 step[2] = input[2] + input[(32 - 3)];
1058 step[3] = input[3] + input[(32 - 4)];
1059 step[4] = input[4] + input[(32 - 5)];
1060 step[5] = input[5] + input[(32 - 6)];
1061 step[6] = input[6] + input[(32 - 7)];
1062 step[7] = input[7] + input[(32 - 8)];
1063 step[8] = input[8] + input[(32 - 9)];
1064 step[9] = input[9] + input[(32 - 10)];
1065 step[10] = input[10] + input[(32 - 11)];
1066 step[11] = input[11] + input[(32 - 12)];
1067 step[12] = input[12] + input[(32 - 13)];
1068 step[13] = input[13] + input[(32 - 14)];
1069 step[14] = input[14] + input[(32 - 15)];
1070 step[15] = input[15] + input[(32 - 16)];
1071 step[16] = -input[16] + input[(32 - 17)];
1072 step[17] = -input[17] + input[(32 - 18)];
1073 step[18] = -input[18] + input[(32 - 19)];
1074 step[19] = -input[19] + input[(32 - 20)];
1075 step[20] = -input[20] + input[(32 - 21)];
1076 step[21] = -input[21] + input[(32 - 22)];
1077 step[22] = -input[22] + input[(32 - 23)];
1078 step[23] = -input[23] + input[(32 - 24)];
1079 step[24] = -input[24] + input[(32 - 25)];
1080 step[25] = -input[25] + input[(32 - 26)];
1081 step[26] = -input[26] + input[(32 - 27)];
1082 step[27] = -input[27] + input[(32 - 28)];
1083 step[28] = -input[28] + input[(32 - 29)];
1084 step[29] = -input[29] + input[(32 - 30)];
1085 step[30] = -input[30] + input[(32 - 31)];
1086 step[31] = -input[31] + input[(32 - 32)];
1087
1088 // Stage 2
1089 output[0] = step[0] + step[16 - 1];
1090 output[1] = step[1] + step[16 - 2];
1091 output[2] = step[2] + step[16 - 3];
1092 output[3] = step[3] + step[16 - 4];
1093 output[4] = step[4] + step[16 - 5];
1094 output[5] = step[5] + step[16 - 6];
1095 output[6] = step[6] + step[16 - 7];
1096 output[7] = step[7] + step[16 - 8];
1097 output[8] = -step[8] + step[16 - 9];
1098 output[9] = -step[9] + step[16 - 10];
1099 output[10] = -step[10] + step[16 - 11];
1100 output[11] = -step[11] + step[16 - 12];
1101 output[12] = -step[12] + step[16 - 13];
1102 output[13] = -step[13] + step[16 - 14];
1103 output[14] = -step[14] + step[16 - 15];
1104 output[15] = -step[15] + step[16 - 16];
1105
1106 output[16] = step[16];
1107 output[17] = step[17];
1108 output[18] = step[18];
1109 output[19] = step[19];
1110
1111 output[20] = dct_32_round((-step[20] + step[27]) * cospi_16_64);
1112 output[21] = dct_32_round((-step[21] + step[26]) * cospi_16_64);
1113 output[22] = dct_32_round((-step[22] + step[25]) * cospi_16_64);
1114 output[23] = dct_32_round((-step[23] + step[24]) * cospi_16_64);
1115
1116 output[24] = dct_32_round((step[24] + step[23]) * cospi_16_64);
1117 output[25] = dct_32_round((step[25] + step[22]) * cospi_16_64);
1118 output[26] = dct_32_round((step[26] + step[21]) * cospi_16_64);
1119 output[27] = dct_32_round((step[27] + step[20]) * cospi_16_64);
1120
1121 output[28] = step[28];
1122 output[29] = step[29];
1123 output[30] = step[30];
1124 output[31] = step[31];
1125
1126 // dump the magnitude by 4, hence the intermediate values are within
1127 // the range of 16 bits.
1128 if (round) {
1129 output[0] = half_round_shift(output[0]);
1130 output[1] = half_round_shift(output[1]);
1131 output[2] = half_round_shift(output[2]);
1132 output[3] = half_round_shift(output[3]);
1133 output[4] = half_round_shift(output[4]);
1134 output[5] = half_round_shift(output[5]);
1135 output[6] = half_round_shift(output[6]);
1136 output[7] = half_round_shift(output[7]);
1137 output[8] = half_round_shift(output[8]);
1138 output[9] = half_round_shift(output[9]);
1139 output[10] = half_round_shift(output[10]);
1140 output[11] = half_round_shift(output[11]);
1141 output[12] = half_round_shift(output[12]);
1142 output[13] = half_round_shift(output[13]);
1143 output[14] = half_round_shift(output[14]);
1144 output[15] = half_round_shift(output[15]);
1145
1146 output[16] = half_round_shift(output[16]);
1147 output[17] = half_round_shift(output[17]);
1148 output[18] = half_round_shift(output[18]);
1149 output[19] = half_round_shift(output[19]);
1150 output[20] = half_round_shift(output[20]);
1151 output[21] = half_round_shift(output[21]);
1152 output[22] = half_round_shift(output[22]);
1153 output[23] = half_round_shift(output[23]);
1154 output[24] = half_round_shift(output[24]);
1155 output[25] = half_round_shift(output[25]);
1156 output[26] = half_round_shift(output[26]);
1157 output[27] = half_round_shift(output[27]);
1158 output[28] = half_round_shift(output[28]);
1159 output[29] = half_round_shift(output[29]);
1160 output[30] = half_round_shift(output[30]);
1161 output[31] = half_round_shift(output[31]);
1162 }
1163
1164 // Stage 3
1165 step[0] = output[0] + output[(8 - 1)];
1166 step[1] = output[1] + output[(8 - 2)];
1167 step[2] = output[2] + output[(8 - 3)];
1168 step[3] = output[3] + output[(8 - 4)];
1169 step[4] = -output[4] + output[(8 - 5)];
1170 step[5] = -output[5] + output[(8 - 6)];
1171 step[6] = -output[6] + output[(8 - 7)];
1172 step[7] = -output[7] + output[(8 - 8)];
1173 step[8] = output[8];
1174 step[9] = output[9];
1175 step[10] = dct_32_round((-output[10] + output[13]) * cospi_16_64);
1176 step[11] = dct_32_round((-output[11] + output[12]) * cospi_16_64);
1177 step[12] = dct_32_round((output[12] + output[11]) * cospi_16_64);
1178 step[13] = dct_32_round((output[13] + output[10]) * cospi_16_64);
1179 step[14] = output[14];
1180 step[15] = output[15];
1181
1182 step[16] = output[16] + output[23];
1183 step[17] = output[17] + output[22];
1184 step[18] = output[18] + output[21];
1185 step[19] = output[19] + output[20];
1186 step[20] = -output[20] + output[19];
1187 step[21] = -output[21] + output[18];
1188 step[22] = -output[22] + output[17];
1189 step[23] = -output[23] + output[16];
1190 step[24] = -output[24] + output[31];
1191 step[25] = -output[25] + output[30];
1192 step[26] = -output[26] + output[29];
1193 step[27] = -output[27] + output[28];
1194 step[28] = output[28] + output[27];
1195 step[29] = output[29] + output[26];
1196 step[30] = output[30] + output[25];
1197 step[31] = output[31] + output[24];
1198
1199 // Stage 4
1200 output[0] = step[0] + step[3];
1201 output[1] = step[1] + step[2];
1202 output[2] = -step[2] + step[1];
1203 output[3] = -step[3] + step[0];
1204 output[4] = step[4];
1205 output[5] = dct_32_round((-step[5] + step[6]) * cospi_16_64);
1206 output[6] = dct_32_round((step[6] + step[5]) * cospi_16_64);
1207 output[7] = step[7];
1208 output[8] = step[8] + step[11];
1209 output[9] = step[9] + step[10];
1210 output[10] = -step[10] + step[9];
1211 output[11] = -step[11] + step[8];
1212 output[12] = -step[12] + step[15];
1213 output[13] = -step[13] + step[14];
1214 output[14] = step[14] + step[13];
1215 output[15] = step[15] + step[12];
1216
1217 output[16] = step[16];
1218 output[17] = step[17];
1219 output[18] = dct_32_round(step[18] * -cospi_8_64 + step[29] * cospi_24_64);
1220 output[19] = dct_32_round(step[19] * -cospi_8_64 + step[28] * cospi_24_64);
1221 output[20] = dct_32_round(step[20] * -cospi_24_64 + step[27] * -cospi_8_64);
1222 output[21] = dct_32_round(step[21] * -cospi_24_64 + step[26] * -cospi_8_64);
1223 output[22] = step[22];
1224 output[23] = step[23];
1225 output[24] = step[24];
1226 output[25] = step[25];
1227 output[26] = dct_32_round(step[26] * cospi_24_64 + step[21] * -cospi_8_64);
1228 output[27] = dct_32_round(step[27] * cospi_24_64 + step[20] * -cospi_8_64);
1229 output[28] = dct_32_round(step[28] * cospi_8_64 + step[19] * cospi_24_64);
1230 output[29] = dct_32_round(step[29] * cospi_8_64 + step[18] * cospi_24_64);
1231 output[30] = step[30];
1232 output[31] = step[31];
1233
1234 // Stage 5
1235 step[0] = dct_32_round((output[0] + output[1]) * cospi_16_64);
1236 step[1] = dct_32_round((-output[1] + output[0]) * cospi_16_64);
1237 step[2] = dct_32_round(output[2] * cospi_24_64 + output[3] * cospi_8_64);
1238 step[3] = dct_32_round(output[3] * cospi_24_64 - output[2] * cospi_8_64);
1239 step[4] = output[4] + output[5];
1240 step[5] = -output[5] + output[4];
1241 step[6] = -output[6] + output[7];
1242 step[7] = output[7] + output[6];
1243 step[8] = output[8];
1244 step[9] = dct_32_round(output[9] * -cospi_8_64 + output[14] * cospi_24_64);
1245 step[10] = dct_32_round(output[10] * -cospi_24_64 + output[13] * -cospi_8_64);
1246 step[11] = output[11];
1247 step[12] = output[12];
1248 step[13] = dct_32_round(output[13] * cospi_24_64 + output[10] * -cospi_8_64);
1249 step[14] = dct_32_round(output[14] * cospi_8_64 + output[9] * cospi_24_64);
1250 step[15] = output[15];
1251
1252 step[16] = output[16] + output[19];
1253 step[17] = output[17] + output[18];
1254 step[18] = -output[18] + output[17];
1255 step[19] = -output[19] + output[16];
1256 step[20] = -output[20] + output[23];
1257 step[21] = -output[21] + output[22];
1258 step[22] = output[22] + output[21];
1259 step[23] = output[23] + output[20];
1260 step[24] = output[24] + output[27];
1261 step[25] = output[25] + output[26];
1262 step[26] = -output[26] + output[25];
1263 step[27] = -output[27] + output[24];
1264 step[28] = -output[28] + output[31];
1265 step[29] = -output[29] + output[30];
1266 step[30] = output[30] + output[29];
1267 step[31] = output[31] + output[28];
1268
1269 // Stage 6
1270 output[0] = step[0];
1271 output[1] = step[1];
1272 output[2] = step[2];
1273 output[3] = step[3];
1274 output[4] = dct_32_round(step[4] * cospi_28_64 + step[7] * cospi_4_64);
1275 output[5] = dct_32_round(step[5] * cospi_12_64 + step[6] * cospi_20_64);
1276 output[6] = dct_32_round(step[6] * cospi_12_64 + step[5] * -cospi_20_64);
1277 output[7] = dct_32_round(step[7] * cospi_28_64 + step[4] * -cospi_4_64);
1278 output[8] = step[8] + step[9];
1279 output[9] = -step[9] + step[8];
1280 output[10] = -step[10] + step[11];
1281 output[11] = step[11] + step[10];
1282 output[12] = step[12] + step[13];
1283 output[13] = -step[13] + step[12];
1284 output[14] = -step[14] + step[15];
1285 output[15] = step[15] + step[14];
1286
1287 output[16] = step[16];
1288 output[17] = dct_32_round(step[17] * -cospi_4_64 + step[30] * cospi_28_64);
1289 output[18] = dct_32_round(step[18] * -cospi_28_64 + step[29] * -cospi_4_64);
1290 output[19] = step[19];
1291 output[20] = step[20];
1292 output[21] = dct_32_round(step[21] * -cospi_20_64 + step[26] * cospi_12_64);
1293 output[22] = dct_32_round(step[22] * -cospi_12_64 + step[25] * -cospi_20_64);
1294 output[23] = step[23];
1295 output[24] = step[24];
1296 output[25] = dct_32_round(step[25] * cospi_12_64 + step[22] * -cospi_20_64);
1297 output[26] = dct_32_round(step[26] * cospi_20_64 + step[21] * cospi_12_64);
1298 output[27] = step[27];
1299 output[28] = step[28];
1300 output[29] = dct_32_round(step[29] * cospi_28_64 + step[18] * -cospi_4_64);
1301 output[30] = dct_32_round(step[30] * cospi_4_64 + step[17] * cospi_28_64);
1302 output[31] = step[31];
1303
1304 // Stage 7
1305 step[0] = output[0];
1306 step[1] = output[1];
1307 step[2] = output[2];
1308 step[3] = output[3];
1309 step[4] = output[4];
1310 step[5] = output[5];
1311 step[6] = output[6];
1312 step[7] = output[7];
1313 step[8] = dct_32_round(output[8] * cospi_30_64 + output[15] * cospi_2_64);
1314 step[9] = dct_32_round(output[9] * cospi_14_64 + output[14] * cospi_18_64);
1315 step[10] = dct_32_round(output[10] * cospi_22_64 + output[13] * cospi_10_64);
1316 step[11] = dct_32_round(output[11] * cospi_6_64 + output[12] * cospi_26_64);
1317 step[12] = dct_32_round(output[12] * cospi_6_64 + output[11] * -cospi_26_64);
1318 step[13] = dct_32_round(output[13] * cospi_22_64 + output[10] * -cospi_10_64);
1319 step[14] = dct_32_round(output[14] * cospi_14_64 + output[9] * -cospi_18_64);
1320 step[15] = dct_32_round(output[15] * cospi_30_64 + output[8] * -cospi_2_64);
1321
1322 step[16] = output[16] + output[17];
1323 step[17] = -output[17] + output[16];
1324 step[18] = -output[18] + output[19];
1325 step[19] = output[19] + output[18];
1326 step[20] = output[20] + output[21];
1327 step[21] = -output[21] + output[20];
1328 step[22] = -output[22] + output[23];
1329 step[23] = output[23] + output[22];
1330 step[24] = output[24] + output[25];
1331 step[25] = -output[25] + output[24];
1332 step[26] = -output[26] + output[27];
1333 step[27] = output[27] + output[26];
1334 step[28] = output[28] + output[29];
1335 step[29] = -output[29] + output[28];
1336 step[30] = -output[30] + output[31];
1337 step[31] = output[31] + output[30];
1338
1339 // Final stage --- outputs indices are bit-reversed.
1340 output[0] = step[0];
1341 output[16] = step[1];
1342 output[8] = step[2];
1343 output[24] = step[3];
1344 output[4] = step[4];
1345 output[20] = step[5];
1346 output[12] = step[6];
1347 output[28] = step[7];
1348 output[2] = step[8];
1349 output[18] = step[9];
1350 output[10] = step[10];
1351 output[26] = step[11];
1352 output[6] = step[12];
1353 output[22] = step[13];
1354 output[14] = step[14];
1355 output[30] = step[15];
1356
1357 output[1] = dct_32_round(step[16] * cospi_31_64 + step[31] * cospi_1_64);
1358 output[17] = dct_32_round(step[17] * cospi_15_64 + step[30] * cospi_17_64);
1359 output[9] = dct_32_round(step[18] * cospi_23_64 + step[29] * cospi_9_64);
1360 output[25] = dct_32_round(step[19] * cospi_7_64 + step[28] * cospi_25_64);
1361 output[5] = dct_32_round(step[20] * cospi_27_64 + step[27] * cospi_5_64);
1362 output[21] = dct_32_round(step[21] * cospi_11_64 + step[26] * cospi_21_64);
1363 output[13] = dct_32_round(step[22] * cospi_19_64 + step[25] * cospi_13_64);
1364 output[29] = dct_32_round(step[23] * cospi_3_64 + step[24] * cospi_29_64);
1365 output[3] = dct_32_round(step[24] * cospi_3_64 + step[23] * -cospi_29_64);
1366 output[19] = dct_32_round(step[25] * cospi_19_64 + step[22] * -cospi_13_64);
1367 output[11] = dct_32_round(step[26] * cospi_11_64 + step[21] * -cospi_21_64);
1368 output[27] = dct_32_round(step[27] * cospi_27_64 + step[20] * -cospi_5_64);
1369 output[7] = dct_32_round(step[28] * cospi_7_64 + step[19] * -cospi_25_64);
1370 output[23] = dct_32_round(step[29] * cospi_23_64 + step[18] * -cospi_9_64);
1371 output[15] = dct_32_round(step[30] * cospi_15_64 + step[17] * -cospi_17_64);
1372 output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64);
1373 }
1374
vp9_fdct32x32_1_c(const int16_t * input,tran_low_t * output,int stride)1375 void vp9_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride) {
1376 int r, c;
1377 tran_low_t sum = 0;
1378 for (r = 0; r < 32; ++r)
1379 for (c = 0; c < 32; ++c)
1380 sum += input[r * stride + c];
1381
1382 output[0] = sum >> 3;
1383 output[1] = 0;
1384 }
1385
vp9_fdct32x32_c(const int16_t * input,tran_low_t * out,int stride)1386 void vp9_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {
1387 int i, j;
1388 tran_high_t output[32 * 32];
1389
1390 // Columns
1391 for (i = 0; i < 32; ++i) {
1392 tran_high_t temp_in[32], temp_out[32];
1393 for (j = 0; j < 32; ++j)
1394 temp_in[j] = input[j * stride + i] * 4;
1395 fdct32(temp_in, temp_out, 0);
1396 for (j = 0; j < 32; ++j)
1397 output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
1398 }
1399
1400 // Rows
1401 for (i = 0; i < 32; ++i) {
1402 tran_high_t temp_in[32], temp_out[32];
1403 for (j = 0; j < 32; ++j)
1404 temp_in[j] = output[j + i * 32];
1405 fdct32(temp_in, temp_out, 0);
1406 for (j = 0; j < 32; ++j)
1407 out[j + i * 32] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
1408 }
1409 }
1410
1411 // Note that although we use dct_32_round in dct32 computation flow,
1412 // this 2d fdct32x32 for rate-distortion optimization loop is operating
1413 // within 16 bits precision.
vp9_fdct32x32_rd_c(const int16_t * input,tran_low_t * out,int stride)1414 void vp9_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, int stride) {
1415 int i, j;
1416 tran_high_t output[32 * 32];
1417
1418 // Columns
1419 for (i = 0; i < 32; ++i) {
1420 tran_high_t temp_in[32], temp_out[32];
1421 for (j = 0; j < 32; ++j)
1422 temp_in[j] = input[j * stride + i] * 4;
1423 fdct32(temp_in, temp_out, 0);
1424 for (j = 0; j < 32; ++j)
1425 // TODO(cd): see quality impact of only doing
1426 // output[j * 32 + i] = (temp_out[j] + 1) >> 2;
1427 // PS: also change code in vp9/encoder/x86/vp9_dct_sse2.c
1428 output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
1429 }
1430
1431 // Rows
1432 for (i = 0; i < 32; ++i) {
1433 tran_high_t temp_in[32], temp_out[32];
1434 for (j = 0; j < 32; ++j)
1435 temp_in[j] = output[j + i * 32];
1436 fdct32(temp_in, temp_out, 1);
1437 for (j = 0; j < 32; ++j)
1438 out[j + i * 32] = temp_out[j];
1439 }
1440 }
1441
1442 #if CONFIG_VP9_HIGHBITDEPTH
vp9_high_fdct4x4_c(const int16_t * input,tran_low_t * output,int stride)1443 void vp9_high_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) {
1444 vp9_fdct4x4_c(input, output, stride);
1445 }
1446
vp9_high_fht4x4_c(const int16_t * input,tran_low_t * output,int stride,int tx_type)1447 void vp9_high_fht4x4_c(const int16_t *input, tran_low_t *output,
1448 int stride, int tx_type) {
1449 vp9_fht4x4_c(input, output, stride, tx_type);
1450 }
1451
vp9_high_fdct8x8_1_c(const int16_t * input,tran_low_t * final_output,int stride)1452 void vp9_high_fdct8x8_1_c(const int16_t *input, tran_low_t *final_output,
1453 int stride) {
1454 vp9_fdct8x8_1_c(input, final_output, stride);
1455 }
1456
vp9_high_fdct8x8_c(const int16_t * input,tran_low_t * final_output,int stride)1457 void vp9_high_fdct8x8_c(const int16_t *input, tran_low_t *final_output,
1458 int stride) {
1459 vp9_fdct8x8_c(input, final_output, stride);
1460 }
1461
vp9_high_fdct16x16_1_c(const int16_t * input,tran_low_t * output,int stride)1462 void vp9_high_fdct16x16_1_c(const int16_t *input, tran_low_t *output,
1463 int stride) {
1464 vp9_fdct16x16_1_c(input, output, stride);
1465 }
1466
vp9_high_fdct16x16_c(const int16_t * input,tran_low_t * output,int stride)1467 void vp9_high_fdct16x16_c(const int16_t *input, tran_low_t *output,
1468 int stride) {
1469 vp9_fdct16x16_c(input, output, stride);
1470 }
1471
vp9_high_fht8x8_c(const int16_t * input,tran_low_t * output,int stride,int tx_type)1472 void vp9_high_fht8x8_c(const int16_t *input, tran_low_t *output,
1473 int stride, int tx_type) {
1474 vp9_fht8x8_c(input, output, stride, tx_type);
1475 }
1476
vp9_high_fwht4x4_c(const int16_t * input,tran_low_t * output,int stride)1477 void vp9_high_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride) {
1478 vp9_fwht4x4_c(input, output, stride);
1479 }
1480
vp9_high_fht16x16_c(const int16_t * input,tran_low_t * output,int stride,int tx_type)1481 void vp9_high_fht16x16_c(const int16_t *input, tran_low_t *output,
1482 int stride, int tx_type) {
1483 vp9_fht16x16_c(input, output, stride, tx_type);
1484 }
1485
vp9_high_fdct32x32_1_c(const int16_t * input,tran_low_t * out,int stride)1486 void vp9_high_fdct32x32_1_c(const int16_t *input, tran_low_t *out, int stride) {
1487 vp9_fdct32x32_1_c(input, out, stride);
1488 }
1489
vp9_high_fdct32x32_c(const int16_t * input,tran_low_t * out,int stride)1490 void vp9_high_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {
1491 vp9_fdct32x32_c(input, out, stride);
1492 }
1493
vp9_high_fdct32x32_rd_c(const int16_t * input,tran_low_t * out,int stride)1494 void vp9_high_fdct32x32_rd_c(const int16_t *input, tran_low_t *out,
1495 int stride) {
1496 vp9_fdct32x32_rd_c(input, out, stride);
1497 }
1498 #endif // CONFIG_VP9_HIGHBITDEPTH
1499