1 /*
2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include <assert.h>
12 #include <math.h>
13
14 #include "./vp9_rtcd.h"
15 #include "./vpx_config.h"
16 #include "./vpx_dsp_rtcd.h"
17
18 #include "vp9/common/vp9_blockd.h"
19 #include "vp9/common/vp9_idct.h"
20 #include "vpx_dsp/fwd_txfm.h"
21 #include "vpx_ports/mem.h"
22
// 4-point forward DCT: reads input[0..3], writes output[0..3].
// Every input element is read before the first output element is stored.
static void fdct4(const tran_low_t *input, tran_low_t *output) {
  // Fold the samples about the centre: the sums drive the even outputs,
  // the differences drive the odd outputs.
  const tran_high_t outer_sum = input[0] + input[3];
  const tran_high_t inner_sum = input[1] + input[2];
  const tran_high_t inner_diff = input[1] - input[2];
  const tran_high_t outer_diff = input[0] - input[3];

  // Even outputs share the single cospi_16_64 weight.
  const tran_high_t even0 = (outer_sum + inner_sum) * cospi_16_64;
  const tran_high_t even2 = (outer_sum - inner_sum) * cospi_16_64;

  // Odd outputs are a rotation of the two differences.
  const tran_high_t odd1 = inner_diff * cospi_24_64 + outer_diff * cospi_8_64;
  const tran_high_t odd3 = outer_diff * cospi_24_64 - inner_diff * cospi_8_64;

  output[0] = (tran_low_t)fdct_round_shift(even0);
  output[2] = (tran_low_t)fdct_round_shift(even2);
  output[1] = (tran_low_t)fdct_round_shift(odd1);
  output[3] = (tran_low_t)fdct_round_shift(odd3);
}
41
// 8-point forward DCT: reads input[0..7], writes output[0..7].
// Even-indexed outputs come from the mirrored sums (a 4-point DCT), the
// odd-indexed outputs from the mirrored differences.
static void fdct8(const tran_low_t *input, tran_low_t *output) {
  // Mirror-fold the eight samples; all reads happen before any store.
  const tran_high_t sum0 = input[0] + input[7];
  const tran_high_t sum1 = input[1] + input[6];
  const tran_high_t sum2 = input[2] + input[5];
  const tran_high_t sum3 = input[3] + input[4];
  const tran_high_t dif3 = input[3] - input[4];
  const tran_high_t dif2 = input[2] - input[5];
  const tran_high_t dif1 = input[1] - input[6];
  const tran_high_t dif0 = input[0] - input[7];
  tran_high_t a0, a1, a2, a3;  // butterfly operands
  tran_high_t mid_lo, mid_hi;  // rounded middle-pair rotation

  // Even half: 4-point DCT of the sums.
  a0 = sum0 + sum3;
  a1 = sum1 + sum2;
  a2 = sum1 - sum2;
  a3 = sum0 - sum3;
  output[0] = (tran_low_t)fdct_round_shift((a0 + a1) * cospi_16_64);
  output[2] =
      (tran_low_t)fdct_round_shift(a2 * cospi_24_64 + a3 * cospi_8_64);
  output[4] = (tran_low_t)fdct_round_shift((a0 - a1) * cospi_16_64);
  output[6] =
      (tran_low_t)fdct_round_shift(a3 * cospi_24_64 - a2 * cospi_8_64);

  // Odd half. The rounded middle terms are narrowed to tran_low_t before
  // being reused, which may alter the value when tran_low_t is narrower
  // than tran_high_t.
  mid_lo = (tran_low_t)fdct_round_shift((dif1 - dif2) * cospi_16_64);
  mid_hi = (tran_low_t)fdct_round_shift((dif1 + dif2) * cospi_16_64);

  a0 = dif3 + mid_lo;
  a1 = dif3 - mid_lo;
  a2 = dif0 - mid_hi;
  a3 = dif0 + mid_hi;

  output[1] =
      (tran_low_t)fdct_round_shift(a0 * cospi_28_64 + a3 * cospi_4_64);
  output[3] =
      (tran_low_t)fdct_round_shift(a2 * cospi_12_64 - a1 * cospi_20_64);
  output[5] =
      (tran_low_t)fdct_round_shift(a1 * cospi_12_64 + a2 * cospi_20_64);
  output[7] =
      (tran_low_t)fdct_round_shift(a3 * cospi_28_64 - a0 * cospi_4_64);
}
93
// 16-point forward DCT: reads in[0..15], writes out[0..15].
//
// The sixteen inputs are mirror-folded into eight sums (input[]) and eight
// differences (step1[]). The sums go through an inlined 8-point DCT that
// produces the even-indexed outputs; the differences go through steps 2-6
// and produce the odd-indexed outputs. Every element of in[] is read before
// the first element of out[] is stored.
static void fdct16(const tran_low_t in[16], tran_low_t out[16]) {
  tran_high_t step1[8];      // canbe16
  tran_high_t step2[8];      // canbe16
  tran_high_t step3[8];      // canbe16
  tran_high_t input[8];      // canbe16
  tran_high_t temp1, temp2;  // needs32

  // step 1
  // Sums of mirrored pairs: operands of the even half.
  input[0] = in[0] + in[15];
  input[1] = in[1] + in[14];
  input[2] = in[2] + in[13];
  input[3] = in[3] + in[12];
  input[4] = in[4] + in[11];
  input[5] = in[5] + in[10];
  input[6] = in[6] + in[9];
  input[7] = in[7] + in[8];

  // Differences of mirrored pairs, stored innermost pair first: operands of
  // the odd half.
  step1[0] = in[7] - in[8];
  step1[1] = in[6] - in[9];
  step1[2] = in[5] - in[10];
  step1[3] = in[4] - in[11];
  step1[4] = in[3] - in[12];
  step1[5] = in[2] - in[13];
  step1[6] = in[1] - in[14];
  step1[7] = in[0] - in[15];

  // fdct8(step, step);
  // Inlined 8-point DCT of input[]; its k-th result lands in out[2 * k].
  // Unlike the standalone fdct8, the stage 2 results here are not cast to
  // tran_low_t before being reused.
  {
    tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16
    tran_high_t t0, t1, t2, t3;                  // needs32
    tran_high_t x0, x1, x2, x3;                  // canbe16

    // stage 1
    s0 = input[0] + input[7];
    s1 = input[1] + input[6];
    s2 = input[2] + input[5];
    s3 = input[3] + input[4];
    s4 = input[3] - input[4];
    s5 = input[2] - input[5];
    s6 = input[1] - input[6];
    s7 = input[0] - input[7];

    // fdct4(step, step);
    x0 = s0 + s3;
    x1 = s1 + s2;
    x2 = s1 - s2;
    x3 = s0 - s3;
    t0 = (x0 + x1) * cospi_16_64;
    t1 = (x0 - x1) * cospi_16_64;
    t2 = x3 * cospi_8_64 + x2 * cospi_24_64;
    t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
    out[0] = (tran_low_t)fdct_round_shift(t0);
    out[4] = (tran_low_t)fdct_round_shift(t2);
    out[8] = (tran_low_t)fdct_round_shift(t1);
    out[12] = (tran_low_t)fdct_round_shift(t3);

    // Stage 2
    t0 = (s6 - s5) * cospi_16_64;
    t1 = (s6 + s5) * cospi_16_64;
    t2 = fdct_round_shift(t0);
    t3 = fdct_round_shift(t1);

    // Stage 3
    x0 = s4 + t2;
    x1 = s4 - t2;
    x2 = s7 - t3;
    x3 = s7 + t3;

    // Stage 4
    t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
    t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
    t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
    t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
    out[2] = (tran_low_t)fdct_round_shift(t0);
    out[6] = (tran_low_t)fdct_round_shift(t2);
    out[10] = (tran_low_t)fdct_round_shift(t1);
    out[14] = (tran_low_t)fdct_round_shift(t3);
  }

  // step 2
  // Only the four middle differences (step1[2..5]) are rotated here;
  // step1[0], [1], [6] and [7] pass through to step 3 untouched.
  temp1 = (step1[5] - step1[2]) * cospi_16_64;
  temp2 = (step1[4] - step1[3]) * cospi_16_64;
  step2[2] = fdct_round_shift(temp1);
  step2[3] = fdct_round_shift(temp2);
  temp1 = (step1[4] + step1[3]) * cospi_16_64;
  temp2 = (step1[5] + step1[2]) * cospi_16_64;
  step2[4] = fdct_round_shift(temp1);
  step2[5] = fdct_round_shift(temp2);

  // step 3
  step3[0] = step1[0] + step2[3];
  step3[1] = step1[1] + step2[2];
  step3[2] = step1[1] - step2[2];
  step3[3] = step1[0] - step2[3];
  step3[4] = step1[7] - step2[4];
  step3[5] = step1[6] - step2[5];
  step3[6] = step1[6] + step2[5];
  step3[7] = step1[7] + step2[4];

  // step 4
  // step2[] is reused as scratch from here on: slots 1, 2, 5 and 6 are
  // overwritten with rotations of step3[1], [2], [5] and [6]. The step 2
  // values have already been consumed by step 3.
  temp1 = step3[1] * -cospi_8_64 + step3[6] * cospi_24_64;
  temp2 = step3[2] * cospi_24_64 + step3[5] * cospi_8_64;
  step2[1] = fdct_round_shift(temp1);
  step2[2] = fdct_round_shift(temp2);
  temp1 = step3[2] * cospi_8_64 - step3[5] * cospi_24_64;
  temp2 = step3[1] * cospi_24_64 + step3[6] * cospi_8_64;
  step2[5] = fdct_round_shift(temp1);
  step2[6] = fdct_round_shift(temp2);

  // step 5
  // step1[] is reused as well; its original differences are no longer
  // needed.
  step1[0] = step3[0] + step2[1];
  step1[1] = step3[0] - step2[1];
  step1[2] = step3[3] + step2[2];
  step1[3] = step3[3] - step2[2];
  step1[4] = step3[4] - step2[5];
  step1[5] = step3[4] + step2[5];
  step1[6] = step3[7] - step2[6];
  step1[7] = step3[7] + step2[6];

  // step 6
  // Final rotations: each pair (step1[k], step1[7 - k]) yields two of the
  // odd-indexed outputs.
  temp1 = step1[0] * cospi_30_64 + step1[7] * cospi_2_64;
  temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
  out[1] = (tran_low_t)fdct_round_shift(temp1);
  out[9] = (tran_low_t)fdct_round_shift(temp2);

  temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
  temp2 = step1[3] * cospi_6_64 + step1[4] * cospi_26_64;
  out[5] = (tran_low_t)fdct_round_shift(temp1);
  out[13] = (tran_low_t)fdct_round_shift(temp2);

  temp1 = step1[3] * -cospi_26_64 + step1[4] * cospi_6_64;
  temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
  out[3] = (tran_low_t)fdct_round_shift(temp1);
  out[11] = (tran_low_t)fdct_round_shift(temp2);

  temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
  temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64;
  out[7] = (tran_low_t)fdct_round_shift(temp1);
  out[15] = (tran_low_t)fdct_round_shift(temp2);
}
234
// 4-point forward ADST: reads input[0..3], writes output[0..3].
// An all-zero input short-circuits to an all-zero output.
static void fadst4(const tran_low_t *input, tran_low_t *output) {
  const tran_high_t in0 = input[0];
  const tran_high_t in1 = input[1];
  const tran_high_t in2 = input[2];
  const tran_high_t in3 = input[3];
  tran_high_t acc_lo, acc_hi, centre, cross;

  // OR-ing the samples is zero only when every sample is zero.
  if ((in0 | in1 | in2 | in3) == 0) {
    output[0] = 0;
    output[1] = 0;
    output[2] = 0;
    output[3] = 0;
    return;
  }

  // Weighted combinations of samples 0, 1 and 3.
  acc_lo = sinpi_1_9 * in0 + sinpi_2_9 * in1 + sinpi_4_9 * in3;
  acc_hi = sinpi_4_9 * in0 - sinpi_1_9 * in1 + sinpi_2_9 * in3;
  // Sample 2 contributes through a single weight.
  centre = sinpi_3_9 * in2;
  // Output 1 depends only on the signed sum of samples 0, 1 and 3.
  cross = sinpi_3_9 * (in0 + in1 - in3);

  // 1-D transform scaling factor is sqrt(2).
  output[0] = (tran_low_t)fdct_round_shift(acc_lo + centre);
  output[1] = (tran_low_t)fdct_round_shift(cross);
  output[2] = (tran_low_t)fdct_round_shift(acc_hi - centre);
  output[3] = (tran_low_t)fdct_round_shift(acc_hi - acc_lo + centre);
}
274
// 8-point forward ADST: reads input[0..7], writes output[0..7].
// The samples are loaded in a permuted order, pushed through three
// rotate-and-butterfly stages, then stored with a fixed permutation in
// which the odd-indexed outputs are negated.
static void fadst8(const tran_low_t *input, tran_low_t *output) {
  tran_high_t val[8];  // working values between stages
  tran_high_t rot[8];  // unrounded rotation products of the current stage
  int k;

  // Permuted load; all of input[] is consumed before output[] is touched.
  val[0] = input[7];
  val[1] = input[0];
  val[2] = input[5];
  val[3] = input[2];
  val[4] = input[3];
  val[5] = input[4];
  val[6] = input[1];
  val[7] = input[6];

  // stage 1: rotate each adjacent pair, then butterfly lane k with k + 4.
  rot[0] = cospi_2_64 * val[0] + cospi_30_64 * val[1];
  rot[1] = cospi_30_64 * val[0] - cospi_2_64 * val[1];
  rot[2] = cospi_10_64 * val[2] + cospi_22_64 * val[3];
  rot[3] = cospi_22_64 * val[2] - cospi_10_64 * val[3];
  rot[4] = cospi_18_64 * val[4] + cospi_14_64 * val[5];
  rot[5] = cospi_14_64 * val[4] - cospi_18_64 * val[5];
  rot[6] = cospi_26_64 * val[6] + cospi_6_64 * val[7];
  rot[7] = cospi_6_64 * val[6] - cospi_26_64 * val[7];

  for (k = 0; k < 4; ++k) {
    val[k] = fdct_round_shift(rot[k] + rot[k + 4]);
    val[k + 4] = fdct_round_shift(rot[k] - rot[k + 4]);
  }

  // stage 2: lanes 0-3 pass through unscaled, lanes 4-7 are rotated.
  rot[4] = cospi_8_64 * val[4] + cospi_24_64 * val[5];
  rot[5] = cospi_24_64 * val[4] - cospi_8_64 * val[5];
  rot[6] = -cospi_24_64 * val[6] + cospi_8_64 * val[7];
  rot[7] = cospi_8_64 * val[6] + cospi_24_64 * val[7];
  for (k = 0; k < 4; ++k) rot[k] = val[k];

  for (k = 0; k < 2; ++k) {
    // The pass-through lanes were already rounded in stage 1.
    val[k] = rot[k] + rot[k + 2];
    val[k + 2] = rot[k] - rot[k + 2];
    val[k + 4] = fdct_round_shift(rot[k + 4] + rot[k + 6]);
    val[k + 6] = fdct_round_shift(rot[k + 4] - rot[k + 6]);
  }

  // stage 3: final cospi_16_64 butterflies on lanes (2, 3) and (6, 7).
  rot[2] = cospi_16_64 * (val[2] + val[3]);
  rot[3] = cospi_16_64 * (val[2] - val[3]);
  rot[6] = cospi_16_64 * (val[6] + val[7]);
  rot[7] = cospi_16_64 * (val[6] - val[7]);

  val[2] = fdct_round_shift(rot[2]);
  val[3] = fdct_round_shift(rot[3]);
  val[6] = fdct_round_shift(rot[6]);
  val[7] = fdct_round_shift(rot[7]);

  output[0] = (tran_low_t)val[0];
  output[1] = (tran_low_t)-val[4];
  output[2] = (tran_low_t)val[6];
  output[3] = (tran_low_t)-val[2];
  output[4] = (tran_low_t)val[3];
  output[5] = (tran_low_t)-val[7];
  output[6] = (tran_low_t)val[5];
  output[7] = (tran_low_t)-val[1];
}
345
// 16-point forward ADST: reads input[0..15], writes output[0..15].
//
// The samples are loaded in a permuted order into x0..x15, pushed through
// four stages (each stage computes rotation products into s* and then
// butterflies them back into x*), and finally stored with a fixed
// permutation in which outputs 1, 3, 13 and 15 are negated. All of input[]
// is read before the first element of output[] is stored.
static void fadst16(const tran_low_t *input, tran_low_t *output) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
  tran_high_t s9, s10, s11, s12, s13, s14, s15;

  // Even-numbered x* take samples from the end of the input backwards,
  // odd-numbered x* take samples from the start forwards.
  tran_high_t x0 = input[15];
  tran_high_t x1 = input[0];
  tran_high_t x2 = input[13];
  tran_high_t x3 = input[2];
  tran_high_t x4 = input[11];
  tran_high_t x5 = input[4];
  tran_high_t x6 = input[9];
  tran_high_t x7 = input[6];
  tran_high_t x8 = input[7];
  tran_high_t x9 = input[8];
  tran_high_t x10 = input[5];
  tran_high_t x11 = input[10];
  tran_high_t x12 = input[3];
  tran_high_t x13 = input[12];
  tran_high_t x14 = input[1];
  tran_high_t x15 = input[14];

  // stage 1
  // Rotate each adjacent pair (x[2k], x[2k+1]).
  s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
  s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
  s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
  s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
  s15 = x14 * cospi_3_64 - x15 * cospi_29_64;

  // Butterfly lane k with lane k + 8, rounding every result.
  x0 = fdct_round_shift(s0 + s8);
  x1 = fdct_round_shift(s1 + s9);
  x2 = fdct_round_shift(s2 + s10);
  x3 = fdct_round_shift(s3 + s11);
  x4 = fdct_round_shift(s4 + s12);
  x5 = fdct_round_shift(s5 + s13);
  x6 = fdct_round_shift(s6 + s14);
  x7 = fdct_round_shift(s7 + s15);
  x8 = fdct_round_shift(s0 - s8);
  x9 = fdct_round_shift(s1 - s9);
  x10 = fdct_round_shift(s2 - s10);
  x11 = fdct_round_shift(s3 - s11);
  x12 = fdct_round_shift(s4 - s12);
  x13 = fdct_round_shift(s5 - s13);
  x14 = fdct_round_shift(s6 - s14);
  x15 = fdct_round_shift(s7 - s15);

  // stage 2
  // Lanes 0-7 pass through unscaled; lanes 8-15 are rotated.
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4;
  s5 = x5;
  s6 = x6;
  s7 = x7;
  s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
  s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
  s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
  s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
  s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
  s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
  s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
  s15 = x14 * cospi_20_64 + x15 * cospi_12_64;

  // Butterfly lane k with lane k + 4 inside each half. Only the rotated
  // half needs rounding; the pass-through half was rounded in stage 1.
  x0 = s0 + s4;
  x1 = s1 + s5;
  x2 = s2 + s6;
  x3 = s3 + s7;
  x4 = s0 - s4;
  x5 = s1 - s5;
  x6 = s2 - s6;
  x7 = s3 - s7;
  x8 = fdct_round_shift(s8 + s12);
  x9 = fdct_round_shift(s9 + s13);
  x10 = fdct_round_shift(s10 + s14);
  x11 = fdct_round_shift(s11 + s15);
  x12 = fdct_round_shift(s8 - s12);
  x13 = fdct_round_shift(s9 - s13);
  x14 = fdct_round_shift(s10 - s14);
  x15 = fdct_round_shift(s11 - s15);

  // stage 3
  // Lanes 0-3 and 8-11 pass through; lanes 4-7 and 12-15 are rotated.
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
  s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
  s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
  s8 = x8;
  s9 = x9;
  s10 = x10;
  s11 = x11;
  s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
  s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
  s15 = x14 * cospi_8_64 + x15 * cospi_24_64;

  // Butterfly lane k with lane k + 2 inside each group of four.
  x0 = s0 + s2;
  x1 = s1 + s3;
  x2 = s0 - s2;
  x3 = s1 - s3;
  x4 = fdct_round_shift(s4 + s6);
  x5 = fdct_round_shift(s5 + s7);
  x6 = fdct_round_shift(s4 - s6);
  x7 = fdct_round_shift(s5 - s7);
  x8 = s8 + s10;
  x9 = s9 + s11;
  x10 = s8 - s10;
  x11 = s9 - s11;
  x12 = fdct_round_shift(s12 + s14);
  x13 = fdct_round_shift(s13 + s15);
  x14 = fdct_round_shift(s12 - s14);
  x15 = fdct_round_shift(s13 - s15);

  // stage 4
  // cospi_16_64 butterflies on lanes (2,3), (6,7), (10,11) and (14,15).
  // The sign pattern is not uniform across the four pairs: s2 and s14 use
  // the negated weight, while s7 and s11 negate their first operand.
  s2 = (-cospi_16_64) * (x2 + x3);
  s3 = cospi_16_64 * (x2 - x3);
  s6 = cospi_16_64 * (x6 + x7);
  s7 = cospi_16_64 * (-x6 + x7);
  s10 = cospi_16_64 * (x10 + x11);
  s11 = cospi_16_64 * (-x10 + x11);
  s14 = (-cospi_16_64) * (x14 + x15);
  s15 = cospi_16_64 * (x14 - x15);

  x2 = fdct_round_shift(s2);
  x3 = fdct_round_shift(s3);
  x6 = fdct_round_shift(s6);
  x7 = fdct_round_shift(s7);
  x10 = fdct_round_shift(s10);
  x11 = fdct_round_shift(s11);
  x14 = fdct_round_shift(s14);
  x15 = fdct_round_shift(s15);

  // Permuted store; only outputs 1, 3, 13 and 15 are negated.
  output[0] = (tran_low_t)x0;
  output[1] = (tran_low_t)-x8;
  output[2] = (tran_low_t)x12;
  output[3] = (tran_low_t)-x4;
  output[4] = (tran_low_t)x6;
  output[5] = (tran_low_t)x14;
  output[6] = (tran_low_t)x10;
  output[7] = (tran_low_t)x2;
  output[8] = (tran_low_t)x3;
  output[9] = (tran_low_t)x11;
  output[10] = (tran_low_t)x15;
  output[11] = (tran_low_t)x7;
  output[12] = (tran_low_t)x5;
  output[13] = (tran_low_t)-x13;
  output[14] = (tran_low_t)x9;
  output[15] = (tran_low_t)-x1;
}
508
// 4-point 1-D transform pairs for the hybrid transform, indexed by tx_type
// (the value of each index is noted on its row). vp9_fht4x4_c calls the
// .cols member in its column pass and the .rows member in its row pass.
// NOTE(review): which initializer slot maps to .cols and which to .rows
// depends on the member order of transform_2d, which is declared outside
// this file -- confirm against that declaration.
static const transform_2d FHT_4[] = {
  { fdct4, fdct4 },   // DCT_DCT  = 0
  { fadst4, fdct4 },  // ADST_DCT = 1
  { fdct4, fadst4 },  // DCT_ADST = 2
  { fadst4, fadst4 }  // ADST_ADST = 3
};
515
// 8-point 1-D transform pairs for the hybrid transform, indexed by tx_type
// (the value of each index is noted on its row). vp9_fht8x8_c calls the
// .cols member in its column pass and the .rows member in its row pass.
// NOTE(review): which initializer slot maps to .cols and which to .rows
// depends on the member order of transform_2d, which is declared outside
// this file -- confirm against that declaration.
static const transform_2d FHT_8[] = {
  { fdct8, fdct8 },   // DCT_DCT  = 0
  { fadst8, fdct8 },  // ADST_DCT = 1
  { fdct8, fadst8 },  // DCT_ADST = 2
  { fadst8, fadst8 }  // ADST_ADST = 3
};
522
// 16-point 1-D transform pairs for the hybrid transform, indexed by tx_type
// (the value of each index is noted on its row). vp9_fht16x16_c calls the
// .cols member in its column pass and the .rows member in its row pass.
// NOTE(review): which initializer slot maps to .cols and which to .rows
// depends on the member order of transform_2d, which is declared outside
// this file -- confirm against that declaration.
static const transform_2d FHT_16[] = {
  { fdct16, fdct16 },   // DCT_DCT  = 0
  { fadst16, fdct16 },  // ADST_DCT = 1
  { fdct16, fadst16 },  // DCT_ADST = 2
  { fadst16, fadst16 }  // ADST_ADST = 3
};
529
// 4x4 forward hybrid transform.
//   input   - 4x4 block of samples, `stride` elements between rows
//   output  - receives 16 coefficients, row-major
//   tx_type - selects the 1-D transform pair from FHT_4; DCT_DCT is
//             forwarded to vpx_fdct4x4_c instead
void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, int stride,
                  int tx_type) {
  tran_low_t intermediate[4 * 4];
  tran_low_t line_in[4], line_out[4];
  const transform_2d *xform;
  int r, c;

  if (tx_type == DCT_DCT) {
    vpx_fdct4x4_c(input, output, stride);
    return;
  }
  xform = &FHT_4[tx_type];

  // Column pass: samples are scaled by 16 on the way in.
  for (c = 0; c < 4; ++c) {
    for (r = 0; r < 4; ++r) line_in[r] = input[r * stride + c] * 16;
    // The top-left sample alone is nudged by one when it is non-zero.
    if (c == 0 && line_in[0] != 0) line_in[0] += 1;
    xform->cols(line_in, line_out);
    for (r = 0; r < 4; ++r) intermediate[r * 4 + c] = line_out[r];
  }

  // Row pass: each row of the intermediate block is contiguous, so it is
  // handed to the transform directly. Results are stored as (x + 1) >> 2.
  for (r = 0; r < 4; ++r) {
    xform->rows(&intermediate[r * 4], line_out);
    for (c = 0; c < 4; ++c) output[r * 4 + c] = (line_out[c] + 1) >> 2;
  }
}
556
// 8x8 forward DCT fused with quantization.
//   input, stride - 8x8 block of samples, `stride` elements between rows
//   coeff_ptr     - receives the 64 transform coefficients, row-major
//   n_coeffs      - number of entries cleared in qcoeff_ptr/dqcoeff_ptr and
//                   number of scan positions visited
//   skip_block    - when non-zero, quantization is skipped and the
//                   quantized outputs stay zero
//   round_ptr, quant_ptr, dequant_ptr
//                 - two-entry tables: entry 0 is used for coefficient
//                   position 0, entry 1 for every other position
//   qcoeff_ptr    - receives the quantized coefficients
//   dqcoeff_ptr   - receives the dequantized coefficients
//   eob_ptr       - receives 1 + the last scan index that quantized to a
//                   non-zero value, or 0 if there was none
//   scan          - visiting order (scan index -> coefficient position)
//   iscan         - unused
void vp9_fdct8x8_quant_c(const int16_t *input, int stride,
                         tran_low_t *coeff_ptr, intptr_t n_coeffs,
                         int skip_block, const int16_t *round_ptr,
                         const int16_t *quant_ptr, tran_low_t *qcoeff_ptr,
                         tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
                         uint16_t *eob_ptr, const int16_t *scan,
                         const int16_t *iscan) {
  tran_low_t col_pass[64];
  int last_nonzero = -1;
  int r, c, idx;

  (void)iscan;

  // Column pass: an 8-point DCT down each column with the samples scaled
  // by 4. Written out inline; the rounded middle terms are reused without
  // narrowing to tran_low_t.
  for (c = 0; c < 8; ++c) {
    const int16_t *const src = input + c;
    tran_low_t *const dst = col_pass + c;
    const tran_high_t sum0 = (src[0 * stride] + src[7 * stride]) * 4;
    const tran_high_t sum1 = (src[1 * stride] + src[6 * stride]) * 4;
    const tran_high_t sum2 = (src[2 * stride] + src[5 * stride]) * 4;
    const tran_high_t sum3 = (src[3 * stride] + src[4 * stride]) * 4;
    const tran_high_t dif3 = (src[3 * stride] - src[4 * stride]) * 4;
    const tran_high_t dif2 = (src[2 * stride] - src[5 * stride]) * 4;
    const tran_high_t dif1 = (src[1 * stride] - src[6 * stride]) * 4;
    const tran_high_t dif0 = (src[0 * stride] - src[7 * stride]) * 4;
    tran_high_t a0, a1, a2, a3;  // butterfly operands
    tran_high_t mid_lo, mid_hi;  // rounded middle-pair rotation

    // Even half: 4-point DCT of the sums.
    a0 = sum0 + sum3;
    a1 = sum1 + sum2;
    a2 = sum1 - sum2;
    a3 = sum0 - sum3;
    dst[0 * 8] = (tran_low_t)fdct_round_shift((a0 + a1) * cospi_16_64);
    dst[2 * 8] =
        (tran_low_t)fdct_round_shift(a2 * cospi_24_64 + a3 * cospi_8_64);
    dst[4 * 8] = (tran_low_t)fdct_round_shift((a0 - a1) * cospi_16_64);
    dst[6 * 8] =
        (tran_low_t)fdct_round_shift(a3 * cospi_24_64 - a2 * cospi_8_64);

    // Odd half.
    mid_lo = fdct_round_shift((dif1 - dif2) * cospi_16_64);
    mid_hi = fdct_round_shift((dif1 + dif2) * cospi_16_64);

    a0 = dif3 + mid_lo;
    a1 = dif3 - mid_lo;
    a2 = dif0 - mid_hi;
    a3 = dif0 + mid_hi;

    dst[1 * 8] =
        (tran_low_t)fdct_round_shift(a0 * cospi_28_64 + a3 * cospi_4_64);
    dst[3 * 8] =
        (tran_low_t)fdct_round_shift(a2 * cospi_12_64 - a1 * cospi_20_64);
    dst[5 * 8] =
        (tran_low_t)fdct_round_shift(a1 * cospi_12_64 + a2 * cospi_20_64);
    dst[7 * 8] =
        (tran_low_t)fdct_round_shift(a3 * cospi_28_64 - a0 * cospi_4_64);
  }

  // Row pass: 8-point DCT along each row, then halve every coefficient.
  for (r = 0; r < 8; ++r) {
    tran_low_t *const row_out = coeff_ptr + r * 8;
    fdct8(col_pass + r * 8, row_out);
    for (c = 0; c < 8; ++c) row_out[c] /= 2;
  }

  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));

  if (!skip_block) {
    // Quantize in scan order, remembering the last scan index that
    // produced a non-zero level.
    for (idx = 0; idx < n_coeffs; idx++) {
      const int pos = scan[idx];
      const int is_ac = (pos != 0);
      const int value = coeff_ptr[pos];
      // All ones for a negative value, zero otherwise; xor-then-subtract
      // with this mask yields the magnitude and later restores the sign.
      const int sign_mask = (value >> 31);
      const int magnitude = (value ^ sign_mask) - sign_mask;
      int level = clamp(magnitude + round_ptr[is_ac], INT16_MIN, INT16_MAX);

      level = (level * quant_ptr[is_ac]) >> 16;

      qcoeff_ptr[pos] = (level ^ sign_mask) - sign_mask;
      dqcoeff_ptr[pos] = qcoeff_ptr[pos] * dequant_ptr[is_ac];

      if (level) last_nonzero = idx;
    }
  }
  *eob_ptr = last_nonzero + 1;
}
659
// 8x8 forward hybrid transform.
//   input   - 8x8 block of samples, `stride` elements between rows
//   output  - receives 64 coefficients, row-major
//   tx_type - selects the 1-D transform pair from FHT_8; DCT_DCT is
//             forwarded to vpx_fdct8x8_c instead
void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, int stride,
                  int tx_type) {
  tran_low_t intermediate[64];
  tran_low_t line_in[8], line_out[8];
  const transform_2d *xform;
  int r, c;

  if (tx_type == DCT_DCT) {
    vpx_fdct8x8_c(input, output, stride);
    return;
  }
  xform = &FHT_8[tx_type];

  // Column pass: samples are scaled by 4 on the way in.
  for (c = 0; c < 8; ++c) {
    for (r = 0; r < 8; ++r) line_in[r] = input[r * stride + c] * 4;
    xform->cols(line_in, line_out);
    for (r = 0; r < 8; ++r) intermediate[r * 8 + c] = line_out[r];
  }

  // Row pass: rows of the intermediate block are contiguous, so each one
  // is handed to the transform directly. Results are halved, with negative
  // values bumped by one before the shift.
  for (r = 0; r < 8; ++r) {
    xform->rows(&intermediate[r * 8], line_out);
    for (c = 0; c < 8; ++c) {
      const tran_low_t v = line_out[c];
      output[r * 8 + c] = (v + (v < 0)) >> 1;
    }
  }
}
686
687 /* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per
688 pixel. */
vp9_fwht4x4_c(const int16_t * input,tran_low_t * output,int stride)689 void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride) {
690 int i;
691 tran_high_t a1, b1, c1, d1, e1;
692 const int16_t *ip_pass0 = input;
693 const tran_low_t *ip = NULL;
694 tran_low_t *op = output;
695
696 for (i = 0; i < 4; i++) {
697 a1 = ip_pass0[0 * stride];
698 b1 = ip_pass0[1 * stride];
699 c1 = ip_pass0[2 * stride];
700 d1 = ip_pass0[3 * stride];
701
702 a1 += b1;
703 d1 = d1 - c1;
704 e1 = (a1 - d1) >> 1;
705 b1 = e1 - b1;
706 c1 = e1 - c1;
707 a1 -= c1;
708 d1 += b1;
709 op[0] = (tran_low_t)a1;
710 op[4] = (tran_low_t)c1;
711 op[8] = (tran_low_t)d1;
712 op[12] = (tran_low_t)b1;
713
714 ip_pass0++;
715 op++;
716 }
717 ip = output;
718 op = output;
719
720 for (i = 0; i < 4; i++) {
721 a1 = ip[0];
722 b1 = ip[1];
723 c1 = ip[2];
724 d1 = ip[3];
725
726 a1 += b1;
727 d1 -= c1;
728 e1 = (a1 - d1) >> 1;
729 b1 = e1 - b1;
730 c1 = e1 - c1;
731 a1 -= c1;
732 d1 += b1;
733 op[0] = (tran_low_t)(a1 * UNIT_QUANT_FACTOR);
734 op[1] = (tran_low_t)(c1 * UNIT_QUANT_FACTOR);
735 op[2] = (tran_low_t)(d1 * UNIT_QUANT_FACTOR);
736 op[3] = (tran_low_t)(b1 * UNIT_QUANT_FACTOR);
737
738 ip += 4;
739 op += 4;
740 }
741 }
742
// 16x16 forward hybrid transform.
//   input   - 16x16 block of samples, `stride` elements between rows
//   output  - receives 256 coefficients, row-major
//   tx_type - selects the 1-D transform pair from FHT_16; DCT_DCT is
//             forwarded to vpx_fdct16x16_c instead
void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride,
                    int tx_type) {
  tran_low_t intermediate[256];
  tran_low_t line_in[16], line_out[16];
  const transform_2d *xform;
  int r, c;

  if (tx_type == DCT_DCT) {
    vpx_fdct16x16_c(input, output, stride);
    return;
  }
  xform = &FHT_16[tx_type];

  // Column pass: samples are scaled by 4 on the way in; results are
  // reduced by a rounded shift of 2, with negative values bumped by one
  // more before the shift.
  for (c = 0; c < 16; ++c) {
    for (r = 0; r < 16; ++r) line_in[r] = input[r * stride + c] * 4;
    xform->cols(line_in, line_out);
    for (r = 0; r < 16; ++r) {
      const tran_low_t v = line_out[r];
      intermediate[r * 16 + c] = (v + 1 + (v < 0)) >> 2;
    }
  }

  // Row pass: no further scaling, so each contiguous intermediate row is
  // transformed straight into the matching output row.
  for (r = 0; r < 16; ++r) {
    xform->rows(&intermediate[r * 16], &output[r * 16]);
  }
}
769
770 #if CONFIG_VP9_HIGHBITDEPTH
// Entry point compiled only when CONFIG_VP9_HIGHBITDEPTH is set; forwards
// all arguments unchanged to vp9_fht4x4_c.
void vp9_highbd_fht4x4_c(const int16_t *input, tran_low_t *output, int stride,
                         int tx_type) {
  vp9_fht4x4_c(input, output, stride, tx_type);
}
775
// Entry point compiled only when CONFIG_VP9_HIGHBITDEPTH is set; forwards
// all arguments unchanged to vp9_fht8x8_c.
void vp9_highbd_fht8x8_c(const int16_t *input, tran_low_t *output, int stride,
                         int tx_type) {
  vp9_fht8x8_c(input, output, stride, tx_type);
}
780
// Entry point compiled only when CONFIG_VP9_HIGHBITDEPTH is set; forwards
// all arguments unchanged to vp9_fwht4x4_c.
void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output,
                          int stride) {
  vp9_fwht4x4_c(input, output, stride);
}
785
// Entry point compiled only when CONFIG_VP9_HIGHBITDEPTH is set; forwards
// all arguments unchanged to vp9_fht16x16_c.
void vp9_highbd_fht16x16_c(const int16_t *input, tran_low_t *output, int stride,
                           int tx_type) {
  vp9_fht16x16_c(input, output, stride, tx_type);
}
790 #endif // CONFIG_VP9_HIGHBITDEPTH
791