/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
11
12 #include <stdlib.h>
13 #include "av1/encoder/av1_fwd_txfm1d.h"
14 #include "av1/common/av1_txfm.h"
15
/*
 * 4-point forward transform kernel (a DCT, going by the function name).
 *
 * input       - 4 source coefficients (read only).
 * output      - 4 result coefficients; also used as scratch between stages.
 * cos_bit     - selects the cosine table through cospi_arr() and is forwarded
 *               to every half_btf() call.
 * stage_range - per-stage limits handed to av1_range_check_buf(); entries
 *               0..3 are read.
 *
 * NOTE: stage 1 writes output[] while it is still reading input[], so the two
 * buffers must not alias (input[0] and input[1] are re-read after output[0]
 * and output[1] have been stored).
 *
 * NOTE(review): cospi_arr(), half_btf() and av1_range_check_buf() are declared
 * outside this block; their exact semantics are not visible here.
 */
void av1_fdct4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
                   const int8_t *stage_range) {
  const int32_t size = 4;
  const int32_t *cospi;

  int32_t stage = 0;
  int32_t *bf0, *bf1;
  int32_t step[4];  // ping-pong buffer paired with `output`

  // stage 0: range-check the raw input.
  av1_range_check_buf(stage, input, input, size, stage_range[stage]);

  // stage 1: mirrored sums (first half) and differences (second half).
  stage++;
  bf1 = output;
  bf1[0] = input[0] + input[3];
  bf1[1] = input[1] + input[2];
  bf1[2] = -input[2] + input[1];
  bf1[3] = -input[3] + input[0];
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 2: weighted pair combinations, output -> step.
  stage++;
  cospi = cospi_arr(cos_bit);
  bf0 = output;
  bf1 = step;
  bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
  bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit);
  bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit);
  bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit);
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 3: reorder step -> output (swaps entries 1 and 2).
  stage++;
  bf0 = step;
  bf1 = output;
  bf1[0] = bf0[0];
  bf1[1] = bf0[2];
  bf1[2] = bf0[1];
  bf1[3] = bf0[3];
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
}
58
/*
 * 8-point forward transform kernel (a DCT, going by the function name).
 *
 * input       - 8 source coefficients (read only).
 * output      - 8 result coefficients; also used as scratch between stages.
 * cos_bit     - selects the cosine table through cospi_arr() and is forwarded
 *               to every half_btf() call.
 * stage_range - per-stage limits handed to av1_range_check_buf(); entries
 *               0..5 are read.
 *
 * The stages alternate between `output` and the local `step` buffer.
 *
 * NOTE: stage 1 writes output[] while it is still reading input[], so the two
 * buffers must not alias.
 *
 * NOTE(review): cospi_arr(), half_btf() and av1_range_check_buf() are declared
 * outside this block; their exact semantics are not visible here.
 */
void av1_fdct8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
                   const int8_t *stage_range) {
  const int32_t size = 8;
  const int32_t *cospi;

  int32_t stage = 0;
  int32_t *bf0, *bf1;
  int32_t step[8];  // ping-pong buffer paired with `output`

  // stage 0: range-check the raw input.
  av1_range_check_buf(stage, input, input, size, stage_range[stage]);

  // stage 1: mirrored sums (0..3) and differences (4..7) of the input.
  stage++;
  bf1 = output;
  bf1[0] = input[0] + input[7];
  bf1[1] = input[1] + input[6];
  bf1[2] = input[2] + input[5];
  bf1[3] = input[3] + input[4];
  bf1[4] = -input[4] + input[3];
  bf1[5] = -input[5] + input[2];
  bf1[6] = -input[6] + input[1];
  bf1[7] = -input[7] + input[0];
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 2: output -> step.
  stage++;
  cospi = cospi_arr(cos_bit);
  bf0 = output;
  bf1 = step;
  bf1[0] = bf0[0] + bf0[3];
  bf1[1] = bf0[1] + bf0[2];
  bf1[2] = -bf0[2] + bf0[1];
  bf1[3] = -bf0[3] + bf0[0];
  bf1[4] = bf0[4];
  bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
  bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit);
  bf1[7] = bf0[7];
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 3: step -> output.
  stage++;
  cospi = cospi_arr(cos_bit);
  bf0 = step;
  bf1 = output;
  bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
  bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit);
  bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit);
  bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit);
  bf1[4] = bf0[4] + bf0[5];
  bf1[5] = -bf0[5] + bf0[4];
  bf1[6] = -bf0[6] + bf0[7];
  bf1[7] = bf0[7] + bf0[6];
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 4: output -> step; only entries 4..7 are transformed.
  stage++;
  cospi = cospi_arr(cos_bit);
  bf0 = output;
  bf1 = step;
  bf1[0] = bf0[0];
  bf1[1] = bf0[1];
  bf1[2] = bf0[2];
  bf1[3] = bf0[3];
  bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit);
  bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit);
  bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit);
  bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit);
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 5: final reordering, step -> output.
  stage++;
  bf0 = step;
  bf1 = output;
  bf1[0] = bf0[0];
  bf1[1] = bf0[4];
  bf1[2] = bf0[2];
  bf1[3] = bf0[6];
  bf1[4] = bf0[1];
  bf1[5] = bf0[5];
  bf1[6] = bf0[3];
  bf1[7] = bf0[7];
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
}
143
/*
 * 16-point forward transform kernel (a DCT, going by the function name).
 *
 * input       - 16 source coefficients (read only).
 * output      - 16 result coefficients; also used as scratch between stages.
 * cos_bit     - selects the cosine table through cospi_arr() and is forwarded
 *               to every half_btf() call.
 * stage_range - per-stage limits handed to av1_range_check_buf(); entries
 *               0..7 are read.
 *
 * The stages alternate between `output` and the local `step` buffer.
 *
 * NOTE: stage 1 writes output[] while it is still reading input[], so the two
 * buffers must not alias.
 *
 * NOTE(review): cospi_arr(), half_btf() and av1_range_check_buf() are declared
 * outside this block; their exact semantics are not visible here.
 */
void av1_fdct16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
                    const int8_t *stage_range) {
  const int32_t size = 16;
  const int32_t *cospi;

  int32_t stage = 0;
  int32_t *bf0, *bf1;
  int32_t step[16];  // ping-pong buffer paired with `output`

  // stage 0: range-check the raw input.
  av1_range_check_buf(stage, input, input, size, stage_range[stage]);

  // stage 1: mirrored sums (0..7) and differences (8..15) of the input.
  stage++;
  bf1 = output;
  bf1[0] = input[0] + input[15];
  bf1[1] = input[1] + input[14];
  bf1[2] = input[2] + input[13];
  bf1[3] = input[3] + input[12];
  bf1[4] = input[4] + input[11];
  bf1[5] = input[5] + input[10];
  bf1[6] = input[6] + input[9];
  bf1[7] = input[7] + input[8];
  bf1[8] = -input[8] + input[7];
  bf1[9] = -input[9] + input[6];
  bf1[10] = -input[10] + input[5];
  bf1[11] = -input[11] + input[4];
  bf1[12] = -input[12] + input[3];
  bf1[13] = -input[13] + input[2];
  bf1[14] = -input[14] + input[1];
  bf1[15] = -input[15] + input[0];
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 2: output -> step.
  stage++;
  cospi = cospi_arr(cos_bit);
  bf0 = output;
  bf1 = step;
  bf1[0] = bf0[0] + bf0[7];
  bf1[1] = bf0[1] + bf0[6];
  bf1[2] = bf0[2] + bf0[5];
  bf1[3] = bf0[3] + bf0[4];
  bf1[4] = -bf0[4] + bf0[3];
  bf1[5] = -bf0[5] + bf0[2];
  bf1[6] = -bf0[6] + bf0[1];
  bf1[7] = -bf0[7] + bf0[0];
  bf1[8] = bf0[8];
  bf1[9] = bf0[9];
  bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
  bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
  bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit);
  bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit);
  bf1[14] = bf0[14];
  bf1[15] = bf0[15];
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 3: step -> output.
  stage++;
  cospi = cospi_arr(cos_bit);
  bf0 = step;
  bf1 = output;
  bf1[0] = bf0[0] + bf0[3];
  bf1[1] = bf0[1] + bf0[2];
  bf1[2] = -bf0[2] + bf0[1];
  bf1[3] = -bf0[3] + bf0[0];
  bf1[4] = bf0[4];
  bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
  bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit);
  bf1[7] = bf0[7];
  bf1[8] = bf0[8] + bf0[11];
  bf1[9] = bf0[9] + bf0[10];
  bf1[10] = -bf0[10] + bf0[9];
  bf1[11] = -bf0[11] + bf0[8];
  bf1[12] = -bf0[12] + bf0[15];
  bf1[13] = -bf0[13] + bf0[14];
  bf1[14] = bf0[14] + bf0[13];
  bf1[15] = bf0[15] + bf0[12];
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 4: output -> step.
  stage++;
  cospi = cospi_arr(cos_bit);
  bf0 = output;
  bf1 = step;
  bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
  bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit);
  bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit);
  bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit);
  bf1[4] = bf0[4] + bf0[5];
  bf1[5] = -bf0[5] + bf0[4];
  bf1[6] = -bf0[6] + bf0[7];
  bf1[7] = bf0[7] + bf0[6];
  bf1[8] = bf0[8];
  bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit);
  bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit);
  bf1[11] = bf0[11];
  bf1[12] = bf0[12];
  bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit);
  bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit);
  bf1[15] = bf0[15];
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 5: step -> output; entries 0..3 are already final values.
  stage++;
  cospi = cospi_arr(cos_bit);
  bf0 = step;
  bf1 = output;
  bf1[0] = bf0[0];
  bf1[1] = bf0[1];
  bf1[2] = bf0[2];
  bf1[3] = bf0[3];
  bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit);
  bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit);
  bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit);
  bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit);
  bf1[8] = bf0[8] + bf0[9];
  bf1[9] = -bf0[9] + bf0[8];
  bf1[10] = -bf0[10] + bf0[11];
  bf1[11] = bf0[11] + bf0[10];
  bf1[12] = bf0[12] + bf0[13];
  bf1[13] = -bf0[13] + bf0[12];
  bf1[14] = -bf0[14] + bf0[15];
  bf1[15] = bf0[15] + bf0[14];
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 6: output -> step; only entries 8..15 are transformed.
  stage++;
  cospi = cospi_arr(cos_bit);
  bf0 = output;
  bf1 = step;
  bf1[0] = bf0[0];
  bf1[1] = bf0[1];
  bf1[2] = bf0[2];
  bf1[3] = bf0[3];
  bf1[4] = bf0[4];
  bf1[5] = bf0[5];
  bf1[6] = bf0[6];
  bf1[7] = bf0[7];
  bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit);
  bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit);
  bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit);
  bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit);
  bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit);
  bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit);
  bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit);
  bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit);
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 7: final reordering, step -> output.
  stage++;
  bf0 = step;
  bf1 = output;
  bf1[0] = bf0[0];
  bf1[1] = bf0[8];
  bf1[2] = bf0[4];
  bf1[3] = bf0[12];
  bf1[4] = bf0[2];
  bf1[5] = bf0[10];
  bf1[6] = bf0[6];
  bf1[7] = bf0[14];
  bf1[8] = bf0[1];
  bf1[9] = bf0[9];
  bf1[10] = bf0[5];
  bf1[11] = bf0[13];
  bf1[12] = bf0[3];
  bf1[13] = bf0[11];
  bf1[14] = bf0[7];
  bf1[15] = bf0[15];
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
}
314
/*
 * 32-point forward transform kernel (a DCT, going by the function name).
 *
 * input       - 32 source coefficients (read only).
 * output      - 32 result coefficients; also used as scratch between stages.
 * cos_bit     - selects the cosine table through cospi_arr() and is forwarded
 *               to every half_btf() call.
 * stage_range - per-stage limits handed to av1_range_check_buf(); entries
 *               0..9 are read.
 *
 * The stages alternate between `output` and the local `step` buffer.
 *
 * NOTE: stage 1 writes output[] while it is still reading input[], so the two
 * buffers must not alias.
 *
 * NOTE(review): cospi_arr(), half_btf() and av1_range_check_buf() are declared
 * outside this block; their exact semantics are not visible here.
 */
void av1_fdct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
                    const int8_t *stage_range) {
  const int32_t size = 32;
  const int32_t *cospi;

  int32_t stage = 0;
  int32_t *bf0, *bf1;
  int32_t step[32];  // ping-pong buffer paired with `output`

  // stage 0: range-check the raw input.
  av1_range_check_buf(stage, input, input, size, stage_range[stage]);

  // stage 1: mirrored sums (0..15) and differences (16..31) of the input.
  stage++;
  bf1 = output;
  bf1[0] = input[0] + input[31];
  bf1[1] = input[1] + input[30];
  bf1[2] = input[2] + input[29];
  bf1[3] = input[3] + input[28];
  bf1[4] = input[4] + input[27];
  bf1[5] = input[5] + input[26];
  bf1[6] = input[6] + input[25];
  bf1[7] = input[7] + input[24];
  bf1[8] = input[8] + input[23];
  bf1[9] = input[9] + input[22];
  bf1[10] = input[10] + input[21];
  bf1[11] = input[11] + input[20];
  bf1[12] = input[12] + input[19];
  bf1[13] = input[13] + input[18];
  bf1[14] = input[14] + input[17];
  bf1[15] = input[15] + input[16];
  bf1[16] = -input[16] + input[15];
  bf1[17] = -input[17] + input[14];
  bf1[18] = -input[18] + input[13];
  bf1[19] = -input[19] + input[12];
  bf1[20] = -input[20] + input[11];
  bf1[21] = -input[21] + input[10];
  bf1[22] = -input[22] + input[9];
  bf1[23] = -input[23] + input[8];
  bf1[24] = -input[24] + input[7];
  bf1[25] = -input[25] + input[6];
  bf1[26] = -input[26] + input[5];
  bf1[27] = -input[27] + input[4];
  bf1[28] = -input[28] + input[3];
  bf1[29] = -input[29] + input[2];
  bf1[30] = -input[30] + input[1];
  bf1[31] = -input[31] + input[0];
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 2: output -> step.
  stage++;
  cospi = cospi_arr(cos_bit);
  bf0 = output;
  bf1 = step;
  bf1[0] = bf0[0] + bf0[15];
  bf1[1] = bf0[1] + bf0[14];
  bf1[2] = bf0[2] + bf0[13];
  bf1[3] = bf0[3] + bf0[12];
  bf1[4] = bf0[4] + bf0[11];
  bf1[5] = bf0[5] + bf0[10];
  bf1[6] = bf0[6] + bf0[9];
  bf1[7] = bf0[7] + bf0[8];
  bf1[8] = -bf0[8] + bf0[7];
  bf1[9] = -bf0[9] + bf0[6];
  bf1[10] = -bf0[10] + bf0[5];
  bf1[11] = -bf0[11] + bf0[4];
  bf1[12] = -bf0[12] + bf0[3];
  bf1[13] = -bf0[13] + bf0[2];
  bf1[14] = -bf0[14] + bf0[1];
  bf1[15] = -bf0[15] + bf0[0];
  bf1[16] = bf0[16];
  bf1[17] = bf0[17];
  bf1[18] = bf0[18];
  bf1[19] = bf0[19];
  bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit);
  bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit);
  bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit);
  bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit);
  bf1[24] = half_btf(cospi[32], bf0[24], cospi[32], bf0[23], cos_bit);
  bf1[25] = half_btf(cospi[32], bf0[25], cospi[32], bf0[22], cos_bit);
  bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[21], cos_bit);
  bf1[27] = half_btf(cospi[32], bf0[27], cospi[32], bf0[20], cos_bit);
  bf1[28] = bf0[28];
  bf1[29] = bf0[29];
  bf1[30] = bf0[30];
  bf1[31] = bf0[31];
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 3: step -> output.
  stage++;
  cospi = cospi_arr(cos_bit);
  bf0 = step;
  bf1 = output;
  bf1[0] = bf0[0] + bf0[7];
  bf1[1] = bf0[1] + bf0[6];
  bf1[2] = bf0[2] + bf0[5];
  bf1[3] = bf0[3] + bf0[4];
  bf1[4] = -bf0[4] + bf0[3];
  bf1[5] = -bf0[5] + bf0[2];
  bf1[6] = -bf0[6] + bf0[1];
  bf1[7] = -bf0[7] + bf0[0];
  bf1[8] = bf0[8];
  bf1[9] = bf0[9];
  bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
  bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
  bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit);
  bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit);
  bf1[14] = bf0[14];
  bf1[15] = bf0[15];
  bf1[16] = bf0[16] + bf0[23];
  bf1[17] = bf0[17] + bf0[22];
  bf1[18] = bf0[18] + bf0[21];
  bf1[19] = bf0[19] + bf0[20];
  bf1[20] = -bf0[20] + bf0[19];
  bf1[21] = -bf0[21] + bf0[18];
  bf1[22] = -bf0[22] + bf0[17];
  bf1[23] = -bf0[23] + bf0[16];
  bf1[24] = -bf0[24] + bf0[31];
  bf1[25] = -bf0[25] + bf0[30];
  bf1[26] = -bf0[26] + bf0[29];
  bf1[27] = -bf0[27] + bf0[28];
  bf1[28] = bf0[28] + bf0[27];
  bf1[29] = bf0[29] + bf0[26];
  bf1[30] = bf0[30] + bf0[25];
  bf1[31] = bf0[31] + bf0[24];
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 4: output -> step.
  stage++;
  cospi = cospi_arr(cos_bit);
  bf0 = output;
  bf1 = step;
  bf1[0] = bf0[0] + bf0[3];
  bf1[1] = bf0[1] + bf0[2];
  bf1[2] = -bf0[2] + bf0[1];
  bf1[3] = -bf0[3] + bf0[0];
  bf1[4] = bf0[4];
  bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
  bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit);
  bf1[7] = bf0[7];
  bf1[8] = bf0[8] + bf0[11];
  bf1[9] = bf0[9] + bf0[10];
  bf1[10] = -bf0[10] + bf0[9];
  bf1[11] = -bf0[11] + bf0[8];
  bf1[12] = -bf0[12] + bf0[15];
  bf1[13] = -bf0[13] + bf0[14];
  bf1[14] = bf0[14] + bf0[13];
  bf1[15] = bf0[15] + bf0[12];
  bf1[16] = bf0[16];
  bf1[17] = bf0[17];
  bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit);
  bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit);
  bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit);
  bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit);
  bf1[22] = bf0[22];
  bf1[23] = bf0[23];
  bf1[24] = bf0[24];
  bf1[25] = bf0[25];
  bf1[26] = half_btf(cospi[48], bf0[26], -cospi[16], bf0[21], cos_bit);
  bf1[27] = half_btf(cospi[48], bf0[27], -cospi[16], bf0[20], cos_bit);
  bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[19], cos_bit);
  bf1[29] = half_btf(cospi[16], bf0[29], cospi[48], bf0[18], cos_bit);
  bf1[30] = bf0[30];
  bf1[31] = bf0[31];
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 5: step -> output.
  stage++;
  cospi = cospi_arr(cos_bit);
  bf0 = step;
  bf1 = output;
  bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
  bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit);
  bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit);
  bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit);
  bf1[4] = bf0[4] + bf0[5];
  bf1[5] = -bf0[5] + bf0[4];
  bf1[6] = -bf0[6] + bf0[7];
  bf1[7] = bf0[7] + bf0[6];
  bf1[8] = bf0[8];
  bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit);
  bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit);
  bf1[11] = bf0[11];
  bf1[12] = bf0[12];
  bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit);
  bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit);
  bf1[15] = bf0[15];
  bf1[16] = bf0[16] + bf0[19];
  bf1[17] = bf0[17] + bf0[18];
  bf1[18] = -bf0[18] + bf0[17];
  bf1[19] = -bf0[19] + bf0[16];
  bf1[20] = -bf0[20] + bf0[23];
  bf1[21] = -bf0[21] + bf0[22];
  bf1[22] = bf0[22] + bf0[21];
  bf1[23] = bf0[23] + bf0[20];
  bf1[24] = bf0[24] + bf0[27];
  bf1[25] = bf0[25] + bf0[26];
  bf1[26] = -bf0[26] + bf0[25];
  bf1[27] = -bf0[27] + bf0[24];
  bf1[28] = -bf0[28] + bf0[31];
  bf1[29] = -bf0[29] + bf0[30];
  bf1[30] = bf0[30] + bf0[29];
  bf1[31] = bf0[31] + bf0[28];
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 6: output -> step; entries 0..3 are already final values.
  stage++;
  cospi = cospi_arr(cos_bit);
  bf0 = output;
  bf1 = step;
  bf1[0] = bf0[0];
  bf1[1] = bf0[1];
  bf1[2] = bf0[2];
  bf1[3] = bf0[3];
  bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit);
  bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit);
  bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit);
  bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit);
  bf1[8] = bf0[8] + bf0[9];
  bf1[9] = -bf0[9] + bf0[8];
  bf1[10] = -bf0[10] + bf0[11];
  bf1[11] = bf0[11] + bf0[10];
  bf1[12] = bf0[12] + bf0[13];
  bf1[13] = -bf0[13] + bf0[12];
  bf1[14] = -bf0[14] + bf0[15];
  bf1[15] = bf0[15] + bf0[14];
  bf1[16] = bf0[16];
  bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit);
  bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit);
  bf1[19] = bf0[19];
  bf1[20] = bf0[20];
  bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit);
  bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit);
  bf1[23] = bf0[23];
  bf1[24] = bf0[24];
  bf1[25] = half_btf(cospi[24], bf0[25], -cospi[40], bf0[22], cos_bit);
  bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[21], cos_bit);
  bf1[27] = bf0[27];
  bf1[28] = bf0[28];
  bf1[29] = half_btf(cospi[56], bf0[29], -cospi[8], bf0[18], cos_bit);
  bf1[30] = half_btf(cospi[8], bf0[30], cospi[56], bf0[17], cos_bit);
  bf1[31] = bf0[31];
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 7: step -> output; entries 0..7 are already final values.
  stage++;
  cospi = cospi_arr(cos_bit);
  bf0 = step;
  bf1 = output;
  bf1[0] = bf0[0];
  bf1[1] = bf0[1];
  bf1[2] = bf0[2];
  bf1[3] = bf0[3];
  bf1[4] = bf0[4];
  bf1[5] = bf0[5];
  bf1[6] = bf0[6];
  bf1[7] = bf0[7];
  bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit);
  bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit);
  bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit);
  bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit);
  bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit);
  bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit);
  bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit);
  bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit);
  bf1[16] = bf0[16] + bf0[17];
  bf1[17] = -bf0[17] + bf0[16];
  bf1[18] = -bf0[18] + bf0[19];
  bf1[19] = bf0[19] + bf0[18];
  bf1[20] = bf0[20] + bf0[21];
  bf1[21] = -bf0[21] + bf0[20];
  bf1[22] = -bf0[22] + bf0[23];
  bf1[23] = bf0[23] + bf0[22];
  bf1[24] = bf0[24] + bf0[25];
  bf1[25] = -bf0[25] + bf0[24];
  bf1[26] = -bf0[26] + bf0[27];
  bf1[27] = bf0[27] + bf0[26];
  bf1[28] = bf0[28] + bf0[29];
  bf1[29] = -bf0[29] + bf0[28];
  bf1[30] = -bf0[30] + bf0[31];
  bf1[31] = bf0[31] + bf0[30];
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 8: output -> step; only entries 16..31 are transformed.
  stage++;
  cospi = cospi_arr(cos_bit);
  bf0 = output;
  bf1 = step;
  bf1[0] = bf0[0];
  bf1[1] = bf0[1];
  bf1[2] = bf0[2];
  bf1[3] = bf0[3];
  bf1[4] = bf0[4];
  bf1[5] = bf0[5];
  bf1[6] = bf0[6];
  bf1[7] = bf0[7];
  bf1[8] = bf0[8];
  bf1[9] = bf0[9];
  bf1[10] = bf0[10];
  bf1[11] = bf0[11];
  bf1[12] = bf0[12];
  bf1[13] = bf0[13];
  bf1[14] = bf0[14];
  bf1[15] = bf0[15];
  bf1[16] = half_btf(cospi[62], bf0[16], cospi[2], bf0[31], cos_bit);
  bf1[17] = half_btf(cospi[30], bf0[17], cospi[34], bf0[30], cos_bit);
  bf1[18] = half_btf(cospi[46], bf0[18], cospi[18], bf0[29], cos_bit);
  bf1[19] = half_btf(cospi[14], bf0[19], cospi[50], bf0[28], cos_bit);
  bf1[20] = half_btf(cospi[54], bf0[20], cospi[10], bf0[27], cos_bit);
  bf1[21] = half_btf(cospi[22], bf0[21], cospi[42], bf0[26], cos_bit);
  bf1[22] = half_btf(cospi[38], bf0[22], cospi[26], bf0[25], cos_bit);
  bf1[23] = half_btf(cospi[6], bf0[23], cospi[58], bf0[24], cos_bit);
  bf1[24] = half_btf(cospi[6], bf0[24], -cospi[58], bf0[23], cos_bit);
  bf1[25] = half_btf(cospi[38], bf0[25], -cospi[26], bf0[22], cos_bit);
  bf1[26] = half_btf(cospi[22], bf0[26], -cospi[42], bf0[21], cos_bit);
  bf1[27] = half_btf(cospi[54], bf0[27], -cospi[10], bf0[20], cos_bit);
  bf1[28] = half_btf(cospi[14], bf0[28], -cospi[50], bf0[19], cos_bit);
  bf1[29] = half_btf(cospi[46], bf0[29], -cospi[18], bf0[18], cos_bit);
  bf1[30] = half_btf(cospi[30], bf0[30], -cospi[34], bf0[17], cos_bit);
  bf1[31] = half_btf(cospi[62], bf0[31], -cospi[2], bf0[16], cos_bit);
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 9: final reordering, step -> output.
  stage++;
  bf0 = step;
  bf1 = output;
  bf1[0] = bf0[0];
  bf1[1] = bf0[16];
  bf1[2] = bf0[8];
  bf1[3] = bf0[24];
  bf1[4] = bf0[4];
  bf1[5] = bf0[20];
  bf1[6] = bf0[12];
  bf1[7] = bf0[28];
  bf1[8] = bf0[2];
  bf1[9] = bf0[18];
  bf1[10] = bf0[10];
  bf1[11] = bf0[26];
  bf1[12] = bf0[6];
  bf1[13] = bf0[22];
  bf1[14] = bf0[14];
  bf1[15] = bf0[30];
  bf1[16] = bf0[1];
  bf1[17] = bf0[17];
  bf1[18] = bf0[9];
  bf1[19] = bf0[25];
  bf1[20] = bf0[5];
  bf1[21] = bf0[21];
  bf1[22] = bf0[13];
  bf1[23] = bf0[29];
  bf1[24] = bf0[3];
  bf1[25] = bf0[19];
  bf1[26] = bf0[11];
  bf1[27] = bf0[27];
  bf1[28] = bf0[7];
  bf1[29] = bf0[23];
  bf1[30] = bf0[15];
  bf1[31] = bf0[31];
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
}
675
/*
 * 4-point forward transform kernel (an ADST, going by the function name),
 * computed on scalars rather than through ping-pong buffers.
 *
 * input       - 4 source coefficients (read only); all four are copied into
 *               locals before output[] is written.
 * output      - 4 result coefficients.
 * cos_bit     - selects the sine table through sinpi_arr() and is the shift
 *               passed to round_shift() for the final results.
 * stage_range - per-stage limits; entries 0..6 are read. Intermediate values
 *               that have not been shifted down yet are checked against
 *               `bit + stage_range[i]`.
 *
 * If every input is zero the outputs are set to zero and the function returns
 * immediately, skipping the remaining range checks.
 *
 * NOTE(review): sinpi_arr(), range_check_value(), round_shift() and
 * av1_range_check_buf() are declared outside this block; their exact
 * semantics are not visible here.
 */
void av1_fadst4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
                    const int8_t *stage_range) {
  int bit = cos_bit;
  const int32_t *sinpi = sinpi_arr(bit);
  int32_t x0, x1, x2, x3;
  int32_t s0, s1, s2, s3, s4, s5, s6, s7;

  // stage 0
  av1_range_check_buf(0, input, input, 4, stage_range[0]);
  x0 = input[0];
  x1 = input[1];
  x2 = input[2];
  x3 = input[3];

  // Shortcut: OR-ing the inputs is zero only when all four are zero.
  if (!(x0 | x1 | x2 | x3)) {
    output[0] = output[1] = output[2] = output[3] = 0;
    return;
  }

  // stage 1: products with the sine table, plus the plain sum x0 + x1.
  s0 = range_check_value(sinpi[1] * x0, bit + stage_range[1]);
  s1 = range_check_value(sinpi[4] * x0, bit + stage_range[1]);
  s2 = range_check_value(sinpi[2] * x1, bit + stage_range[1]);
  s3 = range_check_value(sinpi[1] * x1, bit + stage_range[1]);
  s4 = range_check_value(sinpi[3] * x2, bit + stage_range[1]);
  s5 = range_check_value(sinpi[4] * x3, bit + stage_range[1]);
  s6 = range_check_value(sinpi[2] * x3, bit + stage_range[1]);
  s7 = range_check_value(x0 + x1, stage_range[1]);

  // stage 2: s7 becomes x0 + x1 - x3 (x3 still holds input[3] here).
  s7 = range_check_value(s7 - x3, stage_range[2]);

  // stage 3: x0..x3 are reused for intermediate sums from here on.
  x0 = range_check_value(s0 + s2, bit + stage_range[3]);
  x1 = range_check_value(sinpi[3] * s7, bit + stage_range[3]);
  x2 = range_check_value(s1 - s3, bit + stage_range[3]);
  x3 = range_check_value(s4, bit + stage_range[3]);

  // stage 4
  x0 = range_check_value(x0 + s5, bit + stage_range[4]);
  x2 = range_check_value(x2 + s6, bit + stage_range[4]);

  // stage 5
  s0 = range_check_value(x0 + x3, bit + stage_range[5]);
  s1 = range_check_value(x1, bit + stage_range[5]);
  s2 = range_check_value(x2 - x3, bit + stage_range[5]);
  s3 = range_check_value(x2 - x0, bit + stage_range[5]);

  // stage 6
  s3 = range_check_value(s3 + x3, bit + stage_range[6]);

  // 1-D transform scaling factor is sqrt(2).
  output[0] = round_shift(s0, bit);
  output[1] = round_shift(s1, bit);
  output[2] = round_shift(s2, bit);
  output[3] = round_shift(s3, bit);
  av1_range_check_buf(6, input, output, 4, stage_range[6]);
}
734
/*
 * 8-point forward transform kernel (an ADST, going by the function name).
 *
 * input       - 8 source coefficients (read only).
 * output      - 8 result coefficients; also used as scratch between stages.
 *               Must not be the same pointer as `input` (asserted).
 * cos_bit     - selects the cosine table through cospi_arr() and is forwarded
 *               to every half_btf() call.
 * stage_range - per-stage limits handed to av1_range_check_buf(); entries
 *               0..7 are read.
 *
 * The stages alternate between `output` and the local `step` buffer.
 *
 * NOTE(review): cospi_arr(), half_btf() and av1_range_check_buf() are declared
 * outside this block; their exact semantics are not visible here.
 */
void av1_fadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
                    const int8_t *stage_range) {
  const int32_t size = 8;
  const int32_t *cospi;

  int32_t stage = 0;
  int32_t *bf0, *bf1;
  int32_t step[8];  // ping-pong buffer paired with `output`

  // stage 0: range-check the raw input.
  av1_range_check_buf(stage, input, input, size, stage_range[stage]);

  // stage 1: signed permutation of the input into output. Writing in place
  // would overwrite entries that are read later, hence the assert.
  stage++;
  assert(output != input);
  bf1 = output;
  bf1[0] = input[0];
  bf1[1] = -input[7];
  bf1[2] = -input[3];
  bf1[3] = input[4];
  bf1[4] = -input[1];
  bf1[5] = input[6];
  bf1[6] = input[2];
  bf1[7] = -input[5];
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 2: output -> step; pairs (2,3) and (6,7) are combined.
  stage++;
  cospi = cospi_arr(cos_bit);
  bf0 = output;
  bf1 = step;
  bf1[0] = bf0[0];
  bf1[1] = bf0[1];
  bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit);
  bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit);
  bf1[4] = bf0[4];
  bf1[5] = bf0[5];
  bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit);
  bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit);
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 3: step -> output; sums and differences at distance 2.
  stage++;
  bf0 = step;
  bf1 = output;
  bf1[0] = bf0[0] + bf0[2];
  bf1[1] = bf0[1] + bf0[3];
  bf1[2] = bf0[0] - bf0[2];
  bf1[3] = bf0[1] - bf0[3];
  bf1[4] = bf0[4] + bf0[6];
  bf1[5] = bf0[5] + bf0[7];
  bf1[6] = bf0[4] - bf0[6];
  bf1[7] = bf0[5] - bf0[7];
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 4: output -> step; only entries 4..7 are transformed.
  stage++;
  cospi = cospi_arr(cos_bit);
  bf0 = output;
  bf1 = step;
  bf1[0] = bf0[0];
  bf1[1] = bf0[1];
  bf1[2] = bf0[2];
  bf1[3] = bf0[3];
  bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit);
  bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit);
  bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit);
  bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit);
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 5: step -> output; sums and differences at distance 4.
  stage++;
  bf0 = step;
  bf1 = output;
  bf1[0] = bf0[0] + bf0[4];
  bf1[1] = bf0[1] + bf0[5];
  bf1[2] = bf0[2] + bf0[6];
  bf1[3] = bf0[3] + bf0[7];
  bf1[4] = bf0[0] - bf0[4];
  bf1[5] = bf0[1] - bf0[5];
  bf1[6] = bf0[2] - bf0[6];
  bf1[7] = bf0[3] - bf0[7];
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 6: output -> step; every adjacent pair is combined.
  stage++;
  cospi = cospi_arr(cos_bit);
  bf0 = output;
  bf1 = step;
  bf1[0] = half_btf(cospi[4], bf0[0], cospi[60], bf0[1], cos_bit);
  bf1[1] = half_btf(cospi[60], bf0[0], -cospi[4], bf0[1], cos_bit);
  bf1[2] = half_btf(cospi[20], bf0[2], cospi[44], bf0[3], cos_bit);
  bf1[3] = half_btf(cospi[44], bf0[2], -cospi[20], bf0[3], cos_bit);
  bf1[4] = half_btf(cospi[36], bf0[4], cospi[28], bf0[5], cos_bit);
  bf1[5] = half_btf(cospi[28], bf0[4], -cospi[36], bf0[5], cos_bit);
  bf1[6] = half_btf(cospi[52], bf0[6], cospi[12], bf0[7], cos_bit);
  bf1[7] = half_btf(cospi[12], bf0[6], -cospi[52], bf0[7], cos_bit);
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 7: final reordering, step -> output.
  stage++;
  bf0 = step;
  bf1 = output;
  bf1[0] = bf0[1];
  bf1[1] = bf0[6];
  bf1[2] = bf0[3];
  bf1[3] = bf0[4];
  bf1[4] = bf0[5];
  bf1[5] = bf0[2];
  bf1[6] = bf0[7];
  bf1[7] = bf0[0];
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
}
848
av1_fadst16_new(const int32_t * input,int32_t * output,int8_t cos_bit,const int8_t * stage_range)849 void av1_fadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
850 const int8_t *stage_range) {
851 const int32_t size = 16;
852 const int32_t *cospi;
853
854 int32_t stage = 0;
855 int32_t *bf0, *bf1;
856 int32_t step[16];
857
858 // stage 0;
859 av1_range_check_buf(stage, input, input, size, stage_range[stage]);
860
861 // stage 1;
862 stage++;
863 assert(output != input);
864 bf1 = output;
865 bf1[0] = input[0];
866 bf1[1] = -input[15];
867 bf1[2] = -input[7];
868 bf1[3] = input[8];
869 bf1[4] = -input[3];
870 bf1[5] = input[12];
871 bf1[6] = input[4];
872 bf1[7] = -input[11];
873 bf1[8] = -input[1];
874 bf1[9] = input[14];
875 bf1[10] = input[6];
876 bf1[11] = -input[9];
877 bf1[12] = input[2];
878 bf1[13] = -input[13];
879 bf1[14] = -input[5];
880 bf1[15] = input[10];
881 av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
882
883 // stage 2
884 stage++;
885 cospi = cospi_arr(cos_bit);
886 bf0 = output;
887 bf1 = step;
888 bf1[0] = bf0[0];
889 bf1[1] = bf0[1];
890 bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit);
891 bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit);
892 bf1[4] = bf0[4];
893 bf1[5] = bf0[5];
894 bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit);
895 bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit);
896 bf1[8] = bf0[8];
897 bf1[9] = bf0[9];
898 bf1[10] = half_btf(cospi[32], bf0[10], cospi[32], bf0[11], cos_bit);
899 bf1[11] = half_btf(cospi[32], bf0[10], -cospi[32], bf0[11], cos_bit);
900 bf1[12] = bf0[12];
901 bf1[13] = bf0[13];
902 bf1[14] = half_btf(cospi[32], bf0[14], cospi[32], bf0[15], cos_bit);
903 bf1[15] = half_btf(cospi[32], bf0[14], -cospi[32], bf0[15], cos_bit);
904 av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
905
906 // stage 3
907 stage++;
908 bf0 = step;
909 bf1 = output;
910 bf1[0] = bf0[0] + bf0[2];
911 bf1[1] = bf0[1] + bf0[3];
912 bf1[2] = bf0[0] - bf0[2];
913 bf1[3] = bf0[1] - bf0[3];
914 bf1[4] = bf0[4] + bf0[6];
915 bf1[5] = bf0[5] + bf0[7];
916 bf1[6] = bf0[4] - bf0[6];
917 bf1[7] = bf0[5] - bf0[7];
918 bf1[8] = bf0[8] + bf0[10];
919 bf1[9] = bf0[9] + bf0[11];
920 bf1[10] = bf0[8] - bf0[10];
921 bf1[11] = bf0[9] - bf0[11];
922 bf1[12] = bf0[12] + bf0[14];
923 bf1[13] = bf0[13] + bf0[15];
924 bf1[14] = bf0[12] - bf0[14];
925 bf1[15] = bf0[13] - bf0[15];
926 av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
927
928 // stage 4
929 stage++;
930 cospi = cospi_arr(cos_bit);
931 bf0 = output;
932 bf1 = step;
933 bf1[0] = bf0[0];
934 bf1[1] = bf0[1];
935 bf1[2] = bf0[2];
936 bf1[3] = bf0[3];
937 bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit);
938 bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit);
939 bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit);
940 bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit);
941 bf1[8] = bf0[8];
942 bf1[9] = bf0[9];
943 bf1[10] = bf0[10];
944 bf1[11] = bf0[11];
945 bf1[12] = half_btf(cospi[16], bf0[12], cospi[48], bf0[13], cos_bit);
946 bf1[13] = half_btf(cospi[48], bf0[12], -cospi[16], bf0[13], cos_bit);
947 bf1[14] = half_btf(-cospi[48], bf0[14], cospi[16], bf0[15], cos_bit);
948 bf1[15] = half_btf(cospi[16], bf0[14], cospi[48], bf0[15], cos_bit);
949 av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
950
951 // stage 5
952 stage++;
953 bf0 = step;
954 bf1 = output;
955 bf1[0] = bf0[0] + bf0[4];
956 bf1[1] = bf0[1] + bf0[5];
957 bf1[2] = bf0[2] + bf0[6];
958 bf1[3] = bf0[3] + bf0[7];
959 bf1[4] = bf0[0] - bf0[4];
960 bf1[5] = bf0[1] - bf0[5];
961 bf1[6] = bf0[2] - bf0[6];
962 bf1[7] = bf0[3] - bf0[7];
963 bf1[8] = bf0[8] + bf0[12];
964 bf1[9] = bf0[9] + bf0[13];
965 bf1[10] = bf0[10] + bf0[14];
966 bf1[11] = bf0[11] + bf0[15];
967 bf1[12] = bf0[8] - bf0[12];
968 bf1[13] = bf0[9] - bf0[13];
969 bf1[14] = bf0[10] - bf0[14];
970 bf1[15] = bf0[11] - bf0[15];
971 av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
972
973 // stage 6
974 stage++;
975 cospi = cospi_arr(cos_bit);
976 bf0 = output;
977 bf1 = step;
978 bf1[0] = bf0[0];
979 bf1[1] = bf0[1];
980 bf1[2] = bf0[2];
981 bf1[3] = bf0[3];
982 bf1[4] = bf0[4];
983 bf1[5] = bf0[5];
984 bf1[6] = bf0[6];
985 bf1[7] = bf0[7];
986 bf1[8] = half_btf(cospi[8], bf0[8], cospi[56], bf0[9], cos_bit);
987 bf1[9] = half_btf(cospi[56], bf0[8], -cospi[8], bf0[9], cos_bit);
988 bf1[10] = half_btf(cospi[40], bf0[10], cospi[24], bf0[11], cos_bit);
989 bf1[11] = half_btf(cospi[24], bf0[10], -cospi[40], bf0[11], cos_bit);
990 bf1[12] = half_btf(-cospi[56], bf0[12], cospi[8], bf0[13], cos_bit);
991 bf1[13] = half_btf(cospi[8], bf0[12], cospi[56], bf0[13], cos_bit);
992 bf1[14] = half_btf(-cospi[24], bf0[14], cospi[40], bf0[15], cos_bit);
993 bf1[15] = half_btf(cospi[40], bf0[14], cospi[24], bf0[15], cos_bit);
994 av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
995
996 // stage 7
997 stage++;
998 bf0 = step;
999 bf1 = output;
1000 bf1[0] = bf0[0] + bf0[8];
1001 bf1[1] = bf0[1] + bf0[9];
1002 bf1[2] = bf0[2] + bf0[10];
1003 bf1[3] = bf0[3] + bf0[11];
1004 bf1[4] = bf0[4] + bf0[12];
1005 bf1[5] = bf0[5] + bf0[13];
1006 bf1[6] = bf0[6] + bf0[14];
1007 bf1[7] = bf0[7] + bf0[15];
1008 bf1[8] = bf0[0] - bf0[8];
1009 bf1[9] = bf0[1] - bf0[9];
1010 bf1[10] = bf0[2] - bf0[10];
1011 bf1[11] = bf0[3] - bf0[11];
1012 bf1[12] = bf0[4] - bf0[12];
1013 bf1[13] = bf0[5] - bf0[13];
1014 bf1[14] = bf0[6] - bf0[14];
1015 bf1[15] = bf0[7] - bf0[15];
1016 av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1017
1018 // stage 8
1019 stage++;
1020 cospi = cospi_arr(cos_bit);
1021 bf0 = output;
1022 bf1 = step;
1023 bf1[0] = half_btf(cospi[2], bf0[0], cospi[62], bf0[1], cos_bit);
1024 bf1[1] = half_btf(cospi[62], bf0[0], -cospi[2], bf0[1], cos_bit);
1025 bf1[2] = half_btf(cospi[10], bf0[2], cospi[54], bf0[3], cos_bit);
1026 bf1[3] = half_btf(cospi[54], bf0[2], -cospi[10], bf0[3], cos_bit);
1027 bf1[4] = half_btf(cospi[18], bf0[4], cospi[46], bf0[5], cos_bit);
1028 bf1[5] = half_btf(cospi[46], bf0[4], -cospi[18], bf0[5], cos_bit);
1029 bf1[6] = half_btf(cospi[26], bf0[6], cospi[38], bf0[7], cos_bit);
1030 bf1[7] = half_btf(cospi[38], bf0[6], -cospi[26], bf0[7], cos_bit);
1031 bf1[8] = half_btf(cospi[34], bf0[8], cospi[30], bf0[9], cos_bit);
1032 bf1[9] = half_btf(cospi[30], bf0[8], -cospi[34], bf0[9], cos_bit);
1033 bf1[10] = half_btf(cospi[42], bf0[10], cospi[22], bf0[11], cos_bit);
1034 bf1[11] = half_btf(cospi[22], bf0[10], -cospi[42], bf0[11], cos_bit);
1035 bf1[12] = half_btf(cospi[50], bf0[12], cospi[14], bf0[13], cos_bit);
1036 bf1[13] = half_btf(cospi[14], bf0[12], -cospi[50], bf0[13], cos_bit);
1037 bf1[14] = half_btf(cospi[58], bf0[14], cospi[6], bf0[15], cos_bit);
1038 bf1[15] = half_btf(cospi[6], bf0[14], -cospi[58], bf0[15], cos_bit);
1039 av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1040
1041 // stage 9
1042 stage++;
1043 bf0 = step;
1044 bf1 = output;
1045 bf1[0] = bf0[1];
1046 bf1[1] = bf0[14];
1047 bf1[2] = bf0[3];
1048 bf1[3] = bf0[12];
1049 bf1[4] = bf0[5];
1050 bf1[5] = bf0[10];
1051 bf1[6] = bf0[7];
1052 bf1[7] = bf0[8];
1053 bf1[8] = bf0[9];
1054 bf1[9] = bf0[6];
1055 bf1[10] = bf0[11];
1056 bf1[11] = bf0[4];
1057 bf1[12] = bf0[13];
1058 bf1[13] = bf0[2];
1059 bf1[14] = bf0[15];
1060 bf1[15] = bf0[0];
1061 av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1062 }
1063
av1_fidentity4_c(const int32_t * input,int32_t * output,int8_t cos_bit,const int8_t * stage_range)1064 void av1_fidentity4_c(const int32_t *input, int32_t *output, int8_t cos_bit,
1065 const int8_t *stage_range) {
1066 (void)cos_bit;
1067 for (int i = 0; i < 4; ++i)
1068 output[i] = round_shift((int64_t)input[i] * NewSqrt2, NewSqrt2Bits);
1069 assert(stage_range[0] + NewSqrt2Bits <= 32);
1070 av1_range_check_buf(0, input, output, 4, stage_range[0]);
1071 }
1072
// 8-point forward identity transform.
//
// Writes twice each input coefficient to the same index of `output`, then
// hands the result to av1_range_check_buf() as stage 0.
//
//   input       - 8 source coefficients (read only).
//   output      - receives the 8 doubled coefficients.
//   cos_bit     - not used by this transform.
//   stage_range - only element 0 is read; it is passed to
//                 av1_range_check_buf().
void av1_fidentity8_c(const int32_t *input, int32_t *output, int8_t cos_bit,
                      const int8_t *stage_range) {
  const int32_t size = 8;
  int32_t idx;

  (void)cos_bit;

  for (idx = 0; idx < size; idx++) {
    output[idx] = 2 * input[idx];
  }

  av1_range_check_buf(0, input, output, size, stage_range[0]);
}
1079
av1_fidentity16_c(const int32_t * input,int32_t * output,int8_t cos_bit,const int8_t * stage_range)1080 void av1_fidentity16_c(const int32_t *input, int32_t *output, int8_t cos_bit,
1081 const int8_t *stage_range) {
1082 (void)cos_bit;
1083 for (int i = 0; i < 16; ++i)
1084 output[i] = round_shift((int64_t)input[i] * 2 * NewSqrt2, NewSqrt2Bits);
1085 assert(stage_range[0] + NewSqrt2Bits <= 32);
1086 av1_range_check_buf(0, input, output, 16, stage_range[0]);
1087 }
1088
// 32-point forward identity transform.
//
// Writes four times each input coefficient to the same index of `output`,
// then hands the result to av1_range_check_buf() as stage 0.
//
//   input       - 32 source coefficients (read only).
//   output      - receives the 32 scaled coefficients.
//   cos_bit     - not used by this transform.
//   stage_range - only element 0 is read; it is passed to
//                 av1_range_check_buf().
void av1_fidentity32_c(const int32_t *input, int32_t *output, int8_t cos_bit,
                       const int8_t *stage_range) {
  const int32_t size = 32;
  int32_t idx;

  (void)cos_bit;

  for (idx = 0; idx < size; idx++) {
    output[idx] = 4 * input[idx];
  }

  av1_range_check_buf(0, input, output, size, stage_range[0]);
}
1095
av1_fdct64_new(const int32_t * input,int32_t * output,int8_t cos_bit,const int8_t * stage_range)1096 void av1_fdct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
1097 const int8_t *stage_range) {
1098 const int32_t size = 64;
1099 const int32_t *cospi;
1100
1101 int32_t stage = 0;
1102 int32_t *bf0, *bf1;
1103 int32_t step[64];
1104
1105 // stage 0;
1106 av1_range_check_buf(stage, input, input, size, stage_range[stage]);
1107
1108 // stage 1;
1109 stage++;
1110 bf1 = output;
1111 bf1[0] = input[0] + input[63];
1112 bf1[1] = input[1] + input[62];
1113 bf1[2] = input[2] + input[61];
1114 bf1[3] = input[3] + input[60];
1115 bf1[4] = input[4] + input[59];
1116 bf1[5] = input[5] + input[58];
1117 bf1[6] = input[6] + input[57];
1118 bf1[7] = input[7] + input[56];
1119 bf1[8] = input[8] + input[55];
1120 bf1[9] = input[9] + input[54];
1121 bf1[10] = input[10] + input[53];
1122 bf1[11] = input[11] + input[52];
1123 bf1[12] = input[12] + input[51];
1124 bf1[13] = input[13] + input[50];
1125 bf1[14] = input[14] + input[49];
1126 bf1[15] = input[15] + input[48];
1127 bf1[16] = input[16] + input[47];
1128 bf1[17] = input[17] + input[46];
1129 bf1[18] = input[18] + input[45];
1130 bf1[19] = input[19] + input[44];
1131 bf1[20] = input[20] + input[43];
1132 bf1[21] = input[21] + input[42];
1133 bf1[22] = input[22] + input[41];
1134 bf1[23] = input[23] + input[40];
1135 bf1[24] = input[24] + input[39];
1136 bf1[25] = input[25] + input[38];
1137 bf1[26] = input[26] + input[37];
1138 bf1[27] = input[27] + input[36];
1139 bf1[28] = input[28] + input[35];
1140 bf1[29] = input[29] + input[34];
1141 bf1[30] = input[30] + input[33];
1142 bf1[31] = input[31] + input[32];
1143 bf1[32] = -input[32] + input[31];
1144 bf1[33] = -input[33] + input[30];
1145 bf1[34] = -input[34] + input[29];
1146 bf1[35] = -input[35] + input[28];
1147 bf1[36] = -input[36] + input[27];
1148 bf1[37] = -input[37] + input[26];
1149 bf1[38] = -input[38] + input[25];
1150 bf1[39] = -input[39] + input[24];
1151 bf1[40] = -input[40] + input[23];
1152 bf1[41] = -input[41] + input[22];
1153 bf1[42] = -input[42] + input[21];
1154 bf1[43] = -input[43] + input[20];
1155 bf1[44] = -input[44] + input[19];
1156 bf1[45] = -input[45] + input[18];
1157 bf1[46] = -input[46] + input[17];
1158 bf1[47] = -input[47] + input[16];
1159 bf1[48] = -input[48] + input[15];
1160 bf1[49] = -input[49] + input[14];
1161 bf1[50] = -input[50] + input[13];
1162 bf1[51] = -input[51] + input[12];
1163 bf1[52] = -input[52] + input[11];
1164 bf1[53] = -input[53] + input[10];
1165 bf1[54] = -input[54] + input[9];
1166 bf1[55] = -input[55] + input[8];
1167 bf1[56] = -input[56] + input[7];
1168 bf1[57] = -input[57] + input[6];
1169 bf1[58] = -input[58] + input[5];
1170 bf1[59] = -input[59] + input[4];
1171 bf1[60] = -input[60] + input[3];
1172 bf1[61] = -input[61] + input[2];
1173 bf1[62] = -input[62] + input[1];
1174 bf1[63] = -input[63] + input[0];
1175 av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1176
1177 // stage 2
1178 stage++;
1179 cospi = cospi_arr(cos_bit);
1180 bf0 = output;
1181 bf1 = step;
1182 bf1[0] = bf0[0] + bf0[31];
1183 bf1[1] = bf0[1] + bf0[30];
1184 bf1[2] = bf0[2] + bf0[29];
1185 bf1[3] = bf0[3] + bf0[28];
1186 bf1[4] = bf0[4] + bf0[27];
1187 bf1[5] = bf0[5] + bf0[26];
1188 bf1[6] = bf0[6] + bf0[25];
1189 bf1[7] = bf0[7] + bf0[24];
1190 bf1[8] = bf0[8] + bf0[23];
1191 bf1[9] = bf0[9] + bf0[22];
1192 bf1[10] = bf0[10] + bf0[21];
1193 bf1[11] = bf0[11] + bf0[20];
1194 bf1[12] = bf0[12] + bf0[19];
1195 bf1[13] = bf0[13] + bf0[18];
1196 bf1[14] = bf0[14] + bf0[17];
1197 bf1[15] = bf0[15] + bf0[16];
1198 bf1[16] = -bf0[16] + bf0[15];
1199 bf1[17] = -bf0[17] + bf0[14];
1200 bf1[18] = -bf0[18] + bf0[13];
1201 bf1[19] = -bf0[19] + bf0[12];
1202 bf1[20] = -bf0[20] + bf0[11];
1203 bf1[21] = -bf0[21] + bf0[10];
1204 bf1[22] = -bf0[22] + bf0[9];
1205 bf1[23] = -bf0[23] + bf0[8];
1206 bf1[24] = -bf0[24] + bf0[7];
1207 bf1[25] = -bf0[25] + bf0[6];
1208 bf1[26] = -bf0[26] + bf0[5];
1209 bf1[27] = -bf0[27] + bf0[4];
1210 bf1[28] = -bf0[28] + bf0[3];
1211 bf1[29] = -bf0[29] + bf0[2];
1212 bf1[30] = -bf0[30] + bf0[1];
1213 bf1[31] = -bf0[31] + bf0[0];
1214 bf1[32] = bf0[32];
1215 bf1[33] = bf0[33];
1216 bf1[34] = bf0[34];
1217 bf1[35] = bf0[35];
1218 bf1[36] = bf0[36];
1219 bf1[37] = bf0[37];
1220 bf1[38] = bf0[38];
1221 bf1[39] = bf0[39];
1222 bf1[40] = half_btf(-cospi[32], bf0[40], cospi[32], bf0[55], cos_bit);
1223 bf1[41] = half_btf(-cospi[32], bf0[41], cospi[32], bf0[54], cos_bit);
1224 bf1[42] = half_btf(-cospi[32], bf0[42], cospi[32], bf0[53], cos_bit);
1225 bf1[43] = half_btf(-cospi[32], bf0[43], cospi[32], bf0[52], cos_bit);
1226 bf1[44] = half_btf(-cospi[32], bf0[44], cospi[32], bf0[51], cos_bit);
1227 bf1[45] = half_btf(-cospi[32], bf0[45], cospi[32], bf0[50], cos_bit);
1228 bf1[46] = half_btf(-cospi[32], bf0[46], cospi[32], bf0[49], cos_bit);
1229 bf1[47] = half_btf(-cospi[32], bf0[47], cospi[32], bf0[48], cos_bit);
1230 bf1[48] = half_btf(cospi[32], bf0[48], cospi[32], bf0[47], cos_bit);
1231 bf1[49] = half_btf(cospi[32], bf0[49], cospi[32], bf0[46], cos_bit);
1232 bf1[50] = half_btf(cospi[32], bf0[50], cospi[32], bf0[45], cos_bit);
1233 bf1[51] = half_btf(cospi[32], bf0[51], cospi[32], bf0[44], cos_bit);
1234 bf1[52] = half_btf(cospi[32], bf0[52], cospi[32], bf0[43], cos_bit);
1235 bf1[53] = half_btf(cospi[32], bf0[53], cospi[32], bf0[42], cos_bit);
1236 bf1[54] = half_btf(cospi[32], bf0[54], cospi[32], bf0[41], cos_bit);
1237 bf1[55] = half_btf(cospi[32], bf0[55], cospi[32], bf0[40], cos_bit);
1238 bf1[56] = bf0[56];
1239 bf1[57] = bf0[57];
1240 bf1[58] = bf0[58];
1241 bf1[59] = bf0[59];
1242 bf1[60] = bf0[60];
1243 bf1[61] = bf0[61];
1244 bf1[62] = bf0[62];
1245 bf1[63] = bf0[63];
1246 av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1247
1248 // stage 3
1249 stage++;
1250 cospi = cospi_arr(cos_bit);
1251 bf0 = step;
1252 bf1 = output;
1253 bf1[0] = bf0[0] + bf0[15];
1254 bf1[1] = bf0[1] + bf0[14];
1255 bf1[2] = bf0[2] + bf0[13];
1256 bf1[3] = bf0[3] + bf0[12];
1257 bf1[4] = bf0[4] + bf0[11];
1258 bf1[5] = bf0[5] + bf0[10];
1259 bf1[6] = bf0[6] + bf0[9];
1260 bf1[7] = bf0[7] + bf0[8];
1261 bf1[8] = -bf0[8] + bf0[7];
1262 bf1[9] = -bf0[9] + bf0[6];
1263 bf1[10] = -bf0[10] + bf0[5];
1264 bf1[11] = -bf0[11] + bf0[4];
1265 bf1[12] = -bf0[12] + bf0[3];
1266 bf1[13] = -bf0[13] + bf0[2];
1267 bf1[14] = -bf0[14] + bf0[1];
1268 bf1[15] = -bf0[15] + bf0[0];
1269 bf1[16] = bf0[16];
1270 bf1[17] = bf0[17];
1271 bf1[18] = bf0[18];
1272 bf1[19] = bf0[19];
1273 bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit);
1274 bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit);
1275 bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit);
1276 bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit);
1277 bf1[24] = half_btf(cospi[32], bf0[24], cospi[32], bf0[23], cos_bit);
1278 bf1[25] = half_btf(cospi[32], bf0[25], cospi[32], bf0[22], cos_bit);
1279 bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[21], cos_bit);
1280 bf1[27] = half_btf(cospi[32], bf0[27], cospi[32], bf0[20], cos_bit);
1281 bf1[28] = bf0[28];
1282 bf1[29] = bf0[29];
1283 bf1[30] = bf0[30];
1284 bf1[31] = bf0[31];
1285 bf1[32] = bf0[32] + bf0[47];
1286 bf1[33] = bf0[33] + bf0[46];
1287 bf1[34] = bf0[34] + bf0[45];
1288 bf1[35] = bf0[35] + bf0[44];
1289 bf1[36] = bf0[36] + bf0[43];
1290 bf1[37] = bf0[37] + bf0[42];
1291 bf1[38] = bf0[38] + bf0[41];
1292 bf1[39] = bf0[39] + bf0[40];
1293 bf1[40] = -bf0[40] + bf0[39];
1294 bf1[41] = -bf0[41] + bf0[38];
1295 bf1[42] = -bf0[42] + bf0[37];
1296 bf1[43] = -bf0[43] + bf0[36];
1297 bf1[44] = -bf0[44] + bf0[35];
1298 bf1[45] = -bf0[45] + bf0[34];
1299 bf1[46] = -bf0[46] + bf0[33];
1300 bf1[47] = -bf0[47] + bf0[32];
1301 bf1[48] = -bf0[48] + bf0[63];
1302 bf1[49] = -bf0[49] + bf0[62];
1303 bf1[50] = -bf0[50] + bf0[61];
1304 bf1[51] = -bf0[51] + bf0[60];
1305 bf1[52] = -bf0[52] + bf0[59];
1306 bf1[53] = -bf0[53] + bf0[58];
1307 bf1[54] = -bf0[54] + bf0[57];
1308 bf1[55] = -bf0[55] + bf0[56];
1309 bf1[56] = bf0[56] + bf0[55];
1310 bf1[57] = bf0[57] + bf0[54];
1311 bf1[58] = bf0[58] + bf0[53];
1312 bf1[59] = bf0[59] + bf0[52];
1313 bf1[60] = bf0[60] + bf0[51];
1314 bf1[61] = bf0[61] + bf0[50];
1315 bf1[62] = bf0[62] + bf0[49];
1316 bf1[63] = bf0[63] + bf0[48];
1317 av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1318
1319 // stage 4
1320 stage++;
1321 cospi = cospi_arr(cos_bit);
1322 bf0 = output;
1323 bf1 = step;
1324 bf1[0] = bf0[0] + bf0[7];
1325 bf1[1] = bf0[1] + bf0[6];
1326 bf1[2] = bf0[2] + bf0[5];
1327 bf1[3] = bf0[3] + bf0[4];
1328 bf1[4] = -bf0[4] + bf0[3];
1329 bf1[5] = -bf0[5] + bf0[2];
1330 bf1[6] = -bf0[6] + bf0[1];
1331 bf1[7] = -bf0[7] + bf0[0];
1332 bf1[8] = bf0[8];
1333 bf1[9] = bf0[9];
1334 bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
1335 bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
1336 bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit);
1337 bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit);
1338 bf1[14] = bf0[14];
1339 bf1[15] = bf0[15];
1340 bf1[16] = bf0[16] + bf0[23];
1341 bf1[17] = bf0[17] + bf0[22];
1342 bf1[18] = bf0[18] + bf0[21];
1343 bf1[19] = bf0[19] + bf0[20];
1344 bf1[20] = -bf0[20] + bf0[19];
1345 bf1[21] = -bf0[21] + bf0[18];
1346 bf1[22] = -bf0[22] + bf0[17];
1347 bf1[23] = -bf0[23] + bf0[16];
1348 bf1[24] = -bf0[24] + bf0[31];
1349 bf1[25] = -bf0[25] + bf0[30];
1350 bf1[26] = -bf0[26] + bf0[29];
1351 bf1[27] = -bf0[27] + bf0[28];
1352 bf1[28] = bf0[28] + bf0[27];
1353 bf1[29] = bf0[29] + bf0[26];
1354 bf1[30] = bf0[30] + bf0[25];
1355 bf1[31] = bf0[31] + bf0[24];
1356 bf1[32] = bf0[32];
1357 bf1[33] = bf0[33];
1358 bf1[34] = bf0[34];
1359 bf1[35] = bf0[35];
1360 bf1[36] = half_btf(-cospi[16], bf0[36], cospi[48], bf0[59], cos_bit);
1361 bf1[37] = half_btf(-cospi[16], bf0[37], cospi[48], bf0[58], cos_bit);
1362 bf1[38] = half_btf(-cospi[16], bf0[38], cospi[48], bf0[57], cos_bit);
1363 bf1[39] = half_btf(-cospi[16], bf0[39], cospi[48], bf0[56], cos_bit);
1364 bf1[40] = half_btf(-cospi[48], bf0[40], -cospi[16], bf0[55], cos_bit);
1365 bf1[41] = half_btf(-cospi[48], bf0[41], -cospi[16], bf0[54], cos_bit);
1366 bf1[42] = half_btf(-cospi[48], bf0[42], -cospi[16], bf0[53], cos_bit);
1367 bf1[43] = half_btf(-cospi[48], bf0[43], -cospi[16], bf0[52], cos_bit);
1368 bf1[44] = bf0[44];
1369 bf1[45] = bf0[45];
1370 bf1[46] = bf0[46];
1371 bf1[47] = bf0[47];
1372 bf1[48] = bf0[48];
1373 bf1[49] = bf0[49];
1374 bf1[50] = bf0[50];
1375 bf1[51] = bf0[51];
1376 bf1[52] = half_btf(cospi[48], bf0[52], -cospi[16], bf0[43], cos_bit);
1377 bf1[53] = half_btf(cospi[48], bf0[53], -cospi[16], bf0[42], cos_bit);
1378 bf1[54] = half_btf(cospi[48], bf0[54], -cospi[16], bf0[41], cos_bit);
1379 bf1[55] = half_btf(cospi[48], bf0[55], -cospi[16], bf0[40], cos_bit);
1380 bf1[56] = half_btf(cospi[16], bf0[56], cospi[48], bf0[39], cos_bit);
1381 bf1[57] = half_btf(cospi[16], bf0[57], cospi[48], bf0[38], cos_bit);
1382 bf1[58] = half_btf(cospi[16], bf0[58], cospi[48], bf0[37], cos_bit);
1383 bf1[59] = half_btf(cospi[16], bf0[59], cospi[48], bf0[36], cos_bit);
1384 bf1[60] = bf0[60];
1385 bf1[61] = bf0[61];
1386 bf1[62] = bf0[62];
1387 bf1[63] = bf0[63];
1388 av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1389
1390 // stage 5
1391 stage++;
1392 cospi = cospi_arr(cos_bit);
1393 bf0 = step;
1394 bf1 = output;
1395 bf1[0] = bf0[0] + bf0[3];
1396 bf1[1] = bf0[1] + bf0[2];
1397 bf1[2] = -bf0[2] + bf0[1];
1398 bf1[3] = -bf0[3] + bf0[0];
1399 bf1[4] = bf0[4];
1400 bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
1401 bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit);
1402 bf1[7] = bf0[7];
1403 bf1[8] = bf0[8] + bf0[11];
1404 bf1[9] = bf0[9] + bf0[10];
1405 bf1[10] = -bf0[10] + bf0[9];
1406 bf1[11] = -bf0[11] + bf0[8];
1407 bf1[12] = -bf0[12] + bf0[15];
1408 bf1[13] = -bf0[13] + bf0[14];
1409 bf1[14] = bf0[14] + bf0[13];
1410 bf1[15] = bf0[15] + bf0[12];
1411 bf1[16] = bf0[16];
1412 bf1[17] = bf0[17];
1413 bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit);
1414 bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit);
1415 bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit);
1416 bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit);
1417 bf1[22] = bf0[22];
1418 bf1[23] = bf0[23];
1419 bf1[24] = bf0[24];
1420 bf1[25] = bf0[25];
1421 bf1[26] = half_btf(cospi[48], bf0[26], -cospi[16], bf0[21], cos_bit);
1422 bf1[27] = half_btf(cospi[48], bf0[27], -cospi[16], bf0[20], cos_bit);
1423 bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[19], cos_bit);
1424 bf1[29] = half_btf(cospi[16], bf0[29], cospi[48], bf0[18], cos_bit);
1425 bf1[30] = bf0[30];
1426 bf1[31] = bf0[31];
1427 bf1[32] = bf0[32] + bf0[39];
1428 bf1[33] = bf0[33] + bf0[38];
1429 bf1[34] = bf0[34] + bf0[37];
1430 bf1[35] = bf0[35] + bf0[36];
1431 bf1[36] = -bf0[36] + bf0[35];
1432 bf1[37] = -bf0[37] + bf0[34];
1433 bf1[38] = -bf0[38] + bf0[33];
1434 bf1[39] = -bf0[39] + bf0[32];
1435 bf1[40] = -bf0[40] + bf0[47];
1436 bf1[41] = -bf0[41] + bf0[46];
1437 bf1[42] = -bf0[42] + bf0[45];
1438 bf1[43] = -bf0[43] + bf0[44];
1439 bf1[44] = bf0[44] + bf0[43];
1440 bf1[45] = bf0[45] + bf0[42];
1441 bf1[46] = bf0[46] + bf0[41];
1442 bf1[47] = bf0[47] + bf0[40];
1443 bf1[48] = bf0[48] + bf0[55];
1444 bf1[49] = bf0[49] + bf0[54];
1445 bf1[50] = bf0[50] + bf0[53];
1446 bf1[51] = bf0[51] + bf0[52];
1447 bf1[52] = -bf0[52] + bf0[51];
1448 bf1[53] = -bf0[53] + bf0[50];
1449 bf1[54] = -bf0[54] + bf0[49];
1450 bf1[55] = -bf0[55] + bf0[48];
1451 bf1[56] = -bf0[56] + bf0[63];
1452 bf1[57] = -bf0[57] + bf0[62];
1453 bf1[58] = -bf0[58] + bf0[61];
1454 bf1[59] = -bf0[59] + bf0[60];
1455 bf1[60] = bf0[60] + bf0[59];
1456 bf1[61] = bf0[61] + bf0[58];
1457 bf1[62] = bf0[62] + bf0[57];
1458 bf1[63] = bf0[63] + bf0[56];
1459 av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1460
1461 // stage 6
1462 stage++;
1463 cospi = cospi_arr(cos_bit);
1464 bf0 = output;
1465 bf1 = step;
1466 bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
1467 bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit);
1468 bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit);
1469 bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit);
1470 bf1[4] = bf0[4] + bf0[5];
1471 bf1[5] = -bf0[5] + bf0[4];
1472 bf1[6] = -bf0[6] + bf0[7];
1473 bf1[7] = bf0[7] + bf0[6];
1474 bf1[8] = bf0[8];
1475 bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit);
1476 bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit);
1477 bf1[11] = bf0[11];
1478 bf1[12] = bf0[12];
1479 bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit);
1480 bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit);
1481 bf1[15] = bf0[15];
1482 bf1[16] = bf0[16] + bf0[19];
1483 bf1[17] = bf0[17] + bf0[18];
1484 bf1[18] = -bf0[18] + bf0[17];
1485 bf1[19] = -bf0[19] + bf0[16];
1486 bf1[20] = -bf0[20] + bf0[23];
1487 bf1[21] = -bf0[21] + bf0[22];
1488 bf1[22] = bf0[22] + bf0[21];
1489 bf1[23] = bf0[23] + bf0[20];
1490 bf1[24] = bf0[24] + bf0[27];
1491 bf1[25] = bf0[25] + bf0[26];
1492 bf1[26] = -bf0[26] + bf0[25];
1493 bf1[27] = -bf0[27] + bf0[24];
1494 bf1[28] = -bf0[28] + bf0[31];
1495 bf1[29] = -bf0[29] + bf0[30];
1496 bf1[30] = bf0[30] + bf0[29];
1497 bf1[31] = bf0[31] + bf0[28];
1498 bf1[32] = bf0[32];
1499 bf1[33] = bf0[33];
1500 bf1[34] = half_btf(-cospi[8], bf0[34], cospi[56], bf0[61], cos_bit);
1501 bf1[35] = half_btf(-cospi[8], bf0[35], cospi[56], bf0[60], cos_bit);
1502 bf1[36] = half_btf(-cospi[56], bf0[36], -cospi[8], bf0[59], cos_bit);
1503 bf1[37] = half_btf(-cospi[56], bf0[37], -cospi[8], bf0[58], cos_bit);
1504 bf1[38] = bf0[38];
1505 bf1[39] = bf0[39];
1506 bf1[40] = bf0[40];
1507 bf1[41] = bf0[41];
1508 bf1[42] = half_btf(-cospi[40], bf0[42], cospi[24], bf0[53], cos_bit);
1509 bf1[43] = half_btf(-cospi[40], bf0[43], cospi[24], bf0[52], cos_bit);
1510 bf1[44] = half_btf(-cospi[24], bf0[44], -cospi[40], bf0[51], cos_bit);
1511 bf1[45] = half_btf(-cospi[24], bf0[45], -cospi[40], bf0[50], cos_bit);
1512 bf1[46] = bf0[46];
1513 bf1[47] = bf0[47];
1514 bf1[48] = bf0[48];
1515 bf1[49] = bf0[49];
1516 bf1[50] = half_btf(cospi[24], bf0[50], -cospi[40], bf0[45], cos_bit);
1517 bf1[51] = half_btf(cospi[24], bf0[51], -cospi[40], bf0[44], cos_bit);
1518 bf1[52] = half_btf(cospi[40], bf0[52], cospi[24], bf0[43], cos_bit);
1519 bf1[53] = half_btf(cospi[40], bf0[53], cospi[24], bf0[42], cos_bit);
1520 bf1[54] = bf0[54];
1521 bf1[55] = bf0[55];
1522 bf1[56] = bf0[56];
1523 bf1[57] = bf0[57];
1524 bf1[58] = half_btf(cospi[56], bf0[58], -cospi[8], bf0[37], cos_bit);
1525 bf1[59] = half_btf(cospi[56], bf0[59], -cospi[8], bf0[36], cos_bit);
1526 bf1[60] = half_btf(cospi[8], bf0[60], cospi[56], bf0[35], cos_bit);
1527 bf1[61] = half_btf(cospi[8], bf0[61], cospi[56], bf0[34], cos_bit);
1528 bf1[62] = bf0[62];
1529 bf1[63] = bf0[63];
1530 av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1531
1532 // stage 7
1533 stage++;
1534 cospi = cospi_arr(cos_bit);
1535 bf0 = step;
1536 bf1 = output;
1537 bf1[0] = bf0[0];
1538 bf1[1] = bf0[1];
1539 bf1[2] = bf0[2];
1540 bf1[3] = bf0[3];
1541 bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit);
1542 bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit);
1543 bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit);
1544 bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit);
1545 bf1[8] = bf0[8] + bf0[9];
1546 bf1[9] = -bf0[9] + bf0[8];
1547 bf1[10] = -bf0[10] + bf0[11];
1548 bf1[11] = bf0[11] + bf0[10];
1549 bf1[12] = bf0[12] + bf0[13];
1550 bf1[13] = -bf0[13] + bf0[12];
1551 bf1[14] = -bf0[14] + bf0[15];
1552 bf1[15] = bf0[15] + bf0[14];
1553 bf1[16] = bf0[16];
1554 bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit);
1555 bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit);
1556 bf1[19] = bf0[19];
1557 bf1[20] = bf0[20];
1558 bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit);
1559 bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit);
1560 bf1[23] = bf0[23];
1561 bf1[24] = bf0[24];
1562 bf1[25] = half_btf(cospi[24], bf0[25], -cospi[40], bf0[22], cos_bit);
1563 bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[21], cos_bit);
1564 bf1[27] = bf0[27];
1565 bf1[28] = bf0[28];
1566 bf1[29] = half_btf(cospi[56], bf0[29], -cospi[8], bf0[18], cos_bit);
1567 bf1[30] = half_btf(cospi[8], bf0[30], cospi[56], bf0[17], cos_bit);
1568 bf1[31] = bf0[31];
1569 bf1[32] = bf0[32] + bf0[35];
1570 bf1[33] = bf0[33] + bf0[34];
1571 bf1[34] = -bf0[34] + bf0[33];
1572 bf1[35] = -bf0[35] + bf0[32];
1573 bf1[36] = -bf0[36] + bf0[39];
1574 bf1[37] = -bf0[37] + bf0[38];
1575 bf1[38] = bf0[38] + bf0[37];
1576 bf1[39] = bf0[39] + bf0[36];
1577 bf1[40] = bf0[40] + bf0[43];
1578 bf1[41] = bf0[41] + bf0[42];
1579 bf1[42] = -bf0[42] + bf0[41];
1580 bf1[43] = -bf0[43] + bf0[40];
1581 bf1[44] = -bf0[44] + bf0[47];
1582 bf1[45] = -bf0[45] + bf0[46];
1583 bf1[46] = bf0[46] + bf0[45];
1584 bf1[47] = bf0[47] + bf0[44];
1585 bf1[48] = bf0[48] + bf0[51];
1586 bf1[49] = bf0[49] + bf0[50];
1587 bf1[50] = -bf0[50] + bf0[49];
1588 bf1[51] = -bf0[51] + bf0[48];
1589 bf1[52] = -bf0[52] + bf0[55];
1590 bf1[53] = -bf0[53] + bf0[54];
1591 bf1[54] = bf0[54] + bf0[53];
1592 bf1[55] = bf0[55] + bf0[52];
1593 bf1[56] = bf0[56] + bf0[59];
1594 bf1[57] = bf0[57] + bf0[58];
1595 bf1[58] = -bf0[58] + bf0[57];
1596 bf1[59] = -bf0[59] + bf0[56];
1597 bf1[60] = -bf0[60] + bf0[63];
1598 bf1[61] = -bf0[61] + bf0[62];
1599 bf1[62] = bf0[62] + bf0[61];
1600 bf1[63] = bf0[63] + bf0[60];
1601 av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1602
1603 // stage 8
1604 stage++;
1605 cospi = cospi_arr(cos_bit);
1606 bf0 = output;
1607 bf1 = step;
1608 bf1[0] = bf0[0];
1609 bf1[1] = bf0[1];
1610 bf1[2] = bf0[2];
1611 bf1[3] = bf0[3];
1612 bf1[4] = bf0[4];
1613 bf1[5] = bf0[5];
1614 bf1[6] = bf0[6];
1615 bf1[7] = bf0[7];
1616 bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit);
1617 bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit);
1618 bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit);
1619 bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit);
1620 bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit);
1621 bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit);
1622 bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit);
1623 bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit);
1624 bf1[16] = bf0[16] + bf0[17];
1625 bf1[17] = -bf0[17] + bf0[16];
1626 bf1[18] = -bf0[18] + bf0[19];
1627 bf1[19] = bf0[19] + bf0[18];
1628 bf1[20] = bf0[20] + bf0[21];
1629 bf1[21] = -bf0[21] + bf0[20];
1630 bf1[22] = -bf0[22] + bf0[23];
1631 bf1[23] = bf0[23] + bf0[22];
1632 bf1[24] = bf0[24] + bf0[25];
1633 bf1[25] = -bf0[25] + bf0[24];
1634 bf1[26] = -bf0[26] + bf0[27];
1635 bf1[27] = bf0[27] + bf0[26];
1636 bf1[28] = bf0[28] + bf0[29];
1637 bf1[29] = -bf0[29] + bf0[28];
1638 bf1[30] = -bf0[30] + bf0[31];
1639 bf1[31] = bf0[31] + bf0[30];
1640 bf1[32] = bf0[32];
1641 bf1[33] = half_btf(-cospi[4], bf0[33], cospi[60], bf0[62], cos_bit);
1642 bf1[34] = half_btf(-cospi[60], bf0[34], -cospi[4], bf0[61], cos_bit);
1643 bf1[35] = bf0[35];
1644 bf1[36] = bf0[36];
1645 bf1[37] = half_btf(-cospi[36], bf0[37], cospi[28], bf0[58], cos_bit);
1646 bf1[38] = half_btf(-cospi[28], bf0[38], -cospi[36], bf0[57], cos_bit);
1647 bf1[39] = bf0[39];
1648 bf1[40] = bf0[40];
1649 bf1[41] = half_btf(-cospi[20], bf0[41], cospi[44], bf0[54], cos_bit);
1650 bf1[42] = half_btf(-cospi[44], bf0[42], -cospi[20], bf0[53], cos_bit);
1651 bf1[43] = bf0[43];
1652 bf1[44] = bf0[44];
1653 bf1[45] = half_btf(-cospi[52], bf0[45], cospi[12], bf0[50], cos_bit);
1654 bf1[46] = half_btf(-cospi[12], bf0[46], -cospi[52], bf0[49], cos_bit);
1655 bf1[47] = bf0[47];
1656 bf1[48] = bf0[48];
1657 bf1[49] = half_btf(cospi[12], bf0[49], -cospi[52], bf0[46], cos_bit);
1658 bf1[50] = half_btf(cospi[52], bf0[50], cospi[12], bf0[45], cos_bit);
1659 bf1[51] = bf0[51];
1660 bf1[52] = bf0[52];
1661 bf1[53] = half_btf(cospi[44], bf0[53], -cospi[20], bf0[42], cos_bit);
1662 bf1[54] = half_btf(cospi[20], bf0[54], cospi[44], bf0[41], cos_bit);
1663 bf1[55] = bf0[55];
1664 bf1[56] = bf0[56];
1665 bf1[57] = half_btf(cospi[28], bf0[57], -cospi[36], bf0[38], cos_bit);
1666 bf1[58] = half_btf(cospi[36], bf0[58], cospi[28], bf0[37], cos_bit);
1667 bf1[59] = bf0[59];
1668 bf1[60] = bf0[60];
1669 bf1[61] = half_btf(cospi[60], bf0[61], -cospi[4], bf0[34], cos_bit);
1670 bf1[62] = half_btf(cospi[4], bf0[62], cospi[60], bf0[33], cos_bit);
1671 bf1[63] = bf0[63];
1672 av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1673
1674 // stage 9
1675 stage++;
1676 cospi = cospi_arr(cos_bit);
1677 bf0 = step;
1678 bf1 = output;
1679 bf1[0] = bf0[0];
1680 bf1[1] = bf0[1];
1681 bf1[2] = bf0[2];
1682 bf1[3] = bf0[3];
1683 bf1[4] = bf0[4];
1684 bf1[5] = bf0[5];
1685 bf1[6] = bf0[6];
1686 bf1[7] = bf0[7];
1687 bf1[8] = bf0[8];
1688 bf1[9] = bf0[9];
1689 bf1[10] = bf0[10];
1690 bf1[11] = bf0[11];
1691 bf1[12] = bf0[12];
1692 bf1[13] = bf0[13];
1693 bf1[14] = bf0[14];
1694 bf1[15] = bf0[15];
1695 bf1[16] = half_btf(cospi[62], bf0[16], cospi[2], bf0[31], cos_bit);
1696 bf1[17] = half_btf(cospi[30], bf0[17], cospi[34], bf0[30], cos_bit);
1697 bf1[18] = half_btf(cospi[46], bf0[18], cospi[18], bf0[29], cos_bit);
1698 bf1[19] = half_btf(cospi[14], bf0[19], cospi[50], bf0[28], cos_bit);
1699 bf1[20] = half_btf(cospi[54], bf0[20], cospi[10], bf0[27], cos_bit);
1700 bf1[21] = half_btf(cospi[22], bf0[21], cospi[42], bf0[26], cos_bit);
1701 bf1[22] = half_btf(cospi[38], bf0[22], cospi[26], bf0[25], cos_bit);
1702 bf1[23] = half_btf(cospi[6], bf0[23], cospi[58], bf0[24], cos_bit);
1703 bf1[24] = half_btf(cospi[6], bf0[24], -cospi[58], bf0[23], cos_bit);
1704 bf1[25] = half_btf(cospi[38], bf0[25], -cospi[26], bf0[22], cos_bit);
1705 bf1[26] = half_btf(cospi[22], bf0[26], -cospi[42], bf0[21], cos_bit);
1706 bf1[27] = half_btf(cospi[54], bf0[27], -cospi[10], bf0[20], cos_bit);
1707 bf1[28] = half_btf(cospi[14], bf0[28], -cospi[50], bf0[19], cos_bit);
1708 bf1[29] = half_btf(cospi[46], bf0[29], -cospi[18], bf0[18], cos_bit);
1709 bf1[30] = half_btf(cospi[30], bf0[30], -cospi[34], bf0[17], cos_bit);
1710 bf1[31] = half_btf(cospi[62], bf0[31], -cospi[2], bf0[16], cos_bit);
1711 bf1[32] = bf0[32] + bf0[33];
1712 bf1[33] = -bf0[33] + bf0[32];
1713 bf1[34] = -bf0[34] + bf0[35];
1714 bf1[35] = bf0[35] + bf0[34];
1715 bf1[36] = bf0[36] + bf0[37];
1716 bf1[37] = -bf0[37] + bf0[36];
1717 bf1[38] = -bf0[38] + bf0[39];
1718 bf1[39] = bf0[39] + bf0[38];
1719 bf1[40] = bf0[40] + bf0[41];
1720 bf1[41] = -bf0[41] + bf0[40];
1721 bf1[42] = -bf0[42] + bf0[43];
1722 bf1[43] = bf0[43] + bf0[42];
1723 bf1[44] = bf0[44] + bf0[45];
1724 bf1[45] = -bf0[45] + bf0[44];
1725 bf1[46] = -bf0[46] + bf0[47];
1726 bf1[47] = bf0[47] + bf0[46];
1727 bf1[48] = bf0[48] + bf0[49];
1728 bf1[49] = -bf0[49] + bf0[48];
1729 bf1[50] = -bf0[50] + bf0[51];
1730 bf1[51] = bf0[51] + bf0[50];
1731 bf1[52] = bf0[52] + bf0[53];
1732 bf1[53] = -bf0[53] + bf0[52];
1733 bf1[54] = -bf0[54] + bf0[55];
1734 bf1[55] = bf0[55] + bf0[54];
1735 bf1[56] = bf0[56] + bf0[57];
1736 bf1[57] = -bf0[57] + bf0[56];
1737 bf1[58] = -bf0[58] + bf0[59];
1738 bf1[59] = bf0[59] + bf0[58];
1739 bf1[60] = bf0[60] + bf0[61];
1740 bf1[61] = -bf0[61] + bf0[60];
1741 bf1[62] = -bf0[62] + bf0[63];
1742 bf1[63] = bf0[63] + bf0[62];
1743 av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1744
1745 // stage 10
1746 stage++;
1747 cospi = cospi_arr(cos_bit);
1748 bf0 = output;
1749 bf1 = step;
1750 bf1[0] = bf0[0];
1751 bf1[1] = bf0[1];
1752 bf1[2] = bf0[2];
1753 bf1[3] = bf0[3];
1754 bf1[4] = bf0[4];
1755 bf1[5] = bf0[5];
1756 bf1[6] = bf0[6];
1757 bf1[7] = bf0[7];
1758 bf1[8] = bf0[8];
1759 bf1[9] = bf0[9];
1760 bf1[10] = bf0[10];
1761 bf1[11] = bf0[11];
1762 bf1[12] = bf0[12];
1763 bf1[13] = bf0[13];
1764 bf1[14] = bf0[14];
1765 bf1[15] = bf0[15];
1766 bf1[16] = bf0[16];
1767 bf1[17] = bf0[17];
1768 bf1[18] = bf0[18];
1769 bf1[19] = bf0[19];
1770 bf1[20] = bf0[20];
1771 bf1[21] = bf0[21];
1772 bf1[22] = bf0[22];
1773 bf1[23] = bf0[23];
1774 bf1[24] = bf0[24];
1775 bf1[25] = bf0[25];
1776 bf1[26] = bf0[26];
1777 bf1[27] = bf0[27];
1778 bf1[28] = bf0[28];
1779 bf1[29] = bf0[29];
1780 bf1[30] = bf0[30];
1781 bf1[31] = bf0[31];
1782 bf1[32] = half_btf(cospi[63], bf0[32], cospi[1], bf0[63], cos_bit);
1783 bf1[33] = half_btf(cospi[31], bf0[33], cospi[33], bf0[62], cos_bit);
1784 bf1[34] = half_btf(cospi[47], bf0[34], cospi[17], bf0[61], cos_bit);
1785 bf1[35] = half_btf(cospi[15], bf0[35], cospi[49], bf0[60], cos_bit);
1786 bf1[36] = half_btf(cospi[55], bf0[36], cospi[9], bf0[59], cos_bit);
1787 bf1[37] = half_btf(cospi[23], bf0[37], cospi[41], bf0[58], cos_bit);
1788 bf1[38] = half_btf(cospi[39], bf0[38], cospi[25], bf0[57], cos_bit);
1789 bf1[39] = half_btf(cospi[7], bf0[39], cospi[57], bf0[56], cos_bit);
1790 bf1[40] = half_btf(cospi[59], bf0[40], cospi[5], bf0[55], cos_bit);
1791 bf1[41] = half_btf(cospi[27], bf0[41], cospi[37], bf0[54], cos_bit);
1792 bf1[42] = half_btf(cospi[43], bf0[42], cospi[21], bf0[53], cos_bit);
1793 bf1[43] = half_btf(cospi[11], bf0[43], cospi[53], bf0[52], cos_bit);
1794 bf1[44] = half_btf(cospi[51], bf0[44], cospi[13], bf0[51], cos_bit);
1795 bf1[45] = half_btf(cospi[19], bf0[45], cospi[45], bf0[50], cos_bit);
1796 bf1[46] = half_btf(cospi[35], bf0[46], cospi[29], bf0[49], cos_bit);
1797 bf1[47] = half_btf(cospi[3], bf0[47], cospi[61], bf0[48], cos_bit);
1798 bf1[48] = half_btf(cospi[3], bf0[48], -cospi[61], bf0[47], cos_bit);
1799 bf1[49] = half_btf(cospi[35], bf0[49], -cospi[29], bf0[46], cos_bit);
1800 bf1[50] = half_btf(cospi[19], bf0[50], -cospi[45], bf0[45], cos_bit);
1801 bf1[51] = half_btf(cospi[51], bf0[51], -cospi[13], bf0[44], cos_bit);
1802 bf1[52] = half_btf(cospi[11], bf0[52], -cospi[53], bf0[43], cos_bit);
1803 bf1[53] = half_btf(cospi[43], bf0[53], -cospi[21], bf0[42], cos_bit);
1804 bf1[54] = half_btf(cospi[27], bf0[54], -cospi[37], bf0[41], cos_bit);
1805 bf1[55] = half_btf(cospi[59], bf0[55], -cospi[5], bf0[40], cos_bit);
1806 bf1[56] = half_btf(cospi[7], bf0[56], -cospi[57], bf0[39], cos_bit);
1807 bf1[57] = half_btf(cospi[39], bf0[57], -cospi[25], bf0[38], cos_bit);
1808 bf1[58] = half_btf(cospi[23], bf0[58], -cospi[41], bf0[37], cos_bit);
1809 bf1[59] = half_btf(cospi[55], bf0[59], -cospi[9], bf0[36], cos_bit);
1810 bf1[60] = half_btf(cospi[15], bf0[60], -cospi[49], bf0[35], cos_bit);
1811 bf1[61] = half_btf(cospi[47], bf0[61], -cospi[17], bf0[34], cos_bit);
1812 bf1[62] = half_btf(cospi[31], bf0[62], -cospi[33], bf0[33], cos_bit);
1813 bf1[63] = half_btf(cospi[63], bf0[63], -cospi[1], bf0[32], cos_bit);
1814 av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1815
1816 // stage 11
1817 stage++;
1818 bf0 = step;
1819 bf1 = output;
1820 bf1[0] = bf0[0];
1821 bf1[1] = bf0[32];
1822 bf1[2] = bf0[16];
1823 bf1[3] = bf0[48];
1824 bf1[4] = bf0[8];
1825 bf1[5] = bf0[40];
1826 bf1[6] = bf0[24];
1827 bf1[7] = bf0[56];
1828 bf1[8] = bf0[4];
1829 bf1[9] = bf0[36];
1830 bf1[10] = bf0[20];
1831 bf1[11] = bf0[52];
1832 bf1[12] = bf0[12];
1833 bf1[13] = bf0[44];
1834 bf1[14] = bf0[28];
1835 bf1[15] = bf0[60];
1836 bf1[16] = bf0[2];
1837 bf1[17] = bf0[34];
1838 bf1[18] = bf0[18];
1839 bf1[19] = bf0[50];
1840 bf1[20] = bf0[10];
1841 bf1[21] = bf0[42];
1842 bf1[22] = bf0[26];
1843 bf1[23] = bf0[58];
1844 bf1[24] = bf0[6];
1845 bf1[25] = bf0[38];
1846 bf1[26] = bf0[22];
1847 bf1[27] = bf0[54];
1848 bf1[28] = bf0[14];
1849 bf1[29] = bf0[46];
1850 bf1[30] = bf0[30];
1851 bf1[31] = bf0[62];
1852 bf1[32] = bf0[1];
1853 bf1[33] = bf0[33];
1854 bf1[34] = bf0[17];
1855 bf1[35] = bf0[49];
1856 bf1[36] = bf0[9];
1857 bf1[37] = bf0[41];
1858 bf1[38] = bf0[25];
1859 bf1[39] = bf0[57];
1860 bf1[40] = bf0[5];
1861 bf1[41] = bf0[37];
1862 bf1[42] = bf0[21];
1863 bf1[43] = bf0[53];
1864 bf1[44] = bf0[13];
1865 bf1[45] = bf0[45];
1866 bf1[46] = bf0[29];
1867 bf1[47] = bf0[61];
1868 bf1[48] = bf0[3];
1869 bf1[49] = bf0[35];
1870 bf1[50] = bf0[19];
1871 bf1[51] = bf0[51];
1872 bf1[52] = bf0[11];
1873 bf1[53] = bf0[43];
1874 bf1[54] = bf0[27];
1875 bf1[55] = bf0[59];
1876 bf1[56] = bf0[7];
1877 bf1[57] = bf0[39];
1878 bf1[58] = bf0[23];
1879 bf1[59] = bf0[55];
1880 bf1[60] = bf0[15];
1881 bf1[61] = bf0[47];
1882 bf1[62] = bf0[31];
1883 bf1[63] = bf0[63];
1884 av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1885 }
1886