/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
11 
12 #include <stdlib.h>
13 #include "av1/common/av1_inv_txfm1d.h"
14 #include "av1/common/av1_txfm.h"
15 
// 4-point 1-D inverse transform (inverse DCT, going by the function name).
//
// input:       4 coefficients; only read.
// output:      4 results; also used as scratch storage between stages.
// cos_bit:     selects the cosine table via cospi_arr() and is forwarded to
//              every half_btf() call (presumably the fixed-point precision of
//              the table -- confirm against av1_txfm.h).
// stage_range: indexed by stage number (1..3 here); the entry is handed to
//              av1_range_check_buf() and clamp_value() as the range for that
//              stage.
//
// The transform cannot run in place: stage 1 writes a permuted copy of input
// into output, so aliasing the two would overwrite coefficients before they
// are read. The assert below enforces that.
void av1_idct4(const int32_t *input, int32_t *output, int8_t cos_bit,
               const int8_t *stage_range) {
  assert(output != input);
  const int32_t size = 4;
  const int32_t *cospi = cospi_arr(cos_bit);

  int32_t stage = 0;
  int32_t *bf0, *bf1;
  int32_t step[4];  // second buffer; stages ping-pong between output and step

  // stage 0;

  // stage 1: copy input into output in bit-reversed index order.
  stage++;
  bf1 = output;
  bf1[0] = input[0];
  bf1[1] = input[2];
  bf1[2] = input[1];
  bf1[3] = input[3];
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 2: half_btf() on the pairs (0,1) and (2,3); output -> step.
  stage++;
  bf0 = output;
  bf1 = step;
  bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
  bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit);
  bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit);
  bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit);
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 3: mirrored sum/difference of (0,3) and (1,2), passed through
  // clamp_value(); step -> output. This is the final result.
  stage++;
  bf0 = step;
  bf1 = output;
  bf1[0] = clamp_value(bf0[0] + bf0[3], stage_range[stage]);
  bf1[1] = clamp_value(bf0[1] + bf0[2], stage_range[stage]);
  bf1[2] = clamp_value(bf0[1] - bf0[2], stage_range[stage]);
  bf1[3] = clamp_value(bf0[0] - bf0[3], stage_range[stage]);
}
56 
// 8-point 1-D inverse transform (inverse DCT, going by the function name).
//
// input:       8 coefficients; only read.
// output:      8 results; also used as scratch storage between stages.
// cos_bit:     selects the cosine table via cospi_arr() and is forwarded to
//              every half_btf() call.
// stage_range: indexed by stage number (1..5 here); the entry is handed to
//              av1_range_check_buf() and clamp_value() as the range for that
//              stage.
//
// Must not be called in place (output == input): stage 1 writes a permuted
// copy of input into output.
void av1_idct8(const int32_t *input, int32_t *output, int8_t cos_bit,
               const int8_t *stage_range) {
  assert(output != input);
  const int32_t size = 8;
  const int32_t *cospi = cospi_arr(cos_bit);

  int32_t stage = 0;
  int32_t *bf0, *bf1;
  int32_t step[8];  // second buffer; stages ping-pong between output and step

  // stage 0;

  // stage 1: copy input into output in bit-reversed index order.
  stage++;
  bf1 = output;
  bf1[0] = input[0];
  bf1[1] = input[4];
  bf1[2] = input[2];
  bf1[3] = input[6];
  bf1[4] = input[1];
  bf1[5] = input[5];
  bf1[6] = input[3];
  bf1[7] = input[7];
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 2: 0..3 are copied; half_btf() on the pairs (4,7) and (5,6).
  stage++;
  bf0 = output;
  bf1 = step;
  bf1[0] = bf0[0];
  bf1[1] = bf0[1];
  bf1[2] = bf0[2];
  bf1[3] = bf0[3];
  bf1[4] = half_btf(cospi[56], bf0[4], -cospi[8], bf0[7], cos_bit);
  bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit);
  bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit);
  bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit);
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 3: half_btf() on (0,1) and (2,3); clamped sum/difference of the
  // adjacent pairs (4,5) and (6,7).
  stage++;
  bf0 = step;
  bf1 = output;
  bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
  bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit);
  bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit);
  bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit);
  bf1[4] = clamp_value(bf0[4] + bf0[5], stage_range[stage]);
  bf1[5] = clamp_value(bf0[4] - bf0[5], stage_range[stage]);
  bf1[6] = clamp_value(-bf0[6] + bf0[7], stage_range[stage]);
  bf1[7] = clamp_value(bf0[6] + bf0[7], stage_range[stage]);
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 4: mirrored clamped sum/difference within 0..3; 4 and 7 are
  // copied; half_btf() with cospi[32] on the pair (5,6).
  stage++;
  bf0 = output;
  bf1 = step;
  bf1[0] = clamp_value(bf0[0] + bf0[3], stage_range[stage]);
  bf1[1] = clamp_value(bf0[1] + bf0[2], stage_range[stage]);
  bf1[2] = clamp_value(bf0[1] - bf0[2], stage_range[stage]);
  bf1[3] = clamp_value(bf0[0] - bf0[3], stage_range[stage]);
  bf1[4] = bf0[4];
  bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
  bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
  bf1[7] = bf0[7];
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 5: mirrored clamped sum/difference of (i, 7 - i); step -> output.
  // This is the final result.
  stage++;
  bf0 = step;
  bf1 = output;
  bf1[0] = clamp_value(bf0[0] + bf0[7], stage_range[stage]);
  bf1[1] = clamp_value(bf0[1] + bf0[6], stage_range[stage]);
  bf1[2] = clamp_value(bf0[2] + bf0[5], stage_range[stage]);
  bf1[3] = clamp_value(bf0[3] + bf0[4], stage_range[stage]);
  bf1[4] = clamp_value(bf0[3] - bf0[4], stage_range[stage]);
  bf1[5] = clamp_value(bf0[2] - bf0[5], stage_range[stage]);
  bf1[6] = clamp_value(bf0[1] - bf0[6], stage_range[stage]);
  bf1[7] = clamp_value(bf0[0] - bf0[7], stage_range[stage]);
}
137 
// 16-point 1-D inverse transform (inverse DCT, going by the function name).
//
// input:       16 coefficients; only read.
// output:      16 results; also used as scratch storage between stages.
// cos_bit:     selects the cosine table via cospi_arr() and is forwarded to
//              every half_btf() call.
// stage_range: indexed by stage number (1..7 here); the entry is handed to
//              av1_range_check_buf() and clamp_value() as the range for that
//              stage.
//
// Must not be called in place (output == input): stage 1 writes a permuted
// copy of input into output.
void av1_idct16(const int32_t *input, int32_t *output, int8_t cos_bit,
                const int8_t *stage_range) {
  assert(output != input);
  const int32_t size = 16;
  const int32_t *cospi = cospi_arr(cos_bit);

  int32_t stage = 0;
  int32_t *bf0, *bf1;
  int32_t step[16];  // second buffer; stages ping-pong between output and step

  // stage 0;

  // stage 1: copy input into output in bit-reversed index order.
  stage++;
  bf1 = output;
  bf1[0] = input[0];
  bf1[1] = input[8];
  bf1[2] = input[4];
  bf1[3] = input[12];
  bf1[4] = input[2];
  bf1[5] = input[10];
  bf1[6] = input[6];
  bf1[7] = input[14];
  bf1[8] = input[1];
  bf1[9] = input[9];
  bf1[10] = input[5];
  bf1[11] = input[13];
  bf1[12] = input[3];
  bf1[13] = input[11];
  bf1[14] = input[7];
  bf1[15] = input[15];
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 2: 0..7 are copied; half_btf() on the mirrored pairs
  // (8,15), (9,14), (10,13), (11,12).
  stage++;
  bf0 = output;
  bf1 = step;
  bf1[0] = bf0[0];
  bf1[1] = bf0[1];
  bf1[2] = bf0[2];
  bf1[3] = bf0[3];
  bf1[4] = bf0[4];
  bf1[5] = bf0[5];
  bf1[6] = bf0[6];
  bf1[7] = bf0[7];
  bf1[8] = half_btf(cospi[60], bf0[8], -cospi[4], bf0[15], cos_bit);
  bf1[9] = half_btf(cospi[28], bf0[9], -cospi[36], bf0[14], cos_bit);
  bf1[10] = half_btf(cospi[44], bf0[10], -cospi[20], bf0[13], cos_bit);
  bf1[11] = half_btf(cospi[12], bf0[11], -cospi[52], bf0[12], cos_bit);
  bf1[12] = half_btf(cospi[52], bf0[11], cospi[12], bf0[12], cos_bit);
  bf1[13] = half_btf(cospi[20], bf0[10], cospi[44], bf0[13], cos_bit);
  bf1[14] = half_btf(cospi[36], bf0[9], cospi[28], bf0[14], cos_bit);
  bf1[15] = half_btf(cospi[4], bf0[8], cospi[60], bf0[15], cos_bit);
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 3: 0..3 are copied; half_btf() on (4,7) and (5,6); clamped
  // sum/difference of adjacent pairs within 8..15.
  stage++;
  bf0 = step;
  bf1 = output;
  bf1[0] = bf0[0];
  bf1[1] = bf0[1];
  bf1[2] = bf0[2];
  bf1[3] = bf0[3];
  bf1[4] = half_btf(cospi[56], bf0[4], -cospi[8], bf0[7], cos_bit);
  bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit);
  bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit);
  bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit);
  bf1[8] = clamp_value(bf0[8] + bf0[9], stage_range[stage]);
  bf1[9] = clamp_value(bf0[8] - bf0[9], stage_range[stage]);
  bf1[10] = clamp_value(-bf0[10] + bf0[11], stage_range[stage]);
  bf1[11] = clamp_value(bf0[10] + bf0[11], stage_range[stage]);
  bf1[12] = clamp_value(bf0[12] + bf0[13], stage_range[stage]);
  bf1[13] = clamp_value(bf0[12] - bf0[13], stage_range[stage]);
  bf1[14] = clamp_value(-bf0[14] + bf0[15], stage_range[stage]);
  bf1[15] = clamp_value(bf0[14] + bf0[15], stage_range[stage]);
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 4: half_btf() on (0,1) and (2,3); clamped sum/difference of
  // adjacent pairs within 4..7; in 8..15, half_btf() on (9,14) and (10,13)
  // while 8, 11, 12 and 15 are copied.
  stage++;
  bf0 = output;
  bf1 = step;
  bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
  bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit);
  bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit);
  bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit);
  bf1[4] = clamp_value(bf0[4] + bf0[5], stage_range[stage]);
  bf1[5] = clamp_value(bf0[4] - bf0[5], stage_range[stage]);
  bf1[6] = clamp_value(-bf0[6] + bf0[7], stage_range[stage]);
  bf1[7] = clamp_value(bf0[6] + bf0[7], stage_range[stage]);
  bf1[8] = bf0[8];
  bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit);
  bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit);
  bf1[11] = bf0[11];
  bf1[12] = bf0[12];
  bf1[13] = half_btf(-cospi[16], bf0[10], cospi[48], bf0[13], cos_bit);
  bf1[14] = half_btf(cospi[48], bf0[9], cospi[16], bf0[14], cos_bit);
  bf1[15] = bf0[15];
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 5: mirrored clamped sum/difference within 0..3; 4 and 7 copied,
  // half_btf() with cospi[32] on (5,6); mirrored clamped sum/difference
  // within each group of four in 8..15.
  stage++;
  bf0 = step;
  bf1 = output;
  bf1[0] = clamp_value(bf0[0] + bf0[3], stage_range[stage]);
  bf1[1] = clamp_value(bf0[1] + bf0[2], stage_range[stage]);
  bf1[2] = clamp_value(bf0[1] - bf0[2], stage_range[stage]);
  bf1[3] = clamp_value(bf0[0] - bf0[3], stage_range[stage]);
  bf1[4] = bf0[4];
  bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
  bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
  bf1[7] = bf0[7];
  bf1[8] = clamp_value(bf0[8] + bf0[11], stage_range[stage]);
  bf1[9] = clamp_value(bf0[9] + bf0[10], stage_range[stage]);
  bf1[10] = clamp_value(bf0[9] - bf0[10], stage_range[stage]);
  bf1[11] = clamp_value(bf0[8] - bf0[11], stage_range[stage]);
  bf1[12] = clamp_value(-bf0[12] + bf0[15], stage_range[stage]);
  bf1[13] = clamp_value(-bf0[13] + bf0[14], stage_range[stage]);
  bf1[14] = clamp_value(bf0[13] + bf0[14], stage_range[stage]);
  bf1[15] = clamp_value(bf0[12] + bf0[15], stage_range[stage]);
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 6: mirrored clamped sum/difference within 0..7; 8, 9, 14 and 15
  // copied; half_btf() with cospi[32] on (10,13) and (11,12).
  stage++;
  bf0 = output;
  bf1 = step;
  bf1[0] = clamp_value(bf0[0] + bf0[7], stage_range[stage]);
  bf1[1] = clamp_value(bf0[1] + bf0[6], stage_range[stage]);
  bf1[2] = clamp_value(bf0[2] + bf0[5], stage_range[stage]);
  bf1[3] = clamp_value(bf0[3] + bf0[4], stage_range[stage]);
  bf1[4] = clamp_value(bf0[3] - bf0[4], stage_range[stage]);
  bf1[5] = clamp_value(bf0[2] - bf0[5], stage_range[stage]);
  bf1[6] = clamp_value(bf0[1] - bf0[6], stage_range[stage]);
  bf1[7] = clamp_value(bf0[0] - bf0[7], stage_range[stage]);
  bf1[8] = bf0[8];
  bf1[9] = bf0[9];
  bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
  bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
  bf1[12] = half_btf(cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
  bf1[13] = half_btf(cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
  bf1[14] = bf0[14];
  bf1[15] = bf0[15];
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 7: mirrored clamped sum/difference of (i, 15 - i); step -> output.
  // This is the final result.
  stage++;
  bf0 = step;
  bf1 = output;
  bf1[0] = clamp_value(bf0[0] + bf0[15], stage_range[stage]);
  bf1[1] = clamp_value(bf0[1] + bf0[14], stage_range[stage]);
  bf1[2] = clamp_value(bf0[2] + bf0[13], stage_range[stage]);
  bf1[3] = clamp_value(bf0[3] + bf0[12], stage_range[stage]);
  bf1[4] = clamp_value(bf0[4] + bf0[11], stage_range[stage]);
  bf1[5] = clamp_value(bf0[5] + bf0[10], stage_range[stage]);
  bf1[6] = clamp_value(bf0[6] + bf0[9], stage_range[stage]);
  bf1[7] = clamp_value(bf0[7] + bf0[8], stage_range[stage]);
  bf1[8] = clamp_value(bf0[7] - bf0[8], stage_range[stage]);
  bf1[9] = clamp_value(bf0[6] - bf0[9], stage_range[stage]);
  bf1[10] = clamp_value(bf0[5] - bf0[10], stage_range[stage]);
  bf1[11] = clamp_value(bf0[4] - bf0[11], stage_range[stage]);
  bf1[12] = clamp_value(bf0[3] - bf0[12], stage_range[stage]);
  bf1[13] = clamp_value(bf0[2] - bf0[13], stage_range[stage]);
  bf1[14] = clamp_value(bf0[1] - bf0[14], stage_range[stage]);
  bf1[15] = clamp_value(bf0[0] - bf0[15], stage_range[stage]);
}
302 
// 32-point 1-D inverse transform (inverse DCT, going by the function name).
//
// input:       32 coefficients; only read.
// output:      32 results; also used as scratch storage between stages.
// cos_bit:     selects the cosine table via cospi_arr() and is forwarded to
//              every half_btf() call.
// stage_range: indexed by stage number (1..9 here); the entry is handed to
//              av1_range_check_buf() and clamp_value() as the range for that
//              stage.
//
// Must not be called in place (output == input): stage 1 writes a permuted
// copy of input into output.
void av1_idct32(const int32_t *input, int32_t *output, int8_t cos_bit,
                const int8_t *stage_range) {
  assert(output != input);
  const int32_t size = 32;
  const int32_t *cospi = cospi_arr(cos_bit);

  int32_t stage = 0;
  int32_t *bf0, *bf1;
  int32_t step[32];  // second buffer; stages ping-pong between output and step

  // stage 0;

  // stage 1: copy input into output in bit-reversed index order.
  stage++;
  bf1 = output;
  bf1[0] = input[0];
  bf1[1] = input[16];
  bf1[2] = input[8];
  bf1[3] = input[24];
  bf1[4] = input[4];
  bf1[5] = input[20];
  bf1[6] = input[12];
  bf1[7] = input[28];
  bf1[8] = input[2];
  bf1[9] = input[18];
  bf1[10] = input[10];
  bf1[11] = input[26];
  bf1[12] = input[6];
  bf1[13] = input[22];
  bf1[14] = input[14];
  bf1[15] = input[30];
  bf1[16] = input[1];
  bf1[17] = input[17];
  bf1[18] = input[9];
  bf1[19] = input[25];
  bf1[20] = input[5];
  bf1[21] = input[21];
  bf1[22] = input[13];
  bf1[23] = input[29];
  bf1[24] = input[3];
  bf1[25] = input[19];
  bf1[26] = input[11];
  bf1[27] = input[27];
  bf1[28] = input[7];
  bf1[29] = input[23];
  bf1[30] = input[15];
  bf1[31] = input[31];
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 2: 0..15 are copied; half_btf() on the mirrored pairs
  // (16,31), (17,30), ..., (23,24).
  stage++;
  bf0 = output;
  bf1 = step;
  bf1[0] = bf0[0];
  bf1[1] = bf0[1];
  bf1[2] = bf0[2];
  bf1[3] = bf0[3];
  bf1[4] = bf0[4];
  bf1[5] = bf0[5];
  bf1[6] = bf0[6];
  bf1[7] = bf0[7];
  bf1[8] = bf0[8];
  bf1[9] = bf0[9];
  bf1[10] = bf0[10];
  bf1[11] = bf0[11];
  bf1[12] = bf0[12];
  bf1[13] = bf0[13];
  bf1[14] = bf0[14];
  bf1[15] = bf0[15];
  bf1[16] = half_btf(cospi[62], bf0[16], -cospi[2], bf0[31], cos_bit);
  bf1[17] = half_btf(cospi[30], bf0[17], -cospi[34], bf0[30], cos_bit);
  bf1[18] = half_btf(cospi[46], bf0[18], -cospi[18], bf0[29], cos_bit);
  bf1[19] = half_btf(cospi[14], bf0[19], -cospi[50], bf0[28], cos_bit);
  bf1[20] = half_btf(cospi[54], bf0[20], -cospi[10], bf0[27], cos_bit);
  bf1[21] = half_btf(cospi[22], bf0[21], -cospi[42], bf0[26], cos_bit);
  bf1[22] = half_btf(cospi[38], bf0[22], -cospi[26], bf0[25], cos_bit);
  bf1[23] = half_btf(cospi[6], bf0[23], -cospi[58], bf0[24], cos_bit);
  bf1[24] = half_btf(cospi[58], bf0[23], cospi[6], bf0[24], cos_bit);
  bf1[25] = half_btf(cospi[26], bf0[22], cospi[38], bf0[25], cos_bit);
  bf1[26] = half_btf(cospi[42], bf0[21], cospi[22], bf0[26], cos_bit);
  bf1[27] = half_btf(cospi[10], bf0[20], cospi[54], bf0[27], cos_bit);
  bf1[28] = half_btf(cospi[50], bf0[19], cospi[14], bf0[28], cos_bit);
  bf1[29] = half_btf(cospi[18], bf0[18], cospi[46], bf0[29], cos_bit);
  bf1[30] = half_btf(cospi[34], bf0[17], cospi[30], bf0[30], cos_bit);
  bf1[31] = half_btf(cospi[2], bf0[16], cospi[62], bf0[31], cos_bit);
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 3: 0..7 are copied; half_btf() on the mirrored pairs within 8..15;
  // clamped sum/difference of adjacent pairs within 16..31.
  stage++;
  bf0 = step;
  bf1 = output;
  bf1[0] = bf0[0];
  bf1[1] = bf0[1];
  bf1[2] = bf0[2];
  bf1[3] = bf0[3];
  bf1[4] = bf0[4];
  bf1[5] = bf0[5];
  bf1[6] = bf0[6];
  bf1[7] = bf0[7];
  bf1[8] = half_btf(cospi[60], bf0[8], -cospi[4], bf0[15], cos_bit);
  bf1[9] = half_btf(cospi[28], bf0[9], -cospi[36], bf0[14], cos_bit);
  bf1[10] = half_btf(cospi[44], bf0[10], -cospi[20], bf0[13], cos_bit);
  bf1[11] = half_btf(cospi[12], bf0[11], -cospi[52], bf0[12], cos_bit);
  bf1[12] = half_btf(cospi[52], bf0[11], cospi[12], bf0[12], cos_bit);
  bf1[13] = half_btf(cospi[20], bf0[10], cospi[44], bf0[13], cos_bit);
  bf1[14] = half_btf(cospi[36], bf0[9], cospi[28], bf0[14], cos_bit);
  bf1[15] = half_btf(cospi[4], bf0[8], cospi[60], bf0[15], cos_bit);
  bf1[16] = clamp_value(bf0[16] + bf0[17], stage_range[stage]);
  bf1[17] = clamp_value(bf0[16] - bf0[17], stage_range[stage]);
  bf1[18] = clamp_value(-bf0[18] + bf0[19], stage_range[stage]);
  bf1[19] = clamp_value(bf0[18] + bf0[19], stage_range[stage]);
  bf1[20] = clamp_value(bf0[20] + bf0[21], stage_range[stage]);
  bf1[21] = clamp_value(bf0[20] - bf0[21], stage_range[stage]);
  bf1[22] = clamp_value(-bf0[22] + bf0[23], stage_range[stage]);
  bf1[23] = clamp_value(bf0[22] + bf0[23], stage_range[stage]);
  bf1[24] = clamp_value(bf0[24] + bf0[25], stage_range[stage]);
  bf1[25] = clamp_value(bf0[24] - bf0[25], stage_range[stage]);
  bf1[26] = clamp_value(-bf0[26] + bf0[27], stage_range[stage]);
  bf1[27] = clamp_value(bf0[26] + bf0[27], stage_range[stage]);
  bf1[28] = clamp_value(bf0[28] + bf0[29], stage_range[stage]);
  bf1[29] = clamp_value(bf0[28] - bf0[29], stage_range[stage]);
  bf1[30] = clamp_value(-bf0[30] + bf0[31], stage_range[stage]);
  bf1[31] = clamp_value(bf0[30] + bf0[31], stage_range[stage]);
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 4: 0..3 are copied; half_btf() on (4,7) and (5,6); clamped
  // sum/difference of adjacent pairs within 8..15; in 16..31, half_btf() on
  // (17,30), (18,29), (21,26), (22,25) while the other entries are copied.
  stage++;
  bf0 = output;
  bf1 = step;
  bf1[0] = bf0[0];
  bf1[1] = bf0[1];
  bf1[2] = bf0[2];
  bf1[3] = bf0[3];
  bf1[4] = half_btf(cospi[56], bf0[4], -cospi[8], bf0[7], cos_bit);
  bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit);
  bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit);
  bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit);
  bf1[8] = clamp_value(bf0[8] + bf0[9], stage_range[stage]);
  bf1[9] = clamp_value(bf0[8] - bf0[9], stage_range[stage]);
  bf1[10] = clamp_value(-bf0[10] + bf0[11], stage_range[stage]);
  bf1[11] = clamp_value(bf0[10] + bf0[11], stage_range[stage]);
  bf1[12] = clamp_value(bf0[12] + bf0[13], stage_range[stage]);
  bf1[13] = clamp_value(bf0[12] - bf0[13], stage_range[stage]);
  bf1[14] = clamp_value(-bf0[14] + bf0[15], stage_range[stage]);
  bf1[15] = clamp_value(bf0[14] + bf0[15], stage_range[stage]);
  bf1[16] = bf0[16];
  bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit);
  bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit);
  bf1[19] = bf0[19];
  bf1[20] = bf0[20];
  bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit);
  bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit);
  bf1[23] = bf0[23];
  bf1[24] = bf0[24];
  bf1[25] = half_btf(-cospi[40], bf0[22], cospi[24], bf0[25], cos_bit);
  bf1[26] = half_btf(cospi[24], bf0[21], cospi[40], bf0[26], cos_bit);
  bf1[27] = bf0[27];
  bf1[28] = bf0[28];
  bf1[29] = half_btf(-cospi[8], bf0[18], cospi[56], bf0[29], cos_bit);
  bf1[30] = half_btf(cospi[56], bf0[17], cospi[8], bf0[30], cos_bit);
  bf1[31] = bf0[31];
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 5: half_btf() on (0,1) and (2,3); clamped sum/difference of
  // adjacent pairs within 4..7; in 8..15, half_btf() on (9,14) and (10,13)
  // while 8, 11, 12 and 15 are copied; mirrored clamped sum/difference
  // within each group of four in 16..31.
  stage++;
  bf0 = step;
  bf1 = output;
  bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
  bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit);
  bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit);
  bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit);
  bf1[4] = clamp_value(bf0[4] + bf0[5], stage_range[stage]);
  bf1[5] = clamp_value(bf0[4] - bf0[5], stage_range[stage]);
  bf1[6] = clamp_value(-bf0[6] + bf0[7], stage_range[stage]);
  bf1[7] = clamp_value(bf0[6] + bf0[7], stage_range[stage]);
  bf1[8] = bf0[8];
  bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit);
  bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit);
  bf1[11] = bf0[11];
  bf1[12] = bf0[12];
  bf1[13] = half_btf(-cospi[16], bf0[10], cospi[48], bf0[13], cos_bit);
  bf1[14] = half_btf(cospi[48], bf0[9], cospi[16], bf0[14], cos_bit);
  bf1[15] = bf0[15];
  bf1[16] = clamp_value(bf0[16] + bf0[19], stage_range[stage]);
  bf1[17] = clamp_value(bf0[17] + bf0[18], stage_range[stage]);
  bf1[18] = clamp_value(bf0[17] - bf0[18], stage_range[stage]);
  bf1[19] = clamp_value(bf0[16] - bf0[19], stage_range[stage]);
  bf1[20] = clamp_value(-bf0[20] + bf0[23], stage_range[stage]);
  bf1[21] = clamp_value(-bf0[21] + bf0[22], stage_range[stage]);
  bf1[22] = clamp_value(bf0[21] + bf0[22], stage_range[stage]);
  bf1[23] = clamp_value(bf0[20] + bf0[23], stage_range[stage]);
  bf1[24] = clamp_value(bf0[24] + bf0[27], stage_range[stage]);
  bf1[25] = clamp_value(bf0[25] + bf0[26], stage_range[stage]);
  bf1[26] = clamp_value(bf0[25] - bf0[26], stage_range[stage]);
  bf1[27] = clamp_value(bf0[24] - bf0[27], stage_range[stage]);
  bf1[28] = clamp_value(-bf0[28] + bf0[31], stage_range[stage]);
  bf1[29] = clamp_value(-bf0[29] + bf0[30], stage_range[stage]);
  bf1[30] = clamp_value(bf0[29] + bf0[30], stage_range[stage]);
  bf1[31] = clamp_value(bf0[28] + bf0[31], stage_range[stage]);
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 6: mirrored clamped sum/difference within 0..3; 4 and 7 copied,
  // half_btf() with cospi[32] on (5,6); mirrored clamped sum/difference
  // within each group of four in 8..15; in 16..31, half_btf() on (18,29),
  // (19,28), (20,27), (21,26) while the other entries are copied.
  stage++;
  bf0 = output;
  bf1 = step;
  bf1[0] = clamp_value(bf0[0] + bf0[3], stage_range[stage]);
  bf1[1] = clamp_value(bf0[1] + bf0[2], stage_range[stage]);
  bf1[2] = clamp_value(bf0[1] - bf0[2], stage_range[stage]);
  bf1[3] = clamp_value(bf0[0] - bf0[3], stage_range[stage]);
  bf1[4] = bf0[4];
  bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
  bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
  bf1[7] = bf0[7];
  bf1[8] = clamp_value(bf0[8] + bf0[11], stage_range[stage]);
  bf1[9] = clamp_value(bf0[9] + bf0[10], stage_range[stage]);
  bf1[10] = clamp_value(bf0[9] - bf0[10], stage_range[stage]);
  bf1[11] = clamp_value(bf0[8] - bf0[11], stage_range[stage]);
  bf1[12] = clamp_value(-bf0[12] + bf0[15], stage_range[stage]);
  bf1[13] = clamp_value(-bf0[13] + bf0[14], stage_range[stage]);
  bf1[14] = clamp_value(bf0[13] + bf0[14], stage_range[stage]);
  bf1[15] = clamp_value(bf0[12] + bf0[15], stage_range[stage]);
  bf1[16] = bf0[16];
  bf1[17] = bf0[17];
  bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit);
  bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit);
  bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit);
  bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit);
  bf1[22] = bf0[22];
  bf1[23] = bf0[23];
  bf1[24] = bf0[24];
  bf1[25] = bf0[25];
  bf1[26] = half_btf(-cospi[16], bf0[21], cospi[48], bf0[26], cos_bit);
  bf1[27] = half_btf(-cospi[16], bf0[20], cospi[48], bf0[27], cos_bit);
  bf1[28] = half_btf(cospi[48], bf0[19], cospi[16], bf0[28], cos_bit);
  bf1[29] = half_btf(cospi[48], bf0[18], cospi[16], bf0[29], cos_bit);
  bf1[30] = bf0[30];
  bf1[31] = bf0[31];
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 7: mirrored clamped sum/difference within 0..7; 8, 9, 14 and 15
  // copied, half_btf() with cospi[32] on (10,13) and (11,12); mirrored
  // clamped sum/difference within each group of eight in 16..31.
  stage++;
  bf0 = step;
  bf1 = output;
  bf1[0] = clamp_value(bf0[0] + bf0[7], stage_range[stage]);
  bf1[1] = clamp_value(bf0[1] + bf0[6], stage_range[stage]);
  bf1[2] = clamp_value(bf0[2] + bf0[5], stage_range[stage]);
  bf1[3] = clamp_value(bf0[3] + bf0[4], stage_range[stage]);
  bf1[4] = clamp_value(bf0[3] - bf0[4], stage_range[stage]);
  bf1[5] = clamp_value(bf0[2] - bf0[5], stage_range[stage]);
  bf1[6] = clamp_value(bf0[1] - bf0[6], stage_range[stage]);
  bf1[7] = clamp_value(bf0[0] - bf0[7], stage_range[stage]);
  bf1[8] = bf0[8];
  bf1[9] = bf0[9];
  bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
  bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
  bf1[12] = half_btf(cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
  bf1[13] = half_btf(cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
  bf1[14] = bf0[14];
  bf1[15] = bf0[15];
  bf1[16] = clamp_value(bf0[16] + bf0[23], stage_range[stage]);
  bf1[17] = clamp_value(bf0[17] + bf0[22], stage_range[stage]);
  bf1[18] = clamp_value(bf0[18] + bf0[21], stage_range[stage]);
  bf1[19] = clamp_value(bf0[19] + bf0[20], stage_range[stage]);
  bf1[20] = clamp_value(bf0[19] - bf0[20], stage_range[stage]);
  bf1[21] = clamp_value(bf0[18] - bf0[21], stage_range[stage]);
  bf1[22] = clamp_value(bf0[17] - bf0[22], stage_range[stage]);
  bf1[23] = clamp_value(bf0[16] - bf0[23], stage_range[stage]);
  bf1[24] = clamp_value(-bf0[24] + bf0[31], stage_range[stage]);
  bf1[25] = clamp_value(-bf0[25] + bf0[30], stage_range[stage]);
  bf1[26] = clamp_value(-bf0[26] + bf0[29], stage_range[stage]);
  bf1[27] = clamp_value(-bf0[27] + bf0[28], stage_range[stage]);
  bf1[28] = clamp_value(bf0[27] + bf0[28], stage_range[stage]);
  bf1[29] = clamp_value(bf0[26] + bf0[29], stage_range[stage]);
  bf1[30] = clamp_value(bf0[25] + bf0[30], stage_range[stage]);
  bf1[31] = clamp_value(bf0[24] + bf0[31], stage_range[stage]);
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 8: mirrored clamped sum/difference within 0..15; in 16..31,
  // half_btf() with cospi[32] on (20,27), (21,26), (22,25), (23,24) while
  // 16..19 and 28..31 are copied.
  stage++;
  bf0 = output;
  bf1 = step;
  bf1[0] = clamp_value(bf0[0] + bf0[15], stage_range[stage]);
  bf1[1] = clamp_value(bf0[1] + bf0[14], stage_range[stage]);
  bf1[2] = clamp_value(bf0[2] + bf0[13], stage_range[stage]);
  bf1[3] = clamp_value(bf0[3] + bf0[12], stage_range[stage]);
  bf1[4] = clamp_value(bf0[4] + bf0[11], stage_range[stage]);
  bf1[5] = clamp_value(bf0[5] + bf0[10], stage_range[stage]);
  bf1[6] = clamp_value(bf0[6] + bf0[9], stage_range[stage]);
  bf1[7] = clamp_value(bf0[7] + bf0[8], stage_range[stage]);
  bf1[8] = clamp_value(bf0[7] - bf0[8], stage_range[stage]);
  bf1[9] = clamp_value(bf0[6] - bf0[9], stage_range[stage]);
  bf1[10] = clamp_value(bf0[5] - bf0[10], stage_range[stage]);
  bf1[11] = clamp_value(bf0[4] - bf0[11], stage_range[stage]);
  bf1[12] = clamp_value(bf0[3] - bf0[12], stage_range[stage]);
  bf1[13] = clamp_value(bf0[2] - bf0[13], stage_range[stage]);
  bf1[14] = clamp_value(bf0[1] - bf0[14], stage_range[stage]);
  bf1[15] = clamp_value(bf0[0] - bf0[15], stage_range[stage]);
  bf1[16] = bf0[16];
  bf1[17] = bf0[17];
  bf1[18] = bf0[18];
  bf1[19] = bf0[19];
  bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit);
  bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit);
  bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit);
  bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit);
  bf1[24] = half_btf(cospi[32], bf0[23], cospi[32], bf0[24], cos_bit);
  bf1[25] = half_btf(cospi[32], bf0[22], cospi[32], bf0[25], cos_bit);
  bf1[26] = half_btf(cospi[32], bf0[21], cospi[32], bf0[26], cos_bit);
  bf1[27] = half_btf(cospi[32], bf0[20], cospi[32], bf0[27], cos_bit);
  bf1[28] = bf0[28];
  bf1[29] = bf0[29];
  bf1[30] = bf0[30];
  bf1[31] = bf0[31];
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 9: mirrored clamped sum/difference of (i, 31 - i); step -> output.
  // This is the final result.
  stage++;
  bf0 = step;
  bf1 = output;
  bf1[0] = clamp_value(bf0[0] + bf0[31], stage_range[stage]);
  bf1[1] = clamp_value(bf0[1] + bf0[30], stage_range[stage]);
  bf1[2] = clamp_value(bf0[2] + bf0[29], stage_range[stage]);
  bf1[3] = clamp_value(bf0[3] + bf0[28], stage_range[stage]);
  bf1[4] = clamp_value(bf0[4] + bf0[27], stage_range[stage]);
  bf1[5] = clamp_value(bf0[5] + bf0[26], stage_range[stage]);
  bf1[6] = clamp_value(bf0[6] + bf0[25], stage_range[stage]);
  bf1[7] = clamp_value(bf0[7] + bf0[24], stage_range[stage]);
  bf1[8] = clamp_value(bf0[8] + bf0[23], stage_range[stage]);
  bf1[9] = clamp_value(bf0[9] + bf0[22], stage_range[stage]);
  bf1[10] = clamp_value(bf0[10] + bf0[21], stage_range[stage]);
  bf1[11] = clamp_value(bf0[11] + bf0[20], stage_range[stage]);
  bf1[12] = clamp_value(bf0[12] + bf0[19], stage_range[stage]);
  bf1[13] = clamp_value(bf0[13] + bf0[18], stage_range[stage]);
  bf1[14] = clamp_value(bf0[14] + bf0[17], stage_range[stage]);
  bf1[15] = clamp_value(bf0[15] + bf0[16], stage_range[stage]);
  bf1[16] = clamp_value(bf0[15] - bf0[16], stage_range[stage]);
  bf1[17] = clamp_value(bf0[14] - bf0[17], stage_range[stage]);
  bf1[18] = clamp_value(bf0[13] - bf0[18], stage_range[stage]);
  bf1[19] = clamp_value(bf0[12] - bf0[19], stage_range[stage]);
  bf1[20] = clamp_value(bf0[11] - bf0[20], stage_range[stage]);
  bf1[21] = clamp_value(bf0[10] - bf0[21], stage_range[stage]);
  bf1[22] = clamp_value(bf0[9] - bf0[22], stage_range[stage]);
  bf1[23] = clamp_value(bf0[8] - bf0[23], stage_range[stage]);
  bf1[24] = clamp_value(bf0[7] - bf0[24], stage_range[stage]);
  bf1[25] = clamp_value(bf0[6] - bf0[25], stage_range[stage]);
  bf1[26] = clamp_value(bf0[5] - bf0[26], stage_range[stage]);
  bf1[27] = clamp_value(bf0[4] - bf0[27], stage_range[stage]);
  bf1[28] = clamp_value(bf0[3] - bf0[28], stage_range[stage]);
  bf1[29] = clamp_value(bf0[2] - bf0[29], stage_range[stage]);
  bf1[30] = clamp_value(bf0[1] - bf0[30], stage_range[stage]);
  bf1[31] = clamp_value(bf0[0] - bf0[31], stage_range[stage]);
}
655 
// 4-point 1-D inverse transform (inverse ADST, going by the function name).
//
// input:       4 coefficients; all four are copied into locals before any
//              element of output is written.
// output:      4 results.
// cos_bit:     selects the sine table via sinpi_arr() and is the shift amount
//              passed to round_shift() when the results are stored.
// stage_range: entries 1..6 are read; each is passed (usually increased by
//              cos_bit) as the second argument of range_check_value().
//
// When every input coefficient is zero the output is set to zero and the
// function returns early, before the sine-table assert.
void av1_iadst4(const int32_t *input, int32_t *output, int8_t cos_bit,
                const int8_t *stage_range) {
  int bit = cos_bit;
  const int32_t *sinpi = sinpi_arr(bit);
  int32_t s0, s1, s2, s3, s4, s5, s6, s7;

  int32_t x0 = input[0];
  int32_t x1 = input[1];
  int32_t x2 = input[2];
  int32_t x3 = input[3];

  // Shortcut: an all-zero input produces an all-zero output.
  if (!(x0 | x1 | x2 | x3)) {
    output[0] = output[1] = output[2] = output[3] = 0;
    return;
  }

  // Table identity that must hold for the selected sine table.
  assert(sinpi[1] + sinpi[2] == sinpi[4]);

  // stage 1: products of the inputs with sine-table entries. These are not
  // shifted down yet, which is why the range passed is stage_range + bit.
  s0 = range_check_value(sinpi[1] * x0, stage_range[1] + bit);
  s1 = range_check_value(sinpi[2] * x0, stage_range[1] + bit);
  s2 = range_check_value(sinpi[3] * x1, stage_range[1] + bit);
  s3 = range_check_value(sinpi[4] * x2, stage_range[1] + bit);
  s4 = range_check_value(sinpi[1] * x2, stage_range[1] + bit);
  s5 = range_check_value(sinpi[2] * x3, stage_range[1] + bit);
  s6 = range_check_value(sinpi[4] * x3, stage_range[1] + bit);

  // stage 2
  // NOTICE: (x0 - x2) here may use one extra bit compared to the
  // opt_range_row/col specified in av1_gen_inv_stage_range()
  s7 = range_check_value((x0 - x2) + x3, stage_range[2]);

  // stage 3: s3 takes over the old s2 before s2 is overwritten with the
  // sinpi[3] * s7 product, so the order of the last two lines matters.
  s0 = range_check_value(s0 + s3, stage_range[3] + bit);
  s1 = range_check_value(s1 - s4, stage_range[3] + bit);
  s3 = range_check_value(s2, stage_range[3] + bit);
  s2 = range_check_value(sinpi[3] * s7, stage_range[3] + bit);

  // stage 4
  s0 = range_check_value(s0 + s5, stage_range[4] + bit);
  s1 = range_check_value(s1 - s6, stage_range[4] + bit);

  // stage 5
  x0 = range_check_value(s0 + s3, stage_range[5] + bit);
  x1 = range_check_value(s1 + s3, stage_range[5] + bit);
  x2 = range_check_value(s2, stage_range[5] + bit);
  x3 = range_check_value(s0 + s1, stage_range[5] + bit);

  // stage 6
  x3 = range_check_value(x3 - s3, stage_range[6] + bit);

  // Shift the accumulated products down by `bit` when storing the results.
  output[0] = round_shift(x0, bit);
  output[1] = round_shift(x1, bit);
  output[2] = round_shift(x2, bit);
  output[3] = round_shift(x3, bit);
}
712 
// 8-point inverse ADST, computed as a fixed sequence of stages.
//
// input:       8 coefficients. Must not alias output (asserted), because
//              output is used as scratch storage between stages.
// output:      receives the 8 transformed values.
// cos_bit:     selects the cosine table via cospi_arr() and is forwarded to
//              every half_btf() call.
// stage_range: per-stage bit widths; entries 1..6 are read.
//
// The intermediate data ping-pongs between `output` and the local `step`
// buffer: bf0 always points at the previous stage's result and bf1 at the
// buffer being written.
//
// NOTE(review): half_btf(w0, in0, w1, in1, bit) and clamp_value(v, bits) are
// defined elsewhere; presumably half_btf returns the rounded value of
// (w0 * in0 + w1 * in1) >> bit and clamp_value saturates v to `bits` bits --
// confirm against their definitions.
void av1_iadst8(const int32_t *input, int32_t *output, int8_t cos_bit,
                const int8_t *stage_range) {
  assert(output != input);
  const int32_t size = 8;
  const int32_t *cospi = cospi_arr(cos_bit);

  int32_t stage = 0;
  int32_t *bf0, *bf1;
  int32_t step[8];

  // stage 0;

  // stage 1: input permutation.
  stage++;
  bf1 = output;
  bf1[0] = input[7];
  bf1[1] = input[0];
  bf1[2] = input[5];
  bf1[3] = input[2];
  bf1[4] = input[3];
  bf1[5] = input[4];
  bf1[6] = input[1];
  bf1[7] = input[6];
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 2: weighted combinations of adjacent pairs (0,1) (2,3) (4,5) (6,7).
  stage++;
  bf0 = output;
  bf1 = step;
  bf1[0] = half_btf(cospi[4], bf0[0], cospi[60], bf0[1], cos_bit);
  bf1[1] = half_btf(cospi[60], bf0[0], -cospi[4], bf0[1], cos_bit);
  bf1[2] = half_btf(cospi[20], bf0[2], cospi[44], bf0[3], cos_bit);
  bf1[3] = half_btf(cospi[44], bf0[2], -cospi[20], bf0[3], cos_bit);
  bf1[4] = half_btf(cospi[36], bf0[4], cospi[28], bf0[5], cos_bit);
  bf1[5] = half_btf(cospi[28], bf0[4], -cospi[36], bf0[5], cos_bit);
  bf1[6] = half_btf(cospi[52], bf0[6], cospi[12], bf0[7], cos_bit);
  bf1[7] = half_btf(cospi[12], bf0[6], -cospi[52], bf0[7], cos_bit);
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 3: sums and differences of elements 4 apart.
  stage++;
  bf0 = step;
  bf1 = output;
  bf1[0] = clamp_value(bf0[0] + bf0[4], stage_range[stage]);
  bf1[1] = clamp_value(bf0[1] + bf0[5], stage_range[stage]);
  bf1[2] = clamp_value(bf0[2] + bf0[6], stage_range[stage]);
  bf1[3] = clamp_value(bf0[3] + bf0[7], stage_range[stage]);
  bf1[4] = clamp_value(bf0[0] - bf0[4], stage_range[stage]);
  bf1[5] = clamp_value(bf0[1] - bf0[5], stage_range[stage]);
  bf1[6] = clamp_value(bf0[2] - bf0[6], stage_range[stage]);
  bf1[7] = clamp_value(bf0[3] - bf0[7], stage_range[stage]);
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 4: lower half passes through; upper half is combined in pairs.
  stage++;
  bf0 = output;
  bf1 = step;
  bf1[0] = bf0[0];
  bf1[1] = bf0[1];
  bf1[2] = bf0[2];
  bf1[3] = bf0[3];
  bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit);
  bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit);
  bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit);
  bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit);
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 5: sums and differences of elements 2 apart within each half.
  stage++;
  bf0 = step;
  bf1 = output;
  bf1[0] = clamp_value(bf0[0] + bf0[2], stage_range[stage]);
  bf1[1] = clamp_value(bf0[1] + bf0[3], stage_range[stage]);
  bf1[2] = clamp_value(bf0[0] - bf0[2], stage_range[stage]);
  bf1[3] = clamp_value(bf0[1] - bf0[3], stage_range[stage]);
  bf1[4] = clamp_value(bf0[4] + bf0[6], stage_range[stage]);
  bf1[5] = clamp_value(bf0[5] + bf0[7], stage_range[stage]);
  bf1[6] = clamp_value(bf0[4] - bf0[6], stage_range[stage]);
  bf1[7] = clamp_value(bf0[5] - bf0[7], stage_range[stage]);
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 6: pairs (2,3) and (6,7) are combined with cospi[32]; the rest
  // pass through.
  stage++;
  bf0 = output;
  bf1 = step;
  bf1[0] = bf0[0];
  bf1[1] = bf0[1];
  bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit);
  bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit);
  bf1[4] = bf0[4];
  bf1[5] = bf0[5];
  bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit);
  bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit);
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 7: final output permutation; odd-indexed outputs are negated.
  // `stage` is not advanced and no range check is run on this stage.
  bf0 = step;
  bf1 = output;
  bf1[0] = bf0[0];
  bf1[1] = -bf0[4];
  bf1[2] = bf0[6];
  bf1[3] = -bf0[2];
  bf1[4] = bf0[3];
  bf1[5] = -bf0[7];
  bf1[6] = bf0[5];
  bf1[7] = -bf0[1];
}
820 
// 16-point inverse ADST, computed as a fixed sequence of stages.
//
// input:       16 coefficients. Must not alias output (asserted), because
//              output is used as scratch storage between stages.
// output:      receives the 16 transformed values.
// cos_bit:     selects the cosine table via cospi_arr() and is forwarded to
//              every half_btf() call.
// stage_range: per-stage bit widths; entries 1..8 are read.
//
// The intermediate data ping-pongs between `output` and the local `step`
// buffer: bf0 always points at the previous stage's result and bf1 at the
// buffer being written.
//
// NOTE(review): half_btf(w0, in0, w1, in1, bit) and clamp_value(v, bits) are
// defined elsewhere; presumably half_btf returns the rounded value of
// (w0 * in0 + w1 * in1) >> bit and clamp_value saturates v to `bits` bits --
// confirm against their definitions.
void av1_iadst16(const int32_t *input, int32_t *output, int8_t cos_bit,
                 const int8_t *stage_range) {
  assert(output != input);
  const int32_t size = 16;
  const int32_t *cospi = cospi_arr(cos_bit);

  int32_t stage = 0;
  int32_t *bf0, *bf1;
  int32_t step[16];

  // stage 0;

  // stage 1: input permutation.
  stage++;
  bf1 = output;
  bf1[0] = input[15];
  bf1[1] = input[0];
  bf1[2] = input[13];
  bf1[3] = input[2];
  bf1[4] = input[11];
  bf1[5] = input[4];
  bf1[6] = input[9];
  bf1[7] = input[6];
  bf1[8] = input[7];
  bf1[9] = input[8];
  bf1[10] = input[5];
  bf1[11] = input[10];
  bf1[12] = input[3];
  bf1[13] = input[12];
  bf1[14] = input[1];
  bf1[15] = input[14];
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 2: weighted combinations of adjacent pairs.
  stage++;
  bf0 = output;
  bf1 = step;
  bf1[0] = half_btf(cospi[2], bf0[0], cospi[62], bf0[1], cos_bit);
  bf1[1] = half_btf(cospi[62], bf0[0], -cospi[2], bf0[1], cos_bit);
  bf1[2] = half_btf(cospi[10], bf0[2], cospi[54], bf0[3], cos_bit);
  bf1[3] = half_btf(cospi[54], bf0[2], -cospi[10], bf0[3], cos_bit);
  bf1[4] = half_btf(cospi[18], bf0[4], cospi[46], bf0[5], cos_bit);
  bf1[5] = half_btf(cospi[46], bf0[4], -cospi[18], bf0[5], cos_bit);
  bf1[6] = half_btf(cospi[26], bf0[6], cospi[38], bf0[7], cos_bit);
  bf1[7] = half_btf(cospi[38], bf0[6], -cospi[26], bf0[7], cos_bit);
  bf1[8] = half_btf(cospi[34], bf0[8], cospi[30], bf0[9], cos_bit);
  bf1[9] = half_btf(cospi[30], bf0[8], -cospi[34], bf0[9], cos_bit);
  bf1[10] = half_btf(cospi[42], bf0[10], cospi[22], bf0[11], cos_bit);
  bf1[11] = half_btf(cospi[22], bf0[10], -cospi[42], bf0[11], cos_bit);
  bf1[12] = half_btf(cospi[50], bf0[12], cospi[14], bf0[13], cos_bit);
  bf1[13] = half_btf(cospi[14], bf0[12], -cospi[50], bf0[13], cos_bit);
  bf1[14] = half_btf(cospi[58], bf0[14], cospi[6], bf0[15], cos_bit);
  bf1[15] = half_btf(cospi[6], bf0[14], -cospi[58], bf0[15], cos_bit);
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 3: sums and differences of elements 8 apart.
  stage++;
  bf0 = step;
  bf1 = output;
  bf1[0] = clamp_value(bf0[0] + bf0[8], stage_range[stage]);
  bf1[1] = clamp_value(bf0[1] + bf0[9], stage_range[stage]);
  bf1[2] = clamp_value(bf0[2] + bf0[10], stage_range[stage]);
  bf1[3] = clamp_value(bf0[3] + bf0[11], stage_range[stage]);
  bf1[4] = clamp_value(bf0[4] + bf0[12], stage_range[stage]);
  bf1[5] = clamp_value(bf0[5] + bf0[13], stage_range[stage]);
  bf1[6] = clamp_value(bf0[6] + bf0[14], stage_range[stage]);
  bf1[7] = clamp_value(bf0[7] + bf0[15], stage_range[stage]);
  bf1[8] = clamp_value(bf0[0] - bf0[8], stage_range[stage]);
  bf1[9] = clamp_value(bf0[1] - bf0[9], stage_range[stage]);
  bf1[10] = clamp_value(bf0[2] - bf0[10], stage_range[stage]);
  bf1[11] = clamp_value(bf0[3] - bf0[11], stage_range[stage]);
  bf1[12] = clamp_value(bf0[4] - bf0[12], stage_range[stage]);
  bf1[13] = clamp_value(bf0[5] - bf0[13], stage_range[stage]);
  bf1[14] = clamp_value(bf0[6] - bf0[14], stage_range[stage]);
  bf1[15] = clamp_value(bf0[7] - bf0[15], stage_range[stage]);
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 4: lower half passes through; upper half is combined in pairs.
  stage++;
  bf0 = output;
  bf1 = step;
  bf1[0] = bf0[0];
  bf1[1] = bf0[1];
  bf1[2] = bf0[2];
  bf1[3] = bf0[3];
  bf1[4] = bf0[4];
  bf1[5] = bf0[5];
  bf1[6] = bf0[6];
  bf1[7] = bf0[7];
  bf1[8] = half_btf(cospi[8], bf0[8], cospi[56], bf0[9], cos_bit);
  bf1[9] = half_btf(cospi[56], bf0[8], -cospi[8], bf0[9], cos_bit);
  bf1[10] = half_btf(cospi[40], bf0[10], cospi[24], bf0[11], cos_bit);
  bf1[11] = half_btf(cospi[24], bf0[10], -cospi[40], bf0[11], cos_bit);
  bf1[12] = half_btf(-cospi[56], bf0[12], cospi[8], bf0[13], cos_bit);
  bf1[13] = half_btf(cospi[8], bf0[12], cospi[56], bf0[13], cos_bit);
  bf1[14] = half_btf(-cospi[24], bf0[14], cospi[40], bf0[15], cos_bit);
  bf1[15] = half_btf(cospi[40], bf0[14], cospi[24], bf0[15], cos_bit);
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 5: sums and differences of elements 4 apart within each half.
  stage++;
  bf0 = step;
  bf1 = output;
  bf1[0] = clamp_value(bf0[0] + bf0[4], stage_range[stage]);
  bf1[1] = clamp_value(bf0[1] + bf0[5], stage_range[stage]);
  bf1[2] = clamp_value(bf0[2] + bf0[6], stage_range[stage]);
  bf1[3] = clamp_value(bf0[3] + bf0[7], stage_range[stage]);
  bf1[4] = clamp_value(bf0[0] - bf0[4], stage_range[stage]);
  bf1[5] = clamp_value(bf0[1] - bf0[5], stage_range[stage]);
  bf1[6] = clamp_value(bf0[2] - bf0[6], stage_range[stage]);
  bf1[7] = clamp_value(bf0[3] - bf0[7], stage_range[stage]);
  bf1[8] = clamp_value(bf0[8] + bf0[12], stage_range[stage]);
  bf1[9] = clamp_value(bf0[9] + bf0[13], stage_range[stage]);
  bf1[10] = clamp_value(bf0[10] + bf0[14], stage_range[stage]);
  bf1[11] = clamp_value(bf0[11] + bf0[15], stage_range[stage]);
  bf1[12] = clamp_value(bf0[8] - bf0[12], stage_range[stage]);
  bf1[13] = clamp_value(bf0[9] - bf0[13], stage_range[stage]);
  bf1[14] = clamp_value(bf0[10] - bf0[14], stage_range[stage]);
  bf1[15] = clamp_value(bf0[11] - bf0[15], stage_range[stage]);
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 6: in each group of 8, the first 4 pass through and the last 4 are
  // combined in pairs.
  stage++;
  bf0 = output;
  bf1 = step;
  bf1[0] = bf0[0];
  bf1[1] = bf0[1];
  bf1[2] = bf0[2];
  bf1[3] = bf0[3];
  bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit);
  bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit);
  bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit);
  bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit);
  bf1[8] = bf0[8];
  bf1[9] = bf0[9];
  bf1[10] = bf0[10];
  bf1[11] = bf0[11];
  bf1[12] = half_btf(cospi[16], bf0[12], cospi[48], bf0[13], cos_bit);
  bf1[13] = half_btf(cospi[48], bf0[12], -cospi[16], bf0[13], cos_bit);
  bf1[14] = half_btf(-cospi[48], bf0[14], cospi[16], bf0[15], cos_bit);
  bf1[15] = half_btf(cospi[16], bf0[14], cospi[48], bf0[15], cos_bit);
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 7: sums and differences of elements 2 apart within each group of 4.
  stage++;
  bf0 = step;
  bf1 = output;
  bf1[0] = clamp_value(bf0[0] + bf0[2], stage_range[stage]);
  bf1[1] = clamp_value(bf0[1] + bf0[3], stage_range[stage]);
  bf1[2] = clamp_value(bf0[0] - bf0[2], stage_range[stage]);
  bf1[3] = clamp_value(bf0[1] - bf0[3], stage_range[stage]);
  bf1[4] = clamp_value(bf0[4] + bf0[6], stage_range[stage]);
  bf1[5] = clamp_value(bf0[5] + bf0[7], stage_range[stage]);
  bf1[6] = clamp_value(bf0[4] - bf0[6], stage_range[stage]);
  bf1[7] = clamp_value(bf0[5] - bf0[7], stage_range[stage]);
  bf1[8] = clamp_value(bf0[8] + bf0[10], stage_range[stage]);
  bf1[9] = clamp_value(bf0[9] + bf0[11], stage_range[stage]);
  bf1[10] = clamp_value(bf0[8] - bf0[10], stage_range[stage]);
  bf1[11] = clamp_value(bf0[9] - bf0[11], stage_range[stage]);
  bf1[12] = clamp_value(bf0[12] + bf0[14], stage_range[stage]);
  bf1[13] = clamp_value(bf0[13] + bf0[15], stage_range[stage]);
  bf1[14] = clamp_value(bf0[12] - bf0[14], stage_range[stage]);
  bf1[15] = clamp_value(bf0[13] - bf0[15], stage_range[stage]);
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 8: in each group of 4, the first 2 pass through and the last 2 are
  // combined with cospi[32].
  stage++;
  bf0 = output;
  bf1 = step;
  bf1[0] = bf0[0];
  bf1[1] = bf0[1];
  bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit);
  bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit);
  bf1[4] = bf0[4];
  bf1[5] = bf0[5];
  bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit);
  bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit);
  bf1[8] = bf0[8];
  bf1[9] = bf0[9];
  bf1[10] = half_btf(cospi[32], bf0[10], cospi[32], bf0[11], cos_bit);
  bf1[11] = half_btf(cospi[32], bf0[10], -cospi[32], bf0[11], cos_bit);
  bf1[12] = bf0[12];
  bf1[13] = bf0[13];
  bf1[14] = half_btf(cospi[32], bf0[14], cospi[32], bf0[15], cos_bit);
  bf1[15] = half_btf(cospi[32], bf0[14], -cospi[32], bf0[15], cos_bit);
  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);

  // stage 9: final output permutation; odd-indexed outputs are negated.
  // `stage` is not advanced and no range check is run on this stage.
  bf0 = step;
  bf1 = output;
  bf1[0] = bf0[0];
  bf1[1] = -bf0[8];
  bf1[2] = bf0[12];
  bf1[3] = -bf0[4];
  bf1[4] = bf0[6];
  bf1[5] = -bf0[14];
  bf1[6] = bf0[10];
  bf1[7] = -bf0[2];
  bf1[8] = bf0[3];
  bf1[9] = -bf0[11];
  bf1[10] = bf0[15];
  bf1[11] = -bf0[7];
  bf1[12] = bf0[5];
  bf1[13] = -bf0[13];
  bf1[14] = bf0[9];
  bf1[15] = -bf0[1];
}
1028 
av1_iidentity4_c(const int32_t * input,int32_t * output,int8_t cos_bit,const int8_t * stage_range)1029 void av1_iidentity4_c(const int32_t *input, int32_t *output, int8_t cos_bit,
1030                       const int8_t *stage_range) {
1031   (void)cos_bit;
1032   (void)stage_range;
1033   for (int i = 0; i < 4; ++i) {
1034     output[i] = round_shift((int64_t)NewSqrt2 * input[i], NewSqrt2Bits);
1035   }
1036   assert(stage_range[0] + NewSqrt2Bits <= 32);
1037 }
1038 
// 8-point inverse identity transform: each of the 8 inputs is doubled.
//
// input:       8 coefficients.
// output:      receives input[i] * 2; the product is formed in 64-bit
//              arithmetic and then converted back to int32_t.
// cos_bit:     unused.
// stage_range: unused.
void av1_iidentity8_c(const int32_t *input, int32_t *output, int8_t cos_bit,
                      const int8_t *stage_range) {
  (void)cos_bit;
  (void)stage_range;
  for (int i = 0; i < 8; ++i) output[i] = (int32_t)((int64_t)input[i] * 2);
}
1045 
av1_iidentity16_c(const int32_t * input,int32_t * output,int8_t cos_bit,const int8_t * stage_range)1046 void av1_iidentity16_c(const int32_t *input, int32_t *output, int8_t cos_bit,
1047                        const int8_t *stage_range) {
1048   (void)cos_bit;
1049   (void)stage_range;
1050   for (int i = 0; i < 16; ++i)
1051     output[i] = round_shift((int64_t)NewSqrt2 * 2 * input[i], NewSqrt2Bits);
1052   assert(stage_range[0] + NewSqrt2Bits <= 32);
1053 }
1054 
// 32-point inverse identity transform: each of the 32 inputs is multiplied
// by 4.
//
// input:       32 coefficients.
// output:      receives input[i] * 4; the product is formed in 64-bit
//              arithmetic and then converted back to int32_t.
// cos_bit:     unused.
// stage_range: unused.
void av1_iidentity32_c(const int32_t *input, int32_t *output, int8_t cos_bit,
                       const int8_t *stage_range) {
  (void)cos_bit;
  (void)stage_range;
  for (int i = 0; i < 32; ++i) output[i] = (int32_t)((int64_t)input[i] * 4);
}
1061 
av1_idct64(const int32_t * input,int32_t * output,int8_t cos_bit,const int8_t * stage_range)1062 void av1_idct64(const int32_t *input, int32_t *output, int8_t cos_bit,
1063                 const int8_t *stage_range) {
1064   assert(output != input);
1065   const int32_t size = 64;
1066   const int32_t *cospi = cospi_arr(cos_bit);
1067 
1068   int32_t stage = 0;
1069   int32_t *bf0, *bf1;
1070   int32_t step[64];
1071 
1072   // stage 0;
1073 
1074   // stage 1;
1075   stage++;
1076   bf1 = output;
1077   bf1[0] = input[0];
1078   bf1[1] = input[32];
1079   bf1[2] = input[16];
1080   bf1[3] = input[48];
1081   bf1[4] = input[8];
1082   bf1[5] = input[40];
1083   bf1[6] = input[24];
1084   bf1[7] = input[56];
1085   bf1[8] = input[4];
1086   bf1[9] = input[36];
1087   bf1[10] = input[20];
1088   bf1[11] = input[52];
1089   bf1[12] = input[12];
1090   bf1[13] = input[44];
1091   bf1[14] = input[28];
1092   bf1[15] = input[60];
1093   bf1[16] = input[2];
1094   bf1[17] = input[34];
1095   bf1[18] = input[18];
1096   bf1[19] = input[50];
1097   bf1[20] = input[10];
1098   bf1[21] = input[42];
1099   bf1[22] = input[26];
1100   bf1[23] = input[58];
1101   bf1[24] = input[6];
1102   bf1[25] = input[38];
1103   bf1[26] = input[22];
1104   bf1[27] = input[54];
1105   bf1[28] = input[14];
1106   bf1[29] = input[46];
1107   bf1[30] = input[30];
1108   bf1[31] = input[62];
1109   bf1[32] = input[1];
1110   bf1[33] = input[33];
1111   bf1[34] = input[17];
1112   bf1[35] = input[49];
1113   bf1[36] = input[9];
1114   bf1[37] = input[41];
1115   bf1[38] = input[25];
1116   bf1[39] = input[57];
1117   bf1[40] = input[5];
1118   bf1[41] = input[37];
1119   bf1[42] = input[21];
1120   bf1[43] = input[53];
1121   bf1[44] = input[13];
1122   bf1[45] = input[45];
1123   bf1[46] = input[29];
1124   bf1[47] = input[61];
1125   bf1[48] = input[3];
1126   bf1[49] = input[35];
1127   bf1[50] = input[19];
1128   bf1[51] = input[51];
1129   bf1[52] = input[11];
1130   bf1[53] = input[43];
1131   bf1[54] = input[27];
1132   bf1[55] = input[59];
1133   bf1[56] = input[7];
1134   bf1[57] = input[39];
1135   bf1[58] = input[23];
1136   bf1[59] = input[55];
1137   bf1[60] = input[15];
1138   bf1[61] = input[47];
1139   bf1[62] = input[31];
1140   bf1[63] = input[63];
1141   av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1142 
1143   // stage 2
1144   stage++;
1145   bf0 = output;
1146   bf1 = step;
1147   bf1[0] = bf0[0];
1148   bf1[1] = bf0[1];
1149   bf1[2] = bf0[2];
1150   bf1[3] = bf0[3];
1151   bf1[4] = bf0[4];
1152   bf1[5] = bf0[5];
1153   bf1[6] = bf0[6];
1154   bf1[7] = bf0[7];
1155   bf1[8] = bf0[8];
1156   bf1[9] = bf0[9];
1157   bf1[10] = bf0[10];
1158   bf1[11] = bf0[11];
1159   bf1[12] = bf0[12];
1160   bf1[13] = bf0[13];
1161   bf1[14] = bf0[14];
1162   bf1[15] = bf0[15];
1163   bf1[16] = bf0[16];
1164   bf1[17] = bf0[17];
1165   bf1[18] = bf0[18];
1166   bf1[19] = bf0[19];
1167   bf1[20] = bf0[20];
1168   bf1[21] = bf0[21];
1169   bf1[22] = bf0[22];
1170   bf1[23] = bf0[23];
1171   bf1[24] = bf0[24];
1172   bf1[25] = bf0[25];
1173   bf1[26] = bf0[26];
1174   bf1[27] = bf0[27];
1175   bf1[28] = bf0[28];
1176   bf1[29] = bf0[29];
1177   bf1[30] = bf0[30];
1178   bf1[31] = bf0[31];
1179   bf1[32] = half_btf(cospi[63], bf0[32], -cospi[1], bf0[63], cos_bit);
1180   bf1[33] = half_btf(cospi[31], bf0[33], -cospi[33], bf0[62], cos_bit);
1181   bf1[34] = half_btf(cospi[47], bf0[34], -cospi[17], bf0[61], cos_bit);
1182   bf1[35] = half_btf(cospi[15], bf0[35], -cospi[49], bf0[60], cos_bit);
1183   bf1[36] = half_btf(cospi[55], bf0[36], -cospi[9], bf0[59], cos_bit);
1184   bf1[37] = half_btf(cospi[23], bf0[37], -cospi[41], bf0[58], cos_bit);
1185   bf1[38] = half_btf(cospi[39], bf0[38], -cospi[25], bf0[57], cos_bit);
1186   bf1[39] = half_btf(cospi[7], bf0[39], -cospi[57], bf0[56], cos_bit);
1187   bf1[40] = half_btf(cospi[59], bf0[40], -cospi[5], bf0[55], cos_bit);
1188   bf1[41] = half_btf(cospi[27], bf0[41], -cospi[37], bf0[54], cos_bit);
1189   bf1[42] = half_btf(cospi[43], bf0[42], -cospi[21], bf0[53], cos_bit);
1190   bf1[43] = half_btf(cospi[11], bf0[43], -cospi[53], bf0[52], cos_bit);
1191   bf1[44] = half_btf(cospi[51], bf0[44], -cospi[13], bf0[51], cos_bit);
1192   bf1[45] = half_btf(cospi[19], bf0[45], -cospi[45], bf0[50], cos_bit);
1193   bf1[46] = half_btf(cospi[35], bf0[46], -cospi[29], bf0[49], cos_bit);
1194   bf1[47] = half_btf(cospi[3], bf0[47], -cospi[61], bf0[48], cos_bit);
1195   bf1[48] = half_btf(cospi[61], bf0[47], cospi[3], bf0[48], cos_bit);
1196   bf1[49] = half_btf(cospi[29], bf0[46], cospi[35], bf0[49], cos_bit);
1197   bf1[50] = half_btf(cospi[45], bf0[45], cospi[19], bf0[50], cos_bit);
1198   bf1[51] = half_btf(cospi[13], bf0[44], cospi[51], bf0[51], cos_bit);
1199   bf1[52] = half_btf(cospi[53], bf0[43], cospi[11], bf0[52], cos_bit);
1200   bf1[53] = half_btf(cospi[21], bf0[42], cospi[43], bf0[53], cos_bit);
1201   bf1[54] = half_btf(cospi[37], bf0[41], cospi[27], bf0[54], cos_bit);
1202   bf1[55] = half_btf(cospi[5], bf0[40], cospi[59], bf0[55], cos_bit);
1203   bf1[56] = half_btf(cospi[57], bf0[39], cospi[7], bf0[56], cos_bit);
1204   bf1[57] = half_btf(cospi[25], bf0[38], cospi[39], bf0[57], cos_bit);
1205   bf1[58] = half_btf(cospi[41], bf0[37], cospi[23], bf0[58], cos_bit);
1206   bf1[59] = half_btf(cospi[9], bf0[36], cospi[55], bf0[59], cos_bit);
1207   bf1[60] = half_btf(cospi[49], bf0[35], cospi[15], bf0[60], cos_bit);
1208   bf1[61] = half_btf(cospi[17], bf0[34], cospi[47], bf0[61], cos_bit);
1209   bf1[62] = half_btf(cospi[33], bf0[33], cospi[31], bf0[62], cos_bit);
1210   bf1[63] = half_btf(cospi[1], bf0[32], cospi[63], bf0[63], cos_bit);
1211   av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1212 
1213   // stage 3
1214   stage++;
1215   bf0 = step;
1216   bf1 = output;
1217   bf1[0] = bf0[0];
1218   bf1[1] = bf0[1];
1219   bf1[2] = bf0[2];
1220   bf1[3] = bf0[3];
1221   bf1[4] = bf0[4];
1222   bf1[5] = bf0[5];
1223   bf1[6] = bf0[6];
1224   bf1[7] = bf0[7];
1225   bf1[8] = bf0[8];
1226   bf1[9] = bf0[9];
1227   bf1[10] = bf0[10];
1228   bf1[11] = bf0[11];
1229   bf1[12] = bf0[12];
1230   bf1[13] = bf0[13];
1231   bf1[14] = bf0[14];
1232   bf1[15] = bf0[15];
1233   bf1[16] = half_btf(cospi[62], bf0[16], -cospi[2], bf0[31], cos_bit);
1234   bf1[17] = half_btf(cospi[30], bf0[17], -cospi[34], bf0[30], cos_bit);
1235   bf1[18] = half_btf(cospi[46], bf0[18], -cospi[18], bf0[29], cos_bit);
1236   bf1[19] = half_btf(cospi[14], bf0[19], -cospi[50], bf0[28], cos_bit);
1237   bf1[20] = half_btf(cospi[54], bf0[20], -cospi[10], bf0[27], cos_bit);
1238   bf1[21] = half_btf(cospi[22], bf0[21], -cospi[42], bf0[26], cos_bit);
1239   bf1[22] = half_btf(cospi[38], bf0[22], -cospi[26], bf0[25], cos_bit);
1240   bf1[23] = half_btf(cospi[6], bf0[23], -cospi[58], bf0[24], cos_bit);
1241   bf1[24] = half_btf(cospi[58], bf0[23], cospi[6], bf0[24], cos_bit);
1242   bf1[25] = half_btf(cospi[26], bf0[22], cospi[38], bf0[25], cos_bit);
1243   bf1[26] = half_btf(cospi[42], bf0[21], cospi[22], bf0[26], cos_bit);
1244   bf1[27] = half_btf(cospi[10], bf0[20], cospi[54], bf0[27], cos_bit);
1245   bf1[28] = half_btf(cospi[50], bf0[19], cospi[14], bf0[28], cos_bit);
1246   bf1[29] = half_btf(cospi[18], bf0[18], cospi[46], bf0[29], cos_bit);
1247   bf1[30] = half_btf(cospi[34], bf0[17], cospi[30], bf0[30], cos_bit);
1248   bf1[31] = half_btf(cospi[2], bf0[16], cospi[62], bf0[31], cos_bit);
1249   bf1[32] = clamp_value(bf0[32] + bf0[33], stage_range[stage]);
1250   bf1[33] = clamp_value(bf0[32] - bf0[33], stage_range[stage]);
1251   bf1[34] = clamp_value(-bf0[34] + bf0[35], stage_range[stage]);
1252   bf1[35] = clamp_value(bf0[34] + bf0[35], stage_range[stage]);
1253   bf1[36] = clamp_value(bf0[36] + bf0[37], stage_range[stage]);
1254   bf1[37] = clamp_value(bf0[36] - bf0[37], stage_range[stage]);
1255   bf1[38] = clamp_value(-bf0[38] + bf0[39], stage_range[stage]);
1256   bf1[39] = clamp_value(bf0[38] + bf0[39], stage_range[stage]);
1257   bf1[40] = clamp_value(bf0[40] + bf0[41], stage_range[stage]);
1258   bf1[41] = clamp_value(bf0[40] - bf0[41], stage_range[stage]);
1259   bf1[42] = clamp_value(-bf0[42] + bf0[43], stage_range[stage]);
1260   bf1[43] = clamp_value(bf0[42] + bf0[43], stage_range[stage]);
1261   bf1[44] = clamp_value(bf0[44] + bf0[45], stage_range[stage]);
1262   bf1[45] = clamp_value(bf0[44] - bf0[45], stage_range[stage]);
1263   bf1[46] = clamp_value(-bf0[46] + bf0[47], stage_range[stage]);
1264   bf1[47] = clamp_value(bf0[46] + bf0[47], stage_range[stage]);
1265   bf1[48] = clamp_value(bf0[48] + bf0[49], stage_range[stage]);
1266   bf1[49] = clamp_value(bf0[48] - bf0[49], stage_range[stage]);
1267   bf1[50] = clamp_value(-bf0[50] + bf0[51], stage_range[stage]);
1268   bf1[51] = clamp_value(bf0[50] + bf0[51], stage_range[stage]);
1269   bf1[52] = clamp_value(bf0[52] + bf0[53], stage_range[stage]);
1270   bf1[53] = clamp_value(bf0[52] - bf0[53], stage_range[stage]);
1271   bf1[54] = clamp_value(-bf0[54] + bf0[55], stage_range[stage]);
1272   bf1[55] = clamp_value(bf0[54] + bf0[55], stage_range[stage]);
1273   bf1[56] = clamp_value(bf0[56] + bf0[57], stage_range[stage]);
1274   bf1[57] = clamp_value(bf0[56] - bf0[57], stage_range[stage]);
1275   bf1[58] = clamp_value(-bf0[58] + bf0[59], stage_range[stage]);
1276   bf1[59] = clamp_value(bf0[58] + bf0[59], stage_range[stage]);
1277   bf1[60] = clamp_value(bf0[60] + bf0[61], stage_range[stage]);
1278   bf1[61] = clamp_value(bf0[60] - bf0[61], stage_range[stage]);
1279   bf1[62] = clamp_value(-bf0[62] + bf0[63], stage_range[stage]);
1280   bf1[63] = clamp_value(bf0[62] + bf0[63], stage_range[stage]);
1281   av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1282 
1283   // stage 4
1284   stage++;
1285   bf0 = output;
1286   bf1 = step;
1287   bf1[0] = bf0[0];
1288   bf1[1] = bf0[1];
1289   bf1[2] = bf0[2];
1290   bf1[3] = bf0[3];
1291   bf1[4] = bf0[4];
1292   bf1[5] = bf0[5];
1293   bf1[6] = bf0[6];
1294   bf1[7] = bf0[7];
1295   bf1[8] = half_btf(cospi[60], bf0[8], -cospi[4], bf0[15], cos_bit);
1296   bf1[9] = half_btf(cospi[28], bf0[9], -cospi[36], bf0[14], cos_bit);
1297   bf1[10] = half_btf(cospi[44], bf0[10], -cospi[20], bf0[13], cos_bit);
1298   bf1[11] = half_btf(cospi[12], bf0[11], -cospi[52], bf0[12], cos_bit);
1299   bf1[12] = half_btf(cospi[52], bf0[11], cospi[12], bf0[12], cos_bit);
1300   bf1[13] = half_btf(cospi[20], bf0[10], cospi[44], bf0[13], cos_bit);
1301   bf1[14] = half_btf(cospi[36], bf0[9], cospi[28], bf0[14], cos_bit);
1302   bf1[15] = half_btf(cospi[4], bf0[8], cospi[60], bf0[15], cos_bit);
1303   bf1[16] = clamp_value(bf0[16] + bf0[17], stage_range[stage]);
1304   bf1[17] = clamp_value(bf0[16] - bf0[17], stage_range[stage]);
1305   bf1[18] = clamp_value(-bf0[18] + bf0[19], stage_range[stage]);
1306   bf1[19] = clamp_value(bf0[18] + bf0[19], stage_range[stage]);
1307   bf1[20] = clamp_value(bf0[20] + bf0[21], stage_range[stage]);
1308   bf1[21] = clamp_value(bf0[20] - bf0[21], stage_range[stage]);
1309   bf1[22] = clamp_value(-bf0[22] + bf0[23], stage_range[stage]);
1310   bf1[23] = clamp_value(bf0[22] + bf0[23], stage_range[stage]);
1311   bf1[24] = clamp_value(bf0[24] + bf0[25], stage_range[stage]);
1312   bf1[25] = clamp_value(bf0[24] - bf0[25], stage_range[stage]);
1313   bf1[26] = clamp_value(-bf0[26] + bf0[27], stage_range[stage]);
1314   bf1[27] = clamp_value(bf0[26] + bf0[27], stage_range[stage]);
1315   bf1[28] = clamp_value(bf0[28] + bf0[29], stage_range[stage]);
1316   bf1[29] = clamp_value(bf0[28] - bf0[29], stage_range[stage]);
1317   bf1[30] = clamp_value(-bf0[30] + bf0[31], stage_range[stage]);
1318   bf1[31] = clamp_value(bf0[30] + bf0[31], stage_range[stage]);
1319   bf1[32] = bf0[32];
1320   bf1[33] = half_btf(-cospi[4], bf0[33], cospi[60], bf0[62], cos_bit);
1321   bf1[34] = half_btf(-cospi[60], bf0[34], -cospi[4], bf0[61], cos_bit);
1322   bf1[35] = bf0[35];
1323   bf1[36] = bf0[36];
1324   bf1[37] = half_btf(-cospi[36], bf0[37], cospi[28], bf0[58], cos_bit);
1325   bf1[38] = half_btf(-cospi[28], bf0[38], -cospi[36], bf0[57], cos_bit);
1326   bf1[39] = bf0[39];
1327   bf1[40] = bf0[40];
1328   bf1[41] = half_btf(-cospi[20], bf0[41], cospi[44], bf0[54], cos_bit);
1329   bf1[42] = half_btf(-cospi[44], bf0[42], -cospi[20], bf0[53], cos_bit);
1330   bf1[43] = bf0[43];
1331   bf1[44] = bf0[44];
1332   bf1[45] = half_btf(-cospi[52], bf0[45], cospi[12], bf0[50], cos_bit);
1333   bf1[46] = half_btf(-cospi[12], bf0[46], -cospi[52], bf0[49], cos_bit);
1334   bf1[47] = bf0[47];
1335   bf1[48] = bf0[48];
1336   bf1[49] = half_btf(-cospi[52], bf0[46], cospi[12], bf0[49], cos_bit);
1337   bf1[50] = half_btf(cospi[12], bf0[45], cospi[52], bf0[50], cos_bit);
1338   bf1[51] = bf0[51];
1339   bf1[52] = bf0[52];
1340   bf1[53] = half_btf(-cospi[20], bf0[42], cospi[44], bf0[53], cos_bit);
1341   bf1[54] = half_btf(cospi[44], bf0[41], cospi[20], bf0[54], cos_bit);
1342   bf1[55] = bf0[55];
1343   bf1[56] = bf0[56];
1344   bf1[57] = half_btf(-cospi[36], bf0[38], cospi[28], bf0[57], cos_bit);
1345   bf1[58] = half_btf(cospi[28], bf0[37], cospi[36], bf0[58], cos_bit);
1346   bf1[59] = bf0[59];
1347   bf1[60] = bf0[60];
1348   bf1[61] = half_btf(-cospi[4], bf0[34], cospi[60], bf0[61], cos_bit);
1349   bf1[62] = half_btf(cospi[60], bf0[33], cospi[4], bf0[62], cos_bit);
1350   bf1[63] = bf0[63];
1351   av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1352 
1353   // stage 5
1354   stage++;
1355   bf0 = step;
1356   bf1 = output;
1357   bf1[0] = bf0[0];
1358   bf1[1] = bf0[1];
1359   bf1[2] = bf0[2];
1360   bf1[3] = bf0[3];
1361   bf1[4] = half_btf(cospi[56], bf0[4], -cospi[8], bf0[7], cos_bit);
1362   bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit);
1363   bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit);
1364   bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit);
1365   bf1[8] = clamp_value(bf0[8] + bf0[9], stage_range[stage]);
1366   bf1[9] = clamp_value(bf0[8] - bf0[9], stage_range[stage]);
1367   bf1[10] = clamp_value(-bf0[10] + bf0[11], stage_range[stage]);
1368   bf1[11] = clamp_value(bf0[10] + bf0[11], stage_range[stage]);
1369   bf1[12] = clamp_value(bf0[12] + bf0[13], stage_range[stage]);
1370   bf1[13] = clamp_value(bf0[12] - bf0[13], stage_range[stage]);
1371   bf1[14] = clamp_value(-bf0[14] + bf0[15], stage_range[stage]);
1372   bf1[15] = clamp_value(bf0[14] + bf0[15], stage_range[stage]);
1373   bf1[16] = bf0[16];
1374   bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit);
1375   bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit);
1376   bf1[19] = bf0[19];
1377   bf1[20] = bf0[20];
1378   bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit);
1379   bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit);
1380   bf1[23] = bf0[23];
1381   bf1[24] = bf0[24];
1382   bf1[25] = half_btf(-cospi[40], bf0[22], cospi[24], bf0[25], cos_bit);
1383   bf1[26] = half_btf(cospi[24], bf0[21], cospi[40], bf0[26], cos_bit);
1384   bf1[27] = bf0[27];
1385   bf1[28] = bf0[28];
1386   bf1[29] = half_btf(-cospi[8], bf0[18], cospi[56], bf0[29], cos_bit);
1387   bf1[30] = half_btf(cospi[56], bf0[17], cospi[8], bf0[30], cos_bit);
1388   bf1[31] = bf0[31];
1389   bf1[32] = clamp_value(bf0[32] + bf0[35], stage_range[stage]);
1390   bf1[33] = clamp_value(bf0[33] + bf0[34], stage_range[stage]);
1391   bf1[34] = clamp_value(bf0[33] - bf0[34], stage_range[stage]);
1392   bf1[35] = clamp_value(bf0[32] - bf0[35], stage_range[stage]);
1393   bf1[36] = clamp_value(-bf0[36] + bf0[39], stage_range[stage]);
1394   bf1[37] = clamp_value(-bf0[37] + bf0[38], stage_range[stage]);
1395   bf1[38] = clamp_value(bf0[37] + bf0[38], stage_range[stage]);
1396   bf1[39] = clamp_value(bf0[36] + bf0[39], stage_range[stage]);
1397   bf1[40] = clamp_value(bf0[40] + bf0[43], stage_range[stage]);
1398   bf1[41] = clamp_value(bf0[41] + bf0[42], stage_range[stage]);
1399   bf1[42] = clamp_value(bf0[41] - bf0[42], stage_range[stage]);
1400   bf1[43] = clamp_value(bf0[40] - bf0[43], stage_range[stage]);
1401   bf1[44] = clamp_value(-bf0[44] + bf0[47], stage_range[stage]);
1402   bf1[45] = clamp_value(-bf0[45] + bf0[46], stage_range[stage]);
1403   bf1[46] = clamp_value(bf0[45] + bf0[46], stage_range[stage]);
1404   bf1[47] = clamp_value(bf0[44] + bf0[47], stage_range[stage]);
1405   bf1[48] = clamp_value(bf0[48] + bf0[51], stage_range[stage]);
1406   bf1[49] = clamp_value(bf0[49] + bf0[50], stage_range[stage]);
1407   bf1[50] = clamp_value(bf0[49] - bf0[50], stage_range[stage]);
1408   bf1[51] = clamp_value(bf0[48] - bf0[51], stage_range[stage]);
1409   bf1[52] = clamp_value(-bf0[52] + bf0[55], stage_range[stage]);
1410   bf1[53] = clamp_value(-bf0[53] + bf0[54], stage_range[stage]);
1411   bf1[54] = clamp_value(bf0[53] + bf0[54], stage_range[stage]);
1412   bf1[55] = clamp_value(bf0[52] + bf0[55], stage_range[stage]);
1413   bf1[56] = clamp_value(bf0[56] + bf0[59], stage_range[stage]);
1414   bf1[57] = clamp_value(bf0[57] + bf0[58], stage_range[stage]);
1415   bf1[58] = clamp_value(bf0[57] - bf0[58], stage_range[stage]);
1416   bf1[59] = clamp_value(bf0[56] - bf0[59], stage_range[stage]);
1417   bf1[60] = clamp_value(-bf0[60] + bf0[63], stage_range[stage]);
1418   bf1[61] = clamp_value(-bf0[61] + bf0[62], stage_range[stage]);
1419   bf1[62] = clamp_value(bf0[61] + bf0[62], stage_range[stage]);
1420   bf1[63] = clamp_value(bf0[60] + bf0[63], stage_range[stage]);
1421   av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1422 
1423   // stage 6
1424   stage++;
1425   bf0 = output;
1426   bf1 = step;
1427   bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
1428   bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit);
1429   bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit);
1430   bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit);
1431   bf1[4] = clamp_value(bf0[4] + bf0[5], stage_range[stage]);
1432   bf1[5] = clamp_value(bf0[4] - bf0[5], stage_range[stage]);
1433   bf1[6] = clamp_value(-bf0[6] + bf0[7], stage_range[stage]);
1434   bf1[7] = clamp_value(bf0[6] + bf0[7], stage_range[stage]);
1435   bf1[8] = bf0[8];
1436   bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit);
1437   bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit);
1438   bf1[11] = bf0[11];
1439   bf1[12] = bf0[12];
1440   bf1[13] = half_btf(-cospi[16], bf0[10], cospi[48], bf0[13], cos_bit);
1441   bf1[14] = half_btf(cospi[48], bf0[9], cospi[16], bf0[14], cos_bit);
1442   bf1[15] = bf0[15];
1443   bf1[16] = clamp_value(bf0[16] + bf0[19], stage_range[stage]);
1444   bf1[17] = clamp_value(bf0[17] + bf0[18], stage_range[stage]);
1445   bf1[18] = clamp_value(bf0[17] - bf0[18], stage_range[stage]);
1446   bf1[19] = clamp_value(bf0[16] - bf0[19], stage_range[stage]);
1447   bf1[20] = clamp_value(-bf0[20] + bf0[23], stage_range[stage]);
1448   bf1[21] = clamp_value(-bf0[21] + bf0[22], stage_range[stage]);
1449   bf1[22] = clamp_value(bf0[21] + bf0[22], stage_range[stage]);
1450   bf1[23] = clamp_value(bf0[20] + bf0[23], stage_range[stage]);
1451   bf1[24] = clamp_value(bf0[24] + bf0[27], stage_range[stage]);
1452   bf1[25] = clamp_value(bf0[25] + bf0[26], stage_range[stage]);
1453   bf1[26] = clamp_value(bf0[25] - bf0[26], stage_range[stage]);
1454   bf1[27] = clamp_value(bf0[24] - bf0[27], stage_range[stage]);
1455   bf1[28] = clamp_value(-bf0[28] + bf0[31], stage_range[stage]);
1456   bf1[29] = clamp_value(-bf0[29] + bf0[30], stage_range[stage]);
1457   bf1[30] = clamp_value(bf0[29] + bf0[30], stage_range[stage]);
1458   bf1[31] = clamp_value(bf0[28] + bf0[31], stage_range[stage]);
1459   bf1[32] = bf0[32];
1460   bf1[33] = bf0[33];
1461   bf1[34] = half_btf(-cospi[8], bf0[34], cospi[56], bf0[61], cos_bit);
1462   bf1[35] = half_btf(-cospi[8], bf0[35], cospi[56], bf0[60], cos_bit);
1463   bf1[36] = half_btf(-cospi[56], bf0[36], -cospi[8], bf0[59], cos_bit);
1464   bf1[37] = half_btf(-cospi[56], bf0[37], -cospi[8], bf0[58], cos_bit);
1465   bf1[38] = bf0[38];
1466   bf1[39] = bf0[39];
1467   bf1[40] = bf0[40];
1468   bf1[41] = bf0[41];
1469   bf1[42] = half_btf(-cospi[40], bf0[42], cospi[24], bf0[53], cos_bit);
1470   bf1[43] = half_btf(-cospi[40], bf0[43], cospi[24], bf0[52], cos_bit);
1471   bf1[44] = half_btf(-cospi[24], bf0[44], -cospi[40], bf0[51], cos_bit);
1472   bf1[45] = half_btf(-cospi[24], bf0[45], -cospi[40], bf0[50], cos_bit);
1473   bf1[46] = bf0[46];
1474   bf1[47] = bf0[47];
1475   bf1[48] = bf0[48];
1476   bf1[49] = bf0[49];
1477   bf1[50] = half_btf(-cospi[40], bf0[45], cospi[24], bf0[50], cos_bit);
1478   bf1[51] = half_btf(-cospi[40], bf0[44], cospi[24], bf0[51], cos_bit);
1479   bf1[52] = half_btf(cospi[24], bf0[43], cospi[40], bf0[52], cos_bit);
1480   bf1[53] = half_btf(cospi[24], bf0[42], cospi[40], bf0[53], cos_bit);
1481   bf1[54] = bf0[54];
1482   bf1[55] = bf0[55];
1483   bf1[56] = bf0[56];
1484   bf1[57] = bf0[57];
1485   bf1[58] = half_btf(-cospi[8], bf0[37], cospi[56], bf0[58], cos_bit);
1486   bf1[59] = half_btf(-cospi[8], bf0[36], cospi[56], bf0[59], cos_bit);
1487   bf1[60] = half_btf(cospi[56], bf0[35], cospi[8], bf0[60], cos_bit);
1488   bf1[61] = half_btf(cospi[56], bf0[34], cospi[8], bf0[61], cos_bit);
1489   bf1[62] = bf0[62];
1490   bf1[63] = bf0[63];
1491   av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1492 
1493   // stage 7
1494   stage++;
1495   bf0 = step;
1496   bf1 = output;
1497   bf1[0] = clamp_value(bf0[0] + bf0[3], stage_range[stage]);
1498   bf1[1] = clamp_value(bf0[1] + bf0[2], stage_range[stage]);
1499   bf1[2] = clamp_value(bf0[1] - bf0[2], stage_range[stage]);
1500   bf1[3] = clamp_value(bf0[0] - bf0[3], stage_range[stage]);
1501   bf1[4] = bf0[4];
1502   bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
1503   bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
1504   bf1[7] = bf0[7];
1505   bf1[8] = clamp_value(bf0[8] + bf0[11], stage_range[stage]);
1506   bf1[9] = clamp_value(bf0[9] + bf0[10], stage_range[stage]);
1507   bf1[10] = clamp_value(bf0[9] - bf0[10], stage_range[stage]);
1508   bf1[11] = clamp_value(bf0[8] - bf0[11], stage_range[stage]);
1509   bf1[12] = clamp_value(-bf0[12] + bf0[15], stage_range[stage]);
1510   bf1[13] = clamp_value(-bf0[13] + bf0[14], stage_range[stage]);
1511   bf1[14] = clamp_value(bf0[13] + bf0[14], stage_range[stage]);
1512   bf1[15] = clamp_value(bf0[12] + bf0[15], stage_range[stage]);
1513   bf1[16] = bf0[16];
1514   bf1[17] = bf0[17];
1515   bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit);
1516   bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit);
1517   bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit);
1518   bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit);
1519   bf1[22] = bf0[22];
1520   bf1[23] = bf0[23];
1521   bf1[24] = bf0[24];
1522   bf1[25] = bf0[25];
1523   bf1[26] = half_btf(-cospi[16], bf0[21], cospi[48], bf0[26], cos_bit);
1524   bf1[27] = half_btf(-cospi[16], bf0[20], cospi[48], bf0[27], cos_bit);
1525   bf1[28] = half_btf(cospi[48], bf0[19], cospi[16], bf0[28], cos_bit);
1526   bf1[29] = half_btf(cospi[48], bf0[18], cospi[16], bf0[29], cos_bit);
1527   bf1[30] = bf0[30];
1528   bf1[31] = bf0[31];
1529   bf1[32] = clamp_value(bf0[32] + bf0[39], stage_range[stage]);
1530   bf1[33] = clamp_value(bf0[33] + bf0[38], stage_range[stage]);
1531   bf1[34] = clamp_value(bf0[34] + bf0[37], stage_range[stage]);
1532   bf1[35] = clamp_value(bf0[35] + bf0[36], stage_range[stage]);
1533   bf1[36] = clamp_value(bf0[35] - bf0[36], stage_range[stage]);
1534   bf1[37] = clamp_value(bf0[34] - bf0[37], stage_range[stage]);
1535   bf1[38] = clamp_value(bf0[33] - bf0[38], stage_range[stage]);
1536   bf1[39] = clamp_value(bf0[32] - bf0[39], stage_range[stage]);
1537   bf1[40] = clamp_value(-bf0[40] + bf0[47], stage_range[stage]);
1538   bf1[41] = clamp_value(-bf0[41] + bf0[46], stage_range[stage]);
1539   bf1[42] = clamp_value(-bf0[42] + bf0[45], stage_range[stage]);
1540   bf1[43] = clamp_value(-bf0[43] + bf0[44], stage_range[stage]);
1541   bf1[44] = clamp_value(bf0[43] + bf0[44], stage_range[stage]);
1542   bf1[45] = clamp_value(bf0[42] + bf0[45], stage_range[stage]);
1543   bf1[46] = clamp_value(bf0[41] + bf0[46], stage_range[stage]);
1544   bf1[47] = clamp_value(bf0[40] + bf0[47], stage_range[stage]);
1545   bf1[48] = clamp_value(bf0[48] + bf0[55], stage_range[stage]);
1546   bf1[49] = clamp_value(bf0[49] + bf0[54], stage_range[stage]);
1547   bf1[50] = clamp_value(bf0[50] + bf0[53], stage_range[stage]);
1548   bf1[51] = clamp_value(bf0[51] + bf0[52], stage_range[stage]);
1549   bf1[52] = clamp_value(bf0[51] - bf0[52], stage_range[stage]);
1550   bf1[53] = clamp_value(bf0[50] - bf0[53], stage_range[stage]);
1551   bf1[54] = clamp_value(bf0[49] - bf0[54], stage_range[stage]);
1552   bf1[55] = clamp_value(bf0[48] - bf0[55], stage_range[stage]);
1553   bf1[56] = clamp_value(-bf0[56] + bf0[63], stage_range[stage]);
1554   bf1[57] = clamp_value(-bf0[57] + bf0[62], stage_range[stage]);
1555   bf1[58] = clamp_value(-bf0[58] + bf0[61], stage_range[stage]);
1556   bf1[59] = clamp_value(-bf0[59] + bf0[60], stage_range[stage]);
1557   bf1[60] = clamp_value(bf0[59] + bf0[60], stage_range[stage]);
1558   bf1[61] = clamp_value(bf0[58] + bf0[61], stage_range[stage]);
1559   bf1[62] = clamp_value(bf0[57] + bf0[62], stage_range[stage]);
1560   bf1[63] = clamp_value(bf0[56] + bf0[63], stage_range[stage]);
1561   av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1562 
1563   // stage 8
1564   stage++;
1565   bf0 = output;
1566   bf1 = step;
1567   bf1[0] = clamp_value(bf0[0] + bf0[7], stage_range[stage]);
1568   bf1[1] = clamp_value(bf0[1] + bf0[6], stage_range[stage]);
1569   bf1[2] = clamp_value(bf0[2] + bf0[5], stage_range[stage]);
1570   bf1[3] = clamp_value(bf0[3] + bf0[4], stage_range[stage]);
1571   bf1[4] = clamp_value(bf0[3] - bf0[4], stage_range[stage]);
1572   bf1[5] = clamp_value(bf0[2] - bf0[5], stage_range[stage]);
1573   bf1[6] = clamp_value(bf0[1] - bf0[6], stage_range[stage]);
1574   bf1[7] = clamp_value(bf0[0] - bf0[7], stage_range[stage]);
1575   bf1[8] = bf0[8];
1576   bf1[9] = bf0[9];
1577   bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
1578   bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
1579   bf1[12] = half_btf(cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
1580   bf1[13] = half_btf(cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
1581   bf1[14] = bf0[14];
1582   bf1[15] = bf0[15];
1583   bf1[16] = clamp_value(bf0[16] + bf0[23], stage_range[stage]);
1584   bf1[17] = clamp_value(bf0[17] + bf0[22], stage_range[stage]);
1585   bf1[18] = clamp_value(bf0[18] + bf0[21], stage_range[stage]);
1586   bf1[19] = clamp_value(bf0[19] + bf0[20], stage_range[stage]);
1587   bf1[20] = clamp_value(bf0[19] - bf0[20], stage_range[stage]);
1588   bf1[21] = clamp_value(bf0[18] - bf0[21], stage_range[stage]);
1589   bf1[22] = clamp_value(bf0[17] - bf0[22], stage_range[stage]);
1590   bf1[23] = clamp_value(bf0[16] - bf0[23], stage_range[stage]);
1591   bf1[24] = clamp_value(-bf0[24] + bf0[31], stage_range[stage]);
1592   bf1[25] = clamp_value(-bf0[25] + bf0[30], stage_range[stage]);
1593   bf1[26] = clamp_value(-bf0[26] + bf0[29], stage_range[stage]);
1594   bf1[27] = clamp_value(-bf0[27] + bf0[28], stage_range[stage]);
1595   bf1[28] = clamp_value(bf0[27] + bf0[28], stage_range[stage]);
1596   bf1[29] = clamp_value(bf0[26] + bf0[29], stage_range[stage]);
1597   bf1[30] = clamp_value(bf0[25] + bf0[30], stage_range[stage]);
1598   bf1[31] = clamp_value(bf0[24] + bf0[31], stage_range[stage]);
1599   bf1[32] = bf0[32];
1600   bf1[33] = bf0[33];
1601   bf1[34] = bf0[34];
1602   bf1[35] = bf0[35];
1603   bf1[36] = half_btf(-cospi[16], bf0[36], cospi[48], bf0[59], cos_bit);
1604   bf1[37] = half_btf(-cospi[16], bf0[37], cospi[48], bf0[58], cos_bit);
1605   bf1[38] = half_btf(-cospi[16], bf0[38], cospi[48], bf0[57], cos_bit);
1606   bf1[39] = half_btf(-cospi[16], bf0[39], cospi[48], bf0[56], cos_bit);
1607   bf1[40] = half_btf(-cospi[48], bf0[40], -cospi[16], bf0[55], cos_bit);
1608   bf1[41] = half_btf(-cospi[48], bf0[41], -cospi[16], bf0[54], cos_bit);
1609   bf1[42] = half_btf(-cospi[48], bf0[42], -cospi[16], bf0[53], cos_bit);
1610   bf1[43] = half_btf(-cospi[48], bf0[43], -cospi[16], bf0[52], cos_bit);
1611   bf1[44] = bf0[44];
1612   bf1[45] = bf0[45];
1613   bf1[46] = bf0[46];
1614   bf1[47] = bf0[47];
1615   bf1[48] = bf0[48];
1616   bf1[49] = bf0[49];
1617   bf1[50] = bf0[50];
1618   bf1[51] = bf0[51];
1619   bf1[52] = half_btf(-cospi[16], bf0[43], cospi[48], bf0[52], cos_bit);
1620   bf1[53] = half_btf(-cospi[16], bf0[42], cospi[48], bf0[53], cos_bit);
1621   bf1[54] = half_btf(-cospi[16], bf0[41], cospi[48], bf0[54], cos_bit);
1622   bf1[55] = half_btf(-cospi[16], bf0[40], cospi[48], bf0[55], cos_bit);
1623   bf1[56] = half_btf(cospi[48], bf0[39], cospi[16], bf0[56], cos_bit);
1624   bf1[57] = half_btf(cospi[48], bf0[38], cospi[16], bf0[57], cos_bit);
1625   bf1[58] = half_btf(cospi[48], bf0[37], cospi[16], bf0[58], cos_bit);
1626   bf1[59] = half_btf(cospi[48], bf0[36], cospi[16], bf0[59], cos_bit);
1627   bf1[60] = bf0[60];
1628   bf1[61] = bf0[61];
1629   bf1[62] = bf0[62];
1630   bf1[63] = bf0[63];
1631   av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1632 
1633   // stage 9
1634   stage++;
1635   bf0 = step;
1636   bf1 = output;
1637   bf1[0] = clamp_value(bf0[0] + bf0[15], stage_range[stage]);
1638   bf1[1] = clamp_value(bf0[1] + bf0[14], stage_range[stage]);
1639   bf1[2] = clamp_value(bf0[2] + bf0[13], stage_range[stage]);
1640   bf1[3] = clamp_value(bf0[3] + bf0[12], stage_range[stage]);
1641   bf1[4] = clamp_value(bf0[4] + bf0[11], stage_range[stage]);
1642   bf1[5] = clamp_value(bf0[5] + bf0[10], stage_range[stage]);
1643   bf1[6] = clamp_value(bf0[6] + bf0[9], stage_range[stage]);
1644   bf1[7] = clamp_value(bf0[7] + bf0[8], stage_range[stage]);
1645   bf1[8] = clamp_value(bf0[7] - bf0[8], stage_range[stage]);
1646   bf1[9] = clamp_value(bf0[6] - bf0[9], stage_range[stage]);
1647   bf1[10] = clamp_value(bf0[5] - bf0[10], stage_range[stage]);
1648   bf1[11] = clamp_value(bf0[4] - bf0[11], stage_range[stage]);
1649   bf1[12] = clamp_value(bf0[3] - bf0[12], stage_range[stage]);
1650   bf1[13] = clamp_value(bf0[2] - bf0[13], stage_range[stage]);
1651   bf1[14] = clamp_value(bf0[1] - bf0[14], stage_range[stage]);
1652   bf1[15] = clamp_value(bf0[0] - bf0[15], stage_range[stage]);
1653   bf1[16] = bf0[16];
1654   bf1[17] = bf0[17];
1655   bf1[18] = bf0[18];
1656   bf1[19] = bf0[19];
1657   bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit);
1658   bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit);
1659   bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit);
1660   bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit);
1661   bf1[24] = half_btf(cospi[32], bf0[23], cospi[32], bf0[24], cos_bit);
1662   bf1[25] = half_btf(cospi[32], bf0[22], cospi[32], bf0[25], cos_bit);
1663   bf1[26] = half_btf(cospi[32], bf0[21], cospi[32], bf0[26], cos_bit);
1664   bf1[27] = half_btf(cospi[32], bf0[20], cospi[32], bf0[27], cos_bit);
1665   bf1[28] = bf0[28];
1666   bf1[29] = bf0[29];
1667   bf1[30] = bf0[30];
1668   bf1[31] = bf0[31];
1669   bf1[32] = clamp_value(bf0[32] + bf0[47], stage_range[stage]);
1670   bf1[33] = clamp_value(bf0[33] + bf0[46], stage_range[stage]);
1671   bf1[34] = clamp_value(bf0[34] + bf0[45], stage_range[stage]);
1672   bf1[35] = clamp_value(bf0[35] + bf0[44], stage_range[stage]);
1673   bf1[36] = clamp_value(bf0[36] + bf0[43], stage_range[stage]);
1674   bf1[37] = clamp_value(bf0[37] + bf0[42], stage_range[stage]);
1675   bf1[38] = clamp_value(bf0[38] + bf0[41], stage_range[stage]);
1676   bf1[39] = clamp_value(bf0[39] + bf0[40], stage_range[stage]);
1677   bf1[40] = clamp_value(bf0[39] - bf0[40], stage_range[stage]);
1678   bf1[41] = clamp_value(bf0[38] - bf0[41], stage_range[stage]);
1679   bf1[42] = clamp_value(bf0[37] - bf0[42], stage_range[stage]);
1680   bf1[43] = clamp_value(bf0[36] - bf0[43], stage_range[stage]);
1681   bf1[44] = clamp_value(bf0[35] - bf0[44], stage_range[stage]);
1682   bf1[45] = clamp_value(bf0[34] - bf0[45], stage_range[stage]);
1683   bf1[46] = clamp_value(bf0[33] - bf0[46], stage_range[stage]);
1684   bf1[47] = clamp_value(bf0[32] - bf0[47], stage_range[stage]);
1685   bf1[48] = clamp_value(-bf0[48] + bf0[63], stage_range[stage]);
1686   bf1[49] = clamp_value(-bf0[49] + bf0[62], stage_range[stage]);
1687   bf1[50] = clamp_value(-bf0[50] + bf0[61], stage_range[stage]);
1688   bf1[51] = clamp_value(-bf0[51] + bf0[60], stage_range[stage]);
1689   bf1[52] = clamp_value(-bf0[52] + bf0[59], stage_range[stage]);
1690   bf1[53] = clamp_value(-bf0[53] + bf0[58], stage_range[stage]);
1691   bf1[54] = clamp_value(-bf0[54] + bf0[57], stage_range[stage]);
1692   bf1[55] = clamp_value(-bf0[55] + bf0[56], stage_range[stage]);
1693   bf1[56] = clamp_value(bf0[55] + bf0[56], stage_range[stage]);
1694   bf1[57] = clamp_value(bf0[54] + bf0[57], stage_range[stage]);
1695   bf1[58] = clamp_value(bf0[53] + bf0[58], stage_range[stage]);
1696   bf1[59] = clamp_value(bf0[52] + bf0[59], stage_range[stage]);
1697   bf1[60] = clamp_value(bf0[51] + bf0[60], stage_range[stage]);
1698   bf1[61] = clamp_value(bf0[50] + bf0[61], stage_range[stage]);
1699   bf1[62] = clamp_value(bf0[49] + bf0[62], stage_range[stage]);
1700   bf1[63] = clamp_value(bf0[48] + bf0[63], stage_range[stage]);
1701   av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1702 
1703   // stage 10
1704   stage++;
1705   bf0 = output;
1706   bf1 = step;
1707   bf1[0] = clamp_value(bf0[0] + bf0[31], stage_range[stage]);
1708   bf1[1] = clamp_value(bf0[1] + bf0[30], stage_range[stage]);
1709   bf1[2] = clamp_value(bf0[2] + bf0[29], stage_range[stage]);
1710   bf1[3] = clamp_value(bf0[3] + bf0[28], stage_range[stage]);
1711   bf1[4] = clamp_value(bf0[4] + bf0[27], stage_range[stage]);
1712   bf1[5] = clamp_value(bf0[5] + bf0[26], stage_range[stage]);
1713   bf1[6] = clamp_value(bf0[6] + bf0[25], stage_range[stage]);
1714   bf1[7] = clamp_value(bf0[7] + bf0[24], stage_range[stage]);
1715   bf1[8] = clamp_value(bf0[8] + bf0[23], stage_range[stage]);
1716   bf1[9] = clamp_value(bf0[9] + bf0[22], stage_range[stage]);
1717   bf1[10] = clamp_value(bf0[10] + bf0[21], stage_range[stage]);
1718   bf1[11] = clamp_value(bf0[11] + bf0[20], stage_range[stage]);
1719   bf1[12] = clamp_value(bf0[12] + bf0[19], stage_range[stage]);
1720   bf1[13] = clamp_value(bf0[13] + bf0[18], stage_range[stage]);
1721   bf1[14] = clamp_value(bf0[14] + bf0[17], stage_range[stage]);
1722   bf1[15] = clamp_value(bf0[15] + bf0[16], stage_range[stage]);
1723   bf1[16] = clamp_value(bf0[15] - bf0[16], stage_range[stage]);
1724   bf1[17] = clamp_value(bf0[14] - bf0[17], stage_range[stage]);
1725   bf1[18] = clamp_value(bf0[13] - bf0[18], stage_range[stage]);
1726   bf1[19] = clamp_value(bf0[12] - bf0[19], stage_range[stage]);
1727   bf1[20] = clamp_value(bf0[11] - bf0[20], stage_range[stage]);
1728   bf1[21] = clamp_value(bf0[10] - bf0[21], stage_range[stage]);
1729   bf1[22] = clamp_value(bf0[9] - bf0[22], stage_range[stage]);
1730   bf1[23] = clamp_value(bf0[8] - bf0[23], stage_range[stage]);
1731   bf1[24] = clamp_value(bf0[7] - bf0[24], stage_range[stage]);
1732   bf1[25] = clamp_value(bf0[6] - bf0[25], stage_range[stage]);
1733   bf1[26] = clamp_value(bf0[5] - bf0[26], stage_range[stage]);
1734   bf1[27] = clamp_value(bf0[4] - bf0[27], stage_range[stage]);
1735   bf1[28] = clamp_value(bf0[3] - bf0[28], stage_range[stage]);
1736   bf1[29] = clamp_value(bf0[2] - bf0[29], stage_range[stage]);
1737   bf1[30] = clamp_value(bf0[1] - bf0[30], stage_range[stage]);
1738   bf1[31] = clamp_value(bf0[0] - bf0[31], stage_range[stage]);
1739   bf1[32] = bf0[32];
1740   bf1[33] = bf0[33];
1741   bf1[34] = bf0[34];
1742   bf1[35] = bf0[35];
1743   bf1[36] = bf0[36];
1744   bf1[37] = bf0[37];
1745   bf1[38] = bf0[38];
1746   bf1[39] = bf0[39];
1747   bf1[40] = half_btf(-cospi[32], bf0[40], cospi[32], bf0[55], cos_bit);
1748   bf1[41] = half_btf(-cospi[32], bf0[41], cospi[32], bf0[54], cos_bit);
1749   bf1[42] = half_btf(-cospi[32], bf0[42], cospi[32], bf0[53], cos_bit);
1750   bf1[43] = half_btf(-cospi[32], bf0[43], cospi[32], bf0[52], cos_bit);
1751   bf1[44] = half_btf(-cospi[32], bf0[44], cospi[32], bf0[51], cos_bit);
1752   bf1[45] = half_btf(-cospi[32], bf0[45], cospi[32], bf0[50], cos_bit);
1753   bf1[46] = half_btf(-cospi[32], bf0[46], cospi[32], bf0[49], cos_bit);
1754   bf1[47] = half_btf(-cospi[32], bf0[47], cospi[32], bf0[48], cos_bit);
1755   bf1[48] = half_btf(cospi[32], bf0[47], cospi[32], bf0[48], cos_bit);
1756   bf1[49] = half_btf(cospi[32], bf0[46], cospi[32], bf0[49], cos_bit);
1757   bf1[50] = half_btf(cospi[32], bf0[45], cospi[32], bf0[50], cos_bit);
1758   bf1[51] = half_btf(cospi[32], bf0[44], cospi[32], bf0[51], cos_bit);
1759   bf1[52] = half_btf(cospi[32], bf0[43], cospi[32], bf0[52], cos_bit);
1760   bf1[53] = half_btf(cospi[32], bf0[42], cospi[32], bf0[53], cos_bit);
1761   bf1[54] = half_btf(cospi[32], bf0[41], cospi[32], bf0[54], cos_bit);
1762   bf1[55] = half_btf(cospi[32], bf0[40], cospi[32], bf0[55], cos_bit);
1763   bf1[56] = bf0[56];
1764   bf1[57] = bf0[57];
1765   bf1[58] = bf0[58];
1766   bf1[59] = bf0[59];
1767   bf1[60] = bf0[60];
1768   bf1[61] = bf0[61];
1769   bf1[62] = bf0[62];
1770   bf1[63] = bf0[63];
1771   av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
1772 
1773   // stage 11
1774   stage++;
1775   bf0 = step;
1776   bf1 = output;
1777   bf1[0] = clamp_value(bf0[0] + bf0[63], stage_range[stage]);
1778   bf1[1] = clamp_value(bf0[1] + bf0[62], stage_range[stage]);
1779   bf1[2] = clamp_value(bf0[2] + bf0[61], stage_range[stage]);
1780   bf1[3] = clamp_value(bf0[3] + bf0[60], stage_range[stage]);
1781   bf1[4] = clamp_value(bf0[4] + bf0[59], stage_range[stage]);
1782   bf1[5] = clamp_value(bf0[5] + bf0[58], stage_range[stage]);
1783   bf1[6] = clamp_value(bf0[6] + bf0[57], stage_range[stage]);
1784   bf1[7] = clamp_value(bf0[7] + bf0[56], stage_range[stage]);
1785   bf1[8] = clamp_value(bf0[8] + bf0[55], stage_range[stage]);
1786   bf1[9] = clamp_value(bf0[9] + bf0[54], stage_range[stage]);
1787   bf1[10] = clamp_value(bf0[10] + bf0[53], stage_range[stage]);
1788   bf1[11] = clamp_value(bf0[11] + bf0[52], stage_range[stage]);
1789   bf1[12] = clamp_value(bf0[12] + bf0[51], stage_range[stage]);
1790   bf1[13] = clamp_value(bf0[13] + bf0[50], stage_range[stage]);
1791   bf1[14] = clamp_value(bf0[14] + bf0[49], stage_range[stage]);
1792   bf1[15] = clamp_value(bf0[15] + bf0[48], stage_range[stage]);
1793   bf1[16] = clamp_value(bf0[16] + bf0[47], stage_range[stage]);
1794   bf1[17] = clamp_value(bf0[17] + bf0[46], stage_range[stage]);
1795   bf1[18] = clamp_value(bf0[18] + bf0[45], stage_range[stage]);
1796   bf1[19] = clamp_value(bf0[19] + bf0[44], stage_range[stage]);
1797   bf1[20] = clamp_value(bf0[20] + bf0[43], stage_range[stage]);
1798   bf1[21] = clamp_value(bf0[21] + bf0[42], stage_range[stage]);
1799   bf1[22] = clamp_value(bf0[22] + bf0[41], stage_range[stage]);
1800   bf1[23] = clamp_value(bf0[23] + bf0[40], stage_range[stage]);
1801   bf1[24] = clamp_value(bf0[24] + bf0[39], stage_range[stage]);
1802   bf1[25] = clamp_value(bf0[25] + bf0[38], stage_range[stage]);
1803   bf1[26] = clamp_value(bf0[26] + bf0[37], stage_range[stage]);
1804   bf1[27] = clamp_value(bf0[27] + bf0[36], stage_range[stage]);
1805   bf1[28] = clamp_value(bf0[28] + bf0[35], stage_range[stage]);
1806   bf1[29] = clamp_value(bf0[29] + bf0[34], stage_range[stage]);
1807   bf1[30] = clamp_value(bf0[30] + bf0[33], stage_range[stage]);
1808   bf1[31] = clamp_value(bf0[31] + bf0[32], stage_range[stage]);
1809   bf1[32] = clamp_value(bf0[31] - bf0[32], stage_range[stage]);
1810   bf1[33] = clamp_value(bf0[30] - bf0[33], stage_range[stage]);
1811   bf1[34] = clamp_value(bf0[29] - bf0[34], stage_range[stage]);
1812   bf1[35] = clamp_value(bf0[28] - bf0[35], stage_range[stage]);
1813   bf1[36] = clamp_value(bf0[27] - bf0[36], stage_range[stage]);
1814   bf1[37] = clamp_value(bf0[26] - bf0[37], stage_range[stage]);
1815   bf1[38] = clamp_value(bf0[25] - bf0[38], stage_range[stage]);
1816   bf1[39] = clamp_value(bf0[24] - bf0[39], stage_range[stage]);
1817   bf1[40] = clamp_value(bf0[23] - bf0[40], stage_range[stage]);
1818   bf1[41] = clamp_value(bf0[22] - bf0[41], stage_range[stage]);
1819   bf1[42] = clamp_value(bf0[21] - bf0[42], stage_range[stage]);
1820   bf1[43] = clamp_value(bf0[20] - bf0[43], stage_range[stage]);
1821   bf1[44] = clamp_value(bf0[19] - bf0[44], stage_range[stage]);
1822   bf1[45] = clamp_value(bf0[18] - bf0[45], stage_range[stage]);
1823   bf1[46] = clamp_value(bf0[17] - bf0[46], stage_range[stage]);
1824   bf1[47] = clamp_value(bf0[16] - bf0[47], stage_range[stage]);
1825   bf1[48] = clamp_value(bf0[15] - bf0[48], stage_range[stage]);
1826   bf1[49] = clamp_value(bf0[14] - bf0[49], stage_range[stage]);
1827   bf1[50] = clamp_value(bf0[13] - bf0[50], stage_range[stage]);
1828   bf1[51] = clamp_value(bf0[12] - bf0[51], stage_range[stage]);
1829   bf1[52] = clamp_value(bf0[11] - bf0[52], stage_range[stage]);
1830   bf1[53] = clamp_value(bf0[10] - bf0[53], stage_range[stage]);
1831   bf1[54] = clamp_value(bf0[9] - bf0[54], stage_range[stage]);
1832   bf1[55] = clamp_value(bf0[8] - bf0[55], stage_range[stage]);
1833   bf1[56] = clamp_value(bf0[7] - bf0[56], stage_range[stage]);
1834   bf1[57] = clamp_value(bf0[6] - bf0[57], stage_range[stage]);
1835   bf1[58] = clamp_value(bf0[5] - bf0[58], stage_range[stage]);
1836   bf1[59] = clamp_value(bf0[4] - bf0[59], stage_range[stage]);
1837   bf1[60] = clamp_value(bf0[3] - bf0[60], stage_range[stage]);
1838   bf1[61] = clamp_value(bf0[2] - bf0[61], stage_range[stage]);
1839   bf1[62] = clamp_value(bf0[1] - bf0[62], stage_range[stage]);
1840   bf1[63] = clamp_value(bf0[0] - bf0[63], stage_range[stage]);
1841 }
1842