• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
/******************************************************************************
 *                                                                            *
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/
20 #include <stdlib.h>
21 #include <math.h>
22 #include "ixheaacd_type_def.h"
23 #include "ixheaacd_constants.h"
24 #include "ixheaacd_basic_ops32.h"
25 #include "ixheaacd_fft_ifft_rom.h"
26 #include "ixheaacd_dsp_fft32x32s.h"
27 
/*
 * DIG_REV(i, m, j): base-4 digit reversal followed by a right shift.
 *
 * Copies `i` into an unsigned temporary, swaps adjacent 2-bit groups, then
 * adjacent nibbles, then adjacent bytes (so, for a 32-bit unsigned, the base-4
 * digits inside each 16-bit half end up in reverse order), shifts the result
 * right by `m` bits and assigns it to `j`.
 *
 * `i` is evaluated exactly once; `j` must be an assignable lvalue. The masks
 * carry a `u` suffix so the complemented masks are computed in unsigned
 * arithmetic.
 */
#define DIG_REV(i, m, j)                                      \
  do {                                                        \
    unsigned _ = (i);                                         \
    _ = ((_ & 0x33333333u) << 2) | ((_ & ~0x33333333u) >> 2); \
    _ = ((_ & 0x0F0F0F0Fu) << 4) | ((_ & ~0x0F0F0F0Fu) >> 4); \
    _ = ((_ & 0x00FF00FFu) << 8) | ((_ & ~0x00FF00FFu) >> 8); \
    (j) = _ >> (m);                                           \
  } while (0)
36 
/*
 * Returns the product of two FLOAT64 values.
 *
 * a, b : factors.
 * return: a * b, stored to a FLOAT64 local before being returned.
 */
FLOAT64 ixheaacd_mult32X32float(FLOAT64 a, FLOAT64 b) {
  FLOAT64 result;

  result = a * b;

  return result;
}
44 
/*
 * Multiply-accumulate on FLOAT64 values.
 *
 * a     : accumulator input.
 * b, c  : factors whose product is added to `a`.
 * return: a + b * c, stored to a FLOAT64 local before being returned.
 */
FLOAT64 ixheaacd_mac32X32float(FLOAT64 a, FLOAT64 b, FLOAT64 c) {
  FLOAT64 result;

  result = a + b * c;

  return result;
}
52 
ixheaacd_hbe_apply_ifft_7(FLOAT32 * inp,FLOAT32 * op)53 VOID ixheaacd_hbe_apply_ifft_7(FLOAT32 *inp, FLOAT32 *op) {
54   FLOAT32 x0r, x1r, x2r, x3r, x4r, x5r, x6r, x7r, x8r;
55   FLOAT32 x0i, x1i, x2i, x3i, x4i, x5i, x6i, x7i, x8i;
56   FLOAT32 y0r, y1r, y2r, y3r, y4r, y5r, y6r, y7r, y8r;
57   FLOAT32 y0i, y1i, y2i, y3i, y4i, y5i, y6i, y7i, y8i;
58 
59   x0r = inp[0];
60   x0i = inp[1];
61   x1r = inp[2] + inp[12];
62   x1i = inp[3] + inp[13];
63   x2r = inp[2] - inp[12];
64   x2i = inp[3] - inp[13];
65   x3r = inp[4] + inp[10];
66   x3i = inp[5] + inp[11];
67   x4r = inp[4] - inp[10];
68   x4i = inp[5] - inp[11];
69   x5r = inp[8] + inp[6];
70   x5i = inp[9] + inp[7];
71   x6r = inp[8] - inp[6];
72   x6i = inp[9] - inp[7];
73 
74   y0r = x0r;
75   y0i = x0i;
76   y1r = x1r + x3r + x5r;
77   y1i = x1i + x3i + x5i;
78   y2r = x1r - x3r;
79   y2i = x1i - x3i;
80   y3r = x5r - x1r;
81   y3i = x5i - x1i;
82   y4r = x3r - x5r;
83   y4i = x3i - x5i;
84   y5r = x2r + x4r + x6r;
85   y5i = x2i + x4i + x6i;
86   y6r = x2r - x4r;
87   y6i = x2i - x4i;
88   y7r = x6r - x2r;
89   y7i = x6i - x2i;
90   y8r = x4r - x6r;
91   y8i = x4i - x6i;
92 
93   x0r = y0r + y1r;
94   x0i = y0i + y1i;
95   x1r = y0r + C70 * y1r;
96   x1i = y0i + C70 * y1i;
97   x2r = C71 * y2r;
98   x2i = C71 * y2i;
99   x3r = C72 * y3r;
100   x3i = C72 * y3i;
101   x4r = C73 * y4r;
102   x4i = C73 * y4i;
103   x5r = C74 * y5i;
104   x5i = -C74 * y5r;
105   x6r = C75 * y6i;
106   x6i = -C75 * y6r;
107   x7r = C76 * y7i;
108   x7i = -C76 * y7r;
109   x8r = C77 * y8i;
110   x8i = -C77 * y8r;
111 
112   y0r = x0r;
113   y0i = x0i;
114   y1r = x1r + x2r + x4r;
115   y1i = x1i + x2i + x4i;
116   y2r = x1r - x2r - x3r;
117   y2i = x1i - x2i - x3i;
118   y3r = x1r + x3r - x4r;
119   y3i = x1i + x3i - x4i;
120   y4r = x5r + x6r + x8r;
121   y4i = x5i + x6i + x8i;
122   y5r = x5r - x6r - x7r;
123   y5i = x5i - x6i - x7i;
124   y6r = x5r + x7r - x8r;
125   y6i = x5i + x7i - x8i;
126 
127   x0r = y0r;
128   x0i = y0i;
129   x1r = y1r + y4r;
130   x1i = y1i + y4i;
131   x2r = y3r + y6r;
132   x2i = y3i + y6i;
133   x3r = y2r - y5r;
134   x3i = y2i - y5i;
135   x4r = y2r + y5r;
136   x4i = y2i + y5i;
137   x5r = y3r - y6r;
138   x5i = y3i - y6i;
139   x6r = y1r - y4r;
140   x6i = y1i - y4i;
141 
142   op[0] = x0r;
143   op[1] = x0i;
144   op[2] = x1r;
145   op[3] = x1i;
146   op[4] = x2r;
147   op[5] = x2i;
148   op[6] = x3r;
149   op[7] = x3i;
150   op[8] = x4r;
151   op[9] = x4i;
152   op[10] = x5r;
153   op[11] = x5i;
154   op[12] = x6r;
155   op[13] = x6i;
156 
157   return;
158 }
159 
ixheaacd_hbe_apply_fft_3(FLOAT32 * inp,FLOAT32 * op,WORD32 i_sign)160 VOID ixheaacd_hbe_apply_fft_3(FLOAT32 *inp, FLOAT32 *op, WORD32 i_sign) {
161   FLOAT32 add_r, sub_r;
162   FLOAT32 add_i, sub_i;
163   FLOAT32 X01r, X01i, temp;
164 
165   FLOAT32 p1, p2, p3, p4;
166 
167   FLOAT64 sinmu;
168   sinmu = -0.866025403784439 * (FLOAT64)i_sign;
169 
170   X01r = inp[0] + inp[2];
171   X01i = inp[1] + inp[3];
172 
173   add_r = inp[2] + inp[4];
174   add_i = inp[3] + inp[5];
175 
176   sub_r = inp[2] - inp[4];
177   sub_i = inp[3] - inp[5];
178 
179   p1 = add_r / (FLOAT32)2.0;
180   p4 = add_i / (FLOAT32)2.0;
181   p2 = (FLOAT32)((FLOAT64)sub_i * sinmu);
182   p3 = (FLOAT32)((FLOAT64)sub_r * sinmu);
183 
184   temp = inp[0] - p1;
185 
186   op[0] = X01r + inp[4];
187   op[1] = X01i + inp[5];
188   op[2] = temp + p2;
189   op[3] = (inp[1] - p3) - p4;
190   op[4] = temp - p2;
191   op[5] = (inp[1] + p3) - p4;
192 
193   return;
194 }
195 
ixheaacd_hbe_apply_tw_mult_ifft(FLOAT32 * inp,FLOAT32 * op,WORD32 dim1,WORD32 dim2,const FLOAT32 * tw)196 VOID ixheaacd_hbe_apply_tw_mult_ifft(FLOAT32 *inp, FLOAT32 *op, WORD32 dim1, WORD32 dim2,
197                                      const FLOAT32 *tw) {
198   FLOAT32 accu1, accu2;
199   WORD32 i, j;
200   WORD32 step_val = (dim2 - 1) << 1;
201   for (i = 0; i < (dim2); i++) {
202     op[0] = inp[0];
203     op[1] = inp[1];
204     op += 2;
205     inp += 2;
206   }
207 
208   for (j = 0; j < (dim1 - 1); j++) {
209     op[0] = inp[0];
210     op[1] = inp[1];
211     inp += 2;
212     op += 2;
213     for (i = 0; i < (dim2 - 1); i++) {
214       CPLX_MPY_IFFT(accu1, accu2, inp[2 * i + 0], inp[2 * i + 1], tw[2 * i + 1], tw[2 * i]);
215       op[2 * i + 0] = accu1;
216       op[2 * i + 1] = accu2;
217     }
218     inp += step_val;
219     op += step_val;
220     tw += (dim2 - 1) * 2;
221   }
222 }
223 
ixheaacd_hbe_apply_tw_mult_fft(FLOAT32 * inp,FLOAT32 * op,WORD32 dim1,WORD32 dim2,const FLOAT32 * tw)224 VOID ixheaacd_hbe_apply_tw_mult_fft(FLOAT32 *inp, FLOAT32 *op, WORD32 dim1, WORD32 dim2,
225                                     const FLOAT32 *tw) {
226   FLOAT32 accu1, accu2;
227   WORD32 i, j;
228   WORD32 step_val = (dim2 - 1) << 1;
229   for (i = 0; i < (dim2); i++) {
230     op[0] = inp[0];
231     op[1] = inp[1];
232     op += 2;
233     inp += 2;
234   }
235 
236   for (j = 0; j < (dim1 - 1); j++) {
237     op[0] = inp[0];
238     op[1] = inp[1];
239     inp += 2;
240     op += 2;
241     for (i = 0; i < (dim2 - 1); i++) {
242       CPLX_MPY_FFT(accu1, accu2, inp[2 * i + 0], inp[2 * i + 1], tw[2 * i + 1], tw[2 * i]);
243       op[2 * i + 0] = accu1;
244       op[2 * i + 1] = accu2;
245     }
246     inp += step_val;
247     op += step_val;
248     tw += (dim2 - 1) * 2;
249   }
250 }
251 
ixheaacd_hbe_apply_cfftn(FLOAT32 re[],FLOAT32 * scratch,WORD32 n_pass,WORD32 i_sign)252 VOID ixheaacd_hbe_apply_cfftn(FLOAT32 re[], FLOAT32 *scratch, WORD32 n_pass, WORD32 i_sign) {
253   WORD32 i, j, k, n_stages, h2;
254   FLOAT32 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
255   WORD32 del, nodespacing, in_loop_cnt;
256   WORD32 not_power_4;
257   WORD32 dig_rev_shift;
258   WORD32 mpass = n_pass;
259   WORD32 npoints = n_pass;
260   const FLOAT64 *ptr_w;
261   FLOAT32 *ptr_x = scratch;
262   FLOAT32 *y = scratch + (2 * n_pass);
263   FLOAT32 *ptr_y = y;
264 
265   dig_rev_shift = ixheaacd_norm32(mpass) + 1 - 16;
266   n_stages = 30 - ixheaacd_norm32(mpass);
267   not_power_4 = n_stages & 1;
268 
269   n_stages = n_stages >> 1;
270 
271   ptr_w = ixheaacd_twid_tbl_fft_double;
272   ptr_x = re;
273 
274   if (i_sign == -1) {
275     for (i = 0; i < npoints; i += 4) {
276       FLOAT32 *inp = ptr_x;
277       FLOAT32 tmk;
278 
279       DIG_REV(i, dig_rev_shift, h2);
280       if (not_power_4) {
281         h2 += 1;
282         h2 &= ~1;
283       }
284       inp += (h2);
285 
286       x0r = *inp;
287       x0i = *(inp + 1);
288       inp += (npoints >> 1);
289 
290       x1r = *inp;
291       x1i = *(inp + 1);
292       inp += (npoints >> 1);
293 
294       x2r = *inp;
295       x2i = *(inp + 1);
296       inp += (npoints >> 1);
297 
298       x3r = *inp;
299       x3i = *(inp + 1);
300 
301       x0r = x0r + x2r;
302       x0i = x0i + x2i;
303 
304       tmk = x0r - x2r;
305       x2r = tmk - x2r;
306       tmk = x0i - x2i;
307       x2i = tmk - x2i;
308 
309       x1r = x1r + x3r;
310       x1i = x1i + x3i;
311 
312       tmk = x1r - x3r;
313       x3r = tmk - x3r;
314       tmk = x1i - x3i;
315       x3i = tmk - x3i;
316 
317       x0r = x0r + x1r;
318       x0i = x0i + x1i;
319 
320       tmk = x0r - x1r;
321       x1r = tmk - x1r;
322       tmk = x0i - x1i;
323       x1i = tmk - x1i;
324 
325       x2r = x2r + x3i;
326       x2i = x2i - x3r;
327 
328       tmk = x2r - x3i;
329       x3i = tmk - x3i;
330       tmk = x2i + x3r;
331       x3r = tmk + x3r;
332 
333       *ptr_y++ = x0r;
334       *ptr_y++ = x0i;
335       *ptr_y++ = x2r;
336       *ptr_y++ = x2i;
337       *ptr_y++ = x1r;
338       *ptr_y++ = x1i;
339       *ptr_y++ = x3i;
340       *ptr_y++ = x3r;
341     }
342     ptr_y -= 2 * npoints;
343     del = 4;
344     nodespacing = 64;
345     in_loop_cnt = npoints >> 4;
346     for (i = n_stages - 1; i > 0; i--) {
347       const FLOAT64 *twiddles = ptr_w;
348       FLOAT32 *data = ptr_y;
349       FLOAT64 W1, W2, W3, W4, W5, W6;
350       WORD32 sec_loop_cnt;
351 
352       for (k = in_loop_cnt; k != 0; k--) {
353         x0r = (*data);
354         x0i = (*(data + 1));
355         data += (del << 1);
356 
357         x1r = (*data);
358         x1i = (*(data + 1));
359         data += (del << 1);
360 
361         x2r = (*data);
362         x2i = (*(data + 1));
363         data += (del << 1);
364 
365         x3r = (*data);
366         x3i = (*(data + 1));
367         data -= 3 * (del << 1);
368 
369         x0r = x0r + x2r;
370         x0i = x0i + x2i;
371         x2r = x0r - (x2r * 2);
372         x2i = x0i - (x2i * 2);
373         x1r = x1r + x3r;
374         x1i = x1i + x3i;
375         x3r = x1r - (x3r * 2);
376         x3i = x1i - (x3i * 2);
377 
378         x0r = x0r + x1r;
379         x0i = x0i + x1i;
380         x1r = x0r - (x1r * 2);
381         x1i = x0i - (x1i * 2);
382         x2r = x2r + x3i;
383         x2i = x2i - x3r;
384         x3i = x2r - (x3i * 2);
385         x3r = x2i + (x3r * 2);
386 
387         *data = x0r;
388         *(data + 1) = x0i;
389         data += (del << 1);
390 
391         *data = x2r;
392         *(data + 1) = x2i;
393         data += (del << 1);
394 
395         *data = x1r;
396         *(data + 1) = x1i;
397         data += (del << 1);
398 
399         *data = x3i;
400         *(data + 1) = x3r;
401         data += (del << 1);
402       }
403       data = ptr_y + 2;
404 
405       sec_loop_cnt = (nodespacing * del);
406       sec_loop_cnt = (sec_loop_cnt / 4) + (sec_loop_cnt / 8) - (sec_loop_cnt / 16) +
407                      (sec_loop_cnt / 32) - (sec_loop_cnt / 64) + (sec_loop_cnt / 128) -
408                      (sec_loop_cnt / 256);
409       j = nodespacing;
410 
411       for (j = nodespacing; j <= sec_loop_cnt; j += nodespacing) {
412         W1 = *(twiddles + j);
413         W4 = *(twiddles + j + 257);
414         W2 = *(twiddles + (j << 1));
415         W5 = *(twiddles + (j << 1) + 257);
416         W3 = *(twiddles + j + (j << 1));
417         W6 = *(twiddles + j + (j << 1) + 257);
418 
419         for (k = in_loop_cnt; k != 0; k--) {
420           FLOAT32 tmp;
421           FLOAT32 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
422 
423           data += (del << 1);
424 
425           x1r = *data;
426           x1i = *(data + 1);
427           data += (del << 1);
428 
429           x2r = *data;
430           x2i = *(data + 1);
431           data += (del << 1);
432 
433           x3r = *data;
434           x3i = *(data + 1);
435           data -= 3 * (del << 1);
436 
437           tmp = (FLOAT32)(ixheaacd_mult32X32float((FLOAT64)x1r, W1) -
438                           ixheaacd_mult32X32float((FLOAT64)x1i, W4));
439           x1i = (FLOAT32)ixheaacd_mac32X32float(ixheaacd_mult32X32float((FLOAT64)x1r, W4),
440                                                       (FLOAT64)x1i, W1);
441           x1r = tmp;
442 
443           tmp = (FLOAT32)(ixheaacd_mult32X32float((FLOAT64)x2r, W2) -
444                           ixheaacd_mult32X32float((FLOAT64)x2i, W5));
445           x2i = (FLOAT32)ixheaacd_mac32X32float(ixheaacd_mult32X32float((FLOAT64)x2r, W5),
446                                                       (FLOAT64)x2i, W2);
447           x2r = tmp;
448 
449           tmp = (FLOAT32)(ixheaacd_mult32X32float((FLOAT64)x3r, W3) -
450                           ixheaacd_mult32X32float((FLOAT64)x3i, W6));
451           x3i = (FLOAT32)ixheaacd_mac32X32float(ixheaacd_mult32X32float((FLOAT64)x3r, W6),
452                                                       (FLOAT64)x3i, W3);
453           x3r = tmp;
454 
455           x0r = (*data);
456           x0i = (*(data + 1));
457 
458           x0r = x0r + (x2r);
459           x0i = x0i + (x2i);
460           x2r = x0r - (x2r * 2);
461           x2i = x0i - (x2i * 2);
462           x1r = x1r + x3r;
463           x1i = x1i + x3i;
464           x3r = x1r - (x3r * 2);
465           x3i = x1i - (x3i * 2);
466 
467           x0r = x0r + (x1r);
468           x0i = x0i + (x1i);
469           x1r = x0r - (x1r * 2);
470           x1i = x0i - (x1i * 2);
471           x2r = x2r + (x3i);
472           x2i = x2i - (x3r);
473           x3i = x2r - (x3i * 2);
474           x3r = x2i + (x3r * 2);
475 
476           *data = x0r;
477           *(data + 1) = x0i;
478           data += (del << 1);
479 
480           *data = x2r;
481           *(data + 1) = x2i;
482           data += (del << 1);
483 
484           *data = x1r;
485           *(data + 1) = x1i;
486           data += (del << 1);
487 
488           *data = x3i;
489           *(data + 1) = x3r;
490           data += (del << 1);
491         }
492         data -= 2 * npoints;
493         data += 2;
494       }
495       for (; j <= (nodespacing * del) >> 1; j += nodespacing) {
496         W1 = *(twiddles + j);
497         W4 = *(twiddles + j + 257);
498         W2 = *(twiddles + (j << 1));
499         W5 = *(twiddles + (j << 1) + 257);
500         W3 = *(twiddles + j + (j << 1) - 256);
501         W6 = *(twiddles + j + (j << 1) + 1);
502 
503         for (k = in_loop_cnt; k != 0; k--) {
504           FLOAT32 tmp;
505           FLOAT32 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
506 
507           data += (del << 1);
508 
509           x1r = *data;
510           x1i = *(data + 1);
511           data += (del << 1);
512 
513           x2r = *data;
514           x2i = *(data + 1);
515           data += (del << 1);
516 
517           x3r = *data;
518           x3i = *(data + 1);
519           data -= 3 * (del << 1);
520 
521           tmp = (FLOAT32)(ixheaacd_mult32X32float((FLOAT64)x1r, W1) -
522                           ixheaacd_mult32X32float((FLOAT64)x1i, W4));
523           x1i = (FLOAT32)ixheaacd_mac32X32float(ixheaacd_mult32X32float((FLOAT64)x1r, W4),
524                                                       (FLOAT64)x1i, W1);
525           x1r = tmp;
526 
527           tmp = (FLOAT32)(ixheaacd_mult32X32float((FLOAT64)x2r, W2) -
528                           ixheaacd_mult32X32float((FLOAT64)x2i, W5));
529           x2i = (FLOAT32)ixheaacd_mac32X32float(ixheaacd_mult32X32float((FLOAT64)x2r, W5),
530                                                       (FLOAT64)x2i, W2);
531           x2r = tmp;
532 
533           tmp = (FLOAT32)(ixheaacd_mult32X32float((FLOAT64)x3r, W6) +
534                           ixheaacd_mult32X32float((FLOAT64)x3i, W3));
535           x3i = (FLOAT32)(-ixheaacd_mult32X32float((FLOAT64)x3r, W3) +
536                           ixheaacd_mult32X32float((FLOAT64)x3i, W6));
537           x3r = tmp;
538 
539           x0r = (*data);
540           x0i = (*(data + 1));
541 
542           x0r = x0r + (x2r);
543           x0i = x0i + (x2i);
544           x2r = x0r - (x2r * 2);
545           x2i = x0i - (x2i * 2);
546           x1r = x1r + x3r;
547           x1i = x1i + x3i;
548           x3r = x1r - (x3r * 2);
549           x3i = x1i - (x3i * 2);
550 
551           x0r = x0r + (x1r);
552           x0i = x0i + (x1i);
553           x1r = x0r - (x1r * 2);
554           x1i = x0i - (x1i * 2);
555           x2r = x2r + (x3i);
556           x2i = x2i - (x3r);
557           x3i = x2r - (x3i * 2);
558           x3r = x2i + (x3r * 2);
559 
560           *data = x0r;
561           *(data + 1) = x0i;
562           data += (del << 1);
563 
564           *data = x2r;
565           *(data + 1) = x2i;
566           data += (del << 1);
567 
568           *data = x1r;
569           *(data + 1) = x1i;
570           data += (del << 1);
571 
572           *data = x3i;
573           *(data + 1) = x3r;
574           data += (del << 1);
575         }
576         data -= 2 * npoints;
577         data += 2;
578       }
579       for (; j <= sec_loop_cnt * 2; j += nodespacing) {
580         W1 = *(twiddles + j);
581         W4 = *(twiddles + j + 257);
582         W2 = *(twiddles + (j << 1) - 256);
583         W5 = *(twiddles + (j << 1) + 1);
584         W3 = *(twiddles + j + (j << 1) - 256);
585         W6 = *(twiddles + j + (j << 1) + 1);
586 
587         for (k = in_loop_cnt; k != 0; k--) {
588           FLOAT32 tmp;
589           FLOAT32 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
590 
591           data += (del << 1);
592 
593           x1r = *data;
594           x1i = *(data + 1);
595           data += (del << 1);
596 
597           x2r = *data;
598           x2i = *(data + 1);
599           data += (del << 1);
600 
601           x3r = *data;
602           x3i = *(data + 1);
603           data -= 3 * (del << 1);
604 
605           tmp = (FLOAT32)(ixheaacd_mult32X32float((FLOAT64)x1r, W1) -
606                           ixheaacd_mult32X32float((FLOAT64)x1i, W4));
607           x1i = (FLOAT32)ixheaacd_mac32X32float(ixheaacd_mult32X32float(x1r, W4), x1i, W1);
608           x1r = tmp;
609 
610           tmp = (FLOAT32)(ixheaacd_mult32X32float((FLOAT64)x2r, W5) +
611                           ixheaacd_mult32X32float((FLOAT64)x2i, W2));
612           x2i = (FLOAT32)(-ixheaacd_mult32X32float(x2r, W2) + ixheaacd_mult32X32float(x2i, W5));
613           x2r = tmp;
614 
615           tmp = (FLOAT32)(ixheaacd_mult32X32float((FLOAT64)x3r, W6) +
616                           ixheaacd_mult32X32float((FLOAT64)x3i, W3));
617           x3i = (FLOAT32)(-ixheaacd_mult32X32float((FLOAT64)x3r, W3) +
618                           ixheaacd_mult32X32float((FLOAT64)x3i, W6));
619           x3r = tmp;
620 
621           x0r = (*data);
622           x0i = (*(data + 1));
623 
624           x0r = x0r + (x2r);
625           x0i = x0i + (x2i);
626           x2r = x0r - (x2r * 2);
627           x2i = x0i - (x2i * 2);
628           x1r = x1r + x3r;
629           x1i = x1i + x3i;
630           x3r = x1r - (x3r * 2);
631           x3i = x1i - (x3i * 2);
632 
633           x0r = x0r + (x1r);
634           x0i = x0i + (x1i);
635           x1r = x0r - (x1r * 2);
636           x1i = x0i - (x1i * 2);
637           x2r = x2r + (x3i);
638           x2i = x2i - (x3r);
639           x3i = x2r - (x3i * 2);
640           x3r = x2i + (x3r * 2);
641 
642           *data = x0r;
643           *(data + 1) = x0i;
644           data += (del << 1);
645 
646           *data = x2r;
647           *(data + 1) = x2i;
648           data += (del << 1);
649 
650           *data = x1r;
651           *(data + 1) = x1i;
652           data += (del << 1);
653 
654           *data = x3i;
655           *(data + 1) = x3r;
656           data += (del << 1);
657         }
658         data -= 2 * npoints;
659         data += 2;
660       }
661       for (; j < nodespacing * del; j += nodespacing) {
662         W1 = *(twiddles + j);
663         W4 = *(twiddles + j + 257);
664         W2 = *(twiddles + (j << 1) - 256);
665         W5 = *(twiddles + (j << 1) + 1);
666         W3 = *(twiddles + j + (j << 1) - 512);
667         W6 = *(twiddles + j + (j << 1) - 512 + 257);
668 
669         for (k = in_loop_cnt; k != 0; k--) {
670           FLOAT32 tmp;
671           FLOAT32 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
672 
673           data += (del << 1);
674 
675           x1r = *data;
676           x1i = *(data + 1);
677           data += (del << 1);
678 
679           x2r = *data;
680           x2i = *(data + 1);
681           data += (del << 1);
682 
683           x3r = *data;
684           x3i = *(data + 1);
685           data -= 3 * (del << 1);
686 
687           tmp = (FLOAT32)(ixheaacd_mult32X32float((FLOAT64)x1r, W1) -
688                           ixheaacd_mult32X32float((FLOAT64)x1i, W4));
689           x1i = (FLOAT32)ixheaacd_mac32X32float(ixheaacd_mult32X32float((FLOAT64)x1r, W4),
690                                                       (FLOAT64)x1i, W1);
691           x1r = tmp;
692 
693           tmp = (FLOAT32)(ixheaacd_mult32X32float((FLOAT64)x2r, W5) +
694                           ixheaacd_mult32X32float((FLOAT64)x2i, W2));
695           x2i = (FLOAT32)(-ixheaacd_mult32X32float((FLOAT64)x2r, W2) +
696                           ixheaacd_mult32X32float((FLOAT64)x2i, W5));
697           x2r = tmp;
698 
699           tmp = (FLOAT32)(-ixheaacd_mult32X32float((FLOAT64)x3r, W3) +
700                           ixheaacd_mult32X32float((FLOAT64)x3i, W6));
701           x3i = (FLOAT32)ixheaacd_mac32X32float(ixheaacd_mult32X32float((FLOAT64)x3r, W6),
702                                                       (FLOAT64)x3i, W3);
703           x3r = tmp;
704 
705           x0r = (*data);
706           x0i = (*(data + 1));
707 
708           x0r = x0r + (x2r);
709           x0i = x0i + (x2i);
710           x2r = x0r - (x2r * 2);
711           x2i = x0i - (x2i * 2);
712           x1r = x1r + x3r;
713           x1i = x1i - x3i;
714           x3r = x1r - (x3r * 2);
715           x3i = x1i + (x3i * 2);
716 
717           x0r = x0r + (x1r);
718           x0i = x0i + (x1i);
719           x1r = x0r - (x1r * 2);
720           x1i = x0i - (x1i * 2);
721           x2r = x2r + (x3i);
722           x2i = x2i - (x3r);
723           x3i = x2r - (x3i * 2);
724           x3r = x2i + (x3r * 2);
725 
726           *data = x0r;
727           *(data + 1) = x0i;
728           data += (del << 1);
729 
730           *data = x2r;
731           *(data + 1) = x2i;
732           data += (del << 1);
733 
734           *data = x1r;
735           *(data + 1) = x1i;
736           data += (del << 1);
737 
738           *data = x3i;
739           *(data + 1) = x3r;
740           data += (del << 1);
741         }
742         data -= 2 * npoints;
743         data += 2;
744       }
745       nodespacing >>= 2;
746       del <<= 2;
747       in_loop_cnt >>= 2;
748     }
749     if (not_power_4) {
750       const FLOAT64 *twiddles = ptr_w;
751       nodespacing <<= 1;
752 
753       for (j = del / 2; j != 0; j--) {
754         FLOAT64 W1 = *twiddles;
755         FLOAT64 W4 = *(twiddles + 257);
756         FLOAT32 tmp;
757         twiddles += nodespacing;
758 
759         x0r = *ptr_y;
760         x0i = *(ptr_y + 1);
761         ptr_y += (del << 1);
762 
763         x1r = *ptr_y;
764         x1i = *(ptr_y + 1);
765 
766         tmp = (FLOAT32)(ixheaacd_mult32X32float((FLOAT64)x1r, W1) -
767                         ixheaacd_mult32X32float((FLOAT64)x1i, W4));
768         x1i = (FLOAT32)ixheaacd_mac32X32float(ixheaacd_mult32X32float((FLOAT64)x1r, W4),
769                                                     (FLOAT64)x1i, W1);
770         x1r = tmp;
771 
772         *ptr_y = (x0r) - (x1r);
773         *(ptr_y + 1) = (x0i) - (x1i);
774         ptr_y -= (del << 1);
775 
776         *ptr_y = (x0r) + (x1r);
777         *(ptr_y + 1) = (x0i) + (x1i);
778         ptr_y += 2;
779       }
780       twiddles = ptr_w;
781       for (j = del / 2; j != 0; j--) {
782         FLOAT64 W1 = *twiddles;
783         FLOAT64 W4 = *(twiddles + 257);
784         FLOAT32 tmp;
785         twiddles += nodespacing;
786 
787         x0r = *ptr_y;
788         x0i = *(ptr_y + 1);
789         ptr_y += (del << 1);
790 
791         x1r = *ptr_y;
792         x1i = *(ptr_y + 1);
793 
794         tmp = (FLOAT32)(ixheaacd_mult32X32float((FLOAT64)x1r, W4) +
795                         ixheaacd_mult32X32float((FLOAT64)x1i, W1));
796         x1i = (FLOAT32)(-ixheaacd_mult32X32float((FLOAT64)x1r, W1) +
797                         ixheaacd_mult32X32float((FLOAT64)x1i, W4));
798         x1r = tmp;
799 
800         *ptr_y = (x0r) - (x1r);
801         *(ptr_y + 1) = (x0i) - (x1i);
802         ptr_y -= (del << 1);
803 
804         *ptr_y = (x0r) + (x1r);
805         *(ptr_y + 1) = (x0i) + (x1i);
806         ptr_y += 2;
807       }
808     }
809   } else {
810     for (i = 0; i < npoints; i += 4) {
811       FLOAT32 *inp = ptr_x;
812 
813       DIG_REV(i, dig_rev_shift, h2);
814       if (not_power_4) {
815         h2 += 1;
816         h2 &= ~1;
817       }
818       inp += (h2);
819 
820       x0r = *inp;
821       x0i = *(inp + 1);
822       inp += (npoints >> 1);
823 
824       x1r = *inp;
825       x1i = *(inp + 1);
826       inp += (npoints >> 1);
827 
828       x2r = *inp;
829       x2i = *(inp + 1);
830       inp += (npoints >> 1);
831 
832       x3r = *inp;
833       x3i = *(inp + 1);
834 
835       x0r = x0r + x2r;
836       x0i = x0i + x2i;
837       x2r = x0r - (x2r * 2);
838       x2i = x0i - (x2i * 2);
839       x1r = x1r + x3r;
840       x1i = x1i + x3i;
841       x3r = x1r - (x3r * 2);
842       x3i = x1i - (x3i * 2);
843 
844       x0r = x0r + x1r;
845       x0i = x0i + x1i;
846       x1r = x0r - (x1r * 2);
847       x1i = x0i - (x1i * 2);
848       x2r = x2r - x3i;
849       x2i = x2i + x3r;
850       x3i = x2r + (x3i * 2);
851       x3r = x2i - (x3r * 2);
852 
853       *ptr_y++ = x0r;
854       *ptr_y++ = x0i;
855       *ptr_y++ = x2r;
856       *ptr_y++ = x2i;
857       *ptr_y++ = x1r;
858       *ptr_y++ = x1i;
859       *ptr_y++ = x3i;
860       *ptr_y++ = x3r;
861     }
862     ptr_y -= 2 * npoints;
863     del = 4;
864     nodespacing = 64;
865     in_loop_cnt = npoints >> 4;
866     for (i = n_stages - 1; i > 0; i--) {
867       const FLOAT64 *twiddles = ptr_w;
868       FLOAT32 *data = ptr_y;
869       FLOAT64 W1, W2, W3, W4, W5, W6;
870       WORD32 sec_loop_cnt;
871 
872       for (k = in_loop_cnt; k != 0; k--) {
873         x0r = (*data);
874         x0i = (*(data + 1));
875         data += (del << 1);
876 
877         x1r = (*data);
878         x1i = (*(data + 1));
879         data += (del << 1);
880 
881         x2r = (*data);
882         x2i = (*(data + 1));
883         data += (del << 1);
884 
885         x3r = (*data);
886         x3i = (*(data + 1));
887         data -= 3 * (del << 1);
888 
889         x0r = x0r + x2r;
890         x0i = x0i + x2i;
891         x2r = x0r - (x2r * 2);
892         x2i = x0i - (x2i * 2);
893         x1r = x1r + x3r;
894         x1i = x1i + x3i;
895         x3r = x1r - (x3r * 2);
896         x3i = x1i - (x3i * 2);
897 
898         x0r = x0r + x1r;
899         x0i = x0i + x1i;
900         x1r = x0r - (x1r * 2);
901         x1i = x0i - (x1i * 2);
902         x2r = x2r - x3i;
903         x2i = x2i + x3r;
904         x3i = x2r + (x3i * 2);
905         x3r = x2i - (x3r * 2);
906 
907         *data = x0r;
908         *(data + 1) = x0i;
909         data += (del << 1);
910 
911         *data = x2r;
912         *(data + 1) = x2i;
913         data += (del << 1);
914 
915         *data = x1r;
916         *(data + 1) = x1i;
917         data += (del << 1);
918 
919         *data = x3i;
920         *(data + 1) = x3r;
921         data += (del << 1);
922       }
923       data = ptr_y + 2;
924 
925       sec_loop_cnt = (nodespacing * del);
926       sec_loop_cnt = (sec_loop_cnt / 4) + (sec_loop_cnt / 8) - (sec_loop_cnt / 16) +
927                      (sec_loop_cnt / 32) - (sec_loop_cnt / 64) + (sec_loop_cnt / 128) -
928                      (sec_loop_cnt / 256);
929       j = nodespacing;
930 
931       for (j = nodespacing; j <= sec_loop_cnt; j += nodespacing) {
932         W1 = *(twiddles + j);
933         W4 = *(twiddles + j + 257);
934         W2 = *(twiddles + (j << 1));
935         W5 = *(twiddles + (j << 1) + 257);
936         W3 = *(twiddles + j + (j << 1));
937         W6 = *(twiddles + j + (j << 1) + 257);
938 
939         for (k = in_loop_cnt; k != 0; k--) {
940           FLOAT32 tmp;
941           FLOAT32 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
942 
943           data += (del << 1);
944 
945           x1r = *data;
946           x1i = *(data + 1);
947           data += (del << 1);
948 
949           x2r = *data;
950           x2i = *(data + 1);
951           data += (del << 1);
952 
953           x3r = *data;
954           x3i = *(data + 1);
955           data -= 3 * (del << 1);
956 
957           tmp = (FLOAT32)(((FLOAT64)x1r * W1) + ((FLOAT64)x1i * W4));
958           x1i = (FLOAT32)(-((FLOAT64)x1r * W4) + (FLOAT64)x1i * W1);
959           x1r = tmp;
960 
961           tmp = (FLOAT32)(((FLOAT64)x2r * W2) + ((FLOAT64)x2i * W5));
962           x2i = (FLOAT32)(-((FLOAT64)x2r * W5) + (FLOAT64)x2i * W2);
963           x2r = tmp;
964 
965           tmp = (FLOAT32)(((FLOAT64)x3r * W3) + ((FLOAT64)x3i * W6));
966           x3i = (FLOAT32)(-((FLOAT64)x3r * W6) + (FLOAT64)x3i * W3);
967           x3r = tmp;
968 
969           x0r = (*data);
970           x0i = (*(data + 1));
971 
972           x0r = x0r + (x2r);
973           x0i = x0i + (x2i);
974           x2r = x0r - (x2r * 2);
975           x2i = x0i - (x2i * 2);
976           x1r = x1r + x3r;
977           x1i = x1i + x3i;
978           x3r = x1r - (x3r * 2);
979           x3i = x1i - (x3i * 2);
980 
981           x0r = x0r + (x1r);
982           x0i = x0i + (x1i);
983           x1r = x0r - (x1r * 2);
984           x1i = x0i - (x1i * 2);
985           x2r = x2r - (x3i);
986           x2i = x2i + (x3r);
987           x3i = x2r + (x3i * 2);
988           x3r = x2i - (x3r * 2);
989 
990           *data = x0r;
991           *(data + 1) = x0i;
992           data += (del << 1);
993 
994           *data = x2r;
995           *(data + 1) = x2i;
996           data += (del << 1);
997 
998           *data = x1r;
999           *(data + 1) = x1i;
1000           data += (del << 1);
1001 
1002           *data = x3i;
1003           *(data + 1) = x3r;
1004           data += (del << 1);
1005         }
1006         data -= 2 * npoints;
1007         data += 2;
1008       }
1009       for (; j <= (nodespacing * del) >> 1; j += nodespacing) {
1010         W1 = *(twiddles + j);
1011         W4 = *(twiddles + j + 257);
1012         W2 = *(twiddles + (j << 1));
1013         W5 = *(twiddles + (j << 1) + 257);
1014         W3 = *(twiddles + j + (j << 1) - 256);
1015         W6 = *(twiddles + j + (j << 1) + 1);
1016 
1017         for (k = in_loop_cnt; k != 0; k--) {
1018           FLOAT32 tmp;
1019           FLOAT32 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
1020 
1021           data += (del << 1);
1022 
1023           x1r = *data;
1024           x1i = *(data + 1);
1025           data += (del << 1);
1026 
1027           x2r = *data;
1028           x2i = *(data + 1);
1029           data += (del << 1);
1030 
1031           x3r = *data;
1032           x3i = *(data + 1);
1033           data -= 3 * (del << 1);
1034 
1035           tmp = (FLOAT32)(((FLOAT64)x1r * W1) + ((FLOAT64)x1i * W4));
1036           x1i = (FLOAT32)(-((FLOAT64)x1r * W4) + (FLOAT64)x1i * W1);
1037           x1r = tmp;
1038 
1039           tmp = (FLOAT32)(((FLOAT64)x2r * W2) + ((FLOAT64)x2i * W5));
1040           x2i = (FLOAT32)(-((FLOAT64)x2r * W5) + (FLOAT64)x2i * W2);
1041           x2r = tmp;
1042 
1043           tmp = (FLOAT32)(((FLOAT64)x3r * W6) - ((FLOAT64)x3i * W3));
1044           x3i = (FLOAT32)(((FLOAT64)x3r * W3) + ((FLOAT64)x3i * W6));
1045           x3r = tmp;
1046 
1047           x0r = (*data);
1048           x0i = (*(data + 1));
1049 
1050           x0r = x0r + (x2r);
1051           x0i = x0i + (x2i);
1052           x2r = x0r - (x2r * 2);
1053           x2i = x0i - (x2i * 2);
1054           x1r = x1r + x3r;
1055           x1i = x1i + x3i;
1056           x3r = x1r - (x3r * 2);
1057           x3i = x1i - (x3i * 2);
1058 
1059           x0r = x0r + (x1r);
1060           x0i = x0i + (x1i);
1061           x1r = x0r - (x1r * 2);
1062           x1i = x0i - (x1i * 2);
1063           x2r = x2r - (x3i);
1064           x2i = x2i + (x3r);
1065           x3i = x2r + (x3i * 2);
1066           x3r = x2i - (x3r * 2);
1067 
1068           *data = x0r;
1069           *(data + 1) = x0i;
1070           data += (del << 1);
1071 
1072           *data = x2r;
1073           *(data + 1) = x2i;
1074           data += (del << 1);
1075 
1076           *data = x1r;
1077           *(data + 1) = x1i;
1078           data += (del << 1);
1079 
1080           *data = x3i;
1081           *(data + 1) = x3r;
1082           data += (del << 1);
1083         }
1084         data -= 2 * npoints;
1085         data += 2;
1086       }
1087       for (; j <= sec_loop_cnt * 2; j += nodespacing) {
1088         W1 = *(twiddles + j);
1089         W4 = *(twiddles + j + 257);
1090         W2 = *(twiddles + (j << 1) - 256);
1091         W5 = *(twiddles + (j << 1) + 1);
1092         W3 = *(twiddles + j + (j << 1) - 256);
1093         W6 = *(twiddles + j + (j << 1) + 1);
1094 
1095         for (k = in_loop_cnt; k != 0; k--) {
1096           FLOAT32 tmp;
1097           FLOAT32 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
1098 
1099           data += (del << 1);
1100 
1101           x1r = *data;
1102           x1i = *(data + 1);
1103           data += (del << 1);
1104 
1105           x2r = *data;
1106           x2i = *(data + 1);
1107           data += (del << 1);
1108 
1109           x3r = *data;
1110           x3i = *(data + 1);
1111           data -= 3 * (del << 1);
1112 
1113           tmp = (FLOAT32)(((FLOAT64)x1r * W1) + ((FLOAT64)x1i * W4));
1114           x1i = (FLOAT32)(-((FLOAT64)x1r * W4) + (FLOAT64)x1i * W1);
1115           x1r = tmp;
1116 
1117           tmp = (FLOAT32)(((FLOAT64)x2r * W5) - ((FLOAT64)x2i * W2));
1118           x2i = (FLOAT32)(((FLOAT64)x2r * W2) + ((FLOAT64)x2i * W5));
1119           x2r = tmp;
1120 
1121           tmp = (FLOAT32)(((FLOAT64)x3r * W6) - ((FLOAT64)x3i * W3));
1122           x3i = (FLOAT32)(((FLOAT64)x3r * W3) + ((FLOAT64)x3i * W6));
1123           x3r = tmp;
1124 
1125           x0r = (*data);
1126           x0i = (*(data + 1));
1127 
1128           x0r = x0r + (x2r);
1129           x0i = x0i + (x2i);
1130           x2r = x0r - (x2r * 2);
1131           x2i = x0i - (x2i * 2);
1132           x1r = x1r + x3r;
1133           x1i = x1i + x3i;
1134           x3r = x1r - (x3r * 2);
1135           x3i = x1i - (x3i * 2);
1136 
1137           x0r = x0r + (x1r);
1138           x0i = x0i + (x1i);
1139           x1r = x0r - (x1r * 2);
1140           x1i = x0i - (x1i * 2);
1141           x2r = x2r - (x3i);
1142           x2i = x2i + (x3r);
1143           x3i = x2r + (x3i * 2);
1144           x3r = x2i - (x3r * 2);
1145 
1146           *data = x0r;
1147           *(data + 1) = x0i;
1148           data += (del << 1);
1149 
1150           *data = x2r;
1151           *(data + 1) = x2i;
1152           data += (del << 1);
1153 
1154           *data = x1r;
1155           *(data + 1) = x1i;
1156           data += (del << 1);
1157 
1158           *data = x3i;
1159           *(data + 1) = x3r;
1160           data += (del << 1);
1161         }
1162         data -= 2 * npoints;
1163         data += 2;
1164       }
1165       for (; j < nodespacing * del; j += nodespacing) {
1166         W1 = *(twiddles + j);
1167         W4 = *(twiddles + j + 257);
1168         W2 = *(twiddles + (j << 1) - 256);
1169         W5 = *(twiddles + (j << 1) + 1);
1170         W3 = *(twiddles + j + (j << 1) - 512);
1171         W6 = *(twiddles + j + (j << 1) - 512 + 257);
1172 
1173         for (k = in_loop_cnt; k != 0; k--) {
1174           FLOAT32 tmp;
1175           FLOAT32 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
1176 
1177           data += (del << 1);
1178 
1179           x1r = *data;
1180           x1i = *(data + 1);
1181           data += (del << 1);
1182 
1183           x2r = *data;
1184           x2i = *(data + 1);
1185           data += (del << 1);
1186 
1187           x3r = *data;
1188           x3i = *(data + 1);
1189           data -= 3 * (del << 1);
1190 
1191           tmp = (FLOAT32)(((FLOAT64)x1r * W1) + ((FLOAT64)x1i * W4));
1192           x1i = (FLOAT32)(-((FLOAT64)x1r * W4) + (FLOAT64)x1i * W1);
1193           x1r = tmp;
1194 
1195           tmp = (FLOAT32)(((FLOAT64)x2r * W5) - ((FLOAT64)x2i * W2));
1196           x2i = (FLOAT32)(((FLOAT64)x2r * W2) + ((FLOAT64)x2i * W5));
1197           x2r = tmp;
1198 
1199           tmp = (FLOAT32)(-((FLOAT64)x3r * W3) - ((FLOAT64)x3i * W6));
1200           x3i = (FLOAT32)(-((FLOAT64)x3r * W6) + (FLOAT64)x3i * W3);
1201           x3r = tmp;
1202 
1203           x0r = (*data);
1204           x0i = (*(data + 1));
1205 
1206           x0r = x0r + (x2r);
1207           x0i = x0i + (x2i);
1208           x2r = x0r - (x2r * 2);
1209           x2i = x0i - (x2i * 2);
1210           x1r = x1r + x3r;
1211           x1i = x1i - x3i;
1212           x3r = x1r - (x3r * 2);
1213           x3i = x1i + (x3i * 2);
1214 
1215           x0r = x0r + (x1r);
1216           x0i = x0i + (x1i);
1217           x1r = x0r - (x1r * 2);
1218           x1i = x0i - (x1i * 2);
1219           x2r = x2r - (x3i);
1220           x2i = x2i + (x3r);
1221           x3i = x2r + (x3i * 2);
1222           x3r = x2i - (x3r * 2);
1223 
1224           *data = x0r;
1225           *(data + 1) = x0i;
1226           data += (del << 1);
1227 
1228           *data = x2r;
1229           *(data + 1) = x2i;
1230           data += (del << 1);
1231 
1232           *data = x1r;
1233           *(data + 1) = x1i;
1234           data += (del << 1);
1235 
1236           *data = x3i;
1237           *(data + 1) = x3r;
1238           data += (del << 1);
1239         }
1240         data -= 2 * npoints;
1241         data += 2;
1242       }
1243       nodespacing >>= 2;
1244       del <<= 2;
1245       in_loop_cnt >>= 2;
1246     }
1247 
1248     if (not_power_4) {
1249       const FLOAT64 *twiddles = ptr_w;
1250       nodespacing <<= 1;
1251 
1252       for (j = del / 2; j != 0; j--) {
1253         FLOAT64 W1 = *twiddles;
1254         FLOAT64 W4 = *(twiddles + 257);
1255         FLOAT32 tmp;
1256         twiddles += nodespacing;
1257 
1258         x0r = *ptr_y;
1259         x0i = *(ptr_y + 1);
1260         ptr_y += (del << 1);
1261 
1262         x1r = *ptr_y;
1263         x1i = *(ptr_y + 1);
1264 
1265         tmp = (FLOAT32)(((FLOAT64)x1r * W1) + ((FLOAT64)x1i * W4));
1266         x1i = (FLOAT32)(-((FLOAT64)x1r * W4) + (FLOAT64)x1i * W1);
1267         x1r = tmp;
1268 
1269         *ptr_y = (x0r) - (x1r);
1270         *(ptr_y + 1) = (x0i) - (x1i);
1271         ptr_y -= (del << 1);
1272 
1273         *ptr_y = (x0r) + (x1r);
1274         *(ptr_y + 1) = (x0i) + (x1i);
1275         ptr_y += 2;
1276       }
1277       twiddles = ptr_w;
1278       for (j = del / 2; j != 0; j--) {
1279         FLOAT64 W1 = *twiddles;
1280         FLOAT64 W4 = *(twiddles + 257);
1281         FLOAT32 tmp;
1282         twiddles += nodespacing;
1283 
1284         x0r = *ptr_y;
1285         x0i = *(ptr_y + 1);
1286         ptr_y += (del << 1);
1287 
1288         x1r = *ptr_y;
1289         x1i = *(ptr_y + 1);
1290 
1291         tmp = (FLOAT32)(((FLOAT64)x1r * W4) - ((FLOAT64)x1i * W1));
1292         x1i = (FLOAT32)(((FLOAT64)x1r * W1) + ((FLOAT64)x1i * W4));
1293         x1r = tmp;
1294 
1295         *ptr_y = (x0r) - (x1r);
1296         *(ptr_y + 1) = (x0i) - (x1i);
1297         ptr_y -= (del << 1);
1298 
1299         *ptr_y = (x0r) + (x1r);
1300         *(ptr_y + 1) = (x0i) + (x1i);
1301         ptr_y += 2;
1302       }
1303     }
1304   }
1305 
1306   for (i = 0; i < n_pass; i++) {
1307     re[2 * i + 0] = y[2 * i + 0];
1308     re[2 * i + 1] = y[2 * i + 1];
1309   }
1310 }
1311 
ixheaacd_hbe_apply_cfftn_gen(FLOAT32 re[],FLOAT32 * scratch,WORD32 n_pass,WORD32 i_sign)1312 VOID ixheaacd_hbe_apply_cfftn_gen(FLOAT32 re[], FLOAT32 *scratch, WORD32 n_pass,
1313                                   WORD32 i_sign) {
1314   WORD32 i, j;
1315   WORD32 m_points = n_pass;
1316   FLOAT32 *x, *y, *re3;
1317   FLOAT32 *ptr_x, *ptr_y;
1318   ptr_x = x = scratch;
1319   scratch += 2 * m_points;
1320   ptr_y = y = scratch;
1321   scratch += 4 * m_points;
1322   re3 = scratch;
1323   scratch += 2 * m_points;
1324   WORD32 cnfac;
1325   WORD32 mpass = n_pass;
1326 
1327   cnfac = 0;
1328   while (mpass % 3 == 0) {
1329     mpass /= 3;
1330     cnfac++;
1331   }
1332 
1333   for (i = 0; i < 3 * cnfac; i++) {
1334     for (j = 0; j < mpass; j++) {
1335       re3[2 * j + 0] = re[6 * j + 2 * i + 0];
1336       re3[2 * j + 1] = re[6 * j + 2 * i + 1];
1337     }
1338 
1339     ixheaacd_hbe_apply_cfftn(re3, scratch, mpass, i_sign);
1340 
1341     for (j = 0; j < mpass; j++) {
1342       re[6 * j + 2 * i + 0] = re3[2 * j + 0];
1343       re[6 * j + 2 * i + 1] = re3[2 * j + 1];
1344     }
1345   }
1346 
1347   {
1348     FLOAT64 *w1r, *w1i;
1349     FLOAT32 tmp;
1350     w1r = (FLOAT64 *)ixheaacd_twid_tbl_fft_ntwt3r;
1351     w1i = (FLOAT64 *)ixheaacd_twid_tbl_fft_ntwt3i;
1352 
1353     if (i_sign < 0) {
1354 
1355       for (i = 0; i < n_pass; i += 3) {
1356         tmp = (FLOAT32)((FLOAT64)re[2 * i + 0] * (*w1r) - (FLOAT64)re[2 * i + 1] * (*w1i));
1357         re[2 * i + 1] =
1358             (FLOAT32)((FLOAT64)re[2 * i + 0] * (*w1i) + (FLOAT64)re[2 * i + 1] * (*w1r));
1359         re[2 * i + 0] = tmp;
1360 
1361         w1r++;
1362         w1i++;
1363 
1364         tmp = (FLOAT32)((FLOAT64)re[2 * i + 2] * (*w1r) - (FLOAT64)re[2 * i + 3] * (*w1i));
1365         re[2 * i + 3] =
1366             (FLOAT32)((FLOAT64)re[2 * i + 2] * (*w1i) + (FLOAT64)re[2 * i + 3] * (*w1r));
1367         re[2 * i + 2] = tmp;
1368 
1369         w1r++;
1370         w1i++;
1371 
1372         tmp = (FLOAT32)((FLOAT64)re[2 * i + 4] * (*w1r) - (FLOAT64)re[2 * i + 5] * (*w1i));
1373         re[2 * i + 5] =
1374             (FLOAT32)((FLOAT64)re[2 * i + 4] * (*w1i) + (FLOAT64)re[2 * i + 5] * (*w1r));
1375         re[2 * i + 4] = tmp;
1376 
1377         w1r += 3 * (128 / mpass - 1) + 1;
1378         w1i += 3 * (128 / mpass - 1) + 1;
1379       }
1380     } else {
1381       for (i = 0; i < n_pass; i += 3) {
1382         tmp = (FLOAT32)((FLOAT64)re[2 * i + 0] * (*w1r) + (FLOAT64)re[2 * i + 1] * (*w1i));
1383         re[2 * i + 1] =
1384             (FLOAT32)(-(FLOAT64)re[2 * i + 0] * (*w1i) + (FLOAT64)re[2 * i + 1] * (*w1r));
1385         re[2 * i + 0] = tmp;
1386 
1387         w1r++;
1388         w1i++;
1389 
1390         tmp = (FLOAT32)((FLOAT64)re[2 * i + 2] * (*w1r) + (FLOAT64)re[2 * i + 3] * (*w1i));
1391         re[2 * i + 3] =
1392             (FLOAT32)(-(FLOAT64)re[2 * i + 2] * (*w1i) + (FLOAT64)re[2 * i + 3] * (*w1r));
1393         re[2 * i + 2] = tmp;
1394 
1395         w1r++;
1396         w1i++;
1397 
1398         tmp = (FLOAT32)((FLOAT64)re[2 * i + 4] * (*w1r) + (FLOAT64)re[2 * i + 5] * (*w1i));
1399         re[2 * i + 5] =
1400             (FLOAT32)(-(FLOAT64)re[2 * i + 4] * (*w1i) + (FLOAT64)re[2 * i + 5] * (*w1r));
1401         re[2 * i + 4] = tmp;
1402 
1403         w1r += 3 * (128 / mpass - 1) + 1;
1404         w1i += 3 * (128 / mpass - 1) + 1;
1405       }
1406     }
1407   }
1408 
1409   for (i = 0; i < n_pass; i++) {
1410     ptr_x[2 * i + 0] = re[2 * i + 0];
1411     ptr_x[2 * i + 1] = re[2 * i + 1];
1412   }
1413   for (i = 0; i < mpass; i++) {
1414     ixheaacd_hbe_apply_fft_3(ptr_x, ptr_y, i_sign);
1415 
1416     ptr_x = ptr_x + 6;
1417     ptr_y = ptr_y + 6;
1418   }
1419 
1420   for (i = 0; i < mpass; i++) {
1421     re[2 * i + 0] = y[6 * i + 0];
1422     re[2 * i + 1] = y[6 * i + 1];
1423   }
1424 
1425   for (i = 0; i < mpass; i++) {
1426     re[2 * mpass + 2 * i + 0] = y[6 * i + 2];
1427     re[2 * mpass + 2 * i + 1] = y[6 * i + 3];
1428   }
1429 
1430   for (i = 0; i < mpass; i++) {
1431     re[4 * mpass + 2 * i + 0] = y[6 * i + 4];
1432     re[4 * mpass + 2 * i + 1] = y[6 * i + 5];
1433   }
1434 }
1435 
ixheaacd_hbe_apply_fft_288(FLOAT32 * inp,FLOAT32 * scratch,WORD32 len,WORD32 i_sign)1436 VOID ixheaacd_hbe_apply_fft_288(FLOAT32 *inp, FLOAT32 *scratch, WORD32 len, WORD32 i_sign) {
1437   FLOAT32 *op = scratch;
1438   WORD32 mpoints = len / 96;
1439   WORD32 fpoints = len / 3;
1440   WORD32 ii, jj;
1441   scratch += 2 * len;
1442 
1443   for (ii = 0; ii < mpoints; ii++) {
1444     for (jj = 0; jj < fpoints; jj++) {
1445       op[2 * jj + 0] = inp[2 * mpoints * jj + 2 * ii];
1446       op[2 * jj + 1] = inp[2 * mpoints * jj + 2 * ii + 1];
1447     }
1448 
1449     if (fpoints & (fpoints - 1))
1450       ixheaacd_hbe_apply_cfftn_gen(op, scratch, fpoints, i_sign);
1451     else
1452       ixheaacd_hbe_apply_cfftn(op, scratch, fpoints, i_sign);
1453 
1454     for (jj = 0; jj < fpoints; jj++) {
1455       inp[mpoints * 2 * jj + 2 * ii + 0] = op[2 * jj + 0];
1456       inp[mpoints * 2 * jj + 2 * ii + 1] = op[2 * jj + 1];
1457     }
1458   }
1459 
1460   ixheaacd_hbe_apply_tw_mult_fft(inp, op, fpoints, mpoints, ixheaacd_twid_tbl_fft_288);
1461 
1462   for (ii = 0; ii < fpoints; ii++) {
1463     ixheaacd_hbe_apply_fft_3(op, scratch, i_sign);
1464     op = op + (mpoints * 2);
1465     scratch = scratch + (mpoints * 2);
1466   }
1467 
1468   scratch -= fpoints * mpoints * 2;
1469 
1470   for (jj = 0; jj < fpoints; jj++) {
1471     inp[2 * jj + 0] = scratch[6 * jj];
1472     inp[2 * jj + 1] = scratch[6 * jj + 1];
1473   }
1474   for (jj = 0; jj < fpoints; jj++) {
1475     inp[2 * fpoints + 2 * jj + 0] = scratch[6 * jj + 2];
1476     inp[2 * fpoints + 2 * jj + 1] = scratch[6 * jj + 3];
1477   }
1478   for (jj = 0; jj < fpoints; jj++) {
1479     inp[4 * fpoints + 2 * jj + 0] = scratch[6 * jj + 4];
1480     inp[4 * fpoints + 2 * jj + 1] = scratch[6 * jj + 5];
1481   }
1482 }
1483 
ixheaacd_hbe_apply_ifft_224(FLOAT32 * inp,FLOAT32 * scratch,WORD32 len,WORD32 i_sign)1484 VOID ixheaacd_hbe_apply_ifft_224(FLOAT32 *inp, FLOAT32 *scratch, WORD32 len, WORD32 i_sign) {
1485   WORD32 mpoints = len / 32;
1486   WORD32 fpoints = len / 7;
1487   WORD32 ii, jj;
1488   FLOAT32 *op = scratch;
1489   scratch += 2 * len;
1490 
1491   for (ii = 0; ii < mpoints; ii++) {
1492     for (jj = 0; jj < fpoints; jj++) {
1493       op[2 * jj + 0] = inp[2 * mpoints * jj + 2 * ii];
1494       op[2 * jj + 1] = inp[2 * mpoints * jj + 2 * ii + 1];
1495     }
1496 
1497     if (fpoints & (fpoints - 1))
1498       ixheaacd_hbe_apply_cfftn_gen(op, scratch, fpoints, i_sign);
1499     else
1500       ixheaacd_hbe_apply_cfftn(op, scratch, fpoints, i_sign);
1501 
1502     for (jj = 0; jj < fpoints; jj++) {
1503       inp[mpoints * 2 * jj + 2 * ii + 0] = op[2 * jj + 0];
1504       inp[mpoints * 2 * jj + 2 * ii + 1] = op[2 * jj + 1];
1505     }
1506   }
1507 
1508   ixheaacd_hbe_apply_tw_mult_ifft(inp, op, fpoints, mpoints, ixheaacd_twid_tbl_fft_224);
1509 
1510   for (ii = 0; ii < fpoints; ii++) {
1511     ixheaacd_hbe_apply_ifft_7(op, scratch);
1512     scratch += (mpoints * 2);
1513     op += (mpoints * 2);
1514   }
1515 
1516   scratch -= fpoints * mpoints * 2;
1517 
1518   for (jj = 0; jj < fpoints; jj++) {
1519     for (ii = 0; ii < mpoints; ii++) {
1520       inp[fpoints * ii * 2 + 2 * jj + 0] = scratch[mpoints * jj * 2 + 2 * ii + 0];
1521       inp[fpoints * ii * 2 + 2 * jj + 1] = scratch[mpoints * jj * 2 + 2 * ii + 1];
1522     }
1523   }
1524 }
1525 
ixheaacd_hbe_apply_ifft_336(FLOAT32 * inp,FLOAT32 * ptr_scratch,WORD32 len,WORD32 i_sign)1526 VOID ixheaacd_hbe_apply_ifft_336(FLOAT32 *inp, FLOAT32 *ptr_scratch, WORD32 len,
1527                                  WORD32 i_sign) {
1528   WORD32 i, j;
1529   WORD32 m_points = len / 7;
1530   WORD32 n_points = len / 48;
1531   FLOAT32 *ptr_real, *ptr_imag, *p_real_1, *p_scratch;
1532   ptr_real = ptr_scratch;
1533   ptr_scratch += 2 * len;
1534   ptr_imag = ptr_scratch;
1535   ptr_scratch += len;
1536   p_scratch = ptr_scratch;
1537   ptr_scratch += len;
1538   p_real_1 = ptr_scratch;
1539   ptr_scratch += len;
1540 
1541   for (i = 0; i < len; i++) {
1542     ptr_real[i] = inp[2 * i + 0];
1543     ptr_imag[i] = inp[2 * i + 1];
1544   }
1545 
1546   for (i = 0; i < m_points; i++) {
1547     for (j = 0; j < n_points; j++) {
1548       p_real_1[2 * j + 0] = inp[m_points * 2 * j + 2 * i + 0];
1549       p_real_1[2 * j + 1] = inp[m_points * 2 * j + 2 * i + 1];
1550     }
1551 
1552     ixheaacd_hbe_apply_ifft_7(p_real_1, ptr_scratch);
1553 
1554     for (j = 0; j < n_points; j++) {
1555       inp[m_points * 2 * j + 2 * i + 0] = ptr_scratch[2 * j + 0];
1556       inp[m_points * 2 * j + 2 * i + 1] = ptr_scratch[2 * j + 1];
1557     }
1558   }
1559 
1560   if (m_points == 48)
1561     ixheaacd_hbe_apply_tw_mult_ifft(inp, p_scratch, n_points, m_points,
1562                                     ixheaacd_twid_tbl_fft_336);
1563   else
1564     ixheaacd_hbe_apply_tw_mult_ifft(inp, p_scratch, n_points, m_points,
1565                                     ixheaacd_twid_tbl_fft_168);
1566 
1567   for (i = 0; i < len; i++) {
1568     ptr_real[2 * i + 0] = p_scratch[2 * i + 0];
1569     ptr_real[2 * i + 1] = p_scratch[2 * i + 1];
1570   }
1571 
1572   for (i = 0; i < n_points; i++) {
1573     ixheaacd_hbe_apply_cfftn_gen(ptr_real, ptr_scratch, m_points, i_sign);
1574     ptr_real += (2 * m_points);
1575   }
1576 
1577   ptr_real -= n_points * 2 * m_points;
1578 
1579   for (j = 0; j < n_points; j++) {
1580     for (i = 0; i < m_points; i++) {
1581       inp[n_points * 2 * i + 2 * j + 0] = ptr_real[2 * m_points * j + 2 * i + 0];
1582       inp[n_points * 2 * i + 2 * j + 1] = ptr_real[2 * m_points * j + 2 * i + 1];
1583     }
1584   }
1585   return;
1586 }
1587 
1588