/******************************************************************************
 *                                                                            *
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/
#include <stdlib.h>
#include <stdio.h>

#include <ixheaacd_type_def.h>
#include "ixheaacd_interface.h"
#include "ixheaacd_constants.h"
#include <ixheaacd_basic_ops32.h>
#include "ixheaacd_function_selector.h"

/* Twiddle table read by the radix-4 FFT stages in this file. Entries are
 * fetched in pairs at even/odd offsets (2 * j and 2 * j + 1). */
extern const WORD32 ixheaacd_twiddle_table_fft_32x32[514];
/* NOTE(review): presumably the real / imaginary twiddles of a radix-3 stage;
 * they are not referenced in the part of the file reviewed here - confirm. */
extern const WORD32 ixheaacd_twiddle_table_3pr[1155];
extern const WORD32 ixheaacd_twiddle_table_3pi[1155];
/* Table of input offsets indexed by (i >> 2) in
 * ixheaacd_mps_complex_fft_64_dec. */
extern const WORD8 ixheaacd_mps_dig_rev[16];

/* Inline qualifier used on the static arithmetic helpers of this file. */
#define PLATFORM_INLINE __inline
35 
/* DIG_REV(i, m, j): reverses the order of the 2-bit digits inside each 16-bit
 * half of the unsigned value of (i), shifts the result right by (m) and
 * stores it in the lvalue (j).
 *
 * The reversal is done in three exchange steps: neighbouring 2-bit digits,
 * then neighbouring nibbles, then the two bytes of each 16-bit half.
 * (i) is evaluated once; (m) and (j) are evaluated once each. */
#define DIG_REV(i, m, j)                                          \
  do {                                                            \
    unsigned dig_rev_acc_ = (i);                                  \
    dig_rev_acc_ = ((dig_rev_acc_ << 2) & 0xCCCCCCCCu) |          \
                   ((dig_rev_acc_ >> 2) & 0x33333333u);           \
    dig_rev_acc_ = ((dig_rev_acc_ << 4) & 0xF0F0F0F0u) |          \
                   ((dig_rev_acc_ >> 4) & 0x0F0F0F0Fu);           \
    dig_rev_acc_ = ((dig_rev_acc_ << 8) & 0xFF00FF00u) |          \
                   ((dig_rev_acc_ >> 8) & 0x00FF00FFu);           \
    (j) = dig_rev_acc_ >> (m);                                    \
  } while (0)
44 
ixheaacd_mult32(WORD32 a,WORD32 b)45 static PLATFORM_INLINE WORD32 ixheaacd_mult32(WORD32 a, WORD32 b) {
46   WORD32 result;
47   WORD64 temp_result;
48 
49   temp_result = (WORD64)a * (WORD64)b;
50   result = (WORD32)(temp_result >> 31);
51 
52   return (result);
53 }
54 
ixheaacd_mac32(WORD32 a,WORD32 b,WORD32 c)55 static PLATFORM_INLINE WORD32 ixheaacd_mac32(WORD32 a, WORD32 b, WORD32 c) {
56   WORD32 result;
57 
58   result = a + ixheaacd_mult32(b, c);
59 
60   return (result);
61 }
62 
ixheaacd_mult32_shl(WORD32 a,WORD32 b)63 static PLATFORM_INLINE WORD32 ixheaacd_mult32_shl(WORD32 a, WORD32 b) {
64   WORD32 result;
65   WORD64 temp_result;
66 
67   temp_result = (WORD64)a * (WORD64)b;
68   result = (WORD32)(temp_result >> 32);
69 
70   return (result << 1);
71 }
72 
ixheaacd_mps_complex_fft_64_dec(WORD32 * ptr_x,WORD32 * fin_re,WORD32 * fin_im,WORD32 nlength)73 VOID ixheaacd_mps_complex_fft_64_dec(WORD32 *ptr_x, WORD32 *fin_re,
74                                      WORD32 *fin_im, WORD32 nlength) {
75   WORD32 i, j, k, n_stages;
76   WORD32 h2, x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
77   WORD32 del, nodespacing, in_loop_cnt;
78   WORD32 y[128];
79   WORD32 npoints = nlength;
80   WORD32 *ptr_y = y;
81   const WORD32 *ptr_w;
82   n_stages = 30 - ixheaacd_norm32(npoints);
83 
84   n_stages = n_stages >> 1;
85 
86   ptr_w = ixheaacd_twiddle_table_fft_32x32;
87 
88   for (i = 0; i < npoints; i += 4) {
89     WORD32 *inp = ptr_x;
90     h2 = ixheaacd_mps_dig_rev[i >> 2];
91     inp += (h2);
92 
93     x0r = *inp;
94     x0i = *(inp + 1);
95     inp += (npoints >> 1);
96 
97     x1r = *inp;
98     x1i = *(inp + 1);
99     inp += (npoints >> 1);
100 
101     x2r = *inp;
102     x2i = *(inp + 1);
103     inp += (npoints >> 1);
104 
105     x3r = *inp;
106     x3i = *(inp + 1);
107 
108     x0r = x0r + x2r;
109     x0i = x0i + x2i;
110     x2r = x0r - (x2r << 1);
111     x2i = x0i - (x2i << 1);
112     x1r = x1r + x3r;
113     x1i = x1i + x3i;
114     x3r = x1r - (x3r << 1);
115     x3i = x1i - (x3i << 1);
116 
117     x0r = x0r + x1r;
118     x0i = x0i + x1i;
119     x1r = x0r - (x1r << 1);
120     x1i = x0i - (x1i << 1);
121     x2r = x2r + x3i;
122     x2i = x2i - x3r;
123     x3i = x2r - (x3i << 1);
124     x3r = x2i + (x3r << 1);
125 
126     *ptr_y++ = x0r;
127     *ptr_y++ = x0i;
128     *ptr_y++ = x2r;
129     *ptr_y++ = x2i;
130     *ptr_y++ = x1r;
131     *ptr_y++ = x1i;
132     *ptr_y++ = x3i;
133     *ptr_y++ = x3r;
134   }
135   ptr_y -= 2 * npoints;
136   del = 4;
137   nodespacing = 64;
138   in_loop_cnt = npoints >> 4;
139   for (i = n_stages - 1; i > 0; i--) {
140     const WORD32 *twiddles = ptr_w;
141     WORD32 *data = ptr_y;
142     WORD32 w1h, w2h, w3h, w1l, w2l, w3l;
143     WORD32 sec_loop_cnt;
144 
145     for (k = in_loop_cnt; k != 0; k--) {
146       x0r = (*data);
147       x0i = (*(data + 1));
148       data += (del << 1);
149 
150       x1r = (*data);
151       x1i = (*(data + 1));
152       data += (del << 1);
153 
154       x2r = (*data);
155       x2i = (*(data + 1));
156       data += (del << 1);
157 
158       x3r = (*data);
159       x3i = (*(data + 1));
160       data -= 3 * (del << 1);
161 
162       x0r = x0r + x2r;
163       x0i = x0i + x2i;
164       x2r = x0r - (x2r << 1);
165       x2i = x0i - (x2i << 1);
166       x1r = x1r + x3r;
167       x1i = x1i + x3i;
168       x3r = x1r - (x3r << 1);
169       x3i = x1i - (x3i << 1);
170 
171       x0r = x0r + x1r;
172       x0i = x0i + x1i;
173       x1r = x0r - (x1r << 1);
174       x1i = x0i - (x1i << 1);
175       x2r = x2r + x3i;
176       x2i = x2i - x3r;
177       x3i = x2r - (x3i << 1);
178       x3r = x2i + (x3r << 1);
179 
180       *data = x0r;
181       *(data + 1) = x0i;
182       data += (del << 1);
183 
184       *data = x2r;
185       *(data + 1) = x2i;
186       data += (del << 1);
187 
188       *data = x1r;
189       *(data + 1) = x1i;
190       data += (del << 1);
191 
192       *data = x3i;
193       *(data + 1) = x3r;
194       data += (del << 1);
195     }
196     data = ptr_y + 2;
197 
198     sec_loop_cnt = (nodespacing * del);
199     sec_loop_cnt = (sec_loop_cnt / 4) + (sec_loop_cnt / 8) -
200                    (sec_loop_cnt / 16) + (sec_loop_cnt / 32) -
201                    (sec_loop_cnt / 64) + (sec_loop_cnt / 128) -
202                    (sec_loop_cnt / 256);
203     j = nodespacing;
204 
205     for (j = nodespacing; j <= sec_loop_cnt; j += nodespacing) {
206       w1h = *(twiddles + 2 * j);
207       w1l = *(twiddles + 2 * j + 1);
208       w2h = *(twiddles + 2 * (j << 1));
209       w2l = *(twiddles + 2 * (j << 1) + 1);
210       w3h = *(twiddles + 2 * j + 2 * (j << 1));
211       w3l = *(twiddles + 2 * j + 2 * (j << 1) + 1);
212 
213       for (k = in_loop_cnt; k != 0; k--) {
214         WORD32 tmp;
215         WORD32 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
216 
217         data += (del << 1);
218 
219         x1r = *data;
220         x1i = *(data + 1);
221         data += (del << 1);
222 
223         x2r = *data;
224         x2i = *(data + 1);
225         data += (del << 1);
226 
227         x3r = *data;
228         x3i = *(data + 1);
229         data -= 3 * (del << 1);
230 
231         tmp = (ixheaacd_mult32(x1r, w1l) - ixheaacd_mult32(x1i, w1h));
232         x1i = ixheaacd_mac32(ixheaacd_mult32(x1r, w1h), x1i, w1l);
233         x1r = tmp;
234 
235         tmp = (ixheaacd_mult32(x2r, w2l) - ixheaacd_mult32(x2i, w2h));
236         x2i = ixheaacd_mac32(ixheaacd_mult32(x2r, w2h), x2i, w2l);
237         x2r = tmp;
238 
239         tmp = (ixheaacd_mult32(x3r, w3l) - ixheaacd_mult32(x3i, w3h));
240         x3i = ixheaacd_mac32(ixheaacd_mult32(x3r, w3h), x3i, w3l);
241         x3r = tmp;
242 
243         x0r = (*data);
244         x0i = (*(data + 1));
245 
246         x0r = x0r + (x2r);
247         x0i = x0i + (x2i);
248         x2r = x0r - (x2r << 1);
249         x2i = x0i - (x2i << 1);
250         x1r = x1r + x3r;
251         x1i = x1i + x3i;
252         x3r = x1r - (x3r << 1);
253         x3i = x1i - (x3i << 1);
254 
255         x0r = x0r + (x1r);
256         x0i = x0i + (x1i);
257         x1r = x0r - (x1r << 1);
258         x1i = x0i - (x1i << 1);
259         x2r = x2r + (x3i);
260         x2i = x2i - (x3r);
261         x3i = x2r - (x3i << 1);
262         x3r = x2i + (x3r << 1);
263 
264         *data = x0r;
265         *(data + 1) = x0i;
266         data += (del << 1);
267 
268         *data = x2r;
269         *(data + 1) = x2i;
270         data += (del << 1);
271 
272         *data = x1r;
273         *(data + 1) = x1i;
274         data += (del << 1);
275 
276         *data = x3i;
277         *(data + 1) = x3r;
278         data += (del << 1);
279       }
280       data -= 2 * npoints;
281       data += 2;
282     }
283     for (; j <= (nodespacing * del) >> 1; j += nodespacing) {
284       w1h = *(twiddles + 2 * j);
285       w2h = *(twiddles + 2 * (j << 1));
286       w3h = *(twiddles + 2 * j + 2 * (j << 1) - 512);
287       w1l = *(twiddles + 2 * j + 1);
288       w2l = *(twiddles + 2 * (j << 1) + 1);
289       w3l = *(twiddles + 2 * j + 2 * (j << 1) - 511);
290 
291       for (k = in_loop_cnt; k != 0; k--) {
292         WORD32 tmp;
293         WORD32 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
294 
295         data += (del << 1);
296 
297         x1r = *data;
298         x1i = *(data + 1);
299         data += (del << 1);
300 
301         x2r = *data;
302         x2i = *(data + 1);
303         data += (del << 1);
304 
305         x3r = *data;
306         x3i = *(data + 1);
307         data -= 3 * (del << 1);
308 
309         tmp = (ixheaacd_mult32(x1r, w1l) - ixheaacd_mult32(x1i, w1h));
310         x1i = ixheaacd_mac32(ixheaacd_mult32(x1r, w1h), x1i, w1l);
311         x1r = tmp;
312 
313         tmp = (ixheaacd_mult32(x2r, w2l) - ixheaacd_mult32(x2i, w2h));
314         x2i = ixheaacd_mac32(ixheaacd_mult32(x2r, w2h), x2i, w2l);
315         x2r = tmp;
316 
317         tmp = (ixheaacd_mult32(x3r, w3h) + ixheaacd_mult32(x3i, w3l));
318         x3i = -ixheaacd_mult32(x3r, w3l) + ixheaacd_mult32(x3i, w3h);
319         x3r = tmp;
320 
321         x0r = (*data);
322         x0i = (*(data + 1));
323 
324         x0r = x0r + (x2r);
325         x0i = x0i + (x2i);
326         x2r = x0r - (x2r << 1);
327         x2i = x0i - (x2i << 1);
328         x1r = x1r + x3r;
329         x1i = x1i + x3i;
330         x3r = x1r - (x3r << 1);
331         x3i = x1i - (x3i << 1);
332 
333         x0r = x0r + (x1r);
334         x0i = x0i + (x1i);
335         x1r = x0r - (x1r << 1);
336         x1i = x0i - (x1i << 1);
337         x2r = x2r + (x3i);
338         x2i = x2i - (x3r);
339         x3i = x2r - (x3i << 1);
340         x3r = x2i + (x3r << 1);
341 
342         *data = x0r;
343         *(data + 1) = x0i;
344         data += (del << 1);
345 
346         *data = x2r;
347         *(data + 1) = x2i;
348         data += (del << 1);
349 
350         *data = x1r;
351         *(data + 1) = x1i;
352         data += (del << 1);
353 
354         *data = x3i;
355         *(data + 1) = x3r;
356         data += (del << 1);
357       }
358       data -= 2 * npoints;
359       data += 2;
360     }
361     for (; j <= sec_loop_cnt * 2; j += nodespacing) {
362       w1h = *(twiddles + 2 * j);
363       w2h = *(twiddles + 2 * (j << 1) - 512);
364       w3h = *(twiddles + 2 * j + 2 * (j << 1) - 512);
365       w1l = *(twiddles + 2 * j + 1);
366       w2l = *(twiddles + 2 * (j << 1) - 511);
367       w3l = *(twiddles + 2 * j + 2 * (j << 1) - 511);
368 
369       for (k = in_loop_cnt; k != 0; k--) {
370         WORD32 tmp;
371         WORD32 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
372 
373         data += (del << 1);
374 
375         x1r = *data;
376         x1i = *(data + 1);
377         data += (del << 1);
378 
379         x2r = *data;
380         x2i = *(data + 1);
381         data += (del << 1);
382 
383         x3r = *data;
384         x3i = *(data + 1);
385         data -= 3 * (del << 1);
386 
387         tmp = (ixheaacd_mult32(x1r, w1l) - ixheaacd_mult32(x1i, w1h));
388         x1i = ixheaacd_mac32(ixheaacd_mult32(x1r, w1h), x1i, w1l);
389         x1r = tmp;
390 
391         tmp = (ixheaacd_mult32(x2r, w2h) + ixheaacd_mult32(x2i, w2l));
392         x2i = -ixheaacd_mult32(x2r, w2l) + ixheaacd_mult32(x2i, w2h);
393         x2r = tmp;
394 
395         tmp = (ixheaacd_mult32(x3r, w3h) + ixheaacd_mult32(x3i, w3l));
396         x3i = -ixheaacd_mult32(x3r, w3l) + ixheaacd_mult32(x3i, w3h);
397         x3r = tmp;
398 
399         x0r = (*data);
400         x0i = (*(data + 1));
401 
402         x0r = x0r + (x2r);
403         x0i = x0i + (x2i);
404         x2r = x0r - (x2r << 1);
405         x2i = x0i - (x2i << 1);
406         x1r = x1r + x3r;
407         x1i = x1i + x3i;
408         x3r = x1r - (x3r << 1);
409         x3i = x1i - (x3i << 1);
410 
411         x0r = x0r + (x1r);
412         x0i = x0i + (x1i);
413         x1r = x0r - (x1r << 1);
414         x1i = x0i - (x1i << 1);
415         x2r = x2r + (x3i);
416         x2i = x2i - (x3r);
417         x3i = x2r - (x3i << 1);
418         x3r = x2i + (x3r << 1);
419 
420         *data = x0r;
421         *(data + 1) = x0i;
422         data += (del << 1);
423 
424         *data = x2r;
425         *(data + 1) = x2i;
426         data += (del << 1);
427 
428         *data = x1r;
429         *(data + 1) = x1i;
430         data += (del << 1);
431 
432         *data = x3i;
433         *(data + 1) = x3r;
434         data += (del << 1);
435       }
436       data -= 2 * npoints;
437       data += 2;
438     }
439     for (; j < nodespacing * del; j += nodespacing) {
440       w1h = *(twiddles + 2 * j);
441       w2h = *(twiddles + 2 * (j << 1) - 512);
442       w3h = *(twiddles + 2 * j + 2 * (j << 1) - 1024);
443       w1l = *(twiddles + 2 * j + 1);
444       w2l = *(twiddles + 2 * (j << 1) - 511);
445       w3l = *(twiddles + 2 * j + 2 * (j << 1) - 1023);
446 
447       for (k = in_loop_cnt; k != 0; k--) {
448         WORD32 tmp;
449         WORD32 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
450 
451         data += (del << 1);
452 
453         x1r = *data;
454         x1i = *(data + 1);
455         data += (del << 1);
456 
457         x2r = *data;
458         x2i = *(data + 1);
459         data += (del << 1);
460 
461         x3r = *data;
462         x3i = *(data + 1);
463         data -= 3 * (del << 1);
464 
465         tmp = (ixheaacd_mult32(x1r, w1l) - ixheaacd_mult32(x1i, w1h));
466         x1i = ixheaacd_mac32(ixheaacd_mult32(x1r, w1h), x1i, w1l);
467         x1r = tmp;
468 
469         tmp = (ixheaacd_mult32(x2r, w2h) + ixheaacd_mult32(x2i, w2l));
470         x2i = -ixheaacd_mult32(x2r, w2l) + ixheaacd_mult32(x2i, w2h);
471         x2r = tmp;
472 
473         tmp = (-ixheaacd_mult32(x3r, w3l) + ixheaacd_mult32(x3i, w3h));
474         x3i = ixheaacd_mac32(ixheaacd_mult32(x3r, w3h), x3i, w3l);
475         x3r = tmp;
476 
477         x0r = (*data);
478         x0i = (*(data + 1));
479 
480         x0r = x0r + (x2r);
481         x0i = x0i + (x2i);
482         x2r = x0r - (x2r << 1);
483         x2i = x0i - (x2i << 1);
484         x1r = x1r + x3r;
485         x1i = x1i - x3i;
486         x3r = x1r - (x3r << 1);
487         x3i = x1i + (x3i << 1);
488 
489         x0r = x0r + (x1r);
490         x0i = x0i + (x1i);
491         x1r = x0r - (x1r << 1);
492         x1i = x0i - (x1i << 1);
493         x2r = x2r + (x3i);
494         x2i = x2i - (x3r);
495         x3i = x2r - (x3i << 1);
496         x3r = x2i + (x3r << 1);
497 
498         *data = x0r;
499         *(data + 1) = x0i;
500         data += (del << 1);
501 
502         *data = x2r;
503         *(data + 1) = x2i;
504         data += (del << 1);
505 
506         *data = x1r;
507         *(data + 1) = x1i;
508         data += (del << 1);
509 
510         *data = x3i;
511         *(data + 1) = x3r;
512         data += (del << 1);
513       }
514       data -= 2 * npoints;
515       data += 2;
516     }
517     nodespacing >>= 2;
518     del <<= 2;
519     in_loop_cnt >>= 2;
520   }
521 
522   for (i = 0; i < 2 * nlength; i += 2) {
523     fin_re[i] = y[i];
524     fin_im[i] = y[i + 1];
525   }
526 
527   return;
528 }
529 
ixheaacd_complex_fft_p2_dec(WORD32 * xr,WORD32 * xi,WORD32 nlength,WORD32 fft_mode,WORD32 * preshift)530 VOID ixheaacd_complex_fft_p2_dec(WORD32 *xr, WORD32 *xi, WORD32 nlength,
531                                  WORD32 fft_mode, WORD32 *preshift) {
532   WORD32 i, j, k, n_stages;
533   WORD32 h2, x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
534   WORD32 del, nodespacing, in_loop_cnt;
535   WORD32 not_power_4;
536   WORD32 npts, shift;
537   WORD32 dig_rev_shift;
538   WORD32 ptr_x[1024];
539   WORD32 y[1024];
540   WORD32 npoints = nlength;
541   WORD32 n = 0;
542   WORD32 *ptr_y = y;
543   const WORD32 *ptr_w;
544   dig_rev_shift = ixheaacd_norm32(npoints) + 1 - 16;
545   n_stages = 30 - ixheaacd_norm32(npoints);
546   not_power_4 = n_stages & 1;
547 
548   n_stages = n_stages >> 1;
549 
550   npts = npoints;
551   while (npts >> 1) {
552     n++;
553     npts = npts >> 1;
554   }
555 
556   if (n % 2 == 0)
557     shift = ((n + 4)) / 2;
558   else
559     shift = ((n + 3) / 2);
560 
561   for (i = 0; i < nlength; i++) {
562     ptr_x[2 * i] = (xr[i] / (1 << (shift)));
563     ptr_x[2 * i + 1] = (xi[i] / (1 << (shift)));
564   }
565 
566   if (fft_mode == -1) {
567     ptr_w = ixheaacd_twiddle_table_fft_32x32;
568 
569     for (i = 0; i < npoints; i += 4) {
570       WORD32 *inp = ptr_x;
571 
572       DIG_REV(i, dig_rev_shift, h2);
573       if (not_power_4) {
574         h2 += 1;
575         h2 &= ~1;
576       }
577       inp += (h2);
578 
579       x0r = *inp;
580       x0i = *(inp + 1);
581       inp += (npoints >> 1);
582 
583       x1r = *inp;
584       x1i = *(inp + 1);
585       inp += (npoints >> 1);
586 
587       x2r = *inp;
588       x2i = *(inp + 1);
589       inp += (npoints >> 1);
590 
591       x3r = *inp;
592       x3i = *(inp + 1);
593 
594       x0r = x0r + x2r;
595       x0i = x0i + x2i;
596       x2r = x0r - (x2r << 1);
597       x2i = x0i - (x2i << 1);
598       x1r = x1r + x3r;
599       x1i = x1i + x3i;
600       x3r = x1r - (x3r << 1);
601       x3i = x1i - (x3i << 1);
602 
603       x0r = x0r + x1r;
604       x0i = x0i + x1i;
605       x1r = x0r - (x1r << 1);
606       x1i = x0i - (x1i << 1);
607       x2r = x2r + x3i;
608       x2i = x2i - x3r;
609       x3i = x2r - (x3i << 1);
610       x3r = x2i + (x3r << 1);
611 
612       *ptr_y++ = x0r;
613       *ptr_y++ = x0i;
614       *ptr_y++ = x2r;
615       *ptr_y++ = x2i;
616       *ptr_y++ = x1r;
617       *ptr_y++ = x1i;
618       *ptr_y++ = x3i;
619       *ptr_y++ = x3r;
620     }
621     ptr_y -= 2 * npoints;
622     del = 4;
623     nodespacing = 64;
624     in_loop_cnt = npoints >> 4;
625     for (i = n_stages - 1; i > 0; i--) {
626       const WORD32 *twiddles = ptr_w;
627       WORD32 *data = ptr_y;
628       WORD32 w1h, w2h, w3h, w1l, w2l, w3l;
629       WORD32 sec_loop_cnt;
630 
631       for (k = in_loop_cnt; k != 0; k--) {
632         x0r = (*data);
633         x0i = (*(data + 1));
634         data += (del << 1);
635 
636         x1r = (*data);
637         x1i = (*(data + 1));
638         data += (del << 1);
639 
640         x2r = (*data);
641         x2i = (*(data + 1));
642         data += (del << 1);
643 
644         x3r = (*data);
645         x3i = (*(data + 1));
646         data -= 3 * (del << 1);
647 
648         x0r = x0r + x2r;
649         x0i = x0i + x2i;
650         x2r = x0r - (x2r << 1);
651         x2i = x0i - (x2i << 1);
652         x1r = x1r + x3r;
653         x1i = x1i + x3i;
654         x3r = x1r - (x3r << 1);
655         x3i = x1i - (x3i << 1);
656 
657         x0r = x0r + x1r;
658         x0i = x0i + x1i;
659         x1r = x0r - (x1r << 1);
660         x1i = x0i - (x1i << 1);
661         x2r = x2r + x3i;
662         x2i = x2i - x3r;
663         x3i = x2r - (x3i << 1);
664         x3r = x2i + (x3r << 1);
665 
666         *data = x0r;
667         *(data + 1) = x0i;
668         data += (del << 1);
669 
670         *data = x2r;
671         *(data + 1) = x2i;
672         data += (del << 1);
673 
674         *data = x1r;
675         *(data + 1) = x1i;
676         data += (del << 1);
677 
678         *data = x3i;
679         *(data + 1) = x3r;
680         data += (del << 1);
681       }
682       data = ptr_y + 2;
683 
684       sec_loop_cnt = (nodespacing * del);
685       sec_loop_cnt = (sec_loop_cnt / 4) + (sec_loop_cnt / 8) -
686                      (sec_loop_cnt / 16) + (sec_loop_cnt / 32) -
687                      (sec_loop_cnt / 64) + (sec_loop_cnt / 128) -
688                      (sec_loop_cnt / 256);
689       j = nodespacing;
690 
691       for (j = nodespacing; j <= sec_loop_cnt; j += nodespacing) {
692         w1h = *(twiddles + 2 * j);
693         w1l = *(twiddles + 2 * j + 1);
694         w2h = *(twiddles + 2 * (j << 1));
695         w2l = *(twiddles + 2 * (j << 1) + 1);
696         w3h = *(twiddles + 2 * j + 2 * (j << 1));
697         w3l = *(twiddles + 2 * j + 2 * (j << 1) + 1);
698 
699         for (k = in_loop_cnt; k != 0; k--) {
700           WORD32 tmp;
701           WORD32 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
702 
703           data += (del << 1);
704 
705           x1r = *data;
706           x1i = *(data + 1);
707           data += (del << 1);
708 
709           x2r = *data;
710           x2i = *(data + 1);
711           data += (del << 1);
712 
713           x3r = *data;
714           x3i = *(data + 1);
715           data -= 3 * (del << 1);
716 
717           tmp = (ixheaacd_mult32(x1r, w1l) - ixheaacd_mult32(x1i, w1h));
718           x1i = ixheaacd_mac32(ixheaacd_mult32(x1r, w1h), x1i, w1l);
719           x1r = tmp;
720 
721           tmp = (ixheaacd_mult32(x2r, w2l) - ixheaacd_mult32(x2i, w2h));
722           x2i = ixheaacd_mac32(ixheaacd_mult32(x2r, w2h), x2i, w2l);
723           x2r = tmp;
724 
725           tmp = (ixheaacd_mult32(x3r, w3l) - ixheaacd_mult32(x3i, w3h));
726           x3i = ixheaacd_mac32(ixheaacd_mult32(x3r, w3h), x3i, w3l);
727           x3r = tmp;
728 
729           x0r = (*data);
730           x0i = (*(data + 1));
731 
732           x0r = x0r + (x2r);
733           x0i = x0i + (x2i);
734           x2r = x0r - (x2r << 1);
735           x2i = x0i - (x2i << 1);
736           x1r = x1r + x3r;
737           x1i = x1i + x3i;
738           x3r = x1r - (x3r << 1);
739           x3i = x1i - (x3i << 1);
740 
741           x0r = x0r + (x1r);
742           x0i = x0i + (x1i);
743           x1r = x0r - (x1r << 1);
744           x1i = x0i - (x1i << 1);
745           x2r = x2r + (x3i);
746           x2i = x2i - (x3r);
747           x3i = x2r - (x3i << 1);
748           x3r = x2i + (x3r << 1);
749 
750           *data = x0r;
751           *(data + 1) = x0i;
752           data += (del << 1);
753 
754           *data = x2r;
755           *(data + 1) = x2i;
756           data += (del << 1);
757 
758           *data = x1r;
759           *(data + 1) = x1i;
760           data += (del << 1);
761 
762           *data = x3i;
763           *(data + 1) = x3r;
764           data += (del << 1);
765         }
766         data -= 2 * npoints;
767         data += 2;
768       }
769       for (; j <= (nodespacing * del) >> 1; j += nodespacing) {
770         w1h = *(twiddles + 2 * j);
771         w2h = *(twiddles + 2 * (j << 1));
772         w3h = *(twiddles + 2 * j + 2 * (j << 1) - 512);
773         w1l = *(twiddles + 2 * j + 1);
774         w2l = *(twiddles + 2 * (j << 1) + 1);
775         w3l = *(twiddles + 2 * j + 2 * (j << 1) - 511);
776 
777         for (k = in_loop_cnt; k != 0; k--) {
778           WORD32 tmp;
779           WORD32 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
780           data += (del << 1);
781 
782           x1r = *data;
783           x1i = *(data + 1);
784           data += (del << 1);
785 
786           x2r = *data;
787           x2i = *(data + 1);
788           data += (del << 1);
789 
790           x3r = *data;
791           x3i = *(data + 1);
792           data -= 3 * (del << 1);
793 
794           tmp = (ixheaacd_mult32(x1r, w1l) - ixheaacd_mult32(x1i, w1h));
795           x1i = ixheaacd_mac32(ixheaacd_mult32(x1r, w1h), x1i, w1l);
796           x1r = tmp;
797 
798           tmp = (ixheaacd_mult32(x2r, w2l) - ixheaacd_mult32(x2i, w2h));
799           x2i = ixheaacd_mac32(ixheaacd_mult32(x2r, w2h), x2i, w2l);
800           x2r = tmp;
801 
802           tmp = (ixheaacd_mult32(x3r, w3h) + ixheaacd_mult32(x3i, w3l));
803           x3i = -ixheaacd_mult32(x3r, w3l) + ixheaacd_mult32(x3i, w3h);
804           x3r = tmp;
805 
806           x0r = (*data);
807           x0i = (*(data + 1));
808 
809           x0r = x0r + (x2r);
810           x0i = x0i + (x2i);
811           x2r = x0r - (x2r << 1);
812           x2i = x0i - (x2i << 1);
813           x1r = x1r + x3r;
814           x1i = x1i + x3i;
815           x3r = x1r - (x3r << 1);
816           x3i = x1i - (x3i << 1);
817 
818           x0r = x0r + (x1r);
819           x0i = x0i + (x1i);
820           x1r = x0r - (x1r << 1);
821           x1i = x0i - (x1i << 1);
822           x2r = x2r + (x3i);
823           x2i = x2i - (x3r);
824           x3i = x2r - (x3i << 1);
825           x3r = x2i + (x3r << 1);
826 
827           *data = x0r;
828           *(data + 1) = x0i;
829           data += (del << 1);
830 
831           *data = x2r;
832           *(data + 1) = x2i;
833           data += (del << 1);
834 
835           *data = x1r;
836           *(data + 1) = x1i;
837           data += (del << 1);
838 
839           *data = x3i;
840           *(data + 1) = x3r;
841           data += (del << 1);
842         }
843         data -= 2 * npoints;
844         data += 2;
845       }
846       for (; j <= sec_loop_cnt * 2; j += nodespacing) {
847         w1h = *(twiddles + 2 * j);
848         w2h = *(twiddles + 2 * (j << 1) - 512);
849         w3h = *(twiddles + 2 * j + 2 * (j << 1) - 512);
850         w1l = *(twiddles + 2 * j + 1);
851         w2l = *(twiddles + 2 * (j << 1) - 511);
852         w3l = *(twiddles + 2 * j + 2 * (j << 1) - 511);
853 
854         for (k = in_loop_cnt; k != 0; k--) {
855           WORD32 tmp;
856           WORD32 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
857 
858           data += (del << 1);
859 
860           x1r = *data;
861           x1i = *(data + 1);
862           data += (del << 1);
863 
864           x2r = *data;
865           x2i = *(data + 1);
866           data += (del << 1);
867 
868           x3r = *data;
869           x3i = *(data + 1);
870           data -= 3 * (del << 1);
871 
872           tmp = (ixheaacd_mult32(x1r, w1l) - ixheaacd_mult32(x1i, w1h));
873           x1i = ixheaacd_mac32(ixheaacd_mult32(x1r, w1h), x1i, w1l);
874           x1r = tmp;
875 
876           tmp = (ixheaacd_mult32(x2r, w2h) + ixheaacd_mult32(x2i, w2l));
877           x2i = -ixheaacd_mult32(x2r, w2l) + ixheaacd_mult32(x2i, w2h);
878           x2r = tmp;
879 
880           tmp = (ixheaacd_mult32(x3r, w3h) + ixheaacd_mult32(x3i, w3l));
881           x3i = -ixheaacd_mult32(x3r, w3l) + ixheaacd_mult32(x3i, w3h);
882           x3r = tmp;
883 
884           x0r = (*data);
885           x0i = (*(data + 1));
886 
887           x0r = x0r + (x2r);
888           x0i = x0i + (x2i);
889           x2r = x0r - (x2r << 1);
890           x2i = x0i - (x2i << 1);
891           x1r = x1r + x3r;
892           x1i = x1i + x3i;
893           x3r = x1r - (x3r << 1);
894           x3i = x1i - (x3i << 1);
895 
896           x0r = x0r + (x1r);
897           x0i = x0i + (x1i);
898           x1r = x0r - (x1r << 1);
899           x1i = x0i - (x1i << 1);
900           x2r = x2r + (x3i);
901           x2i = x2i - (x3r);
902           x3i = x2r - (x3i << 1);
903           x3r = x2i + (x3r << 1);
904 
905           *data = x0r;
906           *(data + 1) = x0i;
907           data += (del << 1);
908 
909           *data = x2r;
910           *(data + 1) = x2i;
911           data += (del << 1);
912 
913           *data = x1r;
914           *(data + 1) = x1i;
915           data += (del << 1);
916 
917           *data = x3i;
918           *(data + 1) = x3r;
919           data += (del << 1);
920         }
921         data -= 2 * npoints;
922         data += 2;
923       }
924       for (; j < nodespacing * del; j += nodespacing) {
925         w1h = *(twiddles + 2 * j);
926         w2h = *(twiddles + 2 * (j << 1) - 512);
927         w3h = *(twiddles + 2 * j + 2 * (j << 1) - 1024);
928         w1l = *(twiddles + 2 * j + 1);
929         w2l = *(twiddles + 2 * (j << 1) - 511);
930         w3l = *(twiddles + 2 * j + 2 * (j << 1) - 1023);
931 
932         for (k = in_loop_cnt; k != 0; k--) {
933           WORD32 tmp;
934           WORD32 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
935 
936           data += (del << 1);
937 
938           x1r = *data;
939           x1i = *(data + 1);
940           data += (del << 1);
941 
942           x2r = *data;
943           x2i = *(data + 1);
944           data += (del << 1);
945 
946           x3r = *data;
947           x3i = *(data + 1);
948           data -= 3 * (del << 1);
949 
950           tmp = (ixheaacd_mult32(x1r, w1l) - ixheaacd_mult32(x1i, w1h));
951           x1i = ixheaacd_mac32(ixheaacd_mult32(x1r, w1h), x1i, w1l);
952           x1r = tmp;
953 
954           tmp = (ixheaacd_mult32(x2r, w2h) + ixheaacd_mult32(x2i, w2l));
955           x2i = -ixheaacd_mult32(x2r, w2l) + ixheaacd_mult32(x2i, w2h);
956           x2r = tmp;
957 
958           tmp = (-ixheaacd_mult32(x3r, w3l) + ixheaacd_mult32(x3i, w3h));
959           x3i = ixheaacd_mac32(ixheaacd_mult32(x3r, w3h), x3i, w3l);
960           x3r = tmp;
961 
962           x0r = (*data);
963           x0i = (*(data + 1));
964 
965           x0r = x0r + (x2r);
966           x0i = x0i + (x2i);
967           x2r = x0r - (x2r << 1);
968           x2i = x0i - (x2i << 1);
969           x1r = x1r + x3r;
970           x1i = x1i - x3i;
971           x3r = x1r - (x3r << 1);
972           x3i = x1i + (x3i << 1);
973 
974           x0r = x0r + (x1r);
975           x0i = x0i + (x1i);
976           x1r = x0r - (x1r << 1);
977           x1i = x0i - (x1i << 1);
978           x2r = x2r + (x3i);
979           x2i = x2i - (x3r);
980           x3i = x2r - (x3i << 1);
981           x3r = x2i + (x3r << 1);
982 
983           *data = x0r;
984           *(data + 1) = x0i;
985           data += (del << 1);
986 
987           *data = x2r;
988           *(data + 1) = x2i;
989           data += (del << 1);
990 
991           *data = x1r;
992           *(data + 1) = x1i;
993           data += (del << 1);
994 
995           *data = x3i;
996           *(data + 1) = x3r;
997           data += (del << 1);
998         }
999         data -= 2 * npoints;
1000         data += 2;
1001       }
1002       nodespacing >>= 2;
1003       del <<= 2;
1004       in_loop_cnt >>= 2;
1005     }
1006     if (not_power_4) {
1007       const WORD32 *twiddles = ptr_w;
1008       nodespacing <<= 1;
1009       shift += 1;
1010 
1011       for (j = del / 2; j != 0; j--) {
1012         WORD32 w1h = *twiddles;
1013         WORD32 w1l = *(twiddles + 1);
1014         WORD32 tmp;
1015         twiddles += nodespacing * 2;
1016 
1017         x0r = *ptr_y;
1018         x0i = *(ptr_y + 1);
1019         ptr_y += (del << 1);
1020 
1021         x1r = *ptr_y;
1022         x1i = *(ptr_y + 1);
1023 
1024         tmp = (ixheaacd_mult32(x1r, w1l) - ixheaacd_mult32(x1i, w1h));
1025         x1i = ixheaacd_mac32(ixheaacd_mult32(x1r, w1h), x1i, w1l);
1026         x1r = tmp;
1027 
1028         *ptr_y = (x0r) / 2 - (x1r) / 2;
1029         *(ptr_y + 1) = (x0i) / 2 - (x1i) / 2;
1030         ptr_y -= (del << 1);
1031 
1032         *ptr_y = (x0r) / 2 + (x1r) / 2;
1033         *(ptr_y + 1) = (x0i) / 2 + (x1i) / 2;
1034         ptr_y += 2;
1035       }
1036       twiddles = ptr_w;
1037       for (j = del / 2; j != 0; j--) {
1038         WORD32 w1h = *twiddles;
1039         WORD32 w1l = *(twiddles + 1);
1040         WORD32 tmp;
1041         twiddles += nodespacing * 2;
1042 
1043         x0r = *ptr_y;
1044         x0i = *(ptr_y + 1);
1045         ptr_y += (del << 1);
1046 
1047         x1r = *ptr_y;
1048         x1i = *(ptr_y + 1);
1049 
1050         tmp = (ixheaacd_mult32(x1r, w1h) + ixheaacd_mult32(x1i, w1l));
1051         x1i = -ixheaacd_mult32(x1r, w1l) + ixheaacd_mult32(x1i, w1h);
1052         x1r = tmp;
1053 
1054         *ptr_y = (x0r) / 2 - (x1r) / 2;
1055         *(ptr_y + 1) = (x0i) / 2 - (x1i) / 2;
1056         ptr_y -= (del << 1);
1057 
1058         *ptr_y = (x0r) / 2 + (x1r) / 2;
1059         *(ptr_y + 1) = (x0i) / 2 + (x1i) / 2;
1060         ptr_y += 2;
1061       }
1062     }
1063 
1064   }
1065 
1066   else {
1067     ptr_w = ixheaacd_twiddle_table_fft_32x32;
1068 
1069     for (i = 0; i < npoints; i += 4) {
1070       WORD32 *inp = ptr_x;
1071 
1072       DIG_REV(i, dig_rev_shift, h2);
1073       if (not_power_4) {
1074         h2 += 1;
1075         h2 &= ~1;
1076       }
1077       inp += (h2);
1078 
1079       x0r = *inp;
1080       x0i = *(inp + 1);
1081       inp += (npoints >> 1);
1082 
1083       x1r = *inp;
1084       x1i = *(inp + 1);
1085       inp += (npoints >> 1);
1086 
1087       x2r = *inp;
1088       x2i = *(inp + 1);
1089       inp += (npoints >> 1);
1090 
1091       x3r = *inp;
1092       x3i = *(inp + 1);
1093 
1094       x0r = x0r + x2r;
1095       x0i = x0i + x2i;
1096       x2r = x0r - (x2r << 1);
1097       x2i = x0i - (x2i << 1);
1098       x1r = x1r + x3r;
1099       x1i = x1i + x3i;
1100       x3r = x1r - (x3r << 1);
1101       x3i = x1i - (x3i << 1);
1102 
1103       x0r = x0r + x1r;
1104       x0i = x0i + x1i;
1105       x1r = x0r - (x1r << 1);
1106       x1i = x0i - (x1i << 1);
1107       x2r = x2r - x3i;
1108       x2i = x2i + x3r;
1109       x3i = x2r + (x3i << 1);
1110       x3r = x2i - (x3r << 1);
1111 
1112       *ptr_y++ = x0r;
1113       *ptr_y++ = x0i;
1114       *ptr_y++ = x2r;
1115       *ptr_y++ = x2i;
1116       *ptr_y++ = x1r;
1117       *ptr_y++ = x1i;
1118       *ptr_y++ = x3i;
1119       *ptr_y++ = x3r;
1120     }
1121     ptr_y -= 2 * npoints;
1122     del = 4;
1123     nodespacing = 64;
1124     in_loop_cnt = npoints >> 4;
1125     for (i = n_stages - 1; i > 0; i--) {
1126       const WORD32 *twiddles = ptr_w;
1127       WORD32 *data = ptr_y;
1128       WORD32 w1h, w2h, w3h, w1l, w2l, w3l;
1129       WORD32 sec_loop_cnt;
1130 
1131       for (k = in_loop_cnt; k != 0; k--) {
1132         x0r = (*data);
1133         x0i = (*(data + 1));
1134         data += (del << 1);
1135 
1136         x1r = (*data);
1137         x1i = (*(data + 1));
1138         data += (del << 1);
1139 
1140         x2r = (*data);
1141         x2i = (*(data + 1));
1142         data += (del << 1);
1143 
1144         x3r = (*data);
1145         x3i = (*(data + 1));
1146         data -= 3 * (del << 1);
1147 
1148         x0r = x0r + x2r;
1149         x0i = x0i + x2i;
1150         x2r = x0r - (x2r << 1);
1151         x2i = x0i - (x2i << 1);
1152         x1r = x1r + x3r;
1153         x1i = x1i + x3i;
1154         x3r = x1r - (x3r << 1);
1155         x3i = x1i - (x3i << 1);
1156 
1157         x0r = ixheaacd_add32_sat(x0r, x1r);
1158         x0i = ixheaacd_add32_sat(x0i, x1i);
1159         x1r = ixheaacd_sub32_sat(x0r, (x1r << 1));
1160         x1i = ixheaacd_sub32_sat(x0i, (x1i << 1));
1161         x2r = ixheaacd_sub32_sat(x2r, x3i);
1162         x2i = ixheaacd_add32_sat(x2i, x3r);
1163         x3i = ixheaacd_add32_sat(x2r, (x3i << 1));
1164         x3r = ixheaacd_sub32_sat(x2i, (x3r << 1));
1165 
1166         *data = x0r;
1167         *(data + 1) = x0i;
1168         data += (del << 1);
1169 
1170         *data = x2r;
1171         *(data + 1) = x2i;
1172         data += (del << 1);
1173 
1174         *data = x1r;
1175         *(data + 1) = x1i;
1176         data += (del << 1);
1177 
1178         *data = x3i;
1179         *(data + 1) = x3r;
1180         data += (del << 1);
1181       }
1182       data = ptr_y + 2;
1183 
1184       sec_loop_cnt = (nodespacing * del);
1185       sec_loop_cnt = (sec_loop_cnt / 4) + (sec_loop_cnt / 8) -
1186                      (sec_loop_cnt / 16) + (sec_loop_cnt / 32) -
1187                      (sec_loop_cnt / 64) + (sec_loop_cnt / 128) -
1188                      (sec_loop_cnt / 256);
1189       j = nodespacing;
1190 
1191       for (j = nodespacing; j <= sec_loop_cnt; j += nodespacing) {
1192         w1h = *(twiddles + 2 * j);
1193         w2h = *(twiddles + 2 * (j << 1));
1194         w3h = *(twiddles + 2 * j + 2 * (j << 1));
1195         w1l = *(twiddles + 2 * j + 1);
1196         w2l = *(twiddles + 2 * (j << 1) + 1);
1197         w3l = *(twiddles + 2 * j + 2 * (j << 1) + 1);
1198 
1199         for (k = in_loop_cnt; k != 0; k--) {
1200           WORD32 tmp;
1201           WORD32 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
1202 
1203           data += (del << 1);
1204 
1205           x1r = *data;
1206           x1i = *(data + 1);
1207           data += (del << 1);
1208 
1209           x2r = *data;
1210           x2i = *(data + 1);
1211           data += (del << 1);
1212 
1213           x3r = *data;
1214           x3i = *(data + 1);
1215           data -= 3 * (del << 1);
1216 
1217           tmp = (ixheaacd_mult32(x1r, w1l) + ixheaacd_mult32(x1i, w1h));
1218           x1i = ixheaacd_mac32(-ixheaacd_mult32(x1r, w1h), x1i, w1l);
1219           x1r = tmp;
1220 
1221           tmp = (ixheaacd_mult32(x2r, w2l) + ixheaacd_mult32(x2i, w2h));
1222           x2i = ixheaacd_mac32(-ixheaacd_mult32(x2r, w2h), x2i, w2l);
1223           x2r = tmp;
1224 
1225           tmp = (ixheaacd_mult32(x3r, w3l) + ixheaacd_mult32(x3i, w3h));
1226           x3i = ixheaacd_mac32(-ixheaacd_mult32(x3r, w3h), x3i, w3l);
1227           x3r = tmp;
1228 
1229           x0r = (*data);
1230           x0i = (*(data + 1));
1231 
1232           x0r = x0r + (x2r);
1233           x0i = x0i + (x2i);
1234           x2r = x0r - (x2r << 1);
1235           x2i = x0i - (x2i << 1);
1236           x1r = x1r + x3r;
1237           x1i = x1i + x3i;
1238           x3r = x1r - (x3r << 1);
1239           x3i = x1i - (x3i << 1);
1240 
1241           x0r = x0r + (x1r);
1242           x0i = x0i + (x1i);
1243           x1r = x0r - (x1r << 1);
1244           x1i = x0i - (x1i << 1);
1245           x2r = x2r - (x3i);
1246           x2i = x2i + (x3r);
1247           x3i = x2r + (x3i << 1);
1248           x3r = x2i - (x3r << 1);
1249 
1250           *data = x0r;
1251           *(data + 1) = x0i;
1252           data += (del << 1);
1253 
1254           *data = x2r;
1255           *(data + 1) = x2i;
1256           data += (del << 1);
1257 
1258           *data = x1r;
1259           *(data + 1) = x1i;
1260           data += (del << 1);
1261 
1262           *data = x3i;
1263           *(data + 1) = x3r;
1264           data += (del << 1);
1265         }
1266         data -= 2 * npoints;
1267         data += 2;
1268       }
1269       for (; j <= (nodespacing * del) >> 1; j += nodespacing) {
1270         w1h = *(twiddles + 2 * j);
1271         w2h = *(twiddles + 2 * (j << 1));
1272         w3h = *(twiddles + 2 * j + 2 * (j << 1) - 512);
1273         w1l = *(twiddles + 2 * j + 1);
1274         w2l = *(twiddles + 2 * (j << 1) + 1);
1275         w3l = *(twiddles + 2 * j + 2 * (j << 1) - 511);
1276 
1277         for (k = in_loop_cnt; k != 0; k--) {
1278           WORD32 tmp;
1279           WORD32 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
1280 
1281           data += (del << 1);
1282 
1283           x1r = *data;
1284           x1i = *(data + 1);
1285           data += (del << 1);
1286 
1287           x2r = *data;
1288           x2i = *(data + 1);
1289           data += (del << 1);
1290 
1291           x3r = *data;
1292           x3i = *(data + 1);
1293           data -= 3 * (del << 1);
1294 
1295           tmp = (ixheaacd_mult32(x1r, w1l) + ixheaacd_mult32(x1i, w1h));
1296           x1i = ixheaacd_mac32(-ixheaacd_mult32(x1r, w1h), x1i, w1l);
1297           x1r = tmp;
1298 
1299           tmp = (ixheaacd_mult32(x2r, w2l) + ixheaacd_mult32(x2i, w2h));
1300           x2i = ixheaacd_mac32(-ixheaacd_mult32(x2r, w2h), x2i, w2l);
1301           x2r = tmp;
1302 
1303           tmp = (ixheaacd_mult32(x3r, w3h) - ixheaacd_mult32(x3i, w3l));
1304           x3i = ixheaacd_mult32(x3r, w3l) + ixheaacd_mult32(x3i, w3h);
1305           x3r = tmp;
1306 
1307           x0r = (*data);
1308           x0i = (*(data + 1));
1309 
1310           x0r = x0r + (x2r);
1311           x0i = x0i + (x2i);
1312           x2r = x0r - (x2r << 1);
1313           x2i = x0i - (x2i << 1);
1314           x1r = x1r + x3r;
1315           x1i = x1i + x3i;
1316           x3r = x1r - (x3r << 1);
1317           x3i = x1i - (x3i << 1);
1318 
1319           x0r = x0r + (x1r);
1320           x0i = x0i + (x1i);
1321           x1r = x0r - (x1r << 1);
1322           x1i = x0i - (x1i << 1);
1323           x2r = x2r - (x3i);
1324           x2i = x2i + (x3r);
1325           x3i = x2r + (x3i << 1);
1326           x3r = x2i - (x3r << 1);
1327 
1328           *data = x0r;
1329           *(data + 1) = x0i;
1330           data += (del << 1);
1331 
1332           *data = x2r;
1333           *(data + 1) = x2i;
1334           data += (del << 1);
1335 
1336           *data = x1r;
1337           *(data + 1) = x1i;
1338           data += (del << 1);
1339 
1340           *data = x3i;
1341           *(data + 1) = x3r;
1342           data += (del << 1);
1343         }
1344         data -= 2 * npoints;
1345         data += 2;
1346       }
1347       for (; j <= sec_loop_cnt * 2; j += nodespacing) {
1348         w1h = *(twiddles + 2 * j);
1349         w2h = *(twiddles + 2 * (j << 1) - 512);
1350         w3h = *(twiddles + 2 * j + 2 * (j << 1) - 512);
1351         w1l = *(twiddles + 2 * j + 1);
1352         w2l = *(twiddles + 2 * (j << 1) - 511);
1353         w3l = *(twiddles + 2 * j + 2 * (j << 1) - 511);
1354 
1355         for (k = in_loop_cnt; k != 0; k--) {
1356           WORD32 tmp;
1357           WORD32 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
1358 
1359           data += (del << 1);
1360 
1361           x1r = *data;
1362           x1i = *(data + 1);
1363           data += (del << 1);
1364 
1365           x2r = *data;
1366           x2i = *(data + 1);
1367           data += (del << 1);
1368 
1369           x3r = *data;
1370           x3i = *(data + 1);
1371           data -= 3 * (del << 1);
1372 
1373           tmp = (ixheaacd_mult32(x1r, w1l) + ixheaacd_mult32(x1i, w1h));
1374           x1i = ixheaacd_mac32(-ixheaacd_mult32(x1r, w1h), x1i, w1l);
1375           x1r = tmp;
1376 
1377           tmp = (ixheaacd_mult32(x2r, w2h) - ixheaacd_mult32(x2i, w2l));
1378           x2i = ixheaacd_mult32(x2r, w2l) + ixheaacd_mult32(x2i, w2h);
1379           x2r = tmp;
1380 
1381           tmp = (ixheaacd_mult32(x3r, w3h) - ixheaacd_mult32(x3i, w3l));
1382           x3i = ixheaacd_mult32(x3r, w3l) + ixheaacd_mult32(x3i, w3h);
1383           x3r = tmp;
1384 
1385           x0r = (*data);
1386           x0i = (*(data + 1));
1387 
1388           x0r = x0r + (x2r);
1389           x0i = x0i + (x2i);
1390           x2r = x0r - (x2r << 1);
1391           x2i = x0i - (x2i << 1);
1392           x1r = x1r + x3r;
1393           x1i = x1i + x3i;
1394           x3r = x1r - (x3r << 1);
1395           x3i = x1i - (x3i << 1);
1396 
1397           x0r = x0r + (x1r);
1398           x0i = x0i + (x1i);
1399           x1r = x0r - (x1r << 1);
1400           x1i = x0i - (x1i << 1);
1401           x2r = x2r - (x3i);
1402           x2i = x2i + (x3r);
1403           x3i = x2r + (x3i << 1);
1404           x3r = x2i - (x3r << 1);
1405 
1406           *data = x0r;
1407           *(data + 1) = x0i;
1408           data += (del << 1);
1409 
1410           *data = x2r;
1411           *(data + 1) = x2i;
1412           data += (del << 1);
1413 
1414           *data = x1r;
1415           *(data + 1) = x1i;
1416           data += (del << 1);
1417 
1418           *data = x3i;
1419           *(data + 1) = x3r;
1420           data += (del << 1);
1421         }
1422         data -= 2 * npoints;
1423         data += 2;
1424       }
1425       for (; j < nodespacing * del; j += nodespacing) {
1426         w1h = *(twiddles + 2 * j);
1427         w2h = *(twiddles + 2 * (j << 1) - 512);
1428         w3h = *(twiddles + 2 * j + 2 * (j << 1) - 1024);
1429         w1l = *(twiddles + 2 * j + 1);
1430         w2l = *(twiddles + 2 * (j << 1) - 511);
1431         w3l = *(twiddles + 2 * j + 2 * (j << 1) - 1023);
1432 
1433         for (k = in_loop_cnt; k != 0; k--) {
1434           WORD32 tmp;
1435           WORD32 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
1436 
1437           data += (del << 1);
1438 
1439           x1r = *data;
1440           x1i = *(data + 1);
1441           data += (del << 1);
1442 
1443           x2r = *data;
1444           x2i = *(data + 1);
1445           data += (del << 1);
1446 
1447           x3r = *data;
1448           x3i = *(data + 1);
1449           data -= 3 * (del << 1);
1450 
1451           tmp = (ixheaacd_mult32(x1r, w1l) + ixheaacd_mult32(x1i, w1h));
1452           x1i = ixheaacd_mac32(-ixheaacd_mult32(x1r, w1h), x1i, w1l);
1453           x1r = tmp;
1454 
1455           tmp = (ixheaacd_mult32(x2r, w2h) - ixheaacd_mult32(x2i, w2l));
1456           x2i = ixheaacd_mult32(x2r, w2l) + ixheaacd_mult32(x2i, w2h);
1457           x2r = tmp;
1458 
1459           tmp = (-ixheaacd_mult32(x3r, w3l) - ixheaacd_mult32(x3i, w3h));
1460           x3i = ixheaacd_mac32(-ixheaacd_mult32(x3r, w3h), x3i, w3l);
1461           x3r = tmp;
1462 
1463           x0r = (*data);
1464           x0i = (*(data + 1));
1465 
1466           x0r = x0r + (x2r);
1467           x0i = x0i + (x2i);
1468           x2r = x0r - (x2r << 1);
1469           x2i = x0i - (x2i << 1);
1470           x1r = x1r + x3r;
1471           x1i = x1i - x3i;
1472           x3r = x1r - (x3r << 1);
1473           x3i = x1i + (x3i << 1);
1474 
1475           x0r = x0r + (x1r);
1476           x0i = x0i + (x1i);
1477           x1r = x0r - (x1r << 1);
1478           x1i = x0i - (x1i << 1);
1479           x2r = x2r - (x3i);
1480           x2i = x2i + (x3r);
1481           x3i = x2r + (x3i << 1);
1482           x3r = x2i - (x3r << 1);
1483 
1484           *data = x0r;
1485           *(data + 1) = x0i;
1486           data += (del << 1);
1487 
1488           *data = x2r;
1489           *(data + 1) = x2i;
1490           data += (del << 1);
1491 
1492           *data = x1r;
1493           *(data + 1) = x1i;
1494           data += (del << 1);
1495 
1496           *data = x3i;
1497           *(data + 1) = x3r;
1498           data += (del << 1);
1499         }
1500         data -= 2 * npoints;
1501         data += 2;
1502       }
1503       nodespacing >>= 2;
1504       del <<= 2;
1505       in_loop_cnt >>= 2;
1506     }
1507     if (not_power_4) {
1508       const WORD32 *twiddles = ptr_w;
1509       nodespacing <<= 1;
1510       shift += 1;
1511       for (j = del / 2; j != 0; j--) {
1512         WORD32 w1h = *twiddles;
1513         WORD32 w1l = *(twiddles + 1);
1514 
1515         WORD32 tmp;
1516         twiddles += nodespacing * 2;
1517 
1518         x0r = *ptr_y;
1519         x0i = *(ptr_y + 1);
1520         ptr_y += (del << 1);
1521 
1522         x1r = *ptr_y;
1523         x1i = *(ptr_y + 1);
1524 
1525         tmp = (ixheaacd_mult32(x1r, w1l) + ixheaacd_mult32(x1i, w1h));
1526         x1i = ixheaacd_mac32(-ixheaacd_mult32(x1r, w1h), x1i, w1l);
1527         x1r = tmp;
1528 
1529         *ptr_y = (x0r) / 2 - (x1r) / 2;
1530         *(ptr_y + 1) = (x0i) / 2 - (x1i) / 2;
1531         ptr_y -= (del << 1);
1532 
1533         *ptr_y = (x0r) / 2 + (x1r) / 2;
1534         *(ptr_y + 1) = (x0i) / 2 + (x1i) / 2;
1535         ptr_y += 2;
1536       }
1537       twiddles = ptr_w;
1538       for (j = del / 2; j != 0; j--) {
1539         WORD32 w1h = *twiddles;
1540         WORD32 w1l = *(twiddles + 1);
1541         WORD32 tmp;
1542         twiddles += nodespacing * 2;
1543 
1544         x0r = *ptr_y;
1545         x0i = *(ptr_y + 1);
1546         ptr_y += (del << 1);
1547 
1548         x1r = *ptr_y;
1549         x1i = *(ptr_y + 1);
1550 
1551         tmp = (ixheaacd_mult32(x1r, w1h) - ixheaacd_mult32(x1i, w1l));
1552         x1i = ixheaacd_mult32(x1r, w1l) + ixheaacd_mult32(x1i, w1h);
1553         x1r = tmp;
1554 
1555         *ptr_y = (x0r) / 2 - (x1r) / 2;
1556         *(ptr_y + 1) = (x0i) / 2 - (x1i) / 2;
1557         ptr_y -= (del << 1);
1558 
1559         *ptr_y = (x0r) / 2 + (x1r) / 2;
1560         *(ptr_y + 1) = (x0i) / 2 + (x1i) / 2;
1561         ptr_y += 2;
1562       }
1563     }
1564   }
1565 
1566   for (i = 0; i < nlength; i++) {
1567     xr[i] = y[2 * i];
1568     xi[i] = y[2 * i + 1];
1569   }
1570 
1571   *preshift = shift - *preshift;
1572   return;
1573 }
1574 
ixheaacd_complex_3point_fft(WORD32 * inp,WORD32 * op,WORD32 sign_dir)1575 static PLATFORM_INLINE void ixheaacd_complex_3point_fft(WORD32 *inp, WORD32 *op,
1576                                                         WORD32 sign_dir) {
1577   WORD32 add_r, sub_r;
1578   WORD32 add_i, sub_i;
1579   WORD32 temp_real, temp_imag, temp;
1580 
1581   WORD32 p1, p2, p3, p4;
1582 
1583   WORD32 sinmu;
1584   sinmu = -1859775393 * sign_dir;
1585 
1586   temp_real = ixheaacd_add32_sat(inp[0], inp[2]);
1587   temp_imag = ixheaacd_add32_sat(inp[1], inp[3]);
1588 
1589   add_r = ixheaacd_add32_sat(inp[2], inp[4]);
1590   add_i = ixheaacd_add32_sat(inp[3], inp[5]);
1591 
1592   sub_r = ixheaacd_sub32_sat(inp[2], inp[4]);
1593   sub_i = ixheaacd_sub32_sat(inp[3], inp[5]);
1594 
1595   p1 = add_r >> 1;
1596   p4 = add_i >> 1;
1597   p2 = ixheaacd_mult32_shl(sub_i, sinmu);
1598   p3 = ixheaacd_mult32_shl(sub_r, sinmu);
1599 
1600   temp = ixheaacd_sub32(inp[0], p1);
1601 
1602   op[0] = ixheaacd_add32_sat(temp_real, inp[4]);
1603   op[1] = ixheaacd_add32_sat(temp_imag, inp[5]);
1604   op[2] = ixheaacd_add32_sat(temp, p2);
1605   op[3] = ixheaacd_sub32_sat(ixheaacd_sub32_sat(inp[1], p3), p4);
1606   op[4] = ixheaacd_sub32_sat(temp, p2);
1607   op[5] = ixheaacd_sub32_sat(ixheaacd_add32_sat(inp[1], p3), p4);
1608 
1609   return;
1610 }
1611 
ixheaacd_complex_fft_p3(WORD32 * xr,WORD32 * xi,WORD32 nlength,WORD32 fft_mode,WORD32 * preshift)1612 VOID ixheaacd_complex_fft_p3(WORD32 *xr, WORD32 *xi, WORD32 nlength,
1613                              WORD32 fft_mode, WORD32 *preshift) {
1614   WORD32 i, j;
1615   WORD32 shift = 0;
1616   WORD32 xr_3[384];
1617   WORD32 xi_3[384];
1618   WORD32 x[1024];
1619   WORD32 y[1024];
1620   WORD32 cnfac, npts;
1621   WORD32 mpass = nlength;
1622   WORD32 n = 0;
1623   WORD32 *ptr_x = x;
1624   WORD32 *ptr_y = y;
1625 
1626   cnfac = 0;
1627   while (mpass % 3 == 0) {
1628     mpass /= 3;
1629     cnfac++;
1630   }
1631   npts = mpass;
1632 
1633   for (i = 0; i < 3 * cnfac; i++) {
1634     for (j = 0; j < mpass; j++) {
1635       xr_3[j] = xr[3 * j + i];
1636       xi_3[j] = xi[3 * j + i];
1637     }
1638 
1639     (*ixheaacd_complex_fft_p2)(xr_3, xi_3, mpass, fft_mode, &shift);
1640 
1641     for (j = 0; j < mpass; j++) {
1642       xr[3 * j + i] = xr_3[j];
1643       xi[3 * j + i] = xi_3[j];
1644     }
1645   }
1646 
1647   while (npts >> 1) {
1648     n++;
1649     npts = npts >> 1;
1650   }
1651 
1652   if (n % 2 == 0)
1653     shift = ((n + 4)) / 2;
1654   else
1655     shift = ((n + 5) / 2);
1656 
1657   *preshift = shift - *preshift + 1;
1658 
1659   for (i = 0; i < nlength; i++) {
1660     ptr_x[2 * i] = (xr[i] >> 1);
1661     ptr_x[2 * i + 1] = (xi[i] >> 1);
1662   }
1663 
1664   {
1665     const WORD32 *w1r, *w1i;
1666     WORD32 tmp;
1667     w1r = ixheaacd_twiddle_table_3pr;
1668     w1i = ixheaacd_twiddle_table_3pi;
1669 
1670     if (fft_mode < 0) {
1671       for (i = 0; i < nlength; i += 3) {
1672         tmp = ixheaacd_sub32_sat(ixheaacd_mult32(ptr_x[2 * i], (*w1r)),
1673                                  ixheaacd_mult32(ptr_x[2 * i + 1], (*w1i)));
1674         ptr_x[2 * i + 1] =
1675             ixheaacd_add32_sat(ixheaacd_mult32(ptr_x[2 * i], (*w1i)),
1676                                ixheaacd_mult32(ptr_x[2 * i + 1], (*w1r)));
1677         ptr_x[2 * i] = tmp;
1678 
1679         w1r++;
1680         w1i++;
1681 
1682         tmp = ixheaacd_sub32_sat(ixheaacd_mult32(ptr_x[2 * i + 2], (*w1r)),
1683                                  ixheaacd_mult32(ptr_x[2 * i + 3], (*w1i)));
1684         ptr_x[2 * i + 3] =
1685             ixheaacd_add32_sat(ixheaacd_mult32(ptr_x[2 * i + 2], (*w1i)),
1686                                ixheaacd_mult32(ptr_x[2 * i + 3], (*w1r)));
1687         ptr_x[2 * i + 2] = tmp;
1688 
1689         w1r++;
1690         w1i++;
1691 
1692         tmp = ixheaacd_sub32_sat(ixheaacd_mult32(ptr_x[2 * i + 4], (*w1r)),
1693                                  ixheaacd_mult32(ptr_x[2 * i + 5], (*w1i)));
1694         ptr_x[2 * i + 5] =
1695             ixheaacd_add32_sat(ixheaacd_mult32(ptr_x[2 * i + 4], (*w1i)),
1696                                ixheaacd_mult32(ptr_x[2 * i + 5], (*w1r)));
1697         ptr_x[2 * i + 4] = tmp;
1698 
1699         w1r += 3 * (128 / mpass - 1) + 1;
1700         w1i += 3 * (128 / mpass - 1) + 1;
1701       }
1702     }
1703 
1704     else {
1705       for (i = 0; i < nlength; i += 3) {
1706         tmp = ixheaacd_add32_sat(ixheaacd_mult32(ptr_x[2 * i], (*w1r)),
1707                                  ixheaacd_mult32(ptr_x[2 * i + 1], (*w1i)));
1708         ptr_x[2 * i + 1] =
1709             ixheaacd_sub32_sat(ixheaacd_mult32(ptr_x[2 * i + 1], (*w1r)),
1710                                ixheaacd_mult32(ptr_x[2 * i], (*w1i)));
1711         ptr_x[2 * i] = tmp;
1712 
1713         w1r++;
1714         w1i++;
1715 
1716         tmp = ixheaacd_add32_sat(ixheaacd_mult32(ptr_x[2 * i + 2], (*w1r)),
1717                                  ixheaacd_mult32(ptr_x[2 * i + 3], (*w1i)));
1718         ptr_x[2 * i + 3] =
1719             ixheaacd_sub32_sat(ixheaacd_mult32(ptr_x[2 * i + 3], (*w1r)),
1720                                ixheaacd_mult32(ptr_x[2 * i + 2], (*w1i)));
1721         ptr_x[2 * i + 2] = tmp;
1722 
1723         w1r++;
1724         w1i++;
1725 
1726         tmp = ixheaacd_add32_sat(ixheaacd_mult32(ptr_x[2 * i + 4], (*w1r)),
1727                                  ixheaacd_mult32(ptr_x[2 * i + 5], (*w1i)));
1728         ptr_x[2 * i + 5] =
1729             ixheaacd_sub32_sat(ixheaacd_mult32(ptr_x[2 * i + 5], (*w1r)),
1730                                ixheaacd_mult32(ptr_x[2 * i + 4], (*w1i)));
1731         ptr_x[2 * i + 4] = tmp;
1732 
1733         w1r += 3 * (128 / mpass - 1) + 1;
1734         w1i += 3 * (128 / mpass - 1) + 1;
1735       }
1736     }
1737   }
1738 
1739   for (i = 0; i < mpass; i++) {
1740     ixheaacd_complex_3point_fft(ptr_x, ptr_y, fft_mode);
1741 
1742     ptr_x = ptr_x + 6;
1743     ptr_y = ptr_y + 6;
1744   }
1745 
1746   for (i = 0; i < mpass; i++) {
1747     xr[i] = y[6 * i];
1748     xi[i] = y[6 * i + 1];
1749   }
1750 
1751   for (i = 0; i < mpass; i++) {
1752     xr[mpass + i] = y[6 * i + 2];
1753     xi[mpass + i] = y[6 * i + 3];
1754   }
1755 
1756   for (i = 0; i < mpass; i++) {
1757     xr[2 * mpass + i] = y[6 * i + 4];
1758     xi[2 * mpass + i] = y[6 * i + 5];
1759   }
1760   return;
1761 }
1762 
ixheaacd_complex_fft(WORD32 * data_r,WORD32 * data_i,WORD32 nlength,WORD32 fft_mode,WORD32 * preshift)1763 VOID ixheaacd_complex_fft(WORD32 *data_r, WORD32 *data_i, WORD32 nlength,
1764                           WORD32 fft_mode, WORD32 *preshift) {
1765   if (nlength & (nlength - 1)) {
1766     if ((nlength != 24) && (nlength != 48) && (nlength != 96) &&
1767         (nlength != 192) && (nlength != 384)) {
1768       printf("%d point FFT not supported", nlength);
1769       exit(0);
1770     }
1771     ixheaacd_complex_fft_p3(data_r, data_i, nlength, fft_mode, preshift);
1772   } else
1773     (*ixheaacd_complex_fft_p2)(data_r, data_i, nlength, fft_mode, preshift);
1774 
1775   return;
1776 }
1777