.text
.p2align 2
.global ixheaacd_complex_ifft_p2_asm

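@ ixheaacd_complex_ifft_p2_asm: fixed-point complex IFFT for power-of-2
@ lengths, built from radix-4 butterfly stages plus one final radix-2
@ stage when log2(npoints) is odd. From the code, r0 appears to hold the
@ twiddle table, r1 npoints, r2 the input buffer (read bit-reversed) and
@ r3 the working/output buffer; the saved r0-r3 slots and the 0x28-byte
@ scratch area are addressed through sp offsets such as [sp, #0x2c] for
@ npoints.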
ixheaacd_complex_ifft_p2_asm:
    STMFD           sp!, {r0-r12, lr}
    SUB             sp, sp, #0x28
    LDR             r0, [sp, #0x2c]
    @LDR      r12,[sp,#0x5c+4]
    EOR             r0, r0, r0, ASR #31
    CLZ             r0, r0
    SUB             r12, r0, #16        @dig_rev_shift = norm32(npoints) + 1 - 16@
    SUB             r0, r0, #1
    RSB             r0, r0, #0x1e
    AND             r1, r0, #1
    STR             r1, [sp, #0x14]
    MOV             r1, r0, ASR #1
    LDR             r0, [sp, #0x2c]     @npoints
    STR             r1, [sp, #-4]!
    MOV             lr, r0, LSL #1      @(npoints >> 1) * 4
    MOV             r0, #0

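@ FIRST_STAGE_R4 combines the bit-reversal reordering with the first
@ radix-4 butterfly. The swap ladder below exchanges adjacent 2-bit
@ pairs, then nibbles, then (via the BIC variant) bytes, and shifts the
@ reversed word right by dig_rev_shift (r12). For reference, a textbook
@ C sketch of the same ladder:
@     r = ((i & 0x33333333) << 2) | ((i >> 2) & 0x33333333);
@     r = ((r & 0x0F0F0F0F) << 4) | ((r >> 4) & 0x0F0F0F0F);
@     r = ((r & 0x00FF00FF) << 8) | ((r >> 8) & 0x00FF00FF);
@     index = r >> dig_rev_shift;
@ The conditional ADDNE/BICNE pair rounds the index up to an even value
@ when the odd-log2 flag saved at [sp, #0x14] is set.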
FIRST_STAGE_R4:
    LDR             r4, =0x33333333
    LDR             r5, =0x0F0F0F0F
    AND             r6, r4, r0
    AND             r7, r4, r0, LSR #2
    ORR             r4, r7, r6, LSL #2
    AND             r6, r5, r4
    AND             r7, r5, r4, LSR #4
    ORR             r4, r7, r6, LSL #4
    BIC             r6, r4, #0x0000FF00
    BIC             r7, r4, #0x00FF0000
    MOV             r7, r7, LSR #8
    ORR             r4, r7, r6, LSL #8
    LDR             r5, [sp, #0x18]
    MOV             r10, r4, LSR r12
    CMP             r5, #0
    ADDNE           r10, r10, #1
    BICNE           r10, r10, #1

    ADD             r1, r2, r10, LSL #2
    LDRD            r4, [r1]            @r4=x0r,  r5=x0i
    ADD             r1, r1, lr
    LDRD            r8, [r1]            @r8=x1r,  r9=x1i
    ADD             r1, r1, lr
    LDRD            r6, [r1]            @r6=x2r,  r7=x2i
    ADD             r1, r1, lr
    LDRD            r10, [r1]           @r10=x3r, r11=x3i
    ADD             r0, r0, #4
    CMP             r0, lr, ASR #1

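@ In-place radix-4 decimation-in-time butterfly on the four loaded
@ points. Note the "a + b, then a - (b << 1)" pairs: they produce the
@ sum and difference of two values without a scratch register. The x3
@ leg is rotated by +j (x2r -= x3i, x2i += x3r), consistent with the
@ inverse-transform direction.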
    ADD             r4, r4, r6          @x0r = x0r + x2r@
    ADD             r5, r5, r7          @x0i = x0i + x2i@
    SUB             r6, r4, r6, lsl#1   @x2r = x0r - (x2r << 1)@
    SUB             r7, r5, r7, lsl#1   @x2i = x0i - (x2i << 1)@
    ADD             r8, r8, r10         @x1r = x1r + x3r@
    ADD             r9, r9, r11         @x1i = x1i + x3i@
    SUB             r1, r8, r10, lsl#1  @x3r = x1r - (x3r << 1)@
    SUB             r11, r9, r11, lsl#1 @x3i = x1i - (x3i << 1)@

    ADD             r4, r4, r8          @x0r = x0r + x1r@
    ADD             r5, r5, r9          @x0i = x0i + x1i@
    SUB             r8, r4, r8, lsl#1   @x1r = x0r - (x1r << 1)@
    SUB             r9, r5, r9, lsl#1   @x1i = x0i - (x1i << 1)@
    SUB             r6, r6, r11         @x2r = x2r - x3i@
    ADD             r7, r7, r1          @x2i = x2i + x3r@
    ADD             r10, r6, r11, lsl#1 @x3i = x2r + (x3i << 1)@
    SUB             r11, r7, r1, lsl#1  @x3r = x2i - (x3r << 1)@

    STMIA           r3!, {r4-r11}
    BLT             FIRST_STAGE_R4
    LDR             r1, [sp], #4
    LDR             r0, [sp, #0x2c]
    MOV             r12, #0x40          @nodespacing = 64@
    STR             r12, [sp, #0x1c]
    LDR             r12, [sp, #0x2c]
    SUB             r3, r3, r0, LSL #3
    SUBS            r1, r1, #1
    STR             r3, [sp, #0x34]
    MOV             r4, r12, ASR #4
    MOV             r0, #4
    STR             r4, [sp, #0x18]
    STR             r1, [sp, #0x20]
    BLE             RADIX2
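@ Each OUTER_LOOP iteration is one further radix-4 stage: first the
@ butterfly group with trivial twiddles (W = 1), then the non-trivial
@ twiddle groups in the SECOND_LOOP variants. At the bottom of the
@ stage, del (r0) is scaled up by 4 while nodespacing ([sp, #0x1c]) and
@ the per-group butterfly count ([sp, #0x18]) are scaled down by 4.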
OUTER_LOOP:
    LDR             r1, [sp, #0x28]
    LDR             r12, [sp, #0x34]    @WORD32 *data = ptr_y@
    STR             r1, [sp, #0x10]
    LDR             r1, [sp, #0x18]

    MOV             r0, r0, LSL #3      @(del<<1) * 4
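@ Butterflies at twiddle index 0: W = 1, so no complex multiplies are
@ needed. r0 here is the byte stride between the four butterfly legs.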
LOOP_TRIVIAL_TWIDDLE:
    LDRD            r4, [r12]           @r4=x0r,  r5=x0i
    ADD             r12, r12, r0
    LDRD            r6, [r12]           @r6=x1r,  r7=x1i
    ADD             r12, r12, r0
    LDRD            r8, [r12]           @r8=x2r,  r9=x2i
    ADD             r12, r12, r0
    LDRD            r10, [r12]          @r10=x3r, r11=x3i

@MOV    r4,r4,ASR #1
@MOV    r5,r5,ASR #1
@MOV    r6,r6,ASR #1
@MOV    r7,r7,ASR #1
@MOV    r8,r8,ASR #1
@MOV    r9,r9,ASR #1
@MOV    r10,r10,ASR #1
@MOV    r11,r11,ASR #1

    ADD             r4, r4, r8          @x0r = x0r + x2r@
    ADD             r5, r5, r9          @x0i = x0i + x2i@
    SUB             r8, r4, r8, lsl #1  @x2r = x0r - (x2r << 1)@
    SUB             r9, r5, r9, lsl #1  @x2i = x0i - (x2i << 1)@
    ADD             r6, r6, r10         @x1r = x1r + x3r@
    ADD             r7, r7, r11         @x1i = x1i + x3i@
    SUB             r2, r6, r10, lsl #1 @x3r = x1r - (x3r << 1)@
    SUB             r11, r7, r11, lsl #1 @x3i = x1i - (x3i << 1)@

    ADD             r4, r4, r6          @x0r = x0r + x1r@
    ADD             r5, r5, r7          @x0i = x0i + x1i@
@MOV    r4,r4,ASR #1
@MOV    r5,r5,ASR #1
    SUB             r6, r4, r6, lsl #1  @x1r = x0r - (x1r << 1)@
    SUB             r7, r5, r7, lsl #1  @x1i = x0i - (x1i << 1)@
    SUB             r8, r8, r11         @x2r = x2r - x3i@
    ADD             r9, r9, r2          @x2i = x2i + x3r@
    ADD             r10, r8, r11, lsl#1 @x3i = x2r + (x3i << 1)@
    SUB             r11, r9, r2, lsl#1  @x3r = x2i - (x3r << 1)@

    STRD            r10, [r12]          @r10=x3r, r11=x3i
    SUB             r12, r12, r0
    STRD            r6, [r12]           @r6=x1r,  r7=x1i
    SUB             r12, r12, r0
    STRD            r8, [r12]           @r8=x2r,  r9=x2i
    SUB             r12, r12, r0
    STRD            r4, [r12]           @r4=x0r,  r5=x0i
    ADD             r12, r12, r0, lsl #2

    SUBS            r1, r1, #1
    BNE             LOOP_TRIVIAL_TWIDDLE

    MOV             r0, r0, ASR #3
    LDR             r4, [sp, #0x1c]
    LDR             r3, [sp, #0x34]
    MUL             r1, r0, r4
    ADD             r12, r3, #8
    STR             r1, [sp, #0x24]
    MOV             r3, r1, ASR #2
    ADD             r3, r3, r1, ASR #3
    SUB             r3, r3, r1, ASR #4
    ADD             r3, r3, r1, ASR #5
    SUB             r3, r3, r1, ASR #6
    ADD             r3, r3, r1, ASR #7
    SUB             r3, r3, r1, ASR #8
    STR             r3, [sp, #-4]!
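@ The shift-add ladder above approximates r1/3: 1/4 + 1/8 - 1/16 + 1/32
@ - 1/64 + 1/128 - 1/256 = 85/256 ~ 0.332. With r1 = del * nodespacing,
@ the stored bound splits the twiddle walk among the SECOND_LOOP
@ variants at roughly 1/3, 1/2, 2/3 and all of that range.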
SECOND_LOOP:
    LDR             r3, [sp, #0x10+4]
    LDR             r14, [sp, #0x18+4]
    MOV             r0, r0, LSL #3      @(del<<1) * 4
    LDR             r1, [r3, r4, LSL #3]! @ w1h = *(twiddles + 2*j)@
    LDR             r2, [r3, #4]        @w1l = *(twiddles + 2*j + 1)@
    LDR             r5, [r3, r4, LSL #3]! @w2h = *(twiddles + 2*(j<<1))@
    LDR             r6, [r3, #4]        @w2l = *(twiddles + 2*(j<<1) + 1)@
    LDR             r7, [r3, r4, LSL #3]! @w3h = *(twiddles + 2*j + 2*(j<<1))@
    LDR             r8, [r3, #4]        @w3l = *(twiddles + 2*j + 2*(j<<1) + 1)@

    STR             r4, [sp, #8+4]
    STR             r1, [sp, #-4]
    STR             r2, [sp, #-8]
    STR             r5, [sp, #-12]
    STR             r6, [sp, #-16]
    STR             r7, [sp, #-20]
    STR             r8, [sp, #-24]

RADIX4_BFLY:

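@ Each SMULL / LSR #31 / ORR ..., LSL #1 triple below is a fractional
@ Q31 multiply keeping bits 62..31 of the 64-bit product; in C terms
@ (sketch): (WORD32)(((WORD64)x * w) >> 31). Four such products per leg
@ are combined into the real and imaginary parts of the twiddle
@ multiply.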
    LDRD            r6, [r12, r0]!      @r6=x1r,  r7=x1i
    LDRD            r8, [r12, r0]!      @r8=x2r,  r9=x2i
    LDRD            r10, [r12, r0]      @r10=x3r, r11=x3i
    SUBS            r14, r14, #1

    LDR             r1, [sp, #-4]
    LDR             r2, [sp, #-8]

    SMULL           r3, r4, r6, r2      @ixheaacd_mult32(x1r,w1l)
    LSR             r3, r3, #31
    ORR             r4, r3, r4, LSL#1
    SMULL           r3, r6, r6, r1      @mult32x16hin32(x1r,W1h)
    LSR             r3, r3, #31
    ORR             r6, r3, r6, LSL#1
    SMULL           r3, r5, r7, r1      @mult32x16hin32(x1i,W1h)
    LSR             r3, r3, #31
    ORR             r5, r3, r5, LSL#1
    SMULL           r3, r7, r7, r2      @ixheaacd_mac32(ixheaacd_mult32(x1r,w1h) ,x1i,w1l)
    LSR             r3, r3, #31
    ORR             r7, r3, r7, LSL#1
    SUB             r7, r7, r6
    ADD             r6, r4, r5          @

    LDR             r1, [sp, #-12]
    LDR             r2, [sp, #-16]

    SMULL           r3, r4, r8, r2      @ixheaacd_mult32(x2r,w2l)
    LSR             r3, r3, #31
    ORR             r4, r3, r4, LSL#1
    SMULL           r3, r8, r8, r1      @mult32x16hin32(x2r,W2h)
    LSR             r3, r3, #31
    ORR             r8, r3, r8, LSL#1
    SMULL           r3, r5, r9, r1      @mult32x16hin32(x2i,W2h)
    LSR             r3, r3, #31
    ORR             r5, r3, r5, LSL#1
    SMULL           r3, r9, r9, r2      @ixheaacd_mac32(ixheaacd_mult32(x2r,w2h) ,x2i,w2l)
    LSR             r3, r3, #31
    ORR             r9, r3, r9, LSL#1
    SUB             r9, r9, r8
    ADD             r8, r4, r5          @

    LDR             r1, [sp, #-20]
    LDR             r2, [sp, #-24]

    SMULL           r3, r4, r10, r2     @ixheaacd_mult32(x3r,w3l)
    LSR             r3, r3, #31
    ORR             r4, r3, r4, LSL#1
    SMULL           r3, r10, r10, r1    @mult32x16hin32(x3r,W3h)
    LSR             r3, r3, #31
    ORR             r10, r3, r10, LSL#1
    SMULL           r3, r5, r11, r1     @mult32x16hin32(x3i,W3h)
    LSR             r3, r3, #31
    ORR             r5, r3, r5, LSL#1
    SMULL           r3, r11, r11, r2    @ixheaacd_mac32(ixheaacd_mult32(x3r,w3h) ,x3i,w3l)
    LSR             r3, r3, #31
    ORR             r11, r3, r11, LSL#1
    SUB             r11, r11, r10
    ADD             r10, r4, r5         @

    @SUB   r12,r12,r0,lsl #1
    @LDRD     r4,[r12]      @r4=x0r,  r5=x0i
    LDR             r4, [r12, -r0, lsl #1]! @
    LDR             r5, [r12, #4]


    ADD             r4, r8, r4          @x0r = x0r + x2r@
    ADD             r5, r9, r5          @x0i = x0i + x2i@
    SUB             r8, r4, r8, lsl#1   @x2r = x0r - (x2r << 1)@
    SUB             r9, r5, r9, lsl#1   @x2i = x0i - (x2i << 1)@
    ADD             r6, r6, r10         @x1r = x1r + x3r@
    ADD             r7, r7, r11         @x1i = x1i + x3i@
    SUB             r10, r6, r10, lsl#1 @x3r = x1r - (x3r << 1)@
    SUB             r11, r7, r11, lsl#1 @x3i = x1i - (x3i << 1)@

    ADD             r4, r4, r6          @x0r = x0r + x1r@
    ADD             r5, r5, r7          @x0i = x0i + x1i@
    SUB             r6, r4, r6, lsl#1   @x1r = x0r - (x1r << 1)@
    SUB             r7, r5, r7, lsl#1   @x1i = x0i - (x1i << 1)@
    STRD            r4, [r12]           @r4=x0r,  r5=x0i
    ADD             r12, r12, r0

    SUB             r8, r8, r11         @x2r = x2r - x3i@
    ADD             r9, r9, r10         @x2i = x2i + x3r@
    ADD             r4, r8, r11, lsl#1  @x3i = x2r + (x3i << 1)@
    SUB             r5, r9, r10, lsl#1  @x3r = x2i - (x3r << 1)@

    STRD            r8, [r12]           @r8=x2r,  r9=x2i
    ADD             r12, r12, r0
    STRD            r6, [r12]           @r6=x1r,  r7=x1i
    ADD             r12, r12, r0
    STRD            r4, [r12]           @r4=x3r,  r5=x3i
    ADD             r12, r12, r0

    BNE             RADIX4_BFLY
    MOV             r0, r0, ASR #3

    LDR             r1, [sp, #0x2c+4]
    LDR             r4, [sp, #8+4]
    SUB             r1, r12, r1, LSL #3
    LDR             r6, [sp, #0x1c+4]
    ADD             r12, r1, #8
    LDR             r7, [sp, #0]
    ADD             r4, r4, r6
    CMP             r4, r7
    BLE             SECOND_LOOP

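@ SECOND_LOOP_2: j has grown far enough that the w3 index (3*j) runs
@ past the current twiddle segment, so the base pointer is pulled back
@ by 2048 bytes (512 words) before the w3 load, and the third leg of
@ the butterfly swaps its final combine step to match.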
SECOND_LOOP_2:
    LDR             r3, [sp, #0x10+4]
    LDR             r14, [sp, #0x18+4]
    MOV             r0, r0, LSL #3      @(del<<1) * 4

    LDR             r1, [r3, r4, LSL #3]! @ w1h = *(twiddles + 2*j)@
    LDR             r2, [r3, #4]        @w1l = *(twiddles + 2*j + 1)@
    LDR             r5, [r3, r4, LSL #3]! @w2h = *(twiddles + 2*(j<<1))@
    LDR             r6, [r3, #4]        @w2l = *(twiddles + 2*(j<<1) + 1)@
    SUB             r3, r3, #2048       @ 512 * 4
    LDR             r7, [r3, r4, LSL #3]! @w3h = *(twiddles + 2*j + 2*(j<<1))@
    LDR             r8, [r3, #4]        @w3l = *(twiddles + 2*j + 2*(j<<1) + 1)@

    STR             r4, [sp, #8+4]

    STR             r1, [sp, #-4]
    STR             r2, [sp, #-8]
    STR             r5, [sp, #-12]
    STR             r6, [sp, #-16]
    STR             r7, [sp, #-20]
    STR             r8, [sp, #-24]

RADIX4_BFLY_2:
    LDRD            r6, [r12, r0]!      @r6=x1r,  r7=x1i
    LDRD            r8, [r12, r0]!      @r8=x2r,  r9=x2i
    LDRD            r10, [r12, r0]      @r10=x3r, r11=x3i
    SUBS            r14, r14, #1
    LDR             r1, [sp, #-4]
    LDR             r2, [sp, #-8]

    SMULL           r3, r4, r6, r2      @ixheaacd_mult32(x1r,w1l)
    LSR             r3, r3, #31
    ORR             r4, r3, r4, LSL#1
    SMULL           r3, r6, r6, r1      @mult32x16hin32(x1r,W1h)
    LSR             r3, r3, #31
    ORR             r6, r3, r6, LSL#1
    SMULL           r3, r5, r7, r1      @mult32x16hin32(x1i,W1h)
    LSR             r3, r3, #31
    ORR             r5, r3, r5, LSL#1
    SMULL           r3, r7, r7, r2      @ixheaacd_mac32(ixheaacd_mult32(x1r,w1h) ,x1i,w1l)
    LSR             r3, r3, #31
    ORR             r7, r3, r7, LSL#1
    SUB             r7, r7, r6
    ADD             r6, r4, r5          @

    LDR             r1, [sp, #-12]
    LDR             r2, [sp, #-16]

    SMULL           r3, r4, r8, r2      @ixheaacd_mult32(x2r,w2l)
    LSR             r3, r3, #31
    ORR             r4, r3, r4, LSL#1
    SMULL           r3, r8, r8, r1      @mult32x16hin32(x2r,W2h)
    LSR             r3, r3, #31
    ORR             r8, r3, r8, LSL#1
    SMULL           r3, r5, r9, r1      @mult32x16hin32(x2i,W2h)
    LSR             r3, r3, #31
    ORR             r5, r3, r5, LSL#1
    SMULL           r3, r9, r9, r2      @ixheaacd_mac32(ixheaacd_mult32(x2r,w2h) ,x2i,w2l)
    LSR             r3, r3, #31
    ORR             r9, r3, r9, LSL#1
    SUB             r9, r9, r8
    ADD             r8, r4, r5          @

    LDR             r1, [sp, #-20]
    LDR             r2, [sp, #-24]

    SMULL           r3, r4, r10, r2     @ixheaacd_mult32(x3r,w3l)
    LSR             r3, r3, #31
    ORR             r4, r3, r4, LSL#1
    SMULL           r3, r10, r10, r1    @mult32x16hin32(x3r,W3h)
    LSR             r3, r3, #31
    ORR             r10, r3, r10, LSL#1
    SMULL           r3, r5, r11, r1     @mult32x16hin32(x3i,W3h)
    LSR             r3, r3, #31
    ORR             r5, r3, r5, LSL#1
    SMULL           r3, r11, r11, r2    @ixheaacd_mac32(ixheaacd_mult32(x3r,w3h) ,x3i,w3l)
    LSR             r3, r3, #31
    ORR             r11, r3, r11, LSL#1
    SUB             r10, r10, r11
    ADD             r11, r5, r4         @

    @SUB    r12,r12,r0,lsl #1
    @LDRD     r4,[r12]      @r4=x0r,  r5=x0i
    LDR             r4, [r12, -r0, lsl #1]! @
    LDR             r5, [r12, #4]


    ADD             r4, r8, r4          @x0r = x0r + x2r@
    ADD             r5, r9, r5          @x0i = x0i + x2i@
    SUB             r8, r4, r8, lsl#1   @x2r = x0r - (x2r << 1)@
    SUB             r9, r5, r9, lsl#1   @x2i = x0i - (x2i << 1)@
    ADD             r6, r6, r10         @x1r = x1r + x3r@
    ADD             r7, r7, r11         @x1i = x1i + x3i@
    SUB             r10, r6, r10, lsl#1 @x3r = x1r - (x3r << 1)@
    SUB             r11, r7, r11, lsl#1 @x3i = x1i - (x3i << 1)@

    ADD             r4, r4, r6          @x0r = x0r + x1r@
    ADD             r5, r5, r7          @x0i = x0i + x1i@
    SUB             r6, r4, r6, lsl#1   @x1r = x0r - (x1r << 1)@
    SUB             r7, r5, r7, lsl#1   @x1i = x0i - (x1i << 1)@
    STRD            r4, [r12]           @r4=x0r,  r5=x0i
    ADD             r12, r12, r0

    SUB             r8, r8, r11         @x2r = x2r - x3i@
    ADD             r9, r9, r10         @x2i = x2i + x3r@
    ADD             r4, r8, r11, lsl#1  @x3i = x2r + (x3i << 1)@
    SUB             r5, r9, r10, lsl#1  @x3r = x2i - (x3r << 1)@

    STRD            r8, [r12]           @r8=x2r,  r9=x2i
    ADD             r12, r12, r0
    STRD            r6, [r12]           @r6=x1r,  r7=x1i
    ADD             r12, r12, r0
    STRD            r4, [r12]           @r4=x3r,  r5=x3i
    ADD             r12, r12, r0

    BNE             RADIX4_BFLY_2
    MOV             r0, r0, ASR #3

    LDR             r1, [sp, #0x2c+4]
    LDR             r4, [sp, #8+4]
    SUB             r1, r12, r1, LSL #3
    LDR             r6, [sp, #0x1c+4]
    ADD             r12, r1, #8
    LDR             r7, [sp, #0x24+4]
    ADD             r4, r4, r6
    CMP             r4, r7, ASR #1
    BLE             SECOND_LOOP_2
    LDR             r7, [sp, #0]
    CMP             r4, r7, LSL #1
    BGT             SECOND_LOOP_4

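@ SECOND_LOOP_3: both the w2 index (2*j) and the w3 index have crossed
@ the segment boundary, so the rebase happens before the w2 load; the
@ second and third legs swap their combine steps accordingly.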
SECOND_LOOP_3:
    LDR             r3, [sp, #0x10+4]
    LDR             r14, [sp, #0x18+4]
    MOV             r0, r0, LSL #3      @(del<<1) * 4

    LDR             r1, [r3, r4, LSL #3]! @ w1h = *(twiddles + 2*j)@
    LDR             r2, [r3, #4]        @w1l = *(twiddles + 2*j + 1)@
    SUB             r3, r3, #2048       @ 512 * 4
    LDR             r5, [r3, r4, LSL #3]! @w2h = *(twiddles + 2*(j<<1))@
    LDR             r6, [r3, #4]        @w2l = *(twiddles + 2*(j<<1) + 1)@
    LDR             r7, [r3, r4, LSL #3]! @w3h = *(twiddles + 2*j + 2*(j<<1))@
    LDR             r8, [r3, #4]        @w3l = *(twiddles + 2*j + 2*(j<<1) + 1)@

    STR             r4, [sp, #8+4]
    STR             r1, [sp, #-4]
    STR             r2, [sp, #-8]
    STR             r5, [sp, #-12]
    STR             r6, [sp, #-16]
    STR             r7, [sp, #-20]
    STR             r8, [sp, #-24]


RADIX4_BFLY_3:
    LDRD            r6, [r12, r0]!      @r6=x1r,  r7=x1i
    LDRD            r8, [r12, r0]!      @r8=x2r,  r9=x2i
    LDRD            r10, [r12, r0]      @r10=x3r, r11=x3i
    SUBS            r14, r14, #1

    LDR             r1, [sp, #-4]
    LDR             r2, [sp, #-8]

    SMULL           r3, r4, r6, r2      @ixheaacd_mult32(x1r,w1l)
    LSR             r3, r3, #31
    ORR             r4, r3, r4, LSL#1
    SMULL           r3, r6, r6, r1      @mult32x16hin32(x1r,W1h)
    LSR             r3, r3, #31
    ORR             r6, r3, r6, LSL#1
    SMULL           r3, r5, r7, r1      @mult32x16hin32(x1i,W1h)
    LSR             r3, r3, #31
    ORR             r5, r3, r5, LSL#1
    SMULL           r3, r7, r7, r2      @ixheaacd_mac32(ixheaacd_mult32(x1r,w1h) ,x1i,w1l)
    LSR             r3, r3, #31
    ORR             r7, r3, r7, LSL#1
    SUB             r7, r7, r6
    ADD             r6, r4, r5          @

    LDR             r1, [sp, #-12]
    LDR             r2, [sp, #-16]

    SMULL           r3, r4, r8, r2      @ixheaacd_mult32(x2r,w2l)
    LSR             r3, r3, #31
    ORR             r4, r3, r4, LSL#1
    SMULL           r3, r8, r8, r1      @mult32x16hin32(x2r,W2h)
    LSR             r3, r3, #31
    ORR             r8, r3, r8, LSL#1
    SMULL           r3, r5, r9, r1      @mult32x16hin32(x2i,W2h)
    LSR             r3, r3, #31
    ORR             r5, r3, r5, LSL#1
    SMULL           r3, r9, r9, r2      @ixheaacd_mac32(ixheaacd_mult32(x2r,w2h) ,x2i,w2l)
    LSR             r3, r3, #31
    ORR             r9, r3, r9, LSL#1
    SUB             r8, r8, r9
    ADD             r9, r5, r4          @

    LDR             r1, [sp, #-20]
    LDR             r2, [sp, #-24]

    SMULL           r3, r4, r10, r2     @ixheaacd_mult32(x3r,w3l)
    LSR             r3, r3, #31
    ORR             r4, r3, r4, LSL#1
    SMULL           r3, r10, r10, r1    @mult32x16hin32(x3r,W3h)
    LSR             r3, r3, #31
    ORR             r10, r3, r10, LSL#1
    SMULL           r3, r5, r11, r1     @mult32x16hin32(x3i,W3h)
    LSR             r3, r3, #31
    ORR             r5, r3, r5, LSL#1
    SMULL           r3, r11, r11, r2    @ixheaacd_mac32(ixheaacd_mult32(x3r,w3h) ,x3i,w3l)
    LSR             r3, r3, #31
    ORR             r11, r3, r11, LSL#1
    SUB             r10, r10, r11
    ADD             r11, r5, r4         @

    @SUB    r12,r12,r0,lsl #1
    @LDRD     r4,[r12]      @r4=x0r,  r5=x0i
    LDR             r4, [r12, -r0, lsl #1]! @
    LDR             r5, [r12, #4]


    ADD             r4, r8, r4          @x0r = x0r + x2r@
    ADD             r5, r9, r5          @x0i = x0i + x2i@
    SUB             r8, r4, r8, lsl#1   @x2r = x0r - (x2r << 1)@
    SUB             r9, r5, r9, lsl#1   @x2i = x0i - (x2i << 1)@
    ADD             r6, r6, r10         @x1r = x1r + x3r@
    ADD             r7, r7, r11         @x1i = x1i + x3i@
    SUB             r10, r6, r10, lsl#1 @x3r = x1r - (x3r << 1)@
    SUB             r11, r7, r11, lsl#1 @x3i = x1i - (x3i << 1)@

    ADD             r4, r4, r6          @x0r = x0r + x1r@
    ADD             r5, r5, r7          @x0i = x0i + x1i@
    SUB             r6, r4, r6, lsl#1   @x1r = x0r - (x1r << 1)@
    SUB             r7, r5, r7, lsl#1   @x1i = x0i - (x1i << 1)@
    STRD            r4, [r12]           @r4=x0r,  r5=x0i
    ADD             r12, r12, r0

    SUB             r8, r8, r11         @x2r = x2r - x3i@
    ADD             r9, r9, r10         @x2i = x2i + x3r@
    ADD             r4, r8, r11, lsl#1  @x3i = x2r + (x3i << 1)@
    SUB             r5, r9, r10, lsl#1  @x3r = x2i - (x3r << 1)@

    STRD            r8, [r12]           @r8=x2r,  r9=x2i
    ADD             r12, r12, r0
    STRD            r6, [r12]           @r6=x1r,  r7=x1i
    ADD             r12, r12, r0
    STRD            r4, [r12]           @r4=x3r,  r5=x3i
    ADD             r12, r12, r0

    BNE             RADIX4_BFLY_3
    MOV             r0, r0, ASR #3

    LDR             r1, [sp, #0x2c+4]
    LDR             r4, [sp, #8+4]
    SUB             r1, r12, r1, LSL #3
    LDR             r6, [sp, #0x1c+4]
    ADD             r12, r1, #8
    LDR             r7, [sp, #0]
    ADD             r4, r4, r6
    CMP             r4, r7, LSL #1
    BLE             SECOND_LOOP_3

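@ SECOND_LOOP_4: the final stretch, where w2 and w3 each need their own
@ 2048-byte rebase; the RSB after the third product group negates the
@ wrapped w3 term.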
SECOND_LOOP_4:
    LDR             r3, [sp, #0x10+4]
    LDR             r14, [sp, #0x18+4]
    MOV             r0, r0, LSL #3      @(del<<1) * 4

    LDR             r1, [r3, r4, LSL #3]! @ w1h = *(twiddles + 2*j)@
    LDR             r2, [r3, #4]        @w1l = *(twiddles + 2*j + 1)@
    SUB             r3, r3, #2048       @ 512 * 4
    LDR             r5, [r3, r4, LSL #3]! @w2h = *(twiddles + 2*(j<<1))@
    LDR             r6, [r3, #4]        @w2l = *(twiddles + 2*(j<<1) + 1)@
    SUB             r3, r3, #2048       @ 512 * 4
    LDR             r7, [r3, r4, LSL #3]! @w3h = *(twiddles + 2*j + 2*(j<<1))@
    LDR             r8, [r3, #4]        @w3l = *(twiddles + 2*j + 2*(j<<1) + 1)@


    STR             r4, [sp, #8+4]
    STR             r1, [sp, #-4]
    STR             r2, [sp, #-8]
    STR             r5, [sp, #-12]
    STR             r6, [sp, #-16]
    STR             r7, [sp, #-20]
    STR             r8, [sp, #-24]

RADIX4_BFLY_4:
    LDRD            r6, [r12, r0]!      @r6=x1r,  r7=x1i
    LDRD            r8, [r12, r0]!      @r8=x2r,  r9=x2i
    LDRD            r10, [r12, r0]      @r10=x3r, r11=x3i
    SUBS            r14, r14, #1

    LDR             r1, [sp, #-4]
    LDR             r2, [sp, #-8]

    SMULL           r3, r4, r6, r2      @ixheaacd_mult32(x1r,w1l)
    LSR             r3, r3, #31
    ORR             r4, r3, r4, LSL#1
    SMULL           r3, r6, r6, r1      @mult32x16hin32(x1r,W1h)
    LSR             r3, r3, #31
    ORR             r6, r3, r6, LSL#1
    SMULL           r3, r5, r7, r1      @mult32x16hin32(x1i,W1h)
    LSR             r3, r3, #31
    ORR             r5, r3, r5, LSL#1
    SMULL           r3, r7, r7, r2      @ixheaacd_mac32(ixheaacd_mult32(x1r,w1h) ,x1i,w1l)
    LSR             r3, r3, #31
    ORR             r7, r3, r7, LSL#1
    SUB             r7, r7, r6
    ADD             r6, r4, r5          @

    LDR             r1, [sp, #-12]
    LDR             r2, [sp, #-16]

    SMULL           r3, r4, r8, r2      @ixheaacd_mult32(x2r,w2l)
    LSR             r3, r3, #31
    ORR             r4, r3, r4, LSL#1
    SMULL           r3, r8, r8, r1      @mult32x16hin32(x2r,W2h)
    LSR             r3, r3, #31
    ORR             r8, r3, r8, LSL#1
    SMULL           r3, r5, r9, r1      @mult32x16hin32(x2i,W2h)
    LSR             r3, r3, #31
    ORR             r5, r3, r5, LSL#1
    SMULL           r3, r9, r9, r2      @ixheaacd_mac32(ixheaacd_mult32(x2r,w2h) ,x2i,w2l)
    LSR             r3, r3, #31
    ORR             r9, r3, r9, LSL#1
    SUB             r8, r8, r9
    ADD             r9, r5, r4          @

    LDR             r1, [sp, #-20]
    LDR             r2, [sp, #-24]

    SMULL           r3, r4, r10, r2     @ixheaacd_mult32(x3r,w3l)
    LSR             r3, r3, #31
    ORR             r4, r3, r4, LSL#1
    SMULL           r3, r10, r10, r1    @mult32x16hin32(x3r,W3h)
    LSR             r3, r3, #31
    ORR             r10, r3, r10, LSL#1
    SMULL           r3, r5, r11, r1     @mult32x16hin32(x3i,W3h)
    LSR             r3, r3, #31
    ORR             r5, r3, r5, LSL#1
    SMULL           r3, r11, r11, r2    @ixheaacd_mac32(ixheaacd_mult32(x3r,w3h) ,x3i,w3l)
    LSR             r3, r3, #31
    ORR             r11, r3, r11, LSL#1
    SUB             r11, r11, r10
    ADD             r10, r5, r4         @
    RSB             r10, r10, #0

    @SUB    r12,r12,r0,lsl #1
    @LDRD     r4,[r12]      @r4=x0r,  r5=x0i
    LDR             r4, [r12, -r0, lsl #1]! @
    LDR             r5, [r12, #4]


    ADD             r4, r8, r4          @x0r = x0r + x2r@
    ADD             r5, r9, r5          @x0i = x0i + x2i@
    SUB             r8, r4, r8, lsl#1   @x2r = x0r - (x2r << 1)@
    SUB             r9, r5, r9, lsl#1   @x2i = x0i - (x2i << 1)@
    ADD             r6, r6, r10         @x1r = x1r + x3r@
    SUB             r7, r7, r11         @x1i = x1i - x3i@
    SUB             r10, r6, r10, lsl#1 @x3r = x1r - (x3r << 1)@
    ADD             r11, r7, r11, lsl#1 @x3i = x1i + (x3i << 1)@

    ADD             r4, r4, r6          @x0r = x0r + x1r@
    ADD             r5, r5, r7          @x0i = x0i + x1i@
    SUB             r6, r4, r6, lsl#1   @x1r = x0r - (x1r << 1)@
    SUB             r7, r5, r7, lsl#1   @x1i = x0i - (x1i << 1)@
    STRD            r4, [r12]           @r4=x0r,  r5=x0i
    ADD             r12, r12, r0

    SUB             r8, r8, r11         @x2r = x2r - x3i@
    ADD             r9, r9, r10         @x2i = x2i + x3r@
    ADD             r4, r8, r11, lsl#1  @x3i = x2r + (x3i << 1)@
    SUB             r5, r9, r10, lsl#1  @x3r = x2i - (x3r << 1)@

    STRD            r8, [r12]           @r8=x2r,  r9=x2i
    ADD             r12, r12, r0
    STRD            r6, [r12]           @r6=x1r,  r7=x1i
    ADD             r12, r12, r0
    STRD            r4, [r12]           @r4=x3r,  r5=x3i
    ADD             r12, r12, r0

    BNE             RADIX4_BFLY_4
    MOV             r0, r0, ASR #3

    LDR             r1, [sp, #0x2c+4]
    LDR             r4, [sp, #8+4]
    SUB             r1, r12, r1, LSL #3
    LDR             r6, [sp, #0x1c+4]
    ADD             r12, r1, #8
    LDR             r7, [sp, #0x24+4]
    ADD             r4, r4, r6
    CMP             r4, r7
    BLT             SECOND_LOOP_4
    ADD             sp, sp, #4

    LDR             r1, [sp, #0x1c]
    MOV             r0, r0, LSL #2
    MOV             r1, r1, ASR #2
    STR             r1, [sp, #0x1c]
    LDR             r1, [sp, #0x18]
    MOV             r1, r1, ASR #2
    STR             r1, [sp, #0x18]
    LDR             r1, [sp, #0x20]
    SUBS            r1, r1, #1
    STR             r1, [sp, #0x20]
    BGT             OUTER_LOOP

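@ Final radix-2 stage, taken only when log2(npoints) is odd (the flag
@ computed at entry and saved at [sp, #0x14]). Both halves pre-scale
@ the butterfly inputs by 1/2 to avoid overflow.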
RADIX2:
    LDR             r1, [sp, #0x14]
    CMP             r1, #0
    BEQ             EXIT
    LDR             r12, [sp, #0x1c]
    LDR             r1, [sp, #0x28]
    CMP             r12, #0
    LDRNE           r12, [sp, #0x1c]
    MOVEQ           r4, #1
    MOVNE           r4, r12, LSL #1
    MOVS            r3, r0
    BEQ             EXIT

    MOV             r3, r3, ASR #1
    LDR             r5, [sp, #0x34]
    MOV             r0, r0, LSL #3      @(del<<1) * 4
    STR             r1, [sp, #-4]
RADIX2_BFLY:
    LDR             r1, [sp, #-4]
    LDRD            r6, [r5]            @r6=x0r, r7=x0i
    ADD             r5, r5, r0
    LDRD            r8, [r5]            @r8=x1r, r9=x1i

    LDR             r2, [r1]
    SUBS            r3, r3, #1


    SMULL           r1, r11, r8, r2     @mult32x16hin32(x1r,W1h)
    LSR             r1, r1, #31
    ORR             r11, r1, r11, LSL#1
    SMULL           r1, r10, r9, r2     @mult32x16hin32(x1i,W1h)
    LSR             r1, r1, #31
    ORR             r10, r1, r10, LSL#1


    LDR             r1, [sp, #-4]
    LDR             r2, [r1, #4]
    ADD             r1, r1, r4, LSL #3
    STR             r1, [sp, #-4]

    SMULL           r1, r8, r8, r2      @ixheaacd_mult32(x1r,w1l)
    LSR             r1, r1, #31
    ORR             r8, r1, r8, LSL#1
    SMULL           r1, r9, r9, r2      @ixheaacd_mult32(x1i,w1l)
    LSR             r1, r1, #31
    ORR             r9, r1, r9, LSL#1

    ADD             r8, r8, r10
    SUB             r9, r9, r11

    ASR             r8, r8, #1
    ASR             r6, r6, #1
    ASR             r9, r9, #1
    ASR             r7, r7, #1
    ADD             r10, r8, r6         @(x0r/2) + (x1r/2)@
    ADD             r11, r9, r7         @(x0i/2) + (x1i/2)@
    SUB             r8, r6, r8          @(x0r/2) - (x1r/2)@
    SUB             r9, r7, r9          @(x0i/2) - (x1i/2)@

    STRD            r8, [r5]
    SUB             r5, r5, r0
    STRD            r10, [r5], #8

    BNE             RADIX2_BFLY

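@ Second half of the radix-2 stage: the same twiddle walk, but the
@ products are recombined with the roles of w1h and w1l exchanged,
@ matching the wrapped half of the twiddle table.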
    LDR             r1, [sp, #0x28]
    MOV             r3, r0, ASR #4
    STR             r1, [sp, #-4]
RADIX2_BFLY_2:
    LDR             r1, [sp, #-4]
    LDRD            r6, [r5]            @r6=x0r, r7=x0i
    ADD             r5, r5, r0
    LDRD            r8, [r5]            @r8=x1r, r9=x1i

    LDR             r2, [r1]
    SUBS            r3, r3, #1



    SMULL           r1, r11, r8, r2     @mult32x16hin32(x1r,W1h)
    LSR             r1, r1, #31
    ORR             r11, r1, r11, LSL#1
    SMULL           r1, r10, r9, r2     @mult32x16hin32(x1i,W1h)
    LSR             r1, r1, #31
    ORR             r10, r1, r10, LSL#1


    LDR             r1, [sp, #-4]
    LDR             r2, [r1, #4]
    ADD             r1, r1, r4, LSL #3
    STR             r1, [sp, #-4]

    SMULL           r1, r8, r8, r2      @ixheaacd_mult32(x1r,w1l)
    LSR             r1, r1, #31
    ORR             r8, r1, r8, LSL#1
    SMULL           r1, r9, r9, r2      @ixheaacd_mult32(x1i,w1l)
    LSR             r1, r1, #31
    ORR             r9, r1, r9, LSL#1

    SUB             r11, r11, r9
    ADD             r9, r10, r8         @
    MOV             r8, r11

    ASR             r8, r8, #1
    ASR             r6, r6, #1
    ASR             r9, r9, #1
    ASR             r7, r7, #1
    ADD             r10, r8, r6         @(x0r/2) + (x1r/2)@
    ADD             r11, r9, r7         @(x0i/2) + (x1i/2)@
    SUB             r8, r6, r8          @(x0r/2) - (x1r/2)@
    SUB             r9, r7, r9          @(x0i/2) - (x1i/2)@

    STRD            r8, [r5]
    SUB             r5, r5, r0
    STRD            r10, [r5], #8

    BNE             RADIX2_BFLY_2

EXIT:
    ADD             sp, sp, #0x38
    LDMFD           sp!, {r4-r12, pc}