/*
 * ARM NEON optimised FFT
 *
 * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2009 Naotoshi Nojiri
 * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
 *
 * This algorithm (though not any of the implementation details) is
 * based on libdjbfft by D. J. Bernstein.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

#define M_SQRT1_2 0.70710678118654752440

/*
 * Register conventions shared by the internal (non-exported) routines.
 *
 * ff_fft_calc_neon sets these up once before dispatching, and none of the
 * fftN_neon / fft_pass_neon routines in this file writes to them:
 *
 *   v28 = mppm          {-w, +w, +w, -w} with w = M_SQRT1_2
 *   v29 = pmmp          {+1, -1, -1, +1}
 *   v30 = trans4_float  byte indices for a one-register tbl
 *   v31 = trans8_float  byte indices for a two-register tbl
 *   x7  = -8            post-index step used when walking wim backwards
 *   x14 = address of ff_cos_16
 *
 * The internal routines take the data pointer z in x0 and transform it in
 * place. Only v0-v7 and v16-v31 are used as SIMD registers in this file.
 */

/* transpose d0, d1, s0, s1: d0 = trn1(s0, s1), d1 = trn2(s0, s1). */
.macro transpose d0, d1, s0, s1
        trn1            \d0, \s0, \s1
        trn2            \d1, \s0, \s1
.endm


/*
 * fft4_neon: in-place transform of the four complex floats at [x0].
 * In:    x0 = z (32 bytes are read and written back)
 * Clobb: v0-v7, v16, v17
 */
function fft4_neon
        ld1             {v0.2s,v1.2s,v2.2s,v3.2s}, [x0]

        fadd            v4.2s,  v0.2s,  v1.2s   // r0+r1,i0+i1
        fsub            v6.2s,  v0.2s,  v1.2s   // r0-r1,i0-i1

        ext             v16.8b, v2.8b,  v3.8b,  #4      // i2,r3
        ext             v17.8b, v3.8b,  v2.8b,  #4      // i3,r2

        fadd            v5.2s,  v2.2s,  v3.2s   // r2+r3,i2+i3
        fsub            v7.2s,  v16.2s, v17.2s  // i2-i3,r3-r2

        fadd            v0.2s,  v4.2s,  v5.2s
        fsub            v2.2s,  v4.2s,  v5.2s
        fadd            v1.2s,  v6.2s,  v7.2s
        fsub            v3.2s,  v6.2s,  v7.2s

        st1             {v0.2s,v1.2s,v2.2s,v3.2s}, [x0]

        ret
endfunc

/*
 * fft8_neon: in-place transform of the eight complex floats at [x0].
 * In:    x0 = z, v28 = mppm
 * Out:   x0 = z + 32 bytes (advanced by the first post-indexed load)
 * Clobb: x1, v0-v7, v16-v27
 */
function fft8_neon
        mov             x1,  x0                          // keep z for the final store
        ld1             {v0.2s, v1.2s, v2.2s, v3.2s},  [x0], #32
        ld1             {v16.2s,v17.2s,v18.2s,v19.2s}, [x0]
        ext             v22.8b, v2.8b,  v3.8b,  #4
        ext             v23.8b, v3.8b,  v2.8b,  #4
        fadd            v4.2s,  v16.2s, v17.2s           // r4+r5,i4+i5
        fadd            v5.2s,  v18.2s, v19.2s           // r6+r7,i6+i7
        fsub            v17.2s, v16.2s, v17.2s           // r4-r5,i4-i5
        fsub            v19.2s, v18.2s, v19.2s           // r6-r7,i6-i7
        rev64           v27.2s, v28.2s                   // v27 = {+w,-w}: low pair of mppm swapped
        fadd            v20.2s, v0.2s,  v1.2s            // r0+r1,i0+i1
        fadd            v21.2s, v2.2s,  v3.2s            // r2+r3,i2+i3
        fmul            v26.2s, v17.2s, v28.2s           // -a2r*w,a2i*w
        ext             v6.8b,  v4.8b,  v5.8b,  #4
        ext             v7.8b,  v5.8b,  v4.8b,  #4
        fmul            v27.2s, v19.2s, v27.2s           // a3r*w,-a3i*w
        fsub            v23.2s, v22.2s, v23.2s           // i2-i3,r3-r2
        fsub            v22.2s, v0.2s,  v1.2s            // r0-r1,i0-i1
        fmul            v24.2s, v17.2s, v28.s[1]         // a2r*w,a2i*w
        fmul            v25.2s, v19.2s, v28.s[1]         // a3r*w,a3i*w
        fadd            v0.2s,  v20.2s, v21.2s
        fsub            v2.2s,  v20.2s, v21.2s
        fadd            v1.2s,  v22.2s, v23.2s
        rev64           v26.2s, v26.2s
        rev64           v27.2s, v27.2s
        fsub            v3.2s,  v22.2s, v23.2s
        fsub            v6.2s,  v6.2s,  v7.2s
        fadd            v24.2s, v24.2s, v26.2s  // a2r+a2i,a2i-a2r   t1,t2
        fadd            v25.2s, v25.2s, v27.2s  // a3r-a3i,a3i+a3r   t5,t6
        fadd            v7.2s,  v4.2s,  v5.2s
        fsub            v18.2s, v2.2s,  v6.2s
        ext             v26.8b, v24.8b, v25.8b, #4
        ext             v27.8b, v25.8b, v24.8b, #4
        fadd            v2.2s,  v2.2s,  v6.2s
        fsub            v16.2s, v0.2s,  v7.2s
        fadd            v5.2s,  v25.2s, v24.2s
        fsub            v4.2s,  v26.2s, v27.2s
        fadd            v0.2s,  v0.2s,  v7.2s
        fsub            v17.2s, v1.2s,  v5.2s
        fsub            v19.2s, v3.2s,  v4.2s
        fadd            v3.2s,  v3.2s,  v4.2s
        fadd            v1.2s,  v1.2s,  v5.2s

        st1             {v16.2s,v17.2s,v18.2s,v19.2s}, [x0]     // z[4..7]
        st1             {v0.2s, v1.2s, v2.2s, v3.2s},  [x1]     // z[0..3]

        ret
endfunc

/*
 * fft16_neon: in-place transform of the sixteen complex floats at [x0].
 * The first 8 elements go through the same sequence as fft8_neon, the last
 * 8 through two interleaved 4-point transforms, followed by an inlined
 * combining pass using twiddles loaded from [x14].
 * In:    x0 = z, x14 = ff_cos_16, v28 = mppm, v29 = pmmp,
 *        v30 = trans4_float, v31 = trans8_float
 * Out:   x0 and x1 are both left at z + 128 bytes
 * Clobb: x1, v0-v7, v16-v27
 */
function fft16_neon
        mov             x1,  x0                          // keep z for the final stores
        ld1             {v0.2s, v1.2s, v2.2s, v3.2s},  [x0], #32
        ld1             {v16.2s,v17.2s,v18.2s,v19.2s}, [x0], #32
        ext             v22.8b, v2.8b,  v3.8b,  #4
        ext             v23.8b, v3.8b,  v2.8b,  #4
        fadd            v4.2s,  v16.2s, v17.2s           // r4+r5,i4+i5
        fadd            v5.2s,  v18.2s, v19.2s           // r6+r7,i6+i7
        fsub            v17.2s, v16.2s, v17.2s           // r4-r5,i4-i5
        fsub            v19.2s, v18.2s, v19.2s           // r6-r7,i6-i7
        rev64           v27.2s, v28.2s                   // v27 = {+w,-w}: low pair of mppm swapped
        fadd            v20.2s, v0.2s,  v1.2s            // r0+r1,i0+i1
        fadd            v21.2s, v2.2s,  v3.2s            // r2+r3,i2+i3
        fmul            v26.2s, v17.2s, v28.2s           // -a2r*w,a2i*w
        ext             v6.8b,  v4.8b,  v5.8b,  #4
        ext             v7.8b,  v5.8b,  v4.8b,  #4
        fmul            v27.2s, v19.2s, v27.2s           // a3r*w,-a3i*w
        fsub            v23.2s, v22.2s, v23.2s           // i2-i3,r3-r2
        fsub            v22.2s, v0.2s,  v1.2s            // r0-r1,i0-i1
        fmul            v24.2s, v17.2s, v28.s[1]         // a2r*w,a2i*w
        fmul            v25.2s, v19.2s, v28.s[1]         // a3r*w,a3i*w
        fadd            v0.2s,  v20.2s, v21.2s
        fsub            v2.2s,  v20.2s, v21.2s
        fadd            v1.2s,  v22.2s, v23.2s
        rev64           v26.2s, v26.2s
        rev64           v27.2s, v27.2s
        fsub            v3.2s,  v22.2s, v23.2s
        fsub            v6.2s,  v6.2s,  v7.2s
        fadd            v24.2s, v24.2s, v26.2s  // a2r+a2i,a2i-a2r   t1,t2
        fadd            v25.2s, v25.2s, v27.2s  // a3r-a3i,a3i+a3r   t5,t6
        fadd            v7.2s,  v4.2s,  v5.2s
        fsub            v18.2s, v2.2s,  v6.2s
        ld1             {v20.4s,v21.4s}, [x0], #32       // z[8..11]
        ld1             {v22.4s,v23.4s}, [x0], #32       // z[12..15]
        ext             v26.8b, v24.8b, v25.8b, #4
        ext             v27.8b, v25.8b, v24.8b, #4
        fadd            v2.2s,  v2.2s,  v6.2s
        fsub            v16.2s, v0.2s,  v7.2s
        fadd            v5.2s,  v25.2s, v24.2s
        fsub            v4.2s,  v26.2s, v27.2s
        transpose       v24.2d, v25.2d, v20.2d, v22.2d
        transpose       v26.2d, v27.2d, v21.2d, v23.2d
        fadd            v0.2s,  v0.2s,  v7.2s
        fsub            v17.2s, v1.2s,  v5.2s
        fsub            v19.2s, v3.2s,  v4.2s
        fadd            v3.2s,  v3.2s,  v4.2s
        fadd            v1.2s,  v1.2s,  v5.2s
        ext             v20.16b, v21.16b, v21.16b,  #4   // rotate v21 by one float
        ext             v21.16b, v23.16b, v23.16b,  #4   // rotate v23 by one float

        // pack the 8-point results, two complex values per register
        zip1            v0.2d,  v0.2d,  v1.2d   // {z[0],   z[1]}
        zip1            v1.2d,  v2.2d,  v3.2d   // {z[2],   z[3]}
        zip1            v2.2d,  v16.2d, v17.2d  // {z[o1],  z[o1+1]}
        zip1            v3.2d,  v18.2d, v19.2d  // {z[o1+2],z[o1+3]}

        // 2 x fft4
        transpose       v22.2d, v23.2d, v20.2d, v21.2d

        fadd            v4.4s,  v24.4s, v25.4s
        fadd            v5.4s,  v26.4s, v27.4s
        fsub            v6.4s,  v24.4s, v25.4s
        fsub            v7.4s,  v22.4s, v23.4s

        ld1             {v23.4s},  [x14]        // first four floats of ff_cos_16

        fadd            v24.4s, v4.4s,  v5.4s   // {z[o2+0],z[o2+1]}
        fsub            v26.4s, v4.4s,  v5.4s   // {z[o2+2],z[o2+3]}
        fadd            v25.4s, v6.4s,  v7.4s   // {z[o3+0],z[o3+1]}
        fsub            v27.4s, v6.4s,  v7.4s   // {z[o3+2],z[o3+3]}

        //fft_pass_neon_16
        rev64           v7.4s,  v25.4s
        fmul            v25.4s, v25.4s, v23.s[1]
        fmul            v7.4s,  v7.4s,  v29.4s
        fmla            v25.4s, v7.4s,  v23.s[3] // {t1a,t2a,t5a,t6a}

        zip1            v20.4s, v24.4s, v25.4s
        zip2            v21.4s, v24.4s, v25.4s
        fneg            v22.4s, v20.4s
        fadd            v4.4s,  v21.4s, v20.4s
        fsub            v6.4s,  v20.4s, v21.4s  // just the second half
        fadd            v5.4s,  v21.4s, v22.4s  // just the first half

        tbl             v4.16b, {v4.16b},        v30.16b // trans4_float
        tbl             v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float

        fsub            v20.4s, v0.4s,  v4.4s   // {z[o2],z[o2+1]}
        fadd            v16.4s, v0.4s,  v4.4s   // {z[0], z[1]}
        fsub            v22.4s, v2.4s,  v5.4s   // {z[o3],z[o3+1]}
        fadd            v18.4s, v2.4s,  v5.4s   // {z[o1],z[o1+1]}

//second half
        rev64           v6.4s,  v26.4s
        fmul            v26.4s, v26.4s, v23.s[2]
        rev64           v7.4s,  v27.4s
        fmul            v27.4s, v27.4s, v23.s[3]
        fmul            v6.4s,  v6.4s,  v29.4s
        fmul            v7.4s,  v7.4s,  v29.4s
        fmla            v26.4s, v6.4s,  v23.s[2] // {t1,t2,t5,t6}
        fmla            v27.4s, v7.4s,  v23.s[1] // {t1a,t2a,t5a,t6a}

        zip1            v24.4s, v26.4s, v27.4s
        zip2            v25.4s, v26.4s, v27.4s
        fneg            v26.4s, v24.4s
        fadd            v4.4s,  v25.4s, v24.4s
        fsub            v6.4s,  v24.4s, v25.4s  // just the second half
        fadd            v5.4s,  v25.4s, v26.4s  // just the first half

        tbl             v4.16b, {v4.16b},        v30.16b // trans4_float
        tbl             v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float

        fadd            v17.4s, v1.4s, v4.4s    // {z[2], z[3]}
        fsub            v21.4s, v1.4s, v4.4s    // {z[o2+2],z[o2+3]}
        fadd            v19.4s, v3.4s, v5.4s    // {z[o1+2],z[o1+3]}
        fsub            v23.4s, v3.4s, v5.4s    // {z[o3+2],z[o3+3]}

        st1             {v16.4s,v17.4s}, [x1], #32
        st1             {v18.4s,v19.4s}, [x1], #32
        st1             {v20.4s,v21.4s}, [x1], #32
        st1             {v22.4s,v23.4s}, [x1], #32

        ret
endfunc


/*
 * tbl index vector, kept in v30.
 * With a one-register table {a} it yields 32-bit lanes {a0, a2, a1, a3}.
 */
const trans4_float, align=4
        .byte    0,  1,  2,  3
        .byte    8,  9, 10, 11
        .byte    4,  5,  6,  7
        .byte   12, 13, 14, 15
endconst

/*
 * tbl index vector, kept in v31.
 * With a two-register table {a, b} it yields 32-bit lanes {b2, a0, b3, a1}.
 */
const trans8_float, align=4
        .byte   24, 25, 26, 27
        .byte    0,  1,  2,  3
        .byte   28, 29, 30, 31
        .byte    4,  5,  6,  7
endconst

/*
 * fft_pass_neon: combining pass over the four quarters of z, handling two
 * complex values from each quarter per iteration.
 *
 * In:    x0 = z
 *        x2 = n, the iteration count; must be >= 2 because the loop counter
 *             starts at n - 1 and the loop exits only when it reaches zero
 *        x4 = wre, start of the twiddle table
 *        x7 = post-index step for wim (-8, set by ff_fft_calc_neon)
 *        v29 = pmmp, v30 = trans4_float, v31 = trans8_float
 * Clobb: x0-x6, flags, v4-v7, v16, v17, v20-v27
 *
 * The first iteration is peeled: it loads a single wim value and applies a
 * twiddle only to the trn2 half (v25); the trn1 half (v24) is used as loaded.
 */
function fft_pass_neon
        sub             x6,  x2,  #1            // n - 1, loop counter
        lsl             x5,  x2,  #3            // 2 * n * sizeof FFTSample
        lsl             x1,  x2,  #4            // 2 * n * sizeof FFTComplex
        add             x5,  x4,  x5            // wim
        add             x3,  x1,  x2,  lsl #5   // 4 * n * sizeof FFTComplex
        add             x2,  x0,  x2,  lsl #5   // &z[o2]
        add             x3,  x0,  x3            // &z[o3]
        add             x1,  x0,  x1            // &z[o1]
        ld1             {v20.4s},[x2]           // {z[o2],z[o2+1]}
        ld1             {v22.4s},[x3]           // {z[o3],z[o3+1]}
        ld1             {v4.2s},  [x4], #8      // {wre[0],wre[1]}
        trn2            v25.2d, v20.2d, v22.2d
        sub             x5,  x5,  #4            // wim--
        trn1            v24.2d, v20.2d, v22.2d
        ld1             {v5.s}[0],  [x5], x7    // d5[0] = wim[-1]
        rev64           v7.4s,  v25.4s
        fmul            v25.4s, v25.4s, v4.s[1]
        ld1             {v16.4s}, [x0]          // {z[0],z[1]}
        fmul            v7.4s,  v7.4s,  v29.4s
        ld1             {v17.4s}, [x1]          // {z[o1],z[o1+1]}
        prfm            pldl1keep, [x2, #16]
        prfm            pldl1keep, [x3, #16]
        fmla            v25.4s, v7.4s,  v5.s[0] // {t1a,t2a,t5a,t6a}
        prfm            pldl1keep, [x0, #16]
        prfm            pldl1keep, [x1, #16]

        zip1            v20.4s, v24.4s, v25.4s
        zip2            v21.4s, v24.4s, v25.4s
        fneg            v22.4s, v20.4s
        fadd            v4.4s,  v21.4s, v20.4s
        fsub            v6.4s,  v20.4s, v21.4s  // just the second half
        fadd            v5.4s,  v21.4s, v22.4s  // just the first half

        tbl             v4.16b, {v4.16b},        v30.16b // trans4_float
        tbl             v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float

        fadd            v20.4s, v16.4s, v4.4s
        fsub            v22.4s, v16.4s, v4.4s
        fadd            v21.4s, v17.4s, v5.4s
        st1             {v20.4s}, [x0], #16     // {z[0], z[1]}
        fsub            v23.4s, v17.4s, v5.4s

        st1             {v21.4s}, [x1], #16     // {z[o1],z[o1+1]}
        st1             {v22.4s}, [x2], #16     // {z[o2],z[o2+1]}
        st1             {v23.4s}, [x3], #16     // {z[o3],z[o3+1]}
1:
        ld1             {v20.4s},[x2]           // {z[o2],z[o2+1]}
        ld1             {v22.4s},[x3]           // {z[o3],z[o3+1]}
        ld1             {v4.2s},  [x4], #8      // {wre[0],wre[1]}
        transpose       v26.2d, v27.2d, v20.2d, v22.2d
        ld1             {v5.2s},  [x5], x7      // {wim[-1],wim[0]}
        rev64           v6.4s,  v26.4s
        fmul            v26.4s, v26.4s, v4.s[0]
        rev64           v7.4s,  v27.4s
        fmul            v27.4s, v27.4s, v4.s[1]
        fmul            v6.4s,  v6.4s,  v29.4s
        fmul            v7.4s,  v7.4s,  v29.4s
        ld1             {v16.4s},[x0]           // {z[0],z[1]}
        fmla            v26.4s, v6.4s,  v5.s[1] // {t1,t2,t5,t6}
        fmla            v27.4s, v7.4s,  v5.s[0] // {t1a,t2a,t5a,t6a}
        ld1             {v17.4s},[x1]           // {z[o1],z[o1+1]}

        subs            x6,  x6,  #1            // n--

        zip1            v20.4s, v26.4s, v27.4s
        zip2            v21.4s, v26.4s, v27.4s
        fneg            v22.4s, v20.4s
        fadd            v4.4s,  v21.4s, v20.4s
        fsub            v6.4s,  v20.4s, v21.4s  // just the second half
        fadd            v5.4s,  v21.4s, v22.4s  // just the first half

        tbl             v4.16b, {v4.16b},        v30.16b // trans4_float
        tbl             v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float

        fadd            v20.4s, v16.4s, v4.4s
        fsub            v22.4s, v16.4s, v4.4s
        fadd            v21.4s, v17.4s, v5.4s
        st1             {v20.4s}, [x0], #16     // {z[0], z[1]}
        fsub            v23.4s, v17.4s, v5.4s

        st1             {v21.4s}, [x1], #16     // {z[o1],z[o1+1]}
        st1             {v22.4s}, [x2], #16     // {z[o2],z[o2+1]}
        st1             {v23.4s}, [x3], #16     // {z[o3],z[o3+1]}
        b.ne            1b                      // flags still from the subs above

        ret
endfunc

/*
 * def_fft n, n2, n4 emits fft<n>_neon, which transforms the n complex floats
 * at [x0] in place by running
 *   fft<n2>_neon on z,
 *   fft<n4>_neon on z + n4*2*8 bytes,
 *   fft<n4>_neon on z + n4*3*8 bytes,
 * and then tail-branching to fft_pass_neon with x0 = z, x4 = ff_cos_<n> and
 * x2 = n4/2. Every instantiation below uses n2 = n/2 and n4 = n/4.
 *
 * x28 (callee-saved) holds z + n4*2*8 across the three calls. It is saved
 * together with the return address x30 in a 16-byte frame that is released
 * before the tail branch, so fft_pass_neon returns directly to our caller.
 */
.macro  def_fft n, n2, n4
function fft\n\()_neon, align=6
        stp             x28, x30, [sp, #-16]!   // allocate frame and save in one step
        add             x28, x0,  #\n4*2*8
        bl              fft\n2\()_neon
        mov             x0,  x28
        bl              fft\n4\()_neon
        add             x0,  x28, #\n4*1*8
        bl              fft\n4\()_neon
        sub             x0,  x28, #\n4*2*8      // back to the start of z
        ldp             x28, x30, [sp], #16
        movrel          x4,  X(ff_cos_\n)
        mov             x2,  #\n4>>1
        b               fft_pass_neon
endfunc
.endm

        def_fft    32,    16,     8
        def_fft    64,    32,    16
        def_fft   128,    64,    32
        def_fft   256,   128,    64
        def_fft   512,   256,   128
        def_fft  1024,   512,   256
        def_fft  2048,  1024,   512
        def_fft  4096,  2048,  1024
        def_fft  8192,  4096,  2048
        def_fft 16384,  8192,  4096
        def_fft 32768, 16384,  8192
        def_fft 65536, 32768, 16384

/*
 * ff_fft_calc_neon (exported): load the shared constants, then tail-branch
 * to the transform selected by the 32-bit value at [x0].
 *
 * In:    x0 = context; only its first 32-bit word is read here (the same
 *             field is labelled nbits in ff_fft_permute_neon)
 *        x1 = z, handed to the selected routine in x0
 * The table index is that value minus 2 and is not range-checked;
 * fft_tab_neon has 15 entries, so values 2..16 are valid.
 * Sets up v28-v31, x7 and x14 as described at the top of the file.
 */
function ff_fft_calc_neon, export=1
        prfm            pldl1keep, [x1]
        movrel          x10, trans4_float
        ldr             w2,  [x0]               // writing w2 zero-extends into x2
        movrel          x11, trans8_float
        sub             w2,  w2,  #2            // index into fft_tab_neon
        movrel          x3,  fft_tab_neon
        ld1             {v30.16b}, [x10]
        mov             x7,  #-8
        movrel          x12, pmmp
        ldr             x3,  [x3, x2, lsl #3]   // 8-byte table entries
        movrel          x13, mppm
        movrel          x14, X(ff_cos_16)
        ld1             {v31.16b}, [x11]
        mov             x0,  x1
        ld1             {v29.4s},  [x12]         // pmmp
        ld1             {v28.4s},  [x13]         // mppm
        br              x3
endfunc

/*
 * ff_fft_permute_neon (exported): reorder z through tmp_buf.
 *
 * In:    x0 = context: nbits at offset 0, revtab at offset 8, tmp_buf at
 *             offset 16
 *        x1 = z, holding n = 1 << nbits elements of 8 bytes each
 * Clobb: x0-x6, flags, v0, v1
 *
 * Pass 1 scatters z[i] to tmp_buf[revtab[i]], two elements per iteration;
 * each 32-bit revtab load supplies two 16-bit indices.
 * Pass 2 copies tmp_buf back over z, 32 bytes (four elements) per iteration,
 * so it moves exactly n elements only when n is a multiple of 4.
 */
function ff_fft_permute_neon, export=1
        mov             x6,  #1
        ldr             w2,  [x0]       // nbits
        ldr             x3,  [x0, #16]  // tmp_buf
        ldr             x0,  [x0, #8]   // revtab
        lsl             x6,  x6, x2     // n = 1 << nbits
        mov             x2,  x6         // second copy of n for pass 2
1:
        ld1             {v0.2s,v1.2s}, [x1], #16
        ldr             w4,  [x0], #4
        uxth            w5,  w4                 // low 16 bits: index for v0
        lsr             w4,  w4,  #16           // high 16 bits: index for v1
        add             x5,  x3,  x5,  lsl #3
        add             x4,  x3,  x4,  lsl #3
        st1             {v0.2s}, [x5]
        st1             {v1.2s}, [x4]
        subs            x6,  x6, #2
        b.gt            1b

        sub             x1,  x1,  x2,  lsl #3   // rewind x1 to the start of z
1:
        ld1             {v0.4s,v1.4s}, [x3], #32
        st1             {v0.4s,v1.4s}, [x1], #32
        subs            x2,  x2,  #4
        b.gt            1b

        ret
endfunc

/* Dispatch table for ff_fft_calc_neon; entry i handles nbits = i + 2. */
const   fft_tab_neon, relocate=1
        .quad fft4_neon
        .quad fft8_neon
        .quad fft16_neon
        .quad fft32_neon
        .quad fft64_neon
        .quad fft128_neon
        .quad fft256_neon
        .quad fft512_neon
        .quad fft1024_neon
        .quad fft2048_neon
        .quad fft4096_neon
        .quad fft8192_neon
        .quad fft16384_neon
        .quad fft32768_neon
        .quad fft65536_neon
endconst

/* Sign pattern kept in v29. */
const   pmmp, align=4
        .float          +1.0, -1.0, -1.0, +1.0
endconst

/* Signed M_SQRT1_2 factors kept in v28. */
const   mppm, align=4
        .float          -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
endconst