/*
 * Copyright (c) 2013 RISC OS Open Ltd
 * Author: Ben Avison <bavison@riscosopen.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"

@ The fftx_internal_vfp versions of the functions obey a modified AAPCS:
@ VFP is in RunFast mode, vector length 4, stride 1 throughout, and
@ all single-precision VFP registers may be corrupted on exit. The a2
@ register may not be clobbered in these functions, as it holds the
@ stored original FPSCR.
29 30function ff_fft_calc_vfp, export=1 31 ldr ip, [a1, #0] @ nbits 32 mov a1, a2 33 movrel a2, (fft_tab_vfp - 8) 34 ldr pc, [a2, ip, lsl #2] 35endfunc 36const fft_tab_vfp, relocate=1 37 .word fft4_vfp 38 .word fft8_vfp 39 .word X(ff_fft16_vfp) @ this one alone is exported 40 .word fft32_vfp 41 .word fft64_vfp 42 .word fft128_vfp 43 .word fft256_vfp 44 .word fft512_vfp 45 .word fft1024_vfp 46 .word fft2048_vfp 47 .word fft4096_vfp 48 .word fft8192_vfp 49 .word fft16384_vfp 50 .word fft32768_vfp 51 .word fft65536_vfp 52endconst 53 54function fft4_vfp 55 vldr d0, [a1, #0*2*4] @ s0,s1 = z[0] 56 vldr d4, [a1, #1*2*4] @ s8,s9 = z[1] 57 vldr d1, [a1, #2*2*4] @ s2,s3 = z[2] 58 vldr d5, [a1, #3*2*4] @ s10,s11 = z[3] 59 @ stall 60 vadd.f s12, s0, s8 @ i0 61 vadd.f s13, s1, s9 @ i1 62 vadd.f s14, s2, s10 @ i2 63 vadd.f s15, s3, s11 @ i3 64 vsub.f s8, s0, s8 @ i4 65 vsub.f s9, s1, s9 @ i5 66 vsub.f s10, s2, s10 @ i6 67 vsub.f s11, s3, s11 @ i7 68 @ stall 69 @ stall 70 vadd.f s0, s12, s14 @ z[0].re 71 vsub.f s4, s12, s14 @ z[2].re 72 vadd.f s1, s13, s15 @ z[0].im 73 vsub.f s5, s13, s15 @ z[2].im 74 vadd.f s7, s9, s10 @ z[3].im 75 vsub.f s3, s9, s10 @ z[1].im 76 vadd.f s2, s8, s11 @ z[1].re 77 vsub.f s6, s8, s11 @ z[3].re 78 @ stall 79 @ stall 80 vstr d0, [a1, #0*2*4] 81 vstr d2, [a1, #2*2*4] 82 @ stall 83 @ stall 84 vstr d1, [a1, #1*2*4] 85 vstr d3, [a1, #3*2*4] 86 87 bx lr 88endfunc 89 90.macro macro_fft8_head 91 @ FFT4 92 vldr d4, [a1, #0 * 2*4] 93 vldr d6, [a1, #1 * 2*4] 94 vldr d5, [a1, #2 * 2*4] 95 vldr d7, [a1, #3 * 2*4] 96 @ BF 97 vldr d12, [a1, #4 * 2*4] 98 vadd.f s16, s8, s12 @ vector op 99 vldr d14, [a1, #5 * 2*4] 100 vldr d13, [a1, #6 * 2*4] 101 vldr d15, [a1, #7 * 2*4] 102 vsub.f s20, s8, s12 @ vector op 103 vadd.f s0, s16, s18 104 vsub.f s2, s16, s18 105 vadd.f s1, s17, s19 106 vsub.f s3, s17, s19 107 vadd.f s7, s21, s22 108 vsub.f s5, s21, s22 109 vadd.f s4, s20, s23 110 vsub.f s6, s20, s23 111 vsub.f s20, s24, s28 @ vector op 112 vstr d0, [a1, #0 * 2*4] @ 
transfer s0-s7 to s24-s31 via memory 113 vstr d1, [a1, #1 * 2*4] 114 vldr s0, cos1pi4 115 vadd.f s16, s24, s28 @ vector op 116 vstr d2, [a1, #2 * 2*4] 117 vstr d3, [a1, #3 * 2*4] 118 vldr d12, [a1, #0 * 2*4] 119 @ TRANSFORM 120 vmul.f s20, s20, s0 @ vector x scalar op 121 vldr d13, [a1, #1 * 2*4] 122 vldr d14, [a1, #2 * 2*4] 123 vldr d15, [a1, #3 * 2*4] 124 @ BUTTERFLIES 125 vadd.f s0, s18, s16 126 vadd.f s1, s17, s19 127 vsub.f s2, s17, s19 128 vsub.f s3, s18, s16 129 vadd.f s4, s21, s20 130 vsub.f s5, s21, s20 131 vadd.f s6, s22, s23 132 vsub.f s7, s22, s23 133 vadd.f s8, s0, s24 @ vector op 134 vstr d0, [a1, #0 * 2*4] @ transfer s0-s3 to s12-s15 via memory 135 vstr d1, [a1, #1 * 2*4] 136 vldr d6, [a1, #0 * 2*4] 137 vldr d7, [a1, #1 * 2*4] 138 vadd.f s1, s5, s6 139 vadd.f s0, s7, s4 140 vsub.f s2, s5, s6 141 vsub.f s3, s7, s4 142 vsub.f s12, s24, s12 @ vector op 143 vsub.f s5, s29, s1 144 vsub.f s4, s28, s0 145 vsub.f s6, s30, s2 146 vsub.f s7, s31, s3 147 vadd.f s16, s0, s28 @ vector op 148 vstr d6, [a1, #4 * 2*4] 149 vstr d7, [a1, #6 * 2*4] 150 vstr d4, [a1, #0 * 2*4] 151 vstr d5, [a1, #2 * 2*4] 152 vstr d2, [a1, #5 * 2*4] 153 vstr d3, [a1, #7 * 2*4] 154.endm 155 156.macro macro_fft8_tail 157 vstr d8, [a1, #1 * 2*4] 158 vstr d9, [a1, #3 * 2*4] 159.endm 160 161function .Lfft8_internal_vfp 162 macro_fft8_head 163 macro_fft8_tail 164 bx lr 165endfunc 166 167function fft8_vfp 168 ldr a3, =0x03030000 @ RunFast mode, vector length 4, stride 1 169 fmrx a2, FPSCR 170 fmxr FPSCR, a3 171 vpush {s16-s31} 172 mov ip, lr 173 bl .Lfft8_internal_vfp 174 vpop {s16-s31} 175 fmxr FPSCR, a2 176 bx ip 177endfunc 178 179.align 3 180cos1pi4: @ cos(1*pi/4) = sqrt(2) 181 .float 0.707106769084930419921875 182cos1pi8: @ cos(1*pi/8) = sqrt(2+sqrt(2))/2 183 .float 0.92387950420379638671875 184cos3pi8: @ cos(2*pi/8) = sqrt(2-sqrt(2))/2 185 .float 0.3826834261417388916015625 186 187function .Lfft16_internal_vfp 188 macro_fft8_head 189 @ FFT4(z+8) 190 vldr d10, [a1, #8 * 2*4] 191 vldr d12, 
[a1, #9 * 2*4] 192 vldr d11, [a1, #10 * 2*4] 193 vldr d13, [a1, #11 * 2*4] 194 macro_fft8_tail 195 vadd.f s16, s20, s24 @ vector op 196 @ FFT4(z+12) 197 vldr d4, [a1, #12 * 2*4] 198 vldr d6, [a1, #13 * 2*4] 199 vldr d5, [a1, #14 * 2*4] 200 vsub.f s20, s20, s24 @ vector op 201 vldr d7, [a1, #15 * 2*4] 202 vadd.f s0, s16, s18 203 vsub.f s4, s16, s18 204 vadd.f s1, s17, s19 205 vsub.f s5, s17, s19 206 vadd.f s7, s21, s22 207 vsub.f s3, s21, s22 208 vadd.f s2, s20, s23 209 vsub.f s6, s20, s23 210 vadd.f s16, s8, s12 @ vector op 211 vstr d0, [a1, #8 * 2*4] 212 vstr d2, [a1, #10 * 2*4] 213 vstr d1, [a1, #9 * 2*4] 214 vsub.f s20, s8, s12 215 vstr d3, [a1, #11 * 2*4] 216 @ TRANSFORM(z[2],z[6],z[10],z[14],cos1pi4,cos1pi4) 217 vldr d12, [a1, #10 * 2*4] 218 vadd.f s0, s16, s18 219 vadd.f s1, s17, s19 220 vsub.f s6, s16, s18 221 vsub.f s7, s17, s19 222 vsub.f s3, s21, s22 223 vadd.f s2, s20, s23 224 vadd.f s5, s21, s22 225 vsub.f s4, s20, s23 226 vstr d0, [a1, #12 * 2*4] 227 vmov s0, s6 228 @ TRANSFORM(z[1],z[5],z[9],z[13],cos1pi8,cos3pi8) 229 vldr d6, [a1, #9 * 2*4] 230 vstr d1, [a1, #13 * 2*4] 231 vldr d1, cos1pi4 @ s2 = cos1pi4, s3 = cos1pi8 232 vstr d2, [a1, #15 * 2*4] 233 vldr d7, [a1, #13 * 2*4] 234 vadd.f s4, s25, s24 235 vsub.f s5, s25, s24 236 vsub.f s6, s0, s7 237 vadd.f s7, s0, s7 238 vmul.f s20, s12, s3 @ vector op 239 @ TRANSFORM(z[3],z[7],z[11],z[15],cos3pi8,cos1pi8) 240 vldr d4, [a1, #11 * 2*4] 241 vldr d5, [a1, #15 * 2*4] 242 vldr s1, cos3pi8 243 vmul.f s24, s4, s2 @ vector * scalar op 244 vmul.f s28, s12, s1 @ vector * scalar op 245 vmul.f s12, s8, s1 @ vector * scalar op 246 vadd.f s4, s20, s29 247 vsub.f s5, s21, s28 248 vsub.f s6, s22, s31 249 vadd.f s7, s23, s30 250 vmul.f s8, s8, s3 @ vector * scalar op 251 vldr d8, [a1, #1 * 2*4] 252 vldr d9, [a1, #5 * 2*4] 253 vldr d10, [a1, #3 * 2*4] 254 vldr d11, [a1, #7 * 2*4] 255 vldr d14, [a1, #2 * 2*4] 256 vadd.f s0, s6, s4 257 vadd.f s1, s5, s7 258 vsub.f s2, s5, s7 259 vsub.f s3, s6, s4 260 vadd.f s4, s12, s9 
261 vsub.f s5, s13, s8 262 vsub.f s6, s14, s11 263 vadd.f s7, s15, s10 264 vadd.f s12, s0, s16 @ vector op 265 vstr d0, [a1, #1 * 2*4] 266 vstr d1, [a1, #5 * 2*4] 267 vldr d4, [a1, #1 * 2*4] 268 vldr d5, [a1, #5 * 2*4] 269 vadd.f s0, s6, s4 270 vadd.f s1, s5, s7 271 vsub.f s2, s5, s7 272 vsub.f s3, s6, s4 273 vsub.f s8, s16, s8 @ vector op 274 vstr d6, [a1, #1 * 2*4] 275 vstr d7, [a1, #5 * 2*4] 276 vldr d15, [a1, #6 * 2*4] 277 vsub.f s4, s20, s0 278 vsub.f s5, s21, s1 279 vsub.f s6, s22, s2 280 vsub.f s7, s23, s3 281 vadd.f s20, s0, s20 @ vector op 282 vstr d4, [a1, #9 * 2*4] 283 @ TRANSFORM_ZERO(z[0],z[4],z[8],z[12]) 284 vldr d6, [a1, #8 * 2*4] 285 vstr d5, [a1, #13 * 2*4] 286 vldr d7, [a1, #12 * 2*4] 287 vstr d2, [a1, #11 * 2*4] 288 vldr d8, [a1, #0 * 2*4] 289 vstr d3, [a1, #15 * 2*4] 290 vldr d9, [a1, #4 * 2*4] 291 vadd.f s0, s26, s24 292 vadd.f s1, s25, s27 293 vsub.f s2, s25, s27 294 vsub.f s3, s26, s24 295 vadd.f s4, s14, s12 296 vadd.f s5, s13, s15 297 vsub.f s6, s13, s15 298 vsub.f s7, s14, s12 299 vadd.f s8, s0, s28 @ vector op 300 vstr d0, [a1, #3 * 2*4] 301 vstr d1, [a1, #7 * 2*4] 302 vldr d6, [a1, #3 * 2*4] 303 vldr d7, [a1, #7 * 2*4] 304 vsub.f s0, s16, s4 305 vsub.f s1, s17, s5 306 vsub.f s2, s18, s6 307 vsub.f s3, s19, s7 308 vsub.f s12, s28, s12 @ vector op 309 vadd.f s16, s4, s16 @ vector op 310 vstr d10, [a1, #3 * 2*4] 311 vstr d11, [a1, #7 * 2*4] 312 vstr d4, [a1, #2 * 2*4] 313 vstr d5, [a1, #6 * 2*4] 314 vstr d0, [a1, #8 * 2*4] 315 vstr d1, [a1, #12 * 2*4] 316 vstr d6, [a1, #10 * 2*4] 317 vstr d7, [a1, #14 * 2*4] 318 vstr d8, [a1, #0 * 2*4] 319 vstr d9, [a1, #4 * 2*4] 320 321 bx lr 322endfunc 323 324function ff_fft16_vfp, export=1 325 ldr a3, =0x03030000 @ RunFast mode, vector length 4, stride 1 326 fmrx a2, FPSCR 327 fmxr FPSCR, a3 328 vpush {s16-s31} 329 mov ip, lr 330 bl .Lfft16_internal_vfp 331 vpop {s16-s31} 332 fmxr FPSCR, a2 333 bx ip 334endfunc 335 336.macro pass n, z0, z1, z2, z3 337 add v6, v5, #4*2*\n 338 @ 
TRANSFORM_ZERO(z[0],z[o1],z[o2],z[o3]) 339 @ TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1]) 340 @ TRANSFORM(z[0],z[o1],z[o2],z[o3],wre[0],wim[0]) 341 @ TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1]) 342 vldr d8, [\z2, #8*(o2+1)] @ s16,s17 343 vldmdb v6!, {s2} 344 vldr d9, [\z3, #8*(o3+1)] @ s18,s19 345 vldmia v5!, {s0,s1} @ s0 is unused 346 vldr s7, [\z2, #8*o2] @ t1 347 vmul.f s20, s16, s2 @ vector * scalar 348 vldr s0, [\z3, #8*o3] @ t5 349 vldr s6, [\z2, #8*o2+4] @ t2 350 vldr s3, [\z3, #8*o3+4] @ t6 351 vmul.f s16, s16, s1 @ vector * scalar 352 ldr a4, =\n-1 3531: add \z0, \z0, #8*2 354 .if \n*4*2 >= 512 355 add \z1, \z1, #8*2 356 .endif 357 .if \n*4*2 >= 256 358 add \z2, \z2, #8*2 359 .endif 360 .if \n*4*2 >= 512 361 add \z3, \z3, #8*2 362 .endif 363 @ up to 2 stalls (VFP vector issuing / waiting for s0) 364 @ depending upon whether this is the first iteration and 365 @ how many add instructions are inserted above 366 vadd.f s4, s0, s7 @ t5 367 vadd.f s5, s6, s3 @ t6 368 vsub.f s6, s6, s3 @ t4 369 vsub.f s7, s0, s7 @ t3 370 vldr d6, [\z0, #8*0-8*2] @ s12,s13 371 vadd.f s0, s16, s21 @ t1 372 vldr d7, [\z1, #8*o1-8*2] @ s14,s15 373 vsub.f s1, s18, s23 @ t5 374 vadd.f s8, s4, s12 @ vector + vector 375 @ stall (VFP vector issuing) 376 @ stall (VFP vector issuing) 377 @ stall (VFP vector issuing) 378 vsub.f s4, s12, s4 379 vsub.f s5, s13, s5 380 vsub.f s6, s14, s6 381 vsub.f s7, s15, s7 382 vsub.f s2, s17, s20 @ t2 383 vadd.f s3, s19, s22 @ t6 384 vstr d4, [\z0, #8*0-8*2] @ s8,s9 385 vstr d5, [\z1, #8*o1-8*2] @ s10,s11 386 @ stall (waiting for s5) 387 vstr d2, [\z2, #8*o2-8*2] @ s4,s5 388 vadd.f s4, s1, s0 @ t5 389 vstr d3, [\z3, #8*o3-8*2] @ s6,s7 390 vsub.f s7, s1, s0 @ t3 391 vadd.f s5, s2, s3 @ t6 392 vsub.f s6, s2, s3 @ t4 393 vldr d6, [\z0, #8*1-8*2] @ s12,s13 394 vldr d7, [\z1, #8*(o1+1)-8*2] @ s14,s15 395 vldr d4, [\z2, #8*o2] @ s8,s9 396 vldmdb v6!, {s2,s3} 397 vldr d5, [\z3, #8*o3] @ s10,s11 398 vadd.f s20, s4, s12 @ vector + vector 399 
vldmia v5!, {s0,s1} 400 vldr d8, [\z2, #8*(o2+1)] @ s16,s17 401 @ stall (VFP vector issuing) 402 vsub.f s4, s12, s4 403 vsub.f s5, s13, s5 404 vsub.f s6, s14, s6 405 vsub.f s7, s15, s7 406 vmul.f s12, s8, s3 @ vector * scalar 407 vstr d10, [\z0, #8*1-8*2] @ s20,s21 408 vldr d9, [\z3, #8*(o3+1)] @ s18,s19 409 vstr d11, [\z1, #8*(o1+1)-8*2] @ s22,s23 410 vmul.f s8, s8, s0 @ vector * scalar 411 vstr d2, [\z2, #8*(o2+1)-8*2] @ s4,s5 412 @ stall (waiting for s7) 413 vstr d3, [\z3, #8*(o3+1)-8*2] @ s6,s7 414 vmul.f s20, s16, s2 @ vector * scalar 415 @ stall (VFP vector issuing) 416 @ stall (VFP vector issuing) 417 @ stall (VFP vector issuing) 418 vadd.f s7, s8, s13 @ t1 419 vsub.f s6, s9, s12 @ t2 420 vsub.f s0, s10, s15 @ t5 421 vadd.f s3, s11, s14 @ t6 422 vmul.f s16, s16, s1 @ vector * scalar 423 subs a4, a4, #1 424 bne 1b 425 @ What remains is identical to the first two indentations of 426 @ the above, but without the increment of z 427 vadd.f s4, s0, s7 @ t5 428 vadd.f s5, s6, s3 @ t6 429 vsub.f s6, s6, s3 @ t4 430 vsub.f s7, s0, s7 @ t3 431 vldr d6, [\z0, #8*0] @ s12,s13 432 vadd.f s0, s16, s21 @ t1 433 vldr d7, [\z1, #8*o1] @ s14,s15 434 vsub.f s1, s18, s23 @ t5 435 vadd.f s8, s4, s12 @ vector + vector 436 vsub.f s4, s12, s4 437 vsub.f s5, s13, s5 438 vsub.f s6, s14, s6 439 vsub.f s7, s15, s7 440 vsub.f s2, s17, s20 @ t2 441 vadd.f s3, s19, s22 @ t6 442 vstr d4, [\z0, #8*0] @ s8,s9 443 vstr d5, [\z1, #8*o1] @ s10,s11 444 vstr d2, [\z2, #8*o2] @ s4,s5 445 vadd.f s4, s1, s0 @ t5 446 vstr d3, [\z3, #8*o3] @ s6,s7 447 vsub.f s7, s1, s0 @ t3 448 vadd.f s5, s2, s3 @ t6 449 vsub.f s6, s2, s3 @ t4 450 vldr d6, [\z0, #8*1] @ s12,s13 451 vldr d7, [\z1, #8*(o1+1)] @ s14,s15 452 vadd.f s20, s4, s12 @ vector + vector 453 vsub.f s4, s12, s4 454 vsub.f s5, s13, s5 455 vsub.f s6, s14, s6 456 vsub.f s7, s15, s7 457 vstr d10, [\z0, #8*1] @ s20,s21 458 vstr d11, [\z1, #8*(o1+1)] @ s22,s23 459 vstr d2, [\z2, #8*(o2+1)] @ s4,s5 460 vstr d3, [\z3, #8*(o3+1)] @ s6,s7 461.endm 462 
463.macro def_fft n, n2, n4 464function .Lfft\n\()_internal_vfp 465 .if \n >= 512 466 push {v1-v6,lr} 467 .elseif \n >= 256 468 push {v1-v2,v5-v6,lr} 469 .else 470 push {v1,v5-v6,lr} 471 .endif 472 mov v1, a1 473 bl .Lfft\n2\()_internal_vfp 474 add a1, v1, #8*(\n/4)*2 475 bl .Lfft\n4\()_internal_vfp 476 movrelx v5, X(ff_cos_\n), a1 477 add a1, v1, #8*(\n/4)*3 478 bl .Lfft\n4\()_internal_vfp 479 .if \n >= 512 480 .set o1, 0*(\n/4/2) 481 .set o2, 0*(\n/4/2) 482 .set o3, 0*(\n/4/2) 483 add v2, v1, #8*2*(\n/4/2) 484 add v3, v1, #8*4*(\n/4/2) 485 add v4, v1, #8*6*(\n/4/2) 486 pass (\n/4/2), v1, v2, v3, v4 487 pop {v1-v6,pc} 488 .elseif \n >= 256 489 .set o1, 2*(\n/4/2) 490 .set o2, 0*(\n/4/2) 491 .set o3, 2*(\n/4/2) 492 add v2, v1, #8*4*(\n/4/2) 493 pass (\n/4/2), v1, v1, v2, v2 494 pop {v1-v2,v5-v6,pc} 495 .else 496 .set o1, 2*(\n/4/2) 497 .set o2, 4*(\n/4/2) 498 .set o3, 6*(\n/4/2) 499 pass (\n/4/2), v1, v1, v1, v1 500 pop {v1,v5-v6,pc} 501 .endif 502endfunc 503 504function fft\n\()_vfp 505 ldr a3, =0x03030000 /* RunFast mode, vector length 4, stride 1 */ 506 fmrx a2, FPSCR 507 fmxr FPSCR, a3 508 vpush {s16-s31} 509 mov ip, lr 510 bl .Lfft\n\()_internal_vfp 511 vpop {s16-s31} 512 fmxr FPSCR, a2 513 bx ip 514endfunc 515 516.ltorg 517.endm 518 519 def_fft 32, 16, 8 520 def_fft 64, 32, 16 521 def_fft 128, 64, 32 522 def_fft 256, 128, 64 523 def_fft 512, 256, 128 524 def_fft 1024, 512, 256 525 def_fft 2048, 1024, 512 526 def_fft 4096, 2048, 1024 527 def_fft 8192, 4096, 2048 528 def_fft 16384, 8192, 4096 529 def_fft 32768, 16384, 8192 530 def_fft 65536, 32768, 16384 531