/*
 * FFT transform with Altivec optimizations
 * Copyright (c) 2009 Loren Merritt
 *
 * This algorithm (though not any of the implementation details) is
 * based on libdjbfft by D. J. Bernstein.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/*
 * These functions are not individually interchangeable with the C versions.
 * While C takes arrays of FFTComplex, Altivec leaves intermediate results
 * in blocks as convenient to the vector size.
 * i.e. {4x real, 4x imaginary, 4x real, ...}
 *
 * I ignore standard calling convention.
 * Instead, the following registers are treated as global constants:
 * v14: zero
 * v15..v18: cosines
 * v19..v29: permutations
 * r9: 16
 * r12: ff_cos_tabs
 * and the rest are free for local use.
 */

#include "config.h"

#if HAVE_GNU_AS && HAVE_ALTIVEC && HAVE_BIGENDIAN

#include "asm.S"

.text

.macro addi2 ra, imm // add 32-bit immediate
.if \imm & 0xffff
    addi \ra, \ra, \imm@l
.endif
.if (\imm+0x8000)>>16
    addis \ra, \ra, \imm@ha
.endif
.endm

.macro FFT4 a0, a1, a2, a3 // in:0-1 out:2-3
    vperm   \a2,\a0,\a1,v20 // vcprm(0,1,s2,s1) // {r0,i0,r3,i2}
    vperm   \a3,\a0,\a1,v21 // vcprm(2,3,s0,s3) // {r1,i1,r2,i3}
    vaddfp  \a0,\a2,\a3                         // {t1,t2,t6,t5}
    vsubfp  \a1,\a2,\a3                         // {t3,t4,t8,t7}
    vmrghw  \a2,\a0,\a1     // vcprm(0,s0,1,s1) // {t1,t3,t2,t4}
    vperm   \a3,\a0,\a1,v22 // vcprm(2,s3,3,s2) // {t6,t7,t5,t8}
    vaddfp  \a0,\a2,\a3                         // {r0,r1,i0,i1}
    vsubfp  \a1,\a2,\a3                         // {r2,r3,i2,i3}
    vperm   \a2,\a0,\a1,v23 // vcprm(0,1,s0,s1) // {r0,r1,r2,r3}
    vperm   \a3,\a0,\a1,v24 // vcprm(2,3,s2,s3) // {i0,i1,i2,i3}
.endm

.macro FFT4x2 a0, a1, b0, b1, a2, a3, b2, b3
    vperm   \a2,\a0,\a1,v20 // vcprm(0,1,s2,s1) // {r0,i0,r3,i2}
    vperm   \a3,\a0,\a1,v21 // vcprm(2,3,s0,s3) // {r1,i1,r2,i3}
    vperm   \b2,\b0,\b1,v20
    vperm   \b3,\b0,\b1,v21
    vaddfp  \a0,\a2,\a3                         // {t1,t2,t6,t5}
    vsubfp  \a1,\a2,\a3                         // {t3,t4,t8,t7}
    vaddfp  \b0,\b2,\b3
    vsubfp  \b1,\b2,\b3
    vmrghw  \a2,\a0,\a1     // vcprm(0,s0,1,s1) // {t1,t3,t2,t4}
    vperm   \a3,\a0,\a1,v22 // vcprm(2,s3,3,s2) // {t6,t7,t5,t8}
    vmrghw  \b2,\b0,\b1
    vperm   \b3,\b0,\b1,v22
    vaddfp  \a0,\a2,\a3                         // {r0,r1,i0,i1}
    vsubfp  \a1,\a2,\a3                         // {r2,r3,i2,i3}
    vaddfp  \b0,\b2,\b3
    vsubfp  \b1,\b2,\b3
    vperm   \a2,\a0,\a1,v23 // vcprm(0,1,s0,s1) // {r0,r1,r2,r3}
    vperm   \a3,\a0,\a1,v24 // vcprm(2,3,s2,s3) // {i0,i1,i2,i3}
    vperm   \b2,\b0,\b1,v23
    vperm   \b3,\b0,\b1,v24
.endm

.macro FFT8 a0, a1, b0, b1, a2, a3, b2, b3, b4 // in,out:a0-b1
    vmrghw  \b2,\b0,\b1     // vcprm(0,s0,1,s1) // {r4,r6,i4,i6}
    vmrglw  \b3,\b0,\b1     // vcprm(2,s2,3,s3) // {r5,r7,i5,i7}
    vperm   \a2,\a0,\a1,v20 // FFT4 ...
    vperm   \a3,\a0,\a1,v21
    vaddfp  \b0,\b2,\b3                         // {t1,t3,t2,t4}
    vsubfp  \b1,\b2,\b3                         // {r5,r7,i5,i7}
    vperm   \b4,\b1,\b1,v25 // vcprm(2,3,0,1)   // {i5,i7,r5,r7}
    vaddfp  \a0,\a2,\a3
    vsubfp  \a1,\a2,\a3
    vmaddfp \b1,\b1,v17,v14 // * {-1,1,1,-1}/sqrt(2)
    vmaddfp \b1,\b4,v18,\b1 // * { 1,1,1,1 }/sqrt(2) // {t8,ta,t7,t9}
    vmrghw  \a2,\a0,\a1
    vperm   \a3,\a0,\a1,v22
    vperm   \b2,\b0,\b1,v26 // vcprm(1,2,s3,s0) // {t3,t2,t9,t8}
    vperm   \b3,\b0,\b1,v27 // vcprm(0,3,s2,s1) // {t1,t4,t7,ta}
    vaddfp  \a0,\a2,\a3
    vsubfp  \a1,\a2,\a3
    vaddfp  \b0,\b2,\b3                         // {t1,t2,t9,ta}
    vsubfp  \b1,\b2,\b3                         // {t6,t5,tc,tb}
    vperm   \a2,\a0,\a1,v23
    vperm   \a3,\a0,\a1,v24
    vperm   \b2,\b0,\b1,v28 // vcprm(0,2,s1,s3) // {t1,t9,t5,tb}
    vperm   \b3,\b0,\b1,v29 // vcprm(1,3,s0,s2) // {t2,ta,t6,tc}
    vsubfp  \b0,\a2,\b2                         // {r4,r5,r6,r7}
    vsubfp  \b1,\a3,\b3                         // {i4,i5,i6,i7}
    vaddfp  \a0,\a2,\b2                         // {r0,r1,r2,r3}
    vaddfp  \a1,\a3,\b3                         // {i0,i1,i2,i3}
.endm

.macro BF d0,d1,s0,s1
    vsubfp  \d1,\s0,\s1
    vaddfp  \d0,\s0,\s1
.endm

.macro zip d0,d1,s0,s1
    vmrghw  \d0,\s0,\s1
    vmrglw  \d1,\s0,\s1
.endm

.macro def_fft4 interleave
fft4\interleave\()_altivec:
    lvx    v0, 0,r3
    lvx    v1,r9,r3
    FFT4   v0,v1,v2,v3
.ifnb \interleave
    zip    v0,v1,v2,v3
    stvx   v0, 0,r3
    stvx   v1,r9,r3
.else
    stvx   v2, 0,r3
    stvx   v3,r9,r3
.endif
    blr
.endm

.macro def_fft8 interleave
fft8\interleave\()_altivec:
    addi   r4,r3,32
    lvx    v0, 0,r3
    lvx    v1,r9,r3
    lvx    v2, 0,r4
    lvx    v3,r9,r4
    FFT8   v0,v1,v2,v3,v4,v5,v6,v7,v8
.ifnb \interleave
    zip    v4,v5,v0,v1
    zip    v6,v7,v2,v3
    stvx   v4, 0,r3
    stvx   v5,r9,r3
    stvx   v6, 0,r4
    stvx   v7,r9,r4
.else
    stvx   v0, 0,r3
    stvx   v1,r9,r3
    stvx   v2, 0,r4
    stvx   v3,r9,r4
.endif
    blr
.endm

.macro def_fft16 interleave
fft16\interleave\()_altivec:
    addi   r5,r3,64
    addi   r6,r3,96
    addi   r4,r3,32
    lvx    v0, 0,r5
    lvx    v1,r9,r5
    lvx    v2, 0,r6
    lvx    v3,r9,r6
    FFT4x2 v0,v1,v2,v3,v4,v5,v6,v7
    lvx    v0, 0,r3
    lvx    v1,r9,r3
    lvx    v2, 0,r4
    lvx    v3,r9,r4
    FFT8   v0,v1,v2,v3,v8,v9,v10,v11,v12
    vmaddfp   v8,v4,v15,v14 // r2*wre
    vmaddfp   v9,v5,v15,v14 // i2*wre
    vmaddfp  v10,v6,v15,v14 // r3*wre
    vmaddfp  v11,v7,v15,v14 // i3*wre
    vmaddfp   v8,v5,v16,v8  // i2*wim
    vnmsubfp  v9,v4,v16,v9  // r2*wim
    vnmsubfp v10,v7,v16,v10 // i3*wim
    vmaddfp  v11,v6,v16,v11 // r3*wim
    BF     v10,v12,v10,v8
    BF     v11,v13,v9,v11
    BF     v0,v4,v0,v10
    BF     v3,v7,v3,v12
    BF     v1,v5,v1,v11
    BF     v2,v6,v2,v13
.ifnb \interleave
    zip     v8, v9,v0,v1
    zip    v10,v11,v2,v3
    zip    v12,v13,v4,v5
    zip    v14,v15,v6,v7
    stvx    v8, 0,r3
    stvx    v9,r9,r3
    stvx   v10, 0,r4
    stvx   v11,r9,r4
    stvx   v12, 0,r5
    stvx   v13,r9,r5
    stvx   v14, 0,r6
    stvx   v15,r9,r6
.else
    stvx   v0, 0,r3
    stvx   v4, 0,r5
    stvx   v3,r9,r4
    stvx   v7,r9,r6
    stvx   v1,r9,r3
    stvx   v5,r9,r5
    stvx   v2, 0,r4
    stvx   v6, 0,r6
.endif
    blr
.endm

// void pass(float *z, float *wre, int n)
.macro PASS interleave, suffix
fft_pass\suffix\()_altivec:
    mtctr  r5
    slwi   r0,r5,4
    slwi   r7,r5,6   // o2
    slwi   r5,r5,5   // o1
    add   r10,r5,r7  // o3
    add    r0,r4,r0  // wim
    addi   r6,r5,16  // o1+16
    addi   r8,r7,16  // o2+16
    addi  r11,r10,16 // o3+16
1:
    lvx    v8, 0,r4  // wre
    lvx   v10, 0,r0  // wim
    sub    r0,r0,r9
    lvx    v9, 0,r0
    vperm  v9,v9,v10,v19   // vcprm(s0,3,2,1) => wim[0 .. -3]
    lvx    v4,r3,r7        // r2 = z[o2]
    lvx    v5,r3,r8        // i2 = z[o2+16]
    lvx    v6,r3,r10       // r3 = z[o3]
    lvx    v7,r3,r11       // i3 = z[o3+16]
    vmaddfp  v10,v4,v8,v14 // r2*wre
    vmaddfp  v11,v5,v8,v14 // i2*wre
    vmaddfp  v12,v6,v8,v14 // r3*wre
    vmaddfp  v13,v7,v8,v14 // i3*wre
    lvx    v0, 0,r3        // r0 = z[0]
    lvx    v3,r3,r6        // i1 = z[o1+16]
    vmaddfp  v10,v5,v9,v10 // i2*wim
    vnmsubfp v11,v4,v9,v11 // r2*wim
    vnmsubfp v12,v7,v9,v12 // i3*wim
    vmaddfp  v13,v6,v9,v13 // r3*wim
    lvx    v1,r3,r9        // i0 = z[16]
    lvx    v2,r3,r5        // r1 = z[o1]
    BF     v12,v8,v12,v10
    BF     v13,v9,v11,v13
    BF     v0,v4,v0,v12
    BF     v3,v7,v3,v8
.if !\interleave
    stvx   v0, 0,r3
    stvx   v4,r3,r7
    stvx   v3,r3,r6
    stvx   v7,r3,r11
.endif
    BF     v1,v5,v1,v13
    BF     v2,v6,v2,v9
.if !\interleave
    stvx   v1,r3,r9
    stvx   v2,r3,r5
    stvx   v5,r3,r8
    stvx   v6,r3,r10
.else
    vmrghw v8,v0,v1
    vmrglw v9,v0,v1
    stvx   v8, 0,r3
    stvx   v9,r3,r9
    vmrghw v8,v2,v3
    vmrglw v9,v2,v3
    stvx   v8,r3,r5
    stvx   v9,r3,r6
    vmrghw v8,v4,v5
    vmrglw v9,v4,v5
    stvx   v8,r3,r7
    stvx   v9,r3,r8
    vmrghw v8,v6,v7
    vmrglw v9,v6,v7
    stvx   v8,r3,r10
    stvx   v9,r3,r11
.endif
    addi   r3,r3,32
    addi   r4,r4,16
    bdnz   1b
    sub    r3,r3,r5
    blr
.endm

#define M_SQRT1_2      0.70710678118654752440  /* 1/sqrt(2) */

#define WORD_0  0x00,0x01,0x02,0x03
#define WORD_1  0x04,0x05,0x06,0x07
#define WORD_2  0x08,0x09,0x0a,0x0b
#define WORD_3  0x0c,0x0d,0x0e,0x0f
#define WORD_s0 0x10,0x11,0x12,0x13
#define WORD_s1 0x14,0x15,0x16,0x17
#define WORD_s2 0x18,0x19,0x1a,0x1b
#define WORD_s3 0x1c,0x1d,0x1e,0x1f

#define vcprm(a, b, c, d) .byte WORD_##a, WORD_##b, WORD_##c, WORD_##d

    .rodata
    .align 4
fft_data:
    .float  0, 0, 0, 0
    .float  1, 0.92387953, M_SQRT1_2, 0.38268343
    .float  0, 0.38268343, M_SQRT1_2, 0.92387953
    .float  -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2,-M_SQRT1_2
    .float   M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2
    vcprm(s0,3,2,1)
    vcprm(0,1,s2,s1)
    vcprm(2,3,s0,s3)
    vcprm(2,s3,3,s2)
    vcprm(0,1,s0,s1)
    vcprm(2,3,s2,s3)
    vcprm(2,3,0,1)
    vcprm(1,2,s3,s0)
    vcprm(0,3,s2,s1)
    vcprm(0,2,s1,s3)
    vcprm(1,3,s0,s2)

.macro lvm  b, r, regs:vararg
    lvx     \r, 0, \b
    addi    \b, \b, 16
  .ifnb \regs
    lvm     \b, \regs
  .endif
.endm

.macro stvm b, r, regs:vararg
    stvx    \r, 0, \b
    addi    \b, \b, 16
  .ifnb \regs
    stvm    \b, \regs
  .endif
.endm

.macro fft_calc interleave
extfunc ff_fft_calc\interleave\()_altivec
    mflr    r0
    stp     r0, 2*PS(R(1))
    stpu    r1, -(160+16*PS)(R(1))
    get_got r11
    addi    r6, r1, 16*PS
    stvm    r6, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
    mfvrsave r0
    stw     r0, 15*PS(R(1))
#if __APPLE__
    li      r6, 0xfffffffc
#else
    li      r6, -4
#endif
    mtvrsave r6

    movrel  r6, fft_data, r11
    lvm     r6, v14, v15, v16, v17, v18, v19, v20, v21
    lvm     r6, v22, v23, v24, v25, v26, v27, v28, v29

    li      r9, 16
    movrel  r12, X(ff_cos_tabs), r11

    movrel  r6, fft_dispatch_tab\interleave\()_altivec, r11
    lwz     r3, 0(R(3))
    subi    r3, r3, 2
    slwi    r3, r3, 2+ARCH_PPC64
    lpx     r3, r3, r6
    mtctr   r3
    mr      r3, r4
    bctrl

    addi    r6, r1, 16*PS
    lvm     r6, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
    lwz     r6, 15*PS(R(1))
    mtvrsave r6
    lp      r1, 0(R(1))
    lp      r0, 2*PS(R(1))
    mtlr    r0
    blr
.endm

.macro DECL_FFT suffix, bits, n, n2, n4
fft\n\suffix\()_altivec:
    mflr  r0
    stp   r0,PS*(\bits-3)(R(1))
    bl    fft\n2\()_altivec
    addi2 r3,\n*4
    bl    fft\n4\()_altivec
    addi2 r3,\n*2
    bl    fft\n4\()_altivec
    addi2 r3,\n*-6
    lp    r0,PS*(\bits-3)(R(1))
    lp    r4,\bits*PS(R(12))
    mtlr  r0
    li    r5,\n/16
    b     fft_pass\suffix\()_altivec
.endm

.macro DECL_FFTS interleave, suffix
    .text
    def_fft4  \suffix
    def_fft8  \suffix
    def_fft16 \suffix
    PASS \interleave, \suffix
    DECL_FFT \suffix, 5,   32,   16,    8
    DECL_FFT \suffix, 6,   64,   32,   16
    DECL_FFT \suffix, 7,  128,   64,   32
    DECL_FFT \suffix, 8,  256,  128,   64
    DECL_FFT \suffix, 9,  512,  256,  128
    DECL_FFT \suffix,10, 1024,  512,  256
    DECL_FFT \suffix,11, 2048, 1024,  512
    DECL_FFT \suffix,12, 4096, 2048, 1024
    DECL_FFT \suffix,13, 8192, 4096, 2048
    DECL_FFT \suffix,14,16384, 8192, 4096
    DECL_FFT \suffix,15,32768,16384, 8192
    DECL_FFT \suffix,16,65536,32768,16384

    fft_calc \suffix

    .rodata
    .align 3
fft_dispatch_tab\suffix\()_altivec:
    PTR fft4\suffix\()_altivec
    PTR fft8\suffix\()_altivec
    PTR fft16\suffix\()_altivec
    PTR fft32\suffix\()_altivec
    PTR fft64\suffix\()_altivec
    PTR fft128\suffix\()_altivec
    PTR fft256\suffix\()_altivec
    PTR fft512\suffix\()_altivec
    PTR fft1024\suffix\()_altivec
    PTR fft2048\suffix\()_altivec
    PTR fft4096\suffix\()_altivec
    PTR fft8192\suffix\()_altivec
    PTR fft16384\suffix\()_altivec
    PTR fft32768\suffix\()_altivec
    PTR fft65536\suffix\()_altivec
.endm

DECL_FFTS 0
DECL_FFTS 1, _interleave

#endif /* HAVE_GNU_AS && HAVE_ALTIVEC && HAVE_BIGENDIAN */
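
/*
 * Layout illustration: a minimal C sketch of the "blocks convenient to the
 * vector size" storage described in the comment at the top of this file.
 * Each group of four complex values is held as one vector of four reals
 * followed by one vector of four imaginaries, whereas the C interface uses
 * an interleaved FFTComplex array. The struct layout (re before im) matches
 * FFmpeg's FFTComplex; the helper name is hypothetical and this code is a
 * comment only, it is not assembled or built.
 *
 *     typedef struct { float re, im; } FFTComplex;
 *
 *     // interleaved z[0..3]  ->  { re0,re1,re2,re3 } { im0,im1,im2,im3 }
 *     static void block_from_interleaved(float blk[8], const FFTComplex z[4])
 *     {
 *         for (int i = 0; i < 4; i++) {
 *             blk[i]     = z[i].re;   // first vector: four reals
 *             blk[4 + i] = z[i].im;   // second vector: four imaginaries
 *         }
 *     }
 */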