;******************************************************************************
;* 32 point SSE-optimized DCT transform
;* Copyright (c) 2010 Vitor Sessak
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32

; Sign mask. XOR-ing a vector with it flips the sign bit of lanes 2 and 3 of
; every 128-bit half and leaves lanes 0 and 1 untouched (+1, +1, -1, -1).
ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000, 0, 0, 0x80000000, 0x80000000

; Multiplier table. The code below addresses it by byte offset; one row of
; four floats is 16 bytes:
;   +0   .. +63    pass 1
;   +64  .. +95    pass 2
;   +96  .. +127   pass 3
;   +128 .. +159   pass 4
;   +160 .. +191   pass 5
;   +192           four copies of 0.707107, used by BUTTERFLY3V
; The rows for passes 4 and 5 are stored twice in a row so that a 32-byte
; (YMM) load sees the same four constants in both 128-bit halves.
ps_cos_vec: dd   0.500603,  0.505471,  0.515447,  0.531043
            dd   0.553104,  0.582935,  0.622504,  0.674808
            dd -10.190008, -3.407609, -2.057781, -1.484165
            dd  -1.169440, -0.972568, -0.839350, -0.744536
            dd   0.502419,  0.522499,  0.566944,  0.646822
            dd   0.788155,  1.060678,  1.722447,  5.101149
            dd   0.509796,  0.601345,  0.899976,  2.562916
            dd   0.509796,  0.601345,  0.899976,  2.562916
            dd   1.000000,  1.000000,  1.306563,  0.541196
            dd   1.000000,  1.000000,  1.306563,  0.541196
            dd   1.000000,  0.707107,  1.000000, -0.707107
            dd   1.000000,  0.707107,  1.000000, -0.707107
            dd   0.707107,  0.707107,  0.707107,  0.707107

; BUTTERFLY a, b, coef, tmp
;   tmp = a - b
;   b   = b + a
;   a   = tmp * coef
; On exit b holds the sum and a holds the scaled difference.
%macro BUTTERFLY 4
    subps       %4, %1, %2
    addps       %2, %2, %1
    mulps       %1, %4, %3
%endmacro

; BUTTERFLY0 a, signmask, coef, tmp, shuffle_imm
;   In-register butterfly: a is combined with a lane-permuted copy of itself.
;   Result in a: (shuffle(a, imm) + (a XOR signmask)) * coef
;   The two branches compute the same value for a but leave different
;   contents in tmp (the shuffled copy vs. the unscaled sum).
%macro BUTTERFLY0 5
%if cpuflag(sse2) && notcpuflag(avx)
    ; pshufd copies and permutes in one instruction, keeping a intact
    pshufd      %4, %1, %5
    xorps       %1, %2
    addps       %1, %4
    mulps       %1, %3
%else
    shufps      %4, %1, %1, %5
    xorps       %1, %1, %2
    addps       %4, %4, %1
    mulps       %1, %4, %3
%endif
%endmacro

; BUTTERFLY2: BUTTERFLY0 with selector 0x1b (the four lanes reversed).
%macro BUTTERFLY2 4
    BUTTERFLY0  %1, %2, %3, %4, 0x1b
%endmacro

; BUTTERFLY3: BUTTERFLY0 with selector 0xb1 (adjacent lanes swapped pairwise).
%macro BUTTERFLY3 4
    BUTTERFLY0  %1, %2, %3, %4, 0xb1
%endmacro

; BUTTERFLY3V a, b, c, d, tmp   (arguments are register NUMBERS, not names)
;   Two vertical butterflies scaled by the constant row at ps_cos_vec+192:
;     m<a> = a + b        m<b> = (a - b) * 0.707107
;     m<c> = c + d        m<d> = (d - c) * 0.707107
;   The SWAP makes the name m<b> refer to the register holding a - b; the
;   register that used to be m<b> becomes the scratch m<tmp>.
%macro BUTTERFLY3V 5
    movaps      m%5, m%1
    addps       m%1, m%2
    subps       m%5, m%2
    SWAP        %2, %5
    mulps       m%2, [ps_cos_vec+192]
    movaps      m%5, m%3
    addps       m%3, m%4
    subps       m%4, m%5
    mulps       m%4, [ps_cos_vec+192]
%endmacro

; PASS6_AND_PERMUTE
;   Scalar tail of the transform: a chain of addss/movss operations that
;   reads partial results from the out buffer and from the low lanes of
;   m0, m1, m4, m5 and m6 (these are read before they are written here),
;   and writes the final values back into out[] in place. tmpd is used to
;   move single dwords between output slots.
;   Loads and stores hit the same buffer, so the statement order below is
;   significant and must not be changed.
%macro PASS6_AND_PERMUTE 0
    mov         tmpd,        [outq+4]
    movss       m7,          [outq+72]
    addss       m7,          [outq+76]
    movss       m3,          [outq+56]
    addss       m3,          [outq+60]
    addss       m4,          m3
    movss       m2,          [outq+52]
    addss       m2,          m3
    movss       m3,          [outq+104]
    addss       m3,          [outq+108]
    addss       m1,          m3
    addss       m5,          m4
    movss       [outq+ 16],  m1
    movss       m1,          [outq+100]
    addss       m1,          m3
    movss       m3,          [outq+40]
    movss       [outq+ 48],  m1
    addss       m3,          [outq+44]
    movss       m1,          [outq+100]
    addss       m4,          m3
    addss       m3,          m2
    addss       m1,          [outq+108]
    movss       [outq+ 40],  m3
    addss       m2,          [outq+36]
    movss       m3,          [outq+8]
    movss       [outq+ 56],  m2
    addss       m3,          [outq+12]
    movss       [outq+ 32],  m3
    movss       m3,          [outq+80]
    movss       [outq+ 8],   m5
    movss       [outq+ 80],  m1
    movss       m2,          [outq+52]
    movss       m5,          [outq+120]
    addss       m5,          [outq+124]
    movss       m1,          [outq+64]
    addss       m2,          [outq+60]
    addss       m0,          m5
    addss       m5,          [outq+116]
    mov         [outq+64],   tmpd
    addss       m6,          m0
    addss       m1,          m6
    mov         tmpd,        [outq+12]
    mov         [outq+ 96],  tmpd
    movss       [outq+ 4],   m1
    movss       m1,          [outq+24]
    movss       [outq+ 24],  m4
    movss       m4,          [outq+88]
    addss       m4,          [outq+92]
    addss       m3,          m4
    addss       m4,          [outq+84]
    mov         tmpd,        [outq+108]
    addss       m1,          [outq+28]
    addss       m0,          m1
    addss       m1,          m5
    addss       m6,          m3
    addss       m3,          m0
    addss       m0,          m7
    addss       m5,          [outq+20]
    addss       m7,          m1
    movss       [outq+ 12],  m6
    mov         [outq+112],  tmpd
    movss       m6,          [outq+28]
    movss       [outq+ 28],  m0
    movss       m0,          [outq+36]
    movss       [outq+ 36],  m7
    addss       m1,          m4
    movss       m7,          [outq+116]
    addss       m0,          m2
    addss       m7,          [outq+124]
    movss       [outq+ 72],  m0
    movss       m0,          [outq+44]
    addss       m2,          m0
    movss       [outq+ 44],  m1
    movss       [outq+ 88],  m2
    addss       m0,          [outq+60]
    mov         tmpd,        [outq+60]
    mov         [outq+120],  tmpd
    movss       [outq+104],  m0
    addss       m4,          m5
    addss       m5,          [outq+68]
    movss       [outq+52],   m4
    movss       [outq+60],   m5
    movss       m4,          [outq+68]
    movss       m5,          [outq+20]
    movss       [outq+ 20],  m3
    addss       m5,          m7
    addss       m7,          m6
    addss       m4,          m5
    movss       m2,          [outq+84]
    addss       m2,          [outq+92]
    addss       m5,          m2
    movss       [outq+ 68],  m4
    addss       m2,          m7
    movss       m4,          [outq+76]
    movss       [outq+ 84],  m2
    movss       [outq+ 76],  m5
    addss       m7,          m4
    addss       m6,          [outq+124]
    addss       m4,          m6
    addss       m6,          [outq+92]
    movss       [outq+100],  m4
    movss       [outq+108],  m6
    movss       m6,          [outq+92]
    movss       [outq+92],   m7
    addss       m6,          [outq+124]
    movss       [outq+116],  m6
%endmacro

INIT_YMM avx
SECTION .text
%if HAVE_AVX_EXTERNAL
; void ff_dct32_float_avx(FFTSample *out, const FFTSample *in)
;   Passes 1-5 run on 8-float YMM registers; pass 6 is the scalar
;   PASS6_AND_PERMUTE tail. Both pointers are accessed with aligned
;   32-byte moves (vmovaps), so they must be 32-byte aligned.
cglobal dct32_float, 2,3,8, out, in, tmp
    ; pass 1
    ; m4 = in[0..7]; m5 = in[24..31] with all eight floats in reverse order
    ; (upper half <- in+96, lower half <- in+112, then each half reversed)
    vmovaps       m4, [inq+0]
    vinsertf128   m5, m5, [inq+96], 1
    vinsertf128   m5, m5, [inq+112], 0
    vshufps       m5, m5, m5, 0x1b
    BUTTERFLY     m4, m5, [ps_cos_vec], m6

    ; m2 = in[16..23]; m6 = in[8..15] reversed the same way
    vmovaps       m2, [inq+64]
    vinsertf128   m6, m6, [inq+32], 1
    vinsertf128   m6, m6, [inq+48], 0
    vshufps       m6, m6, m6, 0x1b
    BUTTERFLY     m2, m6, [ps_cos_vec+32], m0

    ; pass 2

    BUTTERFLY     m5, m6, [ps_cos_vec+64], m0
    BUTTERFLY     m4, m2, [ps_cos_vec+64], m7


    ; pass 3
    ; regroup 128-bit halves across registers, reversing the second operand
    vperm2f128    m3, m6, m4, 0x31
    vperm2f128    m1, m6, m4, 0x20
    vshufps       m3, m3, m3, 0x1b

    BUTTERFLY     m1, m3, [ps_cos_vec+96], m6


    vperm2f128    m4, m5, m2, 0x20
    vperm2f128    m5, m5, m2, 0x31
    vshufps       m5, m5, m5, 0x1b

    BUTTERFLY     m4, m5, [ps_cos_vec+96], m6

    ; pass 4
    vmovaps       m6, [ps_p1p1m1m1+0]
    vmovaps       m2, [ps_cos_vec+128]

    BUTTERFLY2    m5, m6, m2, m7
    BUTTERFLY2    m4, m6, m2, m7
    BUTTERFLY2    m1, m6, m2, m7
    BUTTERFLY2    m3, m6, m2, m7


    ; pass 5
    ; selector 0xcc picks lanes 0,3,0,3 of the mask, giving (+,-,+,-)
    vshufps       m6, m6, m6, 0xcc
    vmovaps       m2, [ps_cos_vec+160]

    BUTTERFLY3    m5, m6, m2, m7
    BUTTERFLY3    m4, m6, m2, m7
    BUTTERFLY3    m1, m6, m2, m7
    BUTTERFLY3    m3, m6, m2, m7

    ; store the pass-5 results to out[]; the upper halves of m3 and m1 are
    ; additionally copied into the low halves of m6 and m0, which
    ; PASS6_AND_PERMUTE reads as register inputs
    vperm2f128    m6, m3, m3, 0x31
    vmovaps       [outq], m3

    vextractf128  [outq+64], m5, 1
    vextractf128  [outq+32], m5, 0

    vextractf128  [outq+80], m4, 1
    vextractf128  [outq+48], m4, 0

    vperm2f128    m0, m1, m1, 0x31
    vmovaps       [outq+96], m1

    vzeroupper

    ; pass 6, no SIMD...
    ; INIT_XMM switches the m# register macros to XMM for the scalar tail
INIT_XMM
    PASS6_AND_PERMUTE
    RET
%endif

%if ARCH_X86_64
; On x86-64 SPILL/UNSPILL are aliases of SWAP (x86inc's assemble-time
; register renaming), so "spill slots" 8-15 are simply registers m8-m15.
%define SPILL SWAP
%define UNSPILL SWAP

; PASS5 (x86-64): pass 5 done "vertically" on m8-m15.
;   The SWAP/PERMUTE lines only rename registers so that the pass-4 results
;   end up under the names m8-m15; each group of four is then transposed
;   and run through BUTTERFLY3V, followed by the first cross-register adds.
;   SWAP, PERMUTE and TRANSPOSE4x4PS are not defined in this file; they are
;   expected to come from the included x86util.asm.
%macro PASS5 0
    nop ; FIXME code alignment
    SWAP            5, 8
    SWAP            4, 12
    SWAP            6, 14
    SWAP            7, 13
    SWAP            0, 15
    PERMUTE         9,10, 10,12, 11,14, 12,9, 13,11, 14,13
    TRANSPOSE4x4PS  8, 9, 10, 11, 0
    BUTTERFLY3V     8, 9, 10, 11, 0
    addps           m10, m11
    TRANSPOSE4x4PS  12, 13, 14, 15, 0
    BUTTERFLY3V     12, 13, 14, 15, 0
    addps           m14, m15
    addps           m12, m14
    addps           m14, m13
    addps           m13, m15
%endmacro

; PASS6 (x86-64): final additions and stores.
;   First half: store lane 0 of m8-m14 (and all of m15) at a 16-byte
;   stride, move lane 2 of each into m0-m7 (pshuflw selector 0xe puts
;   dword 1 of the qword pair, i.e. words 2-3, at the bottom), add
;   neighbouring registers and store the sums at offset +8 of each stride.
;   Second half: extract the odd lanes, fold them together and store the
;   results at the remaining offsets 4, 12, 20, ... (i advances by 8).
%macro PASS6 0
    SWAP        9, 12
    SWAP        11, 14
    movss       [outq+0x00], m8
    pshuflw     m0, m8, 0xe
    movss       [outq+0x10], m9
    pshuflw     m1, m9, 0xe
    movss       [outq+0x20], m10
    pshuflw     m2, m10, 0xe
    movss       [outq+0x30], m11
    pshuflw     m3, m11, 0xe
    movss       [outq+0x40], m12
    pshuflw     m4, m12, 0xe
    movss       [outq+0x50], m13
    pshuflw     m5, m13, 0xe
    movss       [outq+0x60], m14
    pshuflw     m6, m14, 0xe
    movaps      [outq+0x70], m15
    pshuflw     m7, m15, 0xe
    addss       m0, m1
    addss       m1, m2
    movss       [outq+0x08], m0
    addss       m2, m3
    movss       [outq+0x18], m1
    addss       m3, m4
    movss       [outq+0x28], m2
    addss       m4, m5
    movss       [outq+0x38], m3
    addss       m5, m6
    movss       [outq+0x48], m4
    addss       m6, m7
    movss       [outq+0x58], m5
    movss       [outq+0x68], m6
    movss       [outq+0x78], m7

    PERMUTE     1,8, 3,9, 5,10, 7,11, 9,12, 11,13, 13,14, 8,1, 10,3, 12,5, 14,7
    ; first source register: split off lanes 2/3 (m0) and lane 3 (m1)
    movhlps     m0, m1
    pshufd      m1, m1, 3
    SWAP        0, 2, 4, 6, 8, 10, 12, 14
    SWAP        1, 3, 5, 7, 9, 11, 13, 15
    ; remaining seven: same split, plus accumulate lane 3 into m15;
    ; the SWAPs rotate register names so each iteration hits the next pair
%rep 7
    movhlps     m0, m1
    pshufd      m1, m1, 3
    addss       m15, m1
    SWAP        0, 2, 4, 6, 8, 10, 12, 14
    SWAP        1, 3, 5, 7, 9, 11, 13, 15
%endrep
%assign i 4
%rep 15
    addss       m0, m1
    movss       [outq+i], m0
    SWAP        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
    %assign i i+8
%endrep
%endmacro

%else ; ARCH_X86_32
; On x86-32 spill slots 8-15 live in the output buffer itself:
; slot n is the 16 bytes at out + (n-8)*16.
%macro SPILL 2 ; xmm#, mempos
    movaps      [outq+(%2-8)*16], m%1
%endmacro
%macro UNSPILL 2
    movaps      m%1, [outq+(%2-8)*16]
%endmacro

; The x86-32 build uses the scalar tail for pass 6.
%define PASS6 PASS6_AND_PERMUTE

; PASS5 (x86-32): eight BUTTERFLY3 steps, one per 4-float vector, with the
; results spilled to their slots in out[]. Expects m3 to hold ps_p1p1m1m1
; on entry (loaded in pass 4 of DCT32_FUNC); selector 0xcc turns it into a
; (+,-,+,-) mask.
%macro PASS5 0
    movaps      m2, [ps_cos_vec+160]
    shufps      m3, m3, 0xcc

    BUTTERFLY3  m5, m3, m2, m1
    SPILL       5, 8

    UNSPILL     1, 9
    BUTTERFLY3  m1, m3, m2, m5
    SPILL       1, 14

    BUTTERFLY3  m4, m3, m2, m5
    SPILL       4, 12

    BUTTERFLY3  m7, m3, m2, m5
    SPILL       7, 13

    UNSPILL     5, 10
    BUTTERFLY3  m5, m3, m2, m7
    SPILL       5, 10

    UNSPILL     4, 11
    BUTTERFLY3  m4, m3, m2, m7
    SPILL       4, 11

    BUTTERFLY3  m6, m3, m2, m7
    SPILL       6, 9

    BUTTERFLY3  m0, m3, m2, m7
    SPILL       0, 15
%endmacro
%endif


; void ff_dct32_float_sse(FFTSample *out, const FFTSample *in)
;   DCT32_FUNC emits one dct32_float function for the instruction set
;   selected by the preceding INIT_XMM; it is instantiated at the end of
;   this file for sse (x86-32 builds only) and sse2.
;   Passes 1 and 2 are interleaved to limit the number of live registers;
;   SPILL/UNSPILL park intermediate vectors in slots 8-15. Both pointers
;   are accessed with aligned 16-byte moves.
%macro DCT32_FUNC 0
cglobal dct32_float, 2, 3, 16, out, in, tmp
    ; pass 1

    movaps      m0, [inq+0]
    LOAD_INV    m1, [inq+112]
    BUTTERFLY   m0, m1, [ps_cos_vec], m3

    movaps      m7, [inq+64]
    LOAD_INV    m4, [inq+48]
    BUTTERFLY   m7, m4, [ps_cos_vec+32], m3

    ; pass 2
    movaps      m2, [ps_cos_vec+64]
    BUTTERFLY   m1, m4, m2, m3
    SPILL       1, 11
    SPILL       4, 8

    ; pass 1
    movaps      m1, [inq+16]
    LOAD_INV    m6, [inq+96]
    BUTTERFLY   m1, m6, [ps_cos_vec+16], m3

    movaps      m4, [inq+80]
    LOAD_INV    m5, [inq+32]
    BUTTERFLY   m4, m5, [ps_cos_vec+48], m3

    ; pass 2
    BUTTERFLY   m0, m7, m2, m3

    movaps      m2, [ps_cos_vec+80]
    BUTTERFLY   m6, m5, m2, m3

    BUTTERFLY   m1, m4, m2, m3

    ; pass 3
    movaps      m2, [ps_cos_vec+96]
    shufps      m1, m1, 0x1b
    BUTTERFLY   m0, m1, m2, m3
    SPILL       0, 15
    SPILL       1, 14

    UNSPILL     0, 8
    shufps      m5, m5, 0x1b
    BUTTERFLY   m0, m5, m2, m3

    UNSPILL     1, 11
    shufps      m6, m6, 0x1b
    BUTTERFLY   m1, m6, m2, m3
    SPILL       1, 11

    shufps      m4, m4, 0x1b
    BUTTERFLY   m7, m4, m2, m3

    ; pass 4
    movaps      m3, [ps_p1p1m1m1+0]
    movaps      m2, [ps_cos_vec+128]

    BUTTERFLY2  m5, m3, m2, m1

    BUTTERFLY2  m0, m3, m2, m1
    SPILL       0, 9

    BUTTERFLY2  m6, m3, m2, m1
    SPILL       6, 10

    UNSPILL     0, 11
    BUTTERFLY2  m0, m3, m2, m1
    SPILL       0, 11

    BUTTERFLY2  m4, m3, m2, m1

    BUTTERFLY2  m7, m3, m2, m1

    UNSPILL     6, 14
    BUTTERFLY2  m6, m3, m2, m1

    UNSPILL     0, 15
    BUTTERFLY2  m0, m3, m2, m1

    PASS5
    PASS6
    RET
%endmacro

; LOAD_INV dst, src
;   Load four floats from src into dst in reversed lane order (selector
;   0x1b). SSE2 does it with a single pshufd; plain SSE needs a load
;   followed by an in-place shufps.
%macro LOAD_INV 2
%if cpuflag(sse2)
    pshufd      %1, %2, 0x1b
%elif cpuflag(sse)
    movaps      %1, %2
    shufps      %1, %1, 0x1b
%endif
%endmacro

%if ARCH_X86_32
INIT_XMM sse
DCT32_FUNC
%endif

INIT_XMM sse2
DCT32_FUNC