;*****************************************************************************
;* x86-optimized Float DSP functions
;*
;* Copyright 2006 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86util.asm"

SECTION_RODATA 32
pd_reverse: dd 7, 6, 5, 4, 3, 2, 1, 0

SECTION .text

;-----------------------------------------------------------------------------
; void vector_fmul(float *dst, const float *src0, const float *src1, int len)
;-----------------------------------------------------------------------------
%macro VECTOR_FMUL 0
cglobal vector_fmul, 4,4,2, dst, src0, src1, len
    lea       lenq, [lend*4 - 64]
ALIGN 16
.loop:
%assign a 0
%rep 32/mmsize
    mova      m0, [src0q + lenq + (a+0)*mmsize]
    mova      m1, [src0q + lenq + (a+1)*mmsize]
    mulps     m0, m0, [src1q + lenq + (a+0)*mmsize]
    mulps     m1, m1, [src1q + lenq + (a+1)*mmsize]
    mova      [dstq + lenq + (a+0)*mmsize], m0
    mova      [dstq + lenq + (a+1)*mmsize], m1
%assign a a+2
%endrep

    sub       lenq, 64
    jge       .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL
%endif

;-----------------------------------------------------------------------------
; void vector_dmul(double *dst, const double *src0, const double *src1, int len)
;-----------------------------------------------------------------------------
%macro VECTOR_DMUL 0
cglobal vector_dmul, 4,4,4, dst, src0, src1, len
    ; address the arrays from the end; use the 32-bit source register so the
    ; upper half of lenq cannot contain stale bits
    lea       lenq, [lend*8 - mmsize*4]
ALIGN 16
.loop:
    movaps    m0, [src0q + lenq + 0*mmsize]
    movaps    m1, [src0q + lenq + 1*mmsize]
    movaps    m2, [src0q + lenq + 2*mmsize]
    movaps    m3, [src0q + lenq + 3*mmsize]
    mulpd     m0, m0, [src1q + lenq + 0*mmsize]
    mulpd     m1, m1, [src1q + lenq + 1*mmsize]
    mulpd     m2, m2, [src1q + lenq + 2*mmsize]
    mulpd     m3, m3, [src1q + lenq + 3*mmsize]
    movaps    [dstq + lenq + 0*mmsize], m0
    movaps    [dstq + lenq + 1*mmsize], m1
    movaps    [dstq + lenq + 2*mmsize], m2
    movaps    [dstq + lenq + 3*mmsize], m3

    sub       lenq, mmsize*4
    jge       .loop
    RET
%endmacro

INIT_XMM sse2
VECTOR_DMUL
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_DMUL
%endif
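
; Rough C sketch of what the two routines above compute, for reference only
; (not assembled; the *_c name below is illustrative). The assembly loops
; additionally assume aligned pointers and a len that is a multiple of the
; unroll width; the sketch ignores those constraints.
;
;     static void vector_fmul_c(float *dst, const float *src0,
;                               const float *src1, int len)
;     {
;         for (int i = 0; i < len; i++)
;             dst[i] = src0[i] * src1[i];   /* vector_dmul is the same with doubles */
;     }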

;------------------------------------------------------------------------------
; void ff_vector_fmac_scalar(float *dst, const float *src, float mul, int len)
;------------------------------------------------------------------------------

%macro VECTOR_FMAC_SCALAR 0
%if UNIX64
cglobal vector_fmac_scalar, 3,3,5, dst, src, len
%else
cglobal vector_fmac_scalar, 4,4,5, dst, src, mul, len
%endif
%if ARCH_X86_32
    VBROADCASTSS m0, mulm
%else
%if WIN64
    SWAP 0, 2
%endif
    shufps      xm0, xm0, 0
%if cpuflag(avx)
    vinsertf128  m0, m0, xm0, 1
%endif
%endif
    lea    lenq, [lend*4-64]
.loop:
%if cpuflag(fma3)
    mova     m1,     [dstq+lenq]
    mova     m2,     [dstq+lenq+1*mmsize]
    fmaddps  m1, m0, [srcq+lenq], m1
    fmaddps  m2, m0, [srcq+lenq+1*mmsize], m2
%else ; cpuflag
    mulps    m1, m0, [srcq+lenq]
    mulps    m2, m0, [srcq+lenq+1*mmsize]
%if mmsize < 32
    mulps    m3, m0, [srcq+lenq+2*mmsize]
    mulps    m4, m0, [srcq+lenq+3*mmsize]
%endif ; mmsize
    addps    m1, m1, [dstq+lenq]
    addps    m2, m2, [dstq+lenq+1*mmsize]
%if mmsize < 32
    addps    m3, m3, [dstq+lenq+2*mmsize]
    addps    m4, m4, [dstq+lenq+3*mmsize]
%endif ; mmsize
%endif ; cpuflag
    mova  [dstq+lenq],          m1
    mova  [dstq+lenq+1*mmsize], m2
%if mmsize < 32
    mova  [dstq+lenq+2*mmsize], m3
    mova  [dstq+lenq+3*mmsize], m4
%endif ; mmsize
    sub    lenq, 64
    jge .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMAC_SCALAR
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMAC_SCALAR
%endif
%if HAVE_FMA3_EXTERNAL
INIT_YMM fma3
VECTOR_FMAC_SCALAR
%endif

;------------------------------------------------------------------------------
; void ff_vector_fmul_scalar(float *dst, const float *src, float mul, int len)
;------------------------------------------------------------------------------

%macro VECTOR_FMUL_SCALAR 0
%if UNIX64
cglobal vector_fmul_scalar, 3,3,2, dst, src, len
%else
cglobal vector_fmul_scalar, 4,4,3, dst, src, mul, len
%endif
%if ARCH_X86_32
    movss    m0, mulm
%elif WIN64
    SWAP 0, 2
%endif
    shufps   m0, m0, 0
    lea    lenq, [lend*4-mmsize]
.loop:
    mova     m1, [srcq+lenq]
    mulps    m1, m0
    mova  [dstq+lenq], m1
    sub    lenq, mmsize
    jge .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL_SCALAR

;------------------------------------------------------------------------------
; void ff_vector_dmac_scalar(double *dst, const double *src, double mul,
;                            int len)
;------------------------------------------------------------------------------

%macro VECTOR_DMAC_SCALAR 0
%if ARCH_X86_32
cglobal vector_dmac_scalar, 2,4,5, dst, src, mul, len, lenaddr
    mov          lenq, lenaddrm
    VBROADCASTSD m0, mulm
%else
%if UNIX64
cglobal vector_dmac_scalar, 3,3,5, dst, src, len
%else
cglobal vector_dmac_scalar, 4,4,5, dst, src, mul, len
    SWAP 0, 2
%endif
    movlhps     xm0, xm0
%if cpuflag(avx)
    vinsertf128  m0, m0, xm0, 1
%endif
%endif
    lea    lenq, [lend*8-mmsize*4]
.loop:
%if cpuflag(fma3)
    movaps   m1,     [dstq+lenq]
    movaps   m2,     [dstq+lenq+1*mmsize]
    movaps   m3,     [dstq+lenq+2*mmsize]
    movaps   m4,     [dstq+lenq+3*mmsize]
    fmaddpd  m1, m0, [srcq+lenq], m1
    fmaddpd  m2, m0, [srcq+lenq+1*mmsize], m2
    fmaddpd  m3, m0, [srcq+lenq+2*mmsize], m3
    fmaddpd  m4, m0, [srcq+lenq+3*mmsize], m4
%else ; cpuflag
    mulpd    m1, m0, [srcq+lenq]
    mulpd    m2, m0, [srcq+lenq+1*mmsize]
    mulpd    m3, m0, [srcq+lenq+2*mmsize]
    mulpd    m4, m0, [srcq+lenq+3*mmsize]
    addpd    m1, m1, [dstq+lenq]
    addpd    m2, m2, [dstq+lenq+1*mmsize]
    addpd    m3, m3, [dstq+lenq+2*mmsize]
    addpd    m4, m4, [dstq+lenq+3*mmsize]
%endif ; cpuflag
    movaps  [dstq+lenq],          m1
    movaps  [dstq+lenq+1*mmsize], m2
    movaps  [dstq+lenq+2*mmsize], m3
    movaps  [dstq+lenq+3*mmsize], m4
    sub    lenq, mmsize*4
    jge .loop
    REP_RET
%endmacro

INIT_XMM sse2
VECTOR_DMAC_SCALAR
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_DMAC_SCALAR
%endif
%if HAVE_FMA3_EXTERNAL
INIT_YMM fma3
VECTOR_DMAC_SCALAR
%endif
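
; Rough C sketch of the scalar multiply/accumulate routines above, for
; reference only (not assembled; the *_c name is illustrative).
; ff_vector_fmac_scalar accumulates, ff_vector_fmul_scalar only multiplies,
; and ff_vector_dmac_scalar is the same accumulation over doubles.
;
;     static void vector_fmac_scalar_c(float *dst, const float *src,
;                                      float mul, int len)
;     {
;         for (int i = 0; i < len; i++)
;             dst[i] += src[i] * mul;
;     }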

;------------------------------------------------------------------------------
; void ff_vector_dmul_scalar(double *dst, const double *src, double mul,
;                            int len)
;------------------------------------------------------------------------------

%macro VECTOR_DMUL_SCALAR 0
%if ARCH_X86_32
cglobal vector_dmul_scalar, 3,4,3, dst, src, mul, len, lenaddr
    mov          lenq, lenaddrm
%elif UNIX64
cglobal vector_dmul_scalar, 3,3,3, dst, src, len
%else
cglobal vector_dmul_scalar, 4,4,3, dst, src, mul, len
%endif
%if ARCH_X86_32
    VBROADCASTSD m0, mulm
%else
%if WIN64
    SWAP 0, 2
%endif
    movlhps      xm0, xm0
%if cpuflag(avx)
    vinsertf128  ym0, ym0, xm0, 1
%endif
%endif
    lea          lenq, [lend*8-2*mmsize]
.loop:
    mulpd        m1, m0, [srcq+lenq       ]
    mulpd        m2, m0, [srcq+lenq+mmsize]
    movaps  [dstq+lenq       ], m1
    movaps  [dstq+lenq+mmsize], m2
    sub          lenq, 2*mmsize
    jge .loop
    REP_RET
%endmacro

INIT_XMM sse2
VECTOR_DMUL_SCALAR
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_DMUL_SCALAR
%endif

;-----------------------------------------------------------------------------
; vector_fmul_window(float *dst, const float *src0,
;                    const float *src1, const float *win, int len);
;-----------------------------------------------------------------------------
%macro VECTOR_FMUL_WINDOW 0
cglobal vector_fmul_window, 5, 6, 6, dst, src0, src1, win, len, len1
    shl     lend, 2
    lea    len1q, [lenq - mmsize]
    add    src0q, lenq
    add     dstq, lenq
    add     winq, lenq
    neg     lenq
.loop:
    mova      m0, [winq  + lenq]
    mova      m4, [src0q + lenq]
%if cpuflag(sse)
    mova      m1, [winq  + len1q]
    mova      m5, [src1q + len1q]
    shufps    m1, m1, 0x1b
    shufps    m5, m5, 0x1b
    mova      m2, m0
    mova      m3, m1
    mulps     m2, m4
    mulps     m3, m5
    mulps     m1, m4
    mulps     m0, m5
    addps     m2, m3
    subps     m1, m0
    shufps    m2, m2, 0x1b
%else
    pswapd    m1, [winq  + len1q]
    pswapd    m5, [src1q + len1q]
    mova      m2, m0
    mova      m3, m1
    pfmul     m2, m4
    pfmul     m3, m5
    pfmul     m1, m4
    pfmul     m0, m5
    pfadd     m2, m3
    pfsub     m1, m0
    pswapd    m2, m2
%endif
    mova      [dstq + lenq],  m1
    mova      [dstq + len1q], m2
    sub    len1q, mmsize
    add     lenq, mmsize
    jl .loop
%if mmsize == 8
    femms
%endif
    REP_RET
%endmacro

INIT_MMX 3dnowext
VECTOR_FMUL_WINDOW
INIT_XMM sse
VECTOR_FMUL_WINDOW

;-----------------------------------------------------------------------------
; vector_fmul_add(float *dst, const float *src0, const float *src1,
;                 const float *src2, int len)
;-----------------------------------------------------------------------------
%macro VECTOR_FMUL_ADD 0
cglobal vector_fmul_add, 5,5,4, dst, src0, src1, src2, len
    lea       lenq, [lend*4 - 2*mmsize]
ALIGN 16
.loop:
    mova      m0, [src0q + lenq]
    mova      m1, [src0q + lenq + mmsize]
%if cpuflag(fma3)
    mova      m2, [src2q + lenq]
    mova      m3, [src2q + lenq + mmsize]
    fmaddps   m0, m0, [src1q + lenq], m2
    fmaddps   m1, m1, [src1q + lenq + mmsize], m3
%else
    mulps     m0, m0, [src1q + lenq]
    mulps     m1, m1, [src1q + lenq + mmsize]
    addps     m0, m0, [src2q + lenq]
    addps     m1, m1, [src2q + lenq + mmsize]
%endif
    mova      [dstq + lenq], m0
    mova      [dstq + lenq + mmsize], m1

    sub       lenq, 2*mmsize
    jge       .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL_ADD
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL_ADD
%endif
%if HAVE_FMA3_EXTERNAL
INIT_YMM fma3
VECTOR_FMUL_ADD
%endif
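
; Rough C sketch of vector_fmul_window above, for reference only (not
; assembled; the *_c name is illustrative). It walks the window from both
; ends at once, which is what the forward/reverse loads and the 0x1b
; shuffles implement; vector_fmul_add is simply dst[i] = src0[i]*src1[i] + src2[i].
;
;     static void vector_fmul_window_c(float *dst, const float *src0,
;                                      const float *src1, const float *win,
;                                      int len)
;     {
;         dst  += len;
;         win  += len;
;         src0 += len;
;         for (int i = -len, j = len - 1; i < 0; i++, j--) {
;             float s0 = src0[i], s1 = src1[j];
;             float wi = win[i],  wj = win[j];
;             dst[i] = s0 * wj - s1 * wi;
;             dst[j] = s0 * wi + s1 * wj;
;         }
;     }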

;-----------------------------------------------------------------------------
; void vector_fmul_reverse(float *dst, const float *src0, const float *src1,
;                          int len)
;-----------------------------------------------------------------------------
%macro VECTOR_FMUL_REVERSE 0
cglobal vector_fmul_reverse, 4,4,2, dst, src0, src1, len
%if cpuflag(avx2)
    movaps  m2, [pd_reverse]
%endif
    lea       lenq, [lend*4 - 2*mmsize]
ALIGN 16
.loop:
%if cpuflag(avx2)
    vpermps     m0, m2, [src1q]
    vpermps     m1, m2, [src1q+mmsize]
%elif cpuflag(avx)
    vmovaps     xmm0, [src1q + 16]
    vinsertf128 m0, m0, [src1q], 1
    vshufps     m0, m0, m0, q0123
    vmovaps     xmm1, [src1q + mmsize + 16]
    vinsertf128 m1, m1, [src1q + mmsize], 1
    vshufps     m1, m1, m1, q0123
%else
    mova        m0, [src1q]
    mova        m1, [src1q + mmsize]
    shufps      m0, m0, q0123
    shufps      m1, m1, q0123
%endif
    mulps       m0, m0, [src0q + lenq + mmsize]
    mulps       m1, m1, [src0q + lenq]
    movaps      [dstq + lenq + mmsize], m0
    movaps      [dstq + lenq], m1
    add         src1q, 2*mmsize
    sub         lenq,  2*mmsize
    jge         .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL_REVERSE
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL_REVERSE
%endif
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
VECTOR_FMUL_REVERSE
%endif

; float scalarproduct_float_sse(const float *v1, const float *v2, int len)
INIT_XMM sse
cglobal scalarproduct_float, 3,3,2, v1, v2, offset
    shl   offsetd, 2
    add       v1q, offsetq
    add       v2q, offsetq
    neg   offsetq
    xorps    xmm0, xmm0
.loop:
    movaps   xmm1, [v1q+offsetq]
    mulps    xmm1, [v2q+offsetq]
    addps    xmm0, xmm1
    add   offsetq, 16
    js .loop
    movhlps  xmm1, xmm0
    addps    xmm0, xmm1
    movss    xmm1, xmm0
    shufps   xmm0, xmm0, 1
    addss    xmm0, xmm1
%if ARCH_X86_64 == 0
    movss     r0m,  xmm0
    fld dword r0m
%endif
    RET

;-----------------------------------------------------------------------------
; void ff_butterflies_float(float *src0, float *src1, int len);
;-----------------------------------------------------------------------------
INIT_XMM sse
cglobal butterflies_float, 3,3,3, src0, src1, len
    shl     lend, 2
    add    src0q, lenq
    add    src1q, lenq
    neg     lenq
.loop:
    mova    m0, [src0q + lenq]
    mova    m1, [src1q + lenq]
    subps   m2, m0, m1
    addps   m0, m0, m1
    mova    [src1q + lenq], m2
    mova    [src0q + lenq], m0
    add     lenq, mmsize
    jl .loop
    REP_RET
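
; Rough C sketch of the remaining routines, for reference only (not
; assembled; the *_c name is illustrative). vector_fmul_reverse computes
; dst[i] = src0[i] * src1[len-1-i], butterflies_float replaces each pair
; (src0[i], src1[i]) with (src0[i]+src1[i], src0[i]-src1[i]) in place, and
; scalarproduct_float is a plain dot product:
;
;     static float scalarproduct_float_c(const float *v1, const float *v2,
;                                        int len)
;     {
;         float p = 0.0f;
;         for (int i = 0; i < len; i++)
;             p += v1[i] * v2[i];
;         return p;
;     }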