;******************************************************************************
;* SIMD optimized MPEG-4 Parametric Stereo decoding functions
;*
;* Copyright (C) 2015 James Almer
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; {+0.0, -0.0, +0.0, -0.0} sign mask: xorps with this negates the odd float
; lanes, which is how the plain-SSE paths below emulate SSE3 addsubps.
ps_p1m1p1m1: dd 0, 0x80000000, 0, 0x80000000

SECTION .text

;*************************************************************************
;void ff_ps_add_squares_<opt>(float *dst, const float (*src)[2], int n);
;*************************************************************************
; dst[i] += src[i][0]^2 + src[i][1]^2 for n complex (re,im) float pairs.
; %1 = number of xmm registers to declare to cglobal (the SSE3 variant
;      uses one more for the HADDPS scratch).
%macro PS_ADD_SQUARES 1
cglobal ps_add_squares, 3, 3, %1, dst, src, n
    shl     nd, 3                       ; n pairs -> bytes (8 bytes per float[2])
    add     srcq, nq                    ; point src past the end and walk a
    neg     nq                          ; negative index up to zero, so the
                                        ; index doubles as the loop counter

align 16
.loop:
    movaps  m0, [srcq+nq]               ; {re0,im0,re1,im1}
    movaps  m1, [srcq+nq+mmsize]        ; {re2,im2,re3,im3}
    mulps   m0, m0                      ; square every component
    mulps   m1, m1
    HADDPS  m0, m1, m2                  ; m0 = {re0^2+im0^2, ..., re3^2+im3^2}
                                        ; (m2 is scratch for the SSE fallback)
    addps   m0, [dstq]                  ; accumulate the powers into dst
    movaps  [dstq], m0
    add     dstq, mmsize                ; dst advances half as fast as src:
    add     nq, mmsize*2                ; one output float per input pair
    jl      .loop
    REP_RET
%endmacro

INIT_XMM sse
PS_ADD_SQUARES 2
INIT_XMM sse3
PS_ADD_SQUARES 3

;*******************************************************************
;void ff_ps_mul_pair_single_sse(float (*dst)[2], float (*src0)[2],
;                               float
;                               *src1, int n);
;*******************************************************************
; dst[i] = src0[i] * src1[i]: scale n complex (re,im) pairs by n real
; gains. NOTE(review): cglobal names the arguments src1/src2 where the
; C prototype above says src0/src1 — same order, different names.
INIT_XMM sse
cglobal ps_mul_pair_single, 4, 4, 4, dst, src1, src2, n
    shl     nd, 3                       ; n pairs -> bytes
    add     src1q, nq                   ; end-relative negative indexing for
    add     dstq, nq                    ; the complex buffers; src2 (the real
    neg     nq                          ; gains) advances separately, half as fast

align 16
.loop:
    movu    m0, [src1q+nq]              ; 2 complex inputs (may be unaligned)
    movu    m1, [src1q+nq+mmsize]       ; 2 more
    mova    m2, [src2q]                 ; 4 real gains {g0,g1,g2,g3}
    mova    m3, m2
    unpcklps m2, m2                     ; {g0,g0,g1,g1}: one gain per re/im pair
    unpckhps m3, m3                     ; {g2,g2,g3,g3}
    mulps   m0, m2
    mulps   m1, m3
    mova    [dstq+nq], m0               ; dst must be 16-byte aligned
    mova    [dstq+nq+mmsize], m1
    add     src2q, mmsize
    add     nq, mmsize*2
    jl      .loop
    REP_RET

;***********************************************************************
;void ff_ps_stereo_interpolate_sse3(float (*l)[2], float (*r)[2],
;                                   float h[2][4], float h_step[2][4],
;                                   int len);
;***********************************************************************
; Per sample: step the four mixing coefficients by h_step, then
;   l' = h0*l + h2*r,  r' = h1*l + h3*r   (applied to re and im alike).
INIT_XMM sse3
cglobal ps_stereo_interpolate, 5, 5, 6, l, r, h, h_step, n
    movaps  m0, [hq]                    ; {h0,h1,h2,h3}
    movaps  m1, [h_stepq]               ; {s0,s1,s2,s3}
    unpcklps m4, m0, m0                 ; m4 = {h0,h0,h1,h1}
    unpckhps m0, m0                     ; m0 = {h2,h2,h3,h3}
    unpcklps m5, m1, m1                 ; duplicate the steps the same way
    unpckhps m1, m1
    shl     nd, 3                       ; samples -> bytes (float[2] each)
    add     lq, nq                      ; end-relative negative indexing
    add     rq, nq
    neg     nq

align 16
.loop:
    addps   m4, m5                      ; coefficients are stepped BEFORE use
    addps   m0, m1
    movddup m2, [lq+nq]                 ; {l_re,l_im,l_re,l_im}
    movddup m3, [rq+nq]                 ; {r_re,r_im,r_re,r_im}
    mulps   m2, m4                      ; {h0*l, h1*l} (re,im per half)
    mulps   m3, m0                      ; {h2*r, h3*r}
    addps   m2, m3                      ; low half = new l, high half = new r
    movsd   [lq+nq], m2
    movhps  [rq+nq], m2
    add     nq, 8                       ; one complex sample per iteration
    jl      .loop
    REP_RET

;***************************************************************************
;void ps_stereo_interpolate_ipdopd_sse3(float (*l)[2], float (*r)[2],
;                                       float h[2][4], float h_step[2][4],
;                                       int len);
;***************************************************************************
; Like ps_stereo_interpolate, but with complex mixing coefficients for the
; IPD/OPD phase rotation; the swap/addsub pattern in the loop suggests h[0]
; holds the real parts and h[1] the imaginary parts (verify against the C
; reference in aacpsdsp_template.c).
INIT_XMM sse3
cglobal ps_stereo_interpolate_ipdopd, 5, 5, 10, l, r, h, h_step, n
    movaps  m0, [hq]                    ; h[0][0..3]
    movaps  m1, [hq+mmsize]             ; h[1][0..3]
%if ARCH_X86_64
    movaps  m8, [h_stepq]               ; enough registers: keep the steps
    movaps  m9, [h_stepq+mmsize]        ; resident for the whole loop
    %define H_STEP0 m8
    %define H_STEP1 m9
%else
    %define H_STEP0 [h_stepq]           ; x86_32 only has xmm0-7; reload the
    %define H_STEP1 [h_stepq+mmsize]    ; steps from memory every iteration
%endif
    shl     nd, 3                       ; samples -> bytes
    add     lq, nq                      ; end-relative negative indexing
    add     rq, nq
    neg     nq

align 16
.loop:
    addps   m0, H_STEP0                 ; step both coefficient sets before use
    addps   m1, H_STEP1
    movddup m2, [lq+nq]                 ; {l_re,l_im,l_re,l_im}
    movddup m3, [rq+nq]                 ; {r_re,r_im,r_re,r_im}
    shufps  m4, m2, m2, q2301           ; {l_im,l_re,...}: re/im-swapped copies
    shufps  m5, m3, m3, q2301           ; feed the cross terms of the complex multiply
    unpcklps m6, m0, m0                 ; {h0,h0,h1,h1} from the first set
    unpckhps m7, m0, m0                 ; {h2,h2,h3,h3}
    mulps   m2, m6
    mulps   m3, m7
    unpcklps m6, m1, m1                 ; same duplication for the second set
    unpckhps m7, m1, m1
    mulps   m4, m6
    mulps   m5, m7
    addps   m2, m3                      ; straight (unswapped) contribution
    addsubps m2, m4                     ; -/+ the swapped products: subtract in
    addsubps m2, m5                     ; the even lanes, add in the odd lanes
    movsd   [lq+nq], m2                 ; low half -> new left sample
    movhps  [rq+nq], m2                 ; high half -> new right sample
    add     nq, 8
    jl      .loop
    REP_RET

;**********************************************************
;void ps_hybrid_analysis_ileave_sse(float out[2][38][64],
;                                   float (*in)[32][2],
;                                   int i, int len)
;**********************************************************
; Interleaves input rows into the out[2][38][64] layout via in-register
; transposes.  Only 3 arguments are loaded: the len argument is never read;
; the 32<<3-byte row stride is hard-coded instead.
INIT_XMM sse
cglobal ps_hybrid_analysis_ileave, 3, 7, 5, out, in, i, len, in0, in1, tmp
    movsxdifnidn iq, id                 ; sign-extend i for 64-bit addressing
    mov     lend, 32 << 3               ; hard-coded stride: 32 float[2] = 256 bytes
    lea     inq, [inq+iq*4]
    mov     tmpd, id
    shl     tmpd, 8                     ; i * 256 bytes
    add     outq, tmpq
    mov     tmpd, 64
    sub     tmpd, id
    mov     id, tmpd                    ; i = 64 - i: elements left to process

    test    id, 1                       ; dispatch on the remainder so the bulk
    jne     .loop4                      ; runs 4 elements (one xmm) at a time
    test    id, 2
    jne     .loop8

align 16
.loop16:                                ; main path: 4 elements per outer pass
    mov     in0q, inq
    mov     in1q, 38*64*4               ; second plane is 38*64 floats further
    add     in1q, in0q
    mov     tmpd, lend                  ; tmp counts bytes within one row

.inner_loop16:
    movaps  m0, [in0q]
    movaps  m1, [in1q]
    movaps  m2, [in0q+lenq]
    movaps  m3, [in1q+lenq]
    TRANSPOSE4x4PS 0, 1, 2, 3, 4        ; 4x4 float transpose (m4 = scratch)
    movaps  [outq], m0
    movaps  [outq+lenq], m1
    movaps  [outq+lenq*2], m2
    movaps  [outq+3*32*2*4], m3         ; = lenq*3, not encodable as a SIB scale
    lea     in0q, [in0q+lenq*2]
    lea     in1q, [in1q+lenq*2]
    add     outq, mmsize
    sub     tmpd, mmsize
    jg      .inner_loop16
    add     inq, 16
    add     outq, 3*32*2*4              ; skip past the 3 extra rows written above
    sub     id, 4
    jg      .loop16
    RET

align 16
.loop8:                                 ; 2-element remainder
    mov     in0q, inq
    mov     in1q, 38*64*4
    add     in1q, in0q
    mov     tmpd, lend

.inner_loop8:
    movlps  m0, [in0q]                  ; gather two 8-byte pairs per register
    movlps  m1, [in1q]
    movhps  m0, [in0q+lenq]
    movhps  m1, [in1q+lenq]
    SBUTTERFLYPS 0, 1, 2                ; interleave singles, then doubles:
    SBUTTERFLYPD 0, 1, 2                ; effectively a 4x2 transpose (m2 = scratch)
    movaps  [outq], m0
    movaps  [outq+lenq], m1
    lea     in0q, [in0q+lenq*2]
    lea     in1q, [in1q+lenq*2]
    add     outq, mmsize
    sub     tmpd, mmsize
    jg      .inner_loop8
    add     inq, 8
    add     outq, lenq
    sub     id, 2
    jg      .loop16                     ; remaining count is a multiple of 4 now
    RET

align 16
.loop4:                                 ; single-element remainder
    mov     in0q, inq
    mov     in1q, 38*64*4               ; second plane offset
    add     in1q, in0q
    mov     tmpd, lend

.inner_loop4:
    movss   m0, [in0q]                  ; gather 4 scalars...
    movss   m1, [in1q]
    movss   m2, [in0q+lenq]
    movss   m3, [in1q+lenq]
    movlhps m0, m1                      ; {a,-,b,-}
    movlhps m2, m3                      ; {c,-,d,-}
    shufps  m0, m2, q2020               ; ...into one vector {a,b,c,d}
    movaps  [outq], m0
    lea     in0q, [in0q+lenq*2]
    lea     in1q, [in1q+lenq*2]
    add     outq, mmsize
    sub     tmpd, mmsize
    jg      .inner_loop4
    add     inq, 4
    sub     id, 1
    test    id, 2                       ; odd element handled; re-dispatch on
    jne     .loop8                      ; what is left (2 mod 4, or multiple of 4)
    cmp     id, 4
    jge     .loop16
    RET

;***********************************************************
;void ps_hybrid_synthesis_deint_sse4(float out[2][38][64],
;                                    float (*in)[32][2],
;                                    int i, int len)
;***********************************************************
; Counterpart of ps_hybrid_analysis_ileave: de-interleaves in into the two
; out planes.  Same overall structure: 16/8/4-wide paths picked by the
; remainder of (64 - i); the len argument is likewise never read.
%macro HYBRID_SYNTHESIS_DEINT 0
cglobal ps_hybrid_synthesis_deint, 3, 7, 5, out, in, i, len, out0, out1, tmp
%if cpuflag(sse4)
%define MOVH movsd                      ; both variants store the low 64 bits;
%else
%define MOVH movlps                     ; only the mnemonic differs per flavor
%endif
    movsxdifnidn iq, id                 ; sign-extend i for 64-bit addressing
    mov     lend, 32 << 3               ; hard-coded stride: 32 float[2] = 256 bytes
    lea     outq, [outq+iq*4]
    mov     tmpd, id
    shl     tmpd, 8                     ; i * 256 bytes
    add     inq, tmpq
    mov     tmpd, 64
    sub     tmpd, id
    mov     id, tmpd                    ; i = 64 - i: elements left to process

    test    id, 1                       ; same remainder dispatch as ileave
    jne     .loop4
    test    id, 2
    jne     .loop8

align 16
.loop16:                                ; main path: 4 elements per outer pass
    mov     out0q, outq
    mov     out1q, 38*64*4              ; second output plane
    add     out1q, out0q
    mov     tmpd, lend

.inner_loop16:
    movaps  m0, [inq]
    movaps  m1, [inq+lenq]
    movaps  m2, [inq+lenq*2]
    movaps  m3, [inq+3*32*2*4]          ; = lenq*3, not encodable as a SIB scale
    TRANSPOSE4x4PS 0, 1, 2, 3, 4        ; 4x4 float transpose (m4 = scratch)
    movaps  [out0q], m0
    movaps  [out1q], m1
    movaps  [out0q+lenq], m2
    movaps  [out1q+lenq], m3
    lea     out0q, [out0q+lenq*2]
    lea     out1q, [out1q+lenq*2]
    add     inq, mmsize
    sub     tmpd, mmsize
    jg      .inner_loop16
    add     outq, 16
    add     inq, 3*32*2*4               ; skip past the 3 extra rows read above
    sub     id, 4
    jg      .loop16
    RET

align 16
.loop8:                                 ; 2-element remainder
    mov     out0q, outq
    mov     out1q, 38*64*4
    add     out1q, out0q
    mov     tmpd, lend
.inner_loop8:
    movaps  m0, [inq]
    movaps  m1, [inq+lenq]
    SBUTTERFLYPS 0, 1, 2                ; interleave singles, then doubles:
    SBUTTERFLYPD 0, 1, 2                ; a 4x2 transpose (m2 = scratch)
    MOVH    [out0q], m0                 ; low 64 bits to this row...
    MOVH    [out1q], m1
    movhps  [out0q+lenq], m0            ; ...high 64 bits to the next row
    movhps  [out1q+lenq], m1
    lea     out0q, [out0q+lenq*2]
    lea     out1q, [out1q+lenq*2]
    add     inq, mmsize
    sub     tmpd, mmsize
    jg      .inner_loop8
    add     outq, 8
    add     inq, lenq
    sub     id, 2
    jg      .loop16                     ; remaining count is a multiple of 4 now
    RET

align 16
.loop4:                                 ; single-element remainder
    mov     out0q, outq
    mov     out1q, 38*64*4              ; second output plane
    add     out1q, out0q
    mov     tmpd, lend

.inner_loop4:
    movaps  m0, [inq]                   ; scatter 4 lanes to 4 destinations
    movss   [out0q], m0                 ; lane 0
%if cpuflag(sse4)
    extractps [out1q], m0, 1            ; SSE4: store lanes 1..3 directly
    extractps [out0q+lenq], m0, 2
    extractps [out1q+lenq], m0, 3
%else
    movhlps m1, m0                      ; SSE: shuffle each lane down to lane 0
    movss   [out0q+lenq], m1            ; lane 2
    shufps  m0, m0, 0xb1                ; pairwise lane swap: {1,0,3,2}
    movss   [out1q], m0                 ; lane 1
    movhlps m1, m0
    movss   [out1q+lenq], m1            ; lane 3
%endif
    lea     out0q, [out0q+lenq*2]
    lea     out1q, [out1q+lenq*2]
    add     inq, mmsize
    sub     tmpd, mmsize
    jg      .inner_loop4
    add     outq, 4
    sub     id, 1
    test    id, 2                       ; odd element handled; re-dispatch on
    jne     .loop8                      ; what is left
    cmp     id, 4
    jge     .loop16
    RET
%endmacro

INIT_XMM sse
HYBRID_SYNTHESIS_DEINT
INIT_XMM sse4
HYBRID_SYNTHESIS_DEINT

;*******************************************************************
;void ff_ps_hybrid_analysis_<opt>(float (*out)[2], float (*in)[2],
;                                 const float (*filter)[8][2],
;                                 ptrdiff_t stride, int n);
;*******************************************************************
; One unrolled step of the complex analysis filter.  Pairs two input
; samples from the front of the 13-sample window with two mirrored
; samples from the back, multiplying both by the SAME coefficient block
; (i.e. the filter taps are treated as symmetric); the centre sample
; in[6] is handled separately after the unrolled loop.
; %1 = register receiving the in-order products
; %2 = register receiving the re/im-swapped products
; %3 = unroll index (0..2): front block %3, mirrored back block 5-%3
%macro PS_HYBRID_ANALYSIS_LOOP 3
    movu    %1, [inq+mmsize*%3]         ; two pairs from the front of the window
    movu    m1, [inq+mmsize*(5-%3)+8]   ; two mirrored pairs from the back
%if cpuflag(sse3)
    pshufd  %2, %1, q2301               ; front pairs with re/im swapped
    pshufd  m4, m1, q0123               ; back pairs fully reversed
    pshufd  m1, m1, q1032               ; back pairs in reverse order, re/im kept
    pshufd  m2, [filterq+nq+mmsize*%3], q2301 ; coefficients, re/im swapped
    addsubps %2, m4                     ; fold front and back together with
    addsubps %1, m1                     ; -/+ per lane (complex conjugate trick)
%else
    mova    m2, [filterq+nq+mmsize*%3]
    mova    %2, %1
    mova    m4, m1
    shufps  %2, %2, q2301               ; same shuffles as the SSE3 path
    shufps  m4, m4, q0123
    shufps  m1, m1, q1032
    shufps  m2, m2, q2301
    xorps   m4, m7                      ; emulate addsubps: negate the odd
    xorps   m1, m7                      ; lanes, then a plain subps gives
    subps   %2, m4                      ; {a0-b0, a1+b1, a2-b2, a3+b3}
    subps   %1, m1
%endif
    mulps   %2, m2                      ; scale both variants by the (swapped)
    mulps   %1, m2                      ; filter coefficients
%if %3
    addps   m3, %2                      ; iterations 1 and 2 accumulate onto
    addps   m0, %1                      ; the totals started by iteration 0
%endif
%endmacro

; For each of the n filter rows: run the three unrolled tap steps above,
; reduce the two partial vectors to one complex (re,im) sum, add the
; centre-tap contribution filter[n][6][0] * in[6], and store one complex
; output, advancing out by stride complex elements.
%macro PS_HYBRID_ANALYSIS 0
cglobal ps_hybrid_analysis, 5, 5, 8, out, in, filter, stride, n
%if cpuflag(sse3)
%define MOVH movsd                      ; both variants store the low 64 bits
%else
%define MOVH movlps
%endif
    shl     strideq, 3                  ; stride in float[2] units -> bytes
    shl     nd, 6                       ; one filter row = 8*2 floats = 64 bytes
    add     filterq, nq                 ; usual end-relative negative indexing
    neg     nq
    mova    m7, [ps_p1m1p1m1]           ; {+,-,+,-} sign mask: addsub emulation
                                        ; on SSE, pre-hsubps flip on SSE3

align 16
.loop:
    ; accumulate taps for samples in[0..5] / in[7..12] into m0 (in-order)
    ; and m3 (re/im-swapped)
    PS_HYBRID_ANALYSIS_LOOP m0, m3, 0
    PS_HYBRID_ANALYSIS_LOOP m5, m6, 1
    PS_HYBRID_ANALYSIS_LOOP m5, m6, 2

%if cpuflag(sse3)
    pshufd  m3, m3, q2301
    xorps   m0, m7
    hsubps  m3, m0                      ; horizontal reduce of the partials
    pshufd  m1, m3, q0020
    pshufd  m3, m3, q0031
    addps   m1, m3                      ; low pair of m1 = (re,im) sum so far
    movsd   m2, [inq+6*8]               ; centre sample in[6]
%else
    mova    m1, m3
    mova    m2, m0
    shufps  m1, m1, q2301
    shufps  m2, m2, q2301
    subps   m1, m3
    addps   m2, m0
    unpcklps m3, m1, m2
    unpckhps m1, m2
    addps   m1, m3                      ; same reduction without SSE3
    movu    m2, [inq+6*8]               ; faster than movlps and no risk of overread
%endif
    movss   m3, [filterq+nq+8*6]        ; centre tap coefficient filter[n][6][0]
    SPLATD  m3                          ; broadcast it to all 4 lanes
    mulps   m2, m3
    addps   m1, m2                      ; add the centre-tap contribution
    MOVH    [outq], m1                  ; store one complex (re,im) output
    add     outq, strideq
    add     nq, 64                      ; next filter row
    jl      .loop
    REP_RET
%endmacro

INIT_XMM sse
PS_HYBRID_ANALYSIS
INIT_XMM sse3
PS_HYBRID_ANALYSIS