;******************************************************************************
;* x86 optimized channel mixing
;* Copyright (c) 2012 Justin Ruggles <justin.ruggles@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"
%include "util.asm"

SECTION .text

;-----------------------------------------------------------------------------
; void ff_mix_2_to_1_fltp_flt(float **src, float **matrix, int len,
;                             int out_ch, int in_ch);
;-----------------------------------------------------------------------------
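; Reference sketch of the per-sample operation (C-like pseudocode, inferred
; from the loads and stores below, not authoritative):
;   src[0][i] = matrix[0][0] * src[0][i] + matrix[0][1] * src[1][i];
; src[1] is kept as an offset from src[0] so a single pointer increment
; advances both input channels.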

%macro MIX_2_TO_1_FLTP_FLT 0
cglobal mix_2_to_1_fltp_flt, 3,4,6, src, matrix, len, src1
    mov       src1q, [srcq+gprsize]
    mov        srcq, [srcq        ]
    sub       src1q, srcq
    mov     matrixq, [matrixq  ]
    VBROADCASTSS m4, [matrixq  ]
    VBROADCASTSS m5, [matrixq+4]
    ALIGN 16
.loop:
    mulps        m0, m4, [srcq             ]
    mulps        m1, m5, [srcq+src1q       ]
    mulps        m2, m4, [srcq+      mmsize]
    mulps        m3, m5, [srcq+src1q+mmsize]
    addps        m0, m0, m1
    addps        m2, m2, m3
    mova  [srcq       ], m0
    mova  [srcq+mmsize], m2
    add        srcq, mmsize*2
    sub        lend, mmsize*2/4
    jg .loop
    REP_RET
%endmacro

INIT_XMM sse
MIX_2_TO_1_FLTP_FLT
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
MIX_2_TO_1_FLTP_FLT
%endif

;-----------------------------------------------------------------------------
; void ff_mix_2_to_1_s16p_flt(int16_t **src, float **matrix, int len,
;                             int out_ch, int in_ch);
;-----------------------------------------------------------------------------

%macro MIX_2_TO_1_S16P_FLT 0
cglobal mix_2_to_1_s16p_flt, 3,4,6, src, matrix, len, src1
    mov       src1q, [srcq+gprsize]
    mov        srcq, [srcq]
    sub       src1q, srcq
    mov     matrixq, [matrixq  ]
    VBROADCASTSS m4, [matrixq  ]
    VBROADCASTSS m5, [matrixq+4]
    ALIGN 16
.loop:
    mova         m0, [srcq      ]
    mova         m2, [srcq+src1q]
    S16_TO_S32_SX 0, 1
    S16_TO_S32_SX 2, 3
    cvtdq2ps     m0, m0
    cvtdq2ps     m1, m1
    cvtdq2ps     m2, m2
    cvtdq2ps     m3, m3
    mulps        m0, m4
    mulps        m1, m4
    mulps        m2, m5
    mulps        m3, m5
    addps        m0, m2
    addps        m1, m3
    cvtps2dq     m0, m0
    cvtps2dq     m1, m1
    packssdw     m0, m1
    mova     [srcq], m0
    add        srcq, mmsize
    sub        lend, mmsize/2
    jg .loop
    REP_RET
%endmacro

INIT_XMM sse2
MIX_2_TO_1_S16P_FLT
INIT_XMM sse4
MIX_2_TO_1_S16P_FLT

;-----------------------------------------------------------------------------
; void ff_mix_2_to_1_s16p_q8(int16_t **src, int16_t **matrix, int len,
;                            int out_ch, int in_ch);
;-----------------------------------------------------------------------------

INIT_XMM sse2
cglobal mix_2_to_1_s16p_q8, 3,4,6, src, matrix, len, src1
    mov       src1q, [srcq+gprsize]
    mov        srcq, [srcq]
    sub       src1q, srcq
    mov     matrixq, [matrixq]
    movd         m4, [matrixq]
    movd         m5, [matrixq]
    SPLATW       m4, m4, 0
    SPLATW       m5, m5, 1
    pxor         m0, m0
    punpcklwd    m4, m0
    punpcklwd    m5, m0
    ALIGN 16
.loop:
    mova         m0, [srcq      ]
    mova         m2, [srcq+src1q]
    punpckhwd    m1, m0, m0
    punpcklwd    m0, m0
    punpckhwd    m3, m2, m2
    punpcklwd    m2, m2
    pmaddwd      m0, m4
    pmaddwd      m1, m4
    pmaddwd      m2, m5
    pmaddwd      m3, m5
    paddd        m0, m2
    paddd        m1, m3
    psrad        m0, 8
    psrad        m1, 8
    packssdw     m0, m1
    mova     [srcq], m0
    add        srcq, mmsize
    sub        lend, mmsize/2
    jg .loop
    REP_RET

;-----------------------------------------------------------------------------
; void ff_mix_1_to_2_fltp_flt(float **src, float **matrix, int len,
;                             int out_ch, int in_ch);
;-----------------------------------------------------------------------------
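; Reference sketch of the per-sample operation (C-like pseudocode, inferred
; from the loads and stores below, not authoritative):
;   float in  = src[0][i];
;   src[0][i] = matrix[0][0] * in;
;   src[1][i] = matrix[1][0] * in;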

%macro MIX_1_TO_2_FLTP_FLT 0
cglobal mix_1_to_2_fltp_flt, 3,5,4, src0, matrix0, len, src1, matrix1
    mov       src1q, [src0q+gprsize]
    mov       src0q, [src0q]
    sub       src1q, src0q
    mov    matrix1q, [matrix0q+gprsize]
    mov    matrix0q, [matrix0q]
    VBROADCASTSS m2, [matrix0q]
    VBROADCASTSS m3, [matrix1q]
    ALIGN 16
.loop:
    mova         m0, [src0q]
    mulps        m1, m0, m3
    mulps        m0, m0, m2
    mova  [src0q      ], m0
    mova  [src0q+src1q], m1
    add       src0q, mmsize
    sub        lend, mmsize/4
    jg .loop
    REP_RET
%endmacro

INIT_XMM sse
MIX_1_TO_2_FLTP_FLT
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
MIX_1_TO_2_FLTP_FLT
%endif

;-----------------------------------------------------------------------------
; void ff_mix_1_to_2_s16p_flt(int16_t **src, float **matrix, int len,
;                             int out_ch, int in_ch);
;-----------------------------------------------------------------------------

%macro MIX_1_TO_2_S16P_FLT 0
cglobal mix_1_to_2_s16p_flt, 3,5,6, src0, matrix0, len, src1, matrix1
    mov       src1q, [src0q+gprsize]
    mov       src0q, [src0q]
    sub       src1q, src0q
    mov    matrix1q, [matrix0q+gprsize]
    mov    matrix0q, [matrix0q]
    VBROADCASTSS m4, [matrix0q]
    VBROADCASTSS m5, [matrix1q]
    ALIGN 16
.loop:
    mova         m0, [src0q]
    S16_TO_S32_SX 0, 2
    cvtdq2ps     m0, m0
    cvtdq2ps     m2, m2
    mulps        m1, m0, m5
    mulps        m0, m0, m4
    mulps        m3, m2, m5
    mulps        m2, m2, m4
    cvtps2dq     m0, m0
    cvtps2dq     m1, m1
    cvtps2dq     m2, m2
    cvtps2dq     m3, m3
    packssdw     m0, m2
    packssdw     m1, m3
    mova  [src0q      ], m0
    mova  [src0q+src1q], m1
    add       src0q, mmsize
    sub        lend, mmsize/2
    jg .loop
    REP_RET
%endmacro

INIT_XMM sse2
MIX_1_TO_2_S16P_FLT
INIT_XMM sse4
MIX_1_TO_2_S16P_FLT
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
MIX_1_TO_2_S16P_FLT
%endif

;-----------------------------------------------------------------------------
; void ff_mix_3_8_to_1_2_fltp/s16p_flt(float/int16_t **src, float **matrix,
;                                      int len, int out_ch, int in_ch);
;-----------------------------------------------------------------------------
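; Reference sketch of the per-sample operation for N = 3..8 input channels
; (C-like pseudocode, inferred from the macro below, not authoritative):
;   sum0 = 0; sum1 = 0;
;   for (c = 0; c < N; c++) {
;       sum0 += matrix[0][c] * src[c][i];
;       if (out_ch == 2)
;           sum1 += matrix[1][c] * src[c][i];
;   }
;   src[0][i] = sum0;            // s16p: converted back with saturation
;   if (out_ch == 2)
;       src[1][i] = sum1;
; Matrix coefficients that do not fit in mm registers are splatted onto the
; stack, and on x86-32 with 7-8 input channels some source pointers are
; reloaded from the stack inside the loop.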

%macro MIX_3_8_TO_1_2_FLT 3 ; %1 = in channels, %2 = out channels, %3 = s16p or fltp
; define some names to make the code clearer
%assign  in_channels %1
%assign out_channels %2
%assign stereo out_channels - 1
%ifidn %3, s16p
    %assign is_s16 1
%else
    %assign is_s16 0
%endif

; determine how many matrix elements must go on the stack vs. mmregs
%assign matrix_elements in_channels * out_channels
%if is_s16
    %if stereo
        %assign needed_mmregs 7
    %else
        %assign needed_mmregs 5
    %endif
%else
    %if stereo
        %assign needed_mmregs 4
    %else
        %assign needed_mmregs 3
    %endif
%endif
%assign matrix_elements_mm num_mmregs - needed_mmregs
%if matrix_elements < matrix_elements_mm
    %assign matrix_elements_mm matrix_elements
%endif
%if matrix_elements_mm < matrix_elements
    %assign matrix_elements_stack matrix_elements - matrix_elements_mm
%else
    %assign matrix_elements_stack 0
%endif
%assign matrix_stack_size matrix_elements_stack * mmsize

%assign needed_stack_size -1 * matrix_stack_size
%if ARCH_X86_32 && in_channels >= 7
%assign needed_stack_size needed_stack_size - 16
%endif

cglobal mix_%1_to_%2_%3_flt, 3,in_channels+2,needed_mmregs+matrix_elements_mm, needed_stack_size, src0, src1, len, src2, src3, src4, src5, src6, src7

; define src pointers on stack if needed
%if matrix_elements_stack > 0 && ARCH_X86_32 && in_channels >= 7
    %define src5m [rsp+matrix_stack_size+0]
    %define src6m [rsp+matrix_stack_size+4]
    %define src7m [rsp+matrix_stack_size+8]
%endif

; load matrix pointers
%define matrix0q r1q
%define matrix1q r3q
%if stereo
    mov  matrix1q, [matrix0q+gprsize]
%endif
    mov  matrix0q, [matrix0q]

; define matrix coeff names
%assign %%i 0
%assign %%j needed_mmregs
%rep in_channels
    %if %%i >= matrix_elements_mm
        CAT_XDEFINE mx_stack_0_, %%i, 1
        CAT_XDEFINE mx_0_, %%i, [rsp+(%%i-matrix_elements_mm)*mmsize]
    %else
        CAT_XDEFINE mx_stack_0_, %%i, 0
        CAT_XDEFINE mx_0_, %%i, m %+ %%j
        %assign %%j %%j+1
    %endif
    %assign %%i %%i+1
%endrep
%if stereo
%assign %%i 0
%rep in_channels
    %if in_channels + %%i >= matrix_elements_mm
        CAT_XDEFINE mx_stack_1_, %%i, 1
        CAT_XDEFINE mx_1_, %%i, [rsp+(in_channels+%%i-matrix_elements_mm)*mmsize]
    %else
        CAT_XDEFINE mx_stack_1_, %%i, 0
        CAT_XDEFINE mx_1_, %%i, m %+ %%j
        %assign %%j %%j+1
    %endif
    %assign %%i %%i+1
%endrep
%endif

; load/splat matrix coeffs
%assign %%i 0
%rep in_channels
    %if mx_stack_0_ %+ %%i
        VBROADCASTSS m0, [matrix0q+4*%%i]
        mova  mx_0_ %+ %%i, m0
    %else
        VBROADCASTSS mx_0_ %+ %%i, [matrix0q+4*%%i]
    %endif
    %if stereo
        %if mx_stack_1_ %+ %%i
            VBROADCASTSS m0, [matrix1q+4*%%i]
            mova  mx_1_ %+ %%i, m0
        %else
            VBROADCASTSS mx_1_ %+ %%i, [matrix1q+4*%%i]
        %endif
    %endif
    %assign %%i %%i+1
%endrep

; load channel pointers to registers as offsets from the first channel pointer
%if ARCH_X86_64
    movsxd   lenq, r2d
%endif
    shl      lenq, 2-is_s16
%assign %%i 1
%rep (in_channels - 1)
    %if ARCH_X86_32 && in_channels >= 7 && %%i >= 5
    mov     src5q, [src0q+%%i*gprsize]
    add     src5q, lenq
    mov     src %+ %%i %+ m, src5q
    %else
    mov     src %+ %%i %+ q, [src0q+%%i*gprsize]
    add     src %+ %%i %+ q, lenq
    %endif
    %assign %%i %%i+1
%endrep
    mov     src0q, [src0q]
    add     src0q, lenq
    neg      lenq
.loop:
; for x86-32 with 7-8 channels we do not have enough gp registers for all src
; pointers, so we have to load some of them from the stack each time
%define copy_src_from_stack ARCH_X86_32 && in_channels >= 7 && %%i >= 5
%if is_s16
    ; mix with s16p input
    mova           m0, [src0q+lenq]
    S16_TO_S32_SX   0, 1
    cvtdq2ps       m0, m0
    cvtdq2ps       m1, m1
    %if stereo
    mulps          m2, m0, mx_1_0
    mulps          m3, m1, mx_1_0
    %endif
    mulps          m0, m0, mx_0_0
    mulps          m1, m1, mx_0_0
%assign %%i 1
%rep (in_channels - 1)
    %if copy_src_from_stack
        %define src_ptr src5q
    %else
        %define src_ptr src %+ %%i %+ q
    %endif
    %if stereo
    %if copy_src_from_stack
    mov       src_ptr, src %+ %%i %+ m
    %endif
    mova           m4, [src_ptr+lenq]
    S16_TO_S32_SX   4, 5
    cvtdq2ps       m4, m4
    cvtdq2ps       m5, m5
    FMULADD_PS     m2, m4, mx_1_ %+ %%i, m2, m6
    FMULADD_PS     m3, m5, mx_1_ %+ %%i, m3, m6
    FMULADD_PS     m0, m4, mx_0_ %+ %%i, m0, m4
    FMULADD_PS     m1, m5, mx_0_ %+ %%i, m1, m5
    %else
    %if copy_src_from_stack
    mov       src_ptr, src %+ %%i %+ m
    %endif
    mova           m2, [src_ptr+lenq]
    S16_TO_S32_SX   2, 3
    cvtdq2ps       m2, m2
    cvtdq2ps       m3, m3
    FMULADD_PS     m0, m2, mx_0_ %+ %%i, m0, m4
    FMULADD_PS     m1, m3, mx_0_ %+ %%i, m1, m4
    %endif
    %assign %%i %%i+1
%endrep
    %if stereo
    cvtps2dq       m2, m2
    cvtps2dq       m3, m3
    packssdw       m2, m3
    mova [src1q+lenq], m2
    %endif
    cvtps2dq       m0, m0
    cvtps2dq       m1, m1
    packssdw       m0, m1
    mova [src0q+lenq], m0
%else
    ; mix with fltp input
    %if stereo || mx_stack_0_0
    mova           m0, [src0q+lenq]
    %endif
    %if stereo
    mulps          m1, m0, mx_1_0
    %endif
    %if stereo || mx_stack_0_0
    mulps          m0, m0, mx_0_0
    %else
    mulps          m0, mx_0_0, [src0q+lenq]
    %endif
%assign %%i 1
%rep (in_channels - 1)
    %if copy_src_from_stack
        %define src_ptr src5q
    mov       src_ptr, src %+ %%i %+ m
    %else
        %define src_ptr src %+ %%i %+ q
    %endif
    ; avoid extra load for mono if matrix is in a mm register
    %if stereo || mx_stack_0_ %+ %%i
    mova           m2, [src_ptr+lenq]
    %endif
    %if stereo
    FMULADD_PS     m1, m2, mx_1_ %+ %%i, m1, m3
    %endif
    %if stereo || mx_stack_0_ %+ %%i
    FMULADD_PS     m0, m2, mx_0_ %+ %%i, m0, m2
    %else
    FMULADD_PS     m0, mx_0_ %+ %%i, [src_ptr+lenq], m0, m1
    %endif
    %assign %%i %%i+1
%endrep
    mova [src0q+lenq], m0
    %if stereo
    mova [src1q+lenq], m1
    %endif
%endif

    add          lenq, mmsize
    jl .loop
; zero ymm high halves
%if mmsize == 32
    vzeroupper
%endif
    RET
%endmacro

%macro MIX_3_8_TO_1_2_FLT_FUNCS 0
%assign %%i 3
%rep 6
    INIT_XMM sse
    MIX_3_8_TO_1_2_FLT %%i, 1, fltp
    MIX_3_8_TO_1_2_FLT %%i, 2, fltp
    INIT_XMM sse2
    MIX_3_8_TO_1_2_FLT %%i, 1, s16p
    MIX_3_8_TO_1_2_FLT %%i, 2, s16p
    INIT_XMM sse4
    MIX_3_8_TO_1_2_FLT %%i, 1, s16p
    MIX_3_8_TO_1_2_FLT %%i, 2, s16p
    ; do not use ymm AVX or FMA4 in x86-32 for 6 or more channels due to stack alignment issues
    %if HAVE_AVX_EXTERNAL
    %if ARCH_X86_64 || %%i < 6
        INIT_YMM avx
    %else
        INIT_XMM avx
    %endif
    MIX_3_8_TO_1_2_FLT %%i, 1, fltp
    MIX_3_8_TO_1_2_FLT %%i, 2, fltp
    INIT_XMM avx
    MIX_3_8_TO_1_2_FLT %%i, 1, s16p
    MIX_3_8_TO_1_2_FLT %%i, 2, s16p
    %endif
    %if HAVE_FMA4_EXTERNAL
    %if ARCH_X86_64 || %%i < 6
        INIT_YMM fma4
    %else
        INIT_XMM fma4
    %endif
    MIX_3_8_TO_1_2_FLT %%i, 1, fltp
    MIX_3_8_TO_1_2_FLT %%i, 2, fltp
    INIT_XMM fma4
    MIX_3_8_TO_1_2_FLT %%i, 1, s16p
    MIX_3_8_TO_1_2_FLT %%i, 2, s16p
    %endif
    %assign %%i %%i+1
%endrep
%endmacro

MIX_3_8_TO_1_2_FLT_FUNCS
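
; Summary (derived from the macro above): MIX_3_8_TO_1_2_FLT_FUNCS emits
; mix_3_to_1 ... mix_8_to_2 variants; fltp gets sse and, when
; HAVE_AVX_EXTERNAL/HAVE_FMA4_EXTERNAL are set, avx/fma4 (ymm, or xmm on
; x86-32 with 6+ channels); s16p gets sse2/sse4 plus xmm-only avx/fma4.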