;******************************************************************************
;* x86 optimized Format Conversion Utils
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2012 Justin Ruggles <justin.ruggles@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"
%include "util.asm"

SECTION_RODATA 32

pf_s32_inv_scale:      times 8 dd 0x30000000
pf_s32_scale:          times 8 dd 0x4f000000
pf_s32_clip:           times 8 dd 0x4effffff
pf_s16_inv_scale:      times 4 dd 0x38000000
pf_s16_scale:          times 4 dd 0x47000000
pb_shuf_unpack_even:   db -1, -1, 0, 1, -1, -1, 2, 3, -1, -1, 8, 9, -1, -1, 10, 11
pb_shuf_unpack_odd:    db -1, -1, 4, 5, -1, -1, 6, 7, -1, -1, 12, 13, -1, -1, 14, 15
pb_interleave_words:   SHUFFLE_MASK_W 0, 4, 1, 5, 2, 6, 3, 7
pb_deinterleave_words: SHUFFLE_MASK_W 0, 2, 4, 6, 1, 3, 5, 7
pw_zero_even:          times 4 dw 0x0000, 0xffff
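; Note on the constants above: each dd value is an IEEE-754 single-precision
; bit pattern used as a float scale factor:
;   0x30000000 = 2^-31              0x4f000000 = 2^31
;   0x4effffff = largest float below 2^31 (2147483520.0)
;   0x38000000 = 2^-15              0x47000000 = 2^15 (32768.0)
; The conversions below are essentially "dst = src * scale"; where s16
; samples are first placed in the high half of a dword (an implicit <<16),
; the 2^-31 factor is used in place of 2^-15.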

SECTION .text

;------------------------------------------------------------------------------
; void ff_conv_s16_to_s32(int32_t *dst, const int16_t *src, int len);
;------------------------------------------------------------------------------

INIT_XMM sse2
cglobal conv_s16_to_s32, 3,3,3, dst, src, len
    lea        lenq, [2*lend]
    lea        dstq, [dstq+2*lenq]
    add        srcq, lenq
    neg        lenq
.loop:
    mova       m2, [srcq+lenq]
    pxor       m0, m0
    pxor       m1, m1
    punpcklwd  m0, m2
    punpckhwd  m1, m2
    mova       [dstq+2*lenq       ], m0
    mova       [dstq+2*lenq+mmsize], m1
    add        lenq, mmsize
    jl .loop
    REP_RET

;------------------------------------------------------------------------------
; void ff_conv_s16_to_flt(float *dst, const int16_t *src, int len);
;------------------------------------------------------------------------------

%macro CONV_S16_TO_FLT 0
cglobal conv_s16_to_flt, 3,3,3, dst, src, len
    lea        lenq, [2*lend]
    add        srcq, lenq
    lea        dstq, [dstq + 2*lenq]
    neg        lenq
    mova       m2, [pf_s16_inv_scale]
    ALIGN 16
.loop:
    mova       m0, [srcq+lenq]
    S16_TO_S32_SX 0, 1
    cvtdq2ps   m0, m0
    cvtdq2ps   m1, m1
    mulps      m0, m2
    mulps      m1, m2
    mova       [dstq+2*lenq       ], m0
    mova       [dstq+2*lenq+mmsize], m1
    add        lenq, mmsize
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_S16_TO_FLT
INIT_XMM sse4
CONV_S16_TO_FLT

;------------------------------------------------------------------------------
; void ff_conv_s32_to_s16(int16_t *dst, const int32_t *src, int len);
;------------------------------------------------------------------------------

%macro CONV_S32_TO_S16 0
cglobal conv_s32_to_s16, 3,3,4, dst, src, len
    lea        lenq, [2*lend]
    lea        srcq, [srcq+2*lenq]
    add        dstq, lenq
    neg        lenq
.loop:
    mova       m0, [srcq+2*lenq         ]
    mova       m1, [srcq+2*lenq+  mmsize]
    mova       m2, [srcq+2*lenq+2*mmsize]
    mova       m3, [srcq+2*lenq+3*mmsize]
    psrad      m0, 16
    psrad      m1, 16
    psrad      m2, 16
    psrad      m3, 16
    packssdw   m0, m1
    packssdw   m2, m3
    mova       [dstq+lenq       ], m0
    mova       [dstq+lenq+mmsize], m2
    add        lenq, mmsize*2
    jl .loop
%if mmsize == 8
    emms
    RET
%else
    REP_RET
%endif
%endmacro

INIT_MMX mmx
CONV_S32_TO_S16
INIT_XMM sse2
CONV_S32_TO_S16

;------------------------------------------------------------------------------
; void ff_conv_s32_to_flt(float *dst, const int32_t *src, int len);
;------------------------------------------------------------------------------

%macro CONV_S32_TO_FLT 0
cglobal conv_s32_to_flt, 3,3,3, dst, src, len
    lea        lenq, [4*lend]
    add        srcq, lenq
    add        dstq, lenq
    neg        lenq
    mova       m0, [pf_s32_inv_scale]
    ALIGN 16
.loop:
    cvtdq2ps   m1, [srcq+lenq       ]
    cvtdq2ps   m2, [srcq+lenq+mmsize]
    mulps      m1, m1, m0
    mulps      m2, m2, m0
    mova       [dstq+lenq       ], m1
    mova       [dstq+lenq+mmsize], m2
    add        lenq, mmsize*2
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_S32_TO_FLT
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
CONV_S32_TO_FLT
%endif

;------------------------------------------------------------------------------
; void ff_conv_flt_to_s16(int16_t *dst, const float *src, int len);
;------------------------------------------------------------------------------

INIT_XMM sse2
cglobal conv_flt_to_s16, 3,3,5, dst, src, len
    lea        lenq, [2*lend]
    lea        srcq, [srcq+2*lenq]
    add        dstq, lenq
    neg        lenq
    mova       m4, [pf_s16_scale]
.loop:
    mova       m0, [srcq+2*lenq         ]
    mova       m1, [srcq+2*lenq+1*mmsize]
    mova       m2, [srcq+2*lenq+2*mmsize]
    mova       m3, [srcq+2*lenq+3*mmsize]
    mulps      m0, m4
    mulps      m1, m4
    mulps      m2, m4
    mulps      m3, m4
    cvtps2dq   m0, m0
    cvtps2dq   m1, m1
    cvtps2dq   m2, m2
    cvtps2dq   m3, m3
    packssdw   m0, m1
    packssdw   m2, m3
    mova       [dstq+lenq       ], m0
    mova       [dstq+lenq+mmsize], m2
    add        lenq, mmsize*2
    jl .loop
    REP_RET

;------------------------------------------------------------------------------
; void ff_conv_flt_to_s32(int32_t *dst, const float *src, int len);
;------------------------------------------------------------------------------

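; Note on the clip below: after scaling by 2^31, samples at or above +1.0
; would convert to values >= 2^31, which cvtps2dq turns into the integer
; indefinite value 0x80000000 (INT32_MIN). Clamping with pf_s32_clip (the
; largest float below 2^31) keeps such samples at the positive limit; the
; negative side needs no clamp since 0x80000000 is already the correct
; saturated result there. A rough scalar equivalent (illustrative only):
;     dst[i] = (int32_t)fminf(src[i] * 2147483648.0f, 2147483520.0f);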
%macro CONV_FLT_TO_S32 0
cglobal conv_flt_to_s32, 3,3,6, dst, src, len
    lea        lenq, [lend*4]
    add        srcq, lenq
    add        dstq, lenq
    neg        lenq
    mova       m4, [pf_s32_scale]
    mova       m5, [pf_s32_clip]
.loop:
    mulps      m0, m4, [srcq+lenq         ]
    mulps      m1, m4, [srcq+lenq+1*mmsize]
    mulps      m2, m4, [srcq+lenq+2*mmsize]
    mulps      m3, m4, [srcq+lenq+3*mmsize]
    minps      m0, m0, m5
    minps      m1, m1, m5
    minps      m2, m2, m5
    minps      m3, m3, m5
    cvtps2dq   m0, m0
    cvtps2dq   m1, m1
    cvtps2dq   m2, m2
    cvtps2dq   m3, m3
    mova       [dstq+lenq         ], m0
    mova       [dstq+lenq+1*mmsize], m1
    mova       [dstq+lenq+2*mmsize], m2
    mova       [dstq+lenq+3*mmsize], m3
    add        lenq, mmsize*4
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_FLT_TO_S32
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
CONV_FLT_TO_S32
%endif

;------------------------------------------------------------------------------
; void ff_conv_s16p_to_s16_2ch(int16_t *dst, int16_t *const *src, int len,
;                              int channels);
;------------------------------------------------------------------------------

%macro CONV_S16P_TO_S16_2CH 0
cglobal conv_s16p_to_s16_2ch, 3,4,5, dst, src0, len, src1
    mov        src1q, [src0q+gprsize]
    mov        src0q, [src0q        ]
    lea        lenq, [2*lend]
    add        src0q, lenq
    add        src1q, lenq
    lea        dstq, [dstq+2*lenq]
    neg        lenq
.loop:
    mova       m0, [src0q+lenq       ]
    mova       m1, [src1q+lenq       ]
    mova       m2, [src0q+lenq+mmsize]
    mova       m3, [src1q+lenq+mmsize]
    SBUTTERFLY2 wd, 0, 1, 4
    SBUTTERFLY2 wd, 2, 3, 4
    mova       [dstq+2*lenq+0*mmsize], m0
    mova       [dstq+2*lenq+1*mmsize], m1
    mova       [dstq+2*lenq+2*mmsize], m2
    mova       [dstq+2*lenq+3*mmsize], m3
    add        lenq, 2*mmsize
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_S16P_TO_S16_2CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_S16P_TO_S16_2CH
%endif

;------------------------------------------------------------------------------
; void ff_conv_s16p_to_s16_6ch(int16_t *dst, int16_t *const *src, int len,
;                              int channels);
;------------------------------------------------------------------------------

;------------------------------------------------------------------------------
; NOTE: In the 6-channel functions, len could be used as an index on x86-64
;       instead of just a counter, which would avoid incrementing the
;       pointers, but the extra complexity and amount of code is not worth
;       the small gain. On x86-32 there are not enough registers to use len
;       as an index without keeping two of the pointers on the stack and
;       loading them in each iteration.
;------------------------------------------------------------------------------

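; In all of the 6-channel functions the secondary channel pointers are
; converted to offsets from the first channel (sub srcNq, src0q), so the
; loop only has to advance a single pointer and the other channels are
; addressed as [src0q+srcNq].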
%macro CONV_S16P_TO_S16_6CH 0
%if ARCH_X86_64
cglobal conv_s16p_to_s16_6ch, 3,8,7, dst, src0, len, src1, src2, src3, src4, src5
%else
cglobal conv_s16p_to_s16_6ch, 2,7,7, dst, src0, src1, src2, src3, src4, src5
%define lend dword r2m
%endif
    mov        src1q, [src0q+1*gprsize]
    mov        src2q, [src0q+2*gprsize]
    mov        src3q, [src0q+3*gprsize]
    mov        src4q, [src0q+4*gprsize]
    mov        src5q, [src0q+5*gprsize]
    mov        src0q, [src0q]
    sub        src1q, src0q
    sub        src2q, src0q
    sub        src3q, src0q
    sub        src4q, src0q
    sub        src5q, src0q
.loop:
%if cpuflag(sse2slow)
    movq       m0, [src0q      ]   ; m0 = 0, 6, 12, 18, x, x, x, x
    movq       m1, [src0q+src1q]   ; m1 = 1, 7, 13, 19, x, x, x, x
    movq       m2, [src0q+src2q]   ; m2 = 2, 8, 14, 20, x, x, x, x
    movq       m3, [src0q+src3q]   ; m3 = 3, 9, 15, 21, x, x, x, x
    movq       m4, [src0q+src4q]   ; m4 = 4, 10, 16, 22, x, x, x, x
    movq       m5, [src0q+src5q]   ; m5 = 5, 11, 17, 23, x, x, x, x
                                   ; unpack words:
    punpcklwd  m0, m1              ; m0 = 0, 1, 6, 7, 12, 13, 18, 19
    punpcklwd  m2, m3              ; m2 = 4, 5, 10, 11, 16, 17, 22, 23
    punpcklwd  m4, m5              ; m4 = 2, 3, 8, 9, 14, 15, 20, 21
                                   ; blend dwords
    shufps     m1, m0, m2, q2020   ; m1 = 0, 1, 12, 13, 2, 3, 14, 15
    shufps     m0, m4, q2031       ; m0 = 6, 7, 18, 19, 4, 5, 16, 17
    shufps     m2, m4, q3131       ; m2 = 8, 9, 20, 21, 10, 11, 22, 23
                                   ; shuffle dwords
    pshufd     m0, m0, q1302       ; m0 = 4, 5, 6, 7, 16, 17, 18, 19
    pshufd     m1, m1, q3120       ; m1 = 0, 1, 2, 3, 12, 13, 14, 15
    pshufd     m2, m2, q3120       ; m2 = 8, 9, 10, 11, 20, 21, 22, 23
    movq       [dstq+0*mmsize/2], m1
    movq       [dstq+1*mmsize/2], m0
    movq       [dstq+2*mmsize/2], m2
    movhps     [dstq+3*mmsize/2], m1
    movhps     [dstq+4*mmsize/2], m0
    movhps     [dstq+5*mmsize/2], m2
    add        src0q, mmsize/2
    add        dstq, mmsize*3
    sub        lend, mmsize/4
%else
    mova       m0, [src0q      ]   ; m0 = 0, 6, 12, 18, 24, 30, 36, 42
    mova       m1, [src0q+src1q]   ; m1 = 1, 7, 13, 19, 25, 31, 37, 43
    mova       m2, [src0q+src2q]   ; m2 = 2, 8, 14, 20, 26, 32, 38, 44
    mova       m3, [src0q+src3q]   ; m3 = 3, 9, 15, 21, 27, 33, 39, 45
    mova       m4, [src0q+src4q]   ; m4 = 4, 10, 16, 22, 28, 34, 40, 46
    mova       m5, [src0q+src5q]   ; m5 = 5, 11, 17, 23, 29, 35, 41, 47
                                   ; unpack words:
    SBUTTERFLY2 wd, 0, 1, 6        ; m0 = 0, 1, 6, 7, 12, 13, 18, 19
                                   ; m1 = 24, 25, 30, 31, 36, 37, 42, 43
    SBUTTERFLY2 wd, 2, 3, 6        ; m2 = 2, 3, 8, 9, 14, 15, 20, 21
                                   ; m3 = 26, 27, 32, 33, 38, 39, 44, 45
    SBUTTERFLY2 wd, 4, 5, 6        ; m4 = 4, 5, 10, 11, 16, 17, 22, 23
                                   ; m5 = 28, 29, 34, 35, 40, 41, 46, 47
                                   ; blend dwords
    shufps     m6, m0, m2, q2020   ; m6 = 0, 1, 12, 13, 2, 3, 14, 15
    shufps     m0, m4, q2031       ; m0 = 6, 7, 18, 19, 4, 5, 16, 17
    shufps     m2, m4, q3131       ; m2 = 8, 9, 20, 21, 10, 11, 22, 23
    SWAP 4,6                       ; m4 = 0, 1, 12, 13, 2, 3, 14, 15
    shufps     m6, m1, m3, q2020   ; m6 = 24, 25, 36, 37, 26, 27, 38, 39
    shufps     m1, m5, q2031       ; m1 = 30, 31, 42, 43, 28, 29, 40, 41
    shufps     m3, m5, q3131       ; m3 = 32, 33, 44, 45, 34, 35, 46, 47
    SWAP 5,6                       ; m5 = 24, 25, 36, 37, 26, 27, 38, 39
                                   ; shuffle dwords
    pshufd     m0, m0, q1302       ; m0 = 4, 5, 6, 7, 16, 17, 18, 19
    pshufd     m2, m2, q3120       ; m2 = 8, 9, 10, 11, 20, 21, 22, 23
    pshufd     m4, m4, q3120       ; m4 = 0, 1, 2, 3, 12, 13, 14, 15
    pshufd     m1, m1, q1302       ; m1 = 28, 29, 30, 31, 40, 41, 42, 43
    pshufd     m3, m3, q3120       ; m3 = 32, 33, 34, 35, 44, 45, 46, 47
    pshufd     m5, m5, q3120       ; m5 = 24, 25, 26, 27, 36, 37, 38, 39
                                   ; shuffle qwords
    punpcklqdq m6, m4, m0          ; m6 = 0, 1, 2, 3, 4, 5, 6, 7
    punpckhqdq m0, m2              ; m0 = 16, 17, 18, 19, 20, 21, 22, 23
    shufps     m2, m4, q3210       ; m2 = 8, 9, 10, 11, 12, 13, 14, 15
    SWAP 4,6                       ; m4 = 0, 1, 2, 3, 4, 5, 6, 7
    punpcklqdq m6, m5, m1          ; m6 = 24, 25, 26, 27, 28, 29, 30, 31
    punpckhqdq m1, m3              ; m1 = 40, 41, 42, 43, 44, 45, 46, 47
    shufps     m3, m5, q3210       ; m3 = 32, 33, 34, 35, 36, 37, 38, 39
    SWAP 5,6                       ; m5 = 24, 25, 26, 27, 28, 29, 30, 31
    mova       [dstq+0*mmsize], m4
    mova       [dstq+1*mmsize], m2
    mova       [dstq+2*mmsize], m0
    mova       [dstq+3*mmsize], m5
    mova       [dstq+4*mmsize], m3
    mova       [dstq+5*mmsize], m1
    add        src0q, mmsize
    add        dstq, mmsize*6
    sub        lend, mmsize/2
%endif
    jg .loop
    REP_RET
%endmacro

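; The sse2slow variant below is meant for CPUs that support SSE2 but execute
; full-width SSE2 operations slowly; it loads 64-bit halves with movq and
; interleaves only 4 samples per channel per iteration, while the plain
; sse2/avx path works on full 128-bit registers (8 samples per channel).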
INIT_XMM sse2
CONV_S16P_TO_S16_6CH
INIT_XMM sse2slow
CONV_S16P_TO_S16_6CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_S16P_TO_S16_6CH
%endif

;------------------------------------------------------------------------------
; void ff_conv_s16p_to_flt_2ch(float *dst, int16_t *const *src, int len,
;                              int channels);
;------------------------------------------------------------------------------

%macro CONV_S16P_TO_FLT_2CH 0
cglobal conv_s16p_to_flt_2ch, 3,4,6, dst, src0, len, src1
    lea        lenq, [2*lend]
    mov        src1q, [src0q+gprsize]
    mov        src0q, [src0q        ]
    lea        dstq, [dstq+4*lenq]
    add        src0q, lenq
    add        src1q, lenq
    neg        lenq
    mova       m5, [pf_s32_inv_scale]
.loop:
    mova       m2, [src0q+lenq]    ; m2 = 0, 2, 4, 6, 8, 10, 12, 14
    mova       m4, [src1q+lenq]    ; m4 = 1, 3, 5, 7, 9, 11, 13, 15
    SBUTTERFLY2 wd, 2, 4, 3        ; m2 = 0, 1, 2, 3, 4, 5, 6, 7
                                   ; m4 = 8, 9, 10, 11, 12, 13, 14, 15
    pxor       m3, m3
    punpcklwd  m0, m3, m2          ; m0 = 0, 1, 2, 3
    punpckhwd  m1, m3, m2          ; m1 = 4, 5, 6, 7
    punpcklwd  m2, m3, m4          ; m2 = 8, 9, 10, 11
    punpckhwd  m3, m4              ; m3 = 12, 13, 14, 15
    cvtdq2ps   m0, m0
    cvtdq2ps   m1, m1
    cvtdq2ps   m2, m2
    cvtdq2ps   m3, m3
    mulps      m0, m5
    mulps      m1, m5
    mulps      m2, m5
    mulps      m3, m5
    mova       [dstq+4*lenq         ], m0
    mova       [dstq+4*lenq+  mmsize], m1
    mova       [dstq+4*lenq+2*mmsize], m2
    mova       [dstq+4*lenq+3*mmsize], m3
    add        lenq, mmsize
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_S16P_TO_FLT_2CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_S16P_TO_FLT_2CH
%endif

;------------------------------------------------------------------------------
; void ff_conv_s16p_to_flt_6ch(float *dst, int16_t *const *src, int len,
;                              int channels);
;------------------------------------------------------------------------------

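; In the ssse3 path below, pb_shuf_unpack_even/odd perform the s16->s32 step
; directly: the -1 bytes in the masks zero the low half of each dword while
; the selected source word lands in the high half, matching the implicit <<16
; placement assumed by the pf_s32_inv_scale (2^-31) factor.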
%macro CONV_S16P_TO_FLT_6CH 0
%if ARCH_X86_64
cglobal conv_s16p_to_flt_6ch, 3,8,8, dst, src, len, src1, src2, src3, src4, src5
%else
cglobal conv_s16p_to_flt_6ch, 2,7,8, dst, src, src1, src2, src3, src4, src5
%define lend dword r2m
%endif
    mov        src1q, [srcq+1*gprsize]
    mov        src2q, [srcq+2*gprsize]
    mov        src3q, [srcq+3*gprsize]
    mov        src4q, [srcq+4*gprsize]
    mov        src5q, [srcq+5*gprsize]
    mov        srcq, [srcq]
    sub        src1q, srcq
    sub        src2q, srcq
    sub        src3q, srcq
    sub        src4q, srcq
    sub        src5q, srcq
    mova       m7, [pf_s32_inv_scale]
%if cpuflag(ssse3)
    %define unpack_even m6
    mova       m6, [pb_shuf_unpack_even]
%if ARCH_X86_64
    %define unpack_odd m8
    mova       m8, [pb_shuf_unpack_odd]
%else
    %define unpack_odd [pb_shuf_unpack_odd]
%endif
%endif
.loop:
    movq       m0, [srcq      ]    ; m0 = 0, 6, 12, 18, x, x, x, x
    movq       m1, [srcq+src1q]    ; m1 = 1, 7, 13, 19, x, x, x, x
    movq       m2, [srcq+src2q]    ; m2 = 2, 8, 14, 20, x, x, x, x
    movq       m3, [srcq+src3q]    ; m3 = 3, 9, 15, 21, x, x, x, x
    movq       m4, [srcq+src4q]    ; m4 = 4, 10, 16, 22, x, x, x, x
    movq       m5, [srcq+src5q]    ; m5 = 5, 11, 17, 23, x, x, x, x
                                   ; unpack words:
    punpcklwd  m0, m1              ; m0 = 0, 1, 6, 7, 12, 13, 18, 19
    punpcklwd  m2, m3              ; m2 = 2, 3, 8, 9, 14, 15, 20, 21
    punpcklwd  m4, m5              ; m4 = 4, 5, 10, 11, 16, 17, 22, 23
                                   ; blend dwords
    shufps     m1, m4, m0, q3120   ; m1 = 4, 5, 16, 17, 6, 7, 18, 19
    shufps     m0, m2, q2020       ; m0 = 0, 1, 12, 13, 2, 3, 14, 15
    shufps     m2, m4, q3131       ; m2 = 8, 9, 20, 21, 10, 11, 22, 23
%if cpuflag(ssse3)
    pshufb     m3, m0, unpack_odd  ; m3 = 12, 13, 14, 15
    pshufb     m0, unpack_even     ; m0 = 0, 1, 2, 3
    pshufb     m4, m1, unpack_odd  ; m4 = 16, 17, 18, 19
    pshufb     m1, unpack_even     ; m1 = 4, 5, 6, 7
    pshufb     m5, m2, unpack_odd  ; m5 = 20, 21, 22, 23
    pshufb     m2, unpack_even     ; m2 = 8, 9, 10, 11
%else
                                   ; shuffle dwords
    pshufd     m0, m0, q3120       ; m0 = 0, 1, 2, 3, 12, 13, 14, 15
    pshufd     m1, m1, q3120       ; m1 = 4, 5, 6, 7, 16, 17, 18, 19
    pshufd     m2, m2, q3120       ; m2 = 8, 9, 10, 11, 20, 21, 22, 23
    pxor       m6, m6              ; convert s16 in m0-m2 to s32 in m0-m5
    punpcklwd  m3, m6, m0          ; m3 = 0, 1, 2, 3
    punpckhwd  m4, m6, m0          ; m4 = 12, 13, 14, 15
    punpcklwd  m0, m6, m1          ; m0 = 4, 5, 6, 7
    punpckhwd  m5, m6, m1          ; m5 = 16, 17, 18, 19
    punpcklwd  m1, m6, m2          ; m1 = 8, 9, 10, 11
    punpckhwd  m6, m2              ; m6 = 20, 21, 22, 23
    SWAP 6,2,1,0,3,4,5             ; swap registers 3,0,1,4,5,6 to 0,1,2,3,4,5
%endif
    cvtdq2ps   m0, m0              ; convert s32 to float
    cvtdq2ps   m1, m1
    cvtdq2ps   m2, m2
    cvtdq2ps   m3, m3
    cvtdq2ps   m4, m4
    cvtdq2ps   m5, m5
    mulps      m0, m7              ; scale float from s32 range to [-1.0,1.0]
    mulps      m1, m7
    mulps      m2, m7
    mulps      m3, m7
    mulps      m4, m7
    mulps      m5, m7
    mova       [dstq         ], m0
    mova       [dstq+  mmsize], m1
    mova       [dstq+2*mmsize], m2
    mova       [dstq+3*mmsize], m3
    mova       [dstq+4*mmsize], m4
    mova       [dstq+5*mmsize], m5
    add        srcq, mmsize/2
    add        dstq, mmsize*6
    sub        lend, mmsize/4
    jg .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_S16P_TO_FLT_6CH
INIT_XMM ssse3
CONV_S16P_TO_FLT_6CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_S16P_TO_FLT_6CH
%endif

;------------------------------------------------------------------------------
; void ff_conv_fltp_to_s16_2ch(int16_t *dst, float *const *src, int len,
;                              int channels);
;------------------------------------------------------------------------------

%macro CONV_FLTP_TO_S16_2CH 0
cglobal conv_fltp_to_s16_2ch, 3,4,3, dst, src0, len, src1
    lea        lenq, [4*lend]
    mov        src1q, [src0q+gprsize]
    mov        src0q, [src0q        ]
    add        dstq, lenq
    add        src0q, lenq
    add        src1q, lenq
    neg        lenq
    mova       m2, [pf_s16_scale]
%if cpuflag(ssse3)
    mova       m3, [pb_interleave_words]
%endif
.loop:
    mulps      m0, m2, [src0q+lenq] ; m0 = 0, 2, 4, 6
    mulps      m1, m2, [src1q+lenq] ; m1 = 1, 3, 5, 7
    cvtps2dq   m0, m0
    cvtps2dq   m1, m1
%if cpuflag(ssse3)
    packssdw   m0, m1              ; m0 = 0, 2, 4, 6, 1, 3, 5, 7
    pshufb     m0, m3              ; m0 = 0, 1, 2, 3, 4, 5, 6, 7
%else
    packssdw   m0, m0              ; m0 = 0, 2, 4, 6, x, x, x, x
    packssdw   m1, m1              ; m1 = 1, 3, 5, 7, x, x, x, x
    punpcklwd  m0, m1              ; m0 = 0, 1, 2, 3, 4, 5, 6, 7
%endif
    mova       [dstq+lenq], m0
    add        lenq, mmsize
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_FLTP_TO_S16_2CH
INIT_XMM ssse3
CONV_FLTP_TO_S16_2CH

;------------------------------------------------------------------------------
; void ff_conv_fltp_to_s16_6ch(int16_t *dst, float *const *src, int len,
;                              int channels);
;------------------------------------------------------------------------------

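; The sse version of the 6-channel function below keeps the pre-SSE2 integer
; work in MMX registers: cvtps2pi converts the scaled floats to packed int32
; in mm0-mm5, which is why that variant (mmsize == 8) ends with emms before
; returning.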
%macro CONV_FLTP_TO_S16_6CH 0
%if ARCH_X86_64
cglobal conv_fltp_to_s16_6ch, 3,8,7, dst, src, len, src1, src2, src3, src4, src5
%else
cglobal conv_fltp_to_s16_6ch, 2,7,7, dst, src, src1, src2, src3, src4, src5
%define lend dword r2m
%endif
    mov        src1q, [srcq+1*gprsize]
    mov        src2q, [srcq+2*gprsize]
    mov        src3q, [srcq+3*gprsize]
    mov        src4q, [srcq+4*gprsize]
    mov        src5q, [srcq+5*gprsize]
    mov        srcq, [srcq]
    sub        src1q, srcq
    sub        src2q, srcq
    sub        src3q, srcq
    sub        src4q, srcq
    sub        src5q, srcq
    movaps     xmm6, [pf_s16_scale]
.loop:
%if cpuflag(sse2)
    mulps      m0, m6, [srcq      ]
    mulps      m1, m6, [srcq+src1q]
    mulps      m2, m6, [srcq+src2q]
    mulps      m3, m6, [srcq+src3q]
    mulps      m4, m6, [srcq+src4q]
    mulps      m5, m6, [srcq+src5q]
    cvtps2dq   m0, m0
    cvtps2dq   m1, m1
    cvtps2dq   m2, m2
    cvtps2dq   m3, m3
    cvtps2dq   m4, m4
    cvtps2dq   m5, m5
    packssdw   m0, m3              ; m0 = 0, 6, 12, 18, 3, 9, 15, 21
    packssdw   m1, m4              ; m1 = 1, 7, 13, 19, 4, 10, 16, 22
    packssdw   m2, m5              ; m2 = 2, 8, 14, 20, 5, 11, 17, 23
                                   ; unpack words:
    movhlps    m3, m0              ; m3 = 3, 9, 15, 21, x, x, x, x
    punpcklwd  m0, m1              ; m0 = 0, 1, 6, 7, 12, 13, 18, 19
    punpckhwd  m1, m2              ; m1 = 4, 5, 10, 11, 16, 17, 22, 23
    punpcklwd  m2, m3              ; m2 = 2, 3, 8, 9, 14, 15, 20, 21
                                   ; blend dwords:
    shufps     m3, m0, m2, q2020   ; m3 = 0, 1, 12, 13, 2, 3, 14, 15
    shufps     m0, m1, q2031       ; m0 = 6, 7, 18, 19, 4, 5, 16, 17
    shufps     m2, m1, q3131       ; m2 = 8, 9, 20, 21, 10, 11, 22, 23
                                   ; shuffle dwords:
    shufps     m1, m2, m3, q3120   ; m1 = 8, 9, 10, 11, 12, 13, 14, 15
    shufps     m3, m0, q0220       ; m3 = 0, 1, 2, 3, 4, 5, 6, 7
    shufps     m0, m2, q3113       ; m0 = 16, 17, 18, 19, 20, 21, 22, 23
    mova       [dstq+0*mmsize], m3
    mova       [dstq+1*mmsize], m1
    mova       [dstq+2*mmsize], m0
%else ; sse
    movlps     xmm0, [srcq      ]
    movlps     xmm1, [srcq+src1q]
    movlps     xmm2, [srcq+src2q]
    movlps     xmm3, [srcq+src3q]
    movlps     xmm4, [srcq+src4q]
    movlps     xmm5, [srcq+src5q]
    mulps      xmm0, xmm6
    mulps      xmm1, xmm6
    mulps      xmm2, xmm6
    mulps      xmm3, xmm6
    mulps      xmm4, xmm6
    mulps      xmm5, xmm6
    cvtps2pi   mm0, xmm0
    cvtps2pi   mm1, xmm1
    cvtps2pi   mm2, xmm2
    cvtps2pi   mm3, xmm3
    cvtps2pi   mm4, xmm4
    cvtps2pi   mm5, xmm5
    packssdw   mm0, mm3            ; m0 = 0, 6, 3, 9
    packssdw   mm1, mm4            ; m1 = 1, 7, 4, 10
    packssdw   mm2, mm5            ; m2 = 2, 8, 5, 11
                                   ; unpack words
    pshufw     mm3, mm0, q1032     ; m3 = 3, 9, 0, 6
    punpcklwd  mm0, mm1            ; m0 = 0, 1, 6, 7
    punpckhwd  mm1, mm2            ; m1 = 4, 5, 10, 11
    punpcklwd  mm2, mm3            ; m2 = 2, 3, 8, 9
                                   ; unpack dwords
    pshufw     mm3, mm0, q1032     ; m3 = 6, 7, 0, 1
    punpckldq  mm0, mm2            ; m0 = 0, 1, 2, 3 (final)
    punpckhdq  mm2, mm1            ; m2 = 8, 9, 10, 11 (final)
    punpckldq  mm1, mm3            ; m1 = 4, 5, 6, 7 (final)
    mova       [dstq+0*mmsize], mm0
    mova       [dstq+1*mmsize], mm1
    mova       [dstq+2*mmsize], mm2
%endif
    add        srcq, mmsize
    add        dstq, mmsize*3
    sub        lend, mmsize/4
    jg .loop
%if mmsize == 8
    emms
    RET
%else
    REP_RET
%endif
%endmacro

INIT_MMX sse
CONV_FLTP_TO_S16_6CH
INIT_XMM sse2
CONV_FLTP_TO_S16_6CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_FLTP_TO_S16_6CH
%endif

;------------------------------------------------------------------------------
; void ff_conv_fltp_to_flt_2ch(float *dst, float *const *src, int len,
;                              int channels);
;------------------------------------------------------------------------------

%macro CONV_FLTP_TO_FLT_2CH 0
cglobal conv_fltp_to_flt_2ch, 3,4,5, dst, src0, len, src1
    mov        src1q, [src0q+gprsize]
    mov        src0q, [src0q]
    lea        lenq, [4*lend]
    add        src0q, lenq
    add        src1q, lenq
    lea        dstq, [dstq+2*lenq]
    neg        lenq
.loop:
    mova       m0, [src0q+lenq       ]
    mova       m1, [src1q+lenq       ]
    mova       m2, [src0q+lenq+mmsize]
    mova       m3, [src1q+lenq+mmsize]
    SBUTTERFLYPS 0, 1, 4
    SBUTTERFLYPS 2, 3, 4
    mova       [dstq+2*lenq+0*mmsize], m0
    mova       [dstq+2*lenq+1*mmsize], m1
    mova       [dstq+2*lenq+2*mmsize], m2
    mova       [dstq+2*lenq+3*mmsize], m3
    add        lenq, 2*mmsize
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse
CONV_FLTP_TO_FLT_2CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_FLTP_TO_FLT_2CH
%endif

;-----------------------------------------------------------------------------
; void ff_conv_fltp_to_flt_6ch(float *dst, float *const *src, int len,
;                              int channels);
;-----------------------------------------------------------------------------

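; Of the variants below, sse4 is required for blendps (an SSE4.1 instruction)
; in the 128-bit path; the mmx variant works on 64-bit registers and only
; needs the dq unpacks.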
%macro CONV_FLTP_TO_FLT_6CH 0
cglobal conv_fltp_to_flt_6ch, 2,8,7, dst, src, src1, src2, src3, src4, src5, len
%if ARCH_X86_64
    mov        lend, r2d
%else
    %define lend dword r2m
%endif
    mov        src1q, [srcq+1*gprsize]
    mov        src2q, [srcq+2*gprsize]
    mov        src3q, [srcq+3*gprsize]
    mov        src4q, [srcq+4*gprsize]
    mov        src5q, [srcq+5*gprsize]
    mov        srcq, [srcq]
    sub        src1q, srcq
    sub        src2q, srcq
    sub        src3q, srcq
    sub        src4q, srcq
    sub        src5q, srcq
.loop:
    mova       m0, [srcq      ]
    mova       m1, [srcq+src1q]
    mova       m2, [srcq+src2q]
    mova       m3, [srcq+src3q]
    mova       m4, [srcq+src4q]
    mova       m5, [srcq+src5q]
%if cpuflag(sse4)
    SBUTTERFLYPS 0, 1, 6
    SBUTTERFLYPS 2, 3, 6
    SBUTTERFLYPS 4, 5, 6

    blendps    m6, m4, m0, 1100b
    movlhps    m0, m2
    movhlps    m4, m2
    blendps    m2, m5, m1, 1100b
    movlhps    m1, m3
    movhlps    m5, m3

    movaps     [dstq   ], m0
    movaps     [dstq+16], m6
    movaps     [dstq+32], m4
    movaps     [dstq+48], m1
    movaps     [dstq+64], m2
    movaps     [dstq+80], m5
%else ; mmx
    SBUTTERFLY dq, 0, 1, 6
    SBUTTERFLY dq, 2, 3, 6
    SBUTTERFLY dq, 4, 5, 6

    movq       [dstq   ], m0
    movq       [dstq+ 8], m2
    movq       [dstq+16], m4
    movq       [dstq+24], m1
    movq       [dstq+32], m3
    movq       [dstq+40], m5
%endif
    add        srcq, mmsize
    add        dstq, mmsize*6
    sub        lend, mmsize/4
    jg .loop
%if mmsize == 8
    emms
    RET
%else
    REP_RET
%endif
%endmacro

INIT_MMX mmx
CONV_FLTP_TO_FLT_6CH
INIT_XMM sse4
CONV_FLTP_TO_FLT_6CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_FLTP_TO_FLT_6CH
%endif

;------------------------------------------------------------------------------
; void ff_conv_s16_to_s16p_2ch(int16_t *const *dst, int16_t *src, int len,
;                              int channels);
;------------------------------------------------------------------------------

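; Interleaved-to-planar stereo is the inverse shuffle of the 2ch functions
; above: the ssse3 variant deinterleaves the words of each register with a
; single pshufb (pb_deinterleave_words), while the sse2 fallback combines
; pshuflw/pshufhw with a float-domain unpack (DEINT2_PS).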
%macro CONV_S16_TO_S16P_2CH 0
cglobal conv_s16_to_s16p_2ch, 3,4,4, dst0, src, len, dst1
    lea        lenq, [2*lend]
    mov        dst1q, [dst0q+gprsize]
    mov        dst0q, [dst0q        ]
    lea        srcq, [srcq+2*lenq]
    add        dst0q, lenq
    add        dst1q, lenq
    neg        lenq
%if cpuflag(ssse3)
    mova       m3, [pb_deinterleave_words]
%endif
.loop:
    mova       m0, [srcq+2*lenq       ] ; m0 = 0, 1, 2, 3, 4, 5, 6, 7
    mova       m1, [srcq+2*lenq+mmsize] ; m1 = 8, 9, 10, 11, 12, 13, 14, 15
%if cpuflag(ssse3)
    pshufb     m0, m3              ; m0 = 0, 2, 4, 6, 1, 3, 5, 7
    pshufb     m1, m3              ; m1 = 8, 10, 12, 14, 9, 11, 13, 15
    SBUTTERFLY2 qdq, 0, 1, 2       ; m0 = 0, 2, 4, 6, 8, 10, 12, 14
                                   ; m1 = 1, 3, 5, 7, 9, 11, 13, 15
%else ; sse2
    pshuflw    m0, m0, q3120       ; m0 = 0, 2, 1, 3, 4, 5, 6, 7
    pshufhw    m0, m0, q3120       ; m0 = 0, 2, 1, 3, 4, 6, 5, 7
    pshuflw    m1, m1, q3120       ; m1 = 8, 10, 9, 11, 12, 13, 14, 15
    pshufhw    m1, m1, q3120       ; m1 = 8, 10, 9, 11, 12, 14, 13, 15
    DEINT2_PS  0, 1, 2             ; m0 = 0, 2, 4, 6, 8, 10, 12, 14
                                   ; m1 = 1, 3, 5, 7, 9, 11, 13, 15
%endif
    mova       [dst0q+lenq], m0
    mova       [dst1q+lenq], m1
    add        lenq, mmsize
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_S16_TO_S16P_2CH
INIT_XMM ssse3
CONV_S16_TO_S16P_2CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_S16_TO_S16P_2CH
%endif

;------------------------------------------------------------------------------
; void ff_conv_s16_to_s16p_6ch(int16_t *const *dst, int16_t *src, int len,
;                              int channels);
;------------------------------------------------------------------------------

%macro CONV_S16_TO_S16P_6CH 0
%if ARCH_X86_64
cglobal conv_s16_to_s16p_6ch, 3,8,5, dst, src, len, dst1, dst2, dst3, dst4, dst5
%else
cglobal conv_s16_to_s16p_6ch, 2,7,5, dst, src, dst1, dst2, dst3, dst4, dst5
%define lend dword r2m
%endif
    mov        dst1q, [dstq+  gprsize]
    mov        dst2q, [dstq+2*gprsize]
    mov        dst3q, [dstq+3*gprsize]
    mov        dst4q, [dstq+4*gprsize]
    mov        dst5q, [dstq+5*gprsize]
    mov        dstq, [dstq          ]
    sub        dst1q, dstq
    sub        dst2q, dstq
    sub        dst3q, dstq
    sub        dst4q, dstq
    sub        dst5q, dstq
.loop:
    mova       m0, [srcq+0*mmsize]     ; m0 = 0, 1, 2, 3, 4, 5, 6, 7
    mova       m3, [srcq+1*mmsize]     ; m3 = 8, 9, 10, 11, 12, 13, 14, 15
    mova       m2, [srcq+2*mmsize]     ; m2 = 16, 17, 18, 19, 20, 21, 22, 23
    PALIGNR    m1, m3, m0, 12, m4      ; m1 = 6, 7, 8, 9, 10, 11, x, x
    shufps     m3, m2, q1032           ; m3 = 12, 13, 14, 15, 16, 17, 18, 19
    psrldq     m2, 4                   ; m2 = 18, 19, 20, 21, 22, 23, x, x
    SBUTTERFLY2 wd, 0, 1, 4            ; m0 = 0, 6, 1, 7, 2, 8, 3, 9
                                       ; m1 = 4, 10, 5, 11, x, x, x, x
    SBUTTERFLY2 wd, 3, 2, 4            ; m3 = 12, 18, 13, 19, 14, 20, 15, 21
                                       ; m2 = 16, 22, 17, 23, x, x, x, x
    SBUTTERFLY2 dq, 0, 3, 4            ; m0 = 0, 6, 12, 18, 1, 7, 13, 19
                                       ; m3 = 2, 8, 14, 20, 3, 9, 15, 21
    punpckldq  m1, m2                  ; m1 = 4, 10, 16, 22, 5, 11, 17, 23
    movq       [dstq      ], m0
    movhps     [dstq+dst1q], m0
    movq       [dstq+dst2q], m3
    movhps     [dstq+dst3q], m3
    movq       [dstq+dst4q], m1
    movhps     [dstq+dst5q], m1
    add        srcq, mmsize*3
    add        dstq, mmsize/2
    sub        lend, mmsize/4
    jg .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_S16_TO_S16P_6CH
INIT_XMM ssse3
CONV_S16_TO_S16P_6CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_S16_TO_S16P_6CH
%endif

;------------------------------------------------------------------------------
; void ff_conv_s16_to_fltp_2ch(float *const *dst, int16_t *src, int len,
;                              int channels);
;------------------------------------------------------------------------------

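; Below, the interleaved L/R words are split with a dword shift and a mask:
; pslld by 16 moves the even (left) samples into the high halves, and the
; pw_zero_even mask clears the low halves so only the odd (right) samples
; remain; both are then scaled by 2^-31. An illustrative scalar view:
;     dst0[i] = (float)(src[2*i  ] << 16) * 0x1p-31f; /* = src[2*i]   / 32768.0f */
;     dst1[i] = (float)(src[2*i+1] << 16) * 0x1p-31f; /* = src[2*i+1] / 32768.0f */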
%macro CONV_S16_TO_FLTP_2CH 0
cglobal conv_s16_to_fltp_2ch, 3,4,5, dst0, src, len, dst1
    lea        lenq, [4*lend]
    mov        dst1q, [dst0q+gprsize]
    mov        dst0q, [dst0q        ]
    add        srcq, lenq
    add        dst0q, lenq
    add        dst1q, lenq
    neg        lenq
    mova       m3, [pf_s32_inv_scale]
    mova       m4, [pw_zero_even]
.loop:
    mova       m1, [srcq+lenq]
    pslld      m0, m1, 16
    pand       m1, m4
    cvtdq2ps   m0, m0
    cvtdq2ps   m1, m1
    mulps      m0, m0, m3
    mulps      m1, m1, m3
    mova       [dst0q+lenq], m0
    mova       [dst1q+lenq], m1
    add        lenq, mmsize
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_S16_TO_FLTP_2CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_S16_TO_FLTP_2CH
%endif

;------------------------------------------------------------------------------
; void ff_conv_s16_to_fltp_6ch(float *const *dst, int16_t *src, int len,
;                              int channels);
;------------------------------------------------------------------------------

%macro CONV_S16_TO_FLTP_6CH 0
%if ARCH_X86_64
cglobal conv_s16_to_fltp_6ch, 3,8,7, dst, src, len, dst1, dst2, dst3, dst4, dst5
%else
cglobal conv_s16_to_fltp_6ch, 2,7,7, dst, src, dst1, dst2, dst3, dst4, dst5
%define lend dword r2m
%endif
    mov        dst1q, [dstq+  gprsize]
    mov        dst2q, [dstq+2*gprsize]
    mov        dst3q, [dstq+3*gprsize]
    mov        dst4q, [dstq+4*gprsize]
    mov        dst5q, [dstq+5*gprsize]
    mov        dstq, [dstq          ]
    sub        dst1q, dstq
    sub        dst2q, dstq
    sub        dst3q, dstq
    sub        dst4q, dstq
    sub        dst5q, dstq
    mova       m6, [pf_s16_inv_scale]
.loop:
    mova       m0, [srcq+0*mmsize]     ; m0 = 0, 1, 2, 3, 4, 5, 6, 7
    mova       m3, [srcq+1*mmsize]     ; m3 = 8, 9, 10, 11, 12, 13, 14, 15
    mova       m2, [srcq+2*mmsize]     ; m2 = 16, 17, 18, 19, 20, 21, 22, 23
    PALIGNR    m1, m3, m0, 12, m4      ; m1 = 6, 7, 8, 9, 10, 11, x, x
    shufps     m3, m2, q1032           ; m3 = 12, 13, 14, 15, 16, 17, 18, 19
    psrldq     m2, 4                   ; m2 = 18, 19, 20, 21, 22, 23, x, x
    SBUTTERFLY2 wd, 0, 1, 4            ; m0 = 0, 6, 1, 7, 2, 8, 3, 9
                                       ; m1 = 4, 10, 5, 11, x, x, x, x
    SBUTTERFLY2 wd, 3, 2, 4            ; m3 = 12, 18, 13, 19, 14, 20, 15, 21
                                       ; m2 = 16, 22, 17, 23, x, x, x, x
    SBUTTERFLY2 dq, 0, 3, 4            ; m0 = 0, 6, 12, 18, 1, 7, 13, 19
                                       ; m3 = 2, 8, 14, 20, 3, 9, 15, 21
    punpckldq  m1, m2                  ; m1 = 4, 10, 16, 22, 5, 11, 17, 23
    S16_TO_S32_SX 0, 2                 ; m0 = 0, 6, 12, 18
                                       ; m2 = 1, 7, 13, 19
    S16_TO_S32_SX 3, 4                 ; m3 = 2, 8, 14, 20
                                       ; m4 = 3, 9, 15, 21
    S16_TO_S32_SX 1, 5                 ; m1 = 4, 10, 16, 22
                                       ; m5 = 5, 11, 17, 23
    SWAP 1,2,3,4
    cvtdq2ps   m0, m0
    cvtdq2ps   m1, m1
    cvtdq2ps   m2, m2
    cvtdq2ps   m3, m3
    cvtdq2ps   m4, m4
    cvtdq2ps   m5, m5
    mulps      m0, m6
    mulps      m1, m6
    mulps      m2, m6
    mulps      m3, m6
    mulps      m4, m6
    mulps      m5, m6
    mova       [dstq      ], m0
    mova       [dstq+dst1q], m1
    mova       [dstq+dst2q], m2
    mova       [dstq+dst3q], m3
    mova       [dstq+dst4q], m4
    mova       [dstq+dst5q], m5
    add        srcq, mmsize*3
    add        dstq, mmsize
    sub        lend, mmsize/4
    jg .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_S16_TO_FLTP_6CH
INIT_XMM ssse3
CONV_S16_TO_FLTP_6CH
INIT_XMM sse4
CONV_S16_TO_FLTP_6CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_S16_TO_FLTP_6CH
%endif

;------------------------------------------------------------------------------
; void ff_conv_flt_to_s16p_2ch(int16_t *const *dst, float *src, int len,
;                              int channels);
;------------------------------------------------------------------------------

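; The float-to-planar-s16 functions below rely on packssdw's signed
; saturation to clamp converted samples to the s16 range, so no explicit
; clip constant (like pf_s32_clip above) is needed here.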
%macro CONV_FLT_TO_S16P_2CH 0
cglobal conv_flt_to_s16p_2ch, 3,4,6, dst0, src, len, dst1
    lea        lenq, [2*lend]
    mov        dst1q, [dst0q+gprsize]
    mov        dst0q, [dst0q        ]
    lea        srcq, [srcq+4*lenq]
    add        dst0q, lenq
    add        dst1q, lenq
    neg        lenq
    mova       m5, [pf_s16_scale]
.loop:
    mova       m0, [srcq+4*lenq         ]
    mova       m1, [srcq+4*lenq+  mmsize]
    mova       m2, [srcq+4*lenq+2*mmsize]
    mova       m3, [srcq+4*lenq+3*mmsize]
    DEINT2_PS  0, 1, 4
    DEINT2_PS  2, 3, 4
    mulps      m0, m0, m5
    mulps      m1, m1, m5
    mulps      m2, m2, m5
    mulps      m3, m3, m5
    cvtps2dq   m0, m0
    cvtps2dq   m1, m1
    cvtps2dq   m2, m2
    cvtps2dq   m3, m3
    packssdw   m0, m2
    packssdw   m1, m3
    mova       [dst0q+lenq], m0
    mova       [dst1q+lenq], m1
    add        lenq, mmsize
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_FLT_TO_S16P_2CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_FLT_TO_S16P_2CH
%endif

;------------------------------------------------------------------------------
; void ff_conv_flt_to_s16p_6ch(int16_t *const *dst, float *src, int len,
;                              int channels);
;------------------------------------------------------------------------------

%macro CONV_FLT_TO_S16P_6CH 0
%if ARCH_X86_64
cglobal conv_flt_to_s16p_6ch, 3,8,7, dst, src, len, dst1, dst2, dst3, dst4, dst5
%else
cglobal conv_flt_to_s16p_6ch, 2,7,7, dst, src, dst1, dst2, dst3, dst4, dst5
%define lend dword r2m
%endif
    mov        dst1q, [dstq+  gprsize]
    mov        dst2q, [dstq+2*gprsize]
    mov        dst3q, [dstq+3*gprsize]
    mov        dst4q, [dstq+4*gprsize]
    mov        dst5q, [dstq+5*gprsize]
    mov        dstq, [dstq          ]
    sub        dst1q, dstq
    sub        dst2q, dstq
    sub        dst3q, dstq
    sub        dst4q, dstq
    sub        dst5q, dstq
    mova       m6, [pf_s16_scale]
.loop:
    mulps      m0, m6, [srcq+0*mmsize]
    mulps      m3, m6, [srcq+1*mmsize]
    mulps      m1, m6, [srcq+2*mmsize]
    mulps      m4, m6, [srcq+3*mmsize]
    mulps      m2, m6, [srcq+4*mmsize]
    mulps      m5, m6, [srcq+5*mmsize]
    cvtps2dq   m0, m0
    cvtps2dq   m1, m1
    cvtps2dq   m2, m2
    cvtps2dq   m3, m3
    cvtps2dq   m4, m4
    cvtps2dq   m5, m5
    packssdw   m0, m3                  ; m0 = 0, 1, 2, 3, 4, 5, 6, 7
    packssdw   m1, m4                  ; m1 = 8, 9, 10, 11, 12, 13, 14, 15
    packssdw   m2, m5                  ; m2 = 16, 17, 18, 19, 20, 21, 22, 23
    PALIGNR    m3, m1, m0, 12, m4      ; m3 = 6, 7, 8, 9, 10, 11, x, x
    shufps     m1, m2, q1032           ; m1 = 12, 13, 14, 15, 16, 17, 18, 19
    psrldq     m2, 4                   ; m2 = 18, 19, 20, 21, 22, 23, x, x
    SBUTTERFLY2 wd, 0, 3, 4            ; m0 = 0, 6, 1, 7, 2, 8, 3, 9
                                       ; m3 = 4, 10, 5, 11, x, x, x, x
    SBUTTERFLY2 wd, 1, 2, 4            ; m1 = 12, 18, 13, 19, 14, 20, 15, 21
                                       ; m2 = 16, 22, 17, 23, x, x, x, x
    SBUTTERFLY2 dq, 0, 1, 4            ; m0 = 0, 6, 12, 18, 1, 7, 13, 19
                                       ; m1 = 2, 8, 14, 20, 3, 9, 15, 21
    punpckldq  m3, m2                  ; m3 = 4, 10, 16, 22, 5, 11, 17, 23
    movq       [dstq      ], m0
    movhps     [dstq+dst1q], m0
    movq       [dstq+dst2q], m1
    movhps     [dstq+dst3q], m1
    movq       [dstq+dst4q], m3
    movhps     [dstq+dst5q], m3
    add        srcq, mmsize*6
    add        dstq, mmsize/2
    sub        lend, mmsize/4
    jg .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_FLT_TO_S16P_6CH
INIT_XMM ssse3
CONV_FLT_TO_S16P_6CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_FLT_TO_S16P_6CH
%endif

;------------------------------------------------------------------------------
; void ff_conv_flt_to_fltp_2ch(float *const *dst, float *src, int len,
;                              int channels);
;------------------------------------------------------------------------------

%macro CONV_FLT_TO_FLTP_2CH 0
cglobal conv_flt_to_fltp_2ch, 3,4,3, dst0, src, len, dst1
    lea        lenq, [4*lend]
    mov        dst1q, [dst0q+gprsize]
    mov        dst0q, [dst0q        ]
    lea        srcq, [srcq+2*lenq]
    add        dst0q, lenq
    add        dst1q, lenq
    neg        lenq
.loop:
    mova       m0, [srcq+2*lenq       ]
    mova       m1, [srcq+2*lenq+mmsize]
    DEINT2_PS  0, 1, 2
    mova       [dst0q+lenq], m0
    mova       [dst1q+lenq], m1
    add        lenq, mmsize
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse
CONV_FLT_TO_FLTP_2CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_FLT_TO_FLTP_2CH
%endif

;------------------------------------------------------------------------------
; void ff_conv_flt_to_fltp_6ch(float *const *dst, float *src, int len,
;                              int channels);
;------------------------------------------------------------------------------

%macro CONV_FLT_TO_FLTP_6CH 0
%if ARCH_X86_64
cglobal conv_flt_to_fltp_6ch, 3,8,7, dst, src, len, dst1, dst2, dst3, dst4, dst5
%else
cglobal conv_flt_to_fltp_6ch, 2,7,7, dst, src, dst1, dst2, dst3, dst4, dst5
%define lend dword r2m
%endif
    mov        dst1q, [dstq+  gprsize]
    mov        dst2q, [dstq+2*gprsize]
    mov        dst3q, [dstq+3*gprsize]
    mov        dst4q, [dstq+4*gprsize]
    mov        dst5q, [dstq+5*gprsize]
    mov        dstq, [dstq          ]
    sub        dst1q, dstq
    sub        dst2q, dstq
    sub        dst3q, dstq
    sub        dst4q, dstq
    sub        dst5q, dstq
.loop:
    mova       m0, [srcq+0*mmsize]     ; m0 = 0, 1, 2, 3
    mova       m1, [srcq+1*mmsize]     ; m1 = 4, 5, 6, 7
    mova       m2, [srcq+2*mmsize]     ; m2 = 8, 9, 10, 11
    mova       m3, [srcq+3*mmsize]     ; m3 = 12, 13, 14, 15
    mova       m4, [srcq+4*mmsize]     ; m4 = 16, 17, 18, 19
    mova       m5, [srcq+5*mmsize]     ; m5 = 20, 21, 22, 23

    SBUTTERFLY2 dq, 0, 3, 6            ; m0 = 0, 12, 1, 13
                                       ; m3 = 2, 14, 3, 15
    SBUTTERFLY2 dq, 1, 4, 6            ; m1 = 4, 16, 5, 17
                                       ; m4 = 6, 18, 7, 19
    SBUTTERFLY2 dq, 2, 5, 6            ; m2 = 8, 20, 9, 21
                                       ; m5 = 10, 22, 11, 23
    SBUTTERFLY2 dq, 0, 4, 6            ; m0 = 0, 6, 12, 18
                                       ; m4 = 1, 7, 13, 19
    SBUTTERFLY2 dq, 3, 2, 6            ; m3 = 2, 8, 14, 20
                                       ; m2 = 3, 9, 15, 21
    SBUTTERFLY2 dq, 1, 5, 6            ; m1 = 4, 10, 16, 22
                                       ; m5 = 5, 11, 17, 23
    mova       [dstq      ], m0
    mova       [dstq+dst1q], m4
    mova       [dstq+dst2q], m3
    mova       [dstq+dst3q], m2
    mova       [dstq+dst4q], m1
    mova       [dstq+dst5q], m5
    add        srcq, mmsize*6
    add        dstq, mmsize
    sub        lend, mmsize/4
    jg .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_FLT_TO_FLTP_6CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_FLT_TO_FLTP_6CH
%endif