;******************************************************************************
;* x86 optimized Format Conversion Utils
;* Copyright (c) 2008 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

;------------------------------------------------------------------------------
; INT32_CVT_SCALE_STORE8 (private helper, emits no symbol)
;
; Converts the 8 int32 values at [srcq+lenq] to float, multiplies each by the
; factor held in m0 and stores the 8 results at [dstq+lenq].
;
; In:    m0          = scale factor, already broadcast to all four lanes
;        srcq, dstq  = end-of-buffer pointers; lenq = negative byte offset
; Clobb: m1, m2 (SSE2 path); m1-m4 (plain SSE path, which uses cvtpi2ps)
; Note:  the stores use mova, and the SSE2 path uses cvtdq2ps with a memory
;        operand, so dst (and src on the SSE2 path) are expected to be
;        16-byte aligned.
;------------------------------------------------------------------------------
%macro INT32_CVT_SCALE_STORE8 0
%if cpuflag(sse2)
    cvtdq2ps  m1, [srcq+lenq   ]            ; ints 0..3 -> floats
    cvtdq2ps  m2, [srcq+lenq+16]            ; ints 4..7 -> floats
%else
    ; Plain SSE can only convert two int32 at a time into the low half of a
    ; register, so convert four pairs and merge them with movlhps.
    cvtpi2ps  m1, [srcq+lenq   ]
    cvtpi2ps  m3, [srcq+lenq+ 8]
    cvtpi2ps  m2, [srcq+lenq+16]
    cvtpi2ps  m4, [srcq+lenq+24]
    movlhps   m1, m3                        ; m1 = floats 0..3
    movlhps   m2, m4                        ; m2 = floats 4..7
%endif
    mulps     m1, m0
    mulps     m2, m0
    mova      [dstq+lenq   ], m1
    mova      [dstq+lenq+16], m2
%endmacro

;------------------------------------------------------------------------------
; EMMS_AFTER_CVTPI2PS (private helper, emits no symbol)
;
; Emits emms on the non-SSE2 path, which is the only path using cvtpi2ps.
;------------------------------------------------------------------------------
%macro EMMS_AFTER_CVTPI2PS 0
%if notcpuflag(sse2)
    ;; The documentation states that cvtpi2ps switches to MMX state even if
    ;; the source is a memory location. This is possibly an error in the
    ;; documentation, since every tested CPU disagrees with it. Use emms
    ;; anyway, since the vast majority of machines will use the SSE2 variant.
    emms
%endif
%endmacro

;------------------------------------------------------------------------------
; void ff_int32_to_float_fmul_scalar(float *dst, const int32_t *src, float mul,
;                                    int len);
;
; dst[i] = (float)src[i] * mul, for i in [0, len).
;
; Macro parameter %1 = number of XMM registers handed to cglobal.
;
; The loop is a do-while that handles 8 elements per iteration, so it always
; runs at least once: len is expected to be a positive multiple of 8.
;
; Argument handling per ABI:
;   UNIX64 : mul is a float argument, so it is not one of the three named
;            integer arguments; the code uses m0 without loading it.
;   WIN64  : mul occupies the third argument slot; SWAP 0, 2 renames the
;            registers so that m0 refers to it.
;   x86_32 : mul is loaded from its stack slot (mulm).
;------------------------------------------------------------------------------
%macro INT32_TO_FLOAT_FMUL_SCALAR 1
%if UNIX64
cglobal int32_to_float_fmul_scalar, 3, 3, %1, dst, src, len
%else
cglobal int32_to_float_fmul_scalar, 4, 4, %1, dst, src, mul, len
%endif
%if WIN64
    SWAP 0, 2
%elif ARCH_X86_32
    movss     m0, mulm
%endif
    SPLATD    m0                            ; broadcast mul to all four lanes
    shl       lend, 2                       ; element count -> byte count
    add       srcq, lenq                    ; point src and dst at the end of
    add       dstq, lenq                    ; their buffers ...
    neg       lenq                          ; ... and index upward from -bytes
.loop:
    INT32_CVT_SCALE_STORE8
    add       lenq, 32                      ; 8 elements * 4 bytes
    jl .loop                                ; signed: offset is negative until done
    EMMS_AFTER_CVTPI2PS
    RET
%endmacro

INIT_XMM sse
INT32_TO_FLOAT_FMUL_SCALAR 5
INIT_XMM sse2
INT32_TO_FLOAT_FMUL_SCALAR 3

;------------------------------------------------------------------------------
; void ff_int32_to_float_fmul_array8(FmtConvertContext *c, float *dst, const int32_t *src,
;                                    const float *mul, int len);
;
; Like the scalar variant, but each group of 8 consecutive elements uses its
; own factor: dst[i] = (float)src[i] * mul[i / 8].
;
; The c argument is not read by this implementation. The loop is a do-while
; that handles 8 elements per iteration, so len is expected to be a positive
; multiple of 8.
;------------------------------------------------------------------------------
%macro INT32_TO_FLOAT_FMUL_ARRAY8 0
cglobal int32_to_float_fmul_array8, 5, 5, 5, c, dst, src, mul, len
    shl       lend, 2                       ; element count -> byte count
    add       srcq, lenq                    ; point src and dst at the end of
    add       dstq, lenq                    ; their buffers ...
    neg       lenq                          ; ... and index upward from -bytes
.loop:
    movss     m0, [mulq]                    ; factor for this group of 8
    SPLATD    m0                            ; broadcast it to all four lanes
    INT32_CVT_SCALE_STORE8
    add       mulq, 4                       ; advance to the next factor
    add       lenq, 32                      ; 8 elements * 4 bytes
    jl .loop                                ; signed: offset is negative until done
    EMMS_AFTER_CVTPI2PS
    RET
%endmacro

INIT_XMM sse
INT32_TO_FLOAT_FMUL_ARRAY8
INIT_XMM sse2
INT32_TO_FLOAT_FMUL_ARRAY8