;******************************************************************************
;* optimized audio functions
;* Copyright (c) 2008 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

%macro SCALARPRODUCT 0
; int ff_scalarproduct_int16(int16_t *v1, int16_t *v2, int order)
cglobal scalarproduct_int16, 3,3,3, v1, v2, order
    add orderd, orderd       ; order: element count -> byte count (2 bytes/int16)
    add v1q, orderq          ; point v1/v2 at the end of the vectors
    add v2q, orderq
    neg orderq               ; and walk back up with a negative byte index
    pxor    m2, m2           ; m2 = dot-product accumulator
.loop:
    movu    m0, [v1q + orderq]          ; v1 loads are unaligned-safe
    movu    m1, [v1q + orderq + mmsize]
    pmaddwd m0, [v2q + orderq]          ; pairwise int16 multiply-add -> int32;
    pmaddwd m1, [v2q + orderq + mmsize] ; as a memory operand v2 must be aligned
    paddd   m2, m0
    paddd   m2, m1
    add     orderq, mmsize*2
    jl .loop
    HADDD   m2, m0           ; horizontal sum of the partial dword sums
    movd    eax, m2
%if mmsize == 8
    emms                     ; MMX aliases the x87 registers; reset the FPU state
%endif
    RET
%endmacro

INIT_MMX mmxext
SCALARPRODUCT
INIT_XMM sse2
SCALARPRODUCT


;-----------------------------------------------------------------------------
; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min,
;                           int32_t max, unsigned int len)
;-----------------------------------------------------------------------------

; %1 = number of xmm registers used
; %2 = number of inline load/process/store loops per asm loop
; %3 = process 4*mmsize (%3=0) or 8*mmsize (%3=1) bytes per loop
; %4 = CLIPD function takes min/max as float instead of int (SSE2 version)
; %5 = suffix
%macro VECTOR_CLIP_INT32 4-5
cglobal vector_clip_int32%5, 5,5,%1, dst, src, min, max, len
%if %4
    cvtsi2ss  m4, minm
    cvtsi2ss  m5, maxm
%else
    movd      m4, minm
    movd      m5, maxm
%endif
    SPLATD    m4             ; broadcast min to every lane
    SPLATD    m5             ; broadcast max to every lane
.loop:
%assign %%i 0
%rep %2
    mova      m0,  [srcq + mmsize * (0 + %%i)]
    mova      m1,  [srcq + mmsize * (1 + %%i)]
    mova      m2,  [srcq + mmsize * (2 + %%i)]
    mova      m3,  [srcq + mmsize * (3 + %%i)]
%if %3
    mova      m7,  [srcq + mmsize * (4 + %%i)]
    mova      m8,  [srcq + mmsize * (5 + %%i)]
    mova      m9,  [srcq + mmsize * (6 + %%i)]
    mova      m10, [srcq + mmsize * (7 + %%i)]
%endif
    CLIPD     m0,  m4, m5, m6
    CLIPD     m1,  m4, m5, m6
    CLIPD     m2,  m4, m5, m6
    CLIPD     m3,  m4, m5, m6
%if %3
    CLIPD     m7,  m4, m5, m6
    CLIPD     m8,  m4, m5, m6
    CLIPD     m9,  m4, m5, m6
    CLIPD     m10, m4, m5, m6
%endif
    mova      [dstq + mmsize * (0 + %%i)], m0
    mova      [dstq + mmsize * (1 + %%i)], m1
    mova      [dstq + mmsize * (2 + %%i)], m2
    mova      [dstq + mmsize * (3 + %%i)], m3
%if %3
    mova      [dstq + mmsize * (4 + %%i)], m7
    mova      [dstq + mmsize * (5 + %%i)], m8
    mova      [dstq + mmsize * (6 + %%i)], m9
    mova      [dstq + mmsize * (7 + %%i)], m10
%endif
%assign %%i (%%i + 4 * (1 + %3))
%endrep
    add     srcq, mmsize*4*(%2+%3)
    add     dstq, mmsize*4*(%2+%3)
    sub     lend, mmsize*(%2+%3)   ; len counts int32 elements, not bytes
    jg .loop
    REP_RET
%endmacro
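
; Purely for illustration (not part of the build), a rough C sketch of what
; the two routines above compute. av_clip() here stands in for the CLIPD
; macro, and both loops assume the caller rounds order/len up to the SIMD
; width handled per iteration, as the asm requires:
;
;     static int scalarproduct_int16_sketch(const int16_t *v1,
;                                           const int16_t *v2, int order)
;     {
;         int sum = 0;
;         while (order--)
;             sum += *v1++ * *v2++;   /* widening int16 multiply, int32 sum */
;         return sum;
;     }
;
;     static void vector_clip_int32_sketch(int32_t *dst, const int32_t *src,
;                                          int32_t min, int32_t max,
;                                          unsigned int len)
;     {
;         do {
;             *dst++ = av_clip(*src++, min, max);
;         } while (--len);
;     }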

INIT_MMX mmx
VECTOR_CLIP_INT32 0, 1, 0, 0
INIT_XMM sse2
VECTOR_CLIP_INT32 6, 1, 0, 0, _int
VECTOR_CLIP_INT32 6, 2, 0, 1
INIT_XMM sse4
%ifdef m8 ; xmm8+ exist only on x86_64; use the wider 8*mmsize-per-loop variant
VECTOR_CLIP_INT32 11, 1, 1, 0
%else
VECTOR_CLIP_INT32 6, 1, 0, 0
%endif

; void ff_vector_clipf_sse(float *dst, const float *src,
;                          int len, float min, float max)
INIT_XMM sse
cglobal vector_clipf, 3, 3, 6, dst, src, len, min, max
%if ARCH_X86_32
    VBROADCASTSS m0, minm    ; x86_32: both floats arrive on the stack
    VBROADCASTSS m1, maxm
%elif WIN64
    SWAP 0, 3                ; Win64: min arrives in xmm3, max on the stack
    VBROADCASTSS m0, m0
    VBROADCASTSS m1, maxm
%else ; 64bit sysv
    VBROADCASTSS m0, m0      ; SysV: min/max arrive in xmm0/xmm1
    VBROADCASTSS m1, m1
%endif

    movsxdifnidn lenq, lend

.loop:
    ; each iteration clamps mmsize floats (4*mmsize bytes), walking backwards
    ; from the end of the buffers
    mova m2, [srcq + 4 * lenq - 4 * mmsize]
    mova m3, [srcq + 4 * lenq - 3 * mmsize]
    mova m4, [srcq + 4 * lenq - 2 * mmsize]
    mova m5, [srcq + 4 * lenq - 1 * mmsize]

    maxps m2, m0             ; raise everything below min ...
    maxps m3, m0
    maxps m4, m0
    maxps m5, m0

    minps m2, m1             ; ... then lower everything above max
    minps m3, m1
    minps m4, m1
    minps m5, m1

    mova [dstq + 4 * lenq - 4 * mmsize], m2
    mova [dstq + 4 * lenq - 3 * mmsize], m3
    mova [dstq + 4 * lenq - 2 * mmsize], m4
    mova [dstq + 4 * lenq - 1 * mmsize], m5

    sub lenq, mmsize
    jg .loop

    RET
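
; Likewise purely for illustration (not built), a rough C sketch of
; vector_clipf above. The asm walks the buffers backwards in blocks of
; mmsize floats, but the effect is this forward loop; len is assumed to be
; a multiple of 16:
;
;     static void vector_clipf_sketch(float *dst, const float *src, int len,
;                                     float min, float max)
;     {
;         for (int i = 0; i < len; i++)
;             dst[i] = FFMIN(FFMAX(src[i], min), max);
;     }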