;******************************************************************************
;* optimized audio functions
;* Copyright (c) 2008 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

; int ff_scalarproduct_int16(int16_t *v1, int16_t *v2, int order)
INIT_XMM sse2
cglobal scalarproduct_int16, 3,3,3, v1, v2, order
    add orderd, orderd
    add v1q, orderq
    add v2q, orderq
    neg orderq
    pxor    m2, m2
.loop:
    movu    m0, [v1q + orderq]
    movu    m1, [v1q + orderq + mmsize]
    pmaddwd m0, [v2q + orderq]
    pmaddwd m1, [v2q + orderq + mmsize]
    paddd   m2, m0
    paddd   m2, m1
    add     orderq, mmsize*2
    jl .loop
    HADDD   m2, m0
    movd    eax, m2
    RET


;-----------------------------------------------------------------------------
; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min,
;                           int32_t max, unsigned int len)
;-----------------------------------------------------------------------------

; %1 = number of xmm registers used
; %2 = number of inline load/process/store loops per asm loop
; %3 = process 4*mmsize (%3=0) or 8*mmsize (%3=1) bytes per loop
; %4 = CLIPD function takes min/max as float instead of int (SSE2 version)
; %5 = suffix
%macro VECTOR_CLIP_INT32 4-5
cglobal vector_clip_int32%5, 5,5,%1, dst, src, min, max, len
%if %4
    cvtsi2ss  m4, minm
    cvtsi2ss  m5, maxm
%else
    movd      m4, minm
    movd      m5, maxm
%endif
    SPLATD    m4
    SPLATD    m5
.loop:
%assign %%i 0
%rep %2
    mova      m0,  [srcq + mmsize * (0 + %%i)]
    mova      m1,  [srcq + mmsize * (1 + %%i)]
    mova      m2,  [srcq + mmsize * (2 + %%i)]
    mova      m3,  [srcq + mmsize * (3 + %%i)]
%if %3
    mova      m7,  [srcq + mmsize * (4 + %%i)]
    mova      m8,  [srcq + mmsize * (5 + %%i)]
    mova      m9,  [srcq + mmsize * (6 + %%i)]
    mova      m10, [srcq + mmsize * (7 + %%i)]
%endif
    CLIPD  m0,  m4, m5, m6
    CLIPD  m1,  m4, m5, m6
    CLIPD  m2,  m4, m5, m6
    CLIPD  m3,  m4, m5, m6
%if %3
    CLIPD  m7,  m4, m5, m6
    CLIPD  m8,  m4, m5, m6
    CLIPD  m9,  m4, m5, m6
    CLIPD  m10, m4, m5, m6
%endif
    mova      [dstq + mmsize * (0 + %%i)], m0
    mova      [dstq + mmsize * (1 + %%i)], m1
    mova      [dstq + mmsize * (2 + %%i)], m2
    mova      [dstq + mmsize * (3 + %%i)], m3
%if %3
    mova      [dstq + mmsize * (4 + %%i)], m7
    mova      [dstq + mmsize * (5 + %%i)], m8
    mova      [dstq + mmsize * (6 + %%i)], m9
    mova      [dstq + mmsize * (7 + %%i)], m10
%endif
%assign %%i (%%i + 4 * (1 + %3))
%endrep
    add     srcq, mmsize*4*(%2+%3)
    add     dstq, mmsize*4*(%2+%3)
    sub     lend, mmsize*(%2+%3)
    jg .loop
    REP_RET
%endmacro

INIT_XMM sse2
VECTOR_CLIP_INT32 6, 1, 0, 0, _int
VECTOR_CLIP_INT32 6, 2, 0, 1
INIT_XMM sse4
%ifdef m8
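; m8-m10 only exist when 16 xmm registers are available (x86_64), so the
; wider 8*mmsize-per-iteration variant is only built there; 32-bit builds
; take the %else branch below.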
VECTOR_CLIP_INT32 11, 1, 1, 0
%else
VECTOR_CLIP_INT32 6, 1, 0, 0
%endif

; void ff_vector_clipf_sse(float *dst, const float *src,
;                          int len, float min, float max)
INIT_XMM sse
cglobal vector_clipf, 3, 3, 6, dst, src, len, min, max
%if ARCH_X86_32
    VBROADCASTSS m0, minm        ; min/max are passed on the stack
    VBROADCASTSS m1, maxm
%elif WIN64
    SWAP 0, 3                    ; min arrives in xmm3, max on the stack
    VBROADCASTSS m0, m0
    VBROADCASTSS m1, maxm
%else                            ; 64-bit SysV: min/max arrive in xmm0/xmm1
    VBROADCASTSS m0, m0
    VBROADCASTSS m1, m1
%endif

    movsxdifnidn lenq, lend

; clip 4*mmsize bytes per iteration, walking backwards from the end of the
; buffers towards the start
.loop:
    mova m2, [srcq + 4 * lenq - 4 * mmsize]
    mova m3, [srcq + 4 * lenq - 3 * mmsize]
    mova m4, [srcq + 4 * lenq - 2 * mmsize]
    mova m5, [srcq + 4 * lenq - 1 * mmsize]

    maxps m2, m0
    maxps m3, m0
    maxps m4, m0
    maxps m5, m0

    minps m2, m1
    minps m3, m1
    minps m4, m1
    minps m5, m1

    mova [dstq + 4 * lenq - 4 * mmsize], m2
    mova [dstq + 4 * lenq - 3 * mmsize], m3
    mova [dstq + 4 * lenq - 2 * mmsize], m4
    mova [dstq + 4 * lenq - 1 * mmsize], m5

    sub lenq, mmsize
    jg .loop

    RET