;******************************************************************************
;* optimized audio functions
;* Copyright (c) 2008 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
; cglobal, INIT_MMX/INIT_XMM, mova/movu and the HADDD/SPLATD/CLIPD/VBROADCASTSS
; helpers used in this file are not defined here; presumably they are provided
; by this include (x86inc/x86util) -- confirm against the FFmpeg tree.
%include "libavutil/x86/x86util.asm"

SECTION .text

;-----------------------------------------------------------------------------
; SCALARPRODUCT: emits scalarproduct_int16 for the instruction set selected by
; the preceding INIT_MMX / INIT_XMM.
;
; int ff_scalarproduct_int16(int16_t *v1, int16_t *v2, int order)
;
; In:   v1, v2 = int16 vectors, order = number of int16 elements
; Out:  eax    = sum of v1[i] * v2[i], accumulated in 32-bit lanes
;
; Requirements visible from the code:
;   - the loop is bottom-tested and consumes 2*mmsize bytes (= mmsize int16
;     elements) per pass, so order must be a positive multiple of mmsize;
;   - v1 is read with movu, while v2 is used directly as a pmaddwd memory
;     operand (NOTE(review): for the SSE2 build this implies v2 must be
;     16-byte aligned -- confirm with callers).
;-----------------------------------------------------------------------------
%macro SCALARPRODUCT 0
cglobal scalarproduct_int16, 3,3,3, v1, v2, order
    add     orderd, orderd              ; element count -> byte count (2 bytes each);
                                        ; the 32-bit add also clears the upper half of orderq
    add     v1q, orderq                 ; v1 -> one past its last element
    add     v2q, orderq                 ; v2 -> one past its last element
    neg     orderq                      ; negative byte offset that counts up towards 0
    pxor    m2, m2                      ; m2 = running 32-bit partial sums
.loop:
    movu    m0, [v1q + orderq]
    movu    m1, [v1q + orderq + mmsize]
    pmaddwd m0, [v2q + orderq]          ; multiply int16 pairs, add neighbours -> int32
    pmaddwd m1, [v2q + orderq + mmsize]
    paddd   m2, m0
    paddd   m2, m1
    add     orderq, mmsize*2            ; two registers consumed per pass
    jl .loop                            ; continue while the offset is still negative
    HADDD   m2, m0                      ; fold the dword lanes of m2 together (m0 = scratch)
    movd    eax, m2                     ; return value
%if mmsize == 8
    emms                                ; MMX registers were used: reset state before returning
%endif
    RET
%endmacro

; Emit scalarproduct_int16 once per instruction set.
; MMX-register build (the macro adds emms when mmsize == 8).
INIT_MMX mmxext
SCALARPRODUCT
; XMM-register build.
INIT_XMM sse2
SCALARPRODUCT


;-----------------------------------------------------------------------------
; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min,
;                           int32_t max, unsigned int len)
;
; Stores src[i] clipped to [min, max] into dst[i].
;
; Requirements visible from the code:
;   - src and dst are accessed with mova, i.e. treated as mmsize-aligned;
;   - the loop is bottom-tested and handles mmsize*%2*(1+%3) elements per
;     pass, so len must be a positive multiple of that amount;
;   - len is counted down with a 32-bit sub and tested with jg (signed).
; No emms is issued for the MMX build.
; NOTE(review): presumably clearing MMX state is left to the caller -- confirm.
;-----------------------------------------------------------------------------

; %1 = number of xmm registers used
; %2 = number of inline load/process/store groups per asm loop
; %3 = each group processes 4*mmsize (%3=0) or 8*mmsize (%3=1) bytes
; %4 = CLIPD function takes min/max as float instead of int (SSE2 version)
; %5 = suffix
%macro VECTOR_CLIP_INT32 4-5
cglobal vector_clip_int32%5, 5,5,%1, dst, src, min, max, len
%if %4
    cvtsi2ss  m4, minm                  ; float-based CLIPD: convert the bounds to float
    cvtsi2ss  m5, maxm
%else
    movd      m4, minm                  ; integer CLIPD: load the bounds unchanged
    movd      m5, maxm
%endif
    SPLATD    m4                        ; m4 = min in every dword lane
    SPLATD    m5                        ; m5 = max in every dword lane
.loop:
%assign %%i 0
%rep %2
    mova      m0,  [srcq + mmsize * (0 + %%i)]
    mova      m1,  [srcq + mmsize * (1 + %%i)]
    mova      m2,  [srcq + mmsize * (2 + %%i)]
    mova      m3,  [srcq + mmsize * (3 + %%i)]
%if %3
    mova      m7,  [srcq + mmsize * (4 + %%i)]
    mova      m8,  [srcq + mmsize * (5 + %%i)]
    mova      m9,  [srcq + mmsize * (6 + %%i)]
    mova      m10, [srcq + mmsize * (7 + %%i)]
%endif
    CLIPD  m0,  m4, m5, m6              ; m6 is passed as the 4th (temporary) operand
    CLIPD  m1,  m4, m5, m6
    CLIPD  m2,  m4, m5, m6
    CLIPD  m3,  m4, m5, m6
%if %3
    CLIPD  m7,  m4, m5, m6
    CLIPD  m8,  m4, m5, m6
    CLIPD  m9,  m4, m5, m6
    CLIPD  m10, m4, m5, m6
%endif
    mova  [dstq + mmsize * (0 + %%i)], m0
    mova  [dstq + mmsize * (1 + %%i)], m1
    mova  [dstq + mmsize * (2 + %%i)], m2
    mova  [dstq + mmsize * (3 + %%i)], m3
%if %3
    mova  [dstq + mmsize * (4 + %%i)], m7
    mova  [dstq + mmsize * (5 + %%i)], m8
    mova  [dstq + mmsize * (6 + %%i)], m9
    mova  [dstq + mmsize * (7 + %%i)], m10
%endif
%assign %%i (%%i + 4 * (1 + %3))
%endrep
    ; Each of the %2 groups covers 4*(1+%3) registers, so one pass handles
    ; mmsize*4*%2*(1+%3) bytes = mmsize*%2*(1+%3) int32 elements.  The stride
    ; is written as %2*(1+%3) so it always matches the unrolled body; the
    ; former (%2+%3) gave the same value only for the (%2,%3) pairs (1,0),
    ; (2,0) and (1,1).
    add     srcq, mmsize*4*(%2*(1+%3))
    add     dstq, mmsize*4*(%2*(1+%3))
    sub     lend, mmsize*(%2*(1+%3))
    jg .loop                            ; continue while elements remain
    REP_RET
%endmacro

; Emit the vector_clip_int32 variants.
; Arguments: xmm register count, inline groups per pass, 8-register groups (0/1),
;            float-based CLIPD (0/1), optional name suffix.

; MMX registers, integer clip, one 4-register group per pass.
INIT_MMX mmx
VECTOR_CLIP_INT32 0, 1, 0, 0
INIT_XMM sse2
; Integer clip; the "_int" suffix keeps its name distinct from the next variant.
VECTOR_CLIP_INT32 6, 1, 0, 0, _int
; Float-based clip (%4 = 1), two 4-register groups per pass.
VECTOR_CLIP_INT32 6, 2, 0, 1
INIT_XMM sse4
%ifdef m8
; m8 is defined, so m7..m10 are usable: one 8-register group per pass.
VECTOR_CLIP_INT32 11, 1, 1, 0
%else
; Only the low registers are available: one 4-register group per pass.
VECTOR_CLIP_INT32 6, 1, 0, 0
%endif

;-----------------------------------------------------------------------------
; void ff_vector_clipf_sse(float *dst, const float *src,
;                          int len, float min, float max)
;
; Stores src[i] clamped to [min, max] into dst[i]: each value is first raised
; to at least min (maxps) and then lowered to at most max (minps).
;
; Requirements visible from the code:
;   - src and dst are accessed with mova, i.e. treated as mmsize-aligned;
;   - each pass handles 4 registers = 4*mmsize bytes = mmsize floats and the
;     loop is bottom-tested, so len must be a positive multiple of mmsize
;     (NOTE(review): 16 floats for this XMM build -- confirm).
;-----------------------------------------------------------------------------
INIT_XMM sse
cglobal vector_clipf, 3, 3, 6, dst, src, len, min, max
    ; Broadcast min into m0 and max into m1; where they arrive depends on the ABI.
%if ARCH_X86_32
    VBROADCASTSS m0, minm               ; both bounds are read from their argument slots
    VBROADCASTSS m1, maxm
%elif WIN64
    SWAP 0, 3                           ; min is the 4th argument (xmm3): make it m0
    VBROADCASTSS m0, m0
    VBROADCASTSS m1, maxm               ; max is the 5th argument, read from its slot
%else ; 64bit sysv
    VBROADCASTSS m0, m0                 ; min arrives in xmm0
    VBROADCASTSS m1, m1                 ; max arrives in xmm1
%endif

    movsxdifnidn lenq, lend             ; widen the int length for use in addressing

    ; Walk both buffers from the end towards the start, mmsize floats per pass.
.loop:
    mova m2, [srcq + 4 * lenq - 4 * mmsize]
    mova m3, [srcq + 4 * lenq - 3 * mmsize]
    mova m4, [srcq + 4 * lenq - 2 * mmsize]
    mova m5, [srcq + 4 * lenq - 1 * mmsize]

    maxps m2, m0                        ; lower bound
    maxps m3, m0
    maxps m4, m0
    maxps m5, m0

    minps m2, m1                        ; upper bound
    minps m3, m1
    minps m4, m1
    minps m5, m1

    mova [dstq + 4 * lenq - 4 * mmsize], m2
    mova [dstq + 4 * lenq - 3 * mmsize], m3
    mova [dstq + 4 * lenq - 2 * mmsize], m4
    mova [dstq + 4 * lenq - 1 * mmsize], m5

    sub lenq, mmsize                    ; mmsize floats consumed
    jg .loop

    RET
