;******************************************************************************
;* optimized audio functions
;* Copyright (c) 2008 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

22%include "libavutil/x86/x86util.asm"
23
24SECTION .text
25
26; int ff_scalarproduct_int16(int16_t *v1, int16_t *v2, int order)
27INIT_XMM sse2
28cglobal scalarproduct_int16, 3,3,3, v1, v2, order
29    add orderd, orderd
30    add v1q, orderq
31    add v2q, orderq
32    neg orderq
33    pxor    m2, m2
34.loop:
35    movu    m0, [v1q + orderq]
36    movu    m1, [v1q + orderq + mmsize]
37    pmaddwd m0, [v2q + orderq]
38    pmaddwd m1, [v2q + orderq + mmsize]
39    paddd   m2, m0
40    paddd   m2, m1
41    add     orderq, mmsize*2
42    jl .loop
43    HADDD   m2, m0
44    movd   eax, m2
45    RET


48;-----------------------------------------------------------------------------
49; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min,
50;                           int32_t max, unsigned int len)
51;-----------------------------------------------------------------------------
52
53; %1 = number of xmm registers used
54; %2 = number of inline load/process/store loops per asm loop
55; %3 = process 4*mmsize (%3=0) or 8*mmsize (%3=1) bytes per loop
56; %4 = CLIPD function takes min/max as float instead of int (SSE2 version)
57; %5 = suffix
58%macro VECTOR_CLIP_INT32 4-5
59cglobal vector_clip_int32%5, 5,5,%1, dst, src, min, max, len
60%if %4
61    cvtsi2ss  m4, minm
62    cvtsi2ss  m5, maxm
63%else
64    movd      m4, minm
65    movd      m5, maxm
66%endif
67    SPLATD    m4
68    SPLATD    m5
69.loop:
70%assign %%i 0
71%rep %2
72    mova      m0,  [srcq + mmsize * (0 + %%i)]
73    mova      m1,  [srcq + mmsize * (1 + %%i)]
74    mova      m2,  [srcq + mmsize * (2 + %%i)]
75    mova      m3,  [srcq + mmsize * (3 + %%i)]
76%if %3
77    mova      m7,  [srcq + mmsize * (4 + %%i)]
78    mova      m8,  [srcq + mmsize * (5 + %%i)]
79    mova      m9,  [srcq + mmsize * (6 + %%i)]
80    mova      m10, [srcq + mmsize * (7 + %%i)]
81%endif
82    CLIPD  m0,  m4, m5, m6
83    CLIPD  m1,  m4, m5, m6
84    CLIPD  m2,  m4, m5, m6
85    CLIPD  m3,  m4, m5, m6
86%if %3
87    CLIPD  m7,  m4, m5, m6
88    CLIPD  m8,  m4, m5, m6
89    CLIPD  m9,  m4, m5, m6
90    CLIPD  m10, m4, m5, m6
91%endif
92    mova  [dstq + mmsize * (0 + %%i)], m0
93    mova  [dstq + mmsize * (1 + %%i)], m1
94    mova  [dstq + mmsize * (2 + %%i)], m2
95    mova  [dstq + mmsize * (3 + %%i)], m3
96%if %3
97    mova  [dstq + mmsize * (4 + %%i)], m7
98    mova  [dstq + mmsize * (5 + %%i)], m8
99    mova  [dstq + mmsize * (6 + %%i)], m9
100    mova  [dstq + mmsize * (7 + %%i)], m10
101%endif
102%assign %%i (%%i + 4 * (1 + %3))
103%endrep
104    add     srcq, mmsize*4*(%2+%3)
105    add     dstq, mmsize*4*(%2+%3)
106    sub     lend, mmsize*(%2+%3)
107    jg .loop
108    REP_RET
109%endmacro
110
111INIT_XMM sse2
112VECTOR_CLIP_INT32 6, 1, 0, 0, _int
113VECTOR_CLIP_INT32 6, 2, 0, 1
114INIT_XMM sse4
115%ifdef m8
116VECTOR_CLIP_INT32 11, 1, 1, 0
117%else
118VECTOR_CLIP_INT32 6, 1, 0, 0
119%endif
120
121; void ff_vector_clipf_sse(float *dst, const float *src,
122;                          int len, float min, float max)
123INIT_XMM sse
124cglobal vector_clipf, 3, 3, 6, dst, src, len, min, max
125%if ARCH_X86_32
126    VBROADCASTSS m0, minm
127    VBROADCASTSS m1, maxm
128%elif WIN64
129    SWAP 0, 3
130    VBROADCASTSS m0, m0
131    VBROADCASTSS m1, maxm
132%else ; 64bit sysv
133    VBROADCASTSS m0, m0
134    VBROADCASTSS m1, m1
135%endif
136
137    movsxdifnidn lenq, lend
138
139.loop:
140    mova m2, [srcq + 4 * lenq - 4 * mmsize]
141    mova m3, [srcq + 4 * lenq - 3 * mmsize]
142    mova m4, [srcq + 4 * lenq - 2 * mmsize]
143    mova m5, [srcq + 4 * lenq - 1 * mmsize]
144
145    maxps m2, m0
146    maxps m3, m0
147    maxps m4, m0
148    maxps m5, m0
149
150    minps m2, m1
151    minps m3, m1
152    minps m4, m1
153    minps m5, m1
154
155    mova [dstq + 4 * lenq - 4 * mmsize], m2
156    mova [dstq + 4 * lenq - 3 * mmsize], m3
157    mova [dstq + 4 * lenq - 2 * mmsize], m4
158    mova [dstq + 4 * lenq - 1 * mmsize], m5
159
160    sub lenq, mmsize
161    jg .loop
162
163    RET