• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1;*****************************************************************************
2;* x86-optimized AC-3 DSP functions
3;* Copyright (c) 2011 Justin Ruggles
4;*
5;* This file is part of FFmpeg.
6;*
7;* FFmpeg is free software; you can redistribute it and/or
8;* modify it under the terms of the GNU Lesser General Public
9;* License as published by the Free Software Foundation; either
10;* version 2.1 of the License, or (at your option) any later version.
11;*
12;* FFmpeg is distributed in the hope that it will be useful,
13;* but WITHOUT ANY WARRANTY; without even the implied warranty of
14;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15;* Lesser General Public License for more details.
16;*
17;* You should have received a copy of the GNU Lesser General Public
18;* License along with FFmpeg; if not, write to the Free Software
19;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20;******************************************************************************
21
22%include "libavutil/x86/x86util.asm"
23
SECTION_RODATA

; Scale factor for ff_float_to_fixed24(): the IEEE-754 single-precision bit
; pattern of 16777216.0f (2^24), one copy per dword lane.
pf_1_24:     dd 0x4B800000, 0x4B800000, 0x4B800000, 0x4B800000

; Tables for ff_ac3_compute_mantissa_size().
; ac3_bap_bits is defined outside this file.
cextern ac3_bap_bits
; Word multipliers, applied to the counts at indices 1..4 of one mant_cnt row
; (the four-word pattern is stored twice to cover two rows per register).
; pw_bap_mul1 is used with pmulhuw (high word of the unsigned product):
;   21846 = ceil(65536/3) -> x/3 rounded down (for x < 32768)
;   32768 = 65536/2       -> x/2 rounded down
;   0                     -> lane is dropped
; pw_bap_mul2 is used with pmaddwd to weight the reduced counts.
pw_bap_mul1: times 2 dw 21846, 21846, 0, 32768
pw_bap_mul2: times 2 dw 5, 7, 0, 7

; Constants for ff_ac3_extract_exponents().
; pd_1 is defined outside this file.
cextern pd_1
; 151 = 127 (float exponent bias) + 24; the biased float exponent of the
; prepared coefficient is subtracted from this value.
pd_151:      dd 151, 151, 151, 151
37
38SECTION .text
39
40;-----------------------------------------------------------------------------
41; void ff_ac3_exponent_min(uint8_t *exp, int num_reuse_blocks, int nb_coefs)
42;-----------------------------------------------------------------------------
43
%macro AC3_EXPONENT_MIN 0
; Replace every exponent of block 0 by the minimum of the exponents at the
; same position in blocks 0..num_reuse_blocks. Consecutive blocks are 256
; bytes apart and the result is written back over block 0.
;   exp        - exponent array, read and written with mova
;   reuse_blks - int num_reuse_blocks; the function returns at once when 0
;   expn       - int nb_coefs; mmsize exponents are handled per outer pass
;   offset     - scratch: byte offset of the block currently being merged
; Both int arguments are only read through their 32-bit views: the x86-64
; calling conventions leave bits 63:32 of a 32-bit argument unspecified, so
; the 64-bit views must not be consumed before they have been written.
cglobal ac3_exponent_min, 3, 4, 2, exp, reuse_blks, expn, offset
    ; reuse_blks *= 256. The 32-bit shift also zeroes the upper half of the
    ; register on x86-64, so reuse_blksq is a valid address offset below.
    shl  reuse_blksd, 8
    jz .end
    LOOP_ALIGN
.nextexp:
    ; start at the last block and walk back towards block 0
    mov      offsetq, reuse_blksq
    mova          m0, [expq+offsetq]
    sub      offsetq, 256
    LOOP_ALIGN
.nextblk:
    ; m1 is handed to PMINUB as a temporary
    PMINUB        m0, [expq+offsetq], m1
    sub      offsetq, 256
    jae .nextblk                ; a borrow means block 0 has been merged
    mova      [expq], m0
    add         expq, mmsize
    ; 32-bit counter update; the signed flags reflect the int value only
    sub        expnd, mmsize
    jg .nextexp
.end:
    REP_RET
%endmacro

; LOOP_ALIGN is expanded in front of both loop labels: it is empty for the
; plain MMX build and ALIGN 16 for everything instantiated after the
; redefinition below.
; NOTE(review): the MMX variants return without emms; presumably the caller
; clears the MMX state - confirm.
%define LOOP_ALIGN
INIT_MMX mmx
AC3_EXPONENT_MIN
%if HAVE_MMXEXT_EXTERNAL
%define LOOP_ALIGN ALIGN 16
INIT_MMX mmxext
AC3_EXPONENT_MIN
%endif
%if HAVE_SSE2_EXTERNAL
INIT_XMM sse2
AC3_EXPONENT_MIN
%endif
%undef LOOP_ALIGN
79
80;-----------------------------------------------------------------------------
81; void ff_float_to_fixed24(int32_t *dst, const float *src, unsigned int len)
82;-----------------------------------------------------------------------------
83
84; The 3DNow! version is not bit-identical because pf2id uses truncation rather
85; than round-to-nearest.
INIT_MMX 3dnow
; 3DNow! variant: multiplies each float by 2^24 and converts it to int32 with
; pf2id (truncating conversion). Eight floats are handled per iteration, so
; len is consumed in steps of 8.
cglobal float_to_fixed24, 3, 3, 0, dst, src, len
    mova   m0, [pf_1_24]            ; m0 = two copies of 16777216.0f
.loop:
    ; fetch eight floats, two per MMX register
    mova   m1, [srcq+0*mmsize]
    mova   m2, [srcq+1*mmsize]
    mova   m3, [srcq+2*mmsize]
    mova   m4, [srcq+3*mmsize]
    add  srcq, 4*mmsize
    ; scale
    pfmul  m1, m0
    pfmul  m2, m0
    pfmul  m3, m0
    pfmul  m4, m0
    ; float -> int32
    pf2id  m1, m1
    pf2id  m2, m2
    pf2id  m3, m3
    pf2id  m4, m4
    ; write eight integers
    mova  [dstq+0*mmsize], m1
    mova  [dstq+1*mmsize], m2
    mova  [dstq+2*mmsize], m3
    mova  [dstq+3*mmsize], m4
    add  dstq, 4*mmsize
    ; the counter update must stay last: ja consumes its flags
    sub  lend, 8
    ja .loop
    femms                           ; leave MMX/3DNow! state before returning
    RET
112
INIT_XMM sse
; SSE variant: scales eight floats per iteration by 2^24 in XMM registers and
; converts them two at a time with cvtps2pi, which delivers its result in an
; MMX register (hence the emms before returning).
cglobal float_to_fixed24, 3, 3, 3, dst, src, len
    movaps     m0, [pf_1_24]        ; m0 = four copies of 16777216.0f
.loop:
    movaps     m1, [srcq+ 0]
    movaps     m2, [srcq+16]
    add      srcq, 32
    mulps      m1, m0
    mulps      m2, m0
    ; low two floats of each vector
    cvtps2pi  mm0, m1
    cvtps2pi  mm2, m2
    ; bring the high two floats down and convert those as well
    movhlps    m1, m1
    movhlps    m2, m2
    cvtps2pi  mm1, m1
    cvtps2pi  mm3, m2
    movq  [dstq+ 0], mm0
    movq  [dstq+ 8], mm1
    movq  [dstq+16], mm2
    movq  [dstq+24], mm3
    add      dstq, 32
    ; the counter update must stay last: ja consumes its flags
    sub      lend, 8
    ja .loop
    emms
    RET
137
INIT_XMM sse2
; SSE2 variant: multiplies each float by 2^24 and converts it to int32 with
; cvtps2dq. All loads and stores are aligned (movaps/movdqa).
;   dst - int32_t output
;   src - float input
;   len - unsigned int element count, consumed 16 elements per iteration, or
;         32 when m8 is defined (x86inc only provides m8 when more than eight
;         vector registers exist, i.e. on x86-64)
; NOTE(review): every iteration converts a full 16/32-element batch, so the
; caller presumably passes a non-zero multiple of that step - confirm.
; len is a 32-bit argument and is therefore only accessed as lend, like in the
; 3DNow! and SSE variants: bits 63:32 of the register are unspecified on
; x86-64 and must not take part in the loop condition.
cglobal float_to_fixed24, 3, 3, 9, dst, src, len
    movaps     m0, [pf_1_24]
.loop:
    movaps     m1, [srcq    ]
    movaps     m2, [srcq+16 ]
    movaps     m3, [srcq+32 ]
    movaps     m4, [srcq+48 ]
%ifdef m8
    movaps     m5, [srcq+64 ]
    movaps     m6, [srcq+80 ]
    movaps     m7, [srcq+96 ]
    movaps     m8, [srcq+112]
%endif
    mulps      m1, m0
    mulps      m2, m0
    mulps      m3, m0
    mulps      m4, m0
%ifdef m8
    mulps      m5, m0
    mulps      m6, m0
    mulps      m7, m0
    mulps      m8, m0
%endif
    cvtps2dq   m1, m1
    cvtps2dq   m2, m2
    cvtps2dq   m3, m3
    cvtps2dq   m4, m4
%ifdef m8
    cvtps2dq   m5, m5
    cvtps2dq   m6, m6
    cvtps2dq   m7, m7
    cvtps2dq   m8, m8
%endif
    movdqa  [dstq    ], m1
    movdqa  [dstq+16 ], m2
    movdqa  [dstq+32 ], m3
    movdqa  [dstq+48 ], m4
%ifdef m8
    movdqa  [dstq+64 ], m5
    movdqa  [dstq+80 ], m6
    movdqa  [dstq+96 ], m7
    movdqa  [dstq+112], m8
    add      srcq, 128
    add      dstq, 128
    sub      lend, 32
%else
    add      srcq, 64
    add      dstq, 64
    sub      lend, 16
%endif
    ; unsigned test on the 32-bit count: continue while elements remain
    ja .loop
    REP_RET
191
192;------------------------------------------------------------------------------
193; int ff_ac3_compute_mantissa_size(uint16_t mant_cnt[6][16])
194;------------------------------------------------------------------------------
195
; Horizontal add of the four dwords of %1. The total is left in the low dword
; of %1; %2 is overwritten and the upper dwords of %1 are not meaningful
; afterwards.
%macro PHADDD4 2 ; xmm src/dst, xmm tmp
    movhlps  %2, %1             ; tmp[0..1] = src[2..3]
    paddd    %1, %2             ; src[0] += src[2], src[1] += src[3]
    pshufd   %2, %1, 0x1        ; tmp[0] = src[1]
    paddd    %1, %2             ; src[0] = sum of all four dwords
%endmacro

INIT_XMM sse2
; Returns (in eax) a bit total derived from mant_cnt[6][16], whose rows are
; 32 bytes (16 words) each. Two partial sums are formed:
;   1. the six rows are added column-wise and the 16 column totals are
;      multiplied with the 16 words of ac3_bap_bits and summed (kept in sum);
;   2. for every row, the counts at indices 1..4 are reduced with pw_bap_mul1
;      (x/3, x/3, dropped, x/2), accumulated over the rows, weighted with
;      pw_bap_mul2 (5, 7, 0, 7) and summed.
cglobal ac3_compute_mantissa_size, 1, 2, 4, mant_cnt, sum
    ; --- part 1: column totals, m0 = words 0..7, m1 = words 8..15 ---
    movdqa      m0, [mant_cntq   ]
    movdqa      m1, [mant_cntq+16]
%assign mant_row 1
%rep 5
    paddw       m0, [mant_cntq+mant_row*32   ]
    paddw       m1, [mant_cntq+mant_row*32+16]
%assign mant_row mant_row+1
%endrep
%undef mant_row
    pmaddwd     m0, [ac3_bap_bits   ]
    pmaddwd     m1, [ac3_bap_bits+16]
    paddd       m0, m1
    PHADDD4     m0, m1
    movd      sumd, m0

    ; --- part 2: the +2 byte offset skips word 0 of each row, so every
    ; 8-byte load picks up words 1..4; two rows share one register ---
    movhpd      m0, [mant_cntq+0*32+2]
    movlpd      m0, [mant_cntq+1*32+2]
    movhpd      m1, [mant_cntq+2*32+2]
    movlpd      m1, [mant_cntq+3*32+2]
    movhpd      m2, [mant_cntq+4*32+2]
    movlpd      m2, [mant_cntq+5*32+2]
    movdqa      m3, [pw_bap_mul1]
    ; per-row reduction happens before the rows are added together
    pmulhuw     m0, m3
    pmulhuw     m1, m3
    pmulhuw     m2, m3
    paddusw     m0, m1
    paddusw     m0, m2
    pmaddwd     m0, [pw_bap_mul2]
    PHADDD4     m0, m1

    ; return value = part 2 + part 1
    movd       eax, m0
    add        eax, sumd
    RET
239
240;------------------------------------------------------------------------------
241; void ff_ac3_extract_exponents(uint8_t *exp, int32_t *coef, int nb_coefs)
242;------------------------------------------------------------------------------
243
; Packed absolute value of four signed dwords, computed in place.
;   %1 - source and destination
;   %2 - scratch register; it is only read or written when the build target
;        lacks SSSE3 and may be omitted otherwise
%macro PABSD 1-2 ; src/dst, tmp
%if notcpuflag(ssse3)
    pxor     %2, %2             ; tmp = 0
    pcmpgtd  %2, %1             ; tmp = all-ones in every lane where src < 0
    pxor     %1, %2             ; invert the negative lanes ...
    psubd    %1, %2             ; ... and add 1: two's complement negation
%else
    pabsd    %1, %1
%endif
%endmacro
254
%macro AC3_EXTRACT_EXPONENTS 0
; Writes one exponent byte per 32-bit coefficient.
;   exp  - uint8_t output, written four bytes at a time
;   coef - int32_t input, read with aligned 16-byte loads
;   len  - int nb_coefs; four coefficients are handled per iteration
; Both pointers are advanced to the end of their arrays and indexed with a
; negative counter that runs up to zero.
cglobal ac3_extract_exponents, 3, 3, 4, exp, coef, len
    ; len is a 32-bit int: on x86-64 bits 63:32 of the register are
    ; unspecified, so sign-extend before using it in address arithmetic
    ; (expands to nothing on x86-32, where lenq and lend are the same)
    movsxdifnidn lenq, lend
    add     expq, lenq
    lea    coefq, [coefq+4*lenq]
    neg     lenq
    mova      m2, [pd_1]
    mova      m3, [pd_151]
.loop:
    ; move 4 32-bit coefs to xmm0
    mova      m0, [coefq+4*lenq]
    ; absolute value
    PABSD     m0, m1
    ; convert to float and extract exponents:
    ; v = (|coef| << 1) | 1 is never zero, so its float exponent is defined
    pslld     m0, 1
    por       m0, m2
    cvtdq2ps  m1, m0
    ; shift out the 23 mantissa bits, leaving the biased exponent
    psrld     m1, 23
    ; exponent = 151 - biased float exponent
    mova      m0, m3
    psubd     m0, m1
    ; move the lowest byte in each of 4 dwords to the low dword
    ; NOTE: We cannot just extract the low bytes with pshufb because the dword
    ;       result for 16777215 is -1 due to float inaccuracy. Using packuswb
    ;       clips this to 0, which is the correct exponent.
    packssdw  m0, m0
    packuswb  m0, m0
    movd  [expq+lenq], m0

    add     lenq, 4
    jl .loop
    REP_RET
%endmacro

; PABSD selects pabsd when the macro is instantiated for SSSE3, and the
; compare/xor/subtract sequence otherwise.
%if HAVE_SSE2_EXTERNAL
INIT_XMM sse2
AC3_EXTRACT_EXPONENTS
%endif
%if HAVE_SSSE3_EXTERNAL
INIT_XMM ssse3
AC3_EXTRACT_EXPONENTS
%endif
295