• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1;*****************************************************************************
2;* x86-optimized AC-3 DSP functions
3;* Copyright (c) 2011 Justin Ruggles
4;*
5;* This file is part of FFmpeg.
6;*
7;* FFmpeg is free software; you can redistribute it and/or
8;* modify it under the terms of the GNU Lesser General Public
9;* License as published by the Free Software Foundation; either
10;* version 2.1 of the License, or (at your option) any later version.
11;*
12;* FFmpeg is distributed in the hope that it will be useful,
13;* but WITHOUT ANY WARRANTY; without even the implied warranty of
14;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15;* Lesser General Public License for more details.
16;*
17;* You should have received a copy of the GNU Lesser General Public
18;* License along with FFmpeg; if not, write to the Free Software
19;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20;******************************************************************************
21
22%include "libavutil/x86/x86util.asm"
23
24SECTION_RODATA
25
26; 16777216.0f (2^24) - scale factor used in ff_float_to_fixed24()
27pf_1_24: times 4 dd 0x4B800000
28
29; used in ff_ac3_compute_mantissa_size()
30cextern ac3_bap_bits
31pw_bap_mul1: dw 21846, 21846, 0, 32768, 21846, 21846, 0, 32768  ; pmulhuw factors: 21846 = ceil(65536/3) gives ~x/3, 32768 gives x/2, 0 clears the lane
32pw_bap_mul2: dw 5, 7, 0, 7, 5, 7, 0, 7  ; pmaddwd weights applied to the pw_bap_mul1-scaled counts
33
34; used in ff_ac3_extract_exponents()
35cextern pd_1
36pd_151: times 4 dd 151  ; the float exponent field is subtracted from this value
37
38; used in ff_apply_window_int16()
39pb_revwords: SHUFFLE_MASK_W 7, 6, 5, 4, 3, 2, 1, 0  ; pshufb mask for REVERSE_WORDS; presumably reverses the 8 words (SHUFFLE_MASK_W is defined elsewhere)
40pd_16384: times 4 dd 16384  ; 1 << 14, the rounding term added before the >> 15 in the 32-bit window path
41
42SECTION .text
43
44;-----------------------------------------------------------------------------
45; void ff_ac3_exponent_min(uint8_t *exp, int num_reuse_blocks, int nb_coefs)
46;-----------------------------------------------------------------------------
47
48%macro AC3_EXPONENT_MIN 0  ; emits ac3_exponent_min for the current instruction set: exp[i] = min of exp[i + 256*blk] for blk = 0..num_reuse_blocks
49cglobal ac3_exponent_min, 3, 4, 2, exp, reuse_blks, expn, offset
50    shl  reuse_blksq, 8  ; block count -> byte offset of the last block (blocks are 256 bytes apart)
51    jz .end  ; zero reuse blocks: exp is left untouched
52    LOOP_ALIGN
53.nextexp:
54    mov      offsetq, reuse_blksq
55    mova          m0, [expq+offsetq]  ; start with the last block
56    sub      offsetq, 256
57    LOOP_ALIGN
58.nextblk:
59    PMINUB        m0, [expq+offsetq], m1  ; running unsigned byte minimum; m1 is handed over as a temporary
60    sub      offsetq, 256
61    jae .nextblk  ; keep going until the subtraction borrows, i.e. offset 0 has been processed
62    mova      [expq], m0  ; store the minimum into block 0
63    add         expq, mmsize
64    sub        expnq, mmsize
65    jg .nextexp  ; mmsize exponents per pass, so nb_coefs is effectively rounded up to a multiple of mmsize
66.end:
67    REP_RET
68%endmacro
69
70%define LOOP_ALIGN
71INIT_MMX mmx  ; plain MMX build: LOOP_ALIGN is empty here, so the loop heads are not aligned
72AC3_EXPONENT_MIN
73%if HAVE_MMXEXT_EXTERNAL
74%define LOOP_ALIGN ALIGN 16
75INIT_MMX mmxext
76AC3_EXPONENT_MIN
77%endif
78%if HAVE_SSE2_EXTERNAL
79INIT_XMM sse2  ; uses whatever LOOP_ALIGN is in effect (ALIGN 16 only if the mmxext block above was assembled)
80AC3_EXPONENT_MIN
81%endif
82%undef LOOP_ALIGN
83
84;-----------------------------------------------------------------------------
85; int ff_ac3_max_msb_abs_int16(const int16_t *src, int len)
86;
87; This function uses 2 different methods to calculate a valid result.
88; 1) logical 'or' of abs of each element
89;        This is used for ssse3 because of the pabsw instruction.
90;        It is also used for mmx because of the lack of min/max instructions.
91; 2) calculate min/max for the array, then or(abs(min),abs(max))
92;        This is used for mmxext and sse2 because they have pminsw/pmaxsw.
93;-----------------------------------------------------------------------------
94
95; logical 'or' of 4 or 8 words in an mmx or xmm register into the low word
96%macro OR_WORDS_HORIZ 2 ; src, tmp - the combined result is left in the low word of src; tmp is clobbered
97%if cpuflag(sse2)
98    movhlps     %2, %1  ; tmp.low64 = src.high64
99    por         %1, %2  ; low 4 words now cover all 8 words
100    pshuflw     %2, %1, q0032  ; q0032 / q0001 are shuffle constants defined outside this file
101    por         %1, %2
102    pshuflw     %2, %1, q0001
103    por         %1, %2
104%elif cpuflag(mmxext)
105    pshufw      %2, %1, q0032
106    por         %1, %2
107    pshufw      %2, %1, q0001
108    por         %1, %2
109%else ; mmx
110    movq        %2, %1
111    psrlq       %2, 32  ; bring words 2,3 down onto words 0,1
112    por         %1, %2
113    movq        %2, %1
114    psrlq       %2, 16  ; bring word 1 down onto word 0
115    por         %1, %2
116%endif
117%endmacro
118
119%macro AC3_MAX_MSB_ABS_INT16 1  ; %1 = min_max or or_abs: selects which of the two methods is assembled
120cglobal ac3_max_msb_abs_int16, 2,2,5, src, len
121    pxor        m2, m2  ; or_abs: OR accumulator; min_max: running minimum, starting at 0
122    pxor        m3, m3  ; min_max: running maximum, starting at 0 (the non-ssse3 or_abs path reuses m3 as an ABS2 operand)
123.loop:
124%ifidn %1, min_max
125    mova        m0, [srcq]
126    mova        m1, [srcq+mmsize]
127    pminsw      m2, m0
128    pminsw      m2, m1
129    pmaxsw      m3, m0
130    pmaxsw      m3, m1
131%else ; or_abs
132%if notcpuflag(ssse3)
133    mova        m0, [srcq]
134    mova        m1, [srcq+mmsize]
135    ABS2        m0, m1, m3, m4  ; m3/m4 are the extra (scratch) operands; ABS2 is defined outside this file
136%else ; ssse3
137    ; using memory args is faster for ssse3
138    pabsw       m0, [srcq]
139    pabsw       m1, [srcq+mmsize]
140%endif
141    por         m2, m0
142    por         m2, m1
143%endif
144    add       srcq, mmsize*2
145    sub       lend, mmsize  ; two registers per pass = mmsize 16-bit samples
146    ja .loop
147%ifidn %1, min_max
148    ABS2        m2, m3, m0, m1  ; abs of the per-lane minimum and maximum
149    por         m2, m3
150%endif
151    OR_WORDS_HORIZ m2, m0  ; OR all word lanes together into the low word of m2
152    movd       eax, m2
153    and        eax, 0xFFFF  ; return only the low word
154    RET
155%endmacro
156
157INIT_MMX mmx
158AC3_MAX_MSB_ABS_INT16 or_abs
159INIT_MMX mmxext
160AC3_MAX_MSB_ABS_INT16 min_max
161INIT_XMM sse2
162AC3_MAX_MSB_ABS_INT16 min_max
163INIT_XMM ssse3
164AC3_MAX_MSB_ABS_INT16 or_abs
165
166;-----------------------------------------------------------------------------
167; macro used for ff_ac3_lshift_int16() and ff_ac3_rshift_int32()
168;-----------------------------------------------------------------------------
169
170%macro AC3_SHIFT 3 ; l/r, 16/32, shift instruction
171cglobal ac3_%1shift_int%2, 3, 3, 5, src, len, shift
172    movd      m0, shiftd  ; shift count in the low bits of m0, used for every element
173.loop:
174    mova      m1, [srcq         ]
175    mova      m2, [srcq+mmsize  ]
176    mova      m3, [srcq+mmsize*2]
177    mova      m4, [srcq+mmsize*3]
178    %3        m1, m0
179    %3        m2, m0
180    %3        m3, m0
181    %3        m4, m0
182    mova  [srcq         ], m1  ; results are written back in place
183    mova  [srcq+mmsize  ], m2
184    mova  [srcq+mmsize*2], m3
185    mova  [srcq+mmsize*3], m4
186    add     srcq, mmsize*4
187    sub     lend, mmsize*32/%2  ; 4 registers per pass = 4*mmsize bytes = mmsize*32/%2 elements of %2 bits
188    ja .loop
189.end:  ; not referenced by any jump in this macro
190    REP_RET
191%endmacro
192
193;-----------------------------------------------------------------------------
194; void ff_ac3_lshift_int16(int16_t *src, unsigned int len, unsigned int shift)
195;-----------------------------------------------------------------------------
196
197INIT_MMX mmx
198AC3_SHIFT l, 16, psllw
199INIT_XMM sse2
200AC3_SHIFT l, 16, psllw
201
202;-----------------------------------------------------------------------------
203; void ff_ac3_rshift_int32(int32_t *src, unsigned int len, unsigned int shift)
204;-----------------------------------------------------------------------------
205
206INIT_MMX mmx
207AC3_SHIFT r, 32, psrad
208INIT_XMM sse2
209AC3_SHIFT r, 32, psrad
210
211;-----------------------------------------------------------------------------
212; void ff_float_to_fixed24(int32_t *dst, const float *src, unsigned int len)
213;-----------------------------------------------------------------------------
214
215; The 3DNow! version is not bit-identical because pf2id uses truncation rather
216; than round-to-nearest.
217INIT_MMX 3dnow
218cglobal float_to_fixed24, 3, 3, 0, dst, src, len
219    movq   m0, [pf_1_24]  ; two copies of the 2^24 scale factor
220.loop:
221    movq   m1, [srcq   ]
222    movq   m2, [srcq+8 ]
223    movq   m3, [srcq+16]
224    movq   m4, [srcq+24]
225    pfmul  m1, m0  ; scale by 2^24
226    pfmul  m2, m0
227    pfmul  m3, m0
228    pfmul  m4, m0
229    pf2id  m1, m1  ; float -> int32
230    pf2id  m2, m2
231    pf2id  m3, m3
232    pf2id  m4, m4
233    movq  [dstq   ], m1
234    movq  [dstq+8 ], m2
235    movq  [dstq+16], m3
236    movq  [dstq+24], m4
237    add  srcq, 32
238    add  dstq, 32
239    sub  lend, 8  ; 4 registers x 2 floats = 8 values per pass
240    ja .loop
241    femms  ; leave MMX/3DNow! state before returning
242    RET
243
244INIT_XMM sse
245cglobal float_to_fixed24, 3, 3, 3, dst, src, len
246    movaps     m0, [pf_1_24]  ; four copies of the 2^24 scale factor
247.loop:
248    movaps     m1, [srcq   ]
249    movaps     m2, [srcq+16]
250    mulps      m1, m0  ; scale by 2^24
251    mulps      m2, m0
252    cvtps2pi  mm0, m1  ; convert the low two floats into an MMX register (SSE1 has no xmm integer conversion)
253    movhlps    m1, m1  ; bring the high two floats down
254    cvtps2pi  mm1, m1
255    cvtps2pi  mm2, m2
256    movhlps    m2, m2
257    cvtps2pi  mm3, m2
258    movq  [dstq   ], mm0
259    movq  [dstq+ 8], mm1
260    movq  [dstq+16], mm2
261    movq  [dstq+24], mm3
262    add      srcq, 32
263    add      dstq, 32
264    sub      lend, 8  ; 2 xmm registers x 4 floats = 8 values per pass
265    ja .loop
266    emms  ; MMX registers were used for the conversion results
267    RET
268
269INIT_XMM sse2
270cglobal float_to_fixed24, 3, 3, 9, dst, src, len  ; dst[i] = (int32)(src[i] * 2^24), converted with cvtps2dq; aligned loads/stores are used
271    movaps     m0, [pf_1_24]  ; four copies of the 2^24 scale factor
272.loop:
273    movaps     m1, [srcq    ]
274    movaps     m2, [srcq+16 ]
275    movaps     m3, [srcq+32 ]
276    movaps     m4, [srcq+48 ]
277%ifdef m8  ; m8 only exists when more than 8 vector registers are available: unroll to 8 registers
278    movaps     m5, [srcq+64 ]
279    movaps     m6, [srcq+80 ]
280    movaps     m7, [srcq+96 ]
281    movaps     m8, [srcq+112]
282%endif
283    mulps      m1, m0
284    mulps      m2, m0
285    mulps      m3, m0
286    mulps      m4, m0
287%ifdef m8
288    mulps      m5, m0
289    mulps      m6, m0
290    mulps      m7, m0
291    mulps      m8, m0
292%endif
293    cvtps2dq   m1, m1
294    cvtps2dq   m2, m2
295    cvtps2dq   m3, m3
296    cvtps2dq   m4, m4
297%ifdef m8
298    cvtps2dq   m5, m5
299    cvtps2dq   m6, m6
300    cvtps2dq   m7, m7
301    cvtps2dq   m8, m8
302%endif
303    movdqa  [dstq    ], m1
304    movdqa  [dstq+16 ], m2
305    movdqa  [dstq+32 ], m3
306    movdqa  [dstq+48 ], m4
307%ifdef m8
308    movdqa  [dstq+64 ], m5
309    movdqa  [dstq+80 ], m6
310    movdqa  [dstq+96 ], m7
311    movdqa  [dstq+112], m8
312    add      srcq, 128
313    add      dstq, 128
314    sub      lend, 32  ; 8 registers x 4 floats per pass; len is a 32-bit unsigned int, so only the dword view is tested
315%else
316    add      srcq, 64
317    add      dstq, 64
318    sub      lend, 16  ; 4 registers x 4 floats per pass; dword view for the same reason
319%endif
320    ja .loop
321    REP_RET
322
323;------------------------------------------------------------------------------
324; int ff_ac3_compute_mantissa_size(uint16_t mant_cnt[6][16])
325;------------------------------------------------------------------------------
326
327%macro PHADDD4 2 ; xmm src, xmm tmp - sum of the 4 dwords of src ends up in the low dword of src; tmp is clobbered
328    movhlps  %2, %1  ; tmp.low64 = src.high64
329    paddd    %1, %2  ; dwords 0,1 now hold d0+d2 and d1+d3
330    pshufd   %2, %1, 0x1  ; move dword 1 into position 0
331    paddd    %1, %2
332%endmacro
333
334INIT_XMM sse2
335cglobal ac3_compute_mantissa_size, 1, 2, 4, mant_cnt, sum
336    movdqa      m0, [mant_cntq      ]  ; m0 accumulates words 0-7 of every 32-byte row
337    movdqa      m1, [mant_cntq+ 1*16]  ; m1 accumulates words 8-15 of every 32-byte row
338    paddw       m0, [mant_cntq+ 2*16]
339    paddw       m1, [mant_cntq+ 3*16]
340    paddw       m0, [mant_cntq+ 4*16]
341    paddw       m1, [mant_cntq+ 5*16]
342    paddw       m0, [mant_cntq+ 6*16]
343    paddw       m1, [mant_cntq+ 7*16]
344    paddw       m0, [mant_cntq+ 8*16]
345    paddw       m1, [mant_cntq+ 9*16]
346    paddw       m0, [mant_cntq+10*16]
347    paddw       m1, [mant_cntq+11*16]
348    pmaddwd     m0, [ac3_bap_bits   ]  ; summed counts times the external ac3_bap_bits table (presumably bits per mantissa - confirm)
349    pmaddwd     m1, [ac3_bap_bits+16]
350    paddd       m0, m1
351    PHADDD4     m0, m1  ; horizontal sum into the low dword
352    movd      sumd, m0  ; first partial total
353    movdqa      m3, [pw_bap_mul1]
354    movhpd      m0, [mant_cntq     +2]  ; +2 bytes skips word 0: loads words 1-4 of row 0
355    movlpd      m0, [mant_cntq+1*32+2]  ; words 1-4 of row 1 (both halves of m0 are now overwritten)
356    movhpd      m1, [mant_cntq+2*32+2]
357    movlpd      m1, [mant_cntq+3*32+2]
358    movhpd      m2, [mant_cntq+4*32+2]
359    movlpd      m2, [mant_cntq+5*32+2]
360    pmulhuw     m0, m3  ; per row: ~cnt[1]/3, ~cnt[2]/3, 0, cnt[4]/2
361    pmulhuw     m1, m3
362    pmulhuw     m2, m3
363    paddusw     m0, m1
364    paddusw     m0, m2
365    pmaddwd     m0, [pw_bap_mul2]  ; weight the divided counts by 5, 7, 0, 7
366    PHADDD4     m0, m1
367    movd       eax, m0
368    add        eax, sumd  ; return the sum of both partial totals
369    RET
370
371;------------------------------------------------------------------------------
372; void ff_ac3_extract_exponents(uint8_t *exp, int32_t *coef, int nb_coefs)
373;------------------------------------------------------------------------------
374
375%macro PABSD 1-2 ; src/dst, tmp (tmp is only touched when ssse3 is not available)
376%if cpuflag(ssse3)
377    pabsd    %1, %1
378%else ; src/dst, tmp
379    pxor     %2, %2
380    pcmpgtd  %2, %1  ; tmp = all ones in lanes where src < 0
381    pxor     %1, %2  ; invert negative lanes ...
382    psubd    %1, %2  ; ... and add 1 to them (two's complement negate)
383%endif
384%endmacro
385
386%macro AC3_EXTRACT_EXPONENTS 0  ; emits ac3_extract_exponents: one exponent byte per 32-bit coefficient, 4 coefficients per pass
387cglobal ac3_extract_exponents, 3, 3, 4, exp, coef, len  ; NOTE(review): nb_coefs is an int but lenq is used full-width below - confirm the upper bits are clean on 64-bit
388    add     expq, lenq  ; point both arrays past their end ...
389    lea    coefq, [coefq+4*lenq]
390    neg     lenq  ; ... and index with a negative counter that runs up to 0
391    mova      m2, [pd_1]
392    mova      m3, [pd_151]
393.loop:
394    ; move 4 32-bit coefs to xmm0
395    mova      m0, [coefq+4*lenq]
396    ; absolute value
397    PABSD     m0, m1
398    ; convert to float and extract exponents
399    pslld     m0, 1
400    por       m0, m2  ; v = 2*|coef| + 1, so v is never zero
401    cvtdq2ps  m1, m0
402    psrld     m1, 23  ; biased float exponent of v
403    mova      m0, m3
404    psubd     m0, m1  ; 151 - biased exponent (a zero coefficient gives 24)
405    ; move the lowest byte in each of 4 dwords to the low dword
406    ; NOTE: We cannot just extract the low bytes with pshufb because the dword
407    ;       result for 16777215 is -1 due to float inaccuracy. Using packuswb
408    ;       clips this to 0, which is the correct exponent.
409    packssdw  m0, m0
410    packuswb  m0, m0
411    movd  [expq+lenq], m0  ; store 4 exponent bytes
412
413    add     lenq, 4
414    jl .loop  ; continue while the counter is still negative
415    REP_RET
416%endmacro
417
418%if HAVE_SSE2_EXTERNAL
419INIT_XMM sse2
420AC3_EXTRACT_EXPONENTS
421%endif
422%if HAVE_SSSE3_EXTERNAL
423INIT_XMM ssse3
424AC3_EXTRACT_EXPONENTS
425%endif
426
427;-----------------------------------------------------------------------------
428; void ff_apply_window_int16(int16_t *output, const int16_t *input,
429;                            const int16_t *window, unsigned int len)
430;-----------------------------------------------------------------------------
431
432%macro REVERSE_WORDS 1-2  ; reg, shuffle mask register (the mask is only used by the pshufb path); reverses the word order of reg
433%if cpuflag(ssse3) && notcpuflag(atom)
434    pshufb  %1, %2
435%elif cpuflag(sse2)
436    pshuflw  %1, %1, 0x1B  ; 0x1B reverses the 4 low words
437    pshufhw  %1, %1, 0x1B  ; and the 4 high words
438    pshufd   %1, %1, 0x4E  ; 0x4E swaps the two 64-bit halves
439%elif cpuflag(mmxext)
440    pshufw   %1, %1, 0x1B
441%endif
442%endmacro
443
444%macro MUL16FIXED 3  ; dst, src, temp: 16-bit fixed-point multiply of dst by src, result in dst
445%if cpuflag(ssse3) ; dst, src, unused
446; dst = ((dst * src) + (1<<14)) >> 15
447    pmulhrsw   %1, %2
448%elif cpuflag(mmxext) ; dst, src, temp
449; dst = (dst * src) >> 15
450; pmulhw cuts off the bottom bit, so we have to lshift by 1 and add it back
451; in from the pmullw result.
452    mova    %3, %1
453    pmulhw  %1, %2  ; bits 31..16 of the product
454    pmullw  %3, %2  ; bits 15..0 of the product
455    psrlw   %3, 15  ; keep only bit 15
456    psllw   %1, 1
457    por     %1, %3  ; result = product bits 30..15
458%endif
459%endmacro
460
461%macro APPLY_WINDOW_INT16 1 ; %1: nonzero assembles the bit-exact apply_window_int16, zero assembles apply_window_int16_round
462%if %1
463cglobal apply_window_int16, 4,5,6, output, input, window, offset, offset2
464%else
465cglobal apply_window_int16_round, 4,5,6, output, input, window, offset, offset2
466%endif
467    lea     offset2q, [offsetq-mmsize]  ; offset (the 4th argument, len) walks upward, offset2 walks downward from len-mmsize
468%if cpuflag(ssse3) && notcpuflag(atom)
469    mova          m5, [pb_revwords]  ; pshufb mask for REVERSE_WORDS
470    ALIGN 16
471%elif %1
472    mova          m5, [pd_16384]  ; rounding term for the 32-bit path (appears unused in the ssse3+atom build, which takes the pmulhrsw path)
473%endif
474.loop:
475%if cpuflag(ssse3)
476    ; This version does the 16x16->16 multiplication in-place without expanding
477    ; to 32-bit. The ssse3 version is bit-identical.
478    mova          m0, [windowq+offset2q]
479    mova          m1, [ inputq+offset2q]
480    pmulhrsw      m1, m0
481    REVERSE_WORDS m0, m5  ; the same window coefficients, reversed, serve the upper block
482    pmulhrsw      m0, [ inputq+offsetq ]
483    mova  [outputq+offset2q], m1
484    mova  [outputq+offsetq ], m0
485%elif %1
486    ; This version expands 16-bit to 32-bit, multiplies by the window,
487    ; adds 16384 for rounding, right shifts 15, then repacks back to words to
488    ; save to the output. The window is reversed for the second half.
489    mova          m3, [windowq+offset2q]
490    mova          m4, [ inputq+offset2q]
491    pxor          m0, m0
492    punpcklwd     m0, m3  ; each dword = (0, window word)
493    punpcklwd     m1, m4  ; each dword = (old m1 word, input word); the old m1 words only meet the zeros in m0, so m1 needs no init
494    pmaddwd       m0, m1  ; 0*old + window*input as a 32-bit product
495    paddd         m0, m5
496    psrad         m0, 15
497    pxor          m2, m2
498    punpckhwd     m2, m3
499    punpckhwd     m1, m4
500    pmaddwd       m2, m1
501    paddd         m2, m5
502    psrad         m2, 15
503    packssdw      m0, m2
504    mova  [outputq+offset2q], m0
505    REVERSE_WORDS m3  ; reuse the window coefficients in reverse order for the upper block
506    mova          m4, [ inputq+offsetq]
507    pxor          m0, m0
508    punpcklwd     m0, m3
509    punpcklwd     m1, m4
510    pmaddwd       m0, m1
511    paddd         m0, m5
512    psrad         m0, 15
513    pxor          m2, m2
514    punpckhwd     m2, m3
515    punpckhwd     m1, m4
516    pmaddwd       m2, m1
517    paddd         m2, m5
518    psrad         m2, 15
519    packssdw      m0, m2
520    mova  [outputq+offsetq], m0
521%else
522    ; This version does the 16x16->16 multiplication in-place without expanding
523    ; to 32-bit. The mmxext and sse2 versions do not use rounding, and
524    ; therefore are not bit-identical to the C version.
525    mova          m0, [windowq+offset2q]
526    mova          m1, [ inputq+offset2q]
527    mova          m2, [ inputq+offsetq ]
528    MUL16FIXED    m1, m0, m3
529    REVERSE_WORDS m0
530    MUL16FIXED    m2, m0, m3
531    mova  [outputq+offset2q], m1
532    mova  [outputq+offsetq ], m2
533%endif
534    add      offsetd, mmsize
535    sub     offset2d, mmsize
536    jae .loop  ; stop once offset2 borrows below zero
537    REP_RET
538%endmacro
539
540INIT_MMX mmxext
541APPLY_WINDOW_INT16 0
542INIT_XMM sse2
543APPLY_WINDOW_INT16 0
544
545INIT_MMX mmxext
546APPLY_WINDOW_INT16 1
547INIT_XMM sse2
548APPLY_WINDOW_INT16 1
549INIT_XMM ssse3
550APPLY_WINDOW_INT16 1
551INIT_XMM ssse3, atom
552APPLY_WINDOW_INT16 1
553