;*****************************************************************************
;* x86-optimized AC-3 DSP functions
;* Copyright (c) 2011 Justin Ruggles
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; 16777216.0f - used in ff_float_to_fixed24()
pf_1_24: times 4 dd 0x4B800000

; used in ff_ac3_compute_mantissa_size()
; pw_bap_mul1 holds pmulhuw factors: 21846/65536 ~= 1/3, 32768/65536 = 1/2,
; and 0 to discard a lane. pw_bap_mul2 holds the matching per-lane weights
; applied afterwards with pmaddwd.
cextern ac3_bap_bits
pw_bap_mul1: dw 21846, 21846, 0, 32768, 21846, 21846, 0, 32768
pw_bap_mul2: dw     5,     7, 0,     7,     5,     7, 0,     7

; used in ff_ac3_extract_exponents()
; 151 = 127 (IEEE-754 single-precision exponent bias) + 24
cextern pd_1
pd_151: times 4 dd 151

SECTION .text

;-----------------------------------------------------------------------------
; void ff_ac3_exponent_min(uint8_t *exp, int num_reuse_blocks, int nb_coefs)
;
; For every exponent position, replaces exp[i] with the unsigned byte minimum
; of exp[i], exp[i + 256], ..., exp[i + 256 * num_reuse_blocks]; consecutive
; blocks are 256 bytes apart. Does nothing when num_reuse_blocks is 0.
; Positions are processed mmsize bytes at a time with aligned loads/stores
; (mova), and the outer loop always handles whole mmsize groups.
;
; NOTE(review): num_reuse_blocks and nb_coefs are declared int but are used as
; full-width registers (reuse_blksq / expnq); this relies on the caller passing
; them already extended to register width - confirm.
;-----------------------------------------------------------------------------

%macro AC3_EXPONENT_MIN 0
cglobal ac3_exponent_min, 3, 4, 2, exp, reuse_blks, expn, offset
    shl  reuse_blksq, 8                 ; byte offset of the last reuse block
    jz .end                             ; no reuse blocks: nothing to do
    LOOP_ALIGN
.nextexp:
    mov      offsetq, reuse_blksq
    mova          m0, [expq+offsetq]    ; start from the last block
    sub      offsetq, 256
    LOOP_ALIGN
.nextblk:
    ; m1 is handed to the PMINUB helper (from the include) as its third
    ; operand, presumably a scratch register for targets lacking pminub
    PMINUB        m0, [expq+offsetq], m1
    sub      offsetq, 256
    jae .nextblk                        ; loop until the offset drops below 0
    mova      [expq], m0
    add         expq, mmsize
    sub        expnq, mmsize
    jg .nextexp
.end:
    REP_RET
%endmacro

; LOOP_ALIGN expands to nothing for the plain MMX build and to ALIGN 16 once
; the MMXEXT block below has redefined it.
%define LOOP_ALIGN
INIT_MMX mmx
AC3_EXPONENT_MIN
%if HAVE_MMXEXT_EXTERNAL
%define LOOP_ALIGN ALIGN 16
INIT_MMX mmxext
AC3_EXPONENT_MIN
%endif
%if HAVE_SSE2_EXTERNAL
INIT_XMM sse2
AC3_EXPONENT_MIN
%endif
%undef LOOP_ALIGN

;-----------------------------------------------------------------------------
; void ff_float_to_fixed24(int32_t *dst, const float *src, unsigned int len)
;
; Multiplies each float in src by 16777216.0 (2^24) and stores the result
; converted to int32 in dst. All versions run the loop body at least once and
; consume a fixed number of elements per iteration (8 for 3DNow!/SSE, 16 or 32
; for SSE2), so len is expected to be a non-zero multiple of that count.
; The SSE and SSE2 versions use aligned 16-byte accesses (movaps/movdqa).
;-----------------------------------------------------------------------------

; The 3DNow! version is not bit-identical because pf2id uses truncation rather
; than round-to-nearest.
INIT_MMX 3dnow
cglobal float_to_fixed24, 3, 3, 0, dst, src, len
    movq   m0, [pf_1_24]
.loop:
    movq   m1, [srcq   ]
    movq   m2, [srcq+8 ]
    movq   m3, [srcq+16]
    movq   m4, [srcq+24]
    pfmul  m1, m0
    pfmul  m2, m0
    pfmul  m3, m0
    pfmul  m4, m0
    pf2id  m1, m1
    pf2id  m2, m2
    pf2id  m3, m3
    pf2id  m4, m4
    movq  [dstq   ], m1
    movq  [dstq+8 ], m2
    movq  [dstq+16], m3
    movq  [dstq+24], m4
    add  srcq, 32
    add  dstq, 32
    sub  lend, 8
    ja .loop
    femms                               ; leave MMX/3DNow! state before returning
    RET

; SSE has no packed float->int32 conversion into XMM registers, so each half
; of an XMM register is converted into an MMX register with cvtps2pi. The MMX
; registers are named explicitly (mm0-mm3) because m# refers to XMM registers
; under INIT_XMM.
INIT_XMM sse
cglobal float_to_fixed24, 3, 3, 3, dst, src, len
    movaps     m0, [pf_1_24]
.loop:
    movaps     m1, [srcq   ]
    movaps     m2, [srcq+16]
    mulps      m1, m0
    mulps      m2, m0
    cvtps2pi  mm0, m1                   ; low two floats of m1
    movhlps    m1, m1                   ; bring the high two floats down
    cvtps2pi  mm1, m1
    cvtps2pi  mm2, m2
    movhlps    m2, m2
    cvtps2pi  mm3, m2
    movq  [dstq   ], mm0
    movq  [dstq+ 8], mm1
    movq  [dstq+16], mm2
    movq  [dstq+24], mm3
    add      srcq, 32
    add      dstq, 32
    sub      lend, 8
    ja .loop
    emms                                ; leave MMX state before returning
    RET

; The SSE2 version handles 16 floats per iteration, or 32 when m8 is defined
; (i.e. when more than 8 XMM registers are available).
INIT_XMM sse2
cglobal float_to_fixed24, 3, 3, 9, dst, src, len
    movaps     m0, [pf_1_24]
.loop:
    movaps     m1, [srcq    ]
    movaps     m2, [srcq+16 ]
    movaps     m3, [srcq+32 ]
    movaps     m4, [srcq+48 ]
%ifdef m8
    movaps     m5, [srcq+64 ]
    movaps     m6, [srcq+80 ]
    movaps     m7, [srcq+96 ]
    movaps     m8, [srcq+112]
%endif
    mulps      m1, m0
    mulps      m2, m0
    mulps      m3, m0
    mulps      m4, m0
%ifdef m8
    mulps      m5, m0
    mulps      m6, m0
    mulps      m7, m0
    mulps      m8, m0
%endif
    cvtps2dq   m1, m1
    cvtps2dq   m2, m2
    cvtps2dq   m3, m3
    cvtps2dq   m4, m4
%ifdef m8
    cvtps2dq   m5, m5
    cvtps2dq   m6, m6
    cvtps2dq   m7, m7
    cvtps2dq   m8, m8
%endif
    movdqa  [dstq    ], m1
    movdqa  [dstq+16 ], m2
    movdqa  [dstq+32 ], m3
    movdqa  [dstq+48 ], m4
    ; len is a 32-bit argument, so the counter is updated through lend (as in
    ; the 3DNow! and SSE versions): the upper half of lenq is not defined by
    ; the 64-bit calling conventions and must not influence the loop test.
%ifdef m8
    movdqa  [dstq+64 ], m5
    movdqa  [dstq+80 ], m6
    movdqa  [dstq+96 ], m7
    movdqa  [dstq+112], m8
    add      srcq, 128
    add      dstq, 128
    sub      lend, 32
%else
    add      srcq, 64
    add      dstq, 64
    sub      lend, 16
%endif
    ja .loop
    REP_RET

;------------------------------------------------------------------------------
; int ff_ac3_compute_mantissa_size(uint16_t mant_cnt[6][16])
;
; Returns, in eax, the sum of two partial totals:
;   1. the dot product of the per-index counts (summed over all 6 rows) with
;      the external 16-entry word table ac3_bap_bits;
;   2. for indices 1, 2 and 4 of every row:
;          5 * (cnt[1] * 21846 >> 16) + 7 * (cnt[2] * 21846 >> 16)
;        + 7 * (cnt[4] * 32768 >> 16)
;      i.e. roughly 5*cnt1/3 + 7*cnt2/3 + 7*cnt4/2 (presumably the grouped
;      mantissa types - confirm against the C reference).
; Each row of mant_cnt is 32 bytes; rows are read with aligned 16-byte loads.
;------------------------------------------------------------------------------

; Horizontal add of the four dwords in %1; the total ends up in the low dword.
%macro PHADDD4 2 ; xmm src, xmm tmp
    movhlps  %2, %1
    paddd    %1, %2
    pshufd   %2, %1, 0x1
    paddd    %1, %2
%endmacro

INIT_XMM sse2
cglobal ac3_compute_mantissa_size, 1, 2, 4, mant_cnt, sum
    ; m0 accumulates entries 0-7 and m1 entries 8-15 across the six rows
    movdqa      m0, [mant_cntq      ]
    movdqa      m1, [mant_cntq+ 1*16]
    paddw       m0, [mant_cntq+ 2*16]
    paddw       m1, [mant_cntq+ 3*16]
    paddw       m0, [mant_cntq+ 4*16]
    paddw       m1, [mant_cntq+ 5*16]
    paddw       m0, [mant_cntq+ 6*16]
    paddw       m1, [mant_cntq+ 7*16]
    paddw       m0, [mant_cntq+ 8*16]
    paddw       m1, [mant_cntq+ 9*16]
    paddw       m0, [mant_cntq+10*16]
    paddw       m1, [mant_cntq+11*16]
    pmaddwd     m0, [ac3_bap_bits   ]
    pmaddwd     m1, [ac3_bap_bits+16]
    paddd       m0, m1
    PHADDD4     m0, m1
    movd      sumd, m0                  ; first partial total
    ; load entries 1-4 of each row (byte offset 2, 8 bytes), two rows per
    ; register, and scale each row separately before summing
    movdqa      m3, [pw_bap_mul1]
    movhpd      m0, [mant_cntq     +2]
    movlpd      m0, [mant_cntq+1*32+2]
    movhpd      m1, [mant_cntq+2*32+2]
    movlpd      m1, [mant_cntq+3*32+2]
    movhpd      m2, [mant_cntq+4*32+2]
    movlpd      m2, [mant_cntq+5*32+2]
    pmulhuw     m0, m3
    pmulhuw     m1, m3
    pmulhuw     m2, m3
    paddusw     m0, m1
    paddusw     m0, m2
    pmaddwd     m0, [pw_bap_mul2]
    PHADDD4     m0, m1
    movd       eax, m0
    add        eax, sumd                ; return value = both partial totals
    RET

;------------------------------------------------------------------------------
; void ff_ac3_extract_exponents(uint8_t *exp, int32_t *coef, int nb_coefs)
;
; For each coefficient c, computes 151 minus the biased float exponent of
; (float)(2 * |c| + 1), clamps the result to 0..255 and stores it as one byte
; in exp. Four coefficients are handled per iteration: coef is read with
; aligned 16-byte loads and 4 bytes are written to exp each time, and the loop
; body always runs at least once.
;
; NOTE(review): nb_coefs is declared int but is used as a full-width register
; (lenq) for pointer arithmetic; this relies on the caller passing it already
; extended to register width - confirm.
;------------------------------------------------------------------------------

; Packed 32-bit absolute value. With SSSE3 the native pabsd is used and the
; second operand is ignored; otherwise %2 is clobbered as a temporary.
%macro PABSD 1-2 ; src/dst, unused
%if cpuflag(ssse3)
    pabsd    %1, %1
%else ; src/dst, tmp
    pxor     %2, %2
    pcmpgtd  %2, %1                     ; %2 = all-ones where %1 is negative
    pxor     %1, %2
    psubd    %1, %2                     ; (x ^ mask) - mask == |x|
%endif
%endmacro

%macro AC3_EXTRACT_EXPONENTS 0
cglobal ac3_extract_exponents, 3, 3, 4, exp, coef, len
    ; point both arrays at their ends and count a negative index up to zero
    add     expq, lenq
    lea     coefq, [coefq+4*lenq]
    neg     lenq
    mova    m2, [pd_1]
    mova    m3, [pd_151]
.loop:
    ; move 4 32-bit coefs to xmm0
    mova      m0, [coefq+4*lenq]
    ; absolute value
    PABSD     m0, m1
    ; convert to float and extract exponents
    pslld     m0, 1
    por       m0, m2                    ; 2*|c| + 1 is never zero
    cvtdq2ps  m1, m0
    psrld     m1, 23                    ; biased exponent (sign bit is clear)
    mova      m0, m3
    psubd     m0, m1
    ; move the lowest byte in each of 4 dwords to the low dword
    ; NOTE: We cannot just extract the low bytes with pshufb because the dword
    ;       result for 16777215 is -1 due to float inaccuracy. Using packuswb
    ;       clips this to 0, which is the correct exponent.
    packssdw  m0, m0
    packuswb  m0, m0
    movd      [expq+lenq], m0

    add     lenq, 4
    jl .loop
    REP_RET
%endmacro

%if HAVE_SSE2_EXTERNAL
INIT_XMM sse2
AC3_EXTRACT_EXPONENTS
%endif
%if HAVE_SSSE3_EXTERNAL
INIT_XMM ssse3
AC3_EXTRACT_EXPONENTS
%endif