;******************************************************************************
;* ALAC DSP SIMD optimizations
;*
;* Copyright (C) 2015 James Almer
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

INIT_XMM sse4
%if ARCH_X86_64
cglobal alac_decorrelate_stereo, 2, 5, 8, buf0, len, shift, weight, buf1
%else
cglobal alac_decorrelate_stereo, 2, 3, 8, buf0, len, shift, weight
%define buf1q r2q
%endif
    movd   m6, shiftm
    movd   m7, weightm
    SPLATD m7
    shl    lend, 2
    mov    buf1q, [buf0q + gprsize]
    mov    buf0q, [buf0q]
    add    buf1q, lenq
    add    buf0q, lenq
    neg    lenq

align 16
.loop:
    mova   m0, [buf0q + lenq]
    mova   m1, [buf0q + lenq + mmsize]
    mova   m2, [buf1q + lenq]
    mova   m3, [buf1q + lenq + mmsize]
    pmulld m4, m2, m7
    pmulld m5, m3, m7
    psrad  m4, m6
    psrad  m5, m6
    psubd  m0, m4
    psubd  m1, m5
    paddd  m2, m0
    paddd  m3, m1
    mova   [buf1q + lenq], m0
    mova   [buf1q + lenq + mmsize], m1
    mova   [buf0q + lenq], m2
    mova   [buf0q + lenq + mmsize], m3

    add    lenq, mmsize*2
    jl .loop
    RET

INIT_XMM sse2
cglobal alac_append_extra_bits_stereo, 2, 5, 5, buf0, exbuf0, buf1, exbuf1, len
    movifnidn lend, lenm
    movd   m4, r2m ; exbits
    shl    lend, 2
    mov    buf1q,   [buf0q + gprsize]
    mov    buf0q,   [buf0q]
    mov    exbuf1q, [exbuf0q + gprsize]
    mov    exbuf0q, [exbuf0q]
    add    buf1q,   lenq
    add    buf0q,   lenq
    add    exbuf1q, lenq
    add    exbuf0q, lenq
    neg    lenq

align 16
.loop:
    mova   m0, [buf0q + lenq]
    mova   m1, [buf0q + lenq + mmsize]
    pslld  m0, m4
    pslld  m1, m4
    mova   m2, [buf1q + lenq]
    mova   m3, [buf1q + lenq + mmsize]
    pslld  m2, m4
    pslld  m3, m4
    por    m0, [exbuf0q + lenq]
    por    m1, [exbuf0q + lenq + mmsize]
    por    m2, [exbuf1q + lenq]
    por    m3, [exbuf1q + lenq + mmsize]
    mova   [buf0q + lenq         ], m0
    mova   [buf0q + lenq + mmsize], m1
    mova   [buf1q + lenq         ], m2
    mova   [buf1q + lenq + mmsize], m3

    add    lenq, mmsize*2
    jl .loop
    REP_RET

%if ARCH_X86_64
cglobal alac_append_extra_bits_mono, 2, 5, 3, buf, exbuf, exbits, ch, len
%else
cglobal alac_append_extra_bits_mono, 2, 3, 3, buf, exbuf, len
%define exbitsm r2m
%endif
    movifnidn lend, r4m
    movd   m2, exbitsm
    shl    lend, 2
    mov    bufq,   [bufq]
    mov    exbufq, [exbufq]
    add    bufq,   lenq
    add    exbufq, lenq
    neg    lenq

align 16
.loop:
    mova   m0, [bufq + lenq]
    mova   m1, [bufq + lenq + mmsize]
    pslld  m0, m2
    pslld  m1, m2
    por    m0, [exbufq + lenq]
    por    m1, [exbufq + lenq + mmsize]
    mova   [bufq + lenq], m0
    mova   [bufq + lenq + mmsize], m1

    add    lenq, mmsize*2
    jl .loop
    REP_RET
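
;------------------------------------------------------------------------------
; Illustrative scalar sketch of what the SIMD loops above compute, kept as
; comments so this file still assembles; it roughly mirrors the C fallback in
; libavcodec/alacdsp.c. Here buf[] and exbuf[] stand for the int32_t channel
; pointer arrays passed in the first argument; names are for illustration only.
;
;   alac_decorrelate_stereo, per sample i:
;       a  = buf[0][i];
;       b  = buf[1][i];
;       a -= (b * weight) >> shift;
;       b += a;
;       buf[0][i] = b;
;       buf[1][i] = a;
;
;   alac_append_extra_bits_{stereo,mono}, per sample i and channel ch:
;       buf[ch][i] = (buf[ch][i] << exbits) | exbuf[ch][i];
;------------------------------------------------------------------------------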