;******************************************************************************
;* TAK DSP SIMD optimizations
;*
;* Copyright (C) 2015 Paul B Mahol
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; 128 is half of (1 << 8); tak_decorrelate_sf adds it right before its ">> 8".
pd_128: times 4 dd 128

SECTION .text

;------------------------------------------------------------------------------
; TAK_INIT_INDEX
;
; Shared entry sequence of every kernel in this file.  Expects the enclosing
; cglobal to have named its first three arguments p1, p2 and length.
;
;   length *= 4          (the kernels operate on packed dwords)
;   p1     += length     (both pointers now sit one-past-the-end)
;   p2     += length
;   length  = -length    ([pNq+lengthq] then starts at the first element and
;                         lengthq counts upwards towards zero)
;
; The shift is done on the 32-bit view of the register; the 64-bit view is
; what gets added and negated afterwards.
;------------------------------------------------------------------------------
%macro TAK_INIT_INDEX 0
    shl             lengthd, 2
    add                 p1q, lengthq
    add                 p2q, lengthq
    neg             lengthq
%endmacro

; NOTE(review) - applies to all four loops below: the body executes once
; before lengthq is first tested, and each pass consumes a fixed number of
; bytes (mmsize*2, or mmsize for tak_decorrelate_sf).  Callers presumably pass
; length > 0 with buffers padded to that step; all memory accesses go through
; mova, so the buffers are presumably suitably aligned as well - confirm
; against the C call sites.

INIT_XMM sse2

;------------------------------------------------------------------------------
; tak_decorrelate_ls(p1, p2, length)
;
; For every dword index i:   p2[i] = p1[i] + p2[i]
; p1 is only read; p2 is read and overwritten.  Two vectors per iteration.
;------------------------------------------------------------------------------
cglobal tak_decorrelate_ls, 3, 3, 2, p1, p2, length
    TAK_INIT_INDEX
.loop:
    ; first vector:  m0 = p1 + p2
    mova                     m0, [p1q+lengthq+mmsize*0]
    paddd                    m0, [p2q+lengthq+mmsize*0]
    ; second vector: m1 = p1 + p2
    mova                     m1, [p1q+lengthq+mmsize*1]
    paddd                    m1, [p2q+lengthq+mmsize*1]
    ; write-back happens only after every load of this iteration
    mova  [p2q+lengthq+mmsize*0], m0
    mova  [p2q+lengthq+mmsize*1], m1
    add                 lengthq, mmsize*2
    jl .loop                            ; keep going while the offset is negative
    REP_RET

;------------------------------------------------------------------------------
; tak_decorrelate_sr(p1, p2, length)
;
; For every dword index i:   p1[i] = p2[i] - p1[i]
; p2 is only read; p1 is read and overwritten.  Two vectors per iteration.
;------------------------------------------------------------------------------
cglobal tak_decorrelate_sr, 3, 3, 2, p1, p2, length
    TAK_INIT_INDEX
.loop:
    ; first vector:  m0 = p2 - p1
    mova                     m0, [p2q+lengthq+mmsize*0]
    psubd                    m0, [p1q+lengthq+mmsize*0]
    ; second vector: m1 = p2 - p1
    mova                     m1, [p2q+lengthq+mmsize*1]
    psubd                    m1, [p1q+lengthq+mmsize*1]
    ; write-back happens only after every load of this iteration
    mova  [p1q+lengthq+mmsize*0], m0
    mova  [p1q+lengthq+mmsize*1], m1
    add                 lengthq, mmsize*2
    jl .loop                            ; keep going while the offset is negative
    REP_RET

;------------------------------------------------------------------------------
; tak_decorrelate_sm(p1, p2, length)
;
; For every dword index i (b is the value of p2[i] on entry):
;     p1[i] = p1[i] - (b >> 1)          ; arithmetic shift
;     p2[i] = b + p1[i]                 ; uses the freshly updated p1[i]
; Both buffers are read and overwritten.  Two vectors per iteration:
;     m0/m1/m2 = p1 / p2 / scratch for the first vector
;     m3/m4/m5 = p1 / p2 / scratch for the second vector
;------------------------------------------------------------------------------
cglobal tak_decorrelate_sm, 3, 3, 6, p1, p2, length
    TAK_INIT_INDEX
.loop:
    ; fetch all inputs of this iteration up front
    mova                     m0, [p1q+lengthq]
    mova                     m3, [p1q+lengthq+mmsize]
    mova                     m1, [p2q+lengthq]
    mova                     m4, [p2q+lengthq+mmsize]

    ; first vector
    mova                     m2, m1     ; keep b intact, shift a copy
    psrad                    m2, 1      ; b >> 1
    psubd                    m0, m2     ; a' = a - (b >> 1)
    paddd                    m1, m0     ; b' = b + a'

    ; second vector, same recipe
    mova                     m5, m4
    psrad                    m5, 1
    psubd                    m3, m5
    paddd                    m4, m3

    mova          [p1q+lengthq], m0
    mova          [p2q+lengthq], m1
    mova   [p1q+lengthq+mmsize], m3
    mova   [p2q+lengthq+mmsize], m4
    add                 lengthq, mmsize*2
    jl .loop                            ; keep going while the offset is negative
    REP_RET

INIT_XMM sse4

;------------------------------------------------------------------------------
; tak_decorrelate_sf(p1, p2, length, dshift, dfactor)
;
; For every dword index i:
;     t     = ((((p2[i] >> dshift) * dfactor) + 128) >> 8) << dshift
;     p1[i] = t - p1[i]
; The right shifts are arithmetic and the multiply keeps the low 32 bits of
; each product (pmulld, hence the SSE4 requirement).  p2 is only read; p1 is
; read and overwritten.  One vector per iteration.
;
; Only the first three arguments are loaded by cglobal; dshift and dfactor are
; picked up from wherever the ABI placed them via dshiftm / dfactorm.
;
; Loop-invariant registers:
;     m2 = dshift in the low dword (used as the shift count)
;     m3 = dfactor replicated to all four dwords
;     m4 = {128, 128, 128, 128}
;------------------------------------------------------------------------------
cglobal tak_decorrelate_sf, 3, 3, 5, p1, p2, length, dshift, dfactor
    TAK_INIT_INDEX

    movd                     m2, dshiftm
    movd                     m3, dfactorm
    pshufd                   m3, m3, 0  ; splat dfactor across the vector
    mova                     m4, [pd_128]

.loop:
    mova                     m1, [p2q+lengthq]
    psrad                    m1, m2     ; b >> dshift
    pmulld                   m1, m3     ; * dfactor
    paddd                    m1, m4     ; + 128
    psrad                    m1, 8      ; >> 8
    pslld                    m1, m2     ; << dshift
    mova                     m0, [p1q+lengthq]
    psubd                    m1, m0     ; t - a
    mova          [p1q+lengthq], m1
    add                 lengthq, mmsize
    jl .loop                            ; keep going while the offset is negative
    REP_RET