;******************************************************************************
;* SIMD optimized non-power-of-two MDCT functions
;*
;* Copyright (C) 2017 Rostislav Pehlivanov <atomnuker@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32

perm_neg:      dd 2, 5, 3, 4, 6, 1, 7, 0
perm_pos:      dd 0, 7, 1, 6, 4, 3, 5, 2
sign_adjust_r: times 4 dd 0x80000000, 0x00000000

sign_adjust_5: dd 0x00000000, 0x80000000, 0x80000000, 0x00000000

SECTION .text

%if ARCH_X86_64

;*****************************************************************************************
;void ff_fft15_avx(FFTComplex *out, FFTComplex *in, FFTComplex *exptab, ptrdiff_t stride);
;*****************************************************************************************
%macro FFT5 3 ; %1 - in_offset, %2 - dst1 (64bit used), %3 - dst2
    VBROADCASTSD m0, [inq + %1]              ; in[ 0].re, in[ 0].im, in[ 0].re, in[ 0].im
    movsd       xm1, [inq + 1*16 + 8 + %1]   ; in[ 3].re, in[ 3].im,         0,         0
    movsd       xm4, [inq + 6*16 + 0 + %1]   ; in[12].re, in[12].im,         0,         0
    movhps      xm1, [inq + 3*16 + 0 + %1]   ; in[ 3].re, in[ 3].im, in[ 6].re, in[ 6].im
    movhps      xm4, [inq + 4*16 + 8 + %1]   ; in[12].re, in[12].im, in[ 9].re, in[ 9].im

    subps       xm2, xm1, xm4                ; t[2].im, t[2].re, t[3].im, t[3].re
    addps       xm1, xm4                     ; t[0].re, t[0].im, t[1].re, t[1].im

    movhlps      %2, xm1                     ; t[0].re, t[1].re, t[0].im, t[1].im
    addps        %2, xm1
    addps        %2, xm0                     ; DC[0].re, DC[0].im, junk...
    movlhps      %2, %2                      ; DC[0].re, DC[0].im, DC[0].re, DC[0].im

    shufps      xm3, xm1, xm2, q0110         ; t[0].re, t[0].im, t[2].re, t[2].im
    shufps      xm1, xm2, q2332              ; t[1].re, t[1].im, t[3].re, t[3].im

    mulps      xm%3, xm1, xm5
    mulps       xm4, xm3, xm6
    mulps       xm1, xm6

    xorps       xm1, xm7
    mulps       xm3, xm5
    addsubps    xm3, xm1                     ; t[0].re, t[0].im, t[2].re, t[2].im
    subps      xm%3, xm4                     ; t[4].re, t[4].im, t[5].re, t[5].im

    movhlps     xm2, xm%3, xm3               ; t[2].re, t[2].im, t[5].re, t[5].im
    movlhps     xm3, xm%3                    ; t[0].re, t[0].im, t[4].re, t[4].im

    xorps       xm2, xm7
    addps      xm%3, xm2, xm3
    subps       xm3, xm2

    shufps      xm3, xm3, q1032
    vinsertf128 m%3, m%3, xm3, 1             ; All ACs (tmp[1] through to tmp[4])
    addps       m%3, m%3, m0                 ; Finally offset with DCs
%endmacro

%macro BUTTERFLIES_DC 1 ; %1 - exptab_offset
    mulps       xm0, xm9,  [exptabq + %1 + 16*0]
    mulps       xm1, xm10, [exptabq + %1 + 16*1]

    haddps      xm0, xm1
    movhlps     xm1, xm0                     ; t[0].re, t[1].re, t[0].im, t[1].im

    addps       xm0, xm1
    addps       xm0, xm8

    movsd    [outq], xm0
%endmacro

%macro BUTTERFLIES_AC 1 ; %1 - exptab_offset
    mulps        m0, m12, [exptabq + 64*0 + 0*mmsize + %1]
    mulps        m1, m12, [exptabq + 64*0 + 1*mmsize + %1]
    mulps        m2, m13, [exptabq + 64*1 + 0*mmsize + %1]
    mulps        m3, m13, [exptabq + 64*1 + 1*mmsize + %1]

    addps        m0, m0, m2
    addps        m1, m1, m3
    addps        m0, m0, m11

    shufps       m1, m1, m1, q2301
    addps        m0, m0, m1

    vextractf128 xm1, m0, 1

    movlps  [outq + strideq*1], xm0
    movhps  [outq + strideq*2], xm0
    movlps  [outq + stride3q],  xm1
    movhps  [outq + strideq*4], xm1
%endmacro

INIT_YMM avx
cglobal fft15, 4, 5, 14, out, in, exptab, stride, stride5
    shl strideq, 3

    movaps xm5, [exptabq + 480 + 16*0]
    movaps xm6, [exptabq + 480 + 16*1]
    movaps xm7, [sign_adjust_5]

    FFT5  0, xm8,  11
    FFT5  8, xm9,  12
    FFT5 16, xm10, 13

%define stride3q inq
    lea stride3q, [strideq + strideq*2]
    lea stride5q, [strideq + strideq*4]

    BUTTERFLIES_DC (8*6 + 4*0)*2*4
    BUTTERFLIES_AC (8*0 + 0*0)*2*4

    add outq, stride5q
    BUTTERFLIES_DC (8*6 + 4*1)*2*4
    BUTTERFLIES_AC (8*2 + 0*0)*2*4

    add outq, stride5q
    BUTTERFLIES_DC (8*6 + 4*2)*2*4
    BUTTERFLIES_AC (8*4 + 0*0)*2*4

    RET

%endif ; ARCH_X86_64

;*******************************************************************************************************
;void ff_mdct15_postreindex(FFTComplex *out, FFTComplex *in, FFTComplex *exp, int *lut, ptrdiff_t len8);
;*******************************************************************************************************
%macro LUT_LOAD_4D 3
    mov       r4d, [lutq + %3q*4 + 0]
    movsd  xmm%1,  [inq + r4q*8]
    mov       r4d, [lutq + %3q*4 + 4]
    movhps xmm%1,  [inq + r4q*8]
%if cpuflag(avx2)
    mov       r4d, [lutq + %3q*4 + 8]
    movsd      %2, [inq + r4q*8]
    mov       r4d, [lutq + %3q*4 + 12]
    movhps     %2, [inq + r4q*8]
    vinsertf128 %1, %1, %2, 1
%endif
%endmacro

%macro POSTROTATE_FN 1
cglobal mdct15_postreindex, 5, 7, 8 + cpuflag(avx2)*2, out, in, exp, lut, len8, offset_p, offset_n

    xor offset_nq, offset_nq
    lea offset_pq, [len8q*2 - %1]

    movaps m7, [sign_adjust_r]

%if cpuflag(avx2)
    movaps m8, [perm_pos]
    movaps m9, [perm_neg]
%endif

.loop:
    movups m0, [expq + offset_pq*8]   ; exp[p0].re, exp[p0].im, exp[p1].re, exp[p1].im, exp[p2].re, exp[p2].im, exp[p3].re, exp[p3].im
    movups m1, [expq + offset_nq*8]   ; exp[n3].re, exp[n3].im, exp[n2].re, exp[n2].im, exp[n1].re, exp[n1].im, exp[n0].re, exp[n0].im

    LUT_LOAD_4D m3, xm4, offset_p     ; in[p0].re, in[p0].im, in[p1].re, in[p1].im, in[p2].re, in[p2].im, in[p3].re, in[p3].im
    LUT_LOAD_4D m4, xm5, offset_n     ; in[n3].re, in[n3].im, in[n2].re, in[n2].im, in[n1].re, in[n1].im, in[n0].re, in[n0].im

    mulps m5, m3, m0                  ; in[p].reim * exp[p].reim
    mulps m6, m4, m1                  ; in[n].reim * exp[n].reim

    xorps m5, m7                      ; in[p].re *= -1, in[p].im *= 1
    xorps m6, m7                      ; in[n].re *= -1, in[n].im *= 1

    shufps m3, m3, m3, q2301          ; in[p].imre
    shufps m4, m4, m4, q2301          ; in[n].imre

    mulps m3, m0                      ; in[p].imre * exp[p].reim
    mulps m4, m1                      ; in[n].imre * exp[n].reim

    haddps m3, m6                     ; out[n0].im, out[n1].im, out[n3].re, out[n2].re, out[n2].im, out[n3].im, out[n1].re, out[n0].re
    haddps m5, m4                     ; out[p0].re, out[p1].re, out[p3].im, out[p2].im, out[p2].re, out[p3].re, out[p1].im, out[p0].im

%if cpuflag(avx2)
    vpermps m3, m9, m3                ; out[n3].im, out[n3].re, out[n2].im, out[n2].re, out[n1].im, out[n1].re, out[n0].im, out[n0].re
    vpermps m5, m8, m5                ; out[p0].re, out[p0].im, out[p1].re, out[p1].im, out[p2].re, out[p2].im, out[p3].re, out[p3].im
%else
    shufps m3, m3, m3, q0312
    shufps m5, m5, m5, q2130
%endif

    movups [outq + offset_nq*8], m3
    movups [outq + offset_pq*8], m5

    sub offset_pq, %1
    add offset_nq, %1
    cmp offset_nq, offset_pq
    jle .loop

    REP_RET
%endmacro

INIT_XMM sse3
POSTROTATE_FN 2

%if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
INIT_YMM avx2
POSTROTATE_FN 4
%endif