;******************************************************************************
;* FLAC DSP SIMD optimizations
;*
;* Copyright (C) 2014 Loren Merritt
;* Copyright (C) 2014 James Almer
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

;------------------------------------------------------------------------------
; PMACSDQL acc, a, b, x, y
; Multiply-accumulate: acc += a * b, where the product is the signed
; 32x32->64 bit multiply of the low dword of each qword lane.
; With XOP a single pmacsdql is emitted and "a" is left intact; otherwise
; pmuldq + paddq is used and "a" is overwritten with the product.
; The fourth and fifth arguments are accepted but not referenced by the body.
;------------------------------------------------------------------------------
%macro PMACSDQL 5
%if cpuflag(xop)
    pmacsdql %1, %2, %3, %1
%else
    pmuldq   %2, %3
    paddq    %1, %2
%endif
%endmacro

;------------------------------------------------------------------------------
; flac_lpc_32(decoded, coeffs, pred_order, qlevel, len)
; (argument names as declared by cglobal below; the C prototype is not part of
;  this file)
;
; For every index i from pred_order up to len-1, in increasing order:
;   decoded[i] += low32((sum over k of coeffs[k] * decoded[i-pred_order+k])
;                       >> qlevel)
; with the sum accumulated in 64 bits. Two consecutive output samples are
; produced per outer iteration, using two accumulators (m2 for sample i,
; m3 for sample i+1); each loaded sample/coefficient feeds both.
;
; The macro argument is the instruction set name handed to INIT_XMM.
;------------------------------------------------------------------------------
%macro LPC_32 1
INIT_XMM %1
cglobal flac_lpc_32, 5,6,5, decoded, coeffs, pred_order, qlevel, len, j
    sub    lend, pred_orderd            ; len = number of samples to predict
    jle .ret
    ; Only the low 32 bits of pred_order were used so far; widen it before it
    ; is used as a 64-bit index (expands to nothing when both names coincide).
    movsxdifnidn pred_orderq, pred_orderd
    lea    decodedq, [decodedq+pred_orderq*4-8] ; 8 bytes before the first output
    lea    coeffsq, [coeffsq+pred_orderq*4]     ; one past the last coefficient
    neg    pred_orderq                  ; negative index, counts up towards zero
    movd   m4, qlevelm                  ; shift count for psrlq
ALIGN 16
.loop_sample:
    movd   m0, [decodedq+pred_orderq*4+8]   ; oldest sample of the window
    add    decodedq, 8                  ; decodedq -> first of the two outputs
    movd   m1, [coeffsq+pred_orderq*4]  ; first coefficient
    pxor   m2, m2                       ; accumulator for output 0
    pxor   m3, m3                       ; accumulator for output 1
    lea    jq, [pred_orderq+1]
    test   jq, jq
    jz .end_order                       ; single-coefficient case: skip the loop
.loop_order:
    PMACSDQL m2, m0, m1, m2, m0         ; out0 += sample * coeff
    movd   m0, [decodedq+jq*4]          ; next sample
    PMACSDQL m3, m1, m0, m3, m1         ; out1 += coeff * next sample
    movd   m1, [coeffsq+jq*4]           ; next coefficient
    inc    jq
    jl .loop_order
.end_order:
    PMACSDQL m2, m0, m1, m2, m0         ; last term of output 0
    psrlq  m2, m4
    movd   m0, [decodedq]
    paddd  m0, m2                       ; residual + prediction (low 32 bits)
    movd   [decodedq], m0
    sub  lend, 2
    jl .ret                             ; odd count: second output not wanted
    PMACSDQL m3, m1, m0, m3, m1         ; last term uses the sample just written
    psrlq  m3, m4
    movd   m1, [decodedq+4]
    paddd  m1, m3
    movd   [decodedq+4], m1
    jg .loop_sample                     ; flags still come from "sub lend, 2"
.ret:
    REP_RET
%endmacro

%if HAVE_XOP_EXTERNAL
LPC_32 xop
%endif
LPC_32 sse4

;----------------------------------------------------------------------------------
;void ff_flac_decorrelate_[lrm]s_16_sse2(uint8_t **out, int32_t **in, int channels,
;                                                   int len, int shift);
;----------------------------------------------------------------------------------
; Macro arguments:
;   1 = name suffix (ls, rs, ms or indep2)
;   2 = index of the register written to the first (even) output words
;   3 = index of the register written to the second (odd) output words
;   4 = add/sub, the dword operation computing m2 from m0 and m1
;       (omitted for indep2, where m2 is not computed)
; Per loop iteration four 32-bit samples are read from each of in[0] and
; in[1], saturated to 16 bits, interleaved, shifted left by "shift" and
; stored to out[0].
; NOTE(review): aligned loads/stores and no tail handling - buffers are
; presumably 16-byte aligned and padded to a multiple of 4 samples; confirm
; against the caller.
%macro FLAC_DECORRELATE_16 3-4
cglobal flac_decorrelate_%1_16, 2, 4, 4, out, in0, in1, len
%if ARCH_X86_32
    mov      lend, lenm
%endif
    movd       m3, r4m                  ; shift count
    shl      lend, 2                    ; samples -> bytes of 32-bit input
    mov      in1q, [in0q + gprsize]     ; in[1]
    mov      in0q, [in0q]               ; in[0]
    mov      outq, [outq]               ; out[0]
    ; Point everything at the end and index with a negative offset that
    ; counts up to zero.
    add      in1q, lenq
    add      in0q, lenq
    add      outq, lenq
    neg      lenq

align 16
.loop:
    mova       m0, [in0q + lenq]
    mova       m1, [in1q + lenq]
%ifidn %1, ms
    psrad      m2, m1, 1
    psubd      m0, m2                   ; m0 = in0 - (in1 >> 1)
%endif
%ifnidn %1, indep2
    p%4d       m2, m0, m1               ; m2 = m0 +/- m1
%endif
    packssdw  m%2, m%2                  ; saturate to signed 16 bit
    packssdw  m%3, m%3
    punpcklwd m%2, m%3                  ; interleave the two channels
    psllw     m%2, m3
    mova [outq + lenq], m%2
    add      lenq, 16
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse2
FLAC_DECORRELATE_16 ls, 0, 2, sub
FLAC_DECORRELATE_16 rs, 2, 1, add
FLAC_DECORRELATE_16 ms, 2, 0, add

;----------------------------------------------------------------------------------
;void ff_flac_decorrelate_[lrm]s_32_sse2(uint8_t **out, int32_t **in, int channels,
;                                        int len, int shift);
;----------------------------------------------------------------------------------
; Macro arguments:
;   1 = name suffix (ls, rs or ms)
;   2 = index of the register holding the first output channel
;   3 = index of the register holding the second output channel
;   4 = index of the scratch register handed to SBUTTERFLY
;   5 = add/sub, the dword operation computing m2 from m0 and m1
; Per loop iteration four 32-bit samples per channel are read and two
; registers are stored to out[0].
%macro FLAC_DECORRELATE_32 5
cglobal flac_decorrelate_%1_32, 2, 4, 4, out, in0, in1, len
%if ARCH_X86_32
    mov      lend, lenm
%endif
    movd       m3, r4m                  ; shift count
    mov      in1q, [in0q + gprsize]     ; in[1]
    mov      in0q, [in0q]               ; in[0]
    mov      outq, [outq]               ; out[0]
    sub      in1q, in0q                 ; in1 becomes an offset relative to in0

align 16
.loop:
    mova       m0, [in0q]
    mova       m1, [in0q + in1q]
%ifidn %1, ms
    psrad      m2, m1, 1
    psubd      m0, m2                   ; m0 = in0 - (in1 >> 1)
%endif
    p%5d       m2, m0, m1               ; m2 = m0 +/- m1
    pslld     m%2, m3
    pslld     m%3, m3

    ; SBUTTERFLY comes from x86util.asm; presumably interleaves the dwords of
    ; the two channel registers - not re-derived here.
    SBUTTERFLY dq, %2, %3, %4

    mova  [outq         ], m%2
    mova  [outq + mmsize], m%3

    add      in0q, mmsize
    add      outq, mmsize*2
    sub      lend, mmsize/4             ; mmsize/4 samples consumed per channel
    jg .loop
    REP_RET
%endmacro

INIT_XMM sse2
FLAC_DECORRELATE_32 ls, 0, 2, 1, sub
FLAC_DECORRELATE_32 rs, 2, 1, 0, add
FLAC_DECORRELATE_32 ms, 2, 0, 1, add

;-----------------------------------------------------------------------------------------
;void ff_flac_decorrelate_indep<ch>_<bps>_<opt>(uint8_t **out, int32_t **in, int channels,
;                                            int len, int shift);
;-----------------------------------------------------------------------------------------
;%1 = bps
;%2 = channels
;%3 = last xmm reg used
;%4 = word/dword (shift instruction)
;
; The register named by the third argument holds the shift count. REPCOUNT is
; the number of registers stored per loop iteration. Every channel pointer
; other than in0 is turned into an offset relative to in0, so only in0 has to
; be advanced inside the loop.
%macro FLAC_DECORRELATE_INDEP 4
%define REPCOUNT %2/(32/%1) ; 16bits = channels / 2; 32bits = channels
cglobal flac_decorrelate_indep%2_%1, 2, %2+2, %3+1, out, in0, in1, len, in2, in3, in4, in5, in6, in7
%if ARCH_X86_32
%if %2 == 6
    ; Rename the registers so that the six channel pointers and out are the
    ; only named ones; len is then used directly from its stack slot.
    DEFINE_ARGS out, in0, in1, in2, in3, in4, in5
    %define  lend  dword r3m
%else
    mov      lend, lenm
%endif
%endif
    movd     m%3, r4m

    ; Load the remaining channel pointers from the in[] array.
%assign %%i 1
%rep %2-1
    mov      in %+ %%i %+ q, [in0q+%%i*gprsize]
%assign %%i %%i+1
%endrep

    mov      in0q, [in0q]
    mov      outq, [outq]

    ; Convert them to offsets from in0.
%assign %%i 1
%rep %2-1
    sub      in %+ %%i %+ q, in0q
%assign %%i %%i+1
%endrep

align 16
.loop:
    mova       m0, [in0q]

    ; Load one register from each of the following REPCOUNT-1 channels.
%assign %%i 1
%rep REPCOUNT-1
    mova     m %+ %%i, [in0q + in %+ %%i %+ q]
%assign %%i %%i+1
%endrep

%if %1 == 32

    ; 32-bit samples: one register per channel was loaded above; reorder them
    ; across registers before the sequential stores below.
%if %2 == 8
    TRANSPOSE8x4D 0, 1, 2, 3, 4, 5, 6, 7, 8
%elif %2 == 6
    SBUTTERFLY dq, 0, 1, 6
    SBUTTERFLY dq, 2, 3, 6
    SBUTTERFLY dq, 4, 5, 6

    punpcklqdq m6, m0, m2
    punpckhqdq m2, m4
    shufps     m4, m0, 0xe4
    punpcklqdq m0, m1, m3
    punpckhqdq m3, m5
    shufps     m5, m1, 0xe4
    SWAP 0,6,1,4,5,3
%elif %2 == 4
    TRANSPOSE4x4D 0, 1, 2, 3, 4
%else ; %2 == 2
    SBUTTERFLY dq, 0, 1, 2
%endif

%else ; %1 == 16

    ; 16-bit output: the first half of the channels is already in registers;
    ; packssdw saturates those to words and appends the saturated samples of
    ; the second half of the channels, read straight from memory.
%if %2 == 8
    packssdw   m0, [in0q + in4q]
    packssdw   m1, [in0q + in5q]
    packssdw   m2, [in0q + in6q]
    packssdw   m3, [in0q + in7q]
    TRANSPOSE2x4x4W 0, 1, 2, 3, 4
%elif %2 == 6
    packssdw   m0, [in0q + in3q]
    packssdw   m1, [in0q + in4q]
    packssdw   m2, [in0q + in5q]
    pshufd     m3, m0, q1032
    punpcklwd  m0, m1
    punpckhwd  m1, m2
    punpcklwd  m2, m3

    shufps     m3, m0, m2, q2020
    shufps     m0, m1, q2031
    shufps     m2, m1, q3131
    shufps     m1, m2, m3, q3120
    shufps     m3, m0,     q0220
    shufps     m0, m2,     q3113
    SWAP 2, 0, 3
%else ; %2 == 4
    packssdw   m0, [in0q + in2q]
    packssdw   m1, [in0q + in3q]
    SBUTTERFLY wd, 0, 1, 2
    SBUTTERFLY dq, 0, 1, 2
%endif

%endif

    ; Apply the left shift to every output register.
%assign %%i 0
%rep REPCOUNT
    psll%4   m %+ %%i, m%3
%assign %%i %%i+1
%endrep

    ; Store the output registers back to back.
%assign %%i 0
%rep REPCOUNT
    mova [outq + %%i*mmsize], m %+ %%i
%assign %%i %%i+1
%endrep

    add      in0q, mmsize
    add      outq, mmsize*REPCOUNT
    sub      lend, mmsize/4             ; mmsize/4 samples consumed per channel
    jg .loop
    REP_RET
%endmacro

INIT_XMM sse2
FLAC_DECORRELATE_16 indep2, 0, 1 ; Reuse stereo 16bits macro
FLAC_DECORRELATE_INDEP 32, 2, 3, d
FLAC_DECORRELATE_INDEP 16, 4, 3, w
FLAC_DECORRELATE_INDEP 32, 4, 5, d
FLAC_DECORRELATE_INDEP 16, 6, 4, w
FLAC_DECORRELATE_INDEP 32, 6, 7, d
%if ARCH_X86_64
FLAC_DECORRELATE_INDEP 16, 8, 5, w
FLAC_DECORRELATE_INDEP 32, 8, 9, d
%endif

INIT_XMM avx
FLAC_DECORRELATE_INDEP 32, 4, 5, d
FLAC_DECORRELATE_INDEP 32, 6, 7, d
%if ARCH_X86_64
FLAC_DECORRELATE_INDEP 16, 8, 5, w
FLAC_DECORRELATE_INDEP 32, 8, 9, d
%endif