;******************************************************************************
;* Copyright (c) 2012 Michael Niedermayer
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32
dw1: times 8  dd 1 ; dword 1s: rounding-term source for the 2-channel int16 case
w1 : times 16 dw 1 ; word 1s: pmaddwd multiplier and rounding-term source for the 1-channel int16 case

SECTION .text

; Mix two float channels into one:
;   out[i] = in1[i] * coeffp[index1] + in2[i] * coeffp[index2]
; %1 selects the aligned (a) or unaligned (u) variant; the aligned entry
; point falls back to the unaligned loop if any pointer is misaligned.
%macro MIX2_FLT 1
cglobal mix_2_1_%1_float, 7, 7, 6, out, in1, in2, coeffp, index1, index2, len
%ifidn %1, a
    test in1q, mmsize-1
        jne mix_2_1_float_u_int %+ SUFFIX
    test in2q, mmsize-1
        jne mix_2_1_float_u_int %+ SUFFIX
    test outq, mmsize-1
        jne mix_2_1_float_u_int %+ SUFFIX
%else
mix_2_1_float_u_int %+ SUFFIX:
%endif
    VBROADCASTSS m4, [coeffpq + 4*index1q]
    VBROADCASTSS m5, [coeffpq + 4*index2q]
    shl        lend, 2              ; length in samples -> bytes
    add        in1q, lenq
    add        in2q, lenq
    add        outq, lenq
    neg        lenq                 ; count up toward zero through the buffers
.next:
%ifidn %1, a
    mulps        m0, m4, [in1q + lenq         ]
    mulps        m1, m5, [in2q + lenq         ]
    mulps        m2, m4, [in1q + lenq + mmsize]
    mulps        m3, m5, [in2q + lenq + mmsize]
%else
    movu         m0, [in1q + lenq         ]
    movu         m1, [in2q + lenq         ]
    movu         m2, [in1q + lenq + mmsize]
    movu         m3, [in2q + lenq + mmsize]
    mulps        m0, m0, m4
    mulps        m1, m1, m5
    mulps        m2, m2, m4
    mulps        m3, m3, m5
%endif
    addps        m0, m0, m1
    addps        m2, m2, m3
    mov%1  [outq + lenq         ], m0
    mov%1  [outq + lenq + mmsize], m2
    add        lenq, mmsize*2
        jl .next
    REP_RET
%endmacro

; Scale one float channel:
;   out[i] = in[i] * coeffp[index]
%macro MIX1_FLT 1
cglobal mix_1_1_%1_float, 5, 5, 3, out, in, coeffp, index, len
%ifidn %1, a
    test  inq, mmsize-1
        jne mix_1_1_float_u_int %+ SUFFIX
    test outq, mmsize-1
        jne mix_1_1_float_u_int %+ SUFFIX
%else
mix_1_1_float_u_int %+ SUFFIX:
%endif
    VBROADCASTSS m2, [coeffpq + 4*indexq]
    shl        lenq, 2              ; length in samples -> bytes
    add         inq, lenq
    add        outq, lenq
    neg        lenq
.next:
%ifidn %1, a
    mulps        m0, m2, [inq + lenq         ]
    mulps        m1, m2, [inq + lenq + mmsize]
%else
    movu         m0, [inq + lenq         ]
    movu         m1, [inq + lenq + mmsize]
    mulps        m0, m0, m2
    mulps        m1, m1, m2
%endif
    mov%1  [outq + lenq         ], m0
    mov%1  [outq + lenq + mmsize], m1
    add        lenq, mmsize*2
        jl .next
    REP_RET
%endmacro

; Fixed-point version of MIX1_FLT. Each 32-bit coefficient packs the quantized
; coefficient in its low 16 bits and the right-shift amount in its high 16
; bits:
;   out[i] = (in[i]*coeff + (1 << (shift-1))) >> shift
; Interleaving the coefficient with the rounding constant lets pmaddwd compute
; the product and add the rounding term in a single instruction.
%macro MIX1_INT16 1
cglobal mix_1_1_%1_int16, 5, 5, 6, out, in, coeffp, index, len
%ifidn %1, a
    test  inq, mmsize-1
        jne mix_1_1_int16_u_int %+ SUFFIX
    test outq, mmsize-1
        jne mix_1_1_int16_u_int %+ SUFFIX
%else
mix_1_1_int16_u_int %+ SUFFIX:
%endif
    movd        m4, [coeffpq + 4*indexq]
    SPLATW      m5, m4              ; m5 = coefficient in every word
    psllq       m4, 32
    psrlq       m4, 48              ; m4 = shift (high word of the coefficient)
    mova        m0, [w1]
    psllw       m0, m4
    psrlw       m0, 1               ; m0 = 1 << (shift-1), the rounding term
    punpcklwd   m5, m0              ; interleave coefficient and rounding words
    add        lenq, lenq           ; length in samples -> bytes
    add         inq, lenq
    add        outq, lenq
    neg        lenq
.next:
    mov%1       m0, [inq + lenq         ]
    mov%1       m2, [inq + lenq + mmsize]
    mova        m1, m0
    mova        m3, m2
    punpcklwd   m0, [w1]            ; pair each sample with the constant 1
    punpckhwd   m1, [w1]
    punpcklwd   m2, [w1]
    punpckhwd   m3, [w1]
    pmaddwd     m0, m5              ; sample*coeff + 1*rounding
    pmaddwd     m1, m5
    pmaddwd     m2, m5
    pmaddwd     m3, m5
    psrad       m0, m4
    psrad       m1, m4
    psrad       m2, m4
    psrad       m3, m4
    packssdw    m0, m1              ; saturate back to int16
    packssdw    m2, m3
    mov%1  [outq + lenq         ], m0
    mov%1  [outq + lenq + mmsize], m2
    add        lenq, mmsize*2
        jl .next
%if mmsize == 8
    emms                            ; MMX variant must clear the FPU tag word
    RET
%else
    REP_RET
%endif
%endmacro

; Fixed-point version of MIX2_FLT:
;   out[i] = (in1[i]*c1 + in2[i]*c2 + (1 << (shift-1))) >> shift
; where c1/c2 are the low words of coeffp[index1]/coeffp[index2] and the shift
; comes from the high word of coeffp[index1]. Samples from the two inputs are
; interleaved so pmaddwd produces in1[i]*c1 + in2[i]*c2 directly.
%macro MIX2_INT16 1
cglobal mix_2_1_%1_int16, 7, 7, 8, out, in1, in2, coeffp, index1, index2, len
%ifidn %1, a
    test in1q, mmsize-1
        jne mix_2_1_int16_u_int %+ SUFFIX
    test in2q, mmsize-1
        jne mix_2_1_int16_u_int %+ SUFFIX
    test outq, mmsize-1
        jne mix_2_1_int16_u_int %+ SUFFIX
%else
mix_2_1_int16_u_int %+ SUFFIX:
%endif
    movd        m4, [coeffpq + 4*index1q]
    movd        m6, [coeffpq + 4*index2q]
    SPLATW      m5, m4              ; m5 = first coefficient in every word
    SPLATW      m6, m6              ; m6 = second coefficient in every word
    psllq       m4, 32
    psrlq       m4, 48              ; m4 = shift (high word of the first coefficient)
    mova        m7, [dw1]
    pslld       m7, m4
    psrld       m7, 1               ; m7 = 1 << (shift-1), the rounding term
    punpcklwd   m5, m6              ; interleave the two coefficients
    add        lend, lend           ; length in samples -> bytes
    add        in1q, lenq
    add        in2q, lenq
    add        outq, lenq
    neg        lenq
.next:
    mov%1       m0, [in1q + lenq         ]
    mov%1       m2, [in2q + lenq         ]
    mova        m1, m0
    punpcklwd   m0, m2              ; interleave in1/in2 samples for pmaddwd
    punpckhwd   m1, m2

    mov%1       m2, [in1q + lenq + mmsize]
    mov%1       m6, [in2q + lenq + mmsize]
    mova        m3, m2
    punpcklwd   m2, m6
    punpckhwd   m3, m6

    pmaddwd     m0, m5              ; in1*c1 + in2*c2
    pmaddwd     m1, m5
    pmaddwd     m2, m5
    pmaddwd     m3, m5
    paddd       m0, m7              ; add rounding
    paddd       m1, m7
    paddd       m2, m7
    paddd       m3, m7
    psrad       m0, m4
    psrad       m1, m4
    psrad       m2, m4
    psrad       m3, m4
    packssdw    m0, m1              ; saturate back to int16
    packssdw    m2, m3
    mov%1  [outq + lenq         ], m0
    mov%1  [outq + lenq + mmsize], m2
    add        lenq, mmsize*2
        jl .next
%if mmsize == 8
    emms                            ; MMX variant must clear the FPU tag word
    RET
%else
    REP_RET
%endif
%endmacro


INIT_MMX mmx
MIX1_INT16 u
MIX1_INT16 a
MIX2_INT16 u
MIX2_INT16 a

INIT_XMM sse
MIX2_FLT u
MIX2_FLT a
MIX1_FLT u
MIX1_FLT a

INIT_XMM sse2
MIX1_INT16 u
MIX1_INT16 a
MIX2_INT16 u
MIX2_INT16 a

%if HAVE_AVX_EXTERNAL
INIT_YMM avx
MIX2_FLT u
MIX2_FLT a
MIX1_FLT u
MIX1_FLT a
%endif