;*****************************************************************************
;* x86-optimized functions for yadif filter
;*
;* Copyright (C) 2006 Michael Niedermayer <michaelni@gmx.at>
;* Copyright (c) 2013 Daniel Kang <daniel.d.kang@gmail.com>
;* Copyright (c) 2011-2013 James Darnley <james.darnley@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

; Syntax: NASM with the x86inc/x86util macro layer (cglobal, INIT_XMM, mN
; register aliases, mova/movu/movh, rNm argument accessors).
;
; NOTE(review): RSHIFT, PMINSD and PMAXSD are used below but not defined in
; this file; they are presumably provided by x86util.asm. The comments below
; assume RSHIFT is a whole-register right shift counted in bytes and that
; PMINSD/PMAXSD take (dst, src, scratch) and compute a signed dword min/max
; into dst. Confirm against the included file.

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; Each constant fills one 16-byte vector.
pw_1:    times 8 dw 1
pw_8000: times 8 dw 0x8000
pd_1:    times 4 dd 1
pd_8000: times 4 dd 0x8000

SECTION .text

; PABS dst, tmp
; Per-dword absolute value of dst, in place.
; With SSSE3 this is a single pabsd and tmp is left untouched. Otherwise tmp
; becomes the sign mask (0 > dst) and the result is (dst xor mask) - mask.
%macro PABS 2
%if cpuflag(ssse3)
    pabsd   %1, %1
%else
    pxor    %2, %2
    pcmpgtd %2, %1
    pxor    %1, %2
    psubd   %1, %2
%endif
%endmacro

; PACK reg
; Narrow the dwords of reg to words with unsigned saturation; reg is used as
; both pack sources, so the low and high halves of the result are identical.
; Without SSE4 there is no packusdw: bias the dwords by -0x8000, pack with
; signed saturation, then add 0x8000 back in the word domain.
%macro PACK 1
%if cpuflag(sse4)
    packusdw %1, %1
%else
    psubd    %1, [pd_8000]
    packssdw %1, %1
    paddw    %1, [pw_8000]
%endif
%endmacro

; PMAXUW dst, src
; Per-word unsigned maximum. The fallback computes (dst -sat src) + src, which
; equals max(dst, src) for unsigned words.
%macro PMAXUW 2
%if cpuflag(sse4)
    pmaxuw  %1, %2
%else
    psubusw %1, %2
    paddusw %1, %2
%endif
%endmacro

; CHECK a, b
; Evaluate one candidate direction. a is a pixel offset applied to the row at
; curq+t1 and b a pixel offset applied to the row at curq+t0 (both are scaled
; by 2 to get byte offsets; the loads are unaligned).
; Expects: m7 = 0.
; Produces:
;   m5 = truncating average of the two loaded rows, moved down by RSHIFT 2
;        and with its low four words widened to dwords
;   m2 = sum, as dwords, of the absolute row difference taken at RSHIFT 0,
;        RSHIFT 2 and RSHIFT 4
; Clobbers: m3, m4.
%macro CHECK 2
    movu      m2, [curq+t1+%1*2]
    movu      m3, [curq+t0+%2*2]
    mova      m4, m2
    mova      m5, m2
    ; pavgw rounds up; subtracting ((x xor y) and 1) makes the average truncate
    pxor      m4, m3
    pavgw     m5, m3
    pand      m4, [pw_1]
    psubusw   m5, m4
    RSHIFT    m5, 2
    punpcklwd m5, m7
    ; unsigned |x - y|: the max of the two saturating differences
    mova      m4, m2
    psubusw   m2, m3
    psubusw   m3, m4
    PMAXUW    m2, m3
    mova      m3, m2
    mova      m4, m2
    RSHIFT    m3, 2
    RSHIFT    m4, 4
    punpcklwd m2, m7
    punpcklwd m3, m7
    punpcklwd m4, m7
    paddd     m2, m3
    paddd     m2, m4
%endmacro

; CHECK1
; Compare-and-select step that follows a CHECK.
; In:  m0 = best score so far, m1 = value selected so far,
;      m2 = candidate score, m5 = candidate value (both from CHECK).
; Out: m0 = min(m0, m2); m1 = m5 in lanes where the old m0 > m2, else
;      unchanged; m6 = that per-lane compare mask, consumed by CHECK2.
; Clobbers: m3, m5. m6 is handed to PMINSD as its third operand before it is
; overwritten with the mask.
%macro CHECK1 0
    mova    m3, m0
    pcmpgtd m3, m2
    PMINSD  m0, m2, m6
    mova    m6, m3
    pand    m5, m3
    pandn   m3, m1
    por     m3, m5
    mova    m1, m3
%endmacro

; CHECK2
; Same compare-and-select as CHECK1, but first penalises lanes in which the
; preceding CHECK1 did not select its candidate: the mask in m6 (-1 or 0)
; becomes (m6 + 1) << 30, i.e. 0 where the mask was set and 0x40000000 where
; it was clear, and that is added to the candidate score in m2.
; Clobbers: m3, m5, m6. m4 is handed to PMINSD as its third operand.
%macro CHECK2 0
    paddd   m6, [pd_1]
    pslld   m6, 30
    paddd   m2, m6
    mova    m3, m0
    pcmpgtd m3, m2
    PMINSD  m0, m2, m4
    pand    m5, m3
    pandn   m3, m1
    por     m3, m5
    mova    m1, m3
%endmacro

; This version of CHECK2 has 3 fewer instructions on sets older than SSE4 but I
; am not sure whether it is any faster. A rewrite or refactor of the filter
; code should make it possible to eliminate the move instruction at the end. It
; exists to satisfy the expectation that the "score" values are in m1.

; %macro CHECK2 0
;     mova    m3, m0
;     pcmpgtd m0, m2
;     pand    m0, m6
;     mova    m6, m0
;     pand    m5, m6
;     pand    m2, m0
;     pandn   m6, m1
;     pandn   m0, m3
;     por     m6, m5
;     por     m0, m2
;     mova    m1, m6
; %endmacro

; LOAD dst, mem
; Load the low half of a vector from mem (movh) and interleave its words with
; m7, which widens them to zero-extended dwords when m7 = 0.
%macro LOAD 2
    movh      %1, %2
    punpcklwd %1, m7
%endmacro

; FILTER suffix, ptrA, ptrB
; Body of the per-line loop. suffix makes the local labels unique; ptrA and
; ptrB are the two row pointers averaged for the temporal term (YADIF passes
; prevq/curq or curq/nextq depending on parity).
; Uses: curq, prevq, nextq, dstq, t0, t1, the stack arguments r4m (w) and
; r8m (mode), m0-m7, and 64 bytes of stack scratch:
;   [rsp+ 0] row at curq+t1, widened
;   [rsp+16] (ptrA row + ptrB row) >> 1
;   [rsp+32] row at curq+t0, widened
;   [rsp+48] temporal difference
; Each iteration stores mmsize/2 bytes to dstq and subtracts mmsize/4 from w.
%macro FILTER 3
.loop%1:
    pxor         m7, m7                 ; zero for the widening unpacks
    LOAD         m0, [curq+t1]
    LOAD         m1, [curq+t0]
    LOAD         m2, [%2]
    LOAD         m3, [%3]
    mova         m4, m3
    paddd        m3, m2
    psrad        m3, 1                  ; m3 = (ptrA + ptrB) >> 1
    mova   [rsp+ 0], m0
    mova   [rsp+16], m3
    mova   [rsp+32], m1
    psubd        m2, m4
    PABS         m2, m4                 ; m2 = |ptrA - ptrB|
    ; (|prev[t1] - cur[t1]| + |prev[t0] - cur[t0]|) >> 1
    LOAD         m3, [prevq+t1]
    LOAD         m4, [prevq+t0]
    psubd        m3, m0
    psubd        m4, m1
    PABS         m3, m5
    PABS         m4, m5
    paddd        m3, m4
    psrld        m2, 1
    psrld        m3, 1
    PMAXSD       m2, m3, m6
    ; same term against the next rows
    LOAD         m3, [nextq+t1]
    LOAD         m4, [nextq+t0]
    psubd        m3, m0
    psubd        m4, m1
    PABS         m3, m5
    PABS         m4, m5
    paddd        m3, m4
    psrld        m3, 1
    PMAXSD       m2, m3, m6
    mova   [rsp+48], m2                 ; largest of the three differences

    ; m1 = (cur[t1] + cur[t0]) >> 1, m0 = |cur[t1] - cur[t0]|
    paddd        m1, m0
    paddd        m0, m0
    psubd        m0, m1
    psrld        m1, 1
    PABS         m0, m2

    ; add the absolute row difference starting one pixel to the left, at
    ; RSHIFT 0 and RSHIFT 4, then subtract 1 to form the initial score in m0
    movu         m2, [curq+t1-1*2]
    movu         m3, [curq+t0-1*2]
    mova         m4, m2
    psubusw      m2, m3
    psubusw      m3, m4
    PMAXUW       m2, m3
    mova         m3, m2
    RSHIFT       m3, 4
    punpcklwd    m2, m7
    punpcklwd    m3, m7
    paddd        m0, m2
    paddd        m0, m3
    psubd        m0, [pd_1]

    ; try the other directions; m0 tracks the best score, m1 the chosen value
    CHECK -2, 0
    CHECK1
    CHECK -3, 1
    CHECK2
    CHECK 0, -2
    CHECK1
    CHECK 1, -3
    CHECK2

    mova         m6, [rsp+48]
    cmp   DWORD r8m, 2                  ; mode >= 2 skips the extra clamp terms
    jge .end%1
    ; averages of ptrA/ptrB two rows away in each direction
    LOAD         m2, [%2+t1*2]
    LOAD         m4, [%3+t1*2]
    LOAD         m3, [%2+t0*2]
    LOAD         m5, [%3+t0*2]
    paddd        m2, m4
    paddd        m3, m5
    psrld        m2, 1
    psrld        m3, 1
    mova         m4, [rsp+ 0]
    mova         m5, [rsp+16]
    mova         m7, [rsp+32]           ; m7 stops being zero from here on
    psubd        m2, m4
    psubd        m3, m7
    mova         m0, m5
    psubd        m5, m4
    psubd        m0, m7
    mova         m4, m2
    ; m2 = max of (m5, m0, min of the two far differences)
    ; m3 = min of (m5, m0, max of the two far differences)
    PMINSD       m2, m3, m7
    PMAXSD       m3, m4, m7
    PMAXSD       m2, m5, m7
    PMINSD       m3, m5, m7
    PMAXSD       m2, m0, m7
    PMINSD       m3, m0, m7
    pxor         m4, m4
    PMAXSD       m6, m3, m7
    psubd        m4, m2                 ; m4 = -m2
    PMAXSD       m6, m4, m7

.end%1:
    ; clamp m1 to [rsp+16] +/- m6, narrow to words and store
    mova         m2, [rsp+16]
    mova         m3, m2
    psubd        m2, m6
    paddd        m3, m6
    PMAXSD       m1, m2, m7
    PMINSD       m1, m3, m7
    PACK         m1

    movh     [dstq], m1
    add        dstq, mmsize/2
    add       prevq, mmsize/2
    add        curq, mmsize/2
    add       nextq, mmsize/2
    sub   DWORD r4m, mmsize/4
    jg .loop%1
%endmacro

; YADIF
; Emits yadif_filter_line_16bit for the instruction set selected by the
; preceding INIT_XMM. Arguments as named to cglobal, in order:
;   dst, prev, cur, next, w, prefs, mrefs, parity, mode
; The first four are loaded into registers; w and mode are accessed through
; r4m and r8m inside FILTER. prefs and mrefs are loaded into t0 and t1
; (sign-extended from 32 bits on x86-64). 80 bytes of stack are requested
; from cglobal, of which FILTER uses the first 64.
; parity != 0 filters with (prev, cur) as the temporal pair, parity == 0 with
; (cur, next).
%macro YADIF 0
%if ARCH_X86_32
cglobal yadif_filter_line_16bit, 4, 6, 8, 80, dst, prev, cur, next, w, \
                                              prefs, mrefs, parity, mode
%else
cglobal yadif_filter_line_16bit, 4, 7, 8, 80, dst, prev, cur, next, w, \
                                              prefs, mrefs, parity, mode
%endif
%if ARCH_X86_32
    mov            r4, r5mp
    mov            r5, r6mp
    DECLARE_REG_TMP 4,5
%else
    movsxd         r5, DWORD r5m
    movsxd         r6, DWORD r6m
    DECLARE_REG_TMP 5,6
%endif

    cmp DWORD paritym, 0
    je .parity0
    FILTER 1, prevq, curq
    jmp .ret

.parity0:
    FILTER 0, curq, nextq

.ret:
    RET
%endmacro

INIT_XMM sse4
YADIF
INIT_XMM ssse3
YADIF
INIT_XMM sse2
YADIF