;*****************************************************************************
;* x86-optimized functions for yadif filter
;*
;* Copyright (C) 2006 Michael Niedermayer <michaelni@gmx.at>
;* Copyright (c) 2013 Daniel Kang <daniel.d.kang@gmail.com>
;* Copyright (c) 2011-2013 James Darnley <james.darnley@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

; Syntax: NASM/Yasm, written against the x86inc/x86util macro layer pulled in
; below.  cglobal, RET, INIT_XMM/INIT_MMX, DECLARE_REG_TMP, mova/movu, the mN
; vector register names, RSHIFT, ABS1 and ABS2 are NOT defined in this file;
; notes about their behaviour are assumptions to be checked against
; libavutil/x86/x86util.asm and x86inc.asm.

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; Eight 16-bit words, each equal to 1.
pw_1: times 8 dw 1

SECTION .text

;-----------------------------------------------------------------------------
; PMAXUW dst, src
;   dst = per-word unsigned maximum of dst and src.
;   Uses pmaxuw when the active cpuflags include sse4; otherwise emulates it
;   with a saturating subtract followed by a saturating add:
;   (dst -sat src) + src.
;   Clobbers: dst only.
;-----------------------------------------------------------------------------
%macro PMAXUW 2
%if cpuflag(sse4)
    pmaxuw    %1, %2
%else
    psubusw   %1, %2
    paddusw   %1, %2
%endif
%endmacro

;-----------------------------------------------------------------------------
; CHECK a, b
;   Evaluates one candidate pairing: words loaded from [curq+t1] displaced by
;   a words, against words loaded from [curq+t0] displaced by b words.
;   Out:   m5 = truncating average of the two rows, moved down by 2 bytes
;               (one word) by RSHIFT
;          m2 = score: |row1 - row0| (unsigned, per word) plus the same
;               vector moved down by 2 and by 4 bytes
;   Clobb: m3, m4
;   NOTE(review): assumes RSHIFT shifts the whole register right by the given
;   number of BYTES, so the top lanes of m2/m5 are only partially populated -
;   confirm in x86util.asm.
;-----------------------------------------------------------------------------
%macro CHECK 2
    movu      m2, [curq+t1+%1*2]
    movu      m3, [curq+t0+%2*2]
    mova      m4, m2
    mova      m5, m2
    pxor      m4, m3
    pavgw     m5, m3            ; rounding-up average
    pand      m4, [pw_1]        ; 1 where the two inputs differ in bit 0
    psubusw   m5, m4            ; undo the round-up -> truncating average
    RSHIFT    m5, 2
    mova      m4, m2
    psubusw   m2, m3
    psubusw   m3, m4
    PMAXUW    m2, m3            ; m2 = unsigned |a - b| per word
    mova      m3, m2
    mova      m4, m2
    RSHIFT    m3, 2
    RSHIFT    m4, 4
    paddw     m2, m3
    paddw     m2, m4            ; sum of three neighbouring differences
%endmacro

;-----------------------------------------------------------------------------
; CHECK1
;   In:    m0 = best score so far, m1 = best prediction so far,
;          m2 = candidate score,   m5 = candidate prediction (from CHECK)
;   Out:   m0 = min(m0, m2) (signed words)
;          m1 = m5 in lanes where old m0 > m2, else unchanged
;          m6 = that per-lane "candidate won" mask, consumed by CHECK2
;   Clobb: m3, m5
;-----------------------------------------------------------------------------
%macro CHECK1 0
    mova    m3, m0
    pcmpgtw m3, m2              ; mask = best score > candidate score
    pminsw  m0, m2
    mova    m6, m3
    pand    m5, m3
    pandn   m3, m1
    por     m3, m5              ; blend: mask ? candidate : previous
    mova    m1, m3
%endmacro

; %macro CHECK2 0
;     paddw   m6, [pw_1]
;     psllw   m6, 14
;     paddsw  m2, m6
;     mova    m3, m0
;     pcmpgtw m3, m2
;     pminsw  m0, m2
;     pand    m5, m3
;     pandn   m3, m1
;     por     m3, m5
;     mova    m1, m3
; %endmacro

; This version of CHECK2 is required for 14-bit samples. The left-shift trick
; in the old code is not large enough to correctly select pixels or scores.

;-----------------------------------------------------------------------------
; CHECK2
;   Like CHECK1, but a lane may only be replaced if the immediately preceding
;   CHECK1 also replaced it (mask left in m6).
;   In:    m0 = best score, m1 = best prediction, m2 = candidate score,
;          m5 = candidate prediction, m6 = mask from CHECK1
;   Out:   m0/m1 updated in lanes where (m0 > m2) AND m6, otherwise unchanged
;   Clobb: m2, m3, m5, m6
;-----------------------------------------------------------------------------
%macro CHECK2 0
    mova    m3, m0              ; keep the old score for the blend below
    pcmpgtw m0, m2
    pand    m0, m6              ; combined mask
    mova    m6, m0
    pand    m5, m6
    pand    m2, m0
    pandn   m6, m1
    pandn   m0, m3
    por     m6, m5              ; prediction: mask ? candidate : previous
    por     m0, m2              ; score:      mask ? candidate : previous
    mova    m1, m6
%endmacro

;-----------------------------------------------------------------------------
; LOAD dst, [mem]
;   Plain unaligned vector load; a single place to change how samples are
;   fetched.
;-----------------------------------------------------------------------------
%macro LOAD 2
    movu      %1, %2
%endmacro

;-----------------------------------------------------------------------------
; FILTER suffix, ptrA, ptrB
;   Emits the whole per-line loop.  The first parameter is appended to the
;   .loop/.end labels so the macro can be expanded twice in one function; the
;   second and third are the pointer registers used as the two temporal
;   neighbours (YADIF passes prevq,curq or curq,nextq depending on parity).
;
;   Register roles on entry (set up by YADIF):
;     dstq, prevq, curq, nextq  sample pointers, advanced every iteration
;     t0, t1                    byte offsets taken from the prefs / mrefs
;                               arguments respectively
;     r4m                       remaining width (argument 4, "w")
;     r8m                       mode (argument 8)
;
;   Stack slots (16 bytes each):
;     [rsp+ 0]  cur[t1]                      "c"
;     [rsp+16]  ([ptrA] + [ptrB]) >> 1       "d"
;     [rsp+32]  cur[t0]                      "e"
;     [rsp+48]  temporal difference          "diff"
;
;   Each iteration stores one full vector to [dstq] but advances the pointers
;   by only mmsize-4 bytes and the width counter by mmsize/2-2 samples.
;   NOTE(review): presumably because the RSHIFT-based neighbour sums leave the
;   top two word lanes incomplete - confirm against x86util.asm.
;-----------------------------------------------------------------------------
%macro FILTER 3
.loop%1:
    ; NOTE(review): m7 is not read in the visible code before it is reloaded
    ; from [rsp+32] further down; this clear looks redundant - confirm that
    ; no x86util macro used here depends on it before removing.
    pxor         m7, m7
    LOAD         m0, [curq+t1]          ; c
    LOAD         m1, [curq+t0]          ; e
    LOAD         m2, [%2]
    LOAD         m3, [%3]
    mova         m4, m3
    paddw        m3, m2
    psraw        m3, 1                  ; d = (A + B) >> 1
    mova   [rsp+ 0], m0
    mova   [rsp+16], m3
    mova   [rsp+32], m1
    psubw        m2, m4
    ABS1         m2, m4                 ; |A - B|
    LOAD         m3, [prevq+t1]
    LOAD         m4, [prevq+t0]
    psubw        m3, m0
    psubw        m4, m1
    ABS2         m3, m4, m5, m6
    paddw        m3, m4                 ; |prev[t1]-c| + |prev[t0]-e|
    psrlw        m2, 1
    psrlw        m3, 1
    pmaxsw       m2, m3
    LOAD         m3, [nextq+t1]
    LOAD         m4, [nextq+t0]
    psubw        m3, m0
    psubw        m4, m1
    ABS2         m3, m4, m5, m6
    paddw        m3, m4                 ; |next[t1]-c| + |next[t0]-e|
    psrlw        m3, 1
    pmaxsw       m2, m3                 ; diff = max of the three halves
    mova   [rsp+48], m2

    ; Initial spatial prediction and score.
    paddw        m1, m0
    paddw        m0, m0
    psubw        m0, m1                 ; c - e
    psrlw        m1, 1                  ; m1 = (c + e) >> 1, best prediction
    ABS1         m0, m2                 ; m0 = |c - e|

    movu         m2, [curq+t1-1*2]
    movu         m3, [curq+t0-1*2]
    mova         m4, m2
    psubusw      m2, m3
    psubusw      m3, m4
    PMAXUW       m2, m3                 ; |cur[t1-1] - cur[t0-1]| per word
    mova         m3, m2
    RSHIFT       m3, 4                  ; same differences, two words along
    paddw        m0, m2
    paddw        m0, m3
    psubw        m0, [pw_1]             ; m0 = best score so far

    ; Try four displaced pairings; each CHECK2 candidate is only accepted
    ; where the CHECK1 just before it was accepted.
    CHECK -2, 0
    CHECK1
    CHECK -3, 1
    CHECK2
    CHECK 0, -2
    CHECK1
    CHECK 1, -3
    CHECK2

    mova         m6, [rsp+48]           ; diff
    cmp   DWORD r8m, 2                  ; mode >= 2: skip the extra bounds
    jge .end%1
    LOAD         m2, [%2+t1*2]
    LOAD         m4, [%3+t1*2]
    LOAD         m3, [%2+t0*2]
    LOAD         m5, [%3+t0*2]
    paddw        m2, m4
    paddw        m3, m5
    psrlw        m2, 1                  ; b = (A[2*t1] + B[2*t1]) >> 1
    psrlw        m3, 1                  ; f = (A[2*t0] + B[2*t0]) >> 1
    mova         m4, [rsp+ 0]           ; c
    mova         m5, [rsp+16]           ; d
    mova         m7, [rsp+32]           ; e
    psubw        m2, m4                 ; b - c
    psubw        m3, m7                 ; f - e
    mova         m0, m5
    psubw        m5, m4                 ; d - c
    psubw        m0, m7                 ; d - e
    mova         m4, m2
    pminsw       m2, m3
    pmaxsw       m3, m4
    pmaxsw       m2, m5
    pminsw       m3, m5
    pmaxsw       m2, m0                 ; hi = max(d-e, d-c, min(b-c, f-e))
    pminsw       m3, m0                 ; lo = min(d-e, d-c, max(b-c, f-e))
    pxor         m4, m4
    pmaxsw       m6, m3
    psubw        m4, m2
    pmaxsw       m6, m4                 ; diff = max(diff, lo, -hi)

.end%1:
    ; Clamp the chosen prediction to [d - diff, d + diff].
    mova         m2, [rsp+16]
    mova         m3, m2
    psubw        m2, m6
    paddw        m3, m6
    pmaxsw       m1, m2
    pminsw       m1, m3

    movu     [dstq], m1
    add        dstq, mmsize-4
    add       prevq, mmsize-4
    add        curq, mmsize-4
    add       nextq, mmsize-4
    sub   DWORD r4m, mmsize/2-2
    jg .loop%1                          ; continue while width remains > 0
%endmacro

;-----------------------------------------------------------------------------
; YADIF
;   Emits yadif_filter_line_10bit for the currently selected instruction set.
;   Arguments, in order: dst, prev, cur, next, w, prefs, mrefs, parity, mode.
;   cglobal is asked to load the first 4 into registers, to provide 6 (x86-32)
;   or 7 (x86-64) general-purpose registers and 8 vector registers, and to
;   reserve 80 bytes of stack, of which FILTER uses four 16-byte slots.
;   prefs and mrefs are loaded here (sign-extended from 32 bits on x86-64)
;   into the registers that DECLARE_REG_TMP then exposes as t0 and t1.
;   parity selects which pair of pointers FILTER treats as the temporal
;   neighbours.
;-----------------------------------------------------------------------------
%macro YADIF 0
%if ARCH_X86_32
cglobal yadif_filter_line_10bit, 4, 6, 8, 80, dst, prev, cur, next, w, \
                                              prefs, mrefs, parity, mode
%else
cglobal yadif_filter_line_10bit, 4, 7, 8, 80, dst, prev, cur, next, w, \
                                              prefs, mrefs, parity, mode
%endif
%if ARCH_X86_32
    mov            r4, r5mp             ; prefs
    mov            r5, r6mp             ; mrefs
    DECLARE_REG_TMP 4,5
%else
    movsxd         r5, DWORD r5m        ; prefs
    movsxd         r6, DWORD r6m        ; mrefs
    DECLARE_REG_TMP 5,6
%endif

    cmp DWORD paritym, 0
    je .parity0
    FILTER 1, prevq, curq
    jmp .ret

.parity0:
    FILTER 0, curq, nextq

.ret:
    RET
%endmacro

; Instantiate once per supported instruction set; the MMX variant is only
; built for 32-bit targets.
INIT_XMM ssse3
YADIF
INIT_XMM sse2
YADIF
%if ARCH_X86_32
INIT_MMX mmxext
YADIF
%endif