;*****************************************************************************
;* x86-optimized functions for yadif filter
;*
;* Copyright (C) 2006 Michael Niedermayer <michaelni@gmx.at>
;* Copyright (c) 2013 Daniel Kang <daniel.d.kang@gmail.com>
;* Copyright (c) 2011-2013 James Darnley <james.darnley@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; Constant vectors, each 16 bytes, used as memory operands by the macros below.
pw_1:    times 8 dw 1        ; words of 1: low-bit mask in CHECK
pw_8000: times 8 dw 0x8000   ; words of 0x8000: bias added back by PACK
pd_1:    times 4 dd 1        ; dwords of 1: used by FILTER and CHECK2
pd_8000: times 4 dd 0x8000   ; dwords of 0x8000: bias subtracted by PACK

SECTION .text

; PABS reg, tmp
; Replace every signed dword of %1 with its absolute value.
;   %1 = source and destination register
;   %2 = scratch register, overwritten only on the pre-SSSE3 path
%macro PABS 2
%if cpuflag(ssse3)
    pabsd %1, %1
%else
    pxor    %2, %2              ; %2 = 0
    pcmpgtd %2, %1              ; %2 = all ones in lanes where %1 < 0
    pxor    %1, %2              ; invert the bits of the negative lanes
    psubd   %1, %2              ; minus -1 adds 1: finishes the negation
%endif
%endmacro

; PACK reg
; Narrow the dwords of %1 to words with unsigned saturation.  %1 is used as
; both pack sources, so the packed words appear in both halves of %1.
%macro PACK 1
%if cpuflag(sse4)
    packusdw %1, %1
%else
    ; Emulation: move the unsigned range onto the signed word range, pack
    ; with signed saturation, then undo the bias on the resulting words.
    psubd    %1, [pd_8000]
    packssdw %1, %1
    paddw    %1, [pw_8000]
%endif
%endmacro

; PMAXUW dst, src
; %1 = unsigned maximum of %1 and %2, word by word.  %2 is not modified.
%macro PMAXUW 2
%if cpuflag(sse4)
    pmaxuw %1, %2
%else
    psubusw %1, %2              ; %1 = %1 - %2, clamped at 0
    paddusw %1, %2              ; adding %2 back gives max(%1, %2)
%endif
%endmacro

; CHECK a, b
; Produce the score and the prediction for one candidate direction.
;   %1 = offset in 16-bit samples applied to the row at curq+t1
;   %2 = offset in 16-bit samples applied to the row at curq+t0
; Expects m7 = 0: it supplies the upper half of every word->dword unpack.
; Results:
;   m2 = dword score: the word-wise absolute difference of the two rows, plus
;        that same difference vector after RSHIFT by 2 and after RSHIFT by 4
;   m5 = dword prediction: the rounded-down average of the two rows, after
;        RSHIFT by 2
; Clobbers m3 and m4.
; NOTE(review): RSHIFT is defined outside this file (x86util.asm).  It is
; presumably a whole-register right shift counted in bytes, so that 2 and 4
; step by one and two samples -- confirm.
%macro CHECK 2
    movu      m2, [curq+t1+%1*2]
    movu      m3, [curq+t0+%2*2]
    mova      m4, m2
    mova      m5, m2
    pxor      m4, m3
    pavgw     m5, m3            ; average, rounded up
    pand      m4, [pw_1]        ; 1 in lanes where the sum of the two is odd
    psubusw   m5, m4            ; remove the round-up: floor((a + b) / 2)
    RSHIFT    m5, 2
    punpcklwd m5, m7            ; low four words -> dwords
    mova      m4, m2
    psubusw   m2, m3            ; a - b, clamped at 0
    psubusw   m3, m4            ; b - a, clamped at 0
    PMAXUW    m2, m3            ; |a - b| for unsigned words
    mova      m3, m2
    mova      m4, m2
    RSHIFT    m3, 2
    RSHIFT    m4, 4
    punpcklwd m2, m7
    punpcklwd m3, m7
    punpcklwd m4, m7
    paddd     m2, m3
    paddd     m2, m4            ; sum of the three difference vectors
%endmacro

; CHECK1
; Take the candidate left by CHECK wherever it beats the best result so far.
;   in:  m0 = best score        m1 = best prediction
;        m2 = candidate score   m5 = candidate prediction
;   out: m0 = signed dword minimum of m0 and m2
;        m1 = m5 in lanes where the old m0 > m2, unchanged elsewhere
;        m6 = lane mask, all ones where the candidate was taken
; Clobbers m3 and m5.
; NOTE(review): the third PMINSD operand is presumably a scratch register for
; the macro from x86util.asm; m6 is rewritten with the mask right after.
%macro CHECK1 0
    mova    m3, m0
    pcmpgtd m3, m2              ; m3 = mask: candidate score < best score
    PMINSD  m0, m2, m6
    mova    m6, m3              ; keep the mask for a following CHECK2
    pand    m5, m3              ; candidate prediction where taken
    pandn   m3, m1              ; previous prediction elsewhere
    por     m3, m5
    mova    m1, m3
%endmacro

; CHECK2
; Same selection as CHECK1, but the candidate score is first penalised in
; every lane where the preceding CHECK1 did not take its candidate.
;   in:  m6 = mask left by CHECK1 (-1 = taken, 0 = not taken);
;        m0, m1, m2, m5 as for CHECK1
;   out: m0 = signed dword minimum of m0 and the penalised m2
;        m1 = m5 in lanes where the old m0 > penalised m2, unchanged elsewhere
; Clobbers m2, m3, m4, m5 and m6; m6 no longer holds a mask afterwards.
; NOTE(review): m4 is passed as the third PMINSD operand, presumably as a
; scratch register for the macro from x86util.asm.
%macro CHECK2 0
    paddd   m6, [pd_1]          ; -1 -> 0 (taken), 0 -> 1 (not taken)
    pslld   m6, 30              ; penalty of 1<<30 in the not-taken lanes
    paddd   m2, m6
    mova    m3, m0
    pcmpgtd m3, m2              ; m3 = mask: penalised score < best score
    PMINSD  m0, m2, m4
    pand    m5, m3
    pandn   m3, m1
    por     m3, m5
    mova    m1, m3
%endmacro

; This version of CHECK2 has 3 fewer instructions on instruction sets older
; than SSE4 but I am not sure whether it is any faster.  A rewrite or refactor
; of the filter code should make it possible to eliminate the move instruction
; at the end.  It exists to satisfy the expectation that the "score" values
; are in m1.

; %macro CHECK2 0
;     mova    m3, m0
;     pcmpgtd m0, m2
;     pand    m0, m6
;     mova    m6, m0
;     pand    m5, m6
;     pand    m2, m0
;     pandn   m6, m1
;     pandn   m0, m3
;     por     m6, m5
;     por     m0, m2
;     mova    m1, m6
; %endmacro

; LOAD reg, mem
; Load 16-bit samples from %2 and widen the low words of the result to
; dwords by interleaving them with m7.  Expects m7 = 0, which makes the
; widening a zero-extension.
; NOTE(review): movh comes from the x86inc layer; presumably it moves half a
; register (mmsize/2 bytes) -- confirm.
%macro LOAD 2
    movh      %1, %2
    punpcklwd %1, m7
%endmacro

; FILTER suffix, ptrA, ptrB
; Main loop of the line filter.  Each iteration produces mmsize/4 output
; samples and stores them as 16-bit words at dstq.
;   %1 = suffix that keeps the .loop/.end labels of each expansion distinct
;   %2, %3 = the two pointers whose samples are averaged into "d" below
;            (YADIF passes prevq,curq or curq,nextq)
; Uses dstq, prevq, curq, nextq, the offsets t0 and t1, the memory operands
; r4m (loop counter; `w` in the YADIF declaration) and r8m (`mode` in the
; YADIF declaration), and 64 bytes of stack at rsp.
;
; Names used in the comments (all values are dwords):
;   c    = samples at curq+t1                      saved at [rsp+ 0]
;   d    = (samples at %2 + samples at %3) >> 1    saved at [rsp+16]
;   e    = samples at curq+t0                      saved at [rsp+32]
;   diff = clamp range computed below              saved at [rsp+48]
; All vector registers m0-m7 are clobbered.
%macro FILTER 3
.loop%1:
    pxor         m7, m7              ; zero for LOAD and CHECK unpacking
    LOAD         m0, [curq+t1]       ; c
    LOAD         m1, [curq+t0]       ; e
    LOAD         m2, [%2]
    LOAD         m3, [%3]
    mova         m4, m3
    paddd        m3, m2
    psrad        m3, 1               ; d
    mova   [rsp+ 0], m0
    mova   [rsp+16], m3
    mova   [rsp+32], m1
    psubd        m2, m4
    PABS         m2, m4              ; |%2 - %3|
    LOAD         m3, [prevq+t1]
    LOAD         m4, [prevq+t0]
    psubd        m3, m0
    psubd        m4, m1
    PABS         m3, m5
    PABS         m4, m5
    paddd        m3, m4              ; |prev[t1] - c| + |prev[t0] - e|
    psrld        m2, 1
    psrld        m3, 1
    PMAXSD       m2, m3, m6
    LOAD         m3, [nextq+t1]
    LOAD         m4, [nextq+t0]
    psubd        m3, m0
    psubd        m4, m1
    PABS         m3, m5
    PABS         m4, m5
    paddd        m3, m4              ; |next[t1] - c| + |next[t0] - e|
    psrld        m3, 1
    PMAXSD       m2, m3, m6          ; diff = largest of the three halves
    mova   [rsp+48], m2

    ; Starting prediction and score, before any candidate is tried.
    paddd        m1, m0              ; c + e
    paddd        m0, m0
    psubd        m0, m1              ; 2c - (c + e) = c - e
    psrld        m1, 1               ; m1 = prediction (c + e) >> 1
    PABS         m0, m2              ; |c - e|

    ; Add the absolute differences of the samples one position to either
    ; side (loaded from offset -1; see the RSHIFT note in CHECK) and
    ; subtract 1.
    movu         m2, [curq+t1-1*2]
    movu         m3, [curq+t0-1*2]
    mova         m4, m2
    psubusw      m2, m3
    psubusw      m3, m4
    PMAXUW       m2, m3              ; |a - b| for unsigned words
    mova         m3, m2
    RSHIFT       m3, 4
    punpcklwd    m2, m7
    punpcklwd    m3, m7
    paddd        m0, m2
    paddd        m0, m3
    psubd        m0, [pd_1]          ; m0 = score of the starting prediction

    ; Try four candidates in two chains.  Each CHECK2 only competes fairly
    ; in lanes where the CHECK1 just before it took its candidate.
    CHECK -2, 0
    CHECK1
    CHECK -3, 1
    CHECK2
    CHECK 0, -2
    CHECK1
    CHECK 1, -3
    CHECK2

    mova         m6, [rsp+48]        ; diff
    cmp   DWORD r8m, 2
    jge .end%1                       ; r8m >= 2: skip the widening of diff
    LOAD         m2, [%2+t1*2]
    LOAD         m4, [%3+t1*2]
    LOAD         m3, [%2+t0*2]
    LOAD         m5, [%3+t0*2]
    paddd        m2, m4
    paddd        m3, m5
    psrld        m2, 1               ; b = average at offset 2*t1
    psrld        m3, 1               ; f = average at offset 2*t0
    mova         m4, [rsp+ 0]        ; c
    mova         m5, [rsp+16]        ; d
    mova         m7, [rsp+32]        ; e (m7 is re-zeroed at the loop top)
    psubd        m2, m4              ; b - c
    psubd        m3, m7              ; f - e
    mova         m0, m5
    psubd        m5, m4              ; d - c
    psubd        m0, m7              ; d - e
    mova         m4, m2
    PMINSD       m2, m3, m7          ; min(b-c, f-e)
    PMAXSD       m3, m4, m7          ; max(b-c, f-e)
    PMAXSD       m2, m5, m7
    PMINSD       m3, m5, m7
    PMAXSD       m2, m0, m7          ; m2 = max(d-e, d-c, min(b-c, f-e))
    PMINSD       m3, m0, m7          ; m3 = min(d-e, d-c, max(b-c, f-e))
    pxor         m4, m4
    PMAXSD       m6, m3, m7
    psubd        m4, m2              ; -m2
    PMAXSD       m6, m4, m7          ; diff = max(diff, m3, -m2)

.end%1:
    ; Clamp the chosen prediction to [d - diff, d + diff] and store it.
    mova         m2, [rsp+16]
    mova         m3, m2
    psubd        m2, m6
    paddd        m3, m6
    PMAXSD       m1, m2, m7
    PMINSD       m1, m3, m7
    PACK         m1

    movh     [dstq], m1
    add        dstq, mmsize/2
    add       prevq, mmsize/2
    add        curq, mmsize/2
    add       nextq, mmsize/2
    sub   DWORD r4m, mmsize/4        ; count down the samples just written
    jg .loop%1
%endmacro

; YADIF
; Emit yadif_filter_line_16bit for the instruction set chosen by the
; preceding INIT_XMM.
; Declared arguments, in order: dst, prev, cur, next, w, prefs, mrefs,
; parity, mode.
;   t0 = prefs and t1 = mrefs, used by FILTER and CHECK as row offsets
;   parity == 0 runs FILTER on (cur, next); otherwise on (prev, cur)
; NOTE(review): per the x86inc cglobal convention the numeric fields are
; presumably: 4 arguments loaded into registers, 6 or 7 general registers
; used, 8 vector registers used, 80 bytes of stack.  FILTER addresses
; [rsp+0] through [rsp+63].
%macro YADIF 0
%if ARCH_X86_32
cglobal yadif_filter_line_16bit, 4, 6, 8, 80, dst, prev, cur, next, w, \
                                              prefs, mrefs, parity, mode
    mov            r4, r5mp          ; t0 = prefs
    mov            r5, r6mp          ; t1 = mrefs
    DECLARE_REG_TMP 4,5
%else
cglobal yadif_filter_line_16bit, 4, 7, 8, 80, dst, prev, cur, next, w, \
                                              prefs, mrefs, parity, mode
    movsxd         r5, DWORD r5m     ; t0 = prefs, sign-extended to 64 bits
    movsxd         r6, DWORD r6m     ; t1 = mrefs, sign-extended to 64 bits
    DECLARE_REG_TMP 5,6
%endif

    cmp DWORD paritym, 0
    je .parity0
    FILTER 1, prevq, curq
    jmp .ret

.parity0:
    FILTER 0, curq, nextq

.ret:
    RET
%endmacro

; Emit one copy of the function per instruction set.  The cpuflag() tests in
; PABS, PACK and PMAXUW select the instruction sequences used by each copy.
INIT_XMM sse4
YADIF
INIT_XMM ssse3
YADIF
INIT_XMM sse2
YADIF
290