;*****************************************************************************
;* x86-optimized functions for yadif filter
;* Copyright (C) 2020 Vivia Nikolaidou <vivia.nikolaidou@ltnglobal.com>
;*
;* Based on libav's vf_yadif.asm file
;* Copyright (C) 2006 Michael Niedermayer <michaelni@gmx.at>
;* Copyright (c) 2013 Daniel Kang <daniel.d.kang@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86inc.asm"

SECTION_RODATA

; 16 bytes of value 1
pb_1: times 16 db 1
; 8 words of value 1
pw_1: times  8 dw 1

SECTION .text

%macro ABS1 2
%if cpuflag(ssse3)
    pabsw   %1, %1
%elif cpuflag(mmxext) ; a, tmp
    pxor    %2, %2
    psubw   %2, %1
    pmaxsw  %1, %2
%else ; a, tmp
    pxor       %2, %2
    pcmpgtw    %2, %1
    pxor       %1, %2
    psubw      %1, %2
%endif
%endmacro
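
; All three paths compute the same thing per 16-bit lane (reference sketch):
;     a = FFABS(a);   // pabsw, or max(a, -a), or the sign-mask trick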

%macro CHECK 2
; %1 = -1+j, %2 = -1-j, so the loads start at t0[x-1+j] and b0[x-1-j]
    ; m2 = t0[x-1+j]
    movu      m2, [tzeroq+%1]
    ; m3 = b0[x-1-j]
    movu      m3, [bzeroq+%2]
    ; m4 = t0[x-1+j]
    mova      m4, m2
    ; m5 = t0[x-1+j]
    mova      m5, m2
    ; m4 = xor(t0[x-1+j], b0[x-1-j])
    pxor      m4, m3
    ; m5 = average, rounded up
    pavgb     m5, m3
    ; m4 = 1 wherever the average was rounded up
    pand      m4, [pb_1]
    ; m5 = rounded-down average
    psubusb   m5, m4
    ; shift right by one byte so lane 0 holds the average at x+j,
    ; the spatial_pred candidate
    psrldq    m5, 1
    ; m7 = 0
    ; Interleave low-order bytes with 0
    ; so one pixel doesn't spill into the next one
    punpcklbw m5, m7
    ; m4 = t0[x-1+j] (reset)
    mova      m4, m2
    ; m2 = t0[x-1+j] - b0[x-1-j] (unsigned saturation)
    psubusb   m2, m3
    ; m3 = b0[x-1-j] - t0[x-1+j] (the other direction)
    psubusb   m3, m4
    ; m2 = FFABS(t0[x-1+j] - b0[x-1-j])
    pmaxub    m2, m3
    ; m3 = FFABS(t0[x-1+j] - b0[x-1-j])
    mova      m3, m2
    ; m4 = FFABS(t0[x-1+j] - b0[x-1-j])
    mova      m4, m2
    ; m3 = FFABS(t0[x+j] - b0[x-j])
    psrldq    m3, 1
    ; m4 = FFABS(t0[x+1+j] - b0[x+1-j])
    psrldq    m4, 2
    ; prevent pixel spilling for all of them
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m4, m7
    paddw     m2, m3
    ; m2 = score
    paddw     m2, m4
%endmacro
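
; Per-pixel C model of CHECK (reference sketch; the macro does this for 8
; pixels in parallel, leaving the score in m2 and the prediction in m5):
;     score = FFABS(t0[x-1+j] - b0[x-1-j])
;           + FFABS(t0[x  +j] - b0[x  -j])
;           + FFABS(t0[x+1+j] - b0[x+1-j]);
;     pred  = (t0[x+j] + b0[x-j]) >> 1;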

%macro CHECK1 0
; m0 was spatial_score
; m1 was spatial_pred
    mova    m3, m0
    ; compare for greater than
    ; each word becomes all ones (update) or all zeros (keep)
    pcmpgtw m3, m2
    ; if (score < spatial_score) spatial_score = score;
    pminsw  m0, m2
    ; m6 = the mask, kept for CHECK2
    mova    m6, m3
    ; m5 = the new prediction where it should change, 0 elsewhere
    pand    m5, m3
    ; nand: m3 = the old spatial_pred where it should not change, 0 elsewhere
    pandn   m3, m1
    ; m3 = put them together in an OR
    por     m3, m5
    ; and put it in spatial_pred
    mova    m1, m3
%endmacro
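
; Per-pixel C model of CHECK1 (reference sketch; the branch is realized with
; the pcmpgtw mask instead of a jump):
;     if (score < spatial_score) {
;         spatial_score = score;
;         spatial_pred  = pred;
;     }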

%macro CHECK2 0
; m6 is still the mask from CHECK1
    ; m6 = 0 where CHECK1 updated the score, 1 where it did not
    paddw   m6, [pw_1]
    ; shift each word left by 14 bits: 0x4000 where CHECK1 did not update
    psllw   m6, 14
    ; add it to the score, pushing the score out of range in those lanes,
    ; so this check can only win where the previous one did
    paddsw  m2, m6
    ; same as CHECK1
    mova    m3, m0
    pcmpgtw m3, m2
    pminsw  m0, m2
    pand    m5, m3
    pandn   m3, m1
    por     m3, m5
    mova    m1, m3
%endmacro
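
; Per-pixel C model of CHECK2 (reference sketch; check1_updated stands for the
; mask left behind by CHECK1). The 0x4000 bias mimics the nesting of the scalar
; code, where the j = +/-2 check only runs inside the j = +/-1 branch; a real
; score is at most 3 * 255, so a biased score can never win:
;     if (check1_updated && score < spatial_score) {
;         spatial_score = score;
;         spatial_pred  = pred;
;     }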

%macro LOAD 2
    ; load mmsize/2 pixels and zero-extend the bytes to words (m7 must be 0)
    movh      %1, %2
    punpcklbw %1, m7
%endmacro

%macro FILTER_HEAD 0
    ; m7 = 0
    pxor         m7, m7
    ; m0 = c
    LOAD         m0, [tzeroq]
    ; m1 = e
    LOAD         m1, [bzeroq]
    ; m3 = mp
    LOAD         m3, [mpq]
    ; m2 = m1
    LOAD         m2, [moneq]
    ; m4 = mp
    mova         m4, m3
    ; m3 = m1 + mp
    paddw        m3, m2
    ; m3 = d
    psraw        m3, 1
    ; rsp + 0 = d
    mova   [rsp+ 0], m3
    ; rsp + 16 = e
    mova   [rsp+16], m1
    ; m2 = m1 - mp
    psubw        m2, m4
    ; m2 = temporal_diff0 (m4 is temporary)
    ABS1         m2, m4
    ; m3 = t2
    LOAD         m3, [ttwoq]
    ; m4 = b2
    LOAD         m4, [btwoq]
    ; m3 = t2 - c
    psubw        m3, m0
    ; m4 = b2 - e
    psubw        m4, m1
    ; m3 = ABS(t2 - c)
    ABS1         m3, m5
    ; m4 = ABS(b2 - e)
    ABS1         m4, m5
    paddw        m3, m4
    ; m2 = temporal_diff0 >> 1
    psrlw        m2, 1
    ; m3 = temporal_diff1
    psrlw        m3, 1
    ; m2 = FFMAX(temporal_diff0 >> 1, temporal_diff1)
    pmaxsw       m2, m3
    ; m3 = tp2
    LOAD         m3, [tptwoq]
    ; m4 = bp2
    LOAD         m4, [bptwoq]
    psubw        m3, m0
    psubw        m4, m1
    ABS1         m3, m5
    ABS1         m4, m5
    paddw        m3, m4
    ; m3 = temporal_diff2
    psrlw        m3, 1
    ; m2 = diff (for real)
    pmaxsw       m2, m3
    ; rsp + 32 = diff
    mova   [rsp+32], m2

    ; m1 = e + c
    paddw        m1, m0
    ; m0 = 2c
    paddw        m0, m0
    ; m0 = c - e
    psubw        m0, m1
    ; m1 = spatial_pred
    psrlw        m1, 1
    ; m0 = FFABS(c-e)
    ABS1         m0, m2

    ; m2 = t0[x-1]
    ; if it's unpacked it should contain 4 bytes
    movu         m2, [tzeroq-1]
    ; m3 = b0[x-1]
    movu         m3, [bzeroq-1]
    ; m4 = t0[x-1]
    mova         m4, m2
    ; m2 = t0[x-1]-b0[x-1] unsigned packed
    psubusb      m2, m3
    ; m3 = m3 - m4 = b0[x-1]-t0[x-1] = -m2 unsigned packed
    psubusb      m3, m4
    ; m2 = max(m2, -m2) = abs(t0[x-1]-b0[x-1])
    pmaxub       m2, m3
%if mmsize == 16
    ; m3 = m2 shifted right by 2 bytes
    ; pixel jump: go from x-1 to x+1
    mova         m3, m2
    psrldq       m3, 2
%else
    pshufw       m3, m2, q0021
%endif
    ; m7 = 0
    ; unpack and interleave low-order bytes
    ; to prevent pixel spilling when adding
    punpcklbw    m2, m7
    punpcklbw    m3, m7
    paddw        m0, m2
    paddw        m0, m3
    ; m0 = spatial_score
    psubw        m0, [pw_1]

    ; j = -1
    CHECK -2, 0
    CHECK1
    ; j = -2
    CHECK -3, 1
    CHECK2
    ; j = +1
    CHECK 0, -2
    CHECK1
    ; j = +2
    CHECK 1, -3
    CHECK2
    ; now m0 = spatial_score, m1 = spatial_pred

    ; m6 = diff
    mova         m6, [rsp+32]
%endmacro
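
; Per-pixel C model of FILTER_HEAD (reference sketch, using the names from the
; comments above):
;     c = t0[x];  e = b0[x];
;     d              = (m1[x] + mp[x]) >> 1;
;     temporal_diff0 = FFABS(m1[x] - mp[x]);
;     temporal_diff1 = (FFABS(t2[x] - c) + FFABS(b2[x] - e)) >> 1;
;     temporal_diff2 = (FFABS(tp2[x] - c) + FFABS(bp2[x] - e)) >> 1;
;     diff           = FFMAX3(temporal_diff0 >> 1, temporal_diff1, temporal_diff2);
;     spatial_pred   = (c + e) >> 1;
;     spatial_score  = FFABS(t0[x-1] - b0[x-1]) + FFABS(c - e)
;                    + FFABS(t0[x+1] - b0[x+1]) - 1;
; followed by the edge-directed checks for j = -1, -2, +1, +2.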

%macro FILTER_TAIL 0
    ; m2 = d
    mova         m2, [rsp]
    ; m3 = d
    mova         m3, m2
    ; m2 = d - diff
    psubw        m2, m6
    ; m3 = d + diff
    paddw        m3, m6
    ; m1 = max(spatial_pred, d-diff)
    pmaxsw       m1, m2
    ; m1 = min(d + diff, max(spatial_pred, d-diff))
    ; m1 = spatial_pred
    pminsw       m1, m3
    ; pack the 8 signed words back to unsigned bytes with saturation
    packuswb     m1, m1

    ; dst = spatial_pred
    movh     [dstq], m1
    ; advance all pointers by the mmsize/2 pixels we just processed
    add        dstq, mmsize/2
    add        tzeroq, mmsize/2
    add        bzeroq, mmsize/2
    add        moneq, mmsize/2
    add        mpq, mmsize/2
    add        ttwoq, mmsize/2
    add        btwoq, mmsize/2
    add        tptwoq, mmsize/2
    add        bptwoq, mmsize/2
    add        ttoneq, mmsize/2
    add        ttpq, mmsize/2
    add        bboneq, mmsize/2
    add        bbpq, mmsize/2
%endmacro
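
; Per-pixel C model of FILTER_TAIL (reference sketch): clamp the prediction to
; the temporal window and write it out:
;     spatial_pred = FFMIN(FFMAX(spatial_pred, d - diff), d + diff);
;     dst[x] = spatial_pred;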

%macro FILTER_MODE0 0
.loop0:
    FILTER_HEAD
    ; m2 = tt1
    LOAD         m2, [ttoneq]
    ; m4 = ttp
    LOAD         m4, [ttpq]
    ; m3 = bb1
    LOAD         m3, [bboneq]
    ; m5 = bbp
    LOAD         m5, [bbpq]
    paddw        m2, m4
    paddw        m3, m5
    ; m2 = b
    psrlw        m2, 1
    ; m3 = f
    psrlw        m3, 1
    ; m4 = c
    LOAD         m4, [tzeroq]
    ; m5 = d
    mova         m5, [rsp]
    ; m7 = e
    mova         m7, [rsp+16]
    ; m2 = b - c
    psubw        m2, m4
    ; m3 = f - e
    psubw        m3, m7
    ; m0 = d
    mova         m0, m5
    ; m5 = d - c
    psubw        m5, m4
    ; m0 = d - e
    psubw        m0, m7
    ; m4 = b - c
    mova         m4, m2
    ; m2 = FFMIN(b-c, f-e)
    pminsw       m2, m3
    ; m3 = FFMAX(f-e, b-c)
    pmaxsw       m3, m4
    ; m2 = FFMAX(d-c, FFMIN(b-c, f-e))
    pmaxsw       m2, m5
    ; m3 = FFMIN(d-c, FFMAX(f-e, b-c))
    pminsw       m3, m5
    ; m2 = max
    pmaxsw       m2, m0
    ; m3 = min
    pminsw       m3, m0
    ; m4 = 0
    pxor         m4, m4
    ; m6 = MAX(diff, min)
    pmaxsw       m6, m3
    ; m4 = -max
    psubw        m4, m2
    ; m6 = diff
    pmaxsw       m6, m4

    FILTER_TAIL
    ; r13m = w
    sub   DWORD r13m, mmsize/2
    jg .loop0
%endmacro
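
; Per-pixel C model of the extra mode-0 check (reference sketch, using the
; names from the comments above):
;     b = (tt1[x] + ttp[x]) >> 1;
;     f = (bb1[x] + bbp[x]) >> 1;
;     max = FFMAX3(d - e, d - c, FFMIN(b - c, f - e));
;     min = FFMIN3(d - e, d - c, FFMAX(b - c, f - e));
;     diff = FFMAX3(diff, min, -max);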

%macro FILTER_MODE2 0
.loop2:
    FILTER_HEAD
    FILTER_TAIL
    ; r13m = w
    sub   DWORD r13m, mmsize/2
    jg .loop2
%endmacro

%macro YADIF_ADD3 0
    ; start 3 pixels later
    add        dstq, 3
    add        tzeroq, 3
    add        bzeroq, 3
    add        moneq, 3
    add        mpq, 3
    add        ttwoq, 3
    add        btwoq, 3
    add        tptwoq, 3
    add        bptwoq, 3
    add        ttoneq, 3
    add        ttpq, 3
    add        bboneq, 3
    add        bbpq, 3
%endmacro

; cglobal foo, 2,3,7,0x40, dst, src, tmp
; declares a function (foo) that automatically loads two arguments (dst and
; src) into registers, uses one additional register (tmp) plus 7 vector
; registers (m0-m6) and allocates 0x40 bytes of stack space.
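; For the functions below: 13 arguments are loaded into registers, 14 GPRs and
; 8 vector registers are used in total, and 80 bytes of stack are reserved
; (FILTER_HEAD keeps d at [rsp], e at [rsp+16] and diff at [rsp+32]); the last
; argument, w, is read and decremented directly in its stack slot as r13m.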
%macro YADIF_MODE0 0
cglobal yadif_filter_line_mode0, 13, 14, 8, 80, dst, tzero, bzero, mone, mp, \
                                        ttwo, btwo, tptwo, bptwo, ttone, \
                                        ttp, bbone, bbp, w

    YADIF_ADD3
    FILTER_MODE0
    RET
%endmacro

%macro YADIF_MODE2 0
cglobal yadif_filter_line_mode2, 13, 14, 8, 80, dst, tzero, bzero, mone, mp, \
                                        ttwo, btwo, tptwo, bptwo, ttone, \
                                        ttp, bbone, bbp, w

    YADIF_ADD3
    FILTER_MODE2
    RET
%endmacro

; declares two functions for ssse3, and two for sse2
INIT_XMM ssse3
YADIF_MODE0
YADIF_MODE2
INIT_XMM sse2
YADIF_MODE0
YADIF_MODE2