;******************************************************************************
;* x86-optimized yuv2yuvX
;* Copyright 2020 Google LLC
;* Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

;-----------------------------------------------------------------------------
; yuv2yuvX
;
; void ff_yuv2yuvX_<opt>(const int16_t *filter, int filterSize,
;                        int srcOffset, uint8_t *dest, int dstW,
;                        const uint8_t *dither, int offset);
;
;-----------------------------------------------------------------------------

; Vertical scaling: dest[i] = clip_u8((dither_bias + sum_j coeff[j]*src[j][i]) >> scale).
; `filter` points to a list of 16-byte {const int16_t *src; int16_t coeff (replicated)}
; entries, terminated by a NULL src pointer.
; Register roles after setup:
;   filterSizeq - cursor walking the {ptr,coeff} filter list
;   srcq        - current source plane pointer (loaded from the list)
;   offsetq     - current output column (bytes into dest)
;   m7          - per-column dither/rounding bias, live across both loops
; NOTE: `$`-prefixed constants below are NASM hexadecimal ($18 = 24, $28 = 40, $10 = 16).
%macro YUV2YUVX_FUNC 0
cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src, dest, dstW, dither, offset
%if notcpuflag(sse3)
%define movr mova                               ; pre-SSE3: aligned store, 1x unroll
%define unroll 1
%else
%define movr movdqu                             ; SSE3+: unaligned store, 2x unroll
%define unroll 2
%endif
    movsxdifnidn         dstWq, dstWd           ; sign-extend 32-bit args on x86-64
    movsxdifnidn         offsetq, offsetd
    movsxdifnidn         srcq, srcd
%if cpuflag(avx2)
    vpbroadcastq         m3, [ditherq]          ; replicate the 8 dither bytes per lane
%else
    movq                 xm3, [ditherq]         ; load the 8 dither bytes
%endif ; avx2
    cmp                  offsetd, 0
    jz                   .offset

    ; offset != 0 path: rotate the 8 dither bytes left by 40 bits
    ; (>>0x18 | <<0x28) so the dither pattern stays phase-aligned with the
    ; output position.
    psrlq                m5, m3, $18
    psllq                m3, m3, $28
    por                  m3, m3, m5

.offset:
    add offsetq, srcq                           ; column start = offset + srcOffset
    movd                 xm1, filterSized
    SPLATW               m1, xm1, 0             ; broadcast filterSize into all words
    pxor                 m0, m0, m0
    mov                  filterSizeq, filterq   ; reuse filterSizeq as the list cursor
    mov                  srcq, [filterSizeq]    ; first source plane pointer
    punpcklbw            m3, m0                 ; widen dither bytes to words
    psllw                m1, m1, 3
    paddw                m3, m3, m1
    psraw                m7, m3, 4              ; bias = (dither + 8*filterSize) >> 4
.outerloop:
    ; Seed the accumulators with the dither bias for this column group.
    mova                 m4, m7
    mova                 m3, m7
%if cpuflag(sse3)
    mova                 m6, m7
    mova                 m1, m7
%endif
.loop:
%if cpuflag(avx2)
    vpbroadcastq         m0, [filterSizeq + 8]  ; coeff (pre-replicated words) at entry+8
%elif cpuflag(sse3)
    movddup              m0, [filterSizeq + 8]
%else
    mova                 m0, [filterSizeq + 8]
%endif
    ; Accumulate the high halves of coeff*src for `unroll` registers of pixels.
    pmulhw               m2, m0, [srcq + offsetq * 2]
    pmulhw               m5, m0, [srcq + offsetq * 2 + mmsize]
    paddw                m3, m3, m2
    paddw                m4, m4, m5
%if cpuflag(sse3)
    pmulhw               m2, m0, [srcq + offsetq * 2 + 2 * mmsize]
    pmulhw               m5, m0, [srcq + offsetq * 2 + 3 * mmsize]
    paddw                m6, m6, m2
    paddw                m1, m1, m5
%endif
    add                  filterSizeq, $10       ; advance to the next 16-byte entry
    mov                  srcq, [filterSizeq]
    test                 srcq, srcq             ; NULL src pointer terminates the list
    jnz                  .loop
    ; Scale back and saturate to unsigned bytes.
    psraw                m3, m3, 3
    psraw                m4, m4, 3
%if cpuflag(sse3)
    psraw                m6, m6, 3
    psraw                m1, m1, 3
%endif
    packuswb             m3, m3, m4
%if cpuflag(sse3)
    packuswb             m6, m6, m1
%endif
    mov                  srcq, [filterq]        ; rewind to the first source plane
%if cpuflag(avx2)
    ; 0xD8 = 11011000b: undo the per-128-bit-lane interleave of vpackuswb.
    vpermq               m3, m3, 216
    vpermq               m6, m6, 216
%endif
    movr                 [destq + offsetq], m3
%if cpuflag(sse3)
    movr                 [destq + offsetq + mmsize], m6
%endif
    add                  offsetq, mmsize * unroll
    mov                  filterSizeq, filterq   ; restart the list cursor
    cmp                  offsetq, dstWq
    jb                   .outerloop
    REP_RET
%endmacro
; Instantiate ff_yuv2yuvX_<opt> for each SIMD level; the AVX2 version is only
; emitted when the assembler supports it.
INIT_MMX mmx
YUV2YUVX_FUNC
INIT_MMX mmxext
YUV2YUVX_FUNC
INIT_XMM sse3
YUV2YUVX_FUNC
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
YUV2YUVX_FUNC
%endif