;******************************************************************************
;* Copyright Nick Kurshev
;* Copyright Michael (michaelni@gmx.at)
;* Copyright 2018 Jokyo Images
;* Copyright Ivo van Poorten
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

pb_mask_shuffle2103_mmx times 8 dw 255
pb_shuffle2103: db 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15
pb_shuffle0321: db 0, 3, 2, 1, 4, 7, 6, 5, 8, 11, 10, 9, 12, 15, 14, 13
pb_shuffle1230: db 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
pb_shuffle3012: db 3, 0, 1, 2, 7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14
pb_shuffle3210: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
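; Each pb_shuffle table above is a pshufb control vector: row pb_shuffleABCD
; moves source bytes A, B, C, D into positions 0-3 of every 4-byte group, so
; a single pshufb reorders the channels of four RGB32 pixels per XMM register.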

SECTION .text

%macro RSHIFT_COPY 3
; %1 dst ; %2 src ; %3 shift
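; On AVX the three-operand psrldq writes the shifted result directly into %1
; and leaves %2 untouched; without AVX we must copy first, then shift in
; place (RSHIFT from x86util picks psrlq or psrldq to match the register size).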
%if cpuflag(avx)
    psrldq   %1, %2, %3
%else
    mova     %1, %2
    RSHIFT   %1, %3
%endif
%endmacro

;------------------------------------------------------------------------------
; shuffle_bytes_2103_mmxext (const uint8_t *src, uint8_t *dst, int src_size)
;------------------------------------------------------------------------------
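; Swaps bytes 0 and 2 of every 4-byte pixel (e.g. BGRA <-> RGBA) using plain
; MMX: a word-swapping pshufw plus two byte masks stand in for the pshufb
; that the SSSE3 path below uses.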
INIT_MMX mmxext
cglobal shuffle_bytes_2103, 3, 5, 8, src, dst, w, tmp, x
    mova   m6, [pb_mask_shuffle2103_mmx]
    mova   m7, m6
    psllq  m7, 8
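    ; m6 = 0x00FF in every word (keeps even bytes),
    ; m7 = 0xFF00 in every word (keeps odd bytes)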

    movsxdifnidn wq, wd
    mov xq, wq

    ; point src/dst at the end of the buffer and walk wq up from -w to 0
    add        srcq, wq
    add        dstq, wq
    neg          wq

;calc scalar loop count
    and xq, mmsize * 2 - 4
    je .loop_simd

.loop_scalar:
    mov          tmpb, [srcq + wq + 2]
    mov [dstq+wq + 0], tmpb
    mov          tmpb, [srcq + wq + 1]
    mov [dstq+wq + 1], tmpb
    mov          tmpb, [srcq + wq + 0]
    mov [dstq+wq + 2], tmpb
    mov          tmpb, [srcq + wq + 3]
    mov [dstq+wq + 3], tmpb
    add            wq, 4
    sub            xq, 4
    jg .loop_scalar

; if the scalar loop already consumed everything (src_size < mmsize * 2),
; skip the SIMD loop
    cmp wq, 0
    jge .end

.loop_simd:
    movu     m0, [srcq+wq]
    movu     m1, [srcq+wq+8]

    pshufw   m3, m0, 177    ; 0xB1: swap adjacent words
    pshufw   m5, m1, 177

    pand     m0, m7         ; odd bytes of the original order
    pand     m3, m6         ; even bytes of the word-swapped copy

    pand     m1, m7
    pand     m5, m6

    por      m0, m3         ; merged: bytes 2, 1, 0, 3 of each pixel
    por      m1, m5

    movu      [dstq+wq], m0
    movu  [dstq+wq + 8], m1

    add              wq, mmsize * 2
    jl .loop_simd

.end:
    RET


;------------------------------------------------------------------------------
; shuffle_bytes_## (const uint8_t *src, uint8_t *dst, int src_size)
;------------------------------------------------------------------------------
; %1-4 index shuffle
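; Each instantiation emits shuffle_bytes_%1%2%3%4, copying source bytes
; %1, %2, %3, %4 of every 4-byte pixel into output bytes 0-3; the SIMD loop
; performs the whole permutation with one pshufb per register.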
%macro SHUFFLE_BYTES 4
cglobal shuffle_bytes_%1%2%3%4, 3, 5, 2, src, dst, w, tmp, x
    VBROADCASTI128    m0, [pb_shuffle%1%2%3%4]
    movsxdifnidn wq, wd
    mov xq, wq

    add        srcq, wq
    add        dstq, wq
    neg          wq

;calc scalar loop count
    and xq, mmsize - 4
    je .loop_simd

.loop_scalar:
    mov          tmpb, [srcq + wq + %1]
    mov [dstq+wq + 0], tmpb
    mov          tmpb, [srcq + wq + %2]
    mov [dstq+wq + 1], tmpb
    mov          tmpb, [srcq + wq + %3]
    mov [dstq+wq + 2], tmpb
    mov          tmpb, [srcq + wq + %4]
    mov [dstq+wq + 3], tmpb
    add            wq, 4
    sub            xq, 4
    jg .loop_scalar

; if the scalar loop already consumed everything (src_size < mmsize),
; skip the SIMD loop
    cmp wq, 0
    jge .end

.loop_simd:
    movu           m1, [srcq+wq]
    pshufb         m1, m0      ; permute bytes within each 4-byte pixel
    movu    [dstq+wq], m1
    add            wq, mmsize
    jl .loop_simd

.end:
    RET
%endmacro

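; pshufb is an SSSE3 instruction, so SSSE3 is the baseline for these versions.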
INIT_XMM ssse3
SHUFFLE_BYTES 2, 1, 0, 3
SHUFFLE_BYTES 0, 3, 2, 1
SHUFFLE_BYTES 1, 2, 3, 0
SHUFFLE_BYTES 3, 0, 1, 2
SHUFFLE_BYTES 3, 2, 1, 0

%if ARCH_X86_64
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
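; ymm pshufb shuffles each 128-bit lane independently; the VBROADCASTI128 in
; the macro duplicates the 16-byte mask into both lanes, which is exactly
; what this per-dword permutation needs.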
SHUFFLE_BYTES 2, 1, 0, 3
SHUFFLE_BYTES 0, 3, 2, 1
SHUFFLE_BYTES 1, 2, 3, 0
SHUFFLE_BYTES 3, 0, 1, 2
SHUFFLE_BYTES 3, 2, 1, 0
%endif
%endif

;-----------------------------------------------------------------------------------------------
; uyvytoyuv422(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
;              const uint8_t *src, int width, int height,
;              int lumStride, int chromStride, int srcStride)
;-----------------------------------------------------------------------------------------------
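; Splits packed UYVY (U0 Y0 V0 Y1 per pixel pair) into planar YUV 4:2:2:
; one full-width Y plane plus half-width U and V planes per line.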
%macro UYVY_TO_YUV422 0
cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_stride, src_stride, wtwo, whalf, tmp, x, back_w
    pxor         m0, m0
    pcmpeqw      m1, m1
    psrlw        m1, 8
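    ; m1 = 0x00FF in every word: pand with it keeps the low byte of each
    ; 16-bit unit and zeroes the high one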

    movsxdifnidn            wq, wd
    movsxdifnidn   lum_strideq, lum_strided
    movsxdifnidn chrom_strideq, chrom_strided
    movsxdifnidn   src_strideq, src_strided

    mov     back_wq, wq
    mov      whalfq, wq
    shr      whalfq, 1     ; whalf = width / 2

    lea srcq, [srcq + wq * 2]
    add    ydstq, wq
    add    udstq, whalfq
    add    vdstq, whalfq

.loop_line:
    mov          xq, wq
    mov       wtwoq, wq
    add       wtwoq, wtwoq ; wtwo = width * 2

    neg       wq
    neg    wtwoq
    neg   whalfq

    ;calc scalar loop count
    and       xq, mmsize * 2 - 1
    je .loop_simd

    .loop_scalar:
        mov             tmpb, [srcq + wtwoq + 0]
        mov [udstq + whalfq], tmpb

        mov             tmpb, [srcq + wtwoq + 1]
        mov     [ydstq + wq], tmpb

        mov             tmpb, [srcq + wtwoq + 2]
        mov [vdstq + whalfq], tmpb

        mov             tmpb, [srcq + wtwoq + 3]
        mov [ydstq + wq + 1], tmpb

        add      wq, 2
        add   wtwoq, 4
        add  whalfq, 1
        sub      xq, 2
        jg .loop_scalar

    ; skip the SIMD loop if the scalar loop already handled the whole line
    cmp      wq, 0
    jge .end_line

    .loop_simd:
        movu    m2, [srcq + wtwoq             ]
        movu    m3, [srcq + wtwoq + mmsize    ]
        movu    m4, [srcq + wtwoq + mmsize * 2]
        movu    m5, [srcq + wtwoq + mmsize * 3]

        ; extract y part 1
        RSHIFT_COPY    m6, m2, 1 ; UYVY UYVY -> YVYU YVY...
        pand           m6, m1 ; YxYx YxYx...

        RSHIFT_COPY    m7, m3, 1 ; UYVY UYVY -> YVYU YVY...
        pand           m7, m1 ; YxYx YxYx...

        packuswb       m6, m7 ; YYYY YYYY...
        movu [ydstq + wq], m6

        ; extract y part 2
        RSHIFT_COPY    m6, m4, 1 ; UYVY UYVY -> YVYU YVY...
        pand           m6, m1 ; YxYx YxYx...

        RSHIFT_COPY    m7, m5, 1 ; UYVY UYVY -> YVYU YVY...
        pand           m7, m1 ; YxYx YxYx...

        packuswb                m6, m7 ; YYYY YYYY...
        movu [ydstq + wq + mmsize], m6

        ; extract uv
        pand       m2, m1   ; UxVx...
        pand       m3, m1   ; UxVx...
        pand       m4, m1   ; UxVx...
        pand       m5, m1   ; UxVx...

        packuswb   m2, m3   ; UVUV...
        packuswb   m4, m5   ; UVUV...

        ; U
        pand       m6, m2, m1 ; UxUx...
        pand       m7, m4, m1 ; UxUx...

        packuswb   m6, m7 ; UUUU
        movu   [udstq + whalfq], m6

        ; V
        psrlw      m2, 8  ; VxVx...
        psrlw      m4, 8  ; VxVx...
        packuswb   m2, m4 ; VVVV
        movu   [vdstq + whalfq], m2

        add   whalfq, mmsize
        add    wtwoq, mmsize * 4
        add       wq, mmsize * 2
        jl .loop_simd

    .end_line:
        add         srcq, src_strideq
        add        ydstq, lum_strideq
        add        udstq, chrom_strideq
        add        vdstq, chrom_strideq

        ; restore the per-line counters for the next line
        mov           wq, back_wq
        mov           xq, wq
        mov       whalfq, wq
        shr       whalfq, 1     ; whalf = width / 2
        sub           hd, 1
        jg .loop_line

    RET
%endmacro


%if ARCH_X86_64
INIT_XMM sse2
UYVY_TO_YUV422

INIT_XMM avx
UYVY_TO_YUV422
%endif
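; The avx instantiation runs the same algorithm; its main gain is the
; non-destructive three-operand psrldq taken in RSHIFT_COPY, which drops a
; mova per shift.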