;******************************************************************************
;* Copyright Nick Kurshev
;* Copyright Michael (michaelni@gmx.at)
;* Copyright 2018 Jokyo Images
;* Copyright Ivo van Poorten
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
23
24%include "libavutil/x86/x86util.asm"
25
26SECTION_RODATA
27
28pb_mask_shuffle2103_mmx times 8 dw 255
29pb_shuffle2103: db 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15
30pb_shuffle0321: db 0, 3, 2, 1, 4, 7, 6, 5, 8, 11, 10, 9, 12, 15, 14, 13
31pb_shuffle1230: db 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
32pb_shuffle3012: db 3, 0, 1, 2, 7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14
33pb_shuffle3210: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
34
35SECTION .text
36
37%macro RSHIFT_COPY 3
38; %1 dst ; %2 src ; %3 shift
39%if cpuflag(avx)
40    psrldq  %1, %2, %3
41%else
42    mova           %1, %2
43    RSHIFT         %1, %3
44%endif
45%endmacro
46
47;------------------------------------------------------------------------------
48; shuffle_bytes_2103_mmext (const uint8_t *src, uint8_t *dst, int src_size)
49;------------------------------------------------------------------------------
50INIT_MMX mmxext
51cglobal shuffle_bytes_2103, 3, 5, 8, src, dst, w, tmp, x
52    mova   m6, [pb_mask_shuffle2103_mmx]
53    mova   m7, m6
54    psllq  m7, 8
55
56    movsxdifnidn wq, wd
57    mov xq, wq
58
59    add        srcq, wq
60    add        dstq, wq
61    neg          wq
62
63;calc scalar loop
64    and xq, mmsize*2 -4
65    je .loop_simd
66
67.loop_scalar:
68   mov          tmpb, [srcq + wq + 2]
69   mov [dstq+wq + 0], tmpb
70   mov          tmpb, [srcq + wq + 1]
71   mov [dstq+wq + 1], tmpb
72   mov          tmpb, [srcq + wq + 0]
73   mov [dstq+wq + 2], tmpb
74   mov          tmpb, [srcq + wq + 3]
75   mov [dstq+wq + 3], tmpb
76   add            wq, 4
77   sub            xq, 4
78   jg .loop_scalar
79
80;check if src_size < mmsize * 2
81cmp wq, 0
82jge .end
83
84.loop_simd:
85    movu     m0, [srcq+wq]
86    movu     m1, [srcq+wq+8]
87
88    pshufw   m3, m0, 177
89    pshufw   m5, m1, 177
90
91    pand     m0, m7
92    pand     m3, m6
93
94    pand     m1, m7
95    pand     m5, m6
96
97    por      m0, m3
98    por      m1, m5
99
100    movu      [dstq+wq], m0
101    movu  [dstq+wq + 8], m1
102
103    add              wq, mmsize*2
104    jl .loop_simd
105
106.end:
107    RET
108
109;------------------------------------------------------------------------------
110; shuffle_bytes_## (const uint8_t *src, uint8_t *dst, int src_size)
111;------------------------------------------------------------------------------
112; %1-4 index shuffle
113%macro SHUFFLE_BYTES 4
114cglobal shuffle_bytes_%1%2%3%4, 3, 5, 2, src, dst, w, tmp, x
115    VBROADCASTI128    m0, [pb_shuffle%1%2%3%4]
116    movsxdifnidn wq, wd
117    mov xq, wq
118
119    add        srcq, wq
120    add        dstq, wq
121    neg          wq
122
123;calc scalar loop
124    and xq, mmsize-4
125    je .loop_simd
126
127.loop_scalar:
128   mov          tmpb, [srcq + wq + %1]
129   mov [dstq+wq + 0], tmpb
130   mov          tmpb, [srcq + wq + %2]
131   mov [dstq+wq + 1], tmpb
132   mov          tmpb, [srcq + wq + %3]
133   mov [dstq+wq + 2], tmpb
134   mov          tmpb, [srcq + wq + %4]
135   mov [dstq+wq + 3], tmpb
136   add            wq, 4
137   sub            xq, 4
138   jg .loop_scalar
139
140;check if src_size < mmsize
141cmp wq, 0
142jge .end
143
144.loop_simd:
145    movu           m1, [srcq+wq]
146    pshufb         m1, m0
147    movu    [dstq+wq], m1
148    add            wq, mmsize
149    jl .loop_simd
150
151.end:
152    RET
153%endmacro
154
155INIT_XMM ssse3
156SHUFFLE_BYTES 2, 1, 0, 3
157SHUFFLE_BYTES 0, 3, 2, 1
158SHUFFLE_BYTES 1, 2, 3, 0
159SHUFFLE_BYTES 3, 0, 1, 2
160SHUFFLE_BYTES 3, 2, 1, 0
161
162;-----------------------------------------------------------------------------------------------
163; uyvytoyuv422(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
164;              const uint8_t *src, int width, int height,
165;              int lumStride, int chromStride, int srcStride)
166;-----------------------------------------------------------------------------------------------
167%macro UYVY_TO_YUV422 0
168cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_stride, src_stride, wtwo, whalf, tmp, x, back_w
169    pxor         m0, m0
170    pcmpeqw      m1, m1
171    psrlw        m1, 8
172
173    movsxdifnidn            wq, wd
174    movsxdifnidn   lum_strideq, lum_strided
175    movsxdifnidn chrom_strideq, chrom_strided
176    movsxdifnidn   src_strideq, src_strided
177
178    mov     back_wq, wq
179    mov      whalfq, wq
180    shr      whalfq, 1     ; whalf = width / 2
181
182    lea srcq, [srcq + wq * 2]
183    add    ydstq, wq
184    add    udstq, whalfq
185    add    vdstq, whalfq
186
187.loop_line:
188    mov          xq, wq
189    mov       wtwoq, wq
190    add       wtwoq, wtwoq ; wtwo = width * 2
191
192    neg       wq
193    neg    wtwoq
194    neg   whalfq
195
196    ;calc scalar loop count
197    and       xq, mmsize * 2 - 1
198    je .loop_simd
199
200    .loop_scalar:
201        mov             tmpb, [srcq + wtwoq + 0]
202        mov [udstq + whalfq], tmpb
203
204        mov             tmpb, [srcq + wtwoq + 1]
205        mov     [ydstq + wq], tmpb
206
207        mov             tmpb, [srcq + wtwoq + 2]
208        mov [vdstq + whalfq], tmpb
209
210        mov             tmpb, [srcq + wtwoq + 3]
211        mov [ydstq + wq + 1], tmpb
212
213        add      wq, 2
214        add   wtwoq, 4
215        add  whalfq, 1
216        sub      xq, 2
217        jg .loop_scalar
218
219    ; check if simd loop is need
220    cmp      wq, 0
221    jge .end_line
222
223    .loop_simd:
224        movu    m2, [srcq + wtwoq             ]
225        movu    m3, [srcq + wtwoq + mmsize    ]
226        movu    m4, [srcq + wtwoq + mmsize * 2]
227        movu    m5, [srcq + wtwoq + mmsize * 3]
228
229        ; extract y part 1
230        RSHIFT_COPY    m6, m2, 1 ; UYVY UYVY -> YVYU YVY...
231        pand           m6, m1; YxYx YxYx...
232
233        RSHIFT_COPY    m7, m3, 1 ; UYVY UYVY -> YVYU YVY...
234        pand           m7, m1 ; YxYx YxYx...
235
236        packuswb       m6, m7 ; YYYY YYYY...
237        movu [ydstq + wq], m6
238
239        ; extract y part 2
240        RSHIFT_COPY    m6, m4, 1 ; UYVY UYVY -> YVYU YVY...
241        pand           m6, m1; YxYx YxYx...
242
243        RSHIFT_COPY    m7, m5, 1 ; UYVY UYVY -> YVYU YVY...
244        pand           m7, m1 ; YxYx YxYx...
245
246        packuswb                m6, m7 ; YYYY YYYY...
247        movu [ydstq + wq + mmsize], m6
248
249        ; extract uv
250        pand       m2, m1   ; UxVx...
251        pand       m3, m1   ; UxVx...
252        pand       m4, m1   ; UxVx...
253        pand       m5, m1   ; UxVx...
254
255        packuswb   m2, m3   ; UVUV...
256        packuswb   m4, m5   ; UVUV...
257
258        ; U
259        pand       m6, m2, m1 ; UxUx...
260        pand       m7, m4, m1 ; UxUx...
261
262        packuswb m6, m7 ; UUUU
263        movu   [udstq + whalfq], m6
264
265
266        ; V
267        psrlw      m2, 8  ; VxVx...
268        psrlw      m4, 8  ; VxVx...
269        packuswb   m2, m4 ; VVVV
270        movu   [vdstq + whalfq], m2
271
272        add   whalfq, mmsize
273        add    wtwoq, mmsize * 4
274        add       wq, mmsize * 2
275        jl .loop_simd
276
277    .end_line:
278        add        srcq, src_strideq
279        add        ydstq, lum_strideq
280        add        udstq, chrom_strideq
281        add        vdstq, chrom_strideq
282
283        ;restore initial state of line variable
284        mov           wq, back_wq
285        mov          xq, wq
286        mov      whalfq, wq
287        shr      whalfq, 1     ; whalf = width / 2
288        sub          hd, 1
289        jg .loop_line
290
291    RET
292%endmacro
293
294%if ARCH_X86_64
295INIT_XMM sse2
296UYVY_TO_YUV422
297
298INIT_XMM avx
299UYVY_TO_YUV422
300%endif
301