;******************************************************************************
;* x86-optimized vertical line scaling functions
;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
;*                    Kieran Kunhya <kieran@kunhya.com>
;*           (c) 2020 Nelson Gomez <nelson.gomez@microsoft.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32

minshort:      times 8 dw 0x8000
yuv2yuvX_16_start:  times 4 dd 0x4000 - 0x40000000
yuv2yuvX_10_start:  times 4 dd 0x10000
yuv2yuvX_9_start:   times 4 dd 0x20000
yuv2yuvX_10_upper:  times 8 dw 0x3ff
yuv2yuvX_9_upper:   times 8 dw 0x1ff
pd_4:          times 4 dd 4
pd_4min0x40000:times 4 dd 4 - (0x40000)
pw_16:         times 8 dw 16
pw_32:         times 8 dw 32
pd_255:        times 8 dd 255
pw_512:        times 8 dw 512
pw_1024:       times 8 dw 1024
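; Note on the constants above (an informal reading of the code that uses
; them): each yuv2yuvX_*_start value effectively holds the round-to-nearest
; addend for the final right shift of the planeX accumulator; the 16-bit
; start value (and pd_4min0x40000 in the plane1 path) additionally folds in
; a -0x8000-per-word bias so that signed packing (packssdw) can be used on
; unsigned results, which the later paddw [minshort] undoes.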

yuv2nv12_shuffle_mask: times 2 db 0,  4,  8, 12, \
                                 -1, -1, -1, -1, \
                                 -1, -1, -1, -1, \
                                 -1, -1, -1, -1
yuv2nv21_shuffle_mask: times 2 db 4,  0, 12,  8, \
                                 -1, -1, -1, -1, \
                                 -1, -1, -1, -1, \
                                 -1, -1, -1, -1
yuv2nv12_permute_mask: dd 0, 4, 1, 2, 3, 5, 6, 7

SECTION .text

;-----------------------------------------------------------------------------
; vertical line scaling
;
; void yuv2plane1_<output_size>_<opt>(const int16_t *src, uint8_t *dst, int dstW,
;                                     const uint8_t *dither, int offset)
; and
; void yuv2planeX_<output_size>_<opt>(const int16_t *filter, int filterSize,
;                                     const int16_t **src, uint8_t *dst, int dstW,
;                                     const uint8_t *dither, int offset)
;
; Scale one or $filterSize lines of source data to generate one line of output
; data. The input is 15 bits in int16_t if $output_size is [8,10] and 19 bits in
; int32_t if $output_size is 16. $filter is 12 bits. $filterSize is a multiple
; of 2. $offset is either 0 or 3. $dither holds 8 values.
;-----------------------------------------------------------------------------
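; As a rough scalar model of the yuv2planeX kernels below (an illustrative
; sketch of the 8-bit case, mirroring the C reference in libswscale/output.c;
; the SIMD code processes several pixels per iteration):
;
;   for (i = 0; i < dstW; i++) {
;       int val = dither[(i + offset) & 7] << 12;
;       for (j = 0; j < filterSize; j++)
;           val += src[j][i] * filter[j];
;       dst[i] = av_clip_uint8(val >> 19);
;   }
;
; The 9/10/16-bit variants start from the yuv2yuvX_*_start constants instead
; of the dither term and use a correspondingly smaller shift and clip range.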
%macro yuv2planeX_mainloop 2
.pixelloop_%2:
%assign %%i 0
    ; the rep here is for the 8-bit output MMX case, where dither covers
    ; 8 pixels but we can only handle 2 pixels per register, and thus 4
    ; pixels per iteration. In order to not have to keep track of where
    ; we are w.r.t. dithering, we unroll the MMX/8-bit loop x2.
%if %1 == 8
%assign %%repcnt 16/mmsize
%else
%assign %%repcnt 1
%endif

%rep %%repcnt

%if %1 == 8
%if ARCH_X86_32
    mova            m2, [rsp+mmsize*(0+%%i)]
    mova            m1, [rsp+mmsize*(1+%%i)]
%else ; x86-64
    mova            m2,  m8
    mova            m1,  m_dith
%endif ; x86-32/64
%else ; %1 == 9/10/16
    mova            m1, [yuv2yuvX_%1_start]
    mova            m2,  m1
%endif ; %1 == 8/9/10/16
    movsx     cntr_reg,  fltsizem
.filterloop_%2_ %+ %%i:
    ; input pixels
    mov             r6, [srcq+gprsize*cntr_reg-2*gprsize]
%if %1 == 16
    mova            m3, [r6+r5*4]
    mova            m5, [r6+r5*4+mmsize]
%else ; %1 == 8/9/10
    mova            m3, [r6+r5*2]
%endif ; %1 == 8/9/10/16
    mov             r6, [srcq+gprsize*cntr_reg-gprsize]
%if %1 == 16
    mova            m4, [r6+r5*4]
    mova            m6, [r6+r5*4+mmsize]
%else ; %1 == 8/9/10
    mova            m4, [r6+r5*2]
%endif ; %1 == 8/9/10/16

    ; coefficients
    movd            m0, [filterq+2*cntr_reg-4] ; coeff[0], coeff[1]
%if %1 == 16
    pshuflw         m7,  m0,  0          ; coeff[0]
    pshuflw         m0,  m0,  0x55       ; coeff[1]
    pmovsxwd        m7,  m7              ; word -> dword
    pmovsxwd        m0,  m0              ; word -> dword

    pmulld          m3,  m7
    pmulld          m5,  m7
    pmulld          m4,  m0
    pmulld          m6,  m0

    paddd           m2,  m3
    paddd           m1,  m5
    paddd           m2,  m4
    paddd           m1,  m6
%else ; %1 == 10/9/8
    punpcklwd       m5,  m3,  m4
    punpckhwd       m3,  m4
    SPLATD          m0

    pmaddwd         m5,  m0
    pmaddwd         m3,  m0

    paddd           m2,  m5
    paddd           m1,  m3
%endif ; %1 == 8/9/10/16

    sub       cntr_reg,  2
    jg .filterloop_%2_ %+ %%i

%if %1 == 16
    psrad           m2,  31 - %1
    psrad           m1,  31 - %1
%else ; %1 == 10/9/8
    psrad           m2,  27 - %1
    psrad           m1,  27 - %1
%endif ; %1 == 8/9/10/16

%if %1 == 8
    packssdw        m2,  m1
    packuswb        m2,  m2
    movh   [dstq+r5*1],  m2
%else ; %1 == 9/10/16
%if %1 == 16
    packssdw        m2,  m1
    paddw           m2, [minshort]
%else ; %1 == 9/10
%if cpuflag(sse4)
    packusdw        m2,  m1
%else ; mmxext/sse2
    packssdw        m2,  m1
    pmaxsw          m2,  m6
%endif ; mmxext/sse2/sse4/avx
    pminsw          m2, [yuv2yuvX_%1_upper]
%endif ; %1 == 9/10/16
    mov%2   [dstq+r5*2],  m2
%endif ; %1 == 8/9/10/16

    add             r5,  mmsize/2
    sub             wd,  mmsize/2

%assign %%i %%i+2
%endrep
    jg .pixelloop_%2
%endmacro

%macro yuv2planeX_fn 3

%if ARCH_X86_32
%define cntr_reg fltsizeq
%define movsx mov
%else
%define cntr_reg r7
%define movsx movsxd
%endif

cglobal yuv2planeX_%1, %3, 8, %2, filter, fltsize, src, dst, w, dither, offset
%if %1 == 8 || %1 == 9 || %1 == 10
    pxor            m6,  m6
%endif ; %1 == 8/9/10

%if %1 == 8
%if ARCH_X86_32
%assign pad 0x2c - (stack_offset & 15)
    SUB             rsp, pad
%define m_dith m7
%else ; x86-64
%define m_dith m9
%endif ; x86-32

    ; create registers holding dither
    movq        m_dith, [ditherq]        ; dither
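    ; offset is either 0 or 3 (see the header comment); when it is non-zero,
    ; rotate the 8 dither bytes by 3 positions before widening them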
    test        offsetd, offsetd
    jz              .no_rot
%if mmsize == 16
    punpcklqdq  m_dith,  m_dith
%endif ; mmsize == 16
    PALIGNR     m_dith,  m_dith,  3,  m0
.no_rot:
%if mmsize == 16
    punpcklbw   m_dith,  m6
%if ARCH_X86_64
    punpcklwd       m8,  m_dith,  m6
    pslld           m8,  12
%else ; x86-32
    punpcklwd       m5,  m_dith,  m6
    pslld           m5,  12
%endif ; x86-32/64
    punpckhwd   m_dith,  m6
    pslld       m_dith,  12
%if ARCH_X86_32
    mova      [rsp+ 0],  m5
    mova      [rsp+16],  m_dith
%endif
%else ; mmsize == 8
    punpcklbw       m5,  m_dith,  m6
    punpckhbw   m_dith,  m6
    punpcklwd       m4,  m5,  m6
    punpckhwd       m5,  m6
    punpcklwd       m3,  m_dith,  m6
    punpckhwd   m_dith,  m6
    pslld           m4,  12
    pslld           m5,  12
    pslld           m3,  12
    pslld       m_dith,  12
    mova      [rsp+ 0],  m4
    mova      [rsp+ 8],  m5
    mova      [rsp+16],  m3
    mova      [rsp+24],  m_dith
%endif ; mmsize == 8/16
%endif ; %1 == 8

    xor             r5,  r5

%if mmsize == 8 || %1 == 8
    yuv2planeX_mainloop %1, a
%else ; mmsize == 16
    test          dstq, 15
    jnz .unaligned
    yuv2planeX_mainloop %1, a
    REP_RET
.unaligned:
    yuv2planeX_mainloop %1, u
%endif ; mmsize == 8/16

%if %1 == 8
%if ARCH_X86_32
    ADD             rsp, pad
    RET
%else ; x86-64
    REP_RET
%endif ; x86-32/64
%else ; %1 == 9/10/16
    REP_RET
%endif ; %1 == 8/9/10/16
%endmacro

%if ARCH_X86_32
INIT_MMX mmxext
yuv2planeX_fn  8,  0, 7
yuv2planeX_fn  9,  0, 5
yuv2planeX_fn 10,  0, 5
%endif

INIT_XMM sse2
yuv2planeX_fn  8, 10, 7
yuv2planeX_fn  9,  7, 5
yuv2planeX_fn 10,  7, 5

INIT_XMM sse4
yuv2planeX_fn  8, 10, 7
yuv2planeX_fn  9,  7, 5
yuv2planeX_fn 10,  7, 5
yuv2planeX_fn 16,  8, 5

%if HAVE_AVX_EXTERNAL
INIT_XMM avx
yuv2planeX_fn  8, 10, 7
yuv2planeX_fn  9,  7, 5
yuv2planeX_fn 10,  7, 5
%endif

; %1=output-bpc, %2=alignment (u/a)
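; Roughly, per output sample (using the constants set up by yuv2plane1_fn
; below; an informal reading of the code rather than a separate spec):
;   %1 == 8:    dst[i] = av_clip_uint8((src[i] + dither) >> 7)
;   %1 == 9/10: dst[i] = clip((src[i] + pw_32/pw_16) >> (15 - %1), 0, pw_512/pw_1024)
;   %1 == 16:   dst[i] = clip_uint16((src[i] + 4) >> 3), using the minshort
;               bias when packusdw is unavailable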
%macro yuv2plane1_mainloop 2
.loop_%2:
%if %1 == 8
    paddsw          m0, m2, [srcq+wq*2+mmsize*0]
    paddsw          m1, m3, [srcq+wq*2+mmsize*1]
    psraw           m0, 7
    psraw           m1, 7
    packuswb        m0, m1
    mov%2    [dstq+wq], m0
%elif %1 == 16
    paddd           m0, m4, [srcq+wq*4+mmsize*0]
    paddd           m1, m4, [srcq+wq*4+mmsize*1]
    paddd           m2, m4, [srcq+wq*4+mmsize*2]
    paddd           m3, m4, [srcq+wq*4+mmsize*3]
    psrad           m0, 3
    psrad           m1, 3
    psrad           m2, 3
    psrad           m3, 3
%if cpuflag(sse4) ; avx/sse4
    packusdw        m0, m1
    packusdw        m2, m3
%else ; mmx/sse2
    packssdw        m0, m1
    packssdw        m2, m3
    paddw           m0, m5
    paddw           m2, m5
%endif ; mmx/sse2/sse4/avx
    mov%2    [dstq+wq*2+mmsize*0], m0
    mov%2    [dstq+wq*2+mmsize*1], m2
%else ; %1 == 9/10
    paddsw          m0, m2, [srcq+wq*2+mmsize*0]
    paddsw          m1, m2, [srcq+wq*2+mmsize*1]
    psraw           m0, 15 - %1
    psraw           m1, 15 - %1
    pmaxsw          m0, m4
    pmaxsw          m1, m4
    pminsw          m0, m3
    pminsw          m1, m3
    mov%2    [dstq+wq*2+mmsize*0], m0
    mov%2    [dstq+wq*2+mmsize*1], m1
%endif
    add             wq, mmsize
    jl .loop_%2
%endmacro

%macro yuv2plane1_fn 3
cglobal yuv2plane1_%1, %3, %3, %2, src, dst, w, dither, offset
    movsxdifnidn    wq, wd
    add             wq, mmsize - 1
    and             wq, ~(mmsize - 1)
%if %1 == 8
    add           dstq, wq
%else ; %1 != 8
    lea           dstq, [dstq+wq*2]
%endif ; %1 == 8
%if %1 == 16
    lea           srcq, [srcq+wq*4]
%else ; %1 != 16
    lea           srcq, [srcq+wq*2]
%endif ; %1 == 16
    neg             wq

%if %1 == 8
    pxor            m4, m4               ; zero

    ; create registers holding dither
    movq            m3, [ditherq]        ; dither
    test       offsetd, offsetd
    jz              .no_rot
%if mmsize == 16
    punpcklqdq      m3, m3
%endif ; mmsize == 16
    PALIGNR         m3, m3, 3, m2
.no_rot:
%if mmsize == 8
    mova            m2, m3
    punpckhbw       m3, m4               ; byte->word
    punpcklbw       m2, m4               ; byte->word
%else
    punpcklbw       m3, m4
    mova            m2, m3
%endif
%elif %1 == 9
    pxor            m4, m4
    mova            m3, [pw_512]
    mova            m2, [pw_32]
%elif %1 == 10
    pxor            m4, m4
    mova            m3, [pw_1024]
    mova            m2, [pw_16]
%else ; %1 == 16
%if cpuflag(sse4) ; sse4/avx
    mova            m4, [pd_4]
%else ; mmx/sse2
    mova            m4, [pd_4min0x40000]
    mova            m5, [minshort]
%endif ; mmx/sse2/sse4/avx
%endif ; %1 == ..

    ; actual pixel scaling
%if mmsize == 8
    yuv2plane1_mainloop %1, a
%else ; mmsize == 16
    test          dstq, 15
    jnz .unaligned
    yuv2plane1_mainloop %1, a
    REP_RET
.unaligned:
    yuv2plane1_mainloop %1, u
%endif ; mmsize == 8/16
    REP_RET
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
yuv2plane1_fn  8, 0, 5
yuv2plane1_fn 16, 0, 3

INIT_MMX mmxext
yuv2plane1_fn  9, 0, 3
yuv2plane1_fn 10, 0, 3
%endif

INIT_XMM sse2
yuv2plane1_fn  8, 5, 5
yuv2plane1_fn  9, 5, 3
yuv2plane1_fn 10, 5, 3
yuv2plane1_fn 16, 6, 3

INIT_XMM sse4
yuv2plane1_fn 16, 5, 3

%if HAVE_AVX_EXTERNAL
INIT_XMM avx
yuv2plane1_fn  8, 5, 5
yuv2plane1_fn  9, 5, 3
yuv2plane1_fn 10, 5, 3
yuv2plane1_fn 16, 5, 3
%endif

%undef movsx

;-----------------------------------------------------------------------------
; AVX2 yuv2nv12cX implementation
;
; void ff_yuv2nv12cX_avx2(enum AVPixelFormat format, const uint8_t *dither,
;                         const int16_t *filter, int filterSize,
;                         const int16_t **u, const int16_t **v,
;                         uint8_t *dst, int dstWidth)
;
; void ff_yuv2nv21cX_avx2(enum AVPixelFormat format, const uint8_t *dither,
;                         const int16_t *filter, int filterSize,
;                         const int16_t **u, const int16_t **v,
;                         uint8_t *dst, int dstWidth)
;-----------------------------------------------------------------------------
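; Roughly, per interleaved chroma pixel pair (an illustrative scalar sketch of
; the generic C NV12/NV21 chroma path; uSrc/vSrc stand for the u/v source-line
; arrays and the exact dither indexing is omitted here):
;
;   int u = dither[...] << 12, v = dither[...] << 12;
;   for (j = 0; j < filterSize; j++) {
;       u += uSrc[j][i] * filter[j];
;       v += vSrc[j][i] * filter[j];
;   }
;   dst[2 * i]     = av_clip_uint8(u >> 19);   // NV21 stores V first
;   dst[2 * i + 1] = av_clip_uint8(v >> 19);
;
; The AVX2 loop below computes 8 such U/V pairs per outer iteration.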

%if ARCH_X86_64
%macro yuv2nv12cX_fn 1
cglobal %1cX, 8, 11, 13, tmp1, dither, filter, filterSize, u, v, dst, dstWidth

    mov tmp1q, qword [ditherq]
    movq xm0, tmp1q
    ror tmp1q, 24
    movq xm1, tmp1q

    pmovzxbd m0, xm0
    pslld m0, m0, 12                        ; ditherLo
    pmovzxbd m1, xm1
    pslld m1, m1, 12                        ; ditherHi

    pxor m9, m9                             ; uint8_min dwords
    mova m10, [pd_255]                      ; uint8_max dwords
    mova m11, [%1_shuffle_mask]             ; shuffle_mask
    mova m12, [yuv2nv12_permute_mask]       ; permute mask

    DEFINE_ARGS tmp1, tmp2, filter, filterSize, u, v, dst, dstWidth

    xor r8q, r8q

nv12_outer_%1:
    mova m2, m0                             ; resultLo
    mova m3, m1                             ; resultHi
    xor r9q, r9q

nv12_inner_%1:
    movsx r10d, word [filterq + (2 * r9q)]
    movd xm4, r10d
    vpbroadcastd m4, xm4                    ; filter

    mov tmp1q, [uq + (gprsize * r9q)]
    mova xm7, oword [tmp1q + 2 * r8q]

    mov tmp2q, [vq + (gprsize * r9q)]
    mova xm8, oword [tmp2q + 2 * r8q]

    punpcklwd xm5, xm7, xm8
    pmovsxwd m5, xm5                        ; multiplicandsLo
    punpckhwd xm6, xm7, xm8
    pmovsxwd m6, xm6                        ; multiplicandsHi

    pmulld m7, m5, m4                       ; mulResultLo
    pmulld m8, m6, m4                       ; mulResultHi
    paddd m2, m2, m7                        ; resultLo += mulResultLo
    paddd m3, m3, m8                        ; resultHi += mulResultHi

    inc r9d
    cmp r9d, filterSized
    jl nv12_inner_%1
    ; end of inner loop

    psrad m2, m2, 19
    psrad m3, m3, 19

    ; Vectorized av_clip_uint8
    pmaxsd m2, m2, m9
    pmaxsd m3, m3, m9
    pminsd m2, m2, m10
    pminsd m3, m3, m10

    ; At this point we have clamped uint8s arranged in this order:
    ;     m2: u1  0  0  0  v1  0  0  0  [...]
    ;     m3: u5  0  0  0  v5  0  0  0  [...]
    ;
    ; First, we shuffle the bytes to make the bytes semi-contiguous.
    ; pshufb can't move bytes across 128-bit lanes, so we'll end up with:
    ;     m2: u1  v1  u2  v2  0  0  0  0  0  0  0  0  u3  v3  u4  v4
    ;     m3: u5  v5  u6  v6  0  0  0  0  0  0  0  0  u7  v7  u8  v8
    pshufb m2, m2, m11
    pshufb m3, m3, m11

    ; To fix the cross-lane shuffling issue, we'll then use cross-lane
    ; permutation to combine the two segments
    vpermd m2, m12, m2
    vpermd m3, m12, m3

    ; Now we have the final results in the lower 8 bytes of each register
    movq [dstq], xm2
    movq [dstq + 8], xm3

    add r8d, 8
    add dstq, 16

    cmp r8d, dstWidthd
    jl nv12_outer_%1
    RET
%endmacro

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
yuv2nv12cX_fn yuv2nv12
yuv2nv12cX_fn yuv2nv21
%endif
%endif ; ARCH_X86_64
