;******************************************************************************
;* x86-optimized vertical line scaling functions
;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
;*                    Kieran Kunhya <kieran@kunhya.com>
;*           (c) 2020 Nelson Gomez <nelson.gomez@microsoft.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32

minshort:      times 8 dw 0x8000
yuv2yuvX_16_start:  times 4 dd 0x4000 - 0x40000000
yuv2yuvX_10_start:  times 4 dd 0x10000
yuv2yuvX_9_start:   times 4 dd 0x20000
yuv2yuvX_10_upper:  times 8 dw 0x3ff
yuv2yuvX_9_upper:   times 8 dw 0x1ff
pd_4:          times 4 dd 4
pd_4min0x40000:times 4 dd 4 - (0x40000)
pw_16:         times 8 dw 16
pw_32:         times 8 dw 32
pd_255:        times 8 dd 255
pw_512:        times 8 dw 512
pw_1024:       times 8 dw 1024
pd_65535_invf:             times 8 dd 0x37800080 ;1.0/65535.0
pd_yuv2gbrp16_start:       times 8 dd -0x40000000
pd_yuv2gbrp_y_start:       times 8 dd  (1 << 9)
pd_yuv2gbrp_uv_start:      times 8 dd  ((1 << 9) - (128 << 19))
pd_yuv2gbrp_a_start:       times 8 dd  (1 << 18)
pd_yuv2gbrp16_offset:      times 8 dd  0x10000  ;(1 << 16)
pd_yuv2gbrp16_round13:     times 8 dd  0xE0002000  ;(1 << 13) - (1 << 29)
pd_yuv2gbrp16_a_offset:    times 8 dd  0x20002000
pd_yuv2gbrp16_upper30:     times 8 dd  0x3FFFFFFF ;(1<<30) - 1
pd_yuv2gbrp16_upper27:     times 8 dd  0x07FFFFFF ;(1<<27) - 1
pd_yuv2gbrp16_upper16:     times 8 dd  0x0000FFFF ;(1<<16) - 1
pd_yuv2gbrp16_upperC:      times 8 dd  0xC0000000
pd_yuv2gbrp_debias:        times 8 dd  0x00008000 ;(1 << 29 - 14)
pb_pack_shuffle8:       db  0,  4,  8, 12, \
                           -1, -1, -1, -1, \
                           -1, -1, -1, -1, \
                           -1, -1, -1, -1, \
                           -1, -1, -1, -1, \
                            0,  4,  8, 12, \
                           -1, -1, -1, -1, \
                           -1, -1, -1, -1
pb_pack_shuffle16le:    db  0,  1,  4,  5, \
                            8,  9, 12, 13, \
                           -1, -1, -1, -1, \
                           -1, -1, -1, -1, \
                           -1, -1, -1, -1, \
                           -1, -1, -1, -1, \
                            0,  1,  4,  5, \
                            8,  9, 12, 13
pb_pack_shuffle16be:    db  1,  0,  5,  4, \
                            9,  8, 13, 12, \
                           -1, -1, -1, -1, \
                           -1, -1, -1, -1, \
                           -1, -1, -1, -1, \
                           -1, -1, -1, -1, \
                            1,  0,  5,  4, \
                            9,  8, 13, 12
pb_shuffle32be:         db  3,  2,  1,  0, \
                            7,  6,  5,  4, \
                           11, 10,  9,  8, \
                           15, 14, 13, 12, \
                            3,  2,  1,  0, \
                            7,  6,  5,  4, \
                           11, 10,  9,  8, \
                           15, 14, 13, 12
yuv2nv12_shuffle_mask: times 2 db 0,  4,  8, 12, \
                                 -1, -1, -1, -1, \
                                 -1, -1, -1, -1, \
                                 -1, -1, -1, -1
yuv2nv21_shuffle_mask: times 2 db 4,  0, 12,  8, \
                                 -1, -1, -1, -1, \
                                 -1, -1, -1, -1, \
                                 -1, -1, -1, -1
yuv2nv12_permute_mask: dd 0, 4, 1, 2, 3, 5, 6, 7

SECTION .text

;-----------------------------------------------------------------------------
; vertical line scaling
;
; void yuv2plane1_<output_size>_<opt>(const int16_t *src, uint8_t *dst, int dstW,
;                                     const uint8_t *dither, int offset)
; and
; void yuv2planeX_<output_size>_<opt>(const int16_t *filter, int filterSize,
;                                     const int16_t **src, uint8_t *dst, int dstW,
;                                     const uint8_t *dither, int offset)
;
; Scale one or $filterSize lines of source data to generate one line of output
; data. The input is 15 bits in int16_t if $output_size is [8,10] and 19 bits in
; int32_t if $output_size is 16. $filter is 12 bits. $filterSize is a multiple
; of 2. $offset is either 0 or 3. $dither holds 8 values.
;-----------------------------------------------------------------------------
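;
; Rough scalar sketch of the 8-bit paths (compare the C reference
; implementations in libavfilter-free libswscale/output.c):
;     yuv2plane1: dst[i] = av_clip_uint8((src[i] + dither[(i + offset) & 7]) >> 7)
;     yuv2planeX: val = dither[(i + offset) & 7] << 12;
;                 for (j = 0; j < filterSize; j++)
;                     val += src[j][i] * filter[j];
;                 dst[i] = av_clip_uint8(val >> 19);
; The 9/10/16-bit variants use different start biases, shifts and clip ranges,
; but follow the same accumulate, shift and clip pattern.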
%macro yuv2planeX_mainloop 2
.pixelloop_%2:
%assign %%i 0
    ; the rep here is for the 8-bit output MMX case, where dither covers
    ; 8 pixels but we can only handle 2 pixels per register, and thus 4
    ; pixels per iteration. In order to not have to keep track of where
    ; we are w.r.t. dithering, we unroll the MMX/8-bit loop x2.
%if %1 == 8
%assign %%repcnt 16/mmsize
%else
%assign %%repcnt 1
%endif

%rep %%repcnt

%if %1 == 8
%if ARCH_X86_32
    mova            m2, [rsp+mmsize*(0+%%i)]
    mova            m1, [rsp+mmsize*(1+%%i)]
%else ; x86-64
    mova            m2,  m8
    mova            m1,  m_dith
%endif ; x86-32/64
%else ; %1 == 9/10/16
    mova            m1, [yuv2yuvX_%1_start]
    mova            m2,  m1
%endif ; %1 == 8/9/10/16
    movsx     cntr_reg,  fltsizem
.filterloop_%2_ %+ %%i:
    ; input pixels
    mov             r6, [srcq+gprsize*cntr_reg-2*gprsize]
%if %1 == 16
    mova            m3, [r6+r5*4]
    mova            m5, [r6+r5*4+mmsize]
%else ; %1 == 8/9/10
    mova            m3, [r6+r5*2]
%endif ; %1 == 8/9/10/16
    mov             r6, [srcq+gprsize*cntr_reg-gprsize]
%if %1 == 16
    mova            m4, [r6+r5*4]
    mova            m6, [r6+r5*4+mmsize]
%else ; %1 == 8/9/10
    mova            m4, [r6+r5*2]
%endif ; %1 == 8/9/10/16

    ; coefficients
    movd            m0, [filterq+2*cntr_reg-4] ; coeff[0], coeff[1]
%if %1 == 16
    pshuflw         m7,  m0,  0          ; coeff[0]
    pshuflw         m0,  m0,  0x55       ; coeff[1]
    pmovsxwd        m7,  m7              ; word -> dword
    pmovsxwd        m0,  m0              ; word -> dword

    pmulld          m3,  m7
    pmulld          m5,  m7
    pmulld          m4,  m0
    pmulld          m6,  m0

    paddd           m2,  m3
    paddd           m1,  m5
    paddd           m2,  m4
    paddd           m1,  m6
%else ; %1 == 10/9/8
    punpcklwd       m5,  m3,  m4
    punpckhwd       m3,  m4
    SPLATD          m0

    pmaddwd         m5,  m0
    pmaddwd         m3,  m0

    paddd           m2,  m5
    paddd           m1,  m3
%endif ; %1 == 8/9/10/16

    sub       cntr_reg,  2
    jg .filterloop_%2_ %+ %%i

%if %1 == 16
    psrad           m2,  31 - %1
    psrad           m1,  31 - %1
%else ; %1 == 10/9/8
    psrad           m2,  27 - %1
    psrad           m1,  27 - %1
%endif ; %1 == 8/9/10/16

%if %1 == 8
    packssdw        m2,  m1
    packuswb        m2,  m2
    movh   [dstq+r5*1],  m2
%else ; %1 == 9/10/16
%if %1 == 16
    packssdw        m2,  m1
    paddw           m2, [minshort]
%else ; %1 == 9/10
%if cpuflag(sse4)
    packusdw        m2,  m1
%else ; mmxext/sse2
    packssdw        m2,  m1
    pmaxsw          m2,  m6
%endif ; mmxext/sse2/sse4/avx
    pminsw          m2, [yuv2yuvX_%1_upper]
%endif ; %1 == 9/10/16
    mov%2   [dstq+r5*2],  m2
%endif ; %1 == 8/9/10/16

    add             r5,  mmsize/2
    sub             wd,  mmsize/2

%assign %%i %%i+2
%endrep
    jg .pixelloop_%2
%endmacro

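; yuv2planeX_fn: %1 = output bit depth (8, 9, 10 or 16),
;                %2 = number of vector registers declared to cglobal,
;                %3 = number of arguments cglobal loads into registers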
%macro yuv2planeX_fn 3

%if ARCH_X86_32
%define cntr_reg fltsizeq
%define movsx mov
%else
%define cntr_reg r7
%define movsx movsxd
%endif

cglobal yuv2planeX_%1, %3, 8, %2, filter, fltsize, src, dst, w, dither, offset
%if %1 == 8 || %1 == 9 || %1 == 10
    pxor            m6,  m6
%endif ; %1 == 8/9/10

%if %1 == 8
%if ARCH_X86_32
%assign pad 0x2c - (stack_offset & 15)
    SUB             rsp, pad
%define m_dith m7
%else ; x86-64
%define m_dith m9
%endif ; x86-32

    ; create registers holding dither
    movq        m_dith, [ditherq]        ; dither
    test        offsetd, offsetd
    jz              .no_rot
%if mmsize == 16
    punpcklqdq  m_dith,  m_dith
%endif ; mmsize == 16
    PALIGNR     m_dith,  m_dith,  3,  m0
.no_rot:
%if mmsize == 16
    punpcklbw   m_dith,  m6
%if ARCH_X86_64
    punpcklwd       m8,  m_dith,  m6
    pslld           m8,  12
%else ; x86-32
    punpcklwd       m5,  m_dith,  m6
    pslld           m5,  12
%endif ; x86-32/64
    punpckhwd   m_dith,  m6
    pslld       m_dith,  12
%if ARCH_X86_32
    mova      [rsp+ 0],  m5
    mova      [rsp+16],  m_dith
%endif
%else ; mmsize == 8
    punpcklbw       m5,  m_dith,  m6
    punpckhbw   m_dith,  m6
    punpcklwd       m4,  m5,  m6
    punpckhwd       m5,  m6
    punpcklwd       m3,  m_dith,  m6
    punpckhwd   m_dith,  m6
    pslld           m4,  12
    pslld           m5,  12
    pslld           m3,  12
    pslld       m_dith,  12
    mova      [rsp+ 0],  m4
    mova      [rsp+ 8],  m5
    mova      [rsp+16],  m3
    mova      [rsp+24],  m_dith
%endif ; mmsize == 8/16
%endif ; %1 == 8

    xor             r5,  r5

%if mmsize == 8 || %1 == 8
    yuv2planeX_mainloop %1, a
%else ; mmsize == 16
    test          dstq, 15
    jnz .unaligned
    yuv2planeX_mainloop %1, a
    REP_RET
.unaligned:
    yuv2planeX_mainloop %1, u
%endif ; mmsize == 8/16

%if %1 == 8
%if ARCH_X86_32
    ADD             rsp, pad
    RET
%else ; x86-64
    REP_RET
%endif ; x86-32/64
%else ; %1 == 9/10/16
    REP_RET
%endif ; %1 == 8/9/10/16
%endmacro

%if ARCH_X86_32 && HAVE_ALIGNED_STACK == 0
INIT_MMX mmxext
yuv2planeX_fn  8,  0, 7
%endif

INIT_XMM sse2
yuv2planeX_fn  8, 10, 7
yuv2planeX_fn  9,  7, 5
yuv2planeX_fn 10,  7, 5

INIT_XMM sse4
yuv2planeX_fn  8, 10, 7
yuv2planeX_fn  9,  7, 5
yuv2planeX_fn 10,  7, 5
yuv2planeX_fn 16,  8, 5

%if HAVE_AVX_EXTERNAL
INIT_XMM avx
yuv2planeX_fn  8, 10, 7
yuv2planeX_fn  9,  7, 5
yuv2planeX_fn 10,  7, 5
%endif

; %1=output-bpc, %2=alignment (u/a)
%macro yuv2plane1_mainloop 2
.loop_%2:
%if %1 == 8
    paddsw          m0, m2, [srcq+wq*2+mmsize*0]
    paddsw          m1, m3, [srcq+wq*2+mmsize*1]
    psraw           m0, 7
    psraw           m1, 7
    packuswb        m0, m1
    mov%2    [dstq+wq], m0
%elif %1 == 16
    paddd           m0, m4, [srcq+wq*4+mmsize*0]
    paddd           m1, m4, [srcq+wq*4+mmsize*1]
    paddd           m2, m4, [srcq+wq*4+mmsize*2]
    paddd           m3, m4, [srcq+wq*4+mmsize*3]
    psrad           m0, 3
    psrad           m1, 3
    psrad           m2, 3
    psrad           m3, 3
%if cpuflag(sse4) ; avx/sse4
    packusdw        m0, m1
    packusdw        m2, m3
%else ; mmx/sse2
    packssdw        m0, m1
    packssdw        m2, m3
    paddw           m0, m5
    paddw           m2, m5
%endif ; mmx/sse2/sse4/avx
    mov%2    [dstq+wq*2+mmsize*0], m0
    mov%2    [dstq+wq*2+mmsize*1], m2
%else ; %1 == 9/10
    paddsw          m0, m2, [srcq+wq*2+mmsize*0]
    paddsw          m1, m2, [srcq+wq*2+mmsize*1]
    psraw           m0, 15 - %1
    psraw           m1, 15 - %1
    pmaxsw          m0, m4
    pmaxsw          m1, m4
    pminsw          m0, m3
    pminsw          m1, m3
    mov%2    [dstq+wq*2+mmsize*0], m0
    mov%2    [dstq+wq*2+mmsize*1], m1
%endif
    add             wq, mmsize
    jl .loop_%2
%endmacro

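; yuv2plane1_fn: %1 = output bit depth (8, 9, 10 or 16),
;                %2 = number of vector registers declared to cglobal,
;                %3 = number of arguments (and GPRs) declared to cglobal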
%macro yuv2plane1_fn 3
cglobal yuv2plane1_%1, %3, %3, %2, src, dst, w, dither, offset
    movsxdifnidn    wq, wd
    add             wq, mmsize - 1
    and             wq, ~(mmsize - 1)
%if %1 == 8
    add           dstq, wq
%else ; %1 != 8
    lea           dstq, [dstq+wq*2]
%endif ; %1 == 8
%if %1 == 16
    lea           srcq, [srcq+wq*4]
%else ; %1 != 16
    lea           srcq, [srcq+wq*2]
%endif ; %1 == 16
    neg             wq

%if %1 == 8
    pxor            m4, m4               ; zero

    ; create registers holding dither
    movq            m3, [ditherq]        ; dither
    test       offsetd, offsetd
    jz              .no_rot
    punpcklqdq      m3, m3
    PALIGNR         m3, m3, 3, m2
.no_rot:
    punpcklbw       m3, m4
    mova            m2, m3
%elif %1 == 9
    pxor            m4, m4
    mova            m3, [pw_512]
    mova            m2, [pw_32]
%elif %1 == 10
    pxor            m4, m4
    mova            m3, [pw_1024]
    mova            m2, [pw_16]
%else ; %1 == 16
%if cpuflag(sse4) ; sse4/avx
    mova            m4, [pd_4]
%else ; sse2
    mova            m4, [pd_4min0x40000]
    mova            m5, [minshort]
%endif ; sse2/sse4/avx
%endif ; %1 == ..

    ; actual pixel scaling
    test          dstq, 15
    jnz .unaligned
    yuv2plane1_mainloop %1, a
    REP_RET
.unaligned:
    yuv2plane1_mainloop %1, u
    REP_RET
%endmacro

INIT_XMM sse2
yuv2plane1_fn  8, 5, 5
yuv2plane1_fn  9, 5, 3
yuv2plane1_fn 10, 5, 3
yuv2plane1_fn 16, 6, 3

INIT_XMM sse4
yuv2plane1_fn 16, 5, 3

%if HAVE_AVX_EXTERNAL
INIT_XMM avx
yuv2plane1_fn  8, 5, 5
yuv2plane1_fn  9, 5, 3
yuv2plane1_fn 10, 5, 3
yuv2plane1_fn 16, 5, 3
%endif

%undef movsx

;-----------------------------------------------------------------------------
; AVX2 yuv2nv12cX implementation
;
; void ff_yuv2nv12cX_avx2(enum AVPixelFormat format, const uint8_t *dither,
;                         const int16_t *filter, int filterSize,
;                         const int16_t **u, const int16_t **v,
;                         uint8_t *dst, int dstWidth)
;
; void ff_yuv2nv21cX_avx2(enum AVPixelFormat format, const uint8_t *dither,
;                         const int16_t *filter, int filterSize,
;                         const int16_t **u, const int16_t **v,
;                         uint8_t *dst, int dstWidth)
;-----------------------------------------------------------------------------
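;
; Rough scalar sketch of the computation below (the mapping of the eight
; dither bytes onto output positions is handled by the ror/shuffles):
;     u = dither[...] << 12;
;     v = dither[...] << 12;
;     for (j = 0; j < filterSize; j++) {
;         u += usrc[j][i] * filter[j];
;         v += vsrc[j][i] * filter[j];
;     }
;     dst[2*i + 0] = av_clip_uint8(u >> 19);
;     dst[2*i + 1] = av_clip_uint8(v >> 19);    ; U and V swap places for NV21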

%if ARCH_X86_64
%macro yuv2nv12cX_fn 1
cglobal %1cX, 8, 11, 13, tmp1, dither, filter, filterSize, u, v, dst, dstWidth

    mov tmp1q, qword [ditherq]
    movq xm0, tmp1q
    ror tmp1q, 24
    movq xm1, tmp1q

    pmovzxbd m0, xm0
    pslld m0, m0, 12                        ; ditherLo
    pmovzxbd m1, xm1
    pslld m1, m1, 12                        ; ditherHi

    pxor m9, m9                             ; uint8_min dwords
    mova m10, [pd_255]                      ; uint8_max dwords
    mova m11, [%1_shuffle_mask]             ; shuffle_mask
    mova m12, [yuv2nv12_permute_mask]       ; permute mask

    DEFINE_ARGS tmp1, tmp2, filter, filterSize, u, v, dst, dstWidth

    xor r8q, r8q

nv12_outer_%1:
    mova m2, m0                             ; resultLo
    mova m3, m1                             ; resultHi
    xor r9q, r9q

nv12_inner_%1:
    movsx r10d, word [filterq + (2 * r9q)]
    movd xm4, r10d
    vpbroadcastd m4, xm4                    ; filter

    mov tmp1q, [uq + (gprsize * r9q)]
    mova xm7, oword [tmp1q + 2 * r8q]

    mov tmp2q, [vq + (gprsize * r9q)]
    mova xm8, oword [tmp2q + 2 * r8q]

    punpcklwd xm5, xm7, xm8
    pmovsxwd m5, xm5                        ; multiplicandsLo
    punpckhwd xm6, xm7, xm8
    pmovsxwd m6, xm6                        ; multiplicandsHi

    pmulld m7, m5, m4                       ; mulResultLo
    pmulld m8, m6, m4                       ; mulResultHi
    paddd m2, m2, m7                        ; resultLo += mulResultLo
    paddd m3, m3, m8                        ; resultHi += mulResultHi

    inc r9d
    cmp r9d, filterSized
    jl nv12_inner_%1
    ; end of inner loop

    psrad m2, m2, 19
    psrad m3, m3, 19

    ; Vectorized av_clip_uint8
    pmaxsd m2, m2, m9
    pmaxsd m3, m3, m9
    pminsd m2, m2, m10
    pminsd m3, m3, m10

    ; At this point we have clamped uint8s arranged in this order:
    ;     m2: u1  0  0  0  v1  0  0  0  [...]
    ;     m3: u5  0  0  0  v5  0  0  0  [...]
    ;
    ; First, we shuffle the bytes to make the bytes semi-contiguous.
    ; AVX2 byte shuffles (pshufb) cannot cross 128-bit lanes, so we'll end up with:
    ;     m2: u1  v1  u2  v2  0  0  0  0  0  0  0  0  u3  v3  u4  v4
    ;     m3: u5  v5  u6  v6  0  0  0  0  0  0  0  0  u7  v7  u8  v8
    pshufb m2, m2, m11
    pshufb m3, m3, m11

    ; To fix the cross-lane shuffling issue, we'll then use cross-lane
    ; permutation to combine the two segments
    vpermd m2, m12, m2
    vpermd m3, m12, m3

    ; Now we have the final results in the lower 8 bytes of each register
    movq [dstq], xm2
    movq [dstq + 8], xm3

    add r8d, 8
    add dstq, 16

    cmp r8d, dstWidthd
    jl nv12_outer_%1
    RET
%endmacro

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
yuv2nv12cX_fn yuv2nv12
yuv2nv12cX_fn yuv2nv21
%endif
%endif ; ARCH_X86_64

;-----------------------------------------------------------------------------
; planar gbr yuv2anyX functions
; void ff_yuv2<gbr_format>_full_X_<opt>(SwsContext *c, const int16_t *lumFilter,
;                                       const int16_t **lumSrcx, int lumFilterSize,
;                                       const int16_t *chrFilter, const int16_t **chrUSrcx,
;                                       const int16_t **chrVSrcx, int chrFilterSize,
;                                       const int16_t **alpSrcx, uint8_t **dest,
;                                       int dstW, int y)
;-----------------------------------------------------------------------------
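;
; Rough per-pixel sketch of the integer path implemented below (the filtered
; Y/U/V sums are first shifted right by YUV_SHIFT; alpha and the float output
; formats are handled separately):
;     Y' = (Y - yuv2rgb_y_offset) * yuv2rgb_y_coeff + rounding;
;     R  = Y' + V * yuv2rgb_v2r_coeff;
;     G  = Y' + V * yuv2rgb_v2g_coeff + U * yuv2rgb_u2g_coeff;
;     B  = Y' + U * yuv2rgb_u2b_coeff;
; each component is then clipped and shifted right by RGB_SHIFT before being
; stored to its plane.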

%if ARCH_X86_64
struc SwsContext
    .padding:           resb 40292 ; offsetof(SwsContext, yuv2rgb_y_offset)
    .yuv2rgb_y_offset:  resd 1
    .yuv2rgb_y_coeff:   resd 1
    .yuv2rgb_v2r_coeff: resd 1
    .yuv2rgb_v2g_coeff: resd 1
    .yuv2rgb_u2g_coeff: resd 1
    .yuv2rgb_u2b_coeff: resd 1
endstruc

%define R m0
%define G m1
%define B m2
%define A m3

%define Y m4
%define U m5
%define V m6

; Clip a signed integer to an unsigned power of two range.
; av_clip_uintp2
; 1 - dest
; 2 - bit position to clip at
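; Matches av_clip_uintp2(): inputs with bits set above bit %2 are replaced by
; ((~a) >> 31) & ((1 << %2) - 1), i.e. 0 for negative values and the maximum
; for positive overflow; in-range values pass through unchanged.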
%macro CLIPP2 2
    ; (~a) >> 31 & ((1<<p) - 1);
    pcmpeqb m4, m4
    pxor m4, %1
    psrad m4, 31
    movu m5, [pd_yuv2gbrp16_upper%2]
    pand m4, m5

    ; (a & ~((1<<p) - 1)) == 0
    pandn m5, %1
    pxor m6, m6
    pcmpeqd m5, m6
%if cpuflag(avx2)
    vpblendvb %1, m4, %1, m5
%else
    pxor %1, m4
    pand %1, m5
    pxor %1, m4
%endif
%endmacro

; 1 - dest
; 2 - source
%macro LOAD16 2
    %if cpuflag(avx2)
        movu xm%1, %2
        vpmovsxwd m%1, xm%1
    %elif cpuflag(sse4)
        movsd m%1, %2
        pmovsxwd m%1, m%1
    %else
        movsd m%1, %2
        punpcklwd m%1, m%1
        psrad m%1, 16 ; sign extend
    %endif
%endmacro

; 1 - dest
; 2 - source
; 3 - depth
%macro LOAD_PIXELS 3
    mov ptrq, [%2 + jq*8]
%if %3 >= 16
    movu m%1, [ptrq + xq*4]
%else
    LOAD16 %1, [ptrq + xq*2]
%endif
%endmacro

; 1 - dest
; 2 - source
%macro STORE8 2
    mov ptrq, %1
    %if mmsize > 16
        pshufb m%2, [pb_pack_shuffle8]
        vextractf128 xm4, m%2, 1
        por xm%2, xm4
        movq [ptrq + xq], xm%2
    %else
        %if cpuflag(sse4)
            pshufb m%2, [pb_pack_shuffle8]
        %else
            psrldq m4, m%2, 3
            por m%2, m4
            psrldq m4, m%2, 6
            por m%2, m4
        %endif
        movd [ptrq + xq], m%2
    %endif
%endmacro

; 1 - dest
; 2 - source
; 3 - is big endian
%macro STORE16 3
    mov ptrq, %1
    %if mmsize > 16
        %if %3 ; bigendian
            pshufb m%2, [pb_pack_shuffle16be]
        %else
            pshufb m%2, [pb_pack_shuffle16le]
        %endif
        vpermq m%2, m%2, (3 << 6 | 0 << 4 | 3 << 2 | 0 << 0)
        movu [ptrq + xq*2], xm%2
    %else
        %if cpuflag(sse4) && %3 ; bigendian
            pshufb m%2, [pb_pack_shuffle16be]
        %elif cpuflag(sse4)
            pshufb m%2, [pb_pack_shuffle16le]
        %else
            pshuflw m%2, m%2, (1 << 6 | 1 << 4 | 2 << 2 | 0 << 0)
            pshufhw m%2, m%2, (1 << 6 | 1 << 4 | 2 << 2 | 0 << 0)
            pshufd  m%2, m%2, (3 << 6 | 3 << 4 | 2 << 2 | 0 << 0)
            %if %3 ; bigendian
                psrlw  m4, m%2, 8
                psllw  m%2, 8
                por m%2, m4
            %endif
        %endif
        movq [ptrq + xq*2], m%2
    %endif
%endmacro

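; 1 - register number
; Byte-swaps each 32-bit lane, used for big-endian 32-bit (float) output.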
%macro SWAP32 1
%if mmsize > 16 || cpuflag(sse4)
    pshufb m%1, [pb_shuffle32be]
%else
    psrlw  m4, m%1, 8
    psllw  m%1, 8
    por m%1, m4
    pshuflw m%1, m%1, (2 << 6 | 3 << 4 | 0 << 2 | 1 << 0)
    pshufhw m%1, m%1, (2 << 6 | 3 << 4 | 0 << 2 | 1 << 0)
%endif
%endmacro

; 1 - dest
; 2 - source
; 3 - depth
; 4 - is big endian
%macro STORE_PIXELS 4
%if %3 > 16
    %if %4
        SWAP32 %2
    %endif
    mov ptrq, %1
    movu [ptrq + xq*4], m%2
%elif %3 > 8
    STORE16 %1, %2, %4
%else
    STORE8 %1, %2
%endif
%endmacro

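; 1 - dest
; 2 - source
; 3 - multiplier
; Signed 32-bit multiply keeping the low 32 bits of each product: pmulld where
; available, otherwise emulated with two pmuludq passes over the even/odd
; lanes merged via punpckldq (the fallback clobbers m7 and m8).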
%macro PMULLO 3
%if cpuflag(sse4) || mmsize > 16
    pmulld %1, %2, %3
%else
    %ifidni %1, %2
    %else
        mova %1, %2
    %endif
    pshufd m7, %1, (2 << 6 | 3 << 4 | 0 << 2 | 1 << 0) ; 0xb1
    pshufd m8, %3, (2 << 6 | 3 << 4 | 0 << 2 | 1 << 0) ; 0xb1
    pmuludq m7, m8
    pshufd  m7, m7, (3 << 6 | 1 << 4 | 2 << 2 | 0 << 0) ; 0xd8
    pmuludq %1, %3
    pshufd  %1, %1, (3 << 6 | 1 << 4 | 2 << 2 | 0 << 0) ; 0xd8
    punpckldq %1, m7
%endif
%endmacro

; 1 - name
; 2 - depth
; 3 - has alpha
; 4 - is big endian
; 5 - is float
%macro yuv2gbrp_fn 5
%define DEPTH %2
%define HAS_ALPHA %3
%define IS_BE %4
%define FLOAT %5
%define SH (22 + 8 - DEPTH)

%if DEPTH >= 16
    %define RGB_SHIFT 14
    %define A_SHIFT 14
%elif 22 != SH
    %define RGB_SHIFT SH
    %define A_SHIFT (SH-3)
%else
    %define RGB_SHIFT 22
    %define A_SHIFT 19
%endif

%if DEPTH >= 16
    %define YUV_SHIFT 14
    %define Y_START  m9
    %define Y_ROUND [pd_yuv2gbrp16_round13]
    %define UV_START m9
    %define A_START  m9
    %define A_CLIP2P 30
%else
    %define YUV_SHIFT 10
    %define Y_START  [pd_yuv2gbrp_y_start]
    %define Y_ROUND  m9
    %define UV_START [pd_yuv2gbrp_uv_start]
    %define A_START  [pd_yuv2gbrp_a_start]
    %define A_CLIP2P 27
%endif

cglobal yuv2%1_full_X, 12, 14, 16, ptr, lumFilter, lumSrcx, lumFilterSize, chrFilter, chrUSrcx, chrVSrcx, chrFilterSize, alpSrcx, dest, dstW, y, x, j
    VBROADCASTSS m10, dword [ptrq + SwsContext.yuv2rgb_y_offset]
    VBROADCASTSS m11, dword [ptrq + SwsContext.yuv2rgb_y_coeff]
    VBROADCASTSS m12, dword [ptrq + SwsContext.yuv2rgb_v2r_coeff]
    VBROADCASTSS m13, dword [ptrq + SwsContext.yuv2rgb_v2g_coeff]
    VBROADCASTSS m14, dword [ptrq + SwsContext.yuv2rgb_u2g_coeff]
    VBROADCASTSS m15, dword [ptrq + SwsContext.yuv2rgb_u2b_coeff]

%if DEPTH >= 16
    movu m9, [pd_yuv2gbrp16_start]
%else
    mov xq, (1 << (SH-1))
    movq xm9, xq
    VBROADCASTSS m9, xm9
%endif
    xor xq, xq

    %%loop_x:
        movu Y, Y_START
        movu U, UV_START
        movu V, UV_START

        xor jq, jq
        %%loop_luma:
            movsx ptrd, word [lumFilterq + jq*2]
            movd xm0, ptrd
            VBROADCASTSS m0, xm0
            LOAD_PIXELS 1, lumSrcxq, DEPTH
            PMULLO m1, m1, m0
            paddd Y, m1
            inc jd
            cmp jd, lumFilterSized
            jl %%loop_luma

%if HAS_ALPHA
        cmp alpSrcxq, 0
        je %%skip_alpha_load
            xor jq, jq
            movu A, A_START
            %%loop_alpha:
                movsx ptrd, word [lumFilterq + jq*2]
                movd xm0, ptrd
                VBROADCASTSS m0, xm0
                LOAD_PIXELS 1, alpSrcxq, DEPTH
                PMULLO m1, m1, m0
                paddd A, m1
                inc jd
                cmp jd, lumFilterSized
                jl %%loop_alpha
%if DEPTH >= 16
            psrad A, 1
            paddd A, [pd_yuv2gbrp16_a_offset]
%endif
        %%skip_alpha_load:
%endif
        xor jq, jq
        %%loop_chr:
            movsx ptrd, word [chrFilterq + jq*2]
            movd xm0, ptrd
            VBROADCASTSS m0, xm0
            LOAD_PIXELS 1, chrUSrcxq, DEPTH
            LOAD_PIXELS 2, chrVSrcxq, DEPTH
            PMULLO m1, m1, m0
            PMULLO m2, m2, m0
            paddd U, m1
            paddd V, m2
            inc jd
            cmp jd, chrFilterSized
            jl %%loop_chr

        psrad Y, YUV_SHIFT
%if  DEPTH >= 16
        paddd Y, [pd_yuv2gbrp16_offset]
%endif
        psrad U, YUV_SHIFT
        psrad V, YUV_SHIFT

        psubd  Y, m10    ; yuv2rgb_y_offset
        PMULLO Y, Y, m11 ; yuv2rgb_y_coeff
        paddd  Y, Y_ROUND

        PMULLO R, V, m12 ; yuv2rgb_v2r_coeff
        PMULLO B, U, m15 ; yuv2rgb_u2b_coeff

        PMULLO U, U, m14 ; yuv2rgb_u2g_coeff
        PMULLO V, V, m13 ; yuv2rgb_v2g_coeff
        paddd G, U, V
        paddd R, Y
        paddd G, Y
        paddd B, Y

%if  DEPTH < 16
        CLIPP2 R, 30
        CLIPP2 G, 30
        CLIPP2 B, 30
%endif

        psrad R, RGB_SHIFT
        psrad G, RGB_SHIFT
        psrad B, RGB_SHIFT

%if  DEPTH >= 16
        paddd R, [pd_yuv2gbrp_debias]
        paddd G, [pd_yuv2gbrp_debias]
        paddd B, [pd_yuv2gbrp_debias]

        CLIPP2 R, 16
        CLIPP2 G, 16
        CLIPP2 B, 16
%endif

%if FLOAT
        cvtdq2ps R, R
        cvtdq2ps G, G
        cvtdq2ps B, B
        mulps R, [pd_65535_invf]
        mulps G, [pd_65535_invf]
        mulps B, [pd_65535_invf]
%endif
        STORE_PIXELS [destq +  0], 1, DEPTH, IS_BE ; G
        STORE_PIXELS [destq +  8], 2, DEPTH, IS_BE ; B
        STORE_PIXELS [destq + 16], 0, DEPTH, IS_BE ; R

%if HAS_ALPHA
        cmp alpSrcxq, 0
        je %%skip_alpha_store
            CLIPP2 A, A_CLIP2P
            psrad A, A_SHIFT
%if FLOAT
            cvtdq2ps A, A
            mulps A, [pd_65535_invf]
%endif
            STORE_PIXELS [destq + 24], 3, DEPTH, IS_BE
        %%skip_alpha_store:
%endif
        add xq, mmsize/4
        cmp xd, dstWd
        jl %%loop_x

    RET
%endmacro

%macro yuv2gbrp_fn_decl 2
INIT_%1 %2
yuv2gbrp_fn gbrp,        8, 0, 0, 0
yuv2gbrp_fn gbrap,       8, 1, 0, 0
yuv2gbrp_fn gbrp9le,     9, 0, 0, 0
yuv2gbrp_fn gbrp10le,   10, 0, 0, 0
yuv2gbrp_fn gbrap10le,  10, 1, 0, 0
yuv2gbrp_fn gbrp12le,   12, 0, 0, 0
yuv2gbrp_fn gbrap12le,  12, 1, 0, 0
yuv2gbrp_fn gbrp14le,   14, 0, 0, 0
yuv2gbrp_fn gbrp16le,   16, 0, 0, 0
yuv2gbrp_fn gbrap16le,  16, 1, 0, 0
yuv2gbrp_fn gbrpf32le,  32, 0, 0, 1
yuv2gbrp_fn gbrapf32le, 32, 1, 0, 1

yuv2gbrp_fn gbrp9be,     9, 0, 1, 0
yuv2gbrp_fn gbrp10be,   10, 0, 1, 0
yuv2gbrp_fn gbrap10be,  10, 1, 1, 0
yuv2gbrp_fn gbrp12be,   12, 0, 1, 0
yuv2gbrp_fn gbrap12be,  12, 1, 1, 0
yuv2gbrp_fn gbrp14be,   14, 0, 1, 0
yuv2gbrp_fn gbrp16be,   16, 0, 1, 0
yuv2gbrp_fn gbrap16be,  16, 1, 1, 0
yuv2gbrp_fn gbrpf32be,  32, 0, 1, 1
yuv2gbrp_fn gbrapf32be, 32, 1, 1, 1
%endmacro

yuv2gbrp_fn_decl XMM, sse2
yuv2gbrp_fn_decl XMM, sse4

%if HAVE_AVX2_EXTERNAL
yuv2gbrp_fn_decl YMM, avx2
%endif

%endif ; ARCH_X86_64