;*****************************************************************************
;* x86-optimized functions for colorspace filter
;*
;* Copyright (C) 2016 Ronald S. Bultje <rsbultje@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

pw_1: times 8 dw 1
pw_2: times 8 dw 2
pw_4: times 8 dw 4
pw_8: times 8 dw 8
pw_16: times 8 dw 16
pw_64: times 8 dw 64
pw_128: times 8 dw 128
pw_256: times 8 dw 256
pw_512: times 8 dw 512
pw_1023: times 8 dw 1023
pw_1024: times 8 dw 1024
pw_2048: times 8 dw 2048
pw_4095: times 8 dw 4095
pw_8192: times 8 dw 8192
pw_16384: times 8 dw 16384

pd_1: times 4 dd 1
pd_2: times 4 dd 2
pd_128: times 4 dd 128
pd_512: times 4 dd 512
pd_2048: times 4 dd 2048
pd_8192: times 4 dd 8192
pd_32768: times 4 dd 32768
pd_131072: times 4 dd 131072

SECTION .text

; void ff_yuv2yuv_420p8to8_sse2(uint8_t *yuv_out[3], ptrdiff_t yuv_out_stride[3],
;                               uint8_t *yuv_in[3], ptrdiff_t yuv_in_stride[3],
;                               int w, int h, const int16_t yuv2yuv_coeffs[3][3][8],
;                               const int16_t yuv_offset[2][8])

%if ARCH_X86_64
%macro YUV2YUV_FN 4 ; in_bitdepth, out_bitdepth, log2_chroma_w (horiz), log2_chroma_h (vert)

%assign %%sh (14 + %1 - %2)
%assign %%rnd (1 << (%%sh - 1))
%assign %%uvinoff (128 << (%1 - 8))
%assign %%uvoutoff (128 << (%2 - 8))
%if %3 == 0
%assign %%ss 444
%elif %4 == 0
%assign %%ss 422
%else ; %4 == 1
%assign %%ss 420
%endif ; %3/%4
%if %2 != 8
%assign %%maxval (1 << %2) - 1
%endif ; %2 != 8

%assign %%ypsh %%sh - 1
%if %%ypsh > 14
%assign %%yoffsh %%ypsh - 13
%assign %%ypsh 14
%else
%assign %%yoffsh 1
%endif
%assign %%yprnd (1 << (%%yoffsh - 1))
%assign %%ypmul (1 << %%ypsh)
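; the luma path splits %%sh into %%ypsh + %%yoffsh so that the 2^%%ypsh
; multiplier still fits in a signed 16-bit pmaddwd operand. Worked example
; (editorial) for 12-bit in, 8-bit out: %%sh = 18, so %%ypsh is capped at 14,
; %%yoffsh = 4, and %%yprnd << %%ypsh = 8 << 14 = 1 << (%%sh - 1), the usual
; half-step rounding term.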

cglobal yuv2yuv_ %+ %%ss %+ p%1to%2, 8, 14, 16, 0 - (4 * mmsize), \
                                     yo, yos, yi, yis, w, h, c, yoff, ui, vi, uo, vo
%if %3 == 1
    inc             wd
    sar             wd, 1
%if %4 == 1
    inc             hd
    sar             hd, 1
%endif ; %4 == 1
%endif ; %3 == 1
    mov [rsp+3*mmsize+0], wd
    mov [rsp+3*mmsize+4], hd

    mova           m10, [cq]
    pxor           m11, m11
    mova           m12, [pd_ %+ %%uvoutoff]
    pslld          m12, %%sh
    paddd          m12, [pd_ %+ %%rnd]
    mova           m13, [pw_ %+ %%uvinoff]
    mova           m14, [yoffq+ 0]      ; y_off_in
    mova           m15, [yoffq+16]      ; y_off_out
%if %%yoffsh != 0
    psllw          m15, %%yoffsh
%endif
    paddw          m15, [pw_ %+ %%yprnd]
    punpcklwd      m10, m15
    mova           m15, [pw_ %+ %%ypmul]
    movh            m0, [cq+1*16]       ; cyu
    movh            m1, [cq+2*16]       ; cyv
    movh            m2, [cq+4*16]       ; cuu
    movh            m3, [cq+5*16]       ; cuv
    movh            m4, [cq+7*16]       ; cvu
    movh            m5, [cq+8*16]       ; cvv
    punpcklwd       m0, m1
    punpcklwd       m2, m3
    punpcklwd       m4, m5
    mova [rsp+0*mmsize], m0
    mova [rsp+1*mmsize], m2
    mova [rsp+2*mmsize], m4

    DEFINE_ARGS yo, yos, yi, yis, ui, vi, uo, vo, uis, vis, uos, vos, x, tmp

    mov            uiq, [yiq+gprsize*1]
    mov            viq, [yiq+gprsize*2]
    mov            yiq, [yiq+gprsize*0]
    mov            uoq, [yoq+gprsize*1]
    mov            voq, [yoq+gprsize*2]
    mov            yoq, [yoq+gprsize*0]
    mov           uisq, [yisq+gprsize*1]
    mov           visq, [yisq+gprsize*2]
    mov           yisq, [yisq+gprsize*0]
    mov           uosq, [yosq+gprsize*1]
    mov           vosq, [yosq+gprsize*2]
    mov           yosq, [yosq+gprsize*0]

.loop_v:
    xor             xq, xq

.loop_h:
%if %4 == 1
    lea           tmpq, [yiq+yisq]
%endif ; %4 == 1
%if %1 == 8
    movu            m0, [yiq+xq*(1<<%3)]        ; y00/01
%if %4 == 1
    movu            m2, [tmpq+xq*2]             ; y10/11
%endif ; %4 == 1
%if %3 == 1
    movh            m4, [uiq+xq]                ; u
    movh            m5, [viq+xq]                ; v
%else ; %3 != 1
    movu            m4, [uiq+xq]                ; u
    movu            m5, [viq+xq]                ; v
%endif ; %3 ==/!= 1
    punpckhbw       m1, m0, m11
    punpcklbw       m0, m11
%if %4 == 1
    punpckhbw       m3, m2, m11
    punpcklbw       m2, m11
%endif ; %4 == 1
%if %3 == 0
    punpckhbw       m2, m4, m11
    punpckhbw       m3, m5, m11
%endif ; %3 == 0
    punpcklbw       m4, m11
    punpcklbw       m5, m11
%else ; %1 != 8
    movu            m0, [yiq+xq*(2<<%3)]        ; y00/01
    movu            m1, [yiq+xq*(2<<%3)+mmsize] ; y00/01
%if %4 == 1
    movu            m2, [tmpq+xq*4]             ; y10/11
    movu            m3, [tmpq+xq*4+mmsize]      ; y10/11
%endif ; %4 == 1
    movu            m4, [uiq+xq*2]              ; u
    movu            m5, [viq+xq*2]              ; v
%if %3 == 0
    movu            m2, [uiq+xq*2+mmsize]
    movu            m3, [viq+xq*2+mmsize]
%endif ; %3 == 0
%endif ; %1 ==/!= 8
    psubw           m0, m14
    psubw           m1, m14
%if %4 == 1
    psubw           m2, m14
    psubw           m3, m14
%endif ; %4 == 1
    psubw           m4, m13
    psubw           m5, m13
%if %3 == 0
    psubw           m2, m13
    psubw           m3, m13
%endif ; %3 == 0

    SBUTTERFLY   wd, 4, 5, 6
    pmaddwd         m6, m4, [rsp+1*mmsize]
    pmaddwd         m7, m5, [rsp+1*mmsize]
%if %3 == 0
    SBUTTERFLY   wd, 2, 3, 8
    pmaddwd         m8, m2, [rsp+1*mmsize]
    pmaddwd         m9, m3, [rsp+1*mmsize]
%else ; %3 != 0
    pmaddwd         m8, m4, [rsp+2*mmsize]
    pmaddwd         m9, m5, [rsp+2*mmsize]
%endif
    paddd           m6, m12
    paddd           m7, m12
    paddd           m8, m12
    paddd           m9, m12
    psrad           m6, %%sh
    psrad           m7, %%sh
    psrad           m8, %%sh
    psrad           m9, %%sh
    packssdw        m6, m7
    packssdw        m8, m9
%if %2 == 8
    packuswb        m6, m8
%if %3 == 0
    movu      [uoq+xq], m6
%else ; %3 != 0
    movh      [uoq+xq], m6
    movhps    [voq+xq], m6
%endif ; %3 ==/!= 0
%else ; %2 != 8
    CLIPW           m6, m11, [pw_ %+ %%maxval]
    CLIPW           m8, m11, [pw_ %+ %%maxval]
    movu    [uoq+xq*2], m6
%if %3 == 0
    movu    [uoq+xq*2+mmsize], m8
%else ; %3 != 0
    movu    [voq+xq*2], m8
%endif ; %3 ==/!= 0
%endif ; %2 ==/!= 8

%if %3 == 0
    pmaddwd         m6, m4, [rsp+2*mmsize]
    pmaddwd         m7, m5, [rsp+2*mmsize]
    pmaddwd         m8, m2, [rsp+2*mmsize]
    pmaddwd         m9, m3, [rsp+2*mmsize]
    paddd           m6, m12
    paddd           m7, m12
    paddd           m8, m12
    paddd           m9, m12
    psrad           m6, %%sh
    psrad           m7, %%sh
    psrad           m8, %%sh
    psrad           m9, %%sh
    packssdw        m6, m7
    packssdw        m8, m9
%if %2 == 8
    packuswb        m6, m8
    movu      [voq+xq], m6
%else ; %2 != 8
    CLIPW           m6, m11, [pw_ %+ %%maxval]
    CLIPW           m8, m11, [pw_ %+ %%maxval]
    movu    [voq+xq*2], m6
    movu    [voq+xq*2+mmsize], m8
%endif ; %2 ==/!= 8
%endif ; %3 == 0

    pmaddwd         m4, [rsp+0*mmsize]
    pmaddwd         m5, [rsp+0*mmsize]          ; uv_val
%if %3 == 0
    pmaddwd         m2, [rsp+0*mmsize]
    pmaddwd         m3, [rsp+0*mmsize]
%endif ; %3 == 0

    ; unpack y pixels with m15 (shifted round + offset), then multiply
    ; by m10, add uv pixels, and we're done!
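    ; (in full: each dword lane is pmaddwd of the word pair (y, 2^%%ypsh)
    ; with (cy, (y_off_out << %%yoffsh) + %%yprnd), which expands to
    ; cy*y + (y_off_out << %%sh) + (1 << (%%sh - 1)); adding uv_val then
    ; completes the matrix row before the final psrad by %%sh.)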
%if %3 == 1
    punpckhdq       m8, m4, m4
    punpckldq       m4, m4
    punpckhdq       m9, m5, m5
    punpckldq       m5, m5
%else ; %3 != 1
    SWAP             8, 5, 2
    SWAP             3, 9
%endif ; %3 ==/!= 1
%if %4 == 1
    punpckhwd       m6, m2, m15
    punpcklwd       m2, m15
    punpckhwd       m7, m3, m15
    punpcklwd       m3, m15
    pmaddwd         m2, m10
    pmaddwd         m6, m10
    pmaddwd         m3, m10
    pmaddwd         m7, m10
    paddd           m2, m4
    paddd           m6, m8
    paddd           m3, m5
    paddd           m7, m9
    psrad           m2, %%sh
    psrad           m6, %%sh
    psrad           m3, %%sh
    psrad           m7, %%sh
    packssdw        m2, m6
    packssdw        m3, m7

    lea           tmpq, [yoq+yosq]
%if %2 == 8
    packuswb        m2, m3
    movu   [tmpq+xq*2], m2
%else ; %2 != 8
    CLIPW           m2, m11, [pw_ %+ %%maxval]
    CLIPW           m3, m11, [pw_ %+ %%maxval]
    movu   [tmpq+xq*4], m2
    movu [tmpq+xq*4+mmsize], m3
%endif ; %2 ==/!= 8
%endif ; %4 == 1

    punpckhwd       m6, m0, m15
    punpcklwd       m0, m15
    punpckhwd       m7, m1, m15
    punpcklwd       m1, m15
    pmaddwd         m0, m10
    pmaddwd         m6, m10
    pmaddwd         m1, m10
    pmaddwd         m7, m10
    paddd           m0, m4
    paddd           m6, m8
    paddd           m1, m5
    paddd           m7, m9
    psrad           m0, %%sh
    psrad           m6, %%sh
    psrad           m1, %%sh
    psrad           m7, %%sh
    packssdw        m0, m6
    packssdw        m1, m7

%if %2 == 8
    packuswb        m0, m1
    movu    [yoq+xq*(1<<%3)], m0
%else ; %2 != 8
    CLIPW           m0, m11, [pw_ %+ %%maxval]
    CLIPW           m1, m11, [pw_ %+ %%maxval]
    movu  [yoq+xq*(2<<%3)], m0
    movu [yoq+xq*(2<<%3)+mmsize], m1
%endif ; %2 ==/!= 8

    add             xq, mmsize >> %3
    cmp             xd, dword [rsp+3*mmsize+0]
    jl .loop_h

%if %4 == 1
    lea            yiq, [yiq+yisq*2]
    lea            yoq, [yoq+yosq*2]
%else ; %4 != 1
    add            yiq, yisq
    add            yoq, yosq
%endif ; %4 ==/!= 1
    add            uiq, uisq
    add            viq, visq
    add            uoq, uosq
    add            voq, vosq
    dec dword [rsp+3*mmsize+4]
    jg .loop_v

    RET
%endmacro

%macro YUV2YUV_FNS 2 ; ss_w, ss_h
YUV2YUV_FN  8,  8, %1, %2
YUV2YUV_FN 10,  8, %1, %2
YUV2YUV_FN 12,  8, %1, %2
YUV2YUV_FN  8, 10, %1, %2
YUV2YUV_FN 10, 10, %1, %2
YUV2YUV_FN 12, 10, %1, %2
YUV2YUV_FN  8, 12, %1, %2
YUV2YUV_FN 10, 12, %1, %2
YUV2YUV_FN 12, 12, %1, %2
%endmacro

INIT_XMM sse2
YUV2YUV_FNS 0, 0
YUV2YUV_FNS 1, 0
YUV2YUV_FNS 1, 1

; void ff_yuv2rgb_420p8_sse2(int16_t *rgb[3], ptrdiff_t rgb_stride,
;                            uint8_t *yuv[3], ptrdiff_t yuv_stride[3],
;                            int w, int h, const int16_t yuv2rgb_coeffs[3][3][8],
;                            const int16_t yuv_offset[8])
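;
; In outline (a sketch; "depth" stands for the %1 parameter):
;   sh = depth - 1,  rnd = 1 << (sh - 1),  uvoff = 1 << (depth - 1)
;   y' = y - yuv_offset,  u' = u - uvoff,  v' = v - uvoff
;   r = (cy*y' + crv*v'          + rnd) >> sh
;   g = (cy*y' + cgu*u' + cgv*v' + rnd) >> sh
;   b = (cy*y' + cbu*u'          + rnd) >> sh
; results are stored as signed 16-bit intermediates in the rgb planes.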
%macro YUV2RGB_FN 3 ; depth, log2_chroma_w (horiz), log2_chroma_h (vert)
%assign %%sh (%1 - 1)
%assign %%rnd (1 << (%%sh - 1))
%assign %%uvoff (1 << (%1 - 1))
%if %2 == 0
%assign %%ss 444
%elif %3 == 0
%assign %%ss 422
%else ; %3 == 1
%assign %%ss 420
%endif ; %2/%3

cglobal yuv2rgb_ %+ %%ss %+ p%1, 8, 14, 16, 0 - 8 * mmsize, \
                                rgb, rgbs, yuv, yuvs, ww, h, c, yoff
%if %2 == 1
    inc            wwd
    sar            wwd, 1
%endif ; %2 == 1
%if %3 == 1
    inc             hd
    sar             hd, 1
%endif ; %3 == 1
    pxor           m11, m11
    mova           m15, [yoffq]                 ; yoff
    movh           m14, [cq+  0]                ; cy
    movh           m10, [cq+ 32]                ; crv
    movh           m13, [cq+112]                ; cbu
    movh           m12, [cq+ 64]                ; cgu
    movh            m9, [cq+ 80]                ; cgv
    punpcklwd      m14, [pw_ %+ %%rnd]          ; cy, rnd
    punpcklwd      m13, m11                     ; cbu, 0
    punpcklwd      m11, m10                     ; 0, crv
    punpcklwd      m12, m9                      ; cgu, cgv
    mova [rsp+0*mmsize], m11
    mova [rsp+1*mmsize], m12
    mova [rsp+2*mmsize], m13
    mova [rsp+3*mmsize], m14
    pxor           m14, m14

    DEFINE_ARGS r, rgbs, y, ys, ww, h, g, b, u, v, us, vs, x, tmp

    mov             gq, [rq+1*gprsize]
    mov             bq, [rq+2*gprsize]
    mov             rq, [rq+0*gprsize]
    mov             uq, [yq+1*gprsize]
    mov             vq, [yq+2*gprsize]
    mov             yq, [yq+0*gprsize]
    mov            usq, [ysq+1*gprsize]
    mov            vsq, [ysq+2*gprsize]
    mov            ysq, [ysq+0*gprsize]

.loop_v:
    xor             xq, xq

.loop_h:
%if %3 == 1
    lea           tmpq, [yq+ysq]
%endif ; %3 == 1
%if %1 == 8
    movu            m0, [yq+xq*(1<<%2)]
%if %3 == 1
    movu            m2, [tmpq+xq*2]
%endif ; %3 == 1
%if %2 == 1
    movh            m4, [uq+xq]
    movh            m5, [vq+xq]
%else ; %2 != 1
    movu            m4, [uq+xq]
    movu            m5, [vq+xq]
%endif ; %2 ==/!= 1
    punpckhbw       m1, m0, m14
    punpcklbw       m0, m14
%if %3 == 1
    punpckhbw       m3, m2, m14
    punpcklbw       m2, m14
%endif ; %3 == 1
%if %2 == 0
    punpckhbw       m2, m4, m14
    punpckhbw       m3, m5, m14
%endif ; %2 == 0
    punpcklbw       m4, m14
    punpcklbw       m5, m14
%else ; %1 != 8
    movu            m0, [yq+xq*(2<<%2)]
    movu            m1, [yq+xq*(2<<%2)+mmsize]
%if %3 == 1
    movu            m2, [tmpq+xq*4]
    movu            m3, [tmpq+xq*4+mmsize]
%endif ; %3 == 1
    movu            m4, [uq+xq*2]
    movu            m5, [vq+xq*2]
%if %2 == 0
    movu            m2, [uq+xq*2+mmsize]
    movu            m3, [vq+xq*2+mmsize]
%endif ; %2 == 0
%endif ; %1 ==/!= 8
    psubw           m0, m15
    psubw           m1, m15
%if %3 == 1
    psubw           m2, m15
    psubw           m3, m15
%endif ; %3 == 1
    psubw           m4, [pw_ %+ %%uvoff]
    psubw           m5, [pw_ %+ %%uvoff]
    SBUTTERFLY   wd, 4, 5, 6
%if %2 == 0
    psubw           m2, [pw_ %+ %%uvoff]
    psubw           m3, [pw_ %+ %%uvoff]
    SBUTTERFLY   wd, 2, 3, 6
%endif ; %2 == 0

    ; calculate y+rnd full-resolution [0-3,6-9]
    punpckhwd       m6, m0, [pw_1]              ; y, 1
    punpcklwd       m0, [pw_1]                  ; y, 1
    punpckhwd       m7, m1, [pw_1]              ; y, 1
    punpcklwd       m1, [pw_1]                  ; y, 1
    pmaddwd         m0, [rsp+3*mmsize]
    pmaddwd         m6, [rsp+3*mmsize]
    pmaddwd         m1, [rsp+3*mmsize]
    pmaddwd         m7, [rsp+3*mmsize]
%if %3 == 1
    punpckhwd       m8, m2, [pw_1]              ; y, 1
    punpcklwd       m2, [pw_1]                  ; y, 1
    punpckhwd       m9, m3, [pw_1]              ; y, 1
    punpcklwd       m3, [pw_1]                  ; y, 1
    pmaddwd         m2, [rsp+3*mmsize]
    pmaddwd         m8, [rsp+3*mmsize]
    pmaddwd         m3, [rsp+3*mmsize]
    pmaddwd         m9, [rsp+3*mmsize]
    mova [rsp+4*mmsize], m2
    mova [rsp+5*mmsize], m8
    mova [rsp+6*mmsize], m3
    mova [rsp+7*mmsize], m9
%endif ; %3 == 1

    ; calculate r offsets (un-subsampled, then duplicate)
    pmaddwd        m10, m4, [rsp+0*mmsize]
%if %2 == 1
    pmaddwd        m12, m5, [rsp+0*mmsize]
    punpckhdq      m11, m10, m10
    punpckldq      m10, m10
    punpckhdq      m13, m12, m12
    punpckldq      m12, m12
%else ; %2 != 1
    pmaddwd        m11, m5, [rsp+0*mmsize]
    pmaddwd        m12, m2, [rsp+0*mmsize]
    pmaddwd        m13, m3, [rsp+0*mmsize]
%endif ; %2 ==/!= 1
%if %3 == 1
    paddd           m2, m10, [rsp+4*mmsize]
    paddd           m3, m11, [rsp+5*mmsize]
    paddd           m8, m12, [rsp+6*mmsize]
    paddd           m9, m13, [rsp+7*mmsize]
%endif
    paddd          m10, m0
    paddd          m11, m6
    paddd          m12, m1
    paddd          m13, m7
%if %3 == 1
    psrad           m2, %%sh
    psrad           m3, %%sh
    psrad           m8, %%sh
    psrad           m9, %%sh
%endif ; %3 == 1
    psrad          m10, %%sh
    psrad          m11, %%sh
    psrad          m12, %%sh
    psrad          m13, %%sh
%if %3 == 1
    lea           tmpq, [rq+rgbsq*2]
    packssdw        m2, m3
    packssdw        m8, m9
    mova [tmpq+xq*4], m2
    mova [tmpq+xq*4+mmsize], m8
%endif ; %3 == 1
    packssdw       m10, m11
    packssdw       m12, m13
    mova   [rq+xq*(2 << %2)], m10
    mova   [rq+xq*(2 << %2)+mmsize], m12

    ; calculate g offsets (un-subsampled, then duplicate)
    pmaddwd        m10, m4, [rsp+1*mmsize]
%if %2 == 1
    pmaddwd        m12, m5, [rsp+1*mmsize]
    punpckhdq      m11, m10, m10
    punpckldq      m10, m10
    punpckhdq      m13, m12, m12
    punpckldq      m12, m12
%else ; %2 != 1
    pmaddwd        m11, m5, [rsp+1*mmsize]
    pmaddwd        m12, m2, [rsp+1*mmsize]
    pmaddwd        m13, m3, [rsp+1*mmsize]
%endif ; %2 ==/!= 1
%if %3 == 1
    paddd           m2, m10, [rsp+4*mmsize]
    paddd           m3, m11, [rsp+5*mmsize]
    paddd           m8, m12, [rsp+6*mmsize]
    paddd           m9, m13, [rsp+7*mmsize]
%endif ; %3 == 1
    paddd          m10, m0
    paddd          m11, m6
    paddd          m12, m1
    paddd          m13, m7
%if %3 == 1
    psrad           m2, %%sh
    psrad           m3, %%sh
    psrad           m8, %%sh
    psrad           m9, %%sh
%endif ; %3 == 1
    psrad          m10, %%sh
    psrad          m11, %%sh
    psrad          m12, %%sh
    psrad          m13, %%sh
%if %3 == 1
    lea           tmpq, [gq+rgbsq*2]
    packssdw        m2, m3
    packssdw        m8, m9
    mova [tmpq+xq*4], m2
    mova [tmpq+xq*4+mmsize], m8
%endif ; %3 == 1
    packssdw       m10, m11
    packssdw       m12, m13
    mova   [gq+xq*(2 << %2)], m10
    mova   [gq+xq*(2 << %2)+mmsize], m12

    ; calculate b offsets (un-subsampled, then duplicate)
    pmaddwd         m4, [rsp+2*mmsize]
    pmaddwd         m5, [rsp+2*mmsize]
%if %2 == 1
    punpckhdq       m2, m4, m4
    punpckldq       m4, m4
    punpckhdq       m3, m5, m5
    punpckldq       m5, m5
%else ; %2 != 1
    pmaddwd         m2, [rsp+2*mmsize]
    pmaddwd         m3, [rsp+2*mmsize]
    SWAP             2, 5
%endif ; %2 ==/!= 1
    paddd           m0, m4
    paddd           m6, m2
    paddd           m1, m5
    paddd           m7, m3
%if %3 == 1
    paddd           m4, [rsp+4*mmsize]
    paddd           m2, [rsp+5*mmsize]
    paddd           m5, [rsp+6*mmsize]
    paddd           m3, [rsp+7*mmsize]
%endif ; %3 == 1
    psrad           m0, %%sh
    psrad           m6, %%sh
    psrad           m1, %%sh
    psrad           m7, %%sh
%if %3 == 1
    psrad           m4, %%sh
    psrad           m2, %%sh
    psrad           m5, %%sh
    psrad           m3, %%sh
%endif ; %3 == 1
    packssdw        m0, m6
    packssdw        m1, m7
    movu   [bq+xq*(2 << %2)], m0
    movu   [bq+xq*(2 << %2)+mmsize], m1
%if %3 == 1
    lea           tmpq, [bq+rgbsq*2]
    packssdw        m4, m2
    packssdw        m5, m3
    movu [tmpq+xq*4], m4
    movu [tmpq+xq*4+mmsize], m5
%endif ; %3 == 1

    add             xd, mmsize >> %2
    cmp             xd, wwd
    jl .loop_h

    lea             rq, [rq+rgbsq*(2 << %3)]
    lea             gq, [gq+rgbsq*(2 << %3)]
    lea             bq, [bq+rgbsq*(2 << %3)]
%if %3 == 1
    lea             yq, [yq+ysq*2]
%else ; %3 != 1
    add             yq, ysq
%endif ; %3 ==/!= 1
    add             uq, usq
    add             vq, vsq
    dec             hd
    jg .loop_v

    RET
%endmacro

%macro YUV2RGB_FNS 2
YUV2RGB_FN  8, %1, %2
YUV2RGB_FN 10, %1, %2
YUV2RGB_FN 12, %1, %2
%endmacro

INIT_XMM sse2
YUV2RGB_FNS 0, 0
YUV2RGB_FNS 1, 0
YUV2RGB_FNS 1, 1

%macro RGB2YUV_FN 3 ; depth, log2_chroma_w (horiz), log2_chroma_h (vert)
%assign %%sh 29 - %1
%assign %%rnd (1 << (%%sh - 15))
%assign %%uvrnd ((128 << (%1 - 8)) << (%%sh - 14))
%if %1 != 8
%assign %%maxval ((1 << %1) - 1)
%endif ; %1 != 8
%if %2 == 0
%assign %%ss 444
%elif %3 == 0
%assign %%ss 422
%else ; %3 == 1
%assign %%ss 420
%endif ; %2/%3
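;
; Inverse of the yuv2rgb functions above; in outline (a sketch, with "depth"
; standing for the %1 parameter):
;   sh = 29 - depth,  uvoff = 128 << (depth - 8)
;   y = (cry*r + cgy*g + cby*b + (yuv_offset << sh) + (1 << (sh-1))) >> sh
;   u = (cru*r + cgu*g + cbu*b + (uvoff << sh)      + (1 << (sh-1))) >> sh
;   v = (crv*r + cgv*g + cbv*b + (uvoff << sh)      + (1 << (sh-1))) >> sh
; the offset and rounding terms are folded into the coefficient words and
; recovered through the pw_16384 (= 2^14) pmaddwd multiplier in the loop.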

cglobal rgb2yuv_ %+ %%ss %+ p%1, 8, 14, 16, 0 - 6 * mmsize, \
                                 yuv, yuvs, rgb, rgbs, ww, h, c, off
%if %2 == 1
    inc            wwd
    sar            wwd, 1
%endif ; %2 == 1
%if %3 == 1
    inc             hd
    sar             hd, 1
%endif ; %3 == 1

    ; prepare coeffs
    movh            m8, [offq]
    movh            m9, [pw_ %+ %%uvrnd]
    psllw           m8, %%sh - 14
    paddw           m9, [pw_ %+ %%rnd]
    paddw           m8, [pw_ %+ %%rnd]
    movh            m0, [cq+  0]
    movh            m1, [cq+ 16]
    movh            m2, [cq+ 32]
    movh            m3, [cq+ 48]
    movh            m4, [cq+ 64]
    movh            m5, [cq+ 80]
    movh            m6, [cq+112]
    movh            m7, [cq+128]
    punpcklwd       m0, m1
    punpcklwd       m2, m8
    punpcklwd       m3, m4
    punpcklwd       m4, m5, m9
    punpcklwd       m5, m6
    punpcklwd       m7, m9

    mova [rsp+0*mmsize], m0                 ; cry, cgy
    mova [rsp+1*mmsize], m2                 ; cby, off + rnd
    mova [rsp+2*mmsize], m3                 ; cru, cgu
    mova [rsp+3*mmsize], m4                 ; cburv, uvoff + rnd
    mova [rsp+4*mmsize], m5                 ; cburv, cgv
    mova [rsp+5*mmsize], m7                 ; cbv, uvoff + rnd
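    ; note: coefficients are interleaved as word pairs so a single pmaddwd
    ; applies two matrix taps at once, e.g. (r,g) . (cry,cgy) = cry*r + cgy*g.
    ; pairing b with pw_16384 (= 2^14) against ((off << (%%sh-14)) + %%rnd)
    ; yields cby*b + (off << %%sh) + (1 << (%%sh-1)) in one multiply-add.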

    DEFINE_ARGS y, ys, r, rgbs, ww, h, u, v, us, vs, g, b, tmp, x
    mov             gq, [rq+gprsize*1]
    mov             bq, [rq+gprsize*2]
    mov             rq, [rq+gprsize*0]
    mov             uq, [yq+gprsize*1]
    mov             vq, [yq+gprsize*2]
    mov             yq, [yq+gprsize*0]
    mov            usq, [ysq+gprsize*1]
    mov            vsq, [ysq+gprsize*2]
    mov            ysq, [ysq+gprsize*0]

    pxor           m15, m15
.loop_v:
    xor             xd, xd

.loop_h:
    ; top line y
    mova            m0, [rq+xq*(2<<%2)]
    mova            m3, [rq+xq*(2<<%2)+mmsize]
    mova            m1, [gq+xq*(2<<%2)]
    mova            m4, [gq+xq*(2<<%2)+mmsize]
    mova            m2, [bq+xq*(2<<%2)]
    mova            m5, [bq+xq*(2<<%2)+mmsize]

    punpcklwd       m6, m0, m1
    punpckhwd       m7, m0, m1
    punpcklwd       m8, m3, m4
    punpckhwd       m9, m3, m4
    punpcklwd      m10, m2, [pw_16384]
    punpckhwd      m11, m2, [pw_16384]
    punpcklwd      m12, m5, [pw_16384]
    punpckhwd      m13, m5, [pw_16384]

    pmaddwd         m6, [rsp+0*mmsize]
    pmaddwd         m7, [rsp+0*mmsize]
    pmaddwd         m8, [rsp+0*mmsize]
    pmaddwd         m9, [rsp+0*mmsize]
    pmaddwd        m10, [rsp+1*mmsize]
    pmaddwd        m11, [rsp+1*mmsize]
    pmaddwd        m12, [rsp+1*mmsize]
    pmaddwd        m13, [rsp+1*mmsize]
    paddd           m6, m10
    paddd           m7, m11
    paddd           m8, m12
    paddd           m9, m13
    psrad           m6, %%sh
    psrad           m7, %%sh
    psrad           m8, %%sh
    psrad           m9, %%sh
    packssdw        m6, m7
    packssdw        m8, m9
%if %1 == 8
    packuswb        m6, m8
    movu [yq+xq*(1<<%2)], m6
%else
    CLIPW           m6, m15, [pw_ %+ %%maxval]
    CLIPW           m8, m15, [pw_ %+ %%maxval]
    movu [yq+xq*(2<<%2)], m6
    movu [yq+xq*(2<<%2)+mmsize], m8
%endif

%if %2 == 1
    ; subsampling cached data
    pmaddwd         m0, [pw_1]
    pmaddwd         m1, [pw_1]
    pmaddwd         m2, [pw_1]
    pmaddwd         m3, [pw_1]
    pmaddwd         m4, [pw_1]
    pmaddwd         m5, [pw_1]

%if %3 == 1
    ; bottom line y, r/g portion only
    lea           tmpq, [rgbsq+xq*2]
    mova            m6, [rq+tmpq*2]
    mova            m9, [rq+tmpq*2+mmsize]
    mova            m7, [gq+tmpq*2]
    mova           m10, [gq+tmpq*2+mmsize]
    mova            m8, [bq+tmpq*2]
    mova           m11, [bq+tmpq*2+mmsize]

    punpcklwd      m12, m6, m7
    punpckhwd      m13, m6, m7
    punpcklwd      m14, m9, m10
    punpckhwd      m15, m9, m10

    ; release two more registers
    pmaddwd         m6, [pw_1]
    pmaddwd         m7, [pw_1]
    pmaddwd         m9, [pw_1]
    pmaddwd        m10, [pw_1]
    paddd           m0, m6
    paddd           m3, m9
    paddd           m1, m7
    paddd           m4, m10

    ; bottom line y, b/rnd portion only
    punpcklwd       m6, m8,  [pw_16384]
    punpckhwd       m7, m8,  [pw_16384]
    punpcklwd       m9, m11, [pw_16384]
    punpckhwd      m10, m11, [pw_16384]

    pmaddwd        m12, [rsp+0*mmsize]
    pmaddwd        m13, [rsp+0*mmsize]
    pmaddwd        m14, [rsp+0*mmsize]
    pmaddwd        m15, [rsp+0*mmsize]
    pmaddwd         m6, [rsp+1*mmsize]
    pmaddwd         m7, [rsp+1*mmsize]
    pmaddwd         m9, [rsp+1*mmsize]
    pmaddwd        m10, [rsp+1*mmsize]
    paddd          m12, m6
    paddd          m13, m7
    paddd          m14, m9
    paddd          m15, m10
    psrad          m12, %%sh
    psrad          m13, %%sh
    psrad          m14, %%sh
    psrad          m15, %%sh
    packssdw       m12, m13
    packssdw       m14, m15
    lea           tmpq, [yq+ysq]
%if %1 == 8
    packuswb       m12, m14
    movu   [tmpq+xq*2], m12
%else
    pxor           m15, m15
    CLIPW          m12, m15, [pw_ %+ %%maxval]
    CLIPW          m14, m15, [pw_ %+ %%maxval]
    movu   [tmpq+xq*4], m12
    movu [tmpq+xq*4+mmsize], m14
%endif

    ; complete subsampling of r/g/b pixels for u/v
    pmaddwd         m8, [pw_1]
    pmaddwd        m11, [pw_1]
    paddd           m2, m8
    paddd           m5, m11
    paddd           m0, [pd_2]
    paddd           m1, [pd_2]
    paddd           m2, [pd_2]
    paddd           m3, [pd_2]
    paddd           m4, [pd_2]
    paddd           m5, [pd_2]
    psrad           m0, 2
    psrad           m1, 2
    psrad           m2, 2
    psrad           m3, 2
    psrad           m4, 2
    psrad           m5, 2
%else ; %3 != 1
    paddd           m0, [pd_1]
    paddd           m1, [pd_1]
    paddd           m2, [pd_1]
    paddd           m3, [pd_1]
    paddd           m4, [pd_1]
    paddd           m5, [pd_1]
    psrad           m0, 1
    psrad           m1, 1
    psrad           m2, 1
    psrad           m3, 1
    psrad           m4, 1
    psrad           m5, 1
%endif ; %3 ==/!= 1
    packssdw        m0, m3
    packssdw        m1, m4
    packssdw        m2, m5
%endif ; %2 == 1

    ; convert u/v pixels
    SBUTTERFLY   wd, 0, 1, 6
    punpckhwd       m6, m2, [pw_16384]
    punpcklwd       m2, [pw_16384]

    pmaddwd         m7, m0, [rsp+2*mmsize]
    pmaddwd         m8, m1, [rsp+2*mmsize]
    pmaddwd         m9, m2, [rsp+3*mmsize]
    pmaddwd        m10, m6, [rsp+3*mmsize]
    pmaddwd         m0, [rsp+4*mmsize]
    pmaddwd         m1, [rsp+4*mmsize]
    pmaddwd         m2, [rsp+5*mmsize]
    pmaddwd         m6, [rsp+5*mmsize]
    paddd           m7, m9
    paddd           m8, m10
    paddd           m0, m2
    paddd           m1, m6
    psrad           m7, %%sh
    psrad           m8, %%sh
    psrad           m0, %%sh
    psrad           m1, %%sh
    packssdw        m7, m8
    packssdw        m0, m1
%if %2 == 1
%if %1 == 8
    packuswb        m7, m0
    movh       [uq+xq], m7
    movhps     [vq+xq], m7
%else
    CLIPW           m7, m15, [pw_ %+ %%maxval]
    CLIPW           m0, m15, [pw_ %+ %%maxval]
    movu     [uq+xq*2], m7
    movu     [vq+xq*2], m0
%endif
%else ; %2 != 1
    ; second set of u/v pixels
    SBUTTERFLY   wd, 3, 4, 6
    punpckhwd       m6, m5, [pw_16384]
    punpcklwd       m5, [pw_16384]

    pmaddwd         m8, m3, [rsp+2*mmsize]
    pmaddwd         m9, m4, [rsp+2*mmsize]
    pmaddwd        m10, m5, [rsp+3*mmsize]
    pmaddwd        m11, m6, [rsp+3*mmsize]
    pmaddwd         m3, [rsp+4*mmsize]
    pmaddwd         m4, [rsp+4*mmsize]
    pmaddwd         m5, [rsp+5*mmsize]
    pmaddwd         m6, [rsp+5*mmsize]
    paddd           m8, m10
    paddd           m9, m11
    paddd           m3, m5
    paddd           m4, m6
    psrad           m8, %%sh
    psrad           m9, %%sh
    psrad           m3, %%sh
    psrad           m4, %%sh
    packssdw        m8, m9
    packssdw        m3, m4

%if %1 == 8
    packuswb        m7, m8
    packuswb        m0, m3
    movu       [uq+xq], m7
    movu       [vq+xq], m0
%else
    CLIPW           m7, m15, [pw_ %+ %%maxval]
    CLIPW           m0, m15, [pw_ %+ %%maxval]
    CLIPW           m8, m15, [pw_ %+ %%maxval]
    CLIPW           m3, m15, [pw_ %+ %%maxval]
    movu     [uq+xq*2], m7
    movu [uq+xq*2+mmsize], m8
    movu     [vq+xq*2], m0
    movu [vq+xq*2+mmsize], m3
%endif
%endif ; %2 ==/!= 1

    add             xq, mmsize >> %2
    cmp             xd, wwd
    jl .loop_h

%if %3 == 0
    add             yq, ysq
%else ; %3 != 0
    lea             yq, [yq+ysq*2]
%endif ; %3 ==/!= 0
    add             uq, usq
    add             vq, vsq
    lea             rq, [rq+rgbsq*(2<<%3)]
    lea             gq, [gq+rgbsq*(2<<%3)]
    lea             bq, [bq+rgbsq*(2<<%3)]
    dec             hd
    jg .loop_v

    RET
%endmacro

%macro RGB2YUV_FNS 2
RGB2YUV_FN  8, %1, %2
RGB2YUV_FN 10, %1, %2
RGB2YUV_FN 12, %1, %2
%endmacro

INIT_XMM sse2
RGB2YUV_FNS 0, 0
RGB2YUV_FNS 1, 0
RGB2YUV_FNS 1, 1

; void ff_multiply3x3_sse2(int16_t *data[3], ptrdiff_t stride,
;                          int w, int h, const int16_t coeff[3][3][8])
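;
; In outline (a sketch): applies a 3x3 matrix in Q14 fixed point, in place,
; to three planes of int16_t samples:
;   out[i] = (coeff[i][0]*in[0] + coeff[i][1]*in[1] + coeff[i][2]*in[2]
;             + 8192) >> 14
; the pw_8192 rounding term rides along via pmaddwd: the third plane's
; words are paired with 1 (pw_1) and the third coefficient with 8192.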
INIT_XMM sse2
cglobal multiply3x3, 5, 7, 16, data, stride, ww, h, c
    movh            m0, [cq+  0]
    movh            m1, [cq+ 32]
    movh            m2, [cq+ 48]
    movh            m3, [cq+ 80]
    movh            m4, [cq+ 96]
    movh            m5, [cq+128]
    punpcklwd       m0, [cq+ 16]
    punpcklwd       m1, [pw_8192]
    punpcklwd       m2, [cq+ 64]
    punpcklwd       m3, [pw_8192]
    punpcklwd       m4, [cq+112]
    punpcklwd       m5, [pw_8192]

    DEFINE_ARGS data0, stride, ww, h, data1, data2, x
    shl        strideq, 1
    mov         data1q, [data0q+gprsize*1]
    mov         data2q, [data0q+gprsize*2]
    mov         data0q, [data0q+gprsize*0]

.loop_v:
    xor             xd, xd

.loop_h:
    mova            m6, [data0q+xq*2]
    mova            m7, [data1q+xq*2]
    mova            m8, [data2q+xq*2]
    SBUTTERFLY   wd, 6, 7, 9
    punpckhwd       m9, m8, [pw_1]
    punpcklwd       m8, [pw_1]

    pmaddwd        m10, m6, m0
    pmaddwd        m11, m7, m0
    pmaddwd        m12, m8, m1
    pmaddwd        m13, m9, m1
    paddd          m10, m12
    paddd          m11, m13
    psrad          m10, 14
    psrad          m11, 14

    pmaddwd        m12, m6, m2
    pmaddwd        m13, m7, m2
    pmaddwd        m14, m8, m3
    pmaddwd        m15, m9, m3
    paddd          m12, m14
    paddd          m13, m15
    psrad          m12, 14
    psrad          m13, 14

    pmaddwd         m6, m4
    pmaddwd         m7, m4
    pmaddwd         m8, m5
    pmaddwd         m9, m5
    paddd           m6, m8
    paddd           m7, m9
    psrad           m6, 14
    psrad           m7, 14

    packssdw       m10, m11
    packssdw       m12, m13
    packssdw        m6, m7

    mova [data0q+xq*2], m10
    mova [data1q+xq*2], m12
    mova [data2q+xq*2], m6

    add             xd, mmsize / 2
    cmp             xd, wwd
    jl .loop_h

    add         data0q, strideq
    add         data1q, strideq
    add         data2q, strideq
    dec             hd
    jg .loop_v

    RET
%endif