;******************************************************************************
;* V210 SIMD unpack
;* Copyright (c) 2011 Loren Merritt <lorenm@u.washington.edu>
;* Copyright (c) 2011 Kieran Kunhya <kieran@kunhya.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32

; for AVX2 version only
;
; v210_luma_permute is a vpermd index vector: result dwords 0-2 take source
; dwords 0-2 (low lane), result dwords 3-5 take source dwords 4-6 (high lane),
; and result dwords 6-7 both take source dword 7.  It is loaded with mova into
; a full-width register in the AVX2 path, hence the alignment requirement.
v210_luma_permute:     dd 0,1,2,4,5,6,7,7  ; 32-byte alignment required
; Per-lane pshufb mask: joins the three words at bytes 0-5 with the three
; words at bytes 8-13 and zeroes the top four bytes of the lane.
v210_chroma_shuf2:     db 0,1,2,3,4,5,8,9,10,11,12,13,-1,-1,-1,-1
; Per-lane pshufb mask applied to the blended luma words (drops the zero word
; in bytes 2-3 and 10-11, zeroes the top four bytes).
v210_luma_shuf_avx2:   db 0,1,4,5,6,7,8,9,12,13,14,15,-1,-1,-1,-1
; Per-lane pshufb mask applied to the blended chroma words (u words to the
; low half of the lane, v words to the high half).
v210_chroma_shuf_avx2: db 0,1,4,5,10,11,-1,-1,2,3,8,9,12,13,-1,-1

; Shared by all versions: per-word multipliers, 64 for the low word and 4 for
; the high word of every dword (see the pmullw / psrlw pair in the loop).
v210_mult:        dw 64,4,64,4,64,4,64,4
; pshufb masks for the 128-bit (shufps based) path.
v210_luma_shuf:   db 8,9,0,1,2,3,12,13,4,5,6,7,-1,-1,-1,-1
v210_chroma_shuf: db 0,1,8,9,6,7,-1,-1,2,3,4,5,12,13,-1,-1

SECTION .text

;------------------------------------------------------------------------------
; v210_planar_unpack <aligned|unaligned>
;
; Emits one function that splits packed source dwords into three planar
; 16-bit outputs (y, u, v).
;
; Macro argument:
;   unaligned  -> the source is read with movu
;   anything else (instantiated below as "aligned") -> the source is read
;                 with mova
;   Output stores do not depend on the argument.
;
; Register roles inside the generated function:
;   srcq          packed input pointer, advanced by mmsize per iteration
;   yq, uq, vq    output pointers, pre-advanced to the end of their data
;                 (y by 2*w bytes, u and v by w bytes each)
;   wq            negated width; counts up towards zero in steps of
;                 (mmsize*3)/8, i.e. 6 when mmsize is 16 and 12 when it is 32
;   m0            raw input, then bits 10..19 of every dword
;   m1            bits 0..9 / bits 20..29 of every dword, as 16-bit words
;   m2            luma result
;   m3            v210_mult
;   m4, m5        luma / chroma pshufb masks
;   m6, m7        AVX2 only: vpermd indices / second chroma pshufb mask
;
; The loop condition is tested at the bottom, so the body always runs at
; least once.
;
; Every store writes more bytes than the matching pointer advances per
; iteration (the upper lanes hold zero padding):
;   y:  mmsize bytes written,   2*(mmsize*3)/8 bytes advanced
;   u/v: 8 bytes (128-bit path) or 16 bytes (AVX2 path) written,
;        (mmsize*3)/8 bytes advanced
; NOTE(review): callers presumably have to provide padded output buffers and
; deal with widths that are not a multiple of the step - confirm against the
; C caller.
;------------------------------------------------------------------------------
%macro v210_planar_unpack 1

; v210_planar_unpack(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width)
cglobal v210_planar_unpack_%1, 5, 5, 6 + 2 * cpuflag(avx2), src, y, u, v, w
    movsxdifnidn wq, wd
    ; Point y/u/v past the end of their data and index them with a negative
    ; counter, so a single add both advances the offset and sets the flags
    ; for the loop branch.
    lea    yq, [yq+2*wq]
    add    uq, wq
    add    vq, wq
    neg    wq

    ; Loop-invariant constants.
    VBROADCASTI128   m3, [v210_mult]

%if cpuflag(avx2)
    VBROADCASTI128   m4, [v210_luma_shuf_avx2]
    VBROADCASTI128   m5, [v210_chroma_shuf_avx2]
    mova             m6, [v210_luma_permute]
    VBROADCASTI128   m7, [v210_chroma_shuf2]
%else
    VBROADCASTI128   m4, [v210_luma_shuf]
    VBROADCASTI128   m5, [v210_chroma_shuf]
%endif

.loop:
    ; Lane diagrams below list elements from most significant to least
    ; significant.
%ifidn %1, unaligned
    movu   m0, [srcq]  ; yB v5 yA u5 y9 v4 y8 u4 y7 v3 y6 u3 y5 v2 y4 u2 y3 v1 y2 u1 y1 v0 y0 u0
%else
    mova   m0, [srcq]
%endif

    ; Low word *64 then >>6 keeps dword bits 0..9; high word *4 then >>6 keeps
    ; dword bits 20..29 (the 16-bit multiply discards everything above).
    ; The dword shift pair <<12, >>22 isolates dword bits 10..19.
    pmullw m1, m0, m3
    pslld  m0, 12
    psrlw  m1, 6                       ; yB yA u5 v4 y8 y7 v3 u3 y5 y4 u2 v1 y2 y1 v0 u0
    psrld  m0, 22                      ; 00 v5 00 y9 00 u4 00 y6 00 v2 00 y3 00 u1 00 y0

%if cpuflag(avx2)
    ; Luma: even dwords come from m0, odd dwords from m1.
    vpblendd m2, m1, m0, 0x55          ; yB yA 00 y9 y8 y7 00 y6 y5 y4 00 y3 y2 y1 00 y0
    pshufb m2, m4                      ; 00 00 yB yA y9 y8 y7 y6 00 00 y5 y4 y3 y2 y1 y0
    vpermd m2, m6, m2                  ; 00 00 00 00 yB yA y9 y8 y7 y6 y5 y4 y3 y2 y1 y0
    movu   [yq+2*wq], m2

    ; Chroma: odd dwords come from m0, even dwords from m1.
    vpblendd m1, m1, m0, 0xaa          ; 00 v5 u5 v4 00 u4 v3 u3 00 v2 u2 v1 00 u1 v0 u0
    pshufb m1, m5                      ; 00 v5 v4 v3 00 u5 u4 u3 00 v2 v1 v0 00 u2 u1 u0
    vpermq m1, m1, 0xd8                ; 00 v5 v4 v3 00 v2 v1 v0 00 u5 u4 u3 00 u2 u1 u0
    pshufb m1, m7                      ; 00 00 v5 v4 v3 v2 v1 v0 00 00 u5 u4 u3 u2 u1 u0

    ; Low 128 bits hold u, high 128 bits hold v.
    movu   [uq+wq], xm1
    vextracti128 [vq+wq], m1, 1
%else
    shufps m2, m1, m0, 0x8d            ; 00 y9 00 y6 yB yA y8 y7 00 y3 00 y0 y5 y4 y2 y1
    pshufb m2, m4                      ; 00 00 yB yA y9 y8 y7 y6 00 00 y5 y4 y3 y2 y1 y0
    movu   [yq+2*wq], m2

    shufps m1, m0, 0xd8                ; 00 v5 00 u4 u5 v4 v3 u3 00 v2 00 u1 u2 v1 v0 u0
    pshufb m1, m5                      ; 00 v5 v4 v3 00 u5 u4 u3 00 v2 v1 v0 00 u2 u1 u0

    ; Low 64 bits hold u, high 64 bits hold v.
    movq   [uq+wq], m1
    movhps [vq+wq], m1
%endif

    add srcq, mmsize
    add wq, (mmsize*3)/8               ; counter is negative: loop until it reaches zero
    jl  .loop

    REP_RET
%endmacro

; Unaligned-source variants.
INIT_XMM ssse3
v210_planar_unpack unaligned

%if HAVE_AVX_EXTERNAL
INIT_XMM avx
v210_planar_unpack unaligned
%endif

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
v210_planar_unpack unaligned
%endif

; Aligned-source variants.
INIT_XMM ssse3
v210_planar_unpack aligned

%if HAVE_AVX_EXTERNAL
INIT_XMM avx
v210_planar_unpack aligned
%endif

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
v210_planar_unpack aligned
%endif