;******************************************************************************
;* x86-optimized yuv2yuvX
;* Copyright 2020 Google LLC
;* Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

;-----------------------------------------------------------------------------
; yuv2yuvX
;
; void ff_yuv2yuvX_<opt>(const int16_t *filter, int filterSize,
;                        int srcOffset, uint8_t *dest, int dstW,
;                        const uint8_t *dither, int offset);
;
;-----------------------------------------------------------------------------

%macro YUV2YUVX_FUNC 0
cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src, dest, dstW, dither, offset
%if notcpuflag(sse3)
%define movr mova
%define unroll 1
%else
%define movr movdqu
%define unroll 2
%endif
    movsxdifnidn         dstWq, dstWd
    movsxdifnidn         offsetq, offsetd
    movsxdifnidn         srcq, srcd
%if cpuflag(avx2)
    vpbroadcastq         m3, [ditherq]
%else
    movq                 xm3, [ditherq]
%endif ; avx2
    cmp                  offsetd, 0
    jz                   .offset

    ; offset != 0 path.
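    ; Rotate the 64-bit dither pattern right by 3 bytes (0x18 bits); this
    ; rephases the dither for the non-zero offset case.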
    psrlq                m5, m3, $18
    psllq                m3, m3, $28
    por                  m3, m3, m5

.offset:
    add                  offsetq, srcq
    movd                 xm1, filterSized
    SPLATW               m1, xm1, 0
    pxor                 m0, m0, m0
    mov                  filterSizeq, filterq
    mov                  srcq, [filterSizeq]
    punpcklbw            m3, m0
    psllw                m1, m1, 3
    paddw                m3, m3, m1
    psraw                m7, m3, 4
.outerloop:
    mova                 m4, m7
    mova                 m3, m7
%if cpuflag(sse3)
    mova                 m6, m7
    mova                 m1, m7
%endif
.loop:
%if cpuflag(avx2)
    vpbroadcastq         m0, [filterSizeq + 8]
%elif cpuflag(sse3)
    movddup              m0, [filterSizeq + 8]
%else
    mova                 m0, [filterSizeq + 8]
%endif
    pmulhw               m2, m0, [srcq + offsetq * 2]
    pmulhw               m5, m0, [srcq + offsetq * 2 + mmsize]
    paddw                m3, m3, m2
    paddw                m4, m4, m5
%if cpuflag(sse3)
    pmulhw               m2, m0, [srcq + offsetq * 2 + 2 * mmsize]
    pmulhw               m5, m0, [srcq + offsetq * 2 + 3 * mmsize]
    paddw                m6, m6, m2
    paddw                m1, m1, m5
%endif
    add                  filterSizeq, $10
    mov                  srcq, [filterSizeq]
    test                 srcq, srcq
    jnz                  .loop
    psraw                m3, m3, 3
    psraw                m4, m4, 3
%if cpuflag(sse3)
    psraw                m6, m6, 3
    psraw                m1, m1, 3
%endif
    packuswb             m3, m3, m4
%if cpuflag(sse3)
    packuswb             m6, m6, m1
%endif
    mov                  srcq, [filterq]
%if cpuflag(avx2)
    vpermq               m3, m3, 216
    vpermq               m6, m6, 216
%endif
    movr                 [destq + offsetq], m3
%if cpuflag(sse3)
    movr                 [destq + offsetq + mmsize], m6
%endif
    add                  offsetq, mmsize * unroll
    mov                  filterSizeq, filterq
    cmp                  offsetq, dstWq
    jb                   .outerloop
    REP_RET
%endmacro

INIT_MMX mmx
YUV2YUVX_FUNC
INIT_MMX mmxext
YUV2YUVX_FUNC
INIT_XMM sse3
YUV2YUVX_FUNC
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
YUV2YUVX_FUNC
%endif
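
;-----------------------------------------------------------------------------
; Rough scalar model of what each instantiation above computes. This is a
; hedged sketch for readers, not the authoritative C fallback: the SIMD path
; uses pmulhw (products truncated to the high 16 bits) plus a
; filterSize-dependent bias, so intermediate rounding differs slightly.
; Note that the filter data walked by the inner loop is a list of 16-byte
; entries (source-plane pointer at +0, a qword holding the replicated 16-bit
; coefficient at +8, terminated by a NULL pointer), not the plain coefficient
; array the prototype comment might suggest. In the assembly, offset is
; folded into the store address and the dither rotation rather than being
; applied per pixel.
;
;   for (int i = 0; i < dstW; i++) {
;       int val = dither[(i + offset) & 7] << 12;
;       for (int j = 0; j < filterSize; j++)
;           val += src[j][i] * filter[j];
;       dest[i] = av_clip_uint8(val >> 19);   // av_clip_uint8() from libavutil
;   }
;-----------------------------------------------------------------------------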