;******************************************************************************
;* x86-optimized horizontal line scaling functions
;* Copyright 2020 Google LLC
;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

; Both tables below are 32 bytes long and are read with aligned 32-byte loads
; (mova), hence the 32-byte section alignment.
SECTION_RODATA 32

; swizzle: dword indices handed to vpermd to reorder the packed 16-bit results
;          right before they are stored (each dword carries two results).
;          NOTE(review): the final pixel order also depends on how the caller
;          lays out filter/filterPos, which is not visible in this file.
; four:    the value 4 in each of the 8 dword lanes. In the X4 variant it is
;          added to the per-element source offsets after every 4-tap pass, so
;          the next gather reads 4 bytes further into the source line.
swizzle: dd 0, 4, 1, 5, 2, 6, 3, 7
four: times 8 dd 4

SECTION .text

;-----------------------------------------------------------------------------
; horizontal line scaling
;
; void hscale8to15_<filterSize>_<opt>
;                   (SwsContext *c, int16_t *dst,
;                    int dstW, const uint8_t *src,
;                    const int16_t *filter,
;                    const int32_t *filterPos, int filterSize);
;
; Scale one horizontal line. Input is 8 bits wide. Filter is 14 bits. Output is
; 15 bits (in int16_t). Each output pixel is generated from $filterSize input
; pixels, the position of the first pixel is given in filterPos[nOutputPixel].
;-----------------------------------------------------------------------------

;-----------------------------------------------------------------------------
; SCALE_FUNC <variant>
;
; Emits hscale8to15_<variant> for the instruction set selected by INIT_*.
;   %1 = 4  : one pass of 4 filter taps per output value
;   %1 = X4 : (filterSize >> 2) passes of 4 taps each, accumulated in 32 bits.
;             The pass loop is bottom-tested, so at least one pass always runs.
;
; Register roles (names come from the cglobal line):
;   dst     - int16_t output line
;   w       - dstW, sign-extended to 64 bits
;   srcmem  - 8-bit source line, base address for the gathers
;   filter  - int16_t coefficients, consumed linearly, 128 bytes per pass
;   fltpos  - int32_t source offsets, 16 entries consumed per outer iteration
;   fltsize - X4 only: number of 4-tap passes (filterSize >> 2)
;   count   - index of the first output value of the current group of 16
;   inner   - X4 only: pass counter
;   m0      - all zero, used to widen bytes to words
;   m1/m2   - first / second eight filterPos entries of the current group
;   m9-m12  - X4 only: 32-bit accumulators
;   m13     - gather mask, rebuilt before every gather
;   m14     - X4 only: the constant 4 in every dword
;   m15     - vpermd index vector (swizzle)
;
; Notes:
;   * Returns immediately when dstW <= 0.
;   * Every outer iteration produces 16 results and stores a full 32-byte
;     vector, so the last iteration writes 16 values even if fewer than 16
;     remain before dstW.
;   * Each gather element loads the 4 source bytes starting at its offset.
;   * NOTE(review): the unpack/hadd/pack steps work per 128-bit lane and the
;     results are then permuted with swizzle; this presumably requires filter
;     and filterPos to be pre-arranged by the caller to match. That layout is
;     not visible in this file - confirm against the C setup code.
;-----------------------------------------------------------------------------
%macro SCALE_FUNC 1
cglobal hscale8to15_%1, 7, 9, 16, pos0, dst, w, srcmem, filter, fltpos, fltsize, count, inner
    movsxd          wq, wd                  ; dstW arrives as a 32-bit int
    test            wq, wq
    jle .end                                ; the loop below is bottom-tested: bail out before it
                                            ; touches memory when there is nothing to produce
    pxor            m0, m0
    mova            m15, [swizzle]
    xor             countd, countd          ; 32-bit write clears the whole register
%ifidn %1, X4
    mova            m14, [four]
    shr             fltsized, 2             ; taps -> number of 4-tap passes (also clears bits 63:32)
%endif
.loop:
    ; fetch the 16 source offsets for this group of outputs
    movu            m1, [fltposq]
    movu            m2, [fltposq+32]
%ifidn %1, X4
    ; fresh accumulators for this group
    pxor            m9, m9
    pxor            m10, m10
    pxor            m11, m11
    pxor            m12, m12
    xor             innerd, innerd
.innerloop:
%endif
    ; gather 4 source bytes per element; the gather clears its mask register,
    ; so the all-ones mask has to be regenerated each time
    vpcmpeqd        m13, m13
    vpgatherdd      m3, [srcmemq + m1], m13
    vpcmpeqd        m13, m13
    vpgatherdd      m4, [srcmemq + m2], m13
    ; zero-extend the gathered bytes to 16-bit words (per 128-bit lane)
    vpunpcklbw      m5, m3, m0
    vpunpckhbw      m6, m3, m0
    vpunpcklbw      m7, m4, m0
    vpunpckhbw      m8, m4, m0
    ; multiply by the coefficients; each dword = sum of two adjacent products
    vpmaddwd        m5, m5, [filterq]
    vpmaddwd        m6, m6, [filterq + 32]
    vpmaddwd        m7, m7, [filterq + 64]
    vpmaddwd        m8, m8, [filterq + 96]
    add             filterq, 0x80           ; 4 vectors of coefficients consumed
%ifidn %1, X4
    paddd           m9, m5
    paddd           m10, m6
    paddd           m11, m7
    paddd           m12, m8
    ; step every source offset 4 bytes forward for the next pass
    paddd           m1, m14
    paddd           m2, m14
    add             innerq, 1
    cmp             innerq, fltsizeq
    jl .innerloop
    ; fold adjacent dword pairs: one 32-bit sum per gathered element
    vphaddd         m5, m9, m10
    vphaddd         m6, m11, m12
%else
    ; fold adjacent dword pairs: one 32-bit sum per gathered element
    vphaddd         m5, m5, m6
    vphaddd         m6, m7, m8
%endif
    ; drop 7 fractional bits (8-bit input x 14-bit filter -> 15-bit output)
    vpsrad          m5, 7
    vpsrad          m6, 7
    vpackssdw       m5, m5, m6              ; saturate the 16 sums to int16
    vpermd          m5, m15, m5             ; reorder dwords according to swizzle
    vmovdqu         [dstq + countq * 2], m5
    add             fltposq, 0x40           ; 16 int32 offsets consumed
    add             countq, 0x10            ; 16 outputs produced
    cmp             countq, wq
    jl .loop
.end:
    RET
%endmacro

; The functions use vector registers m0-m15 and nine general-purpose
; registers, so they are only assembled for x86-64 with AVX2 support.
%if ARCH_X86_64
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
SCALE_FUNC 4
SCALE_FUNC X4
%endif
%endif