;******************************************************************************
;* X86 Optimized functions for Open Exr Decoder
;* Copyright (c) 2006 Industrial Light & Magic, a division of Lucas Digital Ltd. LLC
;*
;* reorder_pixels, predictor based on patch by John Loy
;* port to ASM by Jokyo Images support by CNC - French National Center for Cinema
;*
;* predictor AVX/AVX2 by Henrik Gramner
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

; NASM-syntax x86 SIMD, written against the x86inc/x86util abstraction layer:
; the same macro body is instantiated for xmm (SSE2/SSSE3/AVX) and, where
; guarded by cpuflag/mmsize checks, for ymm (AVX2).

%include "libavutil/x86/x86util.asm"

; Byte-broadcast constants provided by libavutil:
cextern pb_15                        ; every byte 0x0f: pshufb mask that splats byte 15
cextern pb_80                        ; every byte 0x80: sign-bit bias (x ^ 0x80 == x +/- 128 mod 256)

SECTION .text

;------------------------------------------------------------------------------
; void ff_reorder_pixels(uint8_t *dst, const uint8_t *src, ptrdiff_t size);
;
; Interleave the two halves of src into dst:
;   dst[2*i]   = src[i]               (first half  -> even bytes)
;   dst[2*i+1] = src[size/2 + i]      (second half -> odd bytes)
;
; NOTE(review): the loop consumes mmsize bytes of each half per iteration and
; the mova loads/stores require alignment, so size is presumably padded to a
; multiple of 2*mmsize and the buffers aligned by the caller -- confirm
; against the exrdsp init/caller code.
;------------------------------------------------------------------------------

%macro REORDER_PIXELS 0
cglobal reorder_pixels, 3,4,3, dst, src1, size, src2
    lea    src2q, [src1q+sizeq]      ; src2 = src + 2 * half_size
    add     dstq, sizeq              ; dst offset by size
    shr    sizeq, 1                  ; half_size
    add    src1q, sizeq              ; offset src by half_size
    neg    sizeq                     ; size = offset for dst, src1, src2
                                     ; (negative offset counts up to 0, so the
                                     ; loop needs no separate counter register)
.loop:
    mova      m0, [src1q+sizeq]      ; load first part
    movu      m1, [src2q+sizeq]      ; load second part (unaligned load: src +
                                     ; half_size need not be mmsize-aligned)
    SBUTTERFLY bw, 0, 1, 2           ; byte-interleave: m0 = low unpack,
                                     ; m1 = high unpack, m2 is scratch
    mova [dstq+2*sizeq ], xm0        ; copy to dst
    mova [dstq+2*sizeq+16], xm1
%if cpuflag(avx2)
    ; On ymm the punpck in SBUTTERFLY works per 128-bit lane, so the upper
    ; lanes of m0/m1 hold the interleave of source bytes 16..31; stitch those
    ; two upper lanes together and store them as the third 16-byte chunk pair.
    vperm2i128 m0, m0, m1, q0301
    mova [dstq+2*sizeq+32], m0
%endif
    add    sizeq, mmsize
    jl .loop
    RET
%endmacro

INIT_XMM sse2
REORDER_PIXELS

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
REORDER_PIXELS
%endif


;------------------------------------------------------------------------------
; void ff_predictor(uint8_t *src, ptrdiff_t size);
;
; In-place delta decoding: each output byte is the previous output byte plus
; the current input byte biased by 128 (src[i] += src[i-1] - 128 in C terms,
; with all arithmetic mod 256). Implemented as a byte-wise prefix sum:
; x ^ 0x80 equals x + 128 (== x - 128) modulo 256, so the bias is applied
; with a single pxor and the running sum does the rest.
;------------------------------------------------------------------------------

%macro PREDICTOR 0
cglobal predictor, 2,2,5, src, size
    mova     m0, [pb_80]             ; bias constant, every byte 0x80
    mova    xm1, [pb_15]             ; pshufb mask: broadcast byte 15
    mova    xm2, xm0                 ; running sum, seeded with 0x80 so that
                                     ; out[0] = 0x80 + (in[0] ^ 0x80) = in[0]
    add    srcq, sizeq
    neg   sizeq                      ; negative offset counting up to 0
.loop:
    pxor     m3, m0, [srcq + sizeq]  ; m3 = in ^ 0x80  (== in +/- 128 mod 256)
    ; Byte-wise inclusive prefix sum inside each 16-byte lane, computed in
    ; log2(16) = 4 shift-and-add steps (the 4th paddb is issued below, where
    ; each branch folds it in differently).
    pslldq   m4, m3, 1
    paddb    m3, m4
    pslldq   m4, m3, 2
    paddb    m3, m4
    pslldq   m4, m3, 4
    paddb    m3, m4
    pslldq   m4, m3, 8
%if mmsize == 32
    ; ymm pslldq shifts each 128-bit lane independently, so m3 now holds two
    ; separate 16-byte prefix sums; propagate the carry through both lanes.
    paddb    m3, m4                  ; finish the per-lane prefix sums
    paddb   xm2, xm3                 ; add carry-in -> out[0..15]
    vextracti128 xm4, m3, 1          ; upper-lane prefix sums (bytes 16..31)
    mova [srcq + sizeq], xm2
    pshufb  xm2, xm1                 ; broadcast out[15]: carry into upper half
    paddb   xm2, xm4                 ; -> out[16..31]
    mova [srcq + sizeq + 16], xm2
%else
    ; Fold the final prefix-sum step and the carry-in into the accumulator:
    ; m2 += m3 (partial sums) and m2 += m4 (the 8-byte shift term).
    paddb    m2, m3
    paddb    m2, m4
    mova [srcq + sizeq], m2
%endif
    pshufb  xm2, xm1                 ; broadcast last output byte: carry for
                                     ; the next iteration
    add   sizeq, mmsize
    jl .loop
    RET
%endmacro

; predictor needs pshufb, hence SSSE3 as the baseline instead of SSE2.
INIT_XMM ssse3
PREDICTOR

INIT_XMM avx
PREDICTOR

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
PREDICTOR
%endif