;*****************************************************************************
;* SIMD-optimized MPEG encoding functions
;*****************************************************************************
;* Copyright (c) 2000, 2001 Fabrice Bellard
;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;*****************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

cextern pw_1

SECTION .text
; Sum of all 8-bit pixels in a 16x16 block.
; int ff_pix_sum16_mmx(uint8_t *pix, int line_size)
; %1 = number of loops
; %2 = number of GPRs used
; %3 = lines advanced per loop iteration (0 for plain MMX, which steps one
;      line at a time with add)
%macro PIX_SUM16 3
cglobal pix_sum16, 2, %2, 6
    movsxdifnidn r1, r1d
    mov          r2, %1
%if mmsize == 16
    lea          r3, [r1*3]
%endif
%if notcpuflag(xop)
    pxor         m5, m5
%endif
    pxor         m4, m4
.loop:
%if cpuflag(xop)
    vphaddubq    m0, [r0]
    vphaddubq    m1, [r0+r1]
    vphaddubq    m2, [r0+r1*2]
    vphaddubq    m3, [r0+r3]
%else
    mova         m0, [r0]
%if mmsize == 8
    mova         m1, [r0+8]
%if cpuflag(mmxext)
    mova         m2, [r0+r1]
    mova         m3, [r0+r1+8]
%endif
%else ; sse2
    mova         m1, [r0+r1]
    mova         m2, [r0+r1*2]
    mova         m3, [r0+r3]
%endif
%if cpuflag(mmxext)
    psadbw       m0, m5
    psadbw       m1, m5
    psadbw       m2, m5
    psadbw       m3, m5
%else ; mmx
    punpckhbw    m2, m0, m5
    punpcklbw    m0, m5
    punpckhbw    m3, m1, m5
    punpcklbw    m1, m5
%endif ; cpuflag(mmxext)
%endif ; cpuflag(xop)
    paddw        m1, m0
    paddw        m3, m2
    paddw        m3, m1
    paddw        m4, m3
%if cpuflag(mmxext)
    lea          r0, [r0+r1*%3]
%else
    add          r0, r1
%endif
    dec          r2
    jne .loop
%if mmsize == 16
    pshufd       m0, m4, q0032
    paddd        m4, m0
%elif notcpuflag(mmxext)
    HADDW        m4, m5
%endif
    movd        eax, m4
    RET
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
PIX_SUM16 16, 3, 0
INIT_MMX mmxext
PIX_SUM16  8, 4, 2
%endif
INIT_XMM sse2
PIX_SUM16  4, 4, 4
%if HAVE_XOP_EXTERNAL
INIT_XMM xop
PIX_SUM16  4, 4, 4
%endif

; Sum of squares of all 8-bit pixels in a 16x16 block.
; int ff_pix_norm1_mmx(uint8_t *pix, int line_size)
; %1 = number of xmm registers used
; %2 = number of loops
%macro PIX_NORM1 2
cglobal pix_norm1, 2, 3, %1
    movsxdifnidn r1, r1d
    mov          r2, %2
    pxor         m0, m0
    pxor         m5, m5
.loop:
    mova         m2, [r0+0]
%if mmsize == 8
    mova         m3, [r0+8]
%else
    mova         m3, [r0+r1]
%endif
    punpckhbw    m1, m2, m0
    punpcklbw    m2, m0
    punpckhbw    m4, m3, m0
    punpcklbw    m3, m0
    pmaddwd      m1, m1
    pmaddwd      m2, m2
    pmaddwd      m3, m3
    pmaddwd      m4, m4
    paddd        m2, m1
    paddd        m4, m3
    paddd        m5, m2
    paddd        m5, m4
%if mmsize == 8
    add          r0, r1
%else
    lea          r0, [r0+r1*2]
%endif
    dec          r2
    jne .loop
    HADDD        m5, m1
    movd        eax, m5
    RET
%endmacro

INIT_MMX mmx
PIX_NORM1 0, 16
INIT_XMM sse2
PIX_NORM1 6, 8
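
; A rough C-level sketch of what the two kernels above compute, for reference
; only. It assumes the usual 16x16 block and ignores the per-CPU unrolling;
; the scalar fallbacks in libavcodec/mpegvideoencdsp.c are the authoritative
; versions and may differ in detail.
;
;   int pix_sum16(uint8_t *pix, int line_size)
;   {
;       int s = 0;
;       for (int i = 0; i < 16; i++) {
;           for (int j = 0; j < 16; j++)
;               s += pix[j];           // sum of all 256 pixel values
;           pix += line_size;
;       }
;       return s;
;   }
;
;   int pix_norm1(uint8_t *pix, int line_size)
;   {
;       int s = 0;
;       for (int i = 0; i < 16; i++) {
;           for (int j = 0; j < 16; j++)
;               s += pix[j] * pix[j];  // sum of squared pixel values
;           pix += line_size;
;       }
;       return s;
;   }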