;*****************************************************************************
;* SIMD-optimized pixel operations
;*****************************************************************************
;* Copyright (c) 2000, 2001 Fabrice Bellard
;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;*****************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

INIT_MMX mmx
; void ff_get_pixels_mmx(int16_t *block, const uint8_t *pixels, ptrdiff_t stride)
cglobal get_pixels, 3,4
    add          r0, 128       ; r0 points past the 64-entry int16_t block
    mov          r3, -128      ; negative offset counts up toward zero
    pxor         m7, m7        ; zero register for byte -> word unpacking
.loop:
    ; widen two 8-pixel rows from uint8_t to int16_t per iteration
    mova         m0, [r1]
    mova         m2, [r1+r2]
    mova         m1, m0
    mova         m3, m2
    punpcklbw    m0, m7
    punpckhbw    m1, m7
    punpcklbw    m2, m7
    punpckhbw    m3, m7
    mova [r0+r3+ 0], m0
    mova [r0+r3+ 8], m1
    mova [r0+r3+16], m2
    mova [r0+r3+24], m3
    lea          r1, [r1+r2*2]
    add          r3, 32
    js .loop
    REP_RET

INIT_XMM sse2
; void ff_get_pixels_sse2(int16_t *block, const uint8_t *pixels, ptrdiff_t stride)
cglobal get_pixels, 3, 4, 5
    lea          r3, [r2*3]    ; r3 = 3 * stride
    pxor         m4, m4        ; zero register for byte -> word unpacking
    ; first four rows
    movh         m0, [r1]
    movh         m1, [r1+r2]
    movh         m2, [r1+r2*2]
    movh         m3, [r1+r3]
    lea          r1, [r1+r2*4]
    punpcklbw    m0, m4
    punpcklbw    m1, m4
    punpcklbw    m2, m4
    punpcklbw    m3, m4
    mova       [r0], m0
    mova  [r0+0x10], m1
    mova  [r0+0x20], m2
    mova  [r0+0x30], m3
    ; remaining four rows
    movh         m0, [r1]
    movh         m1, [r1+r2]
    movh         m2, [r1+r2*2]
    movh         m3, [r1+r3]
    punpcklbw    m0, m4
    punpcklbw    m1, m4
    punpcklbw    m2, m4
    punpcklbw    m3, m4
    mova  [r0+0x40], m0
    mova  [r0+0x50], m1
    mova  [r0+0x60], m2
    mova  [r0+0x70], m3
    RET

; void ff_diff_pixels_mmx(int16_t *block, const uint8_t *s1, const uint8_t *s2,
;                         ptrdiff_t stride);
; The macro below instantiates both the MMX version (one row per iteration)
; and ff_diff_pixels_sse2 (two rows per iteration) with the same signature.
%macro DIFF_PIXELS 0
cglobal diff_pixels, 4,5,5
    pxor         m4, m4        ; zero register for byte -> word unpacking
    add          r0, 128
    mov          r4, -128
.loop:
    movq         m0, [r1]
    movq         m2, [r2]
%if mmsize == 8
    ; MMX: split one row into low/high halves before widening
    movq         m1, m0
    movq         m3, m2
    punpcklbw    m0, m4
    punpckhbw    m1, m4
    punpcklbw    m2, m4
    punpckhbw    m3, m4
%else
    ; SSE2: load a second row instead; each register holds one full row
    movq         m1, [r1+r3]
    movq         m3, [r2+r3]
    punpcklbw    m0, m4
    punpcklbw    m1, m4
    punpcklbw    m2, m4
    punpcklbw    m3, m4
%endif
    psubw        m0, m2        ; block = s1 - s2, widened to int16_t
    psubw        m1, m3
    mova [r0+r4+0],      m0
    mova [r0+r4+mmsize], m1
%if mmsize == 8
    add          r1, r3
    add          r2, r3
%else
    lea          r1, [r1+r3*2]
    lea          r2, [r2+r3*2]
%endif
    add          r4, 2 * mmsize
    jne .loop
    RET
%endmacro

INIT_MMX mmx
DIFF_PIXELS

INIT_XMM sse2
DIFF_PIXELS
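
;*****************************************************************************
;* Reference sketches
;*****************************************************************************
; A minimal C sketch of what get_pixels computes, assuming only the behavior
; visible in the SIMD code above: an 8x8 block of unsigned bytes is widened
; into a contiguous 64-entry int16_t block. get_pixels_ref is a hypothetical
; name for illustration, not an FFmpeg function:
;
;   #include <stddef.h>
;   #include <stdint.h>
;
;   /* hypothetical reference, not FFmpeg's actual C implementation */
;   static void get_pixels_ref(int16_t *block, const uint8_t *pixels,
;                              ptrdiff_t stride)
;   {
;       for (int i = 0; i < 8; i++) {
;           for (int j = 0; j < 8; j++)
;               block[8 * i + j] = pixels[j]; /* zero-extend byte to word */
;           pixels += stride;                 /* advance one input row */
;       }
;   }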
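;
; Likewise, a minimal C sketch of diff_pixels: each output word is the signed
; per-pixel difference of the two 8x8 source blocks, which is why the result
; must be widened to int16_t (values range from -255 to 255). diff_pixels_ref
; is again a hypothetical name for illustration:
;
;   /* hypothetical reference, not FFmpeg's actual C implementation */
;   static void diff_pixels_ref(int16_t *block, const uint8_t *s1,
;                               const uint8_t *s2, ptrdiff_t stride)
;   {
;       for (int i = 0; i < 8; i++) {
;           for (int j = 0; j < 8; j++)
;               block[8 * i + j] = s1[j] - s2[j];
;           s1 += stride;
;           s2 += stride;
;       }
;   }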