;******************************************************************************
;* optimized bswap buffer functions
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2003-2013 Michael Niedermayer
;* Copyright (c) 2013 Daniel Kang
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA
; pshufb control vector: reverses the byte order within each 32-bit dword
pb_bswap32: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12

cextern pb_80

SECTION .text

; %1 = aligned/unaligned
%macro BSWAP_LOOPS 1
    mov      r3d, r2d
    sar      r2d, 3
    jz       .left4_%1
%if cpuflag(avx2)
    sar      r2d, 1
    jz       .left8_%1
%endif
.loop8_%1:
    mov%1    m0, [r1 +      0]
    mov%1    m1, [r1 + mmsize]
%if cpuflag(ssse3)||cpuflag(avx2)
    pshufb   m0, m2
    pshufb   m1, m2
    mov%1    [r0 +      0], m0
    mov%1    [r0 + mmsize], m1
%else
    ; no pshufb on SSE2: swap the two 16-bit words of each dword, then
    ; swap the bytes inside each word with shifts and an or
    pshuflw  m0, m0, 10110001b
    pshuflw  m1, m1, 10110001b
    pshufhw  m0, m0, 10110001b
    pshufhw  m1, m1, 10110001b
    mova     m2, m0
    mova     m3, m1
    psllw    m0, 8
    psllw    m1, 8
    psrlw    m2, 8
    psrlw    m3, 8
    por      m2, m0
    por      m3, m1
    mov%1    [r0 +  0], m2
    mov%1    [r0 + 16], m3
%endif
    add      r0, mmsize*2
    add      r1, mmsize*2
    dec      r2d
    jnz      .loop8_%1
%if cpuflag(avx2)
.left8_%1: ; one ymm register (8 dwords) left
    mov      r2d, r3d
    test     r3d, 8
    jz       .left4_%1
    mov%1    m0, [r1]
    pshufb   m0, m2
    mov%1    [r0 +  0], m0
    add      r1, mmsize
    add      r0, mmsize
%endif
.left4_%1: ; one xmm register (4 dwords) left
    mov      r2d, r3d
    test     r3d, 4
    jz       .left
    mov%1    xm0, [r1]
%if cpuflag(ssse3)
    pshufb   xm0, xm2
    mov%1    [r0], xm0
%else
    pshuflw  m0, m0, 10110001b
    pshufhw  m0, m0, 10110001b
    mova     m2, m0
    psllw    m0, 8
    psrlw    m2, 8
    por      m2, m0
    mov%1    [r0], m2
%endif
    add      r1, 16
    add      r0, 16
%endmacro

; void ff_bswap32_buf(uint32_t *dst, const uint32_t *src, int w);
%macro BSWAP32_BUF 0
%if cpuflag(ssse3)||cpuflag(avx2)
cglobal bswap32_buf, 3,4,3
    mov      r3, r1
    VBROADCASTI128 m2, [pb_bswap32]
%else
cglobal bswap32_buf, 3,4,5
    mov      r3, r1
%endif
    or       r3, r0
    test     r3, mmsize - 1
    jz       .start_align
    BSWAP_LOOPS  u
    jmp      .left
.start_align:
    BSWAP_LOOPS  a
.left:
%if cpuflag(ssse3)
    test     r2d, 2
    jz       .left1
    movq     xm0, [r1]
    pshufb   xm0, xm2
    movq     [r0], xm0
    add      r1, 8
    add      r0, 8
.left1:
    test     r2d, 1
    jz       .end
    mov      r2d, [r1]
    bswap    r2d
    mov      [r0], r2d
%else
    ; scalar tail: up to 3 dwords remain
    and      r2d, 3
    jz       .end
.loop2:
    mov      r3d, [r1]
    bswap    r3d
    mov      [r0], r3d
    add      r1, 4
    add      r0, 4
    dec      r2d
    jnz      .loop2
%endif
.end:
    RET
%endmacro

INIT_XMM sse2
BSWAP32_BUF

INIT_XMM ssse3
BSWAP32_BUF

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
BSWAP32_BUF
%endif
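
; For reference, every variant above computes the same thing as this
; minimal C sketch (the function name is illustrative and not part of the
; build; av_bswap32 is libavutil's byte-swap helper from bswap.h):
;
;     #include "libavutil/bswap.h"
;
;     static void bswap32_buf_c(uint32_t *dst, const uint32_t *src, int w)
;     {
;         int i;
;         for (i = 0; i < w; i++)
;             dst[i] = av_bswap32(src[i]); // reverse the 4 bytes of each dword
;     }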