;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

; Two packed 16-bit copies of the rounding term 3 that is added before the
; final arithmetic shift right by 3 (i.e. each output is (x + 3) >> 3).
%define IWALSH_ROUND_X2 30003h

SECTION .text

;-----------------------------------------------------------------------
; void vp8_short_inv_walsh4x4_sse2(short *input, short *mb_dqcoeff)
;
; 4x4 inverse Walsh-Hadamard transform on 16-bit coefficients.
;
; In:    arg(0) = input       16 shorts (4x4, row-major). Loaded with
;                             movdqa, so it must be 16-byte aligned.
;        arg(1) = mb_dqcoeff  destination. Result k (k = 0..15) is stored
;                             as one 16-bit word at byte offset 32*k,
;                             i.e. every 16th short.
;                             NOTE(review): presumably the first (DC) slot
;                             of 16 consecutive 16-coefficient blocks --
;                             confirm against the caller.
; Out:   nothing returned; 16 words written through mb_dqcoeff.
; Clobb: rax, rcx, rdx, xmm0-xmm5, flags.
;
; Method:
;   pass 1 (vertical), for every column, with r0..r3 the four rows:
;       a1 = r0 + r3        b1 = r1 + r2
;       c1 = r1 - r2        d1 = r0 - r3
;       row0 = a1 + b1      row1 = c1 + d1
;       row2 = a1 - b1      row3 = d1 - c1
;   4x4 transpose, then the same butterfly again (pass 2, horizontal),
;   followed by (x + 3) >> 3 and the strided scatter described above.
;
; Register comments below list the high quadword first, then the low one.
;-----------------------------------------------------------------------
global sym(vp8_short_inv_walsh4x4_sse2) PRIVATE
sym(vp8_short_inv_walsh4x4_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 2              ; ABI glue from x86_abi_support.asm; arg(n) is read below
    ; end prolog

    mov         rcx, arg(0)             ; rcx = input
    mov         rdx, arg(1)             ; rdx = mb_dqcoeff
    mov         eax, IWALSH_ROUND_X2    ; only eax is consumed (movd below); 32-bit form suffices

    ;--- pass 1: vertical butterfly ---------------------------------
    movdqa      xmm0, [rcx + 0]         ;ip[4]  ip[0]   (rows 1, 0)
    movdqa      xmm1, [rcx + 16]        ;ip[12] ip[8]   (rows 3, 2)

    pshufd      xmm2, xmm1, 4eh         ;ip[8]  ip[12]  (swap quadwords)
    movdqa      xmm3, xmm0              ;ip[4]  ip[0]

    paddw       xmm0, xmm2              ;ip[4]+ip[8] ip[0]+ip[12]  aka b1 a1
    psubw       xmm3, xmm2              ;ip[4]-ip[8] ip[0]-ip[12]  aka c1 d1

    movdqa      xmm4, xmm0              ;b1 a1
    punpcklqdq  xmm0, xmm3              ;d1 a1
    punpckhqdq  xmm4, xmm3              ;c1 b1

    movdqa      xmm1, xmm4              ;c1 b1
    paddw       xmm4, xmm0              ;d1+c1 a1+b1  aka op[4]  op[0]
    psubw       xmm0, xmm1              ;d1-c1 a1-b1  aka op[12] op[8]

    ;--- 4x4 transpose of 16-bit elements (digits are row,column) ---
    ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    ; xmm4: 13 12 11 10 03 02 01 00
    ;
    ; xmm0: 33 32 31 30 23 22 21 20
    ;
    movdqa      xmm3, xmm4              ; 13 12 11 10 03 02 01 00
    punpcklwd   xmm4, xmm0              ; 23 03 22 02 21 01 20 00
    punpckhwd   xmm3, xmm0              ; 33 13 32 12 31 11 30 10
    movdqa      xmm1, xmm4              ; 23 03 22 02 21 01 20 00
    punpcklwd   xmm4, xmm3              ; 31 21 11 01 30 20 10 00  (col 1, col 0)
    punpckhwd   xmm1, xmm3              ; 33 23 13 03 32 22 12 02  (col 3, col 2)
    ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    ;--- pass 2: horizontal butterfly on the transposed data --------
    ; Each quadword now holds one column of the pass-1 result, so word
    ; lane j inside a quadword belongs to output row j.
    movd        xmm0, eax               ;low dword = 0003 0003
    pshufd      xmm2, xmm1, 4eh         ;col2 col3  (swap quadwords)
    movdqa      xmm3, xmm4              ;col1 col0

    pshufd      xmm0, xmm0, 0           ;03 03 03 03 03 03 03 03  (rounding)

    paddw       xmm4, xmm2              ;col1+col2 col0+col3  aka b1 a1
    psubw       xmm3, xmm2              ;col1-col2 col0-col3  aka c1 d1

    movdqa      xmm5, xmm4              ;b1 a1
    punpcklqdq  xmm4, xmm3              ;d1 a1
    punpckhqdq  xmm5, xmm3              ;c1 b1

    movdqa      xmm1, xmm5              ;c1 b1
    paddw       xmm5, xmm4              ;d1+c1 a1+b1  -> output columns 1, 0
    psubw       xmm4, xmm1              ;d1-c1 a1-b1  -> output columns 3, 2

    paddw       xmm5, xmm0              ;+3
    paddw       xmm4, xmm0              ;+3
    psraw       xmm5, 3                 ;>>3 (arithmetic)
    psraw       xmm4, 3                 ;>>3 (arithmetic)

    ;--- scatter: result k goes to word at [rdx + 32*k] -------------
    ; xmm5 words (low to high): k = 0 4 8 12 | 1 5 9 13
    ; xmm4 words (low to high): k = 2 6 10 14 | 3 7 11 15
    ; Two words are pulled out per movd; psrldq exposes the next pair.
    movd        eax, xmm5               ;results 4:0
    movd        ecx, xmm4               ;results 6:2
    psrldq      xmm5, 4
    psrldq      xmm4, 4
    mov         word ptr[rdx+32*0], ax
    mov         word ptr[rdx+32*2], cx
    shr         eax, 16
    shr         ecx, 16
    mov         word ptr[rdx+32*4], ax
    mov         word ptr[rdx+32*6], cx
    movd        eax, xmm5               ;results 12:8
    movd        ecx, xmm4               ;results 14:10
    psrldq      xmm5, 4
    psrldq      xmm4, 4
    mov         word ptr[rdx+32*8], ax
    mov         word ptr[rdx+32*10], cx
    shr         eax, 16
    shr         ecx, 16
    mov         word ptr[rdx+32*12], ax
    mov         word ptr[rdx+32*14], cx

    movd        eax, xmm5               ;results 5:1
    movd        ecx, xmm4               ;results 7:3
    psrldq      xmm5, 4
    psrldq      xmm4, 4
    mov         word ptr[rdx+32*1], ax
    mov         word ptr[rdx+32*3], cx
    shr         eax, 16
    shr         ecx, 16
    mov         word ptr[rdx+32*5], ax
    mov         word ptr[rdx+32*7], cx
    movd        eax, xmm5               ;results 13:9
    movd        ecx, xmm4               ;results 15:11
    mov         word ptr[rdx+32*9], ax
    mov         word ptr[rdx+32*11], cx
    shr         eax, 16
    shr         ecx, 16
    mov         word ptr[rdx+32*13], ax
    mov         word ptr[rdx+32*15], cx

    ; begin epilog
    UNSHADOW_ARGS
    pop         rbp
    ret