;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

; /****************************************************************************
; * Notes:
; *
; * Syntax: NASM-style (Intel operand order).  sym(), arg(), GLOBAL(),
; * SHADOW_ARGS_TO_STACK, UNSHADOW_ARGS, GET_GOT, RESTORE_GOT, PRIVATE and
; * SECTION_RODATA are not defined in this file; presumably they are supplied
; * by the %include above.
; *
; * This implementation makes use of 16 bit fixed point versions of two
; * multiply constants:
; *        1.   sqrt(2) * cos (pi/8)
; *        2.   sqrt(2) * sin (pi/8)
; * Because the first constant is bigger than 1, to maintain the same 16 bit
; * fixed point precision as the second one, we use a trick of
; *        x * a = x + x*(a-1)
; * so
; *        x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) * cos(pi/8) - 1).
; *
; * For the second constant, the 16 bit version is 35468, which is bigger
; * than 32768, so in a signed 16 bit multiply it is treated as a negative
; * number (35468 - 65536).  The product is corrected by adding x back:
; *        (x * (unsigned)35468) >> 16 = ((x * (signed)35468) >> 16) + x
; *
; * This is why every pmulhw below is immediately followed by a paddw of the
; * unmultiplied operand, for both constants.
; *
; **************************************************************************/

SECTION .text

;-----------------------------------------------------------------------------
;void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred,
;int pitch, unsigned char *dest,int stride)
;
; 4x4 inverse transform + prediction add.
;   - Reads 16 shorts from input (four 8-byte rows at offsets 0/8/16/24).
;   - Pass 1: butterfly across the four row registers, then 4x4 transpose.
;   - Pass 2: same butterfly on the transposed data, with +4 rounding and an
;     arithmetic shift right by 3, then a second 4x4 transpose.
;   - Each result row is added (signed saturating) to 4 prediction bytes read
;     from pred + n*pitch, packed with unsigned saturation to bytes, and the
;     4 bytes are stored at dest + n*stride, n = 0..3.
;   - input is not written: the block that would zero it is under %if 0.
;
; Registers: rax = input, later pitch; rsi = pred; rdx = dest; rdi = stride.
;            rsi and rdi are saved/restored here.  mm0-mm7 are all used.
; No emms is executed in this function.
; NOTE(review): callers presumably have to clear MMX state themselves before
;               any x87 code runs -- confirm against the call sites.
;-----------------------------------------------------------------------------
global sym(vp8_short_idct4x4llm_mmx) PRIVATE
sym(vp8_short_idct4x4llm_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rax, arg(0)                     ; input
    mov         rsi, arg(1)                     ; pred

    movq        mm0, [rax   ]                   ; row 0: input[ 0.. 3]
    movq        mm1, [rax+ 8]                   ; row 1: input[ 4.. 7]
    movq        mm2, [rax+16]                   ; row 2: input[ 8..11]
    movq        mm3, [rax+24]                   ; row 3: input[12..15]

; Disabled: would clear the 16 input coefficients after loading them.
%if 0
    pxor        mm7, mm7
    movq        [rax], mm7
    movq        [rax+8], mm7
    movq        [rax+16],mm7
    movq        [rax+24],mm7
%endif
    movsxd      rax, dword ptr arg(2)           ; pitch  (rax no longer = input)
    mov         rdx, arg(3)                     ; dest
    movsxd      rdi, dword ptr arg(4)           ; stride


    ; ---- pass 1: butterfly over rows 0..3 (4 columns in parallel) ----------
    psubw       mm0, mm2                        ; b1 = 0-2
    paddw       mm2, mm2                        ; 2 * row2

    movq        mm5, mm1
    paddw       mm2, mm0                        ; a1 = 0+2  ((0-2) + 2*2)

    pmulhw      mm5, [GLOBAL(x_s1sqr2)]
    paddw       mm5, mm1                        ; ip1 * sin(pi/8) * sqrt(2)

    movq        mm7, mm3
    pmulhw      mm7, [GLOBAL(x_c1sqr2less1)]

    paddw       mm7, mm3                        ; ip3 * cos(pi/8) * sqrt(2)
    psubw       mm7, mm5                        ; mm7 = ip3*cos - ip1*sin
                                                ; (negated form of ip1*sin - ip3*cos)

    movq        mm5, mm1
    movq        mm4, mm3

    pmulhw      mm5, [GLOBAL(x_c1sqr2less1)]
    paddw       mm5, mm1                        ; ip1 * cos(pi/8) * sqrt(2)

    pmulhw      mm3, [GLOBAL(x_s1sqr2)]
    paddw       mm3, mm4                        ; ip3 * sin(pi/8) * sqrt(2)

    paddw       mm3, mm5                        ; d1
    movq        mm6, mm2                        ; a1

    movq        mm4, mm0                        ; b1
    paddw       mm2, mm3                        ; row 0 = a1 + d1

    paddw       mm4, mm7                        ; row 2 = b1 + mm7
    psubw       mm0, mm7                        ; row 1 = b1 - mm7

    psubw       mm6, mm3                        ; row 3 = a1 - d1

    ; ---- transpose 4x4 words: rows are mm2, mm0, mm4, mm6 ------------------
    ; Comment notation "rc": r = source row, c = element; highest word first.
    movq        mm1, mm2                        ; 03 02 01 00
    movq        mm3, mm4                        ; 23 22 21 20

    punpcklwd   mm1, mm0                        ; 11 01 10 00
    punpckhwd   mm2, mm0                        ; 13 03 12 02

    punpcklwd   mm3, mm6                        ; 31 21 30 20
    punpckhwd   mm4, mm6                        ; 33 23 32 22

    movq        mm0, mm1                        ; 11 01 10 00
    movq        mm5, mm2                        ; 13 03 12 02

    punpckldq   mm0, mm3                        ; 30 20 10 00
    punpckhdq   mm1, mm3                        ; 31 21 11 01

    punpckldq   mm2, mm4                        ; 32 22 12 02
    punpckhdq   mm5, mm4                        ; 33 23 13 03

    movq        mm3, mm5                        ; 33 23 13 03

    ; ---- pass 2: same butterfly on transposed data (mm0, mm1, mm2, mm3) ----
    psubw       mm0, mm2                        ; b1 = 0-2
    paddw       mm2, mm2                        ; 2 * ip2

    movq        mm5, mm1
    paddw       mm2, mm0                        ; a1 = 0+2  ((0-2) + 2*2)

    pmulhw      mm5, [GLOBAL(x_s1sqr2)]
    paddw       mm5, mm1                        ; ip1 * sin(pi/8) * sqrt(2)

    movq        mm7, mm3
    pmulhw      mm7, [GLOBAL(x_c1sqr2less1)]

    paddw       mm7, mm3                        ; ip3 * cos(pi/8) * sqrt(2)
    psubw       mm7, mm5                        ; mm7 = ip3*cos - ip1*sin
                                                ; (negated form of ip1*sin - ip3*cos)

    movq        mm5, mm1
    movq        mm4, mm3

    pmulhw      mm5, [GLOBAL(x_c1sqr2less1)]
    paddw       mm5, mm1                        ; ip1 * cos(pi/8) * sqrt(2)

    pmulhw      mm3, [GLOBAL(x_s1sqr2)]
    paddw       mm3, mm4                        ; ip3 * sin(pi/8) * sqrt(2)

    paddw       mm3, mm5                        ; d1
    paddw       mm0, [GLOBAL(fours)]            ; b1 + 4 (rounding for >> 3)

    paddw       mm2, [GLOBAL(fours)]            ; a1 + 4 (rounding for >> 3)
    movq        mm6, mm2                        ; a1

    movq        mm4, mm0                        ; b1
    paddw       mm2, mm3                        ; col 0 = a1 + d1

    paddw       mm4, mm7                        ; col 2 = b1 + mm7
    psubw       mm0, mm7                        ; col 1 = b1 - mm7

    psubw       mm6, mm3                        ; col 3 = a1 - d1
    psraw       mm2, 3

    psraw       mm0, 3
    psraw       mm4, 3

    psraw       mm6, 3

    ; ---- transpose back: sources are mm2, mm0, mm4, mm6 --------------------
    movq        mm1, mm2                        ; 03 02 01 00
    movq        mm3, mm4                        ; 23 22 21 20

    punpcklwd   mm1, mm0                        ; 11 01 10 00
    punpckhwd   mm2, mm0                        ; 13 03 12 02

    punpcklwd   mm3, mm6                        ; 31 21 30 20
    punpckhwd   mm4, mm6                        ; 33 23 32 22

    movq        mm0, mm1                        ; 11 01 10 00
    movq        mm5, mm2                        ; 13 03 12 02

    punpckldq   mm0, mm3                        ; 30 20 10 00
    punpckhdq   mm1, mm3                        ; 31 21 11 01

    punpckldq   mm2, mm4                        ; 32 22 12 02
    punpckhdq   mm5, mm4                        ; 33 23 13 03

    ; ---- add prediction, clamp to bytes, store: rows in mm0,mm1,mm2,mm5 ----
    pxor        mm7, mm7                        ; zero, for byte<->word (un)packing

    movd        mm4, [rsi]                      ; pred row 0 (4 bytes)
    punpcklbw   mm4, mm7                        ; bytes -> words
    paddsw      mm0, mm4
    packuswb    mm0, mm7                        ; words -> bytes, unsigned saturation
    movd        [rdx], mm0                      ; dest row 0

    movd        mm4, [rsi+rax]                  ; pred row 1
    punpcklbw   mm4, mm7
    paddsw      mm1, mm4
    packuswb    mm1, mm7
    movd        [rdx+rdi], mm1                  ; dest row 1

    movd        mm4, [rsi+2*rax]                ; pred row 2
    punpcklbw   mm4, mm7
    paddsw      mm2, mm4
    packuswb    mm2, mm7
    movd        [rdx+rdi*2], mm2                ; dest row 2

    add         rdx, rdi                        ; dest += stride, so base+2*stride
    add         rsi, rax                        ; pred += pitch,  addresses row 3

    movd        mm4, [rsi+2*rax]                ; pred row 3
    punpcklbw   mm4, mm7
    paddsw      mm5, mm4
    packuswb    mm5, mm7
    movd        [rdx+rdi*2], mm5                ; dest row 3

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------------
;void vp8_dc_only_idct_add_mmx(
;short input_dc,
;unsigned char *pred_ptr,
;int pred_stride,
;unsigned char *dst_ptr,
;int stride)
;
; Adds the single value (input_dc + 4) >> 3 to a 4x4 block of prediction
; bytes and stores the result.
;   - Only the low 16 bits loaded from arg(0) reach the result: that word is
;     broadcast to all four word lanes of mm5.
;   - Each of the 4 rows reads 4 bytes at pred_ptr + n*pred_stride, adds mm5
;     with signed saturation, packs with unsigned saturation to bytes, and
;     writes 4 bytes at dst_ptr + n*stride, n = 0..3.
;   - All four prediction rows are loaded before any destination row is
;     written.
;
; Registers: rax = pred_ptr, later dst_ptr; rdx = pred_stride, later stride;
;            rcx = 3 * (current stride).  mm0-mm5 are used.
; No emms is executed in this function.
; NOTE(review): callers presumably have to clear MMX state themselves before
;               any x87 code runs -- confirm against the call sites.
;-----------------------------------------------------------------------------
global sym(vp8_dc_only_idct_add_mmx) PRIVATE
sym(vp8_dc_only_idct_add_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    ; end prolog

    movd        mm5, arg(0)                     ; input_dc (low word is used)
    mov         rax, arg(1)                     ; pred_ptr
    movsxd      rdx, dword ptr arg(2)           ; pred_stride

    pxor        mm0, mm0                        ; zero, for byte<->word (un)packing

    paddw       mm5, [GLOBAL(fours)]            ; dc + 4 (rounding for >> 3)
    lea         rcx, [rdx + rdx*2]              ; rcx = 3 * pred_stride

    psraw       mm5, 3                          ; (dc + 4) >> 3

    punpcklwd   mm5, mm5                        ; broadcast low word ...

    punpckldq   mm5, mm5                        ; ... to all four word lanes

    movd        mm1, [rax]                      ; pred row 0
    movd        mm2, [rax+rdx]                  ; pred row 1
    movd        mm3, [rax+2*rdx]                ; pred row 2
    movd        mm4, [rax+rcx]                  ; pred row 3

    mov         rax, arg(3)                     ; dst_ptr
    movsxd      rdx, dword ptr arg(4)           ; stride (destination)

    punpcklbw   mm1, mm0                        ; bytes -> words
    paddsw      mm1, mm5
    packuswb    mm1, mm0                        ; words -> bytes, clamps to 0..255
    lea         rcx, [rdx + rdx*2]              ; rcx = 3 * stride

    punpcklbw   mm2, mm0
    paddsw      mm2, mm5
    packuswb    mm2, mm0                        ; words -> bytes, clamps to 0..255

    punpcklbw   mm3, mm0
    paddsw      mm3, mm5
    packuswb    mm3, mm0                        ; words -> bytes, clamps to 0..255

    punpcklbw   mm4, mm0
    paddsw      mm4, mm5
    packuswb    mm4, mm0                        ; words -> bytes, clamps to 0..255

    movd        [rax], mm1                      ; dst row 0
    movd        [rax+rdx], mm2                  ; dst row 1
    movd        [rax+2*rdx], mm3                ; dst row 2
    movd        [rax+rcx], mm4                  ; dst row 3

    ; begin epilog
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

SECTION_RODATA
; Each constant is replicated across the four 16-bit lanes of an MMX register.
align 16
x_s1sqr2:                                       ; 0x8A8C = 35468: sqrt(2)*sin(pi/8) in
    times 4 dw 0x8A8C                           ; 16 bit fixed point (see Notes)
align 16
x_c1sqr2less1:                                  ; 0x4E7B = 20091: sqrt(2)*cos(pi/8) - 1 in
    times 4 dw 0x4E7B                           ; 16 bit fixed point (see Notes)
align 16
fours:                                          ; rounding term added before psraw ..., 3
    times 4 dw 0x0004