;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

; Register usage notes for this file:
;   * The SSE2 routines touch only xmm0-xmm5, so no XMM register that is
;     callee-saved under the Microsoft x64 convention (xmm6-xmm15) is
;     modified and no save/restore of XMM state is needed.
;   * The MMX routines do not execute emms; MMX state has to be cleared
;     elsewhere before any x87 code runs.
;   * Every routine computes a sum of squared 16-bit differences with
;     psubw / pmaddwd / paddd, i.e. differences wrap at 16 bits and the
;     running sums wrap at 32 bits.

;int vp8_block_error_xmm(short *coeff_ptr, short *dcoef_ptr)
;
; Returns the sum of (coeff_ptr[i] - dcoef_ptr[i])^2 for i = 0..15.
; Both pointers are read with movdqa and therefore must be 16-byte aligned.
global sym(vp8_block_error_xmm)
sym(vp8_block_error_xmm):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 2
    push rsi
    push rdi
    ; end prolog

        mov         rsi,        arg(0) ;coeff_ptr
        mov         rdi,        arg(1) ;dcoef_ptr

        movdqa      xmm0,       [rsi]               ; coeff[0..7]
        movdqa      xmm1,       [rdi]               ; dcoef[0..7]

        movdqa      xmm2,       [rsi+16]            ; coeff[8..15]
        movdqa      xmm3,       [rdi+16]            ; dcoef[8..15]

        psubw       xmm0,       xmm1
        psubw       xmm2,       xmm3

        pmaddwd     xmm0,       xmm0                ; squares, summed in pairs
        pmaddwd     xmm2,       xmm2

        paddd       xmm0,       xmm2                ; four 32-bit partial sums

        ; horizontal add of the four dwords; the zero interleave leaves the
        ; upper half of the low quadword clear before it is moved to rax
        pxor        xmm5,       xmm5
        movdqa      xmm1,       xmm0

        punpckldq   xmm0,       xmm5
        punpckhdq   xmm1,       xmm5

        paddd       xmm0,       xmm1
        movdqa      xmm1,       xmm0

        psrldq      xmm0,       8
        paddd       xmm0,       xmm1

        movq        rax,        xmm0

    pop rdi
    pop rsi
    ; begin epilog
    UNSHADOW_ARGS
    pop         rbp
    ret


;int vp8_block_error_mmx(short *coeff_ptr, short *dcoef_ptr)
;
; Returns the sum of (coeff_ptr[i] - dcoef_ptr[i])^2 for i = 0..15.
; Only the low 32 bits of rax carry the result; the upper half of the
; quadword moved out of mm0 still holds a partial sum.
global sym(vp8_block_error_mmx)
sym(vp8_block_error_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 2
    push rsi
    push rdi
    ; end prolog

        mov         rsi,        arg(0) ;coeff_ptr
        mov         rdi,        arg(1) ;dcoef_ptr

        ; coefficients 0..7
        movq        mm0,        [rsi]
        movq        mm1,        [rdi]

        movq        mm2,        [rsi+8]
        movq        mm3,        [rdi+8]

        psubw       mm0,        mm1
        psubw       mm2,        mm3

        pmaddwd     mm0,        mm0
        pmaddwd     mm2,        mm2

        paddd       mm0,        mm2                 ; mm0 = running sum

        ; coefficients 8..15
        movq        mm4,        [rsi+16]
        movq        mm5,        [rdi+16]

        movq        mm6,        [rsi+24]
        movq        mm7,        [rdi+24]

        psubw       mm4,        mm5
        psubw       mm6,        mm7

        pmaddwd     mm4,        mm4
        pmaddwd     mm6,        mm6

        paddd       mm4,        mm6
        paddd       mm0,        mm4

        ; fold the high dword into the low dword
        movq        mm1,        mm0
        psrlq       mm1,        32

        paddd       mm0,        mm1
        movq        rax,        mm0

    pop rdi
    pop rsi
    ; begin epilog
    UNSHADOW_ARGS
    pop         rbp
    ret


;int vp8_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
;
; Accumulates squared differences over 16 groups of 16 coefficients
; (256 shorts, 32 bytes per loop iteration).
;
; dc builds a per-word mask: a mask word is 0xFFFF where the matching
; 16-bit word of the zero-extended dc value is zero, and 0 otherwise.  The
; mask is applied to the first four differences of every group, so a dc
; whose low 16 bits are non-zero drops coefficient 0 of each group from
; the sum, while dc == 0 keeps everything.
; NOTE(review): callers presumably pass only 0 or 1 for dc - confirm.
;
; Only the low 32 bits of rax carry the result.
global sym(vp8_mbblock_error_mmx_impl)
sym(vp8_mbblock_error_mmx_impl):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 3
    push rsi
    push rdi
    ; end prolog

        mov         rsi,        arg(0) ;coeff_ptr
        mov         rdi,        arg(1) ;dcoef_ptr

        pxor        mm7,        mm7                 ; zero
        pxor        mm2,        mm2                 ; accumulator

        movd        mm1,        dword ptr arg(2)    ;dc
        pcmpeqw     mm1,        mm7                 ; mm1 = mask
        mov         rcx,        16

mberror_loop_mmx:
        ; coefficients 0..3 of the group (masked)
        movq        mm3,        [rsi]
        movq        mm4,        [rdi]
        psubw       mm3,        mm4
        pand        mm3,        mm1
        pmaddwd     mm3,        mm3
        paddd       mm2,        mm3

        ; coefficients 4..7
        movq        mm5,        [rsi+8]
        movq        mm6,        [rdi+8]
        psubw       mm5,        mm6
        pmaddwd     mm5,        mm5
        paddd       mm2,        mm5

        ; coefficients 8..11
        movq        mm3,        [rsi+16]
        movq        mm4,        [rdi+16]
        psubw       mm3,        mm4
        pmaddwd     mm3,        mm3
        paddd       mm2,        mm3

        ; coefficients 12..15
        movq        mm5,        [rsi+24]
        movq        mm6,        [rdi+24]
        psubw       mm5,        mm6
        pmaddwd     mm5,        mm5
        paddd       mm2,        mm5

        add         rsi,        32
        add         rdi,        32

        sub         rcx,        1
        jnz         mberror_loop_mmx

        ; fold the high dword into the low dword
        movq        mm0,        mm2
        psrlq       mm2,        32

        paddd       mm0,        mm2
        movq        rax,        mm0

    pop rdi
    pop rsi
    ; begin epilog
    UNSHADOW_ARGS
    pop         rbp
    ret


;int vp8_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
;
; SSE2 counterpart of vp8_mbblock_error_mmx_impl: 16 groups of 16
; coefficients, 32 bytes per loop iteration.  Both pointers are read with
; movdqa and must be 16-byte aligned.
;
; The mask is built from dc as a zero-extended dword compared word-wise
; against zero, then applied to the first eight differences of each group;
; words 2..7 of the mask are always 0xFFFF.  A dc whose low 16 bits are
; non-zero therefore drops coefficient 0 of each group from the sum.
global sym(vp8_mbblock_error_xmm_impl)
sym(vp8_mbblock_error_xmm_impl):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 3
    push rsi
    push rdi
    ; end prolog

        mov         rsi,        arg(0) ;coeff_ptr
        mov         rdi,        arg(1) ;dcoef_ptr

        pxor        xmm4,       xmm4                ; zero
        pxor        xmm2,       xmm2                ; accumulator

        movd        xmm3,       dword ptr arg(2)    ;dc
        pcmpeqw     xmm3,       xmm4                ; xmm3 = mask
        mov         rcx,        16

mberror_loop:
        ; coefficients 0..7 of the group (masked)
        movdqa      xmm0,       [rsi]
        movdqa      xmm1,       [rdi]
        psubw       xmm0,       xmm1
        pand        xmm0,       xmm3
        pmaddwd     xmm0,       xmm0
        paddd       xmm2,       xmm0

        ; coefficients 8..15
        movdqa      xmm0,       [rsi+16]
        movdqa      xmm1,       [rdi+16]
        psubw       xmm0,       xmm1
        pmaddwd     xmm0,       xmm0
        paddd       xmm2,       xmm0

        add         rsi,        32
        add         rdi,        32

        sub         rcx,        1
        jnz         mberror_loop

        ; horizontal add of the four dword partial sums
        movdqa      xmm0,       xmm2
        punpckldq   xmm0,       xmm4

        punpckhdq   xmm2,       xmm4
        paddd       xmm0,       xmm2

        movdqa      xmm1,       xmm0
        psrldq      xmm0,       8

        paddd       xmm0,       xmm1
        movq        rax,        xmm0

    pop rdi
    pop rsi
    ; begin epilog
    UNSHADOW_ARGS
    pop         rbp
    ret


;int vp8_mbuverror_mmx_impl(short *s_ptr, short *d_ptr);
;
; Returns the sum of (s_ptr[i] - d_ptr[i])^2 for i = 0..127
; (16 iterations of 16 bytes).  Only the low 32 bits of rax carry the
; result.
global sym(vp8_mbuverror_mmx_impl)
sym(vp8_mbuverror_mmx_impl):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 2
    push rsi
    push rdi
    ; end prolog

        mov         rsi,        arg(0) ;s_ptr
        mov         rdi,        arg(1) ;d_ptr

        mov         rcx,        16
        pxor        mm7,        mm7                 ; accumulator

mbuverror_loop_mmx:
        movq        mm1,        [rsi]
        movq        mm2,        [rdi]

        movq        mm3,        [rsi+8]
        movq        mm4,        [rdi+8]

        psubw       mm1,        mm2
        psubw       mm3,        mm4

        pmaddwd     mm1,        mm1
        pmaddwd     mm3,        mm3

        paddd       mm7,        mm1
        paddd       mm7,        mm3

        add         rsi,        16
        add         rdi,        16

        dec         rcx
        jnz         mbuverror_loop_mmx

        ; fold the high dword into the low dword
        movq        mm0,        mm7
        psrlq       mm7,        32

        paddd       mm0,        mm7
        movq        rax,        mm0

    pop rdi
    pop rsi
    ; begin epilog
    UNSHADOW_ARGS
    pop         rbp
    ret


;int vp8_mbuverror_xmm_impl(short *s_ptr, short *d_ptr);
;
; Returns the sum of (s_ptr[i] - d_ptr[i])^2 for i = 0..127
; (16 iterations of 16 bytes).  Both pointers are read with movdqa and
; must be 16-byte aligned.
global sym(vp8_mbuverror_xmm_impl)
sym(vp8_mbuverror_xmm_impl):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 2
    push rsi
    push rdi
    ; end prolog

        mov         rsi,        arg(0) ;s_ptr
        mov         rdi,        arg(1) ;d_ptr

        mov         rcx,        16
        pxor        xmm3,       xmm3                ; accumulator

mbuverror_loop:
        movdqa      xmm1,       [rsi]
        movdqa      xmm2,       [rdi]

        psubw       xmm1,       xmm2
        pmaddwd     xmm1,       xmm1

        paddd       xmm3,       xmm1

        add         rsi,        16
        add         rdi,        16

        dec         rcx
        jnz         mbuverror_loop

        ; horizontal add of the four dword partial sums
        pxor        xmm0,       xmm0
        movdqa      xmm1,       xmm3

        movdqa      xmm2,       xmm1
        punpckldq   xmm1,       xmm0

        punpckhdq   xmm2,       xmm0
        paddd       xmm1,       xmm2

        movdqa      xmm2,       xmm1

        psrldq      xmm1,       8
        paddd       xmm1,       xmm2

        movq        rax,        xmm1

    pop rdi
    pop rsi
    ; begin epilog
    UNSHADOW_ARGS
    pop         rbp
    ret