;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

global sym(vpx_sad16x16_mmx) PRIVATE
global sym(vpx_sad8x16_mmx) PRIVATE
global sym(vpx_sad8x8_mmx) PRIVATE
global sym(vpx_sad4x4_mmx) PRIVATE
global sym(vpx_sad16x8_mmx) PRIVATE

;------------------------------------------------------------------------------
; MMX sum-of-absolute-differences kernels.
;
; Every routine below takes (src_ptr, src_stride, ref_ptr, ref_stride) and
; returns, in rax, the sum over the block of |src[i] - ref[i]| for unsigned
; bytes.
;
; Technique shared by all routines:
;   * |a - b| per byte is formed as (a -sat b) | (b -sat a) using psubusb/por;
;     one of the two saturating differences is always zero.
;   * Bytes are widened to 16-bit lanes by unpacking against a zero register,
;     then accumulated with paddw.
;   * The final lane reduction also uses paddw, so it relies on the total
;     fitting in 16 bits. The largest case in this file is 16x16 with every
;     difference equal to 255: 255 * 256 = 65280.
;
; None of these routines executes emms.
; NOTE(review): presumably callers are responsible for clearing MMX state
; before using x87 code - confirm against the call sites.
;------------------------------------------------------------------------------

; Entry sequence common to all routines: establish the rbp frame, invoke
; SHADOW_ARGS_TO_STACK for the 4 arguments (macro supplied by
; x86_abi_support.asm), and save rsi/rdi, which the bodies overwrite.
%macro SAD_MMX_PROLOG 0
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 4
    push        rsi
    push        rdi
%endmacro

; Fetch the four arguments:
;   rsi = src_ptr, rdi = ref_ptr,
;   rax = sign-extended src_stride, rdx = sign-extended ref_stride.
%macro SAD_MMX_LOAD_ARGS 0
    mov         rsi,        arg(0)              ; src_ptr
    mov         rdi,        arg(2)              ; ref_ptr
    movsxd      rax,        dword ptr arg(1)    ; src_stride
    movsxd      rdx,        dword ptr arg(3)    ; ref_stride
%endmacro

; Collapse the four 16-bit partial sums held in mm7 into one value in rax.
; Expects mm6 == 0; clobbers mm0 and mm7.
; After the last paddw the low dword of mm7 holds the full total, while the
; high dword still holds a partial sum, so only the low 32 bits of the value
; moved into rax are the result.
%macro SAD_MMX_REDUCE_MM7_TO_RAX 0
    movq        mm0,        mm7
    punpcklwd   mm0,        mm6                 ; lanes 0,1 -> two dwords
    punpckhwd   mm7,        mm6                 ; lanes 2,3 -> two dwords
    paddw       mm0,        mm7                 ; (l0+l2), (l1+l3)
    movq        mm7,        mm0
    psrlq       mm0,        32                  ; bring (l1+l3) down to dword 0
    paddw       mm7,        mm0                 ; dword 0 = l0+l1+l2+l3
    movq        rax,        mm7
%endmacro

; Exit sequence common to all routines: restore rdi/rsi, reset the stack
; pointer to the frame base, invoke UNSHADOW_ARGS (from x86_abi_support.asm),
; restore rbp and return.
%macro SAD_MMX_EPILOG 0
    pop         rdi
    pop         rsi
    mov         rsp,        rbp
    UNSHADOW_ARGS
    pop         rbp
    ret
%endmacro


;unsigned int vpx_sad16x16_mmx(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride)
;
; 16 bytes per row, 16 rows. Two 8-byte halves are processed per iteration.
sym(vpx_sad16x16_mmx):
    SAD_MMX_PROLOG
    SAD_MMX_LOAD_ARGS

    ; rcx = src_ptr + 16 * src_stride: the loop stops when rsi reaches it.
    lea         rcx,        [rsi+rax*8]
    lea         rcx,        [rcx+rax*8]

    pxor        mm7,        mm7                 ; mm7 = running 16-bit sums
    pxor        mm6,        mm6                 ; mm6 = 0, used for widening

.sad16x16_next_row:
    ; bytes 0..7: mm0 = |src - ref|
    movq        mm0,        QWORD PTR [rsi]
    movq        mm1,        QWORD PTR [rdi]
    movq        mm4,        mm0
    psubusb     mm0,        mm1
    psubusb     mm1,        mm4
    por         mm0,        mm1

    ; bytes 8..15: mm2 = |src - ref|
    movq        mm2,        QWORD PTR [rsi+8]
    movq        mm3,        QWORD PTR [rdi+8]
    movq        mm5,        mm2
    psubusb     mm2,        mm3
    psubusb     mm3,        mm5
    por         mm2,        mm3

    ; widen both halves from bytes to words
    movq        mm1,        mm0
    punpcklbw   mm0,        mm6
    punpckhbw   mm1,        mm6
    movq        mm3,        mm2
    punpcklbw   mm2,        mm6
    punpckhbw   mm3,        mm6

    ; fold this row into the accumulator
    paddw       mm0,        mm2
    paddw       mm1,        mm3
    paddw       mm7,        mm0
    paddw       mm7,        mm1

    ; step both pointers down one row
    add         rsi,        rax
    add         rdi,        rdx

    cmp         rsi,        rcx
    jne         .sad16x16_next_row

    SAD_MMX_REDUCE_MM7_TO_RAX
    SAD_MMX_EPILOG


;unsigned int vpx_sad8x16_mmx(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride)
;
; 8 bytes per row, 16 rows.
sym(vpx_sad8x16_mmx):
    SAD_MMX_PROLOG
    SAD_MMX_LOAD_ARGS

    ; rcx = src_ptr + 16 * src_stride: the loop stops when rsi reaches it.
    lea         rcx,        [rsi+rax*8]
    lea         rcx,        [rcx+rax*8]

    pxor        mm7,        mm7                 ; mm7 = running 16-bit sums
    pxor        mm6,        mm6                 ; mm6 = 0, used for widening

.sad8x16_next_row:
    ; mm0 = |src - ref| for the 8 bytes of this row
    movq        mm0,        QWORD PTR [rsi]
    movq        mm1,        QWORD PTR [rdi]
    movq        mm2,        mm0
    psubusb     mm0,        mm1
    psubusb     mm1,        mm2
    por         mm0,        mm1

    ; widen to words and accumulate both halves
    movq        mm2,        mm0
    punpcklbw   mm0,        mm6
    punpckhbw   mm2,        mm6
    paddw       mm7,        mm0
    paddw       mm7,        mm2

    ; step both pointers down one row
    add         rsi,        rax
    add         rdi,        rdx

    cmp         rsi,        rcx
    jne         .sad8x16_next_row

    SAD_MMX_REDUCE_MM7_TO_RAX
    SAD_MMX_EPILOG


;unsigned int vpx_sad8x8_mmx(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride)
;
; 8 bytes per row, 8 rows.
sym(vpx_sad8x8_mmx):
    SAD_MMX_PROLOG
    SAD_MMX_LOAD_ARGS

    ; rcx = src_ptr + 8 * src_stride: the loop stops when rsi reaches it.
    lea         rcx,        [rsi+rax*8]

    pxor        mm7,        mm7                 ; mm7 = running 16-bit sums
    pxor        mm6,        mm6                 ; mm6 = 0, used for widening

.sad8x8_next_row:
    ; mm0 = |src - ref| for the 8 bytes of this row
    movq        mm0,        QWORD PTR [rsi]
    movq        mm1,        QWORD PTR [rdi]
    movq        mm2,        mm0
    psubusb     mm0,        mm1
    psubusb     mm1,        mm2
    por         mm0,        mm1

    ; widen to words, combine the two halves, then accumulate
    movq        mm2,        mm0
    punpcklbw   mm0,        mm6
    punpckhbw   mm2,        mm6
    paddw       mm0,        mm2
    paddw       mm7,        mm0

    ; step both pointers down one row
    add         rsi,        rax
    add         rdi,        rdx

    cmp         rsi,        rcx
    jne         .sad8x8_next_row

    SAD_MMX_REDUCE_MM7_TO_RAX
    SAD_MMX_EPILOG


;unsigned int vpx_sad4x4_mmx(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride)
;
; 4 bytes per row, 4 rows, fully unrolled. Two rows are interleaved into a
; single 8-byte register so each pair is handled with one set of operations;
; the byte order inside the register does not matter for a sum.
sym(vpx_sad4x4_mmx):
    SAD_MMX_PROLOG
    SAD_MMX_LOAD_ARGS

    ; ---- rows 0 and 1 ----
    movd        mm0,        DWORD PTR [rsi]
    movd        mm2,        DWORD PTR [rsi+rax]
    punpcklbw   mm0,        mm2                 ; mm0 = src rows 0/1 interleaved

    movd        mm1,        DWORD PTR [rdi]
    movd        mm3,        DWORD PTR [rdi+rdx]
    punpcklbw   mm1,        mm3                 ; mm1 = ref rows 0/1 interleaved

    movq        mm2,        mm0
    psubusb     mm0,        mm1
    psubusb     mm1,        mm2
    por         mm0,        mm1                 ; mm0 = |src - ref|

    pxor        mm3,        mm3                 ; mm3 = 0 from here on
    movq        mm2,        mm0
    punpcklbw   mm0,        mm3
    punpckhbw   mm2,        mm3
    paddw       mm0,        mm2                 ; mm0 = word sums, rows 0/1

    ; ---- rows 2 and 3 ----
    lea         rsi,        [rsi+rax*2]
    lea         rdi,        [rdi+rdx*2]

    movd        mm4,        DWORD PTR [rsi]
    movd        mm6,        DWORD PTR [rsi+rax]
    punpcklbw   mm4,        mm6                 ; mm4 = src rows 2/3 interleaved

    movd        mm5,        DWORD PTR [rdi]
    movd        mm7,        DWORD PTR [rdi+rdx]
    punpcklbw   mm5,        mm7                 ; mm5 = ref rows 2/3 interleaved

    movq        mm6,        mm4
    psubusb     mm4,        mm5
    psubusb     mm5,        mm6
    por         mm4,        mm5                 ; mm4 = |src - ref|

    movq        mm5,        mm4
    punpcklbw   mm4,        mm3
    punpckhbw   mm5,        mm3
    paddw       mm4,        mm5                 ; mm4 = word sums, rows 2/3

    ; ---- combine and reduce the four word lanes into rax ----
    paddw       mm0,        mm4
    movq        mm1,        mm0
    punpcklwd   mm0,        mm3
    punpckhwd   mm1,        mm3
    paddw       mm0,        mm1
    movq        mm1,        mm0
    psrlq       mm0,        32
    paddw       mm0,        mm1                 ; low dword = total
    movq        rax,        mm0

    SAD_MMX_EPILOG


;unsigned int vpx_sad16x8_mmx(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride)
;
; 16 bytes per row, 8 rows. Two 8-byte halves are processed per iteration.
sym(vpx_sad16x8_mmx):
    SAD_MMX_PROLOG
    SAD_MMX_LOAD_ARGS

    ; rcx = src_ptr + 8 * src_stride: the loop stops when rsi reaches it.
    lea         rcx,        [rsi+rax*8]

    pxor        mm7,        mm7                 ; mm7 = running 16-bit sums
    pxor        mm6,        mm6                 ; mm6 = 0, used for widening

.sad16x8_next_row:
    ; bytes 0..7: mm0 = |src - ref|
    movq        mm0,        [rsi]
    movq        mm1,        [rdi]
    movq        mm4,        mm0
    psubusb     mm0,        mm1
    psubusb     mm1,        mm4
    por         mm0,        mm1

    ; bytes 8..15: mm2 = |src - ref|
    movq        mm2,        [rsi+8]
    movq        mm3,        [rdi+8]
    movq        mm5,        mm2
    psubusb     mm2,        mm3
    psubusb     mm3,        mm5
    por         mm2,        mm3

    ; widen both halves from bytes to words
    movq        mm1,        mm0
    punpcklbw   mm0,        mm6
    punpckhbw   mm1,        mm6
    movq        mm3,        mm2
    punpcklbw   mm2,        mm6
    punpckhbw   mm3,        mm6

    ; collapse the row to one register, then accumulate
    paddw       mm0,        mm2
    paddw       mm1,        mm3
    paddw       mm0,        mm1
    paddw       mm7,        mm0

    ; step both pointers down one row
    add         rsi,        rax
    add         rdi,        rdx

    cmp         rsi,        rcx
    jne         .sad16x8_next_row

    SAD_MMX_REDUCE_MM7_TO_RAX
    SAD_MMX_EPILOG