;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"


;void vp8_loop_filter_horizontal_edge_mmx
;(
;    unsigned char *src_ptr,
;    int src_pixel_step,
;    const char *blimit,
;    const char *limit,
;    const char *thresh,
;    int count
;)
;
; VP8 "normal" (4-tap) loop filter applied across a horizontal edge.
; src_ptr points at the q0 row (first row below the edge); rows above the
; edge are reached with a negated pitch.  Eight pixel columns are filtered
; per loop iteration, arg(5) iterations in total.
;
; Register roles inside the loop:
;   rsi = current 8-pixel group (q0 row), rax = pitch (sign flips during
;   the pass), rdi = rsi + pitch.  t0/t1 on the stack cache abs(q0-q1)
;   and abs(p1-p0) for the high-edge-variance (hev) test.
global sym(vp8_loop_filter_horizontal_edge_mmx) PRIVATE
sym(vp8_loop_filter_horizontal_edge_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 32                         ; reserve 32 bytes
    %define t0 [rsp + 0]    ;__declspec(align(16)) char t0[8];
    %define t1 [rsp + 16]   ;__declspec(align(16)) char t1[8];

    mov         rsi, arg(0) ;src_ptr
    movsxd      rax, dword ptr arg(1) ;src_pixel_step     ; destination pitch?

    movsxd      rcx, dword ptr arg(5) ;count
.next8_h:
    mov         rdx, arg(3) ;limit
    movq        mm7, [rdx]
    mov         rdi, rsi                        ; rdi points to row +1 for indirect addressing
    add         rdi, rax

    ; calculate breakout conditions:
    ; the filter mask is the AND of  abs(<neighbour deltas>) <= limit
    ; built up in mm1 as an OR of saturated "excess over limit" bytes
    ; (nonzero byte => that column fails the mask).
    movq        mm2, [rdi+2*rax]                ; q3
    movq        mm1, [rsi+2*rax]                ; q2
    movq        mm6, mm1                        ; q2
    psubusb     mm1, mm2                        ; q2-=q3
    psubusb     mm2, mm6                        ; q3-=q2
    por         mm1, mm2                        ; abs(q3-q2)
    psubusb     mm1, mm7                        ;


    movq        mm4, [rsi+rax]                  ; q1
    movq        mm3, mm4                        ; q1
    psubusb     mm4, mm6                        ; q1-=q2
    psubusb     mm6, mm3                        ; q2-=q1
    por         mm4, mm6                        ; abs(q2-q1)

    psubusb     mm4, mm7
    por         mm1, mm4

    movq        mm4, [rsi]                      ; q0
    movq        mm0, mm4                        ; q0
    psubusb     mm4, mm3                        ; q0-=q1
    psubusb     mm3, mm0                        ; q1-=q0
    por         mm4, mm3                        ; abs(q0-q1)
    movq        t0, mm4                         ; save to t0 (reused by the hev test)
    psubusb     mm4, mm7
    por         mm1, mm4


    neg         rax                             ; negate pitch to deal with above border

    movq        mm2, [rsi+4*rax]                ; p3
    movq        mm4, [rdi+4*rax]                ; p2
    movq        mm5, mm4                        ; p2
    psubusb     mm4, mm2                        ; p2-=p3
    psubusb     mm2, mm5                        ; p3-=p2
    por         mm4, mm2                        ; abs(p3 - p2)
    psubusb     mm4, mm7
    por         mm1, mm4


    movq        mm4, [rsi+2*rax]                ; p1
    movq        mm3, mm4                        ; p1
    psubusb     mm4, mm5                        ; p1-=p2
    psubusb     mm5, mm3                        ; p2-=p1
    por         mm4, mm5                        ; abs(p2 - p1)
    psubusb     mm4, mm7
    por         mm1, mm4

    movq        mm2, mm3                        ; p1

    movq        mm4, [rsi+rax]                  ; p0
    movq        mm5, mm4                        ; p0
    psubusb     mm4, mm3                        ; p0-=p1
    psubusb     mm3, mm5                        ; p1-=p0
    por         mm4, mm3                        ; abs(p1 - p0)
    movq        t1, mm4                         ; save to t1 (reused by the hev test)
    psubusb     mm4, mm7
    por         mm1, mm4

    movq        mm3, [rdi]                      ; q1
    movq        mm4, mm3                        ; q1
    psubusb     mm3, mm2                        ; q1-=p1
    psubusb     mm2, mm4                        ; p1-=q1
    por         mm2, mm3                        ; abs(p1-q1)
    pand        mm2, [GLOBAL(tfe)]              ; set lsb of each byte to zero
    psrlw       mm2, 1                          ; abs(p1-q1)/2

    movq        mm6, mm5                        ; p0
    movq        mm3, [rsi]                      ; q0
    psubusb     mm5, mm3                        ; p0-=q0
    psubusb     mm3, mm6                        ; q0-=p0
    por         mm5, mm3                        ; abs(p0 - q0)
    paddusb     mm5, mm5                        ; abs(p0-q0)*2
    paddusb     mm5, mm2                        ; abs (p0 - q0) *2 + abs(p1-q1)/2

    mov         rdx, arg(2) ;blimit             ; get blimit
    movq        mm7, [rdx]                      ; blimit

    psubusb     mm5, mm7                        ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit
    por         mm1, mm5
    pxor        mm5, mm5
    pcmpeqb     mm1, mm5                        ; mask mm1: 0xFF where all edge tests passed

    ; calculate high edge variance:
    ; hev = abs(q1-q0) > thresh || abs(p1-p0) > thresh  (per byte, as 0xFF/0x00)
    mov         rdx, arg(4) ;thresh             ; get thresh
    movq        mm7, [rdx]                      ;
    movq        mm4, t0                         ; get abs (q1 - q0)
    psubusb     mm4, mm7
    movq        mm3, t1                         ; get abs (p1 - p0)
    psubusb     mm3, mm7
    ; NOTE(review): the vertical-edge variant combines these with POR.
    ; PADDB wraps mod 256, so two nonzero excesses summing to exactly 256
    ; would drop hev for that column — confirm against the vertical code /
    ; reference C implementation.
    paddb       mm4, mm3                        ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh

    pcmpeqb     mm4, mm5

    pcmpeqb     mm5, mm5
    pxor        mm4, mm5                        ; mm4 = hev mask


    ; start work on filters
    ; vp8_filter = clamp(clamp(p1-q1)&hev + 3*(q0-p0)) & mask
    movq        mm2, [rsi+2*rax]                ; p1
    movq        mm7, [rdi]                      ; q1
    pxor        mm2, [GLOBAL(t80)]              ; p1 offset to convert to signed values
    pxor        mm7, [GLOBAL(t80)]              ; q1 offset to convert to signed values
    psubsb      mm2, mm7                        ; p1 - q1
    pand        mm2, mm4                        ; high var mask (hvm)(p1 - q1)
    pxor        mm6, [GLOBAL(t80)]              ; offset to convert to signed values
    pxor        mm0, [GLOBAL(t80)]              ; offset to convert to signed values
    movq        mm3, mm0                        ; q0
    psubsb      mm0, mm6                        ; q0 - p0
    paddsb      mm2, mm0                        ; 1 * (q0 - p0) + hvm(p1 - q1)
    paddsb      mm2, mm0                        ; 2 * (q0 - p0) + hvm(p1 - q1)
    paddsb      mm2, mm0                        ; 3 * (q0 - p0) + hvm(p1 - q1)
    pand        mm1, mm2                        ; mask filter values we don't care about
    movq        mm2, mm1
    paddsb      mm1, [GLOBAL(t4)]               ; 3* (q0 - p0) + hvm(p1 - q1) + 4
    paddsb      mm2, [GLOBAL(t3)]               ; 3* (q0 - p0) + hvm(p1 - q1) + 3

    ; arithmetic >>3 on packed signed bytes, done by widening to words
    ; (unpack into the high byte, then psraw by 8+3 = 11)
    pxor        mm0, mm0                        ;
    pxor        mm5, mm5
    punpcklbw   mm0, mm2                        ;
    punpckhbw   mm5, mm2                        ;
    psraw       mm0, 11                         ;
    psraw       mm5, 11
    packsswb    mm0, mm5
    movq        mm2, mm0                        ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;

    pxor        mm0, mm0                        ; 0
    movq        mm5, mm1                        ; abcdefgh
    punpcklbw   mm0, mm1                        ; e0f0g0h0
    psraw       mm0, 11                         ; sign extended shift right by 3
    pxor        mm1, mm1                        ; 0
    punpckhbw   mm1, mm5                        ; a0b0c0d0
    psraw       mm1, 11                         ; sign extended shift right by 3
    movq        mm5, mm0                        ; save results

    packsswb    mm0, mm1                        ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
    paddsw      mm5, [GLOBAL(ones)]
    paddsw      mm1, [GLOBAL(ones)]
    psraw       mm5, 1                          ; partial shifted one more time for 2nd tap
    psraw       mm1, 1                          ; partial shifted one more time for 2nd tap
    packsswb    mm5, mm1                        ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
    pandn       mm4, mm5                        ; high edge variance additive

    paddsb      mm6, mm2                        ; p0+= p0 add
    pxor        mm6, [GLOBAL(t80)]              ; unoffset
    movq        [rsi+rax], mm6                  ; write back

    movq        mm6, [rsi+2*rax]                ; p1
    pxor        mm6, [GLOBAL(t80)]              ; reoffset
    paddsb      mm6, mm4                        ; p1+= p1 add
    pxor        mm6, [GLOBAL(t80)]              ; unoffset
    movq        [rsi+2*rax], mm6                ; write back

    psubsb      mm3, mm0                        ; q0-= q0 add
    pxor        mm3, [GLOBAL(t80)]              ; unoffset
    movq        [rsi], mm3                      ; write back

    psubsb      mm7, mm4                        ; q1-= q1 add
    pxor        mm7, [GLOBAL(t80)]              ; unoffset
    movq        [rdi], mm7                      ; write back

    add         rsi,8                           ; advance to next 8 columns
    neg         rax                             ; restore pitch sign for next iteration
    dec         rcx
    jnz         .next8_h

    add rsp, 32
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop rbp
    ret


;void vp8_loop_filter_vertical_edge_mmx
;(
;    unsigned char *src_ptr,
;    int src_pixel_step,
;    const char *blimit,
;    const char *limit,
;    const char *thresh,
;    int count
;)
;
; Same filter as the horizontal variant, but across a vertical edge:
; each iteration transposes an 8x8 block into registers, filters, then
; transposes the modified columns back.  srct on the stack holds the
; transposed p1/p0/q0/q1 rows between the mask and filter phases.
global sym(vp8_loop_filter_vertical_edge_mmx) PRIVATE
sym(vp8_loop_filter_vertical_edge_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 64                         ; reserve 64 bytes
    %define t0   [rsp + 0]   ;__declspec(align(16)) char t0[8];
    %define t1   [rsp + 16]  ;__declspec(align(16)) char t1[8];
    %define srct [rsp + 32]  ;__declspec(align(16)) char srct[32];

    mov         rsi, arg(0) ;src_ptr
    movsxd      rax, dword ptr arg(1) ;src_pixel_step     ; destination pitch?
    ; start 4 rows down, 4 columns left of the edge, so the 8x8 block
    ; [p3..q3] x 8 rows surrounds the vertical edge
    lea         rsi, [rsi + rax*4 - 4]

    movsxd      rcx, dword ptr arg(5) ;count
.next8_v:
    mov         rdi, rsi                        ; rdi points to row +1 for indirect addressing
    add         rdi, rax


    ;transpose (rows 4..7 first; comments show "rowcol" byte layout)
    movq        mm6, [rsi+2*rax]                ; 67 66 65 64 63 62 61 60
    movq        mm7, mm6                        ; 77 76 75 74 73 72 71 70

    punpckhbw   mm7, [rdi+2*rax]                ; 77 67 76 66 75 65 74 64
    punpcklbw   mm6, [rdi+2*rax]                ; 73 63 72 62 71 61 70 60

    movq        mm4, [rsi]                      ; 47 46 45 44 43 42 41 40
    movq        mm5, mm4                        ; 47 46 45 44 43 42 41 40

    punpckhbw   mm5, [rsi+rax]                  ; 57 47 56 46 55 45 54 44
    punpcklbw   mm4, [rsi+rax]                  ; 53 43 52 42 51 41 50 40

    movq        mm3, mm5                        ; 57 47 56 46 55 45 54 44
    punpckhwd   mm5, mm7                        ; 77 67 57 47 76 66 56 46

    punpcklwd   mm3, mm7                        ; 75 65 55 45 74 64 54 44
    movq        mm2, mm4                        ; 53 43 52 42 51 41 50 40

    punpckhwd   mm4, mm6                        ; 73 63 53 43 72 62 52 42
    punpcklwd   mm2, mm6                        ; 71 61 51 41 70 60 50 40

    neg         rax                             ; pitch negative: reach rows 0..3 above
    movq        mm6, [rsi+rax*2]                ; 27 26 25 24 23 22 21 20

    movq        mm1, mm6                        ; 27 26 25 24 23 22 21 20
    punpckhbw   mm6, [rsi+rax]                  ; 37 27 36 26 35 25 34 24

    punpcklbw   mm1, [rsi+rax]                  ; 33 23 32 22 31 21 30 20
    movq        mm7, [rsi+rax*4];               ; 07 06 05 04 03 02 01 00

    punpckhbw   mm7, [rdi+rax*4]                ; 17 07 16 06 15 05 14 04
    movq        mm0, mm7                        ; 17 07 16 06 15 05 14 04

    punpckhwd   mm7, mm6                        ; 37 27 17 07 36 26 16 06
    punpcklwd   mm0, mm6                        ; 35 25 15 05 34 24 14 04

    movq        mm6, mm7                        ; 37 27 17 07 36 26 16 06
    punpckhdq   mm7, mm5                        ; 77 67 57 47 37 27 17 07 = q3

    punpckldq   mm6, mm5                        ; 76 66 56 46 36 26 16 06 = q2

    ; breakout-mask accumulation (same tests as the horizontal variant)
    movq        mm5, mm6                        ; 76 66 56 46 36 26 16 06
    psubusb     mm5, mm7                        ; q2-q3

    psubusb     mm7, mm6                        ; q3-q2
    por         mm7, mm5;                       ; mm7=abs (q3-q2)

    movq        mm5, mm0                        ; 35 25 15 05 34 24 14 04
    punpckhdq   mm5, mm3                        ; 75 65 55 45 35 25 15 05 = q1

    punpckldq   mm0, mm3                        ; 74 64 54 44 34 24 14 04 = q0
    movq        mm3, mm5                        ; 75 65 55 45 35 25 15 05 = q1

    psubusb     mm3, mm6                        ; q1-q2
    psubusb     mm6, mm5                        ; q2-q1

    por         mm6, mm3                        ; mm6=abs(q2-q1)
    lea         rdx, srct

    movq        [rdx+24], mm5                   ; save q1
    movq        [rdx+16], mm0                   ; save q0

    movq        mm3, [rsi+rax*4]                ; 07 06 05 04 03 02 01 00
    punpcklbw   mm3, [rdi+rax*4]                ; 13 03 12 02 11 01 10 00

    movq        mm0, mm3                        ; 13 03 12 02 11 01 10 00
    punpcklwd   mm0, mm1                        ; 31 21 11 01 30 20 10 00

    punpckhwd   mm3, mm1                        ; 33 23 13 03 32 22 12 02
    movq        mm1, mm0                        ; 31 21 11 01 30 20 10 00

    punpckldq   mm0, mm2                        ; 70 60 50 40 30 20 10 00 =p3
    punpckhdq   mm1, mm2                        ; 71 61 51 41 31 21 11 01 =p2

    movq        mm2, mm1                        ; 71 61 51 41 31 21 11 01 =p2
    psubusb     mm2, mm0                        ; p2-p3

    psubusb     mm0, mm1                        ; p3-p2
    por         mm0, mm2                        ; mm0=abs(p3-p2)

    movq        mm2, mm3                        ; 33 23 13 03 32 22 12 02
    punpckldq   mm2, mm4                        ; 72 62 52 42 32 22 12 02 = p1

    punpckhdq   mm3, mm4                        ; 73 63 53 43 33 23 13 03 = p0
    movq        [rdx+8], mm3                    ; save p0

    movq        [rdx], mm2                      ; save p1
    movq        mm5, mm2                        ; mm5 = p1

    psubusb     mm2, mm1                        ; p1-p2
    psubusb     mm1, mm5                        ; p2-p1

    por         mm1, mm2                        ; mm1=abs(p2-p1)
    mov         rdx, arg(3) ;limit

    movq        mm4, [rdx]                      ; mm4 = limit
    psubusb     mm7, mm4

    psubusb     mm0, mm4
    psubusb     mm1, mm4

    psubusb     mm6, mm4
    por         mm7, mm6

    por         mm0, mm1
    por         mm0, mm7                        ; abs(q3-q2) > limit || abs(p3-p2) > limit ||abs(p2-p1) > limit || abs(q2-q1) > limit

    movq        mm1, mm5                        ; p1

    movq        mm7, mm3                        ; mm3=mm7=p0
    psubusb     mm7, mm5                        ; p0 - p1

    psubusb     mm5, mm3                        ; p1 - p0
    por         mm5, mm7                        ; abs(p1-p0)

    movq        t0, mm5                         ; save abs(p1-p0)
    lea         rdx, srct

    psubusb     mm5, mm4
    por         mm0, mm5                        ; mm0=mask

    movq        mm5, [rdx+16]                   ; mm5=q0
    movq        mm7, [rdx+24]                   ; mm7=q1

    movq        mm6, mm5                        ; mm6=q0
    movq        mm2, mm7                        ; q1
    psubusb     mm5, mm7                        ; q0-q1

    psubusb     mm7, mm6                        ; q1-q0
    por         mm7, mm5                        ; abs(q1-q0)

    movq        t1, mm7                         ; save abs(q1-q0)
    psubusb     mm7, mm4

    por         mm0, mm7                        ; mask

    movq        mm5, mm2                        ; q1
    psubusb     mm5, mm1                        ; q1-=p1
    psubusb     mm1, mm2                        ; p1-=q1
    por         mm5, mm1                        ; abs(p1-q1)
    pand        mm5, [GLOBAL(tfe)]              ; set lsb of each byte to zero
    psrlw       mm5, 1                          ; abs(p1-q1)/2

    mov         rdx, arg(2) ;blimit             ;

    movq        mm4, [rdx]                      ;blimit
    movq        mm1, mm3                        ; mm1=mm3=p0

    movq        mm7, mm6                        ; mm7=mm6=q0
    psubusb     mm1, mm7                        ; p0-q0

    psubusb     mm7, mm3                        ; q0-p0
    por         mm1, mm7                        ; abs(q0-p0)
    paddusb     mm1, mm1                        ; abs(q0-p0)*2
    paddusb     mm1, mm5                        ; abs (p0 - q0) *2 + abs(p1-q1)/2

    psubusb     mm1, mm4                        ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit
    por         mm1, mm0;                       ; mask

    pxor        mm0, mm0
    pcmpeqb     mm1, mm0                        ; mm1 = filter mask (0xFF = filter this column)

    ; calculate high edge variance
    mov         rdx, arg(4) ;thresh             ; get thresh
    movq        mm7, [rdx]
    ;
    movq        mm4, t0                         ; get abs (q1 - q0)
    psubusb     mm4, mm7

    movq        mm3, t1                         ; get abs (p1 - p0)
    psubusb     mm3, mm7

    por         mm4, mm3                        ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
    pcmpeqb     mm4, mm0

    pcmpeqb     mm0, mm0
    pxor        mm4, mm0                        ; mm4 = hev mask



    ; start work on filters (p1/p0/q0/q1 reloaded from srct scratch)
    lea         rdx, srct

    movq        mm2, [rdx]                      ; p1
    movq        mm7, [rdx+24]                   ; q1

    movq        mm6, [rdx+8]                    ; p0
    movq        mm0, [rdx+16]                   ; q0

    pxor        mm2, [GLOBAL(t80)]              ; p1 offset to convert to signed values
    pxor        mm7, [GLOBAL(t80)]              ; q1 offset to convert to signed values

    psubsb      mm2, mm7                        ; p1 - q1
    pand        mm2, mm4                        ; high var mask (hvm)(p1 - q1)

    pxor        mm6, [GLOBAL(t80)]              ; offset to convert to signed values
    pxor        mm0, [GLOBAL(t80)]              ; offset to convert to signed values

    movq        mm3, mm0                        ; q0
    psubsb      mm0, mm6                        ; q0 - p0

    paddsb      mm2, mm0                        ; 1 * (q0 - p0) + hvm(p1 - q1)
    paddsb      mm2, mm0                        ; 2 * (q0 - p0) + hvm(p1 - q1)

    paddsb      mm2, mm0                        ; 3 * (q0 - p0) + hvm(p1 - q1)
    pand        mm1, mm2                        ; mask filter values we don't care about

    movq        mm2, mm1
    paddsb      mm1, [GLOBAL(t4)]               ; 3* (q0 - p0) + hvm(p1 - q1) + 4

    paddsb      mm2, [GLOBAL(t3)]               ; 3* (q0 - p0) + hvm(p1 - q1) + 3
    pxor        mm0, mm0                        ;

    pxor        mm5, mm5
    punpcklbw   mm0, mm2                        ;

    punpckhbw   mm5, mm2                        ;
    psraw       mm0, 11                         ;

    psraw       mm5, 11
    packsswb    mm0, mm5

    movq        mm2, mm0                        ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;

    pxor        mm0, mm0                        ; 0
    movq        mm5, mm1                        ; abcdefgh

    punpcklbw   mm0, mm1                        ; e0f0g0h0
    psraw       mm0, 11                         ; sign extended shift right by 3

    pxor        mm1, mm1                        ; 0
    punpckhbw   mm1, mm5                        ; a0b0c0d0

    psraw       mm1, 11                         ; sign extended shift right by 3
    movq        mm5, mm0                        ; save results

    packsswb    mm0, mm1                        ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
    paddsw      mm5, [GLOBAL(ones)]

    paddsw      mm1, [GLOBAL(ones)]
    psraw       mm5, 1                          ; partial shifted one more time for 2nd tap

    psraw       mm1, 1                          ; partial shifted one more time for 2nd tap
    packsswb    mm5, mm1                        ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4

    pandn       mm4, mm5                        ; high edge variance additive

    paddsb      mm6, mm2                        ; p0+= p0 add
    pxor        mm6, [GLOBAL(t80)]              ; unoffset

    ; mm6=p0 ;
    movq        mm1, [rdx]                      ; p1
    pxor        mm1, [GLOBAL(t80)]              ; reoffset

    paddsb      mm1, mm4                        ; p1+= p1 add
    pxor        mm1, [GLOBAL(t80)]              ; unoffset
    ; mm6 = p0 mm1 = p1

    psubsb      mm3, mm0                        ; q0-= q0 add
    pxor        mm3, [GLOBAL(t80)]              ; unoffset

    ; mm3 = q0
    psubsb      mm7, mm4                        ; q1-= q1 add
    pxor        mm7, [GLOBAL(t80)]              ; unoffset
    ; mm7 = q1

    ; transpose and write back only the four modified columns (p1 p0 q0 q1)
    ; mm1 = 72 62 52 42 32 22 12 02
    ; mm6 = 73 63 53 43 33 23 13 03
    ; mm3 = 74 64 54 44 34 24 14 04
    ; mm7 = 75 65 55 45 35 25 15 05

    movq        mm2, mm1                        ; 72 62 52 42 32 22 12 02
    punpcklbw   mm2, mm6                        ; 33 32 23 22 13 12 03 02

    movq        mm4, mm3                        ; 74 64 54 44 34 24 14 04
    punpckhbw   mm1, mm6                        ; 73 72 63 62 53 52 43 42

    punpcklbw   mm4, mm7                        ; 35 34 25 24 15 14 05 04
    punpckhbw   mm3, mm7                        ; 75 74 65 64 55 54 45 44

    movq        mm6, mm2                        ; 33 32 23 22 13 12 03 02
    punpcklwd   mm2, mm4                        ; 15 14 13 12 05 04 03 02

    punpckhwd   mm6, mm4                        ; 35 34 33 32 25 24 23 22
    movq        mm5, mm1                        ; 73 72 63 62 53 52 43 42

    punpcklwd   mm1, mm3                        ; 55 54 53 52 45 44 43 42
    punpckhwd   mm5, mm3                        ; 75 74 73 72 65 64 63 62


    ; mm2 = 15 14 13 12 05 04 03 02
    ; mm6 = 35 34 33 32 25 24 23 22
    ; mm5 = 55 54 53 52 45 44 43 42
    ; mm1 = 75 74 73 72 65 64 63 62



    movd        [rsi+rax*4+2], mm2              ; 4-byte stores: columns p1..q1 of each row
    psrlq       mm2, 32

    movd        [rdi+rax*4+2], mm2
    movd        [rsi+rax*2+2], mm6

    psrlq       mm6, 32
    movd        [rsi+rax+2], mm6

    movd        [rsi+2], mm1
    psrlq       mm1, 32

    movd        [rdi+2], mm1
    neg         rax

    movd        [rdi+rax+2], mm5
    psrlq       mm5, 32

    movd        [rdi+rax*2+2], mm5

    lea         rsi, [rsi+rax*8]                ; next 8 rows
    dec         rcx
    jnz         .next8_v

    add rsp, 64
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop rbp
    ret


;void vp8_mbloop_filter_horizontal_edge_mmx
;(
;    unsigned char *src_ptr,
;    int src_pixel_step,
;    const char *blimit,
;    const char *limit,
;    const char *thresh,
;    int count
;)
;
; VP8 macroblock-edge loop filter across a horizontal edge.  Uses the
; same mask/hev computation as the normal filter, but applies the wider
; 3-tap adjustment (27/64, 18/64, 9/64 via pmulhw) to p2..q2 when hev
; is not set.
global sym(vp8_mbloop_filter_horizontal_edge_mmx) PRIVATE
sym(vp8_mbloop_filter_horizontal_edge_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 32                         ; reserve 32 bytes
    %define t0 [rsp + 0]    ;__declspec(align(16)) char t0[8];
    %define t1 [rsp + 16]   ;__declspec(align(16)) char t1[8];

    mov         rsi, arg(0) ;src_ptr
    movsxd      rax, dword ptr arg(1) ;src_pixel_step     ; destination pitch?
    movsxd      rcx, dword ptr arg(5) ;count
.next8_mbh:
    mov         rdx, arg(3) ;limit
    movq        mm7, [rdx]
    mov         rdi, rsi                        ; rdi points to row +1 for indirect addressing
    add         rdi, rax

    ; calculate breakout conditions
    movq        mm2, [rdi+2*rax]                ; q3

    movq        mm1, [rsi+2*rax]                ; q2
    movq        mm6, mm1                        ; q2
    psubusb     mm1, mm2                        ; q2-=q3
    psubusb     mm2, mm6                        ; q3-=q2
    por         mm1, mm2                        ; abs(q3-q2)
    psubusb     mm1, mm7


    ; mm1 = abs(q3-q2), mm6 =q2, mm7 = limit
    movq        mm4, [rsi+rax]                  ; q1
    movq        mm3, mm4                        ; q1
    psubusb     mm4, mm6                        ; q1-=q2
    psubusb     mm6, mm3                        ; q2-=q1
    por         mm4, mm6                        ; abs(q2-q1)
    psubusb     mm4, mm7
    por         mm1, mm4


    ; mm1 = mask, mm3=q1, mm7 = limit

    movq        mm4, [rsi]                      ; q0
    movq        mm0, mm4                        ; q0
    psubusb     mm4, mm3                        ; q0-=q1
    psubusb     mm3, mm0                        ; q1-=q0
    por         mm4, mm3                        ; abs(q0-q1)
    movq        t0, mm4                         ; save to t0
    psubusb     mm4, mm7
    por         mm1, mm4


    ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1)

    neg         rax                             ; negate pitch to deal with above border

    movq        mm2, [rsi+4*rax]                ; p3
    movq        mm4, [rdi+4*rax]                ; p2
    movq        mm5, mm4                        ; p2
    psubusb     mm4, mm2                        ; p2-=p3
    psubusb     mm2, mm5                        ; p3-=p2
    por         mm4, mm2                        ; abs(p3 - p2)
    psubusb     mm4, mm7
    por         mm1, mm4
    ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1)

    movq        mm4, [rsi+2*rax]                ; p1
    movq        mm3, mm4                        ; p1
    psubusb     mm4, mm5                        ; p1-=p2
    psubusb     mm5, mm3                        ; p2-=p1
    por         mm4, mm5                        ; abs(p2 - p1)
    psubusb     mm4, mm7
    por         mm1, mm4

    movq        mm2, mm3                        ; p1


    ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1)

    movq        mm4, [rsi+rax]                  ; p0
    movq        mm5, mm4                        ; p0
    psubusb     mm4, mm3                        ; p0-=p1
    psubusb     mm3, mm5                        ; p1-=p0
    por         mm4, mm3                        ; abs(p1 - p0)
    movq        t1, mm4                         ; save to t1
    psubusb     mm4, mm7
    por         mm1, mm4
    ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1) t1 = abs(p1-p0)
    ; mm5 = p0
    movq        mm3, [rdi]                      ; q1
    movq        mm4, mm3                        ; q1
    psubusb     mm3, mm2                        ; q1-=p1
    psubusb     mm2, mm4                        ; p1-=q1
    por         mm2, mm3                        ; abs(p1-q1)
    pand        mm2, [GLOBAL(tfe)]              ; set lsb of each byte to zero
    psrlw       mm2, 1                          ; abs(p1-q1)/2

    movq        mm6, mm5                        ; p0
    movq        mm3, mm0                        ; q0
    psubusb     mm5, mm3                        ; p0-=q0
    psubusb     mm3, mm6                        ; q0-=p0
    por         mm5, mm3                        ; abs(p0 - q0)
    paddusb     mm5, mm5                        ; abs(p0-q0)*2
    paddusb     mm5, mm2                        ; abs (p0 - q0) *2 + abs(p1-q1)/2

    mov         rdx, arg(2) ;blimit             ; get blimit
    movq        mm7, [rdx]                      ; blimit

    psubusb     mm5, mm7                        ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit
    por         mm1, mm5
    pxor        mm5, mm5
    pcmpeqb     mm1, mm5                        ; mask mm1

    ; mm1 = mask, mm0=q0, mm7 = blimit, t0 = abs(q0-q1) t1 = abs(p1-p0)
    ; mm6 = p0,

    ; calculate high edge variance
    mov         rdx, arg(4) ;thresh             ; get thresh
    movq        mm7, [rdx]                      ;
    movq        mm4, t0                         ; get abs (q1 - q0)
    psubusb     mm4, mm7
    movq        mm3, t1                         ; get abs (p1 - p0)
    psubusb     mm3, mm7
    ; NOTE(review): the vertical variant combines these with POR; PADDB can
    ; wrap to zero when the two excesses sum to exactly 256 — confirm
    ; against the reference C implementation.
    paddb       mm4, mm3                        ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh

    pcmpeqb     mm4, mm5

    pcmpeqb     mm5, mm5
    pxor        mm4, mm5



    ; mm1 = mask, mm0=q0, mm7 = thresh, t0 = abs(q0-q1) t1 = abs(p1-p0)
    ; mm6 = p0, mm4=hev
    ; start work on filters
    movq        mm2, [rsi+2*rax]                ; p1
    movq        mm7, [rdi]                      ; q1
    pxor        mm2, [GLOBAL(t80)]              ; p1 offset to convert to signed values
    pxor        mm7, [GLOBAL(t80)]              ; q1 offset to convert to signed values
    psubsb      mm2, mm7                        ; p1 - q1

    pxor        mm6, [GLOBAL(t80)]              ; offset to convert to signed values
    pxor        mm0, [GLOBAL(t80)]              ; offset to convert to signed values
    movq        mm3, mm0                        ; q0
    psubsb      mm0, mm6                        ; q0 - p0
    paddsb      mm2, mm0                        ; 1 * (q0 - p0) + (p1 - q1)
    paddsb      mm2, mm0                        ; 2 * (q0 - p0)
    paddsb      mm2, mm0                        ; 3 * (q0 - p0) + (p1 - q1)
    pand        mm1, mm2                        ; mask filter values we don't care about


    ; mm1 = vp8_filter, mm4=hev, mm6=ps0, mm3=qs0
    movq        mm2, mm1                        ; vp8_filter
    pand        mm2, mm4;                       ; Filter2 = vp8_filter & hev

    movq        mm5, mm2                        ;
    paddsb      mm5, [GLOBAL(t3)];

    pxor        mm0, mm0                        ; 0
    pxor        mm7, mm7                        ; 0

    punpcklbw   mm0, mm5                        ; e0f0g0h0
    psraw       mm0, 11                         ; sign extended shift right by 3
    punpckhbw   mm7, mm5                        ; a0b0c0d0
    psraw       mm7, 11                         ; sign extended shift right by 3
    packsswb    mm0, mm7                        ; Filter2 >>=3;

    movq        mm5, mm0                        ; Filter2

    paddsb      mm2, [GLOBAL(t4)]               ; vp8_signed_char_clamp(Filter2 + 4)
    pxor        mm0, mm0                        ; 0
    pxor        mm7, mm7                        ; 0

    punpcklbw   mm0, mm2                        ; e0f0g0h0
    psraw       mm0, 11                         ; sign extended shift right by 3
    punpckhbw   mm7, mm2                        ; a0b0c0d0
    psraw       mm7, 11                         ; sign extended shift right by 3
    packsswb    mm0, mm7                        ; Filter2 >>=3;

    ; mm0= filter2 mm1 = vp8_filter,  mm3 =qs0 mm5=s mm4 =hev mm6=ps0
    psubsb      mm3, mm0                        ; qs0 =qs0 - filter1
    paddsb      mm6, mm5                        ; ps0 =ps0 + Filter2

    ; mm1=vp8_filter, mm3=qs0, mm4 =hev mm6=ps0
    ; vp8_filter &= ~hev;
    ; Filter2 = vp8_filter;
    pandn       mm4, mm1                        ; vp8_filter&=~hev


    ; mm3=qs0, mm4=filter2, mm6=ps0

    ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7);
    ; s = vp8_signed_char_clamp(qs0 - u);
    ; *oq0 = s^0x80;
    ; s = vp8_signed_char_clamp(ps0 + u);
    ; *op0 = s^0x80;
    pxor        mm0, mm0

    pxor        mm1, mm1
    pxor        mm2, mm2
    punpcklbw   mm1, mm4
    punpckhbw   mm2, mm4
    pmulhw      mm1, [GLOBAL(s27)]
    pmulhw      mm2, [GLOBAL(s27)]
    paddw       mm1, [GLOBAL(s63)]
    paddw       mm2, [GLOBAL(s63)]
    psraw       mm1, 7
    psraw       mm2, 7
    packsswb    mm1, mm2

    psubsb      mm3, mm1
    paddsb      mm6, mm1

    pxor        mm3, [GLOBAL(t80)]
    pxor        mm6, [GLOBAL(t80)]
    movq        [rsi+rax], mm6
    movq        [rsi], mm3

    ; roughly 2/7th difference across boundary
    ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7);
    ; s = vp8_signed_char_clamp(qs1 - u);
    ; *oq1 = s^0x80;
    ; s = vp8_signed_char_clamp(ps1 + u);
    ; *op1 = s^0x80;
    pxor        mm1, mm1
    pxor        mm2, mm2
    punpcklbw   mm1, mm4
    punpckhbw   mm2, mm4
    pmulhw      mm1, [GLOBAL(s18)]
    pmulhw      mm2, [GLOBAL(s18)]
    paddw       mm1, [GLOBAL(s63)]
    paddw       mm2, [GLOBAL(s63)]
    psraw       mm1, 7
    psraw       mm2, 7
    packsswb    mm1, mm2

    movq        mm3, [rdi]
    movq        mm6, [rsi+rax*2]                ; p1

    pxor        mm3, [GLOBAL(t80)]
    pxor        mm6, [GLOBAL(t80)]

    paddsb      mm6, mm1
    psubsb      mm3, mm1

    pxor        mm6, [GLOBAL(t80)]
    pxor        mm3, [GLOBAL(t80)]
    movq        [rdi], mm3
    movq        [rsi+rax*2], mm6

    ; roughly 1/7th difference across boundary
    ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7);
    ; s = vp8_signed_char_clamp(qs2 - u);
    ; *oq2 = s^0x80;
    ; s = vp8_signed_char_clamp(ps2 + u);
    ; *op2 = s^0x80;
    pxor        mm1, mm1
    pxor        mm2, mm2
    punpcklbw   mm1, mm4
    punpckhbw   mm2, mm4
    pmulhw      mm1, [GLOBAL(s9)]
    pmulhw      mm2, [GLOBAL(s9)]
    paddw       mm1, [GLOBAL(s63)]
    paddw       mm2, [GLOBAL(s63)]
    psraw       mm1, 7
    psraw       mm2, 7
    packsswb    mm1, mm2


    movq        mm6, [rdi+rax*4]                ; p2 (rax still negative here)
    neg         rax
    movq        mm3, [rdi+rax ]                 ; q2

    pxor        mm6, [GLOBAL(t80)]
    pxor        mm3, [GLOBAL(t80)]

    paddsb      mm6, mm1
    psubsb      mm3, mm1

    pxor        mm6, [GLOBAL(t80)]
    pxor        mm3, [GLOBAL(t80)]
    movq        [rdi+rax ], mm3
    neg         rax
    movq        [rdi+rax*4], mm6

;EARLY_BREAK_OUT:
    neg         rax                             ; restore positive pitch for next iteration
    add         rsi,8
    dec         rcx
    jnz         .next8_mbh

    add rsp, 32
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop rbp
    ret


;void vp8_mbloop_filter_vertical_edge_mmx
;(
;    unsigned char *src_ptr,
;    int src_pixel_step,
;    const char *blimit,
;    const char *limit,
;    const char *thresh,
;    int count
;)
;
; Macroblock-edge filter across a vertical edge: transposes an 8x8 block,
; filters p2..q2 with the 27/18/9 taps, then transposes back.  The full
; 8 transposed rows are staged in srct (p3 at +0 ... q3 at +56).
global sym(vp8_mbloop_filter_vertical_edge_mmx) PRIVATE
sym(vp8_mbloop_filter_vertical_edge_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 96                         ; reserve 96 bytes
    %define t0   [rsp + 0]   ;__declspec(align(16)) char t0[8];
    %define t1   [rsp + 16]  ;__declspec(align(16)) char t1[8];
    %define srct [rsp + 32]  ;__declspec(align(16)) char srct[64];

    mov         rsi, arg(0) ;src_ptr
    movsxd      rax, dword ptr arg(1) ;src_pixel_step     ; destination pitch?

    lea         rsi, [rsi + rax*4 - 4]

    movsxd      rcx, dword ptr arg(5) ;count
.next8_mbv:
    lea         rdi, [rsi + rax]                ; rdi points to row +1 for indirect addressing

    ;transpose
    movq        mm0, [rdi+2*rax]                ; 77 76 75 74 73 72 71 70
    movq        mm6, [rsi+2*rax]                ; 67 66 65 64 63 62 61 60

    movq        mm7, mm6                        ; 77 76 75 74 73 72 71 70
    punpckhbw   mm7, mm0                        ; 77 67 76 66 75 65 74 64

    punpcklbw   mm6, mm0                        ; 73 63 72 62 71 61 70 60
    movq        mm0, [rsi+rax]                  ; 57 56 55 54 53 52 51 50

    movq        mm4, [rsi]                      ; 47 46 45 44 43 42 41 40
    movq        mm5, mm4                        ; 47 46 45 44 43 42 41 40

    punpckhbw   mm5, mm0                        ; 57 47 56 46 55 45 54 44
    punpcklbw   mm4, mm0                        ; 53 43 52 42 51 41 50 40

    movq        mm3, mm5                        ; 57 47 56 46 55 45 54 44
    punpckhwd   mm5, mm7                        ; 77 67 57 47 76 66 56 46

    punpcklwd   mm3, mm7                        ; 75 65 55 45 74 64 54 44
    movq        mm2, mm4                        ; 53 43 52 42 51 41 50 40

    punpckhwd   mm4, mm6                        ; 73 63 53 43 72 62 52 42
    punpcklwd   mm2, mm6                        ; 71 61 51 41 70 60 50 40

    neg         rax

    movq        mm7, [rsi+rax]                  ; 37 36 35 34 33 32 31 30
    movq        mm6, [rsi+rax*2]                ; 27 26 25 24 23 22 21 20

    movq        mm1, mm6                        ; 27 26 25 24 23 22 21 20
    punpckhbw   mm6, mm7                        ; 37 27 36 26 35 25 34 24

    punpcklbw   mm1, mm7                        ; 33 23 32 22 31 21 30 20

    movq        mm7, [rsi+rax*4];               ; 07 06 05 04 03 02 01 00
    punpckhbw   mm7, [rdi+rax*4]                ; 17 07 16 06 15 05 14 04

    movq        mm0, mm7                        ; 17 07 16 06 15 05 14 04
    punpckhwd   mm7, mm6                        ; 37 27 17 07 36 26 16 06

    punpcklwd   mm0, mm6                        ; 35 25 15 05 34 24 14 04
    movq        mm6, mm7                        ; 37 27 17 07 36 26 16 06

    punpckhdq   mm7, mm5                        ; 77 67 57 47 37 27 17 07 = q3
    punpckldq   mm6, mm5                        ; 76 66 56 46 36 26 16 06 = q2

    lea         rdx, srct
    movq        mm5, mm6                        ; 76 66 56 46 36 26 16 06

    movq        [rdx+56], mm7                   ; save q3
    psubusb     mm5, mm7                        ; q2-q3


    movq        [rdx+48], mm6                   ; save q2
    psubusb     mm7, mm6                        ; q3-q2

    por         mm7, mm5;                       ; mm7=abs (q3-q2)
    movq        mm5, mm0                        ; 35 25 15 05 34 24 14 04

    punpckhdq   mm5, mm3                        ; 75 65 55 45 35 25 15 05 = q1
    punpckldq   mm0, mm3                        ; 74 64 54 44 34 24 14 04 = q0

    movq        mm3, mm5                        ; 75 65 55 45 35 25 15 05 = q1
    psubusb     mm3, mm6                        ; q1-q2

    psubusb     mm6, mm5                        ; q2-q1
    por         mm6, mm3                        ; mm6=abs(q2-q1)

    movq        [rdx+40], mm5                   ; save q1
    movq        [rdx+32], mm0                   ; save q0

    movq        mm3, [rsi+rax*4]                ; 07 06 05 04 03 02 01 00
    punpcklbw   mm3, [rdi+rax*4]                ; 13 03 12 02 11 01 10 00

    movq        mm0, mm3                        ; 13 03 12 02 11 01 10 00
    punpcklwd   mm0, mm1                        ; 31 21 11 01 30 20 10 00

    punpckhwd   mm3, mm1                        ; 33 23 13 03 32 22 12 02
    movq        mm1, mm0                        ; 31 21 11 01 30 20 10 00

    punpckldq   mm0, mm2                        ; 70 60 50 40 30 20 10 00 =p3
    punpckhdq   mm1, mm2                        ; 71 61 51 41 31 21 11 01 =p2

    movq        [rdx], mm0                      ; save p3
    movq        [rdx+8], mm1                    ; save p2

    movq        mm2, mm1                        ; 71 61 51 41 31 21 11 01 =p2
    psubusb     mm2, mm0                        ; p2-p3

    psubusb     mm0, mm1                        ; p3-p2
    por         mm0, mm2                        ; mm0=abs(p3-p2)

    movq        mm2, mm3                        ; 33 23 13 03 32 22 12 02
    punpckldq   mm2, mm4                        ; 72 62 52 42 32 22 12 02 = p1

    punpckhdq   mm3, mm4                        ; 73 63 53 43 33 23 13 03 = p0
    movq        [rdx+24], mm3                   ; save p0

    movq        [rdx+16], mm2                   ; save p1
    movq        mm5, mm2                        ; mm5 = p1

    psubusb     mm2, mm1                        ; p1-p2
    psubusb     mm1, mm5                        ; p2-p1

    por         mm1, mm2                        ; mm1=abs(p2-p1)
    mov         rdx, arg(3) ;limit

    movq        mm4, [rdx]                      ; mm4 = limit
    psubusb     mm7, mm4                        ; abs(q3-q2) > limit

    psubusb     mm0, mm4                        ; abs(p3-p2) > limit
    psubusb     mm1, mm4                        ; abs(p2-p1) > limit

    psubusb     mm6, mm4                        ; abs(q2-q1) > limit
    por         mm7, mm6                        ; or

    por         mm0, mm1                        ;
    por         mm0, mm7                        ; abs(q3-q2) > limit || abs(p3-p2) > limit ||abs(p2-p1) > limit || abs(q2-q1) > limit

    movq        mm1, mm5                        ; p1

    movq        mm7, mm3                        ; mm3=mm7=p0
    psubusb     mm7, mm5                        ; p0 - p1

    psubusb     mm5, mm3                        ; p1 - p0
    por         mm5, mm7                        ; abs(p1-p0)

    movq        t0, mm5                         ; save abs(p1-p0)
    lea         rdx, srct

    psubusb     mm5, mm4                        ; mm5 = abs(p1-p0) > limit
    por         mm0, mm5                        ; mm0=mask

    movq        mm5, [rdx+32]                   ; mm5=q0
    movq        mm7, [rdx+40]                   ; mm7=q1

    movq        mm6, mm5                        ; mm6=q0
    movq        mm2, mm7                        ; q1
    psubusb     mm5, mm7                        ; q0-q1

    psubusb     mm7, mm6                        ; q1-q0
    por         mm7, mm5                        ; abs(q1-q0)

    movq        t1, mm7                         ; save abs(q1-q0)
    psubusb     mm7, mm4                        ; mm7=abs(q1-q0)> limit

    por         mm0, mm7                        ; mask

    movq        mm5, mm2                        ; q1
    psubusb     mm5, mm1                        ; q1-=p1
    psubusb     mm1, mm2                        ; p1-=q1
    por         mm5, mm1                        ; abs(p1-q1)
    pand        mm5, [GLOBAL(tfe)]              ; set lsb of each byte to zero
    psrlw       mm5, 1                          ; abs(p1-q1)/2

    mov         rdx, arg(2) ;blimit             ;

    movq        mm4, [rdx]                      ;blimit
    movq        mm1, mm3                        ; mm1=mm3=p0

    movq        mm7, mm6                        ; mm7=mm6=q0
    psubusb     mm1, mm7                        ; p0-q0

    psubusb     mm7, mm3                        ; q0-p0
    por         mm1, mm7                        ; abs(q0-p0)
    paddusb     mm1, mm1                        ; abs(q0-p0)*2
    paddusb     mm1, mm5                        ; abs (p0 - q0) *2 + abs(p1-q1)/2

    psubusb     mm1, mm4                        ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit
    por         mm1, mm0;                       ; mask

    pxor        mm0, mm0
    pcmpeqb     mm1, mm0

    ; calculate high edge variance
    mov         rdx, arg(4) ;thresh             ; get thresh
    movq        mm7, [rdx]
    ;
    movq        mm4, t0                         ; get abs (q1 - q0)
    psubusb     mm4, mm7                        ; abs(q1 - q0) > thresh

    movq        mm3, t1                         ; get abs (p1 - p0)
    psubusb     mm3, mm7                        ; abs(p1 - p0)> thresh

    por         mm4, mm3                        ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
    pcmpeqb     mm4, mm0

    pcmpeqb     mm0, mm0
    pxor        mm4, mm0




    ; start work on filters
    lea         rdx, srct

    ; start work on filters
    movq        mm2, [rdx+16]                   ; p1
    movq        mm7, [rdx+40]                   ; q1
    pxor        mm2, [GLOBAL(t80)]              ; p1 offset to convert to signed values
    pxor        mm7, [GLOBAL(t80)]              ; q1 offset to convert to signed values
    psubsb      mm2, mm7                        ; p1 - q1

    movq        mm6, [rdx+24]                   ; p0
    movq        mm0, [rdx+32]                   ; q0
    pxor        mm6, [GLOBAL(t80)]              ; offset to convert to signed values
    pxor        mm0, [GLOBAL(t80)]              ; offset to convert to signed values

    movq        mm3, mm0                        ; q0
    psubsb      mm0, mm6                        ; q0 - p0
    paddsb      mm2, mm0                        ; 1 * (q0 - p0) + (p1 - q1)
    paddsb      mm2, mm0                        ; 2 * (q0 - p0)
    paddsb      mm2, mm0                        ; 3 * (q0 - p0) + (p1 - q1)
    pand        mm1, mm2                        ; mask filter values we don't care about

    ; mm1 = vp8_filter, mm4=hev, mm6=ps0, mm3=qs0
    movq        mm2, mm1                        ; vp8_filter
    pand        mm2, mm4;                       ; Filter2 = vp8_filter & hev

    movq        mm5, mm2                        ;
    paddsb      mm5, [GLOBAL(t3)];

    pxor        mm0, mm0                        ; 0
    pxor        mm7, mm7                        ; 0

    punpcklbw   mm0, mm5                        ; e0f0g0h0
    psraw       mm0, 11                         ; sign extended shift right by 3
    punpckhbw   mm7, mm5                        ; a0b0c0d0
    psraw       mm7, 11                         ; sign extended shift right by 3
    packsswb    mm0, mm7                        ; Filter2 >>=3;

    movq        mm5, mm0                        ; Filter2

    paddsb      mm2, [GLOBAL(t4)]               ; vp8_signed_char_clamp(Filter2 + 4)
    pxor        mm0, mm0                        ; 0
    pxor        mm7, mm7                        ; 0

    punpcklbw   mm0, mm2                        ; e0f0g0h0
    psraw       mm0, 11                         ; sign extended shift right by 3
    punpckhbw   mm7, mm2                        ; a0b0c0d0
    psraw       mm7, 11                         ; sign extended shift right by 3
    packsswb    mm0, mm7                        ; Filter2 >>=3;

    ; mm0= filter2 mm1 = vp8_filter,  mm3 =qs0 mm5=s mm4 =hev mm6=ps0
    psubsb      mm3, mm0                        ; qs0 =qs0 - filter1
    paddsb      mm6, mm5                        ; ps0 =ps0 + Filter2

    ; mm1=vp8_filter, mm3=qs0, mm4 =hev mm6=ps0
    ; vp8_filter &= ~hev;
    ; Filter2 = vp8_filter;
    pandn       mm4, mm1                        ; vp8_filter&=~hev


    ; mm3=qs0, mm4=filter2, mm6=ps0

    ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7);
    ; s = vp8_signed_char_clamp(qs0 - u);
    ; *oq0 = s^0x80;
    ; s = vp8_signed_char_clamp(ps0 + u);
    ; *op0 = s^0x80;
    pxor        mm0, mm0

    pxor        mm1, mm1
    pxor        mm2, mm2
    punpcklbw   mm1, mm4
    punpckhbw   mm2, mm4
    pmulhw      mm1, [GLOBAL(s27)]
    pmulhw      mm2, [GLOBAL(s27)]
    paddw       mm1, [GLOBAL(s63)]
    paddw       mm2, [GLOBAL(s63)]
    psraw       mm1, 7
    psraw       mm2, 7
    packsswb    mm1, mm2

    psubsb      mm3, mm1
    paddsb      mm6, mm1

    pxor        mm3, [GLOBAL(t80)]
    pxor        mm6, [GLOBAL(t80)]
    movq        [rdx+24], mm6                   ; store filtered p0 back to srct
    movq        [rdx+32], mm3                   ; store filtered q0 back to srct

    ; roughly 2/7th difference across boundary
    ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7);
    ; s = vp8_signed_char_clamp(qs1 - u);
    ; *oq1 = s^0x80;
    ; s = vp8_signed_char_clamp(ps1 + u);
    ; *op1 = s^0x80;
    pxor        mm1, mm1
    pxor        mm2, mm2
    punpcklbw   mm1, mm4
    punpckhbw   mm2, mm4
    pmulhw      mm1, [GLOBAL(s18)]
    pmulhw      mm2, [GLOBAL(s18)]
    paddw       mm1, [GLOBAL(s63)]
    paddw       mm2, [GLOBAL(s63)]
    psraw       mm1, 7
    psraw       mm2, 7
    packsswb    mm1, mm2

    movq        mm3, [rdx + 40]                 ; q1
    movq        mm6, [rdx + 16]                 ; p1
    pxor        mm3, [GLOBAL(t80)]
    pxor        mm6, [GLOBAL(t80)]

    paddsb      mm6, mm1
    psubsb      mm3, mm1

    pxor        mm6, [GLOBAL(t80)]
    pxor        mm3, [GLOBAL(t80)]
    movq        [rdx + 40], mm3
    movq        [rdx + 16], mm6

    ; roughly 1/7th difference across boundary
    ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7);
    ; s = vp8_signed_char_clamp(qs2 - u);
    ; *oq2 = s^0x80;
    ; s = vp8_signed_char_clamp(ps2 + u);
    ; *op2 = s^0x80;
    pxor        mm1, mm1
    pxor        mm2, mm2
    punpcklbw   mm1, mm4
    punpckhbw   mm2, mm4
    pmulhw      mm1, [GLOBAL(s9)]
    pmulhw      mm2, [GLOBAL(s9)]
    paddw       mm1, [GLOBAL(s63)]
    paddw       mm2, [GLOBAL(s63)]
    psraw       mm1, 7
    psraw       mm2, 7
    packsswb    mm1, mm2

    movq        mm6, [rdx+ 8]                   ; p2
    movq        mm3, [rdx+48]                   ; q2

    pxor        mm6, [GLOBAL(t80)]
    pxor        mm3, [GLOBAL(t80)]

    paddsb      mm6, mm1
    psubsb      mm3, mm1

    pxor        mm6, [GLOBAL(t80)]              ; mm6 = 71 61 51 41 31 21 11 01
    pxor        mm3, [GLOBAL(t80)]              ; mm3 = 76 66 56 46 36 26 16 06

    ; transpose and write back
    movq        mm0, [rdx]                      ; mm0 = 70 60 50 40 30 20 10 00
    movq        mm1, mm0                        ; mm0 = 70 60 50 40 30 20 10 00

    punpcklbw   mm0, mm6                        ; mm0 = 31 30 21 20 11 10 01 00
    punpckhbw   mm1, mm6                        ; mm3 = 71 70 61 60 51 50 41 40

    ; NOTE(review): the remainder of this function (rest of the write-back
    ; transpose, loop close, and epilog) lies beyond the visible chunk.
    movq
mm2, [rdx+16] ; mm2 = 72 62 52 42 32 22 12 02 1300 movq mm6, mm2 ; mm3 = 72 62 52 42 32 22 12 02 1301 1302 punpcklbw mm2, [rdx+24] ; mm2 = 33 32 23 22 13 12 03 02 1303 punpckhbw mm6, [rdx+24] ; mm3 = 73 72 63 62 53 52 43 42 1304 1305 movq mm5, mm0 ; mm5 = 31 30 21 20 11 10 01 00 1306 punpcklwd mm0, mm2 ; mm0 = 13 12 11 10 03 02 01 00 1307 1308 punpckhwd mm5, mm2 ; mm5 = 33 32 31 30 23 22 21 20 1309 movq mm4, mm1 ; mm4 = 71 70 61 60 51 50 41 40 1310 1311 punpcklwd mm1, mm6 ; mm1 = 53 52 51 50 43 42 41 40 1312 punpckhwd mm4, mm6 ; mm4 = 73 72 71 70 63 62 61 60 1313 1314 movq mm2, [rdx+32] ; mm2 = 74 64 54 44 34 24 14 04 1315 punpcklbw mm2, [rdx+40] ; mm2 = 35 34 25 24 15 14 05 04 1316 1317 movq mm6, mm3 ; mm6 = 76 66 56 46 36 26 15 06 1318 punpcklbw mm6, [rdx+56] ; mm6 = 37 36 27 26 17 16 07 06 1319 1320 movq mm7, mm2 ; mm7 = 35 34 25 24 15 14 05 04 1321 punpcklwd mm2, mm6 ; mm2 = 17 16 15 14 07 06 05 04 1322 1323 punpckhwd mm7, mm6 ; mm7 = 37 36 35 34 27 26 25 24 1324 movq mm6, mm0 ; mm6 = 13 12 11 10 03 02 01 00 1325 1326 punpckldq mm0, mm2 ; mm0 = 07 06 05 04 03 02 01 00 1327 punpckhdq mm6, mm2 ; mm6 = 17 16 15 14 13 12 11 10 1328 1329 movq [rsi+rax*4], mm0 ; write out 1330 movq [rdi+rax*4], mm6 ; write out 1331 1332 movq mm0, mm5 ; mm0 = 33 32 31 30 23 22 21 20 1333 punpckldq mm0, mm7 ; mm0 = 27 26 25 24 23 22 20 20 1334 1335 punpckhdq mm5, mm7 ; mm5 = 37 36 35 34 33 32 31 30 1336 movq [rsi+rax*2], mm0 ; write out 1337 1338 movq [rdi+rax*2], mm5 ; write out 1339 movq mm2, [rdx+32] ; mm2 = 74 64 54 44 34 24 14 04 1340 1341 punpckhbw mm2, [rdx+40] ; mm2 = 75 74 65 64 54 54 45 44 1342 punpckhbw mm3, [rdx+56] ; mm3 = 77 76 67 66 57 56 47 46 1343 1344 movq mm5, mm2 ; mm5 = 75 74 65 64 54 54 45 44 1345 punpcklwd mm2, mm3 ; mm2 = 57 56 55 54 47 46 45 44 1346 1347 punpckhwd mm5, mm3 ; mm5 = 77 76 75 74 67 66 65 64 1348 movq mm0, mm1 ; mm0= 53 52 51 50 43 42 41 40 1349 1350 movq mm3, mm4 ; mm4 = 73 72 71 70 63 62 61 60 1351 punpckldq mm0, mm2 ; mm0 = 47 46 45 44 43 42 41 
40 1352 1353 punpckhdq mm1, mm2 ; mm1 = 57 56 55 54 53 52 51 50 1354 movq [rsi], mm0 ; write out 1355 1356 movq [rdi], mm1 ; write out 1357 neg rax 1358 1359 punpckldq mm3, mm5 ; mm3 = 67 66 65 64 63 62 61 60 1360 punpckhdq mm4, mm5 ; mm4 = 77 76 75 74 73 72 71 60 1361 1362 movq [rsi+rax*2], mm3 1363 movq [rdi+rax*2], mm4 1364 1365 lea rsi, [rsi+rax*8] 1366 dec rcx 1367 1368 jnz .next8_mbv 1369 1370 add rsp, 96 1371 pop rsp 1372 ; begin epilog 1373 pop rdi 1374 pop rsi 1375 RESTORE_GOT 1376 UNSHADOW_ARGS 1377 pop rbp 1378 ret 1379 1380 1381;void vp8_loop_filter_simple_horizontal_edge_mmx 1382;( 1383; unsigned char *src_ptr, 1384; int src_pixel_step, 1385; const char *blimit 1386;) 1387global sym(vp8_loop_filter_simple_horizontal_edge_mmx) PRIVATE 1388sym(vp8_loop_filter_simple_horizontal_edge_mmx): 1389 push rbp 1390 mov rbp, rsp 1391 SHADOW_ARGS_TO_STACK 3 1392 GET_GOT rbx 1393 push rsi 1394 push rdi 1395 ; end prolog 1396 1397 mov rsi, arg(0) ;src_ptr 1398 movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? 
1399 1400 mov rcx, 2 ; count 1401.nexts8_h: 1402 mov rdx, arg(2) ;blimit ; get blimit 1403 movq mm3, [rdx] ; 1404 1405 mov rdi, rsi ; rdi points to row +1 for indirect addressing 1406 add rdi, rax 1407 neg rax 1408 1409 ; calculate mask 1410 movq mm1, [rsi+2*rax] ; p1 1411 movq mm0, [rdi] ; q1 1412 movq mm2, mm1 1413 movq mm7, mm0 1414 movq mm4, mm0 1415 psubusb mm0, mm1 ; q1-=p1 1416 psubusb mm1, mm4 ; p1-=q1 1417 por mm1, mm0 ; abs(p1-q1) 1418 pand mm1, [GLOBAL(tfe)] ; set lsb of each byte to zero 1419 psrlw mm1, 1 ; abs(p1-q1)/2 1420 1421 movq mm5, [rsi+rax] ; p0 1422 movq mm4, [rsi] ; q0 1423 movq mm0, mm4 ; q0 1424 movq mm6, mm5 ; p0 1425 psubusb mm5, mm4 ; p0-=q0 1426 psubusb mm4, mm6 ; q0-=p0 1427 por mm5, mm4 ; abs(p0 - q0) 1428 paddusb mm5, mm5 ; abs(p0-q0)*2 1429 paddusb mm5, mm1 ; abs (p0 - q0) *2 + abs(p1-q1)/2 1430 1431 psubusb mm5, mm3 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit 1432 pxor mm3, mm3 1433 pcmpeqb mm5, mm3 1434 1435 ; start work on filters 1436 pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values 1437 pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values 1438 psubsb mm2, mm7 ; p1 - q1 1439 1440 pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values 1441 pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values 1442 movq mm3, mm0 ; q0 1443 psubsb mm0, mm6 ; q0 - p0 1444 paddsb mm2, mm0 ; p1 - q1 + 1 * (q0 - p0) 1445 paddsb mm2, mm0 ; p1 - q1 + 2 * (q0 - p0) 1446 paddsb mm2, mm0 ; p1 - q1 + 3 * (q0 - p0) 1447 pand mm5, mm2 ; mask filter values we don't care about 1448 1449 ; do + 4 side 1450 paddsb mm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4 1451 1452 movq mm0, mm5 ; get a copy of filters 1453 psllw mm0, 8 ; shift left 8 1454 psraw mm0, 3 ; arithmetic shift right 11 1455 psrlw mm0, 8 1456 movq mm1, mm5 ; get a copy of filters 1457 psraw mm1, 11 ; arithmetic shift right 11 1458 psllw mm1, 8 ; shift left 8 to put it back 1459 1460 por mm0, mm1 ; put the two together to get result 1461 1462 psubsb mm3, mm0 ; 
q0-= q0 add 1463 pxor mm3, [GLOBAL(t80)] ; unoffset 1464 movq [rsi], mm3 ; write back 1465 1466 1467 ; now do +3 side 1468 psubsb mm5, [GLOBAL(t1s)] ; +3 instead of +4 1469 1470 movq mm0, mm5 ; get a copy of filters 1471 psllw mm0, 8 ; shift left 8 1472 psraw mm0, 3 ; arithmetic shift right 11 1473 psrlw mm0, 8 1474 psraw mm5, 11 ; arithmetic shift right 11 1475 psllw mm5, 8 ; shift left 8 to put it back 1476 por mm0, mm5 ; put the two together to get result 1477 1478 1479 paddsb mm6, mm0 ; p0+= p0 add 1480 pxor mm6, [GLOBAL(t80)] ; unoffset 1481 movq [rsi+rax], mm6 ; write back 1482 1483 add rsi,8 1484 neg rax 1485 dec rcx 1486 jnz .nexts8_h 1487 1488 ; begin epilog 1489 pop rdi 1490 pop rsi 1491 RESTORE_GOT 1492 UNSHADOW_ARGS 1493 pop rbp 1494 ret 1495 1496 1497;void vp8_loop_filter_simple_vertical_edge_mmx 1498;( 1499; unsigned char *src_ptr, 1500; int src_pixel_step, 1501; const char *blimit 1502;) 1503global sym(vp8_loop_filter_simple_vertical_edge_mmx) PRIVATE 1504sym(vp8_loop_filter_simple_vertical_edge_mmx): 1505 push rbp 1506 mov rbp, rsp 1507 SHADOW_ARGS_TO_STACK 3 1508 GET_GOT rbx 1509 push rsi 1510 push rdi 1511 ; end prolog 1512 1513 ALIGN_STACK 16, rax 1514 sub rsp, 32 ; reserve 32 bytes 1515 %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8]; 1516 %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8]; 1517 1518 mov rsi, arg(0) ;src_ptr 1519 movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? 
1520 1521 lea rsi, [rsi + rax*4- 2]; ; 1522 mov rcx, 2 ; count 1523.nexts8_v: 1524 1525 lea rdi, [rsi + rax]; 1526 movd mm0, [rdi + rax * 2] ; xx xx xx xx 73 72 71 70 1527 1528 movd mm6, [rsi + rax * 2] ; xx xx xx xx 63 62 61 60 1529 punpcklbw mm6, mm0 ; 73 63 72 62 71 61 70 60 1530 1531 movd mm0, [rsi + rax] ; xx xx xx xx 53 52 51 50 1532 movd mm4, [rsi] ; xx xx xx xx 43 42 41 40 1533 1534 punpcklbw mm4, mm0 ; 53 43 52 42 51 41 50 40 1535 movq mm5, mm4 ; 53 43 52 42 51 41 50 40 1536 1537 punpcklwd mm4, mm6 ; 71 61 51 41 70 60 50 40 1538 punpckhwd mm5, mm6 ; 73 63 53 43 72 62 52 42 1539 1540 neg rax 1541 1542 movd mm7, [rsi + rax] ; xx xx xx xx 33 32 31 30 1543 movd mm6, [rsi + rax * 2] ; xx xx xx xx 23 22 21 20 1544 1545 punpcklbw mm6, mm7 ; 33 23 32 22 31 21 30 20 1546 movd mm1, [rdi + rax * 4] ; xx xx xx xx 13 12 11 10 1547 1548 movd mm0, [rsi + rax * 4] ; xx xx xx xx 03 02 01 00 1549 punpcklbw mm0, mm1 ; 13 03 12 02 11 01 10 00 1550 1551 movq mm2, mm0 ; 13 03 12 02 11 01 10 00 1552 punpcklwd mm0, mm6 ; 31 21 11 01 30 20 10 00 1553 1554 punpckhwd mm2, mm6 ; 33 23 13 03 32 22 12 02 1555 movq mm1, mm0 ; 13 03 12 02 11 01 10 00 1556 1557 punpckldq mm0, mm4 ; 70 60 50 40 30 20 10 00 = p1 1558 movq mm3, mm2 ; 33 23 13 03 32 22 12 02 1559 1560 punpckhdq mm1, mm4 ; 71 61 51 41 31 21 11 01 = p0 1561 punpckldq mm2, mm5 ; 72 62 52 42 32 22 12 02 = q0 1562 1563 punpckhdq mm3, mm5 ; 73 63 53 43 33 23 13 03 = q1 1564 1565 1566 ; calculate mask 1567 movq mm6, mm0 ; p1 1568 movq mm7, mm3 ; q1 1569 psubusb mm7, mm6 ; q1-=p1 1570 psubusb mm6, mm3 ; p1-=q1 1571 por mm6, mm7 ; abs(p1-q1) 1572 pand mm6, [GLOBAL(tfe)] ; set lsb of each byte to zero 1573 psrlw mm6, 1 ; abs(p1-q1)/2 1574 1575 movq mm5, mm1 ; p0 1576 movq mm4, mm2 ; q0 1577 1578 psubusb mm5, mm2 ; p0-=q0 1579 psubusb mm4, mm1 ; q0-=p0 1580 1581 por mm5, mm4 ; abs(p0 - q0) 1582 paddusb mm5, mm5 ; abs(p0-q0)*2 1583 paddusb mm5, mm6 ; abs (p0 - q0) *2 + abs(p1-q1)/2 1584 1585 mov rdx, arg(2) ;blimit ; get blimit 1586 movq 
mm7, [rdx] 1587 1588 psubusb mm5, mm7 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit 1589 pxor mm7, mm7 1590 pcmpeqb mm5, mm7 ; mm5 = mask 1591 1592 ; start work on filters 1593 movq t0, mm0 1594 movq t1, mm3 1595 1596 pxor mm0, [GLOBAL(t80)] ; p1 offset to convert to signed values 1597 pxor mm3, [GLOBAL(t80)] ; q1 offset to convert to signed values 1598 1599 psubsb mm0, mm3 ; p1 - q1 1600 movq mm6, mm1 ; p0 1601 1602 movq mm7, mm2 ; q0 1603 pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values 1604 1605 pxor mm7, [GLOBAL(t80)] ; offset to convert to signed values 1606 movq mm3, mm7 ; offseted ; q0 1607 1608 psubsb mm7, mm6 ; q0 - p0 1609 paddsb mm0, mm7 ; p1 - q1 + 1 * (q0 - p0) 1610 1611 paddsb mm0, mm7 ; p1 - q1 + 2 * (q0 - p0) 1612 paddsb mm0, mm7 ; p1 - q1 + 3 * (q0 - p0) 1613 1614 pand mm5, mm0 ; mask filter values we don't care about 1615 1616 paddsb mm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4 1617 1618 movq mm0, mm5 ; get a copy of filters 1619 psllw mm0, 8 ; shift left 8 1620 psraw mm0, 3 ; arithmetic shift right 11 1621 psrlw mm0, 8 1622 1623 movq mm7, mm5 ; get a copy of filters 1624 psraw mm7, 11 ; arithmetic shift right 11 1625 psllw mm7, 8 ; shift left 8 to put it back 1626 1627 por mm0, mm7 ; put the two together to get result 1628 1629 psubsb mm3, mm0 ; q0-= q0sz add 1630 pxor mm3, [GLOBAL(t80)] ; unoffset 1631 1632 ; now do +3 side 1633 psubsb mm5, [GLOBAL(t1s)] ; +3 instead of +4 1634 1635 movq mm0, mm5 ; get a copy of filters 1636 psllw mm0, 8 ; shift left 8 1637 psraw mm0, 3 ; arithmetic shift right 11 1638 psrlw mm0, 8 1639 1640 psraw mm5, 11 ; arithmetic shift right 11 1641 psllw mm5, 8 ; shift left 8 to put it back 1642 por mm0, mm5 ; put the two together to get result 1643 1644 paddsb mm6, mm0 ; p0+= p0 add 1645 pxor mm6, [GLOBAL(t80)] ; unoffset 1646 1647 1648 movq mm0, t0 1649 movq mm4, t1 1650 1651 ; mm0 = 70 60 50 40 30 20 10 00 1652 ; mm6 = 71 61 51 41 31 21 11 01 1653 ; mm3 = 72 62 52 42 32 22 12 02 1654 ; mm4 = 73 63 53 43 33 
23 13 03 1655 ; transpose back to write out 1656 1657 movq mm1, mm0 ; 1658 punpcklbw mm0, mm6 ; 31 30 21 20 11 10 01 00 1659 1660 punpckhbw mm1, mm6 ; 71 70 61 60 51 50 41 40 1661 movq mm2, mm3 ; 1662 1663 punpcklbw mm2, mm4 ; 33 32 23 22 13 12 03 02 1664 movq mm5, mm1 ; 71 70 61 60 51 50 41 40 1665 1666 punpckhbw mm3, mm4 ; 73 72 63 62 53 52 43 42 1667 movq mm6, mm0 ; 31 30 21 20 11 10 01 00 1668 1669 punpcklwd mm0, mm2 ; 13 12 11 10 03 02 01 00 1670 punpckhwd mm6, mm2 ; 33 32 31 30 23 22 21 20 1671 1672 movd [rsi+rax*4], mm0 ; write 03 02 01 00 1673 punpcklwd mm1, mm3 ; 53 52 51 50 43 42 41 40 1674 1675 psrlq mm0, 32 ; xx xx xx xx 13 12 11 10 1676 punpckhwd mm5, mm3 ; 73 72 71 70 63 62 61 60 1677 1678 movd [rdi+rax*4], mm0 ; write 13 12 11 10 1679 movd [rsi+rax*2], mm6 ; write 23 22 21 20 1680 1681 psrlq mm6, 32 ; 33 32 31 30 1682 movd [rsi], mm1 ; write 43 42 41 40 1683 1684 movd [rsi + rax], mm6 ; write 33 32 31 30 1685 neg rax 1686 1687 movd [rsi + rax*2], mm5 ; write 63 62 61 60 1688 psrlq mm1, 32 ; 53 52 51 50 1689 1690 movd [rdi], mm1 ; write out 53 52 51 50 1691 psrlq mm5, 32 ; 73 72 71 70 1692 1693 movd [rdi + rax*2], mm5 ; write 73 72 71 70 1694 1695 lea rsi, [rsi+rax*8] ; next 8 1696 1697 dec rcx 1698 jnz .nexts8_v 1699 1700 add rsp, 32 1701 pop rsp 1702 ; begin epilog 1703 pop rdi 1704 pop rsi 1705 RESTORE_GOT 1706 UNSHADOW_ARGS 1707 pop rbp 1708 ret 1709 1710 1711 1712;void fast_loop_filter_vertical_edges_mmx(unsigned char *y_ptr, 1713; int y_stride, 1714; loop_filter_info *lfi) 1715;{ 1716; 1717; 1718; vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+4, y_stride, lfi->flim,lfi->lim,lfi->thr,2); 1719; vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+8, y_stride, lfi->flim,lfi->lim,lfi->thr,2); 1720; vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+12, y_stride, lfi->flim,lfi->lim,lfi->thr,2); 1721;} 1722 1723SECTION_RODATA 1724align 16 1725tfe: 1726 times 8 db 0xfe 1727align 16 1728t80: 1729 times 8 db 0x80 1730align 16 1731t1s: 1732 times 8 db 0x01 
1733align 16 1734t3: 1735 times 8 db 0x03 1736align 16 1737t4: 1738 times 8 db 0x04 1739align 16 1740ones: 1741 times 4 dw 0x0001 1742align 16 1743s27: 1744 times 4 dw 0x1b00 1745align 16 1746s18: 1747 times 4 dw 0x1200 1748align 16 1749s9: 1750 times 4 dw 0x0900 1751align 16 1752s63: 1753 times 4 dw 0x003f 1754