;*!
;* \copy
;*     Copyright (c)  2009-2013, Cisco Systems
;*     All rights reserved.
;*
;*     Redistribution and use in source and binary forms, with or without
;*     modification, are permitted provided that the following conditions
;*     are met:
;*
;*        * Redistributions of source code must retain the above copyright
;*          notice, this list of conditions and the following disclaimer.
;*
;*        * Redistributions in binary form must reproduce the above copyright
;*          notice, this list of conditions and the following disclaimer in
;*          the documentation and/or other materials provided with the
;*          distribution.
;*
;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;*     POSSIBILITY OF SUCH DAMAGE.
;*
;*
;*  satd_sad.asm
;*
;*  Abstract
;*      WelsSampleSatd4x4_sse2
;*      WelsSampleSatd8x8_sse2
;*      WelsSampleSatd16x8_sse2
;*      WelsSampleSatd8x16_sse2
;*      WelsSampleSatd16x16_sse2
;*
;*      WelsSampleSad16x8_sse2
;*      WelsSampleSad16x16_sse2
;*
;*  History
;*      8/5/2009 Created
;*     24/9/2009 modified
;*
;*
;*************************************************************************/

%include "asm_inc.asm"

;***********************************************************************
; Data
;***********************************************************************
; The constant tables live in .text for X86_32_PICASM builds and in
; .rodata otherwise.
%ifdef X86_32_PICASM
SECTION .text align=16
%else
SECTION .rodata align=16
%endif

; Byte multipliers for pmaddubsw: eight +1 bytes followed by four (+1,-1)
; pairs.
align 16
HSumSubDB1:   db 1,1,1,1,1,1,1,1,1,-1,1,-1,1,-1,1,-1
; Word multipliers for pmaddwd: four (+1,-1) pairs.
align 16
HSumSubDW1:   dw 1,-1,1,-1,1,-1,1,-1
; Eight words of +1.
align 16
PDW1:  dw 1,1,1,1,1,1,1,1
; The value 2 in the low word of each 64-bit half (rounding term for the
; chroma "(sum+2)>>2" computation).
align 16
PDQ2:  dw 2,0,0,0,2,0,0,0
; Byte multipliers for pmaddubsw: (1,1,1,1,1,-1,1,-1) repeated twice.
align 16
HSwapSumSubDB1:   times 2 db 1, 1, 1, 1, 1, -1, 1, -1

;***********************************************************************
; Code
;***********************************************************************
SECTION .text

;***********************************************************************
;
;Pixel_satd_wxh_sse2 BEGIN
;
;***********************************************************************

; MMX_DW_1_2REG dst, tmp
;   dst <- +1 in every word lane (0 - 0xFFFF per lane).
;   tmp <- all bits set.
%macro MMX_DW_1_2REG 2
    pxor        %1, %1
    pcmpeqw     %2, %2
    psubw       %1, %2
%endmacro

; SSE2_SumWHorizon1 acc, tmp
;   Folds the eight word lanes of acc together with unsigned saturating
;   adds; the total ends up in the low word of acc. tmp is clobbered and the
;   remaining lanes of acc hold partial sums.
%macro SSE2_SumWHorizon1 2
    movdqa      %2, %1
    psrldq      %2, 8
    paddusw     %1, %2
    movdqa      %2, %1
    psrldq      %2, 4
    paddusw     %1, %2
    movdqa      %2, %1
    psrldq      %2, 2
    paddusw     %1, %2
%endmacro

; SSE2_HDMTwo4x4 a, b, c, d, tmp
;   Four SSE2_SumSub butterflies (SSE2_SumSub is defined outside this file
;   section) applied to the pairs (a,b), (c,d), (b,d), (a,c).
%macro SSE2_HDMTwo4x4 5 ;in: xmm1,xmm2,xmm3,xmm4  pOut: xmm4,xmm2,xmm1,xmm3
    SSE2_SumSub %1, %2, %5
    SSE2_SumSub %3, %4, %5
    SSE2_SumSub %2, %4, %5
    SSE2_SumSub %1, %3, %5
%endmacro

; SSE2_SumAbs4 a, b, tmp1, c, d, tmp2, acc
;   Applies WELS_AbsW (defined outside this file section) to a, b, c and d
;   using tmp1/tmp2 as scratch, then adds all four into acc with unsigned
;   saturating word adds. a and c are clobbered with partial sums.
%macro SSE2_SumAbs4 7
    WELS_AbsW   %1, %3
    WELS_AbsW   %2, %3
    WELS_AbsW   %4, %6
    WELS_AbsW   %5, %6
    paddusw     %1, %2
    paddusw     %4, %5
    paddusw     %7, %1
    paddusw     %7, %4
%endmacro

; SSE2_SumWHorizon acc, tmp, zero
;   Horizontal sum of the eight word lanes of acc; the 32-bit total is left
;   in the low dword of acc. The third operand is interleaved with acc by
;   punpcklwd, so callers pass a zeroed register to widen words to dwords.
%macro SSE2_SumWHorizon 3
    movhlps     %2, %1          ; x2 = xx xx xx xx d7 d6 d5 d4
    paddw       %1, %2          ; x1 = xx xx xx xx d37 d26 d15 d04
    punpcklwd   %1, %3          ; x1 =  d37  d26 d15 d04
    movhlps     %2, %1          ; x2 = xxxx xxxx d37 d26
    paddd       %1, %2          ; x1 = xxxx xxxx d1357 d0246
    pshuflw     %2, %1, 0x4e    ; x2 = xxxx xxxx d0246 d1357
    paddd       %1, %2          ; x1 = xxxx xxxx xxxx  d01234567
%endmacro

; SSE2_GetSatd8x8
;   Processes eight rows from [r0] (stride r1) and [r2] (stride r3) as two
;   groups of four rows and accumulates the absolute transformed
;   differences into xmm6 (saturating word adds).
;   On exit r0/r2 have advanced by six rows. Clobbers xmm0-xmm5.
;   xmm7 is handed to SSE2_LoadDiff8P as its third operand; every caller in
;   this file zeroes it first.
%macro SSE2_GetSatd8x8 0
    SSE2_LoadDiff8P    xmm0,xmm4,xmm7,[r0],[r2]
    SSE2_LoadDiff8P    xmm1,xmm5,xmm7,[r0+r1],[r2+r3]
    lea                r0, [r0+2*r1]
    lea                r2, [r2+2*r3]
    SSE2_LoadDiff8P    xmm2,xmm4,xmm7,[r0],[r2]
    SSE2_LoadDiff8P    xmm3,xmm5,xmm7,[r0+r1],[r2+r3]

    SSE2_HDMTwo4x4     xmm0,xmm1,xmm2,xmm3,xmm4
    SSE2_TransTwo4x4W  xmm3,xmm1,xmm0,xmm2,xmm4
    SSE2_HDMTwo4x4     xmm3,xmm1,xmm2,xmm4,xmm5
    SSE2_SumAbs4       xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6

    lea                r0, [r0+2*r1]
    lea                r2, [r2+2*r3]
    SSE2_LoadDiff8P    xmm0,xmm4,xmm7,[r0],[r2]
    SSE2_LoadDiff8P    xmm1,xmm5,xmm7,[r0+r1],[r2+r3]
    lea                r0, [r0+2*r1]
    lea                r2, [r2+2*r3]
    SSE2_LoadDiff8P    xmm2,xmm4,xmm7,[r0],[r2]
    SSE2_LoadDiff8P    xmm3,xmm5,xmm7,[r0+r1],[r2+r3]

    SSE2_HDMTwo4x4     xmm0,xmm1,xmm2,xmm3,xmm4
    SSE2_TransTwo4x4W  xmm3,xmm1,xmm0,xmm2,xmm4
    SSE2_HDMTwo4x4     xmm3,xmm1,xmm2,xmm4,xmm5
    SSE2_SumAbs4       xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6
%endmacro

;***********************************************************************
;
;int32_t WelsSampleSatd4x4_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
;
; r0/r1 = first block pointer/stride, r2/r3 = second block pointer/stride
; (presumably loaded from the four C arguments by LOAD_4_PARA).
; Returns (16-bit saturated sum of absolute Hadamard coefficients) >> 1.
;
;***********************************************************************
WELS_EXTERN WelsSampleSatd4x4_sse2
    %assign push_num 0
    LOAD_4_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    ; rows 0/2 of the first block -> xmm0, rows 1/3 -> xmm1
    movd        xmm0, [r0]
    movd        xmm1, [r0+r1]
    lea         r0 , [r0+2*r1]
    movd        xmm2, [r0]
    movd        xmm3, [r0+r1]
    punpckldq   xmm0, xmm2
    punpckldq   xmm1, xmm3

    ; rows 0/2 of the second block -> xmm4, rows 1/3 -> xmm5
    movd        xmm4, [r2]
    movd        xmm5, [r2+r3]
    lea         r2 , [r2+2*r3]
    movd        xmm6, [r2]
    movd        xmm7, [r2+r3]
    punpckldq   xmm4, xmm6
    punpckldq   xmm5, xmm7

    ; widen bytes to words (xmm6 = 0 and stays 0 until used as accumulator)
    pxor        xmm6, xmm6
    punpcklbw   xmm0, xmm6
    punpcklbw   xmm1, xmm6
    punpcklbw   xmm4, xmm6
    punpcklbw   xmm5, xmm6

    ; pixel differences
    psubw       xmm0, xmm4
    psubw       xmm1, xmm5

    movdqa      xmm2, xmm0
    paddw       xmm0, xmm1
    psubw       xmm2, xmm1
    SSE2_XSawp  qdq, xmm0, xmm2, xmm3

    movdqa      xmm4, xmm0
    paddw       xmm0, xmm3
    psubw       xmm4, xmm3

    movdqa      xmm2, xmm0
    punpcklwd   xmm0, xmm4
    punpckhwd   xmm4, xmm2

    SSE2_XSawp  dq,  xmm0, xmm4, xmm3
    SSE2_XSawp  qdq, xmm0, xmm3, xmm5

    movdqa      xmm7, xmm0
    paddw       xmm0, xmm5
    psubw       xmm7, xmm5

    SSE2_XSawp  qdq, xmm0, xmm7, xmm1

    movdqa      xmm2, xmm0
    paddw       xmm0, xmm1
    psubw       xmm2, xmm1

    ; sum of absolute coefficients, then halve
    WELS_AbsW   xmm0, xmm3
    paddusw     xmm6, xmm0
    WELS_AbsW   xmm2, xmm4
    paddusw     xmm6, xmm2
    SSE2_SumWHorizon1 xmm6, xmm4
    movd        retrd, xmm6
    and         retrd, 0xffff
    shr         retrd, 1
    POP_XMM
    LOAD_4_PARA_POP
    ret

;***********************************************************************
;
;int32_t WelsSampleSatd8x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
;
; One SSE2_GetSatd8x8 pass; per-lane sums are halved before the horizontal
; reduction.
;
;***********************************************************************
WELS_EXTERN WelsSampleSatd8x8_sse2
    %assign push_num 0
    LOAD_4_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    pxor        xmm6, xmm6      ; accumulator
    pxor        xmm7, xmm7      ; zero
    SSE2_GetSatd8x8
    psrlw       xmm6, 1
    SSE2_SumWHorizon xmm6,xmm4,xmm7
    movd        retrd, xmm6
    POP_XMM
    LOAD_4_PARA_POP
    ret

;***********************************************************************
;
;int32_t WelsSampleSatd8x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
;
; Two vertically stacked 8x8 passes.
;
;***********************************************************************
WELS_EXTERN WelsSampleSatd8x16_sse2
    %assign push_num 0
    LOAD_4_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    pxor        xmm6, xmm6      ; accumulator
    pxor        xmm7, xmm7      ; zero

    SSE2_GetSatd8x8
    ; the macro advanced six rows; step two more to reach row 8
    lea         r0, [r0+2*r1]
    lea         r2, [r2+2*r3]
    SSE2_GetSatd8x8

    psrlw       xmm6, 1
    SSE2_SumWHorizon xmm6,xmm4,xmm7
    movd        retrd, xmm6
    POP_XMM
    LOAD_4_PARA_POP
    ret

;***********************************************************************
;
;int32_t WelsSampleSatd16x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
;
; Two horizontally adjacent 8x8 passes (second one at column offset 8).
;
;***********************************************************************
WELS_EXTERN WelsSampleSatd16x8_sse2
    %assign push_num 0
    LOAD_4_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    push        r0              ; keep the block origins for the right half
    push        r2
    pxor        xmm6, xmm6      ; accumulator
    pxor        xmm7, xmm7      ; zero

    SSE2_GetSatd8x8

    pop         r2
    pop         r0
    add         r0, 8
    add         r2, 8
    SSE2_GetSatd8x8

    psrlw       xmm6, 1
    SSE2_SumWHorizon xmm6,xmm4,xmm7
    movd        retrd, xmm6
    POP_XMM
    LOAD_4_PARA_POP
    ret

;***********************************************************************
;
;int32_t WelsSampleSatd16x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
;
; Four 8x8 passes: left column (top, bottom), then right column
; (top, bottom).
;
;***********************************************************************
WELS_EXTERN WelsSampleSatd16x16_sse2
    %assign push_num 0
    LOAD_4_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    push        r0              ; keep the block origins for the right half
    push        r2
    pxor        xmm6, xmm6      ; accumulator
    pxor        xmm7, xmm7      ; zero

    SSE2_GetSatd8x8
    lea         r0, [r0+2*r1]
    lea         r2, [r2+2*r3]
    SSE2_GetSatd8x8

    pop         r2
    pop         r0
    add         r0, 8
    add         r2, 8

    SSE2_GetSatd8x8
    lea         r0, [r0+2*r1]
    lea         r2, [r2+2*r3]
    SSE2_GetSatd8x8

    ; each column sum of SATD is necessarily even, so we don't lose any precision by shifting first.
    psrlw       xmm6, 1
    SSE2_SumWHorizon xmm6,xmm4,xmm7
    movd        retrd, xmm6
    POP_XMM
    LOAD_4_PARA_POP
    ret

;***********************************************************************
;
;Pixel_satd_wxh_sse2 END
;
;***********************************************************************

;***********************************************************************
;
;Pixel_satd_intra_sse2 BEGIN
;
;***********************************************************************


; SSE_DB_1_2REG dst, tmp
;   dst <- +1 in every byte lane; tmp <- all bits set.
%macro SSE_DB_1_2REG 2
    pxor        %1, %1
    pcmpeqw     %2, %2
    psubb       %1, %2
%endmacro

;***********************************************************************
;
;int32_t WelsSampleSatdThree4x4_sse2( uint8_t *pDec, int32_t iLineSizeDec, uint8_t *pEnc, int32_t iLinesizeEnc,
;                             uint8_t* pRed, int32_t* pBestMode, int32_t, int32_t, int32_t);
;
; Evaluates the DC, H and V 4x4 intra predictions of the pEnc block from
; the top/left neighbours of pDec. The last three arguments (arg7..arg9)
; are added to the DC, H and V costs respectively. The winning prediction
; is written as 16 bytes to [pRed] with movdqa (so pRed must be 16-byte
; aligned), its mode id (DC=2, H=1, V=0) to [pBestMode], and its cost is
; returned.
;
;***********************************************************************
WELS_EXTERN WelsSampleSatdThree4x4_sse2

%ifdef X86_32
    push        r3
    push        r4
    push        r5
    push        r6
    %assign push_num 4
%else
    %assign push_num 0
%endif
    PUSH_XMM 8

    mov         r2, arg3
    mov         r3, arg4
    SIGN_EXTENSION r3, r3d

    ; load source 4x4 samples and Hadamard transform
    movd        xmm0, [r2]
    movd        xmm1, [r2+r3]
    lea         r2 , [r2+2*r3]
    movd        xmm2, [r2]
    movd        xmm3, [r2+r3]
    punpckldq   xmm0, xmm2
    punpckldq   xmm1, xmm3

    pxor        xmm6, xmm6
    punpcklbw   xmm0, xmm6
    punpcklbw   xmm1, xmm6

    movdqa      xmm2, xmm0
    paddw       xmm0, xmm1
    psubw       xmm2, xmm1
    SSE2_XSawp  qdq, xmm0, xmm2, xmm3

    movdqa      xmm4, xmm0
    paddw       xmm0, xmm3
    psubw       xmm4, xmm3

    movdqa      xmm2, xmm0
    punpcklwd   xmm0, xmm4
    punpckhwd   xmm4, xmm2

    SSE2_XSawp  dq,  xmm0, xmm4, xmm3
    SSE2_XSawp  qdq, xmm0, xmm3, xmm5

    movdqa      xmm7, xmm0
    paddw       xmm0, xmm5
    psubw       xmm7, xmm5

    SSE2_XSawp  qdq, xmm0, xmm7, xmm1

    ; Hadamard transform results are saved in xmm0 and xmm2
    movdqa      xmm2, xmm0
    paddw       xmm0, xmm1
    psubw       xmm2, xmm1

    ;load top boundary samples: [a b c d]
    mov         r0, arg1
    mov         r1, arg2
    SIGN_EXTENSION r1, r1d
    sub         r0, r1
%ifdef UNIX64
    push        r4
    push        r5
%endif

    movzx       r2d, byte [r0]
    movzx       r3d, byte [r0+1]
    movzx       r4d, byte [r0+2]
    movzx       r5d, byte [r0+3]

    ; get the transform results of top boundary samples: [a b c d]
    add         r3d, r2d ; r3d = a + b
    add         r5d, r4d ; r5d = c + d
    add         r2d, r2d ; r2d = a + a
    add         r4d, r4d ; r4d = c + c
    sub         r2d, r3d ; r2d = a + a - a - b = a - b
    sub         r4d, r5d ; r4d = c + c - c - d = c - d
    add         r5d, r3d ; r5d = (a + b) + (c + d)
    add         r3d, r3d
    sub         r3d, r5d ; r3d = (a + b) - (c + d)
    add         r4d, r2d ; r4d = (a - b) + (c - d)
    add         r2d, r2d
    sub         r2d, r4d ; r2d = (a - b) - (c - d) ; [r5d r3d r2d r4d]

    movdqa      xmm6, xmm0      ; keep copies of the source transform
    movdqa      xmm7, xmm2
    movd        xmm5, r5d       ; keep the top-row sum (r5d) for DC mode
    pxor        xmm3, xmm3
    pxor        xmm4, xmm4
    pinsrw      xmm3, r5d, 0
    pinsrw      xmm3, r4d, 4
    psllw       xmm3, 2
    pinsrw      xmm4, r3d, 0
    pinsrw      xmm4, r2d, 4
    psllw       xmm4, 2

    ; get the satd of V (prediction built from the top boundary row)
    psubw       xmm0, xmm3
    psubw       xmm2, xmm4

    WELS_AbsW   xmm0, xmm1
    WELS_AbsW   xmm2, xmm1
    paddusw     xmm0, xmm2
    SSE2_SumWHorizon1 xmm0, xmm1 ; satd of V is stored in xmm0

    ;load left boundary samples: [a b c d]'
    add         r0, r1

    movzx       r2d, byte [r0-1]
    movzx       r3d, byte [r0+r1-1]
    lea         r0 , [r0+2*r1]
    movzx       r4d, byte [r0-1]
    movzx       r5d, byte [r0+r1-1]

    ; get the transform results of left boundary samples: [a b c d]'
    add         r3d, r2d ; r3d = a + b
    add         r5d, r4d ; r5d = c + d
    add         r2d, r2d ; r2d = a + a
    add         r4d, r4d ; r4d = c + c
    sub         r2d, r3d ; r2d = a + a - a - b = a - b
    sub         r4d, r5d ; r4d = c + c - c - d = c - d
    add         r5d, r3d ; r5d = (a + b) + (c + d)
    add         r3d, r3d
    sub         r3d, r5d ; r3d = (a + b) - (c + d)
    add         r4d, r2d ; r4d = (a - b) + (c - d)
    add         r2d, r2d
    sub         r2d, r4d ; r2d = (a - b) - (c - d) ; [r5d r3d r2d r4d]

    ; store the transform results in xmm3
    movd        xmm3, r5d
    pinsrw      xmm3, r3d, 1
    pinsrw      xmm3, r2d, 2
    pinsrw      xmm3, r4d, 3
    psllw       xmm3, 2

    ; get the satd of H (prediction built from the left boundary column)
    movdqa      xmm2, xmm6
    movdqa      xmm4, xmm7
    psubw       xmm2, xmm3
    WELS_AbsW   xmm2, xmm1
    WELS_AbsW   xmm4, xmm1
    paddusw     xmm2, xmm4
    SSE2_SumWHorizon1 xmm2, xmm1 ; satd of H is stored in xmm2

    ; DC result is stored in xmm1
    add         r5d, 4
    movd        xmm1, r5d
    paddw       xmm1, xmm5
    psrlw       xmm1, 3         ; (top sum + left sum + 4) >> 3
    movdqa      xmm5, xmm1      ; keep the DC sample value for the output
    psllw       xmm1, 4

    ; get the satd of DC
    psubw       xmm6, xmm1
    WELS_AbsW   xmm6, xmm1
    WELS_AbsW   xmm7, xmm1
    paddusw     xmm6, xmm7
    SSE2_SumWHorizon1 xmm6, xmm1 ; satd of DC is stored in xmm6
%ifdef UNIX64
    pop         r5
    pop         r4
%endif
    ; comparing order: DC H V

    mov         r4, arg5
    movd        r2d, xmm6       ; DC
    movd        r3d, xmm2       ; H
    movd        r6d, xmm0       ; V

    and         r2d, 0xffff
    shr         r2d, 1
    and         r3d, 0xffff
    shr         r3d, 1
    and         r6d, 0xffff
    shr         r6d, 1
    add         r2d, dword arg7
    add         r3d, dword arg8
    add         r6d, dword arg9
    cmp         r2w, r3w
    jg near     not_dc
    cmp         r2w, r6w
    jg near     not_dc_h

    ; for DC mode
    movd        r3d, xmm5
    imul        r3d, 0x01010101 ; replicate the DC byte into all four bytes
    movd        xmm5, r3d
    pshufd      xmm5, xmm5, 0
    movdqa      [r4], xmm5
    mov         r5, arg6
    mov         dword [r5], 0x02
    mov         retrd, r2d
    POP_XMM
%ifdef X86_32
    pop         r6
    pop         r5
    pop         r4
    pop         r3
%endif
    ret

not_dc:
    cmp         r3w, r6w
    jg near     not_dc_h

    ; for H mode
    ; r0 currently points two rows below pDec; step back to row 0 and
    ; replicate each left-neighbour byte across its row.
    SSE_DB_1_2REG xmm6, xmm7
    sub         r0, r1
    sub         r0, r1
    movzx       r6d, byte [r0-1]
    movd        xmm0, r6d
    pmuludq     xmm0, xmm6

    movzx       r6d, byte [r0+r1-1]
    movd        xmm1, r6d
    pmuludq     xmm1, xmm6
    punpckldq   xmm0, xmm1

    lea         r0, [r0+r1*2]
    movzx       r6d, byte [r0-1]
    movd        xmm2, r6d
    pmuludq     xmm2, xmm6

    movzx       r6d, byte [r0+r1-1]
    movd        xmm3, r6d
    pmuludq     xmm3, xmm6
    punpckldq   xmm2, xmm3
    punpcklqdq  xmm0, xmm2

    movdqa      [r4],xmm0

    mov         retrd, r3d
    mov         r5, arg6
    mov         dword [r5], 0x01
    POP_XMM
%ifdef X86_32
    pop         r6
    pop         r5
    pop         r4
    pop         r3
%endif
    ret
not_dc_h:
    ; for V mode: step back to the row above pDec and replicate its four
    ; bytes into all four output rows.
    sub         r0, r1
    sub         r0, r1
    sub         r0, r1
    movd        xmm0, [r0]
    pshufd      xmm0, xmm0, 0
    movdqa      [r4],xmm0
    mov         retrd, r6d
    mov         r5, arg6
    mov         dword [r5], 0x00
    POP_XMM
%ifdef X86_32
    pop         r6
    pop         r5
    pop         r4
    pop         r3
%endif
    ret


; SSE41_I16x16Get8WSumSub io, tmp1, tmp2
;   Expects xmm5 = HSumSubDB1, xmm6 = HSumSubDW1, xmm7 = PDW1.
;   Transforms the boundary bytes in %1; the result is left in %1 (scaled by
;   4). Intermediate dword sums are added into xmm4 ("for dc").
%macro SSE41_I16x16Get8WSumSub 3 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3
    pmaddubsw   %1, xmm5
    movdqa      %2, %1
    pmaddwd     %1, xmm7
    pmaddwd     %2, xmm6
    movdqa      %3, %1
    punpckldq   %1, %2
    punpckhdq   %2, %3
    movdqa      %3, %1
    punpcklqdq  %1, %2
    punpckhqdq  %3, %2
    paddd       xmm4, %1 ;for dc
    paddd       xmm4, %3 ;for dc
    packssdw    %1, %3
    psllw       %1, 2
%endmacro

; SSE41_ChromaGet8WSumSub io, tmp1, tmp2, dcOut
;   Same transform as SSE41_I16x16Get8WSumSub, but instead of accumulating
;   into xmm4 it returns the low qwords of the two intermediate sums in %4.
%macro SSE41_ChromaGet8WSumSub 4 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3 : %4 tempsse2
    pmaddubsw   %1, xmm5
    movdqa      %2, %1
    pmaddwd     %1, xmm7
    pmaddwd     %2, xmm6
    movdqa      %3, %1
    punpckldq   %1, %2
    punpckhdq   %2, %3
    movdqa      %3, %1
    punpcklqdq  %1, %2
    punpckhqdq  %3, %2
;    paddd       xmm4, %1 ;for dc
;    paddd       xmm4, %3 ;for dc
    movdqa      %4, %1
    punpcklqdq  %4, %3
    packssdw    %1, %3
    psllw       %1, 2
%endmacro

; SSE41_GetX38x4SatdDec
;   Loads four 8-byte rows from [r2] (stride r3), advances r2 by four rows,
;   widens to words and applies the Hadamard butterflies/transposition.
;   Per the inline note the outputs are xmm7, xmm1, xmm3, xmm2.
%macro SSE41_GetX38x4SatdDec 0
    pxor        xmm7, xmm7
    movq        xmm0, [r2]
    movq        xmm1, [r2+r3]
    lea         r2, [r2+2*r3]
    movq        xmm2, [r2]
    movq        xmm3, [r2+r3]
    lea         r2, [r2+2*r3]
    punpcklbw   xmm0, xmm7
    punpcklbw   xmm1, xmm7
    punpcklbw   xmm2, xmm7
    punpcklbw   xmm3, xmm7
    SSE2_HDMTwo4x4    xmm0,xmm1,xmm2,xmm3,xmm7
    SSE2_TransTwo4x4W xmm3,xmm1,xmm0,xmm2,xmm7
    SSE2_HDMTwo4x4    xmm3,xmm1,xmm2,xmm7,xmm0 ;pOut xmm7,xmm1,xmm3,xmm2
    ;doesn't need another transpose
%endmacro

; SSE41_GetX38x4SatdV unused, tableOffset
;   Accumulates into xmm4 the absolute differences between the stored
;   V-prediction coefficients at [r6+%2 ...] and xmm7/xmm1/xmm3/xmm2.
;   Only word lanes 0 and 4 of each row carry a prediction term.
;   The first macro argument is not referenced.
%macro SSE41_GetX38x4SatdV 2
    pxor        xmm0, xmm0
    pinsrw      xmm0, word[r6+%2],   0
    pinsrw      xmm0, word[r6+%2+8], 4
    psubsw      xmm0, xmm7
    pabsw       xmm0, xmm0
    paddw       xmm4, xmm0
    pxor        xmm0, xmm0
    pinsrw      xmm0, word[r6+%2+2],  0
    pinsrw      xmm0, word[r6+%2+10], 4
    psubsw      xmm0, xmm1
    pabsw       xmm0, xmm0
    paddw       xmm4, xmm0
    pxor        xmm0, xmm0
    pinsrw      xmm0, word[r6+%2+4],  0
    pinsrw      xmm0, word[r6+%2+12], 4
    psubsw      xmm0, xmm3
    pabsw       xmm0, xmm0
    paddw       xmm4, xmm0
    pxor        xmm0, xmm0
    pinsrw      xmm0, word[r6+%2+6],  0
    pinsrw      xmm0, word[r6+%2+14], 4
    psubsw      xmm0, xmm2
    pabsw       xmm0, xmm0
    paddw       xmm4, xmm0
%endmacro

; SSE41_GetX38x4SatdH rowIndex, unused, tableBase
;   Accumulates the H-prediction cost into xmm5 using the coefficients at
;   [r6+%3+8*%1]. Leaves |xmm1|+|xmm2|+|xmm3| in xmm2 for the DC macro that
;   follows. The second macro argument is not referenced.
%macro SSE41_GetX38x4SatdH  3
    movq        xmm0, [r6+%3+8*%1]
    punpcklqdq  xmm0, xmm0
    psubsw      xmm0, xmm7
    pabsw       xmm0, xmm0
    paddw       xmm5, xmm0
    pabsw       xmm1, xmm1
    pabsw       xmm2, xmm2
    pabsw       xmm3, xmm3
    paddw       xmm2, xmm1;for DC
    paddw       xmm2, xmm3;for DC
    paddw       xmm5, xmm2
%endmacro

; SSE41_I16X16GetX38x4SatdDC
;   Accumulates the DC-prediction cost into xmm6; the DC term comes from
;   mm4 and xmm2 must hold the sum left by SSE41_GetX38x4SatdH.
%macro SSE41_I16X16GetX38x4SatdDC 0
    pxor        xmm0, xmm0
    movq2dq     xmm0, mm4
    punpcklqdq  xmm0, xmm0
    psubsw      xmm0, xmm7
    pabsw       xmm0, xmm0
    paddw       xmm6, xmm0
    paddw       xmm6, xmm2
%endmacro

; SSE41_ChromaGetX38x4SatdDC index
;   Accumulates the DC-prediction cost into xmm6 using the 16-byte entry at
;   [r6+32+16*index]. NOTE: the index register is shifted left by 4 in place.
%macro SSE41_ChromaGetX38x4SatdDC 1
    shl         %1, 4
    movdqa      xmm0, [r6+32+%1]
    psubsw      xmm0, xmm7
    pabsw       xmm0, xmm0
    paddw       xmm6, xmm0
    paddw       xmm6, xmm2
%endmacro

; One 8x4 step of the 16x16 luma V/H/DC cost accumulation.
%macro SSE41_I16x16GetX38x4Satd 2
    SSE41_GetX38x4SatdDec
    SSE41_GetX38x4SatdV   %1, %2
    SSE41_GetX38x4SatdH   %1, %2, 32
    SSE41_I16X16GetX38x4SatdDC
%endmacro

; One 8x4 step of the 8x8 chroma V/H/DC cost accumulation.
%macro SSE41_ChromaGetX38x4Satd 2
    SSE41_GetX38x4SatdDec
    SSE41_GetX38x4SatdV   %1, %2
    SSE41_GetX38x4SatdH   %1, %2, 16
    SSE41_ChromaGetX38x4SatdDC %1
%endmacro

; SSE41_HSum8W acc, mul, tmp
;   pmaddwd of acc with mul (callers pass all-ones words), then a horizontal
;   dword reduction; the total ends up in the low dword of acc.
%macro SSE41_HSum8W 3
    pmaddwd     %1, %2
    movhlps     %3, %1
    paddd       %1, %3
    pshuflw     %3, %1,0Eh
    paddd       %1, %3
%endmacro

;***********************************************************************
; WelsIntra16x16Combined3Satd_sse41
;
; Register use after LOAD_7_PARA (argument mapping presumed from usage):
;   r0/r1 = reconstructed picture pointer/stride (top row and left column
;           neighbours are read), r2/r3 = source block pointer/stride,
;   r4    = pointer receiving the best mode (V=0, H=1, DC=2),
;   r5    = lambda (doubled and added to the H and DC costs),
;   r6    = 64-byte scratch buffer, written with movdqa.
; Returns the cost of the selected mode.
;***********************************************************************
WELS_EXTERN WelsIntra16x16Combined3Satd_sse41
    %assign push_num 0
    LOAD_7_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r5, r5d

%ifndef X86_32
    push        r12
    mov         r12, r2         ; keep the source pointer for the right half
%endif

    INIT_X86_32_PIC r2
    pxor        xmm4, xmm4
    movdqa      xmm5, [pic(HSumSubDB1)]
    movdqa      xmm6, [pic(HSumSubDW1)]
    movdqa      xmm7, [pic(PDW1)]
    DEINIT_X86_32_PIC
    ; transform the 16 top neighbours
    sub         r0, r1
    movdqu      xmm0, [r0]
    movhlps     xmm1, xmm0
    punpcklqdq  xmm0, xmm0
    punpcklqdq  xmm1, xmm1
    SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3
    SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3
    movdqa      [r6], xmm0 ;V
    movdqa      [r6+16], xmm1
    ; gather and transform the 16 left neighbours
    add         r0, r1
    pinsrb      xmm0, byte[r0-1], 0
    pinsrb      xmm0, byte[r0+r1-1], 1
    lea         r0, [r0+2*r1]
    pinsrb      xmm0, byte[r0-1], 2
    pinsrb      xmm0, byte[r0+r1-1], 3
    lea         r0, [r0+2*r1]
    pinsrb      xmm0, byte[r0-1], 4
    pinsrb      xmm0, byte[r0+r1-1], 5
    lea         r0, [r0+2*r1]
    pinsrb      xmm0, byte[r0-1], 6
    pinsrb      xmm0, byte[r0+r1-1], 7
    lea         r0, [r0+2*r1]
    pinsrb      xmm0, byte[r0-1], 8
    pinsrb      xmm0, byte[r0+r1-1], 9
    lea         r0, [r0+2*r1]
    pinsrb      xmm0, byte[r0-1], 10
    pinsrb      xmm0, byte[r0+r1-1], 11
    lea         r0, [r0+2*r1]
    pinsrb      xmm0, byte[r0-1], 12
    pinsrb      xmm0, byte[r0+r1-1], 13
    lea         r0, [r0+2*r1]
    pinsrb      xmm0, byte[r0-1], 14
    pinsrb      xmm0, byte[r0+r1-1], 15
    movhlps     xmm1, xmm0
    punpcklqdq  xmm0, xmm0
    punpcklqdq  xmm1, xmm1
    SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3
    SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3
    movdqa      [r6+32], xmm0 ;H
    movdqa      [r6+48], xmm1
    movd        r0d, xmm4 ;dc
    add         r0d, 16   ;(sum+16)
    shr         r0d, 5    ;((sum+16)>>5)
    shl         r0d, 4    ;
    movd        mm4, r0d  ; mm4 copy DC
    pxor        xmm4, xmm4 ;V
    pxor        xmm5, xmm5 ;H
    pxor        xmm6, xmm6 ;DC
%ifdef UNIX64
    push        r4
%endif
    mov         r0, 0           ; r0 = 8x4 step index within the current half
    mov         r4, 0           ; r4 = byte offset into the V table (0, then 16)

.loop16x16_get_satd:
.loopStart1:
    SSE41_I16x16GetX38x4Satd r0, r4
    inc         r0
    cmp         r0, 4
    jl          .loopStart1
    cmp         r4, 16
    je          .loop16x16_get_satd_end
    ; second pass: right 8 columns of the source block
%ifdef X86_32
    mov         r2, arg3
%else
    mov         r2, r12
%endif
    add         r2, 8
    mov         r0, 0
    add         r4, 16
    jmp         .loop16x16_get_satd
.loop16x16_get_satd_end:
    MMX_DW_1_2REG xmm0, xmm1
    psrlw       xmm4, 1 ;/2
    psrlw       xmm5, 1 ;/2
    psrlw       xmm6, 1 ;/2
    SSE41_HSum8W xmm4, xmm0, xmm1
    SSE41_HSum8W xmm5, xmm0, xmm1
    SSE41_HSum8W xmm6, xmm0, xmm1

%ifdef UNIX64
    pop         r4
%endif
    ; comparing order: DC H V
    movd        r3d, xmm6 ;DC
    movd        r1d, xmm5 ;H
    movd        r0d, xmm4 ;V
%ifndef X86_32
    pop         r12
%endif
    shl         r5d, 1
    add         r1d, r5d
    add         r3d, r5d
    mov         r4, arg5
    cmp         r3d, r1d
    jge near    not_dc_16x16
    cmp         r3d, r0d
    jge near    not_dc_h_16x16

    ; for DC mode
    mov         dword[r4], 2;I16_PRED_DC
    mov         retrd, r3d
    jmp near    return_satd_intra_16x16_x3
not_dc_16x16:
    ; for H mode
    cmp         r1d, r0d
    jge near    not_dc_h_16x16
    mov         dword[r4], 1;I16_PRED_H
    mov         retrd, r1d
    jmp near    return_satd_intra_16x16_x3
not_dc_h_16x16:
    ; for V mode
    mov         dword[r4], 0;I16_PRED_V
    mov         retrd, r0d
return_satd_intra_16x16_x3:
    WELSEMMS
    POP_XMM
    LOAD_7_PARA_POP
ret

; SSE41_ChromaGetX38x8Satd
;   Computes V/H/DC costs for one 8x8 chroma plane.
;   In:  r0/r1 = reconstructed plane pointer/stride, r2/r3 = source plane
;        pointer/stride, r6 = 64-byte scratch buffer. Must be expanded
;        between INIT_X86_32_PIC and DEINIT_X86_32_PIC (uses pic()).
;   Out: xmm4 = V, xmm5 = H, xmm6 = DC per-lane word sums.
;   Clobbers r0, r2 and xmm0-xmm7.
%macro SSE41_ChromaGetX38x8Satd 0
    movdqa      xmm5, [pic(HSumSubDB1)]
    movdqa      xmm6, [pic(HSumSubDW1)]
    movdqa      xmm7, [pic(PDW1)]
    sub         r0, r1
    movq        xmm0, [r0]
    punpcklqdq  xmm0, xmm0
    SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm4
    movdqa      [r6], xmm0 ;V
    add         r0, r1
    pinsrb      xmm0, byte[r0-1], 0
    pinsrb      xmm0, byte[r0+r1-1], 1
    lea         r0, [r0+2*r1]
    pinsrb      xmm0, byte[r0-1], 2
    pinsrb      xmm0, byte[r0+r1-1], 3
    lea         r0, [r0+2*r1]
    pinsrb      xmm0, byte[r0-1], 4
    pinsrb      xmm0, byte[r0+r1-1], 5
    lea         r0, [r0+2*r1]
    pinsrb      xmm0, byte[r0-1], 6
    pinsrb      xmm0, byte[r0+r1-1], 7
    punpcklqdq  xmm0, xmm0
    SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm1
    movdqa      [r6+16], xmm0 ;H
;(sum+2)>>2
    movdqa      xmm6, [pic(PDQ2)]
    movdqa      xmm5, xmm4
    punpckhqdq  xmm5, xmm1
    paddd       xmm5, xmm6
    psrld       xmm5, 2
;(sum1+sum2+4)>>3
    paddd       xmm6, xmm6
    paddd       xmm4, xmm1
    paddd       xmm4, xmm6
    psrld       xmm4, 3
;satd *16
    pslld       xmm5, 4
    pslld       xmm4, 4
;temp satd
    movdqa      xmm6, xmm4
    punpcklqdq  xmm4, xmm5
    psllq       xmm4, 32
    psrlq       xmm4, 32
    movdqa      [r6+32], xmm4
    punpckhqdq  xmm5, xmm6
    psllq       xmm5, 32
    psrlq       xmm5, 32
    movdqa      [r6+48], xmm5

    pxor        xmm4, xmm4 ;V
    pxor        xmm5, xmm5 ;H
    pxor        xmm6, xmm6 ;DC
    mov         r0, 0
    SSE41_ChromaGetX38x4Satd r0, 0
    inc         r0
    SSE41_ChromaGetX38x4Satd r0, 0
%endmacro

; SSEReg2MMX src, mmLo, mmHi
;   Splits an XMM register into two MMX registers (src is clobbered).
%macro SSEReg2MMX 3
    movdq2q     %2, %1
    movhlps     %1, %1
    movdq2q     %3, %1
%endmacro

; MMXReg2SSE dst, tmp, mmLo, mmHi
;   Rebuilds an XMM register from two MMX registers.
%macro MMXReg2SSE 4
    movq2dq     %1, %3
    movq2dq     %2, %4
    punpcklqdq  %1, %2
%endmacro
;for reduce the code size of WelsIntraChroma8x8Combined3Satd_sse41

;***********************************************************************
; WelsIntraChroma8x8Combined3Satd_sse41
;
; Runs SSE41_ChromaGetX38x8Satd on the first plane (pointers in r0/r2),
; parks the three partial results in MMX registers, reloads r0/r2 from
; arg8/arg9 for the second plane, and sums both planes.
; r5 = lambda (doubled and added to the H and V costs); r4 = pointer
; receiving the best mode (DC=0, H=1, V=2). Returns the selected cost.
;***********************************************************************
WELS_EXTERN WelsIntraChroma8x8Combined3Satd_sse41
    %assign push_num 0
    LOAD_7_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r5, r5d
loop_chroma_satdx3:
    INIT_X86_32_PIC r4
    SSE41_ChromaGetX38x8Satd
    SSEReg2MMX  xmm4, mm0,mm1
    SSEReg2MMX  xmm5, mm2,mm3
    SSEReg2MMX  xmm6, mm5,mm6
    mov         r0, arg8
    mov         r2, arg9

    SSE41_ChromaGetX38x8Satd
    DEINIT_X86_32_PIC

    MMXReg2SSE  xmm0, xmm3, mm0, mm1
    MMXReg2SSE  xmm1, xmm3, mm2, mm3
    MMXReg2SSE  xmm2, xmm3, mm5, mm6

    paddw       xmm4, xmm0
    paddw       xmm5, xmm1
    paddw       xmm6, xmm2

    MMX_DW_1_2REG xmm0, xmm1
    psrlw       xmm4, 1 ;/2
    psrlw       xmm5, 1 ;/2
    psrlw       xmm6, 1 ;/2
    SSE41_HSum8W xmm4, xmm0, xmm1
    SSE41_HSum8W xmm5, xmm0, xmm1
    SSE41_HSum8W xmm6, xmm0, xmm1
    ; comparing order: DC H V
    movd        r3d, xmm6 ;DC
    movd        r1d, xmm5 ;H
    movd        r0d, xmm4 ;V


    shl         r5d, 1
    add         r1d, r5d
    add         r0d, r5d
    cmp         r3d, r1d
    jge near    not_dc_8x8
    cmp         r3d, r0d
    jge near    not_dc_h_8x8

    ; for DC mode
    mov         dword[r4], 0;I8_PRED_DC
    mov         retrd, r3d
    jmp near    return_satd_intra_8x8_x3
not_dc_8x8:
    ; for H mode
    cmp         r1d, r0d
    jge near    not_dc_h_8x8
    mov         dword[r4], 1;I8_PRED_H
    mov         retrd, r1d
    jmp near    return_satd_intra_8x8_x3
not_dc_h_8x8:
    ; for V mode
    mov         dword[r4], 2;I8_PRED_V
    mov         retrd, r0d
return_satd_intra_8x8_x3:
    WELSEMMS
    POP_XMM
    LOAD_7_PARA_POP
ret


;***********************************************************************
;
;Pixel_satd_intra_sse2 END
;
;***********************************************************************

; SSSE3_Get16BSadHVDC predRow, srcRow
;   predRow holds one left-neighbour byte (stored by WelsAddDCValue); it is
;   broadcast to 16 bytes (xmm1 must be zero for the pshufb) and written
;   back to predRow as the H prediction row.
;   Accumulates the SAD of srcRow against xmm7 into xmm4 (DC), against xmm5
;   into xmm2 (V) and against the broadcast row into xmm3 (H).
%macro SSSE3_Get16BSadHVDC 2
    movd        xmm6,%1
    pshufb      xmm6,xmm1
    movdqa      %1, xmm6
    movdqa      xmm0,%2
    psadbw      xmm0,xmm7
    paddw       xmm4,xmm0
    movdqa      xmm0,%2
    psadbw      xmm0,xmm5
    paddw       xmm2,xmm0
    psadbw      xmm6,%2
    paddw       xmm3,xmm6
%endmacro

; WelsAddDCValue srcByte, tmp32, dst, sum32
;   Loads one byte, stores it (zero-extended) to dst and adds it to sum32.
%macro WelsAddDCValue 4
    movzx       %2, byte %1
    mov         %3, %2
    add         %4, %2
%endmacro

;***********************************************************************
;
;Pixel_sad_intra_ssse3 BEGIN
;
;***********************************************************************
; WelsIntra16x16Combined3Sad_ssse3
;
; Register use after LOAD_7_PARA (argument mapping presumed from usage):
;   r0/r1 = reconstructed picture pointer/stride; the row above is read
;           with movdqa, so it must be 16-byte aligned,
;   r2/r3 = source block pointer/stride (rows read as aligned operands),
;   r4    = pointer receiving the best mode (V=0, H=1, DC=2),
;   r5    = lambda (doubled and added to the H and DC costs),
;   r6    = 256-byte prediction buffer (16 rows of 16 bytes, 16-byte aligned).
; The buffer is filled with the H prediction while the costs are computed
; and overwritten with the DC or V prediction when one of those wins.
; Returns the SAD cost of the selected mode.
;***********************************************************************
WELS_EXTERN WelsIntra16x16Combined3Sad_ssse3
    %assign push_num 0
    LOAD_7_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r5, r5d

    ; r3/r4/r5 are reused as scratch below; restored by the pops further down
    push        r5
    push        r4
    push        r3

    sub         r0, r1
    movdqa      xmm5,[r0]       ; top neighbour row (V prediction)
    pxor        xmm0,xmm0
    psadbw      xmm0,xmm5
    movhlps     xmm1,xmm0
    paddw       xmm0,xmm1
    movd        r5d, xmm0       ; r5d = sum of the top neighbours

    add         r0,r1
    lea         r3,[r1+2*r1]    ;ebx r3
    WelsAddDCValue [r0-1     ], r4d, [r6   ], r5d ; esi r4d, eax r5d
    WelsAddDCValue [r0-1+r1  ], r4d, [r6+16], r5d
    WelsAddDCValue [r0-1+r1*2], r4d, [r6+32], r5d
    WelsAddDCValue [r0-1+r3  ], r4d, [r6+48], r5d
    lea         r0, [r0+4*r1]
    add         r6, 64
    WelsAddDCValue [r0-1     ], r4d, [r6   ], r5d
    WelsAddDCValue [r0-1+r1  ], r4d, [r6+16], r5d
    WelsAddDCValue [r0-1+r1*2], r4d, [r6+32], r5d
    WelsAddDCValue [r0-1+r3  ], r4d, [r6+48], r5d
    lea         r0, [r0+4*r1]
    add         r6, 64
    WelsAddDCValue [r0-1     ], r4d, [r6   ], r5d
    WelsAddDCValue [r0-1+r1  ], r4d, [r6+16], r5d
    WelsAddDCValue [r0-1+r1*2], r4d, [r6+32], r5d
    WelsAddDCValue [r0-1+r3  ], r4d, [r6+48], r5d
    lea         r0, [r0+4*r1]
    add         r6, 64
    WelsAddDCValue [r0-1     ], r4d, [r6   ], r5d
    WelsAddDCValue [r0-1+r1  ], r4d, [r6+16], r5d
    WelsAddDCValue [r0-1+r1*2], r4d, [r6+32], r5d
    WelsAddDCValue [r0-1+r3  ], r4d, [r6+48], r5d
    sub         r6, 192         ; back to the start of the buffer
    add         r5d,10h
    shr         r5d,5           ; DC = (top sum + left sum + 16) >> 5
    movd        xmm7,r5d
    pxor        xmm1,xmm1
    pshufb      xmm7,xmm1       ; broadcast the DC byte
    pxor        xmm4,xmm4
    pxor        xmm3,xmm3
    pxor        xmm2,xmm2
    ;sad begin
    pop         r3
    lea         r4, [r3+2*r3] ;esi r4
    SSSE3_Get16BSadHVDC [r6], [r2]
    SSSE3_Get16BSadHVDC [r6+16], [r2+r3]
    SSSE3_Get16BSadHVDC [r6+32], [r2+2*r3]
    SSSE3_Get16BSadHVDC [r6+48], [r2+r4]
    add         r6, 64
    lea         r2, [r2+4*r3]
    SSSE3_Get16BSadHVDC [r6], [r2]
    SSSE3_Get16BSadHVDC [r6+16], [r2+r3]
    SSSE3_Get16BSadHVDC [r6+32], [r2+2*r3]
    SSSE3_Get16BSadHVDC [r6+48], [r2+r4]
    add         r6, 64
    lea         r2, [r2+4*r3]
    SSSE3_Get16BSadHVDC [r6], [r2]
    SSSE3_Get16BSadHVDC [r6+16], [r2+r3]
    SSSE3_Get16BSadHVDC [r6+32], [r2+2*r3]
    SSSE3_Get16BSadHVDC [r6+48], [r2+r4]
    add         r6, 64
    lea         r2, [r2+4*r3]
    SSSE3_Get16BSadHVDC [r6], [r2]
    SSSE3_Get16BSadHVDC [r6+16], [r2+r3]
    SSSE3_Get16BSadHVDC [r6+32], [r2+2*r3]
    SSSE3_Get16BSadHVDC [r6+48], [r2+r4]

    pop         r4
    pop         r5
    ; pack H (shifted up one dword) together with V, then fold the halves
    pslldq      xmm3,4
    por         xmm3,xmm2
    movhlps     xmm1,xmm3
    paddw       xmm3,xmm1
    movhlps     xmm0,xmm4
    paddw       xmm4,xmm0
    ; comparing order: DC H V
    movd        r1d, xmm4 ;DC   ;ebx r1d
    movd        r0d, xmm3 ;V    ;ecx r0d
    psrldq      xmm3, 4
    movd        r2d, xmm3 ;H    ;esi r2d

    ;mov         eax, [esp+36] ;lamda ;eax r5
    shl         r5d, 1
    add         r2d, r5d
    add         r1d, r5d
    ;mov         edx, [esp+32]  ;edx r4
    cmp         r1d, r2d
    jge near    not_dc_16x16_sad
    cmp         r1d, r0d
    jge near    not_dc_h_16x16_sad
    ; for DC mode
    mov         dword[r4], 2;I16_PRED_DC
    mov         retrd, r1d
    sub         r6, 192
%assign x 0
%rep 16
    movdqa      [r6+16*x], xmm7
%assign x x+1
%endrep
    jmp near    return_sad_intra_16x16_x3
not_dc_16x16_sad:
    ; for H mode
    cmp         r2d, r0d
    jge near    not_dc_h_16x16_sad
    mov         dword[r4], 1;I16_PRED_H
    mov         retrd, r2d
    jmp near    return_sad_intra_16x16_x3
not_dc_h_16x16_sad:
    ; for V mode
    mov         dword[r4], 0;I16_PRED_V
    mov         retrd, r0d
    sub         r6, 192
%assign x 0
%rep 16
    movdqa      [r6+16*x], xmm5
%assign x x+1
%endrep
return_sad_intra_16x16_x3:
    POP_XMM
    LOAD_7_PARA_POP
    ret

;***********************************************************************
;
;Pixel_sad_intra_ssse3 END
;
;***********************************************************************
;***********************************************************************
;
;Pixel_satd_wxh_sse41 BEGIN
;
;***********************************************************************

;SSE4.1
; SSE41_GetSatd8x4
;   Accumulates the cost of an 8x4 block pair into xmm6.
;   In:  r0/r1 = first pointer/stride, r2/r3 = second pointer/stride,
;        r4 = 3*r1, r5 = 3*r3, xmm7 = HSumSubDB1.
;   Does not advance r0/r2. Clobbers xmm0-xmm5.
%macro SSE41_GetSatd8x4 0
    movq        xmm0, [r0]
    punpcklqdq  xmm0, xmm0
    pmaddubsw   xmm0, xmm7
    movq        xmm1, [r0+r1]
    punpcklqdq  xmm1, xmm1
    pmaddubsw   xmm1, xmm7
    movq        xmm2, [r2]
    punpcklqdq  xmm2, xmm2
    pmaddubsw   xmm2, xmm7
    movq        xmm3, [r2+r3]
    punpcklqdq  xmm3, xmm3
    pmaddubsw   xmm3, xmm7
    psubsw      xmm0, xmm2
    psubsw      xmm1, xmm3
    movq        xmm2, [r0+2*r1]
    punpcklqdq  xmm2, xmm2
    pmaddubsw   xmm2, xmm7
    movq        xmm3, [r0+r4]
    punpcklqdq  xmm3, xmm3
    pmaddubsw   xmm3, xmm7
    movq        xmm4, [r2+2*r3]
    punpcklqdq  xmm4, xmm4
    pmaddubsw   xmm4, xmm7
    movq        xmm5, [r2+r5]
    punpcklqdq  xmm5, xmm5
    pmaddubsw   xmm5, xmm7
    psubsw      xmm2, xmm4
    psubsw      xmm3, xmm5
    SSE2_HDMTwo4x4 xmm0, xmm1, xmm2, xmm3, xmm4
    pabsw       xmm0, xmm0
    pabsw       xmm2, xmm2
    pabsw       xmm1, xmm1
    pabsw       xmm3, xmm3
    ; pair up neighbouring word lanes and keep the larger of each pair
    movdqa      xmm4, xmm3
    pblendw     xmm3, xmm1, 0xAA
    pslld       xmm1, 16
    psrld       xmm4, 16
    por         xmm1, xmm4
    pmaxuw      xmm1, xmm3
    paddw       xmm6, xmm1
    movdqa      xmm4, xmm0
    pblendw     xmm0, xmm2, 0xAA
    pslld       xmm2, 16
    psrld       xmm4, 16
    por         xmm2, xmm4
    pmaxuw      xmm0, xmm2
    paddw       xmm6, xmm0
%endmacro

; SSSE3_SumWHorizon dst32, src, tmp1, tmp2
;   dst32 <- sum of the eight signed word lanes of src (src, tmp1 and tmp2
;   are clobbered).
%macro SSSE3_SumWHorizon 4 ;eax, srcSSE, tempSSE, tempSSE
    MMX_DW_1_2REG %3, %4
    pmaddwd     %2, %3
    movhlps     %4, %2
    paddd       %2, %4
    pshuflw     %4, %2,0Eh
    paddd       %2, %4
    movd        %1, %2
%endmacro

;***********************************************************************
;
;int32_t WelsSampleSatd4x4_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
;
;***********************************************************************
WELS_EXTERN WelsSampleSatd4x4_sse41
    %assign push_num 0
    INIT_X86_32_PIC r5
    LOAD_4_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    movdqa      xmm4,[pic(HSwapSumSubDB1)]
    ; second block: rows 0/1 -> xmm2, rows 2/3 -> xmm3
    movd        xmm2,[r2]
    movd        xmm5,[r2+r3]
    shufps      xmm2,xmm5,0
    movd        xmm3,[r2+r3*2]
    lea         r2, [r3*2+r2]
    movd        xmm5,[r2+r3]
    shufps      xmm3,xmm5,0
    ; first block: rows 0/1 -> xmm0, rows 2/3 -> xmm1
    movd        xmm0,[r0]
    movd        xmm5,[r0+r1]
    shufps      xmm0,xmm5,0
    movd        xmm1,[r0+r1*2]
    lea         r0, [r1*2+r0]
    movd        xmm5,[r0+r1]
    shufps      xmm1,xmm5,0
    pmaddubsw   xmm0,xmm4
    pmaddubsw   xmm1,xmm4
    pmaddubsw   xmm2,xmm4
    pmaddubsw   xmm3,xmm4
    psubw       xmm0,xmm2
    psubw       xmm1,xmm3
    movdqa      xmm2,xmm0
    paddw       xmm0,xmm1
    psubw       xmm1,xmm2
    movdqa      xmm2,xmm0
    punpcklqdq  xmm0,xmm1
    punpckhqdq  xmm2,xmm1
    movdqa      xmm1,xmm0
    paddw       xmm0,xmm2
    psubw       xmm2,xmm1
    movdqa      xmm1,xmm0
    pblendw     xmm0,xmm2,0AAh
    pslld       xmm2,16
    psrld       xmm1,16
    por         xmm2,xmm1
    pabsw       xmm0,xmm0
    pabsw       xmm2,xmm2
    pmaxsw      xmm0,xmm2
    SSSE3_SumWHorizon retrd, xmm0, xmm5, xmm7
    POP_XMM
    LOAD_4_PARA_POP
    DEINIT_X86_32_PIC
    ret

;***********************************************************************
;
;int32_t WelsSampleSatd8x8_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
;
;***********************************************************************
WELS_EXTERN WelsSampleSatd8x8_sse41
%ifdef X86_32
    push        r4
    push        r5
%endif
    ; NOTE(review): push_num is set to 2 on every target although the two
    ; pushes above are X86_32-only; presumably harmless on 64-bit targets --
    ; confirm against how asm_inc.asm consumes push_num.
    %assign push_num 2
    INIT_X86_32_PIC r6
    LOAD_4_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d

    movdqa      xmm7, [pic(HSumSubDB1)]
    lea         r4, [r1+r1*2]   ; 3 * stride of the first block
    lea         r5, [r3+r3*2]   ; 3 * stride of the second block
    pxor        xmm6, xmm6      ; accumulator
    SSE41_GetSatd8x4
    lea         r0, [r0+4*r1]
    lea         r2, [r2+4*r3]
    SSE41_GetSatd8x4
    SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
    POP_XMM
    LOAD_4_PARA_POP
    DEINIT_X86_32_PIC
%ifdef X86_32
    pop         r5
    pop         r4
%endif
    ret

;***********************************************************************
;
;int32_t WelsSampleSatd8x16_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
;
;***********************************************************************
WELS_EXTERN WelsSampleSatd8x16_sse41
%ifdef X86_32
    push        r4
    push        r5
    push        r6
%endif
    ; NOTE(review): push_num is set to 3 on every target although the three
    ; pushes above are X86_32-only -- confirm against asm_inc.asm.
    %assign push_num 3
    LOAD_4_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d

    INIT_X86_32_PIC_NOPRESERVE r4
    movdqa      xmm7, [pic(HSumSubDB1)]
    DEINIT_X86_32_PIC
    lea         r4, [r1+r1*2]   ; 3 * stride of the first block
    lea         r5, [r3+r3*2]   ; 3 * stride of the second block
    pxor        xmm6, xmm6      ; accumulator
    mov         r6, 0           ; r6 = 8x4 strip counter (4 strips)
loop_get_satd_8x16:
    SSE41_GetSatd8x4
    lea         r0, [r0+4*r1]
    lea         r2, [r2+4*r3]
    inc         r6
    cmp         r6, 4
    jl          loop_get_satd_8x16
    SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
    POP_XMM
    LOAD_4_PARA_POP
%ifdef X86_32
    pop         r6
    pop         r5
    pop         r4
%endif
    ret

;***********************************************************************
;
;int32_t WelsSampleSatd16x8_sse41( uint8_t *, int32_t, uint8_t *, int32_t, );
;
;***********************************************************************
WELS_EXTERN WelsSampleSatd16x8_sse41
%ifdef X86_32
    push        r4
    push        r5
%endif
    %assign push_num 2
    INIT_X86_32_PIC r6
    LOAD_4_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    push        r0
    push
r2 1429 1430 movdqa xmm7, [pic(HSumSubDB1)] 1431 lea r4, [r1+r1*2] 1432 lea r5, [r3+r3*2] 1433 pxor xmm6, xmm6 1434 SSE41_GetSatd8x4 1435 lea r0, [r0+4*r1] 1436 lea r2, [r2+4*r3] 1437 SSE41_GetSatd8x4 1438 1439 pop r2 1440 pop r0 1441 add r0, 8 1442 add r2, 8 1443 SSE41_GetSatd8x4 1444 lea r0, [r0+4*r1] 1445 lea r2, [r2+4*r3] 1446 SSE41_GetSatd8x4 1447 SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7 1448 POP_XMM 1449 LOAD_4_PARA_POP 1450 DEINIT_X86_32_PIC 1451%ifdef X86_32 1452 pop r5 1453 pop r4 1454%endif 1455 ret 1456 1457;*********************************************************************** 1458; 1459;int32_t WelsSampleSatd16x16_sse41( uint8_t *, int32_t, uint8_t *, int32_t, ); 1460; 1461;*********************************************************************** 1462 1463WELS_EXTERN WelsSampleSatd16x16_sse41 1464%ifdef X86_32 1465 push r4 1466 push r5 1467 push r6 1468%endif 1469 %assign push_num 3 1470 LOAD_4_PARA 1471 PUSH_XMM 8 1472 SIGN_EXTENSION r1, r1d 1473 SIGN_EXTENSION r3, r3d 1474 1475 push r0 1476 push r2 1477 1478 INIT_X86_32_PIC_NOPRESERVE r4 1479 movdqa xmm7, [pic(HSumSubDB1)] 1480 DEINIT_X86_32_PIC 1481 lea r4, [r1+r1*2] 1482 lea r5, [r3+r3*2] 1483 pxor xmm6, xmm6 1484 mov r6, 0 1485loop_get_satd_16x16_left: 1486 SSE41_GetSatd8x4 1487 lea r0, [r0+4*r1] 1488 lea r2, [r2+4*r3] 1489 inc r6 1490 cmp r6, 4 1491 jl loop_get_satd_16x16_left 1492 1493 pop r2 1494 pop r0 1495 add r0, 8 1496 add r2, 8 1497 mov r6, 0 1498loop_get_satd_16x16_right: 1499 SSE41_GetSatd8x4 1500 lea r0, [r0+4*r1] 1501 lea r2, [r2+4*r3] 1502 inc r6 1503 cmp r6, 4 1504 jl loop_get_satd_16x16_right 1505 SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7 1506 POP_XMM 1507 LOAD_4_PARA_POP 1508%ifdef X86_32 1509 pop r6 1510 pop r5 1511 pop r4 1512%endif 1513 ret 1514 1515;*********************************************************************** 1516; 1517;Pixel_satd_wxh_sse41 END 1518; 1519;*********************************************************************** 1520 
;***********************************************************************
;
;Pixel_satd_wxh_avx2 BEGIN
;
;***********************************************************************

%ifdef HAVE_AVX2
; Load one 16-pixel row from each input, replicate it into both 128-bit
; lanes, apply the first horizontal butterfly stage and subtract.
; out=%1 pSrcA=%2 pSrcB=%3 HSumSubDB1_256=%4 ymm_clobber=%5
%macro AVX2_LoadDiffSatd16x1 5
    vbroadcasti128  %1, [%2]
    vpmaddubsw      %1, %1, %4      ; hadamard neighboring horizontal sums and differences
    vbroadcasti128  %5, [%3]
    vpmaddubsw      %5, %5, %4      ; hadamard neighboring horizontal sums and differences
    vpsubw          %1, %1, %5      ; diff srcA srcB
%endmacro

; Load two 8-pixel rows (four rows apart) from each input: the first row is
; broadcast over the low lane, the second over the high lane. Then apply the
; first horizontal butterfly stage and subtract.
; out=%1 pSrcA=%2 pSrcA+4*iStride=%3 pSrcB=%4 pSrcB+4*iStride=%5 HSumSubDB1_128x2=%6 ymm_clobber=%7,%8
%macro AVX2_LoadDiffSatd8x2 8
    vpbroadcastq    %1, [%2]
    vpbroadcastq    %7, [%3]
    vpblendd        %1, %1, %7, 11110000b   ; low lane = first row, high lane = second row
    vpmaddubsw      %1, %1, %6      ; hadamard neighboring horizontal sums and differences
    vpbroadcastq    %7, [%4]
    vpbroadcastq    %8, [%5]
    vpblendd        %7, %7, %8, 11110000b
    vpmaddubsw      %7, %7, %6      ; hadamard neighboring horizontal sums and differences
    vpsubw          %1, %1, %7      ; diff srcA srcB
%endmacro

; 4-point butterfly network across the four row registers.
; in/out=%1,%2,%3,%4 clobber=%5
%macro AVX2_HDMFour4x4 5
    vpsubw          %5, %1, %4      ; s3 = x0 - x3
    vpaddw          %1, %1, %4      ; s0 = x0 + x3
    vpsubw          %4, %2, %3      ; s2 = x1 - x2
    vpaddw          %2, %2, %3      ; s1 = x1 + x2
    vpsubw          %3, %1, %2      ; y2 = s0 - s1
    vpaddw          %1, %1, %2      ; y0 = s0 + s1
    vpaddw          %2, %5, %4      ; y1 = s3 + s2
    vpsubw          %4, %5, %4      ; y3 = s3 - s2
%endmacro

; Butterfly across rows, take absolute values, then finish the horizontal
; transform with the max() shortcut and add the two partial results.
; out=%1 in=%1,%2,%3,%4 clobber=%5
%macro AVX2_SatdFour4x4 5
    AVX2_HDMFour4x4 %1, %2, %3, %4, %5
    vpabsw          %1, %1
    vpabsw          %2, %2
    vpabsw          %3, %3
    vpabsw          %4, %4
    ; second stage of horizontal hadamard.
    ; utilizes that |a + b| + |a - b| = 2 * max(|a|, |b|)
    vpblendw        %5, %1, %2, 10101010b   ; even words of %1, odd words of %2
    vpslld          %2, %2, 16
    vpsrld          %1, %1, 16
    vpor            %2, %2, %1              ; the complementary words, swapped
    vpmaxuw         %2, %2, %5
    vpblendw        %5, %3, %4, 10101010b
    vpslld          %4, %4, 16
    vpsrld          %3, %3, 16
    vpor            %4, %4, %3
    vpmaxuw         %3, %5, %4
    vpaddw          %1, %2, %3
%endmacro

; Four consecutive 16-pixel rows (offsets 0, 1, 2 and 3 strides).
; out=%1 pSrcA=%2 iStrideA=%3 3*iStrideA=%4 pSrcB=%5 iStrideB=%6 3*iStrideB=%7 HSumSubDB1_256=%8 ymm_clobber=%9,%10,%11,%12
%macro AVX2_GetSatd16x4 12
    AVX2_LoadDiffSatd16x1 %1,  %2 + 0 * %3, %5 + 0 * %6, %8, %12
    AVX2_LoadDiffSatd16x1 %9,  %2 + 1 * %3, %5 + 1 * %6, %8, %12
    AVX2_LoadDiffSatd16x1 %10, %2 + 2 * %3, %5 + 2 * %6, %8, %12
    AVX2_LoadDiffSatd16x1 %11, %2 + 1 * %4, %5 + 1 * %7, %8, %12
    AVX2_SatdFour4x4      %1, %9, %10, %11, %12
%endmacro

; Eight 8-pixel rows, loaded as row pairs (0,4) (2,6) and, after stepping both
; pointers by one stride, (1,5) (3,7).
; NOTE: %2 and %5 are advanced by one stride as a side effect.
; out=%1 pSrcA=%2 iStrideA=%3 3*iStrideA=%4 pSrcB=%5 iStrideB=%6 3*iStrideB=%7 HSumSubDB1_128x2=%8 ymm_clobber=%9,%10,%11,%12,%13
%macro AVX2_GetSatd8x8 13
    AVX2_LoadDiffSatd8x2 %1,  %2 + 0 * %3, %2 + 4 * %3, %5 + 0 * %6, %5 + 4 * %6, %8, %12, %13
    AVX2_LoadDiffSatd8x2 %10, %2 + 2 * %3, %2 + 2 * %4, %5 + 2 * %6, %5 + 2 * %7, %8, %12, %13
    add                  %2, %3
    add                  %5, %6
    AVX2_LoadDiffSatd8x2 %9,  %2 + 0 * %3, %2 + 4 * %3, %5 + 0 * %6, %5 + 4 * %6, %8, %12, %13
    AVX2_LoadDiffSatd8x2 %11, %2 + 2 * %3, %2 + 2 * %4, %5 + 2 * %6, %5 + 2 * %7, %8, %12, %13
    AVX2_SatdFour4x4     %1, %9, %10, %11, %12
%endmacro

; Horizontal sum of the sixteen 16-bit lanes of y%2 into a 32-bit register.
; %2 and %3 are register names without the x/y size prefix (e.g. mm6).
; d_out=%1 mm_in=%2 mm_clobber=%3
%macro AVX2_SumWHorizon 3
    WELS_DW1_VEX    y%3
    vpmaddwd        y%2, y%2, y%3       ; word pairs -> dwords
    vextracti128    x%3, y%2, 1
    vpaddd          x%2, x%2, x%3       ; fold the high lane onto the low lane
    vpunpckhqdq     x%3, x%2, x%2
    vpaddd          x%2, x%2, x%3       ; fold the high qword onto the low qword
    vpsrldq         x%3, x%2, 4
    vpaddd          x%2, x%2, x%3       ; fold dword 1 onto dword 0
    vmovd           %1, x%2
%endmacro

;***********************************************************************
;
;int32_t WelsSampleSatd8x16_avx2( uint8_t *, int32_t, uint8_t *, int32_t );
;
;***********************************************************************

; Entry stub: sets the 8x8-tile count in r4 and joins the shared body below.
WELS_EXTERN WelsSampleSatd8x16_avx2
    %assign push_num 0
%ifdef X86_32
    push    r4
    %assign push_num 1
%endif
    mov     r4, 2                       ; loop cnt
    jmp     WelsSampleSatd8x8N_avx2

;***********************************************************************
;
;int32_t WelsSampleSatd8x8_avx2( uint8_t *, int32_t, uint8_t *, int32_t );
;
;***********************************************************************

WELS_EXTERN WelsSampleSatd8x8_avx2
    %assign push_num 0
%ifdef X86_32
    push    r4
    %assign push_num 1
%endif
    mov     r4, 1                       ; loop cnt
    ; fall through

; Shared body: r4 = number of vertically stacked 8x8 tiles to process.
; Per-tile results are accumulated as 16-bit lanes in ymm6 and reduced to a
; scalar at the end.
WelsSampleSatd8x8N_avx2:
%ifdef X86_32
    push    r5
    push    r6
    %assign push_num push_num+2
%endif
    LOAD_4_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d

    ; r5 doubles as the PIC base for the constant load only.
    INIT_X86_32_PIC_NOPRESERVE r5
    vbroadcasti128 ymm7, [pic(HSumSubDB1)]
    DEINIT_X86_32_PIC
    lea     r5, [3 * r1]
    lea     r6, [3 * r3]
    vpxor   ymm6, ymm6, ymm6            ; clear the accumulator
.loop:
    AVX2_GetSatd8x8 ymm0, r0, r1, r5, r2, r3, r6, ymm7, ymm1, ymm2, ymm3, ymm4, ymm5
    vpaddw  ymm6, ymm6, ymm0
    sub     r4, 1
    jbe     .loop_end
    ; AVX2_GetSatd8x8 already moved r0/r2 down one row; 3 + 4 more rows
    ; reach the next 8x8 tile.
    add     r0, r5
    add     r2, r6
    lea     r0, [r0 + 4 * r1]
    lea     r2, [r2 + 4 * r3]
    jmp     .loop
.loop_end:
    AVX2_SumWHorizon retrd, mm6, mm5
    vzeroupper
    POP_XMM
    LOAD_4_PARA_POP
%ifdef X86_32
    pop     r6
    pop     r5
    pop     r4
%endif
    ret

;***********************************************************************
;
;int32_t WelsSampleSatd16x16_avx2( uint8_t *, int32_t, uint8_t *, int32_t );
;
;***********************************************************************

; Entry stub: sets the 16x4-strip count in r4 and joins the shared 16x4N body.
WELS_EXTERN WelsSampleSatd16x16_avx2
    %assign push_num 0
%ifdef X86_32
    push    r4
    %assign push_num 1
%endif
    mov     r4, 4                       ; loop cnt
    jmp     WelsSampleSatd16x4N_avx2

;***********************************************************************
;
;int32_t WelsSampleSatd16x8_avx2( uint8_t *, int32_t, uint8_t *, int32_t );
;
;***********************************************************************

WELS_EXTERN WelsSampleSatd16x8_avx2
    %assign push_num 0
%ifdef X86_32
    push    r4
    %assign push_num 1
%endif
    mov     r4, 2                       ; loop cnt
    ; fall through

; Shared body: r4 = number of vertically stacked 16x4 strips to process.
; Per-strip results are accumulated as 16-bit lanes in ymm5.
WelsSampleSatd16x4N_avx2:
%ifdef X86_32
    push    r5
    push    r6
    %assign push_num push_num+2
%endif
    LOAD_4_PARA
    PUSH_XMM 7
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d

    ; Build the 256-bit weight vector: first 8 bytes of HSumSubDB1 repeated
    ; over the low lane, second 8 bytes repeated over the high lane.
    ; r5 doubles as the PIC base for these loads only.
    INIT_X86_32_PIC_NOPRESERVE r5
    vpbroadcastq xmm0, [pic(HSumSubDB1)]
    vpbroadcastq ymm6, [pic(HSumSubDB1 + 8)]
    vpblendd     ymm6, ymm0, ymm6, 11110000b
    DEINIT_X86_32_PIC
    lea     r5, [3 * r1]
    lea     r6, [3 * r3]
    vpxor   ymm5, ymm5, ymm5            ; clear the accumulator
.loop:
    AVX2_GetSatd16x4 ymm0, r0, r1, r5, r2, r3, r6, ymm6, ymm1, ymm2, ymm3, ymm4
    vpaddw  ymm5, ymm5, ymm0
    lea     r0, [r0 + 4 * r1]
    lea     r2, [r2 + 4 * r3]
    sub     r4, 1
    ja      .loop
    AVX2_SumWHorizon retrd, mm5, mm0
    vzeroupper
    POP_XMM
    LOAD_4_PARA_POP
%ifdef X86_32
    pop     r6
    pop     r5
    pop     r4
%endif
    ret

%endif

;***********************************************************************
;
;Pixel_satd_wxh_avx2 END
;
;***********************************************************************

;***********************************************************************
;
;Pixel_sad_wxh_sse2 BEGIN
;
;***********************************************************************

; Step r0/r2 down two rows, then add the SAD of the next two 16-byte rows to
; xmm0. Clobbers xmm1 and xmm2.
; The original note says the address loaded through MOVDQ ([r0]) must be
; 16-byte aligned; MOVDQ itself is defined outside this section.
%macro SSE2_GetSad2x16 0
    lea     r0,   [r0+2*r1]
    lea     r2,   [r2+2*r3]
    movdqu  xmm1, [r2]
    MOVDQ   xmm2, [r0]                  ; [r0] is expected to be 16-byte aligned
    psadbw  xmm1, xmm2
    paddw   xmm0, xmm1
    movdqu  xmm1, [r2+r3]
    MOVDQ   xmm2, [r0+r1]
    psadbw  xmm1, xmm2
    paddw   xmm0, xmm1
%endmacro


; Add the SAD of four 16-byte rows (offsets 0, 1, 2 and 3 strides) to xmm7.
; Expects r4 = 3 * r1 and r5 = 3 * r3. Pointers are not advanced.
; Clobbers xmm0, xmm1 and xmm2.
%macro SSE2_GetSad4x16 0
    ; row 0
    movdqu  xmm0, [r2]
    MOVDQ   xmm2, [r0]
    psadbw  xmm0, xmm2
    paddw   xmm7, xmm0
    ; row 1
    movdqu  xmm1, [r2+r3]
    MOVDQ   xmm2, [r0+r1]
    psadbw  xmm1, xmm2
    paddw   xmm7, xmm1
    ; row 2
    movdqu  xmm1, [r2+2*r3]
    MOVDQ   xmm2, [r0+2*r1]             ; [r0] is expected to be 16-byte aligned
    psadbw  xmm1, xmm2
    paddw   xmm7, xmm1
    ; row 3
    movdqu  xmm1, [r2+r5]
    MOVDQ   xmm2, [r0+r4]
    psadbw  xmm1, xmm2
    paddw   xmm7, xmm1
%endmacro


; Add the SAD of four 8-byte rows to xmm6. Rows 0/2 are packed into one
; register and rows 1/3 into another.
; NOTE: r0 and r2 are left pointing at row 2 (advanced by two strides).
; Clobbers xmm0-xmm3.
%macro SSE2_GetSad8x4 0
    movq    xmm0, [r0]
    movq    xmm1, [r0+r1]
    lea     r0,   [r0+2*r1]
    movhps  xmm0, [r0]
    movhps  xmm1, [r0+r1]

    movq    xmm2, [r2]
    movq    xmm3, [r2+r3]
    lea     r2,   [r2+2*r3]
    movhps  xmm2, [r2]
    movhps  xmm3, [r2+r3]
    psadbw  xmm0, xmm2
    psadbw  xmm1, xmm3
    paddw   xmm6, xmm0
    paddw   xmm6, xmm1
%endmacro

;***********************************************************************
;
;int32_t WelsSampleSad16x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t )
;The first pointer is expected to be 16-byte aligned;
;in Wels the third pointer cannot be assumed to be 16-byte aligned.
;
;***********************************************************************
; Four SSE2_GetSad4x16 passes accumulated in xmm7; the two 64-bit partial
; sums that psadbw produces are added together at the end.
WELS_EXTERN WelsSampleSad16x16_sse2
%ifdef X86_32
    push    r4
    push    r5
%endif

    %assign push_num 2
    LOAD_4_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    lea     r4, [3*r1]
    lea     r5, [3*r3]

    pxor    xmm7, xmm7                  ; clear the accumulator
%rep 3
    SSE2_GetSad4x16
    lea     r0, [r0+4*r1]
    lea     r2, [r2+4*r3]
%endrep
    SSE2_GetSad4x16                     ; last four rows, no pointer step needed

    movhlps xmm0, xmm7
    paddw   xmm0, xmm7                  ; low half + high half
    movd    retrd, xmm0
    POP_XMM
    LOAD_4_PARA_POP
%ifdef X86_32
    pop     r5
    pop     r4
%endif
    ret

;***********************************************************************
;
;int32_t WelsSampleSad16x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t )
;The first pointer is expected to be 16-byte aligned;
;in Wels the third pointer cannot be assumed to be 16-byte aligned.
;
;***********************************************************************
; Rows 0-1 are handled inline, rows 2-7 by three SSE2_GetSad2x16 passes.
; Only xmm0-xmm2 are touched, so no PUSH_XMM / POP_XMM pair is used.
WELS_EXTERN WelsSampleSad16x8_sse2
    %assign push_num 0
    LOAD_4_PARA
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    movdqu  xmm0, [r2]
    MOVDQ   xmm2, [r0]
    psadbw  xmm0, xmm2
    movdqu  xmm1, [r2+r3]
    MOVDQ   xmm2, [r0+r1]
    psadbw  xmm1, xmm2
    paddw   xmm0, xmm1

%rep 3
    SSE2_GetSad2x16
%endrep

    movhlps xmm1, xmm0
    paddw   xmm0, xmm1                  ; low half + high half
    movd    retrd, xmm0
    LOAD_4_PARA_POP
    ret



; Four SSE2_GetSad8x4 passes accumulated in xmm6. Each pass leaves r0/r2 two
; rows in, so two more rows are stepped between passes.
WELS_EXTERN WelsSampleSad8x16_sse2
    %assign push_num 0
    LOAD_4_PARA
    PUSH_XMM 7
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    pxor    xmm6, xmm6                  ; clear the accumulator

%rep 3
    SSE2_GetSad8x4
    lea     r0, [r0+2*r1]
    lea     r2, [r2+2*r3]
%endrep
    SSE2_GetSad8x4

    movhlps xmm0, xmm6
    paddw   xmm0, xmm6                  ; low half + high half
    movd    retrd, xmm0
    POP_XMM
    LOAD_4_PARA_POP
    ret


; Sets the flags for "does a %2-byte access at %1 straddle a cache line?".
; %1 is destroyed (masked in place). With %3 = 64 the mask is 0x3f and the
; compare value is 64 - %2, so a following "jle" means "no split".
%macro CACHE_SPLIT_CHECK 3 ; address, width, cacheline
and    %1,  0x1f|(%3>>1)
cmp    %1,  (32-%2)|(%3>>1)
%endmacro

; One step of the cache-split path of WelsSampleSad8x8_sse21: SAD of two
; 8-byte rows, added to xmm7.
;   r0/r1 = first input / stride (read directly)
;   r2    = second input rounded down to an 8-byte boundary, r5 = r2 + 8
;   xmm5  = misalignment in bits, xmm6 = 64 - misalignment in bits
; Each row of the second input is rebuilt from the two aligned qwords
; around it with a shift-and-or. Clobbers xmm0-xmm2.
%macro SSE2_Sad8x2_CacheSplit 0
    movq    xmm0, [r0]
    movhps  xmm0, [r0+r1]

    movq    xmm1, [r2]
    movq    xmm2, [r5]
    movhps  xmm1, [r2+r3]
    movhps  xmm2, [r5+r3]
    psrlq   xmm1, xmm5
    psllq   xmm2, xmm6
    por     xmm1, xmm2

    psadbw  xmm0, xmm1
    paddw   xmm7, xmm0
%endmacro

; 8x8 SAD with two code paths selected by CACHE_SPLIT_CHECK on the third
; argument: a path that only issues 8-byte-aligned loads from that input when
; an 8-byte row would straddle a 64-byte line, and the plain SSE2_GetSad8x4
; path otherwise.
WELS_EXTERN WelsSampleSad8x8_sse21
    %assign push_num 0
    mov     r2,  arg3
    push    r2                          ; CACHE_SPLIT_CHECK destroys r2
    CACHE_SPLIT_CHECK r2, 8, 64
    jle     near .pixel_sad_8x8_nsplit
    pop     r2
%ifdef X86_32
    push    r3
    push    r4
    push    r5
%endif
    %assign push_num 3
    PUSH_XMM 8
    mov     r0,  arg1
    mov     r1,  arg2
    SIGN_EXTENSION r1, r1d
    pxor    xmm7, xmm7                  ; clear the accumulator

    ;ecx r2, edx r4, edi r5

    mov     r5,  r2
    and     r5,  0x07                   ; r5 = byte offset inside the qword
    sub     r2,  r5                     ; r2 = aligned-down address
    mov     r4,  8
    sub     r4,  r5                     ; r4 = 8 - offset

    shl     r5,  3                      ; bytes -> bits
    shl     r4,  3
    movd    xmm5, r5d                   ; right-shift count
    movd    xmm6, r4d                   ; left-shift count
    mov     r5,  8
    add     r5,  r2                     ; r5 = next aligned qword
    mov     r3,  arg4
    SIGN_EXTENSION r3, r3d

    SSE2_Sad8x2_CacheSplit              ; rows 0-1
%rep 3
    lea     r0,  [r0+2*r1]
    lea     r2,  [r2+2*r3]
    lea     r5,  [r5+2*r3]

    SSE2_Sad8x2_CacheSplit              ; rows 2-3, 4-5, 6-7
%endrep

    movhlps xmm0, xmm7
    paddw   xmm0, xmm7                  ; low half + high half
    movd    retrd, xmm0
    POP_XMM
%ifdef X86_32
    pop     r5
    pop     r4
    pop     r3
%endif
    jmp     .return

.pixel_sad_8x8_nsplit:

    pop     r2                          ; balance the push before the check
    %assign push_num 0
    LOAD_4_PARA
    PUSH_XMM 7
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    pxor    xmm6, xmm6                  ; clear the accumulator
    SSE2_GetSad8x4
    lea     r0,  [r0+2*r1]
    lea     r2,  [r2+2*r3]
    SSE2_GetSad8x4
    movhlps xmm0, xmm6
    paddw   xmm0, xmm6                  ; low half + high half
    movd    retrd, xmm0
    POP_XMM
    LOAD_4_PARA_POP
.return:
    ret


;***********************************************************************
;
;Pixel_sad_wxh_sse2 END
;
;***********************************************************************


;***********************************************************************
;
;Pixel_sad_4_wxh_sse2 BEGIN
;
;***********************************************************************


; One 16-byte row step of the four-neighbour SAD.
;   %1 = source row above (s-1l), %2 = current source row (s),
;   %3 = source row below (s+1l), %4 = reference row loaded from [%5],
;   %5 = address expression of that reference row.
; Accumulators: xmm4 (ref - stride), xmm5 (ref + stride), xmm6 (ref - 1),
; xmm7 (ref + 1). %1 and %4 are destroyed.
%macro SSE2_Get4LW16Sad 5 ;s-1l, s, s+1l, d, address
    psadbw  %1,   %4                    ; row above vs. this ref row
    paddw   xmm5, %1
    psadbw  %4,   %3                    ; row below vs. this ref row
    paddw   xmm4, %4
    movdqu  %4,   [%5-1]
    psadbw  %4,   %2
    paddw   xmm6, %4
    movdqu  %4,   [%5+1]
    psadbw  %4,   %2
    paddw   xmm7, %4
%endmacro

; Advance r0/r2 by two rows and run SSE2_Get4LW16Sad for both of them.
; %1/%2/%3 are the three rotating source-row registers on entry (above,
; current, below-to-be-loaded); on exit the roles have rotated by two rows.
; Clobbers xmm3.
%macro SSE2_SadFour16_RowPair 3
    lea     r0,   [r0+2*r1]
    lea     r2,   [r2+2*r3]
    movdqa  %3,   [r0]
    movdqu  xmm3, [r2]
    SSE2_Get4LW16Sad %1, %2, %3, xmm3, r2
    movdqa  %1,   [r0+r1]
    movdqu  xmm3, [r2+r3]
    SSE2_Get4LW16Sad %2, %3, %1, xmm3, r2+r3
%endmacro

; 8-wide helper, two rows at a time (xmm0 = current two source rows):
; adds the left (xmm6), right (xmm7) and below (xmm5) SADs for those rows.
; Advances r0 and r2 by two rows; leaves the two reference rows just used for
; "below" in xmm3. Destroys xmm0 and xmm1.
%macro SSE2_SadFour8_SideAndDown 0
    movq    xmm1, [r2+r3-1]
    movq    xmm3, [r2+r3+1]

    lea     r0,   [r0+2*r1]
    lea     r2,   [r2+2*r3]
    movhps  xmm1, [r2-1]
    movhps  xmm3, [r2+1]

    psadbw  xmm1, xmm0
    paddw   xmm6, xmm1
    psadbw  xmm3, xmm0
    paddw   xmm7, xmm3

    movq    xmm3, [r2]
    movhps  xmm3, [r2+r3]
    psadbw  xmm0, xmm3
    paddw   xmm5, xmm0
%endmacro

; 8-wide helper: load the next two source rows into xmm0 and add their
; "above" SAD (xmm4), reusing the reference rows left in xmm3 by
; SSE2_SadFour8_SideAndDown. Destroys xmm3.
%macro SSE2_SadFour8_NextUp 0
    movq    xmm0, [r0]
    movhps  xmm0, [r0+r1]
    psadbw  xmm3, xmm0
    paddw   xmm4, xmm3
%endmacro

; Fold the two psadbw halves of xmm4..xmm7 and store the four results to [r4]
; as dwords in the order: -stride, +stride, -1, +1.
; [r4] must be 16-byte aligned (movdqa). Destroys xmm0.
%macro SSE2_SadFour_Reduce4AndStore 0
    movhlps     xmm0, xmm4
    paddw       xmm4, xmm0
    movhlps     xmm0, xmm5
    paddw       xmm5, xmm0
    movhlps     xmm0, xmm6
    paddw       xmm6, xmm0
    movhlps     xmm0, xmm7
    paddw       xmm7, xmm0
    punpckldq   xmm4, xmm5
    punpckldq   xmm6, xmm7
    punpcklqdq  xmm4, xmm6
    movdqa      [r4], xmm4
%endmacro

; SAD of a 16x16 block at r0 (stride r1, loaded with movdqa, so rows must be
; 16-byte aligned) against the four one-pixel-shifted neighbours of the block
; at r2 (stride r3). Results are written to [r4].
WELS_EXTERN WelsSampleSadFour16x16_sse2
    %assign push_num 0
    LOAD_5_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    pxor    xmm4, xmm4    ;sad pRefMb-i_stride_ref
    pxor    xmm5, xmm5    ;sad pRefMb+i_stride_ref
    pxor    xmm6, xmm6    ;sad pRefMb-1
    pxor    xmm7, xmm7    ;sad pRefMb+1

    ; Rows 0 and 1: "above" SADs, plus left/right for row 0.
    movdqa  xmm0, [r0]
    sub     r2,   r3                    ; r2 = reference row -1
    movdqu  xmm3, [r2]
    psadbw  xmm3, xmm0
    paddw   xmm4, xmm3

    movdqa  xmm1, [r0+r1]
    movdqu  xmm3, [r2+r3]
    psadbw  xmm3, xmm1
    paddw   xmm4, xmm3

    movdqu  xmm2, [r2+r3-1]
    psadbw  xmm2, xmm0
    paddw   xmm6, xmm2

    movdqu  xmm3, [r2+r3+1]
    psadbw  xmm3, xmm0
    paddw   xmm7, xmm3

    ; Seven row pairs; the three source-row registers rotate with period 3.
    SSE2_SadFour16_RowPair xmm0, xmm1, xmm2
    SSE2_SadFour16_RowPair xmm2, xmm0, xmm1
    SSE2_SadFour16_RowPair xmm1, xmm2, xmm0
    SSE2_SadFour16_RowPair xmm0, xmm1, xmm2
    SSE2_SadFour16_RowPair xmm2, xmm0, xmm1
    SSE2_SadFour16_RowPair xmm1, xmm2, xmm0
    SSE2_SadFour16_RowPair xmm0, xmm1, xmm2

    ; Tail: xmm2 = second-to-last source row, xmm0 = last source row.
    lea     r2,   [r2+2*r3]
    movdqu  xmm3, [r2]
    psadbw  xmm2, xmm3
    paddw   xmm5, xmm2

    movdqu  xmm2, [r2-1]
    psadbw  xmm2, xmm0
    paddw   xmm6, xmm2

    movdqu  xmm3, [r2+1]
    psadbw  xmm3, xmm0
    paddw   xmm7, xmm3

    movdqu  xmm3, [r2+r3]
    psadbw  xmm0, xmm3
    paddw   xmm5, xmm0

    SSE2_SadFour_Reduce4AndStore
    POP_XMM
    LOAD_5_PARA_POP
    ret


; 16x8 variant of WelsSampleSadFour16x16_sse2: same head, three row pairs,
; and a tail with the register roles as left by the third pair.
WELS_EXTERN WelsSampleSadFour16x8_sse2
    %assign push_num 0
    LOAD_5_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    pxor    xmm4, xmm4    ;sad pRefMb-i_stride_ref
    pxor    xmm5, xmm5    ;sad pRefMb+i_stride_ref
    pxor    xmm6, xmm6    ;sad pRefMb-1
    pxor    xmm7, xmm7    ;sad pRefMb+1

    ; Rows 0 and 1: "above" SADs, plus left/right for row 0.
    movdqa  xmm0, [r0]
    sub     r2,   r3                    ; r2 = reference row -1
    movdqu  xmm3, [r2]
    psadbw  xmm3, xmm0
    paddw   xmm4, xmm3

    movdqa  xmm1, [r0+r1]
    movdqu  xmm3, [r2+r3]
    psadbw  xmm3, xmm1
    paddw   xmm4, xmm3

    movdqu  xmm2, [r2+r3-1]
    psadbw  xmm2, xmm0
    paddw   xmm6, xmm2

    movdqu  xmm3, [r2+r3+1]
    psadbw  xmm3, xmm0
    paddw   xmm7, xmm3

    SSE2_SadFour16_RowPair xmm0, xmm1, xmm2
    SSE2_SadFour16_RowPair xmm2, xmm0, xmm1
    SSE2_SadFour16_RowPair xmm1, xmm2, xmm0

    ; Tail: xmm0 = second-to-last source row, xmm1 = last source row.
    lea     r2,   [r2+2*r3]
    movdqu  xmm3, [r2]
    psadbw  xmm0, xmm3
    paddw   xmm5, xmm0

    movdqu  xmm0, [r2-1]
    psadbw  xmm0, xmm1
    paddw   xmm6, xmm0

    movdqu  xmm3, [r2+1]
    psadbw  xmm3, xmm1
    paddw   xmm7, xmm3

    movdqu  xmm3, [r2+r3]
    psadbw  xmm1, xmm3
    paddw   xmm5, xmm1

    SSE2_SadFour_Reduce4AndStore
    POP_XMM
    LOAD_5_PARA_POP
    ret

; 8x16 four-neighbour SAD: two 8-byte rows are packed per xmm register, so
; the block is walked in eight two-row steps.
WELS_EXTERN WelsSampleSadFour8x16_sse2
    %assign push_num 0
    LOAD_5_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    pxor    xmm4, xmm4    ;sad pRefMb-i_stride_ref
    pxor    xmm5, xmm5    ;sad pRefMb+i_stride_ref
    pxor    xmm6, xmm6    ;sad pRefMb-1
    pxor    xmm7, xmm7    ;sad pRefMb+1

    ; Source rows 0-1 against reference rows -1 and 0 ("above").
    movq    xmm0, [r0]
    movhps  xmm0, [r0+r1]
    sub     r2,   r3                    ; r2 = reference row -1
    movq    xmm3, [r2]
    movhps  xmm3, [r2+r3]
    psadbw  xmm3, xmm0
    paddw   xmm4, xmm3

%rep 7
    SSE2_SadFour8_SideAndDown

    SSE2_SadFour8_NextUp

%endrep
    SSE2_SadFour8_SideAndDown           ; last two rows: nothing left to load

    SSE2_SadFour_Reduce4AndStore
    POP_XMM
    LOAD_5_PARA_POP
    ret


; 8x8 four-neighbour SAD: same scheme as the 8x16 variant with four two-row
; steps.
WELS_EXTERN WelsSampleSadFour8x8_sse2
    %assign push_num 0
    LOAD_5_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    pxor    xmm4, xmm4    ;sad pRefMb-i_stride_ref
    pxor    xmm5, xmm5    ;sad pRefMb+i_stride_ref
    pxor    xmm6, xmm6    ;sad pRefMb-1
    pxor    xmm7, xmm7    ;sad pRefMb+1

    ; Source rows 0-1 against reference rows -1 and 0 ("above").
    movq    xmm0, [r0]
    movhps  xmm0, [r0+r1]
    sub     r2,   r3                    ; r2 = reference row -1
    movq    xmm3, [r2]
    movhps  xmm3, [r2+r3]
    psadbw  xmm3, xmm0
    paddw   xmm4, xmm3

%rep 3
    SSE2_SadFour8_SideAndDown

    SSE2_SadFour8_NextUp

%endrep
    SSE2_SadFour8_SideAndDown           ; last two rows: nothing left to load

    SSE2_SadFour_Reduce4AndStore
    POP_XMM
    LOAD_5_PARA_POP
    ret

; 4x4 four-neighbour SAD: the whole 4x4 source block fits in xmm0, and each
; shifted 4x4 reference block is gathered into one register before a single
; psadbw per direction. Results go to [r4] as dwords in the order
; -stride, +stride, -1, +1; [r4] must be 16-byte aligned (movdqa).
WELS_EXTERN WelsSampleSadFour4x4_sse2
    %assign push_num 0
    LOAD_5_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d

    ; xmm0 = source rows 0..3
    movd        xmm0, [r0]
    movd        xmm1, [r0+r1]
    lea         r0,   [r0+2*r1]
    movd        xmm2, [r0]
    movd        xmm3, [r0+r1]
    punpckldq   xmm0, xmm1
    punpckldq   xmm2, xmm3
    punpcklqdq  xmm0, xmm2

    sub         r2,   r3                ; r2 = reference row -1
    movd        xmm1, [r2]
    movd        xmm2, [r2+r3]
    punpckldq   xmm1, xmm2              ; ref rows -1, 0
    movd        xmm2, [r2+r3-1]         ; row 0, shifted left
    movd        xmm3, [r2+r3+1]         ; row 0, shifted right

    lea         r2,   [r2+2*r3]         ; r2 = reference row 1

    movd        xmm4, [r2]
    movd        xmm5, [r2-1]
    punpckldq   xmm2, xmm5
    movd        xmm5, [r2+1]
    punpckldq   xmm3, xmm5

    movd        xmm5, [r2+r3]
    punpckldq   xmm4, xmm5              ; ref rows 1, 2

    punpcklqdq  xmm1, xmm4 ;-L

    movd        xmm5, [r2+r3-1]
    movd        xmm6, [r2+r3+1]

    lea         r2,   [r2+2*r3]         ; r2 = reference row 3
    movd        xmm7, [r2-1]
    punpckldq   xmm5, xmm7
    punpcklqdq  xmm2, xmm5 ;-1
    movd        xmm7, [r2+1]
    punpckldq   xmm6, xmm7
    punpcklqdq  xmm3, xmm6 ;+1
    movd        xmm6, [r2]
    movd        xmm7, [r2+r3]
    punpckldq   xmm6, xmm7
    punpcklqdq  xmm4, xmm6 ;+L
    psadbw      xmm1, xmm0
    psadbw      xmm2, xmm0
    psadbw      xmm3, xmm0
    psadbw      xmm4, xmm0

    ; Fold the two psadbw halves of each result and pack the four dwords.
    movhlps     xmm0, xmm1
    paddw       xmm1, xmm0
    movhlps     xmm0, xmm2
    paddw       xmm2, xmm0
    movhlps     xmm0, xmm3
    paddw       xmm3, xmm0
    movhlps     xmm0, xmm4
    paddw       xmm4, xmm0
    punpckldq   xmm1, xmm4
    punpckldq   xmm2, xmm3
    punpcklqdq  xmm1, xmm2
    movdqa      [r4], xmm1
    POP_XMM
    LOAD_5_PARA_POP
    ret

;***********************************************************************
;
;Pixel_sad_4_wxh_sse2 END
;
;***********************************************************************

;***********************************************************************
;   int32_t WelsSampleSad4x4_mmx (uint8_t *, int32_t, uint8_t *, int32_t )
;***********************************************************************
; 4x4 SAD using MMX registers: two 4-byte rows are packed per 64-bit
; register, so the block takes two psadbw operations. WELSEMMS is invoked
; before returning (defined outside this section; presumably it clears the
; MMX state).
WELS_EXTERN WelsSampleSad4x4_mmx
    %assign push_num 0
    LOAD_4_PARA
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d

    ; rows 0-1
    movd        mm0, [r0]
    movd        mm1, [r0+r1]
    punpckldq   mm0, mm1

    movd        mm3, [r2]
    movd        mm4, [r2+r3]
    punpckldq   mm3, mm4
    psadbw      mm0, mm3

    lea         r0,  [r0+2*r1]
    lea         r2,  [r2+2*r3]

    ; rows 2-3
    movd        mm1, [r0]
    movd        mm2, [r0+r1]
    punpckldq   mm1, mm2

    movd        mm3, [r2]
    movd        mm4, [r2+r3]
    punpckldq   mm3, mm4
    psadbw      mm1, mm3
    paddw       mm0, mm1

    movd        retrd, mm0

    WELSEMMS
    LOAD_4_PARA_POP
    ret