;*!
;* \copy
;*     Copyright (c)  2009-2013, Cisco Systems
;*     All rights reserved.
;*
;*     Redistribution and use in source and binary forms, with or without
;*     modification, are permitted provided that the following conditions
;*     are met:
;*
;*        * Redistributions of source code must retain the above copyright
;*          notice, this list of conditions and the following disclaimer.
;*
;*        * Redistributions in binary form must reproduce the above copyright
;*          notice, this list of conditions and the following disclaimer in
;*          the documentation and/or other materials provided with the
;*          distribution.
;*
;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;*     POSSIBILITY OF SUCH DAMAGE.
;*
;*
;*  sse2inc.asm
;*
;*  Abstract
;*      macro and constant
;*
;*  History
;*      8/5/2009 Created
;*
;*
;*************************************************************************/
;***********************************************************************
; Options, for DEBUG
;***********************************************************************

; MOVDQ: 128-bit move mnemonic used by including code. Change the
; condition to 0 to assemble with the unaligned form (movdqu) instead.
%if 1
    %define MOVDQ movdqa
%else
    %define MOVDQ movdqu
%endif

; WELSEMMS: expands to emms; change the condition to 0 to make it empty.
%if 1
    %define WELSEMMS emms
%else
    %define WELSEMMS
%endif


;***********************************************************************
; Macros
;***********************************************************************
;
; Exactly one of WIN64 / UNIX64 / X86_32 is expected to be defined by the
; build. Each branch below provides the same set of names:
;   arg1..arg12    operand for the Nth function argument (register or
;                  stack slot; stack slots depend on push_num, which the
;                  including file must keep equal to the number of
;                  pointer-sized words pushed since function entry)
;   arg1d..arg12d  32-bit register for register arguments, otherwise the
;                  same stack operand as argN
;   r0..r6         scratch registers (r0.. hold the leading arguments once
;                  LOAD_n_PARA has run), r7 = stack pointer
;   rNd/rNw/rNb    32/16/8-bit views of those registers
;                  (rNw is not defined for N=5, rNb only for N=0..3)
;   PUSHRFLAGS / POPRFLAGS, retrq / retrd (return-value register)

%ifdef WIN64 ; Windows x64 ;************************************

DEFAULT REL

BITS 64

; First four arguments are in registers; the rest are read from the stack.
; Offset 40 = 8 (return address) + 32 (caller-allocated shadow space).
%define arg1    rcx
%define arg2    rdx
%define arg3    r8
%define arg4    r9
%define arg5    [rsp + push_num*8 + 40]
%define arg6    [rsp + push_num*8 + 48]
%define arg7    [rsp + push_num*8 + 56]
%define arg8    [rsp + push_num*8 + 64]
%define arg9    [rsp + push_num*8 + 72]
%define arg10   [rsp + push_num*8 + 80]
%define arg11   [rsp + push_num*8 + 88]
%define arg12   [rsp + push_num*8 + 96]

%define arg1d   ecx
%define arg2d   edx
%define arg3d   r8d
%define arg4d   r9d
%define arg5d   arg5
%define arg6d   arg6
%define arg7d   arg7
%define arg8d   arg8
%define arg9d   arg9
%define arg10d  arg10
%define arg11d  arg11
%define arg12d  arg12

%define r0      rcx
%define r1      rdx
%define r2      r8
%define r3      r9
%define r4      rax
%define r5      r10
%define r6      r11
%define r7      rsp

%define r0d     ecx
%define r1d     edx
%define r2d     r8d
%define r3d     r9d
%define r4d     eax
%define r5d     r10d
%define r6d     r11d

%define r0w     cx
%define r1w     dx
%define r2w     r8w
%define r3w     r9w
%define r4w     ax
%define r6w     r11w

; NASM spells the low byte of r8..r15 as r8b..r15b; the Intel-manual
; spelling r8l/r9l is not a register name to the assembler.
%define r0b     cl
%define r1b     dl
%define r2b     r8b
%define r3b     r9b

%define PUSHRFLAGS  pushfq
%define POPRFLAGS   popfq
%define retrq       rax
%define retrd       eax

%elifdef UNIX64 ; Unix x64 ;************************************

DEFAULT REL

BITS 64

%ifidn __OUTPUT_FORMAT__,elf64
SECTION .note.GNU-stack noalloc noexec nowrite progbits ; Mark the stack as non-executable
%endif

; First six arguments are in registers; the rest are read from the stack
; just above the return address.
%define arg1    rdi
%define arg2    rsi
%define arg3    rdx
%define arg4    rcx
%define arg5    r8
%define arg6    r9
%define arg7    [rsp + push_num*8 + 8]
%define arg8    [rsp + push_num*8 + 16]
%define arg9    [rsp + push_num*8 + 24]
%define arg10   [rsp + push_num*8 + 32]
%define arg11   [rsp + push_num*8 + 40]
%define arg12   [rsp + push_num*8 + 48]

%define arg1d   edi
%define arg2d   esi
%define arg3d   edx
%define arg4d   ecx
%define arg5d   r8d
%define arg6d   r9d
%define arg7d   arg7
%define arg8d   arg8
%define arg9d   arg9
%define arg10d  arg10
%define arg11d  arg11
%define arg12d  arg12

%define r0      rdi
%define r1      rsi
%define r2      rdx
%define r3      rcx
%define r4      r8
%define r5      r9
%define r6      r10
%define r7      rsp

%define r0d     edi
%define r1d     esi
%define r2d     edx
%define r3d     ecx
%define r4d     r8d
%define r5d     r9d
%define r6d     r10d

%define r0w     di
%define r1w     si
%define r2w     dx
%define r3w     cx
%define r4w     r8w
%define r6w     r10w

%define r0b     dil
%define r1b     sil
%define r2b     dl
%define r3b     cl

%define PUSHRFLAGS  pushfq
%define POPRFLAGS   popfq
%define retrq       rax
%define retrd       eax

%elifdef X86_32 ; X86_32 ;************************************

BITS 32

%ifidn __OUTPUT_FORMAT__,elf
SECTION .note.GNU-stack noalloc noexec nowrite progbits ; Mark the stack as non-executable
%endif

; All arguments are read from the stack just above the return address.
%define arg1    [esp + push_num*4 + 4]
%define arg2    [esp + push_num*4 + 8]
%define arg3    [esp + push_num*4 + 12]
%define arg4    [esp + push_num*4 + 16]
%define arg5    [esp + push_num*4 + 20]
%define arg6    [esp + push_num*4 + 24]
%define arg7    [esp + push_num*4 + 28]
%define arg8    [esp + push_num*4 + 32]
%define arg9    [esp + push_num*4 + 36]
%define arg10   [esp + push_num*4 + 40]
%define arg11   [esp + push_num*4 + 44]
%define arg12   [esp + push_num*4 + 48]

%define arg1d   arg1
%define arg2d   arg2
%define arg3d   arg3
%define arg4d   arg4
%define arg5d   arg5
%define arg6d   arg6
%define arg7d   arg7
%define arg8d   arg8
%define arg9d   arg9
%define arg10d  arg10
%define arg11d  arg11
%define arg12d  arg12

%define r0      eax
%define r1      ecx
%define r2      edx
%define r3      ebx
%define r4      esi
%define r5      edi
%define r6      ebp
%define r7      esp

%define r0d     eax
%define r1d     ecx
%define r2d     edx
%define r3d     ebx
%define r4d     esi
%define r5d     edi
%define r6d     ebp

%define r0w     ax
%define r1w     cx
%define r2w     dx
%define r3w     bx
%define r4w     si
%define r6w     bp

%define r0b     al
%define r1b     cl
%define r2b     dl
%define r3b     bl

%define PUSHRFLAGS  pushfd
%define POPRFLAGS   popfd
%define retrq       eax     ; 32-bit mode does not support 64-bit registers
%define retrd       eax

%endif

; LOAD_PARA dst, src
; Plain move of src into dst.
%macro LOAD_PARA 2
    mov     %1, %2
%endmacro

;-----------------------------------------------------------------------
; LOAD_n_PARA (n = 1..7)
; Make the first n function arguments available in r0..r(n-1).
;   X86_32: loads every argument from its stack slot (arg1..argn). For
;           n >= 4 it first pushes r3..r(n-1) and adds the number of
;           pushes to push_num; pair with LOAD_n_PARA_POP before returning.
;   WIN64:  loads arguments 5..n from the stack into r4..r(n-1);
;           r0..r3 are already the argument registers.
;   UNIX64: only n = 7 emits code (argument 7 from the stack into r6);
;           r0..r5 are already the argument registers.
; The stack operands are the argN defines above, so push_num must be
; defined (and correct) at the point of use.
;-----------------------------------------------------------------------
%macro LOAD_1_PARA 0
    %ifdef X86_32
        mov     r0, arg1
    %endif
%endmacro

%macro LOAD_2_PARA 0
    %ifdef X86_32
        mov     r0, arg1
        mov     r1, arg2
    %endif
%endmacro

%macro LOAD_3_PARA 0
    %ifdef X86_32
        mov     r0, arg1
        mov     r1, arg2
        mov     r2, arg3
    %endif
%endmacro

%macro LOAD_4_PARA 0
    %ifdef X86_32
        push    r3
        %assign push_num push_num+1     ; argN offsets now skip the saved r3
        mov     r0, arg1
        mov     r1, arg2
        mov     r2, arg3
        mov     r3, arg4
    %endif
%endmacro

%macro LOAD_5_PARA 0
    %ifdef X86_32
        push    r3
        push    r4
        %assign push_num push_num+2
        mov     r0, arg1
        mov     r1, arg2
        mov     r2, arg3
        mov     r3, arg4
        mov     r4, arg5
    %elifdef WIN64
        mov     r4, arg5
    %endif
%endmacro

%macro LOAD_6_PARA 0
    %ifdef X86_32
        push    r3
        push    r4
        push    r5
        %assign push_num push_num+3
        mov     r0, arg1
        mov     r1, arg2
        mov     r2, arg3
        mov     r3, arg4
        mov     r4, arg5
        mov     r5, arg6
    %elifdef WIN64
        mov     r4, arg5
        mov     r5, arg6
    %endif
%endmacro

%macro LOAD_7_PARA 0
    %ifdef X86_32
        push    r3
        push    r4
        push    r5
        push    r6
        %assign push_num push_num+4
        mov     r0, arg1
        mov     r1, arg2
        mov     r2, arg3
        mov     r3, arg4
        mov     r4, arg5
        mov     r5, arg6
        mov     r6, arg7
    %elifdef WIN64
        mov     r4, arg5
        mov     r5, arg6
        mov     r6, arg7
    %elifdef UNIX64
        mov     r6, arg7
    %endif
%endmacro



;-----------------------------------------------------------------------
; LOAD_n_PARA_POP (n = 4..7)
; X86_32 only: pops the registers pushed by the matching LOAD_n_PARA, in
; reverse order. Emits nothing on the 64-bit targets. push_num is left
; unchanged by these macros.
;-----------------------------------------------------------------------
%macro LOAD_4_PARA_POP 0
    %ifdef X86_32
        pop     r3
    %endif
%endmacro

%macro LOAD_5_PARA_POP 0
    %ifdef X86_32
        pop     r4
        pop     r3
    %endif
%endmacro

%macro LOAD_6_PARA_POP 0
    %ifdef X86_32
        pop     r5
        pop     r4
        pop     r3
    %endif
%endmacro

%macro LOAD_7_PARA_POP 0
    %ifdef X86_32
        pop     r6
        pop     r5
        pop     r4
        pop     r3
    %endif
%endmacro

;-----------------------------------------------------------------------
; PUSH_XMM n
; WIN64 only (xmm6-xmm15 are callee-saved there); emits nothing elsewhere.
; Records n in xmm_num_regs. When n > 6, reserves 16*(n-6) bytes of stack
; and stores xmm6..xmm(n-1) into it with unaligned moves. If push_num is
; defined it is advanced by 2 per saved register (16 bytes = two 8-byte
; words) so the stack-based argN operands stay valid.
;-----------------------------------------------------------------------
%macro PUSH_XMM 1
    %ifdef WIN64
        %assign xmm_num_regs %1
        %if xmm_num_regs > 6
            %ifdef push_num
                %assign push_num push_num+2*(%1-6)
            %endif
            sub     rsp, 16*(%1 - 6)
            movdqu  [rsp], xmm6
        %endif
        %if xmm_num_regs > 7
            movdqu  [rsp+16], xmm7
        %endif
        %if xmm_num_regs > 8
            movdqu  [rsp+32], xmm8
        %endif
        %if xmm_num_regs > 9
            movdqu  [rsp+48], xmm9
        %endif
        %if xmm_num_regs > 10
            movdqu  [rsp+64], xmm10
        %endif
        %if xmm_num_regs > 11
            movdqu  [rsp+80], xmm11
        %endif
        %if xmm_num_regs > 12
            movdqu  [rsp+96], xmm12
        %endif
        %if xmm_num_regs > 13
            movdqu  [rsp+112], xmm13
        %endif
        %if xmm_num_regs > 14
            movdqu  [rsp+128], xmm14
        %endif
        %if xmm_num_regs > 15
            movdqu  [rsp+144], xmm15
        %endif
    %endif
%endmacro

;-----------------------------------------------------------------------
; POP_XMM
; WIN64 only: reloads the registers stored by the most recent PUSH_XMM
; (count taken from xmm_num_regs) and releases the stack area. rsp must
; have the value it had right after PUSH_XMM. push_num is not adjusted
; here.
;-----------------------------------------------------------------------
%macro POP_XMM 0
    %ifdef WIN64
        %if xmm_num_regs > 15
            movdqu  xmm15, [rsp+144]
        %endif
        %if xmm_num_regs > 14
            movdqu  xmm14, [rsp+128]
        %endif
        %if xmm_num_regs > 13
            movdqu  xmm13, [rsp+112]
        %endif
        %if xmm_num_regs > 12
            movdqu  xmm12, [rsp+96]
        %endif
        %if xmm_num_regs > 11
            movdqu  xmm11, [rsp+80]
        %endif
        %if xmm_num_regs > 10
            movdqu  xmm10, [rsp+64]
        %endif
        %if xmm_num_regs > 9
            movdqu  xmm9, [rsp+48]
        %endif
        %if xmm_num_regs > 8
            movdqu  xmm8, [rsp+32]
        %endif
        %if xmm_num_regs > 7
            movdqu  xmm7, [rsp+16]
        %endif
        %if xmm_num_regs > 6
            movdqu  xmm6, [rsp]
            add     rsp, 16*(xmm_num_regs - 6)
        %endif
    %endif
%endmacro

; SIGN_EXTENSION dst, src
; 64-bit targets: movsxd dst, src. Emits nothing on X86_32.
%macro SIGN_EXTENSION 2
    %ifndef X86_32
        movsxd  %1, %2
    %endif
%endmacro

; SIGN_EXTENSIONW dst, src
; 64-bit targets: movsx dst, src. Emits nothing on X86_32.
%macro SIGN_EXTENSIONW 2
    %ifndef X86_32
        movsx   %1, %2
    %endif
%endmacro

; ZERO_EXTENSION reg
; 64-bit targets: moves the operand onto itself as a dword. With a 32-bit
; register operand this clears the upper half of the containing 64-bit
; register (presumably the intended use -- confirm against callers).
; Emits nothing on X86_32.
%macro ZERO_EXTENSION 1
    %ifndef X86_32
        mov     dword %1, %1
    %endif
%endmacro

;-----------------------------------------------------------------------
; WELS_EXTERN name
; Aligns to 16 bytes (nop padding), declares `name` global and emits its
; label. If PREFIX is defined the exported symbol is `_name` and `name`
; is defined as an alias for it. If WELS_PRIVATE_EXTERN is defined, its
; value is appended to the global declaration as the symbol qualifier.
;-----------------------------------------------------------------------
%macro WELS_EXTERN 1
    ALIGN 16, nop
    %ifdef PREFIX
        %ifdef WELS_PRIVATE_EXTERN
            global _%1: WELS_PRIVATE_EXTERN
        %else
            global _%1
        %endif
        %define %1 _%1
    %else
        %ifdef WELS_PRIVATE_EXTERN
            global %1: WELS_PRIVATE_EXTERN
        %else
            global %1
        %endif
    %endif
    %1:
%endmacro

; WELS_AbsW val, tmp
; Per-word absolute value: val = max(val, 0 - val). tmp is clobbered
; (left holding the word-wise negation of the input).
%macro WELS_AbsW 2
    pxor    %2, %2
    psubw   %2, %1
    pmaxsw  %1, %2
%endmacro

; MMX_XSwap suffix, a, b, tmp
; Interleaves a and b at the granularity selected by suffix (appended to
; punpckl/punpckh, e.g. wd or dq): tmp = high interleave, a = low
; interleave. b is unchanged.
%macro MMX_XSwap 4
    movq        %4, %2
    punpckh%1   %4, %3
    punpckl%1   %2, %3
%endmacro

; MMX_Trans4x4W r1, r2, r3, r4, tmp
; 4x4 word transpose built from four MMX_XSwap steps.
; With in = mm1, mm2, mm3, mm4 and tmp = mm5: pOut mm1, mm4, mm5, mm3
%macro MMX_Trans4x4W 5
    MMX_XSwap   wd, %1, %2, %5
    MMX_XSwap   wd, %3, %4, %2
    MMX_XSwap   dq, %1, %3, %4
    MMX_XSwap   dq, %5, %2, %3
%endmacro

; for TRANSPOSE
; SSE2_XSawp suffix, a, b, tmp
; Interleaves a and b at the granularity selected by suffix (appended to
; punpckl/punpckh): a = low interleave, tmp = high interleave.
; b is unchanged.
%macro SSE2_XSawp 4
    movdqa      %4, %2
    punpckl%1   %2, %3
    punpckh%1   %4, %3
%endmacro

; SSE2_Trans4x4D r1, r2, r3, r4, tmp
; in: xmm1, xmm2, xmm3, xmm4 (tmp = xmm5)   pOut: xmm1, xmm4, xmm5, xmm3
%macro SSE2_Trans4x4D 5
    SSE2_XSawp  dq,  %1, %2, %5
    SSE2_XSawp  dq,  %3, %4, %2
    SSE2_XSawp  qdq, %1, %3, %4
    SSE2_XSawp  qdq, %5, %2, %3
%endmacro

; SSE2_TransTwo4x4W r0, r1, r2, r3, tmp
; in: xmm0, xmm1, xmm2, xmm3 (tmp = xmm4)   pOut: xmm0, xmm1, xmm3, xmm4
%macro SSE2_TransTwo4x4W 5
    SSE2_XSawp  wd,  %1, %2, %5
    SSE2_XSawp  wd,  %3, %4, %2
    SSE2_XSawp  dq,  %1, %3, %4
    SSE2_XSawp  dq,  %5, %2, %3
    SSE2_XSawp  qdq, %1, %5, %2
    SSE2_XSawp  qdq, %4, %3, %5
%endmacro

; SSE2_TransTwo8x8B m1, m2, m3, m4, m5, m6, m7, m8, mem
; in:   m1, m2, m3, m4, m5, m6, m7, m8
; pOut: m5, m3, m4, m8, m6, m2, m7, m1
; The ninth operand is used as a spill slot between interleave stages;
; its previous contents are overwritten.
%macro SSE2_TransTwo8x8B 9
    ; byte -> word interleave
    movdqa      %9, %8
    SSE2_XSawp  bw,  %1, %2, %8
    SSE2_XSawp  bw,  %3, %4, %2
    SSE2_XSawp  bw,  %5, %6, %4
    movdqa      %6, %9
    movdqa      %9, %4
    SSE2_XSawp  bw,  %7, %6, %4

    ; word -> dword interleave
    SSE2_XSawp  wd,  %1, %3, %6
    SSE2_XSawp  wd,  %8, %2, %3
    SSE2_XSawp  wd,  %5, %7, %2
    movdqa      %7, %9
    movdqa      %9, %3
    SSE2_XSawp  wd,  %7, %4, %3

    ; dword -> qword interleave
    SSE2_XSawp  dq,  %1, %5, %4
    SSE2_XSawp  dq,  %6, %2, %5
    SSE2_XSawp  dq,  %8, %7, %2
    movdqa      %7, %9
    movdqa      %9, %5
    SSE2_XSawp  dq,  %7, %3, %5

    ; qword -> dqword interleave
    SSE2_XSawp  qdq, %1, %8, %3
    SSE2_XSawp  qdq, %4, %2, %8
    SSE2_XSawp  qdq, %6, %7, %2
    movdqa      %7, %9
    movdqa      %9, %1
    SSE2_XSawp  qdq, %7, %5, %1
    movdqa      %5, %9
%endmacro

; SSE2_LoadDiff8P dst, tmp, zero, pix1, pix2
; Typical operands: xmm0, xmm6, xmm7, [eax], [ecx]
; xmm7 = 0, eax = pix1, ecx = pix2, xmm0 saves the result.
; Loads 8 bytes from each source, widens them to words by interleaving
; with the zero register, and leaves (pix1 - pix2) per word in dst.
%macro SSE2_LoadDiff8P 5
    movq        %1, %4
    punpcklbw   %1, %3
    movq        %2, %5
    punpcklbw   %2, %3
    psubw       %1, %2
%endmacro

; SSE2_SumSub m1, m2, tmp
; m2 = m1 + m2, m1 = m1 - m2 (word-wise, using the original m2); tmp is
; left holding the original m2.
%macro SSE2_SumSub 3
    movdqa      %3, %2
    paddw       %2, %1
    psubw       %1, %3
%endmacro


; butterfly_1to16_sse dst, tmp, reg
; xmm? for dst, xmm? for tmp, one byte for pSrc
; [generic register name: a/b/c/d]
; Broadcasts the low byte (b0) of the named legacy register to all 16
; bytes of dst. The register's second byte (%3h) is overwritten.
%macro butterfly_1to16_sse 3
    mov         %3h, %3l            ; low word of the register = b0 b0
    movd        %1, e%3x            ; i.e. %1 = eax (low byte = b0)
    pshuflw     %2, %1, 00h         ; ..., b0 b0 b0 b0 b0 b0 b0 b0
    pshufd      %1, %2, 00h         ; b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0
%endmacro

; SSE2_Copy8Times dst, src
; copy a dw into a xmm for 8 times
%macro SSE2_Copy8Times 2
    movd        %1, %2
    punpcklwd   %1, %1              ; low dword = w w
    pshufd      %1, %1, 0           ; replicate that dword four times
%endmacro

; SSE2_Copy16Times dst, src
; copy a db into a xmm for 16 times
; The final pack uses unsigned saturation, so the low word of src is
; presumably expected to already be in 0..255 -- confirm against callers.
%macro SSE2_Copy16Times 2
    movd        %1, %2
    pshuflw     %1, %1, 0           ; low word replicated across low qword
    punpcklqdq  %1, %1              ; ...and across the high qword
    packuswb    %1, %1              ; words -> bytes
%endmacro



;***********************************************************************
;preprocessor constants
;***********************************************************************

; Each macro below materialises a constant in the given register without
; touching memory. The non-VEX forms work for both xmm and mm operands.

;dw 1, 1, 1, 1, 1, 1, 1, 1 for xmm
;dw 1, 1, 1, 1 for mm
%macro WELS_DW1 1
    pcmpeqw     %1,%1               ; all ones
    psrlw       %1,15               ; keep one bit per word
%endmacro

;dw 32,32,32,32,32,32,32,32 for xmm
;dw 32,32,32,32 for mm
%macro WELS_DW32 1
    WELS_DW1    %1
    psllw       %1,5                ; 1 << 5
%endmacro

;all 0 for xmm and mm
%macro WELS_Zero 1
    pxor        %1, %1
%endmacro

;dd 1, 1, 1, 1 for xmm
;dd 1, 1 for mm
%macro WELS_DD1 1
    pcmpeqw     %1,%1
    psrld       %1,31
%endmacro

;dB 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
%macro WELS_DB1 1
    WELS_DW1    %1
    packuswb    %1,%1               ; words of 1 -> bytes of 1
%endmacro

; VEX-encoded counterpart of WELS_DW1 (each word = 1).
%macro WELS_DW1_VEX 1
    vpcmpeqw    %1, %1, %1
    vpsrlw      %1, %1, 15
%endmacro

; VEX-encoded counterpart of WELS_DW32 (each word = 32).
%macro WELS_DW32_VEX 1
    WELS_DW1_VEX %1
    vpsllw      %1, %1, 5
%endmacro

; Each word = 32767 (all ones shifted right by one bit).
%macro WELS_DW32767_VEX 1
    vpcmpeqw    %1, %1, %1
    vpsrlw      %1, %1, 1
%endmacro


;***********************************************************************
; Utility macros for X86_32 PIC support
;***********************************************************************

; Used internally by other macros.
; INIT_X86_32_PIC_ reg, preserve
; reg      = register that will hold the reference address (EIP)
; preserve = non-zero to push reg first (push_num is incremented)
; Without X86_32_PICASM this only defines pic(addr) as the identity.
; With X86_32_PICASM it loads the address of a local reference label into
; reg via call/ret and defines pic(addr) as reg + (addr - reference label).
%macro INIT_X86_32_PIC_ 2
%ifndef X86_32_PICASM
    %define pic(data_addr) (data_addr)
%else
    %xdefine pic_ptr            %1
    %xdefine pic_ptr_preserve   %2
    %if pic_ptr_preserve
        push    pic_ptr
        %assign push_num push_num+1
    %endif
    call    %%read_eip              ; return address == %%pic_anchor
%%pic_anchor:
    jmp     %%pic_ready
%%read_eip:
    mov     pic_ptr, [esp]          ; fetch the return address, keep it on the stack
    ret
%%pic_ready:
    %define pic(data_addr) (pic_ptr+(data_addr)-%%pic_anchor)
%endif
%endmacro

; Get program counter and define a helper macro "pic(addr)" to convert absolute
; addresses to program counter-relative addresses if X86_32_PICASM is defined.
; Otherwise define "pic(addr)" as an identity function.
; %1=register to store PC/EIP in.
%macro INIT_X86_32_PIC 1
    INIT_X86_32_PIC_ %1, 1
%endmacro

; Equivalent to the above, but without preserving the value of the register
; argument.
%macro INIT_X86_32_PIC_NOPRESERVE 1
    INIT_X86_32_PIC_ %1, 0
%endmacro

; Restore the register used to hold PC/EIP if it was preserved, keeping
; pic, pic_ptr and pic_ptr_preserve defined. Useful for functions with
; multiple epilogues.
%macro DEINIT_X86_32_PIC_KEEPDEF 0
%ifdef X86_32_PICASM
    %if pic_ptr_preserve
        pop     pic_ptr
        %assign push_num push_num-1
    %endif
%endif
%endmacro

; Clean up after INIT_X86_32_PIC.
; Restore the register used to hold PC/EIP if applicable, and undefine defines.
%macro DEINIT_X86_32_PIC 0
    DEINIT_X86_32_PIC_KEEPDEF
%ifdef X86_32_PICASM
    %undef pic_ptr
    %undef pic_ptr_preserve
%endif
    %undef pic
%endmacro
