;*****************************************************************************
;* x86inc.asm: x264asm abstraction layer
;*****************************************************************************
;* Copyright (C) 2005-2012 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Anton Mitrofanov <BugMaster@narod.ru>
;*          Jason Garrett-Glaser <darkshikari@gmail.com>
;*          Henrik Gramner <hengar-6@student.ltu.se>
;*
;* Permission to use, copy, modify, and/or distribute this software for any
;* purpose with or without fee is hereby granted, provided that the above
;* copyright notice and this permission notice appear in all copies.
;*
;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
;*****************************************************************************

; This is a header file for the x264ASM assembly language, which uses
; NASM/YASM syntax combined with a large number of macros to provide easy
; abstraction between different calling conventions (x86_32, win64, linux64).
; It also has various other useful features to simplify writing the kind of
; DSP functions that are most often used in x264.

; Unlike the rest of x264, this file is available under an ISC license, as it
; has significant usefulness outside of x264 and we want it to be available
; to the largest audience possible. Of course, if you modify it for your own
; purposes to add a new feature, we strongly encourage contributing a patch
; as this feature might be useful for others as well. Send patches or ideas
; to x264-devel@videolan.org .

; Local changes for libyuv:
; remove %define program_name and references in labels
; rename cpus to uppercase

%define WIN64  0
%define UNIX64 0
%if ARCH_X86_64
    %ifidn __OUTPUT_FORMAT__,win32
        %define WIN64  1
    %elifidn __OUTPUT_FORMAT__,win64
        %define WIN64  1
    %else
        %define UNIX64 1
    %endif
%endif

%ifdef PREFIX
    %define mangle(x) _ %+ x
%else
    %define mangle(x) x
%endif

; Name of the .rodata section.
; Kludge: Something on OS X fails to align .rodata even given an align attribute,
; so use a different read-only section.
%macro SECTION_RODATA 0-1 16
    %ifidn __OUTPUT_FORMAT__,macho64
        SECTION .text align=%1
    %elifidn __OUTPUT_FORMAT__,macho
        SECTION .text align=%1
        fakegot:
    %elifidn __OUTPUT_FORMAT__,aout
        section .text
    %else
        SECTION .rodata align=%1
    %endif
%endmacro

; aout does not support align=
%macro SECTION_TEXT 0-1 16
    %ifidn __OUTPUT_FORMAT__,aout
        SECTION .text
    %else
        SECTION .text align=%1
    %endif
%endmacro

%if WIN64
    %define PIC
%elif ARCH_X86_64 == 0
; x86_32 doesn't require PIC.
; Some distros prefer shared objects to be PIC, but nothing breaks if
; the code contains a few textrels, so we'll skip that complexity.
    %undef PIC
%endif
%ifdef PIC
    default rel
%endif

; Always use long nops (reduces 0x90 spam in disassembly on x86_32)
CPU amdnop

; Macros to eliminate most code duplication between x86_32 and x86_64:
; Currently this works only for leaf functions which load all their arguments
; into registers at the start, and make no other use of the stack. Luckily that
; covers most of x264's asm.

; PROLOGUE:
; %1 = number of arguments. loads them from stack if needed.
; %2 = number of registers used. pushes callee-saved regs if needed.
; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.
; %4 = list of names to define to registers
; PROLOGUE can also be invoked by adding the same options to cglobal

; e.g.
; cglobal foo, 2,3,0, dst, src, tmp
; declares a function (foo), taking two args (dst and src) and one local variable (tmp)

; TODO Some functions can use some args directly from the stack. If they're the
; last args then you can just not declare them, but if they're in the middle
; we need a more flexible macro.

; RET:
; Pops anything that was pushed by PROLOGUE, and returns.

; REP_RET:
; Same, but if it doesn't pop anything it becomes a 2-byte ret, for Athlons
; which are slow when a normal ret follows a branch.

; registers:
; rN and rNq are the native-size register holding function argument N
; rNd, rNw, rNb are dword, word, and byte size
; rNh is the high 8 bits of the word size
; rNm is the original location of arg N (a register or on the stack), dword
; rNmp is native size

%macro DECLARE_REG 2-3
    %define r%1q %2
    %define r%1d %2d
    %define r%1w %2w
    %define r%1b %2b
    %define r%1h %2h
    %if %0 == 2
        %define r%1m  %2d
        %define r%1mp %2
    %elif ARCH_X86_64 ; memory
        %define r%1m [rsp + stack_offset + %3]
        %define r%1mp qword r %+ %1m
    %else
        %define r%1m [esp + stack_offset + %3]
        %define r%1mp dword r %+ %1m
    %endif
    %define r%1 %2
%endmacro

%macro DECLARE_REG_SIZE 3
    %define r%1q r%1
    %define e%1q r%1
    %define r%1d e%1
    %define e%1d e%1
    %define r%1w %1
    %define e%1w %1
    %define r%1h %3
    %define e%1h %3
    %define r%1b %2
    %define e%1b %2
    %if ARCH_X86_64 == 0
        %define r%1 e%1
    %endif
%endmacro

DECLARE_REG_SIZE ax, al, ah
DECLARE_REG_SIZE bx, bl, bh
DECLARE_REG_SIZE cx, cl, ch
DECLARE_REG_SIZE dx, dl, dh
DECLARE_REG_SIZE si, sil, null
DECLARE_REG_SIZE di, dil, null
DECLARE_REG_SIZE bp, bpl, null

; t# defines for when per-arch register allocation is more complex than just function arguments

%macro DECLARE_REG_TMP 1-*
    %assign %%i 0
    %rep %0
        CAT_XDEFINE t, %%i, r%1
        %assign %%i %%i+1
        %rotate 1
    %endrep
%endmacro

%macro DECLARE_REG_TMP_SIZE 0-*
    %rep %0
        %define t%1q t%1 %+ q
        %define t%1d t%1 %+ d
        %define t%1w t%1 %+ w
        %define t%1h t%1 %+ h
        %define t%1b t%1 %+ b
        %rotate 1
    %endrep
%endmacro

DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14

%if ARCH_X86_64
    %define gprsize 8
%else
    %define gprsize 4
%endif

%macro PUSH 1
    push %1
    %assign stack_offset stack_offset+gprsize
%endmacro

%macro POP 1
    pop %1
    %assign stack_offset stack_offset-gprsize
%endmacro

%macro PUSH_IF_USED 1-*
    %rep %0
        %if %1 < regs_used
            PUSH r%1
        %endif
        %rotate 1
    %endrep
%endmacro

%macro POP_IF_USED 1-*
    %rep %0
        %if %1 < regs_used
            pop r%1
        %endif
        %rotate 1
    %endrep
%endmacro

%macro LOAD_IF_USED 1-*
    %rep %0
        %if %1 < num_args
            mov r%1, r %+ %1 %+ mp
        %endif
        %rotate 1
    %endrep
%endmacro

%macro SUB 2
    sub %1, %2
    %ifidn %1, rsp
        %assign stack_offset stack_offset+(%2)
    %endif
%endmacro

%macro ADD 2
    add %1, %2
    %ifidn %1, rsp
        %assign stack_offset stack_offset-(%2)
    %endif
%endmacro

%macro movifnidn 2
    %ifnidn %1, %2
        mov %1, %2
    %endif
%endmacro

%macro movsxdifnidn 2
    %ifnidn %1, %2
        movsxd %1, %2
    %endif
%endmacro

%macro ASSERT 1
    %if (%1) == 0
        %error assert failed
    %endif
%endmacro

%macro DEFINE_ARGS 0-*
    %ifdef n_arg_names
        %assign %%i 0
        %rep n_arg_names
            CAT_UNDEF arg_name %+ %%i, q
            CAT_UNDEF arg_name %+ %%i, d
            CAT_UNDEF arg_name %+ %%i, w
            CAT_UNDEF arg_name %+ %%i, h
            CAT_UNDEF arg_name %+ %%i, b
            CAT_UNDEF arg_name %+ %%i, m
            CAT_UNDEF arg_name %+ %%i, mp
            CAT_UNDEF arg_name, %%i
            %assign %%i %%i+1
        %endrep
    %endif

    %xdefine %%stack_offset stack_offset
    %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine
    %assign %%i 0
    %rep %0
        %xdefine %1q r %+ %%i %+ q
        %xdefine %1d r %+ %%i %+ d
        %xdefine %1w r %+ %%i %+ w
        %xdefine %1h r %+ %%i %+ h
        %xdefine %1b r %+ %%i %+ b
        %xdefine %1m r %+ %%i %+ m
        %xdefine %1mp r %+ %%i %+ mp
        CAT_XDEFINE arg_name, %%i, %1
        %assign %%i %%i+1
        %rotate 1
    %endrep
    %xdefine stack_offset %%stack_offset
    %assign n_arg_names %0
%endmacro

%if WIN64 ; Windows x64 ;=================================================

DECLARE_REG 0,  rcx
DECLARE_REG 1,  rdx
DECLARE_REG 2,  R8
DECLARE_REG 3,  R9
DECLARE_REG 4,  R10, 40
DECLARE_REG 5,  R11, 48
DECLARE_REG 6,  rax, 56
DECLARE_REG 7,  rdi, 64
DECLARE_REG 8,  rsi, 72
DECLARE_REG 9,  rbx, 80
DECLARE_REG 10, rbp, 88
DECLARE_REG 11, R12, 96
DECLARE_REG 12, R13, 104
DECLARE_REG 13, R14, 112
DECLARE_REG 14, R15, 120

%macro PROLOGUE 2-4+ 0 ; #args, #regs, #xmm_regs, arg_names...
    %assign num_args %1
    %assign regs_used %2
    ASSERT regs_used >= num_args
    ASSERT regs_used <= 15
    PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14
    %if mmsize == 8
        %assign xmm_regs_used 0
    %else
        WIN64_SPILL_XMM %3
    %endif
    LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
    DEFINE_ARGS %4
%endmacro

%macro WIN64_SPILL_XMM 1
    %assign xmm_regs_used %1
    ASSERT xmm_regs_used <= 16
    %if xmm_regs_used > 6
        SUB rsp, (xmm_regs_used-6)*16+16
        %assign %%i xmm_regs_used
        %rep (xmm_regs_used-6)
            %assign %%i %%i-1
            movdqa [rsp + (%%i-6)*16+(~stack_offset&8)], xmm %+ %%i
        %endrep
    %endif
%endmacro

%macro WIN64_RESTORE_XMM_INTERNAL 1
    %if xmm_regs_used > 6
        %assign %%i xmm_regs_used
        %rep (xmm_regs_used-6)
            %assign %%i %%i-1
            movdqa xmm %+ %%i, [%1 + (%%i-6)*16+(~stack_offset&8)]
        %endrep
        add %1, (xmm_regs_used-6)*16+16
    %endif
%endmacro

%macro WIN64_RESTORE_XMM 1
    WIN64_RESTORE_XMM_INTERNAL %1
    ; undo the full SUB rsp done in WIN64_SPILL_XMM (parenthesized so the +16 is subtracted too)
    %assign stack_offset stack_offset-((xmm_regs_used-6)*16+16)
    %assign xmm_regs_used 0
%endmacro

%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32

%macro RET 0
    WIN64_RESTORE_XMM_INTERNAL rsp
    POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
    %if mmsize == 32
        vzeroupper
    %endif
    ret
%endmacro

%elif ARCH_X86_64 ; *nix x64 ;=============================================

DECLARE_REG 0,  rdi
DECLARE_REG 1,  rsi
DECLARE_REG 2,  rdx
DECLARE_REG 3,  rcx
DECLARE_REG 4,  R8
DECLARE_REG 5,  R9
DECLARE_REG 6,  rax, 8
DECLARE_REG 7,  R10, 16
DECLARE_REG 8,  R11, 24
DECLARE_REG 9,  rbx, 32
DECLARE_REG 10, rbp, 40
DECLARE_REG 11, R12, 48
DECLARE_REG 12, R13, 56
DECLARE_REG 13, R14, 64
DECLARE_REG 14, R15, 72

%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
    %assign num_args %1
    %assign regs_used %2
    ASSERT regs_used >= num_args
    ASSERT regs_used <= 15
    PUSH_IF_USED 9, 10, 11, 12, 13, 14
    LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14
    DEFINE_ARGS %4
%endmacro

%define has_epilogue regs_used > 9 || mmsize == 32

%macro RET 0
    POP_IF_USED 14, 13, 12, 11, 10, 9
    %if mmsize == 32
        vzeroupper
    %endif
    ret
%endmacro

%else ; X86_32 ;==============================================================

DECLARE_REG 0, eax, 4
DECLARE_REG 1, ecx, 8
DECLARE_REG 2, edx, 12
DECLARE_REG 3, ebx, 16
DECLARE_REG 4, esi, 20
DECLARE_REG 5, edi, 24
DECLARE_REG 6, ebp, 28
%define rsp esp

%macro DECLARE_ARG 1-*
    %rep %0
        %define r%1m [esp + stack_offset + 4*%1 + 4]
        %define r%1mp dword r%1m
        %rotate 1
    %endrep
%endmacro

DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14

%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
    %assign num_args %1
    %assign regs_used %2
    %if regs_used > 7
        %assign regs_used 7
    %endif
    ASSERT regs_used >= num_args
    PUSH_IF_USED 3, 4, 5, 6
    LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6
    DEFINE_ARGS %4
%endmacro

%define has_epilogue regs_used > 3 || mmsize == 32

%macro RET 0
    POP_IF_USED 6, 5, 4, 3
    %if mmsize == 32
        vzeroupper
    %endif
    ret
%endmacro

%endif ;======================================================================

%if WIN64 == 0
%macro WIN64_SPILL_XMM 1
%endmacro
%macro WIN64_RESTORE_XMM 1
%endmacro
%endif

%macro REP_RET 0
    %if has_epilogue
        RET
    %else
        rep ret
    %endif
%endmacro

%macro TAIL_CALL 2 ; callee, is_nonadjacent
    %if has_epilogue
        call %1
        RET
    %elif %2
        jmp %1
    %endif
%endmacro

;=============================================================================
; arch-independent part
;=============================================================================

%assign function_align 16

; Begin a function.
; Applies any symbol mangling needed for C linkage, and sets up a define such that
; subsequent uses of the function name automatically refer to the mangled version.
; Appends cpuflags to the function name if cpuflags has been specified.
%macro cglobal 1-2+ ; name, [PROLOGUE args]
    %if %0 == 1
        cglobal_internal %1 %+ SUFFIX
    %else
        cglobal_internal %1 %+ SUFFIX, %2
    %endif
%endmacro
%macro cglobal_internal 1-2+
    %ifndef cglobaled_%1
        %xdefine %1 mangle(%1)
        %xdefine %1.skip_prologue %1 %+ .skip_prologue
        CAT_XDEFINE cglobaled_, %1, 1
    %endif
    %xdefine current_function %1
    %ifidn __OUTPUT_FORMAT__,elf
        global %1:function hidden
    %else
        global %1
    %endif
    align function_align
    %1:
    RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer
    %assign stack_offset 0
    %if %0 > 1
        PROLOGUE %2
    %endif
%endmacro

%macro cextern 1
    %xdefine %1 mangle(%1)
    CAT_XDEFINE cglobaled_, %1, 1
    extern %1
%endmacro

; like cextern, but without the prefix
; (identical here, since the libyuv changes removed the program_name prefix)
%macro cextern_naked 1
    %xdefine %1 mangle(%1)
    CAT_XDEFINE cglobaled_, %1, 1
    extern %1
%endmacro

%macro const 2+
    %xdefine %1 mangle(%1)
    global %1
    %1: %2
%endmacro
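
; As an illustration (not part of this header; the function name and argument
; names below are made up), a minimal leaf function built with these macros
; might look like:
;
;   INIT_XMM SSE2
;   cglobal copy_16, 2,2,1, dst, src   ; 2 args, 2 gprs, 1 xmm reg
;       movu   m0, [srcq]
;       movu   [dstq], m0
;       RET
;
; cglobal mangles the symbol, appends the _SSE2 suffix, and expands PROLOGUE,
; which loads dst/src into r0/r1 on x86_32 (on x64 they already arrive in
; registers) and defines the dstq/srcq names used in the body.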

; This is needed for ELF, otherwise the GNU linker assumes the stack is
; executable by default.
%ifidn __OUTPUT_FORMAT__,elf
SECTION .note.GNU-stack noalloc noexec nowrite progbits
%endif
%ifidn __OUTPUT_FORMAT__,elf32
section .note.GNU-stack noalloc noexec nowrite progbits
%endif
%ifidn __OUTPUT_FORMAT__,elf64
section .note.GNU-stack noalloc noexec nowrite progbits
%endif

; cpuflags

%assign cpuflags_MMX      (1<<0)
%assign cpuflags_MMX2     (1<<1) | cpuflags_MMX
%assign cpuflags_3dnow    (1<<2) | cpuflags_MMX
%assign cpuflags_3dnow2   (1<<3) | cpuflags_3dnow
%assign cpuflags_SSE      (1<<4) | cpuflags_MMX2
%assign cpuflags_SSE2     (1<<5) | cpuflags_SSE
%assign cpuflags_SSE2slow (1<<6) | cpuflags_SSE2
%assign cpuflags_SSE3     (1<<7) | cpuflags_SSE2
%assign cpuflags_SSSE3    (1<<8) | cpuflags_SSE3
%assign cpuflags_SSE4     (1<<9) | cpuflags_SSSE3
%assign cpuflags_SSE42    (1<<10)| cpuflags_SSE4
%assign cpuflags_AVX      (1<<11)| cpuflags_SSE42
%assign cpuflags_xop      (1<<12)| cpuflags_AVX
%assign cpuflags_fma4     (1<<13)| cpuflags_AVX
%assign cpuflags_AVX2     (1<<14)| cpuflags_AVX
%assign cpuflags_fma3     (1<<15)| cpuflags_AVX

%assign cpuflags_cache32  (1<<16)
%assign cpuflags_cache64  (1<<17)
%assign cpuflags_slowctz  (1<<18)
%assign cpuflags_lzcnt    (1<<19)
%assign cpuflags_misalign (1<<20)
%assign cpuflags_aligned  (1<<21) ; not a cpu feature, but a function variant
%assign cpuflags_atom     (1<<22)
%assign cpuflags_bmi1     (1<<23)
%assign cpuflags_bmi2     (1<<24)|cpuflags_bmi1
%assign cpuflags_tbm      (1<<25)|cpuflags_bmi1

%define    cpuflag(x) ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x))
%define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x))
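
; For example (illustrative, not part of this header), a function body can
; branch on these flags at assembly time:
;
;   %if cpuflag(SSE4)
;       pminud m0, m1        ; SSE4.1 provides an unsigned dword min
;   %else
;       ... fall back to an SSE2 sequence ...
;   %endif
;
; Because each flag includes its prerequisites (e.g. cpuflags_SSE42 contains
; cpuflags_SSE4), cpuflag(SSE4) is also true when building the SSE42 or AVX
; versions of a function.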

; Takes up to 2 cpuflags from the above list.
; All subsequent functions (up to the next INIT_CPUFLAGS) are built for the specified cpu.
; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co.
%macro INIT_CPUFLAGS 0-2
    %if %0 >= 1
        %xdefine cpuname %1
        %assign cpuflags cpuflags_%1
        %if %0 >= 2
            %xdefine cpuname %1_%2
            %assign cpuflags cpuflags | cpuflags_%2
        %endif
        %xdefine SUFFIX _ %+ cpuname
        %if cpuflag(AVX)
            %assign AVX_enabled 1
        %endif
        %if mmsize == 16 && notcpuflag(SSE2)
            %define mova movaps
            %define movu movups
            %define movnta movntps
        %endif
        %if cpuflag(aligned)
            %define movu mova
        %elifidn %1, SSE3
            %define movu lddqu
        %endif
    %else
        %xdefine SUFFIX
        %undef cpuname
        %undef cpuflags
    %endif
%endmacro

; merge MMX and SSE*

%macro CAT_XDEFINE 3
    %xdefine %1%2 %3
%endmacro

%macro CAT_UNDEF 2
    %undef %1%2
%endmacro

%macro INIT_MMX 0-1+
    %assign AVX_enabled 0
    %define RESET_MM_PERMUTATION INIT_MMX %1
    %define mmsize 8
    %define num_mmregs 8
    %define mova movq
    %define movu movq
    %define movh movd
    %define movnta movntq
    %assign %%i 0
    %rep 8
        CAT_XDEFINE m, %%i, mm %+ %%i
        CAT_XDEFINE nmm, %%i, %%i
        %assign %%i %%i+1
    %endrep
    %rep 8
        CAT_UNDEF m, %%i
        CAT_UNDEF nmm, %%i
        %assign %%i %%i+1
    %endrep
    INIT_CPUFLAGS %1
%endmacro

%macro INIT_XMM 0-1+
    %assign AVX_enabled 0
    %define RESET_MM_PERMUTATION INIT_XMM %1
    %define mmsize 16
    %define num_mmregs 8
    %if ARCH_X86_64
        %define num_mmregs 16
    %endif
    %define mova movdqa
    %define movu movdqu
    %define movh movq
    %define movnta movntdq
    %assign %%i 0
    %rep num_mmregs
        CAT_XDEFINE m, %%i, xmm %+ %%i
        CAT_XDEFINE nxmm, %%i, %%i
        %assign %%i %%i+1
    %endrep
    INIT_CPUFLAGS %1
%endmacro

%macro INIT_YMM 0-1+
    %assign AVX_enabled 1
    %define RESET_MM_PERMUTATION INIT_YMM %1
    %define mmsize 32
    %define num_mmregs 8
    %if ARCH_X86_64
        %define num_mmregs 16
    %endif
    %define mova vmovaps
    %define movu vmovups
    %undef movh
    %define movnta vmovntps
    %assign %%i 0
    %rep num_mmregs
        CAT_XDEFINE m, %%i, ymm %+ %%i
        CAT_XDEFINE nymm, %%i, %%i
        %assign %%i %%i+1
    %endrep
    INIT_CPUFLAGS %1
%endmacro

INIT_XMM

; I often want to use macros that permute their arguments. e.g. there's no
; efficient way to implement butterfly or transpose or dct without swapping some
; arguments.
;
; I would like to not have to manually keep track of the permutations:
; If I insert a permutation in the middle of a function, it should automatically
; change everything that follows. For more complex macros I may also have multiple
; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations.
;
; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that
; permutes its arguments. It's equivalent to exchanging the contents of the
; registers, except that this way you exchange the register names instead, so it
; doesn't cost any cycles.
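;
; For example (illustrative, not part of this header):
;
;   pmaddwd m0, m1           ; result lands in m0
;   SWAP    0, 2             ; from here on, "m2" names the register holding that result
;
; After the SWAP, the names m0 and m2 refer to each other's previous registers;
; no mov is emitted and no cycles are spent.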

%macro PERMUTE 2-* ; takes a list of pairs to swap
    %rep %0/2
        %xdefine tmp%2 m%2
        %xdefine ntmp%2 nm%2
        %rotate 2
    %endrep
    %rep %0/2
        %xdefine m%1 tmp%2
        %xdefine nm%1 ntmp%2
        %undef tmp%2
        %undef ntmp%2
        %rotate 2
    %endrep
%endmacro

%macro SWAP 2-* ; swaps a single chain (sometimes more concise than pairs)
    %rep %0-1
        %ifdef m%1
            %xdefine tmp m%1
            %xdefine m%1 m%2
            %xdefine m%2 tmp
            CAT_XDEFINE n, m%1, %1
            CAT_XDEFINE n, m%2, %2
        %else
            ; If we were called as "SWAP m0,m1" rather than "SWAP 0,1" infer the original numbers here.
            ; Be careful using this mode in nested macros though, as in some cases there may be
            ; other copies of m# that have already been dereferenced and don't get updated correctly.
            %xdefine %%n1 n %+ %1
            %xdefine %%n2 n %+ %2
            %xdefine tmp m %+ %%n1
            CAT_XDEFINE m, %%n1, m %+ %%n2
            CAT_XDEFINE m, %%n2, tmp
            CAT_XDEFINE n, m %+ %%n1, %%n1
            CAT_XDEFINE n, m %+ %%n2, %%n2
        %endif
        %undef tmp
        %rotate 1
    %endrep
%endmacro

; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later
; calls to that function will automatically load the permutation, so values can
; be returned in mmregs.
%macro SAVE_MM_PERMUTATION 0-1
    %if %0
        %xdefine %%f %1_m
    %else
        %xdefine %%f current_function %+ _m
    %endif
    %assign %%i 0
    %rep num_mmregs
        CAT_XDEFINE %%f, %%i, m %+ %%i
        %assign %%i %%i+1
    %endrep
%endmacro

%macro LOAD_MM_PERMUTATION 1 ; name to load from
    %ifdef %1_m0
        %assign %%i 0
        %rep num_mmregs
            CAT_XDEFINE m, %%i, %1_m %+ %%i
            CAT_XDEFINE n, m %+ %%i, %%i
            %assign %%i %%i+1
        %endrep
    %endif
%endmacro

; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't
%macro call 1
    call_internal %1, %1 %+ SUFFIX
%endmacro
%macro call_internal 2
    %xdefine %%i %1
    %ifndef cglobaled_%1
        %ifdef cglobaled_%2
            %xdefine %%i %2
        %endif
    %endif
    call %%i
    LOAD_MM_PERMUTATION %%i
%endmacro

; Substitutions that reduce instruction size but are functionally equivalent
%macro add 2
    %ifnum %2
        %if %2==128
            sub %1, -128
        %else
            add %1, %2
        %endif
    %else
        add %1, %2
    %endif
%endmacro

%macro sub 2
    %ifnum %2
        %if %2==128
            add %1, -128
        %else
            sub %1, %2
        %endif
    %else
        sub %1, %2
    %endif
%endmacro

;=============================================================================
; AVX abstraction layer
;=============================================================================

%assign i 0
%rep 16
    %if i < 8
        CAT_XDEFINE sizeofmm, i, 8
    %endif
    CAT_XDEFINE sizeofxmm, i, 16
    CAT_XDEFINE sizeofymm, i, 32
    %assign i i+1
%endrep
%undef i

%macro CHECK_AVX_INSTR_EMU 3-*
    %xdefine %%opcode %1
    %xdefine %%dst %2
    %rep %0-2
        %ifidn %%dst, %3
            %error non-AVX emulation of ``%%opcode'' is not supported
        %endif
        %rotate 1
    %endrep
%endmacro

;%1 == instruction
;%2 == 1 if float, 0 if int
;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 2- or 3-operand (xmm, xmm, xmm)
;%4 == number of operands given
;%5+: operands
%macro RUN_AVX_INSTR 6-7+
    %ifid %6
        %define %%sizeofreg sizeof%6
    %elifid %5
        %define %%sizeofreg sizeof%5
    %else
        %define %%sizeofreg mmsize
    %endif
    %if %%sizeofreg==32
        %if %4>=3
            v%1 %5, %6, %7
        %else
            v%1 %5, %6
        %endif
    %else
        %if %%sizeofreg==8
            %define %%regmov movq
        %elif %2
            %define %%regmov movaps
        %else
            %define %%regmov movdqa
        %endif

        %if %4>=3+%3
            %ifnidn %5, %6
                %if AVX_enabled && %%sizeofreg==16
                    v%1 %5, %6, %7
                %else
                    CHECK_AVX_INSTR_EMU {%1 %5, %6, %7}, %5, %7
                    %%regmov %5, %6
                    %1 %5, %7
                %endif
            %else
                %1 %5, %7
            %endif
        %elif %4>=3
            %1 %5, %6, %7
        %else
            %1 %5, %6
        %endif
    %endif
%endmacro

; 3arg AVX ops with a memory arg can only have it in src2,
; whereas SSE emulation of 3arg prefers to have it in src1 (i.e. the mov).
; So, if the op is symmetric and the wrong one is memory, swap them.
%macro RUN_AVX_INSTR1 8
    %assign %%swap 0
    %if AVX_enabled
        %ifnid %6
            %assign %%swap 1
        %endif
    %elifnidn %5, %6
        %ifnid %7
            %assign %%swap 1
        %endif
    %endif
    %if %%swap && %3 == 0 && %8 == 1
        RUN_AVX_INSTR %1, %2, %3, %4, %5, %7, %6
    %else
        RUN_AVX_INSTR %1, %2, %3, %4, %5, %6, %7
    %endif
%endmacro

;%1 == instruction
;%2 == 1 if float, 0 if int
;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 2- or 3-operand (xmm, xmm, xmm)
;%4 == 1 if symmetric (i.e. doesn't matter which src arg is which), 0 if not
%macro AVX_INSTR 4
    %macro %1 2-9 fnord, fnord, fnord, %1, %2, %3, %4
        %ifidn %3, fnord
            RUN_AVX_INSTR %6, %7, %8, 2, %1, %2
        %elifidn %4, fnord
            RUN_AVX_INSTR1 %6, %7, %8, 3, %1, %2, %3, %9
        %elifidn %5, fnord
            RUN_AVX_INSTR %6, %7, %8, 4, %1, %2, %3, %4
        %else
            RUN_AVX_INSTR %6, %7, %8, 5, %1, %2, %3, %4, %5
        %endif
    %endmacro
%endmacro

AVX_INSTR addpd, 1, 0, 1
AVX_INSTR addps, 1, 0, 1
AVX_INSTR addsd, 1, 0, 1
AVX_INSTR addss, 1, 0, 1
AVX_INSTR addsubpd, 1, 0, 0
AVX_INSTR addsubps, 1, 0, 0
AVX_INSTR andpd, 1, 0, 1
AVX_INSTR andps, 1, 0, 1
AVX_INSTR andnpd, 1, 0, 0
AVX_INSTR andnps, 1, 0, 0
AVX_INSTR blendpd, 1, 0, 0
AVX_INSTR blendps, 1, 0, 0
AVX_INSTR blendvpd, 1, 0, 0
AVX_INSTR blendvps, 1, 0, 0
AVX_INSTR cmppd, 1, 0, 0
AVX_INSTR cmpps, 1, 0, 0
AVX_INSTR cmpsd, 1, 0, 0
AVX_INSTR cmpss, 1, 0, 0
AVX_INSTR cvtdq2ps, 1, 0, 0
AVX_INSTR cvtps2dq, 1, 0, 0
AVX_INSTR divpd, 1, 0, 0
AVX_INSTR divps, 1, 0, 0
AVX_INSTR divsd, 1, 0, 0
AVX_INSTR divss, 1, 0, 0
AVX_INSTR dppd, 1, 1, 0
AVX_INSTR dpps, 1, 1, 0
AVX_INSTR haddpd, 1, 0, 0
AVX_INSTR haddps, 1, 0, 0
AVX_INSTR hsubpd, 1, 0, 0
AVX_INSTR hsubps, 1, 0, 0
AVX_INSTR maxpd, 1, 0, 1
AVX_INSTR maxps, 1, 0, 1
AVX_INSTR maxsd, 1, 0, 1
AVX_INSTR maxss, 1, 0, 1
AVX_INSTR minpd, 1, 0, 1
AVX_INSTR minps, 1, 0, 1
AVX_INSTR minsd, 1, 0, 1
AVX_INSTR minss, 1, 0, 1
AVX_INSTR movhlps, 1, 0, 0
AVX_INSTR movlhps, 1, 0, 0
AVX_INSTR movsd, 1, 0, 0
AVX_INSTR movss, 1, 0, 0
AVX_INSTR mpsadbw, 0, 1, 0
AVX_INSTR mulpd, 1, 0, 1
AVX_INSTR mulps, 1, 0, 1
AVX_INSTR mulsd, 1, 0, 1
AVX_INSTR mulss, 1, 0, 1
AVX_INSTR orpd, 1, 0, 1
AVX_INSTR orps, 1, 0, 1
AVX_INSTR pabsb, 0, 0, 0
AVX_INSTR pabsw, 0, 0, 0
AVX_INSTR pabsd, 0, 0, 0
AVX_INSTR packsswb, 0, 0, 0
AVX_INSTR packssdw, 0, 0, 0
AVX_INSTR packuswb, 0, 0, 0
AVX_INSTR packusdw, 0, 0, 0
AVX_INSTR paddb, 0, 0, 1
AVX_INSTR paddw, 0, 0, 1
AVX_INSTR paddd, 0, 0, 1
AVX_INSTR paddq, 0, 0, 1
AVX_INSTR paddsb, 0, 0, 1
AVX_INSTR paddsw, 0, 0, 1
AVX_INSTR paddusb, 0, 0, 1
AVX_INSTR paddusw, 0, 0, 1
AVX_INSTR palignr, 0, 1, 0
AVX_INSTR pand, 0, 0, 1
AVX_INSTR pandn, 0, 0, 0
AVX_INSTR pavgb, 0, 0, 1
AVX_INSTR pavgw, 0, 0, 1
AVX_INSTR pblendvb, 0, 0, 0
AVX_INSTR pblendw, 0, 1, 0
AVX_INSTR pcmpestri, 0, 0, 0
AVX_INSTR pcmpestrm, 0, 0, 0
AVX_INSTR pcmpistri, 0, 0, 0
AVX_INSTR pcmpistrm, 0, 0, 0
AVX_INSTR pcmpeqb, 0, 0, 1
AVX_INSTR pcmpeqw, 0, 0, 1
AVX_INSTR pcmpeqd, 0, 0, 1
AVX_INSTR pcmpeqq, 0, 0, 1
AVX_INSTR pcmpgtb, 0, 0, 0
AVX_INSTR pcmpgtw, 0, 0, 0
AVX_INSTR pcmpgtd, 0, 0, 0
AVX_INSTR pcmpgtq, 0, 0, 0
AVX_INSTR phaddw, 0, 0, 0
AVX_INSTR phaddd, 0, 0, 0
AVX_INSTR phaddsw, 0, 0, 0
AVX_INSTR phsubw, 0, 0, 0
AVX_INSTR phsubd, 0, 0, 0
AVX_INSTR phsubsw, 0, 0, 0
AVX_INSTR pmaddwd, 0, 0, 1
AVX_INSTR pmaddubsw, 0, 0, 0
AVX_INSTR pmaxsb, 0, 0, 1
AVX_INSTR pmaxsw, 0, 0, 1
AVX_INSTR pmaxsd, 0, 0, 1
AVX_INSTR pmaxub, 0, 0, 1
AVX_INSTR pmaxuw, 0, 0, 1
AVX_INSTR pmaxud, 0, 0, 1
AVX_INSTR pminsb, 0, 0, 1
AVX_INSTR pminsw, 0, 0, 1
AVX_INSTR pminsd, 0, 0, 1
AVX_INSTR pminub, 0, 0, 1
AVX_INSTR pminuw, 0, 0, 1
AVX_INSTR pminud, 0, 0, 1
AVX_INSTR pmovmskb, 0, 0, 0
AVX_INSTR pmulhuw, 0, 0, 1
AVX_INSTR pmulhrsw, 0, 0, 1
AVX_INSTR pmulhw, 0, 0, 1
AVX_INSTR pmullw, 0, 0, 1
AVX_INSTR pmulld, 0, 0, 1
AVX_INSTR pmuludq, 0, 0, 1
AVX_INSTR pmuldq, 0, 0, 1
AVX_INSTR por, 0, 0, 1
AVX_INSTR psadbw, 0, 0, 1
AVX_INSTR pshufb, 0, 0, 0
AVX_INSTR pshufd, 0, 1, 0
AVX_INSTR pshufhw, 0, 1, 0
AVX_INSTR pshuflw, 0, 1, 0
AVX_INSTR psignb, 0, 0, 0
AVX_INSTR psignw, 0, 0, 0
AVX_INSTR psignd, 0, 0, 0
AVX_INSTR psllw, 0, 0, 0
AVX_INSTR pslld, 0, 0, 0
AVX_INSTR psllq, 0, 0, 0
AVX_INSTR pslldq, 0, 0, 0
AVX_INSTR psraw, 0, 0, 0
AVX_INSTR psrad, 0, 0, 0
AVX_INSTR psrlw, 0, 0, 0
AVX_INSTR psrld, 0, 0, 0
AVX_INSTR psrlq, 0, 0, 0
AVX_INSTR psrldq, 0, 0, 0
AVX_INSTR psubb, 0, 0, 0
AVX_INSTR psubw, 0, 0, 0
AVX_INSTR psubd, 0, 0, 0
AVX_INSTR psubq, 0, 0, 0
AVX_INSTR psubsb, 0, 0, 0
AVX_INSTR psubsw, 0, 0, 0
AVX_INSTR psubusb, 0, 0, 0
AVX_INSTR psubusw, 0, 0, 0
AVX_INSTR ptest, 0, 0, 0
AVX_INSTR punpckhbw, 0, 0, 0
AVX_INSTR punpckhwd, 0, 0, 0
AVX_INSTR punpckhdq, 0, 0, 0
AVX_INSTR punpckhqdq, 0, 0, 0
AVX_INSTR punpcklbw, 0, 0, 0
AVX_INSTR punpcklwd, 0, 0, 0
AVX_INSTR punpckldq, 0, 0, 0
AVX_INSTR punpcklqdq, 0, 0, 0
AVX_INSTR pxor, 0, 0, 1
AVX_INSTR shufps, 1, 1, 0
AVX_INSTR subpd, 1, 0, 0
AVX_INSTR subps, 1, 0, 0
AVX_INSTR subsd, 1, 0, 0
AVX_INSTR subss, 1, 0, 0
AVX_INSTR unpckhpd, 1, 0, 0
AVX_INSTR unpckhps, 1, 0, 0
AVX_INSTR unpcklpd, 1, 0, 0
AVX_INSTR unpcklps, 1, 0, 0
AVX_INSTR xorpd, 1, 0, 1
AVX_INSTR xorps, 1, 0, 1

; 3DNow! instructions, for sharing code between AVX, SSE and 3DNow!
AVX_INSTR pfadd, 1, 0, 1
AVX_INSTR pfsub, 1, 0, 0
AVX_INSTR pfmul, 1, 0, 1

; base-4 constants for shuffles
%assign i 0
%rep 256
    %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3)
    %if j < 10
        CAT_XDEFINE q000, j, i
    %elif j < 100
        CAT_XDEFINE q00, j, i
    %elif j < 1000
        CAT_XDEFINE q0, j, i
    %else
        CAT_XDEFINE q, j, i
    %endif
    %assign i i+1
%endrep
%undef i
%undef j
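
; For example (illustrative, not part of this header), the digits of q#### read
; from the highest destination element to the lowest, so for pshufd-style
; immediates:
;
;   pshufd m0, m1, q3210     ; identity: copy m1 to m0
;   pshufd m0, m1, q0123     ; reverse the order of the four dwords
;   pshufd m0, m1, q1032     ; swap the low and high qword halves (imm 0x4E)
;
; q1032 expands to 0b01001110, i.e. dst[3]=src[1], dst[2]=src[0],
; dst[1]=src[3], dst[0]=src[2].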

%macro FMA_INSTR 3
    %macro %1 4-7 %1, %2, %3
        %if cpuflag(xop)
            v%5 %1, %2, %3, %4
        %else
            %6 %1, %2, %3
            %7 %1, %4
        %endif
    %endmacro
%endmacro

FMA_INSTR pmacsdd, pmulld, paddd
FMA_INSTR pmacsww, pmullw, paddw
FMA_INSTR pmadcswd, pmaddwd, paddd

; tzcnt is equivalent to "rep bsf" and is backwards-compatible with bsf.
; This lets us use tzcnt without bumping the yasm version requirement yet.
%define tzcnt rep bsf