;*****************************************************************************
;* x86inc.asm: x264asm abstraction layer
;*****************************************************************************
;* Copyright (C) 2005-2018 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Henrik Gramner <henrik@gramner.com>
;*          Anton Mitrofanov <BugMaster@narod.ru>
;*          Fiona Glaser <fiona@x264.com>
;*
;* Permission to use, copy, modify, and/or distribute this software for any
;* purpose with or without fee is hereby granted, provided that the above
;* copyright notice and this permission notice appear in all copies.
;*
;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
;*****************************************************************************

; This is a header file for the x264ASM assembly language, which uses
; NASM/YASM syntax combined with a large number of macros to provide easy
; abstraction between different calling conventions (x86_32, win64, linux64).
; It also has various other useful features to simplify writing the kind of
; DSP functions that are most often used in x264.

; Unlike the rest of x264, this file is available under an ISC license, as it
; has significant usefulness outside of x264 and we want it to be available
; to the largest audience possible. Of course, if you modify it for your own
; purposes to add a new feature, we strongly encourage contributing a patch
; as this feature might be useful for others as well. Send patches or ideas
; to x264-devel@videolan.org.

%ifndef private_prefix
    %define private_prefix gst
%endif

%ifndef public_prefix
    %define public_prefix private_prefix
%endif

%ifndef STACK_ALIGNMENT
    %if ARCH_X86_64
        %define STACK_ALIGNMENT 16
    %else
        %define STACK_ALIGNMENT 4
    %endif
%endif

%define WIN64  0
%define UNIX64 0
%if ARCH_X86_64
    %ifidn __OUTPUT_FORMAT__,win32
        %define WIN64  1
    %elifidn __OUTPUT_FORMAT__,win64
        %define WIN64  1
    %elifidn __OUTPUT_FORMAT__,x64
        %define WIN64  1
    %else
        %define UNIX64 1
    %endif
%endif

; This would only be 1 for yasm; hardcode 0 here as a workaround.
%define HAVE_CPUNOP 0

%define FORMAT_ELF 0
%ifidn __OUTPUT_FORMAT__,elf
    %define FORMAT_ELF 1
%elifidn __OUTPUT_FORMAT__,elf32
    %define FORMAT_ELF 1
%elifidn __OUTPUT_FORMAT__,elf64
    %define FORMAT_ELF 1
%endif

%ifdef PREFIX
    %define mangle(x) _ %+ x
%else
    %define mangle(x) x
%endif

; aout does not support align=
; NOTE: This section is out of sync with x264, in order to
; keep supporting OS/2.
%macro SECTION_RODATA 0-1 16
    %ifidn __OUTPUT_FORMAT__,aout
        SECTION .text
    %elifidn __OUTPUT_FORMAT__,coff
        SECTION .text
    %elifidn __OUTPUT_FORMAT__,win32
        SECTION .rdata align=%1
    %elif WIN64
        SECTION .rdata align=%1
    %else
        SECTION .rodata align=%1
    %endif
%endmacro

%if WIN64
    %define PIC
%elif ARCH_X86_64 == 0
; x86_32 doesn't require PIC.
; Some distros prefer shared objects to be PIC, but nothing breaks if
; the code contains a few textrels, so we'll skip that complexity.
    %undef PIC
%endif
%ifdef PIC
    default rel
%endif

%macro CPUNOP 1
    %if HAVE_CPUNOP
        CPU %1
    %endif
%endmacro

; Macros to eliminate most code duplication between x86_32 and x86_64:
; Currently this works only for leaf functions which load all their arguments
; into registers at the start, and make no other use of the stack. Luckily that
; covers most of x264's asm.

; PROLOGUE:
; %1 = number of arguments. loads them from stack if needed.
; %2 = number of registers used. pushes callee-saved regs if needed.
; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.
; %4 = (optional) stack size to be allocated. The stack will be aligned before
;      allocating the specified stack size. If the required stack alignment is
;      larger than the known stack alignment the stack will be manually aligned
;      and an extra register will be allocated to hold the original stack
;      pointer (to not invalidate r0m etc.). To prevent the use of an extra
;      register as stack pointer, request a negative stack size.
; %4+/%5+ = list of names to define to registers
; PROLOGUE can also be invoked by adding the same options to cglobal

; e.g.
; cglobal foo, 2,3,7,0x40, dst, src, tmp
; declares a function (foo) that automatically loads two arguments (dst and
; src) into registers, uses one additional register (tmp) plus 7 vector
; registers (m0-m6) and allocates 0x40 bytes of stack space.

; TODO Some functions can use some args directly from the stack. If they're the
; last args then you can just not declare them, but if they're in the middle
; we need a more flexible macro.

; RET:
; Pops anything that was pushed by PROLOGUE, and returns.

; REP_RET:
; Use this instead of RET if it's a branch target.
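
; As a fuller illustration (a hypothetical sketch, not part of this file; the
; function name, argument list and alignment assumptions are made up), a
; complete function built on these macros could look like this. It assumes len
; is a positive multiple of mmsize and that dst is 16-byte aligned:
;
; INIT_XMM sse2
; cglobal add_bytes, 3,4,1, dst, src, len, off
;     xor   offq, offq
; .loop:
;     movu  m0, [srcq+offq]
;     paddb m0, [dstq+offq]       ; aligned memory operand is fine for SSE2 here
;     mova  [dstq+offq], m0
;     add   offq, mmsize
;     cmp   offq, lenq
;     jl    .loop
;     RET                         ; undoes whatever PROLOGUE pushed/allocated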

; registers:
; rN and rNq are the native-size register holding function argument N
; rNd, rNw, rNb are dword, word, and byte size
; rNh is the high 8 bits of the word size
; rNm is the original location of arg N (a register or on the stack), dword
; rNmp is native size

%macro DECLARE_REG 2-3
    %define r%1q %2
    %define r%1d %2d
    %define r%1w %2w
    %define r%1b %2b
    %define r%1h %2h
    %define %2q %2
    %if %0 == 2
        %define r%1m  %2d
        %define r%1mp %2
    %elif ARCH_X86_64 ; memory
        %define r%1m [rstk + stack_offset + %3]
        %define r%1mp qword r %+ %1 %+ m
    %else
        %define r%1m [rstk + stack_offset + %3]
        %define r%1mp dword r %+ %1 %+ m
    %endif
    %define r%1 %2
%endmacro

%macro DECLARE_REG_SIZE 3
    %define r%1q r%1
    %define e%1q r%1
    %define r%1d e%1
    %define e%1d e%1
    %define r%1w %1
    %define e%1w %1
    %define r%1h %3
    %define e%1h %3
    %define r%1b %2
    %define e%1b %2
    %if ARCH_X86_64 == 0
        %define r%1 e%1
    %endif
%endmacro

DECLARE_REG_SIZE ax, al, ah
DECLARE_REG_SIZE bx, bl, bh
DECLARE_REG_SIZE cx, cl, ch
DECLARE_REG_SIZE dx, dl, dh
DECLARE_REG_SIZE si, sil, null
DECLARE_REG_SIZE di, dil, null
DECLARE_REG_SIZE bp, bpl, null

; t# defines for when per-arch register allocation is more complex than just function arguments

%macro DECLARE_REG_TMP 1-*
    %assign %%i 0
    %rep %0
        CAT_XDEFINE t, %%i, r%1
        %assign %%i %%i+1
        %rotate 1
    %endrep
%endmacro

%macro DECLARE_REG_TMP_SIZE 0-*
    %rep %0
        %define t%1q t%1 %+ q
        %define t%1d t%1 %+ d
        %define t%1w t%1 %+ w
        %define t%1h t%1 %+ h
        %define t%1b t%1 %+ b
        %rotate 1
    %endrep
%endmacro

DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14

%if ARCH_X86_64
    %define gprsize 8
%else
    %define gprsize 4
%endif

%macro PUSH 1
    push %1
    %ifidn rstk, rsp
        %assign stack_offset stack_offset+gprsize
    %endif
%endmacro

%macro POP 1
    pop %1
    %ifidn rstk, rsp
        %assign stack_offset stack_offset-gprsize
    %endif
%endmacro

%macro PUSH_IF_USED 1-*
    %rep %0
        %if %1 < regs_used
            PUSH r%1
        %endif
        %rotate 1
    %endrep
%endmacro

%macro POP_IF_USED 1-*
    %rep %0
        %if %1 < regs_used
            pop r%1
        %endif
        %rotate 1
    %endrep
%endmacro

%macro LOAD_IF_USED 1-*
    %rep %0
        %if %1 < num_args
            mov r%1, r %+ %1 %+ mp
        %endif
        %rotate 1
    %endrep
%endmacro

%macro SUB 2
    sub %1, %2
    %ifidn %1, rstk
        %assign stack_offset stack_offset+(%2)
    %endif
%endmacro

%macro ADD 2
    add %1, %2
    %ifidn %1, rstk
        %assign stack_offset stack_offset-(%2)
    %endif
%endmacro

%macro movifnidn 2
    %ifnidn %1, %2
        mov %1, %2
    %endif
%endmacro

%macro movsxdifnidn 2
    %ifnidn %1, %2
        movsxd %1, %2
    %endif
%endmacro

%macro ASSERT 1
    %if (%1) == 0
        %error assertion ``%1'' failed
    %endif
%endmacro

%macro DEFINE_ARGS 0-*
    %ifdef n_arg_names
        %assign %%i 0
        %rep n_arg_names
            CAT_UNDEF arg_name %+ %%i, q
            CAT_UNDEF arg_name %+ %%i, d
            CAT_UNDEF arg_name %+ %%i, w
            CAT_UNDEF arg_name %+ %%i, h
            CAT_UNDEF arg_name %+ %%i, b
            CAT_UNDEF arg_name %+ %%i, m
            CAT_UNDEF arg_name %+ %%i, mp
            CAT_UNDEF arg_name, %%i
            %assign %%i %%i+1
        %endrep
    %endif

    %xdefine %%stack_offset stack_offset
    %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine
    %assign %%i 0
    %rep %0
        %xdefine %1q r %+ %%i %+ q
        %xdefine %1d r %+ %%i %+ d
        %xdefine %1w r %+ %%i %+ w
        %xdefine %1h r %+ %%i %+ h
        %xdefine %1b r %+ %%i %+ b
        %xdefine %1m r %+ %%i %+ m
        %xdefine %1mp r %+ %%i %+ mp
        CAT_XDEFINE arg_name, %%i, %1
        %assign %%i %%i+1
        %rotate 1
    %endrep
    %xdefine stack_offset %%stack_offset
    %assign n_arg_names %0
%endmacro

%define required_stack_alignment ((mmsize + 15) & ~15)
%define vzeroupper_required (mmsize > 16 && (ARCH_X86_64 == 0 || xmm_regs_used > 16 || notcpuflag(avx512)))
%define high_mm_regs (16*cpuflag(avx512))

%macro ALLOC_STACK 1-2 0 ; stack_size, n_xmm_regs (for win64 only)
    %ifnum %1
        %if %1 != 0
            %assign %%pad 0
            %assign stack_size %1
            %if stack_size < 0
                %assign stack_size -stack_size
            %endif
            %if WIN64
                %assign %%pad %%pad + 32 ; shadow space
                %if mmsize != 8
                    %assign xmm_regs_used %2
                    %if xmm_regs_used > 8
                        %assign %%pad %%pad + (xmm_regs_used-8)*16 ; callee-saved xmm registers
                    %endif
                %endif
            %endif
            %if required_stack_alignment <= STACK_ALIGNMENT
                ; maintain the current stack alignment
                %assign stack_size_padded stack_size + %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1))
                SUB rsp, stack_size_padded
            %else
                %assign %%reg_num (regs_used - 1)
                %xdefine rstk r %+ %%reg_num
                ; align stack, and save original stack location directly above
                ; it, i.e. in [rsp+stack_size_padded], so we can restore the
                ; stack in a single instruction (i.e. mov rsp, rstk or mov
                ; rsp, [rsp+stack_size_padded])
                %if %1 < 0 ; need to store rsp on stack
                    %xdefine rstkm [rsp + stack_size + %%pad]
                    %assign %%pad %%pad + gprsize
                %else ; can keep rsp in rstk during whole function
                    %xdefine rstkm rstk
                %endif
                %assign stack_size_padded stack_size + ((%%pad + required_stack_alignment-1) & ~(required_stack_alignment-1))
                mov rstk, rsp
                and rsp, ~(required_stack_alignment-1)
                sub rsp, stack_size_padded
                movifnidn rstkm, rstk
            %endif
            WIN64_PUSH_XMM
        %endif
    %endif
%endmacro

%macro SETUP_STACK_POINTER 1
    %ifnum %1
        %if %1 != 0 && required_stack_alignment > STACK_ALIGNMENT
            %if %1 > 0
                ; Reserve an additional register for storing the original stack pointer, but avoid using
                ; eax/rax for this purpose since it can potentially get overwritten as a return value.
                %assign regs_used (regs_used + 1)
                %if ARCH_X86_64 && regs_used == 7
                    %assign regs_used 8
                %elif ARCH_X86_64 == 0 && regs_used == 1
                    %assign regs_used 2
                %endif
            %endif
            %if ARCH_X86_64 && regs_used < 5 + UNIX64 * 3
                ; Ensure that we don't clobber any registers containing arguments. For UNIX64 we also preserve r6 (rax)
                ; since it's used as a hidden argument in vararg functions to specify the number of vector registers used.
                %assign regs_used 5 + UNIX64 * 3
            %endif
        %endif
    %endif
%endmacro

%macro DEFINE_ARGS_INTERNAL 3+
    %ifnum %2
        DEFINE_ARGS %3
    %elif %1 == 4
        DEFINE_ARGS %2
    %elif %1 > 4
        DEFINE_ARGS %2, %3
    %endif
%endmacro

%if WIN64 ; Windows x64 ;=================================================

DECLARE_REG 0,  rcx
DECLARE_REG 1,  rdx
DECLARE_REG 2,  R8
DECLARE_REG 3,  R9
DECLARE_REG 4,  R10, 40
DECLARE_REG 5,  R11, 48
DECLARE_REG 6,  rax, 56
DECLARE_REG 7,  rdi, 64
DECLARE_REG 8,  rsi, 72
DECLARE_REG 9,  rbx, 80
DECLARE_REG 10, rbp, 88
DECLARE_REG 11, R14, 96
DECLARE_REG 12, R15, 104
DECLARE_REG 13, R12, 112
DECLARE_REG 14, R13, 120

%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
    %assign num_args %1
    %assign regs_used %2
    ASSERT regs_used >= num_args
    SETUP_STACK_POINTER %4
    ASSERT regs_used <= 15
    PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14
    ALLOC_STACK %4, %3
    %if mmsize != 8 && stack_size == 0
        WIN64_SPILL_XMM %3
    %endif
    LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
    DEFINE_ARGS_INTERNAL %0, %4, %5
%endmacro

%macro WIN64_PUSH_XMM 0
    ; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated.
    %if xmm_regs_used > 6 + high_mm_regs
        movaps [rstk + stack_offset +  8], xmm6
    %endif
    %if xmm_regs_used > 7 + high_mm_regs
        movaps [rstk + stack_offset + 24], xmm7
    %endif
    %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8
    %if %%xmm_regs_on_stack > 0
        %assign %%i 8
        %rep %%xmm_regs_on_stack
            movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i
            %assign %%i %%i+1
        %endrep
    %endif
%endmacro

%macro WIN64_SPILL_XMM 1
    %assign xmm_regs_used %1
    ASSERT xmm_regs_used <= 16 + high_mm_regs
    %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8
    %if %%xmm_regs_on_stack > 0
        ; Allocate stack space for callee-saved xmm registers plus shadow space and align the stack.
        %assign %%pad %%xmm_regs_on_stack*16 + 32
        %assign stack_size_padded %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1))
        SUB rsp, stack_size_padded
    %endif
    WIN64_PUSH_XMM
%endmacro

%macro WIN64_RESTORE_XMM_INTERNAL 0
    %assign %%pad_size 0
    %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8
    %if %%xmm_regs_on_stack > 0
        %assign %%i xmm_regs_used - high_mm_regs
        %rep %%xmm_regs_on_stack
            %assign %%i %%i-1
            movaps xmm %+ %%i, [rsp + (%%i-8)*16 + stack_size + 32]
        %endrep
    %endif
    %if stack_size_padded > 0
        %if stack_size > 0 && required_stack_alignment > STACK_ALIGNMENT
            mov rsp, rstkm
        %else
            add rsp, stack_size_padded
            %assign %%pad_size stack_size_padded
        %endif
    %endif
    %if xmm_regs_used > 7 + high_mm_regs
        movaps xmm7, [rsp + stack_offset - %%pad_size + 24]
    %endif
    %if xmm_regs_used > 6 + high_mm_regs
        movaps xmm6, [rsp + stack_offset - %%pad_size +  8]
    %endif
%endmacro

%macro WIN64_RESTORE_XMM 0
    WIN64_RESTORE_XMM_INTERNAL
    %assign stack_offset (stack_offset-stack_size_padded)
    %assign stack_size_padded 0
    %assign xmm_regs_used 0
%endmacro

%define has_epilogue regs_used > 7 || stack_size > 0 || vzeroupper_required || xmm_regs_used > 6+high_mm_regs

%macro RET 0
    WIN64_RESTORE_XMM_INTERNAL
    POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
    %if vzeroupper_required
        vzeroupper
    %endif
    AUTO_REP_RET
%endmacro

%elif ARCH_X86_64 ; *nix x64 ;=============================================

DECLARE_REG 0,  rdi
DECLARE_REG 1,  rsi
DECLARE_REG 2,  rdx
DECLARE_REG 3,  rcx
DECLARE_REG 4,  R8
DECLARE_REG 5,  R9
DECLARE_REG 6,  rax, 8
DECLARE_REG 7,  R10, 16
DECLARE_REG 8,  R11, 24
DECLARE_REG 9,  rbx, 32
DECLARE_REG 10, rbp, 40
DECLARE_REG 11, R14, 48
DECLARE_REG 12, R15, 56
DECLARE_REG 13, R12, 64
DECLARE_REG 14, R13, 72

%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
    %assign num_args %1
    %assign regs_used %2
    %assign xmm_regs_used %3
    ASSERT regs_used >= num_args
    SETUP_STACK_POINTER %4
    ASSERT regs_used <= 15
    PUSH_IF_USED 9, 10, 11, 12, 13, 14
    ALLOC_STACK %4
    LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14
    DEFINE_ARGS_INTERNAL %0, %4, %5
%endmacro

%define has_epilogue regs_used > 9 || stack_size > 0 || vzeroupper_required

%macro RET 0
    %if stack_size_padded > 0
        %if required_stack_alignment > STACK_ALIGNMENT
            mov rsp, rstkm
        %else
            add rsp, stack_size_padded
        %endif
    %endif
    POP_IF_USED 14, 13, 12, 11, 10, 9
    %if vzeroupper_required
        vzeroupper
    %endif
    AUTO_REP_RET
%endmacro

%else ; X86_32 ;==============================================================

DECLARE_REG 0, eax, 4
DECLARE_REG 1, ecx, 8
DECLARE_REG 2, edx, 12
DECLARE_REG 3, ebx, 16
DECLARE_REG 4, esi, 20
DECLARE_REG 5, edi, 24
DECLARE_REG 6, ebp, 28
%define rsp esp

%macro DECLARE_ARG 1-*
    %rep %0
        %define r%1m [rstk + stack_offset + 4*%1 + 4]
        %define r%1mp dword r%1m
        %rotate 1
    %endrep
%endmacro

DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14

%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
    %assign num_args %1
    %assign regs_used %2
    ASSERT regs_used >= num_args
    %if num_args > 7
        %assign num_args 7
    %endif
    %if regs_used > 7
        %assign regs_used 7
    %endif
    SETUP_STACK_POINTER %4
    ASSERT regs_used <= 7
    PUSH_IF_USED 3, 4, 5, 6
    ALLOC_STACK %4
    LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6
    DEFINE_ARGS_INTERNAL %0, %4, %5
%endmacro

%define has_epilogue regs_used > 3 || stack_size > 0 || vzeroupper_required

%macro RET 0
    %if stack_size_padded > 0
        %if required_stack_alignment > STACK_ALIGNMENT
            mov rsp, rstkm
        %else
            add rsp, stack_size_padded
        %endif
    %endif
    POP_IF_USED 6, 5, 4, 3
    %if vzeroupper_required
        vzeroupper
    %endif
    AUTO_REP_RET
%endmacro

%endif ;======================================================================

%if WIN64 == 0
    %macro WIN64_SPILL_XMM 1
    %endmacro
    %macro WIN64_RESTORE_XMM 0
    %endmacro
    %macro WIN64_PUSH_XMM 0
    %endmacro
%endif

; On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either
; a branch or a branch target. So switch to a 2-byte form of ret in that case.
; We can automatically detect "follows a branch", but not a branch target.
; (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.)
%macro REP_RET 0
    %if has_epilogue || cpuflag(ssse3)
        RET
    %else
        rep ret
    %endif
    annotate_function_size
%endmacro

%define last_branch_adr $$
%macro AUTO_REP_RET 0
    %if notcpuflag(ssse3)
        times ((last_branch_adr-$)>>31)+1 rep ; times 1 iff $ == last_branch_adr.
    %endif
    ret
    annotate_function_size
%endmacro

%macro BRANCH_INSTR 0-*
    %rep %0
        %macro %1 1-2 %1
            %2 %1
            %if notcpuflag(ssse3)
                %%branch_instr equ $
                %xdefine last_branch_adr %%branch_instr
            %endif
        %endmacro
        %rotate 1
    %endrep
%endmacro

BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, jna, jnae, jb, jbe, jnb, jnbe, jc, jnc, js, jns, jo, jno, jp, jnp

%macro TAIL_CALL 2 ; callee, is_nonadjacent
    %if has_epilogue
        call %1
        RET
    %elif %2
        jmp %1
    %endif
    annotate_function_size
%endmacro
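
; To illustrate the REP_RET rules above (a hypothetical sketch; the names and
; registers are made up): AUTO_REP_RET only detects a ret that directly follows
; a branch instruction, so a return that is itself a branch target still has to
; be written as REP_RET explicitly on pre-SSSE3 targets:
;
;     test  lend, lend
;     jz    .end          ; jumps *to* the return below
; .loop:
;     ...
;     dec   lend
;     jg    .loop
;     RET                 ; directly follows jg: AUTO_REP_RET emits `rep ret` when needed
; .end:
;     REP_RET             ; branch target: has to be requested by hand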

;=============================================================================
; arch-independent part
;=============================================================================

%assign function_align 16

; Begin a function.
; Applies any symbol mangling needed for C linkage, and sets up a define such that
; subsequent uses of the function name automatically refer to the mangled version.
; Appends cpuflags to the function name if cpuflags has been specified.
; The "" empty default parameter is a workaround for nasm, which fails if SUFFIX
; is empty and we call cglobal_internal with just %1 %+ SUFFIX (without %2).
%macro cglobal 1-2+ "" ; name, [PROLOGUE args]
    cglobal_internal 1, %1 %+ SUFFIX, %2
%endmacro
%macro cvisible 1-2+ "" ; name, [PROLOGUE args]
    cglobal_internal 0, %1 %+ SUFFIX, %2
%endmacro
%macro cglobal_internal 2-3+
    annotate_function_size
    %if %1
        %xdefine %%FUNCTION_PREFIX private_prefix
        %xdefine %%VISIBILITY hidden
    %else
        %xdefine %%FUNCTION_PREFIX public_prefix
        %xdefine %%VISIBILITY
    %endif
    %ifndef cglobaled_%2
        %xdefine %2 mangle(%%FUNCTION_PREFIX %+ _ %+ %2)
        %xdefine %2.skip_prologue %2 %+ .skip_prologue
        CAT_XDEFINE cglobaled_, %2, 1
    %endif
    %xdefine current_function %2
    %xdefine current_function_section __SECT__
    %if FORMAT_ELF
        global %2:function %%VISIBILITY
    %else
        global %2
    %endif
    align function_align
    %2:
    RESET_MM_PERMUTATION        ; needed for x86-64, also makes disassembly somewhat nicer
    %xdefine rstk rsp           ; copy of the original stack pointer, used when greater alignment than the known stack alignment is required
    %assign stack_offset 0      ; stack pointer offset relative to the return address
    %assign stack_size 0        ; amount of stack space that can be freely used inside a function
    %assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding
    %assign xmm_regs_used 0     ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64 and vzeroupper
    %ifnidn %3, ""
        PROLOGUE %3
    %endif
%endmacro

; Create a global symbol from a local label with the correct name mangling and type
%macro cglobal_label 1
    %if FORMAT_ELF
        global current_function %+ %1:function hidden
    %else
        global current_function %+ %1
    %endif
    %1:
%endmacro

%macro cextern 1
    %xdefine %1 mangle(private_prefix %+ _ %+ %1)
    CAT_XDEFINE cglobaled_, %1, 1
    extern %1
%endmacro

; like cextern, but without the prefix
%macro cextern_naked 1
    %ifdef PREFIX
        %xdefine %1 mangle(%1)
    %endif
    CAT_XDEFINE cglobaled_, %1, 1
    extern %1
%endmacro

%macro const 1-2+
    %xdefine %1 mangle(private_prefix %+ _ %+ %1)
    %if FORMAT_ELF
        global %1:data hidden
    %else
        global %1
    %endif
    %1: %2
%endmacro

; This is needed for ELF, otherwise the GNU linker assumes the stack is executable by default.
%if FORMAT_ELF
    [SECTION .note.GNU-stack noalloc noexec nowrite progbits]
%endif

; Tell debuggers how large the function was.
; This may be invoked multiple times per function; we rely on later instances overriding earlier ones.
; This is invoked by RET and similar macros, and also cglobal does it for the previous function,
; but if the last function in a source file doesn't use any of the standard macros for its epilogue,
; then its size might be unspecified.
%macro annotate_function_size 0
    %ifdef __YASM_VER__
        %ifdef current_function
            %if FORMAT_ELF
                current_function_section
                %%ecf equ $
                size current_function %%ecf - current_function
                __SECT__
            %endif
        %endif
    %endif
%endmacro

; cpuflags

%assign cpuflags_mmx      (1<<0)
%assign cpuflags_mmx2     (1<<1) | cpuflags_mmx
%assign cpuflags_3dnow    (1<<2) | cpuflags_mmx
%assign cpuflags_3dnowext (1<<3) | cpuflags_3dnow
%assign cpuflags_sse      (1<<4) | cpuflags_mmx2
%assign cpuflags_sse2     (1<<5) | cpuflags_sse
%assign cpuflags_sse2slow (1<<6) | cpuflags_sse2
%assign cpuflags_lzcnt    (1<<7) | cpuflags_sse2
%assign cpuflags_sse3     (1<<8) | cpuflags_sse2
%assign cpuflags_ssse3    (1<<9) | cpuflags_sse3
%assign cpuflags_sse4     (1<<10)| cpuflags_ssse3
%assign cpuflags_sse42    (1<<11)| cpuflags_sse4
%assign cpuflags_aesni    (1<<12)| cpuflags_sse42
%assign cpuflags_avx      (1<<13)| cpuflags_sse42
%assign cpuflags_xop      (1<<14)| cpuflags_avx
%assign cpuflags_fma4     (1<<15)| cpuflags_avx
%assign cpuflags_fma3     (1<<16)| cpuflags_avx
%assign cpuflags_bmi1     (1<<17)| cpuflags_avx|cpuflags_lzcnt
%assign cpuflags_bmi2     (1<<18)| cpuflags_bmi1
%assign cpuflags_avx2     (1<<19)| cpuflags_fma3|cpuflags_bmi2
%assign cpuflags_avx512   (1<<20)| cpuflags_avx2 ; F, CD, BW, DQ, VL

%assign cpuflags_cache32  (1<<21)
%assign cpuflags_cache64  (1<<22)
%assign cpuflags_aligned  (1<<23) ; not a cpu feature, but a function variant
%assign cpuflags_atom     (1<<24)

; Returns a boolean value expressing whether or not the specified cpuflag is enabled.
%define    cpuflag(x) (((((cpuflags & (cpuflags_ %+ x)) ^ (cpuflags_ %+ x)) - 1) >> 31) & 1)
%define notcpuflag(x) (cpuflag(x) ^ 1)

; Takes an arbitrary number of cpuflags from the above list.
; All subsequent functions (up to the next INIT_CPUFLAGS) are built for the specified cpu.
; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX & co.
%macro INIT_CPUFLAGS 0-*
    %xdefine SUFFIX
    %undef cpuname
    %assign cpuflags 0

    %if %0 >= 1
        %rep %0
            %ifdef cpuname
                %xdefine cpuname cpuname %+ _%1
            %else
                %xdefine cpuname %1
            %endif
            %assign cpuflags cpuflags | cpuflags_%1
            %rotate 1
        %endrep
        %xdefine SUFFIX _ %+ cpuname

        %if cpuflag(avx)
            %assign avx_enabled 1
        %endif
        %if (mmsize == 16 && notcpuflag(sse2)) || (mmsize == 32 && notcpuflag(avx2))
            %define mova movaps
            %define movu movups
            %define movnta movntps
        %endif
        %if cpuflag(aligned)
            %define movu mova
        %elif cpuflag(sse3) && notcpuflag(ssse3)
            %define movu lddqu
        %endif
    %endif

    %if ARCH_X86_64 || cpuflag(sse2)
        CPUNOP amdnop
    %else
        CPUNOP basicnop
    %endif
%endmacro

; Merge mmx, sse*, and avx*
; m# is a simd register of the currently selected size
; xm# is the corresponding xmm register if mmsize >= 16, otherwise the same as m#
; ym# is the corresponding ymm register if mmsize >= 32, otherwise the same as m#
; zm# is the corresponding zmm register if mmsize >= 64, otherwise the same as m#
; (All 4 remain in sync through SWAP.)
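
; For example (a hypothetical sketch; the function name and register counts are
; made up): the same body can be assembled once per instruction set, with the
; m# registers and the mova/movu defines following the selected width:
;
; INIT_XMM sse2
; cglobal blend, 3,3,2          ; symbol gets the _sse2 suffix and private_prefix
;     mova  m0, [r1]            ; m0 is xmm0, mova expands to movdqa
;
; INIT_YMM avx2
; cglobal blend, 3,3,2          ; symbol gets the _avx2 suffix instead
;     mova  m0, [r1]            ; m0 is now ymm0 (and xm0 still names xmm0)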

%macro CAT_XDEFINE 3
    %xdefine %1%2 %3
%endmacro

%macro CAT_UNDEF 2
    %undef %1%2
%endmacro

%macro DEFINE_MMREGS 1 ; mmtype
    %assign %%prev_mmregs 0
    %ifdef num_mmregs
        %assign %%prev_mmregs num_mmregs
    %endif

    %assign num_mmregs 8
    %if ARCH_X86_64 && mmsize >= 16
        %assign num_mmregs 16
        %if cpuflag(avx512) || mmsize == 64
            %assign num_mmregs 32
        %endif
    %endif

    %assign %%i 0
    %rep num_mmregs
        CAT_XDEFINE m, %%i, %1 %+ %%i
        CAT_XDEFINE nn%1, %%i, %%i
        %assign %%i %%i+1
    %endrep
    %if %%prev_mmregs > num_mmregs
        %rep %%prev_mmregs - num_mmregs
            CAT_UNDEF m, %%i
            CAT_UNDEF nn %+ mmtype, %%i
            %assign %%i %%i+1
        %endrep
    %endif
    %xdefine mmtype %1
%endmacro

; Prefer registers 16-31 over 0-15 to avoid having to use vzeroupper
%macro AVX512_MM_PERMUTATION 0-1 0 ; start_reg
    %if ARCH_X86_64 && cpuflag(avx512)
        %assign %%i %1
        %rep 16-%1
            %assign %%i_high %%i+16
            SWAP %%i, %%i_high
            %assign %%i %%i+1
        %endrep
    %endif
%endmacro

%macro INIT_MMX 0-1+
    %assign avx_enabled 0
    %define RESET_MM_PERMUTATION INIT_MMX %1
    %define mmsize 8
    %define mova movq
    %define movu movq
    %define movh movd
    %define movnta movntq
    INIT_CPUFLAGS %1
    DEFINE_MMREGS mm
%endmacro

%macro INIT_XMM 0-1+
    %assign avx_enabled 0
    %define RESET_MM_PERMUTATION INIT_XMM %1
    %define mmsize 16
    %define mova movdqa
    %define movu movdqu
    %define movh movq
    %define movnta movntdq
    INIT_CPUFLAGS %1
    DEFINE_MMREGS xmm
    %if WIN64
        AVX512_MM_PERMUTATION 6 ; Swap callee-saved registers with volatile registers
    %endif
%endmacro

%macro INIT_YMM 0-1+
    %assign avx_enabled 1
    %define RESET_MM_PERMUTATION INIT_YMM %1
    %define mmsize 32
    %define mova movdqa
    %define movu movdqu
    %undef movh
    %define movnta movntdq
    INIT_CPUFLAGS %1
    DEFINE_MMREGS ymm
    AVX512_MM_PERMUTATION
%endmacro

%macro INIT_ZMM 0-1+
    %assign avx_enabled 1
    %define RESET_MM_PERMUTATION INIT_ZMM %1
    %define mmsize 64
    %define mova movdqa
    %define movu movdqu
    %undef movh
    %define movnta movntdq
    INIT_CPUFLAGS %1
    DEFINE_MMREGS zmm
    AVX512_MM_PERMUTATION
%endmacro

INIT_XMM

%macro DECLARE_MMCAST 1
    %define  mmmm%1   mm%1
    %define  mmxmm%1  mm%1
    %define  mmymm%1  mm%1
    %define  mmzmm%1  mm%1
    %define xmmmm%1   mm%1
    %define xmmxmm%1 xmm%1
    %define xmmymm%1 xmm%1
    %define xmmzmm%1 xmm%1
    %define ymmmm%1   mm%1
    %define ymmxmm%1 xmm%1
    %define ymmymm%1 ymm%1
    %define ymmzmm%1 ymm%1
    %define zmmmm%1   mm%1
    %define zmmxmm%1 xmm%1
    %define zmmymm%1 ymm%1
    %define zmmzmm%1 zmm%1
    %define xm%1 xmm %+ m%1
    %define ym%1 ymm %+ m%1
    %define zm%1 zmm %+ m%1
%endmacro

%assign i 0
%rep 32
    DECLARE_MMCAST i
    %assign i i+1
%endrep

; I often want to use macros that permute their arguments. e.g. there's no
; efficient way to implement butterfly or transpose or dct without swapping some
; arguments.
;
; I would like to not have to manually keep track of the permutations:
; If I insert a permutation in the middle of a function, it should automatically
; change everything that follows. For more complex macros I may also have multiple
; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations.
;
; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that
; permutes its arguments. It's equivalent to exchanging the contents of the
; registers, except that this way you exchange the register names instead, so it
; doesn't cost any cycles.

%macro PERMUTE 2-* ; takes a list of pairs to swap
    %rep %0/2
        %xdefine %%tmp%2 m%2
        %rotate 2
    %endrep
    %rep %0/2
        %xdefine m%1 %%tmp%2
        CAT_XDEFINE nn, m%1, %1
        %rotate 2
    %endrep
%endmacro

%macro SWAP 2+ ; swaps a single chain (sometimes more concise than pairs)
    %ifnum %1 ; SWAP 0, 1, ...
        SWAP_INTERNAL_NUM %1, %2
    %else ; SWAP m0, m1, ...
        SWAP_INTERNAL_NAME %1, %2
    %endif
%endmacro

%macro SWAP_INTERNAL_NUM 2-*
    %rep %0-1
        %xdefine %%tmp m%1
        %xdefine m%1 m%2
        %xdefine m%2 %%tmp
        CAT_XDEFINE nn, m%1, %1
        CAT_XDEFINE nn, m%2, %2
        %rotate 1
    %endrep
%endmacro

%macro SWAP_INTERNAL_NAME 2-*
    %xdefine %%args nn %+ %1
    %rep %0-1
        %xdefine %%args %%args, nn %+ %2
        %rotate 1
    %endrep
    SWAP_INTERNAL_NUM %%args
%endmacro

; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later
; calls to that function will automatically load the permutation, so values can
; be returned in mmregs.
%macro SAVE_MM_PERMUTATION 0-1
    %if %0
        %xdefine %%f %1_m
    %else
        %xdefine %%f current_function %+ _m
    %endif
    %assign %%i 0
    %rep num_mmregs
        CAT_XDEFINE %%f, %%i, m %+ %%i
        %assign %%i %%i+1
    %endrep
%endmacro

%macro LOAD_MM_PERMUTATION 1 ; name to load from
    %ifdef %1_m0
        %assign %%i 0
        %rep num_mmregs
            CAT_XDEFINE m, %%i, %1_m %+ %%i
            CAT_XDEFINE nn, m %+ %%i, %%i
            %assign %%i %%i+1
        %endrep
    %endif
%endmacro

; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't
%macro call 1
    %ifid %1
        call_internal %1 %+ SUFFIX, %1
    %else
        call %1
    %endif
%endmacro
%macro call_internal 2
    %xdefine %%i %2
    %ifndef cglobaled_%2
        %ifdef cglobaled_%1
            %xdefine %%i %1
        %endif
    %endif
    call %%i
    LOAD_MM_PERMUTATION %%i
%endmacro

; Substitutions that reduce instruction size but are functionally equivalent
%macro add 2
    %ifnum %2
        %if %2==128
            sub %1, -128
        %else
            add %1, %2
        %endif
    %else
        add %1, %2
    %endif
%endmacro

%macro sub 2
    %ifnum %2
        %if %2==128
            add %1, -128
        %else
            sub %1, %2
        %endif
    %else
        sub %1, %2
    %endif
%endmacro

;=============================================================================
; AVX abstraction layer
;=============================================================================

%assign i 0
%rep 32
    %if i < 8
        CAT_XDEFINE sizeofmm, i, 8
        CAT_XDEFINE regnumofmm, i, i
    %endif
    CAT_XDEFINE sizeofxmm, i, 16
    CAT_XDEFINE sizeofymm, i, 32
    CAT_XDEFINE sizeofzmm, i, 64
    CAT_XDEFINE regnumofxmm, i, i
    CAT_XDEFINE regnumofymm, i, i
    CAT_XDEFINE regnumofzmm, i, i
    %assign i i+1
%endrep
%undef i

%macro CHECK_AVX_INSTR_EMU 3-*
    %xdefine %%opcode %1
    %xdefine %%dst %2
    %rep %0-2
        %ifidn %%dst, %3
            %error non-avx emulation of ``%%opcode'' is not supported
        %endif
        %rotate 1
    %endrep
%endmacro

;%1 == instruction
;%2 == minimal instruction set
;%3 == 1 if float, 0 if int
;%4 == 1 if 4-operand emulation, 0 if 3-operand emulation, 255 otherwise (no emulation)
;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not
;%6+: operands
%macro RUN_AVX_INSTR 6-9+
    %ifnum sizeof%7
        %assign __sizeofreg sizeof%7
    %elifnum sizeof%6
        %assign __sizeofreg sizeof%6
    %else
        %assign __sizeofreg mmsize
    %endif
    %assign __emulate_avx 0
    %if avx_enabled && __sizeofreg >= 16
        %xdefine __instr v%1
    %else
        %xdefine __instr %1
        %if %0 >= 8+%4
            %assign __emulate_avx 1
        %endif
    %endif
    %ifnidn %2, fnord
        %ifdef cpuname
            %if notcpuflag(%2)
                %error use of ``%1'' %2 instruction in cpuname function: current_function
            %elif cpuflags_%2 < cpuflags_sse && notcpuflag(sse2) && __sizeofreg > 8
                %error use of ``%1'' sse2 instruction in cpuname function: current_function
            %endif
        %endif
    %endif

    %if __emulate_avx
        %xdefine __src1 %7
        %xdefine __src2 %8
        %if %5 && %4 == 0
            %ifnidn %6, %7
                %ifidn %6, %8
                    %xdefine __src1 %8
                    %xdefine __src2 %7
                %elifnnum sizeof%8
                    ; 3-operand AVX instructions with a memory arg can only have it in src2,
                    ; whereas SSE emulation prefers to have it in src1 (i.e. the mov).
                    ; So, if the instruction is commutative with a memory arg, swap them.
                    %xdefine __src1 %8
                    %xdefine __src2 %7
                %endif
            %endif
        %endif
        %ifnidn %6, __src1
            %if %0 >= 9
                CHECK_AVX_INSTR_EMU {%1 %6, %7, %8, %9}, %6, __src2, %9
            %else
                CHECK_AVX_INSTR_EMU {%1 %6, %7, %8}, %6, __src2
            %endif
            %if __sizeofreg == 8
                MOVQ %6, __src1
            %elif %3
                MOVAPS %6, __src1
            %else
                MOVDQA %6, __src1
            %endif
        %endif
        %if %0 >= 9
            %1 %6, __src2, %9
        %else
            %1 %6, __src2
        %endif
    %elif %0 >= 9
        __instr %6, %7, %8, %9
    %elif %0 == 8
        __instr %6, %7, %8
    %elif %0 == 7
        __instr %6, %7
    %else
        __instr %6
    %endif
%endmacro
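
; As an illustration (hypothetical, not part of this file) of what the wrappers
; defined below expand to for a 3-operand integer instruction such as paddw:
;
;     INIT_XMM avx:   paddw m0, m1, m2  ->  vpaddw xmm0, xmm1, xmm2
;     INIT_XMM sse2:  paddw m0, m1, m2  ->  movdqa xmm0, xmm1
;                                           paddw  xmm0, xmm2
;
; The SSE emulation is refused (via CHECK_AVX_INSTR_EMU above) when the
; destination is also the second source and the operands can't simply be
; swapped (non-commutative instructions), since the mov would clobber it.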

;%1 == instruction
;%2 == minimal instruction set
;%3 == 1 if float, 0 if int
;%4 == 1 if 4-operand emulation, 0 if 3-operand emulation, 255 otherwise (no emulation)
;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not
%macro AVX_INSTR 1-5 fnord, 0, 255, 0
    %macro %1 1-10 fnord, fnord, fnord, fnord, %1, %2, %3, %4, %5
        %ifidn %2, fnord
            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1
        %elifidn %3, fnord
            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2
        %elifidn %4, fnord
            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3
        %elifidn %5, fnord
            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4
        %else
            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4, %5
        %endif
    %endmacro
%endmacro

; Instructions with both VEX/EVEX and legacy encodings
; Non-destructive instructions are written without parameters
AVX_INSTR addpd, sse2, 1, 0, 1
AVX_INSTR addps, sse, 1, 0, 1
AVX_INSTR addsd, sse2, 1, 0, 0
AVX_INSTR addss, sse, 1, 0, 0
AVX_INSTR addsubpd, sse3, 1, 0, 0
AVX_INSTR addsubps, sse3, 1, 0, 0
AVX_INSTR aesdec, aesni, 0, 0, 0
AVX_INSTR aesdeclast, aesni, 0, 0, 0
AVX_INSTR aesenc, aesni, 0, 0, 0
AVX_INSTR aesenclast, aesni, 0, 0, 0
AVX_INSTR aesimc, aesni
AVX_INSTR aeskeygenassist, aesni
AVX_INSTR andnpd, sse2, 1, 0, 0
AVX_INSTR andnps, sse, 1, 0, 0
AVX_INSTR andpd, sse2, 1, 0, 1
AVX_INSTR andps, sse, 1, 0, 1
AVX_INSTR blendpd, sse4, 1, 1, 0
AVX_INSTR blendps, sse4, 1, 1, 0
AVX_INSTR blendvpd, sse4 ; can't be emulated
AVX_INSTR blendvps, sse4 ; can't be emulated
AVX_INSTR cmpeqpd, sse2, 1, 0, 1
AVX_INSTR cmpeqps, sse, 1, 0, 1
AVX_INSTR cmpeqsd, sse2, 1, 0, 0
AVX_INSTR cmpeqss, sse, 1, 0, 0
AVX_INSTR cmplepd, sse2, 1, 0, 0
AVX_INSTR cmpleps, sse, 1, 0, 0
AVX_INSTR cmplesd, sse2, 1, 0, 0
AVX_INSTR cmpless, sse, 1, 0, 0
AVX_INSTR cmpltpd, sse2, 1, 0, 0
AVX_INSTR cmpltps, sse, 1, 0, 0
AVX_INSTR cmpltsd, sse2, 1, 0, 0
AVX_INSTR cmpltss, sse, 1, 0, 0
AVX_INSTR cmpneqpd, sse2, 1, 0, 1
AVX_INSTR cmpneqps, sse, 1, 0, 1
AVX_INSTR cmpneqsd, sse2, 1, 0, 0
AVX_INSTR cmpneqss, sse, 1, 0, 0
AVX_INSTR cmpnlepd, sse2, 1, 0, 0
AVX_INSTR cmpnleps, sse, 1, 0, 0
AVX_INSTR cmpnlesd, sse2, 1, 0, 0
AVX_INSTR cmpnless, sse, 1, 0, 0
AVX_INSTR cmpnltpd, sse2, 1, 0, 0
AVX_INSTR cmpnltps, sse, 1, 0, 0
AVX_INSTR cmpnltsd, sse2, 1, 0, 0
AVX_INSTR cmpnltss, sse, 1, 0, 0
AVX_INSTR cmpordpd, sse2, 1, 0, 1
AVX_INSTR cmpordps, sse, 1, 0, 1
AVX_INSTR cmpordsd, sse2, 1, 0, 0
AVX_INSTR cmpordss, sse, 1, 0, 0
AVX_INSTR cmppd, sse2, 1, 1, 0
AVX_INSTR cmpps, sse, 1, 1, 0
AVX_INSTR cmpsd, sse2, 1, 1, 0
AVX_INSTR cmpss, sse, 1, 1, 0
AVX_INSTR cmpunordpd, sse2, 1, 0, 1
AVX_INSTR cmpunordps, sse, 1, 0, 1
AVX_INSTR cmpunordsd, sse2, 1, 0, 0
AVX_INSTR cmpunordss, sse, 1, 0, 0
AVX_INSTR comisd, sse2
AVX_INSTR comiss, sse
AVX_INSTR cvtdq2pd, sse2
AVX_INSTR cvtdq2ps, sse2
AVX_INSTR cvtpd2dq, sse2
AVX_INSTR cvtpd2ps, sse2
AVX_INSTR cvtps2dq, sse2
AVX_INSTR cvtps2pd, sse2
AVX_INSTR cvtsd2si, sse2
AVX_INSTR cvtsd2ss, sse2, 1, 0, 0
AVX_INSTR cvtsi2sd, sse2, 1, 0, 0
AVX_INSTR cvtsi2ss, sse, 1, 0, 0
AVX_INSTR cvtss2sd, sse2, 1, 0, 0
AVX_INSTR cvtss2si, sse
AVX_INSTR cvttpd2dq, sse2
AVX_INSTR cvttps2dq, sse2
AVX_INSTR cvttsd2si, sse2
AVX_INSTR cvttss2si, sse
AVX_INSTR divpd, sse2, 1, 0, 0
AVX_INSTR divps, sse, 1, 0, 0
AVX_INSTR divsd, sse2, 1, 0, 0
AVX_INSTR divss, sse, 1, 0, 0
AVX_INSTR dppd, sse4, 1, 1, 0
AVX_INSTR dpps, sse4, 1, 1, 0
AVX_INSTR extractps, sse4
AVX_INSTR haddpd, sse3, 1, 0, 0
AVX_INSTR haddps, sse3, 1, 0, 0
AVX_INSTR hsubpd, sse3, 1, 0, 0
AVX_INSTR hsubps, sse3, 1, 0, 0
AVX_INSTR insertps, sse4, 1, 1, 0
AVX_INSTR lddqu, sse3
AVX_INSTR ldmxcsr, sse
AVX_INSTR maskmovdqu, sse2
AVX_INSTR maxpd, sse2, 1, 0, 1
AVX_INSTR maxps, sse, 1, 0, 1
AVX_INSTR maxsd, sse2, 1, 0, 0
AVX_INSTR maxss, sse, 1, 0, 0
AVX_INSTR minpd, sse2, 1, 0, 1
AVX_INSTR minps, sse, 1, 0, 1
AVX_INSTR minsd, sse2, 1, 0, 0
AVX_INSTR minss, sse, 1, 0, 0
AVX_INSTR movapd, sse2
AVX_INSTR movaps, sse
AVX_INSTR movd, mmx
AVX_INSTR movddup, sse3
AVX_INSTR movdqa, sse2
AVX_INSTR movdqu, sse2
AVX_INSTR movhlps, sse, 1, 0, 0
AVX_INSTR movhpd, sse2, 1, 0, 0
AVX_INSTR movhps, sse, 1, 0, 0
AVX_INSTR movlhps, sse, 1, 0, 0
AVX_INSTR movlpd, sse2, 1, 0, 0
AVX_INSTR movlps, sse, 1, 0, 0
AVX_INSTR movmskpd, sse2
AVX_INSTR movmskps, sse
AVX_INSTR movntdq, sse2
AVX_INSTR movntdqa, sse4
AVX_INSTR movntpd, sse2
AVX_INSTR movntps, sse
AVX_INSTR movq, mmx
AVX_INSTR movsd, sse2, 1, 0, 0
AVX_INSTR movshdup, sse3
AVX_INSTR movsldup, sse3
AVX_INSTR movss, sse, 1, 0, 0
AVX_INSTR movupd, sse2
AVX_INSTR movups, sse
AVX_INSTR mpsadbw, sse4, 0, 1, 0
AVX_INSTR mulpd, sse2, 1, 0, 1
AVX_INSTR mulps, sse, 1, 0, 1
AVX_INSTR mulsd, sse2, 1, 0, 0
AVX_INSTR mulss, sse, 1, 0, 0
AVX_INSTR orpd, sse2, 1, 0, 1
AVX_INSTR orps, sse, 1, 0, 1
AVX_INSTR pabsb, ssse3
AVX_INSTR pabsd, ssse3
AVX_INSTR pabsw, ssse3
AVX_INSTR packsswb, mmx, 0, 0, 0
AVX_INSTR packssdw, mmx, 0, 0, 0
AVX_INSTR packuswb, mmx, 0, 0, 0
AVX_INSTR packusdw, sse4, 0, 0, 0
AVX_INSTR paddb, mmx, 0, 0, 1
AVX_INSTR paddw, mmx, 0, 0, 1
AVX_INSTR paddd, mmx, 0, 0, 1
AVX_INSTR paddq, sse2, 0, 0, 1
AVX_INSTR paddsb, mmx, 0, 0, 1
AVX_INSTR paddsw, mmx, 0, 0, 1
AVX_INSTR paddusb, mmx, 0, 0, 1
AVX_INSTR paddusw, mmx, 0, 0, 1
AVX_INSTR palignr, ssse3, 0, 1, 0
AVX_INSTR pand, mmx, 0, 0, 1
AVX_INSTR pandn, mmx, 0, 0, 0
AVX_INSTR pavgb, mmx2, 0, 0, 1
AVX_INSTR pavgw, mmx2, 0, 0, 1
AVX_INSTR pblendvb, sse4 ; can't be emulated
AVX_INSTR pblendw, sse4, 0, 1, 0
AVX_INSTR pclmulqdq, fnord, 0, 1, 0
AVX_INSTR pclmulhqhqdq, fnord, 0, 0, 0
AVX_INSTR pclmulhqlqdq, fnord, 0, 0, 0
AVX_INSTR pclmullqhqdq, fnord, 0, 0, 0
AVX_INSTR pclmullqlqdq, fnord, 0, 0, 0
AVX_INSTR pcmpestri, sse42
AVX_INSTR pcmpestrm, sse42
AVX_INSTR pcmpistri, sse42
AVX_INSTR pcmpistrm, sse42
AVX_INSTR pcmpeqb, mmx, 0, 0, 1
AVX_INSTR pcmpeqw, mmx, 0, 0, 1
AVX_INSTR pcmpeqd, mmx, 0, 0, 1
AVX_INSTR pcmpeqq, sse4, 0, 0, 1
AVX_INSTR pcmpgtb, mmx, 0, 0, 0
AVX_INSTR pcmpgtw, mmx, 0, 0, 0
AVX_INSTR pcmpgtd, mmx, 0, 0, 0
AVX_INSTR pcmpgtq, sse42, 0, 0, 0
AVX_INSTR pextrb, sse4
AVX_INSTR pextrd, sse4
AVX_INSTR pextrq, sse4
AVX_INSTR pextrw, mmx2
AVX_INSTR phaddw, ssse3, 0, 0, 0
AVX_INSTR phaddd, ssse3, 0, 0, 0
AVX_INSTR phaddsw, ssse3, 0, 0, 0
AVX_INSTR phminposuw, sse4
AVX_INSTR phsubw, ssse3, 0, 0, 0
AVX_INSTR phsubd, ssse3, 0, 0, 0
AVX_INSTR phsubsw, ssse3, 0, 0, 0
AVX_INSTR pinsrb, sse4, 0, 1, 0
AVX_INSTR pinsrd, sse4, 0, 1, 0
AVX_INSTR pinsrq, sse4, 0, 1, 0
AVX_INSTR pinsrw, mmx2, 0, 1, 0
AVX_INSTR pmaddwd, mmx, 0, 0, 1
AVX_INSTR pmaddubsw, ssse3, 0, 0, 0
AVX_INSTR pmaxsb, sse4, 0, 0, 1
AVX_INSTR pmaxsw, mmx2, 0, 0, 1
AVX_INSTR pmaxsd, sse4, 0, 0, 1
AVX_INSTR pmaxub, mmx2, 0, 0, 1
AVX_INSTR pmaxuw, sse4, 0, 0, 1
AVX_INSTR pmaxud, sse4, 0, 0, 1
AVX_INSTR pminsb, sse4, 0, 0, 1
AVX_INSTR pminsw, mmx2, 0, 0, 1
AVX_INSTR pminsd, sse4, 0, 0, 1
AVX_INSTR pminub, mmx2, 0, 0, 1
AVX_INSTR pminuw, sse4, 0, 0, 1
AVX_INSTR pminud, sse4, 0, 0, 1
AVX_INSTR pmovmskb, mmx2
AVX_INSTR pmovsxbw, sse4
AVX_INSTR pmovsxbd, sse4
AVX_INSTR pmovsxbq, sse4
AVX_INSTR pmovsxwd, sse4
AVX_INSTR pmovsxwq, sse4
AVX_INSTR pmovsxdq, sse4
AVX_INSTR pmovzxbw, sse4
AVX_INSTR pmovzxbd, sse4
AVX_INSTR pmovzxbq, sse4
AVX_INSTR pmovzxwd, sse4
AVX_INSTR pmovzxwq, sse4
AVX_INSTR pmovzxdq, sse4
AVX_INSTR pmuldq, sse4, 0, 0, 1
AVX_INSTR pmulhrsw, ssse3, 0, 0, 1
AVX_INSTR pmulhuw, mmx2, 0, 0, 1
AVX_INSTR pmulhw, mmx, 0, 0, 1
AVX_INSTR pmullw, mmx, 0, 0, 1
AVX_INSTR pmulld, sse4, 0, 0, 1
AVX_INSTR pmuludq, sse2, 0, 0, 1
AVX_INSTR por, mmx, 0, 0, 1
AVX_INSTR psadbw, mmx2, 0, 0, 1
AVX_INSTR pshufb, ssse3, 0, 0, 0
AVX_INSTR pshufd, sse2
AVX_INSTR pshufhw, sse2
AVX_INSTR pshuflw, sse2
AVX_INSTR psignb, ssse3, 0, 0, 0
AVX_INSTR psignw, ssse3, 0, 0, 0
AVX_INSTR psignd, ssse3, 0, 0, 0
AVX_INSTR psllw, mmx, 0, 0, 0
AVX_INSTR pslld, mmx, 0, 0, 0
AVX_INSTR psllq, mmx, 0, 0, 0
AVX_INSTR pslldq, sse2, 0, 0, 0
AVX_INSTR psraw, mmx, 0, 0, 0
AVX_INSTR psrad, mmx, 0, 0, 0
AVX_INSTR psrlw, mmx, 0, 0, 0
AVX_INSTR psrld, mmx, 0, 0, 0
AVX_INSTR psrlq, mmx, 0, 0, 0
AVX_INSTR psrldq, sse2, 0, 0, 0
AVX_INSTR psubb, mmx, 0, 0, 0
AVX_INSTR psubw, mmx, 0, 0, 0
AVX_INSTR psubd, mmx, 0, 0, 0
AVX_INSTR psubq, sse2, 0, 0, 0
AVX_INSTR psubsb, mmx, 0, 0, 0
AVX_INSTR psubsw, mmx, 0, 0, 0
AVX_INSTR psubusb, mmx, 0, 0, 0
AVX_INSTR psubusw, mmx, 0, 0, 0
AVX_INSTR ptest, sse4
AVX_INSTR punpckhbw, mmx, 0, 0, 0
AVX_INSTR punpckhwd, mmx, 0, 0, 0
AVX_INSTR punpckhdq, mmx, 0, 0, 0
AVX_INSTR punpckhqdq, sse2, 0, 0, 0
AVX_INSTR punpcklbw, mmx, 0, 0, 0
AVX_INSTR punpcklwd, mmx, 0, 0, 0
AVX_INSTR punpckldq, mmx, 0, 0, 0
AVX_INSTR punpcklqdq, sse2, 0, 0, 0
AVX_INSTR pxor, mmx, 0, 0, 1
AVX_INSTR rcpps, sse
AVX_INSTR rcpss, sse, 1, 0, 0
AVX_INSTR roundpd, sse4
AVX_INSTR roundps, sse4
AVX_INSTR roundsd, sse4, 1, 1, 0
AVX_INSTR roundss, sse4, 1, 1, 0
AVX_INSTR rsqrtps, sse
AVX_INSTR rsqrtss, sse, 1, 0, 0
AVX_INSTR shufpd, sse2, 1, 1, 0
AVX_INSTR shufps, sse, 1, 1, 0
AVX_INSTR sqrtpd, sse2
AVX_INSTR sqrtps, sse
AVX_INSTR sqrtsd, sse2, 1, 0, 0
AVX_INSTR sqrtss, sse, 1, 0, 0
AVX_INSTR stmxcsr, sse
AVX_INSTR subpd, sse2, 1, 0, 0
AVX_INSTR subps, sse, 1, 0, 0
AVX_INSTR subsd, sse2, 1, 0, 0
AVX_INSTR subss, sse, 1, 0, 0
AVX_INSTR ucomisd, sse2
AVX_INSTR ucomiss, sse
AVX_INSTR unpckhpd, sse2, 1, 0, 0
AVX_INSTR unpckhps, sse, 1, 0, 0
AVX_INSTR unpcklpd, sse2, 1, 0, 0
AVX_INSTR unpcklps, sse, 1, 0, 0
AVX_INSTR xorpd, sse2, 1, 0, 1
AVX_INSTR xorps, sse, 1, 0, 1

; 3DNow instructions, for sharing code between AVX, SSE and 3DNow
AVX_INSTR pfadd, 3dnow, 1, 0, 1
AVX_INSTR pfsub, 3dnow, 1, 0, 0
AVX_INSTR pfmul, 3dnow, 1, 0, 1

; base-4 constants for shuffles
%assign i 0
%rep 256
    %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3)
    %if j < 10
        CAT_XDEFINE q000, j, i
    %elif j < 100
        CAT_XDEFINE q00, j, i
    %elif j < 1000
        CAT_XDEFINE q0, j, i
    %else
        CAT_XDEFINE q, j, i
    %endif
    %assign i i+1
%endrep
%undef i
%undef j

%macro FMA_INSTR 3
    %macro %1 4-7 %1, %2, %3
        %if cpuflag(xop)
            v%5 %1, %2, %3, %4
        %elifnidn %1, %4
            %6 %1, %2, %3
            %7 %1, %4
        %else
            %error non-xop emulation of ``%5 %1, %2, %3, %4'' is not supported
        %endif
    %endmacro
%endmacro

FMA_INSTR  pmacsww,  pmullw, paddw
FMA_INSTR  pmacsdd,  pmulld, paddd ; sse4 emulation
FMA_INSTR pmacsdql,  pmuldq, paddq ; sse4 emulation
FMA_INSTR pmadcswd, pmaddwd, paddd

; tzcnt is equivalent to "rep bsf" and is backwards-compatible with bsf.
; This lets us use tzcnt without bumping the yasm version requirement yet.
%define tzcnt rep bsf

; Macros for consolidating FMA3 and FMA4 using 4-operand (dst, src1, src2, src3) syntax.
; FMA3 is only possible if dst is the same as one of the src registers.
; Either src2 or src3 can be a memory operand.
%macro FMA4_INSTR 2-*
    %push fma4_instr
    %xdefine %$prefix %1
    %rep %0 - 1
        %macro %$prefix%2 4-6 %$prefix, %2
            %if notcpuflag(fma3) && notcpuflag(fma4)
                %error use of ``%5%6'' fma instruction in cpuname function: current_function
            %elif cpuflag(fma4)
                v%5%6 %1, %2, %3, %4
            %elifidn %1, %2
                ; If %3 or %4 is a memory operand it needs to be encoded as the last operand.
                %ifnum sizeof%3
                    v%{5}213%6 %2, %3, %4
                %else
                    v%{5}132%6 %2, %4, %3
                %endif
            %elifidn %1, %3
                v%{5}213%6 %3, %2, %4
            %elifidn %1, %4
                v%{5}231%6 %4, %2, %3
            %else
                %error fma3 emulation of ``%5%6 %1, %2, %3, %4'' is not supported
            %endif
        %endmacro
        %rotate 1
    %endrep
    %pop
%endmacro

FMA4_INSTR fmadd,    pd, ps, sd, ss
FMA4_INSTR fmaddsub, pd, ps
FMA4_INSTR fmsub,    pd, ps, sd, ss
FMA4_INSTR fmsubadd, pd, ps
FMA4_INSTR fnmadd,   pd, ps, sd, ss
FMA4_INSTR fnmsub,   pd, ps, sd, ss
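
; For instance (a hypothetical sketch of the expansion above): with fma3
; enabled, the 4-operand form is rewritten into whichever FMA3 variant keeps
; the destination as one of its operands:
;
;     fmaddps m0, m0, m1, m2  ->  vfmadd213ps m0, m1, m2   ; dst == src1
;     fmaddps m0, m1, m2, m0  ->  vfmadd231ps m0, m1, m2   ; dst == src3
;
; With fma4 the 4-operand form maps directly to vfmaddps dst, src1, src2, src3.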

; Macros for converting VEX instructions to equivalent EVEX ones.
%macro EVEX_INSTR 2-3 0 ; vex, evex, prefer_evex
    %macro %1 2-7 fnord, fnord, %1, %2, %3
        %ifidn %3, fnord
            %define %%args %1, %2
        %elifidn %4, fnord
            %define %%args %1, %2, %3
        %else
            %define %%args %1, %2, %3, %4
        %endif
        %assign %%evex_required cpuflag(avx512) & %7
        %ifnum regnumof%1
            %if regnumof%1 >= 16 || sizeof%1 > 32
                %assign %%evex_required 1
            %endif
        %endif
        %ifnum regnumof%2
            %if regnumof%2 >= 16 || sizeof%2 > 32
                %assign %%evex_required 1
            %endif
        %endif
        %if %%evex_required
            %6 %%args
        %else
            %5 %%args ; Prefer VEX over EVEX due to shorter instruction length
        %endif
    %endmacro
%endmacro

EVEX_INSTR vbroadcastf128, vbroadcastf32x4
EVEX_INSTR vbroadcasti128, vbroadcasti32x4
EVEX_INSTR vextractf128,   vextractf32x4
EVEX_INSTR vextracti128,   vextracti32x4
EVEX_INSTR vinsertf128,    vinsertf32x4
EVEX_INSTR vinserti128,    vinserti32x4
EVEX_INSTR vmovdqa,        vmovdqa32
EVEX_INSTR vmovdqu,        vmovdqu32
EVEX_INSTR vpand,          vpandd
EVEX_INSTR vpandn,         vpandnd
EVEX_INSTR vpor,           vpord
EVEX_INSTR vpxor,          vpxord
EVEX_INSTR vrcpps,         vrcp14ps,   1 ; EVEX versions have higher precision
EVEX_INSTR vrcpss,         vrcp14ss,   1
EVEX_INSTR vrsqrtps,       vrsqrt14ps, 1
EVEX_INSTR vrsqrtss,       vrsqrt14ss, 1

; workaround: vpbroadcastq is broken in x86_32 due to a yasm bug (fixed in 1.3.0)
%ifdef __YASM_VER__
    %if __YASM_VERSION_ID__ < 0x01030000 && ARCH_X86_64 == 0
        %macro vpbroadcastq 2
            %if sizeof%1 == 16
                movddup %1, %2
            %else
                vbroadcastsd %1, %2
            %endif
        %endmacro
    %endif
%endif
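
; As a final illustration (hypothetical, not part of the table above): in an
; avx512 build the wrappers only switch to the EVEX form when it is actually
; required, e.g.
;
;     vmovdqa m0,  m1    ; xmm/ymm registers 0-15: stays VEX (shorter encoding)
;     vmovdqa m16, m17   ; registers 16-31 only exist in EVEX: becomes vmovdqa32
;     vmovdqa m0,  m1    ; under INIT_ZMM (64-byte registers): also vmovdqa32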