;
; jsimdext.inc - common declarations
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2010, 2016, 2018-2019, D. R. Commander.
; Copyright (C) 2018, Matthieu Darbois.
; Copyright (C) 2018, Matthias Räncker.
;
; Based on the x86 SIMD extension for IJG JPEG library - version 1.02
;
; Copyright (C) 1999-2006, MIYASAKA Masaru.
;
; This software is provided 'as-is', without any express or implied
; warranty. In no event will the authors be held liable for any damages
; arising from the use of this software.
;
; Permission is granted to anyone to use this software for any purpose,
; including commercial applications, and to alter it and redistribute it
; freely, subject to the following restrictions:
;
; 1. The origin of this software must not be misrepresented; you must not
;    claim that you wrote the original software. If you use this software
;    in a product, an acknowledgment in the product documentation would be
;    appreciated but is not required.
; 2. Altered source versions must be plainly marked as such, and must not be
;    misrepresented as being the original software.
; 3. This notice may not be removed or altered from any source distribution.
; ==========================================================================
;  System-dependent configurations
;
;  One branch of the %ifdef chain below is selected by the object-format
;  macro passed on the assembler command line (-DWIN32, -DWIN64, -DOBJ32,
;  -DELF, -DAOUT, -DMACHO); the final %else handles everything else.
;
;  Every branch defines:
;    SEG_TEXT    section name + attributes for the text segment
;    SEG_CONST   section name + attributes for constant data
;  Some branches additionally define:
;    EXTN(name)  external symbol spelling without a prefix (WIN64, ELF)
;    GOT_SYMBOL  symbol used by the PIC macros (ELF, AOUT, MACHO)
;    PIC         forced on (MACHO only)

%ifdef WIN32    ; ----(nasm -fwin32 -DWIN32 ...)--------
; * Microsoft Visual C++
; * MinGW (Minimalist GNU for Windows)
; * CygWin
; * LCC-Win32

; -- segment definition --
;
; Under Yasm only the alignment attribute is given; under NASM the
; public/use32/class attributes are given as well.
%ifdef __YASM_VER__
%define SEG_TEXT   .text  align=32
%define SEG_CONST  .rdata align=32
%else
%define SEG_TEXT   .text  align=32 public use32 class=CODE
%define SEG_CONST  .rdata align=32 public use32 class=CONST
%endif

%elifdef WIN64  ; ----(nasm -fwin64 -DWIN64 ...)--------
; * Microsoft Visual C++

; -- segment definition --
;
; Same Yasm/NASM split as the WIN32 branch, with use64 instead of use32.
%ifdef __YASM_VER__
%define SEG_TEXT    .text  align=32
%define SEG_CONST   .rdata align=32
%else
%define SEG_TEXT    .text  align=32 public use64 class=CODE
%define SEG_CONST   .rdata align=32 public use64 class=CONST
%endif
%define EXTN(name)  name                ; foo() -> foo

%elifdef OBJ32  ; ----(nasm -fobj -DOBJ32 ...)----------
; * Borland C++ (Win32)

; -- segment definition --
;
%define SEG_TEXT   _text align=32 public use32 class=CODE
%define SEG_CONST  _data align=32 public use32 class=DATA

%elifdef ELF    ; ----(nasm -felf[64] -DELF ...)------------
; * Linux
; * *BSD family Unix using elf format
; * Unix System V, including Solaris x86, UnixWare and SCO Unix

; mark stack as non-executable
section .note.GNU-stack noalloc noexec nowrite progbits

; -- segment definition --
;
; The 64-bit variant gives only progbits and alignment; the 32-bit variant
; spells out the alloc/exec/write attributes as well.
%ifdef __x86_64__
%define SEG_TEXT   .text   progbits align=32
%define SEG_CONST  .rodata progbits align=32
%else
%define SEG_TEXT   .text   progbits alloc exec   nowrite align=32
%define SEG_CONST  .rodata progbits alloc noexec nowrite align=32
%endif

; To make the code position-independent, append -DPIC to the command line.
;
%define GOT_SYMBOL  _GLOBAL_OFFSET_TABLE_  ; ELF supports PIC
%define EXTN(name)  name                   ; foo() -> foo

%elifdef AOUT   ; ----(nasm -faoutb/aout -DAOUT ...)----
; * Older Linux using a.out format  (nasm -f aout -DAOUT ...)
; * *BSD family Unix using a.out format  (nasm -f aoutb -DAOUT ...)

; -- segment definition --
;
%define SEG_TEXT   .text
%define SEG_CONST  .data

; To make the code position-independent, append -DPIC to the command line.
;
%define GOT_SYMBOL  __GLOBAL_OFFSET_TABLE_  ; BSD-style a.out supports PIC

%elifdef MACHO  ; ----(nasm -fmacho -DMACHO ...)--------
; * NeXTstep/OpenStep/Rhapsody/Darwin/MacOS X (Mach-O format)

; -- segment definition --
;
%define SEG_TEXT   .text  ;align=32     ; nasm doesn't accept align=32. why?
%define SEG_CONST  .rodata align=32

; The generation of position-independent code (PIC) is the default on Darwin,
; so PIC is switched on here regardless of the command line.
;
%define PIC
%define GOT_SYMBOL  _MACHO_PIC_         ; Mach-O style code-relative addressing

%else           ; ----(Other case)----------------------

; -- segment definition --
;
; No GOT_SYMBOL is defined in this branch.
%define SEG_TEXT   .text
%define SEG_CONST  .data

%endif          ; ----------------------------------------------

; ==========================================================================

; --------------------------------------------------------------------------
;  Common types
;
;  POINTER / SIZEOF_POINTER / POINTER_BIT describe a pointer-sized operand,
;  resp / dp reserve / define one pointer-sized datum, and raxp..r15p name the
;  pointer-width view of each general-purpose register.
;
;  64-bit pointers are selected only when __x86_64__ is defined AND the output
;  format is not elfx32.
%ifdef __x86_64__
%ifnidn __OUTPUT_FORMAT__, elfx32
%define POINTER         qword           ; general pointer type
%define SIZEOF_POINTER  SIZEOF_QWORD    ; sizeof(POINTER)
%define POINTER_BIT     QWORD_BIT       ; sizeof(POINTER)*BYTE_BIT
%define resp            resq
%define dp              dq
%define raxp            rax
%define rbxp            rbx
%define rcxp            rcx
%define rdxp            rdx
%define rsip            rsi
%define rdip            rdi
%define rbpp            rbp
%define rspp            rsp
%define r8p             r8
%define r9p             r9
%define r10p            r10
%define r11p            r11
%define r12p            r12
%define r13p            r13
%define r14p            r14
%define r15p            r15
%endif
%endif
; raxp is defined only by the 64-bit-pointer branch above, so its absence
; means 32-bit pointers must be used.  This test must stay after that branch.
%ifndef raxp
%define POINTER         dword           ; general pointer type
%define SIZEOF_POINTER  SIZEOF_DWORD    ; sizeof(POINTER)
%define POINTER_BIT     DWORD_BIT       ; sizeof(POINTER)*BYTE_BIT
%define resp            resd
%define dp              dd
; x86_64 ILP32 ABI (x32): pointer-width registers are the 32-bit views.
%define raxp            eax
%define rbxp            ebx
%define rcxp            ecx
%define rdxp            edx
%define rsip            esi
%define rdip            edi
%define rbpp            ebp
%define rspp            esp
%define r8p             r8d
%define r9p             r9d
%define r10p            r10d
%define r11p            r11d
%define r12p            r12d
%define r13p            r13d
%define r14p            r14d
%define r15p            r15d
%endif

%define INT             dword           ; signed integer type
%define SIZEOF_INT      SIZEOF_DWORD    ; sizeof(INT)
%define INT_BIT         DWORD_BIT       ; sizeof(INT)*BYTE_BIT

%define FP32            dword           ; IEEE754 single
%define SIZEOF_FP32     SIZEOF_DWORD    ; sizeof(FP32)
%define FP32_BIT        DWORD_BIT       ; sizeof(FP32)*BYTE_BIT

%define MMWORD          qword           ; int64  (MMX register)
%define SIZEOF_MMWORD   SIZEOF_QWORD    ; sizeof(MMWORD)
%define MMWORD_BIT      QWORD_BIT       ; sizeof(MMWORD)*BYTE_BIT

; NASM is buggy and doesn't properly handle operand sizes for SSE
; instructions, so for now we have to define XMMWORD as blank.
%define XMMWORD                         ; int128 (SSE register)
%define SIZEOF_XMMWORD  SIZEOF_OWORD    ; sizeof(XMMWORD)
%define XMMWORD_BIT     OWORD_BIT       ; sizeof(XMMWORD)*BYTE_BIT

; YMMWORD is likewise defined as blank.
%define YMMWORD                         ; int256 (AVX register)
%define SIZEOF_YMMWORD  SIZEOF_YWORD    ; sizeof(YMMWORD)
%define YMMWORD_BIT     YWORD_BIT       ; sizeof(YMMWORD)*BYTE_BIT

; Similar hacks for when we load a dword or MMWORD into an xmm# register
; (both expand to nothing).
%define XMM_DWORD
%define XMM_MMWORD

; Sizes in bytes.
%define SIZEOF_BYTE   1                 ; sizeof(byte)
%define SIZEOF_WORD   2                 ; sizeof(word)
%define SIZEOF_DWORD  4                 ; sizeof(dword)
%define SIZEOF_QWORD  8                 ; sizeof(qword)
%define SIZEOF_OWORD  16                ; sizeof(oword)
%define SIZEOF_YWORD  32                ; sizeof(yword)

; Sizes in bits.
%define BYTE_BIT      8                 ; CHAR_BIT in C
%define WORD_BIT      16                ; sizeof(word)*BYTE_BIT
%define DWORD_BIT     32                ; sizeof(dword)*BYTE_BIT
%define QWORD_BIT     64                ; sizeof(qword)*BYTE_BIT
%define OWORD_BIT     128               ; sizeof(oword)*BYTE_BIT
%define YWORD_BIT     256               ; sizeof(yword)*BYTE_BIT
; --------------------------------------------------------------------------
;  External Symbol Name
;
;  Default spelling, used when the platform branch did not define EXTN:
;  prepend an underscore to the C name.
%ifndef EXTN
%define EXTN(name)  _ %+ name           ; foo() -> _foo
%endif

; --------------------------------------------------------------------------
;  Hidden symbols
;
;  GLOBAL_FUNCTION(name) / GLOBAL_DATA(name) expand to a "global" directive
;  for EXTN(name).  ELF adds ":function hidden" / ":data hidden"; Mach-O adds
;  ":private_extern" under Yasm, or under NASM when __NASM_VERSION_ID__ is at
;  least 0x020E0000.  In every other case the plain fallbacks further down
;  are used.
%ifdef ELF      ; ----(nasm -felf[64] -DELF ...)--------
%define GLOBAL_FUNCTION(name)  global EXTN(name):function hidden
%define GLOBAL_DATA(name)      global EXTN(name):data hidden
%elifdef MACHO  ; ----(nasm -fmacho -DMACHO ...)--------
%ifdef __YASM_VER__
%define GLOBAL_FUNCTION(name)  global EXTN(name):private_extern
%define GLOBAL_DATA(name)      global EXTN(name):private_extern
%else
%if __NASM_VERSION_ID__ >= 0x020E0000
%define GLOBAL_FUNCTION(name)  global EXTN(name):private_extern
%define GLOBAL_DATA(name)      global EXTN(name):private_extern
%endif
%endif
%endif

; Fallbacks: plain global symbol without a visibility attribute.
%ifndef GLOBAL_FUNCTION
%define GLOBAL_FUNCTION(name)  global EXTN(name)
%endif
%ifndef GLOBAL_DATA
%define GLOBAL_DATA(name)      global EXTN(name)
%endif

; --------------------------------------------------------------------------
;  Macros for position-independent code (PIC) support
;
;  Interface (defined in every configuration):
;    GOTOFF(got, sym)  address expression for sym, given base register got
;    get_GOT reg       load the PIC base into reg
;    pushpic reg / poppic reg / movpic dst, src
;                      push / pop / mov that exist only in PIC builds
;
;  PIC is only honoured when the platform branch defined GOT_SYMBOL.
%ifndef GOT_SYMBOL
%undef PIC
%endif

%ifdef PIC  ; -------------------------------------------

%ifidn GOT_SYMBOL, _MACHO_PIC_  ; --------------------

; At present, nasm doesn't seem to support PIC generation for Mach-O.
; The PIC support code below is a little tricky.
;
; Note that including this file in this configuration switches to the
; SEG_CONST section and places the label const_base there; GOTOFF then
; addresses symbols relative to const_base.

    SECTION     SEG_CONST
const_base:

%define GOTOFF(got, sym)  (got) + (sym) - const_base

; get_GOT reg (Mach-O flavour): reg = run-time address of const_base.
; Uses 32-bit registers (esp, ecx, ebp); ebp is saved and restored around the
; address computation.
%imacro get_GOT 1
    ; NOTE: this macro destroys ecx register.
    call        %%geteip                ; ecx = address of the following add
    add         ecx, byte (%%ref - $)   ; ecx = address of %%ref
    jmp         short %%adjust
%%geteip:
    mov         ecx, POINTER [esp]      ; fetch the return address
    ret
%%adjust:
    push        ebp
    xor         ebp, ebp                ; ebp = 0
%ifidni %1, ebx  ; (%1 == ebx)
    ; The two db bytes are followed by the encoding of "jmp near const_base"
    ; (E9 + 32-bit displacement relative to %%ref), so the combined bytes are
    ; a single lea instruction, as described below:
    ; db 0x8D,0x9C + jmp near const_base =
    ;   lea ebx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,9C,E9,(offset32)
    db          0x8D, 0x9C              ; 8D,9C
    jmp         near const_base         ; E9,(const_base-%%ref)
%%ref:
%else  ; (%1 != ebx)
    ; Same trick, but the result goes to ecx and is then copied to %1:
    ; db 0x8D,0x8C + jmp near const_base =
    ;   lea ecx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,8C,E9,(offset32)
    db          0x8D, 0x8C              ; 8D,8C
    jmp         near const_base         ; E9,(const_base-%%ref)
%%ref:
    mov         %1, ecx
%endif  ; (%1 == ebx)
    pop         ebp
%endmacro

%else     ; GOT_SYMBOL != _MACHO_PIC_ ----------------

%define GOTOFF(got, sym)  (got) + (sym) wrt ..gotoff

; get_GOT reg (GOT flavour): declares GOT_SYMBOL as extern, fetches the
; return address of a local call into reg and adds the ..gotpc-relative
; offset of GOT_SYMBOL to it.
%imacro get_GOT 1
    extern      GOT_SYMBOL
    call        %%geteip                ; %1 = address of the following add
    add         %1, GOT_SYMBOL + $$ - $ wrt ..gotpc
    jmp         short %%done
%%geteip:
    mov         %1, POINTER [esp]       ; fetch the return address
    ret
%%done:
%endmacro

%endif    ; GOT_SYMBOL == _MACHO_PIC_ ----------------

; In PIC builds these are ordinary push / pop / mov instructions.
%imacro pushpic 1.nolist
    push        %1
%endmacro
%imacro poppic  1.nolist
    pop         %1
%endmacro
%imacro movpic  2.nolist
    mov         %1, %2
%endmacro

%else    ; !PIC -----------------------------------------

; Without PIC a symbol is addressed directly and the helper macros expand
; to nothing.
%define GOTOFF(got, sym)  (sym)

%imacro get_GOT 1.nolist
%endmacro
%imacro pushpic 1.nolist
%endmacro
%imacro poppic  1.nolist
%endmacro
%imacro movpic  2.nolist
%endmacro

%endif   ;  PIC -----------------------------------------

; --------------------------------------------------------------------------
;  Align the next instruction on {2,4,8,16,..}-byte boundary.
;  ".balign n,,m" in GNU as
;
;  MSKLE(x, y): evaluates to a non-zero mask (low-order bits all set) when
;               (x & 0xFFFF) <= (y & 0xFFFF), and to 0 otherwise.
;  FILLB(b, n): number of bytes from position b up to the next n-byte
;               boundary, measured from the section start ($$); n is expected
;               to be a power of two because it is used as the mask (n)-1.
;
%define MSKLE(x, y)  (~(((y) & 0xFFFF) - ((x) & 0xFFFF)) >> 16)
%define FILLB(b, n)  (($$-(b)) & ((n)-1))

;  alignx n [, max]
;    n    alignment boundary in bytes
;    max  largest gap that will be padded (default 0xFFFF); when the gap at
;         the start of the macro is larger, every count below evaluates to 0
;         and nothing is emitted.
;
;  Each "times <count> db ..." statement re-evaluates the remaining gap with
;  FILLB($, n):
;    - a remaining gap of 16 bytes or more is filled entirely with 0x90;
;    - otherwise the count is (remaining gap) / k, so the byte sequence on
;      that statement is emitted once at least k bytes remain.
;
;  NOTE(review): the filler sequences are annotated as 32-bit lea/mov
;  instructions on ebx/ebp.  Executed in 64-bit mode, writes to 32-bit
;  registers clear the upper register halves, so this macro is presumably
;  meant for 32-bit code only - confirm before using it in 64-bit code.
;
;  Do not add comments on the lines ending in "\": the backslash joins each
;  "times" expression with the "db" on the following line.
%imacro alignx 1-2.nolist 0xFFFF
%%bs: \
  times MSKLE(FILLB(%%bs, %1), %2) & MSKLE(16, FILLB($, %1)) & FILLB($, %1) \
        db 0x90                                      ; nop
  times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 9 \
        db 0x8D, 0x9C, 0x23, 0x00, 0x00, 0x00, 0x00  ; lea ebx,[ebx+0x00000000]
  times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 7 \
        db 0x8D, 0xAC, 0x25, 0x00, 0x00, 0x00, 0x00  ; lea ebp,[ebp+0x00000000]
  times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 6 \
        db 0x8D, 0xAD, 0x00, 0x00, 0x00, 0x00        ; lea ebp,[ebp+0x00000000]
  times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 4 \
        db 0x8D, 0x6C, 0x25, 0x00                    ; lea ebp,[ebp+0x00]
  times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 3 \
        db 0x8D, 0x6D, 0x00                          ; lea ebp,[ebp+0x00]
  times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 2 \
        db 0x8B, 0xED                                ; mov ebp,ebp
  times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 1 \
        db 0x90                                      ; nop
%endmacro

; Align the next data on {2,4,8,16,..}-byte boundary.
;
; alignz n: pad with zero bytes up to the next n-byte boundary.
;
%imacro alignz 1.nolist
    align       %1, db 0                ; pad bytes are 0x00
%endmacro

; --------------------------------------------------------------------------
;  64-bit argument helpers (only defined when __x86_64__ is set)
;
;  collect_args n    copy the first n (1..6) integer arguments into
;                    r10, r11, r12, r13, r14, r15 (in that order)
;  uncollect_args n  undo the stack effects of collect_args n
;  push_xmm n        save the first n of xmm8..xmm11 (Win64 only)
;  pop_xmm n         restore what push_xmm n saved (Win64 only)
;
;  uncollect_args / pop_xmm must be given the same n as the matching
;  collect_args / push_xmm, since n decides how much stack is used.
%ifdef __x86_64__

%ifdef WIN64

; Win64 flavour.  Stack after expansion, from higher to lower addresses:
; xmm6, xmm7, then r12..r15 (only those needed for n), then rsi, rdi.
;
; NOTE(review): the movaps stores need rsp to be 16-byte aligned when the
; macro is expanded, and arguments 5 and 6 are read relative to rax, which
; this macro does not set up - both are presumably arranged by the caller's
; prologue; confirm at the call sites.
%imacro collect_args 1
    sub         rsp, SIZEOF_XMMWORD
    movaps      XMMWORD [rsp], xmm6     ; save xmm6
    sub         rsp, SIZEOF_XMMWORD
    movaps      XMMWORD [rsp], xmm7     ; save xmm7
    mov         r10, rcx                ; argument 1
%if %1 >= 2
    mov         r11, rdx                ; argument 2
%endif
%if %1 >= 3
    push        r12
    mov         r12, r8                 ; argument 3
%endif
%if %1 >= 4
    push        r13
    mov         r13, r9                 ; argument 4
%endif
%if %1 >= 5
    push        r14
    mov         r14, [rax+48]           ; argument 5, read via rax
%endif
%if %1 >= 6
    push        r15
    mov         r15, [rax+56]           ; argument 6, read via rax
%endif
    push        rsi
    push        rdi
%endmacro

; Pops everything collect_args pushed, in reverse order, then reloads
; xmm7 and xmm6 and releases their stack slots.
%imacro uncollect_args 1
    pop         rdi
    pop         rsi
%if %1 >= 6
    pop         r15
%endif
%if %1 >= 5
    pop         r14
%endif
%if %1 >= 4
    pop         r13
%endif
%if %1 >= 3
    pop         r12
%endif
    movaps      xmm7, XMMWORD [rsp]
    add         rsp, SIZEOF_XMMWORD
    movaps      xmm6, XMMWORD [rsp]
    add         rsp, SIZEOF_XMMWORD
%endmacro

; Reserves n 16-byte slots and stores xmm8 (always), then xmm9, xmm10 and
; xmm11 when n is at least 2, 3 and 4 respectively.
; NOTE(review): xmm8 is stored unconditionally, so n is presumably never 0.
%imacro push_xmm 1
    sub         rsp, %1 * SIZEOF_XMMWORD
    movaps      XMMWORD [rsp+0*SIZEOF_XMMWORD], xmm8
%if %1 >= 2
    movaps      XMMWORD [rsp+1*SIZEOF_XMMWORD], xmm9
%endif
%if %1 >= 3
    movaps      XMMWORD [rsp+2*SIZEOF_XMMWORD], xmm10
%endif
%if %1 >= 4
    movaps      XMMWORD [rsp+3*SIZEOF_XMMWORD], xmm11
%endif
%endmacro

; Reloads the registers stored by push_xmm n and releases the n slots.
%imacro pop_xmm 1
    movaps      xmm8, XMMWORD [rsp+0*SIZEOF_XMMWORD]
%if %1 >= 2
    movaps      xmm9, XMMWORD [rsp+1*SIZEOF_XMMWORD]
%endif
%if %1 >= 3
    movaps      xmm10, XMMWORD [rsp+2*SIZEOF_XMMWORD]
%endif
%if %1 >= 4
    movaps      xmm11, XMMWORD [rsp+3*SIZEOF_XMMWORD]
%endif
    add         rsp, %1 * SIZEOF_XMMWORD
%endmacro

%else

; Non-Win64 flavour: the arguments arrive in rdi, rsi, rdx, rcx, r8, r9.
; Each destination register is pushed before it is overwritten.
%imacro collect_args 1
    push        r10
    mov         r10, rdi                ; argument 1
%if %1 >= 2
    push        r11
    mov         r11, rsi                ; argument 2
%endif
%if %1 >= 3
    push        r12
    mov         r12, rdx                ; argument 3
%endif
%if %1 >= 4
    push        r13
    mov         r13, rcx                ; argument 4
%endif
%if %1 >= 5
    push        r14
    mov         r14, r8                 ; argument 5
%endif
%if %1 >= 6
    push        r15
    mov         r15, r9                 ; argument 6
%endif
%endmacro

; Restores r15..r10 in the reverse of the order collect_args pushed them.
%imacro uncollect_args 1
%if %1 >= 6
    pop         r15
%endif
%if %1 >= 5
    pop         r14
%endif
%if %1 >= 4
    pop         r13
%endif
%if %1 >= 3
    pop         r12
%endif
%if %1 >= 2
    pop         r11
%endif
    pop         r10
%endmacro

; push_xmm / pop_xmm expand to nothing in this configuration.
%imacro push_xmm 1
%endmacro

%imacro pop_xmm 1
%endmacro

%endif

%endif

; --------------------------------------------------------------------------
;  Defines picked up from the C headers
;
%include "jsimdcfg.inc"

; --------------------------------------------------------------------------