;
; jsimdext.inc - common declarations
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2010, 2016, 2018-2019, D. R. Commander.
; Copyright (C) 2018, Matthieu Darbois.
; Copyright (C) 2018, Matthias Räncker.
;
; Based on the x86 SIMD extension for IJG JPEG library - version 1.02
;
; Copyright (C) 1999-2006, MIYASAKA Masaru.
;
; This software is provided 'as-is', without any express or implied
; warranty.  In no event will the authors be held liable for any damages
; arising from the use of this software.
;
; Permission is granted to anyone to use this software for any purpose,
; including commercial applications, and to alter it and redistribute it
; freely, subject to the following restrictions:
;
; 1. The origin of this software must not be misrepresented; you must not
;    claim that you wrote the original software. If you use this software
;    in a product, an acknowledgment in the product documentation would be
;    appreciated but is not required.
; 2. Altered source versions must be plainly marked as such, and must not be
;    misrepresented as being the original software.
; 3. This notice may not be removed or altered from any source distribution.

; ==========================================================================
;  System-dependent configurations
;
;  The branch taken is chosen by a symbol given with -D on the assembler
;  command line (an example invocation is shown on each branch).  Every
;  branch defines:
;    SEG_TEXT   - section name/attributes for code
;    SEG_CONST  - section name/attributes for read-only constants
;  and, depending on the object format, may also define:
;    EXTN(name) - external symbol decoration (when no underscore is wanted)
;    GOT_SYMBOL - base symbol used by the PIC macros further down
;    PIC        - forced on for Mach-O

%ifdef WIN32    ; ----(nasm -fwin32 -DWIN32 ...)--------
; * Microsoft Visual C++
; * MinGW (Minimalist GNU for Windows)
; * CygWin
; * LCC-Win32

; -- segment definition --
;
; YASM gets the short form; the public/use32/class qualifiers are only
; emitted for NASM.
%ifdef __YASM_VER__
%define SEG_TEXT   .text  align=32
%define SEG_CONST  .rdata align=32
%else
%define SEG_TEXT   .text  align=32 public use32 class=CODE
%define SEG_CONST  .rdata align=32 public use32 class=CONST
%endif

%elifdef WIN64  ; ----(nasm -fwin64 -DWIN64 ...)--------
; * Microsoft Visual C++

; -- segment definition --
;
%ifdef __YASM_VER__
%define SEG_TEXT    .text  align=32
%define SEG_CONST   .rdata align=32
%else
%define SEG_TEXT    .text  align=32 public use64 class=CODE
%define SEG_CONST   .rdata align=32 public use64 class=CONST
%endif
%define EXTN(name)  name                ; foo() -> foo

%elifdef OBJ32  ; ----(nasm -fobj -DOBJ32 ...)----------
; * Borland C++ (Win32)

; -- segment definition --
;
%define SEG_TEXT   _text align=32 public use32 class=CODE
%define SEG_CONST  _data align=32 public use32 class=DATA

%elifdef ELF    ; ----(nasm -felf[64] -DELF ...)------------
; * Linux
; * *BSD family Unix using elf format
; * Unix System V, including Solaris x86, UnixWare and SCO Unix

; mark stack as non-executable
section .note.GNU-stack noalloc noexec nowrite progbits

; -- segment definition --
;
%ifdef __x86_64__
%define SEG_TEXT   .text   progbits align=32
%define SEG_CONST  .rodata progbits align=32
%else
%define SEG_TEXT   .text   progbits alloc exec   nowrite align=32
%define SEG_CONST  .rodata progbits alloc noexec nowrite align=32
%endif

; To make the code position-independent, append -DPIC to the commandline
;
%define GOT_SYMBOL  _GLOBAL_OFFSET_TABLE_  ; ELF supports PIC
%define EXTN(name)  name                   ; foo() -> foo

%elifdef AOUT   ; ----(nasm -faoutb/aout -DAOUT ...)----
; * Older Linux using a.out format  (nasm -f aout -DAOUT ...)
; * *BSD family Unix using a.out format  (nasm -f aoutb -DAOUT ...)

; -- segment definition --
;
%define SEG_TEXT   .text
%define SEG_CONST  .data

; To make the code position-independent, append -DPIC to the commandline
;
%define GOT_SYMBOL  __GLOBAL_OFFSET_TABLE_  ; BSD-style a.out supports PIC

%elifdef MACHO  ; ----(nasm -fmacho -DMACHO ...)--------
; * NeXTstep/OpenStep/Rhapsody/Darwin/MacOS X (Mach-O format)

; -- segment definition --
;
%define SEG_TEXT   .text  ;align=32     ; nasm doesn't accept align=32. why?
%define SEG_CONST  .rodata align=32

; The generation of position-independent code (PIC) is the default on Darwin.
;
%define PIC
%define GOT_SYMBOL  _MACHO_PIC_         ; Mach-O style code-relative addressing

%else           ; ----(Other case)----------------------

; -- segment definition --
;
; No GOT_SYMBOL is defined here, so PIC is disabled for unknown formats.
%define SEG_TEXT   .text
%define SEG_CONST  .data

%endif          ; ----------------------------------------------

; ==========================================================================

; --------------------------------------------------------------------------
;  Common types
;
;  POINTER / SIZEOF_POINTER / POINTER_BIT, the reserve/define pseudo-ops
;  resp / dp, and the pointer-width register aliases r??p are all chosen
;  together so that they agree on one pointer size:
;    * 64-bit when __x86_64__ is defined and the output format is not elfx32
;    * 32-bit otherwise
;  "raxp" doubles as the marker for "the 64-bit set was already defined".
;
%ifdef __x86_64__
%ifnidn __OUTPUT_FORMAT__, elfx32
%define POINTER         qword           ; general pointer type
%define SIZEOF_POINTER  SIZEOF_QWORD    ; sizeof(POINTER)
%define POINTER_BIT     QWORD_BIT       ; sizeof(POINTER)*BYTE_BIT
%define resp            resq
%define dp              dq
%define raxp            rax
%define rbxp            rbx
%define rcxp            rcx
%define rdxp            rdx
%define rsip            rsi
%define rdip            rdi
%define rbpp            rbp
%define rspp            rsp
%define r8p             r8
%define r9p             r9
%define r10p            r10
%define r11p            r11
%define r12p            r12
%define r13p            r13
%define r14p            r14
%define r15p            r15
%endif
%endif
%ifndef raxp
; 32-bit pointers: taken when __x86_64__ is not defined, or when building
; for the x86_64 ILP32 ABI (x32, output format elfx32).
%define POINTER         dword           ; general pointer type
%define SIZEOF_POINTER  SIZEOF_DWORD    ; sizeof(POINTER)
%define POINTER_BIT     DWORD_BIT       ; sizeof(POINTER)*BYTE_BIT
%define resp            resd
%define dp              dd
%define raxp            eax
%define rbxp            ebx
%define rcxp            ecx
%define rdxp            edx
%define rsip            esi
%define rdip            edi
%define rbpp            ebp
%define rspp            esp
%define r8p             r8d
%define r9p             r9d
%define r10p            r10d
%define r11p            r11d
%define r12p            r12d
%define r13p            r13d
%define r14p            r14d
%define r15p            r15d
%endif

; Scalar and SIMD operand-size names.  Each type comes as a triple:
;   NAME         - operand-size keyword to put in front of a memory operand
;   SIZEOF_NAME  - size in bytes
;   NAME_BIT     - size in bits
; The SIZEOF_* / *_BIT primitives they refer to are defined at the end of
; this group.

%define INT             dword           ; signed integer type
%define SIZEOF_INT      SIZEOF_DWORD    ; sizeof(INT)
%define INT_BIT         DWORD_BIT       ; sizeof(INT)*BYTE_BIT

%define FP32            dword           ; IEEE754 single
%define SIZEOF_FP32     SIZEOF_DWORD    ; sizeof(FP32)
%define FP32_BIT        DWORD_BIT       ; sizeof(FP32)*BYTE_BIT

%define MMWORD          qword           ; int64  (MMX register)
%define SIZEOF_MMWORD   SIZEOF_QWORD    ; sizeof(MMWORD)
%define MMWORD_BIT      QWORD_BIT       ; sizeof(MMWORD)*BYTE_BIT

; NASM is buggy and doesn't properly handle operand sizes for SSE
; instructions, so for now we have to define XMMWORD as blank.
%define XMMWORD                         ; int128 (SSE register)
%define SIZEOF_XMMWORD  SIZEOF_OWORD    ; sizeof(XMMWORD)
%define XMMWORD_BIT     OWORD_BIT       ; sizeof(XMMWORD)*BYTE_BIT

; YMMWORD is likewise defined as blank.
%define YMMWORD                         ; int256 (AVX register)
%define SIZEOF_YMMWORD  SIZEOF_YWORD    ; sizeof(YMMWORD)
%define YMMWORD_BIT     YWORD_BIT       ; sizeof(YMMWORD)*BYTE_BIT

; Similar hacks for when we load a dword or MMWORD into an xmm# register
%define XMM_DWORD
%define XMM_MMWORD

; Primitive sizes in bytes.
%define SIZEOF_BYTE   1                 ; sizeof(byte)
%define SIZEOF_WORD   2                 ; sizeof(word)
%define SIZEOF_DWORD  4                 ; sizeof(dword)
%define SIZEOF_QWORD  8                 ; sizeof(qword)
%define SIZEOF_OWORD  16                ; sizeof(oword)
%define SIZEOF_YWORD  32                ; sizeof(yword)

; Primitive sizes in bits.
%define BYTE_BIT      8                 ; CHAR_BIT in C
%define WORD_BIT      16                ; sizeof(word)*BYTE_BIT
%define DWORD_BIT     32                ; sizeof(dword)*BYTE_BIT
%define QWORD_BIT     64                ; sizeof(qword)*BYTE_BIT
%define OWORD_BIT     128               ; sizeof(oword)*BYTE_BIT
%define YWORD_BIT     256               ; sizeof(yword)*BYTE_BIT

; --------------------------------------------------------------------------
;  External Symbol Name
;
;  EXTN(name) yields the linker-visible spelling of a C symbol.  This is the
;  fallback, used only when the system-dependent section did not already
;  define EXTN: it prepends a single underscore.
;
%ifndef EXTN
%define EXTN(name)  _ %+ name           ; foo() -> _foo
%endif

; --------------------------------------------------------------------------
;  Hidden symbols
;
;  GLOBAL_FUNCTION(name) / GLOBAL_DATA(name) export EXTN(name) and, where
;  the object format and assembler allow it, add a visibility qualifier:
;    ELF     - ":function hidden" / ":data hidden"
;    Mach-O  - ":private_extern" (YASM, or NASM with version ID >= 0x020E0000,
;              i.e. 2.14)
;  In every other case they fall back to a plain "global".
;
%ifdef ELF      ; ----(nasm -felf[64] -DELF ...)--------
%define GLOBAL_FUNCTION(name)  global EXTN(name):function hidden
%define GLOBAL_DATA(name)      global EXTN(name):data hidden
%elifdef MACHO  ; ----(nasm -fmacho -DMACHO ...)--------
%ifdef __YASM_VER__
%define GLOBAL_FUNCTION(name)  global EXTN(name):private_extern
%define GLOBAL_DATA(name)      global EXTN(name):private_extern
%else
%if __NASM_VERSION_ID__ >= 0x020E0000
%define GLOBAL_FUNCTION(name)  global EXTN(name):private_extern
%define GLOBAL_DATA(name)      global EXTN(name):private_extern
%endif
%endif
%endif

; Fallbacks: plain export without a visibility qualifier.
%ifndef GLOBAL_FUNCTION
%define GLOBAL_FUNCTION(name)  global EXTN(name)
%endif
%ifndef GLOBAL_DATA
%define GLOBAL_DATA(name)      global EXTN(name)
%endif

; --------------------------------------------------------------------------
;  Macros for position-independent code (PIC) support
;
;  Interface (identical in all three variants below):
;    GOTOFF(got, sym) - address expression for "sym", given the register
;                       "got" that get_GOT loaded
;    get_GOT reg      - load the PIC base into "reg"
;    pushpic reg      - push "reg"            (emits nothing without PIC)
;    poppic  reg      - pop "reg"             (emits nothing without PIC)
;    movpic  dst, src - mov dst, src          (emits nothing without PIC)
;
;  PIC is only honoured when the object format provided a GOT_SYMBOL.
;
%ifndef GOT_SYMBOL
%undef PIC
%endif

%ifdef PIC  ; -------------------------------------------

%ifidn GOT_SYMBOL, _MACHO_PIC_  ; --------------------

; At present, nasm doesn't seem to support PIC generation for Mach-O.
; The PIC support code below is a little tricky.
;
; Instead of a GOT, the start of the constant section (const_base) is used
; as the base address, and symbols are addressed relative to it.

    SECTION     SEG_CONST
const_base:

%define GOTOFF(got, sym)  (got) + (sym) - const_base

; get_GOT reg
;   Loads the run-time address of const_base into "reg".
;   Clobbers ecx (unless "reg" is ecx, in which case it holds the result).
;   ebp is saved and restored around the sequence.
%imacro get_GOT 1
    ; NOTE: this macro destroys the ecx register.
    call        %%geteip                ; pushes the address of the next insn
    add         ecx, byte (%%ref - $)   ; ecx = run-time address of %%ref
    jmp         short %%adjust
%%geteip:
    mov         ecx, POINTER [esp]      ; ecx = return address
    ret
%%adjust:
    push        ebp
    xor         ebp, ebp                ; ebp = 0
    ; The "lea" below is assembled by hand: its two leading bytes are
    ; emitted with db, and the following "jmp near const_base" supplies
    ; the SIB byte (0xE9) and the 32-bit displacement (const_base - %%ref).
    ; The jmp is never executed as a jump.
%ifidni %1, ebx  ; (%1 == ebx)
    ; db 0x8D,0x9C + jmp near const_base =
    ;   lea ebx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,9C,E9,(offset32)
    db          0x8D, 0x9C              ; 8D,9C
    jmp         near const_base         ; E9,(const_base-%%ref)
%%ref:
%else  ; (%1 != ebx)
    ; db 0x8D,0x8C + jmp near const_base =
    ;   lea ecx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,8C,E9,(offset32)
    db          0x8D, 0x8C              ; 8D,8C
    jmp         near const_base         ; E9,(const_base-%%ref)
%%ref:
    mov         %1, ecx
%endif  ; (%1 == ebx)
    pop         ebp
%endmacro

%else     ; GOT_SYMBOL != _MACHO_PIC_ ----------------

%define GOTOFF(got, sym)  (got) + (sym) wrt ..gotoff

; get_GOT reg
;   Loads the address of GOT_SYMBOL into "reg".
;   The call/ret pair fetches the address of the "add" instruction into
;   "reg"; the add then turns it into the GOT address.
%imacro get_GOT 1
    extern      GOT_SYMBOL
    call        %%geteip                ; pushes the address of the add below
    add         %1, GOT_SYMBOL + $$ - $ wrt ..gotpc
    jmp         short %%done            ; skip over the helper
%%geteip:
    mov         %1, POINTER [esp]       ; %1 = return address
    ret
%%done:
%endmacro

%endif    ; GOT_SYMBOL == _MACHO_PIC_ ----------------

%imacro pushpic 1.nolist
    push        %1
%endmacro
%imacro poppic  1.nolist
    pop         %1
%endmacro
%imacro movpic  2.nolist
    mov         %1, %2
%endmacro

%else    ; !PIC -----------------------------------------

; Without PIC, symbols are addressed directly and the helper macros
; expand to nothing.
%define GOTOFF(got, sym)  (sym)

%imacro get_GOT 1.nolist
%endmacro
%imacro pushpic 1.nolist
%endmacro
%imacro poppic  1.nolist
%endmacro
%imacro movpic  2.nolist
%endmacro

%endif   ;  PIC -----------------------------------------

; --------------------------------------------------------------------------
;  Align the next instruction on {2,4,8,16,..}-byte boundary.
;  ".balign n,,m" in GNU as
;
;  Helpers:
;    MSKLE(x, y)  - non-zero bit mask when x <= y, 0 when x > y
;                   (both operands are taken modulo 0x10000)
;    FILLB(b, n)  - number of bytes from position b up to the next
;                   n-byte boundary, measured from the section start ($$);
;                   n must be a power of two
;
;  alignx n [, max]
;    n    - alignment in bytes
;    max  - largest amount of padding allowed (default 0xFFFF); when more
;           would be needed, nothing is emitted
;
;  Padding selection, evaluated top to bottom on the bytes still missing:
;    >= 16 bytes : filled entirely with single-byte nops
;    9..15       : one 7-byte lea, remainder handled by the lines below
;    7..8        : one 7-byte lea
;    6           : one 6-byte lea
;    4..5        : one 4-byte lea
;    3           : one 3-byte lea
;    2           : mov ebp,ebp
;    1           : nop
;
;  NOTE(review): the lea/mov fillers are 32-bit-destination encodings.  In
;  64-bit mode a write to a 32-bit register zeroes the upper half of the
;  full register, so these fillers would not be true no-ops if executed
;  there - confirm that x86-64 users only place alignx where the padding is
;  not executed, or only rely on the nop cases.
;
%define MSKLE(x, y)  (~(((y) & 0xFFFF) - ((x) & 0xFFFF)) >> 16)
%define FILLB(b, n)  (($$-(b)) & ((n)-1))

%imacro alignx 1-2.nolist 0xFFFF
%%bs: \
  times MSKLE(FILLB(%%bs, %1), %2) & MSKLE(16, FILLB($, %1)) & FILLB($, %1) \
        db 0x90                                      ; nop
  times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 9 \
        db 0x8D, 0x9C, 0x23, 0x00, 0x00, 0x00, 0x00  ; lea ebx,[ebx+0x00000000]
  times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 7 \
        db 0x8D, 0xAC, 0x25, 0x00, 0x00, 0x00, 0x00  ; lea ebp,[ebp+0x00000000]
  times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 6 \
        db 0x8D, 0xAD, 0x00, 0x00, 0x00, 0x00        ; lea ebp,[ebp+0x00000000]
  times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 4 \
        db 0x8D, 0x6C, 0x25, 0x00                    ; lea ebp,[ebp+0x00]
  times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 3 \
        db 0x8D, 0x6D, 0x00                          ; lea ebp,[ebp+0x00]
  times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 2 \
        db 0x8B, 0xED                                ; mov ebp,ebp
  times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 1 \
        db 0x90                                      ; nop
%endmacro

; Align the next data on {2,4,8,16,..}-byte boundary.
;
; alignz n
;   n - alignment in bytes; the gap is filled with zero bytes.
;
%imacro alignz 1.nolist
    align       %1, db 0                ; filling zeros
%endmacro

; x86-64 argument-collection helpers.
;
;   collect_args n    - copy the first n integer arguments into r10..r15
;                       (arg1 -> r10, arg2 -> r11, ... arg6 -> r15), saving
;                       whatever the macro needs to preserve
;   uncollect_args n  - undo collect_args n (must be given the same n)
;   push_xmm n        - save xmm8..xmm(7+n), n = 1..4  (Win64 only)
;   pop_xmm n         - restore them                    (Win64 only)
;
; Both ABI variants expose the same macro names so that callers can be
; written once.
%ifdef __x86_64__

%ifdef WIN64

; Win64: arguments arrive in rcx, rdx, r8, r9 and then on the stack.
; Saves xmm6/xmm7 (two 16-byte stack slots), r12..r15 as needed, and
; rsi/rdi.  The movaps stores need rsp to be 16-byte aligned on entry.
;
; NOTE(review): arguments 5 and 6 are read from [rax+48] / [rax+56], so the
; caller must have set rax up beforehand - presumably rax = rsp taken right
; after "push rbp" at function entry; confirm against the callers.
%imacro collect_args 1
    sub         rsp, SIZEOF_XMMWORD
    movaps      XMMWORD [rsp], xmm6
    sub         rsp, SIZEOF_XMMWORD
    movaps      XMMWORD [rsp], xmm7
    mov         r10, rcx
%if %1 > 1
    mov         r11, rdx
%endif
%if %1 > 2
    push        r12
    mov         r12, r8
%endif
%if %1 > 3
    push        r13
    mov         r13, r9
%endif
%if %1 > 4
    push        r14
    mov         r14, [rax+48]
%endif
%if %1 > 5
    push        r15
    mov         r15, [rax+56]
%endif
    push        rsi
    push        rdi
%endmacro

; Restores everything collect_args saved, in reverse order.
%imacro uncollect_args 1
    pop         rdi
    pop         rsi
%if %1 > 5
    pop         r15
%endif
%if %1 > 4
    pop         r14
%endif
%if %1 > 3
    pop         r13
%endif
%if %1 > 2
    pop         r12
%endif
    movaps      xmm7, XMMWORD [rsp]
    add         rsp, SIZEOF_XMMWORD
    movaps      xmm6, XMMWORD [rsp]
    add         rsp, SIZEOF_XMMWORD
%endmacro

; Reserves n 16-byte stack slots and stores xmm8..xmm(7+n) into them.
; Only xmm8..xmm11 are handled, so n must be 1..4; anything else is
; rejected at assembly time (0 would overwrite the current stack top,
; more than 4 would leave registers unsaved).
; rsp must be 16-byte aligned when this is used (movaps).
%imacro push_xmm 1
%if %1 < 1
%error "push_xmm: register count must be in the range 1..4"
%endif
%if %1 > 4
%error "push_xmm: register count must be in the range 1..4"
%endif
    sub         rsp, %1 * SIZEOF_XMMWORD
    movaps      XMMWORD [rsp+0*SIZEOF_XMMWORD], xmm8
%if %1 > 1
    movaps      XMMWORD [rsp+1*SIZEOF_XMMWORD], xmm9
%endif
%if %1 > 2
    movaps      XMMWORD [rsp+2*SIZEOF_XMMWORD], xmm10
%endif
%if %1 > 3
    movaps      XMMWORD [rsp+3*SIZEOF_XMMWORD], xmm11
%endif
%endmacro

; Reloads xmm8..xmm(7+n) and releases the slots reserved by push_xmm n.
; Same 1..4 restriction as push_xmm.
%imacro pop_xmm 1
%if %1 < 1
%error "pop_xmm: register count must be in the range 1..4"
%endif
%if %1 > 4
%error "pop_xmm: register count must be in the range 1..4"
%endif
    movaps      xmm8, XMMWORD [rsp+0*SIZEOF_XMMWORD]
%if %1 > 1
    movaps      xmm9, XMMWORD [rsp+1*SIZEOF_XMMWORD]
%endif
%if %1 > 2
    movaps      xmm10, XMMWORD [rsp+2*SIZEOF_XMMWORD]
%endif
%if %1 > 3
    movaps      xmm11, XMMWORD [rsp+3*SIZEOF_XMMWORD]
%endif
    add         rsp, %1 * SIZEOF_XMMWORD
%endmacro

%else

; Non-Windows x86-64: arguments arrive in rdi, rsi, rdx, rcx, r8, r9.
; Each destination register is pushed before it is overwritten, so
; collect_args n moves rsp down by n * 8 bytes.
%imacro collect_args 1
    push        r10
    mov         r10, rdi
%if %1 > 1
    push        r11
    mov         r11, rsi
%endif
%if %1 > 2
    push        r12
    mov         r12, rdx
%endif
%if %1 > 3
    push        r13
    mov         r13, rcx
%endif
%if %1 > 4
    push        r14
    mov         r14, r8
%endif
%if %1 > 5
    push        r15
    mov         r15, r9
%endif
%endmacro

; Pops the registers pushed by collect_args n, in reverse order.
%imacro uncollect_args 1
%if %1 > 5
    pop         r15
%endif
%if %1 > 4
    pop         r14
%endif
%if %1 > 3
    pop         r13
%endif
%if %1 > 2
    pop         r12
%endif
%if %1 > 1
    pop         r11
%endif
    pop         r10
%endmacro

; Nothing is saved or restored here; these exist so that callers can use
; push_xmm / pop_xmm unconditionally.
%imacro push_xmm 1
%endmacro

%imacro pop_xmm 1
%endmacro

%endif

%endif

; --------------------------------------------------------------------------
;  Defines picked up from the C headers
;
%include "jsimdcfg.inc"

; --------------------------------------------------------------------------
