;
; jsimdext.inc - common declarations
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2010, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library - version 1.02
;
; Copyright (C) 1999-2006, MIYASAKA Masaru.
;
; This software is provided 'as-is', without any express or implied
; warranty.  In no event will the authors be held liable for any damages
; arising from the use of this software.
;
; Permission is granted to anyone to use this software for any purpose,
; including commercial applications, and to alter it and redistribute it
; freely, subject to the following restrictions:
;
; 1. The origin of this software must not be misrepresented; you must not
;    claim that you wrote the original software. If you use this software
;    in a product, an acknowledgment in the product documentation would be
;    appreciated but is not required.
; 2. Altered source versions must be plainly marked as such, and must not be
;    misrepresented as being the original software.
; 3. This notice may not be removed or altered from any source distribution.
;
; [TAB8]

; ==========================================================================
;  System-dependent configurations
;
;  The object format is selected by defining one of WIN32, WIN64, OBJ32,
;  ELF, AOUT or MACHO on the assembler command line (the first match in the
;  chain below wins; with none defined the generic fallback is used).
;
;  Every branch defines:
;    SEG_TEXT    section name + attributes used for code
;    SEG_CONST   section name + attributes used for constant data
;  Some branches additionally define:
;    EXTN(name)  external symbol decoration (WIN64, ELF: no prefix)
;    GOT_SYMBOL  base symbol for position-independent addressing
;                (ELF, AOUT, MACHO)
;    PIC         forced on for MACHO

%ifdef WIN32    ; ----(nasm -fwin32 -DWIN32 ...)--------
; * Microsoft Visual C++
; * MinGW (Minimalist GNU for Windows)
; * CygWin
; * LCC-Win32

; -- segment definition --
;
%ifdef __YASM_VER__
%define SEG_TEXT    .text  align=16
%define SEG_CONST   .rdata align=16
%else
%define SEG_TEXT    .text  align=16 public use32 class=CODE
%define SEG_CONST   .rdata align=16 public use32 class=CONST
%endif

%elifdef WIN64  ; ----(nasm -fwin64 -DWIN64 ...)--------
; * Microsoft Visual C++

; -- segment definition --
;
%ifdef __YASM_VER__
%define SEG_TEXT    .text  align=16
%define SEG_CONST   .rdata align=16
%else
%define SEG_TEXT    .text  align=16 public use64 class=CODE
%define SEG_CONST   .rdata align=16 public use64 class=CONST
%endif
%define EXTN(name)  name                        ; foo() -> foo

%elifdef OBJ32  ; ----(nasm -fobj -DOBJ32 ...)----------
; * Borland C++ (Win32)

; -- segment definition --
;
%define SEG_TEXT    _text  align=16 public use32 class=CODE
%define SEG_CONST   _data  align=16 public use32 class=DATA

%elifdef ELF    ; ----(nasm -felf[64] -DELF ...)------------
; * Linux
; * *BSD family Unix using elf format
; * Unix System V, including Solaris x86, UnixWare and SCO Unix

; mark stack as non-executable
section .note.GNU-stack noalloc noexec nowrite progbits

; -- segment definition --
;
%ifdef __x86_64__
%define SEG_TEXT    .text   progbits align=16
%define SEG_CONST   .rodata progbits align=16
%else
%define SEG_TEXT    .text   progbits alloc exec   nowrite align=16
%define SEG_CONST   .rodata progbits alloc noexec nowrite align=16
%endif

; To make the code position-independent, append -DPIC to the commandline
;
%define GOT_SYMBOL  _GLOBAL_OFFSET_TABLE_       ; ELF supports PIC
%define EXTN(name)  name                        ; foo() -> foo

%elifdef AOUT   ; ----(nasm -faoutb/aout -DAOUT ...)----
; * Older Linux using a.out format  (nasm -f aout -DAOUT ...)
; * *BSD family Unix using a.out format  (nasm -f aoutb -DAOUT ...)

; -- segment definition --
;
%define SEG_TEXT    .text
%define SEG_CONST   .data

; To make the code position-independent, append -DPIC to the commandline
;
%define GOT_SYMBOL  __GLOBAL_OFFSET_TABLE_      ; BSD-style a.out supports PIC

%elifdef MACHO  ; ----(nasm -fmacho -DMACHO ...)--------
; * NeXTstep/OpenStep/Rhapsody/Darwin/MacOS X (Mach-O format)

; -- segment definition --
;
%define SEG_TEXT    .text  ;align=16    ; nasm doesn't accept align=16. why?
%define SEG_CONST   .rodata align=16

; The generation of position-independent code (PIC) is the default on Darwin.
;
%define PIC
%define GOT_SYMBOL  _MACHO_PIC_         ; Mach-O style code-relative addressing

%else           ; ----(Other case)----------------------

; -- segment definition --
;
%define SEG_TEXT    .text
%define SEG_CONST   .data

%endif  ; ----------------------------------------------

; ==========================================================================

; --------------------------------------------------------------------------
;  Common types
;
;  Each type comes as a triple: the NASM size keyword, its size in bytes
;  (SIZEOF_*) and its size in bits (*_BIT).  The SIZEOF_*/*_BIT names used
;  on the right-hand side are defined at the end of this section; that is
;  fine because %define bodies are expanded when the macro is used, not
;  when it is defined.
;
%ifdef __x86_64__
%define POINTER                 qword           ; general pointer type
%define SIZEOF_POINTER          SIZEOF_QWORD    ; sizeof(POINTER)
%define POINTER_BIT             QWORD_BIT       ; sizeof(POINTER)*BYTE_BIT
%else
%define POINTER                 dword           ; general pointer type
%define SIZEOF_POINTER          SIZEOF_DWORD    ; sizeof(POINTER)
%define POINTER_BIT             DWORD_BIT       ; sizeof(POINTER)*BYTE_BIT
%endif

%define INT                     dword           ; signed integer type
%define SIZEOF_INT              SIZEOF_DWORD    ; sizeof(INT)
%define INT_BIT                 DWORD_BIT       ; sizeof(INT)*BYTE_BIT

%define FP32                    dword           ; IEEE754 single
%define SIZEOF_FP32             SIZEOF_DWORD    ; sizeof(FP32)
%define FP32_BIT                DWORD_BIT       ; sizeof(FP32)*BYTE_BIT

%define MMWORD                  qword           ; int64  (MMX register)
%define SIZEOF_MMWORD           SIZEOF_QWORD    ; sizeof(MMWORD)
%define MMWORD_BIT              QWORD_BIT       ; sizeof(MMWORD)*BYTE_BIT

; NASM is buggy and doesn't properly handle operand sizes for SSE
; instructions, so for now we have to define XMMWORD as blank.
%define XMMWORD                                 ; int128 (SSE register)
%define SIZEOF_XMMWORD          SIZEOF_OWORD    ; sizeof(XMMWORD)
%define XMMWORD_BIT             OWORD_BIT       ; sizeof(XMMWORD)*BYTE_BIT

; Similar hacks for when we load a dword or MMWORD into an xmm# register
%define XMM_DWORD
%define XMM_MMWORD

%define SIZEOF_BYTE             1               ; sizeof(BYTE)
%define SIZEOF_WORD             2               ; sizeof(WORD)
%define SIZEOF_DWORD            4               ; sizeof(DWORD)
%define SIZEOF_QWORD            8               ; sizeof(QWORD)
%define SIZEOF_OWORD            16              ; sizeof(OWORD)

%define BYTE_BIT                8               ; CHAR_BIT in C
%define WORD_BIT                16              ; sizeof(WORD)*BYTE_BIT
%define DWORD_BIT               32              ; sizeof(DWORD)*BYTE_BIT
%define QWORD_BIT               64              ; sizeof(QWORD)*BYTE_BIT
%define OWORD_BIT               128             ; sizeof(OWORD)*BYTE_BIT

; --------------------------------------------------------------------------
;  External Symbol Name
;
;  Fallback decoration for C-visible symbols: when the object-format
;  configuration has not already defined EXTN, prepend one underscore.
;
%ifndef EXTN
%define EXTN(name)   _ %+ name          ; foo() -> _foo
%endif

; --------------------------------------------------------------------------
;  Macros for position-independent code (PIC) support
;
;  Provided in every configuration:
;    GOTOFF(got,sym)  address expression for 'sym' given base register 'got'
;    get_GOT reg      load the PIC base into 'reg'
;    pushpic reg      push 'reg'   (expands to nothing when PIC is off)
;    poppic  reg      pop  'reg'   (expands to nothing when PIC is off)
;    movpic  dst,src  mov dst,src  (expands to nothing when PIC is off)
;
;  Both get_GOT implementations use 32-bit registers and [esp].
;

; PIC cannot be honoured without a GOT_SYMBOL for the selected object format.
%ifndef GOT_SYMBOL
%undef PIC
%endif

%ifdef PIC ; -------------------------------------------

%ifidn GOT_SYMBOL,_MACHO_PIC_ ; --------------------

; At present, nasm doesn't seem to support PIC generation for Mach-O.
; The PIC support code below is a little tricky.

        SECTION SEG_CONST
const_base:

; Symbols are addressed relative to const_base, the start of SEG_CONST.
%define GOTOFF(got,sym) (got) + (sym) - const_base

; get_GOT reg  (Mach-O flavour)
;   Out:   reg = run-time address of const_base
;   Clobb: ecx, flags (ebp is saved and restored around the sequence)
%imacro get_GOT 1
        ; NOTE: this macro destroys ecx register.
        call    %%geteip                ; ecx = address of the following 'add'
        add     ecx, byte (%%ref - $)   ; ecx = run-time address of %%ref
        jmp     short %%adjust
%%geteip:
        mov     ecx, POINTER [esp]      ; fetch the return address
        ret
%%adjust:
        push    ebp
        xor     ebp,ebp         ; ebp = 0
%ifidni %1,ebx  ; (%1 == ebx)
        ; db 0x8D,0x9C + jmp near const_base =
        ;   lea ebx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,9C,E9,(offset32)
        db      0x8D,0x9C               ; 8D,9C
        jmp     near const_base         ; E9,(const_base-%%ref)
%%ref:
%else  ; (%1 != ebx)
        ; db 0x8D,0x8C + jmp near const_base =
        ;   lea ecx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,8C,E9,(offset32)
        db      0x8D,0x8C               ; 8D,8C
        jmp     near const_base         ; E9,(const_base-%%ref)
%%ref:  mov     %1, ecx
%endif ; (%1 == ebx)
        pop     ebp
%endmacro

%else   ; GOT_SYMBOL != _MACHO_PIC_ ----------------

%define GOTOFF(got,sym) (got) + (sym) wrt ..gotoff

; get_GOT reg  (GOT-based flavour)
;   Out:   reg = address of GOT_SYMBOL
;   Clobb: flags
%imacro get_GOT 1
        extern  GOT_SYMBOL
        call    %%geteip                ; reg = address of the following 'add'
        add     %1, GOT_SYMBOL + $$ - $ wrt ..gotpc
        jmp     short %%done
%%geteip:
        mov     %1, POINTER [esp]       ; fetch the return address
        ret
%%done:
%endmacro

%endif  ; GOT_SYMBOL == _MACHO_PIC_ ----------------

%imacro pushpic 1.nolist
        push    %1
%endmacro
%imacro poppic  1.nolist
        pop     %1
%endmacro
%imacro movpic  2.nolist
        mov     %1,%2
%endmacro

%else   ; !PIC -----------------------------------------

; Without PIC a symbol is addressed directly and the helpers emit no code.
%define GOTOFF(got,sym) (sym)

%imacro get_GOT 1.nolist
%endmacro
%imacro pushpic 1.nolist
%endmacro
%imacro poppic  1.nolist
%endmacro
%imacro movpic  2.nolist
%endmacro

%endif  ;  PIC -----------------------------------------

; --------------------------------------------------------------------------
;  Align the next instruction on {2,4,8,16,..}-byte boundary.
;  ".balign n,,m" in GNU as
;
;  alignx n [,m]
;    n  alignment boundary in bytes
;    m  maximum number of filler bytes (default 0xFFFF); when more than m
;       bytes would be required, nothing is emitted at all
;
;  MSKLE(x,y)  non-zero mask when x <= y, 0 otherwise (only the low 16 bits
;              of each operand are compared)
;  FILLB(b,n)  bytes needed to advance from position b to the next n-byte
;              boundary, measured from the start of the section ($$)
;
;  Filler selection: a gap of 16 bytes or more is filled entirely with
;  single-byte nops.  A smaller gap is filled with the multi-byte no-op
;  encodings below, tried from longest to shortest; each "times" count
;  evaluates to 0 or 1 because FILLB($,n) shrinks as bytes are emitted.
;
;  NOTE(review): the multi-byte fillers are 32-bit encodings that write
;  ebx/ebp.  In 64-bit mode a 32-bit register write zero-extends into the
;  full register - confirm that 64-bit users of this macro are safe.
;
%define MSKLE(x,y)  (~(((y) & 0xFFFF) - ((x) & 0xFFFF)) >> 16)
%define FILLB(b,n)  (($$-(b)) & ((n)-1))

%imacro alignx 1-2.nolist 0xFFFF
%%bs:   times MSKLE(FILLB(%%bs,%1),%2) & MSKLE(16,FILLB($,%1)) & FILLB($,%1) \
               db 0x90                               ; nop
        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/9 \
               db 0x8D,0x9C,0x23,0x00,0x00,0x00,0x00 ; lea ebx,[ebx+0x00000000]
        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/7 \
               db 0x8D,0xAC,0x25,0x00,0x00,0x00,0x00 ; lea ebp,[ebp+0x00000000]
        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/6 \
               db 0x8D,0xAD,0x00,0x00,0x00,0x00      ; lea ebp,[ebp+0x00000000]
        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/4 \
               db 0x8D,0x6C,0x25,0x00                ; lea ebp,[ebp+0x00]
        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/3 \
               db 0x8D,0x6D,0x00                     ; lea ebp,[ebp+0x00]
        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/2 \
               db 0x8B,0xED                          ; mov ebp,ebp
        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/1 \
               db 0x90                               ; nop
%endmacro

; Align the next data on {2,4,8,16,..}-byte boundary.
;
;  alignz n
;    n  alignment boundary in bytes; the gap is padded with zero bytes
;
%imacro alignz 1.nolist
        align %1, db 0          ; filling zeros
%endmacro

; --------------------------------------------------------------------------
;  Argument collection for 64-bit entry points
;
;  collect_args copies up to six integer/pointer arguments into r10..r15
;  (1st argument in r10, ..., 6th in r15) on both supported conventions;
;  uncollect_args undoes the pushes.  The two must be used as a matched
;  pair, with rsp at uncollect_args equal to its value right after
;  collect_args.
;
%ifdef __x86_64__

%ifdef WIN64

; collect_args  (WIN64)
;   In:    rcx, rdx, r8, r9 = arguments 1..4
;          rax = base from which [rax+48] and [rax+56] are read as
;                arguments 5 and 6.  It must be set up by the caller of
;                this macro before invoking it.
;                NOTE(review): presumably rax = rsp taken right after
;                "push rbp" at function entry - confirm against callers.
;   Out:   r10..r15 = arguments 1..6
;   Saves: r12-r15, rsi, rdi, xmm6, xmm7 (on the stack)
;   Note:  movaps needs a 16-byte aligned address, so rsp must be 16-byte
;          aligned once the six pushes below have been executed.
%imacro collect_args 0
        push    r12
        push    r13
        push    r14
        push    r15
        mov     r10, rcx
        mov     r11, rdx
        mov     r12, r8
        mov     r13, r9
        mov     r14, [rax+48]
        mov     r15, [rax+56]
        push    rsi
        push    rdi
        sub     rsp, SIZEOF_XMMWORD
        movaps  XMMWORD [rsp], xmm6
        sub     rsp, SIZEOF_XMMWORD
        movaps  XMMWORD [rsp], xmm7
%endmacro

; uncollect_args  (WIN64)
;   Restores xmm7, xmm6, rdi, rsi, r15-r12 in reverse order of saving.
%imacro uncollect_args 0
        movaps  xmm7, XMMWORD [rsp]
        add     rsp, SIZEOF_XMMWORD
        movaps  xmm6, XMMWORD [rsp]
        add     rsp, SIZEOF_XMMWORD
        pop     rdi
        pop     rsi
        pop     r15
        pop     r14
        pop     r13
        pop     r12
%endmacro

%else

; collect_args  (non-WIN64)
;   In:    rdi, rsi, rdx, rcx, r8, r9 = arguments 1..6
;   Out:   r10..r15 = arguments 1..6
;   Saves: r10-r15 (on the stack)
%imacro collect_args 0
        push    r10
        push    r11
        push    r12
        push    r13
        push    r14
        push    r15
        mov     r10, rdi
        mov     r11, rsi
        mov     r12, rdx
        mov     r13, rcx
        mov     r14, r8
        mov     r15, r9
%endmacro

; uncollect_args  (non-WIN64)
;   Restores r15-r10 in reverse order of saving.
%imacro uncollect_args 0
        pop     r15
        pop     r14
        pop     r13
        pop     r12
        pop     r11
        pop     r10
%endmacro

%endif

%endif

; --------------------------------------------------------------------------
;  Defines picked up from the C headers
;
%include "jsimdcfg.inc"

; --------------------------------------------------------------------------
