/* -----------------------------------------------------------------------
   unix64.S - Copyright (c) 2013  The Written Word, Inc.
	    - Copyright (c) 2008  Red Hat, Inc
	    - Copyright (c) 2002  Bo Thorsen <bo@suse.de>

   x86-64 Foreign Function Interface

   Permission is hereby granted, free of charge, to any person obtaining
   a copy of this software and associated documentation files (the
   ``Software''), to deal in the Software without restriction, including
   without limitation the rights to use, copy, modify, merge, publish,
   distribute, sublicense, and/or sell copies of the Software, and to
   permit persons to whom the Software is furnished to do so, subject to
   the following conditions:

   The above copyright notice and this permission notice shall be included
   in all copies or substantial portions of the Software.

   THE SOFTWARE IS PROVIDED ``AS IS'', WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
   HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
   WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
   DEALINGS IN THE SOFTWARE.
   ----------------------------------------------------------------------- */

#ifdef __x86_64__
#define LIBFFI_ASM
#include <fficonfig.h>
#include <ffi.h>

.text

/* ffi_call_unix64 (void *args, unsigned long bytes, unsigned flags,
		    void *raddr, void (*fnaddr)(void));

   Bit o trickiness here -- ARGS+BYTES is the base of the stack frame
   for this function.  This has been allocated by ffi_call.  We also
   deallocate some of the stack that has been alloca'd.  */

	.align	2
	.globl	ffi_call_unix64
	.type	ffi_call_unix64,@function

ffi_call_unix64:
.LUW0:
	movq	(%rsp), %r10		/* Load return address.  */
	leaq	(%rdi, %rsi), %rax	/* Find local stack base.  */
	movq	%rdx, (%rax)		/* Save flags.  */
	movq	%rcx, 8(%rax)		/* Save raddr.  */
	movq	%rbp, 16(%rax)		/* Save old frame pointer.  */
	movq	%r10, 24(%rax)		/* Relocate return address.  */
	movq	%rax, %rbp		/* Finalize local stack frame.  */
.LUW1:
	movq	%rdi, %r10		/* Save a copy of the register area. */
	movq	%r8, %r11		/* Save a copy of the target fn.  */
	movl	%r9d, %eax		/* Set number of SSE registers.  */

	/* Load up all argument registers.  */
	movq	(%r10), %rdi
	movq	8(%r10), %rsi
	movq	16(%r10), %rdx
	movq	24(%r10), %rcx
	movq	32(%r10), %r8
	movq	40(%r10), %r9
	testl	%eax, %eax
	jnz	.Lload_sse
.Lret_from_load_sse:

	/* Deallocate the reg arg area.  */
	leaq	176(%r10), %rsp

	/* Call the user function.  */
	call	*%r11

	/* Deallocate stack arg area; local stack frame in redzone.  */
	leaq	24(%rbp), %rsp

	movq	0(%rbp), %rcx		/* Reload flags.  */
	movq	8(%rbp), %rdi		/* Reload raddr.  */
	movq	16(%rbp), %rbp		/* Reload old frame pointer.  */
.LUW2:

	/* The first byte of the flags contains the FFI_TYPE.  */
	movzbl	%cl, %r10d
	leaq	.Lstore_table(%rip), %r11
	movslq	(%r11, %r10, 4), %r10
	addq	%r11, %r10
	jmp	*%r10

.Lstore_table:
	.long	.Lst_void-.Lstore_table		/* FFI_TYPE_VOID */
	.long	.Lst_sint32-.Lstore_table	/* FFI_TYPE_INT */
	.long	.Lst_float-.Lstore_table	/* FFI_TYPE_FLOAT */
	.long	.Lst_double-.Lstore_table	/* FFI_TYPE_DOUBLE */
	.long	.Lst_ldouble-.Lstore_table	/* FFI_TYPE_LONGDOUBLE */
	.long	.Lst_uint8-.Lstore_table	/* FFI_TYPE_UINT8 */
	.long	.Lst_sint8-.Lstore_table	/* FFI_TYPE_SINT8 */
	.long	.Lst_uint16-.Lstore_table	/* FFI_TYPE_UINT16 */
	.long	.Lst_sint16-.Lstore_table	/* FFI_TYPE_SINT16 */
	.long	.Lst_uint32-.Lstore_table	/* FFI_TYPE_UINT32 */
	.long	.Lst_sint32-.Lstore_table	/* FFI_TYPE_SINT32 */
	.long	.Lst_int64-.Lstore_table	/* FFI_TYPE_UINT64 */
	.long	.Lst_int64-.Lstore_table	/* FFI_TYPE_SINT64 */
	.long	.Lst_struct-.Lstore_table	/* FFI_TYPE_STRUCT */
	.long	.Lst_int64-.Lstore_table	/* FFI_TYPE_POINTER */

	.align	2
.Lst_void:
	ret
	.align	2

.Lst_uint8:
	movzbq	%al, %rax
	movq	%rax, (%rdi)
	ret
	.align	2
.Lst_sint8:
	movsbq	%al, %rax
	movq	%rax, (%rdi)
	ret
	.align	2
.Lst_uint16:
	movzwq	%ax, %rax
	movq	%rax, (%rdi)
	/* BUGFIX: 'ret' was missing here; falling through into .Lst_sint16
	   re-stored a sign-extended value, corrupting FFI_TYPE_UINT16
	   results >= 0x8000.  */
	ret
	.align	2
.Lst_sint16:
	movswq	%ax, %rax
	movq	%rax, (%rdi)
	ret
	.align	2
.Lst_uint32:
	movl	%eax, %eax
	movq	%rax, (%rdi)
	/* BUGFIX: 'ret' was missing here; falling through into .Lst_sint32
	   re-stored a sign-extended (cltq) value, corrupting
	   FFI_TYPE_UINT32 results >= 0x80000000.  */
	ret
	.align	2
.Lst_sint32:
	cltq
	movq	%rax, (%rdi)
	ret
	.align	2
.Lst_int64:
	movq	%rax, (%rdi)
	ret

	.align	2
.Lst_float:
	movss	%xmm0, (%rdi)
	ret
	.align	2
.Lst_double:
	movsd	%xmm0, (%rdi)
	ret
	.align	2
.Lst_ldouble:
	fstpt	(%rdi)
	ret

	.align	2
.Lst_struct:
	leaq	-20(%rsp), %rsi		/* Scratch area in redzone.  */

	/* We have to locate the values now, and since we don't want to
	   write too much data into the user's return value, we spill the
	   value to a 16 byte scratch area first.  Bits 8, 9, and 10
	   control where the values are located.  Only one of the three
	   bits will be set; see ffi_prep_cif_machdep for the pattern.  */
	movd	%xmm0, %r10
	movd	%xmm1, %r11
	testl	$0x100, %ecx
	cmovnz	%rax, %rdx
	cmovnz	%r10, %rax
	testl	$0x200, %ecx
	cmovnz	%r10, %rdx
	testl	$0x400, %ecx
	cmovnz	%r10, %rax
	cmovnz	%r11, %rdx
	movq	%rax, (%rsi)
	movq	%rdx, 8(%rsi)

	/* Bits 12-31 contain the true size of the structure.  Copy from
	   the scratch area to the true destination.  */
	shrl	$12, %ecx
	rep movsb
	ret

	/* Many times we can avoid loading any SSE registers at all.
	   It's not worth an indirect jump to load the exact set of
	   SSE registers needed; zero or all is a good compromise.  */
	.align	2
.LUW3:
.Lload_sse:
	movdqa	48(%r10), %xmm0
	movdqa	64(%r10), %xmm1
	movdqa	80(%r10), %xmm2
	movdqa	96(%r10), %xmm3
	movdqa	112(%r10), %xmm4
	movdqa	128(%r10), %xmm5
	movdqa	144(%r10), %xmm6
	movdqa	160(%r10), %xmm7
	jmp	.Lret_from_load_sse

.LUW4:
	.size	ffi_call_unix64,.-ffi_call_unix64

	.align	2
	.globl	ffi_closure_unix64
	.type	ffi_closure_unix64,@function

ffi_closure_unix64:
.LUW5:
	/* The carry flag is set by the trampoline iff SSE registers
	   are used.  Don't clobber it before the branch instruction.  */
	leaq	-200(%rsp), %rsp
.LUW6:
	movq	%rdi, (%rsp)
	movq	%rsi, 8(%rsp)
	movq	%rdx, 16(%rsp)
	movq	%rcx, 24(%rsp)
	movq	%r8, 32(%rsp)
	movq	%r9, 40(%rsp)
	jc	.Lsave_sse
.Lret_from_save_sse:

	movq	%r10, %rdi
	leaq	176(%rsp), %rsi
	movq	%rsp, %rdx
	leaq	208(%rsp), %rcx
	call	ffi_closure_unix64_inner@PLT

	/* Deallocate stack frame early; return value is now in redzone.  */
	addq	$200, %rsp
.LUW7:

	/* The first byte of the return value contains the FFI_TYPE.  */
	movzbl	%al, %r10d
	leaq	.Lload_table(%rip), %r11
	movslq	(%r11, %r10, 4), %r10
	addq	%r11, %r10
	jmp	*%r10

.Lload_table:
	.long	.Lld_void-.Lload_table		/* FFI_TYPE_VOID */
	.long	.Lld_int32-.Lload_table		/* FFI_TYPE_INT */
	.long	.Lld_float-.Lload_table		/* FFI_TYPE_FLOAT */
	.long	.Lld_double-.Lload_table	/* FFI_TYPE_DOUBLE */
	.long	.Lld_ldouble-.Lload_table	/* FFI_TYPE_LONGDOUBLE */
	.long	.Lld_int8-.Lload_table		/* FFI_TYPE_UINT8 */
	.long	.Lld_int8-.Lload_table		/* FFI_TYPE_SINT8 */
	.long	.Lld_int16-.Lload_table		/* FFI_TYPE_UINT16 */
	.long	.Lld_int16-.Lload_table		/* FFI_TYPE_SINT16 */
	.long	.Lld_int32-.Lload_table		/* FFI_TYPE_UINT32 */
	.long	.Lld_int32-.Lload_table		/* FFI_TYPE_SINT32 */
	.long	.Lld_int64-.Lload_table		/* FFI_TYPE_UINT64 */
	.long	.Lld_int64-.Lload_table		/* FFI_TYPE_SINT64 */
	.long	.Lld_struct-.Lload_table	/* FFI_TYPE_STRUCT */
	.long	.Lld_int64-.Lload_table		/* FFI_TYPE_POINTER */

	.align	2
.Lld_void:
	ret

	.align	2
.Lld_int8:
	movzbl	-24(%rsp), %eax
	ret
	.align	2
.Lld_int16:
	movzwl	-24(%rsp), %eax
	ret
	.align	2
.Lld_int32:
	movl	-24(%rsp), %eax
	ret
	.align	2
.Lld_int64:
	movq	-24(%rsp), %rax
	ret

	.align	2
.Lld_float:
	movss	-24(%rsp), %xmm0
	ret
	.align	2
.Lld_double:
	movsd	-24(%rsp), %xmm0
	ret
	.align	2
.Lld_ldouble:
	fldt	-24(%rsp)
	ret

	.align	2
.Lld_struct:
	/* There are four possibilities here, %rax/%rdx, %xmm0/%rax,
	   %rax/%xmm0, %xmm0/%xmm1.  We collapse two by always loading
	   both rdx and xmm1 with the second word.  For the remaining,
	   bit 8 set means xmm0 gets the second word, and bit 9 means
	   that rax gets the second word.  */
	movq	-24(%rsp), %rcx
	movq	-16(%rsp), %rdx
	movq	-16(%rsp), %xmm1
	testl	$0x100, %eax
	cmovnz	%rdx, %rcx
	movd	%rcx, %xmm0
	testl	$0x200, %eax
	movq	-24(%rsp), %rax
	cmovnz	%rdx, %rax
	ret

	/* See the comment above .Lload_sse; the same logic applies here.  */
	.align	2
.LUW8:
.Lsave_sse:
	movdqa	%xmm0, 48(%rsp)
	movdqa	%xmm1, 64(%rsp)
	movdqa	%xmm2, 80(%rsp)
	movdqa	%xmm3, 96(%rsp)
	movdqa	%xmm4, 112(%rsp)
	movdqa	%xmm5, 128(%rsp)
	movdqa	%xmm6, 144(%rsp)
	movdqa	%xmm7, 160(%rsp)
	jmp	.Lret_from_save_sse

.LUW9:
	.size	ffi_closure_unix64,.-ffi_closure_unix64

#ifdef __GNUC__
/* Only emit DWARF unwind info when building with the GNU toolchain.  */

#ifdef HAVE_AS_X86_64_UNWIND_SECTION_TYPE
	.section	.eh_frame,"a",@unwind
#else
	.section	.eh_frame,"a",@progbits
#endif
.Lframe1:
	.long	.LECIE1-.LSCIE1		/* CIE Length */
.LSCIE1:
	.long	0			/* CIE Identifier Tag */
	.byte	1			/* CIE Version */
	.ascii "zR\0"			/* CIE Augmentation */
	.uleb128 1			/* CIE Code Alignment Factor */
	.sleb128 -8			/* CIE Data Alignment Factor */
	.byte	0x10			/* CIE RA Column */
	.uleb128 1			/* Augmentation size */
	.byte	0x1b			/* FDE Encoding (pcrel sdata4) */
	.byte	0xc			/* DW_CFA_def_cfa, %rsp offset 8 */
	.uleb128 7
	.uleb128 8
	.byte	0x80+16			/* DW_CFA_offset, %rip offset 1*-8 */
	.uleb128 1
	.align 8
.LECIE1:
.LSFDE1:
	.long	.LEFDE1-.LASFDE1	/* FDE Length */
.LASFDE1:
	.long	.LASFDE1-.Lframe1	/* FDE CIE offset */
#if HAVE_AS_X86_PCREL
	.long	.LUW0-.			/* FDE initial location */
#else
	.long	.LUW0@rel
#endif
	.long	.LUW4-.LUW0		/* FDE address range */
	.uleb128 0x0			/* Augmentation size */

	.byte	0x4			/* DW_CFA_advance_loc4 */
	.long	.LUW1-.LUW0

	/* New stack frame based off rbp.  This is a itty bit of unwind
	   trickery in that the CFA *has* changed.  There is no easy way
	   to describe it correctly on entry to the function.  Fortunately,
	   it doesn't matter too much since at all points we can correctly
	   unwind back to ffi_call.  Note that the location to which we
	   moved the return address is (the new) CFA-8, so from the
	   perspective of the unwind info, it hasn't moved.  */
	.byte	0xc			/* DW_CFA_def_cfa, %rbp offset 32 */
	.uleb128 6
	.uleb128 32
	.byte	0x80+6			/* DW_CFA_offset, %rbp offset 2*-8 */
	.uleb128 2
	.byte	0xa			/* DW_CFA_remember_state */

	.byte	0x4			/* DW_CFA_advance_loc4 */
	.long	.LUW2-.LUW1
	.byte	0xc			/* DW_CFA_def_cfa, %rsp offset 8 */
	.uleb128 7
	.uleb128 8
	.byte	0xc0+6			/* DW_CFA_restore, %rbp */

	.byte	0x4			/* DW_CFA_advance_loc4 */
	.long	.LUW3-.LUW2
	.byte	0xb			/* DW_CFA_restore_state */

	.align 8
.LEFDE1:
.LSFDE3:
	.long	.LEFDE3-.LASFDE3	/* FDE Length */
.LASFDE3:
	.long	.LASFDE3-.Lframe1	/* FDE CIE offset */
#if HAVE_AS_X86_PCREL
	.long	.LUW5-.			/* FDE initial location */
#else
	.long	.LUW5@rel
#endif
	.long	.LUW9-.LUW5		/* FDE address range */
	.uleb128 0x0			/* Augmentation size */

	.byte	0x4			/* DW_CFA_advance_loc4 */
	.long	.LUW6-.LUW5
	.byte	0xe			/* DW_CFA_def_cfa_offset */
	.uleb128 208
	.byte	0xa			/* DW_CFA_remember_state */

	.byte	0x4			/* DW_CFA_advance_loc4 */
	.long	.LUW7-.LUW6
	.byte	0xe			/* DW_CFA_def_cfa_offset */
	.uleb128 8

	.byte	0x4			/* DW_CFA_advance_loc4 */
	.long	.LUW8-.LUW7
	.byte	0xb			/* DW_CFA_restore_state */

	.align 8
.LEFDE3:

#endif /* __GNUC__ */

#endif /* __x86_64__ */

#if defined __ELF__ && defined __linux__
	.section	.note.GNU-stack,"",@progbits
#endif