/* -----------------------------------------------------------------------
   unix64.S - Copyright (c) 2013  The Written Word, Inc.
	    - Copyright (c) 2008  Red Hat, Inc
	    - Copyright (c) 2002  Bo Thorsen <bo@suse.de>

   x86-64 Foreign Function Interface

   Permission is hereby granted, free of charge, to any person obtaining
   a copy of this software and associated documentation files (the
   ``Software''), to deal in the Software without restriction, including
   without limitation the rights to use, copy, modify, merge, publish,
   distribute, sublicense, and/or sell copies of the Software, and to
   permit persons to whom the Software is furnished to do so, subject to
   the following conditions:

   The above copyright notice and this permission notice shall be included
   in all copies or substantial portions of the Software.

   THE SOFTWARE IS PROVIDED ``AS IS'', WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
   HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
   WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
   DEALINGS IN THE SOFTWARE.
   ----------------------------------------------------------------------- */

#ifdef __x86_64__
#define LIBFFI_ASM
#include <fficonfig.h>
#include <ffi.h>
#include "internal64.h"
#include "asmnames.h"

	.text

/* This macro allows the safe creation of jump tables without an
   actual table.  The entry points into the table are all 8 bytes.
   The use of ORG asserts that we're at the correct location.  */
/* ??? The clang assembler doesn't handle .org with symbolic expressions.
   On those assemblers only the 8-byte alignment is emitted, so nothing
   checks that an entry really fits in its 8-byte slot.  */
#if defined(__clang__) || defined(__APPLE__) || (defined (__sun__) && defined(__svr4__))
# define E(BASE, X)	.balign 8
#else
# define E(BASE, X)	.balign 8; .org BASE + X * 8
#endif

/* ffi_call_unix64 (void *args, unsigned long bytes, unsigned flags,
	            void *raddr, void (*fnaddr)(void));

   Bit o trickiness here -- ARGS+BYTES is the base of the stack frame
   for this function.  This has been allocated by ffi_call.  We also
   deallocate some of the stack that has been alloca'd.

   Inputs:
	%rdi	args	register area followed by the stack arguments
	%rsi	bytes	ARGS+BYTES is where the local frame is built
	%rdx	flags	low byte selects the store_table entry; the bits
			from UNIX64_SIZE_SHIFT up give the byte count
			copied by the structure cases
	%rcx	raddr	where the return value is stored
	%r8	fnaddr	function to call

   Layout of the register area as read below:
	0x00 .. 0x28	%rdi, %rsi, %rdx, %rcx, %r8, %r9
	0x30 .. 0xa0	%xmm0 .. %xmm7, 16 bytes each
	0xb0		32-bit value loaded into %eax; the SSE registers
			are loaded only when it is nonzero
	0xb8		quadword popped into %r10; the stack arguments
			start right after it

   Local frame built at ARGS+BYTES (addressed through %rbp):
	 0(%rbp)	flags
	 8(%rbp)	raddr
	16(%rbp)	caller's %rbp
	24(%rbp)	relocated return address  */

	.balign	8
	.globl	C(ffi_call_unix64)
	FFI_HIDDEN(C(ffi_call_unix64))

C(ffi_call_unix64):
L(UW0):
	movq	(%rsp), %r10		/* Load return address.  */
	leaq	(%rdi, %rsi), %rax	/* Find local stack base.  */
	movq	%rdx, (%rax)		/* Save flags.  */
	movq	%rcx, 8(%rax)		/* Save raddr.  */
	movq	%rbp, 16(%rax)		/* Save old frame pointer.  */
	movq	%r10, 24(%rax)		/* Relocate return address.  */
	movq	%rax, %rbp		/* Finalize local stack frame.  */

	/* New stack frame based off rbp.  This is a bit of unwind
	   trickery in that the CFA *has* changed.  There is no easy way
	   to describe it correctly on entry to the function.  Fortunately,
	   it doesn't matter too much since at all points we can correctly
	   unwind back to ffi_call.  Note that the location to which we
	   moved the return address is (the new) CFA-8, so from the
	   perspective of the unwind info, it hasn't moved.  */
L(UW1):
	/* cfi_def_cfa(%rbp, 32) */
	/* cfi_rel_offset(%rbp, 16) */

	movq	%rdi, %r10		/* Save a copy of the register area. */
	movq	%r8, %r11		/* Save a copy of the target fn.  */

	/* Load up all argument registers.  %eax is taken from the
	   register area as well; it both selects whether the SSE
	   registers need loading and is still live in %eax when the
	   target function is called.  */
	movq	(%r10), %rdi
	movq	0x08(%r10), %rsi
	movq	0x10(%r10), %rdx
	movq	0x18(%r10), %rcx
	movq	0x20(%r10), %r8
	movq	0x28(%r10), %r9
	movl	0xb0(%r10), %eax
	testl	%eax, %eax
	jnz	L(load_sse)
L(ret_from_load_sse):

	/* Deallocate the reg arg area, except for r10, then load via pop.  */
	leaq	0xb8(%r10), %rsp
	popq	%r10

	/* Call the user function.  */
	call	*%r11

	/* Deallocate stack arg area; local stack frame in redzone.
	   %rsp now points at the relocated return address, so the
	   three slots below it stay readable through %rbp.  */
	leaq	24(%rbp), %rsp

	movq	0(%rbp), %rcx		/* Reload flags.  */
	movq	8(%rbp), %rdi		/* Reload raddr.  */
	movq	16(%rbp), %rbp		/* Reload old frame pointer.  */
L(UW2):
	/* cfi_remember_state */
	/* cfi_def_cfa(%rsp, 8) */
	/* cfi_restore(%rbp) */

	/* The first byte of the flags contains the FFI_TYPE.
	   The flags set by cmpb survive the movzbl and leaq below
	   (neither modifies flags) and are consumed by ja.  */
	cmpb	$UNIX64_RET_LAST, %cl
	movzbl	%cl, %r10d
	leaq	L(store_table)(%rip), %r11
	ja	L(sa)
	leaq	(%r11, %r10, 8), %r10

	/* Prep for the structure cases: scratch area in redzone.  */
	leaq	-20(%rsp), %rsi
	jmp	*%r10

	/* Each entry stores the return value through %rdi (raddr) and
	   returns.  The structure cases first assemble the two
	   eightbytes in the scratch area at (%rsi), then copy
	   flags >> UNIX64_SIZE_SHIFT bytes from there to raddr.  */
	.balign	8
L(store_table):
E(L(store_table), UNIX64_RET_VOID)
	ret
E(L(store_table), UNIX64_RET_UINT8)
	movzbl	%al, %eax
	movq	%rax, (%rdi)
	ret
E(L(store_table), UNIX64_RET_UINT16)
	movzwl	%ax, %eax
	movq	%rax, (%rdi)
	ret
E(L(store_table), UNIX64_RET_UINT32)
	movl	%eax, %eax		/* Zero-extend to 64 bits.  */
	movq	%rax, (%rdi)
	ret
E(L(store_table), UNIX64_RET_SINT8)
	movsbq	%al, %rax
	movq	%rax, (%rdi)
	ret
E(L(store_table), UNIX64_RET_SINT16)
	movswq	%ax, %rax
	movq	%rax, (%rdi)
	ret
E(L(store_table), UNIX64_RET_SINT32)
	cltq				/* Sign-extend %eax into %rax.  */
	movq	%rax, (%rdi)
	ret
E(L(store_table), UNIX64_RET_INT64)
	movq	%rax, (%rdi)
	ret
E(L(store_table), UNIX64_RET_XMM32)
	movd	%xmm0, (%rdi)
	ret
E(L(store_table), UNIX64_RET_XMM64)
	movq	%xmm0, (%rdi)
	ret
E(L(store_table), UNIX64_RET_X87)
	fstpt	(%rdi)
	ret
E(L(store_table), UNIX64_RET_X87_2)
	fstpt	(%rdi)
	fstpt	16(%rdi)
	ret
E(L(store_table), UNIX64_RET_ST_XMM0_RAX)
	movq	%rax, 8(%rsi)
	jmp	L(s3)
E(L(store_table), UNIX64_RET_ST_RAX_XMM0)
	movq	%xmm0, 8(%rsi)
	jmp	L(s2)
E(L(store_table), UNIX64_RET_ST_XMM0_XMM1)
	movq	%xmm1, 8(%rsi)
	jmp	L(s3)
E(L(store_table), UNIX64_RET_ST_RAX_RDX)
	movq	%rdx, 8(%rsi)
	/* Falls through: the last table entry continues into s2.  */
L(s2):
	movq	%rax, (%rsi)		/* First eightbyte from %rax.  */
	shrl	$UNIX64_SIZE_SHIFT, %ecx
	rep movsb
	ret
	.balign 8
L(s3):
	movq	%xmm0, (%rsi)		/* First eightbyte from %xmm0.  */
	shrl	$UNIX64_SIZE_SHIFT, %ecx
	rep movsb
	ret

	/* Return type code above UNIX64_RET_LAST.  */
L(sa):	call	PLT(C(abort))

	/* Many times we can avoid loading any SSE registers at all.
	   It's not worth an indirect jump to load the exact set of
	   SSE registers needed; zero or all is a good compromise.  */
	.balign 2
L(UW3):
	/* cfi_restore_state */
L(load_sse):
	movdqa	0x30(%r10), %xmm0
	movdqa	0x40(%r10), %xmm1
	movdqa	0x50(%r10), %xmm2
	movdqa	0x60(%r10), %xmm3
	movdqa	0x70(%r10), %xmm4
	movdqa	0x80(%r10), %xmm5
	movdqa	0x90(%r10), %xmm6
	movdqa	0xa0(%r10), %xmm7
	jmp	L(ret_from_load_sse)

L(UW4):
ENDF(C(ffi_call_unix64))

/* Closure frame layout, relative to %rsp after the frame is allocated:
   6 general registers, 8 vector registers,
   32 bytes of rvalue, 8 bytes of alignment.  */
#define ffi_closure_OFS_G	0
#define ffi_closure_OFS_V	(6*8)
#define ffi_closure_OFS_RVALUE	(ffi_closure_OFS_V + 8*16)
#define ffi_closure_FS		(ffi_closure_OFS_RVALUE + 32 + 8)

/* The location of rvalue within the red zone after deallocating the frame.  */
#define ffi_closure_RED_RVALUE	(ffi_closure_OFS_RVALUE - ffi_closure_FS)

/* ffi_closure_unix64_sse

   Closure entry point that also saves the vector argument registers.
   Allocates the closure frame, stores %xmm0..%xmm7 into the vector
   save area, then joins ffi_closure_unix64 at sse_entry1.  */

	.balign	2
	.globl	C(ffi_closure_unix64_sse)
	FFI_HIDDEN(C(ffi_closure_unix64_sse))

C(ffi_closure_unix64_sse):
L(UW5):
	subq	$ffi_closure_FS, %rsp
L(UW6):
	/* cfi_adjust_cfa_offset(ffi_closure_FS) */

	movdqa	%xmm0, ffi_closure_OFS_V+0x00(%rsp)
	movdqa	%xmm1, ffi_closure_OFS_V+0x10(%rsp)
	movdqa	%xmm2, ffi_closure_OFS_V+0x20(%rsp)
	movdqa	%xmm3, ffi_closure_OFS_V+0x30(%rsp)
	movdqa	%xmm4, ffi_closure_OFS_V+0x40(%rsp)
	movdqa	%xmm5, ffi_closure_OFS_V+0x50(%rsp)
	movdqa	%xmm6, ffi_closure_OFS_V+0x60(%rsp)
	movdqa	%xmm7, ffi_closure_OFS_V+0x70(%rsp)
	jmp	L(sse_entry1)

L(UW7):
ENDF(C(ffi_closure_unix64_sse))

/* ffi_closure_unix64

   Closure entry point that saves only the six general argument
   registers; the vector save area of the frame is left unwritten.

   On entry %r10 is used as a base pointer: cif, fun and user_data are
   read from FFI_TRAMPOLINE_SIZE(%r10) onwards (presumably %r10 holds
   the closure address set up by the trampoline -- confirm against the
   trampoline code).

   Calls ffi_closure_unix64_inner with
	%rdi = cif, %rsi = fun, %rdx = user_data,
	%rcx = address of the rvalue area in the frame,
	%r8  = address of the saved registers (the frame base),
	%r9  = address just above the return address (argp).
   The low byte of its result in %al selects the load_table entry that
   moves the value from the rvalue area into the return registers.  */

	.balign	2
	.globl	C(ffi_closure_unix64)
	FFI_HIDDEN(C(ffi_closure_unix64))

C(ffi_closure_unix64):
L(UW8):
	subq	$ffi_closure_FS, %rsp
L(UW9):
	/* cfi_adjust_cfa_offset(ffi_closure_FS) */
L(sse_entry1):
	movq	%rdi, ffi_closure_OFS_G+0x00(%rsp)
	movq	%rsi, ffi_closure_OFS_G+0x08(%rsp)
	movq	%rdx, ffi_closure_OFS_G+0x10(%rsp)
	movq	%rcx, ffi_closure_OFS_G+0x18(%rsp)
	movq	%r8,  ffi_closure_OFS_G+0x20(%rsp)
	movq	%r9,  ffi_closure_OFS_G+0x28(%rsp)

#ifdef __ILP32__
	movl	FFI_TRAMPOLINE_SIZE(%r10), %edi		/* Load cif */
	movl	FFI_TRAMPOLINE_SIZE+4(%r10), %esi	/* Load fun */
	movl	FFI_TRAMPOLINE_SIZE+8(%r10), %edx	/* Load user_data */
#else
	movq	FFI_TRAMPOLINE_SIZE(%r10), %rdi		/* Load cif */
	movq	FFI_TRAMPOLINE_SIZE+8(%r10), %rsi	/* Load fun */
	movq	FFI_TRAMPOLINE_SIZE+16(%r10), %rdx	/* Load user_data */
#endif
	/* Shared tail: the go closure entry points jump here too.  */
L(do_closure):
	leaq	ffi_closure_OFS_RVALUE(%rsp), %rcx	/* Load rvalue */
	movq	%rsp, %r8				/* Load reg_args */
	leaq	ffi_closure_FS+8(%rsp), %r9		/* Load argp */
	call	PLT(C(ffi_closure_unix64_inner))

	/* Deallocate stack frame early; return value is now in redzone.  */
	addq	$ffi_closure_FS, %rsp
L(UW10):
	/* cfi_adjust_cfa_offset(-ffi_closure_FS) */

	/* The first byte of the return value contains the FFI_TYPE.
	   As in ffi_call_unix64, the flags from cmpb are still intact
	   when ja tests them.  */
	cmpb	$UNIX64_RET_LAST, %al
	movzbl	%al, %r10d
	leaq	L(load_table)(%rip), %r11
	ja	L(la)
	leaq	(%r11, %r10, 8), %r10
	leaq	ffi_closure_RED_RVALUE(%rsp), %rsi
	jmp	*%r10

	/* Each entry loads the return registers from the rvalue area
	   at (%rsi) and returns to the closure's caller.  */
	.balign	8
L(load_table):
E(L(load_table), UNIX64_RET_VOID)
	ret
E(L(load_table), UNIX64_RET_UINT8)
	movzbl	(%rsi), %eax
	ret
E(L(load_table), UNIX64_RET_UINT16)
	movzwl	(%rsi), %eax
	ret
E(L(load_table), UNIX64_RET_UINT32)
	movl	(%rsi), %eax
	ret
E(L(load_table), UNIX64_RET_SINT8)
	movsbl	(%rsi), %eax
	ret
E(L(load_table), UNIX64_RET_SINT16)
	movswl	(%rsi), %eax
	ret
E(L(load_table), UNIX64_RET_SINT32)
	movl	(%rsi), %eax
	ret
E(L(load_table), UNIX64_RET_INT64)
	movq	(%rsi), %rax
	ret
E(L(load_table), UNIX64_RET_XMM32)
	movd	(%rsi), %xmm0
	ret
E(L(load_table), UNIX64_RET_XMM64)
	movq	(%rsi), %xmm0
	ret
E(L(load_table), UNIX64_RET_X87)
	fldt	(%rsi)
	ret
E(L(load_table), UNIX64_RET_X87_2)
	fldt	16(%rsi)
	fldt	(%rsi)
	ret
E(L(load_table), UNIX64_RET_ST_XMM0_RAX)
	movq	8(%rsi), %rax
	jmp	L(l3)
E(L(load_table), UNIX64_RET_ST_RAX_XMM0)
	movq	8(%rsi), %xmm0
	jmp	L(l2)
E(L(load_table), UNIX64_RET_ST_XMM0_XMM1)
	movq	8(%rsi), %xmm1
	jmp	L(l3)
E(L(load_table), UNIX64_RET_ST_RAX_RDX)
	movq	8(%rsi), %rdx
	/* Falls through: the last table entry continues into l2.  */
L(l2):
	movq	(%rsi), %rax		/* First eightbyte into %rax.  */
	ret
	.balign	8
L(l3):
	movq	(%rsi), %xmm0		/* First eightbyte into %xmm0.  */
	ret

	/* Return type code above UNIX64_RET_LAST.  */
L(la):	call	PLT(C(abort))

L(UW11):
ENDF(C(ffi_closure_unix64))

/* ffi_go_closure_unix64_sse

   Go closure entry point that also saves the vector argument
   registers.  Allocates the closure frame, stores %xmm0..%xmm7 into
   the vector save area, then joins ffi_go_closure_unix64 at
   sse_entry2.  */

	.balign	2
	.globl	C(ffi_go_closure_unix64_sse)
	FFI_HIDDEN(C(ffi_go_closure_unix64_sse))

C(ffi_go_closure_unix64_sse):
L(UW12):
	subq	$ffi_closure_FS, %rsp
L(UW13):
	/* cfi_adjust_cfa_offset(ffi_closure_FS) */

	movdqa	%xmm0, ffi_closure_OFS_V+0x00(%rsp)
	movdqa	%xmm1, ffi_closure_OFS_V+0x10(%rsp)
	movdqa	%xmm2, ffi_closure_OFS_V+0x20(%rsp)
	movdqa	%xmm3, ffi_closure_OFS_V+0x30(%rsp)
	movdqa	%xmm4, ffi_closure_OFS_V+0x40(%rsp)
	movdqa	%xmm5, ffi_closure_OFS_V+0x50(%rsp)
	movdqa	%xmm6, ffi_closure_OFS_V+0x60(%rsp)
	movdqa	%xmm7, ffi_closure_OFS_V+0x70(%rsp)
	jmp	L(sse_entry2)

L(UW14):
ENDF(C(ffi_go_closure_unix64_sse))

/* ffi_go_closure_unix64

   Go closure entry point that saves only the six general argument
   registers.  Here %r10 points at the go closure object itself: cif
   and fun are read from its second and third pointer-sized fields and
   %r10 is passed on unchanged as user_data.  The call and the return
   value handling are shared with ffi_closure_unix64 via do_closure.  */

	.balign	2
	.globl	C(ffi_go_closure_unix64)
	FFI_HIDDEN(C(ffi_go_closure_unix64))

C(ffi_go_closure_unix64):
L(UW15):
	subq	$ffi_closure_FS, %rsp
L(UW16):
	/* cfi_adjust_cfa_offset(ffi_closure_FS) */
L(sse_entry2):
	movq	%rdi, ffi_closure_OFS_G+0x00(%rsp)
	movq	%rsi, ffi_closure_OFS_G+0x08(%rsp)
	movq	%rdx, ffi_closure_OFS_G+0x10(%rsp)
	movq	%rcx, ffi_closure_OFS_G+0x18(%rsp)
	movq	%r8,  ffi_closure_OFS_G+0x20(%rsp)
	movq	%r9,  ffi_closure_OFS_G+0x28(%rsp)

#ifdef __ILP32__
	movl	4(%r10), %edi		/* Load cif */
	movl	8(%r10), %esi		/* Load fun */
	movl	%r10d, %edx		/* Load closure (user_data) */
#else
	movq	8(%r10), %rdi		/* Load cif */
	movq	16(%r10), %rsi		/* Load fun */
	movq	%r10, %rdx		/* Load closure (user_data) */
#endif
	jmp	L(do_closure)

L(UW17):
ENDF(C(ffi_go_closure_unix64))

/* Sadly, OSX cctools-as doesn't understand .cfi directives at all.
   The unwind information is therefore written out by hand below: one
   CIE followed by one FDE per function, each FDE covering the span
   between that function's first and last L(UWn) label.  */

#ifdef __APPLE__
.section __TEXT,__eh_frame,coalesced,no_toc+strip_static_syms+live_support
EHFrame0:
#elif defined(HAVE_AS_X86_64_UNWIND_SECTION_TYPE)
.section .eh_frame,"a",@unwind
#else
.section .eh_frame,"a",@progbits
#endif

#ifdef HAVE_AS_X86_PCREL
# define PCREL(X)	X - .
#else
# define PCREL(X)	X@rel
#endif

/* Simplify advancing between labels.  Assume DW_CFA_advance_loc1 fits,
   i.e. the two labels are less than 256 bytes apart.  */
#define ADV(N, P)	.byte 2, L(N)-L(P)

	.balign 8
L(CIE):
	.set	L(set0),L(ECIE)-L(SCIE)
	.long	L(set0)			/* CIE Length */
L(SCIE):
	.long	0			/* CIE Identifier Tag */
	.byte	1			/* CIE Version */
	.ascii	"zR\0"			/* CIE Augmentation */
	.byte	1			/* CIE Code Alignment Factor */
	.byte	0x78			/* CIE Data Alignment Factor */
	.byte	0x10			/* CIE RA Column */
	.byte	1			/* Augmentation size */
	.byte	0x1b			/* FDE Encoding (pcrel sdata4) */
	.byte	0xc, 7, 8		/* DW_CFA_def_cfa, %rsp offset 8 */
	.byte	0x80+16, 1		/* DW_CFA_offset, %rip offset 1*-8 */
	.balign 8
L(ECIE):

	/* FDE for ffi_call_unix64.  */
	.set	L(set1),L(EFDE1)-L(SFDE1)
	.long	L(set1)			/* FDE Length */
L(SFDE1):
	.long	L(SFDE1)-L(CIE)		/* FDE CIE offset */
	.long	PCREL(L(UW0))		/* Initial location */
	.long	L(UW4)-L(UW0)		/* Address range */
	.byte	0			/* Augmentation size */
	ADV(UW1, UW0)
	.byte	0xc, 6, 32		/* DW_CFA_def_cfa, %rbp 32 */
	.byte	0x80+6, 2		/* DW_CFA_offset, %rbp 2*-8 */
	ADV(UW2, UW1)
	.byte	0xa			/* DW_CFA_remember_state */
	.byte	0xc, 7, 8		/* DW_CFA_def_cfa, %rsp 8 */
	.byte	0xc0+6			/* DW_CFA_restore, %rbp */
	ADV(UW3, UW2)
	.byte	0xb			/* DW_CFA_restore_state */
	.balign 8
L(EFDE1):

	/* FDE for ffi_closure_unix64_sse.  */
	.set	L(set2),L(EFDE2)-L(SFDE2)
	.long	L(set2)			/* FDE Length */
L(SFDE2):
	.long	L(SFDE2)-L(CIE)		/* FDE CIE offset */
	.long	PCREL(L(UW5))		/* Initial location */
	.long	L(UW7)-L(UW5)		/* Address range */
	.byte	0			/* Augmentation size */
	ADV(UW6, UW5)
	.byte	0xe			/* DW_CFA_def_cfa_offset */
	.byte	ffi_closure_FS + 8, 1	/* uleb128, assuming 128 <= FS + 8 < 256 */
	.balign 8
L(EFDE2):

	/* FDE for ffi_closure_unix64.  */
	.set	L(set3),L(EFDE3)-L(SFDE3)
	.long	L(set3)			/* FDE Length */
L(SFDE3):
	.long	L(SFDE3)-L(CIE)		/* FDE CIE offset */
	.long	PCREL(L(UW8))		/* Initial location */
	.long	L(UW11)-L(UW8)		/* Address range */
	.byte	0			/* Augmentation size */
	ADV(UW9, UW8)
	.byte	0xe			/* DW_CFA_def_cfa_offset */
	.byte	ffi_closure_FS + 8, 1	/* uleb128, assuming 128 <= FS + 8 < 256 */
	ADV(UW10, UW9)
	.byte	0xe, 8			/* DW_CFA_def_cfa_offset 8 */
	/* Pad inside the FDE, like every other entry, so that the
	   following FDEs stay 8-byte aligned.  The zero fill bytes are
	   DW_CFA_nop.  */
	.balign 8
L(EFDE3):

	/* FDE for ffi_go_closure_unix64_sse.  */
	.set	L(set4),L(EFDE4)-L(SFDE4)
	.long	L(set4)			/* FDE Length */
L(SFDE4):
	.long	L(SFDE4)-L(CIE)		/* FDE CIE offset */
	.long	PCREL(L(UW12))		/* Initial location */
	.long	L(UW14)-L(UW12)		/* Address range */
	.byte	0			/* Augmentation size */
	ADV(UW13, UW12)
	.byte	0xe			/* DW_CFA_def_cfa_offset */
	.byte	ffi_closure_FS + 8, 1	/* uleb128, assuming 128 <= FS + 8 < 256 */
	.balign 8
L(EFDE4):

	/* FDE for ffi_go_closure_unix64.  */
	.set	L(set5),L(EFDE5)-L(SFDE5)
	.long	L(set5)			/* FDE Length */
L(SFDE5):
	.long	L(SFDE5)-L(CIE)		/* FDE CIE offset */
	.long	PCREL(L(UW15))		/* Initial location */
	.long	L(UW17)-L(UW15)		/* Address range */
	.byte	0			/* Augmentation size */
	ADV(UW16, UW15)
	.byte	0xe			/* DW_CFA_def_cfa_offset */
	.byte	ffi_closure_FS + 8, 1	/* uleb128, assuming 128 <= FS + 8 < 256 */
	.balign 8
L(EFDE5):
#ifdef __APPLE__
	.subsections_via_symbols
	.section __LD,__compact_unwind,regular,debug

	/* Each compact unwind entry below is: function address, function
	   length, encoding word, and two zero quadwords.  */

	/* compact unwind for ffi_call_unix64 */
	.quad    C(ffi_call_unix64)
	.set     L1,L(UW4)-L(UW0)
	.long    L1
	.long    0x04000000 /* use dwarf unwind info */
	.quad    0
	.quad    0

	/* compact unwind for ffi_closure_unix64_sse */
	.quad    C(ffi_closure_unix64_sse)
	.set     L2,L(UW7)-L(UW5)
	.long    L2
	.long    0x04000000 /* use dwarf unwind info */
	.quad    0
	.quad    0

	/* compact unwind for ffi_closure_unix64 */
	.quad    C(ffi_closure_unix64)
	.set     L3,L(UW11)-L(UW8)
	.long    L3
	.long    0x04000000 /* use dwarf unwind info */
	.quad    0
	.quad    0

	/* compact unwind for ffi_go_closure_unix64_sse */
	.quad    C(ffi_go_closure_unix64_sse)
	.set     L4,L(UW14)-L(UW12)
	.long    L4
	.long    0x04000000 /* use dwarf unwind info */
	.quad    0
	.quad    0

	/* compact unwind for ffi_go_closure_unix64 */
	.quad    C(ffi_go_closure_unix64)
	.set     L5,L(UW17)-L(UW15)
	.long    L5
	.long    0x04000000 /* use dwarf unwind info */
	.quad    0
	.quad    0
#endif

#endif /* __x86_64__ */
#if defined __ELF__ && defined __linux__
	.section	.note.GNU-stack,"",@progbits
#endif