#ifndef LIBDISASM_H
#define LIBDISASM_H

#include <stddef.h>     /* size_t */
#include <stdint.h>

/* 'NEW' types
 * __________________________________________________________________________*/
#ifndef LIBDISASM_QWORD_H       /* do not interfere with qword.h */
#define LIBDISASM_QWORD_H
#ifdef _MSC_VER
typedef __int64 qword_t;
#else
typedef int64_t qword_t;
#endif
#endif

#include <sys/types.h>

#ifdef __cplusplus
extern "C" {
#endif

/* 'NEW' x86 API
 * __________________________________________________________________________*/


/* ========================================= Error Reporting */
/* REPORT CODES
 *      These are passed to a reporter function passed at initialization.
 *      Each code determines the type of the argument passed to the reporter;
 *      this allows the report to recover from errors, or just log them.
 */
enum x86_report_codes {
    report_disasm_bounds,   /* RVA OUT OF BOUNDS : The disassembler could
                               not disassemble the supplied RVA as it is
                               out of the range of the buffer. The
                               application should store the address and
                               attempt to determine what section of the
                               binary it is in, then disassemble the
                               address from the bytes in that section.
                                    data: uint32_t rva */
    report_insn_bounds,     /* INSTRUCTION OUT OF BOUNDS: The disassembler
                               could not disassemble the instruction as
                               the instruction would require bytes beyond
                               the end of the current buffer. This usually
                               indicates garbage bytes at the end of a
                               buffer, or an incorrectly-sized buffer.
                                    data: uint32_t rva */
    report_invalid_insn,    /* INVALID INSTRUCTION: The disassembler could
                               not disassemble the instruction as it has an
                               invalid combination of opcodes and operands.
                               This will stop automated disassembly; the
                               application can restart the disassembly
                               after the invalid instruction.
                                    data: uint32_t rva */
    report_unknown
};

/* 'arg' is optional arbitrary data provided by the code passing the
 *       callback -- for example, it could be 'this' or 'self' in OOP code.
 * 'code' is provided by libdisasm, it is one of the above
 * 'data' is provided by libdisasm and is context-specific, per the enums */
typedef void (*DISASM_REPORTER)( enum x86_report_codes code,
                                 void *data, void *arg );


/* x86_report_error : Call the registered reporter to report an error */
void x86_report_error( enum x86_report_codes code, void *data );

/* ========================================= Libdisasm Management Routines */
enum x86_options {              /* these can be ORed together */
    opt_none          = 0,
    opt_ignore_nulls  = 1,      /* ignore sequences of > 4 NULL bytes */
    opt_16_bit        = 2,      /* 16-bit/DOS disassembly */
    opt_att_mnemonics = 4       /* use AT&T syntax names for alternate
                                   opcode mnemonics */
};

/* management routines */
/* 'arg' is caller-specific data which is passed as the last ('arg')
 * argument to the reporter callback routine */
int x86_init( enum x86_options options, DISASM_REPORTER reporter, void *arg);
void x86_set_reporter( DISASM_REPORTER reporter, void *arg);
void x86_set_options( enum x86_options options );
enum x86_options x86_get_options( void );
int x86_cleanup(void);


/* ========================================= Instruction Representation */
/* these defines are only intended for use in the array decl's */
#define MAX_REGNAME 8

#define MAX_PREFIX_STR 32
#define MAX_MNEM_STR 16
#define MAX_INSN_SIZE 20        /* same as in i386.h */
#define MAX_OP_STRING 32        /* max possible operand size in string form */
#define MAX_OP_RAW_STRING 64    /* max possible operand size in raw form */
#define MAX_OP_XML_STRING 256   /* max possible operand size in xml form */
#define MAX_NUM_OPERANDS 8      /* max # implicit and explicit operands */
/* in these, the '2 *' is arbitrary: the max # of operands should require
 * more space than the rest of the insn */
#define MAX_INSN_STRING 512        /* 2 * 8 * MAX_OP_STRING */
#define MAX_INSN_RAW_STRING 1024   /* 2 * 8 * MAX_OP_RAW_STRING */
#define MAX_INSN_XML_STRING 4096   /* 2 * 8 * MAX_OP_XML_STRING */

enum x86_reg_type {             /* NOTE: these may be ORed together */
    reg_gen     = 0x00001,      /* general purpose */
    reg_in      = 0x00002,      /* incoming args, ala RISC */
    reg_out     = 0x00004,      /* args to calls, ala RISC */
    reg_local   = 0x00008,      /* local vars, ala RISC */
    reg_fpu     = 0x00010,      /* FPU data register */
    reg_seg     = 0x00020,      /* segment register */
    reg_simd    = 0x00040,      /* SIMD/MMX reg */
    reg_sys     = 0x00080,      /* restricted/system register */
    reg_sp      = 0x00100,      /* stack pointer */
    reg_fp      = 0x00200,      /* frame pointer */
    reg_pc      = 0x00400,      /* program counter */
    reg_retaddr = 0x00800,      /* return addr for func */
    reg_cond    = 0x01000,      /* condition code / flags */
    reg_zero    = 0x02000,      /* zero register, ala RISC */
    reg_ret     = 0x04000,      /* return value */
    reg_src     = 0x10000,      /* array/rep source */
    reg_dest    = 0x20000,      /* array/rep destination */
    reg_count   = 0x40000       /* array/rep/loop counter */
};

/* x86_reg_t : an X86 CPU register */
typedef struct {
    char name[MAX_REGNAME];
    enum x86_reg_type type;     /* what register is used for */
    unsigned int size;          /* size of register in bytes */
    unsigned int id;            /* register ID #, for quick compares */
    unsigned int alias;         /* ID of reg this is an alias for */
    unsigned int shift;         /* amount to shift aliased reg by */
} x86_reg_t;

/* x86_ea_t : an X86 effective address (address expression) */
typedef struct {
    unsigned int scale;         /* scale factor */
    x86_reg_t index, base;      /* index, base registers */
    int32_t disp;               /* displacement */
    char disp_sign;             /* is negative? 1/0 */
    char disp_size;             /* 0, 1, 2, 4 */
} x86_ea_t;

/* x86_absolute_t : an X86 segment:offset address (descriptor) */
typedef struct {
    unsigned short segment;     /* loaded directly into CS */
    union {
        unsigned short off16;   /* loaded directly into IP */
        uint32_t off32;         /* loaded directly into EIP */
    } offset;
} x86_absolute_t;

enum x86_op_type {              /* mutually exclusive */
    op_unused        = 0,       /* empty/unused operand: should never occur */
    op_register      = 1,       /* CPU register */
    op_immediate     = 2,       /* Immediate Value */
    op_relative_near = 3,       /* Relative offset from IP */
    op_relative_far  = 4,       /* Relative offset from IP */
    op_absolute      = 5,       /* Absolute address (ptr16:32) */
    op_expression    = 6,       /* Address expression (scale/index/base/disp) */
    op_offset        = 7,       /* Offset from start of segment (m32) */
    op_unknown
};

/* Operand-type classification helpers. The argument is parenthesized so
 * that an arbitrary expression (e.g. 'type & mask') can be passed safely.
 * NOTE: 'optype' is evaluated twice; do not pass an expression with side
 * effects. */
#define x86_optype_is_address( optype ) \
    ( (optype) == op_absolute || (optype) == op_offset )
#define x86_optype_is_relative( optype ) \
    ( (optype) == op_relative_near || (optype) == op_relative_far )
#define x86_optype_is_memory( optype ) \
    ( (optype) > op_immediate && (optype) < op_unknown )

enum x86_op_datatype {          /* these use Intel's lame terminology */
    op_byte       = 1,          /* 1 byte integer */
    op_word       = 2,          /* 2 byte integer */
    op_dword      = 3,          /* 4 byte integer */
    op_qword      = 4,          /* 8 byte integer */
    op_dqword     = 5,          /* 16 byte integer */
    op_sreal      = 6,          /* 4 byte real (single real) */
    op_dreal      = 7,          /* 8 byte real (double real) */
    op_extreal    = 8,          /* 10 byte real (extended real) */
    op_bcd        = 9,          /* 10 byte binary-coded decimal */
    op_ssimd      = 10,         /* 16 byte : 4 packed single FP (SIMD, MMX) */
    op_dsimd      = 11,         /* 16 byte : 2 packed double FP (SIMD, MMX) */
    op_sssimd     = 12,         /* 4 byte : scalar single FP (SIMD, MMX) */
    op_sdsimd     = 13,         /* 8 byte : scalar double FP (SIMD, MMX) */
    op_descr32    = 14,         /* 6 byte Intel descriptor 2:4 */
    op_descr16    = 15,         /* 4 byte Intel descriptor 2:2 */
    op_pdescr32   = 16,         /* 6 byte Intel pseudo-descriptor 32:16 */
    op_pdescr16   = 17,         /* 6 byte Intel pseudo-descriptor 8:24:16 */
    op_bounds16   = 18,         /* signed 16:16 lower:upper bounds */
    op_bounds32   = 19,         /* signed 32:32 lower:upper bounds */
    op_fpuenv16   = 20,         /* 14 byte FPU control/environment data */
    op_fpuenv32   = 21,         /* 28 byte FPU control/environment data */
    op_fpustate16 = 22,         /* 94 byte FPU state (env & reg stack) */
    op_fpustate32 = 23,         /* 108 byte FPU state (env & reg stack) */
    op_fpregset   = 24,         /* 512 bytes: register set */
    op_fpreg      = 25,         /* FPU register */
    op_none       = 0xFF        /* operand without a datatype (INVLPG) */
};

enum x86_op_access {            /* ORed together */
    op_read    = 1,
    op_write   = 2,
    op_execute = 4
};

enum x86_op_flags {     /* ORed together, but segs are mutually exclusive */
    op_signed   = 1,            /* signed integer */
    op_string   = 2,            /* possible string or array */
    op_constant = 4,            /* symbolic constant */
    op_pointer  = 8,            /* operand points to a memory address */
    op_sysref   = 0x010,        /* operand is a syscall number */
    op_implied  = 0x020,        /* operand is implicit in the insn */
    op_hardcode = 0x40,         /* operand is hardcoded in insn definition */
    /* NOTE: an 'implied' operand is one which can be considered a side
     * effect of the insn, e.g. %esp being modified by PUSH or POP. A
     * 'hard-coded' operand is one which is specified in the instruction
     * definition, e.g. %es:%edi in MOVSB or 1 in ROL Eb, 1. The difference
     * is that hard-coded operands are printed by disassemblers and are
     * required to re-assemble, while implicit operands are invisible. */
    op_es_seg   = 0x100,        /* ES segment override */
    op_cs_seg   = 0x200,        /* CS segment override */
    op_ss_seg   = 0x300,        /* SS segment override */
    op_ds_seg   = 0x400,        /* DS segment override */
    op_fs_seg   = 0x500,        /* FS segment override */
    op_gs_seg   = 0x600         /* GS segment override */
};

/* x86_op_t : an X86 instruction operand */
typedef struct {
    enum x86_op_type type;              /* operand type */
    enum x86_op_datatype datatype;      /* operand size */
    enum x86_op_access access;          /* operand access [RWX] */
    enum x86_op_flags flags;            /* misc flags */
    union {
        /* sizeof will have to work on these union members! */
        /* immediate values */
        char sbyte;
        short sword;
        int32_t sdword;
        qword_t sqword;
        unsigned char byte;
        unsigned short word;
        uint32_t dword;
        qword_t qword;
        float sreal;
        double dreal;
        /* misc large/non-native types */
        unsigned char extreal[10];
        unsigned char bcd[10];
        qword_t dqword[2];
        unsigned char simd[16];
        unsigned char fpuenv[28];
        /* offset from segment */
        uint32_t offset;
        /* ID of CPU register */
        x86_reg_t reg;
        /* offsets from current insn */
        char relative_near;
        int32_t relative_far;
        /* segment:offset */
        x86_absolute_t absolute;
        /* effective address [expression] */
        x86_ea_t expression;
    } data;
    /* this is needed to make formatting operands more sane */
    void * insn;                /* pointer to x86_insn_t owning operand */
} x86_op_t;

/* Linked list of x86_op_t; provided for manual traversal of the operand
 * list in an insn. Users wishing to add operands to this list, e.g. to add
 * implicit operands, should use x86_operand_new in x86_operand_list.h */
typedef struct x86_operand_list {
    x86_op_t op;
    struct x86_operand_list *next;
} x86_oplist_t;

enum x86_insn_group {
    insn_none        = 0,       /* invalid instruction */
    insn_controlflow = 1,
    insn_arithmetic  = 2,
    insn_logic       = 3,
    insn_stack       = 4,
    insn_comparison  = 5,
    insn_move        = 6,
    insn_string      = 7,
    insn_bit_manip   = 8,
    insn_flag_manip  = 9,
    insn_fpu         = 10,
    insn_interrupt   = 13,
    insn_system      = 14,
    insn_other       = 15
};

enum x86_insn_type {
    insn_invalid      = 0,      /* invalid instruction */
    /* insn_controlflow */
    insn_jmp          = 0x1001,
    insn_jcc          = 0x1002,
    insn_call         = 0x1003,
    insn_callcc       = 0x1004,
    insn_return       = 0x1005,
    /* insn_arithmetic */
    insn_add          = 0x2001,
    insn_sub          = 0x2002,
    insn_mul          = 0x2003,
    insn_div          = 0x2004,
    insn_inc          = 0x2005,
    insn_dec          = 0x2006,
    insn_shl          = 0x2007,
    insn_shr          = 0x2008,
    insn_rol          = 0x2009,
    insn_ror          = 0x200A,
    /* insn_logic */
    insn_and          = 0x3001,
    insn_or           = 0x3002,
    insn_xor          = 0x3003,
    insn_not          = 0x3004,
    insn_neg          = 0x3005,
    /* insn_stack */
    insn_push         = 0x4001,
    insn_pop          = 0x4002,
    insn_pushregs     = 0x4003,
    insn_popregs      = 0x4004,
    insn_pushflags    = 0x4005,
    insn_popflags     = 0x4006,
    insn_enter        = 0x4007,
    insn_leave        = 0x4008,
    /* insn_comparison */
    insn_test         = 0x5001,
    insn_cmp          = 0x5002,
    /* insn_move */
    insn_mov          = 0x6001, /* move */
    insn_movcc        = 0x6002, /* conditional move */
    insn_xchg         = 0x6003, /* exchange */
    insn_xchgcc       = 0x6004, /* conditional exchange */
    /* insn_string */
    insn_strcmp       = 0x7001,
    insn_strload      = 0x7002,
    insn_strmov       = 0x7003,
    insn_strstore     = 0x7004,
    insn_translate    = 0x7005, /* xlat */
    /* insn_bit_manip */
    insn_bittest      = 0x8001,
    insn_bitset       = 0x8002,
    insn_bitclear     = 0x8003,
    /* insn_flag_manip */
    insn_clear_carry  = 0x9001,
    insn_clear_zero   = 0x9002,
    insn_clear_oflow  = 0x9003,
    insn_clear_dir    = 0x9004,
    insn_clear_sign   = 0x9005,
    insn_clear_parity = 0x9006,
    insn_set_carry    = 0x9007,
    insn_set_zero     = 0x9008,
    insn_set_oflow    = 0x9009,
    insn_set_dir      = 0x900A,
    insn_set_sign     = 0x900B,
    insn_set_parity   = 0x900C,
    insn_tog_carry    = 0x9010,
    insn_tog_zero     = 0x9020,
    insn_tog_oflow    = 0x9030,
    insn_tog_dir      = 0x9040,
    insn_tog_sign     = 0x9050,
    insn_tog_parity   = 0x9060,
    /* insn_fpu */
    insn_fmov         = 0xA001,
    insn_fmovcc       = 0xA002,
    insn_fneg         = 0xA003,
    insn_fabs         = 0xA004,
    insn_fadd         = 0xA005,
    insn_fsub         = 0xA006,
    insn_fmul         = 0xA007,
    insn_fdiv         = 0xA008,
    insn_fsqrt        = 0xA009,
    insn_fcmp         = 0xA00A,
    insn_fcos         = 0xA00C,
    insn_fldpi        = 0xA00D,
    insn_fldz         = 0xA00E,
    insn_ftan         = 0xA00F,
    insn_fsine        = 0xA010,
    insn_fsys         = 0xA020,
    /* insn_interrupt */
    insn_int          = 0xD001,
    insn_intcc        = 0xD002, /* not present in x86 ISA */
    insn_iret         = 0xD003,
    insn_bound        = 0xD004,
    insn_debug        = 0xD005,
    insn_trace        = 0xD006,
    insn_invalid_op   = 0xD007,
    insn_oflow        = 0xD008,
    /* insn_system */
    insn_halt         = 0xE001,
    insn_in           = 0xE002, /* input from port/bus */
    insn_out          = 0xE003, /* output to port/bus */
    insn_cpuid        = 0xE004,
    /* insn_other */
    insn_nop          = 0xF001,
    insn_bcdconv      = 0xF002, /* convert to or from BCD */
    insn_szconv       = 0xF003  /* change size of operand */
};

/* These flags specify special characteristics of the instruction, such as
 * whether the instruction is privileged or whether it serializes the
 * pipeline.
 * NOTE : These may not be accurate for all instructions; updates to the
 * opcode tables have not been completed. */
enum x86_insn_note {
    insn_note_ring0    = 1,     /* Only available in ring 0 */
    insn_note_smm      = 2,     /* "" in System Management Mode */
    insn_note_serial   = 4,     /* Serializing instruction */
    insn_note_nonswap  = 8,     /* Does not swap arguments in att-style
                                   formatting */
    insn_note_nosuffix = 16     /* Does not have size suffix in att-style
                                   formatting */
};

/* This specifies what effects the instruction has on the %eflags register */
enum x86_flag_status {
    insn_carry_set                 = 0x1,       /* CF */
    insn_zero_set                  = 0x2,       /* ZF */
    insn_oflow_set                 = 0x4,       /* OF */
    insn_dir_set                   = 0x8,       /* DF */
    insn_sign_set                  = 0x10,      /* SF */
    insn_parity_set                = 0x20,      /* PF */
    insn_carry_or_zero_set         = 0x40,
    insn_zero_set_or_sign_ne_oflow = 0x80,
    insn_carry_clear               = 0x100,
    insn_zero_clear                = 0x200,
    insn_oflow_clear               = 0x400,
    insn_dir_clear                 = 0x800,
    insn_sign_clear                = 0x1000,
    insn_parity_clear              = 0x2000,
    insn_sign_eq_oflow             = 0x4000,
    insn_sign_ne_oflow             = 0x8000
};

/* The CPU model in which the instruction first appeared; this can be used
 * to mask out instructions appearing in earlier or later models or to
 * check the portability of a binary.
 * NOTE : These may not be accurate for all instructions; updates to the
 * opcode tables have not been completed. */
enum x86_insn_cpu {
    cpu_8086       = 1,         /* Intel */
    cpu_80286      = 2,
    cpu_80386      = 3,
    cpu_80387      = 4,
    cpu_80486      = 5,
    cpu_pentium    = 6,
    cpu_pentiumpro = 7,
    cpu_pentium2   = 8,
    cpu_pentium3   = 9,
    cpu_pentium4   = 10,
    cpu_k6         = 16,        /* AMD */
    cpu_k7         = 32,
    cpu_athlon     = 48
};

/* CPU ISA subsets: These are derived from the Instruction Groups in
 * Intel Vol 1 Chapter 5; they represent subsets of the IA32 ISA but
 * do not reflect the 'type' of the instruction in the same way that
 * x86_insn_group does. In short, these are AMD/Intel's somewhat useless
 * designations.
 * NOTE : These may not be accurate for all instructions; updates to the
 * opcode tables have not been completed. */
enum x86_insn_isa {
    isa_gp     = 1,             /* general purpose */
    isa_fp     = 2,             /* floating point */
    isa_fpumgt = 3,             /* FPU/SIMD management */
    isa_mmx    = 4,             /* Intel MMX */
    isa_sse1   = 5,             /* Intel SSE SIMD */
    isa_sse2   = 6,             /* Intel SSE2 SIMD */
    isa_sse3   = 7,             /* Intel SSE3 SIMD */
    isa_3dnow  = 8,             /* AMD 3DNow! SIMD */
    isa_sys    = 9              /* system instructions */
};

enum x86_insn_prefix {
    insn_no_prefix   = 0,
    insn_rep_zero    = 1,       /* REPZ and REPE */
    insn_rep_notzero = 2,       /* REPNZ and REPNE */
    insn_lock        = 4        /* LOCK: */
};

/* TODO: maybe provide insn_new/free(), and have disasm return new insn_t */
/* x86_insn_t : an X86 instruction */
typedef struct {
    /* information about the instruction */
    uint32_t addr;              /* load address */
    uint32_t offset;            /* offset into file/buffer */
    enum x86_insn_group group;  /* meta-type, e.g. INS_EXEC */
    enum x86_insn_type type;    /* type, e.g. INS_BRANCH */
    enum x86_insn_note note;    /* note, e.g. RING0 */
    unsigned char bytes[MAX_INSN_SIZE];
    unsigned char size;         /* size of insn in bytes */
    /* 16/32-bit mode settings */
    unsigned char addr_size;    /* default address size : 2 or 4 */
    unsigned char op_size;      /* default operand size : 2 or 4 */
    /* CPU/instruction set */
    enum x86_insn_cpu cpu;
    enum x86_insn_isa isa;
    /* flags */
    enum x86_flag_status flags_set;     /* flags set or tested by insn */
    enum x86_flag_status flags_tested;
    /* stack */
    unsigned char stack_mod;    /* 0 or 1 : is the stack modified? */
    int32_t stack_mod_val;      /* val stack is modified by if known */

    /* the instruction proper */
    enum x86_insn_prefix prefix;        /* prefixes ORed together */
    char prefix_string[MAX_PREFIX_STR]; /* prefixes [might be truncated] */
    char mnemonic[MAX_MNEM_STR];
    x86_oplist_t *operands;     /* list of explicit/implicit operands */
    size_t operand_count;       /* total number of operands */
    size_t explicit_count;      /* number of explicit operands */
    /* convenience fields for user */
    void *block;                /* code block containing this insn */
    void *function;             /* function containing this insn */
    int tag;                    /* tag the insn as seen/processed */
} x86_insn_t;


/* returns 0 if an instruction is invalid, 1 if valid */
int x86_insn_is_valid( x86_insn_t *insn );

/* DISASSEMBLY ROUTINES
 *      Canonical order of arguments is
 *        (buf, buf_len, buf_rva, offset, len, insn, func, arg, resolve_func)
 *      ...but of course all of these are not used at the same time.
 */


/* Function prototype for caller-supplied callback routine
 *      These callbacks are intended to process 'insn' further, e.g. by
 *      adding it to a linked list, database, etc */
typedef void (*DISASM_CALLBACK)( x86_insn_t *insn, void * arg );

/* Function prototype for caller-supplied address resolver.
 *      This routine is used to determine the rva to disassemble next, given
 *      the 'dest' operand of a jump/call. This allows the caller to resolve
 *      jump/call targets stored in a register or on the stack, and also allows
 *      the caller to prevent endless loops by checking if an address has
 *      already been disassembled. If an address cannot be resolved from the
 *      operand, or if the address has already been disassembled, this routine
 *      should return -1; in all other cases the RVA to be disassembled next
 *      should be returned. */
typedef int32_t (*DISASM_RESOLVER)( x86_op_t *op, x86_insn_t * current_insn,
                                    void *arg );


/* x86_disasm: Disassemble a single instruction from a buffer of bytes.
 *             Returns size of instruction in bytes.
 *             Caller is responsible for calling x86_oplist_free() on
 *             a reused "insn" to avoid leaking memory when calling this
 *             function repeatedly.
 *      buf     : Buffer of bytes to disassemble
 *      buf_len : Length of the buffer
 *      buf_rva : Load address of the start of the buffer
 *      offset  : Offset in buffer to disassemble
 *      insn    : Structure to fill with disassembled instruction
 */
unsigned int x86_disasm( unsigned char *buf, unsigned int buf_len,
                         uint32_t buf_rva, unsigned int offset,
                         x86_insn_t * insn );

/* x86_disasm_range: Sequential disassembly of a range of bytes in a buffer,
 *                   invoking a callback function each time an instruction
 *                   is successfully disassembled. The 'range' refers to the
 *                   bytes between 'offset' and 'offset + len' in the buffer;
 *                   'len' is assumed to be less than the length of the buffer.
 *                   Returns number of instructions processed.
 *      buf     : Buffer of bytes to disassemble (e.g. .text section)
 *      buf_rva : Load address of buffer (e.g. ELF Virtual Address)
 *      offset  : Offset in buffer to start disassembly at
 *      len     : Number of bytes to disassemble
 *      func    : Callback function to invoke (may be NULL)
 *      arg     : Arbitrary data to pass to callback (may be NULL)
 */
unsigned int x86_disasm_range( unsigned char *buf, uint32_t buf_rva,
                               unsigned int offset, unsigned int len,
                               DISASM_CALLBACK func, void *arg );

/* x86_disasm_forward: Flow-of-execution disassembly of the bytes in a buffer,
 *                     invoking a callback function each time an instruction
 *                     is successfully disassembled.
 *      buf     : Buffer to disassemble (e.g. .text section)
 *      buf_len : Number of bytes in buffer
 *      buf_rva : Load address of buffer (e.g. ELF Virtual Address)
 *      offset  : Offset in buffer to start disassembly at (e.g. entry point)
 *      func    : Callback function to invoke (may be NULL)
 *      arg     : Arbitrary data to pass to callback (may be NULL)
 *      resolver: Caller-supplied address resolver. If no resolver is
 *                supplied, a default internal one is used -- however the
 *                internal resolver does NOT catch loops and could end up
 *                disassembling forever..
 *      r_arg   : Arbitrary data to pass to resolver (may be NULL)
 */
unsigned int x86_disasm_forward( unsigned char *buf, unsigned int buf_len,
                                 uint32_t buf_rva, unsigned int offset,
                                 DISASM_CALLBACK func, void *arg,
                                 DISASM_RESOLVER resolver, void *r_arg );

/* Instruction operands: these are stored as a list of explicit and
 * implicit operands. It is recommended that the 'foreach' routines
 * be used when examining operands for purposes of data flow analysis */

/* Operand FOREACH callback: 'arg' is an arbitrary parameter passed to the
 * foreach routine, 'insn' is the x86_insn_t whose operands are being
 * iterated over, and 'op' is the current x86_op_t */
typedef void (*x86_operand_fn)(x86_op_t *op, x86_insn_t *insn, void *arg);

/* FOREACH types: these are used to limit the foreach results to
 * operands which match a certain "type" (implicit or explicit)
 * or which are accessed in certain ways (e.g. read or write). Note
 * that this operates on the operand list of a single instruction, so
 * specifying the 'real' operand type (register, memory, etc) is not
 * useful. Note also that by definition Execute Access implies Read
 * Access and implies Not Write Access.
 * The "type" (implicit or explicit) and the access method can
 * be ORed together, e.g. op_wo | op_explicit */
enum x86_op_foreach_type {
    op_any      = 0,            /* ALL operands (explicit, implicit, rwx) */
    op_dest     = 1,            /* operands with Write access */
    op_src      = 2,            /* operands with Read access */
    op_ro       = 3,            /* operands with Read but not Write access */
    op_wo       = 4,            /* operands with Write but not Read access */
    op_xo       = 5,            /* operands with Execute access */
    op_rw       = 6,            /* operands with Read AND Write access */
    op_implicit = 0x10,         /* operands that are implied by the opcode */
    op_explicit = 0x20          /* operands that are not side-effects */
};


/* free the operand list associated with an instruction -- useful for
 * preventing memory leaks when free()ing an x86_insn_t */
void x86_oplist_free( x86_insn_t *insn );

/* Operand foreach: invokes 'func' with 'insn' and 'arg' as arguments. The
 * 'type' parameter is used to select only operands matching specific
 * criteria. */
int x86_operand_foreach( x86_insn_t *insn, x86_operand_fn func, void *arg,
                         enum x86_op_foreach_type type);

/* convenience routine: returns count of operands matching 'type' */
size_t x86_operand_count( x86_insn_t *insn, enum x86_op_foreach_type type );

/* accessor functions for the operands */
x86_op_t * x86_operand_1st( x86_insn_t *insn );
x86_op_t * x86_operand_2nd( x86_insn_t *insn );
x86_op_t * x86_operand_3rd( x86_insn_t *insn );

/* these allow libdisasm 2.0 accessor functions to still be used */
#define x86_get_dest_operand( insn ) x86_operand_1st( insn )
#define x86_get_src_operand( insn ) x86_operand_2nd( insn )
#define x86_get_imm_operand( insn ) x86_operand_3rd( insn )

/* get size of operand data in bytes */
unsigned int x86_operand_size( x86_op_t *op );

/* Operand Convenience Routines: the following three routines are common
 * operations on operands, intended to ease the burden of the programmer. */

/* Get Address: return the value of an offset operand, or the offset of
 * a segment:offset absolute address */
uint32_t x86_get_address( x86_insn_t *insn );

/* Get Relative Offset: return as a sign-extended int32_t the near or far
 * relative offset operand, or 0 if there is none. There can be only one
 * relative offset operand in an instruction. */
int32_t x86_get_rel_offset( x86_insn_t *insn );

/* Get Branch Target: return the x86_op_t containing the target of
 * a jump or call operand, or NULL if there is no branch target.
 * Internally, a 'branch target' is defined as any operand with
 * Execute Access set. There can be only one branch target per instruction. */
x86_op_t * x86_get_branch_target( x86_insn_t *insn );

/* Get Immediate: return the x86_op_t containing the immediate operand
 * for this instruction, or NULL if there is no immediate operand. There
 * can be only one immediate operand per instruction */
x86_op_t * x86_get_imm( x86_insn_t *insn );

/* Get Raw Immediate Data: returns a pointer to the immediate data encoded
 * in the instruction. This is useful for large data types [>32 bits] currently
 * not supported by libdisasm, or for determining if the disassembler
 * screwed up the conversion of the immediate data. Note that 'imm' in this
 * context refers to immediate data encoded at the end of an instruction as
 * detailed in the Intel Manual Vol II Chapter 2; it does not refer to the
 * 'op_imm' operand (the third operand in instructions like 'mul') */
unsigned char * x86_get_raw_imm( x86_insn_t *insn );


/* More accessor functions, this time for user-defined info... */
/* set the address (usually RVA) of the insn */
void x86_set_insn_addr( x86_insn_t *insn, uint32_t addr );

/* set the offset (usually offset into file) of the insn */
void x86_set_insn_offset( x86_insn_t *insn, unsigned int offset );

/* set a pointer to the function owning the instruction. The
 * type of 'func' is user-defined; libdisasm does not use the func field. */
void x86_set_insn_function( x86_insn_t *insn, void * func );

/* set a pointer to the block of code owning the instruction. The
 * type of 'block' is user-defined; libdisasm does not use the block field. */
void x86_set_insn_block( x86_insn_t *insn, void * block );

/* instruction tagging: these routines allow the programmer to mark
 * instructions as "seen" in a DFS, for example. libdisasm does not use
 * the tag field.*/
/* set insn->tag to 1 */
void x86_tag_insn( x86_insn_t *insn );
/* set insn->tag to 0 */
void x86_untag_insn( x86_insn_t *insn );
/* return insn->tag */
int x86_insn_is_tagged( x86_insn_t *insn );


/* Disassembly formats:
 *      AT&T is standard AS/GAS-style: "mnemonic\tsrc, dest, imm"
 *      Intel is standard MASM/NASM/TASM: "mnemonic\tdest,src, imm"
 *      Native is tab-delimited: "RVA\tbytes\tmnemonic\tdest\tsrc\timm"
 *      XML is your typical <insn> ... </insn>
 *      Raw is addr|offset|size|bytes|prefix... see libdisasm_formats.7
 */
enum x86_asm_format {
    unknown_syntax = 0,         /* never use! */
    native_syntax,              /* header: 35 bytes */
    intel_syntax,               /* header: 23 bytes */
    att_syntax,                 /* header: 23 bytes */
    xml_syntax,                 /* header: 679 bytes */
    raw_syntax                  /* header: 172 bytes */
};

/* format (sprintf) an operand into 'buf' using specified syntax */
int x86_format_operand(x86_op_t *op, char *buf, int len,
                       enum x86_asm_format format);

/* format (sprintf) an instruction mnemonic into 'buf' using specified syntax */
int x86_format_mnemonic(x86_insn_t *insn, char *buf, int len,
                        enum x86_asm_format format);

/* format (sprintf) an instruction into 'buf' using specified syntax;
 * this includes formatting all operands */
int x86_format_insn(x86_insn_t *insn, char *buf, int len,
                    enum x86_asm_format format);

/* fill 'buf' with a description of the format's syntax */
int x86_format_header( char *buf, int len, enum x86_asm_format format);

/* Endianness of an x86 CPU : 0 is big, 1 is little; always returns 1 */
unsigned int x86_endian(void);

/* Default address and operand size in bytes */
unsigned int x86_addr_size(void);
unsigned int x86_op_size(void);

/* Size of a machine word in bytes */
unsigned int x86_word_size(void);

/* maximum size of a code instruction */
#define x86_max_inst_size(x) x86_max_insn_size(x)
unsigned int x86_max_insn_size(void);

/* register IDs of Stack, Frame, Instruction pointer and Flags register */
unsigned int x86_sp_reg(void);
unsigned int x86_fp_reg(void);
unsigned int x86_ip_reg(void);
unsigned int x86_flag_reg(void);

/* fill 'reg' struct with details of register 'id' */
void x86_reg_from_id( unsigned int id, x86_reg_t * reg );

/* convenience macro demonstrating how to get an aliased register; proto is
 *   void x86_get_aliased_reg( x86_reg_t *alias_reg, x86_reg_t *output_reg )
 * where 'alias_reg' is a reg operand and 'output_reg' is filled with the
 * register that the operand is an alias for.
 * Both arguments are parenthesized so that expressions such as '&reg'
 * expand correctly. */
#define x86_get_aliased_reg( alias_reg, output_reg ) \
    x86_reg_from_id( (alias_reg)->alias, (output_reg) )


/* ================================== Invariant Instruction Representation */
/* Invariant instructions are used for generating binary signatures;
 * the instruction is modified so that all variant bytes in an instruction
 * are replaced with a wildcard byte.
 *
 * A 'variant byte' is one that is expected to be modified by either the
 * static or the dynamic linker: for example, an address encoded in an
 * instruction.
 *
 * By comparing the invariant representation of one instruction [or of a
 * sequence of instructions] with the invariant representation of another,
 * one can determine whether the two invariant representations are from the
 * same relocatable object [.o] file. Thus one can use binary signatures
 * [which are just sequences of invariant instruction representations] to
 * look for library routines which have been statically-linked into a binary.
 *
 * The invariant routines are faster and smaller than the disassembly
 * routines; they can be used to determine the size of an instruction
 * without all of the overhead of a full instruction disassembly.
 */

/* This byte is used to replace variant bytes */
#define X86_WILDCARD_BYTE 0xF4

typedef struct {
    enum x86_op_type type;              /* operand type */
    enum x86_op_datatype datatype;      /* operand size */
    enum x86_op_access access;          /* operand access [RWX] */
    enum x86_op_flags flags;            /* misc flags */
} x86_invariant_op_t;

typedef struct {
    unsigned char bytes[64];            /* invariant representation */
    unsigned int size;                  /* number of bytes in insn */
    enum x86_insn_group group;          /* meta-type, e.g. INS_EXEC */
    enum x86_insn_type type;            /* type, e.g. INS_BRANCH */
    x86_invariant_op_t operands[3];     /* operands: dest, src, imm */
} x86_invariant_t;


/* return a version of the instruction with the variant bytes masked out */
size_t x86_invariant_disasm( unsigned char *buf, int buf_len,
                             x86_invariant_t *inv );
/* return the size in bytes of the instruction pointed to by 'buf';
 * this uses x86_invariant_disasm since it is faster than x86_disasm */
size_t x86_size_disasm( unsigned char *buf, unsigned int buf_len );

#ifdef __cplusplus
}
#endif


#endif