#!/usr/bin/python
# Test tool to compare Capstone output with llvm-mc. By Nguyen Anh Quynh, 2014
import array
import binascii
import os.path
import re
import sys
from subprocess import Popen, PIPE, STDOUT

from capstone import *


# A hex literal: '0x' followed by at least one hex digit.
_HEX_NUMBER = re.compile(r'0x[0-9a-fA-F]+')

# Architecture names accepted in the header line of a test file.
_ARCHS = {
    "CS_ARCH_ARM": CS_ARCH_ARM,
    "CS_ARCH_ARM64": CS_ARCH_ARM64,
    "CS_ARCH_MIPS": CS_ARCH_MIPS,
    "CS_ARCH_PPC": CS_ARCH_PPC,
    "CS_ARCH_SPARC": CS_ARCH_SPARC,
    "CS_ARCH_SYSZ": CS_ARCH_SYSZ,
    "CS_ARCH_X86": CS_ARCH_X86,
    "CS_ARCH_XCORE": CS_ARCH_XCORE,
}

# Mode strings (spaces already removed) accepted in the header line.
_MODES = {
    "CS_MODE_16": CS_MODE_16,
    "CS_MODE_32": CS_MODE_32,
    "CS_MODE_64": CS_MODE_64,
    "CS_MODE_MIPS32": CS_MODE_MIPS32,
    "CS_MODE_MIPS64": CS_MODE_MIPS64,
    "0": CS_MODE_ARM,
    "CS_MODE_ARM": CS_MODE_ARM,
    "CS_MODE_THUMB": CS_MODE_THUMB,
    "CS_MODE_ARM+CS_MODE_V8": CS_MODE_ARM+CS_MODE_V8,
    "CS_MODE_THUMB+CS_MODE_V8": CS_MODE_THUMB+CS_MODE_V8,
    "CS_MODE_THUMB+CS_MODE_MCLASS": CS_MODE_THUMB+CS_MODE_MCLASS,
    "CS_MODE_LITTLE_ENDIAN": CS_MODE_LITTLE_ENDIAN,
    "CS_MODE_BIG_ENDIAN": CS_MODE_BIG_ENDIAN,
    "CS_MODE_64+CS_MODE_LITTLE_ENDIAN": CS_MODE_64+CS_MODE_LITTLE_ENDIAN,
    "CS_MODE_64+CS_MODE_BIG_ENDIAN": CS_MODE_64+CS_MODE_BIG_ENDIAN,
    "CS_MODE_MIPS32+CS_MODE_MICRO": CS_MODE_MIPS32+CS_MODE_MICRO,
    "CS_MODE_MIPS32+CS_MODE_MICRO+CS_MODE_BIG_ENDIAN": CS_MODE_MIPS32+CS_MODE_MICRO+CS_MODE_BIG_ENDIAN,
    "CS_MODE_MIPS32+CS_MODE_BIG_ENDIAN+CS_MODE_MICRO": CS_MODE_MIPS32+CS_MODE_MICRO+CS_MODE_BIG_ENDIAN,
    "CS_MODE_BIG_ENDIAN+CS_MODE_V9": CS_MODE_BIG_ENDIAN + CS_MODE_V9,
    "CS_MODE_MIPS32+CS_MODE_BIG_ENDIAN": CS_MODE_MIPS32+CS_MODE_BIG_ENDIAN,
    "CS_MODE_MIPS32+CS_MODE_LITTLE_ENDIAN": CS_MODE_MIPS32+CS_MODE_LITTLE_ENDIAN,
    "CS_MODE_MIPS64+CS_MODE_LITTLE_ENDIAN": CS_MODE_MIPS64+CS_MODE_LITTLE_ENDIAN,
    "CS_MODE_MIPS64+CS_MODE_BIG_ENDIAN": CS_MODE_MIPS64+CS_MODE_BIG_ENDIAN,
}

# llvm-mc command-line arguments for each (arch, mode) header combination.
_MC_MODES = {
    ("CS_ARCH_X86", "CS_MODE_32"): ['-triple=i386'],
    ("CS_ARCH_X86", "CS_MODE_64"): ['-triple=x86_64'],
    ("CS_ARCH_ARM", "CS_MODE_ARM"): ['-triple=armv7'],
    ("CS_ARCH_ARM", "CS_MODE_THUMB"): ['-triple=thumbv7'],
    ("CS_ARCH_ARM", "CS_MODE_ARM+CS_MODE_V8"): ['-triple=armv8'],
    ("CS_ARCH_ARM", "CS_MODE_THUMB+CS_MODE_V8"): ['-triple=thumbv8'],
    ("CS_ARCH_ARM", "CS_MODE_THUMB+CS_MODE_MCLASS"): ['-triple=thumbv7m'],
    ("CS_ARCH_ARM64", "0"): ['-triple=aarch64'],
    ("CS_ARCH_MIPS", "CS_MODE_MIPS32+CS_MODE_BIG_ENDIAN"): ['-triple=mips'],
    ("CS_ARCH_MIPS", "CS_MODE_MIPS32+CS_MODE_MICRO"): ['-triple=mipsel', '-mattr=+micromips'],
    ("CS_ARCH_MIPS", "CS_MODE_MIPS64"): ['-triple=mips64el'],
    ("CS_ARCH_MIPS", "CS_MODE_MIPS32"): ['-triple=mipsel'],
    ("CS_ARCH_MIPS", "CS_MODE_MIPS64+CS_MODE_BIG_ENDIAN"): ['-triple=mips64'],
    ("CS_ARCH_MIPS", "CS_MODE_MIPS32+CS_MODE_MICRO+CS_MODE_BIG_ENDIAN"): ['-triple=mips', '-mattr=+micromips'],
    ("CS_ARCH_MIPS", "CS_MODE_MIPS32+CS_MODE_BIG_ENDIAN+CS_MODE_MICRO"): ['-triple=mips', '-mattr=+micromips'],
    ("CS_ARCH_PPC", "CS_MODE_BIG_ENDIAN"): ['-triple=powerpc64'],
    ('CS_ARCH_SPARC', 'CS_MODE_BIG_ENDIAN'): ['-triple=sparc'],
    ('CS_ARCH_SPARC', 'CS_MODE_BIG_ENDIAN+CS_MODE_V9'): ['-triple=sparcv9'],
    ('CS_ARCH_SYSZ', '0'): ['-triple=s390x', '-mcpu=z196'],
}

# MIPS register alias names and the numbered form they are rewritten to.
# Applied in this order to the Capstone output before comparison.
_MIPS_REG_ALIASES = (
    ('$at', '$1'),
    ('$v0', '$2'),
    ('$v1', '$3'),
    ('$a0', '$4'),
    ('$a1', '$5'),
    ('$a2', '$6'),
    ('$a3', '$7'),
    ('$t0', '$8'),
    ('$t1', '$9'),
    ('$t2', '$10'),
    ('$t3', '$11'),
    ('$t4', '$12'),
    ('$t5', '$13'),
    ('$t6', '$14'),
    ('$t7', '$15'),
    ('$t8', '$24'),
    ('$t9', '$25'),
    ('$s0', '$16'),
    ('$s1', '$17'),
    ('$s2', '$18'),
    ('$s3', '$19'),
    ('$s4', '$20'),
    ('$s5', '$21'),
    ('$s6', '$22'),
    ('$s7', '$23'),
    ('$k0', '$26'),
    ('$k1', '$27'),
)


def normalize_hex(a):
    """Return `a` with every '0x...' hex literal rewritten in decimal.

    Each literal is converted on its own in a single pass, so a short
    literal (e.g. '0x1') can no longer corrupt a longer one that starts
    with the same digits (e.g. '0x10').  A bare '0x' that is not followed
    by a hex digit is left untouched instead of raising ValueError.
    """
    return _HEX_NUMBER.sub(lambda m: str(int(m.group(0), 16)), a)


def run_mc(arch, hexcode, option, syntax=None):
    """Disassemble `hexcode` with llvm-mc and return the normalized text.

    arch    -- Capstone architecture constant (CS_ARCH_*); selects the
               comment style to strip and enables '-mattr=+msa' for MIPS.
    hexcode -- text written to llvm-mc's standard input.
    option  -- list of extra llvm-mc arguments (e.g. the '-triple=' flag).
    syntax  -- optional single extra argument placed before `option`.

    Returns 'FAILED to disassemble (MC)' when the first output line
    contains 'invalid' or when llvm-mc produced no second line; otherwise
    the lower-cased, whitespace-normalized second output line.
    """
    def normalize(text):
        # lower-case and collapse tabs / runs of whitespace to one space
        text = ' '.join(text.lower().split())
        if arch == CS_ARCH_X86:
            # remove comment after #
            i = text.find('# ')
            if i != -1:
                return text[:i].strip()
        if arch == CS_ARCH_ARM64:
            # remove comment after //
            i = text.find('// ')
            if i != -1:
                return text[:i].strip()
        # remove some redundant spaces
        text = text.replace('{ ', '{')
        text = text.replace(' }', '}')
        return text.strip()

    cmd = ['llvm-mc', '-disassemble', '-print-imm-hex']
    if arch == CS_ARCH_MIPS:
        cmd.append('-mattr=+msa')
    if syntax:
        cmd.append(syntax)
    cmd.extend(option)

    # universal_newlines=True puts the pipes in text mode, so str input and
    # str output work on both Python 2 and Python 3.
    p = Popen(cmd, stdout=PIPE, stdin=PIPE, stderr=STDOUT,
              universal_newlines=True)
    output = p.communicate(input=hexcode)[0]
    lines = output.split('\n')
    if 'invalid' in lines[0] or len(lines) < 2:
        return 'FAILED to disassemble (MC)'
    return normalize(lines[1].strip())


def _mc_args_for(fname, arch, mode):
    """Return the llvm-mc argument list for a test file.

    Two input files are special-cased by name; every other file uses the
    _MC_MODES entry for its (arch, mode) header (KeyError if absent).
    """
    if fname.endswith('thumb-fp-armv8.s.cs'):
        return ['-triple=thumbv8']
    if fname.endswith('mips64-alu-instructions.s.cs'):
        return ['-triple=mips64el', '-mcpu=mips64r2']
    return _MC_MODES[(arch, mode)]


def _capstone_text(md, hex_data):
    """Return 'mnemonic op_str' of the first instruction Capstone decodes
    from `hex_data`, or 'FAILED to disassemble' if nothing is decoded."""
    insns = list(md.disasm(hex_data, 0))
    if not insns:
        return 'FAILED to disassemble'
    first = insns[0]
    if first.op_str != '':
        return "%s %s" % (first.mnemonic, first.op_str)
    return first.mnemonic


def test_file(fname):
    """Compare Capstone and llvm-mc output for every instruction in `fname`.

    The first line of the file must be '# <arch>, <mode>, <option>'.  Each
    following line that does not start with '#' has the form
    '<hex bytes> = <expected asm>'.  A mismatch is printed only when the
    Capstone output differs from both the llvm-mc output and the expected
    assembly recorded in the file.

    Raises KeyError when the header names an architecture, mode or
    (arch, mode) combination that is not in the lookup tables.
    """
    print("Test %s" % fname)
    with open(fname) as f:
        lines = f.readlines()

    if not lines or not lines[0].startswith('# '):
        print("ERROR: decoding information is missing")
        return

    # skip '# ' at the front, then split the header into its three fields
    fields = lines[0][2:].split(', ')
    if len(fields) != 3:
        print("ERROR: decoding information is malformed")
        return
    # The third field (option; can be '' or 'None') is parsed but not used.
    arch, mode, _option = fields
    mode = mode.replace(' ', '')

    md = Cs(_ARCHS[arch], _MODES[mode])

    mc_option = None
    if arch == 'CS_ARCH_X86':
        # tell llvm-mc to use Intel syntax
        mc_option = '-output-asm-variant=1'

    if arch == 'CS_ARCH_ARM' or arch == 'CS_ARCH_PPC':
        md.syntax = CS_OPT_SYNTAX_NOREGNAME

    if fname.endswith('3DNow.s.cs'):
        md.syntax = CS_OPT_SYNTAX_ATT

    for line in lines[1:]:
        # ignore blank lines and all the input lines having # in front.
        if line.startswith('#') or not line.strip():
            continue

        parts = line.split(' = ')
        code = parts[0]
        asm = ''.join(parts[1:])

        # '0x0f,0xca' -> '0fca' -> raw bytes (works on Python 2 and 3)
        hex_code = code.replace('0x', '').replace(',', '')
        hex_data = binascii.unhexlify(''.join(hex_code.split()))

        cs_output = _capstone_text(md, hex_data)
        cs_output2 = normalize_hex(cs_output).replace(' ', '')

        if arch == 'CS_ARCH_MIPS':
            # normalize register alias names
            for alias, number in _MIPS_REG_ALIASES:
                cs_output2 = cs_output2.replace(alias, number)

        mc_output = run_mc(_ARCHS[arch], code,
                           _mc_args_for(fname, arch, mode), mc_option)
        mc_output2 = normalize_hex(mc_output)

        if arch == 'CS_ARCH_MIPS':
            mc_output2 = mc_output2.replace(' 0(', '(')

        if arch == 'CS_ARCH_PPC':
            mc_output2 = mc_output2.replace('.+', '')
            mc_output2 = mc_output2.replace('.', '')
            mc_output2 = mc_output2.replace(' 0(', '(')

        mc_output2 = mc_output2.replace(' ', '')
        mc_output2 = mc_output2.replace('opaque', '')

        if cs_output2 != mc_output2:
            # fall back to the expected assembly recorded in the input file
            asm = asm.replace(' ', '').strip().lower()
            if asm != cs_output2:
                print("Mismatch: %s" % line.strip())
                print("\tMC = %s" % mc_output)
                print("\tCS = %s" % cs_output)


if __name__ == '__main__':
    if len(sys.argv) == 1:
        # no argument: read one input file name per line from stdin
        for fname in sys.stdin.readlines():
            fname = fname.strip()
            if fname:
                test_file(fname)
    else:
        test_file(sys.argv[1])