#!/usr/bin/env python3

# Copyright 2016, VIXL authors
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#   * Redistributions of source code must retain the above copyright notice,
#     this list of conditions and the following disclaimer.
#   * Redistributions in binary form must reproduce the above copyright notice,
#     this list of conditions and the following disclaimer in the documentation
#     and/or other materials provided with the distribution.
#   * Neither the name of ARM Limited nor the names of its contributors may be
#     used to endorse or promote products derived from this software without
#     specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

"""
Verify generated AArch32 assembler traces against `llvm-mc`.

This script will find all files in `test/aarch32/traces/` with names starting
with `assembler`, and check them against `llvm-mc`. It checks our assembler is
correct by looking up what instruction we meant to assemble, assemble it with
`llvm` and check the result is bit identical to what our assembler generated.

You may run the script with no arguments from VIXL's top-level directory as long
as `llvm-mc` is in your PATH. You may provide a different `llvm-mc` path with
the `--llvm-mc` option. This script relies on version 3.8 or higher of
LLVM. Previous versions refuse to assemble some instructions that ARMv8 allows,
but ARMv7 did not.

For example, let's say we have the following assembler trace for CLZ
(the real trace is a lot bigger):

~~~
static const byte kInstruction_Clz_eq_r0_r0[] = {
  0x10, 0x0f, 0x6f, 0x01 // Clz eq r0 r0
};
static const byte kInstruction_Clz_eq_r0_r1[] = {
  0x11, 0x0f, 0x6f, 0x01 // Clz eq r0 r1
};
static const byte kInstruction_Clz_eq_r0_r2[] = {
  0x12, 0x0f, 0x6f, 0x01 // Clz eq r0 r2
};
static const TestResult kReferenceClz[] = {
  {
    ARRAY_SIZE(kInstruction_Clz_eq_r0_r0),
    kInstruction_Clz_eq_r0_r0,
  },
  {
    ARRAY_SIZE(kInstruction_Clz_eq_r0_r1),
    kInstruction_Clz_eq_r0_r1,
  },
  {
    ARRAY_SIZE(kInstruction_Clz_eq_r0_r2),
    kInstruction_Clz_eq_r0_r2,
  },
};
~~~

The traces contain both the list of bytes that were encoded as well as a comment
with a description of the instruction this is. This script searches for these
lines and checks them.

With our example, the script will find the following:

    [
      ("Clz eq r0 r0", ["0x10", "0x0f", "0x6f", "0x01"]),
      ("Clz eq r0 r1", ["0x11", "0x0f", "0x6f", "0x01"]),
      ("Clz eq r0 r2", ["0x12", "0x0f", "0x6f", "0x01"])
    ]

Then the tricky part is to convert the description of the instruction into the
following valid assembly syntax:

    clzeq r0, r0
    clzeq r0, r1
    clzeq r0, r2

Our example is easy, but it gets more complicated with load and store
instructions for example. We can feed this as input to `llvm-mc`:

    $ echo "
      clzeq r0, r0
      clzeq r0, r1
      clzeq r0, r2
    " | llvm-mc -assemble -arch=arm -mattr=v8,crc -show-encoding

And we will get the following output:

    .text
    clzeq r0, r0 @ encoding: [0x10,0x0f,0x6f,0x01]
    clzeq r0, r1 @ encoding: [0x11,0x0f,0x6f,0x01]
    clzeq r0, r2 @ encoding: [0x12,0x0f,0x6f,0x01]

The script will finally extract the encoding and compare it to what VIXL
generated.
"""

import argparse
import subprocess
import os
import re
import itertools
import sys
import types


def BuildOptions():
    """Parse the command line and return the resulting argparse namespace."""
    result = argparse.ArgumentParser(
        description='Use `llvm-mc` to check the assembler traces are correct.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    result.add_argument('--llvm-mc', default='llvm-mc', help='Path to llvm-mc')
    result.add_argument('--verbose', '-v', action='store_true')
    return result.parse_args()


def CheckLLVMVersion(llvm_mc):
    """
    Run `llvm_mc -version` and raise an exception if the reported LLVM version
    is lower than 3.8, or if no version number can be found in the output.
    """
    version = subprocess.check_output([llvm_mc, '-version']).decode()
    # Accept multi-digit components ("14.0.6") and do not depend on the exact
    # indentation or on what follows the minor number ("3.8.1-2ubuntu4").
    m = re.search(r"LLVM version (\d+)\.(\d+)", version)
    if m is None:
        raise Exception(
            "Failed to find the LLVM version in the output of "
            "`{} -version`.".format(llvm_mc))
    major, minor = (int(component) for component in m.groups())
    if (major, minor) < (3, 8):
        raise Exception("This script requires LLVM version 3.8 or higher.")


# Dictionary of patterns. The key is an identifier used in the converter lists
# below. The value needs to be a capturing regular expression.
_PATTERN_MATCHERS = {
    # Allow an optional underscore in case this is an "and" instruction.
    "mnemonic": r"(\w+?)_?",
    "condition":
        r"(al|eq|ne|cs|cc|mi|pl|vs|vc|hi|ls|ge|lt|gt|le)",
    "register":
        r"(r0|r1|r2|r3|r4|r5|r6|r7|r8|r9|r10|r11|r12|r13|r14|r15|pc|sp|lr)",
    "immediate": r"(0x[0-9a-f]+|[0-9]+)",
    "shift": r"(lsl|lsr|asr|ror)",
    "dregister": r"(d[0-9]|d[12][0-9]|d3[01])",
    "dt": r"(s8|s16|s32|s64|u8|u16|u32|u64|f16|f32|f64|i8|i16|i32|i64|p8|p64)",
    "dt_untyped": r"(untyped8|untyped16|untyped32)",
}

# Size suffix LLVM expects for each of VIXL's untyped data types.
_UNTYPED_DT_TO_LLVM = {
    "untyped8": "8",
    "untyped16": "16",
    "untyped32": "32",
}


def _DtUntypedToLLVM(matches):
    """Format `<mnemonic> untypedN dA dB dC` as `<mnemonic>.N dA, dB, dC`."""
    try:
        dt = _UNTYPED_DT_TO_LLVM[matches[1]]
    except KeyError:
        raise Exception("Unknown untyped data type {}.".format(matches[1]))
    return "{}.{} {}, {}, {}".format(matches[0], dt, matches[2], matches[3],
                                     matches[4])


def _ConvertMovRdImm(matches):
    """
    LLVM chooses the T3 encoding for `mov <rd>, #<immediate>` when the
    immediate fits both into a modified immediate (T2 encoding) and 16
    bits (T3 encoding). Adding the `.W` modifier forces the T2 encoding to
    be used.
    """
    # The immediate is the second capture in "mov al {register} {immediate}".
    # The "immediate" pattern accepts both hexadecimal ("0x..") and decimal
    # text, so parse according to the prefix rather than assuming base 16.
    text = matches[1]
    imm = int(text, 16) if text.startswith("0x") else int(text, 10)
    if imm <= 0xffff:
        lsb = imm & -imm
        if (imm >> 8) < lsb:
            return "mov.w {}, #{}".format(*matches)
    # Fall back to LLVM making the right decision.
    return "mov {}, #{}".format(*matches)


# List of converters. Each of them represents an instruction form and what to
# convert it to. This list needs to be complete; an exception is raised if we
# couldn't find a converter for the instruction.
#
# The first part of each tuple is a pattern to match. It's simply a regular
# expression. Additionally, each identifier in curly braces is replaced by the
# corresponding pattern from `_PATTERN_MATCHERS`.
#
# The second part of the tuple is either a string that describes what the
# result will look like (empty curly braces are replaced by matches, in order)
# or a function that receives the tuple of matches and returns the result.
_BASE_CONVERTERS = [
    ("it {condition}", "it {}"),
    ("{mnemonic} {condition} {register} {immediate}",
     "{}{} {}, #{}"),
    ("{mnemonic} {condition} {register} {register} {immediate}",
     "{}{} {}, {}, #{}"),
    ("{mnemonic} {condition} {register} {register}",
     "{}{} {}, {}"),
    ("{mnemonic} {condition} {register} {register} {register}",
     "{}{} {}, {}, {}"),
    ("{mnemonic} {register} {register} {register}",
     "{} {}, {}, {}"),
    ("{mnemonic} {condition} {register} {register} {register} {shift} "
     "{immediate}",
     "{}{} {}, {}, {}, {} #{}"),
    ("{mnemonic} {condition} {register} {register} {register} {shift} "
     "{register}",
     "{}{} {}, {}, {}, {} {}"),
    ("{mnemonic} {condition} {register} {register} {shift} {immediate}",
     "{}{} {}, {}, {} #{}"),
    ("{mnemonic} {condition} {register} {register} {shift} {register}",
     "{}{} {}, {}, {} {}"),
    ("{mnemonic} {condition} {register} {register} plus {immediate} offset",
     "{}{} {}, [{}, #{}]"),
    ("{mnemonic} {condition} {register} {register} minus {immediate} offset",
     "{}{} {}, [{}, #-{}]"),
    ("{mnemonic} {condition} {register} {register} plus {immediate} postindex",
     "{}{} {}, [{}], #{}"),
    ("{mnemonic} {condition} {register} {register} minus {immediate} "
     "postindex",
     "{}{} {}, [{}], #-{}"),
    ("{mnemonic} {condition} {register} {register} plus {immediate} preindex",
     "{}{} {}, [{}, #{}]!"),
    ("{mnemonic} {condition} {register} {register} minus {immediate} "
     "preindex",
     "{}{} {}, [{}, #-{}]!"),
    ("{mnemonic} {condition} {register} {register} plus {register} offset",
     "{}{} {}, [{}, {}]"),
    ("{mnemonic} {condition} {register} {register} minus {register} offset",
     "{}{} {}, [{}, -{}]"),
    ("{mnemonic} {condition} {register} {register} plus {register} postindex",
     "{}{} {}, [{}], {}"),
    ("{mnemonic} {condition} {register} {register} minus {register} "
     "postindex",
     "{}{} {}, [{}], -{}"),
    ("{mnemonic} {condition} {register} {register} plus {register} preindex",
     "{}{} {}, [{}, {}]!"),
    ("{mnemonic} {condition} {register} {register} minus {register} preindex",
     "{}{} {}, [{}, -{}]!"),
    ("{mnemonic} {condition} {register} {register} plus {register} {shift} "
     "{immediate} offset",
     "{}{} {}, [{}, {}, {} #{}]"),
    ("{mnemonic} {condition} {register} {register} minus {register} {shift} "
     "{immediate} offset",
     "{}{} {}, [{}, -{}, {} #{}]"),
    ("{mnemonic} {condition} {register} {register} plus {register} {shift} "
     "{immediate} postindex",
     "{}{} {}, [{}], {}, {} #{}"),
    ("{mnemonic} {condition} {register} {register} minus {register} {shift} "
     "{immediate} postindex",
     "{}{} {}, [{}], -{}, {} #{}"),
    ("{mnemonic} {condition} {register} {register} plus {register} {shift} "
     "{immediate} preindex",
     "{}{} {}, [{}, {}, {} #{}]!"),
    ("{mnemonic} {condition} {register} {register} minus {register} {shift} "
     "{immediate} preindex",
     "{}{} {}, [{}, -{}, {} #{}]!"),
    ("{mnemonic} {dt} {dregister} {dregister} {dregister}",
     "{}.{} {}, {}, {}"),
    ("{mnemonic} {dt_untyped} {dregister} {dregister} {dregister}",
     _DtUntypedToLLVM),
]

# Work around issues in LLVM 3.8. These converters are tried before the generic
# ones when assembling for "thumbv8".
_THUMBV8_CONVERTERS = [
    # The ARM ARM specifies that if <Rn> is PC in either an ADD or SUB
    # instruction with an immediate, the assembler should use the ADR
    # encoding. LLVM does not know about this subtlety. We get around this
    # by manually translating the instruction to their ADR form.
    ("add al {register} pc {immediate}", "adr {}, #{}"),
    ("sub al {register} pc {immediate}", "adr {}, #-{}"),

    # LLVM is (rightfully) being helpful by swapping register operands so
    # that the 16 bit encoding of the following instructions is used.
    # However, VIXL does not do this. These rules specifically add the `.w`
    # modifier to force LLVM to use the 32 bit encoding if the last register
    # is identical to first one. But at the same time, we should still use
    # the narrow encoding if all registers are the same.
    (r"adcs al {register} (\1) (\1)", "adcs.n {}, {}, {}"),
    (r"adcs al {register} {register} (\1)", "adcs.w {}, {}, {}"),
    (r"orrs al {register} (\1) (\1)", "orrs.n {}, {}, {}"),
    (r"orrs al {register} {register} (\1)", "orrs.w {}, {}, {}"),
    (r"eors al {register} (\1) (\1)", "eors.n {}, {}, {}"),
    (r"eors al {register} {register} (\1)", "eors.w {}, {}, {}"),
    (r"ands al {register} (\1) (\1)", "ands.n {}, {}, {}"),
    (r"ands al {register} {register} (\1)", "ands.w {}, {}, {}"),
    # Solve the same issue as for the previous rules, however, we need to
    # take into account that ADD instructions with the stack pointer have
    # additional 16 bit forms.
    (r"add al {register} (\1) (\1)", "add.n {}, {}, {}"),
    (r"add al {register} (\1) r13", "add.w {}, {}, sp"),
    (r"add al {register} r13 (\1)", "add.n {}, sp, {}"),
    (r"add al {register} {register} (\1)", "add.w {}, {}, {}"),
    ("mov al {register} {immediate}", _ConvertMovRdImm),
]

# Compiled converter lists, keyed by whether the thumbv8 rules are included.
_COMPILED_CONVERTERS = {}


def _GetConverters(triple):
    """Return the list of `(compiled regex, result)` converters for `triple`."""
    is_thumb = triple == "thumbv8"
    if is_thumb not in _COMPILED_CONVERTERS:
        converters = _BASE_CONVERTERS
        if is_thumb:
            converters = _THUMBV8_CONVERTERS + converters
        # Expand the `{identifier}` placeholders and compile each pattern
        # once, rather than for every instruction of every trace.
        _COMPILED_CONVERTERS[is_thumb] = [
            (re.compile("^" + pattern.format(**_PATTERN_MATCHERS) + "$"),
             result)
            for pattern, result in converters
        ]
    return _COMPILED_CONVERTERS[is_thumb]


def ConvertToLLVMFormat(vixl_instruction, triple):
    """
    Take a string representing an instruction and convert it to assembly syntax
    for LLVM. VIXL's test generation framework will print instruction
    representations as a space separated list. The first element is the mnemonic
    and the following elements are operands.

    `vixl_instruction` may hold several instructions separated by ';' (an IT
    instruction followed by the instruction under test for example); the
    converted instructions are returned joined by newlines. `triple` is either
    "thumbv8" or "armv8"; the former enables extra T32-specific converters.

    An exception is raised if any of the instructions has no converter.
    """
    converters = _GetConverters(triple)
    llvm_instructions = []

    # Our test generator framework uses mnemonics starting with a capital
    # letter. We need everything to be lower case for LLVM.
    for instruction in vixl_instruction.lower().split(';'):
        # Strip out extra white spaces.
        instruction = instruction.strip()
        if not instruction:
            # Nothing between two separators (or after a trailing one).
            continue
        # Try all converters in the list, the first match wins.
        for matcher, result in converters:
            match = matcher.match(instruction)
            if match is None:
                continue
            if callable(result):
                # `result` is a function, call it to produce the instruction.
                llvm_instructions.append(result(match.groups()))
            else:
                # `result` is a string, use it as the format string.
                llvm_instructions.append(result.format(*match.groups()))
            break
        else:
            # No converter worked for this instruction. Do not silently drop
            # it, otherwise the encodings would be compared against the
            # wrong instruction sequence.
            raise Exception("Unsupported instruction {}.".format(instruction))

    if not llvm_instructions:
        raise Exception(
            "Unsupported instruction {}.".format(vixl_instruction))
    return "\n".join(llvm_instructions)


# Matches the lines of a trace that hold an encoding and its description:
#
#   0x10, 0x0f, 0x6f, 0x01 // Clz eq r0 r0
#
# The amount of white space is deliberately not fixed.
_TRACE_LINE = re.compile(
    r"^[ \t]*(?P<encoding>(?:0x[0-9a-f]{2},[ \t]*)+0x[0-9a-f]{2})"
    r"[ \t]*//[ \t]*(?P<instruction>.*)$",
    re.M)


def ReadTrace(trace):
    """
    Receive the content of an assembler trace, extract the relevant information
    and return it as a list of tuples. The first part of each tuple is a string
    representing the instruction. The second part is a list of bytes
    representing the encoding.

    For example:

        [
          ("Clz eq r0 r0", ["0x10", "0x0f", "0x6f", "0x01"]),
          ("Clz eq r0 r1", ["0x11", "0x0f", "0x6f", "0x01"]),
          ("Clz eq r0 r2", ["0x12", "0x0f", "0x6f", "0x01"])
        ]
    """
    return [
        (m.group('instruction').rstrip(),
         re.findall(r"0x[0-9a-f]{2}", m.group('encoding')))
        for m in _TRACE_LINE.finditer(trace)
    ]


def VerifyInstructionsWithLLVMMC(llvm_mc, f, triple):
    """
    Extract all instructions from `f`, feed them to `llvm-mc` and make sure it's
    encoded them the same way as VIXL. `triple` allows us to specify either
    "thumbv8" or "armv8".

    `f` is an object with a `read()` method returning the trace as a string.
    Problems are printed to the standard output. Return True if every encoding
    matched, False otherwise (including when the trace holds no instruction or
    when `llvm-mc` printed something to stderr).
    """
    vixl_reference = ReadTrace(f.read())
    if not vixl_reference:
        # Nothing would be verified; report it rather than failing to unpack
        # an empty list below.
        print("Error: no instructions were found in the generated trace.")
        return False

    vixl_instructions, vixl_encodings = zip(*vixl_reference)
    instructions = [
        ConvertToLLVMFormat(instruction, triple)
        for instruction in vixl_instructions
    ]
    llvm_mc_proc = subprocess.Popen(
        [llvm_mc, '-assemble', '-triple={}'.format(triple), '-mattr=v8,crc',
         # LLVM fails to recognize some instructions as valid T32 when we do
         # not set `-mcpu`.
         '-mcpu=cortex-a53', '-show-encoding'],
        stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    out, err = llvm_mc_proc.communicate("\n".join(instructions).encode())
    # If `llvm-mc` printed something to stderr then stop.
    if err:
        print(err.decode())
        return False

    # Extract list of bytes from `llvm-mc` output. It's in the following form:
    #
    # clzeq r0, r0 @ encoding: [0x10,0x0f,0x6f,0x01]
    #                           ^^^^ ^^^^ ^^^^ ^^^^
    llvm_encodings = [
        match_object.group('encoding').replace(" ", "").split(",")
        for match_object in re.finditer(r".*@ encoding: \[(?P<encoding>.*)\]",
                                        out.decode())
    ]

    # If LLVM has generated exactly twice as many instructions, we assume this
    # is due to IT instructions preceding every instruction under test. VIXL's
    # assembly reference files will contain a single array of 4 bytes encoding
    # both the IT and the following instruction. While LLVM will have decoded
    # them into two separate 2 bytes arrays.
    if len(llvm_encodings) == 2 * len(vixl_encodings):
        llvm_encodings = [
            it_encoding + instruction_encoding
            for it_encoding, instruction_encoding
            in zip(llvm_encodings[0::2], llvm_encodings[1::2])
        ]

    # Check the encodings from LLVM are identical to VIXL's.
    if len(llvm_encodings) != len(vixl_encodings):
        print("""Error: llvm-mc generated {} instructions than there are in the
generated trace.
        """.format(
            "fewer" if len(llvm_encodings) < len(vixl_encodings) else "more"))
        return False

    success = True
    for instruction, llvm, vixl in zip(vixl_instructions, llvm_encodings,
                                       vixl_encodings):
        if llvm != vixl:
            success = False
            print("""Error: llvm-mc disagrees on the encoding of \"{instruction}\":
  LLVM-MC: {llvm}
  VIXL:    {vixl}
            """.format(instruction=instruction.replace("\n", "; "),
                       llvm=llvm,
                       vixl=vixl))
    return success


def main():
    """
    Verify every `assembler-*` trace in `test/aarch32/traces/` and return the
    process exit status: 0 if all traces were verified, 1 otherwise.
    """
    args = BuildOptions()

    CheckLLVMVersion(args.llvm_mc)

    trace_dir = 'test/aarch32/traces/'
    trace_files = sorted(
        trace_file
        for trace_file in os.listdir(trace_dir)
        if trace_file.startswith("assembler-"))

    success = True
    for trace_file in trace_files:
        if args.verbose:
            print("Verifying \"" + trace_file + "\".")
        # The ISA is encoded in the name of the trace.
        if "t32" in trace_file:
            triple = "thumbv8"
        elif "a32" in trace_file:
            triple = "armv8"
        else:
            raise Exception(
                "Failed to recognize the ISA in \"" + trace_file + "\".")
        with open(os.path.join(trace_dir, trace_file), "r") as f:
            if not VerifyInstructionsWithLLVMMC(args.llvm_mc, f, triple):
                success = False
    return 0 if success else 1


if __name__ == "__main__":
    sys.exit(main())