• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#!/usr/bin/env python3
2
3# Copyright 2016, VIXL authors
4# All rights reserved.
5#
6# Redistribution and use in source and binary forms, with or without
7# modification, are permitted provided that the following conditions are met:
8#
9#   * Redistributions of source code must retain the above copyright notice,
10#     this list of conditions and the following disclaimer.
11#   * Redistributions in binary form must reproduce the above copyright notice,
12#     this list of conditions and the following disclaimer in the documentation
13#     and/or other materials provided with the distribution.
14#   * Neither the name of ARM Limited nor the names of its contributors may be
15#     used to endorse or promote products derived from this software without
16#     specific prior written permission.
17#
18# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS CONTRIBUTORS "AS IS" AND
19# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
22# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28
29"""
30Verify generated AArch32 assembler traces against `llvm-mc`.
31
This script will find all files in `test/aarch32/traces/` with names starting
with `assembler`, and check them against `llvm-mc`. It checks our assembler is
correct by looking up what instruction we meant to assemble, assembling it with
`llvm` and checking the result is bit identical to what our assembler generated.
36
37You may run the script with no arguments from VIXL's top-level directory as long
38as `llvm-mc` is in your PATH. You may provide a different `llvm-mc` path with
39the `--llvm-mc` option. This script relies on version 3.8 or higher of
40LLVM. Previous versions refuse to assemble some instructions that ARMv8 allows,
41but ARMv7 did not.
42
43For example, let's say we have the following assembler trace for CLZ
44(the real trace is a lot bigger):
45
46~~~
47static const byte kInstruction_Clz_eq_r0_r0[] = {
48  0x10, 0x0f, 0x6f, 0x01 // Clz eq r0 r0
49};
50static const byte kInstruction_Clz_eq_r0_r1[] = {
51  0x11, 0x0f, 0x6f, 0x01 // Clz eq r0 r1
52};
53static const byte kInstruction_Clz_eq_r0_r2[] = {
54  0x12, 0x0f, 0x6f, 0x01 // Clz eq r0 r2
55};
56static const TestResult kReferenceClz[] = {
57  {
58    ARRAY_SIZE(kInstruction_Clz_eq_r0_r0),
59    kInstruction_Clz_eq_r0_r0,
60  },
61  {
62    ARRAY_SIZE(kInstruction_Clz_eq_r0_r1),
63    kInstruction_Clz_eq_r0_r1,
64  },
65  {
66    ARRAY_SIZE(kInstruction_Clz_eq_r0_r2),
67    kInstruction_Clz_eq_r0_r2,
68  },
69};
70~~~
71
72The traces contain both the list of bytes that were encoded as well as a comment
73with a description of the instruction this is. This script searches for these
74lines and checks them.
75
76With our example, the script will find the following:
77
78    [
79      ("Clz eq r0 r0", ["0x10", "0x0f", "0x6f", "0x01"]),
80      ("Clz eq r0 r1", ["0x11", "0x0f", "0x6f", "0x01"]),
81      ("Clz eq r0 r2", ["0x12", "0x0f", "0x6f", "0x01"])
82    ]
83
84Then the tricky part is to convert the description of the instruction into the
85following valid assembly syntax:
86
87    clzeq r0, r0
88    clzeq r0, r1
89    clzeq r0, r2
90
91Our example is easy, but it gets more complicated with load and store
92instructions for example. We can feed this as input to `llvm-mc`:
93
94    $ echo "
95      clzeq r0, r0
96      clzeq r0, r1
97      clzeq r0, r2
98    " | llvm-mc -assemble -arch=arm -mattr=v8,crc -show-encoding
99
100And we will get the following output:
101
102            .text
103            clzeq   r0, r0                  @ encoding: [0x10,0x0f,0x6f,0x01]
104            clzeq   r0, r1                  @ encoding: [0x11,0x0f,0x6f,0x01]
105            clzeq   r0, r2                  @ encoding: [0x12,0x0f,0x6f,0x01]
106
107The script will finally extract the encoding and compare it to what VIXL
108generated.
109"""
110
111import argparse
112import subprocess
113import os
114import re
115import itertools
116import types
117
def BuildOptions():
  """
  Parse this script's command line and return the resulting namespace.

  Two options are recognised: `--llvm-mc`, the path of the `llvm-mc` binary
  (defaults to `llvm-mc`), and the boolean flag `--verbose` / `-v`.
  """
  parser = argparse.ArgumentParser(
      formatter_class=argparse.ArgumentDefaultsHelpFormatter,
      description='Use `llvm-mc` to check the assembler traces are correct.')
  parser.add_argument('--llvm-mc', help='Path to llvm-mc', default='llvm-mc')
  parser.add_argument('--verbose', '-v', action='store_true')
  options = parser.parse_args()
  return options
125
126
def CheckLLVMVersion(llvm_mc):
  """
  Make sure the `llvm-mc` binary at `llvm_mc` is recent enough.

  Runs `<llvm_mc> -version` and looks for "LLVM version <major>.<minor>" in
  its output. Raises an `Exception` if no version number can be found or if
  the version is older than 3.8.
  """
  version = subprocess.check_output([llvm_mc, '-version']).decode()
  # Version components may have more than one digit (e.g. "14.0.6") and the
  # line may carry extra text around the number, so only anchor on the
  # "LLVM version" text and the first two numeric components.
  m = re.search(r"LLVM version (\d+)\.(\d+)", version)
  if m is None:
    raise Exception(
        "Could not find the LLVM version in the output of `{} -version`."
        .format(llvm_mc))
  major, minor = (int(component) for component in m.groups())
  if (major, minor) < (3, 8):
    raise Exception("This script requires LLVM version 3.8 or higher.")
133
134
def ConvertToLLVMFormat(vixl_instruction, triple):
  """
  Take a string representing an instruction and convert it to assembly syntax
  for LLVM. VIXL's test generation framework will print instruction
  representations as a space separated list. The first element is the mnemonic
  and the following elements are operands.

  `vixl_instruction` may contain several instructions separated by ';' (an IT
  instruction followed by the instruction under test, for example). Each one
  is converted on its own and the results are joined with newlines.

  `triple` selects the instruction set; "thumbv8" enables a set of additional
  converters that take precedence over the generic ones.

  Raises an `Exception` if any instruction has no matching converter, or if
  `vixl_instruction` contains no instruction at all.
  """

  # Dictionary of patterns. The key is an identifier used in
  # `llvm_mc_instruction_converters` below. The value needs to be a capturing
  # regular expression.
  pattern_matchers = {
      # Allow an optional underscore in case this is an "and" instruction.
      "mnemonic": r"(\w+?)_?",
      "condition":
          "(al|eq|ne|cs|cc|mi|pl|vs|vc|hi|ls|ge|lt|gt|le)",
      "register":
          "(r0|r1|r2|r3|r4|r5|r6|r7|r8|r9|r10|r11|r12|r13|r14|r15|pc|sp|lr)",
      "immediate": "(0x[0-9a-f]+|[0-9]+)",
      "shift": "(lsl|lsr|asr|ror)",
  }

  # List of converters. Each of them represents an instruction form and what to
  # convert it to. This list needs to be complete; an exception is raised if we
  # couldn't find a converter for the instruction.
  #
  # The first part of each tuple is a pattern to match. It's simply a regular
  # expression. Additionally, each identifier in curly braces is replaced by the
  # corresponding pattern from `pattern_matchers`.
  #
  # The second part of the tuple is either a string that describes what the
  # result will look like (empty curly braces are replaced by matches, in
  # order), or a function that receives the tuple of matches and returns the
  # resulting string.
  #
  # Converters are tried in order and the first match wins.
  llvm_mc_instruction_converters = [
      ("it {condition}", "it {}"),
      ("{mnemonic} {condition} {register} {immediate}",
       "{}{} {}, #{}"),
      ("{mnemonic} {condition} {register} {register} {immediate}",
       "{}{} {}, {}, #{}"),
      ("{mnemonic} {condition} {register} {register}",
       "{}{} {}, {}"),
      ("{mnemonic} {condition} {register} {register} {register}",
       "{}{} {}, {}, {}"),
      ("{mnemonic} {register} {register} {register}",
       "{} {}, {}, {}"),
      ("{mnemonic} {condition} {register} {register} {register} {shift} "
           "{immediate}",
       "{}{} {}, {}, {}, {} #{}"),
      ("{mnemonic} {condition} {register} {register} {register} {shift} "
           "{register}",
       "{}{} {}, {}, {}, {} {}"),
      ("{mnemonic} {condition} {register} {register} {shift} {immediate}",
       "{}{} {}, {}, {} #{}"),
      ("{mnemonic} {condition} {register} {register} {shift} {register}",
       "{}{} {}, {}, {} {}"),
      ("{mnemonic} {condition} {register} {register} plus {immediate} offset",
       "{}{} {}, [{}, #{}]"),
      ("{mnemonic} {condition} {register} {register} minus {immediate} offset",
       "{}{} {}, [{}, #-{}]"),
      ("{mnemonic} {condition} {register} {register} plus {immediate} postindex",
       "{}{} {}, [{}], #{}"),
      ("{mnemonic} {condition} {register} {register} minus {immediate} "
           "postindex",
       "{}{} {}, [{}], #-{}"),
      ("{mnemonic} {condition} {register} {register} plus {immediate} preindex",
       "{}{} {}, [{}, #{}]!"),
      ("{mnemonic} {condition} {register} {register} minus {immediate} "
           "preindex",
       "{}{} {}, [{}, #-{}]!"),
      ("{mnemonic} {condition} {register} {register} plus {register} offset",
       "{}{} {}, [{}, {}]"),
      ("{mnemonic} {condition} {register} {register} minus {register} offset",
       "{}{} {}, [{}, -{}]"),
      ("{mnemonic} {condition} {register} {register} plus {register} postindex",
       "{}{} {}, [{}], {}"),
      ("{mnemonic} {condition} {register} {register} minus {register} "
           "postindex",
       "{}{} {}, [{}], -{}"),
      ("{mnemonic} {condition} {register} {register} plus {register} preindex",
       "{}{} {}, [{}, {}]!"),
      ("{mnemonic} {condition} {register} {register} minus {register} preindex",
       "{}{} {}, [{}, -{}]!"),
      ("{mnemonic} {condition} {register} {register} plus {register} {shift} "
           "{immediate} offset",
       "{}{} {}, [{}, {}, {} #{}]"),
      ("{mnemonic} {condition} {register} {register} minus {register} {shift} "
           "{immediate} offset",
       "{}{} {}, [{}, -{}, {} #{}]"),
      ("{mnemonic} {condition} {register} {register} plus {register} {shift} "
           "{immediate} postindex",
       "{}{} {}, [{}], {}, {} #{}"),
      ("{mnemonic} {condition} {register} {register} minus {register} {shift} "
           "{immediate} postindex",
       "{}{} {}, [{}], -{}, {} #{}"),
      ("{mnemonic} {condition} {register} {register} plus {register} {shift} "
           "{immediate} preindex",
       "{}{} {}, [{}, {}, {} #{}]!"),
      ("{mnemonic} {condition} {register} {register} minus {register} {shift} "
           "{immediate} preindex",
       "{}{} {}, [{}, -{}, {} #{}]!"),
  ]

  # Work around issues in LLVM 3.8.
  if triple == "thumbv8":
    def ConvertMovRdImm(matches):
      """
      LLVM chooses the T3 encoding for `mov <rd>, #<immediate>` when the
      immediate fits both into a modified immediate (T2 encoding) and 16
      bits (T3 encoding). Adding the `.W` modifier forces the T2 encoding to
      be used.
      """
      # The immediate is the second capture in "mov al {register} {immediate}".
      # The `immediate` pattern accepts both "0x"-prefixed hexadecimal and
      # plain decimal, so pick the base accordingly.
      text = matches[1]
      imm = int(text, 16) if text.startswith("0x") else int(text)
      if imm <= 0xffff:
        lsb = imm & -imm
        if (imm >> 8) < lsb:
          return "mov.w {}, #{}".format(*matches)
      # Fall back to LLVM making the right decision.
      return "mov {}, #{}".format(*matches)
    llvm_mc_instruction_converters[:0] = [
        # The ARM ARM specifies that if <Rn> is PC in either an ADD or SUB
        # instruction with an immediate, the assembler should use the ADR
        # encoding. LLVM does not know about this subtlety. We get around this
        # by manually translating the instruction to their ADR form.
        ("add al {register} pc {immediate}", "adr {}, #{}"),
        ("sub al {register} pc {immediate}", "adr {}, #-{}"),

        # LLVM is (rightfully) being helpful by swapping register operands so
        # that the 16 bit encoding of the following instructions is used.
        # However, VIXL does not do this. These rules specifically add the `.w`
        # modifier to force LLVM to use the 32 bit encoding if the last register
        # is identical to first one. But at the same time, we should still use
        # the narrow encoding if all registers are the same.
        (r"adcs al {register} (\1) (\1)", "adcs.n {}, {}, {}"),
        (r"adcs al {register} {register} (\1)", "adcs.w {}, {}, {}"),
        (r"orrs al {register} (\1) (\1)", "orrs.n {}, {}, {}"),
        (r"orrs al {register} {register} (\1)", "orrs.w {}, {}, {}"),
        (r"eors al {register} (\1) (\1)", "eors.n {}, {}, {}"),
        (r"eors al {register} {register} (\1)", "eors.w {}, {}, {}"),
        (r"ands al {register} (\1) (\1)", "ands.n {}, {}, {}"),
        (r"ands al {register} {register} (\1)", "ands.w {}, {}, {}"),
        # Solve the same issue as for the previous rules, however, we need to
        # take into account that ADD instructions with the stack pointer have
        # additional 16 bit forms.
        (r"add al {register} (\1) (\1)", "add.n {}, {}, {}"),
        (r"add al {register} (\1) r13", "add.w {}, {}, sp"),
        (r"add al {register} r13 (\1)", "add.n {}, sp, {}"),
        (r"add al {register} {register} (\1)", "add.w {}, {}, {}"),
        ("mov al {register} {immediate}", ConvertMovRdImm)
    ]

  llvm_instruction = []

  # Our test generator framework uses mnemonics starting with a capital letter.
  # We need everything to be lower case for LLVM.
  #
  # VIXL may have generated more than one instruction separated by ';'
  # (an IT instruction for example).
  for instruction in vixl_instruction.lower().split(';'):
    # Strip out extra white spaces.
    instruction = instruction.strip()
    if not instruction:
      # Nothing between two separators (or after a trailing one).
      continue
    # Try all converters in the list.
    for pattern, result in llvm_mc_instruction_converters:
      # Build the regular expression for this converter.
      instruction_matcher = "^" + pattern.format(**pattern_matchers) + "$"
      match = re.match(instruction_matcher, instruction)
      if match is None:
        continue
      # If we have a match, the object will contain a tuple of substrings.
      if callable(result):
        # `result` is a function, call it to produce the instruction.
        llvm_instruction.append(result(match.groups()))
      else:
        # `result` is a string, use it as the format string.
        llvm_instruction.append(result.format(*match.groups()))
      break
    else:
      # No converter worked for this particular instruction. Raise rather
      # than silently dropping it from the output.
      raise Exception("Unsupported instruction {}.".format(instruction))

  if not llvm_instruction:
    # The input did not contain any instruction at all.
    raise Exception("Unsupported instruction {}.".format(vixl_instruction))

  return "\n".join(llvm_instruction)
319
320
def ReadTrace(trace):
  """
  Receive the content of an assembler trace, extract the relevant information
  and return it as a list of tuples. The first part of each tuple is a string
  representing the instruction. The second part is a list of bytes representing
  the encoding.

  Only lines made of two leading spaces, at least two comma separated
  `0x..` bytes (lower case hexadecimal) and a `//` comment are considered;
  everything else in the trace is ignored.

  For example:

      [
        ("Clz eq r0 r0", ["0x10", "0x0f", "0x6f", "0x01"]),
        ("Clz eq r0 r1", ["0x11", "0x0f", "0x6f", "0x01"]),
        ("Clz eq r0 r2", ["0x12", "0x0f", "0x6f", "0x01"])
      ]
  """

  # `(?:...)` is a non-capturing group repeated once per leading byte; the
  # last byte is matched separately since it is not followed by a comma.
  pattern = re.compile(
      "^  (?P<encoding>(?:0x[0-9a-f]{2}, )+0x[0-9a-f]{2})"
      " // (?P<instruction>.*)$",
      re.M)
  return [
      (m.group('instruction'), m.group('encoding').replace(" ", "").split(","))
      for m in pattern.finditer(trace)
  ]
344
345
def VerifyInstructionsWithLLVMMC(llvm_mc, f, triple):
  """
  Extract all instructions from `f`, feed them to `llvm-mc` and make sure it's
  encoded them the same way as VIXL. `triple` allows us to specify either
  "thumbv8" or "armv8".

  `llvm_mc` is the path of the `llvm-mc` binary to run and `f` is a readable
  text file object holding an assembler trace.

  Nothing is returned; discrepancies are reported by printing to stdout. If
  `llvm-mc` writes anything to stderr, that output is printed and no
  comparison takes place. A trace without any instruction line is skipped.
  """

  vixl_reference = ReadTrace(f.read())
  if not vixl_reference:
    # Nothing to verify; unpacking `zip(*[])` below would fail.
    return
  vixl_instructions, vixl_encodings = zip(*vixl_reference)
  instructions = [
      ConvertToLLVMFormat(instruction, triple)
      for instruction in vixl_instructions
  ]
  llvm_mc_proc = subprocess.Popen(
      [llvm_mc, '-assemble', '-triple={}'.format(triple), '-mattr=v8,crc',
       # LLVM fails to recognize some instructions as valid T32 when we do not
       # set `-mcpu`.
       '-mcpu=cortex-a53', '-show-encoding'],
      stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
  out, err = llvm_mc_proc.communicate("\n".join(instructions).encode())
  # If `llvm-mc` printed something to stderr then stop.
  if err:
    print(err.decode())
    return

  # Extract list of bytes from `llvm-mc` output. It's in the following form:
  #
  #         clzeq   r0, r0                  @ encoding: [0x10,0x0f,0x6f,0x01]
  #                                                      ^^^^ ^^^^ ^^^^ ^^^^
  llvm_encodings = [
      match_object.group('encoding').replace(" ", "").split(",")
      for match_object in re.finditer(r".*@ encoding: \[(?P<encoding>.*)\]",
                                      out.decode())
  ]

  # If LLVM has generated exactly twice as many instructions, we assume this is
  # due to IT instructions preceding every instruction under test. VIXL's
  # assembly reference files will contain a single array of 4 bytes encoding
  # both the IT and the following instruction. While LLVM will have decoded them
  # into two separate 2 bytes arrays.
  if len(llvm_encodings) == 2 * len(vixl_encodings):
    llvm_encodings = [
        it_encoding + instruction_encoding
        for it_encoding, instruction_encoding in zip(llvm_encodings[0::2],
                                                     llvm_encodings[1::2])
    ]

  # Check the encodings from LLVM are identical to VIXL's.
  if len(llvm_encodings) != len(vixl_encodings):
    print("""Error: llvm-mc generated {} instructions than there are in the
generated trace.
        """.format("fewer" if len(llvm_encodings) < len(vixl_encodings) else "more"))
    return

  for instruction, llvm, vixl in zip(vixl_instructions, llvm_encodings,
                                     vixl_encodings):
    if llvm != vixl:
      print("""Error: llvm-mc disagrees on the encoding of \"{instruction}\":
  LLVM-MC: {llvm}
  VIXL:    {vixl}
            """.format(instruction=instruction.replace("\n", "; "),
                       llvm=llvm,
                       vixl=vixl))
406
407
if __name__ == "__main__":
  # Entry point: check every assembler trace found in the trace directory
  # (relative to the current working directory) against `llvm-mc`.
  options = BuildOptions()

  CheckLLVMVersion(options.llvm_mc)

  trace_dir = 'test/aarch32/traces/'
  # Only assembler traces are of interest; process them in sorted order.
  assembler_traces = sorted(
      entry for entry in os.listdir(trace_dir)
      if entry.startswith("assembler-"))
  for name in assembler_traces:
    if options.verbose:
      print("Verifying \"" + name + "\".")
    with open(os.path.join(trace_dir, name), "r") as trace:
      # The instruction set is deduced from the file name; "t32" is looked
      # for first.
      if "t32" in name:
        triple = "thumbv8"
      elif "a32" in name:
        triple = "armv8"
      else:
        raise Exception("Failed to recognize the ISA in \"" + name + "\".")
      VerifyInstructionsWithLLVMMC(options.llvm_mc, trace, triple)
430